diff -Naur -X dontdiff linux-2.4.29/Documentation/Configure.help linux-2.4.29-iswraid/Documentation/Configure.help
--- linux-2.4.29/Documentation/Configure.help	2005-01-25 20:55:34.000000000 -0500
+++ linux-2.4.29-iswraid/Documentation/Configure.help	2005-01-27 17:15:40.000000000 -0500
@@ -2161,6 +2161,59 @@
   If you choose to compile this as a module, the module will be called
   silraid.o.
 
+Intel Software RAID
+CONFIG_BLK_DEV_ATARAID_ISW
+  This option enables support for the Intel Software RAID.
+  Say Y or M if you have hardware which supports this RAID format.
+  Currently most of Intel's ICH5R/ICH6R/ICH7R chipsets do, but some do not.
+  Make sure that the RAID Option ROM is Intel's.
+
+  This driver uses /dev/ataraid/dXpY (X and Y numbers) as device
+  names. Please see <file:Documentation/iswraid.txt> for more 
+  information about this driver. It is an experimental driver and 
+  may not work properly.
+
+  If you choose to compile this as a module, the module will be called
+  iswraid.o.
+
+Halt I/O to a volume if it becomes degraded
+CONFIG_ISWRAID_HALT_DEGRADED
+  In some situations it is beneficial to learn right away that a volume
+  has become degraded, instead of continuing to use it. If you say Y, all
+  input/output operations to a degraded volume will be halted and
+  returned as failed. If unsure, say N. (This option sets the compile
+  time default for the driver, the behavior can still be changed with
+  the load-time parameter iswraid_halt_degraded.)
+
+Resist failing already degraded RAID1E volumes
+CONFIG_ISWRAID_RESIST_FAILING
+  When the ability to recover data on a RAID1/RAID10/RAID1E volume
+  is lost due to yet another disk failing, it is customary to mark the 
+  disk as failed and to mark the volume as failed. However, doing so 
+  destroys the ability to tell which disk (or set of disks) has the
+  most up-to-date data. If you say Y, iswraid will try to leave such
+  volumes as degraded and to not fail the disk either, and merely fail
+  the I/O that exposed this problem. However, the state of other volumes
+  containing the failing disk may dictate that the disk is to be marked
+  failed and if so the intention to not fail the RAID1E volumes can
+  become overruled. If unsure, say N. (This option sets the compile time
+  default for the driver, the behavior can still be changed with the 
+  load-time parameter iswraid_resist_failing.)  
+
+Error threshold for marking a disk as failed
+CONFIG_ISWRAID_ERROR_THRESHOLD
+  While failed writes to non-RAID0 volumes cause the disk and such
+  volumes containing it to change their state, there are also other
+  kinds of input/output that do not have such drastic effects upon
+  failures. However, when the error count for a disk reaches a certain 
+  threshold, it is considered proper to mark it as failed. The count
+  is non-persistent across module loads. You can set this threshold 
+  to 0 if you do not want to use it. (This option sets the compile
+  time default for the driver, the value can still be changed with 
+  the load-time parameter iswraid_error_threshold.)
+
+  Default: 10
+
 Support for Acer PICA 1 chipset
 CONFIG_ACER_PICA_61
   This is a machine with a R4400 133/150 MHz CPU. To compile a Linux
diff -Naur -X dontdiff linux-2.4.29/Documentation/iswraid.txt linux-2.4.29-iswraid/Documentation/iswraid.txt
--- linux-2.4.29/Documentation/iswraid.txt	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.29-iswraid/Documentation/iswraid.txt	2005-01-28 18:32:14.000000000 -0500
@@ -0,0 +1,329 @@
+                  Intel Software RAID Driver (iswraid)
+                  ====================================
+
+
+
+
+                              Overview
+                            
+Intel Software RAID driver works in conjunction with the Intel RAID Option
+ROM, distributed with most (but not all) ICH5R/ICH6R/ICH7R chipsets. It
+understands the Intel RAID metadata and allows booting from RAID volumes,
+regardless of their RAID level. It is useful when there is a need for 
+compatibility with other operating systems using these RAID volumes.
+
+
+
+ 
+                     License, Copyright, Authors
+
+Copyright (C) 2003,2004,2005 Intel Corporation. All rights reserved.
+
+This program is free software; you can redistribute it and/or modify it 
+under the terms of the GNU General Public License as published by the 
+Free Software Foundation; either version 2, or (at your option) any later 
+version.
+
+You should have received a copy of the GNU General Public License (for 
+example /usr/src/linux/COPYING); if not, write to the Free Software 
+Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+Authors: 
+Boji Tony Kannanthanam < boji dot t dot kannanthanam at intel dot com >,
+Martins Krikis         < martins dot krikis at intel dot com >.
+
+
+
+
+                               Features
+
+This driver is an ataraid subdriver, albeit utilizing a very minimal set
+of facilities provided by it. There are several features that currently
+distinguish iswraid from other ataraid subdrivers:
+* it scans the Linux SCSI subsystem's disks looking for the Intel RAID 
+  metadata instead of IDE disks;
+* it notices and reports I/O errors for its RAID volumes;
+* it updates the Intel RAID metadata when necessary upon errors, thus
+  causing volumes to become degraded or even failed, and this status
+  is persistent across reboots and operating system changes;
+* it provides a user interface via the /proc filesystem that allows the
+  inspection of the status of its RAID arrays, disks and volumes;
+* it has several module load-time parameters that influence its behavior
+  and configurable compile-time defaults for these parameters;
+* when necessary to split an I/O request, it does so on natural strip 
+  boundaries;
+* it uses slab caches for efficiency;
+* it generally does a lot of things its own way, thus avoiding any existing
+  problems specific to ataraid subdrivers (and possibly introducing its own).
+
+While they may or may not be distinguishing features, iswraid also:
+* supports RAID0 (striping) over n-disk volumes;
+* supports RAID1E (mirroring with striping) over n-disk volumes---this is
+  equivalent to RAID1 for 2-disk volumes and to RAID10 for 4-disk volumes;
+* supports multiple volumes per array ("Matrix RAID");
+* deals with missing disks in a reasonable manner;
+* can operate with volumes in degraded mode (unless instructed not to);
+* implements disk error thresholds;
+* tries to satisfy failed RAID1E reads using each failed disk's mirror.
+
+
+
+
+                     Requirements and Installation
+
+Intel RAID metadata is generally created using the Intel RAID OROM. Most
+mainboards based on Intel chipsets with ICH5R/ICH6R/ICH7R southbridges 
+have this OROM. The "RAID" mode needs to be selected in BIOS configuration
+to enable the RAID OROM. The ICH5R/ICH6R/ICH7R are Serial ATA controllers
+and iswraid depends on the ata_piix or ahci driver, either of which can 
+present the SATA disks as SCSI devices to the Linux kernel. Thus, the basic
+requirements for using this driver are:
+* Intel RAID OROM (or Intel RAID metadata already created on disks);
+* ata_piix or ahci driver (or any other driver that can present disks
+  with Intel RAID metadata as SCSI devices);
+* ataraid (comes standard with 2.4 kernels).
+
+For older 2.4 series kernels, unless your kernel source came with libata, 
+please install it before installing iswraid.
+
+The iswraid driver should compile cleanly for all 2.4 series kernels 
+but has seen more testing with 2.4.22 and above kernels, and such
+kernels have the BH_Sync buffer_head flag that this driver likes to use.
+
+In your kernel configuration file you should have "Support for IDE RAID
+controllers" (CONFIG_BLK_DEV_ATARAID) and "Support for Intel software RAID"
+(CONFIG_BLK_DEV_ATARAID_ISW) enabled (as modules or statically linked,
+it does not matter). You should also enable the driver that will present
+the disks with Intel RAID metadata as SCSI disks. Normally this means
+enabling "Serial ATA (SATA) support" (CONFIG_SCSI_SATA) and either "Intel 
+PIIX/ICH SATA support" (CONFIG_SCSI_ATA_PIIX) or "AHCI SATA support"
+(CONFIG_SCSI_SATA_AHCI). Obviously, SCSI support and SCSI disk
+support are also necessary.
+
+Note that the iswraid driver is built as part of the Linux SCSI subsystem, 
+not as part of the IDE modules because when statically linked it needs to 
+be initialized after the SCSI subsystem. When loading it as a module,
+you should load the scsi low level driver first (ata_piix or ahci, 
+typically).
+
+Please pay special attention to whether all the necessary disks are
+visible to the lower level driver. There can be some unwanted consequences
+if iswraid is loaded when not all disks are available to it. Please read
+below for how to use one of the module parameters as an additional safety 
+measure in this situation.
+
+If all the SCSI drivers are built as modules and module dependencies are 
+current (do "depmod -a"), it is possible to cause the low level driver to 
+be loaded on demand when loading iswraid. For this, add a line like
+  alias scsi_hostadapter ata_piix
+to your /etc/modules.conf file or to any files that participate in
+generating this file (such as /etc/modutils/* or /etc/modprobe.d/*,
+depending on your distribution and how recent the modutils package is).
+Please only do so once you have made sure that the lower level driver 
+can access all the necessary devices. 
+
+When the iswraid driver runs, it scans the Linux SCSI subsystem and makes
+the Intel RAID volumes available as ataraid devices. Their device nodes
+typically are called /dev/ataraid/d0, /dev/ataraid/d1, etc. The individual
+partitions on disk dX (where X is 0, 1, ...) are typically named 
+/dev/ataraid/dXpY (where Y is 1, 2, ...). These details may be distribution-
+specific; the nodes can be created if necessary---ataraid's major number
+is 114 and minor numbers from 16 * X to 16 * X + 15 (where X = 0, 1, ...)
+belong to the same volume. Numbers in the form 16 * X are for the whole
+volumes, numbers in the form 16 * X + Y (where Y > 0) are for partition Y
+of volume X. For example:
+  mkdir /dev/ataraid
+  mknod /dev/ataraid/d2   b 114 32
+  mknod /dev/ataraid/d2p8 b 114 40
+
+When modifying the LILO configuration file for booting from volumes, it may
+help to use lines like:
+  disk=/dev/sda
+  inaccessible
+in order to tell the map installer to not bother with direct access to
+the disks. It may also be necessary sometimes to specify how BIOS will 
+be seeing the disks, e.g.:
+  disk=/dev/ataraid/d0
+  bios=0x80
+  disk=/dev/hda
+  bios=0x81
+
+
+
+
+                          Module Parameters
+
+Iswraid recognizes a few module load time parameters, explained below.
+
+* iswraid_halt_degraded:
+Normally set to 0, i.e., not enabled, unless CONFIG_ISWRAID_HALT_DEGRADED
+is defined, in which case it is set to 1 and thus enabled. This feature
+when enabled causes iswraid to stop using RAID1E (and that includes the
+normal RAID1 and 4-disk RAID10, too) volumes that are degraded. It will
+instead fail all I/O requests for such volumes. This parameter also has a
+useful side effect on RAID metadata updates done at startup, which is 
+described in detail later in this document.
+
+* iswraid_resist_failing:
+Normally set to 0, i.e., not enabled, unless CONFIG_ISWRAID_RESIST_FAILING
+is defined, in which case it is set to 1 and thus enabled. When a RAID1E
+(including normal RAID1 and 4-disk RAID10) volume is already degraded, a 
+failed write or exceeding the disk error threshold can cause it to become
+failed and this is the default and generally expected behavior (except for
+some lucky many-disk RAID1E cases where several disks can fail safely
+without losing the ability to restore data). When this parameter is set, 
+however, iswraid will try to not mark the disk and the RAID1E volumes
+containing it as failed. Instead it will merely fail the I/O that exposed
+the disk problem. Some people may prefer this behavior because it always
+makes it clear which disk (or sets of disks) has the more up-to-date data
+and thus should be used to recover the failed disk(s). Please note however
+that the state of other volumes containing the failing disk may dictate
+that the disk really is to be marked as failed and therefore the states
+of all volumes containing it adjusted accordingly. This may cause the 
+intentions of this option to be overruled and thus RAID1E volumes can
+become failed despite this option being enabled.
+
+* iswraid_error_threshold:
+Set to CONFIG_ISWRAID_ERROR_THRESHOLD, which is 10 by default. Iswraid 
+counts the errors on each disk and if they exceed this threshold, it marks
+the disk as failed. This could cause the volumes containing the disk to 
+become degraded or failed (depending on RAID levels and other module load
+parameters). Setting this value to 0 disables checking the error counts on
+disks. The error counts are not persistent.
+
+
+
+
+                          Proc Filesystem
+
+The iswraid driver can output information about the state of Intel RAID
+arrays, disks and volumes through the /proc filesystem. Each /proc file
+generated by iswraid has a header line starting with '#' and containing
+space-separated field names. The following lines each correspond to
+one object (array, disk or volume) being listed and their fields are
+tab-separated. Each of these real data lines is also associated with an
+implicit index (starting at 0) and the objects cross-reference each other
+using these indices.
+
+In order to query the iswraid arrays, do "cat /proc/iswraid/arrays". Here 
+is a sample output:
+
+# family generation numdisks numvolumes disks volumes
+3e37c9ab	78	2	2	0,2	0,1
+3a57e490	74	2	2	1,3	2,3
+
+The first field is the "array family number", which basically distinguishes 
+each array from any other. The second field is the "array generation number" 
+that shows how many times this array's metadata have been written out to its
+disks. The next fields give the number of disks and volumes in the array, 
+respectively. The final two fields give comma-separated listings of
+disks and volumes that this array contains. The disks and volumes
+are given by their implicit indices in the disk and volume listings.
+
+In order to query the disks, do "cat /proc/iswraid/disks". Here is a sample
+output:
+
+# major minor status errorcount array serial
+ 8	 0	0x13a	 0	 0	3JT3L0J2
+ 8	16	0x13a	 0	 1	3JT3LCX6
+ 8	32	0x13a	 0	 0	3JT3KXRX
+ 8	48	0x13a	 0	 1	3JT3FX3X
+
+The first two fields are the major and minor numbers of the block devices
+corresponding to the disks. The status field is next (the status field
+has many bits, not all of which are actually used by iswraid). Each
+disk's error count follows. The next field shows which array the disk
+belongs to, using the implicit array indices. The last field gives each
+disk's serial number (possibly altered by iswraid to strip spaces and
+non-printable characters).
+
+Probably the most useful information comes from the volume listing, which
+can be obtained by doing "cat /proc/iswraid/volumes". A sample output
+looks like this:
+
+# node state degradedbits refcnt raidlevel sectors blocksperstrip pbaoflba0 numdisks array disks serial
+d0	0x0	0x0	0	0	 104026112	  8	         0	2	0	0,2	RAID_Volume0
+--	0x1	0x0	0	1	 104287744	256	  52013056	2	0	0,2	RAID_Volume1
+--	0x1	0x0	0	1	 104026112	256	         0	2	1	1,3	RAID_Volume2
+d1	0x0	0x0	0	0	 104549888	  8	 104026112	2	1	1,3	RAID_Volume3
+
+The first field gives the ataraid device name that the volume corresponds
+to. (Actually, the driver does not know the name, but if ataraid device
+nodes are created in the usual manner described above, the dX should be 
+accurate.) If the volume is in use, it will have an ataraid device 
+corresponding to it, and this field will show dX (where X is 0, 1, ...).
+If the volume is disabled (this only happens if it is "a hopeless volume"
+on iswraid startup), then it will not have a corresponding ataraid device 
+and this field will be "--". When a volume gets disabled, iswraid prints
+the reason for this action, so you can check the kernel log. 
+
+The second field gives volume state, which is a bitfield; ideally no bits
+should be set. The third field, degradedbits, is a bitfield identifying any 
+disks that are degraded (and thus not in use by RAID1E volumes). The next 
+field, refcnt gives the number of references to this volume (how many times
+its block device has been opened). The RAID level is next, 0 or 1 (and
+RAID10 or multi-disk RAID1E are all listed as raid level 1). The total sector 
+count and blocks per strip follow. The "physical block address" of the volume's
+"logical block address 0" tells where (in each of its constituent disks) the
+volume begins. Next comes the number of disks the volume contains (which in
+theory could be less than the number of disks in the array) and the implicit
+array index. The next-to-last field is a comma-separated list of the disks
+that the volume contains, using the indices that are implicit in the disk 
+listing. Please note that this order may be different from the order in 
+which the volume's array lists the disks. Finally, we have the "serial 
+number" (symbolic name) of the volume in the last field.
+
+The array, disk and volume indices are not present in the output 
+intentionally, in order to save space. Any user-space tools processing
+these /proc files can easily generate these missing indices and thus
+be able to cross reference the data from all 3 files.
+
+
+
+
+                 Intel RAID Metadata Updates
+
+The iswraid driver is relatively reluctant to update the Intel RAID 
+metadata. There are a couple of situations when it considers updating 
+the metadata, explained below.
+
+It normally does update the metadata in error cases, to mark the disks
+that have failed and volumes that have changed their state. Sometimes 
+this can be suppressed, however, by the use of the iswraid_resist_failing 
+parameter and some luck. If there are no volumes that need to change their 
+state, the RAID metadata will be unchanged despite I/O errors.
+
+It will also update the metadata when a formerly missing disk is found.
+Unless the Intel RAID Option ROM is misbehaving, however, this should
+be hard to observe. This update can only be done on module startup.
+
+Finally, iswraid may update the RAID metadata if a disk needed by some
+RAID volumes is missing. RAID0 volumes will simply be disabled in this
+case (without marking them failed in the RAID metadata), but RAID1E volumes
+would become degraded or failed. This update, too, can only happen during
+module startup, not during its operation. Furthermore, in the typical
+case of loading iswraid after OROM has updated the metadata, the disk
+should already be marked as missing, so iswraid will not have to do it. 
+
+The last update scenario _could_ unfortunately come up when it really
+should not---it could be caused by the lower level driver (e.g.,
+ata_piix) not seeing all the disks that it should be seeing. For example,
+if 4 disks are plugged into an ICH6R-based mainboard and the OROM sees 
+them all but iswraid is given only 2 of them by the lower level driver
+to work with then many volumes could be missing disks and requiring RAID
+metadata updates. Performing such updates would not be helpful overall 
+because they would later require lengthy array rebuild operations
+(to be done with the help of OROM and other operating systems or by
+using user-space utilities such as dd and your favorite hex editor).
+This situation is where the above mentioned "iswraid_halt_degraded"
+parameter can be used as an insurance against needless metadata updates.
+It is now explained how.
+
+If iswraid_halt_degraded is set, iswraid will realize that it cannot
+use the volumes requiring the missing disks because they are either the
+disabled RAID0 volumes or the degraded-or-failed (but definitely not usable)
+RAID1E volumes. Because of this, it will skip updating the RAID metadata
+because it has no volumes to work with anyway. Therefore, for the first
+invocation of iswraid it is recommended to do it with the parameter
+iswraid_halt_degraded set to 1 for safety. This way, even if only some
+disks are found, the RAID metadata on disks will be unaltered.
diff -Naur -X dontdiff linux-2.4.29/drivers/ide/Config.in linux-2.4.29-iswraid/drivers/ide/Config.in
--- linux-2.4.29/drivers/ide/Config.in	2005-01-25 20:55:24.000000000 -0500
+++ linux-2.4.29-iswraid/drivers/ide/Config.in	2005-01-27 17:09:48.000000000 -0500
@@ -195,5 +195,11 @@
 dep_tristate '   Highpoint 370 software RAID (EXPERIMENTAL)' CONFIG_BLK_DEV_ATARAID_HPT $CONFIG_BLK_DEV_IDE $CONFIG_EXPERIMENTAL $CONFIG_BLK_DEV_ATARAID
 dep_tristate '   CMD/Silicon Image Medley Software RAID (EXPERIMENTAL)' CONFIG_BLK_DEV_ATARAID_MEDLEY $CONFIG_BLK_DEV_IDE $CONFIG_EXPERIMENTAL $CONFIG_BLK_DEV_ATARAID
 dep_tristate '   Silicon Image Medley software RAID (EXPERIMENTAL)' CONFIG_BLK_DEV_ATARAID_SII $CONFIG_BLK_DEV_IDE $CONFIG_EXPERIMENTAL $CONFIG_BLK_DEV_ATARAID
+dep_tristate '   Support for Intel software RAID (EXPERIMENTAL)' CONFIG_BLK_DEV_ATARAID_ISW $CONFIG_BLK_DEV_IDE $CONFIG_EXPERIMENTAL $CONFIG_BLK_DEV_SD $CONFIG_BLK_DEV_ATARAID
+if [ "$CONFIG_BLK_DEV_ATARAID_ISW" != "n" ]; then
+   bool '      Halt I/O to a volume if it becomes degraded' CONFIG_ISWRAID_HALT_DEGRADED
+   bool '      Resist failing already degraded RAID1E volumes' CONFIG_ISWRAID_RESIST_FAILING
+   int  '      Error threshold for marking a disk as failed' CONFIG_ISWRAID_ERROR_THRESHOLD 10
+fi
 
 endmenu
diff -Naur -X dontdiff linux-2.4.29/drivers/ide/raid/Makefile linux-2.4.29-iswraid/drivers/ide/raid/Makefile
--- linux-2.4.29/drivers/ide/raid/Makefile	2004-04-14 09:05:30.000000000 -0400
+++ linux-2.4.29-iswraid/drivers/ide/raid/Makefile	2005-01-25 21:01:36.000000000 -0500
@@ -14,6 +14,9 @@
 obj-$(CONFIG_BLK_DEV_ATARAID_HPT)	+= hptraid.o
 obj-$(CONFIG_BLK_DEV_ATARAID_MEDLEY)	+= medley.o
 obj-$(CONFIG_BLK_DEV_ATARAID_SII)	+= silraid.o
+# iswraid.o build has been moved over to the SCSI Makefile 
+# to ensure proper initcall ordering in statically linked kernels.
+#obj-$(CONFIG_BLK_DEV_ATARAID_ISW)	+= iswraid.o
 
 EXTRA_CFLAGS	:= -I../
 
diff -Naur -X dontdiff linux-2.4.29/drivers/ide/raid/iswraid.c linux-2.4.29-iswraid/drivers/ide/raid/iswraid.c
--- linux-2.4.29/drivers/ide/raid/iswraid.c	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.29-iswraid/drivers/ide/raid/iswraid.c	2005-01-28 18:09:35.000000000 -0500
@@ -0,0 +1,2583 @@
+/*
+ *   iswraid.c Copyright (C) 2003,2004,2005 Intel Corporation.
+ *   All rights reserved.
+ *
+ *   This program is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2, or (at your option)
+ *   any later version.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   (for example /usr/src/linux/COPYING); if not, write to the Free
+ *   Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ *   Authors: Boji Tony Kannanthanam
+ *            < boji dot t dot kannanthanam at intel dot com >
+ *            Martins Krikis
+ *            < martins dot krikis at intel dot com >
+ *
+ *   Based on ataraid codebase by Arjan van de Ven
+ */
+
+/* "iswraid" is an ataraid subdriver for Intel's ICH5R, ICH6R, ICH7R chipsets.
+ * The "ataraid" module needs to be loaded before this driver can load.
+ * This subdriver differs from the other ataraid subdrivers in that it probes
+ * SCSI disks looking for RAID member disks instead of the ATA/IDE subsystem.
+ * Therefore, a driver which detects the SATA drives connected to ICHxR and
+ * presents them as SCSI devices is also needed, e.g., "ata_piix" or "ahci".
+ *
+ * Please read Documentation/iswraid.txt for more information.
+ */
+#include <linux/version.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/tqueue.h>
+#include <linux/smp_lock.h>
+#include <linux/blkdev.h>
+#include <linux/blkpg.h>
+#include <linux/genhd.h>
+#include <linux/ioctl.h>
+#include <linux/list.h>
+#include <linux/ide.h>
+#include <linux/proc_fs.h>
+#include <asm/uaccess.h>
+#include "../../scsi/scsi.h"
+#include <scsi/scsi.h>
+#include <scsi/scsi_ioctl.h>
+#include "ataraid.h"
+#include "iswraid.h"
+
+/* Debug masks, set DRIVERDEBUG to bitwise "or" of the needed ones */
+#define DEBUG_INIT  (1 << 0)
+#define DEBUG_MPB   (1 << 1)
+#define DEBUG_IOCTL (1 << 2)
+#define DEBUG_ERROR (1 << 3)
+#define DEBUG_EXIT  (1 << 4)
+#define DEBUG_R0    (1 << 8)
+#define DEBUG_R0S   (1 << 9)
+#define DEBUG_R0M   (1 << 10)
+#define DEBUG_R1    (1 << 16)
+#define DEBUG_R1R   (1 << 17)
+#define DEBUG_R1W   (1 << 18)
+#define DEBUG_R1ER  (1 << 19)
+#define DEBUG_R1EW  (1 << 20)
+/* To enable debug output, replace the #undef with "#define DRIVERDEBUG <mask>" */
+#undef DRIVERDEBUG
+/* DEBUG(kind, fmt...) logs only if a bit of kind is set in iswraid_debug_mask */
+#ifdef DRIVERDEBUG
+#define DEBUG(msg_kind, args...) \
+	do { \
+		if ((msg_kind) & iswraid_debug_mask) /* () keeps "A | B" intact */ \
+			printk(KERN_DEBUG "iswraid DEBUG: " args); \
+	} while (0)
+#else
+#define DEBUG(msg_kind, args...) do { } while (0) /* compiled out, still a statement */
+#define DRIVERDEBUG 0 /* initial value of iswraid_debug_mask when debugging is off */
+#endif
+
+#define ISW_VERSION_STRING "Version 0.1.5"
+#define MPB_VERSION MPB_VERSION_RAID5
+/* Compile-time defaults, used when the kernel configuration sets no value */
+#ifndef CONFIG_ISWRAID_HALT_DEGRADED
+#define CONFIG_ISWRAID_HALT_DEGRADED 0
+#endif
+
+#ifndef CONFIG_ISWRAID_RESIST_FAILING
+#define CONFIG_ISWRAID_RESIST_FAILING 0
+#endif
+
+#ifndef CONFIG_ISWRAID_ERROR_THRESHOLD
+#define CONFIG_ISWRAID_ERROR_THRESHOLD 10
+#endif
+/* Load-time parameters; each starts out at its compile-time default */
+static int iswraid_debug_mask = DRIVERDEBUG;
+static int iswraid_halt_degraded = CONFIG_ISWRAID_HALT_DEGRADED;
+static int iswraid_resist_failing = CONFIG_ISWRAID_RESIST_FAILING;
+static int iswraid_error_threshold = CONFIG_ISWRAID_ERROR_THRESHOLD;
+
+MODULE_PARM(iswraid_debug_mask, "i");
+MODULE_PARM_DESC(iswraid_debug_mask,
+		 "Debug output mask; define DRIVERDEBUG to enable");
+MODULE_PARM(iswraid_halt_degraded, "i");
+MODULE_PARM_DESC(iswraid_halt_degraded,
+		 "Halt IOs if a volume becomes degraded, non-0 to enable");
+MODULE_PARM(iswraid_resist_failing, "i");
+MODULE_PARM_DESC(iswraid_resist_failing,
+		 "Resist failing degraded RAID1E volumes, non-0 to enable");
+MODULE_PARM(iswraid_error_threshold, "i");
+MODULE_PARM_DESC(iswraid_error_threshold,
+		 "Error threshold for marking disks failed, 0 to disable");
+/* Handlers referenced from the raid_device_operations tables */
+static int iswraid_open(struct inode *inode, struct file *filp);
+static int iswraid_release(struct inode *inode, struct file *filp);
+static int iswraid_ioctl(struct inode *inode, struct file *file,
+			 unsigned int cmd, unsigned long arg);
+static int iswraid0_make_request(request_queue_t *q, int rw,
+				 struct buffer_head *bh);
+static int iswraid1_make_request(request_queue_t *q, int rw,
+				 struct buffer_head *bh);
+/* Capacity limits for the static tables and the per-object member lists */
+#define MAX_RAID_ARRAYS 8
+#define MAX_RAID_VOLUMES 16
+#define MAX_RAID_MEMBER_DISKS 8
+/* ataraid.c uses bits in variable ataraiduse to keep track of raid devices */
+#define MAX_ATARAID_RAIDDEVS (sizeof(unsigned int) * 8)
+
+struct disk;
+struct volume;
+/* One RAID array: a set of member disks carrying one or more volumes */
+struct array {
+	int saveneeded;        /* used only by startup code */
+	struct tq_struct task; /* for scheduling MPB saves */
+	struct _raid_mpb *mpb; /* the most up-to-date MPB among member disks */
+	struct disk *disks[MAX_RAID_MEMBER_DISKS]; /* mpb->num_disks entries are listed in /proc */
+	struct volume *volumes[MAX_RAID_VOLUMES];  /* mpb->num_raid_devs entries are listed in /proc */
+};
+/* Static table of arrays; iswraid_proc_readarrays() lists entries 0..arraycount-1 */
+static struct array arrays[MAX_RAID_ARRAYS];
+static int arraycount = 0;
+
+struct disk {
+	struct list_head head;	/* link in disklist */
+	kdev_t dev;		/* device number; its major/minor are shown in /proc */
+	struct block_device *bdev;
+	atomic_t errorcount;	/* reported by iswraid_proc_readdisks() */
+	unsigned int status; /* access by atomic bit operations */
+	spinlock_t lock;     /* to protect last_pos */
+	unsigned long last_pos;
+	struct array *array;	/* owning array, an element of arrays[] */
+	struct _raid_mpb *mpb;
+	unsigned char serial[MAX_RAID_SERIAL_LEN + 1]; /* printed with %s, so expected NUL-terminated */
+};
+/* All known disks; a disk's position on this list is its index in /proc output */
+static LIST_HEAD(disklist);
+static int diskcount = 0;	/* presumably the length of disklist -- confirm */
+
+#define DEGRADED_MAP 2
+#define FAILED_MAP   3
+/* Flag masks; NOTE(review): presumably for struct disk's status -- confirm */
+#define SPARE_DISK      0x01  /* Spare */
+#define CONFIGURED_DISK 0x02  /* Member of some RaidDev */
+#define FAILED_DISK     0x04  /* Permanent failure */
+#define USABLE_DISK     0x08  /* Fully usable unless FAILED_DISK is set */
+/* Bit numbers, not masks: bit 2 (FAILED_BIT) is the bit of mask 0x04 (FAILED_DISK) */
+#define DISABLED_BIT 0
+#define DEGRADED_BIT 1
+#define FAILED_BIT   2   /* should match FAILED_DISK's bit, above */
+
+struct volume {
+	int devbit;	/* N of the dN name shown in /proc; negative if none */
+	int refcnt;	/* per iswraid.txt: times the block device has been opened */
+	int raidlevel;	/* per iswraid.txt: 0 or 1 (RAID10/RAID1E listed as 1) */
+	unsigned int state; /* access by atomic bit operations */
+	unsigned int pba_of_lba0;	/* per iswraid.txt: volume start on each member disk */
+	unsigned int blocks_per_strip;
+	unsigned long sectors;  /* size of the whole volume in blocks */
+	int tiebreak;           /* helps choose a disk when there is a tie */
+	struct array *array;	/* owning array, an element of arrays[] */
+	int numdisks;		/* number of entries used in disks[] */
+	unsigned int degradedbits;	/* per iswraid.txt: bitfield of degraded disks */
+	struct disk *disks[MAX_RAID_MEMBER_DISKS];
+	struct hd_big_geometry hb_geom;
+	unsigned char serial[MAX_RAID_SERIAL_LEN + 1];	/* volume name, printed with %s */
+};
+/* Static table of volumes; iswraid_proc_readvolumes() lists entries 0..volumecount-1 */
+static struct volume volumes[MAX_RAID_VOLUMES];
+static int volumecount = 0;
+
+struct bh_private {	/* NOTE(review): per-request bookkeeping; its users are not in this view */
+	struct buffer_head *parent;
+	struct volume *volume;
+	atomic_t count;
+	int rw;
+	unsigned long status; /* access by atomic bit ops; disk status bits */
+	void (*old_endiofn)(struct buffer_head *bh, int uptodate);
+	void *old_private;
+};
+/* Slab caches; NULL here, the code that creates them is not in this view */
+static kmem_cache_t *privcache = NULL; /* slab cache for bh_private structs */
+static kmem_cache_t *bhcache = NULL;   /* slab cache for buffer_head structs */
+
+static struct raid_device_operations iswraid0_ops = {	/* presumably for RAID0 volumes */
+	open:         iswraid_open,
+	release:      iswraid_release,
+	ioctl:        iswraid_ioctl,
+	make_request: iswraid0_make_request
+};
+/* Same handlers as iswraid0_ops apart from make_request */
+static struct raid_device_operations iswraid1_ops = {
+	open:         iswraid_open,
+	release:      iswraid_release,
+	ioctl:        iswraid_ioctl,
+	make_request: iswraid1_make_request
+};
+/* NOTE(review): presumably one slot per ataraid device number -- confirm */
+static struct volume *raid[MAX_ATARAID_RAIDDEVS];
+
+static DECLARE_MUTEX(iswraid_sem);
+
+#define BH_Mirror 31 /* bits BH_PrivateStart to 22 in use by journaling fs-s */
+/* Kernels older than 2.4.22 get BH_Sync defined here as an alias of BH_Lock */
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2, 4, 22)
+#define BH_Sync BH_Lock /* map the new flag to something we set anyway */
+#endif
+
+#ifdef CONFIG_PROC_FS
+
+struct proc_dir_entry *iswraid_proc_dir; /* "iswraid" /proc directory, set in iswraid_proc_init() */
+
+static int	/* 0-based position of wanted_disk on disklist, or -1 if absent */
+find_disk_index(struct disk *wanted_disk)
+{
+	struct disk *disk;
+	int k = 0;	/* position of the list entry being visited */
+	list_for_each_entry(disk, &disklist, head) {
+		if (disk == wanted_disk)	/* compared by address */
+			return k;
+		k++;
+	}
+	return -1;	/* walked the whole list without a match */
+}
+
+static int
+iswraid_proc_readarrays(char *page, char **start, off_t offset, int count,
+			int *eof, void *data)
+{
+	/* /proc read handler: writes a header line, then one line per array,
+	 * into page and returns the byte count.  No size checks: with
+	 * MAX_RAID_ARRAYS 8, MAX_RAID_MEMBER_DISKS 8, MAX_RAID_VOLUMES 16 it fits. */
+	int i, j, len = 0;
+	MOD_INC_USE_COUNT;
+
+	len += sprintf(page + len, "# family generation numdisks numvolumes "
+		       "disks volumes\n");
+	for (i = 0; i < arraycount; i++) {
+		len += sprintf(page + len, "%08x\t%u\t%d\t%d\t",
+			       arrays[i].mpb->family_num,
+			       arrays[i].mpb->generation_num,
+			       arrays[i].mpb->num_disks,
+			       arrays[i].mpb->num_raid_devs);
+		for (j = 0; j < arrays[i].mpb->num_disks; j++) {
+			int k = find_disk_index(arrays[i].disks[j]);
+			if (!j)
+				len += sprintf(page + len, "%d", k);
+			else
+				len += sprintf(page + len, ",%d", k);
+		}
+		len += sprintf(page + len, "\t");
+		for (j = 0; j < arrays[i].mpb->num_raid_devs; j++) {
+			/* A pointer difference has type ptrdiff_t, which is
+			 * wider than int on 64-bit targets, while "%d"
+			 * consumes an int; convert explicitly. */
+			int v = (int) (arrays[i].volumes[j] - &volumes[0]);
+			if (!j)
+				len += sprintf(page + len, "%d", v);
+			else
+				len += sprintf(page + len, ",%d", v);
+		}
+		len += sprintf(page + len, "\n");
+	}
+	*eof = 1;	/* everything is produced in a single call */
+
+	MOD_DEC_USE_COUNT;
+	return len;
+}
+
+#define DISK_LINE_LEN 128    /* overkill */
+
+static int
+iswraid_proc_readdisks(char *page, char **start, off_t offset, int count,
+		       int *eof, void *data)
+{
+	struct disk *disk;
+	int limit, len = 0, item = 0;
+	MOD_INC_USE_COUNT;
+	/* /proc read handler: a header line, then one line per disk on disklist */
+	limit = count - DISK_LINE_LEN;	/* stop while a whole line still fits */
+	len += sprintf(page + len, "# major minor status errorcount "
+		       "array serial\n");
+	list_for_each_entry(disk, &disklist, head) {
+		if (len > limit)
+			break;
+		item++;           /* consider it processed already */
+		if (offset && item <= offset) /* attempt to use the "hack" */
+			continue;
+		/* the array index is a ptrdiff_t; narrow it to int for "%2d" */
+		len += sprintf(page + len,
+			       "%2d\t%2d\t0x%x\t%2d\t%2d\t%s\n",
+			       MAJOR(disk->dev), MINOR(disk->dev),
+			       disk->status, atomic_read(&disk->errorcount),
+			       (int) (disk->array - &arrays[0]), disk->serial);
+	}
+	/* NOTE(review): *start may need to be an increment; see proc_file_read() */
+	if (item < diskcount) {         /* not all of them fit */
+		*start = (char *) (unsigned long) item; /* same-width cast */
+		*eof = 0;
+	} else
+		*eof = 1;
+
+	MOD_DEC_USE_COUNT;
+	return len;
+}
+
+/* read_proc handler for /proc/iswraid/volumes: a header line, then one
+ * tab-separated line per volumes[] entry; the whole text in one call.
+ */
+static int
+iswraid_proc_readvolumes(char *page, char **start, off_t offset, int count,
+			 int *eof, void *data)
+{
+	/* No length checks are made: with the current MAX_RAID_VOLUMES 16
+	 * and MAX_RAID_MEMBER_DISKS 8 everything will fit.
+	 */
+	int i, j, len = 0;
+	MOD_INC_USE_COUNT;
+
+	len += sprintf(page + len, "# node state degradedbits refcnt "
+		       "raidlevel sectors blocksperstrip pbaoflba0 "
+		       "numdisks array disks serial\n");
+	for (i = 0; i < volumecount; i++) {
+		struct volume *vol = &volumes[i];
+		if (vol->devbit < 0) /* a negative devbit is shown as "--" */
+			len += sprintf(page + len, "--\t");
+		else
+			len += sprintf(page + len, "d%d\t", vol->devbit);
+		len += sprintf(page + len, "0x%x\t0x%x\t%d\t%d\t%10lu\t%3d"
+			       "\t%10u\t%d\t%d\t",
+			       vol->state, vol->degradedbits, vol->refcnt,
+			       vol->raidlevel, vol->sectors,
+			       vol->blocks_per_strip, vol->pba_of_lba0,
+			       vol->numdisks, vol->array - &arrays[0]);
+		/* disklist positions of the disks, comma-separated */
+		for (j = 0; j < vol->numdisks; j++)
+			len += sprintf(page + len, j ? ",%d" : "%d",
+				       find_disk_index(vol->disks[j]));
+		len += sprintf(page + len, "\t%s\n", vol->serial);
+	}
+	*eof = 1;
+
+	MOD_DEC_USE_COUNT;
+	return len;
+}
+
+static int __init
+iswraid_proc_init(void)
+{
+	iswraid_proc_dir = proc_mkdir("iswraid", NULL);
+	if (iswraid_proc_dir == NULL)
+		return -ENOMEM; /* nothing was created, nothing to undo */
+	if (create_proc_read_entry("arrays", 0, iswraid_proc_dir,
+				   &iswraid_proc_readarrays, NULL) == NULL)
+		goto undo_dir;
+	if (create_proc_read_entry("disks", 0, iswraid_proc_dir,
+				   &iswraid_proc_readdisks, NULL) == NULL)
+		goto undo_arrays;
+	if (create_proc_read_entry("volumes", 0, iswraid_proc_dir,
+				   &iswraid_proc_readvolumes, NULL) == NULL)
+		goto undo_disks;
+	return 0;
+	/* an entry could not be made: remove what was made, newest first */
+ undo_disks:
+	remove_proc_entry("disks", iswraid_proc_dir);
+ undo_arrays:
+	remove_proc_entry("arrays", iswraid_proc_dir);
+ undo_dir:
+	remove_proc_entry("iswraid", NULL);
+	return -ENOMEM;
+}
+
+static void __exit
+iswraid_proc_cleanup(void)
+{
+	remove_proc_entry("volumes", iswraid_proc_dir);
+	remove_proc_entry("disks", iswraid_proc_dir);
+	remove_proc_entry("arrays", iswraid_proc_dir);
+	remove_proc_entry("iswraid", NULL); /* the directory itself, last */
+}
+
+#else
+
+#define iswraid_proc_init() 0 /* no procfs: nothing to set up, "success" */
+#define iswraid_proc_cleanup() /* no procfs: expands to nothing */
+
+#endif /* #ifdef CONFIG_PROC_FS */
+
+/* Find where a placeholder's tag starts (the colon); NULL if it has none.
+ * The colon must not be the very first or the very last char and must be
+ * followed by a decimal number. Reads at most MAX_RAID_SERIAL_LEN bytes.
+ */
+static unsigned char *
+find_tag(unsigned char *pholder)
+{
+	unsigned char *tag, *end = pholder;
+	while (end < pholder + MAX_RAID_SERIAL_LEN && *end) /* bound first */
+		end++;     /* find the end of pholder */
+	tag = --end; /* start from the last character */
+	while (tag > pholder && *tag >= '0' && *tag <= '9')
+		tag--; /* move left over all digits, not reaching first char */
+	if (tag < end && tag > pholder && *tag == ':')
+		return tag; /* colon found not at very start or very end */
+	return NULL; /* this turned out not to be a placeholder */
+}
+
+/* Turn a potentially unterminated serial into a placeholder by adding ":0" */
+static int
+create_placeholder(unsigned char *serial)
+{
+	int space;
+	unsigned char *p, *end = serial;
+	if (find_tag(serial)) /* this is a placeholder already */
+		return 0;
+	while (end < serial + MAX_RAID_SERIAL_LEN && *end) /* bound first */
+		end++;     /* find the end of serial */
+	space = serial + MAX_RAID_SERIAL_LEN - end;
+	if (space >= 2)    /* our tag will fit */
+		strncpy(end, ":0", space);
+	else { /* 0 or 1 bytes of space, need to shift serial left */
+		for (p = serial + 2 - space; p < end; p++)
+			*serial++ = *p;
+		strncpy(serial, ":0", 2); /* now just add tag at the end */
+	}
+	return 1; /* the serial was modified */
+}
+
+/* Find a disk given its serial number or at least a placeholder,
+ * which isn't necessarily terminated. Returns NULL if no disk matches.
+ */
+static struct disk *
+find_disk_by_serial(unsigned char *serial)
+{
+	struct disk *disk;
+	/* the tag position doesn't depend on the disk, so look it up once */
+	unsigned char *colon = find_tag(serial);
+	list_for_each_entry(disk, &disklist, head) {
+		/* first try to match the serial numbers directly */
+		if (!strncmp(disk->serial, serial, MAX_RAID_SERIAL_LEN))
+			return disk;
+		/* now consider that serial could be a placeholder */
+		if (colon /* yes, it really is a placeholder */
+		    && !strncmp(disk->serial, serial, colon - serial))
+			return disk; /* and matches up to the tag */
+	}
+	return NULL;
+}
+
+/* Hand out a bh struct taken from our own cache, bhcache (the code comes
+ * from ataraid.c, changed to use a cache). The already existing bh_cachep
+ * could have served too, had we been very careful not to trash too many
+ * fields of the bh-s obtained from there, but a new driver messing with a
+ * pool of bufferheads that is so widely used and so critical might scare
+ * people. Never returns NULL: it keeps trying until an allocation succeeds.
+ */
+static struct buffer_head *
+get_bhead(void)
+{
+	struct buffer_head *bh;
+	/* on failure give the CPU away for a while and then try again */
+	while (unlikely(!(bh = kmem_cache_alloc(bhcache, SLAB_NOIO)))) {
+		__set_current_state(TASK_RUNNING);
+		yield();
+	}
+	return bh;
+}
+
+/* Hand out a bh_private from privcache (code from ataraid.c, changed to use
+ * a cache). Never returns NULL: yields and retries until it gets one.
+ */
+static struct bh_private *
+get_private(void)
+{
+	struct bh_private *priv;
+	while (unlikely(!(priv = kmem_cache_alloc(privcache, SLAB_NOIO)))) {
+		__set_current_state(TASK_RUNNING);
+		yield();
+	}
+	return priv;
+}
+
+/* Figure out where the MPB data "starts". Returns 0 on failure. Must not be
+ * __init: start_mpb_write() needs it long after initialization is over.
+ */
+static unsigned long
+calc_mpb_blocknum(int major, int minor)
+{
+	struct gendisk *gdisk = get_gendisk(MKDEV(major, minor));
+	if (gdisk) /* MPB's "first" block is 1024 bytes from the disk's end */
+		return gdisk->part[minor].nr_sects - 2;
+	printk(KERN_ERR "iswraid: can't get partitioning info for "
+	       "major %d\n", major);
+	return 0;
+}
+
+/* Generate the checksum of RAID metadata: the sum of its mpb_size/sizeof(u32)
+ * leading 32-bit words with the stored check_sum field taken back out. */
+static u32
+compute_checksum(const u32 *buffer, u32 mpb_size)
+{
+	const struct _raid_mpb *mpb = (const struct _raid_mpb *) buffer;
+	u32 words = mpb_size / sizeof(u32), total = 0;
+	while (words--)
+		total += *buffer++;
+	return total - mpb->check_sum;
+}
+
+static void end_io(struct buffer_head *bh, int uptodate); /* forward decl. */
+
+/* Completion routine used with start_mpb_write(): log result, free the bh */
+static void
+end_mpb_write(struct buffer_head *bh, int uptodate)
+{
+	struct disk *disk = bh->b_private;
+	unsigned int major = MAJOR(disk->dev), minor = MINOR(disk->dev);
+	if (!uptodate)
+		printk(KERN_ERR "iswraid: MPB write to disk major %d minor %d "
+		       "failed\n", major, minor);
+	else
+		printk(KERN_INFO "iswraid: MPB write to disk major %d "
+		       "minor %d completed successfully\n", major, minor);
+	kmem_cache_free(bhcache, bh); /* free the last, end_io frees the 1st */
+}
+
+/* This starts an MPB write to a particular disk. Should not be called w/o a
+ * proper process context, because in theory generic_make_request could sleep
+ * and because we may use non-atomic memory allocation flags in future...
+ */
+static int
+start_mpb_write(struct disk *disk, struct _raid_mpb *mpb)
+{
+	kdev_t dev = disk->dev;
+	unsigned long mpb_blocknum;
+	int mpbblocks;
+	struct buffer_head *bh1, *bh2;
+	struct bh_private *private;
+	DEBUG(DEBUG_MPB, "start_mpb_write, MPB size is %d bytes\n",
+	      mpb->mpb_size);
+
+	/* Find the block number of the "first" block of Intel RAID metadata */
+	if (!(mpb_blocknum = calc_mpb_blocknum(MAJOR(dev), MINOR(dev))))
+		return -EINVAL; /* the MPB location could not be found */
+
+	/* FIXME? would SLAB_NOIO be more appropriate? scheduler's queue... */
+	/* FIXME? static bh-s would also do; 2 per disk is all we'd need */
+	if (!(bh1 = kmem_cache_alloc(bhcache, SLAB_ATOMIC)))
+		return -ENOMEM;
+
+	/* bh1 carries the first ISW_DISK_BLOCK_SIZE bytes of the MPB */
+	memset(bh1, 0, sizeof(*bh1)); /* most fields will be NULL */
+	/* don't init the wait queue, nobody will be waiting on this */
+	bh1->b_rsector = mpb_blocknum;
+	bh1->b_rdev = dev;
+	bh1->b_size = ISW_DISK_BLOCK_SIZE;
+	bh1->b_data = (char *) mpb;
+	bh1->b_page = virt_to_page(mpb); /* safe for logical addresses */
+	bh1->b_end_io = &end_io; /* our usual completion routine */
+	set_bit(BH_Mapped, &bh1->b_state); /* checked in __make_request */
+	set_bit(BH_Sync, &bh1->b_state);
+	set_bit(BH_Lock, &bh1->b_state);
+
+	/* FIXME? would SLAB_NOIO be more appropriate? scheduler's queue... */
+	/* FIXME? keeping a static bh_private for each disk would do, too */
+	if (!(private = kmem_cache_alloc(privcache, SLAB_ATOMIC))) {
+		kmem_cache_free(bhcache, bh1);
+		return -ENOMEM;
+	}
+
+	/* no parent and no volume: end_io() ends up in end_mpb_write() */
+	bh1->b_private = private;
+	private->parent = NULL;
+	private->volume = NULL;
+	private->rw = WRITE;    /* irrelevant here */
+	private->status = 0;
+	private->old_endiofn = &end_mpb_write;
+	private->old_private = disk;
+
+	mpbblocks = ((mpb->mpb_size + ISW_DISK_BLOCK_SIZE - 1)
+		     / ISW_DISK_BLOCK_SIZE); /* size in blocks, rounded up */
+	if (mpbblocks <= 1) /* only == 1 possible, really */
+		atomic_set(&private->count, 1);
+	else {
+		atomic_set(&private->count, 2); /* bh1 and bh2 outstanding */
+		/* FIXME? would SLAB_NOIO be better? or a static bh? */
+		if (!(bh2 = kmem_cache_alloc(bhcache, SLAB_ATOMIC))) {
+			kmem_cache_free(bhcache, bh1);
+			kmem_cache_free(privcache, private);
+			return -ENOMEM;
+		}
+		/* bh2 carries the rest, put in the sectors before the first */
+		memcpy(bh2, bh1, sizeof(*bh1)); /* most fields like in bh1 */
+		bh2->b_rsector = mpb_blocknum - (mpbblocks - 1);
+		bh2->b_size = ISW_DISK_BLOCK_SIZE * (mpbblocks - 1);
+		bh2->b_data = (char *) mpb + ISW_DISK_BLOCK_SIZE;
+		bh2->b_page = virt_to_page(bh2->b_data); /* safe for logical */
+		get_bh(bh2); /* increment the usage count; we never put_bh() */
+		generic_make_request(WRITE, bh2);
+	}
+
+	get_bh(bh1); /* increment the usage count; we never do put_bh() */
+	generic_make_request(WRITE, bh1);
+	DEBUG(DEBUG_MPB, "start_mpb_write exiting for major %d minor %d\n",
+	      MAJOR(dev), MINOR(dev));
+	return 0; /* submitted; the outcome is logged by end_mpb_write() */
+}
+
+/* Start MPB writes to all disks in array. Run from the scheduler's queue. */
+static void
+start_mpb_writes(void *arg)
+{
+	struct array *array = (struct array *) arg;
+	int i, err; /* err: -ENOMEM or -EINVAL from start_mpb_write() */
+	for (i = 0; i < array->mpb->num_disks; i++) {
+		struct disk *disk = array->disks[i]; /* could be NULL */
+		if (disk && (err = start_mpb_write(disk, array->mpb)))
+			printk(KERN_ERR "iswraid: Could not write the MPB "
+			       "for major %d minor %d: error %d\n",
+			       MAJOR(disk->dev), MINOR(disk->dev), err);
+	}
+}
+
+/* Return a pointer to the next raiddev in MPB, given the current one */
+static struct _mpb_raid_dev *
+advance_raiddev(struct _mpb_raid_dev *raiddev)
+{
+	int k; /* map size correction, in bytes: one u32 per member but one */
+	struct _mpb_raid_vol *vol = &raiddev->raid_vol;
+	struct _mpb_raid_map *map = &vol->lo_map;
+	k = (map->num_members - 1) * sizeof(u32);
+	raiddev++; /* off by k bytes for sure */
+	raiddev = (struct _mpb_raid_dev *) ((u8 *) raiddev + k);
+	if (vol->migr_state) { /* need to add space for another map */
+		map = (struct _mpb_raid_map *) raiddev; /* it starts here */
+		k = (map->num_members - 1) * sizeof(u32); /* correction */
+		raiddev = (struct _mpb_raid_dev *) ((u8 *) raiddev
+						    + sizeof(*map) + k);
+	}
+	return raiddev;
+}
+
+/* Update array's MPB, schedule an MPB write for each disk. May not sleep */
+static void
+update_mpb(struct array *array)
+{
+	int i, j;
+	struct _raid_mpb *mpb = array->mpb;
+	struct _mpb_raid_dev *raiddev /* first one follows the disk table */
+		= (struct _mpb_raid_dev *) &mpb->disk_tbl[mpb->num_disks];
+
+	/* easy update of the status field for each of array's disks */
+	for (i = 0; i < mpb->num_disks; i++) {
+		struct _mpb_disk *mpbdisk = &mpb->disk_tbl[i];
+		struct disk *disk = array->disks[i]; /* could be NULL */
+		if (disk)
+			mpbdisk->status = disk->status;
+	}
+
+	/* volume degraded or failed update, plus disk degraded updates */
+	for (i = 0; i < mpb->num_raid_devs; i++) {
+		struct _mpb_raid_vol *vol = &raiddev->raid_vol;
+		struct _mpb_raid_map *map = &vol->lo_map;
+		struct volume *volume = array->volumes[i];
+		if (test_bit(FAILED_BIT, &volume->state))
+			map->map_state = FAILED_MAP;
+		else if (test_bit(DEGRADED_BIT, &volume->state))
+			map->map_state = DEGRADED_MAP;
+		for (j = 0; j < volume->numdisks; j++) {
+			if ((volume->disks[j]
+			     && test_bit(FAILED_BIT,
+					 &volume->disks[j]->status))
+			    || test_bit(j, &volume->degradedbits))
+				map->disk_ord_tbl[j] |= (1 << 24);
+		}
+		raiddev = advance_raiddev(raiddev); /* variable-size entries */
+	}
+
+	mpb->generation_num++; /* increment the generation number */
+	/* update the checksum; must come after all the changes above */
+	mpb->check_sum = compute_checksum((const u32 *) mpb, mpb->mpb_size);
+
+	/* Now we try to schedule the writes of this MPB to all of its disks,
+	 * and not call start_mpb_writes() directly, because in theory
+	 * make_request functions could sleep.
+	 * If we fail, that means that the task is still on the queue and
+	 * is not yet being processed. I.e., we've managed to make two or more
+	 * updates to the same MPB before having time to save it. That's OK.
+	 * It is also possible that we are changing the MPB at the same
+	 * time that it is being written to the disk. We don't care about
+	 * this either, because then sync is off and there are no impediments
+	 * for scheduling one more write. And that write will get it right.
+	 */
+	schedule_task(&array->task);
+}
+
+/* Tells whether the mirrors of disk k in a RAID1E volume are OK: returns 1
+ * if none of the disks looked at has its bit set in degradedbits, else 0.
+ */
+static int
+check_r1e_mirrors(struct volume *vol, int k)
+{
+	if (!(vol->numdisks & 0x1)) /* even numdisks: look at disk k ^ 1 */
+		return !test_bit(k ^ 0x1, &vol->degradedbits);
+	/* odd numdisks, check both neighbors */
+	return (!test_bit((k + 1) % vol->numdisks, &vol->degradedbits)
+		&& !test_bit(((k - 1 + vol->numdisks) % vol->numdisks),
+			     &vol->degradedbits));
+}
+
+/* "Notifies" about disk failure every enabled volume containing this disk.
+ * All such volumes in theory should set the degraded bit for the disk, the
+ * disk itself should become failed, and the RAID1 or higher level volumes
+ * should become degraded or failed. The exception is iswraid_resist_failing:
+ * if it says that RAID1E volumes should rather not be failed and no volume
+ * strictly imposes failing the disk, iswraid obeys. Returns 1 if the disk was
+ * marked failed (the MPB then needs updating), otherwise 0. Only to be used
+ * when fully operational, not during module initialization.
+ */
+static int
+notify_volumes(struct disk *disk)
+{
+	int j, k, fail = 1; /* mild indication to fail the disk */
+	BUG_ON(!disk);
+	/* scan each volume in the array containing this disk... */
+	for (j = 0; j < disk->array->mpb->num_raid_devs; j++) {
+		struct volume *vol = disk->array->volumes[j];
+		/* ignore RAID0 and disabled volumes in this first scan */
+		if (vol->raidlevel < 1 || test_bit(DISABLED_BIT, &vol->state))
+			continue;
+
+		/* see whether the disk is in use by the volume's map */
+		for (k = 0; k < vol->numdisks; k++)
+			if (vol->disks[k] == disk)
+				break;
+		/* if disk not in use or known to be bad, check next volume */
+		if (k >= vol->numdisks || test_bit(k, &vol->degradedbits))
+			continue;
+
+		if (test_bit(DEGRADED_BIT, &vol->state)) {
+			/* RAID1E can have recoverable multi-disk failures */
+			if (vol->raidlevel == 1 && check_r1e_mirrors(vol, k)) {
+				printk(KERN_WARNING "iswraid: Volume '%s' "
+					"(d%d) staying degraded\n",
+					vol->serial, vol->devbit);
+				fail = 2; /* strong indication to fail disk */
+			} else if (!iswraid_resist_failing
+				   || vol->raidlevel > 1) {
+				set_bit(FAILED_BIT, &vol->state);
+				printk(KERN_WARNING "iswraid: Volume '%s' "
+				       "(d%d) is now failed\n",
+				       vol->serial, vol->devbit);
+				fail = 2; /* strong indication to fail disk */
+			} else {
+				printk(KERN_WARNING "iswraid: Volume '%s' "
+				       "(d%d) should be failed but perhaps "
+				       "will not be due to enabled "
+				       "iswraid_resist_failing\n",
+				       vol->serial, vol->devbit);
+				if (fail < 2) /* if no strong ind. to fail */
+					fail = 0; /* suggest not failing */
+			}
+		} else {
+			set_bit(DEGRADED_BIT, &vol->state);
+			printk(KERN_WARNING "iswraid: Volume '%s' (d%d) "
+			       "is now degraded\n", vol->serial, vol->devbit);
+			fail = 2; /* strong indication to fail the disk */
+		}
+	}
+
+	if (!fail) /* don't have to mark the disk as failed */
+		return 0; /* no need to update the MPB */
+	
+	set_bit(FAILED_BIT, &disk->status);
+	/* a second pass, this time over all the volumes, RAID0 too */
+	for (j = 0; j < disk->array->mpb->num_raid_devs; j++) {
+		struct volume *vol = disk->array->volumes[j];
+
+		/* see whether the disk is in use by the volume's map */
+		for (k = 0; k < vol->numdisks; k++)
+			if (vol->disks[k] == disk)
+				break;
+		/* if disk not in use or known to be bad, go to the next vol */
+		if (k >= vol->numdisks || test_bit(k, &vol->degradedbits))
+			continue;
+		set_bit(k, &vol->degradedbits);
+
+		/* now just fail any RAID1E volumes that were spared above */
+		if (iswraid_resist_failing && vol->raidlevel == 1
+		    && !test_bit(FAILED_BIT, &vol->state)
+		    && test_bit(DEGRADED_BIT, &vol->state)
+		    && !check_r1e_mirrors(vol, k)) {
+			set_bit(FAILED_BIT, &vol->state);
+			printk(KERN_WARNING "iswraid: Volume '%s' (d%d) is "
+			       "now failed despite iswraid_resist_failing\n",
+			       vol->serial, vol->devbit);
+		}
+	}
+	
+	return 1; /* need to update the MPB */
+}
+
+/* This makes a quick note of a disk error (but doesn't update errorcount yet).
+ * Fails a disk and initiates MPB writes to its array if a RAID1 volume write
+ * to the disk fails. Resubmits failed RAID1 reads to a mirror. May not sleep.
+ */
+static int
+handle_io_error(struct buffer_head *bh)
+{
+	struct disk *disk, *mirror;
+	int i, mirrornum;
+	struct bh_private *private = bh->b_private;
+	struct volume *volume = private->volume;
+	if (!volume) { /* e.g. an MPB write, see start_mpb_write() */
+		printk(KERN_ERR "iswraid: Non-volume related IO error on "
+		       "disk major %d minor %d\n",
+		       MAJOR(bh->b_rdev), MINOR(bh->b_rdev));
+		set_bit(0, &private->status); /* indicate error with any bit */
+		return 0;
+	}
+
+	/* look for the disk number in this volume that got error */
+	for (i = 0; i < volume->numdisks; i++)
+		if (volume->disks[i] && volume->disks[i]->dev == bh->b_rdev)
+			break;
+	BUG_ON(i >= volume->numdisks); /* disk not found, impossible */
+	set_bit(i, &private->status); /* just mark that disk had a problem */
+	printk(KERN_ERR "iswraid: %s IO error on disk major %d minor %d "
+	       "sector %lu, volume '%s' (d%d), RAID level %d\n",
+	       private->rw == WRITE ? "Write" : "Read", MAJOR(bh->b_rdev),
+	       MINOR(bh->b_rdev), bh->b_rsector,
+	       volume->serial, volume->devbit, volume->raidlevel);
+
+	if (volume->raidlevel == 0)
+		return 0; /* nothing else to do at the moment */
+
+	if (volume->raidlevel > 1)
+		return 0; /* high RAID levels not yet supported */
+
+	if (private->rw == WRITE) { /* RAID1/RAID10/RAID1E write */
+		disk = volume->disks[i]; /* not NULL, BTW */
+		if (!test_bit(i, &volume->degradedbits)) { /* if not bad yet */
+			DEBUG(DEBUG_ERROR, "notifying volumes about disk "
+			      "going bad, private->status = 0x%08lx\n",
+			      private->status);
+			/* Degrade, fail volumes, probably mark disk failed.
+			 * It is critical to have our disk marked bad before
+			 * checking for mirror goodness and clearing the
+			 * error (in hopes that the mirror write will succeed)
+			 * because otherwise we could fail to notice a double
+			 * disk failure (assuming that each one is handled by a
+			 * different error-handler thread and that both can run
+			 * at the same time, which may not even be possible).
+			 */
+			if (notify_volumes(disk))
+				update_mpb(volume->array);
+		}
+
+		if (check_r1e_mirrors(volume, i)) { /* if mirror(s) are OK */
+			clear_bit(i, &private->status); /* clear the error */
+			/* but remember that it happened, just in case */
+			set_bit(i + MAX_RAID_MEMBER_DISKS, &private->status);
+			DEBUG(DEBUG_ERROR, "letting the WRITE success be "
+			      "decided by the mirror, private->status = "
+			      "0x%08lx\n", private->status);
+		}
+
+		return 0; /* writes are never resubmitted from here */
+	}
+
+	/* RAID1/RAID10/RAID1E read */
+	if (test_bit(BH_Mirror, &bh->b_state) /* a mirror read */
+	    || volume->numdisks == 1) /* or there is no mirror */
+		return 0; /* then we've done all we can, bail */
+
+	/* try to find a suitable mirror for this IO */
+	if (!(volume->numdisks & 0x01)) /* even number of disks */
+		mirrornum = i ^ 0x01; /* just move 1 up or down */
+	else {  /* odd number of disks, not possible with ICHxR, not tested */
+		int rsect = bh->b_rsector - volume->pba_of_lba0;
+		int strip_in_disk = rsect / volume->blocks_per_strip;
+		if ((i ^ strip_in_disk) & 0x01) /* even disk, odd strip
+						   or vice versa */
+			mirrornum = i - 1; /* mirror behind */
+		else         /* even disk, even strip or odd disk, odd strip */
+			mirrornum = i + 1; /* mirror ahead */
+		if (mirrornum >= volume->numdisks) { /* forward wraparound */
+			mirrornum = 0;
+			bh->b_rsector += volume->blocks_per_strip;
+		} else if (mirrornum < 0) { /* backward wraparound */
+			mirrornum = volume->numdisks - 1;
+			bh->b_rsector -= volume->blocks_per_strip;
+		}
+	}
+	mirror = volume->disks[mirrornum]; /* could be NULL */
+	
+	if (mirror && !test_bit(mirrornum, &volume->degradedbits)) { /* OK? */
+		bh->b_rdev = mirror->dev;
+		set_bit(BH_Mirror, &bh->b_state); /* a mirror read */
+		set_bit(BH_Sync, &bh->b_state); /* so make it fast */
+		/* clear disk error bit to avoid spoiling overall success */
+		clear_bit(i, &private->status);
+		/* but remember that this error happened, anyway */
+		set_bit(i + MAX_RAID_MEMBER_DISKS, &private->status);
+		DEBUG(DEBUG_ERROR, "resubmitting RAID1 read IO to a mirror: "
+		      "new rsector = %lu, major %d minor %d\n", bh->b_rsector,
+		      MAJOR(bh->b_rdev), MINOR(bh->b_rdev));
+		generic_make_request(READ, bh);  /* submit the new IO */
+		return 1; /* let the caller know that there is still hope */
+	}
+
+	return 0; /* no usable mirror, the error bit stays set */
+}
+
+/* This is called when the original, "big" IO completes. It increments the
+ * errorcounts for each disk that is marked as having had a problem with one
+ * or more components of the big IO. Also clears the bits that remember
+ * problems before an IO was reissued to a mirror---this way, if the overall
+ * IO succeeded, the following code will see it as successful. Fails (and
+ * logs, by their own dev) those disks whose errorcount reached the threshold
+ * and starts the MPB writes (all in the same array). May not sleep.
+ */
+static void
+update_error_counts(struct buffer_head *bh)
+{
+	int i, mpbchanged = 0;
+	struct bh_private *private = bh->b_private;
+	struct volume *volume = private->volume;
+	if (!volume) /* ignore non-volume related IOs */
+		return;
+
+	for (i = 0; i < volume->numdisks; i++) {
+		struct disk *disk = volume->disks[i]; /* could be NULL */
+		/* test normal and pre-mirror bits for each disk */
+		if (test_bit(i, &private->status)
+		    || test_bit(i + MAX_RAID_MEMBER_DISKS, &private->status)) {
+			BUG_ON(!disk); /* can't get errors for missing disks */
+			atomic_inc(&disk->errorcount);
+			clear_bit(BH_Mirror, &bh->b_state); /* keep it clean */
+			if (iswraid_error_threshold /* threshold in use */
+			    && (atomic_read(&disk->errorcount)
+				>= iswraid_error_threshold) /* and reached */
+			    && !test_bit(FAILED_BIT, &disk->status)) {
+				printk(KERN_ERR "iswraid: Disk major %d minor "
+				       "%d of volume '%s' (d%d) has reached "
+				       "the error threshold %d\n",
+				       MAJOR(disk->dev), MINOR(disk->dev),
+				       volume->serial, volume->devbit,
+				       iswraid_error_threshold);
+				/* degrade, fail volumes, possibly mark disk */
+				if (notify_volumes(disk))
+					mpbchanged = 1; /* need to write MPB */
+			}
+		}
+		/* hide an error that may have happened before a mirror read */
+		clear_bit(i + MAX_RAID_MEMBER_DISKS, &private->status);
+	}
+
+	DEBUG(DEBUG_ERROR, "error counts updated, private->status = 0x%08lx\n",
+	      private->status);
+	if (mpbchanged)
+		update_mpb(volume->array);
+}
+
+/* The normal IO completion function, used to keep track of IO success. */
+/* Called from a softirq context, so must not sleep. */
+static void
+end_io(struct buffer_head *bh, int uptodate)
+{
+	struct bh_private *private = bh->b_private;
+	BUG_ON(!private); /* it should not have come to us if !private */
+
+	if (unlikely(!uptodate)) { /* this IO failed */
+		if (handle_io_error(bh))  /* if failed IO got resubmitted */
+			return;           /* then we must bail early here */
+	}
+
+	if (atomic_dec_and_test(&private->count)) { /* if last or only IO */
+		if (unlikely(private->status)) /* if any disks had errors */
+			update_error_counts(bh);
+		if (private->parent) /* if we have a master bh linked */
+			private->parent->b_end_io(private->parent,
+						  private->status ? 0 : 1);
+		else {  /* if we had just changed a few fields in bh    */
+			/* (or in case this is the (very rare) MPB write) */
+			bh->b_private = private->old_private;
+			bh->b_end_io = private->old_endiofn;
+			private->old_endiofn(bh, private->status ? 0 : 1);
+			kmem_cache_free(privcache, private);
+			/* don't free bh because we didn't alloc it     */
+			/* (for MPB writes, the end_mpb_write frees it) */
+			return;
+		}
+		kmem_cache_free(privcache, private); /* nobody needs it now */
+	}
+	kmem_cache_free(bhcache, bh); /* this bh came from our bhcache */
+}
+
+/* Our copy of end_buffer_io_sync, which fs/buffer.c does not export */
+static void
+end_io_sync(struct buffer_head *bh, int uptodate)
+{
+	mark_buffer_uptodate(bh, uptodate); /* record the IO's outcome in bh */
+	unlock_buffer(bh);
+	put_bh(bh);
+}
+
+/* Set up, but not submit, an IO going to only 1 disk of a RAID0 volume */
+static inline void
+build_singlestrip_r0io(struct buffer_head *bh, struct volume *volume,
+		       struct bh_private *private, unsigned long disk,
+		       unsigned long strip_in_disk,
+		       unsigned long block_in_strip)
+{
+	unsigned long drsect = (strip_in_disk * volume->blocks_per_strip
+				+ block_in_strip);
+	DEBUG(DEBUG_R0S, "block_in_strip = %lu, disk = %lu,"
+	      " strip_in_disk = %lu, drsect (without pba_of_lba0 offset)"
+	      " = %lu, size = %u, rw = %d\n", block_in_strip,
+	      disk, strip_in_disk, drsect, bh->b_size, private->rw);
+
+	bh->b_rdev = volume->disks[disk]->dev;
+	bh->b_rsector = drsect + volume->pba_of_lba0;
+	DEBUG(DEBUG_R0S, "final (only) b_rsector = %lu\n", bh->b_rsector);
+
+	/* we carefully save the fields we'll muck up; end_io restores them */
+	private->old_private = bh->b_private;
+	private->old_endiofn = bh->b_end_io;
+	private->parent = NULL; /* clear parent; the rest was set in caller */
+	bh->b_private = private;
+	bh->b_end_io = &end_io;
+
+	/* update the last known head position for the drive */
+	spin_lock(&volume->disks[disk]->lock);
+	volume->disks[disk]->last_pos = bh->b_rsector + (bh->b_size >> 9);
+	spin_unlock(&volume->disks[disk]->lock);
+}
+
+/* Build and submit a large number of IOs going to disks comprising a
+ * RAID0 volume; all these IOs point back to the same struct bh_private.
+ */
+static inline void
+build_multistrip_r0io(struct buffer_head *bh, struct volume *volume,
+		      struct bh_private *private, unsigned long disk,
+		      unsigned long strip_in_disk,
+		      unsigned long block_in_strip, unsigned long strip,
+		      unsigned long rsect, unsigned long nextsect)
+{
+	/* Last and first strips are different from the rest */
+	unsigned long drsect = (strip_in_disk * volume->blocks_per_strip
+				+ block_in_strip);
+	unsigned long size = ((volume->blocks_per_strip - block_in_strip)
+			      << 9); /* bytes left in the first strip */
+	char *bufferposition = bh->b_data;
+	private->parent = bh; /* don't need old_endiofn and old_private */
+	/* volume, count, rw and status were already set in caller */
+
+	while (rsect < nextsect) { /* one new bh per strip touched */
+		struct buffer_head *bh1 = get_bhead(); /* never NULL */
+		/* dupe the bufferhead and update what's different */
+		memcpy(bh1, bh, sizeof(*bh));
+		bh1->b_end_io = &end_io; /* return to us */
+		bh1->b_private = private;
+
+		/* adjust the size for the last strip */
+		if (nextsect - rsect < (size >> 9))
+			size = (nextsect - rsect) << 9;
+
+		DEBUG(DEBUG_R0M, "strip = %lu, block_in_strip = %lu, "
+		      "disk = %lu, strip_in_disk = %lu, drsect "
+		      "(without pba_of_lba0 offset) = %lu, size = %lu, "
+		      "rw = %d\n", strip, block_in_strip, disk,
+		      strip_in_disk, drsect, size, private->rw);
+
+		bh1->b_rdev = volume->disks[disk]->dev;
+		bh1->b_rsector = drsect + volume->pba_of_lba0;
+		bh1->b_size = size;
+		bh1->b_data = bufferposition;
+		DEBUG(DEBUG_R0M, "final (many) b_rsector = %lu\n",
+		      bh1->b_rsector);
+
+		/* update the last known head position for the drive */
+		spin_lock(&volume->disks[disk]->lock);
+		volume->disks[disk]->last_pos
+			= bh1->b_rsector + (bh1->b_size >> 9);
+		spin_unlock(&volume->disks[disk]->lock);
+
+		/* submit and update our variables for next strip */
+		generic_make_request(private->rw, bh1);
+		strip++; /* don't really need this, except for debug */
+		bufferposition += size;
+		drsect -= block_in_strip; /* needed on first strip */
+		block_in_strip = 0; /* all after first start at 0 */
+		rsect += (size >> 9);
+		size = volume->blocks_per_strip << 9; /* full strip */
+		if (++disk >= volume->numdisks) { /* start next stripe */
+			disk = 0;
+			strip_in_disk++;
+			drsect += volume->blocks_per_strip;
+		}
+	}
+}
+
+/* make_request_fn for RAID0 volumes */
+static int
+iswraid0_make_request(request_queue_t *q, int rw, struct buffer_head *bh)
+{
+	unsigned long strip, strip_in_disk, disk, block_in_strip, numstrips;
+	int minor = MINOR(bh->b_rdev);
+	int devbit = minor >> SHIFT;
+	unsigned long nextsect, rsect = bh->b_rsector; /* nextsect: end + 1 */
+	struct volume *volume = raid[devbit];
+	struct bh_private *private;
+
+	DEBUG(DEBUG_R0, "iswraid0_make_request, minor = %d\n", minor);
+
+	/* add partition offset because we work with the whole volume */
+	rsect += ataraid_gendisk.part[minor].start_sect;
+	/* nextsect > rsect unless b_size <= 0, which it shouldn't be */
+	nextsect = rsect + (bh->b_size >> 9);
+	DEBUG(DEBUG_R0, "rsect with partition offset = %lu, nextsect = %lu\n",
+	      rsect, nextsect);
+
+	if (nextsect > volume->sectors) { /* request beyond volume end */
+		printk(KERN_ERR "iswraid: Request beyond end of volume, "
+		       "minor = %d, startsect = %lu, endsect = %lu\n",
+		       minor, rsect, nextsect - 1);
+		bh->b_end_io(bh, 0); /* fail it immediately */
+		return 0;
+	}
+
+	/* time for the kernel to provide the div function... */
+	/* it's a good thing we checked blocks_per_strip in detect_volumes */
+	block_in_strip = rsect % volume->blocks_per_strip;
+	strip = rsect / volume->blocks_per_strip;
+	/* it's a good thing we checked numdisks in detect_volumes */
+	disk = strip % volume->numdisks;
+	strip_in_disk = strip / volume->numdisks;
+
+	/* count the strips from the start of the first one touched */
+	numstrips = ((nextsect - (rsect - block_in_strip) - 1)
+		     / volume->blocks_per_strip) + 1; /* strips to play with */
+	DEBUG(DEBUG_R0, "numstrips = %lu\n", numstrips);
+
+	private = get_private(); /* need this to track bh */
+	private->status = 0;
+	atomic_set(&private->count, numstrips); /* 1 or more */
+	private->rw = rw;
+	private->volume = volume;
+
+	if (numstrips <= 1) { /* optimizable case, numstrips == 1, actually */
+		build_singlestrip_r0io(bh, volume, private, disk,
+				       strip_in_disk, block_in_strip);
+		return 1; /* force the upper level to resubmit this IO */
+	} else { /* The complicated case where we work with numerous strips.
+		  * Obviously, we could call ataraid's split function,
+		  * but it simply splits in halves with no regard for
+		  * natural strip boundaries. Furthermore, we want to be
+		  * in charge of the complete IO success/failure determination,
+		  * and ataraid does not let us do so.
+		  */
+		build_multistrip_r0io(bh, volume, private, disk,
+				      strip_in_disk, block_in_strip,
+				      strip, rsect, nextsect);
+	}
+	return 0; /* We've submitted all IOs ourselves */
+}
+
+/* Build and submit a RAID1E write request: every strip in [rsect, nextsect)
+ * is cloned for both disks of its mirror pair, skipping degraded disks.
+ * Nothing is submitted until all clones are built; if some strip has no
+ * usable disk, the clones and 'private' are freed and the whole IO fails.
+ */
+static void
+raid1e_write_request(request_queue_t *q, struct buffer_head *bh,
+		     struct bh_private *private, struct volume *volume,
+		     unsigned long rsect, unsigned long nextsect)
+{
+	/* it's good we checked blocks_per_strip in detect_volumes */
+	unsigned long block_in_strip = rsect % volume->blocks_per_strip;
+	unsigned long strip = rsect / volume->blocks_per_strip;
+	unsigned long realstrip = strip << 1; /* considering mirroring */
+	/* it's a good thing we checked numdisks in detect_volumes */
+	int disk = realstrip % volume->numdisks;
+	unsigned long strip_in_disk = realstrip / volume->numdisks;
+	/* Last and first strips are different from the rest */
+	unsigned long drsect = (strip_in_disk * volume->blocks_per_strip
+				+ block_in_strip);
+	unsigned long size = (volume->blocks_per_strip - block_in_strip) << 9;
+	char *bufferposition = bh->b_data;
+	struct buffer_head sentinel; /* we prefer keeping bh-s in order */
+	struct buffer_head *bhlist = &sentinel; /* typically points to end */
+	int i, count = 0;
+	sentinel.b_data = (char *) 0xffffffff; /* needed for a check below */
+	DEBUG(DEBUG_R1EW, "raid1e_write_request, raiddev = %d, rsect = %lu, "
+	      "nextsect = %lu\n", volume->devbit, rsect, nextsect);
+	while (rsect < nextsect) {
+		for (i = 0; i < 2; i++) {
+			if (test_bit(disk, &volume->degradedbits)) {
+				DEBUG(DEBUG_R1EW, "skipping%s disk %d in "
+				      "volume %s, disk state = 0x%08x, "
+				      "volume degraded bits = 0x%08x\n",
+				      volume->disks[disk] ? "" : " missing",
+				      disk, volume->serial,
+				      (volume->disks[disk]
+				       ? volume->disks[disk]->status : 0),
+				      volume->degradedbits);
+				if (++disk >= volume->numdisks) {
+					disk = 0;
+					strip_in_disk++;
+					drsect += volume->blocks_per_strip;
+				}
+				continue;
+			}
+			struct buffer_head *bh1 = get_bhead();
+			/* dupe the bufferhead and update what's different */
+			memcpy(bh1, bh, sizeof(*bh));
+			bh1->b_end_io = &end_io; /* return to us */
+			bh1->b_private = private;
+
+			/* adjust the size for the last strip */
+			if (nextsect - rsect < (size >> 9))
+				size = (nextsect - rsect) << 9;
+
+			DEBUG(DEBUG_R1EW, "strip = %lu, block_in_strip = %lu, "
+			      "disk = %d, strip_in_disk = %lu, drsect (w/o "
+			      "pba_of_lba0 offset) = %lu, size = %lu, "
+			      "WRITE\n", strip, block_in_strip, disk,
+			      strip_in_disk, drsect, size);
+
+			bh1->b_rdev = volume->disks[disk]->dev;
+			bh1->b_rsector = drsect + volume->pba_of_lba0;
+			bh1->b_size = size;
+			bh1->b_data = bufferposition;
+
+			/* update the last known head position for the drive */
+			spin_lock(&volume->disks[disk]->lock);
+			volume->disks[disk]->last_pos = (bh1->b_rsector
+							 + (bh1->b_size >> 9));
+			spin_unlock(&volume->disks[disk]->lock);
+
+			bhlist->b_next = bh1; /* put it on the list */
+			bhlist = bh1;
+			count++;
+			if (++disk >= volume->numdisks) { /* next stripe */
+				disk = 0;
+				strip_in_disk++;
+				drsect += volume->blocks_per_strip;
+			}
+		}
+		/* list tail unchanged: neither mirror could take this strip */
+		if (bhlist->b_data != bufferposition) {
+			printk(KERN_ERR "iswraid: No nonfailed disks were "
+			       "found for volume w/ devnum %d, failing IO\n",
+			       volume->devbit);
+			bhlist->b_next = NULL; /* mark the end of the list */
+			for (bhlist = sentinel.b_next; bhlist; ) {
+				struct buffer_head *bh1 = bhlist;
+				bhlist = bhlist->b_next;
+				kmem_cache_free(bhcache, bh1); /* free all */
+			}
+			kmem_cache_free(privcache, private); /* unused now */
+			bh->b_end_io(bh, 0); /* fail the whole IO */
+			return;              /* and bail */
+		}
+		strip++; /* don't really need this, except for debug */
+		bufferposition += size;
+		drsect -= block_in_strip; /* needed on first strip */
+		block_in_strip = 0; /* all after first start at 0 */
+		rsect += (size >> 9);
+		size = volume->blocks_per_strip << 9; /* full strip */
+	}
+
+	atomic_set(&private->count, count);
+	bhlist->b_next = NULL; /* mark the end of the list */
+	for (bhlist = sentinel.b_next; bhlist; ) {
+		struct buffer_head *bh1 = bhlist;
+		bhlist = bhlist->b_next;
+		generic_make_request(WRITE, bh1);
+	}
+}
+
+/* Build and submit a RAID1 write request. Two-disk volumes are served here
+ * by cloning bh for each nondegraded disk; bigger ones are RAID1E.
+ */
+static int
+raid1_write_request(request_queue_t *q, struct buffer_head *bh,
+		    unsigned long rsect, unsigned long nextsect)
+{
+	int minor = MINOR(bh->b_rdev);
+	struct volume *volume = raid[minor >> SHIFT];
+	struct bh_private *private;
+	struct buffer_head *clones[2]; /* plain RAID1 needs 2 at most */
+	int i, nclones = 0; /* nclones counts the IOs we build */
+	DEBUG(DEBUG_R1W, "raid1_write_request, minor = %d\n", minor);
+
+	private = get_private(); /* for tracking the original bh */
+	private->status = 0;
+	private->parent = bh; /* don't need old_endiofn and old_private */
+	private->rw = WRITE;
+	private->volume = volume;
+
+	if (volume->numdisks > 2) { /* RAID1E, same as RAID10 if even */
+		raid1e_write_request(q, bh, private, volume, rsect, nextsect);
+		return 0;
+	}
+
+	for (i = 0; i < volume->numdisks; i++) { /* normal RAID1 */
+		struct buffer_head *clone;
+		if (test_bit(i, &volume->degradedbits)) {
+			DEBUG(DEBUG_R1W, "skipping%s disk %d in "
+			      "volume %s, disk state = 0x%08x, "
+			      "volume degraded bits = 0x%08x\n",
+			      volume->disks[i] ? "" : " missing",
+			      i, volume->serial,
+			      (volume->disks[i]
+			       ? volume->disks[i]->status : 0),
+			      volume->degradedbits);
+			continue;
+		}
+		clone = get_bhead();
+		/* start from a copy of bh and redirect it to disk i */
+		memcpy(clone, bh, sizeof(*bh));
+		clone->b_end_io = &end_io; /* completion comes back to us */
+		clone->b_private = private;
+		clone->b_rsector = rsect + volume->pba_of_lba0;
+		clone->b_rdev = volume->disks[i]->dev;
+		/* remember where this drive's head is going to end up */
+		spin_lock(&volume->disks[i]->lock);
+		volume->disks[i]->last_pos = nextsect;
+		spin_unlock(&volume->disks[i]->lock);
+		clones[nclones++] = clone;
+	}
+
+	if (nclones == 0) { /* every disk was skipped */
+		printk(KERN_ERR "iswraid: No nonfailed disks were "
+		       "found for volume minor %d, failing IO\n",
+		       minor);
+		kmem_cache_free(privcache, private);
+		bh->b_end_io(bh, 0);
+		return 0;
+	}
+
+	atomic_set(&private->count, nclones); /* normally 2 */
+	for (i = 0; i < nclones; i++) /* dispatch them back to back */
+		generic_make_request(WRITE, clones[i]);
+	return 0; /* We've submitted all IOs ourselves */
+}
+
+#define HUGE_MOVE_BLOCKS 4096
+
+/* Build and submit a RAID1E read request: one clone of bh per strip, sent to
+ * the nondegraded mirror with the closest last known head position.
+ */
+static void
+raid1e_read_request(request_queue_t *q,
+		    struct buffer_head *bh, struct volume *volume,
+		    unsigned long rsect, unsigned long nextsect)
+{
+	struct bh_private *private = get_private(); /* for tracking bh */
+	/* it's good we checked blocks_per_strip in detect_volumes */
+	unsigned long block_in_strip = rsect % volume->blocks_per_strip;
+	unsigned long strip = rsect / volume->blocks_per_strip;
+	unsigned long realstrip = strip << 1; /* considering mirroring */
+	/* it's a good thing we checked numdisks in detect_volumes */
+	int disk = realstrip % volume->numdisks;
+	unsigned long strip_in_disk = realstrip / volume->numdisks;
+	/* Last and first strips are different from the rest */
+	unsigned long drsect = (strip_in_disk * volume->blocks_per_strip
+				+ block_in_strip);
+	unsigned long size = (volume->blocks_per_strip - block_in_strip) << 9;
+	char *bufferposition = bh->b_data;
+	struct buffer_head sentinel; /* we'll keep all bh-s in order */
+	struct buffer_head *bhlist = &sentinel; /* bhlist points to end */
+	int i, count = 0;
+	unsigned long positions[MAX_RAID_MEMBER_DISKS], bestdrsect = 0;
+	DEBUG(DEBUG_R1R, "raid1e_read_request, raiddev = %d, rsect = %lu, "
+	      "nextsect = %lu\n", volume->devbit, rsect, nextsect);
+	/* prepare for choosing the best disk based on head move distance */
+	for (i = 0; i < volume->numdisks; i++)
+		if (volume->disks[i]) { /* if disk present */
+			spin_lock(&volume->disks[i]->lock);
+			positions[i] = volume->disks[i]->last_pos;
+			spin_unlock(&volume->disks[i]->lock);
+		}
+	private->status = 0;
+	private->parent = bh; /* don't need old_endiofn and old_private */
+	private->rw = READ;
+	private->volume = volume;
+	clear_bit(BH_Mirror, &bh->b_state); /* not a mirror read */
+	while (rsect < nextsect) {
+		int distance, bestdisk = -1; /* best disk; initially bogus */
+		unsigned int bestdistance = ~0; /* as far as possible */
+		struct buffer_head *bh1 = get_bhead();
+		/* dupe the bufferhead and update what's different */
+		memcpy(bh1, bh, sizeof(*bh));
+		bh1->b_end_io = &end_io; /* return to us */
+		bh1->b_private = private;
+		/* adjust the size for the last strip */
+		if (nextsect - rsect < (size >> 9))
+			size = (nextsect - rsect) << 9;
+		bh1->b_size = size;
+		bh1->b_data = bufferposition;
+		DEBUG(DEBUG_R1ER, "strip = %lu, block_in_strip = %lu, "
+		      "disk = %d (or next), strip_in_disk = %lu, drsect (w/o "
+		      "pba_of_lba0 offset) = %lu, size = %lu, "
+		      "READ\n", strip, block_in_strip, disk,
+		      strip_in_disk, drsect, size);
+		for (i = 0; i < 2; i++) {
+			if (test_bit(disk, &volume->degradedbits))
+				DEBUG(DEBUG_R1R, "skipping%s disk %d in "
+				      "volume %s, disk state = 0x%08x, "
+				      "volume degraded bits = 0x%08x\n",
+				      volume->disks[disk] ? "" : " missing",
+				      disk, volume->serial,
+				      (volume->disks[disk]
+				       ? volume->disks[disk]->status : 0),
+				      volume->degradedbits);
+			else {
+				distance = abs(bh->b_rsector
+					       - positions[disk]);
+				if (distance > HUGE_MOVE_BLOCKS)
+					distance = HUGE_MOVE_BLOCKS;
+				if (distance < bestdistance) {
+					bestdistance = distance;
+					bestdisk = disk;
+					bestdrsect = drsect;
+				} else if (distance == bestdistance) {
+					if (volume->tiebreak) {
+						bestdisk = disk;
+						bestdrsect = drsect;
+					}
+					volume->tiebreak
+						= 1 - volume->tiebreak;
+				}
+			}
+			if (++disk >= volume->numdisks) {
+				disk = 0;
+				strip_in_disk++;
+				drsect += volume->blocks_per_strip;
+			}
+		}
+		if (bestdisk < 0) {
+			printk(KERN_ERR "iswraid: No present and nonfailed "
+			       "disks were found for volume w/ ataraid devnum "
+			       "%d, failing IO\n", volume->devbit);
+			kmem_cache_free(bhcache, bh1); /* not listed yet */
+			bhlist->b_next = NULL; /* mark the end of the list */
+			for (bhlist = sentinel.b_next; bhlist; ) {
+				struct buffer_head *old = bhlist;
+				bhlist = bhlist->b_next;
+				kmem_cache_free(bhcache, old); /* free all */
+			}
+			kmem_cache_free(privcache, private);
+			bh->b_end_io(bh, 0); /* fail the whole IO */
+			return;              /* and bail */
+		}
+		DEBUG(DEBUG_R1R, "bestdisk = #%d, bestdistance = %d\n",
+		      bestdisk, bestdistance);
+		/* disk and sector go together: a wrapped mirror is a row down */
+		bh1->b_rsector = bestdrsect + volume->pba_of_lba0;
+		bh1->b_rdev = volume->disks[bestdisk]->dev;
+		positions[bestdisk] = bh->b_rsector + (size >> 9);
+		bhlist->b_next = bh1; /* put it on the list */
+		bhlist = bh1;
+		count++;
+		strip++; /* don't really need this, except for debug */
+		bufferposition += size;
+		drsect -= block_in_strip; /* needed on first strip */
+		block_in_strip = 0; /* all after first start at 0 */
+		rsect += (size >> 9);
+		size = volume->blocks_per_strip << 9; /* full strip */
+	}
+	atomic_set(&private->count, count);
+	bhlist->b_next = NULL; /* mark the end of the list */
+	for (bhlist = sentinel.b_next; bhlist; ) { /* dispatch all bh-s */
+		struct buffer_head *bh1 = bhlist;
+		bhlist = bhlist->b_next;
+		generic_make_request(READ, bh1);
+	}
+	for (i = 0; i < volume->numdisks; i++) /* update head positions */
+		if (volume->disks[i]) { /* if disk present */
+			spin_lock(&volume->disks[i]->lock);
+			volume->disks[i]->last_pos = positions[i];
+			spin_unlock(&volume->disks[i]->lock);
+		}
+}
+
+/* Build and submit a RAID1 read request. Returns 1 if bh got remapped to a
+ * member disk for the upper level to resubmit, 0 if we took care of it.
+ */
+static int
+raid1_read_request(request_queue_t *q, struct buffer_head *bh,
+		   unsigned long rsect, unsigned long nextsect)
+{
+	int minor = MINOR(bh->b_rdev);
+	struct volume *volume = raid[minor >> SHIFT];
+	struct bh_private *private;
+	int i, distance;
+	int bestdisk = -1; /* best disk found; initially a bogus value */
+	unsigned int bestdistance = ~0; /* as far as we could possibly go */
+	DEBUG(DEBUG_R1R, "raid1_read_request, minor = %d\n", minor);
+
+	if (volume->numdisks > 2) { /* RAID1E, same as RAID10 if even */
+		raid1e_read_request(q, bh, volume, rsect, nextsect);
+		return 0; /* We've submitted all IO ourselves */
+	}
+
+	bh->b_rsector = rsect + volume->pba_of_lba0; /* normal RAID1 */
+
+	/* Reads are simple in principle---pick a disk and use it.
+	 * We choose the disk w/ the closest last known head position.
+	 * All moves over HUGE_MOVE_BLOCKS are considered equally bad.
+	 * FIXME performance considerations, offline checking needed.
+	 * When there is a tie, use a volume-specific tiebreaker.
+	 * Of course, without knowing the true physical disk geometry
+	 * or at least knowing which requests from the queue have truly
+	 * been dispatched to the disk and whether the head is moving
+	 * down or up, all this is likely fairly worthless...
+	 * We could also keep an IO count for each disk and use that,
+	 * but looking the disk up from b_rdev is a bit clumsy...
+	 */
+	for (i = 0; i < volume->numdisks; i++) {
+		if (test_bit(i, &volume->degradedbits)) {
+			DEBUG(DEBUG_R1R, "skipping%s disk %d in "
+			      "volume %s, disk state = 0x%08x, "
+			      "volume degraded bits = 0x%08x\n",
+			      volume->disks[i] ? "" : " missing",
+			      i, volume->serial,
+			      (volume->disks[i]
+			       ? volume->disks[i]->status : 0),
+			      volume->degradedbits);
+			continue; /* ignore failed disks */
+		}
+		spin_lock(&volume->disks[i]->lock);
+		distance = abs(bh->b_rsector - volume->disks[i]->last_pos);
+		spin_unlock(&volume->disks[i]->lock);
+
+		if (distance > HUGE_MOVE_BLOCKS)     /* huge moves  */
+			distance = HUGE_MOVE_BLOCKS; /* are all bad */
+		if (distance < bestdistance) { /* improvement */
+			bestdistance = distance;
+			bestdisk = i;
+		} else if (distance == bestdistance) {
+			/* races regarding tiebreak are irrelevant */
+			bestdisk = volume->tiebreak;
+			volume->tiebreak = 1 - volume->tiebreak;
+		}
+	}
+
+	if (bestdisk < 0) {
+		printk(KERN_ERR "iswraid: No present and nonfailed "
+		       "disks were found for volume minor %d, "
+		       "failing IO\n", minor);
+		bh->b_end_io(bh, 0);
+		return 0;
+	}
+
+	DEBUG(DEBUG_R1R, "bestdisk = #%d, bestdistance = %d\n",
+	      bestdisk, bestdistance);
+	bh->b_rdev = volume->disks[bestdisk]->dev;
+	private = get_private(); /* for tracking bh */
+	private->status = 0;
+	/* bh is reused as is, so keep what we are about to overwrite */
+	private->old_private = bh->b_private;
+	private->old_endiofn = bh->b_end_io;
+	/* Clear parent, set count and tie to bh with return to us */
+	private->parent = NULL;
+	atomic_set(&private->count, 1);
+	private->rw = READ;
+	private->volume = volume;
+	bh->b_private = private;
+	bh->b_end_io = &end_io;
+	clear_bit(BH_Mirror, &bh->b_state); /* not a mirror read */
+
+	/* update the last known head position for the drive */
+	spin_lock(&volume->disks[bestdisk]->lock);
+	volume->disks[bestdisk]->last_pos = nextsect;
+	spin_unlock(&volume->disks[bestdisk]->lock);
+	return 1; /* force the upper level to resubmit the IO */
+}
+
+/* make_request_fn for RAID1 volumes: weeds out IO to unusable volumes and
+ * IO beyond the volume end, then hands over to the read or write path.
+ */
+static int
+iswraid1_make_request(request_queue_t *q, int rw, struct buffer_head *bh)
+{
+	int minor = MINOR(bh->b_rdev);
+	struct volume *volume = raid[minor >> SHIFT];
+	unsigned long rsect, nextsect;
+	DEBUG(DEBUG_R1, "iswraid1_make_request, minor = %d\n", minor);
+
+	if (test_bit(FAILED_BIT, &volume->state)) {
+		printk(KERN_ERR "iswraid: Volume '%s' (d%d) is in a failed "
+		       "state, failing IO\n", volume->serial, volume->devbit);
+		bh->b_end_io(bh, 0); /* fail it immediately */
+		return 0;
+	}
+	if (iswraid_halt_degraded && test_bit(DEGRADED_BIT, &volume->state)) {
+		printk(KERN_ERR "iswraid: Volume '%s' (d%d) is in a degraded "
+		       "state and iswraid_halt_degraded is set, failing IO\n",
+		       volume->serial, volume->devbit);
+		bh->b_end_io(bh, 0); /* fail it immediately */
+		return 0;
+	}
+
+	/* the request is partition relative, we work with the whole volume */
+	rsect = bh->b_rsector + ataraid_gendisk.part[minor].start_sect;
+	/* nextsect > rsect unless b_size <= 0, which it shouldn't be */
+	nextsect = rsect + (bh->b_size >> 9);
+	DEBUG(DEBUG_R1, "rsect with partition offset = %lu, nextsect = %lu\n",
+	      rsect, nextsect);
+	if (nextsect > volume->sectors) { /* request beyond volume end */
+		printk(KERN_ERR "iswraid: Request beyond end of volume, "
+		       "failing IO, minor = %d, startsect = %lu, endsect = "
+		       "%lu\n", minor, rsect, nextsect - 1);
+		bh->b_end_io(bh, 0); /* fail it immediately */
+		return 0;
+	}
+
+	if (rw != READ && rw != READA) /* whatever isn't a read is a write */
+		return raid1_write_request(q, bh, rsect, nextsect);
+	return raid1_read_request(q, bh, rsect, nextsect);
+}
+
+/* The IOCTL handler: answers size and geometry queries, re-reads the
+ * partition table for BLKRRPART and leaves the rest to blk_ioctl().
+ */
+static int
+iswraid_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
+	      unsigned long arg)
+{
+	unsigned int minor, devbit, i;
+
+	if (!inode || !inode->i_rdev)
+		return -EINVAL;
+
+	minor = MINOR(inode->i_rdev);
+	devbit = minor >> SHIFT; /* SHIFT comes from ataraid, = 4 */
+
+	switch (cmd) {
+	case BLKGETSIZE:	/* Return device size */
+		DEBUG(DEBUG_IOCTL, "ioctl BLKGETSIZE, minor = %d\n", minor);
+		if (!arg)
+			return -EINVAL;
+		if (minor & 0xf) /* individual partition, not the whole vol */
+			return put_user(ataraid_gendisk.part[minor].nr_sects,
+					(unsigned long *) arg);
+
+		return put_user(raid[devbit]->sectors, /* the whole volume */
+				(unsigned long *) arg);
+
+	case HDIO_GETGEO_BIG:
+		DEBUG(DEBUG_IOCTL, "ioctl HDIO_BIG_GETGEO, minor = %d\n",
+		      minor);
+		if (!arg)
+			return -EINVAL;
+		{
+			/* hb_geom is shared by all minors of the volume, so
+			 * the partition start goes into a private copy
+			 */
+			struct hd_big_geometry big = raid[devbit]->hb_geom;
+			big.start = ataraid_gendisk.part[minor].start_sect;
+			if (copy_to_user((void *) arg, &big, sizeof(big)))
+				return -EFAULT;
+		}
+		return 0;
+
+	case HDIO_GETGEO:
+		DEBUG(DEBUG_IOCTL, "ioctl HDIO_GETGEO, minor = %d\n", minor);
+		if (!arg)
+			return -EINVAL;
+		{
+			struct hd_geometry temp;
+			memset(&temp, 0, sizeof(temp)); /* no stack leaks */
+			temp.heads = raid[devbit]->hb_geom.heads;
+			temp.sectors = raid[devbit]->hb_geom.sectors;
+			/* cylinders may get truncated here */
+			temp.cylinders = raid[devbit]->hb_geom.cylinders;
+			temp.start = ataraid_gendisk.part[minor].start_sect;
+			if (copy_to_user((void *) arg, &temp,
+					 sizeof(struct hd_geometry)))
+				return -EFAULT;
+		}
+		return 0;
+
+	case BLKRRPART: 	    /* Re-Read Partition Table. */
+		DEBUG(DEBUG_IOCTL, "ioctl BLKRRPART, minor = %d\n", minor);
+		if (!capable(CAP_SYS_ADMIN))
+			return -EACCES;
+		if (down_interruptible(&iswraid_sem))
+			return -ERESTARTSYS;
+		if (raid[devbit]->refcnt > 1) {
+			up(&iswraid_sem);
+			return -EBUSY;
+		}
+
+		for (i = 0; i < 16; i++) { /* for whole disk and partitions */
+			int m = (minor & ~0xf) + i;
+			if (ataraid_gendisk.part[m].nr_sects > 0) {
+				invalidate_device(
+					MKDEV(ataraid_gendisk.major, m), 1);
+			}
+			/* Clear existing partition sizes  */
+			ataraid_gendisk.part[m].start_sect = 0;
+			ataraid_gendisk.part[m].nr_sects = 0;
+			/* Reset the Block Size */
+			set_blocksize(MKDEV(ataraid_gendisk.major, m),
+				      ISW_DISK_BLOCK_SIZE);
+		}
+
+		ataraid_register_disk(devbit, raid[devbit]->sectors);
+		up(&iswraid_sem);
+		return 0;
+
+	default:
+		DEBUG(DEBUG_IOCTL, "other ioctl, cmd = %d, minor = %d\n",
+		      cmd, minor);
+		return blk_ioctl(inode->i_rdev, cmd, arg);
+	} /* ENDOF switch (cmd) */
+
+	return 0;
+}
+
+/* Synchronous read or write of a bunch of contiguous disk sectors.
+ * For reads we could have used bread() instead. But there is little point
+ * for a driver that sits below the buffer cache to go reading blocks through
+ * the buffer cache. Furthermore, it would then pollute the cache with blocks
+ * that nobody else needs (we use this function only for MPB reads/writes).
+ * And, finally, there is no bwrite(), so we do need our own function.
+ * It must not be __init, because write_mpb() is not __init and calls it.
+ */
+static int
+rw_sectors_sync(int major, int minor, int rw, unsigned char *buffer,
+		unsigned long sector, int count)
+{
+	int ret = 0; /* turns into -EIO if the IO fails */
+	struct buffer_head *bh = get_bhead();
+	memset(bh, 0, sizeof(*bh)); /* most fields will be NULL */
+	init_waitqueue_head(&bh->b_wait); /* restore what we just trashed */
+	bh->b_rsector = sector;
+	bh->b_rdev = MKDEV(major, minor);
+	bh->b_size = ISW_DISK_BLOCK_SIZE * count;
+	bh->b_data = buffer;
+	bh->b_page = virt_to_page(buffer); /* safe for logical addresses */
+	set_bit(BH_Mapped, &bh->b_state); /* checked in __make_request */
+	/* From here on we're picking lines from bread() (see fs/buffer.c) and
+	 * from ll_rw_block() and submit_bh() (see drivers/block/ll_rw_blk.c).
+	 */
+	set_bit(BH_Sync, &bh->b_state);
+	set_bit(BH_Lock, &bh->b_state);
+	bh->b_end_io = &end_io_sync;
+	get_bh(bh); /* increment usage count, end_io_sync() will do put_bh() */
+	generic_make_request(rw, bh);
+	wait_on_buffer(bh);
+	/* The end of line-picking is here. If anything important is missing,
+	 * please tell me what and why (MK).
+	 */
+	if (!buffer_uptodate(bh)) { /* this test works for writes, too */
+		printk(KERN_ERR "iswraid: Synchronous %s IO to disk major %d "
+		       "minor %d failed\n", rw == WRITE ? "write" : "read",
+		       major, minor);
+		ret = -EIO;
+	}
+	kmem_cache_free(bhcache, bh);
+	return ret;
+}
+
+#define INQUIRY_BUFLEN 256 /* max SCSI serial num length could be 255 */
+
+/* The raid member disks are scsi devices. We do an inquiry to
+ * determine the disk serial number. This information is used
+ * to order the member disks correctly in the raid array.
+ * isw_serial_no should point to a buffer of length at least
+ * (MAX_RAID_SERIAL_LEN + 1), so that we can null-terminate the serial number.
+ * Returns 0, -EINVAL if no SCSI device was found, or the ioctl's error.
+ */
+static int __init
+do_inquiry(int major, int minor, unsigned char *isw_serial_no)
+{
+	kdev_t dev = MKDEV(major, minor);
+	request_queue_t *queue = blk_get_queue(dev);
+	int i, j, retval, lenplus4;
+	unsigned char *cmd;
+	/* FIXME? Is this too much to put on the stack? */
+	u32 buffer[INQUIRY_BUFLEN / sizeof(u32) + 2]; /* 2 extra for lengths */
+	Scsi_Device *scsidev;
+
+	/* A device that exists should have a request queue (but we don't
+	 * oops if not) and, according to scsi.c, Scsi_Device * in queuedata.
+	 */
+	scsidev = queue ? queue->queuedata : NULL;
+	if (!scsidev) {
+		printk(KERN_ERR "iswraid: could not get scsidev for major %d "
+		       "minor %d\n", major, minor);
+		return -EINVAL;
+	}
+
+	/* input data size. No input. */
+	buffer[0] = 0;
+	/* output buffer size. Doesn't include the two length words up front */
+	buffer[1] = INQUIRY_BUFLEN;
+	/* cmd is the SCSI command to send */
+	cmd = (unsigned char *) &buffer[2];
+
+	cmd[0] = 0x12;		/* Opcode INQUIRY=12h */
+	cmd[1] = 0x01;		/* EVPD=1. Return the vital product data
+				   specified in page code */
+	cmd[2] = 0x80;		/* Page Code Unit serial number page=80h */
+	cmd[3] = 0x00;		/* Reserved byte */
+	cmd[4] = 0xff;		/* allocation length */
+	cmd[5] = 0x00;		/* Control byte */
+	retval = kernel_scsi_ioctl(scsidev, SCSI_IOCTL_SEND_COMMAND, buffer);
+	if (retval) {
+		printk(KERN_ERR "iswraid: kernel_scsi_ioctl("
+		       "SCSI_IOCTL_SEND_COMMAND) failed, code= %d\n", retval);
+		return retval;
+	}
+
+	/* Unit Serial Number page is returned at the same location as cmd. */
+	/* Fourth byte holds the length of the serial number which follows. */
+	lenplus4 = cmd[3] + 4; /* index to just beyond the serial number */
+	if (lenplus4 > INQUIRY_BUFLEN - 1) /* we asked for 0xff bytes only */
+		lenplus4 = INQUIRY_BUFLEN - 1;
+	/* Start from the beginning, assuming that everything will fit. */
+	for (i = 4, j = 0; i < lenplus4 && j < MAX_RAID_SERIAL_LEN; i++) {
+		if (cmd[i] > 0x20) /* skip blanks and ctrl chars */
+			isw_serial_no[j++] = cmd[i];
+	}
+	isw_serial_no[j--] = '\0'; /* terminate what we got and move left */
+	if (i < lenplus4) /* if we ran out of space, redo it from the tail */
+		for (i = lenplus4 - 1; j >= 0; i--)
+			if (cmd[i] > 0x20) /* skip blanks and ctrl chars */
+				isw_serial_no[j--] = cmd[i];
+
+	return 0;
+}
+
+#define TYPICAL_MPBSIZE 1024
+
+/* Read all the MPB blocks, check signature and checksum. Returns the MPB in
+ * a kmalloc-ed buffer (to be kfree-d by the caller), or NULL on any failure.
+ */
+static struct _raid_mpb * __init
+read_mpb(int major, int minor)
+{
+	unsigned long mpb_blocknum;
+	unsigned char *mpbbuf, *mpbbuf2;
+	int mpbblocks;
+	struct _raid_mpb *mpb;
+
+	/* Find the block number of the "first" block of Intel RAID metadata */
+	if (!(mpb_blocknum = calc_mpb_blocknum(major, minor)))
+		return NULL;
+
+	/* get a permanent storage space for the MPB */
+	mpbbuf = kmalloc(TYPICAL_MPBSIZE, GFP_KERNEL);
+	if (!mpbbuf) {
+		printk(KERN_ERR "iswraid: Can't kmalloc %d bytes\n",
+		       TYPICAL_MPBSIZE);
+		return NULL;
+	}
+
+	/* Read the RAID metadata "header" */
+	if (rw_sectors_sync(major, minor, READ, mpbbuf, mpb_blocknum, 1))
+		goto freempbbuf;
+
+	mpb = (struct _raid_mpb *) mpbbuf;
+	/* Check Signature and version info, older versions supported */
+	if (strncmp(mpb->sig.text, MPB_SIGNATURE, sizeof(MPB_SIGNATURE) - 1)
+	    || (strcmp(mpb->sig.text + sizeof(MPB_SIGNATURE) - 1, MPB_VERSION)
+		> 0)) {
+		printk(KERN_INFO "iswraid: disk with major %d minor %d does "
+		       "not have a valid Intel Software Raid signature, or "
+		       "the version is newer than supported: '%-.32s'\n",
+		       major, minor, mpb->sig.text);
+		goto freempbbuf;
+	}
+	DEBUG(DEBUG_MPB, "member disk found at major %d minor %d\n",
+	      major, minor);
+	mpbblocks = ((mpb->mpb_size + ISW_DISK_BLOCK_SIZE - 1)
+		     / ISW_DISK_BLOCK_SIZE);
+	/* mpb_size comes off the disk: reject zero, wrapped and huge sizes */
+	if (mpbblocks < 1 || (unsigned long) mpbblocks - 1 > mpb_blocknum) {
+		printk(KERN_ERR "iswraid: invalid MPB size\n");
+		goto freempbbuf;
+	}
+
+	/* If we need more space for the MPB */
+	if (mpbblocks > TYPICAL_MPBSIZE / ISW_DISK_BLOCK_SIZE) {
+		mpbbuf2 = kmalloc(mpbblocks * ISW_DISK_BLOCK_SIZE, GFP_KERNEL);
+		if (!mpbbuf2) {
+			printk(KERN_ERR "iswraid: Can't kmalloc %d bytes\n",
+			       mpbblocks * ISW_DISK_BLOCK_SIZE);
+			goto freempbbuf;
+		}
+		memcpy(mpbbuf2, mpbbuf, ISW_DISK_BLOCK_SIZE);
+		mpb = (struct _raid_mpb *) mpbbuf2;
+		kfree(mpbbuf);
+		mpbbuf = mpbbuf2;
+	}
+
+	/* Here we read the rest of the MPB, if necessary */
+	if (mpbblocks > 1) {
+		DEBUG(DEBUG_MPB, "size of RAID metadata is %d bytes\n",
+		      mpb->mpb_size);
+		/* note that the rest of the MPB lives _before_ its "header" */
+		if (rw_sectors_sync(major, minor, READ,
+				    mpbbuf + ISW_DISK_BLOCK_SIZE,
+				    mpb_blocknum - (mpbblocks - 1),
+				    mpbblocks - 1)) {
+			printk(KERN_ERR "iswraid: couldn't read the rest of "
+			       "RAID MPB\n");
+			goto freempbbuf;
+		}
+	}
+
+	/* Compare checksum read from MPB with newly calculated value */
+	if (mpb->check_sum
+	    != compute_checksum((const u32 *) mpb, mpb->mpb_size)) {
+		printk(KERN_ERR "iswraid: MPB checksum error\n");
+		goto freempbbuf;
+	}
+	DEBUG(DEBUG_MPB, "checksum OK for major %d minor %d\n", major, minor);
+	return mpb;
+
+ freempbbuf:
+	kfree(mpbbuf);
+	return NULL;
+}
+
+/* Bump the generation number, update the checksum and write the MPB
+ * "synchronously". Returns 0 if all went well, -EINVAL if the location of
+ * the MPB can't be determined and -EIO if a write fails.
+ */
+static int
+write_mpb(int major, int minor, struct _raid_mpb *mpb)
+{
+	unsigned char *raw = (unsigned char *) mpb; /* MPB as plain bytes */
+	unsigned long headerblock; /* "first" block of the RAID metadata */
+	int nblocks; /* all blocks of the MPB, the "header" included */
+	DEBUG(DEBUG_MPB, "write_mpb\n");
+
+	headerblock = calc_mpb_blocknum(major, minor);
+	if (!headerblock)
+		return -EINVAL;
+
+	/* a new generation of the MPB, with a checksum to match */
+	mpb->generation_num++;
+	mpb->check_sum = compute_checksum((const u32 *) mpb, mpb->mpb_size);
+
+	/* the "header" goes out first */
+	if (rw_sectors_sync(major, minor, WRITE, raw, headerblock, 1))
+		return -EIO;
+
+	nblocks = ((mpb->mpb_size + ISW_DISK_BLOCK_SIZE - 1)
+		   / ISW_DISK_BLOCK_SIZE);
+	if (nblocks <= 1) /* < 1 isn't really possible */
+		return 0;
+
+	DEBUG(DEBUG_MPB, "size of RAID metadata is %d bytes\n", mpb->mpb_size);
+
+	/* the rest of the MPB lives _before_ its "header" on the disk */
+	if (rw_sectors_sync(major, minor, WRITE, raw + ISW_DISK_BLOCK_SIZE,
+			    headerblock - (nblocks - 1), nblocks - 1))
+		return -EIO;
+
+	printk(KERN_INFO "iswraid: MPB written to disk major %d minor %d\n",
+	       major, minor);
+	return 0;
+}
+
+/* Read the MPB and get the disk serial number. Returns a freshly allocated
+ * struct disk that refers to the MPB, or NULL if either step failed.
+ */
+static struct disk * __init
+probe_disk(int major, int minor)
+{
+	struct _raid_mpb *mpb;
+	struct disk *d;
+
+	DEBUG(DEBUG_INIT, "probe_disk for major %d minor %d\n", major, minor);
+	mpb = read_mpb(major, minor);
+	if (!mpb)
+		return NULL;
+
+	d = kmalloc(sizeof(*d), GFP_KERNEL);
+	if (!d) {
+		printk(KERN_ERR "iswraid: Can't kmalloc struct disk\n");
+		kfree(mpb);
+		return NULL;
+	}
+
+	/* FIXME? We could read the serial number for every disk. Then we
+	 * could do operations like array creation and adding disks to arrays,
+	 * marking failed disks normal, etc. If we start reading serial
+	 * numbers for disks regardless of MPB existence/state, then it might
+	 * be best to limit disks to those attached to Intel SATA controllers.
+	 * (Or we could have some easy to parse parameter specifying which
+	 * SCSI disks to probe. Suggestions and patches are welcome.)
+	 * But currently we only read serial numbers for disks that have a
+	 * decent MPB. The Option ROM should be used to fix any broken MPBs.
+	 */
+	if (do_inquiry(major, minor, d->serial)) {
+		printk(KERN_INFO "iswraid: Inquiry returned error\n");
+		kfree(d);
+		kfree(mpb);
+		return NULL;
+	}
+
+	/* nothing failed, so fill in the rest of the new disk */
+	d->dev = MKDEV(major, minor);
+	d->array = NULL;
+	d->mpb = mpb;
+	d->status = 0; /* will match mpbdisk->status soon */
+	atomic_set(&d->errorcount, 0);
+	d->last_pos = 0; /* we don't know it, but make it same for all */
+	spin_lock_init(&d->lock); /* access to last_pos only thru lock */
+	return d;
+}
+
+/* Set up a new array (or redo an existing one) from the given MPB data */
+static void __init
+update_array(struct array *array, struct _raid_mpb *mpb)
+{
+	int j;
+	array->mpb = mpb;
+	/* array->disks[] is filled in slot by slot from mpb->disk_tbl[] */
+	/* caller must ensure that mpb->num_disks <= MAX_RAID_MEMBER_DISKS */
+	for (j = 0; j < mpb->num_disks; j++) {
+		struct _mpb_disk *mpbdisk = &mpb->disk_tbl[j];
+		struct disk *disk
+			= find_disk_by_serial(mpbdisk->serial.serial);
+		array->disks[j] = NULL; /* assume trouble with this disk */
+
+		if (!disk) { /* can't find the disk mentioned in MPB */
+			unsigned char tmp[MAX_RAID_SERIAL_LEN + 1];
+			strncpy(tmp, mpbdisk->serial.serial,
+				MAX_RAID_SERIAL_LEN);
+			tmp[MAX_RAID_SERIAL_LEN] = 0;
+			printk(KERN_WARNING "iswraid: Disk '%s' missing from "
+			       "array with family_num 0x%08x\n",
+			       tmp, mpb->family_num);
+			/* If we change a normal serial into a placeholder,
+			 * then we should update the MPB, except that we don't
+			 * really have to bother in the case when we can't use
+			 * degraded volumes anyway. In this case no volumes
+			 * from this array can be used, so we don't have to
+			 * mark in the MPBs that volumes were degraded and/or
+			 * failed due to a missing disk. This little twist can
+			 * offer some protection for the case when it isn't
+			 * clear whether all the relevant disks will be found
+			 * by the SCSI low-level driver(s). It wouldn't be nice
+			 * to change the MPBs when the fault is elsewhere...
+			 */
+			if (create_placeholder(mpbdisk->serial.serial)
+			    && !iswraid_halt_degraded)
+				array->saveneeded = 1;
+			continue;
+		}
+
+		if (find_tag(mpbdisk->serial.serial)) { /* placeholder */
+			strncpy(mpbdisk->serial.serial, disk->serial,
+				MAX_RAID_SERIAL_LEN); /* replace w/ serial */
+			array->saveneeded = 1;
+		}
+
+		/* if disk's MPB contradicts our original MPB, the disk wins */
+		if (disk->mpb->family_num != mpb->family_num) {
+			printk(KERN_WARNING "iswraid: Disk '%s' claims to "
+			       "belong to array w/ family_num 0x%08x, hence "
+			       "marking it missing from array w/ family_num "
+			       "0x%08x\n", disk->serial, disk->mpb->family_num,
+			       mpb->family_num);
+			if (create_placeholder(mpbdisk->serial.serial))
+				array->saveneeded = 1;
+			continue;
+		}
+		/* the disk passed the checks above: it becomes member #j */
+		disk->status = mpbdisk->status;
+		disk->array = array;
+		array->disks[j] = disk;
+		DEBUG(DEBUG_INIT, "disk '%s' is #%d in array with family_num "
+		      "0x%08x\n", disk->serial, j, mpb->family_num);
+
+		/* shouldn't encounter any other than CONFIGURED_DISKs here */
+		if (mpbdisk->status & FAILED_DISK) {
+			printk(KERN_WARNING "iswraid: Disk '%s' from array "
+			       "with family_num 0x%08x has failed\n",
+			       disk->serial, mpb->family_num);
+		}
+	}
+}
+
+/* Find arrays by parsing the MPB data on all disks */
+static void __init
+find_arrays(void)
+{
+	struct disk *disk;
+	DEBUG(DEBUG_INIT, "looking for RAID arrays\n");
+
+	/* for each disk, set up the array that it belongs to, if any */
+	/* for each array being set up, make a list of its disks */
+	list_for_each_entry(disk, &disklist, head) {
+		int i;
+		int needupdate = 1; /* assume array's MPB needs an update */
+
+		for (i = 0; i < arraycount; i++)
+			if (arrays[i].mpb->family_num
+			    == disk->mpb->family_num) {
+				/* seems like the disk belongs to the array */
+				if ((arrays[i].mpb->check_sum /* same MPBs */
+				     == disk->mpb->check_sum)
+				    /* or not quite, but array's MPB wins */
+				    || (arrays[i].mpb->generation_num
+					>= disk->mpb->generation_num))
+					needupdate = 0; /* ignore disk's MPB */
+				break; /* disk belonged to array, so get out */
+			}
+		/* Now we have either an array that the disk seems to belong to
+		 * or perhaps a lack of array for this disk. Also it is
+		 * possible that the array has thrown the disk out but the
+		 * disk doesn't know it yet. We won't change the disk's MPB
+		 * in this case, OROM should have done it. We simply ignore
+		 * such disks, yet we don't release them to the user.
+		 */
+
+		if (i >= MAX_RAID_ARRAYS) { /* just a sanity check */
+			printk(KERN_ERR "iswraid: Maximum array count %d "
+			       "exceeded, ignoring MPB on disk %s\n",
+			       MAX_RAID_ARRAYS, disk->serial);
+			continue; /* disk->array stays NULL for this disk */
+		}
+
+		if (disk->mpb->num_disks == 1) { /* 1-disk arrays suspicious */
+			struct _mpb_disk *mpbdisk = &disk->mpb->disk_tbl[0];
+			disk->status = mpbdisk->status;
+			if (!(mpbdisk->status & CONFIGURED_DISK)) {
+				printk(KERN_INFO "iswraid: Disk '%s' with "
+				       "status %d is not a member of any "
+				       "array, ignoring it\n",
+				       disk->serial, mpbdisk->status);
+				continue;
+			}
+		}
+
+		/* unless we do this check, may not call update_array */
+		if (disk->mpb->num_disks > MAX_RAID_MEMBER_DISKS) {
+			printk(KERN_ERR "iswraid: MPB on disk %s contains "
+			       "too many (%d) disks, ignoring it\n",
+			       disk->serial, disk->mpb->num_disks);
+			continue;
+		}
+
+		if (i >= arraycount) { /* the case where we need a new array */
+			arraycount++;
+			arrays[i].saveneeded = 0;
+			INIT_TQUEUE(&(arrays[i].task),
+				    &start_mpb_writes, &arrays[i]);
+		} else if (needupdate) /* existing array's MPB will change */
+			arrays[i].saveneeded = 1; /* so mark it already */
+
+		if (needupdate) /* array's MPB should be updated from disk's */
+			update_array(&arrays[i], disk->mpb);
+	}
+
+	printk(KERN_INFO "iswraid: Found %d ISWRAID arrays\n", arraycount);
+}
+
+/* Order the disks belonging to a volume, noting degraded/disabled state. */
+static void
+order_disks(struct volume *volume,
+	    struct _raid_mpb *mpb, struct _mpb_raid_map *map)
+{
+	int k, numbaddisks = 0; /* missing, failed, considered degraded */
+	for (k = 0; k < volume->numdisks; k++) {
+		int index = map->disk_ord_tbl[k] & 0xffffff;
+		int status = map->disk_ord_tbl[k] >> 24;
+		if (index >= mpb->num_disks) { /* only 0..num_disks-1 exist */
+			printk(KERN_WARNING "iswraid: Disk #%d of volume #%d "
+			       "with name '%s' refers to a nonexisting disk "
+			       "#%d in array; disabling volume\n",
+			       k, volumecount, volume->serial, index);
+			set_bit(DISABLED_BIT, &volume->state);
+			continue;
+		}
+		volume->disks[k] = volume->array->disks[index];
+
+		if (!volume->disks[k]) { /* this disk is missing */
+			printk(KERN_WARNING "iswraid: Disk #%d of volume #%d "
+			       "with name '%s' is missing\n",
+			       k, volumecount, volume->serial);
+			set_bit(k, &volume->degradedbits);
+			if (volume->raidlevel == 0) {
+				printk(KERN_WARNING "iswraid: RAID0 volume %s "
+				       "disabled\n", volume->serial);
+				set_bit(DISABLED_BIT, &volume->state);
+			}
+		}
+
+		if (status) { /* volume considers the disk degraded */
+			/* (which should cover FAILED_DISK-s, too) */
+			printk(KERN_WARNING "iswraid: Disk #%d of volume "
+			       "#%d with name '%s' is degraded\n",
+			       k, volumecount, volume->serial);
+			set_bit(k, &volume->degradedbits);
+		}
+
+		if ((status || !volume->disks[k]) /* bad or missing */
+		    && volume->raidlevel > 0) { /* except in RAID0 volumes */
+			numbaddisks++;
+			set_bit(DEGRADED_BIT, &volume->state);
+		}
+	}
+
+	/* Degraded volumes are usable, unless we're instructed otherwise */
+	if (iswraid_halt_degraded && test_bit(DEGRADED_BIT, &volume->state)) {
+		printk(KERN_WARNING "iswraid: Volume #%d with name '%s' "
+		       "and raid level %d has been disabled---it is degraded "
+		       "and iswraid_halt_degraded is set\n", volumecount,
+		       volume->serial, volume->raidlevel);
+		set_bit(DISABLED_BIT, &volume->state);
+	}
+
+	/* If we got more than 1 disk missing or degraded for a RAID1 or
+	 * higher level volume, it ought to be marked failed and not be used.
+	 * (Except for some RAID1E cases when data is recoverable in spite
+	 * of multi-disk failures.) However, since we noticed it upfront and
+	 * didn't attempt to use the volume yet, we simply disable it and
+	 * leave it to OROM or other OSs to mark it failed permanently.
+	 */
+	if (numbaddisks > 1) {
+		if (volume->raidlevel == 1) { /* check for lucky RAID1E */
+			for (k = 0; k < volume->numdisks; k++)
+				if (test_bit(k, &volume->degradedbits)
+				    && !check_r1e_mirrors(volume, k))
+					break; /* no luck */
+			if (k >= volume->numdisks) /* checked all */
+				return; /* lucky RAID1E */
+		}
+		printk(KERN_WARNING "iswraid: Volume #%d with name '%s' "
+		       "and raid level %d has been disabled due "
+		       "to %d missing or degraded disks\n",
+		       volumecount, volume->serial, volume->raidlevel,
+		       numbaddisks);
+		set_bit(DISABLED_BIT, &volume->state);
+	}
+
+	if (volume->raidlevel > 1) {
+		printk(KERN_WARNING "iswraid: Volume #%d with name '%s' "
+		       "and raid level %d has been disabled---unsupported "
+		       "raid level\n", volumecount, volume->serial,
+		       volume->raidlevel);
+		set_bit(DISABLED_BIT, &volume->state);
+	}
+}
+
+/* Detect volumes of one array; returns how many got registered w/ ATARAID */
+static int __init
+detect_volumes(struct array *array)
+{
+	int j, k, count = 0;
+	struct _raid_mpb *mpb = array->mpb;
+	struct _mpb_raid_dev *raiddev
+		= (struct _mpb_raid_dev *) &mpb->disk_tbl[mpb->num_disks];
+	struct volume *volume = &volumes[volumecount]; /* next global slot */
+
+	if (mpb->num_raid_devs > MAX_RAID_VOLUMES) {
+		printk(KERN_ERR "iswraid: Array with family_num 0x%08x "
+		       "contains too many (%d) volumes, using only the first "
+		       "%d of them\n", mpb->family_num, mpb->num_raid_devs,
+		       MAX_RAID_VOLUMES);
+		mpb->num_raid_devs = MAX_RAID_VOLUMES; /* clamp in memory */
+	}
+
+	for (j = 0; j < mpb->num_raid_devs; j++, volumecount++, volume++) {
+		struct _mpb_raid_vol *vol = &raiddev->raid_vol;
+		struct _mpb_raid_map *map = &vol->lo_map;
+		volume->state = 0;      /* assume OK, but check on map */
+		if (map->map_state == FAILED_MAP)
+			set_bit(FAILED_BIT, &volume->state);
+		else if (map->map_state == DEGRADED_MAP)
+			set_bit(DEGRADED_BIT, &volume->state);
+		volume->devbit = -1; /* not registered with ATARAID (yet) */
+		volume->refcnt = 0;
+		volume->tiebreak = 0;
+		volume->raidlevel = map->raid_level;
+		volume->numdisks = map->num_members; /* <= mpb->num_disks */
+		volume->pba_of_lba0 = map->pba_of_lba0;
+		volume->blocks_per_strip = map->blocks_per_strip;
+		/* FIXME we don't use num_data_blocks_hi, so 2TB is max size */
+		volume->sectors = raiddev->num_data_blocks_lo;
+		/* We will be compatible with at least some other OSs if we
+		 * choose 255 heads and 63 sectors for volumes that are
+		 * larger than 4.2GB. (Whether that was decimal or binary
+		 * "giga" is still a mystery.) For smaller volumes the
+		 * number of heads should be 128 (if over 2.1GB), 64 (if over
+		 * 1GB), 32 (if over 528MB), or 16 otherwise.
+		 */
+		volume->hb_geom.heads = 255;  /* FIXME? see comment above */
+		volume->hb_geom.sectors = 63; /* always */
+		volume->hb_geom.cylinders
+			= (volume->sectors / volume->hb_geom.heads
+			   / volume->hb_geom.sectors);
+		strncpy(volume->serial, raiddev->serial.serial,
+			MAX_RAID_SERIAL_LEN);
+		volume->serial[MAX_RAID_SERIAL_LEN] = 0;
+		DEBUG(DEBUG_INIT, "volume #%d with name '%s' has %lu sectors,"
+		      " H/S/C = 255/63/%u, pba_of_lba0 = %u, blocks_per_member"
+		      " = %u, blocks_per_strip = %u, num_data_stripes = %u\n",
+		      volumecount, volume->serial, volume->sectors,
+		      volume->hb_geom.cylinders, volume->pba_of_lba0,
+		      map->blocks_per_member, volume->blocks_per_strip,
+		      map->num_data_stripes);
+		volume->array = array;
+		array->volumes[j] = volume;
+		DEBUG(DEBUG_INIT, "volume #%d with name '%s' is volume #%d "
+		      "in array with family_num 0x%08x and it has "
+		      "raid level %d\n", volumecount, volume->serial, j,
+		      mpb->family_num, volume->raidlevel);
+
+		volume->degradedbits = 0;
+		for (k = 0; k < MAX_RAID_MEMBER_DISKS; k++)
+			volume->disks[k] = NULL;
+		/* We do not attempt to use volumes that are not in a normal
+		 * state. Use the Option ROM or another OS to fix everything.
+		 * The only exception is degraded map state---we allow it.
+		 * (raiddev->status can be ignored---nothing meaningful there.)
+		 */
+		if (!volume->numdisks || volume->numdisks > mpb->num_disks
+		    || !volume->blocks_per_strip || vol->migr_state
+		    || (map->map_state && map->map_state != DEGRADED_MAP)
+		    || volume->raidlevel > 1) {
+			printk(KERN_INFO "iswraid: Volume #%d with name '%s' "
+			       "has %d member disks, %u blocks per strip, "
+			       "migration state %d, map state %d and raid "
+			       "level %d; disabling it\n", volumecount,
+			       volume->serial, volume->numdisks,
+			       volume->blocks_per_strip, vol->migr_state,
+			       map->map_state, volume->raidlevel);
+			set_bit(DISABLED_BIT, &volume->state);
+		}
+
+		if (volume->numdisks <= mpb->num_disks) /* can order disks */
+			order_disks(volume, mpb, map);
+
+		/* Register the volume with ATARAID, if not disabled */
+		if (!test_bit(DISABLED_BIT, &volume->state)) {
+			int devbit = ataraid_get_device(volume->raidlevel
+							? &iswraid1_ops
+							: &iswraid0_ops);
+			if (devbit < 0) {
+				printk(KERN_ERR "iswraid: Too many "
+				       "RAID devices for ATARAID\n");
+				break; /* skip the rest of this array */
+			}
+			raid[devbit] = volume;
+			volume->devbit = devbit;
+			printk(KERN_INFO "iswraid: Registering%s volume #%d "
+			       "with name %s over %d member disks as a RAID "
+			       "device with minor %d, ATARAID raiddev %d\n",
+			       (test_bit(DEGRADED_BIT, &volume->state)
+				? " degraded" : ""),
+			       volumecount, volume->serial, volume->numdisks,
+			       devbit << SHIFT, devbit);
+			/* this will already do IOs to the volume */
+			ataraid_register_disk(devbit, volume->sectors);
+			count++;
+		}
+
+		raiddev = advance_raiddev(raiddev); /* find next vol. in MPB */
+	}
+	return count;
+}
+
+/* Unregister volumes, release block devices, free memory, destroy caches */
+static void
+free_resources(void)
+{
+	struct list_head *pos, *next; /* next is saved before disk is freed */
+	int i;
+	/* first release any raid devs that we may have registered */
+	for (i = 0; i < MAX_ATARAID_RAIDDEVS; i++)
+		if (raid[i]) { /* if this one got registered */
+			printk(KERN_INFO "iswraid: Releasing ATARAID raiddev "
+			       "%d\n", i);
+			ataraid_release_device(i);
+		}
+	list_for_each_safe(pos, next, &disklist) {
+		struct disk *disk = list_entry(pos, struct disk, head);
+		DEBUG(DEBUG_EXIT, "freeing resources associated with major "
+		      "%d, minor %d\n", MAJOR(disk->dev), MINOR(disk->dev));
+		blkdev_put(disk->bdev, BDEV_RAW);
+		kfree(disk->mpb); /* may be NULL, which kfree tolerates */
+		kfree(disk); /* we don't bother with list_del, reinit at end */
+	}
+	kmem_cache_destroy(privcache);
+	kmem_cache_destroy(bhcache);
+
+	/* the following aren't really necessary since we're exiting anyway */
+	INIT_LIST_HEAD(&disklist);
+	diskcount = 0;
+	arraycount = 0;
+	volumecount = 0;
+}
+
+static int __init
+iswraid_init(void)
+{
+	struct block_device *bdev;
+	struct disk *disk;
+	int dev_count = 0, vol_count = 0;
+	int major = SCSI_DISK0_MAJOR, minor = 0; /* start w/ major 8 */
+	int i, err, retval = 0; /* assume success */
+
+	printk(KERN_INFO "iswraid: Intel(tm) Software RAID driver %s\n",
+	       ISW_VERSION_STRING);
+	printk(KERN_INFO "iswraid: Options: iswraid_halt_degraded=%d, "
+	       "iswraid_resist_failing=%d, iswraid_error_threshold=%d\n",
+	       iswraid_halt_degraded, iswraid_resist_failing,
+	       iswraid_error_threshold);
+	if (!(privcache = kmem_cache_create("iswraid_bh_private",
+					    sizeof(struct bh_private), 0,
+					    SLAB_HWCACHE_ALIGN, NULL, NULL)))
+		return -ENOMEM;
+	if (!(bhcache = kmem_cache_create("iswraid_bh",
+					  sizeof(struct buffer_head), 0,
+					  SLAB_HWCACHE_ALIGN, NULL, NULL))) {
+		kmem_cache_destroy(privcache);
+		return -ENOMEM;
+	}
+
+	/* Initialize the raid structure to init values */
+	for (i = 0; i < MAX_ATARAID_RAIDDEVS; i++)
+		raid[i] = NULL;
+
+	DEBUG(DEBUG_INIT, "probing SCSI disks\n");
+	/* Probe each SCSI disk looking for our MPB.
+	 * SCSI disks are claimed sequentially, so we can stop searching
+	 * when we encounter the first invalid device.
+	 * The blkdev_get call may cause a request_module("block-major-%d"),
+	 * typically causing sd_mod to be loaded.
+	 */
+	for ( ; ; ) {
+		if (!(bdev = bdget(MKDEV(major, minor)))
+		    || blkdev_get(bdev, FMODE_READ | FMODE_WRITE, 0, BDEV_RAW))
+			break;
+
+		disk = probe_disk(major, minor);
+		if (disk) {
+			disk->bdev = bdev;
+			list_add_tail(&disk->head, &disklist);
+			diskcount++; /* the global counter of raid disks */
+		} else
+			blkdev_put(bdev, BDEV_RAW);
+
+		dev_count++; /* local counter of all disks processed */
+		minor += 16;
+
+		/* finding the next major for SCSI disks is a bit tricky */
+		if (minor > 255) {
+			minor = 0;
+			if (major == SCSI_DISK7_MAJOR) /* stop after maj. 71 */
+				break;
+			if (major == SCSI_DISK0_MAJOR)    /* from major    8 */
+				major = SCSI_DISK1_MAJOR; /* leap to maj. 65 */
+			else
+				major++;
+		}
+	}
+
+	printk(KERN_INFO "iswraid: Probed %d SCSI disks, "
+	       "found %d ISWRAID disks\n", dev_count, diskcount);
+
+	if (!diskcount) {
+		retval = -ENODEV;
+		goto errorexit; /* nothing to free except both caches */
+	}
+
+	find_arrays(); /* parse MPB data looking for RAID arrays */
+	DEBUG(DEBUG_INIT, "detecting RAID volumes\n");
+
+	/* for each array's MPB, discover the volumes listed there */
+	for (i = 0; i < arraycount; i++)
+		vol_count += detect_volumes(&arrays[i]);
+
+	printk(KERN_INFO "iswraid: Detected %d ISWRAID volumes, registered "
+	       "%d of them as RAID devices\n", volumecount, vol_count);
+
+	/* free non-array MPBs; write changed array MPBs to their disks */
+	list_for_each_entry(disk, &disklist, head) {
+		if (!disk->array) /* find_arrays() ignored this disk; its */
+			continue; /* MPB is freed later by free_resources() */
+		if (disk->mpb != disk->array->mpb) {
+			kfree(disk->mpb);
+			disk->mpb = NULL;
+		} else if (disk->array->saveneeded) { /* need to save MPB */
+			struct array *array = disk->array;
+			array->saveneeded = 0; /* avoid doing it again */
+			/* These MPB writes are synchronous to get real
+			 * feedback, unlike later fire-and-forget ones.
+			 */
+			for (i = 0; i < array->mpb->num_disks; i++) {
+				struct disk *d = array->disks[i]; /* 0? */
+				if (!d || !(err = write_mpb(MAJOR(d->dev),
+							    MINOR(d->dev),
+							    array->mpb)))
+					continue; /* missing, or written OK */
+				retval = err; /* failures stick */
+				printk(KERN_ERR "iswraid: Could not write the "
+				       "MPB for major %d minor %d: %d\n",
+				       MAJOR(d->dev), MINOR(d->dev), err);
+			}
+		}
+	}
+
+	if (vol_count && !retval) {  /* everything is fine */
+		iswraid_proc_init(); /* we don't care if this fails, */
+		return 0;            /* the driver is operational anyway */
+	} else if (!vol_count)
+		retval = -ENODEV; /* possibly override retval */
+
+ errorexit:
+	free_resources();
+	return retval;
+}
+
+static void __exit
+iswraid_exit(void)
+{
+	iswraid_proc_cleanup(); /* presumably undoes iswraid_proc_init() */
+	free_resources(); /* raid devs, block devices, MPBs, slab caches */
+}
+
+static int
+iswraid_open(struct inode *inode, struct file *filp)
+{
+	int err = (inode && inode->i_rdev) ? 0 : -EINVAL;
+	MOD_INC_USE_COUNT; /* pin the module before we possibly sleep */
+	if (!err && down_interruptible(&iswraid_sem))
+		err = -ERESTARTSYS;
+	if (!err) {
+		raid[MINOR(inode->i_rdev) >> SHIFT]->refcnt++;
+		up(&iswraid_sem);
+	} else
+		MOD_DEC_USE_COUNT; /* a failed open gets no release call */
+	return err;
+}
+
+static int
+iswraid_release(struct inode *inode, struct file *filp)
+{
+	unsigned int devbit;
+	if (!inode || !inode->i_rdev)
+		return -EINVAL;
+	devbit = MINOR(inode->i_rdev) >> SHIFT;
+	/* a release is never retried, so it must not bail out on a signal */
+	down(&iswraid_sem);
+	raid[devbit]->refcnt--;
+	up(&iswraid_sem);
+	MOD_DEC_USE_COUNT;
+	return 0;
+}
+
+EXPORT_NO_SYMBOLS;
+module_init(iswraid_init);
+module_exit(iswraid_exit);
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Intel");
+MODULE_DESCRIPTION("Intel Software RAID support at block device level");
diff -Naur -X dontdiff linux-2.4.29/drivers/ide/raid/iswraid.h linux-2.4.29-iswraid/drivers/ide/raid/iswraid.h
--- linux-2.4.29/drivers/ide/raid/iswraid.h	1969-12-31 19:00:00.000000000 -0500
+++ linux-2.4.29-iswraid/drivers/ide/raid/iswraid.h	2005-01-25 21:01:36.000000000 -0500
@@ -0,0 +1,103 @@
+/*
+ *   iswraid.h Copyright (C) 2003,2004,2005 Intel Corporation. 
+ *   All rights reserved.
+ *
+ *   This program is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2, or (at your option)
+ *   any later version.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   (for example /usr/src/linux/COPYING); if not, write to the Free
+ *   Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ *   Authors: Boji Tony Kannanthanam 
+ *            < boji dot t dot kannanthanam at intel dot com >
+ *            Martins Krikis
+ *            < martins dot krikis at intel dot com >
+ */
+
+//                           "12345678901234567890123456789012"
+#define MPB_SIGNATURE	     "Intel Raid ISM Cfg Sig. "
+#define MPB_VERSION_RAID0                             "1.0.00"
+#define MPB_VERSION_RAID1                             "1.1.00"
+#define MPB_VERSION_MANY_VOLUMES_PER_ARRAY            "1.2.00"
+#define MPB_VERSION_3_AND_4_DISK_ARRAYS               "1.2.01"
+#define MPB_VERSION_RAID5                             "1.2.02"
+#define MAX_SIGNATURE_LENGTH  32
+#define MAX_RAID_SERIAL_LEN   16
+#define ISW_DISK_BLOCK_SIZE  512
+
+struct mpb_serial {
+	char serial[MAX_RAID_SERIAL_LEN];	// not necessarily NUL-terminated; copy and terminate before printing
+};
+
+/* DISK CONFIGURATION INFO */
+struct _mpb_disk {
+	struct mpb_serial serial;	// ascii serial number
+	u32 total_blocks;	// total blocks
+	u32 scsi_id;		// scsi ID
+	u32 status;		// bit flags: spare, configured, failed, useable, etc.
+#define	MPB_DISK_FILLERS	(5)	// filler space reserved for
+	u32 filler[MPB_DISK_FILLERS];	//   future expansion
+};
+
+/* RAID MAP CONFIGURATION INFO */
+struct _mpb_raid_map {
+	u32 pba_of_lba0;	// start address of partition
+	u32 blocks_per_member;	// blocks per member
+	u32 num_data_stripes;	// number of data stripes
+	u16 blocks_per_strip;	// the driver disables volumes where this is 0
+	u8 map_state;		// Normal, Uninitialized, Degraded, Failed
+	u8 raid_level;		// 0, 1 (the driver disables volumes with higher levels)
+	u8 num_members;		// number of member disks
+	u8 reserved[3];
+#define MPB_RAID_MAP_FILLERS	(7)	// expansion area
+	u32 filler[MPB_RAID_MAP_FILLERS];	// expansion area
+	u32 disk_ord_tbl[1];	// disk_ord_tbl[num_members]; low 24 bits index the array's disk table, nonzero top byte means degraded
+};
+
+/* RAID VOLUME INFO */
+struct _mpb_raid_vol {
+	u32 reserved[2];
+	u8 migr_state;		// Normal or Migrating; the driver disables the volume if nonzero
+	u8 migr_type;		// Initializing, Rebuilding, ...
+	u8 dirty;
+	u8 fill[1];
+#define	MPB_RAID_VOL_FILLERS	(5)	// expansion area keeps the loMap
+	u32 filler[MPB_RAID_VOL_FILLERS];
+	struct _mpb_raid_map lo_map;	// the only map the driver reads
+	// here comes another one if migr_state
+};
+
+/* RAID DEVICE CONFIGURATION INFO */
+struct _mpb_raid_dev {
+	struct mpb_serial serial;	// serial number (used as the volume name)
+	u32 num_data_blocks_lo;	// Data blocks on device (low 32 bits)
+	u32 num_data_blocks_hi;	// Data blocks on device (high 32 bits); not used by the driver yet
+	u32 status;		// Persistent RaidDev status; ignored by the driver
+	u32 reserved_blocks;	// Reserved blocks at beginning of volume
+#define	MPB_RAID_DEV_FILLERS	(12)
+	u32 filler[MPB_RAID_DEV_FILLERS];
+	struct _mpb_raid_vol raid_vol;
+};
+
+struct raid_cfg_sig {
+	char text[MAX_SIGNATURE_LENGTH];	// presumably MPB_SIGNATURE followed by an MPB_VERSION_* string -- TODO confirm
+};
+
+struct _raid_mpb {
+	struct raid_cfg_sig sig;
+	u32 check_sum;		// MPB Checksum
+	u32 mpb_size;		// Size of MPB
+	u32 family_num;		// Cksum from 1st time this config was written; identifies the array
+	u32 generation_num;	// Incremented each time array's MPB is written; the higher one wins
+	u32 reserved[2];
+	u8 num_disks;		// Number of configured disks; driver accepts at most MAX_RAID_MEMBER_DISKS
+	u8 num_raid_devs;	// Number of configured volumes; driver uses at most MAX_RAID_VOLUMES
+	u8 fill[2];
+#define RAID_MPB_FILLERS	(39)
+	u32 filler[RAID_MPB_FILLERS];
+	struct _mpb_disk disk_tbl[1];	// disk_tbl[num_disks]
+	// here comes _mpb_raid_dev[num_raid_devs], starting at &disk_tbl[num_disks]
+};
diff -Naur -X dontdiff linux-2.4.29/drivers/scsi/Makefile linux-2.4.29-iswraid/drivers/scsi/Makefile
--- linux-2.4.29/drivers/scsi/Makefile	2005-01-25 20:55:40.000000000 -0500
+++ linux-2.4.29-iswraid/drivers/scsi/Makefile	2005-01-25 21:01:36.000000000 -0500
@@ -151,6 +151,10 @@
 obj-$(CONFIG_BLK_DEV_SR)	+= sr_mod.o
 obj-$(CONFIG_CHR_DEV_SG)	+= sg.o
 
+# iswraid.o build moved here to ensure proper initcall ordering
+# in statically linked kernels; it must be linked _after_ sd_mod.
+obj-$(CONFIG_BLK_DEV_ATARAID_ISW)	+= ../ide/raid/iswraid.o
+
 list-multi	:= scsi_mod.o sd_mod.o sr_mod.o initio.o a100u2w.o cpqfc.o \
 			zalon7xx_mod.o libata.o
 scsi_mod-objs	:= scsi.o hosts.o scsi_ioctl.o constants.o \