diff -urN linux-2.4.28/Documentation/Configure.help linux-2.4.28-loop-AES-v3.0b/Documentation/Configure.help
--- linux-2.4.28/Documentation/Configure.help	Sun Dec 12 12:06:27 2004
+++ linux-2.4.28-loop-AES-v3.0b/Documentation/Configure.help	Sun Feb  6 18:45:39 2005
@@ -620,6 +620,21 @@
 
   If unsure, say N.
 
+AES encrypted loop device support
+CONFIG_BLK_DEV_LOOP_AES
+  If you want to use the AES encryption algorithm to encrypt loop
+  devices, say Y here. If you don't know what to do here, say N.
+
+loop encryption key scrubbing support
+CONFIG_BLK_DEV_LOOP_KEYSCRUB
+  Loop encryption key scrubbing moves and inverts key bits in
+  kernel RAM so that the thin oxide which forms the storage
+  capacitor dielectric of DRAM cells is not permitted to develop
+  a detectable property. For more info, see Peter Gutmann's paper:
+  http://www.cs.auckland.ac.nz/~pgut001/pubs/secure_del.html
+
+  Paranoid tinfoil hat crowd say Y here, everyone else say N.
+
 ATA/IDE/MFM/RLL support
 CONFIG_IDE
   If you say Y here, your kernel will be able to manage low cost mass
diff -urN linux-2.4.28/drivers/block/Config.in linux-2.4.28-loop-AES-v3.0b/drivers/block/Config.in
--- linux-2.4.28/drivers/block/Config.in	Sat Jul 31 18:45:19 2004
+++ linux-2.4.28-loop-AES-v3.0b/drivers/block/Config.in	Sun Feb  6 18:45:39 2005
@@ -42,6 +42,10 @@
 dep_tristate 'Promise SATA SX8 support' CONFIG_BLK_DEV_SX8 $CONFIG_PCI
 
 tristate 'Loopback device support' CONFIG_BLK_DEV_LOOP
+if [ "$CONFIG_BLK_DEV_LOOP" != "n" ]; then
+   bool '  AES encrypted loop device support' CONFIG_BLK_DEV_LOOP_AES
+   bool '  loop encryption key scrubbing support' CONFIG_BLK_DEV_LOOP_KEYSCRUB
+fi
 dep_tristate 'Network block device support' CONFIG_BLK_DEV_NBD $CONFIG_NET
 
 tristate 'RAM disk support' CONFIG_BLK_DEV_RAM
diff -urN linux-2.4.28/drivers/block/loop.c linux-2.4.28-loop-AES-v3.0b/drivers/block/loop.c
--- linux-2.4.28/drivers/block/loop.c	Sat Sep 13 07:57:22 2003
+++ linux-2.4.28-loop-AES-v3.0b/drivers/block/loop.c	Sun Feb  6 18:45:39 2005
@@ -2,7 +2,7 @@
  *  linux/drivers/block/loop.c
  *
  *  Written by Theodore Ts'o, 3/29/93
- * 
+ *
  * Copyright 1993 by Theodore Ts'o.  Redistribution of this file is
  * permitted under the GNU General Public License.
  *
@@ -21,12 +21,12 @@
  * Loadable modules and other fixes by AK, 1998
  *
  * Make real block number available to downstream transfer functions, enables
- * CBC (and relatives) mode encryption requiring unique IVs per data block. 
+ * CBC (and relatives) mode encryption requiring unique IVs per data block.
  * Reed H. Petty, rhp@draper.net
  *
  * Maximum number of loop devices now dynamic via max_loop module parameter.
  * Russell Kroll <rkroll@exploits.org> 19990701
- * 
+ *
  * Maximum number of loop devices when compiled-in now selectable by passing
  * max_loop=<1-255> to the kernel on boot.
  * Erik I. Bolsø, <eriki@himolde.no>, Oct 31, 1999
@@ -39,20 +39,30 @@
  * Support up to 256 loop devices
  * Heinz Mauelshagen <mge@sistina.com>, Feb 2002
  *
- * Still To Fix:
- * - Advisory locking is ignored here. 
- * - Should use an own CAP_* category instead of CAP_SYS_ADMIN 
+ * AES transfer added. IV is now passed as (512 byte) sector number.
+ * Jari Ruusu, May 18 2001
+ *
+ * External encryption module locking bug fixed.
+ * Ingo Rohloff <rohloff@in.tum.de>, June 21 2001
+ *
+ * Make device backed loop work with swap (pre-allocated buffers + queue rewrite).
+ * Jari Ruusu, September 2 2001
  *
- * WARNING/FIXME:
- * - The block number as IV passing to low level transfer functions is broken:
- *   it passes the underlying device's block number instead of the
- *   offset. This makes it change for a given block when the file is 
- *   moved/restored/copied and also doesn't work over NFS. 
- * AV, Feb 12, 2000: we pass the logical block number now. It fixes the
- *   problem above. Encryption modules that used to rely on the old scheme
- *   should just call ->i_mapping->bmap() to calculate the physical block
- *   number.
- */ 
+ * File backed code now uses file->f_op->read/write. Based on Andrew Morton's idea.
+ * Jari Ruusu, May 23 2002
+ *
+ * Backported struct loop_info64 ioctls from 2.6 kernels (64 bit offsets and
+ * 64 bit sizelimits). Added support for removing offset from IV computations.
+ * Jari Ruusu, September 21 2003
+ *
+ * Added support for MD5 IV computation and multi-key operation.
+ * Jari Ruusu, October 8 2003
+ *
+ *
+ * Still To Fix:
+ * - Advisory locking is ignored here.
+ * - Should use an own CAP_* category instead of CAP_SYS_ADMIN
+ */
 
 #include <linux/config.h>
 #include <linux/module.h>
@@ -71,10 +81,14 @@
 #include <linux/smp_lock.h>
 #include <linux/swap.h>
 #include <linux/slab.h>
+#include <linux/spinlock.h>
 
 #include <asm/uaccess.h>
+#include <asm/byteorder.h>
 
 #include <linux/loop.h>		
+#include "../misc/aes.h"
+#include "../misc/md5.h"
 
 #define MAJOR_NR LOOP_MAJOR
 
@@ -82,21 +96,31 @@
 static struct loop_device *loop_dev;
 static int *loop_sizes;
 static int *loop_blksizes;
+static int *loop_hardsizes;
 static devfs_handle_t devfs_handle;      /*  For the directory */
 
+#if defined(__x86_64__) && defined(CONFIG_IA32_EMULATION)
+# include <asm/ioctl32.h>
+# define IOCTL32_COMPATIBLE_PTR ((void*)sys_ioctl)
+#endif
+#if (defined(__sparc__) || defined(__sparc64__)) && defined(CONFIG_SPARC32_COMPAT)
+  extern int register_ioctl32_conversion(unsigned int cmd, int (*handler)(unsigned int, unsigned int, unsigned long, struct file *));
+  extern int unregister_ioctl32_conversion(unsigned int cmd);
+  extern int sys_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg);
+# define IOCTL32_COMPATIBLE_PTR ((void*)sys_ioctl)
+#endif
+
 /*
  * Transfer functions
  */
 static int transfer_none(struct loop_device *lo, int cmd, char *raw_buf,
 			 char *loop_buf, int size, int real_block)
 {
-	if (raw_buf != loop_buf) {
-		if (cmd == READ)
-			memcpy(loop_buf, raw_buf, size);
-		else
-			memcpy(raw_buf, loop_buf, size);
-	}
+	/* this code is only called from file backed loop  */
+	/* and that code expects this function to be no-op */
 
+	if (current->need_resched)
+		{set_current_state(TASK_RUNNING);schedule();}	/* no data is copied; only yield the CPU if a reschedule is pending */
 	return 0;
 }
 
@@ -118,12 +142,13 @@
 	keysize = lo->lo_encrypt_key_size;
 	for (i = 0; i < size; i++)
 		*out++ = *in++ ^ key[(i & 511) % keysize];
+	if (current->need_resched)
+		{set_current_state(TASK_RUNNING);schedule();}
 	return 0;
 }
 
 static int none_status(struct loop_device *lo, struct loop_info *info)
 {
-	lo->lo_flags |= LO_FLAGS_BH_REMAP;
 	return 0;
 }
 
@@ -134,336 +159,949 @@
 	return 0;
 }
 
-struct loop_func_table none_funcs = { 
+struct loop_func_table none_funcs = {
 	number: LO_CRYPT_NONE,
 	transfer: transfer_none,
 	init: none_status,
 }; 	
 
-struct loop_func_table xor_funcs = { 
+struct loop_func_table xor_funcs = {
 	number: LO_CRYPT_XOR,
 	transfer: transfer_xor,
-	init: xor_status
+	init: xor_status,
 }; 	
 
-/* xfer_funcs[0] is special - its release function is never called */ 
-struct loop_func_table *xfer_funcs[MAX_LO_CRYPT] = {
-	&none_funcs,
-	&xor_funcs  
+#if CONFIG_BLK_DEV_LOOP_AES
+#if CONFIG_BLK_DEV_LOOP_KEYSCRUB
+# define KEY_ALLOC_COUNT  128
+#else
+# define KEY_ALLOC_COUNT  64
+#endif
+
+typedef struct {
+    aes_context *keyPtr[KEY_ALLOC_COUNT];   /* key contexts; transfer_aes() uses keyPtr[sector & keyMask] */
+    unsigned    keyMask;                    /* 0 after allocation (single key), 0x3F after multiKeySetup() */
+#if CONFIG_BLK_DEV_LOOP_KEYSCRUB
+    u_int32_t   *partialMD5;                /* points at the currently valid 4-word slot of partialMD5buf */
+    u_int32_t   partialMD5buf[8];           /* two 4-word slots; keyScrubWork() copies to one and inverts the other */
+    rwlock_t    rwlock;                     /* read-locked by transfer_aes(), write-locked by scrubbing and key setup */
+    unsigned    reversed;                   /* 0 or 4: word offset of the valid partialMD5buf slot */
+    unsigned    blocked;                    /* non-zero while multiKeySetup() is rewriting keys; scrubbing is skipped */
+    struct timer_list timer;                /* scrub timer, armed by keyScrubTimerInit() */
+#else
+    u_int32_t   partialMD5[4];              /* MD5 state that seeds loop_compute_md5_iv_v3() in multi-key mode */
+#endif
+} AESmultiKey;
+
+#if CONFIG_BLK_DEV_LOOP_KEYSCRUB
+static void keyScrubWork(AESmultiKey *m)    /* move every key to its alternate slot and bit-invert the old copy */
+{
+    aes_context *a0, *a1;
+    u_int32_t *p;
+    int x, y, z;
+
+    z = m->keyMask + 1;                     /* number of keys in use (keyMask is 0 or 0x3F) */
+    for(x = 0; x < z; x++) {
+        a0 = m->keyPtr[x];                  /* slot currently holding the valid key */
+        a1 = m->keyPtr[x + z];              /* alternate slot for the same key */
+        memcpy(a1, a0, sizeof(aes_context));
+        m->keyPtr[x] = a1;                  /* valid copy now lives in the alternate slot */
+        m->keyPtr[x + z] = a0;
+        p = (u_int32_t *) a0;
+        y = sizeof(aes_context) / sizeof(u_int32_t);
+        while(y > 0) {                      /* invert every 32 bit word of the old copy */
+            *p ^= 0xFFFFFFFF;
+            p++;
+            y--;
+        }
+    }
+
+    x = m->reversed;    /* x is 0 or 4 */
+    m->reversed ^= 4;
+    y = m->reversed;    /* y is 4 or 0 */
+    p = &m->partialMD5buf[x];
+    memcpy(&m->partialMD5buf[y], p, 16);    /* same move-then-invert treatment for the MD5 seed */
+    m->partialMD5 = &m->partialMD5buf[y];
+    p[0] ^= 0xFFFFFFFF;
+    p[1] ^= 0xFFFFFFFF;
+    p[2] ^= 0xFFFFFFFF;
+    p[3] ^= 0xFFFFFFFF;
+
+    /* try to flush dirty cache data to RAM */
+#if defined(CONFIG_X86_64) || (defined(CONFIG_X86) && !defined(CONFIG_M386) && !defined(CONFIG_CPU_386))
+    __asm__ __volatile__ ("wbinvd": : :"memory");
+#else
+    mb();
+#endif
+}
+
+/* called only from loop thread process context */
+static void keyScrubThreadFn(AESmultiKey *m)
+{
+    write_lock(&m->rwlock);             /* excludes transfer_aes() readers while keys are moved */
+    if(!m->blocked) keyScrubWork(m);    /* skip this round while multiKeySetup() has scrubbing blocked */
+    write_unlock(&m->rwlock);
+}
+
+/* Arm the key scrub timer to fire keyScrubTimerFn() at jiffies + HZ.    */
+/* The handler prototype lives at file scope: ISO C does not allow a     */
+/* block scope function declaration to carry the 'static' specifier.     */
+static void keyScrubTimerFn(unsigned long);
+static void keyScrubTimerInit(struct loop_device *lo)
+{
+    AESmultiKey *m = (AESmultiKey *)lo->key_data;
+
+    init_timer(&m->timer);
+    m->timer.expires = jiffies + HZ;
+    m->timer.data = (unsigned long)lo;  /* handed back to keyScrubTimerFn() */
+    m->timer.function = keyScrubTimerFn;
+    add_timer(&m->timer);
+}
+
+/* called only from timer handler context */
+static void keyScrubTimerFn(unsigned long d)    /* d: the struct loop_device pointer stored in timer.data */
+{
+    struct loop_device *lo = (struct loop_device *)d;
+    extern void loop_add_keyscrub_fn(struct loop_device *, void (*)(void *), void *);
+
+    /* rw lock needs process context, so make loop thread do scrubbing */
+    loop_add_keyscrub_fn(lo, (void (*)(void*))keyScrubThreadFn, lo->key_data);
+    /* start timer again */
+    keyScrubTimerInit(lo);                      /* next expiry is jiffies + HZ */
+}
+#endif
+
+static AESmultiKey *allocMultiKey(void)     /* returns a zeroed key table with its first key chunk attached, or 0 on allocation failure */
+{
+    AESmultiKey *m;
+    aes_context *a;
+    int x = 0, n;
+
+    m = (AESmultiKey *) kmalloc(sizeof(AESmultiKey), GFP_KERNEL);
+    if(!m) return 0;
+    memset(m, 0, sizeof(AESmultiKey));      /* keyMask == 0 and all keyPtr[] == 0 from here on */
+#if CONFIG_BLK_DEV_LOOP_KEYSCRUB
+    m->partialMD5 = &m->partialMD5buf[0];
+    rwlock_init(&m->rwlock);
+    init_timer(&m->timer);
+    again:
+#endif
+
+    n = PAGE_SIZE / sizeof(aes_context);    /* key contexts per allocation chunk */
+    if(!n) n = 1;
+
+    a = (aes_context *) kmalloc(sizeof(aes_context) * n, GFP_KERNEL);
+    if(!a) {
+#if CONFIG_BLK_DEV_LOOP_KEYSCRUB
+        if(x) kfree(m->keyPtr[0]);          /* second pass failed: release the chunk from the first pass */
+#endif
+        kfree(m);
+        return 0;
+    }
+
+    while((x < KEY_ALLOC_COUNT) && n) {     /* hand out the consecutive contexts of this chunk */
+        m->keyPtr[x] = a;
+        a++;
+        x++;
+        n--;
+    }
+#if CONFIG_BLK_DEV_LOOP_KEYSCRUB
+    if(x < 2) goto again;                   /* keyScrubWork() needs keyPtr[0] and keyPtr[1] even in single-key mode */
+#endif
+    return m;
+}
+
+static void clearAndFreeMultiKey(AESmultiKey *m)    /* zero and free every key chunk, then the table itself */
+{
+    aes_context *a;
+    int x, n;
+
+#if CONFIG_BLK_DEV_LOOP_KEYSCRUB
+    /* stop scrub timer. loop thread was killed earlier */
+    del_timer_sync(&m->timer);
+    /* make sure allocated keys are in original order */
+    if(m->reversed) keyScrubWork(m);
+#endif
+    n = PAGE_SIZE / sizeof(aes_context);    /* same chunk size as used by the allocating code */
+    if(!n) n = 1;
+
+    x = 0;
+    while(x < KEY_ALLOC_COUNT) {
+        a = m->keyPtr[x];                   /* first context of a chunk is the kmalloc'ed base address */
+        if(!a) break;                       /* first empty slot ends the list of allocated chunks */
+        memset(a, 0, sizeof(aes_context) * n);
+        kfree(a);
+        x += n;                             /* step to the base of the next chunk */
+    }
+
+    memset(m, 0, sizeof(AESmultiKey));
+    kfree(m);
+}
+
+static int multiKeySetup(struct loop_device *lo, unsigned char *k, int version3)   /* k: user space table of 64 keys, 32 bytes each, plus an IV seed key when version3 */
+{
+    AESmultiKey *m;
+    aes_context *a;
+    int x, y, n, err = 0;
+    union {
+        u_int32_t     w[16];
+        unsigned char b[64];
+    } un;                       /* stack staging buffer for key material copied from user space */
+
+    if(lo->lo_key_owner != current->uid && !capable(CAP_SYS_ADMIN))
+        return -EPERM;
+
+    m = (AESmultiKey *)lo->key_data;
+    if(!m) return -ENXIO;       /* no key table: single-key init has not been done */
+
+#if CONFIG_BLK_DEV_LOOP_KEYSCRUB
+    /* temporarily prevent loop thread from messing with keys */
+    write_lock(&m->rwlock);
+    m->blocked = 1;
+    /* make sure allocated keys are in original order */
+    if(m->reversed) keyScrubWork(m);
+    write_unlock(&m->rwlock);
+#endif
+    n = PAGE_SIZE / sizeof(aes_context);    /* key contexts per allocation chunk */
+    if(!n) n = 1;
+
+    x = 0;
+    while(x < KEY_ALLOC_COUNT) {
+        if(!m->keyPtr[x]) {                 /* slot not backed yet: allocate the chunk that starts here */
+            a = (aes_context *) kmalloc(sizeof(aes_context) * n, GFP_KERNEL);
+            if(!a) {
+                err = -ENOMEM;
+                goto error_out;
+            }
+            y = x;
+            while((y < (x + n)) && (y < KEY_ALLOC_COUNT)) {
+                m->keyPtr[y] = a;
+                a++;
+                y++;
+            }
+        }
+#if CONFIG_BLK_DEV_LOOP_KEYSCRUB
+        if(x >= 64) {                       /* slots 64..127 are scrub alternates: allocate only, no key */
+            x++;
+            continue;
+        }
+#endif
+        if(copy_from_user(&un.b[0], k, 32)) {
+            err = -EFAULT;
+            goto error_out;
+        }
+        aes_set_key(m->keyPtr[x], &un.b[0], lo->lo_encrypt_key_size, 0);
+        k += 32;
+        x++;
+    }
+
+    m->partialMD5[0] = 0x67452301;
+    m->partialMD5[1] = 0xefcdab89;
+    m->partialMD5[2] = 0x98badcfe;
+    m->partialMD5[3] = 0x10325476;
+    if(version3) {
+        /* only first 128 bits of iv-key is used */
+        if(copy_from_user(&un.b[0], k, 16)) {
+            err = -EFAULT;
+            goto error_out;
+        }
+#if defined(__BIG_ENDIAN)
+        un.w[0] = cpu_to_le32(un.w[0]);
+        un.w[1] = cpu_to_le32(un.w[1]);
+        un.w[2] = cpu_to_le32(un.w[2]);
+        un.w[3] = cpu_to_le32(un.w[3]);
+#endif
+        memset(&un.b[16], 0, 48);
+        md5_transform_CPUbyteorder(&m->partialMD5[0], &un.w[0]);    /* fold the IV seed key into the MD5 start state */
+        lo->lo_flags |= 0x080000;  /* multi-key-v3 (info exported to user space) */
+    }
+
+    m->keyMask = 0x3F;          /* range 0...63 */
+    lo->lo_flags |= 0x100000;   /* multi-key (info exported to user space) */
+error_out:
+    memset(&un, 0, sizeof(un)); /* wipe staged key material on success and on every error exit */
+#if CONFIG_BLK_DEV_LOOP_KEYSCRUB
+    /* re-enable loop thread key scrubbing */
+    write_lock(&m->rwlock);
+    m->blocked = 0;
+    write_unlock(&m->rwlock);
+#endif
+    return err;                 /* 0, -ENOMEM or -EFAULT here; -EPERM / -ENXIO return earlier */
+}
+
+void loop_compute_sector_iv(int devSect, u_int32_t *ivout)
+{
+    memset(&ivout[1], 0, 3 * sizeof(u_int32_t));    /* words 1..3 of the IV are zero */
+    ivout[0] = cpu_to_le32(devSect);                /* word 0: sector number, little endian */
+}
+
+void loop_compute_md5_iv_v3(int devSect, u_int32_t *ivout, u_int32_t *data)    /* ivout: MD5 state in, IV out; data: 496 bytes are read */
+{
+    int         x;
+#if defined(__BIG_ENDIAN)
+    int         y, e;
+#endif
+    u_int32_t   buf[16];        /* one 64 byte MD5 input block */
+
+#if defined(__BIG_ENDIAN)
+    y = 7;                      /* 8 passes: seven full 64 byte blocks, then the final padded block */
+    e = 16;
+    do {
+        if (!y) {
+            e = 12;             /* last pass takes only 12 data words (48 bytes) */
+            /* md5_transform_CPUbyteorder wants data in CPU byte order */
+            /* devSect is already in CPU byte order -- no need to convert */
+            /* 32 bits of sector number + 24 zero bits */
+            buf[12] = devSect;
+            buf[13] = 0x80000000;
+            /* 4024 bits == 31 * 128 bit plaintext blocks + 56 bits of sector number */
+            buf[14] = 4024;
+            buf[15] = 0;
+        }
+        x = 0;
+        do {
+            buf[x    ] = cpu_to_le32(data[0]);
+            buf[x + 1] = cpu_to_le32(data[1]);
+            buf[x + 2] = cpu_to_le32(data[2]);
+            buf[x + 3] = cpu_to_le32(data[3]);
+            x += 4;
+            data += 4;
+        } while (x < e);
+        md5_transform_CPUbyteorder(&ivout[0], &buf[0]);
+    } while (--y >= 0);
+    ivout[0] = cpu_to_le32(ivout[0]);
+    ivout[1] = cpu_to_le32(ivout[1]);
+    ivout[2] = cpu_to_le32(ivout[2]);
+    ivout[3] = cpu_to_le32(ivout[3]);
+#else
+    x = 6;                      /* 7 passes over full 64 byte blocks, hashed in place */
+    do {
+        md5_transform_CPUbyteorder(&ivout[0], data);
+        data += 16;
+    } while (--x >= 0);
+    memcpy(buf, data, 48);      /* remaining 48 data bytes go into the final padded block */
+    /* md5_transform_CPUbyteorder wants data in CPU byte order */
+    /* devSect is already in CPU byte order -- no need to convert */
+    /* 32 bits of sector number + 24 zero bits */
+    buf[12] = devSect;
+    buf[13] = 0x80000000;
+    /* 4024 bits == 31 * 128 bit plaintext blocks + 56 bits of sector number */
+    buf[14] = 4024;
+    buf[15] = 0;
+    md5_transform_CPUbyteorder(&ivout[0], &buf[0]);
+#endif
+}
+
+/* compatibility wrapper for old external cipher modules: fixed start state */
+void loop_compute_md5_iv(int devSect, u_int32_t *ivout, u_int32_t *data)
+{
+    static const u_int32_t md5Init[4] =
+        { 0x67452301, 0xefcdab89, 0x98badcfe, 0x10325476 };
+
+    memcpy(ivout, md5Init, sizeof(md5Init));
+    loop_compute_md5_iv_v3(devSect, ivout, data);
+}
+
+/* Some external modules do not know if md5_transform_CPUbyteorder() */
+/* is asmlinkage or not, so here is C language wrapper for them. */
+void md5_transform_CPUbyteorder_C(u_int32_t *hash, u_int32_t const *in)
+{
+    md5_transform_CPUbyteorder(hash, in);   /* plain pass-through, arguments unchanged */
+}
+
+static int transfer_aes(struct loop_device *lo, int cmd, char *raw_buf,
+          char *loop_buf, int size, int devSect)
+{
+    aes_context     *a;
+    AESmultiKey     *m;
+    int             x;
+    unsigned        y;
+    u_int32_t       iv[8];          /* two 128 bit chaining values: iv[0..3] and iv[4..7] */
+
+    if(!size || (size & 511)) {     /* only whole 512 byte sectors are accepted */
+        return -EINVAL;
+    }
+    m = (AESmultiKey *)lo->key_data;
+    y = m->keyMask;                 /* 0 = single-key mode, non-zero = multi-key mode */
+    if(cmd == READ) {
+        while(size) {               /* decrypt one 512 byte sector per pass */
+#if CONFIG_BLK_DEV_LOOP_KEYSCRUB
+            read_lock(&m->rwlock);
+#endif
+            a = m->keyPtr[((unsigned)devSect) & y];     /* key selected by low sector bits */
+            if(y) {
+                memcpy(&iv[0], raw_buf, 16);            /* ciphertext block 0 chains into block 1 */
+                raw_buf += 16;                          /* start at block 1; block 0 is done last */
+                loop_buf += 16;
+            } else {
+                loop_compute_sector_iv(devSect, &iv[0]);
+            }
+            x = 15;                 /* 16 passes, two 16 byte blocks each */
+            do {
+                memcpy(&iv[4], raw_buf, 16);            /* keep ciphertext: it chains into the following block */
+                aes_decrypt(a, raw_buf, loop_buf);
+                *((u_int32_t *)(&loop_buf[ 0])) ^= iv[0];
+                *((u_int32_t *)(&loop_buf[ 4])) ^= iv[1];
+                *((u_int32_t *)(&loop_buf[ 8])) ^= iv[2];
+                *((u_int32_t *)(&loop_buf[12])) ^= iv[3];
+                if(y && !x) {
+                    raw_buf -= 496;                     /* last pass in multi-key mode: go back to block 0 */
+                    loop_buf -= 496;
+                    memcpy(&iv[4], &m->partialMD5[0], 16);
+                    loop_compute_md5_iv_v3(devSect, &iv[4], (u_int32_t *)(&loop_buf[16]));   /* IV from decrypted blocks 1..31 + sector */
+                } else {
+                    raw_buf += 16;
+                    loop_buf += 16;
+                    memcpy(&iv[0], raw_buf, 16);
+                }
+                aes_decrypt(a, raw_buf, loop_buf);
+                *((u_int32_t *)(&loop_buf[ 0])) ^= iv[4];
+                *((u_int32_t *)(&loop_buf[ 4])) ^= iv[5];
+                *((u_int32_t *)(&loop_buf[ 8])) ^= iv[6];
+                *((u_int32_t *)(&loop_buf[12])) ^= iv[7];
+                if(y && !x) {
+                    raw_buf += 512;                     /* from block 0 straight to the next sector */
+                    loop_buf += 512;
+                } else {
+                    raw_buf += 16;
+                    loop_buf += 16;
+                }
+            } while(--x >= 0);
+#if CONFIG_BLK_DEV_LOOP_KEYSCRUB
+            read_unlock(&m->rwlock);
+#endif
+            if(current->need_resched) {set_current_state(TASK_RUNNING);schedule();}
+            size -= 512;
+            devSect++;
+        }
+    } else {
+        while(size) {               /* encrypt one 512 byte sector per pass */
+#if CONFIG_BLK_DEV_LOOP_KEYSCRUB
+            read_lock(&m->rwlock);
+#endif
+            a = m->keyPtr[((unsigned)devSect) & y];     /* key selected by low sector bits */
+            if(y) {
+                /* on 2.4 and later kernels, real raw_buf is not doing */
+                /* any writes now so it can be used as temp buffer */
+                memcpy(raw_buf, loop_buf, 512);
+                memcpy(&iv[0], &m->partialMD5[0], 16);
+                loop_compute_md5_iv_v3(devSect, &iv[0], (u_int32_t *)(&raw_buf[16]));   /* IV from plaintext blocks 1..31 + sector */
+                x = 15;             /* 16 passes, two 16 byte blocks each, encrypted in place in raw_buf */
+                do {
+                    iv[0] ^= *((u_int32_t *)(&raw_buf[ 0]));
+                    iv[1] ^= *((u_int32_t *)(&raw_buf[ 4]));
+                    iv[2] ^= *((u_int32_t *)(&raw_buf[ 8]));
+                    iv[3] ^= *((u_int32_t *)(&raw_buf[12]));
+                    aes_encrypt(a, (unsigned char *)(&iv[0]), raw_buf);
+                    memcpy(&iv[0], raw_buf, 16);        /* ciphertext becomes the next chaining value */
+                    raw_buf += 16;
+                    iv[0] ^= *((u_int32_t *)(&raw_buf[ 0]));
+                    iv[1] ^= *((u_int32_t *)(&raw_buf[ 4]));
+                    iv[2] ^= *((u_int32_t *)(&raw_buf[ 8]));
+                    iv[3] ^= *((u_int32_t *)(&raw_buf[12]));
+                    aes_encrypt(a, (unsigned char *)(&iv[0]), raw_buf);
+                    memcpy(&iv[0], raw_buf, 16);
+                    raw_buf += 16;
+                } while(--x >= 0);
+                loop_buf += 512;
+            } else {
+                loop_compute_sector_iv(devSect, &iv[0]);
+                x = 15;             /* 16 passes, two 16 byte blocks each, loop_buf -> raw_buf */
+                do {
+                    iv[0] ^= *((u_int32_t *)(&loop_buf[ 0]));
+                    iv[1] ^= *((u_int32_t *)(&loop_buf[ 4]));
+                    iv[2] ^= *((u_int32_t *)(&loop_buf[ 8]));
+                    iv[3] ^= *((u_int32_t *)(&loop_buf[12]));
+                    aes_encrypt(a, (unsigned char *)(&iv[0]), raw_buf);
+                    memcpy(&iv[0], raw_buf, 16);        /* ciphertext becomes the next chaining value */
+                    loop_buf += 16;
+                    raw_buf += 16;
+                    iv[0] ^= *((u_int32_t *)(&loop_buf[ 0]));
+                    iv[1] ^= *((u_int32_t *)(&loop_buf[ 4]));
+                    iv[2] ^= *((u_int32_t *)(&loop_buf[ 8]));
+                    iv[3] ^= *((u_int32_t *)(&loop_buf[12]));
+                    aes_encrypt(a, (unsigned char *)(&iv[0]), raw_buf);
+                    memcpy(&iv[0], raw_buf, 16);
+                    loop_buf += 16;
+                    raw_buf += 16;
+                } while(--x >= 0);
+            }
+#if CONFIG_BLK_DEV_LOOP_KEYSCRUB
+            read_unlock(&m->rwlock);
+#endif
+            if(current->need_resched) {set_current_state(TASK_RUNNING);schedule();}
+            size -= 512;
+            devSect++;
+        }
+    }
+    return(0);
+}
+
+static int keySetup_aes(struct loop_device *lo, struct loop_info *info)    /* 'init' hook of funcs_aes; returns 0 or -ENOMEM */
+{
+    AESmultiKey     *m;
+    union {
+        u_int32_t     w[8]; /* needed for 4 byte alignment for b[] */
+        unsigned char b[32];
+    } un;
+
+    lo->key_data = m = allocMultiKey();
+    if(!m) return(-ENOMEM);
+    memcpy(&un.b[0], &info->lo_encrypt_key[0], 32);
+    aes_set_key(m->keyPtr[0], &un.b[0], info->lo_encrypt_key_size, 0);    /* single key goes in slot 0; keyMask stays 0 */
+    memset(&info->lo_encrypt_key[0], 0, sizeof(info->lo_encrypt_key));   /* wipe the key from the caller's loop_info */
+    memset(&un.b[0], 0, 32);                                            /* wipe the stack copy of the key */
+#if CONFIG_BLK_DEV_LOOP_KEYSCRUB
+    keyScrubTimerInit(lo);
+#endif
+    return(0);
+}
+
+static int keyClean_aes(struct loop_device *lo)
+{
+    AESmultiKey *m = (AESmultiKey *)lo->key_data;
+    if(!m) return(0);           /* no key table attached: nothing to release */
+    clearAndFreeMultiKey(m);    /* zeroes, then frees, all key contexts */
+    lo->key_data = 0;
+    return(0);
+}
+
+static int handleIoctl_aes(struct loop_device *lo, int cmd, unsigned long arg)
+{
+    unsigned char *userKeys = (unsigned char *)arg;
+    int isV3;
+
+    /* only the two multi-key setup ioctls are handled by this cipher */
+    if(cmd == LOOP_MULTI_KEY_SETUP) {
+        isV3 = 0;
+    } else if(cmd == LOOP_MULTI_KEY_SETUP_V3) {
+        isV3 = 1;
+    } else {
+        return -EINVAL;
+    }
+    /* arg is the user space address of the key table */
+    return multiKeySetup(lo, userKeys, isV3);
+}
+
+static struct loop_func_table funcs_aes = {
+    number:     16,     /* 16 == AES */
+    transfer:   transfer_aes,       /* per-request encrypt / decrypt */
+    init:       keySetup_aes,       /* allocate key table, install first key */
+    release:    keyClean_aes,       /* wipe and free key table */
+    ioctl:      handleIoctl_aes     /* multi-key setup ioctls */
 };
 
-#define MAX_DISK_SIZE 1024*1024*1024
+EXPORT_SYMBOL(loop_compute_sector_iv);
+EXPORT_SYMBOL(loop_compute_md5_iv_v3);
+EXPORT_SYMBOL(loop_compute_md5_iv);
+EXPORT_SYMBOL(md5_transform_CPUbyteorder_C);
+#endif /* CONFIG_BLK_DEV_LOOP_AES */
 
-static int compute_loop_size(struct loop_device *lo, struct dentry * lo_dentry, kdev_t lodev)
-{
-	if (S_ISREG(lo_dentry->d_inode->i_mode))
-		return (lo_dentry->d_inode->i_size - lo->lo_offset) >> BLOCK_SIZE_BITS;
-	if (blk_size[MAJOR(lodev)])
-		return blk_size[MAJOR(lodev)][MINOR(lodev)] -
-                                (lo->lo_offset >> BLOCK_SIZE_BITS);
-	return MAX_DISK_SIZE;
-}
+/* xfer_funcs[0] is special - its release function is never called */
+struct loop_func_table *xfer_funcs[MAX_LO_CRYPT] = {
+	&none_funcs,			/* index 0 */
+	&xor_funcs,			/* index 1 */
+#if CONFIG_BLK_DEV_LOOP_AES
+	[LO_CRYPT_AES] = &funcs_aes,	/* built-in AES; entries not listed here stay NULL */
+#endif
+};
 
-static void figure_loop_size(struct loop_device *lo)
-{
-	loop_sizes[lo->lo_number] = compute_loop_size(lo,
-					lo->lo_backing_file->f_dentry,
-					lo->lo_device);
+/*
+ *  First number of 'lo_prealloc' is the default number of RAM pages
+ *  to pre-allocate for each device backed loop. Every (configured)
+ *  device backed loop pre-allocates this amount of RAM pages unless
+ *  later 'lo_prealloc' numbers provide an override. 'lo_prealloc'
+ *  overrides are defined in pairs: loop_index,number_of_pages
+ */
+static int lo_prealloc[9] = { 125, 999, 0, 999, 0, 999, 0, 999, 0 };
+#define LO_PREALLOC_MIN 4    /* minimum user defined pre-allocated RAM pages */
+#define LO_PREALLOC_MAX 512  /* maximum user defined pre-allocated RAM pages */
+
+#ifdef MODULE
+MODULE_PARM(lo_prealloc, "1-9i");
+MODULE_PARM_DESC(lo_prealloc, "Number of pre-allocated pages [,index,pages]...");
+#else
+static int __init lo_prealloc_setup(char *str)	/* handler for the "lo_prealloc=" boot option */
+{
+	int x, y, z;
+
+	for (x = 0; x < (sizeof(lo_prealloc) / sizeof(int)); x++) {
+		z = get_option(&str, &y);
+		if (z > 0)
+			lo_prealloc[x] = y;	/* a number was parsed: store it in slot x */
+		if (z < 2)
+			break;			/* presumably: no comma follows, so the list ends -- confirm against get_option() */
+	}
+	return 1;
 }
+__setup("lo_prealloc=", lo_prealloc_setup);
+#endif
 
-static int lo_send(struct loop_device *lo, struct buffer_head *bh, int bsize,
-		   loff_t pos)
+/*
+ * This is loop helper thread nice value in range
+ * from 0 (low priority) to -20 (high priority).
+ */
+#if defined(DEF_NICE) && defined(DEF_COUNTER)
+static int lo_nice = -20;   /* old scheduler default */
+#else
+static int lo_nice = -1;    /* O(1) scheduler default */
+#endif
+
+#ifdef MODULE
+MODULE_PARM(lo_nice, "1i");
+MODULE_PARM_DESC(lo_nice, "Loop thread scheduler nice (0 ... -20)");
+#else
+static int __init lo_nice_setup(char *str)	/* handler for the "lo_nice=" boot option */
 {
-	struct file *file = lo->lo_backing_file; /* kudos to NFsckingS */
-	struct address_space *mapping = file->f_dentry->d_inode->i_mapping;
-	struct address_space_operations *aops = mapping->a_ops;
-	struct page *page;
-	char *kaddr, *data;
-	unsigned long index;
-	unsigned size, offset;
-	int len;
-
-	down(&mapping->host->i_sem);
-	index = pos >> PAGE_CACHE_SHIFT;
-	offset = pos & (PAGE_CACHE_SIZE - 1);
-	len = bh->b_size;
-	data = bh->b_data;
-	while (len > 0) {
-		int IV = index * (PAGE_CACHE_SIZE/bsize) + offset/bsize;
-		int transfer_result;
-
-		size = PAGE_CACHE_SIZE - offset;
-		if (size > len)
-			size = len;
-
-		page = grab_cache_page(mapping, index);
-		if (!page)
-			goto fail;
-		kaddr = kmap(page);
-		if (aops->prepare_write(file, page, offset, offset+size))
-			goto unlock;
-		flush_dcache_page(page);
-		transfer_result = lo_do_transfer(lo, WRITE, kaddr + offset, data, size, IV);
-		if (transfer_result) {
-			/*
-			 * The transfer failed, but we still write the data to
-			 * keep prepare/commit calls balanced.
-			 */
-			printk(KERN_ERR "loop: transfer error block %ld\n", index);
-			memset(kaddr + offset, 0, size);
-		}
-		if (aops->commit_write(file, page, offset, offset+size))
-			goto unlock;
-		if (transfer_result)
-			goto unlock;
-		kunmap(page);
-		data += size;
-		len -= size;
-		offset = 0;
-		index++;
-		pos += size;
-		UnlockPage(page);
-		page_cache_release(page);
-	}
-	up(&mapping->host->i_sem);
-	return 0;
+	int y;
 
-unlock:
-	kunmap(page);
-	UnlockPage(page);
-	page_cache_release(page);
-fail:
-	up(&mapping->host->i_sem);
-	return -1;
+	if (get_option(&str, &y) == 1)
+		lo_nice = y;	/* value is stored as given; no range check is done here */
+	return 1;
 }
+__setup("lo_nice=", lo_nice_setup);
+#endif
 
-struct lo_read_data {
-	struct loop_device *lo;
-	char *data;
-	int bsize;
-};
+typedef struct {
+	struct buffer_head	**q0;	/* queue pointers, tried by loop_get_bh() in q0, q1, q2 order */
+	struct buffer_head	**q1;
+	struct buffer_head	**q2;
+	int			x0;	/* value stored in *list_nr when the buffer came from q0 */
+	int			x1;	/* ... from q1 */
+	int			x2;	/* ... from q2 */
+} que_look_up_table;
 
-static int lo_read_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size)
+static void loop_prealloc_cleanup(struct loop_device *lo)	/* free every buffer head and page on lo->lo_bh_free */
 {
-	char *kaddr;
-	unsigned long count = desc->count;
-	struct lo_read_data *p = (struct lo_read_data*)desc->buf;
-	struct loop_device *lo = p->lo;
-	int IV = page->index * (PAGE_CACHE_SIZE/p->bsize) + offset/p->bsize;
-
-	if (size > count)
-		size = count;
+	struct buffer_head *bh;
 
-	kaddr = kmap(page);
-	if (lo_do_transfer(lo, READ, kaddr + offset, p->data, size, IV)) {
-		size = 0;
-		printk(KERN_ERR "loop: transfer error block %ld\n",page->index);
-		desc->error = -EINVAL;
+	while ((bh = lo->lo_bh_free)) {
+		__free_page(bh->b_page);
+		lo->lo_bh_free = bh->b_reqnext;	/* unlink before the buffer head is released */
+		bh->b_reqnext = NULL;
+		kmem_cache_free(bh_cachep, bh);
 	}
-	kunmap(page);
-	
-	desc->count = count - size;
-	desc->written += size;
-	p->data += size;
-	return size;
-}
-
-static int lo_receive(struct loop_device *lo, struct buffer_head *bh, int bsize,
-		      loff_t pos)
-{
-	struct lo_read_data cookie;
-	read_descriptor_t desc;
-	struct file *file;
-
-	cookie.lo = lo;
-	cookie.data = bh->b_data;
-	cookie.bsize = bsize;
-	desc.written = 0;
-	desc.count = bh->b_size;
-	desc.buf = (char*)&cookie;
-	desc.error = 0;
-	spin_lock_irq(&lo->lo_lock);
-	file = lo->lo_backing_file;
-	spin_unlock_irq(&lo->lo_lock);
-	do_generic_file_read(file, &pos, &desc, lo_read_actor);
-	return desc.error;
 }
 
-static inline int loop_get_bs(struct loop_device *lo)
+static int loop_prealloc_init(struct loop_device *lo, int y)	/* y: pages to pre-allocate, 0 = use lo_prealloc[]; returns 0 or 1 on failure */
 {
-	int bs = 0;
+	struct buffer_head *bh;
+	int x;
 
-	if (blksize_size[MAJOR(lo->lo_device)])
-		bs = blksize_size[MAJOR(lo->lo_device)][MINOR(lo->lo_device)];
-	if (!bs)
-		bs = BLOCK_SIZE;	
+	if(!y) {
+		y = lo_prealloc[0];	/* global default */
+		for (x = 1; x < (sizeof(lo_prealloc) / sizeof(int)); x += 2) {
+			if (lo_prealloc[x + 1] && (lo->lo_number == lo_prealloc[x])) {
+				y = lo_prealloc[x + 1];	/* per-device override: (index, pages) pair */
+				break;
+			}
+		}
+	}
+	lo->lo_bh_flsh = (y * 3) / 4;	/* three quarters of the pre-allocated count */
 
-	return bs;
+	for (x = 0; x < y; x++) {
+		bh = kmem_cache_alloc(bh_cachep, SLAB_KERNEL);
+		if (!bh) {
+			loop_prealloc_cleanup(lo);	/* undo everything allocated so far */
+			return 1;
+		}
+		bh->b_page = alloc_page(GFP_KERNEL);
+		if (!bh->b_page) {
+			bh->b_reqnext = NULL;
+			kmem_cache_free(bh_cachep, bh);
+			loop_prealloc_cleanup(lo);
+			return 1;
+		}
+		bh->b_reqnext = lo->lo_bh_free;	/* push onto the free list */
+		lo->lo_bh_free = bh;
+	}
+	return 0;
 }
 
-static inline unsigned long loop_get_iv(struct loop_device *lo,
-					unsigned long sector)
+static void loop_add_queue_last(struct loop_device *lo, struct buffer_head *bh, struct buffer_head **q)	/* append bh to the circular queue whose last element is *q */
 {
-	int bs = loop_get_bs(lo);
-	unsigned long offset, IV;
+	unsigned long flags;
 
-	IV = sector / (bs >> 9) + lo->lo_offset / bs;
-	offset = ((sector % (bs >> 9)) << 9) + lo->lo_offset % bs;
-	if (offset >= bs)
-		IV++;
+	spin_lock_irqsave(&lo->lo_lock, flags);
+	if (*q) {
+		bh->b_reqnext = (*q)->b_reqnext;	/* new element points at the first element */
+		(*q)->b_reqnext = bh;
+	} else {
+		bh->b_reqnext = bh;			/* single element points at itself */
+	}
+	*q = bh;					/* bh is now the last element */
+	spin_unlock_irqrestore(&lo->lo_lock, flags);
 
-	return IV;
+	if (waitqueue_active(&lo->lo_bh_wait))
+		wake_up_interruptible(&lo->lo_bh_wait);
 }
 
-static int do_bh_filebacked(struct loop_device *lo, struct buffer_head *bh, int rw)
+static void loop_add_queue_first(struct loop_device *lo, struct buffer_head *bh, struct buffer_head **q)	/* insert bh at the front of the circular queue whose last element is *q */
 {
-	loff_t pos;
-	int ret;
-
-	pos = ((loff_t) bh->b_rsector << 9) + lo->lo_offset;
-
-	if (rw == WRITE)
-		ret = lo_send(lo, bh, loop_get_bs(lo), pos);
-	else
-		ret = lo_receive(lo, bh, loop_get_bs(lo), pos);
-
-	return ret;
+	spin_lock_irq(&lo->lo_lock);
+	if (*q) {
+		bh->b_reqnext = (*q)->b_reqnext;
+		(*q)->b_reqnext = bh;	/* last element now points at bh; *q itself is unchanged */
+	} else {
+		bh->b_reqnext = bh;
+		*q = bh;
+	}
+	spin_unlock_irq(&lo->lo_lock);
 }
 
-static void loop_end_io_transfer(struct buffer_head *bh, int uptodate);
-static void loop_put_buffer(struct buffer_head *bh)
+static struct buffer_head *loop_get_bh(struct loop_device *lo, int *list_nr,
+					que_look_up_table *qt)
 {
-	/*
-	 * check b_end_io, may just be a remapped bh and not an allocated one
-	 */
-	if (bh && bh->b_end_io == loop_end_io_transfer) {
-		__free_page(bh->b_page);
-		kmem_cache_free(bh_cachep, bh);
+	struct buffer_head *bh = NULL, *last;	/* NULL is returned when all three queues are empty */
+
+	spin_lock_irq(&lo->lo_lock);
+	if ((last = *qt->q0)) {			/* first choice queue */
+		bh = last->b_reqnext;		/* element after the last one is the first one */
+		if (bh == last)
+			*qt->q0 = NULL;		/* that was the only element */
+		else
+			last->b_reqnext = bh->b_reqnext;
+		bh->b_reqnext = NULL;
+		*list_nr = qt->x0;
+	} else if ((last = *qt->q1)) {		/* second choice queue, same unlink steps */
+		bh = last->b_reqnext;
+		if (bh == last)
+			*qt->q1 = NULL;
+		else
+			last->b_reqnext = bh->b_reqnext;
+		bh->b_reqnext = NULL;
+		*list_nr = qt->x1;
+	} else if ((last = *qt->q2)) {		/* third choice queue, same unlink steps */
+		bh = last->b_reqnext;
+		if (bh == last)
+			*qt->q2 = NULL;
+		else
+			last->b_reqnext = bh->b_reqnext;
+		bh->b_reqnext = NULL;
+		*list_nr = qt->x2;
 	}
+	spin_unlock_irq(&lo->lo_lock);
+	return bh;
 }
 
-/*
- * Add buffer_head to back of pending list
- */
-static void loop_add_bh(struct loop_device *lo, struct buffer_head *bh)
+static void loop_put_buffer(struct loop_device *lo, struct buffer_head *b)
 {
 	unsigned long flags;
+	int wk;	/* snapshot of lo_bh_need taken under the lock */
 
 	spin_lock_irqsave(&lo->lo_lock, flags);
-	if (lo->lo_bhtail) {
-		lo->lo_bhtail->b_reqnext = bh;
-		lo->lo_bhtail = bh;
-	} else
-		lo->lo_bh = lo->lo_bhtail = bh;
+	b->b_reqnext = lo->lo_bh_free;	/* push b onto the singly linked free list */
+	lo->lo_bh_free = b;
+	wk = lo->lo_bh_need;	/* set by loop_get_buffer when the thread found no free buffer */
 	spin_unlock_irqrestore(&lo->lo_lock, flags);
 
-	up(&lo->lo_bh_mutex);
+	if (wk && waitqueue_active(&lo->lo_bh_wait))	/* wake only if a buffer was wanted and something waits */
+		wake_up_interruptible(&lo->lo_bh_wait);
 }
 
-/*
- * Grab first pending buffer
- */
-static struct buffer_head *loop_get_bh(struct loop_device *lo)
+static void loop_end_io_transfer_wr(struct buffer_head *bh, int uptodate)
 {
-	struct buffer_head *bh;
-
-	spin_lock_irq(&lo->lo_lock);
-	if ((bh = lo->lo_bh)) {
-		if (bh == lo->lo_bhtail)
-			lo->lo_bhtail = NULL;
-		lo->lo_bh = bh->b_reqnext;
-		bh->b_reqnext = NULL;
-	}
-	spin_unlock_irq(&lo->lo_lock);
+	struct loop_device *lo = &loop_dev[MINOR(bh->b_dev)];	/* loop_get_buffer stored rbh->b_rdev in b_dev */
+	struct buffer_head *rbh = bh->b_private;	/* original request saved by loop_get_buffer */
 
-	return bh;
+	rbh->b_reqnext = NULL;
+	rbh->b_end_io(rbh, uptodate);	/* complete the original request with the same status */
+	loop_put_buffer(lo, bh);	/* return the intermediate buffer to the free list */
+	if (atomic_dec_and_test(&lo->lo_pending))	/* last pending request gone: wake lo_bh_wait sleepers */
+		wake_up_interruptible(&lo->lo_bh_wait);
 }
 
-/*
- * when buffer i/o has completed. if BH_Dirty is set, this was a WRITE
- * and lo->transfer stuff has already been done. if not, it was a READ
- * so queue it for the loop thread and let it do the transfer out of
- * b_end_io context (we don't want to do decrypt of a page with irqs
- * disabled)
- */
-static void loop_end_io_transfer(struct buffer_head *bh, int uptodate)
+static void loop_end_io_transfer_rd(struct buffer_head *bh, int uptodate)
 {
 	struct loop_device *lo = &loop_dev[MINOR(bh->b_dev)];
 
-	if (!uptodate || test_bit(BH_Dirty, &bh->b_state)) {
-		struct buffer_head *rbh = bh->b_private;
-
-		rbh->b_end_io(rbh, uptodate);
-		if (atomic_dec_and_test(&lo->lo_pending))
-			up(&lo->lo_bh_mutex);
-		loop_put_buffer(bh);
-	} else
-		loop_add_queue_last(lo, bh, &lo->lo_bh_que0);
+	if (!uptodate)	/* failed read: nothing to transfer, finish it like a write */
+		loop_end_io_transfer_wr(bh, uptodate);
+	else	/* successful read: hand bh to the loop thread through lo_bh_que0 */
+		loop_add_queue_last(lo, bh, &lo->lo_bh_que0);
 }
 
 static struct buffer_head *loop_get_buffer(struct loop_device *lo,
-					   struct buffer_head *rbh)
+		struct buffer_head *rbh, int from_thread, int rw)
 {
 	struct buffer_head *bh;
+	struct page *p;	/* keeps the buffer's page across the memset below */
+	unsigned long flags;
 
-	/*
-	 * for xfer_funcs that can operate on the same bh, do that
-	 */
-	if (lo->lo_flags & LO_FLAGS_BH_REMAP) {
-		bh = rbh;
-		goto out_bh;
+	spin_lock_irqsave(&lo->lo_lock, flags);
+	bh = lo->lo_bh_free;	/* take the first entry of the free list, if any */
+	if (bh) {
+		lo->lo_bh_free = bh->b_reqnext;
+		if (from_thread)
+			lo->lo_bh_need = 0;	/* thread got a buffer: loop_put_buffer need not wake it */
+	} else {
+		if (from_thread)
+			lo->lo_bh_need = 1;	/* thread found none: ask loop_put_buffer for a wakeup */
 	}
+	spin_unlock_irqrestore(&lo->lo_lock, flags);
+	if (!bh)
+		return (struct buffer_head *)0;	/* no free buffer; callers queue or requeue the request */
 
-	do {
-		bh = kmem_cache_alloc(bh_cachep, SLAB_NOIO);
-		if (bh)
-			break;
-
-		run_task_queue(&tq_disk);
-		set_current_state(TASK_INTERRUPTIBLE);
-		schedule_timeout(HZ);
-	} while (1);
-	memset(bh, 0, sizeof(*bh));
+	p = bh->b_page;
+	memset(bh, 0, sizeof(struct buffer_head));	/* reset every field except the page restored next */
+	bh->b_page = p;
 
+	bh->b_private = rbh;	/* completion handlers find the original request here */
 	bh->b_size = rbh->b_size;
 	bh->b_dev = rbh->b_rdev;
+	bh->b_rdev = lo->lo_device;	/* I/O is issued against the backing device */
 	bh->b_state = (1 << BH_Req) | (1 << BH_Mapped) | (1 << BH_Lock);
+	bh->b_data = page_address(bh->b_page);
+	bh->b_end_io = (rw == WRITE) ? loop_end_io_transfer_wr : loop_end_io_transfer_rd;	/* completion handler chosen by direction */
+	bh->b_rsector = rbh->b_rsector + lo->lo_offs_sec;	/* shift by the loop offset in 512-byte sectors */
+	init_waitqueue_head(&bh->b_wait);
+
+	return bh;
+}
+
+static int figure_loop_size(struct loop_device *lo)
+{
+	loff_t size, offs;	/* both in bytes until the final shift */
+	unsigned int x;
+	int err = 0;	/* last problem seen; the size is still recorded on error */
+	kdev_t lodev = lo->lo_device;
+
+	offs = lo->lo_offset;
+	if (S_ISREG(lo->lo_backing_file->f_dentry->d_inode->i_mode)) {
+		size = lo->lo_backing_file->f_dentry->d_inode->i_size;
+	} else {
+		offs &= ~((loff_t)511);	/* round the offset down to a 512-byte boundary */
+		if (blk_size[MAJOR(lodev)])
+			size = (loff_t)(blk_size[MAJOR(lodev)][MINOR(lodev)]) << BLOCK_SIZE_BITS;
+		else
+			size = 1024*1024*1024; /* unknown size */
+	}
+	if ((offs > 0) && (offs < size)) {
+		size -= offs;
+	} else {
+		if (offs)	/* offset is negative or not below the size: reject and reset it */
+			err = -EINVAL;
+		lo->lo_offset = 0;
+		lo->lo_offs_sec = lo->lo_iv_remove = 0;
+	}
+	if ((lo->lo_sizelimit > 0) && (lo->lo_sizelimit <= size)) {
+		size = lo->lo_sizelimit;
+	} else {
+		if (lo->lo_sizelimit)	/* limit is out of range: reject and clear it */
+			err = -EINVAL;
+		lo->lo_sizelimit = 0;
+	}
+	size >>= BLOCK_SIZE_BITS;	/* bytes -> blocks */
 
 	/*
-	 * easy way out, although it does waste some memory for < PAGE_SIZE
-	 * blocks... if highmem bounce buffering can get away with it,
-	 * so can we :-)
+	 * Unfortunately, if we want to do I/O on the device,
+	 * the number of 1024-byte blocks has to fit into unsigned int
 	 */
-	do {
-		bh->b_page = alloc_page(GFP_NOIO);
-		if (bh->b_page)
-			break;
+	x = (unsigned int)size;
+	if ((loff_t)x != size) {	/* value changed by the narrowing: too big */
+		err = -EFBIG;
+		size = 0;
+	}
 
-		run_task_queue(&tq_disk);
-		set_current_state(TASK_INTERRUPTIBLE);
-		schedule_timeout(HZ);
-	} while (1);
+	loop_sizes[lo->lo_number] = size;
+	return err;	/* 0, -EINVAL (bad offset/sizelimit) or -EFBIG */
+}
 
-	bh->b_data = page_address(bh->b_page);
-	bh->b_end_io = loop_end_io_transfer;
-	bh->b_private = rbh;
-	init_waitqueue_head(&bh->b_wait);
+static int loop_file_io(struct file *file, char *buf, int size, loff_t *ppos, int w)
+{
+	mm_segment_t fs;	/* caller's address limit, restored after each f_op call */
+	int x, y, z;	/* x: last f_op result, y: bytes done so far, z: bytes still to go */
 
-out_bh:
-	bh->b_rsector = rbh->b_rsector + (lo->lo_offset >> 9);
-	spin_lock_irq(&lo->lo_lock);
-	bh->b_rdev = lo->lo_device;
-	spin_unlock_irq(&lo->lo_lock);
+	y = 0;
+	do {
+		z = size - y;
+		fs = get_fs();
+		set_fs(get_ds());
+		if (w)
+			x = file->f_op->write(file, buf + y, z, ppos);
+		else
+			x = file->f_op->read(file, buf + y, z, ppos);
+		set_fs(fs);
+		/* no progress: EOF on read, or a write that would otherwise spin forever */
+		if (!x)
+			return 1;
+		/* negative result: pause and retry the transient errors, fail on the rest */
+		if (x < 0) {
+			if ((x == -EAGAIN) || (x == -ENOMEM) || (x == -ERESTART) || (x == -EINTR)) {
+				run_task_queue(&tq_disk);
+				set_current_state(TASK_INTERRUPTIBLE);
+				schedule_timeout(HZ / 2);
+				continue;
+			}
+			return 1;
+		}
+		y += x;	/* short transfer: loop again for the remainder */
+	} while (y < size);
+	return 0;	/* 0 = all size bytes transferred; 1 = EOF, stalled write or hard error */
+}
 
-	return bh;
+static int do_bh_filebacked(struct loop_device *lo, struct buffer_head *bh, int rw)
+{
+	loff_t pos;	/* byte position in the backing file */
+	struct file *file = lo->lo_backing_file;
+	char *data, *buf;	/* data: mapped request buffer, buf: side that is read from / written to the file */
+	unsigned int size, len;	/* size: bytes in this pass, len: bytes still to process */
+	unsigned long IV;	/* sector number passed to the transfer function */
+
+	pos = ((loff_t) bh->b_rsector << 9) + lo->lo_offset;
+	buf = page_address(lo->lo_bh_free->b_page);	/* page of the first free-list buffer; read without lo_lock */
+	len = bh->b_size;
+	data = bh_kmap(bh);
+	IV = bh->b_rsector;
+	if (!lo->lo_iv_remove)	/* offset takes part in the IV unless lo_iv_remove is set */
+		IV += lo->lo_offs_sec;
+	while (len > 0) {
+		if (lo->lo_encrypt_type == LO_CRYPT_NONE) {
+			/* this code relies that NONE transfer is a no-op */
+			buf = data;	/* file I/O then goes straight to the request buffer */
+		}
+		size = PAGE_SIZE;	/* at most one page per pass */
+		if (size > len)
+			size = len;
+		if (rw == WRITE) {	/* write: transfer into buf first, then write buf to the file */
+			if (lo_do_transfer(lo, WRITE, buf, data, size, IV)) {
+				printk(KERN_ERR "loop%d: write transfer error, sector %lu\n", lo->lo_number, IV);
+				goto kunmap_and_out;
+			}
+			if (loop_file_io(file, buf, size, &pos, 1)) {
+				printk(KERN_ERR "loop%d: write i/o error, sector %lu\n", lo->lo_number, IV);
+				goto kunmap_and_out;
+			}
+		} else {	/* read: fill buf from the file first, then transfer into data */
+			if (loop_file_io(file, buf, size, &pos, 0)) {
+				printk(KERN_ERR "loop%d: read i/o error, sector %lu\n", lo->lo_number, IV);
+				goto kunmap_and_out;
+			}
+			if (lo_do_transfer(lo, READ, buf, data, size, IV)) {
+				printk(KERN_ERR "loop%d: read transfer error, sector %lu\n", lo->lo_number, IV);
+				goto kunmap_and_out;
+			}
+			flush_dcache_page(bh->b_page);
+		}
+		data += size;
+		len -= size;
+		IV += size >> 9;	/* advance by the number of 512-byte sectors handled */
+	}
+	bh_kunmap(bh);
+	return 0;	/* whole request processed */
+
+kunmap_and_out:
+	bh_kunmap(bh);
+	return 1;	/* transfer or file I/O failed; the error was already logged */
 }
 
 static int loop_make_request(request_queue_t *q, int rw, struct buffer_head *rbh)
 {
-	struct buffer_head *bh = NULL;
+	struct buffer_head *bh;
 	struct loop_device *lo;
-	unsigned long IV;
+	char *md;
 
+	set_current_state(TASK_RUNNING);
 	if (!buffer_locked(rbh))
 		BUG();
 
@@ -483,45 +1121,55 @@
 	} else if (rw == READA) {
 		rw = READ;
 	} else if (rw != READ) {
-		printk(KERN_ERR "loop: unknown command (%d)\n", rw);
+		printk(KERN_ERR "loop%d: unknown command (%d)\n", lo->lo_number, rw);
 		goto err;
 	}
 
-	rbh = blk_queue_bounce(q, rw, rbh);
-
 	/*
 	 * file backed, queue for loop_thread to handle
 	 */
 	if (lo->lo_flags & LO_FLAGS_DO_BMAP) {
-		/*
-		 * rbh locked at this point, noone else should clear
-		 * the dirty flag
-		 */
-		if (rw == WRITE)
-			set_bit(BH_Dirty, &rbh->b_state);
-		loop_add_bh(lo, rbh);
+		loop_add_queue_last(lo, rbh, (rw == WRITE) ? &lo->lo_bh_que1 : &lo->lo_bh_que0);
 		return 0;
 	}
 
 	/*
-	 * piggy old buffer on original, and submit for I/O
+	 * device backed, just remap rdev & rsector for NONE transfer
 	 */
-	bh = loop_get_buffer(lo, rbh);
-	IV = loop_get_iv(lo, rbh->b_rsector);
+	if (lo->lo_encrypt_type == LO_CRYPT_NONE) {
+		rbh->b_rsector += lo->lo_offs_sec;
+		rbh->b_rdev = lo->lo_device;
+		generic_make_request(rw, rbh);
+		if (atomic_dec_and_test(&lo->lo_pending))
+			wake_up_interruptible(&lo->lo_bh_wait);
+		return 0;
+	}
+
+	/*
+	 * device backed, start reads and writes now if buffer available
+	 */
+	bh = loop_get_buffer(lo, rbh, 0, rw);
+	if (!bh) {
+		/* just queue request and let thread handle alloc later */
+		loop_add_queue_last(lo, rbh, (rw == WRITE) ? &lo->lo_bh_que1 : &lo->lo_bh_que2);
+		return 0;
+	}
 	if (rw == WRITE) {
-		set_bit(BH_Dirty, &bh->b_state);
-		if (lo_do_transfer(lo, WRITE, bh->b_data, rbh->b_data,
-				   bh->b_size, IV))
+		int trv;
+		md = bh_kmap(rbh);
+		trv = lo_do_transfer(lo, WRITE, bh->b_data, md, bh->b_size, bh->b_rsector - lo->lo_iv_remove);
+		bh_kunmap(rbh);
+		if (trv) {
+			loop_put_buffer(lo, bh);
 			goto err;
+		}
 	}
-
 	generic_make_request(rw, bh);
 	return 0;
 
 err:
 	if (atomic_dec_and_test(&lo->lo_pending))
-		up(&lo->lo_bh_mutex);
-	loop_put_buffer(bh);
+		wake_up_interruptible(&lo->lo_bh_wait);
 out:
 	buffer_IO_error(rbh);
 	return 0;
@@ -530,30 +1178,6 @@
 	goto out;
 }
 
-static inline void loop_handle_bh(struct loop_device *lo,struct buffer_head *bh)
-{
-	int ret;
-
-	/*
-	 * For block backed loop, we know this is a READ
-	 */
-	if (lo->lo_flags & LO_FLAGS_DO_BMAP) {
-		int rw = !!test_and_clear_bit(BH_Dirty, &bh->b_state);
-
-		ret = do_bh_filebacked(lo, bh, rw);
-		bh->b_end_io(bh, !ret);
-	} else {
-		struct buffer_head *rbh = bh->b_private;
-		unsigned long IV = loop_get_iv(lo, rbh->b_rsector);
-
-		ret = lo_do_transfer(lo, READ, bh->b_data, rbh->b_data,
-				     bh->b_size, IV);
-
-		rbh->b_end_io(rbh, !ret);
-		loop_put_buffer(bh);
-	}
-}
-
 /*
  * worker thread that handles reads/writes to file backed loop devices,
  * to avoid blocking in our make_request_fn. it also does loop decrypting
@@ -563,8 +1187,20 @@
 static int loop_thread(void *data)
 {
 	struct loop_device *lo = data;
-	struct buffer_head *bh;
+	struct buffer_head *bh, *xbh;
+	int x, rw, qi = 0, flushcnt = 0;
+	wait_queue_t waitq;
+	que_look_up_table qt[4] = {
+		{ &lo->lo_bh_que0, &lo->lo_bh_que1, &lo->lo_bh_que2, 0, 1, 2 },
+		{ &lo->lo_bh_que2, &lo->lo_bh_que0, &lo->lo_bh_que1, 2, 0, 1 },
+		{ &lo->lo_bh_que0, &lo->lo_bh_que2, &lo->lo_bh_que1, 0, 2, 1 },
+		{ &lo->lo_bh_que1, &lo->lo_bh_que0, &lo->lo_bh_que2, 1, 0, 2 }
+	};
+	char *md;
+	static const struct rlimit loop_rlim_defaults[RLIM_NLIMITS] = INIT_RLIMITS;
 
+	init_waitqueue_entry(&waitq, current);
+	memcpy(&current->rlim[0], &loop_rlim_defaults[0], sizeof(current->rlim));
 	daemonize();
 	exit_files(current);
 	reparent_to_init();
@@ -576,12 +1212,30 @@
 	flush_signals(current);
 	spin_unlock_irq(&current->sigmask_lock);
 
+	if (lo_nice > 0)
+		lo_nice = 0;
+	if (lo_nice < -20)
+		lo_nice = -20;
+#if defined(DEF_NICE) && defined(DEF_COUNTER)
+	/* old scheduler syntax */
+	current->policy = SCHED_OTHER;
+	current->nice = lo_nice;
+#else
+	/* O(1) scheduler syntax */
+	set_user_nice(current, lo_nice);
+#endif
+
 	spin_lock_irq(&lo->lo_lock);
 	lo->lo_state = Lo_bound;
 	atomic_inc(&lo->lo_pending);
 	spin_unlock_irq(&lo->lo_lock);
 
 	current->flags |= PF_NOIO;
+#if defined(PF_NOFREEZE)
+	current->flags |= PF_NOFREEZE;
+#elif defined(PF_IOTHREAD)
+	current->flags |= PF_IOTHREAD;
+#endif
 
 	/*
 	 * up sem, we are running
@@ -589,23 +1243,120 @@
 	up(&lo->lo_sem);
 
 	for (;;) {
-		down_interruptible(&lo->lo_bh_mutex);
+		add_wait_queue(&lo->lo_bh_wait, &waitq);
+		for (;;) {
+			set_current_state(TASK_INTERRUPTIBLE);
+			if (!atomic_read(&lo->lo_pending))
+				break;
+
+			x = 0;
+			spin_lock_irq(&lo->lo_lock);
+#if CONFIG_BLK_DEV_LOOP_KEYSCRUB
+			if(lo->lo_keyscrub_fn) x = 1;
+#endif
+			if (lo->lo_bh_que0) {
+				x = 1;
+			} else if (lo->lo_bh_que1 || lo->lo_bh_que2) {
+				/* file backed works too because lo->lo_bh_need == 0 */
+				if (lo->lo_bh_free || !lo->lo_bh_need)
+					x = 1;
+			}
+			spin_unlock_irq(&lo->lo_lock);
+			if (x)
+				break;
+
+			schedule();
+		}
+		set_current_state(TASK_RUNNING);
+		remove_wait_queue(&lo->lo_bh_wait, &waitq);
+
+#if CONFIG_BLK_DEV_LOOP_KEYSCRUB
+		if(lo->lo_keyscrub_fn) {
+			(*lo->lo_keyscrub_fn)(lo->lo_keyscrub_ptr);
+			lo->lo_keyscrub_fn = 0;
+		}
+#endif
 		/*
-		 * could be upped because of tear-down, not because of
+		 * could be woken because of tear-down, not because of
 		 * pending work
 		 */
 		if (!atomic_read(&lo->lo_pending))
 			break;
 
-		bh = loop_get_bh(lo);
-		if (!bh) {
-			printk("loop: missing bh\n");
+		/*
+		 * read queues using alternating order to prevent starvation
+		 */
+		bh = loop_get_bh(lo, &x, &qt[++qi & 3]);
+		if (!bh)
+			continue;
+
+		/*
+		 *  x  list tag        usage(buffer-allocated)
+		 * --- --------------  -----------------------
+		 *  0  lo->lo_bh_que0  dev-read(y) / file-read
+		 *  1  lo->lo_bh_que1  dev-write(n) / file-write
+		 *  2  lo->lo_bh_que2  dev-read(n)
+		 */
+		rw = (x == 1) ? WRITE : READ;
+		if ((x >= 1) && !(lo->lo_flags & LO_FLAGS_DO_BMAP)) {
+			/* loop_make_request didn't allocate a buffer, do that now */
+			xbh = loop_get_buffer(lo, bh, 1, rw);
+			if (!xbh) {
+				run_task_queue(&tq_disk);
+				flushcnt = 0;
+				loop_add_queue_first(lo, bh, (rw == WRITE) ? &lo->lo_bh_que1 : &lo->lo_bh_que2);
+				/* lo->lo_bh_need should be 1 now, go back to sleep */
+				continue;
+			}
+			if (rw == WRITE) {
+				int trv;
+				md = bh_kmap(bh);
+				trv = lo_do_transfer(lo, WRITE, xbh->b_data, md, xbh->b_size, xbh->b_rsector - lo->lo_iv_remove);
+				bh_kunmap(bh);
+				if (trv) {
+					loop_put_buffer(lo, xbh);
+					buffer_IO_error(bh);
+					atomic_dec(&lo->lo_pending);
+					continue;
+				}
+			}
+			generic_make_request(rw, xbh);
+
+			/* start I/O if there are no more requests lacking buffers */
+			x = 0;
+			spin_lock_irq(&lo->lo_lock);
+			if (!lo->lo_bh_que1 && !lo->lo_bh_que2)
+				x = 1;
+			spin_unlock_irq(&lo->lo_lock);
+			if (x || (++flushcnt >= lo->lo_bh_flsh)) {
+				run_task_queue(&tq_disk);
+				flushcnt = 0;
+			}
+
+			/* request not completely processed yet */
 			continue;
 		}
-		loop_handle_bh(lo, bh);
+		if (lo->lo_flags & LO_FLAGS_DO_BMAP) {
+			/* request is for file backed device */
+			x = do_bh_filebacked(lo, bh, rw);
+			bh->b_reqnext = NULL;
+			bh->b_end_io(bh, !x);
+		} else {
+			/* device backed read has completed, do decrypt now */
+			xbh = bh->b_private;
+			/* must not use bh->b_rsector as IV, as it may be modified by LVM at this point */
+			/* instead, recompute IV from original request */
+			md = bh_kmap(xbh);
+			x = lo_do_transfer(lo, READ, bh->b_data, md, bh->b_size, xbh->b_rsector + lo->lo_offs_sec - lo->lo_iv_remove);
+			flush_dcache_page(xbh->b_page);
+			bh_kunmap(xbh);
+			xbh->b_reqnext = NULL;
+			xbh->b_end_io(xbh, !x);
+			loop_put_buffer(lo, bh);
+		}
 
 		/*
-		 * upped both for pending work and tear-down, lo_pending
+		 * woken both for pending work and tear-down, lo_pending
 		 * will hit zero then
 		 */
 		if (atomic_dec_and_test(&lo->lo_pending))
@@ -616,15 +1367,34 @@
 	return 0;
 }
 
+static void loop_set_softblksz(struct loop_device *lo, kdev_t dev)
+{
+	int bs = 0, x;	/* bs: block size to set on dev, x: loop size from loop_sizes[] */
+
+	if (blksize_size[MAJOR(lo->lo_device)])	/* start from the backing device's block size, if known */
+		bs = blksize_size[MAJOR(lo->lo_device)][MINOR(lo->lo_device)];
+	if (!bs)
+		bs = BLOCK_SIZE;	/* fall back when no size is registered */
+	if (lo->lo_flags & LO_FLAGS_DO_BMAP) {	/* flag set by loop_set_fd for regular-file backing */
+		x = loop_sizes[lo->lo_number];	/* stored by figure_loop_size after size >>= BLOCK_SIZE_BITS */
+		if ((bs == 8192) && (x & 7))	/* size not a multiple of 8 blocks: step down */
+			bs = 4096;
+		if ((bs == 4096) && (x & 3))	/* size not a multiple of 4 blocks: step down */
+			bs = 2048;
+		if ((bs == 2048) && (x & 1))	/* odd number of blocks: step down */
+			bs = 1024;
+	}
+	set_blocksize(dev, bs);
+}
+
 static int loop_set_fd(struct loop_device *lo, struct file *lo_file, kdev_t dev,
 		       unsigned int arg)
 {
 	struct file	*file;
 	struct inode	*inode;
 	kdev_t		lo_device;
-	int		lo_flags = 0;
+	int		lo_flags = 0, hardsz = 512;
 	int		error;
-	int		bs;
 
 	MOD_INC_USE_COUNT;
 
@@ -643,33 +1413,49 @@
 	if (!(file->f_mode & FMODE_WRITE))
 		lo_flags |= LO_FLAGS_READ_ONLY;
 
+#if CONFIG_BLK_DEV_LOOP_KEYSCRUB
+	lo->lo_keyscrub_fn = 0;
+#endif
+	lo->lo_offset = lo->lo_sizelimit = 0;
+	lo->lo_offs_sec = lo->lo_iv_remove = 0;
+	lo->lo_bh_free = lo->lo_bh_que2 = lo->lo_bh_que1 = lo->lo_bh_que0 = NULL;
+	lo->lo_bh_need = lo->lo_bh_flsh = 0;
+	init_waitqueue_head(&lo->lo_bh_wait);
 	if (S_ISBLK(inode->i_mode)) {
 		lo_device = inode->i_rdev;
 		if (lo_device == dev) {
 			error = -EBUSY;
 			goto out_putf;
 		}
+		if (loop_prealloc_init(lo, 0)) {
+			error = -ENOMEM;
+			goto out_putf;
+		}
+		hardsz = get_hardsect_size(lo_device);
 	} else if (S_ISREG(inode->i_mode)) {
-		struct address_space_operations *aops = inode->i_mapping->a_ops;
 		/*
 		 * If we can't read - sorry. If we only can't write - well,
 		 * it's going to be read-only.
 		 */
-		if (!aops->readpage)
+		if (!file->f_op || !file->f_op->read)
 			goto out_putf;
 
-		if (!aops->prepare_write || !aops->commit_write)
+		if (!file->f_op->write)
 			lo_flags |= LO_FLAGS_READ_ONLY;
 
 		lo_device = inode->i_dev;
 		lo_flags |= LO_FLAGS_DO_BMAP;
+		if (loop_prealloc_init(lo, 1)) {
+			error = -ENOMEM;
+			goto out_putf;
+		}
 		error = 0;
 	} else
 		goto out_putf;
 
 	get_file(file);
 
-	if (IS_RDONLY (inode) || is_read_only(lo_device)
+	if ((S_ISREG(inode->i_mode) && IS_RDONLY(inode)) || is_read_only(lo_device)
 	    || !(lo_file->f_mode & FMODE_WRITE))
 		lo_flags |= LO_FLAGS_READ_ONLY;
 
@@ -677,28 +1463,40 @@
 
 	lo->lo_device = lo_device;
 	lo->lo_flags = lo_flags;
+	if(lo_flags & LO_FLAGS_READ_ONLY)
+		lo->lo_flags |= 0x200000; /* export to user space */
 	lo->lo_backing_file = file;
 	lo->transfer = NULL;
 	lo->ioctl = NULL;
-	figure_loop_size(lo);
-	lo->old_gfp_mask = inode->i_mapping->gfp_mask;
-	inode->i_mapping->gfp_mask &= ~(__GFP_IO|__GFP_FS);
-
-	bs = 0;
-	if (blksize_size[MAJOR(lo_device)])
-		bs = blksize_size[MAJOR(lo_device)][MINOR(lo_device)];
-	if (!bs)
-		bs = BLOCK_SIZE;
+	if (figure_loop_size(lo)) {
+		error = -EFBIG;
+		goto out_cleanup;
+	}
 
-	set_blocksize(dev, bs);
+	if (lo_flags & LO_FLAGS_DO_BMAP) {
+		lo->old_gfp_mask = inode->i_mapping->gfp_mask;
+		inode->i_mapping->gfp_mask &= ~(__GFP_IO|__GFP_FS);
+		inode->i_mapping->gfp_mask |= __GFP_HIGH;
+	} else {
+		lo->old_gfp_mask = -1;
+	}
 
-	lo->lo_bh = lo->lo_bhtail = NULL;
-	kernel_thread(loop_thread, lo, CLONE_FS | CLONE_FILES | CLONE_SIGHAND);
-	down(&lo->lo_sem);
+	loop_hardsizes[MINOR(dev)] = hardsz;
+	loop_set_softblksz(lo, dev);
 
+	error = kernel_thread(loop_thread, lo, CLONE_FS | CLONE_FILES | CLONE_SIGHAND);
+	if(error < 0)
+		goto out_mapping;
+	down(&lo->lo_sem);
 	fput(file);
 	return 0;
 
+ out_mapping:
+	if(lo->old_gfp_mask != -1)
+		inode->i_mapping->gfp_mask = lo->old_gfp_mask;
+ out_cleanup:
+	loop_prealloc_cleanup(lo);
+	fput(file);
  out_putf:
 	fput(file);
  out:
@@ -708,13 +1506,14 @@
 
 static int loop_release_xfer(struct loop_device *lo)
 {
-	int err = 0; 
+	int err = 0;
 	if (lo->lo_encrypt_type) {
-		struct loop_func_table *xfer= xfer_funcs[lo->lo_encrypt_type]; 
+		struct loop_func_table *xfer= xfer_funcs[lo->lo_encrypt_type];
+		lo->transfer = NULL;
 		if (xfer && xfer->release)
-			err = xfer->release(lo); 
+			err = xfer->release(lo);
 		if (xfer && xfer->unlock)
-			xfer->unlock(lo); 
+			xfer->unlock(lo);
 		lo->lo_encrypt_type = 0;
 	}
 	return err;
@@ -722,19 +1521,19 @@
 
 static int loop_init_xfer(struct loop_device *lo, int type,struct loop_info *i)
 {
-	int err = 0; 
+	int err = 0;
 	if (type) {
-		struct loop_func_table *xfer = xfer_funcs[type]; 
+		struct loop_func_table *xfer = xfer_funcs[type];
 		if (xfer->init)
 			err = xfer->init(lo, i);
-		if (!err) { 
+		if (!err) {
 			lo->lo_encrypt_type = type;
 			if (xfer->lock)
 				xfer->lock(lo);
 		}
 	}
 	return err;
-}  
+}
 
 static int loop_clr_fd(struct loop_device *lo, struct block_device *bdev)
 {
@@ -751,11 +1550,12 @@
 	spin_lock_irq(&lo->lo_lock);
 	lo->lo_state = Lo_rundown;
 	if (atomic_dec_and_test(&lo->lo_pending))
-		up(&lo->lo_bh_mutex);
+		wake_up_interruptible(&lo->lo_bh_wait);
 	spin_unlock_irq(&lo->lo_lock);
 
 	down(&lo->lo_sem);
 
+	loop_prealloc_cleanup(lo);
 	lo->lo_backing_file = NULL;
 
 	loop_release_xfer(lo);
@@ -763,87 +1563,219 @@
 	lo->ioctl = NULL;
 	lo->lo_device = 0;
 	lo->lo_encrypt_type = 0;
-	lo->lo_offset = 0;
+#if CONFIG_BLK_DEV_LOOP_KEYSCRUB
+	lo->lo_keyscrub_fn = 0;
+#endif
+	lo->lo_offset = lo->lo_sizelimit = 0;
+	lo->lo_offs_sec = lo->lo_iv_remove = 0;
 	lo->lo_encrypt_key_size = 0;
 	lo->lo_flags = 0;
 	memset(lo->lo_encrypt_key, 0, LO_KEY_SIZE);
 	memset(lo->lo_name, 0, LO_NAME_SIZE);
+	memset(lo->lo_crypt_name, 0, LO_NAME_SIZE);
 	loop_sizes[lo->lo_number] = 0;
 	invalidate_bdev(bdev, 0);
-	filp->f_dentry->d_inode->i_mapping->gfp_mask = gfp;
+	if (gfp != -1)
+		filp->f_dentry->d_inode->i_mapping->gfp_mask = gfp;
 	lo->lo_state = Lo_unbound;
 	fput(filp);
 	MOD_DEC_USE_COUNT;
 	return 0;
 }
 
-static int loop_set_status(struct loop_device *lo, struct loop_info *arg)
+static void	/* fill *info64 from the old-style *info */
+loop_info64_from_old(const struct loop_info *info, struct loop_info64 *info64)
+{
+	memset(info64, 0, sizeof(*info64));	/* fields not copied below (e.g. lo_sizelimit) stay zero */
+	info64->lo_number = info->lo_number;
+	info64->lo_device = info->lo_device;
+	info64->lo_inode = info->lo_inode;
+	info64->lo_rdevice = info->lo_rdevice;
+	info64->lo_offset = info->lo_offset;
+	info64->lo_encrypt_type = info->lo_encrypt_type;
+	info64->lo_encrypt_key_size = info->lo_encrypt_key_size;
+	info64->lo_flags = info->lo_flags;
+	info64->lo_init[0] = info->lo_init[0];
+	info64->lo_init[1] = info->lo_init[1];
+	if (info->lo_encrypt_type == 18) /* LO_CRYPT_CRYPTOAPI */
+		memcpy(info64->lo_crypt_name, info->lo_name, LO_NAME_SIZE);	/* for this type lo_name is copied as the crypt name */
+	else
+		memcpy(info64->lo_file_name, info->lo_name, LO_NAME_SIZE);	/* otherwise lo_name is copied as the file name */
+	memcpy(info64->lo_encrypt_key, info->lo_encrypt_key, LO_KEY_SIZE);
+}
+
+static int	/* 0 on success, -EOVERFLOW when *info cannot represent *info64 */
+loop_info64_to_old(struct loop_info64 *info64, struct loop_info *info)
+{
+	memset(info, 0, sizeof(*info));
+	info->lo_number = info64->lo_number;
+	info->lo_device = info64->lo_device;
+	info->lo_inode = info64->lo_inode;
+	info->lo_rdevice = info64->lo_rdevice;
+	info->lo_offset = info64->lo_offset;
+	info->lo_encrypt_type = info64->lo_encrypt_type;
+	info->lo_encrypt_key_size = info64->lo_encrypt_key_size;
+	info->lo_flags = info64->lo_flags;
+	info->lo_init[0] = info64->lo_init[0];
+	info->lo_init[1] = info64->lo_init[1];
+	if (info->lo_encrypt_type == 18) /* LO_CRYPT_CRYPTOAPI */
+		memcpy(info->lo_name, info64->lo_crypt_name, LO_NAME_SIZE);	/* tests the type value just copied into *info */
+	else
+		memcpy(info->lo_name, info64->lo_file_name, LO_NAME_SIZE);
+	memcpy(info->lo_encrypt_key, info64->lo_encrypt_key, LO_KEY_SIZE);
+
+	/* error in case values were truncated; *info has been filled in either way */
+	if (info->lo_device != info64->lo_device ||
+	    info->lo_rdevice != info64->lo_rdevice ||
+	    info->lo_inode != info64->lo_inode ||
+	    info->lo_offset != info64->lo_offset ||
+	    info64->lo_sizelimit)	/* lo_sizelimit is not copied into *info, so nonzero counts as lost */
+		return -EOVERFLOW;
+
+	return 0;
+}
+
+static int loop_set_status(struct loop_device *lo, kdev_t dev, struct loop_info64 *info, struct loop_info *oldinfo)
 {
-	struct loop_info info; 
 	int err;
 	unsigned int type;
 
-	if (lo->lo_encrypt_key_size && lo->lo_key_owner != current->uid && 
+	if (lo->lo_encrypt_key_size && lo->lo_key_owner != current->uid &&
 	    !capable(CAP_SYS_ADMIN))
 		return -EPERM;
 	if (lo->lo_state != Lo_bound)
 		return -ENXIO;
-	if (copy_from_user(&info, arg, sizeof (struct loop_info)))
-		return -EFAULT; 
-	if ((unsigned int) info.lo_encrypt_key_size > LO_KEY_SIZE)
+	if ((unsigned int) info->lo_encrypt_key_size > LO_KEY_SIZE)
 		return -EINVAL;
-	type = info.lo_encrypt_type; 
+	type = info->lo_encrypt_type;
 	if (type >= MAX_LO_CRYPT || xfer_funcs[type] == NULL)
 		return -EINVAL;
-	if (type == LO_CRYPT_XOR && info.lo_encrypt_key_size == 0)
+	if (type == LO_CRYPT_XOR && info->lo_encrypt_key_size == 0)
 		return -EINVAL;
 	err = loop_release_xfer(lo);
-	if (!err) 
-		err = loop_init_xfer(lo, type, &info);
 	if (err)
 		return err;	
 
-	lo->lo_offset = info.lo_offset;
-	strncpy(lo->lo_name, info.lo_name, LO_NAME_SIZE);
+	if ((loff_t)info->lo_offset < 0) {
+		/* negative offset == remove offset from IV computations */
+		lo->lo_offset = -(info->lo_offset);
+		lo->lo_iv_remove = lo->lo_offset >> 9;
+	} else {
+		/* positive offset == include offset in IV computations */
+		lo->lo_offset = info->lo_offset;
+		lo->lo_iv_remove = 0;
+	}
+	lo->lo_offs_sec = lo->lo_offset >> 9;
+	lo->lo_sizelimit = info->lo_sizelimit;
+	err = figure_loop_size(lo);
+	if (err)
+		return err;
+	loop_set_softblksz(lo, dev);
 
+	/* transfer init function for 2.4 kernels takes old style struct */
+	err = loop_init_xfer(lo, type, oldinfo);
+	/* copy key -- just in case transfer init func modified it */
+	memcpy(info->lo_encrypt_key, oldinfo->lo_encrypt_key, sizeof(info->lo_encrypt_key));
+	if (err)
+		return err;
+
+	strncpy(lo->lo_name, info->lo_file_name, LO_NAME_SIZE);
+	strncpy(lo->lo_crypt_name, info->lo_crypt_name, LO_NAME_SIZE);
 	lo->transfer = xfer_funcs[type]->transfer;
 	lo->ioctl = xfer_funcs[type]->ioctl;
-	lo->lo_encrypt_key_size = info.lo_encrypt_key_size;
-	lo->lo_init[0] = info.lo_init[0];
-	lo->lo_init[1] = info.lo_init[1];
-	if (info.lo_encrypt_key_size) {
-		memcpy(lo->lo_encrypt_key, info.lo_encrypt_key, 
-		       info.lo_encrypt_key_size);
-		lo->lo_key_owner = current->uid; 
-	}	
-	figure_loop_size(lo);
+	lo->lo_encrypt_key_size = info->lo_encrypt_key_size;
+	lo->lo_init[0] = info->lo_init[0];
+	lo->lo_init[1] = info->lo_init[1];
+	if (info->lo_encrypt_key_size) {
+		memcpy(lo->lo_encrypt_key, info->lo_encrypt_key,
+		       info->lo_encrypt_key_size);
+		lo->lo_key_owner = current->uid;
+	}
+
 	return 0;
 }
 
-static int loop_get_status(struct loop_device *lo, struct loop_info *arg)
+static int loop_get_status(struct loop_device *lo, struct loop_info64 *info)
 {
-	struct loop_info	info;
 	struct file *file = lo->lo_backing_file;
 
 	if (lo->lo_state != Lo_bound)
 		return -ENXIO;
-	if (!arg)
-		return -EINVAL;
-	memset(&info, 0, sizeof(info));
-	info.lo_number = lo->lo_number;
-	info.lo_device = kdev_t_to_nr(file->f_dentry->d_inode->i_dev);
-	info.lo_inode = file->f_dentry->d_inode->i_ino;
-	info.lo_rdevice = kdev_t_to_nr(lo->lo_device);
-	info.lo_offset = lo->lo_offset;
-	info.lo_flags = lo->lo_flags;
-	strncpy(info.lo_name, lo->lo_name, LO_NAME_SIZE);
-	info.lo_encrypt_type = lo->lo_encrypt_type;
+	memset(info, 0, sizeof(*info));
+	info->lo_number = lo->lo_number;
+	info->lo_device = kdev_t_to_nr(file->f_dentry->d_inode->i_dev);
+	info->lo_inode = file->f_dentry->d_inode->i_ino;
+	info->lo_rdevice = kdev_t_to_nr(lo->lo_device);
+	info->lo_offset = lo->lo_iv_remove ? -(lo->lo_offset) : lo->lo_offset;
+	info->lo_sizelimit = lo->lo_sizelimit;
+	info->lo_flags = lo->lo_flags;
+	strncpy(info->lo_file_name, lo->lo_name, LO_NAME_SIZE);
+	strncpy(info->lo_crypt_name, lo->lo_crypt_name, LO_NAME_SIZE);
+	info->lo_encrypt_type = lo->lo_encrypt_type;
 	if (lo->lo_encrypt_key_size && capable(CAP_SYS_ADMIN)) {
-		info.lo_encrypt_key_size = lo->lo_encrypt_key_size;
-		memcpy(info.lo_encrypt_key, lo->lo_encrypt_key,
+		info->lo_encrypt_key_size = lo->lo_encrypt_key_size;
+		memcpy(info->lo_encrypt_key, lo->lo_encrypt_key,
 		       lo->lo_encrypt_key_size);
+		info->lo_init[0] = lo->lo_init[0];
+		info->lo_init[1] = lo->lo_init[1];
 	}
-	return copy_to_user(arg, &info, sizeof(info)) ? -EFAULT : 0;
+	return 0;
+}
+
+static int	/* -EFAULT on a failed user copy, else the result of loop_set_status() */
+loop_set_status_n(struct loop_device *lo, kdev_t dev, void *arg, int n)
+{
+	struct loop_info info;	/* old-style copy, handed on to loop_set_status() as oldinfo */
+	struct loop_info64 info64;	/* 64-bit copy, handed on to loop_set_status() as info */
+	int err;
+
+	if (n) {	/* n != 0: arg points to a struct loop_info64 (LOOP_SET_STATUS64) */
+		if (copy_from_user(&info64, arg, sizeof (struct loop_info64)))
+			return -EFAULT;
+		/* truncation errors can be ignored here as transfer init func only wants key bits */
+		loop_info64_to_old(&info64, &info);
+	} else {	/* n == 0: arg points to a struct loop_info (LOOP_SET_STATUS) */
+		if (copy_from_user(&info, arg, sizeof (struct loop_info)))
+			return -EFAULT;
+		loop_info64_from_old(&info, &info64);
+	}
+	err = loop_set_status(lo, dev, &info64, &info);
+	memset(&info.lo_encrypt_key[0], 0, sizeof(info.lo_encrypt_key));	/* clear both stack copies of the key */
+	memset(&info64.lo_encrypt_key[0], 0, sizeof(info64.lo_encrypt_key));
+	return err;
+}
+
+static int	/* 0, -EINVAL (NULL arg), -EFAULT, or an error from the helpers called below */
+loop_get_status_old(struct loop_device *lo, struct loop_info *arg) {
+	struct loop_info info;
+	struct loop_info64 info64;
+	int err;
+
+	err = arg ? loop_get_status(lo, &info64) : -EINVAL;
+	if (!err)
+		err = loop_info64_to_old(&info64, &info);
+	if (!err && copy_to_user(arg, &info, sizeof(info)))
+		err = -EFAULT;
+
+	/* both stack copies may hold key bits; loop_set_status_n clears its key copies too */
+	memset(&info, 0, sizeof(info));
+	memset(&info64, 0, sizeof(info64));
+	return err;
+}
+
+static int	/* 0, -EINVAL (NULL arg), -EFAULT, or the error from loop_get_status() */
+loop_get_status64(struct loop_device *lo, struct loop_info64 *arg) {
+	struct loop_info64 info64;
+	int err = 0;
+
+	if (!arg)
+		err = -EINVAL;
+	if (!err)
+		err = loop_get_status(lo, &info64);
+	if (!err && copy_to_user(arg, &info64, sizeof(info64)))
+		err = -EFAULT;
+	memset(&info64, 0, sizeof(info64));	/* the stack copy may hold key bits: scrub before return */
+	return err;
 }
 
 static int lo_ioctl(struct inode * inode, struct file * file,
@@ -872,10 +1804,16 @@
 		err = loop_clr_fd(lo, inode->i_bdev);
 		break;
 	case LOOP_SET_STATUS:
-		err = loop_set_status(lo, (struct loop_info *) arg);
+		err = loop_set_status_n(lo, inode->i_rdev, (void *) arg, 0);
 		break;
 	case LOOP_GET_STATUS:
-		err = loop_get_status(lo, (struct loop_info *) arg);
+		err = loop_get_status_old(lo, (struct loop_info *) arg);
+		break;
+	case LOOP_SET_STATUS64:
+		err = loop_set_status_n(lo, inode->i_rdev, (void *) arg, 1);
+		break;
+	case LOOP_GET_STATUS64:
+		err = loop_get_status64(lo, (struct loop_info64 *) arg);
 		break;
 	case BLKGETSIZE:
 		if (lo->lo_state != Lo_bound) {
@@ -894,6 +1832,8 @@
 	case BLKBSZGET:
 	case BLKBSZSET:
 	case BLKSSZGET:
+	case BLKROGET:
+	case BLKROSET:
 		err = blk_ioctl(inode->i_rdev, cmd, arg);
 		break;
 	default:
@@ -906,7 +1846,7 @@
 static int lo_open(struct inode *inode, struct file *file)
 {
 	struct loop_device *lo;
-	int	dev, type;
+	int	dev;
 
 	if (!inode)
 		return -EINVAL;
@@ -921,10 +1861,6 @@
 	lo = &loop_dev[dev];
 	MOD_INC_USE_COUNT;
 	down(&lo->lo_ctl_mutex);
-
-	type = lo->lo_encrypt_type; 
-	if (type && xfer_funcs[type] && xfer_funcs[type]->lock)
-		xfer_funcs[type]->lock(lo);
 	lo->lo_refcnt++;
 	up(&lo->lo_ctl_mutex);
 	return 0;
@@ -933,7 +1869,7 @@
 static int lo_release(struct inode *inode, struct file *file)
 {
 	struct loop_device *lo;
-	int	dev, type;
+	int	dev;
 
 	if (!inode)
 		return 0;
@@ -948,11 +1884,7 @@
 
 	lo = &loop_dev[dev];
 	down(&lo->lo_ctl_mutex);
-	type = lo->lo_encrypt_type;
 	--lo->lo_refcnt;
-	if (xfer_funcs[type] && xfer_funcs[type]->unlock)
-		xfer_funcs[type]->unlock(lo);
-
 	up(&lo->lo_ctl_mutex);
 	MOD_DEC_USE_COUNT;
 	return 0;
@@ -974,34 +1906,32 @@
 
 int loop_register_transfer(struct loop_func_table *funcs)
 {
-	if ((unsigned)funcs->number > MAX_LO_CRYPT || xfer_funcs[funcs->number])
+	if ((unsigned)funcs->number >= MAX_LO_CRYPT || xfer_funcs[funcs->number])	/* index out of range, or slot already taken */
 		return -EINVAL;
 	xfer_funcs[funcs->number] = funcs;
-	return 0; 
+	return 0;
 }
 
 int loop_unregister_transfer(int number)
 {
-	struct loop_device *lo; 
+	struct loop_device *lo;
 
 	if ((unsigned)number >= MAX_LO_CRYPT)
-		return -EINVAL; 
-	for (lo = &loop_dev[0]; lo < &loop_dev[max_loop]; lo++) { 
+		return -EINVAL;
+	for (lo = &loop_dev[0]; lo < &loop_dev[max_loop]; lo++) {	/* scan all max_loop devices */
 		int type = lo->lo_encrypt_type;
-		if (type == number) { 
-			xfer_funcs[type]->release(lo);
-			lo->transfer = NULL; 
-			lo->lo_encrypt_type = 0; 
+		if (type == number) {
+			loop_release_xfer(lo);	/* NOTE(review): presumably does the release()/reset that the removed lines did -- confirm */
 		}
 	}
-	xfer_funcs[number] = NULL; 
-	return 0; 
+	xfer_funcs[number] = NULL;	/* slot can be registered again */
+	return 0;
 }
 
 EXPORT_SYMBOL(loop_register_transfer);
 EXPORT_SYMBOL(loop_unregister_transfer);
 
-int __init loop_init(void) 
+int __init loop_init(void)
 {
 	int	i;
 
@@ -1017,10 +1947,9 @@
 		return -EIO;
 	}
 
-
 	loop_dev = kmalloc(max_loop * sizeof(struct loop_device), GFP_KERNEL);
 	if (!loop_dev)
-		return -ENOMEM;
+		goto out_dev;
 
 	loop_sizes = kmalloc(max_loop * sizeof(int), GFP_KERNEL);
 	if (!loop_sizes)
@@ -1030,6 +1959,10 @@
 	if (!loop_blksizes)
 		goto out_blksizes;
 
+	loop_hardsizes = kmalloc(max_loop * sizeof(int), GFP_KERNEL);
+	if (!loop_hardsizes)
+		goto out_hardsizes;
+
 	blk_queue_make_request(BLK_DEFAULT_QUEUE(MAJOR_NR), loop_make_request);
 
 	for (i = 0; i < max_loop; i++) {
@@ -1037,45 +1970,86 @@
 		memset(lo, 0, sizeof(struct loop_device));
 		init_MUTEX(&lo->lo_ctl_mutex);
 		init_MUTEX_LOCKED(&lo->lo_sem);
-		init_MUTEX_LOCKED(&lo->lo_bh_mutex);
 		lo->lo_number = i;
 		spin_lock_init(&lo->lo_lock);
 	}
 
 	memset(loop_sizes, 0, max_loop * sizeof(int));
 	memset(loop_blksizes, 0, max_loop * sizeof(int));
+	memset(loop_hardsizes, 0, max_loop * sizeof(int));
 	blk_size[MAJOR_NR] = loop_sizes;
 	blksize_size[MAJOR_NR] = loop_blksizes;
+	hardsect_size[MAJOR_NR] = loop_hardsizes;
 	for (i = 0; i < max_loop; i++)
 		register_disk(NULL, MKDEV(MAJOR_NR, i), 1, &lo_fops, 0);
 
+	for (i = 0; i < (sizeof(lo_prealloc) / sizeof(int)); i += 2) {
+		if (!lo_prealloc[i])
+			continue;
+		if (lo_prealloc[i] < LO_PREALLOC_MIN)
+			lo_prealloc[i] = LO_PREALLOC_MIN;
+		if (lo_prealloc[i] > LO_PREALLOC_MAX)
+			lo_prealloc[i] = LO_PREALLOC_MAX;
+	}
+
+#if defined(IOCTL32_COMPATIBLE_PTR)
+	lock_kernel();
+	register_ioctl32_conversion(LOOP_SET_STATUS64, IOCTL32_COMPATIBLE_PTR);
+	register_ioctl32_conversion(LOOP_GET_STATUS64, IOCTL32_COMPATIBLE_PTR);
+	register_ioctl32_conversion(LOOP_MULTI_KEY_SETUP, IOCTL32_COMPATIBLE_PTR);
+	register_ioctl32_conversion(LOOP_MULTI_KEY_SETUP_V3, IOCTL32_COMPATIBLE_PTR);
+	unlock_kernel();
+#endif
+
 	devfs_handle = devfs_mk_dir(NULL, "loop", NULL);
 	devfs_register_series(devfs_handle, "%u", max_loop, DEVFS_FL_DEFAULT,
 			      MAJOR_NR, 0,
 			      S_IFBLK | S_IRUSR | S_IWUSR | S_IRGRP,
 			      &lo_fops, NULL);
 
+#if CONFIG_BLK_DEV_LOOP_AES
+#if CONFIG_BLK_DEV_LOOP_KEYSCRUB
+	printk(KERN_INFO "loop: AES key scrubbing enabled\n");
+#endif
+#endif
 	printk(KERN_INFO "loop: loaded (max %d devices)\n", max_loop);
 	return 0;
 
+out_hardsizes:
+	kfree(loop_blksizes);
 out_blksizes:
 	kfree(loop_sizes);
 out_sizes:
 	kfree(loop_dev);
+out_dev:
 	if (devfs_unregister_blkdev(MAJOR_NR, "loop"))
 		printk(KERN_WARNING "loop: cannot unregister blkdev\n");
 	printk(KERN_ERR "loop: ran out of memory\n");
 	return -ENOMEM;
 }
 
-void loop_exit(void) 
+void loop_exit(void)	/* tear down what loop_init() registered */
 {
 	devfs_unregister(devfs_handle);
 	if (devfs_unregister_blkdev(MAJOR_NR, "loop"))
 		printk(KERN_WARNING "loop: cannot unregister blkdev\n");
+
+	blk_size[MAJOR_NR] = 0;		/* unhook the per-major tables before their arrays are freed below */
+	blksize_size[MAJOR_NR] = 0;
+	hardsect_size[MAJOR_NR] = 0;
 	kfree(loop_dev);
 	kfree(loop_sizes);
 	kfree(loop_blksizes);
+	kfree(loop_hardsizes);
+
+#if defined(IOCTL32_COMPATIBLE_PTR)	/* mirrors the registrations made in loop_init() */
+	lock_kernel();
+	unregister_ioctl32_conversion(LOOP_SET_STATUS64);
+	unregister_ioctl32_conversion(LOOP_GET_STATUS64);
+	unregister_ioctl32_conversion(LOOP_MULTI_KEY_SETUP);
+	unregister_ioctl32_conversion(LOOP_MULTI_KEY_SETUP_V3);
+	unlock_kernel();
+#endif /* IOCTL32_COMPATIBLE_PTR */
 }
 
 module_init(loop_init);
@@ -1089,4 +2063,15 @@
 }
 
 __setup("max_loop=", max_loop_setup);
+#endif
+
+#if CONFIG_BLK_DEV_LOOP_KEYSCRUB
+void loop_add_keyscrub_fn(struct loop_device *lo, void (*fn)(void *), void *ptr)
+{	/* record callback fn and its argument ptr on lo, then wake lo_bh_wait */
+    lo->lo_keyscrub_ptr = ptr;	/* argument is stored first ... */
+    wmb();	/* ... and this write barrier orders it before the fn store below */
+    lo->lo_keyscrub_fn = fn;	/* NOTE(review): presumably the consumer tests fn before using ptr -- confirm */
+    wake_up_interruptible(&lo->lo_bh_wait);	/* wake whatever sleeps on lo_bh_wait */
+}
+EXPORT_SYMBOL(loop_add_keyscrub_fn);
+#endif /* CONFIG_BLK_DEV_LOOP_KEYSCRUB */
diff -urN linux-2.4.28/drivers/misc/Makefile linux-2.4.28-loop-AES-v3.0b/drivers/misc/Makefile
--- linux-2.4.28/drivers/misc/Makefile	Sat Dec  1 18:27:13 2001
+++ linux-2.4.28-loop-AES-v3.0b/drivers/misc/Makefile	Sun Feb  6 18:45:39 2005
@@ -9,7 +9,34 @@
 # parent makes..
 #
 
+.S.o:
+	$(CC) $(AFLAGS) $(AFLAGS_$@) -c $< -o $*.o
+
 O_TARGET := misc.o
+
+ifeq ($(CONFIG_BLK_DEV_LOOP_AES),y)
+AES_X86_ASM=n
+ifeq ($(CONFIG_X86),y)
+ifneq ($(CONFIG_X86_64),y)
+  AES_X86_ASM=y
+endif # CONFIG_X86_64 != y
+endif # CONFIG_X86 == y
+ifeq ($(AES_X86_ASM),y)
+  export-objs += crypto-ksym.o
+  obj-y       += aes-x86.o md5-x86.o crypto-ksym.o
+  AFLAGS_aes-x86.o := -DUSE_UNDERLINE=1
+else # AES_X86_ASM != y
+ifeq ($(CONFIG_X86_64),y)
+  export-objs += crypto-ksym.o
+  obj-y       += aes-amd64.o md5-amd64.o crypto-ksym.o
+  AFLAGS_aes-amd64.o := -DUSE_UNDERLINE=1
+else # neither x86 variant: aes.o and md5.o are used instead of the assembler objects
+  export-objs += crypto-ksym.o
+  obj-y       += aes.o md5.o crypto-ksym.o
+  CFLAGS_aes.o := -DDATA_ALWAYS_ALIGNED=1
+endif # CONFIG_X86_64 == y
+endif # AES_X86_ASM == y
+endif # CONFIG_BLK_DEV_LOOP_AES == y
 
 include $(TOPDIR)/Rules.make
 
diff -urN linux-2.4.28/drivers/misc/aes-amd64.S linux-2.4.28-loop-AES-v3.0b/drivers/misc/aes-amd64.S
--- linux-2.4.28/drivers/misc/aes-amd64.S	Thu Jan  1 01:00:00 1970
+++ linux-2.4.28-loop-AES-v3.0b/drivers/misc/aes-amd64.S	Sun Feb  6 18:45:39 2005
@@ -0,0 +1,893 @@
+//
+// Copyright (c) 2001, Dr Brian Gladman <brg@gladman.uk.net>, Worcester, UK.
+// All rights reserved.
+//
+// TERMS
+//
+//  Redistribution and use in source and binary forms, with or without
+//  modification, are permitted subject to the following conditions:
+//
+//  1. Redistributions of source code must retain the above copyright
+//     notice, this list of conditions and the following disclaimer.
+//
+//  2. Redistributions in binary form must reproduce the above copyright
+//     notice, this list of conditions and the following disclaimer in the
+//     documentation and/or other materials provided with the distribution.
+//
+//  3. The copyright holder's name must not be used to endorse or promote
+//     any products derived from this software without his specific prior
+//     written permission.
+//
+//  This software is provided 'as is' with no express or implied warranties
+//  of correctness or fitness for purpose.
+
+// Modified by Jari Ruusu,  December 24 2001
+//  - Converted syntax to GNU CPP/assembler syntax
+//  - C programming interface converted back to "old" API
+//  - Minor portability cleanups and speed optimizations
+
+// Modified by Jari Ruusu,  April 11 2002
+//  - Added above copyright and terms to resulting object code so that
+//    binary distributions can avoid legal trouble
+
+// Modified by Jari Ruusu,  June 12 2004
+//  - Converted 32 bit x86 code to 64 bit AMD64 code
+//  - Re-wrote encrypt and decrypt code from scratch
+
+// An AES (Rijndael) implementation for the AMD64. This version only
+// implements the standard AES block length (128 bits, 16 bytes). This code
+// does not preserve the rax, rcx, rdx, rsi, rdi or r8-r11 registers or the
+// arithmetic status flags. However, the rbx, rbp and r12-r15 registers are
+// preserved across calls.
+
+// void aes_set_key(aes_context *cx, const unsigned char key[], const int key_len, const int f)
+// void aes_encrypt(const aes_context *cx, const unsigned char in_blk[], unsigned char out_blk[])
+// void aes_decrypt(const aes_context *cx, const unsigned char in_blk[], unsigned char out_blk[])
+
+#if defined(USE_UNDERLINE)
+# define aes_set_key _aes_set_key
+# define aes_encrypt _aes_encrypt
+# define aes_decrypt _aes_decrypt
+#endif
+#if !defined(ALIGN64BYTES)
+# define ALIGN64BYTES 64
+#endif
+
+	.file	"aes-amd64.S"
+	.globl	aes_set_key
+	.globl	aes_encrypt
+	.globl	aes_decrypt
+
+	.section .rodata
+copyright:
+	.ascii "    \000"
+	.ascii "Copyright (c) 2001, Dr Brian Gladman <brg@gladman.uk.net>, Worcester, UK.\000"
+	.ascii "All rights reserved.\000"
+	.ascii "    \000"
+	.ascii "TERMS\000"
+	.ascii "    \000"
+	.ascii " Redistribution and use in source and binary forms, with or without\000"
+	.ascii " modification, are permitted subject to the following conditions:\000"
+	.ascii "    \000"
+	.ascii " 1. Redistributions of source code must retain the above copyright\000"
+	.ascii "    notice, this list of conditions and the following disclaimer.\000"
+	.ascii "    \000"
+	.ascii " 2. Redistributions in binary form must reproduce the above copyright\000"
+	.ascii "    notice, this list of conditions and the following disclaimer in the\000"
+	.ascii "    documentation and/or other materials provided with the distribution.\000"
+	.ascii "    \000"
+	.ascii " 3. The copyright holder's name must not be used to endorse or promote\000"
+	.ascii "    any products derived from this software without his specific prior\000"
+	.ascii "    written permission.\000"
+	.ascii "    \000"
+	.ascii " This software is provided 'as is' with no express or implied warranties\000"
+	.ascii " of correctness or fitness for purpose.\000"
+	.ascii "    \000"
+
+#define tlen	1024	// length of each of 4 'xor' arrays (256 32-bit words)
+
+// offsets in context structure
+
+#define nkey	0	// key length, size 4
+#define nrnd	4	// number of rounds, size 4
+#define ekey	8	// encryption key schedule base address, size 256
+#define dkey	264	// decryption key schedule base address, size 256
+
+// One forward (encryption) round. p1 = lookup table, p2 = round key offset
+// from %rbp. Entered with the previous round column values in I1E-I4E (xB/xH/xR
+// name the same registers); these and rdi, rsi, r8, r13 are clobbered. Result: OU1-OU4.
+
+#define fwd_rnd(p1,p2,I1E,I1B,I1H,I2E,I2B,I2H,I3E,I3B,I3R,I4E,I4B,I4R,OU1,OU2,OU3,OU4) \
+	movl	p2(%rbp),OU1		;\
+	movl	p2+4(%rbp),OU2		;\
+	movl	p2+8(%rbp),OU3		;\
+	movl	p2+12(%rbp),OU4		;\
+	movzbl	I1B,%edi		;\
+	movzbl	I2B,%esi		;\
+	movzbl	I3B,%r8d		;\
+	movzbl	I4B,%r13d		;\
+	shrl	$8,I3E			;\
+	shrl	$8,I4E			;\
+	xorl	p1(,%rdi,4),OU1		;\
+	xorl	p1(,%rsi,4),OU2		;\
+	xorl	p1(,%r8,4),OU3		;\
+	xorl	p1(,%r13,4),OU4		;\
+	movzbl	I2H,%esi		;\
+	movzbl	I3B,%r8d		;\
+	movzbl	I4B,%r13d		;\
+	movzbl	I1H,%edi		;\
+	shrl	$8,I3E			;\
+	shrl	$8,I4E			;\
+	xorl	p1+tlen(,%rsi,4),OU1	;\
+	xorl	p1+tlen(,%r8,4),OU2	;\
+	xorl	p1+tlen(,%r13,4),OU3	;\
+	xorl	p1+tlen(,%rdi,4),OU4	;\
+	shrl	$16,I1E			;\
+	shrl	$16,I2E			;\
+	movzbl	I3B,%r8d		;\
+	movzbl	I4B,%r13d		;\
+	movzbl	I1B,%edi		;\
+	movzbl	I2B,%esi		;\
+	xorl	p1+2*tlen(,%r8,4),OU1	;\
+	xorl	p1+2*tlen(,%r13,4),OU2	;\
+	xorl	p1+2*tlen(,%rdi,4),OU3	;\
+	xorl	p1+2*tlen(,%rsi,4),OU4	;\
+	shrl	$8,I4E			;\
+	movzbl	I1H,%edi		;\
+	movzbl	I2H,%esi		;\
+	shrl	$8,I3E			;\
+	xorl	p1+3*tlen(,I4R,4),OU1	;\
+	xorl	p1+3*tlen(,%rdi,4),OU2	;\
+	xorl	p1+3*tlen(,%rsi,4),OU3	;\
+	xorl	p1+3*tlen(,I3R,4),OU4
+
+// One inverse (decryption) round. p1 = lookup table, p2 = round key offset
+// from %rbp. Entered with the previous round column values in I1E-I4E (xB/xH/xR
+// name the same registers); these and rdi, rsi, r8, r13 are clobbered. Result: OU1-OU4.
+
+#define inv_rnd(p1,p2,I1E,I1B,I1R,I2E,I2B,I2R,I3E,I3B,I3H,I4E,I4B,I4H,OU1,OU2,OU3,OU4) \
+	movl	p2+12(%rbp),OU4		;\
+	movl	p2+8(%rbp),OU3		;\
+	movl	p2+4(%rbp),OU2		;\
+	movl	p2(%rbp),OU1		;\
+	movzbl	I4B,%edi		;\
+	movzbl	I3B,%esi		;\
+	movzbl	I2B,%r8d		;\
+	movzbl	I1B,%r13d		;\
+	shrl	$8,I2E			;\
+	shrl	$8,I1E			;\
+	xorl	p1(,%rdi,4),OU4		;\
+	xorl	p1(,%rsi,4),OU3		;\
+	xorl	p1(,%r8,4),OU2		;\
+	xorl	p1(,%r13,4),OU1		;\
+	movzbl	I3H,%esi		;\
+	movzbl	I2B,%r8d		;\
+	movzbl	I1B,%r13d		;\
+	movzbl	I4H,%edi		;\
+	shrl	$8,I2E			;\
+	shrl	$8,I1E			;\
+	xorl	p1+tlen(,%rsi,4),OU4	;\
+	xorl	p1+tlen(,%r8,4),OU3	;\
+	xorl	p1+tlen(,%r13,4),OU2	;\
+	xorl	p1+tlen(,%rdi,4),OU1	;\
+	shrl	$16,I4E			;\
+	shrl	$16,I3E			;\
+	movzbl	I2B,%r8d		;\
+	movzbl	I1B,%r13d		;\
+	movzbl	I4B,%edi		;\
+	movzbl	I3B,%esi		;\
+	xorl	p1+2*tlen(,%r8,4),OU4	;\
+	xorl	p1+2*tlen(,%r13,4),OU3	;\
+	xorl	p1+2*tlen(,%rdi,4),OU2	;\
+	xorl	p1+2*tlen(,%rsi,4),OU1	;\
+	shrl	$8,I1E			;\
+	movzbl	I4H,%edi		;\
+	movzbl	I3H,%esi		;\
+	shrl	$8,I2E			;\
+	xorl	p1+3*tlen(,I1R,4),OU4	;\
+	xorl	p1+3*tlen(,%rdi,4),OU3	;\
+	xorl	p1+3*tlen(,%rsi,4),OU2	;\
+	xorl	p1+3*tlen(,I2R,4),OU1
+
+// AES (Rijndael) Encryption Subroutine
+
+// rdi = pointer to AES context
+// rsi = pointer to input plaintext bytes
+// rdx = pointer to output ciphertext bytes
+
+	.text
+	.align	ALIGN64BYTES
+aes_encrypt:
+	movl	(%rsi),%eax		// read in plaintext
+	movl	4(%rsi),%ecx
+	movl	8(%rsi),%r10d
+	movl	12(%rsi),%r11d
+
+	pushq	%rbp			// rbp is reused as the round key pointer
+	leaq	ekey+16(%rdi),%rbp	// encryption key pointer
+	movq	%rdx,%r9		// pointer to out block
+	movl	nrnd(%rdi),%edx		// number of rounds
+	pushq	%rbx			// save the other preserved registers used below
+	pushq	%r13
+	pushq	%r14
+	pushq	%r15
+
+	xorl	-16(%rbp),%eax		// xor in first round key
+	xorl	-12(%rbp),%ecx
+	xorl	-8(%rbp),%r10d
+	xorl	-4(%rbp),%r11d
+
+	subl	$10,%edx		// nrnd == 10 ?
+	je	aes_15			//   yes: only the ten common rounds remain
+	addq	$32,%rbp		// step over two more round keys
+	subl	$2,%edx			// nrnd == 12 ?
+	je	aes_13			//   yes: two extra rounds, then the common ones
+	addq	$32,%rbp		// otherwise four extra rounds
+
+	fwd_rnd(aes_ft_tab,-64,%eax,%al,%ah,%ecx,%cl,%ch,%r10d,%r10b,%r10,%r11d,%r11b,%r11,%ebx,%edx,%r14d,%r15d)
+	fwd_rnd(aes_ft_tab,-48,%ebx,%bl,%bh,%edx,%dl,%dh,%r14d,%r14b,%r14,%r15d,%r15b,%r15,%eax,%ecx,%r10d,%r11d)
+	jmp	aes_13			// hop over the alignment padding
+	.align	ALIGN64BYTES
+aes_13:	fwd_rnd(aes_ft_tab,-32,%eax,%al,%ah,%ecx,%cl,%ch,%r10d,%r10b,%r10,%r11d,%r11b,%r11,%ebx,%edx,%r14d,%r15d)
+	fwd_rnd(aes_ft_tab,-16,%ebx,%bl,%bh,%edx,%dl,%dh,%r14d,%r14b,%r14,%r15d,%r15b,%r15,%eax,%ecx,%r10d,%r11d)
+	jmp	aes_15			// hop over the alignment padding
+	.align	ALIGN64BYTES
+aes_15:	fwd_rnd(aes_ft_tab,0,  %eax,%al,%ah,%ecx,%cl,%ch,%r10d,%r10b,%r10,%r11d,%r11b,%r11,%ebx,%edx,%r14d,%r15d)
+	fwd_rnd(aes_ft_tab,16, %ebx,%bl,%bh,%edx,%dl,%dh,%r14d,%r14b,%r14,%r15d,%r15b,%r15,%eax,%ecx,%r10d,%r11d)
+	fwd_rnd(aes_ft_tab,32, %eax,%al,%ah,%ecx,%cl,%ch,%r10d,%r10b,%r10,%r11d,%r11b,%r11,%ebx,%edx,%r14d,%r15d)
+	fwd_rnd(aes_ft_tab,48, %ebx,%bl,%bh,%edx,%dl,%dh,%r14d,%r14b,%r14,%r15d,%r15b,%r15,%eax,%ecx,%r10d,%r11d)
+	fwd_rnd(aes_ft_tab,64, %eax,%al,%ah,%ecx,%cl,%ch,%r10d,%r10b,%r10,%r11d,%r11b,%r11,%ebx,%edx,%r14d,%r15d)
+	fwd_rnd(aes_ft_tab,80, %ebx,%bl,%bh,%edx,%dl,%dh,%r14d,%r14b,%r14,%r15d,%r15b,%r15,%eax,%ecx,%r10d,%r11d)
+	fwd_rnd(aes_ft_tab,96, %eax,%al,%ah,%ecx,%cl,%ch,%r10d,%r10b,%r10,%r11d,%r11b,%r11,%ebx,%edx,%r14d,%r15d)
+	fwd_rnd(aes_ft_tab,112,%ebx,%bl,%bh,%edx,%dl,%dh,%r14d,%r14b,%r14,%r15d,%r15b,%r15,%eax,%ecx,%r10d,%r11d)
+	fwd_rnd(aes_ft_tab,128,%eax,%al,%ah,%ecx,%cl,%ch,%r10d,%r10b,%r10,%r11d,%r11b,%r11,%ebx,%edx,%r14d,%r15d)
+	fwd_rnd(aes_fl_tab,144,%ebx,%bl,%bh,%edx,%dl,%dh,%r14d,%r14b,%r14,%r15d,%r15b,%r15,%eax,%ecx,%r10d,%r11d)	// final round: aes_fl_tab
+
+	popq	%r15			// restore in reverse order of the pushes
+	popq	%r14
+	popq	%r13
+	popq	%rbx
+	popq	%rbp
+
+	movl	%eax,(%r9)		// move final values to the output array.
+	movl	%ecx,4(%r9)
+	movl	%r10d,8(%r9)
+	movl	%r11d,12(%r9)
+	ret
+
+// AES (Rijndael) Decryption Subroutine
+
+// rdi = pointer to AES context
+// rsi = pointer to input ciphertext bytes
+// rdx = pointer to output plaintext bytes
+
+	.align	ALIGN64BYTES
+aes_decrypt:
+	movl	12(%rsi),%eax		// read in ciphertext
+	movl	8(%rsi),%ecx
+	movl	4(%rsi),%r10d
+	movl	(%rsi),%r11d
+
+	pushq	%rbp			// rbp is reused as the round key pointer
+	leaq	dkey+16(%rdi),%rbp	// decryption key pointer
+	movq	%rdx,%r9		// pointer to out block
+	movl	nrnd(%rdi),%edx		// number of rounds
+	pushq	%rbx			// save the other preserved registers used below
+	pushq	%r13
+	pushq	%r14
+	pushq	%r15
+
+	xorl	-4(%rbp),%eax		// xor in first round key
+	xorl	-8(%rbp),%ecx
+	xorl	-12(%rbp),%r10d
+	xorl	-16(%rbp),%r11d
+
+	subl	$10,%edx		// nrnd == 10 ?
+	je	aes_25			//   yes: only the ten common rounds remain
+	addq	$32,%rbp		// step over two more round keys
+	subl	$2,%edx			// nrnd == 12 ?
+	je	aes_23			//   yes: two extra rounds, then the common ones
+	addq	$32,%rbp		// otherwise four extra rounds
+
+	inv_rnd(aes_it_tab,-64,%r11d,%r11b,%r11,%r10d,%r10b,%r10,%ecx,%cl,%ch,%eax,%al,%ah,%r15d,%r14d,%edx,%ebx)
+	inv_rnd(aes_it_tab,-48,%r15d,%r15b,%r15,%r14d,%r14b,%r14,%edx,%dl,%dh,%ebx,%bl,%bh,%r11d,%r10d,%ecx,%eax)
+	jmp	aes_23			// hop over the alignment padding
+	.align	ALIGN64BYTES
+aes_23:	inv_rnd(aes_it_tab,-32,%r11d,%r11b,%r11,%r10d,%r10b,%r10,%ecx,%cl,%ch,%eax,%al,%ah,%r15d,%r14d,%edx,%ebx)
+	inv_rnd(aes_it_tab,-16,%r15d,%r15b,%r15,%r14d,%r14b,%r14,%edx,%dl,%dh,%ebx,%bl,%bh,%r11d,%r10d,%ecx,%eax)
+	jmp	aes_25			// hop over the alignment padding
+	.align	ALIGN64BYTES
+aes_25:	inv_rnd(aes_it_tab,0,  %r11d,%r11b,%r11,%r10d,%r10b,%r10,%ecx,%cl,%ch,%eax,%al,%ah,%r15d,%r14d,%edx,%ebx)
+	inv_rnd(aes_it_tab,16, %r15d,%r15b,%r15,%r14d,%r14b,%r14,%edx,%dl,%dh,%ebx,%bl,%bh,%r11d,%r10d,%ecx,%eax)
+	inv_rnd(aes_it_tab,32, %r11d,%r11b,%r11,%r10d,%r10b,%r10,%ecx,%cl,%ch,%eax,%al,%ah,%r15d,%r14d,%edx,%ebx)
+	inv_rnd(aes_it_tab,48, %r15d,%r15b,%r15,%r14d,%r14b,%r14,%edx,%dl,%dh,%ebx,%bl,%bh,%r11d,%r10d,%ecx,%eax)
+	inv_rnd(aes_it_tab,64, %r11d,%r11b,%r11,%r10d,%r10b,%r10,%ecx,%cl,%ch,%eax,%al,%ah,%r15d,%r14d,%edx,%ebx)
+	inv_rnd(aes_it_tab,80, %r15d,%r15b,%r15,%r14d,%r14b,%r14,%edx,%dl,%dh,%ebx,%bl,%bh,%r11d,%r10d,%ecx,%eax)
+	inv_rnd(aes_it_tab,96, %r11d,%r11b,%r11,%r10d,%r10b,%r10,%ecx,%cl,%ch,%eax,%al,%ah,%r15d,%r14d,%edx,%ebx)
+	inv_rnd(aes_it_tab,112,%r15d,%r15b,%r15,%r14d,%r14b,%r14,%edx,%dl,%dh,%ebx,%bl,%bh,%r11d,%r10d,%ecx,%eax)
+	inv_rnd(aes_it_tab,128,%r11d,%r11b,%r11,%r10d,%r10b,%r10,%ecx,%cl,%ch,%eax,%al,%ah,%r15d,%r14d,%edx,%ebx)
+	inv_rnd(aes_il_tab,144,%r15d,%r15b,%r15,%r14d,%r14b,%r14,%edx,%dl,%dh,%ebx,%bl,%bh,%r11d,%r10d,%ecx,%eax)	// final round: aes_il_tab
+
+	popq	%r15			// restore in reverse order of the pushes
+	popq	%r14
+	popq	%r13
+	popq	%rbx
+	popq	%rbp
+
+	movl	%eax,12(%r9)		// move final values to the output array.
+	movl	%ecx,8(%r9)
+	movl	%r10d,4(%r9)
+	movl	%r11d,(%r9)
+	ret
+
+// AES (Rijndael) Key Schedule Subroutine
+
+// This macro performs a column mixing operation on an input 32-bit
+// word to give a 32-bit result. It uses each of the 4 bytes in the
+// input column (%ebx) to index 4 different tables of 256 32-bit words
+// that are xored together into %eax. Clobbers %ecx; %ebx ends rotated by 16.
+
+#define mix_col(p1)			 \
+	movzbl	%bl,%ecx		;\
+	movl	p1(,%rcx,4),%eax	;\
+	movzbl	%bh,%ecx		;\
+	ror	$16,%ebx		;\
+	xorl	p1+tlen(,%rcx,4),%eax	;\
+	movzbl	%bl,%ecx		;\
+	xorl	p1+2*tlen(,%rcx,4),%eax	;\
+	movzbl	%bh,%ecx		;\
+	xorl	p1+3*tlen(,%rcx,4),%eax
+
+// Key Schedule Macros: ksc4/ksc6/ksc8 emit step p1 of the key schedule for 4-, 6- and 8-word keys, storing relative to %rdi
+
+#define ksc4(p1)			 \
+	rol	$24,%ebx		;\
+	mix_col(aes_fl_tab)		;\
+	ror	$8,%ebx			;\
+	xorl	4*p1+aes_rcon_tab,%eax	;\
+	xorl	%eax,%esi		;\
+	xorl	%esi,%ebp		;\
+	movl	%esi,16*p1(%rdi)	;\
+	movl	%ebp,16*p1+4(%rdi)	;\
+	xorl	%ebp,%edx		;\
+	xorl	%edx,%ebx		;\
+	movl	%edx,16*p1+8(%rdi)	;\
+	movl	%ebx,16*p1+12(%rdi)
+
+#define ksc6(p1)			 \
+	rol	$24,%ebx		;\
+	mix_col(aes_fl_tab)		;\
+	ror	$8,%ebx			;\
+	xorl	4*p1+aes_rcon_tab,%eax	;\
+	xorl	24*p1-24(%rdi),%eax	;\
+	movl	%eax,24*p1(%rdi)	;\
+	xorl	24*p1-20(%rdi),%eax	;\
+	movl	%eax,24*p1+4(%rdi)	;\
+	xorl	%eax,%esi		;\
+	xorl	%esi,%ebp		;\
+	movl	%esi,24*p1+8(%rdi)	;\
+	movl	%ebp,24*p1+12(%rdi)	;\
+	xorl	%ebp,%edx		;\
+	xorl	%edx,%ebx		;\
+	movl	%edx,24*p1+16(%rdi)	;\
+	movl	%ebx,24*p1+20(%rdi)
+
+#define ksc8(p1)			 \
+	rol	$24,%ebx		;\
+	mix_col(aes_fl_tab)		;\
+	ror	$8,%ebx			;\
+	xorl	4*p1+aes_rcon_tab,%eax	;\
+	xorl	32*p1-32(%rdi),%eax	;\
+	movl	%eax,32*p1(%rdi)	;\
+	xorl	32*p1-28(%rdi),%eax	;\
+	movl	%eax,32*p1+4(%rdi)	;\
+	xorl	32*p1-24(%rdi),%eax	;\
+	movl	%eax,32*p1+8(%rdi)	;\
+	xorl	32*p1-20(%rdi),%eax	;\
+	movl	%eax,32*p1+12(%rdi)	;\
+	pushq	%rbx			;\
+	movl	%eax,%ebx		;\
+	mix_col(aes_fl_tab)		;\
+	popq	%rbx			;\
+	xorl	%eax,%esi		;\
+	xorl	%esi,%ebp		;\
+	movl	%esi,32*p1+16(%rdi)	;\
+	movl	%ebp,32*p1+20(%rdi)	;\
+	xorl	%ebp,%edx		;\
+	xorl	%edx,%ebx		;\
+	movl	%edx,32*p1+24(%rdi)	;\
+	movl	%ebx,32*p1+28(%rdi)
+
+// rdi = pointer to AES context
+// rsi = pointer to key bytes
+// rdx = key length, bytes or bits
+// rcx = ed_flag, 1=encrypt only, 0=both encrypt and decrypt
+
+	.align	ALIGN64BYTES
+aes_set_key:
+	pushfq				// save flags: cld below changes the direction flag
+	pushq	%rbp
+	pushq	%rbx
+
+	movq	%rcx,%r11		// ed_flg
+	movq	%rdx,%rcx		// key length
+	movq	%rdi,%r10		// AES context
+
+	cmpl	$128,%ecx		// values >= 128 are taken as a length in bits
+	jb	aes_30
+	shrl	$3,%ecx			// bits -> bytes
+aes_30:	cmpl	$32,%ecx
+	je	aes_32
+	cmpl	$24,%ecx
+	je	aes_32
+	movl	$16,%ecx		// anything other than 24 or 32 bytes falls back to 16
+aes_32:	shrl	$2,%ecx			// bytes -> 32-bit words (4, 6 or 8)
+	movl	%ecx,nkey(%r10)
+	leaq	6(%rcx),%rax		// 10/12/14 for 4/6/8 32-bit key length
+	movl	%eax,nrnd(%r10)
+	leaq	ekey(%r10),%rdi		// key position in AES context
+	cld				// string instructions count upwards
+	movl	%ecx,%eax		// save key length in eax
+	rep ;	movsl			// words in the key schedule
+	movl	-4(%rsi),%ebx		// put some values in registers
+	movl	-8(%rsi),%edx		// to allow faster code
+	movl	-12(%rsi),%ebp
+	movl	-16(%rsi),%esi
+
+	cmpl	$4,%eax			// jump on key size
+	je	aes_36
+	cmpl	$6,%eax
+	je	aes_35
+
+	ksc8(0)
+	ksc8(1)
+	ksc8(2)
+	ksc8(3)
+	ksc8(4)
+	ksc8(5)
+	ksc8(6)
+	jmp	aes_37
+aes_35:	ksc6(0)
+	ksc6(1)
+	ksc6(2)
+	ksc6(3)
+	ksc6(4)
+	ksc6(5)
+	ksc6(6)
+	ksc6(7)
+	jmp	aes_37
+aes_36:	ksc4(0)
+	ksc4(1)
+	ksc4(2)
+	ksc4(3)
+	ksc4(4)
+	ksc4(5)
+	ksc4(6)
+	ksc4(7)
+	ksc4(8)
+	ksc4(9)
+aes_37:	cmpl	$0,%r11d		// ed_flg
+	jne	aes_39			// non-zero: skip the decryption schedule
+
+// compile decryption key schedule from encryption schedule - reverse
+// order and do mix_column operation on round keys except first and last
+
+	movl	nrnd(%r10),%eax		// kt = cx->d_key + nc * cx->Nrnd
+	shl	$2,%rax			// nrnd * 4 words per round key
+	leaq	dkey(%r10,%rax,4),%rdi	// rdi = dkey + 16 * nrnd
+	leaq	ekey(%r10),%rsi		// kf = cx->e_key
+
+	movsq				// copy first round key (unmodified)
+	movsq
+	subq	$32,%rdi		// back up to the preceding 16-byte slot
+	movl	$1,%r9d			// round counter
+aes_38:					// do mix column on each column of
+	lodsl				// each round key
+	movl	%eax,%ebx
+	mix_col(aes_im_tab)
+	stosl
+	lodsl
+	movl	%eax,%ebx
+	mix_col(aes_im_tab)
+	stosl
+	lodsl
+	movl	%eax,%ebx
+	mix_col(aes_im_tab)
+	stosl
+	lodsl
+	movl	%eax,%ebx
+	mix_col(aes_im_tab)
+	stosl
+	subq	$32,%rdi		// net -16: destination walks backwards
+
+	incl	%r9d
+	cmpl	nrnd(%r10),%r9d
+	jb	aes_38			// repeat for rounds 1 .. nrnd-1
+
+	movsq				// copy last round key (unmodified)
+	movsq
+aes_39:	popq	%rbx
+	popq	%rbp
+	popfq				// restores the flags saved on entry
+	ret
+
+
+// GF(2^8) finite field multiplies by {02}, {04} and {08}; 0x11b is xored in to reduce bits shifted past bit 7
+
+#define f2(x)	((x<<1)^(((x>>7)&1)*0x11b))
+#define f4(x)	((x<<2)^(((x>>6)&1)*0x11b)^(((x>>6)&2)*0x11b))
+#define f8(x)	((x<<3)^(((x>>5)&1)*0x11b)^(((x>>5)&2)*0x11b)^(((x>>5)&4)*0x11b))
+
+// finite field multiplies required in table generation ({03}, {09}, {0b}, {0d}, {0e})
+
+#define f3(x)	(f2(x) ^ x)
+#define f9(x)	(f8(x) ^ x)
+#define fb(x)	(f8(x) ^ f2(x) ^ x)
+#define fd(x)	(f8(x) ^ f4(x) ^ x)
+#define fe(x)	(f8(x) ^ f4(x) ^ f2(x))
+
+// These defines generate the forward table entries (u1, u2, u3 are u0 rotated left by 8, 16, 24 bits)
+
+#define u0(x)	((f3(x) << 24) | (x << 16) | (x << 8) | f2(x))
+#define u1(x)	((x << 24) | (x << 16) | (f2(x) << 8) | f3(x))
+#define u2(x)	((x << 24) | (f2(x) << 16) | (f3(x) << 8) | x)
+#define u3(x)	((f2(x) << 24) | (f3(x) << 16) | (x << 8) | x)
+
+// These defines generate the inverse table entries (v1, v2, v3 are v0 rotated left by 8, 16, 24 bits)
+
+#define v0(x)	((fb(x) << 24) | (fd(x) << 16) | (f9(x) << 8) | fe(x))
+#define v1(x)	((fd(x) << 24) | (f9(x) << 16) | (fe(x) << 8) | fb(x))
+#define v2(x)	((f9(x) << 24) | (fe(x) << 16) | (fb(x) << 8) | fd(x))
+#define v3(x)	((fe(x) << 24) | (fb(x) << 16) | (fd(x) << 8) | f9(x))
+
+// These defines generate entries for the last round tables (the byte placed in lane 0, 1, 2 or 3)
+
+#define w0(x)	(x)
+#define w1(x)	(x <<  8)
+#define w2(x)	(x << 16)
+#define w3(x)	(x << 24)
+
+// macro to generate inverse mix column tables (needed for the key schedule)
+
+#define im_data0(p1) \
+	.long	p1(0x00),p1(0x01),p1(0x02),p1(0x03),p1(0x04),p1(0x05),p1(0x06),p1(0x07) ;\
+	.long	p1(0x08),p1(0x09),p1(0x0a),p1(0x0b),p1(0x0c),p1(0x0d),p1(0x0e),p1(0x0f) ;\
+	.long	p1(0x10),p1(0x11),p1(0x12),p1(0x13),p1(0x14),p1(0x15),p1(0x16),p1(0x17) ;\
+	.long	p1(0x18),p1(0x19),p1(0x1a),p1(0x1b),p1(0x1c),p1(0x1d),p1(0x1e),p1(0x1f)
+#define im_data1(p1) \
+	.long	p1(0x20),p1(0x21),p1(0x22),p1(0x23),p1(0x24),p1(0x25),p1(0x26),p1(0x27) ;\
+	.long	p1(0x28),p1(0x29),p1(0x2a),p1(0x2b),p1(0x2c),p1(0x2d),p1(0x2e),p1(0x2f) ;\
+	.long	p1(0x30),p1(0x31),p1(0x32),p1(0x33),p1(0x34),p1(0x35),p1(0x36),p1(0x37) ;\
+	.long	p1(0x38),p1(0x39),p1(0x3a),p1(0x3b),p1(0x3c),p1(0x3d),p1(0x3e),p1(0x3f)
+#define im_data2(p1) \
+	.long	p1(0x40),p1(0x41),p1(0x42),p1(0x43),p1(0x44),p1(0x45),p1(0x46),p1(0x47) ;\
+	.long	p1(0x48),p1(0x49),p1(0x4a),p1(0x4b),p1(0x4c),p1(0x4d),p1(0x4e),p1(0x4f) ;\
+	.long	p1(0x50),p1(0x51),p1(0x52),p1(0x53),p1(0x54),p1(0x55),p1(0x56),p1(0x57) ;\
+	.long	p1(0x58),p1(0x59),p1(0x5a),p1(0x5b),p1(0x5c),p1(0x5d),p1(0x5e),p1(0x5f)
+#define im_data3(p1) \
+	.long	p1(0x60),p1(0x61),p1(0x62),p1(0x63),p1(0x64),p1(0x65),p1(0x66),p1(0x67) ;\
+	.long	p1(0x68),p1(0x69),p1(0x6a),p1(0x6b),p1(0x6c),p1(0x6d),p1(0x6e),p1(0x6f) ;\
+	.long	p1(0x70),p1(0x71),p1(0x72),p1(0x73),p1(0x74),p1(0x75),p1(0x76),p1(0x77) ;\
+	.long	p1(0x78),p1(0x79),p1(0x7a),p1(0x7b),p1(0x7c),p1(0x7d),p1(0x7e),p1(0x7f)
+#define im_data4(p1) \
+	.long	p1(0x80),p1(0x81),p1(0x82),p1(0x83),p1(0x84),p1(0x85),p1(0x86),p1(0x87) ;\
+	.long	p1(0x88),p1(0x89),p1(0x8a),p1(0x8b),p1(0x8c),p1(0x8d),p1(0x8e),p1(0x8f) ;\
+	.long	p1(0x90),p1(0x91),p1(0x92),p1(0x93),p1(0x94),p1(0x95),p1(0x96),p1(0x97) ;\
+	.long	p1(0x98),p1(0x99),p1(0x9a),p1(0x9b),p1(0x9c),p1(0x9d),p1(0x9e),p1(0x9f)
+#define im_data5(p1) \
+	.long	p1(0xa0),p1(0xa1),p1(0xa2),p1(0xa3),p1(0xa4),p1(0xa5),p1(0xa6),p1(0xa7) ;\
+	.long	p1(0xa8),p1(0xa9),p1(0xaa),p1(0xab),p1(0xac),p1(0xad),p1(0xae),p1(0xaf) ;\
+	.long	p1(0xb0),p1(0xb1),p1(0xb2),p1(0xb3),p1(0xb4),p1(0xb5),p1(0xb6),p1(0xb7) ;\
+	.long	p1(0xb8),p1(0xb9),p1(0xba),p1(0xbb),p1(0xbc),p1(0xbd),p1(0xbe),p1(0xbf)
+#define im_data6(p1) \
+	.long	p1(0xc0),p1(0xc1),p1(0xc2),p1(0xc3),p1(0xc4),p1(0xc5),p1(0xc6),p1(0xc7) ;\
+	.long	p1(0xc8),p1(0xc9),p1(0xca),p1(0xcb),p1(0xcc),p1(0xcd),p1(0xce),p1(0xcf) ;\
+	.long	p1(0xd0),p1(0xd1),p1(0xd2),p1(0xd3),p1(0xd4),p1(0xd5),p1(0xd6),p1(0xd7) ;\
+	.long	p1(0xd8),p1(0xd9),p1(0xda),p1(0xdb),p1(0xdc),p1(0xdd),p1(0xde),p1(0xdf)
+#define im_data7(p1) \
+	.long	p1(0xe0),p1(0xe1),p1(0xe2),p1(0xe3),p1(0xe4),p1(0xe5),p1(0xe6),p1(0xe7) ;\
+	.long	p1(0xe8),p1(0xe9),p1(0xea),p1(0xeb),p1(0xec),p1(0xed),p1(0xee),p1(0xef) ;\
+	.long	p1(0xf0),p1(0xf1),p1(0xf2),p1(0xf3),p1(0xf4),p1(0xf5),p1(0xf6),p1(0xf7) ;\
+	.long	p1(0xf8),p1(0xf9),p1(0xfa),p1(0xfb),p1(0xfc),p1(0xfd),p1(0xfe),p1(0xff)
+
+// S-box data - 256 entries
+
+#define sb_data0(p1) \
+	.long	p1(0x63),p1(0x7c),p1(0x77),p1(0x7b),p1(0xf2),p1(0x6b),p1(0x6f),p1(0xc5) ;\
+	.long	p1(0x30),p1(0x01),p1(0x67),p1(0x2b),p1(0xfe),p1(0xd7),p1(0xab),p1(0x76) ;\
+	.long	p1(0xca),p1(0x82),p1(0xc9),p1(0x7d),p1(0xfa),p1(0x59),p1(0x47),p1(0xf0) ;\
+	.long	p1(0xad),p1(0xd4),p1(0xa2),p1(0xaf),p1(0x9c),p1(0xa4),p1(0x72),p1(0xc0)
+#define sb_data1(p1) \
+	.long	p1(0xb7),p1(0xfd),p1(0x93),p1(0x26),p1(0x36),p1(0x3f),p1(0xf7),p1(0xcc) ;\
+	.long	p1(0x34),p1(0xa5),p1(0xe5),p1(0xf1),p1(0x71),p1(0xd8),p1(0x31),p1(0x15) ;\
+	.long	p1(0x04),p1(0xc7),p1(0x23),p1(0xc3),p1(0x18),p1(0x96),p1(0x05),p1(0x9a) ;\
+	.long	p1(0x07),p1(0x12),p1(0x80),p1(0xe2),p1(0xeb),p1(0x27),p1(0xb2),p1(0x75)
+#define sb_data2(p1) \
+	.long	p1(0x09),p1(0x83),p1(0x2c),p1(0x1a),p1(0x1b),p1(0x6e),p1(0x5a),p1(0xa0) ;\
+	.long	p1(0x52),p1(0x3b),p1(0xd6),p1(0xb3),p1(0x29),p1(0xe3),p1(0x2f),p1(0x84) ;\
+	.long	p1(0x53),p1(0xd1),p1(0x00),p1(0xed),p1(0x20),p1(0xfc),p1(0xb1),p1(0x5b) ;\
+	.long	p1(0x6a),p1(0xcb),p1(0xbe),p1(0x39),p1(0x4a),p1(0x4c),p1(0x58),p1(0xcf)
+#define sb_data3(p1) \
+	.long	p1(0xd0),p1(0xef),p1(0xaa),p1(0xfb),p1(0x43),p1(0x4d),p1(0x33),p1(0x85) ;\
+	.long	p1(0x45),p1(0xf9),p1(0x02),p1(0x7f),p1(0x50),p1(0x3c),p1(0x9f),p1(0xa8) ;\
+	.long	p1(0x51),p1(0xa3),p1(0x40),p1(0x8f),p1(0x92),p1(0x9d),p1(0x38),p1(0xf5) ;\
+	.long	p1(0xbc),p1(0xb6),p1(0xda),p1(0x21),p1(0x10),p1(0xff),p1(0xf3),p1(0xd2)
+#define sb_data4(p1) \
+	.long	p1(0xcd),p1(0x0c),p1(0x13),p1(0xec),p1(0x5f),p1(0x97),p1(0x44),p1(0x17) ;\
+	.long	p1(0xc4),p1(0xa7),p1(0x7e),p1(0x3d),p1(0x64),p1(0x5d),p1(0x19),p1(0x73) ;\
+	.long	p1(0x60),p1(0x81),p1(0x4f),p1(0xdc),p1(0x22),p1(0x2a),p1(0x90),p1(0x88) ;\
+	.long	p1(0x46),p1(0xee),p1(0xb8),p1(0x14),p1(0xde),p1(0x5e),p1(0x0b),p1(0xdb)
+#define sb_data5(p1) \
+	.long	p1(0xe0),p1(0x32),p1(0x3a),p1(0x0a),p1(0x49),p1(0x06),p1(0x24),p1(0x5c) ;\
+	.long	p1(0xc2),p1(0xd3),p1(0xac),p1(0x62),p1(0x91),p1(0x95),p1(0xe4),p1(0x79) ;\
+	.long	p1(0xe7),p1(0xc8),p1(0x37),p1(0x6d),p1(0x8d),p1(0xd5),p1(0x4e),p1(0xa9) ;\
+	.long	p1(0x6c),p1(0x56),p1(0xf4),p1(0xea),p1(0x65),p1(0x7a),p1(0xae),p1(0x08)
+#define sb_data6(p1) \
+	.long	p1(0xba),p1(0x78),p1(0x25),p1(0x2e),p1(0x1c),p1(0xa6),p1(0xb4),p1(0xc6) ;\
+	.long	p1(0xe8),p1(0xdd),p1(0x74),p1(0x1f),p1(0x4b),p1(0xbd),p1(0x8b),p1(0x8a) ;\
+	.long	p1(0x70),p1(0x3e),p1(0xb5),p1(0x66),p1(0x48),p1(0x03),p1(0xf6),p1(0x0e) ;\
+	.long	p1(0x61),p1(0x35),p1(0x57),p1(0xb9),p1(0x86),p1(0xc1),p1(0x1d),p1(0x9e)
+#define sb_data7(p1) \
+	.long	p1(0xe1),p1(0xf8),p1(0x98),p1(0x11),p1(0x69),p1(0xd9),p1(0x8e),p1(0x94) ;\
+	.long	p1(0x9b),p1(0x1e),p1(0x87),p1(0xe9),p1(0xce),p1(0x55),p1(0x28),p1(0xdf) ;\
+	.long	p1(0x8c),p1(0xa1),p1(0x89),p1(0x0d),p1(0xbf),p1(0xe6),p1(0x42),p1(0x68) ;\
+	.long	p1(0x41),p1(0x99),p1(0x2d),p1(0x0f),p1(0xb0),p1(0x54),p1(0xbb),p1(0x16)
+
+// Inverse S-box data - 256 entries
+
+#define ib_data0(p1) \
+	.long	p1(0x52),p1(0x09),p1(0x6a),p1(0xd5),p1(0x30),p1(0x36),p1(0xa5),p1(0x38) ;\
+	.long	p1(0xbf),p1(0x40),p1(0xa3),p1(0x9e),p1(0x81),p1(0xf3),p1(0xd7),p1(0xfb) ;\
+	.long	p1(0x7c),p1(0xe3),p1(0x39),p1(0x82),p1(0x9b),p1(0x2f),p1(0xff),p1(0x87) ;\
+	.long	p1(0x34),p1(0x8e),p1(0x43),p1(0x44),p1(0xc4),p1(0xde),p1(0xe9),p1(0xcb)
+#define ib_data1(p1) \
+	.long	p1(0x54),p1(0x7b),p1(0x94),p1(0x32),p1(0xa6),p1(0xc2),p1(0x23),p1(0x3d) ;\
+	.long	p1(0xee),p1(0x4c),p1(0x95),p1(0x0b),p1(0x42),p1(0xfa),p1(0xc3),p1(0x4e) ;\
+	.long	p1(0x08),p1(0x2e),p1(0xa1),p1(0x66),p1(0x28),p1(0xd9),p1(0x24),p1(0xb2) ;\
+	.long	p1(0x76),p1(0x5b),p1(0xa2),p1(0x49),p1(0x6d),p1(0x8b),p1(0xd1),p1(0x25)
+#define ib_data2(p1) \
+	.long	p1(0x72),p1(0xf8),p1(0xf6),p1(0x64),p1(0x86),p1(0x68),p1(0x98),p1(0x16) ;\
+	.long	p1(0xd4),p1(0xa4),p1(0x5c),p1(0xcc),p1(0x5d),p1(0x65),p1(0xb6),p1(0x92) ;\
+	.long	p1(0x6c),p1(0x70),p1(0x48),p1(0x50),p1(0xfd),p1(0xed),p1(0xb9),p1(0xda) ;\
+	.long	p1(0x5e),p1(0x15),p1(0x46),p1(0x57),p1(0xa7),p1(0x8d),p1(0x9d),p1(0x84)
+#define ib_data3(p1) \
+	.long	p1(0x90),p1(0xd8),p1(0xab),p1(0x00),p1(0x8c),p1(0xbc),p1(0xd3),p1(0x0a) ;\
+	.long	p1(0xf7),p1(0xe4),p1(0x58),p1(0x05),p1(0xb8),p1(0xb3),p1(0x45),p1(0x06) ;\
+	.long	p1(0xd0),p1(0x2c),p1(0x1e),p1(0x8f),p1(0xca),p1(0x3f),p1(0x0f),p1(0x02) ;\
+	.long	p1(0xc1),p1(0xaf),p1(0xbd),p1(0x03),p1(0x01),p1(0x13),p1(0x8a),p1(0x6b)
+#define ib_data4(p1) \
+	.long	p1(0x3a),p1(0x91),p1(0x11),p1(0x41),p1(0x4f),p1(0x67),p1(0xdc),p1(0xea) ;\
+	.long	p1(0x97),p1(0xf2),p1(0xcf),p1(0xce),p1(0xf0),p1(0xb4),p1(0xe6),p1(0x73) ;\
+	.long	p1(0x96),p1(0xac),p1(0x74),p1(0x22),p1(0xe7),p1(0xad),p1(0x35),p1(0x85) ;\
+	.long	p1(0xe2),p1(0xf9),p1(0x37),p1(0xe8),p1(0x1c),p1(0x75),p1(0xdf),p1(0x6e)
+#define ib_data5(p1) \
+	.long	p1(0x47),p1(0xf1),p1(0x1a),p1(0x71),p1(0x1d),p1(0x29),p1(0xc5),p1(0x89) ;\
+	.long	p1(0x6f),p1(0xb7),p1(0x62),p1(0x0e),p1(0xaa),p1(0x18),p1(0xbe),p1(0x1b) ;\
+	.long	p1(0xfc),p1(0x56),p1(0x3e),p1(0x4b),p1(0xc6),p1(0xd2),p1(0x79),p1(0x20) ;\
+	.long	p1(0x9a),p1(0xdb),p1(0xc0),p1(0xfe),p1(0x78),p1(0xcd),p1(0x5a),p1(0xf4)
+#define ib_data6(p1) \
+	.long	p1(0x1f),p1(0xdd),p1(0xa8),p1(0x33),p1(0x88),p1(0x07),p1(0xc7),p1(0x31) ;\
+	.long	p1(0xb1),p1(0x12),p1(0x10),p1(0x59),p1(0x27),p1(0x80),p1(0xec),p1(0x5f) ;\
+	.long	p1(0x60),p1(0x51),p1(0x7f),p1(0xa9),p1(0x19),p1(0xb5),p1(0x4a),p1(0x0d) ;\
+	.long	p1(0x2d),p1(0xe5),p1(0x7a),p1(0x9f),p1(0x93),p1(0xc9),p1(0x9c),p1(0xef)
+#define ib_data7(p1) \
+	.long	p1(0xa0),p1(0xe0),p1(0x3b),p1(0x4d),p1(0xae),p1(0x2a),p1(0xf5),p1(0xb0) ;\
+	.long	p1(0xc8),p1(0xeb),p1(0xbb),p1(0x3c),p1(0x83),p1(0x53),p1(0x99),p1(0x61) ;\
+	.long	p1(0x17),p1(0x2b),p1(0x04),p1(0x7e),p1(0xba),p1(0x77),p1(0xd6),p1(0x26) ;\
+	.long	p1(0xe1),p1(0x69),p1(0x14),p1(0x63),p1(0x55),p1(0x21),p1(0x0c),p1(0x7d)
+
+// The rcon_table (needed for the key schedule)
+//
+// Here is original Dr Brian Gladman's source code:
+//	_rcon_tab:
+//	%assign x   1
+//	%rep 29
+//	    dd  x
+//	%assign x f2(x)
+//	%endrep
+//
+// Here is precomputed output (it's more portable this way):
+
+	.section .rodata
+	.align	ALIGN64BYTES
+aes_rcon_tab:
+	.long	0x01,0x02,0x04,0x08,0x10,0x20,0x40,0x80
+	.long	0x1b,0x36,0x6c,0xd8,0xab,0x4d,0x9a,0x2f
+	.long	0x5e,0xbc,0x63,0xc6,0x97,0x35,0x6a,0xd4
+	.long	0xb3,0x7d,0xfa,0xef,0xc5
+
+// The forward xor tables
+
+	.align	ALIGN64BYTES
+aes_ft_tab:
+	sb_data0(u0)
+	sb_data1(u0)
+	sb_data2(u0)
+	sb_data3(u0)
+	sb_data4(u0)
+	sb_data5(u0)
+	sb_data6(u0)
+	sb_data7(u0)
+
+	sb_data0(u1)
+	sb_data1(u1)
+	sb_data2(u1)
+	sb_data3(u1)
+	sb_data4(u1)
+	sb_data5(u1)
+	sb_data6(u1)
+	sb_data7(u1)
+
+	sb_data0(u2)
+	sb_data1(u2)
+	sb_data2(u2)
+	sb_data3(u2)
+	sb_data4(u2)
+	sb_data5(u2)
+	sb_data6(u2)
+	sb_data7(u2)
+
+	sb_data0(u3)
+	sb_data1(u3)
+	sb_data2(u3)
+	sb_data3(u3)
+	sb_data4(u3)
+	sb_data5(u3)
+	sb_data6(u3)
+	sb_data7(u3)
+
+	.align	ALIGN64BYTES
+aes_fl_tab:
+	sb_data0(w0)
+	sb_data1(w0)
+	sb_data2(w0)
+	sb_data3(w0)
+	sb_data4(w0)
+	sb_data5(w0)
+	sb_data6(w0)
+	sb_data7(w0)
+
+	sb_data0(w1)
+	sb_data1(w1)
+	sb_data2(w1)
+	sb_data3(w1)
+	sb_data4(w1)
+	sb_data5(w1)
+	sb_data6(w1)
+	sb_data7(w1)
+
+	sb_data0(w2)
+	sb_data1(w2)
+	sb_data2(w2)
+	sb_data3(w2)
+	sb_data4(w2)
+	sb_data5(w2)
+	sb_data6(w2)
+	sb_data7(w2)
+
+	sb_data0(w3)
+	sb_data1(w3)
+	sb_data2(w3)
+	sb_data3(w3)
+	sb_data4(w3)
+	sb_data5(w3)
+	sb_data6(w3)
+	sb_data7(w3)
+
+// The inverse xor tables
+
+	.align	ALIGN64BYTES
+aes_it_tab:
+	ib_data0(v0)
+	ib_data1(v0)
+	ib_data2(v0)
+	ib_data3(v0)
+	ib_data4(v0)
+	ib_data5(v0)
+	ib_data6(v0)
+	ib_data7(v0)
+
+	ib_data0(v1)
+	ib_data1(v1)
+	ib_data2(v1)
+	ib_data3(v1)
+	ib_data4(v1)
+	ib_data5(v1)
+	ib_data6(v1)
+	ib_data7(v1)
+
+	ib_data0(v2)
+	ib_data1(v2)
+	ib_data2(v2)
+	ib_data3(v2)
+	ib_data4(v2)
+	ib_data5(v2)
+	ib_data6(v2)
+	ib_data7(v2)
+
+	ib_data0(v3)
+	ib_data1(v3)
+	ib_data2(v3)
+	ib_data3(v3)
+	ib_data4(v3)
+	ib_data5(v3)
+	ib_data6(v3)
+	ib_data7(v3)
+
+	.align	ALIGN64BYTES
+aes_il_tab:
+	ib_data0(w0)
+	ib_data1(w0)
+	ib_data2(w0)
+	ib_data3(w0)
+	ib_data4(w0)
+	ib_data5(w0)
+	ib_data6(w0)
+	ib_data7(w0)
+
+	ib_data0(w1)
+	ib_data1(w1)
+	ib_data2(w1)
+	ib_data3(w1)
+	ib_data4(w1)
+	ib_data5(w1)
+	ib_data6(w1)
+	ib_data7(w1)
+
+	ib_data0(w2)
+	ib_data1(w2)
+	ib_data2(w2)
+	ib_data3(w2)
+	ib_data4(w2)
+	ib_data5(w2)
+	ib_data6(w2)
+	ib_data7(w2)
+
+	ib_data0(w3)
+	ib_data1(w3)
+	ib_data2(w3)
+	ib_data3(w3)
+	ib_data4(w3)
+	ib_data5(w3)
+	ib_data6(w3)
+	ib_data7(w3)
+
+// The inverse mix column tables
+
+	.align	ALIGN64BYTES
+aes_im_tab:
+	im_data0(v0)
+	im_data1(v0)
+	im_data2(v0)
+	im_data3(v0)
+	im_data4(v0)
+	im_data5(v0)
+	im_data6(v0)
+	im_data7(v0)
+
+	im_data0(v1)
+	im_data1(v1)
+	im_data2(v1)
+	im_data3(v1)
+	im_data4(v1)
+	im_data5(v1)
+	im_data6(v1)
+	im_data7(v1)
+
+	im_data0(v2)
+	im_data1(v2)
+	im_data2(v2)
+	im_data3(v2)
+	im_data4(v2)
+	im_data5(v2)
+	im_data6(v2)
+	im_data7(v2)
+
+	im_data0(v3)
+	im_data1(v3)
+	im_data2(v3)
+	im_data3(v3)
+	im_data4(v3)
+	im_data5(v3)
+	im_data6(v3)
+	im_data7(v3)
diff -urN linux-2.4.28/drivers/misc/aes-x86.S linux-2.4.28-loop-AES-v3.0b/drivers/misc/aes-x86.S
--- linux-2.4.28/drivers/misc/aes-x86.S	Thu Jan  1 01:00:00 1970
+++ linux-2.4.28-loop-AES-v3.0b/drivers/misc/aes-x86.S	Sun Feb  6 18:45:39 2005
@@ -0,0 +1,922 @@
+//
+// Copyright (c) 2001, Dr Brian Gladman <brg@gladman.uk.net>, Worcester, UK.
+// All rights reserved.
+//
+// TERMS
+//
+//  Redistribution and use in source and binary forms, with or without
+//  modification, are permitted subject to the following conditions:
+//
+//  1. Redistributions of source code must retain the above copyright
+//     notice, this list of conditions and the following disclaimer.
+//
+//  2. Redistributions in binary form must reproduce the above copyright
+//     notice, this list of conditions and the following disclaimer in the
+//     documentation and/or other materials provided with the distribution.
+//
+//  3. The copyright holder's name must not be used to endorse or promote
+//     any products derived from this software without his specific prior
+//     written permission.
+//
+//  This software is provided 'as is' with no express or implied warranties
+//  of correctness or fitness for purpose.
+
+// Modified by Jari Ruusu,  December 24 2001
+//  - Converted syntax to GNU CPP/assembler syntax
+//  - C programming interface converted back to "old" API
+//  - Minor portability cleanups and speed optimizations
+
+// Modified by Jari Ruusu,  April 11 2002
+//  - Added above copyright and terms to resulting object code so that
+//    binary distributions can avoid legal trouble
+
+// An AES (Rijndael) implementation for x86 compatible processors. This
+// version uses i386 instruction set but instruction scheduling is optimized
+// for Pentium-2. This version only implements the standard AES block length
+// (128 bits, 16 bytes). This code does not preserve the eax, ecx or edx
+// registers or the arithmetic status flags. However, the ebx, esi, edi, and
+// ebp registers are preserved across calls.
+
+// void aes_set_key(aes_context *cx, const unsigned char key[], const int key_len, const int f)
+// void aes_encrypt(const aes_context *cx, const unsigned char in_blk[], unsigned char out_blk[])
+// void aes_decrypt(const aes_context *cx, const unsigned char in_blk[], unsigned char out_blk[])
+
+#if defined(USE_UNDERLINE)
+# define aes_set_key _aes_set_key
+# define aes_encrypt _aes_encrypt
+# define aes_decrypt _aes_decrypt
+#endif
+#if !defined(ALIGN32BYTES)
+# define ALIGN32BYTES 32
+#endif
+
+	.file	"aes-x86.S"
+	.globl	aes_set_key
+	.globl	aes_encrypt
+	.globl	aes_decrypt
+
+	.text
+copyright:
+	.ascii "    \000"
+	.ascii "Copyright (c) 2001, Dr Brian Gladman <brg@gladman.uk.net>, Worcester, UK.\000"
+	.ascii "All rights reserved.\000"
+	.ascii "    \000"
+	.ascii "TERMS\000"
+	.ascii "    \000"
+	.ascii " Redistribution and use in source and binary forms, with or without\000"
+	.ascii " modification, are permitted subject to the following conditions:\000"
+	.ascii "    \000"
+	.ascii " 1. Redistributions of source code must retain the above copyright\000"
+	.ascii "    notice, this list of conditions and the following disclaimer.\000"
+	.ascii "    \000"
+	.ascii " 2. Redistributions in binary form must reproduce the above copyright\000"
+	.ascii "    notice, this list of conditions and the following disclaimer in the\000"
+	.ascii "    documentation and/or other materials provided with the distribution.\000"
+	.ascii "    \000"
+	.ascii " 3. The copyright holder's name must not be used to endorse or promote\000"
+	.ascii "    any products derived from this software without his specific prior\000"
+	.ascii "    written permission.\000"
+	.ascii "    \000"
+	.ascii " This software is provided 'as is' with no express or implied warranties\000"
+	.ascii " of correctness or fitness for purpose.\000"
+	.ascii "    \000"
+
+#define tlen	1024	// length in bytes of each of 4 'xor' arrays (256 32-bit words)
+
+// %esp-relative offsets of the aes_encrypt/aes_decrypt arguments, valid once one register has been pushed
+
+#define ctx	8	// AES context structure
+#define in_blk	12	// input byte array address parameter
+#define out_blk	16	// output byte array address parameter
+
+// byte offsets of the fields in the context structure
+
+#define nkey	0	// key length in 32-bit words (4, 6 or 8), size 4
+#define nrnd	4	// number of rounds (key words + 6), size 4
+#define ekey	8	// encryption key schedule base address, size 256
+#define dkey	264	// decryption key schedule base address, size 256
+
+// fwd_rnd: one forward round. The four state columns enter and leave in %eax,
+// %ebx, %esi and %edi. p1 is the table base (4 tables, tlen bytes apart), p2
+// the round key offset from %ebp. Uses (%esp), 4(%esp); clobbers %ecx, %edx.
+
+#define fwd_rnd(p1,p2)			 \
+	mov	%ebx,(%esp)		;\
+	movzbl	%al,%edx		;\
+	mov	%eax,%ecx		;\
+	mov	p2(%ebp),%eax		;\
+	mov	%edi,4(%esp)		;\
+	mov	p2+12(%ebp),%edi	;\
+	xor	p1(,%edx,4),%eax	;\
+	movzbl	%ch,%edx		;\
+	shr	$16,%ecx		;\
+	mov	p2+4(%ebp),%ebx		;\
+	xor	p1+tlen(,%edx,4),%edi	;\
+	movzbl	%cl,%edx		;\
+	movzbl	%ch,%ecx		;\
+	xor	p1+3*tlen(,%ecx,4),%ebx	;\
+	mov	%esi,%ecx		;\
+	mov	p1+2*tlen(,%edx,4),%esi	;\
+	movzbl	%cl,%edx		;\
+	xor	p1(,%edx,4),%esi	;\
+	movzbl	%ch,%edx		;\
+	shr	$16,%ecx		;\
+	xor	p1+tlen(,%edx,4),%ebx	;\
+	movzbl	%cl,%edx		;\
+	movzbl	%ch,%ecx		;\
+	xor	p1+2*tlen(,%edx,4),%eax	;\
+	mov	(%esp),%edx		;\
+	xor	p1+3*tlen(,%ecx,4),%edi ;\
+	movzbl	%dl,%ecx		;\
+	xor	p2+8(%ebp),%esi		;\
+	xor	p1(,%ecx,4),%ebx	;\
+	movzbl	%dh,%ecx		;\
+	shr	$16,%edx		;\
+	xor	p1+tlen(,%ecx,4),%eax	;\
+	movzbl	%dl,%ecx		;\
+	movzbl	%dh,%edx		;\
+	xor	p1+2*tlen(,%ecx,4),%edi	;\
+	mov	4(%esp),%ecx		;\
+	xor	p1+3*tlen(,%edx,4),%esi ;\
+	movzbl	%cl,%edx		;\
+	xor	p1(,%edx,4),%edi	;\
+	movzbl	%ch,%edx		;\
+	shr	$16,%ecx		;\
+	xor	p1+tlen(,%edx,4),%esi	;\
+	movzbl	%cl,%edx		;\
+	movzbl	%ch,%ecx		;\
+	xor	p1+2*tlen(,%edx,4),%ebx	;\
+	xor	p1+3*tlen(,%ecx,4),%eax
+
+// inv_rnd: one inverse round. The four state columns enter and leave in %eax,
+// %ebx, %esi and %edi. p1 is the table base (4 tables, tlen bytes apart), p2
+// the round key offset from %ebp. Uses (%esp), 4(%esp); clobbers %ecx, %edx.
+
+#define inv_rnd(p1,p2)			 \
+	movzbl	%al,%edx		;\
+	mov	%ebx,(%esp)		;\
+	mov	%eax,%ecx		;\
+	mov	p2(%ebp),%eax		;\
+	mov	%edi,4(%esp)		;\
+	mov	p2+4(%ebp),%ebx		;\
+	xor	p1(,%edx,4),%eax	;\
+	movzbl	%ch,%edx		;\
+	shr	$16,%ecx		;\
+	mov	p2+12(%ebp),%edi	;\
+	xor	p1+tlen(,%edx,4),%ebx	;\
+	movzbl	%cl,%edx		;\
+	movzbl	%ch,%ecx		;\
+	xor	p1+3*tlen(,%ecx,4),%edi	;\
+	mov	%esi,%ecx		;\
+	mov	p1+2*tlen(,%edx,4),%esi	;\
+	movzbl	%cl,%edx		;\
+	xor	p1(,%edx,4),%esi	;\
+	movzbl	%ch,%edx		;\
+	shr	$16,%ecx		;\
+	xor	p1+tlen(,%edx,4),%edi	;\
+	movzbl	%cl,%edx		;\
+	movzbl	%ch,%ecx		;\
+	xor	p1+2*tlen(,%edx,4),%eax	;\
+	mov	(%esp),%edx		;\
+	xor	p1+3*tlen(,%ecx,4),%ebx ;\
+	movzbl	%dl,%ecx		;\
+	xor	p2+8(%ebp),%esi		;\
+	xor	p1(,%ecx,4),%ebx	;\
+	movzbl	%dh,%ecx		;\
+	shr	$16,%edx		;\
+	xor	p1+tlen(,%ecx,4),%esi	;\
+	movzbl	%dl,%ecx		;\
+	movzbl	%dh,%edx		;\
+	xor	p1+2*tlen(,%ecx,4),%edi	;\
+	mov	4(%esp),%ecx		;\
+	xor	p1+3*tlen(,%edx,4),%eax ;\
+	movzbl	%cl,%edx		;\
+	xor	p1(,%edx,4),%edi	;\
+	movzbl	%ch,%edx		;\
+	shr	$16,%ecx		;\
+	xor	p1+tlen(,%edx,4),%eax	;\
+	movzbl	%cl,%edx		;\
+	movzbl	%ch,%ecx		;\
+	xor	p1+2*tlen(,%edx,4),%ebx	;\
+	xor	p1+3*tlen(,%ecx,4),%esi
+
+// AES (Rijndael) Encryption Subroutine: encrypts the 16 bytes at in_blk into out_blk using the ekey schedule
+
+	.text
+	.align	ALIGN32BYTES
+aes_encrypt:
+	push	%ebp			// one register pushed: ctx and in_blk offsets are valid
+	mov	ctx(%esp),%ebp		// pointer to context
+	mov	in_blk(%esp),%ecx	// pointer to input block
+	push	%ebx			// save the registers this routine must preserve
+	push	%esi
+	push	%edi
+	mov	nrnd(%ebp),%edx		// number of rounds
+	lea	ekey+16(%ebp),%ebp	// key pointer
+
+// input four columns and xor in first round key
+
+	mov	(%ecx),%eax
+	mov	4(%ecx),%ebx
+	mov	8(%ecx),%esi
+	mov	12(%ecx),%edi
+	xor	-16(%ebp),%eax		// first round key lies 16 bytes below the key pointer
+	xor	-12(%ebp),%ebx
+	xor	-8(%ebp),%esi
+	xor	-4(%ebp),%edi
+
+	sub	$8,%esp			// space for register saves on stack
+
+	sub	$10,%edx		// exactly 10 rounds: enter at aes_15
+	je	aes_15
+	add	$32,%ebp		// two more rounds: move key pointer up by two round keys
+	sub	$2,%edx			// exactly 12 rounds: enter at aes_13
+	je	aes_13
+	add	$32,%ebp		// otherwise run all 14 rounds
+
+	fwd_rnd(aes_ft_tab,-64)		// 14 rounds for 256-bit key
+	fwd_rnd(aes_ft_tab,-48)
+aes_13:	fwd_rnd(aes_ft_tab,-32)		// 12 rounds for 192-bit key
+	fwd_rnd(aes_ft_tab,-16)
+aes_15:	fwd_rnd(aes_ft_tab,0)		// 10 rounds for 128-bit key
+	fwd_rnd(aes_ft_tab,16)
+	fwd_rnd(aes_ft_tab,32)
+	fwd_rnd(aes_ft_tab,48)
+	fwd_rnd(aes_ft_tab,64)
+	fwd_rnd(aes_ft_tab,80)
+	fwd_rnd(aes_ft_tab,96)
+	fwd_rnd(aes_ft_tab,112)
+	fwd_rnd(aes_ft_tab,128)
+	fwd_rnd(aes_fl_tab,144)		// last round uses a different table
+
+// move final values to the output array.
+
+	mov	out_blk+20(%esp),%ebp	// +20: three further pushes plus the 8-byte scratch area
+	add	$8,%esp			// release the scratch area
+	mov	%eax,(%ebp)
+	mov	%ebx,4(%ebp)
+	mov	%esi,8(%ebp)
+	mov	%edi,12(%ebp)
+	pop	%edi
+	pop	%esi
+	pop	%ebx
+	pop	%ebp
+	ret
+
+
+// AES (Rijndael) Decryption Subroutine: decrypts the 16 bytes at in_blk into out_blk using the dkey schedule
+
+	.align	ALIGN32BYTES
+aes_decrypt:
+	push	%ebp			// one register pushed: ctx and in_blk offsets are valid
+	mov	ctx(%esp),%ebp		// pointer to context
+	mov	in_blk(%esp),%ecx	// pointer to input block
+	push	%ebx			// save the registers this routine must preserve
+	push	%esi
+	push	%edi
+	mov	nrnd(%ebp),%edx		// number of rounds
+	lea	dkey+16(%ebp),%ebp	// key pointer
+
+// input four columns and xor in first round key
+
+	mov	(%ecx),%eax
+	mov	4(%ecx),%ebx
+	mov	8(%ecx),%esi
+	mov	12(%ecx),%edi
+	xor	-16(%ebp),%eax		// first round key lies 16 bytes below the key pointer
+	xor	-12(%ebp),%ebx
+	xor	-8(%ebp),%esi
+	xor	-4(%ebp),%edi
+
+	sub	$8,%esp			// space for register saves on stack
+
+	sub	$10,%edx		// exactly 10 rounds: enter at aes_25
+	je	aes_25
+	add	$32,%ebp		// two more rounds: move key pointer up by two round keys
+	sub	$2,%edx			// exactly 12 rounds: enter at aes_23
+	je	aes_23
+	add	$32,%ebp		// otherwise run all 14 rounds
+
+	inv_rnd(aes_it_tab,-64)		// 14 rounds for 256-bit key
+	inv_rnd(aes_it_tab,-48)
+aes_23:	inv_rnd(aes_it_tab,-32)		// 12 rounds for 192-bit key
+	inv_rnd(aes_it_tab,-16)
+aes_25:	inv_rnd(aes_it_tab,0)		// 10 rounds for 128-bit key
+	inv_rnd(aes_it_tab,16)
+	inv_rnd(aes_it_tab,32)
+	inv_rnd(aes_it_tab,48)
+	inv_rnd(aes_it_tab,64)
+	inv_rnd(aes_it_tab,80)
+	inv_rnd(aes_it_tab,96)
+	inv_rnd(aes_it_tab,112)
+	inv_rnd(aes_it_tab,128)
+	inv_rnd(aes_il_tab,144)		// last round uses a different table
+
+// move final values to the output array.
+
+	mov	out_blk+20(%esp),%ebp	// +20: three further pushes plus the 8-byte scratch area
+	add	$8,%esp			// release the scratch area
+	mov	%eax,(%ebp)
+	mov	%ebx,4(%ebp)
+	mov	%esi,8(%ebp)
+	mov	%edi,12(%ebp)
+	pop	%edi
+	pop	%esi
+	pop	%ebx
+	pop	%ebp
+	ret
+
+// AES (Rijndael) Key Schedule Subroutine
+
+// %ebp-relative offsets of the aes_set_key arguments (flags and old %ebp are pushed first)
+
+#define aes_cx	12	// AES context
+#define in_key	16	// key input array address
+#define key_ln	20	// key length, bytes (16,24,32) or bits (128,192,256)
+#define ed_flg	24	// 0=create both encr/decr keys, nonzero=create encr key only
+
+// locals: cnt is an %ebp-relative offset, slen is the number of bytes reserved for locals
+
+#define cnt	-4
+#define slen	8
+
+// This macro performs a column mixing operation on the 32-bit word in
+// %ebx and leaves the 32-bit result in %eax. Each of the 4 input bytes
+// indexes one of 4 tables of 256 32-bit words (p1, tlen bytes apart) and
+// the entries are xored together. Clobbers %ecx; %ebx ends rotated by 16.
+
+#define mix_col(p1)			 \
+	movzbl	%bl,%ecx		;\
+	mov	p1(,%ecx,4),%eax	;\
+	movzbl	%bh,%ecx		;\
+	ror	$16,%ebx		;\
+	xor	p1+tlen(,%ecx,4),%eax	;\
+	movzbl	%bl,%ecx		;\
+	xor	p1+2*tlen(,%ecx,4),%eax	;\
+	movzbl	%bh,%ecx		;\
+	xor	p1+3*tlen(,%ecx,4),%eax
+
+// Key Schedule Macros: ksc4/ksc6/ksc8 emit step p1 (rcon entry p1) for 4/6/8-word keys, storing relative to %edi
+
+#define ksc4(p1)			 \
+	rol	$24,%ebx		;\
+	mix_col(aes_fl_tab)		;\
+	ror	$8,%ebx			;\
+	xor	4*p1+aes_rcon_tab,%eax	;\
+	xor	%eax,%esi		;\
+	xor	%esi,%ebp		;\
+	mov	%esi,16*p1(%edi)	;\
+	mov	%ebp,16*p1+4(%edi)	;\
+	xor	%ebp,%edx		;\
+	xor	%edx,%ebx		;\
+	mov	%edx,16*p1+8(%edi)	;\
+	mov	%ebx,16*p1+12(%edi)
+
+#define ksc6(p1)			 \
+	rol	$24,%ebx		;\
+	mix_col(aes_fl_tab)		;\
+	ror	$8,%ebx			;\
+	xor	4*p1+aes_rcon_tab,%eax	;\
+	xor	24*p1-24(%edi),%eax	;\
+	mov	%eax,24*p1(%edi)	;\
+	xor	24*p1-20(%edi),%eax	;\
+	mov	%eax,24*p1+4(%edi)	;\
+	xor	%eax,%esi		;\
+	xor	%esi,%ebp		;\
+	mov	%esi,24*p1+8(%edi)	;\
+	mov	%ebp,24*p1+12(%edi)	;\
+	xor	%ebp,%edx		;\
+	xor	%edx,%ebx		;\
+	mov	%edx,24*p1+16(%edi)	;\
+	mov	%ebx,24*p1+20(%edi)
+
+#define ksc8(p1)			 \
+	rol	$24,%ebx		;\
+	mix_col(aes_fl_tab)		;\
+	ror	$8,%ebx			;\
+	xor	4*p1+aes_rcon_tab,%eax	;\
+	xor	32*p1-32(%edi),%eax	;\
+	mov	%eax,32*p1(%edi)	;\
+	xor	32*p1-28(%edi),%eax	;\
+	mov	%eax,32*p1+4(%edi)	;\
+	xor	32*p1-24(%edi),%eax	;\
+	mov	%eax,32*p1+8(%edi)	;\
+	xor	32*p1-20(%edi),%eax	;\
+	mov	%eax,32*p1+12(%edi)	;\
+	push	%ebx			;\
+	mov	%eax,%ebx		;\
+	mix_col(aes_fl_tab)		;\
+	pop	%ebx			;\
+	xor	%eax,%esi		;\
+	xor	%esi,%ebp		;\
+	mov	%esi,32*p1+16(%edi)	;\
+	mov	%ebp,32*p1+20(%edi)	;\
+	xor	%ebp,%edx		;\
+	xor	%edx,%ebx		;\
+	mov	%edx,32*p1+24(%edi)	;\
+	mov	%ebx,32*p1+28(%edi)
+
+	.align	ALIGN32BYTES
+aes_set_key:				// fills nkey, nrnd and ekey; also dkey when ed_flg is 0
+	pushfl				// cld is used below; popfl restores the caller flags
+	push	%ebp
+	mov	%esp,%ebp		// frame pointer: arguments start at aes_cx
+	sub	$slen,%esp		// room for locals (cnt)
+	push	%ebx
+	push	%esi
+	push	%edi
+
+	mov	aes_cx(%ebp),%edx	// edx -> AES context
+
+	mov	key_ln(%ebp),%ecx	// key length
+	cmpl	$128,%ecx		// values of 128 or more are taken as a bit count
+	jb	aes_30
+	shr	$3,%ecx			// bits -> bytes
+aes_30:	cmpl	$32,%ecx
+	je	aes_32
+	cmpl	$24,%ecx
+	je	aes_32
+	mov	$16,%ecx		// anything other than 24 or 32 bytes becomes 16
+aes_32:	shr	$2,%ecx			// bytes -> 32-bit words
+	mov	%ecx,nkey(%edx)
+
+	lea	6(%ecx),%eax		// 10/12/14 for 4/6/8 32-bit key length
+	mov	%eax,nrnd(%edx)
+
+	mov	in_key(%ebp),%esi	// key input array
+	lea	ekey(%edx),%edi		// key position in AES context
+	cld				// string moves below must run upwards
+	push	%ebp			// ebp is reused as a data register until aes_37
+	mov	%ecx,%eax		// save key length in eax
+	rep ;	movsl			// words in the key schedule
+	mov	-4(%esi),%ebx		// put some values in registers
+	mov	-8(%esi),%edx		// to allow faster code
+	mov	-12(%esi),%ebp		// (the last four key words end up in
+	mov	-16(%esi),%esi		// ebx, edx, ebp and esi)
+
+	cmpl	$4,%eax			// jump on key size
+	je	aes_36
+	cmpl	$6,%eax
+	je	aes_35
+
+	ksc8(0)				// 8-word key: 7 steps of 8 words
+	ksc8(1)
+	ksc8(2)
+	ksc8(3)
+	ksc8(4)
+	ksc8(5)
+	ksc8(6)
+	jmp	aes_37
+aes_35:	ksc6(0)				// 6-word key: 8 steps of 6 words
+	ksc6(1)
+	ksc6(2)
+	ksc6(3)
+	ksc6(4)
+	ksc6(5)
+	ksc6(6)
+	ksc6(7)
+	jmp	aes_37
+aes_36:	ksc4(0)				// 4-word key: 10 steps of 4 words
+	ksc4(1)
+	ksc4(2)
+	ksc4(3)
+	ksc4(4)
+	ksc4(5)
+	ksc4(6)
+	ksc4(7)
+	ksc4(8)
+	ksc4(9)
+aes_37:	pop	%ebp			// frame pointer is valid again
+	mov	aes_cx(%ebp),%edx	// edx -> AES context
+	cmpl	$0,ed_flg(%ebp)		// nonzero flag: skip the decryption schedule
+	jne	aes_39
+
+// compile decryption key schedule from encryption schedule - reverse
+// order and do mix_column operation on round keys except first and last
+
+	mov	nrnd(%edx),%eax		// kt = cx->d_key + nc * cx->Nrnd
+	shl	$2,%eax			// 4*nrnd, scaled by 4 again in the lea: 16 bytes per round
+	lea	dkey(%edx,%eax,4),%edi
+	lea	ekey(%edx),%esi		// kf = cx->e_key
+
+	movsl				// copy first round key (unmodified)
+	movsl
+	movsl
+	movsl
+	sub	$32,%edi		// back over the key just written and one more slot
+	movl	$1,cnt(%ebp)		// round counter starts at 1
+aes_38:					// apply aes_im_tab (inverse mix column) to each
+	lodsl				// column of each round key
+	mov	%eax,%ebx
+	mix_col(aes_im_tab)
+	stosl
+	lodsl
+	mov	%eax,%ebx
+	mix_col(aes_im_tab)
+	stosl
+	lodsl
+	mov	%eax,%ebx
+	mix_col(aes_im_tab)
+	stosl
+	lodsl
+	mov	%eax,%ebx
+	mix_col(aes_im_tab)
+	stosl
+	sub	$32,%edi		// destination walks downwards while the source walks up
+
+	incl	cnt(%ebp)
+	mov	cnt(%ebp),%eax
+	cmp	nrnd(%edx),%eax
+	jb	aes_38			// repeat while cnt is below nrnd
+
+	movsl				// copy last round key (unmodified)
+	movsl
+	movsl
+	movsl
+aes_39:	pop	%edi
+	pop	%esi
+	pop	%ebx
+	mov	%ebp,%esp		// drop the locals
+	pop	%ebp
+	popfl
+	ret
+
+
+// finite field multiplies by {02}, {04} and {08}; every bit shifted past bit 7 is cancelled by xoring in a multiple of 0x11b
+
+#define f2(x)	((x<<1)^(((x>>7)&1)*0x11b))
+#define f4(x)	((x<<2)^(((x>>6)&1)*0x11b)^(((x>>6)&2)*0x11b))
+#define f8(x)	((x<<3)^(((x>>5)&1)*0x11b)^(((x>>5)&2)*0x11b)^(((x>>5)&4)*0x11b))
+
+// finite field multiplies required in table generation (x is not parenthesized: pass plain constants only)
+
+#define f3(x)	(f2(x) ^ x)
+#define f9(x)	(f8(x) ^ x)
+#define fb(x)	(f8(x) ^ f2(x) ^ x)
+#define fd(x)	(f8(x) ^ f4(x) ^ x)
+#define fe(x)	(f8(x) ^ f4(x) ^ f2(x))
+
+// These defines generate the forward table entries (u1, u2, u3 are u0 rotated left by 8, 16, 24 bits)
+
+#define u0(x)	((f3(x) << 24) | (x << 16) | (x << 8) | f2(x))
+#define u1(x)	((x << 24) | (x << 16) | (f2(x) << 8) | f3(x))
+#define u2(x)	((x << 24) | (f2(x) << 16) | (f3(x) << 8) | x)
+#define u3(x)	((f2(x) << 24) | (f3(x) << 16) | (x << 8) | x)
+
+// These defines generate the inverse table entries (v1, v2, v3 are v0 rotated left by 8, 16, 24 bits)
+
+#define v0(x)	((fb(x) << 24) | (fd(x) << 16) | (f9(x) << 8) | fe(x))
+#define v1(x)	((fd(x) << 24) | (f9(x) << 16) | (fe(x) << 8) | fb(x))
+#define v2(x)	((f9(x) << 24) | (fe(x) << 16) | (fb(x) << 8) | fd(x))
+#define v3(x)	((fe(x) << 24) | (fb(x) << 16) | (fd(x) << 8) | f9(x))
+
+// These defines generate entries for the last round tables (the byte placed in bits 0-7, 8-15, 16-23, 24-31)
+
+#define w0(x)	(x)
+#define w1(x)	(x <<  8)
+#define w2(x)	(x << 16)
+#define w3(x)	(x << 24)
+
+// macro to generate inverse mix column tables (needed for the key schedule)
+
+#define im_data0(p1) \
+	.long	p1(0x00),p1(0x01),p1(0x02),p1(0x03),p1(0x04),p1(0x05),p1(0x06),p1(0x07) ;\
+	.long	p1(0x08),p1(0x09),p1(0x0a),p1(0x0b),p1(0x0c),p1(0x0d),p1(0x0e),p1(0x0f) ;\
+	.long	p1(0x10),p1(0x11),p1(0x12),p1(0x13),p1(0x14),p1(0x15),p1(0x16),p1(0x17) ;\
+	.long	p1(0x18),p1(0x19),p1(0x1a),p1(0x1b),p1(0x1c),p1(0x1d),p1(0x1e),p1(0x1f)
+#define im_data1(p1) \
+	.long	p1(0x20),p1(0x21),p1(0x22),p1(0x23),p1(0x24),p1(0x25),p1(0x26),p1(0x27) ;\
+	.long	p1(0x28),p1(0x29),p1(0x2a),p1(0x2b),p1(0x2c),p1(0x2d),p1(0x2e),p1(0x2f) ;\
+	.long	p1(0x30),p1(0x31),p1(0x32),p1(0x33),p1(0x34),p1(0x35),p1(0x36),p1(0x37) ;\
+	.long	p1(0x38),p1(0x39),p1(0x3a),p1(0x3b),p1(0x3c),p1(0x3d),p1(0x3e),p1(0x3f)
+#define im_data2(p1) \
+	.long	p1(0x40),p1(0x41),p1(0x42),p1(0x43),p1(0x44),p1(0x45),p1(0x46),p1(0x47) ;\
+	.long	p1(0x48),p1(0x49),p1(0x4a),p1(0x4b),p1(0x4c),p1(0x4d),p1(0x4e),p1(0x4f) ;\
+	.long	p1(0x50),p1(0x51),p1(0x52),p1(0x53),p1(0x54),p1(0x55),p1(0x56),p1(0x57) ;\
+	.long	p1(0x58),p1(0x59),p1(0x5a),p1(0x5b),p1(0x5c),p1(0x5d),p1(0x5e),p1(0x5f)
+#define im_data3(p1) \
+	.long	p1(0x60),p1(0x61),p1(0x62),p1(0x63),p1(0x64),p1(0x65),p1(0x66),p1(0x67) ;\
+	.long	p1(0x68),p1(0x69),p1(0x6a),p1(0x6b),p1(0x6c),p1(0x6d),p1(0x6e),p1(0x6f) ;\
+	.long	p1(0x70),p1(0x71),p1(0x72),p1(0x73),p1(0x74),p1(0x75),p1(0x76),p1(0x77) ;\
+	.long	p1(0x78),p1(0x79),p1(0x7a),p1(0x7b),p1(0x7c),p1(0x7d),p1(0x7e),p1(0x7f)
+#define im_data4(p1) \
+	.long	p1(0x80),p1(0x81),p1(0x82),p1(0x83),p1(0x84),p1(0x85),p1(0x86),p1(0x87) ;\
+	.long	p1(0x88),p1(0x89),p1(0x8a),p1(0x8b),p1(0x8c),p1(0x8d),p1(0x8e),p1(0x8f) ;\
+	.long	p1(0x90),p1(0x91),p1(0x92),p1(0x93),p1(0x94),p1(0x95),p1(0x96),p1(0x97) ;\
+	.long	p1(0x98),p1(0x99),p1(0x9a),p1(0x9b),p1(0x9c),p1(0x9d),p1(0x9e),p1(0x9f)
+#define im_data5(p1) \
+	.long	p1(0xa0),p1(0xa1),p1(0xa2),p1(0xa3),p1(0xa4),p1(0xa5),p1(0xa6),p1(0xa7) ;\
+	.long	p1(0xa8),p1(0xa9),p1(0xaa),p1(0xab),p1(0xac),p1(0xad),p1(0xae),p1(0xaf) ;\
+	.long	p1(0xb0),p1(0xb1),p1(0xb2),p1(0xb3),p1(0xb4),p1(0xb5),p1(0xb6),p1(0xb7) ;\
+	.long	p1(0xb8),p1(0xb9),p1(0xba),p1(0xbb),p1(0xbc),p1(0xbd),p1(0xbe),p1(0xbf)
+#define im_data6(p1) \
+	.long	p1(0xc0),p1(0xc1),p1(0xc2),p1(0xc3),p1(0xc4),p1(0xc5),p1(0xc6),p1(0xc7) ;\
+	.long	p1(0xc8),p1(0xc9),p1(0xca),p1(0xcb),p1(0xcc),p1(0xcd),p1(0xce),p1(0xcf) ;\
+	.long	p1(0xd0),p1(0xd1),p1(0xd2),p1(0xd3),p1(0xd4),p1(0xd5),p1(0xd6),p1(0xd7) ;\
+	.long	p1(0xd8),p1(0xd9),p1(0xda),p1(0xdb),p1(0xdc),p1(0xdd),p1(0xde),p1(0xdf)
+#define im_data7(p1) \
+	.long	p1(0xe0),p1(0xe1),p1(0xe2),p1(0xe3),p1(0xe4),p1(0xe5),p1(0xe6),p1(0xe7) ;\
+	.long	p1(0xe8),p1(0xe9),p1(0xea),p1(0xeb),p1(0xec),p1(0xed),p1(0xee),p1(0xef) ;\
+	.long	p1(0xf0),p1(0xf1),p1(0xf2),p1(0xf3),p1(0xf4),p1(0xf5),p1(0xf6),p1(0xf7) ;\
+	.long	p1(0xf8),p1(0xf9),p1(0xfa),p1(0xfb),p1(0xfc),p1(0xfd),p1(0xfe),p1(0xff)
+
+// S-box data - 256 entries
+
+#define sb_data0(p1) \
+	.long	p1(0x63),p1(0x7c),p1(0x77),p1(0x7b),p1(0xf2),p1(0x6b),p1(0x6f),p1(0xc5) ;\
+	.long	p1(0x30),p1(0x01),p1(0x67),p1(0x2b),p1(0xfe),p1(0xd7),p1(0xab),p1(0x76) ;\
+	.long	p1(0xca),p1(0x82),p1(0xc9),p1(0x7d),p1(0xfa),p1(0x59),p1(0x47),p1(0xf0) ;\
+	.long	p1(0xad),p1(0xd4),p1(0xa2),p1(0xaf),p1(0x9c),p1(0xa4),p1(0x72),p1(0xc0)
+#define sb_data1(p1) \
+	.long	p1(0xb7),p1(0xfd),p1(0x93),p1(0x26),p1(0x36),p1(0x3f),p1(0xf7),p1(0xcc) ;\
+	.long	p1(0x34),p1(0xa5),p1(0xe5),p1(0xf1),p1(0x71),p1(0xd8),p1(0x31),p1(0x15) ;\
+	.long	p1(0x04),p1(0xc7),p1(0x23),p1(0xc3),p1(0x18),p1(0x96),p1(0x05),p1(0x9a) ;\
+	.long	p1(0x07),p1(0x12),p1(0x80),p1(0xe2),p1(0xeb),p1(0x27),p1(0xb2),p1(0x75)
+#define sb_data2(p1) \
+	.long	p1(0x09),p1(0x83),p1(0x2c),p1(0x1a),p1(0x1b),p1(0x6e),p1(0x5a),p1(0xa0) ;\
+	.long	p1(0x52),p1(0x3b),p1(0xd6),p1(0xb3),p1(0x29),p1(0xe3),p1(0x2f),p1(0x84) ;\
+	.long	p1(0x53),p1(0xd1),p1(0x00),p1(0xed),p1(0x20),p1(0xfc),p1(0xb1),p1(0x5b) ;\
+	.long	p1(0x6a),p1(0xcb),p1(0xbe),p1(0x39),p1(0x4a),p1(0x4c),p1(0x58),p1(0xcf)
+#define sb_data3(p1) \
+	.long	p1(0xd0),p1(0xef),p1(0xaa),p1(0xfb),p1(0x43),p1(0x4d),p1(0x33),p1(0x85) ;\
+	.long	p1(0x45),p1(0xf9),p1(0x02),p1(0x7f),p1(0x50),p1(0x3c),p1(0x9f),p1(0xa8) ;\
+	.long	p1(0x51),p1(0xa3),p1(0x40),p1(0x8f),p1(0x92),p1(0x9d),p1(0x38),p1(0xf5) ;\
+	.long	p1(0xbc),p1(0xb6),p1(0xda),p1(0x21),p1(0x10),p1(0xff),p1(0xf3),p1(0xd2)
+#define sb_data4(p1) \
+	.long	p1(0xcd),p1(0x0c),p1(0x13),p1(0xec),p1(0x5f),p1(0x97),p1(0x44),p1(0x17) ;\
+	.long	p1(0xc4),p1(0xa7),p1(0x7e),p1(0x3d),p1(0x64),p1(0x5d),p1(0x19),p1(0x73) ;\
+	.long	p1(0x60),p1(0x81),p1(0x4f),p1(0xdc),p1(0x22),p1(0x2a),p1(0x90),p1(0x88) ;\
+	.long	p1(0x46),p1(0xee),p1(0xb8),p1(0x14),p1(0xde),p1(0x5e),p1(0x0b),p1(0xdb)
+#define sb_data5(p1) \
+	.long	p1(0xe0),p1(0x32),p1(0x3a),p1(0x0a),p1(0x49),p1(0x06),p1(0x24),p1(0x5c) ;\
+	.long	p1(0xc2),p1(0xd3),p1(0xac),p1(0x62),p1(0x91),p1(0x95),p1(0xe4),p1(0x79) ;\
+	.long	p1(0xe7),p1(0xc8),p1(0x37),p1(0x6d),p1(0x8d),p1(0xd5),p1(0x4e),p1(0xa9) ;\
+	.long	p1(0x6c),p1(0x56),p1(0xf4),p1(0xea),p1(0x65),p1(0x7a),p1(0xae),p1(0x08)
+#define sb_data6(p1) \
+	.long	p1(0xba),p1(0x78),p1(0x25),p1(0x2e),p1(0x1c),p1(0xa6),p1(0xb4),p1(0xc6) ;\
+	.long	p1(0xe8),p1(0xdd),p1(0x74),p1(0x1f),p1(0x4b),p1(0xbd),p1(0x8b),p1(0x8a) ;\
+	.long	p1(0x70),p1(0x3e),p1(0xb5),p1(0x66),p1(0x48),p1(0x03),p1(0xf6),p1(0x0e) ;\
+	.long	p1(0x61),p1(0x35),p1(0x57),p1(0xb9),p1(0x86),p1(0xc1),p1(0x1d),p1(0x9e)
+#define sb_data7(p1) \
+	.long	p1(0xe1),p1(0xf8),p1(0x98),p1(0x11),p1(0x69),p1(0xd9),p1(0x8e),p1(0x94) ;\
+	.long	p1(0x9b),p1(0x1e),p1(0x87),p1(0xe9),p1(0xce),p1(0x55),p1(0x28),p1(0xdf) ;\
+	.long	p1(0x8c),p1(0xa1),p1(0x89),p1(0x0d),p1(0xbf),p1(0xe6),p1(0x42),p1(0x68) ;\
+	.long	p1(0x41),p1(0x99),p1(0x2d),p1(0x0f),p1(0xb0),p1(0x54),p1(0xbb),p1(0x16)
+
+// Inverse S-box data - 256 entries
+
+#define ib_data0(p1) \
+	.long	p1(0x52),p1(0x09),p1(0x6a),p1(0xd5),p1(0x30),p1(0x36),p1(0xa5),p1(0x38) ;\
+	.long	p1(0xbf),p1(0x40),p1(0xa3),p1(0x9e),p1(0x81),p1(0xf3),p1(0xd7),p1(0xfb) ;\
+	.long	p1(0x7c),p1(0xe3),p1(0x39),p1(0x82),p1(0x9b),p1(0x2f),p1(0xff),p1(0x87) ;\
+	.long	p1(0x34),p1(0x8e),p1(0x43),p1(0x44),p1(0xc4),p1(0xde),p1(0xe9),p1(0xcb)
+#define ib_data1(p1) \
+	.long	p1(0x54),p1(0x7b),p1(0x94),p1(0x32),p1(0xa6),p1(0xc2),p1(0x23),p1(0x3d) ;\
+	.long	p1(0xee),p1(0x4c),p1(0x95),p1(0x0b),p1(0x42),p1(0xfa),p1(0xc3),p1(0x4e) ;\
+	.long	p1(0x08),p1(0x2e),p1(0xa1),p1(0x66),p1(0x28),p1(0xd9),p1(0x24),p1(0xb2) ;\
+	.long	p1(0x76),p1(0x5b),p1(0xa2),p1(0x49),p1(0x6d),p1(0x8b),p1(0xd1),p1(0x25)
+#define ib_data2(p1) \
+	.long	p1(0x72),p1(0xf8),p1(0xf6),p1(0x64),p1(0x86),p1(0x68),p1(0x98),p1(0x16) ;\
+	.long	p1(0xd4),p1(0xa4),p1(0x5c),p1(0xcc),p1(0x5d),p1(0x65),p1(0xb6),p1(0x92) ;\
+	.long	p1(0x6c),p1(0x70),p1(0x48),p1(0x50),p1(0xfd),p1(0xed),p1(0xb9),p1(0xda) ;\
+	.long	p1(0x5e),p1(0x15),p1(0x46),p1(0x57),p1(0xa7),p1(0x8d),p1(0x9d),p1(0x84)
+#define ib_data3(p1) \
+	.long	p1(0x90),p1(0xd8),p1(0xab),p1(0x00),p1(0x8c),p1(0xbc),p1(0xd3),p1(0x0a) ;\
+	.long	p1(0xf7),p1(0xe4),p1(0x58),p1(0x05),p1(0xb8),p1(0xb3),p1(0x45),p1(0x06) ;\
+	.long	p1(0xd0),p1(0x2c),p1(0x1e),p1(0x8f),p1(0xca),p1(0x3f),p1(0x0f),p1(0x02) ;\
+	.long	p1(0xc1),p1(0xaf),p1(0xbd),p1(0x03),p1(0x01),p1(0x13),p1(0x8a),p1(0x6b)
+#define ib_data4(p1) \
+	.long	p1(0x3a),p1(0x91),p1(0x11),p1(0x41),p1(0x4f),p1(0x67),p1(0xdc),p1(0xea) ;\
+	.long	p1(0x97),p1(0xf2),p1(0xcf),p1(0xce),p1(0xf0),p1(0xb4),p1(0xe6),p1(0x73) ;\
+	.long	p1(0x96),p1(0xac),p1(0x74),p1(0x22),p1(0xe7),p1(0xad),p1(0x35),p1(0x85) ;\
+	.long	p1(0xe2),p1(0xf9),p1(0x37),p1(0xe8),p1(0x1c),p1(0x75),p1(0xdf),p1(0x6e)
+#define ib_data5(p1) \
+	.long	p1(0x47),p1(0xf1),p1(0x1a),p1(0x71),p1(0x1d),p1(0x29),p1(0xc5),p1(0x89) ;\
+	.long	p1(0x6f),p1(0xb7),p1(0x62),p1(0x0e),p1(0xaa),p1(0x18),p1(0xbe),p1(0x1b) ;\
+	.long	p1(0xfc),p1(0x56),p1(0x3e),p1(0x4b),p1(0xc6),p1(0xd2),p1(0x79),p1(0x20) ;\
+	.long	p1(0x9a),p1(0xdb),p1(0xc0),p1(0xfe),p1(0x78),p1(0xcd),p1(0x5a),p1(0xf4)
+#define ib_data6(p1) \
+	.long	p1(0x1f),p1(0xdd),p1(0xa8),p1(0x33),p1(0x88),p1(0x07),p1(0xc7),p1(0x31) ;\
+	.long	p1(0xb1),p1(0x12),p1(0x10),p1(0x59),p1(0x27),p1(0x80),p1(0xec),p1(0x5f) ;\
+	.long	p1(0x60),p1(0x51),p1(0x7f),p1(0xa9),p1(0x19),p1(0xb5),p1(0x4a),p1(0x0d) ;\
+	.long	p1(0x2d),p1(0xe5),p1(0x7a),p1(0x9f),p1(0x93),p1(0xc9),p1(0x9c),p1(0xef)
+#define ib_data7(p1) \
+	.long	p1(0xa0),p1(0xe0),p1(0x3b),p1(0x4d),p1(0xae),p1(0x2a),p1(0xf5),p1(0xb0) ;\
+	.long	p1(0xc8),p1(0xeb),p1(0xbb),p1(0x3c),p1(0x83),p1(0x53),p1(0x99),p1(0x61) ;\
+	.long	p1(0x17),p1(0x2b),p1(0x04),p1(0x7e),p1(0xba),p1(0x77),p1(0xd6),p1(0x26) ;\
+	.long	p1(0xe1),p1(0x69),p1(0x14),p1(0x63),p1(0x55),p1(0x21),p1(0x0c),p1(0x7d)
+
+// The rcon_table (needed for the key schedule)
+//
+// Here is original Dr Brian Gladman's source code:
+//	_rcon_tab:
+//	%assign x   1
+//	%rep 29
+//	    dd  x
+//	%assign x f2(x)
+//	%endrep
+//
+// Here is precomputed output (it's more portable this way):
+
+	.align	ALIGN32BYTES
+aes_rcon_tab:
+	.long	0x01,0x02,0x04,0x08,0x10,0x20,0x40,0x80
+	.long	0x1b,0x36,0x6c,0xd8,0xab,0x4d,0x9a,0x2f
+	.long	0x5e,0xbc,0x63,0xc6,0x97,0x35,0x6a,0xd4
+	.long	0xb3,0x7d,0xfa,0xef,0xc5
+
+// The forward xor tables
+
+	.align	ALIGN32BYTES
+aes_ft_tab:
+	sb_data0(u0)
+	sb_data1(u0)
+	sb_data2(u0)
+	sb_data3(u0)
+	sb_data4(u0)
+	sb_data5(u0)
+	sb_data6(u0)
+	sb_data7(u0)
+
+	sb_data0(u1)
+	sb_data1(u1)
+	sb_data2(u1)
+	sb_data3(u1)
+	sb_data4(u1)
+	sb_data5(u1)
+	sb_data6(u1)
+	sb_data7(u1)
+
+	sb_data0(u2)
+	sb_data1(u2)
+	sb_data2(u2)
+	sb_data3(u2)
+	sb_data4(u2)
+	sb_data5(u2)
+	sb_data6(u2)
+	sb_data7(u2)
+
+	sb_data0(u3)
+	sb_data1(u3)
+	sb_data2(u3)
+	sb_data3(u3)
+	sb_data4(u3)
+	sb_data5(u3)
+	sb_data6(u3)
+	sb_data7(u3)
+
+	.align	ALIGN32BYTES
+aes_fl_tab:
+	sb_data0(w0)
+	sb_data1(w0)
+	sb_data2(w0)
+	sb_data3(w0)
+	sb_data4(w0)
+	sb_data5(w0)
+	sb_data6(w0)
+	sb_data7(w0)
+
+	sb_data0(w1)
+	sb_data1(w1)
+	sb_data2(w1)
+	sb_data3(w1)
+	sb_data4(w1)
+	sb_data5(w1)
+	sb_data6(w1)
+	sb_data7(w1)
+
+	sb_data0(w2)
+	sb_data1(w2)
+	sb_data2(w2)
+	sb_data3(w2)
+	sb_data4(w2)
+	sb_data5(w2)
+	sb_data6(w2)
+	sb_data7(w2)
+
+	sb_data0(w3)
+	sb_data1(w3)
+	sb_data2(w3)
+	sb_data3(w3)
+	sb_data4(w3)
+	sb_data5(w3)
+	sb_data6(w3)
+	sb_data7(w3)
+
+// The inverse xor tables
+
+	.align	ALIGN32BYTES
+aes_it_tab:
+	ib_data0(v0)
+	ib_data1(v0)
+	ib_data2(v0)
+	ib_data3(v0)
+	ib_data4(v0)
+	ib_data5(v0)
+	ib_data6(v0)
+	ib_data7(v0)
+
+	ib_data0(v1)
+	ib_data1(v1)
+	ib_data2(v1)
+	ib_data3(v1)
+	ib_data4(v1)
+	ib_data5(v1)
+	ib_data6(v1)
+	ib_data7(v1)
+
+	ib_data0(v2)
+	ib_data1(v2)
+	ib_data2(v2)
+	ib_data3(v2)
+	ib_data4(v2)
+	ib_data5(v2)
+	ib_data6(v2)
+	ib_data7(v2)
+
+	ib_data0(v3)
+	ib_data1(v3)
+	ib_data2(v3)
+	ib_data3(v3)
+	ib_data4(v3)
+	ib_data5(v3)
+	ib_data6(v3)
+	ib_data7(v3)
+
+	.align	ALIGN32BYTES
+aes_il_tab:
+	ib_data0(w0)
+	ib_data1(w0)
+	ib_data2(w0)
+	ib_data3(w0)
+	ib_data4(w0)
+	ib_data5(w0)
+	ib_data6(w0)
+	ib_data7(w0)
+
+	ib_data0(w1)
+	ib_data1(w1)
+	ib_data2(w1)
+	ib_data3(w1)
+	ib_data4(w1)
+	ib_data5(w1)
+	ib_data6(w1)
+	ib_data7(w1)
+
+	ib_data0(w2)
+	ib_data1(w2)
+	ib_data2(w2)
+	ib_data3(w2)
+	ib_data4(w2)
+	ib_data5(w2)
+	ib_data6(w2)
+	ib_data7(w2)
+
+	ib_data0(w3)
+	ib_data1(w3)
+	ib_data2(w3)
+	ib_data3(w3)
+	ib_data4(w3)
+	ib_data5(w3)
+	ib_data6(w3)
+	ib_data7(w3)
+
+// The inverse mix column tables
+
+	.align	ALIGN32BYTES
+aes_im_tab:
+	im_data0(v0)
+	im_data1(v0)
+	im_data2(v0)
+	im_data3(v0)
+	im_data4(v0)
+	im_data5(v0)
+	im_data6(v0)
+	im_data7(v0)
+
+	im_data0(v1)
+	im_data1(v1)
+	im_data2(v1)
+	im_data3(v1)
+	im_data4(v1)
+	im_data5(v1)
+	im_data6(v1)
+	im_data7(v1)
+
+	im_data0(v2)
+	im_data1(v2)
+	im_data2(v2)
+	im_data3(v2)
+	im_data4(v2)
+	im_data5(v2)
+	im_data6(v2)
+	im_data7(v2)
+
+	im_data0(v3)
+	im_data1(v3)
+	im_data2(v3)
+	im_data3(v3)
+	im_data4(v3)
+	im_data5(v3)
+	im_data6(v3)
+	im_data7(v3)
diff -urN linux-2.4.28/drivers/misc/aes.c linux-2.4.28-loop-AES-v3.0b/drivers/misc/aes.c
--- linux-2.4.28/drivers/misc/aes.c	Thu Jan  1 01:00:00 1970
+++ linux-2.4.28-loop-AES-v3.0b/drivers/misc/aes.c	Sun Feb  6 18:45:39 2005
@@ -0,0 +1,1479 @@
+// I retain copyright in this code but I encourage its free use provided
+// that I don't carry any responsibility for the results. I am especially 
+// happy to see it used in free and open source software. If you do use 
+// it I would appreciate an acknowledgement of its origin in the code or
+// the product that results and I would also appreciate knowing a little
+// about the use to which it is being put. I am grateful to Frank Yellin
+// for some ideas that are used in this implementation.
+//
+// Dr B. R. Gladman <brg@gladman.uk.net> 6th April 2001.
+//
+// This is an implementation of the AES encryption algorithm (Rijndael)
+// designed by Joan Daemen and Vincent Rijmen. This version is designed
+// to provide both fixed and dynamic block and key lengths and can also 
+// run with either big or little endian internal byte order (see aes.h). 
+// It inputs block and key lengths in bytes with the legal values being 
+// 16, 24 and 32.
+
+/*
+ * Modified by Jari Ruusu,  May 1 2001
+ *  - Fixed some compile warnings, code was ok but gcc warned anyway.
+ *  - Changed basic types: byte -> unsigned char, word -> u_int32_t
+ *  - Major name space cleanup: Names visible to outside now begin
+ *    with "aes_" or "AES_". A lot of stuff moved from aes.h to aes.c
+ *  - Removed C++ and DLL support as part of name space cleanup.
+ *  - Eliminated unnecessary recomputation of tables. (actual bug fix)
+ *  - Merged precomputed constant tables to aes.c file.
+ *  - Removed data alignment restrictions for portability reasons.
+ *  - Made block and key lengths accept bit count (128/192/256)
+ *    as well as byte count (16/24/32).
+ *  - Removed all error checks. This change also eliminated the need
+ *    to preinitialize the context struct to zero.
+ *  - Removed some totally unused constants.
+ */
+/*
+ * Modified by Jari Ruusu,  April 21 2004
+ *  - Added back code that avoids byte swaps on big endian boxes.
+ */
+
+#include "aes.h"
+
+// CONFIGURATION OPTIONS (see also aes.h)
+//
+// 1.  Define UNROLL for full loop unrolling in encryption and decryption.
+// 2.  Define PARTIAL_UNROLL to unroll two loops in encryption and decryption.
+// 3.  Define FIXED_TABLES for compiled rather than dynamic tables.
+// 4.  Define FF_TABLES to use tables for field multiplies and inverses.
+//     Do not enable this without understanding stack space requirements.
+// 5.  Define ARRAYS to use arrays to hold the local state block. If this
+//     is not defined, individually declared 32-bit words are used.
+// 6.  Define FAST_VARIABLE if a high speed variable block implementation
+//     is needed (essentially three separate fixed block size code sequences)
+// 7.  Define either ONE_TABLE or FOUR_TABLES for a fast table driven 
+//     version using 1 table (2 kbytes of table space) or 4 tables (8
+//     kbytes of table space) for higher speed.
+// 8.  Define either ONE_LR_TABLE or FOUR_LR_TABLES for a further speed 
+//     increase by using tables for the last rounds but with more table
+//     space (2 or 8 kbytes extra).
+// 9.  If neither ONE_TABLE nor FOUR_TABLES is defined, a compact but 
+//     slower version is provided.
+// 10. If fast decryption key scheduling is needed define ONE_IM_TABLE
+//     or FOUR_IM_TABLES for higher speed (2 or 8 kbytes extra).
+
+#define UNROLL           /* option 1; see the FOUR_TABLES / AES_BLOCK_SIZE adjustment further down */
+//#define PARTIAL_UNROLL
+
+#define FIXED_TABLES     /* option 3: use the tables compiled into this file */
+//#define FF_TABLES
+//#define ARRAYS
+#define FAST_VARIABLE    /* option 6 */
+
+//#define ONE_TABLE
+#define FOUR_TABLES      /* option 7: four round tables */
+
+//#define ONE_LR_TABLE
+#define FOUR_LR_TABLES   /* option 8: four last-round tables */
+
+//#define ONE_IM_TABLE
+#define FOUR_IM_TABLES   /* option 10: four tables for decryption key scheduling */
+
+#if defined(UNROLL) && defined (PARTIAL_UNROLL)   /* each pair checked below is an either/or choice */
+#error both UNROLL and PARTIAL_UNROLL are defined
+#endif
+
+#if defined(ONE_TABLE) && defined (FOUR_TABLES)
+#error both ONE_TABLE and FOUR_TABLES are defined
+#endif
+
+#if defined(ONE_LR_TABLE) && defined (FOUR_LR_TABLES)
+#error both ONE_LR_TABLE and FOUR_LR_TABLES are defined
+#endif
+
+#if defined(ONE_IM_TABLE) && defined (FOUR_IM_TABLES)
+#error both ONE_IM_TABLE and FOUR_IM_TABLES are defined
+#endif
+
+#if defined(AES_BLOCK_SIZE) && AES_BLOCK_SIZE != 16 && AES_BLOCK_SIZE != 24 && AES_BLOCK_SIZE != 32
+#error an illegal block size has been specified
+#endif  /* block size is only validated when AES_BLOCK_SIZE is defined */
+
+/* INTERNAL_BYTE_ORDER: 0=unknown, 1=little endian, 2=big endian */
+#if defined(INTERNAL_BYTE_ORDER)  /* already defined before this point: keep that value */
+#elif defined(__i386__)||defined(__i386)||defined(__x86_64__)||defined(__x86_64)||defined(__amd64__)||defined(__amd64)||defined(__AMD64__)||defined(__AMD64)
+# define INTERNAL_BYTE_ORDER 1  /* x86 and x86-64 compiler macros matched */
+# undef DATA_ALWAYS_ALIGNED
+# define DATA_ALWAYS_ALIGNED 1  /* unaligned access is always ok */
+#elif defined(__ppc__)||defined(__ppc)||defined(__PPC__)||defined(__PPC)||defined(__powerpc__)||defined(__powerpc)||defined(__POWERPC__)||defined(__POWERPC)||defined(__PowerPC__)||defined(__PowerPC)||defined(__ppc64__)||defined(__ppc64)||defined(__PPC64__)||defined(__PPC64)||defined(__powerpc64__)||defined(__powerpc64)||defined(__s390__)||defined(__s390)
+# define INTERNAL_BYTE_ORDER 2  /* PowerPC (32/64-bit) or s390 compiler macros matched */
+# undef DATA_ALWAYS_ALIGNED
+# define DATA_ALWAYS_ALIGNED 1  /* unaligned access is always ok */
+#elif defined(__alpha__)||defined(__alpha)||defined(__ia64__)||defined(__ia64)
+# define INTERNAL_BYTE_ORDER 1  /* alpha, ia64; DATA_ALWAYS_ALIGNED is not forced on in this branch */
+#elif defined(__hppa__)||defined(__hppa)||defined(__HPPA__)||defined(__HPPA)||defined(__parisc__)||defined(__parisc)||defined(__sparc__)||defined(__sparc)||defined(__sparc_v9__)||defined(__sparc_v9)||defined(__sparc64__)||defined(__sparc64)||defined(__mc68000__)||defined(__mc68000)
+# define INTERNAL_BYTE_ORDER 2  /* hppa/parisc, sparc, m68k; DATA_ALWAYS_ALIGNED is not forced on in this branch */
+#elif defined(CONFIGURE_DETECTS_BYTE_ORDER)  /* take the answer from WORDS_BIGENDIAN */
+# if WORDS_BIGENDIAN
+#  define INTERNAL_BYTE_ORDER 2
+# else
+#  define INTERNAL_BYTE_ORDER 1
+# endif
+#elif defined(__linux__) && defined(__KERNEL__)  /* kernel build: decide from __BIG_ENDIAN after including <asm/byteorder.h> */
+# include <asm/byteorder.h>
+# if defined(__BIG_ENDIAN)
+#  define INTERNAL_BYTE_ORDER 2
+# else
+#  define INTERNAL_BYTE_ORDER 1
+# endif
+#else  /* anything else: BYTE_ORDER style macros after including <sys/param.h>, 0 if they are inconclusive */
+# include <sys/param.h>
+# if (defined(BYTE_ORDER) && defined(LITTLE_ENDIAN) && (BYTE_ORDER == LITTLE_ENDIAN)) || (defined(__BYTE_ORDER) && defined(__LITTLE_ENDIAN) && (__BYTE_ORDER == __LITTLE_ENDIAN))
+#  define INTERNAL_BYTE_ORDER 1
+# elif WORDS_BIGENDIAN || defined(__BIG_ENDIAN__) || (defined(BYTE_ORDER) && defined(BIG_ENDIAN) && (BYTE_ORDER == BIG_ENDIAN)) || (defined(__BYTE_ORDER) && defined(__BIG_ENDIAN) && (__BYTE_ORDER == __BIG_ENDIAN))
+#  define INTERNAL_BYTE_ORDER 2
+# else
+#  define INTERNAL_BYTE_ORDER 0
+# endif
+#endif
+
+#if defined(DATA_ALWAYS_ALIGNED) && (INTERNAL_BYTE_ORDER > 0)  /* byte order known and DATA_ALWAYS_ALIGNED set: access the word through a cast pointer */
+# define word_in(x)      *(u_int32_t*)(x)
+# define word_out(x,v)   *(u_int32_t*)(x) = (v)
+#elif defined(__linux__) && defined(__KERNEL__)  /* kernel build: go through get_unaligned / put_unaligned */
+# include <asm/unaligned.h>
+# define word_in(x)      get_unaligned((u_int32_t*)(x))
+# define word_out(x,v)   put_unaligned((v),(u_int32_t*)(x))
+#else
+/* unknown endianness and/or unable to handle unaligned data: force little endian internally and move words one byte at a time, lowest byte first */
+# undef INTERNAL_BYTE_ORDER
+# define INTERNAL_BYTE_ORDER 1
+# define word_in(x)      ((u_int32_t)(((unsigned char *)(x))[0])|((u_int32_t)(((unsigned char *)(x))[1])<<8)|((u_int32_t)(((unsigned char *)(x))[2])<<16)|((u_int32_t)(((unsigned char *)(x))[3])<<24))
+# define word_out(x,v)   ((unsigned char *)(x))[0]=(v),((unsigned char *)(x))[1]=((v)>>8),((unsigned char *)(x))[2]=((v)>>16),((unsigned char *)(x))[3]=((v)>>24)
+#endif
+
+// upr(x,n): rotates bytes within words by n positions, moving bytes 
+// to higher index positions with wrap around into low positions
+// ups(x,n): moves bytes by n positions to higher index positions in 
+// words but without wrap around
+// bval(x,n): extracts a byte from a word
+
+#if (INTERNAL_BYTE_ORDER < 2)
+/* little endian */
+#define upr(x,n)        (((x) << 8 * (n)) | ((x) >> (32 - 8 * (n))))
+#define ups(x,n)        ((x) << 8 * (n))
+#define bval(x,n)       ((unsigned char)((x) >> 8 * (n)))
+#define bytes2word(b0, b1, b2, b3)  \
+        ((u_int32_t)(b3) << 24 | (u_int32_t)(b2) << 16 | (u_int32_t)(b1) << 8 | (b0))
+#else
+/* big endian */
+#define upr(x,n)        (((x) >> 8 * (n)) | ((x) << (32 - 8 * (n))))
+#define ups(x,n)        ((x) >> 8 * (n))  /* big endian: shift by n whole bytes, nothing wraps around */
+#define bval(x,n)       ((unsigned char)((x) >> (24 - 8 * (n))))
+#define bytes2word(b0, b1, b2, b3)  \
+        ((u_int32_t)(b0) << 24 | (u_int32_t)(b1) << 16 | (u_int32_t)(b2) << 8 | (b3))
+#endif
+
+// Disable at least some poor combinations of options
+
+#if !defined(ONE_TABLE) && !defined(FOUR_TABLES)  /* no round tables: force FIXED_TABLES, drop UNROLL and the LR/IM tables */
+#define FIXED_TABLES
+#undef  UNROLL
+#undef  ONE_LR_TABLE
+#undef  FOUR_LR_TABLES
+#undef  ONE_IM_TABLE
+#undef  FOUR_IM_TABLES
+#elif !defined(FOUR_TABLES)  /* ONE_TABLE only: FOUR_LR_TABLES / FOUR_IM_TABLES become their ONE_* form */
+#ifdef  FOUR_LR_TABLES
+#undef  FOUR_LR_TABLES
+#define ONE_LR_TABLE
+#endif
+#ifdef  FOUR_IM_TABLES
+#undef  FOUR_IM_TABLES
+#define ONE_IM_TABLE
+#endif
+#elif !defined(AES_BLOCK_SIZE)  /* FOUR_TABLES and AES_BLOCK_SIZE not defined: UNROLL is swapped for PARTIAL_UNROLL */
+#if defined(UNROLL)
+#define PARTIAL_UNROLL
+#undef UNROLL
+#endif
+#endif
+
+// the finite field modular polynomial and elements
+
+#define ff_poly 0x011b  /* bits 8, 4, 3, 1, 0 set: x^8 + x^4 + x^3 + x + 1 */
+#define ff_hi   0x80    /* top bit of a byte */
+
+// multiply four bytes in GF(2^8) by 'x' {02} in parallel
+
+#define m1  0x80808080  /* top bit of each of the four bytes */
+#define m2  0x7f7f7f7f  /* low seven bits of each byte */
+#define m3  0x0000001b  /* low byte of ff_poly; the multiply below puts it into every byte whose top bit was set */
+#define FFmulX(x)  ((((x) & m2) << 1) ^ ((((x) & m1) >> 7) * m3))
+
+// The following defines provide alternative definitions of FFmulX that might
+// give improved performance if a fast 32-bit multiply is not available. Note
+// that a temporary variable u needs to be defined where FFmulX is used.
+
+// #define FFmulX(x) (u = (x) & m1, u |= (u >> 1), ((x) & m2) << 1) ^ ((u >> 3) | (u >> 6)) 
+// #define m4  0x1b1b1b1b
+// #define FFmulX(x) (u = (x) & m1, ((x) & m2) << 1) ^ ((u - (u >> 7)) & m4) 
+
+// perform column mix operation on four bytes in parallel; assigns to a temporary named f2 that must be in scope, and evaluates x more than once
+
+#define fwd_mcol(x) (f2 = FFmulX(x), f2 ^ upr((x) ^ f2,3) ^ upr(x,2) ^ upr(x,1))
+
+#if defined(FIXED_TABLES)
+
+// the S-Box table
+
+static const unsigned char s_box[256] =
+{
+    0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5,
+    0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76,
+    0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0,
+    0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0,
+    0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc,
+    0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15,
+    0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a,
+    0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75,
+    0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0,
+    0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84,
+    0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b,
+    0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf,
+    0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85,
+    0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8,
+    0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5,
+    0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2,
+    0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17,
+    0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73,
+    0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88,
+    0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb,
+    0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c,
+    0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79,
+    0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9,
+    0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08,
+    0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6,
+    0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a,
+    0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e,
+    0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e,
+    0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94,
+    0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf,
+    0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68,
+    0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
+};
+
+// the inverse S-Box table
+
+static const unsigned char inv_s_box[256] =
+{
+    0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38,
+    0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb,
+    0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87,
+    0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb,
+    0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d,
+    0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e,
+    0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2,
+    0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25,
+    0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16,
+    0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92,
+    0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda,
+    0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84,
+    0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a,
+    0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06,
+    0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02,
+    0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b,
+    0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea,
+    0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73,
+    0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85,
+    0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e,
+    0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89,
+    0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b,
+    0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20,
+    0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4,
+    0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31,
+    0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f,
+    0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d,
+    0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef,
+    0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0,
+    0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61,
+    0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26,
+    0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
+};
+
+// w0(p) pastes the hex byte p into a 32-bit constant so that it sits at
+// byte index 0 of the word for the internal byte order in use
+
+#if (INTERNAL_BYTE_ORDER < 2)
+/* little endian */
+#define w0(p)          0x000000##p
+#else
+/* big endian */
+#define w0(p)        0x##p##000000
+#endif
+
+// Number of elements required in this table for different
+// block and key lengths is:
+//
+// Nk =      4  6  8
+//        ----------
+// Nb = 4 | 10  8  7
+//      6 | 19 12 11
+//      8 | 29 19 14
+//
+// this table can be a table of bytes if the key schedule
+// code is adjusted accordingly
+
+static const u_int32_t rcon_tab[29] =   // 29 = largest entry of the matrix above (Nb = 8, Nk = 4)
+{   // each byte is the previous one multiplied by {02} in GF(2^8), starting from 01
+    w0(01), w0(02), w0(04), w0(08),
+    w0(10), w0(20), w0(40), w0(80),
+    w0(1b), w0(36), w0(6c), w0(d8),
+    w0(ab), w0(4d), w0(9a), w0(2f),
+    w0(5e), w0(bc), w0(63), w0(c6),
+    w0(97), w0(35), w0(6a), w0(d4),
+    w0(b3), w0(7d), w0(fa), w0(ef),
+    w0(c5)
+};
+
+#undef  w0
+
+// rN(p,q,r,s) pastes four hex bytes into one word: r0 puts s at byte index 0 and p at index 3, rN equals upr(r0, N);
+// wN(p) yields a word holding byte p at byte index N and zero elsewhere; both follow the internal byte order
+
+#if (INTERNAL_BYTE_ORDER < 2)
+/* little endian */
+#define r0(p,q,r,s) 0x##p##q##r##s
+#define r1(p,q,r,s) 0x##q##r##s##p
+#define r2(p,q,r,s) 0x##r##s##p##q
+#define r3(p,q,r,s) 0x##s##p##q##r
+#define w0(p)          0x000000##p
+#define w1(p)        0x0000##p##00
+#define w2(p)        0x00##p##0000
+#define w3(p)        0x##p##000000
+#else
+/* big endian */
+#define r0(p,q,r,s) 0x##s##r##q##p
+#define r1(p,q,r,s) 0x##p##s##r##q
+#define r2(p,q,r,s) 0x##q##p##s##r
+#define r3(p,q,r,s) 0x##r##q##p##s
+#define w0(p)        0x##p##000000
+#define w1(p)        0x00##p##0000
+#define w2(p)        0x0000##p##00
+#define w3(p)          0x000000##p
+#endif
+
+#if defined(FIXED_TABLES) && (defined(ONE_TABLE) || defined(FOUR_TABLES)) 
+
+//  data for forward tables (other than last round)
+
+#define f_table \
+    r(a5,63,63,c6), r(84,7c,7c,f8), r(99,77,77,ee), r(8d,7b,7b,f6),\
+    r(0d,f2,f2,ff), r(bd,6b,6b,d6), r(b1,6f,6f,de), r(54,c5,c5,91),\
+    r(50,30,30,60), r(03,01,01,02), r(a9,67,67,ce), r(7d,2b,2b,56),\
+    r(19,fe,fe,e7), r(62,d7,d7,b5), r(e6,ab,ab,4d), r(9a,76,76,ec),\
+    r(45,ca,ca,8f), r(9d,82,82,1f), r(40,c9,c9,89), r(87,7d,7d,fa),\
+    r(15,fa,fa,ef), r(eb,59,59,b2), r(c9,47,47,8e), r(0b,f0,f0,fb),\
+    r(ec,ad,ad,41), r(67,d4,d4,b3), r(fd,a2,a2,5f), r(ea,af,af,45),\
+    r(bf,9c,9c,23), r(f7,a4,a4,53), r(96,72,72,e4), r(5b,c0,c0,9b),\
+    r(c2,b7,b7,75), r(1c,fd,fd,e1), r(ae,93,93,3d), r(6a,26,26,4c),\
+    r(5a,36,36,6c), r(41,3f,3f,7e), r(02,f7,f7,f5), r(4f,cc,cc,83),\
+    r(5c,34,34,68), r(f4,a5,a5,51), r(34,e5,e5,d1), r(08,f1,f1,f9),\
+    r(93,71,71,e2), r(73,d8,d8,ab), r(53,31,31,62), r(3f,15,15,2a),\
+    r(0c,04,04,08), r(52,c7,c7,95), r(65,23,23,46), r(5e,c3,c3,9d),\
+    r(28,18,18,30), r(a1,96,96,37), r(0f,05,05,0a), r(b5,9a,9a,2f),\
+    r(09,07,07,0e), r(36,12,12,24), r(9b,80,80,1b), r(3d,e2,e2,df),\
+    r(26,eb,eb,cd), r(69,27,27,4e), r(cd,b2,b2,7f), r(9f,75,75,ea),\
+    r(1b,09,09,12), r(9e,83,83,1d), r(74,2c,2c,58), r(2e,1a,1a,34),\
+    r(2d,1b,1b,36), r(b2,6e,6e,dc), r(ee,5a,5a,b4), r(fb,a0,a0,5b),\
+    r(f6,52,52,a4), r(4d,3b,3b,76), r(61,d6,d6,b7), r(ce,b3,b3,7d),\
+    r(7b,29,29,52), r(3e,e3,e3,dd), r(71,2f,2f,5e), r(97,84,84,13),\
+    r(f5,53,53,a6), r(68,d1,d1,b9), r(00,00,00,00), r(2c,ed,ed,c1),\
+    r(60,20,20,40), r(1f,fc,fc,e3), r(c8,b1,b1,79), r(ed,5b,5b,b6),\
+    r(be,6a,6a,d4), r(46,cb,cb,8d), r(d9,be,be,67), r(4b,39,39,72),\
+    r(de,4a,4a,94), r(d4,4c,4c,98), r(e8,58,58,b0), r(4a,cf,cf,85),\
+    r(6b,d0,d0,bb), r(2a,ef,ef,c5), r(e5,aa,aa,4f), r(16,fb,fb,ed),\
+    r(c5,43,43,86), r(d7,4d,4d,9a), r(55,33,33,66), r(94,85,85,11),\
+    r(cf,45,45,8a), r(10,f9,f9,e9), r(06,02,02,04), r(81,7f,7f,fe),\
+    r(f0,50,50,a0), r(44,3c,3c,78), r(ba,9f,9f,25), r(e3,a8,a8,4b),\
+    r(f3,51,51,a2), r(fe,a3,a3,5d), r(c0,40,40,80), r(8a,8f,8f,05),\
+    r(ad,92,92,3f), r(bc,9d,9d,21), r(48,38,38,70), r(04,f5,f5,f1),\
+    r(df,bc,bc,63), r(c1,b6,b6,77), r(75,da,da,af), r(63,21,21,42),\
+    r(30,10,10,20), r(1a,ff,ff,e5), r(0e,f3,f3,fd), r(6d,d2,d2,bf),\
+    r(4c,cd,cd,81), r(14,0c,0c,18), r(35,13,13,26), r(2f,ec,ec,c3),\
+    r(e1,5f,5f,be), r(a2,97,97,35), r(cc,44,44,88), r(39,17,17,2e),\
+    r(57,c4,c4,93), r(f2,a7,a7,55), r(82,7e,7e,fc), r(47,3d,3d,7a),\
+    r(ac,64,64,c8), r(e7,5d,5d,ba), r(2b,19,19,32), r(95,73,73,e6),\
+    r(a0,60,60,c0), r(98,81,81,19), r(d1,4f,4f,9e), r(7f,dc,dc,a3),\
+    r(66,22,22,44), r(7e,2a,2a,54), r(ab,90,90,3b), r(83,88,88,0b),\
+    r(ca,46,46,8c), r(29,ee,ee,c7), r(d3,b8,b8,6b), r(3c,14,14,28),\
+    r(79,de,de,a7), r(e2,5e,5e,bc), r(1d,0b,0b,16), r(76,db,db,ad),\
+    r(3b,e0,e0,db), r(56,32,32,64), r(4e,3a,3a,74), r(1e,0a,0a,14),\
+    r(db,49,49,92), r(0a,06,06,0c), r(6c,24,24,48), r(e4,5c,5c,b8),\
+    r(5d,c2,c2,9f), r(6e,d3,d3,bd), r(ef,ac,ac,43), r(a6,62,62,c4),\
+    r(a8,91,91,39), r(a4,95,95,31), r(37,e4,e4,d3), r(8b,79,79,f2),\
+    r(32,e7,e7,d5), r(43,c8,c8,8b), r(59,37,37,6e), r(b7,6d,6d,da),\
+    r(8c,8d,8d,01), r(64,d5,d5,b1), r(d2,4e,4e,9c), r(e0,a9,a9,49),\
+    r(b4,6c,6c,d8), r(fa,56,56,ac), r(07,f4,f4,f3), r(25,ea,ea,cf),\
+    r(af,65,65,ca), r(8e,7a,7a,f4), r(e9,ae,ae,47), r(18,08,08,10),\
+    r(d5,ba,ba,6f), r(88,78,78,f0), r(6f,25,25,4a), r(72,2e,2e,5c),\
+    r(24,1c,1c,38), r(f1,a6,a6,57), r(c7,b4,b4,73), r(51,c6,c6,97),\
+    r(23,e8,e8,cb), r(7c,dd,dd,a1), r(9c,74,74,e8), r(21,1f,1f,3e),\
+    r(dd,4b,4b,96), r(dc,bd,bd,61), r(86,8b,8b,0d), r(85,8a,8a,0f),\
+    r(90,70,70,e0), r(42,3e,3e,7c), r(c4,b5,b5,71), r(aa,66,66,cc),\
+    r(d8,48,48,90), r(05,03,03,06), r(01,f6,f6,f7), r(12,0e,0e,1c),\
+    r(a3,61,61,c2), r(5f,35,35,6a), r(f9,57,57,ae), r(d0,b9,b9,69),\
+    r(91,86,86,17), r(58,c1,c1,99), r(27,1d,1d,3a), r(b9,9e,9e,27),\
+    r(38,e1,e1,d9), r(13,f8,f8,eb), r(b3,98,98,2b), r(33,11,11,22),\
+    r(bb,69,69,d2), r(70,d9,d9,a9), r(89,8e,8e,07), r(a7,94,94,33),\
+    r(b6,9b,9b,2d), r(22,1e,1e,3c), r(92,87,87,15), r(20,e9,e9,c9),\
+    r(49,ce,ce,87), r(ff,55,55,aa), r(78,28,28,50), r(7a,df,df,a5),\
+    r(8f,8c,8c,03), r(f8,a1,a1,59), r(80,89,89,09), r(17,0d,0d,1a),\
+    r(da,bf,bf,65), r(31,e6,e6,d7), r(c6,42,42,84), r(b8,68,68,d0),\
+    r(c3,41,41,82), r(b0,99,99,29), r(77,2d,2d,5a), r(11,0f,0f,1e),\
+    r(cb,b0,b0,7b), r(fc,54,54,a8), r(d6,bb,bb,6d), r(3a,16,16,2c)
+
+//  data for inverse tables (other than last round)
+
+#define i_table \
+    r(50,a7,f4,51), r(53,65,41,7e), r(c3,a4,17,1a), r(96,5e,27,3a),\
+    r(cb,6b,ab,3b), r(f1,45,9d,1f), r(ab,58,fa,ac), r(93,03,e3,4b),\
+    r(55,fa,30,20), r(f6,6d,76,ad), r(91,76,cc,88), r(25,4c,02,f5),\
+    r(fc,d7,e5,4f), r(d7,cb,2a,c5), r(80,44,35,26), r(8f,a3,62,b5),\
+    r(49,5a,b1,de), r(67,1b,ba,25), r(98,0e,ea,45), r(e1,c0,fe,5d),\
+    r(02,75,2f,c3), r(12,f0,4c,81), r(a3,97,46,8d), r(c6,f9,d3,6b),\
+    r(e7,5f,8f,03), r(95,9c,92,15), r(eb,7a,6d,bf), r(da,59,52,95),\
+    r(2d,83,be,d4), r(d3,21,74,58), r(29,69,e0,49), r(44,c8,c9,8e),\
+    r(6a,89,c2,75), r(78,79,8e,f4), r(6b,3e,58,99), r(dd,71,b9,27),\
+    r(b6,4f,e1,be), r(17,ad,88,f0), r(66,ac,20,c9), r(b4,3a,ce,7d),\
+    r(18,4a,df,63), r(82,31,1a,e5), r(60,33,51,97), r(45,7f,53,62),\
+    r(e0,77,64,b1), r(84,ae,6b,bb), r(1c,a0,81,fe), r(94,2b,08,f9),\
+    r(58,68,48,70), r(19,fd,45,8f), r(87,6c,de,94), r(b7,f8,7b,52),\
+    r(23,d3,73,ab), r(e2,02,4b,72), r(57,8f,1f,e3), r(2a,ab,55,66),\
+    r(07,28,eb,b2), r(03,c2,b5,2f), r(9a,7b,c5,86), r(a5,08,37,d3),\
+    r(f2,87,28,30), r(b2,a5,bf,23), r(ba,6a,03,02), r(5c,82,16,ed),\
+    r(2b,1c,cf,8a), r(92,b4,79,a7), r(f0,f2,07,f3), r(a1,e2,69,4e),\
+    r(cd,f4,da,65), r(d5,be,05,06), r(1f,62,34,d1), r(8a,fe,a6,c4),\
+    r(9d,53,2e,34), r(a0,55,f3,a2), r(32,e1,8a,05), r(75,eb,f6,a4),\
+    r(39,ec,83,0b), r(aa,ef,60,40), r(06,9f,71,5e), r(51,10,6e,bd),\
+    r(f9,8a,21,3e), r(3d,06,dd,96), r(ae,05,3e,dd), r(46,bd,e6,4d),\
+    r(b5,8d,54,91), r(05,5d,c4,71), r(6f,d4,06,04), r(ff,15,50,60),\
+    r(24,fb,98,19), r(97,e9,bd,d6), r(cc,43,40,89), r(77,9e,d9,67),\
+    r(bd,42,e8,b0), r(88,8b,89,07), r(38,5b,19,e7), r(db,ee,c8,79),\
+    r(47,0a,7c,a1), r(e9,0f,42,7c), r(c9,1e,84,f8), r(00,00,00,00),\
+    r(83,86,80,09), r(48,ed,2b,32), r(ac,70,11,1e), r(4e,72,5a,6c),\
+    r(fb,ff,0e,fd), r(56,38,85,0f), r(1e,d5,ae,3d), r(27,39,2d,36),\
+    r(64,d9,0f,0a), r(21,a6,5c,68), r(d1,54,5b,9b), r(3a,2e,36,24),\
+    r(b1,67,0a,0c), r(0f,e7,57,93), r(d2,96,ee,b4), r(9e,91,9b,1b),\
+    r(4f,c5,c0,80), r(a2,20,dc,61), r(69,4b,77,5a), r(16,1a,12,1c),\
+    r(0a,ba,93,e2), r(e5,2a,a0,c0), r(43,e0,22,3c), r(1d,17,1b,12),\
+    r(0b,0d,09,0e), r(ad,c7,8b,f2), r(b9,a8,b6,2d), r(c8,a9,1e,14),\
+    r(85,19,f1,57), r(4c,07,75,af), r(bb,dd,99,ee), r(fd,60,7f,a3),\
+    r(9f,26,01,f7), r(bc,f5,72,5c), r(c5,3b,66,44), r(34,7e,fb,5b),\
+    r(76,29,43,8b), r(dc,c6,23,cb), r(68,fc,ed,b6), r(63,f1,e4,b8),\
+    r(ca,dc,31,d7), r(10,85,63,42), r(40,22,97,13), r(20,11,c6,84),\
+    r(7d,24,4a,85), r(f8,3d,bb,d2), r(11,32,f9,ae), r(6d,a1,29,c7),\
+    r(4b,2f,9e,1d), r(f3,30,b2,dc), r(ec,52,86,0d), r(d0,e3,c1,77),\
+    r(6c,16,b3,2b), r(99,b9,70,a9), r(fa,48,94,11), r(22,64,e9,47),\
+    r(c4,8c,fc,a8), r(1a,3f,f0,a0), r(d8,2c,7d,56), r(ef,90,33,22),\
+    r(c7,4e,49,87), r(c1,d1,38,d9), r(fe,a2,ca,8c), r(36,0b,d4,98),\
+    r(cf,81,f5,a6), r(28,de,7a,a5), r(26,8e,b7,da), r(a4,bf,ad,3f),\
+    r(e4,9d,3a,2c), r(0d,92,78,50), r(9b,cc,5f,6a), r(62,46,7e,54),\
+    r(c2,13,8d,f6), r(e8,b8,d8,90), r(5e,f7,39,2e), r(f5,af,c3,82),\
+    r(be,80,5d,9f), r(7c,93,d0,69), r(a9,2d,d5,6f), r(b3,12,25,cf),\
+    r(3b,99,ac,c8), r(a7,7d,18,10), r(6e,63,9c,e8), r(7b,bb,3b,db),\
+    r(09,78,26,cd), r(f4,18,59,6e), r(01,b7,9a,ec), r(a8,9a,4f,83),\
+    r(65,6e,95,e6), r(7e,e6,ff,aa), r(08,cf,bc,21), r(e6,e8,15,ef),\
+    r(d9,9b,e7,ba), r(ce,36,6f,4a), r(d4,09,9f,ea), r(d6,7c,b0,29),\
+    r(af,b2,a4,31), r(31,23,3f,2a), r(30,94,a5,c6), r(c0,66,a2,35),\
+    r(37,bc,4e,74), r(a6,ca,82,fc), r(b0,d0,90,e0), r(15,d8,a7,33),\
+    r(4a,98,04,f1), r(f7,da,ec,41), r(0e,50,cd,7f), r(2f,f6,91,17),\
+    r(8d,d6,4d,76), r(4d,b0,ef,43), r(54,4d,aa,cc), r(df,04,96,e4),\
+    r(e3,b5,d1,9e), r(1b,88,6a,4c), r(b8,1f,2c,c1), r(7f,51,65,46),\
+    r(04,ea,5e,9d), r(5d,35,8c,01), r(73,74,87,fa), r(2e,41,0b,fb),\
+    r(5a,1d,67,b3), r(52,d2,db,92), r(33,56,10,e9), r(13,47,d6,6d),\
+    r(8c,61,d7,9a), r(7a,0c,a1,37), r(8e,14,f8,59), r(89,3c,13,eb),\
+    r(ee,27,a9,ce), r(35,c9,61,b7), r(ed,e5,1c,e1), r(3c,b1,47,7a),\
+    r(59,df,d2,9c), r(3f,73,f2,55), r(79,ce,14,18), r(bf,37,c7,73),\
+    r(ea,cd,f7,53), r(5b,aa,fd,5f), r(14,6f,3d,df), r(86,db,44,78),\
+    r(81,f3,af,ca), r(3e,c4,68,b9), r(2c,34,24,38), r(5f,40,a3,c2),\
+    r(72,c3,1d,16), r(0c,25,e2,bc), r(8b,49,3c,28), r(41,95,0d,ff),\
+    r(71,01,a8,39), r(de,b3,0c,08), r(9c,e4,b4,d8), r(90,c1,56,64),\
+    r(61,84,cb,7b), r(70,b6,32,d5), r(74,5c,6c,48), r(42,57,b8,d0)
+
+// generate the required tables in the desired endian format: ft_tab table n is f_table expanded with r = rn (only r0 for ONE_TABLE)
+
+#undef  r
+#define r   r0
+
+#if defined(ONE_TABLE)
+static const u_int32_t ft_tab[256] =
+    {   f_table };
+#elif defined(FOUR_TABLES)
+static const u_int32_t ft_tab[4][256] =
+{   {   f_table },
+#undef  r
+#define r   r1
+    {   f_table },
+#undef  r
+#define r   r2
+    {   f_table },
+#undef  r
+#define r   r3
+    {   f_table }
+};
+#endif
+
+#undef  r
+#define r   r0  /* it_tab table n is i_table expanded with r = rn (only r0 for ONE_TABLE) */
+#if defined(ONE_TABLE)
+static const u_int32_t it_tab[256] =
+    {   i_table };
+#elif defined(FOUR_TABLES)
+static const u_int32_t it_tab[4][256] =
+{   {   i_table },
+#undef  r
+#define r   r1
+    {   i_table },
+#undef  r
+#define r   r2
+    {   i_table },
+#undef  r
+#define r   r3
+    {   i_table }
+};
+#endif
+
+#endif
+
+#if defined(FIXED_TABLES) && (defined(ONE_LR_TABLE) || defined(FOUR_LR_TABLES)) 
+
+//  data for inverse tables (last round)
+
+#define li_table    \
+    w(52), w(09), w(6a), w(d5), w(30), w(36), w(a5), w(38),\
+    w(bf), w(40), w(a3), w(9e), w(81), w(f3), w(d7), w(fb),\
+    w(7c), w(e3), w(39), w(82), w(9b), w(2f), w(ff), w(87),\
+    w(34), w(8e), w(43), w(44), w(c4), w(de), w(e9), w(cb),\
+    w(54), w(7b), w(94), w(32), w(a6), w(c2), w(23), w(3d),\
+    w(ee), w(4c), w(95), w(0b), w(42), w(fa), w(c3), w(4e),\
+    w(08), w(2e), w(a1), w(66), w(28), w(d9), w(24), w(b2),\
+    w(76), w(5b), w(a2), w(49), w(6d), w(8b), w(d1), w(25),\
+    w(72), w(f8), w(f6), w(64), w(86), w(68), w(98), w(16),\
+    w(d4), w(a4), w(5c), w(cc), w(5d), w(65), w(b6), w(92),\
+    w(6c), w(70), w(48), w(50), w(fd), w(ed), w(b9), w(da),\
+    w(5e), w(15), w(46), w(57), w(a7), w(8d), w(9d), w(84),\
+    w(90), w(d8), w(ab), w(00), w(8c), w(bc), w(d3), w(0a),\
+    w(f7), w(e4), w(58), w(05), w(b8), w(b3), w(45), w(06),\
+    w(d0), w(2c), w(1e), w(8f), w(ca), w(3f), w(0f), w(02),\
+    w(c1), w(af), w(bd), w(03), w(01), w(13), w(8a), w(6b),\
+    w(3a), w(91), w(11), w(41), w(4f), w(67), w(dc), w(ea),\
+    w(97), w(f2), w(cf), w(ce), w(f0), w(b4), w(e6), w(73),\
+    w(96), w(ac), w(74), w(22), w(e7), w(ad), w(35), w(85),\
+    w(e2), w(f9), w(37), w(e8), w(1c), w(75), w(df), w(6e),\
+    w(47), w(f1), w(1a), w(71), w(1d), w(29), w(c5), w(89),\
+    w(6f), w(b7), w(62), w(0e), w(aa), w(18), w(be), w(1b),\
+    w(fc), w(56), w(3e), w(4b), w(c6), w(d2), w(79), w(20),\
+    w(9a), w(db), w(c0), w(fe), w(78), w(cd), w(5a), w(f4),\
+    w(1f), w(dd), w(a8), w(33), w(88), w(07), w(c7), w(31),\
+    w(b1), w(12), w(10), w(59), w(27), w(80), w(ec), w(5f),\
+    w(60), w(51), w(7f), w(a9), w(19), w(b5), w(4a), w(0d),\
+    w(2d), w(e5), w(7a), w(9f), w(93), w(c9), w(9c), w(ef),\
+    w(a0), w(e0), w(3b), w(4d), w(ae), w(2a), w(f5), w(b0),\
+    w(c8), w(eb), w(bb), w(3c), w(83), w(53), w(99), w(61),\
+    w(17), w(2b), w(04), w(7e), w(ba), w(77), w(d6), w(26),\
+    w(e1), w(69), w(14), w(63), w(55), w(21), w(0c), w(7d),
+
+// generate the required tables in the desired endian format: fl_tab table n keeps only field q of each f_table entry, placed at byte index n by wn()
+
+#undef  r
+#define r(p,q,r,s)  w0(q)
+#if defined(ONE_LR_TABLE)
+static const u_int32_t fl_tab[256] =
+    {   f_table     };
+#elif defined(FOUR_LR_TABLES)
+static const u_int32_t fl_tab[4][256] =
+{   {   f_table    },
+#undef  r
+#define r(p,q,r,s)   w1(q)
+    {   f_table    },
+#undef  r
+#define r(p,q,r,s)   w2(q)
+    {   f_table    },
+#undef  r
+#define r(p,q,r,s)   w3(q)
+    {   f_table    }
+};
+#endif
+
+#undef  w
+#define w   w0  /* il_tab table n is li_table expanded with w = wn (only w0 for ONE_LR_TABLE) */
+#if defined(ONE_LR_TABLE)
+static const u_int32_t il_tab[256] =
+    {   li_table    };
+#elif defined(FOUR_LR_TABLES)
+static const u_int32_t il_tab[4][256] =
+{   {   li_table    },
+#undef  w
+#define w   w1
+    {   li_table    },
+#undef  w
+#define w   w2
+    {   li_table    },
+#undef  w
+#define w   w3
+    {   li_table    }
+};
+#endif
+
+#endif
+
+#if defined(FIXED_TABLES) && (defined(ONE_IM_TABLE) || defined(FOUR_IM_TABLES)) 
+
+#define m_table \
+    r(00,00,00,00), r(0b,0d,09,0e), r(16,1a,12,1c), r(1d,17,1b,12),\
+    r(2c,34,24,38), r(27,39,2d,36), r(3a,2e,36,24), r(31,23,3f,2a),\
+    r(58,68,48,70), r(53,65,41,7e), r(4e,72,5a,6c), r(45,7f,53,62),\
+    r(74,5c,6c,48), r(7f,51,65,46), r(62,46,7e,54), r(69,4b,77,5a),\
+    r(b0,d0,90,e0), r(bb,dd,99,ee), r(a6,ca,82,fc), r(ad,c7,8b,f2),\
+    r(9c,e4,b4,d8), r(97,e9,bd,d6), r(8a,fe,a6,c4), r(81,f3,af,ca),\
+    r(e8,b8,d8,90), r(e3,b5,d1,9e), r(fe,a2,ca,8c), r(f5,af,c3,82),\
+    r(c4,8c,fc,a8), r(cf,81,f5,a6), r(d2,96,ee,b4), r(d9,9b,e7,ba),\
+    r(7b,bb,3b,db), r(70,b6,32,d5), r(6d,a1,29,c7), r(66,ac,20,c9),\
+    r(57,8f,1f,e3), r(5c,82,16,ed), r(41,95,0d,ff), r(4a,98,04,f1),\
+    r(23,d3,73,ab), r(28,de,7a,a5), r(35,c9,61,b7), r(3e,c4,68,b9),\
+    r(0f,e7,57,93), r(04,ea,5e,9d), r(19,fd,45,8f), r(12,f0,4c,81),\
+    r(cb,6b,ab,3b), r(c0,66,a2,35), r(dd,71,b9,27), r(d6,7c,b0,29),\
+    r(e7,5f,8f,03), r(ec,52,86,0d), r(f1,45,9d,1f), r(fa,48,94,11),\
+    r(93,03,e3,4b), r(98,0e,ea,45), r(85,19,f1,57), r(8e,14,f8,59),\
+    r(bf,37,c7,73), r(b4,3a,ce,7d), r(a9,2d,d5,6f), r(a2,20,dc,61),\
+    r(f6,6d,76,ad), r(fd,60,7f,a3), r(e0,77,64,b1), r(eb,7a,6d,bf),\
+    r(da,59,52,95), r(d1,54,5b,9b), r(cc,43,40,89), r(c7,4e,49,87),\
+    r(ae,05,3e,dd), r(a5,08,37,d3), r(b8,1f,2c,c1), r(b3,12,25,cf),\
+    r(82,31,1a,e5), r(89,3c,13,eb), r(94,2b,08,f9), r(9f,26,01,f7),\
+    r(46,bd,e6,4d), r(4d,b0,ef,43), r(50,a7,f4,51), r(5b,aa,fd,5f),\
+    r(6a,89,c2,75), r(61,84,cb,7b), r(7c,93,d0,69), r(77,9e,d9,67),\
+    r(1e,d5,ae,3d), r(15,d8,a7,33), r(08,cf,bc,21), r(03,c2,b5,2f),\
+    r(32,e1,8a,05), r(39,ec,83,0b), r(24,fb,98,19), r(2f,f6,91,17),\
+    r(8d,d6,4d,76), r(86,db,44,78), r(9b,cc,5f,6a), r(90,c1,56,64),\
+    r(a1,e2,69,4e), r(aa,ef,60,40), r(b7,f8,7b,52), r(bc,f5,72,5c),\
+    r(d5,be,05,06), r(de,b3,0c,08), r(c3,a4,17,1a), r(c8,a9,1e,14),\
+    r(f9,8a,21,3e), r(f2,87,28,30), r(ef,90,33,22), r(e4,9d,3a,2c),\
+    r(3d,06,dd,96), r(36,0b,d4,98), r(2b,1c,cf,8a), r(20,11,c6,84),\
+    r(11,32,f9,ae), r(1a,3f,f0,a0), r(07,28,eb,b2), r(0c,25,e2,bc),\
+    r(65,6e,95,e6), r(6e,63,9c,e8), r(73,74,87,fa), r(78,79,8e,f4),\
+    r(49,5a,b1,de), r(42,57,b8,d0), r(5f,40,a3,c2), r(54,4d,aa,cc),\
+    r(f7,da,ec,41), r(fc,d7,e5,4f), r(e1,c0,fe,5d), r(ea,cd,f7,53),\
+    r(db,ee,c8,79), r(d0,e3,c1,77), r(cd,f4,da,65), r(c6,f9,d3,6b),\
+    r(af,b2,a4,31), r(a4,bf,ad,3f), r(b9,a8,b6,2d), r(b2,a5,bf,23),\
+    r(83,86,80,09), r(88,8b,89,07), r(95,9c,92,15), r(9e,91,9b,1b),\
+    r(47,0a,7c,a1), r(4c,07,75,af), r(51,10,6e,bd), r(5a,1d,67,b3),\
+    r(6b,3e,58,99), r(60,33,51,97), r(7d,24,4a,85), r(76,29,43,8b),\
+    r(1f,62,34,d1), r(14,6f,3d,df), r(09,78,26,cd), r(02,75,2f,c3),\
+    r(33,56,10,e9), r(38,5b,19,e7), r(25,4c,02,f5), r(2e,41,0b,fb),\
+    r(8c,61,d7,9a), r(87,6c,de,94), r(9a,7b,c5,86), r(91,76,cc,88),\
+    r(a0,55,f3,a2), r(ab,58,fa,ac), r(b6,4f,e1,be), r(bd,42,e8,b0),\
+    r(d4,09,9f,ea), r(df,04,96,e4), r(c2,13,8d,f6), r(c9,1e,84,f8),\
+    r(f8,3d,bb,d2), r(f3,30,b2,dc), r(ee,27,a9,ce), r(e5,2a,a0,c0),\
+    r(3c,b1,47,7a), r(37,bc,4e,74), r(2a,ab,55,66), r(21,a6,5c,68),\
+    r(10,85,63,42), r(1b,88,6a,4c), r(06,9f,71,5e), r(0d,92,78,50),\
+    r(64,d9,0f,0a), r(6f,d4,06,04), r(72,c3,1d,16), r(79,ce,14,18),\
+    r(48,ed,2b,32), r(43,e0,22,3c), r(5e,f7,39,2e), r(55,fa,30,20),\
+    r(01,b7,9a,ec), r(0a,ba,93,e2), r(17,ad,88,f0), r(1c,a0,81,fe),\
+    r(2d,83,be,d4), r(26,8e,b7,da), r(3b,99,ac,c8), r(30,94,a5,c6),\
+    r(59,df,d2,9c), r(52,d2,db,92), r(4f,c5,c0,80), r(44,c8,c9,8e),\
+    r(75,eb,f6,a4), r(7e,e6,ff,aa), r(63,f1,e4,b8), r(68,fc,ed,b6),\
+    r(b1,67,0a,0c), r(ba,6a,03,02), r(a7,7d,18,10), r(ac,70,11,1e),\
+    r(9d,53,2e,34), r(96,5e,27,3a), r(8b,49,3c,28), r(80,44,35,26),\
+    r(e9,0f,42,7c), r(e2,02,4b,72), r(ff,15,50,60), r(f4,18,59,6e),\
+    r(c5,3b,66,44), r(ce,36,6f,4a), r(d3,21,74,58), r(d8,2c,7d,56),\
+    r(7a,0c,a1,37), r(71,01,a8,39), r(6c,16,b3,2b), r(67,1b,ba,25),\
+    r(56,38,85,0f), r(5d,35,8c,01), r(40,22,97,13), r(4b,2f,9e,1d),\
+    r(22,64,e9,47), r(29,69,e0,49), r(34,7e,fb,5b), r(3f,73,f2,55),\
+    r(0e,50,cd,7f), r(05,5d,c4,71), r(18,4a,df,63), r(13,47,d6,6d),\
+    r(ca,dc,31,d7), r(c1,d1,38,d9), r(dc,c6,23,cb), r(d7,cb,2a,c5),\
+    r(e6,e8,15,ef), r(ed,e5,1c,e1), r(f0,f2,07,f3), r(fb,ff,0e,fd),\
+    r(92,b4,79,a7), r(99,b9,70,a9), r(84,ae,6b,bb), r(8f,a3,62,b5),\
+    r(be,80,5d,9f), r(b5,8d,54,91), r(a8,9a,4f,83), r(a3,97,46,8d)
+
+#undef r
+#define r   r0
+
+#if defined(ONE_IM_TABLE)
+static const u_int32_t im_tab[256] =
+    {   m_table };
+#elif defined(FOUR_IM_TABLES)
+static const u_int32_t im_tab[4][256] =
+{   {   m_table },
+#undef  r
+#define r   r1
+    {   m_table },
+#undef  r
+#define r   r2
+    {   m_table },
+#undef  r
+#define r   r3
+    {   m_table }
+};
+#endif
+
+#endif
+
+#else
+
+static int tab_gen = 0;                      // set to 1 by callers once gen_tabs() has run
+
+static unsigned char  s_box[256];            // the S box
+static unsigned char  inv_s_box[256];        // the inverse S box
+static u_int32_t  rcon_tab[AES_RC_LENGTH];   // table of round constants
+
+#if defined(ONE_TABLE)
+static u_int32_t  ft_tab[256];
+static u_int32_t  it_tab[256];
+#elif defined(FOUR_TABLES)
+static u_int32_t  ft_tab[4][256];
+static u_int32_t  it_tab[4][256];
+#endif
+
+#if defined(ONE_LR_TABLE)
+static u_int32_t  fl_tab[256];
+static u_int32_t  il_tab[256];
+#elif defined(FOUR_LR_TABLES)
+static u_int32_t  fl_tab[4][256];
+static u_int32_t  il_tab[4][256];
+#endif
+
+#if defined(ONE_IM_TABLE)
+static u_int32_t  im_tab[256];
+#elif defined(FOUR_IM_TABLES)
+static u_int32_t  im_tab[4][256];
+#endif
+
+// Generate the tables for the dynamic table option
+
+#if !defined(FF_TABLES)
+
+// It will generally be sensible to use tables to compute finite 
+// field multiplies and inverses but where memory is scarce this 
+// code might sometimes be better.
+
+// return 2 ^ (n - 1) where n is the bit number of the highest bit
+// set in x with x in the range 1 < x < 0x00000200.   This form is
+// used so that locals within FFinv can be bytes rather than words
+
+static unsigned char hibit(const u_int32_t x)   // half of the top set bit of x; 0 when x < 2
+{   unsigned char r = (unsigned char)((x >> 1) | (x >> 2));  // begin smearing the top bit of x >> 1
+    
+    r |= (r >> 2);          // the smear now spans four bit positions
+    r |= (r >> 4);          // for x < 0x200 every bit below the top bit of x is now set
+    return (r + 1) >> 1;    // r + 1 is then the top bit of x itself; return half of it
+}
+
+// return the inverse of the finite field element x
+
+static unsigned char FFinv(const unsigned char x)
+{   unsigned char    p1 = x, p2 = 0x1b, n1 = hibit(x), n2 = 0x80, v1 = 1, v2 = 0;   // 0x1b / 0x80: low byte and hibit() of 0x11b
+
+    if(x < 2) return x;     // 0 and 1 are returned unchanged
+
+    for(;;)
+    {
+        if(!n1) return v1;  // hibit() gave 0, so p1 < 2: v1 is the result
+
+        while(n2 >= n1)     // n2 / n1 is a power of two: xor shifted p1 into p2, mirrored on v2
+        {   
+            n2 /= n1; p2 ^= p1 * n2; v2 ^= v1 * n2; n2 = hibit(p2);
+        }
+        
+        if(!n2) return v2;  // same exit test with the roles swapped
+
+        while(n1 >= n2)     // now xor shifted p2 into p1, mirrored on v1
+        {   
+            n1 /= n2; p1 ^= p2 * n1; v1 ^= v2 * n1; n1 = hibit(p1);
+        }
+    }
+}
+
+// define the finite field multiplies required for Rijndael
+
+#define FFmul02(x)  ((((x) & 0x7f) << 1) ^ ((x) & 0x80 ? 0x1b : 0))
+#define FFmul03(x)  ((x) ^ FFmul02(x))
+#define FFmul09(x)  ((x) ^ FFmul02(FFmul02(FFmul02(x))))
+#define FFmul0b(x)  ((x) ^ FFmul02((x) ^ FFmul02(FFmul02(x))))
+#define FFmul0d(x)  ((x) ^ FFmul02(FFmul02((x) ^ FFmul02(x))))
+#define FFmul0e(x)  FFmul02((x) ^ FFmul02((x) ^ FFmul02(x)))
+
+#else
+
+#define FFinv(x)    ((x) ? pow[255 - log[x]]: 0)   // table lookup inverse; 0 maps to 0
+// multiplies by a constant through the pow/log arrays local to gen_tabs(); 0 maps to 0
+#define FFmul02(x) ((x) ? pow[log[x] + 0x19] : 0)
+#define FFmul03(x) ((x) ? pow[log[x] + 0x01] : 0)
+#define FFmul09(x) ((x) ? pow[log[x] + 0xc7] : 0)
+#define FFmul0b(x) ((x) ? pow[log[x] + 0x68] : 0)
+#define FFmul0d(x) ((x) ? pow[log[x] + 0xee] : 0)
+#define FFmul0e(x) ((x) ? pow[log[x] + 0xdf] : 0)
+
+#endif
+
+// The forward and inverse affine transformations used in the S-box
+
+#define fwd_affine(x) \
+    (w = (u_int32_t)x, w ^= (w<<1)^(w<<2)^(w<<3)^(w<<4), 0x63^(unsigned char)(w^(w>>8)))
+
+#define inv_affine(x) \
+    (w = (u_int32_t)x, w = (w<<1)^(w<<3)^(w<<6), 0x05^(unsigned char)(w^(w>>8)))
+
+static void gen_tabs(void)                   // fills every run-time table; callers gate it on tab_gen
+{   u_int32_t  i, w;
+
+#if defined(FF_TABLES)
+
+    unsigned char  pow[512], log[256];
+
+    // log and power tables for GF(2^8) finite field with
+    // 0x011b as modular polynomial - the simplest primitive
+    // root is 0x03, used here to generate the tables
+
+    i = 0; w = 1; 
+    do
+    {   
+        pow[i] = (unsigned char)w;
+        pow[i + 255] = (unsigned char)w;     // second copy so pow[log[x] + k] needs no wrap-around
+        log[w] = (unsigned char)i++;
+        w ^=  (w << 1) ^ (w & ff_hi ? ff_poly : 0);   // w ^ 2w, i.e. w times 0x03, reduced via ff_poly
+    }
+    while (w != 1);
+
+#endif
+
+    for(i = 0, w = 1; i < AES_RC_LENGTH; ++i)
+    {
+        rcon_tab[i] = bytes2word(w, 0, 0, 0);
+        w = (w << 1) ^ (w & ff_hi ? ff_poly : 0);    // next constant: w doubled, reduced via ff_poly
+    }
+
+    for(i = 0; i < 256; ++i)
+    {   unsigned char    b;
+
+        s_box[i] = b = fwd_affine(FFinv((unsigned char)i));   // S box: affine map of the field inverse
+
+        w = bytes2word(b, 0, 0, 0);          // fl_tab entry: the S box byte alone
+#if defined(ONE_LR_TABLE)
+        fl_tab[i] = w;
+#elif defined(FOUR_LR_TABLES)
+        fl_tab[0][i] = w;
+        fl_tab[1][i] = upr(w,1);
+        fl_tab[2][i] = upr(w,2);
+        fl_tab[3][i] = upr(w,3);
+#endif
+        w = bytes2word(FFmul02(b), b, b, FFmul03(b));   // ft_tab entry: b times 02, 01, 01, 03
+#if defined(ONE_TABLE)
+        ft_tab[i] = w;
+#elif defined(FOUR_TABLES)
+        ft_tab[0][i] = w;
+        ft_tab[1][i] = upr(w,1);
+        ft_tab[2][i] = upr(w,2);
+        ft_tab[3][i] = upr(w,3);
+#endif
+        inv_s_box[i] = b = FFinv(inv_affine((unsigned char)i));   // inverse S box: inverse affine map, then field inverse
+
+        w = bytes2word(b, 0, 0, 0);          // il_tab entry: the inverse S box byte alone
+#if defined(ONE_LR_TABLE)
+        il_tab[i] = w;
+#elif defined(FOUR_LR_TABLES)
+        il_tab[0][i] = w;
+        il_tab[1][i] = upr(w,1);
+        il_tab[2][i] = upr(w,2);
+        il_tab[3][i] = upr(w,3);
+#endif
+        w = bytes2word(FFmul0e(b), FFmul09(b), FFmul0d(b), FFmul0b(b));   // it_tab entry: b times 0e, 09, 0d, 0b
+#if defined(ONE_TABLE)
+        it_tab[i] = w;
+#elif defined(FOUR_TABLES)
+        it_tab[0][i] = w;
+        it_tab[1][i] = upr(w,1);
+        it_tab[2][i] = upr(w,2);
+        it_tab[3][i] = upr(w,3);
+#endif
+#if defined(ONE_IM_TABLE)
+        im_tab[b] = w;                       // indexed by b, not i: w holds the multiples of b itself
+#elif defined(FOUR_IM_TABLES)
+        im_tab[0][b] = w;                    // indexed by b, not i: w holds the multiples of b itself
+        im_tab[1][b] = upr(w,1);
+        im_tab[2][b] = upr(w,2);
+        im_tab[3][b] = upr(w,3);
+#endif
+
+    }
+}
+
+#endif
+
+#define no_table(x,box,vf,rf,c) bytes2word( \
+    box[bval(vf(x,0,c),rf(0,c))], \
+    box[bval(vf(x,1,c),rf(1,c))], \
+    box[bval(vf(x,2,c),rf(2,c))], \
+    box[bval(vf(x,3,c),rf(3,c))])
+
+#define one_table(x,op,tab,vf,rf,c) \
+ (     tab[bval(vf(x,0,c),rf(0,c))] \
+  ^ op(tab[bval(vf(x,1,c),rf(1,c))],1) \
+  ^ op(tab[bval(vf(x,2,c),rf(2,c))],2) \
+  ^ op(tab[bval(vf(x,3,c),rf(3,c))],3))
+
+#define four_tables(x,tab,vf,rf,c) \
+ (  tab[0][bval(vf(x,0,c),rf(0,c))] \
+  ^ tab[1][bval(vf(x,1,c),rf(1,c))] \
+  ^ tab[2][bval(vf(x,2,c),rf(2,c))] \
+  ^ tab[3][bval(vf(x,3,c),rf(3,c))])
+
+#define vf1(x,r,c)  (x)
+#define rf1(r,c)    (r)
+#define rf2(r,c)    ((r-c)&3)
+
+#if defined(FOUR_LR_TABLES)
+#define ls_box(x,c)     four_tables(x,fl_tab,vf1,rf2,c)
+#elif defined(ONE_LR_TABLE)
+#define ls_box(x,c)     one_table(x,upr,fl_tab,vf1,rf2,c)
+#else
+#define ls_box(x,c)     no_table(x,s_box,vf1,rf2,c)
+#endif
+
+#if defined(FOUR_IM_TABLES)
+#define inv_mcol(x)     four_tables(x,im_tab,vf1,rf1,0)
+#elif defined(ONE_IM_TABLE)
+#define inv_mcol(x)     one_table(x,upr,im_tab,vf1,rf1,0)
+#else
+#define inv_mcol(x) \
+    (f9 = (x),f2 = FFmulX(f9), f4 = FFmulX(f2), f8 = FFmulX(f4), f9 ^= f8, \
+    f2 ^= f4 ^ f8 ^ upr(f2 ^ f9,3) ^ upr(f4 ^ f9,2) ^ upr(f9,1))
+#endif
+
+// Subroutine to set the block size (if variable), given in bytes (16, 24
+// or 32) or in bits (128, 192 or 256); any other value selects 16 bytes.
+
+#if defined(AES_BLOCK_SIZE)
+#define nc   (AES_BLOCK_SIZE / 4)
+#else
+#define nc   (cx->aes_Ncol)
+
+void aes_set_blk(aes_context *cx, int n_bytes)   // records the block size in cx as a column count
+{
+#if !defined(FIXED_TABLES)
+    if(!tab_gen) { gen_tabs(); tab_gen = 1; }    // build the run-time tables on first use
+#endif
+
+    switch(n_bytes) {
+    case 32:        /* bytes */
+    case 256:       /* bits */
+        nc = 8;     // nc expands to cx->aes_Ncol in this build
+        break;
+    case 24:        /* bytes */
+    case 192:       /* bits */
+        nc = 6;
+        break;
+    case 16:        /* bytes */
+    case 128:       /* bits */
+    default:        // any other value silently selects 16 bytes
+        nc = 4;
+        break;
+    }
+}
+
+#endif
+
+// Initialise the key schedule from the user supplied key. The key
+// length may be given in bytes (16, 24 or 32) or in bits (128, 192 or
+// 256), giving Nk values of 4, 6 and 8 respectively; any other length
+// is treated as 16 bytes.
+
+#define mx(t,f) (*t++ = inv_mcol(*f),f++)
+#define cp(t,f) *t++ = *f++
+
+#if   AES_BLOCK_SIZE == 16
+#define cpy(d,s)    cp(d,s); cp(d,s); cp(d,s); cp(d,s)
+#define mix(d,s)    mx(d,s); mx(d,s); mx(d,s); mx(d,s)
+#elif AES_BLOCK_SIZE == 24
+#define cpy(d,s)    cp(d,s); cp(d,s); cp(d,s); cp(d,s); \
+                    cp(d,s); cp(d,s)
+#define mix(d,s)    mx(d,s); mx(d,s); mx(d,s); mx(d,s); \
+                    mx(d,s); mx(d,s)
+#elif AES_BLOCK_SIZE == 32
+#define cpy(d,s)    cp(d,s); cp(d,s); cp(d,s); cp(d,s); \
+                    cp(d,s); cp(d,s); cp(d,s); cp(d,s)
+#define mix(d,s)    mx(d,s); mx(d,s); mx(d,s); mx(d,s); \
+                    mx(d,s); mx(d,s); mx(d,s); mx(d,s)
+#else
+
+#define cpy(d,s) \
+switch(nc) \
+{   case 8: cp(d,s); cp(d,s); \
+    case 6: cp(d,s); cp(d,s); \
+    case 4: cp(d,s); cp(d,s); \
+            cp(d,s); cp(d,s); \
+}
+
+#define mix(d,s) \
+switch(nc) \
+{   case 8: mx(d,s); mx(d,s); \
+    case 6: mx(d,s); mx(d,s); \
+    case 4: mx(d,s); mx(d,s); \
+            mx(d,s); mx(d,s); \
+}
+
+#endif
+
+void aes_set_key(aes_context *cx, const unsigned char in_key[], int n_bytes, const int f)
+{   u_int32_t    *kf, *kt, rci;     // kf: schedule cursor, kt: loop limit, rci: rcon_tab index
+
+#if !defined(FIXED_TABLES)
+    if(!tab_gen) { gen_tabs(); tab_gen = 1; }   // build the run-time tables on first use
+#endif
+
+    switch(n_bytes) {
+    case 32:                    /* bytes */
+    case 256:                   /* bits */
+        cx->aes_Nkey = 8;
+        break;
+    case 24:                    /* bytes */
+    case 192:                   /* bits */
+        cx->aes_Nkey = 6;
+        break;
+    case 16:                    /* bytes */
+    case 128:                   /* bits */
+    default:                    // any other length is treated as a 16 byte key
+        cx->aes_Nkey = 4;
+        break;
+    }
+
+    cx->aes_Nrnd = (cx->aes_Nkey > nc ? cx->aes_Nkey : nc) + 6;   // rounds = max(Nkey, nc) + 6
+
+    cx->aes_e_key[0] = word_in(in_key     );
+    cx->aes_e_key[1] = word_in(in_key +  4);
+    cx->aes_e_key[2] = word_in(in_key +  8);
+    cx->aes_e_key[3] = word_in(in_key + 12);
+
+    kf = cx->aes_e_key; 
+    kt = kf + nc * (cx->aes_Nrnd + 1) - cx->aes_Nkey;   // loops stop once kf + Nkey covers nc * (Nrnd + 1) words
+    rci = 0;
+
+    switch(cx->aes_Nkey)
+    {
+    case 4: do
+            {   kf[4] = kf[0] ^ ls_box(kf[3],3) ^ rcon_tab[rci++];   // ls_box(.,3): S box on the bytes taken in rotated order
+                kf[5] = kf[1] ^ kf[4];
+                kf[6] = kf[2] ^ kf[5];
+                kf[7] = kf[3] ^ kf[6];
+                kf += 4;
+            }
+            while(kf < kt);
+            break;
+
+    case 6: cx->aes_e_key[4] = word_in(in_key + 16);
+            cx->aes_e_key[5] = word_in(in_key + 20);
+            do
+            {   kf[ 6] = kf[0] ^ ls_box(kf[5],3) ^ rcon_tab[rci++];
+                kf[ 7] = kf[1] ^ kf[ 6];
+                kf[ 8] = kf[2] ^ kf[ 7];
+                kf[ 9] = kf[3] ^ kf[ 8];
+                kf[10] = kf[4] ^ kf[ 9];
+                kf[11] = kf[5] ^ kf[10];
+                kf += 6;
+            }
+            while(kf < kt);
+            break;
+
+    case 8: cx->aes_e_key[4] = word_in(in_key + 16);
+            cx->aes_e_key[5] = word_in(in_key + 20);
+            cx->aes_e_key[6] = word_in(in_key + 24);
+            cx->aes_e_key[7] = word_in(in_key + 28);
+            do
+            {   kf[ 8] = kf[0] ^ ls_box(kf[7],3) ^ rcon_tab[rci++];
+                kf[ 9] = kf[1] ^ kf[ 8];
+                kf[10] = kf[2] ^ kf[ 9];
+                kf[11] = kf[3] ^ kf[10];
+                kf[12] = kf[4] ^ ls_box(kf[11],0);   // 8 word keys only: extra S box pass, bytes not rotated
+                kf[13] = kf[5] ^ kf[12];
+                kf[14] = kf[6] ^ kf[13];
+                kf[15] = kf[7] ^ kf[14];
+                kf += 8;
+            }
+            while (kf < kt);
+            break;
+    }
+
+    if(!f)                      // f == 0: also derive the decryption schedule
+    {   u_int32_t    i;
+        
+        kt = cx->aes_d_key + nc * cx->aes_Nrnd;   // aes_d_key is filled from its last round slot downwards
+        kf = cx->aes_e_key;
+        
+        cpy(kt, kf); kt -= 2 * nc;   // first nc words copied as is; step back to the previous slot
+
+        for(i = 1; i < cx->aes_Nrnd; ++i)
+        { 
+#if defined(ONE_TABLE) || defined(FOUR_TABLES)
+#if !defined(ONE_IM_TABLE) && !defined(FOUR_IM_TABLES)
+            u_int32_t    f2, f4, f8, f9;   // scratch words used by the inv_mcol macro
+#endif
+            mix(kt, kf);        // middle round keys pass through inv_mcol
+#else
+            cpy(kt, kf);        // no round tables: keys are copied unchanged
+#endif
+            kt -= 2 * nc;
+        }
+        
+        cpy(kt, kf);            // kt is back at aes_d_key[0] for the final nc words
+    }
+}
+
+// y = output word, x = input word, r = row, c = column
+// for r = 0, 1, 2 and 3 = column accessed for row r
+
+#if defined(ARRAYS)
+#define s(x,c) x[c]
+#else
+#define s(x,c) x##c
+#endif
+
+// I am grateful to Frank Yellin for the following constructions
+// which, given the column (c) of the output state variable that
+// is being computed, return the input state variables which are
+// needed for each row (r) of the state
+
+// For the fixed block size options, compilers reduce these two 
+// expressions to fixed variable references. For variable block 
+// size code conditional clauses will sometimes be returned
+
+#define unused  77  // Sunset Strip
+
+#define fwd_var(x,r,c) \
+ ( r==0 ?			\
+    ( c==0 ? s(x,0) \
+    : c==1 ? s(x,1) \
+    : c==2 ? s(x,2) \
+    : c==3 ? s(x,3) \
+    : c==4 ? s(x,4) \
+    : c==5 ? s(x,5) \
+    : c==6 ? s(x,6) \
+    : s(x,7))		\
+ : r==1 ?			\
+    ( c==0 ? s(x,1) \
+    : c==1 ? s(x,2) \
+    : c==2 ? s(x,3) \
+    : c==3 ? nc==4 ? s(x,0) : s(x,4) \
+    : c==4 ? s(x,5) \
+    : c==5 ? nc==8 ? s(x,6) : s(x,0) \
+    : c==6 ? s(x,7) \
+    : s(x,0))		\
+ : r==2 ?			\
+    ( c==0 ? nc==8 ? s(x,3) : s(x,2) \
+    : c==1 ? nc==8 ? s(x,4) : s(x,3) \
+    : c==2 ? nc==4 ? s(x,0) : nc==8 ? s(x,5) : s(x,4) \
+    : c==3 ? nc==4 ? s(x,1) : nc==8 ? s(x,6) : s(x,5) \
+    : c==4 ? nc==8 ? s(x,7) : s(x,0) \
+    : c==5 ? nc==8 ? s(x,0) : s(x,1) \
+    : c==6 ? s(x,1) \
+    : s(x,2))		\
+ :					\
+    ( c==0 ? nc==8 ? s(x,4) : s(x,3) \
+    : c==1 ? nc==4 ? s(x,0) : nc==8 ? s(x,5) : s(x,4) \
+    : c==2 ? nc==4 ? s(x,1) : nc==8 ? s(x,6) : s(x,5) \
+    : c==3 ? nc==4 ? s(x,2) : nc==8 ? s(x,7) : s(x,0) \
+    : c==4 ? nc==8 ? s(x,0) : s(x,1) \
+    : c==5 ? nc==8 ? s(x,1) : s(x,2) \
+    : c==6 ? s(x,2) \
+    : s(x,3)))
+
+#define inv_var(x,r,c) \
+ ( r==0 ?			\
+    ( c==0 ? s(x,0) \
+    : c==1 ? s(x,1) \
+    : c==2 ? s(x,2) \
+    : c==3 ? s(x,3) \
+    : c==4 ? s(x,4) \
+    : c==5 ? s(x,5) \
+    : c==6 ? s(x,6) \
+    : s(x,7))		\
+ : r==1 ?			\
+    ( c==0 ? nc==4 ? s(x,3) : nc==8 ? s(x,7) : s(x,5) \
+    : c==1 ? s(x,0) \
+    : c==2 ? s(x,1) \
+    : c==3 ? s(x,2) \
+    : c==4 ? s(x,3) \
+    : c==5 ? s(x,4) \
+    : c==6 ? s(x,5) \
+    : s(x,6))		\
+ : r==2 ?			\
+    ( c==0 ? nc==4 ? s(x,2) : nc==8 ? s(x,5) : s(x,4) \
+    : c==1 ? nc==4 ? s(x,3) : nc==8 ? s(x,6) : s(x,5) \
+    : c==2 ? nc==8 ? s(x,7) : s(x,0) \
+    : c==3 ? nc==8 ? s(x,0) : s(x,1) \
+    : c==4 ? nc==8 ? s(x,1) : s(x,2) \
+    : c==5 ? nc==8 ? s(x,2) : s(x,3) \
+    : c==6 ? s(x,3) \
+    : s(x,4))		\
+ :					\
+    ( c==0 ? nc==4 ? s(x,1) : nc==8 ? s(x,4) : s(x,3) \
+    : c==1 ? nc==4 ? s(x,2) : nc==8 ? s(x,5) : s(x,4) \
+    : c==2 ? nc==4 ? s(x,3) : nc==8 ? s(x,6) : s(x,5) \
+    : c==3 ? nc==8 ? s(x,7) : s(x,0) \
+    : c==4 ? nc==8 ? s(x,0) : s(x,1) \
+    : c==5 ? nc==8 ? s(x,1) : s(x,2) \
+    : c==6 ? s(x,2) \
+    : s(x,3)))
+
+#define si(y,x,k,c) s(y,c) = word_in(x + 4 * c) ^ k[c]
+#define so(y,x,c)   word_out(y + 4 * c, s(x,c))
+
+#if defined(FOUR_TABLES)
+#define fwd_rnd(y,x,k,c)    s(y,c)= (k)[c] ^ four_tables(x,ft_tab,fwd_var,rf1,c)
+#define inv_rnd(y,x,k,c)    s(y,c)= (k)[c] ^ four_tables(x,it_tab,inv_var,rf1,c)
+#elif defined(ONE_TABLE)
+#define fwd_rnd(y,x,k,c)    s(y,c)= (k)[c] ^ one_table(x,upr,ft_tab,fwd_var,rf1,c)
+#define inv_rnd(y,x,k,c)    s(y,c)= (k)[c] ^ one_table(x,upr,it_tab,inv_var,rf1,c)
+#else
+#define fwd_rnd(y,x,k,c)    s(y,c) = fwd_mcol(no_table(x,s_box,fwd_var,rf1,c)) ^ (k)[c]
+#define inv_rnd(y,x,k,c)    s(y,c) = inv_mcol(no_table(x,inv_s_box,inv_var,rf1,c) ^ (k)[c])
+#endif
+
+#if defined(FOUR_LR_TABLES)
+#define fwd_lrnd(y,x,k,c)   s(y,c)= (k)[c] ^ four_tables(x,fl_tab,fwd_var,rf1,c)
+#define inv_lrnd(y,x,k,c)   s(y,c)= (k)[c] ^ four_tables(x,il_tab,inv_var,rf1,c)
+#elif defined(ONE_LR_TABLE)
+#define fwd_lrnd(y,x,k,c)   s(y,c)= (k)[c] ^ one_table(x,ups,fl_tab,fwd_var,rf1,c)
+#define inv_lrnd(y,x,k,c)   s(y,c)= (k)[c] ^ one_table(x,ups,il_tab,inv_var,rf1,c)
+#else
+#define fwd_lrnd(y,x,k,c)   s(y,c) = no_table(x,s_box,fwd_var,rf1,c) ^ (k)[c]
+#define inv_lrnd(y,x,k,c)   s(y,c) = no_table(x,inv_s_box,inv_var,rf1,c) ^ (k)[c]
+#endif
+
+#if AES_BLOCK_SIZE == 16
+
+#if defined(ARRAYS)
+#define locals(y,x)     x[4],y[4]
+#else
+#define locals(y,x)     x##0,x##1,x##2,x##3,y##0,y##1,y##2,y##3
+// the following defines prevent the compiler requiring the declaration
+// of generated but unused variables in the fwd_var and inv_var macros
+#define b04 unused
+#define b05 unused
+#define b06 unused
+#define b07 unused
+#define b14 unused
+#define b15 unused
+#define b16 unused
+#define b17 unused
+#endif
+#define l_copy(y, x)    s(y,0) = s(x,0); s(y,1) = s(x,1); \
+                        s(y,2) = s(x,2); s(y,3) = s(x,3);
+#define state_in(y,x,k) si(y,x,k,0); si(y,x,k,1); si(y,x,k,2); si(y,x,k,3)
+#define state_out(y,x)  so(y,x,0); so(y,x,1); so(y,x,2); so(y,x,3)
+#define round(rm,y,x,k) rm(y,x,k,0); rm(y,x,k,1); rm(y,x,k,2); rm(y,x,k,3)
+
+#elif AES_BLOCK_SIZE == 24
+
+#if defined(ARRAYS)
+#define locals(y,x)     x[6],y[6]
+#else
+#define locals(y,x)     x##0,x##1,x##2,x##3,x##4,x##5, \
+                        y##0,y##1,y##2,y##3,y##4,y##5
+#define b06 unused
+#define b07 unused
+#define b16 unused
+#define b17 unused
+#endif
+#define l_copy(y, x)    s(y,0) = s(x,0); s(y,1) = s(x,1); \
+                        s(y,2) = s(x,2); s(y,3) = s(x,3); \
+                        s(y,4) = s(x,4); s(y,5) = s(x,5);
+#define state_in(y,x,k) si(y,x,k,0); si(y,x,k,1); si(y,x,k,2); \
+                        si(y,x,k,3); si(y,x,k,4); si(y,x,k,5)
+#define state_out(y,x)  so(y,x,0); so(y,x,1); so(y,x,2); \
+                        so(y,x,3); so(y,x,4); so(y,x,5)
+#define round(rm,y,x,k) rm(y,x,k,0); rm(y,x,k,1); rm(y,x,k,2); \
+                        rm(y,x,k,3); rm(y,x,k,4); rm(y,x,k,5)
+#else
+
+#if defined(ARRAYS)
+#define locals(y,x)     x[8],y[8]
+#else
+#define locals(y,x)     x##0,x##1,x##2,x##3,x##4,x##5,x##6,x##7, \
+                        y##0,y##1,y##2,y##3,y##4,y##5,y##6,y##7
+#endif
+#define l_copy(y, x)    s(y,0) = s(x,0); s(y,1) = s(x,1); \
+                        s(y,2) = s(x,2); s(y,3) = s(x,3); \
+                        s(y,4) = s(x,4); s(y,5) = s(x,5); \
+                        s(y,6) = s(x,6); s(y,7) = s(x,7);
+
+#if AES_BLOCK_SIZE == 32
+
+#define state_in(y,x,k) si(y,x,k,0); si(y,x,k,1); si(y,x,k,2); si(y,x,k,3); \
+                        si(y,x,k,4); si(y,x,k,5); si(y,x,k,6); si(y,x,k,7)
+#define state_out(y,x)  so(y,x,0); so(y,x,1); so(y,x,2); so(y,x,3); \
+                        so(y,x,4); so(y,x,5); so(y,x,6); so(y,x,7)
+#define round(rm,y,x,k) rm(y,x,k,0); rm(y,x,k,1); rm(y,x,k,2); rm(y,x,k,3); \
+                        rm(y,x,k,4); rm(y,x,k,5); rm(y,x,k,6); rm(y,x,k,7)
+#else
+
+#define state_in(y,x,k) \
+switch(nc) \
+{   case 8: si(y,x,k,7); si(y,x,k,6); \
+    case 6: si(y,x,k,5); si(y,x,k,4); \
+    case 4: si(y,x,k,3); si(y,x,k,2); \
+            si(y,x,k,1); si(y,x,k,0); \
+}
+
+#define state_out(y,x) \
+switch(nc) \
+{   case 8: so(y,x,7); so(y,x,6); \
+    case 6: so(y,x,5); so(y,x,4); \
+    case 4: so(y,x,3); so(y,x,2); \
+            so(y,x,1); so(y,x,0); \
+}
+
+#if defined(FAST_VARIABLE)
+
+#define round(rm,y,x,k) \
+switch(nc) \
+{   case 8: rm(y,x,k,7); rm(y,x,k,6); \
+            rm(y,x,k,5); rm(y,x,k,4); \
+            rm(y,x,k,3); rm(y,x,k,2); \
+            rm(y,x,k,1); rm(y,x,k,0); \
+            break; \
+    case 6: rm(y,x,k,5); rm(y,x,k,4); \
+            rm(y,x,k,3); rm(y,x,k,2); \
+            rm(y,x,k,1); rm(y,x,k,0); \
+            break; \
+    case 4: rm(y,x,k,3); rm(y,x,k,2); \
+            rm(y,x,k,1); rm(y,x,k,0); \
+            break; \
+}
+#else
+
+#define round(rm,y,x,k) \
+switch(nc) \
+{   case 8: rm(y,x,k,7); rm(y,x,k,6); \
+    case 6: rm(y,x,k,5); rm(y,x,k,4); \
+    case 4: rm(y,x,k,3); rm(y,x,k,2); \
+            rm(y,x,k,1); rm(y,x,k,0); \
+}
+
+#endif
+
+#endif
+#endif
+
+void aes_encrypt(const aes_context *cx, const unsigned char in_blk[], unsigned char out_blk[])
+{   u_int32_t        locals(b0, b1);         // b0 and b1: two state buffers used alternately
+    const u_int32_t  *kp = cx->aes_e_key;
+
+#if !defined(ONE_TABLE) && !defined(FOUR_TABLES)
+    u_int32_t        f2;
+#endif
+
+    state_in(b0, in_blk, kp); kp += nc;      // load the block and xor in the first nc key words
+
+#if defined(UNROLL)
+
+    switch(cx->aes_Nrnd)        // 14 and 12 round cases fall through into the 10 round tail
+    {
+    case 14:    round(fwd_rnd,  b1, b0, kp         ); 
+                round(fwd_rnd,  b0, b1, kp + nc    ); kp += 2 * nc;
+    case 12:    round(fwd_rnd,  b1, b0, kp         ); 
+                round(fwd_rnd,  b0, b1, kp + nc    ); kp += 2 * nc;
+    case 10:    round(fwd_rnd,  b1, b0, kp         );             
+                round(fwd_rnd,  b0, b1, kp +     nc);
+                round(fwd_rnd,  b1, b0, kp + 2 * nc); 
+                round(fwd_rnd,  b0, b1, kp + 3 * nc);
+                round(fwd_rnd,  b1, b0, kp + 4 * nc); 
+                round(fwd_rnd,  b0, b1, kp + 5 * nc);
+                round(fwd_rnd,  b1, b0, kp + 6 * nc); 
+                round(fwd_rnd,  b0, b1, kp + 7 * nc);
+                round(fwd_rnd,  b1, b0, kp + 8 * nc);
+                round(fwd_lrnd, b0, b1, kp + 9 * nc);   // last round uses fwd_lrnd (fl_tab or s_box only)
+    }
+
+#elif defined(PARTIAL_UNROLL)
+    {   u_int32_t    rnd;
+
+        for(rnd = 0; rnd < (cx->aes_Nrnd >> 1) - 1; ++rnd)   // Nrnd / 2 - 1 round pairs
+        {
+            round(fwd_rnd, b1, b0, kp); 
+            round(fwd_rnd, b0, b1, kp + nc); kp += 2 * nc;
+        }
+
+        round(fwd_rnd,  b1, b0, kp);         // final pair: one normal round, then the last round
+        round(fwd_lrnd, b0, b1, kp + nc);
+    }
+#else
+    {   u_int32_t    rnd;
+
+        for(rnd = 0; rnd < cx->aes_Nrnd - 1; ++rnd)
+        {
+            round(fwd_rnd, b1, b0, kp); 
+            l_copy(b0, b1); kp += nc;        // copy back so every pass reads b0
+        }
+
+        round(fwd_lrnd, b0, b1, kp);         // b1 still equals b0 after the last l_copy
+    }
+#endif
+
+    state_out(out_blk, b0);     // write the final state to out_blk
+}
+
+void aes_decrypt(const aes_context *cx, const unsigned char in_blk[], unsigned char out_blk[])
+{   u_int32_t        locals(b0, b1);         // b0 and b1: two state buffers used alternately
+    const u_int32_t  *kp = cx->aes_d_key;    // filled by aes_set_key only when its f argument is 0
+
+#if !defined(ONE_TABLE) && !defined(FOUR_TABLES)
+    u_int32_t        f2, f4, f8, f9; 
+#endif
+
+    state_in(b0, in_blk, kp); kp += nc;      // load the block and xor in the first nc key words
+
+#if defined(UNROLL)
+
+    switch(cx->aes_Nrnd)        // 14 and 12 round cases fall through into the 10 round tail
+    {
+    case 14:    round(inv_rnd,  b1, b0, kp         );
+                round(inv_rnd,  b0, b1, kp + nc    ); kp += 2 * nc;
+    case 12:    round(inv_rnd,  b1, b0, kp         );
+                round(inv_rnd,  b0, b1, kp + nc    ); kp += 2 * nc;
+    case 10:    round(inv_rnd,  b1, b0, kp         );             
+                round(inv_rnd,  b0, b1, kp +     nc);
+                round(inv_rnd,  b1, b0, kp + 2 * nc); 
+                round(inv_rnd,  b0, b1, kp + 3 * nc);
+                round(inv_rnd,  b1, b0, kp + 4 * nc); 
+                round(inv_rnd,  b0, b1, kp + 5 * nc);
+                round(inv_rnd,  b1, b0, kp + 6 * nc); 
+                round(inv_rnd,  b0, b1, kp + 7 * nc);
+                round(inv_rnd,  b1, b0, kp + 8 * nc);
+                round(inv_lrnd, b0, b1, kp + 9 * nc);   // last round uses inv_lrnd (il_tab or inv_s_box only)
+    }
+
+#elif defined(PARTIAL_UNROLL)
+    {   u_int32_t    rnd;
+
+        for(rnd = 0; rnd < (cx->aes_Nrnd >> 1) - 1; ++rnd)   // Nrnd / 2 - 1 round pairs
+        {
+            round(inv_rnd, b1, b0, kp); 
+            round(inv_rnd, b0, b1, kp + nc); kp += 2 * nc;
+        }
+
+        round(inv_rnd,  b1, b0, kp);         // final pair: one normal round, then the last round
+        round(inv_lrnd, b0, b1, kp + nc);
+    }
+#else
+    {   u_int32_t    rnd;
+
+        for(rnd = 0; rnd < cx->aes_Nrnd - 1; ++rnd)
+        {
+            round(inv_rnd, b1, b0, kp); 
+            l_copy(b0, b1); kp += nc;        // copy back so every pass reads b0
+        }
+
+        round(inv_lrnd, b0, b1, kp);         // b1 still equals b0 after the last l_copy
+    }
+#endif
+
+    state_out(out_blk, b0);     // write the final state to out_blk
+}
diff -urN linux-2.4.28/drivers/misc/aes.h linux-2.4.28-loop-AES-v3.0b/drivers/misc/aes.h
--- linux-2.4.28/drivers/misc/aes.h	Thu Jan  1 01:00:00 1970
+++ linux-2.4.28-loop-AES-v3.0b/drivers/misc/aes.h	Sun Feb  6 18:45:39 2005
@@ -0,0 +1,113 @@
+// I retain copyright in this code but I encourage its free use provided
+// that I don't carry any responsibility for the results. I am especially 
+// happy to see it used in free and open source software. If you do use 
+// it I would appreciate an acknowledgement of its origin in the code or
+// the product that results and I would also appreciate knowing a little
+// about the use to which it is being put. I am grateful to Frank Yellin
+// for some ideas that are used in this implementation.
+//
+// Dr B. R. Gladman <brg@gladman.uk.net> 6th April 2001.
+//
+// This is an implementation of the AES encryption algorithm (Rijndael)
+// designed by Joan Daemen and Vincent Rijmen. This version is designed
+// to provide both fixed and dynamic block and key lengths and can also 
+// run with either big or little endian internal byte order (see aes.h). 
+// It inputs block and key lengths in bytes with the legal values being 
+// 16, 24 and 32.
+
+/*
+ * Modified by Jari Ruusu,  May 1 2001
+ *  - Fixed some compile warnings, code was ok but gcc warned anyway.
+ *  - Changed basic types: byte -> unsigned char, word -> u_int32_t
+ *  - Major name space cleanup: Names visible to outside now begin
+ *    with "aes_" or "AES_". A lot of stuff moved from aes.h to aes.c
+ *  - Removed C++ and DLL support as part of name space cleanup.
+ *  - Eliminated unnecessary recomputation of tables. (actual bug fix)
+ *  - Merged precomputed constant tables to aes.c file.
+ *  - Removed data alignment restrictions for portability reasons.
+ *  - Made block and key lengths accept bit count (128/192/256)
+ *    as well as byte count (16/24/32).
+ *  - Removed all error checks. This change also eliminated the need
+ *    to preinitialize the context struct to zero.
+ *  - Removed some totally unused constants.
+ */
+
+#ifndef _AES_H
+#define _AES_H
+
+#include <linux/types.h>
+#include <linux/linkage.h>
+#include <linux/config.h>
+#include <linux/module.h>
+
+// CONFIGURATION OPTIONS (see also aes.c)
+//
+// Define AES_BLOCK_SIZE to set the cipher block size (16, 24 or 32) or
+// leave this undefined for dynamically variable block size (this will
+// result in much slower code).
+// IMPORTANT NOTE: AES_BLOCK_SIZE is in BYTES (16, 24, 32 or undefined). If
+// left undefined a slower version providing variable block length is compiled
+
+#define AES_BLOCK_SIZE  16
+
+// The number of key schedule words for different block and key lengths
+// allowing for method of computation which requires the length to be a
+// multiple of the key length
+//
+// Nk =       4   6   8
+//        -------------
+// Nb = 4 |  60  60  64
+//      6 |  96  90  96
+//      8 | 120 120 120
+
+#if !defined(AES_BLOCK_SIZE) || (AES_BLOCK_SIZE == 32)
+#define AES_KS_LENGTH   120
+#define AES_RC_LENGTH    29
+#else   // parenthesised so the values hold inside any larger expression
+#define AES_KS_LENGTH   (4 * AES_BLOCK_SIZE)             // 64 words for 16 byte blocks, 96 for 24
+#define AES_RC_LENGTH   ((9 * AES_BLOCK_SIZE) / 8 - 8)   // 10 entries for 16 byte blocks, 19 for 24
+#endif
+
+typedef struct
+{
+    u_int32_t    aes_Nkey;      // the number of words in the key input block
+    u_int32_t    aes_Nrnd;      // the number of cipher rounds
+    u_int32_t    aes_e_key[AES_KS_LENGTH];   // the encryption key schedule
+    u_int32_t    aes_d_key[AES_KS_LENGTH];   // the decryption key schedule
+#if !defined(AES_BLOCK_SIZE)
+    u_int32_t    aes_Ncol;      // the number of columns in the cipher state
+#endif
+} aes_context;
+
+// avoid global name conflict with mainline kernel
+#define aes_set_key _aes_set_key
+#define aes_encrypt _aes_encrypt
+#define aes_decrypt _aes_decrypt
+
+// THE CIPHER INTERFACE
+
+#if !defined(AES_BLOCK_SIZE)
+extern void aes_set_blk(aes_context *, const int);
+#endif
+
+#if defined(CONFIG_X86) || defined(CONFIG_X86_64)
+ asmlinkage
+#endif
+extern void aes_set_key(aes_context *, const unsigned char [], const int, const int);
+
+#if defined(CONFIG_X86) || defined(CONFIG_X86_64)
+ asmlinkage
+#endif
+extern void aes_encrypt(const aes_context *, const unsigned char [], unsigned char []);
+
+#if defined(CONFIG_X86) || defined(CONFIG_X86_64)
+ asmlinkage
+#endif
+extern void aes_decrypt(const aes_context *, const unsigned char [], unsigned char []);
+
+// The block length inputs to aes_set_blk and aes_set_key are in numbers
+// of bytes or bits.  The calls to subroutines must be made in the above
+// order but multiple calls can be made without repeating earlier calls
+// if their parameters have not changed.
+
+#endif  // _AES_H
diff -urN linux-2.4.28/drivers/misc/crypto-ksym.c linux-2.4.28-loop-AES-v3.0b/drivers/misc/crypto-ksym.c
--- linux-2.4.28/drivers/misc/crypto-ksym.c	Thu Jan  1 01:00:00 1970
+++ linux-2.4.28-loop-AES-v3.0b/drivers/misc/crypto-ksym.c	Sun Feb  6 18:45:39 2005
@@ -0,0 +1,7 @@
+#include <linux/module.h>
+#include "aes.h"
+#include "md5.h"
+EXPORT_SYMBOL_NOVERS(aes_set_key);
+EXPORT_SYMBOL_NOVERS(aes_encrypt);
+EXPORT_SYMBOL_NOVERS(aes_decrypt);
+EXPORT_SYMBOL_NOVERS(md5_transform_CPUbyteorder);
diff -urN linux-2.4.28/drivers/misc/md5-amd64.S linux-2.4.28-loop-AES-v3.0b/drivers/misc/md5-amd64.S
--- linux-2.4.28/drivers/misc/md5-amd64.S	Thu Jan  1 01:00:00 1970
+++ linux-2.4.28-loop-AES-v3.0b/drivers/misc/md5-amd64.S	Sun Feb  6 18:45:39 2005
@@ -0,0 +1,200 @@
+//
+//  md5-amd64.S
+//
+//  Written by Jari Ruusu, October 1 2003
+//
+//  Copyright 2003 by Jari Ruusu.
+//  Redistribution of this file is permitted under the GNU Public License.
+//
+
+//  Modified by Jari Ruusu,  June 12 2004
+//   - Converted 32 bit x86 code to 64 bit AMD64 code
+
+// A MD5 transform implementation for AMD64 compatible processors.
+// This code does not preserve the rax, rcx, rdx, rsi, rdi or r8-r11
+// registers or the arithmetic status flags. However, the rbx, rbp and
+// r12-r15 registers are preserved across calls.
+
+// void md5_transform_CPUbyteorder(u_int32_t *hash, u_int32_t const *in)
+
+#if defined(USE_UNDERLINE)
+# define md5_transform_CPUbyteorder _md5_transform_CPUbyteorder
+#endif
+#if !defined(ALIGN64BYTES)
+# define ALIGN64BYTES 64
+#endif
+
+	.file	"md5-amd64.S"
+	.globl	md5_transform_CPUbyteorder
+
+// rdi = pointer to hash[4] array which is read and written
+// rsi = pointer to in[16] array which is read only
+
+	.text
+	.align	ALIGN64BYTES
+md5_transform_CPUbyteorder:
+	movl	12(%rdi),%eax		// eax = hash[3]
+	movl	8(%rdi),%ecx		// ecx = hash[2]
+	movl	(%rdi),%r8d		// r8d = hash[0]
+	movl	4(%rdi),%r9d		// r9d = hash[1]
+	movl	(%rsi),%r10d		// r10d = in[0], the input word for the first step
+	prefetcht0 60(%rsi)
+	movl	%eax,%edx		// edx = hash[3]; eax is scratch from here on
+	xorl	%ecx,%eax		// eax = hash[2] ^ hash[3], as the round 1 macro expects
+// Round 1 step: p1w = rol(p1w + p4c + r10d + (p3z ^ (p2x & eax)), p5s) + p2x; then r10d = in[p6Nin] and eax = p7Nz ^ p8Ny.
+#define REPEAT1(p1w,p2x,p3z,p4c,p5s,p6Nin,p7Nz,p8Ny) \
+	addl	$p4c,p1w		;\
+	andl	p2x,%eax		;\
+	addl	%r10d,p1w		;\
+	xorl	p3z,%eax		;\
+	movl	p6Nin*4(%rsi),%r10d	;\
+	addl	%eax,p1w		;\
+	movl	p7Nz,%eax		;\
+	roll	$p5s,p1w		;\
+	xorl	p8Ny,%eax		;\
+	addl	p2x,p1w
+// Round 1, steps 1-15 (constants match the MD5_F1 steps in md5.c).
+	REPEAT1(%r8d,%r9d,%edx,0xd76aa478, 7, 1,%ecx,%r9d)
+	REPEAT1(%edx,%r8d,%ecx,0xe8c7b756,12, 2,%r9d,%r8d)
+	REPEAT1(%ecx,%edx,%r9d,0x242070db,17, 3,%r8d,%edx)
+	REPEAT1(%r9d,%ecx,%r8d,0xc1bdceee,22, 4,%edx,%ecx)
+	REPEAT1(%r8d,%r9d,%edx,0xf57c0faf, 7, 5,%ecx,%r9d)
+	REPEAT1(%edx,%r8d,%ecx,0x4787c62a,12, 6,%r9d,%r8d)
+	REPEAT1(%ecx,%edx,%r9d,0xa8304613,17, 7,%r8d,%edx)
+	REPEAT1(%r9d,%ecx,%r8d,0xfd469501,22, 8,%edx,%ecx)
+	REPEAT1(%r8d,%r9d,%edx,0x698098d8, 7, 9,%ecx,%r9d)
+	REPEAT1(%edx,%r8d,%ecx,0x8b44f7af,12,10,%r9d,%r8d)
+	REPEAT1(%ecx,%edx,%r9d,0xffff5bb1,17,11,%r8d,%edx)
+	REPEAT1(%r9d,%ecx,%r8d,0x895cd7be,22,12,%edx,%ecx)
+	REPEAT1(%r8d,%r9d,%edx,0x6b901122, 7,13,%ecx,%r9d)
+	REPEAT1(%edx,%r8d,%ecx,0xfd987193,12,14,%r9d,%r8d)
+	REPEAT1(%ecx,%edx,%r9d,0xa679438e,17,15,%r8d,%edx)
+// Round 1, step 16, written out by hand: it preloads in[1] and leaves eax = ecx for round 2.
+	addl	$0x49b40821,%r9d
+	andl	%ecx,%eax
+	addl	%r10d,%r9d
+	xorl	%r8d,%eax
+	movl	1*4(%rsi),%r10d
+	addl	%eax,%r9d
+	movl	%ecx,%eax
+	roll	$22,%r9d
+	addl	%ecx,%r9d
+// Round 2 step (eax = p3y on entry): p1w = rol(p1w + p5c + r10d + (p3y ^ (p4z & (p2x ^ p3y))), p6s) + p2x; then r10d = in[p7Nin] and eax = p8Ny.
+#define REPEAT2(p1w,p2x,p3y,p4z,p5c,p6s,p7Nin,p8Ny) \
+	xorl	p2x,%eax		;\
+	addl	$p5c,p1w		;\
+	andl	p4z,%eax		;\
+	addl	%r10d,p1w		;\
+	xorl	p3y,%eax		;\
+	movl	p7Nin*4(%rsi),%r10d	;\
+	addl	%eax,p1w		;\
+	movl	p8Ny,%eax		;\
+	roll	$p6s,p1w		;\
+	addl	p2x,p1w
+// Round 2, steps 1-15 (constants match the MD5_F2 steps in md5.c).
+	REPEAT2(%r8d,%r9d,%ecx,%edx,0xf61e2562, 5, 6,%r9d)
+	REPEAT2(%edx,%r8d,%r9d,%ecx,0xc040b340, 9,11,%r8d)
+	REPEAT2(%ecx,%edx,%r8d,%r9d,0x265e5a51,14, 0,%edx)
+	REPEAT2(%r9d,%ecx,%edx,%r8d,0xe9b6c7aa,20, 5,%ecx)
+	REPEAT2(%r8d,%r9d,%ecx,%edx,0xd62f105d, 5,10,%r9d)
+	REPEAT2(%edx,%r8d,%r9d,%ecx,0x02441453, 9,15,%r8d)
+	REPEAT2(%ecx,%edx,%r8d,%r9d,0xd8a1e681,14, 4,%edx)
+	REPEAT2(%r9d,%ecx,%edx,%r8d,0xe7d3fbc8,20, 9,%ecx)
+	REPEAT2(%r8d,%r9d,%ecx,%edx,0x21e1cde6, 5,14,%r9d)
+	REPEAT2(%edx,%r8d,%r9d,%ecx,0xc33707d6, 9, 3,%r8d)
+	REPEAT2(%ecx,%edx,%r8d,%r9d,0xf4d50d87,14, 8,%edx)
+	REPEAT2(%r9d,%ecx,%edx,%r8d,0x455a14ed,20,13,%ecx)
+	REPEAT2(%r8d,%r9d,%ecx,%edx,0xa9e3e905, 5, 2,%r9d)
+	REPEAT2(%edx,%r8d,%r9d,%ecx,0xfcefa3f8, 9, 7,%r8d)
+	REPEAT2(%ecx,%edx,%r8d,%r9d,0x676f02d9,14,12,%edx)
+// Round 2, step 16, written out by hand: it preloads in[5] and leaves eax = ecx ^ edx for round 3.
+	xorl	%ecx,%eax
+	addl	$0x8d2a4c8a,%r9d
+	andl	%r8d,%eax
+	addl	%r10d,%r9d
+	xorl	%edx,%eax
+	movl	5*4(%rsi),%r10d
+	addl	%eax,%r9d
+	movl	%ecx,%eax
+	roll	$20,%r9d
+	xorl	%edx,%eax
+	addl	%ecx,%r9d
+// Round 3 step (eax = y ^ z on entry): p1w = rol(p1w + p3c + r10d + (p2x ^ eax), p4s) + p2x; then r10d = in[p5Nin] and eax = p6Ny ^ p7Nz.
+#define REPEAT3(p1w,p2x,p3c,p4s,p5Nin,p6Ny,p7Nz) \
+	addl	$p3c,p1w		;\
+	xorl	p2x,%eax		;\
+	addl	%r10d,p1w		;\
+	movl	p5Nin*4(%rsi),%r10d	;\
+	addl	%eax,p1w		;\
+	movl	p6Ny,%eax		;\
+	roll	$p4s,p1w		;\
+	xorl	p7Nz,%eax		;\
+	addl	p2x,p1w
+// Round 3, steps 1-15 (constants match the MD5_F3 steps in md5.c).
+	REPEAT3(%r8d,%r9d,0xfffa3942, 4, 8,%r9d,%ecx)
+	REPEAT3(%edx,%r8d,0x8771f681,11,11,%r8d,%r9d)
+	REPEAT3(%ecx,%edx,0x6d9d6122,16,14,%edx,%r8d)
+	REPEAT3(%r9d,%ecx,0xfde5380c,23, 1,%ecx,%edx)
+	REPEAT3(%r8d,%r9d,0xa4beea44, 4, 4,%r9d,%ecx)
+	REPEAT3(%edx,%r8d,0x4bdecfa9,11, 7,%r8d,%r9d)
+	REPEAT3(%ecx,%edx,0xf6bb4b60,16,10,%edx,%r8d)
+	REPEAT3(%r9d,%ecx,0xbebfbc70,23,13,%ecx,%edx)
+	REPEAT3(%r8d,%r9d,0x289b7ec6, 4, 0,%r9d,%ecx)
+	REPEAT3(%edx,%r8d,0xeaa127fa,11, 3,%r8d,%r9d)
+	REPEAT3(%ecx,%edx,0xd4ef3085,16, 6,%edx,%r8d)
+	REPEAT3(%r9d,%ecx,0x04881d05,23, 9,%ecx,%edx)
+	REPEAT3(%r8d,%r9d,0xd9d4d039, 4,12,%r9d,%ecx)
+	REPEAT3(%edx,%r8d,0xe6db99e5,11,15,%r8d,%r9d)
+	REPEAT3(%ecx,%edx,0x1fa27cf8,16, 2,%edx,%r8d)
+// Round 3, step 16, written out by hand: it preloads in[0] and leaves eax = ~edx for round 4.
+	addl	$0xc4ac5665,%r9d
+	xorl	%ecx,%eax
+	addl	%r10d,%r9d
+	movl	(%rsi),%r10d
+	addl	%eax,%r9d
+	movl	%edx,%eax
+	roll	$23,%r9d
+	notl	%eax
+	addl	%ecx,%r9d
+// Round 4 step (eax = ~z on entry): p1w = rol(p1w + p4c + r10d + (p3y ^ (p2x | eax)), p5s) + p2x; then r10d = in[p6Nin] and eax = ~p7Nz.
+#define REPEAT4(p1w,p2x,p3y,p4c,p5s,p6Nin,p7Nz) \
+	addl	$p4c,p1w		;\
+	orl	p2x,%eax		;\
+	addl	%r10d,p1w		;\
+	xorl	p3y,%eax		;\
+	movl	p6Nin*4(%rsi),%r10d	;\
+	addl	%eax,p1w		;\
+	movl	p7Nz,%eax		;\
+	roll	$p5s,p1w		;\
+	notl	%eax			;\
+	addl	p2x,p1w
+// Round 4, steps 1-15 (constants match the MD5_F4 steps in md5.c).
+	REPEAT4(%r8d,%r9d,%ecx,0xf4292244, 6, 7,%ecx)
+	REPEAT4(%edx,%r8d,%r9d,0x432aff97,10,14,%r9d)
+	REPEAT4(%ecx,%edx,%r8d,0xab9423a7,15, 5,%r8d)
+	REPEAT4(%r9d,%ecx,%edx,0xfc93a039,21,12,%edx)
+	REPEAT4(%r8d,%r9d,%ecx,0x655b59c3, 6, 3,%ecx)
+	REPEAT4(%edx,%r8d,%r9d,0x8f0ccc92,10,10,%r9d)
+	REPEAT4(%ecx,%edx,%r8d,0xffeff47d,15, 1,%r8d)
+	REPEAT4(%r9d,%ecx,%edx,0x85845dd1,21, 8,%edx)
+	REPEAT4(%r8d,%r9d,%ecx,0x6fa87e4f, 6,15,%ecx)
+	REPEAT4(%edx,%r8d,%r9d,0xfe2ce6e0,10, 6,%r9d)
+	REPEAT4(%ecx,%edx,%r8d,0xa3014314,15,13,%r8d)
+	REPEAT4(%r9d,%ecx,%edx,0x4e0811a1,21, 4,%edx)
+	REPEAT4(%r8d,%r9d,%ecx,0xf7537e82, 6,11,%ecx)
+	REPEAT4(%edx,%r8d,%r9d,0xbd3af235,10, 2,%r9d)
+	REPEAT4(%ecx,%edx,%r8d,0x2ad7d2bb,15, 9,%r8d)
+// Round 4, step 16, written out by hand: no further input word to preload.
+	addl	$0xeb86d391,%r9d
+	orl	%ecx,%eax
+	addl	%r10d,%r9d
+	xorl	%edx,%eax
+	addl	%eax,%r9d
+	roll	$21,%r9d
+	addl	%ecx,%r9d
+// Add the working values (r8d, r9d, ecx, edx) back into hash[0..3].
+	addl	%r8d,(%rdi)
+	addl	%r9d,4(%rdi)
+	addl	%ecx,8(%rdi)
+	addl	%edx,12(%rdi)
+	ret
diff -urN linux-2.4.28/drivers/misc/md5-x86.S linux-2.4.28-loop-AES-v3.0b/drivers/misc/md5-x86.S
--- linux-2.4.28/drivers/misc/md5-x86.S	Thu Jan  1 01:00:00 1970
+++ linux-2.4.28-loop-AES-v3.0b/drivers/misc/md5-x86.S	Sun Feb  6 18:45:39 2005
@@ -0,0 +1,207 @@
+//
+//  md5-x86.S
+//
+//  Written by Jari Ruusu, October 1 2003
+//
+//  Copyright 2003 by Jari Ruusu.
+//  Redistribution of this file is permitted under the GNU Public License.
+//
+
+// A MD5 transform implementation for x86 compatible processors. This
+// version uses i386 instruction set but instruction scheduling is optimized
+// for Pentium-2. This code does not preserve the eax, ecx or edx registers
+// or the arithmetic status flags. However, the ebx, esi, edi, and ebp
+// registers are preserved across calls.
+
+// void md5_transform_CPUbyteorder(u_int32_t *hash, u_int32_t const *in)
+
+#if defined(USE_UNDERLINE)
+# define md5_transform_CPUbyteorder _md5_transform_CPUbyteorder
+#endif
+#if !defined(ALIGN32BYTES)
+# define ALIGN32BYTES 32
+#endif
+
+	.file	"md5-x86.S"
+	.globl	md5_transform_CPUbyteorder
+	.text
+	.align	ALIGN32BYTES
+
+md5_transform_CPUbyteorder:
+	push	%ebp
+	mov	4+4(%esp),%eax		// pointer to 'hash' input
+	mov	8+4(%esp),%ebp		// pointer to 'in' array
+	push	%ebx
+	push	%esi
+	push	%edi
+// Load hash[0..3] into esi, edi, ecx, edx (hash[3] goes through eax) and in[0] into ebx; eax ends up as ecx ^ edx.
+	mov	(%eax),%esi
+	mov	4(%eax),%edi
+	mov	8(%eax),%ecx
+	mov	12(%eax),%eax
+	mov	(%ebp),%ebx
+	mov	%eax,%edx
+	xor	%ecx,%eax
+// Round 1 step: p1w = rol(p1w + p4c + ebx + (p3z ^ (p2x & eax)), p5s) + p2x; then ebx = in[p6Nin] and eax = p7Nz ^ p8Ny.
+#define REPEAT1(p1w,p2x,p3z,p4c,p5s,p6Nin,p7Nz,p8Ny) \
+	add	$p4c,p1w		;\
+	and	p2x,%eax		;\
+	add	%ebx,p1w		;\
+	xor	p3z,%eax		;\
+	mov	p6Nin*4(%ebp),%ebx	;\
+	add	%eax,p1w		;\
+	mov	p7Nz,%eax		;\
+	rol	$p5s,p1w		;\
+	xor	p8Ny,%eax		;\
+	add	p2x,p1w
+// Round 1, steps 1-15 (constants match the MD5_F1 steps in md5.c).
+	REPEAT1(%esi,%edi,%edx,0xd76aa478, 7, 1,%ecx,%edi)
+	REPEAT1(%edx,%esi,%ecx,0xe8c7b756,12, 2,%edi,%esi)
+	REPEAT1(%ecx,%edx,%edi,0x242070db,17, 3,%esi,%edx)
+	REPEAT1(%edi,%ecx,%esi,0xc1bdceee,22, 4,%edx,%ecx)
+	REPEAT1(%esi,%edi,%edx,0xf57c0faf, 7, 5,%ecx,%edi)
+	REPEAT1(%edx,%esi,%ecx,0x4787c62a,12, 6,%edi,%esi)
+	REPEAT1(%ecx,%edx,%edi,0xa8304613,17, 7,%esi,%edx)
+	REPEAT1(%edi,%ecx,%esi,0xfd469501,22, 8,%edx,%ecx)
+	REPEAT1(%esi,%edi,%edx,0x698098d8, 7, 9,%ecx,%edi)
+	REPEAT1(%edx,%esi,%ecx,0x8b44f7af,12,10,%edi,%esi)
+	REPEAT1(%ecx,%edx,%edi,0xffff5bb1,17,11,%esi,%edx)
+	REPEAT1(%edi,%ecx,%esi,0x895cd7be,22,12,%edx,%ecx)
+	REPEAT1(%esi,%edi,%edx,0x6b901122, 7,13,%ecx,%edi)
+	REPEAT1(%edx,%esi,%ecx,0xfd987193,12,14,%edi,%esi)
+	REPEAT1(%ecx,%edx,%edi,0xa679438e,17,15,%esi,%edx)
+// Round 1, step 16, written out by hand: it preloads in[1] and leaves eax = ecx for round 2.
+	add	$0x49b40821,%edi
+	and	%ecx,%eax
+	add	%ebx,%edi
+	xor	%esi,%eax
+	mov	1*4(%ebp),%ebx
+	add	%eax,%edi
+	mov	%ecx,%eax
+	rol	$22,%edi
+	add	%ecx,%edi
+// Round 2 step (eax = p3y on entry): p1w = rol(p1w + p5c + ebx + (p3y ^ (p4z & (p2x ^ p3y))), p6s) + p2x; then ebx = in[p7Nin] and eax = p8Ny.
+#define REPEAT2(p1w,p2x,p3y,p4z,p5c,p6s,p7Nin,p8Ny) \
+	xor	p2x,%eax		;\
+	add	$p5c,p1w		;\
+	and	p4z,%eax		;\
+	add	%ebx,p1w		;\
+	xor	p3y,%eax		;\
+	mov	p7Nin*4(%ebp),%ebx	;\
+	add	%eax,p1w		;\
+	mov	p8Ny,%eax		;\
+	rol	$p6s,p1w		;\
+	add	p2x,p1w
+// Round 2, steps 1-15 (constants match the MD5_F2 steps in md5.c).
+	REPEAT2(%esi,%edi,%ecx,%edx,0xf61e2562, 5, 6,%edi)
+	REPEAT2(%edx,%esi,%edi,%ecx,0xc040b340, 9,11,%esi)
+	REPEAT2(%ecx,%edx,%esi,%edi,0x265e5a51,14, 0,%edx)
+	REPEAT2(%edi,%ecx,%edx,%esi,0xe9b6c7aa,20, 5,%ecx)
+	REPEAT2(%esi,%edi,%ecx,%edx,0xd62f105d, 5,10,%edi)
+	REPEAT2(%edx,%esi,%edi,%ecx,0x02441453, 9,15,%esi)
+	REPEAT2(%ecx,%edx,%esi,%edi,0xd8a1e681,14, 4,%edx)
+	REPEAT2(%edi,%ecx,%edx,%esi,0xe7d3fbc8,20, 9,%ecx)
+	REPEAT2(%esi,%edi,%ecx,%edx,0x21e1cde6, 5,14,%edi)
+	REPEAT2(%edx,%esi,%edi,%ecx,0xc33707d6, 9, 3,%esi)
+	REPEAT2(%ecx,%edx,%esi,%edi,0xf4d50d87,14, 8,%edx)
+	REPEAT2(%edi,%ecx,%edx,%esi,0x455a14ed,20,13,%ecx)
+	REPEAT2(%esi,%edi,%ecx,%edx,0xa9e3e905, 5, 2,%edi)
+	REPEAT2(%edx,%esi,%edi,%ecx,0xfcefa3f8, 9, 7,%esi)
+	REPEAT2(%ecx,%edx,%esi,%edi,0x676f02d9,14,12,%edx)
+// Round 2, step 16, written out by hand: it preloads in[5] and leaves eax = ecx ^ edx for round 3.
+	xor	%ecx,%eax
+	add	$0x8d2a4c8a,%edi
+	and	%esi,%eax
+	add	%ebx,%edi
+	xor	%edx,%eax
+	mov	5*4(%ebp),%ebx
+	add	%eax,%edi
+	mov	%ecx,%eax
+	rol	$20,%edi
+	xor	%edx,%eax
+	add	%ecx,%edi
+// Round 3 step (eax = y ^ z on entry): p1w = rol(p1w + p3c + ebx + (p2x ^ eax), p4s) + p2x; then ebx = in[p5Nin] and eax = p6Ny ^ p7Nz.
+#define REPEAT3(p1w,p2x,p3c,p4s,p5Nin,p6Ny,p7Nz) \
+	add	$p3c,p1w		;\
+	xor	p2x,%eax		;\
+	add	%ebx,p1w		;\
+	mov	p5Nin*4(%ebp),%ebx	;\
+	add	%eax,p1w		;\
+	mov	p6Ny,%eax		;\
+	rol	$p4s,p1w		;\
+	xor	p7Nz,%eax		;\
+	add	p2x,p1w
+// Round 3, steps 1-15 (constants match the MD5_F3 steps in md5.c).
+	REPEAT3(%esi,%edi,0xfffa3942, 4, 8,%edi,%ecx)
+	REPEAT3(%edx,%esi,0x8771f681,11,11,%esi,%edi)
+	REPEAT3(%ecx,%edx,0x6d9d6122,16,14,%edx,%esi)
+	REPEAT3(%edi,%ecx,0xfde5380c,23, 1,%ecx,%edx)
+	REPEAT3(%esi,%edi,0xa4beea44, 4, 4,%edi,%ecx)
+	REPEAT3(%edx,%esi,0x4bdecfa9,11, 7,%esi,%edi)
+	REPEAT3(%ecx,%edx,0xf6bb4b60,16,10,%edx,%esi)
+	REPEAT3(%edi,%ecx,0xbebfbc70,23,13,%ecx,%edx)
+	REPEAT3(%esi,%edi,0x289b7ec6, 4, 0,%edi,%ecx)
+	REPEAT3(%edx,%esi,0xeaa127fa,11, 3,%esi,%edi)
+	REPEAT3(%ecx,%edx,0xd4ef3085,16, 6,%edx,%esi)
+	REPEAT3(%edi,%ecx,0x04881d05,23, 9,%ecx,%edx)
+	REPEAT3(%esi,%edi,0xd9d4d039, 4,12,%edi,%ecx)
+	REPEAT3(%edx,%esi,0xe6db99e5,11,15,%esi,%edi)
+	REPEAT3(%ecx,%edx,0x1fa27cf8,16, 2,%edx,%esi)
+// Round 3, step 16, written out by hand: it preloads in[0] and leaves eax = ~edx for round 4.
+	add	$0xc4ac5665,%edi
+	xor	%ecx,%eax
+	add	%ebx,%edi
+	mov	(%ebp),%ebx
+	add	%eax,%edi
+	mov	%edx,%eax
+	rol	$23,%edi
+	not	%eax
+	add	%ecx,%edi
+// Round 4 step (eax = ~z on entry): p1w = rol(p1w + p4c + ebx + (p3y ^ (p2x | eax)), p5s) + p2x; then ebx = in[p6Nin] and eax = ~p7Nz.
+#define REPEAT4(p1w,p2x,p3y,p4c,p5s,p6Nin,p7Nz) \
+	add	$p4c,p1w		;\
+	or	p2x,%eax		;\
+	add	%ebx,p1w		;\
+	xor	p3y,%eax		;\
+	mov	p6Nin*4(%ebp),%ebx	;\
+	add	%eax,p1w		;\
+	mov	p7Nz,%eax		;\
+	rol	$p5s,p1w		;\
+	not	%eax			;\
+	add	p2x,p1w
+// Round 4, steps 1-15 (constants match the MD5_F4 steps in md5.c).
+	REPEAT4(%esi,%edi,%ecx,0xf4292244, 6, 7,%ecx)
+	REPEAT4(%edx,%esi,%edi,0x432aff97,10,14,%edi)
+	REPEAT4(%ecx,%edx,%esi,0xab9423a7,15, 5,%esi)
+	REPEAT4(%edi,%ecx,%edx,0xfc93a039,21,12,%edx)
+	REPEAT4(%esi,%edi,%ecx,0x655b59c3, 6, 3,%ecx)
+	REPEAT4(%edx,%esi,%edi,0x8f0ccc92,10,10,%edi)
+	REPEAT4(%ecx,%edx,%esi,0xffeff47d,15, 1,%esi)
+	REPEAT4(%edi,%ecx,%edx,0x85845dd1,21, 8,%edx)
+	REPEAT4(%esi,%edi,%ecx,0x6fa87e4f, 6,15,%ecx)
+	REPEAT4(%edx,%esi,%edi,0xfe2ce6e0,10, 6,%edi)
+	REPEAT4(%ecx,%edx,%esi,0xa3014314,15,13,%esi)
+	REPEAT4(%edi,%ecx,%edx,0x4e0811a1,21, 4,%edx)
+	REPEAT4(%esi,%edi,%ecx,0xf7537e82, 6,11,%ecx)
+	REPEAT4(%edx,%esi,%edi,0xbd3af235,10, 2,%edi)
+	REPEAT4(%ecx,%edx,%esi,0x2ad7d2bb,15, 9,%esi)
+// Round 4, step 16, written out by hand; ebp is reloaded with the hash pointer since no input word is read after this point.
+	add	$0xeb86d391,%edi
+	or	%ecx,%eax
+	add	%ebx,%edi
+	xor	%edx,%eax
+	mov	4+16(%esp),%ebp		// pointer to 'hash' output
+	add	%eax,%edi
+	rol	$21,%edi
+	add	%ecx,%edi
+// Add the working values (esi, edi, ecx, edx) back into hash[0..3].
+	add	%esi,(%ebp)
+	add	%edi,4(%ebp)
+	add	%ecx,8(%ebp)
+	add	%edx,12(%ebp)
+// Restore the four registers pushed on entry, in reverse order.
+	pop	%edi
+	pop	%esi
+	pop	%ebx
+	pop	%ebp
+	ret
diff -urN linux-2.4.28/drivers/misc/md5.c linux-2.4.28-loop-AES-v3.0b/drivers/misc/md5.c
--- linux-2.4.28/drivers/misc/md5.c	Thu Jan  1 01:00:00 1970
+++ linux-2.4.28-loop-AES-v3.0b/drivers/misc/md5.c	Sun Feb  6 18:45:39 2005
@@ -0,0 +1,106 @@
+/*
+ * MD5 Message Digest Algorithm (RFC1321).
+ *
+ * Derived from cryptoapi implementation, originally based on the
+ * public domain implementation written by Colin Plumb in 1993.
+ *
+ * Copyright (c) Cryptoapi developers.
+ * Copyright (c) 2002 James Morris <jmorris@intercode.com.au>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ */
+
+#include "md5.h"
+
+#define MD5_F1(x, y, z)     (z ^ (x & (y ^ z)))	/* bitwise select: each bit is y where x is set, else z */
+#define MD5_F2(x, y, z)     MD5_F1(z, x, y)	/* bitwise select: each bit is x where z is set, else y */
+#define MD5_F3(x, y, z)     (x ^ y ^ z)	/* xor of all three words */
+#define MD5_F4(x, y, z)     (y ^ (x | ~z))
+#define MD5_STEP(f, w, x, y, z, in, s) \
+    (w += f(x, y, z) + in, w = (w<<s | w>>(32-s)) + x)	/* add, rotate w left by s, add x; arguments are pasted unparenthesized and w is evaluated several times */
+/* One MD5 block transform: mixes in[0..15] (used as-is, no byte swapping here) into the four-word state hash[0..3] in 64 steps. */
+void md5_transform_CPUbyteorder(u_int32_t *hash, u_int32_t const *in)
+{
+    u_int32_t a, b, c, d;
+    /* Work on local copies of the current state. */
+    a = hash[0];
+    b = hash[1];
+    c = hash[2];
+    d = hash[3];
+    /* Round 1 (MD5_F1): input words in[0..15] in order. */
+    MD5_STEP(MD5_F1, a, b, c, d, in[0] + 0xd76aa478, 7);
+    MD5_STEP(MD5_F1, d, a, b, c, in[1] + 0xe8c7b756, 12);
+    MD5_STEP(MD5_F1, c, d, a, b, in[2] + 0x242070db, 17);
+    MD5_STEP(MD5_F1, b, c, d, a, in[3] + 0xc1bdceee, 22);
+    MD5_STEP(MD5_F1, a, b, c, d, in[4] + 0xf57c0faf, 7);
+    MD5_STEP(MD5_F1, d, a, b, c, in[5] + 0x4787c62a, 12);
+    MD5_STEP(MD5_F1, c, d, a, b, in[6] + 0xa8304613, 17);
+    MD5_STEP(MD5_F1, b, c, d, a, in[7] + 0xfd469501, 22);
+    MD5_STEP(MD5_F1, a, b, c, d, in[8] + 0x698098d8, 7);
+    MD5_STEP(MD5_F1, d, a, b, c, in[9] + 0x8b44f7af, 12);
+    MD5_STEP(MD5_F1, c, d, a, b, in[10] + 0xffff5bb1, 17);
+    MD5_STEP(MD5_F1, b, c, d, a, in[11] + 0x895cd7be, 22);
+    MD5_STEP(MD5_F1, a, b, c, d, in[12] + 0x6b901122, 7);
+    MD5_STEP(MD5_F1, d, a, b, c, in[13] + 0xfd987193, 12);
+    MD5_STEP(MD5_F1, c, d, a, b, in[14] + 0xa679438e, 17);
+    MD5_STEP(MD5_F1, b, c, d, a, in[15] + 0x49b40821, 22);
+    /* Round 2 (MD5_F2): input index starts at 1 and advances by 5 modulo 16. */
+    MD5_STEP(MD5_F2, a, b, c, d, in[1] + 0xf61e2562, 5);
+    MD5_STEP(MD5_F2, d, a, b, c, in[6] + 0xc040b340, 9);
+    MD5_STEP(MD5_F2, c, d, a, b, in[11] + 0x265e5a51, 14);
+    MD5_STEP(MD5_F2, b, c, d, a, in[0] + 0xe9b6c7aa, 20);
+    MD5_STEP(MD5_F2, a, b, c, d, in[5] + 0xd62f105d, 5);
+    MD5_STEP(MD5_F2, d, a, b, c, in[10] + 0x02441453, 9);
+    MD5_STEP(MD5_F2, c, d, a, b, in[15] + 0xd8a1e681, 14);
+    MD5_STEP(MD5_F2, b, c, d, a, in[4] + 0xe7d3fbc8, 20);
+    MD5_STEP(MD5_F2, a, b, c, d, in[9] + 0x21e1cde6, 5);
+    MD5_STEP(MD5_F2, d, a, b, c, in[14] + 0xc33707d6, 9);
+    MD5_STEP(MD5_F2, c, d, a, b, in[3] + 0xf4d50d87, 14);
+    MD5_STEP(MD5_F2, b, c, d, a, in[8] + 0x455a14ed, 20);
+    MD5_STEP(MD5_F2, a, b, c, d, in[13] + 0xa9e3e905, 5);
+    MD5_STEP(MD5_F2, d, a, b, c, in[2] + 0xfcefa3f8, 9);
+    MD5_STEP(MD5_F2, c, d, a, b, in[7] + 0x676f02d9, 14);
+    MD5_STEP(MD5_F2, b, c, d, a, in[12] + 0x8d2a4c8a, 20);
+    /* Round 3 (MD5_F3): input index starts at 5 and advances by 3 modulo 16. */
+    MD5_STEP(MD5_F3, a, b, c, d, in[5] + 0xfffa3942, 4);
+    MD5_STEP(MD5_F3, d, a, b, c, in[8] + 0x8771f681, 11);
+    MD5_STEP(MD5_F3, c, d, a, b, in[11] + 0x6d9d6122, 16);
+    MD5_STEP(MD5_F3, b, c, d, a, in[14] + 0xfde5380c, 23);
+    MD5_STEP(MD5_F3, a, b, c, d, in[1] + 0xa4beea44, 4);
+    MD5_STEP(MD5_F3, d, a, b, c, in[4] + 0x4bdecfa9, 11);
+    MD5_STEP(MD5_F3, c, d, a, b, in[7] + 0xf6bb4b60, 16);
+    MD5_STEP(MD5_F3, b, c, d, a, in[10] + 0xbebfbc70, 23);
+    MD5_STEP(MD5_F3, a, b, c, d, in[13] + 0x289b7ec6, 4);
+    MD5_STEP(MD5_F3, d, a, b, c, in[0] + 0xeaa127fa, 11);
+    MD5_STEP(MD5_F3, c, d, a, b, in[3] + 0xd4ef3085, 16);
+    MD5_STEP(MD5_F3, b, c, d, a, in[6] + 0x04881d05, 23);
+    MD5_STEP(MD5_F3, a, b, c, d, in[9] + 0xd9d4d039, 4);
+    MD5_STEP(MD5_F3, d, a, b, c, in[12] + 0xe6db99e5, 11);
+    MD5_STEP(MD5_F3, c, d, a, b, in[15] + 0x1fa27cf8, 16);
+    MD5_STEP(MD5_F3, b, c, d, a, in[2] + 0xc4ac5665, 23);
+    /* Round 4 (MD5_F4): input index starts at 0 and advances by 7 modulo 16. */
+    MD5_STEP(MD5_F4, a, b, c, d, in[0] + 0xf4292244, 6);
+    MD5_STEP(MD5_F4, d, a, b, c, in[7] + 0x432aff97, 10);
+    MD5_STEP(MD5_F4, c, d, a, b, in[14] + 0xab9423a7, 15);
+    MD5_STEP(MD5_F4, b, c, d, a, in[5] + 0xfc93a039, 21);
+    MD5_STEP(MD5_F4, a, b, c, d, in[12] + 0x655b59c3, 6);
+    MD5_STEP(MD5_F4, d, a, b, c, in[3] + 0x8f0ccc92, 10);
+    MD5_STEP(MD5_F4, c, d, a, b, in[10] + 0xffeff47d, 15);
+    MD5_STEP(MD5_F4, b, c, d, a, in[1] + 0x85845dd1, 21);
+    MD5_STEP(MD5_F4, a, b, c, d, in[8] + 0x6fa87e4f, 6);
+    MD5_STEP(MD5_F4, d, a, b, c, in[15] + 0xfe2ce6e0, 10);
+    MD5_STEP(MD5_F4, c, d, a, b, in[6] + 0xa3014314, 15);
+    MD5_STEP(MD5_F4, b, c, d, a, in[13] + 0x4e0811a1, 21);
+    MD5_STEP(MD5_F4, a, b, c, d, in[4] + 0xf7537e82, 6);
+    MD5_STEP(MD5_F4, d, a, b, c, in[11] + 0xbd3af235, 10);
+    MD5_STEP(MD5_F4, c, d, a, b, in[2] + 0x2ad7d2bb, 15);
+    MD5_STEP(MD5_F4, b, c, d, a, in[9] + 0xeb86d391, 21);
+    /* Add the working values back into the caller's state. */
+    hash[0] += a;
+    hash[1] += b;
+    hash[2] += c;
+    hash[3] += d;
+}
diff -urN linux-2.4.28/drivers/misc/md5.h linux-2.4.28-loop-AES-v3.0b/drivers/misc/md5.h
--- linux-2.4.28/drivers/misc/md5.h	Thu Jan  1 01:00:00 1970
+++ linux-2.4.28-loop-AES-v3.0b/drivers/misc/md5.h	Sun Feb  6 18:45:39 2005
@@ -0,0 +1,11 @@
+/* md5.h */
+
+#include <linux/types.h>
+#include <linux/linkage.h>
+#include <linux/config.h>
+#include <linux/module.h>
+
+#if defined(CONFIG_X86) || defined(CONFIG_X86_64)
+ asmlinkage
+#endif
+extern void md5_transform_CPUbyteorder(u_int32_t *, u_int32_t const *);
diff -urN linux-2.4.28/include/linux/loop.h linux-2.4.28-loop-AES-v3.0b/include/linux/loop.h
--- linux-2.4.28/include/linux/loop.h	Sun Aug  4 14:26:52 2002
+++ linux-2.4.28-loop-AES-v3.0b/include/linux/loop.h	Sun Feb  6 18:45:39 2005
@@ -17,6 +17,11 @@
 
 #ifdef __KERNEL__
 
+/* definitions for IV metric -- cryptoapi specific */
+#define LOOP_IV_SECTOR_BITS 9
+#define LOOP_IV_SECTOR_SIZE (1 << LOOP_IV_SECTOR_BITS)
+typedef int loop_iv_t;
+
 /* Possible states of device */
 enum {
 	Lo_unbound,
@@ -27,35 +32,47 @@
 struct loop_device {
 	int		lo_number;
 	int		lo_refcnt;
-	kdev_t		lo_device;
-	int		lo_offset;
+	loff_t		lo_offset;
+	loff_t		lo_sizelimit;
 	int		lo_encrypt_type;
 	int		lo_encrypt_key_size;
-	int		lo_flags;
 	int		(*transfer)(struct loop_device *, int cmd,
 				    char *raw_buf, char *loop_buf, int size,
 				    int real_block);
+	int		(*ioctl)(struct loop_device *, int cmd,
+				 unsigned long arg);
 	char		lo_name[LO_NAME_SIZE];
 	char		lo_encrypt_key[LO_KEY_SIZE];
 	__u32           lo_init[2];
 	uid_t		lo_key_owner;	/* Who set the key */
-	int		(*ioctl)(struct loop_device *, int cmd, 
-				 unsigned long arg); 
+	kdev_t		lo_device;
+	int		lo_flags;
 
 	struct file *	lo_backing_file;
-	void		*key_data; 
+	void		*key_data;
 	char		key_reserved[48]; /* for use by the filter modules */
 
 	int		old_gfp_mask;
+	int		lo_state;
 
+	struct buffer_head	*lo_bh_que0;
+	struct buffer_head	*lo_bh_que1;
+	struct buffer_head	*lo_bh_que2;
+	struct buffer_head	*lo_bh_free;
 	spinlock_t		lo_lock;
-	struct buffer_head	*lo_bh;
-	struct buffer_head	*lo_bhtail;
-	int			lo_state;
 	struct semaphore	lo_sem;
 	struct semaphore	lo_ctl_mutex;
-	struct semaphore	lo_bh_mutex;
 	atomic_t		lo_pending;
+	int			lo_bh_flsh;
+	int			lo_bh_need;
+	wait_queue_head_t	lo_bh_wait;
+	unsigned long		lo_offs_sec;
+	unsigned long		lo_iv_remove;
+	unsigned char		lo_crypt_name[LO_NAME_SIZE];
+#if CONFIG_BLK_DEV_LOOP_KEYSCRUB
+	void			(*lo_keyscrub_fn)(void *);
+	void			*lo_keyscrub_ptr;
+#endif
 };
 
 typedef	int (* transfer_proc_t)(struct loop_device *, int cmd,
@@ -77,20 +94,19 @@
  */
 #define LO_FLAGS_DO_BMAP	1
 #define LO_FLAGS_READ_ONLY	2
-#define LO_FLAGS_BH_REMAP	4
 
-/* 
+/*
  * Note that this structure gets the wrong offsets when directly used
  * from a glibc program, because glibc has a 32bit dev_t.
- * Prevent people from shooting in their own foot.  
+ * Prevent people from shooting in their own foot.
  */
 #if __GLIBC__ >= 2 && !defined(dev_t)
 #error "Wrong dev_t in loop.h"
-#endif 
+#endif
 
 /*
  *	This uses kdev_t because glibc currently has no appropiate
- *	conversion version for the loop ioctls. 
+ *	conversion version for the loop ioctls.
  * 	The situation is very unpleasant	
  */
 
@@ -109,6 +125,22 @@
 	char		reserved[4];
 };
 
+struct loop_info64 {	/* built only from fixed-width __u64/__u32/__u8 fields; NOTE(review): presumably the argument of LOOP_SET_STATUS64/LOOP_GET_STATUS64 -- confirm in loop.c */
+	__u64		   lo_device;			/* ioctl r/o */
+	__u64		   lo_inode;			/* ioctl r/o */
+	__u64		   lo_rdevice;			/* ioctl r/o */
+	__u64		   lo_offset;
+	__u64		   lo_sizelimit;/* bytes, 0 == max available */
+	__u32		   lo_number;			/* ioctl r/o */
+	__u32		   lo_encrypt_type;
+	__u32		   lo_encrypt_key_size;		/* ioctl w/o */
+	__u32		   lo_flags;			/* ioctl r/o */
+	__u8		   lo_file_name[LO_NAME_SIZE];
+	__u8		   lo_crypt_name[LO_NAME_SIZE];
+	__u8		   lo_encrypt_key[LO_KEY_SIZE]; /* ioctl w/o */
+	__u64		   lo_init[2];	/* NOTE(review): struct loop_device keeps lo_init as __u32[2] -- confirm how these values are narrowed */
+};
+
 /*
  * Loop filter types
  */
@@ -122,25 +154,27 @@
 #define LO_CRYPT_IDEA     6
 #define LO_CRYPT_DUMMY    9
 #define LO_CRYPT_SKIPJACK 10
+#define LO_CRYPT_AES      16
+#define LO_CRYPT_CRYPTOAPI 18
 #define MAX_LO_CRYPT	20
 
 #ifdef __KERNEL__
 /* Support for loadable transfer modules */
 struct loop_func_table {
-	int number; 	/* filter type */ 
+	int number; 	/* filter type */
 	int (*transfer)(struct loop_device *lo, int cmd, char *raw_buf,
 			char *loop_buf, int size, int real_block);
-	int (*init)(struct loop_device *, struct loop_info *); 
+	int (*init)(struct loop_device *, struct loop_info *);
 	/* release is called from loop_unregister_transfer or clr_fd */
-	int (*release)(struct loop_device *); 
+	int (*release)(struct loop_device *);
 	int (*ioctl)(struct loop_device *, int cmd, unsigned long arg);
-	/* lock and unlock manage the module use counts */ 
+	/* lock and unlock manage the module use counts */
 	void (*lock)(struct loop_device *);
 	void (*unlock)(struct loop_device *);
-}; 
+};
 
-int  loop_register_transfer(struct loop_func_table *funcs);
-int loop_unregister_transfer(int number); 
+int loop_register_transfer(struct loop_func_table *funcs);
+int loop_unregister_transfer(int number);
 
 #endif
 /*
@@ -151,5 +185,10 @@
 #define LOOP_CLR_FD	0x4C01
 #define LOOP_SET_STATUS	0x4C02
 #define LOOP_GET_STATUS	0x4C03
+#define LOOP_SET_STATUS64 0x4C04
+#define LOOP_GET_STATUS64 0x4C05
+
+#define LOOP_MULTI_KEY_SETUP     0x4C4D
+#define LOOP_MULTI_KEY_SETUP_V3  0x4C4E
 
 #endif