Index: MAINTAINERS =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/MAINTAINERS,v retrieving revision 1.1.1.36 retrieving revision 1.1.1.36.4.1 diff -u -r1.1.1.36 -r1.1.1.36.4.1 --- a/MAINTAINERS 14 Apr 2004 13:05:25 -0000 1.1.1.36 +++ b/MAINTAINERS 17 Apr 2004 00:42:35 -0000 1.1.1.36.4.1 @@ -587,6 +587,13 @@ W: http://www.debian.org/~dz/i8k/ S: Maintained +DEVICE MAPPER +P: Joe Thornber +M: dm@uk.sistina.com +L: linux-LVM@sistina.com +W: http://www.sistina.com/lvm +S: Maintained + DEVICE NUMBER REGISTRY P: H. Peter Anvin M: hpa@zytor.com Index: Documentation/Configure.help =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/Documentation/Configure.help,v retrieving revision 1.1.1.42 retrieving revision 1.1.1.42.4.1 diff -u -r1.1.1.42 -r1.1.1.42.4.1 --- a/Documentation/Configure.help 14 Apr 2004 13:05:24 -0000 1.1.1.42 +++ b/Documentation/Configure.help 17 Apr 2004 00:42:36 -0000 1.1.1.42.4.1 @@ -1931,6 +1931,20 @@ want), say M here and read . The module will be called lvm-mod.o. +Device-mapper support +CONFIG_BLK_DEV_DM + Device-mapper is a low level volume manager. It works by allowing + people to specify mappings for ranges of logical sectors. Various + mapping types are available, in addition people may write their own + modules containing custom mappings if they wish. + + Higher level volume managers such as LVM2 use this driver. + + If you want to compile this as a module, say M here and read + . The module will be called dm-mod.o. + + If unsure, say N. + Multiple devices driver support (RAID and LVM) CONFIG_MD Support multiple physical spindles through a single logical device. Index: arch/mips64/kernel/ioctl32.c =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/arch/mips64/kernel/ioctl32.c,v retrieving revision 1.1.1.14 retrieving revision 1.1.1.14.4.1 diff -u -r1.1.1.14 -r1.1.1.14.4.1 --- a/arch/mips64/kernel/ioctl32.c 18 Feb 2004 13:36:30 -0000 1.1.1.14 +++ b/arch/mips64/kernel/ioctl32.c 17 Apr 2004 00:42:39 -0000 1.1.1.14.4.1 @@ -49,6 +49,7 @@ #if defined(CONFIG_BLK_DEV_LVM) || defined(CONFIG_BLK_DEV_LVM_MODULE) #include #endif /* LVM */ +#include #include #undef __KERNEL__ /* This file was born to be ugly ... */ @@ -2327,6 +2328,22 @@ IOCTL32_DEFAULT(RESTART_ARRAY_RW), #endif /* CONFIG_MD */ +#if defined(CONFIG_BLK_DEV_DM) || defined(CONFIG_BLK_DEV_DM_MODULE) + IOCTL32_DEFAULT(DM_VERSION), + IOCTL32_DEFAULT(DM_REMOVE_ALL), + IOCTL32_DEFAULT(DM_DEV_CREATE), + IOCTL32_DEFAULT(DM_DEV_REMOVE), + IOCTL32_DEFAULT(DM_TABLE_LOAD), + IOCTL32_DEFAULT(DM_DEV_SUSPEND), + IOCTL32_DEFAULT(DM_DEV_RENAME), + IOCTL32_DEFAULT(DM_TABLE_DEPS), + IOCTL32_DEFAULT(DM_DEV_STATUS), + IOCTL32_DEFAULT(DM_TABLE_STATUS), + IOCTL32_DEFAULT(DM_DEV_WAIT), + IOCTL32_DEFAULT(DM_LIST_DEVICES), + IOCTL32_DEFAULT(DM_TABLE_CLEAR), +#endif /* CONFIG_BLK_DEV_DM */ + #ifdef CONFIG_SIBYTE_TBPROF IOCTL32_DEFAULT(SBPROF_ZBSTART), IOCTL32_DEFAULT(SBPROF_ZBSTOP), Index: arch/parisc/kernel/ioctl32.c =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/arch/parisc/kernel/ioctl32.c,v retrieving revision 1.1.1.9 retrieving revision 1.1.1.9.4.1 diff -u -r1.1.1.9 -r1.1.1.9.4.1 --- a/arch/parisc/kernel/ioctl32.c 25 Aug 2003 11:44:40 -0000 1.1.1.9 +++ b/arch/parisc/kernel/ioctl32.c 17 Apr 2004 00:42:39 -0000 1.1.1.9.4.1 @@ -55,6 +55,7 @@ #define max max */ #include #endif /* LVM */ +#include #include /* Ugly hack. */ @@ -3423,6 +3424,22 @@ COMPATIBLE_IOCTL(LV_BMAP) COMPATIBLE_IOCTL(LV_SNAPSHOT_USE_RATE) #endif /* LVM */ +/* Device-Mapper */ +#if defined(CONFIG_BLK_DEV_DM) || defined(CONFIG_BLK_DEV_DM_MODULE) +COMPATIBLE_IOCTL(DM_VERSION) +COMPATIBLE_IOCTL(DM_REMOVE_ALL) +COMPATIBLE_IOCTL(DM_DEV_CREATE) +COMPATIBLE_IOCTL(DM_DEV_REMOVE) +COMPATIBLE_IOCTL(DM_TABLE_LOAD) +COMPATIBLE_IOCTL(DM_DEV_SUSPEND) +COMPATIBLE_IOCTL(DM_DEV_RENAME) +COMPATIBLE_IOCTL(DM_TABLE_DEPS) +COMPATIBLE_IOCTL(DM_DEV_STATUS) +COMPATIBLE_IOCTL(DM_TABLE_STATUS) +COMPATIBLE_IOCTL(DM_DEV_WAIT) +COMPATIBLE_IOCTL(DM_LIST_DEVICES) +COMPATIBLE_IOCTL(DM_TABLE_CLEAR) +#endif /* CONFIG_BLK_DEV_DM */ #if defined(CONFIG_DRM) || defined(CONFIG_DRM_MODULE) COMPATIBLE_IOCTL(DRM_IOCTL_GET_MAGIC) COMPATIBLE_IOCTL(DRM_IOCTL_IRQ_BUSID) Index: arch/ppc64/kernel/ioctl32.c =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/arch/ppc64/kernel/ioctl32.c,v retrieving revision 1.1.1.10 retrieving revision 1.1.1.10.4.1 diff -u -r1.1.1.10 -r1.1.1.10.4.1 --- a/arch/ppc64/kernel/ioctl32.c 18 Feb 2004 13:36:30 -0000 1.1.1.10 +++ b/arch/ppc64/kernel/ioctl32.c 17 Apr 2004 00:42:40 -0000 1.1.1.10.4.1 @@ -66,6 +66,7 @@ #if defined(CONFIG_BLK_DEV_LVM) || defined(CONFIG_BLK_DEV_LVM_MODULE) #include #endif /* LVM */ +#include #include /* Ugly hack. */ @@ -4408,6 +4409,22 @@ COMPATIBLE_IOCTL(NBD_PRINT_DEBUG), COMPATIBLE_IOCTL(NBD_SET_SIZE_BLOCKS), COMPATIBLE_IOCTL(NBD_DISCONNECT), +/* device-mapper */ +#if defined(CONFIG_BLK_DEV_DM) || defined(CONFIG_BLK_DEV_DM_MODULE) +COMPATIBLE_IOCTL(DM_VERSION), +COMPATIBLE_IOCTL(DM_REMOVE_ALL), +COMPATIBLE_IOCTL(DM_DEV_CREATE), +COMPATIBLE_IOCTL(DM_DEV_REMOVE), +COMPATIBLE_IOCTL(DM_TABLE_LOAD), +COMPATIBLE_IOCTL(DM_DEV_SUSPEND), +COMPATIBLE_IOCTL(DM_DEV_RENAME), +COMPATIBLE_IOCTL(DM_TABLE_DEPS), +COMPATIBLE_IOCTL(DM_DEV_STATUS), +COMPATIBLE_IOCTL(DM_TABLE_STATUS), +COMPATIBLE_IOCTL(DM_DEV_WAIT), +COMPATIBLE_IOCTL(DM_LIST_DEVICES), +COMPATIBLE_IOCTL(DM_TABLE_CLEAR), +#endif /* CONFIG_BLK_DEV_DM */ /* Remove *PRIVATE in 2.5 */ COMPATIBLE_IOCTL(SIOCDEVPRIVATE), COMPATIBLE_IOCTL(SIOCDEVPRIVATE+1), Index: arch/s390x/kernel/ioctl32.c =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/arch/s390x/kernel/ioctl32.c,v retrieving revision 1.1.1.12 retrieving revision 1.1.1.12.4.1 diff -u -r1.1.1.12 -r1.1.1.12.4.1 --- a/arch/s390x/kernel/ioctl32.c 25 Aug 2003 11:44:40 -0000 1.1.1.12 +++ b/arch/s390x/kernel/ioctl32.c 17 Apr 2004 00:42:40 -0000 1.1.1.12.4.1 @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include @@ -627,6 +628,20 @@ IOCTL32_DEFAULT(SIOCGSTAMP), + IOCTL32_DEFAULT(DM_VERSION), + IOCTL32_DEFAULT(DM_REMOVE_ALL), + IOCTL32_DEFAULT(DM_DEV_CREATE), + IOCTL32_DEFAULT(DM_DEV_REMOVE), + IOCTL32_DEFAULT(DM_TABLE_LOAD), + IOCTL32_DEFAULT(DM_DEV_SUSPEND), + IOCTL32_DEFAULT(DM_DEV_RENAME), + IOCTL32_DEFAULT(DM_TABLE_DEPS), + IOCTL32_DEFAULT(DM_DEV_STATUS), + IOCTL32_DEFAULT(DM_TABLE_STATUS), + IOCTL32_DEFAULT(DM_DEV_WAIT), + IOCTL32_DEFAULT(DM_LIST_DEVICES), + IOCTL32_DEFAULT(DM_TABLE_CLEAR), + IOCTL32_DEFAULT(LOOP_SET_FD), IOCTL32_DEFAULT(LOOP_CLR_FD), Index: arch/sparc64/kernel/ioctl32.c =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/arch/sparc64/kernel/ioctl32.c,v retrieving revision 1.1.1.29 retrieving revision 1.1.1.29.4.1 diff -u -r1.1.1.29 -r1.1.1.29.4.1 --- a/arch/sparc64/kernel/ioctl32.c 18 Feb 2004 13:36:31 -0000 1.1.1.29 +++ b/arch/sparc64/kernel/ioctl32.c 17 Apr 2004 00:42:42 -0000 1.1.1.29.4.1 @@ -56,6 +56,7 @@ #if defined(CONFIG_BLK_DEV_LVM) || defined(CONFIG_BLK_DEV_LVM_MODULE) #include #endif /* LVM */ +#include #include /* Ugly hack. */ @@ -5085,6 +5086,22 @@ COMPATIBLE_IOCTL(NBD_PRINT_DEBUG) COMPATIBLE_IOCTL(NBD_SET_SIZE_BLOCKS) COMPATIBLE_IOCTL(NBD_DISCONNECT) +/* device-mapper */ +#if defined(CONFIG_BLK_DEV_DM) || defined(CONFIG_BLK_DEV_DM_MODULE) +COMPATIBLE_IOCTL(DM_VERSION) +COMPATIBLE_IOCTL(DM_REMOVE_ALL) +COMPATIBLE_IOCTL(DM_DEV_CREATE) +COMPATIBLE_IOCTL(DM_DEV_REMOVE) +COMPATIBLE_IOCTL(DM_TABLE_LOAD) +COMPATIBLE_IOCTL(DM_DEV_SUSPEND) +COMPATIBLE_IOCTL(DM_DEV_RENAME) +COMPATIBLE_IOCTL(DM_TABLE_DEPS) +COMPATIBLE_IOCTL(DM_DEV_STATUS) +COMPATIBLE_IOCTL(DM_TABLE_STATUS) +COMPATIBLE_IOCTL(DM_DEV_WAIT) +COMPATIBLE_IOCTL(DM_LIST_DEVICES) +COMPATIBLE_IOCTL(DM_TABLE_CLEAR) +#endif /* CONFIG_BLK_DEV_DM */ /* Linux-1394 */ #if defined(CONFIG_IEEE1394) || defined(CONFIG_IEEE1394_MODULE) COMPATIBLE_IOCTL(AMDTP_IOC_CHANNEL) Index: arch/x86_64/ia32/ia32_ioctl.c =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/arch/x86_64/ia32/ia32_ioctl.c,v retrieving revision 1.1.1.10 retrieving revision 1.1.1.10.4.1 diff -u -r1.1.1.10 -r1.1.1.10.4.1 --- a/arch/x86_64/ia32/ia32_ioctl.c 14 Apr 2004 13:05:28 -0000 1.1.1.10 +++ b/arch/x86_64/ia32/ia32_ioctl.c 17 Apr 2004 00:42:42 -0000 1.1.1.10.4.1 @@ -68,6 +68,7 @@ #define max max #include #endif /* LVM */ +#include #include /* Ugly hack. */ @@ -4140,6 +4141,22 @@ COMPATIBLE_IOCTL(LV_BMAP) COMPATIBLE_IOCTL(LV_SNAPSHOT_USE_RATE) #endif /* LVM */ +/* Device-Mapper */ +#if defined(CONFIG_BLK_DEV_DM) || defined(CONFIG_BLK_DEV_DM_MODULE) +COMPATIBLE_IOCTL(DM_VERSION) +COMPATIBLE_IOCTL(DM_REMOVE_ALL) +COMPATIBLE_IOCTL(DM_DEV_CREATE) +COMPATIBLE_IOCTL(DM_DEV_REMOVE) +COMPATIBLE_IOCTL(DM_TABLE_LOAD) +COMPATIBLE_IOCTL(DM_DEV_SUSPEND) +COMPATIBLE_IOCTL(DM_DEV_RENAME) +COMPATIBLE_IOCTL(DM_TABLE_DEPS) +COMPATIBLE_IOCTL(DM_DEV_STATUS) +COMPATIBLE_IOCTL(DM_TABLE_STATUS) +COMPATIBLE_IOCTL(DM_DEV_WAIT) +COMPATIBLE_IOCTL(DM_LIST_DEVICES) +COMPATIBLE_IOCTL(DM_TABLE_CLEAR) +#endif /* CONFIG_BLK_DEV_DM */ #ifdef CONFIG_AUTOFS_FS COMPATIBLE_IOCTL(AUTOFS_IOC_READY) COMPATIBLE_IOCTL(AUTOFS_IOC_FAIL) Index: drivers/md/Config.in =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/drivers/md/Config.in,v retrieving revision 1.1.1.10 retrieving revision 1.1.1.10.4.1 diff -u -r1.1.1.10 -r1.1.1.10.4.1 --- a/drivers/md/Config.in 14 Sep 2001 21:22:18 -0000 1.1.1.10 +++ b/drivers/md/Config.in 17 Apr 2004 00:42:43 -0000 1.1.1.10.4.1 @@ -14,5 +14,6 @@ dep_tristate ' Multipath I/O support' CONFIG_MD_MULTIPATH $CONFIG_BLK_DEV_MD dep_tristate ' Logical volume manager (LVM) support' CONFIG_BLK_DEV_LVM $CONFIG_MD +dep_tristate ' Device-mapper support' CONFIG_BLK_DEV_DM $CONFIG_MD endmenu Index: drivers/md/Makefile =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/drivers/md/Makefile,v retrieving revision 1.1.1.11 retrieving revision 1.1.1.11.4.1 diff -u -r1.1.1.11 -r1.1.1.11.4.1 --- a/drivers/md/Makefile 11 Nov 2001 18:09:32 -0000 1.1.1.11 +++ b/drivers/md/Makefile 17 Apr 2004 00:42:43 -0000 1.1.1.11.4.1 @@ -4,24 +4,35 @@ O_TARGET := mddev.o -export-objs := md.o xor.o -list-multi := lvm-mod.o +export-objs := md.o xor.o dm-table.o dm-target.o \ + dm.o + +list-multi := lvm-mod.o dm-mod.o lvm-mod-objs := lvm.o lvm-snap.o lvm-fs.o +dm-mod-objs := dm.o dm-table.o dm-target.o dm-ioctl.o \ + dm-linear.o dm-stripe.o # Note: link order is important. All raid personalities # and xor.o must come before md.o, as they each initialise # themselves, and md.o may use the personalities when it # auto-initialised. -obj-$(CONFIG_MD_LINEAR) += linear.o -obj-$(CONFIG_MD_RAID0) += raid0.o -obj-$(CONFIG_MD_RAID1) += raid1.o -obj-$(CONFIG_MD_RAID5) += raid5.o xor.o -obj-$(CONFIG_MD_MULTIPATH) += multipath.o -obj-$(CONFIG_BLK_DEV_MD) += md.o -obj-$(CONFIG_BLK_DEV_LVM) += lvm-mod.o +obj-$(CONFIG_MD_LINEAR) += linear.o +obj-$(CONFIG_MD_RAID0) += raid0.o +obj-$(CONFIG_MD_RAID1) += raid1.o +obj-$(CONFIG_MD_RAID5) += raid5.o xor.o +obj-$(CONFIG_MD_MULTIPATH) += multipath.o +obj-$(CONFIG_BLK_DEV_MD) += md.o + +obj-$(CONFIG_BLK_DEV_LVM) += lvm-mod.o + +obj-$(CONFIG_BLK_DEV_DM) += dm-mod.o include $(TOPDIR)/Rules.make lvm-mod.o: $(lvm-mod-objs) $(LD) -r -o $@ $(lvm-mod-objs) + +dm-mod.o: $(dm-mod-objs) + $(LD) -r -o $@ $(dm-mod-objs) + Index: drivers/md/dm-ioctl.c =================================================================== RCS file: drivers/md/dm-ioctl.c diff -N drivers/md/dm-ioctl.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ b/drivers/md/dm-ioctl.c 17 Apr 2004 00:42:43 -0000 1.1.1.1.10.1 @@ -0,0 +1,1284 @@ +/* + * Copyright (C) 2001, 2002 Sistina Software (UK) Limited. + * + * This file is released under the GPL. + */ + +#include "dm.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#define DM_DRIVER_EMAIL "dm@uk.sistina.com" + +/*----------------------------------------------------------------- + * The ioctl interface needs to be able to look up devices by + * name or uuid. + *---------------------------------------------------------------*/ +struct hash_cell { + struct list_head name_list; + struct list_head uuid_list; + + char *name; + char *uuid; + struct mapped_device *md; + struct dm_table *new_map; + + /* I hate devfs */ + devfs_handle_t devfs_entry; +}; + +#define NUM_BUCKETS 64 +#define MASK_BUCKETS (NUM_BUCKETS - 1) +static struct list_head _name_buckets[NUM_BUCKETS]; +static struct list_head _uuid_buckets[NUM_BUCKETS]; + +static devfs_handle_t _dev_dir; +void dm_hash_remove_all(void); + +/* + * Guards access to both hash tables. + */ +static DECLARE_RWSEM(_hash_lock); + +static void init_buckets(struct list_head *buckets) +{ + unsigned int i; + + for (i = 0; i < NUM_BUCKETS; i++) + INIT_LIST_HEAD(buckets + i); +} + +int dm_hash_init(void) +{ + init_buckets(_name_buckets); + init_buckets(_uuid_buckets); + _dev_dir = devfs_mk_dir(0, DM_DIR, NULL); + return 0; +} + +void dm_hash_exit(void) +{ + dm_hash_remove_all(); + devfs_unregister(_dev_dir); +} + +/*----------------------------------------------------------------- + * Hash function: + * We're not really concerned with the str hash function being + * fast since it's only used by the ioctl interface. + *---------------------------------------------------------------*/ +static unsigned int hash_str(const char *str) +{ + const unsigned int hash_mult = 2654435387U; + unsigned int h = 0; + + while (*str) + h = (h + (unsigned int) *str++) * hash_mult; + + return h & MASK_BUCKETS; +} + +/*----------------------------------------------------------------- + * Code for looking up a device by name + *---------------------------------------------------------------*/ +static struct hash_cell *__get_name_cell(const char *str) +{ + struct list_head *tmp; + struct hash_cell *hc; + unsigned int h = hash_str(str); + + list_for_each (tmp, _name_buckets + h) { + hc = list_entry(tmp, struct hash_cell, name_list); + if (!strcmp(hc->name, str)) + return hc; + } + + return NULL; +} + +static struct hash_cell *__get_uuid_cell(const char *str) +{ + struct list_head *tmp; + struct hash_cell *hc; + unsigned int h = hash_str(str); + + list_for_each (tmp, _uuid_buckets + h) { + hc = list_entry(tmp, struct hash_cell, uuid_list); + if (!strcmp(hc->uuid, str)) + return hc; + } + + return NULL; +} + +/*----------------------------------------------------------------- + * Inserting, removing and renaming a device. + *---------------------------------------------------------------*/ +static inline char *kstrdup(const char *str) +{ + char *r = kmalloc(strlen(str) + 1, GFP_KERNEL); + if (r) + strcpy(r, str); + return r; +} + +static struct hash_cell *alloc_cell(const char *name, const char *uuid, + struct mapped_device *md) +{ + struct hash_cell *hc; + + hc = kmalloc(sizeof(*hc), GFP_KERNEL); + if (!hc) + return NULL; + + hc->name = kstrdup(name); + if (!hc->name) { + kfree(hc); + return NULL; + } + + if (!uuid) + hc->uuid = NULL; + + else { + hc->uuid = kstrdup(uuid); + if (!hc->uuid) { + kfree(hc->name); + kfree(hc); + return NULL; + } + } + + INIT_LIST_HEAD(&hc->name_list); + INIT_LIST_HEAD(&hc->uuid_list); + hc->md = md; + hc->new_map = NULL; + return hc; +} + +static void free_cell(struct hash_cell *hc) +{ + if (hc) { + kfree(hc->name); + kfree(hc->uuid); + kfree(hc); + } +} + +/* + * devfs stuff. + */ +static int register_with_devfs(struct hash_cell *hc) +{ + kdev_t dev = dm_kdev(hc->md); + + hc->devfs_entry = + devfs_register(_dev_dir, hc->name, DEVFS_FL_CURRENT_OWNER, + major(dev), minor(dev), + S_IFBLK | S_IRUSR | S_IWUSR | S_IRGRP, + &dm_blk_dops, NULL); + + return 0; +} + +static int unregister_with_devfs(struct hash_cell *hc) +{ + devfs_unregister(hc->devfs_entry); + return 0; +} + +/* + * The kdev_t and uuid of a device can never change once it is + * initially inserted. + */ +int dm_hash_insert(const char *name, const char *uuid, struct mapped_device *md) +{ + struct hash_cell *cell; + + /* + * Allocate the new cells. + */ + cell = alloc_cell(name, uuid, md); + if (!cell) + return -ENOMEM; + + /* + * Insert the cell into both hash tables. + */ + down_write(&_hash_lock); + if (__get_name_cell(name)) + goto bad; + + list_add(&cell->name_list, _name_buckets + hash_str(name)); + + if (uuid) { + if (__get_uuid_cell(uuid)) { + list_del(&cell->name_list); + goto bad; + } + list_add(&cell->uuid_list, _uuid_buckets + hash_str(uuid)); + } + register_with_devfs(cell); + dm_get(md); + up_write(&_hash_lock); + + return 0; + + bad: + up_write(&_hash_lock); + free_cell(cell); + return -EBUSY; +} + +void __hash_remove(struct hash_cell *hc) +{ + /* remove from the dev hash */ + list_del(&hc->uuid_list); + list_del(&hc->name_list); + unregister_with_devfs(hc); + dm_put(hc->md); + if (hc->new_map) + dm_table_put(hc->new_map); + free_cell(hc); +} + +void dm_hash_remove_all(void) +{ + int i; + struct hash_cell *hc; + struct list_head *tmp, *n; + + down_write(&_hash_lock); + for (i = 0; i < NUM_BUCKETS; i++) { + list_for_each_safe (tmp, n, _name_buckets + i) { + hc = list_entry(tmp, struct hash_cell, name_list); + __hash_remove(hc); + } + } + up_write(&_hash_lock); +} + +int dm_hash_rename(const char *old, const char *new) +{ + char *new_name, *old_name; + struct hash_cell *hc; + + /* + * duplicate new. + */ + new_name = kstrdup(new); + if (!new_name) + return -ENOMEM; + + down_write(&_hash_lock); + + /* + * Is new free ? + */ + hc = __get_name_cell(new); + if (hc) { + DMWARN("asked to rename to an already existing name %s -> %s", + old, new); + up_write(&_hash_lock); + kfree(new_name); + return -EBUSY; + } + + /* + * Is there such a device as 'old' ? + */ + hc = __get_name_cell(old); + if (!hc) { + DMWARN("asked to rename a non existent device %s -> %s", + old, new); + up_write(&_hash_lock); + kfree(new_name); + return -ENXIO; + } + + /* + * rename and move the name cell. + */ + list_del(&hc->name_list); + old_name = hc->name; + hc->name = new_name; + list_add(&hc->name_list, _name_buckets + hash_str(new_name)); + + /* rename the device node in devfs */ + unregister_with_devfs(hc); + register_with_devfs(hc); + + up_write(&_hash_lock); + kfree(old_name); + return 0; +} + +/*----------------------------------------------------------------- + * Implementation of the ioctl commands + *---------------------------------------------------------------*/ +/* + * All the ioctl commands get dispatched to functions with this + * prototype. + */ +typedef int (*ioctl_fn)(struct dm_ioctl *param, size_t param_size); + +static int remove_all(struct dm_ioctl *param, size_t param_size) +{ + dm_hash_remove_all(); + param->data_size = 0; + return 0; +} + +/* + * Round up the ptr to an 8-byte boundary. + */ +#define ALIGN_MASK 7 +static inline void *align_ptr(void *ptr) +{ + return (void *) (((size_t) (ptr + ALIGN_MASK)) & ~ALIGN_MASK); +} + +/* + * Retrieves the data payload buffer from an already allocated + * struct dm_ioctl. + */ +static void *get_result_buffer(struct dm_ioctl *param, size_t param_size, + size_t *len) +{ + param->data_start = align_ptr(param + 1) - (void *) param; + + if (param->data_start < param_size) + *len = param_size - param->data_start; + else + *len = 0; + + return ((void *) param) + param->data_start; +} + +static int list_devices(struct dm_ioctl *param, size_t param_size) +{ + unsigned int i; + struct hash_cell *hc; + size_t len, needed = 0; + struct dm_name_list *nl, *old_nl = NULL; + + down_write(&_hash_lock); + + /* + * Loop through all the devices working out how much + * space we need. + */ + for (i = 0; i < NUM_BUCKETS; i++) { + list_for_each_entry (hc, _name_buckets + i, name_list) { + needed += sizeof(struct dm_name_list); + needed += strlen(hc->name); + needed += ALIGN_MASK; + } + } + + /* + * Grab our output buffer. + */ + nl = get_result_buffer(param, param_size, &len); + if (len < needed) { + param->flags |= DM_BUFFER_FULL_FLAG; + goto out; + } + param->data_size = param->data_start + needed; + + nl->dev = 0; /* Flags no data */ + + /* + * Now loop through filling out the names. + */ + for (i = 0; i < NUM_BUCKETS; i++) { + list_for_each_entry (hc, _name_buckets + i, name_list) { + if (old_nl) + old_nl->next = (uint32_t) ((void *) nl - + (void *) old_nl); + + nl->dev = dm_kdev(hc->md); + nl->next = 0; + strcpy(nl->name, hc->name); + + old_nl = nl; + nl = align_ptr(((void *) ++nl) + strlen(hc->name) + 1); + } + } + + out: + up_write(&_hash_lock); + return 0; +} + +static int check_name(const char *name) +{ + if (strchr(name, '/')) { + DMWARN("invalid device name"); + return -EINVAL; + } + + return 0; +} + +/* + * Fills in a dm_ioctl structure, ready for sending back to + * userland. + */ +static int __dev_status(struct mapped_device *md, struct dm_ioctl *param) +{ + kdev_t dev = dm_kdev(md); + struct dm_table *table; + struct block_device *bdev; + + param->flags &= ~(DM_SUSPEND_FLAG | DM_READONLY_FLAG | + DM_ACTIVE_PRESENT_FLAG); + + if (dm_suspended(md)) + param->flags |= DM_SUSPEND_FLAG; + + param->dev = kdev_t_to_nr(dev); + + if (is_read_only(dev)) + param->flags |= DM_READONLY_FLAG; + + param->event_nr = dm_get_event_nr(md); + + table = dm_get_table(md); + if (table) { + param->flags |= DM_ACTIVE_PRESENT_FLAG; + param->target_count = dm_table_get_num_targets(table); + dm_table_put(table); + } else + param->target_count = 0; + + bdev = bdget(param->dev); + if (!bdev) + return -ENXIO; + param->open_count = bdev->bd_openers; + bdput(bdev); + + return 0; +} + +static int dev_create(struct dm_ioctl *param, size_t param_size) +{ + int r; + kdev_t dev = 0; + struct mapped_device *md; + + r = check_name(param->name); + if (r) + return r; + + if (param->flags & DM_PERSISTENT_DEV_FLAG) + dev = to_kdev_t(param->dev); + + r = dm_create(dev, &md); + if (r) + return r; + + r = dm_hash_insert(param->name, *param->uuid ? param->uuid : NULL, md); + if (r) { + dm_put(md); + return r; + } + + param->flags &= ~DM_INACTIVE_PRESENT_FLAG; + + r = __dev_status(md, param); + dm_put(md); + + return r; +} + +/* + * Always use UUID for lookups if it's present, otherwise use name. + */ +static inline struct hash_cell *__find_device_hash_cell(struct dm_ioctl *param) +{ + return *param->uuid ? + __get_uuid_cell(param->uuid) : __get_name_cell(param->name); +} + +static inline struct mapped_device *find_device(struct dm_ioctl *param) +{ + struct hash_cell *hc; + struct mapped_device *md = NULL; + + down_read(&_hash_lock); + hc = __find_device_hash_cell(param); + if (hc) { + md = hc->md; + + /* + * Sneakily write in both the name and the uuid + * while we have the cell. + */ + strncpy(param->name, hc->name, sizeof(param->name)); + if (hc->uuid) + strncpy(param->uuid, hc->uuid, sizeof(param->uuid) - 1); + else + param->uuid[0] = '\0'; + + if (hc->new_map) + param->flags |= DM_INACTIVE_PRESENT_FLAG; + else + param->flags &= ~DM_INACTIVE_PRESENT_FLAG; + + dm_get(md); + } + up_read(&_hash_lock); + + return md; +} + +static int dev_remove(struct dm_ioctl *param, size_t param_size) +{ + struct hash_cell *hc; + + down_write(&_hash_lock); + hc = __find_device_hash_cell(param); + + if (!hc) { + DMWARN("device doesn't appear to be in the dev hash table."); + up_write(&_hash_lock); + return -ENXIO; + } + + __hash_remove(hc); + up_write(&_hash_lock); + param->data_size = 0; + return 0; +} + +/* + * Check a string doesn't overrun the chunk of + * memory we copied from userland. + */ +static int invalid_str(char *str, void *end) +{ + while ((void *) str < end) + if (!*str++) + return 0; + + return -EINVAL; +} + +static int dev_rename(struct dm_ioctl *param, size_t param_size) +{ + int r; + char *new_name = (char *) param + param->data_start; + + if (new_name < (char *) (param + 1) || + invalid_str(new_name, (void *) param + param_size)) { + DMWARN("Invalid new logical volume name supplied."); + return -EINVAL; + } + + r = check_name(new_name); + if (r) + return r; + + param->data_size = 0; + return dm_hash_rename(param->name, new_name); +} + +static int do_suspend(struct dm_ioctl *param) +{ + int r = 0; + struct mapped_device *md; + + md = find_device(param); + if (!md) + return -ENXIO; + + if (!dm_suspended(md)) + r = dm_suspend(md); + + if (!r) + r = __dev_status(md, param); + + dm_put(md); + return r; +} + +static int do_resume(struct dm_ioctl *param) +{ + int r = 0; + struct hash_cell *hc; + struct mapped_device *md; + struct dm_table *new_map; + + down_write(&_hash_lock); + + hc = __find_device_hash_cell(param); + if (!hc) { + DMWARN("device doesn't appear to be in the dev hash table."); + up_write(&_hash_lock); + return -ENXIO; + } + + md = hc->md; + dm_get(md); + + new_map = hc->new_map; + hc->new_map = NULL; + param->flags &= ~DM_INACTIVE_PRESENT_FLAG; + + up_write(&_hash_lock); + + /* Do we need to load a new map ? */ + if (new_map) { + /* Suspend if it isn't already suspended */ + if (!dm_suspended(md)) + dm_suspend(md); + + r = dm_swap_table(md, new_map); + if (r) { + dm_put(md); + dm_table_put(new_map); + return r; + } + + if (dm_table_get_mode(new_map) & FMODE_WRITE) + set_device_ro(dm_kdev(md), 0); + else + set_device_ro(dm_kdev(md), 1); + + dm_table_put(new_map); + } + + if (dm_suspended(md)) + r = dm_resume(md); + + if (!r) + r = __dev_status(md, param); + + dm_put(md); + return r; +} + +/* + * Set or unset the suspension state of a device. + * If the device already is in the requested state we just return its status. + */ +static int dev_suspend(struct dm_ioctl *param, size_t param_size) +{ + if (param->flags & DM_SUSPEND_FLAG) + return do_suspend(param); + + return do_resume(param); +} + +/* + * Copies device info back to user space, used by + * the create and info ioctls. + */ +static int dev_status(struct dm_ioctl *param, size_t param_size) +{ + int r; + struct mapped_device *md; + + md = find_device(param); + if (!md) + return -ENXIO; + + r = __dev_status(md, param); + dm_put(md); + return r; +} + +/* + * Build up the status struct for each target + */ +static void retrieve_status(struct dm_table *table, struct dm_ioctl *param, + size_t param_size) +{ + unsigned int i, num_targets; + struct dm_target_spec *spec; + char *outbuf, *outptr; + status_type_t type; + size_t remaining, len, used = 0; + + outptr = outbuf = get_result_buffer(param, param_size, &len); + + if (param->flags & DM_STATUS_TABLE_FLAG) + type = STATUSTYPE_TABLE; + else + type = STATUSTYPE_INFO; + + /* Get all the target info */ + num_targets = dm_table_get_num_targets(table); + for (i = 0; i < num_targets; i++) { + struct dm_target *ti = dm_table_get_target(table, i); + + remaining = len - (outptr - outbuf); + if (remaining < sizeof(struct dm_target_spec)) { + param->flags |= DM_BUFFER_FULL_FLAG; + break; + } + + spec = (struct dm_target_spec *) outptr; + + spec->status = 0; + spec->sector_start = ti->begin; + spec->length = ti->len; + strncpy(spec->target_type, ti->type->name, + sizeof(spec->target_type)); + + outptr += sizeof(struct dm_target_spec); + remaining = len - (outptr - outbuf); + + /* Get the status/table string from the target driver */ + if (ti->type->status) { + if (ti->type->status(ti, type, outptr, remaining)) { + param->flags |= DM_BUFFER_FULL_FLAG; + break; + } + } else + outptr[0] = '\0'; + + outptr += strlen(outptr) + 1; + used = param->data_start + (outptr - outbuf); + + align_ptr(outptr); + spec->next = outptr - outbuf; + } + + if (used) + param->data_size = used; + + param->target_count = num_targets; +} + +/* + * Wait for a device to report an event + */ +static int dev_wait(struct dm_ioctl *param, size_t param_size) +{ + int r; + struct mapped_device *md; + struct dm_table *table; + DECLARE_WAITQUEUE(wq, current); + + md = find_device(param); + if (!md) + return -ENXIO; + + /* + * Wait for a notification event + */ + set_current_state(TASK_INTERRUPTIBLE); + if (!dm_add_wait_queue(md, &wq, param->event_nr)) { + schedule(); + dm_remove_wait_queue(md, &wq); + } + set_current_state(TASK_RUNNING); + + /* + * The userland program is going to want to know what + * changed to trigger the event, so we may as well tell + * him and save an ioctl. + */ + r = __dev_status(md, param); + if (r) + goto out; + + table = dm_get_table(md); + if (table) { + retrieve_status(table, param, param_size); + dm_table_put(table); + } + + out: + dm_put(md); + return r; +} + +static inline int get_mode(struct dm_ioctl *param) +{ + int mode = FMODE_READ | FMODE_WRITE; + + if (param->flags & DM_READONLY_FLAG) + mode = FMODE_READ; + + return mode; +} + +static int next_target(struct dm_target_spec *last, uint32_t next, void *end, + struct dm_target_spec **spec, char **target_params) +{ + *spec = (struct dm_target_spec *) ((unsigned char *) last + next); + *target_params = (char *) (*spec + 1); + + if (*spec < (last + 1)) + return -EINVAL; + + return invalid_str(*target_params, end); +} + +static int populate_table(struct dm_table *table, struct dm_ioctl *param, + size_t param_size) +{ + int r; + unsigned int i = 0; + struct dm_target_spec *spec = (struct dm_target_spec *) param; + uint32_t next = param->data_start; + void *end = (void *) param + param_size; + char *target_params; + + if (!param->target_count) { + DMWARN("populate_table: no targets specified"); + return -EINVAL; + } + + for (i = 0; i < param->target_count; i++) { + + r = next_target(spec, next, end, &spec, &target_params); + if (r) { + DMWARN("unable to find target"); + return r; + } + + r = dm_table_add_target(table, spec->target_type, + (sector_t) spec->sector_start, + (sector_t) spec->length, + target_params); + if (r) { + DMWARN("error adding target to table"); + return r; + } + + next = spec->next; + } + + return dm_table_complete(table); +} + +static int table_load(struct dm_ioctl *param, size_t param_size) +{ + int r; + struct hash_cell *hc; + struct dm_table *t; + + r = dm_table_create(&t, get_mode(param), param->target_count); + if (r) + return r; + + r = populate_table(t, param, param_size); + if (r) { + dm_table_put(t); + return r; + } + + down_write(&_hash_lock); + hc = __find_device_hash_cell(param); + if (!hc) { + DMWARN("device doesn't appear to be in the dev hash table."); + up_write(&_hash_lock); + return -ENXIO; + } + + if (hc->new_map) + dm_table_put(hc->new_map); + hc->new_map = t; + param->flags |= DM_INACTIVE_PRESENT_FLAG; + + r = __dev_status(hc->md, param); + up_write(&_hash_lock); + return r; +} + +static int table_clear(struct dm_ioctl *param, size_t param_size) +{ + int r; + struct hash_cell *hc; + + down_write(&_hash_lock); + + hc = __find_device_hash_cell(param); + if (!hc) { + DMWARN("device doesn't appear to be in the dev hash table."); + up_write(&_hash_lock); + return -ENXIO; + } + + if (hc->new_map) { + dm_table_put(hc->new_map); + hc->new_map = NULL; + } + + param->flags &= ~DM_INACTIVE_PRESENT_FLAG; + + r = __dev_status(hc->md, param); + up_write(&_hash_lock); + return r; +} + +/* + * Retrieves a list of devices used by a particular dm device. + */ +static void retrieve_deps(struct dm_table *table, struct dm_ioctl *param, + size_t param_size) +{ + unsigned int count = 0; + struct list_head *tmp; + size_t len, needed; + struct dm_target_deps *deps; + + deps = get_result_buffer(param, param_size, &len); + + /* + * Count the devices. + */ + list_for_each(tmp, dm_table_get_devices(table)) + count++; + + /* + * Check we have enough space. + */ + needed = sizeof(*deps) + (sizeof(*deps->dev) * count); + if (len < needed) { + param->flags |= DM_BUFFER_FULL_FLAG; + return; + } + + /* + * Fill in the devices. + */ + deps->count = count; + count = 0; + list_for_each(tmp, dm_table_get_devices(table)) { + struct dm_dev *dd = list_entry(tmp, struct dm_dev, list); + deps->dev[count++] = dd->bdev->bd_dev; + } + + param->data_size = param->data_start + needed; +} + +static int table_deps(struct dm_ioctl *param, size_t param_size) +{ + int r; + struct mapped_device *md; + struct dm_table *table; + + md = find_device(param); + if (!md) + return -ENXIO; + + r = __dev_status(md, param); + if (r) + goto out; + + table = dm_get_table(md); + if (table) { + retrieve_deps(table, param, param_size); + dm_table_put(table); + } + + out: + dm_put(md); + return r; +} + +/* + * Return the status of a device as a text string for each + * target. + */ +static int table_status(struct dm_ioctl *param, size_t param_size) +{ + int r; + struct mapped_device *md; + struct dm_table *table; + + md = find_device(param); + if (!md) + return -ENXIO; + + r = __dev_status(md, param); + if (r) + goto out; + + table = dm_get_table(md); + if (table) { + retrieve_status(table, param, param_size); + dm_table_put(table); + } + + out: + dm_put(md); + return r; +} + +/*----------------------------------------------------------------- + * Implementation of open/close/ioctl on the special char + * device. + *---------------------------------------------------------------*/ +static ioctl_fn lookup_ioctl(unsigned int cmd) +{ + static struct { + int cmd; + ioctl_fn fn; + } _ioctls[] = { + {DM_VERSION_CMD, NULL}, /* version is dealt with elsewhere */ + {DM_REMOVE_ALL_CMD, remove_all}, + {DM_LIST_DEVICES_CMD, list_devices}, + + {DM_DEV_CREATE_CMD, dev_create}, + {DM_DEV_REMOVE_CMD, dev_remove}, + {DM_DEV_RENAME_CMD, dev_rename}, + {DM_DEV_SUSPEND_CMD, dev_suspend}, + {DM_DEV_STATUS_CMD, dev_status}, + {DM_DEV_WAIT_CMD, dev_wait}, + + {DM_TABLE_LOAD_CMD, table_load}, + {DM_TABLE_CLEAR_CMD, table_clear}, + {DM_TABLE_DEPS_CMD, table_deps}, + {DM_TABLE_STATUS_CMD, table_status} + }; + + return (cmd >= ARRAY_SIZE(_ioctls)) ? NULL : _ioctls[cmd].fn; +} + +/* + * As well as checking the version compatibility this always + * copies the kernel interface version out. + */ +static int check_version(unsigned int cmd, struct dm_ioctl *user) +{ + uint32_t version[3]; + int r = 0; + + if (copy_from_user(version, user->version, sizeof(version))) + return -EFAULT; + + if ((DM_VERSION_MAJOR != version[0]) || + (DM_VERSION_MINOR < version[1])) { + DMWARN("ioctl interface mismatch: " + "kernel(%u.%u.%u), user(%u.%u.%u), cmd(%d)", + DM_VERSION_MAJOR, DM_VERSION_MINOR, + DM_VERSION_PATCHLEVEL, + version[0], version[1], version[2], cmd); + r = -EINVAL; + } + + /* + * Fill in the kernel version. + */ + version[0] = DM_VERSION_MAJOR; + version[1] = DM_VERSION_MINOR; + version[2] = DM_VERSION_PATCHLEVEL; + if (copy_to_user(user->version, version, sizeof(version))) + return -EFAULT; + + return r; +} + +static void free_params(struct dm_ioctl *param) +{ + vfree(param); +} + +static int copy_params(struct dm_ioctl *user, struct dm_ioctl **param) +{ + struct dm_ioctl tmp, *dmi; + + if (copy_from_user(&tmp, user, sizeof(tmp))) + return -EFAULT; + + if (tmp.data_size < sizeof(tmp)) + return -EINVAL; + + dmi = (struct dm_ioctl *) vmalloc(tmp.data_size); + if (!dmi) + return -ENOMEM; + + if (copy_from_user(dmi, user, tmp.data_size)) { + vfree(dmi); + return -EFAULT; + } + + *param = dmi; + return 0; +} + +static int validate_params(uint cmd, struct dm_ioctl *param) +{ + /* Always clear this flag */ + param->flags &= ~DM_BUFFER_FULL_FLAG; + + /* Ignores parameters */ + if (cmd == DM_REMOVE_ALL_CMD || cmd == DM_LIST_DEVICES_CMD) + return 0; + + /* Unless creating, either name or uuid but not both */ + if (cmd != DM_DEV_CREATE_CMD) { + if ((!*param->uuid && !*param->name) || + (*param->uuid && *param->name)) { + DMWARN("one of name or uuid must be supplied, cmd(%u)", + cmd); + return -EINVAL; + } + } + + /* Ensure strings are terminated */ + param->name[DM_NAME_LEN - 1] = '\0'; + param->uuid[DM_UUID_LEN - 1] = '\0'; + + return 0; +} + +static int ctl_ioctl(struct inode *inode, struct file *file, + uint command, ulong u) +{ + int r = 0; + unsigned int cmd; + struct dm_ioctl *param; + struct dm_ioctl *user = (struct dm_ioctl *) u; + ioctl_fn fn = NULL; + size_t param_size; + + /* only root can play with this */ + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + + if (_IOC_TYPE(command) != DM_IOCTL) + return -ENOTTY; + + cmd = _IOC_NR(command); + + /* + * Check the interface version passed in. This also + * writes out the kernel's interface version. + */ + r = check_version(cmd, user); + if (r) + return r; + + /* + * Nothing more to do for the version command. + */ + if (cmd == DM_VERSION_CMD) + return 0; + + fn = lookup_ioctl(cmd); + if (!fn) { + DMWARN("dm_ctl_ioctl: unknown command 0x%x", command); + return -ENOTTY; + } + + /* + * FIXME: I don't like this, we're trying to avoid low + * memory issues when a device is suspended. + */ + current->flags |= PF_MEMALLOC; + + /* + * Copy the parameters into kernel space. + */ + r = copy_params(user, ¶m); + if (r) { + current->flags &= ~PF_MEMALLOC; + return r; + } + + r = validate_params(cmd, param); + if (r) + goto out; + + param_size = param->data_size; + param->data_size = sizeof(*param); + r = fn(param, param_size); + + /* + * Copy the results back to userland. + */ + if (!r && copy_to_user(user, param, param->data_size)) + r = -EFAULT; + + out: + free_params(param); + current->flags &= ~PF_MEMALLOC; + return r; +} + +static struct file_operations _ctl_fops = { + .ioctl = ctl_ioctl, + .owner = THIS_MODULE, +}; + +static devfs_handle_t _ctl_handle; + +static struct miscdevice _dm_misc = { + .minor = MISC_DYNAMIC_MINOR, + .name = DM_NAME, + .fops = &_ctl_fops +}; + +/* + * Create misc character device and link to DM_DIR/control. + */ +int __init dm_interface_init(void) +{ + int r; + char rname[64]; + + r = dm_hash_init(); + if (r) + return r; + + r = misc_register(&_dm_misc); + if (r) { + DMERR("misc_register failed for control device"); + dm_hash_exit(); + return r; + } + + r = devfs_generate_path(_dm_misc.devfs_handle, rname + 3, + sizeof rname - 3); + if (r == -ENOSYS) + goto done; /* devfs not present */ + + if (r < 0) { + DMERR("devfs_generate_path failed for control device"); + goto failed; + } + + strncpy(rname + r, "../", 3); + r = devfs_mk_symlink(NULL, DM_DIR "/control", + DEVFS_FL_DEFAULT, rname + r, &_ctl_handle, NULL); + if (r) { + DMERR("devfs_mk_symlink failed for control device"); + goto failed; + } + devfs_auto_unregister(_dm_misc.devfs_handle, _ctl_handle); + + done: + DMINFO("%d.%d.%d%s initialised: %s", DM_VERSION_MAJOR, + DM_VERSION_MINOR, DM_VERSION_PATCHLEVEL, DM_VERSION_EXTRA, + DM_DRIVER_EMAIL); + return 0; + + failed: + misc_deregister(&_dm_misc); + dm_hash_exit(); + return r; +} + +void dm_interface_exit(void) +{ + if (misc_deregister(&_dm_misc) < 0) + DMERR("misc_deregister failed for control device"); + + dm_hash_exit(); +} Index: drivers/md/dm-linear.c =================================================================== RCS file: drivers/md/dm-linear.c diff -N drivers/md/dm-linear.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ b/drivers/md/dm-linear.c 17 Apr 2004 00:42:43 -0000 1.1.1.1.10.1 @@ -0,0 +1,123 @@ +/* + * Copyright (C) 2001 Sistina Software (UK) Limited. + * + * This file is released under the GPL. + */ + +#include "dm.h" + +#include +#include +#include +#include + +/* + * Linear: maps a linear range of a device. + */ +struct linear_c { + struct dm_dev *dev; + sector_t start; +}; + +/* + * Construct a linear mapping: + */ +static int linear_ctr(struct dm_target *ti, unsigned int argc, char **argv) +{ + struct linear_c *lc; + + if (argc != 2) { + ti->error = "dm-linear: Invalid argument count"; + return -EINVAL; + } + + lc = kmalloc(sizeof(*lc), GFP_KERNEL); + if (lc == NULL) { + ti->error = "dm-linear: Cannot allocate linear context"; + return -ENOMEM; + } + + if (sscanf(argv[1], SECTOR_FORMAT, &lc->start) != 1) { + ti->error = "dm-linear: Invalid device sector"; + goto bad; + } + + if (dm_get_device(ti, argv[0], lc->start, ti->len, + dm_table_get_mode(ti->table), &lc->dev)) { + ti->error = "dm-linear: Device lookup failed"; + goto bad; + } + + ti->private = lc; + return 0; + + bad: + kfree(lc); + return -EINVAL; +} + +static void linear_dtr(struct dm_target *ti) +{ + struct linear_c *lc = (struct linear_c *) ti->private; + + dm_put_device(ti, lc->dev); + kfree(lc); +} + +static int linear_map(struct dm_target *ti, struct buffer_head *bh, int rw, + union map_info *map_context) +{ + struct linear_c *lc = (struct linear_c *) ti->private; + + bh->b_rdev = lc->dev->dev; + bh->b_rsector = lc->start + (bh->b_rsector - ti->begin); + + return 1; +} + +static int linear_status(struct dm_target *ti, status_type_t type, + char *result, unsigned int maxlen) +{ + struct linear_c *lc = (struct linear_c *) ti->private; + kdev_t kdev; + + switch (type) { + case STATUSTYPE_INFO: + result[0] = '\0'; + break; + + case STATUSTYPE_TABLE: + kdev = to_kdev_t(lc->dev->bdev->bd_dev); + snprintf(result, maxlen, "%s " SECTOR_FORMAT, + dm_kdevname(kdev), lc->start); + break; + } + return 0; +} + +static struct target_type linear_target = { + .name = "linear", + .module = THIS_MODULE, + .ctr = linear_ctr, + .dtr = linear_dtr, + .map = linear_map, + .status = linear_status, +}; + +int __init dm_linear_init(void) +{ + int r = dm_register_target(&linear_target); + + if (r < 0) + DMERR("linear: register failed %d", r); + + return r; +} + +void dm_linear_exit(void) +{ + int r = dm_unregister_target(&linear_target); + + if (r < 0) + DMERR("linear: unregister failed %d", r); +} Index: drivers/md/dm-stripe.c =================================================================== RCS file: drivers/md/dm-stripe.c diff -N drivers/md/dm-stripe.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ b/drivers/md/dm-stripe.c 17 Apr 2004 00:42:43 -0000 1.1.1.1.10.1 @@ -0,0 +1,258 @@ +/* + * Copyright (C) 2001 Sistina Software (UK) Limited. + * + * This file is released under the GPL. + */ + +#include "dm.h" + +#include +#include +#include +#include + +struct stripe { + struct dm_dev *dev; + sector_t physical_start; +}; + +struct stripe_c { + uint32_t stripes; + + /* The size of this target / num. stripes */ + uint32_t stripe_width; + + /* stripe chunk size */ + uint32_t chunk_shift; + sector_t chunk_mask; + + struct stripe stripe[0]; +}; + +static inline struct stripe_c *alloc_context(unsigned int stripes) +{ + size_t len; + + if (array_too_big(sizeof(struct stripe_c), sizeof(struct stripe), + stripes)) + return NULL; + + len = sizeof(struct stripe_c) + (sizeof(struct stripe) * stripes); + + return kmalloc(len, GFP_KERNEL); +} + +/* + * Parse a single pair + */ +static int get_stripe(struct dm_target *ti, struct stripe_c *sc, + unsigned int stripe, char **argv) +{ + sector_t start; + + if (sscanf(argv[1], SECTOR_FORMAT, &start) != 1) + return -EINVAL; + + if (dm_get_device(ti, argv[0], start, sc->stripe_width, + dm_table_get_mode(ti->table), + &sc->stripe[stripe].dev)) + return -ENXIO; + + sc->stripe[stripe].physical_start = start; + return 0; +} + +/* + * FIXME: Nasty function, only present because we can't link + * against __moddi3 and __divdi3. + * + * returns a == b * n + */ +static int multiple(sector_t a, sector_t b, sector_t *n) +{ + sector_t acc, prev, i; + + *n = 0; + while (a >= b) { + for (acc = b, prev = 0, i = 1; + acc <= a; + prev = acc, acc <<= 1, i <<= 1) + ; + + a -= prev; + *n += i >> 1; + } + + return a == 0; +} + +/* + * Construct a striped mapping. + * [ ]+ + */ +static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv) +{ + struct stripe_c *sc; + sector_t width; + uint32_t stripes; + uint32_t chunk_size; + char *end; + int r; + unsigned int i; + + if (argc < 2) { + ti->error = "dm-stripe: Not enough arguments"; + return -EINVAL; + } + + stripes = simple_strtoul(argv[0], &end, 10); + if (*end) { + ti->error = "dm-stripe: Invalid stripe count"; + return -EINVAL; + } + + chunk_size = simple_strtoul(argv[1], &end, 10); + if (*end) { + ti->error = "dm-stripe: Invalid chunk_size"; + return -EINVAL; + } + + /* + * chunk_size is a power of two + */ + if (!chunk_size || (chunk_size & (chunk_size - 1))) { + ti->error = "dm-stripe: Invalid chunk size"; + return -EINVAL; + } + + if (!multiple(ti->len, stripes, &width)) { + ti->error = "dm-stripe: Target length not divisable by " + "number of stripes"; + return -EINVAL; + } + + /* + * Do we have enough arguments for that many stripes ? + */ + if (argc != (2 + 2 * stripes)) { + ti->error = "dm-stripe: Not enough destinations specified"; + return -EINVAL; + } + + sc = alloc_context(stripes); + if (!sc) { + ti->error = "dm-stripe: Memory allocation for striped context " + "failed"; + return -ENOMEM; + } + + sc->stripes = stripes; + sc->stripe_width = width; + + sc->chunk_mask = ((sector_t) chunk_size) - 1; + for (sc->chunk_shift = 0; chunk_size; sc->chunk_shift++) + chunk_size >>= 1; + sc->chunk_shift--; + + /* + * Get the stripe destinations. + */ + for (i = 0; i < stripes; i++) { + argv += 2; + + r = get_stripe(ti, sc, i, argv); + if (r < 0) { + ti->error = "dm-stripe: Couldn't parse stripe " + "destination"; + while (i--) + dm_put_device(ti, sc->stripe[i].dev); + kfree(sc); + return r; + } + } + + ti->private = sc; + return 0; +} + +static void stripe_dtr(struct dm_target *ti) +{ + unsigned int i; + struct stripe_c *sc = (struct stripe_c *) ti->private; + + for (i = 0; i < sc->stripes; i++) + dm_put_device(ti, sc->stripe[i].dev); + + kfree(sc); +} + +static int stripe_map(struct dm_target *ti, struct buffer_head *bh, int rw, + union map_info *context) +{ + struct stripe_c *sc = (struct stripe_c *) ti->private; + + sector_t offset = bh->b_rsector - ti->begin; + uint32_t chunk = (uint32_t) (offset >> sc->chunk_shift); + uint32_t stripe = chunk % sc->stripes; /* 32bit modulus */ + chunk = chunk / sc->stripes; + + bh->b_rdev = sc->stripe[stripe].dev->dev; + bh->b_rsector = sc->stripe[stripe].physical_start + + (chunk << sc->chunk_shift) + (offset & sc->chunk_mask); + return 1; +} + +static int stripe_status(struct dm_target *ti, status_type_t type, + char *result, unsigned int maxlen) +{ + struct stripe_c *sc = (struct stripe_c *) ti->private; + int offset; + unsigned int i; + + switch (type) { + case STATUSTYPE_INFO: + result[0] = '\0'; + break; + + case STATUSTYPE_TABLE: + offset = snprintf(result, maxlen, "%d " SECTOR_FORMAT, + sc->stripes, sc->chunk_mask + 1); + for (i = 0; i < sc->stripes; i++) { + offset += + snprintf(result + offset, maxlen - offset, + " %s " SECTOR_FORMAT, + dm_kdevname(to_kdev_t(sc->stripe[i].dev->bdev->bd_dev)), + sc->stripe[i].physical_start); + } + break; + } + return 0; +} + +static struct target_type stripe_target = { + .name = "striped", + .module = THIS_MODULE, + .ctr = stripe_ctr, + .dtr = stripe_dtr, + .map = stripe_map, + .status = stripe_status, +}; + +int __init dm_stripe_init(void) +{ + int r; + + r = dm_register_target(&stripe_target); + if (r < 0) + DMWARN("striped target registration failed"); + + return r; +} + +void dm_stripe_exit(void) +{ + if (dm_unregister_target(&stripe_target)) + DMWARN("striped target unregistration failed"); + + return; +} Index: drivers/md/dm-table.c =================================================================== RCS file: drivers/md/dm-table.c diff -N drivers/md/dm-table.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ b/drivers/md/dm-table.c 17 Apr 2004 00:42:43 -0000 1.1.1.1.10.1 @@ -0,0 +1,679 @@ +/* + * Copyright (C) 2001 Sistina Software (UK) Limited. + * + * This file is released under the GPL. + */ + +#include "dm.h" + +#include +#include +#include +#include +#include +#include + +#define MAX_DEPTH 16 +#define NODE_SIZE L1_CACHE_BYTES +#define KEYS_PER_NODE (NODE_SIZE / sizeof(sector_t)) +#define CHILDREN_PER_NODE (KEYS_PER_NODE + 1) + +struct dm_table { + atomic_t holders; + + /* btree table */ + unsigned int depth; + unsigned int counts[MAX_DEPTH]; /* in nodes */ + sector_t *index[MAX_DEPTH]; + + unsigned int num_targets; + unsigned int num_allocated; + sector_t *highs; + struct dm_target *targets; + + /* + * Indicates the rw permissions for the new logical + * device. This should be a combination of FMODE_READ + * and FMODE_WRITE. + */ + int mode; + + /* a list of devices used by this table */ + struct list_head devices; + + /* events get handed up using this callback */ + void (*event_fn)(void *); + void *event_context; +}; + +/* + * Similar to ceiling(log_size(n)) + */ +static unsigned int int_log(unsigned long n, unsigned long base) +{ + int result = 0; + + while (n > 1) { + n = dm_div_up(n, base); + result++; + } + + return result; +} + +/* + * Calculate the index of the child node of the n'th node k'th key. + */ +static inline unsigned int get_child(unsigned int n, unsigned int k) +{ + return (n * CHILDREN_PER_NODE) + k; +} + +/* + * Return the n'th node of level l from table t. + */ +static inline sector_t *get_node(struct dm_table *t, unsigned int l, + unsigned int n) +{ + return t->index[l] + (n * KEYS_PER_NODE); +} + +/* + * Return the highest key that you could lookup from the n'th + * node on level l of the btree. + */ +static sector_t high(struct dm_table *t, unsigned int l, unsigned int n) +{ + for (; l < t->depth - 1; l++) + n = get_child(n, CHILDREN_PER_NODE - 1); + + if (n >= t->counts[l]) + return (sector_t) - 1; + + return get_node(t, l, n)[KEYS_PER_NODE - 1]; +} + +/* + * Fills in a level of the btree based on the highs of the level + * below it. + */ +static int setup_btree_index(unsigned int l, struct dm_table *t) +{ + unsigned int n, k; + sector_t *node; + + for (n = 0U; n < t->counts[l]; n++) { + node = get_node(t, l, n); + + for (k = 0U; k < KEYS_PER_NODE; k++) + node[k] = high(t, l + 1, get_child(n, k)); + } + + return 0; +} + + + +int dm_table_create(struct dm_table **result, int mode, unsigned num_targets) +{ + struct dm_table *t = kmalloc(sizeof(*t), GFP_KERNEL); + + if (!t) + return -ENOMEM; + + memset(t, 0, sizeof(*t)); + INIT_LIST_HEAD(&t->devices); + atomic_set(&t->holders, 1); + + num_targets = dm_round_up(num_targets, KEYS_PER_NODE); + + /* Allocate both the target array and offset array at once. */ + t->highs = (sector_t *) vcalloc(sizeof(struct dm_target) + + sizeof(sector_t), num_targets); + if (!t->highs) { + kfree(t); + return -ENOMEM; + } + + memset(t->highs, -1, sizeof(*t->highs) * num_targets); + + t->targets = (struct dm_target *) (t->highs + num_targets); + t->num_allocated = num_targets; + t->mode = mode; + *result = t; + return 0; +} + +static void free_devices(struct list_head *devices) +{ + struct list_head *tmp, *next; + + for (tmp = devices->next; tmp != devices; tmp = next) { + struct dm_dev *dd = list_entry(tmp, struct dm_dev, list); + next = tmp->next; + kfree(dd); + } +} + +void table_destroy(struct dm_table *t) +{ + unsigned int i; + + /* free the indexes (see dm_table_complete) */ + if (t->depth >= 2) + vfree(t->index[t->depth - 2]); + + /* free the targets */ + for (i = 0; i < t->num_targets; i++) { + struct dm_target *tgt = t->targets + i; + + if (tgt->type->dtr) + tgt->type->dtr(tgt); + + dm_put_target_type(tgt->type); + } + + vfree(t->highs); + + /* free the device list */ + if (t->devices.next != &t->devices) { + DMWARN("devices still present during destroy: " + "dm_table_remove_device calls missing"); + + free_devices(&t->devices); + } + + kfree(t); +} + +void dm_table_get(struct dm_table *t) +{ + atomic_inc(&t->holders); +} + +void dm_table_put(struct dm_table *t) +{ + if (atomic_dec_and_test(&t->holders)) + table_destroy(t); +} + +/* + * Convert a device path to a dev_t. + */ +static int lookup_device(const char *path, kdev_t *dev) +{ + int r; + struct nameidata nd; + struct inode *inode; + + if (!path_init(path, LOOKUP_FOLLOW, &nd)) + return 0; + + if ((r = path_walk(path, &nd))) + goto out; + + inode = nd.dentry->d_inode; + if (!inode) { + r = -ENOENT; + goto out; + } + + if (!S_ISBLK(inode->i_mode)) { + r = -ENOTBLK; + goto out; + } + + *dev = inode->i_rdev; + + out: + path_release(&nd); + return r; +} + +/* + * See if we've already got a device in the list. + */ +static struct dm_dev *find_device(struct list_head *l, kdev_t dev) +{ + struct list_head *tmp; + + list_for_each(tmp, l) { + struct dm_dev *dd = list_entry(tmp, struct dm_dev, list); + if (kdev_same(dd->dev, dev)) + return dd; + } + + return NULL; +} + +/* + * Open a device so we can use it as a map destination. + */ +static int open_dev(struct dm_dev *dd) +{ + if (dd->bdev) + BUG(); + + dd->bdev = bdget(kdev_t_to_nr(dd->dev)); + if (!dd->bdev) + return -ENOMEM; + + return blkdev_get(dd->bdev, dd->mode, 0, BDEV_RAW); +} + +/* + * Close a device that we've been using. + */ +static void close_dev(struct dm_dev *dd) +{ + if (!dd->bdev) + return; + + blkdev_put(dd->bdev, BDEV_RAW); + dd->bdev = NULL; +} + +/* + * If possible (ie. blk_size[major] is set), this checks an area + * of a destination device is valid. + */ +static int check_device_area(kdev_t dev, sector_t start, sector_t len) +{ + int *sizes; + sector_t dev_size; + + if (!(sizes = blk_size[major(dev)]) || !(dev_size = sizes[minor(dev)])) + /* we don't know the device details, + * so give the benefit of the doubt */ + return 1; + + /* convert to 512-byte sectors */ + dev_size <<= 1; + + return ((start < dev_size) && (len <= (dev_size - start))); +} + +/* + * This upgrades the mode on an already open dm_dev. Being + * careful to leave things as they were if we fail to reopen the + * device. + */ +static int upgrade_mode(struct dm_dev *dd, int new_mode) +{ + int r; + struct dm_dev dd_copy; + + memcpy(&dd_copy, dd, sizeof(dd_copy)); + + dd->mode |= new_mode; + dd->bdev = NULL; + r = open_dev(dd); + if (!r) + close_dev(&dd_copy); + else + memcpy(dd, &dd_copy, sizeof(dd_copy)); + + return r; +} + +/* + * Add a device to the list, or just increment the usage count if + * it's already present. + */ +int dm_get_device(struct dm_target *ti, const char *path, sector_t start, + sector_t len, int mode, struct dm_dev **result) +{ + int r; + kdev_t dev; + struct dm_dev *dd; + unsigned major, minor; + struct dm_table *t = ti->table; + + if (!t) + BUG(); + + if (sscanf(path, "%u:%u", &major, &minor) == 2) { + /* Extract the major/minor numbers */ + dev = mk_kdev(major, minor); + } else { + /* convert the path to a device */ + if ((r = lookup_device(path, &dev))) + return r; + } + + dd = find_device(&t->devices, dev); + if (!dd) { + dd = kmalloc(sizeof(*dd), GFP_KERNEL); + if (!dd) + return -ENOMEM; + + dd->dev = dev; + dd->mode = mode; + dd->bdev = NULL; + + if ((r = open_dev(dd))) { + kfree(dd); + return r; + } + + atomic_set(&dd->count, 0); + list_add(&dd->list, &t->devices); + + } else if (dd->mode != (mode | dd->mode)) { + r = upgrade_mode(dd, mode); + if (r) + return r; + } + atomic_inc(&dd->count); + + if (!check_device_area(dd->dev, start, len)) { + DMWARN("device %s too small for target", path); + dm_put_device(ti, dd); + return -EINVAL; + } + + *result = dd; + + return 0; +} + +/* + * Decrement a devices use count and remove it if neccessary. + */ +void dm_put_device(struct dm_target *ti, struct dm_dev *dd) +{ + if (atomic_dec_and_test(&dd->count)) { + close_dev(dd); + list_del(&dd->list); + kfree(dd); + } +} + +/* + * Checks to see if the target joins onto the end of the table. + */ +static int adjoin(struct dm_table *table, struct dm_target *ti) +{ + struct dm_target *prev; + + if (!table->num_targets) + return !ti->begin; + + prev = &table->targets[table->num_targets - 1]; + return (ti->begin == (prev->begin + prev->len)); +} + +/* + * Used to dynamically allocate the arg array. + */ +static char **realloc_argv(unsigned *array_size, char **old_argv) +{ + char **argv; + unsigned new_size; + + new_size = *array_size ? *array_size * 2 : 64; + argv = kmalloc(new_size * sizeof(*argv), GFP_KERNEL); + if (argv) { + memcpy(argv, old_argv, *array_size * sizeof(*argv)); + *array_size = new_size; + } + + kfree(old_argv); + return argv; +} + +/* + * Destructively splits up the argument list to pass to ctr. + */ +static int split_args(int *argc, char ***argvp, char *input) +{ + char *start, *end = input, *out, **argv = NULL; + unsigned array_size = 0; + + *argc = 0; + argv = realloc_argv(&array_size, argv); + if (!argv) + return -ENOMEM; + + while (1) { + start = end; + + /* Skip whitespace */ + while (*start && isspace(*start)) + start++; + + if (!*start) + break; /* success, we hit the end */ + + /* 'out' is used to remove any back-quotes */ + end = out = start; + while (*end) { + /* Everything apart from '\0' can be quoted */ + if (*end == '\\' && *(end + 1)) { + *out++ = *(end + 1); + end += 2; + continue; + } + + if (isspace(*end)) + break; /* end of token */ + + *out++ = *end++; + } + + /* have we already filled the array ? */ + if ((*argc + 1) > array_size) { + argv = realloc_argv(&array_size, argv); + if (!argv) + return -ENOMEM; + } + + /* we know this is whitespace */ + if (*end) + end++; + + /* terminate the string and put it in the array */ + *out = '\0'; + argv[*argc] = start; + (*argc)++; + } + + *argvp = argv; + return 0; +} + +int dm_table_add_target(struct dm_table *t, const char *type, + sector_t start, sector_t len, char *params) +{ + int r = -EINVAL, argc; + char **argv; + struct dm_target *tgt; + + if (t->num_targets >= t->num_allocated) + return -ENOMEM; + + tgt = t->targets + t->num_targets; + memset(tgt, 0, sizeof(*tgt)); + + tgt->type = dm_get_target_type(type); + if (!tgt->type) { + tgt->error = "unknown target type"; + return -EINVAL; + } + + tgt->table = t; + tgt->begin = start; + tgt->len = len; + tgt->error = "Unknown error"; + + /* + * Does this target adjoin the previous one ? + */ + if (!adjoin(t, tgt)) { + tgt->error = "Gap in table"; + r = -EINVAL; + goto bad; + } + + r = split_args(&argc, &argv, params); + if (r) { + tgt->error = "couldn't split parameters (insufficient memory)"; + goto bad; + } + + r = tgt->type->ctr(tgt, argc, argv); + kfree(argv); + if (r) + goto bad; + + t->highs[t->num_targets++] = tgt->begin + tgt->len - 1; + return 0; + + bad: + printk(KERN_ERR DM_NAME ": %s\n", tgt->error); + dm_put_target_type(tgt->type); + return r; +} + +static int setup_indexes(struct dm_table *t) +{ + int i; + unsigned int total = 0; + sector_t *indexes; + + /* allocate the space for *all* the indexes */ + for (i = t->depth - 2; i >= 0; i--) { + t->counts[i] = dm_div_up(t->counts[i + 1], CHILDREN_PER_NODE); + total += t->counts[i]; + } + + indexes = (sector_t *) vcalloc(total, (unsigned long) NODE_SIZE); + if (!indexes) + return -ENOMEM; + + /* set up internal nodes, bottom-up */ + for (i = t->depth - 2, total = 0; i >= 0; i--) { + t->index[i] = indexes; + indexes += (KEYS_PER_NODE * t->counts[i]); + setup_btree_index(i, t); + } + + return 0; +} + +/* + * Builds the btree to index the map. + */ +int dm_table_complete(struct dm_table *t) +{ + int r = 0; + unsigned int leaf_nodes; + + /* how many indexes will the btree have ? */ + leaf_nodes = dm_div_up(t->num_targets, KEYS_PER_NODE); + t->depth = 1 + int_log(leaf_nodes, CHILDREN_PER_NODE); + + /* leaf layer has already been set up */ + t->counts[t->depth - 1] = leaf_nodes; + t->index[t->depth - 1] = t->highs; + + if (t->depth >= 2) + r = setup_indexes(t); + + return r; +} + +static spinlock_t _event_lock = SPIN_LOCK_UNLOCKED; +void dm_table_event_callback(struct dm_table *t, + void (*fn)(void *), void *context) +{ + spin_lock_irq(&_event_lock); + t->event_fn = fn; + t->event_context = context; + spin_unlock_irq(&_event_lock); +} + +void dm_table_event(struct dm_table *t) +{ + spin_lock(&_event_lock); + if (t->event_fn) + t->event_fn(t->event_context); + spin_unlock(&_event_lock); +} + +sector_t dm_table_get_size(struct dm_table *t) +{ + return t->num_targets ? (t->highs[t->num_targets - 1] + 1) : 0; +} + +struct dm_target *dm_table_get_target(struct dm_table *t, unsigned int index) +{ + if (index > t->num_targets) + return NULL; + + return t->targets + index; +} + +/* + * Search the btree for the correct target. + */ +struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector) +{ + unsigned int l, n = 0, k = 0; + sector_t *node; + + for (l = 0; l < t->depth; l++) { + n = get_child(n, k); + node = get_node(t, l, n); + + for (k = 0; k < KEYS_PER_NODE; k++) + if (node[k] >= sector) + break; + } + + return &t->targets[(KEYS_PER_NODE * n) + k]; +} + +unsigned int dm_table_get_num_targets(struct dm_table *t) +{ + return t->num_targets; +} + +struct list_head *dm_table_get_devices(struct dm_table *t) +{ + return &t->devices; +} + +int dm_table_get_mode(struct dm_table *t) +{ + return t->mode; +} + +void dm_table_suspend_targets(struct dm_table *t) +{ + int i; + + for (i = 0; i < t->num_targets; i++) { + struct dm_target *ti = t->targets + i; + + if (ti->type->suspend) + ti->type->suspend(ti); + } +} + +void dm_table_resume_targets(struct dm_table *t) +{ + int i; + + for (i = 0; i < t->num_targets; i++) { + struct dm_target *ti = t->targets + i; + + if (ti->type->resume) + ti->type->resume(ti); + } +} + +EXPORT_SYMBOL(dm_get_device); +EXPORT_SYMBOL(dm_put_device); +EXPORT_SYMBOL(dm_table_event); +EXPORT_SYMBOL(dm_table_get_mode); Index: drivers/md/dm-target.c =================================================================== RCS file: drivers/md/dm-target.c diff -N drivers/md/dm-target.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ b/drivers/md/dm-target.c 17 Apr 2004 00:42:43 -0000 1.1.1.1.10.1 @@ -0,0 +1,188 @@ +/* + * Copyright (C) 2001 Sistina Software (UK) Limited + * + * This file is released under the GPL. + */ + +#include "dm.h" + +#include +#include +#include + +struct tt_internal { + struct target_type tt; + + struct list_head list; + long use; +}; + +static LIST_HEAD(_targets); +static DECLARE_RWSEM(_lock); + +#define DM_MOD_NAME_SIZE 32 + +static inline struct tt_internal *__find_target_type(const char *name) +{ + struct list_head *tih; + struct tt_internal *ti; + + list_for_each(tih, &_targets) { + ti = list_entry(tih, struct tt_internal, list); + + if (!strcmp(name, ti->tt.name)) + return ti; + } + + return NULL; +} + +static struct tt_internal *get_target_type(const char *name) +{ + struct tt_internal *ti; + + down_read(&_lock); + ti = __find_target_type(name); + + if (ti) { + if (ti->use == 0 && ti->tt.module) + __MOD_INC_USE_COUNT(ti->tt.module); + ti->use++; + } + up_read(&_lock); + + return ti; +} + +static void load_module(const char *name) +{ + char module_name[DM_MOD_NAME_SIZE] = "dm-"; + + /* Length check for strcat() below */ + if (strlen(name) > (DM_MOD_NAME_SIZE - 4)) + return; + + strcat(module_name, name); + request_module(module_name); +} + +struct target_type *dm_get_target_type(const char *name) +{ + struct tt_internal *ti = get_target_type(name); + + if (!ti) { + load_module(name); + ti = get_target_type(name); + } + + return ti ? &ti->tt : NULL; +} + +void dm_put_target_type(struct target_type *t) +{ + struct tt_internal *ti = (struct tt_internal *) t; + + down_read(&_lock); + if (--ti->use == 0 && ti->tt.module) + __MOD_DEC_USE_COUNT(ti->tt.module); + + if (ti->use < 0) + BUG(); + up_read(&_lock); + + return; +} + +static struct tt_internal *alloc_target(struct target_type *t) +{ + struct tt_internal *ti = kmalloc(sizeof(*ti), GFP_KERNEL); + + if (ti) { + memset(ti, 0, sizeof(*ti)); + ti->tt = *t; + } + + return ti; +} + +int dm_register_target(struct target_type *t) +{ + int rv = 0; + struct tt_internal *ti = alloc_target(t); + + if (!ti) + return -ENOMEM; + + down_write(&_lock); + if (__find_target_type(t->name)) { + kfree(ti); + rv = -EEXIST; + } else + list_add(&ti->list, &_targets); + + up_write(&_lock); + return rv; +} + +int dm_unregister_target(struct target_type *t) +{ + struct tt_internal *ti; + + down_write(&_lock); + if (!(ti = __find_target_type(t->name))) { + up_write(&_lock); + return -EINVAL; + } + + if (ti->use) { + up_write(&_lock); + return -ETXTBSY; + } + + list_del(&ti->list); + kfree(ti); + + up_write(&_lock); + return 0; +} + +/* + * io-err: always fails an io, useful for bringing + * up LVs that have holes in them. + */ +static int io_err_ctr(struct dm_target *ti, unsigned int argc, char **args) +{ + return 0; +} + +static void io_err_dtr(struct dm_target *ti) +{ + /* empty */ +} + +static int io_err_map(struct dm_target *ti, struct buffer_head *bh, int rw, + union map_info *map_context) +{ + return -EIO; +} + +static struct target_type error_target = { + .name = "error", + .ctr = io_err_ctr, + .dtr = io_err_dtr, + .map = io_err_map, +}; + +int dm_target_init(void) +{ + return dm_register_target(&error_target); +} + +void dm_target_exit(void) +{ + if (dm_unregister_target(&error_target)) + DMWARN("error target unregistration failed"); +} + +EXPORT_SYMBOL(dm_register_target); +EXPORT_SYMBOL(dm_unregister_target); Index: drivers/md/dm.c =================================================================== RCS file: drivers/md/dm.c diff -N drivers/md/dm.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ b/drivers/md/dm.c 17 Apr 2004 00:42:43 -0000 1.1.1.2.8.1 @@ -0,0 +1,1112 @@ +/* + * Copyright (C) 2001, 2002 Sistina Software (UK) Limited. + * + * This file is released under the GPL. + */ + +#include "dm.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +static const char *_name = DM_NAME; +#define DEFAULT_READ_AHEAD 64 + +struct dm_io { + struct mapped_device *md; + + struct dm_target *ti; + int rw; + union map_info map_context; + void (*end_io) (struct buffer_head * bh, int uptodate); + void *context; +}; + +struct deferred_io { + int rw; + struct buffer_head *bh; + struct deferred_io *next; +}; + +/* + * Bits for the md->flags field. + */ +#define DMF_BLOCK_IO 0 +#define DMF_SUSPENDED 1 + +struct mapped_device { + struct rw_semaphore lock; + atomic_t holders; + + kdev_t dev; + unsigned long flags; + + /* + * A list of ios that arrived while we were suspended. + */ + atomic_t pending; + wait_queue_head_t wait; + struct deferred_io *deferred; + + /* + * The current mapping. + */ + struct dm_table *map; + + /* + * io objects are allocated from here. + */ + mempool_t *io_pool; + + /* + * Event handling. + */ + uint32_t event_nr; + wait_queue_head_t eventq; +}; + +#define MIN_IOS 256 +static kmem_cache_t *_io_cache; + +static struct mapped_device *get_kdev(kdev_t dev); +static int dm_request(request_queue_t *q, int rw, struct buffer_head *bh); +static int dm_user_bmap(struct inode *inode, struct lv_bmap *lvb); + +/*----------------------------------------------------------------- + * In order to avoid the 256 minor number limit we are going to + * register more major numbers as neccessary. + *---------------------------------------------------------------*/ +#define MAX_MINORS (1 << MINORBITS) + +struct major_details { + unsigned int major; + + int transient; + struct list_head transient_list; + + unsigned int first_free_minor; + int nr_free_minors; + + struct mapped_device *mds[MAX_MINORS]; + int blk_size[MAX_MINORS]; + int blksize_size[MAX_MINORS]; + int hardsect_size[MAX_MINORS]; +}; + +static struct rw_semaphore _dev_lock; +static struct major_details *_majors[MAX_BLKDEV]; + +/* + * This holds a list of majors that non-specified device numbers + * may be allocated from. Only majors with free minors appear on + * this list. + */ +static LIST_HEAD(_transients_free); + +static int __alloc_major(unsigned int major, struct major_details **result) +{ + int r; + unsigned int transient = !major; + struct major_details *maj; + + /* Major already allocated? */ + if (major && _majors[major]) + return 0; + + maj = kmalloc(sizeof(*maj), GFP_KERNEL); + if (!maj) + return -ENOMEM; + + memset(maj, 0, sizeof(*maj)); + INIT_LIST_HEAD(&maj->transient_list); + + maj->nr_free_minors = MAX_MINORS; + + r = register_blkdev(major, _name, &dm_blk_dops); + if (r < 0) { + DMERR("register_blkdev failed for %d", major); + kfree(maj); + return r; + } + if (r > 0) + major = r; + + maj->major = major; + + if (transient) { + maj->transient = transient; + list_add_tail(&maj->transient_list, &_transients_free); + } + + _majors[major] = maj; + + blk_size[major] = maj->blk_size; + blksize_size[major] = maj->blksize_size; + hardsect_size[major] = maj->hardsect_size; + read_ahead[major] = DEFAULT_READ_AHEAD; + + blk_queue_make_request(BLK_DEFAULT_QUEUE(major), dm_request); + + *result = maj; + return 0; +} + +static void __free_major(struct major_details *maj) +{ + unsigned int major = maj->major; + + list_del(&maj->transient_list); + + read_ahead[major] = 0; + blk_size[major] = NULL; + blksize_size[major] = NULL; + hardsect_size[major] = NULL; + + _majors[major] = NULL; + kfree(maj); + + if (unregister_blkdev(major, _name) < 0) + DMERR("devfs_unregister_blkdev failed"); +} + +static void free_all_majors(void) +{ + unsigned int major = ARRAY_SIZE(_majors); + + down_write(&_dev_lock); + + while (major--) + if (_majors[major]) + __free_major(_majors[major]); + + up_write(&_dev_lock); +} + +static void free_dev(kdev_t dev) +{ + unsigned int major = major(dev); + unsigned int minor = minor(dev); + struct major_details *maj; + + down_write(&_dev_lock); + + maj = _majors[major]; + if (!maj) + goto out; + + maj->mds[minor] = NULL; + maj->nr_free_minors++; + + if (maj->nr_free_minors == MAX_MINORS) { + __free_major(maj); + goto out; + } + + if (!maj->transient) + goto out; + + if (maj->nr_free_minors == 1) + list_add_tail(&maj->transient_list, &_transients_free); + + if (minor < maj->first_free_minor) + maj->first_free_minor = minor; + + out: + up_write(&_dev_lock); +} + +static void __alloc_minor(struct major_details *maj, unsigned int minor, + struct mapped_device *md) +{ + maj->mds[minor] = md; + md->dev = mk_kdev(maj->major, minor); + maj->nr_free_minors--; + + if (maj->transient && !maj->nr_free_minors) + list_del_init(&maj->transient_list); +} + +/* + * See if requested kdev_t is available. + */ +static int specific_dev(kdev_t dev, struct mapped_device *md) +{ + int r = 0; + unsigned int major = major(dev); + unsigned int minor = minor(dev); + struct major_details *maj; + + if (!major || (major > MAX_BLKDEV) || (minor >= MAX_MINORS)) { + DMWARN("device number requested out of range (%d, %d)", + major, minor); + return -EINVAL; + } + + down_write(&_dev_lock); + maj = _majors[major]; + + /* Register requested major? */ + if (!maj) { + r = __alloc_major(major, &maj); + if (r) + goto out; + + major = maj->major; + } + + if (maj->mds[minor]) { + r = -EBUSY; + goto out; + } + + __alloc_minor(maj, minor, md); + + out: + up_write(&_dev_lock); + + return r; +} + +/* + * Find first unused device number, requesting a new major number if required. + */ +static int first_free_dev(struct mapped_device *md) +{ + int r = 0; + struct major_details *maj; + + down_write(&_dev_lock); + + if (list_empty(&_transients_free)) { + r = __alloc_major(0, &maj); + if (r) + goto out; + } else + maj = list_entry(_transients_free.next, struct major_details, + transient_list); + + while (maj->mds[maj->first_free_minor++]) + ; + + __alloc_minor(maj, maj->first_free_minor - 1, md); + + out: + up_write(&_dev_lock); + + return r; +} + +static struct mapped_device *get_kdev(kdev_t dev) +{ + struct mapped_device *md; + struct major_details *maj; + + down_read(&_dev_lock); + maj = _majors[major(dev)]; + if (!maj) { + md = NULL; + goto out; + } + md = maj->mds[minor(dev)]; + if (md) + dm_get(md); + out: + up_read(&_dev_lock); + + return md; +} + +/*----------------------------------------------------------------- + * init/exit code + *---------------------------------------------------------------*/ + +static __init int local_init(void) +{ + init_rwsem(&_dev_lock); + + /* allocate a slab for the dm_ios */ + _io_cache = kmem_cache_create("dm io", + sizeof(struct dm_io), 0, 0, NULL, NULL); + + if (!_io_cache) + return -ENOMEM; + + return 0; +} + +static void local_exit(void) +{ + kmem_cache_destroy(_io_cache); + free_all_majors(); + + DMINFO("cleaned up"); +} + +/* + * We have a lot of init/exit functions, so it seems easier to + * store them in an array. The disposable macro 'xx' + * expands a prefix into a pair of function names. + */ +static struct { + int (*init) (void); + void (*exit) (void); + +} _inits[] = { +#define xx(n) {n ## _init, n ## _exit}, + xx(local) + xx(dm_target) + xx(dm_linear) + xx(dm_stripe) + xx(dm_interface) +#undef xx +}; + +static int __init dm_init(void) +{ + const int count = ARRAY_SIZE(_inits); + + int r, i; + + for (i = 0; i < count; i++) { + r = _inits[i].init(); + if (r) + goto bad; + } + + return 0; + + bad: + while (i--) + _inits[i].exit(); + + return r; +} + +static void __exit dm_exit(void) +{ + int i = ARRAY_SIZE(_inits); + + while (i--) + _inits[i].exit(); +} + +/* + * Block device functions + */ +static int dm_blk_open(struct inode *inode, struct file *file) +{ + struct mapped_device *md; + + md = get_kdev(inode->i_rdev); + if (!md) + return -ENXIO; + + return 0; +} + +static int dm_blk_close(struct inode *inode, struct file *file) +{ + struct mapped_device *md; + + md = get_kdev(inode->i_rdev); + dm_put(md); /* put the reference gained by dm_blk_open */ + dm_put(md); + return 0; +} + +static inline struct dm_io *alloc_io(struct mapped_device *md) +{ + return mempool_alloc(md->io_pool, GFP_NOIO); +} + +static inline void free_io(struct mapped_device *md, struct dm_io *io) +{ + mempool_free(io, md->io_pool); +} + +static inline struct deferred_io *alloc_deferred(void) +{ + return kmalloc(sizeof(struct deferred_io), GFP_NOIO); +} + +static inline void free_deferred(struct deferred_io *di) +{ + kfree(di); +} + +static inline sector_t volume_size(kdev_t dev) +{ + return blk_size[major(dev)][minor(dev)] << 1; +} + +/* FIXME: check this */ +static int dm_blk_ioctl(struct inode *inode, struct file *file, + unsigned int command, unsigned long a) +{ + kdev_t dev = inode->i_rdev; + long size; + + switch (command) { + case BLKROSET: + case BLKROGET: + case BLKRASET: + case BLKRAGET: + case BLKFLSBUF: + case BLKSSZGET: + //case BLKRRPART: /* Re-read partition tables */ + //case BLKPG: + case BLKELVGET: + case BLKELVSET: + case BLKBSZGET: + case BLKBSZSET: + return blk_ioctl(dev, command, a); + break; + + case BLKGETSIZE: + size = volume_size(dev); + if (copy_to_user((void *) a, &size, sizeof(long))) + return -EFAULT; + break; + + case BLKGETSIZE64: + size = volume_size(dev); + if (put_user((u64) ((u64) size) << 9, (u64 *) a)) + return -EFAULT; + break; + + case BLKRRPART: + return -ENOTTY; + + case LV_BMAP: + return dm_user_bmap(inode, (struct lv_bmap *) a); + + default: + DMWARN("unknown block ioctl 0x%x", command); + return -ENOTTY; + } + + return 0; +} + +/* + * Add the buffer to the list of deferred io. + */ +static int queue_io(struct mapped_device *md, struct buffer_head *bh, int rw) +{ + struct deferred_io *di; + + di = alloc_deferred(); + if (!di) + return -ENOMEM; + + down_write(&md->lock); + + if (!test_bit(DMF_BLOCK_IO, &md->flags)) { + up_write(&md->lock); + free_deferred(di); + return 1; + } + + di->bh = bh; + di->rw = rw; + di->next = md->deferred; + md->deferred = di; + + up_write(&md->lock); + return 0; /* deferred successfully */ +} + +/* + * bh->b_end_io routine that decrements the pending count + * and then calls the original bh->b_end_io fn. + */ +static void dec_pending(struct buffer_head *bh, int uptodate) +{ + int r; + struct dm_io *io = bh->b_private; + dm_endio_fn endio = io->ti->type->end_io; + + if (endio) { + r = endio(io->ti, bh, io->rw, uptodate ? 0 : -EIO, + &io->map_context); + if (r < 0) + uptodate = 0; + + else if (r > 0) + /* the target wants another shot at the io */ + return; + } + + if (atomic_dec_and_test(&io->md->pending)) + /* nudge anyone waiting on suspend queue */ + wake_up(&io->md->wait); + + bh->b_end_io = io->end_io; + bh->b_private = io->context; + free_io(io->md, io); + + bh->b_end_io(bh, uptodate); +} + +/* + * Do the bh mapping for a given leaf + */ +static inline int __map_buffer(struct mapped_device *md, int rw, + struct buffer_head *bh, struct dm_io *io) +{ + struct dm_target *ti; + + if (!md->map) + return -EINVAL; + + ti = dm_table_find_target(md->map, bh->b_rsector); + if (!ti->type) + return -EINVAL; + + /* hook the end io request fn */ + atomic_inc(&md->pending); + io->md = md; + io->ti = ti; + io->rw = rw; + io->end_io = bh->b_end_io; + io->context = bh->b_private; + bh->b_end_io = dec_pending; + bh->b_private = io; + + return ti->type->map(ti, bh, rw, &io->map_context); +} + +/* + * Checks to see if we should be deferring io, if so it queues it + * and returns 1. + */ +static inline int __deferring(struct mapped_device *md, int rw, + struct buffer_head *bh) +{ + int r; + + /* + * If we're suspended we have to queue this io for later. + */ + while (test_bit(DMF_BLOCK_IO, &md->flags)) { + up_read(&md->lock); + + /* + * There's no point deferring a read ahead + * request, just drop it. + */ + if (rw == READA) { + down_read(&md->lock); + return -EIO; + } + + r = queue_io(md, bh, rw); + down_read(&md->lock); + + if (r < 0) + return r; + + if (r == 0) + return 1; /* deferred successfully */ + + } + + return 0; +} + +static int dm_request(request_queue_t *q, int rw, struct buffer_head *bh) +{ + int r; + struct dm_io *io; + struct mapped_device *md; + + md = get_kdev(bh->b_rdev); + if (!md) { + buffer_IO_error(bh); + return 0; + } + + io = alloc_io(md); + down_read(&md->lock); + + r = __deferring(md, rw, bh); + if (r < 0) + goto bad; + + else if (!r) { + /* not deferring */ + r = __map_buffer(md, rw, bh, io); + if (r < 0) + goto bad; + } else + r = 0; + + up_read(&md->lock); + dm_put(md); + return r; + + bad: + buffer_IO_error(bh); + up_read(&md->lock); + dm_put(md); + return 0; +} + +static int check_dev_size(kdev_t dev, unsigned long block) +{ + unsigned int major = major(dev); + unsigned int minor = minor(dev); + + /* FIXME: check this */ + unsigned long max_sector = (blk_size[major][minor] << 1) + 1; + unsigned long sector = (block + 1) * (blksize_size[major][minor] >> 9); + + return (sector > max_sector) ? 0 : 1; +} + +/* + * Creates a dummy buffer head and maps it (for lilo). + */ +static int __bmap(struct mapped_device *md, kdev_t dev, unsigned long block, + kdev_t *r_dev, unsigned long *r_block) +{ + struct buffer_head bh; + struct dm_target *ti; + union map_info map_context; + int r; + + if (test_bit(DMF_BLOCK_IO, &md->flags)) { + return -EPERM; + } + + if (!check_dev_size(dev, block)) { + return -EINVAL; + } + + if (!md->map) + return -EINVAL; + + /* setup dummy bh */ + memset(&bh, 0, sizeof(bh)); + bh.b_blocknr = block; + bh.b_dev = bh.b_rdev = dev; + bh.b_size = blksize_size[major(dev)][minor(dev)]; + bh.b_rsector = block * (bh.b_size >> 9); + + /* find target */ + ti = dm_table_find_target(md->map, bh.b_rsector); + + /* do the mapping */ + r = ti->type->map(ti, &bh, READ, &map_context); + ti->type->end_io(ti, &bh, READ, 0, &map_context); + + if (!r) { + *r_dev = bh.b_rdev; + *r_block = bh.b_rsector / (bh.b_size >> 9); + } + + return r; +} + +/* + * Marshals arguments and results between user and kernel space. + */ +static int dm_user_bmap(struct inode *inode, struct lv_bmap *lvb) +{ + struct mapped_device *md; + unsigned long block, r_block; + kdev_t r_dev; + int r; + + if (get_user(block, &lvb->lv_block)) + return -EFAULT; + + md = get_kdev(inode->i_rdev); + if (!md) + return -ENXIO; + + down_read(&md->lock); + r = __bmap(md, inode->i_rdev, block, &r_dev, &r_block); + up_read(&md->lock); + dm_put(md); + + if (!r && (put_user(kdev_t_to_nr(r_dev), &lvb->lv_dev) || + put_user(r_block, &lvb->lv_block))) + r = -EFAULT; + + return r; +} + +static void free_md(struct mapped_device *md) +{ + free_dev(md->dev); + mempool_destroy(md->io_pool); + kfree(md); +} + +/* + * Allocate and initialise a blank device with a given minor. + */ +static struct mapped_device *alloc_md(kdev_t dev) +{ + int r; + struct mapped_device *md = kmalloc(sizeof(*md), GFP_KERNEL); + + if (!md) { + DMWARN("unable to allocate device, out of memory."); + return NULL; + } + + memset(md, 0, sizeof(*md)); + + /* Allocate suitable device number */ + if (!dev) + r = first_free_dev(md); + else + r = specific_dev(dev, md); + + if (r) { + kfree(md); + return NULL; + } + + md->io_pool = mempool_create(MIN_IOS, mempool_alloc_slab, + mempool_free_slab, _io_cache); + if (!md->io_pool) { + free_md(md); + kfree(md); + return NULL; + } + + init_rwsem(&md->lock); + atomic_set(&md->holders, 1); + atomic_set(&md->pending, 0); + init_waitqueue_head(&md->wait); + init_waitqueue_head(&md->eventq); + + return md; +} + +/* + * The hardsect size for a mapped device is the largest hardsect size + * from the devices it maps onto. + */ +static int __find_hardsect_size(struct list_head *devices) +{ + int result = 512, size; + struct list_head *tmp; + + list_for_each (tmp, devices) { + struct dm_dev *dd = list_entry(tmp, struct dm_dev, list); + size = get_hardsect_size(dd->dev); + if (size > result) + result = size; + } + + return result; +} + +/* + * Bind a table to the device. + */ +static void event_callback(void *context) +{ + struct mapped_device *md = (struct mapped_device *) context; + + down_write(&md->lock); + md->event_nr++; + wake_up_interruptible(&md->eventq); + up_write(&md->lock); +} + +static int __bind(struct mapped_device *md, struct dm_table *t) +{ + unsigned int minor = minor(md->dev); + unsigned int major = major(md->dev); + md->map = t; + + /* in k */ + blk_size[major][minor] = dm_table_get_size(t) >> 1; + blksize_size[major][minor] = BLOCK_SIZE; + hardsect_size[major][minor] = + __find_hardsect_size(dm_table_get_devices(t)); + register_disk(NULL, md->dev, 1, &dm_blk_dops, blk_size[major][minor]); + + dm_table_event_callback(md->map, event_callback, md); + dm_table_get(t); + return 0; +} + +static void __unbind(struct mapped_device *md) +{ + unsigned int minor = minor(md->dev); + unsigned int major = major(md->dev); + + if (md->map) { + dm_table_event_callback(md->map, NULL, NULL); + dm_table_put(md->map); + md->map = NULL; + + } + + blk_size[major][minor] = 0; + blksize_size[major][minor] = 0; + hardsect_size[major][minor] = 0; +} + +/* + * Constructor for a new device. + */ +int dm_create(kdev_t dev, struct mapped_device **result) +{ + struct mapped_device *md; + + md = alloc_md(dev); + if (!md) + return -ENXIO; + + __unbind(md); /* Ensure zero device size */ + + *result = md; + return 0; +} + +void dm_get(struct mapped_device *md) +{ + atomic_inc(&md->holders); +} + +void dm_put(struct mapped_device *md) +{ + if (atomic_dec_and_test(&md->holders)) { + if (md->map) + dm_table_suspend_targets(md->map); + __unbind(md); + free_md(md); + } +} + +/* + * Requeue the deferred io by calling generic_make_request. + */ +static void flush_deferred_io(struct deferred_io *c) +{ + struct deferred_io *n; + + while (c) { + n = c->next; + generic_make_request(c->rw, c->bh); + free_deferred(c); + c = n; + } +} + +/* + * Swap in a new table (destroying old one). + */ +int dm_swap_table(struct mapped_device *md, struct dm_table *table) +{ + int r; + + down_write(&md->lock); + + /* + * The device must be suspended, or have no table bound yet. + */ + if (md->map && !test_bit(DMF_SUSPENDED, &md->flags)) { + up_write(&md->lock); + return -EPERM; + } + + __unbind(md); + r = __bind(md, table); + if (r) + return r; + + up_write(&md->lock); + return 0; +} + +/* + * We need to be able to change a mapping table under a mounted + * filesystem. For example we might want to move some data in + * the background. Before the table can be swapped with + * dm_bind_table, dm_suspend must be called to flush any in + * flight io and ensure that any further io gets deferred. + */ +int dm_suspend(struct mapped_device *md) +{ + int r = 0; + DECLARE_WAITQUEUE(wait, current); + + down_write(&md->lock); + + /* + * First we set the BLOCK_IO flag so no more ios will be + * mapped. + */ + if (test_bit(DMF_BLOCK_IO, &md->flags)) { + up_write(&md->lock); + return -EINVAL; + } + + set_bit(DMF_BLOCK_IO, &md->flags); + add_wait_queue(&md->wait, &wait); + up_write(&md->lock); + + /* + * Then we wait for the already mapped ios to + * complete. + */ + run_task_queue(&tq_disk); + while (1) { + set_current_state(TASK_INTERRUPTIBLE); + + if (!atomic_read(&md->pending) || signal_pending(current)) + break; + + schedule(); + } + set_current_state(TASK_RUNNING); + + down_write(&md->lock); + remove_wait_queue(&md->wait, &wait); + + /* did we flush everything ? */ + if (atomic_read(&md->pending)) { + clear_bit(DMF_BLOCK_IO, &md->flags); + r = -EINTR; + } else { + set_bit(DMF_SUSPENDED, &md->flags); + if (md->map) + dm_table_suspend_targets(md->map); + } + up_write(&md->lock); + + return r; +} + +int dm_resume(struct mapped_device *md) +{ + struct deferred_io *def; + + down_write(&md->lock); + if (!test_bit(DMF_SUSPENDED, &md->flags)) { + up_write(&md->lock); + return -EINVAL; + } + + if (md->map) + dm_table_resume_targets(md->map); + + clear_bit(DMF_SUSPENDED, &md->flags); + clear_bit(DMF_BLOCK_IO, &md->flags); + def = md->deferred; + md->deferred = NULL; + up_write(&md->lock); + + flush_deferred_io(def); + run_task_queue(&tq_disk); + + return 0; +} + +struct dm_table *dm_get_table(struct mapped_device *md) +{ + struct dm_table *t; + + down_read(&md->lock); + t = md->map; + if (t) + dm_table_get(t); + up_read(&md->lock); + + return t; +} + +/*----------------------------------------------------------------- + * Event notification. + *---------------------------------------------------------------*/ +uint32_t dm_get_event_nr(struct mapped_device *md) +{ + uint32_t r; + + down_read(&md->lock); + r = md->event_nr; + up_read(&md->lock); + + return r; +} + +int dm_add_wait_queue(struct mapped_device *md, wait_queue_t *wq, + uint32_t event_nr) +{ + down_write(&md->lock); + if (event_nr != md->event_nr) { + up_write(&md->lock); + return 1; + } + + add_wait_queue(&md->eventq, wq); + up_write(&md->lock); + + return 0; +} + +const char *dm_kdevname(kdev_t dev) +{ + static char buffer[32]; + sprintf(buffer, "%03d:%03d", MAJOR(dev), MINOR(dev)); + return buffer; +} + +void dm_remove_wait_queue(struct mapped_device *md, wait_queue_t *wq) +{ + down_write(&md->lock); + remove_wait_queue(&md->eventq, wq); + up_write(&md->lock); +} + +kdev_t dm_kdev(struct mapped_device *md) +{ + kdev_t dev; + + down_read(&md->lock); + dev = md->dev; + up_read(&md->lock); + + return dev; +} + +int dm_suspended(struct mapped_device *md) +{ + return test_bit(DMF_SUSPENDED, &md->flags); +} + +struct block_device_operations dm_blk_dops = { + .open = dm_blk_open, + .release = dm_blk_close, + .ioctl = dm_blk_ioctl, + .owner = THIS_MODULE +}; + +/* + * module hooks + */ +module_init(dm_init); +module_exit(dm_exit); + +MODULE_DESCRIPTION(DM_NAME " driver"); +MODULE_AUTHOR("Joe Thornber "); +MODULE_LICENSE("GPL"); + +EXPORT_SYMBOL(dm_kdevname); Index: drivers/md/dm.h =================================================================== RCS file: drivers/md/dm.h diff -N drivers/md/dm.h --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ b/drivers/md/dm.h 17 Apr 2004 00:42:43 -0000 1.1.1.2.8.1 @@ -0,0 +1,172 @@ +/* + * Internal header file for device mapper + * + * Copyright (C) 2001, 2002 Sistina Software + * + * This file is released under the LGPL. + */ + +#ifndef DM_INTERNAL_H +#define DM_INTERNAL_H + +#include +#include +#include +#include + +#define DM_NAME "device-mapper" +#define DMWARN(f, x...) printk(KERN_WARNING DM_NAME ": " f "\n" , ## x) +#define DMERR(f, x...) printk(KERN_ERR DM_NAME ": " f "\n" , ## x) +#define DMINFO(f, x...) printk(KERN_INFO DM_NAME ": " f "\n" , ## x) + +/* + * FIXME: I think this should be with the definition of sector_t + * in types.h. + */ +#ifdef CONFIG_LBD +#define SECTOR_FORMAT "%Lu" +#else +#define SECTOR_FORMAT "%lu" +#endif + +#define SECTOR_SHIFT 9 +#define SECTOR_SIZE (1 << SECTOR_SHIFT) + +extern struct block_device_operations dm_blk_dops; + +/* + * List of devices that a metadevice uses and should open/close. + */ +struct dm_dev { + struct list_head list; + + atomic_t count; + int mode; + kdev_t dev; + struct block_device *bdev; +}; + +struct dm_table; +struct mapped_device; + +/*----------------------------------------------------------------- + * Functions for manipulating a struct mapped_device. + * Drop the reference with dm_put when you finish with the object. + *---------------------------------------------------------------*/ +int dm_create(kdev_t dev, struct mapped_device **md); + +/* + * Reference counting for md. + */ +void dm_get(struct mapped_device *md); +void dm_put(struct mapped_device *md); + +/* + * A device can still be used while suspended, but I/O is deferred. + */ +int dm_suspend(struct mapped_device *md); +int dm_resume(struct mapped_device *md); + +/* + * The device must be suspended before calling this method. + */ +int dm_swap_table(struct mapped_device *md, struct dm_table *t); + +/* + * Drop a reference on the table when you've finished with the + * result. + */ +struct dm_table *dm_get_table(struct mapped_device *md); + +/* + * Event functions. + */ +uint32_t dm_get_event_nr(struct mapped_device *md); +int dm_add_wait_queue(struct mapped_device *md, wait_queue_t *wq, + uint32_t event_nr); +void dm_remove_wait_queue(struct mapped_device *md, wait_queue_t *wq); + +/* + * Info functions. + */ +kdev_t dm_kdev(struct mapped_device *md); +int dm_suspended(struct mapped_device *md); + +/*----------------------------------------------------------------- + * Functions for manipulating a table. Tables are also reference + * counted. + *---------------------------------------------------------------*/ +int dm_table_create(struct dm_table **result, int mode, unsigned num_targets); + +void dm_table_get(struct dm_table *t); +void dm_table_put(struct dm_table *t); + +int dm_table_add_target(struct dm_table *t, const char *type, + sector_t start, sector_t len, char *params); +int dm_table_complete(struct dm_table *t); +void dm_table_event_callback(struct dm_table *t, + void (*fn)(void *), void *context); +void dm_table_event(struct dm_table *t); +sector_t dm_table_get_size(struct dm_table *t); +struct dm_target *dm_table_get_target(struct dm_table *t, unsigned int index); +struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector); +unsigned int dm_table_get_num_targets(struct dm_table *t); +struct list_head *dm_table_get_devices(struct dm_table *t); +int dm_table_get_mode(struct dm_table *t); +void dm_table_suspend_targets(struct dm_table *t); +void dm_table_resume_targets(struct dm_table *t); + +/*----------------------------------------------------------------- + * A registry of target types. + *---------------------------------------------------------------*/ +int dm_target_init(void); +void dm_target_exit(void); +struct target_type *dm_get_target_type(const char *name); +void dm_put_target_type(struct target_type *t); + + +/*----------------------------------------------------------------- + * Useful inlines. + *---------------------------------------------------------------*/ +static inline int array_too_big(unsigned long fixed, unsigned long obj, + unsigned long num) +{ + return (num > (ULONG_MAX - fixed) / obj); +} + +/* + * ceiling(n / size) * size + */ +static inline unsigned long dm_round_up(unsigned long n, unsigned long size) +{ + unsigned long r = n % size; + return n + (r ? (size - r) : 0); +} + +/* + * Ceiling(n / size) + */ +static inline unsigned long dm_div_up(unsigned long n, unsigned long size) +{ + return dm_round_up(n, size) / size; +} + +const char *dm_kdevname(kdev_t dev); + +/* + * The device-mapper can be driven through one of two interfaces; + * ioctl or filesystem, depending which patch you have applied. + */ +int dm_interface_init(void); +void dm_interface_exit(void); + +/* + * Targets for linear and striped mappings + */ +int dm_linear_init(void); +void dm_linear_exit(void); + +int dm_stripe_init(void); +void dm_stripe_exit(void); + +#endif Index: fs/buffer.c =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/fs/buffer.c,v retrieving revision 1.1.1.34 retrieving revision 1.1.1.34.4.1 diff -u -r1.1.1.34 -r1.1.1.34.4.1 --- a/fs/buffer.c 18 Feb 2004 13:36:31 -0000 1.1.1.34 +++ b/fs/buffer.c 17 Apr 2004 00:42:43 -0000 1.1.1.34.4.1 @@ -799,6 +799,7 @@ bh->b_list = BUF_CLEAN; bh->b_end_io = handler; bh->b_private = private; + bh->b_journal_head = NULL; } void end_buffer_io_async(struct buffer_head * bh, int uptodate) Index: fs/jbd/journal.c =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/fs/jbd/journal.c,v retrieving revision 1.1.1.14 retrieving revision 1.1.1.14.4.1 diff -u -r1.1.1.14 -r1.1.1.14.4.1 --- a/fs/jbd/journal.c 14 Apr 2004 13:05:40 -0000 1.1.1.14 +++ b/fs/jbd/journal.c 17 Apr 2004 00:42:43 -0000 1.1.1.14.4.1 @@ -1804,9 +1804,9 @@ if (buffer_jbd(bh)) { /* Someone did it for us! */ - J_ASSERT_BH(bh, bh->b_private != NULL); + J_ASSERT_BH(bh, bh->b_journal_head != NULL); journal_free_journal_head(jh); - jh = bh->b_private; + jh = bh->b_journal_head; } else { /* * We actually don't need jh_splice_lock when @@ -1814,7 +1814,7 @@ */ spin_lock(&jh_splice_lock); set_bit(BH_JBD, &bh->b_state); - bh->b_private = jh; + bh->b_journal_head = jh; jh->b_bh = bh; atomic_inc(&bh->b_count); spin_unlock(&jh_splice_lock); @@ -1823,7 +1823,7 @@ } jh->b_jcount++; spin_unlock(&journal_datalist_lock); - return bh->b_private; + return bh->b_journal_head; } /* @@ -1856,7 +1856,7 @@ J_ASSERT_BH(bh, jh2bh(jh) == bh); BUFFER_TRACE(bh, "remove journal_head"); spin_lock(&jh_splice_lock); - bh->b_private = NULL; + bh->b_journal_head = NULL; jh->b_bh = NULL; /* debug, really */ clear_bit(BH_JBD, &bh->b_state); __brelse(bh); Index: include/linux/device-mapper.h =================================================================== RCS file: include/linux/device-mapper.h diff -N include/linux/device-mapper.h --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ b/include/linux/device-mapper.h 17 Apr 2004 00:42:44 -0000 1.1.1.1.10.1 @@ -0,0 +1,104 @@ +/* + * Copyright (C) 2001 Sistina Software (UK) Limited. + * + * This file is released under the LGPL. + */ + +#ifndef _LINUX_DEVICE_MAPPER_H +#define _LINUX_DEVICE_MAPPER_H + +typedef unsigned long sector_t; + +struct dm_target; +struct dm_table; +struct dm_dev; + +typedef enum { STATUSTYPE_INFO, STATUSTYPE_TABLE } status_type_t; + +union map_info { + void *ptr; + unsigned long long ll; +}; + +/* + * In the constructor the target parameter will already have the + * table, type, begin and len fields filled in. + */ +typedef int (*dm_ctr_fn) (struct dm_target * target, unsigned int argc, + char **argv); + +/* + * The destructor doesn't need to free the dm_target, just + * anything hidden ti->private. + */ +typedef void (*dm_dtr_fn) (struct dm_target * ti); + +/* + * The map function must return: + * < 0: error + * = 0: The target will handle the io by resubmitting it later + * > 0: simple remap complete + */ +typedef int (*dm_map_fn) (struct dm_target * ti, struct buffer_head * bh, + int rw, union map_info *map_context); + +/* + * Returns: + * < 0 : error (currently ignored) + * 0 : ended successfully + * 1 : for some reason the io has still not completed (eg, + * multipath target might want to requeue a failed io). + */ +typedef int (*dm_endio_fn) (struct dm_target * ti, + struct buffer_head * bh, int rw, int error, + union map_info *map_context); +typedef void (*dm_suspend_fn) (struct dm_target *ti); +typedef void (*dm_resume_fn) (struct dm_target *ti); +typedef int (*dm_status_fn) (struct dm_target * ti, status_type_t status_type, + char *result, unsigned int maxlen); + +void dm_error(const char *message); + +/* + * Constructors should call these functions to ensure destination devices + * are opened/closed correctly. + * FIXME: too many arguments. + */ +int dm_get_device(struct dm_target *ti, const char *path, sector_t start, + sector_t len, int mode, struct dm_dev **result); +void dm_put_device(struct dm_target *ti, struct dm_dev *d); + +/* + * Information about a target type + */ +struct target_type { + const char *name; + struct module *module; + dm_ctr_fn ctr; + dm_dtr_fn dtr; + dm_map_fn map; + dm_endio_fn end_io; + dm_suspend_fn suspend; + dm_resume_fn resume; + dm_status_fn status; +}; + +struct dm_target { + struct dm_table *table; + struct target_type *type; + + /* target limits */ + sector_t begin; + sector_t len; + + /* target specific data */ + void *private; + + /* Used to provide an error string from the ctr */ + char *error; +}; + +int dm_register_target(struct target_type *t); +int dm_unregister_target(struct target_type *t); + +#endif /* _LINUX_DEVICE_MAPPER_H */ Index: include/linux/dm-ioctl.h =================================================================== RCS file: include/linux/dm-ioctl.h diff -N include/linux/dm-ioctl.h --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ b/include/linux/dm-ioctl.h 17 Apr 2004 00:42:44 -0000 1.1.1.1.10.1 @@ -0,0 +1,237 @@ +/* + * Copyright (C) 2001 - 2003 Sistina Software (UK) Limited. + * + * This file is released under the LGPL. + */ + +#ifndef _LINUX_DM_IOCTL_H +#define _LINUX_DM_IOCTL_H + +#include + +#define DM_DIR "mapper" /* Slashes not supported */ +#define DM_MAX_TYPE_NAME 16 +#define DM_NAME_LEN 128 +#define DM_UUID_LEN 129 + +/* + * A traditional ioctl interface for the device mapper. + * + * Each device can have two tables associated with it, an + * 'active' table which is the one currently used by io passing + * through the device, and an 'inactive' one which is a table + * that is being prepared as a replacement for the 'active' one. + * + * DM_VERSION: + * Just get the version information for the ioctl interface. + * + * DM_REMOVE_ALL: + * Remove all dm devices, destroy all tables. Only really used + * for debug. + * + * DM_LIST_DEVICES: + * Get a list of all the dm device names. + * + * DM_DEV_CREATE: + * Create a new device, neither the 'active' or 'inactive' table + * slots will be filled. The device will be in suspended state + * after creation, however any io to the device will get errored + * since it will be out-of-bounds. + * + * DM_DEV_REMOVE: + * Remove a device, destroy any tables. + * + * DM_DEV_RENAME: + * Rename a device. + * + * DM_SUSPEND: + * This performs both suspend and resume, depending which flag is + * passed in. + * Suspend: This command will not return until all pending io to + * the device has completed. Further io will be deferred until + * the device is resumed. + * Resume: It is no longer an error to issue this command on an + * unsuspended device. If a table is present in the 'inactive' + * slot, it will be moved to the active slot, then the old table + * from the active slot will be _destroyed_. Finally the device + * is resumed. + * + * DM_DEV_STATUS: + * Retrieves the status for the table in the 'active' slot. + * + * DM_DEV_WAIT: + * Wait for a significant event to occur to the device. This + * could either be caused by an event triggered by one of the + * targets of the table in the 'active' slot, or a table change. + * + * DM_TABLE_LOAD: + * Load a table into the 'inactive' slot for the device. The + * device does _not_ need to be suspended prior to this command. + * + * DM_TABLE_CLEAR: + * Destroy any table in the 'inactive' slot (ie. abort). + * + * DM_TABLE_DEPS: + * Return a set of device dependencies for the 'active' table. + * + * DM_TABLE_STATUS: + * Return the targets status for the 'active' table. + */ + +/* + * All ioctl arguments consist of a single chunk of memory, with + * this structure at the start. If a uuid is specified any + * lookup (eg. for a DM_INFO) will be done on that, *not* the + * name. + */ +struct dm_ioctl { + /* + * The version number is made up of three parts: + * major - no backward or forward compatibility, + * minor - only backwards compatible, + * patch - both backwards and forwards compatible. + * + * All clients of the ioctl interface should fill in the + * version number of the interface that they were + * compiled with. + * + * All recognised ioctl commands (ie. those that don't + * return -ENOTTY) fill out this field, even if the + * command failed. + */ + uint32_t version[3]; /* in/out */ + uint32_t data_size; /* total size of data passed in + * including this struct */ + + uint32_t data_start; /* offset to start of data + * relative to start of this struct */ + + uint32_t target_count; /* in/out */ + int32_t open_count; /* out */ + uint32_t flags; /* in/out */ + uint32_t event_nr; /* in/out */ + uint32_t padding; + + uint64_t dev; /* in/out */ + + char name[DM_NAME_LEN]; /* device name */ + char uuid[DM_UUID_LEN]; /* unique identifier for + * the block device */ +}; + +/* + * Used to specify tables. These structures appear after the + * dm_ioctl. + */ +struct dm_target_spec { + uint64_t sector_start; + uint64_t length; + int32_t status; /* used when reading from kernel only */ + + /* + * Offset in bytes (from the start of this struct) to + * next target_spec. + */ + uint32_t next; + + char target_type[DM_MAX_TYPE_NAME]; + + /* + * Parameter string starts immediately after this object. + * Be careful to add padding after string to ensure correct + * alignment of subsequent dm_target_spec. + */ +}; + +/* + * Used to retrieve the target dependencies. + */ +struct dm_target_deps { + uint32_t count; /* Array size */ + uint32_t padding; /* unused */ + uint64_t dev[0]; /* out */ +}; + +/* + * Used to get a list of all dm devices. + */ +struct dm_name_list { + uint64_t dev; + uint32_t next; /* offset to the next record from + the _start_ of this */ + char name[0]; +}; + +/* + * If you change this make sure you make the corresponding change + * to dm-ioctl.c:lookup_ioctl() + */ +enum { + /* Top level cmds */ + DM_VERSION_CMD = 0, + DM_REMOVE_ALL_CMD, + DM_LIST_DEVICES_CMD, + + /* device level cmds */ + DM_DEV_CREATE_CMD, + DM_DEV_REMOVE_CMD, + DM_DEV_RENAME_CMD, + DM_DEV_SUSPEND_CMD, + DM_DEV_STATUS_CMD, + DM_DEV_WAIT_CMD, + + /* Table level cmds */ + DM_TABLE_LOAD_CMD, + DM_TABLE_CLEAR_CMD, + DM_TABLE_DEPS_CMD, + DM_TABLE_STATUS_CMD, +}; + +#define DM_IOCTL 0xfd + +#define DM_VERSION _IOWR(DM_IOCTL, DM_VERSION_CMD, struct dm_ioctl) +#define DM_REMOVE_ALL _IOWR(DM_IOCTL, DM_REMOVE_ALL_CMD, struct dm_ioctl) +#define DM_LIST_DEVICES _IOWR(DM_IOCTL, DM_LIST_DEVICES_CMD, struct dm_ioctl) + +#define DM_DEV_CREATE _IOWR(DM_IOCTL, DM_DEV_CREATE_CMD, struct dm_ioctl) +#define DM_DEV_REMOVE _IOWR(DM_IOCTL, DM_DEV_REMOVE_CMD, struct dm_ioctl) +#define DM_DEV_RENAME _IOWR(DM_IOCTL, DM_DEV_RENAME_CMD, struct dm_ioctl) +#define DM_DEV_SUSPEND _IOWR(DM_IOCTL, DM_DEV_SUSPEND_CMD, struct dm_ioctl) +#define DM_DEV_STATUS _IOWR(DM_IOCTL, DM_DEV_STATUS_CMD, struct dm_ioctl) +#define DM_DEV_WAIT _IOWR(DM_IOCTL, DM_DEV_WAIT_CMD, struct dm_ioctl) + +#define DM_TABLE_LOAD _IOWR(DM_IOCTL, DM_TABLE_LOAD_CMD, struct dm_ioctl) +#define DM_TABLE_CLEAR _IOWR(DM_IOCTL, DM_TABLE_CLEAR_CMD, struct dm_ioctl) +#define DM_TABLE_DEPS _IOWR(DM_IOCTL, DM_TABLE_DEPS_CMD, struct dm_ioctl) +#define DM_TABLE_STATUS _IOWR(DM_IOCTL, DM_TABLE_STATUS_CMD, struct dm_ioctl) + +#define DM_VERSION_MAJOR 4 +#define DM_VERSION_MINOR 0 +#define DM_VERSION_PATCHLEVEL 5 +#define DM_VERSION_EXTRA "-ioctl (2003-11-18)" + +/* Status bits */ +#define DM_READONLY_FLAG (1 << 0) /* In/Out */ +#define DM_SUSPEND_FLAG (1 << 1) /* In/Out */ +#define DM_PERSISTENT_DEV_FLAG (1 << 3) /* In */ + +/* + * Flag passed into ioctl STATUS command to get table information + * rather than current status. + */ +#define DM_STATUS_TABLE_FLAG (1 << 4) /* In */ + +/* + * Flags that indicate whether a table is present in either of + * the two table slots that a device has. + */ +#define DM_ACTIVE_PRESENT_FLAG (1 << 5) /* Out */ +#define DM_INACTIVE_PRESENT_FLAG (1 << 6) /* Out */ + +/* + * Indicates that the buffer passed in wasn't big enough for the + * results. + */ +#define DM_BUFFER_FULL_FLAG (1 << 8) /* Out */ + +#endif /* _LINUX_DM_IOCTL_H */ Index: include/linux/fs.h =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/include/linux/fs.h,v retrieving revision 1.1.1.38 retrieving revision 1.1.1.38.4.1 diff -u -r1.1.1.38 -r1.1.1.38.4.1 --- a/include/linux/fs.h 18 Feb 2004 13:36:32 -0000 1.1.1.38 +++ b/include/linux/fs.h 17 Apr 2004 00:42:44 -0000 1.1.1.38.4.1 @@ -266,7 +266,7 @@ struct page *b_page; /* the page this bh is mapped to */ void (*b_end_io)(struct buffer_head *bh, int uptodate); /* I/O completion */ void *b_private; /* reserved for b_end_io */ - + void *b_journal_head; /* ext3 journal_heads */ unsigned long b_rsector; /* Real buffer location on disk */ wait_queue_head_t b_wait; Index: include/linux/jbd.h =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/include/linux/jbd.h,v retrieving revision 1.1.1.12 retrieving revision 1.1.1.12.4.1 diff -u -r1.1.1.12 -r1.1.1.12.4.1 --- a/include/linux/jbd.h 13 Jun 2003 14:51:38 -0000 1.1.1.12 +++ b/include/linux/jbd.h 17 Apr 2004 00:42:44 -0000 1.1.1.12.4.1 @@ -311,7 +311,7 @@ static inline struct journal_head *bh2jh(struct buffer_head *bh) { - return bh->b_private; + return bh->b_journal_head; } #define HAVE_JOURNAL_CALLBACK_STATUS Index: include/linux/mempool.h =================================================================== RCS file: include/linux/mempool.h diff -N include/linux/mempool.h --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ b/include/linux/mempool.h 17 Apr 2004 00:42:44 -0000 1.1.1.1.10.1 @@ -0,0 +1,31 @@ +/* + * memory buffer pool support + */ +#ifndef _LINUX_MEMPOOL_H +#define _LINUX_MEMPOOL_H + +#include +#include + +struct mempool_s; +typedef struct mempool_s mempool_t; + +typedef void * (mempool_alloc_t)(int gfp_mask, void *pool_data); +typedef void (mempool_free_t)(void *element, void *pool_data); + +extern mempool_t * mempool_create(int min_nr, mempool_alloc_t *alloc_fn, + mempool_free_t *free_fn, void *pool_data); +extern int mempool_resize(mempool_t *pool, int new_min_nr, int gfp_mask); +extern void mempool_destroy(mempool_t *pool); +extern void * mempool_alloc(mempool_t *pool, int gfp_mask); +extern void mempool_free(void *element, mempool_t *pool); + +/* + * A mempool_alloc_t and mempool_free_t that get the memory from + * a slab that is passed in through pool_data. + */ +void *mempool_alloc_slab(int gfp_mask, void *pool_data); +void mempool_free_slab(void *element, void *pool_data); + + +#endif /* _LINUX_MEMPOOL_H */ Index: include/linux/vmalloc.h =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/include/linux/vmalloc.h,v retrieving revision 1.1.1.11 retrieving revision 1.1.1.11.4.1 diff -u -r1.1.1.11 -r1.1.1.11.4.1 --- a/include/linux/vmalloc.h 25 Aug 2003 11:44:44 -0000 1.1.1.11 +++ b/include/linux/vmalloc.h 17 Apr 2004 00:42:44 -0000 1.1.1.11.4.1 @@ -29,6 +29,7 @@ extern void vmfree_area_pages(unsigned long address, unsigned long size); extern int vmalloc_area_pages(unsigned long address, unsigned long size, int gfp_mask, pgprot_t prot); +extern void *vcalloc(unsigned long nmemb, unsigned long elem_size); /* * Allocate any pages Index: kernel/ksyms.c =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/kernel/ksyms.c,v retrieving revision 1.1.1.34 retrieving revision 1.1.1.34.4.1 diff -u -r1.1.1.34 -r1.1.1.34.4.1 --- a/kernel/ksyms.c 18 Feb 2004 13:36:32 -0000 1.1.1.34 +++ b/kernel/ksyms.c 17 Apr 2004 00:42:44 -0000 1.1.1.34.4.1 @@ -115,6 +115,7 @@ EXPORT_SYMBOL(__vmalloc); EXPORT_SYMBOL(vmap); EXPORT_SYMBOL(vmalloc_to_page); +EXPORT_SYMBOL(vcalloc); EXPORT_SYMBOL(mem_map); EXPORT_SYMBOL(remap_page_range); EXPORT_SYMBOL(max_mapnr); Index: mm/Makefile =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/mm/Makefile,v retrieving revision 1.1.1.12 retrieving revision 1.1.1.12.4.1 diff -u -r1.1.1.12 -r1.1.1.12.4.1 --- a/mm/Makefile 3 Aug 2002 00:39:46 -0000 1.1.1.12 +++ b/mm/Makefile 17 Apr 2004 00:42:44 -0000 1.1.1.12.4.1 @@ -9,12 +9,12 @@ O_TARGET := mm.o -export-objs := shmem.o filemap.o memory.o page_alloc.o +export-objs := shmem.o filemap.o memory.o page_alloc.o mempool.o obj-y := memory.o mmap.o filemap.o mprotect.o mlock.o mremap.o \ vmalloc.o slab.o bootmem.o swap.o vmscan.o page_io.o \ page_alloc.o swap_state.o swapfile.o numa.o oom_kill.o \ - shmem.o + shmem.o mempool.o obj-$(CONFIG_HIGHMEM) += highmem.o Index: mm/mempool.c =================================================================== RCS file: mm/mempool.c diff -N mm/mempool.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ b/mm/mempool.c 17 Apr 2004 00:42:44 -0000 1.1.1.1.10.1 @@ -0,0 +1,299 @@ +/* + * linux/mm/mempool.c + * + * memory buffer pool support. Such pools are mostly used + * for guaranteed, deadlock-free memory allocations during + * extreme VM load. + * + * started by Ingo Molnar, Copyright (C) 2001 + */ + +#include +#include +#include +#include + +struct mempool_s { + spinlock_t lock; + int min_nr; /* nr of elements at *elements */ + int curr_nr; /* Current nr of elements at *elements */ + void **elements; + + void *pool_data; + mempool_alloc_t *alloc; + mempool_free_t *free; + wait_queue_head_t wait; +}; + +static void add_element(mempool_t *pool, void *element) +{ + BUG_ON(pool->curr_nr >= pool->min_nr); + pool->elements[pool->curr_nr++] = element; +} + +static void *remove_element(mempool_t *pool) +{ + BUG_ON(pool->curr_nr <= 0); + return pool->elements[--pool->curr_nr]; +} + +static void free_pool(mempool_t *pool) +{ + while (pool->curr_nr) { + void *element = remove_element(pool); + pool->free(element, pool->pool_data); + } + kfree(pool->elements); + kfree(pool); +} + +/** + * mempool_create - create a memory pool + * @min_nr: the minimum number of elements guaranteed to be + * allocated for this pool. + * @alloc_fn: user-defined element-allocation function. + * @free_fn: user-defined element-freeing function. + * @pool_data: optional private data available to the user-defined functions. + * + * this function creates and allocates a guaranteed size, preallocated + * memory pool. The pool can be used from the mempool_alloc and mempool_free + * functions. This function might sleep. Both the alloc_fn() and the free_fn() + * functions might sleep - as long as the mempool_alloc function is not called + * from IRQ contexts. + */ +mempool_t * mempool_create(int min_nr, mempool_alloc_t *alloc_fn, + mempool_free_t *free_fn, void *pool_data) +{ + mempool_t *pool; + + pool = kmalloc(sizeof(*pool), GFP_KERNEL); + if (!pool) + return NULL; + memset(pool, 0, sizeof(*pool)); + pool->elements = kmalloc(min_nr * sizeof(void *), GFP_KERNEL); + if (!pool->elements) { + kfree(pool); + return NULL; + } + spin_lock_init(&pool->lock); + pool->min_nr = min_nr; + pool->pool_data = pool_data; + init_waitqueue_head(&pool->wait); + pool->alloc = alloc_fn; + pool->free = free_fn; + + /* + * First pre-allocate the guaranteed number of buffers. + */ + while (pool->curr_nr < pool->min_nr) { + void *element; + + element = pool->alloc(GFP_KERNEL, pool->pool_data); + if (unlikely(!element)) { + free_pool(pool); + return NULL; + } + add_element(pool, element); + } + return pool; +} + +/** + * mempool_resize - resize an existing memory pool + * @pool: pointer to the memory pool which was allocated via + * mempool_create(). + * @new_min_nr: the new minimum number of elements guaranteed to be + * allocated for this pool. + * @gfp_mask: the usual allocation bitmask. + * + * This function shrinks/grows the pool. In the case of growing, + * it cannot be guaranteed that the pool will be grown to the new + * size immediately, but new mempool_free() calls will refill it. + * + * Note, the caller must guarantee that no mempool_destroy is called + * while this function is running. mempool_alloc() & mempool_free() + * might be called (eg. from IRQ contexts) while this function executes. + */ +int mempool_resize(mempool_t *pool, int new_min_nr, int gfp_mask) +{ + void *element; + void **new_elements; + unsigned long flags; + + BUG_ON(new_min_nr <= 0); + + spin_lock_irqsave(&pool->lock, flags); + if (new_min_nr < pool->min_nr) { + while (pool->curr_nr > new_min_nr) { + element = remove_element(pool); + spin_unlock_irqrestore(&pool->lock, flags); + pool->free(element, pool->pool_data); + spin_lock_irqsave(&pool->lock, flags); + } + pool->min_nr = new_min_nr; + goto out_unlock; + } + spin_unlock_irqrestore(&pool->lock, flags); + + /* Grow the pool */ + new_elements = kmalloc(new_min_nr * sizeof(*new_elements), gfp_mask); + if (!new_elements) + return -ENOMEM; + + spin_lock_irqsave(&pool->lock, flags); + memcpy(new_elements, pool->elements, + pool->curr_nr * sizeof(*new_elements)); + kfree(pool->elements); + pool->elements = new_elements; + pool->min_nr = new_min_nr; + + while (pool->curr_nr < pool->min_nr) { + spin_unlock_irqrestore(&pool->lock, flags); + element = pool->alloc(gfp_mask, pool->pool_data); + if (!element) + goto out; + spin_lock_irqsave(&pool->lock, flags); + if (pool->curr_nr < pool->min_nr) + add_element(pool, element); + else + kfree(element); /* Raced */ + } +out_unlock: + spin_unlock_irqrestore(&pool->lock, flags); +out: + return 0; +} + +/** + * mempool_destroy - deallocate a memory pool + * @pool: pointer to the memory pool which was allocated via + * mempool_create(). + * + * this function only sleeps if the free_fn() function sleeps. The caller + * has to guarantee that all elements have been returned to the pool (ie: + * freed) prior to calling mempool_destroy(). + */ +void mempool_destroy(mempool_t *pool) +{ + if (pool->curr_nr != pool->min_nr) + BUG(); /* There were outstanding elements */ + free_pool(pool); +} + +/** + * mempool_alloc - allocate an element from a specific memory pool + * @pool: pointer to the memory pool which was allocated via + * mempool_create(). + * @gfp_mask: the usual allocation bitmask. + * + * this function only sleeps if the alloc_fn function sleeps or + * returns NULL. Note that due to preallocation, this function + * *never* fails when called from process contexts. (it might + * fail if called from an IRQ context.) + */ +void * mempool_alloc(mempool_t *pool, int gfp_mask) +{ + void *element; + unsigned long flags; + int curr_nr; + DECLARE_WAITQUEUE(wait, current); + int gfp_nowait = gfp_mask & ~(__GFP_WAIT | __GFP_IO); + +repeat_alloc: + element = pool->alloc(gfp_nowait, pool->pool_data); + if (likely(element != NULL)) + return element; + + /* + * If the pool is less than 50% full then try harder + * to allocate an element: + */ + if ((gfp_mask != gfp_nowait) && (pool->curr_nr <= pool->min_nr/2)) { + element = pool->alloc(gfp_mask, pool->pool_data); + if (likely(element != NULL)) + return element; + } + + /* + * Kick the VM at this point. + */ + wakeup_bdflush(); + + spin_lock_irqsave(&pool->lock, flags); + if (likely(pool->curr_nr)) { + element = remove_element(pool); + spin_unlock_irqrestore(&pool->lock, flags); + return element; + } + spin_unlock_irqrestore(&pool->lock, flags); + + /* We must not sleep in the GFP_ATOMIC case */ + if (gfp_mask == gfp_nowait) + return NULL; + + run_task_queue(&tq_disk); + + add_wait_queue_exclusive(&pool->wait, &wait); + set_task_state(current, TASK_UNINTERRUPTIBLE); + + spin_lock_irqsave(&pool->lock, flags); + curr_nr = pool->curr_nr; + spin_unlock_irqrestore(&pool->lock, flags); + + if (!curr_nr) + schedule(); + + current->state = TASK_RUNNING; + remove_wait_queue(&pool->wait, &wait); + + goto repeat_alloc; +} + +/** + * mempool_free - return an element to the pool. + * @element: pool element pointer. + * @pool: pointer to the memory pool which was allocated via + * mempool_create(). + * + * this function only sleeps if the free_fn() function sleeps. + */ +void mempool_free(void *element, mempool_t *pool) +{ + unsigned long flags; + + if (pool->curr_nr < pool->min_nr) { + spin_lock_irqsave(&pool->lock, flags); + if (pool->curr_nr < pool->min_nr) { + add_element(pool, element); + spin_unlock_irqrestore(&pool->lock, flags); + wake_up(&pool->wait); + return; + } + spin_unlock_irqrestore(&pool->lock, flags); + } + pool->free(element, pool->pool_data); +} + +/* + * A commonly used alloc and free fn. + */ +void *mempool_alloc_slab(int gfp_mask, void *pool_data) +{ + kmem_cache_t *mem = (kmem_cache_t *) pool_data; + return kmem_cache_alloc(mem, gfp_mask); +} + +void mempool_free_slab(void *element, void *pool_data) +{ + kmem_cache_t *mem = (kmem_cache_t *) pool_data; + kmem_cache_free(mem, element); +} + + +EXPORT_SYMBOL(mempool_create); +EXPORT_SYMBOL(mempool_resize); +EXPORT_SYMBOL(mempool_destroy); +EXPORT_SYMBOL(mempool_alloc); +EXPORT_SYMBOL(mempool_free); +EXPORT_SYMBOL(mempool_alloc_slab); +EXPORT_SYMBOL(mempool_free_slab); Index: mm/vmalloc.c =================================================================== RCS file: /home/gondolin/herbert/src/CVS/debian/kernel-source-2.4/mm/vmalloc.c,v retrieving revision 1.1.1.20 retrieving revision 1.1.1.20.4.1 diff -u -r1.1.1.20 -r1.1.1.20.4.1 --- a/mm/vmalloc.c 14 Apr 2004 13:05:41 -0000 1.1.1.20 +++ b/mm/vmalloc.c 17 Apr 2004 00:42:44 -0000 1.1.1.20.4.1 @@ -382,3 +382,22 @@ read_unlock(&vmlist_lock); return buf - buf_start; } + +void *vcalloc(unsigned long nmemb, unsigned long elem_size) +{ + unsigned long size; + void *addr; + + /* + * Check that we're not going to overflow. + */ + if (nmemb > (ULONG_MAX / elem_size)) + return NULL; + + size = nmemb * elem_size; + addr = vmalloc(size); + if (addr) + memset(addr, 0, size); + + return addr; +}