From de7c9a578187a2d417bc894b5f18d34519481bb5 Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Mon, 24 Sep 2018 14:51:46 +0100 Subject: [PATCH] [thin_journal_check] Checks the journal of block manager activity. You need to apply doc/bm-journal.patch to create the journal. thin_journal_check confirms that, had the machine crashed at any time during the test run, no metadata corruption would have occurred. --- bin/thin_journal_check | 1 + doc/bm-journal.patch | 1505 +++++++++++++++++++++++ thin-provisioning/commands.cc | 1 + thin-provisioning/thin_journal.cc | 51 +- thin-provisioning/thin_journal.h | 23 +- thin-provisioning/thin_journal_check.cc | 274 +++-- 6 files changed, 1761 insertions(+), 94 deletions(-) create mode 120000 bin/thin_journal_check create mode 100644 doc/bm-journal.patch diff --git a/bin/thin_journal_check b/bin/thin_journal_check new file mode 120000 index 0000000..84c01e7 --- /dev/null +++ b/bin/thin_journal_check @@ -0,0 +1 @@ +pdata_tools \ No newline at end of file diff --git a/doc/bm-journal.patch b/doc/bm-journal.patch new file mode 100644 index 0000000..792b0f3 --- /dev/null +++ b/doc/bm-journal.patch @@ -0,0 +1,1505 @@ +commit 6cb3772bdb92399319eb463e658ce62b692c669a +Author: Joe Thornber +Date: Mon Sep 24 14:48:16 2018 +0100 + + [bm journal] Journalling version of block manager. + + Can be used to confirm we're crash proof with the thin_journal_check tool. + +diff --git a/drivers/md/dm-cache-metadata.c b/drivers/md/dm-cache-metadata.c +index 69dddeab124c..f7b11f270846 100644 +--- a/drivers/md/dm-cache-metadata.c ++++ b/drivers/md/dm-cache-metadata.c +@@ -106,6 +106,8 @@ struct dm_cache_metadata { + + unsigned version; + struct block_device *bdev; ++ struct block_device *journal_dev; ++ + struct dm_block_manager *bm; + struct dm_space_map *metadata_sm; + struct dm_transaction_manager *tm; +@@ -281,7 +283,7 @@ static int __superblock_all_zeroes(struct dm_block_manager *bm, bool *result) + } + } + +- dm_bm_unlock(b); ++ dm_bm_unlock(bm, b); + + return 0; + } +@@ -504,12 +506,12 @@ static int __open_metadata(struct dm_cache_metadata *cmd) + dm_disk_bitset_init(cmd->tm, &cmd->discard_info); + sb_flags = le32_to_cpu(disk_super->flags); + cmd->clean_when_opened = test_bit(CLEAN_SHUTDOWN, &sb_flags); +- dm_bm_unlock(sblock); ++ dm_bm_unlock(cmd->bm, sblock); + + return 0; + + bad: +- dm_bm_unlock(sblock); ++ dm_bm_unlock(cmd->bm, sblock); + return r; + } + +@@ -533,8 +535,9 @@ static int __create_persistent_data_objects(struct dm_cache_metadata *cmd, + bool may_format_device) + { + int r; +- cmd->bm = dm_block_manager_create(cmd->bdev, DM_CACHE_METADATA_BLOCK_SIZE << SECTOR_SHIFT, +- CACHE_MAX_CONCURRENT_LOCKS); ++ cmd->bm = dm_block_manager_create_with_journal(cmd->bdev, ++ DM_CACHE_METADATA_BLOCK_SIZE << SECTOR_SHIFT, ++ CACHE_MAX_CONCURRENT_LOCKS, cmd->journal_dev); + if (IS_ERR(cmd->bm)) { + DMERR("could not create block manager"); + return PTR_ERR(cmd->bm); +@@ -621,9 +624,8 @@ static int __begin_transaction_flags(struct dm_cache_metadata *cmd, + disk_super = dm_block_data(sblock); + update_flags(disk_super, mutator); + read_superblock_fields(cmd, disk_super); +- dm_bm_unlock(sblock); + +- return dm_bm_flush(cmd->bm); ++ return dm_bm_flush_and_unlock(cmd->bm, sblock); + } + + static int __begin_transaction(struct dm_cache_metadata *cmd) +@@ -642,7 +644,7 @@ static int __begin_transaction(struct dm_cache_metadata *cmd) + + disk_super = dm_block_data(sblock); + read_superblock_fields(cmd, disk_super); +- dm_bm_unlock(sblock); ++ dm_bm_unlock(cmd->bm, sblock); + + return 
0; + } +@@ -1775,7 +1777,7 @@ int dm_cache_metadata_set_needs_check(struct dm_cache_metadata *cmd) + disk_super = dm_block_data(sblock); + disk_super->flags = cpu_to_le32(cmd->flags); + +- dm_bm_unlock(sblock); ++ dm_bm_unlock(cmd->bm, sblock); + + out: + WRITE_UNLOCK(cmd); +diff --git a/drivers/md/dm-era-target.c b/drivers/md/dm-era-target.c +index 8e48920a3ffa..4aad158a58e8 100644 +--- a/drivers/md/dm-era-target.c ++++ b/drivers/md/dm-era-target.c +@@ -342,7 +342,7 @@ static int superblock_all_zeroes(struct dm_block_manager *bm, bool *result) + } + } + +- dm_bm_unlock(b); ++ dm_bm_unlock(bm, b); + + return 0; + } +@@ -583,12 +583,12 @@ static int open_metadata(struct era_metadata *md) + md->metadata_snap = le64_to_cpu(disk->metadata_snap); + md->archived_writesets = true; + +- dm_bm_unlock(sblock); ++ dm_bm_unlock(md->bm, sblock); + + return 0; + + bad: +- dm_bm_unlock(sblock); ++ dm_bm_unlock(md->bm, sblock); + return r; + } + +diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c +index 72142021b5c9..8420b67b0e51 100644 +--- a/drivers/md/dm-thin-metadata.c ++++ b/drivers/md/dm-thin-metadata.c +@@ -146,6 +146,8 @@ struct dm_pool_metadata { + struct hlist_node hash; + + struct block_device *bdev; ++ struct block_device *journal_dev; ++ + struct dm_block_manager *bm; + struct dm_space_map *metadata_sm; + struct dm_space_map *data_sm; +@@ -399,7 +401,7 @@ static int __superblock_all_zeroes(struct dm_block_manager *bm, int *result) + } + } + +- dm_bm_unlock(b); ++ dm_bm_unlock(bm, b); + + return 0; + } +@@ -655,7 +657,7 @@ static int __open_metadata(struct dm_pool_metadata *pmd) + } + + __setup_btree_details(pmd); +- dm_bm_unlock(sblock); ++ dm_bm_unlock(pmd->bm, sblock); + + return 0; + +@@ -665,7 +667,7 @@ static int __open_metadata(struct dm_pool_metadata *pmd) + dm_tm_destroy(pmd->tm); + dm_sm_destroy(pmd->metadata_sm); + bad_unlock_sblock: +- dm_bm_unlock(sblock); ++ dm_bm_unlock(pmd->bm, sblock); + + return r; + } +@@ -688,8 +690,18 @@ static int __create_persistent_data_objects(struct dm_pool_metadata *pmd, bool f + { + int r; + +- pmd->bm = dm_block_manager_create(pmd->bdev, THIN_METADATA_BLOCK_SIZE << SECTOR_SHIFT, +- THIN_MAX_CONCURRENT_LOCKS); ++ pr_alert("pmd->journal_dev = %p\n", pmd->journal_dev); ++ if (pmd->journal_dev) ++ pmd->bm = dm_block_manager_create_with_journal( ++ pmd->bdev, ++ THIN_METADATA_BLOCK_SIZE << SECTOR_SHIFT, ++ THIN_MAX_CONCURRENT_LOCKS, ++ pmd->journal_dev); ++ else ++ pmd->bm = dm_block_manager_create(pmd->bdev, ++ THIN_METADATA_BLOCK_SIZE << SECTOR_SHIFT, ++ THIN_MAX_CONCURRENT_LOCKS); ++ + if (IS_ERR(pmd->bm)) { + DMERR("could not create block manager"); + return PTR_ERR(pmd->bm); +@@ -734,7 +746,7 @@ static int __begin_transaction(struct dm_pool_metadata *pmd) + pmd->flags = le32_to_cpu(disk_super->flags); + pmd->data_block_size = le32_to_cpu(disk_super->data_block_size); + +- dm_bm_unlock(sblock); ++ dm_bm_unlock(pmd->bm, sblock); + return 0; + } + +@@ -818,7 +830,8 @@ static int __commit_transaction(struct dm_pool_metadata *pmd) + + struct dm_pool_metadata *dm_pool_metadata_open(struct block_device *bdev, + sector_t data_block_size, +- bool format_device) ++ bool format_device, ++ struct block_device *journal) + { + int r; + struct dm_pool_metadata *pmd; +@@ -834,6 +847,7 @@ struct dm_pool_metadata *dm_pool_metadata_open(struct block_device *bdev, + INIT_LIST_HEAD(&pmd->thin_devices); + pmd->fail_io = false; + pmd->bdev = bdev; ++ pmd->journal_dev = journal; + pmd->data_block_size = data_block_size; + + r = 
__create_persistent_data_objects(pmd, format_device); +@@ -1253,7 +1267,7 @@ static int __reserve_metadata_snap(struct dm_pool_metadata *pmd) + + disk_super = dm_block_data(sblock); + disk_super->held_root = cpu_to_le64(held_root); +- dm_bm_unlock(sblock); ++ dm_bm_unlock(pmd->bm, sblock); + return 0; + } + +@@ -1284,7 +1298,7 @@ static int __release_metadata_snap(struct dm_pool_metadata *pmd) + held_root = le64_to_cpu(disk_super->held_root); + disk_super->held_root = cpu_to_le64(0); + +- dm_bm_unlock(sblock); ++ dm_bm_unlock(pmd->bm, sblock); + + if (!held_root) { + DMWARN("No pool metadata snapshot found: nothing to release."); +@@ -1332,7 +1346,7 @@ static int __get_metadata_snap(struct dm_pool_metadata *pmd, + disk_super = dm_block_data(sblock); + *result = le64_to_cpu(disk_super->held_root); + +- dm_bm_unlock(sblock); ++ dm_bm_unlock(pmd->bm, sblock); + + return 0; + } +@@ -1790,6 +1804,10 @@ int dm_pool_abort_metadata(struct dm_pool_metadata *pmd) + + __set_abort_with_changes_flags(pmd); + __destroy_persistent_data_objects(pmd); ++ ++ // FIXME: hack to avoid writing code for reopening the journal ++ BUG(); ++ + r = __create_persistent_data_objects(pmd, false); + if (r) + pmd->fail_io = true; +@@ -1985,7 +2003,7 @@ int dm_pool_metadata_set_needs_check(struct dm_pool_metadata *pmd) + disk_super = dm_block_data(sblock); + disk_super->flags = cpu_to_le32(pmd->flags); + +- dm_bm_unlock(sblock); ++ dm_bm_unlock(pmd->bm, sblock); + out: + up_write(&pmd->root_lock); + return r; +diff --git a/drivers/md/dm-thin-metadata.h b/drivers/md/dm-thin-metadata.h +index 35e954ea20a9..6bd01c74e925 100644 +--- a/drivers/md/dm-thin-metadata.h ++++ b/drivers/md/dm-thin-metadata.h +@@ -43,7 +43,8 @@ typedef uint64_t dm_thin_id; + */ + struct dm_pool_metadata *dm_pool_metadata_open(struct block_device *bdev, + sector_t data_block_size, +- bool format_device); ++ bool format_device, ++ struct block_device *journal); + + int dm_pool_metadata_close(struct dm_pool_metadata *pmd); + +diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c +index 7bd60a150f8f..66f03447a05e 100644 +--- a/drivers/md/dm-thin.c ++++ b/drivers/md/dm-thin.c +@@ -8,6 +8,7 @@ + #include "dm-bio-prison-v1.h" + #include "dm.h" + ++#include <linux/ctype.h> + #include + #include + #include +@@ -34,6 +35,10 @@ + + static unsigned no_space_timeout_secs = NO_SPACE_TIMEOUT_SECS; + ++static char *journal_name = NULL; ++module_param_named(block_manager_journal, journal_name, charp, S_IRUGO | S_IWUSR); ++MODULE_PARM_DESC(block_manager_journal, "Device to receive the block manager journal (used for debugging)"); ++ + DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(snapshot_copy_throttle, + "A percentage of time allocated for copy on write"); + +@@ -287,6 +292,7 @@ struct pool_c { + struct pool *pool; + struct dm_dev *data_dev; + struct dm_dev *metadata_dev; ++ struct dm_dev *journal_dev; + struct dm_target_callbacks callbacks; + + dm_block_t low_water_blocks; +@@ -2839,6 +2845,7 @@ static struct kmem_cache *_new_mapping_cache; + + static struct pool *pool_create(struct mapped_device *pool_md, + struct block_device *metadata_dev, ++ struct block_device *journal_dev, + unsigned long block_size, + int read_only, char **error) + { +@@ -2848,7 +2855,8 @@ static struct pool *pool_create(struct mapped_device *pool_md, + struct dm_pool_metadata *pmd; + bool format_device = read_only ? 
false : true; + +- pmd = dm_pool_metadata_open(metadata_dev, block_size, format_device); ++ pr_alert("passing journal_dev = %p\n", journal_dev); ++ pmd = dm_pool_metadata_open(metadata_dev, block_size, format_device, journal_dev); + if (IS_ERR(pmd)) { + *error = "Error creating metadata object"; + return (struct pool *)pmd; +@@ -2986,6 +2994,7 @@ static void __pool_dec(struct pool *pool) + + static struct pool *__pool_find(struct mapped_device *pool_md, + struct block_device *metadata_dev, ++ struct block_device *journal_dev, + unsigned long block_size, int read_only, + char **error, int *created) + { +@@ -3008,7 +3017,7 @@ static struct pool *__pool_find(struct mapped_device *pool_md, + __pool_inc(pool); + + } else { +- pool = pool_create(pool_md, metadata_dev, block_size, read_only, error); ++ pool = pool_create(pool_md, metadata_dev, journal_dev, block_size, read_only, error); + *created = 1; + } + } +@@ -3029,6 +3038,7 @@ static void pool_dtr(struct dm_target *ti) + __pool_dec(pt->pool); + dm_put_device(ti, pt->metadata_dev); + dm_put_device(ti, pt->data_dev); ++ dm_put_device(ti, pt->journal_dev); + kfree(pt); + + mutex_unlock(&dm_thin_pool_table.mutex); +@@ -3145,6 +3155,14 @@ static dm_block_t calc_metadata_threshold(struct pool_c *pt) + return min((dm_block_t)1024ULL /* 4M */, quarter); + } + ++static void normalise_journal_name_(const char *name, char *buffer, size_t len) ++{ ++ while (*name && !isspace(*name) && --len) ++ *buffer++ = *name++; ++ ++ *buffer = '\0'; ++} ++ + /* + * thin-pool + * +@@ -3169,6 +3187,7 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv) + unsigned long block_size; + dm_block_t low_water_blocks; + struct dm_dev *metadata_dev; ++ struct dm_dev *journal_dev = NULL; + fmode_t metadata_mode; + + /* +@@ -3230,7 +3249,21 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv) + goto out; + } + +- pool = __pool_find(dm_table_get_md(ti->table), metadata_dev->bdev, ++ if (journal_name) { ++ char buffer[64]; ++ normalise_journal_name_(journal_name, buffer, sizeof(buffer)); ++ if (buffer[0]) { ++ r = dm_get_device(ti, buffer, FMODE_READ | FMODE_WRITE, &journal_dev); ++ if (r) { ++ pr_alert("couldn't open journal device '%s'", buffer); ++ journal_dev = NULL; ++ } else { ++ pr_alert("opened journal device '%s'", buffer); ++ } ++ } ++ } ++ ++ pool = __pool_find(dm_table_get_md(ti->table), metadata_dev->bdev, journal_dev ? 
journal_dev->bdev : NULL, + block_size, pf.mode == PM_READ_ONLY, &ti->error, &pool_created); + if (IS_ERR(pool)) { + r = PTR_ERR(pool); +@@ -3253,6 +3286,7 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv) + pt->ti = ti; + pt->metadata_dev = metadata_dev; + pt->data_dev = data_dev; ++ pt->journal_dev = journal_dev; + pt->low_water_blocks = low_water_blocks; + pt->adjusted_pf = pt->requested_pf = pf; + ti->num_flush_bios = 1; +@@ -4400,6 +4434,7 @@ module_exit(dm_thin_exit); + module_param_named(no_space_timeout, no_space_timeout_secs, uint, S_IRUGO | S_IWUSR); + MODULE_PARM_DESC(no_space_timeout, "Out of data space queue IO timeout in seconds"); + ++ + MODULE_DESCRIPTION(DM_NAME " thin provisioning target"); + MODULE_AUTHOR("Joe Thornber "); + MODULE_LICENSE("GPL"); +diff --git a/drivers/md/persistent-data/dm-block-manager.c b/drivers/md/persistent-data/dm-block-manager.c +index 492a3f8ac119..b1f773cb037f 100644 +--- a/drivers/md/persistent-data/dm-block-manager.c ++++ b/drivers/md/persistent-data/dm-block-manager.c +@@ -291,6 +291,7 @@ static int bl_down_write(struct block_lock *lock) + static void bl_up_write(struct block_lock *lock) + { + spin_lock(&lock->lock); ++ BUG_ON(lock->count != -1); + __del_holder(lock, current); + lock->count = 0; + if (!list_empty(&lock->waiters)) +@@ -343,13 +344,16 @@ void *dm_block_data(struct dm_block *b) + } + EXPORT_SYMBOL_GPL(dm_block_data); + ++// FIXME: test to see if it's worth reducing this ++#define CHECKSUM_SIZE 32 ++#define NR_CHECKSUMS (4096 / CHECKSUM_SIZE) ++ + struct buffer_aux { + struct dm_block_validator *validator; ++ struct block_lock lock; + int write_locked; + +-#ifdef CONFIG_DM_DEBUG_BLOCK_MANAGER_LOCKING +- struct block_lock lock; +-#endif ++ uint32_t checksums[NR_CHECKSUMS]; + }; + + static void dm_block_manager_alloc_callback(struct dm_buffer *buf) +@@ -368,69 +372,43 @@ static void dm_block_manager_write_callback(struct dm_buffer *buf) + } + } + +-/*---------------------------------------------------------------- +- * Public interface +- *--------------------------------------------------------------*/ +-struct dm_block_manager { ++/*--------------------------------------------------------------*/ ++ ++struct block_manager { ++ struct dm_block_manager bm; + struct dm_bufio_client *bufio; + bool read_only:1; + }; + +-struct dm_block_manager *dm_block_manager_create(struct block_device *bdev, +- unsigned block_size, +- unsigned max_held_per_thread) +-{ +- int r; +- struct dm_block_manager *bm; +- +- bm = kmalloc(sizeof(*bm), GFP_KERNEL); +- if (!bm) { +- r = -ENOMEM; +- goto bad; +- } +- +- bm->bufio = dm_bufio_client_create(bdev, block_size, max_held_per_thread, +- sizeof(struct buffer_aux), +- dm_block_manager_alloc_callback, +- dm_block_manager_write_callback); +- if (IS_ERR(bm->bufio)) { +- r = PTR_ERR(bm->bufio); +- kfree(bm); +- goto bad; +- } +- +- bm->read_only = false; +- +- return bm; ++#define DECLARE_BM struct block_manager *bm = container_of(dbm, struct block_manager, bm) + +-bad: +- return ERR_PTR(r); +-} +-EXPORT_SYMBOL_GPL(dm_block_manager_create); +- +-void dm_block_manager_destroy(struct dm_block_manager *bm) ++static void _destroy(struct dm_block_manager *dbm) + { ++ DECLARE_BM; ++ + dm_bufio_client_destroy(bm->bufio); + kfree(bm); + } +-EXPORT_SYMBOL_GPL(dm_block_manager_destroy); + +-unsigned dm_bm_block_size(struct dm_block_manager *bm) ++static unsigned _block_size(struct dm_block_manager *dbm) + { ++ DECLARE_BM; + return dm_bufio_get_block_size(bm->bufio); + } 
+-EXPORT_SYMBOL_GPL(dm_bm_block_size); + +-dm_block_t dm_bm_nr_blocks(struct dm_block_manager *bm) ++static dm_block_t _nr_blocks(struct dm_block_manager *dbm) + { ++ DECLARE_BM; + return dm_bufio_get_device_size(bm->bufio); + } + +-static int dm_bm_validate_buffer(struct dm_block_manager *bm, +- struct dm_buffer *buf, +- struct buffer_aux *aux, +- struct dm_block_validator *v) ++static int _validate_buffer(struct dm_block_manager *dbm, ++ struct dm_buffer *buf, ++ struct buffer_aux *aux, ++ struct dm_block_validator *v) + { ++ DECLARE_BM; ++ + if (unlikely(!aux->validator)) { + int r; + if (!v) +@@ -453,10 +431,18 @@ static int dm_bm_validate_buffer(struct dm_block_manager *bm, + + return 0; + } +-int dm_bm_read_lock(struct dm_block_manager *bm, dm_block_t b, +- struct dm_block_validator *v, +- struct dm_block **result) ++ ++static void _prefetch(struct dm_block_manager *dbm, dm_block_t b) + { ++ DECLARE_BM; ++ dm_bufio_prefetch(bm->bufio, b, 1); ++} ++ ++static int _read_lock(struct dm_block_manager *dbm, dm_block_t b, ++ struct dm_block_validator *v, ++ struct dm_block **result) ++{ ++ DECLARE_BM; + struct buffer_aux *aux; + void *p; + int r; +@@ -475,7 +461,7 @@ int dm_bm_read_lock(struct dm_block_manager *bm, dm_block_t b, + + aux->write_locked = 0; + +- r = dm_bm_validate_buffer(bm, to_buffer(*result), aux, v); ++ r = dm_bm_validate_buffer(dbm, to_buffer(*result), aux, v); + if (unlikely(r)) { + bl_up_read(&aux->lock); + dm_bufio_release(to_buffer(*result)); +@@ -484,12 +470,12 @@ int dm_bm_read_lock(struct dm_block_manager *bm, dm_block_t b, + + return 0; + } +-EXPORT_SYMBOL_GPL(dm_bm_read_lock); + +-int dm_bm_write_lock(struct dm_block_manager *bm, +- dm_block_t b, struct dm_block_validator *v, +- struct dm_block **result) ++static int _write_lock(struct dm_block_manager *dbm, ++ dm_block_t b, struct dm_block_validator *v, ++ struct dm_block **result) + { ++ DECLARE_BM; + struct buffer_aux *aux; + void *p; + int r; +@@ -511,7 +497,7 @@ int dm_bm_write_lock(struct dm_block_manager *bm, + + aux->write_locked = 1; + +- r = dm_bm_validate_buffer(bm, to_buffer(*result), aux, v); ++ r = dm_bm_validate_buffer(dbm, to_buffer(*result), aux, v); + if (unlikely(r)) { + bl_up_write(&aux->lock); + dm_bufio_release(to_buffer(*result)); +@@ -520,12 +506,12 @@ int dm_bm_write_lock(struct dm_block_manager *bm, + + return 0; + } +-EXPORT_SYMBOL_GPL(dm_bm_write_lock); + +-int dm_bm_read_try_lock(struct dm_block_manager *bm, +- dm_block_t b, struct dm_block_validator *v, +- struct dm_block **result) ++static int _read_try_lock(struct dm_block_manager *dbm, ++ dm_block_t b, struct dm_block_validator *v, ++ struct dm_block **result) + { ++ DECLARE_BM; + struct buffer_aux *aux; + void *p; + int r; +@@ -545,7 +531,7 @@ int dm_bm_read_try_lock(struct dm_block_manager *bm, + } + aux->write_locked = 0; + +- r = dm_bm_validate_buffer(bm, to_buffer(*result), aux, v); ++ r = dm_bm_validate_buffer(dbm, to_buffer(*result), aux, v); + if (unlikely(r)) { + bl_up_read(&aux->lock); + dm_bufio_release(to_buffer(*result)); +@@ -555,10 +541,11 @@ int dm_bm_read_try_lock(struct dm_block_manager *bm, + return 0; + } + +-int dm_bm_write_lock_zero(struct dm_block_manager *bm, +- dm_block_t b, struct dm_block_validator *v, +- struct dm_block **result) ++static int _write_lock_zero(struct dm_block_manager *dbm, ++ dm_block_t b, struct dm_block_validator *v, ++ struct dm_block **result) + { ++ DECLARE_BM; + int r; + struct buffer_aux *aux; + void *p; +@@ -570,7 +557,7 @@ int dm_bm_write_lock_zero(struct dm_block_manager *bm, + 
if (unlikely(IS_ERR(p))) + return PTR_ERR(p); + +- memset(p, 0, dm_bm_block_size(bm)); ++ memset(p, 0, dm_bm_block_size(dbm)); + + aux = dm_bufio_get_aux_data(to_buffer(*result)); + r = bl_down_write(&aux->lock); +@@ -584,9 +571,8 @@ int dm_bm_write_lock_zero(struct dm_block_manager *bm, + + return 0; + } +-EXPORT_SYMBOL_GPL(dm_bm_write_lock_zero); + +-void dm_bm_unlock(struct dm_block *b) ++static void _unlock(struct dm_block_manager *bm, struct dm_block *b) + { + struct buffer_aux *aux; + aux = dm_bufio_get_aux_data(to_buffer(b)); +@@ -599,39 +585,579 @@ void dm_bm_unlock(struct dm_block *b) + + dm_bufio_release(to_buffer(b)); + } +-EXPORT_SYMBOL_GPL(dm_bm_unlock); + +-int dm_bm_flush(struct dm_block_manager *bm) ++static int _flush(struct dm_block_manager *dbm) + { ++ DECLARE_BM; ++ + if (bm->read_only) + return -EPERM; + + return dm_bufio_write_dirty_buffers(bm->bufio); + } +-EXPORT_SYMBOL_GPL(dm_bm_flush); + +-void dm_bm_prefetch(struct dm_block_manager *bm, dm_block_t b) ++static int _flush_and_unlock(struct dm_block_manager *dbm, ++ struct dm_block *superblock) + { +- dm_bufio_prefetch(bm->bufio, b, 1); ++ DECLARE_BM; ++ int r; ++ ++ if (bm->read_only) ++ return -EPERM; ++ ++ r = dm_bufio_write_dirty_buffers(bm->bufio); ++ if (unlikely(r)) { ++ dm_bm_unlock(dbm, superblock); ++ return r; ++ } ++ ++ dm_bm_unlock(dbm, superblock); ++ ++ return dm_bufio_write_dirty_buffers(bm->bufio); + } + +-bool dm_bm_is_read_only(struct dm_block_manager *bm) ++static bool _is_read_only(struct dm_block_manager *dbm) + { ++ DECLARE_BM; + return bm->read_only; + } +-EXPORT_SYMBOL_GPL(dm_bm_is_read_only); + +-void dm_bm_set_read_only(struct dm_block_manager *bm) ++static void _set_read_only(struct dm_block_manager *dbm) + { ++ DECLARE_BM; + bm->read_only = true; + } +-EXPORT_SYMBOL_GPL(dm_bm_set_read_only); + +-void dm_bm_set_read_write(struct dm_block_manager *bm) ++static void _set_read_write(struct dm_block_manager *dbm) + { ++ DECLARE_BM; + bm->read_only = false; + } +-EXPORT_SYMBOL_GPL(dm_bm_set_read_write); ++#undef DECLARE_BM ++ ++static void _check_bm_filled_out(struct dm_block_manager *dbm) ++{ ++ BUG_ON(!dbm->destroy); ++ BUG_ON(!dbm->block_size); ++ BUG_ON(!dbm->nr_blocks); ++ BUG_ON(!dbm->validate_buffer); ++ BUG_ON(!dbm->prefetch); ++ BUG_ON(!dbm->read_lock_); ++ BUG_ON(!dbm->write_lock_); ++ BUG_ON(!dbm->read_try_lock_); ++ BUG_ON(!dbm->write_lock_zero); ++ BUG_ON(!dbm->unlock); ++ BUG_ON(!dbm->flush); ++ BUG_ON(!dbm->flush_and_unlock); ++ BUG_ON(!dbm->is_read_only); ++ BUG_ON(!dbm->set_read_only); ++ BUG_ON(!dbm->set_read_write); ++} ++ ++struct dm_block_manager *dm_block_manager_create(struct block_device *bdev, ++ unsigned block_size, ++ unsigned max_held_per_thread) ++{ ++ int r; ++ struct block_manager *bm; ++ ++ bm = kmalloc(sizeof(*bm), GFP_KERNEL); ++ if (!bm) { ++ r = -ENOMEM; ++ goto bad; ++ } ++ ++ bm->bm.destroy = _destroy; ++ bm->bm.block_size = _block_size; ++ bm->bm.nr_blocks = _nr_blocks; ++ bm->bm.validate_buffer = _validate_buffer; ++ bm->bm.prefetch = _prefetch; ++ bm->bm.read_lock_ = _read_lock; ++ bm->bm.write_lock_ = _write_lock; ++ bm->bm.read_try_lock_ = _read_try_lock; ++ bm->bm.write_lock_zero = _write_lock_zero; ++ bm->bm.unlock = _unlock; ++ bm->bm.flush = _flush; ++ bm->bm.flush_and_unlock = _flush_and_unlock; ++ bm->bm.is_read_only = _is_read_only; ++ bm->bm.set_read_only = _set_read_only; ++ bm->bm.set_read_write = _set_read_write; ++ ++ bm->bufio = dm_bufio_client_create(bdev, block_size, max_held_per_thread, ++ sizeof(struct buffer_aux), ++ 
dm_block_manager_alloc_callback, ++ dm_block_manager_write_callback); ++ ++ if (IS_ERR(bm->bufio)) { ++ r = PTR_ERR(bm->bufio); ++ kfree(bm); ++ goto bad; ++ } ++ ++ bm->read_only = false; ++ ++ _check_bm_filled_out(&bm->bm); ++ ++ pr_alert("created real at %p\n", &bm->bm); ++ return &bm->bm; ++ ++bad: ++ return ERR_PTR(r); ++} ++EXPORT_SYMBOL_GPL(dm_block_manager_create); ++ ++/*----------------------------------------------------------------*/ ++ ++enum msg_type { ++ MT_OPEN_JOURNAL, ++ MT_CLOSE_JOURNAL, ++ ++ MT_READ_LOCK, ++ MT_WRITE_LOCK, ++ MT_ZERO_LOCK, ++ MT_TRY_READ_LOCK, ++ MT_UNLOCK, ++ MT_VERIFY, ++ MT_PREPARE, ++ MT_FLUSH, ++ MT_FLUSH_AND_UNLOCK, ++ MT_PREFETCH, ++ MT_SET_READ_ONLY, ++ MT_SET_READ_WRITE, ++}; ++ ++struct byte_stream { ++ spinlock_t lock; ++ struct block_device *dev; ++ struct dm_bufio_client *cache; ++ ++ uint64_t block_index; ++ struct dm_buffer *current_buffer; ++ void *current_data; ++ uint8_t *out_begin; ++ uint8_t *out_end; ++}; ++ ++#define JOURNAL_BLOCK_SIZE (1024 * 1024 * 1024) ++ ++// We just BUG if there's an error; this is development code. ++static void _prep_block(struct byte_stream *bs, uint64_t block) ++{ ++ bs->block_index = block; /* track the current block, so the overflow path below advances rather than reusing block 1 forever */ ++ bs->current_data = dm_bufio_new(bs->cache, block, &bs->current_buffer); ++ BUG_ON(!bs->current_data); ++ bs->out_begin = bs->current_data; ++ bs->out_end = bs->current_data + JOURNAL_BLOCK_SIZE; ++} ++ ++static void _commit_block(struct byte_stream *bs) ++{ ++ dm_bufio_mark_buffer_dirty(bs->current_buffer); ++ dm_bufio_release(bs->current_buffer); ++} ++ ++static struct byte_stream *_bs_open(struct block_device *dev) ++{ ++ struct byte_stream *bs = kzalloc(sizeof(*bs), GFP_KERNEL); ++ ++ if (!bs) ++ return NULL; ++ ++ spin_lock_init(&bs->lock); ++ bs->dev = dev; ++ bs->cache = dm_bufio_client_create(dev, JOURNAL_BLOCK_SIZE, ++ 2, 0, NULL, NULL); ++ if (!bs->cache) { ++ kfree(bs); ++ return NULL; ++ } ++ ++ _prep_block(bs, 0); ++ ++ return bs; ++} ++ ++static void _bs_close(struct byte_stream *bs) ++{ ++ _commit_block(bs); ++ dm_bufio_client_destroy(bs->cache); ++ kfree(bs); ++} ++ ++static size_t _cpy_bytes(struct byte_stream *bs, uint8_t *b, uint8_t *e) ++{ ++ size_t len = min(e - b, bs->out_end - bs->out_begin); ++ memcpy(bs->out_begin, b, len); ++ bs->out_begin += len; ++ return len; ++} ++ ++static bool _no_space(struct byte_stream *bs) ++{ ++ return bs->out_begin == bs->out_end; ++} ++ ++static void _push_bytes(struct byte_stream *bs, uint8_t *b, uint8_t *e) ++{ ++ while (b != e) { ++ if (_no_space(bs)) { ++ pr_alert("push_bytes: out of space\n"); ++ _commit_block(bs); ++ _prep_block(bs, bs->block_index + 1); ++ pr_alert("done"); ++ } ++ ++ b += _cpy_bytes(bs, b, e); ++ } ++} ++ ++static void _push_u8(struct byte_stream *bs, uint8_t v) ++{ ++ _push_bytes(bs, &v, &v + 1); ++} ++ ++static void _push_u16(struct byte_stream *bs, uint16_t v) ++{ ++ _push_bytes(bs, (uint8_t *) &v, (uint8_t *) (&v + 1)); ++} ++ ++static void _push_u64(struct byte_stream *bs, uint64_t v) ++{ ++ _push_bytes(bs, (uint8_t *) &v, (uint8_t *) (&v + 1)); ++} ++ ++static void _push_msg(struct byte_stream *bs, enum msg_type t, int err) ++{ ++ uint8_t b = t << 1; ++ b |= err ? 
0 : 1; ++ _push_u8(bs, b); ++} ++ ++/*----------------------------------------------------------------*/ ++ ++static u32 _cs_chunk(const void *data, unsigned chunk) ++{ ++ return crc32c(0, data + (chunk * CHECKSUM_SIZE), CHECKSUM_SIZE); ++} ++ ++static void _calc_checksums(struct dm_block *b) ++{ ++ unsigned i; ++ const void *data = dm_block_data(b); ++ struct buffer_aux *aux = dm_bufio_get_aux_data((struct dm_buffer *) b); ++ ++ for (i = 0; i < NR_CHECKSUMS; i++) ++ aux->checksums[i] = _cs_chunk(data, i); ++} ++ ++static void _write_delta(struct byte_stream *bs, struct dm_block *b, unsigned chunk) ++{ ++ uint8_t *begin = dm_block_data(b) + (chunk * CHECKSUM_SIZE); ++ uint8_t *end = begin + CHECKSUM_SIZE; ++ ++ _push_u16(bs, chunk); ++ _push_bytes(bs, begin, end); ++} ++ ++static void _terminate_deltas(struct byte_stream *bs) ++{ ++ BUG_ON(NR_CHECKSUMS > 0xff); ++ _push_u16(bs, 0xffff); ++} ++ ++static void _push_deltas(struct byte_stream *bs, struct dm_block *b) ++{ ++ unsigned i; ++ uint32_t sum; ++ const void *data = dm_block_data(b); ++ struct buffer_aux *aux = dm_bufio_get_aux_data((struct dm_buffer *) b); ++ ++ if (aux->write_locked) ++ for (i = 0; i < NR_CHECKSUMS; i++) { ++ sum = _cs_chunk(data, i); ++ if (sum != aux->checksums[i]) ++ _write_delta(bs, b, i); ++ } ++ ++ _terminate_deltas(bs); ++} ++ ++/*----------------------------------------------------------------*/ ++ ++struct journal_bm { ++ struct dm_block_manager bm; ++ struct dm_block_manager *inner; ++ struct byte_stream *out; ++}; ++ ++#define DECLARE_BM struct journal_bm *bm = container_of(dbm, struct journal_bm, bm) ++ ++static void _j_destroy(struct dm_block_manager *dbm) ++{ ++ DECLARE_BM; ++ _push_msg(bm->out, MT_CLOSE_JOURNAL, true); ++ _bs_close(bm->out); ++ bm->inner->destroy(bm->inner); ++ kfree(bm); ++} ++ ++static unsigned _j_block_size(struct dm_block_manager *dbm) ++{ ++ DECLARE_BM; ++ return bm->inner->block_size(bm->inner); ++} ++ ++static dm_block_t _j_nr_blocks(struct dm_block_manager *dbm) ++{ ++ DECLARE_BM; ++ return bm->inner->nr_blocks(bm->inner); ++} ++ ++static int _j_validate_buffer(struct dm_block_manager *dbm, ++ struct dm_buffer *buf, ++ struct buffer_aux *aux, ++ struct dm_block_validator *v) ++{ ++ int r; ++ DECLARE_BM; ++ unsigned long flags; ++ ++ r = bm->inner->validate_buffer(bm->inner, buf, aux, v); ++ ++ spin_lock_irqsave(&bm->out->lock, flags); ++ _push_msg(bm->out, MT_VERIFY, r); ++ _push_u64(bm->out, dm_bufio_get_block_number(buf)); ++ spin_unlock_irqrestore(&bm->out->lock, flags); ++ ++ return r; ++} ++ ++static void _j_prefetch(struct dm_block_manager *dbm, dm_block_t b) ++{ ++ DECLARE_BM; ++ bm->inner->prefetch(bm->inner, b); ++} ++ ++static int _j_read_lock(struct dm_block_manager *dbm, dm_block_t b, ++ struct dm_block_validator *v, ++ struct dm_block **result) ++{ ++ int r; ++ DECLARE_BM; ++ unsigned long flags; ++ ++ r = bm->inner->read_lock_(bm->inner, b, v, result); ++ ++ // No need to calculate checksums for a read lock ++ spin_lock_irqsave(&bm->out->lock, flags); ++ _push_msg(bm->out, MT_READ_LOCK, r); ++ _push_u64(bm->out, b); ++ spin_unlock_irqrestore(&bm->out->lock, flags); ++ ++ return r; ++} ++ ++static int _j_write_lock(struct dm_block_manager *dbm, ++ dm_block_t b, struct dm_block_validator *v, ++ struct dm_block **result) ++{ ++ int r; ++ DECLARE_BM; ++ unsigned long flags; ++ ++ r = bm->inner->write_lock_(bm->inner, b, v, result); ++ if (!r) ++ _calc_checksums(*result); ++ ++ spin_lock_irqsave(&bm->out->lock, flags); ++ _push_msg(bm->out, MT_WRITE_LOCK, r); ++ 
_push_u64(bm->out, b); ++ spin_unlock_irqrestore(&bm->out->lock, flags); ++ ++ return r; ++} ++ ++static int _j_read_try_lock(struct dm_block_manager *dbm, ++ dm_block_t b, struct dm_block_validator *v, ++ struct dm_block **result) ++{ ++ int r; ++ DECLARE_BM; ++ unsigned long flags; ++ ++ r = bm->inner->read_try_lock_(bm->inner, b, v, result); ++ ++ // try_read_lock is called from request context, so we mustn't trigger io. ++ // FIXME: work out a way to journal this! ++ spin_lock_irqsave(&bm->out->lock, flags); ++ _push_msg(bm->out, MT_TRY_READ_LOCK, r); ++ _push_u64(bm->out, b); ++ spin_unlock_irqrestore(&bm->out->lock, flags); ++ ++ return r; ++} ++ ++static int _j_write_lock_zero(struct dm_block_manager *dbm, ++ dm_block_t b, struct dm_block_validator *v, ++ struct dm_block **result) ++{ ++ int r; ++ DECLARE_BM; ++ unsigned long flags; ++ ++ r = bm->inner->write_lock_zero(bm->inner, b, v, result); ++ if (!r) ++ _calc_checksums(*result); ++ ++ spin_lock_irqsave(&bm->out->lock, flags); ++ _push_msg(bm->out, MT_ZERO_LOCK, r); ++ _push_u64(bm->out, b); ++ spin_unlock_irqrestore(&bm->out->lock, flags); ++ ++ return r; ++} ++ ++static void _j_unlock(struct dm_block_manager *dbm, struct dm_block *b) ++{ ++ DECLARE_BM; ++ unsigned long flags; ++ ++ spin_lock_irqsave(&bm->out->lock, flags); ++ _push_msg(bm->out, MT_UNLOCK, 0); ++ _push_u64(bm->out, dm_block_location(b)); ++ _push_deltas(bm->out, b); ++ spin_unlock_irqrestore(&bm->out->lock, flags); ++ ++ bm->inner->unlock(bm->inner, b); ++} ++ ++static int _j_flush(struct dm_block_manager *dbm) ++{ ++ int r; ++ DECLARE_BM; ++ unsigned long flags; ++ ++ r = bm->inner->flush(bm->inner); ++ spin_lock_irqsave(&bm->out->lock, flags); ++ _push_msg(bm->out, MT_FLUSH, r); ++ spin_unlock_irqrestore(&bm->out->lock, flags); ++ return r; ++} ++ ++static int _j_flush_and_unlock(struct dm_block_manager *dbm, ++ struct dm_block *superblock) ++{ ++ DECLARE_BM; ++ unsigned long flags; ++ ++ pr_alert("flush_and_unlock\n"); ++ spin_lock_irqsave(&bm->out->lock, flags); ++ _push_msg(bm->out, MT_FLUSH_AND_UNLOCK, 0); ++ _push_u64(bm->out, dm_block_location(superblock)); ++ _push_deltas(bm->out, superblock); ++ spin_unlock_irqrestore(&bm->out->lock, flags); ++ ++ return bm->inner->flush_and_unlock(bm->inner, superblock); ++} ++ ++static bool _j_is_read_only(struct dm_block_manager *dbm) ++{ ++ DECLARE_BM; ++ ++ return bm->inner->is_read_only(bm->inner); ++} ++ ++static void _j_set_read_only(struct dm_block_manager *dbm) ++{ ++ DECLARE_BM; ++ unsigned long flags; ++ ++ bm->inner->set_read_only(bm->inner); ++ ++ spin_lock_irqsave(&bm->out->lock, flags); ++ _push_msg(bm->out, MT_SET_READ_ONLY, true); ++ spin_unlock_irqrestore(&bm->out->lock, flags); ++} ++ ++static void _j_set_read_write(struct dm_block_manager *dbm) ++{ ++ DECLARE_BM; ++ unsigned long flags; ++ ++ bm->inner->set_read_write(bm->inner); ++ ++ spin_lock_irqsave(&bm->out->lock, flags); ++ _push_msg(bm->out, MT_SET_READ_WRITE, true); ++ spin_unlock_irqrestore(&bm->out->lock, flags); ++} ++ ++#undef DECLARE_BM ++ ++static bool _unformatted_journal(struct byte_stream *bs) ++{ ++ // The journal is unformatted if the first sector (512 bytes) is zeroed. 
++ // FIXME: development stub. Nothing calls this helper yet and the ++ // byte stream has no read path, so conservatively report the ++ // journal as unformatted. ++ return true; ++} ++ ++struct dm_block_manager *dm_block_manager_create_with_journal(struct block_device *bdev, ++ unsigned block_size, ++ unsigned max_held_per_thread, ++ struct block_device *jdev) ++{ ++ struct journal_bm *jbm; ++ struct dm_block_manager *inner = dm_block_manager_create(bdev, block_size, max_held_per_thread); ++ ++ if (IS_ERR(inner)) ++ return inner; ++ ++ jbm = kmalloc(sizeof(*jbm), GFP_KERNEL); ++ if (!jbm) { ++ inner->destroy(inner); ++ return ERR_PTR(-ENOMEM); ++ } ++ ++ jbm->out = _bs_open(jdev); ++ if (!jbm->out) { ++ inner->destroy(inner); ++ kfree(jbm); ++ return ERR_PTR(-ENOMEM); ++ } ++ ++ jbm->bm.destroy = _j_destroy; ++ jbm->bm.block_size = _j_block_size; ++ jbm->bm.nr_blocks = _j_nr_blocks; ++ jbm->bm.validate_buffer = _j_validate_buffer; ++ jbm->bm.prefetch = _j_prefetch; ++ jbm->bm.read_lock_ = _j_read_lock; ++ jbm->bm.write_lock_ = _j_write_lock; ++ jbm->bm.read_try_lock_ = _j_read_try_lock; ++ jbm->bm.write_lock_zero = _j_write_lock_zero; ++ jbm->bm.unlock = _j_unlock; ++ jbm->bm.flush = _j_flush; ++ jbm->bm.flush_and_unlock = _j_flush_and_unlock; ++ jbm->bm.is_read_only = _j_is_read_only; ++ jbm->bm.set_read_only = _j_set_read_only; ++ jbm->bm.set_read_write = _j_set_read_write; ++ ++ _check_bm_filled_out(&jbm->bm); ++ ++ jbm->inner = inner; ++ ++ pr_alert("journalling block manager created\n"); ++ ++ _push_msg(jbm->out, MT_OPEN_JOURNAL, 0); ++ _push_u64(jbm->out, dm_bm_nr_blocks(inner)); ++ ++ return &jbm->bm; ++} ++EXPORT_SYMBOL_GPL(dm_block_manager_create_with_journal); ++ ++/*----------------------------------------------------------------*/ + + u32 dm_bm_checksum(const void *data, size_t len, u32 init_xor) + { +@@ -645,4 +1171,5 @@ MODULE_LICENSE("GPL"); + MODULE_AUTHOR("Joe Thornber "); + MODULE_DESCRIPTION("Immutable metadata library for dm"); + ++ + /*----------------------------------------------------------------*/ +diff --git a/drivers/md/persistent-data/dm-block-manager.h b/drivers/md/persistent-data/dm-block-manager.h +index e728937f376a..adb55f6aceac 100644 +--- a/drivers/md/persistent-data/dm-block-manager.h ++++ b/drivers/md/persistent-data/dm-block-manager.h +@@ -23,23 +23,8 @@ void *dm_block_data(struct dm_block *b); + + /*----------------------------------------------------------------*/ + +-/* +- * @name should be a unique identifier for the block manager, no longer +- * than 32 chars. +- * +- * @max_held_per_thread should be the maximum number of locks, read or +- * write, that an individual thread holds at any one time. 
+- */ +-struct dm_block_manager; +-struct dm_block_manager *dm_block_manager_create( +- struct block_device *bdev, unsigned block_size, +- unsigned max_held_per_thread); +-void dm_block_manager_destroy(struct dm_block_manager *bm); +- +-unsigned dm_bm_block_size(struct dm_block_manager *bm); +-dm_block_t dm_bm_nr_blocks(struct dm_block_manager *bm); +- +-/*----------------------------------------------------------------*/ ++struct dm_buffer; ++struct buffer_aux; + + /* + * The validator allows the caller to verify newly-read data and modify +@@ -57,44 +42,141 @@ struct dm_block_validator { + int (*check)(struct dm_block_validator *v, struct dm_block *b, size_t block_size); + }; + ++struct dm_block_manager { ++ void (*destroy)(struct dm_block_manager *bm); ++ unsigned (*block_size)(struct dm_block_manager *bm); ++ dm_block_t (*nr_blocks)(struct dm_block_manager *bm); ++ int (*validate_buffer)(struct dm_block_manager *bm, ++ struct dm_buffer *buf, ++ struct buffer_aux *aux, ++ struct dm_block_validator *v); ++ void (*prefetch)(struct dm_block_manager *bm, dm_block_t b); ++ int (*read_lock_)(struct dm_block_manager *bm, dm_block_t b, ++ struct dm_block_validator *v, ++ struct dm_block **result); ++ int (*write_lock_)(struct dm_block_manager *bm, ++ dm_block_t b, struct dm_block_validator *v, ++ struct dm_block **result); ++ int (*read_try_lock_)(struct dm_block_manager *bm, ++ dm_block_t b, struct dm_block_validator *v, ++ struct dm_block **result); ++ int (*write_lock_zero)(struct dm_block_manager *bm, ++ dm_block_t b, struct dm_block_validator *v, ++ struct dm_block **result); ++ void (*unlock)(struct dm_block_manager *bm, struct dm_block *b); ++ int (*flush)(struct dm_block_manager *bm); ++ int (*flush_and_unlock)(struct dm_block_manager *bm, ++ struct dm_block *superblock); ++ bool (*is_read_only)(struct dm_block_manager *bm); ++ void (*set_read_only)(struct dm_block_manager *bm); ++ void (*set_read_write)(struct dm_block_manager *bm); ++}; ++ ++/* ++ * @name should be a unique identifier for the block manager, no longer ++ * than 32 chars. ++ * ++ * @max_held_per_thread should be the maximum number of locks, read or ++ * write, that an individual thread holds at any one time. ++ */ ++ ++struct dm_block_manager *dm_block_manager_create( ++ struct block_device *bdev, unsigned block_size, ++ unsigned max_held_per_thread); ++ ++struct dm_block_manager *dm_block_manager_create_with_journal( ++ struct block_device *bdev, unsigned block_size, ++ unsigned max_held_per_thread, ++ struct block_device *jdev); ++ + /*----------------------------------------------------------------*/ + ++static inline void dm_block_manager_destroy(struct dm_block_manager *bm) ++{ ++ bm->destroy(bm); ++} ++ ++static inline unsigned dm_bm_block_size(struct dm_block_manager *bm) ++{ ++ return bm->block_size(bm); ++} ++ ++static inline dm_block_t dm_bm_nr_blocks(struct dm_block_manager *bm) ++{ ++ return bm->nr_blocks(bm); ++} ++ ++/*----------------------------------------------------------------*/ ++ ++static inline int dm_bm_validate_buffer(struct dm_block_manager *bm, ++ struct dm_buffer *buf, ++ struct buffer_aux *aux, ++ struct dm_block_validator *v) ++{ ++ return bm->validate_buffer(bm, buf, aux, v); ++} ++ + /* + * You can have multiple concurrent readers or a single writer holding a + * block lock. 
+ */ + ++static inline void dm_bm_prefetch(struct dm_block_manager *bm, dm_block_t b) ++{ ++ bm->prefetch(bm, b); ++} ++ + /* + * dm_bm_lock() locks a block and returns through @result a pointer to + * memory that holds a copy of that block. If you have write-locked the + * block then any changes you make to memory pointed to by @result will be + * written back to the disk sometime after dm_bm_unlock is called. + */ +-int dm_bm_read_lock(struct dm_block_manager *bm, dm_block_t b, +- struct dm_block_validator *v, +- struct dm_block **result); +- +-int dm_bm_write_lock(struct dm_block_manager *bm, dm_block_t b, +- struct dm_block_validator *v, +- struct dm_block **result); ++static inline int dm_bm_read_lock(struct dm_block_manager *bm, dm_block_t b, ++ struct dm_block_validator *v, ++ struct dm_block **result) ++{ ++ return bm->read_lock_(bm, b, v, result); ++} ++ ++static inline int dm_bm_write_lock(struct dm_block_manager *bm, dm_block_t b, ++ struct dm_block_validator *v, ++ struct dm_block **result) ++{ ++ return bm->write_lock_(bm, b, v, result); ++} + + /* + * The *_try_lock variants return -EWOULDBLOCK if the block isn't + * available immediately. + */ +-int dm_bm_read_try_lock(struct dm_block_manager *bm, dm_block_t b, +- struct dm_block_validator *v, +- struct dm_block **result); ++static inline int dm_bm_read_try_lock(struct dm_block_manager *bm, dm_block_t b, ++ struct dm_block_validator *v, ++ struct dm_block **result) ++{ ++ return bm->read_try_lock_(bm, b, v, result); ++} + + /* + * Use dm_bm_write_lock_zero() when you know you're going to + * overwrite the block completely. It saves a disk read. + */ +-int dm_bm_write_lock_zero(struct dm_block_manager *bm, dm_block_t b, +- struct dm_block_validator *v, +- struct dm_block **result); +- +-void dm_bm_unlock(struct dm_block *b); ++static inline int dm_bm_write_lock_zero(struct dm_block_manager *bm, dm_block_t b, ++ struct dm_block_validator *v, ++ struct dm_block **result) ++{ ++ return bm->write_lock_zero(bm, b, v, result); ++} ++ ++static inline void dm_bm_unlock(struct dm_block_manager *bm, struct dm_block *b) ++{ ++ bm->unlock(bm, b); ++} ++ ++static inline int dm_bm_flush(struct dm_block_manager *bm) ++{ ++ return bm->flush(bm); ++} + + /* + * It's a common idiom to have a superblock that should be committed last. +@@ -105,12 +187,11 @@ void dm_bm_unlock(struct dm_block *b); + * + * This method always blocks. + */ +-int dm_bm_flush(struct dm_block_manager *bm); +- +-/* +- * Request data is prefetched into the cache. +- */ +-void dm_bm_prefetch(struct dm_block_manager *bm, dm_block_t b); ++static inline int dm_bm_flush_and_unlock(struct dm_block_manager *bm, ++ struct dm_block *superblock) ++{ ++ return bm->flush_and_unlock(bm, superblock); ++} + + /* + * Switches the bm to a read only mode. Once read-only mode +@@ -123,9 +204,20 @@ void dm_bm_prefetch(struct dm_block_manager *bm, dm_block_t b); + * Additionally you should not use dm_bm_unlock_move, however no error will + * be returned if you do. 
+ */ +-bool dm_bm_is_read_only(struct dm_block_manager *bm); +-void dm_bm_set_read_only(struct dm_block_manager *bm); +-void dm_bm_set_read_write(struct dm_block_manager *bm); ++static inline void dm_bm_set_read_only(struct dm_block_manager *bm) ++{ ++ bm->set_read_only(bm); ++} ++ ++static inline bool dm_bm_is_read_only(struct dm_block_manager *bm) ++{ ++ return bm->is_read_only(bm); ++} ++ ++static inline void dm_bm_set_read_write(struct dm_block_manager *bm) ++{ ++ bm->set_read_write(bm); ++} + + u32 dm_bm_checksum(const void *data, size_t len, u32 init_xor); + +diff --git a/drivers/md/persistent-data/dm-transaction-manager.c b/drivers/md/persistent-data/dm-transaction-manager.c +index abe2c5dd0993..5b447efdc2fb 100644 +--- a/drivers/md/persistent-data/dm-transaction-manager.c ++++ b/drivers/md/persistent-data/dm-transaction-manager.c +@@ -225,7 +225,7 @@ int dm_tm_commit(struct dm_transaction_manager *tm, struct dm_block *root) + return -EWOULDBLOCK; + + wipe_shadow_table(tm); +- dm_bm_unlock(root); ++ dm_bm_unlock(tm->bm, root); + + return dm_bm_flush(tm->bm); + } +@@ -289,14 +289,14 @@ static int __shadow_block(struct dm_transaction_manager *tm, dm_block_t orig, + */ + r = dm_bm_write_lock_zero(tm->bm, new, v, result); + if (r) { +- dm_bm_unlock(orig_block); ++ dm_bm_unlock(tm->bm, orig_block); + return r; + } + + memcpy(dm_block_data(*result), dm_block_data(orig_block), + dm_bm_block_size(tm->bm)); + +- dm_bm_unlock(orig_block); ++ dm_bm_unlock(tm->bm, orig_block); + return r; + } + +@@ -344,7 +344,10 @@ EXPORT_SYMBOL_GPL(dm_tm_read_lock); + + void dm_tm_unlock(struct dm_transaction_manager *tm, struct dm_block *b) + { +- dm_bm_unlock(b); ++ if (tm->is_clone) ++ tm = tm->real; ++ ++ dm_bm_unlock(tm->bm, b); + } + EXPORT_SYMBOL_GPL(dm_tm_unlock); + diff --git a/thin-provisioning/commands.cc b/thin-provisioning/commands.cc index b5f8495..13dc76c 100644 --- a/thin-provisioning/commands.cc +++ b/thin-provisioning/commands.cc @@ -25,6 +25,7 @@ thin_provisioning::register_thin_commands(base::application &app) app.add_cmd(command::ptr(new thin_generate_metadata_cmd())); app.add_cmd(command::ptr(new thin_show_duplicates_cmd())); app.add_cmd(command::ptr(new thin_show_metadata_cmd())); + app.add_cmd(command::ptr(new thin_journal_cmd())); #endif } diff --git a/thin-provisioning/thin_journal.cc b/thin-provisioning/thin_journal.cc index b91fad8..b24eea1 100644 --- a/thin-provisioning/thin_journal.cc +++ b/thin-provisioning/thin_journal.cc @@ -38,17 +38,22 @@ byte_stream::read_bytes(uint8_t *b, uint8_t *e) { while (b != e) b += read_some_(b, e); + + assert(b == e); } void byte_stream::next_block_() { current_block_++; + cursor_ = 0; } size_t byte_stream::read_some_(uint8_t *b, uint8_t *e) { + assert(cursor_ <= JOURNAL_BLOCK_SIZE); + if (cursor_ == JOURNAL_BLOCK_SIZE) next_block_(); @@ -69,6 +74,29 @@ journal_msg::journal_msg(bool success) { } +open_journal_msg::open_journal_msg(uint64_t nr_metadata_blocks) + : journal_msg(true), + nr_metadata_blocks_(nr_metadata_blocks) +{ +} + +void +open_journal_msg::visit(journal_visitor &v) const +{ + v.visit(*this); +} + +close_journal_msg::close_journal_msg() + : journal_msg(true) +{ +} + +void +close_journal_msg::visit(journal_visitor &v) const +{ + v.visit(*this); +} + block_msg::block_msg(bool success, uint64_t index) : journal_msg(success), index_(index) { } @@ -228,9 +256,18 @@ journal::read_one_(struct journal_visitor &v) uint8_t header = read_<uint8_t>(); uint8_t t = header >> 1; uint8_t success = header & 0x1; - uint64_t index; + uint64_t index, 
nr_blocks; switch (static_cast<msg_type>(t)) { + case MT_OPEN_JOURNAL: + nr_blocks = read_<uint64_t>(); + v.visit(open_journal_msg(nr_blocks)); + break; + + case MT_CLOSE_JOURNAL: + v.visit(close_journal_msg()); + return false; + case MT_READ_LOCK: index = read_<uint64_t>(); v.visit(read_lock_msg(success, index)); @@ -273,6 +310,7 @@ journal::read_one_(struct journal_visitor &v) break; case MT_FLUSH_AND_UNLOCK: { + cerr << "reading flush_and_unlock msg\n"; index = read_<uint64_t>(); auto deltas = read_deltas_(); v.visit(flush_and_unlock_msg(success, index, deltas)); @@ -291,9 +329,6 @@ journal::read_one_(struct journal_visitor &v) case MT_SET_READ_WRITE: v.visit(set_read_write_msg()); break; - - case MT_END_OF_JOURNAL: - return false; } return true; @@ -302,14 +337,16 @@ journal::read_one_(struct journal_visitor &v) bool journal::read_delta_(delta_list &ds) { - uint8_t chunk = read_<uint8_t>(); + uint16_t chunk = read_<uint16_t>(); - if (chunk == 0xff) + if (chunk == 0xffff) return false; + assert(chunk < JOURNAL_NR_CHUNKS); + auto bytes = vector<uint8_t>(JOURNAL_CHUNK_SIZE, 0); in_.read_bytes(bytes.data(), bytes.data() + JOURNAL_CHUNK_SIZE); - ds.push_back(delta(chunk, bytes)); + ds.push_back(delta(chunk * JOURNAL_CHUNK_SIZE, bytes)); return true; } diff --git a/thin-provisioning/thin_journal.h b/thin-provisioning/thin_journal.h index 5a87351..1f8cbbe 100644 --- a/thin-provisioning/thin_journal.h +++ b/thin-provisioning/thin_journal.h @@ -27,8 +27,8 @@ namespace thin_provisioning { uint32_t const JOURNAL_BLOCK_SIZE = 256 * 1024; - uint32_t const JOURNAL_NR_CHUNKS = 32; - uint32_t const JOURNAL_CHUNK_SIZE = 4096 / JOURNAL_NR_CHUNKS; + uint32_t const JOURNAL_CHUNK_SIZE = 32; + uint32_t const JOURNAL_NR_CHUNKS = (4096 / JOURNAL_CHUNK_SIZE); class byte_stream { public: @@ -59,6 +59,17 @@ namespace thin_provisioning { bool success_; }; + struct open_journal_msg : public journal_msg { + open_journal_msg(uint64_t nr_metadata_blocks); + virtual void visit(journal_visitor &v) const; + uint64_t nr_metadata_blocks_; + }; + + struct close_journal_msg : public journal_msg { + close_journal_msg(); + virtual void visit(journal_visitor &v) const; + }; + struct block_msg : public journal_msg { block_msg(bool success, uint64_t index); uint64_t index_; @@ -148,6 +159,8 @@ namespace thin_provisioning { msg.visit(*this); } + virtual void visit(open_journal_msg const &msg) = 0; + virtual void visit(close_journal_msg const &msg) = 0; virtual void visit(read_lock_msg const &msg) = 0; virtual void visit(write_lock_msg const &msg) = 0; virtual void visit(zero_lock_msg const &msg) = 0; @@ -163,7 +176,10 @@ namespace thin_provisioning { }; enum msg_type { - MT_READ_LOCK = 0, + MT_OPEN_JOURNAL, + MT_CLOSE_JOURNAL, + + MT_READ_LOCK, MT_WRITE_LOCK, MT_ZERO_LOCK, MT_TRY_READ_LOCK, @@ -175,7 +191,6 @@ namespace thin_provisioning { MT_PREFETCH, MT_SET_READ_ONLY, MT_SET_READ_WRITE, - MT_END_OF_JOURNAL, }; class journal { diff --git a/thin-provisioning/thin_journal_check.cc b/thin-provisioning/thin_journal_check.cc index a33a255..0747d6f 100644 --- a/thin-provisioning/thin_journal_check.cc +++ b/thin-provisioning/thin_journal_check.cc @@ -54,6 +54,97 @@ using namespace thin_provisioning; //---------------------------------------------------------------- namespace { + class journal_display : public journal_visitor { + public: + journal_display(journal_visitor &inner) + : inner_(inner) { + } + + virtual void visit(open_journal_msg const &msg) { + cout << "open_journal\n"; + inner_.visit(msg); + } + + virtual void visit(close_journal_msg const &msg) { + cout << "close_journal\n"; + 
inner_.visit(msg); + } + + virtual void visit(read_lock_msg const &msg) { + if (interesting(msg.index_)) + cout << "read_lock " << msg.index_ << "\n"; + inner_.visit(msg); + } + + virtual void visit(write_lock_msg const &msg) { + if (interesting(msg.index_)) + cout << "write_lock " << msg.index_ << "\n"; + inner_.visit(msg); + } + + virtual void visit(zero_lock_msg const &msg) { + if (interesting(msg.index_)) + cout << "zero_lock " << msg.index_ << "\n"; + inner_.visit(msg); + } + + virtual void visit(try_read_lock_msg const &msg) { + if (interesting(msg.index_)) + cout << "try_read_lock " << msg.index_ << "\n"; + inner_.visit(msg); + } + + virtual void visit(unlock_msg const &msg) { + if (interesting(msg.index_)) + cout << "unlock " << msg.index_ << "\n"; + inner_.visit(msg); + } + + virtual void visit(verify_msg const &msg) { + if (interesting(msg.index_)) + cout << "verify " << msg.index_ << "\n"; + inner_.visit(msg); + } + + virtual void visit(prepare_msg const &msg) { + if (interesting(msg.index_)) + cout << "prepare " << msg.index_ << "\n"; + inner_.visit(msg); + } + + virtual void visit(flush_msg const &msg) { + cout << "flush\n"; + inner_.visit(msg); + } + + virtual void visit(flush_and_unlock_msg const &msg) { + if (interesting(msg.index_)) + cout << "flush_and_unlock " << msg.index_ << "\n"; + inner_.visit(msg); + } + + virtual void visit(prefetch_msg const &msg) { + if (interesting(msg.index_)) + cout << "prefetch " << msg.index_ << "\n"; + inner_.visit(msg); + } + + virtual void visit(set_read_only_msg const &msg) { + cout << "set_read_only\n"; + inner_.visit(msg); + } + + virtual void visit(set_read_write_msg const &msg) { + cout << "set_read_write\n"; + inner_.visit(msg); + } + + bool interesting(block_address b) const { + return true; + } + + journal_visitor &inner_; + }; unsigned const MAX_HELD_LOCKS = 16; @@ -62,28 +153,44 @@ namespace { // Need to track updates to the superblock to define transactions. 
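// Lock accounting below uses a single map from block to a signed count: // write locks are positive, read locks negative (see the block_map typedef // at the bottom of the class). Dropping the superblock's write lock is // what marks the end of a transaction and triggers commit_().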
class checker : public journal_visitor { public: - checker(block_address &nr_metadata_blocks) - : bm_(new block_manager<>("metadata.tmp", nr_metadata_blocks, MAX_HELD_LOCKS, block_manager<>::CREATE)) { + virtual void visit(open_journal_msg const &msg) { + bm_.reset(new block_manager<>("metadata.tmp", msg.nr_metadata_blocks_, + MAX_HELD_LOCKS, block_manager<>::CREATE)); + } + + virtual void visit(close_journal_msg const &msg) { + // noop } virtual void visit(read_lock_msg const &msg) { - read_lock_(msg.index_); + if (msg.success_) + read_lock_(msg.index_); } virtual void visit(write_lock_msg const &msg) { - write_lock_(msg.index_); + if (msg.success_) + write_lock_(msg.index_); } virtual void visit(zero_lock_msg const &msg) { - write_lock_(msg.index_); + if (msg.success_) { + write_lock_(msg.index_); + zero_(msg.index_); + } } virtual void visit(try_read_lock_msg const &msg) { - read_lock_(msg.index_); + if (msg.success_) + read_lock_(msg.index_); } virtual void visit(unlock_msg const &msg) { + bool write_locked = is_write_locked_(msg.index_); + unlock_(msg.index_, msg.deltas_); + + if (write_locked && msg.index_ == superblock_detail::SUPERBLOCK_LOCATION) + commit_(); } virtual void visit(verify_msg const &msg) { @@ -95,17 +202,17 @@ namespace { } virtual void visit(flush_msg const &msg) { - cerr << "spurious flush()\n"; + cout << "WARN: spurious flush()\n"; } virtual void visit(flush_and_unlock_msg const &msg) { if (msg.index_ != superblock_detail::SUPERBLOCK_LOCATION) { - cerr << "flush_and_unlock received for block " << msg.index_ + cout << "ERROR: flush_and_unlock received for block " << msg.index_ << ", which isn't the superblock\n"; - throw runtime_error("bad flush_and_unlock"); } - commit(msg.deltas_); + unlock_(msg.index_, msg.deltas_); + commit_(); } virtual void visit(prefetch_msg const &msg) { @@ -122,55 +229,68 @@ namespace { private: void read_lock_(block_address b) { - if (write_locks_.count(b)) { - cerr << "read lock taken concurrently with write lock for block " - << b << "\n"; - throw runtime_error("bad read lock"); - } + auto it = locks_.find(b); + if (it == locks_.end()) + locks_.insert(make_pair(b, -1)); - auto it = read_locks_.find(b); - if (it == read_locks_.end()) - read_locks_.insert(make_pair(b, 1)); - else - it->second++; + else if (it->second > 0) { + cout << "WARN: read lock taken concurrently with write lock for block " + << b << "\n"; + + } else + --it->second; } void write_lock_(block_address b) { - if (active_.count(b)) { - cerr << "write lock taken for block " + if (is_superblock_(b)) { + if (locks_.size()) + cout << "WARN: superblock taken when locks still held\n"; + + } else if (active_.count(b)) { + cout << "ERROR: write lock taken for block " << b << ", but it is still in the active transaction\n"; - throw runtime_error("bad write lock"); + throw runtime_error("bad write_lock"); } - if (write_locks_.count(b)) { - cerr << "write lock already held for block " - << b - << "\n"; - throw runtime_error("bad write lock"); - } + auto it = locks_.find(b); + if (it == locks_.end()) + locks_.insert(make_pair(b, 1)); - if (read_locks_.count(b)) { - cerr << "read lock requested for write locked block " + else if (it->second < 0) { + cout << "WARN: write lock requested for read locked block " << b << "\n"; - throw runtime_error("bad write lock"); - } - - write_locks_.insert(b); + } else + it->second++; } + bool is_write_locked_(block_address b) const { + auto it = locks_.find(b); + return it != locks_.end() && it->second > 0; + } void unlock_(block_address b, 
delta_list const &deltas) { - if (write_locks_.count(b)) { - write_locks_.erase(b); + auto it = locks_.find(b); + if (it == locks_.end() || !it->second) { + cout << "ERROR: unlock requested on block " << b << ", which isn't locked\n"; + throw runtime_error("bad unlock"); + } + if (it->second < 0) { + it->second++; + + if (deltas.size()) { + cout << "ERROR: unlocking a read lock for " << b << ", yet there are " << deltas.size() << " deltas\n"; + throw runtime_error("bad unlock"); + } + } else { auto wr = bm_->write_lock(b); for (auto &&d : deltas) { uint8_t *data = static_cast<uint8_t *>(wr.data()); if (d.offset_ + d.bytes_.size() > 4096) { - cerr << "delta for block " << b << " is out of range (" + cout << "ERROR: delta for block " << b << " is out of range (" << d.offset_ << ", " << d.offset_ + d.bytes_.size() << "]\n"; throw runtime_error("bad unlock"); } @@ -178,46 +298,28 @@ memcpy(data + d.offset_, d.bytes_.data(), d.bytes_.size()); } - } else { - auto it = read_locks_.find(b); - if (it == read_locks_.end()) { - cerr << "unlock requested on block " << b << ", which isn't locked\n"; - throw runtime_error("bad unlock"); - } - - if (deltas.size()) { - cerr << "unlocking a read lock for " << b << ", yet there are " << deltas.size() << " deltas\n"; - throw runtime_error("bad unlock"); - } - - // Decrement lock - if (!it->second) { - cerr << "read lock entry has zero count (internal error)\n"; - throw runtime_error("bad unlock"); - } - - if (!--it->second) - read_locks_.erase(it); - + it->second--; } + + if (!it->second) + locks_.erase(it); } - void commit(delta_list const &deltas) { + void zero_(block_address b) { + auto wr = bm_->write_lock_zero(b); + } + + void commit_() { + using namespace thin_provisioning::superblock_detail; + // At this point the only lock held should be the superblock, // and that should be a write lock. 
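// (NB: in both call paths commit_() runs after unlock_() has already // removed the superblock entry from the map, so in practice the map // should be empty by the time we get here.)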
- if (read_locks_.size()) { - cerr << "committing when the following read locks are still held:\n"; - for (auto &&p : read_locks_) - cerr << p.first << "\n"; - } - - - unlock_(superblock_detail::SUPERBLOCK_LOCATION, deltas); - - if (write_locks_.size()) { - cerr << "commit() called, but the following write locks are held:\n"; - for (auto &&b : write_locks_) - cerr << b << "\n"; + if (locks_.size() != 0) { + cout << "ERROR: committing when the following locks are still held:\n"; + for (auto &&p : locks_) + if (p.first != SUPERBLOCK_LOCATION) + cout << p.first << "\n"; + throw runtime_error("bad commit"); } build_active_set_(); @@ -226,6 +328,7 @@ void build_active_set_() { using namespace thin_provisioning::superblock_detail; + cerr << "build active set\n"; superblock sb = read_superblock(bm_); block_counter bc; @@ -242,20 +345,24 @@ for (auto &&p : bc.get_counts()) { if (!p.second) { - cerr << "weird zero count for block " << p.first << "\n"; - throw runtime_error("build_active_set() failed"); + cout << "weird zero count for block " << p.first << "\n"; } active_.insert(p.first); } } + bool is_superblock_(block_address b) const { + return b == superblock_detail::SUPERBLOCK_LOCATION; + } + typedef set<block_address> block_set; - typedef map<block_address, unsigned> block_map; + + // write locks positive, unlocked 0, read locks negative + typedef map<block_address, int> block_map; block_set active_; - block_set write_locks_; - block_map read_locks_; + block_map locks_; block_manager<>::ptr bm_; transaction_manager::ptr tm_; @@ -269,13 +376,14 @@ bool quiet; }; - void check(string const &path, block_address nr_metadata_blocks) { + void check(string const &path) { block_address journal_size = get_file_length(path) / JOURNAL_BLOCK_SIZE; block_manager<>::ptr bm( new block_manager<>(path, journal_size, 4, block_manager<>::READ_ONLY)); journal j(bm); - checker c(nr_metadata_blocks); + checker c; + journal_display dc(c); - j.read_journal(c); + j.read_journal(dc); } @@ -291,7 +399,7 @@ thin_journal_cmd::thin_journal_cmd() void thin_journal_cmd::usage(std::ostream &out) const { - out << "Usage: " << get_name() << " [options] {device|file} {nr blocks}" << endl + out << "Usage: " << get_name() << " [options] {device|file}" << endl << "Options:\n" << " {-q|--quiet}\n" << " {-h|--help}\n" @@ -332,7 +440,7 @@ thin_journal_cmd::run(int argc, char **argv) } } - if (argc - optind != 2) { + if (argc - optind != 1) { if (!fs.quiet) usage(cerr); @@ -340,7 +448,7 @@ try { - check(argv[optind], lexical_cast<block_address>(argv[optind + 1])); + check(argv[optind]); } catch (std::exception &e) { cerr << e.what() << "\n";
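For reference, the byte-level journal format that ties doc/bm-journal.patch to thin_journal_check is small enough to sketch in full. What follows is only an illustrative, stand-alone decoder, not the tool's implementation (that lives in thin-provisioning/thin_journal.cc); the cursor, skip_deltas and dump names are invented for this example, and it assumes a little-endian host plus the constants from thin_journal.h (32-byte chunks, delta lists terminated by the reserved chunk index 0xffff).

#include <cstdint>
#include <cstring>
#include <iostream>
#include <stdexcept>

namespace {
	// Must mirror the msg_type enums in thin_journal.h and the kernel patch.
	enum msg_type {
		MT_OPEN_JOURNAL, MT_CLOSE_JOURNAL,
		MT_READ_LOCK, MT_WRITE_LOCK, MT_ZERO_LOCK, MT_TRY_READ_LOCK,
		MT_UNLOCK, MT_VERIFY, MT_PREPARE, MT_FLUSH, MT_FLUSH_AND_UNLOCK,
		MT_PREFETCH, MT_SET_READ_ONLY, MT_SET_READ_WRITE,
	};

	uint32_t const CHUNK_SIZE = 32;		// JOURNAL_CHUNK_SIZE
	uint16_t const DELTA_TERMINATOR = 0xffff;

	struct cursor {
		uint8_t const *b, *e;

		void get(void *out, size_t len) {
			if (static_cast<size_t>(e - b) < len)
				throw std::runtime_error("journal truncated");
			memcpy(out, b, len);
			b += len;
		}

		uint8_t u8() { uint8_t v; get(&v, sizeof(v)); return v; }
		uint16_t u16() { uint16_t v; get(&v, sizeof(v)); return v; }
		uint64_t u64() { uint64_t v; get(&v, sizeof(v)); return v; }
	};

	// A delta list is a sequence of (u16 chunk index, 32 byte payload)
	// pairs; the byte offset within the metadata block is
	// chunk index * CHUNK_SIZE.
	void skip_deltas(cursor &c) {
		for (;;) {
			uint16_t chunk = c.u16();
			if (chunk == DELTA_TERMINATOR)
				break;
			uint8_t payload[CHUNK_SIZE];
			c.get(payload, sizeof(payload));
		}
	}

	// Every message starts with one header byte: (type << 1) | success.
	void dump(cursor &c) {
		for (;;) {
			uint8_t header = c.u8();
			unsigned type = header >> 1;
			bool success = header & 1;

			switch (type) {
			case MT_CLOSE_JOURNAL:
				return;		// end of journal

			case MT_OPEN_JOURNAL:
				std::cout << "open: nr metadata blocks = " << c.u64() << "\n";
				break;

			case MT_UNLOCK:
			case MT_FLUSH_AND_UNLOCK:
				std::cout << "unlock " << c.u64() << "\n";
				skip_deltas(c);
				break;

			case MT_FLUSH:
			case MT_SET_READ_ONLY:
			case MT_SET_READ_WRITE:
				break;		// header byte only

			default:
				// The remaining lock/verify/prepare/prefetch
				// messages carry just a block index.
				std::cout << "op " << type << " on block " << c.u64()
					  << (success ? "" : " (failed)") << "\n";
				break;
			}
		}
	}
}

Replaying the journal is then just a matter of applying each delta at offset chunk * CHUNK_SIZE within a shadow copy of the metadata device, which is what the checker above does against its metadata.tmp block manager, verifying the active transaction's invariants at every commit point.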