diff --git a/Makefile.in b/Makefile.in index 73326b1..e54fbd5 100644 --- a/Makefile.in +++ b/Makefile.in @@ -36,6 +36,7 @@ all: $(PROGRAMS) SOURCE=\ base/base64.cc \ + base/endian_utils.cc \ base/error_state.cc \ \ caching/hint_array.cc \ @@ -46,8 +47,9 @@ SOURCE=\ caching/restore_emitter.cc \ caching/xml_format.cc \ \ + era/era_detail.cc \ + \ persistent-data/checksum.cc \ - persistent-data/endian_utils.cc \ persistent-data/error_set.cc \ persistent-data/file_utils.cc \ persistent-data/hex_dump.cc \ @@ -55,6 +57,7 @@ SOURCE=\ persistent-data/transaction_manager.cc \ \ persistent-data/data-structures/bitset.cc \ + persistent-data/data-structures/bloom_filter.cc \ persistent-data/data-structures/btree.cc \ \ persistent-data/space_map.cc \ @@ -153,8 +156,9 @@ THIN_REPAIR_SOURCE=$(SOURCE) THIN_RESTORE_SOURCE=$(SOURCE) THIN_CHECK_SOURCE=\ base/error_state.cc \ + base/endian_utils.cc \ + \ persistent-data/checksum.cc \ - persistent-data/endian_utils.cc \ persistent-data/error_set.cc \ persistent-data/file_utils.cc \ persistent-data/hex_dump.cc \ @@ -165,6 +169,7 @@ THIN_CHECK_SOURCE=\ persistent-data/space-maps/recursive.cc \ persistent-data/space-maps/careful_alloc.cc \ persistent-data/transaction_manager.cc \ + \ thin-provisioning/device_tree.cc \ thin-provisioning/mapping_tree.cc \ thin-provisioning/metadata.cc \ @@ -172,8 +177,9 @@ THIN_CHECK_SOURCE=\ thin-provisioning/superblock.cc THIN_RMAP_SOURCE=\ + base/endian_utils.cc \ + \ persistent-data/checksum.cc \ - persistent-data/endian_utils.cc \ persistent-data/error_set.cc \ persistent-data/file_utils.cc \ persistent-data/hex_dump.cc \ @@ -232,8 +238,9 @@ thin_metadata_size: thin-provisioning/thin_metadata_size.o CACHE_CHECK_SOURCE=\ base/base64.cc \ base/error_state.cc \ + base/endian_utils.cc \ + \ persistent-data/checksum.cc \ - persistent-data/endian_utils.cc \ persistent-data/error_set.cc \ persistent-data/file_utils.cc \ persistent-data/hex_dump.cc \ @@ -245,6 +252,7 @@ CACHE_CHECK_SOURCE=\ persistent-data/space-maps/recursive.cc \ persistent-data/space-maps/careful_alloc.cc \ persistent-data/transaction_manager.cc \ + \ caching/hint_array.cc \ caching/superblock.cc \ caching/mapping_array.cc \ diff --git a/persistent-data/endian_utils.cc b/base/endian_utils.cc similarity index 100% rename from persistent-data/endian_utils.cc rename to base/endian_utils.cc diff --git a/persistent-data/endian_utils.h b/base/endian_utils.h similarity index 97% rename from persistent-data/endian_utils.h rename to base/endian_utils.h index 39276e6..3ad75ae 100644 --- a/persistent-data/endian_utils.h +++ b/base/endian_utils.h @@ -16,8 +16,8 @@ // with thin-provisioning-tools. If not, see // . -#ifndef ENDIAN_H -#define ENDIAN_H +#ifndef BASE_ENDIAN_H +#define BASE_ENDIAN_H #include #include @@ -25,7 +25,6 @@ //---------------------------------------------------------------- -// FIXME: rename to endian namespace base { // These are just little wrapper types to make the compiler diff --git a/caching/mapping_array.cc b/caching/mapping_array.cc index d31c2c9..c6af4ef 100644 --- a/caching/mapping_array.cc +++ b/caching/mapping_array.cc @@ -1,5 +1,5 @@ +#include "base/endian_utils.h" #include "caching/mapping_array.h" -#include "persistent-data/endian_utils.h" #include diff --git a/caching/metadata.h b/caching/metadata.h index f15543e..46c2b93 100644 --- a/caching/metadata.h +++ b/caching/metadata.h @@ -1,10 +1,11 @@ #ifndef CACHE_METADATA_H #define CACHE_METADATA_H +#include "base/endian_utils.h" + #include "persistent-data/block.h" #include "persistent-data/data-structures/array.h" #include "persistent-data/data-structures/bitset.h" -#include "persistent-data/endian_utils.h" #include "persistent-data/space-maps/disk.h" #include "persistent-data/transaction_manager.h" diff --git a/caching/superblock.h b/caching/superblock.h index 2c2cf30..d1a24aa 100644 --- a/caching/superblock.h +++ b/caching/superblock.h @@ -1,7 +1,7 @@ #ifndef CACHE_SUPERBLOCK_H #define CACHE_SUPERBLOCK_H -#include "persistent-data/endian_utils.h" +#include "base/endian_utils.h" #include "persistent-data/data-structures/btree.h" #include diff --git a/era/era_detail.cc b/era/era_detail.cc new file mode 100644 index 0000000..43fbc51 --- /dev/null +++ b/era/era_detail.cc @@ -0,0 +1,36 @@ +#include "era/era_detail.h" + +#include + +using namespace base; +using namespace era; + +//---------------------------------------------------------------- + +namespace { + le32 pack_hash_detail(uint32_t hash1, uint32_t hash2, uint32_t nr_probes) { + throw std::runtime_error("not implemented"); + } + + void unpack_hash_detail(le32 packed, uint32_t &hash1, uint32_t &hash2, uint32_t &nr_probes) { + throw std::runtime_error("not implemented"); + } +} + +void +era_detail_traits::unpack(disk_type const &disk, value_type &value) +{ + value.nr_bits = to_cpu(disk.nr_bits); + unpack_hash_detail(disk.hash_fns_and_probes, value.hash1, value.hash2, value.nr_probes); + value.bloom_root = to_cpu(disk.bloom_root); +} + +void +era_detail_traits::pack(value_type const &value, disk_type &disk) +{ + disk.nr_bits = to_disk(value.nr_bits); + disk.hash_fns_and_probes = pack_hash_detail(value.hash1, value.hash2, value.nr_probes); + disk.bloom_root = to_disk(value.bloom_root); +} + +//---------------------------------------------------------------- diff --git a/era/era_detail.h b/era/era_detail.h new file mode 100644 index 0000000..adaec57 --- /dev/null +++ b/era/era_detail.h @@ -0,0 +1,36 @@ +#ifndef ERA_DETAIL_H +#define ERA_DETAIL_H + +#include "base/endian_utils.h" + +//---------------------------------------------------------------- + +namespace era { + struct era_detail_disk { + base::le32 nr_bits; + base::le32 hash_fns_and_probes; + base::le64 bloom_root; + } __attribute__ ((packed)); + + struct era_detail { + uint32_t nr_bits; + + uint32_t hash1; + uint32_t hash2; + uint32_t nr_probes; + + uint64_t bloom_root; + }; + + struct era_detail_traits { + typedef era_detail_disk disk_type; + typedef era_detail value_type; + + static void unpack(disk_type const &disk, value_type &value); + static void pack(value_type const &value, disk_type &disk); + }; +} + +//---------------------------------------------------------------- + +#endif diff --git a/persistent-data/data-structures/array_block.h b/persistent-data/data-structures/array_block.h index 1bb3d4c..638e164 100644 --- a/persistent-data/data-structures/array_block.h +++ b/persistent-data/data-structures/array_block.h @@ -19,7 +19,7 @@ #ifndef ARRAY_BLOCK_H #define ARRAY_BLOCK_H -#include "persistent-data/endian_utils.h" +#include "base/endian_utils.h" //---------------------------------------------------------------- diff --git a/persistent-data/data-structures/bitset.cc b/persistent-data/data-structures/bitset.cc index 1570fc1..6bab965 100644 --- a/persistent-data/data-structures/bitset.cc +++ b/persistent-data/data-structures/bitset.cc @@ -46,6 +46,10 @@ namespace persistent_data { return array_.get_root(); } + unsigned get_nr_bits() const { + return nr_bits_; + } + void grow(unsigned new_nr_bits, bool default_value) { pad_last_block(default_value); resize_array(new_nr_bits, default_value); @@ -184,7 +188,7 @@ namespace persistent_data { if (n >= nr_bits_) { std::ostringstream str; str << "bitset index out of bounds (" - << n << " >= " << nr_bits_ << endl; + << n << " >= " << nr_bits_ << ")"; throw runtime_error(str.str()); } } @@ -214,6 +218,12 @@ persistent_data::bitset::get_root() const return impl_->get_root(); } +unsigned +bitset::get_nr_bits() const +{ + return impl_->get_nr_bits(); +} + void persistent_data::bitset::grow(unsigned new_nr_bits, bool default_value) { diff --git a/persistent-data/data-structures/bitset.h b/persistent-data/data-structures/bitset.h index a6e90ae..70688aa 100644 --- a/persistent-data/data-structures/bitset.h +++ b/persistent-data/data-structures/bitset.h @@ -16,8 +16,8 @@ // with thin-provisioning-tools. If not, see // . -#ifndef BITSET_H -#define BITSET_H +#ifndef PERSISTENT_DATA_DATA_STRUCTURES_BITSET_H +#define PERSISTENT_DATA_DATA_STRUCTURES_BITSET_H #include "persistent-data/run.h" @@ -54,6 +54,7 @@ namespace persistent_data { bitset(tm_ptr tm); bitset(tm_ptr tm, block_address root, unsigned nr_bits); block_address get_root() const; + unsigned get_nr_bits() const; void grow(unsigned new_nr_bits, bool default_value); void destroy(); diff --git a/persistent-data/data-structures/bloom_filter.cc b/persistent-data/data-structures/bloom_filter.cc new file mode 100644 index 0000000..3ca6ffb --- /dev/null +++ b/persistent-data/data-structures/bloom_filter.cc @@ -0,0 +1,146 @@ +#include "persistent-data/data-structures/bloom_filter.h" + +#include + +using namespace persistent_data; + +//---------------------------------------------------------------- + +namespace { + static const uint64_t m1 = 0x9e37fffffffc0001UL; + static const unsigned bits = 18; + + static uint32_t hash1(block_address const &b) { + return (b * m1) >> bits; + } + + static uint32_t hash2(block_address const &b) { + uint32_t n = b; + + n = n ^ (n >> 16); + n = n * 0x85ebca6bu; + n = n ^ (n >> 13); + n = n * 0xc2b2ae35u; + n = n ^ (n >> 16); + + return n; + } + + void check_power_of_two(unsigned nr_bits) { + if (nr_bits & (nr_bits - 1)) + throw std::runtime_error("bloom filter needs a power of two nr_bits"); + } +} + +//---------------------------------------------------------------- + +bloom_filter::bloom_filter(tm_ptr tm, + unsigned nr_bits, unsigned nr_probes) + : tm_(tm), + bits_(tm), + nr_probes_(nr_probes), + mask_(nr_bits - 1) +{ + check_power_of_two(nr_bits); + bits_.grow(nr_bits, false); +} + +bloom_filter::bloom_filter(tm_ptr tm, block_address root, + unsigned nr_bits, unsigned nr_probes) + : tm_(tm), + bits_(tm, root, nr_bits), + nr_probes_(nr_probes), + mask_(nr_bits - 1) +{ + check_power_of_two(nr_bits); +} + +block_address +bloom_filter::get_root() const +{ + return bits_.get_root(); +} + +bool +bloom_filter::test(uint64_t b) +{ + vector probes(nr_probes_); + fill_probes(b, probes); + + for (unsigned p = 0; p < nr_probes_; p++) + if (!bits_.get(probes[p])) + return false; + + return true; +} + +void +bloom_filter::set(uint64_t b) +{ + vector probes(nr_probes_); + fill_probes(b, probes); + + for (unsigned p = 0; p < nr_probes_; p++) + bits_.set(probes[p], true); +} + +void +bloom_filter::flush() +{ + bits_.flush(); +} + +void +bloom_filter::fill_probes(block_address b, vector &probes) const +{ + uint32_t h1 = hash1(b) & mask_; + uint32_t h2 = hash2(b) & mask_; + + probes[0] = h1; + for (unsigned p = 1; p < nr_probes_; p++) { + h1 = (h1 + h2) & mask_; + h2 = (h2 + p) & mask_; + probes[p] = h1; + } +} + +void +bloom_filter::print_debug(ostream &out) +{ + print_residency(out); + + map runs; + + for (unsigned i = 0; i < bits_.get_nr_bits();) { + bool v = bits_.get(i); + unsigned run_length = 1; + + while (++i < bits_.get_nr_bits() && bits_.get(i) == v) + run_length++; + + map::iterator it = runs.find(run_length); + if (it != runs.end()) + it->second++; + else + runs.insert(make_pair(run_length, 1)); + } + + { + map::const_iterator it; + for (it = runs.begin(); it != runs.end(); ++it) + out << it->first << ": " << it->second << endl; + } +} + +void +bloom_filter::print_residency(ostream &out) +{ + unsigned count = 0; + for (unsigned i = 0; i < bits_.get_nr_bits(); i++) + if (bits_.get(i)) + count++; + + out << "residency: " << count << "/" << bits_.get_nr_bits() << endl; +} + +//---------------------------------------------------------------- diff --git a/persistent-data/data-structures/bloom_filter.h b/persistent-data/data-structures/bloom_filter.h new file mode 100644 index 0000000..6703a7d --- /dev/null +++ b/persistent-data/data-structures/bloom_filter.h @@ -0,0 +1,47 @@ +#ifndef PERSISTENT_DATA_DATA_STRUCTURES_BLOOM_FILTER_H +#define PERSISTENT_DATA_DATA_STRUCTURES_BLOOM_FILTER_H + +#include "persistent-data/transaction_manager.h" +#include "persistent-data/data-structures/bitset.h" + +#include + +//---------------------------------------------------------------- + +namespace persistent_data { + class bloom_filter { + public: + typedef boost::shared_ptr ptr; + typedef typename persistent_data::transaction_manager::ptr tm_ptr; + + // nr_bits must be a power of two + bloom_filter(tm_ptr tm, + unsigned nr_bits, unsigned nr_probes); + + bloom_filter(tm_ptr tm, block_address root, + unsigned nr_bits_power, unsigned nr_probes); + + block_address get_root() const; + + bool test(uint64_t b); // not const due to caching effects in bitset + void set(uint64_t b); + void flush(); + + void print_debug(ostream &out); + + private: + void print_residency(ostream &out); + + void fill_probes(block_address b, vector &probes) const; + + tm_ptr tm_; + unsigned nr_bits_; + persistent_data::bitset bits_; + unsigned nr_probes_; + uint64_t mask_; + }; +} + +//---------------------------------------------------------------- + +#endif diff --git a/persistent-data/data-structures/btree.h b/persistent-data/data-structures/btree.h index 26e687a..93c3284 100644 --- a/persistent-data/data-structures/btree.h +++ b/persistent-data/data-structures/btree.h @@ -19,7 +19,7 @@ #ifndef BTREE_H #define BTREE_H -#include "persistent-data/endian_utils.h" +#include "base/endian_utils.h" #include "persistent-data/transaction_manager.h" #include "persistent-data/data-structures/ref_counter.h" diff --git a/persistent-data/space-maps/disk.cc b/persistent-data/space-maps/disk.cc index df3ed43..0c851f6 100644 --- a/persistent-data/space-maps/disk.cc +++ b/persistent-data/space-maps/disk.cc @@ -16,6 +16,8 @@ // with thin-provisioning-tools. If not, see // . +#include "base/endian_utils.h" + #include "persistent-data/space-maps/disk.h" #include "persistent-data/space-maps/disk_structures.h" #include "persistent-data/space-maps/recursive.h" @@ -23,7 +25,6 @@ #include "persistent-data/data-structures/btree_damage_visitor.h" #include "persistent-data/checksum.h" -#include "persistent-data/endian_utils.h" #include "persistent-data/math_utils.h" #include "persistent-data/transaction_manager.h" diff --git a/persistent-data/space-maps/disk_structures.h b/persistent-data/space-maps/disk_structures.h index a92f490..1429d36 100644 --- a/persistent-data/space-maps/disk_structures.h +++ b/persistent-data/space-maps/disk_structures.h @@ -19,7 +19,7 @@ #ifndef SPACE_MAP_DISK_STRUCTURES_H #define SPACE_MAP_DISK_STRUCTURES_H -#include "persistent-data/endian_utils.h" +#include "base/endian_utils.h" // FIXME: what's this included for? #include "persistent-data/data-structures/btree.h" diff --git a/thin-provisioning/metadata.h b/thin-provisioning/metadata.h index 9749acb..c0913a0 100644 --- a/thin-provisioning/metadata.h +++ b/thin-provisioning/metadata.h @@ -19,9 +19,10 @@ #ifndef METADATA_LL_H #define METADATA_LL_H +#include "base/endian_utils.h" + #include "persistent-data/block.h" #include "persistent-data/data-structures/btree.h" -#include "persistent-data/endian_utils.h" #include "persistent-data/space-maps/disk.h" #include "persistent-data/transaction_manager.h" diff --git a/thin-provisioning/superblock.h b/thin-provisioning/superblock.h index d6d78e3..3a3d90a 100644 --- a/thin-provisioning/superblock.h +++ b/thin-provisioning/superblock.h @@ -1,8 +1,9 @@ #ifndef THIN_SUPERBLOCK_H #define THIN_SUPERBLOCK_H +#include "base/endian_utils.h" + #include "persistent-data/block.h" -#include "persistent-data/endian_utils.h" #include "persistent-data/data-structures/ref_counter.h" //---------------------------------------------------------------- diff --git a/unit-tests/Makefile.in b/unit-tests/Makefile.in index d2506ec..585ddff 100644 --- a/unit-tests/Makefile.in +++ b/unit-tests/Makefile.in @@ -50,6 +50,7 @@ TEST_SOURCE=\ unit-tests/base64_t.cc \ unit-tests/bitset_t.cc \ unit-tests/block_t.cc \ + unit-tests/bloom_filter_t.cc \ unit-tests/btree_t.cc \ unit-tests/btree_counter_t.cc \ unit-tests/btree_damage_visitor_t.cc \ diff --git a/unit-tests/bloom_filter_t.cc b/unit-tests/bloom_filter_t.cc new file mode 100644 index 0000000..bb879ed --- /dev/null +++ b/unit-tests/bloom_filter_t.cc @@ -0,0 +1,153 @@ +#include "gmock/gmock.h" +#include "persistent-data/data-structures/bloom_filter.h" +#include "persistent-data/transaction_manager.h" +#include "persistent-data/space-maps/core.h" +#include "persistent-data/data-structures/array_block.h" +#include "test_utils.h" + +#include +#include +#include +#include +#include +#include + +using namespace persistent_data; +using namespace std; +using namespace test; +using namespace testing; + +//---------------------------------------------------------------- + +namespace { + block_address const BLOCK_SIZE = 4096; + block_address const NR_BLOCKS = 102400; + block_address const SUPERBLOCK = 0; + + //-------------------------------- + + class BloomFilterTests : public Test { + public: + BloomFilterTests() + : bm_(create_bm(NR_BLOCKS)), + sm_(setup_core_map()), + tm_(new transaction_manager(bm_, sm_)) { + } + + set generate_random_blocks(unsigned count, + block_address max = std::numeric_limits::max()) { + set r; + + using namespace boost::random; + + mt19937 rng; + uniform_int_distribution uniform_dist(0, max); + + while (r.size() < count) { + block_address b = uniform_dist(rng); + r.insert(b); + } + + return r; + } + + void commit() { + block_manager<>::write_ref superblock(bm_->superblock(SUPERBLOCK)); + } + + space_map::ptr setup_core_map() { + space_map::ptr sm(new core_map(NR_BLOCKS)); + sm->inc(SUPERBLOCK); + return sm; + } + + with_temp_directory dir_; + block_manager<>::ptr bm_; + space_map::ptr sm_; + transaction_manager::ptr tm_; + }; +} + +//---------------------------------------------------------------- + +TEST_F(BloomFilterTests, nr_bits_must_be_a_power_of_two) +{ + ASSERT_THROW(bloom_filter f(tm_, 1023, 3), runtime_error); +} + +TEST_F(BloomFilterTests, can_create_a_bloom_filter) +{ + bloom_filter f(tm_, 1024, 3); +} + +TEST_F(BloomFilterTests, no_false_negatives) +{ + bloom_filter f(tm_, 4096, 6); + set bs = generate_random_blocks(1000); + + set::const_iterator it; + for (it = bs.begin(); it != bs.end(); ++it) + f.set(*it); + + for (it = bs.begin(); it != bs.end(); ++it) + ASSERT_THAT(f.test(*it), Eq(true)); +} + +TEST_F(BloomFilterTests, reload_works) +{ + block_address root; + set bs = generate_random_blocks(1000); + + { + bloom_filter f(tm_, 4096, 6); + + set::const_iterator it; + for (it = bs.begin(); it != bs.end(); ++it) + f.set(*it); + + f.flush(); + root = f.get_root(); + commit(); + } + + { + bloom_filter f(tm_, root, 4096, 6); + + set::const_iterator it; + for (it = bs.begin(); it != bs.end(); ++it) + ASSERT_THAT(f.test(*it), Eq(true)); + } +} + +TEST_F(BloomFilterTests, count_false_positives) +{ + block_address nr_blocks = 1024 * 1024; + block_address written_blocks = nr_blocks / 100; + + unsigned shift = 1; + + while ((1ull << shift) < (16 * written_blocks)) + shift++; + cerr << "bitset size: " << ((1 << shift) / (8 * 1024)) << "k" << endl; + + bloom_filter f(tm_, 1 << shift, 6); + + set bs = generate_random_blocks(written_blocks, nr_blocks); + set::const_iterator it; + + for (it = bs.begin(); it != bs.end(); ++it) + f.set(*it); + + // f.print_debug(cerr); + + unsigned count = 0; + for (unsigned i = 0; i < nr_blocks; i++) + if (!bs.count(i) && f.test(i)) + count++; + + cerr << count << " false positives out of " << nr_blocks << ", " + << static_cast(count * 100) / static_cast(nr_blocks) + << "%" << endl; +} + +//----------------------------------------------------------------