From dbd0c650882bb80554604da3554d632b786aff13 Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Wed, 19 Aug 2015 09:41:14 +0100 Subject: [PATCH 01/27] [thin_show_duplicates] stub new command --- Makefile.in | 2 + bin/thin_show_duplicates | 1 + main.cc | 1 + thin-provisioning/commands.h | 1 + thin-provisioning/thin_show_duplicates.cc | 111 ++++++++++++++++++++++ 5 files changed, 116 insertions(+) create mode 120000 bin/thin_show_duplicates create mode 100644 thin-provisioning/thin_show_duplicates.cc diff --git a/Makefile.in b/Makefile.in index e67b300..08f895a 100644 --- a/Makefile.in +++ b/Makefile.in @@ -88,6 +88,7 @@ SOURCE=\ thin-provisioning/thin_repair.cc \ thin-provisioning/thin_restore.cc \ thin-provisioning/thin_rmap.cc \ + thin-provisioning/thin_show_duplicates.cc \ thin-provisioning/thin_trim.cc \ thin-provisioning/xml_format.cc @@ -176,6 +177,7 @@ install: bin/pdata_tools ln -s -f pdata_tools $(BINDIR)/thin_repair ln -s -f pdata_tools $(BINDIR)/thin_restore ln -s -f pdata_tools $(BINDIR)/thin_rmap + ln -s -f pdata_tools $(BINDIR)/thin_show_duplicates ln -s -f pdata_tools $(BINDIR)/thin_trim ln -s -f pdata_tools $(BINDIR)/thin_metadata_size ln -s -f pdata_tools $(BINDIR)/era_check diff --git a/bin/thin_show_duplicates b/bin/thin_show_duplicates new file mode 120000 index 0000000..84c01e7 --- /dev/null +++ b/bin/thin_show_duplicates @@ -0,0 +1 @@ +pdata_tools \ No newline at end of file diff --git a/main.cc b/main.cc index ed69ba9..ff9017b 100644 --- a/main.cc +++ b/main.cc @@ -32,6 +32,7 @@ int main(int argc, char **argv) app.add_cmd(thin_provisioning::thin_restore_cmd); app.add_cmd(thin_provisioning::thin_repair_cmd); app.add_cmd(thin_provisioning::thin_rmap_cmd); + app.add_cmd(thin_provisioning::thin_show_dups_cmd); // FIXME: convert thin_metadata_size to c++ //app.add_cmd(thin_provisioning::thin_metadata_size_cmd); diff --git a/thin-provisioning/commands.h b/thin-provisioning/commands.h index de63e53..65714ec 100644 --- a/thin-provisioning/commands.h +++ b/thin-provisioning/commands.h @@ -15,6 +15,7 @@ namespace thin_provisioning { extern base::command thin_rmap_cmd; extern base::command thin_trim_cmd; extern base::command thin_metadata_size_cmd; + extern base::command thin_show_dups_cmd; } //---------------------------------------------------------------- diff --git a/thin-provisioning/thin_show_duplicates.cc b/thin-provisioning/thin_show_duplicates.cc new file mode 100644 index 0000000..06066b1 --- /dev/null +++ b/thin-provisioning/thin_show_duplicates.cc @@ -0,0 +1,111 @@ +// Copyright (C) 2015 Red Hat, Inc. All rights reserved. +// +// This file is part of the thin-provisioning-tools source. +// +// thin-provisioning-tools is free software: you can redistribute it +// and/or modify it under the terms of the GNU General Public License +// as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// thin-provisioning-tools is distributed in the hope that it will be +// useful, but WITHOUT ANY WARRANTY; without even the implied warranty +// of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License along +// with thin-provisioning-tools. If not, see +// . + +#include +#include +#include + +#include "version.h" + +#include "base/application.h" +#include "base/error_state.h" +#include "persistent-data/file_utils.h" +#include "persistent-data/space-maps/core.h" +#include "persistent-data/space-maps/disk.h" +#include "thin-provisioning/commands.h" +#include "thin-provisioning/device_tree.h" +#include "thin-provisioning/mapping_tree.h" +#include "thin-provisioning/superblock.h" + +using namespace base; +using namespace std; +using namespace thin_provisioning; + +//---------------------------------------------------------------- + +namespace { + block_manager<>::ptr + open_bm(string const &path) { + block_address nr_blocks = get_nr_blocks(path); + block_manager<>::mode m = block_manager<>::READ_ONLY; + return block_manager<>::ptr(new block_manager<>(path, nr_blocks, 1, m)); + } + + transaction_manager::ptr + open_tm(block_manager<>::ptr bm) { + space_map::ptr sm(new core_map(bm->get_nr_blocks())); + sm->inc(superblock_detail::SUPERBLOCK_LOCATION); + transaction_manager::ptr tm(new transaction_manager(bm, sm)); + return tm; + } + + //-------------------------------- + + struct flags { + flags() { + } + }; + + void usage(ostream &out, string const &cmd) { + out << "Usage: " << cmd << " [options] {device|file}" << endl + << "Options:" << endl + << " {-h|--help}" << endl + << " {-V|--version}" << endl; + } +} + +int thin_show_dups_main(int argc, char **argv) +{ + int c; + flags fs; + + char const shortopts[] = "qhV"; + option const longopts[] = { + { "help", no_argument, NULL, 'h'}, + { "version", no_argument, NULL, 'V'}, + { NULL, no_argument, NULL, 0 } + }; + + while ((c = getopt_long(argc, argv, shortopts, longopts, NULL)) != -1) { + switch(c) { + case 'h': + usage(cout, basename(argv[0])); + return 0; + + case 'V': + cout << THIN_PROVISIONING_TOOLS_VERSION << endl; + return 0; + + default: + usage(cerr, basename(argv[0])); + return 1; + } + } + + if (argc == optind) { + cerr << "No input file provided." << endl; + usage(cerr, basename(argv[0])); + exit(1); + } + + return 0; +} + +base::command thin_provisioning::thin_show_dups_cmd("thin_show_duplicates", thin_show_dups_main); + +//---------------------------------------------------------------- From 59a622670c775bba1348936ee3eb5bef1814575e Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Wed, 19 Aug 2015 12:32:57 +0100 Subject: [PATCH 02/27] [thin_show_duplicates] wip --- persistent-data/block.tcc | 1 + persistent-data/file_utils.cc | 4 +- persistent-data/file_utils.h | 2 +- thin-provisioning/thin_show_duplicates.cc | 125 +++++++++++++++++++++- 4 files changed, 124 insertions(+), 8 deletions(-) diff --git a/persistent-data/block.tcc b/persistent-data/block.tcc index 6e47a91..0824bf3 100644 --- a/persistent-data/block.tcc +++ b/persistent-data/block.tcc @@ -223,6 +223,7 @@ namespace persistent_data { unsigned max_concurrent_blocks, mode m, bool excl) + // FIXME: * BlockSize ? : fd_(open_or_create_block_file(path, nr_blocks * BlockSize, m, excl)), bc_(fd_, BlockSize >> SECTOR_SHIFT, nr_blocks, 1024u * 1024u * 16), superblock_ref_count_(0) diff --git a/persistent-data/file_utils.cc b/persistent-data/file_utils.cc index 2467079..b00acf0 100644 --- a/persistent-data/file_utils.cc +++ b/persistent-data/file_utils.cc @@ -12,7 +12,7 @@ using namespace base; //---------------------------------------------------------------- persistent_data::block_address -persistent_data::get_nr_blocks(string const &path) +persistent_data::get_nr_blocks(string const &path, sector_t block_size) { using namespace persistent_data; @@ -39,7 +39,7 @@ persistent_data::get_nr_blocks(string const &path) throw runtime_error("ioctl BLKGETSIZE64 failed"); } ::close(fd); - nr_blocks = div_down(nr_blocks, MD_BLOCK_SIZE); + nr_blocks = div_down(nr_blocks, block_size); } else // FIXME: needs a better message throw runtime_error("bad path"); diff --git a/persistent-data/file_utils.h b/persistent-data/file_utils.h index fcf203d..e641b7a 100644 --- a/persistent-data/file_utils.h +++ b/persistent-data/file_utils.h @@ -9,7 +9,7 @@ // FIXME: move to a different unit namespace persistent_data { - persistent_data::block_address get_nr_blocks(string const &path); + persistent_data::block_address get_nr_blocks(string const &path, sector_t block_size = MD_BLOCK_SIZE); block_manager<>::ptr open_bm(std::string const &dev_path, block_manager<>::mode m, bool excl = true); diff --git a/thin-provisioning/thin_show_duplicates.cc b/thin-provisioning/thin_show_duplicates.cc index 06066b1..7666c0e 100644 --- a/thin-provisioning/thin_show_duplicates.cc +++ b/thin-provisioning/thin_show_duplicates.cc @@ -32,6 +32,9 @@ #include "thin-provisioning/mapping_tree.h" #include "thin-provisioning/superblock.h" +#include +#include + using namespace base; using namespace std; using namespace thin_provisioning; @@ -57,14 +60,126 @@ namespace { //-------------------------------- struct flags { - flags() { + flags() + : block_size(512 * 1024), + cache_mem(64 * 1024 * 1024) { } + + unsigned block_size; + unsigned cache_mem; }; + int open_file(string const &path) { + int fd = ::open(path.c_str(), O_RDONLY | O_DIRECT | O_EXCL, 0666); + if (fd < 0) + syscall_failed("open", + "Note: you cannot run this tool with these options on live metadata."); + + return fd; + } + + class duplicate_counter { + public: + duplicate_counter(block_address nr_blocks) + : counts_(nr_blocks), + total_dups_(0) { + } + + void add_duplicate(block_address b1, block_address b2) { + // cout << "block " << b2 << " is a duplicate of " << b1 << "\n"; + total_dups_++; + counts_[b1]++; + } + + block_address get_total() const { + return total_dups_; + } + + private: + vector counts_; + block_address total_dups_; + }; + + class duplicate_detector { + public: + duplicate_detector(unsigned block_size, block_address nr_blocks) + : block_size_(block_size), + results_(nr_blocks) { + } + + void examine(block_cache::block const &b) { + digestor_.reset(); + digestor_.process_bytes(b.get_data(), block_size_); + unsigned int digest[5]; + digestor_.get_digest(digest); + + // hack + vector v(5); + for (unsigned i = 0; i < 5; i++) + v[i] = digest[i]; + + fingerprint_map::const_iterator it = fm_.find(v); + if (it != fm_.end()) { + results_.add_duplicate(it->second, b.get_index()); + } else + fm_.insert(make_pair(v, b.get_index())); + } + + block_address get_total_duplicates() const { + return results_.get_total(); + } + + private: + typedef map, block_address> fingerprint_map; + + unsigned block_size_; + boost::uuids::detail::sha1 digestor_; + fingerprint_map fm_; + duplicate_counter results_; + }; + + int show_dups(string const &path, flags const &fs) { + cerr << "path = " << path << "\n"; + block_address nr_blocks = get_nr_blocks(path, fs.block_size); + cerr << "nr_blocks = " << nr_blocks << "\n"; + + // The cache uses a LRU eviction policy, which plays badly + // with a sequential read. So we can't prefetch all the + // blocks. + + // FIXME: add MRU policy to cache + unsigned cache_blocks = (fs.cache_mem / fs.block_size) / 2; + int fd = open_file(path); + sector_t block_sectors = fs.block_size / 512; + block_cache cache(fd, block_sectors, nr_blocks, fs.cache_mem); + validator::ptr v(new bcache::noop_validator()); + + duplicate_detector detector(fs.block_size, nr_blocks); + + // warm up the cache + for (block_address i = 0; i < cache_blocks; i++) + cache.prefetch(i); + + for (block_address i = 0; i < nr_blocks; i++) { + block_cache::block &b = cache.get(i, 0, v); + block_address prefetch = i + cache_blocks; + if (prefetch < nr_blocks) + cache.prefetch(prefetch); + + detector.examine(b); + b.put(); + } + + cout << "total dups: " << detector.get_total_duplicates() << endl; + + return 0; + } + void usage(ostream &out, string const &cmd) { - out << "Usage: " << cmd << " [options] {device|file}" << endl - << "Options:" << endl - << " {-h|--help}" << endl + out << "Usage: " << cmd << " [options] {device|file}\n" + << "Options:\n" + << " {--block-sectors} \n" + << " {-h|--help}\n" << " {-V|--version}" << endl; } } @@ -103,7 +218,7 @@ int thin_show_dups_main(int argc, char **argv) exit(1); } - return 0; + return show_dups(argv[optind], fs); } base::command thin_provisioning::thin_show_dups_cmd("thin_show_duplicates", thin_show_dups_main); From 25f4f23e4202dd31866d4832a1e5b3424daccfb2 Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Wed, 19 Aug 2015 12:44:07 +0100 Subject: [PATCH 03/27] [file_utils] fix bug in get_nr_blocks. Introduced in previous patch --- persistent-data/file_utils.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/persistent-data/file_utils.cc b/persistent-data/file_utils.cc index b00acf0..88ee945 100644 --- a/persistent-data/file_utils.cc +++ b/persistent-data/file_utils.cc @@ -24,7 +24,7 @@ persistent_data::get_nr_blocks(string const &path, sector_t block_size) throw runtime_error("Couldn't stat dev path"); if (S_ISREG(info.st_mode) && info.st_size) - nr_blocks = div_up(info.st_size, MD_BLOCK_SIZE); + nr_blocks = div_up(info.st_size, block_size); else if (S_ISBLK(info.st_mode)) { // To get the size of a block device we need to From 519cbfd855178fa8fcb52894e53a129ad63357ca Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Wed, 19 Aug 2015 12:53:11 +0100 Subject: [PATCH 04/27] [thin_show_duplicates] add a progress bar --- thin-provisioning/thin_show_duplicates.cc | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/thin-provisioning/thin_show_duplicates.cc b/thin-provisioning/thin_show_duplicates.cc index 7666c0e..f9aef87 100644 --- a/thin-provisioning/thin_show_duplicates.cc +++ b/thin-provisioning/thin_show_duplicates.cc @@ -24,6 +24,7 @@ #include "base/application.h" #include "base/error_state.h" +#include "base/progress_monitor.h" #include "persistent-data/file_utils.h" #include "persistent-data/space-maps/core.h" #include "persistent-data/space-maps/disk.h" @@ -140,6 +141,7 @@ namespace { int show_dups(string const &path, flags const &fs) { cerr << "path = " << path << "\n"; + cerr << "block size = " << fs.block_size << "\n"; block_address nr_blocks = get_nr_blocks(path, fs.block_size); cerr << "nr_blocks = " << nr_blocks << "\n"; @@ -160,6 +162,8 @@ namespace { for (block_address i = 0; i < cache_blocks; i++) cache.prefetch(i); + auto_ptr pbar = create_progress_bar("Examining data"); + for (block_address i = 0; i < nr_blocks; i++) { block_cache::block &b = cache.get(i, 0, v); block_address prefetch = i + cache_blocks; @@ -168,9 +172,11 @@ namespace { detector.examine(b); b.put(); + + pbar->update_percent(i * 100 / nr_blocks); } - cout << "total dups: " << detector.get_total_duplicates() << endl; + cout << "\n\ntotal dups: " << detector.get_total_duplicates() << endl; return 0; } From 929a824184134fb63d8c7eefa05be5bbeb336f7c Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Wed, 19 Aug 2015 13:07:56 +0100 Subject: [PATCH 05/27] [thin_show_duplicates] add --block-sectors switch --- thin-provisioning/thin_show_duplicates.cc | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/thin-provisioning/thin_show_duplicates.cc b/thin-provisioning/thin_show_duplicates.cc index f9aef87..04f8150 100644 --- a/thin-provisioning/thin_show_duplicates.cc +++ b/thin-provisioning/thin_show_duplicates.cc @@ -34,6 +34,7 @@ #include "thin-provisioning/superblock.h" #include +#include #include using namespace base; @@ -58,6 +59,19 @@ namespace { return tm; } + uint64_t parse_int(string const &str, string const &desc) { + try { + return boost::lexical_cast(str); + + } catch (...) { + ostringstream out; + out << "Couldn't parse " << desc << ": '" << str << "'"; + exit(1); + } + + return 0; // never get here + } + //-------------------------------- struct flags { @@ -197,6 +211,7 @@ int thin_show_dups_main(int argc, char **argv) char const shortopts[] = "qhV"; option const longopts[] = { + { "block-sectors", required_argument, NULL, 1}, { "help", no_argument, NULL, 'h'}, { "version", no_argument, NULL, 'V'}, { NULL, no_argument, NULL, 0 } @@ -212,6 +227,10 @@ int thin_show_dups_main(int argc, char **argv) cout << THIN_PROVISIONING_TOOLS_VERSION << endl; return 0; + case 1: + fs.block_size = 512 * parse_int(optarg, "block sectors"); + break; + default: usage(cerr, basename(argv[0])); return 1; From 94636b63d7c2d2d7ae708b64bab84aa7a025432f Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Wed, 19 Aug 2015 13:46:02 +0100 Subject: [PATCH 06/27] [thin_show_duplicates] print out the percentage of duplicates --- thin-provisioning/thin_show_duplicates.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/thin-provisioning/thin_show_duplicates.cc b/thin-provisioning/thin_show_duplicates.cc index 04f8150..12d3490 100644 --- a/thin-provisioning/thin_show_duplicates.cc +++ b/thin-provisioning/thin_show_duplicates.cc @@ -191,6 +191,7 @@ namespace { } cout << "\n\ntotal dups: " << detector.get_total_duplicates() << endl; + cout << (detector.get_total_duplicates() * 100) / nr_blocks << "% duplicates\n"; return 0; } From 5f11f5af9993cfa9f8e1bb80dd1f6b4cdef5801d Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Thu, 20 Aug 2015 11:12:53 +0100 Subject: [PATCH 07/27] [progress_bar] Tidy up the appearance when at 100% --- base/progress_monitor.cc | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/base/progress_monitor.cc b/base/progress_monitor.cc index 1d88302..a33e08c 100644 --- a/base/progress_monitor.cc +++ b/base/progress_monitor.cc @@ -31,17 +31,22 @@ namespace { if (nr_equals < progress_width_) cout << '>'; + else + cout << "="; for (unsigned i = 0; i < nr_spaces; i++) cout << ' '; - cout << "] " << spinner_char() << " " << p << "%\r" << flush; + cout << "] " << spinner_char(p) << " " << p << "%\r" << flush; spinner_++; } private: - char spinner_char() const { + char spinner_char(unsigned p) const { + if (p == 100) + return ' '; + char cs[] = {'|', '/', '-', '\\'}; unsigned index = spinner_ % sizeof(cs); From d954f230fa1fb77c5934734f83eaa44140f42e5e Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Fri, 21 Aug 2015 13:10:49 +0100 Subject: [PATCH 08/27] [thin_show_duplicates] wip --- thin-provisioning/thin_show_duplicates.cc | 181 ++++++++++++++++++++-- 1 file changed, 168 insertions(+), 13 deletions(-) diff --git a/thin-provisioning/thin_show_duplicates.cc b/thin-provisioning/thin_show_duplicates.cc index 12d3490..d743685 100644 --- a/thin-provisioning/thin_show_duplicates.cc +++ b/thin-provisioning/thin_show_duplicates.cc @@ -25,6 +25,7 @@ #include "base/application.h" #include "base/error_state.h" #include "base/progress_monitor.h" +#include "persistent-data/data-structures/btree_damage_visitor.h" #include "persistent-data/file_utils.h" #include "persistent-data/space-maps/core.h" #include "persistent-data/space-maps/disk.h" @@ -32,18 +33,27 @@ #include "thin-provisioning/device_tree.h" #include "thin-provisioning/mapping_tree.h" #include "thin-provisioning/superblock.h" +#include "thin-provisioning/rmap_visitor.h" #include #include +#include #include using namespace base; +using namespace boost; +using namespace persistent_data; using namespace std; using namespace thin_provisioning; //---------------------------------------------------------------- namespace { + bool factor_of(block_address f, block_address n) { + cerr << n << " % " << f << "\n"; + return (n % f) == 0; + } + block_manager<>::ptr open_bm(string const &path) { block_address nr_blocks = get_nr_blocks(path); @@ -74,13 +84,21 @@ namespace { //-------------------------------- + struct data_block { + block_address begin, end; + void *data; + }; + + //-------------------------------- + struct flags { flags() - : block_size(512 * 1024), - cache_mem(64 * 1024 * 1024) { + : cache_mem(64 * 1024 * 1024) { } - unsigned block_size; + string data_dev; + optional metadata_dev; + optional block_size; unsigned cache_mem; }; @@ -93,6 +111,38 @@ namespace { return fd; } + + // FIXME: introduce abstraction for a stream of segments + + using namespace mapping_tree_detail; + + typedef rmap_visitor::region region; + typedef rmap_visitor::rmap_region rmap_region; + + class damage_visitor { + public: + virtual void visit(btree_path const &path, btree_detail::damage const &d) { + throw std::runtime_error("damage in mapping tree, please run thin_check"); + } + }; + + // FIXME: too big to return by value + vector read_rmap(transaction_manager::ptr tm, superblock_detail::superblock const &sb, + block_address nr_blocks) { + damage_visitor dv; + rmap_visitor rv; + + mapping_tree mtree(*tm, sb.data_mapping_root_, + mapping_tree_detail::block_traits::ref_counter(tm->get_sm())); + + rv.add_data_region(rmap_visitor::region(0, nr_blocks)); + + btree_visit_values(mtree, rv, dv); + rv.complete(); + cerr << "rmap size: " << rv.get_rmap().size() << "\n"; + return rv.get_rmap(); + } + class duplicate_counter { public: duplicate_counter(block_address nr_blocks) @@ -101,7 +151,6 @@ namespace { } void add_duplicate(block_address b1, block_address b2) { - // cout << "block " << b2 << " is a duplicate of " << b1 << "\n"; total_dups_++; counts_[b1]++; } @@ -153,10 +202,98 @@ namespace { duplicate_counter results_; }; - int show_dups(string const &path, flags const &fs) { - cerr << "path = " << path << "\n"; + int show_dups_pool(flags const &fs) { + block_manager<>::ptr bm = open_bm(*fs.metadata_dev); + transaction_manager::ptr tm = open_tm(bm); + superblock_detail::superblock sb = read_superblock(bm); + + block_address block_size = sb.data_block_size_ * 512; +#if 0 + if (fs.block_size) { + if (!factor_of(*fs.block_size, sb.data_block_size_ * 512)) + throw runtime_error("specified block size must be a factor of the pool block size."); + + block_size = *fs.block_size; + } +#endif + + cerr << "path = " << fs.data_dev << "\n"; + cerr << "block size = " << block_size << "\n"; + block_address nr_blocks = get_nr_blocks(fs.data_dev, block_size); + cerr << "nr_blocks = " << nr_blocks << "\n"; + + cerr << "reading rmap..."; + vector rmap = read_rmap(tm, sb, nr_blocks); + cerr << "done\n"; + + uint32_t const UNMAPPED = -1; + vector block_to_thin(nr_blocks, UNMAPPED); + vector::const_iterator it; + set thins; + block_address nr_mapped = 0; + for (it = rmap.begin(); it != rmap.end(); ++it) { + rmap_region const &r = *it; + for (block_address b = r.data_begin; b != r.data_end; b++) + if (block_to_thin[b] == UNMAPPED) { + nr_mapped++; + block_to_thin[b] = r.thin_dev; + } + thins.insert(r.thin_dev); + } + cerr << nr_mapped << " mapped blocks\n"; + + cerr << "there are " << thins.size() << " thin devices\n"; + + // The cache uses a LRU eviction policy, which plays badly + // with a sequential read. So we can't prefetch all the + // blocks. + + // FIXME: add MRU policy to cache + unsigned cache_blocks = (fs.cache_mem / block_size) / 2; + int fd = open_file(fs.data_dev); + sector_t block_sectors = block_size / 512; + block_cache cache(fd, block_sectors, nr_blocks, fs.cache_mem); + validator::ptr v(new bcache::noop_validator()); + + duplicate_detector detector(block_size, nr_blocks); + + // warm up the cache + for (block_address i = 0; i < cache_blocks; i++) + cache.prefetch(i); + + auto_ptr pbar = create_progress_bar("Examining data"); + + for (block_address i = 0; i < nr_blocks; i++) { + if (block_to_thin[i] == UNMAPPED) + continue; + + block_cache::block &b = cache.get(i, 0, v); + block_address prefetch = i + cache_blocks; + if (prefetch < nr_blocks) + cache.prefetch(prefetch); + + detector.examine(b); + b.put(); + + if (!(i & 127)) + pbar->update_percent(i * 100 / nr_blocks); + } + pbar->update_percent(100); + + cout << "\n\ntotal dups: " << detector.get_total_duplicates() << endl; + cout << (detector.get_total_duplicates() * 100) / nr_mapped << "% duplicates\n"; + + return 0; + } + + int show_dups_linear(flags const &fs) { + if (!fs.block_size) + // FIXME: this check should be moved to the switch parsing + throw runtime_error("--block-sectors or --metadata-dev must be supplied"); + + cerr << "path = " << fs.data_dev << "\n"; cerr << "block size = " << fs.block_size << "\n"; - block_address nr_blocks = get_nr_blocks(path, fs.block_size); + block_address nr_blocks = get_nr_blocks(fs.data_dev, *fs.block_size); cerr << "nr_blocks = " << nr_blocks << "\n"; // The cache uses a LRU eviction policy, which plays badly @@ -164,13 +301,13 @@ namespace { // blocks. // FIXME: add MRU policy to cache - unsigned cache_blocks = (fs.cache_mem / fs.block_size) / 2; - int fd = open_file(path); - sector_t block_sectors = fs.block_size / 512; + unsigned cache_blocks = (fs.cache_mem / *fs.block_size) / 2; + int fd = open_file(fs.data_dev); + sector_t block_sectors = *fs.block_size / 512; block_cache cache(fd, block_sectors, nr_blocks, fs.cache_mem); validator::ptr v(new bcache::noop_validator()); - duplicate_detector detector(fs.block_size, nr_blocks); + duplicate_detector detector(*fs.block_size, nr_blocks); // warm up the cache for (block_address i = 0; i < cache_blocks; i++) @@ -189,6 +326,7 @@ namespace { pbar->update_percent(i * 100 / nr_blocks); } + pbar->update_percent(100); cout << "\n\ntotal dups: " << detector.get_total_duplicates() << endl; cout << (detector.get_total_duplicates() * 100) / nr_blocks << "% duplicates\n"; @@ -196,10 +334,20 @@ namespace { return 0; } + int show_dups(flags const &fs) { + if (fs.metadata_dev) + return show_dups_pool(fs); + else { + cerr << "No metadata device provided, so treating data device as a linear device\n"; + return show_dups_linear(fs); + } + } + void usage(ostream &out, string const &cmd) { out << "Usage: " << cmd << " [options] {device|file}\n" << "Options:\n" << " {--block-sectors} \n" + << " {--metadata-dev} \n" << " {-h|--help}\n" << " {-V|--version}" << endl; } @@ -213,6 +361,7 @@ int thin_show_dups_main(int argc, char **argv) char const shortopts[] = "qhV"; option const longopts[] = { { "block-sectors", required_argument, NULL, 1}, + { "metadata-dev", required_argument, NULL, 2}, { "help", no_argument, NULL, 'h'}, { "version", no_argument, NULL, 'V'}, { NULL, no_argument, NULL, 0 } @@ -232,6 +381,10 @@ int thin_show_dups_main(int argc, char **argv) fs.block_size = 512 * parse_int(optarg, "block sectors"); break; + case 2: + fs.metadata_dev = optarg; + break; + default: usage(cerr, basename(argv[0])); return 1; @@ -239,12 +392,14 @@ int thin_show_dups_main(int argc, char **argv) } if (argc == optind) { - cerr << "No input file provided." << endl; + cerr << "No data device/file provided." << endl; usage(cerr, basename(argv[0])); exit(1); } - return show_dups(argv[optind], fs); + fs.data_dev = argv[optind]; + + return show_dups(fs); } base::command thin_provisioning::thin_show_dups_cmd("thin_show_duplicates", thin_show_dups_main); From c8d3ce6af5f7ab60b322800f3effe590e15ee467 Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Mon, 24 Aug 2015 11:18:31 +0100 Subject: [PATCH 09/27] [thin_show_duplicates] start factoring out a chunk_stream abstraction --- block-cache/block_cache.h | 39 +++++++ thin-provisioning/thin_show_duplicates.cc | 118 +++++++++++++++++++--- 2 files changed, 144 insertions(+), 13 deletions(-) diff --git a/block-cache/block_cache.h b/block-cache/block_cache.h index 4bc6667..008af15 100644 --- a/block-cache/block_cache.h +++ b/block-cache/block_cache.h @@ -12,6 +12,7 @@ #include #include #include +#include //---------------------------------------------------------------- @@ -112,6 +113,44 @@ namespace bcache { validator::ptr v_; }; + class auto_block { + public: + auto_block() + : b_(0) { + } + + auto_block(block &b) + : b_(&b) { + } + + ~auto_block() { + put(); + } + + auto_block &operator =(block &b) { + put(); + b_ = &b; + return *this; + } + + void *get_data() const { + if (b_) + return b_->get_data(); + + throw std::runtime_error("auto_block not set"); + } + + private: + void put() { + if (b_) { + b_->put(); + b_ = 0; + } + } + + block *b_; + }; + //-------------------------------- block_cache(int fd, sector_t block_size, diff --git a/thin-provisioning/thin_show_duplicates.cc b/thin-provisioning/thin_show_duplicates.cc index d743685..9be5c07 100644 --- a/thin-provisioning/thin_show_duplicates.cc +++ b/thin-provisioning/thin_show_duplicates.cc @@ -38,6 +38,7 @@ #include #include #include +#include #include using namespace base; @@ -54,6 +55,15 @@ namespace { return (n % f) == 0; } + int open_file(string const &path) { + int fd = ::open(path.c_str(), O_RDONLY | O_DIRECT | O_EXCL, 0666); + if (fd < 0) + syscall_failed("open", + "Note: you cannot run this tool with these options on live metadata."); + + return fd; + } + block_manager<>::ptr open_bm(string const &path) { block_address nr_blocks = get_nr_blocks(path); @@ -84,9 +94,97 @@ namespace { //-------------------------------- - struct data_block { - block_address begin, end; - void *data; + // Once we start using variable sized blocks we will find we want + // to examine data that crosses cache block boundaries. So a block + // to be examined can be composed of multiple chunks of memory. + + struct mem { + mem(void *b, void *e) + : begin(b), + end(e) { + } + + void *begin, *end; + }; + + struct chunk { + sector_t offset_sectors_; + deque mem_; + }; + + class chunk_stream { + public: + virtual ~chunk_stream() {} + + virtual void rewind() = 0; + virtual bool advance() = 0; + virtual chunk const &get() const = 0; + }; + + class cache_stream : public chunk_stream { + public: + cache_stream(string const &path, + block_address block_size, + size_t cache_mem) + : block_size_(block_size), + nr_blocks_(get_nr_blocks(path, block_size)), + + // hack because cache uses LRU rather than MRU + cache_blocks_((cache_mem / block_size) / 2u), + fd_(open_file(path)), + v_(new bcache::noop_validator()), + cache_(new block_cache(fd_, block_size / 512, nr_blocks_, cache_mem)), + current_index_(0) { + load(0); + } + + virtual void rewind() { + load(0); + } + + virtual bool advance() { + if (current_index_ >= nr_blocks_) + return false; + + current_index_++; + + load(current_index_); + return true; + } + + virtual chunk const &get() const { + return current_chunk_; + } + + private: + void load(block_address b) { + current_index_ = b; + current_block_ = cache_->get(current_index_, 0, v_); + + current_chunk_.offset_sectors_ = (b * block_size_) / 512; + current_chunk_.mem_.clear(); + current_chunk_.mem_.push_back(mem(current_block_.get_data(), + current_block_.get_data() + block_size_)); + } + + block_address block_size_; + block_address nr_blocks_; + block_address cache_blocks_; + int fd_; + validator::ptr v_; + auto_ptr cache_; + + block_address current_index_; + block_cache::auto_block current_block_; + chunk current_chunk_; + }; + + class fixed_block_stream : public chunk_stream { + public: + }; + + class variable_size_stream : public chunk_stream { + }; //-------------------------------- @@ -102,16 +200,6 @@ namespace { unsigned cache_mem; }; - int open_file(string const &path) { - int fd = ::open(path.c_str(), O_RDONLY | O_DIRECT | O_EXCL, 0666); - if (fd < 0) - syscall_failed("open", - "Note: you cannot run this tool with these options on live metadata."); - - return fd; - } - - // FIXME: introduce abstraction for a stream of segments using namespace mapping_tree_detail; @@ -217,6 +305,10 @@ namespace { } #endif + { + cache_stream(fs.data_dev, block_size, fs.cache_mem); + } + cerr << "path = " << fs.data_dev << "\n"; cerr << "block size = " << block_size << "\n"; block_address nr_blocks = get_nr_blocks(fs.data_dev, block_size); From ac4104d063e518d183ec032cd1b9ddac4549f28f Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Mon, 24 Aug 2015 11:24:55 +0100 Subject: [PATCH 10/27] add prefetching --- thin-provisioning/thin_show_duplicates.cc | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/thin-provisioning/thin_show_duplicates.cc b/thin-provisioning/thin_show_duplicates.cc index 9be5c07..3142e62 100644 --- a/thin-provisioning/thin_show_duplicates.cc +++ b/thin-provisioning/thin_show_duplicates.cc @@ -136,6 +136,8 @@ namespace { cache_(new block_cache(fd_, block_size / 512, nr_blocks_, cache_mem)), current_index_(0) { load(0); + for (block_address i = 1; i < min(cache_blocks_, nr_blocks_); i++) + cache_->prefetch(i); } virtual void rewind() { @@ -165,6 +167,9 @@ namespace { current_chunk_.mem_.clear(); current_chunk_.mem_.push_back(mem(current_block_.get_data(), current_block_.get_data() + block_size_)); + + if (current_index_ + cache_blocks_ < nr_blocks_) + cache_->prefetch(current_index_ + cache_blocks_); } block_address block_size_; From 46fe4525bb900e8b6cbbf10fa9f62f16bb4f1402 Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Mon, 24 Aug 2015 14:29:06 +0100 Subject: [PATCH 11/27] [thin_show_dups] factor out a pool stream --- thin-provisioning/thin_show_duplicates.cc | 259 +++++++++++++--------- 1 file changed, 155 insertions(+), 104 deletions(-) diff --git a/thin-provisioning/thin_show_duplicates.cc b/thin-provisioning/thin_show_duplicates.cc index 3142e62..96d1a23 100644 --- a/thin-provisioning/thin_show_duplicates.cc +++ b/thin-provisioning/thin_show_duplicates.cc @@ -99,15 +99,17 @@ namespace { // to be examined can be composed of multiple chunks of memory. struct mem { - mem(void *b, void *e) + mem(uint8_t *b, uint8_t *e) : begin(b), end(e) { } - void *begin, *end; + uint8_t *begin, *end; }; struct chunk { + // FIXME: switch to bytes rather than sectors + // FIXME: add length too sector_t offset_sectors_; deque mem_; }; @@ -116,8 +118,10 @@ namespace { public: virtual ~chunk_stream() {} + virtual block_address nr_chunks() const = 0; virtual void rewind() = 0; - virtual bool advance() = 0; + virtual bool advance(block_address count = 1ull) = 0; + virtual block_address index() const = 0; virtual chunk const &get() const = 0; }; @@ -140,20 +144,28 @@ namespace { cache_->prefetch(i); } + virtual block_address nr_chunks() const { + return nr_blocks_; + } + virtual void rewind() { load(0); } - virtual bool advance() { - if (current_index_ >= nr_blocks_) + virtual bool advance(block_address count = 1ull) { + if (current_index_ + count >= nr_blocks_) return false; - current_index_++; + current_index_ += count; load(current_index_); return true; } + virtual block_address index() const { + return current_index_; + } + virtual chunk const &get() const { return current_chunk_; } @@ -165,8 +177,8 @@ namespace { current_chunk_.offset_sectors_ = (b * block_size_) / 512; current_chunk_.mem_.clear(); - current_chunk_.mem_.push_back(mem(current_block_.get_data(), - current_block_.get_data() + block_size_)); + current_chunk_.mem_.push_back(mem(static_cast(current_block_.get_data()), + static_cast(current_block_.get_data()) + block_size_)); if (current_index_ + cache_blocks_ < nr_blocks_) cache_->prefetch(current_index_ + cache_blocks_); @@ -184,12 +196,112 @@ namespace { chunk current_chunk_; }; - class fixed_block_stream : public chunk_stream { + //-------------------------------- + + typedef rmap_visitor::region region; + typedef rmap_visitor::rmap_region rmap_region; + + uint32_t const UNMAPPED = -1; + + class pool_stream : public chunk_stream { public: - }; - class variable_size_stream : public chunk_stream { + pool_stream(cache_stream &stream, + transaction_manager::ptr tm, superblock_detail::superblock const &sb, + block_address nr_blocks) + : stream_(stream), + block_to_thin_(stream.nr_chunks(), UNMAPPED), + nr_mapped_(0) { + init_rmap(tm, sb, nr_blocks); + } + block_address nr_chunks() const { + return stream_.nr_chunks(); + } + + void rewind() { + stream_.rewind(); + } + + bool advance(block_address count = 1ull) { + while (count--) + if (!advance_one()) + return false; + + return true; + } + + block_address index() const { + return stream_.index(); + } + + chunk const &get() const { + return stream_.get(); + } + + private: + class damage_visitor { + public: + virtual void visit(btree_path const &path, btree_detail::damage const &d) { + throw std::runtime_error("damage in mapping tree, please run thin_check"); + } + }; + + // FIXME: too big to return by value + vector read_rmap(transaction_manager::ptr tm, superblock_detail::superblock const &sb, + block_address nr_blocks) { + damage_visitor dv; + rmap_visitor rv; + + mapping_tree mtree(*tm, sb.data_mapping_root_, + mapping_tree_detail::block_traits::ref_counter(tm->get_sm())); + + rv.add_data_region(rmap_visitor::region(0, nr_blocks)); + + btree_visit_values(mtree, rv, dv); + rv.complete(); + cerr << "rmap size: " << rv.get_rmap().size() << "\n"; + return rv.get_rmap(); + } + + void init_rmap(transaction_manager::ptr tm, superblock_detail::superblock const &sb, + block_address nr_blocks) { + cerr << "reading rmap..."; + vector rmap = read_rmap(tm, sb, nr_blocks); + cerr << "done\n"; + + vector::const_iterator it; + set thins; + for (it = rmap.begin(); it != rmap.end(); ++it) { + rmap_region const &r = *it; + for (block_address b = r.data_begin; b != r.data_end; b++) + if (block_to_thin_[b] == UNMAPPED) { + nr_mapped_++; + block_to_thin_[b] = r.thin_dev; + } + thins.insert(r.thin_dev); + } + + cerr << nr_mapped_ << " mapped blocks\n"; + cerr << "there are " << thins.size() << " thin devices\n"; + } + + bool advance_one() { + block_address new_index = index() + 1; + + while (block_to_thin_[new_index] == UNMAPPED && + new_index < nr_chunks()) + new_index++; + + if (new_index >= nr_chunks()) + return false; + + return stream_.advance(new_index - index()); + } + + cache_stream &stream_; + vector block_to_thin_; + block_address nr_mapped_; }; //-------------------------------- @@ -209,33 +321,6 @@ namespace { using namespace mapping_tree_detail; - typedef rmap_visitor::region region; - typedef rmap_visitor::rmap_region rmap_region; - - class damage_visitor { - public: - virtual void visit(btree_path const &path, btree_detail::damage const &d) { - throw std::runtime_error("damage in mapping tree, please run thin_check"); - } - }; - - // FIXME: too big to return by value - vector read_rmap(transaction_manager::ptr tm, superblock_detail::superblock const &sb, - block_address nr_blocks) { - damage_visitor dv; - rmap_visitor rv; - - mapping_tree mtree(*tm, sb.data_mapping_root_, - mapping_tree_detail::block_traits::ref_counter(tm->get_sm())); - - rv.add_data_region(rmap_visitor::region(0, nr_blocks)); - - btree_visit_values(mtree, rv, dv); - rv.complete(); - cerr << "rmap size: " << rv.get_rmap().size() << "\n"; - return rv.get_rmap(); - } - class duplicate_counter { public: duplicate_counter(block_address nr_blocks) @@ -264,6 +349,7 @@ namespace { results_(nr_blocks) { } + // FIXME: remove void examine(block_cache::block const &b) { digestor_.reset(); digestor_.process_bytes(b.get_data(), block_size_); @@ -282,6 +368,28 @@ namespace { fm_.insert(make_pair(v, b.get_index())); } + void examine(chunk const &c) { + digestor_.reset(); + + for (deque::const_iterator it = c.mem_.begin(); it != c.mem_.end(); it++) + digestor_.process_bytes(it->begin, it->end - it->begin); + + unsigned int digest[5]; + digestor_.get_digest(digest); + + // hack + vector v(5); + for (unsigned i = 0; i < 5; i++) + v[i] = digest[i]; + + fingerprint_map::const_iterator it = fm_.find(v); + block_address index = (c.offset_sectors_ * 512) / block_size_; + if (it != fm_.end()) { + results_.add_duplicate(it->second, index); + } else + fm_.insert(make_pair(v, index)); + } + block_address get_total_duplicates() const { return results_.get_total(); } @@ -299,86 +407,29 @@ namespace { block_manager<>::ptr bm = open_bm(*fs.metadata_dev); transaction_manager::ptr tm = open_tm(bm); superblock_detail::superblock sb = read_superblock(bm); - block_address block_size = sb.data_block_size_ * 512; -#if 0 - if (fs.block_size) { - if (!factor_of(*fs.block_size, sb.data_block_size_ * 512)) - throw runtime_error("specified block size must be a factor of the pool block size."); - - block_size = *fs.block_size; - } -#endif - - { - cache_stream(fs.data_dev, block_size, fs.cache_mem); - } + block_address nr_blocks = get_nr_blocks(fs.data_dev, block_size); cerr << "path = " << fs.data_dev << "\n"; cerr << "block size = " << block_size << "\n"; - block_address nr_blocks = get_nr_blocks(fs.data_dev, block_size); cerr << "nr_blocks = " << nr_blocks << "\n"; - cerr << "reading rmap..."; - vector rmap = read_rmap(tm, sb, nr_blocks); - cerr << "done\n"; - - uint32_t const UNMAPPED = -1; - vector block_to_thin(nr_blocks, UNMAPPED); - vector::const_iterator it; - set thins; - block_address nr_mapped = 0; - for (it = rmap.begin(); it != rmap.end(); ++it) { - rmap_region const &r = *it; - for (block_address b = r.data_begin; b != r.data_end; b++) - if (block_to_thin[b] == UNMAPPED) { - nr_mapped++; - block_to_thin[b] = r.thin_dev; - } - thins.insert(r.thin_dev); - } - cerr << nr_mapped << " mapped blocks\n"; - - cerr << "there are " << thins.size() << " thin devices\n"; - - // The cache uses a LRU eviction policy, which plays badly - // with a sequential read. So we can't prefetch all the - // blocks. - - // FIXME: add MRU policy to cache - unsigned cache_blocks = (fs.cache_mem / block_size) / 2; - int fd = open_file(fs.data_dev); - sector_t block_sectors = block_size / 512; - block_cache cache(fd, block_sectors, nr_blocks, fs.cache_mem); - validator::ptr v(new bcache::noop_validator()); + cache_stream stream(fs.data_dev, block_size, fs.cache_mem); + pool_stream pstream(stream, tm, sb, nr_blocks); duplicate_detector detector(block_size, nr_blocks); - - // warm up the cache - for (block_address i = 0; i < cache_blocks; i++) - cache.prefetch(i); - auto_ptr pbar = create_progress_bar("Examining data"); - for (block_address i = 0; i < nr_blocks; i++) { - if (block_to_thin[i] == UNMAPPED) - continue; + do { + chunk const &c = pstream.get(); + detector.examine(c); + pbar->update_percent((pstream.index() * 100) / pstream.nr_chunks()); - block_cache::block &b = cache.get(i, 0, v); - block_address prefetch = i + cache_blocks; - if (prefetch < nr_blocks) - cache.prefetch(prefetch); - - detector.examine(b); - b.put(); - - if (!(i & 127)) - pbar->update_percent(i * 100 / nr_blocks); - } + } while (pstream.advance()); pbar->update_percent(100); cout << "\n\ntotal dups: " << detector.get_total_duplicates() << endl; - cout << (detector.get_total_duplicates() * 100) / nr_mapped << "% duplicates\n"; +// cout << (detector.get_total_duplicates() * 100) / nr_mapped_ << "% duplicates\n"; return 0; } From 3470ede50b256a3911fafc2b8df24aa9852c14e9 Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Mon, 24 Aug 2015 16:55:53 +0100 Subject: [PATCH 12/27] [thin_show_dups] pull the various streams out to their own files --- Makefile.in | 2 + thin-provisioning/cache_stream.cc | 91 +++++++++ thin-provisioning/cache_stream.h | 41 ++++ thin-provisioning/chunk_stream.h | 60 ++++++ thin-provisioning/pool_stream.cc | 146 ++++++++++++++ thin-provisioning/pool_stream.h | 60 ++++++ thin-provisioning/thin_show_duplicates.cc | 229 +--------------------- 7 files changed, 406 insertions(+), 223 deletions(-) create mode 100644 thin-provisioning/cache_stream.cc create mode 100644 thin-provisioning/cache_stream.h create mode 100644 thin-provisioning/chunk_stream.h create mode 100644 thin-provisioning/pool_stream.cc create mode 100644 thin-provisioning/pool_stream.h diff --git a/Makefile.in b/Makefile.in index 08f895a..0dd6c99 100644 --- a/Makefile.in +++ b/Makefile.in @@ -71,12 +71,14 @@ SOURCE=\ persistent-data/space_map.cc \ persistent-data/transaction_manager.cc \ persistent-data/validators.cc \ + thin-provisioning/cache_stream.cc \ thin-provisioning/device_tree.cc \ thin-provisioning/human_readable_format.cc \ thin-provisioning/mapping_tree.cc \ thin-provisioning/metadata.cc \ thin-provisioning/metadata_checker.cc \ thin-provisioning/metadata_dumper.cc \ + thin-provisioning/pool_stream.cc \ thin-provisioning/restore_emitter.cc \ thin-provisioning/rmap_visitor.cc \ thin-provisioning/superblock.cc \ diff --git a/thin-provisioning/cache_stream.cc b/thin-provisioning/cache_stream.cc new file mode 100644 index 0000000..956fc4e --- /dev/null +++ b/thin-provisioning/cache_stream.cc @@ -0,0 +1,91 @@ +#include "thin-provisioning/cache_stream.h" +#include "persistent-data/file_utils.h" + +using namespace thin_provisioning; +using namespace std; +using namespace persistent_data; + +//---------------------------------------------------------------- + +namespace { + int open_file(string const &path) { + int fd = ::open(path.c_str(), O_RDONLY | O_DIRECT | O_EXCL, 0666); + if (fd < 0) + syscall_failed("open", + "Note: you cannot run this tool with these options on live metadata."); + + return fd; + } +} + +//---------------------------------------------------------------- + +cache_stream::cache_stream(string const &path, + block_address block_size, + size_t cache_mem) + : block_size_(block_size), + nr_blocks_(get_nr_blocks(path, block_size)), + + // hack because cache uses LRU rather than MRU + cache_blocks_((cache_mem / block_size) / 2u), + fd_(open_file(path)), + v_(new bcache::noop_validator()), + cache_(new block_cache(fd_, block_size / 512, nr_blocks_, cache_mem)), + current_index_(0) { + load(0); + for (block_address i = 1; i < min(cache_blocks_, nr_blocks_); i++) + cache_->prefetch(i); +} + +block_address +cache_stream::nr_chunks() const +{ + return nr_blocks_; +} + +void +cache_stream::rewind() +{ + load(0); +} + +bool +cache_stream::advance(block_address count) +{ + if (current_index_ + count >= nr_blocks_) + return false; + + current_index_ += count; + + load(current_index_); + return true; +} + +block_address +cache_stream::index() const +{ + return current_index_; +} + +chunk const & +cache_stream::get() const +{ + return current_chunk_; +} + +void +cache_stream::load(block_address b) +{ + current_index_ = b; + current_block_ = cache_->get(current_index_, 0, v_); + + current_chunk_.offset_sectors_ = (b * block_size_) / 512; + current_chunk_.mem_.clear(); + current_chunk_.mem_.push_back(mem(static_cast(current_block_.get_data()), + static_cast(current_block_.get_data()) + block_size_)); + + if (current_index_ + cache_blocks_ < nr_blocks_) + cache_->prefetch(current_index_ + cache_blocks_); +} + +//---------------------------------------------------------------- diff --git a/thin-provisioning/cache_stream.h b/thin-provisioning/cache_stream.h new file mode 100644 index 0000000..cfe6ca8 --- /dev/null +++ b/thin-provisioning/cache_stream.h @@ -0,0 +1,41 @@ +#ifndef THIN_PROVISIONING_CACHE_STREAM_H +#define THIN_PROVISIONING_CACHE_STREAM_H + +#include "thin-provisioning/chunk_stream.h" + +//---------------------------------------------------------------- + +namespace thin_provisioning { + using namespace bcache; + + class cache_stream : public chunk_stream { + public: + cache_stream(std::string const &path, + block_address block_size, + size_t cache_mem); + + virtual block_address nr_chunks() const; + virtual void rewind(); + virtual bool advance(block_address count = 1ull); + virtual block_address index() const; + virtual chunk const &get() const; + + private: + void load(block_address b); + + block_address block_size_; + block_address nr_blocks_; + block_address cache_blocks_; + int fd_; + validator::ptr v_; + std::auto_ptr cache_; + + block_address current_index_; + block_cache::auto_block current_block_; + chunk current_chunk_; + }; +} + +//---------------------------------------------------------------- + +#endif diff --git a/thin-provisioning/chunk_stream.h b/thin-provisioning/chunk_stream.h new file mode 100644 index 0000000..d87ee9c --- /dev/null +++ b/thin-provisioning/chunk_stream.h @@ -0,0 +1,60 @@ +// Copyright (C) 2015 Red Hat, Inc. All rights reserved. +// +// This file is part of the thin-provisioning-tools source. +// +// thin-provisioning-tools is free software: you can redistribute it +// and/or modify it under the terms of the GNU General Public License +// as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// thin-provisioning-tools is distributed in the hope that it will be +// useful, but WITHOUT ANY WARRANTY; without even the implied warranty +// of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License along +// with thin-provisioning-tools. If not, see +// . + +#ifndef CHUNK_STREAM_H +#define CHUNK_STREAM_H + +#include "block-cache/block_cache.h" + +#include +#include + +//---------------------------------------------------------------- + +namespace thin_provisioning { + struct mem { + mem(uint8_t *b, uint8_t *e) + : begin(b), + end(e) { + } + + uint8_t *begin, *end; + }; + + struct chunk { + // FIXME: switch to bytes rather than sectors + // FIXME: add length too + uint64_t offset_sectors_; + std::deque mem_; + }; + + class chunk_stream { + public: + virtual ~chunk_stream() {} + + virtual bcache::block_address nr_chunks() const = 0; + virtual void rewind() = 0; + virtual bool advance(bcache::block_address count = 1ull) = 0; + virtual bcache::block_address index() const = 0; + virtual chunk const &get() const = 0; + }; +} + +//---------------------------------------------------------------- + +#endif diff --git a/thin-provisioning/pool_stream.cc b/thin-provisioning/pool_stream.cc new file mode 100644 index 0000000..63b64ac --- /dev/null +++ b/thin-provisioning/pool_stream.cc @@ -0,0 +1,146 @@ +// Copyright (C) 2015 Red Hat, Inc. All rights reserved. +// +// This file is part of the thin-provisioning-tools source. +// +// thin-provisioning-tools is free software: you can redistribute it +// and/or modify it under the terms of the GNU General Public License +// as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// thin-provisioning-tools is distributed in the hope that it will be +// useful, but WITHOUT ANY WARRANTY; without even the implied warranty +// of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License along +// with thin-provisioning-tools. If not, see +// . + +#include "thin-provisioning/pool_stream.h" +#include "persistent-data/data-structures/btree_damage_visitor.h" + +using namespace thin_provisioning; +using namespace persistent_data; + +//---------------------------------------------------------------- + +namespace { + class damage_visitor { + public: + virtual void visit(btree_path const &path, btree_detail::damage const &d) { + throw std::runtime_error("damage in mapping tree, please run thin_check"); + } + }; + + uint32_t const UNMAPPED = -1; +} + +//---------------------------------------------------------------- + +pool_stream::pool_stream(cache_stream &stream, + transaction_manager::ptr tm, superblock_detail::superblock const &sb, + block_address nr_blocks) + : stream_(stream), + block_to_thin_(stream.nr_chunks(), UNMAPPED), + nr_mapped_(0) +{ + init_rmap(tm, sb, nr_blocks); +} + +block_address +pool_stream::nr_chunks() const +{ + return stream_.nr_chunks(); +} + +void +pool_stream::rewind() +{ + stream_.rewind(); +} + +bool +pool_stream::advance(block_address count) +{ + while (count--) + if (!advance_one()) + return false; + + return true; +} + +block_address +pool_stream::index() const +{ + return stream_.index(); +} + +chunk const & +pool_stream::get() const +{ + return stream_.get(); +} + + + +// FIXME: too big to return by value +vector +pool_stream::read_rmap(transaction_manager::ptr tm, + superblock_detail::superblock const &sb, + block_address nr_blocks) +{ + damage_visitor dv; + rmap_visitor rv; + + mapping_tree mtree(*tm, sb.data_mapping_root_, + mapping_tree_detail::block_traits::ref_counter(tm->get_sm())); + + rv.add_data_region(rmap_visitor::region(0, nr_blocks)); + + btree_visit_values(mtree, rv, dv); + rv.complete(); + cerr << "rmap size: " << rv.get_rmap().size() << "\n"; + return rv.get_rmap(); +} + +void +pool_stream::init_rmap(transaction_manager::ptr tm, + superblock_detail::superblock const &sb, + block_address nr_blocks) +{ + cerr << "reading rmap..."; + vector rmap = read_rmap(tm, sb, nr_blocks); + cerr << "done\n"; + + vector::const_iterator it; + set thins; + for (it = rmap.begin(); it != rmap.end(); ++it) { + rmap_region const &r = *it; + for (block_address b = r.data_begin; b != r.data_end; b++) + if (block_to_thin_[b] == UNMAPPED) { + nr_mapped_++; + block_to_thin_[b] = r.thin_dev; + } + thins.insert(r.thin_dev); + } + + cerr << nr_mapped_ << " mapped blocks\n"; + cerr << "there are " << thins.size() << " thin devices\n"; +} + +bool +pool_stream::advance_one() +{ + block_address new_index = index() + 1; + + while (block_to_thin_[new_index] == UNMAPPED && + new_index < nr_chunks()) + new_index++; + + if (new_index >= nr_chunks()) + return false; + + return stream_.advance(new_index - index()); +} + +//---------------------------------------------------------------- diff --git a/thin-provisioning/pool_stream.h b/thin-provisioning/pool_stream.h new file mode 100644 index 0000000..1fa5c51 --- /dev/null +++ b/thin-provisioning/pool_stream.h @@ -0,0 +1,60 @@ +// Copyright (C) 2015 Red Hat, Inc. All rights reserved. +// +// This file is part of the thin-provisioning-tools source. +// +// thin-provisioning-tools is free software: you can redistribute it +// and/or modify it under the terms of the GNU General Public License +// as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// thin-provisioning-tools is distributed in the hope that it will be +// useful, but WITHOUT ANY WARRANTY; without even the implied warranty +// of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License along +// with thin-provisioning-tools. If not, see +// . + +#ifndef POOL_STREAM_H +#define POOL_STREAM_H + +#include "thin-provisioning/cache_stream.h" +#include "thin-provisioning/rmap_visitor.h" +#include "thin-provisioning/superblock.h" + +//---------------------------------------------------------------- + +namespace thin_provisioning { + class pool_stream : public chunk_stream { + public: + pool_stream(cache_stream &stream, + transaction_manager::ptr tm, superblock_detail::superblock const &sb, + block_address nr_blocks); + + block_address nr_chunks() const; + void rewind(); + bool advance(block_address count = 1ull); + block_address index() const; + chunk const &get() const; + + private: + typedef rmap_visitor::region region; + typedef rmap_visitor::rmap_region rmap_region; + + // FIXME: too big to return by value + vector read_rmap(transaction_manager::ptr tm, superblock_detail::superblock const &sb, + block_address nr_blocks); + void init_rmap(transaction_manager::ptr tm, superblock_detail::superblock const &sb, + block_address nr_blocks); + bool advance_one(); + + cache_stream &stream_; + vector block_to_thin_; + block_address nr_mapped_; + }; +} + +//---------------------------------------------------------------- + +#endif diff --git a/thin-provisioning/thin_show_duplicates.cc b/thin-provisioning/thin_show_duplicates.cc index 96d1a23..261b971 100644 --- a/thin-provisioning/thin_show_duplicates.cc +++ b/thin-provisioning/thin_show_duplicates.cc @@ -29,11 +29,13 @@ #include "persistent-data/file_utils.h" #include "persistent-data/space-maps/core.h" #include "persistent-data/space-maps/disk.h" +#include "thin-provisioning/cache_stream.h" +#include "thin-provisioning/pool_stream.h" #include "thin-provisioning/commands.h" #include "thin-provisioning/device_tree.h" #include "thin-provisioning/mapping_tree.h" -#include "thin-provisioning/superblock.h" #include "thin-provisioning/rmap_visitor.h" +#include "thin-provisioning/superblock.h" #include #include @@ -55,15 +57,6 @@ namespace { return (n % f) == 0; } - int open_file(string const &path) { - int fd = ::open(path.c_str(), O_RDONLY | O_DIRECT | O_EXCL, 0666); - if (fd < 0) - syscall_failed("open", - "Note: you cannot run this tool with these options on live metadata."); - - return fd; - } - block_manager<>::ptr open_bm(string const &path) { block_address nr_blocks = get_nr_blocks(path); @@ -94,218 +87,6 @@ namespace { //-------------------------------- - // Once we start using variable sized blocks we will find we want - // to examine data that crosses cache block boundaries. So a block - // to be examined can be composed of multiple chunks of memory. - - struct mem { - mem(uint8_t *b, uint8_t *e) - : begin(b), - end(e) { - } - - uint8_t *begin, *end; - }; - - struct chunk { - // FIXME: switch to bytes rather than sectors - // FIXME: add length too - sector_t offset_sectors_; - deque mem_; - }; - - class chunk_stream { - public: - virtual ~chunk_stream() {} - - virtual block_address nr_chunks() const = 0; - virtual void rewind() = 0; - virtual bool advance(block_address count = 1ull) = 0; - virtual block_address index() const = 0; - virtual chunk const &get() const = 0; - }; - - class cache_stream : public chunk_stream { - public: - cache_stream(string const &path, - block_address block_size, - size_t cache_mem) - : block_size_(block_size), - nr_blocks_(get_nr_blocks(path, block_size)), - - // hack because cache uses LRU rather than MRU - cache_blocks_((cache_mem / block_size) / 2u), - fd_(open_file(path)), - v_(new bcache::noop_validator()), - cache_(new block_cache(fd_, block_size / 512, nr_blocks_, cache_mem)), - current_index_(0) { - load(0); - for (block_address i = 1; i < min(cache_blocks_, nr_blocks_); i++) - cache_->prefetch(i); - } - - virtual block_address nr_chunks() const { - return nr_blocks_; - } - - virtual void rewind() { - load(0); - } - - virtual bool advance(block_address count = 1ull) { - if (current_index_ + count >= nr_blocks_) - return false; - - current_index_ += count; - - load(current_index_); - return true; - } - - virtual block_address index() const { - return current_index_; - } - - virtual chunk const &get() const { - return current_chunk_; - } - - private: - void load(block_address b) { - current_index_ = b; - current_block_ = cache_->get(current_index_, 0, v_); - - current_chunk_.offset_sectors_ = (b * block_size_) / 512; - current_chunk_.mem_.clear(); - current_chunk_.mem_.push_back(mem(static_cast(current_block_.get_data()), - static_cast(current_block_.get_data()) + block_size_)); - - if (current_index_ + cache_blocks_ < nr_blocks_) - cache_->prefetch(current_index_ + cache_blocks_); - } - - block_address block_size_; - block_address nr_blocks_; - block_address cache_blocks_; - int fd_; - validator::ptr v_; - auto_ptr cache_; - - block_address current_index_; - block_cache::auto_block current_block_; - chunk current_chunk_; - }; - - //-------------------------------- - - typedef rmap_visitor::region region; - typedef rmap_visitor::rmap_region rmap_region; - - uint32_t const UNMAPPED = -1; - - class pool_stream : public chunk_stream { - public: - - pool_stream(cache_stream &stream, - transaction_manager::ptr tm, superblock_detail::superblock const &sb, - block_address nr_blocks) - : stream_(stream), - block_to_thin_(stream.nr_chunks(), UNMAPPED), - nr_mapped_(0) { - init_rmap(tm, sb, nr_blocks); - } - - block_address nr_chunks() const { - return stream_.nr_chunks(); - } - - void rewind() { - stream_.rewind(); - } - - bool advance(block_address count = 1ull) { - while (count--) - if (!advance_one()) - return false; - - return true; - } - - block_address index() const { - return stream_.index(); - } - - chunk const &get() const { - return stream_.get(); - } - - private: - class damage_visitor { - public: - virtual void visit(btree_path const &path, btree_detail::damage const &d) { - throw std::runtime_error("damage in mapping tree, please run thin_check"); - } - }; - - // FIXME: too big to return by value - vector read_rmap(transaction_manager::ptr tm, superblock_detail::superblock const &sb, - block_address nr_blocks) { - damage_visitor dv; - rmap_visitor rv; - - mapping_tree mtree(*tm, sb.data_mapping_root_, - mapping_tree_detail::block_traits::ref_counter(tm->get_sm())); - - rv.add_data_region(rmap_visitor::region(0, nr_blocks)); - - btree_visit_values(mtree, rv, dv); - rv.complete(); - cerr << "rmap size: " << rv.get_rmap().size() << "\n"; - return rv.get_rmap(); - } - - void init_rmap(transaction_manager::ptr tm, superblock_detail::superblock const &sb, - block_address nr_blocks) { - cerr << "reading rmap..."; - vector rmap = read_rmap(tm, sb, nr_blocks); - cerr << "done\n"; - - vector::const_iterator it; - set thins; - for (it = rmap.begin(); it != rmap.end(); ++it) { - rmap_region const &r = *it; - for (block_address b = r.data_begin; b != r.data_end; b++) - if (block_to_thin_[b] == UNMAPPED) { - nr_mapped_++; - block_to_thin_[b] = r.thin_dev; - } - thins.insert(r.thin_dev); - } - - cerr << nr_mapped_ << " mapped blocks\n"; - cerr << "there are " << thins.size() << " thin devices\n"; - } - - bool advance_one() { - block_address new_index = index() + 1; - - while (block_to_thin_[new_index] == UNMAPPED && - new_index < nr_chunks()) - new_index++; - - if (new_index >= nr_chunks()) - return false; - - return stream_.advance(new_index - index()); - } - - cache_stream &stream_; - vector block_to_thin_; - block_address nr_mapped_; - }; - - //-------------------------------- - struct flags { flags() : cache_mem(64 * 1024 * 1024) { @@ -434,6 +215,7 @@ namespace { return 0; } +#if 0 int show_dups_linear(flags const &fs) { if (!fs.block_size) // FIXME: this check should be moved to the switch parsing @@ -481,13 +263,14 @@ namespace { return 0; } +#endif int show_dups(flags const &fs) { if (fs.metadata_dev) return show_dups_pool(fs); else { cerr << "No metadata device provided, so treating data device as a linear device\n"; - return show_dups_linear(fs); + //return show_dups_linear(fs); } } From 10f93be8b1dc3062307be5ca5454d908c811b802 Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Tue, 25 Aug 2015 08:22:16 +0100 Subject: [PATCH 13/27] [thin_show_dups] put linear branch back in --- thin-provisioning/pool_stream.cc | 2 +- thin-provisioning/thin_show_duplicates.cc | 38 ++++++----------------- 2 files changed, 10 insertions(+), 30 deletions(-) diff --git a/thin-provisioning/pool_stream.cc b/thin-provisioning/pool_stream.cc index 63b64ac..0191698 100644 --- a/thin-provisioning/pool_stream.cc +++ b/thin-provisioning/pool_stream.cc @@ -50,7 +50,7 @@ pool_stream::pool_stream(cache_stream &stream, block_address pool_stream::nr_chunks() const { - return stream_.nr_chunks(); + return nr_mapped_; } void diff --git a/thin-provisioning/thin_show_duplicates.cc b/thin-provisioning/thin_show_duplicates.cc index 261b971..62a4f77 100644 --- a/thin-provisioning/thin_show_duplicates.cc +++ b/thin-provisioning/thin_show_duplicates.cc @@ -210,12 +210,11 @@ namespace { pbar->update_percent(100); cout << "\n\ntotal dups: " << detector.get_total_duplicates() << endl; -// cout << (detector.get_total_duplicates() * 100) / nr_mapped_ << "% duplicates\n"; + cout << (detector.get_total_duplicates() * 100) / pstream.nr_chunks() << "% duplicates\n"; return 0; } -#if 0 int show_dups_linear(flags const &fs) { if (!fs.block_size) // FIXME: this check should be moved to the switch parsing @@ -223,39 +222,21 @@ namespace { cerr << "path = " << fs.data_dev << "\n"; cerr << "block size = " << fs.block_size << "\n"; + block_address block_size = *fs.block_size * 512; block_address nr_blocks = get_nr_blocks(fs.data_dev, *fs.block_size); cerr << "nr_blocks = " << nr_blocks << "\n"; - // The cache uses a LRU eviction policy, which plays badly - // with a sequential read. So we can't prefetch all the - // blocks. - - // FIXME: add MRU policy to cache - unsigned cache_blocks = (fs.cache_mem / *fs.block_size) / 2; - int fd = open_file(fs.data_dev); - sector_t block_sectors = *fs.block_size / 512; - block_cache cache(fd, block_sectors, nr_blocks, fs.cache_mem); - validator::ptr v(new bcache::noop_validator()); + cache_stream stream(fs.data_dev, block_size, fs.cache_mem); duplicate_detector detector(*fs.block_size, nr_blocks); - // warm up the cache - for (block_address i = 0; i < cache_blocks; i++) - cache.prefetch(i); - auto_ptr pbar = create_progress_bar("Examining data"); + do { + chunk const &c = stream.get(); + detector.examine(c); + pbar->update_percent((stream.index() * 100) / stream.nr_chunks()); - for (block_address i = 0; i < nr_blocks; i++) { - block_cache::block &b = cache.get(i, 0, v); - block_address prefetch = i + cache_blocks; - if (prefetch < nr_blocks) - cache.prefetch(prefetch); - - detector.examine(b); - b.put(); - - pbar->update_percent(i * 100 / nr_blocks); - } + } while (stream.advance()); pbar->update_percent(100); cout << "\n\ntotal dups: " << detector.get_total_duplicates() << endl; @@ -263,14 +244,13 @@ namespace { return 0; } -#endif int show_dups(flags const &fs) { if (fs.metadata_dev) return show_dups_pool(fs); else { cerr << "No metadata device provided, so treating data device as a linear device\n"; - //return show_dups_linear(fs); + return show_dups_linear(fs); } } From 6dd6fcb4cd24a0b19ed3584f7a684e33b84d6002 Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Tue, 25 Aug 2015 08:38:01 +0100 Subject: [PATCH 14/27] [thin_show_dups] fix bug calculating block size for linear volumes --- thin-provisioning/thin_show_duplicates.cc | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/thin-provisioning/thin_show_duplicates.cc b/thin-provisioning/thin_show_duplicates.cc index 62a4f77..6c07edc 100644 --- a/thin-provisioning/thin_show_duplicates.cc +++ b/thin-provisioning/thin_show_duplicates.cc @@ -220,15 +220,15 @@ namespace { // FIXME: this check should be moved to the switch parsing throw runtime_error("--block-sectors or --metadata-dev must be supplied"); - cerr << "path = " << fs.data_dev << "\n"; - cerr << "block size = " << fs.block_size << "\n"; - block_address block_size = *fs.block_size * 512; + block_address block_size = *fs.block_size; block_address nr_blocks = get_nr_blocks(fs.data_dev, *fs.block_size); + + cerr << "path = " << fs.data_dev << "\n"; cerr << "nr_blocks = " << nr_blocks << "\n"; + cerr << "block size = " << block_size << "\n"; cache_stream stream(fs.data_dev, block_size, fs.cache_mem); - - duplicate_detector detector(*fs.block_size, nr_blocks); + duplicate_detector detector(block_size, nr_blocks); auto_ptr pbar = create_progress_bar("Examining data"); do { From d44a817c60b4dbf05d1a6ae2a3b1c91bbef8b974 Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Tue, 25 Aug 2015 09:14:40 +0100 Subject: [PATCH 15/27] [thin_show_dups] Track zero blocks --- thin-provisioning/thin_show_duplicates.cc | 94 ++++++++++++++--------- 1 file changed, 59 insertions(+), 35 deletions(-) diff --git a/thin-provisioning/thin_show_duplicates.cc b/thin-provisioning/thin_show_duplicates.cc index 6c07edc..7819ca1 100644 --- a/thin-provisioning/thin_show_duplicates.cc +++ b/thin-provisioning/thin_show_duplicates.cc @@ -106,47 +106,44 @@ namespace { public: duplicate_counter(block_address nr_blocks) : counts_(nr_blocks), - total_dups_(0) { + non_zero_dups_(0), + zero_dups_(0) { } void add_duplicate(block_address b1, block_address b2) { - total_dups_++; + non_zero_dups_++; counts_[b1]++; } + void add_zero_duplicate(block_address b) { + zero_dups_++; + } + block_address get_total() const { - return total_dups_; + return non_zero_dups_ + zero_dups_; + } + + block_address get_non_zeroes() const { + return non_zero_dups_; + } + + block_address get_zeroes() const { + return zero_dups_; } private: vector counts_; - block_address total_dups_; + block_address non_zero_dups_; + block_address zero_dups_; }; class duplicate_detector { public: duplicate_detector(unsigned block_size, block_address nr_blocks) : block_size_(block_size), - results_(nr_blocks) { - } - - // FIXME: remove - void examine(block_cache::block const &b) { - digestor_.reset(); - digestor_.process_bytes(b.get_data(), block_size_); - unsigned int digest[5]; - digestor_.get_digest(digest); - - // hack - vector v(5); - for (unsigned i = 0; i < 5; i++) - v[i] = digest[i]; - - fingerprint_map::const_iterator it = fm_.find(v); - if (it != fm_.end()) { - results_.add_duplicate(it->second, b.get_index()); - } else - fm_.insert(make_pair(v, b.get_index())); + results_(nr_blocks), + zero_fingerprint_(5, 0ull) { + calc_zero_fingerprint(); } void examine(chunk const &c) { @@ -163,16 +160,37 @@ namespace { for (unsigned i = 0; i < 5; i++) v[i] = digest[i]; - fingerprint_map::const_iterator it = fm_.find(v); block_address index = (c.offset_sectors_ * 512) / block_size_; - if (it != fm_.end()) { - results_.add_duplicate(it->second, index); - } else - fm_.insert(make_pair(v, index)); + + if (v == zero_fingerprint_) + results_.add_zero_duplicate(index); + + else { + fingerprint_map::const_iterator it = fm_.find(v); + if (it != fm_.end()) { + results_.add_duplicate(it->second, index); + } else + fm_.insert(make_pair(v, index)); + } } - block_address get_total_duplicates() const { - return results_.get_total(); + duplicate_counter const &get_results() const { + return results_; + } + + void calc_zero_fingerprint() { + auto_ptr bytes(new uint8_t[block_size_]); + memset(bytes.get(), 0, block_size_); + + digestor_.reset(); + digestor_.process_bytes(bytes.get(), block_size_); + + unsigned int digest[5]; + digestor_.get_digest(digest); + + // hack + for (unsigned i = 0; i < 5; i++) + zero_fingerprint_[i] = digest[i]; } private: @@ -182,6 +200,8 @@ namespace { boost::uuids::detail::sha1 digestor_; fingerprint_map fm_; duplicate_counter results_; + + vector zero_fingerprint_; }; int show_dups_pool(flags const &fs) { @@ -209,8 +229,8 @@ namespace { } while (pstream.advance()); pbar->update_percent(100); - cout << "\n\ntotal dups: " << detector.get_total_duplicates() << endl; - cout << (detector.get_total_duplicates() * 100) / pstream.nr_chunks() << "% duplicates\n"; + cout << "\n\ntotal dups: " << detector.get_results().get_total() << endl; + cout << (detector.get_results().get_total() * 100) / pstream.nr_chunks() << "% duplicates\n"; return 0; } @@ -239,8 +259,12 @@ namespace { } while (stream.advance()); pbar->update_percent(100); - cout << "\n\ntotal dups: " << detector.get_total_duplicates() << endl; - cout << (detector.get_total_duplicates() * 100) / nr_blocks << "% duplicates\n"; + cout << "\n\ntotal dups: " << detector.get_results().get_total() << endl; + cout << (detector.get_results().get_total() * 100) / nr_blocks << "% duplicates\n"; + + duplicate_counter r = detector.get_results(); + cout << "\n\nchunks\tnon zero dups\tzero dups\n" + << nr_blocks << "\t" << r.get_non_zeroes() << "\t" << r.get_zeroes() << "\n"; return 0; } From 750ce0f47b616907b110fbe8afc1508efd42c2ed Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Thu, 3 Sep 2015 13:02:29 +0100 Subject: [PATCH 16/27] [thin_show_dups] variable_chunk_stream --- Makefile.in | 3 + base/rolling_hash.cc | 148 ++++++++++++++++++++ base/rolling_hash.h | 61 ++++++++ thin-provisioning/cache_stream.cc | 53 ++++--- thin-provisioning/cache_stream.h | 23 +++- thin-provisioning/chunk_stream.cc | 23 ++++ thin-provisioning/chunk_stream.h | 15 +- thin-provisioning/pool_stream.cc | 18 ++- thin-provisioning/pool_stream.h | 10 +- thin-provisioning/thin_show_duplicates.cc | 111 +++++++-------- thin-provisioning/variable_chunk_stream.cc | 152 ++++++++++++++++++++ thin-provisioning/variable_chunk_stream.h | 42 ++++++ unit-tests/Makefile.in | 1 + unit-tests/rolling_hash_t.cc | 153 +++++++++++++++++++++ 14 files changed, 709 insertions(+), 104 deletions(-) create mode 100644 base/rolling_hash.cc create mode 100644 base/rolling_hash.h create mode 100644 thin-provisioning/chunk_stream.cc create mode 100644 thin-provisioning/variable_chunk_stream.cc create mode 100644 thin-provisioning/variable_chunk_stream.h create mode 100644 unit-tests/rolling_hash_t.cc diff --git a/Makefile.in b/Makefile.in index 0dd6c99..d09c873 100644 --- a/Makefile.in +++ b/Makefile.in @@ -31,6 +31,7 @@ SOURCE=\ base/error_state.cc \ base/error_string.cc \ base/progress_monitor.cc \ + base/rolling_hash.cc \ base/xml_utils.cc \ block-cache/block_cache.cc \ caching/cache_check.cc \ @@ -72,6 +73,7 @@ SOURCE=\ persistent-data/transaction_manager.cc \ persistent-data/validators.cc \ thin-provisioning/cache_stream.cc \ + thin-provisioning/chunk_stream.cc \ thin-provisioning/device_tree.cc \ thin-provisioning/human_readable_format.cc \ thin-provisioning/mapping_tree.cc \ @@ -92,6 +94,7 @@ SOURCE=\ thin-provisioning/thin_rmap.cc \ thin-provisioning/thin_show_duplicates.cc \ thin-provisioning/thin_trim.cc \ + thin-provisioning/variable_chunk_stream.cc \ thin-provisioning/xml_format.cc CC:=@CC@ diff --git a/base/rolling_hash.cc b/base/rolling_hash.cc new file mode 100644 index 0000000..1ea362f --- /dev/null +++ b/base/rolling_hash.cc @@ -0,0 +1,148 @@ +#include "base/rolling_hash.h" + +using namespace base; +using namespace boost; +using namespace std; + +//---------------------------------------------------------------- + +namespace { + uint32_t MULTIPLIER = 4294967291UL; + uint32_t SEED = 123; +} + +rolling_hash::rolling_hash(unsigned window_size) + : a_(MULTIPLIER), + a_to_k_minus_1_(a_), + window_size_(window_size) { + + for (unsigned i = 1; i < window_size_ - 1; i++) + a_to_k_minus_1_ *= a_; + + reset(); +} + +void +rolling_hash::reset() +{ + // prime with zeroes + chars_.clear(); + + hash_ = 0; + for (unsigned i = 0; i < window_size_; i++) { + hash_ = (hash_ * a_) + SEED; + chars_.push_back(0); + } +} + +uint32_t +rolling_hash::step(uint8_t byte) +{ + update_hash(byte); + return hash_; +} + +uint32_t +rolling_hash::get_hash() const +{ + return hash_; +} + +void +rolling_hash::update_hash(uint8_t byte) +{ + hash_ -= a_to_k_minus_1_ * (chars_.front() + SEED); + chars_.pop_front(); + chars_.push_back(byte); + hash_ = (hash_ * a_) + byte + SEED; +} + +//-------------------------------- + +content_based_hash::content_based_hash(unsigned window_size) + : rhash_(window_size), + + // FIXME: hard coded values + backup_div_((window_size / 4) - 1), + div_((window_size / 2) - 1), + min_len_(window_size / 8), + max_len_(window_size), + len_(0) +{ +} + +void +content_based_hash::reset() +{ + len_ = 0; + backup_break_.reset(); + rhash_.reset(); +} + +optional +content_based_hash::step(uint8_t byte) +{ +#if 0 + optional r; + + rhash_.step(byte); + len_++; + + if (len_ < min_len_) + return r; + + if (hit_break(backup_div_)) + backup_break_ = len_; + + if (hit_break(div_)) { + // found a break + r = len_; + len_ = 0; + backup_break_.reset(); + + } else if (len_ >= max_len_) { + // too big, is there a backup? + if (backup_break_) { + len_ -= *backup_break_; + r = backup_break_; + backup_break_.reset(); + + } else { + r = len_; + len_ = 0; + } + } + + return r; +#else + optional r; + + rhash_.step(byte); + len_++; + + if (len_ < min_len_) + return r; + + if (hit_break(div_)) { + // found a break + r = len_; + len_ = 0; + backup_break_.reset(); + + } else if (len_ >= max_len_) { + r = len_; + len_ = 0; + } + + return r; +#endif +} + +bool +content_based_hash::hit_break(uint32_t mask) const +{ + uint32_t h = rhash_.get_hash() >> 8; + return !(h & mask); +} + +//---------------------------------------------------------------- diff --git a/base/rolling_hash.h b/base/rolling_hash.h new file mode 100644 index 0000000..d44012a --- /dev/null +++ b/base/rolling_hash.h @@ -0,0 +1,61 @@ +#ifndef BASE_ROLLING_HASH_H +#define BASE_ROLLING_HASH_H + +#include +#include +#include + +//---------------------------------------------------------------- + +namespace base { + class rolling_hash { + public: + rolling_hash(unsigned window_size); + + void reset(); + + // Returns the current hash + uint32_t step(uint8_t byte); + + uint32_t get_hash() const; + + private: + void update_hash(uint8_t byte); + + uint32_t a_; + uint32_t a_to_k_minus_1_; + + // FIXME: use a ring buffer + std::list chars_; + + uint32_t hash_; + uint32_t window_size_; + }; + + class content_based_hash { + public: + content_based_hash(unsigned window_size); + void reset(); + + // Returns a break point relative to the last reset/break. + boost::optional step(uint8_t byte); + + private: + bool hit_break(uint32_t div) const; + + rolling_hash rhash_; + + uint32_t backup_div_; + uint32_t div_; + + unsigned min_len_; + unsigned max_len_; + + unsigned len_; + boost::optional backup_break_; + }; +} + +//---------------------------------------------------------------- + +#endif diff --git a/thin-provisioning/cache_stream.cc b/thin-provisioning/cache_stream.cc index 956fc4e..8fbcc72 100644 --- a/thin-provisioning/cache_stream.cc +++ b/thin-provisioning/cache_stream.cc @@ -32,9 +32,8 @@ cache_stream::cache_stream(string const &path, v_(new bcache::noop_validator()), cache_(new block_cache(fd_, block_size / 512, nr_blocks_, cache_mem)), current_index_(0) { - load(0); - for (block_address i = 1; i < min(cache_blocks_, nr_blocks_); i++) - cache_->prefetch(i); + + rewind(); } block_address @@ -46,19 +45,27 @@ cache_stream::nr_chunks() const void cache_stream::rewind() { - load(0); + current_index_ = 0; + + for (block_address i = 1; i < min(cache_blocks_, nr_blocks_); i++) + cache_->prefetch(i); } bool -cache_stream::advance(block_address count) +cache_stream::next(block_address count) { - if (current_index_ + count >= nr_blocks_) - return false; + current_index_ = min(current_index_ + count, nr_blocks_); - current_index_ += count; + if (current_index_ + cache_blocks_ < nr_blocks_) + cache_->prefetch(current_index_ + cache_blocks_); - load(current_index_); - return true; + return !eof(); +} + +bool +cache_stream::eof() const +{ + return current_index_ >= nr_blocks_; } block_address @@ -68,24 +75,26 @@ cache_stream::index() const } chunk const & -cache_stream::get() const +cache_stream::get() { - return current_chunk_; + chunk_wrapper *w = new chunk_wrapper(*this); + return w->c_; } void -cache_stream::load(block_address b) +cache_stream::put(chunk const &c) { - current_index_ = b; - current_block_ = cache_->get(current_index_, 0, v_); + chunk_wrapper *w = container_of(const_cast(&c), chunk_wrapper, c_); + delete w; +} - current_chunk_.offset_sectors_ = (b * block_size_) / 512; - current_chunk_.mem_.clear(); - current_chunk_.mem_.push_back(mem(static_cast(current_block_.get_data()), - static_cast(current_block_.get_data()) + block_size_)); - - if (current_index_ + cache_blocks_ < nr_blocks_) - cache_->prefetch(current_index_ + cache_blocks_); +cache_stream::chunk_wrapper::chunk_wrapper(cache_stream &parent) + : block_(parent.cache_->get(parent.current_index_, 0, parent.v_)) +{ + c_.offset_ = parent.current_index_ * parent.block_size_; + c_.len_ = parent.block_size_; + c_.mem_.push_back(mem(static_cast(block_.get_data()), + static_cast(block_.get_data()) + parent.block_size_)); } //---------------------------------------------------------------- diff --git a/thin-provisioning/cache_stream.h b/thin-provisioning/cache_stream.h index cfe6ca8..65c81b1 100644 --- a/thin-provisioning/cache_stream.h +++ b/thin-provisioning/cache_stream.h @@ -14,25 +14,36 @@ namespace thin_provisioning { block_address block_size, size_t cache_mem); - virtual block_address nr_chunks() const; + block_address nr_chunks() const; + virtual void rewind(); - virtual bool advance(block_address count = 1ull); virtual block_address index() const; - virtual chunk const &get() const; + + virtual bool next(block_address count = 1ull); + virtual bool eof() const; + + virtual chunk const &get(); + virtual void put(chunk const &c); private: - void load(block_address b); + struct chunk_wrapper { + chunk_wrapper(cache_stream &parent); + + block_cache::auto_block block_; + chunk c_; + }; + + friend class chunk_wrapper; block_address block_size_; block_address nr_blocks_; block_address cache_blocks_; + int fd_; validator::ptr v_; std::auto_ptr cache_; block_address current_index_; - block_cache::auto_block current_block_; - chunk current_chunk_; }; } diff --git a/thin-provisioning/chunk_stream.cc b/thin-provisioning/chunk_stream.cc new file mode 100644 index 0000000..4ac99ff --- /dev/null +++ b/thin-provisioning/chunk_stream.cc @@ -0,0 +1,23 @@ +#include "thin-provisioning/chunk_stream.h" + +using namespace std; +using namespace thin_provisioning; + +//---------------------------------------------------------------- + +uint8_t +chunk::operator[](uint64_t n) const +{ + std::deque::const_iterator it; + for (it = mem_.begin(); it != mem_.end(); it++) { + uint64_t mem_len = it->end - it->begin; + if (n > mem_len) + n -= mem_len; + else + return it->begin[n]; + } + + throw runtime_error("chunk out of bounds"); +} + +//---------------------------------------------------------------- diff --git a/thin-provisioning/chunk_stream.h b/thin-provisioning/chunk_stream.h index d87ee9c..4e1ea96 100644 --- a/thin-provisioning/chunk_stream.h +++ b/thin-provisioning/chunk_stream.h @@ -37,21 +37,24 @@ namespace thin_provisioning { }; struct chunk { - // FIXME: switch to bytes rather than sectors - // FIXME: add length too - uint64_t offset_sectors_; + uint64_t offset_, len_; std::deque mem_; + + uint8_t operator[](uint64_t n) const; }; class chunk_stream { public: virtual ~chunk_stream() {} - virtual bcache::block_address nr_chunks() const = 0; virtual void rewind() = 0; - virtual bool advance(bcache::block_address count = 1ull) = 0; virtual bcache::block_address index() const = 0; - virtual chunk const &get() const = 0; + + virtual bool next(bcache::block_address count = 1ull) = 0; + virtual bool eof() const = 0; + + virtual chunk const &get() = 0; + virtual void put(chunk const &c) = 0; }; } diff --git a/thin-provisioning/pool_stream.cc b/thin-provisioning/pool_stream.cc index 0191698..21964f9 100644 --- a/thin-provisioning/pool_stream.cc +++ b/thin-provisioning/pool_stream.cc @@ -60,7 +60,7 @@ pool_stream::rewind() } bool -pool_stream::advance(block_address count) +pool_stream::next(block_address count) { while (count--) if (!advance_one()) @@ -69,6 +69,12 @@ pool_stream::advance(block_address count) return true; } +bool +pool_stream::eof() const +{ + return stream_.eof(); +} + block_address pool_stream::index() const { @@ -76,12 +82,16 @@ pool_stream::index() const } chunk const & -pool_stream::get() const +pool_stream::get() { return stream_.get(); } - +void +pool_stream::put(chunk const &c) +{ + stream_.put(c); +} // FIXME: too big to return by value vector @@ -140,7 +150,7 @@ pool_stream::advance_one() if (new_index >= nr_chunks()) return false; - return stream_.advance(new_index - index()); + return stream_.next(new_index - index()); } //---------------------------------------------------------------- diff --git a/thin-provisioning/pool_stream.h b/thin-provisioning/pool_stream.h index 1fa5c51..71576ed 100644 --- a/thin-provisioning/pool_stream.h +++ b/thin-provisioning/pool_stream.h @@ -34,16 +34,20 @@ namespace thin_provisioning { block_address nr_chunks() const; void rewind(); - bool advance(block_address count = 1ull); + bool next(block_address count = 1ull); + bool eof() const; block_address index() const; - chunk const &get() const; + + chunk const &get(); + void put(chunk const &c); private: typedef rmap_visitor::region region; typedef rmap_visitor::rmap_region rmap_region; // FIXME: too big to return by value - vector read_rmap(transaction_manager::ptr tm, superblock_detail::superblock const &sb, + vector read_rmap(transaction_manager::ptr tm, + superblock_detail::superblock const &sb, block_address nr_blocks); void init_rmap(transaction_manager::ptr tm, superblock_detail::superblock const &sb, block_address nr_blocks); diff --git a/thin-provisioning/thin_show_duplicates.cc b/thin-provisioning/thin_show_duplicates.cc index 7819ca1..dafcd34 100644 --- a/thin-provisioning/thin_show_duplicates.cc +++ b/thin-provisioning/thin_show_duplicates.cc @@ -36,6 +36,7 @@ #include "thin-provisioning/mapping_tree.h" #include "thin-provisioning/rmap_visitor.h" #include "thin-provisioning/superblock.h" +#include "thin-provisioning/variable_chunk_stream.h" #include #include @@ -98,25 +99,21 @@ namespace { unsigned cache_mem; }; - // FIXME: introduce abstraction for a stream of segments - using namespace mapping_tree_detail; class duplicate_counter { public: - duplicate_counter(block_address nr_blocks) - : counts_(nr_blocks), - non_zero_dups_(0), + duplicate_counter() + : non_zero_dups_(0), zero_dups_(0) { } - void add_duplicate(block_address b1, block_address b2) { - non_zero_dups_++; - counts_[b1]++; + void add_duplicate(block_address len) { + non_zero_dups_ += len; } - void add_zero_duplicate(block_address b) { - zero_dups_++; + void add_zero_duplicate(block_address len) { + zero_dups_ += len; } block_address get_total() const { @@ -132,45 +129,35 @@ namespace { } private: - vector counts_; block_address non_zero_dups_; block_address zero_dups_; }; class duplicate_detector { public: - duplicate_detector(unsigned block_size, block_address nr_blocks) - : block_size_(block_size), - results_(nr_blocks), - zero_fingerprint_(5, 0ull) { - calc_zero_fingerprint(); - } - void examine(chunk const &c) { - digestor_.reset(); - - for (deque::const_iterator it = c.mem_.begin(); it != c.mem_.end(); it++) - digestor_.process_bytes(it->begin, it->end - it->begin); - - unsigned int digest[5]; - digestor_.get_digest(digest); - - // hack - vector v(5); - for (unsigned i = 0; i < 5; i++) - v[i] = digest[i]; - - block_address index = (c.offset_sectors_ * 512) / block_size_; - - if (v == zero_fingerprint_) - results_.add_zero_duplicate(index); + if (all_zeroes(c)) + results_.add_zero_duplicate(c.len_); else { + digestor_.reset(); + + for (deque::const_iterator it = c.mem_.begin(); it != c.mem_.end(); it++) + digestor_.process_bytes(it->begin, it->end - it->begin); + + unsigned int digest[5]; + digestor_.get_digest(digest); + + // hack + vector v(5); + for (unsigned i = 0; i < 5; i++) + v[i] = digest[i]; + fingerprint_map::const_iterator it = fm_.find(v); if (it != fm_.end()) { - results_.add_duplicate(it->second, index); + results_.add_duplicate(c.len_); } else - fm_.insert(make_pair(v, index)); + fm_.insert(make_pair(v, c.offset_)); } } @@ -178,30 +165,24 @@ namespace { return results_; } - void calc_zero_fingerprint() { - auto_ptr bytes(new uint8_t[block_size_]); - memset(bytes.get(), 0, block_size_); + private: + bool all_zeroes(chunk const &c) const { + for (deque::const_iterator it = c.mem_.begin(); it != c.mem_.end(); it++) { + for (uint8_t *ptr = it->begin; ptr != it->end; ptr++) { + if (*ptr != 0) + return false; + } + } - digestor_.reset(); - digestor_.process_bytes(bytes.get(), block_size_); - - unsigned int digest[5]; - digestor_.get_digest(digest); - - // hack - for (unsigned i = 0; i < 5; i++) - zero_fingerprint_[i] = digest[i]; + return true; } - private: typedef map, block_address> fingerprint_map; unsigned block_size_; boost::uuids::detail::sha1 digestor_; fingerprint_map fm_; duplicate_counter results_; - - vector zero_fingerprint_; }; int show_dups_pool(flags const &fs) { @@ -218,15 +199,16 @@ namespace { cache_stream stream(fs.data_dev, block_size, fs.cache_mem); pool_stream pstream(stream, tm, sb, nr_blocks); - duplicate_detector detector(block_size, nr_blocks); + duplicate_detector detector; auto_ptr pbar = create_progress_bar("Examining data"); do { chunk const &c = pstream.get(); detector.examine(c); + pstream.put(c); pbar->update_percent((pstream.index() * 100) / pstream.nr_chunks()); - } while (pstream.advance()); + } while (pstream.next()); pbar->update_percent(100); cout << "\n\ntotal dups: " << detector.get_results().get_total() << endl; @@ -247,24 +229,27 @@ namespace { cerr << "nr_blocks = " << nr_blocks << "\n"; cerr << "block size = " << block_size << "\n"; - cache_stream stream(fs.data_dev, block_size, fs.cache_mem); - duplicate_detector detector(block_size, nr_blocks); + cache_stream low_level_stream(fs.data_dev, block_size, fs.cache_mem); + variable_chunk_stream stream(low_level_stream, 4096); + duplicate_detector detector; auto_ptr pbar = create_progress_bar("Examining data"); do { + // FIXME: use a wrapper class to automate the put() chunk const &c = stream.get(); detector.examine(c); - pbar->update_percent((stream.index() * 100) / stream.nr_chunks()); + stream.put(c); +// pbar->update_percent((stream.index() * 100) / stream.nr_chunks()); - } while (stream.advance()); + } while (stream.next()); pbar->update_percent(100); - cout << "\n\ntotal dups: " << detector.get_results().get_total() << endl; - cout << (detector.get_results().get_total() * 100) / nr_blocks << "% duplicates\n"; - duplicate_counter r = detector.get_results(); - cout << "\n\nchunks\tnon zero dups\tzero dups\n" - << nr_blocks << "\t" << r.get_non_zeroes() << "\t" << r.get_zeroes() << "\n"; + block_address meg = 1024 * 1024; + cout << "\n\n" + << (nr_blocks * block_size) / meg << "m examined, " + << r.get_non_zeroes() / meg << "m duplicates, " + << r.get_zeroes() / meg << "m zeroes\n"; return 0; } diff --git a/thin-provisioning/variable_chunk_stream.cc b/thin-provisioning/variable_chunk_stream.cc new file mode 100644 index 0000000..9a9d11e --- /dev/null +++ b/thin-provisioning/variable_chunk_stream.cc @@ -0,0 +1,152 @@ +#include "thin-provisioning/variable_chunk_stream.h" + +using namespace boost; +using namespace std; +using namespace thin_provisioning; + +//---------------------------------------------------------------- + +variable_chunk_stream::variable_chunk_stream(chunk_stream &stream, unsigned window_size) + : index_(0), + h_(window_size), + stream_(stream), + big_chunk_(0) { + next_big_chunk(); +} + +variable_chunk_stream::~variable_chunk_stream() +{ + put_big_chunk(); +} + +void +variable_chunk_stream::rewind() +{ + // FIXME: not complete + index_ = 0; + stream_.rewind(); + h_.reset(); +} + +bool +variable_chunk_stream::next(bcache::block_address count) +{ + while (count--) { + index_++; + advance_one(); + } + + return !eof(); +} + +bool +variable_chunk_stream::eof() const +{ + return stream_.eof(); +} + +bcache::block_address +variable_chunk_stream::index() const +{ + return index_; +} + +chunk const & +variable_chunk_stream::get() +{ + assert(big_chunk_); + + little_chunk_.len_ = little_e_ - little_b_; + little_chunk_.offset_ = big_chunk_->offset_ + little_chunk_.len_; + + little_chunk_.mem_.clear(); + little_chunk_.mem_.push_back(mem(little_b_, little_e_)); + + return little_chunk_; +} + +void +variable_chunk_stream::put(chunk const &c) +{ + // noop +} + +bool +variable_chunk_stream::next_big_chunk() +{ + put_big_chunk(); + + if (!stream_.next()) + return false; + + big_chunk_ = &stream_.get(); + little_b_ = little_e_ = big_chunk_->mem_.front().begin; + h_.reset(); + + return true; +} + +bool +variable_chunk_stream::advance_one() +{ + uint8_t *big_e; + + assert(big_chunk_); + + big_e = big_chunk_->mem_.front().end; + little_b_ = little_e_; + + if (little_b_ == big_e) { + if (next_big_chunk()) + big_e = big_chunk_->mem_.front().end; + else + return false; + } + + assert(little_e_ >= big_chunk_->mem_.front().begin); + assert(little_b_ >= big_chunk_->mem_.front().begin); +#if 1 + if (little_e_ > big_e) { + cerr << "before -- little_e_: " << (void *) little_e_ << ", big_e: " << (void *) big_e << "\n"; + } +#endif + assert(little_e_ <= big_e); + assert(little_b_ <= big_e); + + + while (little_e_ != big_e) { + optional maybe_break = h_.step(*little_e_); + + if (maybe_break) { + // The break is not neccessarily at the current + // byte. + little_e_ = little_b_ + *maybe_break; + break; + } + + little_e_++; + } + + assert(little_e_ >= big_chunk_->mem_.front().begin); + assert(little_b_ >= big_chunk_->mem_.front().begin); +#if 1 + if (little_e_ > big_e) { + cerr << "after -- little_e_: " << (void *) little_e_ << ", big_e: " << (void *) big_e << "\n"; + } +#endif + assert(little_e_ <= big_e); + assert(little_b_ <= big_e); + + return true; +} + +void +variable_chunk_stream::put_big_chunk() +{ + if (big_chunk_) + stream_.put(*big_chunk_); + + big_chunk_ = 0; +} + +//---------------------------------------------------------------- diff --git a/thin-provisioning/variable_chunk_stream.h b/thin-provisioning/variable_chunk_stream.h new file mode 100644 index 0000000..0327f1d --- /dev/null +++ b/thin-provisioning/variable_chunk_stream.h @@ -0,0 +1,42 @@ +#ifndef THIN_PROVISIONING_VARIABLE_CHUNK_STREAM_H +#define THIN_PROVISIONING_VARIABLE_CHUNK_STREAM_H + +#include "base/rolling_hash.h" +#include "thin-provisioning/chunk_stream.h" + +//---------------------------------------------------------------- + +namespace thin_provisioning { + class variable_chunk_stream : public chunk_stream { + public: + // window_size must be a power of 2 + variable_chunk_stream(chunk_stream &stream, unsigned window_size); + ~variable_chunk_stream(); + + // FIXME: we don't know in advance how many chunks we will have + virtual void rewind(); + virtual bool next(bcache::block_address count = 1ull); + virtual bool eof() const; + virtual bcache::block_address index() const; + virtual chunk const &get(); + virtual void put(chunk const &c); + + private: + bool next_big_chunk(); + bool advance_one(); + void put_big_chunk(); + + bcache::block_address index_; + base::content_based_hash h_; + + chunk_stream &stream_; + chunk const *big_chunk_; + + uint8_t *little_b_, *little_e_; + chunk little_chunk_; + }; +} + +//---------------------------------------------------------------- + +#endif diff --git a/unit-tests/Makefile.in b/unit-tests/Makefile.in index 9307e5f..38f3d04 100644 --- a/unit-tests/Makefile.in +++ b/unit-tests/Makefile.in @@ -59,6 +59,7 @@ TEST_SOURCE=\ unit-tests/endian_t.cc \ unit-tests/error_state_t.cc \ unit-tests/rmap_visitor_t.cc \ + unit-tests/rolling_hash_t.cc \ unit-tests/run_set_t.cc \ unit-tests/space_map_t.cc \ unit-tests/span_iterator_t.cc \ diff --git a/unit-tests/rolling_hash_t.cc b/unit-tests/rolling_hash_t.cc new file mode 100644 index 0000000..c25b650 --- /dev/null +++ b/unit-tests/rolling_hash_t.cc @@ -0,0 +1,153 @@ +#include "gmock/gmock.h" + +#include "base/rolling_hash.h" + +using namespace base; +using namespace boost; +using namespace std; +using namespace testing; + +//---------------------------------------------------------------- + +namespace { + class RollingHashTests : public Test { + public: + RollingHashTests() + : window_size_(4096), + rhash_(window_size_) { + } + + typedef vector bytes; + bytes random_bytes(unsigned count) { + bytes v(count, 0); + + for (unsigned i = 0; i < count; i++) + v[i] = random_byte(); + + return v; + } + + uint8_t random_byte() const { + return random() % 256; + } + + void apply_bytes(bytes const &bs) { + for (unsigned i = 0; i < bs.size(); i++) + rhash_.step(bs[i]); + } + + unsigned window_size_; + rolling_hash rhash_; + }; + + class ContentBasedHashTests : public Test { + public: + ContentBasedHashTests() + : window_size_(8192), + h_(window_size_) { + } + + typedef vector bytes; + bytes random_bytes(unsigned count) { + bytes v(count, 0); + + for (unsigned i = 0; i < count; i++) + v[i] = random_byte(); + + return v; + } + + uint8_t random_byte() const { + return random() % 256; + } + + unsigned window_size_; + content_based_hash h_; + }; +} + +//---------------------------------------------------------------- + +TEST_F(RollingHashTests, ctr) +{ +} + +//-------------------------------- + +TEST_F(RollingHashTests, hash_changes) +{ + bytes bs = random_bytes(window_size_ * 100); + + uint32_t prev = rhash_.get_hash(); + for (unsigned i = 0; i < bs.size(); i++) { + rhash_.step(bs[i]); + ASSERT_NE(rhash_.get_hash(), prev); + prev = rhash_.get_hash(); + } +} + +TEST_F(RollingHashTests, hash_repeats) +{ + bytes bs = random_bytes(window_size_); + + apply_bytes(bs); + uint32_t h1 = rhash_.get_hash(); + apply_bytes(bs); + + ASSERT_EQ(rhash_.get_hash(), h1); +} + +TEST_F(RollingHashTests, reset_is_deterministic) +{ + uint8_t bytes[] = "lksdfuwerh,sdg"; + + for (unsigned i = 0; i < sizeof(bytes) - 1; i++) + rhash_.step(bytes[i]); + + uint32_t h1 = rhash_.get_hash(); + + rhash_.reset(); + + for (unsigned i = 0; i < sizeof(bytes) - 1; i++) + rhash_.step(bytes[i]); + + uint32_t h2 = rhash_.get_hash(); + + ASSERT_EQ(h1, h2); +} + +//---------------------------------------------------------------- + +TEST_F(ContentBasedHashTests, ctr) +{ +} + +TEST_F(ContentBasedHashTests, chunk_limits_respected) +{ + unsigned min = 100000, max = 0; + + bytes bs = random_bytes(1024 * 1024 * 100); + vector counts(window_size_, 0); + + for (unsigned i = 0; i < bs.size(); i++) { + optional b = h_.step(bs[i]); + if (b) { + counts[*b]++; + + if (*b < min) + min = *b; + + if (*b > max) + max = *b; + } + } + +#if 1 + for (unsigned i = 0; i < counts.size(); i++) + cerr << i << ": " << counts[i] << "\n"; + + cerr << "min: " << min << ", max: " << max << "\n"; +#endif +} + +//---------------------------------------------------------------- From 5d383c029334bb8d85dee5c81c40741c2975cf01 Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Fri, 4 Sep 2015 10:10:41 +0100 Subject: [PATCH 17/27] [thin_show_dups] get the backup break working in the rolling hash --- base/rolling_hash.cc | 23 ---------------------- thin-provisioning/variable_chunk_stream.cc | 20 +++++++------------ thin-provisioning/variable_chunk_stream.h | 2 +- 3 files changed, 8 insertions(+), 37 deletions(-) diff --git a/base/rolling_hash.cc b/base/rolling_hash.cc index 1ea362f..9c6e1bf 100644 --- a/base/rolling_hash.cc +++ b/base/rolling_hash.cc @@ -82,7 +82,6 @@ content_based_hash::reset() optional content_based_hash::step(uint8_t byte) { -#if 0 optional r; rhash_.step(byte); @@ -114,28 +113,6 @@ content_based_hash::step(uint8_t byte) } return r; -#else - optional r; - - rhash_.step(byte); - len_++; - - if (len_ < min_len_) - return r; - - if (hit_break(div_)) { - // found a break - r = len_; - len_ = 0; - backup_break_.reset(); - - } else if (len_ >= max_len_) { - r = len_; - len_ = 0; - } - - return r; -#endif } bool diff --git a/thin-provisioning/variable_chunk_stream.cc b/thin-provisioning/variable_chunk_stream.cc index 9a9d11e..41c6c96 100644 --- a/thin-provisioning/variable_chunk_stream.cc +++ b/thin-provisioning/variable_chunk_stream.cc @@ -80,7 +80,7 @@ variable_chunk_stream::next_big_chunk() return false; big_chunk_ = &stream_.get(); - little_b_ = little_e_ = big_chunk_->mem_.front().begin; + little_b_ = little_e_ = last_hashed_ = big_chunk_->mem_.front().begin; h_.reset(); return true; @@ -95,6 +95,7 @@ variable_chunk_stream::advance_one() big_e = big_chunk_->mem_.front().end; little_b_ = little_e_; + little_e_ = last_hashed_; if (little_b_ == big_e) { if (next_big_chunk()) @@ -105,35 +106,28 @@ variable_chunk_stream::advance_one() assert(little_e_ >= big_chunk_->mem_.front().begin); assert(little_b_ >= big_chunk_->mem_.front().begin); -#if 1 - if (little_e_ > big_e) { - cerr << "before -- little_e_: " << (void *) little_e_ << ", big_e: " << (void *) big_e << "\n"; - } -#endif assert(little_e_ <= big_e); assert(little_b_ <= big_e); while (little_e_ != big_e) { optional maybe_break = h_.step(*little_e_); + little_e_++; if (maybe_break) { // The break is not neccessarily at the current // byte. + last_hashed_ = little_e_; little_e_ = little_b_ + *maybe_break; break; } - - little_e_++; } + if (little_e_ == big_e) + last_hashed_ = little_e_; + assert(little_e_ >= big_chunk_->mem_.front().begin); assert(little_b_ >= big_chunk_->mem_.front().begin); -#if 1 - if (little_e_ > big_e) { - cerr << "after -- little_e_: " << (void *) little_e_ << ", big_e: " << (void *) big_e << "\n"; - } -#endif assert(little_e_ <= big_e); assert(little_b_ <= big_e); diff --git a/thin-provisioning/variable_chunk_stream.h b/thin-provisioning/variable_chunk_stream.h index 0327f1d..f9c5ec7 100644 --- a/thin-provisioning/variable_chunk_stream.h +++ b/thin-provisioning/variable_chunk_stream.h @@ -32,7 +32,7 @@ namespace thin_provisioning { chunk_stream &stream_; chunk const *big_chunk_; - uint8_t *little_b_, *little_e_; + uint8_t *little_b_, *little_e_, *last_hashed_; chunk little_chunk_; }; } From 7633c5d7ae81b2b803ace3a9347f0fe510fa60ff Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Fri, 4 Sep 2015 10:36:39 +0100 Subject: [PATCH 18/27] [thin_show_dups] get the progress bar working again --- thin-provisioning/thin_show_duplicates.cc | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/thin-provisioning/thin_show_duplicates.cc b/thin-provisioning/thin_show_duplicates.cc index dafcd34..9fc96c2 100644 --- a/thin-provisioning/thin_show_duplicates.cc +++ b/thin-provisioning/thin_show_duplicates.cc @@ -224,6 +224,7 @@ namespace { block_address block_size = *fs.block_size; block_address nr_blocks = get_nr_blocks(fs.data_dev, *fs.block_size); + block_address dev_size = nr_blocks * *fs.block_size; cerr << "path = " << fs.data_dev << "\n"; cerr << "nr_blocks = " << nr_blocks << "\n"; @@ -239,7 +240,8 @@ namespace { chunk const &c = stream.get(); detector.examine(c); stream.put(c); -// pbar->update_percent((stream.index() * 100) / stream.nr_chunks()); + + pbar->update_percent((c.offset_ * 100) / dev_size); } while (stream.next()); pbar->update_percent(100); From 506b0a8a080799c219b3d7cc7890fe36b5efce88 Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Fri, 4 Sep 2015 11:10:19 +0100 Subject: [PATCH 19/27] [thin_show_dups] inline some hash functions --- base/rolling_hash.cc | 71 +------------------------------------------- base/rolling_hash.h | 60 +++++++++++++++++++++++++++++++++---- 2 files changed, 56 insertions(+), 75 deletions(-) diff --git a/base/rolling_hash.cc b/base/rolling_hash.cc index 9c6e1bf..8de7ac3 100644 --- a/base/rolling_hash.cc +++ b/base/rolling_hash.cc @@ -2,15 +2,11 @@ using namespace base; using namespace boost; +using namespace hash_detail; using namespace std; //---------------------------------------------------------------- -namespace { - uint32_t MULTIPLIER = 4294967291UL; - uint32_t SEED = 123; -} - rolling_hash::rolling_hash(unsigned window_size) : a_(MULTIPLIER), a_to_k_minus_1_(a_), @@ -35,28 +31,6 @@ rolling_hash::reset() } } -uint32_t -rolling_hash::step(uint8_t byte) -{ - update_hash(byte); - return hash_; -} - -uint32_t -rolling_hash::get_hash() const -{ - return hash_; -} - -void -rolling_hash::update_hash(uint8_t byte) -{ - hash_ -= a_to_k_minus_1_ * (chars_.front() + SEED); - chars_.pop_front(); - chars_.push_back(byte); - hash_ = (hash_ * a_) + byte + SEED; -} - //-------------------------------- content_based_hash::content_based_hash(unsigned window_size) @@ -79,47 +53,4 @@ content_based_hash::reset() rhash_.reset(); } -optional -content_based_hash::step(uint8_t byte) -{ - optional r; - - rhash_.step(byte); - len_++; - - if (len_ < min_len_) - return r; - - if (hit_break(backup_div_)) - backup_break_ = len_; - - if (hit_break(div_)) { - // found a break - r = len_; - len_ = 0; - backup_break_.reset(); - - } else if (len_ >= max_len_) { - // too big, is there a backup? - if (backup_break_) { - len_ -= *backup_break_; - r = backup_break_; - backup_break_.reset(); - - } else { - r = len_; - len_ = 0; - } - } - - return r; -} - -bool -content_based_hash::hit_break(uint32_t mask) const -{ - uint32_t h = rhash_.get_hash() >> 8; - return !(h & mask); -} - //---------------------------------------------------------------- diff --git a/base/rolling_hash.h b/base/rolling_hash.h index d44012a..c5fa44c 100644 --- a/base/rolling_hash.h +++ b/base/rolling_hash.h @@ -8,6 +8,11 @@ //---------------------------------------------------------------- namespace base { + namespace hash_detail { + uint32_t const MULTIPLIER = 4294967291UL; + uint32_t const SEED = 123; + } + class rolling_hash { public: rolling_hash(unsigned window_size); @@ -15,12 +20,22 @@ namespace base { void reset(); // Returns the current hash - uint32_t step(uint8_t byte); + uint32_t step(uint8_t byte) { + update_hash(byte); + return hash_; + } - uint32_t get_hash() const; + uint32_t get_hash() const { + return hash_; + } private: - void update_hash(uint8_t byte); + void update_hash(uint8_t byte) { + hash_ -= a_to_k_minus_1_ * (chars_.front() + hash_detail::SEED); + chars_.pop_front(); + chars_.push_back(byte); + hash_ = (hash_ * a_) + byte + hash_detail::SEED; + } uint32_t a_; uint32_t a_to_k_minus_1_; @@ -38,10 +53,45 @@ namespace base { void reset(); // Returns a break point relative to the last reset/break. - boost::optional step(uint8_t byte); + boost::optional step(uint8_t byte) { + boost::optional r; + + rhash_.step(byte); + len_++; + + if (len_ < min_len_) + return r; + + if (hit_break(backup_div_)) + backup_break_ = len_; + + if (hit_break(div_)) { + // found a break + r = len_; + len_ = 0; + backup_break_.reset(); + + } else if (len_ >= max_len_) { + // too big, is there a backup? + if (backup_break_) { + len_ -= *backup_break_; + r = backup_break_; + backup_break_.reset(); + + } else { + r = len_; + len_ = 0; + } + } + + return r; + } private: - bool hit_break(uint32_t div) const; + bool hit_break(uint32_t mask) const { + uint32_t h = rhash_.get_hash() >> 8; + return !(h & mask); + } rolling_hash rhash_; From 3b9681232873029fdd983874126ba093fb611093 Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Fri, 4 Sep 2015 11:28:33 +0100 Subject: [PATCH 20/27] [thin_show_dups] switch to boost::circular_buffer in the rolling_hash --- base/rolling_hash.cc | 7 ++++--- base/rolling_hash.h | 12 +++++------- 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/base/rolling_hash.cc b/base/rolling_hash.cc index 8de7ac3..c780a27 100644 --- a/base/rolling_hash.cc +++ b/base/rolling_hash.cc @@ -10,7 +10,8 @@ using namespace std; rolling_hash::rolling_hash(unsigned window_size) : a_(MULTIPLIER), a_to_k_minus_1_(a_), - window_size_(window_size) { + window_size_(window_size), + buffer_(window_size) { for (unsigned i = 1; i < window_size_ - 1; i++) a_to_k_minus_1_ *= a_; @@ -22,12 +23,12 @@ void rolling_hash::reset() { // prime with zeroes - chars_.clear(); + buffer_.clear(); hash_ = 0; for (unsigned i = 0; i < window_size_; i++) { hash_ = (hash_ * a_) + SEED; - chars_.push_back(0); + buffer_.push_back(0); } } diff --git a/base/rolling_hash.h b/base/rolling_hash.h index c5fa44c..dff3145 100644 --- a/base/rolling_hash.h +++ b/base/rolling_hash.h @@ -1,7 +1,7 @@ #ifndef BASE_ROLLING_HASH_H #define BASE_ROLLING_HASH_H -#include +#include #include #include @@ -31,20 +31,18 @@ namespace base { private: void update_hash(uint8_t byte) { - hash_ -= a_to_k_minus_1_ * (chars_.front() + hash_detail::SEED); - chars_.pop_front(); - chars_.push_back(byte); + hash_ -= a_to_k_minus_1_ * (buffer_.front() + hash_detail::SEED); + buffer_.push_back(byte); hash_ = (hash_ * a_) + byte + hash_detail::SEED; } uint32_t a_; uint32_t a_to_k_minus_1_; - // FIXME: use a ring buffer - std::list chars_; - uint32_t hash_; uint32_t window_size_; + + boost::circular_buffer buffer_; }; class content_based_hash { From 216e5acb6c32a18da1efa1b7e5f87ed0710426b9 Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Fri, 4 Sep 2015 13:48:02 +0100 Subject: [PATCH 21/27] [thin_show_dups] remove variable number of mems per chunks. Too slow and not used. --- base/rolling_hash.cc | 2 +- thin-provisioning/cache_stream.cc | 4 ++-- thin-provisioning/chunk_stream.cc | 14 -------------- thin-provisioning/chunk_stream.h | 9 ++++++--- thin-provisioning/thin_show_duplicates.cc | 11 ++++------- thin-provisioning/variable_chunk_stream.cc | 18 +++++++++--------- 6 files changed, 22 insertions(+), 36 deletions(-) diff --git a/base/rolling_hash.cc b/base/rolling_hash.cc index c780a27..d2d273a 100644 --- a/base/rolling_hash.cc +++ b/base/rolling_hash.cc @@ -40,7 +40,7 @@ content_based_hash::content_based_hash(unsigned window_size) // FIXME: hard coded values backup_div_((window_size / 4) - 1), div_((window_size / 2) - 1), - min_len_(window_size / 8), + min_len_(window_size / 4), max_len_(window_size), len_(0) { diff --git a/thin-provisioning/cache_stream.cc b/thin-provisioning/cache_stream.cc index 8fbcc72..b21f435 100644 --- a/thin-provisioning/cache_stream.cc +++ b/thin-provisioning/cache_stream.cc @@ -93,8 +93,8 @@ cache_stream::chunk_wrapper::chunk_wrapper(cache_stream &parent) { c_.offset_ = parent.current_index_ * parent.block_size_; c_.len_ = parent.block_size_; - c_.mem_.push_back(mem(static_cast(block_.get_data()), - static_cast(block_.get_data()) + parent.block_size_)); + c_.mem_.begin = static_cast(block_.get_data()); + c_.mem_.end = c_.mem_.begin + parent.block_size_; } //---------------------------------------------------------------- diff --git a/thin-provisioning/chunk_stream.cc b/thin-provisioning/chunk_stream.cc index 4ac99ff..adc41d0 100644 --- a/thin-provisioning/chunk_stream.cc +++ b/thin-provisioning/chunk_stream.cc @@ -5,19 +5,5 @@ using namespace thin_provisioning; //---------------------------------------------------------------- -uint8_t -chunk::operator[](uint64_t n) const -{ - std::deque::const_iterator it; - for (it = mem_.begin(); it != mem_.end(); it++) { - uint64_t mem_len = it->end - it->begin; - if (n > mem_len) - n -= mem_len; - else - return it->begin[n]; - } - - throw runtime_error("chunk out of bounds"); -} //---------------------------------------------------------------- diff --git a/thin-provisioning/chunk_stream.h b/thin-provisioning/chunk_stream.h index 4e1ea96..0886c9f 100644 --- a/thin-provisioning/chunk_stream.h +++ b/thin-provisioning/chunk_stream.h @@ -28,6 +28,11 @@ namespace thin_provisioning { struct mem { + mem() + : begin(0), + end(0) { + } + mem(uint8_t *b, uint8_t *e) : begin(b), end(e) { @@ -38,9 +43,7 @@ namespace thin_provisioning { struct chunk { uint64_t offset_, len_; - std::deque mem_; - - uint8_t operator[](uint64_t n) const; + mem mem_; }; class chunk_stream { diff --git a/thin-provisioning/thin_show_duplicates.cc b/thin-provisioning/thin_show_duplicates.cc index 9fc96c2..5a75e26 100644 --- a/thin-provisioning/thin_show_duplicates.cc +++ b/thin-provisioning/thin_show_duplicates.cc @@ -142,8 +142,7 @@ namespace { else { digestor_.reset(); - for (deque::const_iterator it = c.mem_.begin(); it != c.mem_.end(); it++) - digestor_.process_bytes(it->begin, it->end - it->begin); + digestor_.process_bytes(c.mem_.begin, c.mem_.end - c.mem_.begin); unsigned int digest[5]; digestor_.get_digest(digest); @@ -167,11 +166,9 @@ namespace { private: bool all_zeroes(chunk const &c) const { - for (deque::const_iterator it = c.mem_.begin(); it != c.mem_.end(); it++) { - for (uint8_t *ptr = it->begin; ptr != it->end; ptr++) { - if (*ptr != 0) - return false; - } + for (uint8_t *ptr = c.mem_.begin; ptr != c.mem_.end; ptr++) { + if (*ptr != 0) + return false; } return true; diff --git a/thin-provisioning/variable_chunk_stream.cc b/thin-provisioning/variable_chunk_stream.cc index 41c6c96..d2f1529 100644 --- a/thin-provisioning/variable_chunk_stream.cc +++ b/thin-provisioning/variable_chunk_stream.cc @@ -59,8 +59,8 @@ variable_chunk_stream::get() little_chunk_.len_ = little_e_ - little_b_; little_chunk_.offset_ = big_chunk_->offset_ + little_chunk_.len_; - little_chunk_.mem_.clear(); - little_chunk_.mem_.push_back(mem(little_b_, little_e_)); + little_chunk_.mem_.begin = little_b_; + little_chunk_.mem_.end = little_e_; return little_chunk_; } @@ -80,7 +80,7 @@ variable_chunk_stream::next_big_chunk() return false; big_chunk_ = &stream_.get(); - little_b_ = little_e_ = last_hashed_ = big_chunk_->mem_.front().begin; + little_b_ = little_e_ = last_hashed_ = big_chunk_->mem_.begin; h_.reset(); return true; @@ -93,19 +93,19 @@ variable_chunk_stream::advance_one() assert(big_chunk_); - big_e = big_chunk_->mem_.front().end; + big_e = big_chunk_->mem_.end; little_b_ = little_e_; little_e_ = last_hashed_; if (little_b_ == big_e) { if (next_big_chunk()) - big_e = big_chunk_->mem_.front().end; + big_e = big_chunk_->mem_.end; else return false; } - assert(little_e_ >= big_chunk_->mem_.front().begin); - assert(little_b_ >= big_chunk_->mem_.front().begin); + assert(little_e_ >= big_chunk_->mem_.begin); + assert(little_b_ >= big_chunk_->mem_.begin); assert(little_e_ <= big_e); assert(little_b_ <= big_e); @@ -126,8 +126,8 @@ variable_chunk_stream::advance_one() if (little_e_ == big_e) last_hashed_ = little_e_; - assert(little_e_ >= big_chunk_->mem_.front().begin); - assert(little_b_ >= big_chunk_->mem_.front().begin); + assert(little_e_ >= big_chunk_->mem_.begin); + assert(little_b_ >= big_chunk_->mem_.begin); assert(little_e_ <= big_e); assert(little_b_ <= big_e); From 41a1b85c2700ac9d88a2158047e1835b5c0214f9 Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Fri, 4 Sep 2015 13:56:38 +0100 Subject: [PATCH 22/27] [thin_show_dups] take out some old assertions --- thin-provisioning/variable_chunk_stream.cc | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/thin-provisioning/variable_chunk_stream.cc b/thin-provisioning/variable_chunk_stream.cc index d2f1529..99ddc61 100644 --- a/thin-provisioning/variable_chunk_stream.cc +++ b/thin-provisioning/variable_chunk_stream.cc @@ -91,8 +91,6 @@ variable_chunk_stream::advance_one() { uint8_t *big_e; - assert(big_chunk_); - big_e = big_chunk_->mem_.end; little_b_ = little_e_; little_e_ = last_hashed_; @@ -104,12 +102,6 @@ variable_chunk_stream::advance_one() return false; } - assert(little_e_ >= big_chunk_->mem_.begin); - assert(little_b_ >= big_chunk_->mem_.begin); - assert(little_e_ <= big_e); - assert(little_b_ <= big_e); - - while (little_e_ != big_e) { optional maybe_break = h_.step(*little_e_); little_e_++; @@ -126,11 +118,6 @@ variable_chunk_stream::advance_one() if (little_e_ == big_e) last_hashed_ = little_e_; - assert(little_e_ >= big_chunk_->mem_.begin); - assert(little_b_ >= big_chunk_->mem_.begin); - assert(little_e_ <= big_e); - assert(little_b_ <= big_e); - return true; } From 251762e6d94e903f2a82f3415b997cca249d57d2 Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Fri, 4 Sep 2015 15:16:49 +0100 Subject: [PATCH 23/27] [thin_show_dups] tidy up reporting --- thin-provisioning/cache_stream.cc | 10 +--- thin-provisioning/cache_stream.h | 3 +- thin-provisioning/chunk_stream.h | 2 +- thin-provisioning/pool_stream.cc | 31 +++++------ thin-provisioning/pool_stream.h | 5 +- thin-provisioning/thin_show_duplicates.cc | 64 +++++++++++----------- thin-provisioning/variable_chunk_stream.cc | 12 ++-- thin-provisioning/variable_chunk_stream.h | 3 +- 8 files changed, 58 insertions(+), 72 deletions(-) diff --git a/thin-provisioning/cache_stream.cc b/thin-provisioning/cache_stream.cc index b21f435..379c191 100644 --- a/thin-provisioning/cache_stream.cc +++ b/thin-provisioning/cache_stream.cc @@ -37,9 +37,9 @@ cache_stream::cache_stream(string const &path, } block_address -cache_stream::nr_chunks() const +cache_stream::size() const { - return nr_blocks_; + return nr_blocks_ * block_size_; } void @@ -68,12 +68,6 @@ cache_stream::eof() const return current_index_ >= nr_blocks_; } -block_address -cache_stream::index() const -{ - return current_index_; -} - chunk const & cache_stream::get() { diff --git a/thin-provisioning/cache_stream.h b/thin-provisioning/cache_stream.h index 65c81b1..b7af995 100644 --- a/thin-provisioning/cache_stream.h +++ b/thin-provisioning/cache_stream.h @@ -14,10 +14,9 @@ namespace thin_provisioning { block_address block_size, size_t cache_mem); - block_address nr_chunks() const; + block_address size() const; virtual void rewind(); - virtual block_address index() const; virtual bool next(block_address count = 1ull); virtual bool eof() const; diff --git a/thin-provisioning/chunk_stream.h b/thin-provisioning/chunk_stream.h index 0886c9f..1831f27 100644 --- a/thin-provisioning/chunk_stream.h +++ b/thin-provisioning/chunk_stream.h @@ -51,7 +51,7 @@ namespace thin_provisioning { virtual ~chunk_stream() {} virtual void rewind() = 0; - virtual bcache::block_address index() const = 0; + virtual bcache::block_address size() const = 0; virtual bool next(bcache::block_address count = 1ull) = 0; virtual bool eof() const = 0; diff --git a/thin-provisioning/pool_stream.cc b/thin-provisioning/pool_stream.cc index 21964f9..41a0ab0 100644 --- a/thin-provisioning/pool_stream.cc +++ b/thin-provisioning/pool_stream.cc @@ -41,22 +41,25 @@ pool_stream::pool_stream(cache_stream &stream, transaction_manager::ptr tm, superblock_detail::superblock const &sb, block_address nr_blocks) : stream_(stream), - block_to_thin_(stream.nr_chunks(), UNMAPPED), - nr_mapped_(0) + block_to_thin_(nr_blocks, UNMAPPED), + nr_mapped_(0), + index_(0), + block_size_(sb.data_block_size_ * 512) { init_rmap(tm, sb, nr_blocks); } block_address -pool_stream::nr_chunks() const +pool_stream::size() const { - return nr_mapped_; + return nr_mapped_ * block_size_; } void pool_stream::rewind() { stream_.rewind(); + index_ = 0; } bool @@ -75,12 +78,6 @@ pool_stream::eof() const return stream_.eof(); } -block_address -pool_stream::index() const -{ - return stream_.index(); -} - chunk const & pool_stream::get() { @@ -141,16 +138,14 @@ pool_stream::init_rmap(transaction_manager::ptr tm, bool pool_stream::advance_one() { - block_address new_index = index() + 1; + block_address count = 1; - while (block_to_thin_[new_index] == UNMAPPED && - new_index < nr_chunks()) - new_index++; + while (((index_ + count) < block_to_thin_.size()) && + (block_to_thin_[index_ + count] == UNMAPPED)) + count++; - if (new_index >= nr_chunks()) - return false; - - return stream_.next(new_index - index()); + index_ += count; + return stream_.next(count); } //---------------------------------------------------------------- diff --git a/thin-provisioning/pool_stream.h b/thin-provisioning/pool_stream.h index 71576ed..e419842 100644 --- a/thin-provisioning/pool_stream.h +++ b/thin-provisioning/pool_stream.h @@ -32,11 +32,10 @@ namespace thin_provisioning { transaction_manager::ptr tm, superblock_detail::superblock const &sb, block_address nr_blocks); - block_address nr_chunks() const; + block_address size() const; void rewind(); bool next(block_address count = 1ull); bool eof() const; - block_address index() const; chunk const &get(); void put(chunk const &c); @@ -56,6 +55,8 @@ namespace thin_provisioning { cache_stream &stream_; vector block_to_thin_; block_address nr_mapped_; + block_address index_; + block_address block_size_; }; } diff --git a/thin-provisioning/thin_show_duplicates.cc b/thin-provisioning/thin_show_duplicates.cc index 5a75e26..5c09af2 100644 --- a/thin-provisioning/thin_show_duplicates.cc +++ b/thin-provisioning/thin_show_duplicates.cc @@ -182,6 +182,34 @@ namespace { duplicate_counter results_; }; + void display_results(chunk_stream const &stream, duplicate_counter const &r) { + block_address meg = 1024 * 1024; + cout << "\n\n" + << stream.size() / meg << "m examined, " + << r.get_non_zeroes() / meg << "m duplicates, " + << r.get_zeroes() / meg << "m zeroes\n"; + } + + void scan(chunk_stream &stream, block_address stream_size) { + duplicate_detector detector; + block_address total_seen(0); + auto_ptr pbar = create_progress_bar("Examining data"); + + do { + // FIXME: use a wrapper class to automate the put() + chunk const &c = stream.get(); + detector.examine(c); + stream.put(c); + + total_seen += c.len_; + pbar->update_percent((total_seen * 100) / stream.size()); + + } while (stream.next()); + + pbar->update_percent(100); + display_results(stream, detector.get_results()); + } + int show_dups_pool(flags const &fs) { block_manager<>::ptr bm = open_bm(*fs.metadata_dev); transaction_manager::ptr tm = open_tm(bm); @@ -195,21 +223,9 @@ namespace { cache_stream stream(fs.data_dev, block_size, fs.cache_mem); pool_stream pstream(stream, tm, sb, nr_blocks); + variable_chunk_stream vstream(pstream, 4096); - duplicate_detector detector; - auto_ptr pbar = create_progress_bar("Examining data"); - - do { - chunk const &c = pstream.get(); - detector.examine(c); - pstream.put(c); - pbar->update_percent((pstream.index() * 100) / pstream.nr_chunks()); - - } while (pstream.next()); - pbar->update_percent(100); - - cout << "\n\ntotal dups: " << detector.get_results().get_total() << endl; - cout << (detector.get_results().get_total() * 100) / pstream.nr_chunks() << "% duplicates\n"; + scan(vstream, nr_blocks * block_size); return 0; } @@ -229,26 +245,8 @@ namespace { cache_stream low_level_stream(fs.data_dev, block_size, fs.cache_mem); variable_chunk_stream stream(low_level_stream, 4096); - duplicate_detector detector; - auto_ptr pbar = create_progress_bar("Examining data"); - do { - // FIXME: use a wrapper class to automate the put() - chunk const &c = stream.get(); - detector.examine(c); - stream.put(c); - - pbar->update_percent((c.offset_ * 100) / dev_size); - - } while (stream.next()); - pbar->update_percent(100); - - duplicate_counter r = detector.get_results(); - block_address meg = 1024 * 1024; - cout << "\n\n" - << (nr_blocks * block_size) / meg << "m examined, " - << r.get_non_zeroes() / meg << "m duplicates, " - << r.get_zeroes() / meg << "m zeroes\n"; + scan(stream, dev_size); return 0; } diff --git a/thin-provisioning/variable_chunk_stream.cc b/thin-provisioning/variable_chunk_stream.cc index 99ddc61..f572db7 100644 --- a/thin-provisioning/variable_chunk_stream.cc +++ b/thin-provisioning/variable_chunk_stream.cc @@ -19,6 +19,12 @@ variable_chunk_stream::~variable_chunk_stream() put_big_chunk(); } +bcache::block_address +variable_chunk_stream::size() const +{ + return stream_.size(); +} + void variable_chunk_stream::rewind() { @@ -45,12 +51,6 @@ variable_chunk_stream::eof() const return stream_.eof(); } -bcache::block_address -variable_chunk_stream::index() const -{ - return index_; -} - chunk const & variable_chunk_stream::get() { diff --git a/thin-provisioning/variable_chunk_stream.h b/thin-provisioning/variable_chunk_stream.h index f9c5ec7..cc62945 100644 --- a/thin-provisioning/variable_chunk_stream.h +++ b/thin-provisioning/variable_chunk_stream.h @@ -13,11 +13,10 @@ namespace thin_provisioning { variable_chunk_stream(chunk_stream &stream, unsigned window_size); ~variable_chunk_stream(); - // FIXME: we don't know in advance how many chunks we will have + virtual bcache::block_address size() const; virtual void rewind(); virtual bool next(bcache::block_address count = 1ull); virtual bool eof() const; - virtual bcache::block_address index() const; virtual chunk const &get(); virtual void put(chunk const &c); From cb56b474000edd45403bf09662a74254027e73be Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Fri, 4 Sep 2015 15:27:48 +0100 Subject: [PATCH 24/27] [thin-show-dups] add --content-based-chunks --- thin-provisioning/thin_show_duplicates.cc | 38 +++++++++++++++-------- 1 file changed, 25 insertions(+), 13 deletions(-) diff --git a/thin-provisioning/thin_show_duplicates.cc b/thin-provisioning/thin_show_duplicates.cc index 5c09af2..fbcdd10 100644 --- a/thin-provisioning/thin_show_duplicates.cc +++ b/thin-provisioning/thin_show_duplicates.cc @@ -90,13 +90,15 @@ namespace { struct flags { flags() - : cache_mem(64 * 1024 * 1024) { + : cache_mem(64 * 1024 * 1024), + content_based_chunks(false) { } string data_dev; optional metadata_dev; optional block_size; unsigned cache_mem; + bool content_based_chunks; }; using namespace mapping_tree_detail; @@ -190,7 +192,7 @@ namespace { << r.get_zeroes() / meg << "m zeroes\n"; } - void scan(chunk_stream &stream, block_address stream_size) { + void scan(chunk_stream &stream) { duplicate_detector detector; block_address total_seen(0); auto_ptr pbar = create_progress_bar("Examining data"); @@ -210,6 +212,11 @@ namespace { display_results(stream, detector.get_results()); } + void scan_with_variable_sized_chunks(chunk_stream &stream) { + variable_chunk_stream vstream(stream, 4096); + scan(vstream); + } + int show_dups_pool(flags const &fs) { block_manager<>::ptr bm = open_bm(*fs.metadata_dev); transaction_manager::ptr tm = open_tm(bm); @@ -217,15 +224,13 @@ namespace { block_address block_size = sb.data_block_size_ * 512; block_address nr_blocks = get_nr_blocks(fs.data_dev, block_size); - cerr << "path = " << fs.data_dev << "\n"; - cerr << "block size = " << block_size << "\n"; - cerr << "nr_blocks = " << nr_blocks << "\n"; - cache_stream stream(fs.data_dev, block_size, fs.cache_mem); pool_stream pstream(stream, tm, sb, nr_blocks); - variable_chunk_stream vstream(pstream, 4096); - scan(vstream, nr_blocks * block_size); + if (fs.content_based_chunks) + scan_with_variable_sized_chunks(pstream); + else + scan(pstream); return 0; } @@ -237,16 +242,17 @@ namespace { block_address block_size = *fs.block_size; block_address nr_blocks = get_nr_blocks(fs.data_dev, *fs.block_size); - block_address dev_size = nr_blocks * *fs.block_size; cerr << "path = " << fs.data_dev << "\n"; cerr << "nr_blocks = " << nr_blocks << "\n"; cerr << "block size = " << block_size << "\n"; - cache_stream low_level_stream(fs.data_dev, block_size, fs.cache_mem); - variable_chunk_stream stream(low_level_stream, 4096); + cache_stream stream(fs.data_dev, block_size, fs.cache_mem); - scan(stream, dev_size); + if (fs.content_based_chunks) + scan_with_variable_sized_chunks(stream); + else + scan(stream); return 0; } @@ -264,6 +270,7 @@ namespace { out << "Usage: " << cmd << " [options] {device|file}\n" << "Options:\n" << " {--block-sectors} \n" + << " {--content-based-chunks}\n" << " {--metadata-dev} \n" << " {-h|--help}\n" << " {-V|--version}" << endl; @@ -278,7 +285,8 @@ int thin_show_dups_main(int argc, char **argv) char const shortopts[] = "qhV"; option const longopts[] = { { "block-sectors", required_argument, NULL, 1}, - { "metadata-dev", required_argument, NULL, 2}, + { "content-based-chunks", no_argument, NULL, 2}, + { "metadata-dev", required_argument, NULL, 3}, { "help", no_argument, NULL, 'h'}, { "version", no_argument, NULL, 'V'}, { NULL, no_argument, NULL, 0 } @@ -299,6 +307,10 @@ int thin_show_dups_main(int argc, char **argv) break; case 2: + fs.content_based_chunks = true; + break; + + case 3: fs.metadata_dev = optarg; break; From b6e3a12297943616043bfcfb86de13b36941e7f4 Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Mon, 7 Sep 2015 15:40:35 +0100 Subject: [PATCH 25/27] [thin_show_dups] move scan into the duplicate_detector --- thin-provisioning/thin_show_duplicates.cc | 82 ++++++++++++----------- 1 file changed, 42 insertions(+), 40 deletions(-) diff --git a/thin-provisioning/thin_show_duplicates.cc b/thin-provisioning/thin_show_duplicates.cc index fbcdd10..5b6b650 100644 --- a/thin-provisioning/thin_show_duplicates.cc +++ b/thin-provisioning/thin_show_duplicates.cc @@ -130,6 +130,14 @@ namespace { return zero_dups_; } + void display_results(chunk_stream const &stream) const { + block_address meg = 1024 * 1024; + cout << "\n\n" + << stream.size() / meg << "m examined, " + << get_non_zeroes() / meg << "m duplicates, " + << get_zeroes() / meg << "m zeroes\n"; + } + private: block_address non_zero_dups_; block_address zero_dups_; @@ -137,13 +145,36 @@ namespace { class duplicate_detector { public: + void scan(chunk_stream &stream) { + block_address total_seen(0); + auto_ptr pbar = create_progress_bar("Examining data"); + + do { + // FIXME: use a wrapper class to automate the put() + chunk const &c = stream.get(); + examine(c); + stream.put(c); + + total_seen += c.len_; + pbar->update_percent((total_seen * 100) / stream.size()); + + } while (stream.next()); + + pbar->update_percent(100); + results_.display_results(stream); + } + + duplicate_counter const &get_results() const { + return results_; + } + + private: void examine(chunk const &c) { if (all_zeroes(c)) results_.add_zero_duplicate(c.len_); else { digestor_.reset(); - digestor_.process_bytes(c.mem_.begin, c.mem_.end - c.mem_.begin); unsigned int digest[5]; @@ -162,11 +193,6 @@ namespace { } } - duplicate_counter const &get_results() const { - return results_; - } - - private: bool all_zeroes(chunk const &c) const { for (uint8_t *ptr = c.mem_.begin; ptr != c.mem_.end; ptr++) { if (*ptr != 0) @@ -184,37 +210,10 @@ namespace { duplicate_counter results_; }; - void display_results(chunk_stream const &stream, duplicate_counter const &r) { - block_address meg = 1024 * 1024; - cout << "\n\n" - << stream.size() / meg << "m examined, " - << r.get_non_zeroes() / meg << "m duplicates, " - << r.get_zeroes() / meg << "m zeroes\n"; - } - - void scan(chunk_stream &stream) { - duplicate_detector detector; - block_address total_seen(0); - auto_ptr pbar = create_progress_bar("Examining data"); - - do { - // FIXME: use a wrapper class to automate the put() - chunk const &c = stream.get(); - detector.examine(c); - stream.put(c); - - total_seen += c.len_; - pbar->update_percent((total_seen * 100) / stream.size()); - - } while (stream.next()); - - pbar->update_percent(100); - display_results(stream, detector.get_results()); - } - - void scan_with_variable_sized_chunks(chunk_stream &stream) { + void scan_with_variable_sized_chunks(chunk_stream &stream, + duplicate_detector &detector) { variable_chunk_stream vstream(stream, 4096); - scan(vstream); + detector.scan(vstream); } int show_dups_pool(flags const &fs) { @@ -227,10 +226,12 @@ namespace { cache_stream stream(fs.data_dev, block_size, fs.cache_mem); pool_stream pstream(stream, tm, sb, nr_blocks); + duplicate_detector detector; + if (fs.content_based_chunks) - scan_with_variable_sized_chunks(pstream); + scan_with_variable_sized_chunks(pstream, detector); else - scan(pstream); + detector.scan(pstream); return 0; } @@ -248,11 +249,12 @@ namespace { cerr << "block size = " << block_size << "\n"; cache_stream stream(fs.data_dev, block_size, fs.cache_mem); + duplicate_detector dd; if (fs.content_based_chunks) - scan_with_variable_sized_chunks(stream); + scan_with_variable_sized_chunks(stream, dd); else - scan(stream); + dd.scan(stream); return 0; } From c58c15e7883e755d756702742fc2146670a5adf8 Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Tue, 8 Sep 2015 13:17:52 +0100 Subject: [PATCH 26/27] [thin_show_dups] move scan_with_variable_sized_chunks() into the dup detector --- thin-provisioning/thin_show_duplicates.cc | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/thin-provisioning/thin_show_duplicates.cc b/thin-provisioning/thin_show_duplicates.cc index 5b6b650..3352760 100644 --- a/thin-provisioning/thin_show_duplicates.cc +++ b/thin-provisioning/thin_show_duplicates.cc @@ -164,6 +164,12 @@ namespace { results_.display_results(stream); } + + void scan_with_variable_sized_chunks(chunk_stream &stream) { + variable_chunk_stream vstream(stream, 4096); + scan(vstream); + } + duplicate_counter const &get_results() const { return results_; } @@ -210,12 +216,6 @@ namespace { duplicate_counter results_; }; - void scan_with_variable_sized_chunks(chunk_stream &stream, - duplicate_detector &detector) { - variable_chunk_stream vstream(stream, 4096); - detector.scan(vstream); - } - int show_dups_pool(flags const &fs) { block_manager<>::ptr bm = open_bm(*fs.metadata_dev); transaction_manager::ptr tm = open_tm(bm); @@ -229,7 +229,7 @@ namespace { duplicate_detector detector; if (fs.content_based_chunks) - scan_with_variable_sized_chunks(pstream, detector); + detector.scan_with_variable_sized_chunks(pstream); else detector.scan(pstream); @@ -252,7 +252,7 @@ namespace { duplicate_detector dd; if (fs.content_based_chunks) - scan_with_variable_sized_chunks(stream, dd); + dd.scan_with_variable_sized_chunks(stream); else dd.scan(stream); From 664841ad03506ba1da2b44d9e23623e09f3f3285 Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Tue, 8 Sep 2015 17:09:41 +0100 Subject: [PATCH 27/27] [thin_show_dups] Support fractions of a pool block size --- Makefile.in | 1 + thin-provisioning/fixed_chunk_stream.cc | 113 ++++++++++++++++++++++ thin-provisioning/fixed_chunk_stream.h | 39 ++++++++ thin-provisioning/thin_show_duplicates.cc | 42 +++++--- 4 files changed, 180 insertions(+), 15 deletions(-) create mode 100644 thin-provisioning/fixed_chunk_stream.cc create mode 100644 thin-provisioning/fixed_chunk_stream.h diff --git a/Makefile.in b/Makefile.in index d09c873..89d45fd 100644 --- a/Makefile.in +++ b/Makefile.in @@ -75,6 +75,7 @@ SOURCE=\ thin-provisioning/cache_stream.cc \ thin-provisioning/chunk_stream.cc \ thin-provisioning/device_tree.cc \ + thin-provisioning/fixed_chunk_stream.cc \ thin-provisioning/human_readable_format.cc \ thin-provisioning/mapping_tree.cc \ thin-provisioning/metadata.cc \ diff --git a/thin-provisioning/fixed_chunk_stream.cc b/thin-provisioning/fixed_chunk_stream.cc new file mode 100644 index 0000000..ea031dd --- /dev/null +++ b/thin-provisioning/fixed_chunk_stream.cc @@ -0,0 +1,113 @@ +#include "thin-provisioning/fixed_chunk_stream.h" + +using namespace thin_provisioning; + +//---------------------------------------------------------------- + +fixed_chunk_stream::fixed_chunk_stream(chunk_stream &stream, unsigned chunk_size) + : index_(0), + stream_(stream), + chunk_size_(chunk_size), + big_chunk_(0) { + next_big_chunk(); +} + +fixed_chunk_stream::~fixed_chunk_stream() +{ + put_big_chunk(); +} + +bcache::block_address +fixed_chunk_stream::size() const +{ + return stream_.size(); +} + +void +fixed_chunk_stream::rewind() +{ + // FIXME: not complete + index_ = 0; + stream_.rewind(); +} + +bool +fixed_chunk_stream::next(bcache::block_address count) +{ + while (count--) { + index_++; + advance_one(); + } + + return !eof(); +} + +bool +fixed_chunk_stream::eof() const +{ + return stream_.eof(); +} + +chunk const & +fixed_chunk_stream::get() +{ + assert(big_chunk_); + + little_chunk_.len_ = little_e_ - little_b_; + little_chunk_.offset_ = big_chunk_->offset_ + little_chunk_.len_; + + little_chunk_.mem_.begin = little_b_; + little_chunk_.mem_.end = little_e_; + + return little_chunk_; +} + +void +fixed_chunk_stream::put(chunk const &c) +{ + // noop +} + +bool +fixed_chunk_stream::next_big_chunk() +{ + put_big_chunk(); + + if (!stream_.next()) + return false; + + big_chunk_ = &stream_.get(); + little_b_ = little_e_ = last_hashed_ = big_chunk_->mem_.begin; + + return true; +} + +bool +fixed_chunk_stream::advance_one() +{ + uint8_t *big_e; + + big_e = big_chunk_->mem_.end; + little_b_ = little_e_; + + if (little_b_ >= big_e) { + if (next_big_chunk()) + big_e = big_chunk_->mem_.end; + else + return false; + } + + little_e_ += chunk_size_; + return true; +} + +void +fixed_chunk_stream::put_big_chunk() +{ + if (big_chunk_) + stream_.put(*big_chunk_); + + big_chunk_ = 0; +} + +//---------------------------------------------------------------- diff --git a/thin-provisioning/fixed_chunk_stream.h b/thin-provisioning/fixed_chunk_stream.h new file mode 100644 index 0000000..f17d15a --- /dev/null +++ b/thin-provisioning/fixed_chunk_stream.h @@ -0,0 +1,39 @@ +#ifndef THIN_PROVISIONING_FIXED_CHUNK_STREAM_H +#define THIN_PROVISIONING_FIXED_CHUNK_STREAM_H + +#include "thin-provisioning/chunk_stream.h" + +//---------------------------------------------------------------- + +namespace thin_provisioning { + class fixed_chunk_stream : public chunk_stream { + public: + fixed_chunk_stream(chunk_stream &stream, unsigned chunk_size); + ~fixed_chunk_stream(); + + virtual bcache::block_address size() const; + virtual void rewind(); + virtual bool next(bcache::block_address count = 1ull); + virtual bool eof() const; + virtual chunk const &get(); + virtual void put(chunk const &c); + + private: + bool next_big_chunk(); + bool advance_one(); + void put_big_chunk(); + + bcache::block_address index_; + + chunk_stream &stream_; + unsigned chunk_size_; + chunk const *big_chunk_; + + uint8_t *little_b_, *little_e_, *last_hashed_; + chunk little_chunk_; + }; +} + +//---------------------------------------------------------------- + +#endif diff --git a/thin-provisioning/thin_show_duplicates.cc b/thin-provisioning/thin_show_duplicates.cc index 3352760..89c0c9d 100644 --- a/thin-provisioning/thin_show_duplicates.cc +++ b/thin-provisioning/thin_show_duplicates.cc @@ -30,6 +30,7 @@ #include "persistent-data/space-maps/core.h" #include "persistent-data/space-maps/disk.h" #include "thin-provisioning/cache_stream.h" +#include "thin-provisioning/fixed_chunk_stream.h" #include "thin-provisioning/pool_stream.h" #include "thin-provisioning/commands.h" #include "thin-provisioning/device_tree.h" @@ -54,7 +55,6 @@ using namespace thin_provisioning; namespace { bool factor_of(block_address f, block_address n) { - cerr << n << " % " << f << "\n"; return (n % f) == 0; } @@ -145,6 +145,21 @@ namespace { class duplicate_detector { public: + void scan_with_variable_sized_chunks(chunk_stream &stream) { + variable_chunk_stream vstream(stream, 4096); + scan(vstream); + } + + void scan_with_fixed_sized_chunks(chunk_stream &stream, block_address chunk_size) { + fixed_chunk_stream fstream(stream, chunk_size); + scan(fstream); + } + + duplicate_counter const &get_results() const { + return results_; + } + + private: void scan(chunk_stream &stream) { block_address total_seen(0); auto_ptr pbar = create_progress_bar("Examining data"); @@ -164,17 +179,6 @@ namespace { results_.display_results(stream); } - - void scan_with_variable_sized_chunks(chunk_stream &stream) { - variable_chunk_stream vstream(stream, 4096); - scan(vstream); - } - - duplicate_counter const &get_results() const { - return results_; - } - - private: void examine(chunk const &c) { if (all_zeroes(c)) results_.add_zero_duplicate(c.len_); @@ -230,8 +234,16 @@ namespace { if (fs.content_based_chunks) detector.scan_with_variable_sized_chunks(pstream); - else - detector.scan(pstream); + else { + if (*fs.block_size) { + if (factor_of(*fs.block_size, block_size)) + block_size = *fs.block_size; + else + throw runtime_error("specified block size is not a factor of the pool chunk size\n"); + } + + detector.scan_with_fixed_sized_chunks(pstream, block_size); + } return 0; } @@ -254,7 +266,7 @@ namespace { if (fs.content_based_chunks) dd.scan_with_variable_sized_chunks(stream); else - dd.scan(stream); + dd.scan_with_fixed_sized_chunks(stream, block_size); return 0; }