diff --git a/Makefile.in b/Makefile.in index 96a275d..19453cc 100644 --- a/Makefile.in +++ b/Makefile.in @@ -118,6 +118,8 @@ SOURCE=\ thin-provisioning/xml_format.cc DEVTOOLS_SOURCE=\ + thin-provisioning/thin_journal.cc \ + thin-provisioning/thin_journal_check.cc \ thin-provisioning/thin_ll_dump.cc \ thin-provisioning/thin_ll_restore.cc \ thin-provisioning/thin_show_duplicates.cc \ diff --git a/thin-provisioning/commands.h b/thin-provisioning/commands.h index 54e5a88..6b80bdb 100644 --- a/thin-provisioning/commands.h +++ b/thin-provisioning/commands.h @@ -116,6 +116,13 @@ namespace thin_provisioning { virtual void usage(std::ostream &out) const; virtual int run(int argc, char **argv); }; + + class thin_journal_cmd : public base::command { + public: + thin_journal_cmd(); + virtual void usage(std::ostream &out) const; + virtual int run(int argc, char **argv); + }; #endif void register_thin_commands(base::application &app); diff --git a/thin-provisioning/thin_journal.cc b/thin-provisioning/thin_journal.cc new file mode 100644 index 0000000..b91fad8 --- /dev/null +++ b/thin-provisioning/thin_journal.cc @@ -0,0 +1,329 @@ +// Copyright (C) 2018 Red Hat, Inc. All rights reserved. +// +// This file is part of the thin-provisioning-tools source. +// +// thin-provisioning-tools is free software: you can redistribute it +// and/or modify it under the terms of the GNU General Public License +// as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// thin-provisioning-tools is distributed in the hope that it will be +// useful, but WITHOUT ANY WARRANTY; without even the implied warranty +// of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License along +// with thin-provisioning-tools. If not, see +// . + +#include "thin-provisioning/thin_journal.h" + +#include + +using namespace thin_provisioning; +using namespace persistent_data; +using namespace std; + +//---------------------------------------------------------------- + +byte_stream::byte_stream(block_manager::ptr bm) + : bm_(bm), + current_block_(0), + cursor_(0) +{ +} + +void +byte_stream::read_bytes(uint8_t *b, uint8_t *e) +{ + while (b != e) + b += read_some_(b, e); +} + +void +byte_stream::next_block_() +{ + current_block_++; +} + +size_t +byte_stream::read_some_(uint8_t *b, uint8_t *e) +{ + if (cursor_ == JOURNAL_BLOCK_SIZE) + next_block_(); + + size_t len = min(e - b, JOURNAL_BLOCK_SIZE - cursor_); + auto rr = bm_->read_lock(current_block_); + + uint8_t const *data_begin = reinterpret_cast(rr.data()) + cursor_; + memcpy(b, data_begin, len); + cursor_ += len; + + return len; +} + +//--------------------------------- + +journal_msg::journal_msg(bool success) + : success_(success) +{ +} + +block_msg::block_msg(bool success, uint64_t index) + : journal_msg(success), index_(index) +{ +} + +read_lock_msg::read_lock_msg(bool success, uint64_t index) + : block_msg(success, index) +{ +} + +void +read_lock_msg::visit(journal_visitor &v) const +{ + v.visit(*this); +} + +write_lock_msg::write_lock_msg(bool success, uint64_t index) + : block_msg(success, index) +{ +} + +void +write_lock_msg::visit(journal_visitor &v) const +{ + v.visit(*this); +} + +zero_lock_msg::zero_lock_msg(bool success, uint64_t index) + : block_msg(success, index) +{ +} + +void +zero_lock_msg::visit(journal_visitor &v) const +{ + v.visit(*this); +} + +try_read_lock_msg::try_read_lock_msg(bool success, uint64_t index) + : block_msg(success, index) +{ +} + +void +try_read_lock_msg::visit(journal_visitor &v) const +{ + v.visit(*this); +} + +unlock_msg::unlock_msg(bool success, uint64_t index, delta_list const &deltas) + : block_msg(success, index), + deltas_(deltas) +{ +} + +void +unlock_msg::visit(journal_visitor &v) const +{ + v.visit(*this); +} + +verify_msg::verify_msg(bool success, uint64_t index) + : block_msg(success, index) +{ +} + +void +verify_msg::visit(journal_visitor &v) const +{ + v.visit(*this); +} + +prepare_msg::prepare_msg(bool success, uint64_t index) + : block_msg(success, index) +{ +} + +void +prepare_msg::visit(journal_visitor &v) const +{ + v.visit(*this); +} + +flush_msg::flush_msg(bool success) + : journal_msg(success) +{ +} + +void +flush_msg::visit(journal_visitor &v) const +{ + v.visit(*this); +} + +flush_and_unlock_msg::flush_and_unlock_msg(bool success, uint64_t index, delta_list const &deltas) + : block_msg(success, index), + deltas_(deltas) +{ +} + +void +flush_and_unlock_msg::visit(journal_visitor &v) const +{ + v.visit(*this); +} + +prefetch_msg::prefetch_msg(bool success, uint64_t index) + : block_msg(success, index) +{ +} + +void +prefetch_msg::visit(journal_visitor &v) const +{ + v.visit(*this); +} + +set_read_only_msg::set_read_only_msg() + : journal_msg(true) +{ +} + +void +set_read_only_msg::visit(journal_visitor &v) const +{ + v.visit(*this); +} + +set_read_write_msg::set_read_write_msg() + : journal_msg(true) +{ +} + +void +set_read_write_msg::visit(journal_visitor &v) const +{ + v.visit(*this); +} + +//------------------------------------------ + +journal::journal(block_manager::ptr bm) + : in_(bm) +{ +} + +void +journal::read_journal(struct journal_visitor &v) +{ + while (read_one_(v)) + ; +} + +bool +journal::read_one_(struct journal_visitor &v) +{ + uint8_t header = read_(); + uint8_t t = header >> 1; + uint8_t success = header & 0x1; + uint64_t index; + + switch (static_cast(t)) { + case MT_READ_LOCK: + index = read_(); + v.visit(read_lock_msg(success, index)); + break; + + case MT_WRITE_LOCK: + index = read_(); + v.visit(write_lock_msg(success, index)); + break; + + case MT_ZERO_LOCK: + index = read_(); + v.visit(zero_lock_msg(success, index)); + break; + + case MT_TRY_READ_LOCK: + index = read_(); + v.visit(try_read_lock_msg(success, index)); + break; + + case MT_UNLOCK: { + index = read_(); + auto deltas = read_deltas_(); + v.visit(unlock_msg(success, index, deltas)); + } + break; + + case MT_VERIFY: + index = read_(); + v.visit(verify_msg(success, index)); + break; + + case MT_PREPARE: + index = read_(); + v.visit(prepare_msg(success, index)); + break; + + case MT_FLUSH: + v.visit(flush_msg(success)); + break; + + case MT_FLUSH_AND_UNLOCK: { + index = read_(); + auto deltas = read_deltas_(); + v.visit(flush_and_unlock_msg(success, index, deltas)); + } + break; + + case MT_PREFETCH: + index = read_(); + v.visit(prefetch_msg(success, index)); + break; + + case MT_SET_READ_ONLY: + v.visit(set_read_only_msg()); + break; + + case MT_SET_READ_WRITE: + v.visit(set_read_write_msg()); + break; + + case MT_END_OF_JOURNAL: + return false; + } + + return true; +} + +bool +journal::read_delta_(delta_list &ds) +{ + uint8_t chunk = read_(); + + if (chunk == 0xff) + return false; + + auto bytes = vector(JOURNAL_CHUNK_SIZE, 0); + in_.read_bytes(bytes.data(), bytes.data() + JOURNAL_CHUNK_SIZE); + ds.push_back(delta(chunk, bytes)); + + return true; +} + +thin_provisioning::delta_list +journal::read_deltas_() +{ + delta_list ds; + + while (read_delta_(ds)) + ; + + return ds; +} + +//---------------------------------------------------------------- + diff --git a/thin-provisioning/thin_journal.h b/thin-provisioning/thin_journal.h new file mode 100644 index 0000000..5a87351 --- /dev/null +++ b/thin-provisioning/thin_journal.h @@ -0,0 +1,203 @@ +// Copyright (C) 2018 Red Hat, Inc. All rights reserved. +// +// This file is part of the thin-provisioning-tools source. +// +// thin-provisioning-tools is free software: you can redistribute it +// and/or modify it under the terms of the GNU General Public License +// as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// thin-provisioning-tools is distributed in the hope that it will be +// useful, but WITHOUT ANY WARRANTY; without even the implied warranty +// of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License along +// with thin-provisioning-tools. If not, see +// . + +#ifndef THIN_PROVISIONING_THIN_JOURNAL_H +#define THIN_PROVISIONING_THIN_JOURNAL_H + +#include "persistent-data/block.h" + +#include + +//---------------------------------------------------------------- + +namespace thin_provisioning { + uint32_t const JOURNAL_BLOCK_SIZE = 256 * 1024; + uint32_t const JOURNAL_NR_CHUNKS = 32; + uint32_t const JOURNAL_CHUNK_SIZE = 4096 / JOURNAL_NR_CHUNKS; + + class byte_stream { + public: + byte_stream(persistent_data::block_manager::ptr bm); + + void read_bytes(uint8_t *b, uint8_t *e); + + private: + void next_block_(); + size_t read_some_(uint8_t *b, uint8_t *e); + + persistent_data::block_manager::ptr bm_; + + uint64_t current_block_; + uint64_t cursor_; + }; + + //--------------------------------- + + class journal_visitor; + + struct journal_msg { + journal_msg(bool success); + + virtual ~journal_msg() {} + virtual void visit(journal_visitor &v) const = 0; + + bool success_; + }; + + struct block_msg : public journal_msg { + block_msg(bool success, uint64_t index); + uint64_t index_; + }; + + struct read_lock_msg : public block_msg { + read_lock_msg(bool success, uint64_t index); + virtual void visit(journal_visitor &v) const; + }; + + struct write_lock_msg : public block_msg { + write_lock_msg(bool success, uint64_t index); + virtual void visit(journal_visitor &v) const; + }; + + struct zero_lock_msg : public block_msg { + zero_lock_msg(bool success, uint64_t index); + virtual void visit(journal_visitor &v) const; + }; + + struct try_read_lock_msg : public block_msg { + try_read_lock_msg(bool success, uint64_t index); + virtual void visit(journal_visitor &v) const; + }; + + struct delta { + delta(uint32_t offset, std::vector &bytes) + : offset_(offset), + bytes_(bytes) { + } + + uint32_t offset_; + std::vector bytes_; + }; + + using delta_list = std::vector; + + struct unlock_msg : public block_msg { + unlock_msg(bool success, uint64_t index, delta_list const &deltas); + virtual void visit(journal_visitor &v) const; + + delta_list deltas_; + }; + + struct verify_msg : public block_msg { + verify_msg(bool success, uint64_t index); + virtual void visit(journal_visitor &v) const; + }; + + struct prepare_msg : public block_msg { + prepare_msg(bool success, uint64_t index); + virtual void visit(journal_visitor &v) const; + }; + + struct flush_msg : public journal_msg { + flush_msg(bool success); + virtual void visit(journal_visitor &v) const; + }; + + struct flush_and_unlock_msg : public block_msg { + flush_and_unlock_msg(bool success, uint64_t index, delta_list const &deltas); + virtual void visit(journal_visitor &v) const; + + delta_list deltas_; + }; + + struct prefetch_msg : public block_msg { + prefetch_msg(bool success, uint64_t index); + virtual void visit(journal_visitor &v) const; + }; + + struct set_read_only_msg : public journal_msg { + set_read_only_msg(); + virtual void visit(journal_visitor &v) const; + }; + + struct set_read_write_msg : public journal_msg { + set_read_write_msg(); + virtual void visit(journal_visitor &v) const; + }; + + struct journal_visitor { + public: + virtual ~journal_visitor() {}; + + void visit(journal_msg const &msg) { + msg.visit(*this); + } + + virtual void visit(read_lock_msg const &msg) = 0; + virtual void visit(write_lock_msg const &msg) = 0; + virtual void visit(zero_lock_msg const &msg) = 0; + virtual void visit(try_read_lock_msg const &msg) = 0; + virtual void visit(unlock_msg const &msg) = 0; + virtual void visit(verify_msg const &msg) = 0; + virtual void visit(prepare_msg const &msg) = 0; + virtual void visit(flush_msg const &msg) = 0; + virtual void visit(flush_and_unlock_msg const &msg) = 0; + virtual void visit(prefetch_msg const &msg) = 0; + virtual void visit(set_read_only_msg const &msg) = 0; + virtual void visit(set_read_write_msg const &msg) = 0; + }; + + enum msg_type { + MT_READ_LOCK = 0, + MT_WRITE_LOCK, + MT_ZERO_LOCK, + MT_TRY_READ_LOCK, + MT_UNLOCK, + MT_VERIFY, + MT_PREPARE, + MT_FLUSH, + MT_FLUSH_AND_UNLOCK, + MT_PREFETCH, + MT_SET_READ_ONLY, + MT_SET_READ_WRITE, + MT_END_OF_JOURNAL, + }; + + class journal { + public: + journal(persistent_data::block_manager::ptr bm); + void read_journal(struct journal_visitor &v); + + private: + bool read_delta_(delta_list &ds); + delta_list read_deltas_(); + bool read_one_(struct journal_visitor &v); + + template T read_() { + T r; + in_.read_bytes(reinterpret_cast(&r), reinterpret_cast(&r + 1)); + return r; + } + + byte_stream in_; + }; +} + +//---------------------------------------------------------------- + +#endif diff --git a/thin-provisioning/thin_journal_check.cc b/thin-provisioning/thin_journal_check.cc new file mode 100644 index 0000000..a33a255 --- /dev/null +++ b/thin-provisioning/thin_journal_check.cc @@ -0,0 +1,353 @@ +// Copyright (C) 2018 Red Hat, Inc. All rights reserved. +// +// This file is part of the thin-provisioning-tools source. +// +// thin-provisioning-tools is free software: you can redistribute it +// and/or modify it under the terms of the GNU General Public License +// as published by the Free Software Foundation, either version 3 of +// the License, or (at your option) any later version. +// +// thin-provisioning-tools is distributed in the hope that it will be +// useful, but WITHOUT ANY WARRANTY; without even the implied warranty +// of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License along +// with thin-provisioning-tools. If not, see +// . + +#include +#include +#include +#include + +#include +#include + +#include +#include + +#include "version.h" + +#include "base/application.h" +#include "base/error_state.h" +#include "base/file_utils.h" +#include "base/nested_output.h" +#include "persistent-data/data-structures/btree_counter.h" +#include "persistent-data/space-maps/core.h" +#include "persistent-data/space-maps/disk.h" +#include "persistent-data/file_utils.h" +#include "thin-provisioning/metadata.h" +#include "thin-provisioning/device_tree.h" +#include "thin-provisioning/mapping_tree.h" +#include "thin-provisioning/metadata_counter.h" +#include "thin-provisioning/superblock.h" +#include "thin-provisioning/commands.h" +#include "thin-provisioning/thin_journal.h" + +using namespace base; +using namespace boost; +using namespace file_utils; +using namespace std; +using namespace thin_provisioning; + +//---------------------------------------------------------------- + +namespace { + + unsigned const MAX_HELD_LOCKS = 16; + + // We use a temporary file to hold all the deltas. Assume the metadata initially starts zeroed. + // Need to introduce notion of 'time' that increments everytime a write lock is taken. + // Need to track updates to the superblock to define transactions. + class checker : public journal_visitor { + public: + checker(block_address &nr_metadata_blocks) + : bm_(new block_manager<>("metadata.tmp", nr_metadata_blocks, MAX_HELD_LOCKS, block_manager<>::CREATE)) { + } + + virtual void visit(read_lock_msg const &msg) { + read_lock_(msg.index_); + } + + virtual void visit(write_lock_msg const &msg) { + write_lock_(msg.index_); + } + + virtual void visit(zero_lock_msg const &msg) { + write_lock_(msg.index_); + } + + virtual void visit(try_read_lock_msg const &msg) { + read_lock_(msg.index_); + } + + virtual void visit(unlock_msg const &msg) { + unlock_(msg.index_, msg.deltas_); + } + + virtual void visit(verify_msg const &msg) { + // noop + } + + virtual void visit(prepare_msg const &msg) { + // noop + } + + virtual void visit(flush_msg const &msg) { + cerr << "spurious flush()\n"; + } + + virtual void visit(flush_and_unlock_msg const &msg) { + if (msg.index_ != superblock_detail::SUPERBLOCK_LOCATION) { + cerr << "flush_and_unlock received for block " << msg.index_ + << ", which isn't the superblock\n"; + throw runtime_error("bad flush_and_unlock"); + } + + commit(msg.deltas_); + } + + virtual void visit(prefetch_msg const &msg) { + // ignore + } + + virtual void visit(set_read_only_msg const &msg) { + // ignore + } + + virtual void visit(set_read_write_msg const &msg) { + // ignore + } + + private: + void read_lock_(block_address b) { + if (write_locks_.count(b)) { + cerr << "read lock taken concurrently with write lock for block " + << b << "\n"; + throw runtime_error("bad read lock"); + } + + auto it = read_locks_.find(b); + if (it == read_locks_.end()) + read_locks_.insert(make_pair(b, 1)); + else + it->second++; + } + + void write_lock_(block_address b) { + if (active_.count(b)) { + cerr << "write lock taken for block " + << b + << ", but it is still in the active transaction\n"; + throw runtime_error("bad write lock"); + } + + if (write_locks_.count(b)) { + cerr << "write lock already held for block " + << b + << "\n"; + throw runtime_error("bad write lock"); + } + + if (read_locks_.count(b)) { + cerr << "read lock requested for write locked block " + << b << "\n"; + throw runtime_error("bad write lock"); + } + + write_locks_.insert(b); + } + + + void unlock_(block_address b, delta_list const &deltas) { + if (write_locks_.count(b)) { + write_locks_.erase(b); + + auto wr = bm_->write_lock(b); + + for (auto &&d : deltas) { + uint8_t *data = static_cast(wr.data()); + + if (d.offset_ + d.bytes_.size() > 4096) { + cerr << "delta for block " << b << " is out of range (" + << d.offset_ << ", " << d.offset_ + d.bytes_.size() << "]\n"; + throw runtime_error("bad unlock"); + } + + memcpy(data + d.offset_, d.bytes_.data(), d.bytes_.size()); + } + + } else { + auto it = read_locks_.find(b); + if (it == read_locks_.end()) { + cerr << "unlock requested on block " << b << ", which isn't locked\n"; + throw runtime_error("bad unlock"); + } + + if (deltas.size()) { + cerr << "unlocking a read lock for " << b << ", yet there are " << deltas.size() << " deltas\n"; + throw runtime_error("bad unlock"); + } + + // Decrement lock + if (!it->second) { + cerr << "read lock entry has zero count (internal error)\n"; + throw runtime_error("bad unlock"); + } + + if (!--it->second) + read_locks_.erase(it); + + } + } + + void commit(delta_list const &deltas) { + // At this point the only lock held should be the superblock, + // and that should be a write lock. + if (read_locks_.size()) { + cerr << "committing when the following read locks are still held:\n"; + for (auto &&p : read_locks_) + cerr << p.first << "\n"; + } + + + unlock_(superblock_detail::SUPERBLOCK_LOCATION, deltas); + + if (write_locks_.size()) { + cerr << "commit() called, but the following write locks are held:\n"; + for (auto &&b : write_locks_) + cerr << b << "\n"; + } + + build_active_set_(); + } + + void build_active_set_() { + using namespace thin_provisioning::superblock_detail; + + superblock sb = read_superblock(bm_); + block_counter bc; + + auto tm = open_tm(bm_, SUPERBLOCK_LOCATION); + auto sm = open_metadata_sm(*tm, &sb.metadata_space_map_root_); + tm->set_sm(sm); + + // FIXME: check we don't have a space leak from a cycle between the sm and tm + + count_metadata(tm, sb, bc); + + active_.clear(); + active_.insert(SUPERBLOCK_LOCATION); + + for (auto &&p : bc.get_counts()) { + if (!p.second) { + cerr << "weird zero count for block " << p.first << "\n"; + throw runtime_error("build_active_set() failed"); + } + + active_.insert(p.first); + } + } + + typedef set block_set; + typedef map block_map; + + block_set active_; + block_set write_locks_; + block_map read_locks_; + + block_manager<>::ptr bm_; + transaction_manager::ptr tm_; + }; + + struct flags { + flags() + : quiet(false) { + } + + bool quiet; + }; + + void check(string const &path, block_address nr_metadata_blocks) { + block_address journal_size = get_file_length(path) / JOURNAL_BLOCK_SIZE; + block_manager::ptr bm( + new block_manager(path, journal_size, 4, + block_manager::READ_ONLY)); + journal j(bm); + checker c(nr_metadata_blocks); + + j.read_journal(c); + } +} + +//---------------------------------------------------------------- + +thin_journal_cmd::thin_journal_cmd() + : command("thin_journal_check") +{ +} + +void +thin_journal_cmd::usage(std::ostream &out) const +{ + out << "Usage: " << get_name() << " [options] {device|file} {nr blocks}" << endl + << "Options:\n" + << " {-q|--quiet}\n" + << " {-h|--help}\n" + << " {-V|--version}\n"; +} + +int +thin_journal_cmd::run(int argc, char **argv) +{ + int c; + flags fs; + + char const shortopts[] = "qhV"; + option const longopts[] = { + { "quiet", no_argument, NULL, 'q'}, + { "help", no_argument, NULL, 'h'}, + { "version", no_argument, NULL, 'V'}, + { NULL, no_argument, NULL, 0 } + }; + + while ((c = getopt_long(argc, argv, shortopts, longopts, NULL)) != -1) { + switch(c) { + case 'h': + usage(cout); + return 0; + + case 'q': + fs.quiet = true; + break; + + case 'V': + cout << THIN_PROVISIONING_TOOLS_VERSION << endl; + return 0; + + default: + usage(cerr); + return 1; + } + } + + if (argc - optind != 2) { + if (!fs.quiet) + usage(cerr); + + exit(1); + } + + try { + check(argv[optind], lexical_cast(argv[optind + 1])); + + } catch (std::exception &e) { + cerr << e.what() << "\n"; + return 1; + } + + return 0; +} + +//----------------------------------------------------------------