// Copyright (C) 2011 Red Hat, Inc. All rights reserved. // // This file is part of the thin-provisioning-tools source. // // thin-provisioning-tools is free software: you can redistribute it // and/or modify it under the terms of the GNU General Public License // as published by the Free Software Foundation, either version 3 of // the License, or (at your option) any later version. // // thin-provisioning-tools is distributed in the hope that it will be // useful, but WITHOUT ANY WARRANTY; without even the implied warranty // of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU General Public License for more details. // // You should have received a copy of the GNU General Public License along // with thin-provisioning-tools. If not, see // <http://www.gnu.org/licenses/>. #include "thin-provisioning/emitter.h" #include "thin-provisioning/metadata_dumper.h" #include "thin-provisioning/mapping_tree.h" #include #include using namespace persistent_data; using namespace thin_provisioning; //---------------------------------------------------------------- // We only need to examine the mapping tree, and device details tree. // The space maps can be inferred. // Repair process: // - We only trigger the repair process if there's damage when walking from // the roots given in the superblock. // - If there is damage, then we try and find the most recent roots with the // least corruption. We're seeing cases where just the superblock has been // trashed so finding the best roots is essential, and sadly non trivial. // Finding roots: // This is about classifying and summarising btree nodes. The use of a btree // node may not be obvious when inspecting it in isolation. But more information // may be gleaned by examining child and sibling nodes. // // So the process is: // - scan every metadata block, summarising its potential uses. // - repeatedly iterate those summaries until we can glean no more useful information.
// - sort candidate roots, choose best // Summary information: // - btree; mapping top level, mapping bottom level, device tree (more than one possible) // - node type; internal or leaf // - age; for mapping trees we can infer a minimum age from the block/time // values. In addition two similar leaf nodes can be compared by looking // at the block/time for _specific_ blocks. This means we can define an ordering // on the ages, but not equality. // - Device details can be aged based on the last_snapshot_time field. // Iteration of summary info: // - constraints propagate both up and down the trees. eg, node 'a' may // be ambiguous (all internal nodes are ambigous). If we find that all it's // children are device details trees, then we infer that this is too and lose // the ambiguity. Now if it has a sibling we can infer on this too. // - Some characteristics only propagate upwards. eg, age. So we need two monoids // for summary info (up and down). namespace { using namespace std; using namespace boost; using namespace persistent_data::btree_detail; using namespace thin_provisioning::device_tree_detail; enum btree_type_bit { TOP_LEVEL, BOTTOM_LEVEL, DEVICE_DETAILS }; struct node_info { node_info() : types(0), b(0), values(0), orphan(true), is_leaf(true), key_low(0), key_high(0), age(0) { } void add_type(btree_type_bit b) { types = types | (1 << b); } void clear_type(btree_type_bit b) { types = types & ~(1 << b); } bool has_type(btree_type_bit b) const { return types & (1 << b); } // Indicate corruption by having no fields set unsigned types; // common block_address b; unsigned values; bool orphan; bool is_leaf; uint64_t key_low; uint64_t key_high; set devices; uint32_t age; }; using info_map = map; bool is_btree_node(block_manager<> &bm, block_address b) { auto v = create_btree_node_validator(); auto rr = bm.read_lock(b); return v->check_raw(rr.data()); } uint32_t get_dd_age(device_details const &dd) { return max(dd.creation_time_, dd.snapshotted_time_); } void 
scan_initial_infos(block_manager<> &bm, info_map &result) { for (block_address b = 0; b < bm.get_nr_blocks(); b++) { if (!is_btree_node(bm, b)) continue; node_info info; info.b = b; auto rr = bm.read_lock(b); auto hdr = reinterpret_cast(rr.data()); auto flags = to_cpu(hdr->flags); if (flags & INTERNAL_NODE) { info.is_leaf = false; info.add_type(TOP_LEVEL); info.add_type(BOTTOM_LEVEL); info.add_type(DEVICE_DETAILS); } else { info.is_leaf = true; auto vsize = to_cpu(hdr->value_size); info.values = to_cpu(hdr->nr_entries); if (vsize == sizeof(device_details_traits::disk_type)) { info.add_type(DEVICE_DETAILS); auto n = to_node(rr); if (n.get_nr_entries()) { info.key_low = n.key_at(0); info.key_high = n.key_at(n.get_nr_entries() - 1); } for (unsigned i = 0; i < n.get_nr_entries(); i++) info.age = max(info.age, get_dd_age(n.value_at(i))); } else if (vsize == sizeof(uint64_t)) { info.add_type(BOTTOM_LEVEL); // This can only be a top level leaf if all the values are // blocks on the metadata device. 
auto is_top_level = true; auto n = to_node(rr); if (n.get_nr_entries()) { info.key_low = n.key_at(0); info.key_high = n.key_at(n.get_nr_entries() - 1); } for (unsigned i = 0; i < n.get_nr_entries(); i++) { if (n.value_at(i) >= bm.get_nr_blocks()) { is_top_level = false; break; } } if (is_top_level) info.add_type(TOP_LEVEL); } else continue; } result.insert(make_pair(b, info)); } } bool merge_types(node_info &parent, node_info const &child, btree_type_bit b) { if (parent.has_type(b) && !child.has_type(b)) { parent.clear_type(b); return true; } return false; } // return true if something changed bool merge_from_below(node_info &parent, node_info const &child) { bool changed = false; changed = merge_types(parent, child, TOP_LEVEL) || merge_types(parent, child, BOTTOM_LEVEL) || merge_types(parent, child, DEVICE_DETAILS); return changed; } void fail(node_info &n) { n.types = 0; } bool failed(node_info const &n) { return n.types == 0; } bool iterate_infos_(block_manager<> &bm, info_map &infos) { bool changed = false; for (auto &p : infos) { auto &parent = p.second; if (parent.is_leaf) continue; // values refer to blocks, so we should have infos for them. 
auto rr = bm.read_lock(p.first); auto n = to_node(rr); uint64_t key_low = 0; unsigned values = 0; for (unsigned i = 0; i < n.get_nr_entries(); i++) { auto it = infos.find(n.value_at(i)); if (it == infos.end()) { fail(parent); break; } auto &child = it->second; // we use the keys to help decide if this is a valid child if (child.key_low <= key_low) { fail(parent); break; } else key_low = child.key_high; changed = merge_from_below(parent, child) || changed; if (parent.has_type(DEVICE_DETAILS) && child.age > parent.age) { changed = true; parent.age = child.age; } values += child.values; } // We don't clear the orphan flags until we know the parent is good if (!failed(parent)) { parent.values = values; for (unsigned i = 0; i < n.get_nr_entries(); i++) { auto it = infos.find(n.value_at(i)); if (it == infos.end()) throw runtime_error("no child info, but it was there a moment ago"); auto &child = it->second; child.orphan = false; } } } return changed; } void iterate_infos(block_manager<> &bm, info_map &infos) { while (iterate_infos_(bm, infos)) ; } bool trees_are_compatible(node_info const &mapping, node_info const &devices) { for (auto thin_id : mapping.devices) if (devices.devices.find(thin_id) == devices.devices.end()) return false; return true; } bool cmp_mapping_info(node_info const &lhs, node_info const &rhs) { return lhs.age > rhs.age; } bool has_type(node_info const &i, unsigned bit) { return i.types & (1 << bit); } vector extract_mapping_candidates(info_map const &infos) { vector results; for (auto const &p : infos) if (p.second.orphan && has_type(p.second, TOP_LEVEL)) results.push_back(p.second); //sort(results.begin(), results.end(), cmp_mapping_info); return results; } bool cmp_device_info(node_info const &lhs, node_info const &rhs) { // FIXME: finish return false; //return lhs.dd_age > rhs.dd_age; } vector extract_device_candidates(info_map const &infos) { vector results; for (auto const &p : infos) if (p.second.orphan && has_type(p.second, DEVICE_DETAILS)) 
results.push_back(p.second); sort(results.begin(), results.end(), cmp_device_info); return results; } // Returns , //pair void find_best_roots(block_manager<> &bm) { info_map infos; scan_initial_infos(bm, infos); iterate_infos(bm, infos); // These will be sorted into best first order vector mapping_candidates = extract_mapping_candidates(infos); vector device_candidates = extract_device_candidates(infos); cerr << "mapping candidates (" << mapping_candidates.size() << "):\n"; for (auto const &i : mapping_candidates) cerr << i.b << ", tree size = " << i.values << ", age = " << i.age << "\n"; cerr << "\ndevice candidates (" << device_candidates.size() << "):\n"; for (auto const &i : device_candidates) cerr << i.b << ", tree size = " << i.values << ", age = " << i.age << "\n"; #if 0 // Choose the best mapping tree, and then the best device tree // that is compatible. for (auto &m : mapping_candidates) for (auto &d : device_candidates) if (trees_are_compatible(m, d)) return make_pair(m.b, d.b); #endif // throw runtime_error("no compatible mapping/device trees"); } } //---------------------------------------------------------------- namespace { void raise_metadata_damage() { throw std::runtime_error("metadata contains errors (run thin_check for details).\n" "perhaps you wanted to run with --repair"); } //-------------------------------- struct ignore_details_damage : public device_tree_detail::damage_visitor { void visit(device_tree_detail::missing_devices const &d) { } }; struct fatal_details_damage : public device_tree_detail::damage_visitor { void visit(device_tree_detail::missing_devices const &d) { raise_metadata_damage(); } }; device_tree_detail::damage_visitor::ptr details_damage_policy(bool repair) { typedef device_tree_detail::damage_visitor::ptr dvp; if (repair) return dvp(new ignore_details_damage()); else return dvp(new fatal_details_damage()); } //-------------------------------- struct ignore_mapping_damage : public mapping_tree_detail::damage_visitor { 
void visit(mapping_tree_detail::missing_devices const &d) { } void visit(mapping_tree_detail::missing_mappings const &d) { } }; struct fatal_mapping_damage : public mapping_tree_detail::damage_visitor { void visit(mapping_tree_detail::missing_devices const &d) { raise_metadata_damage(); } void visit(mapping_tree_detail::missing_mappings const &d) { raise_metadata_damage(); } }; mapping_tree_detail::damage_visitor::ptr mapping_damage_policy(bool repair) { typedef mapping_tree_detail::damage_visitor::ptr mvp; if (repair) return mvp(new ignore_mapping_damage()); else return mvp(new fatal_mapping_damage()); } //-------------------------------- typedef map dd_map; class details_extractor : public device_tree_detail::device_visitor { public: details_extractor(dump_options const &opts) : opts_(opts) { } void visit(block_address dev_id, device_tree_detail::device_details const &dd) { if (opts_.selected_dev(dev_id)) dd_.insert(make_pair(dev_id, dd)); } dd_map const &get_details() const { return dd_; } private: dump_options const &opts_; dd_map dd_; }; class mapping_emitter : public mapping_tree_detail::mapping_visitor { public: mapping_emitter(emitter::ptr e) : e_(e), in_range_(false) { } ~mapping_emitter() { end_mapping(); } typedef mapping_tree_detail::block_time block_time; void visit(btree_path const &path, block_time const &bt) { add_mapping(path[0], bt); } private: void start_mapping(uint64_t origin_block, block_time const &bt) { origin_start_ = origin_block; dest_start_ = bt.block_; time_ = bt.time_; len_ = 1; in_range_ = true; } void end_mapping() { if (in_range_) { if (len_ == 1) e_->single_map(origin_start_, dest_start_, time_); else e_->range_map(origin_start_, dest_start_, time_, len_); in_range_ = false; } } void add_mapping(uint64_t origin_block, block_time const &bt) { if (!in_range_) start_mapping(origin_block, bt); else if (origin_block == origin_start_ + len_ && bt.block_ == dest_start_ + len_ && time_ == bt.time_) len_++; else { end_mapping(); 
start_mapping(origin_block, bt); } } emitter::ptr e_; block_address origin_start_; block_address dest_start_; uint32_t time_; block_address len_; bool in_range_; }; class mapping_tree_emitter : public mapping_tree_detail::device_visitor { public: mapping_tree_emitter(dump_options const &opts, metadata::ptr md, emitter::ptr e, dd_map const &dd, mapping_tree_detail::damage_visitor::ptr damage_policy) : opts_(opts), md_(md), e_(e), dd_(dd), damage_policy_(damage_policy) { } void visit(btree_path const &path, block_address tree_root) { block_address dev_id = path[0]; if (!opts_.selected_dev(dev_id)) return; dd_map::const_iterator it = dd_.find(path[0]); if (it != dd_.end()) { device_tree_detail::device_details const &d = it->second; e_->begin_device(dev_id, d.mapped_blocks_, d.transaction_id_, d.creation_time_, d.snapshotted_time_); try { if (!opts_.skip_mappings_) emit_mappings(dev_id, tree_root); } catch (std::exception &e) { cerr << e.what(); e_->end_device(); throw; } e_->end_device(); } else if (!opts_.repair_) { ostringstream msg; msg << "mappings present for device " << dev_id << ", but it isn't present in device tree"; throw runtime_error(msg.str()); } } private: void emit_mappings(uint64_t dev_id, block_address subtree_root) { mapping_emitter me(e_); single_mapping_tree tree(*md_->tm_, subtree_root, mapping_tree_detail::block_time_ref_counter(md_->data_sm_)); walk_mapping_tree(tree, dev_id, static_cast(me), *damage_policy_); } dump_options const &opts_; metadata::ptr md_; emitter::ptr e_; dd_map const &dd_; mapping_tree_detail::damage_visitor::ptr damage_policy_; }; block_address get_nr_blocks(metadata::ptr md) { if (md->data_sm_) return md->data_sm_->get_nr_blocks(); else if (md->sb_.blocknr_ == superblock_detail::SUPERBLOCK_LOCATION) // grab from the root structure of the space map return get_nr_blocks_in_data_sm(*md->tm_, &md->sb_.data_space_map_root_); else // metadata snap, we really don't know return 0ull; } } 
//---------------------------------------------------------------- void thin_provisioning::metadata_dump(metadata::ptr md, emitter::ptr e, dump_options const &opts) { find_best_roots(*md->tm_->get_bm()); details_extractor de(opts); device_tree_detail::damage_visitor::ptr dd_policy(details_damage_policy(opts.repair_)); walk_device_tree(*md->details_, de, *dd_policy); e->begin_superblock("", md->sb_.time_, md->sb_.trans_id_, md->sb_.flags_, md->sb_.version_, md->sb_.data_block_size_, get_nr_blocks(md), boost::optional()); { mapping_tree_detail::damage_visitor::ptr md_policy(mapping_damage_policy(opts.repair_)); mapping_tree_emitter mte(opts, md, e, de.get_details(), mapping_damage_policy(opts.repair_)); walk_mapping_tree(*md->mappings_top_level_, mte, *md_policy); } e->end_superblock(); } //---------------------------------------------------------------- void thin_provisioning::metadata_dump_subtree(metadata::ptr md, emitter::ptr e, bool repair, uint64_t subtree_root) { mapping_emitter me(e); single_mapping_tree tree(*md->tm_, subtree_root, mapping_tree_detail::block_time_ref_counter(md->data_sm_)); // FIXME: pass the current device id instead of zero walk_mapping_tree(tree, 0, static_cast(me), *mapping_damage_policy(repair)); } //----------------------------------------------------------------