From aace49cdd636e6cd8344ff49719f0e07c3eccfeb Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Fri, 29 Mar 2019 12:56:31 +0000 Subject: [PATCH] [thin_repair] work in progress --- functional-tests/find-working-roots.scm | 30 +- .../data-structures/simple_traits.h | 3 + thin-provisioning/metadata_dumper.cc | 351 +++++++++++++++++- thin-provisioning/thin_dump.cc | 2 +- thin-provisioning/thin_scan.cc | 87 ++++- 5 files changed, 456 insertions(+), 17 deletions(-) diff --git a/functional-tests/find-working-roots.scm b/functional-tests/find-working-roots.scm index fb583b5..d72e884 100644 --- a/functional-tests/find-working-roots.scm +++ b/functional-tests/find-working-roots.scm @@ -18,19 +18,22 @@ ;;; node though, can't be used to differentiate). (define (thin-check metadata root) (let ((exit-code (system - (fmt #f "../bin/thin_check --skip-mappings --override-mapping-root " + (fmt #f "../bin/thin_check --override-mapping-root " root " " metadata " > /dev/null 2>&1")))) (fmt #t "exit-code: " (wrt exit-code) ", ") (zero? exit-code))) -(define metadata "../customer-metadata-full.bin") +;(define metadata "../metadataoriginal") +;(define nr-metadata-blocks 262144) + +(define metadata "/home/ejt/work/RedHat/rhel7-vm/dmtest/metadata.bin") +(define nr-metadata-blocks 32768) ;; FIXME: aren't we returning a reference to a cache page and then dropping the lock? (define (read-superblock cache) (with-block (b cache 0 (get-flags)) (block->superblock b))) -(define nr-metadata-blocks 8192) (define (in-metadata-bounds? i) (< i nr-metadata-blocks)) @@ -87,6 +90,11 @@ (< (ftype-ref BTreeNodeHeader (nr-entries) hdr) (/ (ftype-ref BTreeNodeHeader (max-entries) hdr) 3)))) +(define (details-leaf? b hdr) + (and (leaf-node? hdr) + (= (ftype-sizeof ThinDeviceDetails) + (ftype-ref BTreeNodeHeader (value-size) hdr)))) + (define (classify-node b hdr) (fold-left append '() (map (lambda (pair) @@ -95,7 +103,8 @@ '())) `((,internal? internal) (,bottom-level-leaf? bottom-level-leaf) - (,top-level-leaf? top-level-leaf))))) + (,top-level-leaf? top-level-leaf) + (,details-leaf? details-leaf))))) (define (checksum-btree-node b) (checksum-block b (ftype-sizeof unsigned-32) btree-node-salt)) @@ -120,6 +129,7 @@ (define (dump-device-details cache) (let ((sb (read-superblock cache))) + (fmt #t sb) (device-tree-each cache (ftype-ref ThinSuperblock (device-details-root) sb) (lambda (dev-id dd) (fmt #t "dev-id: " dev-id "\n" @@ -139,14 +149,16 @@ ;; An interesting node has more than one class. (define (filter-interesting-blocks classes) (filter (lambda (xs) - (> (length xs) 1)) + (>= (length xs) 1)) classes)) +(define (fmt-list xs) + (for-each (lambda (x) (fmt #t x nl)) xs)) + (with-bcache (cache metadata (* 16 1024)) - (dump-device-details cache) (let ((classes (classify-nodes cache)) (rmap (make-eq-hashtable))) - (fmt #t (filter-interesting-blocks classes)))) + (fmt-list (filter-interesting-blocks classes)))) #| (with-bcache (cache metadata (* 16 1024)) @@ -159,7 +171,7 @@ #| (let loop ((i 0) (successes '())) - (if (> i 8192) + (if (> i nr-metadata-blocks) (fmt #t "successes: " (wrt successes) "\n") ;; add --ignore-non-fatal-errors flag (if (thin-check "../customer-metadata-full.bin" i) @@ -169,4 +181,4 @@ (begin (fmt #t "fail: " i "\n") (loop (+ i 1) successes))))) -|# +|# diff --git a/persistent-data/data-structures/simple_traits.h b/persistent-data/data-structures/simple_traits.h index fa01737..3fc303a 100644 --- a/persistent-data/data-structures/simple_traits.h +++ b/persistent-data/data-structures/simple_traits.h @@ -1,6 +1,9 @@ #ifndef PERSISTENT_DATA_DATA_STRUCTURES_SIMPLE_TRAITS_H #define PERSISTENT_DATA_DATA_STRUCTURES_SIMPLE_TRAITS_H +#include "base/endian_utils.h" +#include "persistent-data/data-structures/btree.h" + //---------------------------------------------------------------- namespace persistent_data { diff --git a/thin-provisioning/metadata_dumper.cc b/thin-provisioning/metadata_dumper.cc index bbb5be8..ef02478 100644 --- a/thin-provisioning/metadata_dumper.cc +++ b/thin-provisioning/metadata_dumper.cc @@ -20,11 +20,358 @@ #include "thin-provisioning/metadata_dumper.h" #include "thin-provisioning/mapping_tree.h" +#include +#include + using namespace persistent_data; using namespace thin_provisioning; //---------------------------------------------------------------- +// We only need to examine the mapping tree, and device details tree. +// The space maps can be inferred. + +// Repair process: +// - We only trigger the repair process if there's damage when walking from +// the roots given in the superblock. +// - If there is damage, then we try and find the most recent roots with the +// least corruption. We're seeing cases where just the superblock has been +// trashed so finding the best roots is essential, and sadly non trivial. + +// Finding roots: +// This is about classifying and summarising btree nodes. The use of a btree +// node may not be obvious when inspecting it in isolation. But more information +// may be gleaned by examining child and sibling nodes. +// +// So the process is: +// - scan every metadata block, summarising it's potential uses. +// - repeatedly iterate those summaries until we can glean no more useful information. +// - sort candidate roots, choose best + +// Summary information: +// - btree; mapping top level, mapping bottom level, device tree (more than one possible) +// - node type; internal or leaf +// - age; for mapping trees we can infer a minimum age from the block/time +// values. In addition two similar leaf nodes can be compared by looking +// at the block/time for _specific_ blocks. This means we can define an ordering +// on the ages, but not equality. +// - Device details can be aged based on the last_snapshot_time field. + +// Iteration of summary info: +// - constraints propagate both up and down the trees. eg, node 'a' may +// be ambiguous (all internal nodes are ambigous). If we find that all it's +// children are device details trees, then we infer that this is too and lose +// the ambiguity. Now if it has a sibling we can infer on this too. +// - Some characteristics only propagate upwards. eg, age. So we need two monoids +// for summary info (up and down). + +namespace { + using namespace std; + using namespace boost; + using namespace persistent_data::btree_detail; + using namespace thin_provisioning::device_tree_detail; + + enum btree_type_bit { + TOP_LEVEL, + BOTTOM_LEVEL, + DEVICE_DETAILS + }; + + struct node_info { + node_info() + : types(0), + b(0), + values(0), + orphan(true), + is_leaf(true), + key_low(0), + key_high(0), + age(0) { + } + + void add_type(btree_type_bit b) { + types = types | (1 << b); + } + + void clear_type(btree_type_bit b) { + types = types & ~(1 << b); + } + + bool has_type(btree_type_bit b) const { + return types & (1 << b); + } + + // Indicate corruption by having no fields set + unsigned types; + + // common + block_address b; + unsigned values; + bool orphan; + bool is_leaf; + uint64_t key_low; + uint64_t key_high; + set devices; + uint32_t age; + }; + + using info_map = map; + + bool is_btree_node(block_manager<> &bm, block_address b) { + auto v = create_btree_node_validator(); + auto rr = bm.read_lock(b); + + return v->check_raw(rr.data()); + } + + uint32_t get_dd_age(device_details const &dd) { + return max(dd.creation_time_, dd.snapshotted_time_); + } + + void scan_initial_infos(block_manager<> &bm, info_map &result) { + for (block_address b = 0; b < bm.get_nr_blocks(); b++) { + if (!is_btree_node(bm, b)) + continue; + + node_info info; + info.b = b; + + auto rr = bm.read_lock(b); + auto hdr = reinterpret_cast(rr.data()); + + auto flags = to_cpu(hdr->flags); + if (flags & INTERNAL_NODE) { + info.is_leaf = false; + info.add_type(TOP_LEVEL); + info.add_type(BOTTOM_LEVEL); + info.add_type(DEVICE_DETAILS); + } else { + info.is_leaf = true; + auto vsize = to_cpu(hdr->value_size); + info.values = to_cpu(hdr->nr_entries); + + if (vsize == sizeof(device_details_traits::disk_type)) { + info.add_type(DEVICE_DETAILS); + + auto n = to_node(rr); + if (n.get_nr_entries()) { + info.key_low = n.key_at(0); + info.key_high = n.key_at(n.get_nr_entries() - 1); + } + + for (unsigned i = 0; i < n.get_nr_entries(); i++) + info.age = max(info.age, get_dd_age(n.value_at(i))); + + } else if (vsize == sizeof(uint64_t)) { + info.add_type(BOTTOM_LEVEL); + + // This can only be a top level leaf if all the values are + // blocks on the metadata device. + auto is_top_level = true; + auto n = to_node(rr); + + if (n.get_nr_entries()) { + info.key_low = n.key_at(0); + info.key_high = n.key_at(n.get_nr_entries() - 1); + } + + for (unsigned i = 0; i < n.get_nr_entries(); i++) { + if (n.value_at(i) >= bm.get_nr_blocks()) { + is_top_level = false; + break; + } + } + + if (is_top_level) + info.add_type(TOP_LEVEL); + } else + continue; + } + + result.insert(make_pair(b, info)); + } + } + + bool merge_types(node_info &parent, node_info const &child, btree_type_bit b) { + if (parent.has_type(b) && !child.has_type(b)) { + parent.clear_type(b); + return true; + } + + return false; + } + + // return true if something changed + bool merge_from_below(node_info &parent, node_info const &child) { + bool changed = false; + + changed = merge_types(parent, child, TOP_LEVEL) || + merge_types(parent, child, BOTTOM_LEVEL) || + merge_types(parent, child, DEVICE_DETAILS); + + return changed; + } + + void fail(node_info &n) { + n.types = 0; + } + + bool failed(node_info const &n) { + return n.types == 0; + } + + bool iterate_infos_(block_manager<> &bm, info_map &infos) { + bool changed = false; + + for (auto &p : infos) { + auto &parent = p.second; + + if (parent.is_leaf) + continue; + + // values refer to blocks, so we should have infos for them. + auto rr = bm.read_lock(p.first); + auto n = to_node(rr); + uint64_t key_low = 0; + unsigned values = 0; + + for (unsigned i = 0; i < n.get_nr_entries(); i++) { + auto it = infos.find(n.value_at(i)); + + if (it == infos.end()) { + fail(parent); + break; + } + + auto &child = it->second; + + // we use the keys to help decide if this is a valid child + if (child.key_low <= key_low) { + fail(parent); + break; + + } else + key_low = child.key_high; + + + changed = merge_from_below(parent, child) || changed; + + if (parent.has_type(DEVICE_DETAILS) && child.age > parent.age) { + changed = true; + parent.age = child.age; + } + + values += child.values; + } + + // We don't clear the orphan flags until we know the parent is good + if (!failed(parent)) { + parent.values = values; + + for (unsigned i = 0; i < n.get_nr_entries(); i++) { + auto it = infos.find(n.value_at(i)); + + if (it == infos.end()) + throw runtime_error("no child info, but it was there a moment ago"); + + auto &child = it->second; + child.orphan = false; + } + } + } + + return changed; + } + + void iterate_infos(block_manager<> &bm, info_map &infos) { + while (iterate_infos_(bm, infos)) + ; + } + + bool trees_are_compatible(node_info const &mapping, node_info const &devices) { + for (auto thin_id : mapping.devices) + if (devices.devices.find(thin_id) == devices.devices.end()) + return false; + + return true; + } + + bool cmp_mapping_info(node_info const &lhs, node_info const &rhs) { + return lhs.age > rhs.age; + } + + bool has_type(node_info const &i, unsigned bit) { + return i.types & (1 << bit); + } + + vector + extract_mapping_candidates(info_map const &infos) { + vector results; + + for (auto const &p : infos) + if (p.second.orphan && has_type(p.second, TOP_LEVEL)) + results.push_back(p.second); + + //sort(results.begin(), results.end(), cmp_mapping_info); + + return results; + } + + bool cmp_device_info(node_info const &lhs, node_info const &rhs) { + // FIXME: finish + return false; + //return lhs.dd_age > rhs.dd_age; + } + + vector + extract_device_candidates(info_map const &infos) { + vector results; + + for (auto const &p : infos) + if (p.second.orphan && has_type(p.second, DEVICE_DETAILS)) + results.push_back(p.second); + + sort(results.begin(), results.end(), cmp_device_info); + + return results; + } + + // Returns , + //pair + void + find_best_roots(block_manager<> &bm) { + info_map infos; + + scan_initial_infos(bm, infos); + iterate_infos(bm, infos); + + // These will be sorted into best first order + vector mapping_candidates = extract_mapping_candidates(infos); + vector device_candidates = extract_device_candidates(infos); + + cerr << "mapping candidates (" << mapping_candidates.size() << "):\n"; + for (auto const &i : mapping_candidates) + cerr << i.b << ", tree size = " << i.values << ", age = " << i.age << "\n"; + + cerr << "\ndevice candidates (" << device_candidates.size() << "):\n"; + for (auto const &i : device_candidates) + cerr << i.b << ", tree size = " << i.values << ", age = " << i.age << "\n"; + +#if 0 + // Choose the best mapping tree, and then the best device tree + // that is compatible. + for (auto &m : mapping_candidates) + for (auto &d : device_candidates) + if (trees_are_compatible(m, d)) + return make_pair(m.b, d.b); +#endif + +// throw runtime_error("no compatible mapping/device trees"); + } +} + +//---------------------------------------------------------------- + namespace { void raise_metadata_damage() { throw std::runtime_error("metadata contains errors (run thin_check for details).\n" @@ -197,7 +544,7 @@ namespace { try { if (!opts_.skip_mappings_) emit_mappings(dev_id, tree_root); - } catch (exception &e) { + } catch (std::exception &e) { cerr << e.what(); e_->end_device(); throw; @@ -246,6 +593,8 @@ namespace { void thin_provisioning::metadata_dump(metadata::ptr md, emitter::ptr e, dump_options const &opts) { + find_best_roots(*md->tm_->get_bm()); + details_extractor de(opts); device_tree_detail::damage_visitor::ptr dd_policy(details_damage_policy(opts.repair_)); walk_device_tree(*md->details_, de, *dd_policy); diff --git a/thin-provisioning/thin_dump.cc b/thin-provisioning/thin_dump.cc index 17ddee6..3d96a05 100644 --- a/thin-provisioning/thin_dump.cc +++ b/thin-provisioning/thin_dump.cc @@ -52,7 +52,7 @@ namespace { metadata::ptr open_metadata(string const &path, struct flags &flags) { block_manager<>::ptr bm = open_bm(path, block_manager<>::READ_ONLY, !flags.use_metadata_snap); - metadata::ptr md(flags.use_metadata_snap ? new metadata(bm, flags.snap_location) : new metadata(bm)); + metadata::ptr md(flags.use_metadata_snap ? new metadata(bm, flags.snap_location) : new metadata(bm, false)); return md; } diff --git a/thin-provisioning/thin_scan.cc b/thin-provisioning/thin_scan.cc index 6bb856e..517c954 100644 --- a/thin-provisioning/thin_scan.cc +++ b/thin-provisioning/thin_scan.cc @@ -30,6 +30,8 @@ #include "thin-provisioning/commands.h" #include "version.h" +using namespace boost; +using namespace std; using namespace thin_provisioning; //---------------------------------------------------------------- @@ -463,11 +465,13 @@ namespace { class metadata_scanner { public: - metadata_scanner(block_manager<>::ptr bm, uint64_t scan_begin, uint64_t scan_end) + metadata_scanner(block_manager<>::ptr bm, uint64_t scan_begin, uint64_t scan_end, + bool check_for_strings) : bm_(bm), scan_begin_(scan_begin), scan_end_(scan_end), - index_(scan_begin) { + index_(scan_begin), + check_for_strings_(check_for_strings) { if (scan_end_ <= scan_begin_) throw std::runtime_error("badly formed region (end <= begin)"); @@ -486,8 +490,6 @@ namespace { run_range_ = r.clone(); } - virtual ~metadata_scanner() {} - std::unique_ptr get_range() { std::unique_ptr ret; @@ -507,7 +509,44 @@ namespace { return ret; } + map> const &get_strings() const { + return strings_; + } + private: + bool interesting_char(char c) + { + return isalnum(c) || ispunct(c); + } + + unsigned printable_len(const char *b, const char *e) + { + const char *p = b; + + while (p != e && interesting_char(*p)) + p++; + + return p - b; + } + + // asci text within our metadata is a sure sign of corruption. + optional > + scan_strings(block_manager<>::read_ref rr) + { + vector r; + const char *data = reinterpret_cast(rr.data()), *end = data + MD_BLOCK_SIZE; + + while (data < end) { + auto len = printable_len(data, end); + if (len >= 4) + r.push_back(string(data, data + len)); + + data += len + 1; + } + + return r.size() ? optional>(r) : optional>(); + } + block_range const &read_block(block_address b) { block_manager<>::read_ref rr = bm_->read_lock(b); int64_t ref_count; @@ -516,6 +555,14 @@ namespace { } catch (std::exception &e) { ref_count = -1; } + + if (check_for_strings_) { + auto ss = scan_strings(rr); + if (ss) { + strings_.insert(make_pair(b, *ss)); + } + } + return factory_.convert_to_range(rr, ref_count); } @@ -531,17 +578,24 @@ namespace { std::unique_ptr run_range_; range_factory factory_; + + bool check_for_strings_; + map> strings_; }; //------------------------------------------------------------------- struct flags { - flags(): exclusive_(true) { + flags() + : exclusive_(true), + examine_corruption_(false) + { } boost::optional scan_begin_; boost::optional scan_end_; bool exclusive_; + bool examine_corruption_; }; int scan_metadata_(string const &input, @@ -552,11 +606,26 @@ namespace { block_address scan_begin = f.scan_begin_ ? *f.scan_begin_ : 0; block_address scan_end = f.scan_end_ ? *f.scan_end_ : bm->get_nr_blocks(); - metadata_scanner scanner(bm, scan_begin, scan_end); + metadata_scanner scanner(bm, scan_begin, scan_end, f.examine_corruption_); std::unique_ptr r; while ((r = scanner.get_range())) { out << *r << std::endl; } + + if (f.examine_corruption_) { + auto ss = scanner.get_strings(); + + for (auto const &ps : ss) { + out << ps.first << ": "; + + unsigned total = 0; + for (auto const &s : ps.second) + total += s.length(); + + out << total << " bytes of text\n"; + } + } + return 0; } @@ -592,6 +661,7 @@ thin_scan_cmd::usage(std::ostream &out) const { << " {-o|--output} \n" << " {--begin} \n" << " {--end} \n" + << " {--examine-corruption}\n" << " {-V|--version}" << endl; } @@ -605,6 +675,7 @@ thin_scan_cmd::run(int argc, char **argv) { "version", no_argument, NULL, 'V'}, { "begin", required_argument, NULL, 1}, { "end", required_argument, NULL, 2}, + { "examine-corruption", no_argument, NULL, 3 }, { NULL, no_argument, NULL, 0 } }; boost::optional output; @@ -643,6 +714,10 @@ thin_scan_cmd::run(int argc, char **argv) } break; + case 3: + f.examine_corruption_ = true; + break; + default: usage(cerr); return 1;