[thin_repair] work in progress

This commit is contained in:
Joe Thornber 2019-03-29 12:56:31 +00:00
parent 8abac422b2
commit aace49cdd6
5 changed files with 456 additions and 17 deletions

View File

@ -18,19 +18,22 @@
;;; node though, can't be used to differentiate).
(define (thin-check metadata root)
(let ((exit-code (system
(fmt #f "../bin/thin_check --skip-mappings --override-mapping-root "
(fmt #f "../bin/thin_check --override-mapping-root "
root " " metadata " > /dev/null 2>&1"))))
(fmt #t "exit-code: " (wrt exit-code) ", ")
(zero? exit-code)))
(define metadata "../customer-metadata-full.bin")
;(define metadata "../metadataoriginal")
;(define nr-metadata-blocks 262144)
(define metadata "/home/ejt/work/RedHat/rhel7-vm/dmtest/metadata.bin")
(define nr-metadata-blocks 32768)
;; Fetches block 0 from the cache (using the flags returned by get-flags)
;; and converts it with block->superblock.
;; FIXME: aren't we returning a reference to a cache page and then dropping the lock?
(define (read-superblock cache)
(with-block (b cache 0 (get-flags))
(block->superblock b)))
(define nr-metadata-blocks 8192)
;; True when block index i lies below nr-metadata-blocks.
(define (in-metadata-bounds? i)
  (> nr-metadata-blocks i))
@ -87,6 +90,11 @@
(< (ftype-ref BTreeNodeHeader (nr-entries) hdr)
(/ (ftype-ref BTreeNodeHeader (max-entries) hdr) 3))))
;; A node is classed as a device-details leaf when it is a leaf and its
;; header's value-size equals the size of a ThinDeviceDetails record.
;; The block argument b is accepted but not inspected.
(define (details-leaf? b hdr)
  (if (leaf-node? hdr)
      (= (ftype-sizeof ThinDeviceDetails)
         (ftype-ref BTreeNodeHeader (value-size) hdr))
      #f))
(define (classify-node b hdr)
(fold-left append '()
(map (lambda (pair)
@ -95,7 +103,8 @@
'()))
`((,internal? internal)
(,bottom-level-leaf? bottom-level-leaf)
(,top-level-leaf? top-level-leaf)))))
(,top-level-leaf? top-level-leaf)
(,details-leaf? details-leaf)))))
;; Checksums block b with checksum-block, passing the size of an
;; unsigned-32 and btree-node-salt.
;; NOTE(review): presumably the size argument skips the stored checksum
;; field at the start of the node -- confirm against checksum-block.
(define (checksum-btree-node b)
(checksum-block b (ftype-sizeof unsigned-32) btree-node-salt))
@ -120,6 +129,7 @@
(define (dump-device-details cache)
(let ((sb (read-superblock cache)))
(fmt #t sb)
(device-tree-each cache (ftype-ref ThinSuperblock (device-details-root) sb)
(lambda (dev-id dd)
(fmt #t "dev-id: " dev-id "\n"
@ -139,14 +149,16 @@
;; An interesting node has more than one class.
(define (filter-interesting-blocks classes)
(filter (lambda (xs)
(> (length xs) 1))
(>= (length xs) 1))
classes))
;; Formats every element of xs in turn with (fmt #t ...), each followed by nl.
(define (fmt-list xs)
  (let loop ((remaining xs))
    (unless (null? remaining)
      (fmt #t (car remaining) nl)
      (loop (cdr remaining)))))
(with-bcache (cache metadata (* 16 1024))
(dump-device-details cache)
(let ((classes (classify-nodes cache))
(rmap (make-eq-hashtable)))
(fmt #t (filter-interesting-blocks classes))))
(fmt-list (filter-interesting-blocks classes))))
#|
(with-bcache (cache metadata (* 16 1024))
@ -159,7 +171,7 @@
#|
(let loop ((i 0)
(successes '()))
(if (> i 8192)
(if (> i nr-metadata-blocks)
(fmt #t "successes: " (wrt successes) "\n")
;; add --ignore-non-fatal-errors flag
(if (thin-check "../customer-metadata-full.bin" i)
@ -169,4 +181,4 @@
(begin
(fmt #t "fail: " i "\n")
(loop (+ i 1) successes)))))
|#
|#

View File

@ -1,6 +1,9 @@
#ifndef PERSISTENT_DATA_DATA_STRUCTURES_SIMPLE_TRAITS_H
#define PERSISTENT_DATA_DATA_STRUCTURES_SIMPLE_TRAITS_H
#include "base/endian_utils.h"
#include "persistent-data/data-structures/btree.h"
//----------------------------------------------------------------
namespace persistent_data {

View File

@ -20,11 +20,358 @@
#include "thin-provisioning/metadata_dumper.h"
#include "thin-provisioning/mapping_tree.h"
#include <map>
#include <vector>
using namespace persistent_data;
using namespace thin_provisioning;
//----------------------------------------------------------------
// We only need to examine the mapping tree, and device details tree.
// The space maps can be inferred.
// Repair process:
// - We only trigger the repair process if there's damage when walking from
// the roots given in the superblock.
// - If there is damage, then we try and find the most recent roots with the
// least corruption. We're seeing cases where just the superblock has been
// trashed so finding the best roots is essential, and sadly non-trivial.
// Finding roots:
// This is about classifying and summarising btree nodes. The use of a btree
// node may not be obvious when inspecting it in isolation. But more information
// may be gleaned by examining child and sibling nodes.
//
// So the process is:
// - scan every metadata block, summarising its potential uses.
// - repeatedly iterate those summaries until we can glean no more useful information.
// - sort candidate roots, choose best
// Summary information:
// - btree; mapping top level, mapping bottom level, device tree (more than one possible)
// - node type; internal or leaf
// - age; for mapping trees we can infer a minimum age from the block/time
// values. In addition two similar leaf nodes can be compared by looking
// at the block/time for _specific_ blocks. This means we can define an ordering
// on the ages, but not equality.
// - Device details can be aged based on the last_snapshot_time field.
// Iteration of summary info:
// - constraints propagate both up and down the trees. eg, node 'a' may
// be ambiguous (all internal nodes are ambiguous). If we find that all its
// children are device details trees, then we infer that this is too and lose
// the ambiguity. Now if it has a sibling we can infer on this too.
// - Some characteristics only propagate upwards. eg, age. So we need two monoids
// for summary info (up and down).
namespace {
using namespace std;
using namespace boost;
using namespace persistent_data::btree_detail;
using namespace thin_provisioning::device_tree_detail;
// Bit positions for node_info::types (used there as 1 << bit).  Each one
// names a kind of btree that a node might belong to: the top level of the
// mapping tree, the bottom level of the mapping tree, or the device
// details tree.
enum btree_type_bit {
TOP_LEVEL,
BOTTOM_LEVEL,
DEVICE_DETAILS
};
// Summary of what one metadata block could be.  A freshly constructed
// summary has no candidate types, is assumed to be an orphaned leaf and
// has zeroed counters and keys.
struct node_info {
	// Record that this node may belong to the given kind of btree.
	void add_type(btree_type_bit b) {
		types |= (1 << b);
	}

	// Rule out the given kind of btree for this node.
	void clear_type(btree_type_bit b) {
		types &= ~(1 << b);
	}

	// Is the given kind of btree still a possibility for this node?
	bool has_type(btree_type_bit b) const {
		return (types & (1 << b)) != 0;
	}

	// Bitmask indexed by btree_type_bit.  Corruption is indicated by
	// having no bits set.
	unsigned types = 0;

	// common
	block_address b = 0;       // block this summary describes
	unsigned values = 0;       // number of values in/under this node
	bool orphan = true;        // true until a good parent claims the node
	bool is_leaf = true;
	uint64_t key_low = 0;
	uint64_t key_high = 0;
	set<uint32_t> devices;
	uint32_t age = 0;
};
using info_map = map<block_address, node_info>;
// Runs the btree node validator's raw check over block b and reports
// whether the block passes.  The read lock is only held for the duration
// of the check.
bool is_btree_node(block_manager<> &bm, block_address b) {
	auto validator = create_btree_node_validator();
	auto locked = bm.read_lock(b);
	auto const *raw = locked.data();

	return validator->check_raw(raw);
}
// The age of a device details record is the later of its creation time
// and its snapshotted time.
uint32_t get_dd_age(device_details const &dd) {
	if (dd.creation_time_ < dd.snapshotted_time_)
		return dd.snapshotted_time_;

	return dd.creation_time_;
}
// First pass of root finding: visits every block of the block manager and,
// for each one that passes is_btree_node(), stores a node_info summary in
// result keyed by block address.
//
// - Internal nodes are ambiguous, so they start with every btree type set.
// - Leaves are classified by the value size in their header:
//     * device details sized values => DEVICE_DETAILS, aged via get_dd_age()
//     * 64 bit values               => BOTTOM_LEVEL, and additionally
//       TOP_LEVEL if every value is a block address within the device.
//   Leaves with any other value size are not recorded at all.
void scan_initial_infos(block_manager<> &bm, info_map &result) {
	auto const nr_blocks = bm.get_nr_blocks();

	for (block_address blk = 0; blk < nr_blocks; ++blk) {
		if (!is_btree_node(bm, blk))
			continue;

		node_info summary;
		summary.b = blk;

		auto rr = bm.read_lock(blk);
		auto const *header = reinterpret_cast<node_header const *>(rr.data());
		bool const internal = (to_cpu<uint32_t>(header->flags) & INTERNAL_NODE) != 0;
		summary.is_leaf = !internal;

		if (internal) {
			// Can't tell which tree an internal node belongs to yet.
			summary.add_type(TOP_LEVEL);
			summary.add_type(BOTTOM_LEVEL);
			summary.add_type(DEVICE_DETAILS);
			result.insert(make_pair(blk, summary));
			continue;
		}

		auto const value_size = to_cpu<uint32_t>(header->value_size);
		summary.values = to_cpu<uint32_t>(header->nr_entries);

		if (value_size == sizeof(device_details_traits::disk_type)) {
			summary.add_type(DEVICE_DETAILS);

			auto node = to_node<device_details_traits>(rr);
			auto const nr_entries = node.get_nr_entries();
			if (nr_entries) {
				summary.key_low = node.key_at(0);
				summary.key_high = node.key_at(nr_entries - 1);
			}

			for (unsigned i = 0; i < nr_entries; ++i)
				summary.age = max(summary.age, get_dd_age(node.value_at(i)));

		} else if (value_size == sizeof(uint64_t)) {
			summary.add_type(BOTTOM_LEVEL);

			auto node = to_node<block_traits>(rr);
			auto const nr_entries = node.get_nr_entries();
			if (nr_entries) {
				summary.key_low = node.key_at(0);
				summary.key_high = node.key_at(nr_entries - 1);
			}

			// This can only be a top level leaf if all the values are
			// blocks on the metadata device.
			bool all_on_device = true;
			for (unsigned i = 0; all_on_device && i < nr_entries; ++i)
				all_on_device = node.value_at(i) < nr_blocks;

			if (all_on_device)
				summary.add_type(TOP_LEVEL);

		} else
			continue;

		result.insert(make_pair(blk, summary));
	}
}
// If the parent still claims btree type b but the child does not, the
// claim is withdrawn from the parent.  Returns true when the parent was
// modified.
bool merge_types(node_info &parent, node_info const &child, btree_type_bit b) {
	bool const withdraw = parent.has_type(b) && !child.has_type(b);

	if (withdraw)
		parent.clear_type(b);

	return withdraw;
}
// Narrows the parent's candidate btree types using those of one child:
// any type the child cannot be is removed from the parent.
//
// return true if something changed
bool merge_from_below(node_info &parent, node_info const &child) {
	// Every type has to be merged on every call.  Chaining the calls
	// directly with || would short-circuit after the first change and
	// leave the remaining types to be picked up by a later pass.
	bool const top_changed = merge_types(parent, child, TOP_LEVEL);
	bool const bottom_changed = merge_types(parent, child, BOTTOM_LEVEL);
	bool const details_changed = merge_types(parent, child, DEVICE_DETAILS);

	return top_changed || bottom_changed || details_changed;
}
// Marks a node as failed by clearing every candidate btree type.
void fail(node_info &n) {
n.types = 0;
}
// A node with no candidate btree types left is considered failed.
bool failed(node_info const &n) {
return n.types == 0;
}
// Performs a single pass over every internal node in infos, refining the
// parent's summary from the summaries of the blocks it points at.
// Returns true if the pass cleared a type bit or raised an age, in which
// case another pass may be able to infer more.
bool iterate_infos_(block_manager<> &bm, info_map &infos) {
bool changed = false;
for (auto &p : infos) {
auto &parent = p.second;
// Leaves have no children to learn from.
if (parent.is_leaf)
continue;
// values refer to blocks, so we should have infos for them.
auto rr = bm.read_lock(p.first);
auto n = to_node<block_traits>(rr);
// key_low tracks the highest key seen in the children so far; values
// accumulates the children's value counts.
uint64_t key_low = 0;
unsigned values = 0;
for (unsigned i = 0; i < n.get_nr_entries(); i++) {
auto it = infos.find(n.value_at(i));
// A child we hold no summary for means the parent can't be trusted.
if (it == infos.end()) {
fail(parent);
break;
}
auto &child = it->second;
// we use the keys to help decide if this is a valid child
// NOTE(review): key_low starts at 0 and the test is <=, so a first
// child whose key_low is 0 fails the parent.  scan_initial_infos only
// assigns key_low/key_high for leaves, so internal children keep the
// default 0 as well -- confirm this is intended.
if (child.key_low <= key_low) {
fail(parent);
break;
} else
key_low = child.key_high;
changed = merge_from_below(parent, child) || changed;
// Age only propagates upwards, and only while the parent could still be
// part of a device details tree.
if (parent.has_type(DEVICE_DETAILS) && child.age > parent.age) {
changed = true;
parent.age = child.age;
}
values += child.values;
}
// We don't clear the orphan flags until we know the parent is good
if (!failed(parent)) {
parent.values = values;
for (unsigned i = 0; i < n.get_nr_entries(); i++) {
auto it = infos.find(n.value_at(i));
if (it == infos.end())
throw runtime_error("no child info, but it was there a moment ago");
auto &child = it->second;
child.orphan = false;
}
}
}
return changed;
}
// Keeps running refinement passes over the summaries until a pass reports
// that nothing changed.
void iterate_infos(block_manager<> &bm, info_map &infos) {
	bool progressed;

	do {
		progressed = iterate_infos_(bm, infos);
	} while (progressed);
}
// A mapping tree and a device tree are compatible when every device id
// recorded for the mapping tree is also recorded for the device tree.
bool trees_are_compatible(node_info const &mapping, node_info const &devices) {
	auto const &known = devices.devices;

	for (auto it = mapping.devices.begin(); it != mapping.devices.end(); ++it) {
		if (!known.count(*it))
			return false;
	}

	return true;
}
// Ordering predicate that places the node with the greater age first.
bool cmp_mapping_info(node_info const &lhs, node_info const &rhs) {
return lhs.age > rhs.age;
}
// Free-function form of the type test: is the given bit position set in
// the node's types mask?
bool has_type(node_info const &i, unsigned bit) {
	auto const mask = 1 << bit;

	return (i.types & mask) != 0;
}
// Collects copies of every summary that is still an orphan (no good
// parent claimed it) and could be a top level mapping node.  The result
// is returned in info_map iteration order; sorting is currently disabled.
vector<node_info>
extract_mapping_candidates(info_map const &infos) {
	vector<node_info> roots;

	for (auto it = infos.begin(); it != infos.end(); ++it) {
		node_info const &info = it->second;

		if (!info.orphan)
			continue;

		if (has_type(info, TOP_LEVEL))
			roots.push_back(info);
	}

	//sort(roots.begin(), roots.end(), cmp_mapping_info);

	return roots;
}
// Ordering predicate for device tree candidates: the candidate with the
// greater age sorts first.
//
// node_info::age is filled in from get_dd_age() for device details leaves
// and is propagated up to their parents by iterate_infos_(), so it is the
// field to compare here.  The previous stub always returned false, which
// gave the sort in extract_device_candidates() nothing to order by.
bool cmp_device_info(node_info const &lhs, node_info const &rhs) {
	return lhs.age > rhs.age;
}
// Collects copies of every summary that is still an orphan and could be
// part of a device details tree, then sorts them with cmp_device_info.
vector<node_info>
extract_device_candidates(info_map const &infos) {
	vector<node_info> roots;

	for (auto it = infos.begin(); it != infos.end(); ++it) {
		node_info const &info = it->second;

		if (!info.orphan)
			continue;

		if (has_type(info, DEVICE_DETAILS))
			roots.push_back(info);
	}

	sort(roots.begin(), roots.end(), cmp_device_info);

	return roots;
}
// Scans the whole device, iterates the summaries to a fixed point and
// then lists the candidate mapping and device tree roots on cerr.
//
// Work in progress: the intent is to return <mapping root>, <dev details
// root>, but choosing a compatible pair is still compiled out, so for now
// this only reports and returns nothing.
//pair<block_address, block_address>
void
find_best_roots(block_manager<> &bm) {
	info_map infos;
	scan_initial_infos(bm, infos);
	iterate_infos(bm, infos);

	// These will be sorted into best first order
	vector<node_info> mapping_candidates = extract_mapping_candidates(infos);
	vector<node_info> device_candidates = extract_device_candidates(infos);

	// Writes a heading carrying the candidate count, then one line per
	// candidate.
	auto report = [](char const *heading, vector<node_info> const &candidates) {
		cerr << heading << candidates.size() << "):\n";
		for (auto const &c : candidates)
			cerr << c.b << ", tree size = " << c.values << ", age = " << c.age << "\n";
	};

	report("mapping candidates (", mapping_candidates);
	report("\ndevice candidates (", device_candidates);

#if 0
	// Choose the best mapping tree, and then the best device tree
	// that is compatible.
	for (auto &m : mapping_candidates)
		for (auto &d : device_candidates)
			if (trees_are_compatible(m, d))
				return make_pair(m.b, d.b);
#endif
	// throw runtime_error("no compatible mapping/device trees");
}
}
//----------------------------------------------------------------
namespace {
void raise_metadata_damage() {
throw std::runtime_error("metadata contains errors (run thin_check for details).\n"
@ -197,7 +544,7 @@ namespace {
try {
if (!opts_.skip_mappings_)
emit_mappings(dev_id, tree_root);
} catch (exception &e) {
} catch (std::exception &e) {
cerr << e.what();
e_->end_device();
throw;
@ -246,6 +593,8 @@ namespace {
void
thin_provisioning::metadata_dump(metadata::ptr md, emitter::ptr e, dump_options const &opts)
{
find_best_roots(*md->tm_->get_bm());
details_extractor de(opts);
device_tree_detail::damage_visitor::ptr dd_policy(details_damage_policy(opts.repair_));
walk_device_tree(*md->details_, de, *dd_policy);

View File

@ -52,7 +52,7 @@ namespace {
metadata::ptr open_metadata(string const &path, struct flags &flags) {
block_manager<>::ptr bm = open_bm(path, block_manager<>::READ_ONLY, !flags.use_metadata_snap);
metadata::ptr md(flags.use_metadata_snap ? new metadata(bm, flags.snap_location) : new metadata(bm));
metadata::ptr md(flags.use_metadata_snap ? new metadata(bm, flags.snap_location) : new metadata(bm, false));
return md;
}

View File

@ -30,6 +30,8 @@
#include "thin-provisioning/commands.h"
#include "version.h"
using namespace boost;
using namespace std;
using namespace thin_provisioning;
//----------------------------------------------------------------
@ -463,11 +465,13 @@ namespace {
class metadata_scanner {
public:
metadata_scanner(block_manager<>::ptr bm, uint64_t scan_begin, uint64_t scan_end)
metadata_scanner(block_manager<>::ptr bm, uint64_t scan_begin, uint64_t scan_end,
bool check_for_strings)
: bm_(bm),
scan_begin_(scan_begin),
scan_end_(scan_end),
index_(scan_begin) {
index_(scan_begin),
check_for_strings_(check_for_strings) {
if (scan_end_ <= scan_begin_)
throw std::runtime_error("badly formed region (end <= begin)");
@ -486,8 +490,6 @@ namespace {
run_range_ = r.clone();
}
virtual ~metadata_scanner() {}
std::unique_ptr<block_range> get_range() {
std::unique_ptr<block_range> ret;
@ -507,7 +509,44 @@ namespace {
return ret;
}
// Read-only access to the text runs collected so far, keyed by the
// address of the block they were found in.  Returns a reference to the
// scanner's own map, so no copy is made here.
map<block_address, vector<string>> const &get_strings() const {
return strings_;
}
private:
// Returns true if c is an alphanumeric or punctuation character, i.e. the
// kind of byte that suggests human readable text.
bool interesting_char(char c)
{
	// The <cctype> classifiers require a value representable as unsigned
	// char (or EOF).  Plain char may be signed, and metadata blocks hold
	// arbitrary bytes, so convert before classifying.
	auto const uc = static_cast<unsigned char>(c);

	return isalnum(uc) || ispunct(uc);
}
// Counts how many consecutive interesting characters start at b, never
// looking at or beyond e.
unsigned printable_len(const char *b, const char *e)
{
	unsigned run = 0;

	for (const char *cursor = b; cursor != e && interesting_char(*cursor); ++cursor)
		++run;

	return run;
}
// ASCII text within our metadata is a sure sign of corruption.
//
// Walks the MD_BLOCK_SIZE bytes of the block and collects every run of at
// least 4 interesting characters.  Returns the runs found, or an empty
// optional if there were none.
optional<vector<string> >
scan_strings(block_manager<>::read_ref rr)
{
	vector<string> r;

	const char *data = reinterpret_cast<const char *>(rr.data());
	const char *const end = data + MD_BLOCK_SIZE;

	while (data < end) {
		auto len = printable_len(data, end);
		if (len >= 4)
			r.push_back(string(data, data + len));

		// Step over the run, then over the byte that terminated it.
		// When the run reaches the end of the block there is no
		// terminator, and stepping again would form a pointer beyond
		// one-past-the-end.
		data += len;
		if (data < end)
			++data;
	}

	if (r.empty())
		return optional<vector<string> >();

	return optional<vector<string> >(r);
}
block_range const &read_block(block_address b) {
block_manager<>::read_ref rr = bm_->read_lock(b);
int64_t ref_count;
@ -516,6 +555,14 @@ namespace {
} catch (std::exception &e) {
ref_count = -1;
}
if (check_for_strings_) {
auto ss = scan_strings(rr);
if (ss) {
strings_.insert(make_pair(b, *ss));
}
}
return factory_.convert_to_range(rr, ref_count);
}
@ -531,17 +578,24 @@ namespace {
std::unique_ptr<block_range> run_range_;
range_factory factory_;
bool check_for_strings_;
map<block_address, vector<string>> strings_;
};
//-------------------------------------------------------------------
struct flags {
flags(): exclusive_(true) {
flags()
: exclusive_(true),
examine_corruption_(false)
{
}
boost::optional<block_address> scan_begin_;
boost::optional<block_address> scan_end_;
bool exclusive_;
bool examine_corruption_;
};
int scan_metadata_(string const &input,
@ -552,11 +606,26 @@ namespace {
block_address scan_begin = f.scan_begin_ ? *f.scan_begin_ : 0;
block_address scan_end = f.scan_end_ ? *f.scan_end_ : bm->get_nr_blocks();
metadata_scanner scanner(bm, scan_begin, scan_end);
metadata_scanner scanner(bm, scan_begin, scan_end, f.examine_corruption_);
std::unique_ptr<block_range> r;
while ((r = scanner.get_range())) {
out << *r << std::endl;
}
if (f.examine_corruption_) {
	// get_strings() returns a const reference to the scanner's map; bind
	// to it rather than copying every collected string.
	auto const &ss = scanner.get_strings();

	// For each block that held text, report the total number of bytes
	// across all of its runs.
	for (auto const &ps : ss) {
		out << ps.first << ": ";

		unsigned total = 0;
		for (auto const &s : ps.second)
			total += s.length();

		out << total << " bytes of text\n";
	}
}
return 0;
}
@ -592,6 +661,7 @@ thin_scan_cmd::usage(std::ostream &out) const {
<< " {-o|--output} <xml file>\n"
<< " {--begin} <block#>\n"
<< " {--end} <block#>\n"
<< " {--examine-corruption}\n"
<< " {-V|--version}" << endl;
}
@ -605,6 +675,7 @@ thin_scan_cmd::run(int argc, char **argv)
{ "version", no_argument, NULL, 'V'},
{ "begin", required_argument, NULL, 1},
{ "end", required_argument, NULL, 2},
{ "examine-corruption", no_argument, NULL, 3 },
{ NULL, no_argument, NULL, 0 }
};
boost::optional<string> output;
@ -643,6 +714,10 @@ thin_scan_cmd::run(int argc, char **argv)
}
break;
case 3:
f.examine_corruption_ = true;
break;
default:
usage(cerr);
return 1;