// Copyright (C) 2011 Red Hat, Inc. All rights reserved. // // This file is part of the thin-provisioning-tools source. // // thin-provisioning-tools is free software: you can redistribute it // and/or modify it under the terms of the GNU General Public License // as published by the Free Software Foundation, either version 3 of // the License, or (at your option) any later version. // // thin-provisioning-tools is distributed in the hope that it will be // useful, but WITHOUT ANY WARRANTY; without even the implied warranty // of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU General Public License for more details. // // You should have received a copy of the GNU General Public License along // with thin-provisioning-tools. If not, see // . #include "btree.h" #include "persistent-data/errors.h" #include "persistent-data/checksum.h" #include "persistent-data/transaction_manager.h" #include "persistent-data/validators.h" #include #include #include //---------------------------------------------------------------- namespace { using namespace base; using namespace persistent_data; using namespace btree_detail; using namespace std; struct frame { frame(block_address blocknr, uint32_t level, uint32_t nr_entries) : blocknr_(blocknr), level_(level), nr_entries_(nr_entries), current_child_(0) { } block_address blocknr_; uint32_t level_; uint32_t nr_entries_; uint32_t current_child_; }; // stack for postorder DFS traversal // TODO: Refactor it into a spine-like class, e.g., btree_del_spine, // "Spine" sounds better for btree operations. struct btree_del_stack { public: btree_del_stack(transaction_manager &tm): tm_(tm) { } void push_frame(block_address blocknr, uint32_t level, uint32_t nr_entries) { if (tm_.get_sm()->get_count(blocknr) > 1) tm_.get_sm()->dec(blocknr); else spine_.push(frame(blocknr, level, nr_entries)); } void pop_frame() { tm_.get_sm()->dec(spine_.top().blocknr_); spine_.pop(); } frame &top_frame() { return spine_.top(); } bool is_empty() { return spine_.empty(); } private: transaction_manager &tm_; std::stack spine_; }; } //---------------------------------------------------------------- namespace persistent_data { inline void ro_spine::step(block_address b) { spine_.push_back(tm_.read_lock(b, validator_)); if (spine_.size() > 2) spine_.pop_front(); } inline bool shadow_spine::step(block_address b) { pair p = tm_.shadow(b, validator_); try { step(p.first); } catch (...) { tm_.get_sm()->dec(p.first.get_location()); throw; } return p.second; } //---------------------------------------------------------------- template node_ref::node_ref(block_address location, disk_node *raw) : location_(location), raw_(raw), checked_(false) { } template uint32_t node_ref::get_checksum() const { return to_cpu(raw_->header.csum); } template block_address node_ref::get_block_nr() const { return to_cpu(raw_->header.blocknr); } template btree_detail::node_type node_ref::get_type() const { uint32_t flags = to_cpu(raw_->header.flags); if (flags & INTERNAL_NODE) { if (flags & LEAF_NODE) { ostringstream out; out << "btree node is both internal and leaf" << " (block " << location_ << ")"; throw runtime_error(out.str()); } return INTERNAL; } else if (flags & LEAF_NODE) return LEAF; else { ostringstream out; out << "unknown node type" << " (block " << location_ << ")"; throw runtime_error(out.str()); } } template void node_ref::set_type(node_type t) { uint32_t flags = to_cpu(raw_->header.flags); switch (t) { case INTERNAL: flags = INTERNAL_NODE; break; case LEAF: flags = LEAF_NODE; break; } raw_->header.flags = to_disk(flags); } template unsigned node_ref::get_nr_entries() const { return to_cpu(raw_->header.nr_entries); } template void node_ref::set_nr_entries(unsigned n) { raw_->header.nr_entries = to_disk(n); } template unsigned node_ref::get_max_entries() const { return to_cpu(raw_->header.max_entries); } template void node_ref::set_max_entries(unsigned n) { raw_->header.max_entries = to_disk(n); } template void node_ref::set_max_entries() { set_max_entries(calc_max_entries()); } template size_t node_ref::get_value_size() const { return to_cpu(raw_->header.value_size); } template void node_ref::set_value_size(size_t s) { raw_->header.value_size = to_disk(static_cast(s)); } template uint64_t node_ref::key_at(unsigned i) const { if (i >= get_nr_entries()) throw runtime_error("key index out of bounds"); return to_cpu(raw_->keys[i]); } template void node_ref::set_key(unsigned i, uint64_t k) { raw_->keys[i] = to_disk(k); } template typename ValueTraits::value_type node_ref::value_at(unsigned i) const { if (i >= get_nr_entries()) throw runtime_error("value index out of bounds"); // We have to copy because of alignment issues. typename ValueTraits::disk_type d; ::memcpy(&d, value_ptr(i), sizeof(d)); typename ValueTraits::value_type v; ValueTraits::unpack(d, v); return v; } template void node_ref::set_value(unsigned i, typename ValueTraits::value_type const &v) { typename ValueTraits::disk_type d; ValueTraits::pack(v, d); ::memcpy(value_ptr(i), &d, sizeof(d)); } template void node_ref::insert_at(unsigned i, uint64_t key, typename ValueTraits::value_type const &v) { unsigned n = get_nr_entries(); if ((n + 1) > get_max_entries()) throw runtime_error("too many entries"); set_nr_entries(n + 1); ::memmove(key_ptr(i + 1), key_ptr(i), sizeof(uint64_t) * (n - i)); ::memmove(value_ptr(i + 1), value_ptr(i), sizeof(typename ValueTraits::disk_type) * (n - i)); overwrite_at(i, key, v); } template void node_ref::overwrite_at(unsigned i, uint64_t key, typename ValueTraits::value_type const &v) { set_key(i, key); set_value(i, v); } template void node_ref::delete_at(unsigned i) { unsigned nr_entries = get_nr_entries(); if (i >= nr_entries) throw runtime_error("key index out of bounds"); unsigned nr_to_copy = nr_entries - (i + 1); if (nr_to_copy) { ::memmove(key_ptr(i), key_ptr(i + 1), sizeof(uint64_t) * nr_to_copy); ::memmove(value_ptr(i), value_ptr(i + 1), sizeof(typename ValueTraits::disk_type) * nr_to_copy); } set_nr_entries(nr_entries - 1); } template void node_ref::copy_entries(node_ref const &rhs, unsigned begin, unsigned end) { unsigned count = end - begin; unsigned n = get_nr_entries(); if ((n + count) > get_max_entries()) throw runtime_error("too many entries"); ::memcpy(key_ptr(n), rhs.key_ptr(begin), sizeof(uint64_t) * count); ::memcpy(value_ptr(n), rhs.value_ptr(begin), sizeof(typename ValueTraits::disk_type) * count); set_nr_entries(n + count); } template void node_ref::move_entries(node_ref &rhs, int count) { if (!count) return; unsigned nr_left = get_nr_entries(); unsigned nr_right = rhs.get_nr_entries(); unsigned max_entries = get_max_entries(); if (nr_left - count > max_entries || nr_right - count > max_entries) throw runtime_error("too many entries"); if (count > 0) { rhs.shift_entries_right(count); copy_entries_to_right(rhs, count); } else { copy_entries_to_left(rhs, -count); rhs.shift_entries_left(-count); } set_nr_entries(nr_left - count); rhs.set_nr_entries(nr_right + count); } template void node_ref::copy_entries_to_left(node_ref const &rhs, unsigned count) { unsigned n = get_nr_entries(); if ((n + count) > get_max_entries()) throw runtime_error("too many entries"); ::memcpy(key_ptr(n), rhs.key_ptr(0), sizeof(uint64_t) * count); ::memcpy(value_ptr(n), rhs.value_ptr(0), sizeof(typename ValueTraits::disk_type) * count); } template void node_ref::copy_entries_to_right(node_ref &rhs, unsigned count) const { unsigned n = rhs.get_nr_entries(); if ((n + count) > get_max_entries()) throw runtime_error("too many entries"); unsigned nr_left = get_nr_entries(); ::memcpy(rhs.key_ptr(0), key_ptr(nr_left - count), sizeof(uint64_t) * count); ::memcpy(rhs.value_ptr(0), value_ptr(nr_left - count), sizeof(typename ValueTraits::disk_type) * count); } template void node_ref::shift_entries_left(unsigned shift) { unsigned n = get_nr_entries(); if (shift > n) throw runtime_error("too many entries"); unsigned nr_shifted = n - shift; ::memmove(key_ptr(0), key_ptr(shift), sizeof(uint64_t) * nr_shifted); ::memmove(value_ptr(0), value_ptr(shift), sizeof(typename ValueTraits::disk_type) * nr_shifted); } template void node_ref::shift_entries_right(unsigned shift) { unsigned n = get_nr_entries(); if (n + shift > get_max_entries()) throw runtime_error("too many entries"); ::memmove(key_ptr(shift), key_ptr(0), sizeof(uint64_t) * n); ::memmove(value_ptr(shift), value_ptr(0), sizeof(typename ValueTraits::disk_type) * n); } template unsigned node_ref::merge_threshold() const { return get_max_entries() / 3; } template int node_ref::bsearch(uint64_t key, int want_hi) const { int lo = -1, hi = get_nr_entries(); while(hi - lo > 1) { int mid = lo + ((hi - lo) / 2); uint64_t mid_key = key_at(mid); if (mid_key == key) return mid; if (mid_key < key) lo = mid; else hi = mid; } return want_hi ? hi : lo; } template boost::optional node_ref::exact_search(uint64_t key) const { int i = bsearch(key, 0); if (i < 0 || static_cast(i) >= get_nr_entries()) return boost::optional(); if (key != key_at(i)) return boost::optional(); return boost::optional(i); } template int node_ref::lower_bound(uint64_t key) const { return bsearch(key, 0); } template unsigned node_ref::calc_max_entries(void) { uint32_t total; // key + value size_t elt_size = sizeof(uint64_t) + sizeof(typename ValueTraits::disk_type); total = (MD_BLOCK_SIZE - sizeof(struct node_header)) / elt_size; return (total / 3) * 3; // rounds down } template void * node_ref::key_ptr(unsigned i) const { check_fits_within_block(); return raw_->keys + i; } template void * node_ref::value_ptr(unsigned i) const { check_fits_within_block(); void *value_base = &raw_->keys[to_cpu(raw_->header.max_entries)]; return static_cast(value_base) + sizeof(typename ValueTraits::disk_type) * i; } template template void node_ref::inc_children(RefCounter &rc) { unsigned nr_entries = get_nr_entries(); for (unsigned i = 0; i < nr_entries; i++) { typename ValueTraits::value_type v; typename ValueTraits::disk_type d; ::memcpy(&d, value_ptr(i), sizeof(d)); ValueTraits::unpack(d, v); rc.inc(v); } } template template void node_ref::dec_children(RefCounter &rc) { unsigned nr_entries = get_nr_entries(); for (unsigned i = 0; i < nr_entries; i++) { typename ValueTraits::value_type v; typename ValueTraits::disk_type d; ::memcpy(&d, value_ptr(i), sizeof(d)); ValueTraits::unpack(d, v); rc.dec(v); } } template bool node_ref::value_sizes_match() const { return sizeof(typename ValueTraits::disk_type) == get_value_size(); } template std::string node_ref::value_mismatch_string() const { std::ostringstream out; out << "value size mismatch: expected " << sizeof(typename ValueTraits::disk_type) << ", but got " << get_value_size() << " (block " << location_ << ")." << std::endl; return out.str(); } template void node_ref::check_fits_within_block() const { if (checked_) return; if (!value_sizes_match()) throw std::runtime_error(value_mismatch_string()); unsigned max = calc_max_entries(); if (max < get_nr_entries()) { std::ostringstream out; out << "Bad nr of elements: max per block = " << max << ", actual = " << get_nr_entries() << " (block " << location_ << ")" << std::endl; throw std::runtime_error(out.str()); } checked_ = true; } //-------------------------------- template btree:: btree(transaction_manager &tm, typename ValueTraits::ref_counter rc) : tm_(tm), destroy_(false), internal_rc_(tm.get_sm()), rc_(rc), validator_(create_btree_node_validator()) { using namespace btree_detail; write_ref root = tm_.new_block(validator_); if (Levels > 1) { internal_node n = to_node(root); n.set_type(btree_detail::LEAF); n.set_nr_entries(0); n.set_max_entries(); n.set_value_size(sizeof(typename block_traits::disk_type)); } else { leaf_node n = to_node(root); n.set_type(btree_detail::LEAF); n.set_nr_entries(0); n.set_max_entries(); n.set_value_size(sizeof(typename ValueTraits::disk_type)); } root_ = root.get_location(); } template btree:: btree(transaction_manager &tm, block_address root, typename ValueTraits::ref_counter rc) : tm_(tm), destroy_(false), root_(root), internal_rc_(tm.get_sm()), rc_(rc), validator_(create_btree_node_validator()) { } template btree::~btree() { } namespace { template struct lower_bound_search { static boost::optional search(btree_detail::node_ref n, uint64_t key) { int i = n.lower_bound(key); return (i < 0) ? boost::optional() : boost::optional(i); } }; template struct exact_search { static boost::optional search(btree_detail::node_ref n, uint64_t key) { return n.exact_search(key); } }; } template typename btree::maybe_value btree::lookup(key const &key) const { using namespace btree_detail; ro_spine spine(tm_, validator_); block_address root = root_; for (unsigned level = 0; level < Levels - 1; ++level) { boost::optional mroot = lookup_raw >(spine, root, key[level]); if (!mroot) return maybe_value(); root = *mroot; } return lookup_raw >(spine, root, key[Levels - 1]); } template typename btree::maybe_pair btree::lookup_le(key const &key) const { using namespace btree_detail; return maybe_pair(); } template typename btree::maybe_pair btree::lookup_ge(key const &key) const { using namespace btree_detail; return maybe_pair(); } template bool btree:: insert(key const &key, typename ValueTraits::value_type const &value) { using namespace btree_detail; block_address block = root_; int index = 0; // FIXME: ??? shadow_spine spine(tm_, validator_); for (unsigned level = 0; level < Levels - 1; ++level) { bool need_insert = insert_location(spine, block, key[level], &index, internal_rc_); internal_node n = spine.template get_node(); if (need_insert) { btree new_tree(tm_, rc_); n.insert_at(index, key[level], new_tree.get_root()); } block = n.value_at(index); } bool need_insert = insert_location(spine, block, key[Levels - 1], &index, rc_); leaf_node n = spine.template get_node(); if (need_insert) n.insert_at(index, key[Levels - 1], value); else // FIXME: check if we're overwriting with the same value. n.set_value(index, value); root_ = spine.get_root(); return need_insert; } template block_address btree::get_root() const { return root_; } template void btree::set_root(block_address root) { using namespace btree_detail; root_ = root; } template typename btree::ptr btree::clone() const { tm_.get_sm()->inc(root_); return ptr(new btree(tm_, root_, rc_)); } template void btree::destroy() { using namespace btree_detail; btree_del_stack s(tm_); { read_ref blk = tm_.read_lock(root_, validator_); internal_node n = to_node(blk); s.push_frame(root_, 0, n.get_nr_entries()); } while (!s.is_empty()) { frame &f = s.top_frame(); if (f.current_child_ >= f.nr_entries_) { s.pop_frame(); continue; } // FIXME: Cache the read_ref object in the stack to avoid temporary objects? read_ref current = tm_.read_lock(f.blocknr_, validator_); internal_node n = to_node(current); if (n.get_type() == INTERNAL) { // TODO: test performance penalty of prefetching //if (!f.current_child_) // for (unsigned i = 0; i < n.get_nr_entries(); i++) // tm_.prefetch(n.value_at(i)); block_address b = n.value_at(f.current_child_); read_ref leaf = tm_.read_lock(b, validator_); internal_node o = to_node(leaf); s.push_frame(b, f.level_, o.get_nr_entries()); ++f.current_child_; // internal leaf } else if (f.level_ < Levels - 1) { block_address b = n.value_at(f.current_child_); read_ref leaf = tm_.read_lock(b, validator_); internal_node o = to_node(leaf); s.push_frame(b, f.level_ + 1, o.get_nr_entries()); ++f.current_child_; } else { leaf_node o = to_node(current); o.dec_children(rc_); // FIXME: move this into pop_frame() s.pop_frame(); } } } template template boost::optional btree:: lookup_raw(ro_spine &spine, block_address block, uint64_t key) const { using namespace boost; typedef typename ValueTraits::value_type leaf_type; for (;;) { spine.step(block); node_ref leaf = spine.template get_node(); boost::optional mi; if (leaf.get_type() == btree_detail::LEAF) { mi = Search::search(leaf, key); if (!mi) return boost::optional(); return boost::optional(leaf.value_at(*mi)); } { int lb = leaf.lower_bound(key); if (lb < 0) return boost::optional(); mi = lb; } node_ref internal = spine.template get_node(); block = internal.value_at(*mi); } } template template void btree:: split_node(btree_detail::shadow_spine &spine, block_address parent_index, uint64_t key, bool top) { node_ref n = spine.template get_node(); if (n.get_nr_entries() == n.get_max_entries()) { if (top) split_beneath(spine, key); else split_sibling(spine, parent_index, key); } } template template void btree:: split_beneath(btree_detail::shadow_spine &spine, uint64_t key) { using namespace btree_detail; node_type type; unsigned nr_left, nr_right; write_ref left = tm_.new_block(validator_); node_ref l = to_node(left); l.set_nr_entries(0); l.set_max_entries(); l.set_value_size(sizeof(typename ValueTraits::disk_type)); write_ref right = tm_.new_block(validator_); node_ref r = to_node(right); r.set_nr_entries(0); r.set_max_entries(); r.set_value_size(sizeof(typename ValueTraits::disk_type)); { node_ref p = spine.template get_node(); if (p.get_value_size() != sizeof(typename ValueTraits::disk_type)) throw std::runtime_error("bad value_size"); nr_left = p.get_nr_entries() / 2; nr_right = p.get_nr_entries() - nr_left; type = p.get_type(); l.set_type(type); l.copy_entries(p, 0, nr_left); r.set_type(type); r.copy_entries(p, nr_left, nr_left + nr_right); } { // The parent may have changed value type, so we re-get it. internal_node p = spine.template get_node(); p.set_type(btree_detail::INTERNAL); p.set_max_entries(); p.set_nr_entries(2); p.set_value_size(sizeof(typename block_traits::disk_type)); p.overwrite_at(0, l.key_at(0), left.get_location()); p.overwrite_at(1, r.key_at(0), right.get_location()); } if (key < r.key_at(0)) spine.step(left); else spine.step(right); } template template void btree:: split_sibling(btree_detail::shadow_spine &spine, block_address parent_index, uint64_t key) { using namespace btree_detail; node_ref l = spine.template get_node(); block_address left = spine.get_block(); write_ref right = tm_.new_block(validator_); node_ref r = to_node(right); unsigned nr_left = l.get_nr_entries() / 2; unsigned nr_right = l.get_nr_entries() - nr_left; r.set_nr_entries(0); r.set_max_entries(); r.set_type(l.get_type()); r.set_value_size(sizeof(typename ValueTraits::disk_type)); r.copy_entries(l, nr_left, nr_left + nr_right); l.set_nr_entries(nr_left); internal_node p = spine.get_parent(); p.overwrite_at(parent_index, l.key_at(0), left); p.insert_at(parent_index + 1, r.key_at(0), right.get_location()); if (key >= r.key_at(0)) { spine.pop(); spine.step(right); } } // Returns true if we need a new insertion, rather than overwrite. template template bool btree:: insert_location(btree_detail::shadow_spine &spine, block_address block, uint64_t key, int *index, RC &leaf_rc) { using namespace btree_detail; bool top = true; // this isn't the same as spine.has_parent() int i = *index; bool inc = false; for (;;) { inc = spine.step(block); if (inc) inc_children(spine, leaf_rc); // patch up the parent to point to the new shadow if (spine.has_parent()) { internal_node p = spine.get_parent(); p.set_value(i, spine.get_block()); } internal_node internal = spine.template get_node(); // Split the node if we're full if (internal.get_type() == INTERNAL) split_node(spine, i, key, top); else split_node(spine, i, key, top); internal = spine.template get_node(); i = internal.lower_bound(key); if (internal.get_type() == btree_detail::LEAF) break; if (i < 0) { internal.set_key(0, key); i = 0; } block = internal.value_at(i); top = false; } node_ref leaf = spine.template get_node(); // FIXME: gross if (i < 0 || leaf.key_at(i) != key) i++; // do decrement the old value if it already exists // FIXME: I'm not sure about this, I don't understand the |inc| reference if (static_cast(i) < leaf.get_nr_entries() && leaf.key_at(i) == key && inc) { // dec old entry } *index = i; return ((static_cast(i) >= leaf.get_nr_entries()) || (leaf.key_at(i) != key)); } template void btree::visit_depth_first(visitor &v) const { node_location loc; walk_tree(v, loc, root_); v.visit_complete(); } template void btree::walk_tree(visitor &v, node_location const &loc, block_address b) const { try { walk_tree_internal(v, loc, b); } catch (std::runtime_error const &e) { switch (v.error_accessing_node(loc, b, e.what())) { case visitor::EXCEPTION_HANDLED: break; case visitor::RETHROW_EXCEPTION: throw; } } } template void btree::walk_tree_internal(visitor &v, node_location const &loc, block_address b) const { using namespace btree_detail; read_ref blk = tm_.read_lock(b, validator_); internal_node o = to_node(blk); // FIXME: use a switch statement if (o.get_type() == INTERNAL) { if (v.visit_internal(loc, o)) { for (unsigned i = 0; i < o.get_nr_entries(); i++) tm_.prefetch(o.value_at(i)); for (unsigned i = 0; i < o.get_nr_entries(); i++) { node_location loc2(loc); loc2.inc_depth(); loc2.key = o.key_at(i); walk_tree(v, loc2, o.value_at(i)); } } } else if (loc.path.size() < Levels - 1) { if (v.visit_internal_leaf(loc, o)) for (unsigned i = 0; i < o.get_nr_entries(); i++) { node_location loc2(loc); loc2.push_key(o.key_at(i)); loc2.key = boost::optional(); walk_tree(v, loc2, o.value_at(i)); } } else { leaf_node ov = to_node(blk); v.visit_leaf(loc, ov); } } template template void btree::inc_children(btree_detail::shadow_spine &spine, RefCounter &leaf_rc) { node_ref nr = spine.template get_node(); if (nr.get_type() == INTERNAL) nr.inc_children(internal_rc_); else { node_ref leaf = spine.template get_node(); leaf.inc_children(leaf_rc); } } } //----------------------------------------------------------------