// Copyright (C) 2011 Red Hat, Inc. All rights reserved. // // This file is part of the thin-provisioning-tools source. // // thin-provisioning-tools is free software: you can redistribute it // and/or modify it under the terms of the GNU General Public License // as published by the Free Software Foundation, either version 3 of // the License, or (at your option) any later version. // // thin-provisioning-tools is distributed in the hope that it will be // useful, but WITHOUT ANY WARRANTY; without even the implied warranty // of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU General Public License for more details. // // You should have received a copy of the GNU General Public License along // with thin-provisioning-tools. If not, see // . #include "btree.h" #include "persistent-data/errors.h" #include "persistent-data/checksum.h" #include "persistent-data/transaction_manager.h" #include //---------------------------------------------------------------- namespace { using namespace base; using namespace persistent_data; using namespace btree_detail; using namespace std; struct btree_node_validator : public block_manager<>::validator { virtual void check(buffer<> const &b, block_address location) const { disk_node const *data = reinterpret_cast(b.raw()); node_header const *n = &data->header; crc32c sum(BTREE_CSUM_XOR); sum.append(&n->flags, MD_BLOCK_SIZE - sizeof(uint32_t)); if (sum.get_sum() != to_cpu(n->csum)) throw checksum_error("bad checksum in btree node"); if (to_cpu(n->blocknr) != location) throw checksum_error("bad block nr in btree node"); } virtual void prepare(buffer<> &b, block_address location) const { disk_node *data = reinterpret_cast(b.raw()); node_header *n = &data->header; n->blocknr = to_disk(location); crc32c sum(BTREE_CSUM_XOR); sum.append(&n->flags, MD_BLOCK_SIZE - sizeof(uint32_t)); n->csum = to_disk(sum.get_sum()); } }; } //---------------------------------------------------------------- namespace persistent_data { inline void ro_spine::step(block_address b) { spine_.push_back(tm_->read_lock(b, validator_)); if (spine_.size() > 2) spine_.pop_front(); } inline bool shadow_spine::step(block_address b) { pair p = tm_->shadow(b, validator_); try { step(p.first); } catch (...) { tm_->get_sm()->dec(p.first.get_location()); throw; } return p.second; } //---------------------------------------------------------------- template node_ref::node_ref(block_address location, disk_node *raw) : location_(location), raw_(raw), checked_(false) { } template uint32_t node_ref::get_checksum() const { return to_cpu(raw_->header.csum); } template block_address node_ref::get_block_nr() const { return to_cpu(raw_->header.blocknr); } template btree_detail::node_type node_ref::get_type() const { uint32_t flags = to_cpu(raw_->header.flags); if (flags & INTERNAL_NODE) { if (flags & LEAF_NODE) throw runtime_error("btree node is both internal and leaf"); return INTERNAL; } else if (flags & LEAF_NODE) return LEAF; else throw runtime_error("unknown node type"); } template void node_ref::set_type(node_type t) { uint32_t flags = to_cpu(raw_->header.flags); switch (t) { case INTERNAL: flags = INTERNAL_NODE; break; case LEAF: flags = LEAF_NODE; break; } raw_->header.flags = to_disk(flags); } template unsigned node_ref::get_nr_entries() const { return to_cpu(raw_->header.nr_entries); } template void node_ref::set_nr_entries(unsigned n) { raw_->header.nr_entries = to_disk(n); } template unsigned node_ref::get_max_entries() const { return to_cpu(raw_->header.max_entries); } template void node_ref::set_max_entries(unsigned n) { raw_->header.max_entries = to_disk(n); } template void node_ref::set_max_entries() { set_max_entries(calc_max_entries()); } template size_t node_ref::get_value_size() const { return to_cpu(raw_->header.value_size); } template void node_ref::set_value_size(size_t s) { raw_->header.value_size = to_disk(static_cast(s)); } template uint64_t node_ref::key_at(unsigned i) const { if (i >= get_nr_entries()) throw runtime_error("key index out of bounds"); return to_cpu(raw_->keys[i]); } template void node_ref::set_key(unsigned i, uint64_t k) { raw_->keys[i] = to_disk(k); } template typename ValueTraits::value_type node_ref::value_at(unsigned i) const { if (i >= get_nr_entries()) throw runtime_error("value index out of bounds"); // We have to copy because of alignment issues. typename ValueTraits::disk_type d; ::memcpy(&d, value_ptr(i), sizeof(d)); typename ValueTraits::value_type v; ValueTraits::unpack(d, v); return v; } template void node_ref::set_value(unsigned i, typename ValueTraits::value_type const &v) { typename ValueTraits::disk_type d; ValueTraits::pack(v, d); ::memcpy(value_ptr(i), &d, sizeof(d)); } template void node_ref::insert_at(unsigned i, uint64_t key, typename ValueTraits::value_type const &v) { unsigned n = get_nr_entries(); if ((n + 1) > get_max_entries()) throw runtime_error("too many entries"); set_nr_entries(n + 1); ::memmove(key_ptr(i + 1), key_ptr(i), sizeof(uint64_t) * (n - i)); ::memmove(value_ptr(i + 1), value_ptr(i), sizeof(typename ValueTraits::disk_type) * (n - i)); overwrite_at(i, key, v); } template void node_ref::overwrite_at(unsigned i, uint64_t key, typename ValueTraits::value_type const &v) { set_key(i, key); set_value(i, v); } template void node_ref::copy_entries(node_ref const &rhs, unsigned begin, unsigned end) { unsigned count = end - begin; unsigned n = get_nr_entries(); if ((n + count) > get_max_entries()) throw runtime_error("too many entries"); ::memcpy(key_ptr(n), rhs.key_ptr(begin), sizeof(uint64_t) * count); ::memcpy(value_ptr(n), rhs.value_ptr(begin), sizeof(typename ValueTraits::disk_type) * count); set_nr_entries(n + count); } template int node_ref::bsearch(uint64_t key, int want_hi) const { int lo = -1, hi = get_nr_entries(); while(hi - lo > 1) { int mid = lo + ((hi - lo) / 2); uint64_t mid_key = key_at(mid); if (mid_key == key) return mid; if (mid_key < key) lo = mid; else hi = mid; } return want_hi ? hi : lo; } template boost::optional node_ref::exact_search(uint64_t key) const { int i = bsearch(key, 0); if (i < 0 || static_cast(i) >= get_nr_entries()) return boost::optional(); if (key != key_at(i)) return boost::optional(); return boost::optional(i); } template int node_ref::lower_bound(uint64_t key) const { return bsearch(key, 0); } template unsigned node_ref::calc_max_entries(void) { uint32_t total; // key + value size_t elt_size = sizeof(uint64_t) + sizeof(typename ValueTraits::disk_type); total = (MD_BLOCK_SIZE - sizeof(struct node_header)) / elt_size; return (total / 3) * 3; // rounds down } template void * node_ref::key_ptr(unsigned i) const { check_fits_within_block(); return raw_->keys + i; } template void * node_ref::value_ptr(unsigned i) const { check_fits_within_block(); void *value_base = &raw_->keys[to_cpu(raw_->header.max_entries)]; return static_cast(value_base) + sizeof(typename ValueTraits::disk_type) * i; } template template void node_ref::inc_children(RefCounter &rc) { unsigned nr_entries = get_nr_entries(); for (unsigned i = 0; i < nr_entries; i++) { typename ValueTraits::value_type v; typename ValueTraits::disk_type d; ::memcpy(&d, value_ptr(i), sizeof(d)); ValueTraits::unpack(d, v); rc.inc(v); } } template void node_ref::check_fits_within_block() const { if (checked_) return; if (sizeof(typename ValueTraits::disk_type) != get_value_size()) { std::ostringstream out; out << "value size mismatch: expected " << sizeof(typename ValueTraits::disk_type) << ", but got " << get_value_size() << ". This is not the btree you are looking for." << std::endl; throw std::runtime_error(out.str()); } unsigned max = calc_max_entries(); if (max < get_nr_entries()) { std::ostringstream out; out << "Bad nr of elements: max per block = " << max << ", actual = " << get_nr_entries() << std::endl; throw std::runtime_error(out.str()); } checked_ = true; } //-------------------------------- template btree:: btree(typename transaction_manager::ptr tm, typename ValueTraits::ref_counter rc) : tm_(tm), destroy_(false), internal_rc_(tm->get_sm()), rc_(rc), validator_(new btree_node_validator) { using namespace btree_detail; write_ref root = tm_->new_block(validator_); if (Levels > 1) { internal_node n = to_node(root); n.set_type(btree_detail::LEAF); n.set_nr_entries(0); n.set_max_entries(); n.set_value_size(sizeof(typename block_traits::disk_type)); } else { leaf_node n = to_node(root); n.set_type(btree_detail::LEAF); n.set_nr_entries(0); n.set_max_entries(); n.set_value_size(sizeof(typename ValueTraits::disk_type)); } root_ = root.get_location(); } template btree:: btree(typename transaction_manager::ptr tm, block_address root, typename ValueTraits::ref_counter rc) : tm_(tm), destroy_(false), root_(root), internal_rc_(tm->get_sm()), rc_(rc), validator_(new btree_node_validator) { } template btree::~btree() { } namespace { template struct lower_bound_search { static boost::optional search(btree_detail::node_ref n, uint64_t key) { return n.lower_bound(key); } }; template struct exact_search { static boost::optional search(btree_detail::node_ref n, uint64_t key) { return n.exact_search(key); } }; } template typename btree::maybe_value btree::lookup(key const &key) const { using namespace btree_detail; ro_spine spine(tm_, validator_); block_address root = root_; for (unsigned level = 0; level < Levels - 1; ++level) { boost::optional mroot = lookup_raw >(spine, root, key[level]); if (!mroot) return maybe_value(); root = *mroot; } return lookup_raw >(spine, root, key[Levels - 1]); } template typename btree::maybe_pair btree::lookup_le(key const &key) const { using namespace btree_detail; return maybe_pair(); } template typename btree::maybe_pair btree::lookup_ge(key const &key) const { using namespace btree_detail; return maybe_pair(); } template void btree:: insert(key const &key, typename ValueTraits::value_type const &value) { using namespace btree_detail; block_address block = root_; int index = 0; // FIXME: ??? shadow_spine spine(tm_, validator_); for (unsigned level = 0; level < Levels - 1; ++level) { bool need_insert = insert_location(spine, block, key[level], &index, internal_rc_); internal_node n = spine.template get_node(); if (need_insert) { btree new_tree(tm_, rc_); n.insert_at(index, key[level], new_tree.get_root()); } block = n.value_at(index); } bool need_insert = insert_location(spine, block, key[Levels - 1], &index, rc_); leaf_node n = spine.template get_node(); if (need_insert) n.insert_at(index, key[Levels - 1], value); else // FIXME: check if we're overwriting with the same value. n.set_value(index, value); root_ = spine.get_root(); } template void btree::remove(key const &key) { using namespace btree_detail; } template block_address btree::get_root() const { return root_; } template void btree::set_root(block_address root) { using namespace btree_detail; root_ = root; } template typename btree::ptr btree::clone() const { tm_->get_sm()->inc(root_); return ptr(new btree(tm_, root_, rc_)); } #if 0 template void btree::destroy() { using namespace btree_detail; } #endif template template boost::optional btree:: lookup_raw(ro_spine &spine, block_address block, uint64_t key) const { using namespace boost; typedef typename ValueTraits::value_type leaf_type; for (;;) { spine.step(block); node_ref leaf = spine.template get_node(); boost::optional mi; if (leaf.get_type() == btree_detail::LEAF) { mi = Search::search(leaf, key); if (!mi) return boost::optional(); return boost::optional(leaf.value_at(*mi)); } mi = leaf.lower_bound(key); if (!mi || *mi < 0) return boost::optional(); node_ref internal = spine.template get_node(); block = internal.value_at(*mi); } } template template void btree:: split_node(btree_detail::shadow_spine &spine, block_address parent_index, uint64_t key, bool top) { node_ref n = spine.template get_node(); if (n.get_nr_entries() == n.get_max_entries()) { if (top) split_beneath(spine, key); else split_sibling(spine, parent_index, key); } } template template void btree:: split_beneath(btree_detail::shadow_spine &spine, uint64_t key) { using namespace btree_detail; node_type type; unsigned nr_left, nr_right; write_ref left = tm_->new_block(validator_); node_ref l = to_node(left); l.set_nr_entries(0); l.set_max_entries(); l.set_value_size(sizeof(typename ValueTraits::disk_type)); write_ref right = tm_->new_block(validator_); node_ref r = to_node(right); r.set_nr_entries(0); r.set_max_entries(); r.set_value_size(sizeof(typename ValueTraits::disk_type)); { node_ref p = spine.template get_node(); if (p.get_value_size() != sizeof(typename ValueTraits::disk_type)) throw std::runtime_error("bad value_size"); nr_left = p.get_nr_entries() / 2; nr_right = p.get_nr_entries() - nr_left; type = p.get_type(); l.set_type(type); l.copy_entries(p, 0, nr_left); r.set_type(type); r.copy_entries(p, nr_left, nr_left + nr_right); } { // The parent may have changed value type, so we re-get it. internal_node p = spine.template get_node(); p.set_type(btree_detail::INTERNAL); p.set_max_entries(); p.set_nr_entries(2); p.set_value_size(sizeof(typename block_traits::disk_type)); p.overwrite_at(0, l.key_at(0), left.get_location()); p.overwrite_at(1, r.key_at(0), right.get_location()); } if (key < r.key_at(0)) spine.step(left); else spine.step(right); } template template void btree:: split_sibling(btree_detail::shadow_spine &spine, block_address parent_index, uint64_t key) { using namespace btree_detail; node_ref l = spine.template get_node(); block_address left = spine.get_block(); write_ref right = tm_->new_block(validator_); node_ref r = to_node(right); unsigned nr_left = l.get_nr_entries() / 2; unsigned nr_right = l.get_nr_entries() - nr_left; r.set_nr_entries(0); r.set_max_entries(); r.set_type(l.get_type()); r.set_value_size(sizeof(typename ValueTraits::disk_type)); r.copy_entries(l, nr_left, nr_left + nr_right); l.set_nr_entries(nr_left); internal_node p = spine.get_parent(); p.overwrite_at(parent_index, l.key_at(0), left); p.insert_at(parent_index + 1, r.key_at(0), right.get_location()); if (key >= r.key_at(0)) { spine.pop(); spine.step(right); } } // Returns true if we need a new insertion, rather than overwrite. template template bool btree:: insert_location(btree_detail::shadow_spine &spine, block_address block, uint64_t key, int *index, RC &leaf_rc) { using namespace btree_detail; bool top = true; // this isn't the same as spine.has_parent() int i = *index; bool inc = false; for (;;) { inc = spine.step(block); if (inc) inc_children(spine, leaf_rc); // patch up the parent to point to the new shadow if (spine.has_parent()) { internal_node p = spine.get_parent(); p.set_value(i, spine.get_block()); } internal_node internal = spine.template get_node(); // Split the node if we're full if (internal.get_type() == INTERNAL) split_node(spine, i, key, top); else split_node(spine, i, key, top); internal = spine.template get_node(); i = internal.lower_bound(key); if (internal.get_type() == btree_detail::LEAF) break; if (i < 0) { internal.set_key(0, key); i = 0; } block = internal.value_at(i); top = false; } node_ref leaf = spine.template get_node(); // FIXME: gross if (i < 0 || leaf.key_at(i) != key) i++; // do decrement the old value if it already exists // FIXME: I'm not sure about this, I don't understand the |inc| reference if (static_cast(i) < leaf.get_nr_entries() && leaf.key_at(i) == key && inc) { // dec old entry } *index = i; return ((static_cast(i) >= leaf.get_nr_entries()) || (leaf.key_at(i) != key)); } template void btree::visit_depth_first(visitor &v) const { node_location loc; walk_tree(v, loc, root_); v.visit_complete(); } template void btree::walk_tree(visitor &v, node_location const &loc, block_address b) const { try { walk_tree_internal(v, loc, b); } catch (std::runtime_error const &e) { switch (v.error_accessing_node(loc, b, e.what())) { case visitor::EXCEPTION_HANDLED: break; case visitor::RETHROW_EXCEPTION: throw; } } } template void btree::walk_tree_internal(visitor &v, node_location const &loc, block_address b) const { using namespace btree_detail; read_ref blk = tm_->read_lock(b, validator_); internal_node o = to_node(blk); // FIXME: use a switch statement if (o.get_type() == INTERNAL) { if (v.visit_internal(loc, o)) for (unsigned i = 0; i < o.get_nr_entries(); i++) { node_location loc2(loc); loc2.inc_depth(); loc2.key = o.key_at(i); walk_tree(v, loc2, o.value_at(i)); } } else if (loc.path.size() < Levels - 1) { if (v.visit_internal_leaf(loc, o)) for (unsigned i = 0; i < o.get_nr_entries(); i++) { node_location loc2(loc); loc2.push_key(o.key_at(i)); loc2.key = boost::optional(); walk_tree(v, loc2, o.value_at(i)); } } else { leaf_node ov = to_node(blk); v.visit_leaf(loc, ov); } } template template void btree::inc_children(btree_detail::shadow_spine &spine, RefCounter &leaf_rc) { node_ref nr = spine.template get_node(); if (nr.get_type() == INTERNAL) nr.inc_children(internal_rc_); else { node_ref leaf = spine.template get_node(); leaf.inc_children(leaf_rc); } } } //----------------------------------------------------------------