#include "btree.h" #include "transaction_manager.h" #include using namespace btree_detail; using namespace persistent_data; using namespace std; //---------------------------------------------------------------- template node_ref::node_ref(block_address location, disk_node *raw) : location_(location), raw_(raw) { } template block_address node_ref::get_block_nr() const { return to_cpu(raw_->header.blocknr); } template btree_detail::node_type node_ref::get_type() const { uint32_t flags = to_cpu(raw_->header.flags); if (flags & INTERNAL_NODE) return INTERNAL; else if (flags & LEAF_NODE) return LEAF; else throw runtime_error("unknow node type"); } template void node_ref::set_type(node_type t) { uint32_t flags = to_cpu(raw_->header.flags); switch (t) { case INTERNAL: flags |= INTERNAL_NODE; break; case LEAF: flags |= LEAF_NODE; break; } raw_->header.flags = to_disk<__le32>(flags); } template unsigned node_ref::get_nr_entries() const { return to_cpu(raw_->header.nr_entries); } template void node_ref::set_nr_entries(unsigned n) { raw_->header.nr_entries = to_disk<__le32>(n); } template unsigned node_ref::get_max_entries() const { return to_cpu(raw_->header.max_entries); } template void node_ref::set_max_entries(unsigned n) { raw_->header.max_entries = to_disk<__le32>(n); } template void node_ref::set_max_entries() { set_max_entries(calc_max_entries()); } template size_t node_ref::get_value_size() const { return to_cpu(raw_->header.value_size); } template uint64_t node_ref::key_at(unsigned i) const { if (i >= get_nr_entries()) throw runtime_error("key index out of bounds"); return to_cpu(raw_->keys[i]); } template void node_ref::set_key(unsigned i, uint64_t k) { raw_->keys[i] = to_disk<__le64>(k); } template typename ValueTraits::value_type node_ref::value_at(unsigned i) const { if (i >= get_nr_entries()) throw runtime_error("value index out of bounds"); // We have to copy because of alignment issues. typename ValueTraits::disk_type d; ::memcpy(&d, value_ptr(i), sizeof(d)); typename ValueTraits::value_type v; ValueTraits::unpack(d, v); return v; } template void node_ref::set_value(unsigned i, typename ValueTraits::value_type const &v) { typename ValueTraits::disk_type d; ValueTraits::pack(v, d); ::memcpy(value_ptr(i), &d, sizeof(d)); } template void node_ref::insert_at(unsigned i, uint64_t key, typename ValueTraits::value_type const &v) { unsigned n = get_nr_entries(); if ((n + 1) > get_max_entries()) throw runtime_error("too many entries"); set_nr_entries(n + 1); ::memmove(key_ptr(i + 1), key_ptr(i), sizeof(uint64_t) * (n - i)); ::memmove(value_ptr(i + 1), value_ptr(i), sizeof(typename ValueTraits::disk_type) * (n - i)); overwrite_at(i, key, v); } template void node_ref::overwrite_at(unsigned i, uint64_t key, typename ValueTraits::value_type const &v) { set_key(i, key); set_value(i, v); } template void node_ref::copy_entries(node_ref const &rhs, unsigned begin, unsigned end) { unsigned count = end - begin; unsigned n = get_nr_entries(); if ((n + count) > get_max_entries()) throw runtime_error("too many entries"); set_nr_entries(n + count); ::memcpy(key_ptr(n), rhs.key_ptr(begin), sizeof(uint64_t) * count); ::memcpy(value_ptr(n), rhs.value_ptr(begin), sizeof(typename ValueTraits::disk_type) * count); } template int node_ref::bsearch(uint64_t key, int want_hi) const { int lo = -1, hi = get_nr_entries(); while(hi - lo > 1) { int mid = lo + ((hi - lo) / 2); uint64_t mid_key = key_at(mid); if (mid_key == key) return mid; if (mid_key < key) lo = mid; else hi = mid; } return want_hi ? 
		hi : lo;
}

template <typename ValueTraits, uint32_t BlockSize>
optional<unsigned>
node_ref<ValueTraits, BlockSize>::exact_search(uint64_t key) const
{
	int i = bsearch(key, 0);
	if (i < 0 || static_cast<unsigned>(i) >= get_nr_entries())
		return optional<unsigned>();

	return optional<unsigned>(i);
}

template <typename ValueTraits, uint32_t BlockSize>
int
node_ref<ValueTraits, BlockSize>::lower_bound(uint64_t key) const
{
	return bsearch(key, 0);
}

template <typename ValueTraits, uint32_t BlockSize>
unsigned
node_ref<ValueTraits, BlockSize>::calc_max_entries(void)
{
	uint32_t total;

	// key + value
	size_t elt_size = sizeof(uint64_t) + sizeof(typename ValueTraits::disk_type);
	total = (BlockSize - sizeof(struct node_header)) / elt_size;
	return (total / 3) * 3; // rounds down
}

template <typename ValueTraits, uint32_t BlockSize>
void *
node_ref<ValueTraits, BlockSize>::key_ptr(unsigned i) const
{
	return raw_->keys + i;
}

template <typename ValueTraits, uint32_t BlockSize>
void *
node_ref<ValueTraits, BlockSize>::value_ptr(unsigned i) const
{
	void *value_base = &raw_->keys[to_cpu<uint32_t>(raw_->header.max_entries)];
	return static_cast<unsigned char *>(value_base) +
		sizeof(typename ValueTraits::disk_type) * i;
}

template <typename ValueTraits, uint32_t BlockSize>
template <typename RefCounter>
void
node_ref<ValueTraits, BlockSize>::inc_children(RefCounter &rc)
{
	unsigned nr_entries = get_nr_entries();
	for (unsigned i = 0; i < nr_entries; i++) {
		typename ValueTraits::value_type v;
		typename ValueTraits::disk_type d;
		::memcpy(&d, value_ptr(i), sizeof(d));
		ValueTraits::unpack(d, v);
		rc.inc(v);
	}
}

//----------------------------------------------------------------

template <unsigned Levels, typename ValueTraits, uint32_t BlockSize>
btree<Levels, ValueTraits, BlockSize>::
btree(typename transaction_manager<BlockSize>::ptr tm,
      typename ValueTraits::ref_counter rc)
	: tm_(tm),
	  destroy_(false),
	  rc_(rc)
{
	using namespace btree_detail;

	write_ref root = tm_->new_block();

	leaf_node n = to_node<ValueTraits, BlockSize>(root);
	n.set_type(btree_detail::LEAF);
	n.set_nr_entries(0);
	n.set_max_entries();

	root_ = root.get_location();
}

template <unsigned Levels, typename ValueTraits, uint32_t BlockSize>
btree<Levels, ValueTraits, BlockSize>::
btree(typename transaction_manager<BlockSize>::ptr tm,
      block_address root,
      typename ValueTraits::ref_counter rc)
	: tm_(tm),
	  destroy_(false),
	  root_(root),
	  rc_(rc)
{
}

template <unsigned Levels, typename ValueTraits, uint32_t BlockSize>
btree<Levels, ValueTraits, BlockSize>::~btree()
{
}

template <unsigned Levels, typename ValueTraits, uint32_t BlockSize>
typename btree<Levels, ValueTraits, BlockSize>::maybe_value
btree<Levels, ValueTraits, BlockSize>::lookup(key const &key) const
{
	using namespace btree_detail;

	ro_spine<BlockSize> spine(tm_);
	block_address root = root_;

	for (unsigned level = 0; level < Levels - 1; ++level) {
		optional<block_address> mroot =
			lookup_raw<uint64_traits>(spine, root, key[level]);
		if (!mroot)
			return maybe_value();

		root = *mroot;
	}

	return lookup_raw<ValueTraits>(spine, root, key[Levels - 1]);
}

template <unsigned Levels, typename ValueTraits, uint32_t BlockSize>
typename btree<Levels, ValueTraits, BlockSize>::maybe_pair
btree<Levels, ValueTraits, BlockSize>::lookup_le(key const &key) const
{
	using namespace btree_detail;

	return maybe_pair();
}

template <unsigned Levels, typename ValueTraits, uint32_t BlockSize>
typename btree<Levels, ValueTraits, BlockSize>::maybe_pair
btree<Levels, ValueTraits, BlockSize>::lookup_ge(key const &key) const
{
	using namespace btree_detail;

	return maybe_pair();
}

template <unsigned Levels, typename ValueTraits, uint32_t BlockSize>
void
btree<Levels, ValueTraits, BlockSize>::
insert(key const &key,
       typename ValueTraits::value_type const &value)
{
	using namespace btree_detail;

	block_address block = root_;
	int index = 0;		// FIXME: ???
	shadow_spine<BlockSize> spine(tm_);

	for (unsigned level = 0; level < Levels - 1; ++level) {
		bool need_insert = insert_location<uint64_traits>(spine, block, key[level], &index);

		internal_node n = spine.template get_node<uint64_traits>();
		if (need_insert) {
			btree new_tree(tm_, rc_);
			n.insert_at(index, key[level], new_tree.get_root());
		}

		block = n.value_at(index);
	}

	bool need_insert = insert_location<ValueTraits>(spine, block, key[Levels - 1], &index);

	leaf_node n = spine.template get_node<ValueTraits>();
	if (need_insert)
		n.insert_at(index, key[Levels - 1], value);
	else
		// FIXME: check if we're overwriting with the same value.
		n.set_value(index, value);
}

template <unsigned Levels, typename ValueTraits, uint32_t BlockSize>
void
btree<Levels, ValueTraits, BlockSize>::remove(key const &key)
{
	using namespace btree_detail;
}

template <unsigned Levels, typename ValueTraits, uint32_t BlockSize>
block_address
btree<Levels, ValueTraits, BlockSize>::get_root() const
{
	return root_;
}

template <unsigned Levels, typename ValueTraits, uint32_t BlockSize>
void
btree<Levels, ValueTraits, BlockSize>::set_root(block_address root)
{
	using namespace btree_detail;
	root_ = root;
}

template <unsigned Levels, typename ValueTraits, uint32_t BlockSize>
typename btree<Levels, ValueTraits, BlockSize>::ptr
btree<Levels, ValueTraits, BlockSize>::clone() const
{
	using namespace btree_detail;

	ro_spine<BlockSize> spine(tm_);
	spine.step(root_);

	write_ref new_root = tm_->new_block();

	internal_node o = spine.template get_node<uint64_traits>();
	if (o.get_type() == INTERNAL) {
		internal_node n = to_node<uint64_traits, BlockSize>(new_root);
		::memcpy(n.raw(), o.raw(), BlockSize);

		typename uint64_traits::ref_counter rc(internal_rc_);
		n.inc_children(rc);
	} else {
		leaf_node n = to_node<ValueTraits, BlockSize>(new_root);
		::memcpy(n.raw(), o.raw(), BlockSize);

		typename ValueTraits::ref_counter rc(rc_);
		n.inc_children(rc);
	}

	return btree::ptr(
		new btree(tm_, new_root.get_location(), rc_));
}

#if 0
template <unsigned Levels, typename ValueTraits, uint32_t BlockSize>
void
btree<Levels, ValueTraits, BlockSize>::destroy()
{
	using namespace btree_detail;
}
#endif

template <unsigned Levels, typename ValueTraits, uint32_t BlockSize>
template <typename ValueTraits2>
void
btree<Levels, ValueTraits, BlockSize>::
split_node(btree_detail::shadow_spine<BlockSize> &spine,
	   block_address parent_index,
	   uint64_t key,
	   bool top)
{
	node_ref<ValueTraits2, BlockSize> n = spine.template get_node<ValueTraits2>();

	if (n.get_nr_entries() == n.get_max_entries()) {
		if (top)
			split_beneath<ValueTraits2>(spine, key);
		else
			split_sibling<ValueTraits2>(spine, parent_index, key);
	}
}

template <unsigned Levels, typename ValueTraits, uint32_t BlockSize>
template <typename ValueTraits2>
void
btree<Levels, ValueTraits, BlockSize>::
split_beneath(btree_detail::shadow_spine<BlockSize> &spine,
	      uint64_t key)
{
	using namespace btree_detail;

	node_type type;
	unsigned nr_left, nr_right;

	write_ref left = tm_->new_block();
	node_ref<ValueTraits2, BlockSize> l = to_node<ValueTraits2, BlockSize>(left);
	l.set_nr_entries(0);
	l.set_max_entries();

	write_ref right = tm_->new_block();
	node_ref<ValueTraits2, BlockSize> r = to_node<ValueTraits2, BlockSize>(right);
	r.set_nr_entries(0);
	r.set_max_entries();

	{
		node_ref<ValueTraits2, BlockSize> p =
			spine.template get_node<ValueTraits2>();
		nr_left = p.get_nr_entries() / 2;
		nr_right = p.get_nr_entries() - nr_left;
		type = p.get_type();

		l.set_type(type);
		l.copy_entries(p, 0, nr_left);

		r.set_type(type);
		r.copy_entries(p, nr_left, nr_left + nr_right);
	}

	{
		// The parent may have changed value type, so we re-get it.
		internal_node p = spine.template get_node<uint64_traits>();
		p.set_type(btree_detail::INTERNAL);
		p.set_nr_entries(2);

		// FIXME: set the value_size

		p.overwrite_at(0, l.key_at(0), left.get_location());
		p.overwrite_at(1, r.key_at(0), right.get_location());
	}

	if (key < r.key_at(0))
		spine.step(left);
	else
		spine.step(right);
}

template <unsigned Levels, typename ValueTraits, uint32_t BlockSize>
template <typename ValueTraits2>
void
btree<Levels, ValueTraits, BlockSize>::
split_sibling(btree_detail::shadow_spine<BlockSize> &spine,
	      block_address parent_index,
	      uint64_t key)
{
	using namespace btree_detail;

	node_ref<ValueTraits2, BlockSize> l = spine.template get_node<ValueTraits2>();
	block_address left = spine.get_block();

	write_ref right = tm_->new_block();
	node_ref<ValueTraits2, BlockSize> r = to_node<ValueTraits2, BlockSize>(right);

	unsigned nr_left = l.get_nr_entries() / 2;
	unsigned nr_right = l.get_nr_entries() - nr_left;

	r.set_nr_entries(0);
	r.set_type(l.get_type());
	r.set_max_entries(l.get_max_entries());
	r.copy_entries(l, nr_left, nr_left + nr_right);
	l.set_nr_entries(nr_left);

	internal_node p = spine.get_parent();
	p.overwrite_at(parent_index, l.key_at(0), left);
	p.insert_at(parent_index + 1, r.key_at(0), right.get_location());

	spine.pop();
	if (key < r.key_at(0))
		spine.step(left);
	else
		spine.step(right);
}

// Returns true if we need a new insertion, rather than overwrite.
template <unsigned Levels, typename ValueTraits, uint32_t BlockSize>
template <typename ValueTraits2>
bool
btree<Levels, ValueTraits, BlockSize>::
insert_location(btree_detail::shadow_spine<BlockSize> &spine,
		block_address block,
		uint64_t key,
		int *index)
{
	using namespace btree_detail;

	bool top = true; // this isn't the same as spine.has_parent()
	int i = *index;
	bool inc = false;

	for (;;) {
		inc = spine.step(block);
#if 0
		if (inc)
			inc_children();
#endif

		// patch up the parent to point to the new shadow
		if (spine.has_parent()) {
			internal_node p = spine.get_parent();
			p.set_value(i, spine.get_block());
		}

		internal_node internal = spine.template get_node<uint64_traits>();

		// Split the node if we're full
		if (internal.get_type() == INTERNAL)
			split_node<uint64_traits>(spine, i, key, top);
		else
			split_node<ValueTraits2>(spine, i, key, top);

		internal = spine.template get_node<uint64_traits>();
		i = internal.lower_bound(key);

		if (internal.get_type() == btree_detail::LEAF)
			break;

		if (i < 0) {
			internal.set_key(0, key);
			i = 0;
		}

		block = internal.value_at(i);
		top = false;
	}

	node_ref<ValueTraits2, BlockSize> leaf =
		spine.template get_node<ValueTraits2>();

	// FIXME: gross
	if (i < 0 || leaf.key_at(i) != key)
		i++;

	// do decrement the old value if it already exists
	// FIXME: I'm not sure about this, I don't understand the |inc| reference
	if (static_cast<unsigned>(i) < leaf.get_nr_entries() &&
	    leaf.key_at(i) == key &&
	    inc) {
		// dec old entry
	}

	*index = i;
	return ((static_cast<unsigned>(i) >= leaf.get_nr_entries()) ||
		(leaf.key_at(i) != key));
}

template <unsigned Levels, typename ValueTraits, uint32_t BlockSize>
void
btree<Levels, ValueTraits, BlockSize>::visit(typename visitor::ptr visitor) const
{
	walk_tree(visitor, 0, true, root_);
}

template <unsigned Levels, typename ValueTraits, uint32_t BlockSize>
void
btree<Levels, ValueTraits, BlockSize>::
walk_tree(typename visitor::ptr visitor,
	  unsigned level, bool is_root,
	  block_address b) const
{
	using namespace btree_detail;

	read_ref blk = tm_->read_lock(b);
	internal_node o = to_node<uint64_traits, BlockSize>(blk);

	if (o.get_type() == INTERNAL) {
		if (visitor->visit_internal(level, is_root, o))
			for (unsigned i = 0; i < o.get_nr_entries(); i++)
				walk_tree(visitor, level, false, o.value_at(i));

	} else if (level < Levels - 1) {
		if (visitor->visit_internal_leaf(level, is_root, o))
			for (unsigned i = 0; i < o.get_nr_entries(); i++)
				walk_tree(visitor, level + 1, true, o.value_at(i));

	} else {
		leaf_node ov = to_node<ValueTraits, BlockSize>(blk);
		visitor->visit_leaf(level, is_root, ov);
	}
}

//----------------------------------------------------------------
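
// A minimal usage sketch, kept out of the build with #if 0 like the
// destroy() stub above.  It assumes btree.h declares a nested
// btree<...>::visitor interface whose virtual hooks match the calls made
// from walk_tree() above (visit_internal, visit_internal_leaf and
// visit_leaf), that visitor::ptr is a shared pointer, and that the
// internal_node/leaf_node typedefs are accessible.  The exact signatures
// and the btree<1, uint64_traits, 4096> instantiation are illustrative
// assumptions, not the library's confirmed interface.
#if 0
class node_counter : public btree<1, uint64_traits, 4096>::visitor {
public:
	typedef btree<1, uint64_traits, 4096> tree;

	node_counter() : internals_(0), leaves_(0) {}

	// Returning true asks walk_tree() to recurse into this node's children.
	virtual bool visit_internal(unsigned level, bool is_root,
				    tree::internal_node const &n) {
		internals_++;
		return true;
	}

	virtual bool visit_internal_leaf(unsigned level, bool is_root,
					 tree::internal_node const &n) {
		internals_++;
		return true;
	}

	virtual void visit_leaf(unsigned level, bool is_root,
				tree::leaf_node const &n) {
		leaves_++;
	}

	unsigned internals_;
	unsigned leaves_;
};

// Usage against an existing tree |t| (assuming visitor::ptr is a
// boost::shared_ptr, so a shared_ptr<node_counter> converts implicitly):
//	boost::shared_ptr<node_counter> v(new node_counter);
//	t.visit(v);
//	// v->internals_ and v->leaves_ now hold the node counts walked.
#endif

//----------------------------------------------------------------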