diff --git a/persistent-data/data-structures/btree-remove.tcc b/persistent-data/data-structures/btree-remove.tcc
new file mode 100644
index 0000000..d222273
--- /dev/null
+++ b/persistent-data/data-structures/btree-remove.tcc
@@ -0,0 +1,414 @@
+// This file is part of the thin-provisioning-tools source.
+//
+// thin-provisioning-tools is free software: you can redistribute it
+// and/or modify it under the terms of the GNU General Public License
+// as published by the Free Software Foundation, either version 3 of
+// the License, or (at your option) any later version.
+//
+// thin-provisioning-tools is distributed in the hope that it will be
+// useful, but WITHOUT ANY WARRANTY; without even the implied warranty
+// of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License along
+// with thin-provisioning-tools.  If not, see
+// <http://www.gnu.org/licenses/>.
+
+namespace persistent_data {
+	// Shadows the child referenced by parent[index] and re-points the
+	// parent at the shadow.  If tm_.shadow() flags the result (p.second),
+	// the children's reference counts are bumped via inc_children().
+	// Returns the shadow together with its node type (INTERNAL or LEAF).
+	template
+	btree_detail::shadow_child
+	btree::
+	create_shadow_child(internal_node &parent,
+			    unsigned index)
+	{
+		block_address b = parent.value_at(index);
+
+		pair p = tm_.shadow(b, validator_);
+		write_ref &wr = p.first;
+		btree_detail::node_type type;
+
+		node_ref n = to_node(wr);
+		if (n.get_type() == btree_detail::INTERNAL) {
+			type = btree_detail::INTERNAL;
+			if (p.second)
+				n.inc_children(internal_rc_);
+		} else {
+			type = btree_detail::LEAF;
+			if (p.second) {
+				node_ref leaf = to_node(wr);
+				leaf.inc_children(rc_);
+			}
+		}
+
+		parent.set_value(index, wr.get_location());
+
+		return btree_detail::shadow_child(wr, type);
+	}
+
+	// Removes the entry addressed by key.  Each of the upper Levels - 1
+	// trees is descended to find the root of the next one; the entry is
+	// finally deleted from the leaf of the last tree.  A missing key is
+	// not an error.  root_ is refreshed from the spine in every case.
+	template
+	void
+	btree::
+	remove(key const &key)
+	{
+		using namespace btree_detail;
+
+		block_address block = root_;
+		unsigned index = 0;
+		shadow_spine spine(tm_, validator_);
+		bool need_remove = true;
+
+		for (unsigned level = 0; level < Levels - 1; ++level) {
+			need_remove = remove_location(spine, block,
+						      key[level], &index,
+						      internal_rc_);
+			if (!need_remove)
+				break;
+
+			internal_node n = spine.get_node();
+			block = n.value_at(index);
+		}
+
+		if (need_remove) {
+			need_remove = remove_location(spine, block,
+						      key[Levels - 1], &index,
+						      rc_);
+			if (need_remove) {
+				leaf_node leaf = spine.get_node();
+				leaf.delete_at(index);
+			}
+		}
+
+		root_ = spine.get_root();
+	}
+
+	// Walks from block down to the leaf that should hold key, stepping
+	// the spine into each node and rebalancing internal nodes on the way.
+	// On entry *index is the slot of the spine's current node that refers
+	// to block (only used if the spine has a parent once block has been
+	// stepped into); on success it is set to the key's slot in the leaf.
+	// Returns false when the key is not found.
+	template
+	template
+	bool
+	btree::
+	remove_location(btree_detail::shadow_spine &spine,
+			block_address block,
+			uint64_t key,
+			unsigned *index,
+			RC &leaf_rc)
+	{
+		using namespace btree_detail;
+
+		// Slot in the parent that refers to block.  When remove() calls
+		// us for a nested tree the spine already holds the previous
+		// level's leaf, so the first patch-up below must target the
+		// caller's slot rather than slot 0.
+		unsigned i = *index;
+		bool r = false;
+
+		for (;;) {
+			// NOTE(review): the value returned by step() is never read
+			// and leaf_rc is unused in this function -- confirm that no
+			// reference count update is needed after stepping.
+			r = spine.step(block);
+
+			// patch up the parent to point to the new shadow
+			if (spine.has_parent()) {
+				internal_node p = spine.get_parent();
+				p.set_value(i, spine.get_block());
+			}
+
+			internal_node n = spine.get_node();
+			if (n.get_type() == btree_detail::LEAF) {
+				node_ref leaf = spine.get_node();
+				boost::optional idx = leaf.exact_search(key);
+				if (!idx)
+					return false;
+				*index = *idx;
+				return true;
+			}
+
+			r = rebalance_children(spine, key);
+			if (!r)
+				break;
+
+			n = spine.get_node();
+			if (n.get_type() == btree_detail::LEAF) {
+				node_ref leaf = spine.get_node();
+				boost::optional idx = leaf.exact_search(key);
+				if (!idx)
+					return false;
+				*index = *idx;
+				return true;
+			}
+
+			i = n.lower_bound(key);
+			block = n.value_at(i);
+		}
+
+		return r;
+	}
+
+	// Prepares the spine's current internal node for descending towards
+	// key.  A node with a single entry is overwritten with a copy of its
+	// only child, whose reference count is then decremented.  Otherwise
+	// the child covering key is rebalanced with one or two neighbours.
+	// Returns false if lower_bound() finds no slot for key.
+	template
+	template
+	bool
+	btree::
+	rebalance_children(btree_detail::shadow_spine &spine, uint64_t key)
+	{
+		internal_node n = spine.get_node();
+
+		if (n.get_nr_entries() == 1) {
+			block_address b = n.value_at(0);
+			read_ref child = tm_.read_lock(b, validator_);
+
+			// FIXME: is it safe?
+			::memcpy(n.raw(), child.data(), read_ref::BLOCK_SIZE);
+
+			tm_.get_sm()->dec(child.get_location());
+			return true;
+		}
+
+		int i = n.lower_bound(key);
+		if (i < 0)
+			return false;
+
+		bool has_left_sibling = i > 0;
+		bool has_right_sibling = static_cast(i) < (n.get_nr_entries() - 1);
+
+		if (!has_left_sibling)
+			rebalance2(spine, i);
+		else if (!has_right_sibling)
+			rebalance2(spine, i - 1);
+		else
+			rebalance3(spine, i - 1);
+
+		return true;
+	}
+
+	// Shadows the children at left_index and left_index + 1 of the
+	// spine's current node and rebalances them as a pair.
+	template
+	template
+	void
+	btree::
+	rebalance2(btree_detail::shadow_spine &spine, unsigned left_index)
+	{
+		internal_node parent = spine.get_node();
+		shadow_child left = create_shadow_child(parent, left_index);
+		shadow_child right = create_shadow_child(parent, left_index + 1);
+
+		// FIXME: ugly
+		if (left.get_type() == btree_detail::INTERNAL) {
+			internal_node l = left.get_node();
+			internal_node r = right.get_node();
+			__rebalance2(parent, l, r, left_index);
+		} else {
+			node_ref l = left.get_node();
+			node_ref r = right.get_node();
+			__rebalance2(parent, l, r, left_index);
+		}
+	}
+
+	// Merges right into left when their combined entry count is below
+	// 2 * (merge_threshold + 1), removing right from parent; otherwise
+	// evens the two nodes out and refreshes right's key in parent.
+	template
+	template
+	void
+	btree::
+	__rebalance2(internal_node &parent,
+		     node_ref &left,
+		     node_ref &right,
+		     unsigned left_index)
+	{
+		unsigned nr_left = left.get_nr_entries();
+		unsigned nr_right = right.get_nr_entries();
+		unsigned right_index = left_index + 1;
+
+		unsigned threshold = 2 * (left.merge_threshold() + 1);
+		if (nr_left + nr_right < threshold) {
+			// Merge the right child into the left
+			left.copy_entries_to_left(right, nr_right);
+			left.set_nr_entries(nr_left + nr_right);
+			parent.delete_at(right_index);
+			tm_.get_sm()->dec(right.get_location());
+		} else {
+			// Rebalance
+			unsigned target_left = (nr_left + nr_right) / 2;
+			left.move_entries(right, nr_left - target_left);
+			parent.set_key(right_index, right.key_at(0));
+		}
+	}
+
+	// Shadows the three children starting at left_index of the spine's
+	// current node and rebalances them together.
+	template
+	template
+	void
+	btree::
+	rebalance3(btree_detail::shadow_spine &spine, unsigned left_index)
+	{
+		internal_node parent = spine.get_node();
+		shadow_child left = create_shadow_child(parent, left_index);
+		shadow_child center = create_shadow_child(parent, left_index + 1);
+		shadow_child right = create_shadow_child(parent, left_index + 2);
+
+		// FIXME: ugly
+		if (left.get_type() == btree_detail::INTERNAL) {
+			internal_node l = left.get_node();
+			internal_node c = center.get_node();
+			internal_node r = right.get_node();
+			__rebalance3(parent, l, c, r, left_index);
+		} else {
+			node_ref l = left.get_node();
+			node_ref c = center.get_node();
+			node_ref r = right.get_node();
+			__rebalance3(parent, l, c, r, left_index);
+		}
+	}
+
+	// Drops the center node when the three siblings hold fewer than
+	// 4 * merge_threshold + 1 entries in total, else redistributes them.
+	template
+	template
+	void
+	btree::
+	__rebalance3(internal_node &parent,
+		     node_ref &left,
+		     node_ref &center,
+		     node_ref &right,
+		     unsigned left_index)
+	{
+		unsigned nr_left = left.get_nr_entries();
+		unsigned nr_center = center.get_nr_entries();
+		unsigned nr_right = right.get_nr_entries();
+
+		unsigned threshold = left.merge_threshold() * 4 + 1;
+
+		if ((nr_left + nr_center + nr_right) < threshold)
+			delete_center_node(parent, left, center, right, left_index);
+		else
+			redistribute3(parent, left, center, right, left_index);
+	}
+
+	// Empties center into left (as far as left has room) and then into
+	// right, removes center from parent, decrements its reference count,
+	// and finally rebalances the remaining pair.
+	template
+	template
+	void
+	btree::
+	delete_center_node(internal_node &parent,
+			   node_ref &left,
+			   node_ref &center,
+			   node_ref &right,
+			   unsigned left_index)
+	{
+		unsigned center_index = left_index + 1;
+		unsigned right_index = left_index + 2;
+
+		unsigned max_entries = left.get_max_entries();
+		unsigned nr_left = left.get_nr_entries();
+		unsigned nr_center = center.get_nr_entries();
+		unsigned nr_right = right.get_nr_entries();
+		unsigned shift = std::min(max_entries - nr_left, nr_center);
+
+		if (nr_left + shift > max_entries)
+			throw std::runtime_error("too many entries");
+
+		left.copy_entries_to_left(center, shift);
+		left.set_nr_entries(nr_left + shift);
+
+		if (shift != nr_center) {
+			shift = nr_center - shift;
+			if ((nr_right + shift) > max_entries)
+				throw std::runtime_error("too many entries");
+			right.shift_entries_right(shift);
+			center.copy_entries_to_right(right, shift);
+			right.set_nr_entries(nr_right + shift);
+		}
+		parent.set_key(right_index, right.key_at(0));
+
+		parent.delete_at(center_index);
+		--right_index;
+
+		tm_.get_sm()->dec(center.get_location());
+		__rebalance2(parent, left, right, left_index);
+	}
+
+	// Spreads the entries of three siblings evenly: right receives
+	// total / 3, left one more when total is not a multiple of three, and
+	// center the rest.  The parent's keys for center and right are reset.
+	template
+	template
+	void
+	btree::
+	redistribute3(internal_node &parent,
+		      node_ref &left,
+		      node_ref &center,
+		      node_ref &right,
+		      unsigned left_index)
+	{
+		unsigned center_index = left_index + 1;
+		unsigned right_index = left_index + 2;
+
+		unsigned nr_left = left.get_nr_entries();
+		unsigned nr_center = center.get_nr_entries();
+		unsigned nr_right = right.get_nr_entries();
+
+		unsigned max_entries = left.get_max_entries();
+		unsigned total = nr_left + nr_center + nr_right;
+		unsigned target_right = total / 3;
+		unsigned remainder = (target_right * 3) != total;
+		unsigned target_left = target_right + remainder;
+
+		if (target_left > max_entries || target_right > max_entries)
+			throw std::runtime_error("too many entries");
+
+		if (nr_left < nr_right) {
+			int s = nr_left - target_left;
+
+			// FIXME: signed & unsigned comparison
+			if (s < 0 && nr_center < static_cast(-s)) {
+				// not enough in central node
+				left.move_entries(center, -nr_center);
+				s += nr_center;
+				left.move_entries(right, s);
+				nr_right += s;
+			} else
+				left.move_entries(center, s);
+
+			center.move_entries(right, target_right - nr_right);
+
+		} else {
+			int s = target_right - nr_right;
+
+			if (s > 0 && nr_center < static_cast(s)) {
+				// not enough in central node
+				center.move_entries(right, nr_center);
+				s -= nr_center;
+				left.move_entries(right, s);
+				nr_left -= s;
+			} else
+				center.move_entries(right, s);
+
+			left.move_entries(center, nr_left - target_left);
+		}
+
+		parent.set_key(center_index, center.key_at(0));
+		parent.set_key(right_index, right.key_at(0));
+	}
+};
diff --git a/persistent-data/data-structures/btree.h b/persistent-data/data-structures/btree.h
index c92a484..d3423c7 100644
--- 
a/persistent-data/data-structures/btree.h
+++ b/persistent-data/data-structures/btree.h
@@ -110,12 +110,36 @@ namespace persistent_data {
 			    uint64_t key,
 			    typename ValueTraits::value_type const &v);
 
+		// Removes entry i (throws if out of range) and decrements nr_entries.
+		void delete_at(unsigned i);
+
 		// Copies entries from another node, appends them
 		// to the back of this node.  Adjusts nr_entries.
 		void copy_entries(node_ref const &rhs,
 				  unsigned begin,
 				  unsigned end);
 
+		// Moves entries between this node and its sibling rhs while
+		// maintaining the key ordering (count > 0: this -> rhs,
+		// count < 0: rhs -> this).  The nr_entries of both nodes are adjusted.
+		void move_entries(node_ref &rhs,
+				  int count);
+
+		// Copies entries from the beginning of rhs to the end of this node,
+		// or copies entries from the end of this node to the beginning of rhs.
+		// The nr_entries is not adjusted.
+		void copy_entries_to_left(node_ref const &rhs, unsigned count);
+		void copy_entries_to_right(node_ref &rhs, unsigned count) const;
+
+		// Shifts entries to left or right.
+		// The nr_entries is not adjusted.
+		void shift_entries_left(unsigned shift);
+		void shift_entries_right(unsigned shift);
+
+		// Entry count (a third of max_entries) around which the
+		// remove code decides to merge or redistribute siblings.
+		unsigned merge_threshold() const;
+
 		// Various searches
 		int bsearch(uint64_t key, int want_hi) const;
 		boost::optional exact_search(uint64_t key) const;
@@ -259,6 +283,28 @@ namespace persistent_data {
 		maybe_block root_;
 	};
 
+	// A shadowed child block (held by write_ref) together with the
+	// node type (INTERNAL or LEAF) recorded for it by its creator.
+	class shadow_child {
+	public:
+		shadow_child(block_manager::write_ref &wr, node_type type)
+			: wr_(wr), type_(type) {
+		}
+
+		node_type get_type() const {
+			return type_;
+		}
+
+		template
+		node_ref get_node() {
+			return to_node(wr_);
+		}
+
+	private:
+		block_manager::write_ref wr_;
+		node_type type_;
+	};
+
 	// Used to keep a record of a nested btree's position.
 	typedef std::vector btree_path;
 
@@ -399,6 +445,16 @@ namespace persistent_data {
 				     int *index,
 				     RC &leaf_rc);
 
+		// Finds key below block, rebalancing on the way down.  Returns
+		// false if the key is absent; otherwise *index is its leaf slot.
+		template
+		bool
+		remove_location(btree_detail::shadow_spine &spine,
+				block_address block,
+				uint64_t key,
+				unsigned *index,
+				RC &leaf_rc);
+
 		void walk_tree(visitor &visitor,
 			       btree_detail::node_location const &loc,
 			       block_address b) const;
@@ -411,6 +467,63 @@ namespace persistent_data {
 		void inc_children(btree_detail::shadow_spine &spine,
 				  RefCounter &leaf_rc);
 
+		// Shadows the child at parent[index] and re-points the parent
+		// at the shadow.
+		btree_detail::shadow_child
+		create_shadow_child(internal_node &parent,
+				    unsigned index);
+
+		// Rebalances the child of the spine's current node that covers
+		// key.  Returns false if lower_bound() finds no slot for key.
+		template
+		bool rebalance_children(btree_detail::shadow_spine &spine,
+					uint64_t key);
+
+		// Rebalances the children at left_index and left_index + 1.
+		template
+		void rebalance2(btree_detail::shadow_spine &spine,
+				unsigned left_index);
+
+		// Rebalances the three children starting at left_index.
+		template
+		void rebalance3(btree_detail::shadow_spine &spine,
+				unsigned left_index);
+
+		// Merges right into left, or evens the two nodes out.
+		template
+		void
+		__rebalance2(internal_node &parent,
+			     btree_detail::node_ref &left,
+			     btree_detail::node_ref &right,
+			     unsigned left_index);
+
+		// Removes center if the three nodes are sparse, else redistributes.
+		template
+		void
+		__rebalance3(internal_node &parent,
+			     btree_detail::node_ref &left,
+			     btree_detail::node_ref &center,
+			     btree_detail::node_ref &right,
+			     unsigned left_index);
+
+		// Empties center into its siblings and deletes it from parent.
+		template
+		void
+		delete_center_node(internal_node &parent,
+				   btree_detail::node_ref &left,
+				   btree_detail::node_ref &center,
+				   btree_detail::node_ref &right,
+				   unsigned left_index);
+
+		// Spreads the entries of the three nodes evenly.
+		template
+		void
+		redistribute3(internal_node &parent,
+			      btree_detail::node_ref &left,
+			      btree_detail::node_ref &center,
+			      btree_detail::node_ref &right,
+			      unsigned left_index);
+
 		transaction_manager &tm_;
 		bool destroy_;
 		block_address root_;
@@ -421,6 +534,7 @@ namespace persistent_data {
 };
 
 #include "btree.tcc"
+#include "btree-remove.tcc"
 
 //----------------------------------------------------------------
 
diff --git a/persistent-data/data-structures/btree.tcc b/persistent-data/data-structures/btree.tcc
index 0c7ca3d..059ebaa 100644
--- a/persistent-data/data-structures/btree.tcc
+++ b/persistent-data/data-structures/btree.tcc
@@ -293,6 +293,23
@@ namespace persistent_data {
 		set_value(i, v);
 	}
 
+	template
+	void
+	node_ref::delete_at(unsigned i)	// removes entry i and closes the gap
+	{
+		unsigned nr_entries = get_nr_entries();
+		if (i >= nr_entries)
+			throw runtime_error("key index out of bounds");
+		unsigned nr_to_copy = nr_entries - (i + 1);	// entries that follow i
+
+		if (nr_to_copy) {
+			::memmove(key_ptr(i), key_ptr(i + 1), sizeof(uint64_t) * nr_to_copy);
+			::memmove(value_ptr(i), value_ptr(i + 1), sizeof(typename ValueTraits::disk_type) * nr_to_copy);
+		}
+
+		set_nr_entries(nr_entries - 1);
+	}
+
 	template
 	void
 	node_ref::copy_entries(node_ref const &rhs,
@@ -309,6 +326,109 @@ namespace persistent_data {
 		set_nr_entries(n + count);
 	}
 
+	// Moves |count| entries between this node and rhs (its right-hand
+	// sibling): count > 0 moves the tail of this node to the front of rhs,
+	// count < 0 moves the front of rhs to the tail of this node.  Both
+	// nr_entries fields are updated.  Throws runtime_error if either node
+	// would end up with more than max_entries.
+	template
+	void
+	node_ref::move_entries(node_ref &rhs,
+			       int count)
+	{
+		if (!count)
+			return;
+
+		unsigned nr_left = get_nr_entries();
+		unsigned nr_right = rhs.get_nr_entries();
+		unsigned max_entries = get_max_entries();
+
+		// This node ends with nr_left - count entries and rhs with
+		// nr_right + count.  The arithmetic is unsigned, so drawing more
+		// entries than a node holds wraps around and is rejected too.
+		if (nr_left - count > max_entries || nr_right + count > max_entries)
+			throw runtime_error("too many entries");
+
+		if (count > 0) {
+			rhs.shift_entries_right(count);
+			copy_entries_to_right(rhs, count);
+		} else {
+			copy_entries_to_left(rhs, -count);
+			rhs.shift_entries_left(-count);
+		}
+
+		set_nr_entries(nr_left - count);
+		rhs.set_nr_entries(nr_right + count);
+	}
+
+	// Appends the first count entries of rhs after this node's current
+	// entries.  nr_entries is left untouched; the caller adjusts it.
+	template
+	void
+	node_ref::copy_entries_to_left(node_ref const &rhs, unsigned count)
+	{
+		unsigned n = get_nr_entries();
+		if ((n + count) > get_max_entries())
+			throw runtime_error("too many entries");
+
+		::memcpy(key_ptr(n), rhs.key_ptr(0), sizeof(uint64_t) * count);
+		::memcpy(value_ptr(n), rhs.value_ptr(0), sizeof(typename ValueTraits::disk_type) * count);
+	}
+
+	// Copies the last count entries of this node over the first count
+	// slots of rhs; move_entries() shifts rhs right beforehand to make
+	// room.  Neither nr_entries is adjusted.
+	template
+	void
+	node_ref::copy_entries_to_right(node_ref &rhs, unsigned count) const
+	{
+		unsigned n = rhs.get_nr_entries();
+		if ((n + count) > get_max_entries())
+			throw runtime_error("too many entries");
+
+		unsigned nr_left = get_nr_entries();
+		::memcpy(rhs.key_ptr(0), key_ptr(nr_left - count), sizeof(uint64_t) * count);
+		::memcpy(rhs.value_ptr(0), value_ptr(nr_left - count), sizeof(typename ValueTraits::disk_type) * count);
+	}
+
+	// Drops the first shift entries by sliding the rest down to slot 0.
+	// nr_entries is not adjusted.
+	template
+	void
+	node_ref::shift_entries_left(unsigned shift)
+	{
+		unsigned n = get_nr_entries();
+		if (shift > n)
+			throw runtime_error("too many entries");
+
+		unsigned nr_shifted = n - shift;
+		::memmove(key_ptr(0), key_ptr(shift), sizeof(uint64_t) * nr_shifted);
+		::memmove(value_ptr(0), value_ptr(shift), sizeof(typename ValueTraits::disk_type) * nr_shifted);
+	}
+
+	// Opens a gap of shift slots at the front by sliding every entry up.
+	// nr_entries is not adjusted.
+	template
+	void
+	node_ref::shift_entries_right(unsigned shift)
+	{
+		unsigned n = get_nr_entries();
+		if (n + shift > get_max_entries())
+			throw runtime_error("too many entries");
+
+		::memmove(key_ptr(shift), key_ptr(0), sizeof(uint64_t) * n);
+		::memmove(value_ptr(shift), value_ptr(0), sizeof(typename ValueTraits::disk_type) * n);
+	}
+
+	// A third of the node's capacity; the remove code compares sibling
+	// entry counts against multiples of this to choose merge or rebalance.
+	template
+	unsigned
+	node_ref::merge_threshold() const
+	{
+		return get_max_entries() / 3;
+	}
+
 	template
 	int
 	node_ref::bsearch(uint64_t key, int want_hi) const
@@ -601,13 +721,6 @@ namespace persistent_data {
 		return need_insert;
 	}
 
-	template
-	void
-	btree::remove(key const &key)
-	{
-		using namespace btree_detail;
-	}
-
 	template
 	block_address
 	btree::get_root() const