diff --git a/persistent-data/data-structures/btree-remove.tcc b/persistent-data/data-structures/btree-remove.tcc
new file mode 100644
index 0000000..d222273
--- /dev/null
+++ b/persistent-data/data-structures/btree-remove.tcc
@@ -0,0 +1,373 @@
+// This file is part of the thin-provisioning-tools source.
+//
+// thin-provisioning-tools is free software: you can redistribute it
+// and/or modify it under the terms of the GNU General Public License
+// as published by the Free Software Foundation, either version 3 of
+// the License, or (at your option) any later version.
+//
+// thin-provisioning-tools is distributed in the hope that it will be
+// useful, but WITHOUT ANY WARRANTY; without even the implied warranty
+// of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License along
+// with thin-provisioning-tools. If not, see
+// <http://www.gnu.org/licenses/>.
+
+namespace persistent_data {
+ template
+ btree_detail::shadow_child
+ btree::
+ create_shadow_child(internal_node &parent, // node whose entry 'index' is repointed at the shadow
+ unsigned index) // slot in 'parent' that references the child to shadow
+ { // Shadows the child at parent[index], stores the shadow's location back into the parent, and returns the shadow tagged with its node type.
+ block_address b = parent.value_at(index);
+ // tm_.shadow() yields a (write_ref, flag) pair; the flag decides below whether the children's reference counts are incremented.
+ pair p = tm_.shadow(b, validator_);
+ write_ref &wr = p.first;
+ btree_detail::node_type type;
+ // Look at the node header to learn whether this is an internal node or a leaf.
+ node_ref n = to_node(wr);
+ if (n.get_type() == btree_detail::INTERNAL) {
+ type = btree_detail::INTERNAL;
+ if (p.second)
+ n.inc_children(internal_rc_); // internal entries are counted with internal_rc_
+ } else {
+ type = btree_detail::LEAF;
+ if (p.second) {
+ node_ref leaf = to_node(wr);
+ leaf.inc_children(rc_); // leaf values are counted with rc_
+ }
+ }
+ // Repoint the parent at the shadow's location.
+ parent.set_value(index, wr.get_location());
+ // NOTE(review): p.second presumably means the block was shared and had to be copied -- confirm against transaction_manager::shadow().
+ return btree_detail::shadow_child(wr, type);
+ }
+ // Removes the entry addressed by 'key' (one component per level). If a level's lookup fails nothing is deleted; root_ is refreshed from the spine either way.
+ template
+ void
+ btree::
+ remove(key const &key)
+ {
+ using namespace btree_detail;
+ // 'block' is the root of the tree searched at the current level; 'index' is the slot found by the latest lookup.
+ block_address block = root_;
+ unsigned index = 0;
+ shadow_spine spine(tm_, validator_);
+ bool need_remove = true;
+ // Every level but the last maps its key component to the root block of the next level's tree.
+ for (unsigned level = 0; level < Levels - 1; ++level) {
+ need_remove = remove_location(spine, block,
+ key[level], &index,
+ internal_rc_);
+ if (!need_remove)
+ break;
+ // The value stored at 'index' is where the next level's search starts.
+ internal_node n = spine.get_node();
+ block = n.value_at(index);
+ }
+ // Last level: find the entry in its leaf and delete it. NOTE(review): the removed value is not passed to rc_ here -- confirm whether its reference count should be decremented.
+ if (need_remove) {
+ need_remove = remove_location(spine, block,
+ key[Levels - 1], &index,
+ rc_);
+ if (need_remove) {
+ leaf_node leaf = spine.get_node();
+ leaf.delete_at(index);
+ }
+ }
+ // Pick up the root recorded by the spine (presumably the shadowed root -- confirm against shadow_spine).
+ root_ = spine.get_root();
+ }
+ // Walks down from 'block' towards 'key', shadowing each node and rebalancing internal nodes on the way. Returns true with *index set to the key's slot in the leaf now current in the spine; returns false when the key is absent. On entry *index must be the slot, in the spine's current node, that refers to 'block' (unused when the spine has no parent yet).
+ template
+ template
+ bool
+ btree::
+ remove_location(btree_detail::shadow_spine &spine,
+ block_address block,
+ uint64_t key,
+ unsigned *index,
+ RC &leaf_rc)
+ {
+ using namespace btree_detail;
+ // 'i' is the slot in the parent that refers to 'block'. It is seeded from *index so that, when descending from one level's leaf into the next level's tree, the patch-up below rewrites the slot that was found rather than slot 0.
+ unsigned i = *index;
+ bool r = false;
+
+ for (;;) {
+ r = spine.step(block);
+ // NOTE(review): the result of step() is overwritten before it is read and 'leaf_rc' is never used -- confirm whether children of a freshly shadowed node need their reference counts incremented here.
+ // patch up the parent to point to the new shadow
+ if (spine.has_parent()) {
+ internal_node p = spine.get_parent();
+ p.set_value(i, spine.get_block());
+ }
+ // Reached a leaf: the search ends here whether or not the key exists.
+ internal_node n = spine.get_node();
+ if (n.get_type() == btree_detail::LEAF) {
+ node_ref leaf = spine.get_node();
+ boost::optional idx = leaf.exact_search(key);
+ if (!idx)
+ return false;
+ *index = *idx;
+ return true;
+ }
+ // Internal node: rebalance its children before descending; false means the key has no slot here.
+ r = rebalance_children(spine, key);
+ if (!r)
+ break;
+ // rebalance_children() may have replaced this node's contents with its only child's, which can be a leaf.
+ n = spine.get_node();
+ if (n.get_type() == btree_detail::LEAF) {
+ node_ref leaf = spine.get_node();
+ boost::optional idx = leaf.exact_search(key);
+ if (!idx)
+ return false;
+ *index = *idx;
+ return true;
+ }
+ // Descend into the child covering 'key', remembering its slot for the next patch-up.
+ i = n.lower_bound(key);
+ block = n.value_at(i);
+ }
+
+ return r;
+ }
+ // Prepares the spine's current internal node for descending towards 'key' by collapsing, merging or redistributing children. Returns false only when lower_bound() finds no slot for the key.
+ template
+ template
+ bool
+ btree::
+ rebalance_children(btree_detail::shadow_spine &spine, uint64_t key)
+ {
+ internal_node n = spine.get_node();
+ // A node with a single entry is replaced in place by a copy of its only child.
+ if (n.get_nr_entries() == 1) {
+ block_address b = n.value_at(0);
+ read_ref child = tm_.read_lock(b, validator_);
+ // Overwrite this node's whole block with the child's contents.
+ // FIXME: is it safe?
+ ::memcpy(n.raw(), child.data(), read_ref::BLOCK_SIZE);
+ // Drop this node's reference to the child block now that its contents live here.
+ tm_.get_sm()->dec(child.get_location());
+ return true;
+ }
+ // Find the child slot for 'key'; a negative result is reported to the caller as failure.
+ int i = n.lower_bound(key);
+ if (i < 0)
+ return false;
+ // Which neighbours take part depends on where slot i sits in the node.
+ bool has_left_sibling = i > 0;
+ bool has_right_sibling = static_cast(i) < (n.get_nr_entries() - 1);
+ // First slot: pair with the right neighbour. Last slot: pair with the left neighbour. Otherwise use both neighbours.
+ if (!has_left_sibling)
+ rebalance2(spine, i);
+ else if (!has_right_sibling)
+ rebalance2(spine, i - 1);
+ else
+ rebalance3(spine, i - 1);
+
+ return true;
+ }
+ // Shadows the children at left_index and left_index + 1 of the spine's current node and rebalances the pair.
+ template
+ template
+ void
+ btree::
+ rebalance2(btree_detail::shadow_spine &spine, unsigned left_index)
+ {
+ internal_node parent = spine.get_node();
+ shadow_child left = create_shadow_child(parent, left_index);
+ shadow_child right = create_shadow_child(parent, left_index + 1);
+ // Only the left child's type is checked; the right sibling is viewed as the same kind of node.
+ // FIXME: ugly -- the two branches differ only in the node type used to view the children.
+ if (left.get_type() == btree_detail::INTERNAL) {
+ internal_node l = left.get_node();
+ internal_node r = right.get_node();
+ __rebalance2(parent, l, r, left_index);
+ } else {
+ node_ref l = left.get_node();
+ node_ref r = right.get_node();
+ __rebalance2(parent, l, r, left_index);
+ }
+ }
+ // Either merges 'right' into 'left' (dropping right's entry from 'parent') or evens out the two nodes and refreshes right's key in 'parent'.
+ template
+ template
+ void
+ btree::
+ __rebalance2(internal_node &parent,
+ node_ref &left,
+ node_ref &right,
+ unsigned left_index)
+ {
+ unsigned nr_left = left.get_nr_entries();
+ unsigned nr_right = right.get_nr_entries();
+ unsigned right_index = left_index + 1;
+ // Merge only when the combined entry count is below 2 * (merge_threshold + 1).
+ unsigned threshold = 2 * (left.merge_threshold() + 1);
+ if (nr_left + nr_right < threshold) {
+ // Merge the right child into the left
+ left.copy_entries_to_left(right, nr_right);
+ left.set_nr_entries(nr_left + nr_right); // copy_entries_to_left() does not adjust nr_entries
+ parent.delete_at(right_index);
+ tm_.get_sm()->dec(right.get_location()); // the parent no longer references the right block
+ } else {
+ // Rebalance: left keeps half of the entries (rounded down); a negative count pulls entries from right instead.
+ unsigned target_left = (nr_left + nr_right) / 2;
+ left.move_entries(right, nr_left - target_left);
+ parent.set_key(right_index, right.key_at(0)); // right's first key changed
+ }
+ }
+ // Shadows the children at left_index, left_index + 1 and left_index + 2 of the spine's current node and rebalances the three.
+ template
+ template
+ void
+ btree::
+ rebalance3(btree_detail::shadow_spine &spine, unsigned left_index)
+ {
+ internal_node parent = spine.get_node();
+ shadow_child left = create_shadow_child(parent, left_index);
+ shadow_child center = create_shadow_child(parent, left_index + 1);
+ shadow_child right = create_shadow_child(parent, left_index + 2);
+ // Only the left child's type is checked; its siblings are viewed as the same kind of node.
+ // FIXME: ugly -- the two branches differ only in the node type used to view the children.
+ if (left.get_type() == btree_detail::INTERNAL) {
+ internal_node l = left.get_node();
+ internal_node c = center.get_node();
+ internal_node r = right.get_node();
+ __rebalance3(parent, l, c, r, left_index);
+ } else {
+ node_ref l = left.get_node();
+ node_ref c = center.get_node();
+ node_ref r = right.get_node();
+ __rebalance3(parent, l, c, r, left_index);
+ }
+ }
+ // Chooses between removing the center node and redistributing entries across three adjacent siblings.
+ template
+ template
+ void
+ btree::
+ __rebalance3(internal_node &parent,
+ node_ref &left,
+ node_ref &center,
+ node_ref &right,
+ unsigned left_index)
+ {
+ unsigned nr_left = left.get_nr_entries();
+ unsigned nr_center = center.get_nr_entries();
+ unsigned nr_right = right.get_nr_entries();
+ // Below 4 * merge_threshold + 1 entries in total, the center node is removed and its entries go to the neighbours.
+ unsigned threshold = left.merge_threshold() * 4 + 1;
+
+ if ((nr_left + nr_center + nr_right) < threshold)
+ delete_center_node(parent, left, center, right, left_index);
+ else
+ redistribute3(parent, left, center, right, left_index);
+ }
+ // Empties 'center' into its neighbours (left first, the remainder into right), removes it from 'parent', releases its block and rebalances the remaining pair. Throws std::runtime_error("too many entries") if a neighbour would overflow.
+ template
+ template
+ void
+ btree::
+ delete_center_node(internal_node &parent,
+ node_ref &left,
+ node_ref &center,
+ node_ref &right,
+ unsigned left_index)
+ {
+ unsigned center_index = left_index + 1;
+ unsigned right_index = left_index + 2;
+
+ unsigned max_entries = left.get_max_entries();
+ unsigned nr_left = left.get_nr_entries();
+ unsigned nr_center = center.get_nr_entries();
+ unsigned nr_right = right.get_nr_entries();
+ unsigned shift = std::min(max_entries - nr_left, nr_center); // as much of center as fits in left
+
+ if (nr_left + shift > max_entries)
+ throw std::runtime_error("too many entries");
+ // Append the first 'shift' entries of center to left; the copy helper leaves nr_entries to us.
+ left.copy_entries_to_left(center, shift);
+ left.set_nr_entries(nr_left + shift);
+ // Whatever did not fit in left is prepended to right.
+ if (shift != nr_center) {
+ shift = nr_center - shift;
+ if ((nr_right + shift) > max_entries)
+ throw std::runtime_error("too many entries");
+ right.shift_entries_right(shift);
+ center.copy_entries_to_right(right, shift);
+ right.set_nr_entries(nr_right + shift);
+ }
+ parent.set_key(right_index, right.key_at(0)); // right's first key may have changed
+ // Drop center from the parent; right now occupies center's old slot.
+ parent.delete_at(center_index);
+ --right_index; // kept for symmetry with the slot move above; the local is not read again
+ // The parent no longer references center's block; finish by balancing left against right.
+ tm_.get_sm()->dec(center.get_location());
+ __rebalance2(parent, left, right, left_index);
+ }
+ // Evens out the entries of three adjacent siblings (left receives the extra entry when the total is not divisible by 3) and refreshes the parent's keys for center and right. Throws std::runtime_error("too many entries") if a target exceeds the node capacity.
+ template
+ template
+ void
+ btree::
+ redistribute3(internal_node &parent,
+ node_ref &left,
+ node_ref &center,
+ node_ref &right,
+ unsigned left_index)
+ {
+ unsigned center_index = left_index + 1;
+ unsigned right_index = left_index + 2;
+
+ unsigned nr_left = left.get_nr_entries();
+ unsigned nr_center = center.get_nr_entries();
+ unsigned nr_right = right.get_nr_entries();
+ // Targets: right gets total / 3, left gets one more when there is a remainder; center ends up with the rest.
+ unsigned max_entries = left.get_max_entries();
+ unsigned total = nr_left + nr_center + nr_right;
+ unsigned target_right = total / 3;
+ unsigned remainder = (target_right * 3) != total;
+ unsigned target_left = target_right + remainder;
+
+ if (target_left > max_entries || target_right > max_entries)
+ throw std::runtime_error("too many entries");
+ // Fix up the smaller outer node first; move_entries() takes a signed count (positive moves rightwards, negative leftwards).
+ if (nr_left < nr_right) {
+ int s = nr_left - target_left;
+ // s < 0 means left is short by -s entries.
+ // FIXME: signed & unsigned comparison
+ if (s < 0 && nr_center < static_cast(-s)) {
+ // not enough in central node: drain center into left, then take the rest from right
+ left.move_entries(center, -nr_center);
+ s += nr_center;
+ left.move_entries(right, s);
+ nr_right += s; // s is negative here, so this lowers the local count
+ } else
+ left.move_entries(center, s);
+ // Settle right at its target; the difference ends up in center.
+ center.move_entries(right, target_right - nr_right);
+
+ } else {
+ int s = target_right - nr_right;
+ // s > 0 means right is short by s entries.
+ if (s > 0 && nr_center < static_cast(s)) {
+ // not enough in central node: push all of center into right, then top up from left
+ center.move_entries(right, nr_center);
+ s -= nr_center;
+ left.move_entries(right, s);
+ nr_left -= s;
+ } else
+ center.move_entries(right, s);
+ // Settle left at its target; the difference ends up in center.
+ left.move_entries(center, nr_left - target_left);
+ }
+ // The first keys of center and right have changed, so update the parent.
+ parent.set_key(center_index, center.key_at(0));
+ parent.set_key(right_index, right.key_at(0));
+ }
+};
diff --git a/persistent-data/data-structures/btree.h b/persistent-data/data-structures/btree.h
index c92a484..d3423c7 100644
--- a/persistent-data/data-structures/btree.h
+++ b/persistent-data/data-structures/btree.h
@@ -110,12 +110,34 @@ namespace persistent_data {
uint64_t key,
typename ValueTraits::value_type const &v);
+ // Removes entry i, closing the gap and decrementing nr_entries; throws if i is out of bounds.
+ void delete_at(unsigned i);
+
// Copies entries from another node, appends them
// to the back of this node. Adjusts nr_entries.
void copy_entries(node_ref const &rhs,
unsigned begin,
unsigned end);
+ // Moves entries between this node and its right sibling rhs, keeping the key ordering:
+ // count > 0 moves this node's last count entries to the front of rhs; count < 0 moves the first -count entries of rhs to the end of this node.
+ // The nr_entries of both nodes are adjusted.
+ void move_entries(node_ref &rhs,
+ int count);
+
+ // copy_entries_to_left: copies the first count entries of rhs to the end of this node.
+ // copy_entries_to_right: copies the last count entries of this node to the beginning of rhs (the caller shifts rhs first to make room).
+ // The nr_entries is not adjusted.
+ void copy_entries_to_left(node_ref const &rhs, unsigned count);
+ void copy_entries_to_right(node_ref &rhs, unsigned count) const;
+
+ // Shifts entries left (discarding the first 'shift' entries) or right (opening 'shift' slots at the front).
+ // The nr_entries is not adjusted.
+ void shift_entries_left(unsigned shift);
+ void shift_entries_right(unsigned shift);
+
+ unsigned merge_threshold() const; // max_entries / 3; used when deciding whether siblings are merged or rebalanced
+
// Various searches
int bsearch(uint64_t key, int want_hi) const;
boost::optional exact_search(uint64_t key) const;
@@ -259,6 +281,26 @@ namespace persistent_data {
maybe_block root_;
};
+ class shadow_child { // A write_ref to a shadowed child block together with the node type (INTERNAL or LEAF) found in it
+ public:
+ shadow_child(block_manager::write_ref &wr, node_type type)
+ : wr_(wr), type_(type) { // stores its own copy of the write_ref
+ }
+ // Node type recorded at construction.
+ node_type get_type() const {
+ return type_;
+ }
+ // Views the held block as a node_ref via to_node().
+ template
+ node_ref get_node() {
+ return to_node(wr_);
+ }
+
+ private:
+ block_manager::write_ref wr_;
+ node_type type_;
+ };
+
// Used to keep a record of a nested btree's position.
typedef std::vector btree_path;
@@ -399,6 +441,14 @@ namespace persistent_data {
int *index,
RC &leaf_rc);
+ template
+ bool // true when 'key' was found; *index then holds its slot in the leaf current in the spine
+ remove_location(btree_detail::shadow_spine &spine,
+ block_address block, // root of the subtree to search
+ uint64_t key,
+ unsigned *index, // receives the key's slot in the leaf
+ RC &leaf_rc);
+
void walk_tree(visitor &visitor,
btree_detail::node_location const &loc,
block_address b) const;
@@ -411,6 +461,53 @@ namespace persistent_data {
void inc_children(btree_detail::shadow_spine &spine,
RefCounter &leaf_rc);
+ btree_detail::shadow_child // shadow of the child at parent[index]; the parent entry is repointed at it
+ create_shadow_child(internal_node &parent,
+ unsigned index);
+ // Collapses, merges or redistributes the children of the spine's current node before descending towards 'key'; false when lower_bound() finds no slot for the key.
+ template
+ bool rebalance_children(btree_detail::shadow_spine &spine,
+ uint64_t key);
+ // Shadows and rebalances the two adjacent children starting at left_index.
+ template
+ void rebalance2(btree_detail::shadow_spine &spine,
+ unsigned left_index);
+ // Shadows and rebalances the three adjacent children starting at left_index.
+ template
+ void rebalance3(btree_detail::shadow_spine &spine,
+ unsigned left_index);
+ // Merges 'right' into 'left' when their combined entries are below the merge threshold, otherwise evens them out. NOTE(review): names beginning with a double underscore are reserved identifiers in C++.
+ template
+ void
+ __rebalance2(internal_node &parent,
+ btree_detail::node_ref &left,
+ btree_detail::node_ref &right,
+ unsigned left_index);
+ // Chooses between delete_center_node() and redistribute3() based on the siblings' combined entry count.
+ template
+ void
+ __rebalance3(internal_node &parent,
+ btree_detail::node_ref &left,
+ btree_detail::node_ref &center,
+ btree_detail::node_ref &right,
+ unsigned left_index);
+ // Moves center's entries into left and right, removes center from the parent and rebalances the remaining pair.
+ template
+ void
+ delete_center_node(internal_node &parent,
+ btree_detail::node_ref &left,
+ btree_detail::node_ref &center,
+ btree_detail::node_ref &right,
+ unsigned left_index);
+ // Evens out the entries of the three siblings and updates the parent's keys for center and right.
+ template
+ void
+ redistribute3(internal_node &parent,
+ btree_detail::node_ref &left,
+ btree_detail::node_ref &center,
+ btree_detail::node_ref &right,
+ unsigned left_index);
+
transaction_manager &tm_;
bool destroy_;
block_address root_;
@@ -421,6 +518,7 @@ namespace persistent_data {
};
#include "btree.tcc"
+#include "btree-remove.tcc"
//----------------------------------------------------------------
diff --git a/persistent-data/data-structures/btree.tcc b/persistent-data/data-structures/btree.tcc
index 0c7ca3d..059ebaa 100644
--- a/persistent-data/data-structures/btree.tcc
+++ b/persistent-data/data-structures/btree.tcc
@@ -293,6 +293,23 @@ namespace persistent_data {
set_value(i, v);
}
+ template
+ void
+ node_ref::delete_at(unsigned i)
+ { // Removes entry i by sliding the following keys and values down one slot; throws runtime_error if i is not a valid entry.
+ unsigned nr_entries = get_nr_entries();
+ if (i >= nr_entries)
+ throw runtime_error("key index out of bounds");
+ unsigned nr_to_copy = nr_entries - (i + 1); // number of entries after slot i
+ // memmove, because source and destination overlap within this node.
+ if (nr_to_copy) {
+ ::memmove(key_ptr(i), key_ptr(i + 1), sizeof(uint64_t) * nr_to_copy);
+ ::memmove(value_ptr(i), value_ptr(i + 1), sizeof(typename ValueTraits::disk_type) * nr_to_copy);
+ }
+ // One entry fewer; the old last slot is simply no longer counted.
+ set_nr_entries(nr_entries - 1);
+ }
+
template
void
node_ref::copy_entries(node_ref const &rhs,
@@ -309,6 +326,90 @@ namespace persistent_data {
set_nr_entries(n + count);
}
+ template
+ void
+ node_ref::move_entries(node_ref &rhs,
+ int count)
+ { // Moves entries between this (left) node and rhs (right): count > 0 moves this node's last count entries to the front of rhs, count < 0 moves the first -count entries of rhs to the end of this node. Throws runtime_error("too many entries") if either node would end up out of range.
+ if (!count)
+ return;
+ // Snapshot the sizes; the copy/shift helpers used below do not adjust nr_entries.
+ unsigned nr_left = get_nr_entries();
+ unsigned nr_right = rhs.get_nr_entries();
+ unsigned max_entries = get_max_entries();
+ // The resulting sizes are nr_left - count and nr_right + count. The arithmetic is unsigned, so taking more entries than the source holds wraps to a huge value and is rejected as well.
+ if (nr_left - count > max_entries || nr_right + count > max_entries)
+ throw runtime_error("too many entries");
+
+ if (count > 0) {
+ rhs.shift_entries_right(count); // make room at the front of rhs
+ copy_entries_to_right(rhs, count);
+ } else {
+ copy_entries_to_left(rhs, -count);
+ rhs.shift_entries_left(-count); // close the gap left at the front of rhs
+ }
+ // Publish the new sizes on both nodes.
+ set_nr_entries(nr_left - count);
+ rhs.set_nr_entries(nr_right + count);
+ }
+ // Copies the first 'count' entries of rhs to the slots just past this node's current entries; nr_entries is left for the caller to update.
+ template
+ void
+ node_ref::copy_entries_to_left(node_ref const &rhs, unsigned count)
+ {
+ unsigned n = get_nr_entries();
+ if ((n + count) > get_max_entries())
+ throw runtime_error("too many entries");
+ // Keys and values are stored separately, so each is copied on its own (assumes rhs is a different node, as memcpy requires non-overlapping ranges).
+ ::memcpy(key_ptr(n), rhs.key_ptr(0), sizeof(uint64_t) * count);
+ ::memcpy(value_ptr(n), rhs.value_ptr(0), sizeof(typename ValueTraits::disk_type) * count);
+ }
+ // Copies this node's last 'count' entries over the first 'count' slots of rhs; the caller is expected to have shifted rhs beforehand. Neither node's nr_entries is changed.
+ template
+ void
+ node_ref::copy_entries_to_right(node_ref &rhs, unsigned count) const
+ {
+ unsigned n = rhs.get_nr_entries();
+ if ((n + count) > get_max_entries()) // checked against this node's capacity -- presumably the same as rhs's
+ throw runtime_error("too many entries");
+ // The source range starts 'count' entries before the end of this node.
+ unsigned nr_left = get_nr_entries();
+ ::memcpy(rhs.key_ptr(0), key_ptr(nr_left - count), sizeof(uint64_t) * count);
+ ::memcpy(rhs.value_ptr(0), value_ptr(nr_left - count), sizeof(typename ValueTraits::disk_type) * count);
+ }
+ // Moves entries [shift, nr_entries) to the front of the node, overwriting the first 'shift' entries; nr_entries is not changed. Throws if shift exceeds nr_entries.
+ template
+ void
+ node_ref::shift_entries_left(unsigned shift)
+ {
+ unsigned n = get_nr_entries();
+ if (shift > n)
+ throw runtime_error("too many entries");
+ // memmove, because the ranges overlap within this node.
+ unsigned nr_shifted = n - shift;
+ ::memmove(key_ptr(0), key_ptr(shift), sizeof(uint64_t) * nr_shifted);
+ ::memmove(value_ptr(0), value_ptr(shift), sizeof(typename ValueTraits::disk_type) * nr_shifted);
+ }
+ // Moves all current entries up by 'shift' slots, freeing the first 'shift' slots for the caller to fill; nr_entries is not changed. Throws if the result would exceed max_entries.
+ template
+ void
+ node_ref::shift_entries_right(unsigned shift)
+ {
+ unsigned n = get_nr_entries();
+ if (n + shift > get_max_entries())
+ throw runtime_error("too many entries");
+ // memmove, because the ranges overlap within this node.
+ ::memmove(key_ptr(shift), key_ptr(0), sizeof(uint64_t) * n);
+ ::memmove(value_ptr(shift), value_ptr(0), sizeof(typename ValueTraits::disk_type) * n);
+ }
+ // One third of the node's capacity; the removal code derives its merge-versus-redistribute thresholds from this value.
+ template
+ unsigned
+ node_ref::merge_threshold() const
+ {
+ return get_max_entries() / 3;
+ }
+
template
int
node_ref::bsearch(uint64_t key, int want_hi) const
@@ -601,13 +702,6 @@ namespace persistent_data {
return need_insert;
}
- template
- void
- btree::remove(key const &key)
- {
- using namespace btree_detail;
- }
-
template
block_address
btree::get_root() const