From e158dc7601dcd47a4523072885f0f7b3d8c99cc0 Mon Sep 17 00:00:00 2001 From: Ming-Hung Tsai Date: Fri, 30 Jul 2021 22:37:01 +0800 Subject: [PATCH] [btree_builder] Fix issues with under populated shared nodes A pre-built node could be under populated (less than half-full) due to following reasons: - A single shared leaf generated by dm-thin key removal. The residency could drop below 50%, until it reaches the merge threshold (33% or 44%, depends on its context). - A shared root, which could have any possible nr_entries. - Underfull shared nodes (less than 33% residency) caused by kernel issues. To avoid producing under populated nodes, those kinds of pre-built nodes, except the roots, will be merged into their siblings. --- src/pdata/btree_builder.rs | 89 ++++++++++++++++++++++++++++---------- 1 file changed, 66 insertions(+), 23 deletions(-) diff --git a/src/pdata/btree_builder.rs b/src/pdata/btree_builder.rs index f6c62f2..2054030 100644 --- a/src/pdata/btree_builder.rs +++ b/src/pdata/btree_builder.rs @@ -272,13 +272,19 @@ impl<'a, V: Pack + Unpack + Clone> NodeBuilder { shared, } } + /// Push a single value. This may emit a new node, hence the Result /// return type. The value's ref count will be incremented. pub fn push_value(&mut self, w: &mut WriteBatcher, key: u64, val: V) -> Result<()> { + // Unshift the previously pushed node since it is not the root + let half_full = self.max_entries_per_node / 2; + if self.nodes.len() == 1 && (self.nodes.last().unwrap().nr_entries < half_full) { + self.unshift_node(w)?; + } // Have we got enough values to emit a node? We try and keep // at least max_entries_per_node entries unflushed so we // can ensure the final node is balanced properly. - if self.values.len() == self.max_entries_per_node * 2 { + else if self.values.len() == self.max_entries_per_node * 2 { self.emit_node(w)?; } @@ -287,6 +293,19 @@ impl<'a, V: Pack + Unpack + Clone> NodeBuilder { Ok(()) } + // To avoid writing an under populated node we have to grab some + // values from the first of the shared nodes. + fn append_values(&mut self, w: &mut WriteBatcher, node: &NodeSummary) -> Result<()> { + let (keys, values) = self.read_node(w, node.block)?; + + for i in 0..keys.len() { + self.value_rc.inc(&values[i])?; + self.values.push_back((keys[i], values[i].clone())); + } + + Ok(()) + } + /// Push a number of prebuilt, shared nodes. The builder may decide to not /// use a shared node, instead reading the values and packing them /// directly. This may do IO to emit nodes, so returns a Result. @@ -296,43 +315,67 @@ impl<'a, V: Pack + Unpack + Clone> NodeBuilder { pub fn push_nodes(&mut self, w: &mut WriteBatcher, nodes: &[NodeSummary]) -> Result<()> { assert!(!nodes.is_empty()); + // Assume that the node is a shared root if it is the first comer. + // A rooted leaf could have any number of entries. + let maybe_root = (nodes.len() == 1) && self.nodes.is_empty() && self.values.is_empty(); + if maybe_root { + let n = &nodes[0]; + w.sm.lock().unwrap().inc(n.block, 1)?; + self.nodes.push(n.clone()); + return Ok(()); + } + // As a sanity check we make sure that all the shared nodes contain the // minimum nr of entries. + // A single shared node could be possibly under populated (less than half-full) + // due to btree removal, or even underfull (<33% residency) due to kernel issues. + // Those kinds of nodes will be merged into their siblings. let half_full = self.max_entries_per_node / 2; - for n in nodes { - if n.nr_entries < half_full { - panic!("under populated node"); + if nodes.len() > 1 { + for n in nodes { + if n.nr_entries < half_full { + panic!("under populated node"); + } } } + // Unshift the previously pushed node since it is not the root + if self.nodes.len() == 1 && (self.nodes.last().unwrap().nr_entries < half_full) { + self.unshift_node(w)?; + } + // Decide if we're going to use the pre-built nodes. if !self.values.is_empty() && (self.values.len() < half_full) { - // To avoid writing an under populated node we have to grab some - // values from the first of the shared nodes. - let (keys, values) = self.read_node(w, nodes.get(0).unwrap().block)?; + let mut nodes_iter = nodes.iter(); + let n = nodes_iter.next(); + self.append_values(w, n.unwrap())?; - for i in 0..keys.len() { - self.value_rc.inc(&values[i])?; - self.values.push_back((keys[i], values[i].clone())); - } + // Do not flush if there's no succeeding nodes, + // so that it could produce a more compact metadata. + if nodes.len() > 1 { + // Flush all the values. + self.emit_all(w)?; - // Flush all the values. - self.emit_all(w)?; - - // Add the remaining nodes. - for i in 1..nodes.len() { - let n = nodes.get(i).unwrap(); - w.sm.lock().unwrap().inc(n.block, 1)?; - self.nodes.push(n.clone()); + // Add the remaining nodes. + for n in nodes_iter { + w.sm.lock().unwrap().inc(n.block, 1)?; + self.nodes.push(n.clone()); + } } } else { // Flush all the values. self.emit_all(w)?; - // add the nodes - for n in nodes { - w.sm.lock().unwrap().inc(n.block, 1)?; - self.nodes.push(n.clone()); + if nodes[0].nr_entries < half_full { + // An under populated nodes[0] implies nodes.len() == 1, + // and that has to be merged into their siblings. + self.append_values(w, &nodes[0])?; + } else { + // Add the nodes. + for n in nodes { + w.sm.lock().unwrap().inc(n.block, 1)?; + self.nodes.push(n.clone()); + } } }