From 052c9f90eaab0faca48b35e95757ed5e431dcb18 Mon Sep 17 00:00:00 2001 From: Ming-Hung Tsai Date: Fri, 30 Jul 2021 01:28:56 +0800 Subject: [PATCH 1/5] [thin/cache_repair (rust)] Fix file open options --- src/cache/repair.rs | 4 ++-- src/thin/repair.rs | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/cache/repair.rs b/src/cache/repair.rs index dd88a26..1273da7 100644 --- a/src/cache/repair.rs +++ b/src/cache/repair.rs @@ -32,11 +32,11 @@ fn new_context(opts: &CacheRepairOptions) -> Result { let engine_out: Arc; if opts.async_io { - engine_in = Arc::new(AsyncIoEngine::new(opts.input, MAX_CONCURRENT_IO, true)?); + engine_in = Arc::new(AsyncIoEngine::new(opts.input, MAX_CONCURRENT_IO, false)?); engine_out = Arc::new(AsyncIoEngine::new(opts.output, MAX_CONCURRENT_IO, true)?); } else { let nr_threads = std::cmp::max(8, num_cpus::get() * 2); - engine_in = Arc::new(SyncIoEngine::new(opts.input, nr_threads, true)?); + engine_in = Arc::new(SyncIoEngine::new(opts.input, nr_threads, false)?); engine_out = Arc::new(SyncIoEngine::new(opts.output, nr_threads, true)?); } diff --git a/src/thin/repair.rs b/src/thin/repair.rs index 89d6e09..d4ad6ff 100644 --- a/src/thin/repair.rs +++ b/src/thin/repair.rs @@ -33,11 +33,11 @@ fn new_context(opts: &ThinRepairOptions) -> Result { let engine_out: Arc; if opts.async_io { - engine_in = Arc::new(AsyncIoEngine::new(opts.input, MAX_CONCURRENT_IO, true)?); + engine_in = Arc::new(AsyncIoEngine::new(opts.input, MAX_CONCURRENT_IO, false)?); engine_out = Arc::new(AsyncIoEngine::new(opts.output, MAX_CONCURRENT_IO, true)?); } else { let nr_threads = std::cmp::max(8, num_cpus::get() * 2); - engine_in = Arc::new(SyncIoEngine::new(opts.input, nr_threads, true)?); + engine_in = Arc::new(SyncIoEngine::new(opts.input, nr_threads, false)?); engine_out = Arc::new(SyncIoEngine::new(opts.output, nr_threads, true)?); } From bd39b570efa2313e37a6851fb6e56e41b3fa6e34 Mon Sep 17 00:00:00 2001 From: Ming-Hung Tsai Date: Fri, 30 Jul 2021 14:26:29 +0800 Subject: [PATCH 2/5] [btree_builder] Fix reference counts of unshifted child values The ref counts now are handled in a straightforward method: The pre-built subtrees serve as snapshots of potentially shared nodes. They hold an initial ref count on the pre-built nodes and their children. The temporary ref counts are later dropped at the end of device building. This way fixes the ref counts of unshifted nodes, thus the 'reserve' operation introduced by commit 6d16c58 is reverted. --- src/pdata/btree_builder.rs | 63 +++++++++++++++++++++++++------------- src/thin/restore.rs | 20 +++++++++++- src/write_batcher.rs | 24 +++------------ 3 files changed, 66 insertions(+), 41 deletions(-) diff --git a/src/pdata/btree_builder.rs b/src/pdata/btree_builder.rs index 7c509ba..f6c62f2 100644 --- a/src/pdata/btree_builder.rs +++ b/src/pdata/btree_builder.rs @@ -141,16 +141,12 @@ pub struct WriteResult { loc: u64, } -/// Write a node to a free metadata block, and mark the block as reserved, -/// without increasing its reference count. -fn write_reserved_node_( - w: &mut WriteBatcher, - mut node: Node, -) -> Result { +/// Write a node to a free metadata block. 
+fn write_node_(w: &mut WriteBatcher, mut node: Node) -> Result { let keys = node.get_keys(); let first_key = *keys.first().unwrap_or(&0u64); - let b = w.reserve()?; + let b = w.alloc()?; node.set_block(b.loc); let mut cursor = Cursor::new(b.get_data()); @@ -187,7 +183,7 @@ impl NodeIO for LeafIO { values, }; - write_reserved_node_(w, node) + write_node_(w, node) } fn read(&self, w: &mut WriteBatcher, block: u64) -> Result<(Vec, Vec)> { @@ -220,7 +216,7 @@ impl NodeIO for InternalIO { values, }; - write_reserved_node_(w, node) + write_node_(w, node) } fn read(&self, w: &mut WriteBatcher, block: u64) -> Result<(Vec, Vec)> { @@ -246,6 +242,7 @@ pub struct NodeBuilder { max_entries_per_node: usize, values: VecDeque<(u64, V)>, nodes: Vec, + shared: bool, } /// When the builder is including pre-built nodes it has to decide whether @@ -265,13 +262,14 @@ pub struct NodeSummary { impl<'a, V: Pack + Unpack + Clone> NodeBuilder { /// Create a new NodeBuilder - pub fn new(nio: Box>, value_rc: Box>) -> Self { + pub fn new(nio: Box>, value_rc: Box>, shared: bool) -> Self { NodeBuilder { nio, value_rc, max_entries_per_node: calc_max_entries::(), values: VecDeque::new(), nodes: Vec::new(), + shared, } } /// Push a single value. This may emit a new node, hence the Result @@ -324,6 +322,7 @@ impl<'a, V: Pack + Unpack + Clone> NodeBuilder { // Add the remaining nodes. for i in 1..nodes.len() { let n = nodes.get(i).unwrap(); + w.sm.lock().unwrap().inc(n.block, 1)?; self.nodes.push(n.clone()); } } else { @@ -332,6 +331,7 @@ impl<'a, V: Pack + Unpack + Clone> NodeBuilder { // add the nodes for n in nodes { + w.sm.lock().unwrap().inc(n.block, 1)?; self.nodes.push(n.clone()); } } @@ -388,7 +388,7 @@ impl<'a, V: Pack + Unpack + Clone> NodeBuilder { block: wresult.loc, key: wresult.first_key, nr_entries, - shared: false, + shared: self.shared, }); Ok(()) } @@ -433,6 +433,7 @@ impl<'a, V: Pack + Unpack + Clone> NodeBuilder { fn unshift_node(&mut self, w: &mut WriteBatcher) -> Result<()> { let ls = self.nodes.pop().unwrap(); let (keys, values) = self.read_node(w, ls.block)?; + w.sm.lock().unwrap().dec(ls.block)?; let mut vals = VecDeque::new(); @@ -460,7 +461,7 @@ pub struct BTreeBuilder { impl BTreeBuilder { pub fn new(value_rc: Box>) -> BTreeBuilder { BTreeBuilder { - leaf_builder: NodeBuilder::new(Box::new(LeafIO {}), value_rc), + leaf_builder: NodeBuilder::new(Box::new(LeafIO {}), value_rc, false), } } @@ -486,10 +487,7 @@ pub fn build_btree(w: &mut WriteBatcher, leaves: Vec) -> Result 1 { - let mut builder = NodeBuilder::new( - Box::new(InternalIO {}), - Box::new(SMRefCounter::new(w.sm.clone())), - ); + let mut builder = NodeBuilder::new(Box::new(InternalIO {}), Box::new(NoopRC {}), false); for n in nodes { builder.push_value(w, n.key, n.block)?; @@ -500,13 +498,36 @@ pub fn build_btree(w: &mut WriteBatcher, leaves: Vec) -> Result( + w: &mut WriteBatcher, + leaves: &[NodeSummary], + value_rc: &mut dyn RefCounter, +) -> Result<()> { + let nio = LeafIO {}; + + for n in leaves { + let deleted = w.sm.lock().unwrap().dec(n.block)?; + if deleted { + let (_, values) = nio.read(w, n.block)?; + for v in values { + value_rc.dec(&v)?; + } + } + } + + Ok(()) +} + +//------------------------------------------ diff --git a/src/thin/restore.rs b/src/thin/restore.rs index e6a756f..a6738bb 100644 --- a/src/thin/restore.rs +++ b/src/thin/restore.rs @@ -101,7 +101,8 @@ impl<'a> Restorer<'a> { let value_rc = Box::new(MappingRC { sm: self.data_sm.as_ref().unwrap().clone(), }); - let leaf_builder = NodeBuilder::new(Box::new(LeafIO 
{}), value_rc); + let shared = matches!(section, MappedSection::Def(_)); + let leaf_builder = NodeBuilder::new(Box::new(LeafIO {}), value_rc, shared); self.current_map = Some((section, leaf_builder)); Ok(Visit::Continue) @@ -134,9 +135,26 @@ impl<'a> Restorer<'a> { Ok((details_root, mapping_root)) } + // Release the temporary references to the leaves of pre-built subtrees. + // The contained child values will also be decreased if the leaf is + // no longer referenced. + fn release_subtrees(&mut self) -> Result<()> { + let mut value_rc = MappingRC { + sm: self.data_sm.as_ref().unwrap().clone(), + }; + + for (_, leaves) in self.sub_trees.iter() { + release_leaves(self.w, &leaves, &mut value_rc)?; + } + + Ok(()) + } + fn finalize(&mut self) -> Result<()> { let (details_root, mapping_root) = self.build_device_details()?; + self.release_subtrees()?; + // Build data space map let data_sm = self.data_sm.as_ref().unwrap(); let data_sm_root = build_data_sm(self.w, data_sm.lock().unwrap().deref())?; diff --git a/src/write_batcher.rs b/src/write_batcher.rs index 2300c87..9e64577 100644 --- a/src/write_batcher.rs +++ b/src/write_batcher.rs @@ -18,9 +18,11 @@ pub struct WriteBatcher { batch_size: usize, queue: Vec, - // The reserved range covers all the blocks allocated or reserved by this - // WriteBatcher, and the blocks already occupied. No blocks in this range - // are expected to be freed, hence a single range is used for the representation. + // The reserved range keeps track of all the blocks allocated. + // An allocated block won't be reused even though it was freed. + // In other words, the WriteBatcher performs allocation in + // transactional fashion, that simplifies block allocationas + // as well as tracking. reserved: std::ops::Range, } @@ -83,22 +85,6 @@ impl WriteBatcher { Ok(Block::zeroed(b)) } - pub fn reserve(&mut self) -> Result { - let mut sm = self.sm.lock().unwrap(); - let b = find_free(sm.deref_mut(), &self.reserved)?; - self.reserved.end = b + 1; - - Ok(Block::new(b)) - } - - pub fn reserve_zeroed(&mut self) -> Result { - let mut sm = self.sm.lock().unwrap(); - let b = find_free(sm.deref_mut(), &self.reserved)?; - self.reserved.end = b + 1; - - Ok(Block::zeroed(b)) - } - pub fn get_reserved_range(&self) -> std::ops::Range { std::ops::Range { start: self.reserved.start, From e158dc7601dcd47a4523072885f0f7b3d8c99cc0 Mon Sep 17 00:00:00 2001 From: Ming-Hung Tsai Date: Fri, 30 Jul 2021 22:37:01 +0800 Subject: [PATCH 3/5] [btree_builder] Fix issues with under populated shared nodes A pre-built node could be under populated (less than half-full) due to following reasons: - A single shared leaf generated by dm-thin key removal. The residency could drop below 50%, until it reaches the merge threshold (33% or 44%, depends on its context). - A shared root, which could have any possible nr_entries. - Underfull shared nodes (less than 33% residency) caused by kernel issues. To avoid producing under populated nodes, those kinds of pre-built nodes, except the roots, will be merged into their siblings. --- src/pdata/btree_builder.rs | 89 ++++++++++++++++++++++++++++---------- 1 file changed, 66 insertions(+), 23 deletions(-) diff --git a/src/pdata/btree_builder.rs b/src/pdata/btree_builder.rs index f6c62f2..2054030 100644 --- a/src/pdata/btree_builder.rs +++ b/src/pdata/btree_builder.rs @@ -272,13 +272,19 @@ impl<'a, V: Pack + Unpack + Clone> NodeBuilder { shared, } } + /// Push a single value. This may emit a new node, hence the Result /// return type. 
The value's ref count will be incremented. pub fn push_value(&mut self, w: &mut WriteBatcher, key: u64, val: V) -> Result<()> { + // Unshift the previously pushed node since it is not the root + let half_full = self.max_entries_per_node / 2; + if self.nodes.len() == 1 && (self.nodes.last().unwrap().nr_entries < half_full) { + self.unshift_node(w)?; + } // Have we got enough values to emit a node? We try and keep // at least max_entries_per_node entries unflushed so we // can ensure the final node is balanced properly. - if self.values.len() == self.max_entries_per_node * 2 { + else if self.values.len() == self.max_entries_per_node * 2 { self.emit_node(w)?; } @@ -287,6 +293,19 @@ impl<'a, V: Pack + Unpack + Clone> NodeBuilder { Ok(()) } + // To avoid writing an under populated node we have to grab some + // values from the first of the shared nodes. + fn append_values(&mut self, w: &mut WriteBatcher, node: &NodeSummary) -> Result<()> { + let (keys, values) = self.read_node(w, node.block)?; + + for i in 0..keys.len() { + self.value_rc.inc(&values[i])?; + self.values.push_back((keys[i], values[i].clone())); + } + + Ok(()) + } + /// Push a number of prebuilt, shared nodes. The builder may decide to not /// use a shared node, instead reading the values and packing them /// directly. This may do IO to emit nodes, so returns a Result. @@ -296,43 +315,67 @@ impl<'a, V: Pack + Unpack + Clone> NodeBuilder { pub fn push_nodes(&mut self, w: &mut WriteBatcher, nodes: &[NodeSummary]) -> Result<()> { assert!(!nodes.is_empty()); + // Assume that the node is a shared root if it is the first comer. + // A rooted leaf could have any number of entries. + let maybe_root = (nodes.len() == 1) && self.nodes.is_empty() && self.values.is_empty(); + if maybe_root { + let n = &nodes[0]; + w.sm.lock().unwrap().inc(n.block, 1)?; + self.nodes.push(n.clone()); + return Ok(()); + } + // As a sanity check we make sure that all the shared nodes contain the // minimum nr of entries. + // A single shared node could be possibly under populated (less than half-full) + // due to btree removal, or even underfull (<33% residency) due to kernel issues. + // Those kinds of nodes will be merged into their siblings. let half_full = self.max_entries_per_node / 2; - for n in nodes { - if n.nr_entries < half_full { - panic!("under populated node"); + if nodes.len() > 1 { + for n in nodes { + if n.nr_entries < half_full { + panic!("under populated node"); + } } } + // Unshift the previously pushed node since it is not the root + if self.nodes.len() == 1 && (self.nodes.last().unwrap().nr_entries < half_full) { + self.unshift_node(w)?; + } + // Decide if we're going to use the pre-built nodes. if !self.values.is_empty() && (self.values.len() < half_full) { - // To avoid writing an under populated node we have to grab some - // values from the first of the shared nodes. - let (keys, values) = self.read_node(w, nodes.get(0).unwrap().block)?; + let mut nodes_iter = nodes.iter(); + let n = nodes_iter.next(); + self.append_values(w, n.unwrap())?; - for i in 0..keys.len() { - self.value_rc.inc(&values[i])?; - self.values.push_back((keys[i], values[i].clone())); - } + // Do not flush if there's no succeeding nodes, + // so that it could produce a more compact metadata. + if nodes.len() > 1 { + // Flush all the values. + self.emit_all(w)?; - // Flush all the values. - self.emit_all(w)?; - - // Add the remaining nodes. 
- for i in 1..nodes.len() { - let n = nodes.get(i).unwrap(); - w.sm.lock().unwrap().inc(n.block, 1)?; - self.nodes.push(n.clone()); + // Add the remaining nodes. + for n in nodes_iter { + w.sm.lock().unwrap().inc(n.block, 1)?; + self.nodes.push(n.clone()); + } } } else { // Flush all the values. self.emit_all(w)?; - // add the nodes - for n in nodes { - w.sm.lock().unwrap().inc(n.block, 1)?; - self.nodes.push(n.clone()); + if nodes[0].nr_entries < half_full { + // An under populated nodes[0] implies nodes.len() == 1, + // and that has to be merged into their siblings. + self.append_values(w, &nodes[0])?; + } else { + // Add the nodes. + for n in nodes { + w.sm.lock().unwrap().inc(n.block, 1)?; + self.nodes.push(n.clone()); + } } } From c16740321238bafb517479b0b2bbbeb522d381be Mon Sep 17 00:00:00 2001 From: Ming-Hung Tsai Date: Tue, 3 Aug 2021 18:14:32 +0800 Subject: [PATCH 4/5] [space_map (rust)] Fix the maximum value of reference counts --- src/pdata/space_map.rs | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/src/pdata/space_map.rs b/src/pdata/space_map.rs index 9e680ad..7b1cf50 100644 --- a/src/pdata/space_map.rs +++ b/src/pdata/space_map.rs @@ -1,6 +1,8 @@ use anyhow::Result; use fixedbitset::FixedBitSet; +use num_traits::Bounded; use std::boxed::Box; +use std::convert::{TryFrom, TryInto}; use std::sync::{Arc, Mutex}; //------------------------------------------ @@ -61,7 +63,16 @@ where impl SpaceMap for CoreSpaceMap where - V: Copy + Default + Eq + std::ops::AddAssign + From + Into, + V: Copy + + Default + + Eq + + std::ops::AddAssign + + From + + Into + + Bounded + + TryFrom + + std::cmp::PartialOrd, + >::Error: std::fmt::Debug, { fn get_nr_blocks(&self) -> Result { Ok(self.counts.len() as u64) @@ -77,8 +88,8 @@ where fn set(&mut self, b: u64, v: u32) -> Result { let old = self.counts[b as usize]; - assert!(v < 0xff); // FIXME: we can't assume this - self.counts[b as usize] = V::from(v as u8); + assert!(v <= V::max_value().into()); + self.counts[b as usize] = v.try_into().unwrap(); // FIXME: do not panic if old == V::from(0u8) && v != 0 { self.nr_allocated += 1; @@ -91,12 +102,14 @@ where fn inc(&mut self, begin: u64, len: u64) -> Result<()> { for b in begin..(begin + len) { - if self.counts[b as usize] == V::from(0u8) { + let c = &mut self.counts[b as usize]; + assert!(*c < V::max_value()); + if *c == V::from(0u8) { // FIXME: can we get a ref to save dereferencing counts twice? 
self.nr_allocated += 1; - self.counts[b as usize] = V::from(1u8); + *c = V::from(1u8); } else { - self.counts[b as usize] += V::from(1u8); + *c += V::from(1u8); } } Ok(()) From ec44d043e80741b86eb07da6278547357535e4bd Mon Sep 17 00:00:00 2001 From: Ming-Hung Tsai Date: Thu, 5 Aug 2021 12:28:10 +0800 Subject: [PATCH 5/5] [space_map (rust)] Restrict metadata space map size --- src/cache/repair.rs | 4 ++-- src/cache/restore.rs | 3 +-- src/pdata/space_map_metadata.rs | 12 ++++++++++++ src/thin/repair.rs | 4 ++-- src/thin/restore.rs | 2 +- 5 files changed, 18 insertions(+), 7 deletions(-) diff --git a/src/cache/repair.rs b/src/cache/repair.rs index 1273da7..30650b4 100644 --- a/src/cache/repair.rs +++ b/src/cache/repair.rs @@ -6,7 +6,7 @@ use crate::cache::dump::*; use crate::cache::restore::*; use crate::cache::superblock::*; use crate::io_engine::*; -use crate::pdata::space_map::*; +use crate::pdata::space_map_metadata::*; use crate::report::*; use crate::write_batcher::*; @@ -54,7 +54,7 @@ pub fn repair(opts: CacheRepairOptions) -> Result<()> { let sb = read_superblock(ctx.engine_in.as_ref(), SUPERBLOCK_LOCATION)?; - let sm = core_sm(ctx.engine_out.get_nr_blocks(), u32::MAX); + let sm = core_metadata_sm(ctx.engine_out.get_nr_blocks(), u32::MAX); let mut w = WriteBatcher::new( ctx.engine_out.clone(), sm.clone(), diff --git a/src/cache/restore.rs b/src/cache/restore.rs index 067462a..7997e94 100644 --- a/src/cache/restore.rs +++ b/src/cache/restore.rs @@ -14,7 +14,6 @@ use crate::cache::xml; use crate::io_engine::*; use crate::math::*; use crate::pdata::array_builder::*; -use crate::pdata::space_map::*; use crate::pdata::space_map_metadata::*; use crate::pdata::unpack::Pack; use crate::report::*; @@ -259,7 +258,7 @@ pub fn restore(opts: CacheRestoreOptions) -> Result<()> { let ctx = mk_context(&opts)?; - let sm = core_sm(ctx.engine.get_nr_blocks(), u32::MAX); + let sm = core_metadata_sm(ctx.engine.get_nr_blocks(), u32::MAX); let mut w = WriteBatcher::new(ctx.engine.clone(), sm.clone(), ctx.engine.get_batch_size()); // build cache mappings diff --git a/src/pdata/space_map_metadata.rs b/src/pdata/space_map_metadata.rs index 9002ca0..39728b5 100644 --- a/src/pdata/space_map_metadata.rs +++ b/src/pdata/space_map_metadata.rs @@ -2,9 +2,11 @@ use anyhow::{anyhow, Result}; use byteorder::{LittleEndian, WriteBytesExt}; use nom::{number::complete::*, IResult}; use std::io::Cursor; +use std::sync::{Arc, Mutex}; use crate::checksum; use crate::io_engine::*; +use crate::pdata::space_map::*; use crate::pdata::space_map_common::*; use crate::pdata::unpack::*; use crate::write_batcher::*; @@ -12,6 +14,7 @@ use crate::write_batcher::*; //------------------------------------------ const MAX_METADATA_BITMAPS: usize = 255; +const MAX_METADATA_BLOCKS: usize = MAX_METADATA_BITMAPS * ENTRIES_PER_BITMAP; //------------------------------------------ @@ -102,6 +105,15 @@ fn adjust_counts( }) } +//------------------------------------------ + +pub fn core_metadata_sm(nr_blocks: u64, max_count: u32) -> Arc> { + core_sm( + std::cmp::min(nr_blocks, MAX_METADATA_BLOCKS as u64), + max_count, + ) +} + pub fn write_metadata_sm(w: &mut WriteBatcher) -> Result { let r1 = w.get_reserved_range(); diff --git a/src/thin/repair.rs b/src/thin/repair.rs index d4ad6ff..4f74371 100644 --- a/src/thin/repair.rs +++ b/src/thin/repair.rs @@ -3,7 +3,7 @@ use std::path::Path; use std::sync::Arc; use crate::io_engine::*; -use crate::pdata::space_map::*; +use crate::pdata::space_map_metadata::*; use crate::report::*; use crate::thin::dump::*; 
use crate::thin::metadata::*; @@ -57,7 +57,7 @@ pub fn repair(opts: ThinRepairOptions) -> Result<()> { let md = build_metadata(ctx.engine_in.clone(), &sb)?; let md = optimise_metadata(md)?; - let sm = core_sm(ctx.engine_out.get_nr_blocks(), u32::MAX); + let sm = core_metadata_sm(ctx.engine_out.get_nr_blocks(), u32::MAX); let mut w = WriteBatcher::new( ctx.engine_out.clone(), sm.clone(), diff --git a/src/thin/restore.rs b/src/thin/restore.rs index a6738bb..a1fbb6a 100644 --- a/src/thin/restore.rs +++ b/src/thin/restore.rs @@ -351,7 +351,7 @@ pub fn restore(opts: ThinRestoreOptions) -> Result<()> { let ctx = new_context(&opts)?; let max_count = u32::MAX; - let sm = core_sm(ctx.engine.get_nr_blocks(), max_count); + let sm = core_metadata_sm(ctx.engine.get_nr_blocks(), max_count); let mut w = WriteBatcher::new(ctx.engine.clone(), sm.clone(), ctx.engine.get_batch_size()); let mut restorer = Restorer::new(&mut w, ctx.report); xml::read(input, &mut restorer)?;
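
A note on the ref-counting scheme introduced by patch 2: a pre-built (shared) subtree holds one temporary reference on each of its nodes while devices are being built, every device that reuses a leaf adds a further reference, and finalize() drops the temporary references again, decrementing the values held by a leaf only if that leaf actually becomes unreferenced. Below is a minimal, self-contained model of that counting discipline; ToySpaceMap and its methods are invented for illustration and are not the SpaceMap/WriteBatcher API used by the patches.

use std::collections::HashMap;

// Toy block ref-counter: block number -> reference count.
#[derive(Default)]
struct ToySpaceMap {
    counts: HashMap<u64, u32>,
}

impl ToySpaceMap {
    fn inc(&mut self, b: u64) {
        *self.counts.entry(b).or_insert(0) += 1;
    }

    // Returns true if the block became free.
    fn dec(&mut self, b: u64) -> bool {
        let c = self.counts.get_mut(&b).expect("dec of unreferenced block");
        *c -= 1;
        *c == 0
    }
}

fn main() {
    let mut sm = ToySpaceMap::default();
    let shared_leaf = 123u64;

    // Building a <def> section: the pre-built leaf takes an initial,
    // temporary reference.
    sm.inc(shared_leaf);

    // Two devices reuse the shared leaf; each adds a reference of its own.
    sm.inc(shared_leaf);
    sm.inc(shared_leaf);

    // finalize(): release the temporary reference taken while pre-building.
    let freed = sm.dec(shared_leaf);

    // The leaf is still referenced by the two devices, so the values it
    // contains must not be decremented.
    assert!(!freed);
    assert_eq!(sm.counts[&shared_leaf], 2);
}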
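Patch 4 replaces the hard-coded `assert!(v < 0xff)` in CoreSpaceMap::set() with a bound derived from the counter type, and makes inc() refuse to step a counter past its maximum. The following sketch shows the same checks specialised to u8 counters for brevity; the real CoreSpaceMap is generic over the counter type and uses num-traits' Bounded/TryFrom as in the diff.

// set(): the new value must fit in the counter type.
fn set_count(counts: &mut [u8], b: usize, v: u32) -> u32 {
    let old = counts[b] as u32;
    assert!(v <= u8::MAX as u32);
    counts[b] = v as u8;
    old
}

// inc(): never overflow the counter.
fn inc_count(counts: &mut [u8], b: usize) {
    let c = &mut counts[b];
    assert!(*c < u8::MAX);
    *c += 1;
}

fn main() {
    let mut counts = vec![0u8; 4];
    set_count(&mut counts, 0, 255); // 255 == u8::MAX is now accepted
    inc_count(&mut counts, 1);
    assert_eq!((counts[0], counts[1]), (255, 1));
}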
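Patch 5 caps the in-core metadata space map at MAX_METADATA_BITMAPS * ENTRIES_PER_BITMAP blocks, matching the number of bitmap blocks the on-disk metadata index can describe. A sketch of the clamp follows; entries_per_bitmap is passed in as a parameter because its value depends on the metadata block size (for 4KiB metadata blocks it works out to 16320, putting the cap at roughly the usual 16GiB dm-thin metadata limit).

const MAX_METADATA_BITMAPS: u64 = 255;

// Largest number of metadata blocks the space map index can describe.
fn max_metadata_blocks(entries_per_bitmap: u64) -> u64 {
    MAX_METADATA_BITMAPS * entries_per_bitmap
}

// The clamp applied by core_metadata_sm() before building the in-core map.
fn clamped_nr_blocks(nr_blocks: u64, entries_per_bitmap: u64) -> u64 {
    std::cmp::min(nr_blocks, max_metadata_blocks(entries_per_bitmap))
}

fn main() {
    let entries_per_bitmap = 16_320; // assumed: 4KiB metadata blocks, (4096 - 16) * 4

    // A device claiming more metadata blocks than the index can hold is
    // clamped down to the maximum.
    assert_eq!(
        clamped_nr_blocks(1 << 30, entries_per_bitmap),
        255 * entries_per_bitmap
    );
}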