[thin_metadata_pack/unpack] Replace C++ implementation with a Rust one.

The Rust implementation is multithreaded, performs better in general and
does custom compression of btree nodes to achieve much better compression
ratios.  unpack also checksums expanded metadata to validate it.

The format version has jumped to 3 with no backwards compatibility, but I
think that's ok since we never made a release that contained the C++
version of these tools.

Benchmarks
==========

On an 8 core, 16 hyperthread machine.

metadata 1G, full:

      Pack size    pack time     unpack time
------------------------------------------------------
C++      193M        50.3s          6.9s (no verify)
Rust      70M         1.4s          1.8s (verify)

metadata 16G, sparse:

       Pack size    pack time     unpack time
------------------------------------------------------
C++      21M          68s           1s   (no verify)
Rust      4M           8.6s         0.5s (verify)
This commit is contained in:
Joe Thornber
2020-06-09 09:15:00 +01:00
parent c48851e747
commit 61de3f9287
21 changed files with 1818 additions and 376 deletions

View File

@ -0,0 +1,32 @@
extern crate clap;
extern crate thinp;
use clap::{App, Arg};
use std::process;
fn main() {
let parser = App::new("thin_metadata_pack")
.version("0.8.5") // FIXME: use actual version
.about("Produces a compressed file of thin metadata. Only packs metadata blocks that are actually used.")
.arg(Arg::with_name("INPUT")
.help("Specify thinp metadata binary device/file")
.required(true)
.short("i")
.value_name("DEV")
.takes_value(true))
.arg(Arg::with_name("OUTPUT")
.help("Specify packed output file")
.required(true)
.short("o")
.value_name("FILE")
.takes_value(true));
let matches = parser.get_matches();
let input_file = matches.value_of("INPUT").unwrap();
let output_file = matches.value_of("OUTPUT").unwrap();
if let Err(reason) = thinp::pack::pack::pack(&input_file, &output_file) {
println!("Application error: {}\n", reason);
process::exit(1);
}
}

View File

@ -0,0 +1,33 @@
extern crate clap;
extern crate thinp;
use clap::{App, Arg};
use std::process;
fn main() {
let parser = App::new("thin_metadata_unpack")
.version("0.8.5") // FIXME: use actual version
.about("Unpack a compressed file of thin metadata.")
.arg(Arg::with_name("INPUT")
.help("Specify thinp metadata binary device/file")
.required(true)
.short("i")
.value_name("DEV")
.takes_value(true))
.arg(Arg::with_name("OUTPUT")
.help("Specify packed output file")
.required(true)
.short("o")
.value_name("FILE")
.takes_value(true));
let matches = parser.get_matches();
let input_file = matches.value_of("INPUT").unwrap();
let output_file = matches.value_of("OUTPUT").unwrap();
if let Err(reason) = thinp::pack::pack::unpack(&input_file, &output_file) {
println!("Application error: {}", reason);
process::exit(1);
}
}

51
src/block_manager.rs Normal file
View File

@ -0,0 +1,51 @@
use std::io;
use std::io::{Read, Seek};
use std::fs::OpenOptions;
use std::os::unix::fs::OpenOptionsExt;
use std::fs::File;
pub const BLOCK_SIZE: usize = 4096;

/// A single 4k metadata block.  The 4096-byte alignment satisfies the
/// buffer-alignment requirement of the O_DIRECT reads in `BlockManager`.
#[repr(align(4096))]
pub struct Block {
    // BLOCK_SIZE is already usize; the old `as usize` cast was redundant.
    pub data: [u8; BLOCK_SIZE],
}
/// Minimal read-only block access to a metadata device/file.  Despite the
/// name there is no caching yet (`new` ignores its cache-size argument).
pub struct BlockManager {
    // Whole 4k blocks available in the underlying file.
    pub nr_blocks: u64,
    input: File,
}
/// Counts the whole 4k blocks in the file at `path`; a partial trailing
/// block is ignored.
fn get_nr_blocks(path: &str) -> io::Result<u64> {
    let len = std::fs::metadata(path)?.len();
    Ok(len / (BLOCK_SIZE as u64))
}
impl BlockManager {
    /// Opens `path` for direct (uncached) reading.  `_cache_size` is
    /// accepted for interface compatibility but currently unused.
    pub fn new(path: &str, _cache_size: usize) -> io::Result<BlockManager> {
        let input = OpenOptions::new()
            .read(true)
            .write(false)
            // O_DIRECT bypasses the page cache; Block's 4096-byte
            // alignment satisfies the direct-I/O buffer requirement.
            .custom_flags(libc::O_DIRECT)
            .open(path)?;

        Ok(BlockManager {
            nr_blocks: get_nr_blocks(path)?,
            // Field-init shorthand (was `input: input`).
            input,
        })
    }

    /// Reads block `b`.
    pub fn get(&mut self, b: u64) -> io::Result<Block> {
        self.read_block(b)
    }

    fn read_block(&mut self, b: u64) -> io::Result<Block> {
        let mut buf = Block { data: [0; BLOCK_SIZE] };
        self.input.seek(io::SeekFrom::Start(b * (BLOCK_SIZE as u64)))?;
        self.input.read_exact(&mut buf.data)?;
        Ok(buf)
    }
}

13
src/check.rs Normal file
View File

@ -0,0 +1,13 @@
use std::error::Error;
use crate::block_manager::BlockManager;
/// Sanity check: reads the first blocks of the metadata device to confirm
/// it can be opened and read.
pub fn check(dev: &str) -> Result<(), Box<dyn Error>> {
    let mut bm = BlockManager::new(dev, 1024)?;

    // Clamp to the device size; the old unconditional 0..100 failed on
    // devices smaller than 100 blocks.
    for b in 0..std::cmp::min(100, bm.nr_blocks) {
        let _block = bm.get(b)?;
    }
    Ok(())
}

16
src/lib.rs Normal file
View File

@ -0,0 +1,16 @@
extern crate byteorder;
extern crate crc32c;
extern crate flate2;
extern crate nom;
extern crate num_cpus;

// Test-only crates; `macro_use` pulls in the `quickcheck` attribute macro.
// (The duplicated `#[cfg(test)]` on quickcheck_macros has been removed.)
#[cfg(test)]
extern crate quickcheck;
#[cfg(test)]
#[macro_use(quickcheck)]
extern crate quickcheck_macros;

pub mod block_manager;
pub mod check;
pub mod pack;

169
src/pack/delta_list.rs Normal file
View File

@ -0,0 +1,169 @@
//-------------------------------------------------
/// Run-length encoding of a sequence of u64s: a starting value followed by
/// runs of equal (`Const`), increasing (`Pos`) or decreasing (`Neg`)
/// arithmetic steps.
#[derive(PartialEq, Debug, Clone)]
pub enum Delta {
    Base { n: u64 },
    Const { count: u64 },
    Pos { delta: u64, count: u64 },
    Neg { delta: u64, count: u64 },
}

use Delta::*;

/// Converts `ns` to its delta representation.  Returns an empty vec for
/// empty input.
pub fn to_delta(ns: &[u64]) -> Vec<Delta> {
    let mut ds = Vec::new();

    if !ns.is_empty() {
        let mut base = ns[0];
        ds.push(Base { n: base });

        let mut i = 1;
        while i < ns.len() {
            let n = ns[i];
            if n > base {
                // Ascending run: greedily extend while the arithmetic
                // progression base + k * delta holds.
                let delta = n - base;
                let mut count = 1;
                while i < ns.len() && (ns[i] == (base + (count * delta))) {
                    i += 1;
                    count += 1;
                }
                count -= 1;
                ds.push(Pos { delta, count });
                base += delta * count;
            } else if n < base {
                // Descending run.
                let delta = base - n;
                let mut count = 1;
                while i < ns.len() && (ns[i] + (count * delta) == base) {
                    i += 1;
                    count += 1;
                }
                count -= 1;
                ds.push(Neg { delta, count });
                base -= delta * count;
            } else {
                // Run of values equal to base.
                let mut count = 1;
                while i < ns.len() && ns[i] == base {
                    i += 1;
                    count += 1;
                }
                count -= 1;
                ds.push(Const { count });
            }
        }
    }

    ds
}
#[cfg(test)]
mod tests {
use super::*;
fn from_delta(ds: &[Delta]) -> Vec<u64> {
let mut ns: Vec<u64> = Vec::new();
let mut base = 0u64;
for d in ds {
match d {
Base { n } => {
ns.push(*n);
base = *n;
}
Const { count } => {
for _ in 0..*count {
ns.push(base);
}
}
Pos { delta, count } => {
for _ in 0..*count {
base += delta;
ns.push(base);
}
}
Neg { delta, count } => {
for _ in 0..*count {
assert!(base >= *delta);
base -= delta;
ns.push(base);
}
}
}
}
ns
}
#[test]
fn test_to_delta() {
struct TestCase(Vec<u64>, Vec<Delta>);
let cases = [
TestCase(vec![], vec![]),
TestCase(vec![1], vec![Base { n: 1 }]),
TestCase(vec![1, 2], vec![Base { n: 1 }, Pos { delta: 1, count: 1 }]),
TestCase(
vec![1, 2, 3, 4],
vec![Base { n: 1 }, Pos { delta: 1, count: 3 }],
),
TestCase(
vec![2, 4, 6, 8],
vec![Base { n: 2 }, Pos { delta: 2, count: 3 }],
),
TestCase(
vec![7, 14, 21, 28],
vec![Base { n: 7 }, Pos { delta: 7, count: 3 }],
),
TestCase(
vec![10, 9],
vec![Base { n: 10 }, Neg { delta: 1, count: 1 }],
),
TestCase(
vec![10, 9, 8, 7],
vec![Base { n: 10 }, Neg { delta: 1, count: 3 }],
),
TestCase(
vec![10, 8, 6, 4],
vec![Base { n: 10 }, Neg { delta: 2, count: 3 }],
),
TestCase(
vec![28, 21, 14, 7],
vec![Base { n: 28 }, Neg { delta: 7, count: 3 }],
),
TestCase(
vec![42, 42, 42, 42],
vec![Base { n: 42 }, Const { count: 3 }],
),
TestCase(
vec![1, 2, 3, 10, 20, 30, 40, 38, 36, 34, 0, 0, 0, 0],
vec![
Base { n: 1 },
Pos { delta: 1, count: 2 },
Pos { delta: 7, count: 1 },
Pos {
delta: 10,
count: 3,
},
Neg { delta: 2, count: 3 },
Neg {
delta: 34,
count: 1,
},
Const { count: 3 },
],
),
];
for t in &cases {
assert_eq!(to_delta(&t.0), t.1);
assert_eq!(from_delta(&t.1), t.0);
}
}
}
//-------------------------------------------------

6
src/pack/mod.rs Normal file
View File

@ -0,0 +1,6 @@
// Internal helpers; only the `pack` entry points are exported.
mod delta_list;
mod node_encode;
mod vm;

pub mod pack;

127
src/pack/node_encode.rs Normal file
View File

@ -0,0 +1,127 @@
use std::{io, io::Write};
use nom::{bytes::complete::*, number::complete::*, IResult};
use crate::pack::vm::*;
//-------------------------------------------
/// Error type for block packing: a nom parse failure or an I/O failure.
/// Note the underlying cause is not retained.
#[derive(Debug)]
pub enum PackError {
    ParseError,
    IOError,
}

impl std::error::Error for PackError {}

/// Result alias used throughout the pack code.
pub type PResult<T> = Result<T, PackError>;
/// Converts a nom parse result to a `PResult`, discarding the nom error
/// detail.
fn nom_to_pr<T>(r: IResult<&[u8], T>) -> PResult<(&[u8], T)> {
    r.map_err(|_| PackError::ParseError)
}
/// Converts an io result to a `PResult`, discarding the io error detail.
fn io_to_pr<T>(r: io::Result<T>) -> PResult<T> {
    r.map_err(|_| PackError::IOError)
}
//-------------------------------------------
// User-facing messages for PackError.
impl std::fmt::Display for PackError {
    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
        match self {
            PackError::ParseError => write!(f, "parse error"),
            PackError::IOError => write!(f, "IO error"),
        }
    }
}
// Parses exactly `count` little-endian u64s.
fn run64(i: &[u8], count: usize) -> IResult<&[u8], Vec<u64>> {
    nom::multi::many_m_n(count, count, le_u64)(i)
}
// The btree-node header fields that decide how the node is packed.
struct NodeSummary {
    is_leaf: bool,
    max_entries: usize,
    value_size: usize
}
// Parses the fixed node header (csum, flags, blocknr, nr_entries,
// max_entries, value_size, padding) and keeps the fields we need.
fn summarise_node(data: &[u8]) -> IResult<&[u8], NodeSummary> {
    let (i, _csum) = le_u32(data)?;
    let (i, flags) = le_u32(i)?;
    let (i, _blocknr) = le_u64(i)?;
    let (i, _nr_entries) = le_u32(i)?;
    let (i, max_entries) = le_u32(i)?;
    let (i, value_size) = le_u32(i)?;
    let (i, _padding) = le_u32(i)?;

    Ok((i, NodeSummary {
        // flags == 2 treated as "leaf" -- TODO confirm against the
        // kernel's btree node flag definitions.
        is_leaf: flags == 2,
        max_entries: max_entries as usize,
        value_size: value_size as usize,
    }))
}
pub fn pack_btree_node<W: Write>(w: &mut W, data: &[u8]) -> PResult<()> {
let (_, info) = nom_to_pr(summarise_node(data))?;
if info.is_leaf {
if info.value_size == std::mem::size_of::<u64>() {
let (i, hdr) = nom_to_pr(take(32usize)(data))?;
let (i, keys) = nom_to_pr(run64(i, info.max_entries))?;
let (tail, values) = nom_to_pr(run64(i, info.max_entries))?;
io_to_pr(pack_literal(w, hdr))?;
io_to_pr(pack_u64s(w, &keys))?;
io_to_pr(pack_shifted_u64s(w, &values))?;
if tail.len() > 0 {
io_to_pr(pack_literal(w, tail))?;
}
return Ok(());
} else {
// We don't bother packing the values if they aren't u64
let (i, hdr) = nom_to_pr(take(32usize)(data))?;
let (tail, keys) = nom_to_pr(run64(i, info.max_entries))?;
io_to_pr(pack_literal(w, hdr))?;
io_to_pr(pack_u64s(w, &keys))?;
io_to_pr(pack_literal(w, tail))?;
return Ok(());
}
} else {
// Internal node, values are also u64s
let (i, hdr) = nom_to_pr(take(32usize)(data))?;
let (i, keys) = nom_to_pr(run64(i, info.max_entries))?;
let (tail, values) = nom_to_pr(run64(i, info.max_entries))?;
io_to_pr(pack_literal(w, hdr))?;
io_to_pr(pack_u64s(w, &keys))?;
io_to_pr(pack_u64s(w, &values))?;
if tail.len() > 0 {
io_to_pr(pack_literal(w, tail))?;
}
return Ok(());
}
}
/// Superblocks are rare (one per device), so no custom packing: emit as a
/// literal and let the zlib layer compress it.
pub fn pack_superblock<W: Write>(w: &mut W, bytes: &[u8]) -> PResult<()> {
    io_to_pr(pack_literal(w, bytes))
}

/// Bitmap blocks are emitted as literals; zlib handles them well enough.
pub fn pack_bitmap<W: Write>(w: &mut W, bytes: &[u8]) -> PResult<()> {
    io_to_pr(pack_literal(w, bytes))
}

/// Index blocks are emitted as literals.
pub fn pack_index<W: Write>(w: &mut W, bytes: &[u8]) -> PResult<()> {
    io_to_pr(pack_literal(w, bytes))
}
//-------------------------------------

350
src/pack/pack.rs Normal file
View File

@ -0,0 +1,350 @@
use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
use flate2::{read::ZlibDecoder, write::ZlibEncoder, Compression};
use std::os::unix::fs::OpenOptionsExt;
use std::{
error::Error,
fs::OpenOptions,
io,
io::prelude::*,
io::Cursor,
io::Write,
ops::DerefMut,
sync::{Arc, Mutex},
thread::spawn,
};
use rand::prelude::*;
use std::sync::mpsc::{sync_channel, Receiver};
use crate::pack::node_encode::*;
const BLOCK_SIZE: u64 = 4096;

// Pack file identification; version 3 is the first Rust-era format.
const MAGIC: u64 = 0xa537a0aa6309ef77;
const PACK_VERSION: u64 = 3;

// Each metadata block stores (crc ^ salt) in its first le32; the salt
// differs per block type, which is how metadata_block_type() below
// recognises blocks.
const SUPERBLOCK_CSUM_XOR: u32 = 160774;
const BITMAP_CSUM_XOR: u32 = 240779;
const INDEX_CSUM_XOR: u32 = 160478;
const BTREE_CSUM_XOR: u32 = 121107;
fn shuffle<T>(v: &mut Vec<T>) {
let mut rng = rand::thread_rng();
v.shuffle(&mut rng);
}
// FIXME: move to a utils module
/// Integer division of `n` by `d`, rounding up.
fn div_up(n: u64, d: u64) -> u64 {
    let q = n / d;
    if n % d == 0 {
        q
    } else {
        q + 1
    }
}
// Each thread processes multiple contiguous runs of blocks, called
// chunks. Chunks are shuffled so each thread gets chunks spread
// across the dev in case there are large regions that don't contain
// metadata.
fn mk_chunk_vecs(nr_blocks: u64, nr_jobs: u64) -> Vec<Vec<(u64, u64)>> {
    use std::cmp::{max, min};

    // Aim for ~64 chunks per job, bounded to [128, 4096] blocks.
    let chunk_size = min(4 * 1024u64, max(128u64, nr_blocks / (nr_jobs * 64)));
    let nr_chunks = div_up(nr_blocks, chunk_size);

    let mut chunks = Vec::with_capacity(nr_chunks as usize);
    for i in 0..nr_chunks {
        // Clamp the final chunk's upper bound: the old unclamped
        // (i + 1) * chunk_size could exceed nr_blocks, making crunch()
        // attempt to read past the end of the device.
        chunks.push((i * chunk_size, min((i + 1) * chunk_size, nr_blocks)));
    }

    shuffle(&mut chunks);

    // Deal chunks out to the jobs round-robin.
    let mut vs = vec![Vec::new(); nr_jobs as usize];
    for (c, chunk) in chunks.into_iter().enumerate() {
        vs[c % nr_jobs as usize].push(chunk);
    }

    vs
}
/// Packs the metadata at `input_file` into the compressed pack format at
/// `output_file`, using one worker thread per cpu (capped by device size).
pub fn pack(input_file: &str, output_file: &str) -> Result<(), Box<dyn Error>> {
    let nr_blocks = get_nr_blocks(&input_file)?;
    // At least one job; no more than one job per 128 blocks.
    let nr_jobs = std::cmp::max(1, std::cmp::min(num_cpus::get() as u64, nr_blocks / 128));
    let chunk_vecs = mk_chunk_vecs(nr_blocks, nr_jobs);

    // NOTE(review): O_EXCL without O_CREAT — presumably to refuse opening
    // a block device that's currently in use; confirm intent.
    let input = OpenOptions::new()
        .read(true)
        .write(false)
        .custom_flags(libc::O_EXCL)
        .open(input_file)?;

    let output = OpenOptions::new()
        .read(false)
        .write(true)
        .create(true)
        .truncate(true)
        .open(output_file)?;

    write_header(&output, nr_blocks)?;

    // Workers share both file handles behind mutexes; each takes the lock
    // only for bulk reads / compressed-chunk writes.
    let sync_input = Arc::new(Mutex::new(input));
    let sync_output = Arc::new(Mutex::new(output));

    let mut threads = Vec::new();
    for job in 0..nr_jobs {
        let sync_input = Arc::clone(&sync_input);
        let sync_output = Arc::clone(&sync_output);
        let chunks = chunk_vecs[job as usize].clone();
        threads.push(spawn(move || crunch(sync_input, sync_output, chunks)));
    }

    // Propagate the first worker error; panics propagate via unwrap.
    for t in threads {
        t.join().unwrap()?;
    }

    Ok(())
}
/// Worker: packs the given block ranges.  Recognised metadata blocks are
/// custom-packed into a zlib stream; the stream is flushed to the shared
/// output as a length-prefixed compressed chunk every 1024 blocks.
fn crunch<R, W>(
    input: Arc<Mutex<R>>,
    output: Arc<Mutex<W>>,
    ranges: Vec<(u64, u64)>,
) -> io::Result<()>
where
    R: Read + Seek,
    W: Write,
{
    // Blocks packed into `z` since the last flush.
    let mut written = 0u64;
    let mut z = ZlibEncoder::new(Vec::new(), Compression::default());
    for (lo, hi) in ranges {
        // We read multiple blocks at once to reduce contention
        // on input.
        let mut input = input.lock().unwrap();
        let big_data = read_blocks(input.deref_mut(), lo, hi - lo)?;
        // Release the input lock before the cpu-heavy packing below.
        drop(input);

        for b in lo..hi {
            let block_start = ((b - lo) * BLOCK_SIZE) as usize;
            let data = &big_data[block_start..(block_start + BLOCK_SIZE as usize)];
            let kind = metadata_block_type(data);
            // Unrecognised blocks are unused metadata space; skip them.
            if kind != BT::UNKNOWN {
                // Each packed block is preceded by its block number.
                z.write_u64::<LittleEndian>(b)?;
                pack_block(&mut z, kind, &data);

                written += 1;
                if written == 1024 {
                    // Emit a chunk: u64 length then compressed payload.
                    let compressed = z.reset(Vec::new())?;

                    let mut output = output.lock().unwrap();
                    output.write_u64::<LittleEndian>(compressed.len() as u64)?;
                    output.write_all(&compressed)?;
                    written = 0;
                }
            }
        }
    }

    // Flush any partial final chunk.
    if written > 0 {
        let compressed = z.finish()?;
        let mut output = output.lock().unwrap();
        output.write_u64::<LittleEndian>(compressed.len() as u64)?;
        output.write_all(&compressed)?;
    }

    Ok(())
}
fn write_header<W>(mut w: W, nr_blocks: u64) -> io::Result<()>
where
W: byteorder::WriteBytesExt,
{
w.write_u64::<LittleEndian>(MAGIC)?;
w.write_u64::<LittleEndian>(PACK_VERSION)?;
w.write_u64::<LittleEndian>(4096)?;
w.write_u64::<LittleEndian>(nr_blocks)?;
Ok(())
}
fn read_header<R>(mut r: R) -> io::Result<u64>
where
R: byteorder::ReadBytesExt,
{
let magic = r.read_u64::<LittleEndian>()?;
assert_eq!(magic, MAGIC);
let version = r.read_u64::<LittleEndian>()?;
assert_eq!(version, PACK_VERSION);
let block_size = r.read_u64::<LittleEndian>()?;
assert_eq!(block_size, 4096);
r.read_u64::<LittleEndian>()
}
/// Counts the whole 4k blocks in the file/device at `path`.
fn get_nr_blocks(path: &str) -> io::Result<u64> {
    let metadata = std::fs::metadata(path)?;
    // BLOCK_SIZE is u64 here; the old `as u64` cast was redundant.
    Ok(metadata.len() / BLOCK_SIZE)
}
// Reads `count` contiguous blocks starting at block `b`.
fn read_blocks<R>(rdr: &mut R, b: u64, count: u64) -> io::Result<Vec<u8>>
where
    R: io::Read + io::Seek,
{
    let mut data = vec![0u8; (BLOCK_SIZE * count) as usize];
    rdr.seek(io::SeekFrom::Start(b * BLOCK_SIZE))?;
    rdr.read_exact(&mut data)?;
    Ok(data)
}
// crc32c over everything after the 4-byte on-disk checksum field, then
// inverted -- presumably to match the kernel's metadata checksums; the
// per-type salt is applied by the caller.  TODO confirm against kernel.
fn checksum(buf: &[u8]) -> u32 {
    crc32c::crc32c(&buf[4..]) ^ 0xffffffff
}
// The four recognised thinp metadata block types, plus UNKNOWN for
// anything whose checksum doesn't match (e.g. unused blocks).
#[derive(PartialEq)]
enum BT {
    SUPERBLOCK,
    BTREE,
    INDEX,
    BITMAP,
    UNKNOWN,
}
/// Identifies a metadata block by xoring its computed checksum with the
/// stored one; the residue is the per-type salt.
fn metadata_block_type(buf: &[u8]) -> BT {
    if buf.len() != BLOCK_SIZE as usize {
        return BT::UNKNOWN;
    }

    // The checksum is always stored in the first u32 of the buffer.
    let mut rdr = Cursor::new(buf);
    let sum_on_disk = rdr.read_u32::<LittleEndian>().unwrap();

    // Match as an expression (the old per-arm `return`s were noise).
    match checksum(buf) ^ sum_on_disk {
        SUPERBLOCK_CSUM_XOR => BT::SUPERBLOCK,
        BTREE_CSUM_XOR => BT::BTREE,
        BITMAP_CSUM_XOR => BT::BITMAP,
        INDEX_CSUM_XOR => BT::INDEX,
        _ => BT::UNKNOWN,
    }
}
fn check<T>(r: &PResult<T>) {
match r {
Ok(_) => {
return;
}
Err(PackError::ParseError) => panic!("parse error"),
Err(PackError::IOError) => panic!("io error"),
}
}
/// Dispatches a block to the packer for its type.  Callers must have
/// already filtered out UNKNOWN blocks.
fn pack_block<W: Write>(w: &mut W, kind: BT, buf: &[u8]) {
    match kind {
        BT::SUPERBLOCK => check(&pack_superblock(w, buf)),
        BT::BTREE => check(&pack_btree_node(w, buf)),
        BT::INDEX => check(&pack_index(w, buf)),
        BT::BITMAP => check(&pack_bitmap(w, buf)),
        // was `assert!(false)`; unreachable! documents the invariant.
        BT::UNKNOWN => unreachable!("asked to pack an unknown block type"),
    }
}
// Writes one zeroed block at index `b`; unpack() uses this to size the
// output file before the workers fill it in.
fn write_zero_block<W>(w: &mut W, b: u64) -> io::Result<()>
where
    W: Write + Seek,
{
    w.seek(io::SeekFrom::Start(b * BLOCK_SIZE))?;
    w.write_all(&vec![0; BLOCK_SIZE as usize])?;
    Ok(())
}
// Drains `blocks`, writing each (block number, data) pair at its final
// location in the shared output while holding the lock once.
fn write_blocks<W>(w: &Arc<Mutex<W>>, blocks: &mut Vec<(u64, Vec<u8>)>) -> io::Result<()>
where
    W: Write + Seek,
{
    let mut w = w.lock().unwrap();
    while let Some((b, block)) = blocks.pop() {
        w.seek(io::SeekFrom::Start(b * BLOCK_SIZE))?;
        w.write_all(&block[0..])?;
    }
    Ok(())
}
// Unpack worker: receives compressed chunks, decompresses and expands
// each packed block, then writes batches of 32 blocks to the output.
fn decode_worker<W>(rx: Receiver<Vec<u8>>, w: Arc<Mutex<W>>) -> io::Result<()>
where
    W: Write + Seek,
{
    let mut blocks = Vec::new();

    while let Ok(bytes) = rx.recv() {
        let mut z = ZlibDecoder::new(&bytes[0..]);

        // Each packed block is a u64 block number followed by the vm
        // instruction stream; read until the chunk is exhausted.
        while let Ok(b) = z.read_u64::<LittleEndian>() {
            let block = crate::pack::vm::unpack(&mut z, BLOCK_SIZE as usize).unwrap();
            // This is the "verify" step: an expanded block must checksum
            // as a known metadata block type.
            assert!(metadata_block_type(&block[0..]) != BT::UNKNOWN);
            blocks.push((b, block));

            if blocks.len() >= 32 {
                write_blocks(&w, &mut blocks)?;
            }
        }
    }

    // Flush the final partial batch.
    write_blocks(&w, &mut blocks)?;

    Ok(())
}
/// Expands the pack file at `input_file` back into raw metadata at
/// `output_file`, distributing compressed chunks round-robin across one
/// decode worker per cpu.
pub fn unpack(input_file: &str, output_file: &str) -> Result<(), Box<dyn Error>> {
    let mut input = OpenOptions::new()
        .read(true)
        .write(false)
        .open(input_file)?;

    let mut output = OpenOptions::new()
        .read(false)
        .write(true)
        .create(true)
        .truncate(true)
        .open(output_file)?;

    let nr_blocks = read_header(&input)?;

    // zero the last block to size the file
    write_zero_block(&mut output, nr_blocks - 1)?;

    // Run until we hit the end

    let output = Arc::new(Mutex::new(output));

    // kick off the workers
    let nr_jobs = num_cpus::get();
    let mut senders = Vec::new();
    let mut threads = Vec::new();

    for _ in 0..nr_jobs {
        // Bounded channel so the reader can't run far ahead of a worker.
        let (tx, rx) = sync_channel(1);
        let output = Arc::clone(&output);
        senders.push(tx);
        threads.push(spawn(move || decode_worker(rx, output)));
    }

    // Read z compressed chunk, and hand to worker thread.
    let mut next_worker = 0;
    while let Ok(len) = input.read_u64::<LittleEndian>() {
        let mut bytes = vec![0; len as usize];
        input.read_exact(&mut bytes)?;
        senders[next_worker].send(bytes).unwrap();
        next_worker = (next_worker + 1) % nr_jobs;
    }

    // Dropping the senders closes the channels, letting workers drain out.
    for s in senders {
        drop(s);
    }

    for t in threads {
        t.join().unwrap()?;
    }

    Ok(())
}

490
src/pack/vm.rs Normal file
View File

@ -0,0 +1,490 @@
use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
use std::io;
use std::io::{Cursor, Read, Write};
use num_derive::FromPrimitive;
use num_traits::FromPrimitive;
use crate::pack::delta_list::*;
//-------------------------------------------------
// Deltas are converted to instructions. A delta may not fit
// into a single instruction.
// Instruction opcodes; stored in the high nibble of the tag byte, with a
// small operand in the low nibble.
#[derive(Debug, FromPrimitive)]
enum Tag {
    Set, // Operand width given in nibble
    Pos, // Delta in nibble
    PosW, // Delta in operand, whose width is in nibble
    Neg, // Delta in nibble
    NegW, // Delta in operand, whose width is in nibble
    Const, // Count in nibble
    Const8, // count = (nibble << 8) | byte

    // Controls how many times the next instruction is applied.
    // Not applicable to Const instructions which hold their own count.
    Count, // count stored in nibble
    Count8, // count = (nibble << 8) | byte

    Lit, // len in nibble
    LitW, // len in operand, whose width is in nibble

    // A run of u64s split at bit 24 into two separately packed streams;
    // len = (nibble << 8) | byte.
    ShiftedRun,
}
// Emits a tag byte: instruction in the high nibble, operand in the low.
fn pack_tag<W: Write>(w: &mut W, t: Tag, nibble: u8) -> io::Result<()> {
    assert!(nibble < 16);
    let op = t as u8;
    assert!(op < 16);
    w.write_u8((op << 4) | nibble)
}
/// Emits a repeat count for the next instruction.  A count of 1 is the
/// implicit default, so nothing is written for it.
fn pack_count<W>(w: &mut W, count: u64) -> io::Result<()>
where
    W: Write,
{
    if count == 1 {
        Ok(())
    } else if count < 16 {
        pack_tag(w, Tag::Count, count as u8)
    } else {
        // Count8: high nibble in the tag, low byte follows.  The format
        // can't represent counts >= 4096 (pack_tag asserts the nibble).
        assert!(count < 4096);
        pack_tag(w, Tag::Count8, (count >> 8) as u8)?;
        w.write_u8((count & 0xff) as u8)
    }
}
// Encodes one Delta as instructions, choosing the narrowest operand
// encoding that fits the value.
fn pack_delta<W: Write>(w: &mut W, d: &Delta) -> io::Result<()> {
    use Tag::*;
    match d {
        Delta::Base { n } => {
            if *n <= std::u8::MAX as u64 {
                pack_tag(w, Set, 1)?;
                return w.write_u8(*n as u8);
            } else if *n <= std::u16::MAX as u64 {
                pack_tag(w, Set, 2)?;
                return w.write_u16::<LittleEndian>(*n as u16);
            } else if *n <= u32::MAX as u64 {
                pack_tag(w, Set, 4)?;
                return w.write_u32::<LittleEndian>(*n as u32);
            } else {
                pack_tag(w, Set, 8)?;
                return w.write_u64::<LittleEndian>(*n);
            }
        }
        Delta::Pos { delta, count } => {
            // The repeat count precedes the instruction it modifies.
            pack_count(w, *count)?;
            if *delta < 16 {
                // Small deltas fit in the tag nibble itself.
                return pack_tag(w, Tag::Pos, *delta as u8);
            } else if *delta <= u8::MAX as u64 {
                pack_tag(w, PosW, 1)?;
                return w.write_u8(*delta as u8);
            } else if *delta <= u16::MAX as u64 {
                pack_tag(w, PosW, 2)?;
                return w.write_u16::<LittleEndian>(*delta as u16);
            } else if *delta <= u32::MAX as u64 {
                pack_tag(w, PosW, 4)?;
                return w.write_u32::<LittleEndian>(*delta as u32);
            } else {
                pack_tag(w, PosW, 8)?;
                return w.write_u64::<LittleEndian>(*delta as u64);
            }
        }
        Delta::Neg { delta, count } => {
            // Mirror image of the Pos case.
            pack_count(w, *count)?;
            if *delta < 16 {
                return pack_tag(w, Neg, *delta as u8);
            } else if *delta <= u8::MAX as u64 {
                pack_tag(w, NegW, 1)?;
                return w.write_u8(*delta as u8);
            } else if *delta <= u16::MAX as u64 {
                pack_tag(w, NegW, 2)?;
                return w.write_u16::<LittleEndian>(*delta as u16);
            } else if *delta <= u32::MAX as u64 {
                pack_tag(w, NegW, 4)?;
                return w.write_u32::<LittleEndian>(*delta as u32);
            } else {
                pack_tag(w, NegW, 8)?;
                return w.write_u64::<LittleEndian>(*delta as u64);
            }
        }
        Delta::Const { count } => {
            // Const carries its own count; no preceding Count instruction.
            if *count < 16 {
                return pack_tag(w, Tag::Const, *count as u8);
            } else {
                assert!(*count < 4096);
                let nibble = *count >> 8;
                assert!(nibble < 16);
                pack_tag(w, Tag::Const8, nibble as u8)?;
                return w.write_u8((*count & 0xff) as u8);
            }
        }
    }
}
// Encodes each delta in sequence, stopping at the first error.
fn pack_deltas<W: Write>(w: &mut W, ds: &[Delta]) -> io::Result<()> {
    ds.iter().try_for_each(|d| pack_delta(w, d))
}
//-------------------------------------------------
/// Delta-encodes a run of u64s and emits the resulting instructions.
pub fn pack_u64s<W: Write>(w: &mut W, ns: &[u64]) -> io::Result<()> {
    pack_deltas(w, &to_delta(ns))
}
// Splits every value into its high part (n >> shift) and low part
// (n & mask), returning the two streams separately.
fn unshift_nrs(shift: usize, ns: &[u64]) -> (Vec<u64>, Vec<u64>) {
    let mask = (1u64 << shift) - 1;
    ns.iter().map(|n| (n >> shift, n & mask)).unzip()
}
/// Used for btree leaf values: splits each u64 at bit 24 and delta-packs
/// the high and low streams separately, which compresses much better than
/// packing the raw values.  The length encoding caps a run at 4095 values.
pub fn pack_shifted_u64s<W: Write>(w: &mut W, ns: &[u64]) -> io::Result<()> {
    let len = ns.len();
    let nibble = len >> 8;
    assert!(nibble < 16);

    // ShiftedRun header: high nibble in the tag, low byte follows.
    pack_tag(w, Tag::ShiftedRun, nibble as u8)?;
    w.write_u8((len & 0xff) as u8)?;

    let (high, low) = unshift_nrs(24, ns);
    pack_u64s(w, &high[0..])?;
    pack_u64s(w, &low[0..])
}
/// Emits a literal byte run: a Lit/LitW tag encoding the length (using
/// the narrowest width that fits), followed by the raw bytes.
pub fn pack_literal<W: Write>(w: &mut W, bs: &[u8]) -> io::Result<()> {
    use Tag::LitW;

    let len = bs.len() as u64;
    if len < 16 {
        // Short lengths fit in the tag nibble.
        pack_tag(w, Tag::Lit, len as u8)?;
    } else if len <= u8::MAX as u64 {
        pack_tag(w, LitW, 1)?;
        w.write_u8(len as u8)?;
    } else if len <= u16::MAX as u64 {
        pack_tag(w, LitW, 2)?;
        w.write_u16::<LittleEndian>(len as u16)?;
    } else if len <= u32::MAX as u64 {
        pack_tag(w, LitW, 4)?;
        w.write_u32::<LittleEndian>(len as u32)?;
    } else {
        pack_tag(w, LitW, 8)?;
        // len is already u64; the old `as u64` cast was redundant.
        w.write_u64::<LittleEndian>(len)?;
    }

    w.write_all(bs)
}
//-------------------------------------------------
/// Reads an operand of 1/2/4/8 bytes as given by the tag nibble.  The
/// width comes from untrusted pack data, so an invalid value is reported
/// as an `InvalidData` error rather than the old panic.
fn unpack_with_width<R: Read>(r: &mut R, nibble: u8) -> io::Result<u64> {
    let v = match nibble {
        1 => r.read_u8()? as u64,
        2 => r.read_u16::<LittleEndian>()? as u64,
        4 => r.read_u32::<LittleEndian>()? as u64,
        8 => r.read_u64::<LittleEndian>()?,
        _ => {
            return Err(io::Error::new(
                io::ErrorKind::InvalidData,
                "bad operand width in packed stream",
            ));
        }
    };
    Ok(v)
}
// Reads `count` raw little-endian u64s.
fn unpack_u64s<R: Read>(r: &mut R, count: usize) -> io::Result<Vec<u64>> {
    let mut v = Vec::with_capacity(count);
    for _ in 0..count {
        v.push(r.read_u64::<LittleEndian>()?);
    }
    Ok(v)
}
// Interpreter for the packed instruction stream.  `base` is the single
// value register; `bytes_written` tracks output so exec() knows when a
// block is complete.
struct VM {
    base: u64,
    bytes_written: usize,
}

impl VM {
    fn new() -> VM {
        VM {
            base: 0,
            bytes_written: 0,
        }
    }

    // Emits one u64 and accounts for its 8 bytes.
    fn emit_u64<W: Write>(&mut self, w: &mut W, n: u64) -> io::Result<()> {
        w.write_u64::<LittleEndian>(n)?;
        self.bytes_written += 8;
        Ok(())
    }

    fn emit_base<W: Write>(&mut self, w: &mut W) -> io::Result<()> {
        self.emit_u64(w, self.base)
    }

    fn emit_bytes<W: Write>(&mut self, w: &mut W, bytes: &[u8]) -> io::Result<()> {
        let len = bytes.len();
        w.write_all(bytes)?;
        self.bytes_written += len;
        Ok(())
    }

    // Decodes and executes a single instruction.  `count` is the repeat
    // count set by a preceding Count/Count8 instruction (1 by default).
    fn unpack_instr<R: Read, W: Write>(
        &mut self,
        r: &mut R,
        w: &mut W,
        count: usize,
    ) -> io::Result<()> {
        use Tag::*;

        // Tag byte: opcode in the high nibble, operand in the low.
        let b = r.read_u8()?;
        let kind: Tag = match Tag::from_u8(b >> 4) {
            Some(k) => k,
            None => {
                panic!("bad tag");
            }
        };
        let nibble = b & 0xf;

        match kind {
            Set => {
                // Load base from an operand of 1/2/4/8 bytes.
                self.base = unpack_with_width(r, nibble)?;
                for _ in 0..count {
                    self.emit_base(w)?;
                }
            }
            Pos => {
                // Repeatedly advance base by the nibble delta, emitting.
                for _ in 0..count {
                    self.base += nibble as u64;
                    self.emit_base(w)?;
                }
            }
            PosW => {
                let delta = unpack_with_width(r, nibble)?;
                for _ in 0..count {
                    self.base += delta;
                    self.emit_base(w)?;
                }
            }
            Neg => {
                for _ in 0..count {
                    self.base -= nibble as u64;
                    self.emit_base(w)?;
                }
            }
            NegW => {
                let delta = unpack_with_width(r, nibble)?;
                for _ in 0..count {
                    self.base -= delta;
                    self.emit_base(w)?;
                }
            }
            Const => {
                // Const carries its own count; a Count prefix is invalid.
                assert_eq!(count, 1);
                for _ in 0..nibble as usize {
                    self.emit_base(w)?;
                }
            }
            Const8 => {
                assert_eq!(count, 1);
                let count = ((nibble as usize) << 8) | (r.read_u8()? as usize);
                for _ in 0..count {
                    self.emit_base(w)?;
                }
            }
            Count => {
                // Apply the *next* instruction `nibble` times.
                self.unpack_instr(r, w, nibble as usize)?;
            }
            Count8 => {
                let count = ((nibble as usize) << 8) | (r.read_u8()? as usize);
                self.unpack_instr(r, w, count as usize)?;
            }
            Lit => {
                assert_eq!(count, 1);
                let len = nibble as usize;
                let mut bytes = vec![0; len];
                r.read_exact(&mut bytes[0..])?;
                self.emit_bytes(w, &bytes)?;
            }
            LitW => {
                assert_eq!(count, 1);
                let len = unpack_with_width(r, nibble)? as usize;
                let mut bytes = vec![0; len];
                r.read_exact(&mut bytes[0..])?;
                self.emit_bytes(w, &bytes)?;
            }
            ShiftedRun => {
                // FIXME: repeated unpack, pack, unpack
                let len = ((nibble as usize) << 8) | (r.read_u8()? as usize);
                let nr_bytes = (len as usize) * std::mem::size_of::<u64>() as usize;

                // Expand the high stream into a scratch buffer...
                let mut high_bytes: Vec<u8> = Vec::with_capacity(nr_bytes);
                let written = self.exec(r, &mut high_bytes, nr_bytes)?;
                // ...and undo its effect on the byte count, since scratch
                // output isn't part of the real block.
                self.bytes_written -= written; // hack
                let mut high_r = Cursor::new(high_bytes);
                let high = unpack_u64s(&mut high_r, len)?;

                // Same for the low stream.
                let mut low_bytes: Vec<u8> = Vec::with_capacity(nr_bytes);
                let written = self.exec(r, &mut low_bytes, nr_bytes)?;
                self.bytes_written -= written; // hack
                let mut low_r = Cursor::new(low_bytes);
                let low = unpack_u64s(&mut low_r, len)?;

                // Recombine: value = (high << 24) | (low & mask).
                let mask = (1 << 24) - 1;
                for i in 0..len {
                    self.emit_u64(w, (high[i] << 24) | (low[i] & mask))?;
                }
            }
        }

        Ok(())
    }

    // Runs until at least a number of bytes have been emitted. Returns nr emitted.
    fn exec<R: Read, W: Write>(
        &mut self,
        r: &mut R,
        w: &mut W,
        emit_bytes: usize,
    ) -> io::Result<usize> {
        let begin = self.bytes_written;
        while (self.bytes_written - begin) < emit_bytes {
            self.unpack_instr(r, w, 1)?;
        }
        Ok(self.bytes_written - begin)
    }
}
/// Expands `count` bytes of packed instruction stream from `r` into a
/// freshly-allocated buffer.
pub fn unpack<R: Read>(r: &mut R, count: usize) -> io::Result<Vec<u8>> {
    let mut bytes = Vec::with_capacity(4096);
    let written = {
        let mut cursor = Cursor::new(&mut bytes);
        VM::new().exec(r, &mut cursor, count)?
    };

    // Both the VM's accounting and the buffer must agree with the caller.
    assert_eq!(written, count);
    assert_eq!(bytes.len(), count);
    Ok(bytes)
}
//-------------------------------------------------
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_pack_literals() {
        // Round trip: pack_literal then unpack must reproduce the bytes.
        struct TestCase(Vec<u8>);

        let cases = [
            // This is a bad test case, because unpack will not exec
            // any instructions.
            TestCase(b"".to_vec()),
            TestCase(b"foo".to_vec()),
            TestCase(vec![42; 15]),
            TestCase(vec![42; 256]),
            TestCase(vec![42; 4096]),
        ];

        for t in &cases {
            let mut bs = Vec::with_capacity(4096);
            let mut w = Cursor::new(&mut bs);
            pack_literal(&mut w, &t.0[0..]).unwrap();

            let mut r = Cursor::new(&mut bs);
            let unpacked = unpack(&mut r, t.0.len()).unwrap();
            assert_eq!(&t.0[0..], &unpacked[0..]);
        }
    }

    // Serialises `ns` as raw le u64s and compares with `bytes`.
    fn check_u64s_match(ns: &Vec<u64>, bytes: &[u8]) -> bool {
        let mut packed = Vec::with_capacity(ns.len() * 8);
        let mut w = Cursor::new(&mut packed);
        for n in ns {
            w.write_u64::<LittleEndian>(*n).unwrap();
        }
        packed == bytes
    }

    // Round trip through pack_u64s/unpack.
    fn check_pack_u64s(ns: &Vec<u64>) -> bool {
        println!("packing {:?}", &ns);
        let mut bs = Vec::with_capacity(4096);
        let mut w = Cursor::new(&mut bs);
        pack_u64s(&mut w, &ns[0..]).unwrap();
        println!("unpacked len = {}, packed len = {}", ns.len() * 8, bs.len());

        let mut r = Cursor::new(&mut bs);
        let unpacked = unpack(&mut r, ns.len() * 8).unwrap();

        check_u64s_match(&ns, &unpacked[0..])
    }

    #[test]
    fn test_pack_u64s() {
        let cases = [
            vec![0],
            vec![1, 5, 9, 10],
            b"the quick brown fox jumps over the lazy dog"
                .iter()
                .map(|b| *b as u64)
                .collect(),
        ];

        for t in &cases {
            assert!(check_pack_u64s(&t));
        }
    }

    // Property test: any non-empty vec of u64s must round trip.
    #[quickcheck]
    fn prop_pack_u64s(mut ns: Vec<u64>) -> bool {
        ns.push(42); // We don't handle empty vecs
        check_pack_u64s(&ns)
    }

    // Round trip through pack_shifted_u64s (values split at bit 24).
    fn check_pack_shifted_u64s(ns: &Vec<(u64, u64)>) -> bool {
        let shifted: Vec<u64> = ns
            .iter()
            .map(|(h, l)| (h << 24) | (l & ((1 << 24) - 1)))
            .collect();

        println!("packing {:?}", &ns);
        let mut bs = Vec::with_capacity(4096);
        let mut w = Cursor::new(&mut bs);
        pack_shifted_u64s(&mut w, &shifted[0..]).unwrap();
        println!("unpacked len = {}, packed len = {}", ns.len() * 8, bs.len());

        let mut r = Cursor::new(&mut bs);
        let unpacked = unpack(&mut r, ns.len() * 8).unwrap();

        check_u64s_match(&shifted, &unpacked[0..])
    }

    #[quickcheck]
    fn prop_pack_shifted_u64s(mut ns: Vec<(u64, u64)>) -> bool {
        ns.push((42, 42));
        check_pack_shifted_u64s(&ns)
    }
}
//-------------------------------------------------