From b32908d5c200a616c7a4fdca1d4147bec5f78f8f Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Fri, 25 Jul 2014 10:35:04 +0100 Subject: [PATCH] work in progress --- block-cache/block_cache.cc | 1095 ++++++++++----------- block-cache/block_cache.h | 185 +++- {persistent-data => block-cache}/buffer.h | 0 caching/superblock.cc | 4 +- era/superblock.cc | 4 +- persistent-data/block.h | 14 +- persistent-data/block.tcc | 54 +- persistent-data/data-structures/array.h | 4 +- persistent-data/data-structures/btree.tcc | 4 +- persistent-data/space-maps/disk.cc | 12 +- thin-provisioning/superblock.cc | 4 +- 11 files changed, 717 insertions(+), 663 deletions(-) rename {persistent-data => block-cache}/buffer.h (100%) diff --git a/block-cache/block_cache.cc b/block-cache/block_cache.cc index 0cca970..6127a23 100644 --- a/block-cache/block_cache.cc +++ b/block-cache/block_cache.cc @@ -1,7 +1,5 @@ #include "block-cache/block_cache.h" -#include "block-cache/list.h" - #include #include #include @@ -11,6 +9,10 @@ #include #include +#include + +//---------------------------------------------------------------- + // FIXME: get from linux headers #define SECTOR_SHIFT 9 #define PAGE_SIZE 4096 @@ -19,648 +21,581 @@ #define WRITEBACK_LOW_THRESHOLD_PERCENT 33 #define WRITEBACK_HIGH_THRESHOLD_PERCENT 66 -/*---------------------------------------------------------------- - * Structures - *--------------------------------------------------------------*/ -struct block_cache; +//---------------------------------------------------------------- -enum block_flags { - IO_PENDING = (1 << 0), - DIRTY = (1 << 1) -}; +namespace { + // FIXME: remove -struct block { - struct list_head list; - struct list_head hash_list; + /*---------------------------------------------------------------- + * Logging + *--------------------------------------------------------------*/ + void info(const char *format, ...) + { + va_list ap; - struct block_cache *bc; - unsigned ref_count; - - int error; - unsigned flags; - - struct iocb control_block; - - struct bc_block b; -}; - -struct block_cache { - int fd; - sector_t block_size; - uint64_t nr_data_blocks; - uint64_t nr_cache_blocks; - - void *blocks_memory; - void *blocks_data; - - io_context_t aio_context; - struct io_event *events; - - /* - * Blocks on the free list are not initialised, apart from the - * b.data field. - */ - struct list_head free; - struct list_head errored; - struct list_head dirty; - struct list_head clean; - - unsigned nr_io_pending; - struct list_head io_pending; - - unsigned nr_dirty; - - /* - * Hash table fields. - */ - unsigned nr_buckets; - unsigned mask; - struct list_head buckets[0]; -}; - -/*---------------------------------------------------------------- - * Logging - *--------------------------------------------------------------*/ -static void info(struct block_cache *bc, const char *format, ...) - __attribute__ ((format (printf, 2, 3))); - -static void info(struct block_cache *bc, const char *format, ...) 
-{ - va_list ap; - - va_start(ap, format); - vfprintf(stderr, format, ap); - va_end(ap); -} - -/*---------------------------------------------------------------- - * Allocation - *--------------------------------------------------------------*/ -static void *alloc_aligned(size_t len, size_t alignment) -{ - void *result = NULL; - int r = posix_memalign(&result, alignment, len); - if (r) - return NULL; - - return result; -} - -static int init_free_list(struct block_cache *bc, unsigned count) -{ - size_t len; - struct block *blocks; - size_t block_size = bc->block_size << SECTOR_SHIFT; - void *data; - unsigned i; - - /* Allocate the block structures */ - len = sizeof(struct block) * count; - blocks = static_cast(malloc(len)); - if (!blocks) - return -ENOMEM; - - bc->blocks_memory = blocks; - - /* Allocate the data for each block. We page align the data. */ - data = alloc_aligned(count * block_size, PAGE_SIZE); - if (!data) { - free(blocks); - return -ENOMEM; + va_start(ap, format); + vfprintf(stderr, format, ap); + va_end(ap); } - bc->blocks_data = data; + void *alloc_aligned(size_t len, size_t alignment) + { + void *result = NULL; + int r = posix_memalign(&result, alignment, len); + if (r) + return NULL; - for (i = 0; i < count; i++) { - struct block *b = blocks + i; - INIT_LIST_HEAD(&b->list); - b->b.data = data + block_size * i; - - list_add(&b->list, &bc->free); + return result; } - - return 0; } -static struct block *__alloc_block(struct block_cache *bc) -{ - struct block *b; +//---------------------------------------------------------------- - if (list_empty(&bc->free)) - return NULL; +namespace bcache { + int + block_cache::init_free_list(unsigned count) + { + size_t len; + block *blocks; + size_t block_size = block_size_ << SECTOR_SHIFT; + void *data; + unsigned i; - b = list_first_entry(&bc->free, struct block, list); - list_del(&b->list); + /* Allocate the block structures */ + len = sizeof(block) * count; + blocks = static_cast(malloc(len)); + if (!blocks) + return -ENOMEM; - return b; -} + blocks_memory_.reset(reinterpret_cast(blocks)); -/*---------------------------------------------------------------- - * Flags handling - *--------------------------------------------------------------*/ -static unsigned test_flags(struct block *b, unsigned flags) -{ - return b->flags & flags; -} - -static void clear_flags(struct block *b, unsigned flags) -{ - b->flags &= ~flags; -} - -static void set_flags(struct block *b, unsigned flags) -{ - b->flags |= flags; -} - -/*---------------------------------------------------------------- - * Low level IO handling - * - * We cannot have two concurrent writes on the same block. - * eg, background writeback, put with dirty, flush? - * - * To avoid this we introduce some restrictions: - * - * i) A held block can never be written back. - * ii) You cannot get a block until writeback has completed. - * - *--------------------------------------------------------------*/ - -/* - * This can be called from the context of the aio thread. So we have a - * separate 'top half' complete function that we know is only called by the - * main cache thread. - */ -static void complete_io(struct block *b, int result) -{ - b->error = result; - clear_flags(b, IO_PENDING); - b->bc->nr_io_pending--; - - if (b->error) - list_move_tail(&b->list, &b->bc->errored); - else { - if (test_flags(b, DIRTY)) { - clear_flags(b, DIRTY); - b->bc->nr_dirty--; + /* Allocate the data for each block. We page align the data. 
*/ + data = alloc_aligned(count * block_size, PAGE_SIZE); + if (!data) { + free(blocks); + return -ENOMEM; } - list_move_tail(&b->list, &b->bc->clean); - } -} + blocks_data_.reset(reinterpret_cast(data)); -/* - * |b->list| should be valid (either pointing to itself, on one of the other - * lists. - */ -static int issue_low_level(struct block *b, enum io_iocb_cmd opcode, const char *desc) -{ - int r; - struct block_cache *bc = b->bc; - struct iocb *control_blocks[1]; + for (i = 0; i < count; i++) { + block *b = blocks + i; + INIT_LIST_HEAD(&b->list_); + b->data_ = data + block_size * i; - assert(!test_flags(b, IO_PENDING)); - set_flags(b, IO_PENDING); - bc->nr_io_pending++; - list_move_tail(&b->list, &bc->io_pending); + list_add(&b->list_, &free_); + } - b->control_block.aio_lio_opcode = opcode; - control_blocks[0] = &b->control_block; - r = io_submit(bc->aio_context, 1, control_blocks); - if (r != 1) { - if (r < 0) { - perror("io_submit error"); - info(bc, "io_submit failed with %s op: %d\n", desc, r); - } else - info(bc, "could not submit IOs, with %s op\n", desc); - - complete_io(b, EIO); - return -EIO; + return 0; } - return 0; -} + block_cache::block * + block_cache::__alloc_block() + { + block *b; -static int issue_read(struct block *b) -{ - return issue_low_level(b, IO_CMD_PREAD, "read"); -} + if (list_empty(&free_)) + return NULL; -static int issue_write(struct block *b) -{ - return issue_low_level(b, IO_CMD_PWRITE, "write"); -} + b = list_first_entry(&free_, block, list_); + list_del(&b->list_); -static void wait_io(struct block_cache *bc) -{ - int r; - unsigned i; - - // FIXME: use a timeout to prevent hanging - r = io_getevents(bc->aio_context, 1, bc->nr_cache_blocks, bc->events, NULL); - if (r < 0) { - info(bc, "io_getevents failed %d\n", r); - exit(1); /* FIXME: handle more gracefully */ + return b; } - for (i = 0; i < static_cast(r); i++) { - struct io_event *e = bc->events + i; - struct block *b = container_of(e->obj, struct block, control_block); + /*---------------------------------------------------------------- + * Low level IO handling + * + * We cannot have two concurrent writes on the same block. + * eg, background writeback, put with dirty, flush? + * + * To avoid this we introduce some restrictions: + * + * i) A held block can never be written back. + * ii) You cannot get a block until writeback has completed. + * + *--------------------------------------------------------------*/ - if (e->res == bc->block_size << SECTOR_SHIFT) - complete_io(b, 0); - else if (e->res < 0) - complete_io(b, e->res); + /* + * This can be called from the context of the aio thread. So we have a + * separate 'top half' complete function that we know is only called by the + * main cache thread. + */ + void + block_cache::complete_io(block &b, int result) + { + b.error_ = result; + clear_flags(b, IO_PENDING); + nr_io_pending_--; + + if (b.error_) + list_move_tail(&b.list_, &errored_); else { - info(bc, "incomplete io, unexpected\n"); - } - } -} + if (test_flags(b, DIRTY)) { + clear_flags(b, DIRTY); + nr_dirty_--; + } -/*---------------------------------------------------------------- - * Clean/dirty list management - *--------------------------------------------------------------*/ - -/* - * We're using lru lists atm, but I think it would be worth - * experimenting with a multiqueue approach. - */ -static struct list_head *__categorise(struct block *b) -{ - if (b->error) - return &b->bc->errored; - - return (b->flags & DIRTY) ? 
&b->bc->dirty : &b->bc->clean; -} - -static void hit(struct block *b) -{ - list_move_tail(&b->list, __categorise(b)); -} - -/*---------------------------------------------------------------- - * High level IO handling - *--------------------------------------------------------------*/ -static void wait_all(struct block_cache *bc) -{ - while (!list_empty(&bc->io_pending)) - wait_io(bc); -} - -static void wait_specific(struct block *b) -{ - while (test_flags(b, IO_PENDING)) - wait_io(b->bc); -} - -static unsigned writeback(struct block_cache *bc, unsigned count) -{ - int r; - struct block *b, *tmp; - unsigned actual = 0; - - list_for_each_entry_safe (b, tmp, &bc->dirty, list) { - if (actual == count) - break; - - if (b->ref_count) - continue; - - r = issue_write(b); - if (!r) - actual++; - } - - info(bc, "writeback: requested %u, actual %u\n", count, actual); - return actual; -} - -/*---------------------------------------------------------------- - * Hash table - *---------------------------------------------------------------*/ - -/* - * |nr_buckets| must be a power of two. - */ -static void hash_init(struct block_cache *bc, unsigned nr_buckets) -{ - unsigned i; - - bc->nr_buckets = nr_buckets; - bc->mask = nr_buckets - 1; - - for (i = 0; i < nr_buckets; i++) - INIT_LIST_HEAD(bc->buckets + i); -} - -static unsigned hash(struct block_cache *bc, uint64_t index) -{ - const unsigned BIG_PRIME = 4294967291UL; - return (((unsigned) index) * BIG_PRIME) & bc->mask; -} - -static struct block *hash_lookup(struct block_cache *bc, block_index index) -{ - struct block *b; - unsigned bucket = hash(bc, index); - - list_for_each_entry (b, bc->buckets + bucket, hash_list) { - if (b->b.index == index) - return b; - } - - return NULL; -} - -static void hash_insert(struct block *b) -{ - unsigned bucket = hash(b->bc, b->b.index); - - list_move_tail(&b->hash_list, b->bc->buckets + bucket); -} - -static void hash_remove(struct block *b) -{ - list_del_init(&b->hash_list); -} - -/*---------------------------------------------------------------- - * High level allocation - *--------------------------------------------------------------*/ -static void setup_control_block(struct block *b) -{ - struct iocb *cb = &b->control_block; - size_t block_size_bytes = b->bc->block_size << SECTOR_SHIFT; - - memset(cb, 0, sizeof(*cb)); - cb->aio_fildes = b->bc->fd; - - cb->u.c.buf = b->b.data; - cb->u.c.offset = block_size_bytes * b->b.index; - cb->u.c.nbytes = block_size_bytes; -} - -static struct block *new_block(struct block_cache *bc, - block_index index) -{ - struct block *b; - - b = __alloc_block(bc); - if (!b) { - if (list_empty(&bc->clean)) { - if (list_empty(&bc->io_pending)) - writeback(bc, 9000); - wait_io(bc); - } - - if (!list_empty(&bc->clean)) { - b = list_first_entry(&bc->clean, struct block, list); - hash_remove(b); - list_del(&b->list); + list_move_tail(&b.list_, &clean_); } } - if (b) { - INIT_LIST_HEAD(&b->list); - INIT_LIST_HEAD(&b->hash_list); - b->bc = bc; - b->ref_count = 0; + /* + * |b->list| should be valid (either pointing to itself, on one of the other + * lists. 
+ */ + int + block_cache::issue_low_level(block &b, enum io_iocb_cmd opcode, const char *desc) + { + int r; + iocb *control_blocks[1]; - b->error = 0; - clear_flags(b, IO_PENDING | DIRTY); + assert(!test_flags(b, IO_PENDING)); + set_flags(b, IO_PENDING); + nr_io_pending_++; + list_move_tail(&b.list_, &io_pending_); - b->b.index = index; - setup_control_block(b); + b.control_block_.aio_lio_opcode = opcode; + control_blocks[0] = &b.control_block_; + r = io_submit(aio_context_, 1, control_blocks); + if (r != 1) { + if (r < 0) { + perror("io_submit error"); + info("io_submit failed with %s op: %d\n", desc, r); + } else + info("could not submit IOs, with %s op\n", desc); - hash_insert(b); - } - - return b; -} - -/*---------------------------------------------------------------- - * Block reference counting - *--------------------------------------------------------------*/ -static void get_block(struct block *b) -{ - b->ref_count++; -} - -static void put_block(struct block *b) -{ - assert(b->ref_count); - b->ref_count--; -} - -static void mark_dirty(struct block *b) -{ - struct block_cache *bc = b->bc; - - if (!test_flags(b, DIRTY)) { - set_flags(b, DIRTY); - list_move_tail(&b->list, &b->bc->dirty); - bc->nr_dirty++; - } -} - -/*---------------------------------------------------------------- - * Public interface - *--------------------------------------------------------------*/ -unsigned calc_nr_cache_blocks(size_t mem, sector_t block_size) -{ - size_t space_per_block = (block_size << SECTOR_SHIFT) + sizeof(struct block); - unsigned r = mem / space_per_block; - - return (r < MIN_BLOCKS) ? MIN_BLOCKS : r; -} - -unsigned calc_nr_buckets(unsigned nr_blocks) -{ - unsigned r = 8; - unsigned n = nr_blocks / 4; - - if (n < 8) - n = 8; - - while (r < n) - r <<= 1; - - return r; -} - -void -block_cache_destroy(struct block_cache *bc) -{ - wait_all(bc); - - if (bc->aio_context) - io_destroy(bc->aio_context); - - if (bc->events) - free(bc->events); - - if (bc->blocks_memory) - free(bc->blocks_memory); - - if (bc->blocks_data) - free(bc->blocks_data); - - free(bc); -} - -struct block_cache * -block_cache_create(int fd, sector_t block_size, uint64_t on_disk_blocks, size_t mem) -{ - int r; - struct block_cache *bc; - unsigned nr_cache_blocks = calc_nr_cache_blocks(mem, block_size); - unsigned nr_buckets = calc_nr_buckets(nr_cache_blocks); - - bc = static_cast(malloc(sizeof(*bc) + sizeof(*bc->buckets) * nr_buckets)); - if (bc) { - memset(bc, 0, sizeof(*bc)); - - bc->fd = fd; - bc->block_size = block_size; - bc->nr_data_blocks = on_disk_blocks; - bc->nr_cache_blocks = nr_cache_blocks; - - bc->events = static_cast(malloc(sizeof(*bc->events) * nr_cache_blocks)); - if (!bc->events) { - info(bc, "couldn't allocate events array\n"); - goto bad; + complete_io(b, EIO); + return -EIO; } - bc->aio_context = 0; /* needed or io_setup will fail */ - r = io_setup(nr_cache_blocks, &bc->aio_context); + return 0; + } + + int + block_cache::issue_read(block &b) + { + return issue_low_level(b, IO_CMD_PREAD, "read"); + } + + int + block_cache::issue_write(block &b) + { + std::cerr << "issuing write for block " << b.index_ << "\n"; + return issue_low_level(b, IO_CMD_PWRITE, "write"); + } + + void + block_cache::wait_io() + { + int r; + unsigned i; + + // FIXME: use a timeout to prevent hanging + r = io_getevents(aio_context_, 1, nr_cache_blocks_, &events_[0], NULL); if (r < 0) { - info(bc, "io_setup failed: %d\n", r); - goto bad; + info("io_getevents failed %d\n", r); + exit(1); /* FIXME: handle more gracefully */ } - 
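
	/*
	 * The libaio round trip used throughout this cache, in miniature:
	 * io_setup() creates the context above, issue_low_level() hands
	 * filled-in iocbs to io_submit(), and wait_io() reaps completions
	 * with io_getevents().  A self-contained sketch of the same
	 * pattern (fd, buf, len and off are placeholders; with O_DIRECT,
	 * buf must be suitably aligned, e.g. from posix_memalign()):
	 *
	 *	io_context_t ctx = 0;			// must be zeroed before io_setup()
	 *	io_setup(64, &ctx);			// allow up to 64 in-flight ios
	 *
	 *	struct iocb cb, *cbs[1] = { &cb };
	 *	io_prep_pread(&cb, fd, buf, len, off);	// fills cb.u.c.{buf,nbytes,offset}
	 *	io_submit(ctx, 1, cbs);			// returns nr of iocbs accepted
	 *
	 *	struct io_event ev;
	 *	io_getevents(ctx, 1, 1, &ev, NULL);	// block until it completes
	 *	// ev.res is bytes transferred (or -errno); ev.obj points at cb
	 *	io_destroy(ctx);
	 */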
hash_init(bc, nr_buckets); - INIT_LIST_HEAD(&bc->free); - INIT_LIST_HEAD(&bc->errored); - INIT_LIST_HEAD(&bc->dirty); - INIT_LIST_HEAD(&bc->clean); - INIT_LIST_HEAD(&bc->io_pending); + for (i = 0; i < static_cast(r); i++) { + io_event const &e = events_[i]; + block *b = container_of(e.obj, block, control_block_); - r = init_free_list(bc, nr_cache_blocks); - if (r) { - info(bc, "couldn't allocate blocks: %d\n", r); - goto bad; + if (e.res == block_size_ << SECTOR_SHIFT) + complete_io(*b, 0); + + else if (e.res < 0) + complete_io(*b, e.res); + + else + info("incomplete io, unexpected: %d\n", r); } } - return bc; + /*---------------------------------------------------------------- + * Clean/dirty list management + *--------------------------------------------------------------*/ -bad: - block_cache_destroy(bc); - return NULL; -} + /* + * We're using lru lists atm, but I think it would be worth + * experimenting with a multiqueue approach. + */ + list_head * + block_cache::__categorise(block &b) + { + if (b.error_) + return &errored_; -uint64_t block_cache_get_nr_blocks(struct block_cache *bc) -{ - return bc->nr_data_blocks; -} + return (b.flags_ & DIRTY) ? &dirty_ : &clean_; + } -static void zero_block(struct block *b) -{ - memset(b->b.data, 0, b->bc->block_size << SECTOR_SHIFT); - mark_dirty(b); -} + void + block_cache::hit(block &b) + { + list_move_tail(&b.list_, __categorise(b)); + } -static struct block *lookup_or_read_block(struct block_cache *bc, block_index index, unsigned flags) -{ - struct block *b = hash_lookup(bc, index); + /*---------------------------------------------------------------- + * High level IO handling + *--------------------------------------------------------------*/ + void + block_cache::wait_all() + { + while (!list_empty(&io_pending_)) + wait_io(); + } - if (b) { - if (test_flags(b, IO_PENDING)) - wait_specific(b); + void + block_cache::wait_specific(block &b) + { + while (test_flags(b, IO_PENDING)) + wait_io(); + } - if (flags & GF_ZERO) - zero_block(b); + unsigned + block_cache::writeback(unsigned count) + { + int r; + block *b, *tmp; + unsigned actual = 0; - } else { - if (flags & GF_CAN_BLOCK) { - b = new_block(bc, index); - if (b) { - if (flags & GF_ZERO) - zero_block(b); - else { - issue_read(b); - wait_specific(b); + list_for_each_entry_safe (b, tmp, &dirty_, list_) { + if (actual == count) + break; + + if (b->ref_count_) + continue; + + r = issue_write(*b); + if (!r) + actual++; + } + + info("writeback: requested %u, actual %u\n", count, actual); + return actual; + } + + /*---------------------------------------------------------------- + * Hash table + *---------------------------------------------------------------*/ + + /* + * |nr_buckets| must be a power of two. 
+ */ + void + block_cache::hash_init(unsigned nr_buckets) + { + unsigned i; + + nr_buckets_ = nr_buckets; + mask_ = nr_buckets - 1; + + for (i = 0; i < nr_buckets; i++) + INIT_LIST_HEAD(&buckets_[i]); + } + + unsigned + block_cache::hash(uint64_t index) + { + const unsigned BIG_PRIME = 4294967291UL; + return (((unsigned) index) * BIG_PRIME) & mask_; + } + + block_cache::block * + block_cache::hash_lookup(block_index index) + { + block *b; + unsigned bucket = hash(index); + + list_for_each_entry (b, &buckets_[bucket], hash_list_) { + if (b->index_ == index) + return b; + } + + return NULL; + } + + void + block_cache::hash_insert(block &b) + { + unsigned bucket = hash(b.index_); + list_move_tail(&b.hash_list_, &buckets_[bucket]); + } + + void + block_cache::hash_remove(block &b) + { + list_del_init(&b.hash_list_); + } + + /*---------------------------------------------------------------- + * High level allocation + *--------------------------------------------------------------*/ + void + block_cache::setup_control_block(block &b) + { + iocb *cb = &b.control_block_; + size_t block_size_bytes = block_size_ << SECTOR_SHIFT; + + memset(cb, 0, sizeof(*cb)); + cb->aio_fildes = fd_; + + cb->u.c.buf = b.data_; + cb->u.c.offset = block_size_bytes * b.index_; + cb->u.c.nbytes = block_size_bytes; + } + + block_cache::block * + block_cache::new_block(block_index index) + { + block *b; + + b = __alloc_block(); + if (!b) { + if (list_empty(&clean_)) { + if (list_empty(&io_pending_)) + writeback(9000); + wait_io(); + } + + if (!list_empty(&clean_)) { + b = list_first_entry(&clean_, block, list_); + hash_remove(*b); + list_del(&b->list_); + } + } + + if (b) { + INIT_LIST_HEAD(&b->list_); + INIT_LIST_HEAD(&b->hash_list_); + b->bc_ = this; + b->ref_count_ = 0; + + b->error_ = 0; + clear_flags(*b, IO_PENDING | DIRTY); + + b->index_ = index; + setup_control_block(*b); + + hash_insert(*b); + } + + return b; + } + + /*---------------------------------------------------------------- + * Block reference counting + *--------------------------------------------------------------*/ + void + block_cache::mark_dirty(block &b) + { + if (!test_flags(b, DIRTY)) { + set_flags(b, DIRTY); + list_move_tail(&b.list_, &dirty_); + nr_dirty_++; + } + } + + unsigned + block_cache::calc_nr_cache_blocks(size_t mem, sector_t block_size) + { + size_t space_per_block = (block_size << SECTOR_SHIFT) + sizeof(block); + unsigned r = mem / space_per_block; + + return (r < MIN_BLOCKS) ? 
MIN_BLOCKS : r; + } + + unsigned + block_cache::calc_nr_buckets(unsigned nr_blocks) + { + unsigned r = 8; + unsigned n = nr_blocks / 4; + + if (n < 8) + n = 8; + + while (r < n) + r <<= 1; + + return r; + } + + block_cache::block_cache(int fd, sector_t block_size, uint64_t on_disk_blocks, size_t mem) + : nr_dirty_(0), + nr_io_pending_(0) + { + int r; + unsigned nr_cache_blocks = calc_nr_cache_blocks(mem, block_size); + unsigned nr_buckets = calc_nr_buckets(nr_cache_blocks); + + info("block_size = %llu, on_disk_blocks = %llu, mem = %llu, nr_cache_blocks = %llu\n", + (unsigned long long) block_size, + (unsigned long long) on_disk_blocks, + (unsigned long long) mem, + (unsigned long long) nr_cache_blocks); + + + buckets_.resize(nr_buckets); + + fd_ = fd; + block_size_ = block_size; + nr_data_blocks_ = on_disk_blocks; + nr_cache_blocks_ = nr_cache_blocks; + + events_.resize(nr_cache_blocks); + + aio_context_ = 0; /* needed or io_setup will fail */ + r = io_setup(nr_cache_blocks, &aio_context_); + if (r < 0) + throw std::runtime_error("io_setup failed"); + + hash_init(nr_buckets); + INIT_LIST_HEAD(&free_); + INIT_LIST_HEAD(&errored_); + INIT_LIST_HEAD(&dirty_); + INIT_LIST_HEAD(&clean_); + INIT_LIST_HEAD(&io_pending_); + + r = init_free_list(nr_cache_blocks); + if (r) + throw std::runtime_error("couldn't allocate blocks"); + } + + block_cache::~block_cache() + { + wait_all(); + + // FIXME: use unique_ptrs + if (aio_context_) + io_destroy(aio_context_); + } + + uint64_t + block_cache::get_nr_blocks() const + { + return nr_data_blocks_; + } + + void + block_cache::zero_block(block &b) + { + memset(b.data_, 0, block_size_ << SECTOR_SHIFT); + mark_dirty(b); + } + + block_cache::block * + block_cache::lookup_or_read_block(block_index index, unsigned flags) + { + block *b = hash_lookup(index); + + if (b) { + if (test_flags(*b, IO_PENDING)) + wait_specific(*b); + + if (flags & GF_ZERO) + zero_block(*b); + + } else { + if (flags & GF_CAN_BLOCK) { + b = new_block(index); + if (b) { + if (flags & GF_ZERO) + zero_block(*b); + else { + issue_read(*b); + wait_specific(*b); + } } } } + + return (!b || b->error_) ? NULL : b; } - return (!b || b->error) ? 
NULL : b; -} + block_cache::block & + block_cache::get(block_index index, unsigned flags) + { + block *b = lookup_or_read_block(index, flags); -struct bc_block * -block_cache_get(struct block_cache *bc, block_index index, unsigned flags) -{ - struct block *b = lookup_or_read_block(bc, index, flags); + if (b) { + hit(*b); + b->ref_count_++; - if (b) { - hit(b); - get_block(b); - - return &b->b; - } - - return NULL; -} - -void -block_cache_put(struct bc_block *bcb, unsigned flags) -{ - unsigned nr_available; - struct block *b = container_of(bcb, struct block, b); - struct block_cache *bc = b->bc; - - put_block(b); - - if (flags & PF_DIRTY) { - mark_dirty(b); - - nr_available = bc->nr_cache_blocks - (bc->nr_dirty - bc->nr_io_pending); - if (nr_available < (WRITEBACK_LOW_THRESHOLD_PERCENT * bc->nr_cache_blocks / 100)) - writeback(bc, (WRITEBACK_HIGH_THRESHOLD_PERCENT * bc->nr_cache_blocks / 100) - nr_available); - } -} - -int -block_cache_flush(struct block_cache *bc) -{ - struct block *b; - - list_for_each_entry (b, &bc->dirty, list) { - if (b->ref_count) { - info(bc, "attempt to lock an already locked block\n"); - return -EAGAIN; + return *b; } - issue_write(b); + throw std::runtime_error("couldn't get block"); } - wait_all(bc); + void + block_cache::put(block_cache::block &b, unsigned flags) + { + if (b.ref_count_ == 0) + throw std::runtime_error("bad put"); - return list_empty(&bc->errored) ? 0 : -EIO; -} + b.ref_count_--; -void -block_cache_prefetch(struct block_cache *bc, block_index index) -{ - struct block *b = hash_lookup(bc, index); + if (flags & PF_DIRTY) { + mark_dirty(b); - if (!b) { - b = new_block(bc, index); - if (b) - issue_read(b); + // FIXME: factor out + unsigned nr_available = nr_cache_blocks_ - (nr_dirty_ - nr_io_pending_); + if (nr_available < (WRITEBACK_LOW_THRESHOLD_PERCENT * nr_cache_blocks_ / 100)) + writeback((WRITEBACK_HIGH_THRESHOLD_PERCENT * nr_cache_blocks_ / 100) - nr_available); + } + } + + int + block_cache::flush() + { + block *b; + + list_for_each_entry (b, &dirty_, list_) { + if (b->ref_count_) { + info("attempt to lock an already locked block\n"); + return -EAGAIN; + } + + issue_write(*b); + } + + wait_all(); + + return list_empty(&errored_) ? 
0 : -EIO; + } + + void + block_cache::prefetch(block_index index) + { + block *b = hash_lookup(index); + + if (!b) { + b = new_block(index); + if (b) + issue_read(*b); + } + } + + //-------------------------------- + + unsigned + block_cache::test_flags(block &b, unsigned flags) + { + return b.flags_ & flags; + } + + void + block_cache::clear_flags(block &b, unsigned flags) + { + b.flags_ &= ~flags; + } + + void + block_cache::set_flags(block &b, unsigned flags) + { + b.flags_ |= flags; } } -/*----------------------------------------------------------------*/ - +//---------------------------------------------------------------- diff --git a/block-cache/block_cache.h b/block-cache/block_cache.h index 046083f..f59a1a0 100644 --- a/block-cache/block_cache.h +++ b/block-cache/block_cache.h @@ -1,54 +1,173 @@ #ifndef BLOCK_CACHE_H #define BLOCK_CACHE_H +#include "block-cache/buffer.h" +#include "block-cache/list.h" + +#include +#include + +#include +#include #include #include +#include -/*----------------------------------------------------------------*/ +//---------------------------------------------------------------- -/* FIXME: add logging */ +namespace bcache { +#if 0 + class validator { + public: + typedef boost::shared_ptr ptr; -/*----------------------------------------------------------------*/ + virtual ~validator() {} -/* - * This library is not thread-safe. - */ -typedef uint64_t block_index; + virtual void check(buffer const &b, block_address location) const = 0; + virtual void prepare(buffer &b, block_address location) const = 0; + }; -struct block_cache; + class noop_validator : public validator { + public: + void check(buffer const &b, block_address location) const {} + void prepare(buffer &b, block_address location) const {} + }; +#endif + //---------------------------------------------------------------- -struct bc_block { - block_index index; - void *data; -}; + // FIXME: throw exceptions rather than returning errors + class block_cache : private boost::noncopyable { + public: + enum block_flags { + IO_PENDING = (1 << 0), + DIRTY = (1 << 1) + }; -typedef uint64_t sector_t; + class block : private boost::noncopyable { + public: + uint64_t get_index() const { + return index_; + } -struct block_cache *block_cache_create(int fd, sector_t block_size, - uint64_t max_nr_blocks, size_t mem); -void block_cache_destroy(struct block_cache *bc); + void *get_data() const { + return data_; + } -uint64_t block_cache_get_nr_blocks(struct block_cache *bc); + private: + friend class block_cache; -enum get_flags { - GF_ZERO = (1 << 0), - GF_CAN_BLOCK = (1 << 1) -}; -struct bc_block *block_cache_get(struct block_cache *bc, block_index index, unsigned flags); + uint64_t index_; + void *data_; -enum put_flags { - PF_DIRTY = (1 << 0), -}; -void block_cache_put(struct bc_block *b, unsigned flags); + list_head list_; + list_head hash_list_; -/* - * Flush can fail if an earlier write failed. You do not know which block - * failed. Make sure you build your recovery with this in mind. 
- */ -int block_cache_flush(struct block_cache *bc); + block_cache *bc_; + unsigned ref_count_; -void block_cache_prefetch(struct block_cache *bc, block_index index); + int error_; + unsigned flags_; -/*----------------------------------------------------------------*/ + iocb control_block_; + }; + + typedef uint64_t block_index; + typedef uint64_t sector_t; + + //-------------------------------- + + block_cache(int fd, sector_t block_size, + uint64_t max_nr_blocks, size_t mem); + ~block_cache(); + + uint64_t get_nr_blocks() const; + + enum get_flags { + GF_ZERO = (1 << 0), + GF_CAN_BLOCK = (1 << 1) + }; + + // FIXME: what if !GF_CAN_BLOCK? + block_cache::block &get(block_index index, unsigned flags); + + enum put_flags { + PF_DIRTY = (1 << 0), + }; + + void put(block_cache::block &block, unsigned flags); + + /* + * Flush can fail if an earlier write failed. You do not know which block + * failed. Make sure you build your recovery with this in mind. + */ + int flush(); + void prefetch(block_index index); + + private: + int init_free_list(unsigned count); + block *__alloc_block(); + void complete_io(block &b, int result); + int issue_low_level(block &b, enum io_iocb_cmd opcode, const char *desc); + int issue_read(block &b); + int issue_write(block &b); + void wait_io(); + list_head *__categorise(block &b); + void hit(block &b); + void wait_all(); + void wait_specific(block &b); + unsigned writeback(unsigned count); + void hash_init(unsigned nr_buckets); + unsigned hash(uint64_t index); + block *hash_lookup(block_index index); + void hash_insert(block &b); + void hash_remove(block &b); + void setup_control_block(block &b); + block *new_block(block_index index); + void mark_dirty(block &b); + unsigned calc_nr_cache_blocks(size_t mem, sector_t block_size); + unsigned calc_nr_buckets(unsigned nr_blocks); + void zero_block(block &b); + block *lookup_or_read_block(block_index index, unsigned flags); + unsigned test_flags(block &b, unsigned flags); + void clear_flags(block &b, unsigned flags); + void set_flags(block &b, unsigned flags); + + //-------------------------------- + + int fd_; + sector_t block_size_; + uint64_t nr_data_blocks_; + uint64_t nr_cache_blocks_; + + std::auto_ptr blocks_memory_; // FIXME: change to a vector + std::auto_ptr blocks_data_; + + io_context_t aio_context_; + std::vector events_; + + /* + * Blocks on the free list are not initialised, apart from the + * b.data field. + */ + list_head free_; + list_head errored_; + list_head dirty_; + list_head clean_; + + unsigned nr_io_pending_; + struct list_head io_pending_; + + unsigned nr_dirty_; + + /* + * Hash table fields. 
+ */ + unsigned nr_buckets_; + unsigned mask_; + std::vector buckets_; + }; +} + +//---------------------------------------------------------------- #endif diff --git a/persistent-data/buffer.h b/block-cache/buffer.h similarity index 100% rename from persistent-data/buffer.h rename to block-cache/buffer.h diff --git a/caching/superblock.cc b/caching/superblock.cc index 93a8d60..8b34af3 100644 --- a/caching/superblock.cc +++ b/caching/superblock.cc @@ -277,7 +277,7 @@ namespace validator { struct sb_validator : public block_manager<>::validator { virtual void check(buffer<> const &b, block_address location) const { - superblock_disk const *sbd = reinterpret_cast(&b); + superblock_disk const *sbd = reinterpret_cast(b.raw()); crc32c sum(SUPERBLOCK_CSUM_SEED); sum.append(&sbd->flags, MD_BLOCK_SIZE - sizeof(uint32_t)); if (sum.get_sum() != to_cpu(sbd->csum)) @@ -285,7 +285,7 @@ namespace validator { } virtual void prepare(buffer<> &b, block_address location) const { - superblock_disk *sbd = reinterpret_cast(&b); + superblock_disk *sbd = reinterpret_cast(b.raw()); crc32c sum(SUPERBLOCK_CSUM_SEED); sum.append(&sbd->flags, MD_BLOCK_SIZE - sizeof(uint32_t)); sbd->csum = to_disk(sum.get_sum()); diff --git a/era/superblock.cc b/era/superblock.cc index c319e9b..9d0ae57 100644 --- a/era/superblock.cc +++ b/era/superblock.cc @@ -212,7 +212,7 @@ namespace era_validator { // FIXME: turn into a template, we have 3 similar classes now struct sb_validator : public block_manager<>::validator { virtual void check(buffer<> const &b, block_address location) const { - superblock_disk const *sbd = reinterpret_cast(&b); + superblock_disk const *sbd = reinterpret_cast(b.raw()); crc32c sum(SUPERBLOCK_CSUM_SEED); sum.append(&sbd->flags, MD_BLOCK_SIZE - sizeof(uint32_t)); if (sum.get_sum() != to_cpu(sbd->csum)) @@ -220,7 +220,7 @@ namespace era_validator { } virtual void prepare(buffer<> &b, block_address location) const { - superblock_disk *sbd = reinterpret_cast(&b); + superblock_disk *sbd = reinterpret_cast(b.raw()); crc32c sum(SUPERBLOCK_CSUM_SEED); sum.append(&sbd->flags, MD_BLOCK_SIZE - sizeof(uint32_t)); sbd->csum = to_disk(sum.get_sum()); diff --git a/persistent-data/block.h b/persistent-data/block.h index c514383..487a0e1 100644 --- a/persistent-data/block.h +++ b/persistent-data/block.h @@ -20,7 +20,7 @@ #define BLOCK_H #include "block-cache/block_cache.h" -#include "persistent-data/buffer.h" +#include "block-cache/buffer.h" #include #include @@ -35,6 +35,8 @@ //---------------------------------------------------------------- namespace persistent_data { + using namespace bcache; + uint32_t const MD_BLOCK_SIZE = 4096; @@ -77,10 +79,11 @@ namespace persistent_data { BT_NORMAL }; + // FIXME: eventually this will disappear to be replaced with block_cache::block struct block : private boost::noncopyable { typedef boost::shared_ptr ptr; - block(block_cache *bc, + block(block_cache &bc, block_address location, block_type bt, typename validator::ptr v, @@ -110,7 +113,8 @@ namespace persistent_data { private: void check_not_unlocked() const; - bc_block *internal_; + block_cache &bc_; + block_cache::block *internal_; typename validator::ptr validator_; block_type bt_; bool dirty_; @@ -196,7 +200,9 @@ namespace persistent_data { void write_block(typename block::ptr b) const; int fd_; - block_cache *bc_; + + // FIXME: the mutable is a fudge to allow flush() to be const, which I'm not sure is necc. 
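	// A mutable member may be modified from inside a const member
	// function, which is what would let a const flush() write through
	// bc_.  A minimal, hypothetical sketch of the rule (not part of
	// this cache):
	//
	//	struct counter {
	//		int get() const { ++reads_; return v_; }  // ok: reads_ is mutable
	//		int v_;
	//		mutable unsigned reads_;
	//	};
	//
	// Whether flush() needs to be const at all is the open question
	// flagged above.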
+ mutable block_cache bc_; }; // A little utility to help build validators diff --git a/persistent-data/block.tcc b/persistent-data/block.tcc index ab9a129..4c39c7a 100644 --- a/persistent-data/block.tcc +++ b/persistent-data/block.tcc @@ -40,7 +40,7 @@ namespace { unsigned const SECTOR_SHIFT = 9; // FIXME: these will slow it down until we start doing async io. - int const OPEN_FLAGS = O_DIRECT | O_SYNC; + int const OPEN_FLAGS = O_DIRECT; // FIXME: introduce a new exception for this, or at least lift this // to exception.h @@ -106,31 +106,27 @@ namespace { namespace persistent_data { template - block_manager::block::block(block_cache *bc, + block_manager::block::block(block_cache &bc, block_address location, block_type bt, typename validator::ptr v, bool zero) - : validator_(v), - bt_(bt), - dirty_(false), - unlocked_(false), - buffer_(0, true) // FIXME: we don't know if it's writeable here :( + : bc_(bc), + validator_(v), + bt_(bt), + dirty_(false), + unlocked_(false), + buffer_(0, true) // FIXME: we don't know if it's writeable here :( { if (zero) { - internal_ = block_cache_get(bc, location, GF_ZERO | GF_CAN_BLOCK); - if (!internal_) - throw std::runtime_error("Couldn't get block"); + internal_ = &bc.get(location, block_cache::GF_ZERO | block_cache::GF_CAN_BLOCK); dirty_ = true; + buffer_.set_data(internal_->get_data()); } else { - internal_ = block_cache_get(bc, location, GF_CAN_BLOCK); - if (!internal_) - throw std::runtime_error("Couldn't get block"); - - validator_->check(buffer_, internal_->index); + internal_ = &bc.get(location, block_cache::GF_CAN_BLOCK); + buffer_.set_data(internal_->get_data()); + validator_->check(buffer_, internal_->get_index()); } - - buffer_.set_data(internal_->data); } template @@ -144,8 +140,9 @@ namespace persistent_data { void block_manager::block::unlock() { - validator_->prepare(buffer_, internal_->index); - block_cache_put(internal_, dirty_ ? PF_DIRTY : 0); + if (dirty_) + validator_->prepare(buffer_, internal_->get_index()); + bc_.put(*internal_, dirty_ ? block_cache::PF_DIRTY : 0); unlocked_ = true; } @@ -161,7 +158,7 @@ namespace persistent_data { block_manager::block::get_location() const { check_not_unlocked(); - return internal_->index; + return internal_->get_index(); } template @@ -196,12 +193,12 @@ namespace persistent_data { if (dirty_) // It may have already happened, by calling // this we ensure we're consistent. 
- validator_->prepare(*internal_->data, internal_->index); + validator_->prepare(*internal_->get_data(), internal_->get_index()); validator_ = v; if (check) - validator_->check(*internal_->data, internal_->index); + validator_->check(*internal_->get_data(), internal_->get_index()); } } @@ -301,14 +298,9 @@ namespace persistent_data { block_address nr_blocks, unsigned max_concurrent_blocks, mode m) + : fd_(open_block_file(path, nr_blocks * BlockSize, m == READ_WRITE)), + bc_(fd_, BlockSize >> SECTOR_SHIFT, nr_blocks, 1024u * 1024u * 4) { - // Open the file descriptor - fd_ = open_block_file(path, nr_blocks * BlockSize, m == READ_WRITE); - - // Create the cache - bc_ = block_cache_create(fd_, BlockSize << SECTOR_SHIFT, nr_blocks, 1024u * BlockSize * 1.2); - if (!bc_) - throw std::runtime_error("couldn't create block cache"); } template @@ -360,7 +352,7 @@ namespace persistent_data { block_address block_manager::get_nr_blocks() const { - return block_cache_get_nr_blocks(bc_); + return bc_.get_nr_blocks(); } template @@ -374,7 +366,7 @@ namespace persistent_data { void block_manager::flush() const { - block_cache_flush(bc_); + bc_.flush(); } } diff --git a/persistent-data/data-structures/array.h b/persistent-data/data-structures/array.h index 6c76bf2..1d8fce6 100644 --- a/persistent-data/data-structures/array.h +++ b/persistent-data/data-structures/array.h @@ -33,7 +33,7 @@ namespace persistent_data { struct array_block_validator : public block_manager<>::validator { virtual void check(buffer<> const &b, block_address location) const { - array_block_disk const *data = reinterpret_cast(&b); + array_block_disk const *data = reinterpret_cast(b.raw()); crc32c sum(ARRAY_CSUM_XOR); sum.append(&data->max_entries, MD_BLOCK_SIZE - sizeof(uint32_t)); if (sum.get_sum() != to_cpu(data->csum)) @@ -44,7 +44,7 @@ namespace persistent_data { } virtual void prepare(buffer<> &b, block_address location) const { - array_block_disk *data = reinterpret_cast(&b); + array_block_disk *data = reinterpret_cast(b.raw()); data->blocknr = to_disk(location); crc32c sum(ARRAY_CSUM_XOR); diff --git a/persistent-data/data-structures/btree.tcc b/persistent-data/data-structures/btree.tcc index 1234aa0..a220334 100644 --- a/persistent-data/data-structures/btree.tcc +++ b/persistent-data/data-structures/btree.tcc @@ -34,7 +34,7 @@ namespace { struct btree_node_validator : public block_manager<>::validator { virtual void check(buffer<> const &b, block_address location) const { - disk_node const *data = reinterpret_cast(&b); + disk_node const *data = reinterpret_cast(b.raw()); node_header const *n = &data->header; crc32c sum(BTREE_CSUM_XOR); sum.append(&n->flags, MD_BLOCK_SIZE - sizeof(uint32_t)); @@ -46,7 +46,7 @@ namespace { } virtual void prepare(buffer<> &b, block_address location) const { - disk_node *data = reinterpret_cast(&b); + disk_node *data = reinterpret_cast(b.raw()); node_header *n = &data->header; n->blocknr = to_disk(location); diff --git a/persistent-data/space-maps/disk.cc b/persistent-data/space-maps/disk.cc index 0c851f6..f75f0a9 100644 --- a/persistent-data/space-maps/disk.cc +++ b/persistent-data/space-maps/disk.cc @@ -39,7 +39,7 @@ namespace { struct bitmap_block_validator : public block_manager<>::validator { virtual void check(buffer<> const &b, block_address location) const { - bitmap_header const *data = reinterpret_cast(&b); + bitmap_header const *data = reinterpret_cast(b.raw()); crc32c sum(BITMAP_CSUM_XOR); sum.append(&data->not_used, MD_BLOCK_SIZE - sizeof(uint32_t)); if (sum.get_sum() != 
to_cpu<uint32_t>(data->csum))
@@ -50,7 +50,7 @@ namespace {
 
 		virtual void prepare(buffer<> &b, block_address location) const {
-			bitmap_header *data = reinterpret_cast<bitmap_header *>(&b);
+			bitmap_header *data = reinterpret_cast<bitmap_header *>(b.raw());
 			data->blocknr = to_disk<base::le64>(location);
 
 			crc32c sum(BITMAP_CSUM_XOR);
@@ -66,7 +66,8 @@ namespace {
 	// FIXME: factor out the common code in these validators
 	struct index_block_validator : public block_manager<>::validator {
 		virtual void check(buffer<> const &b, block_address location) const {
-			metadata_index const *mi = reinterpret_cast<metadata_index const *>(&b);
+			metadata_index const *mi = reinterpret_cast<metadata_index const *>(b.raw());
+			std::cerr << "check mi = " << mi << "\n";
 			crc32c sum(INDEX_CSUM_XOR);
 			sum.append(&mi->padding_, MD_BLOCK_SIZE - sizeof(uint32_t));
 			if (sum.get_sum() != to_cpu<uint32_t>(mi->csum_))
@@ -77,7 +78,8 @@ namespace {
 
 		virtual void prepare(buffer<> &b, block_address location) const {
-			metadata_index *mi = reinterpret_cast<metadata_index *>(&b);
+			metadata_index *mi = reinterpret_cast<metadata_index *>(b.raw());
+			std::cerr << "prepare mi = " << mi << "\n";
 			mi->blocknr_ = to_disk<base::le64>(location);
 
 			crc32c sum(INDEX_CSUM_XOR);
@@ -630,7 +632,7 @@ namespace {
 			tm_->shadow(bitmap_root_, index_validator());
 		bitmap_root_ = p.first.get_location();
 
-		metadata_index *mdi = reinterpret_cast<metadata_index *>(&p.first.data());
+		metadata_index *mdi = reinterpret_cast<metadata_index *>(p.first.data().raw());
 
 		for (unsigned i = 0; i < entries_.size(); i++)
 			index_entry_traits::pack(entries_[i], mdi->index[i]);
diff --git a/thin-provisioning/superblock.cc b/thin-provisioning/superblock.cc
index 4412696..e808c6c 100644
--- a/thin-provisioning/superblock.cc
+++ b/thin-provisioning/superblock.cc
@@ -87,7 +87,7 @@ namespace {
 
 	struct sb_validator : public block_manager<>::validator {
 		virtual void check(buffer<> const &b, block_address location) const {
-			superblock_disk const *sbd = reinterpret_cast<superblock_disk const *>(&b);
+			superblock_disk const *sbd = reinterpret_cast<superblock_disk const *>(b.raw());
 			crc32c sum(SUPERBLOCK_CSUM_SEED);
 			sum.append(&sbd->flags_, MD_BLOCK_SIZE - sizeof(uint32_t));
 			if (sum.get_sum() != to_cpu<uint32_t>(sbd->csum_))
@@ -95,7 +95,7 @@ namespace {
 
 		virtual void prepare(buffer<> &b, block_address location) const {
-			superblock_disk *sbd = reinterpret_cast<superblock_disk *>(&b);
+			superblock_disk *sbd = reinterpret_cast<superblock_disk *>(b.raw());
 			crc32c sum(SUPERBLOCK_CSUM_SEED);
 			sum.append(&sbd->flags_, MD_BLOCK_SIZE - sizeof(uint32_t));
 			sbd->csum_ = to_disk<base::le32>(sum.get_sum());
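
//----------------------------------------------------------------
// On the validator hunks above: casting &b took the address of the
// buffer<> wrapper object itself, which only coincides with the block
// data while the buffer stores its bytes inline.  Now that the cache
// hands buffer<> an external pointer via set_data() (see block.tcc),
// wrapper and data live at different addresses, so checksums computed
// through &b covered the wrong bytes; b.raw() returns the real data
// pointer.  A stripped-down illustration; the wrapper type below is
// hypothetical, not the real buffer<> template:

#include <cassert>
#include <cstdlib>

// Stand-in for a buffer that refers to externally owned storage.
struct wrapper {
	void *raw() const { return data_; }
	void *data_;
};

int main()
{
	wrapper w;
	w.data_ = std::malloc(4096);

	// &w is the wrapper's own address; w.raw() is the block data.
	// Checksumming through &w would read the wrapper's bytes (here,
	// just one pointer), not the 4k block behind it.
	assert(static_cast<void *>(&w) != w.raw());

	std::free(w.data_);
	return 0;
}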
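
//----------------------------------------------------------------
// Writeback hysteresis, worked through: put() counts "available" blocks
// (cache blocks that are not dirty, treating in-flight writes as already
// clean) and uses the two percentage thresholds defined at the top of
// block_cache.cc.  Assuming a cache of 100 blocks with 80 dirty, 5 of
// which already have writes in flight:
//
//	nr_available = 100 - (80 - 5) = 25
//	low water    = 33% of 100     = 33	// 25 < 33, so start writeback
//	target       = 66% of 100     = 66	// writeback(66 - 25), i.e. 41 blocks
//
// Crossing the low-water mark therefore queues enough writeback to climb
// back to the 66% mark in one go, rather than one block per put().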
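
//----------------------------------------------------------------
// The hash table sizes itself to a power of two (calc_nr_buckets rounds
// up to at least 8, roughly nr_blocks / 4), so hash() can select a bucket
// with a mask rather than a modulo; BIG_PRIME is 4294967291 = 2^32 - 5.
// A minimal sketch of the same scheme, assuming 1024 buckets:

#include <cstdint>
#include <cstdio>

int main()
{
	unsigned const BIG_PRIME = 4294967291UL;  // as in block_cache::hash()
	unsigned const nr_buckets = 1024;         // must be a power of two
	unsigned const mask = nr_buckets - 1;     // masking == % nr_buckets

	for (uint64_t index = 0; index < 4; index++)
		printf("block %llu -> bucket %u\n",
		       (unsigned long long) index,
		       ((unsigned) index * BIG_PRIME) & mask);

	return 0;
}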