From b32908d5c200a616c7a4fdca1d4147bec5f78f8f Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Fri, 25 Jul 2014 10:35:04 +0100 Subject: [PATCH] work in progress --- block-cache/block_cache.cc | 1095 ++++++++++----------- block-cache/block_cache.h | 185 +++- {persistent-data => block-cache}/buffer.h | 0 caching/superblock.cc | 4 +- era/superblock.cc | 4 +- persistent-data/block.h | 14 +- persistent-data/block.tcc | 54 +- persistent-data/data-structures/array.h | 4 +- persistent-data/data-structures/btree.tcc | 4 +- persistent-data/space-maps/disk.cc | 12 +- thin-provisioning/superblock.cc | 4 +- 11 files changed, 717 insertions(+), 663 deletions(-) rename {persistent-data => block-cache}/buffer.h (100%) diff --git a/block-cache/block_cache.cc b/block-cache/block_cache.cc index 0cca970..6127a23 100644 --- a/block-cache/block_cache.cc +++ b/block-cache/block_cache.cc @@ -1,7 +1,5 @@ #include "block-cache/block_cache.h" -#include "block-cache/list.h" - #include #include #include @@ -11,6 +9,10 @@ #include #include +#include + +//---------------------------------------------------------------- + // FIXME: get from linux headers #define SECTOR_SHIFT 9 #define PAGE_SIZE 4096 @@ -19,648 +21,581 @@ #define WRITEBACK_LOW_THRESHOLD_PERCENT 33 #define WRITEBACK_HIGH_THRESHOLD_PERCENT 66 -/*---------------------------------------------------------------- - * Structures - *--------------------------------------------------------------*/ -struct block_cache; +//---------------------------------------------------------------- -enum block_flags { - IO_PENDING = (1 << 0), - DIRTY = (1 << 1) -}; +namespace { + // FIXME: remove -struct block { - struct list_head list; - struct list_head hash_list; + /*---------------------------------------------------------------- + * Logging + *--------------------------------------------------------------*/ + void info(const char *format, ...) + { + va_list ap; - struct block_cache *bc; - unsigned ref_count; - - int error; - unsigned flags; - - struct iocb control_block; - - struct bc_block b; -}; - -struct block_cache { - int fd; - sector_t block_size; - uint64_t nr_data_blocks; - uint64_t nr_cache_blocks; - - void *blocks_memory; - void *blocks_data; - - io_context_t aio_context; - struct io_event *events; - - /* - * Blocks on the free list are not initialised, apart from the - * b.data field. - */ - struct list_head free; - struct list_head errored; - struct list_head dirty; - struct list_head clean; - - unsigned nr_io_pending; - struct list_head io_pending; - - unsigned nr_dirty; - - /* - * Hash table fields. - */ - unsigned nr_buckets; - unsigned mask; - struct list_head buckets[0]; -}; - -/*---------------------------------------------------------------- - * Logging - *--------------------------------------------------------------*/ -static void info(struct block_cache *bc, const char *format, ...) - __attribute__ ((format (printf, 2, 3))); - -static void info(struct block_cache *bc, const char *format, ...) 
-{ - va_list ap; - - va_start(ap, format); - vfprintf(stderr, format, ap); - va_end(ap); -} - -/*---------------------------------------------------------------- - * Allocation - *--------------------------------------------------------------*/ -static void *alloc_aligned(size_t len, size_t alignment) -{ - void *result = NULL; - int r = posix_memalign(&result, alignment, len); - if (r) - return NULL; - - return result; -} - -static int init_free_list(struct block_cache *bc, unsigned count) -{ - size_t len; - struct block *blocks; - size_t block_size = bc->block_size << SECTOR_SHIFT; - void *data; - unsigned i; - - /* Allocate the block structures */ - len = sizeof(struct block) * count; - blocks = static_cast(malloc(len)); - if (!blocks) - return -ENOMEM; - - bc->blocks_memory = blocks; - - /* Allocate the data for each block. We page align the data. */ - data = alloc_aligned(count * block_size, PAGE_SIZE); - if (!data) { - free(blocks); - return -ENOMEM; + va_start(ap, format); + vfprintf(stderr, format, ap); + va_end(ap); } - bc->blocks_data = data; + void *alloc_aligned(size_t len, size_t alignment) + { + void *result = NULL; + int r = posix_memalign(&result, alignment, len); + if (r) + return NULL; - for (i = 0; i < count; i++) { - struct block *b = blocks + i; - INIT_LIST_HEAD(&b->list); - b->b.data = data + block_size * i; - - list_add(&b->list, &bc->free); + return result; } - - return 0; } -static struct block *__alloc_block(struct block_cache *bc) -{ - struct block *b; +//---------------------------------------------------------------- - if (list_empty(&bc->free)) - return NULL; +namespace bcache { + int + block_cache::init_free_list(unsigned count) + { + size_t len; + block *blocks; + size_t block_size = block_size_ << SECTOR_SHIFT; + void *data; + unsigned i; - b = list_first_entry(&bc->free, struct block, list); - list_del(&b->list); + /* Allocate the block structures */ + len = sizeof(block) * count; + blocks = static_cast(malloc(len)); + if (!blocks) + return -ENOMEM; - return b; -} + blocks_memory_.reset(reinterpret_cast(blocks)); -/*---------------------------------------------------------------- - * Flags handling - *--------------------------------------------------------------*/ -static unsigned test_flags(struct block *b, unsigned flags) -{ - return b->flags & flags; -} - -static void clear_flags(struct block *b, unsigned flags) -{ - b->flags &= ~flags; -} - -static void set_flags(struct block *b, unsigned flags) -{ - b->flags |= flags; -} - -/*---------------------------------------------------------------- - * Low level IO handling - * - * We cannot have two concurrent writes on the same block. - * eg, background writeback, put with dirty, flush? - * - * To avoid this we introduce some restrictions: - * - * i) A held block can never be written back. - * ii) You cannot get a block until writeback has completed. - * - *--------------------------------------------------------------*/ - -/* - * This can be called from the context of the aio thread. So we have a - * separate 'top half' complete function that we know is only called by the - * main cache thread. - */ -static void complete_io(struct block *b, int result) -{ - b->error = result; - clear_flags(b, IO_PENDING); - b->bc->nr_io_pending--; - - if (b->error) - list_move_tail(&b->list, &b->bc->errored); - else { - if (test_flags(b, DIRTY)) { - clear_flags(b, DIRTY); - b->bc->nr_dirty--; + /* Allocate the data for each block. We page align the data. 
*/ + data = alloc_aligned(count * block_size, PAGE_SIZE); + if (!data) { + free(blocks); + return -ENOMEM; } - list_move_tail(&b->list, &b->bc->clean); - } -} + blocks_data_.reset(reinterpret_cast(data)); -/* - * |b->list| should be valid (either pointing to itself, on one of the other - * lists. - */ -static int issue_low_level(struct block *b, enum io_iocb_cmd opcode, const char *desc) -{ - int r; - struct block_cache *bc = b->bc; - struct iocb *control_blocks[1]; + for (i = 0; i < count; i++) { + block *b = blocks + i; + INIT_LIST_HEAD(&b->list_); + b->data_ = data + block_size * i; - assert(!test_flags(b, IO_PENDING)); - set_flags(b, IO_PENDING); - bc->nr_io_pending++; - list_move_tail(&b->list, &bc->io_pending); + list_add(&b->list_, &free_); + } - b->control_block.aio_lio_opcode = opcode; - control_blocks[0] = &b->control_block; - r = io_submit(bc->aio_context, 1, control_blocks); - if (r != 1) { - if (r < 0) { - perror("io_submit error"); - info(bc, "io_submit failed with %s op: %d\n", desc, r); - } else - info(bc, "could not submit IOs, with %s op\n", desc); - - complete_io(b, EIO); - return -EIO; + return 0; } - return 0; -} + block_cache::block * + block_cache::__alloc_block() + { + block *b; -static int issue_read(struct block *b) -{ - return issue_low_level(b, IO_CMD_PREAD, "read"); -} + if (list_empty(&free_)) + return NULL; -static int issue_write(struct block *b) -{ - return issue_low_level(b, IO_CMD_PWRITE, "write"); -} + b = list_first_entry(&free_, block, list_); + list_del(&b->list_); -static void wait_io(struct block_cache *bc) -{ - int r; - unsigned i; - - // FIXME: use a timeout to prevent hanging - r = io_getevents(bc->aio_context, 1, bc->nr_cache_blocks, bc->events, NULL); - if (r < 0) { - info(bc, "io_getevents failed %d\n", r); - exit(1); /* FIXME: handle more gracefully */ + return b; } - for (i = 0; i < static_cast(r); i++) { - struct io_event *e = bc->events + i; - struct block *b = container_of(e->obj, struct block, control_block); + /*---------------------------------------------------------------- + * Low level IO handling + * + * We cannot have two concurrent writes on the same block. + * eg, background writeback, put with dirty, flush? + * + * To avoid this we introduce some restrictions: + * + * i) A held block can never be written back. + * ii) You cannot get a block until writeback has completed. + * + *--------------------------------------------------------------*/ - if (e->res == bc->block_size << SECTOR_SHIFT) - complete_io(b, 0); - else if (e->res < 0) - complete_io(b, e->res); + /* + * This can be called from the context of the aio thread. So we have a + * separate 'top half' complete function that we know is only called by the + * main cache thread. + */ + void + block_cache::complete_io(block &b, int result) + { + b.error_ = result; + clear_flags(b, IO_PENDING); + nr_io_pending_--; + + if (b.error_) + list_move_tail(&b.list_, &errored_); else { - info(bc, "incomplete io, unexpected\n"); - } - } -} + if (test_flags(b, DIRTY)) { + clear_flags(b, DIRTY); + nr_dirty_--; + } -/*---------------------------------------------------------------- - * Clean/dirty list management - *--------------------------------------------------------------*/ - -/* - * We're using lru lists atm, but I think it would be worth - * experimenting with a multiqueue approach. - */ -static struct list_head *__categorise(struct block *b) -{ - if (b->error) - return &b->bc->errored; - - return (b->flags & DIRTY) ? 
&b->bc->dirty : &b->bc->clean; -} - -static void hit(struct block *b) -{ - list_move_tail(&b->list, __categorise(b)); -} - -/*---------------------------------------------------------------- - * High level IO handling - *--------------------------------------------------------------*/ -static void wait_all(struct block_cache *bc) -{ - while (!list_empty(&bc->io_pending)) - wait_io(bc); -} - -static void wait_specific(struct block *b) -{ - while (test_flags(b, IO_PENDING)) - wait_io(b->bc); -} - -static unsigned writeback(struct block_cache *bc, unsigned count) -{ - int r; - struct block *b, *tmp; - unsigned actual = 0; - - list_for_each_entry_safe (b, tmp, &bc->dirty, list) { - if (actual == count) - break; - - if (b->ref_count) - continue; - - r = issue_write(b); - if (!r) - actual++; - } - - info(bc, "writeback: requested %u, actual %u\n", count, actual); - return actual; -} - -/*---------------------------------------------------------------- - * Hash table - *---------------------------------------------------------------*/ - -/* - * |nr_buckets| must be a power of two. - */ -static void hash_init(struct block_cache *bc, unsigned nr_buckets) -{ - unsigned i; - - bc->nr_buckets = nr_buckets; - bc->mask = nr_buckets - 1; - - for (i = 0; i < nr_buckets; i++) - INIT_LIST_HEAD(bc->buckets + i); -} - -static unsigned hash(struct block_cache *bc, uint64_t index) -{ - const unsigned BIG_PRIME = 4294967291UL; - return (((unsigned) index) * BIG_PRIME) & bc->mask; -} - -static struct block *hash_lookup(struct block_cache *bc, block_index index) -{ - struct block *b; - unsigned bucket = hash(bc, index); - - list_for_each_entry (b, bc->buckets + bucket, hash_list) { - if (b->b.index == index) - return b; - } - - return NULL; -} - -static void hash_insert(struct block *b) -{ - unsigned bucket = hash(b->bc, b->b.index); - - list_move_tail(&b->hash_list, b->bc->buckets + bucket); -} - -static void hash_remove(struct block *b) -{ - list_del_init(&b->hash_list); -} - -/*---------------------------------------------------------------- - * High level allocation - *--------------------------------------------------------------*/ -static void setup_control_block(struct block *b) -{ - struct iocb *cb = &b->control_block; - size_t block_size_bytes = b->bc->block_size << SECTOR_SHIFT; - - memset(cb, 0, sizeof(*cb)); - cb->aio_fildes = b->bc->fd; - - cb->u.c.buf = b->b.data; - cb->u.c.offset = block_size_bytes * b->b.index; - cb->u.c.nbytes = block_size_bytes; -} - -static struct block *new_block(struct block_cache *bc, - block_index index) -{ - struct block *b; - - b = __alloc_block(bc); - if (!b) { - if (list_empty(&bc->clean)) { - if (list_empty(&bc->io_pending)) - writeback(bc, 9000); - wait_io(bc); - } - - if (!list_empty(&bc->clean)) { - b = list_first_entry(&bc->clean, struct block, list); - hash_remove(b); - list_del(&b->list); + list_move_tail(&b.list_, &clean_); } } - if (b) { - INIT_LIST_HEAD(&b->list); - INIT_LIST_HEAD(&b->hash_list); - b->bc = bc; - b->ref_count = 0; + /* + * |b->list| should be valid (either pointing to itself, on one of the other + * lists. 
+ */ + int + block_cache::issue_low_level(block &b, enum io_iocb_cmd opcode, const char *desc) + { + int r; + iocb *control_blocks[1]; - b->error = 0; - clear_flags(b, IO_PENDING | DIRTY); + assert(!test_flags(b, IO_PENDING)); + set_flags(b, IO_PENDING); + nr_io_pending_++; + list_move_tail(&b.list_, &io_pending_); - b->b.index = index; - setup_control_block(b); + b.control_block_.aio_lio_opcode = opcode; + control_blocks[0] = &b.control_block_; + r = io_submit(aio_context_, 1, control_blocks); + if (r != 1) { + if (r < 0) { + perror("io_submit error"); + info("io_submit failed with %s op: %d\n", desc, r); + } else + info("could not submit IOs, with %s op\n", desc); - hash_insert(b); - } - - return b; -} - -/*---------------------------------------------------------------- - * Block reference counting - *--------------------------------------------------------------*/ -static void get_block(struct block *b) -{ - b->ref_count++; -} - -static void put_block(struct block *b) -{ - assert(b->ref_count); - b->ref_count--; -} - -static void mark_dirty(struct block *b) -{ - struct block_cache *bc = b->bc; - - if (!test_flags(b, DIRTY)) { - set_flags(b, DIRTY); - list_move_tail(&b->list, &b->bc->dirty); - bc->nr_dirty++; - } -} - -/*---------------------------------------------------------------- - * Public interface - *--------------------------------------------------------------*/ -unsigned calc_nr_cache_blocks(size_t mem, sector_t block_size) -{ - size_t space_per_block = (block_size << SECTOR_SHIFT) + sizeof(struct block); - unsigned r = mem / space_per_block; - - return (r < MIN_BLOCKS) ? MIN_BLOCKS : r; -} - -unsigned calc_nr_buckets(unsigned nr_blocks) -{ - unsigned r = 8; - unsigned n = nr_blocks / 4; - - if (n < 8) - n = 8; - - while (r < n) - r <<= 1; - - return r; -} - -void -block_cache_destroy(struct block_cache *bc) -{ - wait_all(bc); - - if (bc->aio_context) - io_destroy(bc->aio_context); - - if (bc->events) - free(bc->events); - - if (bc->blocks_memory) - free(bc->blocks_memory); - - if (bc->blocks_data) - free(bc->blocks_data); - - free(bc); -} - -struct block_cache * -block_cache_create(int fd, sector_t block_size, uint64_t on_disk_blocks, size_t mem) -{ - int r; - struct block_cache *bc; - unsigned nr_cache_blocks = calc_nr_cache_blocks(mem, block_size); - unsigned nr_buckets = calc_nr_buckets(nr_cache_blocks); - - bc = static_cast(malloc(sizeof(*bc) + sizeof(*bc->buckets) * nr_buckets)); - if (bc) { - memset(bc, 0, sizeof(*bc)); - - bc->fd = fd; - bc->block_size = block_size; - bc->nr_data_blocks = on_disk_blocks; - bc->nr_cache_blocks = nr_cache_blocks; - - bc->events = static_cast(malloc(sizeof(*bc->events) * nr_cache_blocks)); - if (!bc->events) { - info(bc, "couldn't allocate events array\n"); - goto bad; + complete_io(b, EIO); + return -EIO; } - bc->aio_context = 0; /* needed or io_setup will fail */ - r = io_setup(nr_cache_blocks, &bc->aio_context); + return 0; + } + + int + block_cache::issue_read(block &b) + { + return issue_low_level(b, IO_CMD_PREAD, "read"); + } + + int + block_cache::issue_write(block &b) + { + std::cerr << "issuing write for block " << b.index_ << "\n"; + return issue_low_level(b, IO_CMD_PWRITE, "write"); + } + + void + block_cache::wait_io() + { + int r; + unsigned i; + + // FIXME: use a timeout to prevent hanging + r = io_getevents(aio_context_, 1, nr_cache_blocks_, &events_[0], NULL); if (r < 0) { - info(bc, "io_setup failed: %d\n", r); - goto bad; + info("io_getevents failed %d\n", r); + exit(1); /* FIXME: handle more gracefully */ } - 
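
	/*
	 * The libaio round trip used throughout this cache, in miniature:
	 * io_setup() creates the context above, issue_low_level() hands
	 * filled-in iocbs to io_submit(), and wait_io() reaps completions
	 * with io_getevents().  A self-contained sketch of the same
	 * pattern (fd, buf, len and off are placeholders; with O_DIRECT,
	 * buf must be suitably aligned, e.g. from posix_memalign()):
	 *
	 *	io_context_t ctx = 0;			// must be zeroed before io_setup()
	 *	io_setup(64, &ctx);			// allow up to 64 in-flight ios
	 *
	 *	struct iocb cb, *cbs[1] = { &cb };
	 *	io_prep_pread(&cb, fd, buf, len, off);	// fills cb.u.c.{buf,nbytes,offset}
	 *	io_submit(ctx, 1, cbs);			// returns nr of iocbs accepted
	 *
	 *	struct io_event ev;
	 *	io_getevents(ctx, 1, 1, &ev, NULL);	// block until it completes
	 *	// ev.res is bytes transferred (or -errno); ev.obj points at cb
	 *	io_destroy(ctx);
	 */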
hash_init(bc, nr_buckets); - INIT_LIST_HEAD(&bc->free); - INIT_LIST_HEAD(&bc->errored); - INIT_LIST_HEAD(&bc->dirty); - INIT_LIST_HEAD(&bc->clean); - INIT_LIST_HEAD(&bc->io_pending); + for (i = 0; i < static_cast(r); i++) { + io_event const &e = events_[i]; + block *b = container_of(e.obj, block, control_block_); - r = init_free_list(bc, nr_cache_blocks); - if (r) { - info(bc, "couldn't allocate blocks: %d\n", r); - goto bad; + if (e.res == block_size_ << SECTOR_SHIFT) + complete_io(*b, 0); + + else if (e.res < 0) + complete_io(*b, e.res); + + else + info("incomplete io, unexpected: %d\n", r); } } - return bc; + /*---------------------------------------------------------------- + * Clean/dirty list management + *--------------------------------------------------------------*/ -bad: - block_cache_destroy(bc); - return NULL; -} + /* + * We're using lru lists atm, but I think it would be worth + * experimenting with a multiqueue approach. + */ + list_head * + block_cache::__categorise(block &b) + { + if (b.error_) + return &errored_; -uint64_t block_cache_get_nr_blocks(struct block_cache *bc) -{ - return bc->nr_data_blocks; -} + return (b.flags_ & DIRTY) ? &dirty_ : &clean_; + } -static void zero_block(struct block *b) -{ - memset(b->b.data, 0, b->bc->block_size << SECTOR_SHIFT); - mark_dirty(b); -} + void + block_cache::hit(block &b) + { + list_move_tail(&b.list_, __categorise(b)); + } -static struct block *lookup_or_read_block(struct block_cache *bc, block_index index, unsigned flags) -{ - struct block *b = hash_lookup(bc, index); + /*---------------------------------------------------------------- + * High level IO handling + *--------------------------------------------------------------*/ + void + block_cache::wait_all() + { + while (!list_empty(&io_pending_)) + wait_io(); + } - if (b) { - if (test_flags(b, IO_PENDING)) - wait_specific(b); + void + block_cache::wait_specific(block &b) + { + while (test_flags(b, IO_PENDING)) + wait_io(); + } - if (flags & GF_ZERO) - zero_block(b); + unsigned + block_cache::writeback(unsigned count) + { + int r; + block *b, *tmp; + unsigned actual = 0; - } else { - if (flags & GF_CAN_BLOCK) { - b = new_block(bc, index); - if (b) { - if (flags & GF_ZERO) - zero_block(b); - else { - issue_read(b); - wait_specific(b); + list_for_each_entry_safe (b, tmp, &dirty_, list_) { + if (actual == count) + break; + + if (b->ref_count_) + continue; + + r = issue_write(*b); + if (!r) + actual++; + } + + info("writeback: requested %u, actual %u\n", count, actual); + return actual; + } + + /*---------------------------------------------------------------- + * Hash table + *---------------------------------------------------------------*/ + + /* + * |nr_buckets| must be a power of two. 
+ */ + void + block_cache::hash_init(unsigned nr_buckets) + { + unsigned i; + + nr_buckets_ = nr_buckets; + mask_ = nr_buckets - 1; + + for (i = 0; i < nr_buckets; i++) + INIT_LIST_HEAD(&buckets_[i]); + } + + unsigned + block_cache::hash(uint64_t index) + { + const unsigned BIG_PRIME = 4294967291UL; + return (((unsigned) index) * BIG_PRIME) & mask_; + } + + block_cache::block * + block_cache::hash_lookup(block_index index) + { + block *b; + unsigned bucket = hash(index); + + list_for_each_entry (b, &buckets_[bucket], hash_list_) { + if (b->index_ == index) + return b; + } + + return NULL; + } + + void + block_cache::hash_insert(block &b) + { + unsigned bucket = hash(b.index_); + list_move_tail(&b.hash_list_, &buckets_[bucket]); + } + + void + block_cache::hash_remove(block &b) + { + list_del_init(&b.hash_list_); + } + + /*---------------------------------------------------------------- + * High level allocation + *--------------------------------------------------------------*/ + void + block_cache::setup_control_block(block &b) + { + iocb *cb = &b.control_block_; + size_t block_size_bytes = block_size_ << SECTOR_SHIFT; + + memset(cb, 0, sizeof(*cb)); + cb->aio_fildes = fd_; + + cb->u.c.buf = b.data_; + cb->u.c.offset = block_size_bytes * b.index_; + cb->u.c.nbytes = block_size_bytes; + } + + block_cache::block * + block_cache::new_block(block_index index) + { + block *b; + + b = __alloc_block(); + if (!b) { + if (list_empty(&clean_)) { + if (list_empty(&io_pending_)) + writeback(9000); + wait_io(); + } + + if (!list_empty(&clean_)) { + b = list_first_entry(&clean_, block, list_); + hash_remove(*b); + list_del(&b->list_); + } + } + + if (b) { + INIT_LIST_HEAD(&b->list_); + INIT_LIST_HEAD(&b->hash_list_); + b->bc_ = this; + b->ref_count_ = 0; + + b->error_ = 0; + clear_flags(*b, IO_PENDING | DIRTY); + + b->index_ = index; + setup_control_block(*b); + + hash_insert(*b); + } + + return b; + } + + /*---------------------------------------------------------------- + * Block reference counting + *--------------------------------------------------------------*/ + void + block_cache::mark_dirty(block &b) + { + if (!test_flags(b, DIRTY)) { + set_flags(b, DIRTY); + list_move_tail(&b.list_, &dirty_); + nr_dirty_++; + } + } + + unsigned + block_cache::calc_nr_cache_blocks(size_t mem, sector_t block_size) + { + size_t space_per_block = (block_size << SECTOR_SHIFT) + sizeof(block); + unsigned r = mem / space_per_block; + + return (r < MIN_BLOCKS) ? 
MIN_BLOCKS : r; + } + + unsigned + block_cache::calc_nr_buckets(unsigned nr_blocks) + { + unsigned r = 8; + unsigned n = nr_blocks / 4; + + if (n < 8) + n = 8; + + while (r < n) + r <<= 1; + + return r; + } + + block_cache::block_cache(int fd, sector_t block_size, uint64_t on_disk_blocks, size_t mem) + : nr_dirty_(0), + nr_io_pending_(0) + { + int r; + unsigned nr_cache_blocks = calc_nr_cache_blocks(mem, block_size); + unsigned nr_buckets = calc_nr_buckets(nr_cache_blocks); + + info("block_size = %llu, on_disk_blocks = %llu, mem = %llu, nr_cache_blocks = %llu\n", + (unsigned long long) block_size, + (unsigned long long) on_disk_blocks, + (unsigned long long) mem, + (unsigned long long) nr_cache_blocks); + + + buckets_.resize(nr_buckets); + + fd_ = fd; + block_size_ = block_size; + nr_data_blocks_ = on_disk_blocks; + nr_cache_blocks_ = nr_cache_blocks; + + events_.resize(nr_cache_blocks); + + aio_context_ = 0; /* needed or io_setup will fail */ + r = io_setup(nr_cache_blocks, &aio_context_); + if (r < 0) + throw std::runtime_error("io_setup failed"); + + hash_init(nr_buckets); + INIT_LIST_HEAD(&free_); + INIT_LIST_HEAD(&errored_); + INIT_LIST_HEAD(&dirty_); + INIT_LIST_HEAD(&clean_); + INIT_LIST_HEAD(&io_pending_); + + r = init_free_list(nr_cache_blocks); + if (r) + throw std::runtime_error("couldn't allocate blocks"); + } + + block_cache::~block_cache() + { + wait_all(); + + // FIXME: use unique_ptrs + if (aio_context_) + io_destroy(aio_context_); + } + + uint64_t + block_cache::get_nr_blocks() const + { + return nr_data_blocks_; + } + + void + block_cache::zero_block(block &b) + { + memset(b.data_, 0, block_size_ << SECTOR_SHIFT); + mark_dirty(b); + } + + block_cache::block * + block_cache::lookup_or_read_block(block_index index, unsigned flags) + { + block *b = hash_lookup(index); + + if (b) { + if (test_flags(*b, IO_PENDING)) + wait_specific(*b); + + if (flags & GF_ZERO) + zero_block(*b); + + } else { + if (flags & GF_CAN_BLOCK) { + b = new_block(index); + if (b) { + if (flags & GF_ZERO) + zero_block(*b); + else { + issue_read(*b); + wait_specific(*b); + } } } } + + return (!b || b->error_) ? NULL : b; } - return (!b || b->error) ? 
NULL : b; -} + block_cache::block & + block_cache::get(block_index index, unsigned flags) + { + block *b = lookup_or_read_block(index, flags); -struct bc_block * -block_cache_get(struct block_cache *bc, block_index index, unsigned flags) -{ - struct block *b = lookup_or_read_block(bc, index, flags); + if (b) { + hit(*b); + b->ref_count_++; - if (b) { - hit(b); - get_block(b); - - return &b->b; - } - - return NULL; -} - -void -block_cache_put(struct bc_block *bcb, unsigned flags) -{ - unsigned nr_available; - struct block *b = container_of(bcb, struct block, b); - struct block_cache *bc = b->bc; - - put_block(b); - - if (flags & PF_DIRTY) { - mark_dirty(b); - - nr_available = bc->nr_cache_blocks - (bc->nr_dirty - bc->nr_io_pending); - if (nr_available < (WRITEBACK_LOW_THRESHOLD_PERCENT * bc->nr_cache_blocks / 100)) - writeback(bc, (WRITEBACK_HIGH_THRESHOLD_PERCENT * bc->nr_cache_blocks / 100) - nr_available); - } -} - -int -block_cache_flush(struct block_cache *bc) -{ - struct block *b; - - list_for_each_entry (b, &bc->dirty, list) { - if (b->ref_count) { - info(bc, "attempt to lock an already locked block\n"); - return -EAGAIN; + return *b; } - issue_write(b); + throw std::runtime_error("couldn't get block"); } - wait_all(bc); + void + block_cache::put(block_cache::block &b, unsigned flags) + { + if (b.ref_count_ == 0) + throw std::runtime_error("bad put"); - return list_empty(&bc->errored) ? 0 : -EIO; -} + b.ref_count_--; -void -block_cache_prefetch(struct block_cache *bc, block_index index) -{ - struct block *b = hash_lookup(bc, index); + if (flags & PF_DIRTY) { + mark_dirty(b); - if (!b) { - b = new_block(bc, index); - if (b) - issue_read(b); + // FIXME: factor out + unsigned nr_available = nr_cache_blocks_ - (nr_dirty_ - nr_io_pending_); + if (nr_available < (WRITEBACK_LOW_THRESHOLD_PERCENT * nr_cache_blocks_ / 100)) + writeback((WRITEBACK_HIGH_THRESHOLD_PERCENT * nr_cache_blocks_ / 100) - nr_available); + } + } + + int + block_cache::flush() + { + block *b; + + list_for_each_entry (b, &dirty_, list_) { + if (b->ref_count_) { + info("attempt to lock an already locked block\n"); + return -EAGAIN; + } + + issue_write(*b); + } + + wait_all(); + + return list_empty(&errored_) ? 
0 : -EIO; + } + + void + block_cache::prefetch(block_index index) + { + block *b = hash_lookup(index); + + if (!b) { + b = new_block(index); + if (b) + issue_read(*b); + } + } + + //-------------------------------- + + unsigned + block_cache::test_flags(block &b, unsigned flags) + { + return b.flags_ & flags; + } + + void + block_cache::clear_flags(block &b, unsigned flags) + { + b.flags_ &= ~flags; + } + + void + block_cache::set_flags(block &b, unsigned flags) + { + b.flags_ |= flags; } } -/*----------------------------------------------------------------*/ - +//---------------------------------------------------------------- diff --git a/block-cache/block_cache.h b/block-cache/block_cache.h index 046083f..f59a1a0 100644 --- a/block-cache/block_cache.h +++ b/block-cache/block_cache.h @@ -1,54 +1,173 @@ #ifndef BLOCK_CACHE_H #define BLOCK_CACHE_H +#include "block-cache/buffer.h" +#include "block-cache/list.h" + +#include +#include + +#include +#include #include #include +#include -/*----------------------------------------------------------------*/ +//---------------------------------------------------------------- -/* FIXME: add logging */ +namespace bcache { +#if 0 + class validator { + public: + typedef boost::shared_ptr ptr; -/*----------------------------------------------------------------*/ + virtual ~validator() {} -/* - * This library is not thread-safe. - */ -typedef uint64_t block_index; + virtual void check(buffer const &b, block_address location) const = 0; + virtual void prepare(buffer &b, block_address location) const = 0; + }; -struct block_cache; + class noop_validator : public validator { + public: + void check(buffer const &b, block_address location) const {} + void prepare(buffer &b, block_address location) const {} + }; +#endif + //---------------------------------------------------------------- -struct bc_block { - block_index index; - void *data; -}; + // FIXME: throw exceptions rather than returning errors + class block_cache : private boost::noncopyable { + public: + enum block_flags { + IO_PENDING = (1 << 0), + DIRTY = (1 << 1) + }; -typedef uint64_t sector_t; + class block : private boost::noncopyable { + public: + uint64_t get_index() const { + return index_; + } -struct block_cache *block_cache_create(int fd, sector_t block_size, - uint64_t max_nr_blocks, size_t mem); -void block_cache_destroy(struct block_cache *bc); + void *get_data() const { + return data_; + } -uint64_t block_cache_get_nr_blocks(struct block_cache *bc); + private: + friend class block_cache; -enum get_flags { - GF_ZERO = (1 << 0), - GF_CAN_BLOCK = (1 << 1) -}; -struct bc_block *block_cache_get(struct block_cache *bc, block_index index, unsigned flags); + uint64_t index_; + void *data_; -enum put_flags { - PF_DIRTY = (1 << 0), -}; -void block_cache_put(struct bc_block *b, unsigned flags); + list_head list_; + list_head hash_list_; -/* - * Flush can fail if an earlier write failed. You do not know which block - * failed. Make sure you build your recovery with this in mind. 
- */ -int block_cache_flush(struct block_cache *bc); + block_cache *bc_; + unsigned ref_count_; -void block_cache_prefetch(struct block_cache *bc, block_index index); + int error_; + unsigned flags_; -/*----------------------------------------------------------------*/ + iocb control_block_; + }; + + typedef uint64_t block_index; + typedef uint64_t sector_t; + + //-------------------------------- + + block_cache(int fd, sector_t block_size, + uint64_t max_nr_blocks, size_t mem); + ~block_cache(); + + uint64_t get_nr_blocks() const; + + enum get_flags { + GF_ZERO = (1 << 0), + GF_CAN_BLOCK = (1 << 1) + }; + + // FIXME: what if !GF_CAN_BLOCK? + block_cache::block &get(block_index index, unsigned flags); + + enum put_flags { + PF_DIRTY = (1 << 0), + }; + + void put(block_cache::block &block, unsigned flags); + + /* + * Flush can fail if an earlier write failed. You do not know which block + * failed. Make sure you build your recovery with this in mind. + */ + int flush(); + void prefetch(block_index index); + + private: + int init_free_list(unsigned count); + block *__alloc_block(); + void complete_io(block &b, int result); + int issue_low_level(block &b, enum io_iocb_cmd opcode, const char *desc); + int issue_read(block &b); + int issue_write(block &b); + void wait_io(); + list_head *__categorise(block &b); + void hit(block &b); + void wait_all(); + void wait_specific(block &b); + unsigned writeback(unsigned count); + void hash_init(unsigned nr_buckets); + unsigned hash(uint64_t index); + block *hash_lookup(block_index index); + void hash_insert(block &b); + void hash_remove(block &b); + void setup_control_block(block &b); + block *new_block(block_index index); + void mark_dirty(block &b); + unsigned calc_nr_cache_blocks(size_t mem, sector_t block_size); + unsigned calc_nr_buckets(unsigned nr_blocks); + void zero_block(block &b); + block *lookup_or_read_block(block_index index, unsigned flags); + unsigned test_flags(block &b, unsigned flags); + void clear_flags(block &b, unsigned flags); + void set_flags(block &b, unsigned flags); + + //-------------------------------- + + int fd_; + sector_t block_size_; + uint64_t nr_data_blocks_; + uint64_t nr_cache_blocks_; + + std::auto_ptr blocks_memory_; // FIXME: change to a vector + std::auto_ptr blocks_data_; + + io_context_t aio_context_; + std::vector events_; + + /* + * Blocks on the free list are not initialised, apart from the + * b.data field. + */ + list_head free_; + list_head errored_; + list_head dirty_; + list_head clean_; + + unsigned nr_io_pending_; + struct list_head io_pending_; + + unsigned nr_dirty_; + + /* + * Hash table fields. 
+ */ + unsigned nr_buckets_; + unsigned mask_; + std::vector buckets_; + }; +} + +//---------------------------------------------------------------- #endif diff --git a/persistent-data/buffer.h b/block-cache/buffer.h similarity index 100% rename from persistent-data/buffer.h rename to block-cache/buffer.h diff --git a/caching/superblock.cc b/caching/superblock.cc index 93a8d60..8b34af3 100644 --- a/caching/superblock.cc +++ b/caching/superblock.cc @@ -277,7 +277,7 @@ namespace validator { struct sb_validator : public block_manager<>::validator { virtual void check(buffer<> const &b, block_address location) const { - superblock_disk const *sbd = reinterpret_cast(&b); + superblock_disk const *sbd = reinterpret_cast(b.raw()); crc32c sum(SUPERBLOCK_CSUM_SEED); sum.append(&sbd->flags, MD_BLOCK_SIZE - sizeof(uint32_t)); if (sum.get_sum() != to_cpu(sbd->csum)) @@ -285,7 +285,7 @@ namespace validator { } virtual void prepare(buffer<> &b, block_address location) const { - superblock_disk *sbd = reinterpret_cast(&b); + superblock_disk *sbd = reinterpret_cast(b.raw()); crc32c sum(SUPERBLOCK_CSUM_SEED); sum.append(&sbd->flags, MD_BLOCK_SIZE - sizeof(uint32_t)); sbd->csum = to_disk(sum.get_sum()); diff --git a/era/superblock.cc b/era/superblock.cc index c319e9b..9d0ae57 100644 --- a/era/superblock.cc +++ b/era/superblock.cc @@ -212,7 +212,7 @@ namespace era_validator { // FIXME: turn into a template, we have 3 similar classes now struct sb_validator : public block_manager<>::validator { virtual void check(buffer<> const &b, block_address location) const { - superblock_disk const *sbd = reinterpret_cast(&b); + superblock_disk const *sbd = reinterpret_cast(b.raw()); crc32c sum(SUPERBLOCK_CSUM_SEED); sum.append(&sbd->flags, MD_BLOCK_SIZE - sizeof(uint32_t)); if (sum.get_sum() != to_cpu(sbd->csum)) @@ -220,7 +220,7 @@ namespace era_validator { } virtual void prepare(buffer<> &b, block_address location) const { - superblock_disk *sbd = reinterpret_cast(&b); + superblock_disk *sbd = reinterpret_cast(b.raw()); crc32c sum(SUPERBLOCK_CSUM_SEED); sum.append(&sbd->flags, MD_BLOCK_SIZE - sizeof(uint32_t)); sbd->csum = to_disk(sum.get_sum()); diff --git a/persistent-data/block.h b/persistent-data/block.h index c514383..487a0e1 100644 --- a/persistent-data/block.h +++ b/persistent-data/block.h @@ -20,7 +20,7 @@ #define BLOCK_H #include "block-cache/block_cache.h" -#include "persistent-data/buffer.h" +#include "block-cache/buffer.h" #include #include @@ -35,6 +35,8 @@ //---------------------------------------------------------------- namespace persistent_data { + using namespace bcache; + uint32_t const MD_BLOCK_SIZE = 4096; @@ -77,10 +79,11 @@ namespace persistent_data { BT_NORMAL }; + // FIXME: eventually this will disappear to be replaced with block_cache::block struct block : private boost::noncopyable { typedef boost::shared_ptr ptr; - block(block_cache *bc, + block(block_cache &bc, block_address location, block_type bt, typename validator::ptr v, @@ -110,7 +113,8 @@ namespace persistent_data { private: void check_not_unlocked() const; - bc_block *internal_; + block_cache &bc_; + block_cache::block *internal_; typename validator::ptr validator_; block_type bt_; bool dirty_; @@ -196,7 +200,9 @@ namespace persistent_data { void write_block(typename block::ptr b) const; int fd_; - block_cache *bc_; + + // FIXME: the mutable is a fudge to allow flush() to be const, which I'm not sure is necc. 
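	// A mutable member may be modified from inside a const member
	// function, which is what would let a const flush() write through
	// bc_.  A minimal, hypothetical sketch of the rule (not part of
	// this cache):
	//
	//	struct counter {
	//		int get() const { ++reads_; return v_; }  // ok: reads_ is mutable
	//		int v_;
	//		mutable unsigned reads_;
	//	};
	//
	// Whether flush() needs to be const at all is the open question
	// flagged above.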
+ mutable block_cache bc_; }; // A little utility to help build validators diff --git a/persistent-data/block.tcc b/persistent-data/block.tcc index ab9a129..4c39c7a 100644 --- a/persistent-data/block.tcc +++ b/persistent-data/block.tcc @@ -40,7 +40,7 @@ namespace { unsigned const SECTOR_SHIFT = 9; // FIXME: these will slow it down until we start doing async io. - int const OPEN_FLAGS = O_DIRECT | O_SYNC; + int const OPEN_FLAGS = O_DIRECT; // FIXME: introduce a new exception for this, or at least lift this // to exception.h @@ -106,31 +106,27 @@ namespace { namespace persistent_data { template - block_manager::block::block(block_cache *bc, + block_manager::block::block(block_cache &bc, block_address location, block_type bt, typename validator::ptr v, bool zero) - : validator_(v), - bt_(bt), - dirty_(false), - unlocked_(false), - buffer_(0, true) // FIXME: we don't know if it's writeable here :( + : bc_(bc), + validator_(v), + bt_(bt), + dirty_(false), + unlocked_(false), + buffer_(0, true) // FIXME: we don't know if it's writeable here :( { if (zero) { - internal_ = block_cache_get(bc, location, GF_ZERO | GF_CAN_BLOCK); - if (!internal_) - throw std::runtime_error("Couldn't get block"); + internal_ = &bc.get(location, block_cache::GF_ZERO | block_cache::GF_CAN_BLOCK); dirty_ = true; + buffer_.set_data(internal_->get_data()); } else { - internal_ = block_cache_get(bc, location, GF_CAN_BLOCK); - if (!internal_) - throw std::runtime_error("Couldn't get block"); - - validator_->check(buffer_, internal_->index); + internal_ = &bc.get(location, block_cache::GF_CAN_BLOCK); + buffer_.set_data(internal_->get_data()); + validator_->check(buffer_, internal_->get_index()); } - - buffer_.set_data(internal_->data); } template @@ -144,8 +140,9 @@ namespace persistent_data { void block_manager::block::unlock() { - validator_->prepare(buffer_, internal_->index); - block_cache_put(internal_, dirty_ ? PF_DIRTY : 0); + if (dirty_) + validator_->prepare(buffer_, internal_->get_index()); + bc_.put(*internal_, dirty_ ? block_cache::PF_DIRTY : 0); unlocked_ = true; } @@ -161,7 +158,7 @@ namespace persistent_data { block_manager::block::get_location() const { check_not_unlocked(); - return internal_->index; + return internal_->get_index(); } template @@ -196,12 +193,12 @@ namespace persistent_data { if (dirty_) // It may have already happened, by calling // this we ensure we're consistent. 
- validator_->prepare(*internal_->data, internal_->index); + validator_->prepare(*internal_->get_data(), internal_->get_index()); validator_ = v; if (check) - validator_->check(*internal_->data, internal_->index); + validator_->check(*internal_->get_data(), internal_->get_index()); } } @@ -301,14 +298,9 @@ namespace persistent_data { block_address nr_blocks, unsigned max_concurrent_blocks, mode m) + : fd_(open_block_file(path, nr_blocks * BlockSize, m == READ_WRITE)), + bc_(fd_, BlockSize >> SECTOR_SHIFT, nr_blocks, 1024u * 1024u * 4) { - // Open the file descriptor - fd_ = open_block_file(path, nr_blocks * BlockSize, m == READ_WRITE); - - // Create the cache - bc_ = block_cache_create(fd_, BlockSize << SECTOR_SHIFT, nr_blocks, 1024u * BlockSize * 1.2); - if (!bc_) - throw std::runtime_error("couldn't create block cache"); } template @@ -360,7 +352,7 @@ namespace persistent_data { block_address block_manager::get_nr_blocks() const { - return block_cache_get_nr_blocks(bc_); + return bc_.get_nr_blocks(); } template @@ -374,7 +366,7 @@ namespace persistent_data { void block_manager::flush() const { - block_cache_flush(bc_); + bc_.flush(); } } diff --git a/persistent-data/data-structures/array.h b/persistent-data/data-structures/array.h index 6c76bf2..1d8fce6 100644 --- a/persistent-data/data-structures/array.h +++ b/persistent-data/data-structures/array.h @@ -33,7 +33,7 @@ namespace persistent_data { struct array_block_validator : public block_manager<>::validator { virtual void check(buffer<> const &b, block_address location) const { - array_block_disk const *data = reinterpret_cast(&b); + array_block_disk const *data = reinterpret_cast(b.raw()); crc32c sum(ARRAY_CSUM_XOR); sum.append(&data->max_entries, MD_BLOCK_SIZE - sizeof(uint32_t)); if (sum.get_sum() != to_cpu(data->csum)) @@ -44,7 +44,7 @@ namespace persistent_data { } virtual void prepare(buffer<> &b, block_address location) const { - array_block_disk *data = reinterpret_cast(&b); + array_block_disk *data = reinterpret_cast(b.raw()); data->blocknr = to_disk(location); crc32c sum(ARRAY_CSUM_XOR); diff --git a/persistent-data/data-structures/btree.tcc b/persistent-data/data-structures/btree.tcc index 1234aa0..a220334 100644 --- a/persistent-data/data-structures/btree.tcc +++ b/persistent-data/data-structures/btree.tcc @@ -34,7 +34,7 @@ namespace { struct btree_node_validator : public block_manager<>::validator { virtual void check(buffer<> const &b, block_address location) const { - disk_node const *data = reinterpret_cast(&b); + disk_node const *data = reinterpret_cast(b.raw()); node_header const *n = &data->header; crc32c sum(BTREE_CSUM_XOR); sum.append(&n->flags, MD_BLOCK_SIZE - sizeof(uint32_t)); @@ -46,7 +46,7 @@ namespace { } virtual void prepare(buffer<> &b, block_address location) const { - disk_node *data = reinterpret_cast(&b); + disk_node *data = reinterpret_cast(b.raw()); node_header *n = &data->header; n->blocknr = to_disk(location); diff --git a/persistent-data/space-maps/disk.cc b/persistent-data/space-maps/disk.cc index 0c851f6..f75f0a9 100644 --- a/persistent-data/space-maps/disk.cc +++ b/persistent-data/space-maps/disk.cc @@ -39,7 +39,7 @@ namespace { struct bitmap_block_validator : public block_manager<>::validator { virtual void check(buffer<> const &b, block_address location) const { - bitmap_header const *data = reinterpret_cast(&b); + bitmap_header const *data = reinterpret_cast(b.raw()); crc32c sum(BITMAP_CSUM_XOR); sum.append(&data->not_used, MD_BLOCK_SIZE - sizeof(uint32_t)); if (sum.get_sum() != 
to_cpu<uint32_t>(data->csum))
@@ -50,7 +50,7 @@ namespace {
 
 		virtual void prepare(buffer<> &b, block_address location) const {
-			bitmap_header *data = reinterpret_cast<bitmap_header *>(&b);
+			bitmap_header *data = reinterpret_cast<bitmap_header *>(b.raw());
 			data->blocknr = to_disk<base::le64>(location);
 
 			crc32c sum(BITMAP_CSUM_XOR);
@@ -66,7 +66,8 @@ namespace {
 	// FIXME: factor out the common code in these validators
 	struct index_block_validator : public block_manager<>::validator {
 		virtual void check(buffer<> const &b, block_address location) const {
-			metadata_index const *mi = reinterpret_cast<metadata_index const *>(&b);
+			metadata_index const *mi = reinterpret_cast<metadata_index const *>(b.raw());
+			std::cerr << "check mi = " << mi << "\n";
 			crc32c sum(INDEX_CSUM_XOR);
 			sum.append(&mi->padding_, MD_BLOCK_SIZE - sizeof(uint32_t));
 			if (sum.get_sum() != to_cpu<uint32_t>(mi->csum_))
@@ -77,7 +78,8 @@ namespace {
 
 		virtual void prepare(buffer<> &b, block_address location) const {
-			metadata_index *mi = reinterpret_cast<metadata_index *>(&b);
+			metadata_index *mi = reinterpret_cast<metadata_index *>(b.raw());
+			std::cerr << "prepare mi = " << mi << "\n";
 			mi->blocknr_ = to_disk<base::le64>(location);
 
 			crc32c sum(INDEX_CSUM_XOR);
@@ -630,7 +632,7 @@ namespace {
 			tm_->shadow(bitmap_root_, index_validator());
 		bitmap_root_ = p.first.get_location();
 
-		metadata_index *mdi = reinterpret_cast<metadata_index *>(&p.first.data());
+		metadata_index *mdi = reinterpret_cast<metadata_index *>(p.first.data().raw());
 
 		for (unsigned i = 0; i < entries_.size(); i++)
 			index_entry_traits::pack(entries_[i], mdi->index[i]);
diff --git a/thin-provisioning/superblock.cc b/thin-provisioning/superblock.cc
index 4412696..e808c6c 100644
--- a/thin-provisioning/superblock.cc
+++ b/thin-provisioning/superblock.cc
@@ -87,7 +87,7 @@ namespace {
 
 	struct sb_validator : public block_manager<>::validator {
 		virtual void check(buffer<> const &b, block_address location) const {
-			superblock_disk const *sbd = reinterpret_cast<superblock_disk const *>(&b);
+			superblock_disk const *sbd = reinterpret_cast<superblock_disk const *>(b.raw());
 			crc32c sum(SUPERBLOCK_CSUM_SEED);
 			sum.append(&sbd->flags_, MD_BLOCK_SIZE - sizeof(uint32_t));
 			if (sum.get_sum() != to_cpu<uint32_t>(sbd->csum_))
@@ -95,7 +95,7 @@ namespace {
 
 		virtual void prepare(buffer<> &b, block_address location) const {
-			superblock_disk *sbd = reinterpret_cast<superblock_disk *>(&b);
+			superblock_disk *sbd = reinterpret_cast<superblock_disk *>(b.raw());
 			crc32c sum(SUPERBLOCK_CSUM_SEED);
 			sum.append(&sbd->flags_, MD_BLOCK_SIZE - sizeof(uint32_t));
 			sbd->csum_ = to_disk<base::le32>(sum.get_sum());
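
//----------------------------------------------------------------
// On the validator hunks above: casting &b took the address of the
// buffer<> wrapper object itself, which only coincides with the block
// data while the buffer stores its bytes inline.  Now that the cache
// hands buffer<> an external pointer via set_data() (see block.tcc),
// wrapper and data live at different addresses, so checksums computed
// through &b covered the wrong bytes; b.raw() returns the real data
// pointer.  A stripped-down illustration; the wrapper type below is
// hypothetical, not the real buffer<> template:

#include <cassert>
#include <cstdlib>

// Stand-in for a buffer that refers to externally owned storage.
struct wrapper {
	void *raw() const { return data_; }
	void *data_;
};

int main()
{
	wrapper w;
	w.data_ = std::malloc(4096);

	// &w is the wrapper's own address; w.raw() is the block data.
	// Checksumming through &w would read the wrapper's bytes (here,
	// just one pointer), not the 4k block behind it.
	assert(static_cast<void *>(&w) != w.raw());

	std::free(w.data_);
	return 0;
}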
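
//----------------------------------------------------------------
// Writeback hysteresis, worked through: put() counts "available" blocks
// (cache blocks that are not dirty, treating in-flight writes as already
// clean) and uses the two percentage thresholds defined at the top of
// block_cache.cc.  Assuming a cache of 100 blocks with 80 dirty, 5 of
// which already have writes in flight:
//
//	nr_available = 100 - (80 - 5) = 25
//	low water    = 33% of 100     = 33	// 25 < 33, so start writeback
//	target       = 66% of 100     = 66	// writeback(66 - 25), i.e. 41 blocks
//
// Crossing the low-water mark therefore queues enough writeback to climb
// back to the 66% mark in one go, rather than one block per put().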
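
//----------------------------------------------------------------
// The hash table sizes itself to a power of two (calc_nr_buckets rounds
// up to at least 8, roughly nr_blocks / 4), so hash() can select a bucket
// with a mask rather than a modulo; BIG_PRIME is 4294967291 = 2^32 - 5.
// A minimal sketch of the same scheme, assuming 1024 buckets:

#include <cstdint>
#include <cstdio>

int main()
{
	unsigned const BIG_PRIME = 4294967291UL;  // as in block_cache::hash()
	unsigned const nr_buckets = 1024;         // must be a power of two
	unsigned const mask = nr_buckets - 1;     // masking == % nr_buckets

	for (uint64_t index = 0; index < 4; index++)
		printf("block %llu -> bucket %u\n",
		       (unsigned long long) index,
		       ((unsigned) index * BIG_PRIME) & mask);

	return 0;
}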