From d9040949fc96c942200d812bd0e0d67f3047ea8a Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Tue, 22 Jul 2014 16:43:44 +0100 Subject: [PATCH] forgot block_cache itself --- block-cache/block_cache.cc | 666 +++++++++++++++++++++++++++++++++++++ block-cache/block_cache.h | 54 +++ block-cache/list.h | 216 ++++++++++++ 3 files changed, 936 insertions(+) create mode 100644 block-cache/block_cache.cc create mode 100644 block-cache/block_cache.h create mode 100644 block-cache/list.h diff --git a/block-cache/block_cache.cc b/block-cache/block_cache.cc new file mode 100644 index 0000000..0cca970 --- /dev/null +++ b/block-cache/block_cache.cc @@ -0,0 +1,666 @@ +#include "block-cache/block_cache.h" + +#include "block-cache/list.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +// FIXME: get from linux headers +#define SECTOR_SHIFT 9 +#define PAGE_SIZE 4096 + +#define MIN_BLOCKS 16 +#define WRITEBACK_LOW_THRESHOLD_PERCENT 33 +#define WRITEBACK_HIGH_THRESHOLD_PERCENT 66 + +/*---------------------------------------------------------------- + * Structures + *--------------------------------------------------------------*/ +struct block_cache; + +enum block_flags { + IO_PENDING = (1 << 0), + DIRTY = (1 << 1) +}; + +struct block { + struct list_head list; + struct list_head hash_list; + + struct block_cache *bc; + unsigned ref_count; + + int error; + unsigned flags; + + struct iocb control_block; + + struct bc_block b; +}; + +struct block_cache { + int fd; + sector_t block_size; + uint64_t nr_data_blocks; + uint64_t nr_cache_blocks; + + void *blocks_memory; + void *blocks_data; + + io_context_t aio_context; + struct io_event *events; + + /* + * Blocks on the free list are not initialised, apart from the + * b.data field. + */ + struct list_head free; + struct list_head errored; + struct list_head dirty; + struct list_head clean; + + unsigned nr_io_pending; + struct list_head io_pending; + + unsigned nr_dirty; + + /* + * Hash table fields. 
+ */ + unsigned nr_buckets; + unsigned mask; + struct list_head buckets[0]; +}; + +/*---------------------------------------------------------------- + * Logging + *--------------------------------------------------------------*/ +static void info(struct block_cache *bc, const char *format, ...) + __attribute__ ((format (printf, 2, 3))); + +static void info(struct block_cache *bc, const char *format, ...) +{ + va_list ap; + + va_start(ap, format); + vfprintf(stderr, format, ap); + va_end(ap); +} + +/*---------------------------------------------------------------- + * Allocation + *--------------------------------------------------------------*/ +static void *alloc_aligned(size_t len, size_t alignment) +{ + void *result = NULL; + int r = posix_memalign(&result, alignment, len); + if (r) + return NULL; + + return result; +} + +static int init_free_list(struct block_cache *bc, unsigned count) +{ + size_t len; + struct block *blocks; + size_t block_size = bc->block_size << SECTOR_SHIFT; + void *data; + unsigned i; + + /* Allocate the block structures */ + len = sizeof(struct block) * count; + blocks = static_cast(malloc(len)); + if (!blocks) + return -ENOMEM; + + bc->blocks_memory = blocks; + + /* Allocate the data for each block. We page align the data. 
*/ + data = alloc_aligned(count * block_size, PAGE_SIZE); + if (!data) { + free(blocks); + return -ENOMEM; + } + + bc->blocks_data = data; + + for (i = 0; i < count; i++) { + struct block *b = blocks + i; + INIT_LIST_HEAD(&b->list); + b->b.data = data + block_size * i; + + list_add(&b->list, &bc->free); + } + + return 0; +} + +static struct block *__alloc_block(struct block_cache *bc) +{ + struct block *b; + + if (list_empty(&bc->free)) + return NULL; + + b = list_first_entry(&bc->free, struct block, list); + list_del(&b->list); + + return b; +} + +/*---------------------------------------------------------------- + * Flags handling + *--------------------------------------------------------------*/ +static unsigned test_flags(struct block *b, unsigned flags) +{ + return b->flags & flags; +} + +static void clear_flags(struct block *b, unsigned flags) +{ + b->flags &= ~flags; +} + +static void set_flags(struct block *b, unsigned flags) +{ + b->flags |= flags; +} + +/*---------------------------------------------------------------- + * Low level IO handling + * + * We cannot have two concurrent writes on the same block. + * eg, background writeback, put with dirty, flush? + * + * To avoid this we introduce some restrictions: + * + * i) A held block can never be written back. + * ii) You cannot get a block until writeback has completed. + * + *--------------------------------------------------------------*/ + +/* + * This can be called from the context of the aio thread. So we have a + * separate 'top half' complete function that we know is only called by the + * main cache thread. 
+ */
+
+/*
+ * Records the outcome of an io on |b|: stores |result| in b->error, clears
+ * IO_PENDING and moves the block to the errored list (result != 0) or the
+ * clean list (result == 0, which also clears DIRTY).
+ */
+static void complete_io(struct block *b, int result)
+{
+	b->error = result;
+	clear_flags(b, IO_PENDING);
+	b->bc->nr_io_pending--;
+
+	if (b->error)
+		list_move_tail(&b->list, &b->bc->errored);
+	else {
+		if (test_flags(b, DIRTY)) {
+			clear_flags(b, DIRTY);
+			b->bc->nr_dirty--;
+		}
+
+		list_move_tail(&b->list, &b->bc->clean);
+	}
+}
+
+/*
+ * |b->list| should be valid (either pointing to itself, or on one of the
+ * other lists).
+ *
+ * Marks the block IO_PENDING, moves it to the io_pending list and submits
+ * its control block with |opcode|.  Returns 0 on success.  If submission
+ * fails the block is completed with an error and -EIO is returned.
+ */
+static int issue_low_level(struct block *b, enum io_iocb_cmd opcode, const char *desc)
+{
+	int r;
+	struct block_cache *bc = b->bc;
+	struct iocb *control_blocks[1];
+
+	assert(!test_flags(b, IO_PENDING));
+	set_flags(b, IO_PENDING);
+	bc->nr_io_pending++;
+	list_move_tail(&b->list, &bc->io_pending);
+
+	b->control_block.aio_lio_opcode = opcode;
+	control_blocks[0] = &b->control_block;
+	r = io_submit(bc->aio_context, 1, control_blocks);
+	if (r != 1) {
+		if (r < 0) {
+			/*
+			 * The libaio wrapper returns a negated errno and
+			 * does not set errno, so decode |r| rather than
+			 * calling perror().
+			 */
+			info(bc, "io_submit failed with %s op: %d (%s)\n",
+			     desc, r, strerror(-r));
+		} else
+			info(bc, "could not submit IOs, with %s op\n", desc);
+
+		complete_io(b, EIO);
+		return -EIO;
+	}
+
+	return 0;
+}
+
+static int issue_read(struct block *b)
+{
+	return issue_low_level(b, IO_CMD_PREAD, "read");
+}
+
+static int issue_write(struct block *b)
+{
+	return issue_low_level(b, IO_CMD_PWRITE, "write");
+}
+
+/*
+ * Blocks until at least one submitted io has finished, then completes
+ * every event that was returned.  Exits the process if io_getevents()
+ * itself fails.
+ *
+ * Every returned event completes its block, including short transfers.
+ * Otherwise the block would stay IO_PENDING for ever and wait_specific()
+ * or wait_all() would never return.
+ */
+static void wait_io(struct block_cache *bc)
+{
+	int r;
+	unsigned i;
+
+	// FIXME: use a timeout to prevent hanging
+	r = io_getevents(bc->aio_context, 1, bc->nr_cache_blocks, bc->events, NULL);
+	if (r < 0) {
+		info(bc, "io_getevents failed %d\n", r);
+		exit(1); /* FIXME: handle more gracefully */
+	}
+
+	for (i = 0; i < static_cast<unsigned>(r); i++) {
+		struct io_event *e = bc->events + i;
+		struct block *b = container_of(e->obj, struct block, control_block);
+
+		/*
+		 * io_event::res is an unsigned field that carries either
+		 * the byte count or a negated errno; view it as signed so
+		 * the error case can actually be detected.
+		 */
+		long res = static_cast<long>(e->res);
+
+		if (res < 0)
+			complete_io(b, static_cast<int>(res));
+		else if (static_cast<uint64_t>(res) == (bc->block_size << SECTOR_SHIFT))
+			complete_io(b, 0);
+		else {
+			info(bc, "incomplete io, unexpected\n");
+			complete_io(b, -EIO);
+		}
+	}
+}
+
+/*----------------------------------------------------------------
+ * Clean/dirty list management
+ *--------------------------------------------------------------*/
+
+/*
+ * The lists are plain LRUs for now; a multiqueue scheme might be worth
+ * trying later.
+ *
+ * Picks the list a block belongs on: errored wins, then dirty, else clean.
+ */
+static struct list_head *__categorise(struct block *b)
+{
+	struct block_cache *cache = b->bc;
+
+	if (b->error)
+		return &cache->errored;
+
+	if (b->flags & DIRTY)
+		return &cache->dirty;
+
+	return &cache->clean;
+}
+
+/* Moves the block to the tail of the list it belongs on. */
+static void hit(struct block *b)
+{
+	struct list_head *lru = __categorise(b);
+
+	list_move_tail(&b->list, lru);
+}
+
+/*----------------------------------------------------------------
+ * High level IO handling
+ *--------------------------------------------------------------*/
+
+/* Keeps reaping completions until the io_pending list is empty. */
+static void wait_all(struct block_cache *bc)
+{
+	for (;;) {
+		if (list_empty(&bc->io_pending))
+			return;
+
+		wait_io(bc);
+	}
+}
+
+/* Keeps reaping completions until |b| is no longer IO_PENDING. */
+static void wait_specific(struct block *b)
+{
+	for (;;) {
+		if (!test_flags(b, IO_PENDING))
+			return;
+
+		wait_io(b->bc);
+	}
+}
+
+/*
+ * Issues writes for up to |count| dirty blocks and returns how many were
+ * submitted successfully.  Held blocks (ref_count != 0) are skipped and
+ * failed submissions are not counted.
+ */
+static unsigned writeback(struct block_cache *bc, unsigned count)
+{
+	struct block *cur, *next;
+	unsigned issued = 0;
+
+	/* issue_write() moves |cur| to another list, hence the safe iterator */
+	list_for_each_entry_safe (cur, next, &bc->dirty, list) {
+		if (issued == count)
+			break;
+
+		if (!cur->ref_count && issue_write(cur) == 0)
+			issued++;
+	}
+
+	info(bc, "writeback: requested %u, actual %u\n", count, issued);
+	return issued;
+}
+
+/*----------------------------------------------------------------
+ * Hash table
+ *---------------------------------------------------------------*/
+
+/*
+ * |nr_buckets| must be a power of two.
+ */ +static void hash_init(struct block_cache *bc, unsigned nr_buckets) +{ + unsigned i; + + bc->nr_buckets = nr_buckets; + bc->mask = nr_buckets - 1; + + for (i = 0; i < nr_buckets; i++) + INIT_LIST_HEAD(bc->buckets + i); +} + +static unsigned hash(struct block_cache *bc, uint64_t index) +{ + const unsigned BIG_PRIME = 4294967291UL; + return (((unsigned) index) * BIG_PRIME) & bc->mask; +} + +static struct block *hash_lookup(struct block_cache *bc, block_index index) +{ + struct block *b; + unsigned bucket = hash(bc, index); + + list_for_each_entry (b, bc->buckets + bucket, hash_list) { + if (b->b.index == index) + return b; + } + + return NULL; +} + +static void hash_insert(struct block *b) +{ + unsigned bucket = hash(b->bc, b->b.index); + + list_move_tail(&b->hash_list, b->bc->buckets + bucket); +} + +static void hash_remove(struct block *b) +{ + list_del_init(&b->hash_list); +} + +/*---------------------------------------------------------------- + * High level allocation + *--------------------------------------------------------------*/ +static void setup_control_block(struct block *b) +{ + struct iocb *cb = &b->control_block; + size_t block_size_bytes = b->bc->block_size << SECTOR_SHIFT; + + memset(cb, 0, sizeof(*cb)); + cb->aio_fildes = b->bc->fd; + + cb->u.c.buf = b->b.data; + cb->u.c.offset = block_size_bytes * b->b.index; + cb->u.c.nbytes = block_size_bytes; +} + +static struct block *new_block(struct block_cache *bc, + block_index index) +{ + struct block *b; + + b = __alloc_block(bc); + if (!b) { + if (list_empty(&bc->clean)) { + if (list_empty(&bc->io_pending)) + writeback(bc, 9000); + wait_io(bc); + } + + if (!list_empty(&bc->clean)) { + b = list_first_entry(&bc->clean, struct block, list); + hash_remove(b); + list_del(&b->list); + } + } + + if (b) { + INIT_LIST_HEAD(&b->list); + INIT_LIST_HEAD(&b->hash_list); + b->bc = bc; + b->ref_count = 0; + + b->error = 0; + clear_flags(b, IO_PENDING | DIRTY); + + b->b.index = index; + setup_control_block(b); + 
+ hash_insert(b); + } + + return b; +} + +/*---------------------------------------------------------------- + * Block reference counting + *--------------------------------------------------------------*/ +static void get_block(struct block *b) +{ + b->ref_count++; +} + +static void put_block(struct block *b) +{ + assert(b->ref_count); + b->ref_count--; +} + +static void mark_dirty(struct block *b) +{ + struct block_cache *bc = b->bc; + + if (!test_flags(b, DIRTY)) { + set_flags(b, DIRTY); + list_move_tail(&b->list, &b->bc->dirty); + bc->nr_dirty++; + } +} + +/*---------------------------------------------------------------- + * Public interface + *--------------------------------------------------------------*/ +unsigned calc_nr_cache_blocks(size_t mem, sector_t block_size) +{ + size_t space_per_block = (block_size << SECTOR_SHIFT) + sizeof(struct block); + unsigned r = mem / space_per_block; + + return (r < MIN_BLOCKS) ? MIN_BLOCKS : r; +} + +unsigned calc_nr_buckets(unsigned nr_blocks) +{ + unsigned r = 8; + unsigned n = nr_blocks / 4; + + if (n < 8) + n = 8; + + while (r < n) + r <<= 1; + + return r; +} + +void +block_cache_destroy(struct block_cache *bc) +{ + wait_all(bc); + + if (bc->aio_context) + io_destroy(bc->aio_context); + + if (bc->events) + free(bc->events); + + if (bc->blocks_memory) + free(bc->blocks_memory); + + if (bc->blocks_data) + free(bc->blocks_data); + + free(bc); +} + +struct block_cache * +block_cache_create(int fd, sector_t block_size, uint64_t on_disk_blocks, size_t mem) +{ + int r; + struct block_cache *bc; + unsigned nr_cache_blocks = calc_nr_cache_blocks(mem, block_size); + unsigned nr_buckets = calc_nr_buckets(nr_cache_blocks); + + bc = static_cast(malloc(sizeof(*bc) + sizeof(*bc->buckets) * nr_buckets)); + if (bc) { + memset(bc, 0, sizeof(*bc)); + + bc->fd = fd; + bc->block_size = block_size; + bc->nr_data_blocks = on_disk_blocks; + bc->nr_cache_blocks = nr_cache_blocks; + + bc->events = static_cast(malloc(sizeof(*bc->events) * 
nr_cache_blocks)); + if (!bc->events) { + info(bc, "couldn't allocate events array\n"); + goto bad; + } + + bc->aio_context = 0; /* needed or io_setup will fail */ + r = io_setup(nr_cache_blocks, &bc->aio_context); + if (r < 0) { + info(bc, "io_setup failed: %d\n", r); + goto bad; + } + + hash_init(bc, nr_buckets); + INIT_LIST_HEAD(&bc->free); + INIT_LIST_HEAD(&bc->errored); + INIT_LIST_HEAD(&bc->dirty); + INIT_LIST_HEAD(&bc->clean); + INIT_LIST_HEAD(&bc->io_pending); + + r = init_free_list(bc, nr_cache_blocks); + if (r) { + info(bc, "couldn't allocate blocks: %d\n", r); + goto bad; + } + } + + return bc; + +bad: + block_cache_destroy(bc); + return NULL; +} + +uint64_t block_cache_get_nr_blocks(struct block_cache *bc) +{ + return bc->nr_data_blocks; +} + +static void zero_block(struct block *b) +{ + memset(b->b.data, 0, b->bc->block_size << SECTOR_SHIFT); + mark_dirty(b); +} + +static struct block *lookup_or_read_block(struct block_cache *bc, block_index index, unsigned flags) +{ + struct block *b = hash_lookup(bc, index); + + if (b) { + if (test_flags(b, IO_PENDING)) + wait_specific(b); + + if (flags & GF_ZERO) + zero_block(b); + + } else { + if (flags & GF_CAN_BLOCK) { + b = new_block(bc, index); + if (b) { + if (flags & GF_ZERO) + zero_block(b); + else { + issue_read(b); + wait_specific(b); + } + } + } + } + + return (!b || b->error) ? 
NULL : b; +} + +struct bc_block * +block_cache_get(struct block_cache *bc, block_index index, unsigned flags) +{ + struct block *b = lookup_or_read_block(bc, index, flags); + + if (b) { + hit(b); + get_block(b); + + return &b->b; + } + + return NULL; +} + +void +block_cache_put(struct bc_block *bcb, unsigned flags) +{ + unsigned nr_available; + struct block *b = container_of(bcb, struct block, b); + struct block_cache *bc = b->bc; + + put_block(b); + + if (flags & PF_DIRTY) { + mark_dirty(b); + + nr_available = bc->nr_cache_blocks - (bc->nr_dirty - bc->nr_io_pending); + if (nr_available < (WRITEBACK_LOW_THRESHOLD_PERCENT * bc->nr_cache_blocks / 100)) + writeback(bc, (WRITEBACK_HIGH_THRESHOLD_PERCENT * bc->nr_cache_blocks / 100) - nr_available); + } +} + +int +block_cache_flush(struct block_cache *bc) +{ + struct block *b; + + list_for_each_entry (b, &bc->dirty, list) { + if (b->ref_count) { + info(bc, "attempt to lock an already locked block\n"); + return -EAGAIN; + } + + issue_write(b); + } + + wait_all(bc); + + return list_empty(&bc->errored) ? 0 : -EIO; +} + +void +block_cache_prefetch(struct block_cache *bc, block_index index) +{ + struct block *b = hash_lookup(bc, index); + + if (!b) { + b = new_block(bc, index); + if (b) + issue_read(b); + } +} + +/*----------------------------------------------------------------*/ + diff --git a/block-cache/block_cache.h b/block-cache/block_cache.h new file mode 100644 index 0000000..046083f --- /dev/null +++ b/block-cache/block_cache.h @@ -0,0 +1,54 @@ +#ifndef BLOCK_CACHE_H +#define BLOCK_CACHE_H + +#include +#include + +/*----------------------------------------------------------------*/ + +/* FIXME: add logging */ + +/*----------------------------------------------------------------*/ + +/* + * This library is not thread-safe. 
+ */ +typedef uint64_t block_index; + +struct block_cache; + +struct bc_block { + block_index index; + void *data; +}; + +typedef uint64_t sector_t; + +struct block_cache *block_cache_create(int fd, sector_t block_size, + uint64_t max_nr_blocks, size_t mem); +void block_cache_destroy(struct block_cache *bc); + +uint64_t block_cache_get_nr_blocks(struct block_cache *bc); + +enum get_flags { + GF_ZERO = (1 << 0), + GF_CAN_BLOCK = (1 << 1) +}; +struct bc_block *block_cache_get(struct block_cache *bc, block_index index, unsigned flags); + +enum put_flags { + PF_DIRTY = (1 << 0), +}; +void block_cache_put(struct bc_block *b, unsigned flags); + +/* + * Flush can fail if an earlier write failed. You do not know which block + * failed. Make sure you build your recovery with this in mind. + */ +int block_cache_flush(struct block_cache *bc); + +void block_cache_prefetch(struct block_cache *bc, block_index index); + +/*----------------------------------------------------------------*/ + +#endif diff --git a/block-cache/list.h b/block-cache/list.h new file mode 100644 index 0000000..63e8830 --- /dev/null +++ b/block-cache/list.h @@ -0,0 +1,216 @@ +#ifndef LIB_BLOCK_CACHE_LIST_H +#define LIB_BLOCK_CACHE_LIST_H + +#include + +/*----------------------------------------------------------------*/ + +/* + * Simple intrusive linked list code. Lifted from Linux kernel. + */ + +/** + * container_of - cast a member of a structure out to the containing structure + * @ptr: the pointer to the member. + * @type: the type of the container struct this is embedded in. + * @member: the name of the member within the struct. 
+ *
+ * Implemented with the GNU extensions typeof and statement expressions.
+ * The temporary __mptr makes the compiler check that @ptr really has the
+ * type of @member.
+ */
+#define container_of(ptr, type, member) ({			\
+	const typeof( ((type *)0)->member ) *__mptr = (ptr);	\
+	(type *)( (char *)__mptr - offsetof(type,member) );})
+
+/*
+ * Link embedded in every object that lives on a list.  A list head is a
+ * list_head too; an empty list is a head that points at itself.
+ */
+struct list_head {
+	struct list_head *next, *prev;
+};
+
+/* Makes |list| an empty list (both links point back at itself). */
+static inline void INIT_LIST_HEAD(struct list_head *list)
+{
+	list->next = list;
+	list->prev = list;
+}
+
+/*
+ * Insert a new entry between two known consecutive entries.
+ *
+ * This is only for internal list manipulation where we know
+ * the prev/next entries already!
+ */
+static inline void __list_add(struct list_head *new_,
+			      struct list_head *prev,
+			      struct list_head *next)
+{
+	next->prev = new_;
+	new_->next = next;
+	new_->prev = prev;
+	prev->next = new_;
+}
+
+/**
+ * list_add - add a new entry
+ * @new_: new entry to be added
+ * @head: list head to add it after
+ *
+ * Insert a new entry after the specified head.
+ * This is good for implementing stacks.
+ */
+static inline void list_add(struct list_head *new_, struct list_head *head)
+{
+	__list_add(new_, head, head->next);
+}
+
+
+/**
+ * list_add_tail - add a new entry
+ * @new_: new entry to be added
+ * @head: list head to add it before
+ *
+ * Insert a new entry before the specified head.
+ * This is useful for implementing queues.
+ */
+static inline void list_add_tail(struct list_head *new_, struct list_head *head)
+{
+	__list_add(new_, head->prev, head);
+}
+
+/*
+ * Delete a list entry by making the prev/next entries
+ * point to each other.
+ *
+ * This is only for internal list manipulation where we know
+ * the prev/next entries already!
+ */
+static inline void __list_del(struct list_head * prev, struct list_head * next)
+{
+	next->prev = prev;
+	prev->next = next;
+}
+
+/**
+ * list_del - deletes entry from list.
+ * @entry: the element to delete from the list.
+ * Note: list_empty() on entry does not return true after this, the entry is
+ * in an undefined state.
+ */
+
+/*
+ * Internal helper: unlinks |entry| from its neighbours but leaves the
+ * entry's own next/prev pointers untouched (still pointing at the old
+ * neighbours).  Used by the move and del_init helpers, which overwrite
+ * those pointers straight away.
+ */
+static inline void __list_del_entry(struct list_head *entry)
+{
+	__list_del(entry->prev, entry->next);
+}
+
+/*
+ * list_del proper (described by the kernel-doc comment above): unlinks
+ * |entry| and sets both of its pointers to NULL, so the entry must be
+ * re-initialised or re-added before any other list call is made on it.
+ */
+static inline void list_del(struct list_head *entry)
+{
+	__list_del(entry->prev, entry->next);
+	entry->next = NULL;
+	entry->prev = NULL;
+}
+
+/**
+ * list_del_init - deletes entry from list and reinitialize it.
+ * @entry: the element to delete from the list.
+ *
+ * Afterwards list_empty() on @entry returns true.
+ */
+static inline void list_del_init(struct list_head *entry)
+{
+	__list_del_entry(entry);
+	INIT_LIST_HEAD(entry);
+}
+
+/**
+ * list_move - delete from one list and add as another's head
+ * @list: the entry to move
+ * @head: the head that will precede our entry
+ *
+ * @list must have valid links (on a list, or initialised to itself).
+ */
+static inline void list_move(struct list_head *list, struct list_head *head)
+{
+	__list_del_entry(list);
+	list_add(list, head);
+}
+
+/**
+ * list_move_tail - delete from one list and add as another's tail
+ * @list: the entry to move
+ * @head: the head that will follow our entry
+ *
+ * @list must have valid links (on a list, or initialised to itself).
+ */
+static inline void list_move_tail(struct list_head *list,
+				  struct list_head *head)
+{
+	__list_del_entry(list);
+	list_add_tail(list, head);
+}
+
+/**
+ * list_empty - tests whether a list is empty
+ * @head: the list to test.
+ */
+static inline int list_empty(const struct list_head *head)
+{
+	return head->next == head;
+}
+
+/**
+ * list_entry - get the struct for this entry
+ * @ptr:	the &struct list_head pointer.
+ * @type:	the type of the struct this is embedded in.
+ * @member:	the name of the list_struct within the struct.
+ */
+#define list_entry(ptr, type, member) \
+	container_of(ptr, type, member)
+
+/**
+ * list_first_entry - get the first element from a list
+ * @ptr:	the list head to take the element from.
+ * @type:	the type of the struct this is embedded in.
+ * @member:	the name of the list_struct within the struct.
+ *
+ * Note, that list is expected to be not empty.
+ * On an empty list the result is a bogus pointer computed from the head
+ * itself, so check list_empty() first.
+ */
+#define list_first_entry(ptr, type, member) \
+	list_entry((ptr)->next, type, member)
+
+/**
+ * list_next_entry - get the next element in list
+ * @pos:	the type * to cursor
+ * @member:	the name of the list_struct within the struct.
+ */
+#define list_next_entry(pos, member) \
+	list_entry((pos)->member.next, typeof(*(pos)), member)
+
+/**
+ * list_for_each	-	iterate over a list
+ * @pos:	the &struct list_head to use as a loop cursor.
+ * @head:	the head for your list.
+ *
+ * The body must not unlink or move @pos.
+ */
+#define list_for_each(pos, head) \
+	for (pos = (head)->next; pos != (head); pos = pos->next)
+
+/**
+ * list_for_each_safe - iterate over a list safe against removal of list entry
+ * @pos:	the &struct list_head to use as a loop cursor.
+ * @n:		another &struct list_head to use as temporary storage
+ * @head:	the head for your list.
+ *
+ * The successor is saved in @n before the body runs, so the body may
+ * unlink or move @pos.
+ */
+#define list_for_each_safe(pos, n, head) \
+	for (pos = (head)->next, n = pos->next; pos != (head); \
+		pos = n, n = pos->next)
+
+/**
+ * list_for_each_entry	-	iterate over list of given type
+ * @pos:	the type * to use as a loop cursor.
+ * @head:	the head for your list.
+ * @member:	the name of the list_struct within the struct.
+ *
+ * The loop ends when the cursor's @member is @head itself, so the body
+ * must not unlink @pos or move it to another list; use
+ * list_for_each_entry_safe for that.
+ */
+#define list_for_each_entry(pos, head, member)				\
+	for (pos = list_first_entry(head, typeof(*pos), member);	\
+	     &pos->member != (head);					\
+	     pos = list_next_entry(pos, member))
+
+/**
+ * list_for_each_entry_safe - iterate over list of given type safe against removal of list entry
+ * @pos:	the type * to use as a loop cursor.
+ * @n:		another type * to use as temporary storage
+ * @head:	the head for your list.
+ * @member:	the name of the list_struct within the struct.
+ *
+ * Only removal of @pos is tolerated; the body must not unlink or move
+ * the entry held in @n.
+ */
+#define list_for_each_entry_safe(pos, n, head, member)			\
+	for (pos = list_first_entry(head, typeof(*pos), member),	\
+		n = list_next_entry(pos, member);			\
+	     &pos->member != (head);					\
+	     pos = n, n = list_next_entry(n, member))
+
+
+/*----------------------------------------------------------------*/
+
+#endif