#define _GNU_SOURCE #include #include #include #include #include #include #include #include #include #include #include #include #include #include "list.h" #include "bcache.h" //---------------------------------------------------------------- static void warn(const char *fmt, ...) { va_list ap; va_start(ap, fmt); vfprintf(stderr, fmt, ap); va_end(ap); fprintf(stderr, "\n"); } // FIXME: raise a condition somehow? static void raise_(const char *fmt, ...) { va_list ap; va_start(ap, fmt); vfprintf(stderr, fmt, ap); va_end(ap); fprintf(stderr, "\n"); exit(1); } /* * Assumes the list is not empty. */ static inline struct list_head *list_pop(struct list_head *head) { struct list_head *l; if (head->next == head) raise_("list is empty\n"); l = head->next; list_del(l); return l; } //---------------------------------------------------------------- struct control_block { struct list_head list; void *context; struct iocb cb; }; struct cb_set { struct list_head free; struct list_head allocated; struct control_block *vec; } control_block_set; static struct cb_set *cb_set_create(unsigned nr) { int i; struct cb_set *cbs = malloc(sizeof(*cbs)); if (!cbs) return NULL; cbs->vec = malloc(nr * sizeof(*cbs->vec)); if (!cbs->vec) { free(cbs); return NULL; } init_list_head(&cbs->free); init_list_head(&cbs->allocated); for (i = 0; i < nr; i++) list_add(&cbs->vec[i].list, &cbs->free); return cbs; } static void cb_set_destroy(struct cb_set *cbs) { if (!list_empty(&cbs->allocated)) raise_("async io still in flight"); free(cbs->vec); free(cbs); } static struct control_block *cb_alloc(struct cb_set *cbs, void *context) { struct control_block *cb; if (list_empty(&cbs->free)) return NULL; cb = container_of(list_pop(&cbs->free), struct control_block, list); cb->context = context; list_add(&cb->list, &cbs->allocated); return cb; } static void cb_free(struct cb_set *cbs, struct control_block *cb) { list_del(&cb->list); list_add(&cb->list, &cbs->free); } static struct control_block *iocb_to_cb(struct iocb *icb) { return container_of(icb, struct control_block, cb); } //---------------------------------------------------------------- // FIXME: get from linux headers #define SECTOR_SHIFT 9 #define PAGE_SIZE 4096 enum dir { DIR_READ, DIR_WRITE }; struct io_engine { io_context_t aio_context; struct cb_set *cbs; }; static struct io_engine *engine_create(unsigned max_io) { int r; struct io_engine *e = malloc(sizeof(*e)); if (!e) return NULL; e->aio_context = 0; r = io_setup(max_io, &e->aio_context); if (r < 0) { warn("io_setup failed"); return NULL; } e->cbs = cb_set_create(max_io); if (!e->cbs) { warn("couldn't create control block set"); free(e); return NULL; } return e; } static void engine_destroy(struct io_engine *e) { cb_set_destroy(e->cbs); io_destroy(e->aio_context); free(e); } static int engine_issue(struct io_engine *e, int fd, enum dir d, sector_t sb, sector_t se, void *data, void *context) { int r; struct iocb *cb_array[1]; struct control_block *cb; if (((uint64_t) data) & (PAGE_SIZE - 1)) return -1; cb = cb_alloc(e->cbs, context); if (!cb) return false; memset(&cb->cb, 0, sizeof(cb->cb)); cb->cb.aio_fildes = (int) fd; cb->cb.u.c.buf = data; cb->cb.u.c.offset = sb << SECTOR_SHIFT; cb->cb.u.c.nbytes = (se - sb) << SECTOR_SHIFT; cb->cb.aio_lio_opcode = (d == DIR_READ) ? IO_CMD_PREAD : IO_CMD_PWRITE; cb_array[0] = &cb->cb; r = io_submit(e->aio_context, 1, cb_array); if (r < 0) cb_free(e->cbs, cb); return r; } #define MAX_IO 64 typedef void complete_fn(void *context, int io_error); static int engine_wait(struct io_engine *e, struct timespec *ts, complete_fn fn) { int i, r; struct io_event event[MAX_IO]; struct control_block *cb; memset(&event, 0, sizeof(event)); r = io_getevents(e->aio_context, 1, MAX_IO, event, ts); if (r < 0) { warn("io_getevents failed"); return r; } if (r == 0) return 0; for (i = 0; i < r; i++) { struct io_event *ev = event + i; cb = iocb_to_cb((struct iocb *) ev->obj); if (ev->res == cb->cb.u.c.nbytes) fn((void *) cb->context, 0); else if ((int) ev->res < 0) fn(cb->context, (int) ev->res); else { warn("short io"); fn(cb->context, -ENODATA); } cb_free(e->cbs, cb); } return -ENODATA; } //---------------------------------------------------------------- #if 0 struct timespec micro_to_ts(unsigned micro) { struct timespec ts; ts.tv_sec = micro / 1000000u; ts.tv_nsec = (micro % 1000000) * 1000; return ts; } static unsigned ts_to_micro(struct timespec const *ts) { unsigned micro = ts->tv_sec * 1000000; micro += ts->tv_nsec / 1000; return micro; } #endif //---------------------------------------------------------------- #define MIN_BLOCKS 16 #define WRITEBACK_LOW_THRESHOLD_PERCENT 33 #define WRITEBACK_HIGH_THRESHOLD_PERCENT 66 //---------------------------------------------------------------- static void *alloc_aligned(size_t len, size_t alignment) { void *result = NULL; int r = posix_memalign(&result, alignment, len); if (r) return NULL; return result; } //---------------------------------------------------------------- static bool test_flags(struct block *b, unsigned bits) { return (b->flags & bits) != 0; } static void set_flags(struct block *b, unsigned bits) { b->flags |= bits; } static void clear_flags(struct block *b, unsigned bits) { b->flags &= ~bits; } //---------------------------------------------------------------- enum block_flags { BF_IO_PENDING = (1 << 0), BF_DIRTY = (1 << 1), }; struct bcache { int fd; sector_t block_sectors; uint64_t nr_data_blocks; uint64_t nr_cache_blocks; struct io_engine *engine; void *raw_data; struct block *raw_blocks; /* * Lists that categorise the blocks. */ unsigned nr_locked; unsigned nr_dirty; unsigned nr_io_pending; struct list_head free; struct list_head errored; struct list_head dirty; struct list_head clean; struct list_head io_pending; /* * Hash table. */ unsigned nr_buckets; unsigned hash_mask; struct list_head *buckets; /* * Statistics */ unsigned read_hits; unsigned read_misses; unsigned write_zeroes; unsigned write_hits; unsigned write_misses; unsigned prefetches; }; //---------------------------------------------------------------- /* 2^63 + 2^61 - 2^57 + 2^54 - 2^51 - 2^18 + 1 */ #define GOLDEN_RATIO_PRIME_64 0x9e37fffffffc0001UL static unsigned hash(struct bcache *cache, uint64_t index) { uint64_t h = index; h *= GOLDEN_RATIO_PRIME_64; return h & cache->hash_mask; } static struct block *hash_lookup(struct bcache *cache, uint64_t index) { struct block *b; unsigned h = hash(cache, index); list_for_each_entry (b, cache->buckets + h, hash) if (b->index == index) return b; return NULL; } static void hash_insert(struct block *b) { unsigned h = hash(b->cache, b->index); list_add(&b->hash, b->cache->buckets + h); } static void hash_remove(struct block *b) { list_del(&b->hash); } /* * Must return a power of 2. */ static unsigned calc_nr_buckets(unsigned nr_blocks) { unsigned r = 8; unsigned n = nr_blocks / 4; if (n < 8) n = 8; while (r < n) r <<= 1; return r; } static int hash_table_init(struct bcache *cache, unsigned nr_entries) { unsigned i; cache->nr_buckets = calc_nr_buckets(nr_entries); cache->hash_mask = cache->nr_buckets - 1; cache->buckets = malloc(cache->nr_buckets * sizeof(*cache->buckets)); if (!cache->buckets) return -ENOMEM; for (i = 0; i < cache->nr_buckets; i++) init_list_head(cache->buckets + i); return 0; } static void hash_table_exit(struct bcache *cache) { free(cache->buckets); } //---------------------------------------------------------------- static int init_free_list(struct bcache *cache, unsigned count) { unsigned i; size_t block_size = cache->block_sectors << SECTOR_SHIFT; unsigned char *data = (unsigned char *) alloc_aligned(count * block_size, PAGE_SIZE); /* Allocate the data for each block. We page align the data. */ if (!data) return -ENOMEM; cache->raw_data = data; cache->raw_blocks = malloc(count * sizeof(*cache->raw_blocks)); if (!cache->raw_blocks) free(cache->raw_data); for (i = 0; i < count; i++) { struct block *b = cache->raw_blocks + i; b->cache = cache; b->data = data + (block_size * i); list_add_tail(&b->list, &cache->free); } return 0; } static void exit_free_list(struct bcache *cache) { free(cache->raw_data); free(cache->raw_blocks); } static struct block *alloc_block(struct bcache *cache) { if (list_empty(&cache->free)) return NULL; return container_of(list_pop(&cache->free), struct block, list); } /*---------------------------------------------------------------- * Clean/dirty list management. * Always use these methods to ensure nr_dirty_ is correct. *--------------------------------------------------------------*/ static void unlink_block(struct block *b) { if (test_flags(b, BF_DIRTY)) b->cache->nr_dirty--; list_del(&b->list); } static void link_block(struct block *b) { struct bcache *cache = b->cache; if (test_flags(b, BF_DIRTY)) { list_add_tail(&b->list, &cache->dirty); cache->nr_dirty++; } else list_add_tail(&b->list, &cache->clean); } static void relink(struct block *b) { unlink_block(b); link_block(b); } /*---------------------------------------------------------------- * Low level IO handling * * We cannot have two concurrent writes on the same block. * eg, background writeback, put with dirty, flush? * * To avoid this we introduce some restrictions: * * i) A held block can never be written back. * ii) You cannot get a block until writeback has completed. * *--------------------------------------------------------------*/ /* * |b->list| should be valid (either pointing to itself, on one of the other * lists. */ static int issue_low_level(struct block *b, enum dir d) { struct bcache *cache = b->cache; sector_t sb = b->index * cache->block_sectors; sector_t se = sb + cache->block_sectors; set_flags(b, BF_IO_PENDING); return engine_issue(cache->engine, cache->fd, d, sb, se, b->data, b); } static void issue_read(struct block *b) { assert(!test_flags(b, BF_IO_PENDING)); issue_low_level(b, DIR_READ); } static void issue_write(struct block *b) { assert(!test_flags(b, BF_IO_PENDING)); //b.v_->prepare(b.data_, b.index_); issue_low_level(b, DIR_WRITE); } static void complete_io(void *context, int err) { struct block *b = context; struct bcache *cache = b->cache; b->error = err; clear_flags(b, BF_IO_PENDING); cache->nr_io_pending--; /* * b is on the io_pending list, so we don't want to use unlink_block. * Which would incorrectly adjust nr_dirty. */ list_del(&b->list); if (b->error) list_add_tail(&b->list, &cache->errored); else { clear_flags(b, BF_DIRTY); link_block(b); } } static int wait_io(struct bcache *cache) { return engine_wait(cache->engine, NULL, complete_io); } /*---------------------------------------------------------------- * High level IO handling *--------------------------------------------------------------*/ static void wait_all(struct bcache *cache) { while (!list_empty(&cache->io_pending)) wait_io(cache); } static void wait_specific(struct block *b) { while (test_flags(b, BF_IO_PENDING)) wait_io(b->cache); } static unsigned writeback(struct bcache *cache, unsigned count) { unsigned actual = 0; struct block *b, *tmp; list_for_each_entry_safe (b, tmp, &cache->dirty, list) { if (actual == count) break; // We can't writeback anything that's still in use. if (!b->ref_count) { issue_write(b); actual++; } } return actual; } /*---------------------------------------------------------------- * High level allocation *--------------------------------------------------------------*/ static struct block *find_unused_clean_block(struct bcache *cache) { struct block *b; list_for_each_entry (b, &cache->clean, list) { if (!b->ref_count) { unlink_block(b); hash_remove(b); return b; } } return NULL; } static struct block *new_block(struct bcache *cache, block_address index) { struct block *b; b = alloc_block(cache); while (!b && (cache->nr_locked < cache->nr_cache_blocks)) { b = find_unused_clean_block(cache); if (!b) { if (list_empty(&cache->io_pending)) writeback(cache, 16); wait_io(cache); } } if (b) { init_list_head(&b->list); init_list_head(&b->hash); b->flags = 0; b->index = index; b->ref_count = 0; b->error = 0; hash_insert(b); } return b; } /*---------------------------------------------------------------- * Block reference counting *--------------------------------------------------------------*/ struct bcache *bcache_create(int fd, sector_t block_sectors, uint64_t on_disk_blocks, unsigned nr_cache_blocks) { int r; struct bcache *cache; cache = malloc(sizeof(*cache)); if (!cache) return NULL; cache->fd = fd; cache->block_sectors = block_sectors; cache->nr_data_blocks = on_disk_blocks; cache->nr_cache_blocks = nr_cache_blocks; cache->engine = engine_create(nr_cache_blocks < 1024u ? nr_cache_blocks : 1024u); if (!cache->engine) { free(cache); return NULL; } cache->nr_locked = 0; cache->nr_dirty = 0; cache->nr_io_pending = 0; init_list_head(&cache->free); init_list_head(&cache->errored); init_list_head(&cache->dirty); init_list_head(&cache->clean); init_list_head(&cache->io_pending); if (hash_table_init(cache, nr_cache_blocks)) { engine_destroy(cache->engine); free(cache); } cache->read_hits = 0; cache->read_misses = 0; cache->write_zeroes = 0; cache->write_hits = 0; cache->write_misses = 0; cache->prefetches = 0; r = init_free_list(cache, nr_cache_blocks); if (r) { engine_destroy(cache->engine); hash_table_exit(cache); free(cache); } return cache; } #define MD_BLOCK_SIZE 4096ull struct bcache *bcache_simple(const char *path, unsigned nr_cache_blocks) { int r; struct stat info; struct bcache *cache; int fd = open(path, O_DIRECT | O_EXCL | O_RDONLY); uint64_t s; if (fd < 0) { raise_("couldn't open cache file"); return NULL; } r = fstat(fd, &info); if (r < 0) { raise_("couldn't stat cache file"); return NULL; } s = info.st_size; cache = bcache_create(fd, MD_BLOCK_SIZE >> SECTOR_SHIFT, s / MD_BLOCK_SIZE, nr_cache_blocks); if (!cache) close(fd); return cache; } void bcache_destroy(struct bcache *cache) { if (cache->nr_locked) warn("%u blocks are still locked\n", cache->nr_locked); flush_cache(cache); wait_all(cache); exit_free_list(cache); hash_table_exit(cache); engine_destroy(cache->engine); close(cache->fd); free(cache); } // FIXME: we have to return an error code that can be turned into a Scheme // condition. static void check_index(struct bcache *cache, block_address index) { if (index >= cache->nr_data_blocks) raise_("block out of bounds (%llu >= %llu)", (unsigned long long) index, (unsigned long long) cache->nr_data_blocks); } uint64_t get_nr_blocks(struct bcache *cache) { return cache->nr_data_blocks; } uint64_t get_nr_locked(struct bcache *cache) { return cache->nr_locked; } static void zero_block(struct block *b) { b->cache->write_zeroes++; memset(b->data, 0, b->cache->block_sectors << SECTOR_SHIFT); set_flags(b, BF_DIRTY); } static void hit(struct block *b, unsigned flags) { struct bcache *cache = b->cache; if (flags & (GF_ZERO | GF_DIRTY)) cache->write_hits++; else cache->read_hits++; relink(b); } static void miss(struct bcache *cache, unsigned flags) { if (flags & (GF_ZERO | GF_DIRTY)) cache->write_misses++; else cache->read_misses++; } static struct block *lookup_or_read_block(struct bcache *cache, block_address index, unsigned flags) { struct block *b = hash_lookup(cache, index); if (b) { // FIXME: this is insufficient. We need to also catch a read // lock of a write locked block. Ref count needs to distinguish. if (b->ref_count && (flags & (GF_DIRTY | GF_ZERO))) raise_("concurrent write lock attempt"); if (test_flags(b, BF_IO_PENDING)) { miss(cache, flags); wait_specific(b); } else hit(b, flags); unlink_block(b); if (flags & GF_ZERO) zero_block(b); } else { miss(cache, flags); b = new_block(cache, index); if (b) { if (flags & GF_ZERO) zero_block(b); else { issue_read(b); wait_specific(b); // we know the block is clean and unerrored. unlink_block(b); } } } if (b && !b->error) { if (flags & (GF_DIRTY | GF_ZERO)) set_flags(b, BF_DIRTY); link_block(b); return b; } return NULL; } struct block *get_block(struct bcache *cache, block_address index, unsigned flags) { check_index(cache, index); struct block *b = lookup_or_read_block(cache, index, flags); if (b) { if (!b->ref_count) cache->nr_locked++; b->ref_count++; return b; } raise_("couldn't get block"); return NULL; } static void preemptive_writeback(struct bcache *cache) { // FIXME: this ignores those blocks that are in the error state. Track // nr_clean instead? unsigned nr_available = cache->nr_cache_blocks - (cache->nr_dirty - cache->nr_io_pending); if (nr_available < (WRITEBACK_LOW_THRESHOLD_PERCENT * cache->nr_cache_blocks / 100)) writeback(cache, (WRITEBACK_HIGH_THRESHOLD_PERCENT * cache->nr_cache_blocks / 100) - nr_available); } void release_block(struct block *b) { assert(b->ref_count); b->ref_count--; if (!b->ref_count) b->cache->nr_locked--; if (test_flags(b, BF_DIRTY)) preemptive_writeback(b->cache); } int flush_cache(struct bcache *cache) { while (!list_empty(&cache->dirty)) { struct block *b = container_of(list_pop(&cache->dirty), struct block, list); if (b->ref_count || test_flags(b, BF_IO_PENDING)) // The superblock may well be still locked. continue; issue_write(b); } wait_all(cache); return list_empty(&cache->errored) ? 0 : -EIO; } void prefetch_block(struct bcache *cache, block_address index) { check_index(cache, index); struct block *b = hash_lookup(cache, index); if (!b) { cache->prefetches++; b = new_block(cache, index); if (b) issue_read(b); } } //----------------------------------------------------------------