Merge branch 'v0.7-devel'

Conflicts:
	VERSION
This commit is contained in:
Joe Thornber
2017-03-08 11:05:50 -05:00
112 changed files with 7021 additions and 561 deletions

View File

@@ -1,5 +1,6 @@
#include "block-cache/block_cache.h"
#include <algorithm>
#include <assert.h>
#include <libaio.h>
#include <errno.h>
@@ -44,34 +45,19 @@ namespace {
int
block_cache::init_free_list(unsigned count)
{
size_t len;
block *blocks;
size_t block_size = block_size_ << SECTOR_SHIFT;
void *data;
unsigned i;
/* Allocate the block structures */
len = sizeof(block) * count;
blocks = static_cast<block *>(malloc(len));
if (!blocks)
return -ENOMEM;
blocks_memory_ = blocks;
unsigned char *data = static_cast<unsigned char *>(alloc_aligned(count * block_size, PAGE_SIZE));
/* Allocate the data for each block. We page align the data. */
data = alloc_aligned(count * block_size, PAGE_SIZE);
if (!data) {
free(blocks);
if (!data)
return -ENOMEM;
}
blocks_data_ = data;
for (i = 0; i < count; i++) {
block *b = new (blocks + i) block();
b->data_ = static_cast<unsigned char *>(data) + block_size * i;
list_add(&b->list_, &free_);
for (unsigned i = 0; i < count; i++) {
block &b = (*blocks_memory_)[i];
b.data_ = data + (block_size * i);
free_.push_front(b);
}
return 0;
@@ -82,28 +68,18 @@ block_cache::exit_free_list()
{
if (blocks_data_)
free(blocks_data_);
if (blocks_memory_) {
struct block *blocks = static_cast<block *>(blocks_memory_);
for (unsigned i = 0; i < nr_cache_blocks_; i++)
(blocks + i)->~block();
free(blocks_memory_);
}
}
block_cache::block *
block_cache::__alloc_block()
{
block *b;
if (list_empty(&free_))
if (free_.empty())
return NULL;
b = list_first_entry(&free_, block, list_);
list_del(&b->list_);
block &b = free_.front();
b.unlink();
return b;
return &b;
}
/*----------------------------------------------------------------
@@ -131,15 +107,18 @@ block_cache::complete_io(block &b, int result)
b.clear_flags(BF_IO_PENDING);
nr_io_pending_--;
if (b.error_)
list_move_tail(&b.list_, &errored_);
else {
if (b.error_) {
b.unlink();
errored_.push_back(b);
} else {
if (b.test_flags(BF_DIRTY)) {
b.clear_flags(BF_DIRTY | BF_PREVIOUSLY_DIRTY);
nr_dirty_--;
}
list_move_tail(&b.list_, &clean_);
b.unlink();
clean_.push_back(b);
}
}
@@ -157,7 +136,8 @@ block_cache::issue_low_level(block &b, enum io_iocb_cmd opcode, const char *desc
assert(!b.test_flags(BF_IO_PENDING));
b.set_flags(BF_IO_PENDING);
nr_io_pending_++;
list_move_tail(&b.list_, &io_pending_);
b.unlink();
io_pending_.push_back(b);
b.control_block_.aio_lio_opcode = opcode;
control_blocks[0] = &b.control_block_;
@@ -208,7 +188,7 @@ block_cache::wait_io()
for (i = 0; i < static_cast<unsigned>(r); i++) {
io_event const &e = events_[i];
block *b = container_of(e.obj, block, control_block_);
block *b = base::container_of(e.obj, &block::control_block_);
if (e.res == block_size_ << SECTOR_SHIFT)
complete_io(*b, 0);
@@ -236,19 +216,20 @@ block_cache::wait_io()
* We're using lru lists atm, but I think it would be worth
* experimenting with a multiqueue approach.
*/
list_head *
block_cache::block_list &
block_cache::__categorise(block &b)
{
if (b.error_)
return &errored_;
return errored_;
return b.test_flags(BF_DIRTY) ? &dirty_ : &clean_;
return b.test_flags(BF_DIRTY) ? dirty_ : clean_;
}
void
block_cache::hit(block &b)
{
list_move_tail(&b.list_, __categorise(b));
b.unlink();
__categorise(b).push_back(b);
}
/*----------------------------------------------------------------
@@ -257,7 +238,7 @@ block_cache::hit(block &b)
void
block_cache::wait_all()
{
while (!list_empty(&io_pending_))
while (!io_pending_.empty())
wait_io();
}
@@ -271,10 +252,15 @@ block_cache::wait_specific(block &b)
unsigned
block_cache::writeback(unsigned count)
{
block *b, *tmp;
unsigned actual = 0, dirty_length = 0;
list_for_each_entry_safe (b, tmp, &dirty_, list_) {
// issue_write unlinks b, which invalidates the iteration, so we
// keep track of the next element before removing.
auto it = dirty_.begin();
auto next = it;
while (it != dirty_.end()) {
next = it;
++next;
dirty_length++;
if (actual == count)
@@ -282,69 +268,18 @@ block_cache::writeback(unsigned count)
// The block may be on the dirty list from a prior
// acquisition.
if (b->ref_count_)
if (it->ref_count_)
continue;
issue_write(*b);
issue_write(*it);
actual++;
it = next;
}
return actual;
}
/*----------------------------------------------------------------
* Hash table
*---------------------------------------------------------------*/
/*
* |nr_buckets| must be a power of two.
*/
void
block_cache::hash_init(unsigned nr_buckets)
{
unsigned i;
nr_buckets_ = nr_buckets;
mask_ = nr_buckets - 1;
for (i = 0; i < nr_buckets; i++)
INIT_LIST_HEAD(&buckets_[i]);
}
unsigned
block_cache::hash(uint64_t index)
{
const unsigned BIG_PRIME = 4294967291UL;
return (((unsigned) index) * BIG_PRIME) & mask_;
}
block_cache::block *
block_cache::hash_lookup(block_address index)
{
block *b;
unsigned bucket = hash(index);
list_for_each_entry (b, &buckets_[bucket], hash_list_) {
if (b->index_ == index)
return b;
}
return NULL;
}
void
block_cache::hash_insert(block &b)
{
unsigned bucket = hash(b.index_);
list_move_tail(&b.hash_list_, &buckets_[bucket]);
}
void
block_cache::hash_remove(block &b)
{
list_del_init(&b.hash_list_);
}
/*----------------------------------------------------------------
* High level allocation
*--------------------------------------------------------------*/
@@ -362,18 +297,17 @@ block_cache::setup_control_block(block &b)
cb->u.c.nbytes = block_size_bytes;
}
// FIXME: return a reference
block_cache::block *
block_cache::find_unused_clean_block()
{
struct block *b, *tmp;
list_for_each_entry_safe (b, tmp, &clean_, list_) {
if (b->ref_count_)
for (block &b : clean_) {
if (b.ref_count_)
continue;
hash_remove(*b);
list_del(&b->list_);
return b;
b.unlink_set();
b.unlink();
return &b;
}
return NULL;
@@ -386,8 +320,8 @@ block_cache::new_block(block_address index)
b = __alloc_block();
if (!b) {
if (list_empty(&clean_)) {
if (list_empty(&io_pending_))
if (clean_.empty()) {
if (io_pending_.empty())
writeback(16);
wait_io();
}
@@ -396,8 +330,6 @@ block_cache::new_block(block_address index)
}
if (b) {
INIT_LIST_HEAD(&b->list_);
INIT_LIST_HEAD(&b->hash_list_);
b->bc_ = this;
b->ref_count_ = 0;
@@ -408,7 +340,7 @@ block_cache::new_block(block_address index)
b->index_ = index;
setup_control_block(*b);
hash_insert(*b);
block_set_.insert(*b);
}
return b;
@@ -455,9 +387,6 @@ block_cache::block_cache(int fd, sector_t block_size, uint64_t on_disk_blocks, s
{
int r;
unsigned nr_cache_blocks = calc_nr_cache_blocks(mem, block_size);
unsigned nr_buckets = calc_nr_buckets(nr_cache_blocks);
buckets_.resize(nr_buckets);
fd_ = fd;
block_size_ = block_size;
@@ -473,12 +402,7 @@ block_cache::block_cache(int fd, sector_t block_size, uint64_t on_disk_blocks, s
throw std::runtime_error("io_setup failed");
}
hash_init(nr_buckets);
INIT_LIST_HEAD(&free_);
INIT_LIST_HEAD(&errored_);
INIT_LIST_HEAD(&dirty_);
INIT_LIST_HEAD(&clean_);
INIT_LIST_HEAD(&io_pending_);
blocks_memory_.reset(new std::vector<block>(nr_cache_blocks));
r = init_free_list(nr_cache_blocks);
if (r)
@@ -552,30 +476,31 @@ block_cache::block *
block_cache::lookup_or_read_block(block_address index, unsigned flags,
validator::ptr v)
{
block *b = hash_lookup(index);
auto it = block_set_.find(index, cmp_index());
if (b) {
if (b->test_flags(BF_IO_PENDING)) {
if (it != block_set_.end()) {
if (it->test_flags(BF_IO_PENDING)) {
inc_miss_counter(flags);
wait_specific(*b);
wait_specific(*it);
} else
inc_hit_counter(flags);
if (flags & GF_ZERO)
zero_block(*b);
zero_block(*it);
else {
if (b->v_.get() != v.get()) {
if (b->test_flags(BF_DIRTY))
b->v_->prepare(b->data_, b->index_);
v->check(b->data_, b->index_);
if (it->v_.get() != v.get()) {
if (it->test_flags(BF_DIRTY))
it->v_->prepare(it->data_, it->index_);
v->check(it->data_, it->index_);
}
}
b->v_ = v;
it->v_ = v;
return &(*it);
} else {
inc_miss_counter(flags);
b = new_block(index);
block *b = new_block(index);
if (b) {
if (flags & GF_ZERO)
zero_block(*b);
@@ -587,9 +512,9 @@ block_cache::lookup_or_read_block(block_address index, unsigned flags,
b->v_ = v;
}
}
return (!b || b->error_) ? NULL : b;
return (!b || b->error_) ? NULL : b;
}
}
block_cache::block &
@@ -600,8 +525,11 @@ block_cache::get(block_address index, unsigned flags, validator::ptr v)
block *b = lookup_or_read_block(index, flags, v);
if (b) {
if (b->ref_count_ && flags & (GF_DIRTY | GF_ZERO))
throw std::runtime_error("attempt to write lock block concurrently");
if (b->ref_count_ && (flags & (GF_DIRTY | GF_ZERO))) {
std::ostringstream out;
out << "attempt to write lock block " << index << " concurrently";
throw std::runtime_error(out.str());
}
// FIXME: this gets called even for new blocks
hit(*b);
@@ -620,7 +548,9 @@ block_cache::get(block_address index, unsigned flags, validator::ptr v)
return *b;
}
throw std::runtime_error("couldn't get block");
std::ostringstream out;
out << "couldn't get block " << index;
throw std::runtime_error(out.str());
}
void
@@ -644,7 +574,8 @@ block_cache::release(block_cache::block &b)
if (b.test_flags(BF_DIRTY)) {
if (!b.test_flags(BF_PREVIOUSLY_DIRTY)) {
list_move_tail(&b.list_, &dirty_);
b.unlink();
dirty_.push_back(b);
nr_dirty_++;
b.set_flags(BF_PREVIOUSLY_DIRTY);
}
@@ -661,19 +592,18 @@ block_cache::release(block_cache::block &b)
int
block_cache::flush()
{
block *b, *tmp;
list_for_each_entry_safe (b, tmp, &dirty_, list_) {
if (b->ref_count_ || b->test_flags(BF_IO_PENDING))
while (!dirty_.empty()) {
block &b = dirty_.front();
if (b.ref_count_ || b.test_flags(BF_IO_PENDING))
// The superblock may well be still locked.
continue;
issue_write(*b);
issue_write(b);
}
wait_all();
return list_empty(&errored_) ? 0 : -EIO;
return errored_.empty() ? 0 : -EIO;
}
void
@@ -681,11 +611,12 @@ block_cache::prefetch(block_address index)
{
check_index(index);
block *b = hash_lookup(index);
if (!b) {
auto it = block_set_.find(index, cmp_index());
if (it == block_set_.end()) {
prefetches_++;
b = new_block(index);
block *b = new_block(index);
if (b)
issue_read(*b);
}

View File

@@ -1,17 +1,23 @@
#ifndef BLOCK_CACHE_H
#define BLOCK_CACHE_H
#include "block-cache/list.h"
#include "base/container_of.h"
#include <boost/shared_ptr.hpp>
#include <boost/intrusive/list.hpp>
#include <boost/intrusive/set.hpp>
#include <boost/noncopyable.hpp>
#include <stdexcept>
#include <boost/shared_ptr.hpp>
#include <functional>
#include <iostream>
#include <libaio.h>
#include <memory>
#include <stdexcept>
#include <stdint.h>
#include <stdlib.h>
#include <vector>
#include <iostream>
namespace bi = boost::intrusive;
//----------------------------------------------------------------
@@ -26,12 +32,14 @@ namespace bcache {
virtual ~validator() {}
virtual void check(void const *data, block_address location) const = 0;
virtual bool check_raw(void const *data) const = 0;
virtual void prepare(void *data, block_address location) const = 0;
};
class noop_validator : public validator {
public:
void check(void const *data, block_address location) const {}
bool check_raw(void const *data) const {return true;}
void prepare(void *data, block_address location) const {}
};
@@ -50,7 +58,14 @@ namespace bcache {
public:
block()
: v_() {
INIT_LIST_HEAD(&list_);
}
bool operator <(block const &rhs) const {
return index_ > rhs.index_;
}
bool operator ==(block const &rhs) const {
return index_ == rhs.index_;
}
// Do not give this class a destructor, it won't get
@@ -92,16 +107,25 @@ namespace bcache {
bc_->release(*this);
}
void unlink_set() {
set_hook_.unlink();
}
void unlink() {
list_hook_.unlink();
}
private:
friend class block_cache;
friend class cmp_index;
block_cache *bc_;
uint64_t index_;
void *data_;
list_head list_;
list_head hash_list_;
bi::list_member_hook<bi::link_mode<bi::auto_unlink>> list_hook_;
bi::set_member_hook<bi::link_mode<bi::auto_unlink>> set_hook_;
unsigned ref_count_;
@@ -112,6 +136,54 @@ namespace bcache {
validator::ptr v_;
};
struct cmp_index {
bool operator()(block_address index, block const &b) const {
return index > b.index_;
}
bool operator()(block const &b, block_address index) const {
return b.index_ > index;
}
};
class auto_block {
public:
auto_block()
: b_(0) {
}
auto_block(block &b)
: b_(&b) {
}
~auto_block() {
put();
}
auto_block &operator =(block &b) {
put();
b_ = &b;
return *this;
}
void *get_data() const {
if (b_)
return b_->get_data();
throw std::runtime_error("auto_block not set");
}
private:
void put() {
if (b_) {
b_->put();
b_ = 0;
}
}
block *b_;
};
//--------------------------------
block_cache(int fd, sector_t block_size,
@@ -137,24 +209,24 @@ namespace bcache {
void prefetch(block_address index);
private:
typedef bi::member_hook<block,
bi::list_member_hook<bi::link_mode<bi::auto_unlink>>,
&block::list_hook_> list_hook_option;
typedef bi::list<block, list_hook_option,
bi::constant_time_size<false>> block_list;
int init_free_list(unsigned count);
void exit_free_list();
block *__alloc_block();
void complete_io(block &b, int result);
void issue_low_level(block &b, enum io_iocb_cmd opcode, const char *desc);
void issue_read(block &b);
void issue_write(block &b);
void wait_io();
list_head *__categorise(block &b);
block_list &__categorise(block &b);
void hit(block &b);
void wait_all();
void wait_specific(block &b);
unsigned writeback(unsigned count);
void hash_init(unsigned nr_buckets);
unsigned hash(uint64_t index);
block *hash_lookup(block_address index);
void hash_insert(block &b);
void hash_remove(block &b);
void setup_control_block(block &b);
block *find_unused_clean_block();
block *new_block(block_address index);
@@ -163,6 +235,7 @@ namespace bcache {
unsigned calc_nr_buckets(unsigned nr_blocks);
void zero_block(block &b);
block *lookup_or_read_block(block_address index, unsigned flags, validator::ptr v);
void exit_free_list();
void preemptive_writeback();
void release(block_cache::block &block);
@@ -178,9 +251,8 @@ namespace bcache {
uint64_t nr_data_blocks_;
uint64_t nr_cache_blocks_;
// We can't use auto_ptr or unique_ptr because the memory is allocated with malloc
void *blocks_memory_;
void *blocks_data_;
std::unique_ptr<std::vector<block>> blocks_memory_;
unsigned char *blocks_data_;
io_context_t aio_context_;
std::vector<io_event> events_;
@@ -189,23 +261,23 @@ namespace bcache {
* Blocks on the free list are not initialised, apart from the
* b.data field.
*/
list_head free_;
list_head errored_;
list_head dirty_;
list_head clean_;
block_list free_;
block_list errored_;
block_list dirty_;
block_list clean_;
unsigned nr_locked_;
unsigned nr_dirty_;
unsigned nr_io_pending_;
struct list_head io_pending_;
block_list io_pending_;
/*
* Hash table fields.
*/
unsigned nr_buckets_;
unsigned mask_;
std::vector<list_head> buckets_;
typedef bi::member_hook<block,
bi::set_member_hook<bi::link_mode<bi::auto_unlink>>,
&block::set_hook_> block_option;
typedef bi::set<block, block_option,
bi::constant_time_size<false>> block_set;
block_set block_set_;
// Stats
unsigned read_hits_;

192
block-cache/copier.cc Normal file
View File

@@ -0,0 +1,192 @@
#include "block-cache/copier.h"
#include <stdexcept>
using namespace bcache;
using namespace boost;
using namespace std;
//----------------------------------------------------------------
// Construct a copier that streams blocks from |src| to |dest|.
// |block_size| is in sectors; the pool is carved from |mem| bytes
// into page-aligned chunks of block_size * 512 bytes each.
copier::copier(io_engine &engine,
	       string const &src, string const &dest,
	       sector_t block_size, size_t mem)
	: pool_(block_size * 512, mem, PAGE_SIZE),
	  block_size_(block_size),
	  nr_blocks_(mem / block_size),
	  engine_(engine),
	  // Both files stay open for the copier's lifetime; the
	  // destructor performs the matching close_file calls.
	  src_handle_(engine_.open_file(src, io_engine::M_READ_ONLY)),
	  dest_handle_(engine_.open_file(dest, io_engine::M_READ_WRITE)),
	  genkey_count_(0)
{
}
// Close both file handles.  NOTE(review): runs even if IO is still
// in flight; callers are presumably expected to drain via wait()
// first — confirm against call sites.
copier::~copier()
{
	engine_.close_file(src_handle_);
	engine_.close_file(dest_handle_);
}
// Queue a copy operation.  Only the read half is issued here; the
// write is chained from wait_successful() once the read completes.
// Blocks until a pool buffer is available.
void
copier::issue(copy_op const &op)
{
	void *data;

	while (!(data = pool_.alloc())) {
		wait_();
		// data may still not be present because the wait_ could
		// have completed a read and issued the corresponding
		// write.
	}

	copy_job job(op, data);
	job.op.read_complete = job.op.write_complete = false;
	unsigned key = genkey(); // used as context for the io_engine

	auto r = engine_.issue_io(src_handle_,
				  io_engine::D_READ,
				  to_sector(op.src_b),
				  to_sector(op.src_e),
				  data,
				  key);

	if (r)
		jobs_.insert(make_pair(key, job));
	else
		// Couldn't queue the read; surface the op immediately
		// (both completion flags are false, so op.success()
		// reports failure).
		complete(job);
}
// Number of ops the caller has not yet collected: those still in
// flight plus those finished but not yet returned by wait().
unsigned
copier::nr_pending() const
{
	unsigned in_flight = jobs_.size();
	unsigned awaiting_collection = complete_.size();
	return in_flight + awaiting_collection;
}
// Collect one completed op, blocking until one is available (or
// nothing is pending).  Returns none when there is nothing to
// collect.
boost::optional<copy_op>
copier::wait()
{
	if (complete_.empty())
		wait_();

	return wait_complete();
}
// As wait(), but bounded by |micro| microseconds.  NOTE(review): the
// exact in/out meaning of |micro| is defined by io_engine::wait
// (see aio_engine::wait) — confirm its contract.
boost::optional<copy_op>
copier::wait(unsigned &micro)
{
	if (complete_.empty())
		wait_(micro);

	return wait_complete();
}
// True while any copy job is still owned by the io engine.
bool
copier::pending() const
{
	bool idle = jobs_.empty();
	return !idle;
}
// Pop the oldest completed op, or return none if the completed list
// is empty.
boost::optional<copy_op>
copier::wait_complete()
{
	if (complete_.empty())
		return optional<copy_op>();

	copy_op op = complete_.front();
	complete_.pop_front();
	return optional<copy_op>(op);
}
// Wait until one of our jobs reaches a terminal state, or the
// timeout expires.  wait_successful() returns false when an event
// merely completed the read half of a job (its write gets issued and
// we keep waiting).
void
copier::wait_(unsigned &micro)
{
	optional<io_engine::wait_result> mp;

	if (!pending())
		return;

	bool completed = false;
	while (pending() && !completed) {
		mp = engine_.wait(micro);
		if (mp)
			completed = wait_successful(*mp);

		// |micro| is updated by engine_.wait(); zero means the
		// timeout budget is exhausted.
		if (!micro)
			break;
	}
}
// Block until one of our jobs reaches a terminal state (or nothing
// is pending).
void
copier::wait_()
{
	for (;;) {
		if (!pending())
			break;

		auto event = engine_.wait();
		if (event && wait_successful(*event))
			break;
	}
}
// Handle a single engine event |p| = (success, context/key).
// Returns true iff the corresponding job reached a terminal state
// (failure at any stage, or the write completing), i.e. its op was
// moved onto complete_.
bool
copier::wait_successful(io_engine::wait_result const &p)
{
	auto it = jobs_.find(p.second);
	if (it == jobs_.end())
		throw runtime_error("Internal error. Lost track of copy job.");

	copy_job &j = it->second;

	if (!p.first) {
		// IO was unsuccessful; the op is surfaced with its
		// completion flags showing how far it got.
		complete(j);
		jobs_.erase(it);
		return true;
	}

	// IO was successful
	if (!j.op.read_complete) {
		// The read half just finished; chain the write, reusing
		// the same buffer and key.
		j.op.read_complete = true;
		if (!engine_.issue_io(dest_handle_,
				      io_engine::D_WRITE,
				      to_sector(j.op.dest_b),
				      to_sector(j.op.dest_b + (j.op.src_e - j.op.src_b)),
				      j.data,
				      it->first)) {
			// Couldn't queue the write; fail the op.
			complete(j);
			jobs_.erase(it);
			return true;
		}

		return false;
	} else {
		// The write half finished; the whole copy succeeded.
		j.op.write_complete = true;
		complete(j);
		jobs_.erase(it);
		return true;
	}
}
// Return the job's buffer to the pool and queue its op for
// collection by wait().
void
copier::complete(copy_job const &j)
{
	pool_.free(j.data);
	complete_.push_back(j.op);
}
// Convert a block address to a sector offset (block_size_ is in
// sectors).
sector_t
copier::to_sector(block_address b) const
{
	return b * block_size_;
}
// Hand out a fresh key for each issued job; used both as the io
// context and as the jobs_ map key.
unsigned
copier::genkey()
{
	unsigned key = genkey_count_;
	genkey_count_ = key + 1;
	return key;
}
//----------------------------------------------------------------

106
block-cache/copier.h Normal file
View File

@@ -0,0 +1,106 @@
#ifndef BLOCK_CACHE_COPIER_H
#define BLOCK_CACHE_COPIER_H
#include "block-cache/io_engine.h"
#include "block-cache/mem_pool.h"
#include <string>
#include <list>
#include <map>
//----------------------------------------------------------------
namespace bcache {
using block_address = uint64_t;
// Describes one copy: blocks [src_b, src_e) of the source are
// written starting at dest_b on the destination.
struct copy_op {
	copy_op()
		: src_b(0),
		  src_e(0),
		  dest_b(0),
		  read_complete(false),
		  write_complete(false) {
	}

	copy_op(block_address src_b_,
		block_address src_e_,
		block_address dest_b_)
		: src_b(src_b_),
		  src_e(src_e_),
		  dest_b(dest_b_),
		  read_complete(false),
		  write_complete(false) {
	}

	// Ops compare by destination address.
	bool operator <(copy_op const &rhs) const {
		return dest_b < rhs.dest_b;
	}

	// The copy succeeded only if both halves of the IO completed.
	bool success() const {
		return read_complete && write_complete;
	}

	block_address src_b, src_e;
	block_address dest_b;

	// Progress flags, maintained by the copier.
	bool read_complete;
	bool write_complete;
};
// A copy_op in flight, paired with the pool buffer that carries its
// data between the read and the write.
class copy_job {
public:
	copy_job(copy_op const &op_, void *data_)
		: op(op_), data(data_) {
	}

	copy_op op;
	void *data; // allocated from, and returned to, the copier's mempool
};
// Streams blocks between two files/devices using asynchronous IO,
// overlapping reads and writes.  issue() queues work; wait()
// collects finished copy_ops (check op.success()).
class copier {
public:
	copier(io_engine &engine,
	       std::string const &src, std::string const &dest,
	       sector_t block_size, size_t mem);
	~copier();

	sector_t get_block_size() const {
		return block_size_;
	}

	// Blocks if out of memory.
	void issue(copy_op const &op);

	// Ops issued but not yet collected via wait().
	unsigned nr_pending() const;
	boost::optional<copy_op> wait();
	boost::optional<copy_op> wait(unsigned &micro);

private:
	bool pending() const;
	bool wait_successful(io_engine::wait_result const &p);
	boost::optional<copy_op> wait_complete();
	void wait_(unsigned &micro);
	void wait_();
	void complete(copy_job const &j);
	sector_t to_sector(block_address b) const;
	unsigned genkey();

	mempool pool_;
	sector_t block_size_;
	unsigned nr_blocks_;
	io_engine &engine_;
	io_engine::handle src_handle_;
	io_engine::handle dest_handle_;

	// Source of per-job keys; see genkey().
	unsigned genkey_count_;

	using job_map = std::map<unsigned, copy_job>;
	using op_list = std::list<copy_op>;

	// Jobs owned by the engine, keyed by io context.
	job_map jobs_;

	// Finished ops awaiting collection.
	op_list complete_;
};
}
//----------------------------------------------------------------
#endif

199
block-cache/io_engine.cc Normal file
View File

@@ -0,0 +1,199 @@
#include "base/container_of.h"
#include "block-cache/io_engine.h"
#include <errno.h>
#include <fcntl.h>
#include <sstream>
#include <stdexcept>
#include <sys/stat.h>
#include <sys/types.h>
using namespace bcache;
using namespace boost;
using namespace std;
//----------------------------------------------------------------
// Pre-allocate |nr| control blocks; initially every slot is free.
control_block_set::control_block_set(unsigned nr)
	: cbs_(nr)
{
	for (auto i = 0u; i < nr; i++)
		free_cbs_.insert(i);
}
// Grab a free control block and tag it with |context|.  Returns
// nullptr when the set is exhausted.
iocb *
control_block_set::alloc(unsigned context)
{
	if (free_cbs_.empty())
		return nullptr;

	auto slot = free_cbs_.begin();
	cblock &entry = cbs_[*slot];
	entry.context = context;
	free_cbs_.erase(slot);

	return &entry.cb;
}
// Return |cb| to the free set.  The slot index is recovered from the
// iocb's position within the contiguous cbs_ vector.
void
control_block_set::free(iocb *cb)
{
	cblock *b = base::container_of(cb, &cblock::cb);
	unsigned index = b - &cbs_[0];
	free_cbs_.insert(index);
}
// Recover the context tag that alloc() stored alongside |cb|.
unsigned
control_block_set::context(iocb *cb) const
{
	cblock *b = base::container_of(cb, &cblock::cb);
	return b->context;
}
//----------------------------------------------------------------
// |max_io| bounds the number of concurrently queued IOs; it sizes
// both the kernel aio context and the control block set.
aio_engine::aio_engine(unsigned max_io)
	: aio_context_(0),
	  cbs_(max_io)
{
	int r = io_setup(max_io, &aio_context_);
	if (r < 0)
		throw runtime_error("io_setup failed");
}
aio_engine::~aio_engine()
{
	// descriptors_ clean themselves up via unique_fd; only the aio
	// context needs explicit teardown.
	io_destroy(aio_context_);
}
// Open |path| with O_DIRECT (this engine issues page-aligned IO).
// The raw fd doubles as the returned handle.
// NOTE(review): EXCLUSIVE maps to O_EXCL, which for regular files
// only has an effect combined with O_CREAT (it does prevent opening
// a mounted block device) — confirm this matches the intended
// sharing semantics.
aio_engine::handle
aio_engine::open_file(std::string const &path, mode m, sharing s)
{
	int flags = (m == M_READ_ONLY) ? O_RDONLY : O_RDWR;
	if (s == EXCLUSIVE)
		flags |= O_EXCL;

	int fd = ::open(path.c_str(), O_DIRECT | flags);
	if (fd < 0) {
		ostringstream out;
		out << "unable to open '" << path << "'";
		throw runtime_error(out.str());
	}

	// unique_fd takes ownership of fd for the engine's lifetime.
	descriptors_.push_back(base::unique_fd(fd));

	return static_cast<handle>(fd);
}
// Close the descriptor associated with |h| (erasing the unique_fd
// presumably closes the underlying fd — see base/unique_handle.h).
// Throws if |h| is unknown or already closed.
void
aio_engine::close_file(handle h)
{
	for (auto it = descriptors_.begin(); it != descriptors_.end(); ++it) {
		unsigned it_h = it->get();

		if (it_h == h) {
			descriptors_.erase(it);
			return;
		}
	}

	ostringstream out;
	out << "unknown descriptor (" << h << ")";
	throw runtime_error(out.str());
}
// Queue a single read or write of sectors [b, e) on handle |h|.
// |data| must be page aligned (O_DIRECT requirement).  |context| is
// handed back in the wait_result when the IO completes.  Returns
// false if there are insufficient resources to queue the IO.
bool
aio_engine::issue_io(handle h, dir d, sector_t b, sector_t e, void *data, unsigned context)
{
	if (reinterpret_cast<uint64_t>(data) & (PAGE_SIZE - 1))
		throw runtime_error("Data passed to issue_io must be page aligned\n");

	iocb *cb;

	cb = cbs_.alloc(context);
	if (!cb)
		return false;

	memset(cb, 0, sizeof(*cb));
	cb->aio_fildes = static_cast<int>(h);

	cb->u.c.buf = data;
	cb->u.c.offset = b << SECTOR_SHIFT;
	cb->u.c.nbytes = (e - b) << SECTOR_SHIFT;

	cb->aio_lio_opcode = (d == D_READ) ? IO_CMD_PREAD : IO_CMD_PWRITE;

	int r = io_submit(aio_context_, 1, &cb);
	if (r == 1)
		return true;

	// Fix: submission failed, so return the control block to the
	// set — previously it leaked, permanently shrinking the pool.
	cbs_.free(cb);
	return false;
}
// Block indefinitely for a single completion event.
optional<io_engine::wait_result>
aio_engine::wait()
{
	return wait_(NULL);
}
// Wait for a completion event with a timeout of |microsec|.
// NOTE(review): io_getevents treats the timespec as an input-only
// relative timeout and does not write back the remaining time, so
// |stop| stays equal to |start| and this arithmetic appears to
// always leave |microsec| == 0 — confirm the intended contract for
// |microsec| and adjust (callers use 0 as "budget exhausted").
optional<io_engine::wait_result>
aio_engine::wait(unsigned &microsec)
{
	timespec start = micro_to_ts(microsec);
	timespec stop = start;
	auto r = wait_(&stop);
	microsec = ts_to_micro(stop) - microsec;

	return r;
}
// Wait for a single aio completion.  |ts| is a relative timeout, or
// NULL to block indefinitely.  Returns none on timeout, otherwise
// (success, context) for the completed control block, which is
// returned to the set in all cases.
boost::optional<io_engine::wait_result>
aio_engine::wait_(timespec *ts)
{
	int r;
	struct io_event event;

	memset(&event, 0, sizeof(event));

	r = io_getevents(aio_context_, 1, 1, &event, ts);
	if (r < 0) {
		std::ostringstream out;
		out << "io_getevents failed: " << r;
		throw std::runtime_error(out.str());
	}

	if (r == 0)
		return optional<wait_result>();

	iocb *cb = reinterpret_cast<iocb *>(event.obj);
	unsigned context = cbs_.context(cb);

	// A full transfer (res == nbytes) is success; a short or
	// negative res is failure.  The original had two identical
	// failure branches and an unreachable trailing return — both
	// collapsed here.
	bool success = (event.res == cb->u.c.nbytes);
	cbs_.free(cb);
	return optional<wait_result>(make_pair(success, context));
}
// Split |micro| microseconds into a timespec (seconds + nanoseconds).
struct timespec
aio_engine::micro_to_ts(unsigned micro)
{
	unsigned const MICROS_PER_SEC = 1000000u;

	timespec ts;
	ts.tv_sec = micro / MICROS_PER_SEC;
	ts.tv_nsec = (micro % MICROS_PER_SEC) * 1000;
	return ts;
}
// Collapse a timespec back into whole microseconds.
unsigned
aio_engine::ts_to_micro(timespec const &ts)
{
	unsigned const MICROS_PER_SEC = 1000000;

	unsigned micro = ts.tv_sec * MICROS_PER_SEC;
	micro += ts.tv_nsec / 1000;
	return micro;
}
//----------------------------------------------------------------

117
block-cache/io_engine.h Normal file
View File

@@ -0,0 +1,117 @@
#ifndef BLOCK_CACHE_IO_ENGINE_H
#define BLOCK_CACHE_IO_ENGINE_H
#include "base/unique_handle.h"
#include <boost/optional.hpp>
#include <ctype.h>
#include <set>
#include <string>
#include <libaio.h>
//----------------------------------------------------------------
namespace bcache {
using sector_t = uint64_t;
unsigned const SECTOR_SHIFT = 9;
unsigned const PAGE_SIZE = 4096;
// Virtual base class to aid unit testing
// Virtual base class to aid unit testing
class io_engine {
public:
	enum mode {
		M_READ_ONLY,
		M_READ_WRITE
	};

	enum dir {
		D_READ,
		D_WRITE
	};

	enum sharing {
		EXCLUSIVE,
		SHARED
	};

	io_engine() {}
	virtual ~io_engine() {}

	// Opaque file handle; see implementations for its concrete
	// meaning.
	using handle = unsigned;

	virtual handle open_file(std::string const &path, mode m, sharing s = EXCLUSIVE) = 0;
	virtual void close_file(handle h) = 0;

	// Queue a read/write of sectors [b, e).  |context| is echoed
	// back in the wait_result so completions can be matched to
	// their submissions.
	// returns false if there are insufficient resources to
	// queue the IO
	virtual bool issue_io(handle h, dir d, sector_t b, sector_t e, void *data, unsigned context) = 0;

	// returns (success, context)
	using wait_result = std::pair<bool, unsigned>;
	virtual boost::optional<wait_result> wait() = 0;
	virtual boost::optional<wait_result> wait(unsigned &microsec) = 0;

private:
	// Non-copyable.
	io_engine(io_engine const &) = delete;
	io_engine &operator =(io_engine const &) = delete;
};
//--------------------------------
// Fixed-size pool of iocbs, each carrying a caller-supplied context
// tag so completions can be routed back to their issuer.
class control_block_set {
public:
	control_block_set(unsigned nr);

	// Returns nullptr when all control blocks are in use.
	iocb *alloc(unsigned context);
	void free(iocb *);

	// The context tag stored by alloc() for this iocb.
	unsigned context(iocb *) const;

private:
	struct cblock {
		unsigned context;
		struct iocb cb;
	};

	std::set<unsigned> free_cbs_; // indices into cbs_
	std::vector<cblock> cbs_;
};
//----------------
class aio_engine : public io_engine {
public:
// max_io is the maximum nr of concurrent ios expected
aio_engine(unsigned max_io);
~aio_engine();
using handle = unsigned;
virtual handle open_file(std::string const &path, mode m, sharing s = EXCLUSIVE);
virtual void close_file(handle h);
// Returns false if queueing the io failed
virtual bool issue_io(handle h, dir d, sector_t b, sector_t e, void *data, unsigned context);
virtual boost::optional<wait_result> wait();
virtual boost::optional<wait_result> wait(unsigned &microsec);
private:
static struct timespec micro_to_ts(unsigned micro);
static unsigned ts_to_micro(timespec const &ts);
boost::optional<io_engine::wait_result> wait_(timespec *ts);
std::list<base::unique_fd> descriptors_;
io_context_t aio_context_;
control_block_set cbs_;
aio_engine(io_engine const &) = delete;
aio_engine &operator =(io_engine const &) = delete;
};
}
//----------------------------------------------------------------
#endif

62
block-cache/mem_pool.cc Normal file
View File

@@ -0,0 +1,62 @@
#include "block-cache/mem_pool.h"
#include <sstream>
#include <stdexcept>
#include <stdlib.h>
using namespace bcache;
using namespace boost;
using namespace mempool_detail;
using namespace std;
//----------------------------------------------------------------
// Carve |total_mem| bytes of |alignment|-aligned memory into
// total_mem / block_size blocks and thread them all onto the free
// list.  NOTE(review): individual blocks are aligned only when
// block_size is a multiple of |alignment| — confirm callers
// guarantee this.
mempool::mempool(size_t block_size, size_t total_mem, size_t alignment)
{
	mem_ = alloc_aligned(total_mem, alignment);

	unsigned nr_blocks = total_mem / block_size;
	for (auto i = 0u; i < nr_blocks; i++)
		free(static_cast<unsigned char *>(mem_) + (block_size * i));
}
mempool::~mempool()
{
	// Unhook the intrusive list before releasing the backing
	// allocation its nodes live inside.
	free_.clear();
	::free(mem_);
}
void *
mempool::alloc()
{
if (free_.empty())
return nullptr;
mempool_detail::alloc_block &b = free_.front();
free_.pop_front();
return reinterpret_cast<void *>(&b);
}
// Return |data| to the pool.  The block's own storage is reused to
// hold the intrusive list hook, so no extra bookkeeping is needed.
void
mempool::free(void *data)
{
	mempool_detail::alloc_block *b = reinterpret_cast<mempool_detail::alloc_block *>(data);
	free_.push_front(*b);
}
// posix_memalign wrapper that throws (with a diagnostic) instead of
// returning NULL on failure.
void *
mempool::alloc_aligned(size_t len, size_t alignment)
{
	void *result = NULL;
	int r = posix_memalign(&result, alignment, len);
	if (r) {
		ostringstream out;
		out << "posix_memalign failed: len = " << len << ", alignment = " << alignment << ", r = " << r << "\n";
		throw runtime_error(out.str());
	}

	return result;
}
//----------------------------------------------------------------

46
block-cache/mem_pool.h Normal file
View File

@@ -0,0 +1,46 @@
#ifndef BLOCK_CACHE_MEM_POOL_H
#define BLOCK_CACHE_MEM_POOL_H
#include <boost/intrusive/list.hpp>
#include <boost/optional.hpp>
#include <list>
namespace bi = boost::intrusive;
//----------------------------------------------------------------
namespace bcache {
// FIXME: move to base?
namespace mempool_detail {
	// Empty node type: a free block's own storage is reinterpreted
	// as one of these so it can sit on an intrusive free list.
	struct alloc_block : public bi::list_base_hook<bi::link_mode<bi::normal_link>> {
	};
};
// Fixed-size block allocator backed by one aligned allocation; free
// blocks are threaded onto an intrusive list through their own
// storage.
class mempool {
public:
	// alignment must be a power of 2
	mempool(size_t block_size, size_t total_mem, size_t alignment = 8);
	~mempool();

	// Returns nullptr when exhausted; never throws after
	// construction.
	void *alloc();
	void free(void *data);

private:
	static void *alloc_aligned(size_t len, size_t alignment);

	using block_list = bi::list<mempool_detail::alloc_block>;

	void *mem_;        // the single backing allocation
	block_list free_;  // free blocks, linked in place

	//----------------
	// Non-copyable.
	mempool(mempool const &) = delete;
	mempool &operator =(mempool const &) = delete;
};
}
//----------------------------------------------------------------
#endif