// call with *e locked static store_page *_allocate_page(store_engine *e, unsigned int bucket, unsigned int free_bucket) { assert(!e->page_buckets[bucket] || e->page_buckets[bucket]->allocated == e->page_size); store_page *tmp = NULL; // if a specific free bucket was requested, check there first if (free_bucket != 0 && e->free_page_buckets[free_bucket] != NULL) { assert(e->page_free > 0); tmp = e->free_page_buckets[free_bucket]; e->free_page_buckets[free_bucket] = tmp->next; } // failing that, try the global list. if (tmp == NULL && e->page_freelist != NULL) { tmp = e->page_freelist; e->page_freelist = tmp->next; } E_DEBUG("EXTSTORE: allocating new page\n"); // page_freelist can be empty if the only free pages are specialized and // we didn't just request one. if (e->page_free > 0 && tmp != NULL) { tmp->next = e->page_buckets[bucket]; e->page_buckets[bucket] = tmp; tmp->active = true; tmp->free = false; tmp->closed = false; tmp->version = _next_version(e); tmp->bucket = bucket; e->page_free--; STAT_INCR(e, page_allocs, 1); } else { extstore_run_maint(e); } if (tmp) E_DEBUG("EXTSTORE: got page %u\n", tmp->id); return tmp; }
/* engine note delete function: takes engine, page id, size? * note that an item in this page is no longer valid */ int extstore_delete(void *ptr, unsigned int page_id, uint64_t page_version, unsigned int count, unsigned int bytes) { store_engine *e = (store_engine *)ptr; // FIXME: validate page_id in bounds store_page *p = &e->pages[page_id]; int ret = 0; pthread_mutex_lock(&p->mutex); if (!p->closed && p->version == page_version) { if (p->bytes_used >= bytes) { p->bytes_used -= bytes; } else { p->bytes_used = 0; } if (p->obj_count >= count) { p->obj_count -= count; } else { p->obj_count = 0; // caller has bad accounting? } STAT_L(e); e->stats.bytes_used -= bytes; e->stats.objects_used -= count; STAT_UL(e); if (p->obj_count == 0) { extstore_run_maint(e); } } else { ret = -1; } pthread_mutex_unlock(&p->mutex); return ret; }
/* allows a compactor to say "we're done with this page, kill it. */ void extstore_close_page(void *ptr, unsigned int page_id, uint64_t page_version) { store_engine *e = (store_engine *)ptr; store_page *p = &e->pages[page_id]; pthread_mutex_lock(&p->mutex); if (!p->closed && p->version == page_version) { p->closed = true; extstore_run_maint(e); } pthread_mutex_unlock(&p->mutex); }
// TODO: hoist the storage bits from lru_maintainer_thread in here.
// would be nice if they could avoid hammering the same locks though?
// I guess it's only COLD. that's probably fine.
//
// Compaction worker: repeatedly picks a victim page (storage_compact_check),
// reads it back one wbuf-sized chunk at a time through a single reusable
// obj_io (wrap.io), rewrites live items, then closes the page. Sleep time
// backs off while idle and decays while actively compacting.
static void *storage_compact_thread(void *arg) {
    void *storage = arg;
    useconds_t to_sleep = MAX_STORAGE_COMPACT_SLEEP;
    bool compacting = false;        // true while a victim page is in flight
    uint64_t page_version = 0;      // version of the page being compacted
    uint64_t page_size = 0;
    uint64_t page_offset = 0;       // read cursor within the victim page
    uint32_t page_id = 0;
    bool drop_unread = false;       // policy flag from storage_compact_check
    char *readback_buf = NULL;      // one wbuf-sized staging buffer
    struct storage_compact_wrap wrap;
    logger *l = logger_create();
    if (l == NULL) {
        fprintf(stderr, "Failed to allocate logger for storage compaction thread\n");
        abort();
    }
    readback_buf = malloc(settings.ext_wbuf_size);
    if (readback_buf == NULL) {
        fprintf(stderr, "Failed to allocate readback buffer for storage compaction thread\n");
        abort();
    }
    // wrap carries the async IO plus the submitted/done/miss handshake
    // shared with _storage_compact_cb; wrap.lock guards that handshake.
    pthread_mutex_init(&wrap.lock, NULL);
    wrap.done = false;
    wrap.submitted = false;
    wrap.io.data = &wrap;
    wrap.io.buf = (void *)readback_buf;
    wrap.io.len = settings.ext_wbuf_size;
    wrap.io.mode = OBJ_IO_READ;
    wrap.io.cb = _storage_compact_cb;
    pthread_mutex_lock(&storage_compact_plock);
    while (1) {
        // plock is dropped while sleeping so others can pause/resume us.
        pthread_mutex_unlock(&storage_compact_plock);
        if (to_sleep) {
            extstore_run_maint(storage);
            usleep(to_sleep);
        }
        pthread_mutex_lock(&storage_compact_plock);
        // Not mid-page: ask the engine whether a page is worth compacting.
        if (!compacting && storage_compact_check(storage, l, &page_id, &page_version, &page_size, &drop_unread)) {
            page_offset = 0;
            compacting = true;
            LOGGER_LOG(l, LOG_SYSEVENTS, LOGGER_COMPACT_START, NULL, page_id, page_version);
        }
        if (compacting) {
            pthread_mutex_lock(&wrap.lock);
            // State machine, one transition per loop pass:
            // 1) nothing in flight and page not exhausted -> submit a read.
            if (page_offset < page_size && !wrap.done && !wrap.submitted) {
                wrap.io.page_version = page_version;
                wrap.io.page_id = page_id;
                wrap.io.offset = page_offset;
                // FIXME: should be smarter about io->next (unlink at use?)
                wrap.io.next = NULL;
                wrap.submitted = true;
                wrap.miss = false;
                extstore_submit(storage, &wrap.io);
            // 2) callback reported a miss (page vanished) -> abort this page.
            } else if (wrap.miss) {
                LOGGER_LOG(l, LOG_SYSEVENTS, LOGGER_COMPACT_ABORT, NULL, page_id);
                wrap.done = false;
                wrap.submitted = false;
                compacting = false;
            // 3) read completed -> rewrite live items, advance the cursor.
            } else if (wrap.submitted && wrap.done) {
                LOGGER_LOG(l, LOG_SYSEVENTS, LOGGER_COMPACT_READ_START, NULL, page_id, page_offset);
                storage_compact_readback(storage, l, drop_unread, readback_buf, page_id, page_version, settings.ext_wbuf_size);
                page_offset += settings.ext_wbuf_size;
                wrap.done = false;
                wrap.submitted = false;
            // 4) cursor past the end -> page fully compacted; close it.
            } else if (page_offset >= page_size) {
                compacting = false;
                wrap.done = false;
                wrap.submitted = false;
                extstore_close_page(storage, page_id, page_version);
                LOGGER_LOG(l, LOG_SYSEVENTS, LOGGER_COMPACT_END, NULL, page_id);
            }
            pthread_mutex_unlock(&wrap.lock);
            // Busy: ramp the sleep down to keep compaction moving.
            if (to_sleep > MIN_STORAGE_COMPACT_SLEEP)
                to_sleep /= 2;
        } else {
            // Idle: back the sleep off toward the maximum.
            if (to_sleep < MAX_STORAGE_COMPACT_SLEEP)
                to_sleep += MIN_STORAGE_COMPACT_SLEEP;
        }
    }
    // NOTE(review): unreachable — the while(1) above never breaks, so the
    // buffer is never freed and the thread never returns. Presumably fine
    // for a process-lifetime thread, but worth confirming.
    free(readback_buf);
    return NULL;
}
// TODO: #define's for DEFAULT_BUCKET, FREE_VERSION, etc void *extstore_init(struct extstore_conf_file *fh, struct extstore_conf *cf, enum extstore_res *res) { int i; struct extstore_conf_file *f = NULL; pthread_t thread; if (cf->page_size % cf->wbuf_size != 0) { *res = EXTSTORE_INIT_BAD_WBUF_SIZE; return NULL; } // Should ensure at least one write buffer per potential page if (cf->page_buckets > cf->wbuf_count) { *res = EXTSTORE_INIT_NEED_MORE_WBUF; return NULL; } if (cf->page_buckets < 1) { *res = EXTSTORE_INIT_NEED_MORE_BUCKETS; return NULL; } // TODO: More intelligence around alignment of flash erasure block sizes if (cf->page_size % (1024 * 1024 * 2) != 0 || cf->wbuf_size % (1024 * 1024 * 2) != 0) { *res = EXTSTORE_INIT_PAGE_WBUF_ALIGNMENT; return NULL; } store_engine *e = calloc(1, sizeof(store_engine)); if (e == NULL) { *res = EXTSTORE_INIT_OOM; return NULL; } e->page_size = cf->page_size; for (f = fh; f != NULL; f = f->next) { f->fd = open(f->file, O_RDWR | O_CREAT | O_TRUNC, 0644); if (f->fd < 0) { *res = EXTSTORE_INIT_OPEN_FAIL; #ifdef EXTSTORE_DEBUG perror("open"); #endif free(e); return NULL; } e->page_count += f->page_count; f->offset = 0; } e->pages = calloc(e->page_count, sizeof(store_page)); if (e->pages == NULL) { *res = EXTSTORE_INIT_OOM; // FIXME: loop-close. make error label free(e); return NULL; } // interleave the pages between devices f = NULL; // start at the first device. 
for (i = 0; i < e->page_count; i++) { // find next device with available pages while (1) { // restart the loop if (f == NULL || f->next == NULL) { f = fh; } else { f = f->next; } if (f->page_count) { f->page_count--; break; } } pthread_mutex_init(&e->pages[i].mutex, NULL); e->pages[i].id = i; e->pages[i].fd = f->fd; e->pages[i].free_bucket = f->free_bucket; e->pages[i].offset = f->offset; e->pages[i].free = true; f->offset += e->page_size; } // free page buckets allows the app to organize devices by use case e->free_page_buckets = calloc(cf->page_buckets, sizeof(store_page *)); e->page_bucketcount = cf->page_buckets; for (i = e->page_count-1; i > 0; i--) { e->page_free++; if (e->pages[i].free_bucket == 0) { e->pages[i].next = e->page_freelist; e->page_freelist = &e->pages[i]; } else { int fb = e->pages[i].free_bucket; e->pages[i].next = e->free_page_buckets[fb]; e->free_page_buckets[fb] = &e->pages[i]; } } // 0 is magic "page is freed" version e->version = 1; // scratch data for stats. TODO: malloc failure handle e->stats.page_data = calloc(e->page_count, sizeof(struct extstore_page_data)); e->stats.page_count = e->page_count; e->stats.page_size = e->page_size; // page buckets lazily have pages assigned into them e->page_buckets = calloc(cf->page_buckets, sizeof(store_page *)); e->page_bucketcount = cf->page_buckets; // allocate write buffers // also IO's to use for shipping to IO thread for (i = 0; i < cf->wbuf_count; i++) { _store_wbuf *w = wbuf_new(cf->wbuf_size); obj_io *io = calloc(1, sizeof(obj_io)); /* TODO: on error, loop again and free stack. 
*/ w->next = e->wbuf_stack; e->wbuf_stack = w; io->next = e->io_stack; e->io_stack = io; } pthread_mutex_init(&e->mutex, NULL); pthread_mutex_init(&e->stats_mutex, NULL); e->io_depth = cf->io_depth; // spawn threads e->io_threads = calloc(cf->io_threadcount, sizeof(store_io_thread)); for (i = 0; i < cf->io_threadcount; i++) { pthread_mutex_init(&e->io_threads[i].mutex, NULL); pthread_cond_init(&e->io_threads[i].cond, NULL); e->io_threads[i].e = e; // FIXME: error handling pthread_create(&thread, NULL, extstore_io_thread, &e->io_threads[i]); } e->io_threadcount = cf->io_threadcount; e->maint_thread = calloc(1, sizeof(store_maint_thread)); e->maint_thread->e = e; // FIXME: error handling pthread_mutex_init(&e->maint_thread->mutex, NULL); pthread_cond_init(&e->maint_thread->cond, NULL); pthread_create(&thread, NULL, extstore_maint_thread, e->maint_thread); extstore_run_maint(e); return (void *)e; }