/*
 * Attempt a non-blocking send of buf[0..len); `more` > 0 hints that more
 * data follows (sets MSG_MORE-style flag when available).
 *
 * Returns:
 * - NULL on full write
 * - MOG_WR_ERROR on error (and sets errno)
 * - address of a new mog_wbuf holding the unbuffered remainder on
 *   partial write / would-block
 */
void * mog_trysend(int fd, void *buf, size_t len, off_t more)
{
	if (MOG_MSG_MORE) {
		int flags = more > 0 ? MOG_MSG_MORE : 0;

		while (len > 0) {
			ssize_t w = send(fd, buf, len, flags);

			if (w == (ssize_t)len)
				return NULL; /* all done */

			if (w < 0) {
				switch (errno) {
				/*
				 * BUGFIX: was "case_EAGAIN:" — a plain goto
				 * label, not a case.  EAGAIN then matched no
				 * case and fell through to MOG_WR_ERROR,
				 * violating the partial-write contract above.
				 */
				case EAGAIN:
					return wbuf_new(buf, len);
				case EINTR:
					continue;
				}
				return MOG_WR_ERROR;
			} else {
				/* short write: advance past what was sent */
				buf = (char *)buf + w;
				len -= w;
			}
		}
		return NULL;
	} else {
		/* no MSG_MORE support: delegate to the writev-based path */
		struct iovec iov;

		iov.iov_base = buf;
		iov.iov_len = len;

		return mog_trywritev(fd, &iov, 1);
	}
}
// TODO: #define's for DEFAULT_BUCKET, FREE_VERSION, etc
/*
 * Initialize the external-storage engine.
 *
 * fh:  linked list of backing-file configs (one per device); each entry's
 *      fd/offset fields are filled in here.
 * cf:  global engine configuration (page/wbuf sizing, bucket and thread
 *      counts).
 * res: out-parameter set to an extstore_res code on failure.
 *
 * Returns an opaque engine handle (store_engine *) on success, or NULL on
 * failure with *res set.  On some failure paths already-opened fds are not
 * closed (see FIXME below).
 */
void *extstore_init(struct extstore_conf_file *fh, struct extstore_conf *cf, enum extstore_res *res) {
    int i;
    struct extstore_conf_file *f = NULL;
    pthread_t thread;

    /* wbufs must tile a page exactly so page fills have no remainder */
    if (cf->page_size % cf->wbuf_size != 0) {
        *res = EXTSTORE_INIT_BAD_WBUF_SIZE;
        return NULL;
    }
    // Should ensure at least one write buffer per potential page
    if (cf->page_buckets > cf->wbuf_count) {
        *res = EXTSTORE_INIT_NEED_MORE_WBUF;
        return NULL;
    }
    if (cf->page_buckets < 1) {
        *res = EXTSTORE_INIT_NEED_MORE_BUCKETS;
        return NULL;
    }

    // TODO: More intelligence around alignment of flash erasure block sizes
    /* both sizes must be multiples of 2MB */
    if (cf->page_size % (1024 * 1024 * 2) != 0 ||
        cf->wbuf_size % (1024 * 1024 * 2) != 0) {
        *res = EXTSTORE_INIT_PAGE_WBUF_ALIGNMENT;
        return NULL;
    }

    store_engine *e = calloc(1, sizeof(store_engine));
    if (e == NULL) {
        *res = EXTSTORE_INIT_OOM;
        return NULL;
    }

    e->page_size = cf->page_size;
    /* open every backing file; page_count accumulates across devices */
    for (f = fh; f != NULL; f = f->next) {
        f->fd = open(f->file, O_RDWR | O_CREAT | O_TRUNC, 0644);
        if (f->fd < 0) {
            *res = EXTSTORE_INIT_OPEN_FAIL;
#ifdef EXTSTORE_DEBUG
            perror("open");
#endif
            /* NOTE(review): fds opened in earlier iterations leak here */
            free(e);
            return NULL;
        }
        e->page_count += f->page_count;
        f->offset = 0;
    }

    e->pages = calloc(e->page_count, sizeof(store_page));
    if (e->pages == NULL) {
        *res = EXTSTORE_INIT_OOM;
        // FIXME: loop-close. make error label
        free(e);
        return NULL;
    }

    // interleave the pages between devices
    f = NULL; // start at the first device.
    for (i = 0; i < e->page_count; i++) {
        // find next device with available pages
        while (1) {
            // restart the loop
            if (f == NULL || f->next == NULL) {
                f = fh;
            } else {
                f = f->next;
            }
            /* f->page_count doubles as "pages left to assign" and is
             * consumed here; loop terminates because the sum of all
             * f->page_count equals e->page_count */
            if (f->page_count) {
                f->page_count--;
                break;
            }
        }
        pthread_mutex_init(&e->pages[i].mutex, NULL);
        e->pages[i].id = i;
        e->pages[i].fd = f->fd;
        e->pages[i].free_bucket = f->free_bucket;
        e->pages[i].offset = f->offset;
        e->pages[i].free = true;
        f->offset += e->page_size;
    }

    // free page buckets allows the app to organize devices by use case
    /* NOTE(review): calloc result unchecked — dereferenced below.
     * Same applies to stats.page_data, page_buckets, io_threads,
     * maint_thread, wbuf_new and obj_io allocations further down
     * (some already carry FIXME/TODO markers). */
    e->free_page_buckets = calloc(cf->page_buckets, sizeof(store_page *));
    e->page_bucketcount = cf->page_buckets;

    /* push pages onto the global freelist (bucket 0) or their
     * device-assigned free bucket, in reverse so page 0 ends up at the
     * head of whichever list it lands on.
     * NOTE(review): `i > 0` means pages[0] is marked free=true above but
     * never placed on any freelist, and page_free undercounts by one —
     * off-by-one or deliberate reservation of page 0? TODO confirm
     * against upstream. */
    for (i = e->page_count-1; i > 0; i--) {
        e->page_free++;
        if (e->pages[i].free_bucket == 0) {
            e->pages[i].next = e->page_freelist;
            e->page_freelist = &e->pages[i];
        } else {
            int fb = e->pages[i].free_bucket;
            e->pages[i].next = e->free_page_buckets[fb];
            e->free_page_buckets[fb] = &e->pages[i];
        }
    }

    // 0 is magic "page is freed" version
    e->version = 1;

    // scratch data for stats. TODO: malloc failure handle
    e->stats.page_data = calloc(e->page_count, sizeof(struct extstore_page_data));
    e->stats.page_count = e->page_count;
    e->stats.page_size = e->page_size;

    // page buckets lazily have pages assigned into them
    /* NOTE(review): page_bucketcount was already set identically above —
     * redundant but harmless */
    e->page_buckets = calloc(cf->page_buckets, sizeof(store_page *));
    e->page_bucketcount = cf->page_buckets;

    // allocate write buffers
    // also IO's to use for shipping to IO thread
    for (i = 0; i < cf->wbuf_count; i++) {
        _store_wbuf *w = wbuf_new(cf->wbuf_size);
        obj_io *io = calloc(1, sizeof(obj_io));
        /* TODO: on error, loop again and free stack. */
        w->next = e->wbuf_stack;
        e->wbuf_stack = w;
        io->next = e->io_stack;
        e->io_stack = io;
    }

    pthread_mutex_init(&e->mutex, NULL);
    pthread_mutex_init(&e->stats_mutex, NULL);
    e->io_depth = cf->io_depth;

    // spawn threads
    e->io_threads = calloc(cf->io_threadcount, sizeof(store_io_thread));
    for (i = 0; i < cf->io_threadcount; i++) {
        pthread_mutex_init(&e->io_threads[i].mutex, NULL);
        pthread_cond_init(&e->io_threads[i].cond, NULL);
        e->io_threads[i].e = e;
        // FIXME: error handling
        /* thread handles are discarded; threads are never joined here */
        pthread_create(&thread, NULL, extstore_io_thread, &e->io_threads[i]);
    }
    e->io_threadcount = cf->io_threadcount;

    e->maint_thread = calloc(1, sizeof(store_maint_thread));
    e->maint_thread->e = e;
    // FIXME: error handling
    pthread_mutex_init(&e->maint_thread->mutex, NULL);
    pthread_cond_init(&e->maint_thread->cond, NULL);
    pthread_create(&thread, NULL, extstore_maint_thread, e->maint_thread);

    /* kick the maintenance thread once so initial page/bucket state is
     * processed before the first caller arrives */
    extstore_run_maint(e);

    return (void *)e;
}