/** fobuf_close
 * @param b: the fobuf structure to finish up with.
 *
 * Flush the buffers, ensure data is on disk and close the file descriptor.
 * The structure and its buffer are freed on every path, including the
 * error paths, so @b must not be used again after this call.
 *
 * Return value: zero on error, non-zero on success.
 */
int fobuf_close(fobuf_t b)
{
	int ret = 1;

	DEBUG("%p", b);

	/* no descriptor attached: nothing to flush or close, just free */
	if ( b->fd == -1 )
		goto noclose;

	if ( !_fobuf_flush(b) )
		ret = 0;

	/* don't error if the output file is a special file
	 * which does not support fsync (eg: a pipe)
	 */
	if ( fsync(b->fd) && errno != EROFS && errno != EINVAL ) {
		ERR("fsync: %s", os_err());
		ret = 0;
	}

	/* report the failing call by its real name (was logged as "fsync") */
	if ( !fd_close(b->fd) ) {
		ERR("fd_close: %s", os_err());
		ret = 0;
	}

noclose:
	/* free(NULL) is a no-op, so no guard is needed */
	free(b->buf);
	free(b);
	return ret;
}
/* aio_submit
 * Queue an asynchronous sendfile for the data currently pending on
 * connection @h. Completion is signalled through the shared eventfd.
 *
 * @t: I/O thread (not used by this function)
 * @h: connection with a non-zero amount of data pending
 *
 * Return value: zero on error, non-zero on success. On success the
 * in-flight counter is incremented; on failure the iocb is handed back
 * to the allocator so nothing is leaked.
 */
static int aio_submit(struct iothread *t, http_conn_t h)
{
	struct iocb *iocb;
	size_t data_len;
	off_t data_off;
	int ret, fd;

	data_len = http_conn_data(h, &fd, &data_off);
	assert(data_len);

	iocb = hgang_alloc(aio_iocbs);
	if ( NULL == iocb )
		return 0;

	io_prep_sendfile(iocb, fd, data_len, data_off, http_conn_socket(h));
	iocb->data = h;
	io_set_eventfd(iocb, efd->fd);

	ret = io_submit(aio_ctx, 1, &iocb);
	if ( ret <= 0 ) {
		/* io_submit reports failure as a negative errno value */
		errno = -ret;
		fprintf(stderr, "io_submit: %s\n", os_err());
		/* the request never entered the queue, so no completion
		 * will ever return this iocb for us
		 */
		hgang_return(aio_iocbs, iocb);
		return 0;
	}

	dprintf("io_submit: sendfile: %zu bytes\n", data_len);
	in_flight++;
	return 1;
}
/* handle_completion
 * Process the result of one finished AIO sendfile request.
 *
 * @t: I/O thread
 * @iocb: the completed request; always handed back to the allocator
 * @h: connection the request was issued for
 * @ret: bytes transferred if positive, negated errno if negative
 *
 * -EAGAIN parks the connection until the socket is writable. Any other
 * non-positive result aborts the connection. A positive result either
 * re-submits for the remaining data or completes the transfer.
 */
static void handle_completion(struct iothread *t, struct iocb *iocb,
				http_conn_t h, int ret)
{
	size_t remaining;

	hgang_return(aio_iocbs, iocb);
	in_flight--;

	if ( ret == -EAGAIN ) {
		dprintf("aio_sendfile: failed EAGAIN\n");
		http_conn_wait_on(t, h, NBIO_WRITE);
		return;
	}

	if ( ret <= 0 ) {
		/* zero bytes sent is treated as fatal, but has no errno */
		if ( ret < 0 ) {
			errno = -ret;
			printf("aio_sendfile: %s\n", os_err());
		}
		http_conn_abort(t, h);
		return;
	}

	remaining = http_conn_data_read(h, ret);
	if ( !remaining ) {
		dprintf("aio_sendfile: done\n");
		/* automatically removes from waitq */
		http_conn_data_complete(t, h);
		return;
	}

	printf("re-submit from completion\n");
	if ( !aio_submit(t, h) )
		http_conn_abort(t, h);
}
static void epoll_pump(struct iothread *t, int mto) { struct epoll_event ev[8]; struct nbio *n; int nfd, i; again: nfd = epoll_wait(t->priv.epoll, ev, sizeof(ev)/sizeof(*ev), mto); if ( nfd < 0 ) { if ( errno == EINTR ) goto again; fprintf(stderr, "epoll_wait: %s\n", os_err()); return; } for(i=0; i < nfd; i++) { n = ev[i].data.ptr; n->flags = 0; if ( ev[i].events & (EPOLLIN|EPOLLHUP) ) n->flags |= NBIO_READ; if ( ev[i].events & EPOLLOUT ) n->flags |= NBIO_WRITE; if ( ev[i].events & EPOLLERR ) n->flags |= NBIO_ERROR; list_move_tail(&n->list, &t->active); } }
static int write_prep(struct _cola *c, unsigned int lvlno, struct buf *buf) { cola_key_t nr_ent, ofs; size_t sz; nr_ent = (1 << lvlno); ofs = nr_ent - 1; ofs *= sizeof(*buf->ptr); ofs += sizeof(struct cola_hdr); sz = nr_ent * sizeof(*buf->ptr); if ( lvlno > c->c_maplvls ) { buf->ptr = malloc(nr_ent); if ( NULL == buf->ptr ) { fprintf(stderr, "%s: malloc: %s\n", cmd, os_err()); return 0; } buf->heap = 1; }else{ buf->ptr = (struct cola_elem *)(c->c_map + ofs); buf->heap = 0; } buf->nelem = nr_ent; return 1; }
/* os_sigpipe_ignore
 * Set the disposition of SIGPIPE to SIG_IGN for this process.
 *
 * Return value: zero on error (a message is printed to stderr),
 * non-zero on success.
 */
int os_sigpipe_ignore(void)
{
	void (*prev)(int);

	prev = signal(SIGPIPE, SIG_IGN);
	if ( prev != SIG_ERR )
		return 1;

	fprintf(stderr, "signal: %s\n", os_err());
	return 0;
}
static int size_up_chunks(struct _gidx_wr *wr) { unsigned int i, c, num_chunks; for(i = num_chunks = 0; i < wr->wr_num_fields; i++) { struct wr_field *f; f = wr->wr_field + i; f->f_num_chunks = (*f->f_type->t_wr_num_chunks)(f->f_priv); f->f_sys_chunks = sys_num_chunks(f); DEBUG("%s: %s: %u system chunks + %u chunks requested", f->f_type->t_name, f->f_name, f->f_sys_chunks, f->f_num_chunks); f->f_num_chunks += f->f_sys_chunks; num_chunks += f->f_num_chunks; } wr->wr_chunk = calloc(num_chunks, sizeof(*wr->wr_chunk)); if ( NULL == wr->wr_chunk ) { ERR("calloc: %s", os_err()); return 0; } wr->wr_tot_chunks = num_chunks; for(i = c = 0; i < wr->wr_num_fields; i++) { struct wr_field *f; struct wr_chunk *ch; unsigned int j, num; f = wr->wr_field + i; f->f_chunk = wr->wr_chunk + c; setup_sys_chunks(wr, f, f->f_chunk); c += sys_num_chunks(f); num = (*f->f_type->t_wr_num_chunks)(f->f_priv); for(j = 0; j < num; j++, c++) { ch = wr->wr_chunk + c; ch->c_field = f; ch->c_blkid = j; if ( f->f_type->t_wr_chunk_format ) ch->c_format = (*f->f_type->t_wr_chunk_format) (f->f_priv, j); else ch->c_format = GIDX_CHUNK_USER; ch->c_len = (*f->f_type->t_wr_chunk_size) (f->f_priv, ch->c_blkid); } } assert(c == wr->wr_tot_chunks); return 1; }
/* nbio_eventfd_new
 * Create a non-blocking, close-on-exec eventfd wrapped in an nbio.
 *
 * @initval: initial counter value passed to eventfd()
 * @cb: callback stored for later invocation by the read handler
 * @priv: opaque pointer stored alongside @cb
 *
 * Return value: the embedded nbio on success, NULL on error (a message
 * is printed to stderr).
 */
struct nbio *nbio_eventfd_new(eventfd_t initval, eventfd_cb_t cb, void *priv)
{
	struct nb_efd *e;
	int fd;

	e = calloc(1, sizeof(*e));
	if ( e == NULL ) {
		fprintf(stderr, "nbio_eventfd_new: %s\n", os_err());
		return NULL;
	}

	fd = eventfd(initval, EFD_NONBLOCK|EFD_CLOEXEC);
	if ( fd < 0 ) {
		fprintf(stderr, "eventfd: %s\n", os_err());
		free(e);
		return NULL;
	}

	e->e_nbio.fd = fd;
	e->e_nbio.ops = &ops;
	e->e_cb = cb;
	e->e_priv = priv;
	return &e->e_nbio;
}
/* efd_read
 * Read handler for an eventfd nbio: fetch the counter and pass it to
 * the registered callback. EAGAIN marks the nbio inactive for reading;
 * any other read error is only reported on stderr.
 */
static void efd_read(struct iothread *t, struct nbio *n)
{
	struct nb_efd *e = (struct nb_efd *)n;
	eventfd_t count;

	if ( !eventfd_read(e->e_nbio.fd, &count) ) {
		e->e_cb(t, e->e_priv, count);
		return;
	}

	if ( errno != EAGAIN ) {
		fprintf(stderr, "eventfd_read: %s\n", os_err());
		return;
	}

	/* counter is zero: nothing more to read for now */
	nbio_inactive(t, &e->e_nbio, NBIO_READ);
}
/* node_oid_id_insert
 * Append @oid_id to @node: take a fresh slot from the node's oid
 * allocator, store the id in it and bump the node's oid count.
 *
 * Return value: zero on error, non-zero on success.
 */
static int node_oid_id_insert(struct u64_val *node, gidx_oid_t oid_id)
{
	gidx_oid_t *slot = hgang_alloc(node->n_oids);

	if ( slot != NULL ) {
		*slot = oid_id;
		node->n_num_oid++;
		DEBUG("append oid_id=%u to node val=%"PRIu64,
			oid_id, node->n_val);
		return 1;
	}

	ERR("hgang_alloc: %s", os_err());
	return 0;
}
static int io_async_sendfile_init(struct iothread *t) { memset(&aio_ctx, 0, sizeof(aio_ctx)); if ( io_queue_init(AIO_QUEUE_SIZE, &aio_ctx) ) { fprintf(stderr, "io_queue_init: %s\n", os_err()); return 0; } aio_iocbs = hgang_new(sizeof(struct iocb), 0); if ( NULL == aio_iocbs ) return 0; efd = nbio_eventfd_new(0, aio_event, NULL); if ( NULL == efd ) return 0; nbio_eventfd_add(t, efd); return 1; }
/** _fobuf_flush
 * @param b: the fobuf structure to flush
 *
 * Hand the contents of the userspace buffer to the kernel and reset the
 * buffer to empty. b->buf_len is the space still free in the buffer, so
 * the number of bytes pending is b->buf_sz - b->buf_len. This does not
 * call fsync(), so success says nothing about data being on disk.
 *
 * Failure modes:
 * 0. return 1: success, all buffered data was written to the kernel
 * 1. undefined: any of the failure modes of fd_write()
 * 2. sig11|sig6|file-corruption: b->buf_len > b->buf_sz
 */
static int _fobuf_flush(struct _fobuf *b)
{
	const size_t pending = b->buf_sz - b->buf_len;

	if ( pending != 0 ) {
		if ( !fd_write(b->fd, b->buf, pending) ) {
			ERR("fd_write: %s", os_err());
			return 0;
		}

		/* rewind: the whole buffer is free again */
		b->ptr = b->buf;
		b->buf_len = b->buf_sz;
	}

	return 1;
}
/* aio_event
 * Eventfd callback: reap the AIO completions that are available right
 * now (zero timeout) and pass each one to handle_completion().
 *
 * @t: I/O thread the eventfd is registered with
 * @priv: unused
 * @val: counter value read from the eventfd; only used for debug output
 */
static void aio_event(struct iothread *t, void *priv, eventfd_t val)
{
	struct timespec tmo;
	int ret, i;

	dprintf("aio_event ready, %"PRIu64"/%u in flight\n", val, in_flight);

	/* Nothing outstanding, e.g. because an earlier call already reaped
	 * the completion this wakeup was for. A zero-length VLA is undefined
	 * behaviour, so bail out before declaring one.
	 */
	if ( in_flight == 0 )
		return;

	{
		/* sized once; handle_completion() may change in_flight */
		struct io_event ev[in_flight];

		memset(&tmo, 0, sizeof(tmo));
		ret = io_getevents(aio_ctx, 1, in_flight, ev, &tmo);
		if ( ret < 0 ) {
			/* libaio returns a negative errno value */
			errno = -ret;
			fprintf(stderr, "io_getevents: %s\n", os_err());
			return;
		}

		for(i = 0; i < ret; i++)
			handle_completion(t, ev[i].obj, ev[i].data, ev[i].res);
	}
}
/* remap
 * Resize the file mapping to the header plus 2^(lvlno + 2) - 1 elements
 * and record @lvlno as the mapped level count. The mapping is allowed
 * to move, so pointers previously derived from c->c_map must not be
 * used after a successful call.
 *
 * Return value: zero on error (c is left untouched), non-zero on
 * success.
 */
static int remap(struct _cola *c, unsigned int lvlno)
{
	size_t sz;
	uint8_t *map;

	dprintf(" - remap %u\n", lvlno);

	/* shift in size_t so the element count is not limited to 32 bits */
	sz = ((size_t)1 << (lvlno + 2)) - 1;
	sz *= sizeof(struct cola_elem);
	sz += sizeof(struct cola_hdr);

	map = mremap(c->c_map, c->c_mapsz, sz, MREMAP_MAYMOVE);
	if ( map == MAP_FAILED ) {
		fprintf(stderr, "%s: mremap: %s\n", cmd, os_err());
		return 0;
	}

	/* advise over the new length, not the length of the old mapping */
	madvise(map, sz, MADV_RANDOM);

	c->c_maplvls = lvlno;
	c->c_mapsz = sz;
	c->c_map = map;
	return 1;
}
/* map
 * Create the initial shared mapping of the file: the header plus
 * 2^(INITIAL_LEVELS + 1) - 1 elements, writable only when the cola was
 * opened read-write. The mapping is advised as randomly accessed.
 *
 * Return value: zero on error, non-zero on success.
 */
static int map(struct _cola *c)
{
	const int prot = (c->c_rw) ? (PROT_READ|PROT_WRITE) : (PROT_READ);
	size_t len;
	uint8_t *base;

	len = ((1U << (INITIAL_LEVELS + 1)) - 1) * sizeof(struct cola_elem);
	len += sizeof(struct cola_hdr);

	base = mmap(NULL, len, prot, MAP_SHARED, c->c_fd, 0);
	if ( base == MAP_FAILED ) {
		fprintf(stderr, "%s: mmap: %s\n", cmd, os_err());
		return 0;
	}

	madvise(base, len, MADV_RANDOM);

	c->c_map = base;
	c->c_mapsz = len;
	c->c_maplvls = INITIAL_LEVELS;
	return 1;
}
/* cola_insert
 * Insert @key into the cola.
 *
 * The key starts out as a one-element buffer. It is then carried up
 * the levels like a binary increment: every full level is merged into
 * the carried buffer, and the first empty level receives the result.
 * When the insert starts a brand new level, file space for that level
 * is reserved first and the mapping is extended where applicable.
 *
 * Return value: zero on error, non-zero on success.
 */
int cola_insert(cola_t c, cola_key_t key)
{
	cola_key_t newcnt = c->c_nelem + 1;
	struct buf level;
	unsigned int i;

	dprintf("Insert key %"PRIu64"\n", key);
	if ( !buf_alloc(c, 1, &level) )
		return 0;
	level.ptr[0].key = key;

	/* make sure the level we're about to write to is allocated and,
	 * if required, mapped
	 */
	if ( newcnt == (1ULL << c->c_nxtlvl) ) {
		cola_key_t nr_ent, ofs;
		size_t sz;
		int err;

		nr_ent = (1ULL << c->c_nxtlvl);
		ofs = nr_ent - 1;
		ofs *= sizeof(struct cola_elem);
		ofs += sizeof(struct cola_hdr);
		sz = nr_ent * sizeof(struct cola_elem);

		dprintf("fallocate level %u\n", c->c_nxtlvl);
		/* third argument is a length, not an end offset; the call
		 * returns the error number directly and leaves errno alone.
		 * Failure is reported but deliberately not fatal.
		 */
		err = posix_fallocate(c->c_fd, ofs, sz);
		if ( err )
			fprintf(stderr, "%s: fallocate: %s\n",
				cmd, strerror(err));

		if ( c->c_nxtlvl <= MAP_LEVELS &&
				(1U << c->c_nxtlvl) > c->c_nelem ) {
			if ( !remap(c, c->c_nxtlvl) ) {
				/* don't leak the carried buffer */
				buf_finish(&level);
				return 0;
			}
		}
		c->c_nxtlvl++;
	}

	for(i = 0; newcnt >= (1U << i); i++) {
		if ( c->c_nelem & (1U << i) ) {
			struct buf level2, merged;
			int ret;

			dprintf(" - level %u full\n", i);
			if ( !read_level(c, i, &level2) ) {
				buf_finish(&level);
				return 0;
			}

			/* the merge result is only a temporary if the next
			 * level is full as well or lies outside the map
			 */
			if ( (c->c_nelem & (1U << (i + 1))) ||
					i + 1 >= c->c_maplvls ) {
				ret = buf_alloc(c, (1U << (i + 1)), &merged);
			}else{
				/* landing in next level so write to map */
				ret = write_prep(c, i + 1, &merged);
			}
			if ( !ret ) {
				buf_finish(&level2);
				buf_finish(&level);
				return 0;
			}

			level_merge(&level2, &level, &merged);
			if ( !write_level(c, i, &level2) ) {
				buf_finish(&level2);
				buf_finish(&level);
				buf_finish(&merged);
				return 0;
			}
			buf_finish(&level2);
			buf_finish(&level);
			/* the merged buffer becomes the carried buffer */
			memcpy(&level, &merged, sizeof(level));
		}else{
			dprintf(" - level %u empty\n", i);
			if ( !fractional_cascade(c, i, level.ptr) ||
					!write_level(c, i, &level) ) {
				buf_finish(&level);
				return 0;
			}
			buf_finish(&level);
			break;
		}
	}

	c->c_nelem++;
	dprintf("\n");
#if DEBUG
	cola_dump(c);
	dprintf("\n");
#endif
	return 1;
}
static struct _cola *do_open(const char *fn, int rw, int create, int overwrite) { struct _cola *c = NULL; struct cola_hdr hdr; size_t sz; int eof, oflags; c = calloc(1, sizeof(*c)); if ( NULL == c ) goto out; if ( create ) { oflags = O_RDWR | O_CREAT | ((overwrite) ? O_TRUNC : O_EXCL); }else{ oflags = (rw) ? O_RDWR : O_RDONLY; } c->c_fd = open(fn, oflags, 0644); if ( c->c_fd < 0 ) { fprintf(stderr, "%s: open: %s: %s\n", cmd, fn, os_err()); goto out_free; } if ( create ) { off_t initial; hdr.h_nelem = 0; hdr.h_magic = COLA_MAGIC; hdr.h_vers = COLA_CURRENT_VER; if ( !fd_write(c->c_fd, &hdr, sizeof(hdr)) ) { fprintf(stderr, "%s: write: %s: %s\n", cmd, fn, os_err()); goto out_close; } initial = (1U << (INITIAL_LEVELS + 1)) - 1; initial *= sizeof(struct cola_elem); initial += sizeof(hdr); if ( posix_fallocate(c->c_fd, 0, initial) ) { fprintf(stderr, "%s: %s: fallocate: %s\n", cmd, fn, os_err()); } }else{ sz = sizeof(hdr); if ( !fd_read(c->c_fd, &hdr, &sz, &eof) || sz != sizeof(hdr) ) { fprintf(stderr, "%s: read: %s: %s\n", cmd, fn, os_err2("File truncated")); goto out_close; } if ( hdr.h_magic != COLA_MAGIC ) { fprintf(stderr, "%s: %s: Bad magic\n", cmd, fn); goto out_close; } if ( hdr.h_vers != COLA_CURRENT_VER ) { fprintf(stderr, "%s: %s: Unsupported vers\n", cmd, fn); goto out_close; } c->c_nelem = hdr.h_nelem; } c->c_rw = rw; if ( !map(c) ) goto out_close; c->c_nxtlvl = cfls(c->c_nelem); if ( c->c_nxtlvl < INITIAL_LEVELS ) c->c_nxtlvl = INITIAL_LEVELS + 1; dprintf("next level init to %u\n", c->c_nxtlvl); /* success */ goto out; out_close: close(c->c_fd); out_free: free(c); c = NULL; out: return c; }
/** gidx_wr_new
 * @param schema: array of @num field descriptions
 * @param num: number of entries in @schema
 *
 * Create an index writer. Every schema entry is validated (name
 * present and not too long, value getter set, type id in range) before
 * any per-field state is created. Field names are referenced, not
 * copied, so the schema's name strings must outlive the writer.
 *
 * Return value: the new writer, or NULL on error. On error everything
 * allocated so far, including already initialised fields, is released.
 */
_public gidx_wr_t gidx_wr_new(const struct gidx_schema *schema,
				unsigned int num)
{
	struct _gidx_wr *wr;
	unsigned int i;

	if ( num > GIDX_MAX_INDEX ) {
		ERR("num_fields (%u) greater than maximum permitted (%u)",
			num, GIDX_MAX_INDEX);
		return NULL;
	}

	wr = calloc(1, sizeof(*wr));
	if ( NULL == wr ) {
		ERR("calloc: %s", os_err());
		return NULL;
	}

	wr->wr_field = calloc(num, sizeof(*wr->wr_field));
	if ( NULL == wr->wr_field ) {
		ERR("calloc: %s", os_err());
		goto out;
	}

	for(i = 0; i < num; i++) {
		/* Field name */
		if ( NULL == schema[i].s_name ||
			strlen(schema[i].s_name) > GIDX_MAX_INDEX_NLEN ) {
			/* never hand a NULL pointer to %s */
			ERR("%s: bad field name",
				schema[i].s_name ? schema[i].s_name : "(null)");
			goto out_free_field;
		}
		wr->wr_field[i].f_name = schema[i].s_name;

		/* get_val */
		if ( NULL == schema[i].s_get_val.su_get_str ) {
			ERR("%s: get_val unset", schema[i].s_name);
			goto out_free_field;
		}
		wr->wr_field[i].f_get_val = schema[i].s_get_val;

		/* field type: check this entry, not just the first one,
		 * because the id indexes _gidx_types[] below
		 */
		if ( schema[i].s_type >= GIDX_NR_TYPES ) {
			ERR("%s: bad type (%u)",
				schema[i].s_name, schema[i].s_type);
			goto out_free_field;
		}
		wr->wr_field[i].f_type = _gidx_types[schema[i].s_type];
		wr->wr_field[i].f_type_id = schema[i].s_type;
		wr->wr_field[i].f_mode = schema[i].s_mode;
		wr->wr_num_fields++;
	}

	/* Initialise field */
	for(i = 0; i < num; i++) {
		struct _gidx_type_wr *type;
		gidx_mode_t mode;

		type = wr->wr_field[i].f_type;
		mode = wr->wr_field[i].f_mode;

		DEBUG("field[%u]: %s %s %s",
			i, gidx_mode_str(mode), type->t_name,
			wr->wr_field[i].f_name);

		wr->wr_field[i].f_priv =
			(*type->t_wr_init)(mode, schema[i].s_options);
		if ( NULL == wr->wr_field[i].f_priv ) {
			/* only fields before this one need tearing down */
			num = i;
			goto out_free_priv;
		}
	}

	wr->wr_rec = malloc(wr->wr_num_fields * sizeof(*wr->wr_rec->r_field) +
				sizeof(*wr->wr_rec));
	if ( NULL == wr->wr_rec ) {
		ERR("malloc: %s", os_err());
		goto out_free_priv;
	}
	wr->wr_rec->r_record = NULL;

	return wr;

out_free_priv:
	for(i = 0; i < num; i++) {
		struct _gidx_type_wr *type;
		type = wr->wr_field[i].f_type;
		type->t_wr_fini(wr->wr_field[i].f_priv);
		DEBUG("free field %u", i);
	}
out_free_field:
	free(wr->wr_field);
out:
	free(wr);
	return NULL;
}
/* decompress
 * Read a BMO stream from @in and write the decoded data to @outfd.
 *
 * After the stream header, every block starts with a bwt_t word whose
 * BMO_BLOCK_COMPRESSED bit says whether the payload went through the
 * omega / MTF / BWT pipeline or is stored raw. The remaining bits are
 * passed to bwt_decode() as the index. Block length is the smaller of
 * BMO_BLOCK_SIZE and what is left of the header's length field.
 *
 * Return value: zero when a read or write fails. Non-zero otherwise,
 * which includes a truncated stream, bad magic and wrong version; those
 * are only reported through dprintf.
 */
static int decompress(fibuf_t in, int outfd)
{
	uint8_t buf[BMO_BLOCK_SIZE];
	struct bmo_hdr h;
	int compressed;
	size_t sz;
	bwt_t idx;

	sz = sizeof(h);
	if ( !fibuf_read(in, &h, &sz) ) {
		dprintf("%s: read: %s\n", cmd, os_err());
		return 0;
	}
	if ( fibuf_eof(in) || sz < sizeof(h) ) {
		dprintf("%s: desync on hdr read\n", cmd);
		return 1;
	}
	if ( h.h_magic != BMO_MAGIC ) {
		dprintf("%s: bad magic\n", cmd);
		return 1;
	}
	if ( h.h_vers != BMO_CURRENT_VERS ) {
		dprintf("%s: wrong version\n", cmd);
		return 1;
	}

	/* one iteration per block, until the input is exhausted */
	do {
		sz = sizeof(idx);
		if ( !fibuf_read(in, &idx, &sz) ) {
			dprintf("%s: read: %s\n", cmd, os_err());
			return 0;
		}
		if ( fibuf_eof(in) || sz < sizeof(idx) ) {
			dprintf("%s: desync on bwt read\n", cmd);
			return 1;
		}

		/* split the block word into flag and index */
		compressed = !!(idx & BMO_BLOCK_COMPRESSED);
		idx &= ~BMO_BLOCK_COMPRESSED;

		sz = (h.h_len < BMO_BLOCK_SIZE) ? h.h_len : BMO_BLOCK_SIZE;
		h.h_len -= sz;

		if ( !compressed ) {
			if ( !fibuf_read(in, buf, &sz) )
				return 0;
			dprintf("uncompressed block\n");
		}else{
			omega_decode(in, buf, sz);
			dprintf("omega decode:\n");
			dhex_dump(buf, sz, 0);

			mtf_decode(buf, sz);
			dprintf("MTF decode:\n");
			dhex_dump(buf, sz, 0);

			bwt_decode(buf, sz, idx);
			dprintf("BWT decode:\n");
			dhex_dump(buf, sz, 0);
		}

		if ( !fd_write(outfd, buf, sz) )
			return 0;
	} while ( !fibuf_eof(in) );

	return 1;
}