/*
 * Look up q->key in the in-memory indexes of a node.
 *
 * The index returned by si_nodeindex_priority() is probed first; the
 * second one is only probed when the first is empty or its iterator
 * open returned zero.  Unless q->has is set, the candidate version is
 * passed through sv_visible() with q->vlsn.
 *
 * Returns 0 when no index produced a candidate or no version is
 * visible; otherwise returns whatever si_qgetresult() returns.
 */
static inline int
si_qgetindex(siquery *q, sinode *node)
{
    svindex *fallback;
    svindex *primary = si_nodeindex_priority(node, &fallback);

    ssiter it;
    ss_iterinit(sv_indexiter, &it);

    /* probe the priority index, fall back to the second one */
    int hit = 0;
    if (primary->count > 0)
        hit = ss_iteropen(sv_indexiter, &it, q->r, primary, SS_GTE,
                          q->key, q->keysize);
    if (! hit) {
        if (sslikely(fallback == NULL || !fallback->count))
            return 0;
        hit = ss_iteropen(sv_indexiter, &it, q->r, fallback, SS_GTE,
                          q->key, q->keysize);
        if (! hit)
            return 0;
    }

    sv *found = ss_iterof(sv_indexiter, &it);
    assert(found != NULL);

    /* a "has" query takes the head version as is */
    svv *version = found->v;
    if (sslikely(! q->has)) {
        version = sv_visible((svv*)found->v, q->vlsn);
        if (version == NULL)
            return 0;
    }

    sv result;
    sv_init(&result, &sv_vif, version, NULL);
    return si_qgetresult(q, &result, 0);
}
/*
 * Write a single version as a one-statement transaction.
 *
 * When the manager has no read-write transactions (m->count_rw == 0)
 * the version is appended straight to the transaction log and SXCOMMIT
 * is returned.  Otherwise a regular SXRW transaction is started, the
 * version is set and the transaction is prepared: SXPREPARE leads to
 * sx_commit(), SXLOCK to sx_rollback().
 *
 * Returns SXCOMMIT on the fast path, SXROLLBACK when sx_set() fails,
 * otherwise the state returned by sx_prepare().
 */
sxstate
sx_set_autocommit(sxmanager *m, sxindex *index, sx *x, svv *v)
{
    if (sslikely(m->count_rw == 0)) {
        /* fast path: log the version directly */
        svlogv entry;
        sx_init(m, x);
        sv_init(&entry.v, &sv_vif, v, NULL);
        entry.next = UINT32_MAX;
        entry.id = index->dsn;
        sv_logadd(&x->log, m->r->a, &entry, index->ptr);
        sr_seq(m->r->seq, SR_TSNNEXT);
        return SXCOMMIT;
    }

    /* slow path: full transaction cycle */
    sx_begin(m, x, SXRW, 0);
    if (ssunlikely(sx_set(x, index, v) == -1)) {
        sx_rollback(x);
        return SXROLLBACK;
    }

    sxstate state = sx_prepare(x, NULL, NULL);
    switch (state) {
    case SXPREPARE:
        sx_commit(x);
        break;
    case SXLOCK:
        sx_rollback(x);
        break;
    default:
        /* any other state is handed back to the caller untouched */
        break;
    }
    return state;
}
/*
 * Translate a document option path into its SE_DOCUMENT_* id.
 *
 * A `path` whose pointer value, taken as intptr_t, does not exceed
 * SE_DOCUMENT_FIELD_9 is not dereferenced: the pointer value itself is
 * returned as the id.  Otherwise the string is matched against the
 * names "order", "log", "prefix" and "raw"; any other string yields
 * SE_DOCUMENT_FIELD.
 */
static inline int
se_document_opt(const char *path)
{
    intptr_t encoded = (intptr_t)path;
    if (sslikely(encoded <= (intptr_t)SE_DOCUMENT_FIELD_9))
        return (int)encoded;

    if (strcmp(path, "order") == 0)
        return SE_DOCUMENT_ORDER;
    if (strcmp(path, "log") == 0)
        return SE_DOCUMENT_LOG;
    if (strcmp(path, "prefix") == 0)
        return SE_DOCUMENT_PREFIX;
    if (strcmp(path, "raw") == 0)
        return SE_DOCUMENT_RAW;

    /* everything else is treated as a field name */
    return SE_DOCUMENT_FIELD;
}
/*
 * Execute a write request.
 *
 * Steps, in order: sl_prepare() on the log with arg->lsn; unless
 * arg->recover is set, an sl_begin()/sl_write()/sl_commit() cycle;
 * then one si_begin()/si_write()/si_commit() cycle per svlogindex
 * entry found between log->index.s and log->index.p.
 *
 * Returns 0 on success.  When sl_write() fails the log transaction is
 * rolled back, r->rc is set to -1 and -1 is returned.
 */
int
se_reqwrite(sereq *r)
{
    se *e = se_of(r->object);
    sereqarg *arg = &r->arg;
    svlog *log = r->arg.log;

    /* set lsn */
    sl_prepare(&e->lp, log, arg->lsn);

    /* log write (skipped when arg->recover is set) */
    if (! arg->recover) {
        sltx tl;
        sl_begin(&e->lp, &tl);
        if (ssunlikely(sl_write(&tl, log) == -1)) {
            sl_rollback(&tl);
            r->rc = -1;
            return -1;
        }
        sl_commit(&tl);
    }

    /* commit into every index referenced by the log */
    if (sslikely(arg->vlsn_generate))
        arg->vlsn = sx_vlsn(&e->xm);
    uint64_t now = ss_utime();

    svlogindex *end = (svlogindex*)log->index.p;
    svlogindex *li;
    for (li = (svlogindex*)log->index.s; li < end; li++) {
        sedb *db = li->ptr;
        sitx ti;
        si_begin(&ti, &db->index, arg->vlsn, now, log, li);
        si_write(&ti, arg->recover);
        si_commit(&ti);
    }
    return 0;
}
/*
 * Start transaction `x` of the given type on manager `m`.
 *
 * While the sequence is locked the transaction takes a copy of m->csn,
 * a fresh id (SR_TSNNEXT) and its vlsn: the current SR_LSN when the
 * `vlsn` argument is 0, otherwise the supplied value.  The transaction
 * is then inserted into the manager tree m->i under m->lock and the
 * matching counter (count_rd for SXRO, count_rw otherwise) is bumped.
 *
 * Always returns SXREADY.
 */
sxstate
sx_begin(sxmanager *m, sx *x, sxtype type, uint64_t vlsn)
{
    sx_promote(x, SXREADY);
    x->type = type;
    x->log_read = -1;

    /* sequence-protected part */
    sr_seqlock(m->r->seq);
    x->csn = m->csn;
    x->id = sr_seqdo(m->r->seq, SR_TSNNEXT);
    if (sslikely(vlsn == 0)) {
        x->vlsn = sr_seqdo(m->r->seq, SR_LSN);
    } else {
        x->vlsn = vlsn;
    }
    sr_sequnlock(m->r->seq);

    sx_init(m, x);

    /* register the transaction by id */
    ss_spinlock(&m->lock);
    ssrbnode *parent = NULL;
    int rel = sx_matchtx(&m->i, NULL, (char*)&x->id, sizeof(x->id), &parent);
    if (rel == 0 && parent) {
        /* an entry with this id is already present */
        assert(0);
    } else {
        ss_rbset(&m->i, parent, rel, &x->node);
    }
    if (type == SXRO) {
        m->count_rd++;
    } else {
        m->count_rw++;
    }
    ss_spinunlock(&m->lock);
    return SXREADY;
}
/*
 * Recover an index from its directory, or deploy a new one.
 *
 * - The directory does not exist: si_deploy() with directory creation.
 * - It exists and scheme->path_fail_on_exists is set: error, -1.
 * - si_recoverdrop() returns 1: si_deploy() without directory creation.
 * - Otherwise the scheme, the snapshot and the index are recovered in
 *   that order; a result <= 0 from si_recoverindex() is returned as
 *   is, a positive one falls through to si_deploy().
 *
 * Returns -1 on error, otherwise the value produced by the step that
 * finished the recovery.
 */
int
si_recover(si *i)
{
    sr *r = i->r;
    int exist = ss_vfsexists(r->vfs, i->scheme->path);
    if (exist == 0)
        return si_deploy(i, r, !exist);

    if (i->scheme->path_fail_on_exists) {
        sr_error(r->e, "directory '%s' already exists", i->scheme->path);
        return -1;
    }

    int rc = si_recoverdrop(i, r);
    if (rc == -1)
        return -1;
    if (rc == 1)
        return si_deploy(i, r, !exist);

    rc = si_schemerecover(i->scheme, r);
    if (ssunlikely(rc == -1))
        return -1;
    r->scheme = &i->scheme->scheme;
    r->fmt = i->scheme->fmt;
    r->fmt_storage = i->scheme->fmt_storage;

    /* the snapshot lives only for the duration of the index recovery */
    sdsnapshot snapshot;
    sd_snapshot_init(&snapshot);
    rc = si_recoversnapshot(i, r, &snapshot);
    if (ssunlikely(rc != -1))
        rc = si_recoverindex(i, r, &snapshot);
    else {
        sd_snapshot_free(&snapshot, r);
        return -1;
    }
    sd_snapshot_free(&snapshot, r);
    if (sslikely(rc <= 0))
        return rc;

    return si_deploy(i, r, !exist);
}
/*
 * Pick a node for garbage collection from the planner's compact queue.
 *
 * The queue is walked with ss_rqprev().  A node is a candidate when its
 * index header has duplicate keys, header->dupmin is below plan->a and
 * the percentage dupkeys * 100 / keys is at least plan->b.
 *
 * Returns 1 with the node locked and stored in plan->node, 2 with
 * plan->explain = SI_ERETRY when the only candidates seen carry
 * SI_LOCK, and 0 when there is no candidate at all.
 */
static inline int
si_plannerpeek_gc(siplanner *p, siplan *plan)
{
    int busy = 0;
    ssrqnode *cursor = NULL;

    for (cursor = ss_rqprev(&p->compact, cursor); cursor;
         cursor = ss_rqprev(&p->compact, cursor))
    {
        sinode *n = sscast(cursor, sinode, nodecompact);
        sdindexheader *h = n->self.index.h;
        if (sslikely(h->dupkeys == 0) || (h->dupmin >= plan->a))
            continue;
        uint32_t used = (h->dupkeys * 100) / h->keys;
        if (used >= plan->b) {
            if (n->flags & SI_LOCK) {
                /* remember the locked candidate and keep scanning */
                busy = 2;
                continue;
            }
            si_nodelock(n);
            plan->explain = SI_ENONE;
            plan->node = n;
            return 1;
        }
    }

    if (busy)
        plan->explain = SI_ERETRY;
    return busy;
}
/*
 * Release what a finished transaction still holds.
 *
 * SXCOMMIT:   every log entry's `vgc` pointer is returned to m->asxv.
 * SXROLLBACK: every logged sxv is released through sx_vfree() and the
 *             summed sv_vsize() of the versions is removed from the
 *             quota.
 * Any other state frees nothing per entry.
 *
 * In every case the log is freed and the state becomes SXUNDEF.
 */
void
sx_gc(sx *t, sr *r)
{
    sxmanager *m = t->manager;
    ssiter it;
    ss_iterinit(ss_bufiter, &it);
    ss_iteropen(ss_bufiter, &it, &t->log.buf, sizeof(svlogv));

    if (sslikely(t->s == SXCOMMIT)) {
        while (ss_iterhas(ss_bufiter, &it)) {
            svlogv *entry = ss_iterof(ss_bufiter, &it);
            sxv *stale = entry->vgc;
            ss_free(m->asxv, stale);
            ss_iternext(ss_bufiter, &it);
        }
    } else if (t->s == SXROLLBACK) {
        int released = 0;
        while (ss_iterhas(ss_bufiter, &it)) {
            svlogv *entry = ss_iterof(ss_bufiter, &it);
            sxv *version = entry->v.v;
            released += sv_vsize((svv*)version->v);
            sx_vfree(m->a, m->asxv, version);
            ss_iternext(ss_bufiter, &it);
        }
        ss_quota(r->quota, SS_QREMOVE, released);
    }

    sv_logfree(&t->log, m->a);
    t->s = SXUNDEF;
}
/*
 * Write a single version as a one-statement transaction using the
 * caller-supplied log.
 *
 * With no read-write transactions registered (m->count_rw == 0) the
 * version is appended to the log directly, the transaction is promoted
 * to SX_COMMIT and SX_COMMIT is returned.  Otherwise an SX_RW
 * transaction is run: SX_PREPARE is followed by sx_commit(), SX_LOCK by
 * sx_rollback(), SX_ROLLBACK is returned as is; any other state trips
 * an assertion.
 *
 * Returns SX_ROLLBACK when sx_set() fails, otherwise the resulting
 * transaction state.
 */
sxstate
sx_set_autocommit(sxmanager *m, sxindex *index, sx *x, svlog *log, svv *v)
{
    if (sslikely(m->count_rw == 0)) {
        /* fast path: nothing to conflict with */
        svlogv entry;
        sx_init(m, x, log);
        entry.ptr = NULL;
        entry.v = v;
        entry.next = UINT32_MAX;
        entry.index_id = index->dsn;
        sv_logadd(x->log, index->r, &entry);
        sr_seq(index->r->seq, SR_TSNNEXT);
        sx_promote(x, SX_COMMIT);
        return SX_COMMIT;
    }

    sx_begin(m, x, SX_RW, log, 0);
    if (ssunlikely(sx_set(x, index, v) == -1)) {
        sx_rollback(x);
        return SX_ROLLBACK;
    }

    sxstate state = sx_prepare(x, NULL, NULL);
    if (state == SX_PREPARE) {
        state = sx_commit(x);
    } else if (state == SX_LOCK) {
        state = sx_rollback(x);
    } else if (state != SX_ROLLBACK) {
        /* sx_prepare() is not expected to produce anything else */
        assert(0);
    }
    return state;
}
/*
 * Execute a read request against r->db.
 *
 * The key and the prefix are taken from arg->v and arg->vprefix when
 * those are set, otherwise they are NULL with size 0.  The read runs
 * inside an si_begin()/si_commit() pair; update, cache-only and "has"
 * modes are switched on from the matching arg flags.
 *
 * The result code, the disk/cache read counters and the result value
 * are stored in the request.  Returns r->rc.
 */
int
se_reqread(sereq *r)
{
    sereqarg *arg = &r->arg;
    sedb *db = (sedb*)r->db;

    /* search key */
    uint32_t keysize = 0;
    void *key = NULL;
    if (sslikely(arg->v.v)) {
        keysize = sv_size(&arg->v);
        key = sv_pointer(&arg->v);
    }

    /* optional prefix: first key part of the prefix object */
    char *prefix = NULL;
    uint32_t prefixsize = 0;
    if (arg->vprefix.v) {
        void *raw = sv_vpointer(arg->vprefix.v);
        prefix = sf_key(raw, 0);
        prefixsize = sf_keysize(raw, 0);
    }

    if (sslikely(arg->vlsn_generate))
        arg->vlsn = sr_seq(db->r.seq, SR_LSN);

    sitx x;
    si_begin(&x, &db->index, 1);

    siread q;
    si_readopen(&q, &x, arg->cache, arg->order, arg->vlsn,
                prefix, prefixsize, key, keysize);
    if (arg->update)
        si_readupdate(&q, &arg->vup, arg->update_eq);
    if (arg->cache_only)
        si_readcache_only(&q);
    if (arg->has)
        si_readhas(&q);

    /* run the query and publish its outcome */
    r->rc = si_read(&q);
    r->read_disk = q.read_disk;
    r->read_cache = q.read_cache;
    r->v = q.result.v;
    si_readclose(&q);

    si_commit(&x);
    return r->rc;
}
/*
 * Destroy a snapshot object: roll back its transaction, free its name
 * (if any) with the environment allocator and return the object itself
 * to the snapshot allocator.  Always returns 0.
 */
static int
so_snapshotfree(sosnapshot *s)
{
    so *env = so_of(&s->o);

    sx_rollback(&s->t);

    if (sslikely(s->name != NULL)) {
        ss_free(&env->a, s->name);
        s->name = NULL;
    }

    ss_free(&env->a_snapshot, s);
    return 0;
}
/*
 * Run the user prepare callback for one log entry.
 *
 * The callback is skipped when `lsn` equals the transaction's vlsn or
 * when no callback was supplied.  Otherwise it is called with the
 * entry's version, the `object` of the index the entry belongs to and
 * `arg`.
 *
 * Returns 1 when the callback returned non-zero, 0 in every other case.
 */
static inline int
sx_preparecb(sx *x, svlogv *v, uint64_t lsn, sxpreparef prepare, void *arg)
{
    if (sslikely(lsn == x->vlsn) || prepare == NULL)
        return 0;

    sxindex *index = ((sxv*)v->ptr)->index;
    return prepare(x, v->v, index->object, arg) ? 1 : 0;
}
/*
 * Link node `n` into tree `t` and run the insertion fix-up.
 *
 * `p` is the parent to attach to (NULL makes `n` the root).  `prel`
 * selects the side: a positive value attaches `n` as p->l, a negative
 * one as p->r; zero is rejected by an assertion when `p` is given.
 * The new node starts red with no children.
 */
void
ss_rbset(ssrb *t, ssrbnode *p, int prel, ssrbnode *n)
{
    n->l = NULL;
    n->r = NULL;
    n->p = p;
    n->color = SS_RBRED;

    if (sslikely(p != NULL)) {
        assert(prel != 0);
        ssrbnode **slot = (prel > 0) ? &p->l : &p->r;
        *slot = n;
    } else {
        t->root = n;
    }

    ss_rbset_fixup(t, n);
}
/*
 * Check whether the repository at i->scheme->path carries a "drop"
 * marker ("<path>/drop") and, if so, finish dropping it.
 *
 * Returns:
 *    0  the marker does not exist;
 *    1  the marker exists and si_droprepository() succeeded;
 *   -1  the marker path does not fit the local buffer, the marker
 *       exists while scheme->path_fail_on_drop is set, or
 *       si_droprepository() failed.
 */
static inline int
si_recoverdrop(si *i, sr *r)
{
    char path[1024];
    int len = snprintf(path, sizeof(path), "%s/drop", i->scheme->path);
    /* a truncated path would silently probe the wrong file name */
    if (ssunlikely(len < 0 || (size_t)len >= sizeof(path))) {
        sr_malfunction(r->e, "drop marker path is too long: %s",
                       i->scheme->path);
        return -1;
    }

    int rc = ss_vfsexists(r->vfs, path);
    if (sslikely(! rc))
        return 0;

    if (i->scheme->path_fail_on_drop) {
        sr_malfunction(r->e, "attempt to recover a dropped database: %s:",
                       i->scheme->path);
        return -1;
    }

    rc = si_droprepository(i->scheme, r, 0);
    if (ssunlikely(rc == -1))
        return -1;
    return 1;
}
/*
 * Validate the header of a mapped log file and position the iterator
 * on its first record.
 *
 * The file size is checked before the version header is inspected, so
 * the srversion structure is never read from a file that is too small
 * to contain it.
 *
 * Returns the result of sr_malfunction() when the file is shorter than
 * the header or the version check fails, -1 when sl_iternext_of()
 * fails, the result of sl_itercontinue_of() when a record is pending
 * (i->next is set) and 0 otherwise.
 */
static inline int
sl_iterprepare(sliter *i)
{
    /* the header must fit into the file before it can be trusted */
    if (ssunlikely(i->log->size < (sizeof(srversion))))
        return sr_malfunction(i->r->e, "corrupted log file '%s': bad size",
                              i->log->file);

    srversion *ver = (srversion*)i->map.p;
    if (! sr_versioncheck(ver))
        return sr_malfunction(i->r->e, "bad log file '%s' version",
                              i->log->file);

    /* the first record follows the version header */
    slv *next = (slv*)((char*)i->map.p + sizeof(srversion));
    int rc = sl_iternext_of(i, next, 1);
    if (ssunlikely(rc == -1))
        return -1;
    if (sslikely(i->next))
        return sl_itercontinue_of(i);
    return 0;
}
/*
 * Decide whether transaction `x` can be committed.
 *
 * Read-only transactions and transactions without logged writes are
 * promoted to SX_PREPARE right away.  Otherwise every log entry is
 * examined, stopping early at the entry whose `lo` equals x->log_read:
 *
 *  - an aborted version                      -> SX_ROLLBACK
 *  - no previous version: sx_preparecb() is run, non-zero
 *                                            -> SX_ROLLBACK
 *  - previous version committed with a csn above x->csn
 *                                            -> SX_ROLLBACK
 *  - previous version uncommitted and flagged SVGET: handled like the
 *    "no previous version" case
 *  - previous version uncommitted otherwise  -> SX_LOCK
 *
 * Returns the state the transaction was promoted to (SX_PREPARE when
 * the scan found no conflict).
 */
sxstate
sx_prepare(sx *x, sxpreparef prepare, void *arg)
{
    uint64_t lsn = sr_seq(x->manager->seq, SR_LSN);

    /* proceed read-only transactions */
    if (x->type == SX_RO || sv_logcount_write(x->log) == 0)
        return sx_promote(x, SX_PREPARE);

    ssiter it;
    ss_iterinit(ss_bufiter, &it);
    ss_iteropen(ss_bufiter, &it, &x->log->buf, sizeof(svlogv));
    for (; ss_iterhas(ss_bufiter, &it); ss_iternext(ss_bufiter, &it))
    {
        svlogv *lv = ss_iterof(ss_bufiter, &it);
        sxv *v = lv->ptr;
        if ((int)v->lo == x->log_read)
            break;
        if (sx_vaborted(v))
            return sx_promote(x, SX_ROLLBACK);

        sxv *older = v->prev;
        if (sslikely(older == NULL)) {
            if (ssunlikely(sx_preparecb(x, lv, lsn, prepare, arg) != 0))
                return sx_promote(x, SX_ROLLBACK);
        } else if (sx_vcommitted(older)) {
            if (older->csn > x->csn)
                return sx_promote(x, SX_ROLLBACK);
        } else {
            /* force commit for read-only conflicts */
            sxindex *index = older->index;
            if (! (sv_vflags(older->v, index->r) & SVGET))
                return sx_promote(x, SX_LOCK);
            if (ssunlikely(sx_preparecb(x, lv, lsn, prepare, arg) != 0))
                return sx_promote(x, SX_ROLLBACK);
        }
    }
    return sx_promote(x, SX_PREPARE);
}
/*
 * Right rotation around `n`: the left child of `n` takes the place of
 * `n` (under the same parent, or as the tree root when `n` had no
 * parent), `n` becomes its right child, and the former right subtree
 * of that child becomes the left subtree of `n`.
 */
static inline void
ss_rbrotate_right(ssrb *t, ssrbnode *n)
{
    ssrbnode *pivot = n->l;
    ssrbnode *above = n->p;

    /* hook the pivot where n used to hang */
    if (sslikely(above != NULL)) {
        if (above->l == n) {
            above->l = pivot;
        } else {
            above->r = pivot;
        }
    } else {
        t->root = pivot;
    }
    pivot->p = above;

    /* n adopts the pivot's right subtree, then drops below the pivot */
    n->p = pivot;
    n->l = pivot->r;
    if (n->l)
        n->l->p = n;
    pivot->r = n;
}
/*
 * Undo the log entries that the buffer iterator `i` still has to visit.
 *
 * Each entry's sxv is untracked and the entry is rewritten to refer to
 * the underlying svv.  When `free` is non-zero the sxv is released as
 * well, and the size of every version not flagged SVGET is summed; the
 * sum (0 when nothing was freed) is removed from the quota at the end.
 */
static inline void
sx_rollback_svp(sx *x, ssiter *i, int free)
{
    sxmanager *m = x->manager;
    int released = 0;

    while (ss_iterhas(ss_bufiter, i)) {
        svlogv *entry = ss_iterof(ss_bufiter, i);
        sxv *tracked = entry->v.v;

        /* remove from index and replace head with a first waiter */
        sx_untrack(tracked);

        /* translate log version from sxv to svv */
        sv_init(&entry->v, &sv_vif, tracked->v, NULL);

        if (free) {
            if (sslikely(! (tracked->v->flags & SVGET)))
                released += sv_vsize((svv*)tracked->v);
            sx_vfree(m->r, m->asxv, tracked);
        }
        ss_iternext(ss_bufiter, i);
    }

    ss_quota(m->r->quota, SS_QREMOVE, released);
}
/*
 * Finish the page currently being built.
 *
 * Records the sizes of the m/v/k regions in the build reference,
 * stores the data checksum (0 when b->crc is off), optionally
 * compresses the page, fills in the size fields of the page header and
 * finally checksums the header itself.  With compression on, the
 * finished header is also copied to the start of the compressed
 * region.
 *
 * Returns 0 on success, -1 when sd_buildcompress() fails.
 */
int
sd_buildend(sdbuild *b, sr *r)
{
    /* update sizes */
    sdbuildref *ref = sd_buildref(b);
    ref->csize = 0;
    ref->ksize = ss_bufused(&b->k) - ref->k;
    ref->vsize = ss_bufused(&b->v) - ref->v;
    ref->msize = ss_bufused(&b->m) - ref->m;

    /* calculate data crc (non-compressed) */
    sdpageheader *hdr = sd_buildheader(b);
    uint32_t datacrc = 0;
    if (sslikely(b->crc)) {
        datacrc = ss_crcp(r->crc, b->m.s + ref->m, ref->msize, 0);
        datacrc = ss_crcp(r->crc, b->v.s + ref->v, ref->vsize, datacrc);
        datacrc = ss_crcp(r->crc, b->k.s + ref->k, ref->ksize, datacrc);
    }
    hdr->crcdata = datacrc;

    /* compression */
    if (b->compress) {
        if (ssunlikely(sd_buildcompress(b, r) == -1))
            return -1;
        ref->csize = ss_bufused(&b->c) - ref->c;
    }

    /* update page header */
    int total = ref->msize + ref->vsize + ref->ksize;
    hdr->sizekeys = ref->ksize;
    hdr->sizeorigin = total - sizeof(sdpageheader);
    if (b->compress) {
        hdr->size = ref->csize - sizeof(sdpageheader);
    } else {
        hdr->size = hdr->sizeorigin;
    }
    hdr->crc = ss_crcs(r->crc, hdr, sizeof(sdpageheader), 0);

    /* the compressed region starts with a copy of the final header */
    if (b->compress)
        memcpy(b->c.s + ref->c, hdr, sizeof(sdpageheader));
    return 0;
}
/*
 * Handler for the "error" key: copies the environment's last error
 * into a local buffer and serves it through se_confv() using a
 * temporary srconf that mirrors key, flags and type of `c`.  The value
 * is NULL when sr_errorcopy() reports a length of 0.
 */
static inline int
se_confsophia_error(srconf *c, srconfstmt *s)
{
    se *e = s->ptr;

    char text[128];
    text[0] = 0;
    int len = sr_errorcopy(&e->error, text, sizeof(text));
    char *value = sslikely(len == 0) ? NULL : text;

    srconf conf = {
        .key      = c->key,
        .flags    = c->flags,
        .type     = c->type,
        .function = NULL,
        .value    = value,
        .ptr      = NULL,
        .next     = NULL
    };
    return se_confv(&conf, s);
}

/*
 * Register the "sophia" namespace: the read-only keys "version",
 * "version_storage", "build" and "error", followed by "path",
 * "path_create" and "recover", which are handled by se_confv_offline.
 * Returns the namespace node created by the final sr_C() call.
 */
static inline srconf*
se_confsophia(se *e, seconfrt *rt, srconf **pc)
{
    srconf *first = *pc;
    srconf *prev = NULL;

    /* read-only keys */
    sr_C(&prev, pc, se_confv, "version", SS_STRING, rt->version, SR_RO, NULL);
    sr_C(&prev, pc, se_confv, "version_storage", SS_STRING, rt->version_storage, SR_RO, NULL);
    sr_C(&prev, pc, se_confv, "build", SS_STRING, rt->build, SR_RO, NULL);
    sr_C(&prev, pc, se_confsophia_error, "error", SS_STRING, NULL, SR_RO, NULL);

    /* keys served by se_confv_offline */
    sr_c(&prev, pc, se_confv_offline, "path", SS_STRINGPTR, &e->conf.path);
    sr_c(&prev, pc, se_confv_offline, "path_create", SS_U32, &e->conf.path_create);
    sr_c(&prev, pc, se_confv_offline, "recover", SS_U32, &e->conf.recover);

    return sr_C(NULL, pc, NULL, "sophia", SS_UNDEF, first, SR_NS, NULL);
}
/*
 * Decide whether transaction `x` can be committed.
 *
 * Read-only transactions and transactions without logged writes are
 * promoted to SXPREPARE right away.  Otherwise every log entry is
 * examined, stopping early at the entry whose `lo` equals x->log_read:
 *
 *  - an aborted version                        -> SXROLLBACK
 *  - no previous version: the prepare callback (if given, and if the
 *    current lsn differs from x->vlsn) is called; non-zero
 *                                              -> SXROLLBACK
 *  - previous version committed with a csn above x->csn
 *                                              -> SXROLLBACK
 *  - previous version uncommitted and flagged SVGET: ignored
 *  - previous version uncommitted otherwise    -> SXLOCK
 *
 * Returns the state the transaction was promoted to (SXPREPARE when
 * the scan found no conflict).
 */
sxstate
sx_prepare(sx *x, sxpreparef prepare, void *arg)
{
    uint64_t lsn = sr_seq(x->manager->r->seq, SR_LSN);

    /* proceed read-only transactions */
    if (x->type == SXRO || sv_logcount_write(&x->log) == 0)
        return sx_promote(x, SXPREPARE);

    ssiter it;
    ss_iterinit(ss_bufiter, &it);
    ss_iteropen(ss_bufiter, &it, &x->log.buf, sizeof(svlogv));
    for (; ss_iterhas(ss_bufiter, &it); ss_iternext(ss_bufiter, &it))
    {
        svlogv *lv = ss_iterof(ss_bufiter, &it);
        sxv *v = lv->v.v;
        if ((int)v->lo == x->log_read)
            break;
        if (sx_vaborted(v))
            return sx_promote(x, SXROLLBACK);

        sxv *older = v->prev;
        if (sslikely(older == NULL)) {
            if (prepare && lsn != x->vlsn) {
                sxindex *index = v->index;
                if (prepare(x, &lv->v, arg, index->ptr))
                    return sx_promote(x, SXROLLBACK);
            }
        } else if (sx_vcommitted(older)) {
            if (older->csn > x->csn)
                return sx_promote(x, SXROLLBACK);
        } else if (! (older->v->flags & SVGET)) {
            /* only read-only (SVGET) conflicts are force-committed */
            return sx_promote(x, SXLOCK);
        }
    }
    return sx_promote(x, SXPREPARE);
}
/*
 * Insert version `v` into the chain starting at `head`.
 *
 * If `v` has a greater lsn than `head` it becomes the new head and the
 * old head is flagged SVDUP.  Otherwise `v` is linked in front of the
 * first chain element with a smaller lsn (or at the tail) and `v`
 * itself is flagged SVDUP.  An element after `head` with the same lsn
 * as `v` trips an assertion.
 *
 * Returns the head of the chain after the insertion.
 */
static inline svv*
sv_vset(svv *head, svv *v)
{
    /* default: the new version is the most recent one */
    if (sslikely(head->lsn < v->lsn)) {
        head->flags |= SVDUP;
        v->next = head;
        return v;
    }

    /* redistribution (starting from highest lsn) */
    svv *before = head;
    svv *cur;
    for (cur = head->next; cur != NULL; before = cur, cur = cur->next) {
        assert(cur->lsn != v->lsn);
        if (cur->lsn < v->lsn)
            break;
    }
    v->flags |= SVDUP;
    v->next = cur;
    before->next = v;
    return head;
}
/*
 * Hand out one slab from the slab allocator behind `a`.
 *
 * `size` is only checked by an assertion against the allocator's slab
 * size.  Under the allocator lock, a slab is taken from the free list
 * when that list is not empty; otherwise the next untouched slab of
 * the pool is used.
 *
 * Returns the slab, or NULL when the free list is empty and
 * pool_next has reached pool_end.
 */
static inline sshot void*
ss_slabamalloc(ssa *a, int size ssunused)
{
    ssslaba *pool = (ssslaba*)a->priv;
    assert(size == (int)pool->slab_size);

    char *slab = NULL;
    ss_spinlock(&pool->lock);
    if (sslikely(pool->pool_free_count)) {
        /* pop the free-list head; its first word links to the next slab */
        slab = pool->pool_free;
        pool->pool_free = *(char**)slab;
        if (ssunlikely(--pool->pool_free_count == 0))
            pool->pool_free = NULL;
    } else if (sslikely(pool->pool_next != pool->pool_end)) {
        /* carve a fresh slab from the untouched part of the pool */
        slab = pool->pool_next;
        pool->pool_next += pool->slab_size;
    }
    ss_spinunlock(&pool->lock);
    return slab;
}
/*
 * Handler for the "error" key: copies the environment's last error
 * into a local buffer and serves it through se_metav() using a
 * temporary srmeta that mirrors key, flags and type of `c`.  The value
 * is NULL when sr_errorcopy() reports a length of 0.
 */
static inline int
se_metasophia_error(srmeta *c, srmetastmt *s)
{
    se *e = s->ptr;

    char text[128];
    text[0] = 0;
    int len = sr_errorcopy(&e->error, text, sizeof(text));
    char *value = sslikely(len == 0) ? NULL : text;

    srmeta meta = {
        .key      = c->key,
        .flags    = c->flags,
        .type     = c->type,
        .function = NULL,
        .value    = value,
        .ptr      = NULL,
        .next     = NULL
    };
    return se_metav(&meta, s);
}

/*
 * Register the "sophia" namespace: the read-only keys "version",
 * "build" and "error", followed by "path" and "path_create", which are
 * handled by se_metav_offline.  Returns the namespace node created by
 * the final sr_M() call.
 */
static inline srmeta*
se_metasophia(se *e, semetart *rt, srmeta **pc)
{
    srmeta *first = *pc;
    srmeta *prev = NULL;

    /* read-only keys */
    sr_M(&prev, pc, se_metav, "version", SS_STRING, rt->version, SR_RO, NULL);
    sr_M(&prev, pc, se_metav, "build", SS_STRING, rt->build, SR_RO, NULL);
    sr_M(&prev, pc, se_metasophia_error, "error", SS_STRING, NULL, SR_RO, NULL);

    /* keys served by se_metav_offline */
    sr_m(&prev, pc, se_metav_offline, "path", SS_STRINGPTR, &e->meta.path);
    sr_m(&prev, pc, se_metav_offline, "path_create", SS_U32, &e->meta.path_create);

    return sr_M(NULL, pc, NULL, "sophia", SS_UNDEF, first, SR_NS, NULL);
}
/*
 * Start transaction `t` on manager `m`.
 *
 * While the sequence is locked the transaction receives a fresh id
 * (SR_TSNNEXT) and its vlsn: the current SR_LSN when the `vlsn`
 * argument is 0, otherwise the supplied value.  The transaction is
 * then inserted into the manager tree m->i under m->lock and m->count
 * is incremented.
 *
 * Always returns SXREADY.
 */
sxstate
sx_begin(sxmanager *m, sx *t, uint64_t vlsn)
{
    t->complete = 0;
    t->s = SXREADY;

    /* sequence-protected part */
    sr_seqlock(m->seq);
    t->id = sr_seqdo(m->seq, SR_TSNNEXT);
    if (sslikely(vlsn == 0)) {
        t->vlsn = sr_seqdo(m->seq, SR_LSN);
    } else {
        t->vlsn = vlsn;
    }
    sr_sequnlock(m->seq);

    sx_init(m, t);

    /* register the transaction by id */
    ss_spinlock(&m->lock);
    ssrbnode *parent = NULL;
    int rel = sx_matchtx(&m->i, NULL, (char*)&t->id, sizeof(t->id), &parent);
    if (rel == 0 && parent) {
        /* an entry with this id is already present */
        assert(0);
    } else {
        ss_rbset(&m->i, parent, rel, &t->node);
    }
    m->count++;
    ss_spinunlock(&m->lock);
    return SXREADY;
}
/*
 * Add one branch of node `n` to the merge `m` as a data source.
 *
 * The branch is accessed through the next entry of the read cache
 * (si_cachefollow), which must refer to `b`.
 *
 * Returns:
 *    1  the cached iterator still has data and was added as a source,
 *       or the cache entry is marked open (nothing is added in that
 *       case), or the branch was opened now and has data;
 *    2  the entry is neither positioned nor open and the query is
 *       cache-only;
 *    0  the branch was opened now and its iterator has no data;
 *   -1  opening the sd_read iterator failed.
 */
static inline int si_rangebranch(siread *q, sinode *n, sibranch *b, svmerge *m)
{
    sicachebranch *c = si_cachefollow(q->cache);
    assert(c->branch == b);
    /* iterate cache: reuse the already positioned iterator */
    if (ss_iterhas(sd_read, &c->i)) {
        svmergesrc *s = sv_mergeadd(m, &c->i);
        si_readstat(q, 1, n, 1);
        s->ptr = c;
        return 1;
    }
    /* marked open but without data: report success, add no source */
    if (c->open) {
        return 1;
    }
    /* opening the branch is not allowed in cache-only mode */
    if (q->cache_only) {
        return 2;
    }
    c->open = 1;
    /* choose compression type: non-root branches use the *_branch settings */
    int compression;
    ssfilterif *compression_if;
    if (! si_branchis_root(b)) {
        compression = q->index->scheme->compression_branch;
        compression_if = q->index->scheme->compression_branch_if;
    } else {
        compression = q->index->scheme->compression;
        compression_if = q->index->scheme->compression_if;
    }
    sdreadarg arg = {
        .index = &b->index,
        .buf = &c->buf_a,
        .buf_xf = &c->buf_b,
        .buf_read = &q->index->readbuf,
        .index_iter = &c->index_iter,
        .page_iter = &c->page_iter,
        .use_memory = n->in_memory,
        .use_mmap = q->index->scheme->mmap,
        .use_mmap_copy = 1,
        .use_compression = compression,
        .compression_if = compression_if,
        .has = 0,
        .has_vlsn = 0,
        .o = q->order,
        .memory = &b->copy,
        .mmap = &n->map,
        .file = &n->file,
        .r = q->r
    };
    ss_iterinit(sd_read, &c->i);
    int rc = ss_iteropen(sd_read, &c->i, &arg, q->key, q->keysize);
    /* the read statistics are recorded before the result is checked */
    int reads = sd_read_stat(&c->i);
    si_readstat(q, 0, n, reads);
    if (ssunlikely(rc == -1))
        return -1;
    if (ssunlikely(!
        ss_iterhas(sd_read, &c->i)))
        return 0;
    svmergesrc *s = sv_mergeadd(m, &c->i);
    s->ptr = c;
    return 1;
}

/*
 * Range read: find the first matching version for query `q`.
 *
 * Nodes are visited through an si_iter opened at q->key in q->order.
 * For each node a merge is assembled from the optional upsert value,
 * the node's in-memory indexes and all of its branches, and is then
 * filtered through sv_readiter with q->vlsn.  If the node yields
 * nothing, the merge is reset and the next node is tried.
 *
 * Returns:
 *    0  no more nodes, or the found key failed the upsert-eq / prefix
 *       comparison (rc of that comparison is returned as is);
 *    1  a version was found and stored via si_readdup();
 *    2  propagated from si_rangebranch() (cache-only miss);
 *   -1  on error.
 */
static inline int si_range(siread *q)
{
    assert(q->has == 0);
    ssiter i;
    ss_iterinit(si_iter, &i);
    ss_iteropen(si_iter, &i, q->r, q->index, q->order, q->key, q->keysize);
    sinode *node;
next_node:
    node = ss_iterof(si_iter, &i);
    if (ssunlikely(node == NULL))
        return 0;
    si_txtrack(q->x, node);
    /* prepare sources: one per branch, two in-memory indexes, one upsert */
    svmerge *m = &q->merge;
    int count = node->branch_count + 2 + 1;
    int rc = sv_mergeprepare(m, q->r, count);
    if (ssunlikely(rc == -1)) {
        sr_errorreset(q->r->e);
        return -1;
    }
    /* external source (upsert): a one-element buffer holding the sv pointer,
     * backed by the local upbuf_reserve storage */
    svmergesrc *s;
    sv upbuf_reserve;
    ssbuf upbuf;
    if (ssunlikely(q->upsert_v && q->upsert_v->v)) {
        ss_bufinit_reserve(&upbuf, &upbuf_reserve, sizeof(upbuf_reserve));
        ss_bufadd(&upbuf, NULL, (void*)&q->upsert_v, sizeof(sv*));
        s = sv_mergeadd(m, NULL);
        ss_iterinit(ss_bufiterref, &s->src);
        ss_iteropen(ss_bufiterref, &s->src, &upbuf, sizeof(sv*));
    }
    /* in-memory indexes: empty ones are not added */
    svindex *second;
    svindex *first = si_nodeindex_priority(node, &second);
    if (first->count) {
        s = sv_mergeadd(m, NULL);
        ss_iterinit(sv_indexiter, &s->src);
        ss_iteropen(sv_indexiter, &s->src, q->r, first, q->order, q->key, q->keysize);
    }
    if (ssunlikely(second && second->count)) {
        s = sv_mergeadd(m, NULL);
        ss_iterinit(sv_indexiter, &s->src);
        ss_iteropen(sv_indexiter, &s->src, q->r, second, q->order, q->key, q->keysize);
    }
    /* cache and branches */
    rc = si_cachevalidate(q->cache, node);
    if (ssunlikely(rc == -1)) {
        sr_oom(q->r->e);
        return -1;
    }
    sibranch *b = node->branch;
    while (b) {
        rc = si_rangebranch(q, node, b, m);
        /* both an error (-1) and a cache-only miss (2) end the read */
        if (ssunlikely(rc == -1 || rc == 2))
            return rc;
        b = b->next;
    }
    /* merge and filter data stream */
    ssiter j;
    ss_iterinit(sv_mergeiter, &j);
    ss_iteropen(sv_mergeiter, &j, q->r, m, q->order);
    ssiter k;
    ss_iterinit(sv_readiter, &k);
    ss_iteropen(sv_readiter, &k, q->r, &j, &q->index->u, q->vlsn, 0);
    sv *v = ss_iterof(sv_readiter, &k);
    if (ssunlikely(v == NULL)) {
        /* nothing in this node: drop the sources and move on */
        sv_mergereset(&q->merge);
        ss_iternext(si_iter, &i);
        goto next_node;
    }
    rc = 1;
    /* convert upsert search to SS_EQ: rc becomes 1 only on an exact key match */
    if (q->upsert_eq) {
        rc = sr_compare(q->r->scheme, sv_pointer(v), sv_size(v), q->key, q->keysize);
        rc = rc == 0;
    }
    /* do prefix search */
    if (q->prefix && rc) {
        rc = sr_compareprefix(q->r->scheme, q->prefix, q->prefixsize, sv_pointer(v), sv_size(v));
    }
    if (sslikely(rc == 1)) {
        if (ssunlikely(si_readdup(q, v) == -1))
            return -1;
    }
    /* skip a possible duplicates from data sources */
    ss_iternext(sv_readiter, &k);
    return rc;
}
/*
 * Create a new node whose file contains a single empty page.
 *
 * The node gets a fresh id (SR_NSNNEXT) and `parent` as its parent id.
 * Its file is written in this order: seal, the empty page, the index,
 * and finally the seal is closed with sd_seal().  With
 * scheme->storage == SI_SIN_MEMORY the data is also kept in the
 * branch's blob; with scheme->mmap the node file is mapped.
 *
 * Returns the node, or NULL on failure.  On failure everything
 * allocated here is released (labels e1/e0) and the node is freed.
 */
sinode *si_bootstrap(si *i, uint64_t parent)
{
    sr *r = i->r;
    /* create node */
    sinode *n = si_nodenew(r);
    if (ssunlikely(n == NULL))
        return NULL;
    sdid id = {
        .parent = parent,
        .flags  = 0,
        .id     = sr_seq(r->seq, SR_NSNNEXT)
    };
    int rc;
    rc = si_nodecreate(n, r, i->scheme, &id);
    if (ssunlikely(rc == -1))
        goto e0;
    n->branch = &n->self;
    n->branch_count++;

    /* in-memory mode support */
    ssblob *blob = NULL;
    if (i->scheme->storage == SI_SIN_MEMORY) {
        blob = &n->self.copy;
        rc = ss_blobensure(blob, 4096);
        if (ssunlikely(rc == -1))
            goto e0;
        n->in_memory = 1;
    }

    /* create index with one empty page */
    sdindex index;
    sd_indexinit(&index);
    rc = sd_indexbegin(&index, r);
    if (ssunlikely(rc == -1))
        goto e0;

    ssqf f, *qf = NULL;
    ss_qfinit(&f);

    sdbuild build;
    sd_buildinit(&build);
    rc = sd_buildbegin(&build, r,
                       i->scheme->node_page_checksum,
                       i->scheme->compression_key,
                       i->scheme->compression,
                       i->scheme->compression_if);
    if (ssunlikely(rc == -1))
        goto e1;
    /* a failure to finish the page must not be written out as a valid page */
    rc = sd_buildend(&build, r);
    if (ssunlikely(rc == -1))
        goto e1;
    rc = sd_indexadd(&index, r, &build, sizeof(sdseal));
    if (ssunlikely(rc == -1))
        goto e1;

    /* write seal; remember its offset so that it can be closed later */
    uint64_t seal = n->file.size;
    rc = sd_writeseal(r, &n->file, blob);
    if (ssunlikely(rc == -1))
        goto e1;
    /* write page */
    rc = sd_writepage(r, &n->file, blob, &build);
    if (ssunlikely(rc == -1))
        goto e1;
    /* amqf */
    if (i->scheme->amqf) {
        rc = ss_qfensure(&f, r->a, 0);
        if (ssunlikely(rc == -1))
            goto e1;
        qf = &f;
    }
    rc = sd_indexcommit(&index, r, &id, qf, n->file.size);
    if (ssunlikely(rc == -1))
        goto e1;
    ss_qffree(&f, r->a);
    /* write index */
    rc = sd_writeindex(r, &n->file, blob, &index);
    if (ssunlikely(rc == -1))
        goto e1;
    /* close seal */
    rc = sd_seal(r, &n->file, blob, &index, seal);
    if (ssunlikely(rc == -1))
        goto e1;
    if (blob) {
        rc = ss_blobfit(blob);
        if (ssunlikely(rc == -1))
            goto e1;
    }
    if (i->scheme->mmap) {
        rc = si_nodemap(n, r);
        if (ssunlikely(rc == -1))
            goto e1;
    }
    si_branchset(&n->self, &index);
    sd_buildcommit(&build, r);
    sd_buildfree(&build, r);
    return n;
e1:
    ss_qffree(&f, r->a);
    sd_indexfree(&index, r);
    sd_buildfree(&build, r);
e0:
    si_nodefree(n, r, 0);
    return NULL;
}

/*
 * Deploy a brand new index: optionally create its directory, write the
 * scheme file, bootstrap the initial node, complete it and register it
 * with the index and the planner.
 *
 * Returns 1 on success, -1 on failure.
 */
static inline int si_deploy(si *i, sr *r, int create_directory)
{
    /* create directory */
    int rc;
    if (sslikely(create_directory)) {
        rc = ss_vfsmkdir(r->vfs, i->scheme->path, 0755);
        if (ssunlikely(rc == -1)) {
            sr_malfunction(r->e, "directory '%s' create error: %s",
                           i->scheme->path, strerror(errno));
            return -1;
        }
    }
    /* create scheme file */
    rc = si_schemedeploy(i->scheme, r);
    if (ssunlikely(rc == -1)) {
        sr_malfunction_set(r->e);
        return -1;
    }
    /* create initial node */
    sinode *n = si_bootstrap(i, 0);
    if (ssunlikely(n == NULL))
        return -1;
    SS_INJECTION(r->i, SS_INJECTION_SI_RECOVER_0,
                 si_nodefree(n, r, 0);
                 sr_malfunction(r->e, "%s", "error injection");
                 return -1);
    rc = si_nodecomplete(n, r, i->scheme);
    if (ssunlikely(rc == -1)) {
        si_nodefree(n, r, 1);
        return -1;
    }
    si_insert(i, n);
    si_plannerupdate(&i->p, SI_COMPACT|SI_BRANCH|SI_TEMP, n);
    i->size = si_nodesize(n);
    return 1;
}
/*
 * Scan the index directory and register every recognised database
 * file in `track`.
 *
 * Entries starting with '.' and names rejected by si_process() are
 * skipped.  For the remaining files the action depends on the type
 * reported by si_process():
 *
 *   SI_RDB_DBI     the parent node is marked and the incomplete file
 *                  is unlinked;
 *   SI_RDB_DBSEAL  the parent node is marked and the sealed file is
 *                  opened and tracked as a separate node;
 *   SI_RDB         the file is opened and tracked, replacing a
 *                  placeholder created earlier for the same id, unless
 *                  a node with SI_RDB set is already tracked.
 *
 * Returns 0 on success and -1 on error; the directory handle is closed
 * in both cases.
 */
static inline int si_trackdir(sitrack *track, sr *r, si *i)
{
    DIR *dir = opendir(i->scheme->path);
    if (ssunlikely(dir == NULL)) {
        sr_malfunction(r->e, "directory '%s' open error: %s", i->scheme->path, strerror(errno));
        return -1;
    }
    struct dirent *de;
    while ((de = readdir(dir))) {
        if (ssunlikely(de->d_name[0] == '.'))
            continue;
        uint64_t id_parent = 0;
        uint64_t id = 0;
        int rc = si_process(de->d_name, &id, &id_parent);
        if (ssunlikely(rc == -1))
            continue; /* skip unknown file */
        /* both ids are reported to the tracker, whatever the file type */
        si_tracknsn(track, id_parent);
        si_tracknsn(track, id);

        sinode *head, *node;
        sspath path;
        switch (rc) {
        case SI_RDB_DBI:
        case SI_RDB_DBSEAL: {
            /* find parent node and mark it as having
             * incomplete compaction process; a placeholder node is
             * created when the parent is not tracked yet */
            head = si_trackget(track, id_parent);
            if (sslikely(head == NULL)) {
                head = si_nodenew(r);
                if (ssunlikely(head == NULL))
                    goto error;
                head->self.id.id = id_parent;
                head->recover = SI_RDB_UNDEF;
                si_trackset(track, head);
            }
            head->recover |= rc;
            /* remove any incomplete file made during compaction */
            if (rc == SI_RDB_DBI) {
                ss_pathcompound(&path, i->scheme->path, id_parent, id, ".db.incomplete");
                rc = ss_vfsunlink(r->vfs, path.path);
                if (ssunlikely(rc == -1)) {
                    sr_malfunction(r->e, "db file '%s' unlink error: %s", path.path, strerror(errno));
                    goto error;
                }
                continue;
            }
            assert(rc == SI_RDB_DBSEAL);
            /* recover 'sealed' node */
            node = si_nodenew(r);
            if (ssunlikely(node == NULL))
                goto error;
            node->recover = SI_RDB_DBSEAL;
            ss_pathcompound(&path, i->scheme->path, id_parent, id, ".db.seal");
            rc = si_nodeopen(node, r, i->scheme, &path, NULL);
            if (ssunlikely(rc == -1)) {
                si_nodefree(node, r, 0);
                goto error;
            }
            si_trackset(track, node);
            si_trackmetrics(track, node);
            continue;
        }
        }
        /* both compaction cases above end with continue, so only a
         * regular database file reaches this point */
        assert(rc == SI_RDB);

        head = si_trackget(track, id);
        if (head != NULL && (head->recover & SI_RDB)) {
            /* loaded by snapshot */
            continue;
        }

        /* recover node */
        node = si_nodenew(r);
        if (ssunlikely(node == NULL))
            goto error;
        node->recover = SI_RDB;
        ss_path(&path, i->scheme->path, id, ".db");
        rc = si_nodeopen(node, r,
                         i->scheme, &path, NULL);
        if (ssunlikely(rc == -1)) {
            si_nodefree(node, r, 0);
            goto error;
        }
        si_trackmetrics(track, node);

        /* track node */
        if (sslikely(head == NULL)) {
            si_trackset(track, node);
        } else {
            /* replace a node previously created by a
             * incomplete compaction; its recover flags, minus
             * SI_RDB_UNDEF, are carried over to the new node */
            si_trackreplace(track, head, node);
            head->recover &= ~SI_RDB_UNDEF;
            node->recover |= head->recover;
            si_nodefree(head, r, 0);
        }
    }
    closedir(dir);
    return 0;
error:
    closedir(dir);
    return -1;
}
static inline int se_document_opt(const char *path) { switch (path[0]) { case 'v': if (sslikely(strcmp(path, "value") == 0)) return SE_DOCUMENT_VALUE; break; case 'k': if (sslikely(strncmp(path, "key", 3) == 0)) return SE_DOCUMENT_KEY; break; case 'o': if (sslikely(strcmp(path, "order") == 0)) return SE_DOCUMENT_ORDER; break; case 'l': if (sslikely(strcmp(path, "lsn") == 0)) return SE_DOCUMENT_LSN; if (sslikely(strcmp(path, "log") == 0)) return SE_DOCUMENT_LOG; break; case 'p': if (sslikely(strcmp(path, "prefix") == 0)) return SE_DOCUMENT_PREFIX; break; case 'a': if (sslikely(strcmp(path, "arg") == 0)) return SE_DOCUMENT_ARG; if (strcmp(path, "async") == 0) return SE_DOCUMENT_ASYNC; break; case 'r': if (sslikely(strcmp(path, "raw") == 0)) return SE_DOCUMENT_RAW; break; case 'f': if (sslikely(strcmp(path, "flags") == 0)) return SE_DOCUMENT_FLAGS; break; case 't': if (sslikely(strcmp(path, "type") == 0)) return SE_DOCUMENT_TYPE; break; case 'c': if (sslikely(strcmp(path, "cache_only") == 0)) return SE_DOCUMENT_CACHE_ONLY; break; case 'i': if (sslikely(strcmp(path, "immutable") == 0)) return SE_DOCUMENT_IMMUTABLE; break; case 's': if (sslikely(strcmp(path, "status") == 0)) return SE_DOCUMENT_STATUS; if (strcmp(path, "seq") == 0) return SE_DOCUMENT_SEQ; break; } return SE_DOCUMENT_UNKNOWN; }
static inline int si_split(si *index, sdc *c, ssbuf *result, sinode *parent, ssiter *i, uint64_t size_node, uint64_t size_stream, uint32_t stream, uint64_t vlsn) { sr *r = &index->r; uint32_t timestamp = ss_timestamp(); int rc; sdmergeconf mergeconf = { .stream = stream, .size_stream = size_stream, .size_node = size_node, .size_page = index->scheme.compaction.node_page_size, .checksum = index->scheme.compaction.node_page_checksum, .expire = index->scheme.expire, .timestamp = timestamp, .compression = index->scheme.compression, .compression_if = index->scheme.compression_if, .direct_io = index->scheme.direct_io, .direct_io_page_size = index->scheme.direct_io_page_size, .vlsn = vlsn }; sinode *n = NULL; sdmerge merge; rc = sd_mergeinit(&merge, r, i, &c->build, &c->build_index, &c->upsert, &mergeconf); if (ssunlikely(rc == -1)) return -1; while ((rc = sd_merge(&merge)) > 0) { /* create new node */ uint64_t id = sr_seq(index->r.seq, SR_NSNNEXT); n = si_nodenew(r, id, parent->id); if (ssunlikely(n == NULL)) goto error; rc = si_nodecreate(n, r, &index->scheme); if (ssunlikely(rc == -1)) goto error; /* write pages */ uint64_t offset; offset = sd_iosize(&c->io, &n->file); while ((rc = sd_mergepage(&merge, offset)) == 1) { rc = sd_writepage(r, &n->file, &c->io, merge.build); if (ssunlikely(rc == -1)) goto error; offset = sd_iosize(&c->io, &n->file); } if (ssunlikely(rc == -1)) goto error; offset = sd_iosize(&c->io, &n->file); rc = sd_mergeend(&merge, offset); if (ssunlikely(rc == -1)) goto error; /* write index */ rc = sd_writeindex(r, &n->file, &c->io, &merge.index); if (ssunlikely(rc == -1)) goto error; /* mmap mode */ if (index->scheme.mmap) { rc = si_nodemap(n, r); if (ssunlikely(rc == -1)) goto error; } /* add node to the list */ rc = ss_bufadd(result, index->r.a, &n, sizeof(sinode*)); if (ssunlikely(rc == -1)) { sr_oom_malfunction(index->r.e); goto error; } n->index = merge.index; } if (ssunlikely(rc == -1)) goto error; return 0; error: if (n) si_nodefree(n, r, 0); 
sd_mergefree(&merge); si_splitfree(result, r); return -1; } static int si_merge(si *index, sdc *c, sinode *node, uint64_t vlsn, ssiter *stream, uint64_t size_stream, uint32_t n_stream) { sr *r = &index->r; ssbuf *result = &c->a; ssiter i; /* begin compaction. * * Split merge stream into a number of * a new nodes. */ int rc; rc = si_split(index, c, result, node, stream, index->scheme.compaction.node_size, size_stream, n_stream, vlsn); if (ssunlikely(rc == -1)) return -1; SS_INJECTION(r->i, SS_INJECTION_SI_COMPACTION_0, si_splitfree(result, r); sr_malfunction(r->e, "%s", "error injection"); return -1); /* mask removal of a single node as a * single node update */ int count = ss_bufused(result) / sizeof(sinode*); int count_index; si_lock(index); count_index = index->n; si_unlock(index); sinode *n; if (ssunlikely(count == 0 && count_index == 1)) { n = si_bootstrap(index, node->id); if (ssunlikely(n == NULL)) return -1; rc = ss_bufadd(result, r->a, &n, sizeof(sinode*)); if (ssunlikely(rc == -1)) { sr_oom_malfunction(r->e); si_nodefree(n, r, 1); return -1; } count++; } /* commit compaction changes */ si_lock(index); svindex *j = si_nodeindex(node); si_plannerremove(&index->p, node); si_nodesplit(node); switch (count) { case 0: /* delete */ si_remove(index, node); si_redistribute_index(index, r, c, node); break; case 1: /* self update */ n = *(sinode**)result->s; n->i0 = *j; n->used = j->used; si_nodelock(n); si_replace(index, node, n); si_plannerupdate(&index->p, n); break; default: /* split */ rc = si_redistribute(index, r, c, node, result); if (ssunlikely(rc == -1)) { si_unlock(index); si_splitfree(result, r); return -1; } ss_iterinit(ss_bufiterref, &i); ss_iteropen(ss_bufiterref, &i, result, sizeof(sinode*)); n = ss_iterof(ss_bufiterref, &i); n->used = n->i0.used; si_nodelock(n); si_replace(index, node, n); si_plannerupdate(&index->p, n); for (ss_iternext(ss_bufiterref, &i); ss_iterhas(ss_bufiterref, &i); ss_iternext(ss_bufiterref, &i)) { n = ss_iterof(ss_bufiterref, 
&i); n->used = n->i0.used; si_nodelock(n); si_insert(index, n); si_plannerupdate(&index->p, n); } break; } sv_indexinit(j); si_unlock(index); /* compaction completion */ /* seal nodes */ ss_iterinit(ss_bufiterref, &i); ss_iteropen(ss_bufiterref, &i, result, sizeof(sinode*)); while (ss_iterhas(ss_bufiterref, &i)) { n = ss_iterof(ss_bufiterref, &i); if (index->scheme.sync) { rc = ss_filesync(&n->file); if (ssunlikely(rc == -1)) { sr_malfunction(r->e, "db file '%s' sync error: %s", ss_pathof(&n->file.path), strerror(errno)); return -1; } } rc = si_noderename_seal(n, r, &index->scheme); if (ssunlikely(rc == -1)) { si_nodefree(node, r, 0); return -1; } SS_INJECTION(r->i, SS_INJECTION_SI_COMPACTION_3, si_nodefree(node, r, 0); sr_malfunction(r->e, "%s", "error injection"); return -1); ss_iternext(ss_bufiterref, &i); } SS_INJECTION(r->i, SS_INJECTION_SI_COMPACTION_1, si_nodefree(node, r, 0); sr_malfunction(r->e, "%s", "error injection"); return -1); /* gc node */ uint16_t refs = si_noderefof(node); if (sslikely(refs == 0)) { rc = si_nodefree(node, r, 1); if (ssunlikely(rc == -1)) return -1; } else { /* node concurrently being read, schedule for * delayed removal */ si_nodegc(node, r, &index->scheme); si_lock(index); ss_listappend(&index->gc, &node->gc); index->gc_count++; si_unlock(index); } SS_INJECTION(r->i, SS_INJECTION_SI_COMPACTION_2, sr_malfunction(r->e, "%s", "error injection"); return -1); /* complete new nodes */ ss_iterinit(ss_bufiterref, &i); ss_iteropen(ss_bufiterref, &i, result, sizeof(sinode*)); while (ss_iterhas(ss_bufiterref, &i)) { n = ss_iterof(ss_bufiterref, &i); rc = si_noderename_complete(n, r, &index->scheme); if (ssunlikely(rc == -1)) return -1; SS_INJECTION(r->i, SS_INJECTION_SI_COMPACTION_4, sr_malfunction(r->e, "%s", "error injection"); return -1); ss_iternext(ss_bufiterref, &i); } /* unlock */ si_lock(index); ss_iterinit(ss_bufiterref, &i); ss_iteropen(ss_bufiterref, &i, result, sizeof(sinode*)); while (ss_iterhas(ss_bufiterref, &i)) { n = 
ss_iterof(ss_bufiterref, &i); si_nodeunlock(n); ss_iternext(ss_bufiterref, &i); } si_unlock(index); return 0; }