/*
 * Release what a finished transaction's log still holds.
 *
 * SXCOMMIT:   the object saved in each entry's vgc field is released
 *             through the manager's asxv allocator.
 * SXROLLBACK: every logged version is released with sx_vfree() and the
 *             summed sv_vsize() of the released values is removed from
 *             the quota.
 *
 * In every case the log itself is freed and the state becomes SXUNDEF.
 */
void sx_gc(sx *t, sr *r)
{
    sxmanager *manager = t->manager;
    ssiter it;
    ss_iterinit(ss_bufiter, &it);
    ss_iteropen(ss_bufiter, &it, &t->log.buf, sizeof(svlogv));

    if (sslikely(t->s == SXCOMMIT)) {
        while (ss_iterhas(ss_bufiter, &it)) {
            svlogv *entry = ss_iterof(ss_bufiter, &it);
            sxv *wrapper = entry->vgc;
            ss_free(manager->asxv, wrapper);
            ss_iternext(ss_bufiter, &it);
        }
    } else if (t->s == SXROLLBACK) {
        int released = 0;
        while (ss_iterhas(ss_bufiter, &it)) {
            svlogv *entry = ss_iterof(ss_bufiter, &it);
            sxv *version = entry->v.v;
            /* size must be read before the version is released */
            released += sv_vsize((svv*)version->v);
            sx_vfree(manager->a, manager->asxv, version);
            ss_iternext(ss_bufiter, &it);
        }
        ss_quota(r->quota, SS_QREMOVE, released);
    }

    sv_logfree(&t->log, manager->a);
    t->s = SXUNDEF;
}
/*
 * Move every version held by node's in-memory index back into the
 * index via si_redistribute_set().
 *
 * The version pointers are first copied into the scratch buffer c->b,
 * then replayed one by one with a single timestamp taken from
 * ss_utime().
 *
 * Returns 0 on success (including when there is nothing to move), or
 * the result of sr_oom_malfunction() when the scratch buffer cannot grow.
 */
static int si_redistribute_index(si *index, sr *r, sdc *c, sinode *node)
{
    svindex *source = si_nodeindex(node);
    ssiter it;

    /* pass 1: snapshot the version pointers */
    ss_iterinit(sv_indexiter, &it);
    ss_iteropen(sv_indexiter, &it, r, source, SS_GTE, NULL, 0);
    for (; ss_iterhas(sv_indexiter, &it); ss_iternext(sv_indexiter, &it)) {
        sv *cur = ss_iterof(sv_indexiter, &it);
        int rc = ss_bufadd(&c->b, r->a, &cur->v, sizeof(svv**));
        if (ssunlikely(rc == -1))
            return sr_oom_malfunction(r->e);
    }
    if (ssunlikely(ss_bufused(&c->b) == 0))
        return 0;

    /* pass 2: replay them into the index */
    uint64_t now = ss_utime();
    ss_iterinit(ss_bufiterref, &it);
    ss_iteropen(ss_bufiterref, &it, &c->b, sizeof(svv*));
    for (; ss_iterhas(ss_bufiterref, &it); ss_iternext(ss_bufiterref, &it)) {
        svv *version = ss_iterof(ss_bufiterref, &it);
        si_redistribute_set(index, r, now, version);
    }
    return 0;
}
/*
 * Spread the versions of node's in-memory index over the freshly built
 * nodes listed in result.
 *
 * Versions are snapshotted into c->b, then walked together with
 * the node list: a version goes into the i0 index of the current
 * target node as long as it compares below the minimum key of the
 * following node; whatever remains after the last node goes into the
 * last node.
 *
 * Returns 0 on success, or the result of sr_oom_malfunction() when the
 * scratch buffer cannot grow.
 */
static int si_redistribute(si *index, sr *r, sdc *c, sinode *node, ssbuf *result)
{
    (void)index;
    svindex *source = si_nodeindex(node);
    ssiter vit;

    /* snapshot version pointers */
    ss_iterinit(sv_indexiter, &vit);
    ss_iteropen(sv_indexiter, &vit, r, source, SS_GTE, NULL, 0);
    for (; ss_iterhas(sv_indexiter, &vit); ss_iternext(sv_indexiter, &vit)) {
        sv *cur = ss_iterof(sv_indexiter, &vit);
        int rc = ss_bufadd(&c->b, r->a, &cur->v, sizeof(svv**));
        if (ssunlikely(rc == -1))
            return sr_oom_malfunction(r->e);
    }
    if (ssunlikely(ss_bufused(&c->b) == 0))
        return 0;

    /* versions to place */
    ss_iterinit(ss_bufiterref, &vit);
    ss_iteropen(ss_bufiterref, &vit, &c->b, sizeof(svv*));

    /* candidate nodes */
    ssiter nit;
    ss_iterinit(ss_bufiterref, &nit);
    ss_iteropen(ss_bufiterref, &nit, result, sizeof(sinode*));
    sinode *target = ss_iterof(ss_bufiterref, &nit);
    ss_iternext(ss_bufiterref, &nit);

    for (;;) {
        sinode *upper = ss_iterof(ss_bufiterref, &nit);
        if (upper == NULL) {
            /* no further node: the rest belongs to the last target */
            assert(target != NULL);
            for (; ss_iterhas(ss_bufiterref, &vit); ss_iternext(ss_bufiterref, &vit)) {
                svv *version = ss_iterof(ss_bufiterref, &vit);
                version->next = NULL;
                sv_indexset(&target->i0, r, version);
            }
            break;
        }
        while (ss_iterhas(ss_bufiterref, &vit)) {
            svv *version = ss_iterof(ss_bufiterref, &vit);
            version->next = NULL;
            sdindexpage *page = sd_indexmin(&upper->self.index);
            int rc = sr_compare(r->scheme, sv_vpointer(version), version->size,
                                sd_indexpage_min(&upper->self.index, page),
                                page->sizemin);
            /* version is not below the next node's minimum: switch node */
            if (ssunlikely(rc >= 0))
                break;
            sv_indexset(&target->i0, r, version);
            ss_iternext(ss_bufiterref, &vit);
        }
        if (ssunlikely(! ss_iterhas(ss_bufiterref, &vit)))
            break;
        target = upper;
        ss_iternext(ss_bufiterref, &nit);
    }
    assert(ss_iterof(ss_bufiterref, &vit) == NULL);
    return 0;
}
/*
 * Roll a transaction back and return SXROLLBACK.
 *
 * A transaction already in SXCOMMIT state (log free after commit and
 * half-commit mode) only has its logged values unreferenced; the sizes
 * of the values for which sv_vunref() returns non-zero are removed from
 * the quota, and sx_end() is not called on this path.
 *
 * Any other state goes through sx_rollback_svp() with freeing enabled,
 * followed by sx_end().
 */
sxstate sx_rollback(sx *x)
{
    sxmanager *manager = x->manager;
    ssiter it;
    ss_iterinit(ss_bufiter, &it);
    ss_iteropen(ss_bufiter, &it, &x->log.buf, sizeof(svlogv));

    if (x->state != SXCOMMIT) {
        sx_rollback_svp(x, &it, 1);
        sx_promote(x, SXROLLBACK);
        sx_end(x);
        return SXROLLBACK;
    }

    /* support log free after commit and half-commit mode */
    int released = 0;
    while (ss_iterhas(ss_bufiter, &it)) {
        svlogv *entry = ss_iterof(ss_bufiter, &it);
        svv *value = entry->v.v;
        /* read the size first: the value may be gone after unref */
        int size = sv_vsize(value);
        if (sv_vunref(manager->r, value))
            released += size;
        ss_iternext(ss_bufiter, &it);
    }
    ss_quota(manager->r->quota, SS_QREMOVE, released);
    sx_promote(x, SXROLLBACK);
    return SXROLLBACK;
}
/*
 * Detach every version logged by transaction t from the concurrent
 * index.
 *
 * A version that is the head of its chain (no prev) is removed from the
 * tree, or replaced there by its first waiter when one exists; every
 * version is then unlinked with sx_vunlink().
 *
 * When translate is non-zero each log entry is re-initialised to refer
 * to the underlying value (v->v) and the transactional wrapper is kept
 * in the entry's vgc field.
 */
static inline void sx_rollback_index(sx *t, int translate)
{
    ssiter it;
    ss_iterinit(ss_bufiter, &it);
    ss_iteropen(ss_bufiter, &it, &t->log.buf, sizeof(svlogv));
    while (ss_iterhas(ss_bufiter, &it)) {
        svlogv *entry = ss_iterof(ss_bufiter, &it);
        sxv *version = entry->v.v;

        /* remove from index and replace head with a first waiter */
        if (! version->prev) {
            sxindex *owner = version->index;
            if (version->next != NULL)
                ss_rbreplace(&owner->i, &version->node, &version->next->node);
            else
                ss_rbremove(&owner->i, &version->node);
        }
        sx_vunlink(version);

        /* translate log version from sxv to svv */
        if (translate) {
            sv_init(&entry->v, &sv_vif, version->v, NULL);
            entry->vgc = version;
        }
        ss_iternext(ss_bufiter, &it);
    }
}
/*
 * Commit a prepared transaction and return SXCOMMIT.
 *
 * Unless t->complete is already set, every logged version has its
 * waiters aborted, is removed from the concurrent index (or replaced by
 * its first waiter), is unlinked, and its log entry is re-initialised to
 * refer to the underlying value while the wrapper is kept in vgc.
 *
 * The transaction must be in SXPREPARE state (asserted).
 */
sxstate sx_commit(sx *t)
{
    assert(t->s == SXPREPARE);

    if (! t->complete) {
        ssiter it;
        ss_iterinit(ss_bufiter, &it);
        ss_iteropen(ss_bufiter, &it, &t->log.buf, sizeof(svlogv));
        while (ss_iterhas(ss_bufiter, &it)) {
            svlogv *entry = ss_iterof(ss_bufiter, &it);
            sxv *version = entry->v.v;

            /* mark waiters as aborted */
            sx_vabortwaiters(version);

            /* remove from concurrent index and replace
             * head with a first waiter */
            sxindex *owner = version->index;
            if (version->next != NULL)
                ss_rbreplace(&owner->i, &version->node, &version->next->node);
            else
                ss_rbremove(&owner->i, &version->node);

            /* unlink version */
            sx_vunlink(version);

            /* translate log version from sxv to svv */
            sv_init(&entry->v, &sv_vif, version->v, NULL);
            entry->vgc = version;

            ss_iternext(ss_bufiter, &it);
        }
    }

    t->s = SXCOMMIT;
    sx_end(t);
    return SXCOMMIT;
}
/*
 * Check whether transaction t can be committed.
 *
 * Walks the log and stops at the first entry that decides the outcome:
 *   - value flagged SVABORT            -> SXROLLBACK
 *   - version has a predecessor (prev) -> SXLOCK
 *   - prepare callback (when given) returns anything but SXPREPARE
 *                                      -> that value
 * If no entry decides, the result is SXPREPARE.
 *
 * The result is stored in t->s and returned.
 */
sxstate sx_prepare(sx *t, sxpreparef prepare, void *arg)
{
    ssiter it;
    ss_iterinit(ss_bufiter, &it);
    ss_iteropen(ss_bufiter, &it, &t->log.buf, sizeof(svlogv));

    sxstate outcome = SXPREPARE;
    while (ss_iterhas(ss_bufiter, &it)) {
        svlogv *entry = ss_iterof(ss_bufiter, &it);
        sxv *version = entry->v.v;

        /* cancelled by a concurrent committed transaction */
        if (version->v->flags & SVABORT) {
            outcome = SXROLLBACK;
            break;
        }
        /* concurrent update in progress */
        if (version->prev != NULL) {
            outcome = SXLOCK;
            break;
        }
        /* check that new key has not been committed by
         * a concurrent transaction */
        if (prepare) {
            sxindex *owner = version->index;
            outcome = prepare(t, &entry->v, arg, owner->ptr);
            if (ssunlikely(outcome != SXPREPARE))
                break;
        }
        ss_iternext(ss_bufiter, &it);
    }

    t->s = outcome;
    return outcome;
}
/*
 * Recursive step of the deadlock search.
 *
 * p is appended to the mark list (and skipped if its deadlock link shows
 * it is already on a list). For every logged version of p that has a
 * predecessor, the chain from that version back through prev is walked:
 * each element's transaction is looked up by id and either matches t
 * (cycle found) or is searched recursively.
 *
 * Returns 1 when t is reachable, 0 otherwise.
 */
static inline int sx_deadlock_in(sxmanager *m, sslist *mark, sx *t, sx *p)
{
    /* already visited */
    if (p->deadlock.next != &p->deadlock)
        return 0;
    ss_listappend(mark, &p->deadlock);

    ssiter it;
    ss_iterinit(ss_bufiter, &it);
    ss_iteropen(ss_bufiter, &it, &p->log.buf, sizeof(svlogv));
    while (ss_iterhas(ss_bufiter, &it)) {
        svlogv *entry = ss_iterof(ss_bufiter, &it);
        sxv *link = entry->v.v;
        if (link->prev != NULL) {
            /* link is non-NULL here, so the chain is visited at least once */
            for (; link; link = link->prev) {
                sx *holder = sx_find(m, link->id);
                assert(holder != NULL);
                if (ssunlikely(holder == t))
                    return 1;
                int rc = sx_deadlock_in(m, mark, t, holder);
                if (ssunlikely(rc == 1))
                    return 1;
            }
        }
        ss_iternext(ss_bufiter, &it);
    }
    return 0;
}
/*
 * Finish recovery: turn the tracked nodes into the live index.
 *
 * All nodes of the tracker tree are first copied into buf (which is
 * reset beforehand). Nodes flagged SI_RDB_REMOVE are then freed with
 * si_nodefree(n, r, 1); every other node is marked SI_RDB, inserted
 * into index and announced to the planner.
 *
 * Returns 0 on success, -1 if freeing a node fails, or the result of
 * sr_oom_malfunction() when buf cannot grow.
 */
static inline int si_recovercomplete(sitrack *track, sr *r, si *index, ssbuf *buf)
{
    /* prepare and build primary index */
    ss_bufreset(buf);
    ssrbnode *cursor;
    for (cursor = ss_rbmin(&track->i); cursor; cursor = ss_rbnext(&track->i, cursor)) {
        sinode *n = sscast(cursor, sinode, node);
        int rc = ss_bufadd(buf, r->a, &n, sizeof(sinode*));
        if (ssunlikely(rc == -1))
            return sr_oom_malfunction(r->e);
    }

    ssiter it;
    ss_iterinit(ss_bufiterref, &it);
    ss_iteropen(ss_bufiterref, &it, buf, sizeof(sinode*));
    for (; ss_iterhas(ss_bufiterref, &it); ss_iternext(ss_bufiterref, &it)) {
        sinode *n = ss_iterof(ss_bufiterref, &it);
        if (n->recover & SI_RDB_REMOVE) {
            int rc = si_nodefree(n, r, 1);
            if (ssunlikely(rc == -1))
                return -1;
            continue;
        }
        n->recover = SI_RDB;
        si_insert(index, n);
        si_plannerupdate(&index->p, SI_COMPACT|SI_BRANCH|SI_TEMP, n);
    }
    return 0;
}
/*
 * Free every node whose pointer is stored in result, calling
 * si_nodefree(node, r, 0) on each.
 *
 * The result of si_nodefree() is not inspected and the buffer itself is
 * left untouched. Always returns 0.
 */
static int si_splitfree(ssbuf *result, sr *r)
{
    ssiter it;
    ss_iterinit(ss_bufiterref, &it);
    ss_iteropen(ss_bufiterref, &it, result, sizeof(sinode*));
    for (; ss_iterhas(ss_bufiterref, &it); ss_iternext(ss_bufiterref, &it)) {
        sinode *victim = ss_iterof(ss_bufiterref, &it);
        si_nodefree(victim, r, 0);
    }
    return 0;
}
/*
 * Commit a prepared transaction and return SX_COMMIT.
 *
 * Calling it on an already committed transaction is a no-op. Otherwise
 * the transaction must be in SX_PREPARE state (asserted).
 *
 * A new commit sequence number is taken from the manager, then the log
 * is walked up to the entry whose lo equals x->log_read. For each entry:
 *   - an uncommitted predecessor is aborted (it is asserted to be a
 *     read, i.e. flagged SVGET);
 *   - all waiters behind the version are aborted;
 *   - the version is marked committed with the new csn and the log
 *     entry's ptr is cleared;
 *   - a version flagged SVGET is referenced and pushed on the manager's
 *     gc list, any other version is untracked and returned to the pool.
 *
 * The remaining entries are handed to sx_rollback_svp() without freeing.
 */
sxstate sx_commit(sx *x)
{
    if (x->state == SX_COMMIT)
        return SX_COMMIT;
    assert(x->state == SX_PREPARE);

    sxmanager *manager = x->manager;
    ssiter it;
    ss_iterinit(ss_bufiter, &it);
    ss_iteropen(ss_bufiter, &it, &x->log->buf, sizeof(svlogv));
    uint64_t csn = ++manager->csn;

    while (ss_iterhas(ss_bufiter, &it)) {
        svlogv *entry = ss_iterof(ss_bufiter, &it);
        sxv *version = entry->ptr;
        if ((int)version->lo == x->log_read)
            break;

        /* abort conflict reader */
        if (version->prev && !sx_vcommitted(version->prev)) {
            sxindex *reader_index = version->prev->index;
            assert(sv_vflags(version->prev->v, reader_index->r) & SVGET);
            sx_vabort(version->prev);
        }

        /* abort waiters */
        sx_vabort_all(version->next);

        /* mark stmt as committed */
        sx_vcommit(version, csn);
        entry->ptr = NULL;

        /* schedule read stmt for gc */
        sxindex *owner = version->index;
        if (sv_vflags(version->v, owner->r) & SVGET) {
            sv_vref(version->v);
            version->gc = manager->gc;
            manager->gc = version;
            manager->count_gc++;
        } else {
            sx_untrack(version);
            sx_vpool_push(&manager->pool, version);
        }

        ss_iternext(ss_bufiter, &it);
    }

    /* rollback latest reads */
    sx_rollback_svp(x, &it, 0);
    sx_promote(x, SX_COMMIT);
    sx_end(x);
    return SX_COMMIT;
}
/*
 * Commit a prepared transaction and return SXCOMMIT.
 *
 * The transaction must be in SXPREPARE state (asserted). A new commit
 * sequence number is taken from the manager, then the log is walked up
 * to the entry whose lo equals x->log_read. For each entry:
 *   - an uncommitted predecessor is aborted (asserted to be flagged
 *     SVGET);
 *   - all waiters behind the version are aborted;
 *   - the version is marked committed with the new csn;
 *   - the log entry is re-initialised to refer to the underlying value;
 *   - a version flagged SVGET is referenced and pushed on the manager's
 *     gc list, any other version is untracked and released through the
 *     asxv allocator.
 *
 * The remaining entries are handed to sx_rollback_svp() without freeing.
 */
sxstate sx_commit(sx *x)
{
    assert(x->state == SXPREPARE);

    sxmanager *manager = x->manager;
    ssiter it;
    ss_iterinit(ss_bufiter, &it);
    ss_iteropen(ss_bufiter, &it, &x->log.buf, sizeof(svlogv));
    uint64_t csn = ++manager->csn;

    while (ss_iterhas(ss_bufiter, &it)) {
        svlogv *entry = ss_iterof(ss_bufiter, &it);
        sxv *version = entry->v.v;
        if ((int)version->lo == x->log_read)
            break;

        /* abort conflict reader */
        if (version->prev && !sx_vcommitted(version->prev)) {
            assert(version->prev->v->flags & SVGET);
            sx_vabort(version->prev);
        }

        /* abort waiters */
        sx_vabort_all(version->next);

        /* mark stmt as committed */
        sx_vcommit(version, csn);

        /* translate log version from sxv to svv */
        sv_init(&entry->v, &sv_vif, version->v, NULL);

        /* schedule read stmt for gc */
        if (version->v->flags & SVGET) {
            sv_vref(version->v);
            version->gc = manager->gc;
            manager->gc = version;
            manager->count_gc++;
        } else {
            sx_untrack(version);
            ss_free(manager->asxv, version);
        }

        ss_iternext(ss_bufiter, &it);
    }

    /* rollback latest reads */
    sx_rollback_svp(x, &it, 0);
    sx_promote(x, SXCOMMIT);
    sx_end(x);
    return SXCOMMIT;
}
/*
 * Populate the recovery tracker from a snapshot.
 *
 * For every node described by the snapshot the matching "<id>.db" file
 * is looked up under the index path. Entries whose file is missing or
 * whose size differs from the recorded size_file are skipped; the
 * others are opened as SI_RDB nodes and added to the tracker.
 *
 * Finally the read statistics stored in the snapshot header
 * (read_cache, read_disk, lru_v, lru_steps) are copied into the index.
 *
 * Returns 0 on success, -1 when the snapshot cannot be iterated, a node
 * cannot be allocated, or a node fails to open.
 */
static inline int si_tracksnapshot(sitrack *track, sr *r, si *i, sdsnapshot *s)
{
    /* read snapshot */
    ssiter iter;
    ss_iterinit(sd_snapshotiter, &iter);
    int rc = ss_iteropen(sd_snapshotiter, &iter, r, s);
    if (ssunlikely(rc == -1))
        return -1;

    for (; ss_iterhas(sd_snapshotiter, &iter); ss_iternext(sd_snapshotiter, &iter)) {
        sdsnapshotnode *desc = ss_iterof(sd_snapshotiter, &iter);

        /* skip updated nodes */
        sspath path;
        ss_path(&path, i->scheme->path, desc->id, ".db");
        rc = ss_vfsexists(r->vfs, path.path);
        if (! rc)
            continue;
        uint64_t size = ss_vfssize(r->vfs, path.path);
        if (size != desc->size_file)
            continue;

        /* recover node */
        sinode *node = si_nodenew(r);
        if (ssunlikely(node == NULL))
            return -1;
        node->recover = SI_RDB;
        rc = si_nodeopen(node, r, i->scheme, &path, desc);
        if (ssunlikely(rc == -1)) {
            si_nodefree(node, r, 0);
            return -1;
        }
        si_trackmetrics(track, node);
        si_trackset(track, node);
    }

    /* recover index temperature (read stats) */
    sdsnapshotheader *header = sd_snapshot_header(s);
    i->read_cache = header->read_cache;
    i->read_disk  = header->read_disk;
    i->lru_v      = header->lru_v;
    i->lru_steps  = header->lru_steps;
    return 0;
}
/*
 * Roll back every log entry from the iterator's current position to
 * the end of the log.
 *
 * Each version is untracked, its log entry's ptr is cleared, and the
 * version object is returned to the manager's pool. When free is
 * non-zero the underlying value is additionally unreferenced.
 *
 * The iterator i is advanced to the end.
 */
static inline void sx_rollback_svp(sx *x, ssiter *i, int free)
{
    sxmanager *manager = x->manager;
    while (ss_iterhas(ss_bufiter, i)) {
        svlogv *entry = ss_iterof(ss_bufiter, i);
        sxv *version = entry->ptr;

        /* remove from index and replace head with a first waiter */
        sx_untrack(version);
        entry->ptr = NULL;

        if (free) {
            sxindex *owner = version->index;
            sv_vunref(owner->r, version->v);
        }
        sx_vpool_push(&manager->pool, version);

        ss_iternext(ss_bufiter, i);
    }
}
/*
 * Decide whether transaction x may commit; every outcome is applied
 * through sx_promote(), whose result is returned.
 *
 * Read-only transactions (type SX_RO, or no writes in the log) are
 * promoted to SX_PREPARE at once. Otherwise the log is walked up to the
 * entry whose lo equals x->log_read:
 *   - an aborted version                          -> SX_ROLLBACK
 *   - no predecessor: sx_preparecb() must return 0, else SX_ROLLBACK
 *   - committed predecessor with csn > x->csn     -> SX_ROLLBACK
 *   - uncommitted predecessor flagged SVGET: sx_preparecb() must
 *     return 0, else SX_ROLLBACK
 *   - any other uncommitted predecessor           -> SX_LOCK
 * If nothing objects the transaction is promoted to SX_PREPARE.
 */
sxstate sx_prepare(sx *x, sxpreparef prepare, void *arg)
{
    uint64_t lsn = sr_seq(x->manager->seq, SR_LSN);

    /* proceed read-only transactions */
    if (x->type == SX_RO || sv_logcount_write(x->log) == 0)
        return sx_promote(x, SX_PREPARE);

    ssiter it;
    ss_iterinit(ss_bufiter, &it);
    ss_iteropen(ss_bufiter, &it, &x->log->buf, sizeof(svlogv));

    sxstate status;
    for (; ss_iterhas(ss_bufiter, &it); ss_iternext(ss_bufiter, &it)) {
        svlogv *entry = ss_iterof(ss_bufiter, &it);
        sxv *version = entry->ptr;
        if ((int)version->lo == x->log_read)
            break;
        if (sx_vaborted(version))
            return sx_promote(x, SX_ROLLBACK);

        if (sslikely(version->prev == NULL)) {
            status = sx_preparecb(x, entry, lsn, prepare, arg);
            if (ssunlikely(status != 0))
                return sx_promote(x, SX_ROLLBACK);
            continue;
        }

        if (sx_vcommitted(version->prev)) {
            if (version->prev->csn > x->csn)
                return sx_promote(x, SX_ROLLBACK);
            continue;
        }

        /* force commit for read-only conflicts */
        sxindex *reader_index = version->prev->index;
        if (! (sv_vflags(version->prev->v, reader_index->r) & SVGET))
            return sx_promote(x, SX_LOCK);

        status = sx_preparecb(x, entry, lsn, prepare, arg);
        if (ssunlikely(status != 0))
            return sx_promote(x, SX_ROLLBACK);
    }
    return sx_promote(x, SX_PREPARE);
}
/*
 * Roll back every log entry from the iterator's current position to
 * the end of the log.
 *
 * Each version is untracked and its log entry is re-initialised to
 * refer to the underlying value. When free is non-zero the version is
 * released with sx_vfree(); the sizes of released values not flagged
 * SVGET are summed and removed from the quota at the end (the quota
 * call is made even when the sum is zero).
 *
 * The iterator i is advanced to the end.
 */
static inline void sx_rollback_svp(sx *x, ssiter *i, int free)
{
    sxmanager *manager = x->manager;
    int released = 0;

    while (ss_iterhas(ss_bufiter, i)) {
        svlogv *entry = ss_iterof(ss_bufiter, i);
        sxv *version = entry->v.v;

        /* remove from index and replace head with a first waiter */
        sx_untrack(version);

        /* translate log version from sxv to svv */
        sv_init(&entry->v, &sv_vif, version->v, NULL);

        if (free) {
            if (sslikely(! (version->v->flags & SVGET)))
                released += sv_vsize((svv*)version->v);
            sx_vfree(manager->r, manager->asxv, version);
        }
        ss_iternext(ss_bufiter, i);
    }
    ss_quota(manager->r->quota, SS_QREMOVE, released);
}
/*
 * Roll a transaction back and return SXROLLBACK.
 *
 * A transaction already in SXCOMMIT state (half-commit mode) has each
 * logged value released with sv_vfree(); sx_end() is not called on this
 * path. Any other state goes through sx_rollback_svp() with freeing
 * enabled, followed by sx_end().
 */
sxstate sx_rollback(sx *x)
{
    sxmanager *manager = x->manager;
    ssiter it;
    ss_iterinit(ss_bufiter, &it);
    ss_iteropen(ss_bufiter, &it, &x->log.buf, sizeof(svlogv));

    if (x->state != SXCOMMIT) {
        sx_rollback_svp(x, &it, 1);
        sx_promote(x, SXROLLBACK);
        sx_end(x);
        return SXROLLBACK;
    }

    /* support half-commit mode */
    while (ss_iterhas(ss_bufiter, &it)) {
        svlogv *entry = ss_iterof(ss_bufiter, &it);
        svv *value = entry->v.v;
        sv_vfree(manager->r, value);
        ss_iternext(ss_bufiter, &it);
    }
    sx_promote(x, SXROLLBACK);
    return SXROLLBACK;
}
/*
 * Decide whether transaction x may commit; every outcome is applied
 * through sx_promote(), whose result is returned.
 *
 * Read-only transactions (type SXRO, or no writes in the log) are
 * promoted to SXPREPARE at once. Otherwise the log is walked up to the
 * entry whose lo equals x->log_read:
 *   - an aborted version                          -> SXROLLBACK
 *   - no predecessor: when a prepare callback is given and the current
 *     lsn differs from x->vlsn, a non-zero callback result
 *                                                 -> SXROLLBACK
 *   - committed predecessor with csn > x->csn     -> SXROLLBACK
 *   - uncommitted predecessor flagged SVGET       -> accepted
 *   - any other uncommitted predecessor           -> SXLOCK
 * If nothing objects the transaction is promoted to SXPREPARE.
 */
sxstate sx_prepare(sx *x, sxpreparef prepare, void *arg)
{
    uint64_t lsn = sr_seq(x->manager->r->seq, SR_LSN);

    /* proceed read-only transactions */
    if (x->type == SXRO || sv_logcount_write(&x->log) == 0)
        return sx_promote(x, SXPREPARE);

    ssiter it;
    ss_iterinit(ss_bufiter, &it);
    ss_iteropen(ss_bufiter, &it, &x->log.buf, sizeof(svlogv));

    for (; ss_iterhas(ss_bufiter, &it); ss_iternext(ss_bufiter, &it)) {
        svlogv *entry = ss_iterof(ss_bufiter, &it);
        sxv *version = entry->v.v;
        if ((int)version->lo == x->log_read)
            break;
        if (sx_vaborted(version))
            return sx_promote(x, SXROLLBACK);

        if (sslikely(version->prev == NULL)) {
            if (prepare && lsn != x->vlsn) {
                sxindex *owner = version->index;
                if (prepare(x, &entry->v, arg, owner->ptr))
                    return sx_promote(x, SXROLLBACK);
            }
            continue;
        }

        if (sx_vcommitted(version->prev)) {
            if (version->prev->csn > x->csn)
                return sx_promote(x, SXROLLBACK);
            continue;
        }

        /* force commit for read-only conflicts */
        if (! (version->prev->v->flags & SVGET))
            return sx_promote(x, SXLOCK);
    }
    return sx_promote(x, SXPREPARE);
}
/*
 * Add branch b of node n as a merge source for query q.
 *
 * The per-branch cache slot is taken from the query cache (asserted to
 * belong to b). If its iterator still has data it is reused and the
 * index read_cache counter is bumped. If the slot was already opened
 * nothing is added. Otherwise the slot is marked open, a reader is
 * opened at the query key, and it is added when it yields at least one
 * item. An open failure is silently ignored.
 */
static inline void si_qrangebranch(siquery *q, sinode *n, sibranch *b, svmerge *m)
{
    sicachebranch *slot = si_cachefollow(q->cache);
    assert(slot->branch == b);

    /* iterate cache */
    if (ss_iterhas(si_read, &slot->i)) {
        svmergesrc *src = sv_mergeadd(m, &slot->i);
        q->index->read_cache++;
        src->ptr = slot;
        return;
    }
    if (slot->open)
        return;
    slot->open = 1;

    sireadarg arg = {
        .scheme     = q->index->scheme,
        .index      = q->index,
        .n          = n,
        .b          = b,
        .buf        = &slot->buf_a,
        .buf_xf     = &slot->buf_b,
        .buf_read   = &q->index->readbuf,
        .index_iter = &slot->index_iter,
        .page_iter  = &slot->page_iter,
        .vlsn       = q->vlsn,
        .has        = 0,
        .mmap_copy  = 1,
        .o          = q->order,
        .r          = q->r
    };
    ss_iterinit(si_read, &slot->i);
    int rc = ss_iteropen(si_read, &slot->i, &arg, q->key, q->keysize);
    if (ssunlikely(rc == -1))
        return;
    if (ssunlikely(! ss_iterhas(si_read, &slot->i)))
        return;

    svmergesrc *src = sv_mergeadd(m, &slot->i);
    src->ptr = slot;
}

/*
 * Range lookup: find the first visible record for query q.
 *
 * Nodes are visited in query order. For each node a merge is assembled
 * from (in this order) the optional external update value, the node's
 * in-memory indexes, and its branches; the merged stream is filtered
 * through a read iterator using q->vlsn. A node producing no record is
 * skipped (the merge is reset) and the next node is tried.
 *
 * For the first record found:
 *   - with q->update_eq the record must compare equal to the key;
 *   - with q->prefix the record must pass sr_compareprefix();
 *   - when the resulting rc is 1 the record is duplicated into the
 *     query with si_querydup().
 *
 * Returns 0 when no node yields a record, -1 on error, otherwise the
 * match result described above.
 */
static inline int si_qrange(siquery *q)
{
    ssiter nodes;
    ss_iterinit(si_iter, &nodes);
    ss_iteropen(si_iter, &nodes, q->r, q->index, q->order, q->key, q->keysize);

    for (;; ss_iternext(si_iter, &nodes)) {
        sinode *node = ss_iterof(si_iter, &nodes);
        if (ssunlikely(node == NULL))
            return 0;

        /* prepare sources */
        svmerge *m = &q->merge;
        int count = node->branch_count + 2 + 1;
        int rc = sv_mergeprepare(m, q->r, count);
        if (ssunlikely(rc == -1)) {
            sr_errorreset(q->r->e);
            return -1;
        }

        /* external source (update) */
        svmergesrc *src;
        sv upbuf_reserve;
        ssbuf upbuf;
        if (ssunlikely(q->update_v && q->update_v->v)) {
            ss_bufinit_reserve(&upbuf, &upbuf_reserve, sizeof(upbuf_reserve));
            ss_bufadd(&upbuf, NULL, (void*)&q->update_v, sizeof(sv*));
            src = sv_mergeadd(m, NULL);
            ss_iterinit(ss_bufiterref, &src->src);
            ss_iteropen(ss_bufiterref, &src->src, &upbuf, sizeof(sv*));
        }

        /* in-memory indexes */
        svindex *second;
        svindex *first = si_nodeindex_priority(node, &second);
        if (first->count) {
            src = sv_mergeadd(m, NULL);
            ss_iterinit(sv_indexiter, &src->src);
            ss_iteropen(sv_indexiter, &src->src, q->r, first, q->order,
                        q->key, q->keysize);
        }
        if (ssunlikely(second && second->count)) {
            src = sv_mergeadd(m, NULL);
            ss_iterinit(sv_indexiter, &src->src);
            ss_iteropen(sv_indexiter, &src->src, q->r, second, q->order,
                        q->key, q->keysize);
        }

        /* cache and branches */
        rc = si_cachevalidate(q->cache, node);
        if (ssunlikely(rc == -1)) {
            sr_oom(q->r->e);
            return -1;
        }
        sibranch *branch;
        for (branch = node->branch; branch; branch = branch->next)
            si_qrangebranch(q, node, branch, m);

        /* merge and filter data stream */
        ssiter merged;
        ss_iterinit(sv_mergeiter, &merged);
        ss_iteropen(sv_mergeiter, &merged, q->r, m, q->order);
        ssiter visible;
        ss_iterinit(sv_readiter, &visible);
        ss_iteropen(sv_readiter, &visible, q->r, &merged, &q->index->u, q->vlsn, 0);

        sv *v = ss_iterof(sv_readiter, &visible);
        if (ssunlikely(v == NULL)) {
            /* nothing here: reset and move on to the next node */
            sv_mergereset(&q->merge);
            continue;
        }

        rc = 1;
        /* convert update search to SS_EQ */
        if (q->update_eq) {
            rc = sr_compare(q->r->scheme, sv_pointer(v), sv_size(v),
                            q->key, q->keysize);
            rc = rc == 0;
        }
        /* do prefix search */
        if (q->prefix && rc) {
            rc = sr_compareprefix(q->r->scheme, q->prefix, q->prefixsize,
                                  sv_pointer(v), sv_size(v));
        }
        if (sslikely(rc == 1)) {
            if (ssunlikely(si_querydup(q, v) == -1))
                return -1;
        }

        /* skip a possible duplicates from data sources */
        ss_iternext(sv_readiter, &visible);
        return rc;
    }
}
/*
 * Split a merge stream into new nodes.
 *
 * Each successful sd_merge() round yields one node: the node is
 * allocated, the merge is committed under a fresh node id (parent id
 * taken from `parent`), the node is created on disk and its pointer is
 * appended to `result`.
 *
 * Returns 0 on success. On any failure every node already stored in
 * `result` is freed, the merge state is released and -1 is returned.
 *
 * A node that has been allocated but not yet appended to `result` is
 * freed explicitly on every failure path; previously it was leaked when
 * sd_mergecommit() or si_nodecreate() failed.
 * NOTE(review): assumes si_nodecreate() does not free the node itself
 * on failure -- confirm against its implementation.
 */
static inline int si_split(si *index, sdc *c, ssbuf *result, sinode *parent, ssiter *i, uint64_t size_node, uint32_t size_stream, uint64_t vlsn)
{
    sr *r = index->r;
    int rc;
    sdmergeconf mergeconf = {
        .size_stream     = size_stream,
        .size_node       = size_node,
        .size_page       = index->scheme->node_page_size,
        .checksum        = index->scheme->node_page_checksum,
        .compression     = index->scheme->compression,
        .compression_key = index->scheme->compression_key,
        .offset          = 0,
        .vlsn            = vlsn,
        .save_delete     = 0,
        .save_update     = 0
    };
    sdmerge merge;
    sd_mergeinit(&merge, r, i, &c->build, &c->update, &mergeconf);
    while ((rc = sd_merge(&merge)) > 0)
    {
        sinode *n = si_nodenew(r);
        if (ssunlikely(n == NULL))
            goto error;
        sdid id = {
            .parent = parent->self.id.id,
            .flags  = 0,
            .id     = sr_seq(index->r->seq, SR_NSNNEXT)
        };
        rc = sd_mergecommit(&merge, &id);
        if (ssunlikely(rc == -1)) {
            /* n is not in result yet: release it here */
            si_nodefree(n, r, 0);
            goto error;
        }
        rc = si_nodecreate(n, r, index->scheme, &id, &merge.index, &c->build);
        if (ssunlikely(rc == -1)) {
            /* n is not in result yet: release it here */
            si_nodefree(n, r, 0);
            goto error;
        }
        rc = ss_bufadd(result, index->r->a, &n, sizeof(sinode*));
        if (ssunlikely(rc == -1)) {
            sr_oom_malfunction(index->r->e);
            si_nodefree(n, r, 1);
            goto error;
        }
        sd_buildreset(&c->build);
    }
    if (ssunlikely(rc == -1))
        goto error;
    return 0;
error:
    si_splitfree(result, r);
    sd_mergefree(&merge);
    return -1;
}

/*
 * Compact `node`: rewrite it from `stream` into zero, one or several
 * new nodes and swap them into the index.
 *
 * Steps, in order:
 *   1. si_split() builds the new nodes into c->a.
 *   2. If nothing was produced and the index holds exactly one node, a
 *      bootstrap node is created so that the removal looks like a
 *      single-node update.
 *   3. Under the index lock the old node is removed from the planner
 *      and, depending on the number of new nodes, deleted (its
 *      in-memory versions are redistributed and their size removed from
 *      the quota), replaced by the single new node (which inherits the
 *      in-memory index), or replaced by the first new node with the
 *      others inserted after redistributing the in-memory versions.
 *      New nodes are locked at this point.
 *   4. Outside the lock the new nodes are optionally synced and sealed,
 *      the old node is freed, and the new nodes are completed.
 *   5. Under the index lock the new nodes are unlocked.
 *
 * Returns 0 on success and -1 on failure.
 */
int si_compaction(si *index, sdc *c, uint64_t vlsn, sinode *node, ssiter *stream, uint32_t size_stream)
{
    sr *r = index->r;
    ssbuf *result = &c->a;
    ssiter i;

    /* begin compaction.
     *
     * split merge stream into a number
     * of a new nodes.
     */
    int rc;
    rc = si_split(index, c, result,
                  node, stream,
                  index->scheme->node_size,
                  size_stream,
                  vlsn);
    if (ssunlikely(rc == -1))
        return -1;

    SS_INJECTION(r->i, SS_INJECTION_SI_COMPACTION_0,
                 si_splitfree(result, r);
                 sr_malfunction(r->e, "%s", "error injection");
                 return -1);

    /* mask removal of a single node as a
     * single node update */
    int count = ss_bufused(result) / sizeof(sinode*);
    int count_index;

    si_lock(index);
    count_index = index->n;
    si_unlock(index);

    sinode *n;
    if (ssunlikely(count == 0 && count_index == 1))
    {
        n = si_bootstrap(index, node->self.id.id);
        if (ssunlikely(n == NULL))
            return -1;
        rc = ss_bufadd(result, r->a, &n, sizeof(sinode*));
        if (ssunlikely(rc == -1)) {
            sr_oom_malfunction(r->e);
            si_nodefree(n, r, 1);
            return -1;
        }
        count++;
    }

    /* commit compaction changes */
    si_lock(index);
    svindex *j = si_nodeindex(node);
    si_plannerremove(&index->p, SI_COMPACT|SI_BRANCH, node);
    switch (count) {
    case 0: /* delete */
        si_remove(index, node);
        si_redistribute_index(index, r, c, node);
        uint32_t used = sv_indexused(j);
        if (used) {
            ss_quota(r->quota, SS_QREMOVE, used);
        }
        break;
    case 1: /* self update */
        n = *(sinode**)result->s;
        n->i0   = *j;
        n->used = sv_indexused(j);
        si_nodelock(n);
        si_replace(index, node, n);
        si_plannerupdate(&index->p, SI_COMPACT|SI_BRANCH, n);
        break;
    default: /* split */
        rc = si_redistribute(index, r, c, node, result);
        if (ssunlikely(rc == -1)) {
            si_unlock(index);
            si_splitfree(result, r);
            return -1;
        }
        ss_iterinit(ss_bufiterref, &i);
        ss_iteropen(ss_bufiterref, &i, result, sizeof(sinode*));
        /* the first new node takes the old node's place ... */
        n = ss_iterof(ss_bufiterref, &i);
        n->used = sv_indexused(&n->i0);
        si_nodelock(n);
        si_replace(index, node, n);
        si_plannerupdate(&index->p, SI_COMPACT|SI_BRANCH, n);
        /* ... the remaining ones are inserted */
        for (ss_iternext(ss_bufiterref, &i); ss_iterhas(ss_bufiterref, &i);
             ss_iternext(ss_bufiterref, &i)) {
            n = ss_iterof(ss_bufiterref, &i);
            n->used = sv_indexused(&n->i0);
            si_nodelock(n);
            si_insert(index, n);
            si_plannerupdate(&index->p, SI_COMPACT|SI_BRANCH, n);
        }
        break;
    }
    sv_indexinit(j);
    si_unlock(index);

    /* compaction completion */

    /* seal nodes */
    ss_iterinit(ss_bufiterref, &i);
    ss_iteropen(ss_bufiterref, &i, result, sizeof(sinode*));
    while (ss_iterhas(ss_bufiterref, &i))
    {
        n = ss_iterof(ss_bufiterref, &i);
        if (index->scheme->sync) {
            rc = si_nodesync(n, r);
            if (ssunlikely(rc == -1))
                return -1;
        }
        rc = si_nodeseal(n, r, index->scheme);
        if (ssunlikely(rc == -1))
            return -1;
        SS_INJECTION(r->i, SS_INJECTION_SI_COMPACTION_3,
                     si_nodefree(node, r, 0);
                     sr_malfunction(r->e, "%s", "error injection");
                     return -1);
        ss_iternext(ss_bufiterref, &i);
    }

    SS_INJECTION(r->i, SS_INJECTION_SI_COMPACTION_1,
                 si_nodefree(node, r, 0);
                 sr_malfunction(r->e, "%s", "error injection");
                 return -1);

    /* gc old node */
    rc = si_nodefree(node, r, 1);
    if (ssunlikely(rc == -1))
        return -1;

    SS_INJECTION(r->i, SS_INJECTION_SI_COMPACTION_2,
                 sr_malfunction(r->e, "%s", "error injection");
                 return -1);

    /* complete new nodes */
    ss_iterinit(ss_bufiterref, &i);
    ss_iteropen(ss_bufiterref, &i, result, sizeof(sinode*));
    while (ss_iterhas(ss_bufiterref, &i))
    {
        n = ss_iterof(ss_bufiterref, &i);
        rc = si_nodecomplete(n, r, index->scheme);
        if (ssunlikely(rc == -1))
            return -1;
        SS_INJECTION(r->i, SS_INJECTION_SI_COMPACTION_4,
                     sr_malfunction(r->e, "%s", "error injection");
                     return -1);
        ss_iternext(ss_bufiterref, &i);
    }

    /* unlock */
    si_lock(index);
    ss_iterinit(ss_bufiterref, &i);
    ss_iteropen(ss_bufiterref, &i, result, sizeof(sinode*));
    while (ss_iterhas(ss_bufiterref, &i))
    {
        n = ss_iterof(ss_bufiterref, &i);
        si_nodeunlock(n);
        ss_iternext(ss_bufiterref, &i);
    }
    si_unlock(index);
    return 0;
}
/*
 * Add branch b of node n as a merge source for read q.
 *
 * Returns:
 *    1  the cached iterator was reused, the slot was already opened, or
 *       a freshly opened reader was added to the merge;
 *    2  the slot is not open and the read is cache-only;
 *    0  a freshly opened reader yielded nothing;
 *   -1  the reader could not be opened.
 *
 * Read statistics are recorded through si_readstat() both for the
 * cache hit and for the freshly opened reader.
 */
static inline int si_rangebranch(siread *q, sinode *n, sibranch *b, svmerge *m)
{
    sicachebranch *slot = si_cachefollow(q->cache);
    assert(slot->branch == b);

    /* iterate cache */
    if (ss_iterhas(sd_read, &slot->i)) {
        svmergesrc *src = sv_mergeadd(m, &slot->i);
        si_readstat(q, 1, n, 1);
        src->ptr = slot;
        return 1;
    }
    if (slot->open)
        return 1;
    if (q->cache_only)
        return 2;
    slot->open = 1;

    /* choose compression type */
    int compression;
    ssfilterif *compression_if;
    if (si_branchis_root(b)) {
        compression    = q->index->scheme->compression;
        compression_if = q->index->scheme->compression_if;
    } else {
        compression    = q->index->scheme->compression_branch;
        compression_if = q->index->scheme->compression_branch_if;
    }

    sdreadarg arg = {
        .index           = &b->index,
        .buf             = &slot->buf_a,
        .buf_xf          = &slot->buf_b,
        .buf_read        = &q->index->readbuf,
        .index_iter      = &slot->index_iter,
        .page_iter       = &slot->page_iter,
        .use_memory      = n->in_memory,
        .use_mmap        = q->index->scheme->mmap,
        .use_mmap_copy   = 1,
        .use_compression = compression,
        .compression_if  = compression_if,
        .has             = 0,
        .has_vlsn        = 0,
        .o               = q->order,
        .memory          = &b->copy,
        .mmap            = &n->map,
        .file            = &n->file,
        .r               = q->r
    };
    ss_iterinit(sd_read, &slot->i);
    int rc = ss_iteropen(sd_read, &slot->i, &arg, q->key, q->keysize);
    /* statistics are recorded before the open result is checked */
    int reads = sd_read_stat(&slot->i);
    si_readstat(q, 0, n, reads);
    if (ssunlikely(rc == -1))
        return -1;
    if (ssunlikely(! ss_iterhas(sd_read, &slot->i)))
        return 0;

    svmergesrc *src = sv_mergeadd(m, &slot->i);
    src->ptr = slot;
    return 1;
}

/*
 * Range lookup: find the first visible record for read q.
 *
 * Nodes are visited in query order and each visited node is registered
 * with si_txtrack(). For each node a merge is assembled from (in this
 * order) the optional external upsert value, the node's in-memory
 * indexes, and its branches; the merged stream is filtered through a
 * read iterator using q->vlsn. A node producing no record is skipped
 * (the merge is reset) and the next node is tried.
 *
 * For the first record found:
 *   - with q->upsert_eq the record must compare equal to the key;
 *   - with q->prefix the record must pass sr_compareprefix();
 *   - when the resulting rc is 1 the record is duplicated into the
 *     read with si_readdup().
 *
 * Returns 0 when no node yields a record, -1 on error, 2 when a branch
 * would need a disk read in cache-only mode, otherwise the match result
 * described above.
 */
static inline int si_range(siread *q)
{
    assert(q->has == 0);

    ssiter nodes;
    ss_iterinit(si_iter, &nodes);
    ss_iteropen(si_iter, &nodes, q->r, q->index, q->order, q->key, q->keysize);

    for (;; ss_iternext(si_iter, &nodes)) {
        sinode *node = ss_iterof(si_iter, &nodes);
        if (ssunlikely(node == NULL))
            return 0;
        si_txtrack(q->x, node);

        /* prepare sources */
        svmerge *m = &q->merge;
        int count = node->branch_count + 2 + 1;
        int rc = sv_mergeprepare(m, q->r, count);
        if (ssunlikely(rc == -1)) {
            sr_errorreset(q->r->e);
            return -1;
        }

        /* external source (upsert) */
        svmergesrc *src;
        sv upbuf_reserve;
        ssbuf upbuf;
        if (ssunlikely(q->upsert_v && q->upsert_v->v)) {
            ss_bufinit_reserve(&upbuf, &upbuf_reserve, sizeof(upbuf_reserve));
            ss_bufadd(&upbuf, NULL, (void*)&q->upsert_v, sizeof(sv*));
            src = sv_mergeadd(m, NULL);
            ss_iterinit(ss_bufiterref, &src->src);
            ss_iteropen(ss_bufiterref, &src->src, &upbuf, sizeof(sv*));
        }

        /* in-memory indexes */
        svindex *second;
        svindex *first = si_nodeindex_priority(node, &second);
        if (first->count) {
            src = sv_mergeadd(m, NULL);
            ss_iterinit(sv_indexiter, &src->src);
            ss_iteropen(sv_indexiter, &src->src, q->r, first, q->order,
                        q->key, q->keysize);
        }
        if (ssunlikely(second && second->count)) {
            src = sv_mergeadd(m, NULL);
            ss_iterinit(sv_indexiter, &src->src);
            ss_iteropen(sv_indexiter, &src->src, q->r, second, q->order,
                        q->key, q->keysize);
        }

        /* cache and branches */
        rc = si_cachevalidate(q->cache, node);
        if (ssunlikely(rc == -1)) {
            sr_oom(q->r->e);
            return -1;
        }
        sibranch *branch;
        for (branch = node->branch; branch; branch = branch->next) {
            rc = si_rangebranch(q, node, branch, m);
            if (ssunlikely(rc == -1 || rc == 2))
                return rc;
        }

        /* merge and filter data stream */
        ssiter merged;
        ss_iterinit(sv_mergeiter, &merged);
        ss_iteropen(sv_mergeiter, &merged, q->r, m, q->order);
        ssiter visible;
        ss_iterinit(sv_readiter, &visible);
        ss_iteropen(sv_readiter, &visible, q->r, &merged, &q->index->u, q->vlsn, 0);

        sv *v = ss_iterof(sv_readiter, &visible);
        if (ssunlikely(v == NULL)) {
            /* nothing here: reset and move on to the next node */
            sv_mergereset(&q->merge);
            continue;
        }

        rc = 1;
        /* convert upsert search to SS_EQ */
        if (q->upsert_eq) {
            rc = sr_compare(q->r->scheme, sv_pointer(v), sv_size(v),
                            q->key, q->keysize);
            rc = rc == 0;
        }
        /* do prefix search */
        if (q->prefix && rc) {
            rc = sr_compareprefix(q->r->scheme, q->prefix, q->prefixsize,
                                  sv_pointer(v), sv_size(v));
        }
        if (sslikely(rc == 1)) {
            if (ssunlikely(si_readdup(q, v) == -1))
                return -1;
        }

        /* skip a possible duplicates from data sources */
        ss_iternext(sv_readiter, &visible);
        return rc;
    }
}
static inline int si_split(si *index, sdc *c, ssbuf *result, sinode *parent, ssiter *i, uint64_t size_node, uint64_t size_stream, uint32_t stream, uint64_t vlsn) { sr *r = &index->r; uint32_t timestamp = ss_timestamp(); int rc; sdmergeconf mergeconf = { .stream = stream, .size_stream = size_stream, .size_node = size_node, .size_page = index->scheme.compaction.node_page_size, .checksum = index->scheme.compaction.node_page_checksum, .expire = index->scheme.expire, .timestamp = timestamp, .compression = index->scheme.compression, .compression_if = index->scheme.compression_if, .direct_io = index->scheme.direct_io, .direct_io_page_size = index->scheme.direct_io_page_size, .vlsn = vlsn }; sinode *n = NULL; sdmerge merge; rc = sd_mergeinit(&merge, r, i, &c->build, &c->build_index, &c->upsert, &mergeconf); if (ssunlikely(rc == -1)) return -1; while ((rc = sd_merge(&merge)) > 0) { /* create new node */ uint64_t id = sr_seq(index->r.seq, SR_NSNNEXT); n = si_nodenew(r, id, parent->id); if (ssunlikely(n == NULL)) goto error; rc = si_nodecreate(n, r, &index->scheme); if (ssunlikely(rc == -1)) goto error; /* write pages */ uint64_t offset; offset = sd_iosize(&c->io, &n->file); while ((rc = sd_mergepage(&merge, offset)) == 1) { rc = sd_writepage(r, &n->file, &c->io, merge.build); if (ssunlikely(rc == -1)) goto error; offset = sd_iosize(&c->io, &n->file); } if (ssunlikely(rc == -1)) goto error; offset = sd_iosize(&c->io, &n->file); rc = sd_mergeend(&merge, offset); if (ssunlikely(rc == -1)) goto error; /* write index */ rc = sd_writeindex(r, &n->file, &c->io, &merge.index); if (ssunlikely(rc == -1)) goto error; /* mmap mode */ if (index->scheme.mmap) { rc = si_nodemap(n, r); if (ssunlikely(rc == -1)) goto error; } /* add node to the list */ rc = ss_bufadd(result, index->r.a, &n, sizeof(sinode*)); if (ssunlikely(rc == -1)) { sr_oom_malfunction(index->r.e); goto error; } n->index = merge.index; } if (ssunlikely(rc == -1)) goto error; return 0; error: if (n) si_nodefree(n, r, 0); 
sd_mergefree(&merge); si_splitfree(result, r); return -1; } static int si_merge(si *index, sdc *c, sinode *node, uint64_t vlsn, ssiter *stream, uint64_t size_stream, uint32_t n_stream) { sr *r = &index->r; ssbuf *result = &c->a; ssiter i; /* begin compaction. * * Split merge stream into a number of * a new nodes. */ int rc; rc = si_split(index, c, result, node, stream, index->scheme.compaction.node_size, size_stream, n_stream, vlsn); if (ssunlikely(rc == -1)) return -1; SS_INJECTION(r->i, SS_INJECTION_SI_COMPACTION_0, si_splitfree(result, r); sr_malfunction(r->e, "%s", "error injection"); return -1); /* mask removal of a single node as a * single node update */ int count = ss_bufused(result) / sizeof(sinode*); int count_index; si_lock(index); count_index = index->n; si_unlock(index); sinode *n; if (ssunlikely(count == 0 && count_index == 1)) { n = si_bootstrap(index, node->id); if (ssunlikely(n == NULL)) return -1; rc = ss_bufadd(result, r->a, &n, sizeof(sinode*)); if (ssunlikely(rc == -1)) { sr_oom_malfunction(r->e); si_nodefree(n, r, 1); return -1; } count++; } /* commit compaction changes */ si_lock(index); svindex *j = si_nodeindex(node); si_plannerremove(&index->p, node); si_nodesplit(node); switch (count) { case 0: /* delete */ si_remove(index, node); si_redistribute_index(index, r, c, node); break; case 1: /* self update */ n = *(sinode**)result->s; n->i0 = *j; n->used = j->used; si_nodelock(n); si_replace(index, node, n); si_plannerupdate(&index->p, n); break; default: /* split */ rc = si_redistribute(index, r, c, node, result); if (ssunlikely(rc == -1)) { si_unlock(index); si_splitfree(result, r); return -1; } ss_iterinit(ss_bufiterref, &i); ss_iteropen(ss_bufiterref, &i, result, sizeof(sinode*)); n = ss_iterof(ss_bufiterref, &i); n->used = n->i0.used; si_nodelock(n); si_replace(index, node, n); si_plannerupdate(&index->p, n); for (ss_iternext(ss_bufiterref, &i); ss_iterhas(ss_bufiterref, &i); ss_iternext(ss_bufiterref, &i)) { n = ss_iterof(ss_bufiterref, 
&i); n->used = n->i0.used; si_nodelock(n); si_insert(index, n); si_plannerupdate(&index->p, n); } break; } sv_indexinit(j); si_unlock(index); /* compaction completion */ /* seal nodes */ ss_iterinit(ss_bufiterref, &i); ss_iteropen(ss_bufiterref, &i, result, sizeof(sinode*)); while (ss_iterhas(ss_bufiterref, &i)) { n = ss_iterof(ss_bufiterref, &i); if (index->scheme.sync) { rc = ss_filesync(&n->file); if (ssunlikely(rc == -1)) { sr_malfunction(r->e, "db file '%s' sync error: %s", ss_pathof(&n->file.path), strerror(errno)); return -1; } } rc = si_noderename_seal(n, r, &index->scheme); if (ssunlikely(rc == -1)) { si_nodefree(node, r, 0); return -1; } SS_INJECTION(r->i, SS_INJECTION_SI_COMPACTION_3, si_nodefree(node, r, 0); sr_malfunction(r->e, "%s", "error injection"); return -1); ss_iternext(ss_bufiterref, &i); } SS_INJECTION(r->i, SS_INJECTION_SI_COMPACTION_1, si_nodefree(node, r, 0); sr_malfunction(r->e, "%s", "error injection"); return -1); /* gc node */ uint16_t refs = si_noderefof(node); if (sslikely(refs == 0)) { rc = si_nodefree(node, r, 1); if (ssunlikely(rc == -1)) return -1; } else { /* node concurrently being read, schedule for * delayed removal */ si_nodegc(node, r, &index->scheme); si_lock(index); ss_listappend(&index->gc, &node->gc); index->gc_count++; si_unlock(index); } SS_INJECTION(r->i, SS_INJECTION_SI_COMPACTION_2, sr_malfunction(r->e, "%s", "error injection"); return -1); /* complete new nodes */ ss_iterinit(ss_bufiterref, &i); ss_iteropen(ss_bufiterref, &i, result, sizeof(sinode*)); while (ss_iterhas(ss_bufiterref, &i)) { n = ss_iterof(ss_bufiterref, &i); rc = si_noderename_complete(n, r, &index->scheme); if (ssunlikely(rc == -1)) return -1; SS_INJECTION(r->i, SS_INJECTION_SI_COMPACTION_4, sr_malfunction(r->e, "%s", "error injection"); return -1); ss_iternext(ss_bufiterref, &i); } /* unlock */ si_lock(index); ss_iterinit(ss_bufiterref, &i); ss_iteropen(ss_bufiterref, &i, result, sizeof(sinode*)); while (ss_iterhas(ss_bufiterref, &i)) { n = 
ss_iterof(ss_bufiterref, &i); si_nodeunlock(n); ss_iternext(ss_bufiterref, &i); } si_unlock(index); return 0; }
/*
 * Restore index scheme settings from the "<s->path>/scheme" file.
 *
 * The file is read with sd_schemerecover() and every stored option is
 * applied to *s. Options with an unknown id are skipped.
 *
 * s - scheme to fill in; s->path names the directory holding the file
 * r - runtime (allocator r->a, error object r->e)
 *
 * Returns 0 on success and -1 on failure. Recovery fails when the file
 * cannot be read, when an option is malformed, or when the storage
 * version option is missing or rejected by sr_versionstorage_check()
 * (reported as "incompatible storage format version").
 */
int si_schemerecover(sischeme *s, sr *r)
{
	sdscheme c;
	sd_schemeinit(&c);
	int version_storage_set = 0;
	int rc;

	/* build the scheme file name; a truncated name must never be
	 * opened, since it would refer to a different file */
	char path[PATH_MAX];
	int len = snprintf(path, sizeof(path), "%s/scheme", s->path);
	if (ssunlikely(len < 0 || (size_t)len >= sizeof(path))) {
		sr_error(r->e, "%s", "scheme file path is too long");
		goto error;
	}

	rc = sd_schemerecover(&c, r, path);
	if (ssunlikely(rc == -1))
		goto error;

	ssiter i;
	ss_iterinit(sd_schemeiter, &i);
	rc = ss_iteropen(sd_schemeiter, &i, r, &c, 1);
	if (ssunlikely(rc == -1))
		goto error;

	while (ss_iterhas(sd_schemeiter, &i))
	{
		sdschemeopt *opt = ss_iterof(sd_schemeiter, &i);
		switch (opt->id) {
		case SI_SCHEME_VERSION:
			break;
		case SI_SCHEME_VERSION_STORAGE: {
			/* payload must be exactly one srversion record */
			if (opt->size != sizeof(srversion))
				goto error;
			srversion *version = (srversion*)sd_schemesz(opt);
			if (! sr_versionstorage_check(version))
				goto error_format;
			version_storage_set = 1;
			break;
		}
		case SI_SCHEME_SCHEME: {
			/* drop the current key scheme and load the stored one */
			sf_schemefree(&s->scheme, r->a);
			sf_schemeinit(&s->scheme);
			rc = sf_schemeload(&s->scheme, r->a, sd_schemesz(opt), opt->size);
			if (ssunlikely(rc == -1))
				goto error;
			rc = sf_schemevalidate(&s->scheme, r->a);
			if (ssunlikely(rc == -1))
				goto error;
			break;
		}
		case SI_SCHEME_NODE_SIZE:
			s->compaction.node_size = sd_schemeu64(opt);
			break;
		case SI_SCHEME_NODE_PAGE_SIZE:
			s->compaction.node_page_size = sd_schemeu32(opt);
			break;
		case SI_SCHEME_COMPRESSION: {
			/* resolve the filter by its stored name */
			char *name = sd_schemesz(opt);
			ssfilterif *cif = ss_filterof(name);
			if (ssunlikely(cif == NULL))
				goto error;
			s->compression_if = cif;
			s->compression = s->compression_if != &ss_nonefilter;
			/* replace the textual filter name kept in the scheme */
			ss_free(r->a, s->compression_sz);
			s->compression_sz = ss_strdup(r->a, cif->name);
			if (ssunlikely(s->compression_sz == NULL))
				goto error;
			break;
		}
		case SI_SCHEME_EXPIRE:
			s->expire = sd_schemeu32(opt);
			break;
		default: /* skip unknown */
			break;
		}
		ss_iternext(sd_schemeiter, &i);
	}

	/* a scheme file without a storage version is not acceptable */
	if (ssunlikely(! version_storage_set))
		goto error_format;
	sd_schemefree(&c, r);
	return 0;

error_format:
	sr_error(r->e, "%s", "incompatible storage format version");
error:
	sd_schemefree(&c, r);
	return -1;
}
/*
 * Restore index scheme settings from the "<s->path>/scheme" file.
 *
 * The file is read with sd_schemerecover() and every stored option is
 * applied to *s. Options with an unknown id are skipped.
 *
 * s - scheme to fill in; s->path names the directory holding the file
 * r - runtime (allocator r->a is used for string and scheme storage)
 *
 * Returns 0 on success and -1 on failure: the file cannot be read, the
 * stored format id is neither SF_KV nor SF_DOCUMENT, a compression
 * filter name is not known to ss_filterof(), or an allocation fails.
 */
int si_schemerecover(sischeme *s, sr *r)
{
	sdscheme c;
	sd_schemeinit(&c);
	int rc;

	/* build the scheme file name; a truncated name must never be
	 * opened, since it would refer to a different file. As with the
	 * other validation failures here, only the return code reports it */
	char path[PATH_MAX];
	int len = snprintf(path, sizeof(path), "%s/scheme", s->path);
	if (ssunlikely(len < 0 || (size_t)len >= sizeof(path)))
		goto error;

	rc = sd_schemerecover(&c, r, path);
	if (ssunlikely(rc == -1))
		goto error;

	ssiter i;
	ss_iterinit(sd_schemeiter, &i);
	rc = ss_iteropen(sd_schemeiter, &i, r, &c, 1);
	if (ssunlikely(rc == -1))
		goto error;

	while (ss_iterhas(sd_schemeiter, &i))
	{
		sdschemeopt *opt = ss_iterof(sd_schemeiter, &i);
		switch (opt->id) {
		case SI_SCHEME_FORMAT: {
			/* store the numeric format and its textual name */
			char *name;
			s->fmt = sd_schemeu32(opt);
			if (s->fmt == SF_KV)
				name = "kv";
			else
			if (s->fmt == SF_DOCUMENT)
				name = "document";
			else
				goto error;
			ss_free(r->a, s->fmt_sz);
			s->fmt_sz = ss_strdup(r->a, name);
			if (ssunlikely(s->fmt_sz == NULL))
				goto error;
			break;
		}
		case SI_SCHEME_FORMAT_STORAGE:
			s->fmt_storage = sd_schemeu32(opt);
			break;
		case SI_SCHEME_SCHEME: {
			/* drop the current key scheme and load the stored one */
			sr_schemefree(&s->scheme, r->a);
			sr_schemeinit(&s->scheme);
			rc = sr_schemeload(&s->scheme, r->a, sd_schemesz(opt), opt->size);
			if (ssunlikely(rc == -1))
				goto error;
			break;
		}
		case SI_SCHEME_NODE_SIZE:
			s->node_size = sd_schemeu64(opt);
			break;
		case SI_SCHEME_NODE_PAGE_SIZE:
			s->node_page_size = sd_schemeu32(opt);
			break;
		case SI_SCHEME_COMPRESSION_KEY:
			s->compression_key = sd_schemeu32(opt);
			break;
		case SI_SCHEME_COMPRESSION: {
			/* resolve the filter by its stored name */
			char *name = sd_schemesz(opt);
			ssfilterif *cif = ss_filterof(name);
			if (ssunlikely(cif == NULL))
				goto error;
			s->compression_if = cif;
			s->compression = s->compression_if != &ss_nonefilter;
			ss_free(r->a, s->compression_sz);
			s->compression_sz = ss_strdup(r->a, cif->name);
			if (ssunlikely(s->compression_sz == NULL))
				goto error;
			break;
		}
		case SI_SCHEME_COMPRESSION_BRANCH: {
			/* same as above, for the branch compression filter */
			char *name = sd_schemesz(opt);
			ssfilterif *cif = ss_filterof(name);
			if (ssunlikely(cif == NULL))
				goto error;
			s->compression_branch_if = cif;
			s->compression_branch = s->compression_branch_if != &ss_nonefilter;
			ss_free(r->a, s->compression_branch_sz);
			s->compression_branch_sz = ss_strdup(r->a, cif->name);
			if (ssunlikely(s->compression_branch_sz == NULL))
				goto error;
			break;
		}
		case SI_SCHEME_AMQF:
			s->amqf = sd_schemeu32(opt);
			break;
		case SI_SCHEME_CACHE_MODE:
			s->cache_mode = sd_schemeu32(opt);
			break;
		case SI_SCHEME_EXPIRE:
			s->expire = sd_schemeu32(opt);
			break;
		default: /* skip unknown */
			break;
		}
		ss_iternext(sd_schemeiter, &i);
	}

	sd_schemefree(&c, r);
	return 0;

error:
	sd_schemefree(&c, r);
	return -1;
}