/*
 * Walk the svlogv records stored in log->buf, release the value pointer
 * of each record through the allocator c->a, then free the log itself.
 */
static void
freelog(svlog *log, sr *c)
{
    ssiter it;
    ss_iterinit(ss_bufiter, &it);
    ss_iteropen(ss_bufiter, &it, &log->buf, sizeof(svlogv));
    while (ss_iteratorhas(&it)) {
        svlogv *entry = ss_iteratorof(&it);
        ss_free(c->a, entry->v.v);
        ss_iteratornext(&it);
    }
    sv_logfree(log, c->a);
}
/*
 * Look up v's key in the branch index with an SS_GTE search.
 *
 * Returns 0 when the index iterator yields no page; otherwise returns
 * the result of (page->lsnmax >= lsn of v).
 */
static int
si_querycommited_branch(sr *r, sibranch *b, sv *v)
{
    ssiter it;
    ss_iterinit(sd_indexiter, &it);
    ss_iteropen(sd_indexiter, &it, r, &b->index, SS_GTE,
                sv_pointer(v), sv_size(v));

    sdindexpage *match = ss_iterof(sd_indexiter, &it);
    if (match != NULL)
        return match->lsnmax >= sv_lsn(v);
    return 0;
}
/*
 * Test: full SS_GTE iteration (NULL key) over an index holding one value
 * built from key 3, one from key 15 and three values built from key 7.
 *
 * Expected iteration order is h, vc, vb, va, p and then NULL.
 * NOTE(review): the third st_svv() argument is presumably the lsn, which
 * would make the key-7 values come out newest first -- confirm against
 * st_svv().
 */
static void sv_indexiter_iterate0(void) {
    svindex i;
    t( sv_indexinit(&i) == 0 );
    int keyb = 3;
    int keya = 7;
    int keyc = 15;
    /* single value for the smallest key */
    svv *h = st_svv(&st_r.g, NULL, 0, 0, keyb);
    t( sv_indexset(&i, &st_r.r, h) == 0 );
    /* single value for the largest key */
    svv *p = st_svv(&st_r.g, NULL, 2, 0, keyc);
    t( sv_indexset(&i, &st_r.r, p) == 0 );
    /* three values sharing the middle key */
    svv *va = st_svv(&st_r.g, NULL, 1, 0, keya);
    t( sv_indexset(&i, &st_r.r, va) == 0 );
    svv *vb = st_svv(&st_r.g, NULL, 2, 0, keya);
    t( sv_indexset(&i, &st_r.r, vb) == 0 );
    svv *vc = st_svv(&st_r.g, NULL, 3, 0, keya);
    t( sv_indexset(&i, &st_r.r, vc) == 0 );
    ssiter it;
    ss_iterinit(sv_indexiter, &it);
    ss_iteropen(sv_indexiter, &it, &st_r.r, &i, SS_GTE, NULL, 0);
    t( ss_iteratorhas(&it) != 0 );
    /* walk the iterator and compare against the expected order */
    sv *v = ss_iteratorof(&it);
    t( v->v == h );
    ss_iteratornext(&it);
    v = ss_iteratorof(&it);
    t( v->v == vc );
    ss_iteratornext(&it);
    v = ss_iteratorof(&it);
    t( v->v == vb );
    ss_iteratornext(&it);
    v = ss_iteratorof(&it);
    t( v->v == va );
    ss_iteratornext(&it);
    v = ss_iteratorof(&it);
    t( v->v == p );
    ss_iteratornext(&it);
    /* iterator is exhausted */
    v = ss_iteratorof(&it);
    t( v == NULL );
    sv_indexfree(&i, &st_r.r);
}
/*
 * Test: build a page from two values added with addv() (keys 7 and 8),
 * commit it into a buffer and read it back through sd_pageiter.
 *
 * For each value the test checks the key, sv_lsn() (3, then 4) and
 * sv_flags() (0), and finally that the iterator yields NULL.
 */
static void sd_v_test(void) {
    sdbuild b;
    sd_buildinit(&b);
    t( sd_buildbegin(&b, &st_r.r, 1, 0, 0) == 0);
    int i = 7;
    int j = 8;
    addv(&b, &st_r.r, 3, 0, &i);
    addv(&b, &st_r.r, 4, 0, &j);
    sd_buildend(&b, &st_r.r);
    ssbuf buf;
    ss_bufinit(&buf);
    /* scratch buffer handed to the page iterator */
    ssbuf xfbuf;
    ss_bufinit(&xfbuf);
    t( ss_bufensure(&xfbuf, &st_r.a, 1024) == 0 );
    t( sd_commitpage(&b, &st_r.r, &buf) == 0 );
    /* the committed buffer is interpreted as a page starting at buf.s */
    sdpageheader *h = (sdpageheader*)buf.s;
    sdpage page;
    sd_pageinit(&page, h);
    ssiter it;
    ss_iterinit(sd_pageiter, &it);
    ss_iteropen(sd_pageiter, &it, &st_r.r, &xfbuf, &page, SS_GTE, NULL, 0);
    /* first value */
    t( ss_iteratorhas(&it) != 0 );
    sv *v = ss_iteratorof(&it);
    t( v != NULL );
    t( *(int*)sv_key(v, &st_r.r, 0) == i );
    t( sv_lsn(v) == 3 );
    t( sv_flags(v) == 0 );
    ss_iteratornext(&it);
    /* second value */
    t( ss_iteratorhas(&it) != 0 );
    v = ss_iteratorof(&it);
    t( v != NULL );
    t( *(int*)sv_key(v, &st_r.r, 0) == j );
    t( sv_lsn(v) == 4 );
    t( sv_flags(v) == 0 );
    ss_iteratornext(&it);
    /* end of page */
    v = ss_iteratorof(&it);
    t( v == NULL );
    sd_buildfree(&b, &st_r.r);
    ss_buffree(&buf, &st_r.a);
    ss_buffree(&xfbuf, &st_r.a);
}
/*
 * Test: an SS_LTE iterator opened with a NULL key over a freshly
 * initialized (empty) index reports no element and yields NULL.
 */
static void sv_indexiter_lte_empty(void) {
    svindex i;
    t( sv_indexinit(&i) == 0 );
    ssiter it;
    ss_iterinit(sv_indexiter, &it);
    ss_iteropen(sv_indexiter, &it, &st_r.r, &i, SS_LTE, NULL, 0);
    t( ss_iteratorhas(&it) == 0 );
    sv *v = ss_iteratorof(&it);
    t( v == NULL );
    sv_indexfree(&i, &st_r.r);
}
/*
 * Call si_nodefree(node, r, 0) on every sinode pointer stored in result.
 * The buffer itself is left untouched. Always returns 0.
 */
static int
si_splitfree(ssbuf *result, sr *r)
{
    ssiter it;
    ss_iterinit(ss_bufiterref, &it);
    ss_iteropen(ss_bufiterref, &it, result, sizeof(sinode*));
    for (; ss_iterhas(ss_bufiterref, &it); ss_iternext(ss_bufiterref, &it)) {
        sinode *node = ss_iterof(ss_bufiterref, &it);
        si_nodefree(node, r, 0);
    }
    return 0;
}
/*
 * Test: SS_LT lookups positioned on keys that exist in the index.
 *
 * The index holds values built from keys 7 (va), 5 (vb) and 2 (vc).
 * SS_LT from va must yield vb, SS_LT from vb must yield vc, and SS_LT
 * from vc must yield nothing.
 */
static void sv_indexiter_lt_eq(void) {
    svindex i;
    t( sv_indexinit(&i) == 0 );
    int keya = 7;
    int keyb = 5;
    int keyc = 2;
    svv *va = st_svv(&st_r.g, NULL, 0, 0, keya);
    t( sv_indexset(&i, &st_r.r, va) == 0 );
    svv *vb = st_svv(&st_r.g, NULL, 0, 0, keyb);
    t( sv_indexset(&i, &st_r.r, vb) == 0 );
    svv *vc = st_svv(&st_r.g, NULL, 0, 0, keyc);
    t( sv_indexset(&i, &st_r.r, vc) == 0 );
    /* strictly less than key 7 -> key 5 */
    ssiter it;
    ss_iterinit(sv_indexiter, &it);
    ss_iteropen(sv_indexiter, &it, &st_r.r, &i, SS_LT, sv_vpointer(va), va->size);
    t( ss_iteratorhas(&it) != 0 );
    sv *v = ss_iteratorof(&it);
    t( v->v == vb );
    /* strictly less than key 5 -> key 2 */
    ss_iterinit(sv_indexiter, &it);
    ss_iteropen(sv_indexiter, &it, &st_r.r, &i, SS_LT, sv_vpointer(vb), vb->size);
    t( ss_iteratorhas(&it) != 0 );
    v = ss_iteratorof(&it);
    t( v->v == vc );
    /* strictly less than the smallest key -> nothing */
    ss_iterinit(sv_indexiter, &it);
    ss_iteropen(sv_indexiter, &it, &st_r.r, &i, SS_LT, sv_vpointer(vc), vc->size);
    t( ss_iteratorhas(&it) == 0 );
    v = ss_iteratorof(&it);
    t( v == NULL );
    sv_indexfree(&i, &st_r.r);
}
/*
 * Place value v into the node that the index iterator returns for an
 * SS_GTE lookup of v, account for its size and reschedule the node.
 */
static inline void
si_redistribute_set(si *index, sr *r, svv *v)
{
    /* locate the destination node */
    ssiter it;
    ss_iterinit(si_iter, &it);
    ss_iteropen(si_iter, &it, r, index, SS_GTE, sv_vpointer(v));
    sinode *target = ss_iterof(si_iter, &it);
    assert(target != NULL);

    /* store the value in the node's in-memory index */
    sv_indexset(si_nodeindex(target), r, v);
    target->used += sv_vsize(v, &index->r);

    /* let the planner see the updated node */
    si_plannerupdate(&index->p, target);
}
/*
 * Test: an SS_GTE iterator opened with a concrete key over a freshly
 * initialized (empty) index reports no element and yields NULL.
 *
 * NOTE(review): the key is created with &st_r.gc as second argument and
 * is never inserted or freed here -- presumably the test harness
 * reclaims it; confirm against st_svv().
 */
static void sv_indexiter_gte_empty(void) {
    svindex i;
    t( sv_indexinit(&i) == 0 );
    svv *key = st_svv(&st_r.g, &st_r.gc, 0, 0, 7);
    ssiter it;
    ss_iterinit(sv_indexiter, &it);
    ss_iteropen(sv_indexiter, &it, &st_r.r, &i, SS_GTE, sv_vpointer(key), key->size);
    t( ss_iteratorhas(&it) == 0 );
    sv *v = ss_iteratorof(&it);
    t( v == NULL );
    sv_indexfree(&i, &st_r.r);
}
/*
 * Commit a prepared transaction.
 *
 * Returns SX_COMMIT immediately when the transaction is already in that
 * state. Otherwise every log statement up to the first one whose `lo`
 * equals x->log_read is committed under a freshly incremented csn; the
 * remaining statements are handed to sx_rollback_svp().
 */
sxstate
sx_commit(sx *x)
{
    if (x->state == SX_COMMIT)
        return SX_COMMIT;
    assert(x->state == SX_PREPARE);

    sxmanager *m = x->manager;
    ssiter it;
    ss_iterinit(ss_bufiter, &it);
    ss_iteropen(ss_bufiter, &it, &x->log->buf, sizeof(svlogv));
    uint64_t csn = ++m->csn;

    while (ss_iterhas(ss_bufiter, &it)) {
        svlogv *lv = ss_iterof(ss_bufiter, &it);
        sxv *v = lv->ptr;
        if ((int)v->lo == x->log_read)
            break;

        /* abort conflict reader */
        sxv *prev = v->prev;
        if (prev != NULL && !sx_vcommitted(prev)) {
            assert(sv_vflags(prev->v, prev->index->r) & SVGET);
            sx_vabort(prev);
        }

        /* abort waiters */
        sx_vabort_all(v->next);

        /* mark stmt as committed */
        sx_vcommit(v, csn);
        lv->ptr = NULL;

        /* schedule read stmt for gc, recycle everything else */
        if (sv_vflags(v->v, v->index->r) & SVGET) {
            sv_vref(v->v);
            v->gc = m->gc;
            m->gc = v;
            m->count_gc++;
        } else {
            sx_untrack(v);
            sx_vpool_push(&m->pool, v);
        }

        ss_iternext(ss_bufiter, &it);
    }

    /* rollback latest reads */
    sx_rollback_svp(x, &it, 0);

    sx_promote(x, SX_COMMIT);
    sx_end(x);
    return SX_COMMIT;
}
/*
 * Commit a prepared transaction (the state must be SXPREPARE).
 *
 * Every log statement up to the first one whose `lo` equals x->log_read
 * is committed under a freshly incremented csn and its log record is
 * re-pointed from the sxv to the underlying svv; the remaining
 * statements are handed to sx_rollback_svp().
 */
sxstate
sx_commit(sx *x)
{
    assert(x->state == SXPREPARE);

    sxmanager *m = x->manager;
    ssiter it;
    ss_iterinit(ss_bufiter, &it);
    ss_iteropen(ss_bufiter, &it, &x->log.buf, sizeof(svlogv));
    uint64_t csn = ++m->csn;

    while (ss_iterhas(ss_bufiter, &it)) {
        svlogv *lv = ss_iterof(ss_bufiter, &it);
        sxv *v = lv->v.v;
        if ((int)v->lo == x->log_read)
            break;

        /* abort conflict reader */
        sxv *prev = v->prev;
        if (prev != NULL && !sx_vcommitted(prev)) {
            assert(prev->v->flags & SVGET);
            sx_vabort(prev);
        }

        /* abort waiters */
        sx_vabort_all(v->next);

        /* mark stmt as committed */
        sx_vcommit(v, csn);

        /* translate log version from sxv to svv */
        sv_init(&lv->v, &sv_vif, v->v, NULL);

        /* schedule read stmt for gc, free everything else */
        if (v->v->flags & SVGET) {
            sv_vref(v->v);
            v->gc = m->gc;
            m->gc = v;
            m->count_gc++;
        } else {
            sx_untrack(v);
            ss_free(m->asxv, v);
        }

        ss_iternext(ss_bufiter, &it);
    }

    /* rollback latest reads */
    sx_rollback_svp(x, &it, 0);

    sx_promote(x, SXCOMMIT);
    sx_end(x);
    return SXCOMMIT;
}
/*
 * Stamp the index with `now`, route value v to its node (SS_ROUTE
 * lookup), insert it into that node's in-memory index, update the
 * node's timestamp and size accounting, and reschedule the node for
 * SI_BRANCH.
 */
static inline void
si_redistribute_set(si *index, sr *r, uint64_t now, svv *v)
{
    index->update_time = now;

    /* locate the destination node */
    ssiter it;
    ss_iterinit(si_iter, &it);
    ss_iteropen(si_iter, &it, r, index, SS_ROUTE, sv_vpointer(v), v->size);
    sinode *target = ss_iterof(si_iter, &it);
    assert(target != NULL);

    /* store the value in the node's in-memory index */
    sv_indexset(si_nodeindex(target), r, v);
    target->update_time = index->update_time;
    target->used += sv_vsize(v);

    /* let the planner see the updated node */
    si_plannerupdate(&index->p, SI_BRANCH, target);
}
/*
 * Feed the node tracker from a snapshot.
 *
 * For every snapshot entry whose "<id>.db" file exists and whose size on
 * disk equals the size recorded in the snapshot, a node is created,
 * opened and registered in `track`. Entries failing either check are
 * skipped. Afterwards the read statistics stored in the snapshot header
 * are copied into the index.
 *
 * Returns 0 on success, -1 if the snapshot iterator cannot be opened or
 * a node cannot be allocated or opened.
 */
static inline int
si_tracksnapshot(sitrack *track, sr *r, si *i, sdsnapshot *s)
{
    /* read snapshot */
    ssiter it;
    ss_iterinit(sd_snapshotiter, &it);
    int rc = ss_iteropen(sd_snapshotiter, &it, r, s);
    if (ssunlikely(rc == -1))
        return -1;

    for (; ss_iterhas(sd_snapshotiter, &it); ss_iternext(sd_snapshotiter, &it)) {
        sdsnapshotnode *entry = ss_iterof(sd_snapshotiter, &it);

        /* skip nodes whose file is gone or has changed size */
        sspath path;
        ss_path(&path, i->scheme->path, entry->id, ".db");
        if (!ss_vfsexists(r->vfs, path.path))
            continue;
        uint64_t size_on_disk = ss_vfssize(r->vfs, path.path);
        if (size_on_disk != entry->size_file)
            continue;

        /* recover node */
        sinode *node = si_nodenew(r);
        if (ssunlikely(node == NULL))
            return -1;
        node->recover = SI_RDB;
        rc = si_nodeopen(node, r, i->scheme, &path, entry);
        if (ssunlikely(rc == -1)) {
            si_nodefree(node, r, 0);
            return -1;
        }
        si_trackmetrics(track, node);
        si_trackset(track, node);
    }

    /* recover index temperature (read stats) */
    sdsnapshotheader *hdr = sd_snapshot_header(s);
    i->read_cache = hdr->read_cache;
    i->read_disk  = hdr->read_disk;
    i->lru_v      = hdr->lru_v;
    i->lru_steps  = hdr->lru_steps;
    return 0;
}
/*
 * Rebuild the branch list of node n from its file.
 *
 * Each index header produced by the sd_recover iterator is copied into a
 * branch: the first one goes into the embedded n->self, later ones into
 * newly allocated branches. Every branch is pushed onto the head of
 * n->branch.
 *
 * Returns 0 on success, -1 on allocation, copy or recovery failure. The
 * iterator is closed on every path.
 *
 * NOTE(review): when sd_indexcopy() fails for a branch obtained from
 * si_branchnew(), that branch does not appear to be released here --
 * confirm whether a caller cleans it up.
 */
static inline int
si_noderecover(sinode *n, sr *r)
{
    int status = -1;
    int is_first = 1;

    /* recover branches */
    ssiter it;
    ss_iterinit(sd_recover, &it);
    ss_iteropen(sd_recover, &it, r, &n->file);
    for (; ss_iteratorhas(&it); ss_iteratornext(&it)) {
        sdindexheader *header = ss_iteratorof(&it);

        sibranch *branch = &n->self;
        if (!is_first) {
            branch = si_branchnew(r);
            if (ssunlikely(branch == NULL))
                goto done;
        }

        sdindex index;
        sd_indexinit(&index);
        if (ssunlikely(sd_indexcopy(&index, r, header) == -1))
            goto done;
        si_branchset(branch, &index);

        /* link at the head of the branch list */
        branch->next = n->branch;
        n->branch = branch;
        n->branch_count++;
        is_first = 0;
    }

    if (ssunlikely(sd_recover_complete(&it) == -1))
        goto done;
    status = 0;
done:
    ss_iteratorclose(&it);
    return status;
}
/*
 * Check value v against the node selected by an SS_GTE lookup of its
 * key.
 *
 * Each branch on the node's branch list is tested with
 * si_readcommited_branch(); the first non-zero answer makes this
 * function return 1. If none matches, the result of the same check on
 * the node's embedded `self` branch is returned.
 */
int
si_readcommited(si *index, sr *r, sv *v)
{
    ssiter it;
    ss_iterinit(si_iter, &it);
    ss_iteropen(si_iter, &it, r, index, SS_GTE, sv_pointer(v), sv_size(v));
    sinode *node = ss_iterof(si_iter, &it);
    assert(node != NULL);

    for (sibranch *b = node->branch; b != NULL; b = b->next) {
        if (si_readcommited_branch(r, b, v))
            return 1;
    }
    return si_readcommited_branch(r, &node->self, v);
}
/*
 * Try to move transaction x into the prepared state.
 *
 * Read-only transactions and transactions without write statements are
 * promoted to SX_PREPARE straight away. Otherwise the log is scanned up
 * to the first statement whose `lo` equals x->log_read:
 *   - an aborted statement                         -> SX_ROLLBACK
 *   - a committed predecessor with a larger csn    -> SX_ROLLBACK
 *   - an uncommitted predecessor without SVGET     -> SX_LOCK
 *   - no predecessor, or an uncommitted SVGET one  -> sx_preparecb();
 *     a non-zero result                            -> SX_ROLLBACK
 * The value returned is whatever sx_promote() returns for that state.
 */
sxstate
sx_prepare(sx *x, sxpreparef prepare, void *arg)
{
    uint64_t lsn = sr_seq(x->manager->seq, SR_LSN);

    /* proceed read-only transactions */
    if (x->type == SX_RO || sv_logcount_write(x->log) == 0)
        return sx_promote(x, SX_PREPARE);

    ssiter it;
    ss_iterinit(ss_bufiter, &it);
    ss_iteropen(ss_bufiter, &it, &x->log->buf, sizeof(svlogv));
    for (; ss_iterhas(ss_bufiter, &it); ss_iternext(ss_bufiter, &it)) {
        svlogv *lv = ss_iterof(ss_bufiter, &it);
        sxv *v = lv->ptr;
        if ((int)v->lo == x->log_read)
            break;
        if (sx_vaborted(v))
            return sx_promote(x, SX_ROLLBACK);

        sxv *prev = v->prev;
        if (ssunlikely(prev != NULL)) {
            if (sx_vcommitted(prev)) {
                if (prev->csn > x->csn)
                    return sx_promote(x, SX_ROLLBACK);
                continue;
            }
            /* only read-only conflicts may be force-committed */
            if (!(sv_vflags(prev->v, prev->index->r) & SVGET))
                return sx_promote(x, SX_LOCK);
        }

        sxstate rc = sx_preparecb(x, lv, lsn, prepare, arg);
        if (ssunlikely(rc != 0))
            return sx_promote(x, SX_ROLLBACK);
    }
    return sx_promote(x, SX_PREPARE);
}
/*
 * Roll transaction x back and return SXROLLBACK.
 *
 * If the transaction is already in SXCOMMIT ("half-commit" mode) the
 * values referenced by its log records are freed with sv_vfree() and
 * the state is promoted; sx_end() is not called on that path. In every
 * other state the whole log is rolled back through sx_rollback_svp()
 * and the transaction is ended.
 */
sxstate
sx_rollback(sx *x)
{
    sxmanager *m = x->manager;
    ssiter it;
    ss_iterinit(ss_bufiter, &it);
    ss_iteropen(ss_bufiter, &it, &x->log.buf, sizeof(svlogv));

    if (x->state != SXCOMMIT) {
        sx_rollback_svp(x, &it, 1);
        sx_promote(x, SXROLLBACK);
        sx_end(x);
        return SXROLLBACK;
    }

    /* support half-commit mode */
    while (ss_iterhas(ss_bufiter, &it)) {
        svlogv *lv = ss_iterof(ss_bufiter, &it);
        svv *committed = lv->v.v;
        sv_vfree(m->r, committed);
        ss_iternext(ss_bufiter, &it);
    }
    sx_promote(x, SXROLLBACK);
    return SXROLLBACK;
}
/*
 * Try to move transaction x into the prepared state.
 *
 * Read-only transactions and transactions without write statements are
 * promoted to SXPREPARE straight away. Otherwise the log is scanned up
 * to the first statement whose `lo` equals x->log_read:
 *   - an aborted statement                        -> SXROLLBACK
 *   - a committed predecessor with a larger csn   -> SXROLLBACK
 *   - an uncommitted predecessor without SVGET    -> SXLOCK
 *   - an uncommitted SVGET predecessor            -> statement accepted
 *   - no predecessor: the `prepare` callback (if any) runs when the
 *     current lsn differs from x->vlsn; non-zero  -> SXROLLBACK
 * The value returned is whatever sx_promote() returns for that state.
 */
sxstate
sx_prepare(sx *x, sxpreparef prepare, void *arg)
{
    uint64_t lsn = sr_seq(x->manager->r->seq, SR_LSN);

    /* proceed read-only transactions */
    if (x->type == SXRO || sv_logcount_write(&x->log) == 0)
        return sx_promote(x, SXPREPARE);

    ssiter it;
    ss_iterinit(ss_bufiter, &it);
    ss_iteropen(ss_bufiter, &it, &x->log.buf, sizeof(svlogv));
    for (; ss_iterhas(ss_bufiter, &it); ss_iternext(ss_bufiter, &it)) {
        svlogv *lv = ss_iterof(ss_bufiter, &it);
        sxv *v = lv->v.v;
        if ((int)v->lo == x->log_read)
            break;
        if (sx_vaborted(v))
            return sx_promote(x, SXROLLBACK);

        sxv *prev = v->prev;
        if (ssunlikely(prev != NULL)) {
            if (sx_vcommitted(prev)) {
                if (prev->csn > x->csn)
                    return sx_promote(x, SXROLLBACK);
                continue;
            }
            /* force commit for read-only conflicts */
            if (prev->v->flags & SVGET)
                continue;
            return sx_promote(x, SXLOCK);
        }

        /* no predecessor: consult the user callback when applicable */
        if (prepare && lsn != x->vlsn) {
            if (prepare(x, &lv->v, arg, v->index->ptr))
                return sx_promote(x, SXROLLBACK);
        }
    }
    return sx_promote(x, SXPREPARE);
}
/*
 * Search the in-memory indexes of node n for the query key.
 *
 * The priority index is consulted first (when it has entries); the
 * second index is consulted only if the first lookup produced nothing.
 * Returns 0 when neither index matches or when no version is visible
 * at q->vlsn; otherwise returns the result of si_getresult().
 */
static inline int
si_getindex(siread *q, sinode *n)
{
    svindex *second;
    svindex *first = si_nodeindex_priority(n, &second);

    ssiter it;
    ss_iterinit(sv_indexiter, &it);

    int found = 0;
    if (first->count > 0) {
        found = ss_iteropen(sv_indexiter, &it, q->r, first, SS_GTE,
                            q->key, q->keysize);
    }
    if (!found) {
        if (sslikely(second == NULL || !second->count))
            return 0;
        found = ss_iteropen(sv_indexiter, &it, q->r, second, SS_GTE,
                            q->key, q->keysize);
        if (!found)
            return 0;
    }

    si_readstat(q, 1, n, 1);
    sv *v = ss_iterof(sv_indexiter, &it);
    assert(v != NULL);

    svv *visible = v->v;
    if (sslikely(! q->has)) {
        visible = sv_visible(visible, q->vlsn);
        if (visible == NULL)
            return 0;
    }

    sv vret;
    sv_init(&vret, &sv_vif, visible, NULL);
    return si_getresult(q, &vret, 0);
}
/*
 * Search a single on-disk branch of node n for the query key.
 *
 * The compression settings come from the scheme: the root branch uses
 * compression / compression_if, every other branch uses the *_branch
 * variants. Returns the sd_read open result when it is <= 0, returns 0
 * when the read iterator yields no value, and otherwise returns the
 * result of si_getresult().
 */
static inline int
si_getbranch(siread *q, sinode *n, sibranch *b)
{
    sicachebranch *c = si_cachefollow(q->cache);
    assert(c->branch == b);

    /* choose compression type */
    int compression;
    ssfilterif *compression_if;
    if (si_branchis_root(b)) {
        compression    = q->index->scheme->compression;
        compression_if = q->index->scheme->compression_if;
    } else {
        compression    = q->index->scheme->compression_branch;
        compression_if = q->index->scheme->compression_branch_if;
    }

    sdreadarg arg = {
        .index           = &b->index,
        .buf             = &c->buf_a,
        .buf_xf          = &c->buf_b,
        .buf_read        = &q->index->readbuf,
        .index_iter      = &c->index_iter,
        .page_iter       = &c->page_iter,
        .use_memory      = n->in_memory,
        .use_mmap        = q->index->scheme->mmap,
        .use_mmap_copy   = 0,
        .use_compression = compression,
        .compression_if  = compression_if,
        .has             = q->has,
        .has_vlsn        = q->vlsn,
        .o               = SS_GTE,
        .mmap            = &n->map,
        .memory          = &b->copy,
        .file            = &n->file,
        .r               = q->r
    };
    ss_iterinit(sd_read, &c->i);
    int rc = ss_iteropen(sd_read, &c->i, &arg, q->key, q->keysize);
    si_readstat(q, 0, n, sd_read_stat(&c->i));
    if (ssunlikely(rc <= 0))
        return rc;

    /* prepare sources */
    sv_mergereset(&q->merge);
    sv_mergeadd(&q->merge, &c->i);
    ssiter merge_it;
    ss_iterinit(sv_mergeiter, &merge_it);
    ss_iteropen(sv_mergeiter, &merge_it, q->r, &q->merge, SS_GTE);

    /* a `has` query looks at every version */
    uint64_t vlsn = ssunlikely(q->has) ? UINT64_MAX : q->vlsn;

    ssiter read_it;
    ss_iterinit(sv_readiter, &read_it);
    ss_iteropen(sv_readiter, &read_it, q->r, &merge_it, &q->index->u, vlsn, 1);
    sv *v = ss_iterof(sv_readiter, &read_it);
    if (ssunlikely(v == NULL))
        return 0;
    return si_getresult(q, v, 1);
}

/*
 * Point lookup: find the node for the query key, try its in-memory
 * indexes first and then each branch on its branch list.
 *
 * Returns the first non-zero result of si_getindex()/si_getbranch(),
 * 2 when nothing was found in memory and q->cache_only is set, -1 when
 * cache validation fails, and 0 when the key is not found.
 */
static inline int
si_get(siread *q)
{
    ssiter it;
    ss_iterinit(si_iter, &it);
    ss_iteropen(si_iter, &it, q->r, q->index, SS_GTE, q->key, q->keysize);
    sinode *node = ss_iterof(si_iter, &it);
    assert(node != NULL);
    si_txtrack(q->x, node);

    /* search in memory */
    int rc = si_getindex(q, node);
    if (rc != 0)
        return rc;
    if (q->cache_only)
        return 2;

    rc = si_cachevalidate(q->cache, node);
    if (ssunlikely(rc == -1)) {
        sr_oom(q->r->e);
        return -1;
    }
    rc = sv_mergeprepare(&q->merge, q->r, 1);
    assert(rc == 0);

    /* search on disk */
    for (sibranch *b = node->branch; b != NULL; b = b->next) {
        rc = si_getbranch(q, node, b);
        if (rc != 0)
            return rc;
    }
    return 0;
}
/*
 * Split a merge stream into a series of new nodes.
 *
 * Each iteration of sd_merge() produces one node, which is created on
 * disk via si_nodecreate() and appended (as a sinode pointer) to
 * `result`.
 *
 * Returns 0 on success. On failure returns -1 after releasing the node
 * that was being built (if it had not yet been stored in `result`),
 * every node already stored in `result`, and the merge state.
 */
static inline int
si_split(si *index, sdc *c, ssbuf *result,
         sinode *parent, ssiter *i,
         uint64_t size_node, uint32_t size_stream, uint64_t vlsn)
{
    sr *r = index->r;
    int rc;
    sdmergeconf mergeconf = {
        .size_stream     = size_stream,
        .size_node       = size_node,
        .size_page       = index->scheme->node_page_size,
        .checksum        = index->scheme->node_page_checksum,
        .compression     = index->scheme->compression,
        .compression_key = index->scheme->compression_key,
        .offset          = 0,
        .vlsn            = vlsn,
        .save_delete     = 0,
        .save_update     = 0
    };
    /* node under construction; non-NULL only while `result` does not
     * own it yet, so the error path knows it must be freed separately */
    sinode *n = NULL;
    sdmerge merge;
    sd_mergeinit(&merge, r, i, &c->build, &c->update, &mergeconf);
    while ((rc = sd_merge(&merge)) > 0)
    {
        n = si_nodenew(r);
        if (ssunlikely(n == NULL))
            goto error;
        sdid id = {
            .parent = parent->self.id.id,
            .flags  = 0,
            .id     = sr_seq(index->r->seq, SR_NSNNEXT)
        };
        rc = sd_mergecommit(&merge, &id);
        if (ssunlikely(rc == -1))
            goto error;
        rc = si_nodecreate(n, r, index->scheme, &id, &merge.index, &c->build);
        if (ssunlikely(rc == -1))
            goto error;
        rc = ss_bufadd(result, index->r->a, &n, sizeof(sinode*));
        if (ssunlikely(rc == -1)) {
            sr_oom_malfunction(index->r->e);
            si_nodefree(n, r, 1);
            n = NULL;
            goto error;
        }
        /* ownership moved to `result` */
        n = NULL;
        sd_buildreset(&c->build);
    }
    if (ssunlikely(rc == -1))
        goto error;
    return 0;
error:
    /* a node that failed before reaching `result` would otherwise leak */
    if (n != NULL)
        si_nodefree(n, r, 0);
    si_splitfree(result, r);
    sd_mergefree(&merge);
    return -1;
}

/*
 * Compact `node` by merging `stream` into zero, one or many new nodes
 * and swapping them into the index.
 *
 * Steps, in order: split the stream into new nodes; under the index
 * lock replace/remove/insert nodes according to how many were produced;
 * sync (optional) and seal the new nodes; free the old node; complete
 * the new nodes; finally unlock them.
 *
 * Returns 0 on success and -1 on failure. Failures after the index has
 * been updated return without undoing that update.
 */
int si_compaction(si *index, sdc *c, uint64_t vlsn,
                  sinode *node,
                  ssiter *stream, uint32_t size_stream)
{
    sr *r = index->r;
    ssbuf *result = &c->a;
    ssiter i;

    /* begin compaction.
     *
     * split merge stream into a number
     * of new nodes.
     */
    int rc;
    rc = si_split(index, c, result,
                  node, stream,
                  index->scheme->node_size,
                  size_stream,
                  vlsn);
    if (ssunlikely(rc == -1))
        return -1;

    SS_INJECTION(r->i, SS_INJECTION_SI_COMPACTION_0,
                 si_splitfree(result, r);
                 sr_malfunction(r->e, "%s", "error injection");
                 return -1);

    /* mask removal of a single node as a
     * single node update */
    int count = ss_bufused(result) / sizeof(sinode*);
    int count_index;

    si_lock(index);
    count_index = index->n;
    si_unlock(index);

    sinode *n;
    if (ssunlikely(count == 0 && count_index == 1))
    {
        /* the index would become empty: create a replacement node */
        n = si_bootstrap(index, node->self.id.id);
        if (ssunlikely(n == NULL))
            return -1;
        rc = ss_bufadd(result, r->a, &n, sizeof(sinode*));
        if (ssunlikely(rc == -1)) {
            sr_oom_malfunction(r->e);
            si_nodefree(n, r, 1);
            return -1;
        }
        count++;
    }

    /* commit compaction changes */
    si_lock(index);
    svindex *j = si_nodeindex(node);
    si_plannerremove(&index->p, SI_COMPACT|SI_BRANCH, node);
    switch (count) {
    case 0: /* delete */
        si_remove(index, node);
        si_redistribute_index(index, r, c, node);
        uint32_t used = sv_indexused(j);
        if (used) {
            ss_quota(r->quota, SS_QREMOVE, used);
        }
        break;
    case 1: /* self update */
        n = *(sinode**)result->s;
        /* the new node takes over the old node's in-memory index */
        n->i0 = *j;
        n->used = sv_indexused(j);
        si_nodelock(n);
        si_replace(index, node, n);
        si_plannerupdate(&index->p, SI_COMPACT|SI_BRANCH, n);
        break;
    default: /* split */
        rc = si_redistribute(index, r, c, node, result);
        if (ssunlikely(rc == -1)) {
            si_unlock(index);
            si_splitfree(result, r);
            return -1;
        }
        ss_iterinit(ss_bufiterref, &i);
        ss_iteropen(ss_bufiterref, &i, result, sizeof(sinode*));
        /* the first new node replaces the old one ... */
        n = ss_iterof(ss_bufiterref, &i);
        n->used = sv_indexused(&n->i0);
        si_nodelock(n);
        si_replace(index, node, n);
        si_plannerupdate(&index->p, SI_COMPACT|SI_BRANCH, n);
        /* ... the remaining ones are inserted */
        for (ss_iternext(ss_bufiterref, &i); ss_iterhas(ss_bufiterref, &i);
             ss_iternext(ss_bufiterref, &i)) {
            n = ss_iterof(ss_bufiterref, &i);
            n->used = sv_indexused(&n->i0);
            si_nodelock(n);
            si_insert(index, n);
            si_plannerupdate(&index->p, SI_COMPACT|SI_BRANCH, n);
        }
        break;
    }
    /* re-initialize the old node's index in place */
    sv_indexinit(j);
    si_unlock(index);

    /* compaction completion */

    /* seal nodes */
    ss_iterinit(ss_bufiterref, &i);
    ss_iteropen(ss_bufiterref, &i, result, sizeof(sinode*));
    while (ss_iterhas(ss_bufiterref, &i))
    {
        n = ss_iterof(ss_bufiterref, &i);
        if (index->scheme->sync) {
            rc = si_nodesync(n, r);
            if (ssunlikely(rc == -1))
                return -1;
        }
        rc = si_nodeseal(n, r, index->scheme);
        if (ssunlikely(rc == -1))
            return -1;
        SS_INJECTION(r->i, SS_INJECTION_SI_COMPACTION_3,
                     si_nodefree(node, r, 0);
                     sr_malfunction(r->e, "%s", "error injection");
                     return -1);
        ss_iternext(ss_bufiterref, &i);
    }

    SS_INJECTION(r->i, SS_INJECTION_SI_COMPACTION_1,
                 si_nodefree(node, r, 0);
                 sr_malfunction(r->e, "%s", "error injection");
                 return -1);

    /* gc old node */
    rc = si_nodefree(node, r, 1);
    if (ssunlikely(rc == -1))
        return -1;

    SS_INJECTION(r->i, SS_INJECTION_SI_COMPACTION_2,
                 sr_malfunction(r->e, "%s", "error injection");
                 return -1);

    /* complete new nodes */
    ss_iterinit(ss_bufiterref, &i);
    ss_iteropen(ss_bufiterref, &i, result, sizeof(sinode*));
    while (ss_iterhas(ss_bufiterref, &i))
    {
        n = ss_iterof(ss_bufiterref, &i);
        rc = si_nodecomplete(n, r, index->scheme);
        if (ssunlikely(rc == -1))
            return -1;
        SS_INJECTION(r->i, SS_INJECTION_SI_COMPACTION_4,
                     sr_malfunction(r->e, "%s", "error injection");
                     return -1);
        ss_iternext(ss_bufiterref, &i);
    }

    /* unlock */
    si_lock(index);
    ss_iterinit(ss_bufiterref, &i);
    ss_iteropen(ss_bufiterref, &i, result, sizeof(sinode*));
    while (ss_iterhas(ss_bufiterref, &i))
    {
        n = ss_iterof(ss_bufiterref, &i);
        si_nodeunlock(n);
        ss_iternext(ss_bufiterref, &i);
    }
    si_unlock(index);
    return 0;
}
/*
 * Test: sd_read with order SS_GT and a NULL key over a single
 * uncompressed page (keys 7, 8, 9) accessed through mmap.
 *
 * The page and its index are written to "./0000.db", the file is
 * mapped, and the iterator is expected to yield 7, 8, 9 and then end.
 */
static void sd_read_gt0(void) {
    sdbuild b;
    sd_buildinit(&b);
    t( sd_buildbegin(&b, &st_r.r, 1, 0, 0, 0, NULL) == 0);
    int key = 7;
    addv(&b, &st_r.r, 3, 0, &key);
    key = 8;
    addv(&b, &st_r.r, 4, 0, &key);
    key = 9;
    addv(&b, &st_r.r, 5, 0, &key);
    sd_buildend(&b, &st_r.r);
    sdindex index;
    sd_indexinit(&index);
    t( sd_indexbegin(&index, &st_r.r) == 0 );
    int rc;
    /* the page is registered at offset sizeof(sdseal) */
    rc = sd_indexadd(&index, &st_r.r, &b, sizeof(sdseal));
    t( rc == 0 );
    sdid id;
    memset(&id, 0, sizeof(id));
    /* write seal, page and index to the file */
    ssfile f;
    ss_fileinit(&f, &st_r.vfs);
    t( ss_filenew(&f, "./0000.db") == 0 );
    t( sd_writeseal(&st_r.r, &f, NULL) == 0 );
    t( sd_writepage(&st_r.r, &f, NULL, &b) == 0 );
    t( sd_indexcommit(&index, &st_r.r, &id, NULL, f.size) == 0 );
    t( sd_writeindex(&st_r.r, &f, NULL, &index) == 0 );
    t( sd_seal(&st_r.r, &f, NULL, &index, 0) == 0 );
    ssmmap map;
    t( ss_vfsmmap(&st_r.vfs, &map, f.fd, f.size, 1) == 0 );
    ssbuf buf;
    ss_bufinit(&buf);
    ssbuf xfbuf;
    ss_bufinit(&xfbuf);
    t( ss_bufensure(&xfbuf, &st_r.a, 1024) == 0 );
    ssiter index_iter;
    ssiter page_iter;
    /* read through the mapping, no compression */
    sdreadarg arg = {
        .index = &index, .buf = &buf, .buf_xf = &xfbuf, .buf_read = NULL,
        .index_iter = &index_iter, .page_iter = &page_iter,
        .mmap = &map, .memory = NULL, .file = NULL, .o = SS_GT,
        .use_memory = 0, .use_mmap = 1, .use_mmap_copy = 0,
        .use_compression = 0, .compression_if = NULL,
        .has = 0, .has_vlsn = 0, .r = &st_r.r };
    ssiter it;
    ss_iterinit(sd_read, &it);
    ss_iteropen(sd_read, &it, &arg, NULL, 0);
    t( ss_iteratorhas(&it) == 1 );
    sv *v = ss_iteratorof(&it);
    t( *(int*)sv_field(v, &st_r.r, 0, NULL) == 7);
    ss_iteratornext(&it);
    v = ss_iteratorof(&it);
    t( *(int*)sv_field(v, &st_r.r, 0, NULL) == 8);
    ss_iteratornext(&it);
    v = ss_iteratorof(&it);
    t( *(int*)sv_field(v, &st_r.r, 0, NULL) == 9);
    ss_iteratornext(&it);
    t( ss_iteratorhas(&it) == 0 );
    ss_iteratorclose(&it);
    /* cleanup */
    ss_fileclose(&f);
    t( ss_vfsmunmap(&st_r.vfs, &map) == 0 );
    t( ss_vfsunlink(&st_r.vfs, "./0000.db") == 0 );
    sd_indexfree(&index, &st_r.r);
    sd_buildfree(&b, &st_r.r);
    ss_buffree(&xfbuf, &st_r.a);
    ss_buffree(&buf, &st_r.a);
}

/*
 * Test: sd_read with order SS_GT and a NULL key over three uncompressed
 * pages (keys 7,8,9 / 10,11,13 / 15,18,20) accessed through mmap.
 *
 * Each page is written at the current file size and registered in the
 * index at that offset; the iterator must cross page boundaries and
 * yield all nine keys in order.
 */
static void sd_read_gt1(void) {
    ssfile f;
    ss_fileinit(&f, &st_r.vfs);
    t( ss_filenew(&f, "./0000.db") == 0 );
    t( sd_writeseal(&st_r.r, &f, NULL) == 0 );
    /* page 0 */
    sdbuild b;
    sd_buildinit(&b);
    t( sd_buildbegin(&b, &st_r.r, 1, 0, 0, 0, NULL) == 0);
    int key = 7;
    addv(&b, &st_r.r, 3, 0, &key);
    key = 8;
    addv(&b, &st_r.r, 4, 0, &key);
    key = 9;
    addv(&b, &st_r.r, 5, 0, &key);
    sd_buildend(&b, &st_r.r);
    uint64_t poff = f.size;
    t( sd_writepage(&st_r.r, &f, NULL, &b) == 0 );
    sdindex index;
    sd_indexinit(&index);
    t( sd_indexbegin(&index, &st_r.r) == 0 );
    int rc;
    rc = sd_indexadd(&index, &st_r.r, &b, poff);
    t( rc == 0 );
    t( sd_buildcommit(&b, &st_r.r) == 0 );
    /* page 1 */
    t( sd_buildbegin(&b, &st_r.r, 1, 0, 0, 0, NULL) == 0);
    key = 10;
    addv(&b, &st_r.r, 6, 0, &key);
    key = 11;
    addv(&b, &st_r.r, 7, 0, &key);
    key = 13;
    addv(&b, &st_r.r, 8, 0, &key);
    sd_buildend(&b, &st_r.r);
    poff = f.size;
    t( sd_writepage(&st_r.r, &f, NULL, &b) == 0 );
    rc = sd_indexadd(&index, &st_r.r, &b, poff);
    t( rc == 0 );
    t( sd_buildcommit(&b, &st_r.r) == 0 );
    /* page 2 */
    t( sd_buildbegin(&b, &st_r.r, 1, 0, 0, 0, NULL) == 0);
    key = 15;
    addv(&b, &st_r.r, 9, 0, &key);
    key = 18;
    addv(&b, &st_r.r, 10, 0, &key);
    key = 20;
    addv(&b, &st_r.r, 11, 0, &key);
    sd_buildend(&b, &st_r.r);
    poff = f.size;
    t( sd_writepage(&st_r.r, &f, NULL, &b) == 0 );
    rc = sd_indexadd(&index, &st_r.r, &b, poff);
    t( rc == 0 );
    t( sd_buildcommit(&b, &st_r.r) == 0 );
    /* finish the index and seal the file */
    sdid id;
    memset(&id, 0, sizeof(id));
    t( sd_indexcommit(&index, &st_r.r, &id, NULL, f.size) == 0 );
    t( sd_writeindex(&st_r.r, &f, NULL, &index) == 0 );
    t( sd_seal(&st_r.r, &f, NULL, &index, 0) == 0 );
    ssmmap map;
    t( ss_vfsmmap(&st_r.vfs, &map, f.fd, f.size, 1) == 0 );
    ssbuf buf;
    ss_bufinit(&buf);
    ssbuf xfbuf;
    ss_bufinit(&xfbuf);
    t( ss_bufensure(&xfbuf, &st_r.a, 1024) == 0 );
    ssiter index_iter;
    ssiter page_iter;
    sdreadarg arg = {
        .index = &index, .buf = &buf, .buf_xf = &xfbuf, .buf_read = NULL,
        .index_iter = &index_iter, .page_iter = &page_iter,
        .mmap = &map, .memory = NULL, .file = NULL, .o = SS_GT,
        .use_memory = 0, .use_mmap = 1, .use_mmap_copy = 0,
        .use_compression = 0, .compression_if = NULL,
        .has = 0, .has_vlsn = 0, .r = &st_r.r };
    ssiter it;
    ss_iterinit(sd_read, &it);
    ss_iteropen(sd_read, &it, &arg, NULL, 0);
    t( ss_iteratorhas(&it) == 1 );
    /* page 0 */
    t( ss_iteratorhas(&it) != 0 );
    sv *v = ss_iteratorof(&it);
    t( *(int*)sv_field(v, &st_r.r, 0, NULL) == 7);
    ss_iteratornext(&it);
    v = ss_iteratorof(&it);
    t( *(int*)sv_field(v, &st_r.r, 0, NULL) == 8);
    ss_iteratornext(&it);
    v = ss_iteratorof(&it);
    t( *(int*)sv_field(v, &st_r.r, 0, NULL) == 9);
    ss_iteratornext(&it);
    /* page 1 */
    v = ss_iteratorof(&it);
    t( *(int*)sv_field(v, &st_r.r, 0, NULL) == 10);
    ss_iteratornext(&it);
    v = ss_iteratorof(&it);
    t( *(int*)sv_field(v, &st_r.r, 0, NULL) == 11);
    ss_iteratornext(&it);
    v = ss_iteratorof(&it);
    t( *(int*)sv_field(v, &st_r.r, 0, NULL) == 13);
    ss_iteratornext(&it);
    /* page 2 */
    v = ss_iteratorof(&it);
    t( *(int*)sv_field(v, &st_r.r, 0, NULL) == 15);
    ss_iteratornext(&it);
    v = ss_iteratorof(&it);
    t( *(int*)sv_field(v, &st_r.r, 0, NULL) == 18);
    ss_iteratornext(&it);
    v = ss_iteratorof(&it);
    t( *(int*)sv_field(v, &st_r.r, 0, NULL) == 20);
    ss_iteratornext(&it);
    t( ss_iteratorhas(&it) == 0 );
    ss_iteratorclose(&it);
    /* cleanup */
    ss_fileclose(&f);
    t( ss_vfsmunmap(&st_r.vfs, &map) == 0 );
    t( ss_vfsunlink(&st_r.vfs, "./0000.db") == 0 );
    sd_indexfree(&index, &st_r.r);
    sd_buildfree(&b, &st_r.r);
    ss_buffree(&xfbuf, &st_r.a);
    ss_buffree(&buf, &st_r.a);
}

/*
 * Test: same scenario as the single-page SS_GT read (keys 7, 8, 9), but
 * the page is built and read with ss_zstdfilter compression and the
 * test sets up its own allocator, vfs, scheme ("key" u32 key field plus
 * "value" string field) and sr runtime.
 *
 * NOTE(review): the file is created and unlinked through the local
 * `vfs`, while mmap/munmap go through `st_r.vfs` -- presumably both are
 * the standard vfs; confirm this mix is intended.
 */
static void sd_read_gt0_compression_zstd(void) {
    /* private runtime environment */
    ssa a;
    ss_aopen(&a, &ss_stda);
    ssa aref;
    ss_aopen(&aref, &ss_stda);
    ssvfs vfs;
    ss_vfsinit(&vfs, &ss_stdvfs);
    sfscheme cmp;
    sf_schemeinit(&cmp);
    sffield *field = sf_fieldnew(&a, "key");
    t( sf_fieldoptions(field, &a, "u32,key(0)") == 0 );
    t( sf_schemeadd(&cmp, &a, field) == 0 );
    field = sf_fieldnew(&a, "value");
    t( sf_fieldoptions(field, &a, "string") == 0 );
    t( sf_schemeadd(&cmp, &a, field) == 0 );
    t( sf_schemevalidate(&cmp, &a) == 0 );
    ssinjection ij;
    memset(&ij, 0, sizeof(ij));
    srstat stat;
    memset(&stat, 0, sizeof(stat));
    srerror error;
    sr_errorinit(&error);
    srseq seq;
    sr_seqinit(&seq);
    sscrcf crc = ss_crc32c_function();
    sr r;
    sr_init(&r, NULL, &error, &a, &aref, &vfs, NULL, NULL, &seq, SF_RAW, NULL, &cmp, &ij, &stat, crc);
    /* build one compressed page */
    sdbuild b;
    sd_buildinit(&b);
    t( sd_buildbegin(&b, &r, 1, 0, 0, 1, &ss_zstdfilter) == 0);
    int key = 7;
    addv(&b, &r, 3, 0, &key);
    key = 8;
    addv(&b, &r, 4, 0, &key);
    key = 9;
    addv(&b, &r, 5, 0, &key);
    t( sd_buildend(&b, &r) == 0 );
    sdindex index;
    sd_indexinit(&index);
    t( sd_indexbegin(&index, &r) == 0 );
    int rc;
    rc = sd_indexadd(&index, &r, &b, sizeof(sdseal));
    t( rc == 0 );
    sdid id;
    memset(&id, 0, sizeof(id));
    /* write seal, page and index to the file */
    ssfile f;
    ss_fileinit(&f, &vfs);
    t( ss_filenew(&f, "./0000.db") == 0 );
    t( sd_writeseal(&r, &f, NULL) == 0 );
    t( sd_writepage(&r, &f, NULL, &b) == 0 );
    t( sd_indexcommit(&index, &r, &id, NULL, f.size) == 0 );
    t( sd_writeindex(&r, &f, NULL, &index) == 0 );
    t( sd_seal(&r, &f, NULL, &index, 0) == 0 );
    t( sd_buildcommit(&b, &r) == 0 );
    ssmmap map;
    t( ss_vfsmmap(&st_r.vfs, &map, f.fd, f.size, 1) == 0 );
    ssbuf buf;
    ss_bufinit(&buf);
    ssbuf xfbuf;
    ss_bufinit(&xfbuf);
    t( ss_bufensure(&xfbuf, &a, 1024) == 0 );
    ssiter index_iter;
    ssiter page_iter;
    /* read through the mapping with zstd decompression */
    sdreadarg arg = {
        .index = &index, .buf = &buf, .buf_xf = &xfbuf, .buf_read = NULL,
        .index_iter = &index_iter, .page_iter = &page_iter,
        .mmap = &map, .memory = NULL, .file = NULL, .o = SS_GT,
        .use_memory = 0, .use_mmap = 1, .use_mmap_copy = 0,
        .use_compression = 1, .compression_if = &ss_zstdfilter,
        .has = 0, .has_vlsn = 0, .r = &r };
    ssiter it;
    ss_iterinit(sd_read, &it);
    ss_iteropen(sd_read, &it, &arg, NULL, 0);
    t( ss_iteratorhas(&it) == 1 );
    sv *v = ss_iteratorof(&it);
    t( *(int*)sv_field(v, &r, 0, NULL) == 7);
    ss_iteratornext(&it);
    v = ss_iteratorof(&it);
    t( *(int*)sv_field(v, &r, 0, NULL) == 8);
    ss_iteratornext(&it);
    v = ss_iteratorof(&it);
    t( *(int*)sv_field(v, &r, 0, NULL) == 9);
    ss_iteratornext(&it);
    t( ss_iteratorhas(&it) == 0 );
    ss_iteratorclose(&it);
    /* cleanup */
    ss_fileclose(&f);
    t( ss_vfsmunmap(&st_r.vfs, &map) == 0 );
    t( ss_vfsunlink(&vfs, "./0000.db") == 0 );
    sd_indexfree(&index, &r);
    sd_buildfree(&b, &r);
    ss_buffree(&xfbuf, &a);
    ss_buffree(&buf, &a);
    sf_schemefree(&cmp, &a);
}

/*
 * Test: single-page SS_GT read (keys 7, 8, 9) with ss_lz4filter
 * compression, using a private allocator, vfs, scheme and sr runtime.
 *
 * NOTE(review): sd_indexcommit() is called twice here -- once with size
 * 0 before the file exists and once with f.size after the page is
 * written; the sibling zstd test commits only once. Confirm the first
 * call is intentional.
 * NOTE(review): mmap/munmap use `st_r.vfs` while the file itself uses
 * the local `vfs` -- confirm this mix is intended.
 */
static void sd_read_gt0_compression_lz4(void) {
    /* private runtime environment */
    ssa a;
    ss_aopen(&a, &ss_stda);
    ssa aref;
    ss_aopen(&aref, &ss_stda);
    ssvfs vfs;
    ss_vfsinit(&vfs, &ss_stdvfs);
    sfscheme cmp;
    sf_schemeinit(&cmp);
    sffield *field = sf_fieldnew(&a, "key");
    t( sf_fieldoptions(field, &a, "u32,key(0)") == 0 );
    t( sf_schemeadd(&cmp, &a, field) == 0 );
    field = sf_fieldnew(&a, "value");
    t( sf_fieldoptions(field, &a, "string") == 0 );
    t( sf_schemeadd(&cmp, &a, field) == 0 );
    t( sf_schemevalidate(&cmp, &a) == 0 );
    ssinjection ij;
    memset(&ij, 0, sizeof(ij));
    srstat stat;
    memset(&stat, 0, sizeof(stat));
    srerror error;
    sr_errorinit(&error);
    srseq seq;
    sr_seqinit(&seq);
    sscrcf crc = ss_crc32c_function();
    sr r;
    sr_init(&r, NULL, &error, &a, &aref, &vfs, NULL, NULL, &seq, SF_RAW, NULL, &cmp, &ij, &stat, crc);
    /* build one compressed page */
    sdbuild b;
    sd_buildinit(&b);
    t( sd_buildbegin(&b, &r, 1, 0, 0, 1, &ss_lz4filter) == 0);
    int key = 7;
    addv(&b, &r, 3, 0, &key);
    key = 8;
    addv(&b, &r, 4, 0, &key);
    key = 9;
    addv(&b, &r, 5, 0, &key);
    t( sd_buildend(&b, &r) == 0 );
    sdindex index;
    sd_indexinit(&index);
    t( sd_indexbegin(&index, &r) == 0 );
    int rc;
    rc = sd_indexadd(&index, &r, &b, sizeof(sdseal));
    t( rc == 0 );
    sdid id;
    memset(&id, 0, sizeof(id));
    t( sd_indexcommit(&index, &r, &id, NULL, 0) == 0 );
    /* write seal, page and index to the file */
    ssfile f;
    ss_fileinit(&f, &vfs);
    t( ss_filenew(&f, "./0000.db") == 0 );
    t( sd_writeseal(&r, &f, NULL) == 0 );
    t( sd_writepage(&r, &f, NULL, &b) == 0 );
    t( sd_indexcommit(&index, &r, &id, NULL, f.size) == 0 );
    t( sd_writeindex(&r, &f, NULL, &index) == 0 );
    t( sd_seal(&r, &f, NULL, &index, 0) == 0 );
    ssmmap map;
    t( ss_vfsmmap(&st_r.vfs, &map, f.fd, f.size, 1) == 0 );
    t( sd_buildcommit(&b, &r) == 0 );
    ssbuf buf;
    ss_bufinit(&buf);
    ssbuf xfbuf;
    ss_bufinit(&xfbuf);
    t( ss_bufensure(&xfbuf, &a, 1024) == 0 );
    ssiter index_iter;
    ssiter page_iter;
    /* read through the mapping with lz4 decompression */
    sdreadarg arg = {
        .index = &index, .buf = &buf, .buf_xf = &xfbuf, .buf_read = NULL,
        .index_iter = &index_iter, .page_iter = &page_iter,
        .mmap = &map, .memory = NULL, .file = NULL, .o = SS_GT,
        .use_memory = 0, .use_mmap = 1, .use_mmap_copy = 0,
        .use_compression = 1, .compression_if = &ss_lz4filter,
        .has = 0, .has_vlsn = 0, .r = &r };
    ssiter it;
    ss_iterinit(sd_read, &it);
    ss_iteropen(sd_read, &it, &arg, NULL, 0);
    t( ss_iteratorhas(&it) == 1 );
    sv *v = ss_iteratorof(&it);
    t( *(int*)sv_field(v, &r, 0, NULL) == 7);
    ss_iteratornext(&it);
    v = ss_iteratorof(&it);
    t( *(int*)sv_field(v, &r, 0, NULL) == 8);
    ss_iteratornext(&it);
    v = ss_iteratorof(&it);
    t( *(int*)sv_field(v, &r, 0, NULL) == 9);
    ss_iteratornext(&it);
    t( ss_iteratorhas(&it) == 0 );
    ss_iteratorclose(&it);
    /* cleanup */
    ss_fileclose(&f);
    t( ss_vfsmunmap(&st_r.vfs, &map) == 0 );
    t( ss_vfsunlink(&vfs, "./0000.db") == 0 );
    sd_indexfree(&index, &r);
    sd_buildfree(&b, &r);
    ss_buffree(&xfbuf, &a);
    ss_buffree(&buf, &a);
    sf_schemefree(&cmp, &a);
}

/*
 * Test: three-page SS_GT read (keys 7,8,9 / 10,11,13 / 15,18,20) with
 * ss_zstdfilter compression, using a private allocator, vfs, scheme and
 * sr runtime. The builder is reset with sd_buildreset() between pages.
 *
 * NOTE(review): mmap/munmap use `st_r.vfs` while the file itself uses
 * the local `vfs` -- confirm this mix is intended.
 */
static void sd_read_gt1_compression_zstd(void) {
    /* private runtime environment */
    ssa a;
    ss_aopen(&a, &ss_stda);
    ssa aref;
    ss_aopen(&aref, &ss_stda);
    ssvfs vfs;
    ss_vfsinit(&vfs, &ss_stdvfs);
    sfscheme cmp;
    sf_schemeinit(&cmp);
    sffield *field = sf_fieldnew(&a, "key");
    t( sf_fieldoptions(field, &a, "u32,key(0)") == 0 );
    t( sf_schemeadd(&cmp, &a, field) == 0 );
    field = sf_fieldnew(&a, "value");
    t( sf_fieldoptions(field, &a, "string") == 0 );
    t( sf_schemeadd(&cmp, &a, field) == 0 );
    t( sf_schemevalidate(&cmp, &a) == 0 );
    ssinjection ij;
    memset(&ij, 0, sizeof(ij));
    srstat stat;
    memset(&stat, 0, sizeof(stat));
    srerror error;
    sr_errorinit(&error);
    srseq seq;
    sr_seqinit(&seq);
    sscrcf crc = ss_crc32c_function();
    sr r;
    sr_init(&r, NULL, &error, &a, &aref, &vfs, NULL, NULL, &seq, SF_RAW, NULL, &cmp, &ij, &stat, crc);
    ssfile f;
    ss_fileinit(&f, &vfs);
    t( ss_filenew(&f, "./0000.db") == 0 );
    t( sd_writeseal(&r, &f, NULL) == 0 );
    /* page 0 */
    sdbuild b;
    sd_buildinit(&b);
    t( sd_buildbegin(&b, &r, 1, 0, 0, 1, &ss_zstdfilter) == 0);
    int key = 7;
    addv(&b, &r, 3, 0, &key);
    key = 8;
    addv(&b, &r, 4, 0, &key);
    key = 9;
    addv(&b, &r, 5, 0, &key);
    sd_buildend(&b, &r);
    uint64_t poff = f.size;
    t( sd_writepage(&r, &f, NULL, &b) == 0 );
    sdindex index;
    sd_indexinit(&index);
    t( sd_indexbegin(&index, &r) == 0 );
    int rc;
    rc = sd_indexadd(&index, &r, &b, poff);
    t( rc == 0 );
    t( sd_buildcommit(&b, &r) == 0 );
    sd_buildreset(&b, &r);
    /* page 1 */
    t( sd_buildbegin(&b, &r, 1, 0, 0, 1, &ss_zstdfilter) == 0);
    key = 10;
    addv(&b, &r, 6, 0, &key);
    key = 11;
    addv(&b, &r, 7, 0, &key);
    key = 13;
    addv(&b, &r, 8, 0, &key);
    sd_buildend(&b, &r);
    poff = f.size;
    t( sd_writepage(&r, &f, NULL, &b) == 0 );
    rc = sd_indexadd(&index, &r, &b, poff);
    t( rc == 0 );
    t( sd_buildcommit(&b, &r) == 0 );
    sd_buildreset(&b, &r);
    /* page 2 */
    t( sd_buildbegin(&b, &r, 1, 0, 0, 1, &ss_zstdfilter) == 0);
    key = 15;
    addv(&b, &r, 9, 0, &key);
    key = 18;
    addv(&b, &r, 10, 0, &key);
    key = 20;
    addv(&b, &r, 11, 0, &key);
    sd_buildend(&b, &r);
    poff = f.size;
    t( sd_writepage(&r, &f, NULL, &b) == 0 );
    rc = sd_indexadd(&index, &r, &b, poff);
    t( rc == 0 );
    t( sd_buildcommit(&b, &r) == 0 );
    /* finish the index and seal the file */
    sdid id;
    memset(&id, 0, sizeof(id));
    t( sd_indexcommit(&index, &r, &id, NULL, f.size) == 0 );
    t( sd_writeindex(&r, &f, NULL, &index) == 0 );
    t( sd_seal(&r, &f, NULL, &index, 0) == 0 );
    ssmmap map;
    t( ss_vfsmmap(&st_r.vfs, &map, f.fd, f.size, 1) == 0 );
    ssbuf buf;
    ss_bufinit(&buf);
    ssbuf xfbuf;
    ss_bufinit(&xfbuf);
    t( ss_bufensure(&xfbuf, &a, 1024) == 0 );
    ssiter index_iter;
    ssiter page_iter;
    sdreadarg arg = {
        .index = &index, .buf = &buf, .buf_xf = &xfbuf, .buf_read = NULL,
        .index_iter = &index_iter, .page_iter = &page_iter,
        .mmap = &map, .memory = NULL, .file = NULL, .o = SS_GT,
        .use_memory = 0, .use_mmap = 1, .use_mmap_copy = 0,
        .use_compression = 1, .compression_if = &ss_zstdfilter,
        .has = 0, .has_vlsn = 0, .r = &r };
    ssiter it;
    ss_iterinit(sd_read, &it);
    ss_iteropen(sd_read, &it, &arg, NULL, 0);
    t( ss_iteratorhas(&it) == 1 );
    /* page 0 */
    t( ss_iteratorhas(&it) != 0 );
    sv *v = ss_iteratorof(&it);
    t( *(int*)sv_field(v, &r, 0, NULL) == 7);
    ss_iteratornext(&it);
    v = ss_iteratorof(&it);
    t( *(int*)sv_field(v, &r, 0, NULL) == 8);
    ss_iteratornext(&it);
    v = ss_iteratorof(&it);
    t( *(int*)sv_field(v, &r, 0, NULL) == 9);
    ss_iteratornext(&it);
    /* page 1 */
    v = ss_iteratorof(&it);
    t( *(int*)sv_field(v, &r, 0, NULL) == 10);
    ss_iteratornext(&it);
    v = ss_iteratorof(&it);
    t( *(int*)sv_field(v, &r, 0, NULL) == 11);
    ss_iteratornext(&it);
    v = ss_iteratorof(&it);
    t( *(int*)sv_field(v, &r, 0, NULL) == 13);
    ss_iteratornext(&it);
    /* page 2 */
    v = ss_iteratorof(&it);
    t( *(int*)sv_field(v, &r, 0, NULL) == 15);
    ss_iteratornext(&it);
    v = ss_iteratorof(&it);
    t( *(int*)sv_field(v, &r, 0, NULL) == 18);
    ss_iteratornext(&it);
    v = ss_iteratorof(&it);
    t( *(int*)sv_field(v, &r, 0, NULL) == 20);
    ss_iteratornext(&it);
    t( ss_iteratorhas(&it) == 0 );
    ss_iteratorclose(&it);
    /* cleanup */
    ss_fileclose(&f);
    t( ss_vfsmunmap(&st_r.vfs, &map) == 0 );
    t( ss_vfsunlink(&vfs, "./0000.db") == 0 );
    sd_indexfree(&index, &r);
    sd_buildfree(&b, &r);
    ss_buffree(&buf, &a);
    ss_buffree(&xfbuf, &a);
    sf_schemefree(&cmp, &a);
}

static void sd_read_gt1_compression_lz4(void) {
    ssa a;
    ss_aopen(&a, &ss_stda);
    ssa aref;
    ss_aopen(&aref, &ss_stda);
    ssvfs vfs;
    ss_vfsinit(&vfs, &ss_stdvfs);
    sfscheme cmp;
    sf_schemeinit(&cmp);
    sffield *field = sf_fieldnew(&a, "key");
    t( sf_fieldoptions(field, &a, "u32,key(0)") == 0 );
    t( sf_schemeadd(&cmp, &a, field) == 0 );
    field = sf_fieldnew(&a, "value");
    t( sf_fieldoptions(field, &a, "string") == 0 );
    t( sf_schemeadd(&cmp, &a, field) == 0 );
    t( sf_schemevalidate(&cmp, &a) == 0 );
    ssinjection ij;
    memset(&ij, 0, sizeof(ij));
    srstat stat;
    memset(&stat, 0, sizeof(stat));
    srerror error;
    sr_errorinit(&error);
    srseq seq;
    sr_seqinit(&seq);
    sscrcf crc = ss_crc32c_function();
    sr r;
    sr_init(&r, NULL, &error, &a, &aref, &vfs, NULL, NULL, &seq, SF_RAW, NULL, &cmp, &ij, &stat, crc);
    ssfile f;
    ss_fileinit(&f, &vfs);
    t( ss_filenew(&f, "./0000.db") == 0 );
    t( sd_writeseal(&r, &f, NULL) == 0 );
    sdbuild b;
    sd_buildinit(&b);
    t( sd_buildbegin(&b, &r, 1,
0, 0, 1, &ss_lz4filter) == 0); int key = 7; addv(&b, &r, 3, 0, &key); key = 8; addv(&b, &r, 4, 0, &key); key = 9; addv(&b, &r, 5, 0, &key); sd_buildend(&b, &r); uint64_t poff = f.size; t( sd_writepage(&r, &f, NULL, &b) == 0 ); sdindex index; sd_indexinit(&index); t( sd_indexbegin(&index, &r) == 0 ); int rc; rc = sd_indexadd(&index, &r, &b, poff); t( rc == 0 ); t( sd_buildcommit(&b, &r) == 0 ); sd_buildreset(&b, &r); t( sd_buildbegin(&b, &r, 1, 0, 0, 1, &ss_lz4filter) == 0); key = 10; addv(&b, &r, 6, 0, &key); key = 11; addv(&b, &r, 7, 0, &key); key = 13; addv(&b, &r, 8, 0, &key); sd_buildend(&b, &r); poff = f.size; t( sd_writepage(&r, &f, NULL, &b) == 0 ); rc = sd_indexadd(&index, &r, &b, poff); t( rc == 0 ); t( sd_buildcommit(&b, &r) == 0 ); sd_buildreset(&b, &r); t( sd_buildbegin(&b, &r, 1, 0, 0, 1, &ss_lz4filter) == 0); key = 15; addv(&b, &r, 9, 0, &key); key = 18; addv(&b, &r, 10, 0, &key); key = 20; addv(&b, &r, 11, 0, &key); sd_buildend(&b, &r); poff = f.size; t( sd_writepage(&r, &f, NULL, &b) == 0 ); rc = sd_indexadd(&index, &r, &b, poff); t( rc == 0 ); t( sd_buildcommit(&b, &r) == 0 ); sdid id; memset(&id, 0, sizeof(id)); t( sd_indexcommit(&index, &r, &id, NULL, f.size) == 0 ); t( sd_writeindex(&r, &f, NULL, &index) == 0 ); t( sd_seal(&r, &f, NULL, &index, 0) == 0 ); ssmmap map; t( ss_vfsmmap(&st_r.vfs, &map, f.fd, f.size, 1) == 0 ); ssbuf buf; ss_bufinit(&buf); ssbuf xfbuf; ss_bufinit(&xfbuf); t( ss_bufensure(&xfbuf, &a, 1024) == 0 ); ssiter index_iter; ssiter page_iter; sdreadarg arg = { .index = &index, .buf = &buf, .buf_xf = &xfbuf, .buf_read = NULL, .index_iter = &index_iter, .page_iter = &page_iter, .mmap = &map, .memory = NULL, .file = NULL, .o = SS_GT, .use_memory = 0, .use_mmap = 1, .use_mmap_copy = 0, .use_compression = 1, .compression_if = &ss_lz4filter, .has = 0, .has_vlsn = 0, .r = &r }; ssiter it; ss_iterinit(sd_read, &it); ss_iteropen(sd_read, &it, &arg, NULL, 0); t( ss_iteratorhas(&it) == 1 ); /* page 0 */ t( ss_iteratorhas(&it) != 0 ); sv 
*v = ss_iteratorof(&it); t( *(int*)sv_field(v, &r, 0, NULL) == 7); ss_iteratornext(&it); v = ss_iteratorof(&it); t( *(int*)sv_field(v, &r, 0, NULL) == 8); ss_iteratornext(&it); v = ss_iteratorof(&it); t( *(int*)sv_field(v, &r, 0, NULL) == 9); ss_iteratornext(&it); /* page 1 */ v = ss_iteratorof(&it); t( *(int*)sv_field(v, &r, 0, NULL) == 10); ss_iteratornext(&it); v = ss_iteratorof(&it); t( *(int*)sv_field(v, &r, 0, NULL) == 11); ss_iteratornext(&it); v = ss_iteratorof(&it); t( *(int*)sv_field(v, &r, 0, NULL) == 13); ss_iteratornext(&it); /* page 2 */ v = ss_iteratorof(&it); t( *(int*)sv_field(v, &r, 0, NULL) == 15); ss_iteratornext(&it); v = ss_iteratorof(&it); t( *(int*)sv_field(v, &r, 0, NULL) == 18); ss_iteratornext(&it); v = ss_iteratorof(&it); t( *(int*)sv_field(v, &r, 0, NULL) == 20); ss_iteratornext(&it); t( ss_iteratorhas(&it) == 0 ); ss_iteratorclose(&it); ss_fileclose(&f); t( ss_vfsmunmap(&st_r.vfs, &map) == 0 ); t( ss_vfsunlink(&vfs, "./0000.db") == 0 ); sd_indexfree(&index, &r); sd_buildfree(&b, &r); ss_buffree(&buf, &a); ss_buffree(&xfbuf, &a); sf_schemefree(&cmp, &a); } stgroup *sd_read_group(void) { stgroup *group = st_group("sdread"); st_groupadd(group, st_test("gt0", sd_read_gt0)); st_groupadd(group, st_test("gt1", sd_read_gt1)); st_groupadd(group, st_test("gt0_compression_zstd", sd_read_gt0_compression_zstd)); st_groupadd(group, st_test("gt0_compression_lz4", sd_read_gt0_compression_lz4)); st_groupadd(group, st_test("gt1_compression_zstd", sd_read_gt1_compression_zstd)); st_groupadd(group, st_test("gt1_compression_lz4", sd_read_gt1_compression_lz4)); return group; }
static inline sibranch* si_branchcreate(si *index, sdc *c, sinode *parent, svindex *vindex, uint64_t vlsn) { sr *r = index->r; sibranch *branch = NULL; /* in-memory mode blob */ int rc; ssblob copy, *blob = NULL; if (parent->in_memory) { ss_blobinit(©, r->vfs); rc = ss_blobensure(©, 10ULL * 1024 * 1024); if (ssunlikely(rc == -1)) { sr_oom_malfunction(r->e); return NULL; } blob = © } svmerge vmerge; sv_mergeinit(&vmerge); rc = sv_mergeprepare(&vmerge, r, 1); if (ssunlikely(rc == -1)) return NULL; svmergesrc *s = sv_mergeadd(&vmerge, NULL); ss_iterinit(sv_indexiter, &s->src); ss_iteropen(sv_indexiter, &s->src, r, vindex, SS_GTE, NULL, 0); ssiter i; ss_iterinit(sv_mergeiter, &i); ss_iteropen(sv_mergeiter, &i, r, &vmerge, SS_GTE); /* merge iter is not used */ sdmergeconf mergeconf = { .size_stream = UINT32_MAX, .size_node = UINT64_MAX, .size_page = index->scheme->node_page_size, .checksum = index->scheme->node_page_checksum, .compression_key = index->scheme->compression_key, .compression = index->scheme->compression_branch, .compression_if = index->scheme->compression_branch_if, .vlsn = vlsn, .vlsn_lru = 0, .save_delete = 1, .save_upsert = 1 }; sdmerge merge; sd_mergeinit(&merge, r, &i, &c->build, &c->upsert, &mergeconf); while ((rc = sd_merge(&merge)) > 0) { assert(branch == NULL); /* write open seal */ uint64_t seal = parent->file.size; rc = sd_writeseal(r, &parent->file, blob); if (ssunlikely(rc == -1)) goto e0; /* write pages */ uint64_t offset = parent->file.size; while ((rc = sd_mergepage(&merge, offset)) == 1) { rc = sd_writepage(r, &parent->file, blob, merge.build); if (ssunlikely(rc == -1)) goto e0; offset = parent->file.size; } if (ssunlikely(rc == -1)) goto e0; sdid id = { .parent = parent->self.id.id, .flags = SD_IDBRANCH, .id = sr_seq(r->seq, SR_NSNNEXT) }; rc = sd_mergecommit(&merge, &id, parent->file.size); if (ssunlikely(rc == -1)) goto e0; /* write index */ rc = sd_writeindex(r, &parent->file, blob, &merge.index); if (ssunlikely(rc == -1)) goto e0; if 
(index->scheme->sync) { rc = ss_filesync(&parent->file); if (ssunlikely(rc == -1)) { sr_malfunction(r->e, "file '%s' sync error: %s", ss_pathof(&parent->file.path), strerror(errno)); goto e0; } } SS_INJECTION(r->i, SS_INJECTION_SI_BRANCH_0, sd_mergefree(&merge); sr_malfunction(r->e, "%s", "error injection"); return NULL); /* seal the branch */ rc = sd_seal(r, &parent->file, blob, &merge.index, seal); if (ssunlikely(rc == -1)) goto e0; if (index->scheme->sync == 2) { rc = ss_filesync(&parent->file); if (ssunlikely(rc == -1)) { sr_malfunction(r->e, "file '%s' sync error: %s", ss_pathof(&parent->file.path), strerror(errno)); goto e0; } } /* create new branch object */ branch = si_branchnew(r); if (ssunlikely(branch == NULL)) goto e0; si_branchset(branch, &merge.index); } sv_mergefree(&vmerge, r->a); if (ssunlikely(rc == -1)) { sr_oom_malfunction(r->e); goto e0; } assert(branch != NULL); /* in-memory mode support */ if (blob) { rc = ss_blobfit(blob); if (ssunlikely(rc == -1)) { ss_blobfree(blob); goto e1; } branch->copy = copy; } /* mmap support */ if (index->scheme->mmap) { ss_mmapinit(&parent->map_swap); rc = ss_vfsmmap(r->vfs, &parent->map_swap, parent->file.fd, parent->file.size, 1); if (ssunlikely(rc == -1)) { sr_malfunction(r->e, "db file '%s' mmap error: %s", ss_pathof(&parent->file.path), strerror(errno)); goto e1; } } return branch; e0: sd_mergefree(&merge); if (blob) ss_blobfree(blob); return NULL; e1: si_branchfree(branch, r); return NULL; } int si_branch(si *index, sdc *c, siplan *plan, uint64_t vlsn) { sr *r = index->r; sinode *n = plan->node; assert(n->flags & SI_LOCK); si_lock(index); if (ssunlikely(n->used == 0)) { si_nodeunlock(n); si_unlock(index); return 0; } svindex *i; i = si_noderotate(n); si_unlock(index); sibranch *branch = si_branchcreate(index, c, n, i, vlsn); if (ssunlikely(branch == NULL)) return -1; /* commit */ si_lock(index); branch->next = n->branch; n->branch->link = branch; n->branch = branch; n->branch_count++; uint32_t used = 
sv_indexused(i); n->used -= used; ss_quota(r->quota, SS_QREMOVE, used); index->size += sd_indexsize(branch->index.h) + sd_indextotal(&branch->index); svindex swap = *i; si_nodeunrotate(n); si_nodeunlock(n); si_plannerupdate(&index->p, SI_BRANCH|SI_COMPACT, n); ssmmap swap_map = n->map; n->map = n->map_swap; memset(&n->map_swap, 0, sizeof(n->map_swap)); si_unlock(index); /* gc */ if (index->scheme->mmap) { int rc = ss_vfsmunmap(r->vfs, &swap_map); if (ssunlikely(rc == -1)) { sr_malfunction(r->e, "db file '%s' munmap error: %s", ss_pathof(&n->file.path), strerror(errno)); return -1; } } si_nodegc_index(r, &swap); return 1; } int si_compact(si *index, sdc *c, siplan *plan, uint64_t vlsn, uint64_t vlsn_lru, ssiter *vindex, uint64_t vindex_used) { sr *r = index->r; sinode *node = plan->node; assert(node->flags & SI_LOCK); /* prepare for compaction */ int rc; rc = sd_censure(c, r, node->branch_count); if (ssunlikely(rc == -1)) return sr_oom_malfunction(r->e); svmerge merge; sv_mergeinit(&merge); rc = sv_mergeprepare(&merge, r, node->branch_count + 1); if (ssunlikely(rc == -1)) return -1; /* read node file into memory */ int use_mmap = index->scheme->mmap; ssmmap *map = &node->map; ssmmap preload; if (index->scheme->node_compact_load) { rc = si_noderead(node, r, &c->c); if (ssunlikely(rc == -1)) return -1; preload.p = c->c.s; preload.size = ss_bufused(&c->c); map = &preload; use_mmap = 1; } /* include vindex into merge process */ svmergesrc *s; uint64_t size_stream = 0; if (vindex) { s = sv_mergeadd(&merge, vindex); size_stream = vindex_used; } sdcbuf *cbuf = c->head; sibranch *b = node->branch; while (b) { s = sv_mergeadd(&merge, NULL); /* choose compression type */ int compression; ssfilterif *compression_if; if (! 
si_branchis_root(b)) { compression = index->scheme->compression_branch; compression_if = index->scheme->compression_branch_if; } else { compression = index->scheme->compression; compression_if = index->scheme->compression_if; } sdreadarg arg = { .index = &b->index, .buf = &cbuf->a, .buf_xf = &cbuf->b, .buf_read = &c->d, .index_iter = &cbuf->index_iter, .page_iter = &cbuf->page_iter, .use_memory = node->in_memory, .use_mmap = use_mmap, .use_mmap_copy = 0, .use_compression = compression, .compression_if = compression_if, .has = 0, .has_vlsn = 0, .o = SS_GTE, .memory = &b->copy, .mmap = map, .file = &node->file, .r = r }; ss_iterinit(sd_read, &s->src); int rc = ss_iteropen(sd_read, &s->src, &arg, NULL, 0); if (ssunlikely(rc == -1)) return sr_oom_malfunction(r->e); size_stream += sd_indextotal(&b->index); cbuf = cbuf->next; b = b->next; } ssiter i; ss_iterinit(sv_mergeiter, &i); ss_iteropen(sv_mergeiter, &i, r, &merge, SS_GTE); rc = si_merge(index, c, node, vlsn, vlsn_lru, &i, size_stream); sv_mergefree(&merge, r->a); return rc; } int si_compact_index(si *index, sdc *c, siplan *plan, uint64_t vlsn, uint64_t vlsn_lru) { sinode *node = plan->node; si_lock(index); if (ssunlikely(node->used == 0)) { si_nodeunlock(node); si_unlock(index); return 0; } svindex *vindex; vindex = si_noderotate(node); si_unlock(index); uint64_t size_stream = sv_indexused(vindex); ssiter i; ss_iterinit(sv_indexiter, &i); ss_iteropen(sv_indexiter, &i, index->r, vindex, SS_GTE, NULL, 0); return si_compact(index, c, plan, vlsn, vlsn_lru, &i, size_stream); }
/*
 * Look q->key up in one on-disk branch of a node.
 *
 * The branch is read through its cache slot (taken with
 * si_cachefollow()), wrapped into a one-source merge and filtered by a
 * read iterator.
 *
 * Returns the ss_iteropen() result when that is <= 0, 0 when the read
 * iterator yields no record, otherwise the si_qgetresult() result.
 */
static inline int
si_qgetbranch(siquery *q, sinode *n, sibranch *b)
{
    sicachebranch *slot = si_cachefollow(q->cache);
    assert(slot->branch == b);

    sireadarg arg = {
        .scheme     = q->index->scheme,
        .index      = q->index,
        .n          = n,
        .b          = b,
        .buf        = &slot->buf_a,
        .buf_xf     = &slot->buf_b,
        .buf_read   = &q->index->readbuf,
        .index_iter = &slot->index_iter,
        .page_iter  = &slot->page_iter,
        .vlsn       = q->vlsn,
        .has        = q->has,
        .mmap_copy  = 0,
        .o          = SS_GTE,
        .r          = q->r
    };
    ss_iterinit(si_read, &slot->i);
    int rc = ss_iteropen(si_read, &slot->i, &arg, q->key, q->keysize);
    if (ssunlikely(rc <= 0))
        return rc;

    /* with q->has set, the read iterator gets UINT64_MAX as its lsn */
    uint64_t vlsn = q->has ? UINT64_MAX : q->vlsn;

    /* prepare sources */
    sv_mergereset(&q->merge);
    sv_mergeadd(&q->merge, &slot->i);

    ssiter merge_it;
    ss_iterinit(sv_mergeiter, &merge_it);
    ss_iteropen(sv_mergeiter, &merge_it, q->r, &q->merge, SS_GTE);

    ssiter read_it;
    ss_iterinit(sv_readiter, &read_it);
    ss_iteropen(sv_readiter, &read_it, q->r, &merge_it, &q->index->u, vlsn, 1);

    sv *found = ss_iterof(sv_readiter, &read_it);
    if (ssunlikely(found == NULL))
        return 0;
    return si_qgetresult(q, found, 1);
}

/*
 * Point lookup: locate the node for q->key, try its in-memory indexes
 * first and then walk its branches, newest link first.
 *
 * Returns the first non-zero result produced by si_qgetindex() or
 * si_qgetbranch(), -1 when the cache cannot be validated, 0 when no
 * source produced a result.
 */
static inline int
si_qget(siquery *q)
{
    ssiter node_it;
    ss_iterinit(si_iter, &node_it);
    ss_iteropen(si_iter, &node_it, q->r, q->index, SS_GTE, q->key, q->keysize);
    sinode *node = ss_iterof(si_iter, &node_it);
    assert(node != NULL);

    /* search in memory */
    int rc = si_qgetindex(q, node);
    if (rc != 0)
        return rc;

    /* bring the branch cache in line with the node */
    rc = si_cachevalidate(q->cache, node);
    if (ssunlikely(rc == -1)) {
        sr_oom(q->r->e);
        return -1;
    }
    svmerge *m = &q->merge;
    rc = sv_mergeprepare(m, q->r, 1);
    assert(rc == 0);

    /* search on disk */
    for (sibranch *b = node->branch; b; b = b->next) {
        rc = si_qgetbranch(q, node, b);
        if (rc != 0)
            return rc;
    }
    return 0;
}
/*
 * Add one branch of a node as a source of the range merge `m`.
 *
 * If the cached iterator of the branch still has data it is reused
 * (and q->index->read_cache is bumped). If the slot was opened before
 * and is exhausted nothing is added. Otherwise the branch is opened at
 * q->key in q->order and added only when it has data.
 */
static inline void
si_qrangebranch(siquery *q, sinode *n, sibranch *b, svmerge *m)
{
    sicachebranch *slot = si_cachefollow(q->cache);
    assert(slot->branch == b);
    svmergesrc *src;

    /* iterate cache */
    if (ss_iterhas(si_read, &slot->i)) {
        src = sv_mergeadd(m, &slot->i);
        q->index->read_cache++;
        src->ptr = slot;
        return;
    }
    if (slot->open)
        return;
    slot->open = 1;

    sireadarg arg = {
        .scheme     = q->index->scheme,
        .index      = q->index,
        .n          = n,
        .b          = b,
        .buf        = &slot->buf_a,
        .buf_xf     = &slot->buf_b,
        .buf_read   = &q->index->readbuf,
        .index_iter = &slot->index_iter,
        .page_iter  = &slot->page_iter,
        .vlsn       = q->vlsn,
        .has        = 0,
        .mmap_copy  = 1,
        .o          = q->order,
        .r          = q->r
    };
    ss_iterinit(si_read, &slot->i);
    int rc = ss_iteropen(si_read, &slot->i, &arg, q->key, q->keysize);
    if (ssunlikely(rc == -1))
        return;
    if (ssunlikely(! ss_iterhas(si_read, &slot->i)))
        return;
    src = sv_mergeadd(m, &slot->i);
    src->ptr = slot;
}

/*
 * Range step: find the next visible record starting at q->key in
 * q->order, walking nodes until one of them yields a record.
 *
 * For every node the merge is built from the optional q->update_v
 * record, the node's in-memory indexes and its branches.
 *
 * Returns 0 when the nodes are exhausted, -1 on error, otherwise the
 * match flag: 1 for a plain hit (the record is duplicated into the
 * query through si_querydup()), or the result of the update_eq /
 * prefix comparisons when those are requested.
 */
static inline int
si_qrange(siquery *q)
{
    ssiter node_it;
    ss_iterinit(si_iter, &node_it);
    ss_iteropen(si_iter, &node_it, q->r, q->index, q->order, q->key, q->keysize);

    for (;;)
    {
        sinode *node = ss_iterof(si_iter, &node_it);
        if (ssunlikely(node == NULL))
            return 0;

        /* prepare sources */
        svmerge *m = &q->merge;
        int count = node->branch_count + 2 + 1;
        int rc = sv_mergeprepare(m, q->r, count);
        if (ssunlikely(rc == -1)) {
            sr_errorreset(q->r->e);
            return -1;
        }

        /* external source (update) */
        svmergesrc *src;
        sv upbuf_reserve;
        ssbuf upbuf;
        if (ssunlikely(q->update_v && q->update_v->v)) {
            /* one-element buffer living on the stack for this pass */
            ss_bufinit_reserve(&upbuf, &upbuf_reserve, sizeof(upbuf_reserve));
            ss_bufadd(&upbuf, NULL, (void*)&q->update_v, sizeof(sv*));
            src = sv_mergeadd(m, NULL);
            ss_iterinit(ss_bufiterref, &src->src);
            ss_iteropen(ss_bufiterref, &src->src, &upbuf, sizeof(sv*));
        }

        /* in-memory indexes */
        svindex *second;
        svindex *first = si_nodeindex_priority(node, &second);
        if (first->count) {
            src = sv_mergeadd(m, NULL);
            ss_iterinit(sv_indexiter, &src->src);
            ss_iteropen(sv_indexiter, &src->src, q->r, first, q->order,
                        q->key, q->keysize);
        }
        if (ssunlikely(second && second->count)) {
            src = sv_mergeadd(m, NULL);
            ss_iterinit(sv_indexiter, &src->src);
            ss_iteropen(sv_indexiter, &src->src, q->r, second, q->order,
                        q->key, q->keysize);
        }

        /* cache and branches */
        rc = si_cachevalidate(q->cache, node);
        if (ssunlikely(rc == -1)) {
            sr_oom(q->r->e);
            return -1;
        }
        for (sibranch *b = node->branch; b; b = b->next)
            si_qrangebranch(q, node, b, m);

        /* merge and filter data stream */
        ssiter merge_it;
        ss_iterinit(sv_mergeiter, &merge_it);
        ss_iteropen(sv_mergeiter, &merge_it, q->r, m, q->order);
        ssiter read_it;
        ss_iterinit(sv_readiter, &read_it);
        ss_iteropen(sv_readiter, &read_it, q->r, &merge_it, &q->index->u, q->vlsn, 0);
        sv *v = ss_iterof(sv_readiter, &read_it);
        if (ssunlikely(v == NULL)) {
            /* nothing visible in this node: move on to the next one */
            sv_mergereset(&q->merge);
            ss_iternext(si_iter, &node_it);
            continue;
        }

        rc = 1;
        /* convert update search to SS_EQ */
        if (q->update_eq) {
            rc = sr_compare(q->r->scheme, sv_pointer(v), sv_size(v),
                            q->key, q->keysize);
            rc = rc == 0;
        }
        /* do prefix search */
        if (q->prefix && rc) {
            rc = sr_compareprefix(q->r->scheme, q->prefix, q->prefixsize,
                                  sv_pointer(v), sv_size(v));
        }
        if (sslikely(rc == 1)) {
            if (ssunlikely(si_querydup(q, v) == -1))
                return -1;
        }

        /* skip a possible duplicates from data sources */
        ss_iternext(sv_readiter, &read_it);
        return rc;
    }
}
/*
 * Compact a node: merge its rotated in-memory index with the data
 * already stored in the node file and pass the merged stream to
 * si_merge().
 *
 * index - index the node belongs to
 * c     - compaction context (io stream, read buffers)
 * plan  - plan whose node is compacted; the node must carry SI_LOCK
 * vlsn  - passed through to si_merge()
 *
 * Returns the si_merge() result, or a negative value when preparing
 * the direct_io stream, the merge, or the node reader fails.
 */
int si_compaction(si *index, sdc *c, siplan *plan, uint64_t vlsn)
{
    sr *r = &index->r;
    sinode *node = plan->node;
    assert(node->flags & SI_LOCK);

    /* rotate the in-memory index under the index lock */
    si_lock(index);
    svindex *vindex;
    vindex = si_noderotate(node);
    si_unlock(index);

    uint64_t size_stream = vindex->used;
    ssiter vindex_iter;
    ss_iterinit(sv_indexiter, &vindex_iter);
    ss_iteropen(sv_indexiter, &vindex_iter, r, vindex, SS_GTE, NULL);

    /* prepare direct_io stream */
    int rc;
    if (index->scheme.direct_io) {
        rc = sd_ioprepare(&c->io, r,
                          index->scheme.direct_io,
                          index->scheme.direct_io_page_size,
                          index->scheme.direct_io_buffer_size);
        if (ssunlikely(rc == -1))
            return sr_oom(r->e);
    }

    /* prepare for compaction */
    svmerge merge;
    sv_mergeinit(&merge);
    rc = sv_mergeprepare(&merge, r, 1 + 1);
    if (ssunlikely(rc == -1))
        return -1;

    /* source 1: the rotated in-memory index */
    sv_mergeadd(&merge, &vindex_iter);

    /* source 2: the node file */
    sdcbuf *cbuf = &c->e;
    svmergesrc *s = sv_mergeadd(&merge, NULL);
    sdreadarg arg = {
        .from_compaction     = 1,
        .io                  = &c->io,
        .index               = &node->index,
        .buf                 = &cbuf->a,
        .buf_read            = &c->d,
        .index_iter          = &cbuf->index_iter,
        .page_iter           = &cbuf->page_iter,
        .use_mmap            = index->scheme.mmap,
        .use_mmap_copy       = 0,
        .use_compression     = index->scheme.compression,
        .use_direct_io       = index->scheme.direct_io,
        .direct_io_page_size = index->scheme.direct_io_page_size,
        .compression_if      = index->scheme.compression_if,
        .has                 = 0,
        .has_vlsn            = 0,
        .o                   = SS_GTE,
        .mmap                = &node->map,
        .file                = &node->file,
        .r                   = r
    };
    ss_iterinit(sd_read, &s->src);
    rc = ss_iteropen(sd_read, &s->src, &arg, NULL);
    if (ssunlikely(rc == -1)) {
        /* the merge was prepared above; release it on this exit too */
        sv_mergefree(&merge, r->a);
        return -1;
    }
    size_stream += sd_indextotal(&node->index);

    ssiter i;
    ss_iterinit(sv_mergeiter, &i);
    ss_iteropen(sv_mergeiter, &i, r, &merge, SS_GTE);
    rc = si_merge(index, c, node, vlsn, &i, size_stream,
                  sd_indexkeys(&node->index));
    sv_mergefree(&merge, r->a);
    return rc;
}
static inline int si_split(si *index, sdc *c, ssbuf *result, sinode *parent, ssiter *i, uint64_t size_node, uint64_t size_stream, uint32_t stream, uint64_t vlsn) { sr *r = &index->r; uint32_t timestamp = ss_timestamp(); int rc; sdmergeconf mergeconf = { .stream = stream, .size_stream = size_stream, .size_node = size_node, .size_page = index->scheme.compaction.node_page_size, .checksum = index->scheme.compaction.node_page_checksum, .expire = index->scheme.expire, .timestamp = timestamp, .compression = index->scheme.compression, .compression_if = index->scheme.compression_if, .direct_io = index->scheme.direct_io, .direct_io_page_size = index->scheme.direct_io_page_size, .vlsn = vlsn }; sinode *n = NULL; sdmerge merge; rc = sd_mergeinit(&merge, r, i, &c->build, &c->build_index, &c->upsert, &mergeconf); if (ssunlikely(rc == -1)) return -1; while ((rc = sd_merge(&merge)) > 0) { /* create new node */ uint64_t id = sr_seq(index->r.seq, SR_NSNNEXT); n = si_nodenew(r, id, parent->id); if (ssunlikely(n == NULL)) goto error; rc = si_nodecreate(n, r, &index->scheme); if (ssunlikely(rc == -1)) goto error; /* write pages */ uint64_t offset; offset = sd_iosize(&c->io, &n->file); while ((rc = sd_mergepage(&merge, offset)) == 1) { rc = sd_writepage(r, &n->file, &c->io, merge.build); if (ssunlikely(rc == -1)) goto error; offset = sd_iosize(&c->io, &n->file); } if (ssunlikely(rc == -1)) goto error; offset = sd_iosize(&c->io, &n->file); rc = sd_mergeend(&merge, offset); if (ssunlikely(rc == -1)) goto error; /* write index */ rc = sd_writeindex(r, &n->file, &c->io, &merge.index); if (ssunlikely(rc == -1)) goto error; /* mmap mode */ if (index->scheme.mmap) { rc = si_nodemap(n, r); if (ssunlikely(rc == -1)) goto error; } /* add node to the list */ rc = ss_bufadd(result, index->r.a, &n, sizeof(sinode*)); if (ssunlikely(rc == -1)) { sr_oom_malfunction(index->r.e); goto error; } n->index = merge.index; } if (ssunlikely(rc == -1)) goto error; return 0; error: if (n) si_nodefree(n, r, 0); 
sd_mergefree(&merge); si_splitfree(result, r); return -1; } static int si_merge(si *index, sdc *c, sinode *node, uint64_t vlsn, ssiter *stream, uint64_t size_stream, uint32_t n_stream) { sr *r = &index->r; ssbuf *result = &c->a; ssiter i; /* begin compaction. * * Split merge stream into a number of * a new nodes. */ int rc; rc = si_split(index, c, result, node, stream, index->scheme.compaction.node_size, size_stream, n_stream, vlsn); if (ssunlikely(rc == -1)) return -1; SS_INJECTION(r->i, SS_INJECTION_SI_COMPACTION_0, si_splitfree(result, r); sr_malfunction(r->e, "%s", "error injection"); return -1); /* mask removal of a single node as a * single node update */ int count = ss_bufused(result) / sizeof(sinode*); int count_index; si_lock(index); count_index = index->n; si_unlock(index); sinode *n; if (ssunlikely(count == 0 && count_index == 1)) { n = si_bootstrap(index, node->id); if (ssunlikely(n == NULL)) return -1; rc = ss_bufadd(result, r->a, &n, sizeof(sinode*)); if (ssunlikely(rc == -1)) { sr_oom_malfunction(r->e); si_nodefree(n, r, 1); return -1; } count++; } /* commit compaction changes */ si_lock(index); svindex *j = si_nodeindex(node); si_plannerremove(&index->p, node); si_nodesplit(node); switch (count) { case 0: /* delete */ si_remove(index, node); si_redistribute_index(index, r, c, node); break; case 1: /* self update */ n = *(sinode**)result->s; n->i0 = *j; n->used = j->used; si_nodelock(n); si_replace(index, node, n); si_plannerupdate(&index->p, n); break; default: /* split */ rc = si_redistribute(index, r, c, node, result); if (ssunlikely(rc == -1)) { si_unlock(index); si_splitfree(result, r); return -1; } ss_iterinit(ss_bufiterref, &i); ss_iteropen(ss_bufiterref, &i, result, sizeof(sinode*)); n = ss_iterof(ss_bufiterref, &i); n->used = n->i0.used; si_nodelock(n); si_replace(index, node, n); si_plannerupdate(&index->p, n); for (ss_iternext(ss_bufiterref, &i); ss_iterhas(ss_bufiterref, &i); ss_iternext(ss_bufiterref, &i)) { n = ss_iterof(ss_bufiterref, 
&i); n->used = n->i0.used; si_nodelock(n); si_insert(index, n); si_plannerupdate(&index->p, n); } break; } sv_indexinit(j); si_unlock(index); /* compaction completion */ /* seal nodes */ ss_iterinit(ss_bufiterref, &i); ss_iteropen(ss_bufiterref, &i, result, sizeof(sinode*)); while (ss_iterhas(ss_bufiterref, &i)) { n = ss_iterof(ss_bufiterref, &i); if (index->scheme.sync) { rc = ss_filesync(&n->file); if (ssunlikely(rc == -1)) { sr_malfunction(r->e, "db file '%s' sync error: %s", ss_pathof(&n->file.path), strerror(errno)); return -1; } } rc = si_noderename_seal(n, r, &index->scheme); if (ssunlikely(rc == -1)) { si_nodefree(node, r, 0); return -1; } SS_INJECTION(r->i, SS_INJECTION_SI_COMPACTION_3, si_nodefree(node, r, 0); sr_malfunction(r->e, "%s", "error injection"); return -1); ss_iternext(ss_bufiterref, &i); } SS_INJECTION(r->i, SS_INJECTION_SI_COMPACTION_1, si_nodefree(node, r, 0); sr_malfunction(r->e, "%s", "error injection"); return -1); /* gc node */ uint16_t refs = si_noderefof(node); if (sslikely(refs == 0)) { rc = si_nodefree(node, r, 1); if (ssunlikely(rc == -1)) return -1; } else { /* node concurrently being read, schedule for * delayed removal */ si_nodegc(node, r, &index->scheme); si_lock(index); ss_listappend(&index->gc, &node->gc); index->gc_count++; si_unlock(index); } SS_INJECTION(r->i, SS_INJECTION_SI_COMPACTION_2, sr_malfunction(r->e, "%s", "error injection"); return -1); /* complete new nodes */ ss_iterinit(ss_bufiterref, &i); ss_iteropen(ss_bufiterref, &i, result, sizeof(sinode*)); while (ss_iterhas(ss_bufiterref, &i)) { n = ss_iterof(ss_bufiterref, &i); rc = si_noderename_complete(n, r, &index->scheme); if (ssunlikely(rc == -1)) return -1; SS_INJECTION(r->i, SS_INJECTION_SI_COMPACTION_4, sr_malfunction(r->e, "%s", "error injection"); return -1); ss_iternext(ss_bufiterref, &i); } /* unlock */ si_lock(index); ss_iterinit(ss_bufiterref, &i); ss_iteropen(ss_bufiterref, &i, result, sizeof(sinode*)); while (ss_iterhas(ss_bufiterref, &i)) { n = 
ss_iterof(ss_bufiterref, &i); si_nodeunlock(n); ss_iternext(ss_bufiterref, &i); } si_unlock(index); return 0; }
static inline sibranch* si_branchcreate(si *index, sdc *c, sinode *parent, svindex *vindex, uint64_t vlsn) { sr *r = index->r; svmerge vmerge; sv_mergeinit(&vmerge); int rc = sv_mergeprepare(&vmerge, r, 1); if (ssunlikely(rc == -1)) return NULL; svmergesrc *s = sv_mergeadd(&vmerge, NULL); ss_iterinit(sv_indexiterraw, &s->src); ss_iteropen(sv_indexiterraw, &s->src, vindex); ssiter i; ss_iterinit(sv_mergeiter, &i); ss_iteropen(sv_mergeiter, &i, r, &vmerge, SS_GTE, 1); /* merge iter is not used */ sdmergeconf mergeconf = { .size_stream = UINT32_MAX, .size_node = UINT64_MAX, .size_page = index->scheme->node_page_size, .checksum = index->scheme->node_page_checksum, .compression = index->scheme->compression, .compression_key = index->scheme->compression_key, .offset = parent->file.size, .vlsn = vlsn, .save_delete = 1 }; sdmerge merge; sd_mergeinit(&merge, r, &i, &c->build, &mergeconf); rc = sd_merge(&merge); if (ssunlikely(rc == -1)) { sv_mergefree(&vmerge, r->a); sr_oom_malfunction(r->e); goto error; } assert(rc == 1); sv_mergefree(&vmerge, r->a); sibranch *branch = si_branchnew(r); if (ssunlikely(branch == NULL)) goto error; sdid id = { .parent = parent->self.id.id, .flags = SD_IDBRANCH, .id = sr_seq(r->seq, SR_NSNNEXT) }; rc = sd_mergecommit(&merge, &id); if (ssunlikely(rc == -1)) goto error; si_branchset(branch, &merge.index); rc = sd_commit(&c->build, r, &branch->index, &parent->file); if (ssunlikely(rc == -1)) { si_branchfree(branch, r); return NULL; } SS_INJECTION(r->i, SS_INJECTION_SI_BRANCH_0, sr_malfunction(r->e, "%s", "error injection"); si_branchfree(branch, r); return NULL); if (index->scheme->sync) { rc = si_nodesync(parent, r); if (ssunlikely(rc == -1)) { si_branchfree(branch, r); return NULL; } } if (index->scheme->mmap) { ss_mmapinit(&parent->map_swap); rc = ss_mmap(&parent->map_swap, parent->file.fd, parent->file.size, 1); if (ssunlikely(rc == -1)) { sr_malfunction(r->e, "db file '%s' mmap error: %s", parent->file.file, strerror(errno)); return NULL; } 
} return branch; error: sd_mergefree(&merge); return NULL; } int si_branch(si *index, sdc *c, siplan *plan, uint64_t vlsn) { sr *r = index->r; sinode *n = plan->node; assert(n->flags & SI_LOCK); si_lock(index); if (ssunlikely(n->used == 0)) { si_nodeunlock(n); si_unlock(index); return 0; } svindex *i; i = si_noderotate(n); si_unlock(index); sd_creset(c); sibranch *branch = si_branchcreate(index, c, n, i, vlsn); if (ssunlikely(branch == NULL)) return -1; /* commit */ si_lock(index); branch->next = n->branch; n->branch = branch; n->branch_count++; uint32_t used = sv_indexused(i); n->used -= used; ss_quota(r->quota, SS_QREMOVE, used); svindex swap = *i; si_nodeunrotate(n); si_nodeunlock(n); si_plannerupdate(&index->p, SI_BRANCH|SI_COMPACT, n); ssmmap swap_map = n->map; n->map = n->map_swap; memset(&n->map_swap, 0, sizeof(n->map_swap)); si_unlock(index); /* gc */ if (index->scheme->mmap) { int rc = ss_munmap(&swap_map); if (ssunlikely(rc == -1)) { sr_malfunction(r->e, "db file '%s' munmap error: %s", n->file.file, strerror(errno)); return -1; } } si_nodegc_index(r, &swap); return 1; } static inline char* si_noderead(si *index, ssbuf *dest, sinode *node) { sr *r = index->r; if (index->scheme->mmap) { return node->map.p; } int rc = ss_bufensure(dest, r->a, node->file.size); if (ssunlikely(rc == -1)) { sr_oom_malfunction(r->e); return NULL; } rc = ss_filepread(&node->file, 0, dest->s, node->file.size); if (ssunlikely(rc == -1)) { sr_malfunction(r->e, "db file '%s' read error: %s", node->file.file, strerror(errno)); return NULL; } ss_bufadvance(dest, node->file.size); return dest->s; }
/*
 * Load the stored scheme file "<s->path>/scheme" and apply its options
 * to `s`.
 *
 * Recognised options: storage version (validated, mandatory), field
 * scheme, node size, node page size, compression filter name and
 * expire. SI_SCHEME_VERSION and unknown option ids are skipped.
 *
 * Returns 0 on success, -1 on any failure. A missing or rejected
 * storage version additionally reports "incompatible storage format
 * version".
 */
int si_schemerecover(sischeme *s, sr *r)
{
    int have_storage_version = 0;
    int rc;
    char path[PATH_MAX];

    sdscheme stored;
    sd_schemeinit(&stored);
    snprintf(path, sizeof(path), "%s/scheme", s->path);

    rc = sd_schemerecover(&stored, r, path);
    if (ssunlikely(rc == -1))
        goto error;

    ssiter it;
    ss_iterinit(sd_schemeiter, &it);
    rc = ss_iteropen(sd_schemeiter, &it, r, &stored, 1);
    if (ssunlikely(rc == -1))
        goto error;

    for (; ss_iterhas(sd_schemeiter, &it); ss_iternext(sd_schemeiter, &it))
    {
        sdschemeopt *opt = ss_iterof(sd_schemeiter, &it);
        switch (opt->id) {
        case SI_SCHEME_VERSION:
            break;
        case SI_SCHEME_VERSION_STORAGE: {
            /* a payload of unexpected size fails without a message */
            if (opt->size != sizeof(srversion))
                goto error;
            srversion *version = (srversion*)sd_schemesz(opt);
            if (! sr_versionstorage_check(version))
                goto error_format;
            have_storage_version = 1;
            break;
        }
        case SI_SCHEME_SCHEME: {
            /* replace the current field scheme with the stored one */
            sf_schemefree(&s->scheme, r->a);
            sf_schemeinit(&s->scheme);
            ssbuf scratch;
            ss_bufinit(&scratch);
            rc = sf_schemeload(&s->scheme, r->a, sd_schemesz(opt), opt->size);
            if (ssunlikely(rc == -1))
                goto error;
            rc = sf_schemevalidate(&s->scheme, r->a);
            if (ssunlikely(rc == -1))
                goto error;
            ss_buffree(&scratch, r->a);
            break;
        }
        case SI_SCHEME_NODE_SIZE:
            s->compaction.node_size = sd_schemeu64(opt);
            break;
        case SI_SCHEME_NODE_PAGE_SIZE:
            s->compaction.node_page_size = sd_schemeu32(opt);
            break;
        case SI_SCHEME_COMPRESSION: {
            /* resolve the filter by its stored name */
            char *name = sd_schemesz(opt);
            ssfilterif *cif = ss_filterof(name);
            if (ssunlikely(cif == NULL))
                goto error;
            s->compression_if = cif;
            s->compression = s->compression_if != &ss_nonefilter;
            /* keep the textual name in sync with the filter */
            ss_free(r->a, s->compression_sz);
            s->compression_sz = ss_strdup(r->a, cif->name);
            if (ssunlikely(s->compression_sz == NULL))
                goto error;
            break;
        }
        case SI_SCHEME_EXPIRE:
            s->expire = sd_schemeu32(opt);
            break;
        default: /* skip unknown */
            break;
        }
    }

    if (ssunlikely(! have_storage_version))
        goto error_format;
    sd_schemefree(&stored, r);
    return 0;

error_format:
    sr_error(r->e, "%s", "incompatible storage format version");
error:
    sd_schemefree(&stored, r);
    return -1;
}
/*
 * Attach one branch of node `n` to the merge `m` as a data source,
 * using the cache entry that si_cachefollow() yields for it.
 *
 * Returns:
 *    1  the cached iterator still has data and was added to the merge,
 *       or the entry was already opened earlier (nothing is added), or
 *       a freshly opened iterator has data and was added
 *    2  the query is cache-only and the entry has not been opened
 *    0  the freshly opened iterator has no data
 *   -1  opening the branch iterator failed
 */
static inline int
si_rangebranch(siread *q, sinode *n, sibranch *b, svmerge *m)
{
	sicachebranch *entry = si_cachefollow(q->cache);
	svmergesrc *src;
	int compression;
	ssfilterif *compression_if;
	int rc;
	int reads;

	assert(entry->branch == b);

	/* iterate cache */
	if (ss_iterhas(sd_read, &entry->i)) {
		src = sv_mergeadd(m, &entry->i);
		si_readstat(q, 1, n, 1);
		src->ptr = entry;
		return 1;
	}
	if (entry->open)
		return 1;
	if (q->cache_only)
		return 2;
	entry->open = 1;

	/* choose compression type: the root branch uses the node settings,
	 * every other branch uses the branch settings */
	if (si_branchis_root(b)) {
		compression    = q->index->scheme->compression;
		compression_if = q->index->scheme->compression_if;
	} else {
		compression    = q->index->scheme->compression_branch;
		compression_if = q->index->scheme->compression_branch_if;
	}

	sdreadarg arg = {
		.index           = &b->index,
		.buf             = &entry->buf_a,
		.buf_xf          = &entry->buf_b,
		.buf_read        = &q->index->readbuf,
		.index_iter      = &entry->index_iter,
		.page_iter       = &entry->page_iter,
		.use_memory      = n->in_memory,
		.use_mmap        = q->index->scheme->mmap,
		.use_mmap_copy   = 1,
		.use_compression = compression,
		.compression_if  = compression_if,
		.has             = 0,
		.has_vlsn        = 0,
		.o               = q->order,
		.memory          = &b->copy,
		.mmap            = &n->map,
		.file            = &n->file,
		.r               = q->r
	};

	ss_iterinit(sd_read, &entry->i);
	rc = ss_iteropen(sd_read, &entry->i, &arg, q->key, q->keysize);
	/* statistics are recorded before the open result is examined */
	reads = sd_read_stat(&entry->i);
	si_readstat(q, 0, n, reads);
	if (ssunlikely(rc == -1))
		return -1;
	if (ssunlikely(! ss_iterhas(sd_read, &entry->i)))
		return 0;

	src = sv_mergeadd(m, &entry->i);
	src->ptr = entry;
	return 1;
}

/*
 * Range read: walk the nodes selected by (q->order, q->key) and, for
 * each node, merge the optional upsert value, the in-memory indexes
 * and the branches, until a node produces a visible record.
 *
 * Returns:
 *    0  no node is left, or the found record failed the upsert-eq /
 *       prefix comparison
 *    1  a record was found and stored through si_readdup()
 *    2  propagated from si_rangebranch() (cache-only miss)
 *   -1  merge preparation, cache validation, a branch open or
 *       si_readdup() failed
 */
static inline int
si_range(siread *q)
{
	ssiter node_it;
	ssiter merge_it;
	ssiter read_it;
	svmerge *merge = &q->merge;
	svmergesrc *src;
	sv upsert_reserve;
	ssbuf upsert_buf;
	svindex *primary;
	svindex *secondary;
	sibranch *branch;
	sinode *node;
	sv *found;
	int source_count;
	int rc;

	assert(q->has == 0);

	ss_iterinit(si_iter, &node_it);
	ss_iteropen(si_iter, &node_it, q->r, q->index, q->order, q->key, q->keysize);

	for (;;)
	{
		node = ss_iterof(si_iter, &node_it);
		if (ssunlikely(node == NULL))
			return 0;
		si_txtrack(q->x, node);

		/* prepare sources */
		source_count = node->branch_count + 2 + 1;
		rc = sv_mergeprepare(merge, q->r, source_count);
		if (ssunlikely(rc == -1)) {
			sr_errorreset(q->r->e);
			return -1;
		}

		/* external source (upsert) */
		if (ssunlikely(q->upsert_v && q->upsert_v->v)) {
			ss_bufinit_reserve(&upsert_buf, &upsert_reserve, sizeof(upsert_reserve));
			ss_bufadd(&upsert_buf, NULL, (void*)&q->upsert_v, sizeof(sv*));
			src = sv_mergeadd(merge, NULL);
			ss_iterinit(ss_bufiterref, &src->src);
			ss_iteropen(ss_bufiterref, &src->src, &upsert_buf, sizeof(sv*));
		}

		/* in-memory indexes */
		primary = si_nodeindex_priority(node, &secondary);
		if (primary->count) {
			src = sv_mergeadd(merge, NULL);
			ss_iterinit(sv_indexiter, &src->src);
			ss_iteropen(sv_indexiter, &src->src, q->r, primary, q->order,
			            q->key, q->keysize);
		}
		if (ssunlikely(secondary && secondary->count)) {
			src = sv_mergeadd(merge, NULL);
			ss_iterinit(sv_indexiter, &src->src);
			ss_iteropen(sv_indexiter, &src->src, q->r, secondary, q->order,
			            q->key, q->keysize);
		}

		/* cache and branches */
		rc = si_cachevalidate(q->cache, node);
		if (ssunlikely(rc == -1)) {
			sr_oom(q->r->e);
			return -1;
		}
		for (branch = node->branch; branch; branch = branch->next) {
			rc = si_rangebranch(q, node, branch, merge);
			if (ssunlikely(rc == -1 || rc == 2))
				return rc;
		}

		/* merge and filter data stream */
		ss_iterinit(sv_mergeiter, &merge_it);
		ss_iteropen(sv_mergeiter, &merge_it, q->r, merge, q->order);
		ss_iterinit(sv_readiter, &read_it);
		ss_iteropen(sv_readiter, &read_it, q->r, &merge_it, &q->index->u, q->vlsn, 0);
		found = ss_iterof(sv_readiter, &read_it);
		if (found != NULL)
			break;

		/* nothing visible in this node: drop its sources, try the next */
		sv_mergereset(&q->merge);
		ss_iternext(si_iter, &node_it);
	}

	rc = 1;
	/* convert upsert search to SS_EQ */
	if (q->upsert_eq) {
		rc = (sr_compare(q->r->scheme, sv_pointer(found), sv_size(found),
		                 q->key, q->keysize) == 0);
	}
	/* do prefix search */
	if (q->prefix && rc) {
		rc = sr_compareprefix(q->r->scheme, q->prefix, q->prefixsize,
		                      sv_pointer(found),
		                      sv_size(found));
	}
	if (sslikely(rc == 1) && ssunlikely(si_readdup(q, found) == -1))
		return -1;

	/* skip a possible duplicates from data sources */
	ss_iternext(sv_readiter, &read_it);
	return rc;
}