/*
 * Move every version held in the in-memory index of `node` to other
 * nodes of `index` by calling si_redistribute_set() for each of them.
 *
 * Version pointers are first collected into the scratch buffer c->b,
 * then replayed from that buffer.
 *
 * Returns 0 on success (including when there is nothing to move) and
 * the result of sr_oom_malfunction() when the buffer cannot grow.
 */
static int
si_redistribute_index(si *index, sr *r, sdc *c, sinode *node)
{
	svindex *src = si_nodeindex(node);
	ssiter it;

	/* pass 1: collect version pointers */
	ss_iterinit(sv_indexiter, &it);
	ss_iteropen(sv_indexiter, &it, r, src, SS_GTE, NULL, 0);
	for (; ss_iterhas(sv_indexiter, &it); ss_iternext(sv_indexiter, &it)) {
		sv *cur = ss_iterof(sv_indexiter, &it);
		int rc = ss_bufadd(&c->b, r->a, &cur->v, sizeof(svv**));
		if (ssunlikely(rc == -1))
			return sr_oom_malfunction(r->e);
	}
	if (ssunlikely(ss_bufused(&c->b) == 0))
		return 0;

	/* pass 2: hand every collected version over, sharing one timestamp */
	uint64_t now = ss_utime();
	ss_iterinit(ss_bufiterref, &it);
	ss_iteropen(ss_bufiterref, &it, &c->b, sizeof(svv*));
	for (; ss_iterhas(ss_bufiterref, &it); ss_iternext(ss_bufiterref, &it)) {
		svv *version = ss_iterof(ss_bufiterref, &it);
		si_redistribute_set(index, r, now, version);
	}
	return 0;
}
/*
 * Allocate a node and put every field into its initial, empty state.
 *
 * Returns the new node, or NULL after reporting an out-of-memory
 * malfunction on r->e.
 */
sinode *si_nodenew(sr *r)
{
	sinode *node = (sinode*)ss_malloc(r->a, sizeof(*node));
	if (ssunlikely(node == NULL)) {
		sr_oom_malfunction(r->e);
		return NULL;
	}

	/* plain counters and flags */
	node->recover           = 0;
	node->backup            = 0;
	node->lru               = 0;
	node->ac                = 0;
	node->flags             = 0;
	node->update_time       = 0;
	node->used              = 0;
	node->in_memory         = 0;

	/* root branch and branch list */
	si_branchinit(&node->self, r);
	node->branch            = NULL;
	node->branch_count      = 0;
	node->temperature       = 0;
	node->temperature_reads = 0;

	/* file and mappings */
	ss_fileinit(&node->file, r->vfs);
	ss_mmapinit(&node->map);
	ss_mmapinit(&node->map_swap);

	/* in-memory indexes */
	sv_indexinit(&node->i0);
	sv_indexinit(&node->i1);

	/* tree, planner queue and commit list links */
	ss_rbinitnode(&node->node);
	ss_rqinitnode(&node->nodecompact);
	ss_rqinitnode(&node->nodebranch);
	ss_rqinitnode(&node->nodetemp);
	ss_listinit(&node->commit);
	return node;
}
/*
 * Finish recovery: walk every node tracked in `track`, drop the ones
 * flagged SI_RDB_REMOVE and register the rest in `index` and its planner.
 *
 * `buf` is reset and used as scratch storage for the node pointers.
 * Returns 0 on success, -1 if a node could not be freed, or the result
 * of sr_oom_malfunction() when `buf` cannot grow.
 */
static inline int
si_recovercomplete(sitrack *track, sr *r, si *index, ssbuf *buf)
{
	/* prepare and build primary index */
	ss_bufreset(buf);
	ssrbnode *cur;
	for (cur = ss_rbmin(&track->i); cur; cur = ss_rbnext(&track->i, cur)) {
		sinode *tracked = sscast(cur, sinode, node);
		int rc = ss_bufadd(buf, r->a, &tracked, sizeof(sinode*));
		if (ssunlikely(rc == -1))
			return sr_oom_malfunction(r->e);
	}

	ssiter it;
	ss_iterinit(ss_bufiterref, &it);
	ss_iteropen(ss_bufiterref, &it, buf, sizeof(sinode*));
	for (; ss_iterhas(ss_bufiterref, &it); ss_iternext(ss_bufiterref, &it)) {
		sinode *node = ss_iterof(ss_bufiterref, &it);
		if (node->recover & SI_RDB_REMOVE) {
			/* node was marked for removal during recovery */
			int rc = si_nodefree(node, r, 1);
			if (ssunlikely(rc == -1))
				return -1;
			continue;
		}
		node->recover = SI_RDB;
		si_insert(index, node);
		si_plannerupdate(&index->p, SI_COMPACT|SI_BRANCH|SI_TEMP, node);
	}
	return 0;
}
/*
 * Distribute the in-memory versions of `node` between the nodes listed
 * in `result` (a buffer of sinode pointers).
 *
 * Version pointers are first gathered into the scratch buffer c->b.
 * Two cursors are then advanced together: `i` over the gathered versions
 * and `j` over the result nodes. A version is stored into prev->i0 for as
 * long as it compares lower than the minimum key of the first index page
 * of the node following `prev`.
 *
 * Returns 0 on success, or the result of sr_oom_malfunction() when c->b
 * cannot grow. `index` is unused.
 */
static int si_redistribute(si *index, sr *r, sdc *c, sinode *node, ssbuf *result) {
	(void)index;
	svindex *vindex = si_nodeindex(node);
	ssiter i;
	/* gather version pointers of the node index into c->b */
	ss_iterinit(sv_indexiter, &i);
	ss_iteropen(sv_indexiter, &i, r, vindex, SS_GTE, NULL, 0);
	while (ss_iterhas(sv_indexiter, &i)) {
		sv *v = ss_iterof(sv_indexiter, &i);
		int rc = ss_bufadd(&c->b, r->a, &v->v, sizeof(svv**));
		if (ssunlikely(rc == -1)) return sr_oom_malfunction(r->e);
		ss_iternext(sv_indexiter, &i);
	}
	/* nothing to distribute */
	if (ssunlikely(ss_bufused(&c->b) == 0)) return 0;
	/* i: gathered versions, j: destination nodes */
	ss_iterinit(ss_bufiterref, &i);
	ss_iteropen(ss_bufiterref, &i, &c->b, sizeof(svv*));
	ssiter j;
	ss_iterinit(ss_bufiterref, &j);
	ss_iteropen(ss_bufiterref, &j, result, sizeof(sinode*));
	/* prev is the node currently receiving versions, p the one after it */
	sinode *prev = ss_iterof(ss_bufiterref, &j);
	ss_iternext(ss_bufiterref, &j);
	while (1) {
		sinode *p = ss_iterof(ss_bufiterref, &j);
		if (p == NULL) {
			/* prev is the last node: it takes all remaining versions */
			assert(prev != NULL);
			while (ss_iterhas(ss_bufiterref, &i)) {
				svv *v = ss_iterof(ss_bufiterref, &i);
				v->next = NULL;
				sv_indexset(&prev->i0, r, v);
				ss_iternext(ss_bufiterref, &i);
			}
			break;
		}
		while (ss_iterhas(ss_bufiterref, &i)) {
			svv *v = ss_iterof(ss_bufiterref, &i);
			v->next = NULL;
			/* compare the version key with the minimum key of p */
			sdindexpage *page = sd_indexmin(&p->self.index);
			int rc = sr_compare(r->scheme, sv_vpointer(v), v->size, sd_indexpage_min(&p->self.index, page), page->sizemin);
			/* version is not lower than p's minimum: leave it for p or later */
			if (ssunlikely(rc >= 0)) break;
			sv_indexset(&prev->i0, r, v);
			ss_iternext(ss_bufiterref, &i);
		}
		/* all versions placed */
		if (ssunlikely(! ss_iterhas(ss_bufiterref, &i))) break;
		prev = p;
		ss_iternext(ss_bufiterref, &j);
	}
	/* every gathered version must have been consumed */
	assert(ss_iterof(ss_bufiterref, &i) == NULL);
	return 0;
}
/*
 * Rebuild the branch list of node `n` from its file.
 *
 * Every index header produced by the sd_recover iterator becomes one
 * branch: the first one is stored in the embedded n->self, each
 * following one in a branch allocated with si_branchnew(). New branches
 * are pushed at the head of n->branch. When `in_memory` is non-zero the
 * raw bytes of the branch (seal, pages, index header and index body) are
 * also copied into the branch blob.
 *
 * Returns 0 on success and -1 on failure. On failure a branch that was
 * allocated but not yet linked into the node is released, so it does not
 * leak; branches already linked stay attached to `n`.
 */
static inline int
si_noderecover(sinode *n, sr *r, int in_memory)
{
	/* recover branches */
	ssiter i;
	ss_iterinit(sd_recover, &i);
	ss_iteropen(sd_recover, &i, r, &n->file);
	/* heap branch that is allocated but not yet reachable from n */
	sibranch *unlinked = NULL;
	int first = 1;
	int rc;
	while (ss_iteratorhas(&i)) {
		sdindexheader *h = ss_iteratorof(&i);
		sibranch *b;
		if (first) {
			b = &n->self;
		} else {
			b = si_branchnew(r);
			if (ssunlikely(b == NULL))
				goto error;
			unlinked = b;
		}
		sdindex index;
		sd_indexinit(&index);
		rc = sd_indexcopy(&index, r, h);
		if (ssunlikely(rc == -1))
			goto error;
		si_branchset(b, &index);
		if (in_memory) {
			/* the branch data precedes its index header in the file image */
			char *start = (char*)h - h->total - sizeof(sdseal);
			char *end = start + sizeof(sdseal) + h->total + sizeof(sdindexheader) + h->size + h->extension;
			int branch_size = end - start;
			rc = ss_blobensure(&b->copy, branch_size);
			if (ssunlikely(rc == -1)) {
				sr_oom_malfunction(r->e);
				goto error;
			}
			memcpy(b->copy.p, start, branch_size);
		}
		b->next = n->branch;
		n->branch = b;
		n->branch_count++;
		/* ownership moved to the node */
		unlinked = NULL;
		first = 0;
		ss_iteratornext(&i);
	}
	rc = sd_recover_complete(&i);
	if (ssunlikely(rc == -1))
		goto error;
	ss_iteratorclose(&i);
	return 0;
error:
	if (unlinked)
		si_branchfree(unlinked, r);
	ss_iteratorclose(&i);
	return -1;
}
/*
 * Compact the node selected by `plan`: merge all of its branches into a
 * single stream and pass that stream to si_compaction().
 *
 * The node must be locked (SI_LOCK). `vlsn` is forwarded to
 * si_compaction().
 *
 * Returns the result of si_compaction(), -1 when the node file cannot be
 * read or the merge cannot be prepared, or the result of
 * sr_oom_malfunction() on allocation failure. The merge object is
 * released on every path that leaves the function after it was prepared.
 */
int si_compact(si *index, sdc *c, siplan *plan, uint64_t vlsn)
{
	sr *r = index->r;
	sinode *node = plan->node;
	assert(node->flags & SI_LOCK);

	/* read node file */
	sd_creset(c);
	char *node_file = si_noderead(index, &c->c, node);
	if (ssunlikely(node_file == NULL))
		return -1;

	/* prepare for compaction */
	int rc;
	rc = sd_censure(c, r, node->branch_count);
	if (ssunlikely(rc == -1))
		return sr_oom_malfunction(r->e);
	svmerge merge;
	sv_mergeinit(&merge);
	rc = sv_mergeprepare(&merge, r, node->branch_count);
	if (ssunlikely(rc == -1))
		return -1;

	/* one merge source and one compaction buffer per branch */
	uint32_t size_stream = 0;
	sdcbuf *cbuf = c->head;
	sibranch *b = node->branch;
	while (b) {
		svmergesrc *s = sv_mergeadd(&merge, NULL);
		rc = ss_bufensure(&cbuf->b, r->a, b->index.h->sizevmax);
		if (ssunlikely(rc == -1)) {
			/* do not leak the prepared merge on early exit */
			sv_mergefree(&merge, r->a);
			return sr_oom_malfunction(r->e);
		}
		size_stream += sd_indextotal(&b->index);
		ss_iterinit(sd_iter, &s->src);
		ss_iteropen(sd_iter, &s->src, r, &b->index, node_file, 0,
		            index->scheme->compression, &cbuf->a, &cbuf->b);
		cbuf = cbuf->next;
		b = b->next;
	}

	ssiter i;
	ss_iterinit(sv_mergeiter, &i);
	ss_iteropen(sv_mergeiter, &i, r, &merge, SS_GTE, 0);
	rc = si_compaction(index, c, vlsn, node, &i, size_stream);
	sv_mergefree(&merge, r->a);
	return rc;
}
/*
 * Read the whole file of node `n` into `dest`.
 *
 * `dest` is grown to hold n->file.size bytes, filled from offset 0 of
 * the node file and advanced by the same amount.
 *
 * Returns 0 on success, -1 on a read error (reported on r->e), or the
 * result of sr_oom_malfunction() when `dest` cannot grow.
 */
int si_noderead(sinode *n, sr *r, ssbuf *dest)
{
	if (ssunlikely(ss_bufensure(dest, r->a, n->file.size) == -1))
		return sr_oom_malfunction(r->e);

	int status = ss_filepread(&n->file, 0, dest->s, n->file.size);
	if (ssunlikely(status == -1)) {
		sr_malfunction(r->e, "db file '%s' read error: %s",
		               ss_pathof(&n->file.path),
		               strerror(errno));
		return -1;
	}

	ss_bufadvance(dest, n->file.size);
	return 0;
}
static inline sibranch* si_branchcreate(si *index, sdc *c, sinode *parent, svindex *vindex, uint64_t vlsn) { sr *r = index->r; svmerge vmerge; sv_mergeinit(&vmerge); int rc = sv_mergeprepare(&vmerge, r, 1); if (ssunlikely(rc == -1)) return NULL; svmergesrc *s = sv_mergeadd(&vmerge, NULL); ss_iterinit(sv_indexiterraw, &s->src); ss_iteropen(sv_indexiterraw, &s->src, vindex); ssiter i; ss_iterinit(sv_mergeiter, &i); ss_iteropen(sv_mergeiter, &i, r, &vmerge, SS_GTE, 1); /* merge iter is not used */ sdmergeconf mergeconf = { .size_stream = UINT32_MAX, .size_node = UINT64_MAX, .size_page = index->scheme->node_page_size, .checksum = index->scheme->node_page_checksum, .compression = index->scheme->compression, .compression_key = index->scheme->compression_key, .offset = parent->file.size, .vlsn = vlsn, .save_delete = 1 }; sdmerge merge; sd_mergeinit(&merge, r, &i, &c->build, &mergeconf); rc = sd_merge(&merge); if (ssunlikely(rc == -1)) { sv_mergefree(&vmerge, r->a); sr_oom_malfunction(r->e); goto error; } assert(rc == 1); sv_mergefree(&vmerge, r->a); sibranch *branch = si_branchnew(r); if (ssunlikely(branch == NULL)) goto error; sdid id = { .parent = parent->self.id.id, .flags = SD_IDBRANCH, .id = sr_seq(r->seq, SR_NSNNEXT) }; rc = sd_mergecommit(&merge, &id); if (ssunlikely(rc == -1)) goto error; si_branchset(branch, &merge.index); rc = sd_commit(&c->build, r, &branch->index, &parent->file); if (ssunlikely(rc == -1)) { si_branchfree(branch, r); return NULL; } SS_INJECTION(r->i, SS_INJECTION_SI_BRANCH_0, sr_malfunction(r->e, "%s", "error injection"); si_branchfree(branch, r); return NULL); if (index->scheme->sync) { rc = si_nodesync(parent, r); if (ssunlikely(rc == -1)) { si_branchfree(branch, r); return NULL; } } if (index->scheme->mmap) { ss_mmapinit(&parent->map_swap); rc = ss_mmap(&parent->map_swap, parent->file.fd, parent->file.size, 1); if (ssunlikely(rc == -1)) { sr_malfunction(r->e, "db file '%s' mmap error: %s", parent->file.file, strerror(errno)); return NULL; } 
} return branch; error: sd_mergefree(&merge); return NULL; } int si_branch(si *index, sdc *c, siplan *plan, uint64_t vlsn) { sr *r = index->r; sinode *n = plan->node; assert(n->flags & SI_LOCK); si_lock(index); if (ssunlikely(n->used == 0)) { si_nodeunlock(n); si_unlock(index); return 0; } svindex *i; i = si_noderotate(n); si_unlock(index); sd_creset(c); sibranch *branch = si_branchcreate(index, c, n, i, vlsn); if (ssunlikely(branch == NULL)) return -1; /* commit */ si_lock(index); branch->next = n->branch; n->branch = branch; n->branch_count++; uint32_t used = sv_indexused(i); n->used -= used; ss_quota(r->quota, SS_QREMOVE, used); svindex swap = *i; si_nodeunrotate(n); si_nodeunlock(n); si_plannerupdate(&index->p, SI_BRANCH|SI_COMPACT, n); ssmmap swap_map = n->map; n->map = n->map_swap; memset(&n->map_swap, 0, sizeof(n->map_swap)); si_unlock(index); /* gc */ if (index->scheme->mmap) { int rc = ss_munmap(&swap_map); if (ssunlikely(rc == -1)) { sr_malfunction(r->e, "db file '%s' munmap error: %s", n->file.file, strerror(errno)); return -1; } } si_nodegc_index(r, &swap); return 1; } static inline char* si_noderead(si *index, ssbuf *dest, sinode *node) { sr *r = index->r; if (index->scheme->mmap) { return node->map.p; } int rc = ss_bufensure(dest, r->a, node->file.size); if (ssunlikely(rc == -1)) { sr_oom_malfunction(r->e); return NULL; } rc = ss_filepread(&node->file, 0, dest->s, node->file.size); if (ssunlikely(rc == -1)) { sr_malfunction(r->e, "db file '%s' read error: %s", node->file.file, strerror(errno)); return NULL; } ss_bufadvance(dest, node->file.size); return dest->s; }
/*
 * Recover the index snapshot file located under i->scheme->path.
 *
 * Any leftover "index.incomplete" file is removed. If an "index" file
 * exists, its whole content is read into s->buf.
 *
 * Returns 0 on success (also when there is no snapshot), -1 on an I/O
 * error (reported on r->e), or the result of sr_oom_malfunction() when
 * s->buf cannot grow.
 */
static inline int
si_recoversnapshot(si *i, sr *r, sdsnapshot *s)
{
	/* recovery stages:

	   snapshot            (1) ok
	   snapshot.incomplete (2) remove snapshot.incomplete
	   snapshot            (3) remove snapshot.incomplete, load snapshot
	   snapshot.incomplete
	*/

	/* recover snapshot file (crash recover) */
	char path_index[1024];
	char path_incomplete[1024];

	snprintf(path_index, sizeof(path_index), "%s/index", i->scheme->path);
	int has_snapshot = ss_vfsexists(r->vfs, path_index);
	snprintf(path_incomplete, sizeof(path_incomplete), "%s/index.incomplete",
	         i->scheme->path);
	int has_incomplete = ss_vfsexists(r->vfs, path_incomplete);

	int rc;
	if (has_incomplete) {
		rc = ss_vfsunlink(r->vfs, path_incomplete);
		if (ssunlikely(rc == -1)) {
			sr_malfunction(r->e, "index file '%s' unlink error: %s",
			               path_incomplete, strerror(errno));
			return -1;
		}
	}
	if (! has_snapshot)
		return 0;

	/* read snapshot file */
	ssize_t size = ss_vfssize(r->vfs, path_index);
	if (ssunlikely(size == -1)) {
		sr_malfunction(r->e, "index file '%s' read error: %s",
		               path_index, strerror(errno));
		return -1;
	}
	rc = ss_bufensure(&s->buf, r->a, size);
	if (ssunlikely(rc == -1))
		return sr_oom_malfunction(r->e);

	ssfile file;
	ss_fileinit(&file, r->vfs);
	rc = ss_fileopen(&file, path_index);
	if (ssunlikely(rc == -1)) {
		sr_malfunction(r->e, "index file '%s' open error: %s",
		               path_index, strerror(errno));
		return -1;
	}

	int result = 0;
	rc = ss_filepread(&file, 0, s->buf.s, size);
	if (ssunlikely(rc == -1)) {
		sr_malfunction(r->e, "index file '%s' read error: %s",
		               path_index, strerror(errno));
		result = -1;
	} else {
		ss_bufadvance(&s->buf, size);
	}
	ss_fileclose(&file);
	return result;
}
/*
 * Split the merge stream `i` into new nodes.
 *
 * For every chunk produced by sd_merge() a node is allocated, the merge
 * is committed under a fresh node id, the node file is created and the
 * node pointer is appended to `result`.
 *
 * Returns 0 on success and -1 on failure; on failure the nodes already
 * stored in `result` are released with si_splitfree() and the merge
 * state is freed.
 */
static inline int
si_split(si *index, sdc *c, ssbuf *result, sinode *parent, ssiter *i,
         uint64_t size_node, uint32_t size_stream, uint64_t vlsn)
{
	sr *r = index->r;
	int rc;
	sdmergeconf mergeconf = {
		.size_stream     = size_stream,
		.size_node       = size_node,
		.size_page       = index->scheme->node_page_size,
		.checksum        = index->scheme->node_page_checksum,
		.compression     = index->scheme->compression,
		.compression_key = index->scheme->compression_key,
		.offset          = 0,
		.vlsn            = vlsn,
		.save_delete     = 0,
		.save_update     = 0
	};
	sdmerge merge;
	sd_mergeinit(&merge, r, i, &c->build, &c->update, &mergeconf);
	while ((rc = sd_merge(&merge)) > 0) {
		sinode *n = si_nodenew(r);
		if (ssunlikely(n == NULL))
			goto error;
		sdid id = {
			.parent = parent->self.id.id,
			.flags  = 0,
			.id     = sr_seq(index->r->seq, SR_NSNNEXT)
		};
		rc = sd_mergecommit(&merge, &id);
		if (ssunlikely(rc == -1)) {
			/* n is still a fresh node not stored in result: free it
			 * here (no file to remove), otherwise it would leak */
			si_nodefree(n, r, 0);
			goto error;
		}
		rc = si_nodecreate(n, r, index->scheme, &id, &merge.index, &c->build);
		/* NOTE(review): n is not released on this path; whether
		 * si_nodecreate() leaves merge.index owned by n on failure is
		 * not visible here, so freeing both could double free */
		if (ssunlikely(rc == -1))
			goto error;
		rc = ss_bufadd(result, index->r->a, &n, sizeof(sinode*));
		if (ssunlikely(rc == -1)) {
			sr_oom_malfunction(index->r->e);
			si_nodefree(n, r, 1);
			goto error;
		}
		sd_buildreset(&c->build);
	}
	if (ssunlikely(rc == -1))
		goto error;
	return 0;
error:
	si_splitfree(result, r);
	sd_mergefree(&merge);
	return -1;
}

/*
 * Replace `node` with the nodes produced by splitting the merge stream
 * `stream`.
 *
 * Steps:
 *  1. split the stream into new nodes (c->a holds their pointers);
 *  2. if no node was produced and the index holds exactly one node,
 *     bootstrap an empty replacement so the index never becomes empty;
 *  3. under the index lock, delete / replace / split `node` in the index
 *     and move its in-memory versions to the new nodes;
 *  4. seal the new node files, free the old node, complete the new
 *     nodes and unlock them.
 *
 * Returns 0 on success and -1 on failure.
 */
int si_compaction(si *index, sdc *c, uint64_t vlsn, sinode *node,
                  ssiter *stream, uint32_t size_stream)
{
	sr *r = index->r;
	ssbuf *result = &c->a;
	ssiter i;

	/* begin compaction.
	 *
	 * split merge stream into a number
	 * of a new nodes.
	 */
	int rc;
	rc = si_split(index, c, result, node, stream,
	              index->scheme->node_size, size_stream, vlsn);
	if (ssunlikely(rc == -1))
		return -1;

	SS_INJECTION(r->i, SS_INJECTION_SI_COMPACTION_0,
	             si_splitfree(result, r);
	             sr_malfunction(r->e, "%s", "error injection");
	             return -1);

	/* mask removal of a single node as a
	 * single node update */
	int count = ss_bufused(result) / sizeof(sinode*);
	int count_index;

	si_lock(index);
	count_index = index->n;
	si_unlock(index);

	sinode *n;
	if (ssunlikely(count == 0 && count_index == 1)) {
		n = si_bootstrap(index, node->self.id.id);
		if (ssunlikely(n == NULL))
			return -1;
		rc = ss_bufadd(result, r->a, &n, sizeof(sinode*));
		if (ssunlikely(rc == -1)) {
			sr_oom_malfunction(r->e);
			si_nodefree(n, r, 1);
			return -1;
		}
		count++;
	}

	/* commit compaction changes */
	si_lock(index);
	svindex *j = si_nodeindex(node);
	si_plannerremove(&index->p, SI_COMPACT|SI_BRANCH, node);
	switch (count) {
	case 0: { /* delete */
		si_remove(index, node);
		si_redistribute_index(index, r, c, node);
		uint32_t used = sv_indexused(j);
		if (used) {
			ss_quota(r->quota, SS_QREMOVE, used);
		}
		break;
	}
	case 1: /* self update */
		n = *(sinode**)result->s;
		/* the single new node inherits the in-memory index as is */
		n->i0 = *j;
		n->used = sv_indexused(j);
		si_nodelock(n);
		si_replace(index, node, n);
		si_plannerupdate(&index->p, SI_COMPACT|SI_BRANCH, n);
		break;
	default: /* split */
		rc = si_redistribute(index, r, c, node, result);
		if (ssunlikely(rc == -1)) {
			si_unlock(index);
			si_splitfree(result, r);
			return -1;
		}
		ss_iterinit(ss_bufiterref, &i);
		ss_iteropen(ss_bufiterref, &i, result, sizeof(sinode*));
		/* the first new node takes the place of the old one */
		n = ss_iterof(ss_bufiterref, &i);
		n->used = sv_indexused(&n->i0);
		si_nodelock(n);
		si_replace(index, node, n);
		si_plannerupdate(&index->p, SI_COMPACT|SI_BRANCH, n);
		/* the remaining ones are inserted as new nodes */
		for (ss_iternext(ss_bufiterref, &i); ss_iterhas(ss_bufiterref, &i);
		     ss_iternext(ss_bufiterref, &i)) {
			n = ss_iterof(ss_bufiterref, &i);
			n->used = sv_indexused(&n->i0);
			si_nodelock(n);
			si_insert(index, n);
			si_plannerupdate(&index->p, SI_COMPACT|SI_BRANCH, n);
		}
		break;
	}
	/* the versions now belong to the new nodes */
	sv_indexinit(j);
	si_unlock(index);

	/* compaction completion */

	/* seal nodes */
	ss_iterinit(ss_bufiterref, &i);
	ss_iteropen(ss_bufiterref, &i, result, sizeof(sinode*));
	while (ss_iterhas(ss_bufiterref, &i)) {
		n = ss_iterof(ss_bufiterref, &i);
		if (index->scheme->sync) {
			rc = si_nodesync(n, r);
			if (ssunlikely(rc == -1))
				return -1;
		}
		rc = si_nodeseal(n, r, index->scheme);
		if (ssunlikely(rc == -1))
			return -1;
		SS_INJECTION(r->i, SS_INJECTION_SI_COMPACTION_3,
		             si_nodefree(node, r, 0);
		             sr_malfunction(r->e, "%s", "error injection");
		             return -1);
		ss_iternext(ss_bufiterref, &i);
	}

	SS_INJECTION(r->i, SS_INJECTION_SI_COMPACTION_1,
	             si_nodefree(node, r, 0);
	             sr_malfunction(r->e, "%s", "error injection");
	             return -1);

	/* gc old node */
	rc = si_nodefree(node, r, 1);
	if (ssunlikely(rc == -1))
		return -1;

	SS_INJECTION(r->i, SS_INJECTION_SI_COMPACTION_2,
	             sr_malfunction(r->e, "%s", "error injection");
	             return -1);

	/* complete new nodes */
	ss_iterinit(ss_bufiterref, &i);
	ss_iteropen(ss_bufiterref, &i, result, sizeof(sinode*));
	while (ss_iterhas(ss_bufiterref, &i)) {
		n = ss_iterof(ss_bufiterref, &i);
		rc = si_nodecomplete(n, r, index->scheme);
		if (ssunlikely(rc == -1))
			return -1;
		SS_INJECTION(r->i, SS_INJECTION_SI_COMPACTION_4,
		             sr_malfunction(r->e, "%s", "error injection");
		             return -1);
		ss_iternext(ss_bufiterref, &i);
	}

	/* unlock */
	si_lock(index);
	ss_iterinit(ss_bufiterref, &i);
	ss_iteropen(ss_bufiterref, &i, result, sizeof(sinode*));
	while (ss_iterhas(ss_bufiterref, &i)) {
		n = ss_iterof(ss_bufiterref, &i);
		si_nodeunlock(n);
		ss_iternext(ss_bufiterref, &i);
	}
	si_unlock(index);
	return 0;
}
static inline int si_split(si *index, sdc *c, ssbuf *result, sinode *parent, ssiter *i, uint64_t size_node, uint64_t size_stream, uint32_t stream, uint64_t vlsn) { sr *r = &index->r; uint32_t timestamp = ss_timestamp(); int rc; sdmergeconf mergeconf = { .stream = stream, .size_stream = size_stream, .size_node = size_node, .size_page = index->scheme.compaction.node_page_size, .checksum = index->scheme.compaction.node_page_checksum, .expire = index->scheme.expire, .timestamp = timestamp, .compression = index->scheme.compression, .compression_if = index->scheme.compression_if, .direct_io = index->scheme.direct_io, .direct_io_page_size = index->scheme.direct_io_page_size, .vlsn = vlsn }; sinode *n = NULL; sdmerge merge; rc = sd_mergeinit(&merge, r, i, &c->build, &c->build_index, &c->upsert, &mergeconf); if (ssunlikely(rc == -1)) return -1; while ((rc = sd_merge(&merge)) > 0) { /* create new node */ uint64_t id = sr_seq(index->r.seq, SR_NSNNEXT); n = si_nodenew(r, id, parent->id); if (ssunlikely(n == NULL)) goto error; rc = si_nodecreate(n, r, &index->scheme); if (ssunlikely(rc == -1)) goto error; /* write pages */ uint64_t offset; offset = sd_iosize(&c->io, &n->file); while ((rc = sd_mergepage(&merge, offset)) == 1) { rc = sd_writepage(r, &n->file, &c->io, merge.build); if (ssunlikely(rc == -1)) goto error; offset = sd_iosize(&c->io, &n->file); } if (ssunlikely(rc == -1)) goto error; offset = sd_iosize(&c->io, &n->file); rc = sd_mergeend(&merge, offset); if (ssunlikely(rc == -1)) goto error; /* write index */ rc = sd_writeindex(r, &n->file, &c->io, &merge.index); if (ssunlikely(rc == -1)) goto error; /* mmap mode */ if (index->scheme.mmap) { rc = si_nodemap(n, r); if (ssunlikely(rc == -1)) goto error; } /* add node to the list */ rc = ss_bufadd(result, index->r.a, &n, sizeof(sinode*)); if (ssunlikely(rc == -1)) { sr_oom_malfunction(index->r.e); goto error; } n->index = merge.index; } if (ssunlikely(rc == -1)) goto error; return 0; error: if (n) si_nodefree(n, r, 0); 
sd_mergefree(&merge); si_splitfree(result, r); return -1; } static int si_merge(si *index, sdc *c, sinode *node, uint64_t vlsn, ssiter *stream, uint64_t size_stream, uint32_t n_stream) { sr *r = &index->r; ssbuf *result = &c->a; ssiter i; /* begin compaction. * * Split merge stream into a number of * a new nodes. */ int rc; rc = si_split(index, c, result, node, stream, index->scheme.compaction.node_size, size_stream, n_stream, vlsn); if (ssunlikely(rc == -1)) return -1; SS_INJECTION(r->i, SS_INJECTION_SI_COMPACTION_0, si_splitfree(result, r); sr_malfunction(r->e, "%s", "error injection"); return -1); /* mask removal of a single node as a * single node update */ int count = ss_bufused(result) / sizeof(sinode*); int count_index; si_lock(index); count_index = index->n; si_unlock(index); sinode *n; if (ssunlikely(count == 0 && count_index == 1)) { n = si_bootstrap(index, node->id); if (ssunlikely(n == NULL)) return -1; rc = ss_bufadd(result, r->a, &n, sizeof(sinode*)); if (ssunlikely(rc == -1)) { sr_oom_malfunction(r->e); si_nodefree(n, r, 1); return -1; } count++; } /* commit compaction changes */ si_lock(index); svindex *j = si_nodeindex(node); si_plannerremove(&index->p, node); si_nodesplit(node); switch (count) { case 0: /* delete */ si_remove(index, node); si_redistribute_index(index, r, c, node); break; case 1: /* self update */ n = *(sinode**)result->s; n->i0 = *j; n->used = j->used; si_nodelock(n); si_replace(index, node, n); si_plannerupdate(&index->p, n); break; default: /* split */ rc = si_redistribute(index, r, c, node, result); if (ssunlikely(rc == -1)) { si_unlock(index); si_splitfree(result, r); return -1; } ss_iterinit(ss_bufiterref, &i); ss_iteropen(ss_bufiterref, &i, result, sizeof(sinode*)); n = ss_iterof(ss_bufiterref, &i); n->used = n->i0.used; si_nodelock(n); si_replace(index, node, n); si_plannerupdate(&index->p, n); for (ss_iternext(ss_bufiterref, &i); ss_iterhas(ss_bufiterref, &i); ss_iternext(ss_bufiterref, &i)) { n = ss_iterof(ss_bufiterref, 
&i); n->used = n->i0.used; si_nodelock(n); si_insert(index, n); si_plannerupdate(&index->p, n); } break; } sv_indexinit(j); si_unlock(index); /* compaction completion */ /* seal nodes */ ss_iterinit(ss_bufiterref, &i); ss_iteropen(ss_bufiterref, &i, result, sizeof(sinode*)); while (ss_iterhas(ss_bufiterref, &i)) { n = ss_iterof(ss_bufiterref, &i); if (index->scheme.sync) { rc = ss_filesync(&n->file); if (ssunlikely(rc == -1)) { sr_malfunction(r->e, "db file '%s' sync error: %s", ss_pathof(&n->file.path), strerror(errno)); return -1; } } rc = si_noderename_seal(n, r, &index->scheme); if (ssunlikely(rc == -1)) { si_nodefree(node, r, 0); return -1; } SS_INJECTION(r->i, SS_INJECTION_SI_COMPACTION_3, si_nodefree(node, r, 0); sr_malfunction(r->e, "%s", "error injection"); return -1); ss_iternext(ss_bufiterref, &i); } SS_INJECTION(r->i, SS_INJECTION_SI_COMPACTION_1, si_nodefree(node, r, 0); sr_malfunction(r->e, "%s", "error injection"); return -1); /* gc node */ uint16_t refs = si_noderefof(node); if (sslikely(refs == 0)) { rc = si_nodefree(node, r, 1); if (ssunlikely(rc == -1)) return -1; } else { /* node concurrently being read, schedule for * delayed removal */ si_nodegc(node, r, &index->scheme); si_lock(index); ss_listappend(&index->gc, &node->gc); index->gc_count++; si_unlock(index); } SS_INJECTION(r->i, SS_INJECTION_SI_COMPACTION_2, sr_malfunction(r->e, "%s", "error injection"); return -1); /* complete new nodes */ ss_iterinit(ss_bufiterref, &i); ss_iteropen(ss_bufiterref, &i, result, sizeof(sinode*)); while (ss_iterhas(ss_bufiterref, &i)) { n = ss_iterof(ss_bufiterref, &i); rc = si_noderename_complete(n, r, &index->scheme); if (ssunlikely(rc == -1)) return -1; SS_INJECTION(r->i, SS_INJECTION_SI_COMPACTION_4, sr_malfunction(r->e, "%s", "error injection"); return -1); ss_iternext(ss_bufiterref, &i); } /* unlock */ si_lock(index); ss_iterinit(ss_bufiterref, &i); ss_iteropen(ss_bufiterref, &i, result, sizeof(sinode*)); while (ss_iterhas(ss_bufiterref, &i)) { n = 
ss_iterof(ss_bufiterref, &i); si_nodeunlock(n); ss_iternext(ss_bufiterref, &i); } si_unlock(index); return 0; }
static inline sibranch* si_branchcreate(si *index, sdc *c, sinode *parent, svindex *vindex, uint64_t vlsn) { sr *r = index->r; sibranch *branch = NULL; /* in-memory mode blob */ int rc; ssblob copy, *blob = NULL; if (parent->in_memory) { ss_blobinit(©, r->vfs); rc = ss_blobensure(©, 10ULL * 1024 * 1024); if (ssunlikely(rc == -1)) { sr_oom_malfunction(r->e); return NULL; } blob = © } svmerge vmerge; sv_mergeinit(&vmerge); rc = sv_mergeprepare(&vmerge, r, 1); if (ssunlikely(rc == -1)) return NULL; svmergesrc *s = sv_mergeadd(&vmerge, NULL); ss_iterinit(sv_indexiter, &s->src); ss_iteropen(sv_indexiter, &s->src, r, vindex, SS_GTE, NULL, 0); ssiter i; ss_iterinit(sv_mergeiter, &i); ss_iteropen(sv_mergeiter, &i, r, &vmerge, SS_GTE); /* merge iter is not used */ sdmergeconf mergeconf = { .size_stream = UINT32_MAX, .size_node = UINT64_MAX, .size_page = index->scheme->node_page_size, .checksum = index->scheme->node_page_checksum, .compression_key = index->scheme->compression_key, .compression = index->scheme->compression_branch, .compression_if = index->scheme->compression_branch_if, .vlsn = vlsn, .vlsn_lru = 0, .save_delete = 1, .save_upsert = 1 }; sdmerge merge; sd_mergeinit(&merge, r, &i, &c->build, &c->upsert, &mergeconf); while ((rc = sd_merge(&merge)) > 0) { assert(branch == NULL); /* write open seal */ uint64_t seal = parent->file.size; rc = sd_writeseal(r, &parent->file, blob); if (ssunlikely(rc == -1)) goto e0; /* write pages */ uint64_t offset = parent->file.size; while ((rc = sd_mergepage(&merge, offset)) == 1) { rc = sd_writepage(r, &parent->file, blob, merge.build); if (ssunlikely(rc == -1)) goto e0; offset = parent->file.size; } if (ssunlikely(rc == -1)) goto e0; sdid id = { .parent = parent->self.id.id, .flags = SD_IDBRANCH, .id = sr_seq(r->seq, SR_NSNNEXT) }; rc = sd_mergecommit(&merge, &id, parent->file.size); if (ssunlikely(rc == -1)) goto e0; /* write index */ rc = sd_writeindex(r, &parent->file, blob, &merge.index); if (ssunlikely(rc == -1)) goto e0; if 
(index->scheme->sync) { rc = ss_filesync(&parent->file); if (ssunlikely(rc == -1)) { sr_malfunction(r->e, "file '%s' sync error: %s", ss_pathof(&parent->file.path), strerror(errno)); goto e0; } } SS_INJECTION(r->i, SS_INJECTION_SI_BRANCH_0, sd_mergefree(&merge); sr_malfunction(r->e, "%s", "error injection"); return NULL); /* seal the branch */ rc = sd_seal(r, &parent->file, blob, &merge.index, seal); if (ssunlikely(rc == -1)) goto e0; if (index->scheme->sync == 2) { rc = ss_filesync(&parent->file); if (ssunlikely(rc == -1)) { sr_malfunction(r->e, "file '%s' sync error: %s", ss_pathof(&parent->file.path), strerror(errno)); goto e0; } } /* create new branch object */ branch = si_branchnew(r); if (ssunlikely(branch == NULL)) goto e0; si_branchset(branch, &merge.index); } sv_mergefree(&vmerge, r->a); if (ssunlikely(rc == -1)) { sr_oom_malfunction(r->e); goto e0; } assert(branch != NULL); /* in-memory mode support */ if (blob) { rc = ss_blobfit(blob); if (ssunlikely(rc == -1)) { ss_blobfree(blob); goto e1; } branch->copy = copy; } /* mmap support */ if (index->scheme->mmap) { ss_mmapinit(&parent->map_swap); rc = ss_vfsmmap(r->vfs, &parent->map_swap, parent->file.fd, parent->file.size, 1); if (ssunlikely(rc == -1)) { sr_malfunction(r->e, "db file '%s' mmap error: %s", ss_pathof(&parent->file.path), strerror(errno)); goto e1; } } return branch; e0: sd_mergefree(&merge); if (blob) ss_blobfree(blob); return NULL; e1: si_branchfree(branch, r); return NULL; } int si_branch(si *index, sdc *c, siplan *plan, uint64_t vlsn) { sr *r = index->r; sinode *n = plan->node; assert(n->flags & SI_LOCK); si_lock(index); if (ssunlikely(n->used == 0)) { si_nodeunlock(n); si_unlock(index); return 0; } svindex *i; i = si_noderotate(n); si_unlock(index); sibranch *branch = si_branchcreate(index, c, n, i, vlsn); if (ssunlikely(branch == NULL)) return -1; /* commit */ si_lock(index); branch->next = n->branch; n->branch->link = branch; n->branch = branch; n->branch_count++; uint32_t used = 
sv_indexused(i); n->used -= used; ss_quota(r->quota, SS_QREMOVE, used); index->size += sd_indexsize(branch->index.h) + sd_indextotal(&branch->index); svindex swap = *i; si_nodeunrotate(n); si_nodeunlock(n); si_plannerupdate(&index->p, SI_BRANCH|SI_COMPACT, n); ssmmap swap_map = n->map; n->map = n->map_swap; memset(&n->map_swap, 0, sizeof(n->map_swap)); si_unlock(index); /* gc */ if (index->scheme->mmap) { int rc = ss_vfsmunmap(r->vfs, &swap_map); if (ssunlikely(rc == -1)) { sr_malfunction(r->e, "db file '%s' munmap error: %s", ss_pathof(&n->file.path), strerror(errno)); return -1; } } si_nodegc_index(r, &swap); return 1; } int si_compact(si *index, sdc *c, siplan *plan, uint64_t vlsn, uint64_t vlsn_lru, ssiter *vindex, uint64_t vindex_used) { sr *r = index->r; sinode *node = plan->node; assert(node->flags & SI_LOCK); /* prepare for compaction */ int rc; rc = sd_censure(c, r, node->branch_count); if (ssunlikely(rc == -1)) return sr_oom_malfunction(r->e); svmerge merge; sv_mergeinit(&merge); rc = sv_mergeprepare(&merge, r, node->branch_count + 1); if (ssunlikely(rc == -1)) return -1; /* read node file into memory */ int use_mmap = index->scheme->mmap; ssmmap *map = &node->map; ssmmap preload; if (index->scheme->node_compact_load) { rc = si_noderead(node, r, &c->c); if (ssunlikely(rc == -1)) return -1; preload.p = c->c.s; preload.size = ss_bufused(&c->c); map = &preload; use_mmap = 1; } /* include vindex into merge process */ svmergesrc *s; uint64_t size_stream = 0; if (vindex) { s = sv_mergeadd(&merge, vindex); size_stream = vindex_used; } sdcbuf *cbuf = c->head; sibranch *b = node->branch; while (b) { s = sv_mergeadd(&merge, NULL); /* choose compression type */ int compression; ssfilterif *compression_if; if (! 
si_branchis_root(b)) { compression = index->scheme->compression_branch; compression_if = index->scheme->compression_branch_if; } else { compression = index->scheme->compression; compression_if = index->scheme->compression_if; } sdreadarg arg = { .index = &b->index, .buf = &cbuf->a, .buf_xf = &cbuf->b, .buf_read = &c->d, .index_iter = &cbuf->index_iter, .page_iter = &cbuf->page_iter, .use_memory = node->in_memory, .use_mmap = use_mmap, .use_mmap_copy = 0, .use_compression = compression, .compression_if = compression_if, .has = 0, .has_vlsn = 0, .o = SS_GTE, .memory = &b->copy, .mmap = map, .file = &node->file, .r = r }; ss_iterinit(sd_read, &s->src); int rc = ss_iteropen(sd_read, &s->src, &arg, NULL, 0); if (ssunlikely(rc == -1)) return sr_oom_malfunction(r->e); size_stream += sd_indextotal(&b->index); cbuf = cbuf->next; b = b->next; } ssiter i; ss_iterinit(sv_mergeiter, &i); ss_iteropen(sv_mergeiter, &i, r, &merge, SS_GTE); rc = si_merge(index, c, node, vlsn, vlsn_lru, &i, size_stream); sv_mergefree(&merge, r->a); return rc; } int si_compact_index(si *index, sdc *c, siplan *plan, uint64_t vlsn, uint64_t vlsn_lru) { sinode *node = plan->node; si_lock(index); if (ssunlikely(node->used == 0)) { si_nodeunlock(node); si_unlock(index); return 0; } svindex *vindex; vindex = si_noderotate(node); si_unlock(index); uint64_t size_stream = sv_indexused(vindex); ssiter i; ss_iterinit(sv_indexiter, &i); ss_iteropen(sv_indexiter, &i, index->r, vindex, SS_GTE, NULL, 0); return si_compact(index, c, plan, vlsn, vlsn_lru, &i, size_stream); }