static int se_dbscheme_init(sedb *db, char *name, int size)
{
	se *e = se_of(&db->o);
	/* database id */
	uint32_t id = sr_seq(&e->seq, SR_DSN);
	sr_seq(&e->seq, SR_DSNNEXT);
	/* prepare index scheme */
	sischeme *scheme = db->scheme;
	if (size == 0)
		size = strlen(name);
	scheme->name = ss_malloc(&e->a, size + 1);
	if (ssunlikely(scheme->name == NULL))
		goto error;
	memcpy(scheme->name, name, size);
	scheme->name[size] = 0;
	scheme->id                  = id;
	scheme->sync                = 2;
	scheme->mmap                = 0;
	scheme->storage             = SI_SCACHE;
	scheme->node_size           = 64 * 1024 * 1024;
	scheme->node_compact_load   = 0;
	scheme->node_page_size      = 128 * 1024;
	scheme->node_page_checksum  = 1;
	scheme->compression_copy    = 0;
	scheme->compression_cold    = 0;
	scheme->compression_cold_if = &ss_nonefilter;
	scheme->compression_hot     = 0;
	scheme->compression_hot_if  = &ss_nonefilter;
	scheme->temperature         = 0;
	scheme->expire              = 0;
	scheme->amqf                = 0;
	scheme->fmt_storage         = SF_RAW;
	scheme->lru                 = 0;
	scheme->lru_step            = 128 * 1024;
	scheme->buf_gc_wm           = 1024 * 1024;
	scheme->storage_sz = ss_strdup(&e->a, "cache");
	if (ssunlikely(scheme->storage_sz == NULL))
		goto error;
	scheme->compression_cold_sz =
		ss_strdup(&e->a, scheme->compression_cold_if->name);
	if (ssunlikely(scheme->compression_cold_sz == NULL))
		goto error;
	scheme->compression_hot_sz =
		ss_strdup(&e->a, scheme->compression_hot_if->name);
	if (ssunlikely(scheme->compression_hot_sz == NULL))
		goto error;
	sf_upsertinit(&scheme->fmt_upsert);
	sf_schemeinit(&scheme->scheme);
	return 0;
error:
	sr_oom(&e->error);
	return -1;
}
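Every snippet in this collection funnels through sr_seq(). Its usage above implies a pairing convention: each counter has a read op (SR_DSN, SR_LSN, ...) and an advance-and-return op (SR_DSNNEXT, SR_LSNNEXT, ...). A minimal sketch of that API, assuming a lock-protected counter struct (the field layout and the exact op list beyond those visible in these snippets are assumptions, not the real srseq definition):

/* Hypothetical reconstruction of the sequence-generator API.
 * Reading ops return the current value; NEXT ops pre-increment
 * and return the new value, all under one spinlock. */
typedef enum {
	SR_LSN, SR_LSNNEXT,
	SR_DSN, SR_DSNNEXT,
	/* SR_TSNNEXT, SR_NSNNEXT, SR_SSNNEXT, SR_ASNNEXT, SR_BSNNEXT ... */
} srseqop;

typedef struct {
	ssspinlock lock;
	uint64_t   lsn; /* log sequence number */
	uint64_t   dsn; /* database sequence number */
	/* tsn, nsn, ssn, asn, bsn ... */
} srseq;

static inline uint64_t sr_seq(srseq *s, srseqop op)
{
	ss_spinlock(&s->lock);
	uint64_t v = 0;
	switch (op) {
	case SR_LSN:     v = s->lsn;   break;
	case SR_LSNNEXT: v = ++s->lsn; break;
	case SR_DSN:     v = s->dsn;   break;
	case SR_DSNNEXT: v = ++s->dsn; break;
	default: break; /* remaining counters follow the same pairing */
	}
	ss_spinunlock(&s->lock);
	return v;
}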
sxstate sx_set_autocommit(sxmanager *m, sxindex *index, sx *x, svlog *log, svv *v)
{
	/* fast path: no concurrent read-write transactions,
	 * commit the single statement immediately */
	if (sslikely(m->count_rw == 0)) {
		sx_init(m, x, log);
		svlogv lv;
		lv.index_id = index->dsn;
		lv.next     = UINT32_MAX;
		lv.v        = v;
		lv.ptr      = NULL;
		sv_logadd(x->log, index->r, &lv);
		sr_seq(index->r->seq, SR_TSNNEXT);
		sx_promote(x, SX_COMMIT);
		return SX_COMMIT;
	}
	/* slow path: run the full begin/set/prepare/commit cycle */
	sx_begin(m, x, SX_RW, log, 0);
	int rc = sx_set(x, index, v);
	if (ssunlikely(rc == -1)) {
		sx_rollback(x);
		return SX_ROLLBACK;
	}
	sxstate s = sx_prepare(x, NULL, NULL);
	switch (s) {
	case SX_PREPARE:
		s = sx_commit(x);
		break;
	case SX_LOCK:
		s = sx_rollback(x);
		break;
	case SX_ROLLBACK:
		break;
	default:
		assert(0);
	}
	return s;
}
sxstate sx_set_autocommit(sxmanager *m, sxindex *index, sx *x, svv *v)
{
	if (sslikely(m->count_rw == 0)) {
		sx_init(m, x);
		svlogv lv;
		lv.id   = index->dsn;
		lv.next = UINT32_MAX;
		sv_init(&lv.v, &sv_vif, v, NULL);
		sv_logadd(&x->log, m->r->a, &lv, index->ptr);
		sr_seq(m->r->seq, SR_TSNNEXT);
		return SXCOMMIT;
	}
	sx_begin(m, x, SXRW, 0);
	int rc = sx_set(x, index, v);
	if (ssunlikely(rc == -1)) {
		sx_rollback(x);
		return SXROLLBACK;
	}
	sxstate s = sx_prepare(x, NULL, NULL);
	if (sslikely(s == SXPREPARE))
		sx_commit(x);
	else
	if (s == SXLOCK)
		sx_rollback(x);
	return s;
}
int se_scheduler_backup(void *arg)
{
	se *e = arg;
	sescheduler *s = &e->sched;
	if (ssunlikely(e->conf.backup_path == NULL)) {
		sr_error(&e->error, "%s", "backup is not enabled");
		return -1;
	}
	/* begin backup procedure
	 * state 0
	 *
	 * disable log garbage-collection
	*/
	sl_poolgc_enable(&e->lp, 0);
	ss_mutexlock(&s->lock);
	if (ssunlikely(s->backup > 0)) {
		ss_mutexunlock(&s->lock);
		sl_poolgc_enable(&e->lp, 1);
		/* in progress */
		return 0;
	}
	uint64_t bsn = sr_seq(&e->seq, SR_BSNNEXT);
	s->backup = 1;
	s->backup_bsn = bsn;
	ss_mutexunlock(&s->lock);
	return 0;
}
sxstate sx_setstmt(sxmanager *m, sxindex *index, sv *v)
{
	sr_seq(m->seq, SR_TSNNEXT);
	ssrbnode *n = NULL;
	int rc = sx_match(&index->i, index->scheme,
	                  sv_pointer(v), sv_size(v), &n);
	if (rc == 0 && n)
		return SXLOCK;
	return SXCOMMIT;
}
int sc_ctl_checkpoint(sc *s)
{
	uint64_t lsn = sr_seq(s->r->seq, SR_LSN);
	ss_mutexlock(&s->lock);
	s->checkpoint_lsn = lsn;
	s->checkpoint = 1;
	ss_mutexunlock(&s->lock);
	return 0;
}
int sc_ctl_snapshot(sc *s)
{
	uint64_t ssn = sr_seq(s->r->seq, SR_SSNNEXT);
	ss_mutexlock(&s->lock);
	s->snapshot_ssn = ssn;
	s->snapshot = 1;
	ss_mutexunlock(&s->lock);
	return 0;
}
int se_scheduler_checkpoint(void *arg)
{
	se *o = arg;
	sescheduler *s = &o->sched;
	uint64_t lsn = sr_seq(&o->seq, SR_LSN);
	ss_mutexlock(&s->lock);
	s->checkpoint_lsn = lsn;
	s->checkpoint = 1;
	ss_mutexunlock(&s->lock);
	return 0;
}
int se_scheduler_snapshot(void *arg)
{
	se *o = arg;
	sescheduler *s = &o->sched;
	uint64_t ssn = sr_seq(&o->seq, SR_SSNNEXT);
	ss_mutexlock(&s->lock);
	s->snapshot_ssn = ssn;
	s->snapshot = 1;
	ss_mutexunlock(&s->lock);
	return 0;
}
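The four entry points above (and the sc_ctl_* pair earlier) share one shape: sample a sequence number, then publish it together with a pending flag under the scheduler mutex; the background scheduler later consumes the flag and treats the latched number as the operation's watermark. A generic sketch of that pattern, with illustrative names (sctl, sctl_trigger are hypothetical):

/* "Latch a sequence number and raise a flag" in its generic form,
 * as used by the checkpoint/snapshot/anticache triggers. */
typedef struct {
	ssmutex  lock;
	int      pending;   /* consumed by the scheduler loop */
	uint64_t watermark; /* sequence number latched at trigger time */
} sctl;

static int sctl_trigger(sctl *c, uint64_t seqno)
{
	ss_mutexlock(&c->lock);
	c->watermark = seqno;
	c->pending   = 1;
	ss_mutexunlock(&c->lock);
	return 0;
}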
uint64_t sx_vlsn(sxmanager *m)
{
	ss_spinlock(&m->lock);
	uint64_t vlsn;
	if (sx_count(m) > 0) {
		/* oldest active transaction bounds the visible lsn */
		ssrbnode *node = ss_rbmin(&m->i);
		sx *min = sscast(node, sx, node);
		vlsn = min->vlsn;
	} else {
		vlsn = sr_seq(m->r->seq, SR_LSN);
	}
	ss_spinunlock(&m->lock);
	return vlsn;
}
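sx_vlsn() computes the oldest LSN any active transaction can still observe; the scheduler feeds it to garbage-collection and compaction (see the SI_GC plan and the si_execute() calls further down), which may only drop versions invisible to every reader. A minimal sketch of that visibility rule, with hypothetical field names (the real version chain in the source is structured differently):

/* Illustration of the vlsn contract: a version may be collected only
 * if a newer version of the same key is itself visible at vlsn, so
 * no snapshot taken at or above vlsn can still need the old one. */
#include <stdint.h>
#include <stddef.h>

struct version {
	uint64_t        lsn;   /* commit lsn of this version */
	struct version *newer; /* next version of the same key, or NULL */
};

static int version_collectable(struct version *v, uint64_t vlsn)
{
	return v->newer != NULL && v->newer->lsn <= vlsn;
}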
static int se_dbscheme_init(sedb *db, char *name, int size)
{
	se *e = se_of(&db->o);
	/* database id */
	uint32_t id = sr_seq(&e->seq, SR_DSN);
	sr_seq(&e->seq, SR_DSNNEXT);
	/* prepare index scheme */
	sischeme *scheme = db->scheme;
	if (size == 0)
		size = strlen(name);
	scheme->name = ss_malloc(&e->a, size + 1);
	if (ssunlikely(scheme->name == NULL))
		goto error;
	memcpy(scheme->name, name, size);
	scheme->name[size] = 0;
	scheme->id                    = id;
	scheme->sync                  = 1;
	scheme->mmap                  = 1;
	scheme->direct_io             = 0;
	scheme->direct_io_page_size   = 4096;
	scheme->direct_io_buffer_size = 8 * 1024 * 1024;
	scheme->compression           = 0;
	scheme->compression_if        = &ss_nonefilter;
	scheme->expire                = 0;
	scheme->buf_gc_wm             = 1024 * 1024;
	scheme->compression_sz =
		ss_strdup(&e->a, scheme->compression_if->name);
	if (ssunlikely(scheme->compression_sz == NULL))
		goto error;
	sf_upsertinit(&scheme->upsert);
	sf_schemeinit(&scheme->scheme);
	return 0;
error:
	sr_oom(&e->error);
	return -1;
}
int se_reqread(sereq *r)
{
	sereqarg *arg = &r->arg;
	sedb *db = (sedb*)r->db;
	uint32_t keysize;
	void *key;
	if (sslikely(arg->v.v)) {
		keysize = sv_size(&arg->v);
		key = sv_pointer(&arg->v);
	} else {
		keysize = 0;
		key = NULL;
	}
	char *prefix;
	uint32_t prefixsize;
	if (arg->vprefix.v) {
		void *vptr = sv_vpointer(arg->vprefix.v);
		prefix = sf_key(vptr, 0);
		prefixsize = sf_keysize(vptr, 0);
	} else {
		prefix = NULL;
		prefixsize = 0;
	}
	if (sslikely(arg->vlsn_generate))
		arg->vlsn = sr_seq(db->r.seq, SR_LSN);
	sitx x;
	si_begin(&x, &db->index, 1);
	siread q;
	si_readopen(&q, &x, arg->cache, arg->order,
	            arg->vlsn,
	            prefix, prefixsize,
	            key, keysize);
	if (arg->update)
		si_readupdate(&q, &arg->vup, arg->update_eq);
	if (arg->cache_only)
		si_readcache_only(&q);
	if (arg->has)
		si_readhas(&q);
	r->rc = si_read(&q);
	r->read_disk  = q.read_disk;
	r->read_cache = q.read_cache;
	r->v = q.result.v;
	si_readclose(&q);
	si_commit(&x);
	return r->rc;
}
sxstate sx_prepare(sx *x, sxpreparef prepare, void *arg)
{
	uint64_t lsn = sr_seq(x->manager->seq, SR_LSN);
	/* proceed read-only transactions */
	if (x->type == SX_RO || sv_logcount_write(x->log) == 0)
		return sx_promote(x, SX_PREPARE);
	ssiter i;
	ss_iterinit(ss_bufiter, &i);
	ss_iteropen(ss_bufiter, &i, &x->log->buf, sizeof(svlogv));
	sxstate rc;
	for (; ss_iterhas(ss_bufiter, &i); ss_iternext(ss_bufiter, &i))
	{
		svlogv *lv = ss_iterof(ss_bufiter, &i);
		sxv *v = lv->ptr;
		if ((int)v->lo == x->log_read)
			break;
		if (sx_vaborted(v))
			return sx_promote(x, SX_ROLLBACK);
		if (sslikely(v->prev == NULL)) {
			rc = sx_preparecb(x, lv, lsn, prepare, arg);
			if (ssunlikely(rc != 0))
				return sx_promote(x, SX_ROLLBACK);
			continue;
		}
		if (sx_vcommitted(v->prev)) {
			if (v->prev->csn > x->csn)
				return sx_promote(x, SX_ROLLBACK);
			continue;
		}
		/* force commit for read-only conflicts */
		sxindex *i = v->prev->index;
		if (sv_vflags(v->prev->v, i->r) & SVGET) {
			rc = sx_preparecb(x, lv, lsn, prepare, arg);
			if (ssunlikely(rc != 0))
				return sx_promote(x, SX_ROLLBACK);
			continue;
		}
		return sx_promote(x, SX_LOCK);
	}
	return sx_promote(x, SX_PREPARE);
}
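The loop above encodes a per-key commit rule: roll back if the previous version of a written key belongs to a transaction that committed after this one began, pass over pure reads, and report a lock for an uncommitted concurrent writer. A compact restatement of that decision, with hypothetical helper names (decide, prev_is_read; only sx_vcommitted() and the csn comparison come from the snippet):

/* Restatement of the conflict check applied to each write in the
 * transaction log during prepare. */
typedef enum { DECIDE_OK, DECIDE_ROLLBACK, DECIDE_LOCK } decision;

static decision decide(sxv *prev, uint64_t my_csn, int prev_is_read)
{
	if (prev == NULL)
		return DECIDE_OK;       /* no conflicting writer */
	if (sx_vcommitted(prev))
		return prev->csn > my_csn
			? DECIDE_ROLLBACK /* committed after we began */
			: DECIDE_OK;
	if (prev_is_read)
		return DECIDE_OK;       /* read-only conflict: proceed */
	return DECIDE_LOCK;             /* uncommitted writer holds the key */
}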
int sc_backupstart(sc *s)
{
	/* begin backup procedure
	 * state 0
	 *
	 * disable log garbage-collection
	*/
	sl_poolgc_enable(s->lp, 0);
	ss_mutexlock(&s->lock);
	if (ssunlikely(s->backup > 0)) {
		ss_mutexunlock(&s->lock);
		sl_poolgc_enable(s->lp, 1);
		/* in progress */
		return 1;
	}
	uint64_t bsn = sr_seq(s->r->seq, SR_BSNNEXT);
	s->backup = 1;
	s->backup_bsn = bsn;
	ss_mutexunlock(&s->lock);
	return 0;
}
sxstate sx_prepare(sx *x, sxpreparef prepare, void *arg)
{
	uint64_t lsn = sr_seq(x->manager->r->seq, SR_LSN);
	/* proceed read-only transactions */
	if (x->type == SXRO || sv_logcount_write(&x->log) == 0)
		return sx_promote(x, SXPREPARE);
	ssiter i;
	ss_iterinit(ss_bufiter, &i);
	ss_iteropen(ss_bufiter, &i, &x->log.buf, sizeof(svlogv));
	for (; ss_iterhas(ss_bufiter, &i); ss_iternext(ss_bufiter, &i))
	{
		svlogv *lv = ss_iterof(ss_bufiter, &i);
		sxv *v = lv->v.v;
		if ((int)v->lo == x->log_read)
			break;
		if (sx_vaborted(v))
			return sx_promote(x, SXROLLBACK);
		if (sslikely(v->prev == NULL)) {
			if (prepare && lsn != x->vlsn) {
				sxindex *i = v->index;
				if (prepare(x, &lv->v, arg, i->ptr))
					return sx_promote(x, SXROLLBACK);
			}
			continue;
		}
		if (sx_vcommitted(v->prev)) {
			if (v->prev->csn > x->csn)
				return sx_promote(x, SXROLLBACK);
			continue;
		}
		/* force commit for read-only conflicts */
		if (v->prev->v->flags & SVGET)
			continue;
		return sx_promote(x, SXLOCK);
	}
	return sx_promote(x, SXPREPARE);
}
sxstate sx_get_autocommit(sxmanager *m, sxindex *index)
{
	(void)m;
	sr_seq(index->r->seq, SR_TSNNEXT);
	return SX_COMMIT;
}
int se_scheduler_branch(void *arg)
{
	sedb *db = arg;
	se *e = se_of(&db->o);
	srzone *z = se_zoneof(e);
	seworker *w = se_workerpool_pop(&e->sched.workers, &e->r);
	if (ssunlikely(w == NULL))
		return -1;
	int rc;
	while (1) {
		uint64_t vlsn = sx_vlsn(&e->xm);
		uint64_t vlsn_lru = si_lru_vlsn(&db->index);
		siplan plan = {
			.explain = SI_ENONE,
			.plan    = SI_BRANCH,
			.a       = z->branch_wm,
			.b       = 0,
			.c       = 0,
			.node    = NULL
		};
		rc = si_plan(&db->index, &plan);
		if (rc == 0)
			break;
		rc = si_execute(&db->index, &w->dc, &plan, vlsn, vlsn_lru);
		if (ssunlikely(rc == -1))
			break;
	}
	se_workerpool_push(&e->sched.workers, w);
	return rc;
}

int se_scheduler_compact(void *arg)
{
	sedb *db = arg;
	se *e = se_of(&db->o);
	srzone *z = se_zoneof(e);
	seworker *w = se_workerpool_pop(&e->sched.workers, &e->r);
	if (ssunlikely(w == NULL))
		return -1;
	int rc;
	while (1) {
		uint64_t vlsn = sx_vlsn(&e->xm);
		uint64_t vlsn_lru = si_lru_vlsn(&db->index);
		siplan plan = {
			.explain = SI_ENONE,
			.plan    = SI_COMPACT,
			.a       = z->compact_wm,
			.b       = z->compact_mode,
			.c       = 0,
			.node    = NULL
		};
		rc = si_plan(&db->index, &plan);
		if (rc == 0)
			break;
		rc = si_execute(&db->index, &w->dc, &plan, vlsn, vlsn_lru);
		if (ssunlikely(rc == -1))
			break;
	}
	se_workerpool_push(&e->sched.workers, w);
	return rc;
}

int se_scheduler_compact_index(void *arg)
{
	sedb *db = arg;
	se *e = se_of(&db->o);
	srzone *z = se_zoneof(e);
	seworker *w = se_workerpool_pop(&e->sched.workers, &e->r);
	if (ssunlikely(w == NULL))
		return -1;
	int rc;
	while (1) {
		uint64_t vlsn = sx_vlsn(&e->xm);
		uint64_t vlsn_lru = si_lru_vlsn(&db->index);
		siplan plan = {
			.explain = SI_ENONE,
			.plan    = SI_COMPACT_INDEX,
			.a       = z->branch_wm,
			.b       = 0,
			.c       = 0,
			.node    = NULL
		};
		rc = si_plan(&db->index, &plan);
		if (rc == 0)
			break;
		rc = si_execute(&db->index, &w->dc, &plan, vlsn, vlsn_lru);
		if (ssunlikely(rc == -1))
			break;
	}
	se_workerpool_push(&e->sched.workers, w);
	return rc;
}

int se_scheduler_anticache(void *arg)
{
	se *o = arg;
	sescheduler *s = &o->sched;
	uint64_t asn = sr_seq(&o->seq, SR_ASNNEXT);
	ss_mutexlock(&s->lock);
	s->anticache_asn = asn;
	s->anticache_storage = o->conf.anticache;
	s->anticache = 1;
	ss_mutexunlock(&s->lock);
	return 0;
}
static int se_schedule(sescheduler *s, setask *task, seworker *w)
{
	ss_trace(&w->trace, "%s", "schedule");
	si_planinit(&task->plan);

	uint64_t now = ss_utime();
	se *e = (se*)s->env;
	sedb *db;
	srzone *zone = se_zoneof(e);
	assert(zone != NULL);

	task->checkpoint_complete = 0;
	task->backup_complete = 0;
	task->rotate = 0;
	task->req = 0;
	task->gc = 0;
	task->db = NULL;

	ss_mutexlock(&s->lock);

	/* asynchronous reqs dispatcher */
	if (s->req == 0) {
		switch (zone->async) {
		case 2:
			if (se_reqqueue(e) == 0)
				break;
			/* fallthrough */
		case 1:
			s->req = 1;
			task->req = zone->async;
			ss_mutexunlock(&s->lock);
			return 0;
		}
	}

	/* log gc and rotation */
	if (s->rotate == 0) {
		task->rotate = 1;
		s->rotate = 1;
	}

	/* checkpoint */
	int in_progress = 0;
	int rc;
checkpoint:
	if (s->checkpoint) {
		task->plan.plan = SI_CHECKPOINT;
		task->plan.a = s->checkpoint_lsn;
		rc = se_schedule_plan(s, &task->plan, &db);
		switch (rc) {
		case 1:
			s->workers_branch++;
			se_dbref(db, 1);
			task->db = db;
			task->gc = 1;
			ss_mutexunlock(&s->lock);
			return 1;
		case 2: /* work in progress */
			in_progress = 1;
			break;
		case 0: /* complete checkpoint */
			s->checkpoint = 0;
			s->checkpoint_lsn_last = s->checkpoint_lsn;
			s->checkpoint_lsn = 0;
			task->checkpoint_complete = 1;
			break;
		}
	}

	/* apply zone policy */
	switch (zone->mode) {
	case 0: /* compact_index */
	case 1: /* compact_index + branch_count prio */
		assert(0);
		break;
	case 2: /* checkpoint */
	{
		if (in_progress) {
			ss_mutexunlock(&s->lock);
			return 0;
		}
		uint64_t lsn = sr_seq(&e->seq, SR_LSN);
		s->checkpoint_lsn = lsn;
		s->checkpoint = 1;
		goto checkpoint;
	}
	default: /* branch + compact */
		assert(zone->mode == 3);
	}

	/* database shutdown-drop */
	if (s->workers_gc_db < zone->gc_db_prio) {
		ss_spinlock(&e->dblock);
		db = NULL;
		if (ssunlikely(e->db_shutdown.n > 0)) {
			db = (sedb*)so_listfirst(&e->db_shutdown);
			if (se_dbgarbage(db)) {
				so_listdel(&e->db_shutdown, &db->o);
			} else {
				db = NULL;
			}
		}
		ss_spinunlock(&e->dblock);
		if (ssunlikely(db)) {
			if (db->dropped)
				task->plan.plan = SI_DROP;
			else
				task->plan.plan = SI_SHUTDOWN;
			s->workers_gc_db++;
			se_dbref(db, 1);
			task->db = db;
			ss_mutexunlock(&s->lock);
			return 1;
		}
	}

	/* backup */
	if (s->backup && (s->workers_backup < zone->backup_prio))
	{
		/* backup procedure.
		 *
		 * state 0 (start)
		 * -------
		 *
		 * a. disable log gc
		 * b. mark to start backup (state 1)
		 *
		 * state 1 (background, delayed start)
		 * -------
		 *
		 * a. create backup_path/<bsn.incomplete> directory
		 * b. create database directories
		 * c. create log directory
		 * d. state 2
		 *
		 * state 2 (background, copy)
		 * -------
		 *
		 * a. schedule and execute node backup which bsn < backup_bsn
		 * b. state 3
		 *
		 * state 3 (background, completion)
		 * -------
		 *
		 * a. rotate log file
		 * b. copy log files
		 * c. enable log gc, schedule gc
		 * d. rename <bsn.incomplete> into <bsn>
		 * e. set last backup, set COMPLETE
		 *
		*/
		if (s->backup == 1) {
			/* state 1 */
			rc = se_backupstart(s);
			if (ssunlikely(rc == -1)) {
				se_backuperror(s);
				goto backup_error;
			}
			s->backup = 2;
		}
		/* state 2 */
		task->plan.plan = SI_BACKUP;
		task->plan.a = s->backup_bsn;
		rc = se_schedule_plan(s, &task->plan, &db);
		switch (rc) {
		case 1:
			s->workers_backup++;
			se_dbref(db, 1);
			task->db = db;
			ss_mutexunlock(&s->lock);
			return 1;
		case 2: /* work in progress */
			break;
		case 0: /* state 3 */
			rc = se_backupcomplete(s, w);
			if (ssunlikely(rc == -1)) {
				se_backuperror(s);
				goto backup_error;
			}
			s->backup_events++;
			task->gc = 1;
			task->backup_complete = 1;
			break;
		}
backup_error:;
	}

	/* garbage-collection */
	if (s->gc) {
		if (s->workers_gc < zone->gc_prio) {
			task->plan.plan = SI_GC;
			task->plan.a = sx_vlsn(&e->xm);
			task->plan.b = zone->gc_wm;
			rc = se_schedule_plan(s, &task->plan, &db);
			switch (rc) {
			case 1:
				s->workers_gc++;
				se_dbref(db, 1);
				task->db = db;
				ss_mutexunlock(&s->lock);
				return 1;
			case 2: /* work in progress */
				break;
			case 0: /* state 3 */
				s->gc = 0;
				s->gc_last = now;
				break;
			}
		}
	} else {
		if (zone->gc_prio && zone->gc_period) {
			if ((now - s->gc_last) >=
			    ((uint64_t)zone->gc_period * 1000000)) {
				s->gc = 1;
			}
		}
	}

	/* index aging */
	if (s->age) {
		if (s->workers_branch < zone->branch_prio) {
			task->plan.plan = SI_AGE;
			task->plan.a = zone->branch_age * 1000000; /* ms */
			task->plan.b = zone->branch_age_wm;
			rc = se_schedule_plan(s, &task->plan, &db);
			switch (rc) {
			case 1:
				s->workers_branch++;
				se_dbref(db, 1);
				task->db = db;
				ss_mutexunlock(&s->lock);
				return 1;
			case 0:
				s->age = 0;
				s->age_last = now;
				break;
			}
		}
	} else {
		if (zone->branch_prio && zone->branch_age_period) {
			if ((now - s->age_last) >=
			    ((uint64_t)zone->branch_age_period * 1000000)) {
				s->age = 1;
			}
		}
	}

	/* branching */
	if (s->workers_branch < zone->branch_prio)
	{
		/* schedule branch task using following
		 * priority:
		 *
		 * a. peek node with the largest in-memory index
		 *    which is equal or greater than the branch
		 *    watermark.
		 *    If nothing is found, stick to b.
		 *
		 * b. peek node with the largest in-memory index,
		 *    which has the oldest update time.
		 *
		 * c. if no branch work is needed, schedule a
		 *    compaction job
		 */
		task->plan.plan = SI_BRANCH;
		task->plan.a = zone->branch_wm;
		rc = se_schedule_plan(s, &task->plan, &db);
		if (rc == 1) {
			s->workers_branch++;
			se_dbref(db, 1);
			task->db = db;
			task->gc = 1;
			ss_mutexunlock(&s->lock);
			return 1;
		}
	}

	/* compaction */
	task->plan.plan = SI_COMPACT;
	task->plan.a = zone->compact_wm;
	task->plan.b = zone->compact_mode;
	rc = se_schedule_plan(s, &task->plan, &db);
	if (rc == 1) {
		se_dbref(db, 1);
		task->db = db;
		ss_mutexunlock(&s->lock);
		return 1;
	}

	ss_mutexunlock(&s->lock);
	return 0;
}
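se_schedule() only selects work: it returns 1 with a referenced db and a filled task->plan, or 0 when nothing is pending. A caller thread still has to run the plan. A minimal sketch of such a driver loop (hypothetical; worker_poll is not from the source, and the real worker thread handles rotation and request side-duties as well):

/* Hypothetical driver loop: poll the scheduler and execute whatever
 * task it hands out. */
static void worker_poll(sescheduler *s, seworker *w)
{
	setask task;
	for (;;) {
		int rc = se_schedule(s, &task, w);
		if (rc == 1) {
			/* execute task.plan against task.db here, then
			 * drop the reference the scheduler took with
			 * se_dbref(db, 1) */
			continue;
		}
		/* rc == 0: idle; sleep, or service task.rotate /
		 * task.req if set */
		break;
	}
}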
int se_scheduler_branch(void *arg)
{
	sedb *db = arg;
	se *e = se_of(&db->o);
	srzone *z = se_zoneof(e);
	seworker stub;
	se_workerstub_init(&stub);
	int rc;
	while (1) {
		uint64_t vlsn = sx_vlsn(&e->xm);
		siplan plan = {
			.explain = SI_ENONE,
			.plan    = SI_BRANCH,
			.a       = z->branch_wm,
			.b       = 0,
			.c       = 0,
			.node    = NULL
		};
		rc = si_plan(&db->index, &plan);
		if (rc == 0)
			break;
		rc = si_execute(&db->index, &stub.dc, &plan, vlsn);
		if (ssunlikely(rc == -1))
			break;
	}
	se_workerstub_free(&stub, &db->r);
	return rc;
}

int se_scheduler_compact(void *arg)
{
	sedb *db = arg;
	se *e = se_of(&db->o);
	srzone *z = se_zoneof(e);
	seworker stub;
	se_workerstub_init(&stub);
	int rc;
	while (1) {
		uint64_t vlsn = sx_vlsn(&e->xm);
		siplan plan = {
			.explain = SI_ENONE,
			.plan    = SI_COMPACT,
			.a       = z->compact_wm,
			.b       = z->compact_mode,
			.c       = 0,
			.node    = NULL
		};
		rc = si_plan(&db->index, &plan);
		if (rc == 0)
			break;
		rc = si_execute(&db->index, &stub.dc, &plan, vlsn);
		if (ssunlikely(rc == -1))
			break;
	}
	se_workerstub_free(&stub, &db->r);
	return rc;
}

int se_scheduler_checkpoint(void *arg)
{
	se *o = arg;
	sescheduler *s = &o->sched;
	uint64_t lsn = sr_seq(&o->seq, SR_LSN);
	ss_mutexlock(&s->lock);
	s->checkpoint_lsn = lsn;
	s->checkpoint = 1;
	ss_mutexunlock(&s->lock);
	return 0;
}
int sc_ctl_branch(sc *s, uint64_t vlsn, si *index)
{
	sr *r = s->r;
	int rc = sr_statusactive(r->status);
	if (ssunlikely(rc == 0))
		return 0;
	srzone *z = sr_zoneof(r);
	scworker *w = sc_workerpool_pop(&s->wp, r);
	if (ssunlikely(w == NULL))
		return -1;
	while (1) {
		uint64_t vlsn_lru = si_lru_vlsn(index);
		siplan plan = {
			.explain = SI_ENONE,
			.plan    = SI_BRANCH,
			.a       = z->branch_wm,
			.b       = 0,
			.c       = 0,
			.node    = NULL
		};
		rc = si_plan(index, &plan);
		if (rc == 0)
			break;
		rc = si_execute(index, &w->dc, &plan, vlsn, vlsn_lru);
		if (ssunlikely(rc == -1))
			break;
	}
	sc_workerpool_push(&s->wp, w);
	return rc;
}

int sc_ctl_compact(sc *s, uint64_t vlsn, si *index)
{
	sr *r = s->r;
	int rc = sr_statusactive(r->status);
	if (ssunlikely(rc == 0))
		return 0;
	srzone *z = sr_zoneof(r);
	scworker *w = sc_workerpool_pop(&s->wp, r);
	if (ssunlikely(w == NULL))
		return -1;
	while (1) {
		uint64_t vlsn_lru = si_lru_vlsn(index);
		siplan plan = {
			.explain = SI_ENONE,
			.plan    = SI_COMPACT,
			.a       = z->compact_wm,
			.b       = z->compact_mode,
			.c       = 0,
			.node    = NULL
		};
		rc = si_plan(index, &plan);
		if (rc == 0)
			break;
		rc = si_execute(index, &w->dc, &plan, vlsn, vlsn_lru);
		if (ssunlikely(rc == -1))
			break;
	}
	sc_workerpool_push(&s->wp, w);
	return rc;
}

int sc_ctl_compact_index(sc *s, uint64_t vlsn, si *index)
{
	sr *r = s->r;
	int rc = sr_statusactive(r->status);
	if (ssunlikely(rc == 0))
		return 0;
	srzone *z = sr_zoneof(r);
	scworker *w = sc_workerpool_pop(&s->wp, r);
	if (ssunlikely(w == NULL))
		return -1;
	while (1) {
		uint64_t vlsn_lru = si_lru_vlsn(index);
		siplan plan = {
			.explain = SI_ENONE,
			.plan    = SI_COMPACT_INDEX,
			.a       = z->branch_wm,
			.b       = 0,
			.c       = 0,
			.node    = NULL
		};
		rc = si_plan(index, &plan);
		if (rc == 0)
			break;
		rc = si_execute(index, &w->dc, &plan, vlsn, vlsn_lru);
		if (ssunlikely(rc == -1))
			break;
	}
	sc_workerpool_push(&s->wp, w);
	return rc;
}

int sc_ctl_anticache(sc *s)
{
	uint64_t asn = sr_seq(s->r->seq, SR_ASNNEXT);
	ss_mutexlock(&s->lock);
	s->anticache_asn = asn;
	s->anticache_storage = s->anticache_limit;
	s->anticache = 1;
	ss_mutexunlock(&s->lock);
	return 0;
}
static inline int
si_split(si *index, sdc *c, ssbuf *result,
         sinode   *parent,
         ssiter   *i,
         uint64_t  size_node,
         uint64_t  size_stream,
         uint32_t  stream,
         uint64_t  vlsn)
{
	sr *r = &index->r;
	uint32_t timestamp = ss_timestamp();
	int rc;
	sdmergeconf mergeconf = {
		.stream              = stream,
		.size_stream         = size_stream,
		.size_node           = size_node,
		.size_page           = index->scheme.compaction.node_page_size,
		.checksum            = index->scheme.compaction.node_page_checksum,
		.expire              = index->scheme.expire,
		.timestamp           = timestamp,
		.compression         = index->scheme.compression,
		.compression_if      = index->scheme.compression_if,
		.direct_io           = index->scheme.direct_io,
		.direct_io_page_size = index->scheme.direct_io_page_size,
		.vlsn                = vlsn
	};
	sinode *n = NULL;
	sdmerge merge;
	rc = sd_mergeinit(&merge, r, i, &c->build, &c->build_index,
	                  &c->upsert, &mergeconf);
	if (ssunlikely(rc == -1))
		return -1;
	while ((rc = sd_merge(&merge)) > 0)
	{
		/* create new node */
		uint64_t id = sr_seq(index->r.seq, SR_NSNNEXT);
		n = si_nodenew(r, id, parent->id);
		if (ssunlikely(n == NULL))
			goto error;
		rc = si_nodecreate(n, r, &index->scheme);
		if (ssunlikely(rc == -1))
			goto error;
		/* write pages */
		uint64_t offset;
		offset = sd_iosize(&c->io, &n->file);
		while ((rc = sd_mergepage(&merge, offset)) == 1) {
			rc = sd_writepage(r, &n->file, &c->io, merge.build);
			if (ssunlikely(rc == -1))
				goto error;
			offset = sd_iosize(&c->io, &n->file);
		}
		if (ssunlikely(rc == -1))
			goto error;
		offset = sd_iosize(&c->io, &n->file);
		rc = sd_mergeend(&merge, offset);
		if (ssunlikely(rc == -1))
			goto error;
		/* write index */
		rc = sd_writeindex(r, &n->file, &c->io, &merge.index);
		if (ssunlikely(rc == -1))
			goto error;
		/* mmap mode */
		if (index->scheme.mmap) {
			rc = si_nodemap(n, r);
			if (ssunlikely(rc == -1))
				goto error;
		}
		/* add node to the list */
		rc = ss_bufadd(result, index->r.a, &n, sizeof(sinode*));
		if (ssunlikely(rc == -1)) {
			sr_oom_malfunction(index->r.e);
			goto error;
		}
		n->index = merge.index;
	}
	if (ssunlikely(rc == -1))
		goto error;
	return 0;
error:
	if (n)
		si_nodefree(n, r, 0);
	sd_mergefree(&merge);
	si_splitfree(result, r);
	return -1;
}

static int
si_merge(si *index, sdc *c, sinode *node, uint64_t vlsn,
         ssiter *stream, uint64_t size_stream, uint32_t n_stream)
{
	sr *r = &index->r;
	ssbuf *result = &c->a;
	ssiter i;

	/* begin compaction.
	 *
	 * split the merge stream into a number
	 * of new nodes.
	 */
	int rc;
	rc = si_split(index, c, result,
	              node, stream,
	              index->scheme.compaction.node_size,
	              size_stream,
	              n_stream,
	              vlsn);
	if (ssunlikely(rc == -1))
		return -1;

	SS_INJECTION(r->i, SS_INJECTION_SI_COMPACTION_0,
	             si_splitfree(result, r);
	             sr_malfunction(r->e, "%s", "error injection");
	             return -1);

	/* mask removal of a single node as a
	 * single node update */
	int count = ss_bufused(result) / sizeof(sinode*);
	int count_index;

	si_lock(index);
	count_index = index->n;
	si_unlock(index);

	sinode *n;
	if (ssunlikely(count == 0 && count_index == 1))
	{
		n = si_bootstrap(index, node->id);
		if (ssunlikely(n == NULL))
			return -1;
		rc = ss_bufadd(result, r->a, &n, sizeof(sinode*));
		if (ssunlikely(rc == -1)) {
			sr_oom_malfunction(r->e);
			si_nodefree(n, r, 1);
			return -1;
		}
		count++;
	}

	/* commit compaction changes */
	si_lock(index);
	svindex *j = si_nodeindex(node);
	si_plannerremove(&index->p, node);
	si_nodesplit(node);
	switch (count) {
	case 0: /* delete */
		si_remove(index, node);
		si_redistribute_index(index, r, c, node);
		break;
	case 1: /* self update */
		n = *(sinode**)result->s;
		n->i0   = *j;
		n->used = j->used;
		si_nodelock(n);
		si_replace(index, node, n);
		si_plannerupdate(&index->p, n);
		break;
	default: /* split */
		rc = si_redistribute(index, r, c, node, result);
		if (ssunlikely(rc == -1)) {
			si_unlock(index);
			si_splitfree(result, r);
			return -1;
		}
		ss_iterinit(ss_bufiterref, &i);
		ss_iteropen(ss_bufiterref, &i, result, sizeof(sinode*));
		n = ss_iterof(ss_bufiterref, &i);
		n->used = n->i0.used;
		si_nodelock(n);
		si_replace(index, node, n);
		si_plannerupdate(&index->p, n);
		for (ss_iternext(ss_bufiterref, &i);
		     ss_iterhas(ss_bufiterref, &i);
		     ss_iternext(ss_bufiterref, &i)) {
			n = ss_iterof(ss_bufiterref, &i);
			n->used = n->i0.used;
			si_nodelock(n);
			si_insert(index, n);
			si_plannerupdate(&index->p, n);
		}
		break;
	}
	sv_indexinit(j);
	si_unlock(index);

	/* compaction completion */

	/* seal nodes */
	ss_iterinit(ss_bufiterref, &i);
	ss_iteropen(ss_bufiterref, &i, result, sizeof(sinode*));
	while (ss_iterhas(ss_bufiterref, &i))
	{
		n = ss_iterof(ss_bufiterref, &i);
		if (index->scheme.sync) {
			rc = ss_filesync(&n->file);
			if (ssunlikely(rc == -1)) {
				sr_malfunction(r->e, "db file '%s' sync error: %s",
				               ss_pathof(&n->file.path),
				               strerror(errno));
				return -1;
			}
		}
		rc = si_noderename_seal(n, r, &index->scheme);
		if (ssunlikely(rc == -1)) {
			si_nodefree(node, r, 0);
			return -1;
		}
		SS_INJECTION(r->i, SS_INJECTION_SI_COMPACTION_3,
		             si_nodefree(node, r, 0);
		             sr_malfunction(r->e, "%s", "error injection");
		             return -1);
		ss_iternext(ss_bufiterref, &i);
	}

	SS_INJECTION(r->i, SS_INJECTION_SI_COMPACTION_1,
	             si_nodefree(node, r, 0);
	             sr_malfunction(r->e, "%s", "error injection");
	             return -1);

	/* gc node */
	uint16_t refs = si_noderefof(node);
	if (sslikely(refs == 0)) {
		rc = si_nodefree(node, r, 1);
		if (ssunlikely(rc == -1))
			return -1;
	} else {
		/* node concurrently being read, schedule for
		 * delayed removal */
		si_nodegc(node, r, &index->scheme);
		si_lock(index);
		ss_listappend(&index->gc, &node->gc);
		index->gc_count++;
		si_unlock(index);
	}

	SS_INJECTION(r->i, SS_INJECTION_SI_COMPACTION_2,
	             sr_malfunction(r->e, "%s", "error injection");
	             return -1);

	/* complete new nodes */
	ss_iterinit(ss_bufiterref, &i);
	ss_iteropen(ss_bufiterref, &i, result, sizeof(sinode*));
	while (ss_iterhas(ss_bufiterref, &i))
	{
		n = ss_iterof(ss_bufiterref, &i);
		rc = si_noderename_complete(n, r, &index->scheme);
		if (ssunlikely(rc == -1))
			return -1;
		SS_INJECTION(r->i, SS_INJECTION_SI_COMPACTION_4,
		             sr_malfunction(r->e, "%s", "error injection");
		             return -1);
		ss_iternext(ss_bufiterref, &i);
	}

	/* unlock */
	si_lock(index);
	ss_iterinit(ss_bufiterref, &i);
	ss_iteropen(ss_bufiterref, &i, result, sizeof(sinode*));
	while (ss_iterhas(ss_bufiterref, &i))
	{
		n = ss_iterof(ss_bufiterref, &i);
		si_nodeunlock(n);
		ss_iternext(ss_bufiterref, &i);
	}
	si_unlock(index);
	return 0;
}
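si_merge() makes the node swap crash-safe with a two-phase rename: new node files are first renamed to a "seal" state, only then is the old node removed, and finally the sealed files are renamed to their permanent names. A minimal sketch of the idea, assuming hypothetical file-name suffixes (the real naming lives inside si_noderename_seal()/si_noderename_complete() and may differ):

/* Hypothetical illustration: an interrupted compaction leaves either
 * .incomplete or .seal files behind, so recovery can tell how far the
 * swap had progressed and either discard the new files or finish the
 * rename. */
#include <stdio.h>
#include <stdint.h>

static int node_seal(uint64_t id)
{
	char from[64], to[64];
	snprintf(from, sizeof(from), "%020llu.db.incomplete",
	         (unsigned long long)id);
	snprintf(to, sizeof(to), "%020llu.db.seal",
	         (unsigned long long)id);
	return rename(from, to);
}

static int node_complete(uint64_t id)
{
	char from[64], to[64];
	snprintf(from, sizeof(from), "%020llu.db.seal",
	         (unsigned long long)id);
	snprintf(to, sizeof(to), "%020llu.db",
	         (unsigned long long)id);
	return rename(from, to);
}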
static int se_dbscheme_init(sedb *db, char *name)
{
	se *e = se_of(&db->o);
	/* prepare index scheme */
	sischeme *scheme = &db->scheme;
	scheme->name = ss_strdup(&e->a, name);
	if (ssunlikely(scheme->name == NULL))
		goto e0;
	scheme->id                    = sr_seq(&e->seq, SR_DSNNEXT);
	scheme->sync                  = 2;
	scheme->mmap                  = 0;
	scheme->storage               = SI_SCACHE;
	scheme->cache_mode            = 0;
	scheme->cache_sz              = NULL;
	scheme->node_size             = 64 * 1024 * 1024;
	scheme->node_compact_load     = 0;
	scheme->node_page_size        = 128 * 1024;
	scheme->node_page_checksum    = 1;
	scheme->compression_key       = 0;
	scheme->compression           = 0;
	scheme->compression_if        = &ss_nonefilter;
	scheme->compression_branch    = 0;
	scheme->compression_branch_if = &ss_nonefilter;
	scheme->amqf                  = 0;
	scheme->fmt                   = SF_KV;
	scheme->fmt_storage           = SF_SRAW;
	scheme->path_fail_on_exists   = 0;
	scheme->path_fail_on_drop     = 1;
	scheme->lru                   = 0;
	scheme->lru_step              = 128 * 1024;
	scheme->buf_gc_wm             = 1024 * 1024;
	scheme->storage_sz = ss_strdup(&e->a, "cache");
	if (ssunlikely(scheme->storage_sz == NULL))
		goto e1;
	scheme->compression_sz =
		ss_strdup(&e->a, scheme->compression_if->name);
	if (ssunlikely(scheme->compression_sz == NULL))
		goto e1;
	scheme->compression_branch_sz =
		ss_strdup(&e->a, scheme->compression_branch_if->name);
	if (ssunlikely(scheme->compression_branch_sz == NULL))
		goto e1;
	sf_upsertinit(&scheme->fmt_upsert);
	scheme->fmt_sz = ss_strdup(&e->a, "kv");
	if (ssunlikely(scheme->fmt_sz == NULL))
		goto e1;
	/* init single key part as string */
	int rc;
	sr_schemeinit(&scheme->scheme);
	srkey *part = sr_schemeadd(&scheme->scheme);
	rc = sr_keysetname(part, &e->a, "key");
	if (ssunlikely(rc == -1))
		goto e1;
	rc = sr_keyset(part, &e->a, "string");
	if (ssunlikely(rc == -1))
		goto e1;
	return 0;
e1:
	si_schemefree(&db->scheme, &db->r);
e0:
	sr_oom(&e->error);
	return -1;
}