/*
 * Redistribute the versions held in a node's in-memory index.
 *
 * Pass 1 walks the node index (SS_GTE, no key) and appends the
 * pointer-sized v->v field of every entry to the scratch buffer c->b.
 * Pass 2 iterates the buffered pointers and passes each one to
 * si_redistribute_set() together with a single timestamp.
 *
 * Returns 0 on success (also when nothing was collected). If the
 * buffer append fails, returns the result of sr_oom_malfunction().
 */
static int
si_redistribute_index(si *index, sr *r, sdc *c, sinode *node)
{
	svindex *source = si_nodeindex(node);
	ssiter it;

	/* pass 1: collect version pointers */
	ss_iterinit(sv_indexiter, &it);
	ss_iteropen(sv_indexiter, &it, r, source, SS_GTE, NULL, 0);
	while (ss_iterhas(sv_indexiter, &it)) {
		sv *entry = ss_iterof(sv_indexiter, &it);
		if (ssunlikely(ss_bufadd(&c->b, r->a, &entry->v, sizeof(svv**)) == -1))
			return sr_oom_malfunction(r->e);
		ss_iternext(sv_indexiter, &it);
	}

	/* nothing collected, nothing to redistribute */
	if (ssunlikely(ss_bufused(&c->b) == 0))
		return 0;

	/* pass 2: redistribute, one timestamp for the whole batch */
	uint64_t now = ss_utime();
	ss_iterinit(ss_bufiterref, &it);
	ss_iteropen(ss_bufiterref, &it, &c->b, sizeof(svv*));
	while (ss_iterhas(ss_bufiterref, &it)) {
		svv *version = ss_iterof(ss_bufiterref, &it);
		si_redistribute_set(index, r, now, version);
		ss_iternext(ss_bufiterref, &it);
	}
	return 0;
}
/*
 * Execute a write request: prepare the log with the request lsn,
 * write it through a log transaction (skipped when arg->recover is
 * set), then apply every log-index entry to its database index.
 *
 * Returns 0 on success. If sl_write() fails, the log transaction is
 * rolled back, r->rc is set to -1 and -1 is returned.
 */
int se_reqwrite(sereq *r)
{
	sereqarg *arg = &r->arg;
	svlog *log = arg->log;
	se *e = se_of(r->object);

	/* set lsn */
	sl_prepare(&e->lp, log, arg->lsn);

	/* log write */
	if (! arg->recover) {
		sltx tl;
		sl_begin(&e->lp, &tl);
		if (ssunlikely(sl_write(&tl, log) == -1)) {
			sl_rollback(&tl);
			r->rc = -1;
			return -1;
		}
		sl_commit(&tl);
	}

	/* commit */
	if (sslikely(arg->vlsn_generate))
		arg->vlsn = sx_vlsn(&e->xm);
	uint64_t now = ss_utime();

	/* one index transaction per log-index entry */
	svlogindex *cur  = (svlogindex*)log->index.s;
	svlogindex *last = (svlogindex*)log->index.p;
	for (; cur < last; cur++) {
		sedb *db = cur->ptr;
		sitx ti;
		si_begin(&ti, &db->index, arg->vlsn, now, log, cur);
		si_write(&ti, arg->recover);
		si_commit(&ti);
	}
	return 0;
}
/*
 * Reset a scheduler task before use: record the start time, bind the
 * worker and vlsn, clear the database pointer and the rotate/gc/backup
 * flags, and re-initialize the embedded plan.
 */
static inline void
sc_taskbegin(sctask *task, scworker *w, uint64_t vlsn)
{
	task->time = ss_utime();

	/* caller-supplied context */
	task->vlsn = vlsn;
	task->w    = w;

	/* nothing selected yet */
	task->db     = NULL;
	task->backup = 0;
	task->gc     = 0;
	task->rotate = 0;

	si_planinit(&task->plan);
}
/*
 * Put a scheduler into its initial state.
 *
 * All counters, sequence numbers and state flags are zeroed, every
 * "*_last" timestamp is set to the current ss_utime() value, the
 * environment pointer is stored, and the thread pool and worker pool
 * are initialized. Always returns 0.
 */
int se_scheduler_init(sescheduler *s, so *env)
{
	uint64_t now = ss_utime();
	ss_mutexinit(&s->lock);

	/* environment and database list */
	s->env   = env;
	s->i     = NULL;
	s->count = 0;
	s->rr    = 0;

	/* per-kind active worker counters */
	s->workers_branch = 0;
	s->workers_backup = 0;
	s->workers_gc     = 0;
	s->workers_gc_db  = 0;
	s->workers_lru    = 0;

	/* log rotation and request dispatch flags */
	s->rotate = 0;
	s->req    = 0;

	/* checkpoint */
	s->checkpoint          = 0;
	s->checkpoint_lsn      = 0;
	s->checkpoint_lsn_last = 0;

	/* index aging */
	s->age      = 0;
	s->age_last = now;

	/* backup */
	s->backup                   = 0;
	s->backup_bsn               = 0;
	s->backup_bsn_last          = 0;
	s->backup_bsn_last_complete = 0;
	s->backup_events            = 0;

	/* anti-cache */
	s->anticache          = 0;
	s->anticache_asn      = 0;
	s->anticache_asn_last = 0;
	s->anticache_storage  = 0;
	s->anticache_last     = now;

	/* snapshot */
	s->snapshot          = 0;
	s->snapshot_ssn      = 0;
	s->snapshot_ssn_last = 0;
	s->snapshot_last     = now;

	/* garbage collection */
	s->gc      = 0;
	s->gc_last = now;

	/* lru */
	s->lru      = 0;
	s->lru_last = now;

	ss_threadpool_init(&s->tp);
	se_workerpool_init(&s->workers);
	return 0;
}
/*
 * Wrap the outcome of a finished index read into a new document.
 *
 * The document is created from r->result and receives the read's
 * disk/cache counters. When a value was found, the read latency is
 * computed from r->read_start and reported through sr_statget();
 * otherwise the latency is 0. Order, prefix and cold_only settings
 * of the read are carried over to the document.
 *
 * Returns the document's object header, or NULL if the document
 * could not be created.
 */
static inline so*
se_readresult(se *e, siread *r)
{
	sedocument *doc =
		(sedocument*)se_document_new(e, r->index->object, &r->result);
	if (ssunlikely(doc == NULL))
		return NULL;

	/* read statistics */
	doc->read_disk  = r->read_disk;
	doc->read_cache = r->read_cache;
	if (r->result.v) {
		doc->read_latency = ss_utime() - r->read_start;
		sr_statget(&e->stat,
		           doc->read_latency,
		           doc->read_disk,
		           doc->read_cache);
	} else {
		doc->read_latency = 0;
	}

	/* carry the read's order over to the result; the inclusive
	 * orders are replaced by their exclusive counterparts */
	doc->orderset = 1;
	doc->order = r->order;
	if (doc->order == SS_GTE) {
		doc->order = SS_GT;
	} else if (doc->order == SS_LTE) {
		doc->order = SS_LT;
	}

	/* set prefix */
	if (r->prefix) {
		doc->prefix      = r->prefix;
		doc->prefix_copy = r->prefix;
		doc->prefix_size = r->prefix_size;
	}

	doc->cold_only = r->cold_only;
	doc->created   = 1;
	doc->flagset   = 1;
	return &doc->o;
}
/*
 * Pick a node for an age-based plan.
 *
 * Scans the planner's branch queue with ss_rqprev() and selects the
 * first unlocked node whose in-memory usage is >= plan->b and whose
 * time since last update is >= plan->a.
 *
 * On a match the node is locked, stored in plan->node, plan->explain
 * is set to SI_EINDEX_AGE and 1 is returned. Returns 0 when no node
 * qualifies.
 */
static inline int
si_plannerpeek_age(siplanner *p, siplan *plan)
{
	uint64_t now = ss_utime();
	ssrqnode *pos = NULL;

	/* full scan */
	while ((pos = ss_rqprev(&p->branch, pos))) {
		sinode *candidate = sscast(pos, sinode, nodebranch);
		if (candidate->flags & SI_LOCK)
			continue;
		if (candidate->used >= plan->b &&
		    ((now - candidate->update_time) >= plan->a)) {
			si_nodelock(candidate);
			plan->explain = SI_EINDEX_AGE;
			plan->node    = candidate;
			return 1;
		}
	}
	return 0;
}
/*
 * Select the next piece of background work for a worker.
 *
 * The scheduler lock is held for the whole decision and released on
 * every return path. Candidates are tried in this order: asynchronous
 * request dispatch, log rotation flag, checkpoint, zone policy,
 * database shutdown/drop, backup, garbage collection, index aging,
 * branching and finally compaction.
 *
 * s    - scheduler state (protected by s->lock)
 * task - output; reset here and filled with the chosen plan, the
 *        target database and the rotate/req/gc/*_complete flags
 * w    - worker asking for work (used for tracing and backup completion)
 *
 * Returns 1 when a plan for a database was chosen; task->db is then set
 * and referenced with se_dbref(). Returns 0 otherwise. Flags in task
 * (req, rotate, gc, checkpoint_complete, backup_complete) may still be
 * set when 0 is returned.
 */
static int
se_schedule(sescheduler *s, setask *task, seworker *w)
{
	ss_trace(&w->trace, "%s", "schedule");
	si_planinit(&task->plan);

	uint64_t now = ss_utime();
	se *e = (se*)s->env;
	sedb *db;
	srzone *zone = se_zoneof(e);
	assert(zone != NULL);

	task->checkpoint_complete = 0;
	task->backup_complete = 0;
	task->rotate = 0;
	task->req = 0;
	task->gc = 0;
	task->db = NULL;

	ss_mutexlock(&s->lock);

	/* asynchronous reqs dispatcher */
	if (s->req == 0) {
		switch (zone->async) {
		case 2:
			if (se_reqqueue(e) == 0)
				break;
			/* fallthrough: requests are queued, dispatch as in mode 1 */
		case 1:
			s->req = 1;
			task->req = zone->async;
			ss_mutexunlock(&s->lock);
			return 0;
		}
	}

	/* log gc and rotation */
	if (s->rotate == 0) {
		task->rotate = 1;
		s->rotate = 1;
	}

	/* checkpoint */
	int in_progress = 0;
	int rc;
checkpoint:
	if (s->checkpoint) {
		task->plan.plan = SI_CHECKPOINT;
		task->plan.a = s->checkpoint_lsn;
		rc = se_schedule_plan(s, &task->plan, &db);
		switch (rc) {
		case 1:
			s->workers_branch++;
			se_dbref(db, 1);
			task->db = db;
			task->gc = 1;
			ss_mutexunlock(&s->lock);
			return 1;
		case 2: /* work in progress */
			in_progress = 1;
			break;
		case 0: /* complete checkpoint */
			s->checkpoint = 0;
			s->checkpoint_lsn_last = s->checkpoint_lsn;
			s->checkpoint_lsn = 0;
			task->checkpoint_complete = 1;
			break;
		}
	}

	/* apply zone policy */
	switch (zone->mode) {
	case 0: /* compact_index */
	case 1: /* compact_index + branch_count prio */
		assert(0);
		break;
	case 2: /* checkpoint */
	{
		if (in_progress) {
			ss_mutexunlock(&s->lock);
			return 0;
		}
		/* start a new checkpoint at the current lsn and retry */
		uint64_t lsn = sr_seq(&e->seq, SR_LSN);
		s->checkpoint_lsn = lsn;
		s->checkpoint = 1;
		goto checkpoint;
	}
	default: /* branch + compact */
		assert(zone->mode == 3);
	}

	/* database shutdown-drop */
	if (s->workers_gc_db < zone->gc_db_prio) {
		ss_spinlock(&e->dblock);
		db = NULL;
		if (ssunlikely(e->db_shutdown.n > 0)) {
			db = (sedb*)so_listfirst(&e->db_shutdown);
			if (se_dbgarbage(db)) {
				so_listdel(&e->db_shutdown, &db->o);
			} else {
				db = NULL;
			}
		}
		ss_spinunlock(&e->dblock);
		if (ssunlikely(db)) {
			if (db->dropped)
				task->plan.plan = SI_DROP;
			else
				task->plan.plan = SI_SHUTDOWN;
			s->workers_gc_db++;
			se_dbref(db, 1);
			task->db = db;
			ss_mutexunlock(&s->lock);
			return 1;
		}
	}

	/* backup */
	if (s->backup && (s->workers_backup < zone->backup_prio)) {
		/* backup procedure.
		 *
		 * state 0 (start)
		 * -------
		 *
		 * a. disable log gc
		 * b. mark to start backup (state 1)
		 *
		 * state 1 (background, delayed start)
		 * -------
		 *
		 * a. create backup_path/<bsn.incomplete> directory
		 * b. create database directories
		 * c. create log directory
		 * d. state 2
		 *
		 * state 2 (background, copy)
		 * -------
		 *
		 * a. schedule and execute node backup which bsn < backup_bsn
		 * b. state 3
		 *
		 * state 3 (background, completion)
		 * -------
		 *
		 * a. rotate log file
		 * b. copy log files
		 * c. enable log gc, schedule gc
		 * d. rename <bsn.incomplete> into <bsn>
		 * e. set last backup, set COMPLETE
		 *
		 */
		if (s->backup == 1) {
			/* state 1 */
			rc = se_backupstart(s);
			if (ssunlikely(rc == -1)) {
				se_backuperror(s);
				goto backup_error;
			}
			s->backup = 2;
		}
		/* state 2 */
		task->plan.plan = SI_BACKUP;
		task->plan.a = s->backup_bsn;
		rc = se_schedule_plan(s, &task->plan, &db);
		switch (rc) {
		case 1:
			s->workers_backup++;
			se_dbref(db, 1);
			task->db = db;
			ss_mutexunlock(&s->lock);
			return 1;
		case 2: /* work in progress */
			break;
		case 0: /* state 3 */
			rc = se_backupcomplete(s, w);
			if (ssunlikely(rc == -1)) {
				se_backuperror(s);
				goto backup_error;
			}
			s->backup_events++;
			task->gc = 1;
			task->backup_complete = 1;
			break;
		}
backup_error:;
	}

	/* garbage-collection */
	if (s->gc) {
		if (s->workers_gc < zone->gc_prio) {
			task->plan.plan = SI_GC;
			task->plan.a = sx_vlsn(&e->xm);
			task->plan.b = zone->gc_wm;
			rc = se_schedule_plan(s, &task->plan, &db);
			switch (rc) {
			case 1:
				s->workers_gc++;
				se_dbref(db, 1);
				task->db = db;
				ss_mutexunlock(&s->lock);
				return 1;
			case 2: /* work in progress */
				break;
			case 0: /* gc round complete */
				s->gc = 0;
				s->gc_last = now;
				break;
			}
		}
	} else {
		/* arm the next gc round once gc_period has elapsed */
		if (zone->gc_prio && zone->gc_period) {
			if ( (now - s->gc_last) >= ((uint64_t)zone->gc_period * 1000000) ) {
				s->gc = 1;
			}
		}
	}

	/* index aging */
	if (s->age) {
		if (s->workers_branch < zone->branch_prio) {
			task->plan.plan = SI_AGE;
			/* Scaled by 1000000 like the *_period checks in this
			 * function; widened first so the product is not computed
			 * (and wrapped) in a narrower type.
			 * NOTE(review): the factor suggests ss_utime() counts
			 * microseconds, the old note here said "ms" - confirm. */
			task->plan.a = (uint64_t)zone->branch_age * 1000000;
			task->plan.b = zone->branch_age_wm;
			rc = se_schedule_plan(s, &task->plan, &db);
			switch (rc) {
			case 1:
				s->workers_branch++;
				se_dbref(db, 1);
				task->db = db;
				ss_mutexunlock(&s->lock);
				return 1;
			case 0: /* aging round complete */
				s->age = 0;
				s->age_last = now;
				break;
			}
		}
	} else {
		/* arm the next aging round once branch_age_period has elapsed */
		if (zone->branch_prio && zone->branch_age_period) {
			if ( (now - s->age_last) >= ((uint64_t)zone->branch_age_period * 1000000) ) {
				s->age = 1;
			}
		}
	}

	/* branching */
	if (s->workers_branch < zone->branch_prio) {
		/* schedule branch task using following
		 * priority:
		 *
		 * a. peek node with the largest in-memory index
		 *    which is equal or greater than branch
		 *    watermark.
		 *    If nothing is found, stick to b.
		 *
		 * b. peek node with the largest in-memory index,
		 *    which has oldest update time.
		 *
		 * c. if no branch work is needed, schedule a
		 *    compaction job
		 *
		 */
		task->plan.plan = SI_BRANCH;
		task->plan.a = zone->branch_wm;
		rc = se_schedule_plan(s, &task->plan, &db);
		if (rc == 1) {
			s->workers_branch++;
			se_dbref(db, 1);
			task->db = db;
			task->gc = 1;
			ss_mutexunlock(&s->lock);
			return 1;
		}
	}

	/* compaction */
	task->plan.plan = SI_COMPACT;
	task->plan.a = zone->compact_wm;
	task->plan.b = zone->compact_mode;
	rc = se_schedule_plan(s, &task->plan, &db);
	if (rc == 1) {
		se_dbref(db, 1);
		task->db = db;
		ss_mutexunlock(&s->lock);
		return 1;
	}

	ss_mutexunlock(&s->lock);
	return 0;
}
/*
 * Read a document from a database.
 *
 * db    - database to read from
 * o     - request document (key, order, prefix, cold_only); destroyed
 *         before returning on every path when o->created <= 1
 * x     - optional transaction; when set and the order is SS_EQ, the
 *         transaction's own changes are consulted first via sx_get()
 * vlsn  - version lsn passed to the index read
 * cache - optional read cache; when NULL one is taken from the
 *         environment cache pool and pushed back afterwards
 *
 * Returns the result document's object header, or NULL when nothing
 * was found, the key was deleted in the transaction, or an error
 * occurred.
 */
so *se_read(sedb *db, sedocument *o, sx *x, uint64_t vlsn, sicache *cache)
{
	se *e = se_of(&db->o);
	uint64_t start = ss_utime();

	/* prepare the key */
	int auto_close = o->created <= 1;
	int rc = se_document_createkey(o);
	if (ssunlikely(rc == -1))
		goto error;
	rc = se_document_validate_ro(o, &db->o);
	if (ssunlikely(rc == -1))
		goto error;
	if (ssunlikely(! se_active(e)))
		goto error;

	sv vup;
	sv_init(&vup, &sv_vif, NULL, NULL);

	sedocument *ret = NULL;

	/* concurrent */
	if (x && o->order == SS_EQ) {
		/* note: prefix is ignored during concurrent
		 * index search */
		rc = sx_get(x, &db->coindex, &o->v, &vup);
		if (ssunlikely(rc == -1 || rc == 2 /* delete */))
			goto error;
		if (rc == 1 && !sv_is(&vup, SVUPSERT)) {
			/* the transaction itself holds the value: answer
			 * from it without touching the index */
			ret = (sedocument*)se_document_new(e, &db->o, &vup);
			if (sslikely(ret)) {
				ret->cold_only = o->cold_only;
				ret->created   = 1;
				ret->orderset  = 1;
				ret->flagset   = 1;
			} else {
				sv_vunref(db->r, vup.v);
			}
			if (auto_close)
				so_destroy(&o->o);
			/* do not form &ret->o from a null pointer */
			return ret ? &ret->o : NULL;
		}
	} else {
		sx_get_autocommit(&e->xm, &db->coindex);
	}

	/* prepare read cache */
	int cachegc = 0;
	if (cache == NULL) {
		cachegc = 1;
		cache = si_cachepool_pop(&e->cachepool);
		if (ssunlikely(cache == NULL)) {
			if (vup.v)
				sv_vunref(db->r, vup.v);
			sr_oom(&e->error);
			goto error;
		}
	}

	/* hold the key for the duration of the read; released in cleanup */
	sv_vref(o->v.v);

	/* do read */
	siread rq;
	si_readopen(&rq, db->index, cache, o->order,
	            vlsn,
	            sv_pointer(&o->v),
	            vup.v,
	            o->prefix_copy,
	            o->prefix_size,
	            o->cold_only,
	            0,
	            start);
	rc = si_read(&rq);
	si_readclose(&rq);

	/* prepare result */
	if (rc == 1) {
		ret = (sedocument*)se_readresult(e, &rq);
		/* se_readresult() stored the prefix pointer in the result
		 * document, so the request must no longer reference it */
		if (ret)
			o->prefix_copy = NULL;
	}

	/* cleanup */
	if (o->v.v)
		sv_vunref(db->r, o->v.v);
	if (vup.v)
		sv_vunref(db->r, vup.v);
	if (ret == NULL && rq.result.v)
		sv_vunref(db->r, rq.result.v);
	if (cachegc && cache)
		si_cachepool_push(cache);
	if (auto_close)
		so_destroy(&o->o);
	/* ret stays NULL when nothing was found or the result document
	 * could not be created */
	return ret ? &ret->o : NULL;

error:
	if (auto_close)
		so_destroy(&o->o);
	return NULL;
}