/*
 * Scheduler job: run sl_poolgc() on the environment's lp member.
 *
 * Returns 0 on success and -1 when sl_poolgc() reports -1.
 */
static int
se_gc(sescheduler *s, seworker *w)
{
	ss_trace(&w->trace, "%s", "log gc");
	se *env = (se*)s->env;
	if (ssunlikely(sl_poolgc(&env->lp) == -1))
		return -1;
	return 0;
}
/*
 * Create a cursor that reads at the snapshot's vlsn.
 *
 * The first variadic argument is inspected through a copy of the list, so
 * `args` itself is handed to so_cursornew() untouched. That argument must
 * be an SOV object whose parent is an SODB object.
 *
 * Returns whatever so_cursornew() returns, or NULL after recording
 * "bad arguments" in the environment error.
 */
static void*
so_snapshotcursor(srobj *o, va_list args)
{
	sosnapshot *snapshot = (sosnapshot*)o;
	so *env = so_of(o);

	/* peek at the first argument without consuming it from args */
	va_list peek;
	va_copy(peek, args);
	sov *key = va_arg(peek, sov*);
	va_end(peek);

	int valid = key->o.id == SOV &&
	            key->parent != NULL &&
	            key->parent->id == SODB;
	if (ssunlikely(! valid)) {
		sr_error(&env->error, "%s", "bad arguments");
		return NULL;
	}
	return so_cursornew((sodb*)key->parent, snapshot->vlsn, 0, args);
}
/*
 * Start the scheduler thread pool: conf.threads threads, each running
 * se_worker with the environment as its argument.
 *
 * Returns 0 on success, -1 if ss_threadpool_new() fails.
 */
int
se_scheduler_run(sescheduler *s)
{
	se *env = (se*)s->env;
	int status = ss_threadpool_new(&s->tp, &env->a, env->conf.threads,
	                               se_worker, env);
	return ssunlikely(status == -1) ? -1 : 0;
}
/*
 * Rebuild the branch list of a node from its file.
 *
 * Every index header produced by the sd_recover iterator becomes one
 * branch: the first header fills the node's embedded `self` branch, each
 * following one gets a branch from si_branchnew(). Branches are pushed to
 * the head of n->branch and counted in n->branch_count.
 *
 * Returns 0 on success, -1 on failure; the iterator is closed either way.
 *
 * NOTE(review): when sd_indexcopy() fails for a branch that came from
 * si_branchnew(), that branch has not been linked into the node and is
 * not released here -- confirm whether this leaks.
 */
static inline int
si_noderecover(sinode *n, sr *r)
{
	ssiter it;
	ss_iterinit(sd_recover, &it);
	ss_iteropen(sd_recover, &it, r, &n->file);

	int status = -1;
	int is_first = 1;
	for (; ss_iteratorhas(&it); ss_iteratornext(&it)) {
		sdindexheader *header = ss_iteratorof(&it);

		sibranch *branch;
		if (is_first) {
			branch = &n->self;
		} else {
			branch = si_branchnew(r);
			if (ssunlikely(branch == NULL))
				goto done;
		}

		sdindex index;
		sd_indexinit(&index);
		if (ssunlikely(sd_indexcopy(&index, r, header) == -1))
			goto done;
		si_branchset(branch, &index);

		/* push to the head of the node's branch list */
		branch->next = n->branch;
		n->branch = branch;
		n->branch_count++;
		is_first = 0;
	}
	if (ssunlikely(sd_recover_complete(&it) == -1))
		goto done;
	status = 0;
done:
	ss_iteratorclose(&it);
	return status;
}
/*
 * Build the search key (o->v) of a document, once.
 *
 * - Returns 0 immediately if the key was already created.
 * - With a prefix set, the first key of the scheme must be SS_STRING;
 *   a copy of the prefix is stored in o->prefix_copy.
 * - If not all key fields were supplied, the remaining ones are filled
 *   by sf_limitapply() according to o->order. When only a prefix was
 *   given, field 0 is pointed at o->prefix (the caller's buffer, not the
 *   copy) first.
 * - The built value gets the SVGET flag.
 *
 * Returns 0 on success, or the result of sr_error()/sr_oom() on failure.
 */
int
se_document_createkey(sedocument *o)
{
	sedb *db = (sedb*)o->o.parent;
	se *e = se_of(&db->o);

	if (o->created)
		return 0;
	assert(o->v == NULL);

	/* keep a private copy of the prefix */
	if (o->prefix) {
		if (db->scheme->scheme.keys[0]->type != SS_STRING)
			return sr_error(&e->error, "%s", "prefix search is only "
			                "supported for a string key");
		void *dup = ss_malloc(&e->a, o->prefix_size);
		if (ssunlikely(dup == NULL))
			return sr_oom(&e->error);
		memcpy(dup, o->prefix, o->prefix_size);
		o->prefix_copy = dup;
	}

	/* complete the key fields the caller left unspecified */
	int keys_complete =
		(o->fields_count_keys == db->scheme->scheme.keys_count);
	if (ssunlikely(! keys_complete)) {
		int prefix_only = o->prefix && o->fields_count_keys == 0;
		if (prefix_only) {
			memset(o->fields, 0, sizeof(o->fields));
			o->fields[0].pointer = o->prefix;
			o->fields[0].size = o->prefix_size;
		}
		sf_limitapply(&db->limit, &db->scheme->scheme,
		              o->fields, o->order);
		o->fields_count = db->scheme->scheme.fields_count;
		o->fields_count_keys = db->scheme->scheme.keys_count;
	}

	o->v = sv_vbuild(db->r, o->fields);
	if (ssunlikely(o->v == NULL))
		return sr_oom(&e->error);
	sf_flagsset(db->r->scheme, sv_vpointer(o->v), SVGET);

	o->created = 1;
	return 0;
}
/*
 * Release a node: optionally unlink its file (gc != 0 and a file name is
 * set), free its branches, close it and free the node memory.
 *
 * All steps run even if an earlier one fails; the function returns -1 if
 * the unlink or the close failed, 0 otherwise.
 */
int
si_nodefree(sinode *n, sr *r, int gc)
{
	int result = 0;

	if (gc && n->file.file) {
		if (ssunlikely(ss_fileunlink(n->file.file) == -1)) {
			sr_malfunction(r->e, "db file '%s' unlink error: %s",
			               n->file.file, strerror(errno));
			result = -1;
		}
	}

	si_nodefree_branches(n, r);
	if (ssunlikely(si_nodeclose(n, r) == -1))
		result = -1;

	ss_free(r->a, n);
	return result;
}
/*
 * Sync the node file via ss_filesync().
 *
 * Returns 0 on success; on failure reports a malfunction and returns -1.
 */
int
si_nodesync(sinode *n, sr *r)
{
	if (sslikely_sync_ok(n))
		return 0;
	sr_malfunction(r->e, "db file '%s' sync error: %s",
	               n->file.file, strerror(errno));
	return -1;
}
/*
 * Map the node file into n->map using ss_mmap() over the file's fd and
 * size.
 *
 * Returns 0 on success; on failure reports a malfunction and returns -1.
 */
int
si_nodemap(sinode *n, sr *r)
{
	int status = ss_mmap(&n->map, n->file.fd, n->file.size, 1);
	if (ssunlikely(status == -1)) {
		sr_malfunction(r->e, "db file '%s' mmap error: %s",
		               n->file.file, strerror(errno));
		return -1;
	}
	return 0;
}
/*
 * Prepare the backup directory and recover the backup sequence number.
 *
 * Does nothing when no backup path is configured. Otherwise the directory
 * is created if it does not exist and then scanned: every entry for which
 * sy_process() returns 0 or 1 contributes its id, and the largest id seen
 * is stored in r->seq->bsn. Entries whose name starts with '.' and
 * entries rejected by sy_process() are ignored.
 *
 * Returns 0 on success, -1 (error set) if the directory cannot be created
 * or opened.
 */
static inline int
sy_recoverbackup(sy *i, sr *r)
{
	if (i->conf->path_backup == NULL)
		return 0;

	if (! ss_vfsexists(r->vfs, i->conf->path_backup)) {
		int created = ss_vfsmkdir(r->vfs, i->conf->path_backup, 0755);
		if (ssunlikely(created == -1)) {
			sr_error(r->e, "backup directory '%s' create error: %s",
			         i->conf->path_backup, strerror(errno));
			return -1;
		}
	}

	/* recover backup sequential number */
	DIR *backups = opendir(i->conf->path_backup);
	if (ssunlikely(backups == NULL)) {
		sr_error(r->e, "backup directory '%s' open error: %s",
		         i->conf->path_backup, strerror(errno));
		return -1;
	}

	uint32_t latest = 0;
	struct dirent *entry;
	while ((entry = readdir(backups)) != NULL) {
		if (ssunlikely(entry->d_name[0] == '.'))
			continue;
		uint32_t id = 0;
		int kind = sy_process(entry->d_name, &id);
		/* anything other than 0 or 1 is an unknown file: skip it */
		int recognized = (kind == 0 || kind == 1);
		if (recognized && id > latest)
			latest = id;
	}
	closedir(backups);

	r->seq->bsn = latest;
	return 0;
}
/*
 * Start a scheme image: reserve room for one sdschemeheader in c->buf,
 * zero it and advance the buffer past it.
 *
 * Returns 0 on success, or the result of sr_oom() if the buffer cannot
 * be grown.
 */
int
sd_schemebegin(sdscheme *c, sr *r)
{
	if (ssunlikely(ss_bufensure(&c->buf, r->a, sizeof(sdschemeheader)) == -1))
		return sr_oom(r->e);

	sdschemeheader *header = (sdschemeheader*)c->buf.s;
	memset(header, 0, sizeof(sdschemeheader));
	ss_bufadvance(&c->buf, sizeof(sdschemeheader));
	return 0;
}
/*
 * Finalize an index: append everything that follows the page entries
 * already present in i->i and finish with the index header.
 *
 * Data is appended to i->i in this order:
 *   1. the min/max pairs accumulated in i->v;
 *   2. if qf is given, an sdindexamqf descriptor followed by the filter
 *      table (qf->qf_table, qf->qf_table_size bytes);
 *   3. if align is non-zero, zero padding of size_align bytes;
 *   4. the sdindexheader taken from i->build, with offset, id, extension
 *      size/flags and crc filled in.
 *
 * i->v is freed once its content has been copied. On return i->h is set
 * from sd_indexheader(i).
 *
 * Returns 0 on success, or the result of sr_oom() if i->i cannot be grown.
 */
int sd_indexcommit(sdindex *i, sr *r, sdid *id, ssqf *qf, uint32_t align, uint64_t offset)
{
	int size = ss_bufused(&i->v);
	int size_extension = 0;
	int extensions = 0;
	if (qf) {
		extensions = SD_INDEXEXT_AMQF;
		size_extension += sizeof(sdindexamqf);
		size_extension += qf->qf_table_size;
	}
	/* calculate index align for direct_io */
	/* note: when the unpadded end is already a multiple of align the
	 * remainder is 0, so a full `align` bytes of padding are added */
	int size_meta = size + size_extension + sizeof(sdindexheader);
	int size_align = 0;
	if (align) {
		size_align += align - ((offset + size_meta + ss_bufused(&i->i)) % align);
		size_meta += size_align;
	}
	/* one reservation covers everything appended below */
	int rc = ss_bufensure(&i->i, r->a, size_meta);
	if (ssunlikely(rc == -1))
		return sr_oom(r->e);
	/* min/max pairs */
	memcpy(i->i.p, i->v.s, size);
	ss_bufadvance(&i->i, size);
	/* extension */
	if (qf) {
		sdindexamqf *qh = (sdindexamqf*)(i->i.p);
		qh->q = qf->qf_qbits;
		qh->r = qf->qf_rbits;
		qh->entries = qf->qf_entries;
		qh->size = qf->qf_table_size;
		ss_bufadvance(&i->i, sizeof(sdindexamqf));
		memcpy(i->i.p, qf->qf_table, qf->qf_table_size);
		ss_bufadvance(&i->i, qf->qf_table_size);
	}
	ss_buffree(&i->v, r->a);
	sdindexheader *h = &i->build;
	/* align */
	/* h->align is only written when padding is actually emitted */
	if (size_align) {
		h->align = size_align;
		memset(i->i.p, 0, size_align);
		ss_bufadvance(&i->i, size_align);
	}
	/* header */
	h->offset = offset;
	h->id = *id;
	h->extension = size_extension;
	h->extensions = extensions;
	/* crc is computed over the header after the fields above are set */
	h->crc = ss_crcs(r->crc, h, sizeof(sdindexheader), 0);
	memcpy(i->i.p, &i->build, sizeof(sdindexheader));
	ss_bufadvance(&i->i, sizeof(sdindexheader));
	i->h = sd_indexheader(i);
	return 0;
}
static inline int se_backupstart(sescheduler *s) { se *e = (se*)s->env; /* * a. create backup_path/<bsn.incomplete> directory * b. create database directories * c. create log directory */ char path[1024]; snprintf(path, sizeof(path), "%s/%" PRIu32 ".incomplete", e->conf.backup_path, s->backup_bsn); int rc = ss_vfsmkdir(&e->vfs, path, 0755); if (ssunlikely(rc == -1)) { sr_error(&e->error, "backup directory '%s' create error: %s", path, strerror(errno)); return -1; } int i = 0; while (i < s->count) { sedb *db = s->i[i]; snprintf(path, sizeof(path), "%s/%" PRIu32 ".incomplete/%s", e->conf.backup_path, s->backup_bsn, db->scheme.name); rc = ss_vfsmkdir(&e->vfs, path, 0755); if (ssunlikely(rc == -1)) { sr_error(&e->error, "backup directory '%s' create error: %s", path, strerror(errno)); return -1; } i++; } snprintf(path, sizeof(path), "%s/%" PRIu32 ".incomplete/log", e->conf.backup_path, s->backup_bsn); rc = ss_vfsmkdir(&e->vfs, path, 0755); if (ssunlikely(rc == -1)) { sr_error(&e->error, "backup directory '%s' create error: %s", path, strerror(errno)); return -1; } return 0; }
/*
 * Parse the decimal id at the start of a directory entry name.
 *
 * Digits are consumed up to the first '.' or the end of the string.
 * A name with no digits before the '.' (or an empty name) yields 0.
 *
 * Returns the id, or -1 if a non-digit character is found before the
 * '.' or if the number does not fit the return type.
 */
static inline ssize_t
sw_diridof(char *s)
{
	/* SIZE_MAX / 2: the largest value representable when ssize_t is the
	 * signed counterpart of size_t */
	const size_t limit = (size_t)-1 / 2;
	size_t v = 0;
	while (*s && *s != '.') {
		/* <ctype.h> functions require an unsigned char value; a
		 * negative plain char is undefined behavior */
		if (!isdigit((unsigned char)*s))
			return -1;
		size_t digit = (size_t)(*s - '0');
		/* refuse ids that would wrap instead of returning garbage */
		if (v > (limit - digit) / 10)
			return -1;
		v = v * 10 + digit;
		s++;
	}
	return (ssize_t)v;
}
/*
 * Open the repository: remember the configuration, recover the backup
 * state and, if conf->path does not exist yet, deploy it.
 *
 * Returns 0 on success, -1 if backup recovery fails, otherwise the
 * result of sy_deploy().
 */
int
sy_open(sy *e, sr *r, syconf *conf)
{
	e->conf = conf;
	if (ssunlikely(sy_recoverbackup(e, r) == -1))
		return -1;

	/* an existing path needs nothing more */
	int present = ss_vfsexists(r->vfs, conf->path);
	if (present != 0)
		return 0;
	return sy_deploy(e, r);
}
/*
 * Rename the node file to its ".db" name, built by ss_pathA() from the
 * scheme path and the id of the node's `self` branch.
 *
 * Returns the result of ss_filerename(); a malfunction is reported when
 * that result is -1.
 */
int
si_nodecomplete(sinode *n, sr *r, sischeme *scheme)
{
	sspath target;
	ss_pathA(&target, scheme->path, n->self.id.id, ".db");

	int status = ss_filerename(&n->file, target.path);
	if (ssunlikely(status == -1)) {
		sr_malfunction(r->e, "db file '%s' rename error: %s",
		               n->file.file, strerror(errno));
	}
	return status;
}
/*
 * Close a node: unmap it, close its file and free both in-memory
 * indexes (i0, i1).
 *
 * Every step is attempted regardless of earlier failures. Returns -1 if
 * the unmap or the close failed (a malfunction is reported for each),
 * 0 otherwise.
 */
static inline int
si_nodeclose(sinode *n, sr *r)
{
	int result = 0;

	if (ssunlikely(ss_munmap(&n->map) == -1)) {
		sr_malfunction(r->e, "db file '%s' munmap error: %s",
		               n->file.file, strerror(errno));
		result = -1;
	}
	if (ssunlikely(ss_fileclose(&n->file) == -1)) {
		sr_malfunction(r->e, "db file '%s' close error: %s",
		               n->file.file, strerror(errno));
		result = -1;
	}

	sv_indexfree(&n->i0, r);
	sv_indexfree(&n->i1, r);
	return result;
}
/*
 * Copy an index image into i->i.
 *
 * sd_indexsize_ext(h) bytes are copied starting at the header `h`
 * itself; afterwards i->h is set from sd_indexheader(i).
 *
 * Returns 0 on success, or the result of sr_oom() if the buffer cannot
 * be grown.
 */
int
sd_indexcopy(sdindex *i, sr *r, sdindexheader *h)
{
	int total = sd_indexsize_ext(h);
	if (ssunlikely(ss_bufensure(&i->i, r->a, total) == -1))
		return sr_oom(r->e);

	memcpy(i->i.s, (char*)h, total);
	ss_bufadvance(&i->i, total);
	i->h = sd_indexheader(i);
	return 0;
}
/*
 * Run one scheduler pass on the calling thread.
 *
 * A worker is taken from the scheduler's worker pool, passed to
 * se_scheduler() and returned to the pool afterwards.
 *
 * Returns the result of se_scheduler(), or -1 if no worker could be
 * obtained.
 */
int
se_scheduler_call(void *arg)
{
	se *env = arg;
	sescheduler *sched = &env->sched;

	seworker *worker = se_workerpool_pop(&env->sched.workers, &env->r);
	if (ssunlikely(worker == NULL))
		return -1;

	int status = se_scheduler(sched, worker);
	se_workerpool_push(&env->sched.workers, worker);
	return status;
}
/*
 * Consume one pending backup event, if any.
 *
 * Under s->lock: when backup_events is positive it is decremented.
 * Returns 1 if an event was consumed, 0 otherwise.
 */
int
sc_ctl_backup_event(sc *s)
{
	int consumed = 0;

	ss_mutexlock(&s->lock);
	if (ssunlikely(s->backup_events > 0)) {
		consumed = 1;
		s->backup_events--;
	}
	ss_mutexunlock(&s->lock);

	return consumed;
}
/*
 * Initialize the three planner queues (compact, branch, temp) and store
 * the opaque pointer `i` in the planner.
 *
 * If a later queue fails to initialize, the queues initialized before it
 * are freed again. Returns 0 on success, -1 on failure.
 */
int
si_plannerinit(siplanner *p, ssa *a, void *i)
{
	if (ssunlikely(ss_rqinit(&p->compact, a, 1, 20) == -1))
		return -1;

	/* 1Mb step up to 4Gb */
	if (ssunlikely(ss_rqinit(&p->branch, a, 1024 * 1024, 4000) == -1)) {
		ss_rqfree(&p->compact, a);
		return -1;
	}

	if (ssunlikely(ss_rqinit(&p->temp, a, 1, 100) == -1)) {
		ss_rqfree(&p->compact, a);
		ss_rqfree(&p->branch, a);
		return -1;
	}

	p->i = i;
	return 0;
}
/*
 * Perform one scheduler step for a worker.
 *
 * Order of work: schedule a task, rotate the log if requested, complete
 * a backup if requested, execute the scheduled job (only when
 * sc_schedule() returned 1), end the task, then run gc if requested.
 *
 * Failure handling:
 *  - a rotation or gc failure aborts the step;
 *  - a backup completion failure stops the backup and the step goes on;
 *  - an execution failure stops the backup when the plan is SI_BACKUP or
 *    SI_BACKUPEND; for any other plan the status is set to
 *    SR_MALFUNCTION and the step is aborted.
 *
 * Returns the result of sc_schedule(), or -1 when the step is aborted.
 */
int
sc_step(sc *s, scworker *w, uint64_t vlsn)
{
	sctask task;
	sc_taskbegin(&task, w, vlsn);
	int scheduled = sc_schedule(s, &task);
	int rc;

	/* log rotation */
	if (task.rotate) {
		rc = sc_rotate(s, w);
		if (ssunlikely(rc == -1))
			goto malfunction;
	}

	/* backup completion */
	if (task.backup) {
		rc = sc_backupend(s, w);
		if (ssunlikely(rc == -1))
			sc_backupstop(s);
	}

	if (scheduled == 1) {
		rc = sc_execute(&task, w, vlsn);
		if (ssunlikely(rc == -1)) {
			int backup_plan = task.plan.plan == SI_BACKUP ||
			                  task.plan.plan == SI_BACKUPEND;
			if (! backup_plan) {
				sr_statusset(s->r->status, SR_MALFUNCTION);
				goto malfunction;
			}
			sc_backupstop(s);
		}
	}

	sc_taskend(s, &task);

	if (task.gc) {
		rc = sc_gc(s, w);
		if (ssunlikely(rc == -1))
			goto malfunction;
	}

	ss_trace(&w->trace, "%s", "sleep");
	return scheduled;

malfunction:
	ss_trace(&w->trace, "%s", "malfunction");
	return -1;
}
/*
 * Prepare a transaction for commit.
 *
 * Read-only transactions and transactions without writes are promoted to
 * SX_PREPARE directly. Otherwise each log entry is examined until the
 * entry whose `lo` equals x->log_read:
 *
 *  - an aborted entry rolls the transaction back;
 *  - an entry with no predecessor is handed to the prepare callback;
 *  - a committed predecessor is accepted unless its csn is greater than
 *    the transaction's csn, which rolls the transaction back;
 *  - an uncommitted predecessor flagged SVGET is handed to the prepare
 *    callback (forced commit for read-only conflicts); any other
 *    uncommitted predecessor promotes the transaction to SX_LOCK.
 *
 * A non-zero result from the prepare callback rolls the transaction
 * back. Returns the state produced by sx_promote().
 */
sxstate
sx_prepare(sx *x, sxpreparef prepare, void *arg)
{
	uint64_t lsn = sr_seq(x->manager->seq, SR_LSN);

	/* proceed read-only transactions */
	if (x->type == SX_RO || sv_logcount_write(x->log) == 0)
		return sx_promote(x, SX_PREPARE);

	ssiter it;
	ss_iterinit(ss_bufiter, &it);
	ss_iteropen(ss_bufiter, &it, &x->log->buf, sizeof(svlogv));
	for (; ss_iterhas(ss_bufiter, &it); ss_iternext(ss_bufiter, &it))
	{
		svlogv *lv = ss_iterof(ss_bufiter, &it);
		sxv *v = lv->ptr;
		if ((int)v->lo == x->log_read)
			break;
		if (sx_vaborted(v))
			return sx_promote(x, SX_ROLLBACK);

		int run_callback;
		if (sslikely(v->prev == NULL)) {
			run_callback = 1;
		} else if (sx_vcommitted(v->prev)) {
			if (v->prev->csn > x->csn)
				return sx_promote(x, SX_ROLLBACK);
			run_callback = 0;
		} else {
			/* force commit for read-only conflicts */
			sxindex *index = v->prev->index;
			if (! (sv_vflags(v->prev->v, index->r) & SVGET))
				return sx_promote(x, SX_LOCK);
			run_callback = 1;
		}

		if (run_callback) {
			sxstate rc = sx_preparecb(x, lv, lsn, prepare, arg);
			if (ssunlikely(rc != 0))
				return sx_promote(x, SX_ROLLBACK);
		}
	}
	return sx_promote(x, SX_PREPARE);
}
/*
 * Thread entry point of a scheduler worker.
 *
 * Takes a worker from the pool and keeps calling se_scheduler() while
 * the environment is active. The loop ends when se_scheduler() returns
 * -1; a result of 0 makes the thread sleep before the next pass. The
 * worker is returned to the pool before the thread exits.
 *
 * Always returns NULL.
 */
static void *
se_worker(void *arg)
{
	ssthread *self = arg;
	se *e = self->arg;

	seworker *worker = se_workerpool_pop(&e->sched.workers, &e->r);
	if (ssunlikely(worker == NULL))
		return NULL;

	while (se_active(e) != 0) {
		int rc = se_scheduler(&e->sched, worker);
		if (ssunlikely(rc == -1))
			break;
		if (ssunlikely(rc == 0))
			ss_sleep(10000000); /* 10ms */
	}

	se_workerpool_push(&e->sched.workers, worker);
	return NULL;
}
/*
 * Create the repository directory named by e->conf->path.
 *
 * Returns 0 on success; on failure sets an error and returns -1.
 */
static int
sy_deploy(sy *e, sr *r)
{
	int created = ss_vfsmkdir(r->vfs, e->conf->path, 0755);
	if (sslikely(created != -1))
		return 0;

	sr_error(r->e, "directory '%s' create error: %s",
	         e->conf->path, strerror(errno));
	return -1;
}
/*
 * Create the file for a new node. The name is built by ss_pathAB() from
 * the scheme path, the parent id and the node id, with the suffix
 * ".db.incomplete".
 *
 * Returns 0 on success; on failure reports a malfunction and returns -1.
 */
int
si_nodecreate(sinode *n, sr *r, sischeme *scheme, sdid *id)
{
	sspath target;
	ss_pathAB(&target, scheme->path, id->parent, id->id, ".db.incomplete");

	if (ssunlikely(ss_filenew(&n->file, target.path) == -1)) {
		sr_malfunction(r->e, "db file '%s' create error: %s",
		               target.path, strerror(errno));
		return -1;
	}
	return 0;
}
/*
 * Release a node.
 *
 * When gc is set and the node file has a path, ss_fileadvise() is called
 * over the whole file and the file is unlinked. The branches are then
 * freed, the node is closed (gc is forwarded to si_nodeclose()) and the
 * node memory is freed.
 *
 * All steps run even if an earlier one fails; the function returns -1 if
 * the unlink or the close failed, 0 otherwise.
 */
int
si_nodefree(sinode *n, sr *r, int gc)
{
	int result = 0;

	if (gc && ss_pathis_set(&n->file.path)) {
		ss_fileadvise(&n->file, 0, 0, n->file.size);
		int unlinked = ss_vfsunlink(r->vfs, ss_pathof(&n->file.path));
		if (ssunlikely(unlinked == -1)) {
			sr_malfunction(r->e, "db file '%s' unlink error: %s",
			               ss_pathof(&n->file.path),
			               strerror(errno));
			result = -1;
		}
	}

	si_nodefree_branches(n, r);
	if (ssunlikely(si_nodeclose(n, r, gc) == -1))
		result = -1;

	ss_free(r->a, n);
	return result;
}
/*
 * Rotate the log if sw_managerrotate_ready() says rotation is due.
 *
 * Returns 0 when nothing had to be done or the rotation succeeded,
 * -1 when sw_managerrotate() fails.
 */
static inline int
sc_rotate(sc *s, scworker *w)
{
	ss_trace(&w->trace, "%s", "log rotation");

	if (! sw_managerrotate_ready(s->wm))
		return 0;
	if (ssunlikely(sw_managerrotate(s->wm) == -1))
		return -1;
	return 0;
}
/*
 * Copy an index image into i->i.
 *
 * The image is taken to begin h->align + h->size + h->extension bytes
 * before the header `h`; sd_indexsize_ext(h) bytes are copied from that
 * position. Afterwards i->h is set from sd_indexheader(i).
 *
 * Returns 0 on success, or the result of sr_oom() if the buffer cannot
 * be grown.
 */
int
sd_indexcopy(sdindex *i, sr *r, sdindexheader *h)
{
	int total = sd_indexsize_ext(h);
	if (ssunlikely(ss_bufensure(&i->i, r->a, total) == -1))
		return sr_oom(r->e);

	/* step back from the header to the first byte of the image */
	char *image = (char*)h - (h->align + h->size + h->extension);
	memcpy(i->i.s, image, total);
	ss_bufadvance(&i->i, total);

	i->h = sd_indexheader(i);
	return 0;
}
/*
 * Write a document within a transaction.
 *
 * Steps: check that the environment is active, check the memory quota,
 * validate and create the document, take a reference on its value, then
 * destroy the document object and pass the value to sx_set() on the
 * database's concurrent index.
 *
 * The document object is destroyed on every path, success or failure.
 * Returns 0 on success, -1 on failure.
 */
static inline int
se_txwrite(setx *t, sedocument *o, uint8_t flags)
{
	se *e = se_of(&t->o);
	sedb *db = se_cast(o->o.parent, sedb*, SEDB);
	int rc;

	/* validate database status */
	if (ssunlikely(! se_active(e)))
		goto fail;

	/* ensure memory quota */
	rc = sr_quota(&db->quota, &db->stat);
	if (ssunlikely(rc)) {
		sr_error(&e->error, "%s", "memory quota limit reached");
		goto fail;
	}

	/* create document */
	rc = se_document_validate(o, &db->o);
	if (ssunlikely(rc == -1))
		goto fail;
	rc = se_document_create(o, flags);
	if (ssunlikely(rc == -1))
		goto fail;

	/* the reference is taken before the document is destroyed */
	svv *value = o->v;
	value->log = o->log;
	sv_vref(value);
	so_destroy(&o->o);

	/* concurrent index only */
	rc = sx_set(&t->t, &db->coindex, value);
	return ssunlikely(rc == -1) ? -1 : 0;

fail:
	so_destroy(&o->o);
	return -1;
}
/*
 * Append one object to the page being built.
 *
 * An sdv metadata record is reserved in b->m and filled with the object
 * flags (combined with `flags`) and the object's offset within the
 * current page's value area. The object itself is then copied by the
 * helper that matches r->fmt_storage, and the page header statistics
 * (count, lsn range, duplicate count and duplicate lsn minimum) and
 * b->vmax are updated.
 *
 * Returns 0 on success, the result of sr_oom() if the metadata record
 * cannot be reserved, or -1 if the copy helper fails.
 */
int sd_buildadd(sdbuild *b, sr *r, sv *v, uint32_t flags)
{
	/* prepare object metadata */
	int rc = ss_bufensure(&b->m, r->a, sizeof(sdv));
	if (ssunlikely(rc == -1))
		return sr_oom(r->e);
	sdpageheader *h = sd_buildheader(b);
	/* note: this local is named `sv`, hiding the `sv` type name used by
	 * the parameter `v` for the rest of the function */
	sdv *sv = (sdv*)b->m.p;
	sv->flags = sv_flags(v) | flags;
	sv->offset = ss_bufused(&b->v) - sd_buildref(b)->v;
	ss_bufadvance(&b->m, sizeof(sdv));
	/* copy object */
	/* there is no default case: for any other storage format nothing is
	 * copied and rc keeps the 0 from ss_bufensure() above */
	switch (r->fmt_storage) {
	case SF_SKEYVALUE:
		rc = sd_buildadd_keyvalue(b, r, v);
		break;
	case SF_SRAW:
		rc = sd_buildadd_raw(b, r, v);
		break;
	}
	if (ssunlikely(rc == -1))
		return -1;
	/* update page header */
	h->count++;
	/* track the largest per-object size seen so far */
	uint32_t size = sizeof(sdv) + sv_size(v) + sizeof(sfref) * r->scheme->count;
	if (size > b->vmax)
		b->vmax = size;
	uint64_t lsn = sv_lsn(v);
	if (lsn > h->lsnmax)
		h->lsnmax = lsn;
	if (lsn < h->lsnmin)
		h->lsnmin = lsn;
	/* duplicates are counted separately, with their own lsn minimum */
	if (sv->flags & SVDUP) {
		h->countdup++;
		if (lsn < h->lsnmindup)
			h->lsnmindup = lsn;
	}
	return 0;
}