/*
 * Begin a new page in the build context.
 *
 * Stores the crc/compress settings, fills the sdbuildref slot at index
 * b->n with the current fill levels of the k, v and c buffers (sizes
 * start at zero), and initializes the page header obtained from
 * sd_buildheader().
 *
 * Returns 0 on success, or the value of sr_error() when one of the
 * buffers cannot be grown.
 */
int sd_buildbegin(sdbuild *b, sr *r, int crc, int compress)
{
    b->crc = crc;
    b->compress = compress;

    /* page reference */
    if (srunlikely(sr_bufensure(&b->list, r->a, sizeof(sdbuildref)) == -1))
        return sr_error(r->e, "%s", "memory allocation failed");
    sdbuildref *page = (sdbuildref*)sr_bufat(&b->list, sizeof(sdbuildref), b->n);
    page->k = sr_bufused(&b->k);
    page->v = sr_bufused(&b->v);
    page->c = sr_bufused(&b->c);
    page->ksize = 0;
    page->vsize = 0;
    page->csize = 0;

    /* page header */
    if (srunlikely(sr_bufensure(&b->k, r->a, sizeof(sdpageheader)) == -1))
        return sr_error(r->e, "%s", "memory allocation failed");
    sdpageheader *hdr = sd_buildheader(b);
    /* the memset already zeroes tsmin and the reserve area */
    memset(hdr, 0, sizeof(*hdr));
    hdr->lsnmin = UINT64_MAX;
    hdr->lsnmindup = UINT64_MAX;

    sr_bufadvance(&b->list, sizeof(sdbuildref));
    sr_bufadvance(&b->k, sizeof(sdpageheader));
    return 0;
}
/*
 * Walk a list of control nodes and invoke the handler of every leaf.
 *
 * For each node the dotted path is built as "<root>.<name>" (or just
 * "<name>" when root is NULL). Nodes of type SR_CC are descended into
 * recursively with the path as the new root; for any other node
 * stmt->path is set to the path while c->function runs.
 *
 * stmt->path is reset to NULL after every handler call, including a
 * failing one: the path lives in this function's stack frame and must
 * not stay referenced once the function returns.
 *
 * Returns 0 on success, -1 as soon as a handler or a nested walk fails.
 */
static int sr_cserializer(src *c, srcstmt *stmt, char *root, va_list args)
{
    char path[256];
    for (; c; c = c->next) {
        if (root)
            snprintf(path, sizeof(path), "%s.%s", root, c->name);
        else
            snprintf(path, sizeof(path), "%s", c->name);

        int rc;
        int type = c->flags & ~SR_CRO;
        if (type == SR_CC) {
            rc = sr_cserializer(c->value, stmt, path, args);
            if (srunlikely(rc == -1))
                return -1;
            continue;
        }

        stmt->path = path;
        rc = c->function(c, stmt, args);
        /* never leave a pointer to the local buffer behind */
        stmt->path = NULL;
        if (srunlikely(rc == -1))
            return -1;
    }
    return 0;
}
/*
 * Append one value to the page currently being built.
 *
 * An sdv metadata record is written at the write position of b->k, the
 * object bytes are copied to the write position of b->v, and the page
 * header counters, size and lsn bounds are updated.
 *
 * `flags` is OR-ed into the flags taken from the value itself.
 *
 * Returns 0 on success, or the value of sr_error() when a buffer
 * cannot be grown (nothing has been advanced in that case).
 */
int sd_buildadd(sdbuild *b, sr *r, sv *v, uint32_t flags)
{
    /* metadata record */
    if (srunlikely(sr_bufensure(&b->k, r->a, sizeof(sdv)) == -1))
        return sr_error(r->e, "%s", "memory allocation failed");
    sdpageheader *hdr = sd_buildheader(b);
    sdv *meta = (sdv*)b->k.p;
    meta->lsn = sv_lsn(v);
    meta->flags = sv_flags(v) | flags;
    meta->size = sv_size(v);
    /* offset is relative to the start of this page's value area */
    meta->offset = sr_bufused(&b->v) - sd_buildref(b)->v;

    /* object payload */
    if (srunlikely(sr_bufensure(&b->v, r->a, meta->size) == -1))
        return sr_error(r->e, "%s", "memory allocation failed");
    memcpy(b->v.p, sv_pointer(v), meta->size);
    sr_bufadvance(&b->v, meta->size);
    sr_bufadvance(&b->k, sizeof(sdv));

    /* page header bookkeeping */
    hdr->count++;
    hdr->size += meta->size + sizeof(sdv);
    if (meta->lsn > hdr->lsnmax)
        hdr->lsnmax = meta->lsn;
    if (meta->lsn < hdr->lsnmin)
        hdr->lsnmin = meta->lsn;
    if (meta->flags & SVDUP) {
        hdr->countdup++;
        if (meta->lsn < hdr->lsnmindup)
            hdr->lsnmindup = meta->lsn;
    }
    return 0;
}
/*
 * Finish recovery: move the tracked nodes into the primary index.
 *
 * First every node of the track tree is collected (as a pointer) into
 * `buf`; then the collected nodes are visited in order. Nodes flagged
 * SI_RDB_REMOVE are released through si_nodefree(n, r, 1); all others
 * get their recover state set to SI_RDB, are inserted into the index
 * and are handed to the planner.
 *
 * Returns 0 on success, -1 on allocation failure or when freeing a
 * node fails.
 */
static inline int si_recovercomplete(sitrack *track, sr *r, si *index, srbuf *buf)
{
    /* collect node pointers */
    sr_bufreset(buf);
    srrbnode *pos;
    for (pos = sr_rbmin(&track->i); pos; pos = sr_rbnext(&track->i, pos)) {
        sinode *n = srcast(pos, sinode, node);
        if (srunlikely(sr_bufadd(buf, r->a, &n, sizeof(sinode**)) == -1))
            return sr_malfunction(r->e, "%s", "memory allocation failed");
    }

    /* build the primary index out of the collected nodes */
    sriter it;
    sr_iterinit(sr_bufiterref, &it, r);
    sr_iteropen(sr_bufiterref, &it, buf, sizeof(sinode*));
    while (sr_iterhas(sr_bufiterref, &it)) {
        sinode *n = sr_iterof(sr_bufiterref, &it);
        if (n->recover & SI_RDB_REMOVE) {
            if (srunlikely(si_nodefree(n, r, 1) == -1))
                return -1;
        } else {
            n->recover = SI_RDB;
            si_insert(index, r, n);
            si_plannerupdate(&index->p, SI_COMPACT|SI_BRANCH, n);
        }
        sr_iternext(sr_bufiterref, &it);
    }
    return 0;
}
/*
 * Classify a database file name and extract the ids it carries.
 *
 * Recognized forms:
 *   id.db                  -> SI_RDB,        *parent = *nsn = id
 *   id.id.db.incomplete    -> SI_RDB_DBI,    *parent = first, *nsn = second
 *   id.id.db.seal          -> SI_RDB_DBSEAL, *parent = first, *nsn = second
 *
 * Returns -1 for anything else. The output arguments may already have
 * been written when -1 is returned after the first id was parsed.
 */
static inline int si_process(char *name, uint32_t *nsn, uint32_t *parent)
{
    char *cursor = name;

    ssize_t value = si_processid(&cursor);
    if (srunlikely(value == -1))
        return -1;
    *parent = value;
    *nsn = value;
    if (! strcmp(cursor, ".db"))
        return SI_RDB;

    /* a second id must follow, separated by a dot */
    if (srunlikely(*cursor != '.'))
        return -1;
    cursor++;
    value = si_processid(&cursor);
    if (srunlikely(value == -1))
        return -1;
    *nsn = value;

    if (! strcmp(cursor, ".db.incomplete"))
        return SI_RDB_DBI;
    if (! strcmp(cursor, ".db.seal"))
        return SI_RDB_DBSEAL;
    return -1;
}
int so_scheduler_backup(void *arg) { so *e = arg; soscheduler *s = &e->sched; if (srunlikely(e->ctl.backup_path == NULL)) { sr_error(&e->error, "%s", "backup is not enabled"); return -1; } /* begin backup procedure * state 0 * * disable log garbage-collection */ sl_poolgc_enable(&e->lp, 0); sr_mutexlock(&s->lock); if (srunlikely(s->backup > 0)) { sr_mutexunlock(&s->lock); sl_poolgc_enable(&e->lp, 1); /* in progress */ return 0; } uint64_t bsn = sr_seq(&e->seq, SR_BSNNEXT); s->backup = 1; s->backup_bsn = bsn; sr_mutexunlock(&s->lock); return 0; }
/*
 * Append the current page of the build context to `buf`.
 *
 * When the compression buffer holds data, the page's compressed image
 * (ref->csize bytes starting at offset ref->c of b->c) is copied.
 * Otherwise the key area and the value area of the page are copied
 * back to back.
 *
 * The compressed source is addressed through ref->c, in the same way
 * the uncompressed path uses ref->k / ref->v, so the right bytes are
 * taken even when the page does not start at offset 0 of b->c.
 *
 * Returns 0 on success, -1 when `buf` cannot be grown.
 */
int sd_commitpage(sdbuild *b, sr *r, srbuf *buf)
{
    sdbuildref *ref = sd_buildref(b);
    int rc;

    /* compressed */
    if (sr_bufused(&b->c) > 0) {
        rc = sr_bufensure(buf, r->a, ref->csize);
        if (srunlikely(rc == -1))
            return -1;
        memcpy(buf->p, b->c.s + ref->c, ref->csize);
        sr_bufadvance(buf, ref->csize);
        return 0;
    }

    /* not compressed */
    assert(ref->ksize != 0);
    rc = sr_bufensure(buf, r->a, ref->ksize + ref->vsize);
    if (srunlikely(rc == -1))
        return -1;
    memcpy(buf->p, b->k.s + ref->k, ref->ksize);
    sr_bufadvance(buf, ref->ksize);
    memcpy(buf->p, b->v.s + ref->v, ref->vsize);
    sr_bufadvance(buf, ref->vsize);
    return 0;
}
/*
 * Final stage of a backup:
 *
 *  a. rotate log file
 *  b. copy log files into <backup_path>/<bsn>.incomplete/log
 *  c. enable log gc
 *  d. rename <bsn>.incomplete into <bsn>
 *  e. record the backup as the last completed one and clear the
 *     in-progress state
 *
 * Returns 0 on success, -1 when rotation, copying or the rename fails.
 */
static inline int so_backupcomplete(soscheduler *s, soworker *w)
{
    so *e = s->env;
    char path[1024];
    char newpath[1024];

    /* force log rotation */
    sr_trace(&w->trace, "%s", "log rotation for backup");
    if (srunlikely(sl_poolrotate(&e->lp) == -1))
        return -1;

    /* copy log files */
    sr_trace(&w->trace, "%s", "log files backup");
    snprintf(path, sizeof(path), "%s/%" PRIu32 ".incomplete/log",
             e->ctl.backup_path, s->backup_bsn);
    if (srunlikely(sl_poolcopy(&e->lp, path, &w->dc.c) == -1)) {
        sr_errorrecover(&e->error);
        return -1;
    }

    /* enable log gc */
    sl_poolgc_enable(&e->lp, 1);

    /* complete backup */
    snprintf(path, sizeof(path), "%s/%" PRIu32 ".incomplete",
             e->ctl.backup_path, s->backup_bsn);
    snprintf(newpath, sizeof(newpath), "%s/%" PRIu32,
             e->ctl.backup_path, s->backup_bsn);
    if (srunlikely(rename(path, newpath) == -1)) {
        sr_error(&e->error, "backup directory '%s' rename error: %s",
                 path, strerror(errno));
        return -1;
    }

    /* complete */
    s->backup_last = s->backup_bsn;
    s->backup_last_complete = 1;
    s->backup = 0;
    s->backup_bsn = 0;
    return 0;
}
/*
 * Validate the index record at `next` and make it the iterator's
 * current element.
 *
 * Returns
 *    1  the record is valid; ri->actual and ri->v point to it
 *    0  `next` is NULL, or the end of the mapped file was reached
 *       (ri->v is set to NULL in the latter case)
 *   -1  corruption or an unsupported version was detected
 *
 * Checks, in order: a complete header fits before the end of the map,
 * header crc, version, the whole record (header + size + total +
 * extension + seal) fits in the map, seal validity. The header bounds
 * check comes first so that a truncated tail is never read past the
 * end of the mapping.
 */
static int sd_recovernext_of(sriter *i, sdindexheader *next)
{
    sdrecover *ri = (sdrecover*)i->priv;
    if (next == NULL)
        return 0;
    char *eof   = (char*)ri->map.p + ri->map.size;
    char *start = (char*)next;
    /* eof */
    if (srunlikely(start == eof)) {
        ri->v = NULL;
        return 0;
    }
    /* the header itself must lie inside the map before it is read */
    if (srunlikely(start > eof ||
                   (size_t)(eof - start) < sizeof(sdindexheader))) {
        sr_malfunction(i->r->e, "corrupted db file '%s': bad record size",
                       ri->file->file);
        ri->corrupt = 1;
        ri->v = NULL;
        return -1;
    }
    /* validate crc */
    uint32_t crc = sr_crcs(i->r->crc, next, sizeof(sdindexheader), 0);
    if (next->crc != crc) {
        sr_malfunction(i->r->e, "corrupted db file '%s': bad index crc",
                       ri->file->file);
        ri->corrupt = 1;
        ri->v = NULL;
        return -1;
    }
    /* check version */
    if (! sr_versioncheck(&next->version))
        return sr_malfunction(i->r->e, "bad db file '%s' version",
                              ri->file->file);
    /* the complete record, seal included, must fit as well */
    char *end = start + sizeof(sdindexheader) +
                next->size +
                next->total +
                next->extension + sizeof(sdseal);
    if (srunlikely(end > eof)) {
        sr_malfunction(i->r->e, "corrupted db file '%s': bad record size",
                       ri->file->file);
        ri->corrupt = 1;
        ri->v = NULL;
        return -1;
    }
    /* check seal */
    sdseal *s = (sdseal*)(end - sizeof(sdseal));
    int rc = sd_sealvalidate(s, i->r, next);
    if (srunlikely(rc == -1)) {
        sr_malfunction(i->r->e, "corrupted db file '%s': bad seal",
                       ri->file->file);
        ri->corrupt = 1;
        ri->v = NULL;
        return -1;
    }
    ri->actual = next;
    ri->v = next;
    return 1;
}
/*
 * Execute a control statement against the tree rooted at `start`.
 *
 * SR_CSERIALIZE statements are handed to sr_cserializer(). For every
 * other operation stmt->path is split on '.' and resolved level by
 * level: a leaf node (SR_CU32, SR_CU64, SR_CSZREF, SR_CSZ, SR_CVOID)
 * must be the last path component and its handler is invoked; an
 * SR_CC node is either descended into (more components follow) or,
 * when it is the last component, its own handler is invoked if it has
 * one.
 *
 * Returns the handler's result, or -1 with a "bad ctl path" error when
 * the path cannot be resolved. An empty path returns -1 without
 * setting an error.
 */
int sr_cexecv(src *start, srcstmt *stmt, va_list args)
{
    if (stmt->op == SR_CSERIALIZE)
        return sr_cserializer(start, stmt, NULL, args);

    /* tokenize a private copy, stmt->path stays intact */
    char path[256];
    snprintf(path, sizeof(path), "%s", stmt->path);
    char *save = NULL;
    char *token = strtok_r(path, ".", &save);
    if (srunlikely(token == NULL))
        return -1;

    src *c = start;
    while (c) {
        /* look for the current component among the siblings */
        if (strcmp(token, c->name) != 0) {
            c = c->next;
            continue;
        }
        int type = c->flags & ~SR_CRO;
        switch (type) {
        case SR_CC:
            token = strtok_r(NULL, ".", &save);
            if (srunlikely(token == NULL)) {
                if (c->function)
                    return c->function(c, stmt, args);
                /* not supported */
                goto error;
            }
            /* descend one level */
            c = (src*)c->value;
            continue;
        case SR_CU32:
        case SR_CU64:
        case SR_CSZREF:
        case SR_CSZ:
        case SR_CVOID:
            /* a leaf must terminate the path */
            token = strtok_r(NULL, ".", &save);
            if (srunlikely(token != NULL))
                goto error;
            return c->function(c, stmt, args);
        }
        assert(0);
    }

error:
    sr_error(stmt->r->e, "bad ctl path: %s", stmt->path);
    return -1;
}
/*
 * Run one scheduling step for a worker.
 *
 * so_schedule() fills `task`; depending on its flags the step rotates
 * the log, dispatches requests, fires the checkpoint/backup completion
 * triggers, executes the planned job and runs log gc, then reports the
 * task as complete.
 *
 * A failing job marks the task's database as malfunctioning (when one
 * is attached) and aborts the step, except for SI_BACKUP plans: there
 * so_backuperror() is called under the scheduler lock and the step
 * carries on.
 *
 * Returns the value of so_schedule() on success, -1 on malfunction.
 */
int so_scheduler(soscheduler *s, soworker *w)
{
    sotask task;
    int job = so_schedule(s, &task, w);
    so *e = s->env;
    int rc;

    if (task.rotate) {
        rc = so_rotate(s, w);
        if (srunlikely(rc == -1))
            goto error;
    }
    if (task.req) {
        rc = so_dispatch(s, w);
        if (srunlikely(rc == -1))
            goto error;
    }
    if (task.checkpoint_complete)
        sr_triggerrun(&e->ctl.checkpoint_on_complete, &e->o);
    if (task.backup_complete)
        sr_triggerrun(&e->ctl.backup_on_complete, &e->o);

    if (job) {
        rc = so_execute(&task, w);
        if (srunlikely(rc == -1)) {
            if (task.plan.plan != SI_BACKUP) {
                if (task.db)
                    so_dbmalfunction(task.db);
                goto error;
            }
            /* a failed backup does not stop the worker */
            sr_mutexlock(&s->lock);
            so_backuperror(s);
            sr_mutexunlock(&s->lock);
        }
    }
    if (task.gc) {
        rc = so_gc(s, w);
        if (srunlikely(rc == -1))
            goto error;
    }

    so_complete(s, &task);
    sr_trace(&w->trace, "%s", "sleep");
    return job;

error:
    sr_trace(&w->trace, "%s", "malfunction");
    return -1;
}
/*
 * Assign `value` to the control variable described by `c`.
 *
 * Read-only nodes (SR_CRO) are rejected with an error. SR_CU32 and
 * SR_CU64 targets receive sr_atoi(value). For SR_CSZREF the string is
 * duplicated (a NULL value stores NULL) and the previously stored
 * string, if any, is freed. Any other type trips an assert.
 *
 * Returns 0 on success, -1 on a read-only target or allocation
 * failure.
 */
int sr_cset(src *c, srcstmt *stmt, char *value)
{
    int type = c->flags & ~SR_CRO;
    if (c->flags & SR_CRO) {
        sr_error(stmt->r->e, "%s is read-only", stmt->path);
        return -1;
    }
    switch (type) {
    case SR_CU32: {
        uint32_t *dst = (uint32_t*)c->value;
        *dst = sr_atoi(value);
        break;
    }
    case SR_CU64: {
        uint64_t *dst = (uint64_t*)c->value;
        *dst = sr_atoi(value);
        break;
    }
    case SR_CSZREF: {
        char *copy = NULL;
        if (value) {
            copy = sr_strdup(stmt->r->a, value);
            if (srunlikely(copy == NULL)) {
                sr_error(stmt->r->e, "%s", "memory allocation failed");
                return -1;
            }
        }
        /* swap in the new string, release the old one */
        char **slot = (char**)c->value;
        if (*slot)
            sr_free(stmt->r->a, *slot);
        *slot = copy;
        break;
    }
    default:
        assert(0);
    }
    return 0;
}
/*
 * Start a new index: reserve room for an sdindexheader in i->i and
 * initialize the header returned by sd_indexheader().
 *
 * Counters and sizes start at zero, the minimum trackers (lsnmin,
 * dupmin) start at UINT64_MAX, `offset` is recorded as given and the
 * id is initialized with zeros. i->h is reset to NULL.
 *
 * Returns 0 on success, or the value of sr_error() on allocation
 * failure.
 */
int sd_indexbegin(sdindex *i, sr *r, uint64_t offset)
{
    if (srunlikely(sr_bufensure(&i->i, r->a, sizeof(sdindexheader)) == -1))
        return sr_error(r->e, "%s", "memory allocation failed");

    sdindexheader *hdr = sd_indexheader(i);
    sr_version(&hdr->version);
    hdr->crc         = 0;
    hdr->size        = 0;
    hdr->count       = 0;
    hdr->keys        = 0;
    hdr->total       = 0;
    hdr->totalorigin = 0;
    hdr->extension   = 0;
    hdr->lsnmin      = UINT64_MAX;
    hdr->lsnmax      = 0;
    hdr->tsmin       = 0;
    hdr->offset      = offset;
    hdr->dupkeys     = 0;
    hdr->dupmin      = UINT64_MAX;
    memset(hdr->reserve, 0, sizeof(hdr->reserve));
    sd_idinit(&hdr->id, 0, 0, 0);

    i->h = NULL;
    sr_bufadvance(&i->i, sizeof(sdindexheader));
    return 0;
}
/*
 * Open the environment described by `conf`.
 *
 * The backup directory is processed first. Then, if conf->path exists
 * the environment is recovered; if it does not, it is deployed,
 * provided conf->path_create allows creating it.
 *
 * Returns the result of se_recover()/se_deploy(), or -1 when backup
 * recovery fails or the directory is missing and may not be created.
 */
int se_open(se *e, sr *r, seconf *conf)
{
    e->conf = conf;
    if (srunlikely(se_recoverbackup(e, r) == -1))
        return -1;

    if (sr_fileexists(conf->path) != 0)
        return se_recover(e, r);

    /* fresh directory */
    if (srunlikely(! conf->path_create)) {
        sr_error(r->e, "directory '%s' does not exist", conf->path);
        return -1;
    }
    return se_deploy(e, r);
}
/*
 * Finish the page currently being built.
 *
 * Computes the final key/value area sizes, the data crc over the
 * uncompressed key and value areas (0 when b->crc is off), optionally
 * compresses the page, and finally seals the page header: sizeorigin
 * keeps the uncompressed size, size becomes the compressed payload
 * size when compression is on, and the header crc is calculated last.
 * With compression the finished header is also copied to the start of
 * the page's compressed image.
 *
 * Returns 0 on success, -1 when compression fails.
 */
int sd_buildend(sdbuild *b, sr *r)
{
    /* update sizes */
    sdbuildref *page = sd_buildref(b);
    page->ksize = sr_bufused(&b->k) - page->k;
    page->vsize = sr_bufused(&b->v) - page->v;
    page->csize = 0;

    /* calculate data crc (non-compressed) */
    sdpageheader *hdr = sd_buildheader(b);
    if (srlikely(b->crc)) {
        uint32_t sum;
        sum = sr_crcp(r->crc, b->k.s + page->k, page->ksize, 0);
        sum = sr_crcp(r->crc, b->v.s + page->v, page->vsize, sum);
        hdr->crcdata = sum;
    } else {
        hdr->crcdata = 0;
    }

    /* compression */
    if (b->compress) {
        if (srunlikely(sd_buildcompress(b, r) == -1))
            return -1;
        page->csize = sr_bufused(&b->c) - page->c;
    }

    /* update page header */
    hdr->sizeorigin = hdr->size;
    if (b->compress)
        hdr->size = page->csize - sizeof(sdpageheader);
    /* the header crc must be taken after every other field is final */
    hdr->crc = sr_crcs(r->crc, hdr, sizeof(sdpageheader), 0);
    if (b->compress)
        memcpy(b->c.s + page->c, hdr, sizeof(sdpageheader));
    return 0;
}
/*
 * Write a finished build to `file`: index, pages, then the seal.
 *
 * With compression (b->c is not empty) everything is written by a
 * single vectored write. Without it the pages are streamed through
 * sd_commitiov() in batches; the index rides along with the first
 * batch and the seal is appended to the last one.
 *
 * Returns 0 on success, -1 on a write error (reported through
 * sr_malfunction) or when the SR_INJECTION_SD_BUILD_0 error injection
 * fires.
 */
int sd_commit(sdbuild *b, sr *r, sdindex *index, srfile *file)
{
    sdseal seal;
    sd_seal(&seal, r, index->h);

    struct iovec iovv[1024];
    sriov iov;
    sr_iovinit(&iov, iovv, 1024);
    sr_iovadd(&iov, index->i.s, sr_bufused(&index->i));

    SR_INJECTION(r->i, SR_INJECTION_SD_BUILD_0,
                 sr_malfunction(r->e, "%s", "error injection");
                 assert( sr_filewritev(file, &iov) == 0 );
                 return -1);

    int rc;

    /* compression enabled */
    uint32_t compressed = sr_bufused(&b->c);
    if (compressed > 0) {
        sr_iovadd(&iov, b->c.s, compressed);
        sr_iovadd(&iov, &seal, sizeof(seal));
        rc = sr_filewritev(file, &iov);
        if (srunlikely(rc == -1))
            sr_malfunction(r->e, "file '%s' write error: %s",
                           file->file, strerror(errno));
        return rc;
    }

    /* uncompressed */
    sdcommitiov pages;
    sd_commitiov_init(&pages, b, 1022);
    int pending;
    do {
        pending = sd_commitiov(&pages, &iov);
        if (srlikely(! pending)) {
            SR_INJECTION(r->i, SR_INJECTION_SD_BUILD_1,
                         seal.crc++); /* corrupt seal */
            sr_iovadd(&iov, &seal, sizeof(seal));
        }
        rc = sr_filewritev(file, &iov);
        if (srunlikely(rc == -1)) {
            return sr_malfunction(r->e, "file '%s' write error: %s",
                                  file->file, strerror(errno));
        }
        sr_iovreset(&iov);
    } while (pending);
    return 0;
}
/*
 * Release a mapping.
 *
 * Does nothing and returns 0 when nothing is mapped (m->p is NULL).
 * Otherwise m->p is cleared and the result of munmap() is returned.
 */
int sr_mapunmap(srmap *m)
{
    void *addr = m->p;
    if (srunlikely(addr == NULL))
        return 0;
    m->p = NULL;
    return munmap(addr, m->size);
}
/*
 * Drop an index: close it, make sure it is marked as dropped, then
 * remove its directory.
 *
 * The configuration pointer is taken before the index is closed.
 *
 * Returns the result of si_dropof(), or -1 when closing or marking
 * fails.
 */
int si_drop(si *i, sr *r)
{
    siconf *conf = i->conf;
    /* drop file must exists at this point */

    /* shutdown */
    if (srunlikely(si_close(i, r) == -1))
        return -1;
    if (! i->dropped) {
        if (srunlikely(si_dropmark(i, r) == -1))
            return -1;
    }
    /* remove directory */
    return si_dropof(conf, r);
}
/*
 * Worker thread entry point.
 *
 * Keeps running scheduler steps while so_active() reports a non-zero
 * state. A step returning -1 terminates the worker; a step returning 0
 * is followed by a short sleep before the next one.
 *
 * Always returns NULL.
 */
static void *so_worker(void *arg)
{
    soworker *self = arg;
    so *o = self->arg;
    while (so_active(o) != 0) {
        int rc = so_scheduler(&o->sched, self);
        if (srunlikely(rc == -1))
            break;
        if (srunlikely(rc == 0))
            sr_sleep(10000000); /* 10ms */
    }
    return NULL;
}
/*
 * Return the node reached by following the `r` links from the root
 * until there is none left, or NULL for an empty tree.
 */
srrbnode *sr_rbmax(srrb *t)
{
    srrbnode *last = NULL;
    srrbnode *cur;
    for (cur = t->root; cur != NULL; cur = cur->r)
        last = cur;
    return last;
}
/*
 * Remove a database directory and its contents.
 *
 * Every entry whose name does not start with '.' is unlinked, with the
 * "drop" file kept for last; afterwards the "drop" file and finally
 * the directory itself are removed.
 *
 * Returns 0 on success, -1 (after reporting through sr_malfunction) as
 * soon as any step fails.
 */
static inline int si_dropof(siconf *conf, sr *r)
{
    DIR *dir = opendir(conf->path);
    if (dir == NULL) {
        sr_malfunction(r->e, "directory '%s' open error: %s",
                       conf->path, strerror(errno));
        return -1;
    }

    char path[1024];
    struct dirent *entry;
    while ((entry = readdir(dir))) {
        const char *name = entry->d_name;
        /* skip dot entries and the drop file */
        if (name[0] == '.' || srunlikely(strcmp(name, "drop") == 0))
            continue;
        snprintf(path, sizeof(path), "%s/%s", conf->path, entry->d_name);
        if (srunlikely(sr_fileunlink(path) == -1)) {
            sr_malfunction(r->e, "db file '%s' unlink error: %s",
                           path, strerror(errno));
            closedir(dir);
            return -1;
        }
    }
    closedir(dir);

    /* the drop file goes last */
    snprintf(path, sizeof(path), "%s/drop", conf->path);
    if (srunlikely(sr_fileunlink(path) == -1)) {
        sr_malfunction(r->e, "db file '%s' unlink error: %s",
                       path, strerror(errno));
        return -1;
    }
    if (srunlikely(rmdir(conf->path) == -1)) {
        sr_malfunction(r->e, "directory '%s' unlink error: %s",
                       conf->path, strerror(errno));
        return -1;
    }
    return 0;
}
/*
 * Rebuild the branch list of a node from its file.
 *
 * Each index header produced by the sd_recover iterator is copied into
 * a branch: the first one into n->self, every following one into a
 * newly allocated branch. Each branch is pushed to the front of
 * n->branch and counted.
 *
 * Returns 0 on success, -1 on allocation failure, index copy failure
 * or when sd_recover_complete() reports an error. The iterator is
 * closed on every path.
 *
 * NOTE(review): when sd_indexcopy() fails for a freshly allocated
 * branch, that branch is not linked anywhere yet - looks like it may
 * leak; confirm.
 */
static inline int si_noderecover(sinode *n, sr *r)
{
    /* recover branches */
    sriter i;
    sr_iterinit(sd_recover, &i, r);
    sr_iteropen(sd_recover, &i, &n->file);

    int rc;
    int first = 1;
    while (sr_iteratorhas(&i)) {
        sdindexheader *h = sr_iteratorof(&i);

        sibranch *branch = first ? &n->self : si_branchnew(r);
        if (srunlikely(branch == NULL))
            goto error;

        sdindex index;
        sd_indexinit(&index);
        rc = sd_indexcopy(&index, r, h);
        if (srunlikely(rc == -1))
            goto error;
        si_branchset(branch, &index);

        /* push front */
        branch->next = n->branch;
        n->branch = branch;
        n->branch_count++;

        first = 0;
        sr_iteratornext(&i);
    }
    rc = sd_recover_complete(&i);
    if (srunlikely(rc == -1))
        goto error;
    sr_iteratorclose(&i);
    return 0;

error:
    sr_iteratorclose(&i);
    return -1;
}
/*
 * Sync the node's file.
 *
 * Returns 0 on success; on failure the error is reported through
 * sr_malfunction and -1 is returned.
 */
int si_nodesync(sinode *n, sr *r)
{
    if (srlikely(sr_filesync(&n->file) != -1))
        return 0;
    sr_malfunction(r->e, "db file '%s' sync error: %s",
                   n->file.file, strerror(errno));
    return -1;
}
/*
 * Run log garbage-collection for the scheduler's environment.
 *
 * Returns 0 on success, -1 when sl_poolgc() fails.
 */
static int so_gc(soscheduler *s, soworker *w)
{
    sr_trace(&w->trace, "%s", "log gc");
    so *e = s->env;
    if (srunlikely(sl_poolgc(&e->lp) == -1))
        return -1;
    return 0;
}
/*
 * Start the scheduler's workers: e->ctl.threads of them, each running
 * so_worker with the environment as its argument.
 *
 * Returns 0 on success, -1 when the workers cannot be created.
 */
int so_scheduler_run(soscheduler *s)
{
    so *e = s->env;
    if (srunlikely(so_workersnew(&s->workers, &e->r, e->ctl.threads,
                                 so_worker, e) == -1))
        return -1;
    return 0;
}
/*
 * Prepare the backup directory and recover the backup sequence number.
 *
 * Nothing is done when no backup path is configured. Otherwise the
 * directory is created if it is missing, then scanned: entries whose
 * name starts with '.' are skipped, each remaining name is passed to
 * se_process(), and the largest id among the entries it accepts
 * (result 0 or 1) is stored in r->seq->bsn.
 *
 * Returns 0 on success, -1 when the directory cannot be created or
 * opened.
 */
static inline int se_recoverbackup(se *i, sr *r)
{
    char *backup_dir = i->conf->path_backup;
    if (backup_dir == NULL)
        return 0;

    if (! sr_fileexists(backup_dir)) {
        if (srunlikely(sr_filemkdir(backup_dir) == -1)) {
            sr_error(r->e, "backup directory '%s' create error: %s",
                     i->conf->path_backup, strerror(errno));
            return -1;
        }
    }

    /* recover backup sequential number */
    DIR *dir = opendir(backup_dir);
    if (srunlikely(dir == NULL)) {
        sr_error(r->e, "backup directory '%s' open error: %s",
                 i->conf->path_backup, strerror(errno));
        return -1;
    }
    uint32_t highest = 0;
    struct dirent *entry;
    while ((entry = readdir(dir))) {
        if (srunlikely(entry->d_name[0] == '.'))
            continue;
        uint32_t id = 0;
        int kind = se_process(entry->d_name, &id);
        /* anything but 0/1 is an unknown file and is skipped */
        if ((kind == 0 || kind == 1) && id > highest)
            highest = id;
    }
    closedir(dir);

    r->seq->bsn = highest;
    return 0;
}
/*
 * Release a node.
 *
 * With `gc` set and a file name present, the node's file is unlinked
 * first. The branches are freed, the node is closed and its memory is
 * released regardless of earlier failures.
 *
 * Returns 0 on success, -1 when unlinking or closing failed.
 */
int si_nodefree(sinode *n, sr *r, int gc)
{
    int status = 0;

    if (gc && n->file.file) {
        if (srunlikely(sr_fileunlink(n->file.file) == -1)) {
            sr_malfunction(r->e, "db file '%s' unlink error: %s",
                           n->file.file, strerror(errno));
            status = -1;
        }
    }
    si_nodefree_branches(n, r);
    if (srunlikely(si_nodeclose(n, r) == -1))
        status = -1;
    sr_free(r->a, n);
    return status;
}
/*
 * Create the directory layout of a new backup:
 *
 *  a. backup_path/<bsn>.incomplete
 *  b. one sub-directory per database, named after db->ctl.name
 *  c. the log sub-directory
 *
 * Returns 0 on success, -1 (with an error set) when any directory
 * cannot be created.
 */
static inline int so_backupstart(soscheduler *s)
{
    so *e = s->env;
    char path[1024];
    int rc;

    /* backup root */
    snprintf(path, sizeof(path), "%s/%" PRIu32 ".incomplete",
             e->ctl.backup_path, s->backup_bsn);
    rc = sr_filemkdir(path);
    if (srunlikely(rc == -1)) {
        sr_error(&e->error, "backup directory '%s' create error: %s",
                 path, strerror(errno));
        return -1;
    }

    /* database directories */
    int i;
    for (i = 0; i < s->count; i++) {
        sodb *db = s->i[i];
        snprintf(path, sizeof(path), "%s/%" PRIu32 ".incomplete/%s",
                 e->ctl.backup_path, s->backup_bsn,
                 db->ctl.name);
        rc = sr_filemkdir(path);
        if (srunlikely(rc == -1)) {
            sr_error(&e->error, "backup directory '%s' create error: %s",
                     path, strerror(errno));
            return -1;
        }
    }

    /* log directory */
    snprintf(path, sizeof(path), "%s/%" PRIu32 ".incomplete/log",
             e->ctl.backup_path, s->backup_bsn);
    rc = sr_filemkdir(path);
    if (srunlikely(rc == -1)) {
        sr_error(&e->error, "backup directory '%s' create error: %s",
                 path, strerror(errno));
        return -1;
    }
    return 0;
}
/*
 * Give the node's file its final name: a path built by sr_pathA() from
 * conf->path, the node's own id and the ".db" suffix.
 *
 * Returns the result of sr_filerename(); a failure (-1) is reported
 * through sr_malfunction.
 */
int si_nodecomplete(sinode *n, sr *r, siconf *conf)
{
    srpath target;
    sr_pathA(&target, conf->path, n->self.id.id, ".db");
    int rc = sr_filerename(&n->file, target.path);
    if (srunlikely(rc == -1))
        sr_malfunction(r->e, "db file '%s' rename error: %s",
                       n->file.file, strerror(errno));
    return rc;
}
/*
 * Open an existing node file, position at its end and recover the
 * node's branches from it.
 *
 * Returns the result of si_noderecover() on success; -1 when the file
 * cannot be opened or positioned. On a seek or recovery failure the
 * node is closed again.
 *
 * Error messages name the file through `path`, which is valid on every
 * path (the state of n->file.file after a failed open or after
 * si_nodeclose() is not relied upon), and the seek errno is saved
 * before the close so the reported reason is the seek failure.
 */
int si_nodeopen(sinode *n, sr *r, srpath *path)
{
    int rc = sr_fileopen(&n->file, path->path);
    if (srunlikely(rc == -1)) {
        sr_malfunction(r->e, "db file '%s' open error: %s",
                       path->path, strerror(errno));
        return -1;
    }
    rc = sr_fileseek(&n->file, n->file.size);
    if (srunlikely(rc == -1)) {
        /* closing may overwrite errno */
        int seek_errno = errno;
        si_nodeclose(n, r);
        sr_malfunction(r->e, "db file '%s' seek error: %s",
                       path->path, strerror(seek_errno));
        return -1;
    }
    rc = si_noderecover(n, r);
    if (srunlikely(rc == -1))
        si_nodeclose(n, r);
    return rc;
}