/*
 * Open an existing node db file at path, seek to its current end,
 * recover the node from it and, when scheme->mmap is set, map it.
 *
 * Returns 0 on success and -1 on failure. Once the file has been
 * opened, every failure path closes the node with si_nodeclose().
 */
int si_nodeopen(sinode *n, sr *r, sischeme *scheme, sspath *path)
{
	if (ssunlikely(ss_fileopen(&n->file, path->path) == -1)) {
		sr_malfunction(r->e, "db file '%s' open error: %s",
		               ss_pathof(&n->file.path),
		               strerror(errno));
		return -1;
	}
	if (ssunlikely(ss_fileseek(&n->file, n->file.size) == -1)) {
		sr_malfunction(r->e, "db file '%s' seek error: %s",
		               ss_pathof(&n->file.path),
		               strerror(errno));
		si_nodeclose(n, r);
		return -1;
	}
	if (ssunlikely(si_noderecover(n, r, scheme->in_memory) == -1)) {
		si_nodeclose(n, r);
		return -1;
	}
	/* mapping is optional and driven by the scheme */
	if (scheme->mmap && ssunlikely(si_nodemap(n, r) == -1)) {
		si_nodeclose(n, r);
		return -1;
	}
	return 0;
}
/*
 * Open a log iterator over file.
 *
 * The iterator state is zeroed, the file size is checked against the
 * size of the version header, the file is mapped and the first record
 * is prepared with sl_iterprepare().
 *
 * i        - iterator whose priv area holds the sliter state
 * r        - runtime (error object, crc function)
 * file     - log file to iterate
 * validate - stored in the iterator; consulted when records are checked
 *
 * Returns 0 on success (including a log that holds only the version
 * header, in which case nothing is mapped) and -1 on failure. On a
 * preparation failure the mapping is released before returning, and the
 * failure is now reported to the caller instead of being masked by a 0
 * return value.
 */
int sl_iter_open(ssiter *i, sr *r, ssfile *file, int validate)
{
	sliter *li = (sliter*)i->priv;
	memset(li, 0, sizeof(*li));
	li->r        = r;
	li->log      = file;
	li->validate = validate;
	if (ssunlikely(li->log->size < sizeof(srversion))) {
		sr_malfunction(li->r->e, "corrupted log file '%s': bad size",
		               li->log->file);
		return -1;
	}
	/* version header only: nothing to iterate */
	if (ssunlikely(li->log->size == sizeof(srversion)))
		return 0;
	int rc = ss_map(&li->map, li->log->fd, li->log->size, 1);
	if (ssunlikely(rc == -1)) {
		sr_malfunction(li->r->e, "failed to mmap log file '%s': %s",
		               li->log->file,
		               strerror(errno));
		return -1;
	}
	rc = sl_iterprepare(li);
	if (ssunlikely(rc == -1)) {
		/* do not leave a mapping behind and do not hide the error */
		ss_mapunmap(&li->map);
		return -1;
	}
	return 0;
}
/*
 * Open an existing node db file at path, seek to its current end and
 * recover the node, passing the snapshot node sn and an in-memory flag
 * derived from scheme->storage. When scheme->mmap is set the file is
 * mapped as well.
 *
 * Returns 0 on success and -1 on failure. After a successful open,
 * failures close the node with si_nodeclose(n, r, 0).
 */
int si_nodeopen(sinode *n, sr *r, sischeme *scheme, sspath *path, sdsnapshotnode *sn)
{
	if (ssunlikely(ss_fileopen(&n->file, path->path) == -1)) {
		sr_malfunction(r->e, "db file '%s' open error: %s",
		               ss_pathof(&n->file.path),
		               strerror(errno));
		return -1;
	}

	int opened = 0;
	do {
		if (ssunlikely(ss_fileseek(&n->file, n->file.size) == -1)) {
			sr_malfunction(r->e, "db file '%s' seek error: %s",
			               ss_pathof(&n->file.path),
			               strerror(errno));
			break;
		}
		int in_memory = (scheme->storage == SI_SIN_MEMORY) ? 1 : 0;
		if (ssunlikely(si_noderecover(n, r, sn, in_memory) == -1))
			break;
		if (scheme->mmap && ssunlikely(si_nodemap(n, r) == -1))
			break;
		opened = 1;
	} while (0);

	if (opened)
		return 0;
	si_nodeclose(n, r, 0);
	return -1;
}
/*
 * Seal a node file: optionally sync it (when scheme->sync is set) and
 * rename it to the compound "<parent>.<id>.db.seal"-style name built by
 * ss_pathcompound().
 *
 * Returns 0 on success, -1 if the sync or the rename fails.
 */
int si_nodeseal(sinode *n, sr *r, sischeme *scheme)
{
	if (scheme->sync && ssunlikely(ss_filesync(&n->file) == -1)) {
		sr_malfunction(r->e, "db file '%s' sync error: %s",
		               ss_pathof(&n->file.path),
		               strerror(errno));
		return -1;
	}

	sspath sealed;
	ss_pathcompound(&sealed, scheme->path,
	                n->self.id.parent, n->self.id.id,
	                ".db.seal");
	if (ssunlikely(ss_filerename(&n->file, sealed.path) == -1)) {
		sr_malfunction(r->e, "db file '%s' rename error: %s",
		               ss_pathof(&n->file.path),
		               strerror(errno));
		return -1;
	}
	return 0;
}
/*
 * Release the runtime resources of a node: unmap it, close its file and
 * dispose of both in-memory indexes (i0, i1).
 *
 * gc selects how the indexes are disposed of: si_nodegc_index() when
 * non-zero, sv_indexfree() otherwise.
 *
 * Every step is attempted even if an earlier one failed. Returns 0 when
 * both the unmap and the close succeeded, -1 otherwise.
 */
static inline int si_nodeclose(sinode *n, sr *r, int gc)
{
	int status = 0;

	if (ssunlikely(ss_vfsmunmap(r->vfs, &n->map) == -1)) {
		sr_malfunction(r->e, "db file '%s' munmap error: %s",
		               ss_pathof(&n->file.path),
		               strerror(errno));
		status = -1;
	}
	if (ssunlikely(ss_fileclose(&n->file) == -1)) {
		sr_malfunction(r->e, "db file '%s' close error: %s",
		               ss_pathof(&n->file.path),
		               strerror(errno));
		status = -1;
	}

	if (! gc) {
		sv_indexfree(&n->i0, r);
		sv_indexfree(&n->i1, r);
		return status;
	}
	si_nodegc_index(r, &n->i0);
	si_nodegc_index(r, &n->i1);
	return status;
}
/*
 * Position the log iterator on the record at next.
 *
 * i        - iterator state (mapping, transaction counters, error flag)
 * next     - candidate record inside the mapping, or NULL
 * validate - when non-zero and i->validate is set, the record crc is
 *            checked
 *
 * Returns 1 when next became the current record, 0 when there is no
 * current record (NULL input, end of the mapping, or the record starts
 * the next transaction and was stored in i->next), and -1 on a
 * corrupted log; in that case sl_iterseterror() is called.
 *
 * The record header, and for records without SVBEGIN the payload that
 * the crc covers, are verified to lie inside the mapping before they
 * are read, so a truncated log is reported as corrupted instead of
 * being read past the end of the mapping.
 */
static int sl_iternext_of(sliter *i, slv *next, int validate)
{
	if (next == NULL)
		return 0;
	char *eof   = (char*)i->map.p + i->map.size;
	char *start = (char*)next;
	/* eof */
	if (ssunlikely(start == eof)) {
		if (i->count != i->pos) {
			sr_malfunction(i->r->e, "corrupted log file '%s': transaction is incomplete",
			               i->log->file);
			sl_iterseterror(i);
			return -1;
		}
		i->v = NULL;
		i->next = NULL;
		return 0;
	}
	/* a whole header must be available before any of its fields is read */
	if (ssunlikely(start > eof ||
	               (uint64_t)(eof - start) < sizeof(slv))) {
		sr_malfunction(i->r->e, "corrupted log file '%s': bad record size",
		               i->log->file);
		sl_iterseterror(i);
		return -1;
	}
	char *end = start + next->size;
	if (ssunlikely((start > eof || (end > eof)))) {
		sr_malfunction(i->r->e, "corrupted log file '%s': bad record size",
		               i->log->file);
		sl_iterseterror(i);
		return -1;
	}
	/* for ordinary records the payload follows the header; it must fit too */
	if (! (next->flags & SVBEGIN)) {
		uint64_t payload_room = (uint64_t)(eof - start) - sizeof(slv);
		if (ssunlikely(payload_room < next->size)) {
			sr_malfunction(i->r->e, "corrupted log file '%s': bad record size",
			               i->log->file);
			sl_iterseterror(i);
			return -1;
		}
	}
	if (validate && i->validate)
	{
		uint32_t crc = 0;
		/* records without SVBEGIN also cover their payload */
		if (! (next->flags & SVBEGIN)) {
			crc = ss_crcp(i->r->crc, start + sizeof(slv), next->size, 0);
		}
		crc = ss_crcs(i->r->crc, start, sizeof(slv), crc);
		if (ssunlikely(crc != next->crc)) {
			sr_malfunction(i->r->e, "corrupted log file '%s': bad record crc",
			               i->log->file);
			sl_iterseterror(i);
			return -1;
		}
	}
	i->pos++;
	if (i->pos > i->count) {
		/* next transaction */
		i->v     = NULL;
		i->pos   = 0;
		i->count = 0;
		i->next  = next;
		return 0;
	}
	i->v = next;
	sv_init(&i->current, &sl_vif, i->v, NULL);
	return 1;
}
/*
 * Compact the node selected by plan.
 *
 * The node file is read into c, one stream per branch is registered in
 * a merge, and the merged iterator is handed to si_compaction().
 *
 * index - index owning the node (its scheme supplies the compression
 *         setting used when opening branch iterators)
 * r     - runtime (allocator, error object)
 * c     - compaction context and per-branch buffers
 * plan  - plan whose node must carry SI_LOCK
 * vlsn  - passed through to si_compaction()
 *
 * Returns 0 on success, -1 on failure. The merge prepared here is
 * released on every path after preparation, including a buffer
 * allocation failure inside the branch loop.
 */
int si_compact(si *index, sr *r, sdc *c, siplan *plan, uint64_t vlsn)
{
	sinode *node = plan->node;
	assert(node->flags & SI_LOCK);

	/* read node file */
	sd_creset(c);
	int rc = si_noderead(r, &c->c, node);
	if (ssunlikely(rc == -1))
		return -1;

	/* prepare for compaction */
	rc = sd_censure(c, r, node->branch_count);
	if (ssunlikely(rc == -1)) {
		sr_malfunction(r->e, "%s", "memory allocation failed");
		return -1;
	}
	svmerge merge;
	sv_mergeinit(&merge);
	rc = sv_mergeprepare(&merge, r, node->branch_count);
	if (ssunlikely(rc == -1))
		return -1;

	/* one merge stream and one buffer pair per branch */
	uint32_t size_stream = 0;
	sdcbuf *cbuf = c->head;
	sibranch *b = node->branch;
	while (b) {
		svmergessc *s = sv_mergeadd(&merge, NULL);
		rc = ss_bufensure(&cbuf->b, r->a, b->index.h->sizevmax);
		if (ssunlikely(rc == -1)) {
			sr_malfunction(r->e, "%s", "memory allocation failed");
			/* the merge was prepared above; do not leak it */
			sv_mergefree(&merge, r->a);
			return -1;
		}
		size_stream += sd_indextotal(&b->index);
		ss_iterinit(sd_iter, &s->ssc);
		ss_iteropen(sd_iter, &s->ssc, r, &b->index, c->c.s, 0,
		            index->scheme->compression, &cbuf->a, &cbuf->b);
		cbuf = cbuf->next;
		b = b->next;
	}

	ssiter i;
	ss_iterinit(sv_mergeiter, &i);
	ss_iteropen(sv_mergeiter, &i, r, &merge, SS_GTE);
	rc = si_compaction(index, r, c, vlsn, node, &i, size_stream);
	sv_mergefree(&merge, r->a);
	if (ssunlikely(rc == -1))
		return -1;
	return 0;
}
/*
 * Validate the index record at next and make it the current record of
 * the recover iterator.
 *
 * i    - iterator whose priv area holds the sdrecover state
 * next - candidate index header inside the mapping, or NULL
 *
 * Checks, in order: that a whole header fits before the end of the
 * mapping, the header crc, the version, that the full record (header,
 * payload, extension and seal) fits in the mapping, and the seal.
 *
 * Returns 1 when next was accepted (ri->v and ri->actual point to it),
 * 0 when next is NULL or the end of the mapping was reached, and the
 * result of sr_malfunction() / -1 on failure. On corruption failures
 * ri->corrupt is set and ri->v is cleared.
 *
 * The header-size check runs before the crc so that a file truncated in
 * the middle of a header is not read past the end of the mapping.
 */
static int sd_recovernext_of(sriter *i, sdindexheader *next)
{
	sdrecover *ri = (sdrecover*)i->priv;
	if (next == NULL)
		return 0;
	char *eof   = (char*)ri->map.p + ri->map.size;
	char *start = (char*)next;
	/* eof */
	if (srunlikely(start == eof)) {
		ri->v = NULL;
		return 0;
	}
	/* the header itself must be inside the mapping before it is read */
	if (srunlikely(start > eof ||
	               (uint64_t)(eof - start) < sizeof(sdindexheader))) {
		sr_malfunction(i->r->e, "corrupted db file '%s': bad record size",
		               ri->file->file);
		ri->corrupt = 1;
		ri->v = NULL;
		return -1;
	}
	/* validate crc */
	uint32_t crc = sr_crcs(i->r->crc, next, sizeof(sdindexheader), 0);
	if (next->crc != crc) {
		sr_malfunction(i->r->e, "corrupted db file '%s': bad index crc",
		               ri->file->file);
		ri->corrupt = 1;
		ri->v = NULL;
		return -1;
	}
	/* check version */
	if (! sr_versioncheck(&next->version))
		return sr_malfunction(i->r->e, "bad db file '%s' version",
		                      ri->file->file);
	char *end = start + sizeof(sdindexheader) + next->size +
	            next->total +
	            next->extension + sizeof(sdseal);
	if (srunlikely((start > eof || (end > eof)))) {
		sr_malfunction(i->r->e, "corrupted db file '%s': bad record size",
		               ri->file->file);
		ri->corrupt = 1;
		ri->v = NULL;
		return -1;
	}
	/* check seal */
	sdseal *s = (sdseal*)(end - sizeof(sdseal));
	int rc = sd_sealvalidate(s, i->r, next);
	if (srunlikely(rc == -1)) {
		sr_malfunction(i->r->e, "corrupted db file '%s': bad seal",
		               ri->file->file);
		ri->corrupt = 1;
		ri->v = NULL;
		return -1;
	}
	ri->actual = next;
	ri->v = next;
	return 1;
}
/*
 * Finish recovery of the tracked nodes.
 *
 * All tracked node pointers are first copied into buf; the copy is then
 * walked: nodes flagged SI_RDB_REMOVE are freed with si_nodefree(n, r, 1),
 * every other node gets recover = SI_RDB, is inserted into index and is
 * registered with the planner for SI_COMPACT|SI_BRANCH.
 *
 * Returns 0 on success, the result of sr_malfunction() when buf cannot
 * grow, or -1 when freeing a node fails.
 */
static inline int si_recovercomplete(sitrack *track, sr *r, si *index, srbuf *buf)
{
	/* prepare and build primary index */
	sr_bufreset(buf);
	srrbnode *cursor;
	for (cursor = sr_rbmin(&track->i); cursor;
	     cursor = sr_rbnext(&track->i, cursor))
	{
		sinode *n = srcast(cursor, sinode, node);
		int rc = sr_bufadd(buf, r->a, &n, sizeof(sinode**));
		if (srunlikely(rc == -1))
			return sr_malfunction(r->e, "%s", "memory allocation failed");
	}

	sriter i;
	sr_iterinit(sr_bufiterref, &i, r);
	sr_iteropen(sr_bufiterref, &i, buf, sizeof(sinode*));
	while (sr_iterhas(sr_bufiterref, &i))
	{
		sinode *n = sr_iterof(sr_bufiterref, &i);
		if (n->recover & SI_RDB_REMOVE) {
			/* node was scheduled for removal during validation */
			int rc = si_nodefree(n, r, 1);
			if (srunlikely(rc == -1))
				return -1;
		} else {
			n->recover = SI_RDB;
			si_insert(index, r, n);
			si_plannerupdate(&index->p, SI_COMPACT|SI_BRANCH, n);
		}
		sr_iternext(sr_bufiterref, &i);
	}
	return 0;
}
/*
 * Create a new node file named "<parent>.<id>.db.incomplete" (as built
 * by ss_pathAB()), commit build and the branch index into it, and map
 * the file when scheme->mmap is set.
 *
 * On success the node's own branch becomes its branch list head and
 * branch_count is incremented. Returns 0 on success, -1 on failure.
 */
int si_nodecreate(sinode *n, sr *r, sischeme *scheme, sdid *id, sdindex *i, sdbuild *build)
{
	si_branchset(&n->self, i);

	sspath incomplete;
	ss_pathAB(&incomplete, scheme->path, id->parent, id->id, ".db.incomplete");
	if (ssunlikely(ss_filenew(&n->file, incomplete.path) == -1)) {
		sr_malfunction(r->e, "db file '%s' create error: %s",
		               incomplete.path, strerror(errno));
		return -1;
	}
	if (ssunlikely(sd_commit(build, r, &n->self.index, &n->file) == -1))
		return -1;
	if (scheme->mmap && ssunlikely(si_nodemap(n, r) == -1))
		return -1;

	n->branch = &n->self;
	n->branch_count++;
	return 0;
}
/*
 * Write an index, the build data and a seal to file.
 *
 * The index buffer always goes first. When the build holds compressed
 * data (b->c is not empty) index, compressed data and seal are written
 * with a single vectored write. Otherwise the build is streamed through
 * sd_commitiov() in batches, with the seal appended to the final batch.
 *
 * Returns 0 on success; on a write failure returns the failing rc
 * (compressed path) or the result of sr_malfunction() (uncompressed
 * path). The SR_INJECTION hooks are error-injection points.
 */
int sd_commit(sdbuild *b, sr *r, sdindex *index, srfile *file)
{
	sdseal seal;
	sd_seal(&seal, r, index->h);

	struct iovec vec[1024];
	sriov iov;
	sr_iovinit(&iov, vec, 1024);
	sr_iovadd(&iov, index->i.s, sr_bufused(&index->i));

	SR_INJECTION(r->i, SR_INJECTION_SD_BUILD_0,
	             sr_malfunction(r->e, "%s", "error injection");
	             assert( sr_filewritev(file, &iov) == 0 );
	             return -1);

	int rc;

	/* compression enabled */
	uint32_t compressed = sr_bufused(&b->c);
	if (compressed > 0) {
		sr_iovadd(&iov, b->c.s, compressed);
		sr_iovadd(&iov, &seal, sizeof(seal));
		rc = sr_filewritev(file, &iov);
		if (srunlikely(rc == -1))
			sr_malfunction(r->e, "file '%s' write error: %s",
			               file->file, strerror(errno));
		return rc;
	}

	/* uncompressed */
	sdcommitiov iter;
	sd_commitiov_init(&iter, b, 1022);
	int more;
	do {
		more = sd_commitiov(&iter, &iov);
		if (srlikely(! more)) {
			/* last batch: the seal goes out together with it */
			SR_INJECTION(r->i, SR_INJECTION_SD_BUILD_1,
			             seal.crc++); /* corrupt seal */
			sr_iovadd(&iov, &seal, sizeof(seal));
		}
		rc = sr_filewritev(file, &iov);
		if (srunlikely(rc == -1)) {
			return sr_malfunction(r->e, "file '%s' write error: %s",
			                      file->file, strerror(errno));
		}
		sr_iovreset(&iov);
	} while (more);
	return 0;
}
/*
 * Validate the head of a mapped log and position the iterator on the
 * first record.
 *
 * The size check runs before the version header is read from the
 * mapping, so a log shorter than the version header is rejected without
 * touching the mapping.
 *
 * Returns the result of sr_malfunction() on a bad size or version, -1
 * when the first record is invalid, the result of sl_itercontinue_of()
 * when a following transaction was detected (i->next set), and 0
 * otherwise.
 */
static inline int sl_iterprepare(sliter *i)
{
	/* size first: the version is read from the mapping right below */
	if (ssunlikely(i->log->size < (sizeof(srversion))))
		return sr_malfunction(i->r->e, "corrupted log file '%s': bad size",
		                      i->log->file);
	srversion *ver = (srversion*)i->map.p;
	if (! sr_versioncheck(ver))
		return sr_malfunction(i->r->e, "bad log file '%s' version",
		                      i->log->file);
	/* the first record follows the version header */
	slv *next = (slv*)((char*)i->map.p + sizeof(srversion));
	int rc = sl_iternext_of(i, next, 1);
	if (ssunlikely(rc == -1))
		return -1;
	if (sslikely(i->next))
		return sl_itercontinue_of(i);
	return 0;
}
/*
 * Remove a repository directory.
 *
 * Every entry in conf->path whose name does not start with '.' is
 * unlinked, except the "drop" file, which is unlinked last; finally the
 * directory itself is removed.
 *
 * Returns 0 on success and -1 on the first failure (the directory
 * handle is closed before returning).
 *
 * Paths are composed in a fixed 1024-byte buffer. A path that does not
 * fit is reported as an unlink failure with errno set to ENAMETOOLONG
 * instead of being passed, truncated, to unlink, where it could name a
 * different file.
 */
static inline int si_dropof(siconf *conf, sr *r)
{
	DIR *dir = opendir(conf->path);
	if (dir == NULL) {
		sr_malfunction(r->e, "directory '%s' open error: %s",
		               conf->path, strerror(errno));
		return -1;
	}
	char path[1024];
	int rc;
	int len;
	struct dirent *de;
	while ((de = readdir(dir))) {
		if (de->d_name[0] == '.')
			continue;
		/* skip drop file */
		if (srunlikely(strcmp(de->d_name, "drop") == 0))
			continue;
		len = snprintf(path, sizeof(path), "%s/%s", conf->path, de->d_name);
		if (srunlikely(len < 0 || len >= (int)sizeof(path))) {
			/* never unlink a truncated name */
			if (len < 0)
				path[0] = '\0';
			errno = ENAMETOOLONG;
			rc = -1;
		} else {
			rc = sr_fileunlink(path);
		}
		if (srunlikely(rc == -1)) {
			sr_malfunction(r->e, "db file '%s' unlink error: %s",
			               path, strerror(errno));
			closedir(dir);
			return -1;
		}
	}
	closedir(dir);

	/* the drop marker goes last, right before the directory itself */
	len = snprintf(path, sizeof(path), "%s/drop", conf->path);
	if (srunlikely(len < 0 || len >= (int)sizeof(path))) {
		if (len < 0)
			path[0] = '\0';
		errno = ENAMETOOLONG;
		rc = -1;
	} else {
		rc = sr_fileunlink(path);
	}
	if (srunlikely(rc == -1)) {
		sr_malfunction(r->e, "db file '%s' unlink error: %s",
		               path, strerror(errno));
		return -1;
	}
	rc = rmdir(conf->path);
	if (srunlikely(rc == -1)) {
		sr_malfunction(r->e, "directory '%s' unlink error: %s",
		               conf->path, strerror(errno));
		return -1;
	}
	return 0;
}
/*
 * Sync the node file with ss_filesync().
 * Returns 0 on success; reports a malfunction and returns -1 on failure.
 */
int si_nodesync(sinode *n, sr *r)
{
	if (sslikely(ss_filesync(&n->file) != -1))
		return 0;
	sr_malfunction(r->e, "db file '%s' sync error: %s",
	               n->file.file, strerror(errno));
	return -1;
}
/*
 * Map the whole node file (n->file.size bytes of n->file.fd) into
 * n->map with ss_mmap().
 * Returns 0 on success; reports a malfunction and returns -1 on failure.
 */
int si_nodemap(sinode *n, sr *r)
{
	if (sslikely(ss_mmap(&n->map, n->file.fd, n->file.size, 1) != -1))
		return 0;
	sr_malfunction(r->e, "db file '%s' mmap error: %s",
	               n->file.file, strerror(errno));
	return -1;
}
/*
 * Unmap the node, close its file and free both in-memory indexes.
 *
 * All steps run regardless of earlier failures. Returns 0 when the
 * unmap and the close both succeeded, -1 otherwise.
 */
static inline int si_nodeclose(sinode *n, sr *r)
{
	int status = 0;

	if (ssunlikely(ss_munmap(&n->map) == -1)) {
		sr_malfunction(r->e, "db file '%s' munmap error: %s",
		               n->file.file, strerror(errno));
		status = -1;
	}
	if (ssunlikely(ss_fileclose(&n->file) == -1)) {
		sr_malfunction(r->e, "db file '%s' close error: %s",
		               n->file.file, strerror(errno));
		status = -1;
	}

	sv_indexfree(&n->i0, r);
	sv_indexfree(&n->i1, r);
	return status;
}
/*
 * Rename the node file to its final "<id>.db" name (as built by
 * ss_pathA()) under scheme->path.
 *
 * Returns the result of ss_filerename(); a malfunction is reported when
 * that result is -1.
 */
int si_nodecomplete(sinode *n, sr *r, sischeme *scheme)
{
	sspath final;
	ss_pathA(&final, scheme->path, n->self.id.id, ".db");

	int rc = ss_filerename(&n->file, final.path);
	if (sslikely(rc != -1))
		return rc;
	sr_malfunction(r->e, "db file '%s' rename error: %s",
	               n->file.file, strerror(errno));
	return rc;
}
/*
 * Rename the node file to its final "<id>.db" name (as built by
 * sr_pathA()) under conf->path.
 *
 * Returns the result of sr_filerename(); a malfunction is reported when
 * that result is -1.
 */
int si_nodecomplete(sinode *n, sr *r, siconf *conf)
{
	srpath final;
	sr_pathA(&final, conf->path, n->self.id.id, ".db");

	int rc = sr_filerename(&n->file, final.path);
	if (srlikely(rc != -1))
		return rc;
	sr_malfunction(r->e, "db file '%s' rename error: %s",
	               n->file.file, strerror(errno));
	return rc;
}
/*
 * Open an existing node db file at path, seek to its current end and
 * recover the node from it.
 *
 * Returns 0 / the result of si_noderecover() on success and -1 on
 * failure. After a successful open, failures close the node with
 * si_nodeclose().
 *
 * Errors are reported before any cleanup runs: si_nodeclose() performs
 * further system calls and tears down the file object, so reporting
 * afterwards could print a stale errno or file name. The open error
 * uses the requested path, since the file object was not opened.
 */
int si_nodeopen(sinode *n, sr *r, srpath *path)
{
	int rc = sr_fileopen(&n->file, path->path);
	if (srunlikely(rc == -1)) {
		sr_malfunction(r->e, "db file '%s' open error: %s",
		               path->path, strerror(errno));
		return -1;
	}
	rc = sr_fileseek(&n->file, n->file.size);
	if (srunlikely(rc == -1)) {
		/* report first, then release: close may change errno */
		sr_malfunction(r->e, "db file '%s' seek error: %s",
		               n->file.file, strerror(errno));
		si_nodeclose(n, r);
		return -1;
	}
	rc = si_noderecover(n, r);
	if (srunlikely(rc == -1))
		si_nodeclose(n, r);
	return rc;
}
/*
 * Create the node file under scheme->path with the
 * "<parent>.<id>.db.incomplete" name built by ss_pathAB().
 * Returns 0 on success; reports a malfunction and returns -1 on failure.
 */
int si_nodecreate(sinode *n, sr *r, sischeme *scheme, sdid *id)
{
	sspath incomplete;
	ss_pathAB(&incomplete, scheme->path, id->parent, id->id, ".db.incomplete");

	if (sslikely(ss_filenew(&n->file, incomplete.path) != -1))
		return 0;
	sr_malfunction(r->e, "db file '%s' create error: %s",
	               incomplete.path, strerror(errno));
	return -1;
}
/*
 * Walk the tracked nodes from the maximum key downwards and resolve
 * their recovery state (n->recover flag combination):
 *
 *  - for the listed combinations that include SI_RDB or
 *    SI_RDB_UNDEF|SI_RDB_DBSEAL, a different tracked node found under
 *    the node's parent id is marked SI_RDB_REMOVE;
 *  - for a node that is exactly SI_RDB_DBSEAL, either the node or its
 *    parent is marked SI_RDB_REMOVE (the node, when the parent carries
 *    SI_RDB_DBI); a node that is not marked is completed with
 *    si_nodecomplete() and becomes SI_RDB;
 *  - any other combination is reported as a corrupted repository.
 *
 * buf is reset on entry. Returns 0 on success, -1 when completing a
 * node fails, or the result of sr_malfunction() on a corrupted state.
 */
static inline int si_trackvalidate(sitrack *track, ssbuf *buf, sr *r, si *i)
{
	ss_bufreset(buf);
	ssrbnode *cursor;
	for (cursor = ss_rbmax(&track->i); cursor;
	     cursor = ss_rbprev(&track->i, cursor))
	{
		sinode *n = sscast(cursor, sinode, node);
		switch (n->recover) {
		case SI_RDB|SI_RDB_DBI|SI_RDB_DBSEAL|SI_RDB_REMOVE:
		case SI_RDB|SI_RDB_DBSEAL|SI_RDB_REMOVE:
		case SI_RDB|SI_RDB_REMOVE:
		case SI_RDB_UNDEF|SI_RDB_DBSEAL|SI_RDB_REMOVE:
		case SI_RDB|SI_RDB_DBI|SI_RDB_DBSEAL:
		case SI_RDB|SI_RDB_DBI:
		case SI_RDB:
		case SI_RDB|SI_RDB_DBSEAL:
		case SI_RDB_UNDEF|SI_RDB_DBSEAL: {
			/* match and remove any leftover ancestor */
			sinode *ancestor = si_trackget(track, n->self.id.parent);
			if (ancestor != NULL && ancestor != n)
				ancestor->recover |= SI_RDB_REMOVE;
			break;
		}
		case SI_RDB_DBSEAL: {
			/* find parent */
			sinode *parent = si_trackget(track, n->self.id.parent);
			if (parent != NULL) {
				/* schedule node for removal, if has incomplete merges */
				sinode *victim = (parent->recover & SI_RDB_DBI) ? n : parent;
				victim->recover |= SI_RDB_REMOVE;
			}
			if (n->recover & SI_RDB_REMOVE)
				break;
			/* complete node */
			if (ssunlikely(si_nodecomplete(n, r, i->scheme) == -1))
				return -1;
			n->recover = SI_RDB;
			break;
		}
		default:
			/* corrupted states */
			return sr_malfunction(r->e, "corrupted database repository: %s",
			                      i->scheme->path);
		}
	}
	return 0;
}
/*
 * Open a recover iterator over a db file.
 *
 * The iterator state is zeroed, the file must be at least one index
 * header plus one seal in size, the file is mapped and the record at
 * the start of the mapping is validated with sd_recovernext_of().
 *
 * Returns the result of sd_recovernext_of() (the mapping is released
 * when that result is -1), or -1 on a size or mmap failure. A file that
 * is too small also sets ri->corrupt.
 */
int sd_recover_open(sriter *i, srfile *file)
{
	sdrecover *ri = (sdrecover*)i->priv;
	memset(ri, 0, sizeof(*ri));
	ri->file = file;

	if (srunlikely(ri->file->size < (sizeof(sdindexheader) + sizeof(sdseal)))) {
		sr_malfunction(i->r->e, "corrupted db file '%s': bad size",
		               ri->file->file);
		ri->corrupt = 1;
		return -1;
	}
	if (srunlikely(sr_map(&ri->map, ri->file->fd, ri->file->size, 1) == -1)) {
		sr_malfunction(i->r->e, "failed to mmap db file '%s': %s",
		               ri->file->file,
		               strerror(errno));
		return -1;
	}

	/* the first index header sits at the start of the mapping */
	sdindexheader *first = (sdindexheader*)((char*)ri->map.p);
	int rc = sd_recovernext_of(i, first);
	if (srunlikely(rc == -1))
		sr_mapunmap(&ri->map);
	return rc;
}
/*
 * Read the whole node file into dest.
 *
 * dest is grown to hold n->file.size bytes, the file is read from
 * offset 0 and dest is advanced by the file size.
 *
 * Returns 0 on success, the result of sr_oom_malfunction() when dest
 * cannot grow, or -1 on a read error.
 */
int si_noderead(sinode *n, sr *r, ssbuf *dest)
{
	if (ssunlikely(ss_bufensure(dest, r->a, n->file.size) == -1))
		return sr_oom_malfunction(r->e);

	if (ssunlikely(ss_filepread(&n->file, 0, dest->s, n->file.size) == -1)) {
		sr_malfunction(r->e, "db file '%s' read error: %s",
		               ss_pathof(&n->file.path),
		               strerror(errno));
		return -1;
	}

	ss_bufadvance(dest, n->file.size);
	return 0;
}
/*
 * Create the "drop" marker file inside the repository directory
 * (i->conf->path) and close it again.
 * Returns 0 on success; reports a malfunction and returns -1 when the
 * file cannot be created.
 */
static int si_dropmark(si *i, sr *r)
{
	/* create drop file */
	char marker[1024];
	snprintf(marker, sizeof(marker), "%s/drop", i->conf->path);

	srfile drop;
	sr_fileinit(&drop, r->a);
	if (srunlikely(sr_filenew(&drop, marker) == -1)) {
		sr_malfunction(r->e, "drop file '%s' create error: %s",
		               marker, strerror(errno));
		return -1;
	}
	sr_fileclose(&drop);
	return 0;
}
/*
 * Handle a repository that carries a "drop" marker file.
 *
 * Returns 0 when no marker exists. When it exists: returns -1 with a
 * malfunction if the scheme sets path_fail_on_drop, otherwise drops the
 * repository with si_droprepository() and returns 1 on success or -1 on
 * failure.
 */
static inline int si_recoverdrop(si *i, sr *r)
{
	char marker[1024];
	snprintf(marker, sizeof(marker), "%s/drop", i->scheme->path);

	int exists = ss_vfsexists(r->vfs, marker);
	if (sslikely(! exists))
		return 0;

	if (i->scheme->path_fail_on_drop) {
		sr_malfunction(r->e, "attempt to recover a dropped database: %s:",
		               i->scheme->path);
		return -1;
	}
	if (ssunlikely(si_droprepository(i->scheme, r, 0) == -1))
		return -1;
	return 1;
}
/*
 * Destroy a node: optionally unlink its file (when gc is set and the
 * file has a name), free its branches, close it and release the node
 * memory.
 *
 * All steps run regardless of earlier failures. Returns 0 when the
 * unlink and the close succeeded, -1 otherwise.
 */
int si_nodefree(sinode *n, sr *r, int gc)
{
	int status = 0;

	if (gc && n->file.file) {
		if (ssunlikely(ss_fileunlink(n->file.file) == -1)) {
			sr_malfunction(r->e, "db file '%s' unlink error: %s",
			               n->file.file, strerror(errno));
			status = -1;
		}
	}

	si_nodefree_branches(n, r);
	if (ssunlikely(si_nodeclose(n, r) == -1))
		status = -1;
	ss_free(r->a, n);
	return status;
}
/*
 * Create a new node file named "<parent>.<id>.db.incomplete" (as built
 * by sr_pathAB()) under conf->path and write build and the branch index
 * into it with sd_buildwrite().
 *
 * On success the node's own branch becomes its branch list head and
 * branch_count is incremented. Returns 0 on success, -1 on failure.
 */
int si_nodecreate(sinode *n, sr *r, siconf *conf, sdid *id, sdindex *i, sdbuild *build)
{
	si_branchset(&n->self, i);

	srpath incomplete;
	sr_pathAB(&incomplete, conf->path, id->parent, id->id, ".db.incomplete");
	if (srunlikely(sr_filenew(&n->file, incomplete.path) == -1)) {
		sr_malfunction(r->e, "db file '%s' create error: %s",
		               incomplete.path, strerror(errno));
		return -1;
	}
	if (srunlikely(sd_buildwrite(build, r, &n->self.index, &n->file) == -1))
		return -1;

	n->branch = &n->self;
	n->branch_count++;
	return 0;
}
/*
 * Destroy a node: when gc is set and the file has a path, advise on the
 * whole file range and unlink it; then free the branches, close the
 * node (forwarding gc) and release the node memory.
 *
 * All steps run regardless of earlier failures. Returns 0 when the
 * unlink and the close succeeded, -1 otherwise.
 */
int si_nodefree(sinode *n, sr *r, int gc)
{
	int status = 0;

	if (gc && ss_pathis_set(&n->file.path)) {
		ss_fileadvise(&n->file, 0, 0, n->file.size);
		if (ssunlikely(ss_vfsunlink(r->vfs, ss_pathof(&n->file.path)) == -1)) {
			sr_malfunction(r->e, "db file '%s' unlink error: %s",
			               ss_pathof(&n->file.path),
			               strerror(errno));
			status = -1;
		}
	}

	si_nodefree_branches(n, r);
	if (ssunlikely(si_nodeclose(n, r, gc) == -1))
		status = -1;
	ss_free(r->a, n);
	return status;
}
/*
 * Recover an index from its repository directory.
 *
 * If si_recoverdrop() returns non-zero, recovery stops and 0 is
 * returned. Otherwise the directory is tracked, the tracked nodes are
 * validated and completed, and the runtime sequence numbers (nsn, lsn)
 * are raised to the maxima seen while tracking.
 *
 * Returns 0 on success. On failure the track is freed and -1 is
 * returned; an empty track is reported as a corrupted repository. The
 * scratch buffer is freed on every path.
 */
static inline int si_recoverindex(si *i, sr *r)
{
	if (srunlikely(si_recoverdrop(i)))
		return 0;

	sitrack track;
	si_trackinit(&track);
	srbuf buf;
	sr_bufinit(&buf);

	int recovered = 0;
	do {
		if (srunlikely(si_trackdir(&track, r, i) == -1))
			break;
		if (srunlikely(track.count == 0)) {
			sr_malfunction(r->e, "corrupted database repository: %s",
			               i->conf->path);
			break;
		}
		if (srunlikely(si_trackvalidate(&track, &buf, r, i) == -1))
			break;
		if (srunlikely(si_recovercomplete(&track, r, i, &buf) == -1))
			break;
		/* set actual metrics */
		if (track.nsn > r->seq->nsn)
			r->seq->nsn = track.nsn;
		if (track.lsn > r->seq->lsn)
			r->seq->lsn = track.lsn;
		recovered = 1;
	} while (0);

	sr_buffree(&buf, r->a);
	if (recovered)
		return 0;
	si_trackfree(&track, r);
	return -1;
}
/*
 * Allocate a node and put it into its initial state: zeroed counters
 * and flags, an initialized own branch, no branch list, an initialized
 * file object, two empty in-memory indexes and initialized tree, queue
 * and list links.
 *
 * Returns the new node, or NULL (with a malfunction reported) when the
 * allocation fails.
 */
sinode *si_nodenew(sr *r)
{
	sinode *node = (sinode*)sr_malloc(r->a, sizeof(sinode));
	if (srunlikely(node == NULL)) {
		sr_malfunction(r->e, "%s", "memory allocation failed");
		return NULL;
	}

	/* plain state */
	node->recover     = 0;
	node->backup      = 0;
	node->flags       = 0;
	node->update_time = 0;
	node->used        = 0;

	/* branches */
	si_branchinit(&node->self);
	node->branch       = NULL;
	node->branch_count = 0;

	/* file and in-memory indexes */
	sr_fileinit(&node->file, r->a);
	sv_indexinit(&node->i0);
	sv_indexinit(&node->i1);

	/* container links */
	sr_rbinitnode(&node->node);
	sr_rqinitnode(&node->nodecompact);
	sr_rqinitnode(&node->nodebranch);
	sr_listinit(&node->commit);
	return node;
}