Пример #1
0
static inline int
si_qgetindex(siquery *q, sinode *node)
{
	svindex *second;
	svindex *first = si_nodeindex_priority(node, &second);
	ssiter i;
	ss_iterinit(sv_indexiter, &i);
	int rc;
	if (first->count > 0) {
		rc = ss_iteropen(sv_indexiter, &i, q->r, first,
		                 SS_GTE, q->key, q->keysize);
		if (rc) {
			goto result;
		}
	}
	if (sslikely(second == NULL || !second->count))
		return 0;
	rc = ss_iteropen(sv_indexiter, &i, q->r, second,
	                 SS_GTE, q->key, q->keysize);
	if (! rc) {
		return 0;
	}
result:;
	sv *v = ss_iterof(sv_indexiter, &i);
	assert(v != NULL);
	svv *visible = v->v;
	if (sslikely(! q->has)) {
		visible = sv_visible((svv*)v->v, q->vlsn);
		if (visible == NULL)
			return 0;
	}
	sv vret;
	sv_init(&vret, &sv_vif, visible, NULL);
	return si_qgetresult(q, &vret, 0);
}
Пример #2
0
sxstate sx_set_autocommit(sxmanager *m, sxindex *index, sx *x, svv *v)
{
	if (sslikely(m->count_rw == 0)) {
		sx_init(m, x);
		svlogv lv;
		lv.id   = index->dsn;
		lv.next = UINT32_MAX;
		sv_init(&lv.v, &sv_vif, v, NULL);
		sv_logadd(&x->log, m->r->a, &lv, index->ptr);
		sr_seq(m->r->seq, SR_TSNNEXT);
		return SXCOMMIT;
	}
	sx_begin(m, x, SXRW, 0);
	int rc = sx_set(x, index, v);
	if (ssunlikely(rc == -1)) {
		sx_rollback(x);
		return SXROLLBACK;
	}
	sxstate s = sx_prepare(x, NULL, NULL);
	if (sslikely(s == SXPREPARE))
		sx_commit(x);
	else
	if (s == SXLOCK)
		sx_rollback(x);
	return s;
}
Пример #3
0
static inline int
se_document_opt(const char *path)
{
	if (sslikely((intptr_t)path <= (intptr_t)SE_DOCUMENT_FIELD_9))
		return (int)(intptr_t)path;
	switch (path[0]) {
	case 'o':
		if (sslikely(strcmp(path, "order") == 0))
			return SE_DOCUMENT_ORDER;
		break;
	case 'l':
		if (sslikely(strcmp(path, "log") == 0))
			return SE_DOCUMENT_LOG;
		break;
	case 'p':
		if (sslikely(strcmp(path, "prefix") == 0))
			return SE_DOCUMENT_PREFIX;
		break;
	case 'r':
		if (sslikely(strcmp(path, "raw") == 0))
			return SE_DOCUMENT_RAW;
		break;
	}
	return SE_DOCUMENT_FIELD;
}
Пример #4
0
int se_reqwrite(sereq *r)
{
	sereqarg *arg = &r->arg;
	svlog *log = r->arg.log;
	se *e = se_of(r->object);
	/* set lsn */
	sl_prepare(&e->lp, log, arg->lsn);
	/* log write */
	if (! arg->recover) {
		sltx tl;
		sl_begin(&e->lp, &tl);
		int rc = sl_write(&tl, log);
		if (ssunlikely(rc == -1)) {
			sl_rollback(&tl);
			r->rc = -1;
			return -1;
		}
		sl_commit(&tl);
	}
	/* commit */
	if (sslikely(arg->vlsn_generate))
		arg->vlsn = sx_vlsn(&e->xm);
	uint64_t now = ss_utime();
	svlogindex *i   = (svlogindex*)log->index.s;
	svlogindex *end = (svlogindex*)log->index.p;
	while (i < end) {
		sedb *db = i->ptr;
		sitx ti;
		si_begin(&ti, &db->index, arg->vlsn, now, log, i);
		si_write(&ti, arg->recover);
		si_commit(&ti);
		i++;
	}
	return 0;
}
Пример #5
0
sxstate sx_begin(sxmanager *m, sx *x, sxtype type, uint64_t vlsn)
{
	sx_promote(x, SXREADY);
	x->type = type;
	x->log_read = -1;
	sr_seqlock(m->r->seq);
	x->csn = m->csn;
	x->id = sr_seqdo(m->r->seq, SR_TSNNEXT);
	if (sslikely(vlsn == 0))
		x->vlsn = sr_seqdo(m->r->seq, SR_LSN);
	else
		x->vlsn = vlsn;
	sr_sequnlock(m->r->seq);
	sx_init(m, x);
	ss_spinlock(&m->lock);
	ssrbnode *n = NULL;
	int rc = sx_matchtx(&m->i, NULL, (char*)&x->id, sizeof(x->id), &n);
	if (rc == 0 && n) {
		assert(0);
	} else {
		ss_rbset(&m->i, n, rc, &x->node);
	}
	if (type == SXRO)
		m->count_rd++;
	else
		m->count_rw++;
	ss_spinunlock(&m->lock);
	return SXREADY;
}
Пример #6
0
int si_recover(si *i)
{
	sr *r = i->r;
	int exist = ss_vfsexists(r->vfs, i->scheme->path);
	if (exist == 0)
		goto deploy;
	if (i->scheme->path_fail_on_exists) {
		sr_error(r->e, "directory '%s' already exists", i->scheme->path);
		return -1;
	}
	int rc = si_recoverdrop(i, r);
	switch (rc) {
	case -1: return -1;
	case  1: goto deploy;
	}
	rc = si_schemerecover(i->scheme, r);
	if (ssunlikely(rc == -1))
		return -1;
	r->scheme = &i->scheme->scheme;
	r->fmt = i->scheme->fmt;
	r->fmt_storage = i->scheme->fmt_storage;
	sdsnapshot snapshot;
	sd_snapshot_init(&snapshot);
	rc = si_recoversnapshot(i, r, &snapshot);
	if (ssunlikely(rc == -1)) {
		sd_snapshot_free(&snapshot, r);
		return -1;
	}
	rc = si_recoverindex(i, r, &snapshot);
	sd_snapshot_free(&snapshot, r);
	if (sslikely(rc <= 0))
		return rc;
deploy:
	return si_deploy(i, r, !exist);
}
Пример #7
0
static inline int
si_plannerpeek_gc(siplanner *p, siplan *plan)
{
	/* try to peek a node with a biggest number
	 * of branches which is ready for gc */
	int rc_inprogress = 0;
	sinode *n;
	ssrqnode *pn = NULL;
	while ((pn = ss_rqprev(&p->compact, pn))) {
		n = sscast(pn, sinode, nodecompact);
		sdindexheader *h = n->self.index.h;
		if (sslikely(h->dupkeys == 0) || (h->dupmin >= plan->a))
			continue;
		uint32_t used = (h->dupkeys * 100) / h->keys;
		if (used >= plan->b) {
			if (n->flags & SI_LOCK) {
				rc_inprogress = 2;
				continue;
			}
			goto match;
		}
	}
	if (rc_inprogress)
		plan->explain = SI_ERETRY;
	return rc_inprogress;
match:
	si_nodelock(n);
	plan->explain = SI_ENONE;
	plan->node = n;
	return 1;
}
Пример #8
0
void sx_gc(sx *t, sr *r)
{
	sxmanager *m = t->manager;
	ssiter i;
	ss_iterinit(ss_bufiter, &i);
	ss_iteropen(ss_bufiter, &i, &t->log.buf, sizeof(svlogv));
	if (sslikely(t->s == SXCOMMIT)) {
		for (; ss_iterhas(ss_bufiter, &i); ss_iternext(ss_bufiter, &i))
		{
			svlogv *lv = ss_iterof(ss_bufiter, &i);
			sxv *v = lv->vgc;
			ss_free(m->asxv, v);
		}
	} else
	if (t->s == SXROLLBACK) {
		int gc = 0;
		for (; ss_iterhas(ss_bufiter, &i); ss_iternext(ss_bufiter, &i))
		{
			svlogv *lv = ss_iterof(ss_bufiter, &i);
			sxv *v = lv->v.v;
			gc += sv_vsize((svv*)v->v);
			sx_vfree(m->a, m->asxv, v);
		}
		ss_quota(r->quota, SS_QREMOVE, gc);
	}
	sv_logfree(&t->log, m->a);
	t->s = SXUNDEF;
}
Пример #9
0
sxstate sx_set_autocommit(sxmanager *m, sxindex *index, sx *x, svlog *log, svv *v)
{
	if (sslikely(m->count_rw == 0)) {
		sx_init(m, x, log);
		svlogv lv;
		lv.index_id = index->dsn;
		lv.next     = UINT32_MAX;
		lv.v        = v;
		lv.ptr      = NULL;
		sv_logadd(x->log, index->r, &lv);
		sr_seq(index->r->seq, SR_TSNNEXT);
		sx_promote(x, SX_COMMIT);
		return SX_COMMIT;
	}
	sx_begin(m, x, SX_RW, log, 0);
	int rc = sx_set(x, index, v);
	if (ssunlikely(rc == -1)) {
		sx_rollback(x);
		return SX_ROLLBACK;
	}
	sxstate s = sx_prepare(x, NULL, NULL);
	switch (s) {
	case SX_PREPARE:
		s = sx_commit(x);
		break;
	case SX_LOCK:
		s = sx_rollback(x);
		break;
	case SX_ROLLBACK:
		break;
	default:
		assert(0);
	}
	return s;
}
Пример #10
0
int se_reqread(sereq *r)
{
	sereqarg *arg = &r->arg;
	sedb *db = (sedb*)r->db;
	uint32_t keysize;
	void *key;
	if (sslikely(arg->v.v)) {
		keysize = sv_size(&arg->v);
		key = sv_pointer(&arg->v);
	} else {
		keysize = 0;
		key = NULL;
	}
	char *prefix;
	uint32_t prefixsize;
	if (arg->vprefix.v) {
		void *vptr = sv_vpointer(arg->vprefix.v);
		prefix = sf_key(vptr, 0);
		prefixsize = sf_keysize(vptr, 0);
	} else {
		prefix = NULL;
		prefixsize = 0;
	}
	if (sslikely(arg->vlsn_generate))
		arg->vlsn = sr_seq(db->r.seq, SR_LSN);
	sitx x;
	si_begin(&x, &db->index, 1);
	siread q;
	si_readopen(&q, &x, arg->cache,
	            arg->order,
	            arg->vlsn,
	            prefix,
	            prefixsize, key, keysize);
	if (arg->update)
		si_readupdate(&q, &arg->vup, arg->update_eq);
	if (arg->cache_only)
		si_readcache_only(&q);
	if (arg->has)
		si_readhas(&q);
	r->rc = si_read(&q);
	r->read_disk  = q.read_disk;
	r->read_cache = q.read_cache;
	r->v = q.result.v;
	si_readclose(&q);
	si_commit(&x);
	return r->rc;
}
Пример #11
0
static int
so_snapshotfree(sosnapshot *s)
{
	so *e = so_of(&s->o);
	sx_rollback(&s->t);
	if (sslikely(s->name)) {
		ss_free(&e->a, s->name);
		s->name = NULL;
	}
	ss_free(&e->a_snapshot, s);
	return 0;
}
Пример #12
0
static inline int
sx_preparecb(sx *x, svlogv *v, uint64_t lsn, sxpreparef prepare, void *arg)
{
	if (sslikely(lsn == x->vlsn))
		return 0;
	if (prepare == NULL)
		return 0;
	sxindex *i = ((sxv*)v->ptr)->index;
	if (prepare(x, v->v, i->object, arg))
		return 1;
	return 0;
}
Пример #13
0
void ss_rbset(ssrb *t, ssrbnode *p, int prel, ssrbnode *n)
{
	n->color = SS_RBRED;
	n->p     = p;
	n->l     = NULL;
	n->r     = NULL;
	if (sslikely(p)) {
		assert(prel != 0);
		if (prel > 0)
			p->l = n;
		else
			p->r = n;
	} else {
		t->root = n;
	}
	ss_rbset_fixup(t, n);
}
Пример #14
0
static inline int
si_recoverdrop(si *i, sr *r)
{
	char path[1024];
	snprintf(path, sizeof(path), "%s/drop", i->scheme->path);
	int rc = ss_vfsexists(r->vfs, path);
	if (sslikely(! rc))
		return 0;
	if (i->scheme->path_fail_on_drop) {
		sr_malfunction(r->e, "attempt to recover a dropped database: %s:",
		               i->scheme->path);
		return -1;
	}
	rc = si_droprepository(i->scheme, r, 0);
	if (ssunlikely(rc == -1))
		return -1;
	return 1;
}
Пример #15
0
static inline int
sl_iterprepare(sliter *i)
{
	srversion *ver = (srversion*)i->map.p;
	if (! sr_versioncheck(ver))
		return sr_malfunction(i->r->e, "bad log file '%s' version",
		                      i->log->file);
	if (ssunlikely(i->log->size < (sizeof(srversion))))
		return sr_malfunction(i->r->e, "corrupted log file '%s': bad size",
		                      i->log->file);
	slv *next = (slv*)((char*)i->map.p + sizeof(srversion));
	int rc = sl_iternext_of(i, next, 1);
	if (ssunlikely(rc == -1))
		return -1;
	if (sslikely(i->next))
		return sl_itercontinue_of(i);
	return 0;
}
Пример #16
0
sxstate sx_prepare(sx *x, sxpreparef prepare, void *arg)
{
	uint64_t lsn = sr_seq(x->manager->seq, SR_LSN);
	/* proceed read-only transactions */
	if (x->type == SX_RO || sv_logcount_write(x->log) == 0)
		return sx_promote(x, SX_PREPARE);
	ssiter i;
	ss_iterinit(ss_bufiter, &i);
	ss_iteropen(ss_bufiter, &i, &x->log->buf, sizeof(svlogv));
	sxstate rc;
	for (; ss_iterhas(ss_bufiter, &i); ss_iternext(ss_bufiter, &i))
	{
		svlogv *lv = ss_iterof(ss_bufiter, &i);
		sxv *v = lv->ptr;
		if ((int)v->lo == x->log_read)
			break;
		if (sx_vaborted(v))
			return sx_promote(x, SX_ROLLBACK);
		if (sslikely(v->prev == NULL)) {
			rc = sx_preparecb(x, lv, lsn, prepare, arg);
			if (ssunlikely(rc != 0))
				return sx_promote(x, SX_ROLLBACK);
			continue;
		}
		if (sx_vcommitted(v->prev)) {
			if (v->prev->csn > x->csn)
				return sx_promote(x, SX_ROLLBACK);
			continue;
		}
		/* force commit for read-only conflicts */
		sxindex *i = v->prev->index;
		if (sv_vflags(v->prev->v, i->r) & SVGET) {
			rc = sx_preparecb(x, lv, lsn, prepare, arg);
			if (ssunlikely(rc != 0))
				return sx_promote(x, SX_ROLLBACK);
			continue;
		}
		return sx_promote(x, SX_LOCK);
	}
	return sx_promote(x, SX_PREPARE);
}
Пример #17
0
static inline void
ss_rbrotate_right(ssrb *t, ssrbnode *n)
{
	ssrbnode *p = n;
	ssrbnode *q = n->l;
	ssrbnode *parent = n->p;
	if (sslikely(p->p != NULL)) {
		if (parent->l == p)
			parent->l = q;
		else
			parent->r = q;
	} else {
		t->root = q;
	}
	q->p = parent;
	p->p = q;
	p->l = q->r;
	if (p->l)
		p->l->p = p;
	q->r = p;
}
Пример #18
0
static inline void
sx_rollback_svp(sx *x, ssiter *i, int free)
{
	sxmanager *m = x->manager;
	int gc = 0;
	for (; ss_iterhas(ss_bufiter, i); ss_iternext(ss_bufiter, i))
	{
		svlogv *lv = ss_iterof(ss_bufiter, i);
		sxv *v = lv->v.v;
		/* remove from index and replace head with
		 * a first waiter */
		sx_untrack(v);
		/* translate log version from sxv to svv */
		sv_init(&lv->v, &sv_vif, v->v, NULL);
		if (free) {
			if (sslikely(! (v->v->flags & SVGET)))
				gc += sv_vsize((svv*)v->v);
			sx_vfree(m->r, m->asxv, v);
		}
	}
	ss_quota(m->r->quota, SS_QREMOVE, gc);
}
Пример #19
0
int sd_buildend(sdbuild *b, sr *r)
{
	/* update sizes */
	sdbuildref *ref = sd_buildref(b);
	ref->msize = ss_bufused(&b->m) - ref->m;
	ref->vsize = ss_bufused(&b->v) - ref->v;
	ref->ksize = ss_bufused(&b->k) - ref->k;
	ref->csize = 0;
	/* calculate data crc (non-compressed) */
	sdpageheader *h = sd_buildheader(b);
	uint32_t crc = 0;
	if (sslikely(b->crc)) {
		crc = ss_crcp(r->crc, b->m.s + ref->m, ref->msize, 0);
		crc = ss_crcp(r->crc, b->v.s + ref->v, ref->vsize, crc);
		crc = ss_crcp(r->crc, b->k.s + ref->k, ref->ksize, crc);
	}
	h->crcdata = crc;
	/* compression */
	if (b->compress) {
		int rc = sd_buildcompress(b, r);
		if (ssunlikely(rc == -1))
			return -1;
		ref->csize = ss_bufused(&b->c) - ref->c;
	}
	/* update page header */
	int total = ref->msize + ref->vsize + ref->ksize;
	h->sizekeys = ref->ksize;
	h->sizeorigin = total - sizeof(sdpageheader);
	h->size = h->sizeorigin;
	if (b->compress)
		h->size = ref->csize - sizeof(sdpageheader);
	else
		h->size = h->sizeorigin;
	h->crc = ss_crcs(r->crc, h, sizeof(sdpageheader), 0);
	if (b->compress)
		memcpy(b->c.s + ref->c, h, sizeof(sdpageheader));
	return 0;
}
Пример #20
0
static inline int
se_confsophia_error(srconf *c, srconfstmt *s)
{
	se *e = s->ptr;
	char *errorp;
	char  error[128];
	error[0] = 0;
	int len = sr_errorcopy(&e->error, error, sizeof(error));
	if (sslikely(len == 0))
		errorp = NULL;
	else
		errorp = error;
	srconf conf = {
		.key      = c->key,
		.flags    = c->flags,
		.type     = c->type,
		.function = NULL,
		.value    = errorp,
		.ptr      = NULL,
		.next     = NULL
	};
	return se_confv(&conf, s);
}

static inline srconf*
se_confsophia(se *e, seconfrt *rt, srconf **pc)
{
	srconf *sophia = *pc;
	srconf *p = NULL;
	sr_C(&p, pc, se_confv, "version", SS_STRING, rt->version, SR_RO, NULL);
	sr_C(&p, pc, se_confv, "version_storage", SS_STRING, rt->version_storage, SR_RO, NULL);
	sr_C(&p, pc, se_confv, "build", SS_STRING, rt->build, SR_RO, NULL);
	sr_C(&p, pc, se_confsophia_error, "error", SS_STRING, NULL, SR_RO, NULL);
	sr_c(&p, pc, se_confv_offline, "path", SS_STRINGPTR, &e->conf.path);
	sr_c(&p, pc, se_confv_offline, "path_create", SS_U32, &e->conf.path_create);
	sr_c(&p, pc, se_confv_offline, "recover", SS_U32, &e->conf.recover);
	return sr_C(NULL, pc, NULL, "sophia", SS_UNDEF, sophia, SR_NS, NULL);
}
Пример #21
0
sxstate sx_prepare(sx *x, sxpreparef prepare, void *arg)
{
	uint64_t lsn = sr_seq(x->manager->r->seq, SR_LSN);
	/* proceed read-only transactions */
	if (x->type == SXRO || sv_logcount_write(&x->log) == 0)
		return sx_promote(x, SXPREPARE);
	ssiter i;
	ss_iterinit(ss_bufiter, &i);
	ss_iteropen(ss_bufiter, &i, &x->log.buf, sizeof(svlogv));
	for (; ss_iterhas(ss_bufiter, &i); ss_iternext(ss_bufiter, &i))
	{
		svlogv *lv = ss_iterof(ss_bufiter, &i);
		sxv *v = lv->v.v;
		if ((int)v->lo == x->log_read)
			break;
		if (sx_vaborted(v))
			return sx_promote(x, SXROLLBACK);
		if (sslikely(v->prev == NULL)) {
			if (prepare && lsn != x->vlsn) {
				sxindex *i = v->index;
				if (prepare(x, &lv->v, arg, i->ptr))
					return sx_promote(x, SXROLLBACK);
			}
			continue;
		}
		if (sx_vcommitted(v->prev)) {
			if (v->prev->csn > x->csn)
				return sx_promote(x, SXROLLBACK);
			continue;
		}
		/* force commit for read-only conflicts */
		if (v->prev->v->flags & SVGET)
			continue;
		return sx_promote(x, SXLOCK);
	}
	return sx_promote(x, SXPREPARE);
}
Пример #22
0
static inline svv*
sv_vset(svv *head, svv *v)
{
	/* default */
	if (sslikely(head->lsn < v->lsn)) {
		v->next = head;
		head->flags |= SVDUP;
		return v;
	}
	/* redistribution (starting from highest lsn) */
	svv *prev = head;
	svv *c = head->next;
	while (c) {
		assert(c->lsn != v->lsn);
		if (c->lsn < v->lsn)
			break;
		prev = c;
		c = c->next;
	}
	prev->next = v;
	v->next = c;
	v->flags |= SVDUP;
	return head;
}
Пример #23
0
static inline sshot void*
ss_slabamalloc(ssa *a, int size ssunused)
{
	ssslaba *s = (ssslaba*)a->priv;
	assert(size == (int)s->slab_size);

	ss_spinlock(&s->lock);
	char *slab;
	if (sslikely(s->pool_free_count)) {
		slab = s->pool_free;
		s->pool_free = *(char**)slab;
		s->pool_free_count--;
		if (ssunlikely(s->pool_free_count == 0))
			s->pool_free = NULL;
	} else
	if (ssunlikely(s->pool_next == s->pool_end)) {
		slab = NULL;
	} else {
		slab = s->pool_next;
		s->pool_next += s->slab_size;
	}
	ss_spinunlock(&s->lock);
	return slab;
}
Пример #24
0
static inline int
se_metasophia_error(srmeta *c, srmetastmt *s)
{
	se *e = s->ptr;
	char *errorp;
	char  error[128];
	error[0] = 0;
	int len = sr_errorcopy(&e->error, error, sizeof(error));
	if (sslikely(len == 0))
		errorp = NULL;
	else
		errorp = error;
	srmeta meta = {
		.key      = c->key,
		.flags    = c->flags,
		.type     = c->type,
		.function = NULL,
		.value    = errorp,
		.ptr      = NULL,
		.next     = NULL
	};
	return se_metav(&meta, s);
}

static inline srmeta*
se_metasophia(se *e, semetart *rt, srmeta **pc)
{
	srmeta *sophia = *pc;
	srmeta *p = NULL;
	sr_M(&p, pc, se_metav, "version", SS_STRING, rt->version, SR_RO, NULL);
	sr_M(&p, pc, se_metav, "build", SS_STRING, rt->build, SR_RO, NULL);
	sr_M(&p, pc, se_metasophia_error, "error", SS_STRING, NULL, SR_RO, NULL);
	sr_m(&p, pc, se_metav_offline, "path", SS_STRINGPTR, &e->meta.path);
	sr_m(&p, pc, se_metav_offline, "path_create", SS_U32, &e->meta.path_create);
	return sr_M(NULL, pc, NULL, "sophia", SS_UNDEF, sophia, SR_NS, NULL);
}
Пример #25
0
sxstate sx_begin(sxmanager *m, sx *t, uint64_t vlsn)
{
	t->s = SXREADY; 
	t->complete = 0;
	sr_seqlock(m->seq);
	t->id = sr_seqdo(m->seq, SR_TSNNEXT);
	if (sslikely(vlsn == 0))
		t->vlsn = sr_seqdo(m->seq, SR_LSN);
	else
		t->vlsn = vlsn;
	sr_sequnlock(m->seq);
	sx_init(m, t);
	ss_spinlock(&m->lock);
	ssrbnode *n = NULL;
	int rc = sx_matchtx(&m->i, NULL, (char*)&t->id, sizeof(t->id), &n);
	if (rc == 0 && n) {
		assert(0);
	} else {
		ss_rbset(&m->i, n, rc, &t->node);
	}
	m->count++;
	ss_spinunlock(&m->lock);
	return SXREADY;
}
Пример #26
0
static inline int
si_rangebranch(siread *q, sinode *n, sibranch *b, svmerge *m)
{
	sicachebranch *c = si_cachefollow(q->cache);
	assert(c->branch == b);
	/* iterate cache */
	if (ss_iterhas(sd_read, &c->i)) {
		svmergesrc *s = sv_mergeadd(m, &c->i);
		si_readstat(q, 1, n, 1);
		s->ptr = c;
		return 1;
	}
	if (c->open) {
		return 1;
	}
	if (q->cache_only) {
		return 2;
	}
	c->open = 1;
	/* choose compression type */
	int compression;
	ssfilterif *compression_if;
	if (! si_branchis_root(b)) {
		compression    = q->index->scheme->compression_branch;
		compression_if = q->index->scheme->compression_branch_if;
	} else {
		compression    = q->index->scheme->compression;
		compression_if = q->index->scheme->compression_if;
	}
	sdreadarg arg = {
		.index           = &b->index,
		.buf             = &c->buf_a,
		.buf_xf          = &c->buf_b,
		.buf_read        = &q->index->readbuf,
		.index_iter      = &c->index_iter,
		.page_iter       = &c->page_iter,
		.use_memory      = n->in_memory,
		.use_mmap        = q->index->scheme->mmap,
		.use_mmap_copy   = 1,
		.use_compression = compression,
		.compression_if  = compression_if,
		.has             = 0,
		.has_vlsn        = 0,
		.o               = q->order,
		.memory          = &b->copy,
		.mmap            = &n->map,
		.file            = &n->file,
		.r               = q->r
	};
	ss_iterinit(sd_read, &c->i);
	int rc = ss_iteropen(sd_read, &c->i, &arg, q->key, q->keysize);
	int reads = sd_read_stat(&c->i);
	si_readstat(q, 0, n, reads);
	if (ssunlikely(rc == -1))
		return -1;
	if (ssunlikely(! ss_iterhas(sd_read, &c->i)))
		return 0;
	svmergesrc *s = sv_mergeadd(m, &c->i);
	s->ptr = c;
	return 1;
}

static inline int
si_range(siread *q)
{
	assert(q->has == 0);

	ssiter i;
	ss_iterinit(si_iter, &i);
	ss_iteropen(si_iter, &i, q->r, q->index, q->order, q->key, q->keysize);
	sinode *node;
next_node:
	node = ss_iterof(si_iter, &i);
	if (ssunlikely(node == NULL))
		return 0;
	si_txtrack(q->x, node);

	/* prepare sources */
	svmerge *m = &q->merge;
	int count = node->branch_count + 2 + 1;
	int rc = sv_mergeprepare(m, q->r, count);
	if (ssunlikely(rc == -1)) {
		sr_errorreset(q->r->e);
		return -1;
	}

	/* external source (upsert) */
	svmergesrc *s;
	sv upbuf_reserve;
	ssbuf upbuf;
	if (ssunlikely(q->upsert_v && q->upsert_v->v)) {
		ss_bufinit_reserve(&upbuf, &upbuf_reserve, sizeof(upbuf_reserve));
		ss_bufadd(&upbuf, NULL, (void*)&q->upsert_v, sizeof(sv*));
		s = sv_mergeadd(m, NULL);
		ss_iterinit(ss_bufiterref, &s->src);
		ss_iteropen(ss_bufiterref, &s->src, &upbuf, sizeof(sv*));
	}

	/* in-memory indexes */
	svindex *second;
	svindex *first = si_nodeindex_priority(node, &second);
	if (first->count) {
		s = sv_mergeadd(m, NULL);
		ss_iterinit(sv_indexiter, &s->src);
		ss_iteropen(sv_indexiter, &s->src, q->r, first, q->order,
		            q->key, q->keysize);
	}
	if (ssunlikely(second && second->count)) {
		s = sv_mergeadd(m, NULL);
		ss_iterinit(sv_indexiter, &s->src);
		ss_iteropen(sv_indexiter, &s->src, q->r, second, q->order,
		            q->key, q->keysize);
	}

	/* cache and branches */
	rc = si_cachevalidate(q->cache, node);
	if (ssunlikely(rc == -1)) {
		sr_oom(q->r->e);
		return -1;
	}
	sibranch *b = node->branch;
	while (b) {
		rc = si_rangebranch(q, node, b, m);
		if (ssunlikely(rc == -1 || rc == 2))
			return rc;
		b = b->next;
	}

	/* merge and filter data stream */
	ssiter j;
	ss_iterinit(sv_mergeiter, &j);
	ss_iteropen(sv_mergeiter, &j, q->r, m, q->order);
	ssiter k;
	ss_iterinit(sv_readiter, &k);
	ss_iteropen(sv_readiter, &k, q->r, &j, &q->index->u, q->vlsn, 0);
	sv *v = ss_iterof(sv_readiter, &k);
	if (ssunlikely(v == NULL)) {
		sv_mergereset(&q->merge);
		ss_iternext(si_iter, &i);
		goto next_node;
	}

	rc = 1;
	/* convert upsert search to SS_EQ */
	if (q->upsert_eq) {
		rc = sr_compare(q->r->scheme, sv_pointer(v), sv_size(v),
		                q->key, q->keysize);
		rc = rc == 0;
	}
	/* do prefix search */
	if (q->prefix && rc) {
		rc = sr_compareprefix(q->r->scheme, q->prefix, q->prefixsize,
		                      sv_pointer(v),
		                      sv_size(v));
	}
	if (sslikely(rc == 1)) {
		if (ssunlikely(si_readdup(q, v) == -1))
			return -1;
	}

	/* skip a possible duplicates from data sources */
	ss_iternext(sv_readiter, &k);
	return rc;
}
Пример #27
0
sinode *si_bootstrap(si *i, uint64_t parent)
{
	sr *r = i->r;
	/* create node */
	sinode *n = si_nodenew(r);
	if (ssunlikely(n == NULL))
		return NULL;
	sdid id = {
		.parent = parent,
		.flags  = 0,
		.id     = sr_seq(r->seq, SR_NSNNEXT)
	};
	int rc;
	rc = si_nodecreate(n, r, i->scheme, &id);
	if (ssunlikely(rc == -1))
		goto e0;
	n->branch = &n->self;
	n->branch_count++;

	/* in-memory mode support */
	ssblob *blob = NULL;
	if (i->scheme->storage == SI_SIN_MEMORY) {
		blob = &n->self.copy;
		rc = ss_blobensure(blob, 4096);
		if (ssunlikely(rc == -1))
			goto e0;
		n->in_memory = 1;
	}

	/* create index with one empty page */
	sdindex index;
	sd_indexinit(&index);
	rc = sd_indexbegin(&index, r);
	if (ssunlikely(rc == -1))
		goto e0;

	ssqf f, *qf = NULL;
	ss_qfinit(&f);

	sdbuild build;
	sd_buildinit(&build);
	rc = sd_buildbegin(&build, r,
	                   i->scheme->node_page_checksum,
	                   i->scheme->compression_key,
	                   i->scheme->compression,
	                   i->scheme->compression_if);
	if (ssunlikely(rc == -1))
		goto e1;
	sd_buildend(&build, r);
	rc = sd_indexadd(&index, r, &build, sizeof(sdseal));
	if (ssunlikely(rc == -1))
		goto e1;

	/* write seal */
	uint64_t seal = n->file.size;
	rc = sd_writeseal(r, &n->file, blob);
	if (ssunlikely(rc == -1))
		goto e1;
	/* write page */
	rc = sd_writepage(r, &n->file, blob, &build);
	if (ssunlikely(rc == -1))
		goto e1;
	/* amqf */
	if (i->scheme->amqf) {
		rc = ss_qfensure(&f, r->a, 0);
		if (ssunlikely(rc == -1))
			goto e1;
		qf = &f;
	}
	rc = sd_indexcommit(&index, r, &id, qf, n->file.size);
	if (ssunlikely(rc == -1))
		goto e1;
	ss_qffree(&f, r->a);
	/* write index */
	rc = sd_writeindex(r, &n->file, blob, &index);
	if (ssunlikely(rc == -1))
		goto e1;
	/* close seal */
	rc = sd_seal(r, &n->file, blob, &index, seal);
	if (ssunlikely(rc == -1))
		goto e1;
	if (blob) {
		rc = ss_blobfit(blob);
		if (ssunlikely(rc == -1))
			goto e1;
	}
	if (i->scheme->mmap) {
		rc = si_nodemap(n, r);
		if (ssunlikely(rc == -1))
			goto e1;
	}
	si_branchset(&n->self, &index);

	sd_buildcommit(&build, r);
	sd_buildfree(&build, r);
	return n;
e1:
	ss_qffree(&f, r->a);
	sd_indexfree(&index, r);
	sd_buildfree(&build, r);
e0:
	si_nodefree(n, r, 0);
	return NULL;
}

static inline int
si_deploy(si *i, sr *r, int create_directory)
{
	/* create directory */
	int rc;
	if (sslikely(create_directory)) {
		rc = ss_vfsmkdir(r->vfs, i->scheme->path, 0755);
		if (ssunlikely(rc == -1)) {
			sr_malfunction(r->e, "directory '%s' create error: %s",
			               i->scheme->path, strerror(errno));
			return -1;
		}
	}
	/* create scheme file */
	rc = si_schemedeploy(i->scheme, r);
	if (ssunlikely(rc == -1)) {
		sr_malfunction_set(r->e);
		return -1;
	}
	/* create initial node */
	sinode *n = si_bootstrap(i, 0);
	if (ssunlikely(n == NULL))
		return -1;
	SS_INJECTION(r->i, SS_INJECTION_SI_RECOVER_0,
	             si_nodefree(n, r, 0);
	             sr_malfunction(r->e, "%s", "error injection");
	             return -1);
	rc = si_nodecomplete(n, r, i->scheme);
	if (ssunlikely(rc == -1)) {
		si_nodefree(n, r, 1);
		return -1;
	}
	si_insert(i, n);
	si_plannerupdate(&i->p, SI_COMPACT|SI_BRANCH|SI_TEMP, n);
	i->size = si_nodesize(n);
	return 1;
}
Пример #28
0
static inline int
si_trackdir(sitrack *track, sr *r, si *i)
{
	DIR *dir = opendir(i->scheme->path);
	if (ssunlikely(dir == NULL)) {
		sr_malfunction(r->e, "directory '%s' open error: %s",
		               i->scheme->path, strerror(errno));
		return -1;
	}
	struct dirent *de;
	while ((de = readdir(dir))) {
		if (ssunlikely(de->d_name[0] == '.'))
			continue;
		uint64_t id_parent = 0;
		uint64_t id = 0;
		int rc = si_process(de->d_name, &id, &id_parent);
		if (ssunlikely(rc == -1))
			continue; /* skip unknown file */
		si_tracknsn(track, id_parent);
		si_tracknsn(track, id);

		sinode *head, *node;
		sspath path;
		switch (rc) {
		case SI_RDB_DBI:
		case SI_RDB_DBSEAL: {
			/* find parent node and mark it as having
			 * incomplete compaction process */
			head = si_trackget(track, id_parent);
			if (sslikely(head == NULL)) {
				head = si_nodenew(r);
				if (ssunlikely(head == NULL))
					goto error;
				head->self.id.id = id_parent;
				head->recover = SI_RDB_UNDEF;
				si_trackset(track, head);
			}
			head->recover |= rc;
			/* remove any incomplete file made during compaction */
			if (rc == SI_RDB_DBI) {
				ss_pathcompound(&path, i->scheme->path, id_parent, id,
				                ".db.incomplete");
				rc = ss_vfsunlink(r->vfs, path.path);
				if (ssunlikely(rc == -1)) {
					sr_malfunction(r->e, "db file '%s' unlink error: %s",
					               path.path, strerror(errno));
					goto error;
				}
				continue;
			}
			assert(rc == SI_RDB_DBSEAL);
			/* recover 'sealed' node */
			node = si_nodenew(r);
			if (ssunlikely(node == NULL))
				goto error;
			node->recover = SI_RDB_DBSEAL;
			ss_pathcompound(&path, i->scheme->path, id_parent, id,
			                ".db.seal");
			rc = si_nodeopen(node, r, i->scheme, &path, NULL);
			if (ssunlikely(rc == -1)) {
				si_nodefree(node, r, 0);
				goto error;
			}
			si_trackset(track, node);
			si_trackmetrics(track, node);
			continue;
		}
		}
		assert(rc == SI_RDB);

		head = si_trackget(track, id);
		if (head != NULL && (head->recover & SI_RDB)) {
			/* loaded by snapshot */
			continue;
		}

		/* recover node */
		node = si_nodenew(r);
		if (ssunlikely(node == NULL))
			goto error;
		node->recover = SI_RDB;
		ss_path(&path, i->scheme->path, id, ".db");
		rc = si_nodeopen(node, r, i->scheme, &path, NULL);
		if (ssunlikely(rc == -1)) {
			si_nodefree(node, r, 0);
			goto error;
		}
		si_trackmetrics(track, node);

		/* track node */
		if (sslikely(head == NULL)) {
			si_trackset(track, node);
		} else {
			/* replace a node previously created by a
			 * incomplete compaction */
			si_trackreplace(track, head, node);
			head->recover &= ~SI_RDB_UNDEF;
			node->recover |= head->recover;
			si_nodefree(head, r, 0);
		}
	}
	closedir(dir);
	return 0;
error:
	closedir(dir);
	return -1;
}
Пример #29
0
static inline int
se_document_opt(const char *path)
{
	switch (path[0]) {
	case 'v':
		if (sslikely(strcmp(path, "value") == 0))
			return SE_DOCUMENT_VALUE;
		break;
	case 'k':
		if (sslikely(strncmp(path, "key", 3) == 0))
			return SE_DOCUMENT_KEY;
		break;
	case 'o':
		if (sslikely(strcmp(path, "order") == 0))
			return SE_DOCUMENT_ORDER;
		break;
	case 'l':
		if (sslikely(strcmp(path, "lsn") == 0))
			return SE_DOCUMENT_LSN;
		if (sslikely(strcmp(path, "log") == 0))
			return SE_DOCUMENT_LOG;
		break;
	case 'p':
		if (sslikely(strcmp(path, "prefix") == 0))
			return SE_DOCUMENT_PREFIX;
		break;
	case 'a':
		if (sslikely(strcmp(path, "arg") == 0))
			return SE_DOCUMENT_ARG;
		if (strcmp(path, "async") == 0)
			return SE_DOCUMENT_ASYNC;
		break;
	case 'r':
		if (sslikely(strcmp(path, "raw") == 0))
			return SE_DOCUMENT_RAW;
		break;
	case 'f':
		if (sslikely(strcmp(path, "flags") == 0))
			return SE_DOCUMENT_FLAGS;
		break;
	case 't':
		if (sslikely(strcmp(path, "type") == 0))
			return SE_DOCUMENT_TYPE;
		break;
	case 'c':
		if (sslikely(strcmp(path, "cache_only") == 0))
			return SE_DOCUMENT_CACHE_ONLY;
		break;
	case 'i':
		if (sslikely(strcmp(path, "immutable") == 0))
			return SE_DOCUMENT_IMMUTABLE;
		break;
	case 's':
		if (sslikely(strcmp(path, "status") == 0))
			return SE_DOCUMENT_STATUS;
		if (strcmp(path, "seq") == 0)
			return SE_DOCUMENT_SEQ;
		break;
	}
	return SE_DOCUMENT_UNKNOWN;
}
Пример #30
0
static inline int
si_split(si *index, sdc *c, ssbuf *result,
         sinode   *parent,
         ssiter   *i,
         uint64_t  size_node,
         uint64_t  size_stream,
         uint32_t  stream,
         uint64_t  vlsn)
{
	sr *r = &index->r;
	uint32_t timestamp = ss_timestamp();
	int rc;
	sdmergeconf mergeconf = {
		.stream              = stream,
		.size_stream         = size_stream,
		.size_node           = size_node,
		.size_page           = index->scheme.compaction.node_page_size,
		.checksum            = index->scheme.compaction.node_page_checksum,
		.expire              = index->scheme.expire,
		.timestamp           = timestamp,
		.compression         = index->scheme.compression,
		.compression_if      = index->scheme.compression_if,
		.direct_io           = index->scheme.direct_io,
		.direct_io_page_size = index->scheme.direct_io_page_size,
		.vlsn                = vlsn
	};
	sinode *n = NULL;
	sdmerge merge;
	rc = sd_mergeinit(&merge, r, i, &c->build, &c->build_index,
	                  &c->upsert, &mergeconf);
	if (ssunlikely(rc == -1))
		return -1;
	while ((rc = sd_merge(&merge)) > 0)
	{
		/* create new node */
		uint64_t id = sr_seq(index->r.seq, SR_NSNNEXT);
		n = si_nodenew(r, id, parent->id);
		if (ssunlikely(n == NULL))
			goto error;
		rc = si_nodecreate(n, r, &index->scheme);
		if (ssunlikely(rc == -1))
			goto error;

		/* write pages */
		uint64_t offset;
		offset = sd_iosize(&c->io, &n->file);
		while ((rc = sd_mergepage(&merge, offset)) == 1) {
			rc = sd_writepage(r, &n->file, &c->io, merge.build);
			if (ssunlikely(rc == -1))
				goto error;
			offset = sd_iosize(&c->io, &n->file);
		}
		if (ssunlikely(rc == -1))
			goto error;

		offset = sd_iosize(&c->io, &n->file);
		rc = sd_mergeend(&merge, offset);
		if (ssunlikely(rc == -1))
			goto error;

		/* write index */
		rc = sd_writeindex(r, &n->file, &c->io, &merge.index);
		if (ssunlikely(rc == -1))
			goto error;

		/* mmap mode */
		if (index->scheme.mmap) {
			rc = si_nodemap(n, r);
			if (ssunlikely(rc == -1))
				goto error;
		}

		/* add node to the list */
		rc = ss_bufadd(result, index->r.a, &n, sizeof(sinode*));
		if (ssunlikely(rc == -1)) {
			sr_oom_malfunction(index->r.e);
			goto error;
		}

		n->index = merge.index;
	}
	if (ssunlikely(rc == -1))
		goto error;
	return 0;
error:
	if (n)
		si_nodefree(n, r, 0);
	sd_mergefree(&merge);
	si_splitfree(result, r);
	return -1;
}

static int
si_merge(si *index, sdc *c, sinode *node,
         uint64_t vlsn,
         ssiter *stream,
         uint64_t size_stream,
         uint32_t n_stream)
{
	sr *r = &index->r;
	ssbuf *result = &c->a;
	ssiter i;

	/* begin compaction.
	 *
	 * Split merge stream into a number of
	 * a new nodes.
	 */
	int rc;
	rc = si_split(index, c, result,
	              node, stream,
	              index->scheme.compaction.node_size,
	              size_stream,
	              n_stream,
	              vlsn);
	if (ssunlikely(rc == -1))
		return -1;

	SS_INJECTION(r->i, SS_INJECTION_SI_COMPACTION_0,
	             si_splitfree(result, r);
	             sr_malfunction(r->e, "%s", "error injection");
	             return -1);

	/* mask removal of a single node as a
	 * single node update */
	int count = ss_bufused(result) / sizeof(sinode*);
	int count_index;

	si_lock(index);
	count_index = index->n;
	si_unlock(index);

	sinode *n;
	if (ssunlikely(count == 0 && count_index == 1))
	{
		n = si_bootstrap(index, node->id);
		if (ssunlikely(n == NULL))
			return -1;
		rc = ss_bufadd(result, r->a, &n, sizeof(sinode*));
		if (ssunlikely(rc == -1)) {
			sr_oom_malfunction(r->e);
			si_nodefree(n, r, 1);
			return -1;
		}
		count++;
	}

	/* commit compaction changes */
	si_lock(index);
	svindex *j = si_nodeindex(node);
	si_plannerremove(&index->p, node);
	si_nodesplit(node);
	switch (count) {
	case 0: /* delete */
		si_remove(index, node);
		si_redistribute_index(index, r, c, node);
		break;
	case 1: /* self update */
		n = *(sinode**)result->s;
		n->i0 = *j;
		n->used = j->used;
		si_nodelock(n);
		si_replace(index, node, n);
		si_plannerupdate(&index->p, n);
		break;
	default: /* split */
		rc = si_redistribute(index, r, c, node, result);
		if (ssunlikely(rc == -1)) {
			si_unlock(index);
			si_splitfree(result, r);
			return -1;
		}
		ss_iterinit(ss_bufiterref, &i);
		ss_iteropen(ss_bufiterref, &i, result, sizeof(sinode*));
		n = ss_iterof(ss_bufiterref, &i);
		n->used = n->i0.used;
		si_nodelock(n);
		si_replace(index, node, n);
		si_plannerupdate(&index->p, n);
		for (ss_iternext(ss_bufiterref, &i); ss_iterhas(ss_bufiterref, &i);
		     ss_iternext(ss_bufiterref, &i)) {
			n = ss_iterof(ss_bufiterref, &i);
			n->used = n->i0.used;
			si_nodelock(n);
			si_insert(index, n);
			si_plannerupdate(&index->p, n);
		}
		break;
	}
	sv_indexinit(j);
	si_unlock(index);

	/* compaction completion */

	/* seal nodes */
	ss_iterinit(ss_bufiterref, &i);
	ss_iteropen(ss_bufiterref, &i, result, sizeof(sinode*));
	while (ss_iterhas(ss_bufiterref, &i))
	{
		n  = ss_iterof(ss_bufiterref, &i);
		if (index->scheme.sync) {
			rc = ss_filesync(&n->file);
			if (ssunlikely(rc == -1)) {
				sr_malfunction(r->e, "db file '%s' sync error: %s",
				               ss_pathof(&n->file.path),
				               strerror(errno));
				return -1;
			}
		}
		rc = si_noderename_seal(n, r, &index->scheme);
		if (ssunlikely(rc == -1)) {
			si_nodefree(node, r, 0);
			return -1;
		}
		SS_INJECTION(r->i, SS_INJECTION_SI_COMPACTION_3,
		             si_nodefree(node, r, 0);
		             sr_malfunction(r->e, "%s", "error injection");
		             return -1);
		ss_iternext(ss_bufiterref, &i);
	}

	SS_INJECTION(r->i, SS_INJECTION_SI_COMPACTION_1,
	             si_nodefree(node, r, 0);
	             sr_malfunction(r->e, "%s", "error injection");
	             return -1);

	/* gc node */
	uint16_t refs = si_noderefof(node);
	if (sslikely(refs == 0)) {
		rc = si_nodefree(node, r, 1);
		if (ssunlikely(rc == -1))
			return -1;
	} else {
		/* node concurrently being read, schedule for
		 * delayed removal */
		si_nodegc(node, r, &index->scheme);
		si_lock(index);
		ss_listappend(&index->gc, &node->gc);
		index->gc_count++;
		si_unlock(index);
	}

	SS_INJECTION(r->i, SS_INJECTION_SI_COMPACTION_2,
	             sr_malfunction(r->e, "%s", "error injection");
	             return -1);

	/* complete new nodes */
	ss_iterinit(ss_bufiterref, &i);
	ss_iteropen(ss_bufiterref, &i, result, sizeof(sinode*));
	while (ss_iterhas(ss_bufiterref, &i))
	{
		n = ss_iterof(ss_bufiterref, &i);
		rc = si_noderename_complete(n, r, &index->scheme);
		if (ssunlikely(rc == -1))
			return -1;
		SS_INJECTION(r->i, SS_INJECTION_SI_COMPACTION_4,
		             sr_malfunction(r->e, "%s", "error injection");
		             return -1);
		ss_iternext(ss_bufiterref, &i);
	}

	/* unlock */
	si_lock(index);
	ss_iterinit(ss_bufiterref, &i);
	ss_iteropen(ss_bufiterref, &i, result, sizeof(sinode*));
	while (ss_iterhas(ss_bufiterref, &i))
	{
		n = ss_iterof(ss_bufiterref, &i);
		si_nodeunlock(n);
		ss_iternext(ss_bufiterref, &i);
	}
	si_unlock(index);
	return 0;
}