Exemple #1
0
/*
 * Hand every entry of the node's in-memory index to
 * si_redistribute_set().
 *
 * Works in two passes: the version pointers found in the node index
 * are first appended to the scratch buffer c->b, and only after the
 * index walk has finished is each buffered pointer passed to
 * si_redistribute_set() together with `index` and a single timestamp.
 *
 * NOTE(review): c->b is appended to without being reset here, so it is
 * presumably expected to be empty on entry - confirm against callers.
 *
 * Returns 0 on success (also when there was nothing to move), or the
 * value of sr_oom_malfunction() when the scratch buffer cannot grow.
 */
static int
si_redistribute_index(si *index, sr *r, sdc *c, sinode *node)
{
	svindex *vindex = si_nodeindex(node);
	ssiter i;

	/* pass 1: collect the version pointers */
	ss_iterinit(sv_indexiter, &i);
	ss_iteropen(sv_indexiter, &i, r, vindex, SS_GTE, NULL, 0);
	while (ss_iterhas(sv_indexiter, &i)) {
		sv *v = ss_iterof(sv_indexiter, &i);
		/* each buffer element is one svv*; the same element size
		 * is used by the reader below */
		int rc = ss_bufadd(&c->b, r->a, &v->v, sizeof(svv*));
		if (ssunlikely(rc == -1))
			return sr_oom_malfunction(r->e);
		ss_iternext(sv_indexiter, &i);
	}
	if (ssunlikely(ss_bufused(&c->b) == 0))
		return 0;

	/* pass 2: re-set every collected version, all with the same
	 * timestamp */
	uint64_t now = ss_utime();
	ss_iterinit(ss_bufiterref, &i);
	ss_iteropen(ss_bufiterref, &i, &c->b, sizeof(svv*));
	while (ss_iterhas(ss_bufiterref, &i)) {
		svv *v = ss_iterof(ss_bufiterref, &i);
		si_redistribute_set(index, r, now, v);
		ss_iternext(ss_bufiterref, &i);
	}
	return 0;
}
Exemple #2
0
/*
 * Execute a write request: assign the lsn to the request log, write it
 * through the log pool (skipped when the request is marked as
 * recovery) and then apply each log index entry to its database index.
 *
 * Returns 0 on success. If the log write fails the log transaction is
 * rolled back, r->rc is set to -1 and -1 is returned.
 */
int se_reqwrite(sereq *r)
{
	se *env = se_of(r->object);
	sereqarg *req = &r->arg;
	svlog *vlog = req->log;

	/* stamp the log with the request lsn */
	sl_prepare(&env->lp, vlog, req->lsn);

	/* persist the log, unless this is a recovery replay */
	if (! req->recover) {
		sltx logtx;
		sl_begin(&env->lp, &logtx);
		if (ssunlikely(sl_write(&logtx, vlog) == -1)) {
			sl_rollback(&logtx);
			r->rc = -1;
			return -1;
		}
		sl_commit(&logtx);
	}

	/* pick the commit vlsn when the caller asked for one */
	if (sslikely(req->vlsn_generate))
		req->vlsn = sx_vlsn(&env->xm);

	/* apply every per-database part of the log, all with one
	 * timestamp */
	uint64_t now = ss_utime();
	svlogindex *cur  = (svlogindex*)vlog->index.s;
	svlogindex *stop = (svlogindex*)vlog->index.p;
	for (; cur < stop; cur++) {
		sedb *target = cur->ptr;
		sitx indextx;
		si_begin(&indextx, &target->index, req->vlsn, now, vlog, cur);
		si_write(&indextx, req->recover);
		si_commit(&indextx);
	}
	return 0;
}
Exemple #3
0
/*
 * Reset a task before it is handed to a worker: record the start
 * time, the worker and the vlsn, clear the database pointer and the
 * rotate/gc/backup flags, and re-initialise the embedded plan.
 */
static inline void
sc_taskbegin(sctask *task, scworker *w, uint64_t vlsn)
{
	uint64_t started = ss_utime();

	/* what is running and on whose behalf */
	task->time = started;
	task->w    = w;
	task->vlsn = vlsn;

	/* nothing selected yet */
	task->db     = NULL;
	task->backup = 0;
	task->gc     = 0;
	task->rotate = 0;

	si_planinit(&task->plan);
}
Exemple #4
0
/*
 * Bring a scheduler into its initial state.
 *
 * All counters, flags and sequence numbers are zeroed, every *_last
 * timestamp is set to the current time, the environment pointer is
 * stored, and the lock, thread pool and worker pool are initialised.
 *
 * Always returns 0.
 */
int se_scheduler_init(sescheduler *s, so *env)
{
	uint64_t now = ss_utime();
	ss_mutexinit(&s->lock);

	/* owner and database round-robin state */
	s->env   = env;
	s->i     = NULL;
	s->count = 0;
	s->rr    = 0;

	/* per-kind active worker counters */
	s->workers_branch = 0;
	s->workers_backup = 0;
	s->workers_gc     = 0;
	s->workers_gc_db  = 0;
	s->workers_lru    = 0;

	/* log rotation and async request flags */
	s->rotate = 0;
	s->req    = 0;

	/* checkpoint */
	s->checkpoint          = 0;
	s->checkpoint_lsn      = 0;
	s->checkpoint_lsn_last = 0;

	/* index aging */
	s->age      = 0;
	s->age_last = now;

	/* backup */
	s->backup                   = 0;
	s->backup_bsn               = 0;
	s->backup_bsn_last          = 0;
	s->backup_bsn_last_complete = 0;
	s->backup_events            = 0;

	/* anticache */
	s->anticache          = 0;
	s->anticache_asn      = 0;
	s->anticache_asn_last = 0;
	s->anticache_storage  = 0;
	s->anticache_last     = now;

	/* snapshot */
	s->snapshot          = 0;
	s->snapshot_ssn      = 0;
	s->snapshot_ssn_last = 0;
	s->snapshot_last     = now;

	/* garbage collection */
	s->gc      = 0;
	s->gc_last = now;

	/* lru */
	s->lru      = 0;
	s->lru_last = now;

	ss_threadpool_init(&s->tp);
	se_workerpool_init(&s->workers);
	return 0;
}
Exemple #5
0
/*
 * Wrap the outcome of a finished read in a new document object.
 *
 * The document is created from r->result and receives the read
 * counters, the read latency (only measured and reported to the
 * statistics when r->result.v is set, otherwise 0), the iteration
 * order, the prefix and the cold_only flag of the read.
 *
 * Returns the document's `so`, or NULL if the document could not be
 * created.
 */
static inline so*
se_readresult(se *e, siread *r)
{
	sedocument *doc = (sedocument*)se_document_new(e, r->index->object, &r->result);
	if (ssunlikely(doc == NULL))
		return NULL;

	/* read statistics */
	doc->read_disk  = r->read_disk;
	doc->read_cache = r->read_cache;
	if (! r->result.v) {
		doc->read_latency = 0;
	} else {
		doc->read_latency = ss_utime() - r->read_start;
		sr_statget(&e->stat,
		           doc->read_latency,
		           doc->read_disk,
		           doc->read_cache);
	}

	/* carry the order over to the result, turning the inclusive
	 * orders into their strict counterparts */
	doc->orderset = 1;
	doc->order = r->order;
	switch (doc->order) {
	case SS_GTE:
		doc->order = SS_GT;
		break;
	case SS_LTE:
		doc->order = SS_LT;
		break;
	default:
		break;
	}

	/* the result document takes the read's prefix as both its
	 * prefix and its prefix copy */
	if (r->prefix) {
		doc->prefix_size = r->prefix_size;
		doc->prefix_copy = r->prefix;
		doc->prefix      = r->prefix;
	}

	doc->flagset   = 1;
	doc->created   = 1;
	doc->cold_only = r->cold_only;
	return &doc->o;
}
Exemple #6
0
/*
 * Look for a node to process under an age plan.
 *
 * Walks the planner's branch queue with ss_rqprev() and takes the
 * first unlocked node whose `used` is at least plan->b and whose last
 * update is at least plan->a time units in the past.
 *
 * On a match the node is locked, stored in plan->node with
 * plan->explain set to SI_EINDEX_AGE, and 1 is returned. Returns 0
 * when no node qualifies.
 */
static inline int
si_plannerpeek_age(siplanner *p, siplan *plan)
{
	uint64_t now = ss_utime();
	ssrqnode *cursor;

	/* full scan of the branch queue */
	for (cursor = ss_rqprev(&p->branch, NULL);
	     cursor;
	     cursor = ss_rqprev(&p->branch, cursor))
	{
		sinode *candidate = sscast(cursor, sinode, nodebranch);
		if (candidate->flags & SI_LOCK)
			continue;
		if (candidate->used < plan->b)
			continue;
		if ((now - candidate->update_time) < plan->a)
			continue;

		/* first qualifying node wins */
		si_nodelock(candidate);
		plan->explain = SI_EINDEX_AGE;
		plan->node = candidate;
		return 1;
	}
	return 0;
}
Exemple #7
0
/*
 * Pick the next piece of background work for worker `w`.
 *
 * The task is reset and then, under s->lock, the following sources of
 * work are tried in order: asynchronous request dispatch, log
 * rotation, checkpoint, zone policy, database shutdown/drop, backup,
 * garbage collection, index aging, branching and finally compaction.
 *
 * Returns 1 when a plan was selected: task->plan describes it and
 * task->db holds the target database, which has been referenced with
 * se_dbref(). Returns 0 when no plan was selected; the task flags
 * (req, rotate, gc, checkpoint_complete, backup_complete) may still
 * have been set and describe work that needs no database.
 *
 * s->lock is released on every return path.
 */
static int
se_schedule(sescheduler *s, setask *task, seworker *w)
{
	ss_trace(&w->trace, "%s", "schedule");
	si_planinit(&task->plan);

	uint64_t now = ss_utime();
	se *e = (se*)s->env;
	sedb *db;
	srzone *zone = se_zoneof(e);
	assert(zone != NULL);

	task->checkpoint_complete = 0;
	task->backup_complete = 0;
	task->rotate = 0;
	task->req = 0;
	task->gc = 0;
	task->db = NULL;

	ss_mutexlock(&s->lock);

	/* asynchronous reqs dispatcher */
	if (s->req == 0) {
		switch (zone->async) {
		case 2:
			/* mode 2 dispatches only when se_reqqueue() reports
			 * a non-zero value */
			if (se_reqqueue(e) == 0)
				break;
			/* fallthrough */
		case 1:
			s->req = 1;
			task->req = zone->async;
			ss_mutexunlock(&s->lock);
			return 0;
		}
	}

	/* log gc and rotation */
	if (s->rotate == 0)
	{
		task->rotate = 1;
		s->rotate = 1;
	}

	/* checkpoint */
	int in_progress = 0;
	int rc;
checkpoint:
	if (s->checkpoint) {
		task->plan.plan = SI_CHECKPOINT;
		task->plan.a = s->checkpoint_lsn;
		rc = se_schedule_plan(s, &task->plan, &db);
		switch (rc) {
		case 1:
			s->workers_branch++;
			se_dbref(db, 1);
			task->db = db;
			task->gc = 1;
			ss_mutexunlock(&s->lock);
			return 1;
		case 2: /* work in progress */
			in_progress = 1;
			break;
		case 0: /* complete checkpoint */
			s->checkpoint = 0;
			s->checkpoint_lsn_last = s->checkpoint_lsn;
			s->checkpoint_lsn = 0;
			task->checkpoint_complete = 1;
			break;
		}
	}

	/* apply zone policy */
	switch (zone->mode) {
	case 0:  /* compact_index */
	case 1:  /* compact_index + branch_count prio */
		assert(0);
		break;
	case 2:  /* checkpoint */
	{
		if (in_progress) {
			ss_mutexunlock(&s->lock);
			return 0;
		}
		/* start a new checkpoint at the current lsn and retry the
		 * checkpoint step above */
		uint64_t lsn = sr_seq(&e->seq, SR_LSN);
		s->checkpoint_lsn = lsn;
		s->checkpoint = 1;
		goto checkpoint;
	}
	default: /* branch + compact */
		assert(zone->mode == 3);
	}

	/* database shutdown-drop */
	if (s->workers_gc_db < zone->gc_db_prio) {
		ss_spinlock(&e->dblock);
		db = NULL;
		if (ssunlikely(e->db_shutdown.n > 0)) {
			db = (sedb*)so_listfirst(&e->db_shutdown);
			if (se_dbgarbage(db)) {
				so_listdel(&e->db_shutdown, &db->o);
			} else {
				db = NULL;
			}
		}
		ss_spinunlock(&e->dblock);
		if (ssunlikely(db)) {
			if (db->dropped)
				task->plan.plan = SI_DROP;
			else
				task->plan.plan = SI_SHUTDOWN;
			s->workers_gc_db++;
			se_dbref(db, 1);
			task->db = db;
			ss_mutexunlock(&s->lock);
			return 1;
		}
	}

	/* backup */
	if (s->backup && (s->workers_backup < zone->backup_prio))
	{
		/* backup procedure.
		 *
		 * state 0 (start)
		 * -------
		 *
		 * a. disable log gc
		 * b. mark to start backup (state 1)
		 *
		 * state 1 (background, delayed start)
		 * -------
		 *
		 * a. create backup_path/<bsn.incomplete> directory
		 * b. create database directories
		 * c. create log directory
		 * d. state 2
		 *
		 * state 2 (background, copy)
		 * -------
		 *
		 * a. schedule and execute node backup which bsn < backup_bsn
		 * b. state 3
		 *
		 * state 3 (background, completion)
		 * -------
		 *
		 * a. rotate log file
		 * b. copy log files
		 * c. enable log gc, schedule gc
		 * d. rename <bsn.incomplete> into <bsn>
		 * e. set last backup, set COMPLETE
		 *
		*/
		if (s->backup == 1) {
			/* state 1 */
			rc = se_backupstart(s);
			if (ssunlikely(rc == -1)) {
				se_backuperror(s);
				goto backup_error;
			}
			s->backup = 2;
		}
		/* state 2 */
		task->plan.plan = SI_BACKUP;
		task->plan.a = s->backup_bsn;
		rc = se_schedule_plan(s, &task->plan, &db);
		switch (rc) {
		case 1:
			s->workers_backup++;
			se_dbref(db, 1);
			task->db = db;
			ss_mutexunlock(&s->lock);
			return 1;
		case 2: /* work in progress */
			break;
		case 0: /* state 3 */
			rc = se_backupcomplete(s, w);
			if (ssunlikely(rc == -1)) {
				se_backuperror(s);
				goto backup_error;
			}
			s->backup_events++;
			task->gc = 1;
			task->backup_complete = 1;
			break;
		}
		/* a failed backup does not stop scheduling; the remaining
		 * kinds of work are still considered */
backup_error:;
	}

	/* garbage-collection */
	if (s->gc) {
		if (s->workers_gc < zone->gc_prio) {
			task->plan.plan = SI_GC;
			task->plan.a = sx_vlsn(&e->xm);
			task->plan.b = zone->gc_wm;
			rc = se_schedule_plan(s, &task->plan, &db);
			switch (rc) {
			case 1:
				s->workers_gc++;
				se_dbref(db, 1);
				task->db = db;
				ss_mutexunlock(&s->lock);
				return 1;
			case 2: /* work in progress */
				break;
			case 0: /* gc round complete */
				s->gc = 0;
				s->gc_last = now;
				break;
			}
		}
	} else {
		/* arm the next gc round once gc_period has elapsed */
		if (zone->gc_prio && zone->gc_period) {
			if ( (now - s->gc_last) >= ((uint64_t)zone->gc_period * 1000000) ) {
				s->gc = 1;
			}
		}
	}

	/* index aging */
	if (s->age) {
		if (s->workers_branch < zone->branch_prio) {
			task->plan.plan = SI_AGE;
			/* scaled by the same factor as the period checks;
			 * widened before multiplying so the product cannot
			 * wrap in a narrower type */
			task->plan.a = (uint64_t)zone->branch_age * 1000000;
			task->plan.b = zone->branch_age_wm;
			rc = se_schedule_plan(s, &task->plan, &db);
			switch (rc) {
			case 1:
				s->workers_branch++;
				se_dbref(db, 1);
				task->db = db;
				ss_mutexunlock(&s->lock);
				return 1;
			case 0: /* aging round complete */
				s->age = 0;
				s->age_last = now;
				break;
			}
		}
	} else {
		/* arm the next aging round once branch_age_period has
		 * elapsed */
		if (zone->branch_prio && zone->branch_age_period) {
			if ( (now - s->age_last) >= ((uint64_t)zone->branch_age_period * 1000000) ) {
				s->age = 1;
			}
		}
	}

	/* branching */
	if (s->workers_branch < zone->branch_prio)
	{
		/* schedule branch task using following
		 * priority:
		 *
		 * a. peek node with the largest in-memory index
		 *    which is equal or greater than branch
		 *    watermark.
		 *    If nothing is found, stick to b.
		 *
		 * b. peek node with the largest in-memory index,
		 *    which has oldest update time.
		 *
		 * c. if no branch work is needed, schedule a
		 *    compaction job
		 *
		 */
		task->plan.plan = SI_BRANCH;
		task->plan.a = zone->branch_wm;
		rc = se_schedule_plan(s, &task->plan, &db);
		if (rc == 1) {
			s->workers_branch++;
			se_dbref(db, 1);
			task->db = db;
			task->gc = 1;
			ss_mutexunlock(&s->lock);
			return 1;
		}
	}

	/* compaction */
	task->plan.plan = SI_COMPACT;
	task->plan.a = zone->compact_wm;
	task->plan.b = zone->compact_mode;
	rc = se_schedule_plan(s, &task->plan, &db);
	if (rc == 1) {
		se_dbref(db, 1);
		task->db = db;
		ss_mutexunlock(&s->lock);
		return 1;
	}

	ss_mutexunlock(&s->lock);
	return 0;
}
Exemple #8
0
/*
 * Read a document from a database.
 *
 * db    - database to read from
 * o     - request document carrying the key, order, prefix and
 *         cold_only settings; it is destroyed before returning, on
 *         every path, when o->created <= 1
 * x     - optional transaction; when set and the order is SS_EQ the
 *         transaction's own view is consulted first via sx_get()
 * vlsn  - vlsn passed through to the index read
 * cache - optional read cache; when NULL one is taken from the
 *         environment's cache pool and pushed back afterwards
 *
 * Returns the result document's `so`, or NULL when nothing was found
 * or an error occurred.
 */
so *se_read(sedb *db, sedocument *o, sx *x, uint64_t vlsn,
            sicache *cache)
{
	se *e = se_of(&db->o);
	uint64_t start  = ss_utime();

	/* prepare the key */
	int auto_close = o->created <= 1;
	int rc = se_document_createkey(o);
	if (ssunlikely(rc == -1))
		goto error;
	rc = se_document_validate_ro(o, &db->o);
	if (ssunlikely(rc == -1))
		goto error;
	if (ssunlikely(! se_active(e)))
		goto error;

	sv vup;
	sv_init(&vup, &sv_vif, NULL, NULL);

	sedocument *ret = NULL;

	/* concurrent */
	if (x && o->order == SS_EQ) {
		/* note: prefix is ignored during concurrent
		 * index search */
		rc = sx_get(x, &db->coindex, &o->v, &vup);
		if (ssunlikely(rc == -1 || rc == 2 /* delete */))
			goto error;
		if (rc == 1 && !sv_is(&vup, SVUPSERT)) {
			/* the transaction already holds a plain value for
			 * this key: answer from it without reading the
			 * index */
			ret = (sedocument*)se_document_new(e, &db->o, &vup);
			if (sslikely(ret)) {
				ret->cold_only = o->cold_only;
				ret->created   = 1;
				ret->orderset  = 1;
				ret->flagset   = 1;
			} else {
				sv_vunref(db->r, vup.v);
			}
			if (auto_close)
				so_destroy(&o->o);
			/* do not form &ret->o from a NULL pointer */
			return ret ? &ret->o : NULL;
		}
	} else {
		sx_get_autocommit(&e->xm, &db->coindex);
	}

	/* prepare read cache */
	int cachegc = 0;
	if (cache == NULL) {
		cachegc = 1;
		cache = si_cachepool_pop(&e->cachepool);
		if (ssunlikely(cache == NULL)) {
			if (vup.v)
				sv_vunref(db->r, vup.v);
			sr_oom(&e->error);
			goto error;
		}
	}

	/* hold the key for the duration of the read; released in the
	 * cleanup section below */
	sv_vref(o->v.v);

	/* do read */
	siread rq;
	si_readopen(&rq, db->index, cache, o->order,
	            vlsn,
	            sv_pointer(&o->v),
	            vup.v,
	            o->prefix_copy,
	            o->prefix_size,
	            o->cold_only,
	            0,
	            start);
	rc = si_read(&rq);
	si_readclose(&rq);

	/* prepare result */
	if (rc == 1) {
		ret = (sedocument*)se_readresult(e, &rq);
		/* the request's prefix copy is cleared once a result
		 * document exists; NOTE(review): presumably the result now
		 * owns that memory - confirm */
		if (ret)
			o->prefix_copy = NULL;
	}

	/* cleanup */
	if (o->v.v)
		sv_vunref(db->r, o->v.v);
	if (vup.v)
		sv_vunref(db->r, vup.v);
	if (ret == NULL && rq.result.v)
		sv_vunref(db->r, rq.result.v);
	if (cachegc && cache)
		si_cachepool_push(cache);

	if (auto_close)
		so_destroy(&o->o);
	/* do not form &ret->o from a NULL pointer */
	return ret ? &ret->o : NULL;
error:
	if (auto_close)
		so_destroy(&o->o);
	return NULL;
}