void leaf_new(struct hdr *hdr,
              NID nid,
              uint32_t height,
              uint32_t children,
              struct node **n)
{
	struct node *node;

	nassert(height == 0);
	nassert(children == 1);
	node = xcalloc(1, sizeof(*node));
	nassert(node);
	node->nid = nid;
	node->height = height;
	ness_mutex_init(&node->attr.mtx);
	ness_mutex_init(&node->mtx);
	ness_rwlock_init(&node->rwlock, &node->mtx);
	node->n_children = children;
	node->layout_version = hdr->layout_version;
	if (children > 0) {
		node->pivots = xcalloc(children - 1, PIVOT_SIZE);
		node->parts = xcalloc(children, PART_SIZE);
	}
	node->opts = hdr->opts;
	node->i = &leaf_operations;
	node_set_dirty(node);

	*n = node;
}
void ness_decompress(const char *src,
                     uint32_t src_size,
                     char *dst,
                     uint32_t dst_size)
{
	switch (src[0] & 0xF) {
	case NESS_NO_COMPRESS:
		memcpy(dst, src + 1, src_size - 1);
		break;
	case NESS_QUICKLZ_METHOD: {
			uint32_t raw_size;
			qlz_state_decompress *qsd;

			qsd = xcalloc(1, sizeof(*qsd));
			raw_size = qlz_decompress(src + 1, dst, qsd);
			nassert(raw_size == dst_size);
			(void)raw_size;
			(void)dst_size;
			xfree(qsd);
		}
		break;
	default:
		printf("no decompress support!\n");
		/* unknown method byte: the assertion must fire */
		nassert(0);
		break;
	}
}
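/*
 * A minimal usage sketch for ness_decompress (sizes and buffers are
 * illustrative): the first byte of the source buffer is assumed to
 * carry the compression method tag in its low nibble, as the switch
 * above expects, so a raw NESS_NO_COMPRESS block can be built by hand.
 */
void decompress_usage_sketch(void)
{
	char src[6] = { NESS_NO_COMPRESS, 'h', 'e', 'l', 'l', 'o' };
	char dst[5];

	/* dst receives the 5 payload bytes after the 1-byte method tag */
	ness_decompress(src, sizeof(src), dst, sizeof(dst));
}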
void lnode_return(lnodepool_t *pool, lnode_t *node)
{
	nassert(lnode_pool_isfrom(pool, node));
	nassert(!lnode_is_in_a_list(node));

	node->next = pool->fre;
	node->prev = node;
	pool->fre = node;
}
void nonleaf_alloc_buffer(struct node *node)
{
	int i;

	nassert(node->height > 0);
	nassert(node->u.n.n_children > 0);
	for (i = 0; i < (int)node->u.n.n_children; i++) {
		node->u.n.parts[i].buffer = nmb_new();
	}
}
struct env *env_open(const char *home, uint32_t flags)
{
	LOG;
	struct env *e;

	e = xcalloc(1, sizeof(*e));
	e->flags = flags;

	/* tree */
	e->inner_node_fanout = 16;
	e->inner_default_node_size = 4 << 20;		/* 4MB */
	e->leaf_default_node_size = 4 << 20;		/* 4MB */
	e->leaf_default_basement_size = 128 << 10;	/* 128KB */

	/* cache */
	e->cache_limits_bytes = 1024 << 20;		/* 1GB */
	e->cache_high_watermark = 80;			/* 80% */
	e->cache_flush_period_ms = 100;			/* 0.1s */
	e->cache_checkpoint_period_ms = 600000;		/* 600s */

	/* IO */
	e->use_directio = 1;
	e->redo_path = "./dbbench";
	e->enable_redo_log = 1;
	if (!home)
		home = ".";
	e->dir = xcalloc(1, strlen(home) + 1);
	xmemcpy(e->dir, (void*)home, strlen(home));
	ness_check_dir(e->dir);

	/* compress */
	e->compress_method = NESS_SNAPPY_METHOD;

	/* callback */
	e->bt_compare_func = bt_compare_func_builtin;

	/* internal */
	e->cache = cache_new(e);
	nassert(e->cache);
	e->txnmgr = txnmgr_new();
	nassert(e->txnmgr);
	e->status = status_new();
	nassert(e->status);

	return e;
}
/*
 * swap the cache pair values of two nodes
 *
 * REQUIRES:
 *	a) node a locked (L_WRITE)
 *	b) node b locked (L_WRITE)
 */
void cache_cpair_value_swap(struct cache_file *cf, struct node *a, struct node *b)
{
	struct cpair *cpa;
	struct cpair *cpb;
	struct cache *c = cf->cache;

	cpa = cpair_htable_find(c->table, a->nid);
	nassert(cpa);
	cpb = cpair_htable_find(c->table, b->nid);
	nassert(cpb);

	cpa->v = b;
	cpb->v = a;
}
/*
 * EFFECT:
 *	- flush in a background thread
 * ENTER:
 *	- parent is already locked
 * EXIT:
 *	- nodes are all unlocked
 */
void tree_flush_node_on_background(struct tree *t, struct node *parent)
{
	LOG;
	int childnum;
	enum reactivity re;
	struct node *child;
	struct partition *part;

	nassert(parent->height > 0);
	childnum = node_find_heaviest_idx(parent);
	part = &parent->parts[childnum];

	/* pin the child */
	if (cache_get_and_pin(t->cf, part->child_nid, (void**)&child, L_WRITE) != NESS_OK) {
		__ERROR("cache get node error, nid [%" PRIu64 "]", part->child_nid);
		return;
	}

	re = get_reactivity(t, child);
	if (re == STABLE) {
		/* detach buffer from parent */
		struct nmb *buf = part->ptr.u.nonleaf->buffer;

		node_set_dirty(parent);
		part->ptr.u.nonleaf->buffer = nmb_new(t->e);

		/* flush it in a background thread */
		_place_node_and_buffer_on_background(t, child, buf);
		cache_unpin(t->cf, parent->cpair, make_cpair_attr(parent));
	} else {
		/* the child is reactive, deal with it in the main thread */
		_child_maybe_reactivity(t, parent, child);
	}
}
/*
 * PROCESS:
 *	- the rule is k <= pivot: binary search with upper bound
 *
 *	   i0    i1    i2    i3
 *	+-----+-----+-----+-----+
 *	|  15 |  17 |  19 | +∞  |
 *	+-----+-----+-----+-----+
 *
 *	so if the key is 16, we get 1 (i1)
 */
int node_partition_idx(struct node *node, struct msg *k)
{
	int lo;
	int hi;
	int mi;
	int cmp;

	nassert(node->n_children > 1);
	lo = 0;
	hi = node->n_children - 2;
	while (lo <= hi) {
		/* mi integer overflow never happens */
		mi = (lo + hi) / 2;
		cmp = node->node_op->pivot_compare_func(k, &node->pivots[mi]);
		if (cmp > 0)
			lo = mi + 1;
		else if (cmp < 0)
			hi = mi - 1;
		else
			return mi;
	}

	return lo;
}
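/*
 * The upper-bound rule is easier to see in isolation. Below is a
 * standalone sketch (plain int pivots, not the real struct node API)
 * of the same binary search: it returns the first index i with
 * key <= pivot[i], or n for the implicit +∞ slot.
 */
#include <stdio.h>

static int partition_idx_sketch(const int *pivots, int n, int key)
{
	int lo = 0, hi = n - 1, mi;

	while (lo <= hi) {
		mi = (lo + hi) / 2;
		if (key > pivots[mi])
			lo = mi + 1;
		else if (key < pivots[mi])
			hi = mi - 1;
		else
			return mi;
	}
	return lo;
}

int main(void)
{
	int pivots[] = {15, 17, 19};

	printf("%d\n", partition_idx_sketch(pivots, 3, 16));	/* 1, i.e. i1 */
	printf("%d\n", partition_idx_sketch(pivots, 3, 20));	/* 3, the +∞ slot */
	return 0;
}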
void node_free(struct node *node)
{
	nassert(node != NULL);
	if (node->height == 0) {
		lmb_free(node->u.l.buffer);
	} else {
		uint32_t i;

		if (node->u.n.n_children > 0) {
			for (i = 0; i < node->u.n.n_children - 1; i++)
				xfree(node->u.n.pivots[i].data);
			for (i = 0; i < node->u.n.n_children; i++)
				nmb_free(node->u.n.parts[i].buffer);
			xfree(node->u.n.pivots);
			xfree(node->u.n.parts);
		}
	}
	xfree(node);
}
/* search in a node's child */
int _search_child(struct cursor *cur, struct search *so, struct node *n, int childnum)
{
	int ret;
	NID child_nid;
	int child_to_search;
	struct node *child;

	nassert(n->height > 0);
	ancestors_append(cur, n->parts[childnum].msgbuf);

	child_nid = n->parts[childnum].child_nid;
	if (!cache_get_and_pin(cur->tree->cf, child_nid, (void**)&child, L_READ)) {
		__ERROR("cache get node error, nid [%" PRIu64 "]", child_nid);
		return NESS_ERR;
	}

	child_to_search = _search_in_which_child(so, child);
	ret = _search_node(cur, so, child, child_to_search);

	/* unpin */
	cache_unpin(cur->tree->cf, child->cpair);

	return ret;
}
/*
 * log record layout:
 *	| size(4) | tbn(4) | keylen<<8|type (4) | key(keylen) | [vlen(4) | value(vlen)] | xsum(4) |
 */
int logw_append(struct logw *lgw,
                struct msg *k,
                struct msg *v,
                msgtype_t t,
                int tbn)
{
	int r = NESS_OK;
	char *base;
	uint32_t pos = 0;
	uint32_t size = 4		/* record length */
	              + 4		/* table number */
	              + CRC_SIZE;	/* checksum */

	size += sizeof(k->size);
	size += k->size;
	if (v) {
		size += sizeof(v->size);
		size += v->size;
	}
	_check_space(lgw, size);

	base = lgw->base;
	putuint32(base + pos, size);
	pos += 4;
	putuint32(base + pos, tbn);
	pos += 4;

	/* pack the key length and the message type into one word */
	uint32_t fixsize = k->size;

	fixsize = ((fixsize << 8) | (char)t);
	putuint32(base + pos, fixsize);
	pos += 4;
	putnstr(base + pos, k->data, k->size);
	pos += k->size;

	if (v) {
		putuint32(base + pos, v->size);
		pos += 4;
		putnstr(base + pos, v->data, v->size);
		pos += v->size;
	}

	uint32_t xsum;

	/* the checksum does not cover the 4-byte length field */
	r = buf_xsum(base + 4, pos - 4, &xsum);
	if (r == 0)
		r = NESS_DO_XSUM_ERR;
	putuint32(base + pos, xsum);
	pos += 4;
	nassert(pos == size);

	lgw->size += size;
	ness_os_write(lgw->fd, base, size);

	return r;
}
void leaf_free(struct node *leaf)
{
	int i;

	nassert(leaf != NULL);
	nassert(leaf->height == 0);
	for (i = 0; i < (leaf->n_children - 1); i++)
		xfree(leaf->pivots[i].data);
	for (i = 0; i < leaf->n_children; i++)
		lmb_free(leaf->parts[i].msgbuf);
	ness_mutex_destroy(&leaf->attr.mtx);
	xfree(leaf->pivots);
	xfree(leaf->parts);
	xfree(leaf);
}
/*
 * visibility by isolation type:
 *	a) uncommitted: the last one is visible
 *	b) serializable: the last one is visible
 *	c) committed: cur_id >= id, and id not in the live roots list
 *	d) repeatable: cur_id >= id, and id not in the live snapshot list
 */
int _cursor_get_values_from_leafentry(struct cursor *cur, void *le)
{
	(void)cur;
	(void)le;

	/* not implemented yet */
	nassert(0);

	return -1;
}
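/*
 * A hypothetical sketch of the visibility rules listed above, written
 * as a standalone predicate; the iso_type names and the live-list
 * flags are illustrative, not the real transaction-manager API.
 */
#include <stdint.h>

enum iso_type {
	ISO_UNCOMMITTED,
	ISO_SERIALIZABLE,
	ISO_COMMITTED,
	ISO_REPEATABLE
};

static int entry_is_visible(enum iso_type iso,
                            uint64_t cur_id,
                            uint64_t id,
                            int in_live_roots,
                            int in_live_snapshot)
{
	switch (iso) {
	case ISO_UNCOMMITTED:
	case ISO_SERIALIZABLE:
		/* the last one is visible */
		return 1;
	case ISO_COMMITTED:
		return (cur_id >= id) && !in_live_roots;
	case ISO_REPEATABLE:
		return (cur_id >= id) && !in_live_snapshot;
	}
	return 0;
}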
/*
 * this is a 'follow the vine to get the melon' (root-to-leaf) search:
 * search->direction_compare_func is the helmsman that steers us to the
 * pivot we are interested in.
 */
int _search_in_which_child(struct search *so, struct node *node)
{
	int lo;
	int hi;
	int mi;
	int c;
	int children;
	int childnum = 0;

	/* i am a leaf */
	if (node->height == 0)
		return 0;

	children = node->u.n.n_children;
	nassert(children >= 2);

	lo = 0;
	hi = children - 1;
	while (lo < hi) {
		/* mi integer overflow never happens */
		mi = (lo + hi) / 2;
		c = so->direction_compare_func(so, &node->u.n.pivots[mi]);
		if (((so->direction == SEARCH_FORWARD) && c) ||
		    ((so->direction == SEARCH_BACKWARD) && !c))
			hi = mi;
		else
			lo = mi + 1;
	}
	childnum = lo;

	/*
	 * detect whether we should move to the prev/next pivot to make a
	 * new root-to-leaf path; the bound is re-compared on every step,
	 * and pivots[childnum - 1] is only read when childnum > 0
	 */
	switch (so->direction) {
	case SEARCH_FORWARD:
		while (childnum < (children - 1) &&
		       so->pivotbound_compare_func(so, &node->u.n.pivots[childnum]) >= 0)
			childnum++;
		break;
	case SEARCH_BACKWARD:
		while (childnum > 0 &&
		       so->pivotbound_compare_func(so, &node->u.n.pivots[childnum - 1]) <= 0)
			childnum--;
		break;
	default:
		__PANIC("unsupported direction %u", so->direction);
	}

	return childnum;
}
lnode_t *list_next(list_t *list, const lnode_t *lnode)
{
	nassert(list_contains(list, lnode));

	if (lnode->next == list_nil(list))
		return NULL;
	return lnode->next;
}
/* next */
void skiplist_iter_next(struct skiplist_iter *iter)
{
	struct skipnode *n;

	nassert(skiplist_iter_valid(iter));
	n = _get_next(iter->node, 0);
	iter->node = n;
}
void counter_incr(struct counter *c)
{
	/* fall back to slot 0 on platforms without sched_getcpu() */
	int cpu = 0;

#ifdef __linux__
	cpu = sched_getcpu();
#endif
	nassert(cpu < c->cpus);
	c->per_cpu_counter[cpu]++;
}
lnode_t *list_prev(list_t *list, const lnode_t *lnode)
{
	nassert(list_contains(list, lnode));

	if (lnode->prev == list_nil(list))
		return NULL;
	return lnode->prev;
}
list_t *list_init(list_t *list, listcount_t maxcount)
{
	nassert(maxcount != 0);
	list->nilnode.next = &list->nilnode;
	list->nilnode.prev = &list->nilnode;
	list->nodecount = 0;
	list->maxcount = maxcount;
	return list;
}
/* prev */
void skiplist_iter_prev(struct skiplist_iter *iter)
{
	struct skipnode *n;

	nassert(skiplist_iter_valid(iter));
	n = skiplist_find_less_than(iter->list, iter->node->key);
	if (n == iter->list->header)
		n = NULL;
	iter->node = n;
}
/*
 * save the new pivot bound to search->pivot_bound
 */
void _save_pivot_bound(struct search *so, struct node *n, int child_searched)
{
	nassert(n->height > 0);

	int p = (so->direction == SEARCH_FORWARD) ?
	        child_searched : child_searched - 1;

	if (p >= 0 && p < (int)(n->u.n.n_children - 1)) {
		if (so->pivot_bound)
			msgfree(so->pivot_bound);
		so->pivot_bound = msgdup(&n->u.n.pivots[p]);
	}
}
list_t *list_create(listcount_t maxcount)
{
	list_t *newlist = ns_malloc(sizeof *newlist);

	if (newlist) {
		nassert(maxcount != 0);
		newlist->nilnode.next = &newlist->nilnode;
		newlist->nilnode.prev = &newlist->nilnode;
		newlist->nodecount = 0;
		newlist->maxcount = maxcount;
	}
	return newlist;
}
void list_process(list_t *list, void *context,
                  void (*function)(list_t *list, lnode_t *lnode, void *context))
{
	lnode_t *node = list_first_priv(list), *next, *nil = list_nil(list);

	while (node != nil) {
		/* check for the callback function deleting
		 * the next node from under us */
		nassert(list_contains(list, node));
		next = node->next;
		function(list, node, context);
		node = next;
	}
}
void list_merge(list_t *dest, list_t *sour,
                int compare(const void *, const void *))
{
	lnode_t *dn, *sn, *tn;
	lnode_t *d_nil = list_nil(dest), *s_nil = list_nil(sour);

	/* Nothing to do if source and destination list are the same. */
	if (dest == sour)
		return;

	/* overflow check */
	nassert(list_count(sour) + list_count(dest) >= list_count(sour));

	/* lists must be sorted */
	nassert(list_is_sorted(sour, compare));
	nassert(list_is_sorted(dest, compare));

	dn = list_first_priv(dest);
	sn = list_first_priv(sour);

	while (dn != d_nil && sn != s_nil) {
		if (compare(lnode_get(dn), lnode_get(sn)) >= 0) {
			tn = lnode_next(sn);
			list_delete(sour, sn);
			list_ins_before(dest, sn, dn);
			sn = tn;
		} else {
			dn = lnode_next(dn);
		}
	}

	if (dn != d_nil)
		return;

	if (sn != s_nil)
		list_transfer(dest, sour, sn);
}
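/*
 * A usage sketch for list_merge: both inputs must already be sorted by
 * the same comparator, and sour drains into dest. lnode_create() and
 * list_append() are assumed companions of this list API (hypothetical
 * here), as is LISTCOUNT_T_MAX.
 */
static int int_compare(const void *a, const void *b)
{
	int x = *(const int *)a, y = *(const int *)b;

	return (x > y) - (x < y);
}

void list_merge_usage_sketch(void)
{
	static int v[] = {1, 3, 2, 4};
	list_t *dest = list_create(LISTCOUNT_T_MAX);
	list_t *sour = list_create(LISTCOUNT_T_MAX);

	list_append(dest, lnode_create(&v[0]));	/* dest: 1 3 */
	list_append(dest, lnode_create(&v[1]));
	list_append(sour, lnode_create(&v[2]));	/* sour: 2 4 */
	list_append(sour, lnode_create(&v[3]));

	list_merge(dest, sour, int_compare);	/* dest: 1 2 3 4, sour: empty */
}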
void cache_unpin_readonly(struct cache_file *cf, struct node *n)
{
	struct cpair *p;
	struct cache *c = cf->cache;

	/*
	 * here we don't need the hashtable array lock:
	 * since we hold the pair->value_lock, others
	 * (the evict thread) can't remove it from the cache
	 */
	p = cpair_htable_find(c->table, n->nid);
	nassert(p);

	write_unlock(&p->value_lock);
}
struct node *nonleaf_alloc_empty(NID nid, uint32_t height, uint32_t children)
{
	int i;
	struct node *node;

	nassert(height > 0);
	nassert(children > 0);
	node = xcalloc(1, sizeof(*node));
	node->nid = nid;
	node->height = height;
	node->node_op = &nop;
	node->u.n.n_children = children;
	node->u.n.pivots = xcalloc(children - 1, PIVOT_SIZE);
	node->u.n.parts = xcalloc(children, PART_SIZE);
	for (i = 0; i < (int)children; i++) {
		mutex_init(&node->u.n.parts[i].mtx);
		ness_rwlock_init(&node->u.n.parts[i].rwlock);
	}
	mutex_init(&node->attr.mtx);

	return node;
}
uint32_t leaf_size(struct node *leaf)
{
	int i;
	uint32_t sz = 0U;

	nassert(leaf->n_children == 1);
	for (i = 0; i < leaf->n_children; i++) {
		if (nessunlikely(i < (leaf->n_children - 1)))
			sz += msgsize(&leaf->pivots[i]);
		sz += sizeof(leaf->parts[i]);
		sz += lmb_memsize(leaf->parts[i].msgbuf);
	}
	sz += sizeof(*leaf);

	return sz;
}
lnode_t *list_delete(list_t *list, lnode_t *del)
{
	lnode_t *next = del->next;
	lnode_t *prev = del->prev;

	nassert(list_contains(list, del));

	prev->next = next;
	next->prev = prev;
	list->nodecount--;

	del->next = del->prev = NULL;

	return del;
}
/*
 * header layout stored on disk:
 * +----------------------+
 * |      'nesshdr*'      |
 * +----------------------+
 * |       version        |
 * +----------------------+
 * |       last-nid       |
 * +----------------------+
 * |       root-nid       |
 * +----------------------+
 * |      blocksize       |
 * +----------------------+
 * |       blockoff       |
 * +----------------------+
 */
int write_hdr_to_disk(int fd, struct hdr *hdr, DISKOFF off)
{
	int r;
	uint32_t real_size;
	uint32_t align_size;
	struct buffer *wbuf = NULL;

	nassert(hdr->root_nid >= NID_START);
	wbuf = buf_new(1 << 20);
	buf_putnstr(wbuf, "nesshdr*", 8);
	buf_putuint32(wbuf, LAYOUT_VERSION);
	buf_putuint64(wbuf, hdr->last_nid);
	buf_putuint64(wbuf, hdr->root_nid);
	buf_putuint32(wbuf, hdr->blocksize);
	buf_putuint64(wbuf, hdr->blockoff);

	uint32_t xsum;

	if (!buf_xsum(wbuf->buf, wbuf->NUL, &xsum)) {
		r = NESS_DO_XSUM_ERR;
		goto ERR;
	}
	buf_putuint32(wbuf, xsum);

	real_size = wbuf->NUL;
	align_size = ALIGN(real_size);
	buf_putnull(wbuf, align_size - real_size);
	if (ness_os_pwrite(fd, wbuf->buf, align_size, off) != 0) {
		r = NESS_WRITE_ERR;
		goto ERR;
	}

	buf_free(wbuf);
	return NESS_OK;

ERR:
	buf_free(wbuf);
	return r;
}
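/*
 * A standalone sketch of the fixed part of the header layout above:
 * magic(8) + version(4) + last-nid(8) + root-nid(8) + blocksize(4) +
 * blockoff(8) = 40 bytes before the checksum. Little-endian encoding
 * and the field values are assumptions here, not taken from the real
 * buf_put* helpers.
 */
#include <stdint.h>
#include <string.h>
#include <stdio.h>

static void put_u32(unsigned char *p, uint32_t v)
{
	p[0] = (unsigned char)v;
	p[1] = (unsigned char)(v >> 8);
	p[2] = (unsigned char)(v >> 16);
	p[3] = (unsigned char)(v >> 24);
}

static void put_u64(unsigned char *p, uint64_t v)
{
	put_u32(p, (uint32_t)v);
	put_u32(p + 4, (uint32_t)(v >> 32));
}

int main(void)
{
	unsigned char hdr[40];
	size_t pos = 0;

	memcpy(hdr + pos, "nesshdr*", 8);	pos += 8;	/* magic */
	put_u32(hdr + pos, 4);			pos += 4;	/* layout version */
	put_u64(hdr + pos, 8);			pos += 8;	/* last-nid */
	put_u64(hdr + pos, 1);			pos += 8;	/* root-nid */
	put_u32(hdr + pos, 4096);		pos += 4;	/* blocksize */
	put_u64(hdr + pos, 0);			pos += 8;	/* blockoff */

	printf("fixed header bytes before checksum: %zu\n", pos);	/* 40 */
	return 0;
}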