static void _root_split(struct tree *t, struct node *new_root, struct node *old_root) { struct node *a; struct node *b; struct msg *split_key = NULL; __DEBUG("root split begin, old NID %"PRIu64" , height %d" , old_root->nid , old_root->height); if (old_root->height > 0 || old_root->n_children > 2) _node_split(t, old_root, &a, &b, &split_key); else _leaf_and_lmb_split(t, old_root, &a, &b, &split_key); /* swap two roots */ _root_swap(new_root, old_root); msgcpy(&new_root->pivots[0], split_key); new_root->parts[0].child_nid = a->nid; new_root->parts[1].child_nid = b->nid; msgfree(split_key); cache_unpin(t->cf, b->cpair, make_cpair_attr(b)); node_set_dirty(old_root); node_set_dirty(new_root); t->hdr->height++; status_increment(&t->e->status->tree_root_new_nums); __DEBUG("root split end, old NID %"PRIu64, old_root->nid); }
/* * EFFECT: * - split the fissible root */ void _root_fissible(struct tree *t, struct node *root) { struct node *new_root; uint32_t new_root_height = 1; uint32_t new_root_children = 2; /* alloc a nonleaf node with 2 children */ NID nid = hdr_next_nid(t->hdr); node_create(nid, new_root_height, new_root_children, t->hdr->version, t->e, &new_root); cache_put_and_pin(t->cf, nid, new_root); _root_split(t, new_root, root); cache_unpin(t->cf, root->cpair, make_cpair_attr(root)); cache_unpin(t->cf, new_root->cpair, make_cpair_attr(new_root)); }
/* * EFFECT: * - split the child * - add a new pivot(split-key) to parent * ENTER: * - parent is already locked(L_WRITE) * - child is already locked(L_WRITE) * EXITS: * - parent is locked * - child is locked */ void node_split_child(struct tree *t, struct node *parent, struct node *child) { int child_num; struct node *a; struct node *b; struct msg *split_key; if (child->height > 0 || child->n_children > 2) _node_split(t, child, &a, &b, &split_key); else _leaf_and_lmb_split(t, child, &a, &b, &split_key); child_num = node_partition_idx(parent, split_key); /* add pivot to parent */ _add_pivot_to_parent(t, parent, child_num, a, b, split_key); cache_unpin(t->cf, b->cpair, make_cpair_attr(b)); msgfree(split_key); if (child->height > 0) status_increment(&t->e->status->tree_nonleaf_split_nums); else status_increment(&t->e->status->tree_leaf_split_nums); }
/* * EFFECT: * - flush in background thread * ENTER: * - parent is already locked * EXIT: * - nodes are all unlocked */ void tree_flush_node_on_background(struct tree *t, struct node *parent) { LOG; int childnum; enum reactivity re; struct node *child; struct partition *part; nassert(parent->height > 0); childnum = node_find_heaviest_idx(parent); part = &parent->parts[childnum]; /* pin the child */ if (cache_get_and_pin(t->cf, part->child_nid, (void**)&child, L_WRITE) != NESS_OK) { __ERROR("cache get node error, nid [%" PRIu64 "]", part->child_nid); return; } re = get_reactivity(t, child); if (re == STABLE) { /* detach buffer from parent */ struct nmb *buf = part->ptr.u.nonleaf->buffer; node_set_dirty(parent); part->ptr.u.nonleaf->buffer = nmb_new(t->e); /* flush it in background thread */ _place_node_and_buffer_on_background(t, child, buf); cache_unpin(t->cf, parent->cpair, make_cpair_attr(parent)); } else { /* the child is reactive, we deal it in main thread */ _child_maybe_reactivity(t, parent, child); } }
/* * EFFECT: * - do flush in a background thread * PROCESS: * - if buf is NULL, we will do _flush_some_child * - if buf is NOT NULL, we will do _flush_buffer_to_child * ENTER: * - fe->node is already locked * EXIT: * - nodes are unlocked */ static void _flush_node_func(void *fe) { enum reactivity re; struct flusher_extra *extra = (struct flusher_extra*)fe; struct tree *t = extra->tree; struct node *n = extra->node; struct nmb *buf = extra->buffer; node_set_dirty(n); if (buf) { _flush_buffer_to_child(t, n, buf); nmb_free(buf); /* check the child node */ re = get_reactivity(t, n); if (re == FLUSHBLE) _flush_some_child(t, n); else cache_unpin(t->cf, n->cpair, make_cpair_attr(n)); } else { /* we want flush some buffer from n */ _flush_some_child(t, n); } xfree(extra); }
/* * PROCESS: * - put cmd to root * - check root reactivity * -- FISSIBLE: split root * -- FLUSHABLE: flush root in background * ENTER: * - no nodes are locked * EXITS: * - all nodes are unlocked */ int root_put_cmd(struct tree *t, struct bt_cmd *cmd) { LOG; struct node *root; enum reactivity re; volatile int hasput = 0; enum lock_type locktype = L_READ; /* printf("||||%s", (char *)cmd->val->data); */ CHANGE_LOCK_TYPE: if (!cache_get_and_pin(t->cf, t->hdr->root_nid, (void**)&root, locktype)) return NESS_ERR; /* printf("|||%lld", root->nid); */ if (!hasput) { node_put_cmd(t, root, cmd); hasput = 1; } re = get_reactivity(t, root); switch (re) { case STABLE: printf("|||STABLE"); cache_unpin(t->cf, root->cpair, make_cpair_attr(root)); break; case FISSIBLE: printf("|||FISSIBLE"); if (locktype == L_READ) { cache_unpin(t->cf, root->cpair, make_cpair_attr(root)); locktype = L_WRITE; goto CHANGE_LOCK_TYPE; } _root_fissible(t, root); break; case FLUSHBLE: printf("|||FLUSHBLE"); if (locktype == L_READ) { cache_unpin(t->cf, root->cpair, make_cpair_attr(root)); locktype = L_WRITE; goto CHANGE_LOCK_TYPE; } tree_flush_node_on_background(t, root); break; } return NESS_OK; }
struct tree *tree_open(const char *dbname, struct env *e, struct tree_callback *tcb) { int fd; int flag; mode_t mode; int is_create = 0; struct tree *t; struct node *root; struct cache_file *cf; t = xcalloc(1, sizeof(*t)); t->e = e; mode = S_IRWXU | S_IRWXG | S_IRWXO; flag = O_RDWR | O_BINARY; if (e->use_directio) fd = ness_os_open_direct(dbname, flag, mode); else fd = ness_os_open(dbname, flag, mode); if (fd == -1) { if (e->use_directio) fd = ness_os_open(dbname, flag | O_CREAT, mode); else fd = ness_os_open_direct(dbname, flag | O_CREAT, mode); if (fd == -1) goto ERR; is_create = 1; } t->fd = fd; t->hdr = hdr_new(e); /* tree header */ if (!is_create) { tcb->fetch_hdr_cb(fd, t->hdr); } /* create cache file */ cf = cache_file_create(e->cache, t->fd, t->hdr, tcb); t->cf = cf; /* tree root node */ if (is_create) { NID nid = hdr_next_nid(t->hdr); node_create(nid, 0, 1, t->hdr->version, t->e, &root); cache_put_and_pin(cf, nid, root); root->isroot = 1; node_set_dirty(root); cache_unpin(cf, root->cpair, make_cpair_attr(root)); t->hdr->root_nid = root->nid; __DEBUG("create new root, NID %"PRIu64, root->nid); } else { /* get the root node */ if (cache_get_and_pin(cf, t->hdr->root_nid, (void**)&root, L_READ) != NESS_OK) __PANIC("get root from cache error [%" PRIu64 "]", t->hdr->root_nid); root->isroot = 1; cache_unpin(cf, root->cpair, make_cpair_attr(root)); __DEBUG("fetch root, NID %"PRIu64, root->nid); } return t; ERR: xfree(t); return NESS_ERR; }
void _flush_buffer_to_child(struct tree *t, struct node *child, struct nmb *buf) { struct mb_iter iter; mb_iter_init(&iter, buf->pma); while (mb_iter_next(&iter)) { /* TODO(BohuTANG): check msn */ struct nmb_values nvalues; nmb_get_values(&iter, &nvalues); struct bt_cmd cmd = { .msn = nvalues.msn, .type = nvalues.type, .key = &nvalues.key, .val = &nvalues.val, .xidpair = nvalues.xidpair }; node_put_cmd(t, child, &cmd); } } void _flush_some_child(struct tree *t, struct node *parent); /* * PROCESS: * - check child reactivity * - if FISSIBLE: split child * - if FLUSHBLE: flush buffer from child * ENTER: * - parent is already locked * - child is already locked * EXIT: * - parent is unlocked * - no nodes are locked */ void _child_maybe_reactivity(struct tree *t, struct node *parent, struct node *child) { enum reactivity re = get_reactivity(t, child); switch (re) { case STABLE: cache_unpin(t->cf, child->cpair, make_cpair_attr(child)); cache_unpin(t->cf, parent->cpair, make_cpair_attr(parent)); break; case FISSIBLE: node_split_child(t, parent, child); cache_unpin(t->cf, child->cpair, make_cpair_attr(child)); cache_unpin(t->cf, parent->cpair, make_cpair_attr(parent)); break; case FLUSHBLE: cache_unpin(t->cf, parent->cpair, make_cpair_attr(parent)); _flush_some_child(t, child); break; } } /* * PROCESS: * - pick a heaviest child of parent * - flush from parent to child * - maybe split/flush child recursively * ENTER: * - parent is already locked * EXIT: * - parent is unlocked * - no nodes are locked */ void _flush_some_child(struct tree *t, struct node *parent) { int childnum; enum reactivity re; struct node *child; struct partition *part; struct nmb *buffer; struct timespec t1, t2; childnum = node_find_heaviest_idx(parent); nassert(childnum < parent->n_children); part = &parent->parts[childnum]; buffer = part->ptr.u.nonleaf->buffer; if (cache_get_and_pin(t->cf, part->child_nid, (void**)&child, L_WRITE) != NESS_OK) { __ERROR("cache get node error, nid [%" PRIu64 "]", part->child_nid); return; } ngettime(&t1); re = get_reactivity(t, child); if (re == STABLE) { node_set_dirty(parent); part->ptr.u.nonleaf->buffer = nmb_new(t->e); _flush_buffer_to_child(t, child, buffer); nmb_free(buffer); } ngettime(&t2); status_add(&t->e->status->tree_flush_child_costs, time_diff_ms(t1, t2)); status_increment(&t->e->status->tree_flush_child_nums); _child_maybe_reactivity(t, parent, child); }