/* search in a node's child */ int _search_child(struct cursor *cur, struct search *so, struct node *n, int childnum) { int ret; NID child_nid; int child_to_search; struct node *child; nassert(n->height > 0); ancestors_append(cur, n->parts[childnum].msgbuf); child_nid = n->parts[childnum].child_nid; if (!cache_get_and_pin(cur->tree->cf, child_nid, (void**)&child, L_READ)) { __ERROR("cache get node error, nid [%" PRIu64 "]", child_nid); return NESS_ERR; } child_to_search = _search_in_which_child(so, child); ret = _search_node(cur, so, child, child_to_search); /* unpin */ cache_unpin(cur->tree->cf, child->cpair); return ret; }
/* * EFFECT: * - flush in background thread * ENTER: * - parent is already locked * EXIT: * - nodes are all unlocked */ void tree_flush_node_on_background(struct tree *t, struct node *parent) { LOG; int childnum; enum reactivity re; struct node *child; struct partition *part; nassert(parent->height > 0); childnum = node_find_heaviest_idx(parent); part = &parent->parts[childnum]; /* pin the child */ if (cache_get_and_pin(t->cf, part->child_nid, (void**)&child, L_WRITE) != NESS_OK) { __ERROR("cache get node error, nid [%" PRIu64 "]", part->child_nid); return; } re = get_reactivity(t, child); if (re == STABLE) { /* detach buffer from parent */ struct nmb *buf = part->ptr.u.nonleaf->buffer; node_set_dirty(parent); part->ptr.u.nonleaf->buffer = nmb_new(t->e); /* flush it in background thread */ _place_node_and_buffer_on_background(t, child, buf); cache_unpin(t->cf, parent->cpair, make_cpair_attr(parent)); } else { /* the child is reactive, we deal it in main thread */ _child_maybe_reactivity(t, parent, child); } }
/* * PROCESS: * - put cmd to root * - check root reactivity * -- FISSIBLE: split root * -- FLUSHABLE: flush root in background * ENTER: * - no nodes are locked * EXITS: * - all nodes are unlocked */ int root_put_cmd(struct tree *t, struct bt_cmd *cmd) { LOG; struct node *root; enum reactivity re; volatile int hasput = 0; enum lock_type locktype = L_READ; /* printf("||||%s", (char *)cmd->val->data); */ CHANGE_LOCK_TYPE: if (!cache_get_and_pin(t->cf, t->hdr->root_nid, (void**)&root, locktype)) return NESS_ERR; /* printf("|||%lld", root->nid); */ if (!hasput) { node_put_cmd(t, root, cmd); hasput = 1; } re = get_reactivity(t, root); switch (re) { case STABLE: printf("|||STABLE"); cache_unpin(t->cf, root->cpair, make_cpair_attr(root)); break; case FISSIBLE: printf("|||FISSIBLE"); if (locktype == L_READ) { cache_unpin(t->cf, root->cpair, make_cpair_attr(root)); locktype = L_WRITE; goto CHANGE_LOCK_TYPE; } _root_fissible(t, root); break; case FLUSHBLE: printf("|||FLUSHBLE"); if (locktype == L_READ) { cache_unpin(t->cf, root->cpair, make_cpair_attr(root)); locktype = L_WRITE; goto CHANGE_LOCK_TYPE; } tree_flush_node_on_background(t, root); break; } return NESS_OK; }
/* * |key44, key88| * / \ * |key10, key20| |key90| * / | \ \ * |msgbuf0| |msgbuf1| |msgbuf2| |msgbuf3| * * (a tree with height 2) * * cursor search is very similar to depth-first-search algorithm. * for cursor_seektofirst operation, the root-to-leaf path is: * key44 -> key10 -> msgbuf0 * and do the inner sliding along with the msgbuf. * if we get the end of one leaf, CURSOR_EOF will be returned to upper on, * and we also set search->pivot_bound = key10, for the next time, * the root-to-leaf path(restart with a jump) will be: * key44 -> key10 -> msgbuf1 */ void _tree_search(struct cursor * cur, struct search * so) { int r; NID root_nid; int child_to_search; struct buftree *t; struct node *root; t = cur->tree; TRY_AGAIN: root_nid = t->hdr->root_nid; if (!cache_get_and_pin(t->cf, root_nid, (void**)&root, L_READ)) { __ERROR("cache get root node error, nid [%" PRIu64 "]", root_nid); return; } child_to_search = _search_in_which_child(so, root); r = _search_node(cur, so, root, child_to_search); /* unpin */ cache_unpin(t->cf, root->cpair); switch (r) { case CURSOR_CONTINUE: /* got the end of leaf */ goto TRY_AGAIN; break; case CURSOR_TRY_AGAIN: /* got the end of node */ goto TRY_AGAIN; break; case CURSOR_EOF: break; default: break; } }
/* * |key44, key88| * / \ * |key10, key20| |key90| * / | \ \ * |msgbuf0| |msgbuf1| |msgbuf2| |msgbuf3| * * (a tree with height 2) * * cursor search is very similar to depth-first-search algorithm. * for cursor_seektofirst operation, the root-to-leaf path is: * key44 -> key10 -> msgbuf0 * and do the inner sliding along with the msgbuf. * if we get the end of one leaf, CURSOR_EOF will be returned to upper on, * and we also set search->pivot_bound = key10, for the next time, * the root-to-leaf path(restart with a jump) will be: * key44 -> key10 -> msgbuf1 */ void _tree_search(struct cursor * cur, struct search * so) { int r; NID root_nid; int child_to_search; struct tree *t; struct node *root; t = cur->tree; try_again: root_nid = t->hdr->root_nid; if (cache_get_and_pin(t->cf, root_nid, &root, L_READ) < 0) { __ERROR("cache get root node error, nid [%" PRIu64 "]", root_nid); return; } child_to_search = _search_in_which_child(so, root); r = _search_node(cur, so, root, child_to_search); /* unpin */ cache_unpin_readonly(t->cf, root); switch (r) { case CURSOR_CONTINUE: break; case CURSOR_TRY_AGAIN: goto try_again; break; case CURSOR_EOF: break; default: break; } }
struct tree *tree_open(const char *dbname, struct env *e, struct tree_callback *tcb) { int fd; int flag; mode_t mode; int is_create = 0; struct tree *t; struct node *root; struct cache_file *cf; t = xcalloc(1, sizeof(*t)); t->e = e; mode = S_IRWXU | S_IRWXG | S_IRWXO; flag = O_RDWR | O_BINARY; if (e->use_directio) fd = ness_os_open_direct(dbname, flag, mode); else fd = ness_os_open(dbname, flag, mode); if (fd == -1) { if (e->use_directio) fd = ness_os_open(dbname, flag | O_CREAT, mode); else fd = ness_os_open_direct(dbname, flag | O_CREAT, mode); if (fd == -1) goto ERR; is_create = 1; } t->fd = fd; t->hdr = hdr_new(e); /* tree header */ if (!is_create) { tcb->fetch_hdr_cb(fd, t->hdr); } /* create cache file */ cf = cache_file_create(e->cache, t->fd, t->hdr, tcb); t->cf = cf; /* tree root node */ if (is_create) { NID nid = hdr_next_nid(t->hdr); node_create(nid, 0, 1, t->hdr->version, t->e, &root); cache_put_and_pin(cf, nid, root); root->isroot = 1; node_set_dirty(root); cache_unpin(cf, root->cpair, make_cpair_attr(root)); t->hdr->root_nid = root->nid; __DEBUG("create new root, NID %"PRIu64, root->nid); } else { /* get the root node */ if (cache_get_and_pin(cf, t->hdr->root_nid, (void**)&root, L_READ) != NESS_OK) __PANIC("get root from cache error [%" PRIu64 "]", t->hdr->root_nid); root->isroot = 1; cache_unpin(cf, root->cpair, make_cpair_attr(root)); __DEBUG("fetch root, NID %"PRIu64, root->nid); } return t; ERR: xfree(t); return NESS_ERR; }
void _flush_buffer_to_child(struct tree *t, struct node *child, struct nmb *buf) { struct mb_iter iter; mb_iter_init(&iter, buf->pma); while (mb_iter_next(&iter)) { /* TODO(BohuTANG): check msn */ struct nmb_values nvalues; nmb_get_values(&iter, &nvalues); struct bt_cmd cmd = { .msn = nvalues.msn, .type = nvalues.type, .key = &nvalues.key, .val = &nvalues.val, .xidpair = nvalues.xidpair }; node_put_cmd(t, child, &cmd); } } void _flush_some_child(struct tree *t, struct node *parent); /* * PROCESS: * - check child reactivity * - if FISSIBLE: split child * - if FLUSHBLE: flush buffer from child * ENTER: * - parent is already locked * - child is already locked * EXIT: * - parent is unlocked * - no nodes are locked */ void _child_maybe_reactivity(struct tree *t, struct node *parent, struct node *child) { enum reactivity re = get_reactivity(t, child); switch (re) { case STABLE: cache_unpin(t->cf, child->cpair, make_cpair_attr(child)); cache_unpin(t->cf, parent->cpair, make_cpair_attr(parent)); break; case FISSIBLE: node_split_child(t, parent, child); cache_unpin(t->cf, child->cpair, make_cpair_attr(child)); cache_unpin(t->cf, parent->cpair, make_cpair_attr(parent)); break; case FLUSHBLE: cache_unpin(t->cf, parent->cpair, make_cpair_attr(parent)); _flush_some_child(t, child); break; } } /* * PROCESS: * - pick a heaviest child of parent * - flush from parent to child * - maybe split/flush child recursively * ENTER: * - parent is already locked * EXIT: * - parent is unlocked * - no nodes are locked */ void _flush_some_child(struct tree *t, struct node *parent) { int childnum; enum reactivity re; struct node *child; struct partition *part; struct nmb *buffer; struct timespec t1, t2; childnum = node_find_heaviest_idx(parent); nassert(childnum < parent->n_children); part = &parent->parts[childnum]; buffer = part->ptr.u.nonleaf->buffer; if (cache_get_and_pin(t->cf, part->child_nid, (void**)&child, L_WRITE) != NESS_OK) { __ERROR("cache get node error, nid [%" PRIu64 "]", part->child_nid); return; } ngettime(&t1); re = get_reactivity(t, child); if (re == STABLE) { node_set_dirty(parent); part->ptr.u.nonleaf->buffer = nmb_new(t->e); _flush_buffer_to_child(t, child, buffer); nmb_free(buffer); } ngettime(&t2); status_add(&t->e->status->tree_flush_child_costs, time_diff_ms(t1, t2)); status_increment(&t->e->status->tree_flush_child_nums); _child_maybe_reactivity(t, parent, child); }