/* * __wt_btcur_insert -- * Insert a record into the tree. */ int __wt_btcur_insert(WT_CURSOR_BTREE *cbt) { WT_BTREE *btree; WT_CURSOR *cursor; WT_DECL_RET; WT_SESSION_IMPL *session; btree = cbt->btree; cursor = &cbt->iface; session = (WT_SESSION_IMPL *)cursor->session; WT_STAT_CONN_INCR(session, cursor_insert); WT_STAT_DATA_INCR(session, cursor_insert); WT_STAT_DATA_INCRV(session, cursor_insert_bytes, cursor->key.size + cursor->value.size); if (btree->type == BTREE_ROW) WT_RET(__cursor_size_chk(session, &cursor->key)); WT_RET(__cursor_size_chk(session, &cursor->value)); /* * The tree is no longer empty: eviction should pay attention to it, * and it's no longer possible to bulk-load into it. */ if (btree->bulk_load_ok) { btree->bulk_load_ok = false; __wt_btree_evictable(session, true); } retry: WT_RET(__cursor_func_init(cbt, true)); switch (btree->type) { case BTREE_COL_FIX: case BTREE_COL_VAR: /* * If WT_CURSTD_APPEND is set, insert a new record (ignoring * the application's record number). The real record number * is assigned by the serialized append operation. */ if (F_ISSET(cursor, WT_CURSTD_APPEND)) cbt->iface.recno = WT_RECNO_OOB; WT_ERR(__cursor_col_search(session, cbt, NULL)); /* * If not overwriting, fail if the key exists. Creating a * record past the end of the tree in a fixed-length * column-store implicitly fills the gap with empty records. * Fail in that case, the record exists. */ if (!F_ISSET(cursor, WT_CURSTD_OVERWRITE) && ((cbt->compare == 0 && __cursor_valid(cbt, NULL)) || (cbt->compare != 0 && __cursor_fix_implicit(btree, cbt)))) WT_ERR(WT_DUPLICATE_KEY); WT_ERR(__cursor_col_modify(session, cbt, false)); if (F_ISSET(cursor, WT_CURSTD_APPEND)) cbt->iface.recno = cbt->recno; break; case BTREE_ROW: WT_ERR(__cursor_row_search(session, cbt, NULL, true)); /* * If not overwriting, fail if the key exists, else insert the * key/value pair. */ if (!F_ISSET(cursor, WT_CURSTD_OVERWRITE) && cbt->compare == 0 && __cursor_valid(cbt, NULL)) WT_ERR(WT_DUPLICATE_KEY); ret = __cursor_row_modify(session, cbt, false); break; } err: if (ret == WT_RESTART) { WT_STAT_CONN_INCR(session, cursor_restart); WT_STAT_DATA_INCR(session, cursor_restart); goto retry; } /* Insert doesn't maintain a position across calls, clear resources. */ if (ret == 0) WT_TRET(__curfile_leave(cbt)); if (ret != 0) WT_TRET(__cursor_reset(cbt)); return (ret); }
/* * __wt_btcur_prev -- * Move to the previous record in the tree. */ int __wt_btcur_prev(WT_CURSOR_BTREE *cbt, bool truncating) { WT_CURSOR *cursor; WT_DECL_RET; WT_PAGE *page; WT_SESSION_IMPL *session; uint32_t flags; bool newpage; cursor = &cbt->iface; session = (WT_SESSION_IMPL *)cbt->iface.session; WT_STAT_CONN_INCR(session, cursor_prev); WT_STAT_DATA_INCR(session, cursor_prev); F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET); WT_RET(__cursor_func_init(cbt, false)); /* * If we aren't already iterating in the right direction, there's * some setup to do. */ if (!F_ISSET(cbt, WT_CBT_ITERATE_PREV)) __wt_btcur_iterate_setup(cbt); /* * Walk any page we're holding until the underlying call returns not- * found. Then, move to the previous page, until we reach the start * of the file. */ flags = WT_READ_PREV | WT_READ_SKIP_INTL; /* tree walk flags */ LF_SET(WT_READ_NO_SPLIT); /* don't try to split */ if (truncating) LF_SET(WT_READ_TRUNCATE); for (newpage = false;; newpage = true) { page = cbt->ref == NULL ? NULL : cbt->ref->page; /* * Column-store pages may have appended entries. Handle it * separately from the usual cursor code, it's in a simple * format. */ if (newpage && page != NULL && page->type != WT_PAGE_ROW_LEAF && (cbt->ins_head = WT_COL_APPEND(page)) != NULL) F_SET(cbt, WT_CBT_ITERATE_APPEND); if (F_ISSET(cbt, WT_CBT_ITERATE_APPEND)) { switch (page->type) { case WT_PAGE_COL_FIX: ret = __cursor_fix_append_prev(cbt, newpage); break; case WT_PAGE_COL_VAR: ret = __cursor_var_append_prev(cbt, newpage); break; WT_ILLEGAL_VALUE_ERR(session); } if (ret == 0) break; F_CLR(cbt, WT_CBT_ITERATE_APPEND); if (ret != WT_NOTFOUND) break; newpage = true; } if (page != NULL) { switch (page->type) { case WT_PAGE_COL_FIX: ret = __cursor_fix_prev(cbt, newpage); break; case WT_PAGE_COL_VAR: ret = __cursor_var_prev(cbt, newpage); break; case WT_PAGE_ROW_LEAF: ret = __cursor_row_prev(cbt, newpage); break; WT_ILLEGAL_VALUE_ERR(session); } if (ret != WT_NOTFOUND) break; } /* * If we saw a lot of deleted records on this page, or we went * all the way through a page and only saw deleted records, try * to evict the page when we release it. Otherwise repeatedly * deleting from the beginning of a tree can have quadratic * performance. Take care not to force eviction of pages that * are genuinely empty, in new trees. */ if (page != NULL && (cbt->page_deleted_count > WT_BTREE_DELETE_THRESHOLD || (newpage && cbt->page_deleted_count > 0))) __wt_page_evict_soon(session, cbt->ref); cbt->page_deleted_count = 0; WT_ERR(__wt_tree_walk(session, &cbt->ref, flags)); WT_ERR_TEST(cbt->ref == NULL, WT_NOTFOUND); } #ifdef HAVE_DIAGNOSTIC if (ret == 0) WT_ERR(__wt_cursor_key_order_check(session, cbt, false)); #endif if (ret == 0) F_SET(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT); err: if (ret != 0) WT_TRET(__cursor_reset(cbt)); return (ret); }
/* * __wt_txn_rollback -- * Roll back the current transaction. */ int __wt_txn_rollback(WT_SESSION_IMPL *session, const char *cfg[]) { WT_DECL_RET; WT_TXN *txn; WT_TXN_OP *op; u_int i; bool readonly; WT_UNUSED(cfg); txn = &session->txn; readonly = txn->mod_count == 0; WT_ASSERT(session, F_ISSET(txn, WT_TXN_RUNNING)); /* Rollback notification. */ if (txn->notify != NULL) WT_TRET(txn->notify->notify(txn->notify, (WT_SESSION *)session, txn->id, 0)); /* Rollback updates. */ for (i = 0, op = txn->mod; i < txn->mod_count; i++, op++) { /* Metadata updates are never rolled back. */ if (op->fileid == WT_METAFILE_ID) continue; switch (op->type) { case WT_TXN_OP_BASIC: case WT_TXN_OP_BASIC_TS: case WT_TXN_OP_INMEM: WT_ASSERT(session, op->u.upd->txnid == txn->id); WT_ASSERT(session, S2C(session)->cache->las_fileid == 0 || op->fileid != S2C(session)->cache->las_fileid); op->u.upd->txnid = WT_TXN_ABORTED; break; case WT_TXN_OP_REF: __wt_delete_page_rollback(session, op->u.ref); break; case WT_TXN_OP_TRUNCATE_COL: case WT_TXN_OP_TRUNCATE_ROW: /* * Nothing to do: these operations are only logged for * recovery. The in-memory changes will be rolled back * with a combination of WT_TXN_OP_REF and * WT_TXN_OP_INMEM operations. */ break; } /* Free any memory allocated for the operation. */ __wt_txn_op_free(session, op); } txn->mod_count = 0; __wt_txn_release(session); /* * We're between transactions, if we need to block for eviction, it's * a good time to do so. Note that we must ignore any error return * because the user's data is committed. */ if (!readonly) (void)__wt_cache_eviction_check(session, false, false, NULL); return (ret); }
/* * ex_aci -- * Append, change, insert in ex. */ static int ex_aci(SCR *sp, EXCMD *cmdp, enum which cmd) { CHAR_T *p, *t; GS *gp; TEXT *tp; TEXTH tiq[] = {{ 0 }}; recno_t cnt = 0, lno; size_t len; u_int32_t flags; int need_newline; gp = sp->gp; NEEDFILE(sp, cmdp); /* * If doing a change, replace lines for as long as possible. Then, * append more lines or delete remaining lines. Changes to an empty * file are appends, inserts are the same as appends to the previous * line. * * !!! * Set the address to which we'll append. We set sp->lno to this * address as well so that autoindent works correctly when get text * from the user. */ lno = cmdp->addr1.lno; sp->lno = lno; if ((cmd == CHANGE || cmd == INSERT) && lno != 0) --lno; /* * !!! * If the file isn't empty, cut changes into the unnamed buffer. */ if (cmd == CHANGE && cmdp->addr1.lno != 0 && (cut(sp, NULL, &cmdp->addr1, &cmdp->addr2, CUT_LINEMODE) || del(sp, &cmdp->addr1, &cmdp->addr2, 1))) return (1); /* * !!! * Anything that was left after the command separator becomes part * of the inserted text. Apparently, it was common usage to enter: * * :g/pattern/append|stuff1 * * and append the line of text "stuff1" to the lines containing the * pattern. It was also historically legal to enter: * * :append|stuff1 * stuff2 * . * * and the text on the ex command line would be appended as well as * the text inserted after it. There was an historic bug however, * that the user had to enter *two* terminating lines (the '.' lines) * to terminate text input mode, in this case. This whole thing * could be taken too far, however. Entering: * * :append|stuff1\ * stuff2 * stuff3 * . * * i.e. mixing and matching the forms confused the historic vi, and, * not only did it take two terminating lines to terminate text input * mode, but the trailing backslashes were retained on the input. We * match historic practice except that we discard the backslashes. * * Input lines specified on the ex command line lines are separated by * <newline>s. If there is a trailing delimiter an empty line was * inserted. There may also be a leading delimiter, which is ignored * unless it's also a trailing delimiter. It is possible to encounter * a termination line, i.e. a single '.', in a global command, but not * necessary if the text insert command was the last of the global * commands. */ if (cmdp->save_cmdlen != 0) { for (p = cmdp->save_cmd, len = cmdp->save_cmdlen; len > 0; p = t) { for (t = p; len > 0 && t[0] != '\n'; ++t, --len); if (t != p || len == 0) { if (F_ISSET(sp, SC_EX_GLOBAL) && t - p == 1 && p[0] == '.') { ++t; if (len > 0) --len; break; } if (db_append(sp, 1, lno++, p, t - p)) return (1); } if (len != 0) { ++t; if (--len == 0 && db_append(sp, 1, lno++, NULL, 0)) return (1); } } /* * If there's any remaining text, we're in a global, and * there's more command to parse. * * !!! * We depend on the fact that non-global commands will eat the * rest of the command line as text input, and before getting * any text input from the user. Otherwise, we'd have to save * off the command text before or during the call to the text * input function below. */ if (len != 0) cmdp->save_cmd = t; cmdp->save_cmdlen = len; } if (F_ISSET(sp, SC_EX_GLOBAL)) { if ((sp->lno = lno) == 0 && db_exist(sp, 1)) sp->lno = 1; return (0); } /* * If not in a global command, read from the terminal. * * If this code is called by vi, we want to reset the terminal and use * ex's line get routine. It actually works fine if we use vi's get * routine, but it doesn't look as nice. Maybe if we had a separate * window or something, but getting a line at a time looks awkward. * However, depending on the screen that we're using, that may not * be possible. */ if (F_ISSET(sp, SC_VI)) { if (gp->scr_screen(sp, SC_EX)) { ex_wemsg(sp, cmdp->cmd->name, EXM_NOCANON); return (1); } /* If we're still in the vi screen, move out explicitly. */ need_newline = !F_ISSET(sp, SC_SCR_EXWROTE); F_SET(sp, SC_SCR_EX | SC_SCR_EXWROTE); if (need_newline) (void)ex_puts(sp, "\n"); /* * !!! * Users of historical versions of vi sometimes get confused * when they enter append mode, and can't seem to get out of * it. Give them an informational message. */ (void)ex_puts(sp, msg_cat(sp, "273|Entering ex input mode.", NULL)); (void)ex_puts(sp, "\n"); (void)ex_fflush(sp); } /* * Set input flags; the ! flag turns off autoindent for append, * change and insert. */ LF_INIT(TXT_DOTTERM | TXT_NUMBER); if (!FL_ISSET(cmdp->iflags, E_C_FORCE) && O_ISSET(sp, O_AUTOINDENT)) LF_SET(TXT_AUTOINDENT); if (O_ISSET(sp, O_BEAUTIFY)) LF_SET(TXT_BEAUTIFY); /* * This code can't use the common screen TEXTH structure (sp->tiq), * as it may already be in use, e.g. ":append|s/abc/ABC/" would fail * as we are only halfway through the text when the append code fires. * Use a local structure instead. (The ex code would have to use a * local structure except that we're guaranteed to finish remaining * characters in the common TEXTH structure when they were inserted * into the file, above.) */ TAILQ_INIT(tiq); if (ex_txt(sp, tiq, 0, flags)) return (1); TAILQ_FOREACH(tp, tiq, q) { if (db_append(sp, 1, lno++, tp->lb, tp->len)) return (1); ++cnt; } /* * Set sp->lno to the final line number value (correcting for a * possible 0 value) as that's historically correct for the final * line value, whether or not the user entered any text. */ if ((sp->lno = lno) == 0 && db_exist(sp, 1)) sp->lno = 1; return (0); }
/* * cl_event -- * Return a single event. * * PUBLIC: int cl_event __P((SCR *, EVENT *, u_int32_t, int)); */ int cl_event(SCR *sp, EVENT *evp, u_int32_t flags, int ms) { struct timeval t, *tp; CL_PRIVATE *clp; size_t lines, columns; int changed, nr = 0; CHAR_T *wp; size_t wlen; int rc; /* * Queue signal based events. We never clear SIGHUP or SIGTERM events, * so that we just keep returning them until the editor dies. */ clp = CLP(sp); retest: if (LF_ISSET(EC_INTERRUPT) || F_ISSET(clp, CL_SIGINT)) { if (F_ISSET(clp, CL_SIGINT)) { F_CLR(clp, CL_SIGINT); evp->e_event = E_INTERRUPT; } else evp->e_event = E_TIMEOUT; return (0); } if (F_ISSET(clp, CL_SIGHUP | CL_SIGTERM | CL_SIGWINCH)) { if (F_ISSET(clp, CL_SIGHUP)) { evp->e_event = E_SIGHUP; return (0); } if (F_ISSET(clp, CL_SIGTERM)) { evp->e_event = E_SIGTERM; return (0); } if (F_ISSET(clp, CL_SIGWINCH)) { F_CLR(clp, CL_SIGWINCH); if (cl_ssize(sp, 1, &lines, &columns, &changed)) return (1); if (changed) { (void)cl_resize(sp, lines, columns); evp->e_event = E_WRESIZE; return (0); } /* No real change, ignore the signal. */ } } /* Set timer. */ if (ms == 0) tp = NULL; else { t.tv_sec = ms / 1000; t.tv_usec = (ms % 1000) * 1000; tp = &t; } /* Read input characters. */ read: switch (cl_read(sp, LF_ISSET(EC_QUOTED | EC_RAW), clp->ibuf + clp->skip, SIZE(clp->ibuf) - clp->skip, &nr, tp)) { case INP_OK: rc = INPUT2INT5(sp, clp->cw, clp->ibuf, nr + clp->skip, wp, wlen); evp->e_csp = wp; evp->e_len = wlen; evp->e_event = E_STRING; if (rc < 0) { int n = -rc; memmove(clp->ibuf, clp->ibuf + nr + clp->skip - n, n); clp->skip = n; if (wlen == 0) goto read; } else if (rc == 0) clp->skip = 0; else msgq(sp, M_ERR, "323|Invalid input. Truncated."); break; case INP_EOF: evp->e_event = E_EOF; break; case INP_ERR: evp->e_event = E_ERR; break; case INP_INTR: goto retest; case INP_TIMEOUT: evp->e_event = E_TIMEOUT; break; default: abort(); } return (0); }
/* * __rec_review -- * Get exclusive access to the page and review the page and its subtree * for conditions that would block its eviction. * * The ref and page arguments may appear to be redundant, because usually * ref->page == page and page->ref == ref. However, we need both because * (a) there are cases where ref == NULL (e.g., for root page or during * salvage), and (b) we can't safely look at page->ref until we have a * hazard reference. */ static int __rec_review(WT_SESSION_IMPL *session, WT_REF *ref, WT_PAGE *page, uint32_t flags, int top) { WT_DECL_RET; WT_PAGE_MODIFY *mod; WT_TXN *txn; uint32_t i; txn = &session->txn; /* * Get exclusive access to the page if our caller doesn't have the tree * locked down. */ if (!LF_ISSET(WT_REC_SINGLE)) WT_RET(__hazard_exclusive(session, ref, top)); /* * Recurse through the page's subtree: this happens first because we * have to write pages in depth-first order, otherwise we'll dirty * pages after we've written them. */ if (page->type == WT_PAGE_COL_INT || page->type == WT_PAGE_ROW_INT) WT_REF_FOREACH(page, ref, i) switch (ref->state) { case WT_REF_DISK: /* On-disk */ case WT_REF_DELETED: /* On-disk, deleted */ break; case WT_REF_MEM: /* In-memory */ WT_RET(__rec_review( session, ref, ref->page, flags, 0)); break; case WT_REF_EVICT_WALK: /* Walk point */ case WT_REF_LOCKED: /* Being evicted */ case WT_REF_READING: /* Being read */ return (EBUSY); } /* * Check if this page can be evicted: * * Fail if the top-level page is a page expected to be removed from the * tree as part of eviction (an empty page or a split-merge page). Note * "split" pages are NOT included in this test, because a split page can * be separately evicted, at which point it's replaced in its parent by * a reference to a split-merge page. That's a normal part of the leaf * page life-cycle if it grows too large and must be pushed out of the * cache. There is also an exception for empty pages, the root page may * be empty when evicted, but that only happens when the tree is closed. * * Fail if any page in the top-level page's subtree can't be merged into * its parent. You can't evict a page that references such in-memory * pages, they must be evicted first. The test is necessary but should * not fire much: the LRU-based eviction code is biased for leaf pages, * an internal page shouldn't be selected for LRU-based eviction until * its children have been evicted. Empty, split and split-merge pages * are all included in this test, they can all be merged into a parent. * * We have to write dirty pages to know their final state, a page marked * empty may have had records added since reconciliation, a page marked * split may have had records deleted and no longer need to split. * Split-merge pages are the exception: they can never be change into * anything other than a split-merge page and are merged regardless of * being clean or dirty. * * Writing the page is expensive, do a cheap test first: if it doesn't * appear a subtree page can be merged, quit. It's possible the page * has been emptied since it was last reconciled, and writing it before * testing might be worthwhile, but it's more probable we're attempting * to evict an internal page with live children, and that's a waste of * time. * * We don't do a cheap test for the top-level page: we're not called * to evict split-merge pages, which means the only interesting case * is an empty page. If the eviction thread picked an "empty" page * for eviction, it must have had reason, probably the empty page got * really, really full. */ mod = page->modify; if (!top && (mod == NULL || !F_ISSET(mod, WT_PM_REC_EMPTY | WT_PM_REC_SPLIT | WT_PM_REC_SPLIT_MERGE))) return (EBUSY); /* If the page is dirty, write it so we know the final state. */ if (__wt_page_is_modified(page) && !F_ISSET(mod, WT_PM_REC_SPLIT_MERGE)) { ret = __wt_rec_write(session, page, NULL, flags); /* If there are unwritten changes on the page, give up. */ if (ret == 0 && !LF_ISSET(WT_REC_SINGLE) && __wt_page_is_modified(page)) ret = EBUSY; if (ret == EBUSY) { WT_VERBOSE_RET(session, evict, "page %p written but not clean", page); if (F_ISSET(txn, TXN_RUNNING) && ++txn->eviction_fails >= 100) { txn->eviction_fails = 0; ret = WT_DEADLOCK; WT_STAT_INCR( S2C(session)->stats, txn_fail_cache); } /* * If there aren't multiple cursors active, there * are no consistency issues: try to bump our snapshot. */ if (session->ncursors <= 1) { __wt_txn_read_last(session); __wt_txn_read_first(session); } switch (page->type) { case WT_PAGE_COL_FIX: case WT_PAGE_COL_VAR: __wt_col_leaf_obsolete(session, page); break; case WT_PAGE_ROW_LEAF: __wt_row_leaf_obsolete(session, page); break; } } WT_RET(ret); txn->eviction_fails = 0; } /* * Repeat the eviction tests. * * Fail if the top-level page should be merged into its parent, and it's * not the root page. * * Fail if a page in the top-level page's subtree can't be merged into * its parent. */ if (top) { /* * We never get a top-level split-merge page to evict, they are * ignored by the eviction thread. Check out of sheer paranoia. */ if (mod != NULL) { if (F_ISSET(mod, WT_PM_REC_SPLIT_MERGE)) return (EBUSY); if (F_ISSET(mod, WT_PM_REC_EMPTY) && !WT_PAGE_IS_ROOT(page)) return (EBUSY); } } else if (mod == NULL || !F_ISSET(mod, WT_PM_REC_EMPTY | WT_PM_REC_SPLIT | WT_PM_REC_SPLIT_MERGE)) return (EBUSY); return (0); }
/* * __wt_page_in_func -- * Acquire a hazard pointer to a page; if the page is not in-memory, * read it from the disk and build an in-memory version. */ int __wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags #ifdef HAVE_DIAGNOSTIC , const char *file, int line #endif ) { WT_DECL_RET; WT_PAGE *page; int busy, force_attempts, oldgen; for (force_attempts = oldgen = 0;;) { switch (ref->state) { case WT_REF_DISK: case WT_REF_DELETED: if (LF_ISSET(WT_READ_CACHE)) return (WT_NOTFOUND); /* * The page isn't in memory, attempt to read it. * Make sure there is space in the cache. */ WT_RET(__wt_cache_full_check(session)); WT_RET(__wt_cache_read(session, ref)); oldgen = LF_ISSET(WT_READ_WONT_NEED) || F_ISSET(session, WT_SESSION_NO_CACHE); continue; case WT_REF_READING: if (LF_ISSET(WT_READ_CACHE)) return (WT_NOTFOUND); /* FALLTHROUGH */ case WT_REF_LOCKED: if (LF_ISSET(WT_READ_NO_WAIT)) return (WT_NOTFOUND); /* The page is busy -- wait. */ break; case WT_REF_SPLIT: return (WT_RESTART); case WT_REF_MEM: /* * The page is in memory: get a hazard pointer, update * the page's LRU and return. The expected reason we * can't get a hazard pointer is because the page is * being evicted; yield and try again. */ #ifdef HAVE_DIAGNOSTIC WT_RET( __wt_hazard_set(session, ref, &busy, file, line)); #else WT_RET(__wt_hazard_set(session, ref, &busy)); #endif if (busy) break; page = ref->page; WT_ASSERT(session, page != NULL); /* Forcibly evict pages that are too big. */ if (!LF_ISSET(WT_READ_NO_EVICT) && force_attempts < 10 && __evict_force_check(session, page)) { ++force_attempts; WT_RET(__wt_page_release(session, ref, flags)); break; } /* Check if we need an autocommit transaction. */ if ((ret = __wt_txn_autocommit_check(session)) != 0) { WT_TRET(__wt_hazard_clear(session, page)); return (ret); } /* * If we read the page and we are configured to not * trash the cache, set the oldest read generation so * the page is forcibly evicted as soon as possible. * * Otherwise, update the page's read generation. */ if (oldgen && page->read_gen == WT_READGEN_NOTSET) __wt_page_evict_soon(page); else if (!LF_ISSET(WT_READ_NO_GEN) && page->read_gen < __wt_cache_read_gen(session)) page->read_gen = __wt_cache_read_gen_set(session); return (0); WT_ILLEGAL_VALUE(session); } /* We failed to get the page -- yield before retrying. */ __wt_yield(); } }
/* * v_cmd -- * Get a vi command. */ static gcret_t v_cmd(SCR *sp, VICMD *dp, VICMD *vp, VICMD *ismotion, int *comcountp, int *mappedp) /* Previous key if getting motion component. */ { enum { COMMANDMODE, ISPARTIAL, NOTPARTIAL } cpart; ARG_CHAR_T key; VIKEYS const *kp; gcret_t gcret; u_int flags; const char *s; /* * Get an event command or a key. Event commands are simple, and * don't have any additional information. */ cpart = ismotion == NULL ? COMMANDMODE : ISPARTIAL; gcret = v_key(sp, vp, 1, EC_MAPCOMMAND); if (gcret != GC_OK) { if (gcret != GC_EVENT) return (gcret); if (v_event(sp, vp)) return (GC_ERR); if (ismotion != NULL && !F_ISSET(vp->kp, V_MOVE)) v_event_err(sp, &vp->ev); return (GC_EVENT); } /* * Keys are not simple. (Although vi's command structure less complex * than ex (and don't think I'm not grateful!) The command syntax is: * * [count] [buffer] [count] key [[motion] | [buffer] [character]] * * and there are, of course, several special cases. The motion value * is itself a vi command, with the syntax: * * [count] key [character] * * <escape> cancels partial commands, i.e. a command where at least * one non-numeric character has been entered. Otherwise, it beeps * the terminal. * * !!! * POSIX 1003.2-1992 explicitly disallows cancelling commands where * all that's been entered is a number, requiring that the terminal * be alerted. */ if (vp->ev.e_value == K_ESCAPE) goto esc; /* * Commands that are mapped are treated differently (e.g., they * don't set the dot command. Pass that information back. */ if (FL_ISSET(vp->ev.e_flags, CH_MAPPED)) *mappedp = 1; key = vp->ev.e_c; if (ismotion == NULL) cpart = NOTPARTIAL; /* Pick up an optional buffer. */ if (key == '"') { cpart = ISPARTIAL; if (ismotion != NULL) { v_emsg(sp, NULL, VIM_COMBUF); return (GC_ERR); } KEY(vp->buffer, 0); F_SET(vp, VC_BUFFER); KEY(key, EC_MAPCOMMAND); } /* * Pick up an optional count, where a leading 0 isn't a count, it's * a command. When a count is specified, the dot command behaves * differently, pass the information back. */ if (ISDIGIT(key) && key != '0') { if (v_count(sp, vp, key, &vp->count)) return (GC_ERR); F_SET(vp, VC_C1SET); *comcountp = 1; KEY(key, EC_MAPCOMMAND); } else *comcountp = 0; /* Pick up optional buffer. */ if (key == '"') { cpart = ISPARTIAL; if (F_ISSET(vp, VC_BUFFER)) { msgq(sp, M_ERR, "234|Only one buffer may be specified"); return (GC_ERR); } if (ismotion != NULL) { v_emsg(sp, NULL, VIM_COMBUF); return (GC_ERR); } KEY(vp->buffer, 0); F_SET(vp, VC_BUFFER); KEY(key, EC_MAPCOMMAND); } /* Check for an OOB command key. */ cpart = ISPARTIAL; if (key > MAXVIKEY) { v_emsg(sp, (const char *)KEY_NAME(sp, key), VIM_NOCOM); return (GC_ERR); } kp = &vikeys[vp->key = key]; /* * !!! * Historically, D accepted and then ignored a count. Match it. */ if (vp->key == 'D' && F_ISSET(vp, VC_C1SET)) { *comcountp = 0; vp->count = 0; F_CLR(vp, VC_C1SET); } /* * There are several commands that we implement as aliases, both * to match historic practice and to ensure consistency. Check * for command aliases. */ if (kp->func == NULL && (kp = v_alias(sp, vp, kp)) == NULL) return (GC_ERR); /* The tildeop option makes the ~ command take a motion. */ if (key == '~' && O_ISSET(sp, O_TILDEOP)) kp = &tmotion; vp->kp = kp; /* * Find the command. The only legal command with no underlying * function is dot. It's historic practice that <escape> doesn't * just erase the preceding number, it beeps the terminal as well. * It's a common problem, so just beep the terminal unless verbose * was set. */ if (kp->func == NULL) { if (key != '.') { v_emsg(sp, (const char *)KEY_NAME(sp, key), vp->ev.e_value == K_ESCAPE ? VIM_NOCOM_B : VIM_NOCOM); return (GC_ERR); } /* If called for a motion command, stop now. */ if (dp == NULL) goto usage; /* * !!! * If a '.' is immediately entered after an undo command, we * replay the log instead of redoing the last command. This * is necessary because 'u' can't set the dot command -- see * vi/v_undo.c:v_undo for details. */ if (VIP(sp)->u_ccnt == sp->ccnt) { vp->kp = &vikeys['u']; F_SET(vp, VC_ISDOT); return (GC_OK); } /* Otherwise, a repeatable command must have been executed. */ if (!F_ISSET(dp, VC_ISDOT)) { msgq(sp, M_ERR, "208|No command to repeat"); return (GC_ERR); } /* Set new count/buffer, if any, and return. */ if (F_ISSET(vp, VC_C1SET)) { F_SET(dp, VC_C1SET); dp->count = vp->count; } if (F_ISSET(vp, VC_BUFFER)) dp->buffer = vp->buffer; *vp = *dp; return (GC_OK); } /* Set the flags based on the command flags. */ flags = kp->flags; /* Check for illegal count. */ if (F_ISSET(vp, VC_C1SET) && !LF_ISSET(V_CNT)) goto usage; /* Illegal motion command. */ if (ismotion == NULL) { /* Illegal buffer. */ if (!LF_ISSET(V_OBUF) && F_ISSET(vp, VC_BUFFER)) goto usage; /* Required buffer. */ if (LF_ISSET(V_RBUF)) { KEY(vp->buffer, 0); F_SET(vp, VC_BUFFER); } } /* * Special case: '[', ']' and 'Z' commands. Doesn't the fact that * the *single* characters don't mean anything but the *doubled* * characters do, just frost your shorts? */ if (vp->key == '[' || vp->key == ']' || vp->key == 'Z') { /* * Historically, half entered [[, ]] or Z commands weren't * cancelled by <escape>, the terminal was beeped instead. * POSIX.2-1992 probably didn't notice, and requires that * they be cancelled instead of beeping. Seems fine to me. * * Don't set the EC_MAPCOMMAND flag, apparently ] is a popular * vi meta-character, and we don't want the user to wait while * we time out a possible mapping. This *appears* to match * historic vi practice, but with mapping characters, You Just * Never Know. */ KEY(key, 0); if (vp->key != key) { usage: if (ismotion == NULL) s = kp->usage; else if (ismotion->key == '~' && O_ISSET(sp, O_TILDEOP)) s = tmotion.usage; else s = vikeys[ismotion->key].usage; v_emsg(sp, s, VIM_USAGE); return (GC_ERR); } } /* Special case: 'z' command. */ if (vp->key == 'z') { KEY(vp->character, 0); if (ISDIGIT(vp->character)) { if (v_count(sp, vp, vp->character, &vp->count2)) return (GC_ERR); F_SET(vp, VC_C2SET); KEY(vp->character, 0); } } /* * Commands that have motion components can be doubled to imply the * current line. */ if (ismotion != NULL && ismotion->key != key && !LF_ISSET(V_MOVE)) { msgq(sp, M_ERR, "210|%s may not be used as a motion command", KEY_NAME(sp, key)); return (GC_ERR); } /* Pick up required trailing character. */ if (LF_ISSET(V_CHAR)) KEY(vp->character, 0); /* Get any associated cursor word. */ if (F_ISSET(kp, V_KEYW) && v_curword(sp)) return (GC_ERR); return (GC_OK); esc: switch (cpart) { case COMMANDMODE: msgq(sp, M_BERR, "211|Already in command mode"); return (GC_ERR_NOFLUSH); case ISPARTIAL: break; case NOTPARTIAL: (void)sp->gp->scr_bell(sp); break; } return (GC_ERR); }
/* * vi -- * Main vi command loop. * * PUBLIC: int vi __P((SCR **)); */ int vi(SCR **spp) { GS *gp; WIN *wp; MARK abst; SCR *next, *sp; VICMD cmd, *vp; VI_PRIVATE *vip; int comcount, mapped, rval; /* Get the first screen. */ sp = *spp; wp = sp->wp; gp = sp->gp; /* Initialize the command structure. */ vp = &cmd; memset(vp, 0, sizeof(VICMD)); /* Reset strange attraction. */ F_SET(vp, VM_RCM_SET); /* Initialize the vi screen. */ if (v_init(sp)) return (1); /* Set the focus. */ (void)sp->gp->scr_rename(sp, sp->frp->name, 1); for (vip = VIP(sp), rval = 0;;) { /* Resolve messages. */ if (!MAPPED_KEYS_WAITING(sp) && vs_resolve(sp, NULL, 0)) goto ret; /* * If not skipping a refresh, return to command mode and * refresh the screen. */ if (F_ISSET(vip, VIP_S_REFRESH)) F_CLR(vip, VIP_S_REFRESH); else { sp->showmode = SM_COMMAND; if (vs_refresh(sp, 0)) goto ret; } /* Set the new favorite position. */ if (F_ISSET(vp, VM_RCM_SET | VM_RCM_SETFNB | VM_RCM_SETNNB)) { F_CLR(vip, VIP_RCM_LAST); (void)vs_column(sp, &sp->rcm); } /* * If not currently in a map, log the cursor position, * and set a flag so that this command can become the * DOT command. */ if (MAPPED_KEYS_WAITING(sp)) mapped = 1; else { if (log_cursor(sp)) goto err; mapped = 0; } /* * There may be an ex command waiting, and we returned here * only because we exited a screen or file. In this case, * we simply go back into the ex parser. */ if (EXCMD_RUNNING(wp)) { vp->kp = &vikeys[':']; goto ex_continue; } /* Refresh the command structure. */ memset(vp, 0, sizeof(VICMD)); /* * We get a command, which may or may not have an associated * motion. If it does, we get it too, calling its underlying * function to get the resulting mark. We then call the * command setting the cursor to the resulting mark. * * !!! * Vi historically flushed mapped characters on error, but * entering extra <escape> characters at the beginning of * a map wasn't considered an error -- in fact, users would * put leading <escape> characters in maps to clean up vi * state before the map was interpreted. Beauty! */ switch (v_cmd(sp, DOT, vp, NULL, &comcount, &mapped)) { case GC_ERR: goto err; case GC_ERR_NOFLUSH: goto gc_err_noflush; case GC_FATAL: goto ret; case GC_INTERRUPT: goto intr; case GC_EVENT: case GC_OK: break; } /* Check for security setting. */ if (F_ISSET(vp->kp, V_SECURE) && O_ISSET(sp, O_SECURE)) { ex_emsg(sp, (const char *)KEY_NAME(sp, vp->key), EXM_SECURE); goto err; } /* * Historical practice: if a dot command gets a new count, * any motion component goes away, i.e. "d3w2." deletes a * total of 5 words. */ if (F_ISSET(vp, VC_ISDOT) && comcount) DOTMOTION->count = 1; /* Copy the key flags into the local structure. */ F_SET(vp, vp->kp->flags); /* Prepare to set the previous context. */ if (F_ISSET(vp, V_ABS | V_ABS_C | V_ABS_L)) { abst.lno = sp->lno; abst.cno = sp->cno; } /* * Set the three cursor locations to the current cursor. The * underlying routines don't bother if the cursor doesn't move. * This also handles line commands (e.g. Y) defaulting to the * current line. */ vp->m_start.lno = vp->m_stop.lno = vp->m_final.lno = sp->lno; vp->m_start.cno = vp->m_stop.cno = vp->m_final.cno = sp->cno; /* * Do any required motion; v_motion sets the from MARK and the * line mode flag, as well as the VM_RCM flags. */ if (F_ISSET(vp, V_MOTION) && v_motion(sp, DOTMOTION, vp, &mapped)) { if (INTERRUPTED(sp)) goto intr; goto err; } /* * If a count is set and the command is line oriented, set the * to MARK here relative to the cursor/from MARK. This is for * commands that take both counts and motions, i.e. "4yy" and * "y%". As there's no way the command can know which the user * did, we have to do it here. (There are commands that are * line oriented and that take counts ("#G", "#H"), for which * this calculation is either completely meaningless or wrong. * Each command must validate the value for itself. */ if (F_ISSET(vp, VC_C1SET) && F_ISSET(vp, VM_LMODE)) vp->m_stop.lno += vp->count - 1; /* Increment the command count. */ ++sp->ccnt; #if defined(DEBUG) && defined(COMLOG) v_comlog(sp, vp); #endif /* Call the function. */ ex_continue: if (vp->kp->func(sp, vp)) goto err; #ifdef DEBUG /* Make sure no function left the temporary space locked. */ if (F_ISSET(wp, W_TMP_INUSE)) { F_CLR(wp, W_TMP_INUSE); msgq(sp, M_ERR, "232|vi: temporary buffer not released"); } #endif /* * If we're exiting this screen, move to the next one, or, if * there aren't any more, return to the main editor loop. The * ordering is careful, don't discard the contents of sp until * the end. */ if (F_ISSET(sp, SC_EXIT | SC_EXIT_FORCE)) { if (file_end(sp, NULL, F_ISSET(sp, SC_EXIT_FORCE))) goto ret; if (vs_discard(sp, &next)) goto ret; if (next == NULL && vs_swap(sp, &next, NULL)) goto ret; *spp = next; if (screen_end(sp)) goto ret; if (next == NULL) break; /* Switch screens, change focus. */ sp = next; vip = VIP(sp); (void)sp->gp->scr_rename(sp, sp->frp->name, 1); /* Don't trust the cursor. */ F_SET(vip, VIP_CUR_INVALID); continue; } /* * Set the dot command structure. * * !!! * Historically, commands which used mapped keys did not * set the dot command, with the exception of the text * input commands. */ if (F_ISSET(vp, V_DOT) && !mapped) { *DOT = cmd; F_SET(DOT, VC_ISDOT); /* * If a count was supplied for both the command and * its motion, the count was used only for the motion. * Turn the count back on for the dot structure. */ if (F_ISSET(vp, VC_C1RESET)) F_SET(DOT, VC_C1SET); /* VM flags aren't retained. */ F_CLR(DOT, VM_COMMASK | VM_RCM_MASK); } /* * Some vi row movements are "attracted" to the last position * set, i.e. the VM_RCM commands are moths to the VM_RCM_SET * commands' candle. If the movement is to the EOL the vi * command handles it. If it's to the beginning, we handle it * here. * * Note, some commands (e.g. _, ^) don't set the VM_RCM_SETFNB * flag, but do the work themselves. The reason is that they * have to modify the column in case they're being used as a * motion component. Other similar commands (e.g. +, -) don't * have to modify the column because they are always line mode * operations when used as motions, so the column number isn't * of any interest. * * Does this totally violate the screen and editor layering? * You betcha. As they say, if you think you understand it, * you don't. */ switch (F_ISSET(vp, VM_RCM_MASK)) { case 0: case VM_RCM_SET: break; case VM_RCM: vp->m_final.cno = vs_rcm(sp, vp->m_final.lno, F_ISSET(vip, VIP_RCM_LAST)); break; case VM_RCM_SETLAST: F_SET(vip, VIP_RCM_LAST); break; case VM_RCM_SETFNB: vp->m_final.cno = 0; /* FALLTHROUGH */ case VM_RCM_SETNNB: if (nonblank(sp, vp->m_final.lno, &vp->m_final.cno)) goto err; break; default: abort(); } /* Update the cursor. */ sp->lno = vp->m_final.lno; sp->cno = vp->m_final.cno; /* * Set the absolute mark -- set even if a tags or similar * command, since the tag may be moving to the same file. */ if ((F_ISSET(vp, V_ABS) || (F_ISSET(vp, V_ABS_L) && sp->lno != abst.lno) || (F_ISSET(vp, V_ABS_C) && (sp->lno != abst.lno || sp->cno != abst.cno))) && mark_set(sp, ABSMARK1, &abst, 1)) goto err; if (0) { err: if (v_event_flush(sp, CH_MAPPED)) msgq(sp, M_BERR, "110|Vi command failed: mapped keys discarded"); } /* * Check and clear interrupts. There's an obvious race, but * it's not worth fixing. */ gc_err_noflush: if (INTERRUPTED(sp)) { intr: CLR_INTERRUPT(sp); if (v_event_flush(sp, CH_MAPPED)) msgq(sp, M_ERR, "231|Interrupted: mapped keys discarded"); else msgq(sp, M_ERR, "236|Interrupted"); } /* If the last command switched screens, update. */ if (F_ISSET(sp, SC_SSWITCH)) { F_CLR(sp, SC_SSWITCH); /* * If the current screen is still displayed, it will * need a new status line. */ F_SET(sp, SC_STATUS); /* Switch screens, change focus. */ sp = sp->nextdisp; vip = VIP(sp); (void)sp->gp->scr_rename(sp, sp->frp->name, 1); /* Don't trust the cursor. */ F_SET(vip, VIP_CUR_INVALID); /* Refresh so we can display messages. */ if (vs_refresh(sp, 1)) return (1); } /* If the last command switched files, change focus. */ if (F_ISSET(sp, SC_FSWITCH)) { F_CLR(sp, SC_FSWITCH); (void)sp->gp->scr_rename(sp, sp->frp->name, 1); } /* If leaving vi, return to the main editor loop. */ if (F_ISSET(gp, G_SRESTART) || F_ISSET(sp, SC_EX)) { *spp = sp; v_dtoh(sp); gp->scr_discard(sp, NULL); break; } } if (0) ret: rval = 1; return (rval); }
DB * __rec_open(const char *fname, int flags, mode_t mode, const RECNOINFO *openinfo, int dflags) { BTREE *t; BTREEINFO btopeninfo; DB *dbp; PAGE *h; struct stat sb; int rfd = -1; /* pacify gcc */ int sverrno; dbp = NULL; /* Open the user's file -- if this fails, we're done. */ if (fname != NULL) { if ((rfd = open(fname, flags | O_CLOEXEC, mode)) == -1) return NULL; } /* Create a btree in memory (backed by disk). */ if (openinfo) { if (openinfo->flags & ~(R_FIXEDLEN | R_NOKEY | R_SNAPSHOT)) goto einval; btopeninfo.flags = 0; btopeninfo.cachesize = openinfo->cachesize; btopeninfo.maxkeypage = 0; btopeninfo.minkeypage = 0; btopeninfo.psize = openinfo->psize; btopeninfo.compare = NULL; btopeninfo.prefix = NULL; btopeninfo.lorder = openinfo->lorder; dbp = __bt_open(openinfo->bfname, O_RDWR, S_IRUSR | S_IWUSR, &btopeninfo, dflags); } else dbp = __bt_open(NULL, O_RDWR, S_IRUSR | S_IWUSR, NULL, dflags); if (dbp == NULL) goto err; /* * Some fields in the tree structure are recno specific. Fill them * in and make the btree structure look like a recno structure. We * don't change the bt_ovflsize value, it's close enough and slightly * bigger. */ t = dbp->internal; if (openinfo) { if (openinfo->flags & R_FIXEDLEN) { F_SET(t, R_FIXLEN); t->bt_reclen = openinfo->reclen; if (t->bt_reclen == 0) goto einval; } t->bt_bval = openinfo->bval; } else t->bt_bval = '\n'; F_SET(t, R_RECNO); if (fname == NULL) F_SET(t, R_EOF | R_INMEM); else t->bt_rfd = rfd; if (fname != NULL) { /* * In 4.4BSD, stat(2) returns true for ISSOCK on pipes. * Unfortunately, that's not portable, so we use lseek * and check the errno values. */ errno = 0; if (lseek(rfd, (off_t)0, SEEK_CUR) == -1 && errno == ESPIPE) { switch (flags & O_ACCMODE) { case O_RDONLY: F_SET(t, R_RDONLY); break; default: goto einval; } slow: if ((t->bt_rfp = fdopen(rfd, "r")) == NULL) goto err; F_SET(t, R_CLOSEFP); t->bt_irec = F_ISSET(t, R_FIXLEN) ? __rec_fpipe : __rec_vpipe; } else { switch (flags & O_ACCMODE) { case O_RDONLY: F_SET(t, R_RDONLY); break; case O_RDWR: break; default: goto einval; } if (fstat(rfd, &sb)) goto err; /* * Kluge -- we'd like to test to see if the file is too * big to mmap. Since, we don't know what size or type * off_t's or size_t's are, what the largest unsigned * integral type is, or what random insanity the local * C compiler will perpetrate, doing the comparison in * a portable way is flatly impossible. Hope that mmap * fails if the file is too large. */ if (sb.st_size == 0) F_SET(t, R_EOF); else { #ifdef MMAP_NOT_AVAILABLE /* * XXX * Mmap doesn't work correctly on many current * systems. In particular, it can fail subtly, * with cache coherency problems. Don't use it * for now. */ t->bt_msize = sb.st_size; if ((t->bt_smap = mmap(NULL, t->bt_msize, PROT_READ, MAP_FILE | MAP_PRIVATE, rfd, (off_t)0)) == (caddr_t)-1) goto slow; t->bt_cmap = t->bt_smap; t->bt_emap = t->bt_smap + sb.st_size; t->bt_irec = F_ISSET(t, R_FIXLEN) ? __rec_fmap : __rec_vmap; F_SET(t, R_MEMMAPPED); #else goto slow; #endif } } } /* Use the recno routines. */ dbp->close = __rec_close; dbp->del = __rec_delete; dbp->fd = __rec_fd; dbp->get = __rec_get; dbp->put = __rec_put; dbp->seq = __rec_seq; dbp->sync = __rec_sync; /* If the root page was created, reset the flags. */ if ((h = mpool_get(t->bt_mp, P_ROOT, 0)) == NULL) goto err; if ((h->flags & P_TYPE) == P_BLEAF) { F_CLR(h, P_TYPE); F_SET(h, P_RLEAF); mpool_put(t->bt_mp, h, MPOOL_DIRTY); } else mpool_put(t->bt_mp, h, 0); if (openinfo && openinfo->flags & R_SNAPSHOT && !F_ISSET(t, R_EOF | R_INMEM) && t->bt_irec(t, MAX_REC_NUMBER) == RET_ERROR) goto err; return (dbp); einval: errno = EINVAL; err: sverrno = errno; if (dbp != NULL) (void)__bt_close(dbp); if (fname != NULL) (void)close(rfd); errno = sverrno; return (NULL); }
/* * __wt_page_out -- * Discard an in-memory page, freeing all memory associated with it. */ void __wt_page_out(WT_SESSION_IMPL *session, WT_PAGE **pagep) { WT_PAGE *page; WT_PAGE_HEADER *dsk; WT_PAGE_MODIFY *mod; /* * Kill our caller's reference, do our best to catch races. */ page = *pagep; *pagep = NULL; if (F_ISSET(session->dhandle, WT_DHANDLE_DEAD)) __wt_page_modify_clear(session, page); /* * We should never discard: * - a dirty page, * - a page queued for eviction, or * - a locked page. */ WT_ASSERT(session, !__wt_page_is_modified(page)); WT_ASSERT(session, !F_ISSET_ATOMIC(page, WT_PAGE_EVICT_LRU)); WT_ASSERT(session, !__wt_rwlock_islocked(session, &page->page_lock)); /* * If a root page split, there may be one or more pages linked from the * page; walk the list, discarding pages. */ switch (page->type) { case WT_PAGE_COL_INT: case WT_PAGE_ROW_INT: mod = page->modify; if (mod != NULL && mod->mod_root_split != NULL) __wt_page_out(session, &mod->mod_root_split); break; } /* Update the cache's information. */ __wt_cache_page_evict(session, page); dsk = (WT_PAGE_HEADER *)page->dsk; if (F_ISSET_ATOMIC(page, WT_PAGE_DISK_ALLOC)) __wt_cache_page_image_decr(session, dsk->mem_size); /* Discard any mapped image. */ if (F_ISSET_ATOMIC(page, WT_PAGE_DISK_MAPPED)) (void)S2BT(session)->bm->map_discard( S2BT(session)->bm, session, dsk, (size_t)dsk->mem_size); /* * If discarding the page as part of process exit, the application may * configure to leak the memory rather than do the work. */ if (F_ISSET(S2C(session), WT_CONN_LEAK_MEMORY)) return; /* Free the page modification information. */ if (page->modify != NULL) __free_page_modify(session, page); switch (page->type) { case WT_PAGE_COL_FIX: break; case WT_PAGE_COL_INT: case WT_PAGE_ROW_INT: __free_page_int(session, page); break; case WT_PAGE_COL_VAR: __free_page_col_var(session, page); break; case WT_PAGE_ROW_LEAF: __free_page_row_leaf(session, page); break; } /* Discard any allocated disk image. */ if (F_ISSET_ATOMIC(page, WT_PAGE_DISK_ALLOC)) __wt_overwrite_and_free_len(session, dsk, dsk->mem_size); __wt_overwrite_and_free(session, page); }
/* * __cursor_valid -- * Return if the cursor references an valid key/value pair. */ static inline bool __cursor_valid(WT_CURSOR_BTREE *cbt, WT_UPDATE **updp) { WT_BTREE *btree; WT_CELL *cell; WT_COL *cip; WT_PAGE *page; WT_SESSION_IMPL *session; WT_UPDATE *upd; btree = cbt->btree; page = cbt->ref->page; session = (WT_SESSION_IMPL *)cbt->iface.session; if (updp != NULL) *updp = NULL; /* * We may be pointing to an insert object, and we may have a page with * existing entries. Insert objects always have associated update * objects (the value). Any update object may be deleted, or invisible * to us. In the case of an on-page entry, there is by definition a * value that is visible to us, the original page cell. * * If we find a visible update structure, return our caller a reference * to it because we don't want to repeatedly search for the update, it * might suddenly become invisible (imagine a read-uncommitted session * with another session's aborted insert), and we don't want to handle * that potential error every time we look at the value. * * Unfortunately, the objects we might have and their relationships are * different for the underlying page types. * * In the case of row-store, an insert object implies ignoring any page * objects, no insert object can have the same key as an on-page object. * For row-store: * if there's an insert object: * if there's a visible update: * exact match * else * no exact match * else * use the on-page object (which may have an associated * update object that may or may not be visible to us). * * Column-store is more complicated because an insert object can have * the same key as an on-page object: updates to column-store rows * are insert/object pairs, and an invisible update isn't the end as * there may be an on-page object that is visible. This changes the * logic to: * if there's an insert object: * if there's a visible update: * exact match * else if the on-page object's key matches the insert key * use the on-page object * else * use the on-page object * * First, check for an insert object with a visible update (a visible * update that's been deleted is not a valid key/value pair). */ if (cbt->ins != NULL && (upd = __wt_txn_read(session, cbt->ins->upd)) != NULL) { if (WT_UPDATE_DELETED_ISSET(upd)) return (false); if (updp != NULL) *updp = upd; return (true); } /* * If we don't have an insert object, or in the case of column-store, * there's an insert object but no update was visible to us and the key * on the page is the same as the insert object's key, and the slot as * set by the search function is valid, we can use the original page * information. */ switch (btree->type) { case BTREE_COL_FIX: /* * If search returned an insert object, there may or may not be * a matching on-page object, we have to check. Fixed-length * column-store pages don't have slots, but map one-to-one to * keys, check for retrieval past the end of the page. */ if (cbt->recno >= cbt->ref->ref_recno + page->pg_fix_entries) return (false); /* * An update would have appeared as an "insert" object; no * further checks to do. */ break; case BTREE_COL_VAR: /* The search function doesn't check for empty pages. */ if (page->pg_var_entries == 0) return (false); WT_ASSERT(session, cbt->slot < page->pg_var_entries); /* * Column-store updates are stored as "insert" objects. If * search returned an insert object we can't return, the * returned on-page object must be checked for a match. */ if (cbt->ins != NULL && !F_ISSET(cbt, WT_CBT_VAR_ONPAGE_MATCH)) return (false); /* * Although updates would have appeared as an "insert" objects, * variable-length column store deletes are written into the * backing store; check the cell for a record already deleted * when read. */ cip = &page->pg_var_d[cbt->slot]; if ((cell = WT_COL_PTR(page, cip)) == NULL || __wt_cell_type(cell) == WT_CELL_DEL) return (false); break; case BTREE_ROW: /* The search function doesn't check for empty pages. */ if (page->pg_row_entries == 0) return (false); WT_ASSERT(session, cbt->slot < page->pg_row_entries); /* * See above: for row-store, no insert object can have the same * key as an on-page object, we're done. */ if (cbt->ins != NULL) return (false); /* Check for an update. */ if (page->modify != NULL && page->modify->mod_row_update != NULL && (upd = __wt_txn_read(session, page->modify->mod_row_update[cbt->slot])) != NULL) { if (WT_UPDATE_DELETED_ISSET(upd)) return (false); if (updp != NULL) *updp = upd; } break; } return (true); }
/* * __wt_btcur_update -- * Update a record in the tree. */ int __wt_btcur_update(WT_CURSOR_BTREE *cbt) { WT_BTREE *btree; WT_CURSOR *cursor; WT_DECL_RET; WT_SESSION_IMPL *session; btree = cbt->btree; cursor = &cbt->iface; session = (WT_SESSION_IMPL *)cursor->session; WT_STAT_CONN_INCR(session, cursor_update); WT_STAT_DATA_INCR(session, cursor_update); WT_STAT_DATA_INCRV(session, cursor_update_bytes, cursor->value.size); if (btree->type == BTREE_ROW) WT_RET(__cursor_size_chk(session, &cursor->key)); WT_RET(__cursor_size_chk(session, &cursor->value)); /* * The tree is no longer empty: eviction should pay attention to it, * and it's no longer possible to bulk-load into it. */ if (btree->bulk_load_ok) { btree->bulk_load_ok = false; __wt_btree_evictable(session, true); } retry: WT_RET(__cursor_func_init(cbt, true)); switch (btree->type) { case BTREE_COL_FIX: case BTREE_COL_VAR: WT_ERR(__cursor_col_search(session, cbt, NULL)); /* * If not overwriting, fail if the key doesn't exist. If we * find an update for the key, check for conflicts. Update the * record if it exists. Creating a record past the end of the * tree in a fixed-length column-store implicitly fills the gap * with empty records. Update the record in that case, the * record exists. */ if (!F_ISSET(cursor, WT_CURSTD_OVERWRITE)) { WT_ERR(__curfile_update_check(cbt)); if ((cbt->compare != 0 || !__cursor_valid(cbt, NULL)) && !__cursor_fix_implicit(btree, cbt)) WT_ERR(WT_NOTFOUND); } ret = __cursor_col_modify(session, cbt, false); break; case BTREE_ROW: WT_ERR(__cursor_row_search(session, cbt, NULL, true)); /* * If not overwriting, check for conflicts and fail if the key * does not exist. */ if (!F_ISSET(cursor, WT_CURSTD_OVERWRITE)) { WT_ERR(__curfile_update_check(cbt)); if (cbt->compare != 0 || !__cursor_valid(cbt, NULL)) WT_ERR(WT_NOTFOUND); } ret = __cursor_row_modify(session, cbt, false); break; } err: if (ret == WT_RESTART) { WT_STAT_CONN_INCR(session, cursor_restart); WT_STAT_DATA_INCR(session, cursor_restart); goto retry; } /* * If successful, point the cursor at internal copies of the data. We * could shuffle memory in the cursor so the key/value pair are in local * buffer memory, but that's a data copy. We don't want to do another * search (and we might get a different update structure if we race). * To make this work, we add a field to the btree cursor to pass back a * pointer to the modify function's allocated update structure. */ if (ret == 0) WT_TRET(__wt_kv_return(session, cbt, cbt->modify_update)); if (ret != 0) WT_TRET(__cursor_reset(cbt)); return (ret); }
/* * __wt_btcur_remove -- * Remove a record from the tree. */ int __wt_btcur_remove(WT_CURSOR_BTREE *cbt) { WT_BTREE *btree; WT_CURSOR *cursor; WT_DECL_RET; WT_SESSION_IMPL *session; btree = cbt->btree; cursor = &cbt->iface; session = (WT_SESSION_IMPL *)cursor->session; WT_STAT_CONN_INCR(session, cursor_remove); WT_STAT_DATA_INCR(session, cursor_remove); WT_STAT_DATA_INCRV(session, cursor_remove_bytes, cursor->key.size); retry: WT_RET(__cursor_func_init(cbt, true)); switch (btree->type) { case BTREE_COL_FIX: case BTREE_COL_VAR: WT_ERR(__cursor_col_search(session, cbt, NULL)); /* * If we find a matching record, check whether an update would * conflict. Do this before checking if the update is visible * in __cursor_valid, or we can miss conflict. */ WT_ERR(__curfile_update_check(cbt)); /* Remove the record if it exists. */ if (cbt->compare != 0 || !__cursor_valid(cbt, NULL)) { if (!__cursor_fix_implicit(btree, cbt)) WT_ERR(WT_NOTFOUND); /* * Creating a record past the end of the tree in a * fixed-length column-store implicitly fills the * gap with empty records. Return success in that * case, the record was deleted successfully. * * Correct the btree cursor's location: the search * will have pointed us at the previous/next item, * and that's not correct. */ cbt->recno = cursor->recno; } else ret = __cursor_col_modify(session, cbt, true); break; case BTREE_ROW: /* Remove the record if it exists. */ WT_ERR(__cursor_row_search(session, cbt, NULL, false)); /* Check whether an update would conflict. */ WT_ERR(__curfile_update_check(cbt)); if (cbt->compare != 0 || !__cursor_valid(cbt, NULL)) WT_ERR(WT_NOTFOUND); ret = __cursor_row_modify(session, cbt, true); break; } err: if (ret == WT_RESTART) { WT_STAT_CONN_INCR(session, cursor_restart); WT_STAT_DATA_INCR(session, cursor_restart); goto retry; } /* * If the cursor is configured to overwrite and the record is not * found, that is exactly what we want. */ if (F_ISSET(cursor, WT_CURSTD_OVERWRITE) && ret == WT_NOTFOUND) ret = 0; if (ret != 0) WT_TRET(__cursor_reset(cbt)); return (ret); }
/* * __wt_verify -- * Verify a file. */ int __wt_verify(WT_SESSION_IMPL *session, const char *cfg[]) { WT_BM *bm; WT_BTREE *btree; WT_CKPT *ckptbase, *ckpt; WT_DECL_RET; WT_VSTUFF *vs, _vstuff; uint32_t root_addr_size; uint8_t root_addr[WT_BTREE_MAX_ADDR_COOKIE]; btree = S2BT(session); bm = btree->bm; ckptbase = NULL; WT_CLEAR(_vstuff); vs = &_vstuff; WT_ERR(__wt_scr_alloc(session, 0, &vs->max_key)); WT_ERR(__wt_scr_alloc(session, 0, &vs->max_addr)); WT_ERR(__wt_scr_alloc(session, 0, &vs->tmp1)); WT_ERR(__wt_scr_alloc(session, 0, &vs->tmp2)); /* Check configuration strings. */ WT_ERR(__verify_config(session, cfg, vs)); /* Get a list of the checkpoints for this file. */ WT_ERR( __wt_meta_ckptlist_get(session, btree->dhandle->name, &ckptbase)); /* Inform the underlying block manager we're verifying. */ WT_ERR(bm->verify_start(bm, session, ckptbase)); /* Loop through the file's checkpoints, verifying each one. */ WT_CKPT_FOREACH(ckptbase, ckpt) { WT_VERBOSE_ERR(session, verify, "%s: checkpoint %s", btree->dhandle->name, ckpt->name); #ifdef HAVE_DIAGNOSTIC if (vs->dump_address || vs->dump_blocks || vs->dump_pages) WT_ERR(__wt_msg(session, "%s: checkpoint %s", btree->dhandle->name, ckpt->name)); #endif /* Fake checkpoints require no work. */ if (F_ISSET(ckpt, WT_CKPT_FAKE)) continue; /* House-keeping between checkpoints. */ __verify_checkpoint_reset(vs); /* Load the checkpoint, ignore trees with no root page. */ WT_ERR(bm->checkpoint_load(bm, session, ckpt->raw.data, ckpt->raw.size, root_addr, &root_addr_size, 1)); if (root_addr_size != 0) { /* Verify then discard the checkpoint from the cache. */ if ((ret = __wt_btree_tree_open( session, root_addr, root_addr_size)) == 0) { ret = __verify_tree( session, btree->root_page, vs); WT_TRET(__wt_bt_cache_op( session, NULL, WT_SYNC_DISCARD_NOWRITE)); } } /* Unload the checkpoint. */ WT_TRET(bm->checkpoint_unload(bm, session)); WT_ERR(ret); }
/* * v_motion -- * * Get resulting motion mark. */ static int v_motion(SCR *sp, VICMD *dm, VICMD *vp, int *mappedp) { VICMD motion; gcret_t gcret; size_t len; u_long cnt; u_int flags; int tilde_reset, notused; /* * If '.' command, use the dot motion, else get the motion command. * Clear any line motion flags, the subsequent motion isn't always * the same, i.e. "/aaa" may or may not be a line motion. */ if (F_ISSET(vp, VC_ISDOT)) { motion = *dm; F_SET(&motion, VC_ISDOT); F_CLR(&motion, VM_COMMASK); gcret = GC_OK; } else { memset(&motion, 0, sizeof(VICMD)); gcret = v_cmd(sp, NULL, &motion, vp, ¬used, mappedp); if (gcret != GC_OK && gcret != GC_EVENT) return (1); } /* * A count may be provided both to the command and to the motion, in * which case the count is multiplicative. For example, "3y4y" is the * same as "12yy". This count is provided to the motion command and * not to the regular function. */ cnt = motion.count = F_ISSET(&motion, VC_C1SET) ? motion.count : 1; if (F_ISSET(vp, VC_C1SET)) { motion.count *= vp->count; F_SET(&motion, VC_C1SET); /* * Set flags to restore the original values of the command * structure so dot commands can change the count values, * e.g. "2dw" "3." deletes a total of five words. */ F_CLR(vp, VC_C1SET); F_SET(vp, VC_C1RESET); } /* * Some commands can be repeated to indicate the current line. In * this case, or if the command is a "line command", set the flags * appropriately. If not a doubled command, run the function to get * the resulting mark. */ if (gcret != GC_EVENT && vp->key == motion.key) { F_SET(vp, VM_LDOUBLE | VM_LMODE); /* Set the origin of the command. */ vp->m_start.lno = sp->lno; vp->m_start.cno = 0; /* * Set the end of the command. * * If the current line is missing, i.e. the file is empty, * historic vi permitted a "cc" or "!!" command to insert * text. */ vp->m_stop.lno = sp->lno + motion.count - 1; if (db_get(sp, vp->m_stop.lno, 0, NULL, &len)) { if (vp->m_stop.lno != 1 || (vp->key != 'c' && vp->key != '!')) { v_emsg(sp, NULL, VIM_EMPTY); return (1); } vp->m_stop.cno = 0; } else vp->m_stop.cno = len ? len - 1 : 0; } else { /* * Motion commands change the underlying movement (*snarl*). * For example, "l" is illegal at the end of a line, but "dl" * is not. Set flags so the function knows the situation. */ motion.rkp = vp->kp; /* * XXX * Use yank instead of creating a new motion command, it's a * lot easier for now. */ if (vp->kp == &tmotion) { tilde_reset = 1; vp->kp = &vikeys['y']; } else tilde_reset = 0; /* * Copy the key flags into the local structure, except for the * RCM flags -- the motion command will set the RCM flags in * the vp structure if necessary. This means that the motion * command is expected to determine where the cursor ends up! * However, we save off the current RCM mask and restore it if * it no RCM flags are set by the motion command, with a small * modification. * * We replace the VM_RCM_SET flag with the VM_RCM flag. This * is so that cursor movement doesn't set the relative position * unless the motion command explicitly specified it. This * appears to match historic practice, but I've never been able * to develop a hard-and-fast rule. */ flags = F_ISSET(vp, VM_RCM_MASK); if (LF_ISSET(VM_RCM_SET)) { LF_SET(VM_RCM); LF_CLR(VM_RCM_SET); } F_CLR(vp, VM_RCM_MASK); F_SET(&motion, motion.kp->flags & ~VM_RCM_MASK); /* * Set the three cursor locations to the current cursor. This * permits commands like 'j' and 'k', that are line oriented * motions and have special cursor suck semantics when they are * used as standalone commands, to ignore column positioning. */ motion.m_final.lno = motion.m_stop.lno = motion.m_start.lno = sp->lno; motion.m_final.cno = motion.m_stop.cno = motion.m_start.cno = sp->cno; /* Run the function. */ if ((motion.kp->func)(sp, &motion)) return (1); /* * If the current line is missing, i.e. the file is empty, * historic vi allowed "c<motion>" or "!<motion>" to insert * text. Otherwise fail -- most motion commands will have * already failed, but some, e.g. G, succeed in empty files. */ if (!db_exist(sp, vp->m_stop.lno)) { if (vp->m_stop.lno != 1 || (vp->key != 'c' && vp->key != '!')) { v_emsg(sp, NULL, VIM_EMPTY); return (1); } vp->m_stop.cno = 0; } /* * XXX * See above. */ if (tilde_reset) vp->kp = &tmotion; /* * Copy cut buffer, line mode and cursor position information * from the motion command structure, i.e. anything that the * motion command can set for us. The commands can flag the * movement as a line motion (see v_sentence) as well as set * the VM_RCM_* flags explicitly. */ F_SET(vp, F_ISSET(&motion, VM_COMMASK | VM_RCM_MASK)); /* * If the motion command set no relative motion flags, use * the (slightly) modified previous values. */ if (!F_ISSET(vp, VM_RCM_MASK)) F_SET(vp, flags); /* * Commands can change behaviors based on the motion command * used, for example, the ! command repeated the last bang * command if N or n was used as the motion. */ vp->rkp = motion.kp; /* * Motion commands can reset all of the cursor information. * If the motion is in the reverse direction, switch the * from and to MARK's so that it's in a forward direction. * Motions are from the from MARK to the to MARK (inclusive). */ if (motion.m_start.lno > motion.m_stop.lno || (motion.m_start.lno == motion.m_stop.lno && motion.m_start.cno > motion.m_stop.cno)) { vp->m_start = motion.m_stop; vp->m_stop = motion.m_start; } else { vp->m_start = motion.m_start; vp->m_stop = motion.m_stop; } vp->m_final = motion.m_final; } /* * If the command sets dot, save the motion structure. The motion * count was changed above and needs to be reset, that's why this * is done here, and not in the calling routine. */ if (F_ISSET(vp->kp, V_DOT)) { *dm = motion; dm->count = cnt; } return (0); }
/* * __wt_rec_evict -- * Reconciliation plus eviction. */ int __wt_rec_evict(WT_SESSION_IMPL *session, WT_PAGE *page, uint32_t flags) { WT_CONNECTION_IMPL *conn; WT_DECL_RET; int single; conn = S2C(session); WT_VERBOSE_RET(session, evict, "page %p (%s)", page, __wt_page_type_string(page->type)); WT_ASSERT(session, session->excl_next == 0); single = LF_ISSET(WT_REC_SINGLE) ? 1 : 0; /* * Get exclusive access to the page and review the page and its subtree * for conditions that would block our eviction of the page. If the * check fails (for example, we find a child page that can't be merged), * we're done. We have to make this check for clean pages, too: while * unlikely eviction would choose an internal page with children, it's * not disallowed anywhere. * * Note that page->ref may be NULL in some cases (e.g., for root pages * or during salvage). That's OK if WT_REC_SINGLE is set: we won't * check hazard references in that case. */ WT_ERR(__rec_review(session, page->ref, page, flags, 1)); /* Count evictions of internal pages during normal operation. */ if (!single && (page->type == WT_PAGE_COL_INT || page->type == WT_PAGE_ROW_INT)) WT_STAT_INCR(conn->stats, cache_evict_internal); /* Update the parent and discard the page. */ if (page->modify == NULL || !F_ISSET(page->modify, WT_PM_REC_MASK)) { WT_STAT_INCR(conn->stats, cache_evict_unmodified); WT_ASSERT(session, single || page->ref->state == WT_REF_LOCKED); if (WT_PAGE_IS_ROOT(page)) __rec_root_update(session); else __rec_page_clean_update(session, page); /* Discard the page. */ __rec_discard_page(session, page, single); } else { WT_STAT_INCR(conn->stats, cache_evict_modified); if (WT_PAGE_IS_ROOT(page)) __rec_root_update(session); else WT_ERR(__rec_page_dirty_update(session, page)); /* Discard the tree rooted in this page. */ __rec_discard_tree(session, page, single); } if (0) { err: /* * If unable to evict this page, release exclusive reference(s) * we've acquired. */ __rec_excl_clear(session); } session->excl_next = 0; return (ret); }
/* * __wt_curds_create -- * Initialize a data-source cursor. */ int __wt_curds_create(WT_SESSION_IMPL *session, const char *uri, const char *cfg[], WT_DATA_SOURCE *dsrc, WT_CURSOR **cursorp) { WT_CURSOR_STATIC_INIT(iface, NULL, /* get-key */ NULL, /* get-value */ NULL, /* set-key */ NULL, /* set-value */ NULL, /* compare */ __curds_next, /* next */ __curds_prev, /* prev */ __curds_reset, /* reset */ __curds_search, /* search */ __curds_search_near, /* search-near */ __curds_insert, /* insert */ __curds_update, /* update */ __curds_remove, /* remove */ __curds_close); /* close */ WT_CONFIG_ITEM cval; WT_CURSOR *cursor, *dsc; WT_DECL_RET; const char *metaconf; metaconf = NULL; /* Open the WiredTiger cursor. */ WT_RET(__wt_calloc_def(session, 1, &cursor)); *cursor = iface; cursor->session = (WT_SESSION *)session; /* * XXX * We'll need the object's key and value formats. */ WT_ERR(__wt_metadata_search(session, uri, &metaconf)); WT_ERR(__wt_config_getones(session, metaconf, "key_format", &cval)); WT_ERR(__wt_strndup(session, cval.str, cval.len, &cursor->key_format)); WT_ERR(__wt_config_getones(session, metaconf, "value_format", &cval)); WT_ERR( __wt_strndup(session, cval.str, cval.len, &cursor->value_format)); WT_ERR(__wt_cursor_init(cursor, uri, NULL, cfg, cursorp)); WT_ERR(dsrc->open_cursor(dsrc, &session->iface, uri, (WT_CONFIG_ARG *)cfg, &dsc)); dsc->session = (WT_SESSION *)session; memset(&dsc->q, 0, sizeof(dsc->q)); dsc->recno = 0; memset(dsc->raw_recno_buf, 0, sizeof(dsc->raw_recno_buf)); memset(&dsc->key, 0, sizeof(dsc->key)); memset(&dsc->value, 0, sizeof(dsc->value)); memset(&dsc->saved_err, 0, sizeof(dsc->saved_err)); dsc->data_source = NULL; memset(&dsc->flags, 0, sizeof(dsc->flags)); /* Reference the underlying application cursor. */ cursor->data_source = dsc; if (0) { err: if (F_ISSET(cursor, WT_CURSTD_OPEN)) WT_TRET(cursor->close(cursor)); else __wt_free(session, cursor); } __wt_free(session, metaconf); return (ret); }
/* * __wt_page_inmem -- * Build in-memory page information. */ int __wt_page_inmem(WT_SESSION_IMPL *session, WT_REF *ref, const void *image, uint32_t flags, WT_PAGE **pagep) { WT_DECL_RET; WT_PAGE *page; const WT_PAGE_HEADER *dsk; uint32_t alloc_entries; size_t size; *pagep = NULL; dsk = image; alloc_entries = 0; /* * Figure out how many underlying objects the page references so we can * allocate them along with the page. */ switch (dsk->type) { case WT_PAGE_COL_FIX: case WT_PAGE_COL_INT: case WT_PAGE_COL_VAR: /* * Column-store leaf page entries map one-to-one to the number * of physical entries on the page (each physical entry is a * value item). * * Column-store internal page entries map one-to-one to the * number of physical entries on the page (each entry is a * location cookie). */ alloc_entries = dsk->u.entries; break; case WT_PAGE_ROW_INT: /* * Row-store internal page entries map one-to-two to the number * of physical entries on the page (each entry is a key and * location cookie pair). */ alloc_entries = dsk->u.entries / 2; break; case WT_PAGE_ROW_LEAF: /* * If the "no empty values" flag is set, row-store leaf page * entries map one-to-one to the number of physical entries * on the page (each physical entry is a key or value item). * If that flag is not set, there are more keys than values, * we have to walk the page to figure it out. */ if (F_ISSET(dsk, WT_PAGE_EMPTY_V_ALL)) alloc_entries = dsk->u.entries; else if (F_ISSET(dsk, WT_PAGE_EMPTY_V_NONE)) alloc_entries = dsk->u.entries / 2; else WT_RET(__inmem_row_leaf_entries( session, dsk, &alloc_entries)); break; WT_ILLEGAL_VALUE(session); } /* Allocate and initialize a new WT_PAGE. */ WT_RET(__wt_page_alloc( session, dsk->type, dsk->recno, alloc_entries, 1, &page)); page->dsk = dsk; F_SET_ATOMIC(page, flags); /* * Track the memory allocated to build this page so we can update the * cache statistics in a single call. */ size = LF_ISSET(WT_PAGE_DISK_ALLOC) ? dsk->mem_size : 0; switch (page->type) { case WT_PAGE_COL_FIX: __inmem_col_fix(session, page); break; case WT_PAGE_COL_INT: __inmem_col_int(session, page); break; case WT_PAGE_COL_VAR: WT_ERR(__inmem_col_var(session, page, &size)); break; case WT_PAGE_ROW_INT: WT_ERR(__inmem_row_int(session, page, &size)); break; case WT_PAGE_ROW_LEAF: WT_ERR(__inmem_row_leaf(session, page)); break; WT_ILLEGAL_VALUE_ERR(session); } /* Update the page's in-memory size and the cache statistics. */ __wt_cache_page_inmem_incr(session, page, size); /* Link the new internal page to the parent. */ if (ref != NULL) { switch (page->type) { case WT_PAGE_COL_INT: case WT_PAGE_ROW_INT: page->pg_intl_parent_ref = ref; break; } ref->page = page; } *pagep = page; return (0); err: __wt_page_out(session, &page); return (ret); }
int __wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags #ifdef HAVE_DIAGNOSTIC , const char *file, int line #endif ) { WT_DECL_RET; WT_PAGE *page; u_int sleep_cnt, wait_cnt; int busy, force_attempts, oldgen; for (force_attempts = oldgen = 0, wait_cnt = 0;;) { switch (ref->state) { case WT_REF_DISK: case WT_REF_DELETED: if (LF_ISSET(WT_READ_CACHE)) return (WT_NOTFOUND); /* * The page isn't in memory, attempt to read it. * Make sure there is space in the cache. */ WT_RET(__wt_cache_full_check(session)); WT_RET(__wt_cache_read(session, ref)); oldgen = LF_ISSET(WT_READ_WONT_NEED) || F_ISSET(session, WT_SESSION_NO_CACHE); continue; case WT_REF_READING: if (LF_ISSET(WT_READ_CACHE)) return (WT_NOTFOUND); if (LF_ISSET(WT_READ_NO_WAIT)) return (WT_NOTFOUND); WT_STAT_FAST_CONN_INCR(session, page_read_blocked); break; case WT_REF_LOCKED: if (LF_ISSET(WT_READ_NO_WAIT)) return (WT_NOTFOUND); WT_STAT_FAST_CONN_INCR(session, page_locked_blocked); break; case WT_REF_SPLIT: return (WT_RESTART); case WT_REF_MEM: /* * The page is in memory. * * Get a hazard pointer if one is required. We cannot * be evicting if no hazard pointer is required, we're * done. */ if (F_ISSET(S2BT(session), WT_BTREE_IN_MEMORY)) goto skip_evict; /* * The expected reason we can't get a hazard pointer is * because the page is being evicted, yield, try again. */ #ifdef HAVE_DIAGNOSTIC WT_RET( __wt_hazard_set(session, ref, &busy, file, line)); #else WT_RET(__wt_hazard_set(session, ref, &busy)); #endif if (busy) { WT_STAT_FAST_CONN_INCR( session, page_busy_blocked); break; } /* * If eviction is configured for this file, check to see * if the page qualifies for forced eviction and update * the page's generation number. If eviction isn't being * done on this file, we're done. */ if (F_ISSET(S2BT(session), WT_BTREE_NO_EVICTION)) goto skip_evict; /* * Forcibly evict pages that are too big. */ page = ref->page; if (force_attempts < 10 && __evict_force_check(session, page, flags)) { ++force_attempts; ret = __wt_page_release_evict(session, ref); /* If forced eviction fails, stall. */ if (ret == EBUSY) { ret = 0; wait_cnt += 1000; WT_STAT_FAST_CONN_INCR(session, page_forcible_evict_blocked); break; } else WT_RET(ret); /* * The result of a successful forced eviction * is a page-state transition (potentially to * an in-memory page we can use, or a restart * return for our caller), continue the outer * page-acquisition loop. */ continue; } /* * If we read the page and we are configured to not * trash the cache, set the oldest read generation so * the page is forcibly evicted as soon as possible. * * Otherwise, update the page's read generation. */ if (oldgen && page->read_gen == WT_READGEN_NOTSET) __wt_page_evict_soon(page); else if (!LF_ISSET(WT_READ_NO_GEN) && page->read_gen != WT_READGEN_OLDEST && page->read_gen < __wt_cache_read_gen(session)) page->read_gen = __wt_cache_read_gen_set(session); skip_evict: /* * Check if we need an autocommit transaction. * Starting a transaction can trigger eviction, so skip * it if eviction isn't permitted. */ return (LF_ISSET(WT_READ_NO_EVICT) ? 0 : __wt_txn_autocommit_check(session)); WT_ILLEGAL_VALUE(session); } /* * We failed to get the page -- yield before retrying, and if * we've yielded enough times, start sleeping so we don't burn * CPU to no purpose. */ if (++wait_cnt < 1000) __wt_yield(); else { sleep_cnt = WT_MIN(wait_cnt, 10000); wait_cnt *= 2; WT_STAT_FAST_CONN_INCRV(session, page_sleep, sleep_cnt); __wt_sleep(0, sleep_cnt); } } }
/* * __bt_search -- * Search a btree for a key. * * Parameters: * t: tree to search * key: key to find * exactp: pointer to exact match flag * * Returns: * The EPG for matching record, if any, or the EPG for the location * of the key, if it were inserted into the tree, is entered into * the bt_cur field of the tree. A pointer to the field is returned. */ EPG * __bt_search(BTREE *t, const DBT *key, int *exactp) { PAGE *h; indx_t base, idx, lim; pgno_t pg; int cmp; BT_CLR(t); for (pg = P_ROOT;;) { if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL) return (NULL); /* Do a binary search on the current page. */ t->bt_cur.page = h; for (base = 0, lim = NEXTINDEX(h); lim; lim >>= 1) { t->bt_cur.index = idx = base + (lim >> 1); if ((cmp = __bt_cmp(t, key, &t->bt_cur)) == 0) { if (h->flags & P_BLEAF) { *exactp = 1; return (&t->bt_cur); } goto next; } if (cmp > 0) { base = idx + 1; --lim; } } /* * If it's a leaf page, we're almost done. If no duplicates * are allowed, or we have an exact match, we're done. Else, * it's possible that there were matching keys on this page, * which later deleted, and we're on a page with no matches * while there are matches on other pages. If at the start or * end of a page, check the adjacent page. */ if (h->flags & P_BLEAF) { if (!F_ISSET(t, B_NODUPS)) { if (base == 0 && h->prevpg != P_INVALID && __bt_sprev(t, h, key, exactp)) return (&t->bt_cur); if (base == NEXTINDEX(h) && h->nextpg != P_INVALID && __bt_snext(t, h, key, exactp)) return (&t->bt_cur); } *exactp = 0; t->bt_cur.index = base; return (&t->bt_cur); } /* * No match found. Base is the smallest index greater than * key and may be zero or a last + 1 index. If it's non-zero, * decrement by one, and record the internal page which should * be a parent page for the key. If a split later occurs, the * inserted page will be to the right of the saved page. */ idx = base ? base - 1 : base; next: BT_PUSH(t, h->pgno, idx); pg = GETBINTERNAL(h, idx)->pgno; mpool_put(t->bt_mp, h, 0); } }
/* * __wt_curds_open -- * Initialize a data-source cursor. */ int __wt_curds_open( WT_SESSION_IMPL *session, const char *uri, WT_CURSOR *owner, const char *cfg[], WT_DATA_SOURCE *dsrc, WT_CURSOR **cursorp) { WT_CURSOR_STATIC_INIT(iface, NULL, /* get-key */ NULL, /* get-value */ NULL, /* set-key */ NULL, /* set-value */ __curds_compare, /* compare */ __curds_next, /* next */ __curds_prev, /* prev */ __curds_reset, /* reset */ __curds_search, /* search */ __curds_search_near, /* search-near */ __curds_insert, /* insert */ __curds_update, /* update */ __curds_remove, /* remove */ __curds_close); /* close */ WT_CONFIG_ITEM cval; WT_CURSOR *cursor, *source; WT_CURSOR_DATA_SOURCE *data_source; WT_DECL_RET; const char *metaconf; STATIC_ASSERT(offsetof(WT_CURSOR_DATA_SOURCE, iface) == 0); data_source = NULL; metaconf = NULL; WT_RET(__wt_calloc_def(session, 1, &data_source)); cursor = &data_source->iface; *cursor = iface; cursor->session = &session->iface; F_SET(cursor, WT_CURSTD_DATA_SOURCE); /* * XXX * The underlying data-source may require the object's key and value * formats. This isn't a particularly elegant way of getting that * information to the data-source, this feels like a layering problem * to me. */ WT_ERR(__wt_metadata_search(session, uri, &metaconf)); WT_ERR(__wt_config_getones(session, metaconf, "key_format", &cval)); WT_ERR(__wt_strndup(session, cval.str, cval.len, &cursor->key_format)); WT_ERR(__wt_config_getones(session, metaconf, "value_format", &cval)); WT_ERR( __wt_strndup(session, cval.str, cval.len, &cursor->value_format)); WT_ERR(__wt_cursor_init(cursor, uri, owner, cfg, cursorp)); /* Data-source cursors have a collator reference. */ WT_ERR(__wt_collator_config(session, cfg, &data_source->collator)); WT_ERR(dsrc->open_cursor(dsrc, &session->iface, uri, (WT_CONFIG_ARG *)cfg, &data_source->source)); source = data_source->source; source->session = (WT_SESSION *)session; memset(&source->q, 0, sizeof(source->q)); source->recno = 0; memset(source->raw_recno_buf, 0, sizeof(source->raw_recno_buf)); memset(&source->key, 0, sizeof(source->key)); memset(&source->value, 0, sizeof(source->value)); source->saved_err = 0; source->flags = 0; if (0) { err: if (F_ISSET(cursor, WT_CURSTD_OPEN)) WT_TRET(cursor->close(cursor)); else __wt_free(session, data_source); *cursorp = NULL; } __wt_free(session, metaconf); return (ret); }
/* * cl_read -- * Read characters from the input. */ static input_t cl_read(SCR *sp, u_int32_t flags, char *bp, size_t blen, int *nrp, struct timeval *tp) { struct termios term1, term2; CL_PRIVATE *clp; GS *gp; fd_set rdfd; input_t rval; int maxfd, nr, term_reset; gp = sp->gp; clp = CLP(sp); term_reset = 0; /* * 1: A read from a file or a pipe. In this case, the reads * never timeout regardless. This means that we can hang * when trying to complete a map, but we're going to hang * on the next read anyway. */ if (!F_ISSET(clp, CL_STDIN_TTY)) { nr = extpread(STDIN_FILENO, bp, blen, O_FBLOCKING, -1); switch (nr) { case 0: return (INP_EOF); case -1: goto err; default: *nrp = nr; return (INP_OK); } /* NOTREACHED */ } /* * 2: A read with an associated timeout, e.g., trying to complete * a map sequence. If input exists, we fall into #3. */ if (tp != NULL) { FD_ZERO(&rdfd); FD_SET(STDIN_FILENO, &rdfd); switch (select(STDIN_FILENO + 1, &rdfd, NULL, NULL, tp)) { case 0: return (INP_TIMEOUT); case -1: goto err; default: break; } } /* * The user can enter a key in the editor to quote a character. If we * get here and the next key is supposed to be quoted, do what we can. * Reset the tty so that the user can enter a ^C, ^Q, ^S. There's an * obvious race here, when the key has already been entered, but there's * nothing that we can do to fix that problem. * * The editor can ask for the next literal character even thought it's * generally running in line-at-a-time mode. Do what we can. */ if (LF_ISSET(EC_QUOTED | EC_RAW) && !tcgetattr(STDIN_FILENO, &term1)) { term_reset = 1; if (LF_ISSET(EC_QUOTED)) { term2 = term1; term2.c_lflag &= ~ISIG; term2.c_iflag &= ~(IXON | IXOFF); (void)tcsetattr(STDIN_FILENO, TCSASOFT | TCSADRAIN, &term2); } else (void)tcsetattr(STDIN_FILENO, TCSASOFT | TCSADRAIN, &clp->vi_enter); } /* * 3: Wait for input. * * Select on the command input and scripting window file descriptors. * It's ugly that we wait on scripting file descriptors here, but it's * the only way to keep from locking out scripting windows. */ if (F_ISSET(gp, G_SCRWIN)) { loop: FD_ZERO(&rdfd); FD_SET(STDIN_FILENO, &rdfd); maxfd = STDIN_FILENO; if (F_ISSET(sp, SC_SCRIPT)) { FD_SET(sp->script->sh_master, &rdfd); if (sp->script->sh_master > maxfd) maxfd = sp->script->sh_master; } switch (select(maxfd + 1, &rdfd, NULL, NULL, NULL)) { case 0: abort(); case -1: goto err; default: break; } if (!FD_ISSET(STDIN_FILENO, &rdfd)) { if (sscr_input(sp)) return (INP_ERR); goto loop; } } /* * 4: Read the input. * * !!! * What's going on here is some scary stuff. Ex runs the terminal in * canonical mode. So, the <newline> character terminating a line of * input is returned in the buffer, but a trailing <EOF> character is * not similarly included. As ex uses 0<EOF> and ^<EOF> as autoindent * commands, it has to see the trailing <EOF> characters to determine * the difference between the user entering "0ab" and "0<EOF>ab". We * leave an extra slot in the buffer, so that we can add a trailing * <EOF> character if the buffer isn't terminated by a <newline>. We * lose if the buffer is too small for the line and exactly N characters * are entered followed by an <EOF> character. */ #define ONE_FOR_EOF 1 nr = extpread(STDIN_FILENO, bp, blen - ONE_FOR_EOF, O_FBLOCKING, -1); switch (nr) { case 0: /* EOF. */ /* * ^D in canonical mode returns a read of 0, i.e. EOF. EOF is * a valid command, but we don't want to loop forever because * the terminal driver is returning EOF because the user has * disconnected. The editor will almost certainly try to write * something before this fires, which should kill us, but You * Never Know. */ if (++clp->eof_count < 50) { bp[0] = clp->orig.c_cc[VEOF]; *nrp = 1; rval = INP_OK; } else rval = INP_EOF; break; case -1: /* Error or interrupt. */ err: if (errno == EINTR) rval = INP_INTR; else { rval = INP_ERR; msgq(sp, M_SYSERR, "input"); } break; default: /* Input characters. */ if (F_ISSET(sp, SC_EX) && bp[nr - 1] != '\n') bp[nr++] = clp->orig.c_cc[VEOF]; *nrp = nr; clp->eof_count = 0; rval = INP_OK; break; } /* Restore the terminal state if it was modified. */ if (term_reset) (void)tcsetattr(STDIN_FILENO, TCSASOFT | TCSADRAIN, &term1); return (rval); }
/* * __clsm_enter -- * Start an operation on an LSM cursor, update if the tree has changed. */ static inline int __clsm_enter(WT_CURSOR_LSM *clsm, bool reset, bool update) { WT_DECL_RET; WT_LSM_TREE *lsm_tree; WT_SESSION_IMPL *session; WT_TXN *txn; uint64_t *switch_txnp; uint64_t snap_min; lsm_tree = clsm->lsm_tree; session = (WT_SESSION_IMPL *)clsm->iface.session; txn = &session->txn; /* Merge cursors never update. */ if (F_ISSET(clsm, WT_CLSM_MERGE)) return (0); if (reset) { WT_ASSERT(session, !F_ISSET(&clsm->iface, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT)); WT_RET(__clsm_reset_cursors(clsm, NULL)); } for (;;) { /* * If the cursor looks up-to-date, check if the cache is full. * In case this call blocks, the check will be repeated before * proceeding. */ if (clsm->dsk_gen != lsm_tree->dsk_gen && lsm_tree->nchunks != 0) goto open; if (clsm->dsk_gen != lsm_tree->dsk_gen && lsm_tree->nchunks != 0) goto open; /* Update the maximum transaction ID in the primary chunk. */ if (update) { /* * Ensure that there is a transaction snapshot active. */ WT_RET(__wt_txn_autocommit_check(session)); WT_RET(__wt_txn_id_check(session)); WT_RET(__clsm_enter_update(clsm)); if (clsm->dsk_gen != clsm->lsm_tree->dsk_gen) goto open; if (txn->isolation == WT_ISO_SNAPSHOT) __wt_txn_cursor_op(session); /* * Figure out how many updates are required for * snapshot isolation. * * This is not a normal visibility check on the maximum * transaction ID in each chunk: any transaction ID * that overlaps with our snapshot is a potential * conflict. */ clsm->nupdates = 1; if (txn->isolation == WT_ISO_SNAPSHOT && F_ISSET(clsm, WT_CLSM_OPEN_SNAPSHOT)) { WT_ASSERT(session, F_ISSET(txn, WT_TXN_HAS_SNAPSHOT)); snap_min = txn->snap_min; for (switch_txnp = &clsm->switch_txn[clsm->nchunks - 2]; clsm->nupdates < clsm->nchunks; clsm->nupdates++, switch_txnp--) { if (WT_TXNID_LT(*switch_txnp, snap_min)) break; WT_ASSERT(session, !__wt_txn_visible_all( session, *switch_txnp)); } } } /* * Stop when we are up-to-date, as long as this is: * - a snapshot isolation update and the cursor is set up for * that; * - an update operation with a primary chunk, or * - a read operation and the cursor is open for reading. */ if ((!update || txn->isolation != WT_ISO_SNAPSHOT || F_ISSET(clsm, WT_CLSM_OPEN_SNAPSHOT)) && ((update && clsm->primary_chunk != NULL) || (!update && F_ISSET(clsm, WT_CLSM_OPEN_READ)))) break; open: WT_WITH_SCHEMA_LOCK(session, ret = __clsm_open_cursors(clsm, update, 0, 0)); WT_RET(ret); } if (!F_ISSET(clsm, WT_CLSM_ACTIVE)) { WT_RET(__cursor_enter(session)); F_SET(clsm, WT_CLSM_ACTIVE); } return (0); }
/* * __ckpt_process -- * Process the list of checkpoints. */ static int __ckpt_process(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_CKPT *ckptbase) { WT_BLOCK_CKPT *a, *b, *ci; WT_CKPT *ckpt, *next_ckpt; WT_DECL_ITEM(tmp); WT_DECL_RET; uint64_t ckpt_size; bool deleting, fatal, locked; ci = &block->live; fatal = locked = false; #ifdef HAVE_DIAGNOSTIC WT_RET(__ckpt_verify(session, ckptbase)); #endif /* * Checkpoints are a two-step process: first, write a new checkpoint to * disk (including all the new extent lists for modified checkpoints * and the live system). As part of this, create a list of file blocks * newly available for reallocation, based on checkpoints being deleted. * We then return the locations of the new checkpoint information to our * caller. Our caller has to write that information into some kind of * stable storage, and once that's done, we can actually allocate from * that list of newly available file blocks. (We can't allocate from * that list immediately because the allocation might happen before our * caller saves the new checkpoint information, and if we crashed before * the new checkpoint location was saved, we'd have overwritten blocks * still referenced by checkpoints in the system.) In summary, there is * a second step: after our caller saves the checkpoint information, we * are called to add the newly available blocks into the live system's * available list. * * This function is the first step, the second step is in the resolve * function. * * If we're called to checkpoint the same file twice (without the second * resolution step), or re-entered for any reason, it's an error in our * caller, and our choices are all bad: leak blocks or potentially crash * with our caller not yet having saved previous checkpoint information * to stable storage. */ __wt_spin_lock(session, &block->live_lock); switch (block->ckpt_state) { case WT_CKPT_INPROGRESS: block->ckpt_state = WT_CKPT_PANIC_ON_FAILURE; break; case WT_CKPT_NONE: case WT_CKPT_PANIC_ON_FAILURE: __wt_err(session, EINVAL, "%s: an unexpected checkpoint attempt: the checkpoint " "was never started or has already completed", block->name); ret = __wt_block_panic(session); break; case WT_CKPT_SALVAGE: /* Salvage doesn't use the standard checkpoint APIs. */ break; } __wt_spin_unlock(session, &block->live_lock); WT_RET(ret); /* * Extents newly available as a result of deleting previous checkpoints * are added to a list of extents. The list should be empty, but as * described above, there is no "free the checkpoint information" call * into the block manager; if there was an error in an upper level that * resulted in some previous checkpoint never being resolved, the list * may not be empty. We should have caught that with the "checkpoint * in progress" test, but it doesn't cost us anything to be cautious. * * We free the checkpoint's allocation and discard extent lists as part * of the resolution step, not because they're needed at that time, but * because it's potentially a lot of work, and waiting allows the btree * layer to continue eviction sooner. As for the checkpoint-available * list, make sure they get cleaned out. */ __wt_block_extlist_free(session, &ci->ckpt_avail); WT_RET(__wt_block_extlist_init( session, &ci->ckpt_avail, "live", "ckpt_avail", true)); __wt_block_extlist_free(session, &ci->ckpt_alloc); __wt_block_extlist_free(session, &ci->ckpt_discard); /* * To delete a checkpoint, we'll need checkpoint information for it and * the subsequent checkpoint into which it gets rolled; read them from * disk before we lock things down. */ deleting = false; WT_CKPT_FOREACH(ckptbase, ckpt) { if (F_ISSET(ckpt, WT_CKPT_FAKE) || !F_ISSET(ckpt, WT_CKPT_DELETE)) continue; deleting = true; /* * Read the checkpoint and next checkpoint extent lists if we * haven't already read them (we may have already read these * extent blocks if there is more than one deleted checkpoint). */ if (ckpt->bpriv == NULL) WT_ERR(__ckpt_extlist_read(session, block, ckpt)); for (next_ckpt = ckpt + 1;; ++next_ckpt) if (!F_ISSET(next_ckpt, WT_CKPT_FAKE)) break; /* * The "next" checkpoint may be the live tree which has no * extent blocks to read. */ if (next_ckpt->bpriv == NULL && !F_ISSET(next_ckpt, WT_CKPT_ADD)) WT_ERR(__ckpt_extlist_read(session, block, next_ckpt)); } /* * Failures are now fatal: we can't currently back out the merge of any * deleted checkpoint extent lists into the live system's extent lists, * so continuing after error would leave the live system's extent lists * corrupted for any subsequent checkpoint (and potentially, should a * subsequent checkpoint succeed, for recovery). */ fatal = true; /* * Hold a lock so the live extent lists and the file size can't change * underneath us. I suspect we'll tighten this if checkpoints take too * much time away from real work: we read the historic checkpoint * information without a lock, but we could also merge and re-write the * deleted and merged checkpoint information without a lock, except for * the final merge of ranges into the live tree. */ __wt_spin_lock(session, &block->live_lock); locked = true; /* * We've allocated our last page, update the checkpoint size. We need * to calculate the live system's checkpoint size before merging * checkpoint allocation and discard information from the checkpoints * we're deleting, those operations change the underlying byte counts. */ ckpt_size = ci->ckpt_size; ckpt_size += ci->alloc.bytes; ckpt_size -= ci->discard.bytes; /* Skip the additional processing if we aren't deleting checkpoints. */ if (!deleting) goto live_update; /* * Delete any no-longer-needed checkpoints: we do this first as it frees * blocks to the live lists, and the freed blocks will then be included * when writing the live extent lists. */ WT_CKPT_FOREACH(ckptbase, ckpt) { if (F_ISSET(ckpt, WT_CKPT_FAKE) || !F_ISSET(ckpt, WT_CKPT_DELETE)) continue; if (WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT)) { if (tmp == NULL) WT_ERR(__wt_scr_alloc(session, 0, &tmp)); WT_ERR(__ckpt_string( session, block, ckpt->raw.data, tmp)); __wt_verbose(session, WT_VERB_CHECKPOINT, "%s: delete-checkpoint: %s: %s", block->name, ckpt->name, (const char *)tmp->data); } /* * Find the checkpoint into which we'll roll this checkpoint's * blocks: it's the next real checkpoint in the list, and it * better have been read in (if it's not the add slot). */ for (next_ckpt = ckpt + 1;; ++next_ckpt) if (!F_ISSET(next_ckpt, WT_CKPT_FAKE)) break; /* * Set the from/to checkpoint structures, where the "to" value * may be the live tree. */ a = ckpt->bpriv; if (F_ISSET(next_ckpt, WT_CKPT_ADD)) b = &block->live; else b = next_ckpt->bpriv; /* * Free the root page: there's nothing special about this free, * the root page is allocated using normal rules, that is, it * may have been taken from the avail list, and was entered on * the live system's alloc list at that time. We free it into * the checkpoint's discard list, however, not the live system's * list because it appears on the checkpoint's alloc list and so * must be paired in the checkpoint. */ if (a->root_offset != WT_BLOCK_INVALID_OFFSET) WT_ERR(__wt_block_insert_ext(session, block, &a->discard, a->root_offset, a->root_size)); /* * Free the blocks used to hold the "from" checkpoint's extent * lists, including the avail list. */ WT_ERR(__ckpt_extlist_fblocks(session, block, &a->alloc)); WT_ERR(__ckpt_extlist_fblocks(session, block, &a->avail)); WT_ERR(__ckpt_extlist_fblocks(session, block, &a->discard)); /* * Roll the "from" alloc and discard extent lists into the "to" * checkpoint's lists. */ if (a->alloc.entries != 0) WT_ERR(__wt_block_extlist_merge( session, block, &a->alloc, &b->alloc)); if (a->discard.entries != 0) WT_ERR(__wt_block_extlist_merge( session, block, &a->discard, &b->discard)); /* * If the "to" checkpoint is also being deleted, we're done with * it, it's merged into some other checkpoint in the next loop. * This means the extent lists may aggregate over a number of * checkpoints, but that's OK, they're disjoint sets of ranges. */ if (F_ISSET(next_ckpt, WT_CKPT_DELETE)) continue; /* * Find blocks for re-use: wherever the "to" checkpoint's * allocate and discard lists overlap, move the range to * the live system's checkpoint available list. */ WT_ERR(__wt_block_extlist_overlap(session, block, b)); /* * If we're updating the live system's information, we're done. */ if (F_ISSET(next_ckpt, WT_CKPT_ADD)) continue; /* * We have to write the "to" checkpoint's extent lists out in * new blocks, and update its cookie. * * Free the blocks used to hold the "to" checkpoint's extent * lists; don't include the avail list, it's not changing. */ WT_ERR(__ckpt_extlist_fblocks(session, block, &b->alloc)); WT_ERR(__ckpt_extlist_fblocks(session, block, &b->discard)); F_SET(next_ckpt, WT_CKPT_UPDATE); } /* Update checkpoints marked for update. */ WT_CKPT_FOREACH(ckptbase, ckpt) if (F_ISSET(ckpt, WT_CKPT_UPDATE)) WT_ERR(__ckpt_update( session, block, ckpt, ckpt->bpriv, false)); live_update: /* Truncate the file if that's possible. */ WT_ERR(__wt_block_extlist_truncate(session, block, &ci->avail)); /* Update the final, added checkpoint based on the live system. */ WT_CKPT_FOREACH(ckptbase, ckpt) if (F_ISSET(ckpt, WT_CKPT_ADD)) { /* * !!! * Our caller wants the final checkpoint size. Setting * the size here violates layering, but the alternative * is a call for the btree layer to crack the checkpoint * cookie into its components, and that's a fair amount * of work. */ ckpt->ckpt_size = ckpt_size; /* * Set the rolling checkpoint size for the live system. * The current size includes the current checkpoint's * root page size (root pages are on the checkpoint's * block allocation list as root pages are allocated * with the usual block allocation functions). That's * correct, but we don't want to include it in the size * for the next checkpoint. */ ckpt_size -= ci->root_size; /* * Additionally, we had a bug for awhile where the live * checkpoint size grew without bound. We can't sanity * check the value, that would require walking the tree * as part of the checkpoint. Bound any bug at the size * of the file. * It isn't practical to assert that the value is within * bounds since databases created with older versions * of WiredTiger (2.8.0) would likely see an error. */ ci->ckpt_size = WT_MIN(ckpt_size, (uint64_t)block->size); WT_ERR(__ckpt_update(session, block, ckpt, ci, true)); } /* * Reset the live system's alloc and discard extent lists, leave the * avail list alone. This includes freeing a lot of extents, so do it * outside of the system's lock by copying and resetting the original, * then doing the work later. */ ci->ckpt_alloc = ci->alloc; WT_ERR(__wt_block_extlist_init( session, &ci->alloc, "live", "alloc", false)); ci->ckpt_discard = ci->discard; WT_ERR(__wt_block_extlist_init( session, &ci->discard, "live", "discard", false)); #ifdef HAVE_DIAGNOSTIC /* * The first checkpoint in the system should always have an empty * discard list. If we've read that checkpoint and/or created it, * check. */ WT_CKPT_FOREACH(ckptbase, ckpt) if (!F_ISSET(ckpt, WT_CKPT_DELETE)) break; if ((a = ckpt->bpriv) == NULL) a = &block->live; if (a->discard.entries != 0) WT_ERR_MSG(session, WT_ERROR, "first checkpoint incorrectly has blocks on the discard " "list"); #endif err: if (ret != 0 && fatal) { __wt_err(session, ret, "%s: fatal checkpoint failure", block->name); ret = __wt_block_panic(session); } if (locked) __wt_spin_unlock(session, &block->live_lock); /* Discard any checkpoint information we loaded. */ WT_CKPT_FOREACH(ckptbase, ckpt) if ((ci = ckpt->bpriv) != NULL) __wt_block_ckpt_destroy(session, ci); __wt_scr_free(session, &tmp); return (ret); }
/* * __clsm_open_cursors -- * Open cursors for the current set of files. */ static int __clsm_open_cursors( WT_CURSOR_LSM *clsm, bool update, u_int start_chunk, uint32_t start_id) { WT_BTREE *btree; WT_CURSOR *c, **cp, *primary; WT_DECL_RET; WT_LSM_CHUNK *chunk; WT_LSM_TREE *lsm_tree; WT_SESSION_IMPL *session; WT_TXN *txn; const char *checkpoint, *ckpt_cfg[3]; uint64_t saved_gen; u_int i, nchunks, ngood, nupdates; u_int close_range_end, close_range_start; bool locked; c = &clsm->iface; session = (WT_SESSION_IMPL *)c->session; txn = &session->txn; chunk = NULL; locked = false; lsm_tree = clsm->lsm_tree; /* * Ensure that any snapshot update has cursors on the right set of * chunks to guarantee visibility is correct. */ if (update && txn->isolation == WT_ISO_SNAPSHOT) F_SET(clsm, WT_CLSM_OPEN_SNAPSHOT); /* * Query operations need a full set of cursors. Overwrite cursors * do queries in service of updates. */ if (!update || !F_ISSET(c, WT_CURSTD_OVERWRITE)) F_SET(clsm, WT_CLSM_OPEN_READ); if (lsm_tree->nchunks == 0) return (0); ckpt_cfg[0] = WT_CONFIG_BASE(session, WT_SESSION_open_cursor); ckpt_cfg[1] = "checkpoint=" WT_CHECKPOINT ",raw"; ckpt_cfg[2] = NULL; /* * If the key is pointing to memory that is pinned by a chunk * cursor, take a copy before closing cursors. */ if (F_ISSET(c, WT_CURSTD_KEY_INT)) WT_CURSOR_NEEDKEY(c); F_CLR(clsm, WT_CLSM_ITERATE_NEXT | WT_CLSM_ITERATE_PREV); WT_RET(__wt_lsm_tree_readlock(session, lsm_tree)); locked = true; /* Merge cursors have already figured out how many chunks they need. */ retry: if (F_ISSET(clsm, WT_CLSM_MERGE)) { nchunks = clsm->nchunks; ngood = 0; /* * We may have raced with another merge completing. Check that * we're starting at the right offset in the chunk array. */ if (start_chunk >= lsm_tree->nchunks || lsm_tree->chunk[start_chunk]->id != start_id) { for (start_chunk = 0; start_chunk < lsm_tree->nchunks; start_chunk++) { chunk = lsm_tree->chunk[start_chunk]; if (chunk->id == start_id) break; } /* We have to find the start chunk: merge locked it. */ WT_ASSERT(session, start_chunk < lsm_tree->nchunks); } WT_ASSERT(session, start_chunk + nchunks <= lsm_tree->nchunks); } else { nchunks = lsm_tree->nchunks; /* * If we are only opening the cursor for updates, only open the * primary chunk, plus any other chunks that might be required * to detect snapshot isolation conflicts. */ if (F_ISSET(clsm, WT_CLSM_OPEN_SNAPSHOT)) WT_ERR(__wt_realloc_def(session, &clsm->txnid_alloc, nchunks, &clsm->switch_txn)); if (F_ISSET(clsm, WT_CLSM_OPEN_READ)) ngood = nupdates = 0; else if (F_ISSET(clsm, WT_CLSM_OPEN_SNAPSHOT)) { /* * Keep going until all updates in the next * chunk are globally visible. Copy the maximum * transaction IDs into the cursor as we go. */ for (ngood = nchunks - 1, nupdates = 1; ngood > 0; ngood--, nupdates++) { chunk = lsm_tree->chunk[ngood - 1]; clsm->switch_txn[ngood - 1] = chunk->switch_txn; if (__wt_txn_visible_all( session, chunk->switch_txn)) break; } } else { nupdates = 1; ngood = nchunks - 1; } /* Check how many cursors are already open. */ for (cp = clsm->cursors + ngood; ngood < clsm->nchunks && ngood < nchunks; cp++, ngood++) { chunk = lsm_tree->chunk[ngood]; /* If the cursor isn't open yet, we're done. */ if (*cp == NULL) break; /* Easy case: the URIs don't match. */ if (strcmp((*cp)->uri, chunk->uri) != 0) break; /* Make sure the checkpoint config matches. */ checkpoint = ((WT_CURSOR_BTREE *)*cp)-> btree->dhandle->checkpoint; if (checkpoint == NULL && F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) && !chunk->empty) break; /* Make sure the Bloom config matches. */ if (clsm->blooms[ngood] == NULL && F_ISSET(chunk, WT_LSM_CHUNK_BLOOM)) break; } /* Spurious generation bump? */ if (ngood == clsm->nchunks && clsm->nchunks == nchunks) { clsm->dsk_gen = lsm_tree->dsk_gen; goto err; } /* * Close any cursors we no longer need. * * Drop the LSM tree lock while we do this: if the cache is * full, we may block while closing a cursor. Save the * generation number and retry if it has changed under us. */ if (clsm->cursors != NULL && ngood < clsm->nchunks) { close_range_start = ngood; close_range_end = clsm->nchunks; } else if (!F_ISSET(clsm, WT_CLSM_OPEN_READ) && nupdates > 0 ) { close_range_start = 0; close_range_end = WT_MIN(nchunks, clsm->nchunks); if (close_range_end > nupdates) close_range_end -= nupdates; else close_range_end = 0; WT_ASSERT(session, ngood >= close_range_end); } else { close_range_end = 0; close_range_start = 0; } if (close_range_end > close_range_start) { saved_gen = lsm_tree->dsk_gen; locked = false; WT_ERR(__wt_lsm_tree_readunlock(session, lsm_tree)); WT_ERR(__clsm_close_cursors( clsm, close_range_start, close_range_end)); WT_ERR(__wt_lsm_tree_readlock(session, lsm_tree)); locked = true; if (lsm_tree->dsk_gen != saved_gen) goto retry; } /* Detach from our old primary. */ clsm->primary_chunk = NULL; clsm->current = NULL; } WT_ERR(__wt_realloc_def(session, &clsm->bloom_alloc, nchunks, &clsm->blooms)); WT_ERR(__wt_realloc_def(session, &clsm->cursor_alloc, nchunks, &clsm->cursors)); clsm->nchunks = nchunks; /* Open the cursors for chunks that have changed. */ for (i = ngood, cp = clsm->cursors + i; i != nchunks; i++, cp++) { chunk = lsm_tree->chunk[i + start_chunk]; /* Copy the maximum transaction ID. */ if (F_ISSET(clsm, WT_CLSM_OPEN_SNAPSHOT)) clsm->switch_txn[i] = chunk->switch_txn; /* * Read from the checkpoint if the file has been written. * Once all cursors switch, the in-memory tree can be evicted. */ WT_ASSERT(session, *cp == NULL); ret = __wt_open_cursor(session, chunk->uri, c, (F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) && !chunk->empty) ? ckpt_cfg : NULL, cp); /* * XXX kludge: we may have an empty chunk where no checkpoint * was written. If so, try to open the ordinary handle on that * chunk instead. */ if (ret == WT_NOTFOUND && F_ISSET(chunk, WT_LSM_CHUNK_ONDISK)) { ret = __wt_open_cursor( session, chunk->uri, c, NULL, cp); if (ret == 0) chunk->empty = 1; } WT_ERR(ret); /* * Setup all cursors other than the primary to only do conflict * checks on insert operations. This allows us to execute * inserts on non-primary chunks as a way of checking for * write conflicts with concurrent updates. */ if (i != nchunks - 1) (*cp)->insert = __wt_curfile_update_check; if (!F_ISSET(clsm, WT_CLSM_MERGE) && F_ISSET(chunk, WT_LSM_CHUNK_BLOOM)) WT_ERR(__wt_bloom_open(session, chunk->bloom_uri, lsm_tree->bloom_bit_count, lsm_tree->bloom_hash_count, c, &clsm->blooms[i])); /* Child cursors always use overwrite and raw mode. */ F_SET(*cp, WT_CURSTD_OVERWRITE | WT_CURSTD_RAW); } /* The last chunk is our new primary. */ if (chunk != NULL && !F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) && chunk->switch_txn == WT_TXN_NONE) { clsm->primary_chunk = chunk; primary = clsm->cursors[clsm->nchunks - 1]; /* * Disable eviction for the in-memory chunk. Also clear the * bulk load flag here, otherwise eviction will be enabled by * the first update. */ btree = ((WT_CURSOR_BTREE *)(primary))->btree; if (btree->bulk_load_ok) { btree->bulk_load_ok = false; WT_WITH_BTREE(session, btree, __wt_btree_evictable(session, false)); } } clsm->dsk_gen = lsm_tree->dsk_gen; err: #ifdef HAVE_DIAGNOSTIC /* Check that all cursors are open as expected. */ if (ret == 0 && F_ISSET(clsm, WT_CLSM_OPEN_READ)) { for (i = 0, cp = clsm->cursors; i != clsm->nchunks; cp++, i++) { chunk = lsm_tree->chunk[i + start_chunk]; /* Make sure the cursor is open. */ WT_ASSERT(session, *cp != NULL); /* Easy case: the URIs should match. */ WT_ASSERT(session, strcmp((*cp)->uri, chunk->uri) == 0); /* Make sure the checkpoint config matches. */ checkpoint = ((WT_CURSOR_BTREE *)*cp)-> btree->dhandle->checkpoint; WT_ASSERT(session, (F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) && !chunk->empty) ? checkpoint != NULL : checkpoint == NULL); /* Make sure the Bloom config matches. */ WT_ASSERT(session, (F_ISSET(chunk, WT_LSM_CHUNK_BLOOM) && !F_ISSET(clsm, WT_CLSM_MERGE)) ? clsm->blooms[i] != NULL : clsm->blooms[i] == NULL); } } #endif if (locked) WT_TRET(__wt_lsm_tree_readunlock(session, lsm_tree)); return (ret); }
/* * __wt_txn_commit -- * Commit the current transaction. */ int __wt_txn_commit(WT_SESSION_IMPL *session, const char *cfg[]) { WT_CONFIG_ITEM cval; WT_CONNECTION_IMPL *conn; WT_DECL_RET; WT_TXN *txn; WT_TXN_GLOBAL *txn_global; WT_TXN_OP *op; u_int i; bool locked, readonly; #ifdef HAVE_TIMESTAMPS wt_timestamp_t prev_commit_timestamp, ts; bool update_timestamp; #endif txn = &session->txn; conn = S2C(session); txn_global = &conn->txn_global; locked = false; WT_ASSERT(session, F_ISSET(txn, WT_TXN_RUNNING)); WT_ASSERT(session, !F_ISSET(txn, WT_TXN_ERROR) || txn->mod_count == 0); readonly = txn->mod_count == 0; /* * Look for a commit timestamp. */ WT_ERR( __wt_config_gets_def(session, cfg, "commit_timestamp", 0, &cval)); if (cval.len != 0) { #ifdef HAVE_TIMESTAMPS WT_ERR(__wt_txn_parse_timestamp(session, "commit", &ts, &cval)); WT_ERR(__wt_timestamp_validate(session, "commit", &ts, &cval, true, true, true)); __wt_timestamp_set(&txn->commit_timestamp, &ts); __wt_txn_set_commit_timestamp(session); #else WT_ERR_MSG(session, EINVAL, "commit_timestamp requires a " "version of WiredTiger built with timestamp support"); #endif } #ifdef HAVE_TIMESTAMPS /* * Debugging checks on timestamps, if user requested them. */ if (F_ISSET(txn, WT_TXN_TS_COMMIT_ALWAYS) && !F_ISSET(txn, WT_TXN_HAS_TS_COMMIT) && txn->mod_count != 0) WT_ERR_MSG(session, EINVAL, "commit_timestamp required and " "none set on this transaction"); if (F_ISSET(txn, WT_TXN_TS_COMMIT_NEVER) && F_ISSET(txn, WT_TXN_HAS_TS_COMMIT) && txn->mod_count != 0) WT_ERR_MSG(session, EINVAL, "no commit_timestamp required and " "timestamp set on this transaction"); #endif /* * The default sync setting is inherited from the connection, but can * be overridden by an explicit "sync" setting for this transaction. */ WT_ERR(__wt_config_gets_def(session, cfg, "sync", 0, &cval)); /* * If the user chose the default setting, check whether sync is enabled * for this transaction (either inherited or via begin_transaction). * If sync is disabled, clear the field to avoid the log write being * flushed. * * Otherwise check for specific settings. We don't need to check for * "on" because that is the default inherited from the connection. If * the user set anything in begin_transaction, we only override with an * explicit setting. */ if (cval.len == 0) { if (!FLD_ISSET(txn->txn_logsync, WT_LOG_SYNC_ENABLED) && !F_ISSET(txn, WT_TXN_SYNC_SET)) txn->txn_logsync = 0; } else { /* * If the caller already set sync on begin_transaction then * they should not be using sync on commit_transaction. * Flag that as an error. */ if (F_ISSET(txn, WT_TXN_SYNC_SET)) WT_ERR_MSG(session, EINVAL, "Sync already set during begin_transaction"); if (WT_STRING_MATCH("background", cval.str, cval.len)) txn->txn_logsync = WT_LOG_BACKGROUND; else if (WT_STRING_MATCH("off", cval.str, cval.len)) txn->txn_logsync = 0; /* * We don't need to check for "on" here because that is the * default to inherit from the connection setting. */ } /* Commit notification. */ if (txn->notify != NULL) WT_ERR(txn->notify->notify(txn->notify, (WT_SESSION *)session, txn->id, 1)); /* * We are about to release the snapshot: copy values into any * positioned cursors so they don't point to updates that could be * freed once we don't have a snapshot. */ if (session->ncursors > 0) { WT_DIAGNOSTIC_YIELD; WT_ERR(__wt_session_copy_values(session)); } /* If we are logging, write a commit log record. */ if (txn->logrec != NULL && FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED) && !F_ISSET(session, WT_SESSION_NO_LOGGING)) { /* * We are about to block on I/O writing the log. * Release our snapshot in case it is keeping data pinned. * This is particularly important for checkpoints. */ __wt_txn_release_snapshot(session); /* * We hold the visibility lock for reading from the time * we write our log record until the time we release our * transaction so that the LSN any checkpoint gets will * always reflect visible data. */ __wt_readlock(session, &txn_global->visibility_rwlock); locked = true; WT_ERR(__wt_txn_log_commit(session, cfg)); } /* Note: we're going to commit: nothing can fail after this point. */ /* Process and free updates. */ for (i = 0, op = txn->mod; i < txn->mod_count; i++, op++) { switch (op->type) { case WT_TXN_OP_BASIC: case WT_TXN_OP_BASIC_TS: case WT_TXN_OP_INMEM: /* * Switch reserved operations to abort to * simplify obsolete update list truncation. */ if (op->u.upd->type == WT_UPDATE_RESERVED) { op->u.upd->txnid = WT_TXN_ABORTED; break; } /* * Writes to the lookaside file can be evicted as soon * as they commit. */ if (conn->cache->las_fileid != 0 && op->fileid == conn->cache->las_fileid) { op->u.upd->txnid = WT_TXN_NONE; break; } #ifdef HAVE_TIMESTAMPS if (F_ISSET(txn, WT_TXN_HAS_TS_COMMIT) && op->type != WT_TXN_OP_BASIC_TS) { WT_ASSERT(session, op->fileid != WT_METAFILE_ID); __wt_timestamp_set(&op->u.upd->timestamp, &txn->commit_timestamp); } #endif break; case WT_TXN_OP_REF: #ifdef HAVE_TIMESTAMPS if (F_ISSET(txn, WT_TXN_HAS_TS_COMMIT)) __wt_timestamp_set( &op->u.ref->page_del->timestamp, &txn->commit_timestamp); #endif break; case WT_TXN_OP_TRUNCATE_COL: case WT_TXN_OP_TRUNCATE_ROW: /* Other operations don't need timestamps. */ break; } __wt_txn_op_free(session, op); } txn->mod_count = 0; #ifdef HAVE_TIMESTAMPS /* * Track the largest commit timestamp we have seen. * * We don't actually clear the local commit timestamp, just the flag. * That said, we can't update the global commit timestamp until this * transaction is visible, which happens when we release it. */ update_timestamp = F_ISSET(txn, WT_TXN_HAS_TS_COMMIT); #endif __wt_txn_release(session); if (locked) __wt_readunlock(session, &txn_global->visibility_rwlock); #ifdef HAVE_TIMESTAMPS /* First check if we've already committed something in the future. */ if (update_timestamp) { WT_WITH_TIMESTAMP_READLOCK(session, &txn_global->rwlock, __wt_timestamp_set( &prev_commit_timestamp, &txn_global->commit_timestamp)); update_timestamp = __wt_timestamp_cmp( &txn->commit_timestamp, &prev_commit_timestamp) > 0; } /* * If it looks like we need to move the global commit timestamp, * write lock and re-check. */ if (update_timestamp) { #if WT_TIMESTAMP_SIZE == 8 while (__wt_timestamp_cmp( &txn->commit_timestamp, &prev_commit_timestamp) > 0) { if (__wt_atomic_cas64( &txn_global->commit_timestamp.val, prev_commit_timestamp.val, txn->commit_timestamp.val)) { txn_global->has_commit_timestamp = true; break; } __wt_timestamp_set( &prev_commit_timestamp, &txn_global->commit_timestamp); } #else __wt_writelock(session, &txn_global->rwlock); if (__wt_timestamp_cmp(&txn->commit_timestamp, &txn_global->commit_timestamp) > 0) { __wt_timestamp_set(&txn_global->commit_timestamp, &txn->commit_timestamp); txn_global->has_commit_timestamp = true; } __wt_writeunlock(session, &txn_global->rwlock); #endif } #endif /* * We're between transactions, if we need to block for eviction, it's * a good time to do so. Note that we must ignore any error return * because the user's data is committed. */ if (!readonly) (void)__wt_cache_eviction_check(session, false, false, NULL); return (0); err: /* * If anything went wrong, roll back. * * !!! * Nothing can fail after this point. */ if (locked) __wt_readunlock(session, &txn_global->visibility_rwlock); WT_TRET(__wt_txn_rollback(session, cfg)); return (ret); }
/* * __clsm_enter_update -- * Make sure an LSM cursor is ready to perform an update. */ static int __clsm_enter_update(WT_CURSOR_LSM *clsm) { WT_CURSOR *primary; WT_LSM_CHUNK *primary_chunk; WT_LSM_TREE *lsm_tree; WT_SESSION_IMPL *session; bool hard_limit, have_primary, ovfl; lsm_tree = clsm->lsm_tree; ovfl = false; session = (WT_SESSION_IMPL *)clsm->iface.session; if (clsm->nchunks == 0) { primary = NULL; have_primary = false; } else { primary = clsm->cursors[clsm->nchunks - 1]; primary_chunk = clsm->primary_chunk; WT_ASSERT(session, F_ISSET(&session->txn, WT_TXN_HAS_ID)); have_primary = (primary != NULL && primary_chunk != NULL && (primary_chunk->switch_txn == WT_TXN_NONE || WT_TXNID_LT(session->txn.id, primary_chunk->switch_txn))); } /* * In LSM there are multiple btrees active at one time. The tree * switch code needs to use btree API methods, and it wants to * operate on the btree for the primary chunk. Set that up now. * * If the primary chunk has grown too large, set a flag so the worker * thread will switch when it gets a chance to avoid introducing high * latency into application threads. Don't do this indefinitely: if a * chunk grows twice as large as the configured size, block until it * can be switched. */ hard_limit = F_ISSET(lsm_tree, WT_LSM_TREE_NEED_SWITCH); if (have_primary) { WT_ENTER_PAGE_INDEX(session); WT_WITH_BTREE(session, ((WT_CURSOR_BTREE *)primary)->btree, ovfl = __wt_btree_lsm_over_size(session, hard_limit ? 2 * lsm_tree->chunk_size : lsm_tree->chunk_size)); WT_LEAVE_PAGE_INDEX(session); /* If there was no overflow, we're done. */ if (!ovfl) return (0); } /* Request a switch. */ WT_RET(__wt_clsm_request_switch(clsm)); /* If we only overflowed the soft limit, we're done. */ if (have_primary && !hard_limit) return (0); WT_RET(__wt_clsm_await_switch(clsm)); return (0); }
/* * cl_attr -- * Toggle a screen attribute on/off. * * PUBLIC: int cl_attr __P((SCR *, scr_attr_t, int)); */ int cl_attr(SCR *sp, scr_attr_t attribute, int on) { CL_PRIVATE *clp; WINDOW *win; clp = CLP(sp); win = CLSP(sp) ? CLSP(sp) : stdscr; switch (attribute) { case SA_ALTERNATE: /* * !!! * There's a major layering violation here. The problem is that the * X11 xterm screen has what's known as an "alternate" screen. Some * xterm termcap/terminfo entries include sequences to switch to/from * that alternate screen as part of the ti/te (smcup/rmcup) strings. * Vi runs in the alternate screen, so that you are returned to the * same screen contents on exit from vi that you had when you entered * vi. Further, when you run :shell, or :!date or similar ex commands, * you also see the original screen contents. This wasn't deliberate * on vi's part, it's just that it historically sent terminal init/end * sequences at those times, and the addition of the alternate screen * sequences to the strings changed the behavior of vi. The problem * caused by this is that we don't want to switch back to the alternate * screen while getting a new command from the user, when the user is * continuing to enter ex commands, e.g.: * * :!date <<< switch to original screen * [Hit return to continue] <<< prompt user to continue * :command <<< get command from user * * Note that the :command input is a true vi input mode, e.g., input * maps and abbreviations are being done. So, we need to be able to * switch back into the vi screen mode, without flashing the screen. * * To make matters worse, the curses initscr() and endwin() calls will * do this automatically -- so, this attribute isn't as controlled by * the higher level screen as closely as one might like. */ if (on) { if (clp->ti_te != TI_SENT) { clp->ti_te = TI_SENT; if (clp->smcup == NULL) (void)cl_getcap(sp, "smcup", &clp->smcup); if (clp->smcup != NULL) (void)tputs(clp->smcup, 1, cl_putchar); } } else if (clp->ti_te != TE_SENT) { clp->ti_te = TE_SENT; if (clp->rmcup == NULL) (void)cl_getcap(sp, "rmcup", &clp->rmcup); if (clp->rmcup != NULL) (void)tputs(clp->rmcup, 1, cl_putchar); (void)fflush(stdout); } (void)fflush(stdout); break; case SA_INVERSE: if (F_ISSET(sp, SC_EX | SC_SCR_EXWROTE)) { if (clp->smso == NULL) return (1); if (on) (void)tputs(clp->smso, 1, cl_putchar); else (void)tputs(clp->rmso, 1, cl_putchar); (void)fflush(stdout); } else { if (on) (void)wstandout(win); else (void)wstandend(win); } break; default: abort(); } return (0); }
/* * __wt_btcur_search_near -- * Search for a record in the tree. */ int __wt_btcur_search_near(WT_CURSOR_BTREE *cbt, int *exactp) { WT_BTREE *btree; WT_CURSOR *cursor; WT_DECL_RET; WT_SESSION_IMPL *session; WT_UPDATE *upd; int exact; bool valid; btree = cbt->btree; cursor = &cbt->iface; session = (WT_SESSION_IMPL *)cursor->session; upd = NULL; /* -Wuninitialized */ exact = 0; WT_STAT_CONN_INCR(session, cursor_search_near); WT_STAT_DATA_INCR(session, cursor_search_near); /* * If we have a row-store page pinned, search it; if we don't have a * page pinned, or the search of the pinned page doesn't find an exact * match, search from the root. Unlike WT_CURSOR.search, ignore pinned * pages in the case of column-store, search-near isn't an interesting * enough case for column-store to add the complexity needed to avoid * the tree search. * * Set the "insert" flag for the btree row-store search; we may intend * to position the cursor at the end of the tree, rather than match an * existing record. */ valid = false; if (btree->type == BTREE_ROW && F_ISSET(cbt, WT_CBT_ACTIVE) && cbt->ref->page->read_gen != WT_READGEN_OLDEST) { WT_ERR(__wt_txn_cursor_op(session)); WT_ERR(__cursor_row_search(session, cbt, cbt->ref, true)); /* * Search-near is trickier than search when searching an already * pinned page. If search returns the first or last page slots, * discard the results and search the full tree as the neighbor * pages might offer better matches. This test is simplistic as * we're ignoring append lists (there may be no page slots or we * might be legitimately positioned after the last page slot). * Ignore those cases, it makes things too complicated. */ if (cbt->slot != 0 && cbt->slot != cbt->ref->page->pg_row_entries - 1) valid = __cursor_valid(cbt, &upd); } if (!valid) { WT_ERR(__cursor_func_init(cbt, true)); WT_ERR(btree->type == BTREE_ROW ? __cursor_row_search(session, cbt, NULL, true) : __cursor_col_search(session, cbt, NULL)); valid = __cursor_valid(cbt, &upd); } /* * If we find a valid key, return it. * * Else, creating a record past the end of the tree in a fixed-length * column-store implicitly fills the gap with empty records. In this * case, we instantiate the empty record, it's an exact match. * * Else, move to the next key in the tree (bias for prefix searches). * Cursor next skips invalid rows, so we don't have to test for them * again. * * Else, redo the search and move to the previous key in the tree. * Cursor previous skips invalid rows, so we don't have to test for * them again. * * If that fails, quit, there's no record to return. */ if (valid) { exact = cbt->compare; ret = __wt_kv_return(session, cbt, upd); } else if (__cursor_fix_implicit(btree, cbt)) { cbt->recno = cursor->recno; cbt->v = 0; cursor->value.data = &cbt->v; cursor->value.size = 1; exact = 0; } else if ((ret = __wt_btcur_next(cbt, false)) != WT_NOTFOUND) exact = 1; else { WT_ERR(__cursor_func_init(cbt, true)); WT_ERR(btree->type == BTREE_ROW ? __cursor_row_search(session, cbt, NULL, true) : __cursor_col_search(session, cbt, NULL)); if (__cursor_valid(cbt, &upd)) { exact = cbt->compare; ret = __wt_kv_return(session, cbt, upd); } else if ((ret = __wt_btcur_prev(cbt, false)) != WT_NOTFOUND) exact = -1; } #ifdef HAVE_DIAGNOSTIC if (ret == 0) WT_ERR(__wt_cursor_key_order_init(session, cbt)); #endif err: if (ret != 0) WT_TRET(__cursor_reset(cbt)); if (exactp != NULL && (ret == 0 || ret == WT_NOTFOUND)) *exactp = exact; return (ret); }