/* * __wt_verify_dsk_image -- * Verify a single block as read from disk. */ int __wt_verify_dsk_image(WT_SESSION_IMPL *session, const char *tag, const WT_PAGE_HEADER *dsk, size_t size, int empty_page_ok) { const uint8_t *p, *end; u_int i; uint8_t flags; /* Check the page type. */ switch (dsk->type) { case WT_PAGE_BLOCK_MANAGER: case WT_PAGE_COL_FIX: case WT_PAGE_COL_INT: case WT_PAGE_COL_VAR: case WT_PAGE_OVFL: case WT_PAGE_ROW_INT: case WT_PAGE_ROW_LEAF: break; case WT_PAGE_INVALID: default: WT_RET_VRFY(session, "page at %s has an invalid type of %" PRIu32, tag, dsk->type); } /* Check the page record number. */ switch (dsk->type) { case WT_PAGE_COL_FIX: case WT_PAGE_COL_INT: case WT_PAGE_COL_VAR: if (dsk->recno != 0) break; WT_RET_VRFY(session, "%s page at %s has a record number of zero", __wt_page_type_string(dsk->type), tag); case WT_PAGE_BLOCK_MANAGER: case WT_PAGE_OVFL: case WT_PAGE_ROW_INT: case WT_PAGE_ROW_LEAF: if (dsk->recno == 0) break; WT_RET_VRFY(session, "%s page at %s has a non-zero record number", __wt_page_type_string(dsk->type), tag); } /* Check the page flags. */ flags = dsk->flags; if (LF_ISSET(WT_PAGE_COMPRESSED)) LF_CLR(WT_PAGE_COMPRESSED); if (LF_ISSET(WT_PAGE_ENCRYPTED)) LF_CLR(WT_PAGE_ENCRYPTED); if (dsk->type == WT_PAGE_ROW_LEAF) { if (LF_ISSET(WT_PAGE_EMPTY_V_ALL) && LF_ISSET(WT_PAGE_EMPTY_V_NONE)) WT_RET_VRFY(session, "page at %s has invalid flags combination: 0x%" PRIx8, tag, dsk->flags); if (LF_ISSET(WT_PAGE_EMPTY_V_ALL)) LF_CLR(WT_PAGE_EMPTY_V_ALL); if (LF_ISSET(WT_PAGE_EMPTY_V_NONE)) LF_CLR(WT_PAGE_EMPTY_V_NONE); } if (flags != 0) WT_RET_VRFY(session, "page at %s has invalid flags set: 0x%" PRIx8, tag, flags); /* Unused bytes */ for (p = dsk->unused, i = sizeof(dsk->unused); i > 0; --i) if (*p != '\0') WT_RET_VRFY(session, "page at %s has non-zero unused page header bytes", tag); /* * Any bytes after the data chunk should be nul bytes; ignore if the * size is 0, that allows easy checking of disk images where we don't * have the size. */ if (size != 0) { p = (uint8_t *)dsk + dsk->mem_size; end = (uint8_t *)dsk + size; for (; p < end; ++p) if (*p != '\0') WT_RET_VRFY(session, "%s page at %s has non-zero trailing bytes", __wt_page_type_string(dsk->type), tag); } /* Check for empty pages, then verify the items on the page. */ switch (dsk->type) { case WT_PAGE_COL_INT: case WT_PAGE_COL_FIX: case WT_PAGE_COL_VAR: case WT_PAGE_ROW_INT: case WT_PAGE_ROW_LEAF: if (!empty_page_ok && dsk->u.entries == 0) WT_RET_VRFY(session, "%s page at %s has no entries", __wt_page_type_string(dsk->type), tag); break; case WT_PAGE_BLOCK_MANAGER: case WT_PAGE_OVFL: if (dsk->u.datalen == 0) WT_RET_VRFY(session, "%s page at %s has no data", __wt_page_type_string(dsk->type), tag); break; } switch (dsk->type) { case WT_PAGE_COL_INT: return (__verify_dsk_col_int(session, tag, dsk)); case WT_PAGE_COL_FIX: return (__verify_dsk_col_fix(session, tag, dsk)); case WT_PAGE_COL_VAR: return (__verify_dsk_col_var(session, tag, dsk)); case WT_PAGE_ROW_INT: case WT_PAGE_ROW_LEAF: return (__verify_dsk_row(session, tag, dsk)); case WT_PAGE_BLOCK_MANAGER: case WT_PAGE_OVFL: return (__verify_dsk_chunk(session, tag, dsk, dsk->u.datalen)); WT_ILLEGAL_VALUE(session); } /* NOTREACHED */ }
/* * editor -- * Main editor routine. * * PUBLIC: int editor(GS *, int, char *[]); */ int editor(GS *gp, int argc, char *argv[]) { extern int optind; extern char *optarg; const char *p; EVENT ev; FREF *frp; SCR *sp; size_t len; u_int flags; int ch, flagchk, lflag, secure, startup, readonly, rval, silent; char *tag_f, *wsizearg, path[256]; CHAR_T *w; size_t wlen; /* Initialize the busy routine, if not defined by the screen. */ if (gp->scr_busy == NULL) gp->scr_busy = vs_busy; /* Initialize the message routine, if not defined by the screen. */ if (gp->scr_msg == NULL) gp->scr_msg = vs_msg; gp->catd = (nl_catd)-1; /* Common global structure initialization. */ TAILQ_INIT(gp->dq); TAILQ_INIT(gp->hq); SLIST_INIT(gp->ecq); SLIST_INSERT_HEAD(gp->ecq, &gp->excmd, q); gp->noprint = DEFAULT_NOPRINT; /* Structures shared by screens so stored in the GS structure. */ TAILQ_INIT(gp->frefq); TAILQ_INIT(gp->dcb_store.textq); SLIST_INIT(gp->cutq); SLIST_INIT(gp->seqq); /* Set initial screen type and mode based on the program name. */ readonly = 0; if (!strcmp(getprogname(), "ex") || !strcmp(getprogname(), "nex")) LF_INIT(SC_EX); else { /* Nview, view are readonly. */ if (!strcmp(getprogname(), "nview") || !strcmp(getprogname(), "view")) readonly = 1; /* Vi is the default. */ LF_INIT(SC_VI); } /* Convert old-style arguments into new-style ones. */ if (v_obsolete(argv)) return (1); /* Parse the arguments. */ flagchk = '\0'; tag_f = wsizearg = NULL; lflag = secure = silent = 0; startup = 1; /* Set the file snapshot flag. */ F_SET(gp, G_SNAPSHOT); #ifdef DEBUG while ((ch = getopt(argc, argv, "c:D:eFlRrSsT:t:vw:")) != EOF) #else while ((ch = getopt(argc, argv, "c:eFlRrSst:vw:")) != EOF) #endif switch (ch) { case 'c': /* Run the command. */ /* * XXX * We should support multiple -c options. */ if (gp->c_option != NULL) { warnx("only one -c command may be specified."); return (1); } gp->c_option = optarg; break; #ifdef DEBUG case 'D': switch (optarg[0]) { case 's': startup = 0; break; case 'w': attach(gp); break; default: warnx("usage: -D requires s or w argument."); return (1); } break; #endif case 'e': /* Ex mode. */ LF_CLR(SC_VI); LF_SET(SC_EX); break; case 'F': /* No snapshot. */ F_CLR(gp, G_SNAPSHOT); break; case 'l': /* Set lisp, showmatch options. */ lflag = 1; break; case 'R': /* Readonly. */ readonly = 1; break; case 'r': /* Recover. */ if (flagchk == 't') { warnx("only one of -r and -t may be specified."); return (1); } flagchk = 'r'; break; case 'S': secure = 1; break; case 's': silent = 1; break; #ifdef DEBUG case 'T': /* Trace. */ if ((gp->tracefp = fopen(optarg, "w")) == NULL) { warn("%s", optarg); goto err; } (void)fprintf(gp->tracefp, "\n===\ntrace: open %s\n", optarg); break; #endif case 't': /* Tag. */ if (flagchk == 'r') { warnx("only one of -r and -t may be specified."); return (1); } if (flagchk == 't') { warnx("only one tag file may be specified."); return (1); } flagchk = 't'; tag_f = optarg; break; case 'v': /* Vi mode. */ LF_CLR(SC_EX); LF_SET(SC_VI); break; case 'w': wsizearg = optarg; break; case '?': default: (void)gp->scr_usage(); return (1); } argc -= optind; argv += optind; /* * -s option is only meaningful to ex. * * If not reading from a terminal, it's like -s was specified. */ if (silent && !LF_ISSET(SC_EX)) { warnx("-s option is only applicable to ex."); goto err; } if (LF_ISSET(SC_EX) && F_ISSET(gp, G_SCRIPTED)) silent = 1; /* * Build and initialize the first/current screen. This is a bit * tricky. If an error is returned, we may or may not have a * screen structure. If we have a screen structure, put it on a * display queue so that the error messages get displayed. * * !!! * Everything we do until we go interactive is done in ex mode. */ if (screen_init(gp, NULL, &sp)) { if (sp != NULL) TAILQ_INSERT_HEAD(gp->dq, sp, q); goto err; } F_SET(sp, SC_EX); TAILQ_INSERT_HEAD(gp->dq, sp, q); if (v_key_init(sp)) /* Special key initialization. */ goto err; { int oargs[5], *oargp = oargs; if (lflag) { /* Command-line options. */ *oargp++ = O_LISP; *oargp++ = O_SHOWMATCH; } if (readonly) *oargp++ = O_READONLY; if (secure) *oargp++ = O_SECURE; *oargp = -1; /* Options initialization. */ if (opts_init(sp, oargs)) goto err; } if (wsizearg != NULL) { ARGS *av[2], a, b; (void)snprintf(path, sizeof(path), "window=%s", wsizearg); a.bp = (CHAR_T *)path; a.len = strlen(path); b.bp = NULL; b.len = 0; av[0] = &a; av[1] = &b; (void)opts_set(sp, av, NULL); } if (silent) { /* Ex batch mode option values. */ O_CLR(sp, O_AUTOPRINT); O_CLR(sp, O_PROMPT); O_CLR(sp, O_VERBOSE); O_CLR(sp, O_WARN); F_SET(sp, SC_EX_SILENT); } sp->rows = O_VAL(sp, O_LINES); /* Make ex formatting work. */ sp->cols = O_VAL(sp, O_COLUMNS); if (!silent && startup) { /* Read EXINIT, exrc files. */ if (ex_exrc(sp)) goto err; if (F_ISSET(sp, SC_EXIT | SC_EXIT_FORCE)) { if (screen_end(sp)) goto err; goto done; } } /* * List recovery files if -r specified without file arguments. * Note, options must be initialized and startup information * read before doing this. */ if (flagchk == 'r' && argv[0] == NULL) { if (rcv_list(sp)) goto err; if (screen_end(sp)) goto err; goto done; } /* * !!! * Initialize the default ^D, ^U scrolling value here, after the * user has had every opportunity to set the window option. * * It's historic practice that changing the value of the window * option did not alter the default scrolling value, only giving * a count to ^D/^U did that. */ sp->defscroll = (O_VAL(sp, O_WINDOW) + 1) / 2; /* * If we don't have a command-line option, switch into the right * editor now, so that we position default files correctly, and * so that any tags file file-already-locked messages are in the * vi screen, not the ex screen. * * XXX * If we have a command-line option, the error message can end * up in the wrong place, but I think that the combination is * unlikely. */ if (gp->c_option == NULL) { F_CLR(sp, SC_EX | SC_VI); F_SET(sp, LF_ISSET(SC_EX | SC_VI)); } /* Open a tag file if specified. */ if (tag_f != NULL) { CHAR2INT(sp, tag_f, strlen(tag_f) + 1, w, wlen); if (ex_tag_first(sp, w)) goto err; } /* * Append any remaining arguments as file names. Files are recovery * files if -r specified. If the tag option or ex startup commands * loaded a file, then any file arguments are going to come after it. */ if (*argv != NULL) { if (sp->frp != NULL) { /* Cheat -- we know we have an extra argv slot. */ *--argv = strdup(sp->frp->name); if (*argv == NULL) { warn(NULL); goto err; } } sp->argv = sp->cargv = argv; F_SET(sp, SC_ARGNOFREE); if (flagchk == 'r') F_SET(sp, SC_ARGRECOVER); } /* * If the ex startup commands and or/the tag option haven't already * created a file, create one. If no command-line files were given, * use a temporary file. */ if (sp->frp == NULL) { if (sp->argv == NULL) { if ((frp = file_add(sp, NULL)) == NULL) goto err; } else { if ((frp = file_add(sp, sp->argv[0])) == NULL) goto err; if (F_ISSET(sp, SC_ARGRECOVER)) F_SET(frp, FR_RECOVER); } if (file_init(sp, frp, NULL, 0)) goto err; if (EXCMD_RUNNING(gp)) { (void)ex_cmd(sp); if (F_ISSET(sp, SC_EXIT | SC_EXIT_FORCE)) { if (screen_end(sp)) goto err; goto done; } } } /* * Check to see if we need to wait for ex. If SC_SCR_EX is set, ex * was forced to initialize the screen during startup. We'd like to * wait for a single character from the user, but we can't because * we're not in raw mode. We can't switch to raw mode because the * vi initialization will switch to xterm's alternate screen, causing * us to lose the messages we're pausing to make sure the user read. * So, wait for a complete line. */ if (F_ISSET(sp, SC_SCR_EX)) { p = msg_cmsg(sp, CMSG_CONT_R, &len); (void)write(STDOUT_FILENO, p, len); for (;;) { if (v_event_get(sp, &ev, 0, 0)) goto err; if (ev.e_event == E_INTERRUPT || (ev.e_event == E_CHARACTER && (ev.e_value == K_CR || ev.e_value == K_NL))) break; (void)gp->scr_bell(sp); } } /* Switch into the right editor, regardless. */ F_CLR(sp, SC_EX | SC_VI); F_SET(sp, LF_ISSET(SC_EX | SC_VI) | SC_STATUS_CNT); /* * Main edit loop. Vi handles split screens itself, we only return * here when switching editor modes or restarting the screen. */ while (sp != NULL) if (F_ISSET(sp, SC_EX) ? ex(&sp) : vi(&sp)) goto err; done: rval = 0; if (0) err: rval = 1; /* Clean out the global structure. */ v_end(gp); return (rval); }
/* * ex_txt -- * Get lines from the terminal for ex. * * PUBLIC: int ex_txt(SCR *, TEXTH *, ARG_CHAR_T, u_int32_t); */ int ex_txt(SCR *sp, TEXTH *tiqh, ARG_CHAR_T prompt, u_int32_t flags) { EVENT ev; GS *gp; TEXT ait, *ntp, *tp; carat_t carat_st; size_t cnt; int rval; int nochange; rval = 0; /* * Get a TEXT structure with some initial buffer space, reusing the * last one if it's big enough. (All TEXT bookkeeping fields default * to 0 -- text_init() handles this.) */ if (!TAILQ_EMPTY(tiqh)) { tp = TAILQ_FIRST(tiqh); if (TAILQ_NEXT(tp, q) != NULL || tp->lb_len < 32) { text_lfree(tiqh); goto newtp; } tp->len = 0; } else { newtp: if ((tp = text_init(sp, NULL, 0, 32)) == NULL) goto err; TAILQ_INSERT_HEAD(tiqh, tp, q); } /* Set the starting line number. */ tp->lno = sp->lno + 1; /* * If it's a terminal, set up autoindent, put out the prompt, and * set it up so we know we were suspended. Otherwise, turn off * the autoindent flag, as that requires less special casing below. * * XXX * Historic practice is that ^Z suspended command mode (but, because * it ran in cooked mode, it was unaffected by the autowrite option.) * On restart, any "current" input was discarded, whether in insert * mode or not, and ex was in command mode. This code matches historic * practice, but not 'cause it's easier. */ gp = sp->gp; if (F_ISSET(gp, G_SCRIPTED)) LF_CLR(TXT_AUTOINDENT); else { if (LF_ISSET(TXT_AUTOINDENT)) { LF_SET(TXT_EOFCHAR); if (v_txt_auto(sp, sp->lno, NULL, 0, tp)) goto err; } txt_prompt(sp, tp, prompt, flags); } for (carat_st = C_NOTSET, nochange = 0;;) { if (v_event_get(sp, &ev, 0, 0)) goto err; /* Deal with all non-character events. */ switch (ev.e_event) { case E_CHARACTER: break; case E_ERR: goto err; case E_REPAINT: case E_WRESIZE: continue; case E_EOF: rval = 1; /* FALLTHROUGH */ case E_INTERRUPT: /* * Handle EOF/SIGINT events by discarding partially * entered text and returning. EOF returns failure, * E_INTERRUPT returns success. */ goto notlast; default: v_event_err(sp, &ev); goto notlast; } /* * Deal with character events. * * Check to see if the character fits into the input buffer. * (Use tp->len, ignore overwrite and non-printable chars.) */ BINC_GOTOW(sp, tp->lb, tp->lb_len, tp->len + 1); switch (ev.e_value) { case K_CR: /* * !!! * Historically, <carriage-return>'s in the command * weren't special, so the ex parser would return an * unknown command error message. However, if they * terminated the command if they were in a map. I'm * pretty sure this still isn't right, but it handles * what I've seen so far. */ if (!F_ISSET(&ev.e_ch, CH_MAPPED)) goto ins_ch; /* FALLTHROUGH */ case K_NL: /* * '\' can escape <carriage-return>/<newline>. We * don't discard the backslash because we need it * to get the <newline> through the ex parser. */ if (LF_ISSET(TXT_BACKSLASH) && tp->len != 0 && tp->lb[tp->len - 1] == '\\') goto ins_ch; /* * CR returns from the ex command line. * * XXX * Terminate with a nul, needed by filter. */ if (LF_ISSET(TXT_CR)) { tp->lb[tp->len] = '\0'; goto done; } /* * '.' may terminate text input mode; free the current * TEXT. */ if (LF_ISSET(TXT_DOTTERM) && tp->len == tp->ai + 1 && tp->lb[tp->len - 1] == '.') { notlast: TAILQ_REMOVE(tiqh, tp, q); text_free(tp); goto done; } /* Set up bookkeeping for the new line. */ if ((ntp = text_init(sp, NULL, 0, 32)) == NULL) goto err; ntp->lno = tp->lno + 1; /* * Reset the autoindent line value. 0^D keeps the ai * line from changing, ^D changes the level, even if * there were no characters in the old line. Note, if * using the current tp structure, use the cursor as * the length, the autoindent characters may have been * erased. */ if (LF_ISSET(TXT_AUTOINDENT)) { if (nochange) { nochange = 0; if (v_txt_auto(sp, OOBLNO, &ait, ait.ai, ntp)) goto err; free(ait.lb); } else if (v_txt_auto(sp, OOBLNO, tp, tp->len, ntp)) goto err; carat_st = C_NOTSET; } txt_prompt(sp, ntp, prompt, flags); /* * Swap old and new TEXT's, and insert the new TEXT * into the queue. */ tp = ntp; TAILQ_INSERT_TAIL(tiqh, tp, q); break; case K_CARAT: /* Delete autoindent chars. */ if (tp->len <= tp->ai && LF_ISSET(TXT_AUTOINDENT)) carat_st = C_CARATSET; goto ins_ch; case K_ZERO: /* Delete autoindent chars. */ if (tp->len <= tp->ai && LF_ISSET(TXT_AUTOINDENT)) carat_st = C_ZEROSET; goto ins_ch; case K_CNTRLD: /* Delete autoindent char. */ /* * !!! * Historically, the ^D command took (but then ignored) * a count. For simplicity, we don't return it unless * it's the first character entered. The check for len * equal to 0 is okay, TXT_AUTOINDENT won't be set. */ if (LF_ISSET(TXT_CNTRLD)) { for (cnt = 0; cnt < tp->len; ++cnt) if (!isblank(tp->lb[cnt])) break; if (cnt == tp->len) { tp->len = 1; tp->lb[0] = ev.e_c; tp->lb[1] = '\0'; /* * Put out a line separator, in case * the command fails. */ (void)putchar('\n'); goto done; } } /* * POSIX 1003.1b-1993, paragraph 7.1.1.9, states that * the EOF characters are discarded if there are other * characters to process in the line, i.e. if the EOF * is not the first character in the line. For this * reason, historic ex discarded the EOF characters, * even if occurring in the middle of the input line. * We match that historic practice. * * !!! * The test for discarding in the middle of the line is * done in the switch, because the CARAT forms are N+1, * not N. * * !!! * There's considerable magic to make the terminal code * return the EOF character at all. See that code for * details. */ if (!LF_ISSET(TXT_AUTOINDENT) || tp->len == 0) continue; switch (carat_st) { case C_CARATSET: /* ^^D */ if (tp->len > tp->ai + 1) continue; /* Save the ai string for later. */ ait.lb = NULL; ait.lb_len = 0; BINC_GOTOW(sp, ait.lb, ait.lb_len, tp->ai); MEMCPY(ait.lb, tp->lb, tp->ai); ait.ai = ait.len = tp->ai; carat_st = C_NOTSET; nochange = 1; goto leftmargin; case C_ZEROSET: /* 0^D */ if (tp->len > tp->ai + 1) continue; carat_st = C_NOTSET; leftmargin: (void)gp->scr_ex_adjust(sp, EX_TERM_CE); tp->ai = tp->len = 0; break; case C_NOTSET: /* ^D */ if (tp->len > tp->ai) continue; if (txt_dent(sp, tp)) goto err; break; default: abort(); } /* Clear and redisplay the line. */ (void)gp->scr_ex_adjust(sp, EX_TERM_CE); txt_prompt(sp, tp, prompt, flags); break; default: /* * See the TXT_BEAUTIFY comment in vi/v_txt_ev.c. * * Silently eliminate any iscntrl() character that was * not already handled specially, except for <tab> and * <ff>. */ ins_ch: if (LF_ISSET(TXT_BEAUTIFY) && ISCNTRL(ev.e_c) && ev.e_value != K_FORMFEED && ev.e_value != K_TAB) break; tp->lb[tp->len++] = ev.e_c; break; } } /* NOTREACHED */ done: return (rval); err: alloc_err: return (1); }
/* * __tree_walk_internal -- * Move to the next/previous page in the tree. */ static inline int __tree_walk_internal(WT_SESSION_IMPL *session, WT_REF **refp, uint64_t *walkcntp, int (*skip_func)(WT_SESSION_IMPL *, WT_REF *, void *, bool *), void *func_cookie, uint32_t flags) { WT_BTREE *btree; WT_DECL_RET; WT_PAGE_INDEX *pindex; WT_REF *couple, *couple_orig, *ref; uint32_t slot; bool empty_internal, initial_descent, prev, skip; btree = S2BT(session); pindex = NULL; empty_internal = initial_descent = false; /* * Tree walks are special: they look inside page structures that splits * may want to free. Publish that the tree is active during this * window. */ WT_ENTER_PAGE_INDEX(session); /* Walk should never instantiate deleted pages. */ LF_SET(WT_READ_NO_EMPTY); /* * !!! * Fast-truncate currently only works on row-store trees. */ if (btree->type != BTREE_ROW) LF_CLR(WT_READ_TRUNCATE); prev = LF_ISSET(WT_READ_PREV) ? 1 : 0; /* * There are multiple reasons and approaches to walking the in-memory * tree: * * (1) finding pages to evict (the eviction server); * (2) writing just dirty leaves or internal nodes (checkpoint); * (3) discarding pages (close); * (4) truncating pages in a range (fast truncate); * (5) skipping pages based on outside information (compaction); * (6) cursor scans (applications). * * Except for cursor scans and compaction, the walk is limited to the * cache, no pages are read. In all cases, hazard pointers protect the * walked pages from eviction. * * Walks use hazard-pointer coupling through the tree and that's OK * (hazard pointers can't deadlock, so there's none of the usual * problems found when logically locking up a btree). If the eviction * thread tries to evict the active page, it fails because of our * hazard pointer. If eviction tries to evict our parent, that fails * because the parent has a child page that can't be discarded. We do * play one game: don't couple up to our parent and then back down to a * new leaf, couple to the next page to which we're descending, it * saves a hazard-pointer swap for each cursor page movement. * * !!! * NOTE: we depend on the fact it's OK to release a page we don't hold, * that is, it's OK to release couple when couple is set to NULL. * * Take a copy of any held page and clear the return value. Remember * the hazard pointer we're currently holding. * * Clear the returned value, it makes future error handling easier. */ couple = couple_orig = ref = *refp; *refp = NULL; /* If no page is active, begin a walk from the start/end of the tree. */ if (ref == NULL) { restart: /* * We can be here with a NULL or root WT_REF; the page release * function handles them internally, don't complicate this code * by calling them out. */ WT_ERR(__wt_page_release(session, couple, flags)); /* * We're not supposed to walk trees without root pages. As this * has not always been the case, assert to debug that change. */ WT_ASSERT(session, btree->root.page != NULL); couple = couple_orig = ref = &btree->root; initial_descent = true; goto descend; } /* * If the active page was the root, we've reached the walk's end; we * only get here if we've returned the root to our caller, so we're * holding no hazard pointers. */ if (__wt_ref_is_root(ref)) goto done; /* Figure out the current slot in the WT_REF array. */ __ref_index_slot(session, ref, &pindex, &slot); for (;;) { /* * If we're at the last/first slot on the internal page, return * it in post-order traversal. Otherwise move to the next/prev * slot and left/right-most element in that subtree. */ while ((prev && slot == 0) || (!prev && slot == pindex->entries - 1)) { /* Ascend to the parent. */ __ref_ascend(session, &ref, &pindex, &slot); /* * If at the root and returning internal pages, return * the root page, otherwise we're done. Regardless, no * hazard pointer is required, release the one we hold. */ if (__wt_ref_is_root(ref)) { WT_ERR(__wt_page_release( session, couple, flags)); if (!LF_ISSET(WT_READ_SKIP_INTL)) *refp = ref; goto done; } /* * If we got all the way through an internal page and * all of the child pages were deleted, mark it for * eviction. */ if (empty_internal && pindex->entries > 1) { __wt_page_evict_soon(session, ref); empty_internal = false; } /* * Optionally return internal pages. Swap our previous * hazard pointer for the page we'll return. We don't * handle restart or not-found returns, it would require * additional complexity and is not a possible return: * we're moving to the parent of the current child page, * the parent can't have been evicted. */ if (!LF_ISSET(WT_READ_SKIP_INTL)) { WT_ERR(__wt_page_swap( session, couple, ref, flags)); *refp = ref; goto done; } } if (prev) --slot; else ++slot; if (walkcntp != NULL) ++*walkcntp; for (;;) { /* * Move to the next slot, and set the reference hint if * it's wrong (used when we continue the walk). We don't * always update the hints when splitting, it's expected * for them to be incorrect in some workloads. */ ref = pindex->index[slot]; if (ref->pindex_hint != slot) ref->pindex_hint = slot; /* * If we see any child states other than deleted, the * page isn't empty. */ if (ref->state != WT_REF_DELETED && !LF_ISSET(WT_READ_TRUNCATE)) empty_internal = false; if (LF_ISSET(WT_READ_CACHE)) { /* * Only look at unlocked pages in memory: * fast-path some common cases. */ if (LF_ISSET(WT_READ_NO_WAIT) && ref->state != WT_REF_MEM) break; /* Skip lookaside pages if not requested. */ if (ref->state == WT_REF_LOOKASIDE && !LF_ISSET(WT_READ_LOOKASIDE)) break; } else if (LF_ISSET(WT_READ_TRUNCATE)) { /* * Avoid pulling a deleted page back in to try * to delete it again. */ if (ref->state == WT_REF_DELETED && __wt_delete_page_skip(session, ref, false)) break; /* * If deleting a range, try to delete the page * without instantiating it. */ WT_ERR(__wt_delete_page(session, ref, &skip)); if (skip) break; empty_internal = false; } else if (skip_func != NULL) { WT_ERR(skip_func(session, ref, func_cookie, &skip)); if (skip) break; } else { /* * Try to skip deleted pages visible to us. */ if (ref->state == WT_REF_DELETED && __wt_delete_page_skip(session, ref, false)) break; } ret = __wt_page_swap(session, couple, ref, WT_READ_NOTFOUND_OK | WT_READ_RESTART_OK | flags); /* * Not-found is an expected return when only walking * in-cache pages, or if we see a deleted page. */ if (ret == WT_NOTFOUND) { ret = 0; break; } /* * The page we're moving to might have split, in which * case move to the last position we held. */ if (ret == WT_RESTART) { ret = 0; /* * If a cursor is setting up at the end of the * tree, we can't use our parent page's index, * because it may have already split; restart * the walk. */ if (prev && initial_descent) goto restart; /* * If a new walk that never coupled from the * root to a new saved position in the tree, * restart the walk. */ if (couple == &btree->root) goto restart; /* * If restarting from some original position, * repeat the increment or decrement we made at * that time. Otherwise, couple is an internal * page we've acquired after moving from that * starting position and we can treat it as a * new page. This works because we never acquire * a hazard pointer on a leaf page we're not * going to return to our caller, this will quit * working if that ever changes. */ WT_ASSERT(session, couple == couple_orig || WT_PAGE_IS_INTERNAL(couple->page)); ref = couple; __ref_index_slot(session, ref, &pindex, &slot); if (couple == couple_orig) break; } WT_ERR(ret); couple = ref; /* * A new page: configure for traversal of any internal * page's children, else return the leaf page. */ if (WT_PAGE_IS_INTERNAL(ref->page)) { descend: empty_internal = true; /* * There's a split race when a cursor is setting * up at the end of the tree or moving backwards * through the tree and descending a level. When * splitting an internal page into its parent, * we move the WT_REF structures and update the * parent's page index before updating the split * page's page index, and it's not an atomic * update. A thread can read the parent page's * replacement page index, then read the split * page's original index, or the parent page's * original and the split page's replacement. * * This isn't a problem for a cursor setting up * at the start of the tree or moving forwards * through the tree because we do right-hand * splits on internal pages and the initial part * of the split page's namespace won't change as * part of a split. A thread reading the parent * page's and split page's indexes will move to * the same slot no matter what order of indexes * are read. * * Handle a cursor setting up at the end of the * tree or moving backwards through the tree. */ if (!prev) { WT_INTL_INDEX_GET( session, ref->page, pindex); slot = 0; } else if (initial_descent) { if (!__ref_initial_descent_prev( session, ref, &pindex)) goto restart; slot = pindex->entries - 1; } else { __ref_descend_prev( session, ref, &pindex); slot = pindex->entries - 1; } continue; } /* * The tree-walk restart code knows we return any leaf * page we acquire (never hazard-pointer coupling on * after acquiring a leaf page), and asserts no restart * happens while holding a leaf page. This page must be * returned to our caller. */ *refp = ref; goto done; } } done: err: WT_LEAVE_PAGE_INDEX(session); return (ret); }
/* * __wt_tree_walk -- * Move to the next/previous page in the tree. */ int __wt_tree_walk(WT_SESSION_IMPL *session, WT_REF **refp, uint64_t *walkcntp, uint32_t flags) { WT_BTREE *btree; WT_DECL_RET; WT_PAGE *page; WT_PAGE_INDEX *pindex; WT_REF *couple, *couple_orig, *ref; int prev, skip; uint32_t slot; btree = S2BT(session); /* * Tree walks are special: they look inside page structures that splits * may want to free. Publish that the tree is active during this * window. */ WT_ENTER_PAGE_INDEX(session); /* * !!! * Fast-truncate currently only works on row-store trees. */ if (btree->type != BTREE_ROW) LF_CLR(WT_READ_TRUNCATE); prev = LF_ISSET(WT_READ_PREV) ? 1 : 0; /* * There are multiple reasons and approaches to walking the in-memory * tree: * * (1) finding pages to evict (the eviction server); * (2) writing just dirty leaves or internal nodes (checkpoint); * (3) discarding pages (close); * (4) truncating pages in a range (fast truncate); * (5) skipping pages based on outside information (compaction); * (6) cursor scans (applications). * * Except for cursor scans and compaction, the walk is limited to the * cache, no pages are read. In all cases, hazard pointers protect the * walked pages from eviction. * * Walks use hazard-pointer coupling through the tree and that's OK * (hazard pointers can't deadlock, so there's none of the usual * problems found when logically locking up a btree). If the eviction * thread tries to evict the active page, it fails because of our * hazard pointer. If eviction tries to evict our parent, that fails * because the parent has a child page that can't be discarded. We do * play one game: don't couple up to our parent and then back down to a * new leaf, couple to the next page to which we're descending, it * saves a hazard-pointer swap for each cursor page movement. * * !!! * NOTE: we depend on the fact it's OK to release a page we don't hold, * that is, it's OK to release couple when couple is set to NULL. * * Take a copy of any held page and clear the return value. Remember * the hazard pointer we're currently holding. * * We may be passed a pointer to btree->evict_page that we are clearing * here. We check when discarding pages that we're not discarding that * page, so this clear must be done before the page is released. */ couple = couple_orig = ref = *refp; *refp = NULL; /* If no page is active, begin a walk from the start of the tree. */ if (ref == NULL) { ref = &btree->root; if (ref->page == NULL) goto done; goto descend; } ascend: /* * If the active page was the root, we've reached the walk's end. * Release any hazard-pointer we're holding. */ if (__wt_ref_is_root(ref)) { WT_ERR(__wt_page_release(session, couple, flags)); goto done; } /* Figure out the current slot in the WT_REF array. */ __wt_page_refp(session, ref, &pindex, &slot); for (;;) { /* * If we're at the last/first slot on the page, return this page * in post-order traversal. Otherwise we move to the next/prev * slot and left/right-most element in its subtree. */ if ((prev && slot == 0) || (!prev && slot == pindex->entries - 1)) { ref = ref->home->pg_intl_parent_ref; /* Optionally skip internal pages. */ if (LF_ISSET(WT_READ_SKIP_INTL)) goto ascend; /* * We've ascended the tree and are returning an internal * page. If it's the root, discard our hazard pointer, * otherwise, swap our hazard pointer for the page we'll * return. */ if (__wt_ref_is_root(ref)) WT_ERR(__wt_page_release( session, couple, flags)); else { /* * Locate the reference to our parent page then * swap our child hazard pointer for the parent. * We don't handle restart or not-found returns. * It would require additional complexity and is * not a possible return: we're moving to the * parent of the current child page, our parent * reference can't have split or been evicted. */ __wt_page_refp(session, ref, &pindex, &slot); if ((ret = __wt_page_swap( session, couple, ref, flags)) != 0) { WT_TRET(__wt_page_release( session, couple, flags)); WT_ERR(ret); } } *refp = ref; goto done; } if (prev) --slot; else ++slot; if (walkcntp != NULL) ++*walkcntp; for (;;) { ref = pindex->index[slot]; if (LF_ISSET(WT_READ_CACHE)) { /* * Only look at unlocked pages in memory: * fast-path some common cases. */ if (LF_ISSET(WT_READ_NO_WAIT) && ref->state != WT_REF_MEM) break; } else if (LF_ISSET(WT_READ_TRUNCATE)) { /* * Avoid pulling a deleted page back in to try * to delete it again. */ if (ref->state == WT_REF_DELETED && __wt_delete_page_skip(session, ref)) break; /* * If deleting a range, try to delete the page * without instantiating it. */ WT_ERR(__wt_delete_page(session, ref, &skip)); if (skip) break; } else if (LF_ISSET(WT_READ_COMPACT)) { /* * Skip deleted pages, rewriting them doesn't * seem useful. */ if (ref->state == WT_REF_DELETED) break; /* * If the page is in-memory, we want to look at * it (it may have been modified and written, * and the current location is the interesting * one in terms of compaction, not the original * location). If the page isn't in-memory, test * if the page will help with compaction, don't * read it if we don't have to. */ if (ref->state == WT_REF_DISK) { WT_ERR(__wt_compact_page_skip( session, ref, &skip)); if (skip) break; } } else { /* * Try to skip deleted pages visible to us. */ if (ref->state == WT_REF_DELETED && __wt_delete_page_skip(session, ref)) break; } ret = __wt_page_swap(session, couple, ref, flags); /* * Not-found is an expected return when only walking * in-cache pages. */ if (ret == WT_NOTFOUND) { ret = 0; break; } /* * The page we're moving to might have split, in which * case move to the last position we held. */ if (ret == WT_RESTART) { ret = 0; /* * If a new walk that never coupled from the * root to a new saved position in the tree, * restart the walk. */ if (couple == &btree->root) { ref = &btree->root; if (ref->page == NULL) goto done; goto descend; } /* * If restarting from some original position, * repeat the increment or decrement we made at * that time. Otherwise, couple is an internal * page we've acquired after moving from that * starting position and we can treat it as a * new page. This works because we never acquire * a hazard pointer on a leaf page we're not * going to return to our caller, this will quit * working if that ever changes. */ WT_ASSERT(session, couple == couple_orig || WT_PAGE_IS_INTERNAL(couple->page)); ref = couple; __wt_page_refp(session, ref, &pindex, &slot); if (couple == couple_orig) break; } WT_ERR(ret); /* * A new page: configure for traversal of any internal * page's children, else return the leaf page. */ descend: couple = ref; page = ref->page; if (page->type == WT_PAGE_ROW_INT || page->type == WT_PAGE_COL_INT) { WT_INTL_INDEX_GET(session, page, pindex); slot = prev ? pindex->entries - 1 : 0; } else { *refp = ref; goto done; } } } done: err: WT_LEAVE_PAGE_INDEX(session); return (ret); }
/* * editor -- * Main editor routine. * * PUBLIC: int editor __P((WIN *, int, char *[])); */ int editor(WIN *wp, int argc, char **argv) { extern int optind; extern char *optarg; const char *p; EVENT ev; FREF *frp; SCR *sp; GS *gp; size_t len; u_int flags; int ch, flagchk, lflag, secure, startup, readonly, rval, silent; char *tag_f, *wsizearg, path[256]; CHAR_T *w; size_t wlen; gp = wp->gp; /* Initialize the busy routine, if not defined by the screen. */ if (gp->scr_busy == NULL) gp->scr_busy = vs_busy; /* Initialize the message routine, if not defined by the screen. */ if (wp->scr_msg == NULL) wp->scr_msg = vs_msg; /* Set initial screen type and mode based on the program name. */ readonly = 0; if (!strcmp(gp->progname, "ex") || !strcmp(gp->progname, "nex")) LF_INIT(SC_EX); else { /* Nview, view are readonly. */ if (!strcmp(gp->progname, "nview") || !strcmp(gp->progname, "view")) readonly = 1; /* Vi is the default. */ LF_INIT(SC_VI); } /* Convert old-style arguments into new-style ones. */ if (v_obsolete(gp->progname, argv)) return (1); /* Parse the arguments. */ flagchk = '\0'; tag_f = wsizearg = NULL; lflag = secure = silent = 0; startup = 1; /* Set the file snapshot flag. */ F_SET(gp, G_SNAPSHOT); #ifdef DEBUG while ((ch = getopt(argc, argv, "c:D:eFlRrSsT:t:vw:")) != EOF) #else while ((ch = getopt(argc, argv, "c:eFlRrSst:vw:")) != EOF) #endif switch (ch) { case 'c': /* Run the command. */ /* * XXX * We should support multiple -c options. */ if (gp->c_option != NULL) { v_estr(gp->progname, 0, "only one -c command may be specified."); return (1); } gp->c_option = optarg; break; #ifdef DEBUG case 'D': switch (optarg[0]) { case 's': startup = 0; break; case 'w': attach(gp); break; default: v_estr(gp->progname, 0, "usage: -D requires s or w argument."); return (1); } break; #endif case 'e': /* Ex mode. */ LF_CLR(SC_VI); LF_SET(SC_EX); break; case 'F': /* No snapshot. */ v_estr(gp->progname, 0, "-F option no longer supported."); break; case 'l': /* Set lisp, showmatch options. */ lflag = 1; break; case 'R': /* Readonly. */ readonly = 1; break; case 'r': /* Recover. */ if (flagchk == 't') { v_estr(gp->progname, 0, "only one of -r and -t may be specified."); return (1); } flagchk = 'r'; break; case 'S': secure = 1; break; case 's': silent = 1; break; #ifdef TRACE case 'T': /* Trace. */ (void)vtrace_init(optarg); break; #endif case 't': /* Tag. */ if (flagchk == 'r') { v_estr(gp->progname, 0, "only one of -r and -t may be specified."); return (1); } if (flagchk == 't') { v_estr(gp->progname, 0, "only one tag file may be specified."); return (1); } flagchk = 't'; tag_f = optarg; break; case 'v': /* Vi mode. */ LF_CLR(SC_EX); LF_SET(SC_VI); break; case 'w': wsizearg = optarg; break; case '?': default: (void)gp->scr_usage(); return (1); } argc -= optind; argv += optind; /* * -s option is only meaningful to ex. * * If not reading from a terminal, it's like -s was specified. */ if (silent && !LF_ISSET(SC_EX)) { v_estr(gp->progname, 0, "-s option is only applicable to ex."); goto err; } if (LF_ISSET(SC_EX) && F_ISSET(gp, G_SCRIPTED)) silent = 1; /* * Build and initialize the first/current screen. This is a bit * tricky. If an error is returned, we may or may not have a * screen structure. If we have a screen structure, put it on a * display queue so that the error messages get displayed. * * !!! * Everything we do until we go interactive is done in ex mode. */ if (screen_init(gp, NULL, &sp)) { if (sp != NULL) { CIRCLEQ_INSERT_HEAD(&wp->scrq, sp, q); sp->wp = wp; } goto err; } F_SET(sp, SC_EX); CIRCLEQ_INSERT_HEAD(&wp->scrq, sp, q); sp->wp = wp; if (v_key_init(sp)) /* Special key initialization. */ goto err; { int oargs[5], *oargp = oargs; if (lflag) { /* Command-line options. */ *oargp++ = O_LISP; *oargp++ = O_SHOWMATCH; } if (readonly) *oargp++ = O_READONLY; if (secure) *oargp++ = O_SECURE; *oargp = -1; /* Options initialization. */ if (opts_init(sp, oargs)) goto err; } if (wsizearg != NULL) { ARGS *av[2], a, b; (void)snprintf(path, sizeof(path), "window=%s", wsizearg); a.bp = (CHAR_T *)path; a.len = strlen(path); b.bp = NULL; b.len = 0; av[0] = &a; av[1] = &b; (void)opts_set(sp, av, NULL); } if (silent) { /* Ex batch mode option values. */ O_CLR(sp, O_AUTOPRINT); O_CLR(sp, O_PROMPT); O_CLR(sp, O_VERBOSE); O_CLR(sp, O_WARN); F_SET(sp, SC_EX_SILENT); } sp->rows = O_VAL(sp, O_LINES); /* Make ex formatting work. */ sp->cols = O_VAL(sp, O_COLUMNS); if (!silent && startup) { /* Read EXINIT, exrc files. */ if (ex_exrc(sp)) goto err; if (F_ISSET(sp, SC_EXIT | SC_EXIT_FORCE)) { if (screen_end(sp)) goto err; goto done; } } /* * List recovery files if -r specified without file arguments. * Note, options must be initialized and startup information * read before doing this. */ if (flagchk == 'r' && argv[0] == NULL) { if (rcv_list(sp)) goto err; if (screen_end(sp)) goto err; goto done; } /* * !!! * Initialize the default ^D, ^U scrolling value here, after the * user has had every opportunity to set the window option. * * It's historic practice that changing the value of the window * option did not alter the default scrolling value, only giving * a count to ^D/^U did that. */ sp->defscroll = (O_VAL(sp, O_WINDOW) + 1) / 2; /* * If we don't have a command-line option, switch into the right * editor now, so that we position default files correctly, and * so that any tags file file-already-locked messages are in the * vi screen, not the ex screen. * * XXX * If we have a command-line option, the error message can end * up in the wrong place, but I think that the combination is * unlikely. */ if (gp->c_option == NULL) { F_CLR(sp, SC_EX | SC_VI); F_SET(sp, LF_ISSET(SC_EX | SC_VI)); } /* Open a tag file if specified. */ if (tag_f != NULL) { CHAR2INT(sp, tag_f, strlen(tag_f) + 1, w, wlen); if (ex_tag_first(sp, w)) goto err; } /* * Append any remaining arguments as file names. Files are recovery * files if -r specified. If the tag option or ex startup commands * loaded a file, then any file arguments are going to come after it. */ if (*argv != NULL) { if (sp->frp != NULL) { /* Cheat -- we know we have an extra argv slot. */ MALLOC_NOMSG(sp, *--argv, char *, strlen(sp->frp->name) + 1); if (*argv == NULL) { v_estr(gp->progname, errno, NULL); goto err; } (void)strcpy(*argv, sp->frp->name); } sp->argv = sp->cargv = argv; F_SET(sp, SC_ARGNOFREE); if (flagchk == 'r') F_SET(sp, SC_ARGRECOVER); }
/* * v_motion -- * * Get resulting motion mark. */ static int v_motion( SCR *sp, VICMD *dm, VICMD *vp, int *mappedp) { VICMD motion; size_t len; u_long cnt; u_int flags; int tilde_reset, notused; /* * If '.' command, use the dot motion, else get the motion command. * Clear any line motion flags, the subsequent motion isn't always * the same, i.e. "/aaa" may or may not be a line motion. */ if (F_ISSET(vp, VC_ISDOT)) { motion = *dm; F_SET(&motion, VC_ISDOT); F_CLR(&motion, VM_COMMASK); } else { memset(&motion, 0, sizeof(VICMD)); if (v_cmd(sp, NULL, &motion, vp, ¬used, mappedp) != GC_OK) return (1); } /* * A count may be provided both to the command and to the motion, in * which case the count is multiplicative. For example, "3y4y" is the * same as "12yy". This count is provided to the motion command and * not to the regular function. */ cnt = motion.count = F_ISSET(&motion, VC_C1SET) ? motion.count : 1; if (F_ISSET(vp, VC_C1SET)) { motion.count *= vp->count; F_SET(&motion, VC_C1SET); /* * Set flags to restore the original values of the command * structure so dot commands can change the count values, * e.g. "2dw" "3." deletes a total of five words. */ F_CLR(vp, VC_C1SET); F_SET(vp, VC_C1RESET); } /* * Some commands can be repeated to indicate the current line. In * this case, or if the command is a "line command", set the flags * appropriately. If not a doubled command, run the function to get * the resulting mark. */ if (vp->key == motion.key) { F_SET(vp, VM_LDOUBLE | VM_LMODE); /* Set the origin of the command. */ vp->m_start.lno = sp->lno; vp->m_start.cno = 0; /* * Set the end of the command. * * If the current line is missing, i.e. the file is empty, * historic vi permitted a "cc" or "!!" command to insert * text. */ vp->m_stop.lno = sp->lno + motion.count - 1; if (db_get(sp, vp->m_stop.lno, 0, NULL, &len)) { if (vp->m_stop.lno != 1 || (vp->key != 'c' && vp->key != '!')) { v_emsg(sp, NULL, VIM_EMPTY); return (1); } vp->m_stop.cno = 0; } else vp->m_stop.cno = len ? len - 1 : 0; } else { /* * Motion commands change the underlying movement (*snarl*). * For example, "l" is illegal at the end of a line, but "dl" * is not. Set flags so the function knows the situation. */ motion.rkp = vp->kp; /* * XXX * Use yank instead of creating a new motion command, it's a * lot easier for now. */ if (vp->kp == &tmotion) { tilde_reset = 1; vp->kp = &vikeys['y']; } else tilde_reset = 0; /* * Copy the key flags into the local structure, except for the * RCM flags -- the motion command will set the RCM flags in * the vp structure if necessary. This means that the motion * command is expected to determine where the cursor ends up! * However, we save off the current RCM mask and restore it if * it no RCM flags are set by the motion command, with a small * modification. * * We replace the VM_RCM_SET flag with the VM_RCM flag. This * is so that cursor movement doesn't set the relative position * unless the motion command explicitly specified it. This * appears to match historic practice, but I've never been able * to develop a hard-and-fast rule. */ flags = F_ISSET(vp, VM_RCM_MASK); if (LF_ISSET(VM_RCM_SET)) { LF_SET(VM_RCM); LF_CLR(VM_RCM_SET); } F_CLR(vp, VM_RCM_MASK); F_SET(&motion, motion.kp->flags & ~VM_RCM_MASK); /* * Set the three cursor locations to the current cursor. This * permits commands like 'j' and 'k', that are line oriented * motions and have special cursor suck semantics when they are * used as standalone commands, to ignore column positioning. */ motion.m_final.lno = motion.m_stop.lno = motion.m_start.lno = sp->lno; motion.m_final.cno = motion.m_stop.cno = motion.m_start.cno = sp->cno; /* Run the function. */ if ((motion.kp->func)(sp, &motion)) return (1); /* * If the current line is missing, i.e. the file is empty, * historic vi allowed "c<motion>" or "!<motion>" to insert * text. Otherwise fail -- most motion commands will have * already failed, but some, e.g. G, succeed in empty files. */ if (!db_exist(sp, vp->m_stop.lno)) { if (vp->m_stop.lno != 1 || (vp->key != 'c' && vp->key != '!')) { v_emsg(sp, NULL, VIM_EMPTY); return (1); } vp->m_stop.cno = 0; } /* * XXX * See above. */ if (tilde_reset) vp->kp = &tmotion; /* * Copy cut buffer, line mode and cursor position information * from the motion command structure, i.e. anything that the * motion command can set for us. The commands can flag the * movement as a line motion (see v_sentence) as well as set * the VM_RCM_* flags explicitly. */ F_SET(vp, F_ISSET(&motion, VM_COMMASK | VM_RCM_MASK)); /* * If the motion command set no relative motion flags, use * the (slightly) modified previous values. */ if (!F_ISSET(vp, VM_RCM_MASK)) F_SET(vp, flags); /* * Commands can change behaviors based on the motion command * used, for example, the ! command repeated the last bang * command if N or n was used as the motion. */ vp->rkp = motion.kp; /* * Motion commands can reset all of the cursor information. * If the motion is in the reverse direction, switch the * from and to MARK's so that it's in a forward direction. * Motions are from the from MARK to the to MARK (inclusive). */ if (motion.m_start.lno > motion.m_stop.lno || (motion.m_start.lno == motion.m_stop.lno && motion.m_start.cno > motion.m_stop.cno)) { vp->m_start = motion.m_stop; vp->m_stop = motion.m_start; } else { vp->m_start = motion.m_start; vp->m_stop = motion.m_stop; } vp->m_final = motion.m_final; } /* * If the command sets dot, save the motion structure. The motion * count was changed above and needs to be reset, that's why this * is done here, and not in the calling routine. */ if (F_ISSET(vp->kp, V_DOT)) { *dm = motion; dm->count = cnt; } return (0); }
/* * __tree_walk_internal -- * Move to the next/previous page in the tree. */ static inline int __tree_walk_internal(WT_SESSION_IMPL *session, WT_REF **refp, uint64_t *walkcntp, uint64_t *skipleafcntp, uint32_t flags) { WT_BTREE *btree; WT_DECL_RET; WT_PAGE_INDEX *pindex; WT_REF *couple, *couple_orig, *ref; bool empty_internal, prev, skip; uint32_t slot; btree = S2BT(session); empty_internal = false; /* * Tree walks are special: they look inside page structures that splits * may want to free. Publish that the tree is active during this * window. */ WT_ENTER_PAGE_INDEX(session); /* Walk should never instantiate deleted pages. */ LF_SET(WT_READ_NO_EMPTY); /* * !!! * Fast-truncate currently only works on row-store trees. */ if (btree->type != BTREE_ROW) LF_CLR(WT_READ_TRUNCATE); prev = LF_ISSET(WT_READ_PREV) ? 1 : 0; /* * There are multiple reasons and approaches to walking the in-memory * tree: * * (1) finding pages to evict (the eviction server); * (2) writing just dirty leaves or internal nodes (checkpoint); * (3) discarding pages (close); * (4) truncating pages in a range (fast truncate); * (5) skipping pages based on outside information (compaction); * (6) cursor scans (applications). * * Except for cursor scans and compaction, the walk is limited to the * cache, no pages are read. In all cases, hazard pointers protect the * walked pages from eviction. * * Walks use hazard-pointer coupling through the tree and that's OK * (hazard pointers can't deadlock, so there's none of the usual * problems found when logically locking up a btree). If the eviction * thread tries to evict the active page, it fails because of our * hazard pointer. If eviction tries to evict our parent, that fails * because the parent has a child page that can't be discarded. We do * play one game: don't couple up to our parent and then back down to a * new leaf, couple to the next page to which we're descending, it * saves a hazard-pointer swap for each cursor page movement. * * !!! * NOTE: we depend on the fact it's OK to release a page we don't hold, * that is, it's OK to release couple when couple is set to NULL. * * Take a copy of any held page and clear the return value. Remember * the hazard pointer we're currently holding. * * We may be passed a pointer to btree->evict_page that we are clearing * here. We check when discarding pages that we're not discarding that * page, so this clear must be done before the page is released. */ couple = couple_orig = ref = *refp; *refp = NULL; /* If no page is active, begin a walk from the start of the tree. */ if (ref == NULL) { ref = &btree->root; if (ref->page == NULL) goto done; goto descend; } /* * If the active page was the root, we've reached the walk's end. * Release any hazard-pointer we're holding. */ if (__wt_ref_is_root(ref)) { WT_ERR(__wt_page_release(session, couple, flags)); goto done; } /* Figure out the current slot in the WT_REF array. */ __ref_index_slot(session, ref, &pindex, &slot); for (;;) { /* * If we're at the last/first slot on the internal page, return * it in post-order traversal. Otherwise move to the next/prev * slot and left/right-most element in that subtree. */ while ((prev && slot == 0) || (!prev && slot == pindex->entries - 1)) { /* Ascend to the parent. */ __page_ascend(session, &ref, &pindex, &slot); /* * If we got all the way through an internal page and * all of the child pages were deleted, mark it for * eviction. */ if (empty_internal && pindex->entries > 1) { __wt_page_evict_soon(ref->page); empty_internal = false; } /* * If at the root and returning internal pages, return * the root page, otherwise we're done. Regardless, no * hazard pointer is required, release the one we hold. */ if (__wt_ref_is_root(ref)) { WT_ERR(__wt_page_release( session, couple, flags)); if (!LF_ISSET(WT_READ_SKIP_INTL)) *refp = ref; goto done; } /* * Optionally return internal pages. Swap our previous * hazard pointer for the page we'll return. We don't * handle restart or not-found returns, it would require * additional complexity and is not a possible return: * we're moving to the parent of the current child page, * the parent can't have been evicted. */ if (!LF_ISSET(WT_READ_SKIP_INTL)) { WT_ERR(__wt_page_swap( session, couple, ref, flags)); *refp = ref; goto done; } } if (prev) --slot; else ++slot; if (walkcntp != NULL) ++*walkcntp; for (;;) { /* * Move to the next slot, and set the reference hint if * it's wrong (used when we continue the walk). We don't * update those hints when splitting, so it's common for * them to be incorrect in some workloads. */ ref = pindex->index[slot]; if (ref->pindex_hint != slot) ref->pindex_hint = slot; /* * If we see any child states other than deleted, the * page isn't empty. */ if (ref->state != WT_REF_DELETED && !LF_ISSET(WT_READ_TRUNCATE)) empty_internal = false; if (LF_ISSET(WT_READ_CACHE)) { /* * Only look at unlocked pages in memory: * fast-path some common cases. */ if (LF_ISSET(WT_READ_NO_WAIT) && ref->state != WT_REF_MEM) break; } else if (LF_ISSET(WT_READ_TRUNCATE)) { /* * Avoid pulling a deleted page back in to try * to delete it again. */ if (ref->state == WT_REF_DELETED && __wt_delete_page_skip(session, ref, false)) break; /* * If deleting a range, try to delete the page * without instantiating it. */ WT_ERR(__wt_delete_page(session, ref, &skip)); if (skip) break; empty_internal = false; } else if (LF_ISSET(WT_READ_COMPACT)) { /* * Skip deleted pages, rewriting them doesn't * seem useful. */ if (ref->state == WT_REF_DELETED) break; /* * If the page is in-memory, we want to look at * it (it may have been modified and written, * and the current location is the interesting * one in terms of compaction, not the original * location). If the page isn't in-memory, test * if the page will help with compaction, don't * read it if we don't have to. */ if (ref->state == WT_REF_DISK) { WT_ERR(__wt_compact_page_skip( session, ref, &skip)); if (skip) break; } } else { /* * Try to skip deleted pages visible to us. */ if (ref->state == WT_REF_DELETED && __wt_delete_page_skip(session, ref, false)) break; } /* * Optionally skip leaf pages: skip all leaf pages if * WT_READ_SKIP_LEAF is set, when the skip-leaf-count * variable is non-zero, skip some count of leaf pages. * If this page is disk-based, crack the cell to figure * out it's a leaf page without reading it. * * If skipping some number of leaf pages, decrement the * count of pages to zero, and then take the next leaf * page we can. Be cautious around the page decrement, * if for some reason don't take this particular page, * we can take the next one, and, there are additional * tests/decrements when we're about to return a leaf * page. */ if (skipleafcntp != NULL || LF_ISSET(WT_READ_SKIP_LEAF)) if (__ref_is_leaf(ref)) { if (LF_ISSET(WT_READ_SKIP_LEAF)) break; if (*skipleafcntp > 0) { --*skipleafcntp; break; } } ret = __wt_page_swap(session, couple, ref, WT_READ_NOTFOUND_OK | WT_READ_RESTART_OK | flags); /* * Not-found is an expected return when only walking * in-cache pages, or if we see a deleted page. */ if (ret == WT_NOTFOUND) { ret = 0; break; } /* * The page we're moving to might have split, in which * case move to the last position we held. */ if (ret == WT_RESTART) { ret = 0; /* * If a new walk that never coupled from the * root to a new saved position in the tree, * restart the walk. */ if (couple == &btree->root) { ref = &btree->root; if (ref->page == NULL) goto done; goto descend; } /* * If restarting from some original position, * repeat the increment or decrement we made at * that time. Otherwise, couple is an internal * page we've acquired after moving from that * starting position and we can treat it as a * new page. This works because we never acquire * a hazard pointer on a leaf page we're not * going to return to our caller, this will quit * working if that ever changes. */ WT_ASSERT(session, couple == couple_orig || WT_PAGE_IS_INTERNAL(couple->page)); ref = couple; __ref_index_slot(session, ref, &pindex, &slot); if (couple == couple_orig) break; } WT_ERR(ret); /* * A new page: configure for traversal of any internal * page's children, else return the leaf page. */ if (WT_PAGE_IS_INTERNAL(ref->page)) { descend: couple = ref; empty_internal = true; __page_descend( session, ref->page, &pindex, &slot, prev); } else { /* * Optionally skip leaf pages, the second half. * We didn't have an on-page cell to figure out * if it was a leaf page, we had to acquire the * hazard pointer and look at the page. */ if (skipleafcntp != NULL || LF_ISSET(WT_READ_SKIP_LEAF)) { couple = ref; if (LF_ISSET(WT_READ_SKIP_LEAF)) break; if (*skipleafcntp > 0) { --*skipleafcntp; break; } } *refp = ref; goto done; } } } done: err: WT_LEAVE_PAGE_INDEX(session); return (ret); }