/* * __wt_curjoin_join -- * Add a new join to a join cursor. */ int __wt_curjoin_join(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin, WT_INDEX *idx, WT_CURSOR *ref_cursor, uint8_t flags, uint8_t range, uint64_t count, uint32_t bloom_bit_count, uint32_t bloom_hash_count) { WT_CURSOR_JOIN_ENTRY *entry; WT_DECL_RET; WT_CURSOR_JOIN_ENDPOINT *end, *newend; bool hasins, needbloom, range_eq; u_int i, ins, nonbloom; const char *raw_cfg[] = { WT_CONFIG_BASE( session, WT_SESSION_open_cursor), "raw", NULL }; char *main_uri; size_t namesize, newsize; entry = NULL; hasins = needbloom = false; ins = 0; /* -Wuninitialized */ main_uri = NULL; nonbloom = 0; /* -Wuninitialized */ namesize = strlen(cjoin->table->name); for (i = 0; i < cjoin->entries_next; i++) { if (cjoin->entries[i].index == idx) { entry = &cjoin->entries[i]; break; } if (!needbloom && i > 0 && !F_ISSET(&cjoin->entries[i], WT_CURJOIN_ENTRY_BLOOM)) { needbloom = true; nonbloom = i; } } if (entry == NULL) { WT_ERR(__wt_realloc_def(session, &cjoin->entries_allocated, cjoin->entries_next + 1, &cjoin->entries)); if (LF_ISSET(WT_CURJOIN_ENTRY_BLOOM) && needbloom) { /* * Reorder the list so that after the first entry, * the Bloom filtered entries come next, followed by * the non-Bloom entries. Once the Bloom filters * are built, determining membership via Bloom is * faster than without Bloom, so we can answer * membership questions more quickly, and with less * I/O, with the Bloom entries first. */ entry = &cjoin->entries[nonbloom]; memmove(entry + 1, entry, (cjoin->entries_next - nonbloom) * sizeof(WT_CURSOR_JOIN_ENTRY)); memset(entry, 0, sizeof(WT_CURSOR_JOIN_ENTRY)); } else entry = &cjoin->entries[cjoin->entries_next]; entry->index = idx; entry->flags = flags; entry->count = count; entry->bloom_bit_count = bloom_bit_count; entry->bloom_hash_count = bloom_hash_count; ++cjoin->entries_next; } else { /* Merge the join into an existing entry for this index */ if (count != 0 && entry->count != 0 && entry->count != count) WT_ERR_MSG(session, EINVAL, "count=%" PRIu64 " does not match " "previous count=%" PRIu64 " for this index", count, entry->count); if (LF_MASK(WT_CURJOIN_ENTRY_BLOOM) != F_MASK(entry, WT_CURJOIN_ENTRY_BLOOM)) WT_ERR_MSG(session, EINVAL, "join has incompatible strategy " "values for the same index"); /* * Check against other comparisons (we call them endpoints) * already set up for this index. * We allow either: * - one or more "eq" (with disjunction) * - exactly one "eq" (with conjunction) * - exactly one of "gt" or "ge" (conjunction or disjunction) * - exactly one of "lt" or "le" (conjunction or disjunction) * - one of "gt"/"ge" along with one of "lt"/"le" * (currently restricted to conjunction). * * Some other combinations, although expressible either do * not make sense (X == 3 AND X == 5) or are reducible (X < * 7 AND X < 9). Other specific cases of (X < 7 OR X > 15) * or (X == 4 OR X > 15) make sense but we don't handle yet. 
*/ for (i = 0; i < entry->ends_next; i++) { end = &entry->ends[i]; range_eq = (range == WT_CURJOIN_END_EQ); if ((F_ISSET(end, WT_CURJOIN_END_GT) && ((range & WT_CURJOIN_END_GT) != 0 || range_eq)) || (F_ISSET(end, WT_CURJOIN_END_LT) && ((range & WT_CURJOIN_END_LT) != 0 || range_eq)) || (end->flags == WT_CURJOIN_END_EQ && (range & (WT_CURJOIN_END_LT | WT_CURJOIN_END_GT)) != 0)) WT_ERR_MSG(session, EINVAL, "join has overlapping ranges"); if (range == WT_CURJOIN_END_EQ && end->flags == WT_CURJOIN_END_EQ && !F_ISSET(entry, WT_CURJOIN_ENTRY_DISJUNCTION)) WT_ERR_MSG(session, EINVAL, "compare=eq can only be combined " "using operation=or"); /* * Sort "gt"/"ge" to the front, followed by any number * of "eq", and finally "lt"/"le". */ if (!hasins && ((range & WT_CURJOIN_END_GT) != 0 || (range == WT_CURJOIN_END_EQ && !F_ISSET(end, WT_CURJOIN_END_GT)))) { ins = i; hasins = true; } } /* All checks completed, merge any new configuration now */ entry->count = count; entry->bloom_bit_count = WT_MAX(entry->bloom_bit_count, bloom_bit_count); entry->bloom_hash_count = WT_MAX(entry->bloom_hash_count, bloom_hash_count); } WT_ERR(__wt_realloc_def(session, &entry->ends_allocated, entry->ends_next + 1, &entry->ends)); if (!hasins) ins = entry->ends_next; newend = &entry->ends[ins]; memmove(newend + 1, newend, (entry->ends_next - ins) * sizeof(WT_CURSOR_JOIN_ENDPOINT)); memset(newend, 0, sizeof(WT_CURSOR_JOIN_ENDPOINT)); entry->ends_next++; newend->cursor = ref_cursor; F_SET(newend, range); /* Open the main file with a projection of the indexed columns. */ if (entry->main == NULL && entry->index != NULL) { namesize = strlen(cjoin->table->name); newsize = namesize + entry->index->colconf.len + 1; WT_ERR(__wt_calloc(session, 1, newsize, &main_uri)); snprintf(main_uri, newsize, "%s%.*s", cjoin->table->name, (int)entry->index->colconf.len, entry->index->colconf.str); WT_ERR(__wt_open_cursor(session, main_uri, (WT_CURSOR *)cjoin, raw_cfg, &entry->main)); } err: if (main_uri != NULL) __wt_free(session, main_uri); return (ret); }
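/*
 * A minimal application-side sketch of the public WT_SESSION::join API that
 * drives __wt_curjoin_join(): the "poptable" table and its two string-keyed
 * indexes are hypothetical, and error handling/cleanup is omitted for brevity.
 */
#include <wiredtiger.h>

static int
join_example(WT_SESSION *session)
{
    WT_CURSOR *join_cursor, *country_cursor, *city_cursor;
    int ret;

    /* Open a join cursor on the table, then one cursor per index. */
    ret = session->open_cursor(
        session, "join:table:poptable", NULL, NULL, &join_cursor);
    ret = session->open_cursor(
        session, "index:poptable:country", NULL, NULL, &country_cursor);
    ret = session->open_cursor(
        session, "index:poptable:city", NULL, NULL, &city_cursor);

    /* country == "AU" */
    country_cursor->set_key(country_cursor, "AU");
    ret = country_cursor->search(country_cursor);
    ret = session->join(session, join_cursor, country_cursor,
        "compare=eq,count=10");

    /* AND city > "M", using a Bloom filter for the membership test. */
    city_cursor->set_key(city_cursor, "M");
    ret = city_cursor->search(city_cursor);
    ret = session->join(session, join_cursor, city_cursor,
        "compare=gt,count=10,strategy=bloom,bloom_bit_count=16");

    /* Iterate the joined result set. */
    while ((ret = join_cursor->next(join_cursor)) == 0) {
        /* get_key/get_value on join_cursor as usual. */
    }
    return (ret == WT_NOTFOUND ? 0 : ret);
}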
/* * __wt_connection_close -- * Close a connection handle. */ int __wt_connection_close(WT_CONNECTION_IMPL *conn) { WT_CONNECTION *wt_conn; WT_DECL_RET; WT_DLH *dlh; WT_SESSION_IMPL *s, *session; u_int i; wt_conn = &conn->iface; session = conn->default_session; /* Shut down the subsystems, ensuring workers see the state change. */ F_SET(conn, WT_CONN_CLOSING); WT_FULL_BARRIER(); /* * Clear any pending async operations and shut down the async worker * threads and system before closing LSM. */ WT_TRET(__wt_async_flush(session)); WT_TRET(__wt_async_destroy(session)); /* * Shut down server threads other than the eviction server, which is * needed later to close btree handles. Some of these threads access * btree handles, so take care in ordering shutdown to make sure they * exit before files are closed. */ WT_TRET(__wt_lsm_manager_destroy(session)); /* * Once the async and LSM threads exit, we shouldn't be opening any * more files. */ F_SET(conn, WT_CONN_CLOSING_NO_MORE_OPENS); WT_FULL_BARRIER(); WT_TRET(__wt_checkpoint_server_destroy(session)); WT_TRET(__wt_statlog_destroy(session, true)); WT_TRET(__wt_sweep_destroy(session)); /* The eviction server is shut down last. */ WT_TRET(__wt_evict_destroy(session)); /* Close open data handles. */ WT_TRET(__wt_conn_dhandle_discard(session)); /* Shut down metadata tracking. */ WT_TRET(__wt_meta_track_destroy(session)); /* * Now that all data handles are closed, tell logging that a checkpoint * has completed then shut down the log manager (only after closing * data handles). The call to destroy the log manager is outside the * conditional because we allocate the log path so that printlog can * run without running logging or recovery. */ if (ret == 0 && FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED) && FLD_ISSET(conn->log_flags, WT_CONN_LOG_RECOVER_DONE)) WT_TRET(__wt_txn_checkpoint_log( session, true, WT_TXN_LOG_CKPT_STOP, NULL)); WT_TRET(__wt_logmgr_destroy(session)); /* Free memory for collators, compressors, data sources. */ WT_TRET(__wt_conn_remove_collator(session)); WT_TRET(__wt_conn_remove_compressor(session)); WT_TRET(__wt_conn_remove_data_source(session)); WT_TRET(__wt_conn_remove_encryptor(session)); WT_TRET(__wt_conn_remove_extractor(session)); /* Disconnect from shared cache - must be before cache destroy. */ WT_TRET(__wt_conn_cache_pool_destroy(session)); /* Discard the cache. */ WT_TRET(__wt_cache_destroy(session)); /* Discard transaction state. */ __wt_txn_global_destroy(session); /* Close the lock file, opening up the database to other connections. */ if (conn->lock_fh != NULL) WT_TRET(__wt_close(session, &conn->lock_fh)); /* Close any optrack files */ if (session->optrack_fh != NULL) WT_TRET(__wt_close(session, &session->optrack_fh)); /* Close operation tracking */ WT_TRET(__wt_conn_optrack_teardown(session, false)); /* Close any file handles left open. */ WT_TRET(__wt_close_connection_close(session)); /* * Close the internal (default) session, and switch back to the dummy * session in case of any error messages from the remaining operations * while destroying the connection handle. */ if (session != &conn->dummy_session) { WT_TRET(session->iface.close(&session->iface, NULL)); session = conn->default_session = &conn->dummy_session; } /* * The session split stash, hazard information and handle arrays aren't * discarded during normal session close, they persist past the life of * the session. Discard them now. 
*/ if (!F_ISSET(conn, WT_CONN_LEAK_MEMORY)) if ((s = conn->sessions) != NULL) for (i = 0; i < conn->session_size; ++s, ++i) { __wt_free(session, s->cursor_cache); __wt_free(session, s->dhhash); __wt_stash_discard_all(session, s); __wt_free(session, s->hazard); } /* Destroy the file-system configuration. */ if (conn->file_system != NULL && conn->file_system->terminate != NULL) WT_TRET(conn->file_system->terminate( conn->file_system, (WT_SESSION *)session)); /* Close extensions, first calling any unload entry point. */ while ((dlh = TAILQ_FIRST(&conn->dlhqh)) != NULL) { TAILQ_REMOVE(&conn->dlhqh, dlh, q); if (dlh->terminate != NULL) WT_TRET(dlh->terminate(wt_conn)); WT_TRET(__wt_dlclose(session, dlh)); } /* Destroy the handle. */ __wt_connection_destroy(conn); return (ret); }
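/*
 * A minimal usage sketch of the public path that ends in
 * __wt_connection_close(); the home directory is whatever the caller passes.
 */
#include <wiredtiger.h>

static int
open_and_close(const char *home)
{
    WT_CONNECTION *conn;
    int ret;

    if ((ret = wiredtiger_open(home, NULL, "create", &conn)) != 0)
        return (ret);
    /* ... sessions, cursors, transactions ... */

    /*
     * Passing "leak_memory=true" here corresponds to the WT_CONN_LEAK_MEMORY
     * branch above: per-session memory is intentionally not freed on close.
     */
    return (conn->close(conn, NULL));
}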
/* * __lsm_bloom_create -- * Create a bloom filter for a chunk of the LSM tree that has been * checkpointed but not yet been merged. */ static int __lsm_bloom_create(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, WT_LSM_CHUNK *chunk, u_int chunk_off) { WT_BLOOM *bloom; WT_CURSOR *src; WT_DECL_RET; WT_ITEM key; uint64_t insert_count; WT_RET(__wt_lsm_tree_setup_bloom(session, lsm_tree, chunk)); bloom = NULL; /* * This is merge-like activity, and we don't want compacts to give up * because we are creating a bunch of bloom filters before merging. */ ++lsm_tree->merge_progressing; WT_RET(__wt_bloom_create(session, chunk->bloom_uri, lsm_tree->bloom_config, chunk->count, lsm_tree->bloom_bit_count, lsm_tree->bloom_hash_count, &bloom)); /* Open a special merge cursor just on this chunk. */ WT_ERR(__wt_open_cursor(session, lsm_tree->name, NULL, NULL, &src)); F_SET(src, WT_CURSTD_RAW); WT_ERR(__wt_clsm_init_merge(src, chunk_off, chunk->id, 1)); /* * Setup so that we don't hold pages we read into cache, and so * that we don't get stuck if the cache is full. If we allow * ourselves to get stuck creating bloom filters, the entire tree * can stall since there may be no worker threads available to flush. */ F_SET(session, WT_SESSION_NO_CACHE | WT_SESSION_NO_EVICTION); for (insert_count = 0; (ret = src->next(src)) == 0; insert_count++) { WT_ERR(src->get_key(src, &key)); __wt_bloom_insert(bloom, &key); } WT_ERR_NOTFOUND_OK(ret); WT_TRET(src->close(src)); WT_TRET(__wt_bloom_finalize(bloom)); WT_ERR(ret); F_CLR(session, WT_SESSION_NO_CACHE); /* Load the new Bloom filter into cache. */ WT_CLEAR(key); WT_ERR_NOTFOUND_OK(__wt_bloom_get(bloom, &key)); __wt_verbose(session, WT_VERB_LSM, "LSM worker created bloom filter %s. " "Expected %" PRIu64 " items, got %" PRIu64, chunk->bloom_uri, chunk->count, insert_count); /* Ensure the bloom filter is in the metadata. */ __wt_lsm_tree_writelock(session, lsm_tree); F_SET(chunk, WT_LSM_CHUNK_BLOOM); ret = __wt_lsm_meta_write(session, lsm_tree); ++lsm_tree->dsk_gen; __wt_lsm_tree_writeunlock(session, lsm_tree); if (ret != 0) WT_ERR_MSG(session, ret, "LSM bloom worker metadata write"); err: if (bloom != NULL) WT_TRET(__wt_bloom_close(bloom)); F_CLR(session, WT_SESSION_NO_CACHE | WT_SESSION_NO_EVICTION); return (ret); }
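/*
 * A minimal sketch of where the Bloom parameters used by __lsm_bloom_create()
 * come from: they are part of the LSM creation configuration.  The "lsm:bucket"
 * object name is hypothetical.
 */
#include <wiredtiger.h>

static int
create_lsm_with_bloom(WT_SESSION *session)
{
    return (session->create(session, "lsm:bucket",
        "key_format=S,value_format=S,"
        "lsm=(bloom=true,bloom_bit_count=16,bloom_hash_count=8)"));
}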
/*
 * __wt_block_read_off --
 *	Read the block referenced by an addr/size pair into a buffer.
 */
int
__wt_block_read_off(WT_SESSION_IMPL *session, WT_BLOCK *block,
    WT_ITEM *buf, off_t offset, uint32_t size, uint32_t cksum)
{
    WT_BLOCK_HEADER *blk;
    uint32_t alloc_size, page_cksum;

    WT_VERBOSE_RET(session, read,
        "off %" PRIuMAX ", size %" PRIu32 ", cksum %" PRIu32,
        (uintmax_t)offset, size, cksum);

#ifdef HAVE_DIAGNOSTIC
    /*
     * In diagnostic mode, verify the block we're about to read isn't on
     * either the available or discard lists.
     *
     * Don't check during salvage, it's possible we're reading an already
     * freed overflow page.
     */
    if (!F_ISSET(session, WT_SESSION_SALVAGE_QUIET_ERR))
        WT_RET(
            __wt_block_misplaced(session, block, "read", offset, size));
#endif

    /*
     * Grow the buffer as necessary and read the block.  Buffers should be
     * aligned for reading, but there are lots of buffers (for example, file
     * cursors have two buffers each, key and value), and it's difficult to
     * be sure we've found all of them.  If the buffer isn't aligned, it's
     * an easy fix: set the flag and guarantee we reallocate it.  (Most of
     * the time on reads, the buffer memory has not yet been allocated, so
     * we're not adding any additional processing time.)
     */
    if (F_ISSET(buf, WT_ITEM_ALIGNED))
        alloc_size = size;
    else {
        F_SET(buf, WT_ITEM_ALIGNED);
        alloc_size = (uint32_t)WT_MAX(size, buf->memsize + 10);
    }
    WT_RET(__wt_buf_init(session, buf, alloc_size));
    WT_RET(__wt_read(session, block->fh, offset, size, buf->mem));
    buf->size = size;

    /* Verify the checksum: it was computed with the stored field zeroed. */
    blk = WT_BLOCK_HEADER_REF(buf->mem);
    blk->cksum = 0;
    page_cksum = __wt_cksum(buf->mem,
        F_ISSET(blk, WT_BLOCK_DATA_CKSUM) ? size : WT_BLOCK_COMPRESS_SKIP);
    if (cksum != page_cksum) {
        if (!F_ISSET(session, WT_SESSION_SALVAGE_QUIET_ERR))
            __wt_errx(session,
                "read checksum error [%" PRIu32 "B @ %" PRIuMAX
                ", %" PRIu32 " != %" PRIu32 "]",
                size, (uintmax_t)offset, cksum, page_cksum);
        return (WT_ERROR);
    }

    WT_CSTAT_INCR(session, block_read);
    WT_CSTAT_INCRV(session, block_byte_read, size);

    return (0);
}
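/*
 * A standalone illustration of the verification pattern above: the on-disk
 * checksum is computed with the checksum field zeroed, so the reader zeroes
 * it again before recomputing.  sum32() is a trivial stand-in for __wt_cksum(),
 * not the real algorithm, and the header layout is hypothetical.
 */
#include <stddef.h>
#include <stdint.h>

struct hdr {
    uint32_t cksum;    /* stored checksum */
    uint32_t size;     /* payload size */
};

static uint32_t
sum32(const void *p, size_t len)
{
    const uint8_t *b = p;
    uint32_t sum = 0;

    while (len-- > 0)
        sum = sum * 31 + *b++;
    return (sum);
}

/* Return 0 if the block's contents match the checksum carried with it. */
static int
verify_block(void *mem, size_t len, uint32_t expected)
{
    struct hdr *h = mem;

    h->cksum = 0;            /* exclude the field from the sum */
    return (sum32(mem, len) == expected ? 0 : -1);
}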
/* * db_get -- * Look in the text buffers for a line, followed by the cache, followed * by the database. * * PUBLIC: int db_get __P((SCR *, db_recno_t, u_int32_t, CHAR_T **, size_t *)); */ int db_get(SCR *sp, db_recno_t lno, u_int32_t flags, CHAR_T **pp, size_t *lenp) /* Line number. */ /* Pointer store. */ /* Length store. */ { DBT data, key; EXF *ep; TEXT *tp; db_recno_t l1, l2; const CHAR_T *wp; size_t wlen; size_t nlen; /* * The underlying recno stuff handles zero by returning NULL, but * have to have an OOB condition for the look-aside into the input * buffer anyway. */ if (lno == 0) goto err1; /* Check for no underlying file. */ if ((ep = sp->ep) == NULL) { ex_emsg(sp, NULL, EXM_NOFILEYET); goto err3; } if (LF_ISSET(DBG_NOCACHE)) goto nocache; /* * Look-aside into the TEXT buffers and see if the line we want * is there. */ if (F_ISSET(sp, SC_TINPUT)) { l1 = ((TEXT *)sp->tiq.cqh_first)->lno; l2 = ((TEXT *)sp->tiq.cqh_last)->lno; if (l1 <= lno && l2 >= lno) { #if defined(DEBUG) && 0 vtrace(sp, "retrieve TEXT buffer line %lu\n", (u_long)lno); #endif for (tp = sp->tiq.cqh_first; tp->lno != lno; tp = tp->q.cqe_next); if (lenp != NULL) *lenp = tp->len; if (pp != NULL) *pp = tp->lb; return (0); } /* * Adjust the line number for the number of lines used * by the text input buffers. */ if (lno > l2) lno -= l2 - l1; } /* Look-aside into the cache, and see if the line we want is there. */ if (lno == sp->c_lno) { #if defined(DEBUG) && 0 vtrace(sp, "retrieve cached line %lu\n", (u_long)lno); #endif if (lenp != NULL) *lenp = sp->c_len; if (pp != NULL) *pp = sp->c_lp; return (0); } sp->c_lno = OOBLNO; nocache: nlen = 1024; retry: /* data.size contains length in bytes */ BINC_GOTO(sp, CHAR_T, sp->c_lp, sp->c_blen, nlen); /* Get the line from the underlying database. */ memset(&key, 0, sizeof(key)); key.data = &lno; key.size = sizeof(lno); memset(&data, 0, sizeof(data)); data.data = sp->c_lp; data.ulen = sp->c_blen; data.flags = DB_DBT_USERMEM; switch (ep->db->get(ep->db, NULL, &key, &data, 0)) { case DB_BUFFER_SMALL: nlen = data.size; goto retry; default: goto err2; case DB_NOTFOUND: err1: if (LF_ISSET(DBG_FATAL)) err2: db_err(sp, lno); alloc_err: err3: if (lenp != NULL) *lenp = 0; if (pp != NULL) *pp = NULL; return (1); case 0: ; } if (FILE2INT(sp, data.data, data.size, wp, wlen)) { if (!F_ISSET(sp, SC_CONV_ERROR)) { F_SET(sp, SC_CONV_ERROR); msgq(sp, M_ERR, "324|Conversion error on line %d", lno); } goto err3; } /* Reset the cache. */ if (wp != data.data) { BINC_GOTOW(sp, sp->c_lp, sp->c_blen, wlen); MEMCPYW(sp->c_lp, wp, wlen); } sp->c_lno = lno; sp->c_len = wlen; #if defined(DEBUG) && 0 vtrace(sp, "retrieve DB line %lu\n", (u_long)lno); #endif if (lenp != NULL) *lenp = wlen; if (pp != NULL) *pp = sp->c_lp; return (0); }
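/*
 * A minimal standalone version of the DB_DBT_USERMEM / DB_BUFFER_SMALL retry
 * idiom used by db_get() against the Berkeley DB API; the function name and
 * signature are illustrative only, not part of nvi.
 */
#include <errno.h>
#include <stdlib.h>
#include <string.h>
#include <db.h>

static int
get_record(DB *db, db_recno_t recno, void **bufp, u_int32_t *blenp, DBT *data)
{
    DBT key;
    void *p;
    int ret;

    memset(&key, 0, sizeof(key));
    key.data = &recno;
    key.size = sizeof(recno);

retry:    memset(data, 0, sizeof(*data));
    data->data = *bufp;
    data->ulen = *blenp;
    data->flags = DB_DBT_USERMEM;
    ret = db->get(db, NULL, &key, data, 0);
    if (ret == DB_BUFFER_SMALL) {
        /* data->size now holds the length the record needs. */
        if ((p = realloc(*bufp, data->size)) == NULL)
            return (ENOMEM);
        *bufp = p;
        *blenp = data->size;
        goto retry;
    }
    return (ret);        /* 0, DB_NOTFOUND or a Berkeley DB error */
}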
/* * __wt_txn_recover -- * Run recovery. */ int __wt_txn_recover(WT_SESSION_IMPL *session) { WT_CONNECTION_IMPL *conn; WT_CURSOR *metac; WT_DECL_RET; WT_RECOVERY r; struct WT_RECOVERY_FILE *metafile; char *config; int needs_rec, was_backup; conn = S2C(session); WT_CLEAR(r); WT_INIT_LSN(&r.ckpt_lsn); was_backup = F_ISSET(conn, WT_CONN_WAS_BACKUP) ? 1 : 0; /* We need a real session for recovery. */ WT_RET(__wt_open_session(conn, NULL, NULL, &session)); F_SET(session, WT_SESSION_NO_LOGGING); r.session = session; WT_ERR(__wt_metadata_search(session, WT_METAFILE_URI, &config)); WT_ERR(__recovery_setup_file(&r, WT_METAFILE_URI, config)); WT_ERR(__wt_metadata_cursor(session, NULL, &metac)); metafile = &r.files[WT_METAFILE_ID]; metafile->c = metac; /* * If no log was found (including if logging is disabled), or if the * last checkpoint was done with logging disabled, recovery should not * run. Scan the metadata to figure out the largest file ID. */ if (!FLD_ISSET(S2C(session)->log_flags, WT_CONN_LOG_EXISTED) || WT_IS_MAX_LSN(&metafile->ckpt_lsn)) { WT_ERR(__recovery_file_scan(&r)); conn->next_file_id = r.max_fileid; goto done; } /* * First, do a pass through the log to recover the metadata, and * establish the last checkpoint LSN. Skip this when opening a hot * backup: we already have the correct metadata in that case. */ if (!was_backup) { r.metadata_only = 1; if (WT_IS_INIT_LSN(&metafile->ckpt_lsn)) WT_ERR(__wt_log_scan(session, NULL, WT_LOGSCAN_FIRST, __txn_log_recover, &r)); else { /* * Start at the last checkpoint LSN referenced in the * metadata. If we see the end of a checkpoint while * scanning, we will change the full scan to start from * there. */ r.ckpt_lsn = metafile->ckpt_lsn; WT_ERR(__wt_log_scan(session, &metafile->ckpt_lsn, 0, __txn_log_recover, &r)); } } /* Scan the metadata to find the live files and their IDs. */ WT_ERR(__recovery_file_scan(&r)); /* * We no longer need the metadata cursor: close it to avoid pinning any * resources that could block eviction during recovery. */ r.files[0].c = NULL; WT_ERR(metac->close(metac)); /* * Now, recover all the files apart from the metadata. * Pass WT_LOGSCAN_RECOVER so that old logs get truncated. */ r.metadata_only = 0; WT_ERR(__wt_verbose(session, WT_VERB_RECOVERY, "Main recovery loop: starting at %u/%" PRIuMAX, r.ckpt_lsn.file, (uintmax_t)r.ckpt_lsn.offset)); WT_ERR(__wt_log_needs_recovery(session, &r.ckpt_lsn, &needs_rec)); /* * Check if the database was shut down cleanly. If not * return an error if the user does not want automatic * recovery. */ if (needs_rec && FLD_ISSET(conn->log_flags, WT_CONN_LOG_RECOVER_ERR)) WT_ERR(WT_RUN_RECOVERY); /* * Always run recovery even if it was a clean shutdown. * We can consider skipping it in the future. */ if (WT_IS_INIT_LSN(&r.ckpt_lsn)) WT_ERR(__wt_log_scan(session, NULL, WT_LOGSCAN_FIRST | WT_LOGSCAN_RECOVER, __txn_log_recover, &r)); else WT_ERR(__wt_log_scan(session, &r.ckpt_lsn, WT_LOGSCAN_RECOVER, __txn_log_recover, &r)); conn->next_file_id = r.max_fileid; /* * If recovery ran successfully forcibly log a checkpoint so the next * open is fast and keep the metadata up to date with the checkpoint * LSN and archiving. */ WT_ERR(session->iface.checkpoint(&session->iface, "force=1")); done: err: WT_TRET(__recovery_free(&r)); __wt_free(session, config); WT_TRET(session->iface.close(&session->iface, NULL)); return (ret); }
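/*
 * A minimal sketch of how the behavior of __wt_txn_recover() is selected from
 * the application: the open-time log configuration decides whether the log is
 * replayed or the open fails with WT_RUN_RECOVERY after an unclean shutdown.
 */
#include <wiredtiger.h>

static int
open_recover(const char *home, WT_CONNECTION **connp)
{
    /* Replay the log automatically after an unclean shutdown. */
    return (wiredtiger_open(home, NULL,
        "create,log=(enabled,recover=on)", connp));
}

static int
open_no_auto_recover(const char *home, WT_CONNECTION **connp)
{
    /* Refuse to run recovery: the open fails if the shutdown wasn't clean. */
    return (wiredtiger_open(home, NULL,
        "create,log=(enabled,recover=error)", connp));
}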
/* * __lsm_bloom_create -- * Create a bloom filter for a chunk of the LSM tree that has been * checkpointed but not yet been merged. */ static int __lsm_bloom_create(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, WT_LSM_CHUNK *chunk, u_int chunk_off) { WT_BLOOM *bloom; WT_CURSOR *src; WT_DECL_RET; WT_ITEM buf, key; WT_SESSION *wt_session; uint64_t insert_count; int exist; /* * Normally, the Bloom URI is populated when the chunk struct is * allocated. After an open, however, it may not have been. * Deal with that here. */ if (chunk->bloom_uri == NULL) { WT_CLEAR(buf); WT_RET(__wt_lsm_tree_bloom_name( session, lsm_tree, chunk->id, &buf)); chunk->bloom_uri = __wt_buf_steal(session, &buf, NULL); } /* * Drop the bloom filter first - there may be some content hanging over * from an aborted merge or checkpoint. */ wt_session = &session->iface; WT_RET(__wt_exist(session, chunk->bloom_uri + strlen("file:"), &exist)); if (exist) WT_RET(wt_session->drop(wt_session, chunk->bloom_uri, "force")); bloom = NULL; /* * This is merge-like activity, and we don't want compacts to give up * because we are creating a bunch of bloom filters before merging. */ ++lsm_tree->merge_progressing; WT_RET(__wt_bloom_create(session, chunk->bloom_uri, lsm_tree->bloom_config, chunk->count, lsm_tree->bloom_bit_count, lsm_tree->bloom_hash_count, &bloom)); /* Open a special merge cursor just on this chunk. */ WT_ERR(__wt_open_cursor(session, lsm_tree->name, NULL, NULL, &src)); F_SET(src, WT_CURSTD_RAW); WT_ERR(__wt_clsm_init_merge(src, chunk_off, chunk->id, 1)); F_SET(session, WT_SESSION_NO_CACHE); for (insert_count = 0; (ret = src->next(src)) == 0; insert_count++) { WT_ERR(src->get_key(src, &key)); WT_ERR(__wt_bloom_insert(bloom, &key)); } WT_ERR_NOTFOUND_OK(ret); WT_TRET(src->close(src)); WT_TRET(__wt_bloom_finalize(bloom)); WT_ERR(ret); F_CLR(session, WT_SESSION_NO_CACHE); /* Load the new Bloom filter into cache. */ WT_CLEAR(key); WT_ERR_NOTFOUND_OK(__wt_bloom_get(bloom, &key)); WT_VERBOSE_ERR(session, lsm, "LSM worker created bloom filter %s. " "Expected %" PRIu64 " items, got %" PRIu64, chunk->bloom_uri, chunk->count, insert_count); /* Ensure the bloom filter is in the metadata. */ WT_ERR(__wt_lsm_tree_lock(session, lsm_tree, 1)); F_SET_ATOMIC(chunk, WT_LSM_CHUNK_BLOOM); ret = __wt_lsm_meta_write(session, lsm_tree); ++lsm_tree->dsk_gen; WT_TRET(__wt_lsm_tree_unlock(session, lsm_tree)); if (ret != 0) WT_ERR_MSG(session, ret, "LSM bloom worker metadata write"); err: if (bloom != NULL) WT_TRET(__wt_bloom_close(bloom)); F_CLR(session, WT_SESSION_NO_CACHE); return (ret); }
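/*
 * A minimal sketch of the force-drop idiom used above to remove any leftover
 * Bloom object before recreating it, expressed at the public API level; the
 * URI is hypothetical.
 */
#include <wiredtiger.h>

static int
drop_leftover(WT_SESSION *session)
{
    /* "force=true": dropping an object that doesn't exist is not an error. */
    return (session->drop(session, "file:bucket-000001.bf", "force=true"));
}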
/* * __wt_curds_open -- * Initialize a data-source cursor. */ int __wt_curds_open( WT_SESSION_IMPL *session, const char *uri, WT_CURSOR *owner, const char *cfg[], WT_DATA_SOURCE *dsrc, WT_CURSOR **cursorp) { WT_CURSOR_STATIC_INIT(iface, NULL, /* get-key */ NULL, /* get-value */ NULL, /* set-key */ NULL, /* set-value */ __curds_compare, /* compare */ __curds_next, /* next */ __curds_prev, /* prev */ __curds_reset, /* reset */ __curds_search, /* search */ __curds_search_near, /* search-near */ __curds_insert, /* insert */ __curds_update, /* update */ __curds_remove, /* remove */ __curds_close); /* close */ WT_CONFIG_ITEM cval; WT_CURSOR *cursor, *source; WT_CURSOR_DATA_SOURCE *data_source; WT_DECL_RET; const char *metaconf; STATIC_ASSERT(offsetof(WT_CURSOR_DATA_SOURCE, iface) == 0); data_source = NULL; metaconf = NULL; WT_RET(__wt_calloc_def(session, 1, &data_source)); cursor = &data_source->iface; *cursor = iface; cursor->session = &session->iface; F_SET(cursor, WT_CURSTD_DATA_SOURCE); /* * XXX * The underlying data-source may require the object's key and value * formats. This isn't a particularly elegant way of getting that * information to the data-source, this feels like a layering problem * to me. */ WT_ERR(__wt_metadata_search(session, uri, &metaconf)); WT_ERR(__wt_config_getones(session, metaconf, "key_format", &cval)); WT_ERR(__wt_strndup(session, cval.str, cval.len, &cursor->key_format)); WT_ERR(__wt_config_getones(session, metaconf, "value_format", &cval)); WT_ERR( __wt_strndup(session, cval.str, cval.len, &cursor->value_format)); WT_ERR(__wt_cursor_init(cursor, uri, owner, cfg, cursorp)); /* Data-source cursors have a collator reference. */ WT_ERR(__wt_collator_config(session, cfg, &data_source->collator)); WT_ERR(dsrc->open_cursor(dsrc, &session->iface, uri, (WT_CONFIG_ARG *)cfg, &data_source->source)); source = data_source->source; source->session = (WT_SESSION *)session; memset(&source->q, 0, sizeof(source->q)); source->recno = 0; memset(source->raw_recno_buf, 0, sizeof(source->raw_recno_buf)); memset(&source->key, 0, sizeof(source->key)); memset(&source->value, 0, sizeof(source->value)); source->saved_err = 0; source->flags = 0; if (0) { err: if (F_ISSET(cursor, WT_CURSTD_OPEN)) WT_TRET(cursor->close(cursor)); else __wt_free(session, data_source); *cursorp = NULL; } __wt_free(session, metaconf); return (ret); }
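/*
 * A minimal outline of how __wt_curds_open() is reached: an application
 * registers a WT_DATA_SOURCE for a URI prefix and then opens cursors on that
 * prefix.  The "dsrc:" prefix, the "bucket" object and my_dsrc are all
 * hypothetical; my_dsrc must at least implement an open_cursor callback, and
 * the create call is assumed to be what records the key/value formats the
 * metadata search above relies on.
 */
#include <wiredtiger.h>

extern WT_DATA_SOURCE my_dsrc;        /* hypothetical implementation */

static int
use_data_source(WT_CONNECTION *conn, WT_SESSION *session, WT_CURSOR **cp)
{
    int ret;

    if ((ret = conn->add_data_source(conn, "dsrc:", &my_dsrc, NULL)) != 0)
        return (ret);
    if ((ret = session->create(session,
        "dsrc:bucket", "key_format=S,value_format=S")) != 0)
        return (ret);
    return (session->open_cursor(session, "dsrc:bucket", NULL, NULL, cp));
}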
/* * __clsm_enter -- * Start an operation on an LSM cursor, update if the tree has changed. */ static inline int __clsm_enter(WT_CURSOR_LSM *clsm, bool reset, bool update) { WT_DECL_RET; WT_LSM_TREE *lsm_tree; WT_SESSION_IMPL *session; WT_TXN *txn; uint64_t *switch_txnp; uint64_t snap_min; lsm_tree = clsm->lsm_tree; session = (WT_SESSION_IMPL *)clsm->iface.session; txn = &session->txn; /* Merge cursors never update. */ if (F_ISSET(clsm, WT_CLSM_MERGE)) return (0); if (reset) { WT_ASSERT(session, !F_ISSET(&clsm->iface, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT)); WT_RET(__clsm_reset_cursors(clsm, NULL)); } for (;;) { /* * If the cursor looks up-to-date, check if the cache is full. * In case this call blocks, the check will be repeated before * proceeding. */ if (clsm->dsk_gen != lsm_tree->dsk_gen && lsm_tree->nchunks != 0) goto open; if (clsm->dsk_gen != lsm_tree->dsk_gen && lsm_tree->nchunks != 0) goto open; /* Update the maximum transaction ID in the primary chunk. */ if (update) { /* * Ensure that there is a transaction snapshot active. */ WT_RET(__wt_txn_autocommit_check(session)); WT_RET(__wt_txn_id_check(session)); WT_RET(__clsm_enter_update(clsm)); if (clsm->dsk_gen != clsm->lsm_tree->dsk_gen) goto open; if (txn->isolation == WT_ISO_SNAPSHOT) __wt_txn_cursor_op(session); /* * Figure out how many updates are required for * snapshot isolation. * * This is not a normal visibility check on the maximum * transaction ID in each chunk: any transaction ID * that overlaps with our snapshot is a potential * conflict. */ clsm->nupdates = 1; if (txn->isolation == WT_ISO_SNAPSHOT && F_ISSET(clsm, WT_CLSM_OPEN_SNAPSHOT)) { WT_ASSERT(session, F_ISSET(txn, WT_TXN_HAS_SNAPSHOT)); snap_min = txn->snap_min; for (switch_txnp = &clsm->switch_txn[clsm->nchunks - 2]; clsm->nupdates < clsm->nchunks; clsm->nupdates++, switch_txnp--) { if (WT_TXNID_LT(*switch_txnp, snap_min)) break; WT_ASSERT(session, !__wt_txn_visible_all( session, *switch_txnp)); } } } /* * Stop when we are up-to-date, as long as this is: * - a snapshot isolation update and the cursor is set up for * that; * - an update operation with a primary chunk, or * - a read operation and the cursor is open for reading. */ if ((!update || txn->isolation != WT_ISO_SNAPSHOT || F_ISSET(clsm, WT_CLSM_OPEN_SNAPSHOT)) && ((update && clsm->primary_chunk != NULL) || (!update && F_ISSET(clsm, WT_CLSM_OPEN_READ)))) break; open: WT_WITH_SCHEMA_LOCK(session, ret = __clsm_open_cursors(clsm, update, 0, 0)); WT_RET(ret); } if (!F_ISSET(clsm, WT_CLSM_ACTIVE)) { WT_RET(__cursor_enter(session)); F_SET(clsm, WT_CLSM_ACTIVE); } return (0); }
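/*
 * A minimal application-side sketch of the case __clsm_enter() is guarding:
 * an update made inside a snapshot-isolation transaction.  The "lsm:bucket"
 * URI is hypothetical.
 */
#include <wiredtiger.h>

static int
snapshot_update(WT_SESSION *session, const char *key, const char *value)
{
    WT_CURSOR *c;
    int ret;

    if ((ret = session->open_cursor(
        session, "lsm:bucket", NULL, NULL, &c)) != 0)
        return (ret);
    if ((ret = session->begin_transaction(
        session, "isolation=snapshot")) != 0)
        goto done;
    c->set_key(c, key);
    c->set_value(c, value);
    if ((ret = c->insert(c)) != 0) {
        (void)session->rollback_transaction(session, NULL);
        goto done;
    }
    ret = session->commit_transaction(session, NULL);
done:    (void)c->close(c);
    return (ret);
}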
/* * v_motion -- * * Get resulting motion mark. */ static int v_motion(SCR *sp, VICMD *dm, VICMD *vp, int *mappedp) { VICMD motion; gcret_t gcret; size_t len; u_long cnt; u_int flags; int tilde_reset, notused; /* * If '.' command, use the dot motion, else get the motion command. * Clear any line motion flags, the subsequent motion isn't always * the same, i.e. "/aaa" may or may not be a line motion. */ if (F_ISSET(vp, VC_ISDOT)) { motion = *dm; F_SET(&motion, VC_ISDOT); F_CLR(&motion, VM_COMMASK); gcret = GC_OK; } else { memset(&motion, 0, sizeof(VICMD)); gcret = v_cmd(sp, NULL, &motion, vp, ¬used, mappedp); if (gcret != GC_OK && gcret != GC_EVENT) return (1); } /* * A count may be provided both to the command and to the motion, in * which case the count is multiplicative. For example, "3y4y" is the * same as "12yy". This count is provided to the motion command and * not to the regular function. */ cnt = motion.count = F_ISSET(&motion, VC_C1SET) ? motion.count : 1; if (F_ISSET(vp, VC_C1SET)) { motion.count *= vp->count; F_SET(&motion, VC_C1SET); /* * Set flags to restore the original values of the command * structure so dot commands can change the count values, * e.g. "2dw" "3." deletes a total of five words. */ F_CLR(vp, VC_C1SET); F_SET(vp, VC_C1RESET); } /* * Some commands can be repeated to indicate the current line. In * this case, or if the command is a "line command", set the flags * appropriately. If not a doubled command, run the function to get * the resulting mark. */ if (gcret != GC_EVENT && vp->key == motion.key) { F_SET(vp, VM_LDOUBLE | VM_LMODE); /* Set the origin of the command. */ vp->m_start.lno = sp->lno; vp->m_start.cno = 0; /* * Set the end of the command. * * If the current line is missing, i.e. the file is empty, * historic vi permitted a "cc" or "!!" command to insert * text. */ vp->m_stop.lno = sp->lno + motion.count - 1; if (db_get(sp, vp->m_stop.lno, 0, NULL, &len)) { if (vp->m_stop.lno != 1 || (vp->key != 'c' && vp->key != '!')) { v_emsg(sp, NULL, VIM_EMPTY); return (1); } vp->m_stop.cno = 0; } else vp->m_stop.cno = len ? len - 1 : 0; } else { /* * Motion commands change the underlying movement (*snarl*). * For example, "l" is illegal at the end of a line, but "dl" * is not. Set flags so the function knows the situation. */ motion.rkp = vp->kp; /* * XXX * Use yank instead of creating a new motion command, it's a * lot easier for now. */ if (vp->kp == &tmotion) { tilde_reset = 1; vp->kp = &vikeys['y']; } else tilde_reset = 0; /* * Copy the key flags into the local structure, except for the * RCM flags -- the motion command will set the RCM flags in * the vp structure if necessary. This means that the motion * command is expected to determine where the cursor ends up! * However, we save off the current RCM mask and restore it if * it no RCM flags are set by the motion command, with a small * modification. * * We replace the VM_RCM_SET flag with the VM_RCM flag. This * is so that cursor movement doesn't set the relative position * unless the motion command explicitly specified it. This * appears to match historic practice, but I've never been able * to develop a hard-and-fast rule. */ flags = F_ISSET(vp, VM_RCM_MASK); if (LF_ISSET(VM_RCM_SET)) { LF_SET(VM_RCM); LF_CLR(VM_RCM_SET); } F_CLR(vp, VM_RCM_MASK); F_SET(&motion, motion.kp->flags & ~VM_RCM_MASK); /* * Set the three cursor locations to the current cursor. 
This * permits commands like 'j' and 'k', that are line oriented * motions and have special cursor suck semantics when they are * used as standalone commands, to ignore column positioning. */ motion.m_final.lno = motion.m_stop.lno = motion.m_start.lno = sp->lno; motion.m_final.cno = motion.m_stop.cno = motion.m_start.cno = sp->cno; /* Run the function. */ if ((motion.kp->func)(sp, &motion)) return (1); /* * If the current line is missing, i.e. the file is empty, * historic vi allowed "c<motion>" or "!<motion>" to insert * text. Otherwise fail -- most motion commands will have * already failed, but some, e.g. G, succeed in empty files. */ if (!db_exist(sp, vp->m_stop.lno)) { if (vp->m_stop.lno != 1 || (vp->key != 'c' && vp->key != '!')) { v_emsg(sp, NULL, VIM_EMPTY); return (1); } vp->m_stop.cno = 0; } /* * XXX * See above. */ if (tilde_reset) vp->kp = &tmotion; /* * Copy cut buffer, line mode and cursor position information * from the motion command structure, i.e. anything that the * motion command can set for us. The commands can flag the * movement as a line motion (see v_sentence) as well as set * the VM_RCM_* flags explicitly. */ F_SET(vp, F_ISSET(&motion, VM_COMMASK | VM_RCM_MASK)); /* * If the motion command set no relative motion flags, use * the (slightly) modified previous values. */ if (!F_ISSET(vp, VM_RCM_MASK)) F_SET(vp, flags); /* * Commands can change behaviors based on the motion command * used, for example, the ! command repeated the last bang * command if N or n was used as the motion. */ vp->rkp = motion.kp; /* * Motion commands can reset all of the cursor information. * If the motion is in the reverse direction, switch the * from and to MARK's so that it's in a forward direction. * Motions are from the from MARK to the to MARK (inclusive). */ if (motion.m_start.lno > motion.m_stop.lno || (motion.m_start.lno == motion.m_stop.lno && motion.m_start.cno > motion.m_stop.cno)) { vp->m_start = motion.m_stop; vp->m_stop = motion.m_start; } else { vp->m_start = motion.m_start; vp->m_stop = motion.m_stop; } vp->m_final = motion.m_final; } /* * If the command sets dot, save the motion structure. The motion * count was changed above and needs to be reset, that's why this * is done here, and not in the calling routine. */ if (F_ISSET(vp->kp, V_DOT)) { *dm = motion; dm->count = cnt; } return (0); }
/*
 * v_init --
 *	Initialize the vi screen.
 */
static int
v_init(SCR *sp)
{
    GS *gp;
    VI_PRIVATE *vip;

    gp = sp->gp;
    vip = VIP(sp);

    /* Switch into vi. */
    if (gp->scr_screen(sp, SC_VI))
        return (1);
    (void)gp->scr_attr(sp, SA_ALTERNATE, 1);

    F_CLR(sp, SC_EX | SC_SCR_EX);
    F_SET(sp, SC_VI);

    /*
     * Initialize screen values.
     *
     * Small windows: see vs_refresh(), section 6a.
     *
     * Setup:
     *	t_minrows is the minimum rows to display
     *	t_maxrows is the maximum rows to display (rows - 1)
     *	t_rows is the rows currently being displayed
     */
    sp->rows = vip->srows = O_VAL(sp, O_LINES);
    sp->cols = O_VAL(sp, O_COLUMNS);
    sp->t_rows = sp->t_minrows = O_VAL(sp, O_WINDOW);
    if (sp->rows != 1) {
        if (sp->t_rows > sp->rows - 1) {
            sp->t_minrows = sp->t_rows = sp->rows - 1;
            msgq(sp, M_INFO,
                "214|Windows option value is too large, max is %zu",
                sp->t_rows);
        }
        sp->t_maxrows = sp->rows - 1;
    } else
        sp->t_maxrows = 1;
    sp->roff = sp->coff = 0;

    /* Create a screen map. */
    CALLOC_RET(sp, HMAP, SMAP *, SIZE_HMAP(sp), sizeof(SMAP));
    TMAP = HMAP + (sp->t_rows - 1);
    HMAP->lno = sp->lno;
    HMAP->coff = 0;
    HMAP->soff = 1;

    /*
     * Fill the screen map from scratch -- try and center the line.  That
     * way if we're starting with a file we've seen before, we'll put the
     * line in the middle, otherwise, it won't work and we'll end up with
     * the line at the top.
     */
    F_CLR(sp, SC_SCR_TOP);
    F_SET(sp, SC_SCR_REFORMAT | SC_SCR_CENTER);

    /* Invalidate the cursor. */
    F_SET(vip, VIP_CUR_INVALID);

    /* Paint the screen image from scratch. */
    F_SET(vip, VIP_N_EX_PAINT);

    return (0);
}
/* * vi -- * Main vi command loop. * * PUBLIC: int vi __P((SCR **)); */ int vi(SCR **spp) { GS *gp; WIN *wp; MARK abst; SCR *next, *sp; VICMD cmd, *vp; VI_PRIVATE *vip; int comcount, mapped, rval; /* Get the first screen. */ sp = *spp; wp = sp->wp; gp = sp->gp; /* Initialize the command structure. */ vp = &cmd; memset(vp, 0, sizeof(VICMD)); /* Reset strange attraction. */ F_SET(vp, VM_RCM_SET); /* Initialize the vi screen. */ if (v_init(sp)) return (1); /* Set the focus. */ (void)sp->gp->scr_rename(sp, sp->frp->name, 1); for (vip = VIP(sp), rval = 0;;) { /* Resolve messages. */ if (!MAPPED_KEYS_WAITING(sp) && vs_resolve(sp, NULL, 0)) goto ret; /* * If not skipping a refresh, return to command mode and * refresh the screen. */ if (F_ISSET(vip, VIP_S_REFRESH)) F_CLR(vip, VIP_S_REFRESH); else { sp->showmode = SM_COMMAND; if (vs_refresh(sp, 0)) goto ret; } /* Set the new favorite position. */ if (F_ISSET(vp, VM_RCM_SET | VM_RCM_SETFNB | VM_RCM_SETNNB)) { F_CLR(vip, VIP_RCM_LAST); (void)vs_column(sp, &sp->rcm); } /* * If not currently in a map, log the cursor position, * and set a flag so that this command can become the * DOT command. */ if (MAPPED_KEYS_WAITING(sp)) mapped = 1; else { if (log_cursor(sp)) goto err; mapped = 0; } /* * There may be an ex command waiting, and we returned here * only because we exited a screen or file. In this case, * we simply go back into the ex parser. */ if (EXCMD_RUNNING(wp)) { vp->kp = &vikeys[':']; goto ex_continue; } /* Refresh the command structure. */ memset(vp, 0, sizeof(VICMD)); /* * We get a command, which may or may not have an associated * motion. If it does, we get it too, calling its underlying * function to get the resulting mark. We then call the * command setting the cursor to the resulting mark. * * !!! * Vi historically flushed mapped characters on error, but * entering extra <escape> characters at the beginning of * a map wasn't considered an error -- in fact, users would * put leading <escape> characters in maps to clean up vi * state before the map was interpreted. Beauty! */ switch (v_cmd(sp, DOT, vp, NULL, &comcount, &mapped)) { case GC_ERR: goto err; case GC_ERR_NOFLUSH: goto gc_err_noflush; case GC_FATAL: goto ret; case GC_INTERRUPT: goto intr; case GC_EVENT: case GC_OK: break; } /* Check for security setting. */ if (F_ISSET(vp->kp, V_SECURE) && O_ISSET(sp, O_SECURE)) { ex_emsg(sp, (const char *)KEY_NAME(sp, vp->key), EXM_SECURE); goto err; } /* * Historical practice: if a dot command gets a new count, * any motion component goes away, i.e. "d3w2." deletes a * total of 5 words. */ if (F_ISSET(vp, VC_ISDOT) && comcount) DOTMOTION->count = 1; /* Copy the key flags into the local structure. */ F_SET(vp, vp->kp->flags); /* Prepare to set the previous context. */ if (F_ISSET(vp, V_ABS | V_ABS_C | V_ABS_L)) { abst.lno = sp->lno; abst.cno = sp->cno; } /* * Set the three cursor locations to the current cursor. The * underlying routines don't bother if the cursor doesn't move. * This also handles line commands (e.g. Y) defaulting to the * current line. */ vp->m_start.lno = vp->m_stop.lno = vp->m_final.lno = sp->lno; vp->m_start.cno = vp->m_stop.cno = vp->m_final.cno = sp->cno; /* * Do any required motion; v_motion sets the from MARK and the * line mode flag, as well as the VM_RCM flags. */ if (F_ISSET(vp, V_MOTION) && v_motion(sp, DOTMOTION, vp, &mapped)) { if (INTERRUPTED(sp)) goto intr; goto err; } /* * If a count is set and the command is line oriented, set the * to MARK here relative to the cursor/from MARK. 
This is for * commands that take both counts and motions, i.e. "4yy" and * "y%". As there's no way the command can know which the user * did, we have to do it here. (There are commands that are * line oriented and that take counts ("#G", "#H"), for which * this calculation is either completely meaningless or wrong. * Each command must validate the value for itself. */ if (F_ISSET(vp, VC_C1SET) && F_ISSET(vp, VM_LMODE)) vp->m_stop.lno += vp->count - 1; /* Increment the command count. */ ++sp->ccnt; #if defined(DEBUG) && defined(COMLOG) v_comlog(sp, vp); #endif /* Call the function. */ ex_continue: if (vp->kp->func(sp, vp)) goto err; #ifdef DEBUG /* Make sure no function left the temporary space locked. */ if (F_ISSET(wp, W_TMP_INUSE)) { F_CLR(wp, W_TMP_INUSE); msgq(sp, M_ERR, "232|vi: temporary buffer not released"); } #endif /* * If we're exiting this screen, move to the next one, or, if * there aren't any more, return to the main editor loop. The * ordering is careful, don't discard the contents of sp until * the end. */ if (F_ISSET(sp, SC_EXIT | SC_EXIT_FORCE)) { if (file_end(sp, NULL, F_ISSET(sp, SC_EXIT_FORCE))) goto ret; if (vs_discard(sp, &next)) goto ret; if (next == NULL && vs_swap(sp, &next, NULL)) goto ret; *spp = next; if (screen_end(sp)) goto ret; if (next == NULL) break; /* Switch screens, change focus. */ sp = next; vip = VIP(sp); (void)sp->gp->scr_rename(sp, sp->frp->name, 1); /* Don't trust the cursor. */ F_SET(vip, VIP_CUR_INVALID); continue; } /* * Set the dot command structure. * * !!! * Historically, commands which used mapped keys did not * set the dot command, with the exception of the text * input commands. */ if (F_ISSET(vp, V_DOT) && !mapped) { *DOT = cmd; F_SET(DOT, VC_ISDOT); /* * If a count was supplied for both the command and * its motion, the count was used only for the motion. * Turn the count back on for the dot structure. */ if (F_ISSET(vp, VC_C1RESET)) F_SET(DOT, VC_C1SET); /* VM flags aren't retained. */ F_CLR(DOT, VM_COMMASK | VM_RCM_MASK); } /* * Some vi row movements are "attracted" to the last position * set, i.e. the VM_RCM commands are moths to the VM_RCM_SET * commands' candle. If the movement is to the EOL the vi * command handles it. If it's to the beginning, we handle it * here. * * Note, some commands (e.g. _, ^) don't set the VM_RCM_SETFNB * flag, but do the work themselves. The reason is that they * have to modify the column in case they're being used as a * motion component. Other similar commands (e.g. +, -) don't * have to modify the column because they are always line mode * operations when used as motions, so the column number isn't * of any interest. * * Does this totally violate the screen and editor layering? * You betcha. As they say, if you think you understand it, * you don't. */ switch (F_ISSET(vp, VM_RCM_MASK)) { case 0: case VM_RCM_SET: break; case VM_RCM: vp->m_final.cno = vs_rcm(sp, vp->m_final.lno, F_ISSET(vip, VIP_RCM_LAST)); break; case VM_RCM_SETLAST: F_SET(vip, VIP_RCM_LAST); break; case VM_RCM_SETFNB: vp->m_final.cno = 0; /* FALLTHROUGH */ case VM_RCM_SETNNB: if (nonblank(sp, vp->m_final.lno, &vp->m_final.cno)) goto err; break; default: abort(); } /* Update the cursor. */ sp->lno = vp->m_final.lno; sp->cno = vp->m_final.cno; /* * Set the absolute mark -- set even if a tags or similar * command, since the tag may be moving to the same file. 
*/ if ((F_ISSET(vp, V_ABS) || (F_ISSET(vp, V_ABS_L) && sp->lno != abst.lno) || (F_ISSET(vp, V_ABS_C) && (sp->lno != abst.lno || sp->cno != abst.cno))) && mark_set(sp, ABSMARK1, &abst, 1)) goto err; if (0) { err: if (v_event_flush(sp, CH_MAPPED)) msgq(sp, M_BERR, "110|Vi command failed: mapped keys discarded"); } /* * Check and clear interrupts. There's an obvious race, but * it's not worth fixing. */ gc_err_noflush: if (INTERRUPTED(sp)) { intr: CLR_INTERRUPT(sp); if (v_event_flush(sp, CH_MAPPED)) msgq(sp, M_ERR, "231|Interrupted: mapped keys discarded"); else msgq(sp, M_ERR, "236|Interrupted"); } /* If the last command switched screens, update. */ if (F_ISSET(sp, SC_SSWITCH)) { F_CLR(sp, SC_SSWITCH); /* * If the current screen is still displayed, it will * need a new status line. */ F_SET(sp, SC_STATUS); /* Switch screens, change focus. */ sp = sp->nextdisp; vip = VIP(sp); (void)sp->gp->scr_rename(sp, sp->frp->name, 1); /* Don't trust the cursor. */ F_SET(vip, VIP_CUR_INVALID); /* Refresh so we can display messages. */ if (vs_refresh(sp, 1)) return (1); } /* If the last command switched files, change focus. */ if (F_ISSET(sp, SC_FSWITCH)) { F_CLR(sp, SC_FSWITCH); (void)sp->gp->scr_rename(sp, sp->frp->name, 1); } /* If leaving vi, return to the main editor loop. */ if (F_ISSET(gp, G_SRESTART) || F_ISSET(sp, SC_EX)) { *spp = sp; v_dtoh(sp); gp->scr_discard(sp, NULL); break; } } if (0) ret: rval = 1; return (rval); }
/* * v_cmd -- * Get a vi command. */ static gcret_t v_cmd(SCR *sp, VICMD *dp, VICMD *vp, VICMD *ismotion, int *comcountp, int *mappedp) /* Previous key if getting motion component. */ { enum { COMMANDMODE, ISPARTIAL, NOTPARTIAL } cpart; ARG_CHAR_T key; VIKEYS const *kp; gcret_t gcret; u_int flags; const char *s; /* * Get an event command or a key. Event commands are simple, and * don't have any additional information. */ cpart = ismotion == NULL ? COMMANDMODE : ISPARTIAL; gcret = v_key(sp, vp, 1, EC_MAPCOMMAND); if (gcret != GC_OK) { if (gcret != GC_EVENT) return (gcret); if (v_event(sp, vp)) return (GC_ERR); if (ismotion != NULL && !F_ISSET(vp->kp, V_MOVE)) v_event_err(sp, &vp->ev); return (GC_EVENT); } /* * Keys are not simple. (Although vi's command structure less complex * than ex (and don't think I'm not grateful!) The command syntax is: * * [count] [buffer] [count] key [[motion] | [buffer] [character]] * * and there are, of course, several special cases. The motion value * is itself a vi command, with the syntax: * * [count] key [character] * * <escape> cancels partial commands, i.e. a command where at least * one non-numeric character has been entered. Otherwise, it beeps * the terminal. * * !!! * POSIX 1003.2-1992 explicitly disallows cancelling commands where * all that's been entered is a number, requiring that the terminal * be alerted. */ if (vp->ev.e_value == K_ESCAPE) goto esc; /* * Commands that are mapped are treated differently (e.g., they * don't set the dot command. Pass that information back. */ if (FL_ISSET(vp->ev.e_flags, CH_MAPPED)) *mappedp = 1; key = vp->ev.e_c; if (ismotion == NULL) cpart = NOTPARTIAL; /* Pick up an optional buffer. */ if (key == '"') { cpart = ISPARTIAL; if (ismotion != NULL) { v_emsg(sp, NULL, VIM_COMBUF); return (GC_ERR); } KEY(vp->buffer, 0); F_SET(vp, VC_BUFFER); KEY(key, EC_MAPCOMMAND); } /* * Pick up an optional count, where a leading 0 isn't a count, it's * a command. When a count is specified, the dot command behaves * differently, pass the information back. */ if (ISDIGIT(key) && key != '0') { if (v_count(sp, vp, key, &vp->count)) return (GC_ERR); F_SET(vp, VC_C1SET); *comcountp = 1; KEY(key, EC_MAPCOMMAND); } else *comcountp = 0; /* Pick up optional buffer. */ if (key == '"') { cpart = ISPARTIAL; if (F_ISSET(vp, VC_BUFFER)) { msgq(sp, M_ERR, "234|Only one buffer may be specified"); return (GC_ERR); } if (ismotion != NULL) { v_emsg(sp, NULL, VIM_COMBUF); return (GC_ERR); } KEY(vp->buffer, 0); F_SET(vp, VC_BUFFER); KEY(key, EC_MAPCOMMAND); } /* Check for an OOB command key. */ cpart = ISPARTIAL; if (key > MAXVIKEY) { v_emsg(sp, (const char *)KEY_NAME(sp, key), VIM_NOCOM); return (GC_ERR); } kp = &vikeys[vp->key = key]; /* * !!! * Historically, D accepted and then ignored a count. Match it. */ if (vp->key == 'D' && F_ISSET(vp, VC_C1SET)) { *comcountp = 0; vp->count = 0; F_CLR(vp, VC_C1SET); } /* * There are several commands that we implement as aliases, both * to match historic practice and to ensure consistency. Check * for command aliases. */ if (kp->func == NULL && (kp = v_alias(sp, vp, kp)) == NULL) return (GC_ERR); /* The tildeop option makes the ~ command take a motion. */ if (key == '~' && O_ISSET(sp, O_TILDEOP)) kp = &tmotion; vp->kp = kp; /* * Find the command. The only legal command with no underlying * function is dot. It's historic practice that <escape> doesn't * just erase the preceding number, it beeps the terminal as well. * It's a common problem, so just beep the terminal unless verbose * was set. 
*/ if (kp->func == NULL) { if (key != '.') { v_emsg(sp, (const char *)KEY_NAME(sp, key), vp->ev.e_value == K_ESCAPE ? VIM_NOCOM_B : VIM_NOCOM); return (GC_ERR); } /* If called for a motion command, stop now. */ if (dp == NULL) goto usage; /* * !!! * If a '.' is immediately entered after an undo command, we * replay the log instead of redoing the last command. This * is necessary because 'u' can't set the dot command -- see * vi/v_undo.c:v_undo for details. */ if (VIP(sp)->u_ccnt == sp->ccnt) { vp->kp = &vikeys['u']; F_SET(vp, VC_ISDOT); return (GC_OK); } /* Otherwise, a repeatable command must have been executed. */ if (!F_ISSET(dp, VC_ISDOT)) { msgq(sp, M_ERR, "208|No command to repeat"); return (GC_ERR); } /* Set new count/buffer, if any, and return. */ if (F_ISSET(vp, VC_C1SET)) { F_SET(dp, VC_C1SET); dp->count = vp->count; } if (F_ISSET(vp, VC_BUFFER)) dp->buffer = vp->buffer; *vp = *dp; return (GC_OK); } /* Set the flags based on the command flags. */ flags = kp->flags; /* Check for illegal count. */ if (F_ISSET(vp, VC_C1SET) && !LF_ISSET(V_CNT)) goto usage; /* Illegal motion command. */ if (ismotion == NULL) { /* Illegal buffer. */ if (!LF_ISSET(V_OBUF) && F_ISSET(vp, VC_BUFFER)) goto usage; /* Required buffer. */ if (LF_ISSET(V_RBUF)) { KEY(vp->buffer, 0); F_SET(vp, VC_BUFFER); } } /* * Special case: '[', ']' and 'Z' commands. Doesn't the fact that * the *single* characters don't mean anything but the *doubled* * characters do, just frost your shorts? */ if (vp->key == '[' || vp->key == ']' || vp->key == 'Z') { /* * Historically, half entered [[, ]] or Z commands weren't * cancelled by <escape>, the terminal was beeped instead. * POSIX.2-1992 probably didn't notice, and requires that * they be cancelled instead of beeping. Seems fine to me. * * Don't set the EC_MAPCOMMAND flag, apparently ] is a popular * vi meta-character, and we don't want the user to wait while * we time out a possible mapping. This *appears* to match * historic vi practice, but with mapping characters, You Just * Never Know. */ KEY(key, 0); if (vp->key != key) { usage: if (ismotion == NULL) s = kp->usage; else if (ismotion->key == '~' && O_ISSET(sp, O_TILDEOP)) s = tmotion.usage; else s = vikeys[ismotion->key].usage; v_emsg(sp, s, VIM_USAGE); return (GC_ERR); } } /* Special case: 'z' command. */ if (vp->key == 'z') { KEY(vp->character, 0); if (ISDIGIT(vp->character)) { if (v_count(sp, vp, vp->character, &vp->count2)) return (GC_ERR); F_SET(vp, VC_C2SET); KEY(vp->character, 0); } } /* * Commands that have motion components can be doubled to imply the * current line. */ if (ismotion != NULL && ismotion->key != key && !LF_ISSET(V_MOVE)) { msgq(sp, M_ERR, "210|%s may not be used as a motion command", KEY_NAME(sp, key)); return (GC_ERR); } /* Pick up required trailing character. */ if (LF_ISSET(V_CHAR)) KEY(vp->character, 0); /* Get any associated cursor word. */ if (F_ISSET(kp, V_KEYW) && v_curword(sp)) return (GC_ERR); return (GC_OK); esc: switch (cpart) { case COMMANDMODE: msgq(sp, M_BERR, "211|Already in command mode"); return (GC_ERR_NOFLUSH); case ISPARTIAL: break; case NOTPARTIAL: (void)sp->gp->scr_bell(sp); break; } return (GC_ERR); }
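/*
 * A standalone illustration (not nvi code) of the count-parsing rule described
 * in the command grammar above: a leading '0' is a command ("move to column
 * 0"), not the start of a count.
 */
#include <ctype.h>

/* Consume an optional count from s; a missing count defaults to 1. */
static const char *
parse_count(const char *s, unsigned long *cntp)
{
    unsigned long cnt;

    if (!isdigit((unsigned char)*s) || *s == '0') {
        *cntp = 1;
        return (s);
    }
    for (cnt = 0; isdigit((unsigned char)*s); ++s)
        cnt = cnt * 10 + (unsigned long)(*s - '0');
    *cntp = cnt;
    return (s);
}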
DB * __rec_open(const char *fname, int flags, mode_t mode, const RECNOINFO *openinfo, int dflags) { BTREE *t; BTREEINFO btopeninfo; DB *dbp; PAGE *h; struct stat sb; int rfd = -1; /* pacify gcc */ int sverrno; dbp = NULL; /* Open the user's file -- if this fails, we're done. */ if (fname != NULL) { if ((rfd = open(fname, flags | O_CLOEXEC, mode)) == -1) return NULL; } /* Create a btree in memory (backed by disk). */ if (openinfo) { if (openinfo->flags & ~(R_FIXEDLEN | R_NOKEY | R_SNAPSHOT)) goto einval; btopeninfo.flags = 0; btopeninfo.cachesize = openinfo->cachesize; btopeninfo.maxkeypage = 0; btopeninfo.minkeypage = 0; btopeninfo.psize = openinfo->psize; btopeninfo.compare = NULL; btopeninfo.prefix = NULL; btopeninfo.lorder = openinfo->lorder; dbp = __bt_open(openinfo->bfname, O_RDWR, S_IRUSR | S_IWUSR, &btopeninfo, dflags); } else dbp = __bt_open(NULL, O_RDWR, S_IRUSR | S_IWUSR, NULL, dflags); if (dbp == NULL) goto err; /* * Some fields in the tree structure are recno specific. Fill them * in and make the btree structure look like a recno structure. We * don't change the bt_ovflsize value, it's close enough and slightly * bigger. */ t = dbp->internal; if (openinfo) { if (openinfo->flags & R_FIXEDLEN) { F_SET(t, R_FIXLEN); t->bt_reclen = openinfo->reclen; if (t->bt_reclen == 0) goto einval; } t->bt_bval = openinfo->bval; } else t->bt_bval = '\n'; F_SET(t, R_RECNO); if (fname == NULL) F_SET(t, R_EOF | R_INMEM); else t->bt_rfd = rfd; if (fname != NULL) { /* * In 4.4BSD, stat(2) returns true for ISSOCK on pipes. * Unfortunately, that's not portable, so we use lseek * and check the errno values. */ errno = 0; if (lseek(rfd, (off_t)0, SEEK_CUR) == -1 && errno == ESPIPE) { switch (flags & O_ACCMODE) { case O_RDONLY: F_SET(t, R_RDONLY); break; default: goto einval; } slow: if ((t->bt_rfp = fdopen(rfd, "r")) == NULL) goto err; F_SET(t, R_CLOSEFP); t->bt_irec = F_ISSET(t, R_FIXLEN) ? __rec_fpipe : __rec_vpipe; } else { switch (flags & O_ACCMODE) { case O_RDONLY: F_SET(t, R_RDONLY); break; case O_RDWR: break; default: goto einval; } if (fstat(rfd, &sb)) goto err; /* * Kluge -- we'd like to test to see if the file is too * big to mmap. Since, we don't know what size or type * off_t's or size_t's are, what the largest unsigned * integral type is, or what random insanity the local * C compiler will perpetrate, doing the comparison in * a portable way is flatly impossible. Hope that mmap * fails if the file is too large. */ if (sb.st_size == 0) F_SET(t, R_EOF); else { #ifdef MMAP_NOT_AVAILABLE /* * XXX * Mmap doesn't work correctly on many current * systems. In particular, it can fail subtly, * with cache coherency problems. Don't use it * for now. */ t->bt_msize = sb.st_size; if ((t->bt_smap = mmap(NULL, t->bt_msize, PROT_READ, MAP_FILE | MAP_PRIVATE, rfd, (off_t)0)) == (caddr_t)-1) goto slow; t->bt_cmap = t->bt_smap; t->bt_emap = t->bt_smap + sb.st_size; t->bt_irec = F_ISSET(t, R_FIXLEN) ? __rec_fmap : __rec_vmap; F_SET(t, R_MEMMAPPED); #else goto slow; #endif } } } /* Use the recno routines. */ dbp->close = __rec_close; dbp->del = __rec_delete; dbp->fd = __rec_fd; dbp->get = __rec_get; dbp->put = __rec_put; dbp->seq = __rec_seq; dbp->sync = __rec_sync; /* If the root page was created, reset the flags. 
*/ if ((h = mpool_get(t->bt_mp, P_ROOT, 0)) == NULL) goto err; if ((h->flags & P_TYPE) == P_BLEAF) { F_CLR(h, P_TYPE); F_SET(h, P_RLEAF); mpool_put(t->bt_mp, h, MPOOL_DIRTY); } else mpool_put(t->bt_mp, h, 0); if (openinfo && openinfo->flags & R_SNAPSHOT && !F_ISSET(t, R_EOF | R_INMEM) && t->bt_irec(t, MAX_REC_NUMBER) == RET_ERROR) goto err; return (dbp); einval: errno = EINVAL; err: sverrno = errno; if (dbp != NULL) (void)__bt_close(dbp); if (fname != NULL) (void)close(rfd); errno = sverrno; return (NULL); }
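/*
 * A minimal reader showing how __rec_open() is reached through the 4.4BSD
 * dbopen() interface, using a fixed-length recno file; the path, record
 * length and pad byte are hypothetical.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <db.h>

static int
dump_records(const char *path)
{
    DB *db;
    DBT data, key;
    RECNOINFO rinfo;
    recno_t recno;
    int ret;

    memset(&rinfo, 0, sizeof(rinfo));
    rinfo.flags = R_FIXEDLEN;
    rinfo.reclen = 80;        /* fixed 80-byte records */
    rinfo.bval = ' ';         /* pad byte for short records */

    if ((db = dbopen(path, O_RDONLY, 0, DB_RECNO, &rinfo)) == NULL)
        return (1);
    for (recno = 1;; ++recno) {
        key.data = &recno;
        key.size = sizeof(recno);
        if ((ret = db->get(db, &key, &data, 0)) != 0)
            break;        /* 1 == no such record, -1 == error */
        (void)printf("%lu: %.*s\n",
            (unsigned long)recno, (int)data.size, (char *)data.data);
    }
    (void)db->close(db);
    return (ret == -1);
}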
/* * cl_suspend -- * Suspend a screen. * * PUBLIC: int cl_suspend __P((SCR *, int *)); */ int cl_suspend(SCR *sp, int *allowedp) { struct termios t; CL_PRIVATE *clp; WINDOW *win; GS *gp; size_t y, x; int changed; gp = sp->gp; clp = CLP(sp); win = CLSP(sp) ? CLSP(sp) : stdscr; *allowedp = 1; /* * The ex implementation of this function isn't needed by screens not * supporting ex commands that require full terminal canonical mode * (e.g. :suspend). * * The vi implementation of this function isn't needed by screens not * supporting vi process suspension, i.e. any screen that isn't backed * by a UNIX shell. * * Setting allowedp to 0 will cause the editor to reject the command. */ if (F_ISSET(sp, SC_EX)) { /* Save the terminal settings, and restore the original ones. */ if (F_ISSET(clp, CL_STDIN_TTY)) { (void)tcgetattr(STDIN_FILENO, &t); (void)tcsetattr(STDIN_FILENO, TCSASOFT | TCSADRAIN, &clp->orig); } /* Stop the process group. */ (void)kill(0, SIGTSTP); /* Time passes ... */ /* Restore terminal settings. */ if (F_ISSET(clp, CL_STDIN_TTY)) (void)tcsetattr(STDIN_FILENO, TCSASOFT | TCSADRAIN, &t); return (0); } /* * Move to the lower left-hand corner of the screen. * * XXX * Not sure this is necessary in System V implementations, but it * shouldn't hurt. */ getyx(win, y, x); (void)wmove(win, LINES - 1, 0); (void)wrefresh(win); /* * Temporarily end the screen. System V introduced a semantic where * endwin() could be restarted. We use it because restarting curses * from scratch often fails in System V. 4BSD curses didn't support * restarting after endwin(), so we have to do what clean up we can * without calling it. */ /* Save the terminal settings. */ (void)tcgetattr(STDIN_FILENO, &t); /* Restore the cursor keys to normal mode. */ (void)keypad(stdscr, FALSE); /* Restore the window name. */ (void)cl_rename(sp, NULL, 0); #ifdef HAVE_BSD_CURSES (void)cl_attr(sp, SA_ALTERNATE, 0); #else (void)endwin(); #endif /* * XXX * Restore the original terminal settings. This is bad -- the * reset can cause character loss from the tty queue. However, * we can't call endwin() in BSD curses implementations, and too * many System V curses implementations don't get it right. */ (void)tcsetattr(STDIN_FILENO, TCSADRAIN | TCSASOFT, &clp->orig); /* Stop the process group. */ (void)kill(0, SIGTSTP); /* Time passes ... */ /* * If we received a killer signal, we're done. Leave everything * unchanged. In addition, the terminal has already been reset * correctly, so leave it alone. */ if (clp->killersig) { F_CLR(clp, CL_SCR_EX_INIT | CL_SCR_VI_INIT); return (0); } /* Restore terminal settings. */ wrefresh(win); /* Needed on SunOs/Solaris ? */ if (F_ISSET(clp, CL_STDIN_TTY)) (void)tcsetattr(STDIN_FILENO, TCSASOFT | TCSADRAIN, &t); #ifdef HAVE_BSD_CURSES (void)cl_attr(sp, SA_ALTERNATE, 1); #endif /* Set the window name. */ (void)cl_rename(sp, sp->frp->name, 1); /* Put the cursor keys into application mode. */ (void)keypad(stdscr, TRUE); /* Refresh and repaint the screen. */ (void)wmove(win, y, x); (void)cl_refresh(sp, 1); /* If the screen changed size, set the SIGWINCH bit. */ if (cl_ssize(sp, 1, NULL, NULL, &changed)) return (1); if (changed) F_SET(CLP(sp), CL_SIGWINCH); return (0); }
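/*
 * A minimal standalone sketch (not the nvi code) of the ex-mode idiom above:
 * restore the original tty modes, stop the process group, and re-install the
 * editor's modes when the process resumes.
 */
#include <signal.h>
#include <termios.h>
#include <unistd.h>

/* Suspend, restoring `orig` tty modes while stopped. */
static int
suspend_tty(const struct termios *orig)
{
    struct termios cur;

    if (tcgetattr(STDIN_FILENO, &cur) == -1)
        return (-1);
    (void)tcsetattr(STDIN_FILENO, TCSADRAIN, orig);    /* back to canonical mode */
    (void)kill(0, SIGTSTP);                            /* stop the process group */
    /* ... time passes, SIGCONT arrives ... */
    return (tcsetattr(STDIN_FILENO, TCSADRAIN, &cur)); /* restore editor modes */
}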
/* * __clsm_open_cursors -- * Open cursors for the current set of files. */ static int __clsm_open_cursors( WT_CURSOR_LSM *clsm, bool update, u_int start_chunk, uint32_t start_id) { WT_BTREE *btree; WT_CURSOR *c, **cp, *primary; WT_DECL_RET; WT_LSM_CHUNK *chunk; WT_LSM_TREE *lsm_tree; WT_SESSION_IMPL *session; WT_TXN *txn; const char *checkpoint, *ckpt_cfg[3]; uint64_t saved_gen; u_int i, nchunks, ngood, nupdates; u_int close_range_end, close_range_start; bool locked; c = &clsm->iface; session = (WT_SESSION_IMPL *)c->session; txn = &session->txn; chunk = NULL; locked = false; lsm_tree = clsm->lsm_tree; /* * Ensure that any snapshot update has cursors on the right set of * chunks to guarantee visibility is correct. */ if (update && txn->isolation == WT_ISO_SNAPSHOT) F_SET(clsm, WT_CLSM_OPEN_SNAPSHOT); /* * Query operations need a full set of cursors. Overwrite cursors * do queries in service of updates. */ if (!update || !F_ISSET(c, WT_CURSTD_OVERWRITE)) F_SET(clsm, WT_CLSM_OPEN_READ); if (lsm_tree->nchunks == 0) return (0); ckpt_cfg[0] = WT_CONFIG_BASE(session, WT_SESSION_open_cursor); ckpt_cfg[1] = "checkpoint=" WT_CHECKPOINT ",raw"; ckpt_cfg[2] = NULL; /* * If the key is pointing to memory that is pinned by a chunk * cursor, take a copy before closing cursors. */ if (F_ISSET(c, WT_CURSTD_KEY_INT)) WT_CURSOR_NEEDKEY(c); F_CLR(clsm, WT_CLSM_ITERATE_NEXT | WT_CLSM_ITERATE_PREV); WT_RET(__wt_lsm_tree_readlock(session, lsm_tree)); locked = true; /* Merge cursors have already figured out how many chunks they need. */ retry: if (F_ISSET(clsm, WT_CLSM_MERGE)) { nchunks = clsm->nchunks; ngood = 0; /* * We may have raced with another merge completing. Check that * we're starting at the right offset in the chunk array. */ if (start_chunk >= lsm_tree->nchunks || lsm_tree->chunk[start_chunk]->id != start_id) { for (start_chunk = 0; start_chunk < lsm_tree->nchunks; start_chunk++) { chunk = lsm_tree->chunk[start_chunk]; if (chunk->id == start_id) break; } /* We have to find the start chunk: merge locked it. */ WT_ASSERT(session, start_chunk < lsm_tree->nchunks); } WT_ASSERT(session, start_chunk + nchunks <= lsm_tree->nchunks); } else { nchunks = lsm_tree->nchunks; /* * If we are only opening the cursor for updates, only open the * primary chunk, plus any other chunks that might be required * to detect snapshot isolation conflicts. */ if (F_ISSET(clsm, WT_CLSM_OPEN_SNAPSHOT)) WT_ERR(__wt_realloc_def(session, &clsm->txnid_alloc, nchunks, &clsm->switch_txn)); if (F_ISSET(clsm, WT_CLSM_OPEN_READ)) ngood = nupdates = 0; else if (F_ISSET(clsm, WT_CLSM_OPEN_SNAPSHOT)) { /* * Keep going until all updates in the next * chunk are globally visible. Copy the maximum * transaction IDs into the cursor as we go. */ for (ngood = nchunks - 1, nupdates = 1; ngood > 0; ngood--, nupdates++) { chunk = lsm_tree->chunk[ngood - 1]; clsm->switch_txn[ngood - 1] = chunk->switch_txn; if (__wt_txn_visible_all( session, chunk->switch_txn)) break; } } else { nupdates = 1; ngood = nchunks - 1; } /* Check how many cursors are already open. */ for (cp = clsm->cursors + ngood; ngood < clsm->nchunks && ngood < nchunks; cp++, ngood++) { chunk = lsm_tree->chunk[ngood]; /* If the cursor isn't open yet, we're done. */ if (*cp == NULL) break; /* Easy case: the URIs don't match. */ if (strcmp((*cp)->uri, chunk->uri) != 0) break; /* Make sure the checkpoint config matches. 
*/ checkpoint = ((WT_CURSOR_BTREE *)*cp)-> btree->dhandle->checkpoint; if (checkpoint == NULL && F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) && !chunk->empty) break; /* Make sure the Bloom config matches. */ if (clsm->blooms[ngood] == NULL && F_ISSET(chunk, WT_LSM_CHUNK_BLOOM)) break; } /* Spurious generation bump? */ if (ngood == clsm->nchunks && clsm->nchunks == nchunks) { clsm->dsk_gen = lsm_tree->dsk_gen; goto err; } /* * Close any cursors we no longer need. * * Drop the LSM tree lock while we do this: if the cache is * full, we may block while closing a cursor. Save the * generation number and retry if it has changed under us. */ if (clsm->cursors != NULL && ngood < clsm->nchunks) { close_range_start = ngood; close_range_end = clsm->nchunks; } else if (!F_ISSET(clsm, WT_CLSM_OPEN_READ) && nupdates > 0 ) { close_range_start = 0; close_range_end = WT_MIN(nchunks, clsm->nchunks); if (close_range_end > nupdates) close_range_end -= nupdates; else close_range_end = 0; WT_ASSERT(session, ngood >= close_range_end); } else { close_range_end = 0; close_range_start = 0; } if (close_range_end > close_range_start) { saved_gen = lsm_tree->dsk_gen; locked = false; WT_ERR(__wt_lsm_tree_readunlock(session, lsm_tree)); WT_ERR(__clsm_close_cursors( clsm, close_range_start, close_range_end)); WT_ERR(__wt_lsm_tree_readlock(session, lsm_tree)); locked = true; if (lsm_tree->dsk_gen != saved_gen) goto retry; } /* Detach from our old primary. */ clsm->primary_chunk = NULL; clsm->current = NULL; } WT_ERR(__wt_realloc_def(session, &clsm->bloom_alloc, nchunks, &clsm->blooms)); WT_ERR(__wt_realloc_def(session, &clsm->cursor_alloc, nchunks, &clsm->cursors)); clsm->nchunks = nchunks; /* Open the cursors for chunks that have changed. */ for (i = ngood, cp = clsm->cursors + i; i != nchunks; i++, cp++) { chunk = lsm_tree->chunk[i + start_chunk]; /* Copy the maximum transaction ID. */ if (F_ISSET(clsm, WT_CLSM_OPEN_SNAPSHOT)) clsm->switch_txn[i] = chunk->switch_txn; /* * Read from the checkpoint if the file has been written. * Once all cursors switch, the in-memory tree can be evicted. */ WT_ASSERT(session, *cp == NULL); ret = __wt_open_cursor(session, chunk->uri, c, (F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) && !chunk->empty) ? ckpt_cfg : NULL, cp); /* * XXX kludge: we may have an empty chunk where no checkpoint * was written. If so, try to open the ordinary handle on that * chunk instead. */ if (ret == WT_NOTFOUND && F_ISSET(chunk, WT_LSM_CHUNK_ONDISK)) { ret = __wt_open_cursor( session, chunk->uri, c, NULL, cp); if (ret == 0) chunk->empty = 1; } WT_ERR(ret); /* * Setup all cursors other than the primary to only do conflict * checks on insert operations. This allows us to execute * inserts on non-primary chunks as a way of checking for * write conflicts with concurrent updates. */ if (i != nchunks - 1) (*cp)->insert = __wt_curfile_update_check; if (!F_ISSET(clsm, WT_CLSM_MERGE) && F_ISSET(chunk, WT_LSM_CHUNK_BLOOM)) WT_ERR(__wt_bloom_open(session, chunk->bloom_uri, lsm_tree->bloom_bit_count, lsm_tree->bloom_hash_count, c, &clsm->blooms[i])); /* Child cursors always use overwrite and raw mode. */ F_SET(*cp, WT_CURSTD_OVERWRITE | WT_CURSTD_RAW); } /* The last chunk is our new primary. */ if (chunk != NULL && !F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) && chunk->switch_txn == WT_TXN_NONE) { clsm->primary_chunk = chunk; primary = clsm->cursors[clsm->nchunks - 1]; /* * Disable eviction for the in-memory chunk. Also clear the * bulk load flag here, otherwise eviction will be enabled by * the first update. 
*/ btree = ((WT_CURSOR_BTREE *)(primary))->btree; if (btree->bulk_load_ok) { btree->bulk_load_ok = false; WT_WITH_BTREE(session, btree, __wt_btree_evictable(session, false)); } } clsm->dsk_gen = lsm_tree->dsk_gen; err: #ifdef HAVE_DIAGNOSTIC /* Check that all cursors are open as expected. */ if (ret == 0 && F_ISSET(clsm, WT_CLSM_OPEN_READ)) { for (i = 0, cp = clsm->cursors; i != clsm->nchunks; cp++, i++) { chunk = lsm_tree->chunk[i + start_chunk]; /* Make sure the cursor is open. */ WT_ASSERT(session, *cp != NULL); /* Easy case: the URIs should match. */ WT_ASSERT(session, strcmp((*cp)->uri, chunk->uri) == 0); /* Make sure the checkpoint config matches. */ checkpoint = ((WT_CURSOR_BTREE *)*cp)-> btree->dhandle->checkpoint; WT_ASSERT(session, (F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) && !chunk->empty) ? checkpoint != NULL : checkpoint == NULL); /* Make sure the Bloom config matches. */ WT_ASSERT(session, (F_ISSET(chunk, WT_LSM_CHUNK_BLOOM) && !F_ISSET(clsm, WT_CLSM_MERGE)) ? clsm->blooms[i] != NULL : clsm->blooms[i] == NULL); } } #endif if (locked) WT_TRET(__wt_lsm_tree_readunlock(session, lsm_tree)); return (ret); }
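__clsm_open_cursors opens a Bloom filter for each on-disk chunk, sized by the per-tree bloom_bit_count and bloom_hash_count settings. To make those two knobs concrete, here is a minimal, self-contained Bloom filter sketch; it is not WiredTiger's __wt_bloom implementation, and the FNV-style hash and struct layout are illustrative only.

#include <stdint.h>
#include <stdlib.h>

struct bloom {
	uint8_t *bits;
	uint64_t nbits;		/* item count * bits per item */
	uint32_t nhash;		/* hash functions per item */
};

/* Simple FNV-1a style hash, salted per hash function. */
static uint64_t
bloom_hash(const void *key, size_t len, uint32_t seed)
{
	const uint8_t *p;
	uint64_t h;

	h = 14695981039346656037ULL + seed;
	for (p = key; len > 0; --len, ++p)
		h = (h ^ *p) * 1099511628211ULL;
	return (h);
}

static void
bloom_insert(struct bloom *b, const void *key, size_t len)
{
	uint64_t bit;
	uint32_t i;

	for (i = 0; i < b->nhash; i++) {
		bit = bloom_hash(key, len, i) % b->nbits;
		b->bits[bit >> 3] |= (uint8_t)(1 << (bit & 7));
	}
}

/* Returns 0 if the key is definitely absent, 1 if it may be present. */
static int
bloom_maybe_contains(const struct bloom *b, const void *key, size_t len)
{
	uint64_t bit;
	uint32_t i;

	for (i = 0; i < b->nhash; i++) {
		bit = bloom_hash(key, len, i) % b->nbits;
		if ((b->bits[bit >> 3] & (1 << (bit & 7))) == 0)
			return (0);
	}
	return (1);
}

int
main(void)
{
	struct bloom b;
	uint64_t bit_count = 16, item_count = 1000;

	b.nbits = item_count * bit_count;
	b.nhash = 8;
	if ((b.bits = calloc((size_t)(b.nbits + 7) / 8, 1)) == NULL)
		return (1);
	bloom_insert(&b, "chunk-000001", 12);
	if (!bloom_maybe_contains(&b, "chunk-000001", 12))
		return (1);	/* false negatives are impossible */
	free(b.bits);
	return (0);
}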
/* * __drop_table -- * WT_SESSION::drop for a table. */ static int __drop_table( WT_SESSION_IMPL *session, const char *uri, const char *cfg[]) { WT_COLGROUP *colgroup; WT_DECL_RET; WT_INDEX *idx; WT_TABLE *table; u_int i; const char *name; bool tracked; WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_TABLE_WRITE)); name = uri; WT_PREFIX_SKIP_REQUIRED(session, name, "table:"); table = NULL; tracked = false; /* * Open the table so we can drop its column groups and indexes. * * Ideally we would keep the table locked exclusive across the drop, * but for now we rely on the global table lock to prevent the table * being reopened while it is being dropped. One issue is that the * WT_WITHOUT_LOCKS macro can drop and reacquire the global table lock, * avoiding deadlocks while waiting for LSM operation to quiesce. * * Temporarily getting the table exclusively serves the purpose * of ensuring that cursors on the table that are already open * must at least be closed before this call proceeds. */ WT_ERR(__wt_schema_get_table_uri(session, uri, true, WT_DHANDLE_EXCLUSIVE, &table)); WT_ERR(__wt_schema_release_table(session, table)); WT_ERR(__wt_schema_get_table_uri(session, uri, true, 0, &table)); /* Drop the column groups. */ for (i = 0; i < WT_COLGROUPS(table); i++) { if ((colgroup = table->cgroups[i]) == NULL) continue; /* * Drop the column group before updating the metadata to avoid * the metadata for the table becoming inconsistent if we can't * get exclusive access. */ WT_ERR(__wt_schema_drop(session, colgroup->source, cfg)); WT_ERR(__wt_metadata_remove(session, colgroup->name)); } /* Drop the indices. */ WT_ERR(__wt_schema_open_indices(session, table)); for (i = 0; i < table->nindices; i++) { if ((idx = table->indices[i]) == NULL) continue; /* * Drop the index before updating the metadata to avoid * the metadata for the table becoming inconsistent if we can't * get exclusive access. */ WT_ERR(__wt_schema_drop(session, idx->source, cfg)); WT_ERR(__wt_metadata_remove(session, idx->name)); } /* Make sure the table data handle is closed. */ WT_TRET(__wt_schema_release_table(session, table)); WT_ERR(__wt_schema_get_table_uri( session, uri, true, WT_DHANDLE_EXCLUSIVE, &table)); F_SET(&table->iface, WT_DHANDLE_DISCARD); if (WT_META_TRACKING(session)) { WT_WITH_DHANDLE(session, &table->iface, ret = __wt_meta_track_handle_lock(session, false)); WT_ERR(ret); tracked = true; } /* Remove the metadata entry (ignore missing items). */ WT_ERR(__wt_metadata_remove(session, uri)); err: if (table != NULL && !tracked) WT_TRET(__wt_schema_release_table(session, table)); return (ret); }
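From the application's side, all of the work in __drop_table is reached through WT_SESSION::drop. A minimal usage sketch against the public API follows; the home directory "WT_HOME" and the table name are illustrative, and the directory is assumed to exist.

#include <stdlib.h>
#include <wiredtiger.h>

int
main(void)
{
	WT_CONNECTION *conn;
	WT_SESSION *session;
	int ret;

	if ((ret = wiredtiger_open("WT_HOME", NULL, "create", &conn)) != 0)
		return (EXIT_FAILURE);
	if ((ret = conn->open_session(conn, NULL, NULL, &session)) != 0)
		goto done;

	/*
	 * Create and then drop a table; the drop removes the column groups,
	 * indexes and metadata entry, as the internal function above shows.
	 */
	if ((ret = session->create(session,
	    "table:access", "key_format=S,value_format=S")) != 0)
		goto done;
	ret = session->drop(session, "table:access", NULL);

done:	(void)conn->close(conn, NULL);
	return (ret == 0 ? EXIT_SUCCESS : EXIT_FAILURE);
}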
/* * __wt_evict_file -- * Discard pages for a specific file. */ int __wt_evict_file(WT_SESSION_IMPL *session, int syncop) { WT_DECL_RET; WT_PAGE *page; WT_REF *next_ref, *ref; int evict_reset; /* * We need exclusive access to the file -- disable ordinary eviction * and drain any blocks already queued. */ WT_RET(__wt_evict_file_exclusive_on(session, &evict_reset)); /* Make sure the oldest transaction ID is up-to-date. */ __wt_txn_update_oldest(session, 1); /* Walk the tree, discarding pages. */ next_ref = NULL; WT_ERR(__wt_tree_walk(session, &next_ref, NULL, WT_READ_CACHE | WT_READ_NO_EVICT)); while ((ref = next_ref) != NULL) { page = ref->page; /* * Eviction can fail when a page in the evicted page's subtree * switches state. For example, if we don't evict a page marked * empty, because we expect it to be merged into its parent, it * might no longer be empty after it's reconciled, in which case * eviction of its parent would fail. We can either walk the * tree multiple times (until it's finally empty), or reconcile * each page to get it to its final state before considering if * it's an eviction target or will be merged into its parent. * * Don't limit this test to any particular page type, that tends * to introduce bugs when the reconciliation of other page types * changes, and there's no advantage to doing so. * * Eviction can also fail because an update cannot be written. * If sessions have disjoint sets of files open, updates in a * no-longer-referenced file may not yet be globally visible, * and the write will fail with EBUSY. Our caller handles that * error, retrying later. */ if (syncop == WT_SYNC_CLOSE && __wt_page_is_modified(page)) WT_ERR(__wt_reconcile(session, ref, NULL, WT_EVICTING)); /* * We can't evict the page just returned to us (it marks our * place in the tree), so move the walk to one page ahead of * the page being evicted. Note, we reconciled the returned * page first: if reconciliation of that page were to change * the shape of the tree, and we did the next walk call before * the reconciliation, the next walk call could miss a page in * the tree. */ WT_ERR(__wt_tree_walk(session, &next_ref, NULL, WT_READ_CACHE | WT_READ_NO_EVICT)); switch (syncop) { case WT_SYNC_CLOSE: /* * Evict the page. */ WT_ERR(__wt_evict(session, ref, WT_EVICT_EXCLUSIVE)); break; case WT_SYNC_DISCARD: WT_ASSERT(session, __wt_page_can_evict(session, page, 0, NULL)); __wt_evict_page_clean_update(session, ref); break; case WT_SYNC_DISCARD_FORCE: /* * Forced discard of the page, whether clean or dirty. * If we see a dirty page in a forced discard, clean * the page, both to keep statistics correct, and to * let the page-discard function assert no dirty page * is ever discarded. */ if (__wt_page_is_modified(page)) { page->modify->write_gen = 0; __wt_cache_dirty_decr(session, page); } F_SET(session, WT_SESSION_DISCARD_FORCE); __wt_evict_page_clean_update(session, ref); F_CLR(session, WT_SESSION_DISCARD_FORCE); break; WT_ILLEGAL_VALUE_ERR(session); } } if (0) { err: /* On error, clear any left-over tree walk. */ if (next_ref != NULL) WT_TRET(__wt_page_release( session, next_ref, WT_READ_NO_EVICT)); } if (evict_reset) __wt_evict_file_exclusive_off(session); return (ret); }
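The comment above about moving the walk one page ahead before evicting is an instance of a general rule: advance the iterator past an element before destroying that element. A tiny generic sketch of the same pattern on a singly linked list (purely illustrative, not WiredTiger code):

#include <stdlib.h>

struct node {
	struct node *next;
};

/* Free a list, advancing the walk before discarding the current element. */
static void
free_list(struct node *head)
{
	struct node *next;

	for (; head != NULL; head = next) {
		next = head->next;	/* step ahead first */
		free(head);		/* now the current node can go */
	}
}

int
main(void)
{
	struct node *head, *np;
	int i;

	for (head = NULL, i = 0; i < 3; i++) {
		if ((np = malloc(sizeof(*np))) == NULL)
			return (1);
		np->next = head;
		head = np;
	}
	free_list(head);
	return (0);
}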
/* * __wt_btcur_prev -- * Move to the previous record in the tree. */ int __wt_btcur_prev(WT_CURSOR_BTREE *cbt, int truncating) { WT_DECL_RET; WT_PAGE *page; WT_SESSION_IMPL *session; uint32_t flags; int skipped, newpage; session = (WT_SESSION_IMPL *)cbt->iface.session; WT_STAT_FAST_CONN_INCR(session, cursor_prev); WT_STAT_FAST_DATA_INCR(session, cursor_prev); flags = WT_READ_PREV | WT_READ_SKIP_INTL; /* Tree walk flags. */ if (truncating) LF_SET(WT_READ_TRUNCATE); WT_RET(__cursor_func_init(cbt, 0)); /* * If we aren't already iterating in the right direction, there's * some setup to do. */ if (!F_ISSET(cbt, WT_CBT_ITERATE_PREV)) __wt_btcur_iterate_setup(cbt, 0); /* * Walk any page we're holding until the underlying call returns not- * found. Then, move to the previous page, until we reach the start * of the file. */ for (skipped = newpage = 0;; skipped = 0, newpage = 1) { page = cbt->ref == NULL ? NULL : cbt->ref->page; WT_ASSERT(session, page == NULL || !WT_PAGE_IS_INTERNAL(page)); /* * The last page in a column-store has appended entries. * We handle it separately from the usual cursor code: * it's only that one page and it's in a simple format. */ if (newpage && page != NULL && page->type != WT_PAGE_ROW_LEAF && (cbt->ins_head = WT_COL_APPEND(page)) != NULL) F_SET(cbt, WT_CBT_ITERATE_APPEND); if (F_ISSET(cbt, WT_CBT_ITERATE_APPEND)) { switch (page->type) { case WT_PAGE_COL_FIX: ret = __cursor_fix_append_prev(cbt, newpage); break; case WT_PAGE_COL_VAR: ret = __cursor_var_append_prev( cbt, newpage, &skipped); break; WT_ILLEGAL_VALUE_ERR(session); } if (ret == 0) break; F_CLR(cbt, WT_CBT_ITERATE_APPEND); if (ret != WT_NOTFOUND) break; newpage = 1; } if (page != NULL) { switch (page->type) { case WT_PAGE_COL_FIX: ret = __cursor_fix_prev(cbt, newpage); break; case WT_PAGE_COL_VAR: ret = __cursor_var_prev(cbt, newpage, &skipped); break; case WT_PAGE_ROW_LEAF: ret = __cursor_row_prev(cbt, newpage, &skipped); break; WT_ILLEGAL_VALUE_ERR(session); } if (ret != WT_NOTFOUND) break; } if (newpage && skipped) page->read_gen = WT_READGEN_OLDEST; WT_ERR(__wt_tree_walk(session, &cbt->ref, flags)); WT_ERR_TEST(cbt->ref == NULL, WT_NOTFOUND); } err: if (ret != 0) WT_TRET(__cursor_reset(cbt)); return (ret); }
/* * ex_aci -- * Append, change, insert in ex. */ static int ex_aci(SCR *sp, EXCMD *cmdp, enum which cmd) { CHAR_T *p, *t; GS *gp; TEXT *tp; TEXTH tiq[] = {{ 0 }}; recno_t cnt = 0, lno; size_t len; u_int32_t flags; int need_newline; gp = sp->gp; NEEDFILE(sp, cmdp); /* * If doing a change, replace lines for as long as possible. Then, * append more lines or delete remaining lines. Changes to an empty * file are appends, inserts are the same as appends to the previous * line. * * !!! * Set the address to which we'll append. We set sp->lno to this * address as well so that autoindent works correctly when get text * from the user. */ lno = cmdp->addr1.lno; sp->lno = lno; if ((cmd == CHANGE || cmd == INSERT) && lno != 0) --lno; /* * !!! * If the file isn't empty, cut changes into the unnamed buffer. */ if (cmd == CHANGE && cmdp->addr1.lno != 0 && (cut(sp, NULL, &cmdp->addr1, &cmdp->addr2, CUT_LINEMODE) || del(sp, &cmdp->addr1, &cmdp->addr2, 1))) return (1); /* * !!! * Anything that was left after the command separator becomes part * of the inserted text. Apparently, it was common usage to enter: * * :g/pattern/append|stuff1 * * and append the line of text "stuff1" to the lines containing the * pattern. It was also historically legal to enter: * * :append|stuff1 * stuff2 * . * * and the text on the ex command line would be appended as well as * the text inserted after it. There was an historic bug however, * that the user had to enter *two* terminating lines (the '.' lines) * to terminate text input mode, in this case. This whole thing * could be taken too far, however. Entering: * * :append|stuff1\ * stuff2 * stuff3 * . * * i.e. mixing and matching the forms confused the historic vi, and, * not only did it take two terminating lines to terminate text input * mode, but the trailing backslashes were retained on the input. We * match historic practice except that we discard the backslashes. * * Input lines specified on the ex command line lines are separated by * <newline>s. If there is a trailing delimiter an empty line was * inserted. There may also be a leading delimiter, which is ignored * unless it's also a trailing delimiter. It is possible to encounter * a termination line, i.e. a single '.', in a global command, but not * necessary if the text insert command was the last of the global * commands. */ if (cmdp->save_cmdlen != 0) { for (p = cmdp->save_cmd, len = cmdp->save_cmdlen; len > 0; p = t) { for (t = p; len > 0 && t[0] != '\n'; ++t, --len); if (t != p || len == 0) { if (F_ISSET(sp, SC_EX_GLOBAL) && t - p == 1 && p[0] == '.') { ++t; if (len > 0) --len; break; } if (db_append(sp, 1, lno++, p, t - p)) return (1); } if (len != 0) { ++t; if (--len == 0 && db_append(sp, 1, lno++, NULL, 0)) return (1); } } /* * If there's any remaining text, we're in a global, and * there's more command to parse. * * !!! * We depend on the fact that non-global commands will eat the * rest of the command line as text input, and before getting * any text input from the user. Otherwise, we'd have to save * off the command text before or during the call to the text * input function below. */ if (len != 0) cmdp->save_cmd = t; cmdp->save_cmdlen = len; } if (F_ISSET(sp, SC_EX_GLOBAL)) { if ((sp->lno = lno) == 0 && db_exist(sp, 1)) sp->lno = 1; return (0); } /* * If not in a global command, read from the terminal. * * If this code is called by vi, we want to reset the terminal and use * ex's line get routine. It actually works fine if we use vi's get * routine, but it doesn't look as nice. 
Maybe if we had a separate * window or something, but getting a line at a time looks awkward. * However, depending on the screen that we're using, that may not * be possible. */ if (F_ISSET(sp, SC_VI)) { if (gp->scr_screen(sp, SC_EX)) { ex_wemsg(sp, cmdp->cmd->name, EXM_NOCANON); return (1); } /* If we're still in the vi screen, move out explicitly. */ need_newline = !F_ISSET(sp, SC_SCR_EXWROTE); F_SET(sp, SC_SCR_EX | SC_SCR_EXWROTE); if (need_newline) (void)ex_puts(sp, "\n"); /* * !!! * Users of historical versions of vi sometimes get confused * when they enter append mode, and can't seem to get out of * it. Give them an informational message. */ (void)ex_puts(sp, msg_cat(sp, "273|Entering ex input mode.", NULL)); (void)ex_puts(sp, "\n"); (void)ex_fflush(sp); } /* * Set input flags; the ! flag turns off autoindent for append, * change and insert. */ LF_INIT(TXT_DOTTERM | TXT_NUMBER); if (!FL_ISSET(cmdp->iflags, E_C_FORCE) && O_ISSET(sp, O_AUTOINDENT)) LF_SET(TXT_AUTOINDENT); if (O_ISSET(sp, O_BEAUTIFY)) LF_SET(TXT_BEAUTIFY); /* * This code can't use the common screen TEXTH structure (sp->tiq), * as it may already be in use, e.g. ":append|s/abc/ABC/" would fail * as we are only halfway through the text when the append code fires. * Use a local structure instead. (The ex code would have to use a * local structure except that we're guaranteed to finish remaining * characters in the common TEXTH structure when they were inserted * into the file, above.) */ TAILQ_INIT(tiq); if (ex_txt(sp, tiq, 0, flags)) return (1); TAILQ_FOREACH(tp, tiq, q) { if (db_append(sp, 1, lno++, tp->lb, tp->len)) return (1); ++cnt; } /* * Set sp->lno to the final line number value (correcting for a * possible 0 value) as that's historically correct for the final * line value, whether or not the user entered any text. */ if ((sp->lno = lno) == 0 && db_exist(sp, 1)) sp->lno = 1; return (0); }
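The command-line text split described in the long comment above (lines separated by <newline>s, a trailing delimiter producing an empty line, a leading delimiter ignored unless it is also trailing) can be isolated into a small sketch. The standalone program below mimics that split; db_append is replaced by a printf and all names are illustrative.

#include <stdio.h>
#include <string.h>

/* Split newline-separated command-line text into appended lines. */
static void
append_lines(const char *cmd, size_t len)
{
	const char *p, *t;

	for (p = cmd; len > 0; p = t) {
		for (t = p; len > 0 && *t != '\n'; ++t, --len)
			;
		if (t != p || len == 0)
			(void)printf("append: \"%.*s\"\n", (int)(t - p), p);
		if (len != 0) {
			++t;		/* consume the delimiter */
			if (--len == 0)	/* trailing delimiter: empty line */
				(void)printf("append: \"\"\n");
		}
	}
}

int
main(void)
{
	const char *text = "stuff1\nstuff2\n";

	append_lines(text, strlen(text));
	return (0);
}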
/* * __wt_bm_read -- * Map or read address cookie referenced block into a buffer. */ int __wt_bm_read(WT_BM *bm, WT_SESSION_IMPL *session, WT_ITEM *buf, const uint8_t *addr, uint32_t addr_size) { WT_BLOCK *block; off_t offset; uint32_t size, cksum; int mapped; WT_UNUSED(addr_size); block = bm->block; /* Crack the cookie. */ WT_RET(__wt_block_buffer_to_addr(block, addr, &offset, &size, &cksum)); /* * Clear buffers previously used for mapped memory, we may be forced * to read into this buffer. */ if (F_ISSET(buf, WT_ITEM_MAPPED)) __wt_buf_free(session, buf); /* * If we're going to be able to return mapped memory and the buffer * has allocated memory, discard it. */ mapped = bm->map != NULL && offset + size <= (off_t)bm->maplen; if (buf->mem != NULL && mapped) __wt_buf_free(session, buf); /* Map the block if it's possible. */ if (mapped) { buf->mem = (uint8_t *)bm->map + offset; buf->memsize = size; buf->data = buf->mem; buf->size = size; F_SET(buf, WT_ITEM_MAPPED); WT_RET(__wt_mmap_preload(session, buf->mem, buf->size)); WT_CSTAT_INCR(session, block_map_read); WT_CSTAT_INCRV(session, block_byte_map_read, size); return (0); } /* Read the block. */ WT_RET(__wt_block_read_off(session, block, buf, offset, size, cksum)); #ifdef HAVE_POSIX_FADVISE /* Optionally discard blocks from the system's buffer cache. */ if (block->os_cache_max != 0 && (block->os_cache += size) > block->os_cache_max) { WT_DECL_RET; block->os_cache = 0; if ((ret = posix_fadvise(block->fh->fd, (off_t)0, (off_t)0, POSIX_FADV_DONTNEED)) != 0) WT_RET_MSG( session, ret, "%s: posix_fadvise", block->name); } #endif return (0); }
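The HAVE_POSIX_FADVISE branch above keeps a running byte count per file and, once it crosses os_cache_max, asks the operating system to drop its cached pages for the whole file. Below is a self-contained sketch of that idea, assuming only POSIX pread() and posix_fadvise(); the helper name and the 1MB threshold in main() are illustrative.

#include <fcntl.h>
#include <unistd.h>

/*
 * Read a block at an offset; once more than cache_max bytes have been read
 * through this descriptor, advise the OS to discard its cached pages.  The
 * running total mirrors block->os_cache, the limit block->os_cache_max.
 */
ssize_t
read_block(int fd, void *buf, size_t size, off_t offset,
    size_t *os_cache, size_t cache_max)
{
	ssize_t nr;

	if ((nr = pread(fd, buf, size, offset)) < 0)
		return (-1);

#ifdef POSIX_FADV_DONTNEED
	if (cache_max != 0 && (*os_cache += size) > cache_max) {
		*os_cache = 0;
		/* Advisory only; ignore failure in this sketch. */
		(void)posix_fadvise(fd, (off_t)0, (off_t)0,
		    POSIX_FADV_DONTNEED);
	}
#endif
	return (nr);
}

int
main(int argc, char *argv[])
{
	char buf[4096];
	size_t running;
	int fd;

	running = 0;
	if (argc < 2 || (fd = open(argv[1], O_RDONLY)) < 0)
		return (1);
	if (read_block(fd, buf, sizeof(buf), 0, &running, 1 << 20) < 0)
		return (1);
	return (close(fd) == -1);
}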
/* * mark -- * Mark commands. */ static int mark(SCR *sp, VICMD *vp, int getmark, enum which cmd) { dir_t dir; MARK m; size_t len; if (getmark && mark_get(sp, vp->character, &vp->m_stop, M_BERR)) return (1); /* * !!! * Historically, BQMARKS for character positions that no longer * existed acted as FQMARKS. * * FQMARKS move to the first non-blank. */ switch (cmd) { case BQMARK: if (db_get(sp, vp->m_stop.lno, DBG_FATAL, NULL, &len)) return (1); if (vp->m_stop.cno < len || (vp->m_stop.cno == len && len == 0)) break; if (ISMOTION(vp)) F_SET(vp, VM_LMODE); cmd = FQMARK; /* FALLTHROUGH */ case FQMARK: vp->m_stop.cno = 0; if (nonblank(sp, vp->m_stop.lno, &vp->m_stop.cno)) return (1); break; default: abort(); } /* Non-motion commands move to the end of the range. */ if (!ISMOTION(vp)) { vp->m_final = vp->m_stop; return (0); } /* * !!! * If a motion component to a BQMARK, the cursor has to move. */ if (cmd == BQMARK && vp->m_stop.lno == vp->m_start.lno && vp->m_stop.cno == vp->m_start.cno) { v_nomove(sp); return (1); } /* * If the motion is in the reverse direction, switch the start and * stop MARK's so that it's in a forward direction. (There's no * reason for this other than to make the tests below easier. The * code in vi.c:vi() would have done the switch.) Both forward * and backward motions can happen for any kind of search command. */ if (vp->m_start.lno > vp->m_stop.lno || (vp->m_start.lno == vp->m_stop.lno && vp->m_start.cno > vp->m_stop.cno)) { m = vp->m_start; vp->m_start = vp->m_stop; vp->m_stop = m; dir = BACKWARD; } else dir = FORWARD; /* * Yank cursor motion, when associated with marks as motion commands, * historically behaved as follows: * * ` motion ' motion * Line change? Line change? * Y N Y N * -------------- --------------- * FORWARD: | NM NM | NM NM * | | * BACKWARD: | M M | M NM(1) * * where NM means the cursor didn't move, and M means the cursor * moved to the mark. * * As the cursor was usually moved for yank commands associated * with backward motions, this implementation regularizes it by * changing the NM at position (1) to be an M. This makes mark * motions match search motions, which is probably A Good Thing. * * Delete cursor motion was always to the start of the text region, * regardless. Ignore other motion commands. */ #ifdef HISTORICAL_PRACTICE if (ISCMD(vp->rkp, 'y')) { if ((cmd == BQMARK || (cmd == FQMARK && vp->m_start.lno != vp->m_stop.lno)) && (vp->m_start.lno > vp->m_stop.lno || (vp->m_start.lno == vp->m_stop.lno && vp->m_start.cno > vp->m_stop.cno))) vp->m_final = vp->m_stop; } else if (ISCMD(vp->rkp, 'd')) if (vp->m_start.lno > vp->m_stop.lno || (vp->m_start.lno == vp->m_stop.lno && vp->m_start.cno > vp->m_stop.cno)) vp->m_final = vp->m_stop; #else vp->m_final = vp->m_start; #endif /* * Forward marks are always line oriented, and it's set in the * vcmd.c table. */ if (cmd == FQMARK) return (0); /* * BQMARK'S moving backward and starting at column 0, and ones moving * forward and ending at column 0 are corrected to the last column of * the previous line. Otherwise, adjust the starting/ending point to * the character before the current one (this is safe because we know * the search had to move to succeed). * * Mark motions become line mode opertions if they start at the first * nonblank and end at column 0 of another line. */ if (vp->m_start.lno < vp->m_stop.lno && vp->m_stop.cno == 0) { if (db_get(sp, --vp->m_stop.lno, DBG_FATAL, NULL, &len)) return (1); vp->m_stop.cno = len ? 
len - 1 : 0; len = 0; if (nonblank(sp, vp->m_start.lno, &len)) return (1); if (vp->m_start.cno <= len) F_SET(vp, VM_LMODE); } else --vp->m_stop.cno; return (0); }
/* maybe this could be simpler * * DB3 behaves differently from DB1 * * if lno != 0 just go to lno and put the new line after it * if lno == 0 then if there are any record, put in front of the first * otherwise just append to the end thus creating the first * line */ static int append(SCR *sp, db_recno_t lno, const CHAR_T *p, size_t len, lnop_t op, int update) { DBT data, key; DBC *dbcp_put; EXF *ep; const char *fp; size_t flen; int rval; /* Check for no underlying file. */ if ((ep = sp->ep) == NULL) { ex_emsg(sp, NULL, EXM_NOFILEYET); return (1); } if (ep->l_win && ep->l_win != sp->wp) { ex_emsg(sp, NULL, EXM_LOCKED); return 1; } /* Log before change. */ log_line(sp, lno + 1, LOG_LINE_APPEND_B); /* Update file. */ memset(&key, 0, sizeof(key)); key.data = &lno; key.size = sizeof(lno); memset(&data, 0, sizeof(data)); if ((sp->db_error = ep->db->cursor(ep->db, NULL, &dbcp_put, 0)) != 0) return 1; INT2FILE(sp, p, len, fp, flen); if (lno != 0) { if ((sp->db_error = dbcp_put->c_get(dbcp_put, &key, &data, DB_SET)) != 0) goto err2; data.data = __UNCONST(fp); data.size = flen; if ((sp->db_error = dbcp_put->c_put(dbcp_put, &key, &data, DB_AFTER)) != 0) { err2: (void)dbcp_put->c_close(dbcp_put); msgq(sp, M_DBERR, (op == LINE_APPEND) ? "004|unable to append to line %lu" : "005|unable to insert at line %lu", (u_long)lno); return (1); } } else { if ((sp->db_error = dbcp_put->c_get(dbcp_put, &key, &data, DB_FIRST)) != 0) { if (sp->db_error != DB_NOTFOUND) goto err2; data.data = __UNCONST(fp); data.size = flen; if ((sp->db_error = ep->db->put(ep->db, NULL, &key, &data, DB_APPEND)) != 0) { goto err2; } } else { key.data = &lno; key.size = sizeof(lno); data.data = __UNCONST(fp); data.size = flen; if ((sp->db_error = dbcp_put->c_put(dbcp_put, &key, &data, DB_BEFORE)) != 0) { goto err2; } } } (void)dbcp_put->c_close(dbcp_put); /* Flush the cache, update line count, before screen update. */ update_cache(sp, LINE_INSERT, lno); /* File now dirty. */ if (F_ISSET(ep, F_FIRSTMODIFY)) (void)rcv_init(sp); F_SET(ep, F_MODIFIED); /* Log after change. */ log_line(sp, lno + 1, LOG_LINE_APPEND_F); /* Update marks, @ and global commands. */ rval = line_insdel(sp, LINE_INSERT, lno + 1); /* * Update screen. * * comment copied from db_append * XXX * Nasty hack. If multiple lines are input by the user, they aren't * committed until an <ESC> is entered. The problem is the screen was * updated/scrolled as each line was entered. So, when this routine * is called to copy the new lines from the cut buffer into the file, * it has to know not to update the screen again. */ return (scr_update(sp, lno + 1, LINE_INSERT, update) || rval); }
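Here is a hedged sketch of the cursor dance described in the comment above, targeting the classic Berkeley DB 4.x DBC interface (c_get/c_put): position on the record and insert after it, insert before the first record, or append to an empty recno database. The function name and error handling are illustrative; this is not the nvi code itself.

#include <string.h>
#include <db.h>

/* Insert "text" after record "lno" (before the first record when lno == 0). */
int
recno_insert(DB *db, db_recno_t lno, char *text, size_t len)
{
	DBC *dbc;
	DBT key, data;
	int ret;

	memset(&key, 0, sizeof(key));
	memset(&data, 0, sizeof(data));
	key.data = &lno;
	key.size = sizeof(lno);

	if ((ret = db->cursor(db, NULL, &dbc, 0)) != 0)
		return (ret);

	if (lno != 0) {
		/* Position on lno, then insert the new record after it. */
		if ((ret = dbc->c_get(dbc, &key, &data, DB_SET)) == 0) {
			data.data = text;
			data.size = (u_int32_t)len;
			ret = dbc->c_put(dbc, &key, &data, DB_AFTER);
		}
	} else if ((ret = dbc->c_get(dbc, &key, &data, DB_FIRST)) == 0) {
		/* Records exist: insert before the first one. */
		data.data = text;
		data.size = (u_int32_t)len;
		ret = dbc->c_put(dbc, &key, &data, DB_BEFORE);
	} else if (ret == DB_NOTFOUND) {
		/* Empty file: append, creating the first record. */
		data.data = text;
		data.size = (u_int32_t)len;
		ret = db->put(db, NULL, &key, &data, DB_APPEND);
	}

	(void)dbc->c_close(dbc);
	return (ret);
}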
/* * __ckpt_process -- * Process the list of checkpoints. */ static int __ckpt_process(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_CKPT *ckptbase) { WT_BLOCK_CKPT *a, *b, *ci; WT_CKPT *ckpt, *next_ckpt; WT_DECL_ITEM(tmp); WT_DECL_RET; uint64_t ckpt_size; bool deleting, fatal, locked; ci = &block->live; fatal = locked = false; #ifdef HAVE_DIAGNOSTIC WT_RET(__ckpt_verify(session, ckptbase)); #endif /* * Checkpoints are a two-step process: first, write a new checkpoint to * disk (including all the new extent lists for modified checkpoints * and the live system). As part of this, create a list of file blocks * newly available for reallocation, based on checkpoints being deleted. * We then return the locations of the new checkpoint information to our * caller. Our caller has to write that information into some kind of * stable storage, and once that's done, we can actually allocate from * that list of newly available file blocks. (We can't allocate from * that list immediately because the allocation might happen before our * caller saves the new checkpoint information, and if we crashed before * the new checkpoint location was saved, we'd have overwritten blocks * still referenced by checkpoints in the system.) In summary, there is * a second step: after our caller saves the checkpoint information, we * are called to add the newly available blocks into the live system's * available list. * * This function is the first step, the second step is in the resolve * function. * * If we're called to checkpoint the same file twice (without the second * resolution step), or re-entered for any reason, it's an error in our * caller, and our choices are all bad: leak blocks or potentially crash * with our caller not yet having saved previous checkpoint information * to stable storage. */ __wt_spin_lock(session, &block->live_lock); switch (block->ckpt_state) { case WT_CKPT_INPROGRESS: block->ckpt_state = WT_CKPT_PANIC_ON_FAILURE; break; case WT_CKPT_NONE: case WT_CKPT_PANIC_ON_FAILURE: __wt_err(session, EINVAL, "%s: an unexpected checkpoint attempt: the checkpoint " "was never started or has already completed", block->name); ret = __wt_block_panic(session); break; case WT_CKPT_SALVAGE: /* Salvage doesn't use the standard checkpoint APIs. */ break; } __wt_spin_unlock(session, &block->live_lock); WT_RET(ret); /* * Extents newly available as a result of deleting previous checkpoints * are added to a list of extents. The list should be empty, but as * described above, there is no "free the checkpoint information" call * into the block manager; if there was an error in an upper level that * resulted in some previous checkpoint never being resolved, the list * may not be empty. We should have caught that with the "checkpoint * in progress" test, but it doesn't cost us anything to be cautious. * * We free the checkpoint's allocation and discard extent lists as part * of the resolution step, not because they're needed at that time, but * because it's potentially a lot of work, and waiting allows the btree * layer to continue eviction sooner. As for the checkpoint-available * list, make sure they get cleaned out. 
*/ __wt_block_extlist_free(session, &ci->ckpt_avail); WT_RET(__wt_block_extlist_init( session, &ci->ckpt_avail, "live", "ckpt_avail", true)); __wt_block_extlist_free(session, &ci->ckpt_alloc); __wt_block_extlist_free(session, &ci->ckpt_discard); /* * To delete a checkpoint, we'll need checkpoint information for it and * the subsequent checkpoint into which it gets rolled; read them from * disk before we lock things down. */ deleting = false; WT_CKPT_FOREACH(ckptbase, ckpt) { if (F_ISSET(ckpt, WT_CKPT_FAKE) || !F_ISSET(ckpt, WT_CKPT_DELETE)) continue; deleting = true; /* * Read the checkpoint and next checkpoint extent lists if we * haven't already read them (we may have already read these * extent blocks if there is more than one deleted checkpoint). */ if (ckpt->bpriv == NULL) WT_ERR(__ckpt_extlist_read(session, block, ckpt)); for (next_ckpt = ckpt + 1;; ++next_ckpt) if (!F_ISSET(next_ckpt, WT_CKPT_FAKE)) break; /* * The "next" checkpoint may be the live tree which has no * extent blocks to read. */ if (next_ckpt->bpriv == NULL && !F_ISSET(next_ckpt, WT_CKPT_ADD)) WT_ERR(__ckpt_extlist_read(session, block, next_ckpt)); } /* * Failures are now fatal: we can't currently back out the merge of any * deleted checkpoint extent lists into the live system's extent lists, * so continuing after error would leave the live system's extent lists * corrupted for any subsequent checkpoint (and potentially, should a * subsequent checkpoint succeed, for recovery). */ fatal = true; /* * Hold a lock so the live extent lists and the file size can't change * underneath us. I suspect we'll tighten this if checkpoints take too * much time away from real work: we read the historic checkpoint * information without a lock, but we could also merge and re-write the * deleted and merged checkpoint information without a lock, except for * the final merge of ranges into the live tree. */ __wt_spin_lock(session, &block->live_lock); locked = true; /* * We've allocated our last page, update the checkpoint size. We need * to calculate the live system's checkpoint size before merging * checkpoint allocation and discard information from the checkpoints * we're deleting, those operations change the underlying byte counts. */ ckpt_size = ci->ckpt_size; ckpt_size += ci->alloc.bytes; ckpt_size -= ci->discard.bytes; /* Skip the additional processing if we aren't deleting checkpoints. */ if (!deleting) goto live_update; /* * Delete any no-longer-needed checkpoints: we do this first as it frees * blocks to the live lists, and the freed blocks will then be included * when writing the live extent lists. */ WT_CKPT_FOREACH(ckptbase, ckpt) { if (F_ISSET(ckpt, WT_CKPT_FAKE) || !F_ISSET(ckpt, WT_CKPT_DELETE)) continue; if (WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT)) { if (tmp == NULL) WT_ERR(__wt_scr_alloc(session, 0, &tmp)); WT_ERR(__ckpt_string( session, block, ckpt->raw.data, tmp)); __wt_verbose(session, WT_VERB_CHECKPOINT, "%s: delete-checkpoint: %s: %s", block->name, ckpt->name, (const char *)tmp->data); } /* * Find the checkpoint into which we'll roll this checkpoint's * blocks: it's the next real checkpoint in the list, and it * better have been read in (if it's not the add slot). */ for (next_ckpt = ckpt + 1;; ++next_ckpt) if (!F_ISSET(next_ckpt, WT_CKPT_FAKE)) break; /* * Set the from/to checkpoint structures, where the "to" value * may be the live tree. 
*/ a = ckpt->bpriv; if (F_ISSET(next_ckpt, WT_CKPT_ADD)) b = &block->live; else b = next_ckpt->bpriv; /* * Free the root page: there's nothing special about this free, * the root page is allocated using normal rules, that is, it * may have been taken from the avail list, and was entered on * the live system's alloc list at that time. We free it into * the checkpoint's discard list, however, not the live system's * list because it appears on the checkpoint's alloc list and so * must be paired in the checkpoint. */ if (a->root_offset != WT_BLOCK_INVALID_OFFSET) WT_ERR(__wt_block_insert_ext(session, block, &a->discard, a->root_offset, a->root_size)); /* * Free the blocks used to hold the "from" checkpoint's extent * lists, including the avail list. */ WT_ERR(__ckpt_extlist_fblocks(session, block, &a->alloc)); WT_ERR(__ckpt_extlist_fblocks(session, block, &a->avail)); WT_ERR(__ckpt_extlist_fblocks(session, block, &a->discard)); /* * Roll the "from" alloc and discard extent lists into the "to" * checkpoint's lists. */ if (a->alloc.entries != 0) WT_ERR(__wt_block_extlist_merge( session, block, &a->alloc, &b->alloc)); if (a->discard.entries != 0) WT_ERR(__wt_block_extlist_merge( session, block, &a->discard, &b->discard)); /* * If the "to" checkpoint is also being deleted, we're done with * it, it's merged into some other checkpoint in the next loop. * This means the extent lists may aggregate over a number of * checkpoints, but that's OK, they're disjoint sets of ranges. */ if (F_ISSET(next_ckpt, WT_CKPT_DELETE)) continue; /* * Find blocks for re-use: wherever the "to" checkpoint's * allocate and discard lists overlap, move the range to * the live system's checkpoint available list. */ WT_ERR(__wt_block_extlist_overlap(session, block, b)); /* * If we're updating the live system's information, we're done. */ if (F_ISSET(next_ckpt, WT_CKPT_ADD)) continue; /* * We have to write the "to" checkpoint's extent lists out in * new blocks, and update its cookie. * * Free the blocks used to hold the "to" checkpoint's extent * lists; don't include the avail list, it's not changing. */ WT_ERR(__ckpt_extlist_fblocks(session, block, &b->alloc)); WT_ERR(__ckpt_extlist_fblocks(session, block, &b->discard)); F_SET(next_ckpt, WT_CKPT_UPDATE); } /* Update checkpoints marked for update. */ WT_CKPT_FOREACH(ckptbase, ckpt) if (F_ISSET(ckpt, WT_CKPT_UPDATE)) WT_ERR(__ckpt_update( session, block, ckpt, ckpt->bpriv, false)); live_update: /* Truncate the file if that's possible. */ WT_ERR(__wt_block_extlist_truncate(session, block, &ci->avail)); /* Update the final, added checkpoint based on the live system. */ WT_CKPT_FOREACH(ckptbase, ckpt) if (F_ISSET(ckpt, WT_CKPT_ADD)) { /* * !!! * Our caller wants the final checkpoint size. Setting * the size here violates layering, but the alternative * is a call for the btree layer to crack the checkpoint * cookie into its components, and that's a fair amount * of work. */ ckpt->ckpt_size = ckpt_size; /* * Set the rolling checkpoint size for the live system. * The current size includes the current checkpoint's * root page size (root pages are on the checkpoint's * block allocation list as root pages are allocated * with the usual block allocation functions). That's * correct, but we don't want to include it in the size * for the next checkpoint. */ ckpt_size -= ci->root_size; /* * Additionally, we had a bug for awhile where the live * checkpoint size grew without bound. We can't sanity * check the value, that would require walking the tree * as part of the checkpoint. 
Bound any bug at the size * of the file. * It isn't practical to assert that the value is within * bounds since databases created with older versions * of WiredTiger (2.8.0) would likely see an error. */ ci->ckpt_size = WT_MIN(ckpt_size, (uint64_t)block->size); WT_ERR(__ckpt_update(session, block, ckpt, ci, true)); } /* * Reset the live system's alloc and discard extent lists, leave the * avail list alone. This includes freeing a lot of extents, so do it * outside of the system's lock by copying and resetting the original, * then doing the work later. */ ci->ckpt_alloc = ci->alloc; WT_ERR(__wt_block_extlist_init( session, &ci->alloc, "live", "alloc", false)); ci->ckpt_discard = ci->discard; WT_ERR(__wt_block_extlist_init( session, &ci->discard, "live", "discard", false)); #ifdef HAVE_DIAGNOSTIC /* * The first checkpoint in the system should always have an empty * discard list. If we've read that checkpoint and/or created it, * check. */ WT_CKPT_FOREACH(ckptbase, ckpt) if (!F_ISSET(ckpt, WT_CKPT_DELETE)) break; if ((a = ckpt->bpriv) == NULL) a = &block->live; if (a->discard.entries != 0) WT_ERR_MSG(session, WT_ERROR, "first checkpoint incorrectly has blocks on the discard " "list"); #endif err: if (ret != 0 && fatal) { __wt_err(session, ret, "%s: fatal checkpoint failure", block->name); ret = __wt_block_panic(session); } if (locked) __wt_spin_unlock(session, &block->live_lock); /* Discard any checkpoint information we loaded. */ WT_CKPT_FOREACH(ckptbase, ckpt) if ((ci = ckpt->bpriv) != NULL) __wt_block_ckpt_destroy(session, ci); __wt_scr_free(session, &tmp); return (ret); }
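The checkpoint-size bookkeeping in __ckpt_process is simple arithmetic but easy to misread in context: the new checkpoint's size is the previous live size plus newly allocated bytes minus discarded bytes, the rolling size for the next checkpoint then drops the current root page, and (because of the historic bug mentioned above) the result is bounded by the file size. A worked example with made-up numbers:

#include <inttypes.h>
#include <stdio.h>

int
main(void)
{
	uint64_t alloc_bytes, ckpt_size, discard_bytes, file_size;
	uint64_t live_ckpt_size, rolling, root_size;

	live_ckpt_size = 8 << 20;	/* previous live checkpoint size */
	alloc_bytes = 3 << 20;		/* ci->alloc.bytes */
	discard_bytes = 1 << 20;	/* ci->discard.bytes */
	root_size = 4096;		/* ci->root_size */
	file_size = 64 << 20;		/* block->size */

	ckpt_size = live_ckpt_size + alloc_bytes - discard_bytes;
	rolling = ckpt_size - root_size;
	if (rolling > file_size)	/* the WT_MIN bound */
		rolling = file_size;

	(void)printf("checkpoint size %" PRIu64
	    ", next live size %" PRIu64 "\n", ckpt_size, rolling);
	return (0);
}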
/* * __snapshot_process -- * Process the list of snapshots. */ static int __snapshot_process( WT_SESSION_IMPL *session, WT_BLOCK *block, WT_SNAPSHOT *snapbase) { WT_BLOCK_SNAPSHOT *a, *b, *si; WT_DECL_ITEM(tmp); WT_DECL_RET; WT_SNAPSHOT *snap; uint64_t snapshot_size; int deleting, locked; si = &block->live; locked = 0; /* * We've allocated our last page, update the snapshot size. We need to * calculate the live system's snapshot size before reading and merging * snapshot allocation and discard information from the snapshots we're * deleting, those operations will change the underlying byte counts. */ snapshot_size = si->snapshot_size; snapshot_size += si->alloc.bytes; snapshot_size -= si->discard.bytes; /* * Extents that become newly available as a result of deleting previous * snapshots are added to a list of extents. The list should be empty, * but there's no explicit "free the snapshot information" call into the * block manager; if there was an error in an upper level resulting in * the snapshot never being "resolved", the list might not be empty. * * XXX * This isn't sufficient, actually: we're going to leak all the blocks * that were written as part of the last snapshot because it was never * resolved. */ __wt_block_extlist_free(session, &si->snapshot_avail); WT_RET(__wt_block_extlist_init( session, &si->snapshot_avail, "live", "snapshot_avail")); /* * To delete a snapshot, we'll need snapshot information for it, and we * have to read that from the disk. */ deleting = 0; WT_SNAPSHOT_FOREACH(snapbase, snap) { /* * To delete a snapshot, we'll need snapshot information for it * and the subsequent snapshot. The test is tricky, we have to * load the current snapshot's information if it's marked for * deletion, or if it follows a snapshot marked for deletion, * where the boundary cases are the first snapshot in the list * and the last snapshot in the list: if we're deleting the last * snapshot in the list, there's no next snapshot, the snapshot * will be merged into the live tree. */ if (!F_ISSET(snap, WT_SNAP_DELETE) && (snap == snapbase || F_ISSET(snap, WT_SNAP_ADD) || !F_ISSET(snap - 1, WT_SNAP_DELETE))) continue; deleting = 1; /* * Allocate a snapshot structure, crack the cookie and read the * snapshot's extent lists. * * Ignore the avail list: snapshot avail lists are only useful * if we are rolling forward from the particular snapshot and * they represent our best understanding of what blocks can be * allocated. If we are not operating on the live snapshot, * subsequent snapshots might have allocated those blocks, and * the avail list is useless. We don't discard it, because it * is useful as part of verification, but we don't re-write it * either. */ WT_ERR(__wt_calloc( session, 1, sizeof(WT_BLOCK_SNAPSHOT), &snap->bpriv)); si = snap->bpriv; WT_ERR(__wt_block_snap_init(session, block, si, snap->name, 0)); WT_ERR(__wt_block_buffer_to_snapshot( session, block, snap->raw.data, si)); WT_ERR(__wt_block_extlist_read(session, block, &si->alloc)); WT_ERR(__wt_block_extlist_read(session, block, &si->discard)); } /* * Hold a lock so the live extent lists and the file size can't change * underneath us. I suspect we'll tighten this if snapshots take too * much time away from real work: we read historic snapshot information * without a lock, but we could also merge and re-write the delete * snapshot information without a lock, except for ranges merged into * the live tree. */ __wt_spin_lock(session, &block->live_lock); locked = 1; /* Skip the additional processing if we aren't deleting snapshots. 
*/ if (!deleting) goto live_update; /* * Delete any no-longer-needed snapshots: we do this first as it frees * blocks to the live lists, and the freed blocks will then be included * when writing the live extent lists. */ WT_SNAPSHOT_FOREACH(snapbase, snap) { if (!F_ISSET(snap, WT_SNAP_DELETE)) continue; if (WT_VERBOSE_ISSET(session, snapshot)) { if (tmp == NULL) WT_ERR(__wt_scr_alloc(session, 0, &tmp)); WT_ERR(__snapshot_string( session, block, snap->raw.data, tmp)); WT_VERBOSE_ERR(session, snapshot, "%s: delete-snapshot: %s: %s", block->name, snap->name, (char *)tmp->data); } /* * Set the from/to snapshot structures, where the "to" value * may be the live tree. */ a = snap->bpriv; if (F_ISSET(snap + 1, WT_SNAP_ADD)) b = &block->live; else b = (snap + 1)->bpriv; /* * Free the root page: there's nothing special about this free, * the root page is allocated using normal rules, that is, it * may have been taken from the avail list, and was entered on * the live system's alloc list at that time. We free it into * the snapshot's discard list, however, not the live system's * list because it appears on the snapshot's alloc list and so * must be paired in the snapshot. */ if (a->root_offset != WT_BLOCK_INVALID_OFFSET) WT_ERR(__wt_block_insert_ext(session, &a->discard, a->root_offset, a->root_size)); /* * Free the blocks used to hold the "from" snapshot's extent * lists directly to the live system's avail list, they were * never on any alloc list. Include the "from" snapshot's * avail list, it's going away. */ WT_ERR(__snapshot_extlist_fblocks(session, block, &a->alloc)); WT_ERR(__snapshot_extlist_fblocks(session, block, &a->avail)); WT_ERR(__snapshot_extlist_fblocks(session, block, &a->discard)); /* * Roll the "from" alloc and discard extent lists into the "to" * snapshot's lists. */ if (a->alloc.entries != 0) WT_ERR(__wt_block_extlist_merge( session, &a->alloc, &b->alloc)); if (a->discard.entries != 0) WT_ERR(__wt_block_extlist_merge( session, &a->discard, &b->discard)); /* * If the "to" snapshot is also being deleted, we're done with * it, it's merged into some other snapshot in the next loop. * This means the extent lists may aggregate over a number of * snapshots, but that's OK, they're disjoint sets of ranges. */ if (F_ISSET(snap + 1, WT_SNAP_DELETE)) continue; /* * Find blocks for re-use: wherever the "to" snapshot's allocate * and discard lists overlap is fair game, move ranges appearing * on both lists to the live snapshot's newly available list. */ WT_ERR(__wt_block_extlist_overlap(session, block, b)); /* * If we're updating the live system's information, we're done. */ if (F_ISSET(snap + 1, WT_SNAP_ADD)) continue; /* * We have to write the "to" snapshot's extent lists out in new * blocks, and update its cookie. * * Free the blocks used to hold the "to" snapshot's extent lists * directly to the live system's avail list, they were never on * any alloc list. Do not include the "to" snapshot's avail * list, it's not changing. */ WT_ERR(__snapshot_extlist_fblocks(session, block, &b->alloc)); WT_ERR(__snapshot_extlist_fblocks(session, block, &b->discard)); F_SET(snap + 1, WT_SNAP_UPDATE); } /* Update snapshots marked for update. */ WT_SNAPSHOT_FOREACH(snapbase, snap) if (F_ISSET(snap, WT_SNAP_UPDATE)) { WT_ASSERT(session, !F_ISSET(snap, WT_SNAP_ADD)); WT_ERR(__snapshot_update( session, block, snap, snap->bpriv, 0, 0)); } live_update: si = &block->live; /* Truncate the file if that's possible. 
*/ WT_ERR(__wt_block_extlist_truncate(session, block, &si->avail)); /* Update the final, added snapshot based on the live system. */ WT_SNAPSHOT_FOREACH(snapbase, snap) if (F_ISSET(snap, WT_SNAP_ADD)) { WT_ERR(__snapshot_update( session, block, snap, si, snapshot_size, 1)); /* * XXX * Our caller wants two pieces of information: the time * the snapshot was taken and the final snapshot size. * This violates layering but the alternative is a call * for the btree layer to crack the snapshot cookie into * its components, and that's a fair amount of work. * (We could just read the system time in the session * layer when updating the metadata file, but that won't * work for the snapshot size, and so we do both here.) */ snap->snapshot_size = si->snapshot_size; WT_ERR(__wt_epoch(session, &snap->sec, NULL)); } /* * Reset the live system's alloc and discard extent lists, leave the * avail list alone. */ __wt_block_extlist_free(session, &si->alloc); WT_ERR(__wt_block_extlist_init(session, &si->alloc, "live", "alloc")); __wt_block_extlist_free(session, &si->discard); WT_ERR( __wt_block_extlist_init(session, &si->discard, "live", "discard")); #ifdef HAVE_DIAGNOSTIC /* * The first snapshot in the system should always have an empty discard * list. If we've read that snapshot and/or created it, check. */ WT_SNAPSHOT_FOREACH(snapbase, snap) if (!F_ISSET(snap, WT_SNAP_DELETE)) break; if ((a = snap->bpriv) == NULL) a = &block->live; if (a->discard.entries != 0) { __wt_errx(session, "snapshot incorrectly has blocks on the discard list"); WT_ERR(WT_ERROR); } #endif err: if (locked) __wt_spin_unlock(session, &block->live_lock); /* Discard any snapshot information we loaded, we no longer need it. */ WT_SNAPSHOT_FOREACH(snapbase, snap) if ((si = snap->bpriv) != NULL) { __wt_block_extlist_free(session, &si->alloc); __wt_block_extlist_free(session, &si->avail); __wt_block_extlist_free(session, &si->discard); } __wt_scr_free(&tmp); return (ret); }
/* * __wt_btcur_prev -- * Move to the previous record in the tree. */ int __wt_btcur_prev(WT_CURSOR_BTREE *cbt, bool truncating) { WT_CURSOR *cursor; WT_DECL_RET; WT_PAGE *page; WT_SESSION_IMPL *session; uint32_t flags; bool newpage; cursor = &cbt->iface; session = (WT_SESSION_IMPL *)cbt->iface.session; WT_STAT_CONN_INCR(session, cursor_prev); WT_STAT_DATA_INCR(session, cursor_prev); F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET); WT_RET(__cursor_func_init(cbt, false)); /* * If we aren't already iterating in the right direction, there's * some setup to do. */ if (!F_ISSET(cbt, WT_CBT_ITERATE_PREV)) __wt_btcur_iterate_setup(cbt); /* * Walk any page we're holding until the underlying call returns not- * found. Then, move to the previous page, until we reach the start * of the file. */ flags = WT_READ_PREV | WT_READ_SKIP_INTL; /* tree walk flags */ LF_SET(WT_READ_NO_SPLIT); /* don't try to split */ if (truncating) LF_SET(WT_READ_TRUNCATE); for (newpage = false;; newpage = true) { page = cbt->ref == NULL ? NULL : cbt->ref->page; /* * Column-store pages may have appended entries. Handle it * separately from the usual cursor code, it's in a simple * format. */ if (newpage && page != NULL && page->type != WT_PAGE_ROW_LEAF && (cbt->ins_head = WT_COL_APPEND(page)) != NULL) F_SET(cbt, WT_CBT_ITERATE_APPEND); if (F_ISSET(cbt, WT_CBT_ITERATE_APPEND)) { switch (page->type) { case WT_PAGE_COL_FIX: ret = __cursor_fix_append_prev(cbt, newpage); break; case WT_PAGE_COL_VAR: ret = __cursor_var_append_prev(cbt, newpage); break; WT_ILLEGAL_VALUE_ERR(session); } if (ret == 0) break; F_CLR(cbt, WT_CBT_ITERATE_APPEND); if (ret != WT_NOTFOUND) break; newpage = true; } if (page != NULL) { switch (page->type) { case WT_PAGE_COL_FIX: ret = __cursor_fix_prev(cbt, newpage); break; case WT_PAGE_COL_VAR: ret = __cursor_var_prev(cbt, newpage); break; case WT_PAGE_ROW_LEAF: ret = __cursor_row_prev(cbt, newpage); break; WT_ILLEGAL_VALUE_ERR(session); } if (ret != WT_NOTFOUND) break; } /* * If we saw a lot of deleted records on this page, or we went * all the way through a page and only saw deleted records, try * to evict the page when we release it. Otherwise repeatedly * deleting from the beginning of a tree can have quadratic * performance. Take care not to force eviction of pages that * are genuinely empty, in new trees. */ if (page != NULL && (cbt->page_deleted_count > WT_BTREE_DELETE_THRESHOLD || (newpage && cbt->page_deleted_count > 0))) __wt_page_evict_soon(session, cbt->ref); cbt->page_deleted_count = 0; WT_ERR(__wt_tree_walk(session, &cbt->ref, flags)); WT_ERR_TEST(cbt->ref == NULL, WT_NOTFOUND); } #ifdef HAVE_DIAGNOSTIC if (ret == 0) WT_ERR(__wt_cursor_key_order_check(session, cbt, false)); #endif if (ret == 0) F_SET(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT); err: if (ret != 0) WT_TRET(__cursor_reset(cbt)); return (ret); }
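__wt_btcur_prev ultimately backs WT_CURSOR::prev. A minimal public-API usage sketch follows, walking a table in reverse until the cursor reports WT_NOTFOUND at the start of the data; the home directory and table name are illustrative.

#include <stdlib.h>
#include <wiredtiger.h>

int
main(void)
{
	WT_CONNECTION *conn;
	WT_CURSOR *cursor;
	WT_SESSION *session;
	const char *key, *value;
	int ret;

	if (wiredtiger_open("WT_HOME", NULL, "create", &conn) != 0)
		return (EXIT_FAILURE);
	if (conn->open_session(conn, NULL, NULL, &session) != 0 ||
	    session->create(session,
	    "table:access", "key_format=S,value_format=S") != 0 ||
	    session->open_cursor(session,
	    "table:access", NULL, NULL, &cursor) != 0) {
		(void)conn->close(conn, NULL);
		return (EXIT_FAILURE);
	}

	/* Walk the table in reverse; prev returns WT_NOTFOUND at the start. */
	while ((ret = cursor->prev(cursor)) == 0) {
		(void)cursor->get_key(cursor, &key);
		(void)cursor->get_value(cursor, &value);
	}
	ret = (ret == WT_NOTFOUND) ? 0 : ret;

	(void)conn->close(conn, NULL);
	return (ret == 0 ? EXIT_SUCCESS : EXIT_FAILURE);
}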
/* * __wt_lsm_checkpoint_chunk -- * Flush a single LSM chunk to disk. */ int __wt_lsm_checkpoint_chunk(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, WT_LSM_CHUNK *chunk) { WT_DECL_RET; WT_TXN_ISOLATION saved_isolation; bool flush_set; flush_set = false; /* * If the chunk is already checkpointed, make sure it is also evicted. * Either way, there is no point trying to checkpoint it again. */ if (F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) && !F_ISSET(chunk, WT_LSM_CHUNK_STABLE) && !chunk->evicted) { WT_WITH_HANDLE_LIST_LOCK(session, ret = __lsm_discard_handle(session, chunk->uri, NULL)); if (ret == 0) chunk->evicted = 1; else if (ret == EBUSY) ret = 0; else WT_RET_MSG(session, ret, "discard handle"); } if (F_ISSET(chunk, WT_LSM_CHUNK_ONDISK)) { __wt_verbose(session, WT_VERB_LSM, "LSM worker %s already on disk", chunk->uri); return (0); } /* Stop if a running transaction needs the chunk. */ WT_RET(__wt_txn_update_oldest( session, WT_TXN_OLDEST_STRICT | WT_TXN_OLDEST_WAIT)); if (chunk->switch_txn == WT_TXN_NONE || !__wt_txn_visible_all(session, chunk->switch_txn)) { __wt_verbose(session, WT_VERB_LSM, "LSM worker %s: running transaction, return", chunk->uri); return (0); } if (!__wt_atomic_cas8(&chunk->flushing, 0, 1)) return (0); flush_set = true; __wt_verbose(session, WT_VERB_LSM, "LSM worker flushing %s", chunk->uri); /* * Flush the file before checkpointing: this is the expensive part in * terms of I/O. * * !!! * We can wait here for checkpoints and fsyncs to complete, which can * take a long time. */ if ((ret = __wt_session_get_btree( session, chunk->uri, NULL, NULL, 0)) == 0) { /* * Set read-uncommitted: we have already checked that all of the * updates in this chunk are globally visible, use the cheapest * possible check in reconciliation. */ saved_isolation = session->txn.isolation; session->txn.isolation = WT_ISO_READ_UNCOMMITTED; ret = __wt_cache_op(session, WT_SYNC_WRITE_LEAVES); session->txn.isolation = saved_isolation; WT_TRET(__wt_session_release_btree(session)); } WT_ERR(ret); __wt_verbose(session, WT_VERB_LSM, "LSM worker checkpointing %s", chunk->uri); /* * Turn on metadata tracking to ensure the checkpoint gets the * necessary handle locks. * * Ensure that we don't race with a running checkpoint: the checkpoint * lock protects against us racing with an application checkpoint in * this chunk. Don't wait for it, though: checkpoints can take a long * time, and our checkpoint operation should be very quick. */ WT_ERR(__wt_meta_track_on(session)); WT_WITH_CHECKPOINT_LOCK(session, ret, WT_WITH_SCHEMA_LOCK(session, ret, ret = __wt_schema_worker( session, chunk->uri, __wt_checkpoint, NULL, NULL, 0))); WT_TRET(__wt_meta_track_off(session, false, ret != 0)); if (ret != 0) WT_ERR_MSG(session, ret, "LSM checkpoint"); /* Now the file is written, get the chunk size. */ WT_ERR(__wt_lsm_tree_set_chunk_size(session, chunk)); /* Update the flush timestamp to help track ongoing progress. */ __wt_epoch(session, &lsm_tree->last_flush_ts); ++lsm_tree->chunks_flushed; /* Lock the tree, mark the chunk as on disk and update the metadata. */ __wt_lsm_tree_writelock(session, lsm_tree); F_SET(chunk, WT_LSM_CHUNK_ONDISK); ret = __wt_lsm_meta_write(session, lsm_tree); ++lsm_tree->dsk_gen; /* Update the throttle time. */ __wt_lsm_tree_throttle(session, lsm_tree, true); __wt_lsm_tree_writeunlock(session, lsm_tree); if (ret != 0) WT_ERR_MSG(session, ret, "LSM metadata write"); WT_PUBLISH(chunk->flushing, 0); flush_set = false; /* * Clear the no-eviction flag so the primary can be evicted and * eventually closed. 
Only do this once the checkpoint has succeeded: * otherwise, accessing the leaf page during the checkpoint can trigger * forced eviction. */ WT_ERR(__wt_session_get_btree(session, chunk->uri, NULL, NULL, 0)); __wt_btree_evictable(session, true); WT_ERR(__wt_session_release_btree(session)); /* Make sure we aren't pinning a transaction ID. */ __wt_txn_release_snapshot(session); __wt_verbose(session, WT_VERB_LSM, "LSM worker checkpointed %s", chunk->uri); /* Schedule a bloom filter create for our newly flushed chunk. */ if (!FLD_ISSET(lsm_tree->bloom, WT_LSM_BLOOM_OFF)) WT_ERR(__wt_lsm_manager_push_entry( session, WT_LSM_WORK_BLOOM, 0, lsm_tree)); else WT_ERR(__wt_lsm_manager_push_entry( session, WT_LSM_WORK_MERGE, 0, lsm_tree)); err: if (flush_set) WT_PUBLISH(chunk->flushing, 0); return (ret); }
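The single-flusher guard above (__wt_atomic_cas8 on chunk->flushing, with WT_PUBLISH clearing the flag again on both the success and error paths) can be expressed with standard C11 atomics. A minimal sketch, not the WiredTiger macros; the struct and function names are illustrative.

#include <stdatomic.h>
#include <stdbool.h>

struct chunk {
	atomic_uchar flushing;
};

/* Only one thread wins the right to flush; it publishes 0 when done. */
static bool
flush_chunk(struct chunk *chunk, int (*do_flush)(struct chunk *))
{
	unsigned char expected = 0;

	if (!atomic_compare_exchange_strong(&chunk->flushing, &expected, 1))
		return (false);

	(void)do_flush(chunk);

	/* Publish completion so other threads see the flag drop. */
	atomic_store_explicit(&chunk->flushing, 0, memory_order_release);
	return (true);
}

static int
noop_flush(struct chunk *c)
{
	(void)c;
	return (0);
}

int
main(void)
{
	struct chunk c;

	atomic_init(&c.flushing, 0);
	return (flush_chunk(&c, noop_flush) ? 0 : 1);
}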
/* * __wt_txn_config -- * Configure a transaction. */ int __wt_txn_config(WT_SESSION_IMPL *session, const char *cfg[]) { WT_CONFIG_ITEM cval; WT_TXN *txn; txn = &session->txn; WT_RET(__wt_config_gets_def(session, cfg, "isolation", 0, &cval)); if (cval.len != 0) txn->isolation = WT_STRING_MATCH("snapshot", cval.str, cval.len) ? WT_ISO_SNAPSHOT : WT_STRING_MATCH("read-committed", cval.str, cval.len) ? WT_ISO_READ_COMMITTED : WT_ISO_READ_UNCOMMITTED; /* * The default sync setting is inherited from the connection, but can * be overridden by an explicit "sync" setting for this transaction. * * We want to distinguish between inheriting implicitly and explicitly. */ F_CLR(txn, WT_TXN_SYNC_SET); WT_RET(__wt_config_gets_def( session, cfg, "sync", (int)UINT_MAX, &cval)); if (cval.val == 0 || cval.val == 1) /* * This is an explicit setting of sync. Set the flag so * that we know not to overwrite it in commit_transaction. */ F_SET(txn, WT_TXN_SYNC_SET); /* * If sync is turned off explicitly, clear the transaction's sync field. */ if (cval.val == 0) txn->txn_logsync = 0; WT_RET(__wt_config_gets_def(session, cfg, "snapshot", 0, &cval)); if (cval.len > 0) /* * The layering here isn't ideal - the named snapshot get * function does both validation and setup. Otherwise we'd * need to walk the list of named snapshots twice during * transaction open. */ WT_RET(__wt_txn_named_snapshot_get(session, &cval)); WT_RET(__wt_config_gets_def(session, cfg, "read_timestamp", 0, &cval)); if (cval.len > 0) { #ifdef HAVE_TIMESTAMPS wt_timestamp_t ts; WT_TXN_GLOBAL *txn_global; char timestamp_buf[2 * WT_TIMESTAMP_SIZE + 1]; bool round_to_oldest; txn_global = &S2C(session)->txn_global; WT_RET(__wt_txn_parse_timestamp(session, "read", &ts, &cval)); /* * Read the configuration here to reduce the span of the * critical section. */ WT_RET(__wt_config_gets_def(session, cfg, "round_to_oldest", 0, &cval)); round_to_oldest = cval.val; /* * This code is not using the timestamp validate function to * avoid a race between checking and setting transaction * timestamp. */ __wt_readlock(session, &txn_global->rwlock); if (__wt_timestamp_cmp(&ts, &txn_global->oldest_timestamp) < 0) { WT_RET(__wt_timestamp_to_hex_string(session, timestamp_buf, &ts)); /* * If given read timestamp is earlier than oldest * timestamp then round the read timestamp to * oldest timestamp. */ if (round_to_oldest) __wt_timestamp_set(&txn->read_timestamp, &txn_global->oldest_timestamp); else { __wt_readunlock(session, &txn_global->rwlock); WT_RET_MSG(session, EINVAL, "read timestamp " "%s older than oldest timestamp", timestamp_buf); } } else { __wt_timestamp_set(&txn->read_timestamp, &ts); /* * Reset to avoid a verbose message as read * timestamp is not rounded to oldest timestamp. */ round_to_oldest = false; } __wt_txn_set_read_timestamp(session); __wt_readunlock(session, &txn_global->rwlock); txn->isolation = WT_ISO_SNAPSHOT; if (round_to_oldest) { /* * This message is generated here to reduce the span of * critical section. */ __wt_verbose(session, WT_VERB_TIMESTAMP, "Read " "timestamp %s : Rounded to oldest timestamp", timestamp_buf); } #else WT_RET_MSG(session, EINVAL, "read_timestamp requires a " "version of WiredTiger built with timestamp support"); #endif } return (0); }
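The configuration strings parsed by __wt_txn_config come straight from WT_SESSION::begin_transaction. A short public-API sketch showing an explicit isolation and sync setting; the home directory is illustrative and the transaction is simply rolled back.

#include <stdlib.h>
#include <wiredtiger.h>

int
main(void)
{
	WT_CONNECTION *conn;
	WT_SESSION *session;
	int ret;

	if (wiredtiger_open("WT_HOME", NULL, "create", &conn) != 0)
		return (EXIT_FAILURE);
	if (conn->open_session(conn, NULL, NULL, &session) != 0) {
		(void)conn->close(conn, NULL);
		return (EXIT_FAILURE);
	}

	/*
	 * The isolation and sync values handled above are taken from this
	 * configuration string.
	 */
	if ((ret = session->begin_transaction(session,
	    "isolation=snapshot,sync=false")) == 0)
		ret = session->rollback_transaction(session, NULL);

	(void)conn->close(conn, NULL);
	return (ret == 0 ? EXIT_SUCCESS : EXIT_FAILURE);
}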
/*
 * __wt_lsm_tree_create --
 *	Create an LSM tree structure for the given name.
 */
int
__wt_lsm_tree_create(WT_SESSION_IMPL *session,
    const char *uri, int exclusive, const char *config)
{
	WT_CONFIG_ITEM cval;
	WT_DECL_ITEM(buf);
	WT_DECL_RET;
	WT_LSM_TREE *lsm_tree;
	const char *cfg[] =
	    { WT_CONFIG_BASE(session, session_create), config, NULL };
	const char *tmpconfig;

	/* If the tree is open, it already exists. */
	if ((ret = __wt_lsm_tree_get(session, uri, 0, &lsm_tree)) == 0) {
		__wt_lsm_tree_release(session, lsm_tree);
		return (exclusive ? EEXIST : 0);
	}
	WT_RET_NOTFOUND_OK(ret);

	/*
	 * If the tree has metadata, it already exists.
	 *
	 * !!!
	 * Use a local variable: we don't care what the existing configuration
	 * is, but we don't want to overwrite the real config.
	 */
	if ((ret = __wt_metadata_search(session, uri, &tmpconfig)) == 0) {
		__wt_free(session, tmpconfig);
		return (exclusive ? EEXIST : 0);
	}
	WT_RET_NOTFOUND_OK(ret);

	WT_RET(__wt_config_gets(session, cfg, "key_format", &cval));
	if (WT_STRING_MATCH("r", cval.str, cval.len))
		WT_RET_MSG(session, EINVAL,
		    "LSM trees cannot be configured as column stores");

	WT_RET(__wt_calloc_def(session, 1, &lsm_tree));

	WT_ERR(__lsm_tree_set_name(session, lsm_tree, uri));

	WT_ERR(__wt_config_gets(session, cfg, "key_format", &cval));
	WT_ERR(__wt_strndup(
	    session, cval.str, cval.len, &lsm_tree->key_format));
	WT_ERR(__wt_config_gets(session, cfg, "value_format", &cval));
	WT_ERR(__wt_strndup(
	    session, cval.str, cval.len, &lsm_tree->value_format));

	WT_ERR(__wt_config_gets(session, cfg, "collator", &cval));
	WT_ERR(__wt_strndup(
	    session, cval.str, cval.len, &lsm_tree->collator_name));

	WT_ERR(__wt_config_gets(session, cfg, "lsm.auto_throttle", &cval));
	if (cval.val)
		F_SET(lsm_tree, WT_LSM_TREE_THROTTLE);
	else
		F_CLR(lsm_tree, WT_LSM_TREE_THROTTLE);

	WT_ERR(__wt_config_gets(session, cfg, "lsm.bloom", &cval));
	FLD_SET(lsm_tree->bloom,
	    (cval.val == 0 ? WT_LSM_BLOOM_OFF : WT_LSM_BLOOM_MERGED));
	WT_ERR(__wt_config_gets(session, cfg, "lsm.bloom_oldest", &cval));
	if (cval.val != 0)
		FLD_SET(lsm_tree->bloom, WT_LSM_BLOOM_OLDEST);

	if (FLD_ISSET(lsm_tree->bloom, WT_LSM_BLOOM_OFF) &&
	    FLD_ISSET(lsm_tree->bloom, WT_LSM_BLOOM_OLDEST))
		WT_ERR_MSG(session, EINVAL,
		    "Bloom filters can only be created on newest and oldest "
		    "chunks if bloom filters are enabled");

	WT_ERR(__wt_config_gets(session, cfg, "lsm.bloom_config", &cval));
	if (cval.type == WT_CONFIG_ITEM_STRUCT) {
		cval.str++;
		cval.len -= 2;
	}
	WT_ERR(__wt_strndup(
	    session, cval.str, cval.len, &lsm_tree->bloom_config));

	WT_ERR(__wt_config_gets(session, cfg, "lsm.bloom_bit_count", &cval));
	lsm_tree->bloom_bit_count = (uint32_t)cval.val;
	WT_ERR(__wt_config_gets(session, cfg, "lsm.bloom_hash_count", &cval));
	lsm_tree->bloom_hash_count = (uint32_t)cval.val;
	WT_ERR(__wt_config_gets(session, cfg, "lsm.chunk_max", &cval));
	lsm_tree->chunk_max = (uint64_t)cval.val;
	WT_ERR(__wt_config_gets(session, cfg, "lsm.chunk_size", &cval));
	lsm_tree->chunk_size = (uint64_t)cval.val;
	if (lsm_tree->chunk_size > lsm_tree->chunk_max)
		WT_ERR_MSG(session, EINVAL,
		    "Chunk size (chunk_size) must be smaller than or equal to "
		    "the maximum chunk size (chunk_max)");
	WT_ERR(__wt_config_gets(session, cfg, "lsm.merge_max", &cval));
	lsm_tree->merge_max = (uint32_t)cval.val;
	lsm_tree->merge_min = lsm_tree->merge_max / 2;
	WT_ERR(__wt_config_gets(session, cfg, "lsm.merge_threads", &cval));
	lsm_tree->merge_threads = (uint32_t)cval.val;
	/* Sanity check that api_data.py is in sync with lsm.h */
	WT_ASSERT(session, lsm_tree->merge_threads <= WT_LSM_MAX_WORKERS);

	/*
	 * Set up the config for each chunk.  If possible, avoid high latencies
	 * from fsync by flushing the cache every 8MB (will be overridden by
	 * any application setting).
	 */
	tmpconfig = "";
#ifdef HAVE_SYNC_FILE_RANGE
	if (!S2C(session)->direct_io)
		tmpconfig = "os_cache_dirty_max=8MB,";
#endif
	WT_ERR(__wt_scr_alloc(session, 0, &buf));
	WT_ERR(__wt_buf_fmt(session, buf,
	    "%s%s,key_format=u,value_format=u", tmpconfig, config));
	lsm_tree->file_config = __wt_buf_steal(session, buf, NULL);

	/* Create the first chunk and flush the metadata. */
	WT_ERR(__wt_lsm_meta_write(session, lsm_tree));

	/* Discard our partially populated handle. */
	ret = __lsm_tree_discard(session, lsm_tree);
	lsm_tree = NULL;

	/*
	 * Open our new tree and add it to the handle cache.  Don't discard on
	 * error: the returned handle is NULL on error, and the metadata
	 * tracking macros handle cleaning up on failure.
	 */
	if (ret == 0)
		ret = __lsm_tree_open(session, uri, &lsm_tree);
	if (ret == 0)
		__wt_lsm_tree_release(session, lsm_tree);

	if (0) {
err:		WT_TRET(__lsm_tree_discard(session, lsm_tree));
	}
	__wt_scr_free(&buf);
	return (ret);
}
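/*
 * Example (illustrative sketch, not part of the WiredTiger sources): the
 * lsm.* configuration keys read above come from the WT_SESSION::create
 * configuration string.  The table name and sizes below are arbitrary
 * assumptions for the example; note that __wt_lsm_tree_create rejects a
 * chunk_size larger than chunk_max with EINVAL.
 */
#include <wiredtiger.h>

static int
example_lsm_create(WT_SESSION *session)
{
	/*
	 * key_format=r (a column store) would also be rejected by
	 * __wt_lsm_tree_create above.
	 */
	return (session->create(session, "table:example_lsm",
	    "type=lsm,key_format=S,value_format=S,"
	    "lsm=(chunk_size=32MB,chunk_max=1GB,"
	    "bloom_bit_count=16,bloom_hash_count=8,merge_max=15)"));
}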
/*
 * __curjoin_init_iter --
 *	Initialize before any iteration.
 */
static int
__curjoin_init_iter(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin)
{
	WT_BLOOM *bloom;
	WT_DECL_RET;
	WT_CURSOR_JOIN_ENTRY *je, *jeend, *je2;
	WT_CURSOR_JOIN_ENDPOINT *end;
	uint32_t f, k;

	if (cjoin->entries_next == 0)
		WT_RET_MSG(session, EINVAL,
		    "join cursor has not yet been joined with any other "
		    "cursors");

	je = &cjoin->entries[0];
	WT_RET(__curjoin_entry_iter_init(session, cjoin, je, &cjoin->iter));

	jeend = &cjoin->entries[cjoin->entries_next];
	for (je = cjoin->entries; je < jeend; je++) {
		__wt_stat_join_init_single(&je->stats);
		for (end = &je->ends[0]; end < &je->ends[je->ends_next];
		    end++)
			WT_RET(__curjoin_endpoint_init_key(session, je, end));

		/*
		 * The first entry is iterated as the 'outermost' cursor.
		 * For the common GE case, we don't have to test against
		 * the left reference key, we know it will be true since
		 * the btree is ordered.
		 */
		if (je == cjoin->entries && je->ends[0].flags ==
		    (WT_CURJOIN_END_GT | WT_CURJOIN_END_EQ))
			F_SET(cjoin, WT_CURJOIN_SKIP_FIRST_LEFT);

		if (F_ISSET(je, WT_CURJOIN_ENTRY_BLOOM)) {
			if (je->bloom == NULL) {
				/*
				 * Look for compatible filters to be shared,
				 * pick compatible numbers for bit counts
				 * and number of hashes.
				 */
				f = je->bloom_bit_count;
				k = je->bloom_hash_count;
				for (je2 = je + 1; je2 < jeend; je2++)
					if (F_ISSET(je2,
					    WT_CURJOIN_ENTRY_BLOOM) &&
					    je2->count == je->count) {
						f = WT_MAX(
						    je2->bloom_bit_count, f);
						k = WT_MAX(
						    je2->bloom_hash_count, k);
					}
				je->bloom_bit_count = f;
				je->bloom_hash_count = k;
				WT_RET(__wt_bloom_create(session, NULL, NULL,
				    je->count, f, k, &je->bloom));
				F_SET(je, WT_CURJOIN_ENTRY_OWN_BLOOM);
				WT_RET(__curjoin_init_bloom(
				    session, cjoin, je, je->bloom));
				/*
				 * Share the Bloom filter, making all
				 * config info consistent.
				 */
				for (je2 = je + 1; je2 < jeend; je2++)
					if (F_ISSET(je2,
					    WT_CURJOIN_ENTRY_BLOOM) &&
					    je2->count == je->count) {
						WT_ASSERT(session,
						    je2->bloom == NULL);
						je2->bloom = je->bloom;
						je2->bloom_bit_count = f;
						je2->bloom_hash_count = k;
					}
			} else {
				/*
				 * Create a temporary filter that we'll
				 * merge into the shared one.  The Bloom
				 * parameters of the two filters must match.
				 */
				WT_RET(__wt_bloom_create(session, NULL, NULL,
				    je->count, je->bloom_bit_count,
				    je->bloom_hash_count, &bloom));
				WT_RET(__curjoin_init_bloom(
				    session, cjoin, je, bloom));
				WT_RET(__wt_bloom_intersection(
				    je->bloom, bloom));
				WT_RET(__wt_bloom_close(bloom));
			}
		}
	}

	F_SET(cjoin, WT_CURJOIN_INITIALIZED);
	return (ret);
}
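/*
 * Example (illustrative sketch, not part of the WiredTiger sources): the
 * Bloom filters built above are driven by the per-join configuration passed
 * to WT_SESSION::join.  The table, index and key value below are assumptions
 * for the example; each index cursor must be positioned with a search before
 * it is joined, and must stay open until the join cursor is closed.
 */
#include <wiredtiger.h>

static int
example_join(WT_SESSION *session)
{
	WT_CURSOR *join_cursor, *index_cursor;
	int ret;

	if ((ret = session->open_cursor(session,
	    "join:table:customers", NULL, NULL, &join_cursor)) != 0)
		return (ret);
	if ((ret = session->open_cursor(session,
	    "index:customers:age", NULL, NULL, &index_cursor)) != 0)
		return (ret);

	/* Position the index cursor on the endpoint key (age >= 21). */
	index_cursor->set_key(index_cursor, 21);
	if ((ret = index_cursor->search(index_cursor)) != 0)
		return (ret);

	/*
	 * strategy=bloom with count, bloom_bit_count and bloom_hash_count
	 * feeds the filter sizing used by __curjoin_init_iter above.
	 */
	if ((ret = session->join(session, join_cursor, index_cursor,
	    "compare=ge,count=1000,strategy=bloom,"
	    "bloom_bit_count=16,bloom_hash_count=8")) != 0)
		return (ret);

	/* Iterate the joined result set. */
	while ((ret = join_cursor->next(join_cursor)) == 0)
		;
	return (ret == WT_NOTFOUND ? 0 : ret);
}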