/* * __wt_realloc_aligned -- * ANSI realloc function that aligns to buffer boundaries, configured with * the "buffer_alignment" key to wiredtiger_open. */ int __wt_realloc_aligned(WT_SESSION_IMPL *session, size_t *bytes_allocated_ret, size_t bytes_to_allocate, void *retp) { #if defined(HAVE_POSIX_MEMALIGN) WT_DECL_RET; /* * !!! * This function MUST handle a NULL WT_SESSION_IMPL handle. */ if (session != NULL && S2C(session)->buffer_alignment > 0) { void *p, *newp; size_t bytes_allocated; /* * Sometimes we're allocating memory and we don't care about the * final length -- bytes_allocated_ret may be NULL. */ p = *(void **)retp; bytes_allocated = (bytes_allocated_ret == NULL) ? 0 : *bytes_allocated_ret; WT_ASSERT(session, (p == NULL && bytes_allocated == 0) || (p != NULL && (bytes_allocated_ret == NULL || bytes_allocated != 0))); WT_ASSERT(session, bytes_to_allocate != 0); WT_ASSERT(session, bytes_allocated < bytes_to_allocate); /* * We are going to allocate an aligned buffer. When we do this * repeatedly, the allocator is expected to start on a boundary * each time, account for that additional space by never asking * for less than a full alignment size. The primary use case * for aligned buffers is Linux direct I/O, which requires that * the size be a multiple of the alignment anyway. */ bytes_to_allocate = WT_ALIGN(bytes_to_allocate, S2C(session)->buffer_alignment); WT_STAT_FAST_CONN_INCR(session, memory_allocation); if ((ret = posix_memalign(&newp, S2C(session)->buffer_alignment, bytes_to_allocate)) != 0) WT_RET_MSG(session, ret, "memory allocation"); if (p != NULL) memcpy(newp, p, bytes_allocated); __wt_free(session, p); p = newp; /* Clear the allocated memory (see above). */ memset((uint8_t *)p + bytes_allocated, 0, bytes_to_allocate - bytes_allocated); /* Update caller's bytes allocated value. */ if (bytes_allocated_ret != NULL) *bytes_allocated_ret = bytes_to_allocate; *(void **)retp = p; return (0); } #endif /* * If there is no posix_memalign function, or no alignment configured, * fall back to realloc. * * Windows note: Visual C CRT memalign does not match Posix behavior * and would also double each allocation so it is bad for memory use */ return (__wt_realloc( session, bytes_allocated_ret, bytes_to_allocate, retp)); }
/* * __wt_lsm_meta_read -- * Read the metadata for an LSM tree. */ int __wt_lsm_meta_read(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) { WT_CONFIG cparser, lparser; WT_CONFIG_ITEM ck, cv, fileconf, lk, lv, metadata; WT_DECL_RET; WT_LSM_CHUNK *chunk; char *lsmconfig; u_int nchunks; chunk = NULL; /* -Wconditional-uninitialized */ WT_RET(__wt_metadata_search(session, lsm_tree->name, &lsmconfig)); WT_ERR(__wt_config_init(session, &cparser, lsmconfig)); while ((ret = __wt_config_next(&cparser, &ck, &cv)) == 0) { if (WT_STRING_MATCH("key_format", ck.str, ck.len)) { __wt_free(session, lsm_tree->key_format); WT_ERR(__wt_strndup(session, cv.str, cv.len, &lsm_tree->key_format)); } else if (WT_STRING_MATCH("value_format", ck.str, ck.len)) { __wt_free(session, lsm_tree->value_format); WT_ERR(__wt_strndup(session, cv.str, cv.len, &lsm_tree->value_format)); } else if (WT_STRING_MATCH("collator", ck.str, ck.len)) { if (cv.len == 0 || WT_STRING_CASE_MATCH("none", cv.str, cv.len)) continue; /* * Extract the application-supplied metadata (if any) * from the file configuration. */ WT_ERR(__wt_config_getones( session, lsmconfig, "file_config", &fileconf)); WT_CLEAR(metadata); WT_ERR_NOTFOUND_OK(__wt_config_subgets( session, &fileconf, "app_metadata", &metadata)); WT_ERR(__wt_collator_config(session, lsm_tree->name, &cv, &metadata, &lsm_tree->collator, &lsm_tree->collator_owned)); WT_ERR(__wt_strndup(session, cv.str, cv.len, &lsm_tree->collator_name)); } else if (WT_STRING_MATCH("bloom_config", ck.str, ck.len)) { __wt_free(session, lsm_tree->bloom_config); /* Don't include the brackets. */ WT_ERR(__wt_strndup(session, cv.str + 1, cv.len - 2, &lsm_tree->bloom_config)); } else if (WT_STRING_MATCH("file_config", ck.str, ck.len)) { __wt_free(session, lsm_tree->file_config); /* Don't include the brackets. */ WT_ERR(__wt_strndup(session, cv.str + 1, cv.len - 2, &lsm_tree->file_config)); } else if (WT_STRING_MATCH("auto_throttle", ck.str, ck.len)) { if (cv.val) F_SET(lsm_tree, WT_LSM_TREE_THROTTLE); else F_CLR(lsm_tree, WT_LSM_TREE_THROTTLE); } else if (WT_STRING_MATCH("bloom", ck.str, ck.len)) lsm_tree->bloom = (uint32_t)cv.val; else if (WT_STRING_MATCH("bloom_bit_count", ck.str, ck.len)) lsm_tree->bloom_bit_count = (uint32_t)cv.val; else if (WT_STRING_MATCH("bloom_hash_count", ck.str, ck.len)) lsm_tree->bloom_hash_count = (uint32_t)cv.val; else if (WT_STRING_MATCH("chunk_max", ck.str, ck.len)) lsm_tree->chunk_max = (uint64_t)cv.val; else if (WT_STRING_MATCH("chunk_size", ck.str, ck.len)) lsm_tree->chunk_size = (uint64_t)cv.val; else if (WT_STRING_MATCH("merge_max", ck.str, ck.len)) lsm_tree->merge_max = (uint32_t)cv.val; else if (WT_STRING_MATCH("merge_min", ck.str, ck.len)) lsm_tree->merge_min = (uint32_t)cv.val; else if (WT_STRING_MATCH("last", ck.str, ck.len)) lsm_tree->last = (u_int)cv.val; else if (WT_STRING_MATCH("chunks", ck.str, ck.len)) { WT_ERR(__wt_config_subinit(session, &lparser, &cv)); for (nchunks = 0; (ret = __wt_config_next(&lparser, &lk, &lv)) == 0; ) { if (WT_STRING_MATCH("id", lk.str, lk.len)) { WT_ERR(__wt_realloc_def(session, &lsm_tree->chunk_alloc, nchunks + 1, &lsm_tree->chunk)); WT_ERR( __wt_calloc_one(session, &chunk)); lsm_tree->chunk[nchunks++] = chunk; chunk->id = (uint32_t)lv.val; WT_ERR(__wt_lsm_tree_chunk_name(session, lsm_tree, chunk->id, &chunk->uri)); F_SET(chunk, WT_LSM_CHUNK_ONDISK | WT_LSM_CHUNK_STABLE); } else if (WT_STRING_MATCH( "bloom", lk.str, lk.len)) { WT_ERR(__wt_lsm_tree_bloom_name( session, lsm_tree, chunk->id, &chunk->bloom_uri)); F_SET(chunk, WT_LSM_CHUNK_BLOOM); continue; } else if (WT_STRING_MATCH( "chunk_size", lk.str, lk.len)) { chunk->size = (uint64_t)lv.val; continue; } else if (WT_STRING_MATCH( "count", lk.str, lk.len)) { chunk->count = (uint64_t)lv.val; continue; } else if (WT_STRING_MATCH( "generation", lk.str, lk.len)) { chunk->generation = (uint32_t)lv.val; continue; } } WT_ERR_NOTFOUND_OK(ret); lsm_tree->nchunks = nchunks; } else if (WT_STRING_MATCH("old_chunks", ck.str, ck.len)) { WT_ERR(__wt_config_subinit(session, &lparser, &cv)); for (nchunks = 0; (ret = __wt_config_next(&lparser, &lk, &lv)) == 0; ) { if (WT_STRING_MATCH("bloom", lk.str, lk.len)) { WT_ERR(__wt_strndup(session, lv.str, lv.len, &chunk->bloom_uri)); F_SET(chunk, WT_LSM_CHUNK_BLOOM); continue; } WT_ERR(__wt_realloc_def(session, &lsm_tree->old_alloc, nchunks + 1, &lsm_tree->old_chunks)); WT_ERR(__wt_calloc_one(session, &chunk)); lsm_tree->old_chunks[nchunks++] = chunk; WT_ERR(__wt_strndup(session, lk.str, lk.len, &chunk->uri)); F_SET(chunk, WT_LSM_CHUNK_ONDISK); } WT_ERR_NOTFOUND_OK(ret); lsm_tree->nold_chunks = nchunks; /* Values included for backward compatibility */ } else if (WT_STRING_MATCH("merge_threads", ck.str, ck.len)) { } else WT_ERR(__wt_illegal_value(session, "LSM metadata")); } WT_ERR_NOTFOUND_OK(ret); /* * If the default merge_min was not overridden, calculate it now. We * do this here so that trees created before merge_min was added get a * sane value. */ if (lsm_tree->merge_min < 2) lsm_tree->merge_min = WT_MAX(2, lsm_tree->merge_max / 2); err: __wt_free(session, lsmconfig); return (ret); }
/* * __wt_lsm_merge -- * Merge a set of chunks of an LSM tree. */ int __wt_lsm_merge(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, u_int id) { WT_BLOOM *bloom; WT_CURSOR *dest, *src; WT_DECL_RET; WT_ITEM key, value; WT_LSM_CHUNK *chunk; uint32_t generation; uint64_t insert_count, record_count; u_int dest_id, end_chunk, i, nchunks, start_chunk, start_id, verb; int tret; bool created_chunk, create_bloom, locked, in_sync; const char *cfg[3]; const char *drop_cfg[] = { WT_CONFIG_BASE(session, WT_SESSION_drop), "force", NULL }; bloom = NULL; chunk = NULL; dest = src = NULL; created_chunk = create_bloom = locked = in_sync = false; /* Fast path if it's obvious no merges could be done. */ if (lsm_tree->nchunks < lsm_tree->merge_min && lsm_tree->merge_aggressiveness < WT_LSM_AGGRESSIVE_THRESHOLD) return (WT_NOTFOUND); /* * Use the lsm_tree lock to read the chunks (so no switches occur), but * avoid holding it while the merge is in progress: that may take a * long time. */ __wt_lsm_tree_writelock(session, lsm_tree); locked = true; WT_ERR(__lsm_merge_span(session, lsm_tree, id, &start_chunk, &end_chunk, &record_count)); nchunks = (end_chunk + 1) - start_chunk; WT_ASSERT(session, nchunks > 0); start_id = lsm_tree->chunk[start_chunk]->id; /* Find the merge generation. */ for (generation = 0, i = 0; i < nchunks; i++) generation = WT_MAX(generation, lsm_tree->chunk[start_chunk + i]->generation + 1); __wt_lsm_tree_writeunlock(session, lsm_tree); locked = false; /* Allocate an ID for the merge. */ dest_id = __wt_atomic_add32(&lsm_tree->last, 1); /* * We only want to do the chunk loop if we're running with verbose, * so we wrap these statements in the conditional. Avoid the loop * in the normal path. */ if (WT_VERBOSE_ISSET(session, WT_VERB_LSM)) { __wt_verbose(session, WT_VERB_LSM, "Merging %s chunks %u-%u into %u (%" PRIu64 " records)" ", generation %" PRIu32, lsm_tree->name, start_chunk, end_chunk, dest_id, record_count, generation); for (verb = start_chunk; verb < end_chunk + 1; verb++) __wt_verbose(session, WT_VERB_LSM, "Merging %s: Chunk[%u] id %" PRIu32 ", gen: %" PRIu32 ", size: %" PRIu64 ", records: %" PRIu64, lsm_tree->name, verb, lsm_tree->chunk[verb]->id, lsm_tree->chunk[verb]->generation, lsm_tree->chunk[verb]->size, lsm_tree->chunk[verb]->count); } WT_ERR(__wt_calloc_one(session, &chunk)); created_chunk = true; chunk->id = dest_id; if (FLD_ISSET(lsm_tree->bloom, WT_LSM_BLOOM_MERGED) && (FLD_ISSET(lsm_tree->bloom, WT_LSM_BLOOM_OLDEST) || start_chunk > 0) && record_count > 0) create_bloom = true; /* * Special setup for the merge cursor: * first, reset to open the dependent cursors; * then restrict the cursor to a specific number of chunks; * then set MERGE so the cursor doesn't track updates to the tree. */ WT_ERR(__wt_open_cursor(session, lsm_tree->name, NULL, NULL, &src)); F_SET(src, WT_CURSTD_RAW); WT_ERR(__wt_clsm_init_merge(src, start_chunk, start_id, nchunks)); WT_WITH_SCHEMA_LOCK(session, ret, ret = __wt_lsm_tree_setup_chunk(session, lsm_tree, chunk)); WT_ERR(ret); if (create_bloom) { WT_ERR(__wt_lsm_tree_setup_bloom(session, lsm_tree, chunk)); WT_ERR(__wt_bloom_create(session, chunk->bloom_uri, lsm_tree->bloom_config, record_count, lsm_tree->bloom_bit_count, lsm_tree->bloom_hash_count, &bloom)); } /* Discard pages we read as soon as we're done with them. */ F_SET(session, WT_SESSION_NO_CACHE); cfg[0] = WT_CONFIG_BASE(session, WT_SESSION_open_cursor); cfg[1] = "bulk,raw,skip_sort_check"; cfg[2] = NULL; WT_ERR(__wt_open_cursor(session, chunk->uri, NULL, cfg, &dest)); #define LSM_MERGE_CHECK_INTERVAL WT_THOUSAND for (insert_count = 0; (ret = src->next(src)) == 0; insert_count++) { if (insert_count % LSM_MERGE_CHECK_INTERVAL == 0) { if (!lsm_tree->active) WT_ERR(EINTR); WT_STAT_CONN_INCRV(session, lsm_rows_merged, LSM_MERGE_CHECK_INTERVAL); ++lsm_tree->merge_progressing; } WT_ERR(src->get_key(src, &key)); dest->set_key(dest, &key); WT_ERR(src->get_value(src, &value)); dest->set_value(dest, &value); WT_ERR(dest->insert(dest)); if (create_bloom) __wt_bloom_insert(bloom, &key); } WT_ERR_NOTFOUND_OK(ret); WT_STAT_CONN_INCRV(session, lsm_rows_merged, insert_count % LSM_MERGE_CHECK_INTERVAL); ++lsm_tree->merge_progressing; __wt_verbose(session, WT_VERB_LSM, "Bloom size for %" PRIu64 " has %" PRIu64 " items inserted", record_count, insert_count); /* * Closing and syncing the files can take a while. Set the * merge_syncing field so that compact knows it is still in * progress. */ (void)__wt_atomic_add32(&lsm_tree->merge_syncing, 1); in_sync = true; /* * We've successfully created the new chunk. Now install it. We need * to ensure that the NO_CACHE flag is cleared and the bloom filter * is closed (even if a step fails), so track errors but don't return * until we've cleaned up. */ WT_TRET(src->close(src)); WT_TRET(dest->close(dest)); src = dest = NULL; F_CLR(session, WT_SESSION_NO_CACHE); /* * We're doing advisory reads to fault the new trees into cache. * Don't block if the cache is full: our next unit of work may be to * discard some trees to free space. */ F_SET(session, WT_SESSION_NO_EVICTION); if (create_bloom) { if (ret == 0) WT_TRET(__wt_bloom_finalize(bloom)); /* * Read in a key to make sure the Bloom filters btree handle is * open before it becomes visible to application threads. * Otherwise application threads will stall while it is opened * and internal pages are read into cache. */ if (ret == 0) { WT_CLEAR(key); WT_TRET_NOTFOUND_OK(__wt_bloom_get(bloom, &key)); } WT_TRET(__wt_bloom_close(bloom)); bloom = NULL; } WT_ERR(ret); /* * Open a handle on the new chunk before application threads attempt * to access it, opening it pre-loads internal pages into the file * system cache. */ cfg[1] = "checkpoint=" WT_CHECKPOINT; WT_ERR(__wt_open_cursor(session, chunk->uri, NULL, cfg, &dest)); WT_TRET(dest->close(dest)); dest = NULL; ++lsm_tree->merge_progressing; (void)__wt_atomic_sub32(&lsm_tree->merge_syncing, 1); in_sync = false; WT_ERR_NOTFOUND_OK(ret); WT_ERR(__wt_lsm_tree_set_chunk_size(session, chunk)); __wt_lsm_tree_writelock(session, lsm_tree); locked = true; /* * Check whether we raced with another merge, and adjust the chunk * array offset as necessary. */ if (start_chunk >= lsm_tree->nchunks || lsm_tree->chunk[start_chunk]->id != start_id) for (start_chunk = 0; start_chunk < lsm_tree->nchunks; start_chunk++) if (lsm_tree->chunk[start_chunk]->id == start_id) break; /* * It is safe to error out here - since the update can only fail * prior to making updates to the tree. */ WT_ERR(__wt_lsm_merge_update_tree( session, lsm_tree, start_chunk, nchunks, chunk)); if (create_bloom) F_SET(chunk, WT_LSM_CHUNK_BLOOM); chunk->count = insert_count; chunk->generation = generation; F_SET(chunk, WT_LSM_CHUNK_ONDISK); /* * We have no current way of continuing if the metadata update fails, * so we will panic in that case. Put some effort into cleaning up * after ourselves here - so things have a chance of shutting down. * * Any errors that happened after the tree was locked are * fatal - we can't guarantee the state of the tree. */ if ((ret = __wt_lsm_meta_write(session, lsm_tree)) != 0) WT_PANIC_ERR(session, ret, "Failed finalizing LSM merge"); lsm_tree->dsk_gen++; /* Update the throttling while holding the tree lock. */ __wt_lsm_tree_throttle(session, lsm_tree, true); /* Schedule a pass to discard old chunks */ WT_ERR(__wt_lsm_manager_push_entry( session, WT_LSM_WORK_DROP, 0, lsm_tree)); err: if (locked) __wt_lsm_tree_writeunlock(session, lsm_tree); if (in_sync) (void)__wt_atomic_sub32(&lsm_tree->merge_syncing, 1); if (src != NULL) WT_TRET(src->close(src)); if (dest != NULL) WT_TRET(dest->close(dest)); if (bloom != NULL) WT_TRET(__wt_bloom_close(bloom)); if (ret != 0 && created_chunk) { /* Drop the newly-created files on error. */ if (chunk->uri != NULL) { WT_WITH_SCHEMA_LOCK(session, tret, tret = __wt_schema_drop( session, chunk->uri, drop_cfg)); WT_TRET(tret); } if (create_bloom && chunk->bloom_uri != NULL) { WT_WITH_SCHEMA_LOCK(session, tret, tret = __wt_schema_drop( session, chunk->bloom_uri, drop_cfg)); WT_TRET(tret); } __wt_free(session, chunk->bloom_uri); __wt_free(session, chunk->uri); __wt_free(session, chunk); if (ret == EINTR) __wt_verbose(session, WT_VERB_LSM, "Merge aborted due to close"); else __wt_verbose(session, WT_VERB_LSM, "Merge failed with %s", __wt_strerror(session, ret, NULL, 0)); } F_CLR(session, WT_SESSION_NO_CACHE | WT_SESSION_NO_EVICTION); return (ret); }
/* * __wt_page_out -- * Discard an in-memory page, freeing all memory associated with it. */ void __wt_page_out(WT_SESSION_IMPL *session, WT_PAGE **pagep) { WT_PAGE *page; /* * When a page is discarded, it's been disconnected from its parent and * its parent's WT_REF structure may now point to a different page. * Make sure we don't accidentally use the page itself or any other * information. */ page = *pagep; *pagep = NULL; page->parent = NULL; page->ref = NULL; WT_ASSERT(session, !F_ISSET_ATOMIC(page, WT_PAGE_EVICT_LRU)); #ifdef HAVE_DIAGNOSTIC { WT_HAZARD *hp; if ((hp = __wt_page_hazard_check(session, page)) != NULL) __wt_errx(session, "discarded page has hazard pointer: (%p: %s, line %d)", hp->page, hp->file, hp->line); } #endif /* Update the cache's information. */ __wt_cache_page_evict(session, page); /* Free the page modification information. */ if (page->modify != NULL) __free_page_modify(session, page); switch (page->type) { case WT_PAGE_COL_FIX: break; case WT_PAGE_COL_INT: __free_page_col_int(session, page); break; case WT_PAGE_COL_VAR: __free_page_col_var(session, page); break; case WT_PAGE_ROW_INT: __free_page_row_int(session, page); break; case WT_PAGE_ROW_LEAF: __free_page_row_leaf(session, page); break; } /* Free any allocated disk image. */ if (F_ISSET_ATOMIC(page, WT_PAGE_DISK_NOT_ALLOC)) { if (page->dsk != NULL) __wt_mmap_discard( session, page->dsk, page->dsk->mem_size); } else __wt_free(session, page->dsk); __wt_overwrite_and_free(session, page); }
/* * __conn_dhandle_get -- * Allocate a new data handle, lock it exclusively, and return it linked * into the connection's list. */ static int __conn_dhandle_get(WT_SESSION_IMPL *session, const char *name, const char *ckpt, uint32_t flags) { WT_BTREE *btree; WT_CONNECTION_IMPL *conn; WT_DATA_HANDLE *dhandle; WT_DECL_RET; uint32_t bucket; conn = S2C(session); /* * We have the handle lock, check whether we can find the handle we * are looking for. If we do, and we can lock it in the state we * want, this session will take ownership and we are done. */ ret = __wt_conn_dhandle_find(session, name, ckpt, flags); if (ret == 0) { dhandle = session->dhandle; WT_RET(__conn_dhandle_open_lock(session, dhandle, flags)); return (0); } WT_RET_NOTFOUND_OK(ret); /* * If no handle was found, allocate the data handle and a btree handle, * then initialize the data handle. Exclusively lock the data handle * before inserting it in the list. */ WT_RET(__wt_calloc_one(session, &dhandle)); WT_ERR(__wt_rwlock_alloc(session, &dhandle->rwlock, "data handle")); dhandle->name_hash = __wt_hash_city64(name, strlen(name)); WT_ERR(__wt_strdup(session, name, &dhandle->name)); if (ckpt != NULL) WT_ERR(__wt_strdup(session, ckpt, &dhandle->checkpoint)); WT_ERR(__wt_calloc_one(session, &btree)); dhandle->handle = btree; btree->dhandle = dhandle; WT_ERR(__wt_spin_init( session, &dhandle->close_lock, "data handle close")); F_SET(dhandle, WT_DHANDLE_EXCLUSIVE); WT_ERR(__wt_writelock(session, dhandle->rwlock)); /* * Prepend the handle to the connection list, assuming we're likely to * need new files again soon, until they are cached by all sessions. * Find the right hash bucket to insert into as well. */ WT_ASSERT(session, F_ISSET(session, WT_SESSION_HANDLE_LIST_LOCKED)); bucket = dhandle->name_hash % WT_HASH_ARRAY_SIZE; WT_CONN_DHANDLE_INSERT(conn, dhandle, bucket); session->dhandle = dhandle; return (0); err: WT_TRET(__wt_rwlock_destroy(session, &dhandle->rwlock)); __wt_free(session, dhandle->name); __wt_free(session, dhandle->checkpoint); __wt_free(session, dhandle->handle); /* btree free */ __wt_spin_destroy(session, &dhandle->close_lock); __wt_overwrite_and_free(session, dhandle); return (ret); }
/* * __create_file -- * Create a new 'file:' object. */ static int __create_file(WT_SESSION_IMPL *session, const char *uri, bool exclusive, const char *config) { WT_DECL_ITEM(val); WT_DECL_RET; const char *filename, **p, *filecfg[] = { WT_CONFIG_BASE(session, file_meta), config, NULL, NULL }; char *fileconf; uint32_t allocsize; bool is_metadata; fileconf = NULL; is_metadata = strcmp(uri, WT_METAFILE_URI) == 0; filename = uri; if (!WT_PREFIX_SKIP(filename, "file:")) return (__wt_unexpected_object_type(session, uri, "file:")); /* Check if the file already exists. */ if (!is_metadata && (ret = __wt_metadata_search(session, uri, &fileconf)) != WT_NOTFOUND) { if (exclusive) WT_TRET(EEXIST); goto err; } /* Sanity check the allocation size. */ WT_ERR(__wt_direct_io_size_check( session, filecfg, "allocation_size", &allocsize)); /* Create the file. */ WT_ERR(__wt_block_manager_create(session, filename, allocsize)); if (WT_META_TRACKING(session)) WT_ERR(__wt_meta_track_fileop(session, NULL, uri)); /* * If creating an ordinary file, append the file ID and current version * numbers to the passed-in configuration and insert the resulting * configuration into the metadata. */ if (!is_metadata) { WT_ERR(__wt_scr_alloc(session, 0, &val)); WT_ERR(__wt_buf_fmt(session, val, "id=%" PRIu32 ",version=(major=%d,minor=%d)", ++S2C(session)->next_file_id, WT_BTREE_MAJOR_VERSION_MAX, WT_BTREE_MINOR_VERSION_MAX)); for (p = filecfg; *p != NULL; ++p) ; *p = val->data; WT_ERR(__wt_config_collapse(session, filecfg, &fileconf)); WT_ERR(__wt_metadata_insert(session, uri, fileconf)); } /* * Open the file to check that it was setup correctly. We don't need to * pass the configuration, we just wrote the collapsed configuration * into the metadata file, and it's going to be read/used by underlying * functions. * * Keep the handle exclusive until it is released at the end of the * call, otherwise we could race with a drop. */ WT_ERR(__wt_session_get_btree( session, uri, NULL, NULL, WT_DHANDLE_EXCLUSIVE)); if (WT_META_TRACKING(session)) WT_ERR(__wt_meta_track_handle_lock(session, true)); else WT_ERR(__wt_session_release_btree(session)); err: __wt_scr_free(session, &val); __wt_free(session, fileconf); return (ret); }
/* * __ovfl_reuse_wrapup -- * Resolve the page's overflow reuse list after a page is written. */ static int __ovfl_reuse_wrapup(WT_SESSION_IMPL *session, WT_PAGE *page) { WT_BM *bm; WT_OVFL_REUSE **e, **head, *reuse; size_t decr; int i; bm = S2BT(session)->bm; head = page->modify->ovfl_track->ovfl_reuse; /* * Discard any overflow records that aren't in-use, freeing underlying * blocks. * * First, walk the overflow reuse lists (except for the lowest one), * fixing up skiplist links. */ for (i = WT_SKIP_MAXDEPTH - 1; i > 0; --i) for (e = &head[i]; (reuse = *e) != NULL;) { if (F_ISSET(reuse, WT_OVFL_REUSE_INUSE)) { e = &reuse->next[i]; continue; } *e = reuse->next[i]; } /* * Second, discard any overflow record without an in-use flag, clear * the flags for the next run. * * As part of the pass through the lowest level, figure out how much * space we added/subtracted from the page, and update its footprint. * We don't get it exactly correct because we don't know the depth of * the skiplist here, but it's close enough, and figuring out the * memory footprint change in the reconciliation wrapup code means * fewer atomic updates and less code overall. */ decr = 0; for (e = &head[0]; (reuse = *e) != NULL;) { if (F_ISSET(reuse, WT_OVFL_REUSE_INUSE)) { F_CLR(reuse, WT_OVFL_REUSE_INUSE | WT_OVFL_REUSE_JUST_ADDED); e = &reuse->next[0]; continue; } *e = reuse->next[0]; WT_ASSERT(session, !F_ISSET(reuse, WT_OVFL_REUSE_JUST_ADDED)); if (WT_VERBOSE_ISSET(session, WT_VERB_OVERFLOW)) WT_RET( __ovfl_reuse_verbose(session, page, reuse, "free")); WT_RET(bm->free( bm, session, WT_OVFL_REUSE_ADDR(reuse), reuse->addr_size)); decr += WT_OVFL_SIZE(reuse, WT_OVFL_REUSE); __wt_free(session, reuse); } if (decr != 0) __wt_cache_page_inmem_decr(session, page, decr); return (0); }
static int __meta_track_apply(WT_SESSION_IMPL *session, WT_META_TRACK *trk, int unroll) { WT_BTREE *saved_btree; WT_DECL_RET; int tret; /* * Unlock handles and complete checkpoints regardless of whether we are * unrolling. */ if (!unroll && trk->op != WT_ST_CHECKPOINT && trk->op != WT_ST_LOCK) goto free; switch (trk->op) { case WT_ST_EMPTY: /* Unused slot */ break; case WT_ST_CHECKPOINT: /* Checkpoint, see above */ saved_btree = session->btree; session->btree = trk->btree; if (!unroll) WT_TRET(__wt_bm_checkpoint_resolve(session)); session->btree = saved_btree; break; case WT_ST_LOCK: /* Handle lock, see above */ saved_btree = session->btree; session->btree = trk->btree; if (unroll && trk->created) F_SET(session->btree, WT_BTREE_DISCARD); WT_TRET(__wt_session_release_btree(session)); session->btree = saved_btree; break; case WT_ST_FILEOP: /* File operation */ /* * For renames, both a and b are set. * For creates, a is NULL. * For removes, b is NULL. */ if (trk->a != NULL && trk->b != NULL && (tret = __wt_rename(session, trk->b + strlen("file:"), trk->a + strlen("file:"))) != 0) { __wt_err(session, tret, "metadata unroll rename %s to %s", trk->b, trk->a); WT_TRET(tret); } else if (trk->a == NULL) { if ((tret = __wt_remove(session, trk->b + strlen("file:"))) != 0) { __wt_err(session, tret, "metadata unroll create %s", trk->b); WT_TRET(tret); } } /* * We can't undo removes yet: that would imply * some kind of temporary rename and remove in * roll forward. */ break; case WT_ST_REMOVE: /* Remove trk.a */ if ((tret = __wt_metadata_remove( session, trk->a)) != 0) { __wt_err(session, ret, "metadata unroll remove: %s", trk->a); WT_TRET(tret); } break; case WT_ST_SET: /* Set trk.a to trk.b */ if ((tret = __wt_metadata_update( session, trk->a, trk->b)) != 0) { __wt_err(session, ret, "metadata unroll update %s to %s", trk->a, trk->b); WT_TRET(tret); } break; WT_ILLEGAL_VALUE(session); } free: trk->op = WT_ST_EMPTY; __wt_free(session, trk->a); __wt_free(session, trk->b); trk->btree = NULL; return (ret); }
/* * __wt_delete_page_rollback -- * Abort pages that were deleted without being instantiated. */ void __wt_delete_page_rollback(WT_SESSION_IMPL *session, WT_REF *ref) { WT_UPDATE **upd; uint64_t sleep_count, yield_count; /* * If the page is still "deleted", it's as we left it, reset the state * to on-disk and we're done. Otherwise, we expect the page is either * instantiated or being instantiated. Loop because it's possible for * the page to return to the deleted state if instantiation fails. */ for (sleep_count = yield_count = 0;;) { switch (ref->state) { case WT_REF_DISK: case WT_REF_LOOKASIDE: case WT_REF_READING: WT_ASSERT(session, 0); /* Impossible, assert */ break; case WT_REF_DELETED: /* * If the page is still "deleted", it's as we left it, * reset the state. */ if (__wt_atomic_casv32( &ref->state, WT_REF_DELETED, WT_REF_DISK)) return; break; case WT_REF_LOCKED: /* * A possible state, the page is being instantiated. */ break; case WT_REF_MEM: case WT_REF_SPLIT: /* * We can't use the normal read path to get a copy of * the page because the session may have closed the * cursor, we no longer have the reference to the tree * required for a hazard pointer. We're safe because * with unresolved transactions, the page isn't going * anywhere. * * The page is in an in-memory state, walk the list of * update structures and abort them. */ for (upd = ref->page_del->update_list; *upd != NULL; ++upd) (*upd)->txnid = WT_TXN_ABORTED; /* * Discard the memory, the transaction can't abort * twice. */ __wt_free(session, ref->page_del->update_list); __wt_free(session, ref->page_del); return; } /* * We wait for the change in page state, yield before retrying, * and if we've yielded enough times, start sleeping so we don't * burn CPU to no purpose. */ __wt_ref_state_yield_sleep(&yield_count, &sleep_count); WT_STAT_CONN_INCRV(session, page_del_rollback_blocked, sleep_count); } }
/* * __wt_txn_recover -- * Run recovery. */ int __wt_txn_recover(WT_SESSION_IMPL *session) { WT_CONNECTION_IMPL *conn; WT_CURSOR *metac; WT_DECL_RET; WT_RECOVERY r; struct WT_RECOVERY_FILE *metafile; char *config; int needs_rec, was_backup; conn = S2C(session); WT_CLEAR(r); WT_INIT_LSN(&r.ckpt_lsn); was_backup = F_ISSET(conn, WT_CONN_WAS_BACKUP) ? 1 : 0; /* We need a real session for recovery. */ WT_RET(__wt_open_session(conn, NULL, NULL, &session)); F_SET(session, WT_SESSION_NO_LOGGING); r.session = session; WT_ERR(__wt_metadata_search(session, WT_METAFILE_URI, &config)); WT_ERR(__recovery_setup_file(&r, WT_METAFILE_URI, config)); WT_ERR(__wt_metadata_cursor(session, NULL, &metac)); metafile = &r.files[WT_METAFILE_ID]; metafile->c = metac; /* * If no log was found (including if logging is disabled), or if the * last checkpoint was done with logging disabled, recovery should not * run. Scan the metadata to figure out the largest file ID. */ if (!FLD_ISSET(S2C(session)->log_flags, WT_CONN_LOG_EXISTED) || WT_IS_MAX_LSN(&metafile->ckpt_lsn)) { WT_ERR(__recovery_file_scan(&r)); conn->next_file_id = r.max_fileid; goto done; } /* * First, do a pass through the log to recover the metadata, and * establish the last checkpoint LSN. Skip this when opening a hot * backup: we already have the correct metadata in that case. */ if (!was_backup) { r.metadata_only = 1; if (WT_IS_INIT_LSN(&metafile->ckpt_lsn)) WT_ERR(__wt_log_scan(session, NULL, WT_LOGSCAN_FIRST, __txn_log_recover, &r)); else { /* * Start at the last checkpoint LSN referenced in the * metadata. If we see the end of a checkpoint while * scanning, we will change the full scan to start from * there. */ r.ckpt_lsn = metafile->ckpt_lsn; WT_ERR(__wt_log_scan(session, &metafile->ckpt_lsn, 0, __txn_log_recover, &r)); } } /* Scan the metadata to find the live files and their IDs. */ WT_ERR(__recovery_file_scan(&r)); /* * We no longer need the metadata cursor: close it to avoid pinning any * resources that could block eviction during recovery. */ r.files[0].c = NULL; WT_ERR(metac->close(metac)); /* * Now, recover all the files apart from the metadata. * Pass WT_LOGSCAN_RECOVER so that old logs get truncated. */ r.metadata_only = 0; WT_ERR(__wt_verbose(session, WT_VERB_RECOVERY, "Main recovery loop: starting at %u/%" PRIuMAX, r.ckpt_lsn.file, (uintmax_t)r.ckpt_lsn.offset)); WT_ERR(__wt_log_needs_recovery(session, &r.ckpt_lsn, &needs_rec)); /* * Check if the database was shut down cleanly. If not * return an error if the user does not want automatic * recovery. */ if (needs_rec && FLD_ISSET(conn->log_flags, WT_CONN_LOG_RECOVER_ERR)) WT_ERR(WT_RUN_RECOVERY); /* * Always run recovery even if it was a clean shutdown. * We can consider skipping it in the future. */ if (WT_IS_INIT_LSN(&r.ckpt_lsn)) WT_ERR(__wt_log_scan(session, NULL, WT_LOGSCAN_FIRST | WT_LOGSCAN_RECOVER, __txn_log_recover, &r)); else WT_ERR(__wt_log_scan(session, &r.ckpt_lsn, WT_LOGSCAN_RECOVER, __txn_log_recover, &r)); conn->next_file_id = r.max_fileid; /* * If recovery ran successfully forcibly log a checkpoint so the next * open is fast and keep the metadata up to date with the checkpoint * LSN and archiving. */ WT_ERR(session->iface.checkpoint(&session->iface, "force=1")); done: err: WT_TRET(__recovery_free(&r)); __wt_free(session, config); WT_TRET(session->iface.close(&session->iface, NULL)); return (ret); }
/* * __wt_curds_open -- * Initialize a data-source cursor. */ int __wt_curds_open( WT_SESSION_IMPL *session, const char *uri, WT_CURSOR *owner, const char *cfg[], WT_DATA_SOURCE *dsrc, WT_CURSOR **cursorp) { WT_CURSOR_STATIC_INIT(iface, __wt_cursor_get_key, /* get-key */ __wt_cursor_get_value, /* get-value */ __wt_cursor_set_key, /* set-key */ __wt_cursor_set_value, /* set-value */ __curds_compare, /* compare */ __wt_cursor_equals, /* equals */ __curds_next, /* next */ __curds_prev, /* prev */ __curds_reset, /* reset */ __curds_search, /* search */ __curds_search_near, /* search-near */ __curds_insert, /* insert */ __curds_update, /* update */ __curds_remove, /* remove */ __wt_cursor_reconfigure_notsup, /* reconfigure */ __curds_close); /* close */ WT_CONFIG_ITEM cval, metadata; WT_CURSOR *cursor, *source; WT_CURSOR_DATA_SOURCE *data_source; WT_DECL_RET; char *metaconf; WT_STATIC_ASSERT(offsetof(WT_CURSOR_DATA_SOURCE, iface) == 0); data_source = NULL; metaconf = NULL; WT_RET(__wt_calloc_one(session, &data_source)); cursor = &data_source->iface; *cursor = iface; cursor->session = &session->iface; /* * XXX * The underlying data-source may require the object's key and value * formats. This isn't a particularly elegant way of getting that * information to the data-source, this feels like a layering problem * to me. */ WT_ERR(__wt_metadata_search(session, uri, &metaconf)); WT_ERR(__wt_config_getones(session, metaconf, "key_format", &cval)); WT_ERR(__wt_strndup(session, cval.str, cval.len, &cursor->key_format)); WT_ERR(__wt_config_getones(session, metaconf, "value_format", &cval)); WT_ERR( __wt_strndup(session, cval.str, cval.len, &cursor->value_format)); WT_ERR(__wt_cursor_init(cursor, uri, owner, cfg, cursorp)); /* Data-source cursors may have a custom collator. */ WT_ERR( __wt_config_getones(session, metaconf, "app_metadata", &metadata)); WT_ERR(__wt_config_gets_none(session, cfg, "collator", &cval)); if (cval.len != 0) WT_ERR(__wt_collator_config(session, uri, &cval, &metadata, &data_source->collator, &data_source->collator_owned)); WT_ERR(dsrc->open_cursor(dsrc, &session->iface, uri, (WT_CONFIG_ARG *)cfg, &data_source->source)); source = data_source->source; source->session = (WT_SESSION *)session; memset(&source->q, 0, sizeof(source->q)); source->recno = WT_RECNO_OOB; memset(source->raw_recno_buf, 0, sizeof(source->raw_recno_buf)); memset(&source->key, 0, sizeof(source->key)); memset(&source->value, 0, sizeof(source->value)); source->saved_err = 0; source->flags = 0; if (0) { err: WT_TRET(__curds_close(cursor)); *cursorp = NULL; } __wt_free(session, metaconf); return (ret); }
/* * __wt_logmgr_destroy -- * Destroy the log archiving server thread and logging subsystem. */ int __wt_logmgr_destroy(WT_SESSION_IMPL *session) { WT_CONNECTION_IMPL *conn; WT_DECL_RET; WT_SESSION *wt_session; conn = S2C(session); if (!FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED)) { /* * We always set up the log_path so printlog can work without * recovery. Therefore, always free it, even if logging isn't * on. */ __wt_free(session, conn->log_path); return (0); } if (conn->log_tid_set) { WT_TRET(__wt_cond_auto_signal(session, conn->log_cond)); WT_TRET(__wt_thread_join(session, conn->log_tid)); conn->log_tid_set = false; } if (conn->log_file_tid_set) { WT_TRET(__wt_cond_signal(session, conn->log_file_cond)); WT_TRET(__wt_thread_join(session, conn->log_file_tid)); conn->log_file_tid_set = false; } if (conn->log_file_session != NULL) { wt_session = &conn->log_file_session->iface; WT_TRET(wt_session->close(wt_session, NULL)); conn->log_file_session = NULL; } if (conn->log_wrlsn_tid_set) { WT_TRET(__wt_cond_auto_signal(session, conn->log_wrlsn_cond)); WT_TRET(__wt_thread_join(session, conn->log_wrlsn_tid)); conn->log_wrlsn_tid_set = false; } if (conn->log_wrlsn_session != NULL) { wt_session = &conn->log_wrlsn_session->iface; WT_TRET(wt_session->close(wt_session, NULL)); conn->log_wrlsn_session = NULL; } WT_TRET(__wt_log_slot_destroy(session)); WT_TRET(__wt_log_close(session)); /* Close the server thread's session. */ if (conn->log_session != NULL) { wt_session = &conn->log_session->iface; WT_TRET(wt_session->close(wt_session, NULL)); conn->log_session = NULL; } /* Destroy the condition variables now that all threads are stopped */ WT_TRET(__wt_cond_auto_destroy(session, &conn->log_cond)); WT_TRET(__wt_cond_destroy(session, &conn->log_file_cond)); WT_TRET(__wt_cond_auto_destroy(session, &conn->log_wrlsn_cond)); WT_TRET(__wt_cond_destroy(session, &conn->log->log_sync_cond)); WT_TRET(__wt_cond_destroy(session, &conn->log->log_write_cond)); WT_TRET(__wt_rwlock_destroy(session, &conn->log->log_archive_lock)); __wt_spin_destroy(session, &conn->log->log_lock); __wt_spin_destroy(session, &conn->log->log_slot_lock); __wt_spin_destroy(session, &conn->log->log_sync_lock); __wt_spin_destroy(session, &conn->log->log_writelsn_lock); __wt_free(session, conn->log_path); __wt_free(session, conn->log); return (ret); }
/* * __split_ref_deepen_move -- * Move a WT_REF from a parent to a child in service of a split to deepen * the tree, including updating the accounting information. */ static int __split_ref_deepen_move(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_REF *ref, size_t *parent_decrp, size_t *child_incrp) { WT_ADDR *addr; WT_CELL_UNPACK unpack; WT_DECL_RET; WT_IKEY *ikey; size_t size; void *key; /* * Instantiate row-store keys, and column- and row-store addresses in * the WT_REF structures referenced by a page that's being split (and * deepening the tree). The WT_REF structures aren't moving, but the * index references are moving from the page we're splitting to a set * of child pages, and so we can no longer reference the block image * that remains with the page being split. * * No locking is required to update the WT_REF structure because we're * the only thread splitting the parent page, and there's no way for * readers to race with our updates of single pointers. The changes * have to be written before the page goes away, of course, our caller * owns that problem. * * Row-store keys, first. */ if (parent->type == WT_PAGE_ROW_INT) { if ((ikey = __wt_ref_key_instantiated(ref)) == NULL) { __wt_ref_key(parent, ref, &key, &size); WT_RET(__wt_row_ikey(session, 0, key, size, ref)); ikey = ref->key.ikey; } else { WT_RET(__split_ovfl_key_cleanup(session, parent, ref)); *parent_decrp += sizeof(WT_IKEY) + ikey->size; } *child_incrp += sizeof(WT_IKEY) + ikey->size; } /* * If there's no address (the page has never been written), or the * address has been instantiated, there's no work to do. Otherwise, * get the address from the on-page cell. */ addr = ref->addr; if (addr != NULL && !__wt_off_page(parent, addr)) { __wt_cell_unpack((WT_CELL *)ref->addr, &unpack); WT_RET(__wt_calloc_one(session, &addr)); if ((ret = __wt_strndup( session, unpack.data, unpack.size, &addr->addr)) != 0) { __wt_free(session, addr); return (ret); } addr->size = (uint8_t)unpack.size; addr->type = unpack.raw == WT_CELL_ADDR_INT ? WT_ADDR_INT : WT_ADDR_LEAF; ref->addr = addr; } /* And finally, the WT_REF itself. */ WT_MEM_TRANSFER(*parent_decrp, *child_incrp, sizeof(WT_REF)); return (0); }
/* * __wt_realloc_aligned -- * ANSI realloc function that aligns to buffer boundaries, configured with * the "buffer_alignment" key to wiredtiger_open. */ int __wt_realloc_aligned(WT_SESSION_IMPL *session, size_t *bytes_allocated_ret, size_t bytes_to_allocate, void *retp) { #if defined(HAVE_POSIX_MEMALIGN) WT_DECL_RET; /* * !!! * This function MUST handle a NULL WT_SESSION_IMPL handle. */ if (session != NULL && S2C(session)->buffer_alignment > 0) { void *p, *newp; size_t bytes_allocated; /* * Sometimes we're allocating memory and we don't care about the * final length -- bytes_allocated_ret may be NULL. */ p = *(void **)retp; bytes_allocated = (bytes_allocated_ret == NULL) ? 0 : *bytes_allocated_ret; WT_ASSERT(session, (p == NULL && bytes_allocated == 0) || (p != NULL && (bytes_allocated_ret == NULL || bytes_allocated != 0))); WT_ASSERT(session, bytes_to_allocate != 0); WT_ASSERT(session, bytes_allocated < bytes_to_allocate); if (session != NULL) WT_STAT_FAST_CONN_INCR(session, memory_allocation); if ((ret = posix_memalign(&newp, S2C(session)->buffer_alignment, bytes_to_allocate)) != 0) WT_RET_MSG(session, ret, "memory allocation"); if (p != NULL) memcpy(newp, p, bytes_allocated); __wt_free(session, p); p = newp; /* Clear the allocated memory (see above). */ memset((uint8_t *)p + bytes_allocated, 0, bytes_to_allocate - bytes_allocated); /* Update caller's bytes allocated value. */ if (bytes_allocated_ret != NULL) *bytes_allocated_ret = bytes_to_allocate; *(void **)retp = p; return (0); } #endif /* * If there is no posix_memalign function, or no alignment configured, * fall back to realloc. */ return (__wt_realloc( session, bytes_allocated_ret, bytes_to_allocate, retp)); }
/* * __create_colgroup -- * Create a column group. */ static int __create_colgroup(WT_SESSION_IMPL *session, const char *name, bool exclusive, const char *config) { WT_CONFIG_ITEM cval; WT_DECL_RET; WT_ITEM confbuf, fmt, namebuf; WT_TABLE *table; size_t tlen; const char **cfgp, *cfg[4] = { WT_CONFIG_BASE(session, colgroup_meta), config, NULL, NULL }; const char *sourcecfg[] = { config, NULL, NULL }; const char *cgname, *source, *sourceconf, *tablename; char *cgconf, *origconf; bool exists; sourceconf = NULL; cgconf = origconf = NULL; WT_CLEAR(fmt); WT_CLEAR(confbuf); WT_CLEAR(namebuf); exists = false; tablename = name; if (!WT_PREFIX_SKIP(tablename, "colgroup:")) return ( __wt_unexpected_object_type(session, name, "colgroup:")); cgname = strchr(tablename, ':'); if (cgname != NULL) { tlen = (size_t)(cgname - tablename); ++cgname; } else tlen = strlen(tablename); if ((ret = __wt_schema_get_table(session, tablename, tlen, true, &table)) != 0) WT_RET_MSG(session, (ret == WT_NOTFOUND) ? ENOENT : ret, "Can't create '%s' for non-existent table '%.*s'", name, (int)tlen, tablename); /* Make sure the column group is referenced from the table. */ if (cgname != NULL && (ret = __wt_config_subgets(session, &table->cgconf, cgname, &cval)) != 0) WT_ERR_MSG(session, EINVAL, "Column group '%s' not found in table '%.*s'", cgname, (int)tlen, tablename); /* Check if the column group already exists. */ if ((ret = __wt_metadata_search(session, name, &origconf)) == 0) { if (exclusive) WT_ERR(EEXIST); exists = true; } WT_ERR_NOTFOUND_OK(ret); /* Find the first NULL entry in the cfg stack. */ for (cfgp = &cfg[1]; *cfgp; cfgp++) ; /* Add the source to the colgroup config before collapsing. */ if (__wt_config_getones( session, config, "source", &cval) == 0 && cval.len != 0) { WT_ERR(__wt_buf_fmt( session, &namebuf, "%.*s", (int)cval.len, cval.str)); source = namebuf.data; } else { WT_ERR(__wt_schema_colgroup_source( session, table, cgname, config, &namebuf)); source = namebuf.data; WT_ERR(__wt_buf_fmt( session, &confbuf, "source=\"%s\"", source)); *cfgp++ = confbuf.data; } /* Calculate the key/value formats: these go into the source config. */ WT_ERR(__wt_buf_fmt(session, &fmt, "key_format=%s", table->key_format)); if (cgname == NULL) WT_ERR(__wt_buf_catfmt (session, &fmt, ",value_format=%s", table->value_format)); else { if (__wt_config_getones(session, config, "columns", &cval) != 0) WT_ERR_MSG(session, EINVAL, "No 'columns' configuration for '%s'", name); WT_ERR(__wt_buf_catfmt(session, &fmt, ",value_format=")); WT_ERR(__wt_struct_reformat(session, table, cval.str, cval.len, NULL, true, &fmt)); } sourcecfg[1] = fmt.data; WT_ERR(__wt_config_merge(session, sourcecfg, NULL, &sourceconf)); WT_ERR(__wt_schema_create(session, source, sourceconf)); WT_ERR(__wt_config_collapse(session, cfg, &cgconf)); if (!exists) { WT_ERR(__wt_metadata_insert(session, name, cgconf)); WT_ERR(__wt_schema_open_colgroups(session, table)); } err: __wt_free(session, cgconf); __wt_free(session, sourceconf); __wt_free(session, origconf); __wt_buf_free(session, &confbuf); __wt_buf_free(session, &fmt); __wt_buf_free(session, &namebuf); __wt_schema_release_table(session, table); return (ret); }
/* * __wt_delete_page_instantiate -- * Instantiate an entirely deleted row-store leaf page. */ int __wt_delete_page_instantiate(WT_SESSION_IMPL *session, WT_REF *ref) { WT_BTREE *btree; WT_DECL_RET; WT_PAGE *page; WT_PAGE_DELETED *page_del; WT_UPDATE **upd_array, *upd; size_t size; uint32_t i; btree = S2BT(session); page = ref->page; page_del = ref->page_del; /* * Give the page a modify structure. * * If the tree is already dirty and so will be written, mark the page * dirty. (We'd like to free the deleted pages, but if the handle is * read-only or if the application never modifies the tree, we're not * able to do so.) */ WT_RET(__wt_page_modify_init(session, page)); if (btree->modified) __wt_page_modify_set(session, page); /* * An operation is accessing a "deleted" page, and we're building an * in-memory version of the page (making it look like all entries in * the page were individually updated by a remove operation). There * are two cases where we end up here: * * First, a running transaction used a truncate call to delete the page * without reading it, in which case the page reference includes a * structure with a transaction ID; the page we're building might split * in the future, so we update that structure to include references to * all of the update structures we create, so the transaction can abort. * * Second, a truncate call deleted a page and the truncate committed, * but an older transaction in the system forced us to keep the old * version of the page around, then we crashed and recovered, and now * we're being forced to read that page. * * In the first case, we have a page reference structure, in the second * second, we don't. * * Allocate the per-reference update array; in the case of instantiating * a page, deleted by a running transaction that might eventually abort, * we need a list of the update structures so we can do that abort. The * hard case is if a page splits: the update structures might be moved * to different pages, and we still have to find them all for an abort. */ if (page_del != NULL) WT_RET(__wt_calloc_def( session, page->entries + 1, &page_del->update_list)); /* Allocate the per-page update array. */ WT_ERR(__wt_calloc_def(session, page->entries, &upd_array)); page->modify->mod_row_update = upd_array; /* * Fill in the per-reference update array with references to update * structures, fill in the per-page update array with references to * deleted items. */ for (i = 0, size = 0; i < page->entries; ++i) { WT_ERR(__wt_calloc_one(session, &upd)); upd->type = WT_UPDATE_DELETED; if (page_del == NULL) upd->txnid = WT_TXN_NONE; /* Globally visible */ else { upd->txnid = page_del->txnid; page_del->update_list[i] = upd; } upd->next = upd_array[i]; upd_array[i] = upd; size += sizeof(WT_UPDATE *) + WT_UPDATE_MEMSIZE(upd); } __wt_cache_page_inmem_incr(session, page, size); return (0); err: /* * There's no need to free the page update structures on error, our * caller will discard the page and do that work for us. We could * similarly leave the per-reference update array alone because it * won't ever be used by any page that's not in-memory, but cleaning * it up makes sense, especially if we come back in to this function * attempting to instantiate this page again. */ if (page_del != NULL) __wt_free(session, page_del->update_list); return (ret); }
/* * __create_index -- * Create an index. */ static int __create_index(WT_SESSION_IMPL *session, const char *name, bool exclusive, const char *config) { WT_CONFIG kcols, pkcols; WT_CONFIG_ITEM ckey, cval, icols, kval; WT_DECL_PACK_VALUE(pv); WT_DECL_RET; WT_INDEX *idx; WT_ITEM confbuf, extra_cols, fmt, namebuf; WT_PACK pack; WT_TABLE *table; const char *cfg[4] = { WT_CONFIG_BASE(session, index_meta), NULL, NULL, NULL }; const char *sourcecfg[] = { config, NULL, NULL }; const char *source, *sourceconf, *idxname, *tablename; char *idxconf, *origconf; size_t tlen; bool exists, have_extractor; u_int i, npublic_cols; sourceconf = NULL; idxconf = origconf = NULL; WT_CLEAR(confbuf); WT_CLEAR(fmt); WT_CLEAR(extra_cols); WT_CLEAR(namebuf); exists = have_extractor = false; tablename = name; if (!WT_PREFIX_SKIP(tablename, "index:")) return (__wt_unexpected_object_type(session, name, "index:")); idxname = strchr(tablename, ':'); if (idxname == NULL) WT_RET_MSG(session, EINVAL, "Invalid index name, " "should be <table name>:<index name>: %s", name); tlen = (size_t)(idxname++ - tablename); if ((ret = __wt_schema_get_table(session, tablename, tlen, true, &table)) != 0) WT_RET_MSG(session, ret, "Can't create an index for a non-existent table: %.*s", (int)tlen, tablename); if (table->is_simple) WT_ERR_MSG(session, EINVAL, "%s requires a table with named columns", name); /* Check if the index already exists. */ if ((ret = __wt_metadata_search(session, name, &origconf)) == 0) { if (exclusive) WT_ERR(EEXIST); exists = true; } WT_ERR_NOTFOUND_OK(ret); if (__wt_config_getones(session, config, "source", &cval) == 0) { WT_ERR(__wt_buf_fmt(session, &namebuf, "%.*s", (int)cval.len, cval.str)); source = namebuf.data; } else { WT_ERR(__wt_schema_index_source( session, table, idxname, config, &namebuf)); source = namebuf.data; /* Add the source name to the index config before collapsing. */ WT_ERR(__wt_buf_catfmt(session, &confbuf, ",source=\"%s\"", source)); } if (__wt_config_getones_none( session, config, "extractor", &cval) == 0 && cval.len != 0) { have_extractor = true; /* Custom extractors must supply a key format. */ if ((ret = __wt_config_getones( session, config, "key_format", &kval)) != 0) WT_ERR_MSG(session, EINVAL, "%s: custom extractors require a key_format", name); } /* Calculate the key/value formats. */ WT_CLEAR(icols); if (__wt_config_getones(session, config, "columns", &icols) != 0 && !have_extractor) WT_ERR_MSG(session, EINVAL, "%s: requires 'columns' configuration", name); /* * Count the public columns using the declared columns for normal * indices or the key format for custom extractors. */ npublic_cols = 0; if (!have_extractor) { __wt_config_subinit(session, &kcols, &icols); while ((ret = __wt_config_next(&kcols, &ckey, &cval)) == 0) ++npublic_cols; WT_ERR_NOTFOUND_OK(ret); } else { WT_ERR(__pack_initn(session, &pack, kval.str, kval.len)); while ((ret = __pack_next(&pack, &pv)) == 0) ++npublic_cols; WT_ERR_NOTFOUND_OK(ret); } /* * The key format for an index is somewhat subtle: the application * specifies a set of columns that it will use for the key, but the * engine usually adds some hidden columns in order to derive the * primary key. These hidden columns are part of the source's * key_format, which we are calculating now, but not part of an index * cursor's key_format. */ __wt_config_subinit(session, &pkcols, &table->colconf); for (i = 0; i < table->nkey_columns && (ret = __wt_config_next(&pkcols, &ckey, &cval)) == 0; i++) { /* * If the primary key column is already in the secondary key, * don't add it again. */ if (__wt_config_subgetraw(session, &icols, &ckey, &cval) == 0) { if (have_extractor) WT_ERR_MSG(session, EINVAL, "an index with a custom extractor may not " "include primary key columns"); continue; } WT_ERR(__wt_buf_catfmt( session, &extra_cols, "%.*s,", (int)ckey.len, ckey.str)); } WT_ERR_NOTFOUND_OK(ret); /* Index values are empty: all columns are packed into the index key. */ WT_ERR(__wt_buf_fmt(session, &fmt, "value_format=,key_format=")); if (have_extractor) { WT_ERR(__wt_buf_catfmt(session, &fmt, "%.*s", (int)kval.len, kval.str)); WT_CLEAR(icols); } /* * Construct the index key format, or append the primary key columns * for custom extractors. */ WT_ERR(__wt_struct_reformat(session, table, icols.str, icols.len, (const char *)extra_cols.data, false, &fmt)); /* Check for a record number index key, which makes no sense. */ WT_ERR(__wt_config_getones(session, fmt.data, "key_format", &cval)); if (cval.len == 1 && cval.str[0] == 'r') WT_ERR_MSG(session, EINVAL, "column-store index may not use the record number as its " "index key"); WT_ERR(__wt_buf_catfmt( session, &fmt, ",index_key_columns=%u", npublic_cols)); sourcecfg[1] = fmt.data; WT_ERR(__wt_config_merge(session, sourcecfg, NULL, &sourceconf)); WT_ERR(__wt_schema_create(session, source, sourceconf)); cfg[1] = sourceconf; cfg[2] = confbuf.data; WT_ERR(__wt_config_collapse(session, cfg, &idxconf)); if (!exists) { WT_ERR(__wt_metadata_insert(session, name, idxconf)); /* Make sure that the configuration is valid. */ WT_ERR(__wt_schema_open_index( session, table, idxname, strlen(idxname), &idx)); /* If there is data in the table, fill the index. */ WT_ERR(__fill_index(session, table, idx)); } err: __wt_free(session, idxconf); __wt_free(session, origconf); __wt_free(session, sourceconf); __wt_buf_free(session, &confbuf); __wt_buf_free(session, &extra_cols); __wt_buf_free(session, &fmt); __wt_buf_free(session, &namebuf); __wt_schema_release_table(session, table); return (ret); }
/* * __wt_delete_page -- * If deleting a range, try to delete the page without instantiating it. */ int __wt_delete_page(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp) { WT_DECL_RET; WT_PAGE *parent; *skipp = false; /* If we have a clean page in memory, attempt to evict it. */ if (ref->state == WT_REF_MEM && __wt_atomic_casv32(&ref->state, WT_REF_MEM, WT_REF_LOCKED)) { if (__wt_page_is_modified(ref->page)) { WT_PUBLISH(ref->state, WT_REF_MEM); return (0); } (void)__wt_atomic_addv32(&S2BT(session)->evict_busy, 1); ret = __wt_evict(session, ref, false); (void)__wt_atomic_subv32(&S2BT(session)->evict_busy, 1); WT_RET_BUSY_OK(ret); } /* * Atomically switch the page's state to lock it. If the page is not * on-disk, other threads may be using it, no fast delete. */ if (ref->state != WT_REF_DISK || !__wt_atomic_casv32(&ref->state, WT_REF_DISK, WT_REF_LOCKED)) return (0); /* * We cannot fast-delete pages that have overflow key/value items as * the overflow blocks have to be discarded. The way we figure that * out is to check the page's cell type, cells for leaf pages without * overflow items are special. * * To look at an on-page cell, we need to look at the parent page, and * that's dangerous, our parent page could change without warning if * the parent page were to split, deepening the tree. It's safe: the * page's reference will always point to some valid page, and if we find * any problems we simply fail the fast-delete optimization. */ parent = ref->home; if (__wt_off_page(parent, ref->addr) ? ((WT_ADDR *)ref->addr)->type != WT_ADDR_LEAF_NO : __wt_cell_type_raw(ref->addr) != WT_CELL_ADDR_LEAF_NO) goto err; /* * This action dirties the parent page: mark it dirty now, there's no * future reconciliation of the child leaf page that will dirty it as * we write the tree. */ WT_ERR(__wt_page_parent_modify_set(session, ref, false)); /* * Record the change in the transaction structure and set the change's * transaction ID. */ WT_ERR(__wt_calloc_one(session, &ref->page_del)); ref->page_del->txnid = session->txn.id; WT_ERR(__wt_txn_modify_ref(session, ref)); *skipp = true; WT_STAT_CONN_INCR(session, rec_page_delete_fast); WT_STAT_DATA_INCR(session, rec_page_delete_fast); WT_PUBLISH(ref->state, WT_REF_DELETED); return (0); err: __wt_free(session, ref->page_del); /* * Restore the page to on-disk status, we'll have to instantiate it. */ WT_PUBLISH(ref->state, WT_REF_DISK); return (ret); }
/* * __create_table -- * Create a table. */ static int __create_table(WT_SESSION_IMPL *session, const char *name, bool exclusive, const char *config) { WT_CONFIG conf; WT_CONFIG_ITEM cgkey, cgval, cval; WT_DECL_RET; WT_TABLE *table; const char *cfg[4] = { WT_CONFIG_BASE(session, table_meta), config, NULL, NULL }; const char *tablename; char *tableconf, *cgname; size_t cgsize; int ncolgroups; bool exists; cgname = NULL; table = NULL; tableconf = NULL; exists = false; tablename = name; if (!WT_PREFIX_SKIP(tablename, "table:")) return (__wt_unexpected_object_type(session, name, "table:")); if ((ret = __wt_schema_get_table(session, tablename, strlen(tablename), false, &table)) == 0) { if (exclusive) WT_ERR(EEXIST); exists = true; } WT_ERR_NOTFOUND_OK(ret); WT_ERR(__wt_config_gets(session, cfg, "colgroups", &cval)); __wt_config_subinit(session, &conf, &cval); for (ncolgroups = 0; (ret = __wt_config_next(&conf, &cgkey, &cgval)) == 0; ncolgroups++) ; WT_ERR_NOTFOUND_OK(ret); WT_ERR(__wt_config_collapse(session, cfg, &tableconf)); if (!exists) { WT_ERR(__wt_metadata_insert(session, name, tableconf)); /* Attempt to open the table now to catch any errors. */ WT_ERR(__wt_schema_get_table( session, tablename, strlen(tablename), true, &table)); if (ncolgroups == 0) { cgsize = strlen("colgroup:") + strlen(tablename) + 1; WT_ERR(__wt_calloc_def(session, cgsize, &cgname)); snprintf(cgname, cgsize, "colgroup:%s", tablename); WT_ERR(__create_colgroup( session, cgname, exclusive, config)); } } if (0) { err: if (table != NULL) { WT_TRET(__wt_schema_remove_table(session, table)); table = NULL; } } if (table != NULL) __wt_schema_release_table(session, table); __wt_free(session, cgname); __wt_free(session, tableconf); return (ret); }
/* * __wt_curlog_open -- * Initialize a log cursor. */ int __wt_curlog_open(WT_SESSION_IMPL *session, const char *uri, const char *cfg[], WT_CURSOR **cursorp) { WT_CONNECTION_IMPL *conn; WT_CURSOR_STATIC_INIT(iface, __wt_cursor_get_key, /* get-key */ __wt_cursor_get_value, /* get-value */ __wt_cursor_set_key, /* set-key */ __wt_cursor_set_value, /* set-value */ __curlog_compare, /* compare */ __wt_cursor_equals, /* equals */ __curlog_next, /* next */ __wt_cursor_notsup, /* prev */ __curlog_reset, /* reset */ __curlog_search, /* search */ __wt_cursor_notsup, /* search-near */ __wt_cursor_notsup, /* insert */ __wt_cursor_notsup, /* update */ __wt_cursor_notsup, /* remove */ __wt_cursor_notsup, /* reconfigure */ __curlog_close); /* close */ WT_CURSOR *cursor; WT_CURSOR_LOG *cl; WT_DECL_RET; WT_LOG *log; WT_STATIC_ASSERT(offsetof(WT_CURSOR_LOG, iface) == 0); conn = S2C(session); if (!FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED)) WT_RET_MSG(session, EINVAL, "Cannot open a log cursor without logging enabled"); log = conn->log; cl = NULL; WT_RET(__wt_calloc_one(session, &cl)); cursor = &cl->iface; *cursor = iface; cursor->session = &session->iface; WT_ERR(__wt_calloc_one(session, &cl->cur_lsn)); WT_ERR(__wt_calloc_one(session, &cl->next_lsn)); WT_ERR(__wt_scr_alloc(session, 0, &cl->logrec)); WT_ERR(__wt_scr_alloc(session, 0, &cl->opkey)); WT_ERR(__wt_scr_alloc(session, 0, &cl->opvalue)); cursor->key_format = LOGC_KEY_FORMAT; cursor->value_format = LOGC_VALUE_FORMAT; WT_INIT_LSN(cl->cur_lsn); WT_INIT_LSN(cl->next_lsn); WT_ERR(__wt_cursor_init(cursor, uri, NULL, cfg, cursorp)); /* Log cursors block archiving. */ WT_ERR(__wt_readlock(session, log->log_archive_lock)); if (0) { err: if (F_ISSET(cursor, WT_CURSTD_OPEN)) WT_TRET(cursor->close(cursor)); else { __wt_free(session, cl->cur_lsn); __wt_free(session, cl->next_lsn); __wt_scr_free(session, &cl->logrec); __wt_scr_free(session, &cl->opkey); __wt_scr_free(session, &cl->opvalue); /* * NOTE: We cannot get on the error path with the * readlock held. No need to unlock it unless that * changes above. */ __wt_free(session, cl); } *cursorp = NULL; } return (ret); }
/* * __free_page_col_var -- * Discard a WT_PAGE_COL_VAR page. */ static void __free_page_col_var(WT_SESSION_IMPL *session, WT_PAGE *page) { /* Free the RLE lookup array. */ __wt_free(session, page->u.col_var.repeats); }
/* * __wt_dirlist -- * Get a list of files from a directory, optionally filtered by * a given prefix. */ int __wt_dirlist(WT_SESSION_IMPL *session, const char *dir, const char *prefix, uint32_t flags, char ***dirlist, u_int *countp) { struct dirent *dp; DIR *dirp; WT_DECL_RET; size_t dirallocsz; u_int count, dirsz; bool match; char **entries, *path; *dirlist = NULL; *countp = 0; WT_RET(__wt_filename(session, dir, &path)); dirp = NULL; dirallocsz = 0; dirsz = 0; entries = NULL; if (flags == 0) LF_SET(WT_DIRLIST_INCLUDE); WT_ERR(__wt_verbose(session, WT_VERB_FILEOPS, "wt_dirlist of %s %s prefix %s", path, LF_ISSET(WT_DIRLIST_INCLUDE) ? "include" : "exclude", prefix == NULL ? "all" : prefix)); WT_SYSCALL_RETRY(((dirp = opendir(path)) == NULL ? 1 : 0), ret); if (ret != 0) WT_ERR_MSG(session, ret, "%s: opendir", path); for (dirsz = 0, count = 0; (dp = readdir(dirp)) != NULL;) { /* * Skip . and .. */ if (strcmp(dp->d_name, ".") == 0 || strcmp(dp->d_name, "..") == 0) continue; match = false; if (prefix != NULL && ((LF_ISSET(WT_DIRLIST_INCLUDE) && WT_PREFIX_MATCH(dp->d_name, prefix)) || (LF_ISSET(WT_DIRLIST_EXCLUDE) && !WT_PREFIX_MATCH(dp->d_name, prefix)))) match = true; if (prefix == NULL || match) { /* * We have a file name we want to return. */ count++; if (count > dirsz) { dirsz += WT_DIR_ENTRY; WT_ERR(__wt_realloc_def( session, &dirallocsz, dirsz, &entries)); } WT_ERR(__wt_strdup( session, dp->d_name, &entries[count-1])); } } if (count > 0) *dirlist = entries; *countp = count; err: if (dirp != NULL) (void)closedir(dirp); __wt_free(session, path); if (ret == 0) return (0); if (*dirlist != NULL) { for (count = dirsz; count > 0; count--) __wt_free(session, entries[count]); __wt_free(session, entries); } WT_RET_MSG(session, ret, "dirlist %s prefix %s", dir, prefix); }
/* * __wt_txn_recover -- * Run recovery. */ int __wt_txn_recover(WT_SESSION_IMPL *session) { WT_CONNECTION_IMPL *conn; WT_CURSOR *metac; WT_DECL_RET; WT_RECOVERY r; WT_RECOVERY_FILE *metafile; char *config; bool do_checkpoint, eviction_started, needs_rec, was_backup; conn = S2C(session); WT_CLEAR(r); WT_INIT_LSN(&r.ckpt_lsn); config = NULL; do_checkpoint = true; eviction_started = false; was_backup = F_ISSET(conn, WT_CONN_WAS_BACKUP); /* We need a real session for recovery. */ WT_RET(__wt_open_internal_session(conn, "txn-recover", false, WT_SESSION_NO_LOGGING, &session)); r.session = session; WT_MAX_LSN(&r.max_ckpt_lsn); WT_MAX_LSN(&r.max_rec_lsn); conn->txn_global.recovery_timestamp = conn->txn_global.meta_ckpt_timestamp = 0; F_SET(conn, WT_CONN_RECOVERING); WT_ERR(__wt_metadata_search(session, WT_METAFILE_URI, &config)); WT_ERR(__recovery_setup_file(&r, WT_METAFILE_URI, config)); WT_ERR(__wt_metadata_cursor_open(session, NULL, &metac)); metafile = &r.files[WT_METAFILE_ID]; metafile->c = metac; /* * If no log was found (including if logging is disabled), or if the * last checkpoint was done with logging disabled, recovery should not * run. Scan the metadata to figure out the largest file ID. */ if (!FLD_ISSET(conn->log_flags, WT_CONN_LOG_EXISTED) || WT_IS_MAX_LSN(&metafile->ckpt_lsn)) { /* * Detect if we're going from logging disabled to enabled. * We need to know this to verify LSNs and start at the correct * log file later. If someone ran with logging, then disabled * it and removed all the log files and then turned logging back * on, we have to start logging in the log file number that is * larger than any checkpoint LSN we have from the earlier time. */ WT_ERR(__recovery_file_scan(&r)); /* * The array can be re-allocated in recovery_file_scan. Reset * our pointer after scanning all the files. */ metafile = &r.files[WT_METAFILE_ID]; conn->next_file_id = r.max_fileid; if (FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED) && WT_IS_MAX_LSN(&metafile->ckpt_lsn) && !WT_IS_MAX_LSN(&r.max_ckpt_lsn)) WT_ERR(__wt_log_reset(session, r.max_ckpt_lsn.l.file)); else do_checkpoint = false; goto done; } /* * First, do a pass through the log to recover the metadata, and * establish the last checkpoint LSN. Skip this when opening a hot * backup: we already have the correct metadata in that case. * * If we're running with salvage and we hit an error, we ignore it * and continue. In salvage we want to recover whatever part of the * data we can from the last checkpoint up until whatever problem we * detect in the log file. In salvage, we ignore errors from scanning * the log so recovery can continue. Other errors remain errors. */ if (!was_backup) { r.metadata_only = true; /* * If this is a read-only connection, check if the checkpoint * LSN in the metadata file is up to date, indicating a clean * shutdown. */ if (F_ISSET(conn, WT_CONN_READONLY)) { WT_ERR(__wt_log_needs_recovery( session, &metafile->ckpt_lsn, &needs_rec)); if (needs_rec) WT_ERR_MSG(session, WT_RUN_RECOVERY, "Read-only database needs recovery"); } if (WT_IS_INIT_LSN(&metafile->ckpt_lsn)) ret = __wt_log_scan(session, NULL, WT_LOGSCAN_FIRST, __txn_log_recover, &r); else { /* * Start at the last checkpoint LSN referenced in the * metadata. If we see the end of a checkpoint while * scanning, we will change the full scan to start from * there. */ r.ckpt_lsn = metafile->ckpt_lsn; ret = __wt_log_scan(session, &metafile->ckpt_lsn, 0, __txn_log_recover, &r); } if (F_ISSET(conn, WT_CONN_SALVAGE)) ret = 0; /* * If log scan couldn't find a file we expected to be around, * this indicates a corruption of some sort. */ if (ret == ENOENT) { F_SET(conn, WT_CONN_DATA_CORRUPTION); ret = WT_ERROR; } WT_ERR(ret); } /* Scan the metadata to find the live files and their IDs. */ WT_ERR(__recovery_file_scan(&r)); /* * Clear this out. We no longer need it and it could have been * re-allocated when scanning the files. */ WT_NOT_READ(metafile, NULL); /* * We no longer need the metadata cursor: close it to avoid pinning any * resources that could block eviction during recovery. */ r.files[0].c = NULL; WT_ERR(metac->close(metac)); /* * Now, recover all the files apart from the metadata. * Pass WT_LOGSCAN_RECOVER so that old logs get truncated. */ r.metadata_only = false; __wt_verbose(session, WT_VERB_RECOVERY | WT_VERB_RECOVERY_PROGRESS, "Main recovery loop: starting at %" PRIu32 "/%" PRIu32 " to %" PRIu32 "/%" PRIu32, r.ckpt_lsn.l.file, r.ckpt_lsn.l.offset, r.max_rec_lsn.l.file, r.max_rec_lsn.l.offset); WT_ERR(__wt_log_needs_recovery(session, &r.ckpt_lsn, &needs_rec)); /* * Check if the database was shut down cleanly. If not * return an error if the user does not want automatic * recovery. */ if (needs_rec && (FLD_ISSET(conn->log_flags, WT_CONN_LOG_RECOVER_ERR) || F_ISSET(conn, WT_CONN_READONLY))) { if (F_ISSET(conn, WT_CONN_READONLY)) WT_ERR_MSG(session, WT_RUN_RECOVERY, "Read-only database needs recovery"); WT_ERR_MSG(session, WT_RUN_RECOVERY, "Database needs recovery"); } if (F_ISSET(conn, WT_CONN_READONLY)) { do_checkpoint = false; goto done; } /* * Recovery can touch more data than fits in cache, so it relies on * regular eviction to manage paging. Start eviction threads for * recovery without LAS cursors. */ WT_ERR(__wt_evict_create(session)); eviction_started = true; /* * Always run recovery even if it was a clean shutdown only if * this is not a read-only connection. * We can consider skipping it in the future. */ if (needs_rec) FLD_SET(conn->log_flags, WT_CONN_LOG_RECOVER_DIRTY); if (WT_IS_INIT_LSN(&r.ckpt_lsn)) ret = __wt_log_scan(session, NULL, WT_LOGSCAN_FIRST | WT_LOGSCAN_RECOVER, __txn_log_recover, &r); else ret = __wt_log_scan(session, &r.ckpt_lsn, WT_LOGSCAN_RECOVER, __txn_log_recover, &r); if (F_ISSET(conn, WT_CONN_SALVAGE)) ret = 0; WT_ERR(ret); conn->next_file_id = r.max_fileid; done: WT_ERR(__recovery_set_checkpoint_timestamp(&r)); if (do_checkpoint) /* * Forcibly log a checkpoint so the next open is fast and keep * the metadata up to date with the checkpoint LSN and * archiving. */ WT_ERR(session->iface.checkpoint(&session->iface, "force=1")); /* * If we're downgrading and have newer log files, force an archive, * no matter what the archive setting is. */ if (FLD_ISSET(conn->log_flags, WT_CONN_LOG_FORCE_DOWNGRADE)) WT_ERR(__wt_log_truncate_files(session, NULL, true)); FLD_SET(conn->log_flags, WT_CONN_LOG_RECOVER_DONE); err: WT_TRET(__recovery_free(&r)); __wt_free(session, config); FLD_CLR(conn->log_flags, WT_CONN_LOG_RECOVER_DIRTY); if (ret != 0) { FLD_SET(conn->log_flags, WT_CONN_LOG_RECOVER_FAILED); __wt_err(session, ret, "Recovery failed"); } /* * Destroy the eviction threads that were started in support of * recovery. They will be restarted once the lookaside table is * created. */ if (eviction_started) WT_TRET(__wt_evict_destroy(session)); WT_TRET(session->iface.close(&session->iface, NULL)); F_CLR(conn, WT_CONN_RECOVERING); return (ret); }
/* * __wt_page_alloc -- * Create or read a page into the cache. */ int __wt_page_alloc(WT_SESSION_IMPL *session, uint8_t type, uint64_t recno, uint32_t alloc_entries, int alloc_refs, WT_PAGE **pagep) { WT_CACHE *cache; WT_DECL_RET; WT_PAGE *page; WT_PAGE_INDEX *pindex; size_t size; uint32_t i; void *p; *pagep = NULL; cache = S2C(session)->cache; page = NULL; size = sizeof(WT_PAGE); switch (type) { case WT_PAGE_COL_FIX: case WT_PAGE_COL_INT: case WT_PAGE_ROW_INT: break; case WT_PAGE_COL_VAR: /* * Variable-length column-store leaf page: allocate memory to * describe the page's contents with the initial allocation. */ size += alloc_entries * sizeof(WT_COL); break; case WT_PAGE_ROW_LEAF: /* * Row-store leaf page: allocate memory to describe the page's * contents with the initial allocation. */ size += alloc_entries * sizeof(WT_ROW); break; WT_ILLEGAL_VALUE(session); } WT_RET(__wt_calloc(session, 1, size, &page)); page->type = type; page->read_gen = WT_READGEN_NOTSET; switch (type) { case WT_PAGE_COL_FIX: page->pg_fix_recno = recno; page->pg_fix_entries = alloc_entries; break; case WT_PAGE_COL_INT: case WT_PAGE_ROW_INT: page->pg_intl_recno = recno; /* * Internal pages have an array of references to objects so they * can split. Allocate the array of references and optionally, * the objects to which they point. */ WT_ERR(__wt_calloc(session, 1, sizeof(WT_PAGE_INDEX) + alloc_entries * sizeof(WT_REF *), &p)); size += sizeof(WT_PAGE_INDEX) + alloc_entries * sizeof(WT_REF *); pindex = p; pindex->index = (WT_REF **)((WT_PAGE_INDEX *)p + 1); pindex->entries = alloc_entries; WT_INTL_INDEX_SET(page, pindex); if (alloc_refs) for (i = 0; i < pindex->entries; ++i) { WT_ERR(__wt_calloc_one( session, &pindex->index[i])); size += sizeof(WT_REF); } if (0) { err: if ((pindex = WT_INTL_INDEX_COPY(page)) != NULL) { for (i = 0; i < pindex->entries; ++i) __wt_free(session, pindex->index[i]); __wt_free(session, pindex); } __wt_free(session, page); return (ret); } break; case WT_PAGE_COL_VAR: page->pg_var_recno = recno; page->pg_var_d = (WT_COL *)((uint8_t *)page + sizeof(WT_PAGE)); page->pg_var_entries = alloc_entries; break; case WT_PAGE_ROW_LEAF: page->pg_row_d = (WT_ROW *)((uint8_t *)page + sizeof(WT_PAGE)); page->pg_row_entries = alloc_entries; break; WT_ILLEGAL_VALUE(session); } /* Increment the cache statistics. */ __wt_cache_page_inmem_incr(session, page, size); (void)WT_ATOMIC_ADD8(cache->bytes_read, size); (void)WT_ATOMIC_ADD8(cache->pages_inmem, 1); *pagep = page; return (0); }
/* * __wt_curfile_create -- * Open a cursor for a given btree handle. */ int __wt_curfile_create(WT_SESSION_IMPL *session, WT_CURSOR *owner, const char *cfg[], int bulk, int bitmap, WT_CURSOR **cursorp) { WT_CURSOR_STATIC_INIT(iface, __wt_cursor_get_key, /* get-key */ __wt_cursor_get_value, /* get-value */ __wt_cursor_set_key, /* set-key */ __wt_cursor_set_value, /* set-value */ __curfile_compare, /* compare */ __curfile_equals, /* equals */ __curfile_next, /* next */ __curfile_prev, /* prev */ __curfile_reset, /* reset */ __curfile_search, /* search */ __curfile_search_near, /* search-near */ __curfile_insert, /* insert */ __curfile_update, /* update */ __curfile_remove, /* remove */ __wt_cursor_reconfigure, /* reconfigure */ __curfile_close); /* close */ WT_BTREE *btree; WT_CONFIG_ITEM cval; WT_CURSOR *cursor; WT_CURSOR_BTREE *cbt; WT_CURSOR_BULK *cbulk; WT_DECL_RET; size_t csize; WT_STATIC_ASSERT(offsetof(WT_CURSOR_BTREE, iface) == 0); cbt = NULL; btree = S2BT(session); WT_ASSERT(session, btree != NULL); csize = bulk ? sizeof(WT_CURSOR_BULK) : sizeof(WT_CURSOR_BTREE); WT_RET(__wt_calloc(session, 1, csize, &cbt)); cursor = &cbt->iface; *cursor = iface; cursor->session = &session->iface; cursor->internal_uri = btree->dhandle->name; cursor->key_format = btree->key_format; cursor->value_format = btree->value_format; cbt->btree = btree; if (bulk) { F_SET(cursor, WT_CURSTD_BULK); cbulk = (WT_CURSOR_BULK *)cbt; /* Optionally skip the validation of each bulk-loaded key. */ WT_ERR(__wt_config_gets_def( session, cfg, "skip_sort_check", 0, &cval)); WT_ERR(__wt_curbulk_init( session, cbulk, bitmap, cval.val == 0 ? 0 : 1)); } /* * random_retrieval * Random retrieval cursors only support next, reset and close. */ WT_ERR(__wt_config_gets_def(session, cfg, "next_random", 0, &cval)); if (cval.val != 0) { __wt_cursor_set_notsup(cursor); cursor->next = __curfile_next_random; cursor->reset = __curfile_reset; } /* Underlying btree initialization. */ __wt_btcur_open(cbt); /* __wt_cursor_init is last so we don't have to clean up on error. */ WT_ERR(__wt_cursor_init( cursor, cursor->internal_uri, owner, cfg, cursorp)); WT_STAT_FAST_CONN_INCR(session, cursor_create); WT_STAT_FAST_DATA_INCR(session, cursor_create); if (0) { err: __wt_free(session, cbt); } return (ret); }
/* * __las_page_instantiate -- * Instantiate lookaside update records in a recently read page. */ static int __las_page_instantiate(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t read_id, const uint8_t *addr, size_t addr_size) { WT_CURSOR *cursor; WT_CURSOR_BTREE cbt; WT_DECL_ITEM(current_key); WT_DECL_ITEM(las_addr); WT_DECL_ITEM(las_key); WT_DECL_ITEM(las_value); WT_DECL_RET; WT_PAGE *page; WT_UPDATE *first_upd, *last_upd, *upd; size_t incr, total_incr; uint64_t current_recno, las_counter, las_txnid, recno, upd_txnid; uint32_t las_id, upd_size, session_flags; int exact; const uint8_t *p; cursor = NULL; page = ref->page; first_upd = last_upd = upd = NULL; total_incr = 0; current_recno = recno = WT_RECNO_OOB; session_flags = 0; /* [-Werror=maybe-uninitialized] */ __wt_btcur_init(session, &cbt); __wt_btcur_open(&cbt); WT_ERR(__wt_scr_alloc(session, 0, ¤t_key)); WT_ERR(__wt_scr_alloc(session, 0, &las_addr)); WT_ERR(__wt_scr_alloc(session, 0, &las_key)); WT_ERR(__wt_scr_alloc(session, 0, &las_value)); /* Open a lookaside table cursor. */ WT_ERR(__wt_las_cursor(session, &cursor, &session_flags)); /* * The lookaside records are in key and update order, that is, there * will be a set of in-order updates for a key, then another set of * in-order updates for a subsequent key. We process all of the updates * for a key and then insert those updates into the page, then all the * updates for the next key, and so on. * * Search for the block's unique prefix, stepping through any matching * records. */ las_addr->data = addr; las_addr->size = addr_size; las_key->size = 0; cursor->set_key( cursor, read_id, las_addr, (uint64_t)0, (uint32_t)0, las_key); if ((ret = cursor->search_near(cursor, &exact)) == 0 && exact < 0) ret = cursor->next(cursor); for (; ret == 0; ret = cursor->next(cursor)) { WT_ERR(cursor->get_key(cursor, &las_id, las_addr, &las_counter, &las_txnid, las_key)); /* * Confirm the search using the unique prefix; if not a match, * we're done searching for records for this page. */ if (las_id != read_id || las_addr->size != addr_size || memcmp(las_addr->data, addr, addr_size) != 0) break; /* * If the on-page value has become globally visible, this record * is no longer needed. */ if (__wt_txn_visible_all(session, las_txnid)) continue; /* Allocate the WT_UPDATE structure. */ WT_ERR(cursor->get_value( cursor, &upd_txnid, &upd_size, las_value)); WT_ERR(__wt_update_alloc(session, (upd_size == WT_UPDATE_DELETED_VALUE) ? NULL : las_value, &upd, &incr)); total_incr += incr; upd->txnid = upd_txnid; switch (page->type) { case WT_PAGE_COL_FIX: case WT_PAGE_COL_VAR: p = las_key->data; WT_ERR(__wt_vunpack_uint(&p, 0, &recno)); if (current_recno == recno) break; WT_ASSERT(session, current_recno < recno); if (first_upd != NULL) { WT_ERR(__col_instantiate(session, current_recno, ref, &cbt, first_upd)); first_upd = NULL; } current_recno = recno; break; case WT_PAGE_ROW_LEAF: if (current_key->size == las_key->size && memcmp(current_key->data, las_key->data, las_key->size) == 0) break; if (first_upd != NULL) { WT_ERR(__row_instantiate(session, current_key, ref, &cbt, first_upd)); first_upd = NULL; } WT_ERR(__wt_buf_set(session, current_key, las_key->data, las_key->size)); break; WT_ILLEGAL_VALUE_ERR(session); } /* Append the latest update to the list. */ if (first_upd == NULL) first_upd = last_upd = upd; else { last_upd->next = upd; last_upd = upd; } upd = NULL; } WT_ERR_NOTFOUND_OK(ret); /* Insert the last set of updates, if any. */ if (first_upd != NULL) switch (page->type) { case WT_PAGE_COL_FIX: case WT_PAGE_COL_VAR: WT_ERR(__col_instantiate(session, current_recno, ref, &cbt, first_upd)); first_upd = NULL; break; case WT_PAGE_ROW_LEAF: WT_ERR(__row_instantiate(session, current_key, ref, &cbt, first_upd)); first_upd = NULL; break; WT_ILLEGAL_VALUE_ERR(session); } /* Discard the cursor. */ WT_ERR(__wt_las_cursor_close(session, &cursor, session_flags)); if (total_incr != 0) { __wt_cache_page_inmem_incr(session, page, total_incr); /* * We've modified/dirtied the page, but that's not necessary and * if we keep the page clean, it's easier to evict. We leave the * lookaside table updates in place, so if we evict this page * without dirtying it, any future instantiation of it will find * the records it needs. If the page is dirtied before eviction, * then we'll write any needed lookaside table records for the * new location of the page. */ __wt_page_modify_clear(session, page); } err: WT_TRET(__wt_las_cursor_close(session, &cursor, session_flags)); WT_TRET(__wt_btcur_close(&cbt, true)); /* * On error, upd points to a single unlinked WT_UPDATE structure, * first_upd points to a list. */ if (upd != NULL) __wt_free(session, upd); if (first_upd != NULL) __wt_free_update_list(session, first_upd); __wt_scr_free(session, ¤t_key); __wt_scr_free(session, &las_addr); __wt_scr_free(session, &las_key); __wt_scr_free(session, &las_value); return (ret); }
/* * __wt_session_compact -- * WT_SESSION.compact method. */ int __wt_session_compact( WT_SESSION *wt_session, const char *uri, const char *config) { WT_COMPACT_STATE compact; WT_CONFIG_ITEM cval; WT_DATA_SOURCE *dsrc; WT_DECL_RET; WT_SESSION_IMPL *session; u_int i; bool ignore_cache_size_set; ignore_cache_size_set = false; session = (WT_SESSION_IMPL *)wt_session; SESSION_API_CALL(session, compact, config, cfg); /* * The compaction thread should not block when the cache is full: it is * holding locks blocking checkpoints and once the cache is full, it can * spend a long time doing eviction. */ if (!F_ISSET(session, WT_SESSION_IGNORE_CACHE_SIZE)) { ignore_cache_size_set = true; F_SET(session, WT_SESSION_IGNORE_CACHE_SIZE); } /* In-memory ignores compaction operations. */ if (F_ISSET(S2C(session), WT_CONN_IN_MEMORY)) goto err; /* * Non-LSM object compaction requires checkpoints, which are impossible * in transactional contexts. Disallow in all contexts (there's no * reason for LSM to allow this, possible or not), and check now so the * error message isn't confusing. */ WT_ERR(__wt_txn_context_check(session, false)); /* Disallow objects in the WiredTiger name space. */ WT_ERR(__wt_str_name_check(session, uri)); if (!WT_PREFIX_MATCH(uri, "colgroup:") && !WT_PREFIX_MATCH(uri, "file:") && !WT_PREFIX_MATCH(uri, "index:") && !WT_PREFIX_MATCH(uri, "lsm:") && !WT_PREFIX_MATCH(uri, "table:")) { if ((dsrc = __wt_schema_get_source(session, uri)) != NULL) ret = dsrc->compact == NULL ? __wt_object_unsupported(session, uri) : dsrc->compact( dsrc, wt_session, uri, (WT_CONFIG_ARG *)cfg); else ret = __wt_bad_object_type(session, uri); goto err; } /* Setup the session handle's compaction state structure. */ memset(&compact, 0, sizeof(WT_COMPACT_STATE)); session->compact = &compact; /* Compaction can be time-limited. */ WT_ERR(__wt_config_gets(session, cfg, "timeout", &cval)); session->compact->max_time = (uint64_t)cval.val; __wt_epoch(session, &session->compact->begin); /* Find the types of data sources being compacted. */ WT_WITH_SCHEMA_LOCK(session, ret = __wt_schema_worker(session, uri, __compact_handle_append, __compact_uri_analyze, cfg, 0)); WT_ERR(ret); if (session->compact->lsm_count != 0) WT_ERR(__wt_schema_worker( session, uri, NULL, __wt_lsm_compact, cfg, 0)); if (session->compact->file_count != 0) WT_ERR(__compact_worker(session)); err: session->compact = NULL; for (i = 0; i < session->op_handle_next; ++i) { WT_WITH_DHANDLE(session, session->op_handle[i], WT_TRET(__compact_end(session))); WT_WITH_DHANDLE(session, session->op_handle[i], WT_TRET(__wt_session_release_dhandle(session))); } __wt_free(session, session->op_handle); session->op_handle_allocated = session->op_handle_next = 0; /* * Release common session resources (for example, checkpoint may acquire * significant reconciliation structures/memory). */ WT_TRET(__wt_session_release_resources(session)); if (ignore_cache_size_set) F_CLR(session, WT_SESSION_IGNORE_CACHE_SIZE); if (ret != 0) WT_STAT_CONN_INCR(session, session_table_compact_fail); else WT_STAT_CONN_INCR(session, session_table_compact_success); API_END_RET_NOTFOUND_MAP(session, ret); }
/* * __wt_col_modify -- * Column-store delete, insert, and update. */ int __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, int op) { WT_BTREE *btree; WT_DECL_RET; WT_INSERT *ins, *ins_copy; WT_INSERT_HEAD **inshead, *new_inshead, **new_inslist; WT_ITEM *value, _value; WT_PAGE *page; WT_UPDATE *old_upd, *upd, *upd_obsolete; size_t ins_size, new_inshead_size, new_inslist_size, upd_size; uint64_t recno; u_int skipdepth; int i, logged; btree = cbt->btree; page = cbt->page; recno = cbt->iface.recno; logged = 0; WT_ASSERT(session, op != 1); switch (op) { case 2: /* Remove */ if (btree->type == BTREE_COL_FIX) { value = &_value; value->data = ""; value->size = 1; } else value = NULL; break; case 3: /* Insert/Update */ default: value = &cbt->iface.value; /* * There's some chance the application specified a record past * the last record on the page. If that's the case, and we're * inserting a new WT_INSERT/WT_UPDATE pair, it goes on the * append list, not the update list. */ if (recno == 0 || recno > __col_last_recno(page)) op = 1; break; } /* If we don't yet have a modify structure, we'll need one. */ WT_RET(__wt_page_modify_init(session, page)); ins = NULL; new_inshead = NULL; new_inslist = NULL; upd = NULL; /* * Delete, insert or update a column-store entry. * * If modifying a previously modified record, create a new WT_UPDATE * entry and have a serialized function link it into an existing * WT_INSERT entry's WT_UPDATE list. * * Else, allocate an insert array as necessary, build a WT_INSERT and * WT_UPDATE structure pair, and call a serialized function to insert * the WT_INSERT structure. */ if (cbt->compare == 0 && cbt->ins != NULL) { /* Make sure the update can proceed. */ WT_ERR( __wt_update_check(session, page, old_upd = cbt->ins->upd)); /* Allocate the WT_UPDATE structure and transaction ID. */ WT_ERR(__wt_update_alloc(session, value, &upd, &upd_size)); WT_ERR(__wt_txn_modify(session, &upd->txnid)); logged = 1; /* Serialize the update. */ WT_ERR(__wt_update_serial(session, page, cbt->write_gen, &cbt->ins->upd, old_upd, NULL, 0, &upd, upd_size, &upd_obsolete)); /* Discard any obsolete WT_UPDATE structures. */ if (upd_obsolete != NULL) __wt_update_obsolete_free(session, page, upd_obsolete); } else { /* Make sure the update can proceed. */ WT_ERR(__wt_update_check(session, page, NULL)); /* There may be no insert list, allocate as necessary. */ new_inshead_size = new_inslist_size = 0; if (op == 1) { if (page->modify->append == NULL) { new_inslist_size = 1 * sizeof(WT_INSERT_HEAD *); WT_ERR( __wt_calloc_def(session, 1, &new_inslist)); inshead = &new_inslist[0]; } else inshead = &page->modify->append[0]; cbt->ins_head = *inshead; } else if (page->type == WT_PAGE_COL_FIX) { if (page->modify->update == NULL) { new_inslist_size = 1 * sizeof(WT_INSERT_HEAD *); WT_ERR( __wt_calloc_def(session, 1, &new_inslist)); inshead = &new_inslist[0]; } else inshead = &page->modify->update[0]; } else { if (page->modify->update == NULL) { new_inslist_size = page->entries * sizeof(WT_INSERT_HEAD *); WT_ERR(__wt_calloc_def( session, page->entries, &new_inslist)); inshead = &new_inslist[cbt->slot]; } else inshead = &page->modify->update[cbt->slot]; } /* There may be no WT_INSERT list, allocate as necessary. */ if (*inshead == NULL) { new_inshead_size = sizeof(WT_INSERT_HEAD); WT_ERR(__wt_calloc_def(session, 1, &new_inshead)); for (i = 0; i < WT_SKIP_MAXDEPTH; i++) { cbt->ins_stack[i] = &new_inshead->head[i]; cbt->next_stack[i] = NULL; } cbt->ins_head = new_inshead; } /* Choose a skiplist depth for this insert. */ skipdepth = __wt_skip_choose_depth(); /* * Allocate a WT_INSERT/WT_UPDATE pair and transaction ID, and * update the cursor to reference it. */ WT_ERR(__col_insert_alloc( session, recno, skipdepth, &ins, &ins_size)); WT_ERR(__wt_update_alloc(session, value, &upd, &upd_size)); WT_ERR(__wt_txn_modify(session, &upd->txnid)); logged = 1; ins->upd = upd; ins_size += upd_size; cbt->ins = ins; /* Insert or append the WT_INSERT structure. */ if (op == 1) { /* * The serialized function clears ins: take a copy of * the pointer so we can look up the record number. */ ins_copy = ins; WT_ERR(__wt_col_append_serial(session, page, cbt->write_gen, inshead, cbt->ins_stack, cbt->next_stack, &new_inslist, new_inslist_size, &new_inshead, new_inshead_size, &ins, ins_size, skipdepth)); /* Put the new recno into the cursor. */ cbt->recno = WT_INSERT_RECNO(ins_copy); } else WT_ERR(__wt_insert_serial(session, page, cbt->write_gen, inshead, cbt->ins_stack, cbt->next_stack, &new_inslist, new_inslist_size, &new_inshead, new_inshead_size, &ins, ins_size, skipdepth)); } if (0) { err: /* * Remove the update from the current transaction, so we don't * try to modify it on rollback. */ if (logged) __wt_txn_unmodify(session); __wt_free(session, ins); __wt_free(session, upd); } __wt_free(session, new_inslist); __wt_free(session, new_inshead); return (ret); }
/* * __rec_page_dirty_update -- * Update a dirty page's reference on eviction. */ static int __rec_page_dirty_update(WT_SESSION_IMPL *session, WT_PAGE *page) { WT_ADDR *addr; WT_PAGE_MODIFY *mod; WT_REF *parent_ref; mod = page->modify; parent_ref = page->ref; switch (F_ISSET(mod, WT_PM_REC_MASK)) { case WT_PM_REC_EMPTY: /* Page is empty */ if (parent_ref->addr != NULL && __wt_off_page(page->parent, parent_ref->addr)) { __wt_free(session, ((WT_ADDR *)parent_ref->addr)->addr); __wt_free(session, parent_ref->addr); } /* * Update the parent to reference an empty page. * * Set the transaction ID to WT_TXN_NONE because the fact that * reconciliation left the page "empty" means there's no older * transaction in the system that might need to see an earlier * version of the page. It isn't necessary (WT_TXN_NONE is 0), * but it's the right thing to do. * * Publish: a barrier to ensure the structure fields are set * before the state change makes the page available to readers. */ parent_ref->page = NULL; parent_ref->addr = NULL; parent_ref->txnid = WT_TXN_NONE; WT_PUBLISH(parent_ref->state, WT_REF_DELETED); break; case WT_PM_REC_REPLACE: /* 1-for-1 page swap */ if (parent_ref->addr != NULL && __wt_off_page(page->parent, parent_ref->addr)) { __wt_free(session, ((WT_ADDR *)parent_ref->addr)->addr); __wt_free(session, parent_ref->addr); } /* * Update the parent to reference the replacement page. * * Publish: a barrier to ensure the structure fields are set * before the state change makes the page available to readers. */ WT_RET(__wt_calloc(session, 1, sizeof(WT_ADDR), &addr)); *addr = mod->u.replace; mod->u.replace.addr = NULL; mod->u.replace.size = 0; parent_ref->page = NULL; parent_ref->addr = addr; WT_PUBLISH(parent_ref->state, WT_REF_DISK); break; case WT_PM_REC_SPLIT: /* Page split */ /* * Update the parent to reference new internal page(s). * * Publish: a barrier to ensure the structure fields are set * before the state change makes the page available to readers. */ parent_ref->page = mod->u.split; WT_PUBLISH(parent_ref->state, WT_REF_MEM); /* Clear the reference else discarding the page will free it. */ mod->u.split = NULL; F_CLR(mod, WT_PM_REC_SPLIT); break; WT_ILLEGAL_VALUE(session); } return (0); }
/* * __wt_col_modify -- * Column-store delete, insert, and update. */ int __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, uint64_t recno, const WT_ITEM *value, WT_UPDATE *upd_arg, u_int modify_type, bool exclusive) { static const WT_ITEM col_fix_remove = { "", 1, NULL, 0, 0 }; WT_BTREE *btree; WT_DECL_RET; WT_INSERT *ins; WT_INSERT_HEAD *ins_head, **ins_headp; WT_PAGE *page; WT_PAGE_MODIFY *mod; WT_UPDATE *old_upd, *upd; size_t ins_size, upd_size; u_int i, skipdepth; bool append, logged; btree = cbt->btree; ins = NULL; page = cbt->ref->page; upd = upd_arg; append = logged = false; if (upd_arg == NULL) { if (modify_type == WT_UPDATE_RESERVE || modify_type == WT_UPDATE_TOMBSTONE) { /* * Fixed-size column-store doesn't have on-page deleted * values, it's a nul byte. */ if (modify_type == WT_UPDATE_TOMBSTONE && btree->type == BTREE_COL_FIX) { modify_type = WT_UPDATE_STANDARD; value = &col_fix_remove; } } /* * There's a chance the application specified a record past the * last record on the page. If that's the case and we're * inserting a new WT_INSERT/WT_UPDATE pair, it goes on the * append list, not the update list. Also, an out-of-band recno * implies an append operation, we're allocating a new row. * Ignore any information obtained from the search. */ WT_ASSERT(session, recno != WT_RECNO_OOB || cbt->compare != 0); if (cbt->compare != 0 && (recno == WT_RECNO_OOB || recno > (btree->type == BTREE_COL_VAR ? __col_var_last_recno(cbt->ref) : __col_fix_last_recno(cbt->ref)))) { append = true; cbt->ins = NULL; cbt->ins_head = NULL; } } /* We're going to modify the page, we should have loaded history. */ WT_ASSERT(session, cbt->ref->state != WT_REF_LIMBO); /* If we don't yet have a modify structure, we'll need one. */ WT_RET(__wt_page_modify_init(session, page)); mod = page->modify; /* * If modifying a record not previously modified, but which is in the * same update slot as a previously modified record, cursor.ins will * not be set because there's no list of update records for this recno, * but cursor.ins_head will be set to point to the correct update slot. * Acquire the necessary insert information, then create a new update * entry and link it into the existing list. We get here if a page has * a single cell representing multiple records (the records have the * same value), and then a record in the cell is updated or removed, * creating the update list for the cell, and then a cursor iterates * into that same cell to update/remove a different record. We find the * correct slot in the update array, but we don't find an update list * (because it doesn't exist), and don't have the information we need * to do the insert. Normally, we wouldn't care (we could fail and do * a search for the record which would configure everything for the * insert), but range truncation does this pattern for every record in * the cell, and the performance is terrible. For that reason, catch it * here. */ if (cbt->ins == NULL && cbt->ins_head != NULL) { cbt->ins = __col_insert_search( cbt->ins_head, cbt->ins_stack, cbt->next_stack, recno); if (cbt->ins != NULL) { if (WT_INSERT_RECNO(cbt->ins) == recno) cbt->compare = 0; else { /* * The test below is for cursor.compare set to 0 * and cursor.ins set: cursor.compare wasn't set * by the search we just did, and has an unknown * value. Clear cursor.ins to avoid the test. */ cbt->ins = NULL; } } } /* * Delete, insert or update a column-store entry. * * If modifying a previously modified record, cursor.ins will be set to * point to the correct update list. Create a new update entry and link * it into the existing list. * * Else, allocate an insert array as necessary, build an insert/update * structure pair, and link it into place. */ if (cbt->compare == 0 && cbt->ins != NULL) { /* * If we are restoring updates that couldn't be evicted, the * key must not exist on the new page. */ WT_ASSERT(session, upd_arg == NULL); /* Make sure the update can proceed. */ WT_ERR(__wt_txn_update_check(session, old_upd = cbt->ins->upd)); /* Allocate a WT_UPDATE structure and transaction ID. */ WT_ERR(__wt_update_alloc(session, value, &upd, &upd_size, modify_type)); WT_ERR(__wt_txn_modify(session, upd)); logged = true; /* Avoid a data copy in WT_CURSOR.update. */ cbt->modify_update = upd; /* * Point the new WT_UPDATE item to the next element in the list. * If we get it right, the serialization function lock acts as * our memory barrier to flush this write. */ upd->next = old_upd; /* Serialize the update. */ WT_ERR(__wt_update_serial( session, page, &cbt->ins->upd, &upd, upd_size, false)); } else { /* Allocate the append/update list reference as necessary. */ if (append) { WT_PAGE_ALLOC_AND_SWAP(session, page, mod->mod_col_append, ins_headp, 1); ins_headp = &mod->mod_col_append[0]; } else if (page->type == WT_PAGE_COL_FIX) { WT_PAGE_ALLOC_AND_SWAP(session, page, mod->mod_col_update, ins_headp, 1); ins_headp = &mod->mod_col_update[0]; } else { WT_PAGE_ALLOC_AND_SWAP(session, page, mod->mod_col_update, ins_headp, page->entries); ins_headp = &mod->mod_col_update[cbt->slot]; } /* Allocate the WT_INSERT_HEAD structure as necessary. */ WT_PAGE_ALLOC_AND_SWAP(session, page, *ins_headp, ins_head, 1); ins_head = *ins_headp; /* Choose a skiplist depth for this insert. */ skipdepth = __wt_skip_choose_depth(session); /* * Allocate a WT_INSERT/WT_UPDATE pair and transaction ID, and * update the cursor to reference it (the WT_INSERT_HEAD might * be allocated, the WT_INSERT was allocated). */ WT_ERR(__col_insert_alloc( session, recno, skipdepth, &ins, &ins_size)); cbt->ins_head = ins_head; cbt->ins = ins; /* * Check for insert split and checkpoint races in column-store: * it's easy (as opposed to in row-store) and a difficult bug to * otherwise diagnose. */ WT_ASSERT(session, mod->mod_col_split_recno == WT_RECNO_OOB || (recno != WT_RECNO_OOB && mod->mod_col_split_recno > recno)); if (upd_arg == NULL) { WT_ERR(__wt_update_alloc(session, value, &upd, &upd_size, modify_type)); WT_ERR(__wt_txn_modify(session, upd)); logged = true; /* Avoid a data copy in WT_CURSOR.update. */ cbt->modify_update = upd; } else upd_size = __wt_update_list_memsize(upd); ins->upd = upd; ins_size += upd_size; /* * If there was no insert list during the search, or there was * no search because the record number has not been allocated * yet, the cursor's information cannot be correct, search * couldn't have initialized it. * * Otherwise, point the new WT_INSERT item's skiplist to the * next elements in the insert list (which we will check are * still valid inside the serialization function). * * The serial mutex acts as our memory barrier to flush these * writes before inserting them into the list. */ if (cbt->ins_stack[0] == NULL || recno == WT_RECNO_OOB) for (i = 0; i < skipdepth; i++) { cbt->ins_stack[i] = &ins_head->head[i]; ins->next[i] = cbt->next_stack[i] = NULL; } else for (i = 0; i < skipdepth; i++) ins->next[i] = cbt->next_stack[i]; /* Append or insert the WT_INSERT structure. */ if (append) WT_ERR(__wt_col_append_serial( session, page, cbt->ins_head, cbt->ins_stack, &ins, ins_size, &cbt->recno, skipdepth, exclusive)); else WT_ERR(__wt_insert_serial( session, page, cbt->ins_head, cbt->ins_stack, &ins, ins_size, skipdepth, exclusive)); } /* If the update was successful, add it to the in-memory log. */ if (logged && modify_type != WT_UPDATE_RESERVE) { WT_ERR(__wt_txn_log_op(session, cbt)); /* * In case of append, the recno (key) for the value is assigned * now. Set the recno in the transaction operation to be used * incase this transaction is prepared to retrieve the update * corresponding to this operation. */ __wt_txn_op_set_recno(session, cbt->recno); } if (0) { err: /* * Remove the update from the current transaction, so we don't * try to modify it on rollback. */ if (logged) __wt_txn_unmodify(session); __wt_free(session, ins); if (upd_arg == NULL) __wt_free(session, upd); } return (ret); }