/*
 * __spin_lock_next_id --
 *     Assign a spinlock caller ID to a location the first time it is seen.
 *
 * If *idp already holds something other than WT_SPINLOCK_REGISTER, the location
 * has been registered and nothing is done. Otherwise the next free ID is stored
 * in *idp; when WT_SPINLOCK_MAX_LOCATION_ID IDs have been handed out the call
 * fails with ENOMEM.
 */
static int
__spin_lock_next_id(WT_SESSION_IMPL *session, int *idp)
{
    static int lock_id = 0, next_id = 0;
    WT_DECL_RET;

    /* Fast path: a location that was ever registered keeps its ID. */
    if (*idp != WT_SPINLOCK_REGISTER)
        return (0);

    /*
     * The global spinlock can't protect the ID allocation (it is what we are
     * setting up), so serialize on a local variable with a CAS instruction.
     * This work only gets done once per library instantiation, there isn't a
     * performance concern.
     */
    for (;;) {
        if (WT_ATOMIC_CAS(lock_id, 0, 1))
            break;
        __wt_yield();
    }

    /* Another thread may have registered the location while we waited. */
    if (*idp != WT_SPINLOCK_REGISTER)
        goto err;

    /* Allocate a blocking ID for this location, if any are left. */
    if (next_id >= WT_SPINLOCK_MAX_LOCATION_ID)
        WT_ERR_MSG(session, ENOMEM,
          "spinlock caller location registry failed, "
          "increase the connection's blocking matrix size");
    *idp = next_id++;

err:
    /* Release the local serialization variable on every path. */
    WT_PUBLISH(lock_id, 0);
    return (ret);
}
/*
 * __wt_schema_get_table_uri --
 *     Get the table handle for the named table.
 *
 * On success *tablep references the table; on any failure it is NULL. The
 * session's current data handle is restored before returning in both cases.
 * Unless ok_incomplete is set, a table whose column groups are not all created
 * is released again and EINVAL is reported.
 */
int
__wt_schema_get_table_uri(WT_SESSION_IMPL *session, const char *uri, bool ok_incomplete,
  uint32_t flags, WT_TABLE **tablep)
{
    WT_DATA_HANDLE *caller_dhandle;
    WT_DECL_RET;
    WT_TABLE *found;

    *tablep = NULL;

    /* Remember the caller's handle: getting the table replaces it. */
    caller_dhandle = session->dhandle;

    WT_ERR(__wt_session_get_dhandle(session, uri, NULL, NULL, flags));
    found = (WT_TABLE *)session->dhandle;

    if (!(ok_incomplete || found->cg_complete)) {
        /* Give the handle back before reporting the incomplete table. */
        WT_ERR(__wt_session_release_dhandle(session));
        ret = __wt_set_return(session, EINVAL);
        WT_ERR_MSG(session, ret,
          "'%s' cannot be used "
          "until all column groups are created",
          found->iface.name);
    }
    *tablep = found;

err:
    session->dhandle = caller_dhandle;
    return (ret);
}
/*
 * __curfile_equals --
 *     WT_CURSOR->equals method for the btree cursor type.
 *
 * Both cursors must have an internal URI starting with "file:" and must pass
 * the key check; the comparison itself is delegated to __wt_btcur_equals,
 * which stores its answer through equalp.
 */
static int
__curfile_equals(WT_CURSOR *a, WT_CURSOR *b, int *equalp)
{
    WT_CURSOR_BTREE *cbt;
    WT_DECL_RET;
    WT_SESSION_IMPL *session;

    cbt = (WT_CURSOR_BTREE *)a;
    CURSOR_API_CALL(a, session, equals, cbt->btree);

    /*
     * Only verify that both sides are "file:" cursors here: the underlying
     * function can handle cursors pointing to different objects.
     */
    if (!(WT_PREFIX_MATCH(a->internal_uri, "file:") &&
          WT_PREFIX_MATCH(b->internal_uri, "file:")))
        WT_ERR_MSG(session, EINVAL, "Cursors must reference the same object");

    WT_CURSOR_CHECKKEY(a);
    WT_CURSOR_CHECKKEY(b);

    ret = __wt_btcur_equals(cbt, (WT_CURSOR_BTREE *)b, equalp);

err:
    API_END_RET(session, ret);
}
/*
 * __wt_dlopen --
 *     Open a dynamic library.
 *
 * A NULL path asks for a handle to the current binary. On success *dlhp is set
 * to a newly allocated handle and 0 is returned. On failure the partially
 * built handle is freed, *dlhp is not written and a non-zero error is
 * returned.
 *
 * Loading a library by name is not implemented: that branch breaks into the
 * debugger and then fails with ENOTSUP. Previously it returned 1 while still
 * handing the half-initialized handle to the caller.
 */
int
__wt_dlopen(WT_SESSION_IMPL *session, const char *path, WT_DLH **dlhp)
{
    WT_DECL_RET;
    WT_DLH *dlh;

    WT_RET(__wt_calloc_one(session, &dlh));
    WT_ERR(__wt_strdup(session, path, &dlh->name));

    if (path == NULL) {
        /*
         * NULL means load from the current binary. The call reports failure
         * by returning FALSE, so test the result directly rather than storing
         * it in the WiredTiger return value. The message takes no arguments:
         * path is NULL here and must not be passed to a "%s" conversion.
         */
        if (GetModuleHandleExA(0, NULL, (HMODULE *)&dlh->handle) == FALSE)
            WT_ERR_MSG(session, __wt_errno(),
              "GetModuleHandleEx: unable to get a handle for the current binary");
    } else {
        /* TODO: load dll here. */
        DebugBreak();
        WT_ERR_MSG(session, ENOTSUP,
          "dlopen(%s): loading a named library is not implemented", path);
    }

    *dlhp = dlh;

    if (0) {
err:
        __wt_free(session, dlh->name);
        __wt_free(session, dlh);
    }
    return (ret);
}
/*
 * __wt_cache_create --
 *     Create the connection's eviction cache.
 *
 * Allocates the cache structure, applies the configuration, then sets up the
 * eviction condition variables, spinlocks and LRU queue. Failures after the
 * configuration step tear the cache down again with __wt_cache_destroy.
 */
int
__wt_cache_create(WT_SESSION_IMPL *session, const char *cfg[])
{
    WT_CACHE *cache;
    WT_CONNECTION_IMPL *conn;
    WT_DECL_RET;

    conn = S2C(session);

    WT_RET(__wt_calloc_one(session, &conn->cache));
    cache = conn->cache;

    /* Configure the cache. */
    WT_RET(__wt_cache_config(session, 0, cfg));

    if (cache->eviction_trigger <= cache->eviction_target)
        WT_ERR_MSG(session, EINVAL,
          "eviction target must be lower than the eviction trigger");

    /* Create the eviction condition variables. */
    WT_ERR(__wt_cond_alloc(session, "cache eviction server", 0, &cache->evict_cond));
    WT_ERR(__wt_cond_alloc(session, "eviction waiters", 0, &cache->evict_waiter_cond));

    /* Create the eviction locks. */
    WT_ERR(__wt_spin_init(session, &cache->evict_lock, "cache eviction"));
    WT_ERR(__wt_spin_init(session, &cache->evict_walk_lock, "cache walk"));

    /* Allocate the LRU eviction queue. */
    cache->evict_slots = WT_EVICT_WALK_BASE + WT_EVICT_WALK_INCR;
    WT_ERR(__wt_calloc_def(session, cache->evict_slots, &cache->evict));

    /* Initialize the cache statistics. */
    __wt_cache_stats_update(session);
    return 0;

err:
    /* A failure in the teardown itself is what gets returned. */
    WT_RET(__wt_cache_destroy(session));
    return ret;
}
/*
 * __bulk_row_keycmp_err --
 *     Error routine when row-store keys inserted out-of-order.
 *
 * Formats the cursor's current key and the previously inserted key into
 * scratch buffers and reports EINVAL. This function never returns 0 unless
 * the error macros themselves yield 0.
 */
static int
__bulk_row_keycmp_err(WT_CURSOR_BULK *cbulk)
{
    WT_CURSOR *cursor;
    WT_DECL_ITEM(cur_buf);
    WT_DECL_ITEM(last_buf);
    WT_DECL_RET;
    WT_SESSION_IMPL *session;

    cursor = &cbulk->cbt.iface;
    session = (WT_SESSION_IMPL *)cursor->session;

    /* One scratch buffer per key being printed. */
    WT_ERR(__wt_scr_alloc(session, 512, &cur_buf));
    WT_ERR(__wt_scr_alloc(session, 512, &last_buf));

    WT_ERR_MSG(session, EINVAL,
      "bulk-load presented with out-of-order keys: %s compares smaller "
      "than previously inserted key %s",
      __wt_buf_set_printable(session, cursor->key.data, cursor->key.size, cur_buf),
      __wt_buf_set_printable(session, cbulk->last.data, cbulk->last.size, last_buf));

err:
    __wt_scr_free(session, &cur_buf);
    __wt_scr_free(session, &last_buf);
    return (ret);
}
/*
 * __session_open_cursor --
 *     WT_SESSION->open_cursor method.
 *
 * Exactly one of uri and to_dup must be supplied. With a URI the cursor is
 * opened directly; with a cursor to duplicate, the duplicate is only created
 * for the colgroup, index, file, lsm and table URI prefixes and any other
 * type is rejected.
 */
static int
__session_open_cursor(WT_SESSION *wt_session, const char *uri, WT_CURSOR *to_dup,
  const char *config, WT_CURSOR **cursorp)
{
    WT_DECL_RET;
    WT_SESSION_IMPL *session;

    session = (WT_SESSION_IMPL *)wt_session;
    SESSION_API_CALL(session, open_cursor, config, cfg);

    /* Both supplied, or both missing, is a caller error. */
    if ((to_dup == NULL) == (uri == NULL))
        WT_ERR_MSG(session, EINVAL,
          "should be passed either a URI or a cursor to duplicate, "
          "but not both");

    if (to_dup == NULL)
        ret = __wt_open_cursor(session, uri, NULL, cfg, cursorp);
    else {
        /* Duplicating: the type is decided by the source cursor's URI. */
        uri = to_dup->uri;
        if (!WT_PREFIX_MATCH(uri, "colgroup:") && !WT_PREFIX_MATCH(uri, "index:") &&
          !WT_PREFIX_MATCH(uri, "file:") && !WT_PREFIX_MATCH(uri, "lsm:") &&
          !WT_PREFIX_MATCH(uri, "table:"))
            ret = __wt_bad_object_type(session, uri);
        else
            ret = __wt_cursor_dup(session, to_dup, cfg, cursorp);
    }

err:
    API_END_NOTFOUND_MAP(session, ret);
}
/*
 * __curfile_compare --
 *     WT_CURSOR->compare method for the btree cursor type.
 *
 * The two cursors must have identical URIs and both need a key; the ordering
 * result is produced by __wt_btcur_compare and stored through cmpp.
 */
static int
__curfile_compare(WT_CURSOR *a, WT_CURSOR *b, int *cmpp)
{
    WT_CURSOR_BTREE *cbt;
    WT_DECL_RET;
    WT_SESSION_IMPL *session;

    cbt = (WT_CURSOR_BTREE *)a;
    CURSOR_API_CALL(a, session, compare, cbt->btree);

    /* Refuse to compare cursors opened on different sources. */
    if (strcmp(a->uri, b->uri) != 0)
        WT_ERR_MSG(session, EINVAL,
          "comparison method cursors must reference the same object");

    /* Both sides need a key before the underlying object is called. */
    WT_CURSOR_NEEDKEY(a);
    WT_CURSOR_NEEDKEY(b);

    ret = __wt_btcur_compare(cbt, (WT_CURSOR_BTREE *)b, cmpp);

err:
    API_END(session);
    return (ret);
}
/*
 * __curfile_modify --
 *     WT_CURSOR->modify method for the btree cursor type.
 *
 * Requires a key on the cursor and a modify vector with at least one entry,
 * then hands the vector to __wt_btcur_modify.
 */
static int
__curfile_modify(WT_CURSOR *cursor, WT_MODIFY *entries, int nentries)
{
    WT_CURSOR_BTREE *cbt;
    WT_DECL_RET;
    WT_SESSION_IMPL *session;

    cbt = (WT_CURSOR_BTREE *)cursor;
    CURSOR_UPDATE_API_CALL_BTREE(cursor, session, modify, cbt->btree);
    WT_ERR(__cursor_checkkey(cursor));

    /* Reject an empty or negative modify vector count. */
    if (nentries < 1)
        WT_ERR_MSG(session, EINVAL, "Illegal modify vector with %d entries", nentries);

    WT_ERR(__wt_btcur_modify(cbt, entries, nentries));

    /*
     * After a modify the cursor keeps a position, key and value. The key is
     * expected to be internal; the value, unlike update, is not always an
     * internal value, so only check that some value flag is set.
     */
    WT_ASSERT(session, F_MASK(cursor, WT_CURSTD_KEY_SET) == WT_CURSTD_KEY_INT);
    WT_ASSERT(session, F_MASK(cursor, WT_CURSTD_VALUE_SET) != 0);

err:
    CURSOR_UPDATE_API_END(session, ret);
    return (ret);
}
/*
 * __curjoin_get_value --
 *     WT_CURSOR->get_value for join cursors.
 *
 * The join cursor must be initialized and its iterator ready, otherwise
 * EINVAL is returned. The value is fetched from the iterator's cursor using
 * the index or table variant depending on whether the current entry has an
 * index.
 */
static int
__curjoin_get_value(WT_CURSOR *cursor, ...)
{
    WT_CURSOR_JOIN *cjoin;
    WT_CURSOR_JOIN_ITER *iter;
    WT_DECL_RET;
    WT_SESSION_IMPL *session;
    va_list ap;

    cjoin = (WT_CURSOR_JOIN *)cursor;
    iter = cjoin->iter;

    /* Started before the API call so the error path can always va_end. */
    va_start(ap, cursor);
    CURSOR_API_CALL(cursor, session, get_value, NULL);

    if (!(F_ISSET(cjoin, WT_CURJOIN_INITIALIZED) && __curjoin_entry_iter_ready(iter)))
        WT_ERR_MSG(session, EINVAL, "join cursor must be advanced with next()");

    ret = (iter->entry->index == NULL) ? __wt_curtable_get_valuev(iter->cursor, ap) :
                                         __wt_curindex_get_valuev(iter->cursor, ap);

err:
    va_end(ap);
    API_END_RET(session, ret);
}
/*
 * __clsm_compare --
 *     WT_CURSOR->compare implementation for the LSM cursor type.
 *
 * Both cursors must have identical URIs and a key. The keys are compared with
 * the LSM tree's collator and the result is stored through cmpp only when the
 * comparison succeeds.
 */
static int
__clsm_compare(WT_CURSOR *a, WT_CURSOR *b, int *cmpp)
{
    WT_CURSOR_LSM *alsm;
    WT_DECL_RET;
    WT_SESSION_IMPL *session;
    int result;

    alsm = (WT_CURSOR_LSM *)a;

    /* There's no need to sync with the LSM tree, avoid WT_LSM_ENTER. */
    CURSOR_API_CALL(a, session, compare, NULL);

    /* The cursors have to be open on the same source. */
    if (strcmp(a->uri, b->uri) != 0)
        WT_ERR_MSG(session, EINVAL,
          "comparison method cursors must reference the same object");

    /* Each cursor needs a key to compare. */
    WT_CURSOR_NEEDKEY(a);
    WT_CURSOR_NEEDKEY(b);

    WT_ERR(WT_LEX_CMP(session, alsm->lsm_tree->collator, &a->key, &b->key, result));
    *cmpp = result;

err:
    API_END(session);
    return (ret);
}
/*
 * __backup_uri --
 *     Backup a list of objects.
 *
 * Walks the "target" configuration list. *foundp is set to 1 if the list has
 * at least one entry and is 0 otherwise. *log_only is set to 1 only when the
 * single target in the list is a "log:" URI; any non-log target, or a second
 * target of any kind, leaves it 0.
 *
 * Log targets are appended directly, every other target goes through the
 * schema worker. An entry with a value ("key=value") is rejected with EINVAL.
 */
static int
__backup_uri(WT_SESSION_IMPL *session, WT_CURSOR_BACKUP *cb, const char *cfg[], int *foundp,
  int *log_only)
{
    WT_CONFIG targetconf;
    WT_CONFIG_ITEM cval, k, v;
    WT_DECL_ITEM(tmp);
    WT_DECL_RET;
    int target_list;
    const char *uri;

    *foundp = 0;
    *log_only = 0;

    /*
     * If we find a non-empty target configuration string, we have a job,
     * otherwise it's not our problem.
     */
    WT_RET(__wt_config_gets(session, cfg, "target", &cval));
    WT_RET(__wt_config_subinit(session, &targetconf, &cval));
    for (cb->list_next = 0, target_list = 0;
         (ret = __wt_config_next(&targetconf, &k, &v)) == 0; ++target_list) {
        /* If it is our first time through, allocate. */
        if (target_list == 0) {
            *foundp = 1;
            WT_ERR(__wt_scr_alloc(session, 512, &tmp));
        }

        /* Copy the key into the scratch buffer as a terminated string. */
        WT_ERR(__wt_buf_fmt(session, tmp, "%.*s", (int)k.len, k.str));
        uri = tmp->data;
        if (v.len != 0)
            WT_ERR_MSG(session, EINVAL,
              "%s: invalid backup target: URIs may need quoting", uri);

        /*
         * Handle log targets. We do not need to go through the schema
         * worker, just call the function to append them. Set log_only only
         * if it is our only URI target.
         */
        if (WT_PREFIX_MATCH(uri, "log:")) {
            *log_only = (target_list == 0) ? 1 : 0;
            WT_ERR(__wt_backup_list_uri_append(session, uri, NULL));
        } else {
            /*
             * A non-log target means this is not a log-only backup, even if
             * an earlier entry in the list was a log target.
             */
            *log_only = 0;
            WT_ERR(__wt_schema_worker(
              session, uri, NULL, __wt_backup_list_uri_append, cfg, 0));
        }
    }
    /* Running off the end of the list is the normal way out of the loop. */
    WT_ERR_NOTFOUND_OK(ret);

err:
    __wt_scr_free(session, &tmp);
    return (ret);
}
/*
 * __wt_config_concat --
 *     Given a NULL-terminated list of configuration strings, concatenate them
 *     into a newly allocated buffer. Nothing special is assumed about any of
 *     the config strings, they are simply combined in order.
 *
 * This code deals with the case where some of the config strings are wrapped
 * in brackets but others aren't: the resulting string does not have brackets.
 *
 * On success *config_ret points at the buffer's memory and 0 is returned. On
 * failure the buffer is freed and *config_ret is not written.
 */
int
__wt_config_concat(WT_SESSION_IMPL *session, const char **cfg, const char **config_ret)
{
    WT_CONFIG cparser;
    WT_CONFIG_ITEM k, v;
    WT_ITEM buf;
    int ret;
    const char **cp;

    WT_CLEAR(buf);
    ret = 0;

    for (cp = cfg; *cp != NULL; ++cp) {
        WT_ERR(__wt_config_init(session, &cparser, *cp));
        while ((ret = __wt_config_next(&cparser, &k, &v)) == 0) {
            /*
             * Keys are slices of the configuration string with an explicit
             * length, so print them with "%.*s": a plain "%s" would run on
             * to the end of the whole configuration string.
             */
            if (k.type != ITEM_STRING && k.type != ITEM_ID)
                WT_ERR_MSG(session, EINVAL,
                  "Invalid configuration key found: '%.*s'", (int)k.len, k.str);

            /* Include the quotes around string keys/values. */
            if (k.type == ITEM_STRING) {
                --k.str;
                k.len += 2;
            }
            if (v.type == ITEM_STRING) {
                --v.str;
                v.len += 2;
            }

            /* Emit "key," or "key=value,"; the last comma is removed below. */
            WT_ERR(__wt_buf_catfmt(session, &buf, "%.*s%s%.*s,", (int)k.len, k.str,
              (v.len > 0) ? "=" : "", (int)v.len, v.str));
        }
        /* The parser is expected to stop with WT_NOTFOUND at end of string. */
        if (ret != WT_NOTFOUND)
            goto err;
    }

    /*
     * If the caller passes us no valid configuration strings, we end up here
     * with no allocated memory to return. Check the final buffer size: empty
     * configuration strings are possible, and paranoia is good.
     */
    if (buf.size == 0)
        WT_RET(__wt_buf_initsize(session, &buf, 1));

    /* Strip the trailing comma and NUL-terminate */
    ((char *)buf.data)[buf.size - 1] = '\0';

    *config_ret = buf.data;
    return (0);

err:
    __wt_buf_free(session, &buf);
    return (ret);
}
/*
 * __wt_cache_create --
 *     Create the underlying cache.
 *
 * The connection must not already have a cache. The structure is allocated
 * and configured first; failures after that point destroy the cache again.
 */
int
__wt_cache_create(WT_SESSION_IMPL *session, const char *cfg[])
{
    WT_CACHE *cache;
    WT_CONNECTION_IMPL *conn;
    WT_DECL_RET;

    conn = S2C(session);
    WT_ASSERT(session, conn->cache == NULL);

    WT_RET(__wt_calloc_one(session, &conn->cache));
    cache = conn->cache;

    /* Run-time configuration options share a common routine. */
    WT_RET(__wt_cache_config(session, false, cfg));

    /*
     * Start the read generation above the lowest possible value: the lowest
     * page read-generation has a special meaning, it marks a page for
     * forcible eviction, and that must not happen by accident.
     */
    cache->read_gen = WT_READGEN_START_VALUE;

    /*
     * No work would ever get done unless the target size is below the
     * trigger size.
     */
    if (cache->eviction_trigger <= cache->eviction_target)
        WT_ERR_MSG(session, EINVAL,
          "eviction target must be lower than the eviction trigger");

    /* Condition variables for the eviction server and for waiters. */
    WT_ERR(__wt_cond_auto_alloc(
      session, "cache eviction server", false, 10000, WT_MILLION, &cache->evict_cond));
    WT_ERR(__wt_cond_alloc(session, "eviction waiters", false, &cache->evict_waiter_cond));

    /* Locks for eviction and for the eviction walk. */
    WT_ERR(__wt_spin_init(session, &cache->evict_lock, "cache eviction"));
    WT_ERR(__wt_spin_init(session, &cache->evict_walk_lock, "cache walk"));

    /* Allocate the LRU eviction queue. */
    cache->evict_slots = WT_EVICT_WALK_BASE + WT_EVICT_WALK_INCR;
    WT_ERR(__wt_calloc_def(session, cache->evict_slots, &cache->evict_queue));

    /*
     * Some values live in the cache statistics (rather than having two
     * copies); configure them.
     */
    __wt_cache_stats_update(session);
    return (0);

err:
    /* A failure in the teardown itself is what gets returned. */
    WT_RET(__wt_cache_destroy(session));
    return (ret);
}
/*
 * __compact_file --
 *     Compact a single btree file.
 *
 * Repeatedly checkpoints the file and runs the compaction worker over it,
 * stopping when a pass makes no progress, the pass limit is reached or the
 * timeout check fails. Returns 0 on success or the first error hit.
 */
static int
__compact_file(WT_SESSION_IMPL *session, const char *uri, const char *cfg[])
{
    WT_DECL_RET;
    WT_DECL_ITEM(t);
    WT_SESSION *wt_session;
    WT_TXN *txn;
    int i;
    struct timespec start_time;

    txn = &session->txn;
    wt_session = &session->iface;

    /*
     * File compaction requires checkpoints, which will fail in a
     * transactional context. Check now so the error message isn't confusing.
     */
    if (session->compact->file_count != 0 && F_ISSET(txn, TXN_RUNNING))
        WT_ERR_MSG(session, EINVAL, " File compaction not permitted in a transaction");

    /*
     * Force the checkpoint: we don't want to skip it because the work we need
     * to have done is done in the underlying block manager.
     */
    WT_ERR(__wt_scr_alloc(session, 128, &t));
    WT_ERR(__wt_buf_fmt(session, t, "target=(\"%s\"),force=1", uri));

    /* Record the start time for the per-pass timeout check. */
    WT_ERR(__wt_epoch(session, &start_time));

    /*
     * We compact 10% of the file on each pass (but the overall size of the
     * file is decreasing each time, so we're not compacting 10% of the
     * original file each time). Try 100 times (which is clearly more than we
     * need); quit if we make no progress and check for a timeout each time
     * through the loop.
     */
    for (i = 0; i < 100; ++i) {
        WT_ERR(wt_session->checkpoint(wt_session, t->data));

        /* Cleared here; a non-zero value after the worker means progress. */
        session->compaction = 0;
        WT_WITH_SCHEMA_LOCK(
          session, ret = __wt_schema_worker(session, uri, __wt_compact, NULL, cfg, 0));
        WT_ERR(ret);
        if (!session->compaction)
            break;

        WT_ERR(wt_session->checkpoint(wt_session, t->data));
        WT_ERR(wt_session->checkpoint(wt_session, t->data));
        WT_ERR(__session_compact_check_timeout(session, start_time));
    }

err:
    __wt_scr_free(session, &t);

    /* The function is declared int: hand the result back to the caller. */
    return (ret);
}
/*
 * __wt_config_collapse --
 *     Collapse a set of configuration strings into newly allocated memory.
 *
 * This function takes a NULL-terminated list of configuration strings (where
 * the first one contains all the defaults and the values are in order from
 * least to most preferred, that is, the default values are least preferred),
 * and collapses them into newly allocated memory. The algorithm is to walk
 * the first of the configuration strings, and for each entry, search all of
 * the configuration strings for a final value, keeping the last value found.
 *
 * Notes:
 *     Any key not appearing in the first configuration string is discarded
 *     from the final result, because we'll never search for it.
 *
 *     Nested structures aren't parsed. For example, imagine a configuration
 *     string contains "key=(k2=v2,k3=v3)", and a subsequent string has
 *     "key=(k4=v4)", the result will be "key=(k4=v4)", as we search for and
 *     use the final value of "key", regardless of field overlap or missing
 *     fields in the nested value.
 *
 * On success *config_ret holds the collapsed string; on failure it is NULL.
 */
int
__wt_config_collapse(WT_SESSION_IMPL *session, const char **cfg, char **config_ret)
{
    WT_CONFIG cparser;
    WT_CONFIG_ITEM k, v;
    WT_DECL_ITEM(tmp);
    WT_DECL_RET;

    *config_ret = NULL;

    WT_RET(__wt_scr_alloc(session, 0, &tmp));

    __wt_config_init(session, &cparser, cfg[0]);
    while ((ret = __wt_config_next(&cparser, &k, &v)) == 0) {
        /*
         * Keys are slices of the configuration string with an explicit
         * length, so print them with "%.*s": a plain "%s" would run on to the
         * end of the whole configuration string.
         */
        if (k.type != WT_CONFIG_ITEM_STRING && k.type != WT_CONFIG_ITEM_ID)
            WT_ERR_MSG(session, EINVAL, "Invalid configuration key found: '%.*s'",
              (int)k.len, k.str);

        /* Replace the value with the final one found across all strings. */
        WT_ERR(__wt_config_get(session, cfg, &k, &v));

        /* Include the quotes around string keys/values. */
        if (k.type == WT_CONFIG_ITEM_STRING) {
            --k.str;
            k.len += 2;
        }
        if (v.type == WT_CONFIG_ITEM_STRING) {
            --v.str;
            v.len += 2;
        }
        WT_ERR(__wt_buf_catfmt(
          session, tmp, "%.*s=%.*s,", (int)k.len, k.str, (int)v.len, v.str));
    }

    /* We loop until error, and the expected error is WT_NOTFOUND. */
    if (ret != WT_NOTFOUND)
        goto err;

    /*
     * If the caller passes us no valid configuration strings, we get here
     * with no bytes to copy -- that's OK, the underlying string copy can
     * handle empty strings.
     *
     * Strip any trailing comma.
     */
    if (tmp->size != 0)
        --tmp->size;

    ret = __wt_strndup(session, tmp->data, tmp->size, config_ret);

err:
    __wt_scr_free(session, &tmp);
    return (ret);
}
/*
 * __session_begin_transaction --
 *     WT_SESSION->begin_transaction method.
 *
 * Fails with EINVAL if the connection is not flagged as transactional or the
 * session already has a running transaction. Otherwise the session's cursors
 * are reset and the transaction is started. The begin statistic is bumped on
 * every call that gets past the API entry, including rejected ones.
 */
static int
__session_begin_transaction(WT_SESSION *wt_session, const char *config)
{
    WT_DECL_RET;
    WT_SESSION_IMPL *session;

    session = (WT_SESSION_IMPL *)wt_session;

    SESSION_API_CALL(session, begin_transaction, config, cfg);
    WT_CSTAT_INCR(session, txn_begin);

    /* The connection has to be configured for transactions at all. */
    if (!F_ISSET(S2C(session), WT_CONN_TRANSACTIONAL))
        WT_ERR_MSG(session, EINVAL, "Database not configured for transactions");

    /* Only one transaction at a time per session. */
    if (F_ISSET(&session->txn, TXN_RUNNING))
        WT_ERR_MSG(session, EINVAL, "Transaction already running");

    WT_ERR(__session_reset_cursors(session));

    ret = __wt_txn_begin(session, cfg);

err:
    API_END(session);
    return (ret);
}
/*
 * __curindex_set_value --
 *     WT_CURSOR->set_value implementation for index cursors.
 *
 * Index cursors do not support set_value: every call reports ENOTSUP. Because
 * the method returns void, the error is parked in the cursor's saved_err
 * field and the value-set flags are cleared.
 */
static void
__curindex_set_value(WT_CURSOR *cursor, ...)
{
    WT_DECL_RET;
    WT_SESSION_IMPL *session;

    JOINABLE_CURSOR_API_CALL(cursor, session, set_value, NULL);

    /* Unconditional: there is no supported path through this method. */
    WT_ERR_MSG(session, ENOTSUP, "WT_CURSOR.set_value not supported for index cursors");

err:
    F_CLR(cursor, WT_CURSTD_VALUE_SET);
    cursor->saved_err = ret;
    API_END(session, ret);
}
/*
 * __wt_cache_create --
 *     Create the underlying cache.
 *
 * The connection must not already have a cache. After allocation and
 * configuration, any failure destroys the cache again before returning.
 */
int
__wt_cache_create(WT_SESSION_IMPL *session, const char *cfg[])
{
    WT_CACHE *cache;
    WT_CONNECTION_IMPL *conn;
    WT_DECL_RET;

    conn = S2C(session);
    WT_ASSERT(session, conn->cache == NULL);

    WT_RET(__wt_calloc_one(session, &conn->cache));
    cache = conn->cache;

    /* Run-time configuration options share a common routine. */
    WT_RET(__wt_cache_config(session, 0, cfg));

    /*
     * No work would ever get done unless the target size is below the
     * trigger size.
     */
    if (cache->eviction_trigger <= cache->eviction_target)
        WT_ERR_MSG(session, EINVAL,
          "eviction target must be lower than the eviction trigger");

    /* Condition variables for the eviction server and for waiters. */
    WT_ERR(__wt_cond_alloc(session, "cache eviction server", 0, &cache->evict_cond));
    WT_ERR(__wt_cond_alloc(session, "eviction waiters", 0, &cache->evict_waiter_cond));

    /* Locks for eviction and for the eviction walk. */
    WT_ERR(__wt_spin_init(session, &cache->evict_lock, "cache eviction"));
    WT_ERR(__wt_spin_init(session, &cache->evict_walk_lock, "cache walk"));

    /* Allocate the LRU eviction queue. */
    cache->evict_slots = WT_EVICT_WALK_BASE + WT_EVICT_WALK_INCR;
    WT_ERR(__wt_calloc_def(session, cache->evict_slots, &cache->evict));

    /*
     * Some values live in the cache statistics (rather than having two
     * copies); configure them.
     */
    __wt_cache_stats_update(session);

    if (0) {
err:
        /* A failure in the teardown itself is what gets returned. */
        WT_RET(__wt_cache_destroy(session));
    }
    return (ret);
}
/*
 * __curjoin_next --
 *     WT_CURSOR::next for join cursors.
 *
 * Advances the join iterator until it yields a key that every join entry
 * accepts, or until the iterator itself stops. A key rejected by an entry
 * with WT_NOTFOUND is skipped. Any error raised through the error macros
 * marks the join cursor as failed, and later calls are refused.
 */
static int
__curjoin_next(WT_CURSOR *cursor)
{
    WT_CURSOR_JOIN *cjoin;
    WT_DECL_RET;
    WT_SESSION_IMPL *session;
    bool skip_left;
    u_int slot;

    cjoin = (WT_CURSOR_JOIN *)cursor;

    CURSOR_API_CALL(cursor, session, next, NULL);

    if (F_ISSET(cjoin, WT_CURJOIN_ERROR))
        WT_ERR_MSG(session, WT_ERROR, "join cursor encountered previous error");

    /* The iterator is set up lazily on the first call. */
    if (!F_ISSET(cjoin, WT_CURJOIN_INITIALIZED))
        WT_ERR(__curjoin_init_iter(session, cjoin));

    for (;;) {
        /* A non-zero result from the iterator ends the search as-is. */
        ret = __curjoin_entry_iter_next(cjoin->iter, &cursor->key, &cursor->recno);
        if (ret != 0)
            break;
        F_SET(cursor, WT_CURSTD_KEY_EXT);

        /*
         * We may have already established membership for the 'left' case for
         * the first entry, since we're using that in our iteration.
         */
        skip_left = F_ISSET(cjoin, WT_CURJOIN_SKIP_FIRST_LEFT);
        for (slot = 0; slot < cjoin->entries_next; slot++) {
            ret = __curjoin_entry_member(session, cjoin, &cjoin->entries[slot], skip_left);
            if (ret == WT_NOTFOUND)
                break;
            skip_left = false;
            WT_ERR(ret);
        }

        /* Every entry accepted the key: done. Otherwise try the next key. */
        if (ret != WT_NOTFOUND)
            break;
    }

    if (0) {
err:
        F_SET(cjoin, WT_CURJOIN_ERROR);
    }
    API_END_RET(session, ret);
}
/*
 * __session_checkpoint --
 *     WT_SESSION->checkpoint method.
 *
 * Rejects the call with EINVAL when the session has a running transaction,
 * resets the session's cursors and then runs the checkpoint under the schema
 * lock. The checkpoint statistic is incremented before the API entry.
 */
static int
__session_checkpoint(WT_SESSION *wt_session, const char *config)
{
    WT_DECL_RET;
    WT_SESSION_IMPL *session;
    WT_TXN *txn;

    session = (WT_SESSION_IMPL *)wt_session;
    txn = &session->txn;

    WT_CSTAT_INCR(session, checkpoint);
    SESSION_API_CALL(session, checkpoint, config, cfg);

    /*
     * A checkpoint needs a snapshot so that what it writes is transactionally
     * consistent.
     *
     * The application's own transaction can't be used for that: uncommitted
     * changes would be written in the checkpoint and may appear after a
     * crash.
     *
     * A real snapshot transaction is used so there is no chance of the
     * snapshot being updated during the checkpoint. Eviction is prevented
     * from evicting anything newer than this because we track the oldest
     * transaction ID in the system that is not visible to all readers.
     */
    if (F_ISSET(txn, TXN_RUNNING))
        WT_ERR_MSG(session, EINVAL, "Checkpoint not permitted in a transaction");

    /*
     * Reset open cursors explicitly. It would happen implicitly in the call
     * to begin_transaction for the checkpoint, but doing it here covers the
     * case where some implementation of WT_CURSOR::reset needs the schema
     * lock.
     */
    WT_ERR(__session_reset_cursors(session));

    WT_WITH_SCHEMA_LOCK(session, ret = __wt_txn_checkpoint(session, cfg));

err:
    API_END_NOTFOUND_MAP(session, ret);
}
/*
 * __backup_uri --
 *     Backup a list of objects.
 *
 * Walks the "target" configuration list and passes each entry to the schema
 * worker. *foundp is set to 1 when the list has at least one entry and is 0
 * otherwise. An entry with a value ("key=value") is rejected with EINVAL.
 */
static int
__backup_uri(WT_SESSION_IMPL *session, WT_CURSOR_BACKUP *cb, const char *cfg[], int *foundp)
{
    WT_CONFIG targetconf;
    WT_CONFIG_ITEM cval, k, v;
    WT_DECL_ITEM(tmp);
    WT_DECL_RET;
    int seen_target;
    const char *uri;

    seen_target = 0;
    *foundp = 0;

    /*
     * A non-empty target configuration string means there is work to do
     * here; otherwise it's not our problem.
     */
    WT_RET(__wt_config_gets(session, cfg, "target", &cval));
    WT_RET(__wt_config_subinit(session, &targetconf, &cval));

    cb->list_next = 0;
    while ((ret = __wt_config_next(&targetconf, &k, &v)) == 0) {
        /* First entry: note that we found a list and get a scratch buffer. */
        if (seen_target == 0) {
            seen_target = *foundp = 1;
            WT_ERR(__wt_scr_alloc(session, 512, &tmp));
        }

        /* Copy the key into the scratch buffer as a terminated string. */
        WT_ERR(__wt_buf_fmt(session, tmp, "%.*s", (int)k.len, k.str));
        uri = tmp->data;

        if (v.len != 0)
            WT_ERR_MSG(session, EINVAL,
              "%s: invalid backup target: URIs may need quoting", uri);

        WT_ERR(__wt_schema_worker(session, uri, NULL, __wt_backup_list_uri_append, cfg, 0));
    }
    /* Running off the end of the list is the normal way out of the loop. */
    WT_ERR_NOTFOUND_OK(ret);

err:
    __wt_scr_free(&tmp);
    return (ret);
}
/*
 * __wt_dlopen --
 *     Open a dynamic library.
 *
 * Allocates a handle structure, records a copy of the path in it and opens
 * the library with dlopen. On success *dlhp receives the handle. On failure
 * everything allocated here is freed and *dlhp is not written.
 */
int
__wt_dlopen(WT_SESSION_IMPL *session, const char *path, WT_DLH **dlhp)
{
    WT_DECL_RET;
    WT_DLH *dlh;

    WT_RET(__wt_calloc_one(session, &dlh));
    WT_ERR(__wt_strdup(session, path, &dlh->name));

    dlh->handle = dlopen(path, RTLD_LAZY);
    if (dlh->handle == NULL)
        WT_ERR_MSG(session, __wt_errno(), "dlopen(%s): %s", path, dlerror());

    *dlhp = dlh;
    return (0);

err:
    __wt_free(session, dlh->name);
    __wt_free(session, dlh);
    return (ret);
}
/*
 * __curds_compare --
 *     WT_CURSOR.compare method for the data-source cursor type.
 *
 * Both cursors must have the same internal URI and a key. Record-number
 * cursors are ordered by recno; otherwise the keys are compared with the
 * data-source cursor's collator. The result (-1, 0 or 1 for the recno case)
 * is stored through cmpp.
 */
static int
__curds_compare(WT_CURSOR *a, WT_CURSOR *b, int *cmpp)
{
    WT_COLLATOR *collator;
    WT_DECL_RET;
    WT_SESSION_IMPL *session;

    CURSOR_API_CALL(a, session, compare, NULL);

    /* The cursors have to be open on the same source. */
    if (strcmp(a->internal_uri, b->internal_uri) != 0)
        WT_ERR_MSG(session, EINVAL, "Cursors must reference the same object");

    /* Each cursor needs a key to compare. */
    WT_ERR(__cursor_needkey(a));
    WT_ERR(__cursor_needkey(b));

    if (!WT_CURSOR_RECNO(a)) {
        /*
         * The assumption is data-sources don't provide WiredTiger with
         * WT_CURSOR.compare methods, instead, we'll copy the key/value out of
         * the underlying data-source cursor and any comparison to be done can
         * be done at this level.
         */
        collator = ((WT_CURSOR_DATA_SOURCE *)a)->collator;
        WT_ERR(__wt_compare(session, collator, &a->key, &b->key, cmpp));
    } else
        *cmpp = (a->recno < b->recno) ? -1 : ((a->recno == b->recno) ? 0 : 1);

err:
    API_END_RET(session, ret);
}
/*
 * __curjoin_get_key --
 *     WT_CURSOR->get_key for join cursors.
 *
 * The join cursor must be initialized and its iterator positioned, otherwise
 * EINVAL is returned. The key itself is unpacked by the generic cursor key
 * routine using the cursor's current flags.
 */
static int
__curjoin_get_key(WT_CURSOR *cursor, ...)
{
    WT_CURSOR_JOIN *cjoin;
    WT_DECL_RET;
    WT_SESSION_IMPL *session;
    va_list ap;

    cjoin = (WT_CURSOR_JOIN *)cursor;

    /* Started before the API call so the error path can always va_end. */
    va_start(ap, cursor);
    JOINABLE_CURSOR_API_CALL(cursor, session, get_key, NULL);

    if (!(F_ISSET(cjoin, WT_CURJOIN_INITIALIZED) && cjoin->iter->positioned))
        WT_ERR_MSG(session, EINVAL, "join cursor must be advanced with next()");

    WT_ERR(__wt_cursor_get_keyv(cursor, cursor->flags, ap));

err:
    va_end(ap);
    API_END_RET(session, ret);
}
/*
 * __curmetadata_compare --
 *     WT_CURSOR->compare method for the metadata cursor type.
 *
 * Only two metadata cursors can be compared, and both need a key. A cursor
 * flagged WT_MDC_ONMETADATA compares greater than one that is not; two
 * flagged cursors compare equal; two unflagged cursors are compared through
 * their underlying file cursors.
 */
static int
__curmetadata_compare(WT_CURSOR *a, WT_CURSOR *b, int *cmpp)
{
    WT_CURSOR *a_file_cursor, *b_file_cursor;
    WT_CURSOR_METADATA *a_mdc, *b_mdc;
    WT_DECL_RET;
    WT_SESSION_IMPL *session;
    int a_on_md, b_on_md;

    a_mdc = ((WT_CURSOR_METADATA *)a);
    b_mdc = ((WT_CURSOR_METADATA *)b);
    a_file_cursor = a_mdc->file_cursor;
    b_file_cursor = b_mdc->file_cursor;

    CURSOR_API_CALL(a, session, compare, ((WT_CURSOR_BTREE *)a_file_cursor)->btree);

    /* The other cursor is a metadata cursor only if it shares this method. */
    if (b->compare != __curmetadata_compare)
        WT_ERR_MSG(session, EINVAL, "Can only compare cursors of the same type");

    WT_MD_CURSOR_NEEDKEY(a);
    WT_MD_CURSOR_NEEDKEY(b);

    a_on_md = F_ISSET(a_mdc, WT_MDC_ONMETADATA) ? 1 : 0;
    b_on_md = F_ISSET(b_mdc, WT_MDC_ONMETADATA) ? 1 : 0;

    if (!a_on_md && !b_on_md)
        ret = a_file_cursor->compare(a_file_cursor, b_file_cursor, cmpp);
    else if (a_on_md && b_on_md)
        *cmpp = 0;
    else
        *cmpp = a_on_md ? 1 : -1;

err:
    API_END(session, ret);
    return (ret);
}
/*
 * __session_open_cursor --
 *     WT_SESSION->open_cursor method.
 *
 * Exactly one of uri and to_dup must be supplied; passing both or neither
 * fails with EINVAL. With to_dup the cursor is duplicated; with a URI the
 * open routine is chosen from the URI prefix (colgroup, config, file, index,
 * statistics or table) and an unrecognized prefix fails with EINVAL.
 */
static int
__session_open_cursor(WT_SESSION *wt_session, const char *uri, WT_CURSOR *to_dup,
  const char *config, WT_CURSOR **cursorp)
{
    WT_SESSION_IMPL *session;
    int ret;

    session = (WT_SESSION_IMPL *)wt_session;
    SESSION_API_CALL(session, open_cursor, config, cfg);

    /*
     * Reject both arguments being set and both being NULL. The second case
     * matters for safety as well as for the API contract: without it a NULL
     * URI would be handed to the prefix matching below.
     */
    if ((uri == NULL) == (to_dup == NULL))
        WT_ERR_MSG(session, EINVAL,
          "should be passed either a URI or a cursor, but not both");

    if (to_dup != NULL)
        ret = __wt_cursor_dup(session, to_dup, config, cursorp);
    else if (WT_PREFIX_MATCH(uri, "colgroup:"))
        ret = __wt_curfile_open(session, uri, cfg, cursorp);
    else if (WT_PREFIX_MATCH(uri, "config:"))
        ret = __wt_curconfig_open(session, uri, cfg, cursorp);
    else if (WT_PREFIX_MATCH(uri, "file:"))
        ret = __wt_curfile_open(session, uri, cfg, cursorp);
    else if (WT_PREFIX_MATCH(uri, "index:"))
        ret = __wt_curindex_open(session, uri, cfg, cursorp);
    else if (WT_PREFIX_MATCH(uri, "statistics:"))
        ret = __wt_curstat_open(session, uri, cfg, cursorp);
    else if (WT_PREFIX_MATCH(uri, "table:"))
        ret = __wt_curtable_open(session, uri, cfg, cursorp);
    else {
        __wt_err(session, EINVAL, "Unknown cursor type '%s'", uri);
        ret = EINVAL;
    }

err:
    API_END_NOTFOUND_MAP(session, ret);
}
/*
 * __posix_directory_sync --
 *     Flush a directory to ensure file creation, remove or rename is durable.
 *
 * Copies the path, cuts it after its last '/', opens the resulting directory
 * read-only, syncs it and closes it. The first error encountered is returned;
 * a close failure is logged and only returned if the sync itself succeeded.
 */
static int
__posix_directory_sync(WT_SESSION_IMPL *session, const char *path)
{
    WT_DECL_ITEM(tmp);
    WT_DECL_RET;
    int close_ret, fd;
    char *dir, *last_sep;

    WT_RET(__wt_scr_alloc(session, 0, &tmp));
    WT_ERR(__wt_buf_setstr(session, tmp, path));

    /*
     * This layer should never see a path without a path separator; the
     * unchecked write below asserts that fact.
     */
    dir = tmp->mem;
    last_sep = strrchr(dir, '/');
    last_sep[1] = '\0';

    fd = -1; /* -Wconditional-uninitialized */
    WT_SYSCALL_RETRY(((fd = open(dir, O_RDONLY, 0444)) == -1 ? -1 : 0), ret);
    if (ret != 0)
        WT_ERR_MSG(session, ret, "%s: directory-sync: open", dir);

    ret = __posix_sync(session, fd, dir, "directory-sync");

    /* Always close the descriptor, but don't let it mask a sync failure. */
    WT_SYSCALL(close(fd), close_ret);
    if (close_ret != 0) {
        __wt_err(session, close_ret, "%s: directory-sync: close", dir);
        if (ret == 0)
            ret = close_ret;
    }

err:
    __wt_scr_free(session, &tmp);
    return (ret);
}
/*
 * __curindex_compare --
 *     WT_CURSOR->compare method for the index cursor type.
 *
 * The first cursor must be an "index:" cursor and both must have the same
 * URI and pass the key check. The keys are compared with the index's
 * collator and the result is stored through cmpp.
 */
static int
__curindex_compare(WT_CURSOR *a, WT_CURSOR *b, int *cmpp)
{
    WT_CURSOR_INDEX *cindex;
    WT_DECL_RET;
    WT_SESSION_IMPL *session;

    cindex = (WT_CURSOR_INDEX *)a;

    JOINABLE_CURSOR_API_CALL(a, session, compare, NULL);

    /*
     * Matching URIs with an "index:" prefix on the first implies the second
     * is an index cursor on the same object.
     */
    if (!(WT_PREFIX_MATCH(a->uri, "index:") && strcmp(a->uri, b->uri) == 0))
        WT_ERR_MSG(session, EINVAL, "Cursors must reference the same object");

    WT_CURSOR_CHECKKEY(a);
    WT_CURSOR_CHECKKEY(b);

    ret = __wt_compare(session, cindex->index->collator, &a->key, &b->key, cmpp);

err:
    API_END_RET(session, ret);
}
/*
 * __ckpt_process --
 *	Process the list of checkpoints.
 *
 *	This is the first step of the two-step checkpoint protocol described
 *	in the body: extent lists of checkpoints flagged WT_CKPT_DELETE are
 *	rolled into the next non-fake checkpoint (or into the live system),
 *	checkpoints flagged WT_CKPT_UPDATE are rewritten, and the checkpoint
 *	flagged WT_CKPT_ADD is written from the live system.
 *
 *	session:  session handle, passed through to every helper call.
 *	block:    block handle; block->live, block->live_lock and
 *		  block->ckpt_inprogress are read and modified here.
 *	ckptbase: checkpoint list walked with WT_CKPT_FOREACH; entries have
 *		  their flags, ckpt_size and bpriv contents updated.
 *
 *	Returns 0 on success and an error value otherwise. Once the "fatal"
 *	flag has been set, any failure is reported via __wt_block_panic.
 *
 *	This function sets block->ckpt_inprogress and never clears it.
 *	NOTE(review): presumably the resolve step mentioned below clears the
 *	flag, including after a failed first step -- confirm against callers.
 */
static int
__ckpt_process(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_CKPT *ckptbase)
{
	WT_BLOCK_CKPT *a, *b, *ci;
	WT_CKPT *ckpt, *next_ckpt;
	WT_DECL_ITEM(tmp);
	WT_DECL_RET;
	uint64_t ckpt_size;
	bool deleting, fatal, locked;

	/*
	 * "ci" refers to the live system's checkpoint for the body of the
	 * function; it is reused as a scratch pointer by the cleanup loop at
	 * the end.
	 */
	ci = &block->live;
	fatal = locked = false;

#ifdef HAVE_DIAGNOSTIC
	WT_RET(__ckpt_verify(session, ckptbase));
#endif

	/*
	 * Checkpoints are a two-step process: first, write a new checkpoint to
	 * disk (including all the new extent lists for modified checkpoints
	 * and the live system). As part of this, create a list of file blocks
	 * newly available for reallocation, based on checkpoints being deleted.
	 * We then return the locations of the new checkpoint information to our
	 * caller. Our caller has to write that information into some kind of
	 * stable storage, and once that's done, we can actually allocate from
	 * that list of newly available file blocks. (We can't allocate from
	 * that list immediately because the allocation might happen before our
	 * caller saves the new checkpoint information, and if we crashed before
	 * the new checkpoint location was saved, we'd have overwritten blocks
	 * still referenced by checkpoints in the system.) In summary, there is
	 * a second step: after our caller saves the checkpoint information, we
	 * are called to add the newly available blocks into the live system's
	 * available list.
	 *
	 * This function is the first step, the second step is in the resolve
	 * function.
	 *
	 * If we're called to checkpoint the same file twice (without the second
	 * resolution step), or re-entered for any reason, it's an error in our
	 * caller, and our choices are all bad: leak blocks or potentially crash
	 * with our caller not yet having saved previous checkpoint information
	 * to stable storage.
	 *
	 * The in-progress flag is tested and set while holding the live lock;
	 * the result of the test is only acted on after the lock is released.
	 */
	__wt_spin_lock(session, &block->live_lock);
	if (block->ckpt_inprogress)
		ret = __wt_block_panic(session, EINVAL,
		    "%s: unexpected checkpoint ordering", block->name);
	else
		block->ckpt_inprogress = true;
	__wt_spin_unlock(session, &block->live_lock);
	WT_RET(ret);

	/*
	 * Extents newly available as a result of deleting previous checkpoints
	 * are added to a list of extents. The list should be empty, but as
	 * described above, there is no "free the checkpoint information" call
	 * into the block manager; if there was an error in an upper level that
	 * resulted in some previous checkpoint never being resolved, the list
	 * may not be empty. We should have caught that with the "checkpoint
	 * in progress" test, but it doesn't cost us anything to be cautious.
	 *
	 * We free the checkpoint's allocation and discard extent lists as part
	 * of the resolution step, not because they're needed at that time, but
	 * because it's potentially a lot of work, and waiting allows the btree
	 * layer to continue eviction sooner. As for the checkpoint-available
	 * list, make sure they get cleaned out.
	 *
	 * NOTE(review): the initialization below is wrapped in WT_RET rather
	 * than WT_ERR, so a failure here appears to bypass the "err" label
	 * while ckpt_inprogress is already set -- confirm that is intended.
	 */
	__wt_block_extlist_free(session, &ci->ckpt_avail);
	WT_RET(__wt_block_extlist_init(
	    session, &ci->ckpt_avail, "live", "ckpt_avail", true));
	__wt_block_extlist_free(session, &ci->ckpt_alloc);
	__wt_block_extlist_free(session, &ci->ckpt_discard);

	/*
	 * To delete a checkpoint, we'll need checkpoint information for it and
	 * the subsequent checkpoint into which it gets rolled; read them from
	 * disk before we lock things down.
	 */
	deleting = false;
	WT_CKPT_FOREACH(ckptbase, ckpt) {
		if (F_ISSET(ckpt, WT_CKPT_FAKE) ||
		    !F_ISSET(ckpt, WT_CKPT_DELETE))
			continue;
		deleting = true;

		/*
		 * Read the checkpoint and next checkpoint extent lists if we
		 * haven't already read them (we may have already read these
		 * extent blocks if there is more than one deleted checkpoint).
		 */
		if (ckpt->bpriv == NULL)
			WT_ERR(__ckpt_extlist_read(session, block, ckpt));

		/*
		 * Skip forward to the next entry not flagged WT_CKPT_FAKE.
		 * The scan has no explicit bound.
		 * NOTE(review): presumably every deleted checkpoint is
		 * followed by a non-fake entry (for example, the WT_CKPT_ADD
		 * slot) -- confirm against how the list is built.
		 */
		for (next_ckpt = ckpt + 1;; ++next_ckpt)
			if (!F_ISSET(next_ckpt, WT_CKPT_FAKE))
				break;

		/*
		 * The "next" checkpoint may be the live tree which has no
		 * extent blocks to read.
		 */
		if (next_ckpt->bpriv == NULL &&
		    !F_ISSET(next_ckpt, WT_CKPT_ADD))
			WT_ERR(__ckpt_extlist_read(session, block, next_ckpt));
	}

	/*
	 * Failures are now fatal: we can't currently back out the merge of any
	 * deleted checkpoint extent lists into the live system's extent lists,
	 * so continuing after error would leave the live system's extent lists
	 * corrupted for any subsequent checkpoint (and potentially, should a
	 * subsequent checkpoint succeed, for recovery).
	 */
	fatal = true;

	/*
	 * Hold a lock so the live extent lists and the file size can't change
	 * underneath us. I suspect we'll tighten this if checkpoints take too
	 * much time away from real work: we read the historic checkpoint
	 * information without a lock, but we could also merge and re-write the
	 * deleted and merged checkpoint information without a lock, except for
	 * the final merge of ranges into the live tree.
	 *
	 * "locked" records that the error path must release the lock.
	 */
	__wt_spin_lock(session, &block->live_lock);
	locked = true;

	/*
	 * We've allocated our last page, update the checkpoint size. We need
	 * to calculate the live system's checkpoint size before merging
	 * checkpoint allocation and discard information from the checkpoints
	 * we're deleting, those operations change the underlying byte counts.
	 */
	ckpt_size = ci->ckpt_size;
	ckpt_size += ci->alloc.bytes;
	ckpt_size -= ci->discard.bytes;

	/* Skip the additional processing if we aren't deleting checkpoints. */
	if (!deleting)
		goto live_update;

	/*
	 * Delete any no-longer-needed checkpoints: we do this first as it frees
	 * blocks to the live lists, and the freed blocks will then be included
	 * when writing the live extent lists.
	 */
	WT_CKPT_FOREACH(ckptbase, ckpt) {
		if (F_ISSET(ckpt, WT_CKPT_FAKE) ||
		    !F_ISSET(ckpt, WT_CKPT_DELETE))
			continue;

#ifdef HAVE_VERBOSE
		/*
		 * The scratch buffer is allocated on first use and reused for
		 * each deleted checkpoint; it is released at the end of the
		 * function.
		 */
		if (WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT)) {
			if (tmp == NULL)
				WT_ERR(__wt_scr_alloc(session, 0, &tmp));
			WT_ERR(__ckpt_string(
			    session, block, ckpt->raw.data, tmp));
			__wt_verbose(session, WT_VERB_CHECKPOINT,
			    "%s: delete-checkpoint: %s: %s",
			    block->name, ckpt->name, (const char *)tmp->data);
		}
#endif
		/*
		 * Find the checkpoint into which we'll roll this checkpoint's
		 * blocks: it's the next real checkpoint in the list, and it
		 * better have been read in (if it's not the add slot).
		 */
		for (next_ckpt = ckpt + 1;; ++next_ckpt)
			if (!F_ISSET(next_ckpt, WT_CKPT_FAKE))
				break;

		/*
		 * Set the from/to checkpoint structures, where the "to" value
		 * may be the live tree.
		 */
		a = ckpt->bpriv;
		if (F_ISSET(next_ckpt, WT_CKPT_ADD))
			b = &block->live;
		else
			b = next_ckpt->bpriv;

		/*
		 * Free the root page: there's nothing special about this free,
		 * the root page is allocated using normal rules, that is, it
		 * may have been taken from the avail list, and was entered on
		 * the live system's alloc list at that time. We free it into
		 * the checkpoint's discard list, however, not the live system's
		 * list because it appears on the checkpoint's alloc list and so
		 * must be paired in the checkpoint.
		 */
		if (a->root_offset != WT_BLOCK_INVALID_OFFSET)
			WT_ERR(__wt_block_insert_ext(session, block,
			    &a->discard, a->root_offset, a->root_size));

		/*
		 * Free the blocks used to hold the "from" checkpoint's extent
		 * lists, including the avail list.
		 */
		WT_ERR(__ckpt_extlist_fblocks(session, block, &a->alloc));
		WT_ERR(__ckpt_extlist_fblocks(session, block, &a->avail));
		WT_ERR(__ckpt_extlist_fblocks(session, block, &a->discard));

		/*
		 * Roll the "from" alloc and discard extent lists into the "to"
		 * checkpoint's lists.
		 */
		if (a->alloc.entries != 0)
			WT_ERR(__wt_block_extlist_merge(
			    session, block, &a->alloc, &b->alloc));
		if (a->discard.entries != 0)
			WT_ERR(__wt_block_extlist_merge(
			    session, block, &a->discard, &b->discard));

		/*
		 * If the "to" checkpoint is also being deleted, we're done with
		 * it, it's merged into some other checkpoint in the next loop.
		 * This means the extent lists may aggregate over a number of
		 * checkpoints, but that's OK, they're disjoint sets of ranges.
		 */
		if (F_ISSET(next_ckpt, WT_CKPT_DELETE))
			continue;

		/*
		 * Find blocks for re-use: wherever the "to" checkpoint's
		 * allocate and discard lists overlap, move the range to
		 * the live system's checkpoint available list.
		 */
		WT_ERR(__wt_block_extlist_overlap(session, block, b));

		/*
		 * If we're updating the live system's information, we're done.
		 */
		if (F_ISSET(next_ckpt, WT_CKPT_ADD))
			continue;

		/*
		 * We have to write the "to" checkpoint's extent lists out in
		 * new blocks, and update its cookie.
		 *
		 * Free the blocks used to hold the "to" checkpoint's extent
		 * lists; don't include the avail list, it's not changing.
		 *
		 * The rewrite itself is deferred: the checkpoint is only
		 * flagged here and is updated by the loop that follows.
		 */
		WT_ERR(__ckpt_extlist_fblocks(session, block, &b->alloc));
		WT_ERR(__ckpt_extlist_fblocks(session, block, &b->discard));

		F_SET(next_ckpt, WT_CKPT_UPDATE);
	}

	/* Update checkpoints marked for update. */
	WT_CKPT_FOREACH(ckptbase, ckpt)
		if (F_ISSET(ckpt, WT_CKPT_UPDATE))
			WT_ERR(__ckpt_update(
			    session, block, ckpt, ckpt->bpriv, false));

live_update:
	/* Truncate the file if that's possible. */
	WT_ERR(__wt_block_extlist_truncate(session, block, &ci->avail));

	/* Update the final, added checkpoint based on the live system. */
	WT_CKPT_FOREACH(ckptbase, ckpt)
		if (F_ISSET(ckpt, WT_CKPT_ADD)) {
			/*
			 * !!!
			 * Our caller wants the final checkpoint size. Setting
			 * the size here violates layering, but the alternative
			 * is a call for the btree layer to crack the checkpoint
			 * cookie into its components, and that's a fair amount
			 * of work.
			 */
			ckpt->ckpt_size = ckpt_size;

			/*
			 * Set the rolling checkpoint size for the live system.
			 * The current size includes the current checkpoint's
			 * root page size (root pages are on the checkpoint's
			 * block allocation list as root pages are allocated
			 * with the usual block allocation functions). That's
			 * correct, but we don't want to include it in the size
			 * for the next checkpoint.
			 */
			ckpt_size -= ci->root_size;

			/*
			 * Additionally, we had a bug for a while where the live
			 * checkpoint size grew without bound. We can't sanity
			 * check the value, that would require walking the tree
			 * as part of the checkpoint. Bound any bug at the size
			 * of the file.
			 * It isn't practical to assert that the value is within
			 * bounds since databases created with older versions
			 * of WiredTiger (2.8.0) would likely see an error.
			 */
			ci->ckpt_size =
			    WT_MIN(ckpt_size, (uint64_t)block->size);

			WT_ERR(__ckpt_update(session, block, ckpt, ci, true));
		}

	/*
	 * Reset the live system's alloc and discard extent lists, leave the
	 * avail list alone. This includes freeing a lot of extents, so do it
	 * outside of the system's lock by copying and resetting the original,
	 * then doing the work later.
	 *
	 * The copies are structure assignments into ckpt_alloc/ckpt_discard;
	 * this function frees those two lists near its start.
	 */
	ci->ckpt_alloc = ci->alloc;
	WT_ERR(__wt_block_extlist_init(
	    session, &ci->alloc, "live", "alloc", false));
	ci->ckpt_discard = ci->discard;
	WT_ERR(__wt_block_extlist_init(
	    session, &ci->discard, "live", "discard", false));

#ifdef HAVE_DIAGNOSTIC
	/*
	 * The first checkpoint in the system should always have an empty
	 * discard list. If we've read that checkpoint and/or created it,
	 * check.
	 *
	 * The loop stops at the first entry not flagged WT_CKPT_DELETE; if
	 * that entry has no loaded checkpoint information, the live system's
	 * discard list is checked instead. Because "fatal" is set by this
	 * point, a failure of this check goes through the panic path below.
	 */
	WT_CKPT_FOREACH(ckptbase, ckpt)
		if (!F_ISSET(ckpt, WT_CKPT_DELETE))
			break;
	if ((a = ckpt->bpriv) == NULL)
		a = &block->live;
	if (a->discard.entries != 0)
		WT_ERR_MSG(session, WT_ERROR,
		    "first checkpoint incorrectly has blocks on the discard "
		    "list");
#endif

	/*
	 * Common exit path for success and failure. Errors raised after
	 * "fatal" was set are converted to a panic before the lock is
	 * released.
	 */
err:	if (ret != 0 && fatal)
		ret = __wt_block_panic(session, ret,
		    "%s: fatal checkpoint failure", block->name);

	if (locked)
		__wt_spin_unlock(session, &block->live_lock);

	/*
	 * Discard any checkpoint information we loaded.
	 *
	 * "ci" is reused as the iterator here and no longer refers to the
	 * live system. The bpriv pointers themselves are not cleared.
	 * NOTE(review): presumably the owner of the checkpoint list releases
	 * bpriv later -- confirm.
	 */
	WT_CKPT_FOREACH(ckptbase, ckpt)
		if ((ci = ckpt->bpriv) != NULL)
			__wt_block_ckpt_destroy(session, ci);

	__wt_scr_free(session, &tmp);
	return (ret);
}