/* * __recovery_file_scan -- * Scan the files referenced from the metadata and gather information * about them for recovery. */ static int __recovery_file_scan(WT_RECOVERY *r) { WT_DECL_RET; WT_CURSOR *c; const char *uri, *config; int cmp; /* Scan through all files in the metadata. */ c = r->files[0].c; c->set_key(c, "file:"); if ((ret = c->search_near(c, &cmp)) != 0) { /* Is the metadata empty? */ if (ret == WT_NOTFOUND) ret = 0; goto err; } if (cmp < 0) WT_ERR_NOTFOUND_OK(c->next(c)); for (; ret == 0; ret = c->next(c)) { WT_ERR(c->get_key(c, &uri)); if (!WT_PREFIX_MATCH(uri, "file:")) break; WT_ERR(c->get_value(c, &config)); WT_ERR(__recovery_setup_file(r, uri, config)); } WT_ERR_NOTFOUND_OK(ret); err: return (ret); }
/* * __backup_all -- * Backup all objects in the database. */ static int __backup_all(WT_SESSION_IMPL *session, WT_CURSOR_BACKUP *cb) { WT_CONFIG_ITEM cval; WT_CURSOR *cursor; WT_DECL_RET; const char *key, *value; cursor = NULL; /* * Open a cursor on the metadata file and copy all of the entries to * the hot backup file. */ WT_ERR(__wt_metadata_cursor(session, NULL, &cursor)); while ((ret = cursor->next(cursor)) == 0) { WT_ERR(cursor->get_key(cursor, &key)); WT_ERR(cursor->get_value(cursor, &value)); WT_ERR_TEST((fprintf( cb->bfp, "%s\n%s\n", key, value) < 0), __wt_errno()); /* * While reading the metadata file, check there are no "sources" * or "types" which can't support hot backup. This checks for * a data source that's non-standard, which can't be backed up, * but is also sanity checking: if there's an entry backed by * anything other than a file or lsm entry, we're confused. */ if ((ret = __wt_config_getones( session, value, "type", &cval)) == 0 && !WT_PREFIX_MATCH_LEN(cval.str, cval.len, "file") && !WT_PREFIX_MATCH_LEN(cval.str, cval.len, "lsm")) WT_ERR_MSG(session, ENOTSUP, "hot backup is not supported for objects of " "type %.*s", (int)cval.len, cval.str); WT_ERR_NOTFOUND_OK(ret); if ((ret =__wt_config_getones( session, value, "source", &cval)) == 0 && !WT_PREFIX_MATCH_LEN(cval.str, cval.len, "file:") && !WT_PREFIX_MATCH_LEN(cval.str, cval.len, "lsm:")) WT_ERR_MSG(session, ENOTSUP, "hot backup is not supported for objects of " "source %.*s", (int)cval.len, cval.str); WT_ERR_NOTFOUND_OK(ret); } WT_ERR_NOTFOUND_OK(ret); /* Build a list of the file objects that need to be copied. */ WT_WITH_DHANDLE_LOCK(session, ret = __wt_meta_btree_apply( session, __backup_list_all_append, NULL)); err: if (cursor != NULL) WT_TRET(cursor->close(cursor)); return (ret); }
/* * __recovery_set_checkpoint_timestamp -- * Set the checkpoint timestamp as retrieved from the metadata file. */ static int __recovery_set_checkpoint_timestamp(WT_RECOVERY *r) { WT_CONFIG_ITEM cval; WT_CONNECTION_IMPL *conn; WT_DECL_RET; WT_SESSION_IMPL *session; wt_timestamp_t ckpt_timestamp; char ts_string[WT_TS_INT_STRING_SIZE], *sys_config; sys_config = NULL; session = r->session; conn = S2C(session); /* * Read the system checkpoint information from the metadata file and * save the stable timestamp of the last checkpoint for later query. * This gets saved in the connection. */ ckpt_timestamp = 0; /* Search in the metadata for the system information. */ WT_ERR_NOTFOUND_OK( __wt_metadata_search(session, WT_SYSTEM_CKPT_URI, &sys_config)); if (sys_config != NULL) { WT_CLEAR(cval); WT_ERR_NOTFOUND_OK(__wt_config_getones( session, sys_config, "checkpoint_timestamp", &cval)); if (cval.len != 0) { __wt_verbose(session, WT_VERB_RECOVERY, "Recovery timestamp %.*s", (int)cval.len, cval.str); WT_ERR(__wt_txn_parse_timestamp_raw(session, "recovery", &ckpt_timestamp, &cval)); } } /* * Set the recovery checkpoint timestamp and the metadata checkpoint * timestamp so that the checkpoint after recovery writes the correct * value into the metadata. */ conn->txn_global.meta_ckpt_timestamp = conn->txn_global.recovery_timestamp = ckpt_timestamp; if (WT_VERBOSE_ISSET(session, WT_VERB_RECOVERY | WT_VERB_RECOVERY_PROGRESS)) { __wt_timestamp_to_string( conn->txn_global.recovery_timestamp, ts_string); __wt_verbose(session, WT_VERB_RECOVERY | WT_VERB_RECOVERY_PROGRESS, "Set global recovery timestamp: %s", ts_string); } err: __wt_free(session, sys_config); return (ret); }
/* * __backup_uri -- * Backup a list of objects. */ static int __backup_uri(WT_SESSION_IMPL *session, WT_CURSOR_BACKUP *cb, const char *cfg[], int *foundp, int *log_only) { WT_CONFIG targetconf; WT_CONFIG_ITEM cval, k, v; WT_DECL_ITEM(tmp); WT_DECL_RET; int target_list; const char *uri; *foundp = 0; *log_only = 0; /* * If we find a non-empty target configuration string, we have a job, * otherwise it's not our problem. */ WT_RET(__wt_config_gets(session, cfg, "target", &cval)); WT_RET(__wt_config_subinit(session, &targetconf, &cval)); for (cb->list_next = 0, target_list = 0; (ret = __wt_config_next(&targetconf, &k, &v)) == 0; ++target_list) { /* If it is our first time through, allocate. */ if (target_list == 0) { *foundp = 1; WT_ERR(__wt_scr_alloc(session, 512, &tmp)); } WT_ERR(__wt_buf_fmt(session, tmp, "%.*s", (int)k.len, k.str)); uri = tmp->data; if (v.len != 0) WT_ERR_MSG(session, EINVAL, "%s: invalid backup target: URIs may need quoting", uri); /* * Handle log targets. We do not need to go through the * schema worker, just call the function to append them. * Set log_only only if it is our only URI target. */ if (WT_PREFIX_MATCH(uri, "log:")) { if (target_list == 0) *log_only = 1; else *log_only = 0; WT_ERR(__wt_backup_list_uri_append( session, uri, NULL)); } else WT_ERR(__wt_schema_worker(session, uri, NULL, __wt_backup_list_uri_append, cfg, 0)); } WT_ERR_NOTFOUND_OK(ret); err: __wt_scr_free(session, &tmp); return (ret); }
/* * __wt_las_remove_block -- * Remove all records matching a key prefix from the lookaside store. */ int __wt_las_remove_block(WT_SESSION_IMPL *session, WT_CURSOR *cursor, uint32_t btree_id, const uint8_t *addr, size_t addr_size) { WT_DECL_ITEM(las_addr); WT_DECL_ITEM(las_key); WT_DECL_RET; uint64_t las_counter, las_txnid; uint32_t las_id; int exact; WT_ERR(__wt_scr_alloc(session, 0, &las_addr)); WT_ERR(__wt_scr_alloc(session, 0, &las_key)); /* * Search for the block's unique prefix and step through all matching * records, removing them. */ las_addr->data = addr; las_addr->size = addr_size; las_key->size = 0; cursor->set_key( cursor, btree_id, las_addr, (uint64_t)0, (uint32_t)0, las_key); if ((ret = cursor->search_near(cursor, &exact)) == 0 && exact < 0) ret = cursor->next(cursor); for (; ret == 0; ret = cursor->next(cursor)) { WT_ERR(cursor->get_key(cursor, &las_id, las_addr, &las_counter, &las_txnid, las_key)); /* * Confirm the search using the unique prefix; if not a match, * we're done searching for records for this page. */ if (las_id != btree_id || las_addr->size != addr_size || memcmp(las_addr->data, addr, addr_size) != 0) break; /* * Cursor opened overwrite=true: won't return WT_NOTFOUND should * another thread remove the record before we do, and the cursor * remains positioned in that case. */ WT_ERR(cursor->remove(cursor)); } WT_ERR_NOTFOUND_OK(ret); err: __wt_scr_free(session, &las_addr); __wt_scr_free(session, &las_key); return (ret); }
/* * __clsm_next -- * WT_CURSOR->next method for the LSM cursor type. */ static int __clsm_next(WT_CURSOR *cursor) { WT_CURSOR_LSM *clsm; WT_CURSOR *c; WT_DECL_RET; WT_SESSION_IMPL *session; u_int i; int cmp; bool check, deleted; clsm = (WT_CURSOR_LSM *)cursor; CURSOR_API_CALL(cursor, session, next, NULL); WT_CURSOR_NOVALUE(cursor); WT_ERR(__clsm_enter(clsm, false, false)); /* If we aren't positioned for a forward scan, get started. */ if (clsm->current == NULL || !F_ISSET(clsm, WT_CLSM_ITERATE_NEXT)) { F_CLR(clsm, WT_CLSM_MULTIPLE); WT_FORALL_CURSORS(clsm, c, i) { if (!F_ISSET(cursor, WT_CURSTD_KEY_SET)) { WT_ERR(c->reset(c)); ret = c->next(c); } else if (c != clsm->current) { c->set_key(c, &cursor->key); if ((ret = c->search_near(c, &cmp)) == 0) { if (cmp < 0) ret = c->next(c); else if (cmp == 0) { if (clsm->current == NULL) clsm->current = c; else F_SET(clsm, WT_CLSM_MULTIPLE); } } else F_CLR(c, WT_CURSTD_KEY_SET); } WT_ERR_NOTFOUND_OK(ret); } F_SET(clsm, WT_CLSM_ITERATE_NEXT); F_CLR(clsm, WT_CLSM_ITERATE_PREV); /* We just positioned *at* the key, now move. */ if (clsm->current != NULL) goto retry; } else {
/* * __truncate_dsrc -- * WT_SESSION::truncate for a data-source without a truncate operation. */ static int __truncate_dsrc(WT_SESSION_IMPL *session, const char *uri) { WT_CURSOR *cursor; WT_DECL_RET; const char *cfg[2]; /* Open a cursor and traverse the object, removing every entry. */ cfg[0] = WT_CONFIG_BASE(session, session_open_cursor); cfg[1] = NULL; WT_RET(__wt_open_cursor(session, uri, NULL, cfg, &cursor)); while ((ret = cursor->next(cursor)) == 0) WT_ERR(cursor->remove(cursor)); WT_ERR_NOTFOUND_OK(ret); err: WT_TRET(cursor->close(cursor)); return (ret); }
/* * __backup_uri -- * Backup a list of objects. */ static int __backup_uri(WT_SESSION_IMPL *session, WT_CURSOR_BACKUP *cb, const char *cfg[], int *foundp) { WT_CONFIG targetconf; WT_CONFIG_ITEM cval, k, v; WT_DECL_ITEM(tmp); WT_DECL_RET; int target_list; const char *uri; *foundp = target_list = 0; /* * If we find a non-empty target configuration string, we have a job, * otherwise it's not our problem. */ WT_RET(__wt_config_gets(session, cfg, "target", &cval)); WT_RET(__wt_config_subinit(session, &targetconf, &cval)); for (cb->list_next = 0; (ret = __wt_config_next(&targetconf, &k, &v)) == 0;) { if (!target_list) { target_list = *foundp = 1; WT_ERR(__wt_scr_alloc(session, 512, &tmp)); } WT_ERR(__wt_buf_fmt(session, tmp, "%.*s", (int)k.len, k.str)); uri = tmp->data; if (v.len != 0) WT_ERR_MSG(session, EINVAL, "%s: invalid backup target: URIs may need quoting", uri); WT_ERR(__wt_schema_worker( session, uri, NULL, __wt_backup_list_uri_append, cfg, 0)); } WT_ERR_NOTFOUND_OK(ret); err: __wt_scr_free(&tmp); return (ret); }
/* * __conn_reconfigure -- * WT_CONNECTION->reconfigure method. */ static int __conn_reconfigure(WT_CONNECTION *wt_conn, const char *config) { WT_CONFIG_ITEM cval; WT_CONNECTION_IMPL *conn; WT_DECL_RET; WT_SESSION_IMPL *session; /* * Special version of cfg that doesn't include the default config: used * to limit changes to values that the application sets explicitly. * Note that any function using this value has to be prepared to handle * not-found as a valid option return. */ const char *raw_cfg[] = { config, NULL }; conn = (WT_CONNECTION_IMPL *)wt_conn; CONNECTION_API_CALL(conn, session, reconfigure, config, cfg); /* Turning on statistics clears any existing values. */ if ((ret = __wt_config_gets(session, raw_cfg, "statistics", &cval)) == 0) { conn->statistics = cval.val == 0 ? 0 : 1; if (conn->statistics) __wt_stat_clear_connection_stats(&conn->stats); } WT_ERR_NOTFOUND_OK(ret); WT_ERR(__wt_conn_cache_pool_config(session, cfg)); WT_ERR(__wt_cache_config(conn, raw_cfg)); WT_ERR(__conn_verbose_config(session, raw_cfg)); /* Wake up the cache pool server so any changes are noticed. */ if (F_ISSET(conn, WT_CONN_CACHE_POOL)) WT_ERR(__wt_cond_signal( session, __wt_process.cache_pool->cache_pool_cond)); err: API_END(session); return (ret); }
/* * __metadata_load_bulk -- * Create any bulk-loaded file stubs. */ static int __metadata_load_bulk(WT_SESSION_IMPL *session) { WT_CURSOR *cursor; WT_DECL_RET; uint32_t allocsize; bool exist; const char *filecfg[] = { WT_CONFIG_BASE(session, file_meta), NULL, NULL }; const char *key, *value; /* * If a file was being bulk-loaded during the hot backup, it will appear * in the metadata file, but the file won't exist. Create on demand. */ WT_RET(__wt_metadata_cursor(session, &cursor)); while ((ret = cursor->next(cursor)) == 0) { WT_ERR(cursor->get_key(cursor, &key)); if (!WT_PREFIX_SKIP(key, "file:")) continue; /* If the file exists, it's all good. */ WT_ERR(__wt_fs_exist(session, key, &exist)); if (exist) continue; /* * If the file doesn't exist, assume it's a bulk-loaded file; * retrieve the allocation size and re-create the file. */ WT_ERR(cursor->get_value(cursor, &value)); filecfg[1] = value; WT_ERR(__wt_direct_io_size_check( session, filecfg, "allocation_size", &allocsize)); WT_ERR(__wt_block_manager_create(session, key, allocsize)); } WT_ERR_NOTFOUND_OK(ret); err: WT_TRET(__wt_metadata_cursor_release(session, &cursor)); return (ret); }
/* * __fill_index -- * Fill the index from the current contents of the table. */ static int __fill_index(WT_SESSION_IMPL *session, WT_TABLE *table, WT_INDEX *idx) { WT_DECL_RET; WT_CURSOR *tcur, *icur; WT_SESSION *wt_session; wt_session = &session->iface; tcur = NULL; icur = NULL; WT_RET(__wt_schema_open_colgroups(session, table)); /* * If the column groups have not been completely created, * there cannot be data inserted yet, and we're done. */ if (!table->cg_complete) return (0); WT_ERR(wt_session->open_cursor(wt_session, idx->source, NULL, "bulk=unordered", &icur)); WT_ERR(wt_session->open_cursor(wt_session, table->name, NULL, "readonly", &tcur)); while ((ret = tcur->next(tcur)) == 0) WT_ERR(__wt_apply_single_idx(session, idx, icur, (WT_CURSOR_TABLE *)tcur, icur->insert)); WT_ERR_NOTFOUND_OK(ret); err: if (icur) WT_TRET(icur->close(icur)); if (tcur) WT_TRET(tcur->close(tcur)); return (ret); }
/* * __backup_uri -- * Backup a list of objects. */ static int __backup_uri(WT_SESSION_IMPL *session, const char *cfg[], bool *foundp, bool *log_only) { WT_CONFIG targetconf; WT_CONFIG_ITEM cval, k, v; WT_DECL_ITEM(tmp); WT_DECL_RET; bool target_list; const char *uri; *foundp = *log_only = false; /* * If we find a non-empty target configuration string, we have a job, * otherwise it's not our problem. */ WT_RET(__wt_config_gets(session, cfg, "target", &cval)); __wt_config_subinit(session, &targetconf, &cval); for (target_list = false; (ret = __wt_config_next(&targetconf, &k, &v)) == 0; target_list = true) { /* If it is our first time through, allocate. */ if (!target_list) { *foundp = true; WT_ERR(__wt_scr_alloc(session, 512, &tmp)); } WT_ERR(__wt_buf_fmt(session, tmp, "%.*s", (int)k.len, k.str)); uri = tmp->data; if (v.len != 0) WT_ERR_MSG(session, EINVAL, "%s: invalid backup target: URIs may need quoting", uri); /* * Handle log targets. We do not need to go through the schema * worker, just call the function to append them. Set log_only * only if it is our only URI target. */ if (WT_PREFIX_MATCH(uri, "log:")) { /* * Log archive cannot mix with incremental backup, don't * let that happen. */ if (FLD_ISSET( S2C(session)->log_flags, WT_CONN_LOG_ARCHIVE)) WT_ERR_MSG(session, EINVAL, "incremental backup not possible when " "automatic log archival configured"); *log_only = !target_list; WT_ERR(__backup_log_append( session, session->bkp_cursor, false)); } else { *log_only = false; WT_ERR(__wt_schema_worker(session, uri, NULL, __backup_list_uri_append, cfg, 0)); } } WT_ERR_NOTFOUND_OK(ret); err: __wt_scr_free(session, &tmp); return (ret); }
/* * __config_merge_scan -- * Walk a configuration string, inserting entries into the merged array. */ static int __config_merge_scan(WT_SESSION_IMPL *session, const char *key, const char *value, WT_CONFIG_MERGE *cp) { WT_CONFIG cparser; WT_CONFIG_ITEM k, v; WT_DECL_ITEM(kb); WT_DECL_ITEM(vb); WT_DECL_RET; size_t len; WT_ERR(__wt_scr_alloc(session, 0, &kb)); WT_ERR(__wt_scr_alloc(session, 0, &vb)); WT_ERR(__wt_config_init(session, &cparser, value)); while ((ret = __wt_config_next(&cparser, &k, &v)) == 0) { if (k.type != WT_CONFIG_ITEM_STRING && k.type != WT_CONFIG_ITEM_ID) WT_ERR_MSG(session, EINVAL, "Invalid configuration key found: '%s'\n", k.str); /* Include the quotes around string keys/values. */ if (k.type == WT_CONFIG_ITEM_STRING) { --k.str; k.len += 2; } if (v.type == WT_CONFIG_ITEM_STRING) { --v.str; v.len += 2; } /* * !!! * We're using a JSON quote character to separate the names we * create for nested structures. That's not completely safe as * it's possible to quote characters in JSON such that a quote * character appears as a literal character in a key name. In * a few cases, applications can create their own key namespace * (for example, shared library extension names), and therefore * it's possible for an application to confuse us. Error if we * we ever see a key with a magic character. */ for (len = 0; len < k.len; ++len) if (k.str[len] == SEPC) WT_ERR_MSG(session, EINVAL, "key %.*s contains a '%c' separator " "character", (int)k.len, (char *)k.str, SEPC); /* Build the key/value strings. */ WT_ERR(__wt_buf_fmt(session, kb, "%s%s%.*s", key == NULL ? "" : key, key == NULL ? "" : SEP, (int)k.len, k.str)); WT_ERR(__wt_buf_fmt(session, vb, "%.*s", (int)v.len, v.str)); /* * If the value is a structure, recursively parse it. * * !!! * Don't merge unless the structure has field names. WiredTiger * stores checkpoint LSNs in the metadata file using nested * structures without field names: "checkpoint_lsn=(1,0)", not * "checkpoint_lsn=(file=1,offset=0)". The value type is still * WT_CONFIG_ITEM_STRUCT, so we check for a field name in the * value. */ if (v.type == WT_CONFIG_ITEM_STRUCT && strchr(vb->data, '=') != NULL) { WT_ERR(__config_merge_scan( session, kb->data, vb->data, cp)); continue; } /* Insert the value into the array. */ WT_ERR(__wt_realloc_def(session, &cp->entries_allocated, cp->entries_next + 1, &cp->entries)); WT_ERR(__wt_strndup(session, kb->data, kb->size, &cp->entries[cp->entries_next].k)); WT_ERR(__wt_strndup(session, vb->data, vb->size, &cp->entries[cp->entries_next].v)); cp->entries[cp->entries_next].gen = cp->entries_next; ++cp->entries_next; } WT_ERR_NOTFOUND_OK(ret); err: __wt_scr_free(session, &kb); __wt_scr_free(session, &vb); return (ret); }
/* * wiredtiger_open -- * Main library entry point: open a new connection to a WiredTiger * database. */ int wiredtiger_open(const char *home, WT_EVENT_HANDLER *event_handler, const char *config, WT_CONNECTION **wt_connp) { static WT_CONNECTION stdc = { __conn_close, __conn_reconfigure, __conn_get_home, __conn_is_new, __conn_open_session, __conn_load_extension, __conn_add_data_source, __conn_add_collator, __conn_add_compressor, __conn_add_extractor }; static struct { const char *name; uint32_t flag; } *ft, directio_types[] = { { "data", WT_DIRECTIO_DATA }, { "log", WT_DIRECTIO_LOG }, { NULL, 0 } }; WT_CONFIG subconfig; WT_CONFIG_ITEM cval, skey, sval; WT_CONNECTION_IMPL *conn; WT_DECL_RET; WT_ITEM *cbuf, expath, exconfig; WT_SESSION_IMPL *session; const char *cfg[] = { __wt_confdfl_wiredtiger_open, config, NULL, NULL, NULL }; int exist; *wt_connp = NULL; session = NULL; cbuf = NULL; WT_CLEAR(expath); WT_CLEAR(exconfig); WT_RET(__wt_library_init()); WT_RET(__wt_calloc_def(NULL, 1, &conn)); conn->iface = stdc; /* * Immediately link the structure into the connection structure list: * the only thing ever looked at on that list is the database name, * and a NULL value is fine. */ __wt_spin_lock(NULL, &__wt_process.spinlock); TAILQ_INSERT_TAIL(&__wt_process.connqh, conn, q); __wt_spin_unlock(NULL, &__wt_process.spinlock); session = conn->default_session = &conn->dummy_session; session->iface.connection = &conn->iface; session->name = "wiredtiger_open"; __wt_event_handler_set(session, event_handler); /* Remaining basic initialization of the connection structure. */ WT_ERR(__wt_connection_init(conn)); /* Check the configuration strings. */ WT_ERR(__wt_config_check( session, __wt_confchk_wiredtiger_open, config, 0)); /* Get the database home. */ WT_ERR(__conn_home(session, home, cfg)); /* Make sure no other thread of control already owns this database. */ WT_ERR(__conn_single(session, cfg)); /* Read the database-home configuration file. */ WT_ERR(__conn_config_file(session, cfg, &cbuf)); /* Read the environment variable configuration. */ WT_ERR(__conn_config_env(session, cfg)); WT_ERR(__wt_config_gets(session, cfg, "hazard_max", &cval)); conn->hazard_max = (uint32_t)cval.val; WT_ERR(__wt_config_gets(session, cfg, "session_max", &cval)); conn->session_size = (uint32_t)cval.val + WT_NUM_INTERNAL_SESSIONS; WT_ERR(__wt_config_gets(session, cfg, "lsm_merge", &cval)); if (cval.val) F_SET(conn, WT_CONN_LSM_MERGE); WT_ERR(__wt_config_gets(session, cfg, "sync", &cval)); if (cval.val) F_SET(conn, WT_CONN_SYNC); WT_ERR(__wt_config_gets(session, cfg, "transactional", &cval)); if (cval.val) F_SET(conn, WT_CONN_TRANSACTIONAL); /* Configure verbose flags. */ WT_ERR(__conn_verbose_config(session, cfg)); WT_ERR(__wt_conn_cache_pool_config(session, cfg)); WT_ERR(__wt_config_gets(session, cfg, "logging", &cval)); if (cval.val != 0) WT_ERR(__wt_open( session, WT_LOG_FILENAME, 1, 0, 0, &conn->log_fh)); /* Configure direct I/O and buffer alignment. */ WT_ERR(__wt_config_gets(session, cfg, "buffer_alignment", &cval)); if (cval.val == -1) conn->buffer_alignment = WT_BUFFER_ALIGNMENT_DEFAULT; else conn->buffer_alignment = (size_t)cval.val; #ifndef HAVE_POSIX_MEMALIGN if (conn->buffer_alignment != 0) WT_ERR_MSG(session, EINVAL, "buffer_alignment requires posix_memalign"); #endif /* * Configuration: direct_io, mmap, statistics. */ WT_ERR(__wt_config_gets(session, cfg, "direct_io", &cval)); for (ft = directio_types; ft->name != NULL; ft++) { ret = __wt_config_subgets(session, &cval, ft->name, &sval); if (ret == 0) { if (sval.val) FLD_SET(conn->direct_io, ft->flag); } else if (ret != WT_NOTFOUND) goto err; } WT_ERR(__wt_config_gets(session, cfg, "mmap", &cval)); conn->mmap = cval.val == 0 ? 0 : 1; WT_ERR(__wt_config_gets(session, cfg, "statistics", &cval)); conn->statistics = cval.val == 0 ? 0 : 1; /* Load any extensions referenced in the config. */ WT_ERR(__wt_config_gets(session, cfg, "extensions", &cval)); WT_ERR(__wt_config_subinit(session, &subconfig, &cval)); while ((ret = __wt_config_next(&subconfig, &skey, &sval)) == 0) { WT_ERR(__wt_buf_fmt( session, &expath, "%.*s", (int)skey.len, skey.str)); if (sval.len > 0) WT_ERR(__wt_buf_fmt(session, &exconfig, "entry=%.*s\n", (int)sval.len, sval.str)); WT_ERR(conn->iface.load_extension(&conn->iface, expath.data, (sval.len > 0) ? exconfig.data : NULL)); } WT_ERR_NOTFOUND_OK(ret); /* * Open the connection; if that fails, the connection handle has been * destroyed by the time the open function returns. */ if ((ret = __wt_connection_open(conn, cfg)) != 0) { conn = NULL; WT_ERR(ret); } /* Open the default session. */ WT_ERR(__wt_open_session(conn, 1, NULL, NULL, &conn->default_session)); session = conn->default_session; /* * Check on the turtle and metadata files, creating them if necessary * (which avoids application threads racing to create the metadata file * later). */ WT_ERR(__wt_meta_turtle_init(session, &exist)); if (!exist) { /* * We're single-threaded, but acquire the schema lock * regardless: the lower level code checks that it is * appropriately synchronized. */ WT_WITH_SCHEMA_LOCK(session, ret = __wt_schema_create(session, WT_METADATA_URI, NULL)); WT_ERR(ret); } WT_ERR(__wt_metadata_open(session)); /* If there's a hot-backup file, load it. */ WT_ERR(__wt_metadata_load_backup(session)); /* * XXX LSM initialization. * This is structured so that it could be moved to an extension. */ WT_ERR(__wt_lsm_init(&conn->iface, NULL)); STATIC_ASSERT(offsetof(WT_CONNECTION_IMPL, iface) == 0); *wt_connp = &conn->iface; /* * Destroying the connection on error will destroy our session handle, * cleanup using the session handle first, then discard the connection. */ err: if (cbuf != NULL) __wt_buf_free(session, cbuf); __wt_buf_free(session, &expath); __wt_buf_free(session, &exconfig); if (ret != 0 && conn != NULL) WT_TRET(__wt_connection_destroy(conn)); /* Let the server threads proceed. */ if (ret == 0) conn->connection_initialized = 1; return (ret); }
/* * __wt_lsm_merge -- * Merge a set of chunks of an LSM tree. */ int __wt_lsm_merge(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, u_int id) { WT_BLOOM *bloom; WT_CURSOR *dest, *src; WT_DECL_RET; WT_ITEM key, value; WT_LSM_CHUNK *chunk; uint32_t generation; uint64_t insert_count, record_count; u_int dest_id, end_chunk, i, nchunks, start_chunk, start_id, verb; int tret; bool created_chunk, create_bloom, locked, in_sync; const char *cfg[3]; const char *drop_cfg[] = { WT_CONFIG_BASE(session, WT_SESSION_drop), "force", NULL }; bloom = NULL; chunk = NULL; dest = src = NULL; start_id = 0; created_chunk = create_bloom = locked = in_sync = false; /* Fast path if it's obvious no merges could be done. */ if (lsm_tree->nchunks < lsm_tree->merge_min && lsm_tree->merge_aggressiveness < WT_LSM_AGGRESSIVE_THRESHOLD) return (WT_NOTFOUND); /* * Use the lsm_tree lock to read the chunks (so no switches occur), but * avoid holding it while the merge is in progress: that may take a * long time. */ WT_RET(__wt_lsm_tree_writelock(session, lsm_tree)); locked = true; WT_ERR(__lsm_merge_span(session, lsm_tree, id, &start_chunk, &end_chunk, &record_count)); nchunks = (end_chunk + 1) - start_chunk; WT_ASSERT(session, nchunks > 0); start_id = lsm_tree->chunk[start_chunk]->id; /* Find the merge generation. */ for (generation = 0, i = 0; i < nchunks; i++) generation = WT_MAX(generation, lsm_tree->chunk[start_chunk + i]->generation + 1); WT_ERR(__wt_lsm_tree_writeunlock(session, lsm_tree)); locked = false; /* Allocate an ID for the merge. */ dest_id = __wt_atomic_add32(&lsm_tree->last, 1); /* * We only want to do the chunk loop if we're running with verbose, * so we wrap these statements in the conditional. Avoid the loop * in the normal path. */ if (WT_VERBOSE_ISSET(session, WT_VERB_LSM)) { WT_ERR(__wt_verbose(session, WT_VERB_LSM, "Merging %s chunks %u-%u into %u (%" PRIu64 " records)" ", generation %" PRIu32, lsm_tree->name, start_chunk, end_chunk, dest_id, record_count, generation)); for (verb = start_chunk; verb <= end_chunk; verb++) WT_ERR(__wt_verbose(session, WT_VERB_LSM, "Merging %s: Chunk[%u] id %u, gen: %" PRIu32 ", size: %" PRIu64 ", records: %" PRIu64, lsm_tree->name, verb, lsm_tree->chunk[verb]->id, lsm_tree->chunk[verb]->generation, lsm_tree->chunk[verb]->size, lsm_tree->chunk[verb]->count)); } WT_ERR(__wt_calloc_one(session, &chunk)); created_chunk = true; chunk->id = dest_id; if (FLD_ISSET(lsm_tree->bloom, WT_LSM_BLOOM_MERGED) && (FLD_ISSET(lsm_tree->bloom, WT_LSM_BLOOM_OLDEST) || start_chunk > 0) && record_count > 0) create_bloom = true; /* * Special setup for the merge cursor: * first, reset to open the dependent cursors; * then restrict the cursor to a specific number of chunks; * then set MERGE so the cursor doesn't track updates to the tree. */ WT_ERR(__wt_open_cursor(session, lsm_tree->name, NULL, NULL, &src)); F_SET(src, WT_CURSTD_RAW); WT_ERR(__wt_clsm_init_merge(src, start_chunk, start_id, nchunks)); WT_WITH_SCHEMA_LOCK(session, ret = __wt_lsm_tree_setup_chunk(session, lsm_tree, chunk)); WT_ERR(ret); if (create_bloom) { WT_ERR(__wt_lsm_tree_setup_bloom(session, lsm_tree, chunk)); WT_ERR(__wt_bloom_create(session, chunk->bloom_uri, lsm_tree->bloom_config, record_count, lsm_tree->bloom_bit_count, lsm_tree->bloom_hash_count, &bloom)); } /* Discard pages we read as soon as we're done with them. */ F_SET(session, WT_SESSION_NO_CACHE); cfg[0] = WT_CONFIG_BASE(session, WT_SESSION_open_cursor); cfg[1] = "bulk,raw,skip_sort_check"; cfg[2] = NULL; WT_ERR(__wt_open_cursor(session, chunk->uri, NULL, cfg, &dest)); #define LSM_MERGE_CHECK_INTERVAL WT_THOUSAND for (insert_count = 0; (ret = src->next(src)) == 0; insert_count++) { if (insert_count % LSM_MERGE_CHECK_INTERVAL == 0) { if (!F_ISSET(lsm_tree, WT_LSM_TREE_ACTIVE)) WT_ERR(EINTR); WT_STAT_FAST_CONN_INCRV(session, lsm_rows_merged, LSM_MERGE_CHECK_INTERVAL); ++lsm_tree->merge_progressing; } WT_ERR(src->get_key(src, &key)); dest->set_key(dest, &key); WT_ERR(src->get_value(src, &value)); dest->set_value(dest, &value); WT_ERR(dest->insert(dest)); if (create_bloom) WT_ERR(__wt_bloom_insert(bloom, &key)); } WT_ERR_NOTFOUND_OK(ret); WT_STAT_FAST_CONN_INCRV(session, lsm_rows_merged, insert_count % LSM_MERGE_CHECK_INTERVAL); ++lsm_tree->merge_progressing; WT_ERR(__wt_verbose(session, WT_VERB_LSM, "Bloom size for %" PRIu64 " has %" PRIu64 " items inserted.", record_count, insert_count)); /* * Closing and syncing the files can take a while. Set the * merge_syncing field so that compact knows it is still in * progress. */ (void)__wt_atomic_add32(&lsm_tree->merge_syncing, 1); in_sync = true; /* * We've successfully created the new chunk. Now install it. We need * to ensure that the NO_CACHE flag is cleared and the bloom filter * is closed (even if a step fails), so track errors but don't return * until we've cleaned up. */ WT_TRET(src->close(src)); WT_TRET(dest->close(dest)); src = dest = NULL; F_CLR(session, WT_SESSION_NO_CACHE); /* * We're doing advisory reads to fault the new trees into cache. * Don't block if the cache is full: our next unit of work may be to * discard some trees to free space. */ F_SET(session, WT_SESSION_NO_EVICTION); if (create_bloom) { if (ret == 0) WT_TRET(__wt_bloom_finalize(bloom)); /* * Read in a key to make sure the Bloom filters btree handle is * open before it becomes visible to application threads. * Otherwise application threads will stall while it is opened * and internal pages are read into cache. */ if (ret == 0) { WT_CLEAR(key); WT_TRET_NOTFOUND_OK(__wt_bloom_get(bloom, &key)); } WT_TRET(__wt_bloom_close(bloom)); bloom = NULL; } WT_ERR(ret); /* * Open a handle on the new chunk before application threads attempt * to access it, opening it pre-loads internal pages into the file * system cache. */ cfg[1] = "checkpoint=" WT_CHECKPOINT; WT_ERR(__wt_open_cursor(session, chunk->uri, NULL, cfg, &dest)); WT_TRET(dest->close(dest)); dest = NULL; ++lsm_tree->merge_progressing; (void)__wt_atomic_sub32(&lsm_tree->merge_syncing, 1); in_sync = false; WT_ERR_NOTFOUND_OK(ret); WT_ERR(__wt_lsm_tree_set_chunk_size(session, chunk)); WT_ERR(__wt_lsm_tree_writelock(session, lsm_tree)); locked = true; /* * Check whether we raced with another merge, and adjust the chunk * array offset as necessary. */ if (start_chunk >= lsm_tree->nchunks || lsm_tree->chunk[start_chunk]->id != start_id) for (start_chunk = 0; start_chunk < lsm_tree->nchunks; start_chunk++) if (lsm_tree->chunk[start_chunk]->id == start_id) break; /* * It is safe to error out here - since the update can only fail * prior to making updates to the tree. */ WT_ERR(__wt_lsm_merge_update_tree( session, lsm_tree, start_chunk, nchunks, chunk)); if (create_bloom) F_SET(chunk, WT_LSM_CHUNK_BLOOM); chunk->count = insert_count; chunk->generation = generation; F_SET(chunk, WT_LSM_CHUNK_ONDISK); /* * We have no current way of continuing if the metadata update fails, * so we will panic in that case. Put some effort into cleaning up * after ourselves here - so things have a chance of shutting down. * * Any errors that happened after the tree was locked are * fatal - we can't guarantee the state of the tree. */ if ((ret = __wt_lsm_meta_write(session, lsm_tree)) != 0) WT_PANIC_ERR(session, ret, "Failed finalizing LSM merge"); lsm_tree->dsk_gen++; /* Update the throttling while holding the tree lock. */ __wt_lsm_tree_throttle(session, lsm_tree, true); /* Schedule a pass to discard old chunks */ WT_ERR(__wt_lsm_manager_push_entry( session, WT_LSM_WORK_DROP, 0, lsm_tree)); err: if (locked) WT_TRET(__wt_lsm_tree_writeunlock(session, lsm_tree)); if (in_sync) (void)__wt_atomic_sub32(&lsm_tree->merge_syncing, 1); if (src != NULL) WT_TRET(src->close(src)); if (dest != NULL) WT_TRET(dest->close(dest)); if (bloom != NULL) WT_TRET(__wt_bloom_close(bloom)); if (ret != 0 && created_chunk) { /* Drop the newly-created files on error. */ if (chunk->uri != NULL) { WT_WITH_SCHEMA_LOCK(session, tret = __wt_schema_drop(session, chunk->uri, drop_cfg)); WT_TRET(tret); } if (create_bloom && chunk->bloom_uri != NULL) { WT_WITH_SCHEMA_LOCK(session, tret = __wt_schema_drop( session, chunk->bloom_uri, drop_cfg)); WT_TRET(tret); } __wt_free(session, chunk->bloom_uri); __wt_free(session, chunk->uri); __wt_free(session, chunk); if (ret == EINTR) WT_TRET(__wt_verbose(session, WT_VERB_LSM, "Merge aborted due to close")); else WT_TRET(__wt_verbose(session, WT_VERB_LSM, "Merge failed with %s", __wt_strerror(session, ret, NULL, 0))); } F_CLR(session, WT_SESSION_NO_CACHE | WT_SESSION_NO_EVICTION); return (ret); }
/* * __txn_op_apply -- * Apply a transactional operation during recovery. */ static int __txn_op_apply( WT_RECOVERY *r, WT_LSN *lsnp, const uint8_t **pp, const uint8_t *end) { WT_CURSOR *cursor, *start, *stop; WT_DECL_RET; WT_ITEM key, start_key, stop_key, value; WT_SESSION_IMPL *session; uint64_t recno, start_recno, stop_recno; uint32_t fileid, mode, optype, opsize; session = r->session; cursor = NULL; /* Peek at the size and the type. */ WT_ERR(__wt_logop_read(session, pp, end, &optype, &opsize)); end = *pp + opsize; switch (optype) { case WT_LOGOP_COL_MODIFY: WT_ERR(__wt_logop_col_modify_unpack(session, pp, end, &fileid, &recno, &value)); GET_RECOVERY_CURSOR(session, r, lsnp, fileid, &cursor); cursor->set_key(cursor, recno); if ((ret = cursor->search(cursor)) != 0) WT_ERR_NOTFOUND_OK(ret); else { /* * Build/insert a complete value during recovery rather * than using cursor modify to create a partial update * (for no particular reason than simplicity). */ WT_ERR(__wt_modify_apply(session, cursor, value.data)); WT_ERR(cursor->insert(cursor)); } break; case WT_LOGOP_COL_PUT: WT_ERR(__wt_logop_col_put_unpack(session, pp, end, &fileid, &recno, &value)); GET_RECOVERY_CURSOR(session, r, lsnp, fileid, &cursor); cursor->set_key(cursor, recno); __wt_cursor_set_raw_value(cursor, &value); WT_ERR(cursor->insert(cursor)); break; case WT_LOGOP_COL_REMOVE: WT_ERR(__wt_logop_col_remove_unpack(session, pp, end, &fileid, &recno)); GET_RECOVERY_CURSOR(session, r, lsnp, fileid, &cursor); cursor->set_key(cursor, recno); WT_ERR(cursor->remove(cursor)); break; case WT_LOGOP_COL_TRUNCATE: WT_ERR(__wt_logop_col_truncate_unpack(session, pp, end, &fileid, &start_recno, &stop_recno)); GET_RECOVERY_CURSOR(session, r, lsnp, fileid, &cursor); /* Set up the cursors. */ if (start_recno == WT_RECNO_OOB) { start = NULL; stop = cursor; } else if (stop_recno == WT_RECNO_OOB) { start = cursor; stop = NULL; } else { start = cursor; WT_ERR(__recovery_cursor( session, r, lsnp, fileid, true, &stop)); } /* Set the keys. */ if (start != NULL) start->set_key(start, start_recno); if (stop != NULL) stop->set_key(stop, stop_recno); WT_TRET(session->iface.truncate(&session->iface, NULL, start, stop, NULL)); /* If we opened a duplicate cursor, close it now. */ if (stop != NULL && stop != cursor) WT_TRET(stop->close(stop)); WT_ERR(ret); break; case WT_LOGOP_ROW_MODIFY: WT_ERR(__wt_logop_row_modify_unpack(session, pp, end, &fileid, &key, &value)); GET_RECOVERY_CURSOR(session, r, lsnp, fileid, &cursor); __wt_cursor_set_raw_key(cursor, &key); if ((ret = cursor->search(cursor)) != 0) WT_ERR_NOTFOUND_OK(ret); else { /* * Build/insert a complete value during recovery rather * than using cursor modify to create a partial update * (for no particular reason than simplicity). */ WT_ERR(__wt_modify_apply(session, cursor, value.data)); WT_ERR(cursor->insert(cursor)); } break; case WT_LOGOP_ROW_PUT: WT_ERR(__wt_logop_row_put_unpack(session, pp, end, &fileid, &key, &value)); GET_RECOVERY_CURSOR(session, r, lsnp, fileid, &cursor); __wt_cursor_set_raw_key(cursor, &key); __wt_cursor_set_raw_value(cursor, &value); WT_ERR(cursor->insert(cursor)); break; case WT_LOGOP_ROW_REMOVE: WT_ERR(__wt_logop_row_remove_unpack(session, pp, end, &fileid, &key)); GET_RECOVERY_CURSOR(session, r, lsnp, fileid, &cursor); __wt_cursor_set_raw_key(cursor, &key); WT_ERR(cursor->remove(cursor)); break; case WT_LOGOP_ROW_TRUNCATE: WT_ERR(__wt_logop_row_truncate_unpack(session, pp, end, &fileid, &start_key, &stop_key, &mode)); GET_RECOVERY_CURSOR(session, r, lsnp, fileid, &cursor); /* Set up the cursors. */ start = stop = NULL; switch (mode) { case WT_TXN_TRUNC_ALL: /* Both cursors stay NULL. */ break; case WT_TXN_TRUNC_BOTH: start = cursor; WT_ERR(__recovery_cursor( session, r, lsnp, fileid, true, &stop)); break; case WT_TXN_TRUNC_START: start = cursor; break; case WT_TXN_TRUNC_STOP: stop = cursor; break; WT_ILLEGAL_VALUE_ERR(session, mode); } /* Set the keys. */ if (start != NULL) __wt_cursor_set_raw_key(start, &start_key); if (stop != NULL) __wt_cursor_set_raw_key(stop, &stop_key); WT_TRET(session->iface.truncate(&session->iface, NULL, start, stop, NULL)); /* If we opened a duplicate cursor, close it now. */ if (stop != NULL && stop != cursor) WT_TRET(stop->close(stop)); WT_ERR(ret); break; WT_ILLEGAL_VALUE_ERR(session, optype); } /* Reset the cursor so it doesn't block eviction. */ if (cursor != NULL) WT_ERR(cursor->reset(cursor)); return (0); err: __wt_err(session, ret, "operation apply failed during recovery: operation type %" PRIu32 " at LSN %" PRIu32 "/%" PRIu32, optype, lsnp->l.file, lsnp->l.offset); return (ret); }
/* * __open_index -- * Open an index. */ static int __open_index(WT_SESSION_IMPL *session, WT_TABLE *table, WT_INDEX *idx) { WT_CONFIG colconf; WT_CONFIG_ITEM ckey, cval, metadata; WT_DECL_ITEM(buf); WT_DECL_ITEM(plan); WT_DECL_RET; u_int npublic_cols, i; WT_ERR(__wt_scr_alloc(session, 0, &buf)); /* Get the data source from the index config. */ WT_ERR(__wt_config_getones(session, idx->config, "source", &cval)); WT_ERR(__wt_strndup(session, cval.str, cval.len, &idx->source)); WT_ERR(__wt_config_getones(session, idx->config, "immutable", &cval)); if (cval.val) F_SET(idx, WT_INDEX_IMMUTABLE); /* * Compatibility: we didn't always maintain collator information in * index metadata, cope when it isn't found. */ WT_CLEAR(cval); WT_ERR_NOTFOUND_OK(__wt_config_getones( session, idx->config, "collator", &cval)); if (cval.len != 0) { WT_CLEAR(metadata); WT_ERR_NOTFOUND_OK(__wt_config_getones( session, idx->config, "app_metadata", &metadata)); WT_ERR(__wt_collator_config( session, idx->name, &cval, &metadata, &idx->collator, &idx->collator_owned)); } WT_ERR(__wt_extractor_config( session, idx->name, idx->config, &idx->extractor, &idx->extractor_owned)); WT_ERR(__wt_config_getones(session, idx->config, "key_format", &cval)); WT_ERR(__wt_strndup(session, cval.str, cval.len, &idx->key_format)); /* * The key format for an index is somewhat subtle: the application * specifies a set of columns that it will use for the key, but the * engine usually adds some hidden columns in order to derive the * primary key. These hidden columns are part of the file's key. * * The file's key_format is stored persistently, we need to calculate * the index cursor key format (which will usually omit some of those * keys). */ WT_ERR(__wt_buf_init(session, buf, 0)); WT_ERR(__wt_config_getones( session, idx->config, "columns", &idx->colconf)); /* Start with the declared index columns. */ WT_ERR(__wt_config_subinit(session, &colconf, &idx->colconf)); for (npublic_cols = 0; (ret = __wt_config_next(&colconf, &ckey, &cval)) == 0; ++npublic_cols) WT_ERR(__wt_buf_catfmt( session, buf, "%.*s,", (int)ckey.len, ckey.str)); if (ret != WT_NOTFOUND) goto err; /* * If we didn't find any columns, the index must have an extractor. * We don't rely on this unconditionally because it was only added to * the metadata after version 2.3.1. */ if (npublic_cols == 0) { WT_ERR(__wt_config_getones( session, idx->config, "index_key_columns", &cval)); npublic_cols = (u_int)cval.val; WT_ASSERT(session, npublic_cols != 0); for (i = 0; i < npublic_cols; i++) WT_ERR(__wt_buf_catfmt(session, buf, "\"bad col\",")); } /* * Now add any primary key columns from the table that are not * already part of the index key. */ WT_ERR(__wt_config_subinit(session, &colconf, &table->colconf)); for (i = 0; i < table->nkey_columns && (ret = __wt_config_next(&colconf, &ckey, &cval)) == 0; i++) { /* * If the primary key column is already in the secondary key, * don't add it again. */ if (__wt_config_subgetraw( session, &idx->colconf, &ckey, &cval) == 0) continue; WT_ERR(__wt_buf_catfmt( session, buf, "%.*s,", (int)ckey.len, ckey.str)); } WT_ERR_NOTFOUND_OK(ret); /* * If the table doesn't yet have its column groups, don't try to * calculate a plan: we are just checking that the index creation is * sane. */ if (!table->cg_complete) goto err; WT_ERR(__wt_scr_alloc(session, 0, &plan)); WT_ERR(__wt_struct_plan( session, table, buf->data, buf->size, false, plan)); WT_ERR(__wt_strndup(session, plan->data, plan->size, &idx->key_plan)); /* Set up the cursor key format (the visible columns). */ WT_ERR(__wt_buf_init(session, buf, 0)); WT_ERR(__wt_struct_truncate(session, idx->key_format, npublic_cols, buf)); WT_ERR(__wt_strndup( session, buf->data, buf->size, &idx->idxkey_format)); /* * Add a trailing padding byte to the format. This ensures that there * will be no special optimization of the last column, so the primary * key columns can be simply appended. */ WT_ERR(__wt_buf_catfmt(session, buf, "x")); WT_ERR(__wt_strndup(session, buf->data, buf->size, &idx->exkey_format)); /* By default, index cursor values are the table value columns. */ /* TODO Optimize to use index columns in preference to table lookups. */ WT_ERR(__wt_buf_init(session, plan, 0)); WT_ERR(__wt_struct_plan(session, table, table->colconf.str, table->colconf.len, true, plan)); WT_ERR(__wt_strndup(session, plan->data, plan->size, &idx->value_plan)); err: __wt_scr_free(session, &buf); __wt_scr_free(session, &plan); return (ret); }
/* * __wt_las_sweep -- * Sweep the lookaside table. */ int __wt_las_sweep(WT_SESSION_IMPL *session) { WT_CONNECTION_IMPL *conn; WT_CURSOR *cursor; WT_DECL_ITEM(las_addr); WT_DECL_ITEM(las_key); WT_DECL_RET; WT_ITEM *key; uint64_t cnt, las_counter, las_txnid; int64_t remove_cnt; uint32_t las_id, session_flags; int notused; conn = S2C(session); cursor = NULL; key = &conn->las_sweep_key; remove_cnt = 0; session_flags = 0; /* [-Werror=maybe-uninitialized] */ WT_ERR(__wt_scr_alloc(session, 0, &las_addr)); WT_ERR(__wt_scr_alloc(session, 0, &las_key)); WT_ERR(__wt_las_cursor(session, &cursor, &session_flags)); /* * If we're not starting a new sweep, position the cursor using the key * from the last call (we don't care if we're before or after the key, * just roughly in the same spot is fine). */ if (key->size != 0) { __wt_cursor_set_raw_key(cursor, key); ret = cursor->search_near(cursor, ¬used); /* * Don't search for the same key twice; if we don't set a new * key below, it's because we've reached the end of the table * and we want the next pass to start at the beginning of the * table. Searching for the same key could leave us stuck at * the end of the table, repeatedly checking the same rows. */ key->size = 0; if (ret != 0) goto srch_notfound; } /* * The sweep server wakes up every 10 seconds (by default), it's a slow * moving thread. Try to review the entire lookaside table once every 5 * minutes, or every 30 calls. * * The reason is because the lookaside table exists because we're seeing * cache/eviction pressure (it allows us to trade performance and disk * space for cache space), and it's likely lookaside blocks are being * evicted, and reading them back in doesn't help things. A trickier, * but possibly better, alternative might be to review all lookaside * blocks in the cache in order to get rid of them, and slowly review * lookaside blocks that have already been evicted. */ cnt = (uint64_t)WT_MAX(100, conn->las_record_cnt / 30); /* Discard pages we read as soon as we're done with them. */ F_SET(session, WT_SESSION_NO_CACHE); /* Walk the file. */ for (; cnt > 0 && (ret = cursor->next(cursor)) == 0; --cnt) { /* * If the loop terminates after completing a work unit, we will * continue the table sweep next time. Get a local copy of the * sweep key, we're going to reset the cursor; do so before * calling cursor.remove, cursor.remove can discard our hazard * pointer and the page could be evicted from underneath us. */ if (cnt == 1) { WT_ERR(__wt_cursor_get_raw_key(cursor, key)); if (!WT_DATA_IN_ITEM(key)) WT_ERR(__wt_buf_set( session, key, key->data, key->size)); } WT_ERR(cursor->get_key(cursor, &las_id, las_addr, &las_counter, &las_txnid, las_key)); /* * If the on-page record transaction ID associated with the * record is globally visible, the record can be discarded. * * Cursor opened overwrite=true: won't return WT_NOTFOUND should * another thread remove the record before we do, and the cursor * remains positioned in that case. */ if (__wt_txn_visible_all(session, las_txnid)) { WT_ERR(cursor->remove(cursor)); ++remove_cnt; } } srch_notfound: WT_ERR_NOTFOUND_OK(ret); if (0) { err: __wt_buf_free(session, key); } WT_TRET(__wt_las_cursor_close(session, &cursor, session_flags)); /* * If there were races to remove records, we can over-count. All * arithmetic is signed, so underflow isn't fatal, but check anyway so * we don't skew low over time. */ if (remove_cnt > S2C(session)->las_record_cnt) S2C(session)->las_record_cnt = 0; else if (remove_cnt > 0) (void)__wt_atomic_subi64(&conn->las_record_cnt, remove_cnt); F_CLR(session, WT_SESSION_NO_CACHE); __wt_scr_free(session, &las_addr); __wt_scr_free(session, &las_key); return (ret); }
/* * __txn_rollback_to_stable_lookaside_fixup -- * Remove any updates that need to be rolled back from the lookaside file. */ static int __txn_rollback_to_stable_lookaside_fixup(WT_SESSION_IMPL *session) { WT_CONNECTION_IMPL *conn; WT_CURSOR *cursor; WT_DECL_RET; WT_DECL_TIMESTAMP(rollback_timestamp) WT_ITEM las_addr, las_key, las_timestamp; WT_TXN_GLOBAL *txn_global; uint64_t las_counter, las_txnid, remove_cnt; uint32_t las_id, session_flags; conn = S2C(session); cursor = NULL; remove_cnt = 0; session_flags = 0; /* [-Werror=maybe-uninitialized] */ WT_CLEAR(las_timestamp); /* * Copy the stable timestamp, otherwise we'd need to lock it each time * it's accessed. Even though the stable timestamp isn't supposed to be * updated while rolling back, accessing it without a lock would * violate protocol. */ txn_global = &S2C(session)->txn_global; __wt_readlock(session, &txn_global->rwlock); __wt_timestamp_set(&rollback_timestamp, &txn_global->stable_timestamp); __wt_readunlock(session, &txn_global->rwlock); __wt_las_cursor(session, &cursor, &session_flags); /* Discard pages we read as soon as we're done with them. */ F_SET(session, WT_SESSION_NO_CACHE); /* Walk the file. */ for (; (ret = cursor->next(cursor)) == 0; ) { WT_ERR(cursor->get_key(cursor, &las_id, &las_addr, &las_counter, &las_txnid, &las_timestamp, &las_key)); /* Check the file ID so we can skip durable tables */ if (__bit_test(conn->stable_rollback_bitstring, las_id)) continue; /* * Entries with no timestamp will have a timestamp of zero, * which will fail the following check and cause them to never * be removed. */ if (__wt_timestamp_cmp( &rollback_timestamp, las_timestamp.data) < 0) { WT_ERR(cursor->remove(cursor)); ++remove_cnt; } } WT_ERR_NOTFOUND_OK(ret); err: WT_TRET(__wt_las_cursor_close(session, &cursor, session_flags)); /* * If there were races to remove records, we can over-count. Underflow * isn't fatal, but check anyway so we don't skew low over time. */ if (remove_cnt > conn->las_record_cnt) conn->las_record_cnt = 0; else if (remove_cnt > 0) (void)__wt_atomic_sub64(&conn->las_record_cnt, remove_cnt); F_CLR(session, WT_SESSION_NO_CACHE); return (ret); }
/* * __create_colgroup -- * Create a column group. */ static int __create_colgroup(WT_SESSION_IMPL *session, const char *name, bool exclusive, const char *config) { WT_CONFIG_ITEM cval; WT_DECL_RET; WT_ITEM confbuf, fmt, namebuf; WT_TABLE *table; size_t tlen; const char **cfgp, *cfg[4] = { WT_CONFIG_BASE(session, colgroup_meta), config, NULL, NULL }; const char *sourcecfg[] = { config, NULL, NULL }; const char *cgname, *source, *sourceconf, *tablename; char *cgconf, *origconf; bool exists; sourceconf = NULL; cgconf = origconf = NULL; WT_CLEAR(fmt); WT_CLEAR(confbuf); WT_CLEAR(namebuf); exists = false; tablename = name; if (!WT_PREFIX_SKIP(tablename, "colgroup:")) return ( __wt_unexpected_object_type(session, name, "colgroup:")); cgname = strchr(tablename, ':'); if (cgname != NULL) { tlen = (size_t)(cgname - tablename); ++cgname; } else tlen = strlen(tablename); if ((ret = __wt_schema_get_table(session, tablename, tlen, true, &table)) != 0) WT_RET_MSG(session, (ret == WT_NOTFOUND) ? ENOENT : ret, "Can't create '%s' for non-existent table '%.*s'", name, (int)tlen, tablename); /* Make sure the column group is referenced from the table. */ if (cgname != NULL && (ret = __wt_config_subgets(session, &table->cgconf, cgname, &cval)) != 0) WT_ERR_MSG(session, EINVAL, "Column group '%s' not found in table '%.*s'", cgname, (int)tlen, tablename); /* Check if the column group already exists. */ if ((ret = __wt_metadata_search(session, name, &origconf)) == 0) { if (exclusive) WT_ERR(EEXIST); exists = true; } WT_ERR_NOTFOUND_OK(ret); /* Find the first NULL entry in the cfg stack. */ for (cfgp = &cfg[1]; *cfgp; cfgp++) ; /* Add the source to the colgroup config before collapsing. */ if (__wt_config_getones( session, config, "source", &cval) == 0 && cval.len != 0) { WT_ERR(__wt_buf_fmt( session, &namebuf, "%.*s", (int)cval.len, cval.str)); source = namebuf.data; } else { WT_ERR(__wt_schema_colgroup_source( session, table, cgname, config, &namebuf)); source = namebuf.data; WT_ERR(__wt_buf_fmt( session, &confbuf, "source=\"%s\"", source)); *cfgp++ = confbuf.data; } /* Calculate the key/value formats: these go into the source config. */ WT_ERR(__wt_buf_fmt(session, &fmt, "key_format=%s", table->key_format)); if (cgname == NULL) WT_ERR(__wt_buf_catfmt (session, &fmt, ",value_format=%s", table->value_format)); else { if (__wt_config_getones(session, config, "columns", &cval) != 0) WT_ERR_MSG(session, EINVAL, "No 'columns' configuration for '%s'", name); WT_ERR(__wt_buf_catfmt(session, &fmt, ",value_format=")); WT_ERR(__wt_struct_reformat(session, table, cval.str, cval.len, NULL, true, &fmt)); } sourcecfg[1] = fmt.data; WT_ERR(__wt_config_merge(session, sourcecfg, NULL, &sourceconf)); WT_ERR(__wt_schema_create(session, source, sourceconf)); WT_ERR(__wt_config_collapse(session, cfg, &cgconf)); if (!exists) { WT_ERR(__wt_metadata_insert(session, name, cgconf)); WT_ERR(__wt_schema_open_colgroups(session, table)); } err: __wt_free(session, cgconf); __wt_free(session, sourceconf); __wt_free(session, origconf); __wt_buf_free(session, &confbuf); __wt_buf_free(session, &fmt); __wt_buf_free(session, &namebuf); __wt_schema_release_table(session, table); return (ret); }
/* * __wt_lsm_meta_read -- * Read the metadata for an LSM tree. */ int __wt_lsm_meta_read(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) { WT_CONFIG cparser, lparser; WT_CONFIG_ITEM ck, cv, fileconf, lk, lv, metadata; WT_DECL_RET; WT_LSM_CHUNK *chunk; char *lsmconfig; u_int nchunks; chunk = NULL; /* -Wconditional-uninitialized */ /* LSM trees inherit the merge setting from the connection. */ if (F_ISSET(S2C(session), WT_CONN_LSM_MERGE)) F_SET(lsm_tree, WT_LSM_TREE_MERGES); WT_RET(__wt_metadata_search(session, lsm_tree->name, &lsmconfig)); WT_ERR(__wt_config_init(session, &cparser, lsmconfig)); while ((ret = __wt_config_next(&cparser, &ck, &cv)) == 0) { if (WT_STRING_MATCH("key_format", ck.str, ck.len)) { __wt_free(session, lsm_tree->key_format); WT_ERR(__wt_strndup(session, cv.str, cv.len, &lsm_tree->key_format)); } else if (WT_STRING_MATCH("value_format", ck.str, ck.len)) { __wt_free(session, lsm_tree->value_format); WT_ERR(__wt_strndup(session, cv.str, cv.len, &lsm_tree->value_format)); } else if (WT_STRING_MATCH("collator", ck.str, ck.len)) { if (cv.len == 0 || WT_STRING_MATCH("none", cv.str, cv.len)) continue; /* * Extract the application-supplied metadata (if any) * from the file configuration. */ WT_ERR(__wt_config_getones( session, lsmconfig, "file_config", &fileconf)); WT_CLEAR(metadata); WT_ERR_NOTFOUND_OK(__wt_config_subgets( session, &fileconf, "app_metadata", &metadata)); WT_ERR(__wt_collator_config(session, lsm_tree->name, &cv, &metadata, &lsm_tree->collator, &lsm_tree->collator_owned)); WT_ERR(__wt_strndup(session, cv.str, cv.len, &lsm_tree->collator_name)); } else if (WT_STRING_MATCH("bloom_config", ck.str, ck.len)) { __wt_free(session, lsm_tree->bloom_config); /* Don't include the brackets. */ WT_ERR(__wt_strndup(session, cv.str + 1, cv.len - 2, &lsm_tree->bloom_config)); } else if (WT_STRING_MATCH("file_config", ck.str, ck.len)) { __wt_free(session, lsm_tree->file_config); /* Don't include the brackets. */ WT_ERR(__wt_strndup(session, cv.str + 1, cv.len - 2, &lsm_tree->file_config)); } else if (WT_STRING_MATCH("auto_throttle", ck.str, ck.len)) { if (cv.val) F_SET(lsm_tree, WT_LSM_TREE_THROTTLE); else F_CLR(lsm_tree, WT_LSM_TREE_THROTTLE); } else if (WT_STRING_MATCH("bloom", ck.str, ck.len)) lsm_tree->bloom = (uint32_t)cv.val; else if (WT_STRING_MATCH("bloom_bit_count", ck.str, ck.len)) lsm_tree->bloom_bit_count = (uint32_t)cv.val; else if (WT_STRING_MATCH("bloom_hash_count", ck.str, ck.len)) lsm_tree->bloom_hash_count = (uint32_t)cv.val; else if (WT_STRING_MATCH("chunk_count_limit", ck.str, ck.len)) { lsm_tree->chunk_count_limit = (uint32_t)cv.val; if (cv.val != 0) F_CLR(lsm_tree, WT_LSM_TREE_MERGES); } else if (WT_STRING_MATCH("chunk_max", ck.str, ck.len)) lsm_tree->chunk_max = (uint64_t)cv.val; else if (WT_STRING_MATCH("chunk_size", ck.str, ck.len)) lsm_tree->chunk_size = (uint64_t)cv.val; else if (WT_STRING_MATCH("merge_max", ck.str, ck.len)) lsm_tree->merge_max = (uint32_t)cv.val; else if (WT_STRING_MATCH("merge_min", ck.str, ck.len)) lsm_tree->merge_min = (uint32_t)cv.val; else if (WT_STRING_MATCH("last", ck.str, ck.len)) lsm_tree->last = (u_int)cv.val; else if (WT_STRING_MATCH("chunks", ck.str, ck.len)) { WT_ERR(__wt_config_subinit(session, &lparser, &cv)); for (nchunks = 0; (ret = __wt_config_next(&lparser, &lk, &lv)) == 0; ) { if (WT_STRING_MATCH("id", lk.str, lk.len)) { WT_ERR(__wt_realloc_def(session, &lsm_tree->chunk_alloc, nchunks + 1, &lsm_tree->chunk)); WT_ERR( __wt_calloc_one(session, &chunk)); lsm_tree->chunk[nchunks++] = chunk; chunk->id = (uint32_t)lv.val; WT_ERR(__wt_lsm_tree_chunk_name(session, lsm_tree, chunk->id, &chunk->uri)); F_SET(chunk, WT_LSM_CHUNK_ONDISK | WT_LSM_CHUNK_STABLE); } else if (WT_STRING_MATCH( "bloom", lk.str, lk.len)) { WT_ERR(__wt_lsm_tree_bloom_name( session, lsm_tree, chunk->id, &chunk->bloom_uri)); F_SET(chunk, WT_LSM_CHUNK_BLOOM); continue; } else if (WT_STRING_MATCH( "chunk_size", lk.str, lk.len)) { chunk->size = (uint64_t)lv.val; continue; } else if (WT_STRING_MATCH( "count", lk.str, lk.len)) { chunk->count = (uint64_t)lv.val; continue; } else if (WT_STRING_MATCH( "generation", lk.str, lk.len)) { chunk->generation = (uint32_t)lv.val; continue; } } WT_ERR_NOTFOUND_OK(ret); lsm_tree->nchunks = nchunks; } else if (WT_STRING_MATCH("old_chunks", ck.str, ck.len)) { WT_ERR(__wt_config_subinit(session, &lparser, &cv)); for (nchunks = 0; (ret = __wt_config_next(&lparser, &lk, &lv)) == 0; ) { if (WT_STRING_MATCH("bloom", lk.str, lk.len)) { WT_ERR(__wt_strndup(session, lv.str, lv.len, &chunk->bloom_uri)); F_SET(chunk, WT_LSM_CHUNK_BLOOM); continue; } WT_ERR(__wt_realloc_def(session, &lsm_tree->old_alloc, nchunks + 1, &lsm_tree->old_chunks)); WT_ERR(__wt_calloc_one(session, &chunk)); lsm_tree->old_chunks[nchunks++] = chunk; WT_ERR(__wt_strndup(session, lk.str, lk.len, &chunk->uri)); F_SET(chunk, WT_LSM_CHUNK_ONDISK); } WT_ERR_NOTFOUND_OK(ret); lsm_tree->nold_chunks = nchunks; } /* * Ignore any other values: the metadata entry might have been * created by a future release, with unknown options. */ } WT_ERR_NOTFOUND_OK(ret); /* * If the default merge_min was not overridden, calculate it now. We * do this here so that trees created before merge_min was added get a * sane value. */ if (lsm_tree->merge_min < 2) lsm_tree->merge_min = WT_MAX(2, lsm_tree->merge_max / 2); err: __wt_free(session, lsmconfig); return (ret); }
/* * __wt_las_sweep -- * Sweep the lookaside table. */ int __wt_las_sweep(WT_SESSION_IMPL *session) { WT_CONNECTION_IMPL *conn; WT_CURSOR *cursor; WT_DECL_ITEM(las_addr); WT_DECL_ITEM(las_key); WT_DECL_RET; WT_ITEM *key; uint64_t cnt, las_counter, las_txnid; uint32_t las_id, session_flags; int notused; conn = S2C(session); cursor = NULL; key = &conn->las_sweep_key; session_flags = 0; /* [-Werror=maybe-uninitialized] */ WT_ERR(__wt_scr_alloc(session, 0, &las_addr)); WT_ERR(__wt_scr_alloc(session, 0, &las_key)); WT_ERR(__wt_las_cursor(session, &cursor, &session_flags)); /* * If we're not starting a new sweep, position the cursor using the key * from the last call (we don't care if we're before or after the key, * just roughly in the same spot is fine). */ if (conn->las_sweep_call != 0 && key->data != NULL) { __wt_cursor_set_raw_key(cursor, key); if ((ret = cursor->search_near(cursor, ¬used)) != 0) goto srch_notfound; } /* * The sweep server wakes up every 10 seconds (by default), it's a slow * moving thread. Try to review the entire lookaside table once every 5 * minutes, or every 30 calls. * * The reason is because the lookaside table exists because we're seeing * cache/eviction pressure (it allows us to trade performance and disk * space for cache space), and it's likely lookaside blocks are being * evicted, and reading them back in doesn't help things. A trickier, * but possibly better, alternative might be to review all lookaside * blocks in the cache in order to get rid of them, and slowly review * lookaside blocks that have already been evicted. * * We can't know for sure how many records are in the lookaside table, * the cursor insert and remove statistics aren't updated atomically. * Start with reviewing 100 rows, and if it takes more than the target * number of calls to finish, increase the number of rows checked on * each call; if it takes less than the target calls to finish, then * decrease the number of rows reviewed on each call (but never less * than 100). */ #define WT_SWEEP_LOOKASIDE_MIN_CNT 100 #define WT_SWEEP_LOOKASIDE_PASS_TARGET 30 ++conn->las_sweep_call; if ((cnt = conn->las_sweep_cnt) < WT_SWEEP_LOOKASIDE_MIN_CNT) cnt = conn->las_sweep_cnt = WT_SWEEP_LOOKASIDE_MIN_CNT; /* Walk the file. */ for (; cnt > 0 && (ret = cursor->next(cursor)) == 0; --cnt) { /* * If the loop terminates after completing a work unit, we will * continue the table sweep next time. Get a local copy of the * sweep key, we're going to reset the cursor; do so before * calling cursor.remove, cursor.remove can discard our hazard * pointer and the page could be evicted from underneath us. */ if (cnt == 1) { WT_ERR(__wt_cursor_get_raw_key(cursor, key)); if (!WT_DATA_IN_ITEM(key)) WT_ERR(__wt_buf_set( session, key, key->data, key->size)); } WT_ERR(cursor->get_key(cursor, &las_id, las_addr, &las_counter, &las_txnid, las_key)); /* * If the on-page record transaction ID associated with the * record is globally visible, the record can be discarded. * * Cursor opened overwrite=true: won't return WT_NOTFOUND should * another thread remove the record before we do, and the cursor * remains positioned in that case. */ if (__wt_txn_visible_all(session, las_txnid)) WT_ERR(cursor->remove(cursor)); } /* * When reaching the lookaside table end or the target number of calls, * adjust the row count. Decrease/increase the row count depending on * if the number of calls is less/more than the target. */ if (ret == WT_NOTFOUND || conn->las_sweep_call > WT_SWEEP_LOOKASIDE_PASS_TARGET) { if (conn->las_sweep_call < WT_SWEEP_LOOKASIDE_PASS_TARGET && conn->las_sweep_cnt > WT_SWEEP_LOOKASIDE_MIN_CNT) conn->las_sweep_cnt -= WT_SWEEP_LOOKASIDE_MIN_CNT; if (conn->las_sweep_call > WT_SWEEP_LOOKASIDE_PASS_TARGET) conn->las_sweep_cnt += WT_SWEEP_LOOKASIDE_MIN_CNT; } srch_notfound: if (ret == WT_NOTFOUND) conn->las_sweep_call = 0; WT_ERR_NOTFOUND_OK(ret); if (0) { err: __wt_buf_free(session, key); } WT_TRET(__wt_las_cursor_close(session, &cursor, session_flags)); __wt_scr_free(session, &las_addr); __wt_scr_free(session, &las_key); return (ret); }
/* * __wt_curds_open -- * Initialize a data-source cursor. */ int __wt_curds_open( WT_SESSION_IMPL *session, const char *uri, WT_CURSOR *owner, const char *cfg[], WT_DATA_SOURCE *dsrc, WT_CURSOR **cursorp) { WT_CURSOR_STATIC_INIT(iface, __wt_cursor_get_key, /* get-key */ __wt_cursor_get_value, /* get-value */ __wt_cursor_set_key, /* set-key */ __wt_cursor_set_value, /* set-value */ __curds_compare, /* compare */ __wt_cursor_equals, /* equals */ __curds_next, /* next */ __curds_prev, /* prev */ __curds_reset, /* reset */ __curds_search, /* search */ __curds_search_near, /* search-near */ __curds_insert, /* insert */ __wt_cursor_modify_notsup, /* modify */ __curds_update, /* update */ __curds_remove, /* remove */ __curds_reserve, /* reserve */ __wt_cursor_reconfigure_notsup, /* reconfigure */ __curds_close); /* close */ WT_CONFIG_ITEM cval, metadata; WT_CURSOR *cursor, *source; WT_CURSOR_DATA_SOURCE *data_source; WT_DECL_RET; char *metaconf; WT_STATIC_ASSERT(offsetof(WT_CURSOR_DATA_SOURCE, iface) == 0); data_source = NULL; metaconf = NULL; WT_RET(__wt_calloc_one(session, &data_source)); cursor = &data_source->iface; *cursor = iface; cursor->session = &session->iface; /* * XXX * The underlying data-source may require the object's key and value * formats. This isn't a particularly elegant way of getting that * information to the data-source, this feels like a layering problem * to me. */ WT_ERR(__wt_metadata_search(session, uri, &metaconf)); WT_ERR(__wt_config_getones(session, metaconf, "key_format", &cval)); WT_ERR(__wt_strndup(session, cval.str, cval.len, &cursor->key_format)); WT_ERR(__wt_config_getones(session, metaconf, "value_format", &cval)); WT_ERR( __wt_strndup(session, cval.str, cval.len, &cursor->value_format)); WT_ERR(__wt_cursor_init(cursor, uri, owner, cfg, cursorp)); /* Data-source cursors may have a custom collator. */ ret = __wt_config_getones(session, metaconf, "collator", &cval); if (ret == 0 && cval.len != 0) { WT_CLEAR(metadata); WT_ERR_NOTFOUND_OK(__wt_config_getones( session, metaconf, "app_metadata", &metadata)); WT_ERR(__wt_collator_config(session, uri, &cval, &metadata, &data_source->collator, &data_source->collator_owned)); } WT_ERR_NOTFOUND_OK(ret); WT_ERR(dsrc->open_cursor(dsrc, &session->iface, uri, (WT_CONFIG_ARG *)cfg, &data_source->source)); source = data_source->source; source->session = (WT_SESSION *)session; memset(&source->q, 0, sizeof(source->q)); source->recno = WT_RECNO_OOB; memset(source->raw_recno_buf, 0, sizeof(source->raw_recno_buf)); memset(&source->key, 0, sizeof(source->key)); memset(&source->value, 0, sizeof(source->value)); source->saved_err = 0; source->flags = 0; if (0) { err: WT_TRET(__curds_close(cursor)); *cursorp = NULL; } __wt_free(session, metaconf); return (ret); }
/* * __las_page_instantiate -- * Instantiate lookaside update records in a recently read page. */ static int __las_page_instantiate(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t read_id, const uint8_t *addr, size_t addr_size) { WT_CURSOR *cursor; WT_CURSOR_BTREE cbt; WT_DECL_ITEM(current_key); WT_DECL_ITEM(las_addr); WT_DECL_ITEM(las_key); WT_DECL_ITEM(las_value); WT_DECL_RET; WT_PAGE *page; WT_UPDATE *first_upd, *last_upd, *upd; size_t incr, total_incr; uint64_t current_recno, las_counter, las_txnid, recno, upd_txnid; uint32_t las_id, upd_size, session_flags; int exact; const uint8_t *p; cursor = NULL; page = ref->page; first_upd = last_upd = upd = NULL; total_incr = 0; current_recno = recno = WT_RECNO_OOB; session_flags = 0; /* [-Werror=maybe-uninitialized] */ __wt_btcur_init(session, &cbt); __wt_btcur_open(&cbt); WT_ERR(__wt_scr_alloc(session, 0, ¤t_key)); WT_ERR(__wt_scr_alloc(session, 0, &las_addr)); WT_ERR(__wt_scr_alloc(session, 0, &las_key)); WT_ERR(__wt_scr_alloc(session, 0, &las_value)); /* Open a lookaside table cursor. */ WT_ERR(__wt_las_cursor(session, &cursor, &session_flags)); /* * The lookaside records are in key and update order, that is, there * will be a set of in-order updates for a key, then another set of * in-order updates for a subsequent key. We process all of the updates * for a key and then insert those updates into the page, then all the * updates for the next key, and so on. * * Search for the block's unique prefix, stepping through any matching * records. */ las_addr->data = addr; las_addr->size = addr_size; las_key->size = 0; cursor->set_key( cursor, read_id, las_addr, (uint64_t)0, (uint32_t)0, las_key); if ((ret = cursor->search_near(cursor, &exact)) == 0 && exact < 0) ret = cursor->next(cursor); for (; ret == 0; ret = cursor->next(cursor)) { WT_ERR(cursor->get_key(cursor, &las_id, las_addr, &las_counter, &las_txnid, las_key)); /* * Confirm the search using the unique prefix; if not a match, * we're done searching for records for this page. */ if (las_id != read_id || las_addr->size != addr_size || memcmp(las_addr->data, addr, addr_size) != 0) break; /* * If the on-page value has become globally visible, this record * is no longer needed. */ if (__wt_txn_visible_all(session, las_txnid)) continue; /* Allocate the WT_UPDATE structure. */ WT_ERR(cursor->get_value( cursor, &upd_txnid, &upd_size, las_value)); WT_ERR(__wt_update_alloc(session, (upd_size == WT_UPDATE_DELETED_VALUE) ? NULL : las_value, &upd, &incr)); total_incr += incr; upd->txnid = upd_txnid; switch (page->type) { case WT_PAGE_COL_FIX: case WT_PAGE_COL_VAR: p = las_key->data; WT_ERR(__wt_vunpack_uint(&p, 0, &recno)); if (current_recno == recno) break; WT_ASSERT(session, current_recno < recno); if (first_upd != NULL) { WT_ERR(__col_instantiate(session, current_recno, ref, &cbt, first_upd)); first_upd = NULL; } current_recno = recno; break; case WT_PAGE_ROW_LEAF: if (current_key->size == las_key->size && memcmp(current_key->data, las_key->data, las_key->size) == 0) break; if (first_upd != NULL) { WT_ERR(__row_instantiate(session, current_key, ref, &cbt, first_upd)); first_upd = NULL; } WT_ERR(__wt_buf_set(session, current_key, las_key->data, las_key->size)); break; WT_ILLEGAL_VALUE_ERR(session); } /* Append the latest update to the list. */ if (first_upd == NULL) first_upd = last_upd = upd; else { last_upd->next = upd; last_upd = upd; } upd = NULL; } WT_ERR_NOTFOUND_OK(ret); /* Insert the last set of updates, if any. */ if (first_upd != NULL) switch (page->type) { case WT_PAGE_COL_FIX: case WT_PAGE_COL_VAR: WT_ERR(__col_instantiate(session, current_recno, ref, &cbt, first_upd)); first_upd = NULL; break; case WT_PAGE_ROW_LEAF: WT_ERR(__row_instantiate(session, current_key, ref, &cbt, first_upd)); first_upd = NULL; break; WT_ILLEGAL_VALUE_ERR(session); } /* Discard the cursor. */ WT_ERR(__wt_las_cursor_close(session, &cursor, session_flags)); if (total_incr != 0) { __wt_cache_page_inmem_incr(session, page, total_incr); /* * We've modified/dirtied the page, but that's not necessary and * if we keep the page clean, it's easier to evict. We leave the * lookaside table updates in place, so if we evict this page * without dirtying it, any future instantiation of it will find * the records it needs. If the page is dirtied before eviction, * then we'll write any needed lookaside table records for the * new location of the page. */ __wt_page_modify_clear(session, page); } err: WT_TRET(__wt_las_cursor_close(session, &cursor, session_flags)); WT_TRET(__wt_btcur_close(&cbt, 1)); /* * On error, upd points to a single unlinked WT_UPDATE structure, * first_upd points to a list. */ if (upd != NULL) __wt_free(session, upd); if (first_upd != NULL) __wt_free_update_list(session, first_upd); __wt_scr_free(session, ¤t_key); __wt_scr_free(session, &las_addr); __wt_scr_free(session, &las_key); __wt_scr_free(session, &las_value); return (ret); }
/* * __create_index -- * Create an index. */ static int __create_index(WT_SESSION_IMPL *session, const char *name, int exclusive, const char *config) { WT_CONFIG kcols, pkcols; WT_CONFIG_ITEM ckey, cval, icols, kval; WT_DECL_PACK_VALUE(pv); WT_DECL_RET; WT_ITEM confbuf, extra_cols, fmt, namebuf; WT_PACK pack; WT_TABLE *table; const char *cfg[4] = { WT_CONFIG_BASE(session, index_meta), NULL, NULL, NULL }; const char *sourcecfg[] = { config, NULL, NULL }; const char *source, *sourceconf, *idxname, *tablename; char *idxconf; size_t tlen; int have_extractor; u_int i, npublic_cols; sourceconf = NULL; idxconf = NULL; WT_CLEAR(confbuf); WT_CLEAR(fmt); WT_CLEAR(extra_cols); WT_CLEAR(namebuf); have_extractor = 0; tablename = name; if (!WT_PREFIX_SKIP(tablename, "index:")) return (EINVAL); idxname = strchr(tablename, ':'); if (idxname == NULL) WT_RET_MSG(session, EINVAL, "Invalid index name, " "should be <table name>:<index name>: %s", name); tlen = (size_t)(idxname++ - tablename); if ((ret = __wt_schema_get_table(session, tablename, tlen, 1, &table)) != 0) WT_RET_MSG(session, ret, "Can't create an index for a non-existent table: %.*s", (int)tlen, tablename); if (table->is_simple) WT_RET_MSG(session, EINVAL, "%s requires a table with named columns", name); if (__wt_config_getones(session, config, "source", &cval) == 0) { WT_ERR(__wt_buf_fmt(session, &namebuf, "%.*s", (int)cval.len, cval.str)); source = namebuf.data; } else { WT_ERR(__wt_schema_index_source( session, table, idxname, config, &namebuf)); source = namebuf.data; /* Add the source name to the index config before collapsing. */ WT_ERR(__wt_buf_catfmt(session, &confbuf, ",source=\"%s\"", source)); } if (__wt_config_getones_none( session, config, "extractor", &cval) == 0 && cval.len != 0) { have_extractor = 1; /* Custom extractors must supply a key format. */ if ((ret = __wt_config_getones( session, config, "key_format", &kval)) != 0) WT_ERR_MSG(session, EINVAL, "%s: custom extractors require a key_format", name); } /* Calculate the key/value formats. */ WT_CLEAR(icols); if (__wt_config_getones(session, config, "columns", &icols) != 0 && !have_extractor) WT_ERR_MSG(session, EINVAL, "%s: requires 'columns' configuration", name); /* * Count the public columns using the declared columns for normal * indices or the key format for custom extractors. */ npublic_cols = 0; if (!have_extractor) { WT_ERR(__wt_config_subinit(session, &kcols, &icols)); while ((ret = __wt_config_next(&kcols, &ckey, &cval)) == 0) ++npublic_cols; WT_ERR_NOTFOUND_OK(ret); } else { WT_ERR(__pack_initn(session, &pack, kval.str, kval.len)); while ((ret = __pack_next(&pack, &pv)) == 0) ++npublic_cols; WT_ERR_NOTFOUND_OK(ret); } /* * The key format for an index is somewhat subtle: the application * specifies a set of columns that it will use for the key, but the * engine usually adds some hidden columns in order to derive the * primary key. These hidden columns are part of the source's * key_format, which we are calculating now, but not part of an index * cursor's key_format. */ WT_ERR(__wt_config_subinit(session, &pkcols, &table->colconf)); for (i = 0; i < table->nkey_columns && (ret = __wt_config_next(&pkcols, &ckey, &cval)) == 0; i++) { /* * If the primary key column is already in the secondary key, * don't add it again. */ if (__wt_config_subgetraw(session, &icols, &ckey, &cval) == 0) { if (have_extractor) WT_ERR_MSG(session, EINVAL, "an index with a custom extractor may not " "include primary key columns"); continue; } WT_ERR(__wt_buf_catfmt( session, &extra_cols, "%.*s,", (int)ckey.len, ckey.str)); } if (ret != 0 && ret != WT_NOTFOUND) goto err; /* Index values are empty: all columns are packed into the index key. */ WT_ERR(__wt_buf_fmt(session, &fmt, "value_format=,key_format=")); if (have_extractor) { WT_ERR(__wt_buf_catfmt(session, &fmt, "%.*s", (int)kval.len, kval.str)); WT_CLEAR(icols); } /* * Construct the index key format, or append the primary key columns * for custom extractors. */ WT_ERR(__wt_struct_reformat(session, table, icols.str, icols.len, (const char *)extra_cols.data, 0, &fmt)); /* Check for a record number index key, which makes no sense. */ WT_ERR(__wt_config_getones(session, fmt.data, "key_format", &cval)); if (cval.len == 1 && cval.str[0] == 'r') WT_ERR_MSG(session, EINVAL, "column-store index may not use the record number as its " "index key"); WT_ERR(__wt_buf_catfmt( session, &fmt, ",index_key_columns=%u", npublic_cols)); sourcecfg[1] = fmt.data; WT_ERR(__wt_config_merge(session, sourcecfg, NULL, &sourceconf)); WT_ERR(__wt_schema_create(session, source, sourceconf)); cfg[1] = sourceconf; cfg[2] = confbuf.data; WT_ERR(__wt_config_collapse(session, cfg, &idxconf)); if ((ret = __wt_metadata_insert(session, name, idxconf)) != 0) { /* * If the entry already exists in the metadata, we're done. * This is an error for exclusive creates but okay otherwise. */ if (ret == WT_DUPLICATE_KEY) ret = exclusive ? EEXIST : 0; goto err; } /* Make sure that the configuration is valid. */ WT_ERR(__wt_schema_open_index( session, table, idxname, strlen(idxname), NULL)); err: __wt_free(session, idxconf); __wt_free(session, sourceconf); __wt_buf_free(session, &confbuf); __wt_buf_free(session, &extra_cols); __wt_buf_free(session, &fmt); __wt_buf_free(session, &namebuf); __wt_schema_release_table(session, table); return (ret); }
/* * __create_table -- * Create a table. */ static int __create_table(WT_SESSION_IMPL *session, const char *name, int exclusive, const char *config) { WT_CONFIG conf; WT_CONFIG_ITEM cgkey, cgval, cval; WT_DECL_RET; WT_TABLE *table; const char *cfg[4] = { WT_CONFIG_BASE(session, table_meta), config, NULL, NULL }; const char *tablename; char *tableconf, *cgname; size_t cgsize; int ncolgroups; cgname = NULL; table = NULL; tableconf = NULL; tablename = name; if (!WT_PREFIX_SKIP(tablename, "table:")) return (EINVAL); if ((ret = __wt_schema_get_table(session, tablename, strlen(tablename), 0, &table)) == 0) { __wt_schema_release_table(session, table); return (exclusive ? EEXIST : 0); } WT_RET_NOTFOUND_OK(ret); WT_ERR(__wt_config_gets(session, cfg, "colgroups", &cval)); WT_ERR(__wt_config_subinit(session, &conf, &cval)); for (ncolgroups = 0; (ret = __wt_config_next(&conf, &cgkey, &cgval)) == 0; ncolgroups++) ; WT_ERR_NOTFOUND_OK(ret); WT_ERR(__wt_config_collapse(session, cfg, &tableconf)); if ((ret = __wt_metadata_insert(session, name, tableconf)) != 0) { /* * If the entry already exists in the metadata, we're done. * This is an error for exclusive creates but okay otherwise. */ if (ret == WT_DUPLICATE_KEY) ret = exclusive ? EEXIST : 0; goto err; } /* Attempt to open the table now to catch any errors. */ WT_ERR(__wt_schema_get_table( session, tablename, strlen(tablename), 1, &table)); if (ncolgroups == 0) { cgsize = strlen("colgroup:") + strlen(tablename) + 1; WT_ERR(__wt_calloc_def(session, cgsize, &cgname)); snprintf(cgname, cgsize, "colgroup:%s", tablename); WT_ERR(__create_colgroup(session, cgname, exclusive, config)); } if (0) { err: if (table != NULL) { WT_TRET(__wt_schema_remove_table(session, table)); table = NULL; } } if (table != NULL) __wt_schema_release_table(session, table); __wt_free(session, cgname); __wt_free(session, tableconf); return (ret); }
/* * __schema_open_index -- * Open one or more indices for a table (internal version). */ static int __schema_open_index(WT_SESSION_IMPL *session, WT_TABLE *table, const char *idxname, size_t len, WT_INDEX **indexp) { WT_CURSOR *cursor; WT_DECL_ITEM(tmp); WT_DECL_RET; WT_INDEX *idx; u_int i; int cmp; bool match; const char *idxconf, *name, *tablename, *uri; /* Check if we've already done the work. */ if (idxname == NULL && table->idx_complete) return (0); cursor = NULL; idx = NULL; match = false; /* Build a search key. */ tablename = table->name; (void)WT_PREFIX_SKIP(tablename, "table:"); WT_ERR(__wt_scr_alloc(session, 512, &tmp)); WT_ERR(__wt_buf_fmt(session, tmp, "index:%s:", tablename)); /* Find matching indices. */ WT_ERR(__wt_metadata_cursor(session, &cursor)); cursor->set_key(cursor, tmp->data); if ((ret = cursor->search_near(cursor, &cmp)) == 0 && cmp < 0) ret = cursor->next(cursor); for (i = 0; ret == 0; i++, ret = cursor->next(cursor)) { WT_ERR(cursor->get_key(cursor, &uri)); name = uri; if (!WT_PREFIX_SKIP(name, tmp->data)) break; /* Is this the index we are looking for? */ match = idxname == NULL || WT_STRING_MATCH(name, idxname, len); /* * Ensure there is space, including if we have to make room for * a new entry in the middle of the list. */ WT_ERR(__wt_realloc_def(session, &table->idx_alloc, WT_MAX(i, table->nindices) + 1, &table->indices)); /* Keep the in-memory list in sync with the metadata. */ cmp = 0; while (table->indices[i] != NULL && (cmp = strcmp(uri, table->indices[i]->name)) > 0) { /* Index no longer exists, remove it. */ __wt_free(session, table->indices[i]); memmove(&table->indices[i], &table->indices[i + 1], (table->nindices - i) * sizeof(WT_INDEX *)); table->indices[--table->nindices] = NULL; } if (cmp < 0) { /* Make room for a new index. */ memmove(&table->indices[i + 1], &table->indices[i], (table->nindices - i) * sizeof(WT_INDEX *)); table->indices[i] = NULL; ++table->nindices; } if (!match) continue; if (table->indices[i] == NULL) { WT_ERR(cursor->get_value(cursor, &idxconf)); WT_ERR(__wt_calloc_one(session, &idx)); WT_ERR(__wt_strdup(session, uri, &idx->name)); WT_ERR(__wt_strdup(session, idxconf, &idx->config)); WT_ERR(__open_index(session, table, idx)); /* * If we're checking the creation of an index before a * table is fully created, don't save the index: it * will need to be reopened once the table is complete. */ if (!table->cg_complete) { WT_ERR( __wt_schema_destroy_index(session, &idx)); if (idxname != NULL) break; continue; } table->indices[i] = idx; idx = NULL; /* * If the slot is bigger than anything else we've seen, * bump the number of indices. */ if (i >= table->nindices) table->nindices = i + 1; } /* If we were looking for a single index, we're done. */ if (indexp != NULL) *indexp = table->indices[i]; if (idxname != NULL) break; } WT_ERR_NOTFOUND_OK(ret); if (idxname != NULL && !match) ret = WT_NOTFOUND; /* If we did a full pass, we won't need to do it again. */ if (idxname == NULL) { table->nindices = i; table->idx_complete = true; } err: WT_TRET(__wt_metadata_cursor_release(session, &cursor)); WT_TRET(__wt_schema_destroy_index(session, &idx)); __wt_scr_free(session, &tmp); return (ret); }
/* * __create_table -- * Create a table. */ static int __create_table(WT_SESSION_IMPL *session, const char *name, bool exclusive, const char *config) { WT_CONFIG conf; WT_CONFIG_ITEM cgkey, cgval, cval; WT_DECL_RET; WT_TABLE *table; const char *cfg[4] = { WT_CONFIG_BASE(session, table_meta), config, NULL, NULL }; const char *tablename; char *tableconf, *cgname; size_t cgsize; int ncolgroups; bool exists; cgname = NULL; table = NULL; tableconf = NULL; exists = false; tablename = name; if (!WT_PREFIX_SKIP(tablename, "table:")) return (__wt_unexpected_object_type(session, name, "table:")); if ((ret = __wt_schema_get_table(session, tablename, strlen(tablename), false, &table)) == 0) { if (exclusive) WT_ERR(EEXIST); exists = true; } WT_ERR_NOTFOUND_OK(ret); WT_ERR(__wt_config_gets(session, cfg, "colgroups", &cval)); __wt_config_subinit(session, &conf, &cval); for (ncolgroups = 0; (ret = __wt_config_next(&conf, &cgkey, &cgval)) == 0; ncolgroups++) ; WT_ERR_NOTFOUND_OK(ret); WT_ERR(__wt_config_collapse(session, cfg, &tableconf)); if (!exists) { WT_ERR(__wt_metadata_insert(session, name, tableconf)); /* Attempt to open the table now to catch any errors. */ WT_ERR(__wt_schema_get_table( session, tablename, strlen(tablename), true, &table)); if (ncolgroups == 0) { cgsize = strlen("colgroup:") + strlen(tablename) + 1; WT_ERR(__wt_calloc_def(session, cgsize, &cgname)); snprintf(cgname, cgsize, "colgroup:%s", tablename); WT_ERR(__create_colgroup( session, cgname, exclusive, config)); } } if (0) { err: if (table != NULL) { WT_TRET(__wt_schema_remove_table(session, table)); table = NULL; } } if (table != NULL) __wt_schema_release_table(session, table); __wt_free(session, cgname); __wt_free(session, tableconf); return (ret); }
/* * __lsm_bloom_create -- * Create a bloom filter for a chunk of the LSM tree that has been * checkpointed but not yet been merged. */ static int __lsm_bloom_create(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, WT_LSM_CHUNK *chunk, u_int chunk_off) { WT_BLOOM *bloom; WT_CURSOR *src; WT_DECL_RET; WT_ITEM key; uint64_t insert_count; WT_RET(__wt_lsm_tree_setup_bloom(session, lsm_tree, chunk)); bloom = NULL; /* * This is merge-like activity, and we don't want compacts to give up * because we are creating a bunch of bloom filters before merging. */ ++lsm_tree->merge_progressing; WT_RET(__wt_bloom_create(session, chunk->bloom_uri, lsm_tree->bloom_config, chunk->count, lsm_tree->bloom_bit_count, lsm_tree->bloom_hash_count, &bloom)); /* Open a special merge cursor just on this chunk. */ WT_ERR(__wt_open_cursor(session, lsm_tree->name, NULL, NULL, &src)); F_SET(src, WT_CURSTD_RAW); WT_ERR(__wt_clsm_init_merge(src, chunk_off, chunk->id, 1)); /* * Setup so that we don't hold pages we read into cache, and so * that we don't get stuck if the cache is full. If we allow * ourselves to get stuck creating bloom filters, the entire tree * can stall since there may be no worker threads available to flush. */ F_SET(session, WT_SESSION_NO_CACHE | WT_SESSION_NO_EVICTION); for (insert_count = 0; (ret = src->next(src)) == 0; insert_count++) { WT_ERR(src->get_key(src, &key)); WT_ERR(__wt_bloom_insert(bloom, &key)); } WT_ERR_NOTFOUND_OK(ret); WT_TRET(src->close(src)); WT_TRET(__wt_bloom_finalize(bloom)); WT_ERR(ret); F_CLR(session, WT_SESSION_NO_CACHE); /* Load the new Bloom filter into cache. */ WT_CLEAR(key); WT_ERR_NOTFOUND_OK(__wt_bloom_get(bloom, &key)); WT_ERR(__wt_verbose(session, WT_VERB_LSM, "LSM worker created bloom filter %s. " "Expected %" PRIu64 " items, got %" PRIu64, chunk->bloom_uri, chunk->count, insert_count)); /* Ensure the bloom filter is in the metadata. */ WT_ERR(__wt_lsm_tree_writelock(session, lsm_tree)); F_SET(chunk, WT_LSM_CHUNK_BLOOM); ret = __wt_lsm_meta_write(session, lsm_tree); ++lsm_tree->dsk_gen; WT_TRET(__wt_lsm_tree_writeunlock(session, lsm_tree)); if (ret != 0) WT_ERR_MSG(session, ret, "LSM bloom worker metadata write"); err: if (bloom != NULL) WT_TRET(__wt_bloom_close(bloom)); F_CLR(session, WT_SESSION_NO_CACHE | WT_SESSION_NO_EVICTION); return (ret); }
/* * __curjoin_init_bloom -- * Populate Bloom filters */ static int __curjoin_init_bloom(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin, WT_CURSOR_JOIN_ENTRY *entry, WT_BLOOM *bloom) { WT_COLLATOR *collator; WT_CURSOR *c; WT_CURSOR_JOIN_ENDPOINT *end, *endmax; WT_DECL_ITEM(uribuf); WT_DECL_RET; WT_ITEM curkey, curvalue; size_t size; u_int skip; int cmp; const char *uri; const char *raw_cfg[] = { WT_CONFIG_BASE( session, WT_SESSION_open_cursor), "raw", NULL }; c = NULL; skip = 0; if (entry->index != NULL) /* * Open the raw index. We're avoiding any references * to the main table, they may be expensive. */ uri = entry->index->source; else { /* * For joins on the main table, we just need the primary * key for comparison, we don't need any values. */ size = strlen(cjoin->table->iface.name) + 3; WT_ERR(__wt_scr_alloc(session, size, &uribuf)); WT_ERR(__wt_buf_fmt(session, uribuf, "%s()", cjoin->table->iface.name)); uri = uribuf->data; } WT_ERR(__wt_open_cursor(session, uri, &cjoin->iface, raw_cfg, &c)); /* Initially position the cursor if necessary. */ endmax = &entry->ends[entry->ends_next]; if ((end = &entry->ends[0]) < endmax) { if (F_ISSET(end, WT_CURJOIN_END_GT) || WT_CURJOIN_END_RANGE(end) == WT_CURJOIN_END_EQ) { WT_ERR(__wt_cursor_dup_position(end->cursor, c)); if (WT_CURJOIN_END_RANGE(end) == WT_CURJOIN_END_GE) skip = 1; } else if (F_ISSET(end, WT_CURJOIN_END_LT)) { if ((ret = c->next(c)) == WT_NOTFOUND) goto done; WT_ERR(ret); } else WT_PANIC_ERR(session, EINVAL, "fatal error in join cursor position state"); } collator = (entry->index == NULL) ? NULL : entry->index->collator; while (ret == 0) { WT_ERR(c->get_key(c, &curkey)); entry->stats.iterated++; if (entry->index != NULL) { /* * Repack so it's comparable to the * reference endpoints. */ WT_ERR(__wt_struct_repack(session, c->key_format, (entry->repack_format != NULL ? entry->repack_format : entry->index->idxkey_format), &c->key, &curkey)); } for (end = &entry->ends[skip]; end < endmax; end++) { WT_ERR(__wt_compare(session, collator, &curkey, &end->key, &cmp)); if (F_ISSET(entry, WT_CURJOIN_ENTRY_DISJUNCTION)) { /* if condition satisfied, insert immediately */ switch (WT_CURJOIN_END_RANGE(end)) { case WT_CURJOIN_END_EQ: if (cmp == 0) goto insert; break; case WT_CURJOIN_END_GT: if (cmp > 0) { /* skip this check next time */ skip = entry->ends_next; goto insert; } break; case WT_CURJOIN_END_GE: if (cmp >= 0) goto insert; break; case WT_CURJOIN_END_LT: if (cmp < 0) goto insert; break; case WT_CURJOIN_END_LE: if (cmp <= 0) goto insert; break; } } else if (!F_ISSET(end, WT_CURJOIN_END_LT)) { if (cmp < 0 || (cmp == 0 && !F_ISSET(end, WT_CURJOIN_END_EQ))) goto advance; if (cmp > 0) { if (F_ISSET(end, WT_CURJOIN_END_GT)) skip = 1; else goto done; } } else { if (cmp > 0 || (cmp == 0 && !F_ISSET(end, WT_CURJOIN_END_EQ))) goto done; } } /* * Either it's a disjunction that hasn't satisfied any * condition, or it's a conjunction that has satisfied all * conditions. */ if (F_ISSET(entry, WT_CURJOIN_ENTRY_DISJUNCTION)) goto advance; insert: if (entry->index != NULL) { curvalue.data = (unsigned char *)curkey.data + curkey.size; WT_ASSERT(session, c->key.size > curkey.size); curvalue.size = c->key.size - curkey.size; } else WT_ERR(c->get_key(c, &curvalue)); __wt_bloom_insert(bloom, &curvalue); entry->stats.bloom_insert++; advance: if ((ret = c->next(c)) == WT_NOTFOUND) break; } done: WT_ERR_NOTFOUND_OK(ret); err: if (c != NULL) WT_TRET(c->close(c)); __wt_scr_free(session, &uribuf); return (ret); }