/*
 * __backup_list_append --
 *	Add one more name to the backup cursor's list, growing the list when
 *	needed.
 *
 *	A uri starting with "file:" is stored without that prefix and also
 *	gets a data handle acquired for it; any other uri is stored as-is
 *	with no handle. Returns 0 on success or the first error from the
 *	allocation, string copy or handle lookup.
 */
static int
__backup_list_append(
    WT_SESSION_IMPL *session, WT_CURSOR_BACKUP *cb, const char *uri)
{
	WT_CURSOR_BACKUP_ENTRY *slot;
	WT_DATA_HANDLE *saved_dhandle;
	WT_DECL_RET;
	bool is_file;
	const char *stored_name;

	/*
	 * Reserve room for the new entry plus one trailing entry whose NULL
	 * name marks the end of the list.
	 */
	WT_RET(__wt_realloc_def(session,
	    &cb->list_allocated, cb->list_next + 2, &cb->list));
	slot = &cb->list[cb->list_next];
	slot[0].name = slot[1].name = NULL;
	slot[0].handle = slot[1].handle = NULL;

	/* Only "file:" objects need a handle; strip the prefix for them. */
	is_file = WT_PREFIX_MATCH(uri, "file:");
	stored_name = is_file ? uri + strlen("file:") : uri;

	/*
	 * !!!
	 * This assumes each metadata file entry corresponds to exactly one
	 * physical file. A block manager without that property would have
	 * to be asked for the list of physical files behind the logical
	 * "file"; that is not handled here, and such a block manager might
	 * not support applications physically copying files at all.
	 */
	WT_RET(__wt_strdup(session, stored_name, &slot->name));

	if (is_file) {
		/*
		 * Get a handle for the underlying object (the handle blocks
		 * schema level operations, for example WT_SESSION.drop or an
		 * LSM file discard after level merging). The session's
		 * current handle is put back whether or not the lookup
		 * succeeded.
		 */
		saved_dhandle = session->dhandle;
		ret = __wt_session_get_btree(session, uri, NULL, NULL, 0);
		if (ret == 0)
			slot->handle = session->dhandle;
		session->dhandle = saved_dhandle;
		WT_RET(ret);
	}

	/* The entry only becomes part of the list once fully set up. */
	++cb->list_next;
	return (0);
}
/*
 * __conn_foc_add --
 *	Record a pointer on the connection's free-on-close list.
 *
 *	Returns 0 on success, or the error from growing the list (in which
 *	case the pointer is not recorded).
 */
static int
__conn_foc_add(WT_SESSION_IMPL *session, const void *p)
{
	WT_CONNECTION_IMPL *conn;

	conn = S2C(session);

	/*
	 * No locking is done here: the caller is expected to hold whatever
	 * locks are required.
	 */
	WT_RET(__wt_realloc_def(
	    session, &conn->foc_size, conn->foc_cnt + 1, &conn->foc));

	/* Store in the next free slot, then account for it. */
	conn->foc[conn->foc_cnt] = (void *)p;
	++conn->foc_cnt;
	return (0);
}
/*
 * __wt_lsm_tree_switch --
 *	Switch to a new in-memory tree.
 *
 *	Locks the tree, re-checks that a switch is still required, appends a
 *	freshly allocated chunk with a new ID to the tree's chunk array, sets
 *	the chunk up and writes the LSM metadata. The tree is unlocked on
 *	every path that acquired the lock, and the combined result of the work
 *	and the unlock is returned.
 */
int
__wt_lsm_tree_switch(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
{
	WT_DECL_RET;
	WT_LSM_CHUNK *chunk;
	uint32_t nchunks, new_id;

	/* If the lock cannot be taken, return without touching the tree. */
	WT_RET(__wt_lsm_tree_lock(session, lsm_tree, 1));

	/*
	 * Check if a switch is still needed: we may have raced while waiting
	 * for a lock.
	 *
	 * No switch is done when the tree has a last chunk that is not flagged
	 * on-disk and the tree is not flagged as needing a switch. Note that
	 * nchunks is captured here and used below to size the chunk array.
	 */
	if ((nchunks = lsm_tree->nchunks) != 0 &&
	    (chunk = lsm_tree->chunk[nchunks - 1]) != NULL &&
	    !F_ISSET_ATOMIC(chunk, WT_LSM_CHUNK_ONDISK) &&
	    !F_ISSET(lsm_tree, WT_LSM_TREE_NEED_SWITCH))
		goto err;

	/* Update the throttle time. */
	__wt_lsm_tree_throttle(session, lsm_tree);

	/* Allocate the ID for the new chunk. */
	new_id = WT_ATOMIC_ADD(lsm_tree->last, 1);

	/* Make room for one more chunk pointer. */
	WT_ERR(__wt_realloc_def(session, &lsm_tree->chunk_alloc,
	    nchunks + 1, &lsm_tree->chunk));

	WT_VERBOSE_ERR(session, lsm,
	    "Tree switch to: %" PRIu32 ", throttle %ld",
	    new_id, lsm_tree->throttle_sleep);
	WT_ERR(__wt_calloc_def(session, 1, &chunk));
	chunk->id = new_id;
	chunk->txnid_max = WT_TXN_NONE;
	/*
	 * NOTE(review): the chunk is added to the tree's array before it is
	 * set up and before the metadata is written; if either later step
	 * fails the chunk stays in the array (see the TODO at the err label).
	 */
	lsm_tree->chunk[lsm_tree->nchunks++] = chunk;
	WT_ERR(__wt_lsm_tree_setup_chunk(session, lsm_tree, chunk));

	/* Persist the new chunk list, then clear the pending-switch flag. */
	WT_ERR(__wt_lsm_meta_write(session, lsm_tree));
	F_CLR(lsm_tree, WT_LSM_TREE_NEED_SWITCH);
	++lsm_tree->dsk_gen;

	lsm_tree->modified = 1;

err:	/* TODO: mark lsm_tree bad on error(?) */
	WT_TRET(__wt_lsm_tree_unlock(session, lsm_tree));
	return (ret);
}
/*
 * __recovery_setup_file --
 *	Set up the recovery slot for a file.
 *
 *	Reads the file's "id" and "checkpoint_lsn" from its metadata
 *	configuration string, records the uri and checkpoint LSN in the slot
 *	indexed by the file ID (growing the slot array as needed), and tracks
 *	the largest file ID and the largest real checkpoint LSN seen.
 *
 *	Returns 0 on success; EINVAL if the file ID is already in use by
 *	another uri or the checkpoint LSN cannot be parsed; otherwise the error
 *	from the configuration, allocation or string-copy helpers.
 */
static int
__recovery_setup_file(WT_RECOVERY *r, const char *uri, const char *config)
{
	WT_CONFIG_ITEM cval;
	WT_LSN lsn;
	uint32_t fileid, lsnfile, lsnoffset;

	WT_RET(__wt_config_getones(r->session, config, "id", &cval));
	fileid = (uint32_t)cval.val;

	/* Track the largest file ID we have seen. */
	if (fileid > r->max_fileid)
		r->max_fileid = fileid;

	if (r->nfiles <= fileid) {
		WT_RET(__wt_realloc_def(
		    r->session, &r->file_alloc, fileid + 1, &r->files));
		r->nfiles = fileid + 1;
	}

	/*
	 * Two metadata entries must never share a file ID. Without this check
	 * the second entry would overwrite (and leak) the first entry's uri
	 * and silently take over its slot.
	 *
	 * NOTE(review): relies on unused slots having a NULL uri, that is, on
	 * the array growth above handing back zeroed memory -- confirm.
	 */
	if (r->files[fileid].uri != NULL)
		WT_RET_MSG(r->session, EINVAL,
		    "metadata corruption: files %s and %s have the same "
		    "file ID %" PRIu32,
		    uri, r->files[fileid].uri, fileid);

	WT_RET(__wt_strdup(r->session, uri, &r->files[fileid].uri));
	WT_RET(
	    __wt_config_getones(r->session, config, "checkpoint_lsn", &cval));
	/* If there is no checkpoint logged for the file, apply everything. */
	if (cval.type != WT_CONFIG_ITEM_STRUCT)
		WT_INIT_LSN(&lsn);
	/* NOLINTNEXTLINE(cert-err34-c) */
	else if (sscanf(cval.str,
	    "(%" SCNu32 ",%" SCNu32 ")", &lsnfile, &lsnoffset) == 2)
		WT_SET_LSN(&lsn, lsnfile, lsnoffset);
	else
		WT_RET_MSG(r->session, EINVAL,
		    "Failed to parse checkpoint LSN '%.*s'",
		    (int)cval.len, cval.str);
	r->files[fileid].ckpt_lsn = lsn;

	__wt_verbose(r->session, WT_VERB_RECOVERY,
	    "Recovering %s with id %" PRIu32 " @ (%" PRIu32 ", %" PRIu32 ")",
	    uri, fileid, lsn.l.file, lsn.l.offset);

	/*
	 * Remember the largest checkpoint LSN, ignoring the "max" and "init"
	 * placeholder values.
	 */
	if ((!WT_IS_MAX_LSN(&lsn) && !WT_IS_INIT_LSN(&lsn)) &&
	    (WT_IS_MAX_LSN(&r->max_ckpt_lsn) ||
	    __wt_log_cmp(&lsn, &r->max_ckpt_lsn) > 0))
		r->max_ckpt_lsn = lsn;

	return (0);
}
/*
 * __wt_conn_foc_add --
 *	Record a pointer on the connection's free-on-close list.
 *
 *	This is best-effort: if the list cannot be grown the pointer is simply
 *	not recorded and no error is reported.
 */
void
__wt_conn_foc_add(WT_SESSION_IMPL *session, const void *p)
{
	WT_CONNECTION_IMPL *conn;

	conn = S2C(session);

	/*
	 * Callers are expected to hold the connection's api_lock.
	 *
	 * Every current caller ignores errors, which is why a failed
	 * allocation is dropped here rather than returned.
	 */
	if (__wt_realloc_def(
	    session, &conn->foc_size, conn->foc_cnt + 1, &conn->foc) != 0)
		return;

	conn->foc[conn->foc_cnt++] = (void *)p;
}
/*
 * __curjoin_insert_endpoint --
 *	Open up a zeroed slot at index "pos" in the join entry's endpoint
 *	array and hand it back through "newendp".
 *
 *	Existing endpoints from "pos" onward are shifted up by one. Returns 0
 *	on success or the error from growing the array.
 */
static int
__curjoin_insert_endpoint(WT_SESSION_IMPL *session,
    WT_CURSOR_JOIN_ENTRY *entry, u_int pos, WT_CURSOR_JOIN_ENDPOINT **newendp)
{
	WT_CURSOR_JOIN_ENDPOINT *slot;
	size_t tail_bytes;

	/* Room for one more endpoint. */
	WT_RET(__wt_realloc_def(session, &entry->ends_allocated,
	    entry->ends_next + 1, &entry->ends));

	/* Slide everything at or after the insertion point up by one. */
	tail_bytes =
	    (entry->ends_next - pos) * sizeof(WT_CURSOR_JOIN_ENDPOINT);
	slot = entry->ends + pos;
	memmove(slot + 1, slot, tail_bytes);

	/* The caller gets a cleared endpoint to fill in. */
	memset(slot, 0, sizeof(WT_CURSOR_JOIN_ENDPOINT));
	++entry->ends_next;

	*newendp = slot;
	return (0);
}
/*
 * __wt_ovfl_discard_add --
 *	Append a cell to the page's list of discarded overflow records.
 *
 *	The page's overflow tracking structure is created on first use.
 *	Returns 0 on success, or the error from initialization, list growth
 *	or the verbose message.
 */
int
__wt_ovfl_discard_add(WT_SESSION_IMPL *session, WT_PAGE *page, WT_CELL *cell)
{
	WT_OVFL_TRACK *ovfl;

	/* Lazily set up overflow tracking for this page. */
	if (page->modify->ovfl_track == NULL)
		WT_RET(__ovfl_track_init(session, page));
	ovfl = page->modify->ovfl_track;

	/* Grow the discard list by one and store the cell at the end. */
	WT_RET(__wt_realloc_def(session, &ovfl->discard_allocated,
	    ovfl->discard_entries + 1, &ovfl->discard));
	ovfl->discard[ovfl->discard_entries] = cell;
	++ovfl->discard_entries;

	/* Optionally report the addition. */
	if (!WT_VERBOSE_ISSET(session, WT_VERB_OVERFLOW))
		return (0);
	return (__ovfl_discard_verbose(session, page, cell, "add"));
}
/*
 * __recovery_setup_file --
 *	Set up the recovery slot for a file.
 *
 *	Reads the file's "id" and "checkpoint_lsn" from its metadata
 *	configuration string and records the uri and checkpoint LSN in the
 *	slot indexed by the file ID, growing the slot array as needed.
 *
 *	Returns 0 on success, EINVAL if the checkpoint LSN cannot be parsed,
 *	otherwise the error from the configuration, allocation, string-copy
 *	or verbose-message helpers.
 */
static int
__recovery_setup_file(WT_RECOVERY *r, const char *uri, const char *config)
{
	WT_CONFIG_ITEM cval;
	WT_LSN lsn;
	intmax_t lsnoffset;
	uint32_t fileid, lsnfile;

	WT_RET(__wt_config_getones(r->session, config, "id", &cval));
	fileid = (uint32_t)cval.val;

	/* Track the largest file ID we have seen. */
	if (fileid > r->max_fileid)
		r->max_fileid = fileid;

	if (r->nfiles <= fileid) {
		WT_RET(__wt_realloc_def(
		    r->session, &r->file_alloc, fileid + 1, &r->files));
		r->nfiles = fileid + 1;
	}

	WT_RET(__wt_strdup(r->session, uri, &r->files[fileid].uri));
	WT_RET(
	    __wt_config_getones(r->session, config, "checkpoint_lsn", &cval));
	/* If there is no checkpoint logged for the file, apply everything. */
	if (cval.type != WT_CONFIG_ITEM_STRUCT)
		WT_INIT_LSN(&lsn);
	else {
		/*
		 * Scan into temporaries of exactly the types the SCN* macros
		 * describe, then assign: the scanf conversions must not write
		 * through a pointer cast to the LSN's own field type.
		 */
		if (sscanf(cval.str, "(%" SCNu32 ",%" SCNdMAX ")",
		    &lsnfile, &lsnoffset) != 2)
			WT_RET_MSG(r->session, EINVAL,
			    "Failed to parse checkpoint LSN '%.*s'",
			    (int)cval.len, cval.str);
		lsn.file = lsnfile;
		lsn.offset = lsnoffset;
	}
	r->files[fileid].ckpt_lsn = lsn;

	/* Cast so each argument matches its conversion specifier. */
	WT_RET(__wt_verbose(r->session, WT_VERB_RECOVERY,
	    "Recovering %s with id %" PRIu32 " @ (%" PRIu32 ", %" PRIu64 ")",
	    uri, fileid, (uint32_t)lsn.file, (uint64_t)lsn.offset));

	return (0);
}
/*
 * __compact_handle_append --
 *	Gather a file handle to be compacted.
 *	Called via the schema_worker function.
 *
 *	Acquires the session's current data handle by name, marks it as being
 *	compacted and appends it to the session's op_handle list. On any
 *	failure after the handle was acquired, the handle is released again so
 *	it is never left held without being on the list.
 *
 *	Returns 0 on success or the first error encountered.
 */
static int
__compact_handle_append(WT_SESSION_IMPL *session, const char *cfg[])
{
	WT_DECL_RET;

	WT_UNUSED(cfg);

	WT_RET(__wt_session_get_dhandle(
	    session, session->dhandle->name, NULL, NULL, 0));

	/*
	 * Make sure there is space for the next entry before setting compact
	 * active on the handle: growing the list is the only step that can
	 * fail without side effects, so doing it first means a failure never
	 * leaves a handle marked for compaction that nobody tracks.
	 */
	if ((ret = __wt_realloc_def(session, &session->op_handle_allocated,
	    session->op_handle_next + 1, &session->op_handle)) == 0)
		ret = __compact_start(session);

	if (ret != 0) {
		WT_TRET(__wt_session_release_dhandle(session));
		return (ret);
	}

	session->op_handle[session->op_handle_next++] = session->dhandle;
	return (0);
}
/*
 * __wt_session_fotxn_add --
 *	Stash a pointer (and its length) on the session's
 *	free-on-transaction generation list.
 *
 *	The entry is tagged with the global current transaction ID plus one.
 *	After adding, previously stashed entries are offered for discard when
 *	the list holds more than one entry. Returns 0 on success or the error
 *	from growing the list.
 */
int
__wt_session_fotxn_add(WT_SESSION_IMPL *session, void *p, size_t len)
{
	WT_FOTXN *fotxn;
	size_t i, nslots;

	/*
	 * The calling thread must have a transaction pinned, otherwise the
	 * memory being stashed could be freed immediately.
	 */
	WT_ASSERT(session,
	    WT_SESSION_TXN_STATE(session)->snap_min != WT_TXN_NONE);

	/* Grow the list as necessary. */
	WT_RET(__wt_realloc_def(session,
	    &session->fotxn_size, session->fotxn_cnt + 1, &session->fotxn));

	/* Take the first slot that isn't holding a pointer. */
	nslots = session->fotxn_size / sizeof(session->fotxn[0]);
	for (i = 0; i < nslots; ++i) {
		fotxn = &session->fotxn[i];
		if (fotxn->p != NULL)
			continue;

		fotxn->txnid = S2C(session)->txn_global.current + 1;
		WT_ASSERT(session,
		    !__wt_txn_visible_all(session, fotxn->txnid));
		fotxn->p = p;
		fotxn->len = len;
		break;
	}
	++session->fotxn_cnt;

	/* See if we can free any previous entries. */
	if (session->fotxn_cnt > 1)
		__wt_session_fotxn_discard(session, session, 0);

	return (0);
}
/*
 * __wt_win_directory_list --
 *	Get a list of files from a directory, MSVC version.
 *
 *	On success, *dirlistp is an allocated array of allocated UTF-8 file
 *	names and *countp is the number of names; "." and ".." are never
 *	included. When prefix is non-NULL only names starting with it are
 *	returned; a NULL prefix means no filtering.
 *
 *	Returns 0 on success. On failure, *dirlistp/*countp stay NULL/0, any
 *	partial list is freed and an error is logged and returned.
 */
int
__wt_win_directory_list(WT_FILE_SYSTEM *file_system,
    WT_SESSION *wt_session, const char *directory,
    const char *prefix, char ***dirlistp, uint32_t *countp)
{
	DWORD windows_error;
	HANDLE findhandle;
	WIN32_FIND_DATAW finddata;
	WT_DECL_ITEM(pathbuf);
	WT_DECL_ITEM(file_utf8);
	WT_DECL_ITEM(pathbuf_wide);
	WT_DECL_ITEM(prefix_wide);
	WT_DECL_RET;
	WT_SESSION_IMPL *session;
	size_t dirallocsz, pathlen, prefix_widelen;
	uint32_t count;
	char *dir_copy, **entries;

	session = (WT_SESSION_IMPL *)wt_session;

	*dirlistp = NULL;
	*countp = 0;

	/*
	 * Everything the error path looks at must have a defined value before
	 * the first possible jump to it.
	 */
	findhandle = INVALID_HANDLE_VALUE;
	dirallocsz = 0;
	prefix_widelen = 0;
	count = 0;
	dir_copy = NULL;
	entries = NULL;

	/* Strip a single trailing backslash, taking care with "". */
	WT_ERR(__wt_strdup(session, directory, &dir_copy));
	pathlen = strlen(dir_copy);
	if (pathlen > 0 && dir_copy[pathlen - 1] == '\\')
		dir_copy[pathlen - 1] = '\0';
	WT_ERR(__wt_scr_alloc(session, pathlen + 3, &pathbuf));
	WT_ERR(__wt_buf_fmt(session, pathbuf, "%s\\*", dir_copy));

	WT_ERR(__wt_to_utf16_string(session, pathbuf->data, &pathbuf_wide));

	/* The prefix is optional: only convert it when there is one. */
	if (prefix != NULL) {
		WT_ERR(__wt_to_utf16_string(session, prefix, &prefix_wide));
		prefix_widelen = wcslen(prefix_wide->data);
	}

	findhandle = FindFirstFileW(pathbuf_wide->data, &finddata);
	if (findhandle == INVALID_HANDLE_VALUE) {
		windows_error = __wt_getlasterror();
		__wt_errx(session,
		    "%s: directory-list: FindFirstFile: %s",
		    pathbuf->data, __wt_formatmessage(session, windows_error));
		WT_ERR(__wt_map_windows_error(windows_error));
	}

	do {
		/*
		 * Skip . and ..
		 */
		if (wcscmp(finddata.cFileName, L".") == 0 ||
		    wcscmp(finddata.cFileName, L"..") == 0)
			continue;

		/* The list of files is optionally filtered by a prefix. */
		if (prefix != NULL &&
		    wcsncmp(finddata.cFileName, prefix_wide->data,
		    prefix_widelen) != 0)
			continue;

		WT_ERR(__wt_realloc_def(
		    session, &dirallocsz, count + 1, &entries));
		WT_ERR(__wt_to_utf8_string(
		    session, finddata.cFileName, &file_utf8));
		WT_ERR(__wt_strdup(session, file_utf8->data, &entries[count]));
		++count;
		__wt_scr_free(session, &file_utf8);
	} while (FindNextFileW(findhandle, &finddata) != 0);

	*dirlistp = entries;
	*countp = count;

err:	if (findhandle != INVALID_HANDLE_VALUE)
		if (FindClose(findhandle) == 0) {
			windows_error = __wt_getlasterror();
			__wt_errx(session,
			    "%s: directory-list: FindClose: %s",
			    pathbuf->data,
			    __wt_formatmessage(session, windows_error));
			if (ret == 0)
				ret = __wt_map_windows_error(windows_error);
		}

	__wt_free(session, dir_copy);
	__wt_scr_free(session, &pathbuf);
	__wt_scr_free(session, &file_utf8);
	__wt_scr_free(session, &pathbuf_wide);
	__wt_scr_free(session, &prefix_wide);

	if (ret == 0)
		return (0);

	/* Failure: don't hand back a list, discard whatever was collected. */
	*dirlistp = NULL;
	*countp = 0;
	WT_TRET(__wt_win_directory_list_free(
	    file_system, wt_session, entries, count));

	WT_RET_MSG(session, ret,
	    "%s: directory-list, prefix \"%s\"",
	    directory, prefix == NULL ? "" : prefix);
}
/*
 * __wt_curjoin_join --
 *	Add a new join to a join cursor.
 *
 *	ref_cursor is either an ordinary cursor, which becomes a new endpoint
 *	(comparison) on the entry for index "idx", or a cursor whose uri
 *	starts with "join:", which is attached to a new entry as a nested
 *	subjoin. "flags" carries the entry-level settings and "range" the
 *	endpoint comparison; count and the two bloom_* values are per-entry
 *	configuration.
 *
 *	Returns 0 on success, EINVAL for configurations that conflict with
 *	what the join cursor or the matching entry already holds, otherwise
 *	the error from an allocation or helper call.
 */
int
__wt_curjoin_join(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin,
    WT_INDEX *idx, WT_CURSOR *ref_cursor, uint8_t flags, uint8_t range,
    uint64_t count, uint32_t bloom_bit_count, uint32_t bloom_hash_count)
{
	WT_CURSOR_INDEX *cindex;
	WT_CURSOR_JOIN *child;
	WT_CURSOR_JOIN_ENDPOINT *end;
	WT_CURSOR_JOIN_ENTRY *entry;
	size_t len;
	uint8_t endrange;
	u_int i, ins, nonbloom;
	bool hasins, needbloom, nested, range_eq;

	entry = NULL;
	hasins = needbloom = false;
	ins = nonbloom = 0;			/* -Wuninitialized */

	/*
	 * The first join decides whether the whole cursor is a disjunction;
	 * every later join has to agree with that choice.
	 */
	if (cjoin->entries_next == 0) {
		if (LF_ISSET(WT_CURJOIN_ENTRY_DISJUNCTION))
			F_SET(cjoin, WT_CURJOIN_DISJUNCTION);
	} else if (LF_ISSET(WT_CURJOIN_ENTRY_DISJUNCTION) &&
	    !F_ISSET(cjoin, WT_CURJOIN_DISJUNCTION))
		WT_RET_MSG(session, EINVAL,
		    "operation=or does not match previous operation=and");
	else if (!LF_ISSET(WT_CURJOIN_ENTRY_DISJUNCTION) &&
	    F_ISSET(cjoin, WT_CURJOIN_DISJUNCTION))
		WT_RET_MSG(session, EINVAL,
		    "operation=and does not match previous operation=or");

	/*
	 * For a non-nested join, look for an existing non-subjoin entry on
	 * the same index to merge into. While scanning, remember the position
	 * of the first entry after entry 0 that has no Bloom filter: a new
	 * Bloom entry is inserted there. Nested joins always get a new entry
	 * and cannot use Bloom filters.
	 */
	nested = WT_PREFIX_MATCH(ref_cursor->uri, "join:");
	if (!nested)
		for (i = 0; i < cjoin->entries_next; i++) {
			if (cjoin->entries[i].index == idx &&
			    cjoin->entries[i].subjoin == NULL) {
				entry = &cjoin->entries[i];
				break;
			}
			if (!needbloom && i > 0 &&
			    !F_ISSET(&cjoin->entries[i],
			    WT_CURJOIN_ENTRY_BLOOM)) {
				needbloom = true;
				nonbloom = i;
			}
		}
	else {
		if (LF_ISSET(WT_CURJOIN_ENTRY_BLOOM))
			WT_RET_MSG(session, EINVAL,
			    "Bloom filters cannot be used with subjoins");
	}

	if (entry == NULL) {
		/* No entry to merge into: create one. */
		WT_RET(__wt_realloc_def(session, &cjoin->entries_allocated,
		    cjoin->entries_next + 1, &cjoin->entries));
		if (LF_ISSET(WT_CURJOIN_ENTRY_BLOOM) && needbloom) {
			/*
			 * Reorder the list so that after the first entry,
			 * the Bloom filtered entries come next, followed by
			 * the non-Bloom entries. Once the Bloom filters
			 * are built, determining membership via Bloom is
			 * faster than without Bloom, so we can answer
			 * membership questions more quickly, and with less
			 * I/O, with the Bloom entries first.
			 */
			entry = &cjoin->entries[nonbloom];
			memmove(entry + 1, entry,
			    (cjoin->entries_next - nonbloom) *
			    sizeof(WT_CURSOR_JOIN_ENTRY));
			memset(entry, 0, sizeof(WT_CURSOR_JOIN_ENTRY));
		}
		else
			entry = &cjoin->entries[cjoin->entries_next];
		entry->index = idx;
		entry->flags = flags;
		entry->count = count;
		entry->bloom_bit_count = bloom_bit_count;
		entry->bloom_hash_count = bloom_hash_count;
		++cjoin->entries_next;
	} else {
		/* Merge the join into an existing entry for this index */
		if (count != 0 && entry->count != 0 && entry->count != count)
			WT_RET_MSG(session, EINVAL,
			    "count=%" PRIu64 " does not match "
			    "previous count=%" PRIu64 " for this index",
			    count, entry->count);
		if (LF_MASK(WT_CURJOIN_ENTRY_BLOOM) !=
		    F_MASK(entry, WT_CURJOIN_ENTRY_BLOOM))
			WT_RET_MSG(session, EINVAL,
			    "join has incompatible strategy "
			    "values for the same index");
		if (LF_MASK(WT_CURJOIN_ENTRY_FALSE_POSITIVES) !=
		    F_MASK(entry, WT_CURJOIN_ENTRY_FALSE_POSITIVES))
			WT_RET_MSG(session, EINVAL,
			    "join has incompatible bloom_false_positives "
			    "values for the same index");

		/*
		 * Check against other comparisons (we call them endpoints)
		 * already set up for this index.
		 * We allow either:
		 *   - one or more "eq" (with disjunction)
		 *   - exactly one "eq" (with conjunction)
		 *   - exactly one of "gt" or "ge" (conjunction or disjunction)
		 *   - exactly one of "lt" or "le" (conjunction or disjunction)
		 *   - one of "gt"/"ge" along with one of "lt"/"le"
		 *         (currently restricted to conjunction).
		 *
		 * Some other combinations, although expressible either do
		 * not make sense (X == 3 AND X == 5) or are reducible (X <
		 * 7 AND X < 9).  Other specific cases of (X < 7 OR X > 15)
		 * or (X == 4 OR X > 15) make sense but we don't handle yet.
		 */
		for (i = 0; i < entry->ends_next; i++) {
			end = &entry->ends[i];
			range_eq = (range == WT_CURJOIN_END_EQ);
			endrange = WT_CURJOIN_END_RANGE(end);
			if ((F_ISSET(end, WT_CURJOIN_END_GT) &&
			    ((range & WT_CURJOIN_END_GT) != 0 || range_eq)) ||
			    (F_ISSET(end, WT_CURJOIN_END_LT) &&
			    ((range & WT_CURJOIN_END_LT) != 0 || range_eq)) ||
			    (endrange == WT_CURJOIN_END_EQ &&
			    (range & (WT_CURJOIN_END_LT | WT_CURJOIN_END_GT))
			    != 0))
				WT_RET_MSG(session, EINVAL,
				    "join has overlapping ranges");
			if (range == WT_CURJOIN_END_EQ &&
			    endrange == WT_CURJOIN_END_EQ &&
			    !F_ISSET(entry, WT_CURJOIN_ENTRY_DISJUNCTION))
				WT_RET_MSG(session, EINVAL,
				    "compare=eq can only be combined "
				    "using operation=or");

			/*
			 * Sort "gt"/"ge" to the front, followed by any number
			 * of "eq", and finally "lt"/"le".
			 *
			 * Only the first qualifying position is kept as the
			 * insertion point.
			 */
			if (!hasins &&
			    ((range & WT_CURJOIN_END_GT) != 0 ||
			    (range == WT_CURJOIN_END_EQ &&
			    endrange != WT_CURJOIN_END_EQ &&
			    !F_ISSET(end, WT_CURJOIN_END_GT)))) {
				ins = i;
				hasins = true;
			}
		}
		/* All checks completed, merge any new configuration now */
		entry->count = count;
		entry->bloom_bit_count =
		    WT_MAX(entry->bloom_bit_count, bloom_bit_count);
		entry->bloom_hash_count =
		    WT_MAX(entry->bloom_hash_count, bloom_hash_count);
	}
	if (nested) {
		/* Link the child join cursor and this cursor to each other. */
		child = (WT_CURSOR_JOIN *)ref_cursor;
		entry->subjoin = child;
		child->parent = cjoin;
	} else {
		/*
		 * Add the endpoint at the sorted position found above, or at
		 * the end when no position was chosen.
		 */
		WT_RET(__curjoin_insert_endpoint(session, entry,
		    hasins ? ins : entry->ends_next, &end));
		end->cursor = ref_cursor;
		F_SET(end, range);

		if (entry->main == NULL && idx != NULL) {
			/*
			 * Open the main file with a projection of the
			 * indexed columns.
			 */
			WT_RET(__curjoin_open_main(session, cjoin, entry));

			/*
			 * When we are repacking index keys to remove the
			 * primary key, we never want to transform trailing
			 * 'u'.  Use no-op padding to force this.
			 *
			 * The buffer holds the key format, the two characters
			 * "0x" and the terminating nul byte.
			 */
			cindex = (WT_CURSOR_INDEX *)ref_cursor;
			len = strlen(cindex->iface.key_format) + 3;
			WT_RET(__wt_calloc(session, len, 1,
			    &entry->repack_format));
			WT_RET(__wt_snprintf(entry->repack_format,
			    len, "%s0x", cindex->iface.key_format));
		}
	}
	return (0);
}
/*
 * __wt_curjoin_join --
 *	Add a new join to a join cursor.
 *
 *	ref_cursor becomes a new endpoint (comparison) on the entry for index
 *	"idx"; the entry is created if no entry for that index exists yet.
 *	"flags" carries the entry-level settings and "range" the endpoint
 *	comparison; count and the two bloom_* values are per-entry
 *	configuration. When the entry has an index but no main cursor yet, a
 *	raw cursor is opened on the table name followed by the index's column
 *	configuration.
 *
 *	Returns 0 on success, EINVAL for configurations that conflict with
 *	what the matching entry already holds, otherwise the error from an
 *	allocation or cursor open.
 */
int
__wt_curjoin_join(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin,
    WT_INDEX *idx, WT_CURSOR *ref_cursor, uint8_t flags, uint8_t range,
    uint64_t count, uint32_t bloom_bit_count, uint32_t bloom_hash_count)
{
	WT_CURSOR_JOIN_ENTRY *entry;
	WT_DECL_RET;
	WT_CURSOR_JOIN_ENDPOINT *end, *newend;
	bool hasins, needbloom, range_eq;
	u_int i, ins, nonbloom;
	const char *raw_cfg[] = { WT_CONFIG_BASE(
	    session, WT_SESSION_open_cursor), "raw", NULL };
	char *main_uri;
	size_t namesize, newsize;

	entry = NULL;
	hasins = needbloom = false;
	ins = 0; /* -Wuninitialized */
	main_uri = NULL;
	nonbloom = 0; /* -Wuninitialized */

	/*
	 * Look for an existing entry on the same index to merge into. While
	 * scanning, remember the position of the first entry after entry 0
	 * that has no Bloom filter: a new Bloom entry is inserted there.
	 */
	for (i = 0; i < cjoin->entries_next; i++) {
		if (cjoin->entries[i].index == idx) {
			entry = &cjoin->entries[i];
			break;
		}
		if (!needbloom && i > 0 &&
		    !F_ISSET(&cjoin->entries[i], WT_CURJOIN_ENTRY_BLOOM)) {
			needbloom = true;
			nonbloom = i;
		}
	}
	if (entry == NULL) {
		WT_ERR(__wt_realloc_def(session, &cjoin->entries_allocated,
		    cjoin->entries_next + 1, &cjoin->entries));
		if (LF_ISSET(WT_CURJOIN_ENTRY_BLOOM) && needbloom) {
			/*
			 * Reorder the list so that after the first entry,
			 * the Bloom filtered entries come next, followed by
			 * the non-Bloom entries.  Once the Bloom filters
			 * are built, determining membership via Bloom is
			 * faster than without Bloom, so we can answer
			 * membership questions more quickly, and with less
			 * I/O, with the Bloom entries first.
			 */
			entry = &cjoin->entries[nonbloom];
			memmove(entry + 1, entry,
			    (cjoin->entries_next - nonbloom) *
			    sizeof(WT_CURSOR_JOIN_ENTRY));
			memset(entry, 0, sizeof(WT_CURSOR_JOIN_ENTRY));
		}
		else
			entry = &cjoin->entries[cjoin->entries_next];
		entry->index = idx;
		entry->flags = flags;
		entry->count = count;
		entry->bloom_bit_count = bloom_bit_count;
		entry->bloom_hash_count = bloom_hash_count;
		++cjoin->entries_next;
	} else {
		/* Merge the join into an existing entry for this index */
		if (count != 0 && entry->count != 0 && entry->count != count)
			WT_ERR_MSG(session, EINVAL,
			    "count=%" PRIu64 " does not match "
			    "previous count=%" PRIu64 " for this index",
			    count, entry->count);
		if (LF_MASK(WT_CURJOIN_ENTRY_BLOOM) !=
		    F_MASK(entry, WT_CURJOIN_ENTRY_BLOOM))
			WT_ERR_MSG(session, EINVAL,
			    "join has incompatible strategy "
			    "values for the same index");

		/*
		 * Check against other comparisons (we call them endpoints)
		 * already set up for this index.
		 * We allow either:
		 *   - one or more "eq" (with disjunction)
		 *   - exactly one "eq" (with conjunction)
		 *   - exactly one of "gt" or "ge" (conjunction or disjunction)
		 *   - exactly one of "lt" or "le" (conjunction or disjunction)
		 *   - one of "gt"/"ge" along with one of "lt"/"le"
		 *         (currently restricted to conjunction).
		 *
		 * Some other combinations, although expressible either do
		 * not make sense (X == 3 AND X == 5) or are reducible (X <
		 * 7 AND X < 9).  Other specific cases of (X < 7 OR X > 15)
		 * or (X == 4 OR X > 15) make sense but we don't handle yet.
		 */
		for (i = 0; i < entry->ends_next; i++) {
			end = &entry->ends[i];
			range_eq = (range == WT_CURJOIN_END_EQ);
			if ((F_ISSET(end, WT_CURJOIN_END_GT) &&
			    ((range & WT_CURJOIN_END_GT) != 0 || range_eq)) ||
			    (F_ISSET(end, WT_CURJOIN_END_LT) &&
			    ((range & WT_CURJOIN_END_LT) != 0 || range_eq)) ||
			    (end->flags == WT_CURJOIN_END_EQ &&
			    (range & (WT_CURJOIN_END_LT | WT_CURJOIN_END_GT))
			    != 0))
				WT_ERR_MSG(session, EINVAL,
				    "join has overlapping ranges");
			if (range == WT_CURJOIN_END_EQ &&
			    end->flags == WT_CURJOIN_END_EQ &&
			    !F_ISSET(entry, WT_CURJOIN_ENTRY_DISJUNCTION))
				WT_ERR_MSG(session, EINVAL,
				    "compare=eq can only be combined "
				    "using operation=or");

			/*
			 * Sort "gt"/"ge" to the front, followed by any number
			 * of "eq", and finally "lt"/"le".
			 */
			if (!hasins &&
			    ((range & WT_CURJOIN_END_GT) != 0 ||
			    (range == WT_CURJOIN_END_EQ &&
			    !F_ISSET(end, WT_CURJOIN_END_GT)))) {
				ins = i;
				hasins = true;
			}
		}
		/* All checks completed, merge any new configuration now */
		entry->count = count;
		entry->bloom_bit_count =
		    WT_MAX(entry->bloom_bit_count, bloom_bit_count);
		entry->bloom_hash_count =
		    WT_MAX(entry->bloom_hash_count, bloom_hash_count);
	}

	/*
	 * Open a zeroed endpoint slot at the sorted position found above (or
	 * at the end when none was chosen) and fill it in.
	 */
	WT_ERR(__wt_realloc_def(session, &entry->ends_allocated,
	    entry->ends_next + 1, &entry->ends));
	if (!hasins)
		ins = entry->ends_next;
	newend = &entry->ends[ins];
	memmove(newend + 1, newend,
	    (entry->ends_next - ins) * sizeof(WT_CURSOR_JOIN_ENDPOINT));
	memset(newend, 0, sizeof(WT_CURSOR_JOIN_ENDPOINT));
	entry->ends_next++;
	newend->cursor = ref_cursor;
	F_SET(newend, range);

	/* Open the main file with a projection of the indexed columns. */
	if (entry->main == NULL && entry->index != NULL) {
		/* Table name, the column configuration, and a nul byte. */
		namesize = strlen(cjoin->table->name);
		newsize = namesize + entry->index->colconf.len + 1;
		WT_ERR(__wt_calloc(session, 1, newsize, &main_uri));
		snprintf(main_uri, newsize, "%s%.*s",
		    cjoin->table->name, (int)entry->index->colconf.len,
		    entry->index->colconf.str);
		WT_ERR(__wt_open_cursor(session, main_uri,
		    (WT_CURSOR *)cjoin, raw_cfg, &entry->main));
	}

err:	/* The uri is only needed to open the cursor; freeing NULL is fine. */
	__wt_free(session, main_uri);
	return (ret);
}
/*
 * __wt_posix_directory_list --
 *	Get a list of files from a directory, POSIX version.
 *
 *	On success, *dirlistp is an allocated array of allocated file names
 *	and *countp is the number of names; "." and ".." are never included.
 *	When prefix is non-NULL only names starting with it are returned.
 *
 *	Returns 0 on success. On failure any partial list is freed and an
 *	error is logged and returned.
 */
int
__wt_posix_directory_list(WT_FILE_SYSTEM *file_system,
    WT_SESSION *wt_session, const char *directory,
    const char *prefix, char ***dirlistp, uint32_t *countp)
{
	struct dirent *dp;
	DIR *dirp;
	WT_DECL_RET;
	WT_SESSION_IMPL *session;
	size_t dirallocsz;
	uint32_t count;
	int tret;
	char **entries;
	const char *fname;

	WT_UNUSED(file_system);

	session = (WT_SESSION_IMPL *)wt_session;

	/* Nothing is returned until the whole directory has been read. */
	*dirlistp = NULL;
	*countp = 0;

	dirp = NULL;
	dirallocsz = 0;
	entries = NULL;

	WT_SYSCALL_RETRY(((dirp = opendir(directory)) == NULL ? -1 : 0), ret);
	if (ret != 0)
		WT_RET_MSG(session, ret,
		    "%s: directory-list: opendir", directory);

	count = 0;
	while ((dp = readdir(dirp)) != NULL) {
		fname = dp->d_name;

		/* The two directory self-references are never reported. */
		if (strcmp(fname, ".") == 0 || strcmp(fname, "..") == 0)
			continue;

		/* Apply the optional prefix filter. */
		if (prefix != NULL && !WT_PREFIX_MATCH(fname, prefix))
			continue;

		/* Grow the array by one and keep a copy of the name. */
		WT_ERR(__wt_realloc_def(
		    session, &dirallocsz, count + 1, &entries));
		WT_ERR(__wt_strdup(session, fname, &entries[count]));
		++count;
	}

	*dirlistp = entries;
	*countp = count;

err:	if (dirp != NULL) {
		WT_SYSCALL(closedir(dirp), tret);
		if (tret != 0) {
			__wt_err(session, tret,
			    "%s: directory-list: closedir", directory);
			/* A close failure only matters if nothing else failed. */
			if (ret == 0)
				ret = tret;
		}
	}

	if (ret != 0) {
		/* Discard whatever was collected before the failure. */
		WT_TRET(__wt_posix_directory_list_free(
		    file_system, wt_session, entries, count));

		WT_RET_MSG(session, ret,
		    "%s: directory-list, prefix \"%s\"",
		    directory, prefix == NULL ? "" : prefix);
	}
	return (0);
}
/* * __wt_lsm_meta_read -- * Read the metadata for an LSM tree. */ int __wt_lsm_meta_read(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) { WT_CONFIG cparser, lparser; WT_CONFIG_ITEM ck, cv, fileconf, lk, lv, metadata; WT_DECL_RET; WT_LSM_CHUNK *chunk; char *lsmconfig; u_int nchunks; chunk = NULL; /* -Wconditional-uninitialized */ /* LSM trees inherit the merge setting from the connection. */ if (F_ISSET(S2C(session), WT_CONN_LSM_MERGE)) F_SET(lsm_tree, WT_LSM_TREE_MERGES); WT_RET(__wt_metadata_search(session, lsm_tree->name, &lsmconfig)); WT_ERR(__wt_config_init(session, &cparser, lsmconfig)); while ((ret = __wt_config_next(&cparser, &ck, &cv)) == 0) { if (WT_STRING_MATCH("key_format", ck.str, ck.len)) { __wt_free(session, lsm_tree->key_format); WT_ERR(__wt_strndup(session, cv.str, cv.len, &lsm_tree->key_format)); } else if (WT_STRING_MATCH("value_format", ck.str, ck.len)) { __wt_free(session, lsm_tree->value_format); WT_ERR(__wt_strndup(session, cv.str, cv.len, &lsm_tree->value_format)); } else if (WT_STRING_MATCH("collator", ck.str, ck.len)) { if (cv.len == 0 || WT_STRING_MATCH("none", cv.str, cv.len)) continue; /* * Extract the application-supplied metadata (if any) * from the file configuration. */ WT_ERR(__wt_config_getones( session, lsmconfig, "file_config", &fileconf)); WT_CLEAR(metadata); WT_ERR_NOTFOUND_OK(__wt_config_subgets( session, &fileconf, "app_metadata", &metadata)); WT_ERR(__wt_collator_config(session, lsm_tree->name, &cv, &metadata, &lsm_tree->collator, &lsm_tree->collator_owned)); WT_ERR(__wt_strndup(session, cv.str, cv.len, &lsm_tree->collator_name)); } else if (WT_STRING_MATCH("bloom_config", ck.str, ck.len)) { __wt_free(session, lsm_tree->bloom_config); /* Don't include the brackets. */ WT_ERR(__wt_strndup(session, cv.str + 1, cv.len - 2, &lsm_tree->bloom_config)); } else if (WT_STRING_MATCH("file_config", ck.str, ck.len)) { __wt_free(session, lsm_tree->file_config); /* Don't include the brackets. 
*/ WT_ERR(__wt_strndup(session, cv.str + 1, cv.len - 2, &lsm_tree->file_config)); } else if (WT_STRING_MATCH("auto_throttle", ck.str, ck.len)) { if (cv.val) F_SET(lsm_tree, WT_LSM_TREE_THROTTLE); else F_CLR(lsm_tree, WT_LSM_TREE_THROTTLE); } else if (WT_STRING_MATCH("bloom", ck.str, ck.len)) lsm_tree->bloom = (uint32_t)cv.val; else if (WT_STRING_MATCH("bloom_bit_count", ck.str, ck.len)) lsm_tree->bloom_bit_count = (uint32_t)cv.val; else if (WT_STRING_MATCH("bloom_hash_count", ck.str, ck.len)) lsm_tree->bloom_hash_count = (uint32_t)cv.val; else if (WT_STRING_MATCH("chunk_count_limit", ck.str, ck.len)) { lsm_tree->chunk_count_limit = (uint32_t)cv.val; if (cv.val != 0) F_CLR(lsm_tree, WT_LSM_TREE_MERGES); } else if (WT_STRING_MATCH("chunk_max", ck.str, ck.len)) lsm_tree->chunk_max = (uint64_t)cv.val; else if (WT_STRING_MATCH("chunk_size", ck.str, ck.len)) lsm_tree->chunk_size = (uint64_t)cv.val; else if (WT_STRING_MATCH("merge_max", ck.str, ck.len)) lsm_tree->merge_max = (uint32_t)cv.val; else if (WT_STRING_MATCH("merge_min", ck.str, ck.len)) lsm_tree->merge_min = (uint32_t)cv.val; else if (WT_STRING_MATCH("last", ck.str, ck.len)) lsm_tree->last = (u_int)cv.val; else if (WT_STRING_MATCH("chunks", ck.str, ck.len)) { WT_ERR(__wt_config_subinit(session, &lparser, &cv)); for (nchunks = 0; (ret = __wt_config_next(&lparser, &lk, &lv)) == 0; ) { if (WT_STRING_MATCH("id", lk.str, lk.len)) { WT_ERR(__wt_realloc_def(session, &lsm_tree->chunk_alloc, nchunks + 1, &lsm_tree->chunk)); WT_ERR( __wt_calloc_one(session, &chunk)); lsm_tree->chunk[nchunks++] = chunk; chunk->id = (uint32_t)lv.val; WT_ERR(__wt_lsm_tree_chunk_name(session, lsm_tree, chunk->id, &chunk->uri)); F_SET(chunk, WT_LSM_CHUNK_ONDISK | WT_LSM_CHUNK_STABLE); } else if (WT_STRING_MATCH( "bloom", lk.str, lk.len)) { WT_ERR(__wt_lsm_tree_bloom_name( session, lsm_tree, chunk->id, &chunk->bloom_uri)); F_SET(chunk, WT_LSM_CHUNK_BLOOM); continue; } else if (WT_STRING_MATCH( "chunk_size", lk.str, lk.len)) { chunk->size = 
(uint64_t)lv.val; continue; } else if (WT_STRING_MATCH( "count", lk.str, lk.len)) { chunk->count = (uint64_t)lv.val; continue; } else if (WT_STRING_MATCH( "generation", lk.str, lk.len)) { chunk->generation = (uint32_t)lv.val; continue; } } WT_ERR_NOTFOUND_OK(ret); lsm_tree->nchunks = nchunks; } else if (WT_STRING_MATCH("old_chunks", ck.str, ck.len)) { WT_ERR(__wt_config_subinit(session, &lparser, &cv)); for (nchunks = 0; (ret = __wt_config_next(&lparser, &lk, &lv)) == 0; ) { if (WT_STRING_MATCH("bloom", lk.str, lk.len)) { WT_ERR(__wt_strndup(session, lv.str, lv.len, &chunk->bloom_uri)); F_SET(chunk, WT_LSM_CHUNK_BLOOM); continue; } WT_ERR(__wt_realloc_def(session, &lsm_tree->old_alloc, nchunks + 1, &lsm_tree->old_chunks)); WT_ERR(__wt_calloc_one(session, &chunk)); lsm_tree->old_chunks[nchunks++] = chunk; WT_ERR(__wt_strndup(session, lk.str, lk.len, &chunk->uri)); F_SET(chunk, WT_LSM_CHUNK_ONDISK); } WT_ERR_NOTFOUND_OK(ret); lsm_tree->nold_chunks = nchunks; } /* * Ignore any other values: the metadata entry might have been * created by a future release, with unknown options. */ } WT_ERR_NOTFOUND_OK(ret); /* * If the default merge_min was not overridden, calculate it now. We * do this here so that trees created before merge_min was added get a * sane value. */ if (lsm_tree->merge_min < 2) lsm_tree->merge_min = WT_MAX(2, lsm_tree->merge_max / 2); err: __wt_free(session, lsmconfig); return (ret); }
/*
 * __clsm_open_cursors --
 *     Open cursors for the current set of files.
 *
 *     Brings the LSM cursor's per-chunk cursor array (clsm->cursors) and
 * Bloom filter array (clsm->blooms) in line with the chunk array of
 * clsm->lsm_tree: works out how many already-open cursors can be kept,
 * closes the ones that are no longer needed, then opens a cursor on each
 * remaining chunk.
 *
 *     clsm         the LSM cursor being (re)opened.
 *     update       true when the cursors are wanted for an update; together
 *                  with the transaction isolation level and the cursor's
 *                  overwrite flag it decides which WT_CLSM_OPEN_* flags
 *                  are set.
 *     start_chunk  offset into the tree's chunk array of the first chunk;
 *                  for merge cursors it is re-derived from start_id if the
 *                  chunk at that offset no longer has that ID.
 *                  NOTE(review): non-merge callers presumably pass 0, since
 *                  the offset is still added when indexing the chunk array
 *                  -- confirm against the callers.
 *     start_id     ID of the chunk a merge cursor must start at.
 *
 *     Returns 0 immediately if the tree has no chunks; otherwise returns
 * the value accumulated in ret by the WT_ERR/WT_TRET macros (defined
 * elsewhere).
 */
static int
__clsm_open_cursors(
    WT_CURSOR_LSM *clsm, bool update, u_int start_chunk, uint32_t start_id)
{
    WT_BTREE *btree;
    WT_CURSOR *c, **cp, *primary;
    WT_DECL_RET;
    WT_LSM_CHUNK *chunk;
    WT_LSM_TREE *lsm_tree;
    WT_SESSION_IMPL *session;
    WT_TXN *txn;
    const char *checkpoint, *ckpt_cfg[3];
    uint64_t saved_gen;
    u_int i, nchunks, ngood, nupdates;
    u_int close_range_end, close_range_start;
    bool locked;

    c = &clsm->iface;
    session = (WT_SESSION_IMPL *)c->session;
    txn = &session->txn;
    chunk = NULL;
    /* Tracks whether the tree read lock is held, for the exit path. */
    locked = false;
    lsm_tree = clsm->lsm_tree;

    /*
     * Ensure that any snapshot update has cursors on the right set of
     * chunks to guarantee visibility is correct.
     */
    if (update && txn->isolation == WT_ISO_SNAPSHOT)
        F_SET(clsm, WT_CLSM_OPEN_SNAPSHOT);

    /*
     * Query operations need a full set of cursors. Overwrite cursors
     * do queries in service of updates.
     */
    if (!update || !F_ISSET(c, WT_CURSTD_OVERWRITE))
        F_SET(clsm, WT_CLSM_OPEN_READ);

    /* Nothing to open on an empty tree. */
    if (lsm_tree->nchunks == 0)
        return (0);

    /*
     * Configuration used when a chunk is opened from its checkpoint
     * rather than from the live tree.
     */
    ckpt_cfg[0] = WT_CONFIG_BASE(session, WT_SESSION_open_cursor);
    ckpt_cfg[1] = "checkpoint=" WT_CHECKPOINT ",raw";
    ckpt_cfg[2] = NULL;

    /*
     * If the key is pointing to memory that is pinned by a chunk
     * cursor, take a copy before closing cursors.
     */
    if (F_ISSET(c, WT_CURSTD_KEY_INT))
        WT_CURSOR_NEEDKEY(c);

    /* Any iteration in progress is invalidated by changing the cursors. */
    F_CLR(clsm, WT_CLSM_ITERATE_NEXT | WT_CLSM_ITERATE_PREV);

    WT_RET(__wt_lsm_tree_readlock(session, lsm_tree));
    locked = true;

    /* Merge cursors have already figured out how many chunks they need. */
retry:
    if (F_ISSET(clsm, WT_CLSM_MERGE)) {
        nchunks = clsm->nchunks;
        ngood = 0;

        /*
         * We may have raced with another merge completing. Check that
         * we're starting at the right offset in the chunk array.
         */
        if (start_chunk >= lsm_tree->nchunks ||
            lsm_tree->chunk[start_chunk]->id != start_id) {
            for (start_chunk = 0;
                start_chunk < lsm_tree->nchunks;
                start_chunk++) {
                chunk = lsm_tree->chunk[start_chunk];
                if (chunk->id == start_id)
                    break;
            }
            /* We have to find the start chunk: merge locked it. */
            WT_ASSERT(session, start_chunk < lsm_tree->nchunks);
        }

        WT_ASSERT(session, start_chunk + nchunks <= lsm_tree->nchunks);
    } else {
        nchunks = lsm_tree->nchunks;

        /*
         * If we are only opening the cursor for updates, only open the
         * primary chunk, plus any other chunks that might be required
         * to detect snapshot isolation conflicts.
         */
        if (F_ISSET(clsm, WT_CLSM_OPEN_SNAPSHOT))
            WT_ERR(__wt_realloc_def(session,
                &clsm->txnid_alloc, nchunks,
                &clsm->switch_txn));
        if (F_ISSET(clsm, WT_CLSM_OPEN_READ))
            ngood = nupdates = 0;
        else if (F_ISSET(clsm, WT_CLSM_OPEN_SNAPSHOT)) {
            /*
             * Keep going until all updates in the next
             * chunk are globally visible. Copy the maximum
             * transaction IDs into the cursor as we go.
             */
            for (ngood = nchunks - 1, nupdates = 1;
                ngood > 0;
                ngood--, nupdates++) {
                chunk = lsm_tree->chunk[ngood - 1];
                clsm->switch_txn[ngood - 1] =
                    chunk->switch_txn;
                if (__wt_txn_visible_all(
                    session, chunk->switch_txn))
                    break;
            }
        } else {
            /* Update-only, no snapshot: just the last chunk. */
            nupdates = 1;
            ngood = nchunks - 1;
        }

        /*
         * Check how many cursors are already open. Starting at the
         * first slot that is needed, ngood advances past every
         * existing cursor that still matches its chunk and stops at
         * the first slot that has to be (re)opened.
         */
        for (cp = clsm->cursors + ngood;
            ngood < clsm->nchunks && ngood < nchunks;
            cp++, ngood++) {
            chunk = lsm_tree->chunk[ngood];

            /* If the cursor isn't open yet, we're done. */
            if (*cp == NULL)
                break;

            /* Easy case: the URIs don't match. */
            if (strcmp((*cp)->uri, chunk->uri) != 0)
                break;

            /* Make sure the checkpoint config matches. */
            checkpoint = ((WT_CURSOR_BTREE *)*cp)->
                btree->dhandle->checkpoint;
            if (checkpoint == NULL &&
                F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) &&
                !chunk->empty)
                break;

            /* Make sure the Bloom config matches. */
            if (clsm->blooms[ngood] == NULL &&
                F_ISSET(chunk, WT_LSM_CHUNK_BLOOM))
                break;
        }

        /* Spurious generation bump? */
        if (ngood == clsm->nchunks && clsm->nchunks == nchunks) {
            clsm->dsk_gen = lsm_tree->dsk_gen;
            goto err;
        }

        /*
         * Close any cursors we no longer need.
         *
         * Drop the LSM tree lock while we do this: if the cache is
         * full, we may block while closing a cursor. Save the
         * generation number and retry if it has changed under us.
         *
         * The range to close is either everything from the first
         * mismatching slot to the end of the old array, or, for an
         * update-only open, the leading slots in front of the last
         * nupdates chunks.
         */
        if (clsm->cursors != NULL && ngood < clsm->nchunks) {
            close_range_start = ngood;
            close_range_end = clsm->nchunks;
        } else if (!F_ISSET(clsm, WT_CLSM_OPEN_READ) && nupdates > 0 ) {
            close_range_start = 0;
            close_range_end = WT_MIN(nchunks, clsm->nchunks);
            /* Guard the unsigned subtraction against wrapping. */
            if (close_range_end > nupdates)
                close_range_end -= nupdates;
            else
                close_range_end = 0;
            WT_ASSERT(session, ngood >= close_range_end);
        } else {
            close_range_end = 0;
            close_range_start = 0;
        }
        if (close_range_end > close_range_start) {
            saved_gen = lsm_tree->dsk_gen;
            /* Clear the flag first so a failed unlock isn't repeated. */
            locked = false;
            WT_ERR(__wt_lsm_tree_readunlock(session, lsm_tree));
            WT_ERR(__clsm_close_cursors(
                clsm, close_range_start, close_range_end));
            WT_ERR(__wt_lsm_tree_readlock(session, lsm_tree));
            locked = true;
            if (lsm_tree->dsk_gen != saved_gen)
                goto retry;
        }

        /* Detach from our old primary. */
        clsm->primary_chunk = NULL;
        clsm->current = NULL;
    }

    /* Make sure both per-chunk arrays have room for nchunks entries. */
    WT_ERR(__wt_realloc_def(session,
        &clsm->bloom_alloc, nchunks, &clsm->blooms));
    WT_ERR(__wt_realloc_def(session,
        &clsm->cursor_alloc, nchunks, &clsm->cursors));

    clsm->nchunks = nchunks;

    /* Open the cursors for chunks that have changed. */
    for (i = ngood, cp = clsm->cursors + i; i != nchunks; i++, cp++) {
        chunk = lsm_tree->chunk[i + start_chunk];
        /* Copy the maximum transaction ID. */
        if (F_ISSET(clsm, WT_CLSM_OPEN_SNAPSHOT))
            clsm->switch_txn[i] = chunk->switch_txn;

        /*
         * Read from the checkpoint if the file has been written.
         * Once all cursors switch, the in-memory tree can be evicted.
         */
        WT_ASSERT(session, *cp == NULL);
        ret = __wt_open_cursor(session, chunk->uri, c,
            (F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) && !chunk->empty) ?
            ckpt_cfg : NULL, cp);

        /*
         * XXX kludge: we may have an empty chunk where no checkpoint
         * was written. If so, try to open the ordinary handle on that
         * chunk instead.
         */
        if (ret == WT_NOTFOUND && F_ISSET(chunk, WT_LSM_CHUNK_ONDISK)) {
            ret = __wt_open_cursor(
                session, chunk->uri, c, NULL, cp);
            /* Remember the chunk is empty so later opens skip the checkpoint. */
            if (ret == 0)
                chunk->empty = 1;
        }
        WT_ERR(ret);

        /*
         * Setup all cursors other than the primary to only do conflict
         * checks on insert operations. This allows us to execute
         * inserts on non-primary chunks as a way of checking for
         * write conflicts with concurrent updates.
         */
        if (i != nchunks - 1)
            (*cp)->insert = __wt_curfile_update_check;

        /* Merge cursors never open Bloom filters. */
        if (!F_ISSET(clsm, WT_CLSM_MERGE) &&
            F_ISSET(chunk, WT_LSM_CHUNK_BLOOM))
            WT_ERR(__wt_bloom_open(session, chunk->bloom_uri,
                lsm_tree->bloom_bit_count,
                lsm_tree->bloom_hash_count,
                c, &clsm->blooms[i]));

        /* Child cursors always use overwrite and raw mode. */
        F_SET(*cp, WT_CURSTD_OVERWRITE | WT_CURSTD_RAW);
    }

    /*
     * The last chunk is our new primary, provided it is still in memory
     * and has not been assigned a switch transaction.
     */
    if (chunk != NULL &&
        !F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) &&
        chunk->switch_txn == WT_TXN_NONE) {
        clsm->primary_chunk = chunk;
        primary = clsm->cursors[clsm->nchunks - 1];
        /*
         * Disable eviction for the in-memory chunk. Also clear the
         * bulk load flag here, otherwise eviction will be enabled by
         * the first update.
         */
        btree = ((WT_CURSOR_BTREE *)(primary))->btree;
        if (btree->bulk_load_ok) {
            btree->bulk_load_ok = false;
            WT_WITH_BTREE(session, btree,
                __wt_btree_evictable(session, false));
        }
    }

    /* Record the tree generation these cursors correspond to. */
    clsm->dsk_gen = lsm_tree->dsk_gen;

err:
#ifdef HAVE_DIAGNOSTIC
    /* Check that all cursors are open as expected. */
    if (ret == 0 && F_ISSET(clsm, WT_CLSM_OPEN_READ)) {
        for (i = 0, cp = clsm->cursors;
            i != clsm->nchunks;
            cp++, i++) {
            chunk = lsm_tree->chunk[i + start_chunk];

            /* Make sure the cursor is open. */
            WT_ASSERT(session, *cp != NULL);

            /* Easy case: the URIs should match. */
            WT_ASSERT(session,
                strcmp((*cp)->uri, chunk->uri) == 0);

            /* Make sure the checkpoint config matches. */
            checkpoint = ((WT_CURSOR_BTREE *)*cp)->
                btree->dhandle->checkpoint;
            WT_ASSERT(session,
                (F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) &&
                !chunk->empty) ?
                checkpoint != NULL : checkpoint == NULL);

            /* Make sure the Bloom config matches. */
            WT_ASSERT(session,
                (F_ISSET(chunk, WT_LSM_CHUNK_BLOOM) &&
                !F_ISSET(clsm, WT_CLSM_MERGE)) ?
                clsm->blooms[i] != NULL : clsm->blooms[i] == NULL);
        }
    }
#endif
    /* Release the tree lock if we still hold it, keeping any earlier error. */
    if (locked)
        WT_TRET(__wt_lsm_tree_readunlock(session, lsm_tree));
    return (ret);
}
/*
 * __clsm_open_cursors --
 *     Open cursors for the current set of files.
 *
 *     Brings the LSM cursor's per-chunk cursor array (clsm->cursors) and
 * Bloom filter array (clsm->blooms) in line with the chunk array of
 * clsm->lsm_tree. For an update with no usable in-memory chunk at the
 * end of the tree, a tree switch is requested first.
 *
 *     clsm         the LSM cursor being (re)opened.
 *     update       non-zero when the cursors are wanted for an update;
 *                  zero opens the full set needed for reads.
 *     start_chunk  offset into the tree's chunk array of the first chunk;
 *                  for merge cursors it is re-derived from start_id if the
 *                  chunk at that offset no longer has that ID.
 *                  NOTE(review): non-merge callers presumably pass 0, since
 *                  the offset is still added when indexing the chunk array
 *                  -- confirm against the callers.
 *     start_id     ID of the chunk a merge cursor must start at.
 *
 *     Returns the value accumulated in ret by the WT_ERR/WT_TRET macros
 * (defined elsewhere); failures before the tree lock is taken return
 * straight out through WT_RET.
 */
static int
__clsm_open_cursors(
    WT_CURSOR_LSM *clsm, int update, u_int start_chunk, uint32_t start_id)
{
    WT_CURSOR *c, **cp, *primary;
    WT_DECL_RET;
    WT_LSM_CHUNK *chunk;
    WT_LSM_TREE *lsm_tree;
    WT_SESSION_IMPL *session;
    WT_TXN *txn;
    const char *checkpoint, *ckpt_cfg[3];
    uint64_t saved_gen;
    u_int i, nchunks, ngood, nupdates;
    int locked;

    c = &clsm->iface;
    session = (WT_SESSION_IMPL *)c->session;
    txn = &session->txn;
    lsm_tree = clsm->lsm_tree;
    chunk = NULL;

    /*
     * Configuration used when a chunk is opened from its checkpoint
     * rather than from the live tree.
     */
    ckpt_cfg[0] = WT_CONFIG_BASE(session, session_open_cursor);
    ckpt_cfg[1] = "checkpoint=" WT_CHECKPOINT ",raw";
    ckpt_cfg[2] = NULL;

    /* Copy the key, so we don't lose the cursor position. */
    if (F_ISSET(c, WT_CURSTD_KEY_INT) && !WT_DATA_IN_ITEM(&c->key))
        WT_RET(__wt_buf_set(
            session, &c->key, c->key.data, c->key.size));

    /* Any iteration in progress is invalidated by changing the cursors. */
    F_CLR(clsm, WT_CLSM_ITERATE_NEXT | WT_CLSM_ITERATE_PREV);

    /*
     * Updates under snapshot isolation need extra chunks for conflict
     * detection; reads need the full set of cursors.
     */
    if (update) {
        if (txn->isolation == TXN_ISO_SNAPSHOT)
            F_SET(clsm, WT_CLSM_OPEN_SNAPSHOT);
    } else
        F_SET(clsm, WT_CLSM_OPEN_READ);

    /*
     * NOTE(review): the final argument is presumably "exclusive"; 0 would
     * then request a shared lock -- confirm against __wt_lsm_tree_lock.
     */
    WT_RET(__wt_lsm_tree_lock(session, lsm_tree, 0));
    /* Tracks whether the tree lock is held, for the exit path. */
    locked = 1;

    /*
     * If there is no in-memory chunk in the tree for an update operation,
     * create one.
     *
     * !!!
     * It is exceeding unlikely that we get here at all, but if there is a
     * transaction in progress and it rolls back, it would leave the
     * metadata inconsistent.
     */
    if (update && (lsm_tree->nchunks == 0 ||
        (chunk = lsm_tree->chunk[lsm_tree->nchunks - 1]) == NULL ||
        F_ISSET(chunk, WT_LSM_CHUNK_ONDISK))) {
        /* Release our lock because switch will get a write lock. */
        locked = 0;
        WT_ERR(__wt_lsm_tree_unlock(session, lsm_tree));
        WT_ERR(__wt_lsm_tree_switch(session, lsm_tree));
        WT_ERR(__wt_lsm_tree_lock(session, lsm_tree, 0));
        locked = 1;
    }
    /* Set for the rest of the function; cleared again at the err label. */
    F_SET(session, WT_SESSION_NO_CACHE_CHECK);

    /* Merge cursors have already figured out how many chunks they need. */
retry:
    if (F_ISSET(clsm, WT_CLSM_MERGE)) {
        nchunks = clsm->nchunks;
        ngood = 0;

        /*
         * We may have raced with another merge completing. Check that
         * we're starting at the right offset in the chunk array.
         */
        if (start_chunk >= lsm_tree->nchunks ||
            lsm_tree->chunk[start_chunk]->id != start_id) {
            for (start_chunk = 0;
                start_chunk < lsm_tree->nchunks;
                start_chunk++) {
                chunk = lsm_tree->chunk[start_chunk];
                if (chunk->id == start_id)
                    break;
            }
            /* We have to find the start chunk: merge locked it. */
            WT_ASSERT(session, start_chunk < lsm_tree->nchunks);
        }

        WT_ASSERT(session, start_chunk + nchunks <= lsm_tree->nchunks);
    } else {
        nchunks = lsm_tree->nchunks;

        /*
         * If we are only opening the cursor for updates, only open the
         * primary chunk, plus any other chunks that might be required
         * to detect snapshot isolation conflicts.
         */
        if (F_ISSET(clsm, WT_CLSM_OPEN_SNAPSHOT))
            WT_ERR(__wt_realloc_def(session,
                &clsm->txnid_alloc, nchunks,
                &clsm->txnid_max));
        if (F_ISSET(clsm, WT_CLSM_OPEN_READ))
            ngood = nupdates = 0;
        else if (F_ISSET(clsm, WT_CLSM_OPEN_SNAPSHOT)) {
            /*
             * Keep going until all updates in the next
             * chunk are globally visible. Copy the maximum
             * transaction IDs into the cursor as we go.
             */
            for (ngood = nchunks - 1, nupdates = 1;
                ngood > 0;
                ngood--, nupdates++) {
                chunk = lsm_tree->chunk[ngood - 1];
                clsm->txnid_max[ngood - 1] =
                    chunk->txnid_max;
                if (__wt_txn_visible_all(
                    session, chunk->txnid_max))
                    break;
            }
        } else {
            /* Update-only, no snapshot: just the last chunk. */
            nupdates = 1;
            ngood = nchunks - 1;
        }

        /*
         * Check how many cursors are already open. Starting at the
         * first slot that is needed, ngood advances past every
         * existing cursor that still matches its chunk and stops at
         * the first slot that has to be (re)opened.
         */
        for (cp = clsm->cursors + ngood;
            ngood < clsm->nchunks && ngood < nchunks;
            cp++, ngood++) {
            chunk = lsm_tree->chunk[ngood];

            /* If the cursor isn't open yet, we're done. */
            if (*cp == NULL)
                break;

            /* Easy case: the URIs don't match. */
            if (strcmp((*cp)->uri, chunk->uri) != 0)
                break;

            /* Make sure the checkpoint config matches. */
            checkpoint = ((WT_CURSOR_BTREE *)*cp)->
                btree->dhandle->checkpoint;
            if (checkpoint == NULL &&
                F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) &&
                !chunk->empty)
                break;

            /* Make sure the Bloom config matches. */
            if (clsm->blooms[ngood] == NULL &&
                F_ISSET(chunk, WT_LSM_CHUNK_BLOOM))
                break;
        }

        /* Spurious generation bump? */
        if (ngood == clsm->nchunks && clsm->nchunks == nchunks) {
            clsm->dsk_gen = lsm_tree->dsk_gen;
            goto err;
        }

        /*
         * Close any cursors we no longer need.
         *
         * Drop the LSM tree lock while we do this: if the cache is
         * full, we may block while closing a cursor. Save the
         * generation number and retry if it has changed under us.
         */
        if (clsm->cursors != NULL &&
            (ngood < clsm->nchunks ||
            (!F_ISSET(clsm, WT_CLSM_OPEN_READ) && nupdates > 0))) {
            saved_gen = lsm_tree->dsk_gen;
            /* Clear the flag first so a failed unlock isn't repeated. */
            locked = 0;
            WT_ERR(__wt_lsm_tree_unlock(session, lsm_tree));
            /*
             * For an update-only open, close the leading cursors in
             * front of the last nupdates chunks.
             * NOTE(review): the end of this range is computed from the
             * tree's chunk count, not the cursor's current array size
             * -- confirm __clsm_close_cursors tolerates an end beyond
             * clsm->nchunks.
             */
            if (!F_ISSET(clsm, WT_CLSM_OPEN_READ) && nupdates > 0)
                WT_ERR(__clsm_close_cursors(
                    clsm, 0, nchunks - nupdates));
            /* Close everything from the first mismatching slot onwards. */
            WT_ERR(__clsm_close_cursors(
                clsm, ngood, clsm->nchunks));
            WT_ERR(__wt_lsm_tree_lock(session, lsm_tree, 0));
            locked = 1;
            if (lsm_tree->dsk_gen != saved_gen)
                goto retry;
        }

        /* Detach from our old primary. */
        clsm->primary_chunk = NULL;
        clsm->current = NULL;
    }

    /* Make sure both per-chunk arrays have room for nchunks entries. */
    WT_ERR(__wt_realloc_def(session,
        &clsm->bloom_alloc, nchunks, &clsm->blooms));
    WT_ERR(__wt_realloc_def(session,
        &clsm->cursor_alloc, nchunks, &clsm->cursors));

    clsm->nchunks = nchunks;

    /* Open the cursors for chunks that have changed. */
    for (i = ngood, cp = clsm->cursors + i; i != nchunks; i++, cp++) {
        chunk = lsm_tree->chunk[i + start_chunk];
        /* Copy the maximum transaction ID. */
        if (F_ISSET(clsm, WT_CLSM_OPEN_SNAPSHOT))
            clsm->txnid_max[i] = chunk->txnid_max;

        /*
         * Read from the checkpoint if the file has been written.
         * Once all cursors switch, the in-memory tree can be evicted.
         */
        WT_ASSERT(session, *cp == NULL);
        ret = __wt_open_cursor(session, chunk->uri, c,
            (F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) && !chunk->empty) ?
            ckpt_cfg : NULL, cp);

        /*
         * XXX kludge: we may have an empty chunk where no checkpoint
         * was written. If so, try to open the ordinary handle on that
         * chunk instead.
         */
        if (ret == WT_NOTFOUND && F_ISSET(chunk, WT_LSM_CHUNK_ONDISK)) {
            ret = __wt_open_cursor(
                session, chunk->uri, c, NULL, cp);
            /* Remember the chunk is empty so later opens skip the checkpoint. */
            if (ret == 0)
                chunk->empty = 1;
        }
        WT_ERR(ret);

        /* Merge cursors never open Bloom filters. */
        if (F_ISSET(chunk, WT_LSM_CHUNK_BLOOM) &&
            !F_ISSET(clsm, WT_CLSM_MERGE))
            WT_ERR(__wt_bloom_open(session, chunk->bloom_uri,
                lsm_tree->bloom_bit_count,
                lsm_tree->bloom_hash_count,
                c, &clsm->blooms[i]));

        /* Child cursors always use overwrite and raw mode. */
        F_SET(*cp, WT_CURSTD_OVERWRITE | WT_CURSTD_RAW);
    }

    /*
     * The last chunk is our new primary; mark its tree as not evictable
     * while it is the in-memory chunk.
     */
    if (chunk != NULL && !F_ISSET(chunk, WT_LSM_CHUNK_ONDISK)) {
        clsm->primary_chunk = chunk;
        primary = clsm->cursors[clsm->nchunks - 1];
        WT_WITH_BTREE(session,
            ((WT_CURSOR_BTREE *)(primary))->btree,
            __wt_btree_evictable(session, 0));
    }

    /* Record the tree generation these cursors correspond to. */
    clsm->dsk_gen = lsm_tree->dsk_gen;

    /*
     * NOTE(review): the flag is cleared unconditionally, including on
     * paths that jump here before it was set -- confirm no caller relies
     * on it already being set on entry.
     */
err:
    F_CLR(session, WT_SESSION_NO_CACHE_CHECK);
#ifdef HAVE_DIAGNOSTIC
    /* Check that all cursors are open as expected. */
    if (ret == 0 && F_ISSET(clsm, WT_CLSM_OPEN_READ)) {
        for (i = 0, cp = clsm->cursors;
            i != clsm->nchunks;
            cp++, i++) {
            chunk = lsm_tree->chunk[i + start_chunk];

            /* Make sure the cursor is open. */
            WT_ASSERT(session, *cp != NULL);

            /* Easy case: the URIs should match. */
            WT_ASSERT(session,
                strcmp((*cp)->uri, chunk->uri) == 0);

            /* Make sure the checkpoint config matches. */
            checkpoint = ((WT_CURSOR_BTREE *)*cp)->
                btree->dhandle->checkpoint;
            WT_ASSERT(session,
                (F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) &&
                !chunk->empty) ?
                checkpoint != NULL : checkpoint == NULL);

            /* Make sure the Bloom config matches. */
            WT_ASSERT(session,
                (F_ISSET(chunk, WT_LSM_CHUNK_BLOOM) &&
                !F_ISSET(clsm, WT_CLSM_MERGE)) ?
                clsm->blooms[i] != NULL : clsm->blooms[i] == NULL);
        }
    }
#endif
    /* Release the tree lock if we still hold it, keeping any earlier error. */
    if (locked)
        WT_TRET(__wt_lsm_tree_unlock(session, lsm_tree));
    return (ret);
}
/*
 * __config_merge_scan --
 *     Walk a configuration string, inserting entries into the merged array.
 *
 *     session  the session used for scratch buffers, allocation and error
 *              reporting.
 *     key      prefix for every key found in this string: NULL at the top
 *              level, otherwise the (already prefixed) name of the
 *              enclosing structure when recursing.
 *     value    the configuration string to scan.
 *     cp       the merge array; one entry is appended per leaf key/value
 *              pair, with its generation set to its insertion position.
 *
 *     Returns the value left in ret by the WT_ERR* macros (defined
 * elsewhere); EINVAL is raised for a key that is neither a string nor an
 * identifier, or that contains the separator character.
 */
static int
__config_merge_scan(WT_SESSION_IMPL *session,
    const char *key, const char *value, WT_CONFIG_MERGE *cp)
{
    WT_CONFIG cparser;
    WT_CONFIG_ITEM k, v;
    WT_DECL_ITEM(kb);
    WT_DECL_ITEM(vb);
    WT_DECL_RET;
    size_t len;

    WT_ERR(__wt_scr_alloc(session, 0, &kb));
    WT_ERR(__wt_scr_alloc(session, 0, &vb));

    WT_ERR(__wt_config_init(session, &cparser, value));
    while ((ret = __wt_config_next(&cparser, &k, &v)) == 0) {
        /*
         * The key is a length-delimited slice of the configuration
         * string, not a NUL-terminated string of its own: bound the
         * output by its length so the message names only the key
         * instead of the key plus everything that follows it.
         */
        if (k.type != WT_CONFIG_ITEM_STRING &&
            k.type != WT_CONFIG_ITEM_ID)
            WT_ERR_MSG(session, EINVAL,
                "Invalid configuration key found: '%.*s'\n",
                (int)k.len, k.str);

        /* Include the quotes around string keys/values. */
        if (k.type == WT_CONFIG_ITEM_STRING) {
            --k.str;
            k.len += 2;
        }
        if (v.type == WT_CONFIG_ITEM_STRING) {
            --v.str;
            v.len += 2;
        }

        /*
         * !!!
         * We're using a JSON quote character to separate the names we
         * create for nested structures. That's not completely safe as
         * it's possible to quote characters in JSON such that a quote
         * character appears as a literal character in a key name. In
         * a few cases, applications can create their own key namespace
         * (for example, shared library extension names), and therefore
         * it's possible for an application to confuse us. Error if we
         * ever see a key with a magic character.
         */
        for (len = 0; len < k.len; ++len)
            if (k.str[len] == SEPC)
                WT_ERR_MSG(session, EINVAL,
                    "key %.*s contains a '%c' separator "
                    "character",
                    (int)k.len, (char *)k.str, SEPC);

        /* Build the key/value strings. */
        WT_ERR(__wt_buf_fmt(session,
            kb, "%s%s%.*s",
            key == NULL ? "" : key,
            key == NULL ? "" : SEP,
            (int)k.len, k.str));
        WT_ERR(__wt_buf_fmt(session,
            vb, "%.*s", (int)v.len, v.str));

        /*
         * If the value is a structure, recursively parse it.
         *
         * !!!
         * Don't merge unless the structure has field names. WiredTiger
         * stores checkpoint LSNs in the metadata file using nested
         * structures without field names: "checkpoint_lsn=(1,0)", not
         * "checkpoint_lsn=(file=1,offset=0)". The value type is still
         * WT_CONFIG_ITEM_STRUCT, so we check for a field name in the
         * value.
         */
        if (v.type == WT_CONFIG_ITEM_STRUCT &&
            strchr(vb->data, '=') != NULL) {
            WT_ERR(__config_merge_scan(
                session, kb->data, vb->data, cp));
            continue;
        }

        /* Insert the value into the array. */
        WT_ERR(__wt_realloc_def(session,
            &cp->entries_allocated,
            cp->entries_next + 1, &cp->entries));
        WT_ERR(__wt_strndup(session,
            kb->data, kb->size, &cp->entries[cp->entries_next].k));
        WT_ERR(__wt_strndup(session,
            vb->data, vb->size, &cp->entries[cp->entries_next].v));
        /* Later entries get higher generations than earlier ones. */
        cp->entries[cp->entries_next].gen = cp->entries_next;
        ++cp->entries_next;
    }
    /* Running off the end of the configuration string is not an error. */
    WT_ERR_NOTFOUND_OK(ret);

err:
    __wt_scr_free(session, &kb);
    __wt_scr_free(session, &vb);
    return (ret);
}
/*
 * __wt_lsm_tree_switch --
 *     Append a fresh in-memory chunk to the LSM tree and make it current.
 *
 *     The work is done under the tree's write lock. If another thread got
 * there first (the last chunk is still in memory and no switch has been
 * requested), nothing is added. Any failure is escalated through
 * WT_PANIC_RET; on success, unless this created the tree's very first
 * chunk, a flush work unit is queued with the LSM manager.
 */
int
__wt_lsm_tree_switch(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
{
    WT_DECL_RET;
    WT_LSM_CHUNK *chunk;
    uint32_t nchunks, new_id;
    int first_switch;

    WT_RET(__wt_lsm_tree_writelock(session, lsm_tree));

    nchunks = lsm_tree->nchunks;
    first_switch = (nchunks == 0);

    /* The chunk currently at the end of the tree, if there is one. */
    chunk = NULL;
    if (!first_switch)
        chunk = lsm_tree->chunk[nchunks - 1];

    /*
     * We may have lost a race while waiting for the lock: if the last
     * chunk is still in memory and nobody asked for a switch, we're done.
     */
    if (chunk != NULL &&
        !F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) &&
        !F_ISSET(lsm_tree, WT_LSM_TREE_NEED_SWITCH))
        goto err;

    /* Stamp the outgoing chunk with a switch transaction if it has none. */
    if (chunk != NULL && chunk->switch_txn == WT_TXN_NONE)
        chunk->switch_txn = __wt_txn_new_id(session);

    /* Refresh the throttle settings before growing the tree. */
    __wt_lsm_tree_throttle(session, lsm_tree, 0);

    new_id = WT_ATOMIC_ADD4(lsm_tree->last, 1);

    /* Make room for one more chunk pointer. */
    WT_ERR(__wt_realloc_def(session,
        &lsm_tree->chunk_alloc, nchunks + 1, &lsm_tree->chunk));

    WT_ERR(__wt_verbose(session, WT_VERB_LSM,
        "Tree %s switch to: %" PRIu32 ", checkpoint throttle %ld, "
        "merge throttle %ld",
        lsm_tree->name, new_id,
        lsm_tree->ckpt_throttle, lsm_tree->merge_throttle));

    /* Allocate the new chunk and link it in as the last chunk. */
    WT_ERR(__wt_calloc_def(session, 1, &chunk));
    chunk->id = new_id;
    chunk->switch_txn = WT_TXN_NONE;
    lsm_tree->chunk[lsm_tree->nchunks++] = chunk;
    WT_ERR(__wt_lsm_tree_setup_chunk(session, lsm_tree, chunk));

    /* Persist the new layout, then publish the change. */
    WT_ERR(__wt_lsm_meta_write(session, lsm_tree));
    F_CLR(lsm_tree, WT_LSM_TREE_NEED_SWITCH);
    ++lsm_tree->dsk_gen;

    lsm_tree->modified = 1;

err:
    WT_TRET(__wt_lsm_tree_writeunlock(session, lsm_tree));

    /*
     * A failed switch leaves the tree unable to make progress, so any
     * error is escalated to a panic. Otherwise, queue a flush unless this
     * was the tree's very first chunk.
     */
    if (ret != 0) {
        WT_PANIC_RET(session, ret, "Failed doing LSM switch");
    } else if (!first_switch) {
        WT_RET(__wt_lsm_manager_push_entry(
            session, WT_LSM_WORK_FLUSH, 0, lsm_tree));
    }
    return (ret);
}
/*
 * __wt_dirlist --
 *     Get a list of files from a directory, optionally filtered by
 * a given prefix.
 *
 *     session  the session used for allocation and error reporting.
 *     dir      the directory to list; resolved through __wt_filename.
 *     prefix   name prefix to filter on, or NULL to return every entry.
 *     flags    WT_DIRLIST_INCLUDE returns names that match the prefix,
 *              WT_DIRLIST_EXCLUDE returns names that do not; 0 is treated
 *              as WT_DIRLIST_INCLUDE.
 *     dirlist  on success, set to an allocated array of allocated names,
 *              or left NULL if nothing matched; the caller owns the array
 *              and the strings.
 *     countp   on success, set to the number of names returned.
 *
 *     Returns 0 on success. On failure nothing is handed back to the
 * caller: *dirlist stays NULL, *countp stays 0, and every name collected
 * so far is freed here.
 */
int
__wt_dirlist(WT_SESSION_IMPL *session, const char *dir, const char *prefix,
    uint32_t flags, char ***dirlist, u_int *countp)
{
    struct dirent *dp;
    DIR *dirp;
    WT_DECL_RET;
    size_t dirallocsz;
    u_int count, dirsz;
    int match;
    char **entries, *path;

    *dirlist = NULL;
    *countp = 0;

    WT_RET(__wt_filename(session, dir, &path));

    /*
     * Initialize everything the cleanup code reads before the first
     * statement that can jump to it.
     */
    dirp = NULL;
    dirallocsz = 0;
    count = dirsz = 0;
    entries = NULL;
    if (flags == 0)
        LF_SET(WT_DIRLIST_INCLUDE);

    WT_ERR(__wt_verbose(session, WT_VERB_FILEOPS,
        "wt_dirlist of %s %s prefix %s",
        path, LF_ISSET(WT_DIRLIST_INCLUDE) ? "include" : "exclude",
        prefix == NULL ? "all" : prefix));

    WT_SYSCALL_RETRY(((dirp = opendir(path)) == NULL ? 1 : 0), ret);
    if (ret != 0)
        WT_ERR_MSG(session, ret, "%s: opendir", path);

    while ((dp = readdir(dirp)) != NULL) {
        /*
         * Skip . and ..
         */
        if (strcmp(dp->d_name, ".") == 0 ||
            strcmp(dp->d_name, "..") == 0)
            continue;

        match = 0;
        if (prefix != NULL &&
            ((LF_ISSET(WT_DIRLIST_INCLUDE) &&
            WT_PREFIX_MATCH(dp->d_name, prefix)) ||
            (LF_ISSET(WT_DIRLIST_EXCLUDE) &&
            !WT_PREFIX_MATCH(dp->d_name, prefix))))
            match = 1;
        if (prefix != NULL && !match)
            continue;

        /*
         * We have a file name we want to return. Only advance the
         * capacity and the count once the corresponding allocation
         * has succeeded, so on any failure entries[0..count-1] are
         * exactly the names that need to be freed.
         */
        if (count >= dirsz) {
            WT_ERR(__wt_realloc_def(session,
                &dirallocsz, dirsz + WT_DIR_ENTRY, &entries));
            dirsz += WT_DIR_ENTRY;
        }
        WT_ERR(__wt_strdup(session, dp->d_name, &entries[count]));
        ++count;
    }
    if (count > 0)
        *dirlist = entries;
    *countp = count;

err:
    if (dirp != NULL)
        (void)closedir(dirp);
    __wt_free(session, path);

    if (ret == 0)
        return (0);

    /*
     * Nothing has been handed to the caller on the error path (every
     * failure jumps here before *dirlist is set), so the names gathered
     * so far are still ours to free. The index expression is kept free
     * of side effects because __wt_free may be a macro.
     */
    if (entries != NULL) {
        for (; count > 0; count--)
            __wt_free(session, entries[count - 1]);
        __wt_free(session, entries);
    }

    /* A NULL prefix must not be passed to a %s conversion. */
    WT_RET_MSG(session, ret, "dirlist %s prefix %s",
        dir, prefix == NULL ? "all" : prefix);
}
/*
 * __schema_open_index --
 *     Open one or more indices for a table (internal version).
 *
 *     Walks the metadata entries whose keys start with
 * "index:<tablename>:" in cursor order, keeping table->indices sorted
 * and in sync with them, and opens either the one index named by
 * idxname or, when idxname is NULL, all of them.
 *
 *     session  the session used for the metadata cursor, allocation and
 *              scratch buffers.
 *     table    the table whose index list is updated.
 *     idxname  name of a single index to open, or NULL for all indices.
 *     len      length of idxname, used when matching names.
 *     indexp   if not NULL, set to the slot for a matched index.
 *
 *     Returns 0 immediately if all indices were requested and the table
 * is already marked complete; WT_NOTFOUND if a named index was requested
 * and no metadata entry matched; otherwise the value accumulated in ret
 * by the WT_ERR/WT_TRET macros (defined elsewhere).
 */
static int
__schema_open_index(WT_SESSION_IMPL *session,
    WT_TABLE *table, const char *idxname, size_t len, WT_INDEX **indexp)
{
    WT_CURSOR *cursor;
    WT_DECL_ITEM(tmp);
    WT_DECL_RET;
    WT_INDEX *idx;
    u_int i;
    int cmp;
    bool match;
    const char *idxconf, *name, *tablename, *uri;

    /* Check if we've already done the work. */
    if (idxname == NULL && table->idx_complete)
        return (0);

    /* Initialize what the cleanup code releases before anything can fail. */
    cursor = NULL;
    idx = NULL;
    match = false;

    /* Build a search key. */
    tablename = table->name;
    (void)WT_PREFIX_SKIP(tablename, "table:");
    WT_ERR(__wt_scr_alloc(session, 512, &tmp));
    WT_ERR(__wt_buf_fmt(session, tmp, "index:%s:", tablename));

    /*
     * Find matching indices. If the nearest entry sorts before the search
     * key, step forward to the first entry at or after it.
     */
    WT_ERR(__wt_metadata_cursor(session, &cursor));
    cursor->set_key(cursor, tmp->data);
    if ((ret = cursor->search_near(cursor, &cmp)) == 0 && cmp < 0)
        ret = cursor->next(cursor);
    /* i is the slot in table->indices for the current metadata entry. */
    for (i = 0; ret == 0; i++, ret = cursor->next(cursor)) {
        WT_ERR(cursor->get_key(cursor, &uri));
        name = uri;
        /* Stop at the first key that no longer has our prefix. */
        if (!WT_PREFIX_SKIP(name, tmp->data))
            break;

        /* Is this the index we are looking for? */
        match = idxname == NULL || WT_STRING_MATCH(name, idxname, len);

        /*
         * Ensure there is space, including if we have to make room for
         * a new entry in the middle of the list.
         */
        WT_ERR(__wt_realloc_def(session, &table->idx_alloc,
            WT_MAX(i, table->nindices) + 1, &table->indices));

        /*
         * Keep the in-memory list in sync with the metadata: entries
         * in slot i that sort before the current URI are removed and
         * the tail of the list is shifted down over them.
         */
        cmp = 0;
        while (table->indices[i] != NULL &&
            (cmp = strcmp(uri, table->indices[i]->name)) > 0) {
            /*
             * Index no longer exists, remove it.
             *
             * NOTE(review): this frees only the WT_INDEX structure;
             * the name and config strings are allocated separately
             * further down and indexes are otherwise torn down with
             * __wt_schema_destroy_index -- confirm nothing owned by
             * the entry is leaked here.
             */
            __wt_free(session, table->indices[i]);
            memmove(&table->indices[i], &table->indices[i + 1],
                (table->nindices - i) * sizeof(WT_INDEX *));
            table->indices[--table->nindices] = NULL;
        }
        /*
         * The entry in slot i sorts after the current URI: this is an
         * index we have not seen before, so open a gap for it.
         */
        if (cmp < 0) {
            /* Make room for a new index. */
            memmove(&table->indices[i + 1], &table->indices[i],
                (table->nindices - i) * sizeof(WT_INDEX *));
            table->indices[i] = NULL;
            ++table->nindices;
        }

        if (!match)
            continue;

        /* Open the index if this slot has not been filled in yet. */
        if (table->indices[i] == NULL) {
            WT_ERR(cursor->get_value(cursor, &idxconf));
            WT_ERR(__wt_calloc_one(session, &idx));
            WT_ERR(__wt_strdup(session, uri, &idx->name));
            WT_ERR(__wt_strdup(session, idxconf, &idx->config));
            WT_ERR(__open_index(session, table, idx));

            /*
             * If we're checking the creation of an index before a
             * table is fully created, don't save the index: it
             * will need to be reopened once the table is complete.
             */
            if (!table->cg_complete) {
                WT_ERR(
                    __wt_schema_destroy_index(session, &idx));
                if (idxname != NULL)
                    break;
                continue;
            }

            /*
             * Ownership moves to the table; clear idx so the cleanup
             * code does not destroy it.
             */
            table->indices[i] = idx;
            idx = NULL;

            /*
             * If the slot is bigger than anything else we've seen,
             * bump the number of indices.
             */
            if (i >= table->nindices)
                table->nindices = i + 1;
        }

        /* If we were looking for a single index, we're done. */
        if (indexp != NULL)
            *indexp = table->indices[i];
        if (idxname != NULL)
            break;
    }
    /* Running off the end of the metadata is not an error. */
    WT_ERR_NOTFOUND_OK(ret);
    if (idxname != NULL && !match)
        ret = WT_NOTFOUND;

    /* If we did a full pass, we won't need to do it again. */
    if (idxname == NULL) {
        table->nindices = i;
        table->idx_complete = true;
    }

err:
    /* Release everything acquired above, keeping the first error seen. */
    WT_TRET(__wt_metadata_cursor_release(session, &cursor));
    WT_TRET(__wt_schema_destroy_index(session, &idx));

    __wt_scr_free(session, &tmp);
    return (ret);
}
* strings "key=(k1=v1,k2=v2)" and "key=(k1=v2)" appear, the result will * be "key=(k1=v2,k2=v2)" because the nested values are merged. */ int __wt_config_merge(WT_SESSION_IMPL *session, const char **cfg, const char *cfg_strip, const char **config_ret) WT_GCC_FUNC_ATTRIBUTE((visibility("default"))) { WT_CONFIG_MERGE merge; WT_DECL_RET; size_t i; /* Start out with a reasonable number of entries. */ WT_CLEAR(merge); WT_RET(__wt_realloc_def( session, &merge.entries_allocated, 100, &merge.entries)); /* * Scan the configuration strings, entering them into the array. The * list of configuration values to be removed must be scanned last * so their generation numbers are the highest. */ for (; *cfg != NULL; ++cfg) WT_ERR(__config_merge_scan(session, NULL, *cfg, false, &merge)); if (cfg_strip != NULL) WT_ERR(__config_merge_scan( session, NULL, cfg_strip, true, &merge)); /* * Sort the array by key and, in the case of identical keys, by * generation.