/*
 * __bloom_init --
 *	Allocate a WT_BLOOM handle.
 *
 *	Copies the URI, builds the handle's configuration string from the
 *	caller's configuration (if any) followed by WT_BLOOM_TABLE_CONFIG, and
 *	records the session.  On success the new handle is stored in *bloomp
 *	and 0 is returned; on failure everything allocated here is released
 *	and the error is returned.
 */
static int
__bloom_init(WT_SESSION_IMPL *session,
    const char *uri, const char *config, WT_BLOOM **bloomp)
{
	WT_BLOOM *bloom;
	WT_DECL_RET;
	size_t len;

	bloom = NULL;
	WT_ERR(__wt_calloc(session, 1, sizeof(WT_BLOOM), &bloom));

	WT_ERR(__wt_strdup(session, uri, &bloom->uri));

	/*
	 * The configuration buffer holds the user's configuration, a comma,
	 * the standard configuration and a trailing nul.  This is the only
	 * allocation stored in bloom->config: duplicating the caller's string
	 * into the same field first would leak that copy when the field is
	 * overwritten here.
	 */
	len = strlen(WT_BLOOM_TABLE_CONFIG) + 2;
	if (config != NULL)
		len += strlen(config);
	WT_ERR(__wt_calloc(session, len, sizeof(char), &bloom->config));

	/* Add the standard config at the end, so it overrides user settings. */
	(void)snprintf(bloom->config, len,
	    "%s,%s", config == NULL ? "" : config, WT_BLOOM_TABLE_CONFIG);

	bloom->session = session;

	*bloomp = bloom;
	return (0);

err:	/*
	 * The handle allocation itself may be what failed, check the handle
	 * before looking at any of its fields.
	 */
	if (bloom != NULL) {
		if (bloom->uri != NULL)
			__wt_free(session, bloom->uri);
		if (bloom->config != NULL)
			__wt_free(session, bloom->config);
		if (bloom->bitstring != NULL)
			__wt_free(session, bloom->bitstring);
		__wt_free(session, bloom);
	}
	return (ret);
}
/*
 * __wt_update_alloc --
 *	Create a WT_UPDATE structure with its value stored in the same
 *	allocation, immediately following the structure.
 *
 *	A NULL value marks the update as deleted.  The new structure is
 *	returned in *updp and, if sizep is not NULL, the number of bytes
 *	allocated is returned in *sizep.
 */
int
__wt_update_alloc(WT_SESSION_IMPL *session,
    WT_ITEM *value, WT_UPDATE **updp, size_t *sizep)
{
	WT_UPDATE *upd;
	size_t size, total;

	/* A NULL value carries no data. */
	size = 0;
	if (value != NULL)
		size = value->size;

	/* A single allocation holds the structure and the value bytes. */
	total = sizeof(WT_UPDATE) + size;
	WT_RET(__wt_calloc(session, 1, total, &upd));

	if (value != NULL) {
		upd->size = WT_STORE_SIZE(size);
		memcpy(WT_UPDATE_DATA(upd), value->data, size);
	} else
		WT_UPDATE_DELETED_SET(upd);

	*updp = upd;
	if (sizep != NULL)
		*sizep = total;
	return (0);
}
/*
 * __wt_nfilename --
 *	Build a file name in allocated memory.  An absolute path is duplicated
 *	as-is, anything else is prefixed with the connection home directory and
 *	the path separator.  The result is returned in *path.
 */
int
__wt_nfilename(
    WT_SESSION_IMPL *session, const char *name, size_t namelen, char **path)
{
	size_t pathlen;
	char *fullpath;

	*path = NULL;

	/*
	 * This function is called with a NULL session handle (via the exists
	 * API used by the test utilities): without a session there is no home
	 * directory to prepend, so the name is simply duplicated, the same as
	 * for an absolute path.
	 */
	if (session == NULL || __wt_absolute_path(name))
		return (__wt_strndup(session, name, namelen, path));

	/* Home directory, one byte of separator, the name, trailing nul. */
	pathlen = strlen(S2C(session)->home) + 1 + namelen + 1;
	WT_RET(__wt_calloc(session, 1, pathlen, &fullpath));
	snprintf(fullpath, pathlen, "%s%s%.*s",
	    S2C(session)->home, __wt_path_separator(), (int)namelen, name);
	*path = fullpath;
	return (0);
}
/*
 * __wt_cond_alloc --
 *	Allocate and initialize a condition variable.
 *
 *	The name pointer is stored, not copied.  On success the new condition
 *	variable is returned in *condp and 0 is returned; if either pthread
 *	object cannot be initialized, everything set up here is torn down and
 *	WT_ERROR is returned.
 */
int
__wt_cond_alloc(WT_SESSION_IMPL *session,
    const char *name, int is_signalled, WT_CONDVAR **condp)
{
	WT_CONDVAR *cond;

	/*
	 * !!!
	 * This function MUST handle a NULL session handle.
	 */
	WT_RET(__wt_calloc(session, 1, sizeof(WT_CONDVAR), &cond));

	/* Initialize the mutex. */
	if (pthread_mutex_init(&cond->mtx, NULL) != 0)
		goto err;

	/* Initialize the condition variable to permit self-blocking. */
	if (pthread_cond_init(&cond->cond, NULL) != 0) {
		/*
		 * The mutex was successfully initialized: destroy it before
		 * freeing the memory so its resources aren't leaked.
		 */
		(void)pthread_mutex_destroy(&cond->mtx);
		goto err;
	}

	cond->name = name;
	cond->signalled = is_signalled;

	*condp = cond;
	return (0);

err:	__wt_free(session, cond);
	return (WT_ERROR);
}
/*
 * __ckpt_extlist_read --
 *	Read a checkpoint's extent lists into a newly allocated checkpoint
 *	structure, stored in the WT_CKPT's bpriv field.
 */
static int
__ckpt_extlist_read(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_CKPT *ckpt)
{
	WT_BLOCK_CKPT *ckpt_info;

	/* Allocate and initialize the checkpoint structure. */
	WT_RET(__wt_calloc(session, 1, sizeof(WT_BLOCK_CKPT), &ckpt->bpriv));
	ckpt_info = ckpt->bpriv;
	WT_RET(__wt_block_ckpt_init(session, ckpt_info, ckpt->name));

	/* Crack the checkpoint cookie. */
	WT_RET(__wt_block_buffer_to_ckpt(
	    session, block, ckpt->raw.data, ckpt_info));

	/*
	 * Read the alloc and discard lists, but not the avail list.
	 *
	 * A checkpoint's avail list is only useful when rolling forward from
	 * that particular checkpoint, where it represents our best
	 * understanding of what blocks can be allocated.  If we are not
	 * operating on the live checkpoint, subsequent checkpoints might have
	 * allocated those blocks and the avail list is useless.  It isn't
	 * discarded, because it is useful as part of verification, but it
	 * isn't re-written either.
	 */
	WT_RET(__wt_block_extlist_read(
	    session, block, &ckpt_info->alloc, ckpt_info->file_size));
	return (__wt_block_extlist_read(
	    session, block, &ckpt_info->discard, ckpt_info->file_size));
}
/*
 * __wt_ovfl_txnc_add --
 *	Insert a new entry into the page's skiplist of transaction-cached
 *	overflow records, keyed by the overflow address.
 */
int
__wt_ovfl_txnc_add(WT_SESSION_IMPL *session, WT_PAGE *page,
    const uint8_t *addr, size_t addr_size,
    const void *value, size_t value_size)
{
	WT_OVFL_TXNC **head, **stack[WT_SKIP_MAXDEPTH], *txnc;
	size_t hdr_size, total_size;
	u_int level, skipdepth;
	uint8_t *dest;

	/* Set up the page's overflow tracking on first use. */
	if (page->modify->ovfl_track == NULL)
		WT_RET(__ovfl_track_init(session, page));

	head = page->modify->ovfl_track->ovfl_txnc;

	/* Pick how many skiplist levels the new entry participates in. */
	skipdepth = __wt_skip_choose_depth(session);

	/*
	 * A single allocation holds, in order: the WT_OVFL_TXNC structure,
	 * the skiplist next pointers, the address and the value.
	 *
	 * The address offset and size are stored in single bytes to keep the
	 * structure small: that's safe because the address follows the
	 * structure (which can't be more than about 100B), and address
	 * cookies are limited to 255B.
	 */
	hdr_size = sizeof(WT_OVFL_TXNC) + skipdepth * sizeof(WT_OVFL_TXNC *);
	total_size = hdr_size + addr_size + value_size;
	WT_RET(__wt_calloc(session, 1, total_size, &txnc));

	/* Copy the address into place, then the value right behind it. */
	dest = (uint8_t *)txnc + hdr_size;
	txnc->addr_offset = (uint8_t)WT_PTRDIFF(dest, txnc);
	txnc->addr_size = (uint8_t)addr_size;
	memcpy(dest, addr, addr_size);

	dest += addr_size;
	txnc->value_offset = WT_PTRDIFF32(dest, txnc);
	txnc->value_size = WT_STORE_SIZE(value_size);
	memcpy(dest, value, value_size);

	txnc->current = __wt_txn_new_id(session);

	/* Account for the new entry in the page's in-memory footprint. */
	__wt_cache_page_inmem_incr(
	    session, page, WT_OVFL_SIZE(txnc, WT_OVFL_TXNC));

	/* Find the insert position and link the entry in at each level. */
	__ovfl_txnc_skip_search_stack(head, stack, addr, addr_size);
	for (level = 0; level < skipdepth; ++level) {
		txnc->next[level] = *stack[level];
		*stack[level] = txnc;
	}

	if (WT_VERBOSE_ISSET(session, WT_VERB_OVERFLOW))
		WT_RET(__ovfl_txnc_verbose(session, page, txnc, "add"));

	return (0);
}
/*
 * __rec_page_dirty_update --
 *	Update a dirty page's reference on eviction.
 *
 *	Dispatches on the reconciliation result recorded in the page's modify
 *	structure (the WT_PM_REC_MASK flags) and updates the page's WT_REF to
 *	match.  Returns 0 on success, otherwise the error from allocating the
 *	replacement address or from WT_ILLEGAL_VALUE.
 */
static int
__rec_page_dirty_update(WT_SESSION_IMPL *session, WT_PAGE *page)
{
	WT_ADDR *addr;
	WT_PAGE_MODIFY *mod;
	WT_REF *parent_ref;

	mod = page->modify;
	parent_ref = page->ref;

	switch (F_ISSET(mod, WT_PM_REC_MASK)) {
	case WT_PM_REC_REPLACE:			/* 1-for-1 page swap */
		/*
		 * Free the reference's current address, and the cookie it
		 * points to, when __wt_off_page reports it as off-page
		 * (presumably meaning it was separately allocated rather than
		 * part of the parent's page image -- confirm against
		 * __wt_off_page).
		 */
		if (parent_ref->addr != NULL &&
		    __wt_off_page(page->parent, parent_ref->addr)) {
			__wt_free(session, ((WT_ADDR *)parent_ref->addr)->addr);
			__wt_free(session, parent_ref->addr);
		}

		/*
		 * Update the parent to reference the replacement page.
		 *
		 * Publish: a barrier to ensure the structure fields are set
		 * before the state change makes the page available to readers.
		 */
		WT_RET(__wt_calloc(session, 1, sizeof(WT_ADDR), &addr));

		/*
		 * Copy the replacement address into the new structure, then
		 * clear the modify structure's copy so the two don't reference
		 * the same address cookie.
		 */
		*addr = mod->u.replace;
		mod->u.replace.addr = NULL;
		mod->u.replace.size = 0;

		parent_ref->page = NULL;
		parent_ref->addr = addr;
		WT_PUBLISH(parent_ref->state, WT_REF_DISK);
		break;
	case WT_PM_REC_SPLIT:			/* Page split */
		/*
		 * Update the parent to reference new internal page(s).
		 *
		 * Publish: a barrier to ensure the structure fields are set
		 * before the state change makes the page available to readers.
		 */
		parent_ref->page = mod->u.split;
		WT_PUBLISH(parent_ref->state, WT_REF_MEM);

		/* Clear the reference else discarding the page will free it. */
		mod->u.split = NULL;
		F_CLR(mod, WT_PM_REC_SPLIT);
		break;
	case WT_PM_REC_EMPTY:			/* Page is empty */
		/* We checked if the page was empty when we reviewed it. */
		/* FALLTHROUGH */
	WT_ILLEGAL_VALUE(session);
	}

	return (0);
}
/*
 * __curjoin_open_main --
 *	Open a raw cursor on the main table for the join entry's index, using
 *	the index's column configuration as the projection.  The cursor is
 *	stored in the entry's main field.
 */
static int
__curjoin_open_main(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin,
    WT_CURSOR_JOIN_ENTRY *entry)
{
	WT_DECL_RET;
	WT_INDEX *idx;
	size_t format_size, uri_size;
	char *main_uri, *padded_format;
	const char *raw_cfg[] = { WT_CONFIG_BASE(
	    session, WT_SESSION_open_cursor), "raw", NULL };

	main_uri = NULL;
	padded_format = NULL;
	idx = entry->index;

	/* The URI is the table name followed by the index's column list. */
	uri_size = strlen(cjoin->table->iface.name) + idx->colconf.len + 1;
	WT_ERR(__wt_calloc(session, 1, uri_size, &main_uri));
	WT_ERR(__wt_snprintf(main_uri, uri_size, "%s%.*s",
	    cjoin->table->iface.name, (int)idx->colconf.len, idx->colconf.str));
	WT_ERR(__wt_open_cursor(session, main_uri,
	    (WT_CURSOR *)cjoin, raw_cfg, &entry->main));

	if (idx->extractor == NULL) {
		/*
		 * Append no-op padding ("0x") to the value format so a
		 * trailing 'u' format is not transformed to 'U', matching
		 * what happens in the index.  This isn't needed when there is
		 * an extractor, extractors already use the padding byte
		 * trick.
		 */
		format_size = strlen(entry->main->value_format) + 3;
		WT_ERR(__wt_calloc(session, format_size, 1, &padded_format));
		WT_ERR(__wt_snprintf(padded_format,
		    format_size, "%s0x", entry->main->value_format));

		/* Swap in the padded format, the cursor now owns it. */
		__wt_free(session, entry->main->value_format);
		entry->main->value_format = padded_format;
		padded_format = NULL;
	}

err:	__wt_free(session, main_uri);
	__wt_free(session, padded_format);
	return (ret);
}
/*
 * __wt_connection_open --
 *	Open a connection: allocate the session array, replace the default
 *	session with an internal session, then create the cache and initialize
 *	transaction support.
 */
int
__wt_connection_open(WT_CONNECTION_IMPL *conn, const char *cfg[])
{
	WT_SESSION_IMPL *session;

	/* Start out using the connection's default session. */
	session = conn->default_session;
	WT_ASSERT(session, session->iface.connection == &conn->iface);

	/*
	 * Internal server threads are told to run before any session is
	 * opened: this must be set first.
	 */
	F_SET(conn, WT_CONN_SERVER_RUN | WT_CONN_LOG_SERVER_RUN);

	/* Allocate the array of WT_SESSION_IMPL structures. */
	WT_RET(__wt_calloc(session,
	    conn->session_size, sizeof(WT_SESSION_IMPL), &conn->sessions));
	WT_CACHE_LINE_ALIGNMENT_VERIFY(session, conn->sessions);

	/*
	 * Open the default session before starting service threads, those
	 * threads may allocate and use session resources that need to get
	 * cleaned up on close.
	 */
	WT_RET(__wt_open_internal_session(
	    conn, "connection", false, 0, &session));

	/*
	 * The connection's default session starts out as a static structure,
	 * replace it with the more fully-functional session just opened.  The
	 * new session is allocated into a local variable and only assigned on
	 * success: the session allocation code uses the connection's session,
	 * so passing a reference to the default session as the place to store
	 * the result would confuse things and could corrupt error handling.
	 */
	conn->default_session = session;

	/*
	 * Publish: there must be a barrier to ensure the connection structure
	 * fields are set before other threads read from the pointer.
	 */
	WT_WRITE_BARRIER();

	/* Create the cache, then initialize transaction support. */
	WT_RET(__wt_cache_create(session, cfg));
	return (__wt_txn_global_init(session, cfg));
}
/*
 * __wt_block_ext_prealloc --
 *	Pre-allocate WT_EXT and WT_SIZE structures for the session, setting up
 *	the session's block-manager state the first time through.
 */
int
__wt_block_ext_prealloc(WT_SESSION_IMPL *session, u_int max)
{
	/*
	 * First call for this session: allocate the block-manager structure
	 * and register the function that cleans it up.
	 */
	if (session->block_manager == NULL) {
		WT_RET(__wt_calloc(session, 1,
		    sizeof(WT_BLOCK_MGR_SESSION), &session->block_manager));
		session->block_manager_cleanup =
		    __block_manager_session_cleanup;
	}

	WT_RET(__block_ext_prealloc(session, max));
	return (__block_size_prealloc(session, max));
}
/*
 * __curjoin_entry_iter_init --
 *	Initialize an iteration for the index managed by a join entry.
 *
 *	Opens a new cursor on the same URI as the entry's first endpoint
 *	cursor (with the join cursor's projection appended, if there is one),
 *	positions it where the endpoint cursor is positioned, and returns a
 *	new iterator owning that cursor in *iterp.  On failure nothing is
 *	returned and any cursor opened here is closed.
 */
static int
__curjoin_entry_iter_init(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin,
    WT_CURSOR_JOIN_ENTRY *entry, WT_CURSOR_JOIN_ITER **iterp)
{
	WT_CURSOR *newcur;
	WT_CURSOR *to_dup;
	WT_DECL_RET;
	const char *raw_cfg[] = { WT_CONFIG_BASE(
	    session, WT_SESSION_open_cursor), "raw", NULL };
	const char *def_cfg[] = { WT_CONFIG_BASE(
	    session, WT_SESSION_open_cursor), NULL };
	const char *uri, **config;
	char *uribuf;
	WT_CURSOR_JOIN_ITER *iter;
	size_t size;
	bool cursor_open;

	iter = NULL;
	newcur = NULL;
	uribuf = NULL;
	cursor_open = false;
	to_dup = entry->ends[0].cursor;

	uri = to_dup->uri;
	/* The new cursor is raw if, and only if, the join cursor is raw. */
	if (F_ISSET((WT_CURSOR *)cjoin, WT_CURSTD_RAW))
		config = &raw_cfg[0];
	else
		config = &def_cfg[0];

	/* Append the join cursor's projection to the URI, if it has one. */
	if (cjoin->projection != NULL) {
		size = strlen(uri) + strlen(cjoin->projection) + 1;
		WT_ERR(__wt_calloc(session, size, 1, &uribuf));
		snprintf(uribuf, size, "%s%s", uri, cjoin->projection);
		uri = uribuf;
	}
	WT_ERR(__wt_open_cursor(session, uri, (WT_CURSOR *)cjoin, config,
	    &newcur));
	/*
	 * Track the successful open explicitly instead of testing the pointer:
	 * nothing here guarantees what a failed open leaves in newcur.
	 */
	cursor_open = true;
	WT_ERR(__wt_cursor_dup_position(to_dup, newcur));
	WT_ERR(__wt_calloc_one(session, &iter));
	iter->cjoin = cjoin;
	iter->session = session;
	iter->entry = entry;
	iter->cursor = newcur;
	iter->advance = false;
	*iterp = iter;

	if (0) {
err:		/*
		 * The cursor isn't owned by an iterator yet: close it here or
		 * it is leaked.  The original error is what's reported, a
		 * failure to close is ignored.
		 */
		if (cursor_open)
			(void)newcur->close(newcur);
		__wt_free(session, iter);
	}
	__wt_free(session, uribuf);
	return (ret);
}
/*
 * __block_ext_alloc --
 *	Allocate a new WT_EXT structure with a randomly chosen skiplist depth.
 */
static int
__block_ext_alloc(WT_SESSION_IMPL *session, WT_EXT **extp)
{
	WT_EXT *new_ext;
	size_t alloc_size, depth;

	depth = __wt_skip_choose_depth(session);

	/* The structure is followed by two pointers per skiplist level. */
	alloc_size = sizeof(WT_EXT) + depth * 2 * sizeof(WT_EXT *);
	WT_RET(__wt_calloc(session, 1, alloc_size, &new_ext));
	new_ext->depth = (uint8_t)depth;

	*extp = new_ext;
	return (0);
}
/*
 * __wt_curmetadata_open --
 *	WT_SESSION->open_cursor method for metadata cursors.
 *
 * Metadata cursors are similar to a file cursor on the special metadata
 * table, except that the metadata for the metadata table (which is stored
 * in the turtle file) can also be queried.
 *
 * Metadata cursors are read-only by default.
 *
 * On success the new cursor is returned in *cursorp.  On failure the cursor
 * structure is freed and the underlying file cursor, if it was opened, is
 * closed.
 */
int
__wt_curmetadata_open(WT_SESSION_IMPL *session,
    const char *uri, WT_CURSOR *owner, const char *cfg[], WT_CURSOR **cursorp)
{
	WT_CURSOR_STATIC_INIT(iface,
	    NULL,			/* get-key */
	    NULL,			/* get-value */
	    NULL,			/* set-key */
	    NULL,			/* set-value */
	    __curmetadata_compare,	/* compare */
	    __curmetadata_next,		/* next */
	    __curmetadata_prev,		/* prev */
	    __curmetadata_reset,	/* reset */
	    __curmetadata_search,	/* search */
	    __curmetadata_search_near,	/* search-near */
	    __curmetadata_insert,	/* insert */
	    __curmetadata_update,	/* update */
	    __curmetadata_remove,	/* remove */
	    __curmetadata_close);	/* close */
	WT_CURSOR *cursor;
	WT_CURSOR_METADATA *mdc;
	WT_DECL_RET;
	int file_cursor_open;

	file_cursor_open = 0;

	WT_RET(__wt_calloc(session, 1, sizeof(WT_CURSOR_METADATA), &mdc));

	/* Fill in the cursor: keys and values are both strings. */
	cursor = &mdc->iface;
	*cursor = iface;
	cursor->session = &session->iface;
	cursor->key_format = "S";
	cursor->value_format = "S";

	/* Open the file cursor for operations on the regular metadata */
	WT_ERR(__wt_metadata_cursor(session, cfg[1], &mdc->file_cursor));
	/*
	 * Track the successful open explicitly instead of testing the pointer:
	 * nothing here guarantees what a failed open leaves behind.
	 */
	file_cursor_open = 1;

	WT_ERR(__wt_cursor_init(cursor, uri, owner, cfg, cursorp));

	/* Metadata cursors default to read only. */
	WT_ERR(__wt_cursor_config_readonly(cursor, cfg, 1));

	if (0) {
err:		/*
		 * Close the underlying file cursor before freeing the
		 * structure holding the only reference to it, otherwise it is
		 * leaked.  The original error is what's reported, a failure
		 * to close is ignored.
		 */
		if (file_cursor_open)
			(void)mdc->file_cursor->close(mdc->file_cursor);
		__wt_free(session, mdc);
	}
	return (ret);
}
/*
 * __col_insert_alloc --
 *	Column-store insert: create a WT_INSERT structure for a record number.
 *	The structure is returned in *insp and its allocated size in
 *	*ins_sizep.
 */
static int
__col_insert_alloc(WT_SESSION_IMPL *session,
    uint64_t recno, u_int skipdepth, WT_INSERT **insp, size_t *ins_sizep)
{
	WT_INSERT *new_ins;
	size_t alloc_size;

	/* The structure is followed by one next pointer per skiplist level. */
	alloc_size = sizeof(WT_INSERT) + skipdepth * sizeof(WT_INSERT *);
	WT_RET(__wt_calloc(session, 1, alloc_size, &new_ins));

	/* Column-store entries are identified by their record number. */
	WT_INSERT_RECNO(new_ins) = recno;

	*insp = new_ins;
	*ins_sizep = alloc_size;
	return (0);
}
/*
 * __wt_strndup --
 *	Duplicate a byte string of a given length (and NUL-terminate).  A NULL
 *	source string yields a NULL copy.
 */
int
__wt_strndup(WT_SESSION_IMPL *session, const void *str, size_t len, void *retp)
{
	void *copy;

	copy = NULL;
	if (str != NULL) {
		/*
		 * The extra byte is the nul terminator: the allocation is
		 * zero-filled, so it never has to be written explicitly.
		 */
		WT_RET(__wt_calloc(session, len + 1, 1, &copy));

		/*
		 * Don't change this to strncpy, we rely on this function to
		 * duplicate "strings" that contain nul bytes.
		 */
		memcpy(copy, str, len);
	}

	*(void **)retp = copy;
	return (0);
}
/*
 * __wt_rwlock_alloc --
 *	Allocate and initialize a read/write lock.
 *
 *	The name pointer is stored, not copied.  On success the new lock is
 *	returned in *rwlockp and 0 is returned.  On failure *rwlockp is not
 *	modified, the lock is destroyed if it had been initialized, and the
 *	error is returned (WT_ERROR if the pthread lock can't be initialized).
 */
int
__wt_rwlock_alloc(
    WT_SESSION_IMPL *session, const char *name, WT_RWLOCK **rwlockp)
{
	WT_DECL_RET;
	WT_RWLOCK *rwlock;
	int initialized;

	initialized = 0;

	WT_RET(__wt_calloc(session, 1, sizeof(WT_RWLOCK), &rwlock));
	WT_ERR_TEST(pthread_rwlock_init(&rwlock->rwlock, NULL), WT_ERROR);
	initialized = 1;

	rwlock->name = name;

	WT_VERBOSE_ERR(session, mutex,
	    "rwlock: alloc %s (%p)", rwlock->name, rwlock);

	/*
	 * Only hand the lock to the caller once nothing else can fail: if the
	 * pointer were set before the verbose message and the message failed,
	 * the caller would be left holding a pointer to freed memory.
	 */
	*rwlockp = rwlock;

	if (0) {
err:		/* Release the pthread lock's resources before the memory. */
		if (initialized)
			(void)pthread_rwlock_destroy(&rwlock->rwlock);
		__wt_free(session, rwlock);
	}
	return (ret);
}
/*
 * __wt_getenv --
 *	Get a non-NULL, greater than zero-length environment variable.
 *
 *	On success a newly allocated copy of the variable's value is returned
 *	in *envp and 0 is returned.  If the variable is not set, or is set to
 *	an empty string, WT_NOTFOUND is returned.  On any failure *envp is
 *	NULL and no memory is left allocated.
 */
int
__wt_getenv(WT_SESSION_IMPL *session, const char *variable, const char **envp)
{
	WT_DECL_RET;
	DWORD copied, size;
	char *buf;

	*envp = NULL;

	/*
	 * Called with no buffer, the return value is the size required to
	 * hold the value including the nul terminator: a size of 0 is a
	 * failure (for example, the variable doesn't exist) and a size of 1
	 * is an empty value.
	 */
	size = GetEnvironmentVariableA(variable, NULL, 0);
	if (size <= 1)
		return (WT_NOTFOUND);

	WT_RET(__wt_calloc(session, 1, size, &buf));

	/*
	 * The result is kept in a DWORD, the type the call returns, so the
	 * comparison against the expected size is not a mixed signed and
	 * unsigned comparison.
	 */
	copied = GetEnvironmentVariableA(variable, buf, size);

	/* We expect the number of bytes not including nul terminator. */
	if (copied + 1 != size) {
		/*
		 * Save the error before releasing the buffer (freeing memory
		 * could change the thread's last-error value), and don't leak
		 * the buffer or return it to the caller on failure.
		 */
		ret = __wt_getlasterror();
		__wt_free(session, buf);
		WT_RET_MSG(session, ret,
		    "GetEnvironmentVariableA failed: %s", variable);
	}

	*envp = buf;
	return (0);
}
/*
 * __wt_cond_alloc --
 *	Allocate and initialize a condition variable.
 *
 *	The mutex is created with the adaptive type when HAVE_MUTEX_ADAPTIVE
 *	is defined, and with default attributes otherwise.  The name pointer
 *	is stored, not copied, and the waiters count starts at -1 if the
 *	condition variable is created signalled, 0 otherwise.  On success the
 *	new condition variable is returned in *condp; on failure everything
 *	set up for it is torn down and the failing call's error is returned.
 */
int
__wt_cond_alloc(WT_SESSION_IMPL *session,
    const char *name, int is_signalled, WT_CONDVAR **condp)
{
	WT_CONDVAR *cond;
	WT_DECL_RET;
	pthread_mutexattr_t *attrp;

	/* Initialize the mutex. */
#ifdef HAVE_MUTEX_ADAPTIVE
	pthread_mutexattr_t attr;

	WT_RET(pthread_mutexattr_init(&attr));
	WT_RET(pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_ADAPTIVE_NP));
	attrp = &attr;
#else
	attrp = NULL;
#endif

	/*
	 * !!!
	 * This function MUST handle a NULL session handle.
	 */
	WT_RET(__wt_calloc(session, 1, sizeof(WT_CONDVAR), &cond));

	WT_ERR(pthread_mutex_init(&cond->mtx, attrp));

	/* Initialize the condition variable to permit self-blocking. */
	if ((ret = pthread_cond_init(&cond->cond, NULL)) != 0) {
		/*
		 * The mutex was successfully initialized: destroy it before
		 * freeing the memory so its resources aren't leaked.
		 */
		(void)pthread_mutex_destroy(&cond->mtx);
		goto err;
	}

	cond->name = name;
	cond->waiters = is_signalled ? -1 : 0;

	*condp = cond;
	return (0);

err:	__wt_free(session, cond);
	return (ret);
}
/*
 * __wt_nfilename --
 *	Build a file name in allocated memory.  An absolute path is duplicated
 *	as-is, anything else is made relative to the connection home
 *	directory.  The result is returned in *path.
 */
int
__wt_nfilename(WT_SESSION_IMPL *session,
    const char *name, size_t namelen, const char **path)
{
	WT_CONNECTION_IMPL *conn;
	size_t pathlen;
	char *fullpath;

	conn = S2C(session);
	*path = NULL;

	/* Absolute paths don't involve the home directory, just copy them. */
	if (__wt_absolute_path(name))
		return (__wt_strndup(session, name, namelen, path));

	/* Home directory, the '/' separator, the name, trailing nul. */
	pathlen = strlen(conn->home) + 1 + namelen + 1;
	WT_RET(__wt_calloc(session, 1, pathlen, &fullpath));
	snprintf(fullpath, pathlen, "%s/%.*s", conn->home, (int)namelen, name);
	*path = fullpath;
	return (0);
}
/*
 * __wt_row_insert_alloc --
 *	Row-store insert: create a WT_INSERT structure holding a copy of the
 *	key.  The structure is returned in *insp and, if ins_sizep is not
 *	NULL, its allocated size in *ins_sizep.
 */
int
__wt_row_insert_alloc(WT_SESSION_IMPL *session,
    WT_ITEM *key, u_int skipdepth, WT_INSERT **insp, size_t *ins_sizep)
{
	WT_INSERT *new_ins;
	size_t alloc_size, key_offset;

	/*
	 * A single allocation holds the WT_INSERT structure, one next pointer
	 * per skiplist level and then the key: the key starts right after the
	 * pointers.
	 */
	key_offset = sizeof(WT_INSERT) + skipdepth * sizeof(WT_INSERT *);
	alloc_size = key_offset + key->size;
	WT_RET(__wt_calloc(session, 1, alloc_size, &new_ins));

	/* Record where the key lives and how big it is, then copy it. */
	new_ins->u.key.offset = WT_STORE_SIZE(key_offset);
	WT_INSERT_KEY_SIZE(new_ins) = key->size;
	memcpy(WT_INSERT_KEY(new_ins), key->data, key->size);

	*insp = new_ins;
	if (ins_sizep != NULL)
		*ins_sizep = alloc_size;
	return (0);
}
/*
 * __wt_update_alloc --
 *	Create a WT_UPDATE structure with its value stored in the same
 *	allocation, then register the update with the session's transaction.
 *
 *	A NULL value marks the update as deleted.  The new structure is
 *	returned in *updp and, if sizep is not NULL, the number of bytes
 *	allocated is returned in *sizep.
 */
int
__wt_update_alloc(WT_SESSION_IMPL *session,
    WT_ITEM *value, WT_UPDATE **updp, size_t *sizep)
{
	WT_DECL_RET;
	WT_UPDATE *upd;
	size_t size, total;

	/* A NULL value carries no data. */
	size = 0;
	if (value != NULL)
		size = value->size;

	/* A single allocation holds the structure and the value bytes. */
	total = sizeof(WT_UPDATE) + size;
	WT_RET(__wt_calloc(session, 1, total, &upd));

	if (value != NULL) {
		upd->size = WT_STORE_SIZE(size);
		memcpy(WT_UPDATE_DATA(upd), value->data, size);
	} else
		WT_UPDATE_DELETED_SET(upd);

	/*
	 * This must come last: after __wt_txn_modify succeeds, we must return
	 * a non-NULL upd so our callers can call __wt_txn_unmodify on any
	 * subsequent failure.
	 */
	ret = __wt_txn_modify(session, &upd->txnid);
	if (ret != 0) {
		__wt_free(session, upd);
		return (ret);
	}

	*updp = upd;
	if (sizep != NULL)
		*sizep = total;
	return (0);
}
/*
 * __curjoin_iter_set_entry --
 *	Point an iterator at the join entry in the given position, resetting
 *	the iterator's state and (re)opening its cursor as needed.
 */
static int
__curjoin_iter_set_entry(WT_CURSOR_JOIN_ITER *iter, u_int entry_pos)
{
	WT_CURSOR *cur, *to_dup;
	WT_CURSOR_JOIN *cjoin, *topjoin;
	WT_CURSOR_JOIN_ENTRY *entry;
	WT_DECL_RET;
	WT_SESSION_IMPL *session;
	size_t uri_size;
	const char *raw_cfg[] = { WT_CONFIG_BASE(
	    iter->session, WT_SESSION_open_cursor), "raw", NULL };
	const char *def_cfg[] = { WT_CONFIG_BASE(
	    iter->session, WT_SESSION_open_cursor), NULL };
	const char **config;
	char *uri;

	session = iter->session;
	cjoin = iter->cjoin;
	uri = NULL;

	/* Reset the iterator's position within the new entry. */
	entry = iter->entry = &cjoin->entries[entry_pos];
	iter->positioned = false;
	iter->entry_pos = entry_pos;
	iter->end_pos = 0;

	/* An entry with a single endpoint that is an "eq" comparison. */
	iter->is_equal = (entry->ends_next == 1 &&
	    WT_CURJOIN_END_RANGE(&entry->ends[0]) == WT_CURJOIN_END_EQ);

	/* Skip the first endpoint when it is a "ge" comparison. */
	iter->end_skip = 0;
	if (entry->ends_next > 0 &&
	    WT_CURJOIN_END_RANGE(&entry->ends[0]) == WT_CURJOIN_END_GE)
		iter->end_skip = 1;

	/*
	 * A disjunction iterates every entry of the join, and every endpoint
	 * of an equality entry; otherwise a single entry and at most one
	 * endpoint are iterated.
	 */
	iter->end_count = WT_MIN(1, entry->ends_next);
	if (F_ISSET(cjoin, WT_CURJOIN_DISJUNCTION)) {
		iter->entry_count = cjoin->entries_next;
		if (iter->is_equal)
			iter->end_count = entry->ends_next;
	} else
		iter->entry_count = 1;
	WT_ASSERT(iter->session, iter->entry_pos < iter->entry_count);

	entry->stats.iterated = 0;

	if (entry->subjoin != NULL) {
		/* Nested joins don't use the iterator's cursor, discard it. */
		if (iter->cursor != NULL) {
			WT_ERR(iter->cursor->close(iter->cursor));
			iter->cursor = NULL;
		}
	} else {
		/* Walk up to the outermost join cursor. */
		topjoin = iter->cjoin;
		while (topjoin->parent != NULL)
			topjoin = topjoin->parent;

		to_dup = entry->ends[0].cursor;

		/* The cursor is raw if the outermost join cursor is raw. */
		config = F_ISSET((WT_CURSOR *)topjoin, WT_CURSTD_RAW) ?
		    &raw_cfg[0] : &def_cfg[0];

		/* The URI is the endpoint's internal URI with "()" appended. */
		uri_size = strlen(to_dup->internal_uri) + 3;
		WT_ERR(__wt_calloc(session, uri_size, 1, &uri));
		WT_ERR(__wt_snprintf(
		    uri, uri_size, "%s()", to_dup->internal_uri));

		/*
		 * Keep the existing cursor if it is already open on the same
		 * URI, otherwise close it and open a new one.
		 */
		cur = iter->cursor;
		if (cur == NULL || strcmp(cur->uri, uri) != 0) {
			iter->cursor = NULL;
			if (cur != NULL)
				WT_ERR(cur->close(cur));
			WT_ERR(__wt_open_cursor(session, uri,
			    (WT_CURSOR *)topjoin, config, &iter->cursor));
		}
		WT_ERR(__wt_cursor_dup_position(to_dup, iter->cursor));
	}

err:	__wt_free(session, uri);
	return (ret);
}
/*
 * __wt_page_alloc --
 *	Create or read a page into the cache.
 *
 *	Allocates a WT_PAGE of the given type, sets the type-specific fields
 *	from recno and alloc_entries, and returns it in *pagep (which is set
 *	to NULL on entry).  For internal pages an array of alloc_entries
 *	WT_REF pointers is allocated as well and, if alloc_refs is non-zero,
 *	so is each WT_REF.  The total number of bytes allocated is added to
 *	the cache's accounting.  Returns 0 on success; on failure everything
 *	allocated here is freed.
 */
int
__wt_page_alloc(WT_SESSION_IMPL *session, uint8_t type,
    uint64_t recno, uint32_t alloc_entries, int alloc_refs, WT_PAGE **pagep)
{
	WT_CACHE *cache;
	WT_DECL_RET;
	WT_PAGE *page;
	WT_PAGE_INDEX *pindex;
	size_t size;
	uint32_t i;
	void *p;

	*pagep = NULL;

	cache = S2C(session)->cache;
	page = NULL;

	/*
	 * Size the initial allocation: this switch also rejects unknown page
	 * types before anything is allocated.
	 */
	size = sizeof(WT_PAGE);
	switch (type) {
	case WT_PAGE_COL_FIX:
	case WT_PAGE_COL_INT:
	case WT_PAGE_ROW_INT:
		break;
	case WT_PAGE_COL_VAR:
		/*
		 * Variable-length column-store leaf page: allocate memory to
		 * describe the page's contents with the initial allocation.
		 */
		size += alloc_entries * sizeof(WT_COL);
		break;
	case WT_PAGE_ROW_LEAF:
		/*
		 * Row-store leaf page: allocate memory to describe the page's
		 * contents with the initial allocation.
		 */
		size += alloc_entries * sizeof(WT_ROW);
		break;
	WT_ILLEGAL_VALUE(session);
	}

	WT_RET(__wt_calloc(session, 1, size, &page));

	page->type = type;
	page->read_gen = WT_READGEN_NOTSET;

	switch (type) {
	case WT_PAGE_COL_FIX:
		page->pg_fix_recno = recno;
		page->pg_fix_entries = alloc_entries;
		break;
	case WT_PAGE_COL_INT:
	case WT_PAGE_ROW_INT:
		page->pg_intl_recno = recno;

		/*
		 * Internal pages have an array of references to objects so they
		 * can split.  Allocate the array of references and optionally,
		 * the objects to which they point.
		 */
		WT_ERR(__wt_calloc(session, 1,
		    sizeof(WT_PAGE_INDEX) + alloc_entries * sizeof(WT_REF *),
		    &p));
		size +=
		    sizeof(WT_PAGE_INDEX) + alloc_entries * sizeof(WT_REF *);

		/* The array of pointers immediately follows the index header. */
		pindex = p;
		pindex->index = (WT_REF **)((WT_PAGE_INDEX *)p + 1);
		pindex->entries = alloc_entries;
		WT_INTL_INDEX_SET(page, pindex);
		if (alloc_refs)
			for (i = 0; i < pindex->entries; ++i) {
				WT_ERR(__wt_calloc_def(
				    session, 1, &pindex->index[i]));
				size += sizeof(WT_REF);
			}

		/*
		 * Error handling for this case only: the block is skipped on
		 * success and entered through the label on failure.  Slots
		 * that were never filled in are still NULL because the array
		 * was zero-allocated.
		 */
		if (0) {
err:			if ((pindex = WT_INTL_INDEX_COPY(page)) != NULL) {
				for (i = 0; i < pindex->entries; ++i)
					__wt_free(session, pindex->index[i]);
				__wt_free(session, pindex);
			}
			__wt_free(session, page);
			return (ret);
		}
		break;
	case WT_PAGE_COL_VAR:
		/* The WT_COL array immediately follows the WT_PAGE. */
		page->pg_var_recno = recno;
		page->pg_var_d = (WT_COL *)((uint8_t *)page + sizeof(WT_PAGE));
		page->pg_var_entries = alloc_entries;
		break;
	case WT_PAGE_ROW_LEAF:
		/* The WT_ROW array immediately follows the WT_PAGE. */
		page->pg_row_d = (WT_ROW *)((uint8_t *)page + sizeof(WT_PAGE));
		page->pg_row_entries = alloc_entries;
		break;
	WT_ILLEGAL_VALUE(session);
	}

	/* Increment the cache statistics. */
	__wt_cache_page_inmem_incr(session, page, size);
	(void)WT_ATOMIC_ADD8(cache->pages_inmem, 1);

	*pagep = page;
	return (0);
}
/*
 * __wt_curfile_create --
 *	Create a cursor on the session's current btree handle, optionally
 *	configured for bulk load or random retrieval.
 */
int
__wt_curfile_create(WT_SESSION_IMPL *session,
    WT_CURSOR *owner, const char *cfg[], int bulk, int bitmap,
    WT_CURSOR **cursorp)
{
	WT_CURSOR_STATIC_INIT(iface,
	    __wt_cursor_get_key,	/* get-key */
	    __wt_cursor_get_value,	/* get-value */
	    __wt_cursor_set_key,	/* set-key */
	    __wt_cursor_set_value,	/* set-value */
	    __curfile_compare,		/* compare */
	    __curfile_equals,		/* equals */
	    __curfile_next,		/* next */
	    __curfile_prev,		/* prev */
	    __curfile_reset,		/* reset */
	    __curfile_search,		/* search */
	    __curfile_search_near,	/* search-near */
	    __curfile_insert,		/* insert */
	    __curfile_update,		/* update */
	    __curfile_remove,		/* remove */
	    __wt_cursor_reconfigure,	/* reconfigure */
	    __curfile_close);		/* close */
	WT_BTREE *btree;
	WT_CONFIG_ITEM cval;
	WT_CURSOR *cursor;
	WT_CURSOR_BTREE *cbt;
	WT_DECL_RET;
	size_t cursor_size;

	WT_STATIC_ASSERT(offsetof(WT_CURSOR_BTREE, iface) == 0);

	cbt = NULL;

	btree = S2BT(session);
	WT_ASSERT(session, btree != NULL);

	/* Bulk cursors use a larger structure that starts with a btree cursor. */
	if (bulk)
		cursor_size = sizeof(WT_CURSOR_BULK);
	else
		cursor_size = sizeof(WT_CURSOR_BTREE);
	WT_RET(__wt_calloc(session, 1, cursor_size, &cbt));

	/* Fill in the generic cursor from the static template and the btree. */
	cursor = &cbt->iface;
	*cursor = iface;
	cursor->session = &session->iface;
	cursor->internal_uri = btree->dhandle->name;
	cursor->key_format = btree->key_format;
	cursor->value_format = btree->value_format;

	cbt->btree = btree;

	if (bulk) {
		F_SET(cursor, WT_CURSTD_BULK);

		/* Optionally skip the validation of each bulk-loaded key. */
		WT_ERR(__wt_config_gets_def(
		    session, cfg, "skip_sort_check", 0, &cval));
		WT_ERR(__wt_curbulk_init(
		    session, (WT_CURSOR_BULK *)cbt, bitmap, cval.val != 0));
	}

	/*
	 * random_retrieval
	 * Random retrieval cursors only support next, reset and close.
	 */
	WT_ERR(__wt_config_gets_def(session, cfg, "next_random", 0, &cval));
	if (cval.val != 0) {
		__wt_cursor_set_notsup(cursor);
		cursor->next = __curfile_next_random;
		cursor->reset = __curfile_reset;
	}

	/* Underlying btree initialization. */
	__wt_btcur_open(cbt);

	/* __wt_cursor_init is last so we don't have to clean up on error. */
	WT_ERR(__wt_cursor_init(
	    cursor, cursor->internal_uri, owner, cfg, cursorp));

	WT_STAT_FAST_CONN_INCR(session, cursor_create);
	WT_STAT_FAST_DATA_INCR(session, cursor_create);

	if (0) {
err:		__wt_free(session, cbt);
	}

	return (ret);
}
/*
 * __wt_curjoin_join --
 *	Add a new join to a join cursor.
 *
 *	The reference cursor is either another join cursor (its URI starts
 *	with "join:"), which is attached to the entry as a subjoin, or it is
 *	added as an endpoint (a comparison, described by the range argument)
 *	of the entry for the given index, creating the entry if this is the
 *	first join on that index.  Returns 0 on success and EINVAL, with a
 *	message, if the new join's configuration conflicts with what has
 *	already been joined.
 */
int
__wt_curjoin_join(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin,
    WT_INDEX *idx, WT_CURSOR *ref_cursor, uint8_t flags, uint8_t range,
    uint64_t count, uint32_t bloom_bit_count, uint32_t bloom_hash_count)
{
	WT_CURSOR_INDEX *cindex;
	WT_CURSOR_JOIN *child;
	WT_CURSOR_JOIN_ENDPOINT *end;
	WT_CURSOR_JOIN_ENTRY *entry;
	size_t len;
	uint8_t endrange;
	u_int i, ins, nonbloom;
	bool hasins, needbloom, nested, range_eq;

	entry = NULL;
	hasins = needbloom = false;
	ins = nonbloom = 0;				/* -Wuninitialized */

	/*
	 * The first join decides whether the join cursor is a disjunction
	 * ("or") or a conjunction ("and"), every later join must agree.
	 */
	if (cjoin->entries_next == 0) {
		if (LF_ISSET(WT_CURJOIN_ENTRY_DISJUNCTION))
			F_SET(cjoin, WT_CURJOIN_DISJUNCTION);
	} else if (LF_ISSET(WT_CURJOIN_ENTRY_DISJUNCTION) &&
	    !F_ISSET(cjoin, WT_CURJOIN_DISJUNCTION))
		WT_RET_MSG(session, EINVAL,
		    "operation=or does not match previous operation=and");
	else if (!LF_ISSET(WT_CURJOIN_ENTRY_DISJUNCTION) &&
	    F_ISSET(cjoin, WT_CURJOIN_DISJUNCTION))
		WT_RET_MSG(session, EINVAL,
		    "operation=and does not match previous operation=or");

	/*
	 * For a non-nested join, look for an existing entry on the same index
	 * and, along the way, remember the position of the first entry after
	 * the first that doesn't use a Bloom filter.
	 */
	nested = WT_PREFIX_MATCH(ref_cursor->uri, "join:");
	if (!nested)
		for (i = 0; i < cjoin->entries_next; i++) {
			if (cjoin->entries[i].index == idx &&
			    cjoin->entries[i].subjoin == NULL) {
				entry = &cjoin->entries[i];
				break;
			}
			if (!needbloom && i > 0 &&
			    !F_ISSET(&cjoin->entries[i],
			    WT_CURJOIN_ENTRY_BLOOM)) {
				needbloom = true;
				nonbloom = i;
			}
		}
	else {
		if (LF_ISSET(WT_CURJOIN_ENTRY_BLOOM))
			WT_RET_MSG(session, EINVAL,
			    "Bloom filters cannot be used with subjoins");
	}

	if (entry == NULL) {
		/* No existing entry: make room for, and fill in, a new one. */
		WT_RET(__wt_realloc_def(session, &cjoin->entries_allocated,
		    cjoin->entries_next + 1, &cjoin->entries));
		if (LF_ISSET(WT_CURJOIN_ENTRY_BLOOM) && needbloom) {
			/*
			 * Reorder the list so that after the first entry,
			 * the Bloom filtered entries come next, followed by
			 * the non-Bloom entries.  Once the Bloom filters
			 * are built, determining membership via Bloom is
			 * faster than without Bloom, so we can answer
			 * membership questions more quickly, and with less
			 * I/O, with the Bloom entries first.
			 */
			entry = &cjoin->entries[nonbloom];
			memmove(entry + 1, entry,
			    (cjoin->entries_next - nonbloom) *
			    sizeof(WT_CURSOR_JOIN_ENTRY));
			memset(entry, 0, sizeof(WT_CURSOR_JOIN_ENTRY));
		}
		else
			entry = &cjoin->entries[cjoin->entries_next];
		entry->index = idx;
		entry->flags = flags;
		entry->count = count;
		entry->bloom_bit_count = bloom_bit_count;
		entry->bloom_hash_count = bloom_hash_count;
		++cjoin->entries_next;
	} else {
		/* Merge the join into an existing entry for this index */
		if (count != 0 && entry->count != 0 && entry->count != count)
			WT_RET_MSG(session, EINVAL,
			    "count=%" PRIu64 " does not match "
			    "previous count=%" PRIu64 " for this index",
			    count, entry->count);
		if (LF_MASK(WT_CURJOIN_ENTRY_BLOOM) !=
		    F_MASK(entry, WT_CURJOIN_ENTRY_BLOOM))
			WT_RET_MSG(session, EINVAL,
			    "join has incompatible strategy "
			    "values for the same index");
		if (LF_MASK(WT_CURJOIN_ENTRY_FALSE_POSITIVES) !=
		    F_MASK(entry, WT_CURJOIN_ENTRY_FALSE_POSITIVES))
			WT_RET_MSG(session, EINVAL,
			    "join has incompatible bloom_false_positives "
			    "values for the same index");

		/*
		 * Check against other comparisons (we call them endpoints)
		 * already set up for this index.
		 * We allow either:
		 *   - one or more "eq" (with disjunction)
		 *   - exactly one "eq" (with conjunction)
		 *   - exactly one of "gt" or "ge" (conjunction or disjunction)
		 *   - exactly one of "lt" or "le" (conjunction or disjunction)
		 *   - one of "gt"/"ge" along with one of "lt"/"le"
		 *         (currently restricted to conjunction).
		 *
		 * Some other combinations, although expressible either do
		 * not make sense (X == 3 AND X == 5) or are reducible (X <
		 * 7 AND X < 9).  Other specific cases of (X < 7 OR X > 15)
		 * or (X == 4 OR X > 15) make sense but we don't handle yet.
		 */
		for (i = 0; i < entry->ends_next; i++) {
			end = &entry->ends[i];
			range_eq = (range == WT_CURJOIN_END_EQ);
			endrange = WT_CURJOIN_END_RANGE(end);
			if ((F_ISSET(end, WT_CURJOIN_END_GT) &&
			    ((range & WT_CURJOIN_END_GT) != 0 || range_eq)) ||
			    (F_ISSET(end, WT_CURJOIN_END_LT) &&
			    ((range & WT_CURJOIN_END_LT) != 0 || range_eq)) ||
			    (endrange == WT_CURJOIN_END_EQ &&
			    (range & (WT_CURJOIN_END_LT | WT_CURJOIN_END_GT))
			    != 0))
				WT_RET_MSG(session, EINVAL,
				    "join has overlapping ranges");
			if (range == WT_CURJOIN_END_EQ &&
			    endrange == WT_CURJOIN_END_EQ &&
			    !F_ISSET(entry, WT_CURJOIN_ENTRY_DISJUNCTION))
				WT_RET_MSG(session, EINVAL,
				    "compare=eq can only be combined "
				    "using operation=or");

			/*
			 * Sort "gt"/"ge" to the front, followed by any number
			 * of "eq", and finally "lt"/"le".
			 */
			if (!hasins &&
			    ((range & WT_CURJOIN_END_GT) != 0 ||
			    (range == WT_CURJOIN_END_EQ &&
			    endrange != WT_CURJOIN_END_EQ &&
			    !F_ISSET(end, WT_CURJOIN_END_GT)))) {
				ins = i;
				hasins = true;
			}
		}
		/* All checks completed, merge any new configuration now */
		entry->count = count;
		entry->bloom_bit_count =
		    WT_MAX(entry->bloom_bit_count, bloom_bit_count);
		entry->bloom_hash_count =
		    WT_MAX(entry->bloom_hash_count, bloom_hash_count);
	}
	if (nested) {
		/* Link the child join cursor and this join cursor together. */
		child = (WT_CURSOR_JOIN *)ref_cursor;
		entry->subjoin = child;
		child->parent = cjoin;
	} else {
		/*
		 * Add the endpoint at the position chosen above, or at the end
		 * of the entry's endpoints if no position was chosen.
		 *
		 * NOTE(review): a failure from here on returns with the entry
		 * (and possibly the endpoint) already added to the join
		 * cursor -- confirm callers treat the join cursor as unusable
		 * after an error.
		 */
		WT_RET(__curjoin_insert_endpoint(session, entry,
		    hasins ? ins : entry->ends_next, &end));
		end->cursor = ref_cursor;
		F_SET(end, range);

		if (entry->main == NULL && idx != NULL) {
			/*
			 * Open the main file with a projection of the
			 * indexed columns.
			 */
			WT_RET(__curjoin_open_main(session, cjoin, entry));

			/*
			 * When we are repacking index keys to remove the
			 * primary key, we never want to transform trailing
			 * 'u'.  Use no-op padding to force this.
			 */
			cindex = (WT_CURSOR_INDEX *)ref_cursor;
			len = strlen(cindex->iface.key_format) + 3;
			WT_RET(__wt_calloc(session, len, 1,
			    &entry->repack_format));
			WT_RET(__wt_snprintf(entry->repack_format,
			    len, "%s0x", cindex->iface.key_format));
		}
	}
	return (0);
}
/*
 * __wt_page_alloc --
 *	Allocate a zeroed in-memory page of the given type, together with the
 * per-entry array that immediately follows the WT_PAGE structure, and account
 * for it in the cache.
 *
 *	On success *pagep references the new page and 0 is returned.  *pagep is
 * set to NULL before anything else, so it is NULL on every failure return.
 */
int
__wt_page_alloc(WT_SESSION_IMPL *session,
    uint8_t type, uint32_t alloc_entries, WT_PAGE **pagep)
{
	WT_CACHE *conn_cache;
	WT_PAGE *new_page;
	size_t extra, total;
	void *trailer;

	*pagep = NULL;

	conn_cache = S2C(session)->cache;

	/*
	 * Work out how much room the page type needs after the WT_PAGE
	 * structure: fixed-length column-store pages need nothing, every
	 * other type gets one slot per allocated entry.
	 */
	extra = 0;
	switch (type) {
	case WT_PAGE_COL_FIX:
		break;
	case WT_PAGE_COL_INT:
	case WT_PAGE_ROW_INT:
		extra = alloc_entries * sizeof(WT_REF);
		break;
	case WT_PAGE_COL_VAR:
		extra = alloc_entries * sizeof(WT_COL);
		break;
	case WT_PAGE_ROW_LEAF:
		extra = alloc_entries * sizeof(WT_ROW);
		break;
	WT_ILLEGAL_VALUE(session);
	}
	total = sizeof(WT_PAGE) + extra;

	/* A single allocation holds both the page and its entry array. */
	WT_RET(__wt_calloc(session, 1, total, &new_page));
	trailer = (uint8_t *)new_page + sizeof(WT_PAGE);

	/* Point the type-specific array field at the trailing memory. */
	switch (type) {
	case WT_PAGE_COL_FIX:
		break;
	case WT_PAGE_COL_INT:
	case WT_PAGE_ROW_INT:
		new_page->u.intl.t = trailer;
		break;
	case WT_PAGE_COL_VAR:
		new_page->u.col_var.d = trailer;
		break;
	case WT_PAGE_ROW_LEAF:
		new_page->u.row.d = trailer;
		break;
	WT_ILLEGAL_VALUE(session);
	}

	/* Charge the allocation to the cache and count the new page. */
	__wt_cache_page_inmem_incr(session, new_page, total);
	(void)WT_ATOMIC_ADD(conn_cache->pages_inmem, 1);

	/* The type is the only page field filled in here. */
	new_page->type = type;

	*pagep = new_page;
	return (0);
}
/*
 * __wt_open --
 *	Open a file handle.
 *
 *	session:   session used for allocation, locking and error reporting.
 *	name:      file name; also the key used to find an already-open handle
 *		   on the connection's handle list.
 *	ok_create: non-zero if the file may be created when it does not exist.
 *	exclusive: only examined when ok_create is set; selects the creation
 *		   disposition (see below).
 *	dio_type:  one of the WT_FILE_TYPE_XXX values; selects direct I/O,
 *		   write-through and random-access flags, and directories get
 *		   a handle structure with no underlying OS handles.
 *	fhp:       on success, set to the (possibly shared) file handle.
 *
 *	Returns 0 on success, otherwise an error code; on error any handles
 * opened or memory allocated by this call are released.
 */
int
__wt_open(WT_SESSION_IMPL *session,
    const char *name, int ok_create, int exclusive, int dio_type, WT_FH **fhp)
{
	DWORD dwCreationDisposition;
	HANDLE filehandle, filehandle_secondary;
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;
	WT_FH *fh, *tfh;
	int direct_io, f, matched, share_mode;
	char *path;

	conn = S2C(session);
	fh = NULL;
	path = NULL;
	filehandle = INVALID_HANDLE_VALUE;
	filehandle_secondary = INVALID_HANDLE_VALUE;
	direct_io = 0;

	WT_RET(__wt_verbose(session, WT_VERB_FILEOPS, "%s: open", name));

	/* Increment the reference count if we already have the file open. */
	matched = 0;
	__wt_spin_lock(session, &conn->fh_lock);
	TAILQ_FOREACH(tfh, &conn->fhqh, q)
		if (strcmp(name, tfh->name) == 0) {
			++tfh->ref;
			*fhp = tfh;
			matched = 1;
			break;
		}
	__wt_spin_unlock(session, &conn->fh_lock);
	if (matched)
		return (0);

	/* For directories, create empty file handles with invalid handles */
	if (dio_type == WT_FILE_TYPE_DIRECTORY) {
		goto setupfh;
	}

	WT_RET(__wt_filename(session, name, &path));

	share_mode = FILE_SHARE_READ | FILE_SHARE_WRITE;
	/*
	 * Security:
	 * The application may spawn a new process, and we don't want another
	 * process to have access to our file handles.
	 *
	 * TODO: Set tighter file permissions but set bInheritHandle to false
	 * to prevent inheritance
	 */

	f = FILE_ATTRIBUTE_NORMAL;

	/*
	 * Pick the creation disposition.  A plain create uses CREATE_NEW and
	 * falls back to opening the existing file below if it already exists.
	 *
	 * NOTE(review): CREATE_ALWAYS overwrites an existing file rather than
	 * failing; confirm that is the intended meaning of "exclusive".
	 */
	if (ok_create) {
		dwCreationDisposition = CREATE_NEW;
		if (exclusive)
			dwCreationDisposition = CREATE_ALWAYS;
	} else
		dwCreationDisposition = OPEN_EXISTING;

	if (dio_type && FLD_ISSET(conn->direct_io, dio_type)) {
		f |= FILE_FLAG_NO_BUFFERING | FILE_FLAG_WRITE_THROUGH;
		direct_io = 1;
	}

	if (dio_type == WT_FILE_TYPE_LOG &&
	    FLD_ISSET(conn->txn_logsync, WT_LOG_DSYNC)) {
		f |= FILE_FLAG_WRITE_THROUGH;
	}

	/* Disable read-ahead on trees: it slows down random read workloads. */
	if (dio_type == WT_FILE_TYPE_DATA ||
	    dio_type == WT_FILE_TYPE_CHECKPOINT)
		f |= FILE_FLAG_RANDOM_ACCESS;

	filehandle = CreateFileA(path,
	    (GENERIC_READ | GENERIC_WRITE),
	    share_mode,
	    NULL,
	    dwCreationDisposition,
	    f,
	    NULL);
	if (filehandle == INVALID_HANDLE_VALUE) {
		/* The file already exists and creation was optional: open it. */
		if (GetLastError() == ERROR_FILE_EXISTS && ok_create)
			filehandle = CreateFileA(path,
			    (GENERIC_READ | GENERIC_WRITE),
			    share_mode,
			    NULL,
			    OPEN_EXISTING,
			    f,
			    NULL);

		if (filehandle == INVALID_HANDLE_VALUE)
			WT_ERR_MSG(session, __wt_errno(),
			    direct_io ?
			    "%s: open failed with direct I/O configured, some "
			    "filesystem types do not support direct I/O" :
			    "%s", path);
	}

	/*
	 * Open a second handle to file to support allocation/truncation
	 * concurrently with reads on the file. Writes would also move the file
	 * pointer.
	 */
	filehandle_secondary = CreateFileA(path,
	    (GENERIC_READ | GENERIC_WRITE),
	    share_mode,
	    NULL,
	    OPEN_EXISTING,
	    f,
	    NULL);
	/* Check the handle just opened, not the primary one. */
	if (filehandle_secondary == INVALID_HANDLE_VALUE)
		WT_ERR_MSG(session, __wt_errno(),
		    "open failed for secondary handle: %s", path);

setupfh:
	WT_ERR(__wt_calloc(session, 1, sizeof(WT_FH), &fh));
	WT_ERR(__wt_strdup(session, name, &fh->name));
	fh->filehandle = filehandle;
	fh->filehandle_secondary = filehandle_secondary;
	fh->ref = 1;
	fh->direct_io = direct_io;

	/* Set the file's size. */
	if (dio_type != WT_FILE_TYPE_DIRECTORY)
		WT_ERR(__wt_filesize(session, fh, &fh->size));

	/* Configure file extension. */
	if (dio_type == WT_FILE_TYPE_DATA ||
	    dio_type == WT_FILE_TYPE_CHECKPOINT)
		fh->extend_len = conn->data_extend_len;

	/* Configure fallocate/posix_fallocate calls. */
	__wt_fallocate_config(session, fh);

	/*
	 * Repeat the check for a match, but then link onto the database's list
	 * of files.
	 */
	matched = 0;
	__wt_spin_lock(session, &conn->fh_lock);
	TAILQ_FOREACH(tfh, &conn->fhqh, q)
		if (strcmp(name, tfh->name) == 0) {
			++tfh->ref;
			*fhp = tfh;
			matched = 1;
			break;
		}
	if (!matched) {
		TAILQ_INSERT_TAIL(&conn->fhqh, fh, q);
		WT_STAT_FAST_CONN_INCR(session, file_open);

		*fhp = fh;
	}
	__wt_spin_unlock(session, &conn->fh_lock);

	/*
	 * If another thread linked a handle for this file while we were
	 * opening ours, the existing one is returned and ours is discarded.
	 * Error paths jump into the same cleanup code.
	 */
	if (matched) {
err:		if (fh != NULL) {
			__wt_free(session, fh->name);
			__wt_free(session, fh);
		}
		if (filehandle != INVALID_HANDLE_VALUE)
			(void)CloseHandle(filehandle);
		if (filehandle_secondary != INVALID_HANDLE_VALUE)
			(void)CloseHandle(filehandle_secondary);
	}

	__wt_free(session, path);
	return (ret);
}
/*
 * __curjoin_init_next --
 *	Initialize the cursor join when the next function is first called.
 *
 *	session:  the session owning the join cursor.
 *	cjoin:    the join cursor to initialize; nested joins found in its
 *		  entries are initialized recursively.
 *	iterable: true if the entries processed may be iterated.  For a join
 *		  that is not a disjunction, it is cleared after the first
 *		  non-nested entry has been processed.
 *
 *	Returns 0 on success and marks the join initialized, otherwise an error
 * code.  The scratch URI buffer and any temporary Bloom filter are released
 * on every path.
 */
static int
__curjoin_init_next(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin,
    bool iterable)
{
	WT_BLOOM *bloom;
	WT_CURSOR *origcur;
	WT_CURSOR_JOIN_ENDPOINT *end;
	WT_CURSOR_JOIN_ENTRY *je, *jeend, *je2;
	WT_DECL_RET;
	size_t size;
	uint32_t f, k;
	char *mainbuf;
	const char *def_cfg[] = { WT_CONFIG_BASE(
	    session, WT_SESSION_open_cursor), NULL };
	const char *raw_cfg[] = { WT_CONFIG_BASE(
	    session, WT_SESSION_open_cursor), "raw", NULL };
	const char **config, *proj, *urimain;

	bloom = NULL;
	mainbuf = NULL;
	if (cjoin->entries_next == 0)
		WT_RET_MSG(session, EINVAL,
		    "join cursor has not yet been joined with any other "
		    "cursors");

	/* Get a consistent view of our subordinate cursors if appropriate. */
	__wt_txn_cursor_op(session);

	if (F_ISSET((WT_CURSOR *)cjoin, WT_CURSTD_RAW))
		config = &raw_cfg[0];
	else
		config = &def_cfg[0];

	/* Open the main cursor, appending the projection to the table URI. */
	urimain = cjoin->table->iface.name;
	if ((proj = cjoin->projection) != NULL) {
		size = strlen(urimain) + strlen(proj) + 1;
		WT_ERR(__wt_calloc(session, size, 1, &mainbuf));
		WT_ERR(__wt_snprintf(mainbuf, size, "%s%s", urimain, proj));
		urimain = mainbuf;
	}
	WT_ERR(__wt_open_cursor(session, urimain, (WT_CURSOR *)cjoin, config,
	    &cjoin->main));

	jeend = &cjoin->entries[cjoin->entries_next];
	for (je = cjoin->entries; je < jeend; je++) {
		if (je->subjoin != NULL) {
			WT_ERR(__curjoin_init_next(session, je->subjoin,
			    iterable));
			continue;
		}
		__wt_stat_join_init_single(&je->stats);
		/*
		 * For a single compare=le/lt endpoint in any entry that may
		 * be iterated, construct a companion compare=ge endpoint
		 * that will actually be iterated.
		 */
		if (iterable && je->ends_next == 1 &&
		    F_ISSET(&je->ends[0], WT_CURJOIN_END_LT)) {
			origcur = je->ends[0].cursor;
			WT_ERR(__curjoin_insert_endpoint(session, je, 0, &end));
			WT_ERR(__wt_open_cursor(session, origcur->uri,
			    (WT_CURSOR *)cjoin,
			    F_ISSET(origcur, WT_CURSTD_RAW) ? raw_cfg : def_cfg,
			    &end->cursor));
			end->flags = WT_CURJOIN_END_GT | WT_CURJOIN_END_EQ |
			    WT_CURJOIN_END_OWN_CURSOR;
			WT_ERR(end->cursor->next(end->cursor));
			F_CLR(je, WT_CURJOIN_ENTRY_DISJUNCTION);
		}
		for (end = &je->ends[0]; end < &je->ends[je->ends_next];
		    end++)
			WT_ERR(__curjoin_endpoint_init_key(session, je, end));

		/*
		 * Do any needed Bloom filter initialization.  Ignore Bloom
		 * filters for entries that will be iterated.  They won't
		 * help since these entries either don't need an inclusion
		 * check or are doing any needed check during the iteration.
		 */
		if (!iterable && F_ISSET(je, WT_CURJOIN_ENTRY_BLOOM)) {
			if (session->txn.isolation == WT_ISO_READ_UNCOMMITTED)
				WT_ERR_MSG(session, EINVAL,
				    "join cursors with Bloom filters cannot be "
				    "used with read-uncommitted isolation");
			if (je->bloom == NULL) {
				/*
				 * Look for compatible filters to be shared,
				 * pick compatible numbers for bit counts
				 * and number of hashes.
				 */
				f = je->bloom_bit_count;
				k = je->bloom_hash_count;
				for (je2 = je + 1; je2 < jeend; je2++)
					if (F_ISSET(je2,
					    WT_CURJOIN_ENTRY_BLOOM) &&
					    je2->count == je->count) {
						f = WT_MAX(
						    je2->bloom_bit_count, f);
						k = WT_MAX(
						    je2->bloom_hash_count, k);
					}
				je->bloom_bit_count = f;
				je->bloom_hash_count = k;
				WT_ERR(__wt_bloom_create(session, NULL, NULL,
				    je->count, f, k, &je->bloom));
				F_SET(je, WT_CURJOIN_ENTRY_OWN_BLOOM);
				WT_ERR(__curjoin_init_bloom(session, cjoin, je,
				    je->bloom));
				/*
				 * Share the Bloom filter, making all
				 * config info consistent.
				 */
				for (je2 = je + 1; je2 < jeend; je2++)
					if (F_ISSET(je2,
					    WT_CURJOIN_ENTRY_BLOOM) &&
					    je2->count == je->count) {
						WT_ASSERT(session,
						    je2->bloom == NULL);
						je2->bloom = je->bloom;
						je2->bloom_bit_count = f;
						je2->bloom_hash_count = k;
					}
			} else {
				/*
				 * Create a temporary filter that we'll
				 * merge into the shared one.  The Bloom
				 * parameters of the two filters must match.
				 */
				WT_ERR(__wt_bloom_create(session, NULL, NULL,
				    je->count, je->bloom_bit_count,
				    je->bloom_hash_count, &bloom));
				WT_ERR(__curjoin_init_bloom(session, cjoin, je,
				    bloom));
				WT_ERR(__wt_bloom_intersection(je->bloom,
				    bloom));
				/*
				 * Forget the temporary filter before checking
				 * the close result, so the error path never
				 * closes it a second time.
				 */
				ret = __wt_bloom_close(bloom);
				bloom = NULL;
				if (ret != 0)
					goto err;
			}
		}
		if (!F_ISSET(cjoin, WT_CURJOIN_DISJUNCTION))
			iterable = false;
	}
	F_SET(cjoin, WT_CURJOIN_INITIALIZED);

err:
	/*
	 * A non-NULL temporary filter here means a failure occurred after it
	 * was created; discard it rather than leaking it.
	 */
	if (bloom != NULL)
		WT_TRET(__wt_bloom_close(bloom));
	__wt_free(session, mainbuf);
	return (ret);
}
/*
 * __rec_page_dirty_update --
 *	Update a dirty page's reference on eviction.
 *
 *	The page's reconciliation result (empty, replaced, or split) decides
 * what the parent's reference is changed to.  Returns 0 on success, or an
 * error if the replacement address cannot be allocated or the reconciliation
 * state is not one of the handled values.
 */
static int
__rec_page_dirty_update(WT_SESSION_IMPL *session, WT_PAGE *page)
{
	WT_ADDR *addr;
	WT_PAGE_MODIFY *mod;
	WT_REF *parent_ref;

	mod = page->modify;
	parent_ref = page->ref;

	switch (F_ISSET(mod, WT_PM_REC_MASK)) {
	case WT_PM_REC_EMPTY:				/* Page is empty */
		/* Discard the old address if it was separately allocated. */
		if (parent_ref->addr != NULL &&
		    __wt_off_page(page->parent, parent_ref->addr)) {
			__wt_free(session, ((WT_ADDR *)parent_ref->addr)->addr);
			__wt_free(session, parent_ref->addr);
		}

		/*
		 * Update the parent to reference an empty page.
		 *
		 * Set the transaction ID to WT_TXN_NONE because the fact that
		 * reconciliation left the page "empty" means there's no older
		 * transaction in the system that might need to see an earlier
		 * version of the page.  It isn't necessary (WT_TXN_NONE is 0),
		 * but it's the right thing to do.
		 *
		 * Publish: a barrier to ensure the structure fields are set
		 * before the state change makes the page available to readers.
		 */
		parent_ref->page = NULL;
		parent_ref->addr = NULL;
		parent_ref->txnid = WT_TXN_NONE;
		WT_PUBLISH(parent_ref->state, WT_REF_DELETED);
		break;
	case WT_PM_REC_REPLACE: 			/* 1-for-1 page swap */
		/*
		 * Allocate the replacement address before touching the
		 * parent: if the allocation fails we return with the parent's
		 * existing address still in place.
		 */
		WT_RET(__wt_calloc(session, 1, sizeof(WT_ADDR), &addr));

		/* Discard the old address if it was separately allocated. */
		if (parent_ref->addr != NULL &&
		    __wt_off_page(page->parent, parent_ref->addr)) {
			__wt_free(session, ((WT_ADDR *)parent_ref->addr)->addr);
			__wt_free(session, parent_ref->addr);
		}

		/*
		 * Update the parent to reference the replacement page.
		 *
		 * The replacement address is moved out of the page's modify
		 * structure and cleared there.
		 *
		 * Publish: a barrier to ensure the structure fields are set
		 * before the state change makes the page available to readers.
		 */
		*addr = mod->u.replace;
		mod->u.replace.addr = NULL;
		mod->u.replace.size = 0;

		parent_ref->page = NULL;
		parent_ref->addr = addr;
		WT_PUBLISH(parent_ref->state, WT_REF_DISK);
		break;
	case WT_PM_REC_SPLIT:				/* Page split */
		/*
		 * Update the parent to reference new internal page(s).
		 *
		 * Publish: a barrier to ensure the structure fields are set
		 * before the state change makes the page available to readers.
		 */
		parent_ref->page = mod->u.split;
		WT_PUBLISH(parent_ref->state, WT_REF_MEM);

		/* Clear the reference else discarding the page will free it. */
		mod->u.split = NULL;
		F_CLR(mod, WT_PM_REC_SPLIT);
		break;
	WT_ILLEGAL_VALUE(session);
	}

	return (0);
}
/*
 * __curfile_create --
 *	Build a file cursor over the btree handle the session currently holds.
 *
 *	The cursor is allocated as a bulk cursor when "bulk" is set and as a
 * plain btree cursor otherwise, then configured from "cfg" (skip_sort_check,
 * next_random, next_random_sample_size) and handed to the standard cursor
 * initialization, which fills in *cursorp.  On failure after allocation the
 * cursor is closed and *cursorp is set to NULL.
 */
static int
__curfile_create(WT_SESSION_IMPL *session,
    WT_CURSOR *owner, const char *cfg[], bool bulk, bool bitmap,
    WT_CURSOR **cursorp)
{
	WT_CURSOR_STATIC_INIT(iface,
	    __wt_cursor_get_key,		/* get-key */
	    __wt_cursor_get_value,		/* get-value */
	    __wt_cursor_set_key,		/* set-key */
	    __wt_cursor_set_value,		/* set-value */
	    __curfile_compare,			/* compare */
	    __curfile_equals,			/* equals */
	    __curfile_next,			/* next */
	    __curfile_prev,			/* prev */
	    __curfile_reset,			/* reset */
	    __curfile_search,			/* search */
	    __curfile_search_near,		/* search-near */
	    __curfile_insert,			/* insert */
	    __wt_cursor_modify_notsup,		/* modify */
	    __curfile_update,			/* update */
	    __curfile_remove,			/* remove */
	    __curfile_reserve,			/* reserve */
	    __wt_cursor_reconfigure,		/* reconfigure */
	    __curfile_close);			/* close */
	WT_BTREE *btree;
	WT_CONFIG_ITEM cfgval;
	WT_CURSOR *cursor;
	WT_CURSOR_BTREE *cbt;
	WT_DECL_RET;
	size_t alloc_len;

	WT_STATIC_ASSERT(offsetof(WT_CURSOR_BTREE, iface) == 0);

	cbt = NULL;

	btree = S2BT(session);
	WT_ASSERT(session, btree != NULL);

	/* Bulk cursors are a larger structure than plain btree cursors. */
	if (bulk)
		alloc_len = sizeof(WT_CURSOR_BULK);
	else
		alloc_len = sizeof(WT_CURSOR_BTREE);
	WT_RET(__wt_calloc(session, 1, alloc_len, &cbt));

	/* Start from the static method table, then fill in the specifics. */
	cursor = &cbt->iface;
	*cursor = iface;
	cbt->btree = btree;
	cursor->key_format = btree->key_format;
	cursor->value_format = btree->value_format;
	cursor->internal_uri = btree->dhandle->name;
	cursor->session = &session->iface;

	/*
	 * Bump the data source's in-use count immediately: closing the cursor
	 * drops it again, and every failure path below closes the cursor.
	 */
	__wt_cursor_dhandle_incr_use(session);

	if (session->dhandle->checkpoint != NULL)
		F_SET(cbt, WT_CBT_NO_TXN);

	if (bulk) {
		F_SET(cursor, WT_CURSTD_BULK);

		/* Per-key validation of bulk-loaded keys can be turned off. */
		WT_ERR(__wt_config_gets_def(
		    session, cfg, "skip_sort_check", 0, &cfgval));
		WT_ERR(__wt_curbulk_init(session,
		    (WT_CURSOR_BULK *)cbt, bitmap, cfgval.val != 0));
	}

	/*
	 * Random retrieval is row-store only, and such cursors expose just a
	 * small subset of the cursor methods.
	 */
	WT_ERR(__wt_config_gets_def(session, cfg, "next_random", 0, &cfgval));
	if (cfgval.val != 0) {
		if (WT_CURSOR_RECNO(cursor))
			WT_ERR_MSG(session, ENOTSUP,
			    "next_random configuration not supported for "
			    "column-store objects");

		__wt_cursor_set_notsup(cursor);
		cursor->next = __wt_curfile_next_random;
		cursor->reset = __curfile_reset;

		WT_ERR(__wt_config_gets_def(
		    session, cfg, "next_random_sample_size", 0, &cfgval));
		if (cfgval.val != 0)
			cbt->next_random_sample_size = (u_int)cfgval.val;
	}

	/* Initialize the underlying btree cursor. */
	__wt_btcur_open(cbt);

	/*
	 * WT_CURSOR.modify is enabled only for the 'u' value format, and only
	 * when the compatibility level is recent enough: the btree fast-path
	 * depends on a log file format change.
	 */
	if (WT_STREQ(cursor->value_format, "u") &&
	    S2C(session)->compat_major >= WT_LOG_V2)
		cursor->modify = __curfile_modify;

	WT_ERR(__wt_cursor_init(
	    cursor, cursor->internal_uri, owner, cfg, cursorp));

	WT_STAT_CONN_INCR(session, cursor_create);
	WT_STAT_DATA_INCR(session, cursor_create);

	return (0);

err:
	/*
	 * The caller releases the data handle when we fail, so detach the
	 * handle from the cursor before closing it.
	 */
	if (session->dhandle != NULL)
		__wt_cursor_dhandle_decr_use(session);
	cbt->btree = NULL;

	WT_TRET(__curfile_close(cursor));
	*cursorp = NULL;

	return (ret);
}