/*
 * __clsm_compare --
 *	WT_CURSOR->compare implementation for the LSM cursor type.
 *
 *	Fails with EINVAL when the two cursors were opened on different URIs,
 *	and fails if either cursor has no key.  On success, the result of
 *	comparing the two keys with the LSM tree's collator is stored in *cmpp;
 *	on failure *cmpp is not written.
 */
static int
__clsm_compare(WT_CURSOR *a, WT_CURSOR *b, int *cmpp)
{
	WT_CURSOR_LSM *lsm_cursor;
	WT_DECL_RET;
	WT_SESSION_IMPL *session;
	int result;

	/*
	 * Comparing keys doesn't require the cursor to be in sync with the
	 * LSM tree, so WT_LSM_ENTER is deliberately not used here.
	 */
	CURSOR_API_CALL(a, session, compare, NULL);
	lsm_cursor = (WT_CURSOR_LSM *)a;

	/* The cursors must be on the same object... */
	if (strcmp(a->uri, b->uri) != 0)
		WT_ERR_MSG(session, EINVAL,
		    "comparison method cursors must reference the same object");

	/* ...and both must have a key set. */
	WT_CURSOR_NEEDKEY(a);
	WT_CURSOR_NEEDKEY(b);

	/* Compare into a temporary so the caller's value is only set on success. */
	WT_ERR(WT_LEX_CMP(session,
	    lsm_cursor->lsm_tree->collator, &a->key, &b->key, result));
	*cmpp = result;

err:	API_END(session);
	return (ret);
}
/*
 * __curfile_compare --
 *	WT_CURSOR->compare method for the btree cursor type.
 *
 *	Fails with EINVAL when the two cursors were opened on different URIs,
 *	and fails if either cursor has no key; otherwise the comparison is
 *	delegated to the btree layer, which fills in *cmpp.
 */
static int
__curfile_compare(WT_CURSOR *a, WT_CURSOR *b, int *cmpp)
{
	WT_CURSOR_BTREE *cbt;
	WT_DECL_RET;
	WT_SESSION_IMPL *session;

	cbt = (WT_CURSOR_BTREE *)a;
	CURSOR_API_CALL(a, session, compare, cbt->btree);

	/* Both cursors have to reference the same underlying object. */
	if (strcmp(a->uri, b->uri) != 0)
		WT_ERR_MSG(session, EINVAL,
		    "comparison method cursors must reference the same object");

	/* Both cursors need a key before the btree code can compare them. */
	WT_CURSOR_NEEDKEY(a);
	WT_CURSOR_NEEDKEY(b);

	WT_ERR(__wt_btcur_compare(cbt, (WT_CURSOR_BTREE *)b, cmpp));

err:	API_END(session);
	return (ret);
}
/*
 * __wt_schema_range_truncate --
 *	WT_SESSION::truncate with a range.
 *
 *	Either start or stop may be NULL (an open-ended range); the object
 *	being truncated is identified from whichever cursor is supplied.  The
 *	work is dispatched on the URI prefix: "file:" objects go to the btree
 *	code, "table:" objects to the table code, data sources that provide a
 *	range_truncate method handle it themselves, and everything else falls
 *	back to the generic implementation.
 */
int
__wt_schema_range_truncate(
    WT_SESSION_IMPL *session, WT_CURSOR *start, WT_CURSOR *stop)
{
	WT_CURSOR *cursor;
	WT_DATA_SOURCE *dsrc;
	WT_DECL_RET;
	const char *uri;

	/* Pick the cursor that exists; callers pass at least one. */
	cursor = (start == NULL) ? stop : start;
	uri = cursor->internal_uri;

	if (WT_PREFIX_MATCH(uri, "file:")) {
		/* Any cursor that was supplied must have a key set. */
		if (start != NULL)
			WT_CURSOR_NEEDKEY(start);
		if (stop != NULL)
			WT_CURSOR_NEEDKEY(stop);

		/* Run the truncate with the cursor's btree as the current one. */
		WT_WITH_BTREE(session, ((WT_CURSOR_BTREE *)cursor)->btree,
		    ret = __wt_btcur_range_truncate(
			(WT_CURSOR_BTREE *)start, (WT_CURSOR_BTREE *)stop));
	} else if (WT_PREFIX_MATCH(uri, "table:")) {
		ret = __wt_table_range_truncate(
		    (WT_CURSOR_TABLE *)start, (WT_CURSOR_TABLE *)stop);
	} else if ((dsrc = __wt_schema_get_source(session, uri)) != NULL &&
	    dsrc->range_truncate != NULL) {
		ret = dsrc->range_truncate(
		    dsrc, &session->iface, start, stop);
	} else {
		ret = __wt_range_truncate(start, stop);
	}

err:	return (ret);
}
/*
 * __curfile_remove --
 *	WT_CURSOR->remove method for the btree cursor type.
 *
 *	Requires a key, discards any value set on the cursor, and removes the
 *	record through the btree layer.  After a successful remove the cursor
 *	keeps its key but has no value.
 */
static int
__curfile_remove(WT_CURSOR *cursor)
{
	WT_CURSOR_BTREE *cbt;
	WT_DECL_RET;
	WT_SESSION_IMPL *session;

	cbt = (WT_CURSOR_BTREE *)cursor;
	CURSOR_UPDATE_API_CALL(cursor, session, remove, cbt->btree);

	WT_CURSOR_NEEDKEY(cursor);
	WT_CURSOR_NOVALUE(cursor);

	WT_BTREE_CURSOR_SAVE_AND_RESTORE(cursor, __wt_btcur_remove(cbt), ret);
	if (ret != 0)
		goto err;

	/*
	 * The remove worked.  If the key is flagged as internal and its data
	 * isn't held in the key item's own buffer, copy it into the cursor and
	 * flag it as external instead.
	 */
	if (F_ISSET(cursor, WT_CURSTD_KEY_INT) &&
	    !WT_DATA_IN_ITEM(&cursor->key)) {
		WT_ERR(__wt_buf_set(session,
		    &cursor->key, cursor->key.data, cursor->key.size));
		F_CLR(cursor, WT_CURSTD_KEY_INT);
		F_SET(cursor, WT_CURSTD_KEY_EXT);
	}

	/* There is no value available after a remove. */
	F_CLR(cursor, WT_CURSTD_VALUE_SET);

err:	CURSOR_UPDATE_API_END(session, ret);
	return (ret);
}
/*
 * __curfile_insert --
 *	WT_CURSOR->insert method for the btree cursor type.
 *
 *	A value is always required; a key is required unless the cursor was
 *	configured for append, in which case the key is returned instead.
 */
static int
__curfile_insert(WT_CURSOR *cursor)
{
	WT_CURSOR_BTREE *cbt;
	WT_DECL_RET;
	WT_SESSION_IMPL *session;

	cbt = (WT_CURSOR_BTREE *)cursor;
	CURSOR_UPDATE_API_CALL(cursor, session, insert, cbt->btree);

	if (!F_ISSET(cursor, WT_CURSTD_APPEND))
		WT_CURSOR_NEEDKEY(cursor);
	WT_CURSOR_NEEDVALUE(cursor);

	WT_BTREE_CURSOR_SAVE_AND_RESTORE(cursor, __wt_btcur_insert(cbt), ret);

	/* On failure, the save-and-restore macro has done what's needed. */
	if (ret != 0)
		goto err;

	/*
	 * Unlike every other cursor operation, a successful insert doesn't
	 * leave the cursor positioned on an on-page item, so the application's
	 * key/value must be left in place: flag them as external rather than
	 * internal.  The exception is the key of a column-store append, where
	 * a key is being returned to the application.
	 */
	if (!F_ISSET(cursor, WT_CURSTD_APPEND)) {
		F_SET(cursor, WT_CURSTD_KEY_EXT);
		F_CLR(cursor, WT_CURSTD_KEY_INT);
	}
	F_SET(cursor, WT_CURSTD_VALUE_EXT);
	F_CLR(cursor, WT_CURSTD_VALUE_INT);

err:	CURSOR_UPDATE_API_END(session, ret);
	return (ret);
}
/* * __curstat_get_key -- * WT_CURSOR->get_key for statistics cursors. */ static int __curstat_get_key(WT_CURSOR *cursor, ...) { WT_CURSOR_STAT *cst; WT_DECL_RET; WT_ITEM *item; WT_SESSION_IMPL *session; size_t size; va_list ap; cst = (WT_CURSOR_STAT *)cursor; va_start(ap, cursor); CURSOR_API_CALL(cursor, session, get_key, NULL); WT_CURSOR_NEEDKEY(cursor); if (F_ISSET(cursor, WT_CURSTD_RAW)) { WT_ERR(__wt_struct_size( session, &size, cursor->key_format, cst->key)); WT_ERR(__wt_buf_initsize(session, &cursor->key, size)); WT_ERR(__wt_struct_pack(session, cursor->key.mem, size, cursor->key_format, cst->key)); item = va_arg(ap, WT_ITEM *); item->data = cursor->key.data; item->size = cursor->key.size; } else
/*
 * __curfile_insert --
 *	WT_CURSOR->insert method for the btree cursor type.
 *
 *	A value is always required; a key is required unless the cursor was
 *	configured for append.
 */
static int
__curfile_insert(WT_CURSOR *cursor)
{
	WT_CURSOR_BTREE *cbt;
	WT_DECL_RET;
	WT_SESSION_IMPL *session;

	cbt = (WT_CURSOR_BTREE *)cursor;
	CURSOR_UPDATE_API_CALL(cursor, session, insert, cbt->btree);

	if (!F_ISSET(cursor, WT_CURSTD_APPEND))
		WT_CURSOR_NEEDKEY(cursor);
	WT_CURSOR_NEEDVALUE(cursor);

	WT_BTREE_CURSOR_SAVE_AND_RESTORE(cursor, __wt_btcur_insert(cbt), ret);
	if (ret != 0)
		goto err;

	/*
	 * A successful insert is the only cursor call that doesn't finish with
	 * the cursor on an on-page item (column-store appends excepted, since
	 * they return a key).  Left alone, the cursor would keep referencing
	 * the application's memory after the call, which happens nowhere else.
	 * Rather than document that scoping corner case, clear the internal
	 * key/value flags so the application can free its memory and carry on
	 * without risking a later core dump.
	 */
	if (!F_ISSET(cursor, WT_CURSTD_APPEND))
		F_CLR(cursor, WT_CURSTD_KEY_INT);
	F_CLR(cursor, WT_CURSTD_VALUE_INT);

err:	CURSOR_UPDATE_API_END(session, ret);
	return (ret);
}
/*
 * __curds_compare --
 *	WT_CURSOR.compare method for the data-source cursor type.
 *
 *	Fails with EINVAL when the cursors' internal URIs differ, and fails if
 *	either cursor has no key.  Record-number cursors are compared by record
 *	number (*cmpp is set to -1, 0 or 1); all others are compared by key
 *	using the cursor's collator.
 */
static int
__curds_compare(WT_CURSOR *a, WT_CURSOR *b, int *cmpp)
{
	WT_DECL_RET;
	WT_SESSION_IMPL *session;

	CURSOR_API_CALL(a, session, compare, NULL);

	/* The two cursors have to be open on the same object. */
	if (strcmp(a->internal_uri, b->internal_uri) != 0)
		WT_ERR_MSG(session,
		    EINVAL, "Cursors must reference the same object");

	/* Both sides need a key. */
	WT_CURSOR_NEEDKEY(a);
	WT_CURSOR_NEEDKEY(b);

	if (WT_CURSOR_RECNO(a))
		*cmpp = (a->recno < b->recno) ? -1 :
		    ((a->recno == b->recno) ? 0 : 1);
	else {
		/*
		 * Data sources aren't expected to supply their own
		 * WT_CURSOR.compare method: the key/value is copied out of the
		 * underlying data-source cursor, so the comparison can be done
		 * at this level.
		 */
		WT_ERR(__wt_compare(session,
		    ((WT_CURSOR_DATA_SOURCE *)a)->collator,
		    &a->key, &b->key, cmpp));
	}

err:	API_END_RET(session, ret);
}
/*
 * __curds_key_set --
 *	Set the key for the data-source.
 *
 *	Fails if the cursor has no key; otherwise the record number and the
 *	key's data pointer and size are handed to the data-source cursor (the
 *	key bytes themselves are referenced, not copied).
 */
static inline int
__curds_key_set(WT_CURSOR *cursor)
{
	WT_DECL_RET;

	/* Nothing is passed down unless the application set a key. */
	WT_CURSOR_NEEDKEY(cursor);

	cursor->data_source->key.size = cursor->key.size;
	cursor->data_source->key.data = cursor->key.data;
	cursor->data_source->recno = cursor->recno;

err:	return (ret);
}
/*
 * __curfile_search_near --
 *	WT_CURSOR->search_near method for the btree cursor type.
 *
 *	Requires a key on the cursor, then hands the search (and the "exact"
 *	result pointer) to the btree layer.
 */
static int
__curfile_search_near(WT_CURSOR *cursor, int *exact)
{
	WT_CURSOR_BTREE *cbt;
	WT_DECL_RET;
	WT_SESSION_IMPL *session;

	cbt = (WT_CURSOR_BTREE *)cursor;
	CURSOR_API_CALL(cursor, session, search_near, cbt->btree);

	WT_CURSOR_NEEDKEY(cursor);
	WT_ERR(__wt_btcur_search_near(cbt, exact));

err:	API_END(session);
	return (ret);
}
/*
 * __curfile_remove --
 *	WT_CURSOR->remove method for the btree cursor type.
 *
 *	Requires a key on the cursor, then removes the record through the
 *	btree layer.
 */
static int
__curfile_remove(WT_CURSOR *cursor)
{
	WT_CURSOR_BTREE *cbt;
	WT_DECL_RET;
	WT_SESSION_IMPL *session;

	cbt = (WT_CURSOR_BTREE *)cursor;
	CURSOR_UPDATE_API_CALL(cursor, session, remove, cbt->btree);

	WT_CURSOR_NEEDKEY(cursor);
	WT_ERR(__wt_btcur_remove(cbt));

err:	CURSOR_UPDATE_API_END(session, ret);
	return (ret);
}
/*
 * __curds_key_set --
 *	Set the key for the data-source.
 *
 *	Fails if the cursor has no key; otherwise the record number and the
 *	key's data pointer and size are handed to the underlying source cursor
 *	(the key bytes themselves are referenced, not copied).
 */
static int
__curds_key_set(WT_CURSOR *cursor)
{
	WT_CURSOR *source;
	WT_DECL_RET;

	/* Nothing is passed down unless the application set a key. */
	WT_CURSOR_NEEDKEY(cursor);

	source = ((WT_CURSOR_DATA_SOURCE *)cursor)->source;
	source->key.size = cursor->key.size;
	source->key.data = cursor->key.data;
	source->recno = cursor->recno;

err:	return (ret);
}
/*
 * __curfile_search --
 *	WT_CURSOR->search method for the btree cursor type.
 *
 *	Requires a key, discards any value currently set on the cursor, and
 *	runs the btree search inside the cursor save-and-restore macro.
 */
static int
__curfile_search(WT_CURSOR *cursor)
{
	WT_DECL_RET;
	WT_SESSION_IMPL *session;

	CURSOR_API_CALL(
	    cursor, session, search, ((WT_CURSOR_BTREE *)cursor)->btree);

	WT_CURSOR_NEEDKEY(cursor);
	WT_CURSOR_NOVALUE(cursor);

	WT_BTREE_CURSOR_SAVE_AND_RESTORE(
	    cursor, __wt_btcur_search((WT_CURSOR_BTREE *)cursor), ret);

err:	API_END_RET(session, ret);
}
/*
 * __curfile_insert --
 *	WT_CURSOR->insert method for the btree cursor type.
 *
 *	A value is always required; a key is required unless the cursor was
 *	configured for append.  The insert itself is done by the btree layer.
 */
static int
__curfile_insert(WT_CURSOR *cursor)
{
	WT_CURSOR_BTREE *cbt;
	WT_DECL_RET;
	WT_SESSION_IMPL *session;

	cbt = (WT_CURSOR_BTREE *)cursor;
	CURSOR_UPDATE_API_CALL(cursor, session, insert, cbt->btree);

	/* Append cursors don't supply a key. */
	if (!F_ISSET(cursor, WT_CURSTD_APPEND))
		WT_CURSOR_NEEDKEY(cursor);
	WT_CURSOR_NEEDVALUE(cursor);

	WT_ERR(__wt_btcur_insert(cbt));

err:	CURSOR_UPDATE_API_END(session, ret);
	return (ret);
}
/*
 * __curfile_update --
 *	WT_CURSOR->update method for the btree cursor type.
 *
 *	Requires both a key and a value on the cursor, then runs the btree
 *	update inside the cursor save-and-restore macro.
 */
static int
__curfile_update(WT_CURSOR *cursor)
{
	WT_CURSOR_BTREE *btcur;
	WT_DECL_RET;
	WT_SESSION_IMPL *session;

	btcur = (WT_CURSOR_BTREE *)cursor;
	CURSOR_UPDATE_API_CALL(cursor, session, update, btcur->btree);

	WT_CURSOR_NEEDKEY(cursor);
	WT_CURSOR_NEEDVALUE(cursor);

	WT_BTREE_CURSOR_SAVE_AND_RESTORE(
	    cursor, __wt_btcur_update(btcur), ret);

err:	CURSOR_UPDATE_API_END(session, ret);
	return (ret);
}
/*
 * __curbulk_insert --
 *	WT_CURSOR->insert for the bulk cursor type.
 *
 *	A value is always required; a key is required only for row-store
 *	trees.  Returns 0 on success or the error from the failing step.
 */
static int
__curbulk_insert(WT_CURSOR *cursor)
{
	WT_BTREE *btree;
	WT_CURSOR_BULK *cbulk;
	WT_SESSION_IMPL *session;
	/*
	 * Start from success: the return value must never depend on every
	 * macro below happening to assign it before the error label.
	 */
	int ret = 0;

	cbulk = (WT_CURSOR_BULK *)cursor;
	btree = cbulk->cbt.btree;
	CURSOR_API_CALL(cursor, session, insert, btree);

	/* Only row-store bulk loads take a key from the application. */
	if (btree->type == BTREE_ROW)
		WT_CURSOR_NEEDKEY(cursor);
	WT_CURSOR_NEEDVALUE(cursor);

	WT_ERR(__wt_bulk_insert(cbulk));

err:	API_END(session);
	return (ret);
}
/*
 * __curbulk_insert --
 *	WT_CURSOR->insert for the bulk cursor type.
 *
 *	A value is always required; a key is required only for row-store
 *	trees.
 */
static int
__curbulk_insert(WT_CURSOR *cursor)
{
	WT_BTREE *btree;
	WT_CURSOR_BULK *bulk_cursor;
	WT_DECL_RET;
	WT_SESSION_IMPL *session;

	bulk_cursor = (WT_CURSOR_BULK *)cursor;
	btree = bulk_cursor->cbt.btree;

	/*
	 * Although a bulk insert is an update, it is single-threaded and its
	 * effects aren't visible until the bulk cursor is closed, so the plain
	 * API call is used: there's no need for an auto-commit transaction.
	 */
	CURSOR_API_CALL(cursor, session, insert, btree);

	/* Only row-store bulk loads take a key from the application. */
	if (btree->type == BTREE_ROW)
		WT_CURSOR_NEEDKEY(cursor);
	WT_CURSOR_NEEDVALUE(cursor);

	WT_ERR(__wt_bulk_insert(bulk_cursor));

err:	API_END(session);
	return (ret);
}
/*
 * __clsm_open_cursors --
 *	Open cursors for the current set of files.
 *
 *	clsm is the LSM cursor whose per-chunk cursor and Bloom filter arrays
 *	are brought up to date with the LSM tree's chunk array.  update says
 *	whether the caller is about to modify data: together with the session's
 *	isolation level and the cursor's overwrite flag it decides whether a
 *	full set of chunk cursors is needed (WT_CLSM_OPEN_READ) and whether
 *	per-chunk switch transaction IDs must be tracked
 *	(WT_CLSM_OPEN_SNAPSHOT).  start_chunk and start_id are only consulted
 *	for merge cursors (WT_CLSM_MERGE), to locate the first chunk of the
 *	merge in the tree's chunk array.
 *
 *	Returns 0 on success (including the case of a tree with no chunks) or
 *	the error from the first failing step.  The tree's read lock is taken
 *	here and is released before returning if it is still held.
 */
static int
__clsm_open_cursors(
    WT_CURSOR_LSM *clsm, bool update, u_int start_chunk, uint32_t start_id)
{
	WT_BTREE *btree;
	WT_CURSOR *c, **cp, *primary;
	WT_DECL_RET;
	WT_LSM_CHUNK *chunk;
	WT_LSM_TREE *lsm_tree;
	WT_SESSION_IMPL *session;
	WT_TXN *txn;
	const char *checkpoint, *ckpt_cfg[3];
	uint64_t saved_gen;
	u_int i, nchunks, ngood, nupdates;
	u_int close_range_end, close_range_start;
	bool locked;

	c = &clsm->iface;
	session = (WT_SESSION_IMPL *)c->session;
	txn = &session->txn;
	chunk = NULL;
	locked = false;
	lsm_tree = clsm->lsm_tree;

	/*
	 * Ensure that any snapshot update has cursors on the right set of
	 * chunks to guarantee visibility is correct.
	 */
	if (update && txn->isolation == WT_ISO_SNAPSHOT)
		F_SET(clsm, WT_CLSM_OPEN_SNAPSHOT);

	/*
	 * Query operations need a full set of cursors. Overwrite cursors
	 * do queries in service of updates.
	 */
	if (!update || !F_ISSET(c, WT_CURSTD_OVERWRITE))
		F_SET(clsm, WT_CLSM_OPEN_READ);

	/* A tree without chunks has nothing to open. */
	if (lsm_tree->nchunks == 0)
		return (0);

	/*
	 * Configuration used when a chunk cursor is opened on the chunk's
	 * checkpoint rather than on its live tree.
	 */
	ckpt_cfg[0] = WT_CONFIG_BASE(session, WT_SESSION_open_cursor);
	ckpt_cfg[1] = "checkpoint=" WT_CHECKPOINT ",raw";
	ckpt_cfg[2] = NULL;

	/*
	 * If the key is pointing to memory that is pinned by a chunk
	 * cursor, take a copy before closing cursors.
	 */
	if (F_ISSET(c, WT_CURSTD_KEY_INT))
		WT_CURSOR_NEEDKEY(c);

	F_CLR(clsm, WT_CLSM_ITERATE_NEXT | WT_CLSM_ITERATE_PREV);

	/* From here on, leave through the error label so the lock is dropped. */
	WT_RET(__wt_lsm_tree_readlock(session, lsm_tree));
	locked = true;

	/* Merge cursors have already figured out how many chunks they need. */
retry:	if (F_ISSET(clsm, WT_CLSM_MERGE)) {
		nchunks = clsm->nchunks;
		ngood = 0;

		/*
		 * We may have raced with another merge completing.  Check that
		 * we're starting at the right offset in the chunk array.
		 */
		if (start_chunk >= lsm_tree->nchunks ||
		    lsm_tree->chunk[start_chunk]->id != start_id) {
			for (start_chunk = 0;
			    start_chunk < lsm_tree->nchunks;
			    start_chunk++) {
				chunk = lsm_tree->chunk[start_chunk];
				if (chunk->id == start_id)
					break;
			}
			/* We have to find the start chunk: merge locked it. */
			WT_ASSERT(session, start_chunk < lsm_tree->nchunks);
		}

		WT_ASSERT(session, start_chunk + nchunks <= lsm_tree->nchunks);
	} else {
		nchunks = lsm_tree->nchunks;

		/*
		 * If we are only opening the cursor for updates, only open the
		 * primary chunk, plus any other chunks that might be required
		 * to detect snapshot isolation conflicts.
		 */
		if (F_ISSET(clsm, WT_CLSM_OPEN_SNAPSHOT))
			WT_ERR(__wt_realloc_def(session,
			    &clsm->txnid_alloc, nchunks,
			    &clsm->switch_txn));
		if (F_ISSET(clsm, WT_CLSM_OPEN_READ))
			ngood = nupdates = 0;
		else if (F_ISSET(clsm, WT_CLSM_OPEN_SNAPSHOT)) {
			/*
			 * Keep going until all updates in the next
			 * chunk are globally visible.  Copy the maximum
			 * transaction IDs into the cursor as we go.
			 */
			for (ngood = nchunks - 1, nupdates = 1;
			    ngood > 0;
			    ngood--, nupdates++) {
				chunk = lsm_tree->chunk[ngood - 1];
				clsm->switch_txn[ngood - 1] = chunk->switch_txn;
				if (__wt_txn_visible_all(
				    session, chunk->switch_txn))
					break;
			}
		} else {
			/* Update-only without snapshot: just the last chunk. */
			nupdates = 1;
			ngood = nchunks - 1;
		}

		/*
		 * Check how many cursors are already open.  On exit, ngood is
		 * the index of the first slot that needs a cursor (re)opened.
		 */
		for (cp = clsm->cursors + ngood;
		    ngood < clsm->nchunks && ngood < nchunks;
		    cp++, ngood++) {
			chunk = lsm_tree->chunk[ngood];

			/* If the cursor isn't open yet, we're done. */
			if (*cp == NULL)
				break;

			/* Easy case: the URIs don't match. */
			if (strcmp((*cp)->uri, chunk->uri) != 0)
				break;

			/* Make sure the checkpoint config matches. */
			checkpoint = ((WT_CURSOR_BTREE *)*cp)->
			    btree->dhandle->checkpoint;
			if (checkpoint == NULL &&
			    F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) &&
			    !chunk->empty)
				break;

			/* Make sure the Bloom config matches. */
			if (clsm->blooms[ngood] == NULL &&
			    F_ISSET(chunk, WT_LSM_CHUNK_BLOOM))
				break;
		}

		/*
		 * Spurious generation bump?  Every cursor already matches, so
		 * record the generation and leave with ret still zero.
		 */
		if (ngood == clsm->nchunks && clsm->nchunks == nchunks) {
			clsm->dsk_gen = lsm_tree->dsk_gen;
			goto err;
		}

		/*
		 * Close any cursors we no longer need.
		 *
		 * Drop the LSM tree lock while we do this: if the cache is
		 * full, we may block while closing a cursor.  Save the
		 * generation number and retry if it has changed under us.
		 */
		if (clsm->cursors != NULL && ngood < clsm->nchunks) {
			close_range_start = ngood;
			close_range_end = clsm->nchunks;
		} else if (!F_ISSET(clsm, WT_CLSM_OPEN_READ) && nupdates > 0 ) {
			/*
			 * Not reading: close everything below the last
			 * nupdates chunks.
			 */
			close_range_start = 0;
			close_range_end = WT_MIN(nchunks, clsm->nchunks);
			if (close_range_end > nupdates)
				close_range_end -= nupdates;
			else
				close_range_end = 0;
			WT_ASSERT(session, ngood >= close_range_end);
		} else {
			close_range_end = 0;
			close_range_start = 0;
		}
		if (close_range_end > close_range_start) {
			saved_gen = lsm_tree->dsk_gen;
			/*
			 * Clear the flag before unlocking so a failure below
			 * doesn't make the error path unlock a second time.
			 */
			locked = false;
			WT_ERR(__wt_lsm_tree_readunlock(session, lsm_tree));
			WT_ERR(__clsm_close_cursors(
			    clsm, close_range_start, close_range_end));
			WT_ERR(__wt_lsm_tree_readlock(session, lsm_tree));
			locked = true;
			if (lsm_tree->dsk_gen != saved_gen)
				goto retry;
		}

		/* Detach from our old primary. */
		clsm->primary_chunk = NULL;
		clsm->current = NULL;
	}

	/* Size the Bloom filter and cursor arrays for nchunks entries. */
	WT_ERR(__wt_realloc_def(session,
	    &clsm->bloom_alloc, nchunks, &clsm->blooms));
	WT_ERR(__wt_realloc_def(session,
	    &clsm->cursor_alloc, nchunks, &clsm->cursors));

	clsm->nchunks = nchunks;

	/* Open the cursors for chunks that have changed. */
	for (i = ngood, cp = clsm->cursors + i; i != nchunks; i++, cp++) {
		chunk = lsm_tree->chunk[i + start_chunk];
		/* Copy the maximum transaction ID. */
		if (F_ISSET(clsm, WT_CLSM_OPEN_SNAPSHOT))
			clsm->switch_txn[i] = chunk->switch_txn;

		/*
		 * Read from the checkpoint if the file has been written.
		 * Once all cursors switch, the in-memory tree can be evicted.
		 */
		WT_ASSERT(session, *cp == NULL);
		ret = __wt_open_cursor(session, chunk->uri, c,
		    (F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) && !chunk->empty) ?
		    ckpt_cfg : NULL, cp);

		/*
		 * XXX kludge: we may have an empty chunk where no checkpoint
		 * was written.  If so, try to open the ordinary handle on that
		 * chunk instead.
		 */
		if (ret == WT_NOTFOUND && F_ISSET(chunk, WT_LSM_CHUNK_ONDISK)) {
			ret = __wt_open_cursor(
			    session, chunk->uri, c, NULL, cp);
			/* Remember the chunk is empty for later opens. */
			if (ret == 0)
				chunk->empty = 1;
		}
		WT_ERR(ret);

		/*
		 * Setup all cursors other than the primary to only do conflict
		 * checks on insert operations. This allows us to execute
		 * inserts on non-primary chunks as a way of checking for
		 * write conflicts with concurrent updates.
		 */
		if (i != nchunks - 1)
			(*cp)->insert = __wt_curfile_update_check;

		/* Merge cursors don't open the chunks' Bloom filters. */
		if (!F_ISSET(clsm, WT_CLSM_MERGE) &&
		    F_ISSET(chunk, WT_LSM_CHUNK_BLOOM))
			WT_ERR(__wt_bloom_open(session, chunk->bloom_uri,
			    lsm_tree->bloom_bit_count,
			    lsm_tree->bloom_hash_count,
			    c, &clsm->blooms[i]));

		/* Child cursors always use overwrite and raw mode. */
		F_SET(*cp, WT_CURSTD_OVERWRITE | WT_CURSTD_RAW);
	}

	/*
	 * The last chunk is our new primary, provided it is not on disk and
	 * has no switch transaction ID yet.
	 */
	if (chunk != NULL &&
	    !F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) &&
	    chunk->switch_txn == WT_TXN_NONE) {
		clsm->primary_chunk = chunk;
		primary = clsm->cursors[clsm->nchunks - 1];
		/*
		 * Disable eviction for the in-memory chunk.  Also clear the
		 * bulk load flag here, otherwise eviction will be enabled by
		 * the first update.
		 */
		btree = ((WT_CURSOR_BTREE *)(primary))->btree;
		if (btree->bulk_load_ok) {
			btree->bulk_load_ok = false;
			WT_WITH_BTREE(session, btree,
			    __wt_btree_evictable(session, false));
		}
	}

	clsm->dsk_gen = lsm_tree->dsk_gen;

err:
#ifdef HAVE_DIAGNOSTIC
	/* Check that all cursors are open as expected. */
	if (ret == 0 && F_ISSET(clsm, WT_CLSM_OPEN_READ)) {
		for (i = 0, cp = clsm->cursors; i != clsm->nchunks; cp++, i++) {
			chunk = lsm_tree->chunk[i + start_chunk];

			/* Make sure the cursor is open. */
			WT_ASSERT(session, *cp != NULL);

			/* Easy case: the URIs should match. */
			WT_ASSERT(session,
			    strcmp((*cp)->uri, chunk->uri) == 0);

			/* Make sure the checkpoint config matches. */
			checkpoint = ((WT_CURSOR_BTREE *)*cp)->
			    btree->dhandle->checkpoint;
			WT_ASSERT(session,
			    (F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) &&
			    !chunk->empty) ?
			    checkpoint != NULL : checkpoint == NULL);

			/* Make sure the Bloom config matches. */
			WT_ASSERT(session,
			    (F_ISSET(chunk, WT_LSM_CHUNK_BLOOM) &&
			    !F_ISSET(clsm, WT_CLSM_MERGE)) ?
			    clsm->blooms[i] != NULL : clsm->blooms[i] == NULL);
		}
	}
#endif
	/* Release the tree lock if this function still holds it. */
	if (locked)
		WT_TRET(__wt_lsm_tree_readunlock(session, lsm_tree));
	return (ret);
}