/*
 * __wt_btcur_update --
 *	Update a record in the tree.
 *
 *	The key and value sizes are checked first, then the key is searched
 *	for and, if a usable record is found, the record is modified.  Returns
 *	0 on success, WT_NOTFOUND if the record does not exist, EINVAL if a
 *	fixed-length column-store value is not exactly one byte, or whatever
 *	error the size checks, the search or the modify call report.
 */
int
__wt_btcur_update(WT_CURSOR_BTREE *cbt)
{
	WT_BTREE *btree;
	WT_CURSOR *cursor;
	WT_SESSION_IMPL *session;
	int ret;

	btree = cbt->btree;
	cursor = &cbt->iface;
	session = (WT_SESSION_IMPL *)cursor->session;
	WT_BSTAT_INCR(session, cursor_updates);

	/* Row-store keys are size-checked; values are always size-checked. */
	if (btree->type == BTREE_ROW)
		WT_RET(__cursor_size_chk(session, &cursor->key));
	WT_RET(__cursor_size_chk(session, &cursor->value));

	/*
	 * Fixed-length column-store values must be exactly one byte.  Reject
	 * anything else here, before __cursor_func_init runs, so this failure
	 * returns the same way the size checks above do and never exits
	 * between __cursor_func_init and __cursor_func_resolve.  The check
	 * depends only on the caller's value, so there is no reason to repeat
	 * it on a restart.
	 */
	if (btree->type == BTREE_COL_FIX && cursor->value.size != 1)
		WT_RET_MSG(session, EINVAL,
		    "item size of %" PRIu32 " does not match "
		    "fixed-length file requirement of 1 byte",
		    cursor->value.size);

	/* A modify call that returns WT_RESTART sends us back here. */
retry:	__cursor_func_init(cbt, 1);

	switch (btree->type) {
	case BTREE_COL_FIX:
	case BTREE_COL_VAR:
		WT_ERR(__wt_col_search(session, cbt, 1));

		/*
		 * Update the record if it exists.  Creating a record past the
		 * end of the tree in a fixed-length column-store implicitly
		 * fills the gap with empty records.  Update the record in that
		 * case, the record exists.
		 *
		 * NOTE(review): the 3 passed to __wt_col_modify is presumably
		 * the insert/update operation code; confirm against the
		 * callee.
		 */
		if ((cbt->compare != 0 || __cursor_invalid(cbt)) &&
		    !__cursor_fix_implicit(btree, cbt))
			ret = WT_NOTFOUND;
		else if ((ret = __wt_col_modify(session, cbt, 3)) == WT_RESTART)
			goto retry;
		break;
	case BTREE_ROW:
		/* Update the record if it exists. */
		WT_ERR(__wt_row_search(session, cbt, 1));
		if (cbt->compare != 0 || __cursor_invalid(cbt))
			ret = WT_NOTFOUND;
		else if ((ret = __wt_row_modify(session, cbt, 0)) == WT_RESTART)
			goto retry;
		break;
	WT_ILLEGAL_VALUE(session);
	}

	/* Resolve the cursor with the final status of the operation. */
err:	__cursor_func_resolve(cbt, ret);

	return (ret);
}
/*
 * __wt_btcur_reset --
 *	Discard the cursor's current position; always returns 0.
 */
int
__wt_btcur_reset(WT_CURSOR_BTREE *cbt)
{
	WT_SESSION_IMPL *session = (WT_SESSION_IMPL *)cbt->iface.session;

	/* Count the reset in the statistics. */
	WT_BSTAT_INCR(session, cursor_resets);

	/* Re-initialize the cursor, then clear its saved search state. */
	__cursor_func_init(cbt, 1);
	__cursor_search_clear(cbt);

	return (0);
}
/*
 * __wt_btcur_remove --
 *	Delete a record from the tree.
 *
 *	Returns 0 on success, WT_NOTFOUND when there is no record to delete,
 *	or an error from the key size check, the search or the modify call.
 */
int
__wt_btcur_remove(WT_CURSOR_BTREE *cbt)
{
	WT_BTREE *btree;
	WT_CURSOR *cursor;
	WT_SESSION_IMPL *session;
	int ret;

	cursor = &cbt->iface;
	session = (WT_SESSION_IMPL *)cursor->session;
	btree = cbt->btree;
	WT_BSTAT_INCR(session, cursor_removes);

	/* Only row-store keys are size-checked. */
	if (btree->type == BTREE_ROW) {
		WT_RET(__cursor_size_chk(session, &cursor->key));
	}

	/* A modify call that returns WT_RESTART sends us back here. */
retry:	__cursor_func_init(cbt, 1);

	switch (btree->type) {
	case BTREE_COL_FIX:
	case BTREE_COL_VAR:
		WT_ERR(__wt_col_search(session, cbt, 1));

		if (cbt->compare == 0 && !__cursor_invalid(cbt)) {
			/* An exact, valid match: remove it. */
			ret = __wt_col_modify(session, cbt, 2);
			if (ret == WT_RESTART)
				goto retry;
		} else if (__cursor_fix_implicit(btree, cbt))
			/*
			 * Creating a record past the end of the tree in a
			 * fixed-length column-store implicitly fills the gap
			 * with empty records: report the delete as successful.
			 */
			ret = 0;
		else
			ret = WT_NOTFOUND;
		break;
	case BTREE_ROW:
		WT_ERR(__wt_row_search(session, cbt, 1));

		if (cbt->compare == 0 && !__cursor_invalid(cbt)) {
			/* An exact, valid match: remove it. */
			ret = __wt_row_modify(session, cbt, 1);
			if (ret == WT_RESTART)
				goto retry;
		} else
			ret = WT_NOTFOUND;
		break;
	WT_ILLEGAL_VALUE(session);
	}

	/* Resolve the cursor with the final status of the operation. */
err:	__cursor_func_resolve(cbt, ret);

	return (ret);
}
/*
 * __wt_btcur_search --
 *	Look up the record exactly matching the cursor's key.
 *
 *	Returns WT_NOTFOUND when no usable record matches, otherwise the status
 *	of the search and of returning the record to the cursor.
 */
int
__wt_btcur_search(WT_CURSOR_BTREE *cbt)
{
	WT_BTREE *btree;
	WT_CURSOR *cursor;
	WT_SESSION_IMPL *session;
	int ret;

	cursor = &cbt->iface;
	session = (WT_SESSION_IMPL *)cursor->session;
	btree = cbt->btree;
	WT_BSTAT_INCR(session, cursor_read);

	/* Only row-store keys are size-checked. */
	if (btree->type == BTREE_ROW) {
		WT_RET(__cursor_size_chk(session, &cursor->key));
	}

	__cursor_func_init(cbt, 1);

	/* Row-stores and column-stores have their own search functions. */
	if (btree->type == BTREE_ROW) {
		WT_ERR(__wt_row_search(session, cbt, 0));
	} else {
		WT_ERR(__wt_col_search(session, cbt, 0));
	}

	if (cbt->compare == 0 && !__cursor_invalid(cbt))
		/* An exact, valid match: hand it back through the cursor. */
		ret = __wt_kv_return(session, cbt, 0);
	else if (__cursor_fix_implicit(btree, cbt)) {
		/*
		 * Creating a record past the end of the tree in a fixed-length
		 * column-store implicitly fills the gap with empty records:
		 * return a single zero byte as the value.  ret is left as the
		 * search above set it.
		 */
		cbt->v = 0;
		cursor->value.data = &cbt->v;
		cursor->value.size = 1;
	} else
		ret = WT_NOTFOUND;

	/* Resolve the cursor with the final status of the operation. */
err:	__cursor_func_resolve(cbt, ret);

	return (ret);
}
/*
 * __hazard_exclusive --
 *	Try to get exclusive access to a page.
 *
 *	Returns 0 when the page is locked and no matching hazard reference was
 *	found, EBUSY when the page could not be locked or a hazard reference
 *	exists, or an error from growing the tracking array or from the
 *	verbose message.
 */
static int
__hazard_exclusive(WT_SESSION_IMPL *session, WT_REF *ref, int top)
{
	/*
	 * The tracking array is what lets the caller unlock during cleanup:
	 * when it is full, grow it by 50 slots before locking anything.
	 */
	if (session->excl_allocated == session->excl_next * sizeof(WT_REF *)) {
		WT_RET(__wt_realloc(session, &session->excl_allocated,
		    (session->excl_next + 50) * sizeof(WT_REF *),
		    &session->excl));
	}

	/*
	 * Hazard references are acquired down the tree, which means we can't
	 * deadlock.
	 *
	 * The top-level page should already be in the locked state; child
	 * pages in memory are locked here.  If the state can't be switched,
	 * another thread already has the page: give up.
	 */
	if (!top) {
		if (!WT_ATOMIC_CAS(ref->state, WT_REF_MEM, WT_REF_LOCKED))
			return (EBUSY);
	}
	WT_ASSERT(session, ref->state == WT_REF_LOCKED);

	/* Remember the reference in the session's tracking array. */
	session->excl[session->excl_next] = ref;
	++session->excl_next;

	/* A matching hazard reference means the request fails. */
	if (__wt_page_hazard_check(session, ref->page) != NULL) {
		WT_BSTAT_INCR(session, rec_hazard);
		WT_CSTAT_INCR(session, cache_evict_hazard);

		WT_VERBOSE_RET(
		    session, evict, "page %p hazard request failed", ref->page);
		return (EBUSY);
	}

	return (0);
}
/*
 * __wt_btcur_prev --
 *	Step the cursor back to the previous record in the tree.
 *
 *	Returns 0 when positioned on a record, WT_NOTFOUND when the tree walk
 *	yields no further page, or an error from the page-level or tree-walk
 *	calls.
 */
int
__wt_btcur_prev(WT_CURSOR_BTREE *cbt)
{
	WT_DECL_RET;
	WT_SESSION_IMPL *session;
	int newpage;

	session = (WT_SESSION_IMPL *)cbt->iface.session;
	WT_BSTAT_INCR(session, cursor_read_prev);

	__cursor_func_init(cbt, 0);

	/* Set up for backward iteration unless that's already been done. */
	if (!F_ISSET(cbt, WT_CBT_ITERATE_PREV))
		__wt_btcur_iterate_setup(cbt, 0);

	/*
	 * Consume whatever page is currently held until the page-level call
	 * reports not-found, then step to the previous leaf page; repeat until
	 * a record is returned, an error occurs or the walk runs out of pages.
	 * newpage is 0 only for the page held on entry.
	 */
	newpage = 0;
	for (;;) {
		/* Appended column-store entries are walked first. */
		if (F_ISSET(cbt, WT_CBT_ITERATE_APPEND)) {
			switch (cbt->page->type) {
			case WT_PAGE_COL_FIX:
				ret = __cursor_fix_append_prev(cbt, newpage);
				break;
			case WT_PAGE_COL_VAR:
				ret = __cursor_var_append_prev(cbt, newpage);
				break;
			WT_ILLEGAL_VALUE_ERR(session);
			}
			if (ret == 0)
				break;

			/* Finished with the append list, one way or another. */
			F_CLR(cbt, WT_CBT_ITERATE_APPEND);
			if (ret != WT_NOTFOUND)
				break;

			/* Start the page proper from scratch. */
			newpage = 1;
		}

		/* Then the entries of the page itself. */
		if (cbt->page != NULL) {
			switch (cbt->page->type) {
			case WT_PAGE_COL_FIX:
				ret = __cursor_fix_prev(cbt, newpage);
				break;
			case WT_PAGE_COL_VAR:
				ret = __cursor_var_prev(cbt, newpage);
				break;
			case WT_PAGE_ROW_LEAF:
				ret = __cursor_row_prev(cbt, newpage);
				break;
			WT_ILLEGAL_VALUE_ERR(session);
			}
			if (ret != WT_NOTFOUND)
				break;
		}

		/* Walk backward, skipping internal pages. */
		for (;;) {
			WT_ERR(__wt_tree_np(session, &cbt->page, 0, 0));
			WT_ERR_TEST(cbt->page == NULL, WT_NOTFOUND);
			if (cbt->page->type != WT_PAGE_COL_INT &&
			    cbt->page->type != WT_PAGE_ROW_INT)
				break;
		}

		/*
		 * The last page in a column-store has appended entries.
		 * We handle it separately from the usual cursor code:
		 * it's only that one page and it's in a simple format.
		 */
		if (cbt->page->type != WT_PAGE_ROW_LEAF) {
			cbt->ins_head = WT_COL_APPEND(cbt->page);
			if (cbt->ins_head != NULL)
				F_SET(cbt, WT_CBT_ITERATE_APPEND);
		}

		/* Every later pass is looking at a freshly acquired page. */
		newpage = 1;
	}

	/* Resolve the cursor with the final status of the operation. */
err:	__cursor_func_resolve(cbt, ret);

	return (ret);
}
/*
 * __wt_btcur_insert --
 *	Add a record to the tree.
 *
 *	Returns 0 on success, WT_DUPLICATE_KEY when the record already exists
 *	and WT_CURSTD_OVERWRITE is not set, or an error from the size checks,
 *	the search or the modify call.
 */
int
__wt_btcur_insert(WT_CURSOR_BTREE *cbt)
{
	WT_BTREE *btree;
	WT_CURSOR *cursor;
	WT_SESSION_IMPL *session;
	int ret;

	cursor = &cbt->iface;
	session = (WT_SESSION_IMPL *)cursor->session;
	btree = cbt->btree;
	WT_BSTAT_INCR(session, cursor_inserts);

	/* Row-store keys are size-checked; values are always size-checked. */
	if (btree->type == BTREE_ROW) {
		WT_RET(__cursor_size_chk(session, &cursor->key));
	}
	WT_RET(__cursor_size_chk(session, &cursor->value));

	/* A modify call that returns WT_RESTART sends us back here. */
retry:	__cursor_func_init(cbt, 1);

	switch (btree->type) {
	case BTREE_COL_FIX:
	case BTREE_COL_VAR:
		/*
		 * With WT_CURSTD_APPEND set a new record is inserted and the
		 * application's record number is ignored.  Search for the
		 * maximum possible record number so the search ends on the
		 * last page, then zero the record number: the real one is
		 * assigned by the serialized append operation
		 * (__wt_col_append_serial_func).
		 */
		if (F_ISSET(cursor, WT_CURSTD_APPEND))
			cursor->recno = UINT64_MAX;

		WT_ERR(__wt_col_search(session, cbt, 1));

		if (F_ISSET(cursor, WT_CURSTD_APPEND))
			cursor->recno = 0;

		/*
		 * Without WT_CURSTD_OVERWRITE an existing record is an error.
		 * A record exists if the search matched exactly and the entry
		 * is valid, or if the search did not match but the record is
		 * one of the empty records implicitly created by writing past
		 * the end of a fixed-length column-store.
		 *
		 * With WT_CURSTD_OVERWRITE, insert/update the key/value pair.
		 */
		if (!F_ISSET(cursor, WT_CURSTD_OVERWRITE) &&
		    (cbt->compare == 0 ?
		    !__cursor_invalid(cbt) :
		    __cursor_fix_implicit(btree, cbt))) {
			ret = WT_DUPLICATE_KEY;
			break;
		}

		ret = __wt_col_modify(session, cbt, 3);
		if (ret == WT_RESTART)
			goto retry;

		/* After a successful append, report the assigned number. */
		if (ret == 0 && F_ISSET(cursor, WT_CURSTD_APPEND))
			cursor->recno = cbt->recno;
		break;
	case BTREE_ROW:
		WT_ERR(__wt_row_search(session, cbt, 1));

		/*
		 * Without WT_CURSTD_OVERWRITE an exact match on a valid entry
		 * is an error; otherwise insert/update the key/value pair.
		 */
		if (cbt->compare == 0 &&
		    !__cursor_invalid(cbt) &&
		    !F_ISSET(cursor, WT_CURSTD_OVERWRITE)) {
			ret = WT_DUPLICATE_KEY;
			break;
		}

		ret = __wt_row_modify(session, cbt, 0);
		if (ret == WT_RESTART)
			goto retry;
		break;
	WT_ILLEGAL_VALUE(session);
	}

	/* Resolve the cursor with the final status of the operation. */
err:	__cursor_func_resolve(cbt, ret);

	return (ret);
}
/*
 * __wt_btcur_search_near --
 *	Position the cursor on the record matching the key, or on a
 *	neighboring record.
 *
 *	On the paths that find a record, *exact is set to 0 for an implicit
 *	fixed-length record, to the search's comparison result for a valid
 *	entry, to 1 after moving to the next record and to -1 after moving to
 *	the previous one.  Returns WT_NOTFOUND when neither direction yields a
 *	record.
 */
int
__wt_btcur_search_near(WT_CURSOR_BTREE *cbt, int *exact)
{
	WT_BTREE *btree;
	WT_CURSOR *cursor;
	WT_SESSION_IMPL *session;
	int ret;

	cursor = &cbt->iface;
	session = (WT_SESSION_IMPL *)cursor->session;
	btree = cbt->btree;
	WT_BSTAT_INCR(session, cursor_read_near);

	/* Only row-store keys are size-checked. */
	if (btree->type == BTREE_ROW) {
		WT_RET(__cursor_size_chk(session, &cursor->key));
	}

	__cursor_func_init(cbt, 1);

	if (btree->type == BTREE_ROW) {
		WT_ERR(__wt_row_search(session, cbt, 0));
	} else {
		WT_ERR(__wt_col_search(session, cbt, 0));
	}

	/*
	 * Creating a record past the end of the tree in a fixed-length column-
	 * store implicitly fills the gap with empty records.  Instantiate the
	 * empty record, it's an exact match.
	 */
	if (cbt->compare != 0 && __cursor_fix_implicit(btree, cbt)) {
		cbt->v = 0;
		cursor->value.data = &cbt->v;
		cursor->value.size = 1;
		*exact = 0;
		goto err;
	}

	/* A valid key (one that wasn't deleted) is returned as found. */
	if (!__cursor_invalid(cbt)) {
		*exact = cbt->compare;
		ret = __wt_kv_return(session, cbt, cbt->compare != 0);
		goto err;
	}

	/*
	 * We found a deleted key: try to move to the next key in the tree
	 * (bias for prefix searches).  Cursor next skips deleted records, so
	 * we don't have to test for them again.
	 */
	ret = __wt_btcur_next(cbt);
	if (ret != WT_NOTFOUND) {
		*exact = 1;
		goto err;
	}

	/*
	 * There's no larger tree key: redo the search and try to find an
	 * earlier record.  If that fails, quit, there's no record to return.
	 */
	if (btree->type == BTREE_ROW) {
		WT_ERR(__wt_row_search(session, cbt, 0));
	} else {
		WT_ERR(__wt_col_search(session, cbt, 0));
	}
	if (!__cursor_invalid(cbt)) {
		*exact = cbt->compare;
		ret = __wt_kv_return(session, cbt, cbt->compare != 0);
	} else {
		ret = __wt_btcur_prev(cbt);
		if (ret != WT_NOTFOUND)
			*exact = -1;
	}

	/* Resolve the cursor with the final status of the operation. */
err:	__cursor_func_resolve(cbt, ret);

	return (ret);
}
/*
 * __wt_block_read_off --
 *	Read an addr/size pair referenced block into a buffer.
 *
 *	Reads size bytes at offset from the block's file handle, optionally
 *	validates them against cksum, and leaves the (decompressed, if
 *	necessary) page image in buf.  Returns 0 on success, WT_ERROR on a
 *	checksum mismatch, or an error from the allocation, read, decompress
 *	or illegal-value calls.
 */
int
__wt_block_read_off(WT_SESSION_IMPL *session,
    WT_BLOCK *block, WT_ITEM *buf, off_t offset, uint32_t size, uint32_t cksum)
{
	WT_BLOCK_HEADER *blk;
	WT_DECL_ITEM(tmp);
	WT_DECL_RET;
	WT_PAGE_HEADER *dsk;
	size_t result_len;
	uint32_t page_cksum;

	WT_VERBOSE_RET(session, read,
	    "off %" PRIuMAX ", size %" PRIu32 ", cksum %" PRIu32,
	    (uintmax_t)offset, size, cksum);

#ifdef HAVE_DIAGNOSTIC
	/*
	 * In diagnostic mode, verify the block we're about to read isn't on
	 * either the available or discard lists.
	 *
	 * Don't check during salvage, it's possible we're reading an already
	 * freed overflow page.
	 */
	if (!F_ISSET(session, WT_SESSION_SALVAGE_QUIET_ERR))
		WT_RET(
		    __wt_block_misplaced(session, block, "read", offset, size));
#endif

	/*
	 * If we're compressing the file blocks, place the initial read into a
	 * scratch buffer, we're going to have to re-allocate more memory for
	 * decompression.  Else check the caller's buffer size and grow it as
	 * necessary, there will only be one buffer.
	 *
	 * Either way, dsk points at the memory the block is read into: the
	 * caller's buffer without a compressor, the scratch buffer with one.
	 */
	if (block->compressor == NULL) {
		F_SET(buf, WT_ITEM_ALIGNED);
		WT_RET(__wt_buf_init(session, buf, size));
		buf->size = size;
		dsk = buf->mem;
	} else {
		WT_RET(__wt_scr_alloc(session, size, &tmp));
		tmp->size = size;
		dsk = tmp->mem;
	}

	/* Read. */
	WT_ERR(__wt_read(session, block->fh, offset, size, dsk));
	blk = WT_BLOCK_HEADER_REF(dsk);

	/*
	 * Validate the checksum.  This is skipped unless checksums are
	 * configured for the block and neither the caller's checksum nor the
	 * one stored in the block header is WT_BLOCK_CHECKSUM_NOT_SET.
	 */
	if (block->checksum &&
	    cksum != WT_BLOCK_CHECKSUM_NOT_SET &&
	    blk->cksum != WT_BLOCK_CHECKSUM_NOT_SET) {
		/* The checksum is computed with the header's field zeroed. */
		blk->cksum = 0;
		page_cksum = __wt_cksum(dsk, size);
		/*
		 * A computed value equal to WT_BLOCK_CHECKSUM_NOT_SET is
		 * bumped by one so it can't be confused with the marker.
		 */
		if (page_cksum == WT_BLOCK_CHECKSUM_NOT_SET)
			++page_cksum;
		if (cksum != page_cksum) {
			/* Salvage can ask for mismatches not to be reported. */
			if (!F_ISSET(session, WT_SESSION_SALVAGE_QUIET_ERR))
				__wt_errx(session,
				    "read checksum error [%"
				    PRIu32 "B @ %" PRIuMAX ", %"
				    PRIu32 " != %" PRIu32 "]",
				    size, (uintmax_t)offset, cksum, page_cksum);
			WT_ERR(WT_ERROR);
		}
	}

	/*
	 * If the in-memory block size is larger than the on-disk block size,
	 * the block is compressed.  Size the user's buffer, copy the skipped
	 * bytes of the original image into place, then decompress.
	 *
	 * If the in-memory block size is less than or equal to the on-disk
	 * block size, the block is not compressed.
	 */
	if (blk->disk_size < dsk->size) {
		/* A compressed block without a compressor can't be read. */
		if (block->compressor == NULL)
			WT_ERR(__wt_illegal_value(session, block->name));

		WT_ERR(__wt_buf_init(session, buf, dsk->size));
		buf->size = dsk->size;

		/*
		 * Note the source length is NOT the number of compressed bytes,
		 * it's the length of the block we just read (minus the skipped
		 * bytes).  We don't store the number of compressed bytes: some
		 * compression engines need that length stored externally, they
		 * don't have markers in the stream to signal the end of the
		 * compressed bytes.  Those engines must store the compressed
		 * byte length somehow, see the snappy compression extension for
		 * an example.
		 */
		memcpy(buf->mem, tmp->mem, WT_BLOCK_COMPRESS_SKIP);
		WT_ERR(block->compressor->decompress(
		    block->compressor, &session->iface,
		    (uint8_t *)tmp->mem + WT_BLOCK_COMPRESS_SKIP,
		    tmp->size - WT_BLOCK_COMPRESS_SKIP,
		    (uint8_t *)buf->mem + WT_BLOCK_COMPRESS_SKIP,
		    dsk->size - WT_BLOCK_COMPRESS_SKIP,
		    &result_len));
		/* The decompressed length must match the header's size. */
		if (result_len != dsk->size - WT_BLOCK_COMPRESS_SKIP)
			WT_ERR(__wt_illegal_value(session, block->name));
	} else
		if (block->compressor == NULL)
			/* The block was read directly into the caller's buffer. */
			buf->size = dsk->size;
		else
			/*
			 * We guessed wrong: there was a compressor, but this
			 * block was not compressed, and now the page is in the
			 * wrong buffer and the buffer may be of the wrong size.
			 * This should be rare, why configure a compressor that
			 * doesn't work?  Allocate a buffer of the right size
			 * (we used a scratch buffer which might be large), and
			 * copy the data into place.
			 */
			WT_ERR(
			    __wt_buf_set(session, buf, tmp->data, dsk->size));

	WT_BSTAT_INCR(session, page_read);
	WT_CSTAT_INCR(session, block_read);

	/* Reached on success and on error: the scratch buffer is freed. */
err:	__wt_scr_free(&tmp);
	return (ret);
}