/*
 * __clsm_compare --
 *	WT_CURSOR->compare implementation for the LSM cursor type.
 */
static int
__clsm_compare(WT_CURSOR *a, WT_CURSOR *b, int *cmpp)
{
	WT_CURSOR_LSM *clsm;
	WT_DECL_RET;
	WT_SESSION_IMPL *session;
	int cmp;

	/* There's no need to sync with the LSM tree, avoid WT_LSM_ENTER. */
	clsm = (WT_CURSOR_LSM *)a;
	CURSOR_API_CALL(a, session, compare, NULL);

	/*
	 * The cursors are only comparable if they reference the same source;
	 * check that first, then require a key in each before comparing.
	 */
	if (strcmp(a->uri, b->uri) != 0)
		WT_ERR_MSG(session, EINVAL, "comparison method cursors must reference the same object");
	WT_CURSOR_NEEDKEY(a);
	WT_CURSOR_NEEDKEY(b);

	/* Compare the keys with the tree's collator, return through cmpp. */
	WT_ERR(WT_LEX_CMP(
	    session, clsm->lsm_tree->collator, &a->key, &b->key, cmp));
	*cmpp = cmp;

err:	API_END(session);
	return (ret);
}
/*
 * __split_verify_intl_key_order --
 *	Verify the key order on an internal page after a split, diagnostic only.
 */
static void
__split_verify_intl_key_order(WT_SESSION_IMPL *session, WT_PAGE *page)
{
	WT_BTREE *btree;
	WT_ITEM *next, _next, *last, _last, *tmp;
	WT_REF *ref;
	uint64_t recno;
	int cmp, first;

	btree = S2BT(session);

	switch (page->type) {
	case WT_PAGE_COL_INT:
		/* Column-store: record numbers must strictly increase. */
		recno = 0;
		WT_INTL_FOREACH_BEGIN(page, ref) {
			WT_ASSERT(session, ref->key.recno > recno);
			recno = ref->key.recno;
		} WT_INTL_FOREACH_END;
		break;
	case WT_PAGE_ROW_INT:
		/*
		 * Row-store: each key must sort strictly after the previous
		 * one.  Alternate two buffers so "last" always references
		 * the previously seen key.
		 */
		next = &_next;
		WT_CLEAR(_next);
		last = &_last;
		WT_CLEAR(_last);
		first = 1;
		WT_INTL_FOREACH_BEGIN(page, ref) {
			__wt_ref_key(page, ref, &next->data, &next->size);
			/*
			 * Skip the comparison when the previous key is empty
			 * (the 0th key on an internal page may be zero
			 * length); otherwise the previous key must compare
			 * strictly less than the current one.  Note the else
			 * clause belongs to the outer test: a misplaced brace
			 * previously bound it to "if (first)", so non-empty
			 * adjacent keys were never compared at all.
			 */
			if (last->size == 0) {
				if (first)
					first = 0;
			} else {
				(void)WT_LEX_CMP(session,
				    btree->collator, last, next, cmp);
				WT_ASSERT(session, cmp < 0);
			}
			tmp = last;
			last = next;
			next = tmp;
		} WT_INTL_FOREACH_END;
/*
 * __wt_bulk_insert --
 *	Bulk insert, called once per item.
 */
int
__wt_bulk_insert(WT_CURSOR_BULK *cbulk)
{
	WT_BTREE *btree;
	WT_CURSOR *cursor;
	WT_SESSION_IMPL *session;
	int cmp;

	session = (WT_SESSION_IMPL *)cbulk->cbt.iface.session;
	btree = S2BT(session);
	cursor = &cbulk->cbt.iface;

	switch (btree->type) {
	case BTREE_COL_FIX:
		WT_RET(__wt_rec_col_fix_bulk_insert(cbulk));
		break;
	case BTREE_COL_VAR:
		/*
		 * Extend the current RLE run when the new value matches the
		 * previous one; otherwise flush the finished run and start a
		 * new one.
		 *
		 * The RLE count doubles as a "first call" flag: it is set to
		 * 0 exactly once, before any value has been inserted, so a
		 * non-zero count means there is a previous value to compare
		 * against.
		 */
		if (cbulk->rle != 0 &&
		    cbulk->cmp.size == cursor->value.size &&
		    memcmp(cbulk->cmp.data,
		    cursor->value.data, cursor->value.size) == 0) {
			++cbulk->rle;
			break;
		}
		if (cbulk->rle != 0)
			WT_RET(__wt_rec_col_var_bulk_insert(cbulk));

		/* Remember this value and start a run of length 1. */
		WT_RET(__wt_buf_set(session,
		    &cbulk->cmp, cursor->value.data, cursor->value.size));
		cbulk->rle = 1;
		break;
	case BTREE_ROW:
		/*
		 * Except on the first call (detected by the RLE count still
		 * being 0, see above), check the new key sorts strictly after
		 * the previous one so the application can't accidentally
		 * corrupt the table.
		 */
		if (cbulk->rle != 0) {
			WT_RET(WT_LEX_CMP(session,
			    btree->collator, &cursor->key, &cbulk->cmp, cmp));
			if (cmp <= 0)
				return (__bulk_row_keycmp_err(cbulk));
		}

		/* Remember this key for the next ordering check. */
		WT_RET(__wt_buf_set(session,
		    &cbulk->cmp, cursor->key.data, cursor->key.size));
		cbulk->rle = 1;
		WT_RET(__wt_rec_row_bulk_insert(cbulk));
		break;
	WT_ILLEGAL_VALUE(session);
	}

	WT_STAT_FAST_DATA_INCR(session, cursor_insert_bulk);
	return (0);
}
/*
 * __verify_dsk_row --
 *	Walk a WT_PAGE_ROW_INT or WT_PAGE_ROW_LEAF disk page and verify it.
 */
static int
__verify_dsk_row(
    WT_SESSION_IMPL *session, const char *addr, const WT_PAGE_HEADER *dsk)
{
	WT_BM *bm;
	WT_BTREE *btree;
	WT_CELL *cell;
	WT_CELL_UNPACK *unpack, _unpack;
	WT_DECL_ITEM(current);
	WT_DECL_ITEM(last_ovfl);
	WT_DECL_ITEM(last_pfx);
	WT_DECL_RET;
	WT_ITEM *last;
	enum { FIRST, WAS_KEY, WAS_VALUE } last_cell_type;
	void *huffman;
	uint32_t cell_num, cell_type, i, key_cnt, prefix;
	uint8_t *end;
	int cmp;

	btree = S2BT(session);
	bm = btree->bm;
	unpack = &_unpack;
	/* Internal-page keys are never Huffman encoded. */
	huffman = dsk->type == WT_PAGE_ROW_INT ? NULL : btree->huffman_key;

	/*
	 * Two scratch buffers alternate as holders of the decoded current and
	 * previous keys; a third holds the last overflow key seen.
	 */
	WT_ERR(__wt_scr_alloc(session, 0, &current));
	WT_ERR(__wt_scr_alloc(session, 0, &last_pfx));
	WT_ERR(__wt_scr_alloc(session, 0, &last_ovfl));
	last = last_ovfl;

	end = (uint8_t *)dsk + dsk->mem_size;

	last_cell_type = FIRST;
	cell_num = 0;
	key_cnt = 0;
	WT_CELL_FOREACH(btree, dsk, cell, unpack, i) {
		++cell_num;

		/* Carefully unpack the cell. */
		if (__wt_cell_unpack_safe(NULL, cell, unpack, end) != 0) {
			ret = __err_cell_corrupted(session, cell_num, addr);
			goto err;
		}

		/* Check the raw and collapsed cell types. */
		WT_ERR(__err_cell_type(
		    session, cell_num, addr, unpack->raw, dsk->type));
		WT_ERR(__err_cell_type(
		    session, cell_num, addr, unpack->type, dsk->type));
		cell_type = unpack->type;

		/*
		 * Check ordering relationships between the WT_CELL entries.
		 * For row-store internal pages, check for:
		 *	two values in a row,
		 *	two keys in a row,
		 *	a value as the first cell on a page.
		 * For row-store leaf pages, check for:
		 *	two values in a row,
		 *	a value as the first cell on a page.
		 */
		switch (cell_type) {
		case WT_CELL_KEY:
		case WT_CELL_KEY_OVFL:
			++key_cnt;
			switch (last_cell_type) {
			case FIRST:
			case WAS_VALUE:
				break;
			case WAS_KEY:
				/* Adjacent keys are legal on leaf pages. */
				if (dsk->type == WT_PAGE_ROW_LEAF)
					break;
				WT_ERR_VRFY(session,
				    "cell %" PRIu32 " on page at %s is the "
				    "first of two adjacent keys",
				    cell_num - 1, addr);
			}
			last_cell_type = WAS_KEY;
			break;
		case WT_CELL_ADDR_DEL:
		case WT_CELL_ADDR_INT:
		case WT_CELL_ADDR_LEAF:
		case WT_CELL_ADDR_LEAF_NO:
		case WT_CELL_VALUE:
		case WT_CELL_VALUE_OVFL:
			switch (last_cell_type) {
			case FIRST:
				WT_ERR_VRFY(session,
				    "page at %s begins with a value", addr);
			case WAS_KEY:
				break;
			case WAS_VALUE:
				WT_ERR_VRFY(session,
				    "cell %" PRIu32 " on page at %s is the "
				    "first of two adjacent values",
				    cell_num - 1, addr);
			}
			last_cell_type = WAS_VALUE;
			break;
		}

		/* Check if any referenced item is entirely in the file. */
		switch (cell_type) {
		case WT_CELL_ADDR_DEL:
		case WT_CELL_ADDR_INT:
		case WT_CELL_ADDR_LEAF:
		case WT_CELL_ADDR_LEAF_NO:
		case WT_CELL_KEY_OVFL:
		case WT_CELL_VALUE_OVFL:
			if (!bm->addr_valid(bm,
			    session, unpack->data, unpack->size))
				goto eof;
			break;
		}

		/*
		 * Remaining checks are for key order and prefix compression.
		 * If this cell isn't a key, we're done, move to the next cell.
		 * If this cell is an overflow item, instantiate the key and
		 * compare it with the last key. Otherwise, we have to deal
		 * with prefix compression.
		 */
		switch (cell_type) {
		case WT_CELL_KEY:
			break;
		case WT_CELL_KEY_OVFL:
			WT_ERR(__wt_dsk_cell_data_ref(
			    session, dsk->type, unpack, current));
			goto key_compare;
		default:
			/* Not a key -- continue with the next cell. */
			continue;
		}

		/*
		 * Prefix compression checks.
		 *
		 * Confirm the first non-overflow key on a page has a zero
		 * prefix compression count.
		 */
		prefix = unpack->prefix;
		if (last_pfx->size == 0 && prefix != 0)
			WT_ERR_VRFY(session,
			    "the %" PRIu32 " key on page at %s is the first "
			    "non-overflow key on the page and has a non-zero "
			    "prefix compression value",
			    cell_num, addr);

		/* Confirm the prefix compression count is possible. */
		if (cell_num > 1 && prefix > last->size)
			WT_ERR_VRFY(session,
			    "key %" PRIu32 " on page at %s has a prefix "
			    "compression count of %" PRIu32
			    ", larger than the length of the previous key, %zu",
			    cell_num, addr, prefix, last->size);

		/*
		 * If Huffman decoding required, unpack the cell to build the
		 * key, then resolve the prefix. Else, we can do it faster
		 * internally because we don't have to shuffle memory around as
		 * much.
		 */
		if (huffman != NULL) {
			WT_ERR(__wt_dsk_cell_data_ref(
			    session, dsk->type, unpack, current));

			/*
			 * If there's a prefix, make sure there's enough buffer
			 * space, then shift the decoded data past the prefix
			 * and copy the prefix into place. Take care with the
			 * pointers: current->data may be pointing inside the
			 * buffer.
			 */
			if (prefix != 0) {
				WT_ERR(__wt_buf_grow(
				    session, current, prefix + current->size));
				memmove((uint8_t *)current->mem + prefix,
				    current->data, current->size);
				memcpy(current->mem, last->data, prefix);
				current->data = current->mem;
				current->size += prefix;
			}
		} else {
			/*
			 * Get the cell's data/length and make sure we have
			 * enough buffer space.
			 */
			WT_ERR(__wt_buf_init(
			    session, current, prefix + unpack->size));

			/* Copy the prefix then the data into place. */
			if (prefix != 0)
				memcpy(current->mem, last->data, prefix);
			memcpy((uint8_t *)current->mem + prefix,
			    unpack->data, unpack->size);
			current->size = prefix + unpack->size;
		}

key_compare:	/*
		 * Compare the current key against the last key.
		 *
		 * Be careful about the 0th key on internal pages: we only store
		 * the first byte and custom collators may not be able to handle
		 * truncated keys.
		 */
		if ((dsk->type == WT_PAGE_ROW_INT && cell_num > 3) ||
		    (dsk->type != WT_PAGE_ROW_INT && cell_num > 1)) {
			WT_ERR(WT_LEX_CMP(
			    session, btree->collator, last, current, cmp));
			if (cmp >= 0)
				WT_ERR_VRFY(session,
				    "the %" PRIu32 " and %" PRIu32 " keys on "
				    "page at %s are incorrectly sorted",
				    cell_num - 2, cell_num, addr);
		}

		/*
		 * Swap the buffers: last always references the last key entry,
		 * last_pfx and last_ovfl reference the last prefix-compressed
		 * and last overflow key entries. Current gets pointed to the
		 * buffer we're not using this time around, which is where the
		 * next key goes.
		 */
		last = current;
		if (cell_type == WT_CELL_KEY) {
			current = last_pfx;
			last_pfx = last;
		} else {
			current = last_ovfl;
			last_ovfl = last;
		}
		WT_ASSERT(session, last != current);
	}