/* * __truncate_table -- * WT_SESSION::truncate for a table. */ static int __truncate_table(WT_SESSION_IMPL *session, const char *name) { WT_BTREE *btree; WT_DECL_ITEM(namebuf); WT_DECL_RET; WT_TABLE *table; int i; WT_RET(__wt_schema_get_table(session, name, strlen(name), 0, &table)); WT_RET(__wt_scr_alloc(session, 0, &namebuf)); /* Truncate the column groups. */ for (i = 0; i < WT_COLGROUPS(table); i++) { /* * Get an exclusive lock on the handle: it will be released by * __wt_conn_btree_close_all. */ WT_ERR(__wt_session_get_btree(session, table->cgroups[i]->source, NULL, NULL, WT_BTREE_EXCLUSIVE)); btree = session->btree; WT_ERR(__wt_buf_set( session, namebuf, btree->name, strlen(btree->name) + 1)); WT_ERR(__truncate_file(session, namebuf->data)); } /* Truncate the indices. */ WT_ERR(__wt_schema_open_indices(session, table)); for (i = 0; i < table->nindices; i++) { /* * Get an exclusive lock on the handle: it will be released by * __wt_conn_btree_close_all. */ WT_ERR(__wt_session_get_btree(session, table->indices[i]->source, NULL, NULL, WT_BTREE_EXCLUSIVE)); btree = session->btree; WT_ERR(__wt_buf_set( session, namebuf, btree->name, strlen(btree->name) + 1)); WT_ERR(__truncate_file(session, namebuf->data)); } table->idx_complete = 0; /* Reopen the column groups. */ ret = __wt_schema_open_colgroups(session, table); err: __wt_scr_free(&namebuf); return (ret); }
/* * __wt_txn_truncate_log -- * Begin truncating a range of a file. */ int __wt_txn_truncate_log( WT_SESSION_IMPL *session, WT_CURSOR_BTREE *start, WT_CURSOR_BTREE *stop) { WT_BTREE *btree; WT_ITEM *item; WT_TXN_OP *op; btree = S2BT(session); WT_RET(__txn_next_op(session, &op)); if (btree->type == BTREE_ROW) { op->type = WT_TXN_OP_TRUNCATE_ROW; op->u.truncate_row.mode = WT_TXN_TRUNC_ALL; WT_CLEAR(op->u.truncate_row.start); WT_CLEAR(op->u.truncate_row.stop); if (start != NULL) { op->u.truncate_row.mode = WT_TXN_TRUNC_START; item = &op->u.truncate_row.start; WT_RET(__wt_cursor_get_raw_key(&start->iface, item)); WT_RET(__wt_buf_set( session, item, item->data, item->size)); } if (stop != NULL) { op->u.truncate_row.mode = (op->u.truncate_row.mode == WT_TXN_TRUNC_ALL) ? WT_TXN_TRUNC_STOP : WT_TXN_TRUNC_BOTH; item = &op->u.truncate_row.stop; WT_RET(__wt_cursor_get_raw_key(&stop->iface, item)); WT_RET(__wt_buf_set( session, item, item->data, item->size)); } } else { op->type = WT_TXN_OP_TRUNCATE_COL; op->u.truncate_col.start = (start == NULL) ? WT_RECNO_OOB : start->recno; op->u.truncate_col.stop = (stop == NULL) ? WT_RECNO_OOB : stop->recno; } /* Write that operation into the in-memory log. */ WT_RET(__wt_txn_log_op(session, NULL)); WT_ASSERT(session, !F_ISSET(session, WT_SESSION_LOGGING_INMEM)); F_SET(session, WT_SESSION_LOGGING_INMEM); return (0); }
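/*
 * For context, a minimal public-API sketch of the range truncate that
 * drives the logging above. The table name and keys are illustrative
 * and error paths are abbreviated; passing both cursors corresponds to
 * WT_TXN_TRUNC_BOTH, a NULL start or stop cursor to the STOP/START
 * modes, and a URI with no cursors to WT_TXN_TRUNC_ALL.
 */
#include <wiredtiger.h>

static int
truncate_range(WT_SESSION *session)
{
	WT_CURSOR *start, *stop;
	int ret;

	if ((ret = session->open_cursor(
	    session, "table:example", NULL, NULL, &start)) != 0)
		return (ret);
	if ((ret = session->open_cursor(
	    session, "table:example", NULL, NULL, &stop)) != 0)
		return (ret);

	/* Position both cursors; the truncated range is inclusive. */
	start->set_key(start, "key020");
	if ((ret = start->search(start)) != 0)
		return (ret);
	stop->set_key(stop, "key080");
	if ((ret = stop->search(stop)) != 0)
		return (ret);

	return (session->truncate(session, NULL, start, stop, NULL));
}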
/* * __curlog_op_read -- * Read out any key/value from an individual operation record * in the log. We're only interested in put and remove operations * since truncate is not a cursor operation. All successful * returns from this function will have set up the cursor copy of * key and value to give the user. */ static int __curlog_op_read(WT_SESSION_IMPL *session, WT_CURSOR_LOG *cl, uint32_t optype, uint32_t opsize, uint32_t *fileid) { WT_ITEM key, value; uint64_t recno; const uint8_t *end, *pp; pp = cl->stepp; end = pp + opsize; switch (optype) { case WT_LOGOP_COL_PUT: WT_RET(__wt_logop_col_put_unpack(session, &pp, end, fileid, &recno, &value)); WT_RET(__wt_buf_set(session, cl->opkey, &recno, sizeof(recno))); WT_RET(__wt_buf_set(session, cl->opvalue, value.data, value.size)); break; case WT_LOGOP_COL_REMOVE: WT_RET(__wt_logop_col_remove_unpack(session, &pp, end, fileid, &recno)); WT_RET(__wt_buf_set(session, cl->opkey, &recno, sizeof(recno))); WT_RET(__wt_buf_set(session, cl->opvalue, NULL, 0)); break; case WT_LOGOP_ROW_PUT: WT_RET(__wt_logop_row_put_unpack(session, &pp, end, fileid, &key, &value)); WT_RET(__wt_buf_set(session, cl->opkey, key.data, key.size)); WT_RET(__wt_buf_set(session, cl->opvalue, value.data, value.size)); break; case WT_LOGOP_ROW_REMOVE: WT_RET(__wt_logop_row_remove_unpack(session, &pp, end, fileid, &key)); WT_RET(__wt_buf_set(session, cl->opkey, key.data, key.size)); WT_RET(__wt_buf_set(session, cl->opvalue, NULL, 0)); break; default: /* * Any other operations return the record in the value * and an empty key. */ *fileid = 0; WT_RET(__wt_buf_set(session, cl->opkey, NULL, 0)); WT_RET(__wt_buf_set(session, cl->opvalue, cl->stepp, opsize)); } return (0); }
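/*
 * A sketch of the public "log:" cursor this function supports, patterned
 * on the WiredTiger ex_log.c example: put/remove operations carry a key
 * and value, any other operation type comes back with an empty key and
 * the raw record as the value. Assumes the connection was opened with
 * log=(enabled); error handling is abbreviated.
 */
#include <inttypes.h>
#include <stdio.h>
#include <wiredtiger.h>

static int
scan_log(WT_SESSION *session)
{
	WT_CURSOR *cursor;
	WT_ITEM logrec_key, logrec_value;
	uint64_t txnid;
	uint32_t fileid, lsn_file, lsn_offset, opcount, optype, rectype;
	int ret;

	if ((ret = session->open_cursor(
	    session, "log:", NULL, NULL, &cursor)) != 0)
		return (ret);
	while ((ret = cursor->next(cursor)) == 0) {
		(void)cursor->get_key(
		    cursor, &lsn_file, &lsn_offset, &opcount);
		(void)cursor->get_value(cursor, &txnid,
		    &rectype, &optype, &fileid, &logrec_key, &logrec_value);
		printf("LSN [%" PRIu32 "][%" PRIu32 "].%" PRIu32
		    ": txn %" PRIu64 ", rectype %" PRIu32 ", optype %" PRIu32
		    ", fileid %" PRIu32 ", key %zu bytes, value %zu bytes\n",
		    lsn_file, lsn_offset, opcount, txnid, rectype, optype,
		    fileid, logrec_key.size, logrec_value.size);
	}
	return (cursor->close(cursor));
}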
/* * __curfile_remove -- * WT_CURSOR->remove method for the btree cursor type. */ static int __curfile_remove(WT_CURSOR *cursor) { WT_CURSOR_BTREE *cbt; WT_DECL_RET; WT_SESSION_IMPL *session; cbt = (WT_CURSOR_BTREE *)cursor; CURSOR_UPDATE_API_CALL(cursor, session, remove, cbt->btree); WT_CURSOR_NEEDKEY(cursor); WT_CURSOR_NOVALUE(cursor); WT_BTREE_CURSOR_SAVE_AND_RESTORE(cursor, __wt_btcur_remove(cbt), ret); /* * After a successful remove, copy the key: the value is not available. */ if (ret == 0) { if (F_ISSET(cursor, WT_CURSTD_KEY_INT) && !WT_DATA_IN_ITEM(&(cursor)->key)) { WT_ERR(__wt_buf_set(session, &cursor->key, cursor->key.data, cursor->key.size)); F_CLR(cursor, WT_CURSTD_KEY_INT); F_SET(cursor, WT_CURSTD_KEY_EXT); } F_CLR(cursor, WT_CURSTD_VALUE_SET); } err: CURSOR_UPDATE_API_END(session, ret); return (ret); }
/* * __curmetadata_setkv -- * Copy key/value into the public cursor, stripping internal metadata for * "create-only" cursors. */ static int __curmetadata_setkv(WT_CURSOR_METADATA *mdc, WT_CURSOR *fc) { WT_CURSOR *c; WT_DECL_RET; WT_SESSION_IMPL *session; char *value; c = &mdc->iface; session = (WT_SESSION_IMPL *)c->session; c->key.data = fc->key.data; c->key.size = fc->key.size; if (F_ISSET(mdc, WT_MDC_CREATEONLY)) { WT_RET(__schema_create_strip(session, fc->value.data, &value)); ret = __wt_buf_set( session, &c->value, value, strlen(value) + 1); __wt_free(session, value); WT_RET(ret); } else { c->value.data = fc->value.data; c->value.size = fc->value.size; } F_SET(c, WT_CURSTD_KEY_EXT | WT_CURSTD_VALUE_EXT); F_CLR(mdc, WT_MDC_ONMETADATA); F_SET(mdc, WT_MDC_POSITIONED); return (0); }
/* * __curmetadata_metadata_search -- * Retrieve the metadata for the metadata table */ static int __curmetadata_metadata_search(WT_SESSION_IMPL *session, WT_CURSOR *cursor) { WT_CURSOR_METADATA *mdc; WT_DECL_RET; const char *value; mdc = (WT_CURSOR_METADATA *)cursor; /* The metadata search interface allocates a new string in value. */ WT_RET(__wt_metadata_search(session, WT_METADATA_URI, &value)); /* * Copy the value to the underlying btree cursor's tmp item which will * be freed when the cursor is closed. */ if (F_ISSET(mdc, WT_MDC_TMP_USED)) __wt_buf_free(session, &mdc->tmp_val); ret = __wt_buf_set(session, &mdc->tmp_val, value, strlen(value)); __wt_free(session, value); WT_RET(ret); cursor->key.data = WT_METADATA_URI; cursor->key.size = strlen(WT_METADATA_URI); cursor->value.data = mdc->tmp_val.data; cursor->value.size = mdc->tmp_val.size; F_SET(mdc, WT_MDC_ONMETADATA | WT_MDC_POSITIONED | WT_MDC_TMP_USED); F_SET(cursor, WT_CURSTD_KEY_EXT | WT_CURSTD_VALUE_EXT); return (0); }
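/*
 * The public-API equivalent, as a sketch: a metadata cursor searched for
 * the metadata table's own entry ("metadata:"). Opening "metadata:create"
 * instead would return the stripped, create-only configuration. Error
 * handling is abbreviated.
 */
#include <stdio.h>
#include <wiredtiger.h>

static int
print_metadata_entry(WT_SESSION *session)
{
	WT_CURSOR *cursor;
	const char *value;
	int ret;

	if ((ret = session->open_cursor(
	    session, "metadata:", NULL, NULL, &cursor)) != 0)
		return (ret);
	cursor->set_key(cursor, "metadata:");
	if ((ret = cursor->search(cursor)) == 0) {
		(void)cursor->get_value(cursor, &value);
		printf("metadata: %s\n", value);
	}
	return (cursor->close(cursor));
}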
/* * __ckpt_load -- * Load a single checkpoint's information into a WT_CKPT structure. */ static int __ckpt_load(WT_SESSION_IMPL *session, WT_CONFIG_ITEM *k, WT_CONFIG_ITEM *v, WT_CKPT *ckpt) { WT_CONFIG_ITEM a; char timebuf[64]; /* * Copy the name, address (raw and hex), order and time into the slot. * If there's no address, it's a fake. */ WT_RET(__wt_strndup(session, k->str, k->len, &ckpt->name)); WT_RET(__wt_config_subgets(session, v, "addr", &a)); WT_RET(__wt_buf_set(session, &ckpt->addr, a.str, a.len)); if (a.len == 0) F_SET(ckpt, WT_CKPT_FAKE); else WT_RET(__wt_nhex_to_raw(session, a.str, a.len, &ckpt->raw)); WT_RET(__wt_config_subgets(session, v, "order", &a)); if (a.len == 0) goto format; ckpt->order = a.val; WT_RET(__wt_config_subgets(session, v, "time", &a)); if (a.len == 0 || a.len > sizeof(timebuf) - 1) goto format; memcpy(timebuf, a.str, a.len); timebuf[a.len] = '\0'; if (sscanf(timebuf, "%" SCNuMAX, &ckpt->sec) != 1) goto format; WT_RET(__wt_config_subgets(session, v, "size", &a)); ckpt->ckpt_size = (uint64_t)a.val; WT_RET(__wt_config_subgets(session, v, "write_gen", &a)); if (a.len == 0) goto format; /* * The largest value a WT_CONFIG_ITEM can handle is signed: this value * appears on disk and I don't want to sign it there, so I'm casting it * here instead. */ ckpt->write_gen = (uint64_t)a.val; return (0); format: WT_RET_MSG(session, WT_ERROR, "corrupted checkpoint list"); }
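/*
 * An illustrative (not captured-from-disk) checkpoint entry of the shape
 * this function parses, and a standalone demonstration of the bounded
 * sscanf pattern used for the "time" field:
 *
 *	WiredTigerCheckpoint.3=(addr="0181...",order=3,time=1390000000,
 *	    size=8192,write_gen=7)
 */
#include <inttypes.h>
#include <stdio.h>
#include <string.h>

int
main(void)
{
	char timebuf[64];
	uintmax_t sec;
	const char *time_cfg = "1390000000";	/* the "time" substring */
	size_t len = strlen(time_cfg);

	/* Reject empty or over-long values, as the caller does. */
	if (len == 0 || len > sizeof(timebuf) - 1)
		return (1);
	memcpy(timebuf, time_cfg, len);
	timebuf[len] = '\0';
	if (sscanf(timebuf, "%" SCNuMAX, &sec) != 1)
		return (1);
	printf("checkpoint time %" PRIuMAX "\n", sec);
	return (0);
}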
/* * __async_set_key -- * WT_ASYNC_OP->set_key implementation for op handles. */ static void __async_set_key(WT_ASYNC_OP *asyncop, ...) { WT_CURSOR *c; va_list ap; c = &asyncop->c; va_start(ap, asyncop); __wt_cursor_set_keyv(c, c->flags, ap); if (!WT_DATA_IN_ITEM(&c->key) && !WT_CURSOR_RECNO(c)) c->saved_err = __wt_buf_set( O2S((WT_ASYNC_OP_IMPL *)asyncop), &c->key, c->key.data, c->key.size); va_end(ap); }
/* * __async_set_value -- * WT_ASYNC_OP->set_value implementation for op handles. */ static void __async_set_value(WT_ASYNC_OP *asyncop, ...) { WT_CURSOR *c; va_list ap; c = &asyncop->c; va_start(ap, asyncop); __wt_cursor_set_valuev(c, ap); /* Copy the data, if it is pointing at data elsewhere. */ if (!WT_DATA_IN_ITEM(&c->value)) c->saved_err = __wt_buf_set( O2S((WT_ASYNC_OP_IMPL *)asyncop), &c->value, c->value.data, c->value.size); va_end(ap); }
/* * __curlog_logrec -- * Callback function from log_scan to get a log record. */ static int __curlog_logrec(WT_SESSION_IMPL *session, WT_ITEM *logrec, WT_LSN *lsnp, WT_LSN *next_lsnp, void *cookie, int firstrecord) { WT_CURSOR_LOG *cl; cl = cookie; WT_UNUSED(firstrecord); /* Set up the LSNs and take a copy of the log record for the cursor. */ *cl->cur_lsn = *lsnp; *cl->next_lsn = *next_lsnp; WT_RET(__wt_buf_set(session, cl->logrec, logrec->data, logrec->size)); /* * Read the log header. Set up the step pointers to walk the * operations inside the record. Get the record type. */ cl->stepp = WT_LOG_SKIP_HEADER(cl->logrec->data); cl->stepp_end = (uint8_t *)cl->logrec->data + logrec->size; WT_RET(__wt_logrec_read(session, &cl->stepp, cl->stepp_end, &cl->rectype)); /* A step count of 0 means the entire record. */ cl->step_count = 0; /* * Unpack the txnid so that we can return each * individual operation for this txnid. */ if (cl->rectype == WT_LOGREC_COMMIT) WT_RET(__wt_vunpack_uint(&cl->stepp, WT_PTRDIFF(cl->stepp_end, cl->stepp), &cl->txnid)); else { /* * Step over anything else. * Setting stepp to NULL causes the next() * method to read a new record on the next call. */ cl->stepp = NULL; cl->txnid = 0; } return (0); }
/* * __curbulk_insert_row -- * Row-store bulk cursor insert, with key-sort checks. */ static int __curbulk_insert_row(WT_CURSOR *cursor) { WT_BTREE *btree; WT_CURSOR_BULK *cbulk; WT_DECL_RET; WT_SESSION_IMPL *session; int cmp; cbulk = (WT_CURSOR_BULK *)cursor; btree = cbulk->cbt.btree; /* * Bulk cursor inserts are updates, but don't need auto-commit * transactions because they are single-threaded and not visible * until the bulk cursor is closed. */ CURSOR_API_CALL(cursor, session, insert, btree); WT_STAT_FAST_DATA_INCR(session, cursor_insert_bulk); WT_CURSOR_CHECKKEY(cursor); WT_CURSOR_CHECKVALUE(cursor); /* * If this isn't the first key inserted, compare it against the last key * to ensure the application doesn't accidentally corrupt the table. */ if (!cbulk->first_insert) { WT_ERR(__wt_compare(session, btree->collator, &cursor->key, &cbulk->last, &cmp)); if (cmp <= 0) WT_ERR(__bulk_row_keycmp_err(cbulk)); } else cbulk->first_insert = false; /* Save a copy of the key for the next comparison. */ WT_ERR(__wt_buf_set(session, &cbulk->last, cursor->key.data, cursor->key.size)); ret = __wt_bulk_insert_row(session, cbulk); err: API_END_RET(session, ret); }
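/*
 * The application side of this path, as a sketch: a cursor opened with
 * the "bulk" configuration must receive keys in strictly increasing
 * order or the insert fails with the key-order error built above. The
 * table name is illustrative; error handling is abbreviated.
 */
#include <stdio.h>
#include <wiredtiger.h>

static int
bulk_load(WT_SESSION *session)
{
	WT_CURSOR *cursor;
	char key[16], value[16];
	int i, ret;

	if ((ret = session->create(session,
	    "table:bulk", "key_format=S,value_format=S")) != 0)
		return (ret);
	if ((ret = session->open_cursor(
	    session, "table:bulk", NULL, "bulk", &cursor)) != 0)
		return (ret);
	for (i = 0; i < 100; ++i) {
		(void)snprintf(key, sizeof(key), "key%03d", i);
		(void)snprintf(value, sizeof(value), "value%03d", i);
		cursor->set_key(cursor, key);
		cursor->set_value(cursor, value);
		/* Out-of-order keys fail here with EINVAL. */
		if ((ret = cursor->insert(cursor)) != 0)
			return (ret);
	}
	return (cursor->close(cursor));
}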
/* * __wt_cursor_key_order_init -- * Initialize key ordering checks for cursor movements after a successful * search. */ int __wt_cursor_key_order_init(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt) { /* * Cursor searches set the position for cursor movements, set the * last-key value for diagnostic checking. */ switch (cbt->ref->page->type) { case WT_PAGE_COL_FIX: case WT_PAGE_COL_VAR: cbt->lastrecno = cbt->recno; return (0); case WT_PAGE_ROW_LEAF: return (__wt_buf_set(session, cbt->lastkey, cbt->iface.key.data, cbt->iface.key.size)); WT_ILLEGAL_VALUE(session); } /* NOTREACHED */ }
/* * __cursor_key_order_check_row -- * Check key ordering for row-store cursor movements. */ static int __cursor_key_order_check_row( WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, bool next) { WT_BTREE *btree; WT_ITEM *key; WT_DECL_RET; WT_DECL_ITEM(a); WT_DECL_ITEM(b); int cmp; btree = S2BT(session); key = &cbt->iface.key; cmp = 0; /* -Werror=maybe-uninitialized */ if (cbt->lastkey->size != 0) WT_RET(__wt_compare( session, btree->collator, cbt->lastkey, key, &cmp)); if (cbt->lastkey->size == 0 || (next && cmp < 0) || (!next && cmp > 0)) return (__wt_buf_set(session, cbt->lastkey, cbt->iface.key.data, cbt->iface.key.size)); WT_ERR(__wt_scr_alloc(session, 512, &a)); WT_ERR(__wt_scr_alloc(session, 512, &b)); WT_PANIC_ERR(session, EINVAL, "WT_CURSOR.%s out-of-order returns: returned key %s then key %s", next ? "next" : "prev", __wt_buf_set_printable( session, cbt->lastkey->data, cbt->lastkey->size, a), __wt_buf_set_printable(session, key->data, key->size, b)); err: __wt_scr_free(session, &a); __wt_scr_free(session, &b); return (ret); }
/* * __wt_bulk_insert -- * Bulk insert, called once per item. */ int __wt_bulk_insert(WT_CURSOR_BULK *cbulk) { WT_BTREE *btree; WT_CURSOR *cursor; WT_SESSION_IMPL *session; int cmp; session = (WT_SESSION_IMPL *)cbulk->cbt.iface.session; btree = S2BT(session); cursor = &cbulk->cbt.iface; switch (btree->type) { case BTREE_COL_FIX: WT_RET(__wt_rec_col_fix_bulk_insert(cbulk)); break; case BTREE_COL_VAR: /* * If this isn't the first value inserted, compare it against * the last value and increment the RLE count. * * Instead of a "first time" variable, I'm using the RLE count, * because it is set to 0 exactly once, the first time through * the code. */ if (cbulk->rle != 0) { if (cbulk->cmp.size == cursor->value.size && memcmp(cbulk->cmp.data, cursor->value.data, cursor->value.size) == 0) { ++cbulk->rle; break; } WT_RET(__wt_rec_col_var_bulk_insert(cbulk)); } WT_RET(__wt_buf_set(session, &cbulk->cmp, cursor->value.data, cursor->value.size)); cbulk->rle = 1; break; case BTREE_ROW: /* * If this isn't the first value inserted, compare it against * the last key to ensure the application doesn't accidentally * corrupt the table. * * Instead of a "first time" variable, I'm using the RLE count, * because it is set to 0 exactly once, the first time through * the code. */ if (cbulk->rle != 0) { WT_RET(WT_LEX_CMP(session, btree->collator, &cursor->key, &cbulk->cmp, cmp)); if (cmp <= 0) return (__bulk_row_keycmp_err(cbulk)); } WT_RET(__wt_buf_set(session, &cbulk->cmp, cursor->key.data, cursor->key.size)); cbulk->rle = 1; WT_RET(__wt_rec_row_bulk_insert(cbulk)); break; WT_ILLEGAL_VALUE(session); } WT_STAT_FAST_DATA_INCR(session, cursor_insert_bulk); return (0); }
/* * __wt_kv_return -- * Return a page referenced key/value pair to the application. */ int __wt_kv_return(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt) { WT_BTREE *btree; WT_CELL *cell; WT_CELL_UNPACK unpack; WT_CURSOR *cursor; WT_DECL_RET; WT_PAGE *page; WT_ROW *rip; WT_UPDATE *upd; uint8_t v; btree = S2BT(session); page = cbt->page; cursor = &cbt->iface; switch (page->type) { case WT_PAGE_COL_FIX: /* * The interface cursor's record has usually been set, but that * isn't universally true, specifically, cursor.search_near may * call here without first setting the interface cursor. */ cursor->recno = cbt->recno; /* * If the cursor references a WT_INSERT item, take the related * WT_UPDATE item. */ if (cbt->ins != NULL && (upd = __wt_txn_read(session, cbt->ins->upd)) != NULL) { cursor->value.data = WT_UPDATE_DATA(upd); cursor->value.size = upd->size; return (0); } v = __bit_getv_recno(page, cbt->iface.recno, btree->bitcnt); return (__wt_buf_set(session, &cursor->value, &v, 1)); case WT_PAGE_COL_VAR: /* * The interface cursor's record has usually been set, but that * isn't universally true, specifically, cursor.search_near may * call here without first setting the interface cursor. */ cursor->recno = cbt->recno; /* * If the cursor references a WT_INSERT item, take the related * WT_UPDATE item. */ if (cbt->ins != NULL && (upd = __wt_txn_read(session, cbt->ins->upd)) != NULL) { cursor->value.data = WT_UPDATE_DATA(upd); cursor->value.size = upd->size; return (0); } cell = WT_COL_PTR(page, &page->u.col_var.d[cbt->slot]); break; case WT_PAGE_ROW_LEAF: rip = &page->u.row.d[cbt->slot]; /* * If the cursor references a WT_INSERT item, take the key and * related WT_UPDATE item. Otherwise, take the key from the * original page, and the value from any related WT_UPDATE item, * or the page if the key was never updated. */ if (cbt->ins != NULL && (upd = __wt_txn_read(session, cbt->ins->upd)) != NULL) { cursor->key.data = WT_INSERT_KEY(cbt->ins); cursor->key.size = WT_INSERT_KEY_SIZE(cbt->ins); } else { WT_RET( __wt_row_key(session, page, rip, &cursor->key, 0)); upd = __wt_txn_read(session, WT_ROW_UPDATE(page, rip)); } if (upd != NULL) { cursor->value.data = WT_UPDATE_DATA(upd); cursor->value.size = upd->size; return (0); } /* Take the original cell (which may be empty). */ if ((cell = __wt_row_value(page, rip)) == NULL) { cursor->value.size = 0; return (0); } break; WT_ILLEGAL_VALUE(session); } /* The value is an on-page cell, unpack and expand it as necessary. */ __wt_cell_unpack(cell, &unpack); ret = __wt_cell_unpack_ref(session, &unpack, &cursor->value); /* * Restart for a variable-length column-store. We could catch restart * higher up the call-stack but there's no point to it: unlike row-store * (where a normal search path finds cached overflow values), we have to * access the page's reconciliation structures, and that's as easy here * as higher up the stack. */ if (ret == WT_RESTART && page->type == WT_PAGE_COL_VAR) ret = __wt_ovfl_cache_col_restart( session, page, &unpack, &cursor->value); return (ret); }
/* * __las_page_instantiate -- * Instantiate lookaside update records in a recently read page. */ static int __las_page_instantiate(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t read_id, const uint8_t *addr, size_t addr_size) { WT_CURSOR *cursor; WT_CURSOR_BTREE cbt; WT_DECL_ITEM(current_key); WT_DECL_ITEM(las_addr); WT_DECL_ITEM(las_key); WT_DECL_ITEM(las_value); WT_DECL_RET; WT_PAGE *page; WT_UPDATE *first_upd, *last_upd, *upd; size_t incr, total_incr; uint64_t current_recno, las_counter, las_txnid, recno, upd_txnid; uint32_t las_id, upd_size, session_flags; int exact; const uint8_t *p; cursor = NULL; page = ref->page; first_upd = last_upd = upd = NULL; total_incr = 0; current_recno = recno = WT_RECNO_OOB; session_flags = 0; /* [-Werror=maybe-uninitialized] */ __wt_btcur_init(session, &cbt); __wt_btcur_open(&cbt); WT_ERR(__wt_scr_alloc(session, 0, ¤t_key)); WT_ERR(__wt_scr_alloc(session, 0, &las_addr)); WT_ERR(__wt_scr_alloc(session, 0, &las_key)); WT_ERR(__wt_scr_alloc(session, 0, &las_value)); /* Open a lookaside table cursor. */ WT_ERR(__wt_las_cursor(session, &cursor, &session_flags)); /* * The lookaside records are in key and update order, that is, there * will be a set of in-order updates for a key, then another set of * in-order updates for a subsequent key. We process all of the updates * for a key and then insert those updates into the page, then all the * updates for the next key, and so on. * * Search for the block's unique prefix, stepping through any matching * records. */ las_addr->data = addr; las_addr->size = addr_size; las_key->size = 0; cursor->set_key( cursor, read_id, las_addr, (uint64_t)0, (uint32_t)0, las_key); if ((ret = cursor->search_near(cursor, &exact)) == 0 && exact < 0) ret = cursor->next(cursor); for (; ret == 0; ret = cursor->next(cursor)) { WT_ERR(cursor->get_key(cursor, &las_id, las_addr, &las_counter, &las_txnid, las_key)); /* * Confirm the search using the unique prefix; if not a match, * we're done searching for records for this page. */ if (las_id != read_id || las_addr->size != addr_size || memcmp(las_addr->data, addr, addr_size) != 0) break; /* * If the on-page value has become globally visible, this record * is no longer needed. */ if (__wt_txn_visible_all(session, las_txnid)) continue; /* Allocate the WT_UPDATE structure. */ WT_ERR(cursor->get_value( cursor, &upd_txnid, &upd_size, las_value)); WT_ERR(__wt_update_alloc(session, (upd_size == WT_UPDATE_DELETED_VALUE) ? NULL : las_value, &upd, &incr)); total_incr += incr; upd->txnid = upd_txnid; switch (page->type) { case WT_PAGE_COL_FIX: case WT_PAGE_COL_VAR: p = las_key->data; WT_ERR(__wt_vunpack_uint(&p, 0, &recno)); if (current_recno == recno) break; WT_ASSERT(session, current_recno < recno); if (first_upd != NULL) { WT_ERR(__col_instantiate(session, current_recno, ref, &cbt, first_upd)); first_upd = NULL; } current_recno = recno; break; case WT_PAGE_ROW_LEAF: if (current_key->size == las_key->size && memcmp(current_key->data, las_key->data, las_key->size) == 0) break; if (first_upd != NULL) { WT_ERR(__row_instantiate(session, current_key, ref, &cbt, first_upd)); first_upd = NULL; } WT_ERR(__wt_buf_set(session, current_key, las_key->data, las_key->size)); break; WT_ILLEGAL_VALUE_ERR(session); } /* Append the latest update to the list. */ if (first_upd == NULL) first_upd = last_upd = upd; else { last_upd->next = upd; last_upd = upd; } upd = NULL; } WT_ERR_NOTFOUND_OK(ret); /* Insert the last set of updates, if any. 
*/ if (first_upd != NULL) switch (page->type) { case WT_PAGE_COL_FIX: case WT_PAGE_COL_VAR: WT_ERR(__col_instantiate(session, current_recno, ref, &cbt, first_upd)); first_upd = NULL; break; case WT_PAGE_ROW_LEAF: WT_ERR(__row_instantiate(session, current_key, ref, &cbt, first_upd)); first_upd = NULL; break; WT_ILLEGAL_VALUE_ERR(session); } /* Discard the cursor. */ WT_ERR(__wt_las_cursor_close(session, &cursor, session_flags)); if (total_incr != 0) { __wt_cache_page_inmem_incr(session, page, total_incr); /* * We've modified/dirtied the page, but that's not necessary and * if we keep the page clean, it's easier to evict. We leave the * lookaside table updates in place, so if we evict this page * without dirtying it, any future instantiation of it will find * the records it needs. If the page is dirtied before eviction, * then we'll write any needed lookaside table records for the * new location of the page. */ __wt_page_modify_clear(session, page); } err: WT_TRET(__wt_las_cursor_close(session, &cursor, session_flags)); WT_TRET(__wt_btcur_close(&cbt, 1)); /* * On error, upd points to a single unlinked WT_UPDATE structure, * first_upd points to a list. */ if (upd != NULL) __wt_free(session, upd); if (first_upd != NULL) __wt_free_update_list(session, first_upd); __wt_scr_free(session, ¤t_key); __wt_scr_free(session, &las_addr); __wt_scr_free(session, &las_key); __wt_scr_free(session, &las_value); return (ret); }
/* * __clsm_open_cursors -- * Open cursors for the current set of files. */ static int __clsm_open_cursors( WT_CURSOR_LSM *clsm, int update, u_int start_chunk, uint32_t start_id) { WT_CURSOR *c, **cp, *primary; WT_DECL_RET; WT_LSM_CHUNK *chunk; WT_LSM_TREE *lsm_tree; WT_SESSION_IMPL *session; WT_TXN *txn; const char *checkpoint, *ckpt_cfg[3]; uint64_t saved_gen; u_int i, nchunks, ngood, nupdates; int locked; c = &clsm->iface; session = (WT_SESSION_IMPL *)c->session; txn = &session->txn; lsm_tree = clsm->lsm_tree; chunk = NULL; ckpt_cfg[0] = WT_CONFIG_BASE(session, session_open_cursor); ckpt_cfg[1] = "checkpoint=" WT_CHECKPOINT ",raw"; ckpt_cfg[2] = NULL; /* Copy the key, so we don't lose the cursor position. */ if (F_ISSET(c, WT_CURSTD_KEY_INT) && !WT_DATA_IN_ITEM(&c->key)) WT_RET(__wt_buf_set( session, &c->key, c->key.data, c->key.size)); F_CLR(clsm, WT_CLSM_ITERATE_NEXT | WT_CLSM_ITERATE_PREV); if (update) { if (txn->isolation == TXN_ISO_SNAPSHOT) F_SET(clsm, WT_CLSM_OPEN_SNAPSHOT); } else F_SET(clsm, WT_CLSM_OPEN_READ); WT_RET(__wt_lsm_tree_lock(session, lsm_tree, 0)); locked = 1; /* * If there is no in-memory chunk in the tree for an update operation, * create one. * * !!! * It is exceeding unlikely that we get here at all, but if there is a * transaction in progress and it rolls back, it would leave the * metadata inconsistent. */ if (update && (lsm_tree->nchunks == 0 || (chunk = lsm_tree->chunk[lsm_tree->nchunks - 1]) == NULL || F_ISSET(chunk, WT_LSM_CHUNK_ONDISK))) { /* Release our lock because switch will get a write lock. */ locked = 0; WT_ERR(__wt_lsm_tree_unlock(session, lsm_tree)); WT_ERR(__wt_lsm_tree_switch(session, lsm_tree)); WT_ERR(__wt_lsm_tree_lock(session, lsm_tree, 0)); locked = 1; } F_SET(session, WT_SESSION_NO_CACHE_CHECK); /* Merge cursors have already figured out how many chunks they need. */ retry: if (F_ISSET(clsm, WT_CLSM_MERGE)) { nchunks = clsm->nchunks; ngood = 0; /* * We may have raced with another merge completing. Check that * we're starting at the right offset in the chunk array. */ if (start_chunk >= lsm_tree->nchunks || lsm_tree->chunk[start_chunk]->id != start_id) { for (start_chunk = 0; start_chunk < lsm_tree->nchunks; start_chunk++) { chunk = lsm_tree->chunk[start_chunk]; if (chunk->id == start_id) break; } /* We have to find the start chunk: merge locked it. */ WT_ASSERT(session, start_chunk < lsm_tree->nchunks); } WT_ASSERT(session, start_chunk + nchunks <= lsm_tree->nchunks); } else { nchunks = lsm_tree->nchunks; /* * If we are only opening the cursor for updates, only open the * primary chunk, plus any other chunks that might be required * to detect snapshot isolation conflicts. */ if (F_ISSET(clsm, WT_CLSM_OPEN_SNAPSHOT)) WT_ERR(__wt_realloc_def(session, &clsm->txnid_alloc, nchunks, &clsm->txnid_max)); if (F_ISSET(clsm, WT_CLSM_OPEN_READ)) ngood = nupdates = 0; else if (F_ISSET(clsm, WT_CLSM_OPEN_SNAPSHOT)) { /* * Keep going until all updates in the next * chunk are globally visible. Copy the maximum * transaction IDs into the cursor as we go. */ for (ngood = nchunks - 1, nupdates = 1; ngood > 0; ngood--, nupdates++) { chunk = lsm_tree->chunk[ngood - 1]; clsm->txnid_max[ngood - 1] = chunk->txnid_max; if (__wt_txn_visible_all( session, chunk->txnid_max)) break; } } else { nupdates = 1; ngood = nchunks - 1; } /* Check how many cursors are already open. */ for (cp = clsm->cursors + ngood; ngood < clsm->nchunks && ngood < nchunks; cp++, ngood++) { chunk = lsm_tree->chunk[ngood]; /* If the cursor isn't open yet, we're done. 
*/ if (*cp == NULL) break; /* Easy case: the URIs don't match. */ if (strcmp((*cp)->uri, chunk->uri) != 0) break; /* Make sure the checkpoint config matches. */ checkpoint = ((WT_CURSOR_BTREE *)*cp)-> btree->dhandle->checkpoint; if (checkpoint == NULL && F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) && !chunk->empty) break; /* Make sure the Bloom config matches. */ if (clsm->blooms[ngood] == NULL && F_ISSET(chunk, WT_LSM_CHUNK_BLOOM)) break; } /* Spurious generation bump? */ if (ngood == clsm->nchunks && clsm->nchunks == nchunks) { clsm->dsk_gen = lsm_tree->dsk_gen; goto err; } /* * Close any cursors we no longer need. * * Drop the LSM tree lock while we do this: if the cache is * full, we may block while closing a cursor. Save the * generation number and retry if it has changed under us. */ if (clsm->cursors != NULL && (ngood < clsm->nchunks || (!F_ISSET(clsm, WT_CLSM_OPEN_READ) && nupdates > 0))) { saved_gen = lsm_tree->dsk_gen; locked = 0; WT_ERR(__wt_lsm_tree_unlock(session, lsm_tree)); if (!F_ISSET(clsm, WT_CLSM_OPEN_READ) && nupdates > 0) WT_ERR(__clsm_close_cursors( clsm, 0, nchunks - nupdates)); WT_ERR(__clsm_close_cursors( clsm, ngood, clsm->nchunks)); WT_ERR(__wt_lsm_tree_lock(session, lsm_tree, 0)); locked = 1; if (lsm_tree->dsk_gen != saved_gen) goto retry; } /* Detach from our old primary. */ clsm->primary_chunk = NULL; clsm->current = NULL; } WT_ERR(__wt_realloc_def(session, &clsm->bloom_alloc, nchunks, &clsm->blooms)); WT_ERR(__wt_realloc_def(session, &clsm->cursor_alloc, nchunks, &clsm->cursors)); clsm->nchunks = nchunks; /* Open the cursors for chunks that have changed. */ for (i = ngood, cp = clsm->cursors + i; i != nchunks; i++, cp++) { chunk = lsm_tree->chunk[i + start_chunk]; /* Copy the maximum transaction ID. */ if (F_ISSET(clsm, WT_CLSM_OPEN_SNAPSHOT)) clsm->txnid_max[i] = chunk->txnid_max; /* * Read from the checkpoint if the file has been written. * Once all cursors switch, the in-memory tree can be evicted. */ WT_ASSERT(session, *cp == NULL); ret = __wt_open_cursor(session, chunk->uri, c, (F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) && !chunk->empty) ? ckpt_cfg : NULL, cp); /* * XXX kludge: we may have an empty chunk where no checkpoint * was written. If so, try to open the ordinary handle on that * chunk instead. */ if (ret == WT_NOTFOUND && F_ISSET(chunk, WT_LSM_CHUNK_ONDISK)) { ret = __wt_open_cursor( session, chunk->uri, c, NULL, cp); if (ret == 0) chunk->empty = 1; } WT_ERR(ret); if (F_ISSET(chunk, WT_LSM_CHUNK_BLOOM) && !F_ISSET(clsm, WT_CLSM_MERGE)) WT_ERR(__wt_bloom_open(session, chunk->bloom_uri, lsm_tree->bloom_bit_count, lsm_tree->bloom_hash_count, c, &clsm->blooms[i])); /* Child cursors always use overwrite and raw mode. */ F_SET(*cp, WT_CURSTD_OVERWRITE | WT_CURSTD_RAW); } /* The last chunk is our new primary. */ if (chunk != NULL && !F_ISSET(chunk, WT_LSM_CHUNK_ONDISK)) { clsm->primary_chunk = chunk; primary = clsm->cursors[clsm->nchunks - 1]; WT_WITH_BTREE(session, ((WT_CURSOR_BTREE *)(primary))->btree, __wt_btree_evictable(session, 0)); } clsm->dsk_gen = lsm_tree->dsk_gen; err: F_CLR(session, WT_SESSION_NO_CACHE_CHECK); #ifdef HAVE_DIAGNOSTIC /* Check that all cursors are open as expected. */ if (ret == 0 && F_ISSET(clsm, WT_CLSM_OPEN_READ)) { for (i = 0, cp = clsm->cursors; i != clsm->nchunks; cp++, i++) { chunk = lsm_tree->chunk[i + start_chunk]; /* Make sure the cursor is open. */ WT_ASSERT(session, *cp != NULL); /* Easy case: the URIs should match. */ WT_ASSERT(session, strcmp((*cp)->uri, chunk->uri) == 0); /* Make sure the checkpoint config matches. 
*/ checkpoint = ((WT_CURSOR_BTREE *)*cp)-> btree->dhandle->checkpoint; WT_ASSERT(session, (F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) && !chunk->empty) ? checkpoint != NULL : checkpoint == NULL); /* Make sure the Bloom config matches. */ WT_ASSERT(session, (F_ISSET(chunk, WT_LSM_CHUNK_BLOOM) && !F_ISSET(clsm, WT_CLSM_MERGE)) ? clsm->blooms[i] != NULL : clsm->blooms[i] == NULL); } } #endif if (locked) WT_TRET(__wt_lsm_tree_unlock(session, lsm_tree)); return (ret); }
/*
 * __wt_bt_read --
 *	Read a cookie referenced block into a buffer.
 */
int
__wt_bt_read(WT_SESSION_IMPL *session,
    WT_ITEM *buf, const uint8_t *addr, size_t addr_size)
{
	WT_BM *bm;
	WT_BTREE *btree;
	WT_DECL_ITEM(etmp);
	WT_DECL_ITEM(tmp);
	WT_DECL_RET;
	WT_ENCRYPTOR *encryptor;
	WT_ITEM *ip;
	const WT_PAGE_HEADER *dsk;
	const char *fail_msg;
	size_t result_len;

	btree = S2BT(session);
	bm = btree->bm;
	fail_msg = NULL;			/* -Wuninitialized */

	/*
	 * If anticipating a compressed or encrypted block, read into a scratch
	 * buffer and decompress into the caller's buffer.  Else, read directly
	 * into the caller's buffer.
	 */
	if (btree->compressor == NULL && btree->kencryptor == NULL) {
		WT_RET(bm->read(bm, session, buf, addr, addr_size));
		dsk = buf->data;
		ip = NULL;
	} else {
		WT_RET(__wt_scr_alloc(session, 0, &tmp));
		WT_ERR(bm->read(bm, session, tmp, addr, addr_size));
		dsk = tmp->data;
		ip = tmp;
	}

	/*
	 * If the block is encrypted, copy the skipped bytes of the original
	 * image into place, then decrypt.
	 */
	if (F_ISSET(dsk, WT_PAGE_ENCRYPTED)) {
		if (btree->kencryptor == NULL ||
		    (encryptor = btree->kencryptor->encryptor) == NULL ||
		    encryptor->decrypt == NULL) {
			fail_msg =
			    "encrypted block in file for which no encryption "
			    "configured";
			goto corrupt;
		}

		WT_ERR(__wt_scr_alloc(session, 0, &etmp));
		if ((ret = __wt_decrypt(session,
		    encryptor, WT_BLOCK_ENCRYPT_SKIP, ip, etmp)) != 0) {
			fail_msg = "block decryption failed";
			goto corrupt;
		}

		ip = etmp;
		dsk = ip->data;
	} else if (btree->kencryptor != NULL) {
		fail_msg =
		    "unencrypted block in file for which encryption "
		    "configured";
		goto corrupt;
	}

	if (F_ISSET(dsk, WT_PAGE_COMPRESSED)) {
		if (btree->compressor == NULL ||
		    btree->compressor->decompress == NULL) {
			fail_msg =
			    "compressed block in file for which no compression "
			    "configured";
			goto corrupt;
		}

		/*
		 * Size the buffer based on the in-memory bytes we're expecting
		 * from decompression.
		 */
		WT_ERR(__wt_buf_initsize(session, buf, dsk->mem_size));

		/*
		 * Note the source length is NOT the number of compressed bytes,
		 * it's the length of the block we just read (minus the skipped
		 * bytes).  We don't store the number of compressed bytes: some
		 * compression engines need that length stored externally, they
		 * don't have markers in the stream to signal the end of the
		 * compressed bytes.  Those engines must store the compressed
		 * byte length somehow, see the snappy compression extension
		 * for an example.
		 */
		memcpy(buf->mem, ip->data, WT_BLOCK_COMPRESS_SKIP);
		ret = btree->compressor->decompress(
		    btree->compressor, &session->iface,
		    (uint8_t *)ip->data + WT_BLOCK_COMPRESS_SKIP,
		    tmp->size - WT_BLOCK_COMPRESS_SKIP,
		    (uint8_t *)buf->mem + WT_BLOCK_COMPRESS_SKIP,
		    dsk->mem_size - WT_BLOCK_COMPRESS_SKIP, &result_len);

		/*
		 * If checksums were turned off because we're depending on the
		 * decompression to fail on any corrupted data, we'll end up
		 * here after corruption happens.  If we're salvaging the file,
		 * it's OK, otherwise it's really, really bad.
		 */
		if (ret != 0 ||
		    result_len != dsk->mem_size - WT_BLOCK_COMPRESS_SKIP) {
			fail_msg = "block decompression failed";
			goto corrupt;
		}
	} else
		/*
		 * If we uncompressed above, the page is in the correct buffer.
		 * If we get here the data may be in the wrong buffer and the
		 * buffer may be the wrong size.  If needed, get the page
		 * into the destination buffer.
		 */
		if (ip != NULL)
			WT_ERR(__wt_buf_set(
			    session, buf, ip->data, dsk->mem_size));

	/* If the handle is a verify handle, verify the physical page. */
	if (F_ISSET(btree, WT_BTREE_VERIFY)) {
		if (tmp == NULL)
			WT_ERR(__wt_scr_alloc(session, 0, &tmp));
		WT_ERR(bm->addr_string(bm, session, tmp, addr, addr_size));
		WT_ERR(__wt_verify_dsk(session, tmp->data, buf));
	}

	WT_STAT_FAST_CONN_INCR(session, cache_read);
	WT_STAT_FAST_DATA_INCR(session, cache_read);
	if (F_ISSET(dsk, WT_PAGE_COMPRESSED))
		WT_STAT_FAST_DATA_INCR(session, compress_read);
	WT_STAT_FAST_CONN_INCRV(session, cache_bytes_read, dsk->mem_size);
	WT_STAT_FAST_DATA_INCRV(session, cache_bytes_read, dsk->mem_size);

	if (0) {
corrupt:	if (ret == 0)
			ret = WT_ERROR;
		if (!F_ISSET(btree, WT_BTREE_VERIFY) &&
		    !F_ISSET(session, WT_SESSION_QUIET_CORRUPT_FILE)) {
			__wt_err(session, ret, "%s", fail_msg);
			ret = __wt_illegal_value(session, btree->dhandle->name);
		}
	}

err:	__wt_scr_free(session, &tmp);
	__wt_scr_free(session, &etmp);
	return (ret);
}
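/*
 * A standalone sketch of the "skipped bytes" convention used above: the
 * first bytes of a block (the page header) are never compressed, so they
 * are copied into the destination verbatim and only the remainder is
 * handed to the compression engine. The 64-byte skip constant and the
 * identity "engine" below are stand-ins for illustration only.
 */
#include <string.h>

#define BLOCK_COMPRESS_SKIP	64

static int
decompress(const unsigned char *src, size_t src_len,
    unsigned char *dst, size_t dst_len, size_t *result_lenp)
{
	/* Identity transform standing in for snappy, zlib, etc. */
	if (src_len > dst_len)
		return (-1);
	memcpy(dst, src, src_len);
	*result_lenp = src_len;
	return (0);
}

static int
expand_block(const unsigned char *disk, size_t disk_len,
    unsigned char *mem, size_t mem_len)
{
	size_t result_len;
	int ret;

	/* Copy the uncompressed prefix into place. */
	memcpy(mem, disk, BLOCK_COMPRESS_SKIP);

	/* Decompress the rest; the expected output size is known. */
	ret = decompress(disk + BLOCK_COMPRESS_SKIP,
	    disk_len - BLOCK_COMPRESS_SKIP, mem + BLOCK_COMPRESS_SKIP,
	    mem_len - BLOCK_COMPRESS_SKIP, &result_len);

	/* A short result means corruption, exactly as in the caller. */
	if (ret != 0 || result_len != mem_len - BLOCK_COMPRESS_SKIP)
		return (-1);
	return (0);
}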
/*
 * __wt_kv_return --
 *	Return a page referenced key/value pair to the application.
 */
int
__wt_kv_return(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_UPDATE *upd)
{
	WT_BTREE *btree;
	WT_CELL *cell;
	WT_CELL_UNPACK unpack;
	WT_CURSOR *cursor;
	WT_PAGE *page;
	WT_ROW *rip;
	uint8_t v;

	btree = S2BT(session);
	page = cbt->ref->page;
	cursor = &cbt->iface;

	switch (page->type) {
	case WT_PAGE_COL_FIX:
		cursor->recno = cbt->recno;

		/* The cursor references an update: return its value. */
		if (upd != NULL) {
			cursor->value.data = WT_UPDATE_DATA(upd);
			cursor->value.size = upd->size;
			return (0);
		}

		v = __bit_getv_recno(page, cbt->iface.recno, btree->bitcnt);
		return (__wt_buf_set(session, &cursor->value, &v, 1));
	case WT_PAGE_COL_VAR:
		cursor->recno = cbt->recno;

		if (upd != NULL) {
			cursor->value.data = WT_UPDATE_DATA(upd);
			cursor->value.size = upd->size;
			return (0);
		}

		/* Get the matching cell; the value is taken from it below. */
		cell = WT_COL_PTR(page, &page->pg_var_d[cbt->slot]);
		break;
	case WT_PAGE_ROW_LEAF:
		rip = &page->pg_row_d[cbt->slot];

		/* Set the key. */
		if (cbt->ins != NULL) {
			/* The key/value pair came from an insert. */
			cursor->key.data = WT_INSERT_KEY(cbt->ins);
			cursor->key.size = WT_INSERT_KEY_SIZE(cbt->ins);
		} else if (cbt->compare == 0) {
			/* Exact match: the search copied the key already. */
			cursor->key.data = cbt->search_key.data;
			cursor->key.size = cbt->search_key.size;
		} else
			WT_RET(__wt_row_leaf_key(
			    session, page, rip, &cursor->key, 0));

		/*
		 * The value lives in the append/update list; take it from
		 * there.
		 */
		if (upd != NULL) {
			cursor->value.data = WT_UPDATE_DATA(upd);
			cursor->value.size = upd->size;
			return (0);
		}

		/*
		 * Simple values can be found directly through the rip
		 * pointer; the K/V pair is stored inside the cell.
		 */
		if (__wt_row_leaf_value(page, rip, &cursor->value))
			return (0);

		/*
		 * Not stored contiguously: unpack the cell to locate the
		 * value.
		 */
		if ((cell =
		    __wt_row_leaf_value_cell(page, rip, NULL)) == NULL) {
			cursor->value.size = 0;
			return (0);
		}
		break;
	WT_ILLEGAL_VALUE(session);
	}

	/* Unpack the cell to resolve the value (possibly an overflow item). */
	__wt_cell_unpack(cell, &unpack);
	WT_RET(__wt_page_cell_data_ref(session, page, &unpack, &cursor->value));

	return (0);
}
/* * __wt_struct_plan -- * Given a table cursor containing a complete table, build the "projection * plan" to distribute the columns to dependent stores. A string * representing the plan will be appended to the plan buffer. */ int __wt_struct_plan(WT_SESSION_IMPL *session, WT_TABLE *table, const char *columns, size_t len, bool value_only, WT_ITEM *plan) { WT_CONFIG conf; WT_CONFIG_ITEM k, v; WT_DECL_RET; u_int cg, col, current_cg, current_col, i, start_cg, start_col; char coltype, current_coltype; bool have_it; start_cg = start_col = UINT_MAX; /* -Wuninitialized */ /* Work through the value columns by skipping over the key columns. */ __wt_config_initn(session, &conf, columns, len); if (value_only) for (i = 0; i < table->nkey_columns; i++) WT_RET(__wt_config_next(&conf, &k, &v)); current_cg = cg = 0; current_col = col = INT_MAX; current_coltype = coltype = WT_PROJ_KEY; /* Keep lint quiet. */ for (i = 0; (ret = __wt_config_next(&conf, &k, &v)) == 0; i++) { have_it = false; while ((ret = __find_next_col(session, table, &k, &cg, &col, &coltype)) == 0 && (!have_it || cg != start_cg || col != start_col)) { /* * First we move to the column. If that is in a * different column group to the last column we * accessed, or before the last column in the same * column group, or moving from the key to the value, * we need to switch column groups or rewind. */ if (current_cg != cg || current_col > col || current_coltype != coltype) { WT_ASSERT(session, !value_only || coltype == WT_PROJ_VALUE); WT_RET(__wt_buf_catfmt( session, plan, "%u%c", cg, coltype)); /* * Set the current column group and column * within the table. */ current_cg = cg; current_col = 0; current_coltype = coltype; } /* Now move to the column we want. */ if (current_col < col) { if (col - current_col > 1) WT_RET(__wt_buf_catfmt(session, plan, "%u", col - current_col)); WT_RET(__wt_buf_catfmt(session, plan, "%c", WT_PROJ_SKIP)); } /* * Now copy the value in / out. In the common case, * where each value is used in one column, we do a * "next" operation. If the value is used again, we do * a "reuse" operation to avoid making another copy. */ if (!have_it) { WT_RET(__wt_buf_catfmt(session, plan, "%c", WT_PROJ_NEXT)); start_cg = cg; start_col = col; have_it = true; } else WT_RET(__wt_buf_catfmt(session, plan, "%c", WT_PROJ_REUSE)); current_col = col + 1; } /* * We may fail to find a column if it is a custom extractor. * In that case, treat it as the first value column: we only * ever use such plans to extract the primary key from the * index. */ if (ret == WT_NOTFOUND) WT_RET(__wt_buf_catfmt(session, plan, "0%c%c", WT_PROJ_VALUE, WT_PROJ_NEXT)); } WT_RET_TEST(ret != WT_NOTFOUND, ret); /* Special case empty plans. */ if (i == 0 && plan->size == 0) WT_RET(__wt_buf_set(session, plan, "", 1)); return (0); }
/* * __wt_struct_reformat -- * Given a table and a list of columns (which could be values in a column * group or index keys), calculate the resulting new format string. * The result will be appended to the format buffer. */ int __wt_struct_reformat(WT_SESSION_IMPL *session, WT_TABLE *table, const char *columns, size_t len, const char *extra_cols, bool value_only, WT_ITEM *format) { WT_CONFIG config; WT_CONFIG_ITEM k, next_k, next_v; WT_DECL_PACK_VALUE(pv); WT_DECL_RET; bool have_next; __wt_config_initn(session, &config, columns, len); /* * If an empty column list is specified, this will fail with * WT_NOTFOUND, that's okay. */ WT_RET_NOTFOUND_OK(ret = __wt_config_next(&config, &next_k, &next_v)); if (ret == WT_NOTFOUND) { if (extra_cols != NULL) { __wt_config_init(session, &config, extra_cols); WT_RET(__wt_config_next(&config, &next_k, &next_v)); extra_cols = NULL; } else if (format->size == 0) { WT_RET(__wt_buf_set(session, format, "", 1)); return (0); } } do { k = next_k; ret = __wt_config_next(&config, &next_k, &next_v); if (ret != 0 && ret != WT_NOTFOUND) return (ret); have_next = ret == 0; if (!have_next && extra_cols != NULL) { __wt_config_init(session, &config, extra_cols); WT_RET(__wt_config_next(&config, &next_k, &next_v)); have_next = true; extra_cols = NULL; } if ((ret = __find_column_format(session, table, &k, value_only, &pv)) != 0) { if (value_only && ret == EINVAL) WT_RET_MSG(session, EINVAL, "A column group cannot store key column " "'%.*s' in its value", (int)k.len, k.str); WT_RET_MSG(session, EINVAL, "Column '%.*s' not found", (int)k.len, k.str); } /* * Check whether we're moving an unsized WT_ITEM from the end * to the middle, or vice-versa. This determines whether the * size needs to be prepended. This is the only case where the * destination size can be larger than the source size. */ if (pv.type == 'u' && !pv.havesize && have_next) pv.type = 'U'; else if (pv.type == 'U' && !have_next) pv.type = 'u'; if (pv.havesize) WT_RET(__wt_buf_catfmt(session, format, "%" PRIu32 "%c", pv.size, pv.type)); else WT_RET(__wt_buf_catfmt(session, format, "%c", pv.type)); } while (have_next); return (0); }
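/*
 * A sketch of the format rules the function above enforces, using the
 * public packing routines: an unsized 'u' item is size-prefixed when it
 * appears mid-format (the 'U' promotion) and runs to the end of the
 * buffer when it is last. Assumes an open session; error handling is
 * abbreviated.
 */
#include <stdio.h>
#include <wiredtiger.h>

static int
pack_demo(WT_SESSION *session)
{
	WT_ITEM item;
	size_t len;
	char buf[64];
	int ret;

	item.data = "abc";
	item.size = 3;

	/* A trailing raw item needs no size prefix ("Su"). */
	if ((ret = wiredtiger_struct_size(
	    session, &len, "Su", "key", &item)) != 0)
		return (ret);
	if ((ret = wiredtiger_struct_pack(
	    session, buf, sizeof(buf), "Su", "key", &item)) != 0)
		return (ret);
	printf("packed %zu bytes\n", len);
	return (0);
}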
/* * __wt_meta_ckptlist_get -- * Load all available checkpoint information for a file. */ int __wt_meta_ckptlist_get( WT_SESSION_IMPL *session, const char *fname, WT_CKPT **ckptbasep) { WT_CKPT *ckpt, *ckptbase; WT_CONFIG ckptconf; WT_CONFIG_ITEM a, k, v; WT_DECL_RET; WT_ITEM *buf; size_t allocated, slot; const char *config; char timebuf[64]; *ckptbasep = NULL; buf = NULL; ckptbase = NULL; allocated = slot = 0; config = NULL; /* Retrieve the metadata information for the file. */ WT_RET(__wt_metadata_read(session, fname, &config)); /* Load any existing checkpoints into the array. */ WT_ERR(__wt_scr_alloc(session, 0, &buf)); if (__wt_config_getones(session, config, "checkpoint", &v) == 0 && __wt_config_subinit(session, &ckptconf, &v) == 0) for (; __wt_config_next(&ckptconf, &k, &v) == 0; ++slot) { if (slot * sizeof(WT_CKPT) == allocated) WT_ERR(__wt_realloc(session, &allocated, (slot + 50) * sizeof(WT_CKPT), &ckptbase)); ckpt = &ckptbase[slot]; /* * Copy the name, address (raw and hex), order and time * into the slot. If there's no address, it's a fake. */ WT_ERR( __wt_strndup(session, k.str, k.len, &ckpt->name)); WT_ERR(__wt_config_subgets(session, &v, "addr", &a)); WT_ERR( __wt_buf_set(session, &ckpt->addr, a.str, a.len)); if (a.len == 0) F_SET(ckpt, WT_CKPT_FAKE); else WT_ERR(__wt_nhex_to_raw( session, a.str, a.len, &ckpt->raw)); WT_ERR(__wt_config_subgets(session, &v, "order", &a)); if (a.val == 0) goto format; ckpt->order = a.val; WT_ERR(__wt_config_subgets(session, &v, "time", &a)); if (a.len == 0) goto format; if (a.len > sizeof(timebuf) - 1) goto format; memcpy(timebuf, a.str, a.len); timebuf[a.len] = '\0'; if (sscanf(timebuf, "%" SCNuMAX, &ckpt->sec) != 1) goto format; WT_ERR(__wt_config_subgets(session, &v, "size", &a)); ckpt->ckpt_size = (uint64_t)a.val; } /* * Allocate an extra slot for a new value, plus a slot to mark the end. * * This isn't very clean, but there's necessary cooperation between the * schema layer (that maintains the list of checkpoints), the btree * layer (that knows when the root page is written, creating a new * checkpoint), and the block manager (which actually creates the * checkpoint). All of that cooperation is handled in the WT_CKPT * structure referenced from the WT_BTREE structure. */ if ((slot + 2) * sizeof(WT_CKPT) > allocated) WT_ERR(__wt_realloc(session, &allocated, (slot + 2) * sizeof(WT_CKPT), &ckptbase)); /* Sort in creation-order. */ qsort(ckptbase, slot, sizeof(WT_CKPT), __ckpt_compare_order); /* Return the array to our caller. */ *ckptbasep = ckptbase; if (0) { format: WT_ERR_MSG(session, WT_ERROR, "corrupted checkpoint list"); err: __wt_meta_ckptlist_free(session, ckptbase); } __wt_free(session, config); __wt_scr_free(&buf); return (ret); }
/* * __curbulk_insert_var -- * Variable-length column-store bulk cursor insert. */ static int __curbulk_insert_var(WT_CURSOR *cursor) { WT_BTREE *btree; WT_CURSOR_BULK *cbulk; WT_DECL_RET; WT_SESSION_IMPL *session; uint64_t recno; cbulk = (WT_CURSOR_BULK *)cursor; btree = cbulk->cbt.btree; /* * Bulk cursor inserts are updates, but don't need auto-commit * transactions because they are single-threaded and not visible * until the bulk cursor is closed. */ CURSOR_API_CALL(cursor, session, insert, btree); WT_STAT_FAST_DATA_INCR(session, cursor_insert_bulk); /* * If the "append" flag was configured, the application doesn't have to * supply a key, else require a key. */ if (F_ISSET(cursor, WT_CURSTD_APPEND)) recno = cbulk->recno + 1; else { WT_CURSOR_CHECKKEY(cursor); if ((recno = cursor->recno) <= cbulk->recno) WT_ERR(__bulk_col_keycmp_err(cbulk)); } WT_CURSOR_CHECKVALUE(cursor); if (!cbulk->first_insert) { /* * If not the first insert and the key space is sequential, * compare the current value against the last value; if the * same, just increment the RLE count. */ if (recno == cbulk->recno + 1 && cbulk->last.size == cursor->value.size && memcmp(cbulk->last.data, cursor->value.data, cursor->value.size) == 0) { ++cbulk->rle; ++cbulk->recno; goto duplicate; } /* Insert the previous key/value pair. */ WT_ERR(__wt_bulk_insert_var(session, cbulk, false)); } else cbulk->first_insert = false; /* * Insert any skipped records as deleted records, update the current * record count and RLE counter. */ if (recno != cbulk->recno + 1) { cbulk->rle = (recno - cbulk->recno) - 1; WT_ERR(__wt_bulk_insert_var(session, cbulk, true)); } cbulk->rle = 1; cbulk->recno = recno; /* Save a copy of the value for the next comparison. */ ret = __wt_buf_set(session, &cbulk->last, cursor->value.data, cursor->value.size); duplicate: err: API_END_RET(session, ret); }
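/*
 * The application side of this path, as a sketch: a variable-length
 * column-store bulk cursor configured with "append", so the library
 * assigns record numbers and runs of identical values collapse into a
 * single RLE cell. The "bulk,append" combination and names here are
 * assumptions for illustration; error handling is abbreviated.
 */
#include <wiredtiger.h>

static int
bulk_load_column(WT_SESSION *session)
{
	WT_CURSOR *cursor;
	int i, ret;

	if ((ret = session->create(session,
	    "table:counts", "key_format=r,value_format=S")) != 0)
		return (ret);
	if ((ret = session->open_cursor(session,
	    "table:counts", NULL, "bulk,append", &cursor)) != 0)
		return (ret);
	for (i = 0; i < 1000; ++i) {
		/* Identical consecutive values only bump the RLE count. */
		cursor->set_value(cursor, "default");
		if ((ret = cursor->insert(cursor)) != 0)
			return (ret);
	}
	return (cursor->close(cursor));
}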
/* * __wt_bt_read -- * Read a cookie referenced block into a buffer. */ int __wt_bt_read(WT_SESSION_IMPL *session, WT_ITEM *buf, const uint8_t *addr, size_t addr_size) { WT_BM *bm; WT_BTREE *btree; WT_DECL_ITEM(tmp); WT_DECL_RET; const WT_PAGE_HEADER *dsk; size_t result_len; btree = S2BT(session); bm = btree->bm; /* * If anticipating a compressed block, read into a scratch buffer and * decompress into the caller's buffer. Else, read directly into the * caller's buffer. */ if (btree->compressor == NULL) { WT_RET(bm->read(bm, session, buf, addr, addr_size)); dsk = buf->data; } else { WT_RET(__wt_scr_alloc(session, 0, &tmp)); WT_ERR(bm->read(bm, session, tmp, addr, addr_size)); dsk = tmp->data; } /* * If the block is compressed, copy the skipped bytes of the original * image into place, then decompress. */ if (F_ISSET(dsk, WT_PAGE_COMPRESSED)) { if (btree->compressor == NULL || btree->compressor->decompress == NULL) WT_ERR_MSG(session, WT_ERROR, "read compressed block where no compression engine " "configured"); /* * We're allocating the exact number of bytes we're expecting * from decompression. */ WT_ERR(__wt_buf_initsize(session, buf, dsk->mem_size)); /* * Note the source length is NOT the number of compressed bytes, * it's the length of the block we just read (minus the skipped * bytes). We don't store the number of compressed bytes: some * compression engines need that length stored externally, they * don't have markers in the stream to signal the end of the * compressed bytes. Those engines must store the compressed * byte length somehow, see the snappy compression extension for * an example. */ memcpy(buf->mem, tmp->data, WT_BLOCK_COMPRESS_SKIP); ret = btree->compressor->decompress( btree->compressor, &session->iface, (uint8_t *)tmp->data + WT_BLOCK_COMPRESS_SKIP, tmp->size - WT_BLOCK_COMPRESS_SKIP, (uint8_t *)buf->mem + WT_BLOCK_COMPRESS_SKIP, dsk->mem_size - WT_BLOCK_COMPRESS_SKIP, &result_len); /* * If checksums were turned off because we're depending on the * decompression to fail on any corrupted data, we'll end up * here after corruption happens. If we're salvaging the file, * it's OK, otherwise it's really, really bad. */ if (ret != 0 || result_len != dsk->mem_size - WT_BLOCK_COMPRESS_SKIP) WT_ERR( F_ISSET(btree, WT_BTREE_VERIFY) || F_ISSET(session, WT_SESSION_SALVAGE_CORRUPT_OK) ? WT_ERROR : __wt_illegal_value(session, btree->dhandle->name)); } else if (btree->compressor == NULL) buf->size = dsk->mem_size; else /* * We guessed wrong: there was a compressor, but this * block was not compressed, and now the page is in the * wrong buffer and the buffer may be of the wrong size. * This should be rare, but happens with small blocks * that aren't worth compressing. */ WT_ERR(__wt_buf_set( session, buf, tmp->data, dsk->mem_size)); /* If the handle is a verify handle, verify the physical page. */ if (F_ISSET(btree, WT_BTREE_VERIFY)) { if (tmp == NULL) WT_ERR(__wt_scr_alloc(session, 0, &tmp)); WT_ERR(bm->addr_string(bm, session, tmp, addr, addr_size)); WT_ERR(__wt_verify_dsk(session, (const char *)tmp->data, buf)); } WT_STAT_FAST_CONN_INCR(session, cache_read); WT_STAT_FAST_DATA_INCR(session, cache_read); if (F_ISSET(dsk, WT_PAGE_COMPRESSED)) WT_STAT_FAST_DATA_INCR(session, compress_read); WT_STAT_FAST_CONN_INCRV(session, cache_bytes_read, dsk->mem_size); WT_STAT_FAST_DATA_INCRV(session, cache_bytes_read, dsk->mem_size); err: __wt_scr_free(session, &tmp); return (ret); }
/* * __wt_block_read_off -- * Read an addr/size pair referenced block into a buffer. */ int __wt_block_read_off(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_ITEM *buf, off_t offset, uint32_t size, uint32_t cksum) { WT_BLOCK_HEADER *blk; WT_DECL_ITEM(tmp); WT_DECL_RET; WT_PAGE_HEADER *dsk; size_t result_len; uint32_t page_cksum; WT_VERBOSE_RET(session, read, "off %" PRIuMAX ", size %" PRIu32 ", cksum %" PRIu32, (uintmax_t)offset, size, cksum); #ifdef HAVE_DIAGNOSTIC /* * In diagnostic mode, verify the block we're about to read isn't on * either the available or discard lists. * * Don't check during salvage, it's possible we're reading an already * freed overflow page. */ if (!F_ISSET(session, WT_SESSION_SALVAGE_QUIET_ERR)) WT_RET( __wt_block_misplaced(session, block, "read", offset, size)); #endif /* * If we're compressing the file blocks, place the initial read into a * scratch buffer, we're going to have to re-allocate more memory for * decompression. Else check the caller's buffer size and grow it as * necessary, there will only be one buffer. */ if (block->compressor == NULL) { F_SET(buf, WT_ITEM_ALIGNED); WT_RET(__wt_buf_init(session, buf, size)); buf->size = size; dsk = buf->mem; } else { WT_RET(__wt_scr_alloc(session, size, &tmp)); tmp->size = size; dsk = tmp->mem; } /* Read. */ WT_ERR(__wt_read(session, block->fh, offset, size, dsk)); blk = WT_BLOCK_HEADER_REF(dsk); /* Validate the checksum. */ if (block->checksum && cksum != WT_BLOCK_CHECKSUM_NOT_SET && blk->cksum != WT_BLOCK_CHECKSUM_NOT_SET) { blk->cksum = 0; page_cksum = __wt_cksum(dsk, size); if (page_cksum == WT_BLOCK_CHECKSUM_NOT_SET) ++page_cksum; if (cksum != page_cksum) { if (!F_ISSET(session, WT_SESSION_SALVAGE_QUIET_ERR)) __wt_errx(session, "read checksum error [%" PRIu32 "B @ %" PRIuMAX ", %" PRIu32 " != %" PRIu32 "]", size, (uintmax_t)offset, cksum, page_cksum); WT_ERR(WT_ERROR); } } /* * If the in-memory block size is larger than the on-disk block size, * the block is compressed. Size the user's buffer, copy the skipped * bytes of the original image into place, then decompress. * * If the in-memory block size is less than or equal to the on-disk * block size, the block is not compressed. */ if (blk->disk_size < dsk->size) { if (block->compressor == NULL) WT_ERR(__wt_illegal_value(session, block->name)); WT_ERR(__wt_buf_init(session, buf, dsk->size)); buf->size = dsk->size; /* * Note the source length is NOT the number of compressed bytes, * it's the length of the block we just read (minus the skipped * bytes). We don't store the number of compressed bytes: some * compression engines need that length stored externally, they * don't have markers in the stream to signal the end of the * compressed bytes. Those engines must store the compressed * byte length somehow, see the snappy compression extension for * an example. */ memcpy(buf->mem, tmp->mem, WT_BLOCK_COMPRESS_SKIP); WT_ERR(block->compressor->decompress( block->compressor, &session->iface, (uint8_t *)tmp->mem + WT_BLOCK_COMPRESS_SKIP, tmp->size - WT_BLOCK_COMPRESS_SKIP, (uint8_t *)buf->mem + WT_BLOCK_COMPRESS_SKIP, dsk->size - WT_BLOCK_COMPRESS_SKIP, &result_len)); if (result_len != dsk->size - WT_BLOCK_COMPRESS_SKIP) WT_ERR(__wt_illegal_value(session, block->name)); } else if (block->compressor == NULL) buf->size = dsk->size; else /* * We guessed wrong: there was a compressor, but this * block was not compressed, and now the page is in the * wrong buffer and the buffer may be of the wrong size. 
* This should be rare, why configure a compressor that * doesn't work? Allocate a buffer of the right size * (we used a scratch buffer which might be large), and * copy the data into place. */ WT_ERR( __wt_buf_set(session, buf, tmp->data, dsk->size)); WT_BSTAT_INCR(session, page_read); WT_CSTAT_INCR(session, block_read); err: __wt_scr_free(&tmp); return (ret); }
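/*
 * A standalone sketch of the checksum check above: the stored checksum
 * is computed with the block header's checksum field zeroed, so
 * verification clears that field, recomputes, compares, and avoids the
 * "not set" sentinel by bumping a colliding sum. The hash below is a toy
 * stand-in for the library's CRC routine.
 */
#include <stddef.h>
#include <stdint.h>

#define CHECKSUM_NOT_SET	0

struct block_header {
	uint32_t disk_size;
	uint32_t checksum;
};

static uint32_t
cksum(const void *data, size_t len)
{
	const unsigned char *p;
	uint32_t sum;

	/* Toy hash for illustration; not the real CRC. */
	for (sum = 0, p = data; len > 0; --len)
		sum = sum * 33 + *p++;
	return (sum);
}

static int
verify_block(struct block_header *blk,
    const void *page, size_t size, uint32_t expected)
{
	uint32_t page_cksum;

	blk->checksum = 0;			/* Zeroed when summed. */
	page_cksum = cksum(page, size);
	if (page_cksum == CHECKSUM_NOT_SET)
		++page_cksum;			/* Avoid the sentinel. */
	return (expected == page_cksum ? 0 : -1);
}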
/* * __wt_las_sweep -- * Sweep the lookaside table. */ int __wt_las_sweep(WT_SESSION_IMPL *session) { WT_CONNECTION_IMPL *conn; WT_CURSOR *cursor; WT_DECL_ITEM(las_addr); WT_DECL_ITEM(las_key); WT_DECL_RET; WT_ITEM *key; uint64_t cnt, las_counter, las_txnid; uint32_t las_id, session_flags; int notused; conn = S2C(session); cursor = NULL; key = &conn->las_sweep_key; session_flags = 0; /* [-Werror=maybe-uninitialized] */ WT_ERR(__wt_scr_alloc(session, 0, &las_addr)); WT_ERR(__wt_scr_alloc(session, 0, &las_key)); WT_ERR(__wt_las_cursor(session, &cursor, &session_flags)); /* * If we're not starting a new sweep, position the cursor using the key * from the last call (we don't care if we're before or after the key, * just roughly in the same spot is fine). */ if (conn->las_sweep_call != 0 && key->data != NULL) { __wt_cursor_set_raw_key(cursor, key); if ((ret = cursor->search_near(cursor, ¬used)) != 0) goto srch_notfound; } /* * The sweep server wakes up every 10 seconds (by default), it's a slow * moving thread. Try to review the entire lookaside table once every 5 * minutes, or every 30 calls. * * The reason is because the lookaside table exists because we're seeing * cache/eviction pressure (it allows us to trade performance and disk * space for cache space), and it's likely lookaside blocks are being * evicted, and reading them back in doesn't help things. A trickier, * but possibly better, alternative might be to review all lookaside * blocks in the cache in order to get rid of them, and slowly review * lookaside blocks that have already been evicted. * * We can't know for sure how many records are in the lookaside table, * the cursor insert and remove statistics aren't updated atomically. * Start with reviewing 100 rows, and if it takes more than the target * number of calls to finish, increase the number of rows checked on * each call; if it takes less than the target calls to finish, then * decrease the number of rows reviewed on each call (but never less * than 100). */ #define WT_SWEEP_LOOKASIDE_MIN_CNT 100 #define WT_SWEEP_LOOKASIDE_PASS_TARGET 30 ++conn->las_sweep_call; if ((cnt = conn->las_sweep_cnt) < WT_SWEEP_LOOKASIDE_MIN_CNT) cnt = conn->las_sweep_cnt = WT_SWEEP_LOOKASIDE_MIN_CNT; /* Walk the file. */ for (; cnt > 0 && (ret = cursor->next(cursor)) == 0; --cnt) { /* * If the loop terminates after completing a work unit, we will * continue the table sweep next time. Get a local copy of the * sweep key, we're going to reset the cursor; do so before * calling cursor.remove, cursor.remove can discard our hazard * pointer and the page could be evicted from underneath us. */ if (cnt == 1) { WT_ERR(__wt_cursor_get_raw_key(cursor, key)); if (!WT_DATA_IN_ITEM(key)) WT_ERR(__wt_buf_set( session, key, key->data, key->size)); } WT_ERR(cursor->get_key(cursor, &las_id, las_addr, &las_counter, &las_txnid, las_key)); /* * If the on-page record transaction ID associated with the * record is globally visible, the record can be discarded. * * Cursor opened overwrite=true: won't return WT_NOTFOUND should * another thread remove the record before we do, and the cursor * remains positioned in that case. */ if (__wt_txn_visible_all(session, las_txnid)) WT_ERR(cursor->remove(cursor)); } /* * When reaching the lookaside table end or the target number of calls, * adjust the row count. Decrease/increase the row count depending on * if the number of calls is less/more than the target. 
*/ if (ret == WT_NOTFOUND || conn->las_sweep_call > WT_SWEEP_LOOKASIDE_PASS_TARGET) { if (conn->las_sweep_call < WT_SWEEP_LOOKASIDE_PASS_TARGET && conn->las_sweep_cnt > WT_SWEEP_LOOKASIDE_MIN_CNT) conn->las_sweep_cnt -= WT_SWEEP_LOOKASIDE_MIN_CNT; if (conn->las_sweep_call > WT_SWEEP_LOOKASIDE_PASS_TARGET) conn->las_sweep_cnt += WT_SWEEP_LOOKASIDE_MIN_CNT; } srch_notfound: if (ret == WT_NOTFOUND) conn->las_sweep_call = 0; WT_ERR_NOTFOUND_OK(ret); if (0) { err: __wt_buf_free(session, key); } WT_TRET(__wt_las_cursor_close(session, &cursor, session_flags)); __wt_scr_free(session, &las_addr); __wt_scr_free(session, &las_key); return (ret); }
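/*
 * A worked example of the adjustment above, assuming the constants in
 * the source (100-row minimum, 30-call pass target): finishing the
 * table in fewer than 30 calls sheds 100 rows per call, overshooting
 * the target adds 100, and the count never drops below the minimum.
 */
#include <stdio.h>

#define SWEEP_MIN_CNT		100
#define SWEEP_PASS_TARGET	30

static unsigned int
adjust_sweep_cnt(unsigned int calls, unsigned int cnt, int finished)
{
	if (finished || calls > SWEEP_PASS_TARGET) {
		if (calls < SWEEP_PASS_TARGET && cnt > SWEEP_MIN_CNT)
			cnt -= SWEEP_MIN_CNT;
		if (calls > SWEEP_PASS_TARGET)
			cnt += SWEEP_MIN_CNT;
	}
	return (cnt);
}

int
main(void)
{
	/* Finished early (20 calls): 300 -> 200 rows per call. */
	printf("%u\n", adjust_sweep_cnt(20, 300, 1));
	/* Still going at call 31: 300 -> 400 rows per call. */
	printf("%u\n", adjust_sweep_cnt(31, 300, 0));
	return (0);
}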
/*
 * __clsm_enter --
 *	Start an operation on an LSM cursor, update if the tree has changed.
 */
static inline int
__clsm_enter(WT_CURSOR_LSM *clsm, int reset, int update)
{
	WT_CURSOR *c;
	WT_DECL_RET;
	WT_LSM_CHUNK *chunk;
	WT_SESSION_IMPL *session;
	uint64_t *txnid_maxp;
	uint64_t id, myid, snap_min;

	session = (WT_SESSION_IMPL *)clsm->iface.session;

	/* Merge cursors never update. */
	if (F_ISSET(clsm, WT_CLSM_MERGE))
		return (0);

	if (reset) {
		c = &clsm->iface;
		/* Copy out data before resetting chunk cursors. */
		if (F_ISSET(c, WT_CURSTD_KEY_INT) &&
		    !WT_DATA_IN_ITEM(&c->key))
			WT_RET(__wt_buf_set(
			    session, &c->key, c->key.data, c->key.size));
		if (F_ISSET(c, WT_CURSTD_VALUE_INT) &&
		    !WT_DATA_IN_ITEM(&c->value))
			WT_RET(__wt_buf_set(
			    session, &c->value, c->value.data, c->value.size));
		WT_RET(__clsm_reset_cursors(clsm, NULL));
	}

	for (;;) {
		/*
		 * If the cursor looks up-to-date, check if the cache is full.
		 * In case this call blocks, the check will be repeated before
		 * proceeding.
		 */
		if (clsm->dsk_gen != clsm->lsm_tree->dsk_gen)
			goto open;

		WT_RET(__wt_cache_full_check(session));

		if (clsm->dsk_gen != clsm->lsm_tree->dsk_gen)
			goto open;

		/* Update the maximum transaction ID in the primary chunk. */
		if (update && (chunk = clsm->primary_chunk) != NULL) {
			WT_RET(__wt_txn_autocommit_check(session));
			for (id = chunk->txnid_max, myid = session->txn.id;
			    !TXNID_LE(myid, id);
			    id = chunk->txnid_max) {
				WT_ASSERT(session, myid != WT_TXN_NONE);
				(void)WT_ATOMIC_CAS(
				    chunk->txnid_max, id, myid);
			}
		}

		/*
		 * Figure out how many updates are required for snapshot
		 * isolation.
		 *
		 * This is not a normal visibility check on the maximum
		 * transaction ID in each chunk: any transaction ID that
		 * overlaps with our snapshot is a potential conflict.
		 */
		clsm->nupdates = 1;
		if (session->txn.isolation == TXN_ISO_SNAPSHOT &&
		    F_ISSET(clsm, WT_CLSM_OPEN_SNAPSHOT)) {
			snap_min = session->txn.snap_min;
			for (txnid_maxp = &clsm->txnid_max[clsm->nchunks - 2];
			    clsm->nupdates < clsm->nchunks;
			    clsm->nupdates++, txnid_maxp--)
				if (TXNID_LT(*txnid_maxp, snap_min))
					break;
		}

		/*
		 * Stop when we are up-to-date, as long as this is:
		 *   - a snapshot isolation update and the cursor is set up
		 *     for that;
		 *   - an update operation with a primary chunk, or
		 *   - a read operation and the cursor is open for reading.
		 */
		if ((!update ||
		    session->txn.isolation != TXN_ISO_SNAPSHOT ||
		    F_ISSET(clsm, WT_CLSM_OPEN_SNAPSHOT)) &&
		    ((update && clsm->primary_chunk != NULL) ||
		    (!update && F_ISSET(clsm, WT_CLSM_OPEN_READ))))
			break;

open:		WT_WITH_SCHEMA_LOCK(session,
		    ret = __clsm_open_cursors(clsm, update, 0, 0));
		WT_RET(ret);
	}

	if (!F_ISSET(clsm, WT_CLSM_ACTIVE)) {
		WT_RET(__cursor_enter(session));
		F_SET(clsm, WT_CLSM_ACTIVE);
	}

	return (0);
}
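The update of chunk->txnid_max above is a lock-free "raise the maximum" loop: re-read the current maximum and retry the compare-and-swap until either it succeeds or another thread has installed a value at least as large. A self-contained sketch of the same pattern using C11 atomics (WT_ATOMIC_CAS and the TXNID_* macros are WiredTiger-specific; the real macros also handle transaction-ID wraparound, which this plain comparison does not):

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

/*
 * Raise a shared maximum to at least "myid" without locking. On a failed
 * CAS, "id" is reloaded with the current value, so the loop exits as soon
 * as some thread (us or another) has installed a value >= myid.
 */
static void
txnid_max_update(_Atomic uint64_t *maxp, uint64_t myid)
{
	uint64_t id = atomic_load(maxp);

	while (id < myid &&
	    !atomic_compare_exchange_weak(maxp, &id, myid))
		;	/* retry with the freshly loaded "id" */
}

int
main(void)
{
	_Atomic uint64_t txnid_max = 10;

	txnid_max_update(&txnid_max, 42);	/* raises the maximum */
	txnid_max_update(&txnid_max, 7);	/* no-op: 42 >= 7 */
	printf("%llu\n", (unsigned long long)atomic_load(&txnid_max));
	return (0);
}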
/*
 * __wt_kv_return --
 *	Return a page referenced key/value pair to the application.
 */
int
__wt_kv_return(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, int key_ret)
{
	WT_BTREE *btree;
	WT_CELL *cell;
	WT_CELL_UNPACK *unpack, _unpack;
	WT_CURSOR *cursor;
	WT_IKEY *ikey;
	WT_PAGE *page;
	WT_ROW *rip;
	WT_UPDATE *upd;
	uint8_t v;

	btree = session->btree;
	unpack = &_unpack;
	page = cbt->page;

	cursor = &cbt->iface;
	switch (page->type) {
	case WT_PAGE_COL_FIX:
		if (key_ret)
			cursor->recno = cbt->recno;

		/*
		 * If the cursor references a WT_INSERT item, take the related
		 * WT_UPDATE item.
		 */
		if (cbt->ins != NULL) {
			upd = cbt->ins->upd;
			cursor->value.data = WT_UPDATE_DATA(upd);
			cursor->value.size = upd->size;
			return (0);
		}

		v = __bit_getv_recno(page, cbt->iface.recno, btree->bitcnt);
		return (__wt_buf_set(session, &cursor->value, &v, 1));
	case WT_PAGE_COL_VAR:
		if (key_ret)
			cursor->recno = cbt->recno;

		/*
		 * If the cursor references a WT_INSERT item, take the related
		 * WT_UPDATE item.
		 */
		if (cbt->ins != NULL) {
			upd = cbt->ins->upd;
			cursor->value.data = WT_UPDATE_DATA(upd);
			cursor->value.size = upd->size;
			return (0);
		}
		cell = WT_COL_PTR(page, &page->u.col_var.d[cbt->slot]);
		break;
	case WT_PAGE_ROW_LEAF:
		rip = &page->u.row.d[cbt->slot];

		/*
		 * If the cursor references a WT_INSERT item, take the key and
		 * related WT_UPDATE item. Otherwise, take the key from the
		 * original page, and the value from any related WT_UPDATE
		 * item, or the page if the key was never updated.
		 */
		if (cbt->ins == NULL) {
			if (key_ret) {
				if (__wt_off_page(page, rip->key)) {
					ikey = rip->key;
					cursor->key.data = WT_IKEY_DATA(ikey);
					cursor->key.size = ikey->size;
				} else
					WT_RET(__wt_row_key(
					    session, page, rip, &cursor->key));
			}
			upd = WT_ROW_UPDATE(page, rip);
		} else {
			if (key_ret) {
				cursor->key.data = WT_INSERT_KEY(cbt->ins);
				cursor->key.size =
				    WT_INSERT_KEY_SIZE(cbt->ins);
			}
			upd = cbt->ins->upd;
		}

		if (upd != NULL) {
			cursor->value.data = WT_UPDATE_DATA(upd);
			cursor->value.size = upd->size;
			return (0);
		}

		/* Take the original cell (which may be empty). */
		if ((cell = __wt_row_value(page, rip)) == NULL) {
			cursor->value.size = 0;
			return (0);
		}
		break;
	WT_ILLEGAL_VALUE(session);
	}

	/* It's a cell, unpack and expand it as necessary. */
	__wt_cell_unpack(cell, unpack);
	if (btree->huffman_value == NULL && unpack->type == WT_CELL_VALUE) {
		cursor->value.data = unpack->data;
		cursor->value.size = unpack->size;
		return (0);
	} else
		return (
		    __wt_cell_unpack_copy(session, unpack, &cursor->value));
}
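Both column-store cases above ultimately fetch a fixed-width value from a packed bitmap via __bit_getv_recno. Here is a hypothetical sketch of that kind of accessor, assuming a little-endian bit layout and power-of-two widths (1, 2, 4, or 8 bits) so an entry never spans a byte boundary; WiredTiger's actual bit layout may differ:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Fetch the "bitcnt"-bit value for entry "entry" from a packed bitmap. */
static uint8_t
bit_getv(const uint8_t *bitf, uint64_t entry, uint32_t bitcnt)
{
	uint64_t bit_off;

	assert(bitcnt == 1 || bitcnt == 2 || bitcnt == 4 || bitcnt == 8);
	bit_off = entry * bitcnt;

	/* Power-of-two widths mean a value never spans two bytes. */
	return (uint8_t)((bitf[bit_off >> 3] >> (bit_off & 7)) &
	    ((1u << bitcnt) - 1));
}

int
main(void)
{
	/* Eight 2-bit entries, 0,1,2,3,0,1,2,3, packed into two bytes. */
	const uint8_t bitf[] = { 0xe4, 0xe4 };
	uint64_t i;

	for (i = 0; i < 8; ++i)
		printf("%u ", bit_getv(bitf, i, 2));
	printf("\n");		/* prints: 0 1 2 3 0 1 2 3 */
	return (0);
}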
/*
 * __wt_las_sweep --
 *	Sweep the lookaside table.
 */
int
__wt_las_sweep(WT_SESSION_IMPL *session)
{
	WT_CONNECTION_IMPL *conn;
	WT_CURSOR *cursor;
	WT_DECL_ITEM(las_addr);
	WT_DECL_ITEM(las_key);
	WT_DECL_RET;
	WT_ITEM *key;
	uint64_t cnt, las_counter, las_txnid;
	int64_t remove_cnt;
	uint32_t las_id, session_flags;
	int notused;

	conn = S2C(session);
	cursor = NULL;
	key = &conn->las_sweep_key;
	remove_cnt = 0;
	session_flags = 0;		/* [-Werror=maybe-uninitialized] */

	WT_ERR(__wt_scr_alloc(session, 0, &las_addr));
	WT_ERR(__wt_scr_alloc(session, 0, &las_key));

	WT_ERR(__wt_las_cursor(session, &cursor, &session_flags));

	/*
	 * If we're not starting a new sweep, position the cursor using the key
	 * from the last call (we don't care if we're before or after the key,
	 * just roughly in the same spot is fine).
	 */
	if (key->size != 0) {
		__wt_cursor_set_raw_key(cursor, key);
		ret = cursor->search_near(cursor, &notused);

		/*
		 * Don't search for the same key twice; if we don't set a new
		 * key below, it's because we've reached the end of the table
		 * and we want the next pass to start at the beginning of the
		 * table. Searching for the same key could leave us stuck at
		 * the end of the table, repeatedly checking the same rows.
		 */
		key->size = 0;
		if (ret != 0)
			goto srch_notfound;
	}

	/*
	 * The sweep server wakes up every 10 seconds (by default); it's a
	 * slow-moving thread. Try to review the entire lookaside table once
	 * every 5 minutes, or every 30 calls.
	 *
	 * The lookaside table exists because we're seeing cache/eviction
	 * pressure (it allows us to trade performance and disk space for
	 * cache space), and it's likely lookaside blocks are being evicted,
	 * and reading them back in doesn't help things. A trickier, but
	 * possibly better, alternative might be to review all lookaside
	 * blocks in the cache in order to get rid of them, and slowly review
	 * lookaside blocks that have already been evicted.
	 */
	cnt = (uint64_t)WT_MAX(100, conn->las_record_cnt / 30);

	/* Discard pages we read as soon as we're done with them. */
	F_SET(session, WT_SESSION_NO_CACHE);

	/* Walk the file. */
	for (; cnt > 0 && (ret = cursor->next(cursor)) == 0; --cnt) {
		/*
		 * If the loop terminates after completing a work unit, we will
		 * continue the table sweep next time. Get a local copy of the
		 * sweep key, we're going to reset the cursor; do so before
		 * calling cursor.remove, cursor.remove can discard our hazard
		 * pointer and the page could be evicted from underneath us.
		 */
		if (cnt == 1) {
			WT_ERR(__wt_cursor_get_raw_key(cursor, key));
			if (!WT_DATA_IN_ITEM(key))
				WT_ERR(__wt_buf_set(
				    session, key, key->data, key->size));
		}

		WT_ERR(cursor->get_key(cursor,
		    &las_id, las_addr, &las_counter, &las_txnid, las_key));

		/*
		 * If the on-page record transaction ID associated with the
		 * record is globally visible, the record can be discarded.
		 *
		 * Cursor opened overwrite=true: won't return WT_NOTFOUND
		 * should another thread remove the record before we do, and
		 * the cursor remains positioned in that case.
		 */
		if (__wt_txn_visible_all(session, las_txnid)) {
			WT_ERR(cursor->remove(cursor));
			++remove_cnt;
		}
	}

srch_notfound:
	WT_ERR_NOTFOUND_OK(ret);

	if (0) {
err:		__wt_buf_free(session, key);
	}

	WT_TRET(__wt_las_cursor_close(session, &cursor, session_flags));

	/*
	 * If there were races to remove records, we can over-count. All
	 * arithmetic is signed, so underflow isn't fatal, but check anyway so
	 * we don't skew low over time.
	 */
	if (remove_cnt > S2C(session)->las_record_cnt)
		S2C(session)->las_record_cnt = 0;
	else if (remove_cnt > 0)
		(void)__wt_atomic_subi64(&conn->las_record_cnt, remove_cnt);

	F_CLR(session, WT_SESSION_NO_CACHE);

	__wt_scr_free(session, &las_addr);
	__wt_scr_free(session, &las_key);

	return (ret);
}
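The clamped decrement at the end is a small pattern worth isolating: concurrent removers can double-count, so the subtraction is capped at the current total instead of letting the signed counter drift negative. A standalone sketch with C11 atomics (the names are invented; as in the original, the check-then-store clamp is itself racy, which is tolerable because the arithmetic is signed and a later pass re-clamps):

#include <stdatomic.h>
#include <stdint.h>

/*
 * Subtract "remove_cnt" from a shared record count, clamping at zero so
 * over-counted removals can't make the total drift (and stay) negative.
 */
static void
record_cnt_decr(_Atomic int64_t *cntp, int64_t remove_cnt)
{
	if (remove_cnt > atomic_load(cntp))
		atomic_store(cntp, 0);		/* over-counted: reset */
	else if (remove_cnt > 0)
		(void)atomic_fetch_sub(cntp, remove_cnt);
}

int
main(void)
{
	_Atomic int64_t record_cnt = 50;

	record_cnt_decr(&record_cnt, 30);	/* 50 -> 20 */
	record_cnt_decr(&record_cnt, 100);	/* over-count: 20 -> 0 */
	return (atomic_load(&record_cnt) == 0 ? 0 : 1);
}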
/*
 * __wt_kv_return --
 *	Return a page referenced key/value pair to the application.
 */
int
__wt_kv_return(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_UPDATE *upd)
{
	WT_BTREE *btree;
	WT_CELL *cell;
	WT_CELL_UNPACK unpack;
	WT_CURSOR *cursor;
	WT_ITEM *tmp;
	WT_PAGE *page;
	WT_ROW *rip;
	uint8_t v;

	btree = S2BT(session);

	page = cbt->ref->page;
	cursor = &cbt->iface;

	switch (page->type) {
	case WT_PAGE_COL_FIX:
		/*
		 * The interface cursor's record has usually been set, but that
		 * isn't universally true, specifically, cursor.search_near may
		 * call here without first setting the interface cursor.
		 */
		cursor->recno = cbt->recno;

		/* If the cursor references a WT_UPDATE item, return it. */
		if (upd != NULL) {
			cursor->value.data = WT_UPDATE_DATA(upd);
			cursor->value.size = upd->size;
			return (0);
		}

		/* Take the value from the original page. */
		v = __bit_getv_recno(page, cbt->iface.recno, btree->bitcnt);
		return (__wt_buf_set(session, &cursor->value, &v, 1));
	case WT_PAGE_COL_VAR:
		/*
		 * The interface cursor's record has usually been set, but that
		 * isn't universally true, specifically, cursor.search_near may
		 * call here without first setting the interface cursor.
		 */
		cursor->recno = cbt->recno;

		/* If the cursor references a WT_UPDATE item, return it. */
		if (upd != NULL) {
			cursor->value.data = WT_UPDATE_DATA(upd);
			cursor->value.size = upd->size;
			return (0);
		}

		/* Take the value from the original page cell. */
		cell = WT_COL_PTR(page, &page->pg_var_d[cbt->slot]);
		break;
	case WT_PAGE_ROW_LEAF:
		rip = &page->pg_row_d[cbt->slot];

		/*
		 * If the cursor references a WT_INSERT item, take its key.
		 * Else, if we have an exact match, we copied the key in the
		 * search function, take it from there. If we don't have an
		 * exact match, take the key from the original page.
		 */
		if (cbt->ins != NULL) {
			cursor->key.data = WT_INSERT_KEY(cbt->ins);
			cursor->key.size = WT_INSERT_KEY_SIZE(cbt->ins);
		} else if (cbt->compare == 0) {
			/*
			 * If not in an insert list and there's an exact match,
			 * the row-store search function built the key we want
			 * to return in the cursor's temporary buffer. Swap the
			 * cursor's search-key and temporary buffers so we can
			 * return it (it's unsafe to return the temporary
			 * buffer itself because our caller might do another
			 * search in this table using the key we return, and
			 * we'd corrupt the search key during any subsequent
			 * search that used the temporary buffer).
			 */
			tmp = cbt->row_key;
			cbt->row_key = cbt->tmp;
			cbt->tmp = tmp;

			cursor->key.data = cbt->row_key->data;
			cursor->key.size = cbt->row_key->size;
		} else
			WT_RET(__wt_row_leaf_key(
			    session, page, rip, &cursor->key, false));

		/* If the cursor references a WT_UPDATE item, return it. */
		if (upd != NULL) {
			cursor->value.data = WT_UPDATE_DATA(upd);
			cursor->value.size = upd->size;
			return (0);
		}

		/* Simple values have their location encoded in the WT_ROW. */
		if (__wt_row_leaf_value(page, rip, &cursor->value))
			return (0);

		/*
		 * Take the value from the original page cell (which may be
		 * empty).
		 */
		if ((cell =
		    __wt_row_leaf_value_cell(page, rip, NULL)) == NULL) {
			cursor->value.size = 0;
			return (0);
		}
		break;
	WT_ILLEGAL_VALUE(session);
	}

	/* The value is an on-page cell, unpack and expand it as necessary. */
	__wt_cell_unpack(cell, &unpack);
	WT_RET(
	    __wt_page_cell_data_ref(session, page, &unpack, &cursor->value));

	return (0);
}
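The exact-match key path above avoids a copy by swapping buffer pointers. A minimal sketch of that trick with invented types (real WT_ITEM buffers also carry allocation state; this shows only the pointer exchange and why the returned key survives the next search):

#include <stdio.h>
#include <string.h>

struct buf {
	char data[64];
	size_t size;
};

struct cursor_btree {
	struct buf *row_key;	/* stable storage for the returned key */
	struct buf *tmp;	/* scratch buffer reused by every search */
};

/* A stand-in search that builds its result key in the scratch buffer. */
static void
search(struct cursor_btree *cbt, const char *k)
{
	cbt->tmp->size = strlen(k) + 1;
	memcpy(cbt->tmp->data, k, cbt->tmp->size);
}

/*
 * Return the key found by the last search: swap the buffer pointers, an
 * O(1) exchange rather than a memcpy, so the key now lives in row_key,
 * which the next search won't overwrite.
 */
static const char *
return_key(struct cursor_btree *cbt)
{
	struct buf *swap;

	swap = cbt->row_key;
	cbt->row_key = cbt->tmp;
	cbt->tmp = swap;
	return (cbt->row_key->data);
}

int
main(void)
{
	struct buf a, b;
	struct cursor_btree cbt = { &a, &b };
	const char *key;

	search(&cbt, "key-1");
	key = return_key(&cbt);	/* "key-1", now in stable storage */
	search(&cbt, "key-2");	/* scratch reuse can't corrupt it */
	printf("%s\n", key);	/* still prints "key-1" */
	return (0);
}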