/*
 * __wt_txn_truncate_log --
 *	Begin truncating a range of a file: build a truncate operation from the
 *	optional start/stop cursors and write it into the in-memory log.
 *
 *	A NULL start and/or stop cursor means that end of the range is open.
 *	Returns 0 on success, otherwise the first non-zero error returned by a
 *	helper.
 */
int
__wt_txn_truncate_log(
    WT_SESSION_IMPL *session, WT_CURSOR_BTREE *start, WT_CURSOR_BTREE *stop)
{
	WT_BTREE *tree;
	WT_ITEM *bound;
	WT_TXN_OP *txn_op;

	tree = S2BT(session);

	WT_RET(__txn_next_op(session, &txn_op));

	if (tree->type != BTREE_ROW) {
		/* Not a row-store: the bounds are record numbers. */
		txn_op->type = WT_TXN_OP_TRUNCATE_COL;

		if (start == NULL)
			txn_op->u.truncate_col.start = WT_RECNO_OOB;
		else
			txn_op->u.truncate_col.start = start->recno;

		if (stop == NULL)
			txn_op->u.truncate_col.stop = WT_RECNO_OOB;
		else
			txn_op->u.truncate_col.stop = stop->recno;
	} else {
		/* Row-store: the bounds are copies of the cursors' raw keys. */
		txn_op->type = WT_TXN_OP_TRUNCATE_ROW;
		txn_op->u.truncate_row.mode = WT_TXN_TRUNC_ALL;
		WT_CLEAR(txn_op->u.truncate_row.start);
		WT_CLEAR(txn_op->u.truncate_row.stop);

		if (start != NULL) {
			txn_op->u.truncate_row.mode = WT_TXN_TRUNC_START;
			bound = &txn_op->u.truncate_row.start;
			WT_RET(__wt_cursor_get_raw_key(&start->iface, bound));
			/* Take a private copy of the key bytes. */
			WT_RET(__wt_buf_set(
			    session, bound, bound->data, bound->size));
		}

		if (stop != NULL) {
			/*
			 * The mode is still "all" exactly when there was no
			 * start cursor, so test the start cursor directly.
			 */
			if (start == NULL)
				txn_op->u.truncate_row.mode =
				    WT_TXN_TRUNC_STOP;
			else
				txn_op->u.truncate_row.mode =
				    WT_TXN_TRUNC_BOTH;
			bound = &txn_op->u.truncate_row.stop;
			WT_RET(__wt_cursor_get_raw_key(&stop->iface, bound));
			/* Take a private copy of the key bytes. */
			WT_RET(__wt_buf_set(
			    session, bound, bound->data, bound->size));
		}
	}

	/* Write that operation into the in-memory log. */
	WT_RET(__wt_txn_log_op(session, NULL));

	/* The in-memory logging flag must not already be set; set it now. */
	WT_ASSERT(session, !F_ISSET(session, WT_SESSION_LOGGING_INMEM));
	F_SET(session, WT_SESSION_LOGGING_INMEM);
	return (0);
}
/*
 * __wt_apply_single_idx --
 *	Apply an operation to a single index of a table.
 *
 *	"f" is the cursor method to run against the index cursor "cur". If the
 *	index has a custom extractor, the extractor is handed the table
 *	cursor's raw key and value along with a temporary cursor whose insert
 *	method is __curextract_insert. Otherwise the index key is built from
 *	the column group cursors and "f" is called directly.
 *
 *	Returns 0 on success or the first non-zero error encountered.
 */
int
__wt_apply_single_idx(WT_SESSION_IMPL *session, WT_INDEX *idx,
    WT_CURSOR *cur, WT_CURSOR_TABLE *ctable, int (*f)(WT_CURSOR *))
{
	WT_CURSOR_STATIC_INIT(extract_iface,
	    __wt_cursor_get_key,		/* get-key */
	    __wt_cursor_get_value,		/* get-value */
	    __wt_cursor_set_key,		/* set-key */
	    __wt_cursor_set_value,		/* set-value */
	    __wt_cursor_notsup,			/* compare */
	    __wt_cursor_notsup,			/* equals */
	    __wt_cursor_notsup,			/* next */
	    __wt_cursor_notsup,			/* prev */
	    __wt_cursor_notsup,			/* reset */
	    __wt_cursor_notsup,			/* search */
	    __wt_cursor_notsup,			/* search-near */
	    __curextract_insert,		/* insert */
	    __wt_cursor_notsup,			/* update */
	    __wt_cursor_notsup,			/* reconfigure */
	    __wt_cursor_notsup,			/* remove */
	    __wt_cursor_notsup);		/* close */
	WT_CURSOR_EXTRACTOR extractor_cursor;
	WT_DECL_RET;
	WT_ITEM primary_key, primary_value;

	/* No custom extractor: project the index key and apply directly. */
	if (!idx->extractor) {
		WT_RET(__wt_schema_project_merge(session,
		    ctable->cg_cursors,
		    idx->key_plan, idx->key_format, &cur->key));

		/*
		 * The index key is now set and the value is empty
		 * (it starts clear and is never set).
		 */
		F_SET(cur, WT_CURSTD_KEY_EXT | WT_CURSTD_VALUE_EXT);
		return (f(cur));
	}

	/* Set up the temporary cursor the extractor inserts keys into. */
	extractor_cursor.iface = extract_iface;
	extractor_cursor.iface.session = &session->iface;
	extractor_cursor.iface.key_format = idx->exkey_format;
	extractor_cursor.ctable = ctable;
	extractor_cursor.idxc = cur;
	extractor_cursor.f = f;

	WT_RET(__wt_cursor_get_raw_key(&ctable->iface, &primary_key));
	WT_RET(__wt_cursor_get_raw_value(&ctable->iface, &primary_value));

	ret = idx->extractor->extract(idx->extractor, &session->iface,
	    &primary_key, &primary_value, &extractor_cursor.iface);

	/* Free the temporary cursor's key buffer whether or not we failed. */
	__wt_buf_free(session, &extractor_cursor.iface.key);
	return (ret);
}
/*
 * __curextract_insert --
 *	Handle a key produced by a custom extractor: build the full index key
 *	(extracted key followed by the primary key) in the index cursor and
 *	call the saved cursor function on it.
 */
static int
__curextract_insert(WT_CURSOR *cursor)
{
	WT_CURSOR_EXTRACTOR *extractor;
	WT_ITEM *index_key, extracted, primary;
	WT_SESSION_IMPL *session;
	size_t total_size;
	uint8_t *dest;

	extractor = (WT_CURSOR_EXTRACTOR *)cursor;
	session = (WT_SESSION_IMPL *)cursor->session;

	WT_ITEM_SET(extracted, cursor->key);

	/*
	 * We appended a padding byte to the key to avoid rewriting the last
	 * column.  Strip that away here.
	 */
	WT_ASSERT(session, extracted.size > 0);
	--extracted.size;

	WT_RET(__wt_cursor_get_raw_key(
	    extractor->ctable->cg_cursors[0], &primary));
	total_size = extracted.size + primary.size;

	/*
	 * We have the index key in the format we need, and all of the primary
	 * key columns are required: just append them.
	 */
	index_key = &extractor->idxc->key;
	WT_RET(__wt_buf_grow(session, index_key, total_size));
	dest = (uint8_t *)index_key->mem;
	memcpy(dest, extracted.data, extracted.size);
	memcpy(dest + extracted.size, primary.data, primary.size);
	index_key->size = total_size;

	/*
	 * The index key is now set and the value is empty (it starts clear and
	 * is never set).
	 */
	F_SET(extractor->idxc, WT_CURSTD_KEY_EXT | WT_CURSTD_VALUE_EXT);

	/* Call the underlying cursor function to update the index. */
	return (extractor->f(extractor->idxc));
}
/*
 * __wt_las_sweep --
 *	Sweep the lookaside table.
 *
 *	Each call reviews a bounded number of lookaside records, removing any
 *	whose transaction ID is globally visible. The position reached is
 *	saved in the connection's sweep key so the next call continues from
 *	roughly the same place; an empty sweep key means start from the
 *	beginning of the table.
 *
 *	Returns the result of the sweep after WT_ERR_NOTFOUND_OK has been
 *	applied to the cursor walk, combined with any error from closing the
 *	lookaside cursor.
 */
int
__wt_las_sweep(WT_SESSION_IMPL *session)
{
	WT_CONNECTION_IMPL *conn;
	WT_CURSOR *cursor;
	WT_DECL_ITEM(las_addr);
	WT_DECL_ITEM(las_key);
	WT_DECL_RET;
	WT_ITEM *key;
	uint64_t cnt, las_counter, las_txnid;
	int64_t remove_cnt;
	uint32_t las_id, session_flags;
	int notused;

	conn = S2C(session);
	cursor = NULL;
	key = &conn->las_sweep_key;
	remove_cnt = 0;
	session_flags = 0;		/* [-Werror=maybe-uninitialized] */

	WT_ERR(__wt_scr_alloc(session, 0, &las_addr));
	WT_ERR(__wt_scr_alloc(session, 0, &las_key));

	WT_ERR(__wt_las_cursor(session, &cursor, &session_flags));

	/*
	 * If we're not starting a new sweep, position the cursor using the key
	 * from the last call (we don't care if we're before or after the key,
	 * just roughly in the same spot is fine).
	 */
	if (key->size != 0) {
		__wt_cursor_set_raw_key(cursor, key);
		/* The exact-match indicator is required but not used. */
		ret = cursor->search_near(cursor, &notused);

		/*
		 * Don't search for the same key twice; if we don't set a new
		 * key below, it's because we've reached the end of the table
		 * and we want the next pass to start at the beginning of the
		 * table. Searching for the same key could leave us stuck at
		 * the end of the table, repeatedly checking the same rows.
		 */
		key->size = 0;
		if (ret != 0)
			goto srch_notfound;
	}

	/*
	 * The sweep server wakes up every 10 seconds (by default), it's a slow
	 * moving thread. Try to review the entire lookaside table once every 5
	 * minutes, or every 30 calls.
	 *
	 * The reason is because the lookaside table exists because we're seeing
	 * cache/eviction pressure (it allows us to trade performance and disk
	 * space for cache space), and it's likely lookaside blocks are being
	 * evicted, and reading them back in doesn't help things. A trickier,
	 * but possibly better, alternative might be to review all lookaside
	 * blocks in the cache in order to get rid of them, and slowly review
	 * lookaside blocks that have already been evicted.
	 */
	cnt = (uint64_t)WT_MAX(100, conn->las_record_cnt / 30);

	/* Discard pages we read as soon as we're done with them. */
	F_SET(session, WT_SESSION_NO_CACHE);

	/* Walk the file. */
	for (; cnt > 0 && (ret = cursor->next(cursor)) == 0; --cnt) {
		/*
		 * If the loop terminates after completing a work unit, we will
		 * continue the table sweep next time. Get a local copy of the
		 * sweep key, we're going to reset the cursor; do so before
		 * calling cursor.remove, cursor.remove can discard our hazard
		 * pointer and the page could be evicted from underneath us.
		 */
		if (cnt == 1) {
			WT_ERR(__wt_cursor_get_raw_key(cursor, key));
			if (!WT_DATA_IN_ITEM(key))
				WT_ERR(__wt_buf_set(
				    session, key, key->data, key->size));
		}

		WT_ERR(cursor->get_key(cursor,
		    &las_id, las_addr, &las_counter, &las_txnid, las_key));

		/*
		 * If the on-page record transaction ID associated with the
		 * record is globally visible, the record can be discarded.
		 *
		 * Cursor opened overwrite=true: won't return WT_NOTFOUND should
		 * another thread remove the record before we do, and the cursor
		 * remains positioned in that case.
		 */
		if (__wt_txn_visible_all(session, las_txnid)) {
			WT_ERR(cursor->remove(cursor));
			++remove_cnt;
		}
	}

srch_notfound:
	WT_ERR_NOTFOUND_OK(ret);

	if (0) {
		/* On error, discard the saved sweep position. */
err:		__wt_buf_free(session, key);
	}

	WT_TRET(__wt_las_cursor_close(session, &cursor, session_flags));

	/*
	 * If there were races to remove records, we can over-count.  All
	 * arithmetic is signed, so underflow isn't fatal, but check anyway so
	 * we don't skew low over time.
	 */
	if (remove_cnt > S2C(session)->las_record_cnt)
		S2C(session)->las_record_cnt = 0;
	else if (remove_cnt > 0)
		(void)__wt_atomic_subi64(&conn->las_record_cnt, remove_cnt);

	F_CLR(session, WT_SESSION_NO_CACHE);

	__wt_scr_free(session, &las_addr);
	__wt_scr_free(session, &las_key);

	return (ret);
}
/*
 * __apply_idx --
 *	Apply an operation to all indices of a table.
 *
 *	"func_off" is the byte offset, within an index cursor, of the cursor
 *	method to call. When "skip_immutable" is non-zero, indices flagged
 *	WT_INDEX_IMMUTABLE are skipped. Each index cursor is reset after the
 *	operation is applied to it.
 *
 *	Returns 0 on success or the first non-zero error encountered.
 */
static int
__apply_idx(WT_CURSOR_TABLE *ctable, size_t func_off, int skip_immutable)
{
	WT_CURSOR_STATIC_INIT(extract_iface,
	    __wt_cursor_get_key,		/* get-key */
	    __wt_cursor_get_value,		/* get-value */
	    __wt_cursor_set_key,		/* set-key */
	    __wt_cursor_set_value,		/* set-value */
	    __wt_cursor_notsup,			/* compare */
	    __wt_cursor_notsup,			/* next */
	    __wt_cursor_notsup,			/* prev */
	    __wt_cursor_notsup,			/* reset */
	    __wt_cursor_notsup,			/* search */
	    __wt_cursor_notsup,			/* search-near */
	    __curextract_insert,		/* insert */
	    __wt_cursor_notsup,			/* update */
	    __wt_cursor_notsup,			/* remove */
	    __wt_cursor_notsup);		/* close */
	WT_CURSOR **idxcp;
	WT_CURSOR_EXTRACTOR extractor_cursor;
	WT_DECL_RET;
	WT_INDEX *idx;
	WT_ITEM primary_key, primary_value;
	WT_SESSION_IMPL *session;
	int (*apply)(WT_CURSOR *);
	u_int slot;

	session = (WT_SESSION_IMPL *)ctable->iface.session;

	for (slot = 0, idxcp = ctable->idx_cursors;
	    slot < ctable->table->nindices; slot++, idxcp++) {
		idx = ctable->table->indices[slot];
		if (skip_immutable && F_ISSET(idx, WT_INDEX_IMMUTABLE))
			continue;

		/* Fetch the requested method from this index cursor. */
		apply = *(int (**)(WT_CURSOR *))((uint8_t *)*idxcp + func_off);

		if (!idx->extractor) {
			WT_RET(__wt_schema_project_merge(session,
			    ctable->cg_cursors,
			    idx->key_plan, idx->key_format, &(*idxcp)->key));

			/*
			 * The index key is now set and the value is empty
			 * (it starts clear and is never set).
			 */
			F_SET(*idxcp,
			    WT_CURSTD_KEY_EXT | WT_CURSTD_VALUE_EXT);
			WT_RET(apply(*idxcp));
		} else {
			/*
			 * Custom extractor: hand it the table's raw key and
			 * value and a temporary cursor to insert into.
			 */
			extractor_cursor.iface = extract_iface;
			extractor_cursor.iface.session = &session->iface;
			extractor_cursor.iface.key_format = idx->exkey_format;
			extractor_cursor.ctable = ctable;
			extractor_cursor.idxc = *idxcp;
			extractor_cursor.f = apply;

			WT_RET(__wt_cursor_get_raw_key(
			    &ctable->iface, &primary_key));
			WT_RET(__wt_cursor_get_raw_value(
			    &ctable->iface, &primary_value));
			ret = idx->extractor->extract(idx->extractor,
			    &session->iface, &primary_key, &primary_value,
			    &extractor_cursor.iface);

			/* Free the temporary key buffer even on failure. */
			__wt_buf_free(session, &extractor_cursor.iface.key);
			WT_RET(ret);
		}

		WT_RET((*idxcp)->reset(*idxcp));
	}

	return (0);
}
/*
 * __wt_las_sweep --
 *	Sweep the lookaside table.
 *
 *	Each call reviews a bounded number of lookaside records, removing any
 *	whose transaction ID is globally visible. The position reached is
 *	saved in the connection's sweep key so the next call continues from
 *	roughly the same place. The number of rows reviewed per call is tuned
 *	over time based on how many calls a full pass takes.
 *
 *	Returns the result of the sweep after WT_ERR_NOTFOUND_OK has been
 *	applied to the cursor walk, combined with any error from closing the
 *	lookaside cursor.
 */
int
__wt_las_sweep(WT_SESSION_IMPL *session)
{
	WT_CONNECTION_IMPL *conn;
	WT_CURSOR *cursor;
	WT_DECL_ITEM(las_addr);
	WT_DECL_ITEM(las_key);
	WT_DECL_RET;
	WT_ITEM *key;
	uint64_t cnt, las_counter, las_txnid;
	uint32_t las_id, session_flags;
	int notused;

	conn = S2C(session);
	cursor = NULL;
	key = &conn->las_sweep_key;
	session_flags = 0;		/* [-Werror=maybe-uninitialized] */

	WT_ERR(__wt_scr_alloc(session, 0, &las_addr));
	WT_ERR(__wt_scr_alloc(session, 0, &las_key));

	WT_ERR(__wt_las_cursor(session, &cursor, &session_flags));

	/*
	 * If we're not starting a new sweep, position the cursor using the key
	 * from the last call (we don't care if we're before or after the key,
	 * just roughly in the same spot is fine).
	 */
	if (conn->las_sweep_call != 0 && key->data != NULL) {
		__wt_cursor_set_raw_key(cursor, key);
		/* The exact-match indicator is required but not used. */
		if ((ret = cursor->search_near(cursor, &notused)) != 0)
			goto srch_notfound;
	}

	/*
	 * The sweep server wakes up every 10 seconds (by default), it's a slow
	 * moving thread. Try to review the entire lookaside table once every 5
	 * minutes, or every 30 calls.
	 *
	 * The reason is because the lookaside table exists because we're seeing
	 * cache/eviction pressure (it allows us to trade performance and disk
	 * space for cache space), and it's likely lookaside blocks are being
	 * evicted, and reading them back in doesn't help things. A trickier,
	 * but possibly better, alternative might be to review all lookaside
	 * blocks in the cache in order to get rid of them, and slowly review
	 * lookaside blocks that have already been evicted.
	 *
	 * We can't know for sure how many records are in the lookaside table,
	 * the cursor insert and remove statistics aren't updated atomically.
	 * Start with reviewing 100 rows, and if it takes more than the target
	 * number of calls to finish, increase the number of rows checked on
	 * each call; if it takes less than the target calls to finish, then
	 * decrease the number of rows reviewed on each call (but never less
	 * than 100).
	 */
#define	WT_SWEEP_LOOKASIDE_MIN_CNT	100
#define	WT_SWEEP_LOOKASIDE_PASS_TARGET	30
	++conn->las_sweep_call;
	if ((cnt = conn->las_sweep_cnt) < WT_SWEEP_LOOKASIDE_MIN_CNT)
		cnt = conn->las_sweep_cnt = WT_SWEEP_LOOKASIDE_MIN_CNT;

	/* Walk the file. */
	for (; cnt > 0 && (ret = cursor->next(cursor)) == 0; --cnt) {
		/*
		 * If the loop terminates after completing a work unit, we will
		 * continue the table sweep next time. Get a local copy of the
		 * sweep key, we're going to reset the cursor; do so before
		 * calling cursor.remove, cursor.remove can discard our hazard
		 * pointer and the page could be evicted from underneath us.
		 */
		if (cnt == 1) {
			WT_ERR(__wt_cursor_get_raw_key(cursor, key));
			if (!WT_DATA_IN_ITEM(key))
				WT_ERR(__wt_buf_set(
				    session, key, key->data, key->size));
		}

		WT_ERR(cursor->get_key(cursor,
		    &las_id, las_addr, &las_counter, &las_txnid, las_key));

		/*
		 * If the on-page record transaction ID associated with the
		 * record is globally visible, the record can be discarded.
		 *
		 * Cursor opened overwrite=true: won't return WT_NOTFOUND should
		 * another thread remove the record before we do, and the cursor
		 * remains positioned in that case.
		 */
		if (__wt_txn_visible_all(session, las_txnid))
			WT_ERR(cursor->remove(cursor));
	}

	/*
	 * When reaching the lookaside table end or the target number of calls,
	 * adjust the row count. Decrease/increase the row count depending on
	 * if the number of calls is less/more than the target.
	 */
	if (ret == WT_NOTFOUND ||
	    conn->las_sweep_call > WT_SWEEP_LOOKASIDE_PASS_TARGET) {
		if (conn->las_sweep_call < WT_SWEEP_LOOKASIDE_PASS_TARGET &&
		    conn->las_sweep_cnt > WT_SWEEP_LOOKASIDE_MIN_CNT)
			conn->las_sweep_cnt -= WT_SWEEP_LOOKASIDE_MIN_CNT;
		if (conn->las_sweep_call > WT_SWEEP_LOOKASIDE_PASS_TARGET)
			conn->las_sweep_cnt += WT_SWEEP_LOOKASIDE_MIN_CNT;
	}

srch_notfound:
	/* Reaching the end of the table starts a new pass on the next call. */
	if (ret == WT_NOTFOUND)
		conn->las_sweep_call = 0;

	WT_ERR_NOTFOUND_OK(ret);

	if (0) {
		/* On error, discard the saved sweep position. */
err:		__wt_buf_free(session, key);
	}

	WT_TRET(__wt_las_cursor_close(session, &cursor, session_flags));

	__wt_scr_free(session, &las_addr);
	__wt_scr_free(session, &las_key);

	return (ret);
}
/* * __curdump_get_key -- * WT_CURSOR->get_key for dump cursors. */ static int __curdump_get_key(WT_CURSOR *cursor, ...) { WT_CURSOR *child; WT_CURSOR_DUMP *cdump; WT_CURSOR_JSON *json; WT_DECL_RET; WT_ITEM item, *itemp; WT_SESSION_IMPL *session; size_t size; uint64_t recno; const char *fmt; const void *buffer; va_list ap; cdump = (WT_CURSOR_DUMP *)cursor; child = cdump->child; va_start(ap, cursor); CURSOR_API_CALL(cursor, session, get_key, NULL); if (F_ISSET(cursor, WT_CURSTD_DUMP_JSON)) { json = (WT_CURSOR_JSON *)cursor->json_private; WT_ASSERT(session, json != NULL); if (WT_CURSOR_RECNO(cursor)) { WT_ERR(child->get_key(child, &recno)); buffer = &recno; size = sizeof(recno); fmt = "R"; } else { WT_ERR(__wt_cursor_get_raw_key(child, &item)); buffer = item.data; size = item.size; if (F_ISSET(cursor, WT_CURSTD_RAW)) fmt = "u"; else fmt = cursor->key_format; } ret = __wt_json_alloc_unpack( session, buffer, size, fmt, json, true, ap); } else { if (WT_CURSOR_RECNO(cursor) && !F_ISSET(cursor, WT_CURSTD_RAW)) { WT_ERR(child->get_key(child, &recno)); WT_ERR(__wt_buf_fmt(session, &cursor->key, "%" PRIu64, recno)); } else { WT_ERR(child->get_key(child, &item)); WT_ERR(__raw_to_dump(session, &item, &cursor->key, F_ISSET(cursor, WT_CURSTD_DUMP_HEX))); } if (F_ISSET(cursor, WT_CURSTD_RAW)) { itemp = va_arg(ap, WT_ITEM *); itemp->data = cursor->key.data; itemp->size = cursor->key.size; } else *va_arg(ap, const char **) = cursor->key.data; }