/*
 * __wt_bt_read --
 *	Read a cookie referenced block into a buffer.
 *
 *	session: the session; its btree handle supplies the block manager and
 *	    the (optional) compressor.
 *	buf: on success, holds the uncompressed page image.
 *	addr, addr_size: the block's address cookie and the cookie's length.
 *
 *	Returns 0 on success, a non-zero error code otherwise.
 */
int
__wt_bt_read(WT_SESSION_IMPL *session,
    WT_ITEM *buf, const uint8_t *addr, uint32_t addr_size)
{
	WT_BM *bm;
	WT_BTREE *btree;
	WT_DECL_ITEM(tmp);
	WT_DECL_RET;
	WT_PAGE_HEADER *dsk;
	size_t result_len;
	int compressed;

	btree = session->btree;
	bm = btree->bm;

	/*
	 * If anticipating a compressed block, read into a scratch buffer and
	 * decompress into the caller's buffer.  Else, read directly into the
	 * caller's buffer.
	 */
	if (btree->compressor == NULL) {
		WT_RET(bm->read(bm, session, buf, addr, addr_size));
		dsk = buf->mem;
	} else {
		WT_RET(__wt_scr_alloc(session, 0, &tmp));
		WT_ERR(bm->read(bm, session, tmp, addr, addr_size));
		dsk = tmp->mem;
	}

	/*
	 * Latch the compressed flag now.  When a compressor is configured,
	 * dsk references the scratch buffer, and the verify path below reuses
	 * that scratch buffer for the address string; the page header must
	 * not be read through dsk after that point.
	 */
	compressed = F_ISSET(dsk, WT_PAGE_COMPRESSED) ? 1 : 0;

	/*
	 * If the block is compressed, copy the skipped bytes of the original
	 * image into place, then decompress.
	 */
	if (compressed) {
		if (btree->compressor == NULL ||
		    btree->compressor->decompress == NULL)
			WT_ERR_MSG(session, WT_ERROR,
			    "read compressed block where no compression engine "
			    "configured");

		/*
		 * We're allocating the exact number of bytes we're expecting
		 * from decompression.
		 */
		WT_ERR(__wt_buf_init(session, buf, dsk->mem_size));
		buf->size = dsk->mem_size;

		/*
		 * Note the source length is NOT the number of compressed bytes,
		 * it's the length of the block we just read (minus the skipped
		 * bytes).  We don't store the number of compressed bytes: some
		 * compression engines need that length stored externally, they
		 * don't have markers in the stream to signal the end of the
		 * compressed bytes.  Those engines must store the compressed
		 * byte length somehow, see the snappy compression extension for
		 * an example.
		 */
		memcpy(buf->mem, tmp->mem, WT_BLOCK_COMPRESS_SKIP);
		WT_ERR(btree->compressor->decompress(
		    btree->compressor, &session->iface,
		    (uint8_t *)tmp->mem + WT_BLOCK_COMPRESS_SKIP,
		    tmp->size - WT_BLOCK_COMPRESS_SKIP,
		    (uint8_t *)buf->mem + WT_BLOCK_COMPRESS_SKIP,
		    dsk->mem_size - WT_BLOCK_COMPRESS_SKIP,
		    &result_len));

		/*
		 * If checksums were turned off because we're depending on the
		 * decompression to fail on any corrupted data, we'll end up
		 * here after corruption happens.  If we're salvaging the file,
		 * it's OK, otherwise it's really, really bad.
		 */
		if (result_len != dsk->mem_size - WT_BLOCK_COMPRESS_SKIP)
			WT_ERR(
			    F_ISSET(session, WT_SESSION_SALVAGE_QUIET_ERR) ?
			    WT_ERROR :
			    __wt_illegal_value(session, btree->name));
	} else if (btree->compressor == NULL)
		buf->size = dsk->mem_size;
	else
		/*
		 * We guessed wrong: there was a compressor, but this
		 * block was not compressed, and now the page is in the
		 * wrong buffer and the buffer may be of the wrong size.
		 * This should be rare, but happens with small blocks
		 * that aren't worth compressing.
		 */
		WT_ERR(__wt_buf_set(
		    session, buf, tmp->data, dsk->mem_size));

	/*
	 * If the handle is a verify handle, verify the physical page.  The
	 * scratch buffer is reused to hold the address string, which is
	 * passed to the verify function alongside the page image.
	 */
	if (F_ISSET(btree, WT_BTREE_VERIFY)) {
		if (tmp == NULL)
			WT_ERR(__wt_scr_alloc(session, 0, &tmp));
		WT_ERR(bm->addr_string(bm, session, tmp, addr, addr_size));
		WT_ERR(__wt_verify_dsk(session, (const char *)tmp->data, buf));
	}

	WT_CSTAT_INCR(session, cache_read);
	WT_DSTAT_INCR(session, cache_read);
	if (compressed)
		WT_DSTAT_INCR(session, compress_read);
	WT_CSTAT_INCRV(session, cache_bytes_read, addr_size);
	WT_DSTAT_INCRV(session, cache_bytes_read, addr_size);

err:	__wt_scr_free(&tmp);
	return (ret);
}
/*
 * __conn_config_file --
 *	Read in any WiredTiger.config file in the home directory.
 *
 *	session: the session used for file and buffer operations.
 *	cfg: the configuration stack; on success the file's collapsed contents
 *	    are inserted just below the stack's last entry.
 *	cbufp: set to the scratch buffer holding the collapsed configuration
 *	    string (the inserted cfg entry references its memory), or to NULL
 *	    if there is no configuration file, it is empty, or on error.
 *
 *	Returns 0 on success (including "no file" and "empty file"), a
 *	non-zero error code otherwise.
 */
static int
__conn_config_file(WT_SESSION_IMPL *session, const char **cfg, WT_ITEM **cbufp)
{
	WT_DECL_ITEM(cbuf);
	WT_DECL_RET;
	WT_FH *fh;
	off_t size;
	uint32_t len, remain;
	int exist, quoted;
	uint8_t *p, *t;

	*cbufp = NULL;				/* Returned buffer */

	fh = NULL;

	/* Check for an optional configuration file. */
#define	WT_CONFIGFILE	"WiredTiger.config"
	WT_RET(__wt_exist(session, WT_CONFIGFILE, &exist));
	if (!exist)
		return (0);

	/* Open the configuration file. */
	WT_RET(__wt_open(session, WT_CONFIGFILE, 0, 0, 0, &fh));
	WT_ERR(__wt_filesize(session, fh, &size));

	/* An empty file is not an error: ret is still 0 at this point. */
	if (size == 0)
		goto err;

	/*
	 * Sanity test: a 100KB configuration file would be insane.  (There's
	 * no practical reason to limit the file size, but I can either limit
	 * the file size to something rational, or I can add code to test if
	 * the off_t size is larger than a uint32_t, which is more complicated
	 * and a waste of time.)
	 */
	if (size > 100 * 1024)
		WT_ERR_MSG(session, EFBIG, WT_CONFIGFILE);
	len = (uint32_t)size;

	/*
	 * Copy the configuration file into memory, with a little slop, I'm not
	 * interested in debugging off-by-ones.
	 *
	 * The beginning of a file is the same as if we run into an unquoted
	 * newline character, simplify the parsing loop by pretending that's
	 * what we're doing.
	 */
	WT_ERR(__wt_scr_alloc(session, len + 10, &cbuf));
	WT_ERR(
	    __wt_read(session, fh, (off_t)0, len, ((uint8_t *)cbuf->mem) + 1));
	((uint8_t *)cbuf->mem)[0] = '\n';
	cbuf->size = len + 1;

	/*
	 * The buffer holds the file plus the leading newline we prepended:
	 * the parse loop must consume len + 1 bytes, otherwise the last byte
	 * of the file is silently dropped (which loses a character whenever
	 * the file doesn't end in a newline).
	 */
	remain = len + 1;

	/*
	 * Collapse the file's lines into a single string: newline characters
	 * are replaced with commas unless the newline is quoted or backslash
	 * escaped.  Comment lines (an unescaped newline where the next non-
	 * white-space character is a hash), are discarded.
	 *
	 * The collapse is done in place: t (the write position) never passes
	 * p (the read position).
	 */
	for (quoted = 0, p = t = cbuf->mem; remain > 0;) {
		/*
		 * Backslash pairs pass through untouched, unless immediately
		 * preceding a newline, in which case both the backslash and
		 * the newline are discarded.  Backslash characters escape
		 * quoted characters, too, that is, a backslash followed by a
		 * quote doesn't start or end a quoted string.
		 */
		if (*p == '\\' && remain > 1) {
			if (p[1] != '\n') {
				*t++ = p[0];
				*t++ = p[1];
			}
			p += 2;
			remain -= 2;
			continue;
		}

		/*
		 * If we're in a quoted string, or starting a quoted string,
		 * take all characters, including white-space and newlines.
		 */
		if (quoted || *p == '"') {
			if (*p == '"')
				quoted = !quoted;
			*t++ = *p++;
			--remain;
			continue;
		}

		/* Everything else gets taken, except for newline characters. */
		if (*p != '\n') {
			*t++ = *p++;
			--remain;
			continue;
		}

		/*
		 * Replace any newline characters with commas (and strings of
		 * commas are safe).
		 *
		 * After any newline, skip to a non-white-space character; if
		 * the next character is a hash mark, skip to the next newline.
		 */
		for (;;) {
			for (*t++ = ','; --remain > 0 && isspace(*++p);)
				;
			if (remain == 0)
				break;
			if (*p != '#')
				break;
			while (--remain > 0 && *++p != '\n')
				;
			if (remain == 0)
				break;
		}
	}
	*t = '\0';

#if 0
	fprintf(stderr, "file config: {%s}\n", (const char *)cbuf->data);
#endif

	/* Check the configuration string. */
	WT_ERR(__wt_config_check(
	    session, __wt_confchk_wiredtiger_open, cbuf->data, 0));

	/*
	 * The configuration file falls between the default configuration and
	 * the wiredtiger_open() configuration, overriding the defaults but not
	 * overriding the wiredtiger_open() configuration.
	 */
	while (cfg[1] != NULL)
		++cfg;
	cfg[1] = cfg[0];
	cfg[0] = cbuf->data;

	*cbufp = cbuf;

	if (0) {
err:		if (cbuf != NULL)
			__wt_buf_free(session, cbuf);
	}
	if (fh != NULL)
		WT_TRET(__wt_close(session, fh));
	return (ret);
}
/*
 * __las_page_instantiate --
 *	Instantiate lookaside update records in a recently read page.
 *
 *	session: the session.
 *	ref: the reference to the page being populated (ref->page).
 *	read_id, addr, addr_size: together form the prefix of the lookaside
 *	    keys belonging to this page's block.
 *
 *	Returns 0 on success, a non-zero error code otherwise.  On error, any
 *	update structures not yet handed to the page are freed.
 */
static int
__las_page_instantiate(WT_SESSION_IMPL *session,
    WT_REF *ref, uint32_t read_id, const uint8_t *addr, size_t addr_size)
{
	WT_CURSOR *cursor;
	WT_CURSOR_BTREE cbt;
	WT_DECL_ITEM(current_key);
	WT_DECL_ITEM(las_addr);
	WT_DECL_ITEM(las_key);
	WT_DECL_ITEM(las_value);
	WT_DECL_RET;
	WT_PAGE *page;
	WT_UPDATE *first_upd, *last_upd, *upd;
	size_t incr, total_incr;
	uint64_t current_recno, las_counter, las_txnid, recno, upd_txnid;
	uint32_t las_id, upd_size, session_flags;
	int exact;
	const uint8_t *p;

	cursor = NULL;
	page = ref->page;
	first_upd = last_upd = upd = NULL;
	total_incr = 0;
	current_recno = recno = WT_RECNO_OOB;
	session_flags = 0;		/* [-Werror=maybe-uninitialized] */

	__wt_btcur_init(session, &cbt);
	__wt_btcur_open(&cbt);

	WT_ERR(__wt_scr_alloc(session, 0, &current_key));
	WT_ERR(__wt_scr_alloc(session, 0, &las_addr));
	WT_ERR(__wt_scr_alloc(session, 0, &las_key));
	WT_ERR(__wt_scr_alloc(session, 0, &las_value));

	/* Open a lookaside table cursor. */
	WT_ERR(__wt_las_cursor(session, &cursor, &session_flags));

	/*
	 * The lookaside records are in key and update order, that is, there
	 * will be a set of in-order updates for a key, then another set of
	 * in-order updates for a subsequent key.  We process all of the updates
	 * for a key and then insert those updates into the page, then all the
	 * updates for the next key, and so on.
	 *
	 * Search for the block's unique prefix, stepping through any matching
	 * records.
	 */
	las_addr->data = addr;
	las_addr->size = addr_size;
	las_key->size = 0;
	cursor->set_key(
	    cursor, read_id, las_addr, (uint64_t)0, (uint32_t)0, las_key);
	if ((ret = cursor->search_near(cursor, &exact)) == 0 && exact < 0)
		ret = cursor->next(cursor);
	for (; ret == 0; ret = cursor->next(cursor)) {
		WT_ERR(cursor->get_key(cursor,
		    &las_id, las_addr, &las_counter, &las_txnid, las_key));

		/*
		 * Confirm the search using the unique prefix; if not a match,
		 * we're done searching for records for this page.
		 */
		if (las_id != read_id ||
		    las_addr->size != addr_size ||
		    memcmp(las_addr->data, addr, addr_size) != 0)
			break;

		/*
		 * If the on-page value has become globally visible, this record
		 * is no longer needed.
		 */
		if (__wt_txn_visible_all(session, las_txnid))
			continue;

		/* Allocate the WT_UPDATE structure. */
		WT_ERR(cursor->get_value(
		    cursor, &upd_txnid, &upd_size, las_value));
		WT_ERR(__wt_update_alloc(session,
		    (upd_size == WT_UPDATE_DELETED_VALUE) ? NULL : las_value,
		    &upd, &incr));
		total_incr += incr;
		upd->txnid = upd_txnid;

		/*
		 * If this record is for a different key than the pending list,
		 * flush the pending list into the page before starting a new
		 * one.
		 */
		switch (page->type) {
		case WT_PAGE_COL_FIX:
		case WT_PAGE_COL_VAR:
			p = las_key->data;
			WT_ERR(__wt_vunpack_uint(&p, 0, &recno));
			if (current_recno == recno)
				break;
			WT_ASSERT(session, current_recno < recno);

			if (first_upd != NULL) {
				WT_ERR(__col_instantiate(session,
				    current_recno, ref, &cbt, first_upd));
				first_upd = NULL;
			}
			current_recno = recno;
			break;
		case WT_PAGE_ROW_LEAF:
			if (current_key->size == las_key->size &&
			    memcmp(current_key->data,
			    las_key->data, las_key->size) == 0)
				break;

			if (first_upd != NULL) {
				WT_ERR(__row_instantiate(session,
				    current_key, ref, &cbt, first_upd));
				first_upd = NULL;
			}
			WT_ERR(__wt_buf_set(session,
			    current_key, las_key->data, las_key->size));
			break;
		WT_ILLEGAL_VALUE_ERR(session);
		}

		/* Append the latest update to the list. */
		if (first_upd == NULL)
			first_upd = last_upd = upd;
		else {
			last_upd->next = upd;
			last_upd = upd;
		}
		upd = NULL;
	}
	WT_ERR_NOTFOUND_OK(ret);

	/* Insert the last set of updates, if any. */
	if (first_upd != NULL)
		switch (page->type) {
		case WT_PAGE_COL_FIX:
		case WT_PAGE_COL_VAR:
			WT_ERR(__col_instantiate(session,
			    current_recno, ref, &cbt, first_upd));
			first_upd = NULL;
			break;
		case WT_PAGE_ROW_LEAF:
			WT_ERR(__row_instantiate(session,
			    current_key, ref, &cbt, first_upd));
			first_upd = NULL;
			break;
		WT_ILLEGAL_VALUE_ERR(session);
		}

	/* Discard the cursor. */
	WT_ERR(__wt_las_cursor_close(session, &cursor, session_flags));

	if (total_incr != 0) {
		__wt_cache_page_inmem_incr(session, page, total_incr);

		/*
		 * We've modified/dirtied the page, but that's not necessary and
		 * if we keep the page clean, it's easier to evict.  We leave the
		 * lookaside table updates in place, so if we evict this page
		 * without dirtying it, any future instantiation of it will find
		 * the records it needs.  If the page is dirtied before eviction,
		 * then we'll write any needed lookaside table records for the
		 * new location of the page.
		 */
		__wt_page_modify_clear(session, page);
	}

err:	WT_TRET(__wt_las_cursor_close(session, &cursor, session_flags));
	WT_TRET(__wt_btcur_close(&cbt, 1));

	/*
	 * On error, upd points to a single unlinked WT_UPDATE structure,
	 * first_upd points to a list.
	 */
	if (upd != NULL)
		__wt_free(session, upd);
	if (first_upd != NULL)
		__wt_free_update_list(session, first_upd);

	__wt_scr_free(session, &current_key);
	__wt_scr_free(session, &las_addr);
	__wt_scr_free(session, &las_key);
	__wt_scr_free(session, &las_value);

	return (ret);
}
/*
 * __rename_tree --
 *	Rename an index or column group belonging to a table: rewrite its
 *	metadata entry under the new name and rename its data source.
 */
static int
__rename_tree(WT_SESSION_IMPL* session, WT_TABLE* table,
    const char *newuri, const char *name, const char *cfg[])
{
	WT_CONFIG_ITEM cval;
	WT_DECL_ITEM(meta_key);
	WT_DECL_ITEM(src_new);
	WT_DECL_ITEM(meta_value);
	WT_DECL_ITEM(src_old);
	WT_DECL_RET;
	const char *newname, *olduri, *prefix, *suffix, *tail;
	char *value;
	int colgroup;

	/* Remember the table's current name, it's restored before returning. */
	olduri = table->name;
	value = NULL;

	/* The new table name, without any "table:" prefix. */
	newname = newuri;
	(void)WT_PREFIX_SKIP(newname, "table:");

	/* The object being renamed must be a column group or an index. */
	colgroup = WT_PREFIX_MATCH(name, "colgroup:");
	if (!colgroup && !WT_PREFIX_MATCH(name, "index:"))
		WT_ERR_MSG(session, EINVAL,
		    "expected a 'colgroup:' or 'index:' source: '%s'", name);
	prefix = colgroup ? "colgroup:" : "index:";

	/* An existing table should have a well formed name. */
	suffix = strchr(name, ':');
	WT_ASSERT(session, suffix != NULL);

	/* Anything after a second colon is carried over to the new name. */
	suffix = strchr(suffix + 1, ':');
	tail = (suffix == NULL) ? "" : suffix;

	WT_ERR(__wt_scr_alloc(session, 0, &meta_key));
	WT_ERR(__wt_buf_fmt(
	    session, meta_key, "%s%s%s", prefix, newname, tail));

	/* Skip the colon, if any. */
	if (suffix != NULL)
		++suffix;

	/* Read the existing metadata value for the object. */
	WT_ERR(__wt_metadata_search(session, name, &value));

	/*
	 * Calculate the new data source URI.  Use the existing table structure
	 * and substitute the new name temporarily.
	 */
	WT_ERR(__wt_scr_alloc(session, 0, &src_new));
	table->name = newuri;
	WT_ERR(colgroup ?
	    __wt_schema_colgroup_source(
	    session, table, suffix, value, src_new) :
	    __wt_schema_index_source(
	    session, table, suffix, value, src_new));

	ret = __wt_config_getones(session, value, "source", &cval);
	if (ret != 0)
		WT_ERR_MSG(session, EINVAL,
		    "index or column group has no data source: %s", value);

	/* Take a copy of the old data source. */
	WT_ERR(__wt_scr_alloc(session, 0, &src_old));
	WT_ERR(__wt_buf_fmt(
	    session, src_old, "%.*s", (int)cval.len, cval.str));

	/*
	 * Overwrite it with the new data source: everything before the old
	 * source value, the new source, then everything after it.
	 */
	WT_ERR(__wt_scr_alloc(session, 0, &meta_value));
	WT_ERR(__wt_buf_fmt(session, meta_value, "%.*s%s%s",
	    (int)WT_PTRDIFF(cval.str, value), value,
	    (const char *)src_new->data, cval.str + cval.len));

	/*
	 * Remove the old metadata entry.
	 * Insert the new metadata entry.
	 */
	WT_ERR(__wt_metadata_remove(session, name));
	WT_ERR(__wt_metadata_insert(
	    session, meta_key->data, meta_value->data));

	/* Rename the file. */
	WT_ERR(__wt_schema_rename(
	    session, src_old->data, src_new->data, cfg));

err:	__wt_scr_free(session, &meta_key);
	__wt_scr_free(session, &src_new);
	__wt_scr_free(session, &meta_value);
	__wt_scr_free(session, &src_old);
	__wt_free(session, value);

	/* Put the table's original name back. */
	table->name = olduri;
	return (ret);
}
/*
 * __wt_bt_write --
 *	Write a buffer into a block, returning the block's addr/size and
 *	checksum.
 *
 *	The image is optionally compressed, then optionally encrypted, before
 *	being handed to the block manager.  For checkpoint writes, addr and
 *	addr_sizep must be NULL; for all other writes they must be non-NULL.
 */
int
__wt_bt_write(WT_SESSION_IMPL *session, WT_ITEM *buf,
    uint8_t *addr, size_t *addr_sizep, int checkpoint, int compressed)
{
	WT_BM *bm;
	WT_BTREE *btree;
	WT_DECL_ITEM(cbuf);
	WT_DECL_ITEM(ebuf);
	WT_DECL_RET;
	WT_KEYED_ENCRYPTOR *kencryptor;
	WT_ITEM *image;
	WT_PAGE_HEADER *dsk;
	size_t dst_len, len, result_len, size, src_len;
	int compression_failed, data_cksum, encrypted;
	uint8_t *dst, *src;

	btree = S2BT(session);
	bm = btree->bm;
	encrypted = 0;

	/* Checkpoint calls are different than standard calls. */
	WT_ASSERT(session,
	    (checkpoint == 0 && addr != NULL && addr_sizep != NULL) ||
	    (checkpoint == 1 && addr == NULL && addr_sizep == NULL));

#ifdef HAVE_DIAGNOSTIC
	/*
	 * We're passed a table's disk image.  Decompress if necessary and
	 * verify the image.  Always check the in-memory length for accuracy.
	 */
	dsk = buf->mem;
	if (compressed) {
		WT_ERR(__wt_scr_alloc(session, dsk->mem_size, &cbuf));

		memcpy(cbuf->mem, buf->data, WT_BLOCK_COMPRESS_SKIP);
		WT_ERR(btree->compressor->decompress(
		    btree->compressor, &session->iface,
		    (uint8_t *)buf->data + WT_BLOCK_COMPRESS_SKIP,
		    buf->size - WT_BLOCK_COMPRESS_SKIP,
		    (uint8_t *)cbuf->data + WT_BLOCK_COMPRESS_SKIP,
		    cbuf->memsize - WT_BLOCK_COMPRESS_SKIP,
		    &result_len));
		WT_ASSERT(session,
		    dsk->mem_size == result_len + WT_BLOCK_COMPRESS_SKIP);
		cbuf->size = (uint32_t)result_len + WT_BLOCK_COMPRESS_SKIP;
		image = cbuf;
	} else {
		WT_ASSERT(session, dsk->mem_size == buf->size);
		image = buf;
	}
	WT_ERR(__wt_verify_dsk(session, "[write-check]", image));
	__wt_scr_free(session, &cbuf);
#endif

	/*
	 * Unless compression succeeds below, the caller's buffer is what gets
	 * written.
	 *
	 * Optionally stream-compress the data, but don't compress blocks that
	 * are already as small as they're going to get.
	 */
	image = buf;
	if (btree->compressor != NULL &&
	    btree->compressor->compress != NULL && !compressed) {
		if (buf->size <= btree->allocsize) {
			WT_STAT_FAST_DATA_INCR(
			    session, compress_write_too_small);
		} else {
			/* Skip the header bytes of the source data. */
			src = (uint8_t *)buf->mem + WT_BLOCK_COMPRESS_SKIP;
			src_len = buf->size - WT_BLOCK_COMPRESS_SKIP;

			/*
			 * Compute the size needed for the destination buffer.
			 * We only allocate enough memory for a copy of the
			 * original by default, if any compressed version is
			 * bigger than the original, we won't use it.  However,
			 * some compression engines (snappy is one example),
			 * may need more memory because they don't stop just
			 * because there's no more memory into which to
			 * compress.
			 */
			if (btree->compressor->pre_size != NULL)
				WT_ERR(btree->compressor->pre_size(
				    btree->compressor,
				    &session->iface, src, src_len, &len));
			else
				len = src_len;

			size = len + WT_BLOCK_COMPRESS_SKIP;
			WT_ERR(bm->write_size(bm, session, &size));
			WT_ERR(__wt_scr_alloc(session, size, &cbuf));

			/* Skip the header bytes of the destination data. */
			dst = (uint8_t *)cbuf->mem + WT_BLOCK_COMPRESS_SKIP;
			dst_len = len;

			compression_failed = 0;
			WT_ERR(btree->compressor->compress(btree->compressor,
			    &session->iface,
			    src, src_len,
			    dst, dst_len,
			    &result_len, &compression_failed));
			result_len += WT_BLOCK_COMPRESS_SKIP;

			/*
			 * Use the compressed version only if compression
			 * worked and gained us at least one unit of
			 * allocation.  Falling back to the original isn't
			 * unexpected: if compression doesn't work for some
			 * chunk of data for some reason (noting likely
			 * additional format/header information which
			 * compressed output requires), it just means the
			 * uncompressed version is as good as it gets, and
			 * that's what we use.
			 */
			if (!compression_failed &&
			    buf->size / btree->allocsize >
			    result_len / btree->allocsize) {
				compressed = 1;
				WT_STAT_FAST_DATA_INCR(session, compress_write);

				/*
				 * Copy in the skipped header bytes, set the
				 * final data size.
				 */
				memcpy(cbuf->mem,
				    buf->mem, WT_BLOCK_COMPRESS_SKIP);
				cbuf->size = result_len;
				image = cbuf;
			} else {
				WT_STAT_FAST_DATA_INCR(
				    session, compress_write_fail);
			}
		}
	}

	/*
	 * Optionally encrypt the data.  We need to add in the original
	 * length, in case both compression and encryption are done.
	 */
	kencryptor = btree->kencryptor;
	if (kencryptor != NULL) {
		/* Get size needed for encrypted buffer. */
		__wt_encrypt_size(session, kencryptor, image->size, &size);

		WT_ERR(bm->write_size(bm, session, &size));
		WT_ERR(__wt_scr_alloc(session, size, &ebuf));
		WT_ERR(__wt_encrypt(session,
		    kencryptor, WT_BLOCK_ENCRYPT_SKIP, image, ebuf));

		encrypted = 1;
		image = ebuf;
	}

	/* Record what was done to the image in the header being written. */
	dsk = image->mem;
	if (compressed)
		F_SET(dsk, WT_PAGE_COMPRESSED);
	if (encrypted)
		F_SET(dsk, WT_PAGE_ENCRYPTED);

	/*
	 * We increment the block's write generation so it's easy to identify
	 * newer versions of blocks during salvage.  (It's common in WiredTiger,
	 * at least for the default block manager, for multiple blocks to be
	 * internally consistent with identical first and last keys, so we need
	 * a way to know the most recent state of the block.  We could check
	 * which leaf is referenced by a valid internal page, but that implies
	 * salvaging internal pages, which I don't want to do, and it's not
	 * as good anyway, because the internal page may not have been written
	 * after the leaf page was updated.  So, write generations it is.
	 *
	 * Nothing is locked at this point but two versions of a page with the
	 * same generation is pretty unlikely, and if we did, they're going to
	 * be roughly identical for the purposes of salvage, anyway.
	 */
	dsk->write_gen = ++btree->write_gen;

	/*
	 * Checksum the data if the buffer isn't compressed or checksums are
	 * configured.
	 */
	if (btree->checksum == CKSUM_ON)
		data_cksum = 1;
	else if (btree->checksum == CKSUM_OFF)
		data_cksum = 0;
	else					/* CKSUM_UNCOMPRESSED, other */
		data_cksum = !compressed;

	/* Call the block manager to write the block. */
	if (checkpoint) {
		WT_ERR(bm->checkpoint(
		    bm, session, image, btree->ckpt, data_cksum));
	} else {
		WT_ERR(bm->write(
		    bm, session, image, addr, addr_sizep, data_cksum));
	}

	WT_STAT_FAST_CONN_INCR(session, cache_write);
	WT_STAT_FAST_DATA_INCR(session, cache_write);
	WT_STAT_FAST_CONN_INCRV(session, cache_bytes_write, dsk->mem_size);
	WT_STAT_FAST_DATA_INCRV(session, cache_bytes_write, dsk->mem_size);

err:	__wt_scr_free(session, &cbuf);
	__wt_scr_free(session, &ebuf);
	return (ret);
}
/*
 * __wt_verify --
 *	Verify a file.
 *
 *	Walks the file's checkpoints: each non-fake checkpoint is loaded
 *	through the block manager, its tree (if it has a root page) is opened,
 *	verified and discarded from the cache, and the checkpoint is unloaded.
 *
 *	session: the session; S2BT(session) supplies the btree being verified.
 *	cfg: the configuration stack passed to the configuration helpers and
 *	    to the block manager's verify_start method.
 */
int
__wt_verify(WT_SESSION_IMPL *session, const char *cfg[])
{
	WT_BM *bm;
	WT_BTREE *btree;
	WT_CKPT *ckptbase, *ckpt;
	WT_DECL_RET;
	WT_VSTUFF *vs, _vstuff;
	size_t root_addr_size;
	uint8_t root_addr[WT_BTREE_MAX_ADDR_COOKIE];
	bool bm_start, quit;

	btree = S2BT(session);
	bm = btree->bm;
	ckptbase = NULL;
	bm_start = false;

	/* The verification state lives on the stack, start from all zeroes. */
	WT_CLEAR(_vstuff);
	vs = &_vstuff;
	WT_ERR(__wt_scr_alloc(session, 0, &vs->max_key));
	WT_ERR(__wt_scr_alloc(session, 0, &vs->max_addr));
	WT_ERR(__wt_scr_alloc(session, 0, &vs->tmp1));
	WT_ERR(__wt_scr_alloc(session, 0, &vs->tmp2));
	WT_ERR(__wt_scr_alloc(session, 0, &vs->tmp3));
	WT_ERR(__wt_scr_alloc(session, 0, &vs->tmp4));

	/* Check configuration strings. */
	WT_ERR(__verify_config(session, cfg, vs));

	/* Optionally dump specific block offsets. */
	WT_ERR(__verify_config_offsets(session, cfg, &quit));
	if (quit)
		goto done;

	/* Get a list of the checkpoints for this file. */
	WT_ERR(
	    __wt_meta_ckptlist_get(session, btree->dhandle->name, &ckptbase));

	/*
	 * Inform the underlying block manager we're verifying; bm_start
	 * records that verify_start succeeded.
	 */
	WT_ERR(bm->verify_start(bm, session, ckptbase, cfg));
	bm_start = true;

	/* Loop through the file's checkpoints, verifying each one. */
	WT_CKPT_FOREACH(ckptbase, ckpt) {
		WT_ERR(__wt_verbose(session, WT_VERB_VERIFY,
		    "%s: checkpoint %s", btree->dhandle->name, ckpt->name));

		/* Fake checkpoints require no work. */
		if (F_ISSET(ckpt, WT_CKPT_FAKE))
			continue;

		/* House-keeping between checkpoints. */
		__verify_checkpoint_reset(vs);

		if (WT_VRFY_DUMP(vs))
			WT_ERR(__wt_msg(session, "%s: checkpoint %s",
			    btree->dhandle->name, ckpt->name));

		/* Load the checkpoint; the root page's address is returned. */
		WT_ERR(bm->checkpoint_load(bm, session,
		    ckpt->raw.data, ckpt->raw.size,
		    root_addr, &root_addr_size, true));

		/*
		 * Ignore trees with no root page.
		 * Verify, then discard the checkpoint from the cache.
		 *
		 * Failures from opening or verifying the tree are kept in ret
		 * rather than jumping to the error label immediately, so the
		 * checkpoint is still unloaded below before ret is checked.
		 */
		if (root_addr_size != 0 &&
		    (ret = __wt_btree_tree_open(
		    session, root_addr, root_addr_size)) == 0) {
			if (WT_VRFY_DUMP(vs))
				WT_ERR(__wt_msg(session, "Root: %s %s",
				    __wt_addr_string(session,
				    root_addr, root_addr_size, vs->tmp1),
				    __wt_page_type_string(
				    btree->root.page->type)));

			WT_WITH_PAGE_INDEX(session,
			    ret = __verify_tree(session, &btree->root, vs));

			WT_TRET(__wt_cache_op(session, WT_SYNC_DISCARD));
		}

		/* Unload the checkpoint. */
		WT_TRET(bm->checkpoint_unload(bm, session));
		WT_ERR(ret);

		/* Display the tree shape. */
		if (vs->dump_shape)
			WT_ERR(__verify_tree_shape(session, vs));
	}
	/*
	 * NOTE(review): the rest of this function (the "done" and "err" labels
	 * referenced above, any cleanup, and the closing brace) is not
	 * present in this view of the source.
	 */
/*
 * __wt_txn_checkpoint_log --
 *	Write a log record for a checkpoint operation.
 *
 *	session: the session; the checkpoint state is kept in session->txn.
 *	full: non-zero for a full checkpoint, zero for a file sync.
 *	flags: the checkpoint phase, one of the WT_TXN_LOG_CKPT_* values handled
 *	    below (only passed through for a file sync).
 *	lsnp: if not NULL, passed to the log-write calls to receive an LSN; when
 *	    a file sync is skipped because a full checkpoint is in progress, it
 *	    is set to the checkpoint's LSN.
 *
 *	Returns 0 on success, a non-zero error code otherwise.
 */
int
__wt_txn_checkpoint_log(
    WT_SESSION_IMPL *session, int full, uint32_t flags, WT_LSN *lsnp)
{
	WT_DECL_RET;
	WT_DECL_ITEM(logrec);
	WT_ITEM *ckpt_snapshot, empty;
	WT_LSN *ckpt_lsn;
	WT_TXN *txn;
	const char *fmt = WT_UNCHECKED_STRING(IIQIU);
	uint8_t *end, *p;
	size_t recsize;
	uint32_t i, rectype = WT_LOGREC_CHECKPOINT;

	txn = &session->txn;
	ckpt_lsn = &txn->ckpt_lsn;

	/*
	 * If this is a file sync, log it unless there is a full checkpoint in
	 * progress.
	 */
	if (!full) {
		if (txn->full_ckpt) {
			if (lsnp != NULL)
				*lsnp = *ckpt_lsn;
			return (0);
		} else
			return (__txn_log_file_sync(session, flags, lsnp));
	}

	switch (flags) {
	case WT_TXN_LOG_CKPT_PREPARE:
		txn->full_ckpt = 1;
		*ckpt_lsn = S2C(session)->log->alloc_lsn;
		break;

	case WT_TXN_LOG_CKPT_START:
		/*
		 * Take a copy of the transaction snapshot.
		 *
		 * NOTE(review): the packed bytes are written into the scratch
		 * buffer's memory but the buffer's size field is not set here;
		 * confirm whether the logged snapshot item is intended to
		 * carry these bytes.
		 */
		txn->ckpt_nsnapshot = txn->snapshot_count;
		recsize = txn->ckpt_nsnapshot * WT_INTPACK64_MAXSIZE;
		WT_ERR(__wt_scr_alloc(session, recsize, &txn->ckpt_snapshot));
		p = txn->ckpt_snapshot->mem;
		end = p + recsize;
		for (i = 0; i < txn->snapshot_count; i++)
			WT_ERR(__wt_vpack_uint(
			    &p, WT_PTRDIFF(end, p), txn->snapshot[i]));
		break;

	case WT_TXN_LOG_CKPT_STOP:
		/*
		 * During a clean connection close, we get here without the
		 * prepare or start steps.  In that case, log the current LSN
		 * as the checkpoint LSN.
		 */
		if (!txn->full_ckpt) {
			txn->ckpt_nsnapshot = 0;
			*ckpt_lsn = S2C(session)->log->alloc_lsn;
		}

		/*
		 * The 'U' conversion in the record format takes a pointer to
		 * a WT_ITEM, not the address of the transaction's item
		 * pointer.  If there's no snapshot copy (no start step), log
		 * an empty item rather than dereferencing a NULL pointer.
		 */
		if (!txn->full_ckpt || txn->ckpt_snapshot == NULL) {
			memset(&empty, 0, sizeof(empty));
			ckpt_snapshot = &empty;
		} else
			ckpt_snapshot = txn->ckpt_snapshot;

		/* Write the checkpoint log record. */
		WT_ERR(__wt_struct_size(session, &recsize, fmt,
		    rectype, ckpt_lsn->file, ckpt_lsn->offset,
		    txn->ckpt_nsnapshot, ckpt_snapshot));
		WT_ERR(__wt_logrec_alloc(session, recsize, &logrec));

		WT_ERR(__wt_struct_pack(session,
		    (uint8_t *)logrec->data + logrec->size, recsize, fmt,
		    rectype, ckpt_lsn->file, ckpt_lsn->offset,
		    txn->ckpt_nsnapshot, ckpt_snapshot));
		logrec->size += (uint32_t)recsize;
		WT_ERR(__wt_log_write(session, logrec, lsnp, 0));

		/*
		 * If this full checkpoint completed successfully and there is
		 * no hot backup in progress, tell the logging subsystem the
		 * checkpoint LSN so that it can archive.
		 */
		if (!S2C(session)->hot_backup)
			WT_ERR(__wt_log_ckpt(session, ckpt_lsn));

		/* FALLTHROUGH */
	case WT_TXN_LOG_CKPT_FAIL:
		/* Cleanup any allocated resources */
		INIT_LSN(ckpt_lsn);
		txn->ckpt_nsnapshot = 0;
		__wt_scr_free(&txn->ckpt_snapshot);
		txn->full_ckpt = 0;
		break;
	}

err:	__wt_logrec_free(session, &logrec);
	return (ret);
}
/*
 * __wt_lsm_tree_create --
 *	Create an LSM tree structure for the given name.
 *
 *	session: the session.
 *	uri: the LSM tree's name.
 *	exclusive: if non-zero, an already existing tree is an error (EEXIST);
 *	    otherwise an existing tree is silently accepted.
 *	config: the caller's configuration string, layered over the
 *	    session-create defaults.
 *
 *	Returns 0 on success, a non-zero error code otherwise.
 */
int
__wt_lsm_tree_create(WT_SESSION_IMPL *session,
    const char *uri, int exclusive, const char *config)
{
	WT_CONFIG_ITEM cval;
	WT_DECL_ITEM(buf);
	WT_DECL_RET;
	WT_LSM_TREE *lsm_tree;
	const char *cfg[] =
	    { WT_CONFIG_BASE(session, session_create), config, NULL };
	char *tmpconfig;

	/* If the tree is open, it already exists. */
	if ((ret = __wt_lsm_tree_get(session, uri, 0, &lsm_tree)) == 0) {
		__wt_lsm_tree_release(session, lsm_tree);
		return (exclusive ? EEXIST : 0);
	}
	WT_RET_NOTFOUND_OK(ret);

	/*
	 * If the tree has metadata, it already exists.
	 *
	 * !!!
	 * Use a local variable: we don't care what the existing configuration
	 * is, but we don't want to overwrite the real config.
	 *
	 * The search's return value must be saved in ret, otherwise the
	 * not-found check below tests the stale value from the handle lookup
	 * and a real metadata error is silently ignored.
	 */
	if ((ret = __wt_metadata_search(session, uri, &tmpconfig)) == 0) {
		__wt_free(session, tmpconfig);
		return (exclusive ? EEXIST : 0);
	}
	WT_RET_NOTFOUND_OK(ret);

	/* Reject column-store configurations before allocating anything. */
	WT_RET(__wt_config_gets(session, cfg, "key_format", &cval));
	if (WT_STRING_MATCH("r", cval.str, cval.len))
		WT_RET_MSG(session, EINVAL,
		    "LSM trees cannot be configured as column stores");

	WT_RET(__wt_calloc_def(session, 1, &lsm_tree));

	WT_ERR(__lsm_tree_set_name(session, lsm_tree, uri));

	WT_ERR(__wt_config_gets(session, cfg, "key_format", &cval));
	WT_ERR(__wt_strndup(
	    session, cval.str, cval.len, &lsm_tree->key_format));
	WT_ERR(__wt_config_gets(session, cfg, "value_format", &cval));
	WT_ERR(__wt_strndup(
	    session, cval.str, cval.len, &lsm_tree->value_format));

	WT_ERR(__wt_config_gets(session, cfg, "collator", &cval));
	WT_ERR(__wt_strndup(
	    session, cval.str, cval.len, &lsm_tree->collator_name));

	WT_ERR(__wt_config_gets(session, cfg, "lsm.auto_throttle", &cval));
	if (cval.val)
		F_SET(lsm_tree, WT_LSM_TREE_THROTTLE);
	else
		F_CLR(lsm_tree, WT_LSM_TREE_THROTTLE);
	WT_ERR(__wt_config_gets(session, cfg, "lsm.bloom", &cval));
	FLD_SET(lsm_tree->bloom,
	    (cval.val == 0 ? WT_LSM_BLOOM_OFF : WT_LSM_BLOOM_MERGED));
	WT_ERR(__wt_config_gets(session, cfg, "lsm.bloom_oldest", &cval));
	if (cval.val != 0)
		FLD_SET(lsm_tree->bloom, WT_LSM_BLOOM_OLDEST);

	if (FLD_ISSET(lsm_tree->bloom, WT_LSM_BLOOM_OFF) &&
	    FLD_ISSET(lsm_tree->bloom, WT_LSM_BLOOM_OLDEST))
		WT_ERR_MSG(session, EINVAL,
		    "Bloom filters can only be created on newest and oldest "
		    "chunks if bloom filters are enabled");

	/* Strip the enclosing brackets from a structured bloom config. */
	WT_ERR(__wt_config_gets(session, cfg, "lsm.bloom_config", &cval));
	if (cval.type == WT_CONFIG_ITEM_STRUCT) {
		cval.str++;
		cval.len -= 2;
	}
	WT_ERR(__wt_strndup(
	    session, cval.str, cval.len, &lsm_tree->bloom_config));

	WT_ERR(__wt_config_gets(session, cfg, "lsm.bloom_bit_count", &cval));
	lsm_tree->bloom_bit_count = (uint32_t)cval.val;
	WT_ERR(__wt_config_gets(session, cfg, "lsm.bloom_hash_count", &cval));
	lsm_tree->bloom_hash_count = (uint32_t)cval.val;
	WT_ERR(__wt_config_gets(session, cfg, "lsm.chunk_max", &cval));
	lsm_tree->chunk_max = (uint64_t)cval.val;
	WT_ERR(__wt_config_gets(session, cfg, "lsm.chunk_size", &cval));
	lsm_tree->chunk_size = (uint64_t)cval.val;
	if (lsm_tree->chunk_size > lsm_tree->chunk_max)
		WT_ERR_MSG(session, EINVAL,
		    "Chunk size (chunk_size) must be smaller than or equal to "
		    "the maximum chunk size (chunk_max)");
	WT_ERR(__wt_config_gets(session, cfg, "lsm.merge_max", &cval));
	lsm_tree->merge_max = (uint32_t)cval.val;
	WT_ERR(__wt_config_gets(session, cfg, "lsm.merge_min", &cval));
	lsm_tree->merge_min = (uint32_t)cval.val;
	if (lsm_tree->merge_min > lsm_tree->merge_max)
		WT_ERR_MSG(session, EINVAL,
		    "LSM merge_min must be less than or equal to merge_max");

	/*
	 * Set up the config for each chunk.
	 *
	 * Make the memory_page_max double the chunk size, so application
	 * threads don't immediately try to force evict the chunk when the
	 * worker thread clears the NO_EVICTION flag.
	 */
	WT_ERR(__wt_scr_alloc(session, 0, &buf));
	WT_ERR(__wt_buf_fmt(session, buf,
	    "%s,key_format=u,value_format=u,memory_page_max=%" PRIu64,
	    config, 2 * lsm_tree->chunk_max));
	WT_ERR(__wt_strndup(
	    session, buf->data, buf->size, &lsm_tree->file_config));

	/* Create the first chunk and flush the metadata. */
	WT_ERR(__wt_lsm_meta_write(session, lsm_tree));

	/* Discard our partially populated handle. */
	ret = __lsm_tree_discard(session, lsm_tree);
	lsm_tree = NULL;

	/*
	 * Open our new tree and add it to the handle cache. Don't discard on
	 * error: the returned handle is NULL on error, and the metadata
	 * tracking macros handle cleaning up on failure.
	 */
	if (ret == 0)
		ret = __lsm_tree_open(session, uri, &lsm_tree);
	if (ret == 0)
		__wt_lsm_tree_release(session, lsm_tree);

	if (0) {
err:		WT_TRET(__lsm_tree_discard(session, lsm_tree));
	}
	__wt_scr_free(&buf);
	return (ret);
}
/*
 * __wt_curindex_open --
 *	WT_SESSION->open_cursor method for index cursors.
 *
 *	session: the session.
 *	uri: the cursor URI, "index:<table>:<index>" optionally followed by a
 *	    parenthesized projection.
 *	owner: the owning cursor, passed through to cursor initialization.
 *	cfg: the configuration stack.
 *	cursorp: set to the new cursor on success, to NULL if a failure occurs
 *	    after the cursor structure has been allocated.
 *
 *	Returns 0 on success, a non-zero error code otherwise.
 */
int
__wt_curindex_open(WT_SESSION_IMPL *session,
    const char *uri, WT_CURSOR *owner, const char *cfg[], WT_CURSOR **cursorp)
{
	WT_CURSOR_STATIC_INIT(iface,
	    __wt_cursor_get_key,		/* get-key */
	    __curindex_get_value,		/* get-value */
	    __wt_cursor_set_key,		/* set-key */
	    __curindex_set_value,		/* set-value */
	    __curindex_compare,			/* compare */
	    __wt_cursor_equals,			/* equals */
	    __curindex_next,			/* next */
	    __curindex_prev,			/* prev */
	    __curindex_reset,			/* reset */
	    __curindex_search,			/* search */
	    __curindex_search_near,		/* search-near */
	    __wt_cursor_notsup,			/* insert */
	    __wt_cursor_notsup,			/* update */
	    __wt_cursor_notsup,			/* remove */
	    __wt_cursor_reconfigure_notsup,	/* reconfigure */
	    __curindex_close);			/* close */
	WT_CURSOR_INDEX *cindex;
	WT_CURSOR *cursor;
	WT_DECL_ITEM(tmp);
	WT_DECL_RET;
	WT_INDEX *idx;
	WT_TABLE *table;
	const char *columns, *idxname, *tablename;
	size_t namesize;

	/* Split the URI into the table name and the index name. */
	tablename = uri;
	if (!WT_PREFIX_SKIP(tablename, "index:") ||
	    (idxname = strchr(tablename, ':')) == NULL)
		WT_RET_MSG(session, EINVAL, "Invalid cursor URI: '%s'", uri);
	namesize = (size_t)(idxname - tablename);
	++idxname;

	if ((ret = __wt_schema_get_table(session,
	    tablename, namesize, false, &table)) != 0) {
		if (ret == WT_NOTFOUND)
			WT_RET_MSG(session, EINVAL,
			    "Cannot open cursor '%s' on unknown table", uri);
		return (ret);
	}

	/* The index name ends at any projection. */
	columns = strchr(idxname, '(');
	if (columns == NULL)
		namesize = strlen(idxname);
	else
		namesize = (size_t)(columns - idxname);

	if ((ret = __wt_schema_open_index(
	    session, table, idxname, namesize, &idx)) != 0) {
		__wt_schema_release_table(session, table);
		return (ret);
	}

	/*
	 * Until the cursor structure exists and references the table, we are
	 * responsible for releasing the table on failure.
	 */
	if ((ret = __wt_calloc_one(session, &cindex)) != 0) {
		__wt_schema_release_table(session, table);
		return (ret);
	}

	cursor = &cindex->iface;
	*cursor = iface;
	cursor->session = &session->iface;

	cindex->table = table;
	cindex->index = idx;
	cindex->key_plan = idx->key_plan;
	cindex->value_plan = idx->value_plan;

	cursor->internal_uri = idx->name;
	cursor->key_format = idx->idxkey_format;
	cursor->value_format = table->value_format;

	/*
	 * XXX
	 * A very odd corner case is an index with a recno key.
	 * The only way to get here is by creating an index on a column store
	 * using only the primary's recno as the index key.  Disallow that for
	 * now.
	 */
	if (WT_CURSOR_RECNO(cursor))
		WT_ERR_MSG(session, WT_ERROR,
		    "Column store indexes based on a record number primary "
		    "key are not supported");

	/*
	 * Handle projections: the cursor gets its own value format and value
	 * plan built from the projected columns.
	 */
	if (columns != NULL) {
		WT_ERR(__wt_scr_alloc(session, 0, &tmp));
		WT_ERR(__wt_struct_reformat(session, table,
		    columns, strlen(columns), NULL, false, tmp));
		WT_ERR(__wt_strndup(
		    session, tmp->data, tmp->size, &cursor->value_format));

		WT_ERR(__wt_buf_init(session, tmp, 0));
		WT_ERR(__wt_struct_plan(session, table,
		    columns, strlen(columns), false, tmp));
		WT_ERR(__wt_strndup(
		    session, tmp->data, tmp->size, &cindex->value_plan));
	}

	WT_ERR(__wt_cursor_init(
	    cursor, cursor->internal_uri, owner, cfg, cursorp));

	WT_ERR(__wt_open_cursor(
	    session, idx->source, cursor, cfg, &cindex->child));

	/* Open the column groups needed for this index cursor. */
	WT_ERR(__curindex_open_colgroups(session, cindex, cfg));

	if (F_ISSET(cursor, WT_CURSTD_DUMP_JSON))
		__wt_json_column_init(
		    cursor, table->key_format, &idx->colconf, &table->colconf);

	if (0) {
err:		WT_TRET(__curindex_close(cursor));
		*cursorp = NULL;
	}

	__wt_scr_free(session, &tmp);
	return (ret);
}
/*
 * __wt_schema_open_index --
 *	Open the indices for a table.
 *
 *	With a NULL idxname every index found in the metadata is opened and the
 *	table is marked complete; otherwise only the index whose name matches
 *	the first len bytes of idxname is opened and the scan stops there. If
 *	indexp is not NULL it is set to each matching index in turn, so after a
 *	single-index search it references that index.
 */
int
__wt_schema_open_index(WT_SESSION_IMPL *session,
    WT_TABLE *table, const char *idxname, size_t len, WT_INDEX **indexp)
{
	WT_CURSOR *cursor;
	WT_DECL_ITEM(tmp);
	WT_DECL_RET;
	WT_INDEX *idx;
	u_int i;
	int cmp, match;
	const char *idxconf, *name, *tablename, *uri;

	/* Check if we've already done the work. */
	if (idxname == NULL && table->idx_complete)
		return (0);

	cursor = NULL;
	idx = NULL;

	/* Build a search key. */
	tablename = table->name;
	(void)WT_PREFIX_SKIP(tablename, "table:");
	WT_ERR(__wt_scr_alloc(session, 512, &tmp));
	WT_ERR(__wt_buf_fmt(session, tmp, "index:%s:", tablename));

	/* Find matching indices. */
	WT_ERR(__wt_metadata_cursor(session, NULL, &cursor));
	cursor->set_key(cursor, tmp->data);
	if ((ret = cursor->search_near(cursor, &cmp)) == 0 && cmp < 0)
		ret = cursor->next(cursor);
	for (i = 0; ret == 0; i++, ret = cursor->next(cursor)) {
		/* Stop at the first key that isn't an index of this table. */
		WT_ERR(cursor->get_key(cursor, &uri));
		name = uri;
		if (!WT_PREFIX_SKIP(name, tmp->data))
			break;

		/* Is this the index we are looking for? */
		match = idxname == NULL || WT_STRING_MATCH(name, idxname, len);

		/*
		 * Ensure there is space, including if we have to make room for
		 * a new entry in the middle of the list.
		 */
		WT_ERR(__wt_realloc_def(session, &table->idx_alloc,
		    WT_MAX(i, table->nindices) + 1, &table->indices));

		/* Keep the in-memory list in sync with the metadata. */
		cmp = 0;
		while (table->indices[i] != NULL &&
		    (cmp = strcmp(uri, table->indices[i]->name)) > 0) {
			/*
			 * Index no longer exists, remove it.  Destroy the
			 * whole index rather than only freeing the structure:
			 * an opened index owns allocated strings (its name and
			 * configuration are duplicated when it is created
			 * below) that would otherwise be leaked.
			 */
			__wt_schema_destroy_index(session, table->indices[i]);
			memmove(&table->indices[i], &table->indices[i + 1],
			    (table->nindices - i) * sizeof(WT_INDEX *));
			table->indices[--table->nindices] = NULL;
		}
		if (cmp < 0) {
			/* Make room for a new index. */
			memmove(&table->indices[i + 1], &table->indices[i],
			    (table->nindices - i) * sizeof(WT_INDEX *));
			table->indices[i] = NULL;
			++table->nindices;
		}

		if (!match)
			continue;

		if (table->indices[i] == NULL) {
			WT_ERR(cursor->get_value(cursor, &idxconf));
			WT_ERR(__wt_calloc_def(session, 1, &idx));
			WT_ERR(__wt_strdup(session, uri, &idx->name));
			WT_ERR(__wt_strdup(session, idxconf, &idx->config));
			WT_ERR(__open_index(session, table, idx));

			/* The table owns the index from here on. */
			table->indices[i] = idx;
			idx = NULL;
		}

		/* If we were looking for a single index, we're done. */
		if (indexp != NULL)
			*indexp = table->indices[i];
		if (idxname != NULL)
			break;
	}
	WT_ERR_NOTFOUND_OK(ret);

	/* If we did a full pass, we won't need to do it again. */
	if (idxname == NULL) {
		table->nindices = i;
		table->idx_complete = 1;
	}

err:	__wt_scr_free(&tmp);
	/* A non-NULL idx was never handed to the table, discard it. */
	if (idx != NULL)
		__wt_schema_destroy_index(session, idx);
	if (cursor != NULL)
		WT_TRET(cursor->close(cursor));
	return (ret);
}
/*
 * __wt_schema_open_colgroups --
 *	Open the column groups for a table.
 *
 *	Returns immediately if the table's column groups are already complete.
 *	A column group missing from the metadata is not an error: the function
 *	returns zero without marking the table complete.
 */
int
__wt_schema_open_colgroups(WT_SESSION_IMPL *session, WT_TABLE *table)
{
	WT_COLGROUP *colgroup;
	WT_CONFIG cparser;
	WT_CONFIG_ITEM ckey, cval;
	WT_DECL_RET;
	WT_DECL_ITEM(buf);
	const char *cgconfig;
	u_int i;

	if (table->cg_complete)
		return (0);

	colgroup = NULL;
	WT_RET(__wt_scr_alloc(session, 0, &buf));

	/*
	 * The scratch buffer is allocated at this point: failures must go
	 * through the error label so it is released.
	 */
	WT_ERR(__wt_config_subinit(session, &cparser, &table->cgconf));

	/* Open each column group. */
	for (i = 0; i < WT_COLGROUPS(table); i++) {
		if (table->ncolgroups > 0)
			WT_ERR(__wt_config_next(&cparser, &ckey, &cval));
		else
			WT_CLEAR(ckey);

		/*
		 * Always open from scratch: we may have failed part of the way
		 * through opening a table, or column groups may have changed.
		 */
		if (table->cgroups[i] != NULL) {
			__wt_schema_destroy_colgroup(
			    session, table->cgroups[i]);
			table->cgroups[i] = NULL;
		}

		WT_ERR(__wt_schema_colgroup_name(session, table,
		    ckey.str, ckey.len, buf));
		if ((ret = __wt_metadata_search(
		    session, buf->data, &cgconfig)) != 0) {
			/* It is okay if the table is incomplete. */
			if (ret == WT_NOTFOUND)
				ret = 0;
			goto err;
		}

		WT_ERR(__wt_calloc_def(session, 1, &colgroup));
		colgroup->name = __wt_buf_steal(session, buf, NULL);
		colgroup->config = cgconfig;
		WT_ERR(__wt_config_getones(session,
		    colgroup->config, "columns", &colgroup->colconf));
		WT_ERR(__wt_config_getones(
		    session, colgroup->config, "source", &cval));
		WT_ERR(__wt_buf_fmt(session, buf, "%.*s",
		    (int)cval.len, cval.str));
		colgroup->source = __wt_buf_steal(session, buf, NULL);

		/* The table owns the column group from here on. */
		table->cgroups[i] = colgroup;
		colgroup = NULL;
	}

	if (!table->is_simple) {
		WT_ERR(__wt_table_check(session, table));

		WT_ERR(__wt_struct_plan(session,
		    table, table->colconf.str, table->colconf.len, 1, buf));
		table->plan = __wt_buf_steal(session, buf, NULL);
	}

	table->cg_complete = 1;

err:	__wt_scr_free(&buf);
	/* A non-NULL colgroup was never handed to the table, discard it. */
	if (colgroup != NULL)
		__wt_schema_destroy_colgroup(session, colgroup);
	return (ret);
}
/*
 * __open_index --
 *	Fill in an index structure from its configuration string: the data
 *	source, the stored key format, the key and value plans, and the key
 *	format exposed through index cursors.
 */
static int
__open_index(WT_SESSION_IMPL *session, WT_TABLE *table, WT_INDEX *idx)
{
	WT_CONFIG colconf;
	WT_CONFIG_ITEM ckey, cval;
	WT_DECL_ITEM(buf);
	WT_DECL_ITEM(plan);
	WT_DECL_RET;
	u_int i, nidxcols;

	WT_ERR(__wt_scr_alloc(session, 0, &buf));

	/* The data source is copied out of the index configuration. */
	WT_ERR(__wt_config_getones(session, idx->config, "source", &cval));
	WT_ERR(__wt_buf_fmt(session, buf, "%.*s", (int)cval.len, cval.str));
	idx->source = __wt_buf_steal(session, buf, NULL);
	idx->need_value = WT_PREFIX_MATCH(idx->source, "lsm:");

	/* So is the key format. */
	WT_ERR(__wt_config_getones(session, idx->config, "key_format", &cval));
	WT_ERR(__wt_buf_fmt(session, buf, "%.*s", (int)cval.len, cval.str));
	idx->key_format = __wt_buf_steal(session, buf, NULL);

	/*
	 * The application names the columns it wants in the index key, but the
	 * engine usually appends hidden columns so the primary key can be
	 * derived, and those hidden columns are part of the file's key.
	 *
	 * Only the file's key_format is stored persistently: the key format
	 * seen through an index cursor (usually without some of those hidden
	 * keys) has to be calculated here.
	 */
	WT_ERR(__wt_config_getones(
	    session, idx->config, "columns", &idx->colconf));

	/* First the declared index columns, counted as they are appended. */
	WT_ERR(__wt_config_subinit(session, &colconf, &idx->colconf));
	for (nidxcols = 0;
	    (ret = __wt_config_next(&colconf, &ckey, &cval)) == 0;
	    ++nidxcols)
		WT_ERR(__wt_buf_catfmt(
		    session, buf, "%.*s,", (int)ckey.len, ckey.str));
	if (ret != WT_NOTFOUND && ret != 0)
		goto err;

	/*
	 * Then every primary key column of the table that the index key does
	 * not already include.
	 */
	WT_ERR(__wt_config_subinit(session, &colconf, &table->colconf));
	for (i = 0; i < table->nkey_columns; i++) {
		if ((ret = __wt_config_next(&colconf, &ckey, &cval)) != 0)
			break;

		/* Append the column only if the index key lacks it. */
		if (__wt_config_subgetraw(
		    session, &idx->colconf, &ckey, &cval) != 0) {
			WT_ERR(__wt_buf_catfmt(
			    session, buf, "%.*s,", (int)ckey.len, ckey.str));
		}
	}
	if (ret != WT_NOTFOUND && ret != 0)
		goto err;

	/* The key plan covers the full column list built above. */
	WT_ERR(__wt_scr_alloc(session, 0, &plan));
	WT_ERR(__wt_struct_plan(session, table, buf->data, buf->size, 0, plan));
	idx->key_plan = __wt_buf_steal(session, plan, NULL);

	/* The cursor key format keeps only the declared (visible) columns. */
	WT_ERR(__wt_buf_init(session, buf, 0));
	WT_ERR(__wt_struct_truncate(
	    session, idx->key_format, nidxcols, buf));
	idx->idxkey_format = __wt_buf_steal(session, buf, NULL);

	/* By default, index cursor values are the table value columns. */
	/* TODO Optimize to use index columns in preference to table lookups. */
	WT_ERR(__wt_struct_plan(session,
	    table, table->colconf.str, table->colconf.len, 1, plan));
	idx->value_plan = __wt_buf_steal(session, plan, NULL);

err:	__wt_scr_free(&buf);
	__wt_scr_free(&plan);
	return (ret);
}
/* * __verify_dsk_row -- * Walk a WT_PAGE_ROW_INT or WT_PAGE_ROW_LEAF disk page and verify it. */ static int __verify_dsk_row( WT_SESSION_IMPL *session, const char *addr, const WT_PAGE_HEADER *dsk) { WT_BM *bm; WT_BTREE *btree; WT_CELL *cell; WT_CELL_UNPACK *unpack, _unpack; WT_DECL_ITEM(current); WT_DECL_ITEM(last_ovfl); WT_DECL_ITEM(last_pfx); WT_DECL_RET; WT_ITEM *last; enum { FIRST, WAS_KEY, WAS_VALUE } last_cell_type; void *huffman; uint32_t cell_num, cell_type, i, key_cnt, prefix; uint8_t *end; int cmp; btree = S2BT(session); bm = btree->bm; unpack = &_unpack; huffman = dsk->type == WT_PAGE_ROW_INT ? NULL : btree->huffman_key; WT_ERR(__wt_scr_alloc(session, 0, ¤t)); WT_ERR(__wt_scr_alloc(session, 0, &last_pfx)); WT_ERR(__wt_scr_alloc(session, 0, &last_ovfl)); last = last_ovfl; end = (uint8_t *)dsk + dsk->mem_size; last_cell_type = FIRST; cell_num = 0; key_cnt = 0; WT_CELL_FOREACH(btree, dsk, cell, unpack, i) { ++cell_num; /* Carefully unpack the cell. */ if (__wt_cell_unpack_safe(cell, unpack, end) != 0) { ret = __err_cell_corrupted(session, cell_num, addr); goto err; } /* Check the raw and collapsed cell types. */ WT_ERR(__err_cell_type( session, cell_num, addr, unpack->raw, dsk->type)); WT_ERR(__err_cell_type( session, cell_num, addr, unpack->type, dsk->type)); cell_type = unpack->type; /* * Check ordering relationships between the WT_CELL entries. * For row-store internal pages, check for: * two values in a row, * two keys in a row, * a value as the first cell on a page. * For row-store leaf pages, check for: * two values in a row, * a value as the first cell on a page. 
*/ switch (cell_type) { case WT_CELL_KEY: case WT_CELL_KEY_OVFL: ++key_cnt; switch (last_cell_type) { case FIRST: case WAS_VALUE: break; case WAS_KEY: if (dsk->type == WT_PAGE_ROW_LEAF) break; WT_ERR_VRFY(session, "cell %" PRIu32 " on page at %s is the " "first of two adjacent keys", cell_num - 1, addr); } last_cell_type = WAS_KEY; break; case WT_CELL_ADDR_DEL: case WT_CELL_ADDR_INT: case WT_CELL_ADDR_LEAF: case WT_CELL_ADDR_LEAF_NO: case WT_CELL_VALUE: case WT_CELL_VALUE_OVFL: switch (last_cell_type) { case FIRST: WT_ERR_VRFY(session, "page at %s begins with a value", addr); case WAS_KEY: break; case WAS_VALUE: WT_ERR_VRFY(session, "cell %" PRIu32 " on page at %s is the " "first of two adjacent values", cell_num - 1, addr); } last_cell_type = WAS_VALUE; break; } /* Check if any referenced item has a valid address. */ switch (cell_type) { case WT_CELL_ADDR_DEL: case WT_CELL_ADDR_INT: case WT_CELL_ADDR_LEAF: case WT_CELL_ADDR_LEAF_NO: case WT_CELL_KEY_OVFL: case WT_CELL_VALUE_OVFL: if (!bm->addr_valid(bm, session, unpack->data, unpack->size)) goto eof; break; } /* * Remaining checks are for key order and prefix compression. * If this cell isn't a key, we're done, move to the next cell. * If this cell is an overflow item, instantiate the key and * compare it with the last key. Otherwise, we have to deal with * prefix compression. */ switch (cell_type) { case WT_CELL_KEY: break; case WT_CELL_KEY_OVFL: WT_ERR(__wt_dsk_cell_data_ref( session, dsk->type, unpack, current)); goto key_compare; default: /* Not a key -- continue with the next cell. */ continue; } /* * Prefix compression checks. * * Confirm the first non-overflow key on a page has a zero * prefix compression count. */ prefix = unpack->prefix; if (last_pfx->size == 0 && prefix != 0) WT_ERR_VRFY(session, "the %" PRIu32 " key on page at %s is the first " "non-overflow key on the page and has a non-zero " "prefix compression value", cell_num, addr); /* Confirm the prefix compression count is possible. 
*/ if (cell_num > 1 && prefix > last->size) WT_ERR_VRFY(session, "key %" PRIu32 " on page at %s has a prefix " "compression count of %" PRIu32 ", larger than " "the length of the previous key, %" WT_SIZET_FMT, cell_num, addr, prefix, last->size); /* * If Huffman decoding required, unpack the cell to build the * key, then resolve the prefix. Else, we can do it faster * internally because we don't have to shuffle memory around as * much. */ if (huffman != NULL) { WT_ERR(__wt_dsk_cell_data_ref( session, dsk->type, unpack, current)); /* * If there's a prefix, make sure there's enough buffer * space, then shift the decoded data past the prefix * and copy the prefix into place. Take care with the * pointers: current->data may be pointing inside the * buffer. */ if (prefix != 0) { WT_ERR(__wt_buf_grow( session, current, prefix + current->size)); memmove((uint8_t *)current->mem + prefix, current->data, current->size); memcpy(current->mem, last->data, prefix); current->data = current->mem; current->size += prefix; } } else { /* * Get the cell's data/length and make sure we have * enough buffer space. */ WT_ERR(__wt_buf_init( session, current, prefix + unpack->size)); /* Copy the prefix then the data into place. */ if (prefix != 0) memcpy(current->mem, last->data, prefix); memcpy((uint8_t *)current->mem + prefix, unpack->data, unpack->size); current->size = prefix + unpack->size; } key_compare: /* * Compare the current key against the last key. * * Be careful about the 0th key on internal pages: we only store * the first byte and custom collators may not be able to handle * truncated keys. 
*/ if ((dsk->type == WT_PAGE_ROW_INT && cell_num > 3) || (dsk->type != WT_PAGE_ROW_INT && cell_num > 1)) { WT_ERR(__wt_compare( session, btree->collator, last, current, &cmp)); if (cmp >= 0) WT_ERR_VRFY(session, "the %" PRIu32 " and %" PRIu32 " keys on " "page at %s are incorrectly sorted", cell_num - 2, cell_num, addr); } /* * Swap the buffers: last always references the last key entry, * last_pfx and last_ovfl reference the last prefix-compressed * and last overflow key entries. Current gets pointed to the * buffer we're not using this time around, which is where the * next key goes. */ last = current; if (cell_type == WT_CELL_KEY) { current = last_pfx; last_pfx = last; } else { current = last_ovfl; last_ovfl = last; } WT_ASSERT(session, last != current); }
/*
 * __wt_txn_checkpoint_log --
 *	Write a log record for a checkpoint operation.
 *
 *	When full is false this is a file sync, which is logged unless a full
 *	checkpoint is in progress (in which case the checkpoint LSN is copied
 *	to lsnp, if set). When full is true, flags selects the checkpoint
 *	phase: prepare, start, stop or cleanup. The stop phase falls through
 *	into cleanup.
 */
int
__wt_txn_checkpoint_log(
    WT_SESSION_IMPL *session, bool full, uint32_t flags, WT_LSN *lsnp)
{
	WT_CONNECTION_IMPL *conn;
	WT_DECL_ITEM(logrec);
	WT_DECL_RET;
	WT_ITEM *ckpt_snapshot, empty;
	WT_LSN *ckpt_lsn;
	WT_TXN *txn;
	WT_TXN_GLOBAL *txn_global;
	uint8_t *end, *p;
	size_t recsize;
	uint32_t i, rectype;
	const char *fmt;

	conn = S2C(session);
	txn_global = &conn->txn_global;
	txn = &session->txn;
	ckpt_lsn = &txn->ckpt_lsn;

	/*
	 * If this is a file sync, log it unless there is a full checkpoint in
	 * progress.
	 */
	if (!full) {
		if (txn->full_ckpt) {
			if (lsnp != NULL)
				*lsnp = *ckpt_lsn;
			return (0);
		}
		return (__txn_log_file_sync(session, flags, lsnp));
	}

	switch (flags) {
	case WT_TXN_LOG_CKPT_PREPARE:
		txn->full_ckpt = true;

		if (conn->compat_major >= WT_LOG_V2) {
			/*
			 * Write the system log record containing a checkpoint
			 * start operation.
			 */
			rectype = WT_LOGREC_SYSTEM;
			fmt = WT_UNCHECKED_STRING(I);
			WT_ERR(__wt_struct_size(
			    session, &recsize, fmt, rectype));
			WT_ERR(__wt_logrec_alloc(session, recsize, &logrec));

			WT_ERR(__wt_struct_pack(session,
			    (uint8_t *)logrec->data + logrec->size, recsize,
			    fmt, rectype));
			logrec->size += (uint32_t)recsize;
			WT_ERR(__wt_logop_checkpoint_start_pack(
			    session, logrec));
			WT_ERR(__wt_log_write(session, logrec, ckpt_lsn, 0));
		} else {
			WT_ERR(__wt_log_printf(session,
			    "CHECKPOINT: Starting record"));
			WT_ERR(__wt_log_flush_lsn(session, ckpt_lsn, true));
		}

		/*
		 * We take and immediately release the visibility lock.
		 * Acquiring the write lock guarantees that any transaction
		 * that has written to the log has also made its transaction
		 * visible at this time.
		 */
		__wt_writelock(session, &txn_global->visibility_rwlock);
		__wt_writeunlock(session, &txn_global->visibility_rwlock);

		/*
		 * We need to make sure that the log records in the checkpoint
		 * LSN are on disk.  In particular to make sure that the
		 * current log file exists.
		 */
		WT_ERR(__wt_log_force_sync(session, ckpt_lsn));
		break;
	case WT_TXN_LOG_CKPT_START:
		/* Take a copy of the transaction snapshot. */
		txn->ckpt_nsnapshot = txn->snapshot_count;
		recsize = (size_t)txn->ckpt_nsnapshot * WT_INTPACK64_MAXSIZE;
		WT_ERR(__wt_scr_alloc(session, recsize, &txn->ckpt_snapshot));
		p = txn->ckpt_snapshot->mem;
		end = p + recsize;
		for (i = 0; i < txn->snapshot_count; i++)
			WT_ERR(__wt_vpack_uint(
			    &p, WT_PTRDIFF(end, p), txn->snapshot[i]));
		break;
	case WT_TXN_LOG_CKPT_STOP:
		/*
		 * During a clean connection close, we get here without the
		 * prepare or start steps.  In that case, log the current LSN
		 * as the checkpoint LSN.
		 */
		if (!txn->full_ckpt) {
			txn->ckpt_nsnapshot = 0;
			/* There is no snapshot to log, pack a cleared item. */
			WT_CLEAR(empty);
			ckpt_snapshot = &empty;
			WT_ERR(__wt_log_flush_lsn(session, ckpt_lsn, true));
		} else
			ckpt_snapshot = txn->ckpt_snapshot;

		/* Write the checkpoint log record. */
		rectype = WT_LOGREC_CHECKPOINT;
		fmt = WT_UNCHECKED_STRING(IIIIu);
		WT_ERR(__wt_struct_size(session, &recsize,
		    fmt, rectype, ckpt_lsn->l.file, ckpt_lsn->l.offset,
		    txn->ckpt_nsnapshot, ckpt_snapshot));
		WT_ERR(__wt_logrec_alloc(session, recsize, &logrec));

		WT_ERR(__wt_struct_pack(session,
		    (uint8_t *)logrec->data + logrec->size, recsize,
		    fmt, rectype, ckpt_lsn->l.file, ckpt_lsn->l.offset,
		    txn->ckpt_nsnapshot, ckpt_snapshot));
		logrec->size += (uint32_t)recsize;
		WT_ERR(__wt_log_write(session, logrec, lsnp,
		    F_ISSET(conn, WT_CONN_CKPT_SYNC) ?
		    WT_LOG_FSYNC : 0));

		/*
		 * If this full checkpoint completed successfully and there is
		 * no hot backup in progress and this is not an unclean
		 * recovery, tell the logging subsystem the checkpoint LSN so
		 * that it can archive.  Do not update the logging checkpoint
		 * LSN if this is during a clean connection close, only during
		 * a full checkpoint.  A clean close may not update any
		 * metadata LSN and we do not want to archive in that case.
		 */
		if (!conn->hot_backup &&
		    (!FLD_ISSET(conn->log_flags, WT_CONN_LOG_RECOVER_DIRTY) ||
		    FLD_ISSET(conn->log_flags, WT_CONN_LOG_FORCE_DOWNGRADE)) &&
		    txn->full_ckpt)
			__wt_log_ckpt(session, ckpt_lsn);

		/* FALLTHROUGH */
	case WT_TXN_LOG_CKPT_CLEANUP:
		/* Cleanup any allocated resources */
		WT_INIT_LSN(ckpt_lsn);
		txn->ckpt_nsnapshot = 0;
		__wt_scr_free(session, &txn->ckpt_snapshot);
		txn->full_ckpt = false;
		break;
	WT_ILLEGAL_VALUE_ERR(session);
	}

err:	__wt_logrec_free(session, &logrec);
	return (ret);
}
/*
 * __wt_log_newfile --
 *	Switch to the next log file: open it, then build and write the log
 *	descriptor record that heads the file. When conn_create is non-zero the
 *	file is also flushed and the sync/write LSNs are advanced.
 */
int
__wt_log_newfile(WT_SESSION_IMPL *session, int conn_create)
{
	WT_CONNECTION_IMPL *conn;
	WT_DECL_ITEM(hdrbuf);
	WT_DECL_RET;
	WT_LOG *log;
	WT_LOG_DESC *desc;
	WT_LOG_RECORD *hdr;
	WT_LOGSLOT slot;
	WT_MYSLOT myslot;

	conn = S2C(session);
	log = conn->log;

	/*
	 * Other threads may still be writing through the current file handle,
	 * so park it to be closed later instead of closing it here.
	 */
	WT_ASSERT(session, log->log_close_fh == NULL);
	log->log_close_fh = log->log_fh;

	/* Open the next file and start allocating from wherever it ends. */
	log->fileid++;
	WT_RET(__log_openfile(session, 1, &log->log_fh, log->fileid));
	log->alloc_lsn.file = log->fileid;
	log->alloc_lsn.offset = log->log_fh->size;

	/*
	 * Build the descriptor record in a zeroed scratch buffer of one
	 * allocation unit, which gives the alignment direct I/O needs.
	 */
	WT_ASSERT(session, sizeof(WT_LOG_DESC) < log->allocsize);
	WT_RET(__wt_scr_alloc(session, log->allocsize, &hdrbuf));
	memset(hdrbuf->mem, 0, log->allocsize);
	hdr = (WT_LOG_RECORD *)hdrbuf->mem;

	desc = (WT_LOG_DESC *)hdr->record;
	desc->log_magic = WT_LOG_MAGIC;
	desc->majorv = WT_LOG_MAJOR_VERSION;
	desc->minorv = WT_LOG_MINOR_VERSION;
	desc->log_size = (uint64_t)conn->log_file_max;

	/*
	 * The payload is complete, finish the record header.  The checksum
	 * field is zero while the checksum is being calculated.
	 */
	hdr->len = log->allocsize;
	hdr->checksum = 0;
	hdr->checksum = __wt_cksum(hdr, log->allocsize);

	/* A cleared, local slot stands in for a real log slot. */
	WT_CLEAR(slot);
	myslot.slot = &slot;
	myslot.offset = 0;

	/*
	 * Recursively call __log_acquire to allocate log space for the
	 * descriptor record and __log_fill to write it.  There's no call to
	 * __log_release because we're not waiting for earlier operations to
	 * complete.
	 */
	WT_ERR(__log_acquire(session, hdr->len, &slot));
	WT_ERR(__log_fill(session, &myslot, 1, hdrbuf, NULL));

	/*
	 * Called from connection creation we're the only write in progress,
	 * so update the LSNs ourselves.
	 */
	if (conn_create) {
		WT_ERR(__wt_fsync(session, log->log_fh));
		log->sync_lsn = slot.slot_end_lsn;
		log->write_lsn = slot.slot_end_lsn;
	}

err:	__wt_scr_free(&hdrbuf);
	return (ret);
}
/*
 * __wt_las_sweep --
 *	Sweep the lookaside table.
 *
 *	Each call reviews a bounded number of rows, removing those whose
 *	transaction ID is visible to all, and saves the key it stopped at in
 *	the connection so the next call can resume from roughly the same place.
 *	Reaching the end of the table is not an error, it restarts the sweep.
 */
int
__wt_las_sweep(WT_SESSION_IMPL *session)
{
	WT_CONNECTION_IMPL *conn;
	WT_CURSOR *cursor;
	WT_DECL_ITEM(las_addr);
	WT_DECL_ITEM(las_key);
	WT_DECL_RET;
	WT_ITEM *key;
	uint64_t cnt, las_counter, las_txnid;
	uint32_t las_id, session_flags;
	int notused;

	conn = S2C(session);
	cursor = NULL;
	key = &conn->las_sweep_key;
	session_flags = 0;		/* [-Werror=maybe-uninitialized] */

	WT_ERR(__wt_scr_alloc(session, 0, &las_addr));
	WT_ERR(__wt_scr_alloc(session, 0, &las_key));

	WT_ERR(__wt_las_cursor(session, &cursor, &session_flags));

	/*
	 * If we're not starting a new sweep, position the cursor using the key
	 * from the last call (we don't care if we're before or after the key,
	 * just roughly in the same spot is fine).
	 */
	if (conn->las_sweep_call != 0 && key->data != NULL) {
		__wt_cursor_set_raw_key(cursor, key);
		if ((ret = cursor->search_near(cursor, &notused)) != 0)
			goto srch_notfound;
	}

	/*
	 * The sweep server wakes up every 10 seconds (by default), it's a slow
	 * moving thread. Try to review the entire lookaside table once every 5
	 * minutes, or every 30 calls.
	 *
	 * The reason is because the lookaside table exists because we're seeing
	 * cache/eviction pressure (it allows us to trade performance and disk
	 * space for cache space), and it's likely lookaside blocks are being
	 * evicted, and reading them back in doesn't help things. A trickier,
	 * but possibly better, alternative might be to review all lookaside
	 * blocks in the cache in order to get rid of them, and slowly review
	 * lookaside blocks that have already been evicted.
	 *
	 * We can't know for sure how many records are in the lookaside table,
	 * the cursor insert and remove statistics aren't updated atomically.
	 * Start with reviewing 100 rows, and if it takes more than the target
	 * number of calls to finish, increase the number of rows checked on
	 * each call; if it takes less than the target calls to finish, then
	 * decrease the number of rows reviewed on each call (but never less
	 * than 100).
	 */
#define	WT_SWEEP_LOOKASIDE_MIN_CNT	100
#define	WT_SWEEP_LOOKASIDE_PASS_TARGET	30
	++conn->las_sweep_call;
	if ((cnt = conn->las_sweep_cnt) < WT_SWEEP_LOOKASIDE_MIN_CNT)
		cnt = conn->las_sweep_cnt = WT_SWEEP_LOOKASIDE_MIN_CNT;

	/* Walk the file. */
	for (; cnt > 0 && (ret = cursor->next(cursor)) == 0; --cnt) {
		/*
		 * If the loop terminates after completing a work unit, we will
		 * continue the table sweep next time. Get a local copy of the
		 * sweep key, we're going to reset the cursor; do so before
		 * calling cursor.remove, cursor.remove can discard our hazard
		 * pointer and the page could be evicted from underneath us.
		 */
		if (cnt == 1) {
			WT_ERR(__wt_cursor_get_raw_key(cursor, key));
			if (!WT_DATA_IN_ITEM(key))
				WT_ERR(__wt_buf_set(
				    session, key, key->data, key->size));
		}

		WT_ERR(cursor->get_key(cursor,
		    &las_id, las_addr, &las_counter, &las_txnid, las_key));

		/*
		 * If the on-page record transaction ID associated with the
		 * record is globally visible, the record can be discarded.
		 *
		 * Cursor opened overwrite=true: won't return WT_NOTFOUND should
		 * another thread remove the record before we do, and the cursor
		 * remains positioned in that case.
		 */
		if (__wt_txn_visible_all(session, las_txnid))
			WT_ERR(cursor->remove(cursor));
	}

	/*
	 * When reaching the lookaside table end or the target number of calls,
	 * adjust the row count. Decrease/increase the row count depending on
	 * if the number of calls is less/more than the target.
	 */
	if (ret == WT_NOTFOUND ||
	    conn->las_sweep_call > WT_SWEEP_LOOKASIDE_PASS_TARGET) {
		if (conn->las_sweep_call < WT_SWEEP_LOOKASIDE_PASS_TARGET &&
		    conn->las_sweep_cnt > WT_SWEEP_LOOKASIDE_MIN_CNT)
			conn->las_sweep_cnt -= WT_SWEEP_LOOKASIDE_MIN_CNT;
		if (conn->las_sweep_call > WT_SWEEP_LOOKASIDE_PASS_TARGET)
			conn->las_sweep_cnt += WT_SWEEP_LOOKASIDE_MIN_CNT;
	}

srch_notfound:
	/* Hitting the end of the table starts a new sweep on the next call. */
	if (ret == WT_NOTFOUND)
		conn->las_sweep_call = 0;

	WT_ERR_NOTFOUND_OK(ret);

	if (0) {
		/* On error, discard the saved sweep key. */
err:		__wt_buf_free(session, key);
	}

	WT_TRET(__wt_las_cursor_close(session, &cursor, session_flags));

	__wt_scr_free(session, &las_addr);
	__wt_scr_free(session, &las_key);

	return (ret);
}
/*
 * __open_verbose --
 *	Optionally output a verbose message on handle open: the file's name and
 *	type, followed by a parenthesized list of the open flags that are set.
 *	Does nothing unless built with HAVE_VERBOSE and file-operation verbose
 *	messages are configured.
 */
static inline int
__open_verbose(
    WT_SESSION_IMPL *session, const char *name, int file_type, u_int flags)
{
#ifndef HAVE_VERBOSE
	WT_UNUSED(session);
	WT_UNUSED(name);
	WT_UNUSED(file_type);
	WT_UNUSED(flags);
	return (0);
#else
	WT_DECL_RET;
	WT_DECL_ITEM(flagbuf);
	const char *sep, *type_tag;

	if (!WT_VERBOSE_ISSET(session, WT_VERB_FILEOPS))
		return (0);

	/*
	 * Tracking file opens is useful when debugging platforms, so go to
	 * some trouble to describe the open well.  Start from the tag used
	 * for unrecognized types and replace it if the type is known.
	 */
	type_tag = "unknown open type";
	switch (file_type) {
	case WT_FS_OPEN_FILE_TYPE_CHECKPOINT:
		type_tag = "checkpoint";
		break;
	case WT_FS_OPEN_FILE_TYPE_DATA:
		type_tag = "data";
		break;
	case WT_FS_OPEN_FILE_TYPE_DIRECTORY:
		type_tag = "directory";
		break;
	case WT_FS_OPEN_FILE_TYPE_LOG:
		type_tag = "log";
		break;
	case WT_FS_OPEN_FILE_TYPE_REGULAR:
		type_tag = "regular";
		break;
	default:
		break;
	}

	WT_RET(__wt_scr_alloc(session, 0, &flagbuf));

	/*
	 * Append one tag per flag that is set: the first is preceded by an
	 * opening parenthesis, the others by a comma.
	 */
	sep = " (";
#define	WT_FS_OPEN_VERBOSE_FLAG(f, tag)					\
	if (LF_ISSET(f)) {						\
		WT_ERR(__wt_buf_catfmt(					\
		    session, flagbuf, "%s%s", sep, tag));		\
		sep = ", ";						\
	}

	WT_FS_OPEN_VERBOSE_FLAG(WT_FS_OPEN_CREATE, "create");
	WT_FS_OPEN_VERBOSE_FLAG(WT_FS_OPEN_DIRECTIO, "direct-IO");
	WT_FS_OPEN_VERBOSE_FLAG(WT_FS_OPEN_EXCLUSIVE, "exclusive");
	WT_FS_OPEN_VERBOSE_FLAG(WT_FS_OPEN_FIXED, "fixed");
	WT_FS_OPEN_VERBOSE_FLAG(WT_FS_OPEN_READONLY, "readonly");

	/* Close the list, but only if anything was added to it. */
	if (flagbuf->size != 0)
		WT_ERR(__wt_buf_catfmt(session, flagbuf, ")"));

	__wt_verbose(session, WT_VERB_FILEOPS,
	    "%s: file-open: type %s%s",
	    name, type_tag,
	    flagbuf->size == 0 ? "" : (char *)flagbuf->data);

err:	__wt_scr_free(session, &flagbuf);
	return (ret);
#endif
}
/*
 * __wt_lsm_meta_write --
 *	Format an LSM tree's settings, its chunk list and its old-chunk list
 *	into a single configuration string and store that string as the tree's
 *	metadata.
 */
int
__wt_lsm_meta_write(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
{
	WT_DECL_ITEM(cfgbuf);
	WT_DECL_RET;
	WT_LSM_CHUNK *chunk;
	u_int slot;

	WT_RET(__wt_scr_alloc(session, 0, &cfgbuf));

	/* Formats and nested configurations come first. */
	WT_ERR(__wt_buf_fmt(session, cfgbuf,
	    "key_format=%s,value_format=%s,bloom_config=(%s),file_config=(%s)",
	    lsm_tree->key_format, lsm_tree->value_format,
	    lsm_tree->bloom_config, lsm_tree->file_config));

	/* The collator is only written if one is named. */
	if (lsm_tree->collator_name != NULL)
		WT_ERR(__wt_buf_catfmt(
		    session, cfgbuf, ",collator=%s", lsm_tree->collator_name));

	/* Numeric settings; throttling is recorded as 1 or 0. */
	WT_ERR(__wt_buf_catfmt(session, cfgbuf,
	    ",last=%" PRIu32
	    ",chunk_count_limit=%" PRIu32
	    ",chunk_max=%" PRIu64
	    ",chunk_size=%" PRIu64
	    ",auto_throttle=%" PRIu32
	    ",merge_max=%" PRIu32
	    ",merge_min=%" PRIu32
	    ",bloom=%" PRIu32
	    ",bloom_bit_count=%" PRIu32
	    ",bloom_hash_count=%" PRIu32,
	    lsm_tree->last, lsm_tree->chunk_count_limit,
	    lsm_tree->chunk_max, lsm_tree->chunk_size,
	    F_ISSET(lsm_tree, WT_LSM_TREE_THROTTLE) ? 1 : 0,
	    lsm_tree->merge_max, lsm_tree->merge_min, lsm_tree->bloom,
	    lsm_tree->bloom_bit_count, lsm_tree->bloom_hash_count));

	/*
	 * The live chunks: the ID and generation are always written, the
	 * bloom marker, size and count only when set.
	 */
	WT_ERR(__wt_buf_catfmt(session, cfgbuf, ",chunks=["));
	for (slot = 0; slot < lsm_tree->nchunks; slot++) {
		chunk = lsm_tree->chunk[slot];
		if (slot > 0)
			WT_ERR(__wt_buf_catfmt(session, cfgbuf, ","));
		WT_ERR(__wt_buf_catfmt(
		    session, cfgbuf, "id=%" PRIu32, chunk->id));
		if (F_ISSET(chunk, WT_LSM_CHUNK_BLOOM))
			WT_ERR(__wt_buf_catfmt(session, cfgbuf, ",bloom"));
		if (chunk->size != 0)
			WT_ERR(__wt_buf_catfmt(session, cfgbuf,
			    ",chunk_size=%" PRIu64, chunk->size));
		if (chunk->count != 0)
			WT_ERR(__wt_buf_catfmt(session, cfgbuf,
			    ",count=%" PRIu64, chunk->count));
		WT_ERR(__wt_buf_catfmt(session, cfgbuf,
		    ",generation=%" PRIu32, chunk->generation));
	}
	WT_ERR(__wt_buf_catfmt(session, cfgbuf, "]"));

	/*
	 * The old chunks: a quoted URI each, plus a quoted bloom URI when the
	 * chunk has the bloom flag set.
	 */
	WT_ERR(__wt_buf_catfmt(session, cfgbuf, ",old_chunks=["));
	for (slot = 0; slot < lsm_tree->nold_chunks; slot++) {
		chunk = lsm_tree->old_chunks[slot];
		WT_ASSERT(session, chunk != NULL);
		if (slot > 0)
			WT_ERR(__wt_buf_catfmt(session, cfgbuf, ","));
		WT_ERR(__wt_buf_catfmt(
		    session, cfgbuf, "\"%s\"", chunk->uri));
		if (F_ISSET(chunk, WT_LSM_CHUNK_BLOOM))
			WT_ERR(__wt_buf_catfmt(session, cfgbuf,
			    ",bloom=\"%s\"", chunk->bloom_uri));
	}
	WT_ERR(__wt_buf_catfmt(session, cfgbuf, "]"));

	/* Store the finished string under the tree's name. */
	WT_ERR(__wt_metadata_update(session, lsm_tree->name, cfgbuf->data));

err:	__wt_scr_free(session, &cfgbuf);
	return (ret);
}
/*
 * __wt_huffman_read --
 *	Read a Huffman table from a file.
 *
 *	Each line of the file holds a symbol and a frequency; reading stops at
 *	the first empty line returned by __wt_getline. On success *tablep is
 *	the allocated table, *entriesp the number of lines stored and
 *	*numbytesp the symbol width in bytes (1 or 2). On failure the table is
 *	freed and *tablep is left NULL.
 */
static int
__wt_huffman_read(WT_SESSION_IMPL *session, WT_CONFIG_ITEM *ip,
    struct __wt_huffman_table **tablep, u_int *entriesp, u_int *numbytesp)
{
	struct __wt_huffman_table *table, *tp;
	WT_DECL_ITEM(tmp);
	WT_DECL_RET;
	WT_FSTREAM *fs;
	int64_t symbol, frequency;
	u_int entries, lineno;
	int n;
	bool is_utf8;

	*tablep = NULL;
	*entriesp = *numbytesp = 0;

	fs = NULL;
	table = NULL;

	/*
	 * Try and open the backing file.
	 */
	WT_RET(__huffman_confchk_file(session, ip, &is_utf8, &fs));

	/*
	 * UTF-8 symbols are one byte, with a range of 0-255.
	 * UTF-16 symbols are two bytes, with a range of 0-65535.
	 *
	 * Entries is the largest symbol value, so there are (entries + 1)
	 * possible symbols and the loop below accepts that many lines:
	 * allocate a slot for each of them, (entries + 1) in total.
	 */
	if (is_utf8) {
		entries = UINT8_MAX;
		*numbytesp = 1;
	} else {
		entries = UINT16_MAX;
		*numbytesp = 2;
	}
	WT_ERR(__wt_calloc_def(session, (size_t)entries + 1, &table));

	WT_ERR(__wt_scr_alloc(session, 0, &tmp));
	for (tp = table, lineno = 1;; ++tp, ++lineno) {
		WT_ERR(__wt_getline(session, fs, tmp));
		if (tmp->size == 0)
			break;
		n = sscanf(
		    tmp->data, "%" SCNi64 " %" SCNi64, &symbol, &frequency);
		/*
		 * Entries is 0-based, that is, there are (entries +1) possible
		 * values that can be configured. The line number is 1-based, so
		 * adjust the test for too many entries, and report (entries +1)
		 * in the error as the maximum possible number of entries.
		 *
		 * This test runs before the slot is written, so tp never moves
		 * past the last allocated slot.
		 */
		if (lineno > entries + 1)
			WT_ERR_MSG(session, EINVAL,
			    "Huffman table file %.*s is corrupted, "
			    "more than %" PRIu32 " entries",
			    (int)ip->len, ip->str, entries + 1);
		if (n != 2)
			WT_ERR_MSG(session, EINVAL,
			    "line %u of Huffman table file %.*s is corrupted: "
			    "expected two unsigned integral values",
			    lineno, (int)ip->len, ip->str);
		if (symbol < 0 || symbol > entries)
			WT_ERR_MSG(session, EINVAL,
			    "line %u of Huffman file %.*s is corrupted; "
			    "symbol %" PRId64 " not in range, maximum "
			    "value is %u",
			    lineno, (int)ip->len, ip->str, symbol, entries);
		if (frequency < 0 || frequency > UINT32_MAX)
			WT_ERR_MSG(session, EINVAL,
			    "line %u of Huffman file %.*s is corrupted; "
			    "frequency %" PRId64 " not in range, maximum "
			    "value is %" PRIu32,
			    lineno, (int)ip->len, ip->str, frequency,
			    (uint32_t)UINT32_MAX);

		/* Both values were range-checked above, the casts are safe. */
		tp->symbol = (uint32_t)symbol;
		tp->frequency = (uint32_t)frequency;
	}

	/* The line number is 1-based and the last line read was empty. */
	*entriesp = lineno - 1;
	*tablep = table;

	if (0) {
err:		__wt_free(session, table);
	}
	(void)__wt_fclose(session, &fs);

	__wt_scr_free(session, &tmp);
	return (ret);
}
/*
 * __wt_conn_optrack_setup --
 *	Configure operation tracking for the connection. The tracking path is
 *	only read when this is not a reconfiguration. If tracking is configured
 *	off, any active tracking is torn down; if it is configured on and not
 *	yet active, the map file, its spinlock and the tracking buffer are set
 *	up. Returns EINVAL if tracking is requested on a read-only connection.
 */
int
__wt_conn_optrack_setup(WT_SESSION_IMPL *session,
    const char *cfg[], bool reconfig)
{
	WT_CONFIG_ITEM cval;
	WT_CONNECTION_IMPL *conn;
	WT_DECL_ITEM(mapname);
	WT_DECL_RET;

	conn = S2C(session);

	/* Once an operation tracking path has been set it can't be changed. */
	if (!reconfig) {
		WT_RET(__wt_config_gets(session,
		    cfg, "operation_tracking.path", &cval));
		WT_RET(__wt_strndup(session,
		    cval.str, cval.len, &conn->optrack_path));
	}

	WT_RET(__wt_config_gets(session,
	    cfg, "operation_tracking.enabled", &cval));

	/* Configured off: stop tracking if it's running, then we're done. */
	if (cval.val == 0) {
		if (!F_ISSET(conn, WT_CONN_OPTRACK))
			return (0);
		WT_RET(__wt_conn_optrack_teardown(session, reconfig));
		F_CLR(conn, WT_CONN_OPTRACK);
		return (0);
	}

	/* Operation tracking isn't supported in read-only mode */
	if (F_ISSET(conn, WT_CONN_READONLY))
		WT_RET_MSG(session, EINVAL,
		    "Operation tracking is incompatible with read only "
		    "configuration.");

	/* Already enabled, nothing else to do */
	if (F_ISSET(conn, WT_CONN_OPTRACK))
		return (0);

	/*
	 * The names of operation tracking files include the ID of the process
	 * creating them, which tells apart the files written into the same
	 * directory by different WiredTiger processes.  The process ID is
	 * cached for future use.
	 */
	conn->optrack_pid = __wt_process_id();

	/*
	 * Open, creating it if necessary, the file in the tracking directory
	 * that will hold a map of translations between function names and
	 * function IDs.
	 */
	WT_RET(__wt_scr_alloc(session, 0, &mapname));
	WT_ERR(__wt_filename_construct(session, conn->optrack_path,
	    "optrack-map", conn->optrack_pid, UINT32_MAX, mapname));
	WT_ERR(__wt_open(session,
	    (const char *)mapname->data, WT_FS_OPEN_FILE_TYPE_REGULAR,
	    WT_FS_OPEN_CREATE, &conn->optrack_map_fh));

	WT_ERR(__wt_spin_init(session,
	    &conn->optrack_map_spinlock, "optrack map spinlock"));

	WT_ERR(__wt_malloc(session, WT_OPTRACK_BUFSIZE,
	    &conn->dummy_session.optrack_buf));

	/* Set operation tracking on */
	F_SET(conn, WT_CONN_OPTRACK);

err:	__wt_scr_free(session, &mapname);
	return (ret);
}
/*
 * __wt_curlog_open --
 *	Create and initialize a log cursor.
 *
 *	Fails with EINVAL unless logging is enabled on the connection. On
 *	success the new cursor is returned through cursorp and the log archive
 *	read lock has been acquired. On failure *cursorp is set to NULL and
 *	everything allocated here is released.
 */
int
__wt_curlog_open(WT_SESSION_IMPL *session,
    const char *uri, const char *cfg[], WT_CURSOR **cursorp)
{
	WT_CONNECTION_IMPL *conn;
	WT_CURSOR_STATIC_INIT(iface,
	    __wt_cursor_get_key,		/* get-key */
	    __wt_cursor_get_value,		/* get-value */
	    __wt_cursor_set_key,		/* set-key */
	    __wt_cursor_set_value,		/* set-value */
	    __curlog_compare,			/* compare */
	    __wt_cursor_equals,			/* equals */
	    __curlog_next,			/* next */
	    __wt_cursor_notsup,			/* prev */
	    __curlog_reset,			/* reset */
	    __curlog_search,			/* search */
	    __wt_cursor_search_near_notsup,	/* search-near */
	    __wt_cursor_notsup,			/* insert */
	    __wt_cursor_notsup,			/* update */
	    __wt_cursor_notsup,			/* remove */
	    __wt_cursor_reconfigure_notsup,	/* reconfigure */
	    __curlog_close);			/* close */
	WT_CURSOR *cursor;
	WT_CURSOR_LOG *logc;
	WT_DECL_RET;
	WT_LOG *logp;

	WT_STATIC_ASSERT(offsetof(WT_CURSOR_LOG, iface) == 0);

	conn = S2C(session);
	if (!FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED)) {
		WT_RET_MSG(session, EINVAL,
		    "Cannot open a log cursor without logging enabled");
	}
	logp = conn->log;

	/* Allocate the cursor and fill in the generic part. */
	logc = NULL;
	WT_RET(__wt_calloc_one(session, &logc));
	cursor = &logc->iface;
	*cursor = iface;
	cursor->session = &session->iface;
	cursor->key_format = WT_LOGC_KEY_FORMAT;
	cursor->value_format = WT_LOGC_VALUE_FORMAT;

	/* Allocate the LSNs and the scratch buffers the cursor owns. */
	WT_ERR(__wt_calloc_one(session, &logc->cur_lsn));
	WT_ERR(__wt_calloc_one(session, &logc->next_lsn));
	WT_ERR(__wt_scr_alloc(session, 0, &logc->logrec));
	WT_ERR(__wt_scr_alloc(session, 0, &logc->opkey));
	WT_ERR(__wt_scr_alloc(session, 0, &logc->opvalue));
	WT_INIT_LSN(logc->cur_lsn);
	WT_INIT_LSN(logc->next_lsn);

	WT_ERR(__wt_cursor_init(cursor, uri, NULL, cfg, cursorp));

	/*
	 * Records may still be sitting in a buffer; push them out so a caller
	 * reading back something it just wrote can see it.
	 */
	WT_ERR(__wt_log_force_write(session, 1));

	/* An open log cursor holds off log archiving. */
	WT_ERR(__wt_readlock(session, logp->log_archive_lock));

	return (ret);

err:	if (F_ISSET(cursor, WT_CURSTD_OPEN)) {
		/* The cursor was opened: its close method does the cleanup. */
		WT_TRET(cursor->close(cursor));
	} else {
		__wt_free(session, logc->cur_lsn);
		__wt_free(session, logc->next_lsn);
		__wt_scr_free(session, &logc->logrec);
		__wt_scr_free(session, &logc->opkey);
		__wt_scr_free(session, &logc->opvalue);
		/*
		 * NOTE: the read lock is the last thing acquired above, so the
		 * error path is never reached with it held. Revisit if that
		 * ordering changes.
		 */
		__wt_free(session, logc);
	}
	*cursorp = NULL;
	return (ret);
}
/*
 * __wt_curjoin_open --
 *	Create and initialize a join cursor for a "join:table:" URI.
 *
 *	Join cursors are read-only: every modifying method is a "not
 *	supported" stub. An optional "(...)" suffix on the URI is treated as a
 *	projection and replaces the cursor's value format. A non-NULL owner is
 *	rejected with EINVAL. On failure *cursorp is set to NULL.
 */
int
__wt_curjoin_open(WT_SESSION_IMPL *session,
    const char *uri, WT_CURSOR *owner, const char *cfg[], WT_CURSOR **cursorp)
{
	WT_CURSOR_STATIC_INIT(iface,
	    __curjoin_get_key,			/* get-key */
	    __curjoin_get_value,		/* get-value */
	    __wt_cursor_set_key_notsup,		/* set-key */
	    __wt_cursor_set_value_notsup,	/* set-value */
	    __wt_cursor_compare_notsup,		/* compare */
	    __wt_cursor_equals_notsup,		/* equals */
	    __curjoin_next,			/* next */
	    __wt_cursor_notsup,			/* prev */
	    __curjoin_reset,			/* reset */
	    __wt_cursor_notsup,			/* search */
	    __wt_cursor_search_near_notsup,	/* search-near */
	    __wt_cursor_notsup,			/* insert */
	    __wt_cursor_modify_notsup,		/* modify */
	    __wt_cursor_notsup,			/* update */
	    __wt_cursor_notsup,			/* remove */
	    __wt_cursor_notsup,			/* reserve */
	    __wt_cursor_reconfigure_notsup,	/* reconfigure */
	    __wt_cursor_notsup,			/* cache */
	    __wt_cursor_reopen_notsup,		/* reopen */
	    __curjoin_close);			/* close */
	WT_CURSOR *cursor;
	WT_CURSOR_JOIN *cjoin;
	WT_DECL_ITEM(fmt);
	WT_DECL_RET;
	WT_TABLE *table;
	size_t namelen;
	const char *tablename, *projection;

	WT_STATIC_ASSERT(offsetof(WT_CURSOR_JOIN, iface) == 0);

	if (owner != NULL) {
		WT_RET_MSG(session, EINVAL,
		    "unable to initialize a join cursor with existing owner");
	}

	/* Strip the URI prefix, anything else is the wrong object type. */
	tablename = uri;
	if (!WT_PREFIX_SKIP(tablename, "join:table:"))
		return (
		    __wt_unexpected_object_type(session, uri, "join:table:"));

	/* The table name runs up to the projection, if there is one. */
	projection = strchr(tablename, '(');
	namelen = projection == NULL ?
	    strlen(tablename) : WT_PTRDIFF(projection, tablename);
	WT_RET(__wt_schema_get_table(
	    session, tablename, namelen, false, 0, &table));

	WT_RET(__wt_calloc_one(session, &cjoin));
	cursor = (WT_CURSOR *)cjoin;
	*cursor = iface;
	cursor->session = (WT_SESSION *)session;
	cursor->key_format = table->key_format;
	cursor->value_format = table->value_format;
	cjoin->table = table;

	/* A projection gets its own value format and is kept on the cursor. */
	WT_ERR(__wt_scr_alloc(session, 0, &fmt));
	if (projection != NULL) {
		WT_ERR(__wt_struct_reformat(session, table,
		    projection, strlen(projection), NULL, false, fmt));
		WT_ERR(__wt_strndup(
		    session, fmt->data, fmt->size, &cursor->value_format));
		WT_ERR(__wt_strdup(session, projection, &cjoin->projection));
	}

	WT_ERR(__wt_cursor_init(cursor, uri, owner, cfg, cursorp));

	if (0) {
err:		WT_TRET(__curjoin_close(cursor));
		*cursorp = NULL;
	}

	__wt_scr_free(session, &fmt);
	return (ret);
}
/*
 * __wt_bt_read --
 *	Read a cookie referenced block into a buffer.
 *
 *	session:   the session; the btree is taken from S2BT(session).
 *	buf:       the caller's buffer, holds the in-memory page image on
 *	           success.
 *	addr:      the block's address cookie.
 *	addr_size: the length of the address cookie.
 *
 *	The block is decrypted and/or decompressed as indicated by the page
 *	header flags. A block whose flags disagree with the file's
 *	configuration, or that fails to decrypt or decompress, is treated as
 *	corrupt. Returns 0 on success and an error code otherwise.
 */
int
__wt_bt_read(WT_SESSION_IMPL *session,
    WT_ITEM *buf, const uint8_t *addr, size_t addr_size)
{
	WT_BM *bm;
	WT_BTREE *btree;
	WT_DECL_ITEM(etmp);
	WT_DECL_ITEM(tmp);
	WT_DECL_RET;
	WT_ENCRYPTOR *encryptor;
	WT_ITEM *ip;
	const WT_PAGE_HEADER *dsk;
	const char *fail_msg;
	size_t mem_size, result_len;
	int compressed;

	btree = S2BT(session);
	bm = btree->bm;
	fail_msg = NULL;			/* -Wuninitialized */

	/*
	 * If anticipating a compressed or encrypted block, read into a scratch
	 * buffer and decompress into the caller's buffer. Else, read directly
	 * into the caller's buffer.
	 */
	if (btree->compressor == NULL && btree->kencryptor == NULL) {
		WT_RET(bm->read(bm, session, buf, addr, addr_size));
		dsk = buf->data;
		ip = NULL;
	} else {
		WT_RET(__wt_scr_alloc(session, 0, &tmp));
		WT_ERR(bm->read(bm, session, tmp, addr, addr_size));
		dsk = tmp->data;
		ip = tmp;
	}

	/*
	 * If the block is encrypted, copy the skipped bytes of the original
	 * image into place, then decrypt.
	 */
	if (F_ISSET(dsk, WT_PAGE_ENCRYPTED)) {
		if (btree->kencryptor == NULL ||
		    (encryptor = btree->kencryptor->encryptor) == NULL ||
		    encryptor->decrypt == NULL) {
			fail_msg =
			    "encrypted block in file for which no encryption "
			    "configured";
			goto corrupt;
		}

		WT_ERR(__wt_scr_alloc(session, 0, &etmp));
		if ((ret = __wt_decrypt(session,
		    encryptor, WT_BLOCK_ENCRYPT_SKIP, ip, etmp)) != 0) {
			fail_msg = "block decryption failed";
			goto corrupt;
		}

		/* From here on the decrypted image is the input. */
		ip = etmp;
		dsk = ip->data;
	} else if (btree->kencryptor != NULL) {
		fail_msg =
		    "unencrypted block in file for which encryption configured";
		goto corrupt;
	}

	if (F_ISSET(dsk, WT_PAGE_COMPRESSED)) {
		if (btree->compressor == NULL ||
		    btree->compressor->decompress == NULL) {
			fail_msg =
			    "compressed block in file for which no compression "
			    "configured";
			goto corrupt;
		}

		/*
		 * Size the buffer based on the in-memory bytes we're expecting
		 * from decompression.
		 */
		WT_ERR(__wt_buf_initsize(session, buf, dsk->mem_size));

		/*
		 * Note the source length is NOT the number of compressed bytes,
		 * it's the length of the block we just read (minus the skipped
		 * bytes). We don't store the number of compressed bytes: some
		 * compression engines need that length stored externally, they
		 * don't have markers in the stream to signal the end of the
		 * compressed bytes. Those engines must store the compressed
		 * byte length somehow, see the snappy compression extension for
		 * an example.
		 *
		 * The source pointer and the source length must describe the
		 * same buffer: after decryption the input is the decrypted
		 * scratch buffer, not the buffer the block was read into.
		 */
		memcpy(buf->mem, ip->data, WT_BLOCK_COMPRESS_SKIP);
		ret = btree->compressor->decompress(
		    btree->compressor, &session->iface,
		    (uint8_t *)ip->data + WT_BLOCK_COMPRESS_SKIP,
		    ip->size - WT_BLOCK_COMPRESS_SKIP,
		    (uint8_t *)buf->mem + WT_BLOCK_COMPRESS_SKIP,
		    dsk->mem_size - WT_BLOCK_COMPRESS_SKIP, &result_len);

		/*
		 * If checksums were turned off because we're depending on the
		 * decompression to fail on any corrupted data, we'll end up
		 * here after corruption happens. If we're salvaging the file,
		 * it's OK, otherwise it's really, really bad.
		 */
		if (ret != 0 ||
		    result_len != dsk->mem_size - WT_BLOCK_COMPRESS_SKIP) {
			fail_msg = "block decompression failed";
			goto corrupt;
		}
	} else if (ip != NULL) {
		/*
		 * If we uncompressed above, the page is in the correct buffer.
		 * If we get here the data may be in the wrong buffer and the
		 * buffer may be the wrong size. If needed, get the page
		 * into the destination buffer.
		 */
		WT_ERR(__wt_buf_set(session, buf, ip->data, dsk->mem_size));
	}

	/*
	 * The header may live in the scratch buffer that is reused for the
	 * address string below; save what the statistics need before that
	 * buffer is overwritten.
	 */
	compressed = F_ISSET(dsk, WT_PAGE_COMPRESSED) != 0;
	mem_size = dsk->mem_size;

	/* If the handle is a verify handle, verify the physical page. */
	if (F_ISSET(btree, WT_BTREE_VERIFY)) {
		if (tmp == NULL)
			WT_ERR(__wt_scr_alloc(session, 0, &tmp));
		WT_ERR(bm->addr_string(bm, session, tmp, addr, addr_size));
		WT_ERR(__wt_verify_dsk(session, tmp->data, buf));
	}

	WT_STAT_FAST_CONN_INCR(session, cache_read);
	WT_STAT_FAST_DATA_INCR(session, cache_read);
	if (compressed)
		WT_STAT_FAST_DATA_INCR(session, compress_read);
	WT_STAT_FAST_CONN_INCRV(session, cache_bytes_read, mem_size);
	WT_STAT_FAST_DATA_INCRV(session, cache_bytes_read, mem_size);

	if (0) {
corrupt:	if (ret == 0)
			ret = WT_ERROR;
		/*
		 * Verify handles and sessions flagged to be quiet about
		 * corruption just get the error back; everybody else gets a
		 * message and the illegal-value error.
		 */
		if (!F_ISSET(btree, WT_BTREE_VERIFY) &&
		    !F_ISSET(session, WT_SESSION_QUIET_CORRUPT_FILE)) {
			__wt_err(session, ret, "%s", fail_msg);
			ret = __wt_illegal_value(session, btree->dhandle->name);
		}
	}

err:	__wt_scr_free(session, &tmp);
	__wt_scr_free(session, &etmp);
	return (ret);
}
/*
 * __curjoin_init_bloom --
 *	Populate Bloom filters
 *
 *	Walks a raw cursor opened on either the entry's index source or the
 *	join's main table, tests each key against the entry's endpoints and
 *	inserts the qualifying items into the Bloom filter.
 *
 *	session: the session.
 *	cjoin:   the join cursor; it supplies the table name and is passed to
 *	         __wt_open_cursor along with the URI.
 *	entry:   the join entry whose endpoints (ends[0..ends_next)) are
 *	         checked; its iterated/bloom_insert statistics are updated.
 *	bloom:   the filter items are inserted into.
 *
 *	Returns 0 on success (running off the end of the cursor is not an
 *	error), otherwise an error code. The cursor opened here is always
 *	closed before returning.
 */
static int
__curjoin_init_bloom(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin,
    WT_CURSOR_JOIN_ENTRY *entry, WT_BLOOM *bloom)
{
	WT_COLLATOR *collator;
	WT_CURSOR *c;
	WT_CURSOR_JOIN_ENDPOINT *end, *endmax;
	WT_DECL_ITEM(uribuf);
	WT_DECL_RET;
	WT_ITEM curkey, curvalue;
	size_t size;
	u_int skip;
	int cmp;
	const char *uri;
	const char *raw_cfg[] = { WT_CONFIG_BASE(
	    session, WT_SESSION_open_cursor), "raw", NULL };

	c = NULL;
	/* Index of the first endpoint the per-key loop still has to check. */
	skip = 0;

	if (entry->index != NULL)
		/*
		 * Open the raw index. We're avoiding any references
		 * to the main table, they may be expensive.
		 */
		uri = entry->index->source;
	else {
		/*
		 * For joins on the main table, we just need the primary
		 * key for comparison, we don't need any values.
		 *
		 * The URI is the table name followed by "()"; the extra three
		 * bytes leave room for "()" and a terminating nul.
		 */
		size = strlen(cjoin->table->iface.name) + 3;
		WT_ERR(__wt_scr_alloc(session, size, &uribuf));
		WT_ERR(__wt_buf_fmt(session, uribuf, "%s()",
		    cjoin->table->iface.name));
		uri = uribuf->data;
	}
	WT_ERR(__wt_open_cursor(session, uri, &cjoin->iface, raw_cfg, &c));

	/* Initially position the cursor if necessary. */
	endmax = &entry->ends[entry->ends_next];
	if ((end = &entry->ends[0]) < endmax) {
		if (F_ISSET(end, WT_CURJOIN_END_GT) ||
		    WT_CURJOIN_END_RANGE(end) == WT_CURJOIN_END_EQ) {
			/* Start where the first endpoint's cursor is. */
			WT_ERR(__wt_cursor_dup_position(end->cursor, c));
			if (WT_CURJOIN_END_RANGE(end) == WT_CURJOIN_END_GE)
				skip = 1;
		} else if (F_ISSET(end, WT_CURJOIN_END_LT)) {
			/* Start at the cursor's first record, if any. */
			if ((ret = c->next(c)) == WT_NOTFOUND)
				goto done;
			WT_ERR(ret);
		} else
			WT_PANIC_ERR(session, EINVAL,
			    "fatal error in join cursor position state");
	}
	/* Index keys are compared with the index collator, if it has one. */
	collator = (entry->index == NULL) ? NULL : entry->index->collator;
	while (ret == 0) {
		WT_ERR(c->get_key(c, &curkey));
		entry->stats.iterated++;
		if (entry->index != NULL) {
			/*
			 * Repack so it's comparable to the
			 * reference endpoints.
			 */
			WT_ERR(__wt_struct_repack(session,
			    c->key_format,
			    (entry->repack_format != NULL ?
			    entry->repack_format : entry->index->idxkey_format),
			    &c->key, &curkey));
		}
		for (end = &entry->ends[skip]; end < endmax; end++) {
			WT_ERR(__wt_compare(session, collator, &curkey,
			    &end->key, &cmp));
			if (F_ISSET(entry, WT_CURJOIN_ENTRY_DISJUNCTION)) {
				/* if condition satisfied, insert immediately */
				switch (WT_CURJOIN_END_RANGE(end)) {
				case WT_CURJOIN_END_EQ:
					if (cmp == 0)
						goto insert;
					break;
				case WT_CURJOIN_END_GT:
					if (cmp > 0) {
						/* skip this check next time */
						skip = entry->ends_next;
						goto insert;
					}
					break;
				case WT_CURJOIN_END_GE:
					if (cmp >= 0)
						goto insert;
					break;
				case WT_CURJOIN_END_LT:
					if (cmp < 0)
						goto insert;
					break;
				case WT_CURJOIN_END_LE:
					if (cmp <= 0)
						goto insert;
					break;
				}
			} else if (!F_ISSET(end, WT_CURJOIN_END_LT)) {
				/*
				 * Endpoint without the LT flag: a key that
				 * compares below it (or equal, when equality
				 * isn't allowed) is skipped.
				 */
				if (cmp < 0 || (cmp == 0 &&
				    !F_ISSET(end, WT_CURJOIN_END_EQ)))
					goto advance;
				if (cmp > 0) {
					/*
					 * Above the endpoint: with the GT flag
					 * the first endpoint is not checked
					 * again, without it the walk ends.
					 */
					if (F_ISSET(end, WT_CURJOIN_END_GT))
						skip = 1;
					else
						goto done;
				}
			} else {
				/*
				 * Endpoint with the LT flag: a key that
				 * compares above it (or equal, when equality
				 * isn't allowed) ends the walk.
				 */
				if (cmp > 0 || (cmp == 0 &&
				    !F_ISSET(end, WT_CURJOIN_END_EQ)))
					goto done;
			}
		}
		/*
		 * Either it's a disjunction that hasn't satisfied any
		 * condition, or it's a conjunction that has satisfied all
		 * conditions.
		 */
		if (F_ISSET(entry, WT_CURJOIN_ENTRY_DISJUNCTION))
			goto advance;
insert:
		if (entry->index != NULL) {
			/*
			 * Insert the bytes of the raw index key that follow
			 * the repacked key (presumably the primary key --
			 * TODO confirm against the index key layout).
			 */
			curvalue.data =
			    (unsigned char *)curkey.data + curkey.size;
			WT_ASSERT(session, c->key.size > curkey.size);
			curvalue.size = c->key.size - curkey.size;
		} else
			WT_ERR(c->get_key(c, &curvalue));
		__wt_bloom_insert(bloom, &curvalue);
		entry->stats.bloom_insert++;
advance:
		if ((ret = c->next(c)) == WT_NOTFOUND)
			break;
	}
done:
	/* Running off the end of the cursor is the normal way to finish. */
	WT_ERR_NOTFOUND_OK(ret);

err:	if (c != NULL)
		WT_TRET(c->close(c));
	__wt_scr_free(session, &uribuf);
	return (ret);
}
/*
 * __wt_win_directory_list --
 *	Get a list of files from a directory, MSVC version.
 *
 *	file_system: the file system, only used to free the list on error.
 *	wt_session:  the session (a WT_SESSION_IMPL underneath).
 *	directory:   the directory to list; one trailing backslash is ignored.
 *	prefix:      if not NULL, only names starting with it are returned.
 *	dirlistp:    set to the allocated array of allocated UTF-8 names, or
 *	             NULL when nothing matched or on error.
 *	countp:      set to the number of entries in *dirlistp.
 *
 *	Returns 0 on success. On failure the partial list is freed, a message
 *	naming the directory and prefix is logged and an error is returned.
 */
int
__wt_win_directory_list(WT_FILE_SYSTEM *file_system,
    WT_SESSION *wt_session, const char *directory,
    const char *prefix, char ***dirlistp, uint32_t *countp)
{
	DWORD windows_error;
	HANDLE findhandle;
	WIN32_FIND_DATAW finddata;
	WT_DECL_ITEM(pathbuf);
	WT_DECL_ITEM(file_utf8);
	WT_DECL_ITEM(pathbuf_wide);
	WT_DECL_ITEM(prefix_wide);
	WT_DECL_RET;
	WT_SESSION_IMPL *session;
	size_t dirallocsz, pathlen, prefix_widelen;
	uint32_t count;
	char *dir_copy, **entries;

	session = (WT_SESSION_IMPL *)wt_session;

	*dirlistp = NULL;
	*countp = 0;

	/*
	 * Everything the error path looks at is initialized before the first
	 * call that can fail.
	 */
	findhandle = INVALID_HANDLE_VALUE;
	dirallocsz = 0;
	entries = NULL;
	dir_copy = NULL;
	count = 0;
	prefix_widelen = 0;

	/* Work on a copy so a trailing backslash can be trimmed. */
	WT_ERR(__wt_strdup(session, directory, &dir_copy));
	pathlen = strlen(dir_copy);
	if (pathlen > 0 && dir_copy[pathlen - 1] == '\\')
		dir_copy[pathlen - 1] = '\0';
	WT_ERR(__wt_scr_alloc(session, pathlen + 3, &pathbuf));
	WT_ERR(__wt_buf_fmt(session, pathbuf, "%s\\*", dir_copy));

	WT_ERR(__wt_to_utf16_string(session, pathbuf->data, &pathbuf_wide));

	/* The prefix is optional, only convert and measure a real one. */
	if (prefix != NULL) {
		WT_ERR(__wt_to_utf16_string(session, prefix, &prefix_wide));
		prefix_widelen = wcslen(prefix_wide->data);
	}

	findhandle = FindFirstFileW(pathbuf_wide->data, &finddata);
	if (findhandle == INVALID_HANDLE_VALUE) {
		windows_error = __wt_getlasterror();
		__wt_errx(session,
		    "%s: directory-list: FindFirstFile: %s",
		    pathbuf->data, __wt_formatmessage(session, windows_error));
		WT_ERR(__wt_map_windows_error(windows_error));
	}

	do {
		/*
		 * Skip . and ..
		 */
		if (wcscmp(finddata.cFileName, L".") == 0 ||
		    wcscmp(finddata.cFileName, L"..") == 0)
			continue;

		/* The list of files is optionally filtered by a prefix. */
		if (prefix != NULL &&
		    wcsncmp(finddata.cFileName, prefix_wide->data,
		    prefix_widelen) != 0)
			continue;

		/* Grow the array, then append a UTF-8 copy of the name. */
		WT_ERR(__wt_realloc_def(
		    session, &dirallocsz, count + 1, &entries));
		WT_ERR(__wt_to_utf8_string(
		    session, finddata.cFileName, &file_utf8));
		WT_ERR(__wt_strdup(session, file_utf8->data, &entries[count]));
		++count;
		__wt_scr_free(session, &file_utf8);
	} while (FindNextFileW(findhandle, &finddata) != 0);

	*dirlistp = entries;
	*countp = count;

err:	if (findhandle != INVALID_HANDLE_VALUE) {
		if (FindClose(findhandle) == 0) {
			windows_error = __wt_getlasterror();
			__wt_errx(session,
			    "%s: directory-list: FindClose: %s",
			    pathbuf->data,
			    __wt_formatmessage(session, windows_error));
			/* Don't hide an earlier, more interesting error. */
			if (ret == 0)
				ret = __wt_map_windows_error(windows_error);
		}
	}

	__wt_free(session, dir_copy);
	__wt_scr_free(session, &pathbuf);
	__wt_scr_free(session, &file_utf8);
	__wt_scr_free(session, &pathbuf_wide);
	__wt_scr_free(session, &prefix_wide);

	if (ret == 0)
		return (0);

	/* Failure: give back whatever part of the list was built. */
	WT_TRET(__wt_win_directory_list_free(
	    file_system, wt_session, entries, count));

	WT_RET_MSG(session, ret,
	    "%s: directory-list, prefix \"%s\"",
	    directory, prefix == NULL ? "" : prefix);
}
/*
 * __wt_las_sweep --
 *	Sweep the lookaside table.
 *
 *	Reviews a slice of the lookaside table, starting near the key saved by
 *	the previous call (conn->las_sweep_key), and removes every record whose
 *	transaction ID is visible to all. The key at which this call stopped is
 *	saved for the next call; if the end of the table is reached no key is
 *	saved and the next call starts from the beginning.
 *
 *	Returns 0 on success (reaching the end of the table is not an error),
 *	otherwise an error code.
 */
int
__wt_las_sweep(WT_SESSION_IMPL *session)
{
	WT_CONNECTION_IMPL *conn;
	WT_CURSOR *cursor;
	WT_DECL_ITEM(las_addr);
	WT_DECL_ITEM(las_key);
	WT_DECL_RET;
	WT_ITEM *key;
	uint64_t cnt, las_counter, las_txnid;
	int64_t remove_cnt;
	uint32_t las_id, session_flags;
	int notused;

	conn = S2C(session);
	cursor = NULL;
	key = &conn->las_sweep_key;
	remove_cnt = 0;
	session_flags = 0;		/* [-Werror=maybe-uninitialized] */

	WT_ERR(__wt_scr_alloc(session, 0, &las_addr));
	WT_ERR(__wt_scr_alloc(session, 0, &las_key));

	WT_ERR(__wt_las_cursor(session, &cursor, &session_flags));

	/*
	 * If we're not starting a new sweep, position the cursor using the key
	 * from the last call (we don't care if we're before or after the key,
	 * just roughly in the same spot is fine).
	 */
	if (key->size != 0) {
		__wt_cursor_set_raw_key(cursor, key);
		ret = cursor->search_near(cursor, &notused);

		/*
		 * Don't search for the same key twice; if we don't set a new
		 * key below, it's because we've reached the end of the table
		 * and we want the next pass to start at the beginning of the
		 * table. Searching for the same key could leave us stuck at
		 * the end of the table, repeatedly checking the same rows.
		 */
		key->size = 0;
		if (ret != 0)
			goto srch_notfound;
	}

	/*
	 * The sweep server wakes up every 10 seconds (by default), it's a slow
	 * moving thread. Try to review the entire lookaside table once every 5
	 * minutes, or every 30 calls.
	 *
	 * The reason is because the lookaside table exists because we're seeing
	 * cache/eviction pressure (it allows us to trade performance and disk
	 * space for cache space), and it's likely lookaside blocks are being
	 * evicted, and reading them back in doesn't help things. A trickier,
	 * but possibly better, alternative might be to review all lookaside
	 * blocks in the cache in order to get rid of them, and slowly review
	 * lookaside blocks that have already been evicted.
	 */
	cnt = (uint64_t)WT_MAX(100, conn->las_record_cnt / 30);

	/* Discard pages we read as soon as we're done with them. */
	F_SET(session, WT_SESSION_NO_CACHE);

	/* Walk the file. */
	for (; cnt > 0 && (ret = cursor->next(cursor)) == 0; --cnt) {
		/*
		 * If the loop terminates after completing a work unit, we will
		 * continue the table sweep next time. Get a local copy of the
		 * sweep key, we're going to reset the cursor; do so before
		 * calling cursor.remove, cursor.remove can discard our hazard
		 * pointer and the page could be evicted from underneath us.
		 */
		if (cnt == 1) {
			WT_ERR(__wt_cursor_get_raw_key(cursor, key));
			if (!WT_DATA_IN_ITEM(key))
				WT_ERR(__wt_buf_set(
				    session, key, key->data, key->size));
		}

		WT_ERR(cursor->get_key(cursor,
		    &las_id, las_addr, &las_counter, &las_txnid, las_key));

		/*
		 * If the on-page record transaction ID associated with the
		 * record is globally visible, the record can be discarded.
		 *
		 * Cursor opened overwrite=true: won't return WT_NOTFOUND should
		 * another thread remove the record before we do, and the cursor
		 * remains positioned in that case.
		 */
		if (__wt_txn_visible_all(session, las_txnid)) {
			WT_ERR(cursor->remove(cursor));
			++remove_cnt;
		}
	}

srch_notfound:
	WT_ERR_NOTFOUND_OK(ret);

	if (0) {
		/* On error, drop the saved key so the next sweep starts over. */
err:		__wt_buf_free(session, key);
	}

	WT_TRET(__wt_las_cursor_close(session, &cursor, session_flags));

	/*
	 * If there were races to remove records, we can over-count. All
	 * arithmetic is signed, so underflow isn't fatal, but check anyway so
	 * we don't skew low over time.
	 */
	if (remove_cnt > conn->las_record_cnt)
		conn->las_record_cnt = 0;
	else if (remove_cnt > 0)
		(void)__wt_atomic_subi64(&conn->las_record_cnt, remove_cnt);

	F_CLR(session, WT_SESSION_NO_CACHE);

	__wt_scr_free(session, &las_addr);
	__wt_scr_free(session, &las_key);

	return (ret);
}
/*
 * __config_merge_scan --
 *	Walk a configuration string, inserting entries into the merged array.
 *
 *	session: the session.
 *	key:     the name of the enclosing structure, or NULL at the top level;
 *	         nested keys are built as key + SEP + name.
 *	value:   the configuration string to parse.
 *	cp:      the merge state; one entry (key, value, generation) is
 *	         appended per non-structure value found.
 *
 *	Returns 0 on success, EINVAL for a key of the wrong type or a key
 *	containing the separator character, otherwise an error code.
 */
static int
__config_merge_scan(WT_SESSION_IMPL *session,
    const char *key, const char *value, WT_CONFIG_MERGE *cp)
{
	WT_CONFIG cparser;
	WT_CONFIG_ITEM k, v;
	WT_DECL_ITEM(kb);
	WT_DECL_ITEM(vb);
	WT_DECL_RET;
	size_t len;

	WT_ERR(__wt_scr_alloc(session, 0, &kb));
	WT_ERR(__wt_scr_alloc(session, 0, &vb));

	WT_ERR(__wt_config_init(session, &cparser, value));
	while ((ret = __wt_config_next(&cparser, &k, &v)) == 0) {
		/*
		 * The key is a slice of the configuration string, not a
		 * nul-terminated string of its own: bound the output by its
		 * length.
		 */
		if (k.type != WT_CONFIG_ITEM_STRING &&
		    k.type != WT_CONFIG_ITEM_ID)
			WT_ERR_MSG(session, EINVAL,
			    "Invalid configuration key found: '%.*s'\n",
			    (int)k.len, k.str);

		/* Include the quotes around string keys/values. */
		if (k.type == WT_CONFIG_ITEM_STRING) {
			--k.str;
			k.len += 2;
		}
		if (v.type == WT_CONFIG_ITEM_STRING) {
			--v.str;
			v.len += 2;
		}

		/*
		 * !!!
		 * We're using a JSON quote character to separate the names we
		 * create for nested structures. That's not completely safe as
		 * it's possible to quote characters in JSON such that a quote
		 * character appears as a literal character in a key name. In
		 * a few cases, applications can create their own key namespace
		 * (for example, shared library extension names), and therefore
		 * it's possible for an application to confuse us. Error if we
		 * ever see a key with a magic character.
		 */
		for (len = 0; len < k.len; ++len)
			if (k.str[len] == SEPC)
				WT_ERR_MSG(session, EINVAL,
				    "key %.*s contains a '%c' separator "
				    "character",
				    (int)k.len, (char *)k.str, SEPC);

		/* Build the key/value strings. */
		WT_ERR(__wt_buf_fmt(session,
		    kb, "%s%s%.*s",
		    key == NULL ? "" : key,
		    key == NULL ? "" : SEP,
		    (int)k.len, k.str));
		WT_ERR(__wt_buf_fmt(session,
		    vb, "%.*s", (int)v.len, v.str));

		/*
		 * If the value is a structure, recursively parse it.
		 *
		 * !!!
		 * Don't merge unless the structure has field names. WiredTiger
		 * stores checkpoint LSNs in the metadata file using nested
		 * structures without field names: "checkpoint_lsn=(1,0)", not
		 * "checkpoint_lsn=(file=1,offset=0)". The value type is still
		 * WT_CONFIG_ITEM_STRUCT, so we check for a field name in the
		 * value.
		 */
		if (v.type == WT_CONFIG_ITEM_STRUCT &&
		    strchr(vb->data, '=') != NULL) {
			WT_ERR(__config_merge_scan(
			    session, kb->data, vb->data, cp));
			continue;
		}

		/*
		 * Insert the value into the array; the generation records the
		 * order in which entries were added.
		 */
		WT_ERR(__wt_realloc_def(session,
		    &cp->entries_allocated,
		    cp->entries_next + 1, &cp->entries));
		WT_ERR(__wt_strndup(session,
		    kb->data, kb->size, &cp->entries[cp->entries_next].k));
		WT_ERR(__wt_strndup(session,
		    vb->data, vb->size, &cp->entries[cp->entries_next].v));
		cp->entries[cp->entries_next].gen = cp->entries_next;
		++cp->entries_next;
	}
	/* Running out of configuration entries is how the walk ends. */
	WT_ERR_NOTFOUND_OK(ret);

err:	__wt_scr_free(session, &kb);
	__wt_scr_free(session, &vb);
	return (ret);
}
/*
 * __create_file --
 *	Create a new 'file:' object.
 *
 *	session:   the session.
 *	uri:       the object's URI, must start with "file:".
 *	exclusive: if non-zero, an already existing file is an EEXIST error;
 *	           otherwise an existing file is silently accepted.
 *	config:    the caller's configuration, layered on the file_meta base
 *	           configuration.
 *
 *	For anything but the metadata file itself, the file ID and version are
 *	appended to the configuration and the collapsed result is inserted
 *	into the metadata. Returns 0 on success, otherwise an error code.
 */
static int
__create_file(WT_SESSION_IMPL *session,
    const char *uri, int exclusive, const char *config)
{
	WT_DECL_ITEM(idbuf);
	WT_DECL_RET;
	uint32_t allocsize;
	int is_metadata;
	const char *filecfg[] =
	    { WT_CONFIG_BASE(session, file_meta), config, NULL, NULL };
	const char **slot;
	const char *filename;
	char *fileconf;

	fileconf = NULL;

	is_metadata = strcmp(uri, WT_METAFILE_URI) == 0;

	filename = uri;
	if (!WT_PREFIX_SKIP(filename, "file:")) {
		WT_RET_MSG(session, EINVAL, "Expected a 'file:' URI: %s", uri);
	}

	/*
	 * An ordinary file that is already in the metadata (or a failed
	 * metadata search) ends the call here; only "not found" continues.
	 */
	if (!is_metadata) {
		ret = __wt_metadata_search(session, uri, &fileconf);
		if (ret != WT_NOTFOUND) {
			if (exclusive) {
				WT_TRET(EEXIST);
			}
			goto err;
		}
	}

	/* Validate the allocation size before touching the disk. */
	WT_RET(__wt_direct_io_size_check(
	    session, filecfg, "allocation_size", &allocsize));

	/* Create the underlying file and note it for metadata tracking. */
	WT_ERR(__wt_block_manager_create(session, filename, allocsize));
	if (WT_META_TRACKING(session)) {
		WT_ERR(__wt_meta_track_fileop(session, NULL, uri));
	}

	/*
	 * Ordinary files get a file ID and the current version numbers added
	 * as one more configuration layer; the collapsed configuration is
	 * what goes into the metadata.
	 */
	if (!is_metadata) {
		WT_ERR(__wt_scr_alloc(session, 0, &idbuf));
		WT_ERR(__wt_buf_fmt(session, idbuf,
		    "id=%" PRIu32 ",version=(major=%d,minor=%d)",
		    ++S2C(session)->next_file_id,
		    WT_BTREE_MAJOR_VERSION_MAX, WT_BTREE_MINOR_VERSION_MAX));

		/* Put the new layer in the first free configuration slot. */
		slot = filecfg;
		while (*slot != NULL)
			++slot;
		*slot = idbuf->data;

		WT_ERR(__wt_config_collapse(session, filecfg, &fileconf));
		WT_ERR(__wt_metadata_insert(session, uri, fileconf));
	}

	/*
	 * Open the file to check it was set up correctly. No configuration is
	 * passed: the collapsed configuration was just written to the
	 * metadata and is read from there by the underlying functions.
	 *
	 * The handle stays exclusive until it is released at the end of the
	 * call, otherwise we could race with a drop.
	 */
	WT_ERR(__wt_session_get_btree(
	    session, uri, NULL, NULL, WT_DHANDLE_EXCLUSIVE));
	if (WT_META_TRACKING(session)) {
		WT_ERR(__wt_meta_track_handle_lock(session, 1));
	} else {
		WT_ERR(__wt_session_release_btree(session));
	}

err:	__wt_scr_free(session, &idbuf);
	__wt_free(session, fileconf);
	return (ret);
}
/*
 * __wt_meta_ckptlist_set --
 *	Set a file's checkpoint value from the WT_CKPT list.
 *
 *	session:  the session.
 *	fname:    the file whose checkpoint value is set.
 *	ckptbase: the checkpoint list; added or updated entries have their
 *	          address string, order and timestamp filled in here.
 *	ckptlsn:  if not NULL, appended as "checkpoint_lsn=(file,offset)".
 *
 *	Builds "checkpoint=(...)" from every checkpoint not flagged for
 *	deletion and hands the string to __ckpt_set. Returns 0 on success,
 *	otherwise an error code.
 */
int
__wt_meta_ckptlist_set(WT_SESSION_IMPL *session,
    const char *fname, WT_CKPT *ckptbase, WT_LSN *ckptlsn)
{
	WT_CKPT *ckpt;
	WT_DECL_ITEM(cfgbuf);
	WT_DECL_RET;
	time_t now;
	int64_t order_max;
	const char *delim;

	WT_ERR(__wt_scr_alloc(session, 0, &cfgbuf));
	order_max = 0;
	delim = "";
	WT_ERR(__wt_buf_fmt(session, cfgbuf, "checkpoint=("));
	WT_CKPT_FOREACH(ckptbase, ckpt) {
		/*
		 * Track the largest order seen so far: new checkpoints are
		 * given the next one. Internal checkpoint names have that
		 * generation appended so each name is unique, which covers
		 * two cases: checkpoints taken so close together that the
		 * timer doesn't distinguish them (or even appears to run
		 * backward), and re-creating a checkpoint with a previously
		 * used name while cursors are still open on the old one.
		 */
		if (ckpt->order > order_max)
			order_max = ckpt->order;

		/* Deleted checkpoints don't appear in the new value. */
		if (F_ISSET(ckpt, WT_CKPT_DELETE))
			continue;

		if (F_ISSET(ckpt, WT_CKPT_ADD | WT_CKPT_UPDATE)) {
			/*
			 * Handles in the middle of a bulk load get a fake
			 * checkpoint with no raw cookie; otherwise the raw
			 * cookie is converted to a hex string.
			 */
			if (ckpt->raw.size != 0) {
				WT_ERR(__wt_raw_to_hex(session,
				    ckpt->raw.data,
				    ckpt->raw.size, &ckpt->addr));
			} else
				ckpt->addr.size = 0;

			/* Only brand new checkpoints get a new order. */
			if (F_ISSET(ckpt, WT_CKPT_ADD))
				ckpt->order = ++order_max;

			/*
			 * XXX
			 * This assumes a time_t fits into a uintmax_t, which
			 * isn't guaranteed: a time_t has to be an arithmetic
			 * type, but not an integral type.
			 */
			__wt_seconds(session, &now);
			ckpt->sec = (uintmax_t)now;
		}

		/* Internal checkpoints carry their order in the name. */
		if (strcmp(ckpt->name, WT_CHECKPOINT) != 0) {
			WT_ERR(__wt_buf_catfmt(session, cfgbuf,
			    "%s%s=(addr=\"%.*s\",order=%" PRId64
			    ",time=%" PRIuMAX ",size=%" PRIu64
			    ",write_gen=%" PRIu64 ")",
			    delim, ckpt->name,
			    (int)ckpt->addr.size, (char *)ckpt->addr.data,
			    ckpt->order, ckpt->sec, ckpt->ckpt_size,
			    ckpt->write_gen));
		} else {
			WT_ERR(__wt_buf_catfmt(session, cfgbuf,
			    "%s%s.%" PRId64 "=(addr=\"%.*s\",order=%" PRId64
			    ",time=%" PRIuMAX ",size=%" PRIu64
			    ",write_gen=%" PRIu64 ")",
			    delim, ckpt->name, ckpt->order,
			    (int)ckpt->addr.size, (char *)ckpt->addr.data,
			    ckpt->order, ckpt->sec, ckpt->ckpt_size,
			    ckpt->write_gen));
		}
		delim = ",";
	}
	WT_ERR(__wt_buf_catfmt(session, cfgbuf, ")"));

	/* The checkpoint LSN, when supplied, follows the checkpoint list. */
	if (ckptlsn != NULL) {
		WT_ERR(__wt_buf_catfmt(session, cfgbuf,
		    ",checkpoint_lsn=(%" PRIu32 ",%" PRIuMAX ")",
		    ckptlsn->l.file, (uintmax_t)ckptlsn->l.offset));
	}

	WT_ERR(__ckpt_set(session, fname, cfgbuf->mem));

err:	__wt_scr_free(session, &cfgbuf);
	return (ret);
}
/*
 * __wt_curlog_open --
 *	Create and initialize a log cursor.
 *
 *	On success the new cursor is returned through cursorp. If the
 *	connection has a log, buffered records are forced out, the log archive
 *	read lock is taken and recorded on the cursor, and the connection's
 *	count of log cursors is incremented. On failure the cursor is closed
 *	and *cursorp is set to NULL.
 */
int
__wt_curlog_open(WT_SESSION_IMPL *session,
    const char *uri, const char *cfg[], WT_CURSOR **cursorp)
{
	WT_CONNECTION_IMPL *conn;
	WT_CURSOR_STATIC_INIT(iface,
	    __wt_cursor_get_key,		/* get-key */
	    __wt_cursor_get_value,		/* get-value */
	    __wt_cursor_set_key,		/* set-key */
	    __wt_cursor_set_value,		/* set-value */
	    __curlog_compare,			/* compare */
	    __wt_cursor_equals,			/* equals */
	    __curlog_next,			/* next */
	    __wt_cursor_notsup,			/* prev */
	    __curlog_reset,			/* reset */
	    __curlog_search,			/* search */
	    __wt_cursor_search_near_notsup,	/* search-near */
	    __wt_cursor_notsup,			/* insert */
	    __wt_cursor_modify_notsup,		/* modify */
	    __wt_cursor_notsup,			/* update */
	    __wt_cursor_notsup,			/* remove */
	    __wt_cursor_notsup,			/* reserve */
	    __wt_cursor_reconfigure_notsup,	/* reconfigure */
	    __wt_cursor_notsup,			/* cache */
	    __wt_cursor_reopen_notsup,		/* reopen */
	    __curlog_close);			/* close */
	WT_CURSOR *cursor;
	WT_CURSOR_LOG *logc;
	WT_DECL_RET;
	WT_LOG *logp;

	WT_STATIC_ASSERT(offsetof(WT_CURSOR_LOG, iface) == 0);

	conn = S2C(session);
	logp = conn->log;

	/* Allocate the cursor and fill in the generic part. */
	WT_RET(__wt_calloc_one(session, &logc));
	cursor = (WT_CURSOR *)logc;
	*cursor = iface;
	cursor->session = (WT_SESSION *)session;
	cursor->key_format = WT_LOGC_KEY_FORMAT;
	cursor->value_format = WT_LOGC_VALUE_FORMAT;

	/* Allocate the LSNs and the scratch buffers the cursor owns. */
	WT_ERR(__wt_calloc_one(session, &logc->cur_lsn));
	WT_ERR(__wt_calloc_one(session, &logc->next_lsn));
	WT_ERR(__wt_scr_alloc(session, 0, &logc->logrec));
	WT_ERR(__wt_scr_alloc(session, 0, &logc->opkey));
	WT_ERR(__wt_scr_alloc(session, 0, &logc->opvalue));
	WT_INIT_LSN(logc->cur_lsn);
	WT_INIT_LSN(logc->next_lsn);

	WT_ERR(__wt_cursor_init(cursor, uri, NULL, cfg, cursorp));

	if (logp != NULL) {
		/*
		 * Records may still be sitting in a buffer; push them out so
		 * a caller reading back something it just wrote can see it.
		 */
		WT_ERR(__wt_log_force_write(session, 1, NULL));

		/*
		 * An open log cursor holds off log archiving; flag the cursor
		 * as holding the lock and count it on the connection.
		 */
		__wt_readlock(session, &logp->log_archive_lock);
		F_SET(logc, WT_CURLOG_ARCHIVE_LOCK);
		(void)__wt_atomic_add32(&conn->log_cursors, 1);
	}

	return (ret);

err:	WT_TRET(__curlog_close(cursor));
	*cursorp = NULL;
	return (ret);
}