boost::optional<RecordId> WiredTigerRecordStore::oplogStartHack(
    OperationContext* txn, const RecordId& startingPosition) const {
    if (!_useOplogHack)
        return boost::none;

    {
        WiredTigerRecoveryUnit* wru = WiredTigerRecoveryUnit::get(txn);
        _oplogSetStartHack(wru);
    }

    WiredTigerCursor cursor(_uri, _tableId, true, txn);
    WT_CURSOR* c = cursor.get();

    int cmp;
    c->set_key(c, _makeKey(startingPosition));
    int ret = WT_OP_CHECK(c->search_near(c, &cmp));
    if (ret == 0 && cmp > 0)
        ret = c->prev(c);  // landed one higher than startingPosition
    if (ret == WT_NOTFOUND)
        return RecordId();  // nothing <= startingPosition
    invariantWTOK(ret);

    int64_t key;
    ret = c->get_key(c, &key);
    invariantWTOK(ret);
    return _fromKey(key);
}
Example #2
0
/*
 * __recovery_file_scan --
 *	Scan the files referenced from the metadata and gather information
 *	about them for recovery.
 */
static int
__recovery_file_scan(WT_RECOVERY *r)
{
	WT_CURSOR *c;
	WT_DECL_RET;
	int cmp;
	const char *uri, *config;

	/* Scan through all files in the metadata. */
	c = r->files[0].c;
	c->set_key(c, "file:");
	if ((ret = c->search_near(c, &cmp)) != 0) {
		/* Is the metadata empty? */
		WT_RET_NOTFOUND_OK(ret);
		return (0);
	}
	if (cmp < 0)
		WT_RET_NOTFOUND_OK(c->next(c));
	for (; ret == 0; ret = c->next(c)) {
		WT_RET(c->get_key(c, &uri));
		if (!WT_PREFIX_MATCH(uri, "file:"))
			break;
		WT_RET(c->get_value(c, &config));
		WT_RET(__recovery_setup_file(r, uri, config));
	}
	WT_RET_NOTFOUND_OK(ret);
	return (0);
}
Example #3
0
/*
 * __curindex_search_near --
 *	WT_CURSOR->search_near method for index cursors.
 */
static int
__curindex_search_near(WT_CURSOR *cursor, int *exact)
{
	WT_CURSOR *child;
	WT_CURSOR_INDEX *cindex;
	WT_DECL_RET;
	WT_ITEM found_key;
	WT_SESSION_IMPL *session;
	int cmp;

	cindex = (WT_CURSOR_INDEX *)cursor;
	child = cindex->child;
	JOINABLE_CURSOR_API_CALL(cursor, session, search, NULL);

	/*
	 * We are searching using the application-specified key, which
	 * (usually) doesn't contain the primary key, so it is just a prefix of
	 * any matching index key.  That said, if there is an exact match, we
	 * want to find the first matching index entry and set exact equal to
	 * zero.
	 *
	 * Do a search_near, and if we find an entry that is too small, step to
	 * the next one.  In the unlikely event of a search past the end of the
	 * tree, go back to the last key.
	 */
	__wt_cursor_set_raw_key(child, &cursor->key);
	WT_ERR(child->search_near(child, &cmp));

	if (cmp < 0) {
		if ((ret = child->next(child)) == WT_NOTFOUND)
			ret = child->prev(child);
		WT_ERR(ret);
	}

	/*
	 * We expect partial matches, and want the smallest record with a key
	 * greater than or equal to the search key.
	 *
	 * If the found key starts with the search key, we indicate a match by
	 * setting exact equal to zero.
	 *
	 * The compare function expects application-supplied keys to come first
	 * so we flip the sign of the result to match what callers expect.
	 */
	found_key = child->key;
	if (found_key.size > cursor->key.size)
		found_key.size = cursor->key.size;

	WT_ERR(__wt_compare(
	    session, cindex->index->collator, &cursor->key, &found_key, exact));
	*exact = -*exact;

	WT_ERR(__curindex_move(cindex));

	if (0) {
err:		F_CLR(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT);
	}

	API_END_RET(session, ret);
}
Example #4
0
/*
 * __curindex_search --
 *	WT_CURSOR->search method for index cursors.
 */
static int
__curindex_search(WT_CURSOR *cursor)
{
	WT_CURSOR *child;
	WT_CURSOR_INDEX *cindex;
	WT_DECL_RET;
	WT_ITEM found_key;
	WT_SESSION_IMPL *session;
	int cmp;

	cindex = (WT_CURSOR_INDEX *)cursor;
	child = cindex->child;
	JOINABLE_CURSOR_API_CALL(cursor, session, search, NULL);

	/*
	 * We are searching using the application-specified key, which
	 * (usually) doesn't contain the primary key, so it is just a prefix of
	 * any matching index key.  Do a search_near, step to the next entry if
	 * we land on one that is too small, then check that the prefix
	 * matches.
	 */
	__wt_cursor_set_raw_key(child, &cursor->key);
	WT_ERR(child->search_near(child, &cmp));

	if (cmp < 0)
		WT_ERR(child->next(child));

	/*
	 * We expect partial matches, and want the smallest record with a key
	 * greater than or equal to the search key.
	 *
	 * If the key we find is shorter than the search key, it can't possibly
	 * match.
	 *
	 * The only way for the key to be exactly equal is if there is an index
	 * on the primary key, because otherwise the primary key columns will
	 * be appended to the index key, but we don't disallow that (odd) case.
	 */
	found_key = child->key;
	if (found_key.size < cursor->key.size)
		WT_ERR(WT_NOTFOUND);
	found_key.size = cursor->key.size;

	WT_ERR(__wt_compare(
	    session, cindex->index->collator, &cursor->key, &found_key, &cmp));
	if (cmp != 0) {
		ret = WT_NOTFOUND;
		goto err;
	}

	WT_ERR(__curindex_move(cindex));

	if (0) {
err:		F_CLR(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT);
	}

	API_END_RET(session, ret);
}
    boost::optional<Record> next() final {
        if (_eof)
            return {};

        WT_CURSOR* c = _cursor->get();

        bool mustAdvance = true;
        if (_lastReturnedId.isNull() && !_forward && _rs._isCapped) {
            // In this case we need to seek to the highest visible record.
            const RecordId reverseCappedInitialSeekPoint =
                _readUntilForOplog.isNull() ? _rs.lowestCappedHiddenRecord() : _readUntilForOplog;

            if (!reverseCappedInitialSeekPoint.isNull()) {
                c->set_key(c, _makeKey(reverseCappedInitialSeekPoint));
                int cmp;
                int seekRet = WT_OP_CHECK(c->search_near(c, &cmp));
                if (seekRet == WT_NOTFOUND) {
                    _eof = true;
                    return {};
                }
                invariantWTOK(seekRet);

                // If we landed at or past the lowest hidden record, we must advance to be in
                // the visible range.
                mustAdvance = _rs.isCappedHidden(reverseCappedInitialSeekPoint)
                    ? (cmp >= 0)
                    : (cmp > 0);  // No longer hidden.
            }
        }

        if (mustAdvance) {
            // Nothing after the next line can throw WCEs.
            // Note that an unpositioned (or eof) WT_CURSOR returns the first/last entry in the
            // table when you call next/prev.
            int advanceRet = WT_OP_CHECK(_forward ? c->next(c) : c->prev(c));
            if (advanceRet == WT_NOTFOUND) {
                _eof = true;
                return {};
            }
            invariantWTOK(advanceRet);
        }

        int64_t key;
        invariantWTOK(c->get_key(c, &key));
        const RecordId id = _fromKey(key);

        if (!isVisible(id)) {
            _eof = true;
            return {};
        }

        WT_ITEM value;
        invariantWTOK(c->get_value(c, &value));

        _lastReturnedId = id;
        return {{id, {static_cast<const char*>(value.data), static_cast<int>(value.size)}}};
    }
    bool restore() final {
        if (!_cursor)
            _cursor.emplace(_rs.getURI(), _rs.tableId(), true, _txn);

        // This will ensure an active session exists, so any restored cursors will bind to it
        invariant(WiredTigerRecoveryUnit::get(_txn)->getSession(_txn) == _cursor->getSession());

        // If we've hit EOF, then this iterator is done and need not be restored.
        if (_eof)
            return true;

        if (_lastReturnedId.isNull())
            return true;

        WT_CURSOR* c = _cursor->get();
        c->set_key(c, _makeKey(_lastReturnedId));

        int cmp;
        int ret = WT_OP_CHECK(c->search_near(c, &cmp));
        if (ret == WT_NOTFOUND) {
            _eof = true;
            return !_rs._isCapped;
        }
        invariantWTOK(ret);

        if (cmp == 0)
            return true;  // Landed right where we left off.

        if (_rs._isCapped) {
            // Doc was deleted either by cappedDeleteAsNeeded() or cappedTruncateAfter().
            // It is important that we error out in this case so that consumers don't
            // silently get 'holes' when scanning capped collections. We don't make
            // this guarantee for normal collections so it is ok to skip ahead in that case.
            _eof = true;
            return false;
        }

        if (_forward && cmp > 0) {
            // We landed after where we were. Move back one so that next() will return this
            // document.
            ret = WT_OP_CHECK(c->prev(c));
        } else if (!_forward && cmp < 0) {
            // Do the opposite for reverse cursors.
            ret = WT_OP_CHECK(c->next(c));
        }
        if (ret != WT_NOTFOUND)
            invariantWTOK(ret);

        return true;
    }
Example #7
0
/*
 * __wt_meta_btree_apply --
 *	Apply a function to all files listed in the metadata, apart from the
 *	metadata file.
 */
int
__wt_meta_btree_apply(WT_SESSION_IMPL *session,
    int (*func)(WT_SESSION_IMPL *, const char *[]), const char *cfg[])
{
	WT_CURSOR *cursor;
	WT_DATA_HANDLE *saved_dhandle;
	WT_DECL_RET;
	const char *uri;
	int cmp, tret;

	saved_dhandle = session->dhandle;
	WT_RET(__wt_metadata_cursor(session, NULL, &cursor));
	cursor->set_key(cursor, "file:");
	if ((tret = cursor->search_near(cursor, &cmp)) == 0 && cmp < 0)
		tret = cursor->next(cursor);
	for (; tret == 0; tret = cursor->next(cursor)) {
		WT_ERR(cursor->get_key(cursor, &uri));
		if (!WT_PREFIX_MATCH(uri, "file:"))
			break;
		else if (strcmp(uri, WT_METAFILE_URI) == 0)
			continue;

		/*
		 * We need to pull the handle into the session handle cache
		 * and make sure it's referenced to stop other internal code
		 * dropping the handle (e.g in LSM when cleaning up obsolete
		 * chunks).  Holding the metadata lock isn't enough.
		 */
		ret = __wt_session_get_btree(session, uri, NULL, NULL, 0);
		if (ret == 0) {
			WT_SAVE_DHANDLE(session,
			    ret = func(session, cfg));
			if (WT_META_TRACKING(session))
				WT_TRET(
				    __wt_meta_track_handle_lock(session, 0));
			else
				WT_TRET(__wt_session_release_btree(session));
		} else if (ret == EBUSY)
			ret = __wt_conn_btree_apply_single(
			    session, uri, NULL, func, cfg);
		WT_ERR(ret);
	}

	if (tret != WT_NOTFOUND)
		WT_TRET(tret);
err:	WT_TRET(cursor->close(cursor));
	session->dhandle = saved_dhandle;
	return (ret);
}
Example #8
0
/*
 * __clsm_next --
 *	WT_CURSOR->next method for the LSM cursor type.
 */
static int
__clsm_next(WT_CURSOR *cursor)
{
	WT_CURSOR_LSM *clsm;
	WT_CURSOR *c;
	WT_DECL_RET;
	WT_SESSION_IMPL *session;
	u_int i;
	int cmp;
	bool check, deleted;

	clsm = (WT_CURSOR_LSM *)cursor;

	CURSOR_API_CALL(cursor, session, next, NULL);
	WT_CURSOR_NOVALUE(cursor);
	WT_ERR(__clsm_enter(clsm, false, false));

	/* If we aren't positioned for a forward scan, get started. */
	if (clsm->current == NULL || !F_ISSET(clsm, WT_CLSM_ITERATE_NEXT)) {
		F_CLR(clsm, WT_CLSM_MULTIPLE);
		WT_FORALL_CURSORS(clsm, c, i) {
			if (!F_ISSET(cursor, WT_CURSTD_KEY_SET)) {
				WT_ERR(c->reset(c));
				ret = c->next(c);
			} else if (c != clsm->current) {
				c->set_key(c, &cursor->key);
				if ((ret = c->search_near(c, &cmp)) == 0) {
					if (cmp < 0)
						ret = c->next(c);
					else if (cmp == 0) {
						if (clsm->current == NULL)
							clsm->current = c;
						else
							F_SET(clsm,
							    WT_CLSM_MULTIPLE);
					}
				} else
					F_CLR(c, WT_CURSTD_KEY_SET);
			}
			WT_ERR_NOTFOUND_OK(ret);
		}
		F_SET(clsm, WT_CLSM_ITERATE_NEXT);
		F_CLR(clsm, WT_CLSM_ITERATE_PREV);

		/* We just positioned *at* the key, now move. */
		if (clsm->current != NULL)
			goto retry;
	} else {
Example #9
0
    Iterator* WiredTigerEngine::Find(const Slice& findkey, const Options& options)
    {
        ContextHolder& holder = GetContextHolder();
        WT_SESSION* session = holder.session;
        if (NULL == session)
        {
            return NULL;
        }
        int ret, exact;
        holder.AddTranscRef();
        WT_CURSOR *cursor = create_wiredtiger_cursor(session);
        if (NULL == cursor)
        {
            holder.ReleaseTranscRef();
            return NULL;
        }
        WT_ITEM key_item;
        key_item.data = findkey.data();
        key_item.size = findkey.size();
        cursor->set_key(cursor, &key_item);

        if ((ret = cursor->search_near(cursor, &exact)) == 0)
        {
            if (exact < 0)
            {
                ret = cursor->next(cursor);
            }
            if (0 == ret)
            {
                return new WiredTigerIterator(this, cursor);
            }
        }
        DEBUG_LOG("Error find data for reason: %s", wiredtiger_strerror(ret));
        cursor->close(cursor);
        holder.ReleaseTranscRef();
        return NULL;
    }
Example #10
0
/*
 * __curds_search_near --
 *	WT_CURSOR.search_near method for the data-source cursor type.
 */
static int
__curds_search_near(WT_CURSOR *cursor, int *exact)
{
	WT_CURSOR *source;
	WT_DECL_RET;
	WT_SESSION_IMPL *session;

	source = ((WT_CURSOR_DATA_SOURCE *)cursor)->source;

	CURSOR_API_CALL(cursor, session, search_near, NULL);

	WT_STAT_CONN_INCR(session, cursor_search_near);
	WT_STAT_DATA_INCR(session, cursor_search_near);

	WT_ERR(__curds_txn_enter(session, false));

	WT_ERR(__curds_key_set(cursor));
	ret =
	    __curds_cursor_resolve(cursor, source->search_near(source, exact));

err:	__curds_txn_leave(session);

	API_END_RET(session, ret);
}
Example #11
0
/*
 * __schema_open_index --
 *	Open one or more indices for a table (internal version).
 */
static int
__schema_open_index(WT_SESSION_IMPL *session,
    WT_TABLE *table, const char *idxname, size_t len, WT_INDEX **indexp)
{
	WT_CURSOR *cursor;
	WT_DECL_ITEM(tmp);
	WT_DECL_RET;
	WT_INDEX *idx;
	u_int i;
	int cmp;
	bool match;
	const char *idxconf, *name, *tablename, *uri;

	/* Check if we've already done the work. */
	if (idxname == NULL && table->idx_complete)
		return (0);

	cursor = NULL;
	idx = NULL;
	match = false;

	/* Build a search key. */
	tablename = table->name;
	(void)WT_PREFIX_SKIP(tablename, "table:");
	WT_ERR(__wt_scr_alloc(session, 512, &tmp));
	WT_ERR(__wt_buf_fmt(session, tmp, "index:%s:", tablename));

	/* Find matching indices. */
	WT_ERR(__wt_metadata_cursor(session, &cursor));
	cursor->set_key(cursor, tmp->data);
	if ((ret = cursor->search_near(cursor, &cmp)) == 0 && cmp < 0)
		ret = cursor->next(cursor);
	for (i = 0; ret == 0; i++, ret = cursor->next(cursor)) {
		WT_ERR(cursor->get_key(cursor, &uri));
		name = uri;
		if (!WT_PREFIX_SKIP(name, tmp->data))
			break;

		/* Is this the index we are looking for? */
		match = idxname == NULL || WT_STRING_MATCH(name, idxname, len);

		/*
		 * Ensure there is space, including if we have to make room for
		 * a new entry in the middle of the list.
		 */
		WT_ERR(__wt_realloc_def(session, &table->idx_alloc,
		    WT_MAX(i, table->nindices) + 1, &table->indices));

		/* Keep the in-memory list in sync with the metadata. */
		cmp = 0;
		while (table->indices[i] != NULL &&
		    (cmp = strcmp(uri, table->indices[i]->name)) > 0) {
			/* Index no longer exists, remove it. */
			__wt_free(session, table->indices[i]);
			memmove(&table->indices[i], &table->indices[i + 1],
			    (table->nindices - i) * sizeof(WT_INDEX *));
			table->indices[--table->nindices] = NULL;
		}
		if (cmp < 0) {
			/* Make room for a new index. */
			memmove(&table->indices[i + 1], &table->indices[i],
			    (table->nindices - i) * sizeof(WT_INDEX *));
			table->indices[i] = NULL;
			++table->nindices;
		}

		if (!match)
			continue;

		if (table->indices[i] == NULL) {
			WT_ERR(cursor->get_value(cursor, &idxconf));
			WT_ERR(__wt_calloc_one(session, &idx));
			WT_ERR(__wt_strdup(session, uri, &idx->name));
			WT_ERR(__wt_strdup(session, idxconf, &idx->config));
			WT_ERR(__open_index(session, table, idx));

			/*
			 * If we're checking the creation of an index before a
			 * table is fully created, don't save the index: it
			 * will need to be reopened once the table is complete.
			 */
			if (!table->cg_complete) {
				WT_ERR(
				    __wt_schema_destroy_index(session, &idx));
				if (idxname != NULL)
					break;
				continue;
			}

			table->indices[i] = idx;
			idx = NULL;

			/*
			 * If the slot is bigger than anything else we've seen,
			 * bump the number of indices.
			 */
			if (i >= table->nindices)
				table->nindices = i + 1;
		}

		/* If we were looking for a single index, we're done. */
		if (indexp != NULL)
			*indexp = table->indices[i];
		if (idxname != NULL)
			break;
	}
	WT_ERR_NOTFOUND_OK(ret);
	if (idxname != NULL && !match)
		ret = WT_NOTFOUND;

	/* If we did a full pass, we won't need to do it again. */
	if (idxname == NULL) {
		table->nindices = i;
		table->idx_complete = true;
	}

err:	WT_TRET(__wt_metadata_cursor_release(session, &cursor));
	WT_TRET(__wt_schema_destroy_index(session, &idx));

	__wt_scr_free(session, &tmp);
	return (ret);
}
Example #12
0
/*
 * __las_page_instantiate --
 *	Instantiate lookaside update records in a recently read page.
 */
static int
__las_page_instantiate(WT_SESSION_IMPL *session,
    WT_REF *ref, uint32_t read_id, const uint8_t *addr, size_t addr_size)
{
	WT_CURSOR *cursor;
	WT_CURSOR_BTREE cbt;
	WT_DECL_ITEM(current_key);
	WT_DECL_ITEM(las_addr);
	WT_DECL_ITEM(las_key);
	WT_DECL_ITEM(las_value);
	WT_DECL_RET;
	WT_PAGE *page;
	WT_UPDATE *first_upd, *last_upd, *upd;
	size_t incr, total_incr;
	uint64_t current_recno, las_counter, las_txnid, recno, upd_txnid;
	uint32_t las_id, upd_size, session_flags;
	int exact;
	const uint8_t *p;

	cursor = NULL;
	page = ref->page;
	first_upd = last_upd = upd = NULL;
	total_incr = 0;
	current_recno = recno = WT_RECNO_OOB;
	session_flags = 0;		/* [-Werror=maybe-uninitialized] */

	__wt_btcur_init(session, &cbt);
	__wt_btcur_open(&cbt);

	WT_ERR(__wt_scr_alloc(session, 0, &current_key));
	WT_ERR(__wt_scr_alloc(session, 0, &las_addr));
	WT_ERR(__wt_scr_alloc(session, 0, &las_key));
	WT_ERR(__wt_scr_alloc(session, 0, &las_value));

	/* Open a lookaside table cursor. */
	WT_ERR(__wt_las_cursor(session, &cursor, &session_flags));

	/*
	 * The lookaside records are in key and update order, that is, there
	 * will be a set of in-order updates for a key, then another set of
	 * in-order updates for a subsequent key. We process all of the updates
	 * for a key and then insert those updates into the page, then all the
	 * updates for the next key, and so on.
	 *
	 * Search for the block's unique prefix, stepping through any matching
	 * records.
	 */
	las_addr->data = addr;
	las_addr->size = addr_size;
	las_key->size = 0;
	cursor->set_key(
	    cursor, read_id, las_addr, (uint64_t)0, (uint32_t)0, las_key);
	if ((ret = cursor->search_near(cursor, &exact)) == 0 && exact < 0)
		ret = cursor->next(cursor);
	for (; ret == 0; ret = cursor->next(cursor)) {
		WT_ERR(cursor->get_key(cursor,
		    &las_id, las_addr, &las_counter, &las_txnid, las_key));

		/*
		 * Confirm the search using the unique prefix; if not a match,
		 * we're done searching for records for this page.
		 */
		if (las_id != read_id ||
		    las_addr->size != addr_size ||
		    memcmp(las_addr->data, addr, addr_size) != 0)
			break;

		/*
		 * If the on-page value has become globally visible, this record
		 * is no longer needed.
		 */
		if (__wt_txn_visible_all(session, las_txnid))
			continue;

		/* Allocate the WT_UPDATE structure. */
		WT_ERR(cursor->get_value(
		    cursor, &upd_txnid, &upd_size, las_value));
		WT_ERR(__wt_update_alloc(session,
		    (upd_size == WT_UPDATE_DELETED_VALUE) ? NULL : las_value,
		    &upd, &incr));
		total_incr += incr;
		upd->txnid = upd_txnid;

		switch (page->type) {
		case WT_PAGE_COL_FIX:
		case WT_PAGE_COL_VAR:
			p = las_key->data;
			WT_ERR(__wt_vunpack_uint(&p, 0, &recno));
			if (current_recno == recno)
				break;
			WT_ASSERT(session, current_recno < recno);

			if (first_upd != NULL) {
				WT_ERR(__col_instantiate(session,
				    current_recno, ref, &cbt, first_upd));
				first_upd = NULL;
			}
			current_recno = recno;
			break;
		case WT_PAGE_ROW_LEAF:
			if (current_key->size == las_key->size &&
			    memcmp(current_key->data,
			    las_key->data, las_key->size) == 0)
				break;

			if (first_upd != NULL) {
				WT_ERR(__row_instantiate(session,
				    current_key, ref, &cbt, first_upd));
				first_upd = NULL;
			}
			WT_ERR(__wt_buf_set(session,
			    current_key, las_key->data, las_key->size));
			break;
		WT_ILLEGAL_VALUE_ERR(session);
		}

		/* Append the latest update to the list. */
		if (first_upd == NULL)
			first_upd = last_upd = upd;
		else {
			last_upd->next = upd;
			last_upd = upd;
		}
		upd = NULL;
	}
	WT_ERR_NOTFOUND_OK(ret);

	/* Insert the last set of updates, if any. */
	if (first_upd != NULL)
		switch (page->type) {
		case WT_PAGE_COL_FIX:
		case WT_PAGE_COL_VAR:
			WT_ERR(__col_instantiate(session,
			    current_recno, ref, &cbt, first_upd));
			first_upd = NULL;
			break;
		case WT_PAGE_ROW_LEAF:
			WT_ERR(__row_instantiate(session,
			    current_key, ref, &cbt, first_upd));
			first_upd = NULL;
			break;
		WT_ILLEGAL_VALUE_ERR(session);
		}

	/* Discard the cursor. */
	WT_ERR(__wt_las_cursor_close(session, &cursor, session_flags));

	if (total_incr != 0) {
		__wt_cache_page_inmem_incr(session, page, total_incr);

		/*
		 * We've modified/dirtied the page, but that's not necessary and
		 * if we keep the page clean, it's easier to evict. We leave the
		 * lookaside table updates in place, so if we evict this page
		 * without dirtying it, any future instantiation of it will find
		 * the records it needs. If the page is dirtied before eviction,
		 * then we'll write any needed lookaside table records for the
		 * new location of the page.
		 */
		__wt_page_modify_clear(session, page);
	}

err:	WT_TRET(__wt_las_cursor_close(session, &cursor, session_flags));
	WT_TRET(__wt_btcur_close(&cbt, 1));

	/*
	 * On error, upd points to a single unlinked WT_UPDATE structure,
	 * first_upd points to a list.
	 */
	if (upd != NULL)
		__wt_free(session, upd);
	if (first_upd != NULL)
		__wt_free_update_list(session, first_upd);

	__wt_scr_free(session, &current_key);
	__wt_scr_free(session, &las_addr);
	__wt_scr_free(session, &las_key);
	__wt_scr_free(session, &las_value);

	return (ret);
}
Example #13
0
/*
 * __wt_las_sweep --
 *	Sweep the lookaside table.
 */
int
__wt_las_sweep(WT_SESSION_IMPL *session)
{
	WT_CONNECTION_IMPL *conn;
	WT_CURSOR *cursor;
	WT_DECL_ITEM(las_addr);
	WT_DECL_ITEM(las_key);
	WT_DECL_RET;
	WT_ITEM *key;
	uint64_t cnt, las_counter, las_txnid;
	int64_t remove_cnt;
	uint32_t las_id, session_flags;
	int notused;

	conn = S2C(session);
	cursor = NULL;
	key = &conn->las_sweep_key;
	remove_cnt = 0;
	session_flags = 0;		/* [-Werror=maybe-uninitialized] */

	WT_ERR(__wt_scr_alloc(session, 0, &las_addr));
	WT_ERR(__wt_scr_alloc(session, 0, &las_key));

	WT_ERR(__wt_las_cursor(session, &cursor, &session_flags));

	/*
	 * If we're not starting a new sweep, position the cursor using the key
	 * from the last call (we don't care if we're before or after the key,
	 * just roughly in the same spot is fine).
	 */
	if (key->size != 0) {
		__wt_cursor_set_raw_key(cursor, key);
		ret = cursor->search_near(cursor, &notused);

		/*
		 * Don't search for the same key twice; if we don't set a new
		 * key below, it's because we've reached the end of the table
		 * and we want the next pass to start at the beginning of the
		 * table. Searching for the same key could leave us stuck at
		 * the end of the table, repeatedly checking the same rows.
		 */
		key->size = 0;
		if (ret != 0)
			goto srch_notfound;
	}

	/*
	 * The sweep server wakes up every 10 seconds (by default), it's a slow
	 * moving thread. Try to review the entire lookaside table once every 5
	 * minutes, or every 30 calls.
	 *
	 * The reason is because the lookaside table exists because we're seeing
	 * cache/eviction pressure (it allows us to trade performance and disk
	 * space for cache space), and it's likely lookaside blocks are being
	 * evicted, and reading them back in doesn't help things. A trickier,
	 * but possibly better, alternative might be to review all lookaside
	 * blocks in the cache in order to get rid of them, and slowly review
	 * lookaside blocks that have already been evicted.
	 */
	cnt = (uint64_t)WT_MAX(100, conn->las_record_cnt / 30);

	/* Discard pages we read as soon as we're done with them. */
	F_SET(session, WT_SESSION_NO_CACHE);

	/* Walk the file. */
	for (; cnt > 0 && (ret = cursor->next(cursor)) == 0; --cnt) {
		/*
		 * If the loop terminates after completing a work unit, we will
		 * continue the table sweep next time. Get a local copy of the
		 * sweep key, we're going to reset the cursor; do so before
		 * calling cursor.remove, cursor.remove can discard our hazard
		 * pointer and the page could be evicted from underneath us.
		 */
		if (cnt == 1) {
			WT_ERR(__wt_cursor_get_raw_key(cursor, key));
			if (!WT_DATA_IN_ITEM(key))
				WT_ERR(__wt_buf_set(
				    session, key, key->data, key->size));
		}

		WT_ERR(cursor->get_key(cursor,
		    &las_id, las_addr, &las_counter, &las_txnid, las_key));

		/*
		 * If the on-page record transaction ID associated with the
		 * record is globally visible, the record can be discarded.
		 *
		 * Cursor opened overwrite=true: won't return WT_NOTFOUND should
		 * another thread remove the record before we do, and the cursor
		 * remains positioned in that case.
		 */
		if (__wt_txn_visible_all(session, las_txnid)) {
			WT_ERR(cursor->remove(cursor));
			++remove_cnt;
		}
	}

srch_notfound:
	WT_ERR_NOTFOUND_OK(ret);

	if (0) {
err:		__wt_buf_free(session, key);
	}

	WT_TRET(__wt_las_cursor_close(session, &cursor, session_flags));

	/*
	 * If there were races to remove records, we can over-count.  All
	 * arithmetic is signed, so underflow isn't fatal, but check anyway so
	 * we don't skew low over time.
	 */
	if (remove_cnt > S2C(session)->las_record_cnt)
		S2C(session)->las_record_cnt = 0;
	else if (remove_cnt > 0)
		(void)__wt_atomic_subi64(&conn->las_record_cnt, remove_cnt);

	F_CLR(session, WT_SESSION_NO_CACHE);

	__wt_scr_free(session, &las_addr);
	__wt_scr_free(session, &las_key);

	return (ret);
}
Example #14
0
/*
 * __wt_las_sweep --
 *	Sweep the lookaside table.
 */
int
__wt_las_sweep(WT_SESSION_IMPL *session)
{
	WT_CONNECTION_IMPL *conn;
	WT_CURSOR *cursor;
	WT_DECL_ITEM(las_addr);
	WT_DECL_ITEM(las_key);
	WT_DECL_RET;
	WT_ITEM *key;
	uint64_t cnt, las_counter, las_txnid;
	uint32_t las_id, session_flags;
	int notused;

	conn = S2C(session);
	cursor = NULL;
	key = &conn->las_sweep_key;
	session_flags = 0;		/* [-Werror=maybe-uninitialized] */

	WT_ERR(__wt_scr_alloc(session, 0, &las_addr));
	WT_ERR(__wt_scr_alloc(session, 0, &las_key));

	WT_ERR(__wt_las_cursor(session, &cursor, &session_flags));

	/*
	 * If we're not starting a new sweep, position the cursor using the key
	 * from the last call (we don't care if we're before or after the key,
	 * just roughly in the same spot is fine).
	 */
	if (conn->las_sweep_call != 0 && key->data != NULL) {
		__wt_cursor_set_raw_key(cursor, key);
		if ((ret = cursor->search_near(cursor, &notused)) != 0)
			goto srch_notfound;
	}

	/*
	 * The sweep server wakes up every 10 seconds (by default), it's a slow
	 * moving thread. Try to review the entire lookaside table once every 5
	 * minutes, or every 30 calls.
	 *
	 * The reason is because the lookaside table exists because we're seeing
	 * cache/eviction pressure (it allows us to trade performance and disk
	 * space for cache space), and it's likely lookaside blocks are being
	 * evicted, and reading them back in doesn't help things. A trickier,
	 * but possibly better, alternative might be to review all lookaside
	 * blocks in the cache in order to get rid of them, and slowly review
	 * lookaside blocks that have already been evicted.
	 *
	 * We can't know for sure how many records are in the lookaside table,
	 * the cursor insert and remove statistics aren't updated atomically.
	 * Start with reviewing 100 rows, and if it takes more than the target
	 * number of calls to finish, increase the number of rows checked on
	 * each call; if it takes less than the target calls to finish, then
	 * decrease the number of rows reviewed on each call (but never less
	 * than 100).
	 */
#define	WT_SWEEP_LOOKASIDE_MIN_CNT	100
#define	WT_SWEEP_LOOKASIDE_PASS_TARGET	 30
	++conn->las_sweep_call;
	if ((cnt = conn->las_sweep_cnt) < WT_SWEEP_LOOKASIDE_MIN_CNT)
		cnt = conn->las_sweep_cnt = WT_SWEEP_LOOKASIDE_MIN_CNT;

	/* Walk the file. */
	for (; cnt > 0 && (ret = cursor->next(cursor)) == 0; --cnt) {
		/*
		 * If the loop terminates after completing a work unit, we will
		 * continue the table sweep next time. Get a local copy of the
		 * sweep key, we're going to reset the cursor; do so before
		 * calling cursor.remove, cursor.remove can discard our hazard
		 * pointer and the page could be evicted from underneath us.
		 */
		if (cnt == 1) {
			WT_ERR(__wt_cursor_get_raw_key(cursor, key));
			if (!WT_DATA_IN_ITEM(key))
				WT_ERR(__wt_buf_set(
				    session, key, key->data, key->size));
		}

		WT_ERR(cursor->get_key(cursor,
		    &las_id, las_addr, &las_counter, &las_txnid, las_key));

		/*
		 * If the on-page record transaction ID associated with the
		 * record is globally visible, the record can be discarded.
		 *
		 * Cursor opened overwrite=true: won't return WT_NOTFOUND should
		 * another thread remove the record before we do, and the cursor
		 * remains positioned in that case.
		 */
		if (__wt_txn_visible_all(session, las_txnid))
			WT_ERR(cursor->remove(cursor));
	}

	/*
	 * When reaching the lookaside table end or the target number of calls,
	 * adjust the row count. Decrease/increase the row count depending on
	 * if the number of calls is less/more than the target.
	 */
	if (ret == WT_NOTFOUND ||
	    conn->las_sweep_call > WT_SWEEP_LOOKASIDE_PASS_TARGET) {
		if (conn->las_sweep_call < WT_SWEEP_LOOKASIDE_PASS_TARGET &&
		    conn->las_sweep_cnt > WT_SWEEP_LOOKASIDE_MIN_CNT)
			conn->las_sweep_cnt -= WT_SWEEP_LOOKASIDE_MIN_CNT;
		if (conn->las_sweep_call > WT_SWEEP_LOOKASIDE_PASS_TARGET)
			conn->las_sweep_cnt += WT_SWEEP_LOOKASIDE_MIN_CNT;
	}

srch_notfound:
	if (ret == WT_NOTFOUND)
		conn->las_sweep_call = 0;

	WT_ERR_NOTFOUND_OK(ret);

	if (0) {
err:		__wt_buf_free(session, key);
	}

	WT_TRET(__wt_las_cursor_close(session, &cursor, session_flags));

	__wt_scr_free(session, &las_addr);
	__wt_scr_free(session, &las_key);

	return (ret);
}
int main(void)
{
	int count, exact, ret;
	WT_CONNECTION *conn;
	WT_SESSION *session;
	WT_CURSOR *cursor;
	CUSTOMER cust, *custp, cust_sample[] = {
		{ 0, "Professor Oak", "LeafGreen Avenue", "123-456-7890" },
		{ 0, "Lorelei", "Sevii Islands", "098-765-4321" },
		{ 0, NULL, NULL, NULL }
	};
	CALL call, *callp, call_sample[] = {
		{ 0, 32, 1, 2, "billing", "unavailable" },
		{ 0, 33, 1, 2, "billing", "available" },
		{ 0, 34, 1, 2, "reminder", "unavailable" },
		{ 0, 35, 1, 2, "reminder", "available" },
		{ 0, 0, 0, 0, NULL, NULL }
	};

	ret = wiredtiger_open(home, NULL, "create", &conn);
	if (ret != 0) {
		fprintf(stderr, "Error connecting to %s: %s\n",
		    home, wiredtiger_strerror(ret));
		return (1);
	}
	/* Note: further error checking omitted for clarity. */

	/*! [call-center work] */
	ret = conn->open_session(conn, NULL, NULL, &session);

	/*
	 * Create the customers table, give names and types to the columns.
	 * The columns will be stored in two groups: "main" and "address",
	 * created below.
	 */
	ret = session->create(session, "table:customers",
	    "key_format=r,"
	    "value_format=SSS,"
	    "columns=(id,name,address,phone),"
	    "colgroups=(main,address)");

	/* Create the main column group with value columns except address. */
	ret = session->create(session,
	    "colgroup:customers:main", "columns=(name,phone)");

	/* Create the address column group with just the address. */
	ret = session->create(session,
	    "colgroup:customers:address", "columns=(address)");

	/* Create an index on the customer table by phone number. */
	ret = session->create(session,
	    "index:customers:phone", "columns=(phone)");

	/* Populate the customers table with some data. */
	ret = session->open_cursor(
	    session, "table:customers", NULL, "append", &cursor);
	for (custp = cust_sample; custp->name != NULL; custp++) {
		cursor->set_value(cursor,
		    custp->name, custp->address, custp->phone);
		ret = cursor->insert(cursor);
	}
	ret = cursor->close(cursor);

	/*
	 * Create the calls table, give names and types to the columns.  All the
	 * columns will be stored together, so no column groups are declared.
	 */
	ret = session->create(session, "table:calls",
	    "key_format=r,"
	    "value_format=qrrSS,"
	    "columns=(id,call_date,cust_id,emp_id,call_type,notes)");

	/*
	 * Create an index on the calls table with a composite key of cust_id
	 * and call_date.
	 */
	ret = session->create(session, "index:calls:cust_date",
	    "columns=(cust_id,call_date)");

	/* Populate the calls table with some data. */
	ret = session->open_cursor(
	    session, "table:calls", NULL, "append", &cursor);
	for (callp = call_sample; callp->call_type != NULL; callp++) {
		cursor->set_value(cursor, callp->call_date, callp->cust_id,
		    callp->emp_id, callp->call_type, callp->notes);
		ret = cursor->insert(cursor);
	}
	ret = cursor->close(cursor);

	/*
	 * First query: a call arrives.  In SQL:
	 *
	 * SELECT id, name FROM Customers WHERE phone=?
	 *
	 * Use the cust_phone index, lookup by phone number to fill the
	 * customer record.  The cursor will have a key format of "S" for a
	 * string because the cust_phone index has a single column ("phone"),
	 * which is of type "S".
	 *
	 * Specify the columns we want: the customer ID and the name.  This
	 * means the cursor's value format will be "rS".
	 */
	ret = session->open_cursor(session,
	    "index:customers:phone(id,name)", NULL, NULL, &cursor);
	cursor->set_key(cursor, "123-456-7890");
	ret = cursor->search(cursor);
	if (ret == 0) {
		ret = cursor->get_value(cursor, &cust.id, &cust.name);
		printf("Read customer record for %s (ID %" PRIu64 ")\n",
		    cust.name, cust.id);
	}
	ret = cursor->close(cursor);

	/*
	 * Next query: get the recent order history.  In SQL:
	 *
	 * SELECT * FROM Calls WHERE cust_id=? ORDER BY call_date DESC LIMIT 3
	 *
	 * Use the call_cust_date index to find the matching calls.  Since it is
	 * is in increasing order by date for a given customer, we want to start
	 * with the last record for the customer and work backwards.
	 *
	 * Specify a subset of columns to be returned.  (Note that if these were
	 * all covered by the index, the primary would not have to be accessed.)
	 * Stop after getting 3 records.
	 */
	ret = session->open_cursor(session,
	    "index:calls:cust_date(cust_id,call_type,notes)",
	    NULL, NULL, &cursor);

	/*
	 * The keys in the index are (cust_id,call_date) -- we want the largest
	 * call date for a given cust_id.  Search for (cust_id+1,0), then work
	 * backwards.
	 */
	cust.id = 1;
	cursor->set_key(cursor, cust.id + 1, 0);
	ret = cursor->search_near(cursor, &exact);

	/*
	 * If the table is empty, search_near will return WT_NOTFOUND, else the
	 * cursor will be positioned on a matching key if one exists, or an
	 * adjacent key if one does not.  If the positioned key is equal to or
	 * larger than the search key, go back one.
	 */
	if (ret == 0 && exact >= 0)
		ret = cursor->prev(cursor);
	for (count = 0; ret == 0 && count < 3; ++count) {
		ret = cursor->get_value(cursor,
		    &call.cust_id, &call.call_type, &call.notes);
		if (call.cust_id != cust.id)
			break;
		printf("Call record: customer %" PRIu64 " (%s: %s)\n",
		    call.cust_id, call.call_type, call.notes);
		ret = cursor->prev(cursor);
	}
	/*! [call-center work] */

	ret = conn->close(conn, NULL);

	return (ret);
}