Example #1
/*
 * __wt_curjoin_join --
 *	Add a new join to a join cursor.
 */
int
__wt_curjoin_join(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin,
    WT_INDEX *idx, WT_CURSOR *ref_cursor, uint8_t flags, uint8_t range,
    uint64_t count, uint32_t bloom_bit_count, uint32_t bloom_hash_count)
{
	WT_CURSOR_JOIN_ENTRY *entry;
	WT_DECL_RET;
	WT_CURSOR_JOIN_ENDPOINT *end, *newend;
	bool hasins, needbloom, range_eq;
	u_int i, ins, nonbloom;
	const char *raw_cfg[] = { WT_CONFIG_BASE(
	    session, WT_SESSION_open_cursor), "raw", NULL };
	char *main_uri;
	size_t namesize, newsize;

	entry = NULL;
	hasins = needbloom = false;
	ins = 0; /* -Wuninitialized */
	main_uri = NULL;
	nonbloom = 0; /* -Wuninitialized */
	namesize = strlen(cjoin->table->name);

	for (i = 0; i < cjoin->entries_next; i++) {
		if (cjoin->entries[i].index == idx) {
			entry = &cjoin->entries[i];
			break;
		}
		if (!needbloom && i > 0 &&
		    !F_ISSET(&cjoin->entries[i], WT_CURJOIN_ENTRY_BLOOM)) {
			needbloom = true;
			nonbloom = i;
		}
	}
	if (entry == NULL) {
		WT_ERR(__wt_realloc_def(session, &cjoin->entries_allocated,
		    cjoin->entries_next + 1, &cjoin->entries));
		if (LF_ISSET(WT_CURJOIN_ENTRY_BLOOM) && needbloom) {
			/*
			 * Reorder the list so that after the first entry,
			 * the Bloom filtered entries come next, followed by
			 * the non-Bloom entries.  Once the Bloom filters
			 * are built, determining membership via Bloom is
			 * faster than without Bloom, so we can answer
			 * membership questions more quickly, and with less
			 * I/O, with the Bloom entries first.
			 */
			entry = &cjoin->entries[nonbloom];
			memmove(entry + 1, entry,
			    (cjoin->entries_next - nonbloom) *
			    sizeof(WT_CURSOR_JOIN_ENTRY));
			memset(entry, 0, sizeof(WT_CURSOR_JOIN_ENTRY));
		} else
			entry = &cjoin->entries[cjoin->entries_next];
		entry->index = idx;
		entry->flags = flags;
		entry->count = count;
		entry->bloom_bit_count = bloom_bit_count;
		entry->bloom_hash_count = bloom_hash_count;
		++cjoin->entries_next;
	} else {
		/* Merge the join into an existing entry for this index */
		if (count != 0 && entry->count != 0 && entry->count != count)
			WT_ERR_MSG(session, EINVAL,
			    "count=%" PRIu64 " does not match "
			    "previous count=%" PRIu64 " for this index",
			    count, entry->count);
		if (LF_MASK(WT_CURJOIN_ENTRY_BLOOM) !=
		    F_MASK(entry, WT_CURJOIN_ENTRY_BLOOM))
			WT_ERR_MSG(session, EINVAL,
			    "join has incompatible strategy "
			    "values for the same index");

		/*
		 * Check against other comparisons (we call them endpoints)
		 * already set up for this index.
		 * We allow either:
		 *   - one or more "eq" (with disjunction)
		 *   - exactly one "eq" (with conjunction)
		 *   - exactly one of "gt" or "ge" (conjunction or disjunction)
		 *   - exactly one of "lt" or "le" (conjunction or disjunction)
		 *   - one of "gt"/"ge" along with one of "lt"/"le"
		 *         (currently restricted to conjunction).
		 *
		 * Some other combinations, although expressible, either do
		 * not make sense (X == 3 AND X == 5) or are reducible (X < 7
		 * AND X < 9).  Other specific cases, such as (X < 7 OR
		 * X > 15) or (X == 4 OR X > 15), make sense but are not yet
		 * handled.
		 */
		for (i = 0; i < entry->ends_next; i++) {
			end = &entry->ends[i];
			range_eq = (range == WT_CURJOIN_END_EQ);
			if ((F_ISSET(end, WT_CURJOIN_END_GT) &&
			    ((range & WT_CURJOIN_END_GT) != 0 || range_eq)) ||
			    (F_ISSET(end, WT_CURJOIN_END_LT) &&
			    ((range & WT_CURJOIN_END_LT) != 0 || range_eq)) ||
			    (end->flags == WT_CURJOIN_END_EQ &&
			    (range & (WT_CURJOIN_END_LT | WT_CURJOIN_END_GT))
			    != 0))
				WT_ERR_MSG(session, EINVAL,
				    "join has overlapping ranges");
			if (range == WT_CURJOIN_END_EQ &&
			    end->flags == WT_CURJOIN_END_EQ &&
			    !F_ISSET(entry, WT_CURJOIN_ENTRY_DISJUNCTION))
				WT_ERR_MSG(session, EINVAL,
				    "compare=eq can only be combined "
				    "using operation=or");

			/*
			 * Sort "gt"/"ge" to the front, followed by any number
			 * of "eq", and finally "lt"/"le".
			 */
			if (!hasins &&
			    ((range & WT_CURJOIN_END_GT) != 0 ||
			    (range == WT_CURJOIN_END_EQ &&
			    !F_ISSET(end, WT_CURJOIN_END_GT)))) {
				ins = i;
				hasins = true;
			}
		}
		/* All checks completed, merge any new configuration now */
		entry->count = count;
		entry->bloom_bit_count =
		    WT_MAX(entry->bloom_bit_count, bloom_bit_count);
		entry->bloom_hash_count =
		    WT_MAX(entry->bloom_hash_count, bloom_hash_count);
	}
	WT_ERR(__wt_realloc_def(session, &entry->ends_allocated,
	    entry->ends_next + 1, &entry->ends));
	if (!hasins)
		ins = entry->ends_next;
	newend = &entry->ends[ins];
	memmove(newend + 1, newend,
	    (entry->ends_next - ins) * sizeof(WT_CURSOR_JOIN_ENDPOINT));
	memset(newend, 0, sizeof(WT_CURSOR_JOIN_ENDPOINT));
	entry->ends_next++;
	newend->cursor = ref_cursor;
	F_SET(newend, range);

	/* Open the main file with a projection of the indexed columns. */
	if (entry->main == NULL && entry->index != NULL) {
		namesize = strlen(cjoin->table->name);
		newsize = namesize + entry->index->colconf.len + 1;
		WT_ERR(__wt_calloc(session, 1, newsize, &main_uri));
		snprintf(main_uri, newsize, "%s%.*s",
		    cjoin->table->name, (int)entry->index->colconf.len,
		    entry->index->colconf.str);
		WT_ERR(__wt_open_cursor(session, main_uri,
		    (WT_CURSOR *)cjoin, raw_cfg, &entry->main));
	}

err:	if (main_uri != NULL)
		__wt_free(session, main_uri);
	return (ret);
}
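
A minimal usage sketch (not from the original source) of how this function is reached through the public WT_SESSION::join API; the table "main", index "idx", and integer key are hypothetical:

/* Join an index cursor into a join cursor and iterate the results. */
static int
join_example(WT_SESSION *session)
{
	WT_CURSOR *idx_cursor, *join_cursor;
	int ret;

	if ((ret = session->open_cursor(session,
	    "join:table:main", NULL, NULL, &join_cursor)) != 0)
		return (ret);
	if ((ret = session->open_cursor(session,
	    "index:main:idx", NULL, NULL, &idx_cursor)) != 0)
		return (ret);

	/* Position the reference cursor on the endpoint value. */
	idx_cursor->set_key(idx_cursor, 42);
	if ((ret = idx_cursor->search(idx_cursor)) != 0)
		return (ret);

	/* compare/count/strategy map onto the range/count/flags above. */
	if ((ret = session->join(session, join_cursor, idx_cursor,
	    "compare=ge,count=10,strategy=bloom,bloom_bit_count=16")) != 0)
		return (ret);

	while ((ret = join_cursor->next(join_cursor)) == 0) {
		/* Process each matching row. */
	}
	return (ret == WT_NOTFOUND ? 0 : ret);
}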
Example #2
/*
 * __wt_connection_close --
 *	Close a connection handle.
 */
int
__wt_connection_close(WT_CONNECTION_IMPL *conn)
{
	WT_CONNECTION *wt_conn;
	WT_DECL_RET;
	WT_DLH *dlh;
	WT_SESSION_IMPL *s, *session;
	u_int i;

	wt_conn = &conn->iface;
	session = conn->default_session;

	/* Shut down the subsystems, ensuring workers see the state change. */
	F_SET(conn, WT_CONN_CLOSING);
	WT_FULL_BARRIER();

	/*
	 * Clear any pending async operations and shut down the async worker
	 * threads and system before closing LSM.
	 */
	WT_TRET(__wt_async_flush(session));
	WT_TRET(__wt_async_destroy(session));

	/*
	 * Shut down server threads other than the eviction server, which is
	 * needed later to close btree handles.  Some of these threads access
	 * btree handles, so take care in ordering shutdown to make sure they
	 * exit before files are closed.
	 */
	WT_TRET(__wt_lsm_manager_destroy(session));

	/*
	 * Once the async and LSM threads exit, we shouldn't be opening any
	 * more files.
	 */
	F_SET(conn, WT_CONN_CLOSING_NO_MORE_OPENS);
	WT_FULL_BARRIER();

	WT_TRET(__wt_checkpoint_server_destroy(session));
	WT_TRET(__wt_statlog_destroy(session, true));
	WT_TRET(__wt_sweep_destroy(session));

	/* The eviction server is shut down last. */
	WT_TRET(__wt_evict_destroy(session));

	/* Close open data handles. */
	WT_TRET(__wt_conn_dhandle_discard(session));

	/* Shut down metadata tracking. */
	WT_TRET(__wt_meta_track_destroy(session));

	/*
	 * Now that all data handles are closed, tell logging that a checkpoint
	 * has completed then shut down the log manager (only after closing
	 * data handles).  The call to destroy the log manager is outside the
	 * conditional because we allocate the log path so that printlog can
	 * run without running logging or recovery.
	 */
	if (ret == 0 && FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED) &&
	    FLD_ISSET(conn->log_flags, WT_CONN_LOG_RECOVER_DONE))
		WT_TRET(__wt_txn_checkpoint_log(
		    session, true, WT_TXN_LOG_CKPT_STOP, NULL));
	WT_TRET(__wt_logmgr_destroy(session));

	/* Free memory for collators, compressors, data sources. */
	WT_TRET(__wt_conn_remove_collator(session));
	WT_TRET(__wt_conn_remove_compressor(session));
	WT_TRET(__wt_conn_remove_data_source(session));
	WT_TRET(__wt_conn_remove_encryptor(session));
	WT_TRET(__wt_conn_remove_extractor(session));

	/* Disconnect from shared cache - must be before cache destroy. */
	WT_TRET(__wt_conn_cache_pool_destroy(session));

	/* Discard the cache. */
	WT_TRET(__wt_cache_destroy(session));

	/* Discard transaction state. */
	__wt_txn_global_destroy(session);

	/* Close the lock file, opening up the database to other connections. */
	if (conn->lock_fh != NULL)
		WT_TRET(__wt_close(session, &conn->lock_fh));

	/* Close any optrack files */
	if (session->optrack_fh != NULL)
		WT_TRET(__wt_close(session, &session->optrack_fh));

	/* Close operation tracking */
	WT_TRET(__wt_conn_optrack_teardown(session, false));

	/* Close any file handles left open. */
	WT_TRET(__wt_close_connection_close(session));

	/*
	 * Close the internal (default) session, and switch back to the dummy
	 * session in case of any error messages from the remaining operations
	 * while destroying the connection handle.
	 */
	if (session != &conn->dummy_session) {
		WT_TRET(session->iface.close(&session->iface, NULL));
		session = conn->default_session = &conn->dummy_session;
	}

	/*
	 * The session split stash, hazard information and handle arrays aren't
	 * discarded during normal session close, they persist past the life of
	 * the session. Discard them now.
	 */
	if (!F_ISSET(conn, WT_CONN_LEAK_MEMORY))
		if ((s = conn->sessions) != NULL)
			for (i = 0; i < conn->session_size; ++s, ++i) {
				__wt_free(session, s->cursor_cache);
				__wt_free(session, s->dhhash);
				__wt_stash_discard_all(session, s);
				__wt_free(session, s->hazard);
			}

	/* Destroy the file-system configuration. */
	if (conn->file_system != NULL && conn->file_system->terminate != NULL)
		WT_TRET(conn->file_system->terminate(
		    conn->file_system, (WT_SESSION *)session));

	/* Close extensions, first calling any unload entry point. */
	while ((dlh = TAILQ_FIRST(&conn->dlhqh)) != NULL) {
		TAILQ_REMOVE(&conn->dlhqh, dlh, q);

		if (dlh->terminate != NULL)
			WT_TRET(dlh->terminate(wt_conn));
		WT_TRET(__wt_dlclose(session, dlh));
	}

	/* Destroy the handle. */
	__wt_connection_destroy(conn);

	return (ret);
}
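
A minimal sketch (not from the original source) of the public entry point that drives this teardown; __wt_connection_close runs under WT_CONNECTION::close:

#include <wiredtiger.h>

/* Open a database, use it, and close it; error handling abbreviated. */
static int
open_close_example(const char *home)
{
	WT_CONNECTION *conn;
	int ret;

	if ((ret = wiredtiger_open(home, NULL, "create", &conn)) != 0)
		return (ret);
	/* ... sessions and cursors are used here ... */
	return (conn->close(conn, NULL));
}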
Example #3
/*
 * __lsm_bloom_create --
 *	Create a bloom filter for a chunk of the LSM tree that has been
 *	checkpointed but not yet been merged.
 */
static int
__lsm_bloom_create(WT_SESSION_IMPL *session,
    WT_LSM_TREE *lsm_tree, WT_LSM_CHUNK *chunk, u_int chunk_off)
{
	WT_BLOOM *bloom;
	WT_CURSOR *src;
	WT_DECL_RET;
	WT_ITEM key;
	uint64_t insert_count;

	WT_RET(__wt_lsm_tree_setup_bloom(session, lsm_tree, chunk));

	bloom = NULL;
	/*
	 * This is merge-like activity, and we don't want compacts to give up
	 * because we are creating a bunch of bloom filters before merging.
	 */
	++lsm_tree->merge_progressing;
	WT_RET(__wt_bloom_create(session, chunk->bloom_uri,
	    lsm_tree->bloom_config, chunk->count,
	    lsm_tree->bloom_bit_count, lsm_tree->bloom_hash_count, &bloom));

	/* Open a special merge cursor just on this chunk. */
	WT_ERR(__wt_open_cursor(session, lsm_tree->name, NULL, NULL, &src));
	F_SET(src, WT_CURSTD_RAW);
	WT_ERR(__wt_clsm_init_merge(src, chunk_off, chunk->id, 1));

	/*
	 * Set up so that we don't hold pages we read into cache, and so
	 * that we don't get stuck if the cache is full. If we allow
	 * ourselves to get stuck creating bloom filters, the entire tree
	 * can stall since there may be no worker threads available to flush.
	 */
	F_SET(session, WT_SESSION_NO_CACHE | WT_SESSION_NO_EVICTION);
	for (insert_count = 0; (ret = src->next(src)) == 0; insert_count++) {
		WT_ERR(src->get_key(src, &key));
		__wt_bloom_insert(bloom, &key);
	}
	WT_ERR_NOTFOUND_OK(ret);
	WT_TRET(src->close(src));

	WT_TRET(__wt_bloom_finalize(bloom));
	WT_ERR(ret);

	F_CLR(session, WT_SESSION_NO_CACHE);

	/* Load the new Bloom filter into cache. */
	WT_CLEAR(key);
	WT_ERR_NOTFOUND_OK(__wt_bloom_get(bloom, &key));

	__wt_verbose(session, WT_VERB_LSM,
	    "LSM worker created bloom filter %s. "
	    "Expected %" PRIu64 " items, got %" PRIu64,
	    chunk->bloom_uri, chunk->count, insert_count);

	/* Ensure the bloom filter is in the metadata. */
	__wt_lsm_tree_writelock(session, lsm_tree);
	F_SET(chunk, WT_LSM_CHUNK_BLOOM);
	ret = __wt_lsm_meta_write(session, lsm_tree);
	++lsm_tree->dsk_gen;
	__wt_lsm_tree_writeunlock(session, lsm_tree);

	if (ret != 0)
		WT_ERR_MSG(session, ret, "LSM bloom worker metadata write");

err:	if (bloom != NULL)
		WT_TRET(__wt_bloom_close(bloom));
	F_CLR(session, WT_SESSION_NO_CACHE | WT_SESSION_NO_EVICTION);
	return (ret);
}
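
For intuition about the bloom_bit_count/bloom_hash_count parameters above, a back-of-the-envelope sketch (not from the original source): with b bits per item and k hash functions, the expected false-positive rate is approximately (1 - e^(-k/b))^k; for example, 16 bits and 8 hashes give roughly 0.06%.

#include <math.h>

/* Approximate Bloom filter false-positive rate for b bits/item, k hashes. */
static double
bloom_false_positive_rate(unsigned bits_per_item, unsigned hash_count)
{
	double k = (double)hash_count;

	return (pow(1.0 - exp(-k / (double)bits_per_item), k));
}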
Example #4
/*
 * __wt_block_read_off --
 *	Read an addr/size pair referenced block into a buffer.
 */
int
__wt_block_read_off(WT_SESSION_IMPL *session,
    WT_BLOCK *block, WT_ITEM *buf, off_t offset, uint32_t size, uint32_t cksum)
{
	WT_BLOCK_HEADER *blk;
	uint32_t alloc_size, page_cksum;

	WT_VERBOSE_RET(session, read,
	    "off %" PRIuMAX ", size %" PRIu32 ", cksum %" PRIu32,
	    (uintmax_t)offset, size, cksum);

#ifdef HAVE_DIAGNOSTIC
	/*
	 * In diagnostic mode, verify the block we're about to read isn't on
	 * either the available or discard lists.
	 *
	 * Don't check during salvage, it's possible we're reading an already
	 * freed overflow page.
	 */
	if (!F_ISSET(session, WT_SESSION_SALVAGE_QUIET_ERR))
		WT_RET(
		    __wt_block_misplaced(session, block, "read", offset, size));
#endif

	/*
	 * Grow the buffer as necessary and read the block.  Buffers should be
	 * aligned for reading, but there are lots of buffers (for example, file
	 * cursors have two buffers each, key and value), and it's difficult to
	 * be sure we've found all of them.  If the buffer isn't aligned, it's
	 * an easy fix: set the flag and guarantee we reallocate it.  (Most of
	 * the time on reads, the buffer memory has not yet been allocated, so
	 * we're not adding any additional processing time.)
	 */
	if (F_ISSET(buf, WT_ITEM_ALIGNED))
		alloc_size = size;
	else {
		F_SET(buf, WT_ITEM_ALIGNED);
		alloc_size = (uint32_t)WT_MAX(size, buf->memsize + 10);
	}
	WT_RET(__wt_buf_init(session, buf, alloc_size));
	WT_RET(__wt_read(session, block->fh, offset, size, buf->mem));
	buf->size = size;

	blk = WT_BLOCK_HEADER_REF(buf->mem);
	blk->cksum = 0;
	page_cksum = __wt_cksum(buf->mem,
	    F_ISSET(blk, WT_BLOCK_DATA_CKSUM) ? size : WT_BLOCK_COMPRESS_SKIP);
	if (cksum != page_cksum) {
		if (!F_ISSET(session, WT_SESSION_SALVAGE_QUIET_ERR))
			__wt_errx(session,
			    "read checksum error [%"
			    PRIu32 "B @ %" PRIuMAX ", %"
			    PRIu32 " != %" PRIu32 "]",
			    size, (uintmax_t)offset, cksum, page_cksum);
		return (WT_ERROR);
	}

	WT_CSTAT_INCR(session, block_read);
	WT_CSTAT_INCRV(session, block_byte_read, size);
	return (0);
}
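
The blk->cksum = 0 step above is the key subtlety: the stored checksum was computed with the checksum field itself zeroed, so verification must recreate that state before recomputing. A generic sketch of the pattern (not from the original source; the checksum function is a stand-in):

#include <stddef.h>
#include <stdint.h>

struct blk_hdr {
	uint32_t cksum;		/* Zeroed while the checksum is computed. */
	/* ... remaining header fields and payload follow in memory ... */
};

/* Verify a block whose header embeds its own checksum. */
static int
verify_block(void *mem, size_t size, uint32_t expected,
    uint32_t (*cksum_fn)(const void *, size_t))
{
	struct blk_hdr *hdr = mem;

	hdr->cksum = 0;		/* Recreate the state at write time. */
	return (cksum_fn(mem, size) == expected ? 0 : -1);
}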
Example #5
/*
 * db_get --
 *	Look in the text buffers for a line, followed by the cache, followed
 *	by the database.
 *
 * PUBLIC: int db_get __P((SCR *, db_recno_t, u_int32_t, CHAR_T **, size_t *));
 */
int
db_get(SCR *sp,
    db_recno_t lno,			/* Line number. */
    u_int32_t flags,
    CHAR_T **pp,			/* Pointer store. */
    size_t *lenp)			/* Length store. */
{
	DBT data, key;
	EXF *ep;
	TEXT *tp;
	db_recno_t l1, l2;
	const CHAR_T *wp;
	size_t wlen;
	size_t nlen;

	/*
	 * The underlying recno stuff handles zero by returning NULL, but
	 * have to have an OOB condition for the look-aside into the input
	 * buffer anyway.
	 */
	if (lno == 0)
		goto err1;

	/* Check for no underlying file. */
	if ((ep = sp->ep) == NULL) {
		ex_emsg(sp, NULL, EXM_NOFILEYET);
		goto err3;
	}

	if (LF_ISSET(DBG_NOCACHE))
		goto nocache;

	/*
	 * Look-aside into the TEXT buffers and see if the line we want
	 * is there.
	 */
	if (F_ISSET(sp, SC_TINPUT)) {
		l1 = ((TEXT *)sp->tiq.cqh_first)->lno;
		l2 = ((TEXT *)sp->tiq.cqh_last)->lno;
		if (l1 <= lno && l2 >= lno) {
#if defined(DEBUG) && 0
			vtrace(sp,
			    "retrieve TEXT buffer line %lu\n", (u_long)lno);
#endif
			for (tp = sp->tiq.cqh_first;
			    tp->lno != lno; tp = tp->q.cqe_next);
			if (lenp != NULL)
				*lenp = tp->len;
			if (pp != NULL)
				*pp = tp->lb;
			return (0);
		}
		/*
		 * Adjust the line number for the number of lines used
		 * by the text input buffers.
		 */
		if (lno > l2)
			lno -= l2 - l1;
	}

	/* Look-aside into the cache, and see if the line we want is there. */
	if (lno == sp->c_lno) {
#if defined(DEBUG) && 0
		vtrace(sp, "retrieve cached line %lu\n", (u_long)lno);
#endif
		if (lenp != NULL)
			*lenp = sp->c_len;
		if (pp != NULL)
			*pp = sp->c_lp;
		return (0);
	}
	sp->c_lno = OOBLNO;

nocache:
	nlen = 1024;
retry:
	/* data.size contains length in bytes */
	BINC_GOTO(sp, CHAR_T, sp->c_lp, sp->c_blen, nlen);

	/* Get the line from the underlying database. */
	memset(&key, 0, sizeof(key));
	key.data = &lno;
	key.size = sizeof(lno);
	memset(&data, 0, sizeof(data));
	data.data = sp->c_lp;
	data.ulen = sp->c_blen;
	data.flags = DB_DBT_USERMEM;
	switch (ep->db->get(ep->db, NULL, &key, &data, 0)) {
	case DB_BUFFER_SMALL:
		nlen = data.size;
		goto retry;
	default:
		goto err2;
	case DB_NOTFOUND:
err1:		if (LF_ISSET(DBG_FATAL))
err2:			db_err(sp, lno);
alloc_err:
err3:		if (lenp != NULL)
			*lenp = 0;
		if (pp != NULL)
			*pp = NULL;
		return (1);
	case 0:
		;
	}

	if (FILE2INT(sp, data.data, data.size, wp, wlen)) {
		if (!F_ISSET(sp, SC_CONV_ERROR)) {
			F_SET(sp, SC_CONV_ERROR);
			msgq(sp, M_ERR,
			    "324|Conversion error on line %lu", (u_long)lno);
		}
		goto err3;
	}

	/* Reset the cache. */
	if (wp != data.data) {
		BINC_GOTOW(sp, sp->c_lp, sp->c_blen, wlen);
		MEMCPYW(sp->c_lp, wp, wlen);
	}
	sp->c_lno = lno;
	sp->c_len = wlen;

#if defined(DEBUG) && 0
	vtrace(sp, "retrieve DB line %lu\n", (u_long)lno);
#endif
	if (lenp != NULL)
		*lenp = wlen;
	if (pp != NULL)
		*pp = sp->c_lp;
	return (0);
}
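
The nocache/retry section above is the standard Berkeley DB user-memory pattern: with DB_DBT_USERMEM, DB->get fails with DB_BUFFER_SMALL and sets data.size to the needed length, so the caller grows the buffer and retries. A self-contained sketch (not from the original source):

#include <db.h>
#include <errno.h>
#include <stdlib.h>
#include <string.h>

/* Fetch a record into a caller-owned, growable buffer. */
static int
get_record(DB *db, db_recno_t recno, void **bufp, size_t *blenp, DBT *datap)
{
	DBT data, key;
	void *p;
	int ret;

	memset(&key, 0, sizeof(key));
	key.data = &recno;
	key.size = sizeof(recno);
retry:	memset(&data, 0, sizeof(data));
	data.data = *bufp;
	data.ulen = (u_int32_t)*blenp;
	data.flags = DB_DBT_USERMEM;
	switch (ret = db->get(db, NULL, &key, &data, 0)) {
	case DB_BUFFER_SMALL:
		if ((p = realloc(*bufp, data.size)) == NULL)
			return (ENOMEM);
		*bufp = p;
		*blenp = data.size;
		goto retry;
	case 0:
		*datap = data;
		break;
	}
	return (ret);
}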
Example #6
/*
 * __wt_txn_recover --
 *	Run recovery.
 */
int
__wt_txn_recover(WT_SESSION_IMPL *session)
{
	WT_CONNECTION_IMPL *conn;
	WT_CURSOR *metac;
	WT_DECL_RET;
	WT_RECOVERY r;
	struct WT_RECOVERY_FILE *metafile;
	char *config;
	int needs_rec, was_backup;

	conn = S2C(session);
	WT_CLEAR(r);
	WT_INIT_LSN(&r.ckpt_lsn);
	was_backup = F_ISSET(conn, WT_CONN_WAS_BACKUP) ? 1 : 0;

	/* We need a real session for recovery. */
	WT_RET(__wt_open_session(conn, NULL, NULL, &session));
	F_SET(session, WT_SESSION_NO_LOGGING);
	r.session = session;

	WT_ERR(__wt_metadata_search(session, WT_METAFILE_URI, &config));
	WT_ERR(__recovery_setup_file(&r, WT_METAFILE_URI, config));
	WT_ERR(__wt_metadata_cursor(session, NULL, &metac));
	metafile = &r.files[WT_METAFILE_ID];
	metafile->c = metac;

	/*
	 * If no log was found (including if logging is disabled), or if the
	 * last checkpoint was done with logging disabled, recovery should not
	 * run.  Scan the metadata to figure out the largest file ID.
	 */
	if (!FLD_ISSET(S2C(session)->log_flags, WT_CONN_LOG_EXISTED) ||
	    WT_IS_MAX_LSN(&metafile->ckpt_lsn)) {
		WT_ERR(__recovery_file_scan(&r));
		conn->next_file_id = r.max_fileid;
		goto done;
	}

	/*
	 * First, do a pass through the log to recover the metadata, and
	 * establish the last checkpoint LSN.  Skip this when opening a hot
	 * backup: we already have the correct metadata in that case.
	 */
	if (!was_backup) {
		r.metadata_only = 1;
		if (WT_IS_INIT_LSN(&metafile->ckpt_lsn))
			WT_ERR(__wt_log_scan(session,
			    NULL, WT_LOGSCAN_FIRST, __txn_log_recover, &r));
		else {
			/*
			 * Start at the last checkpoint LSN referenced in the
			 * metadata.  If we see the end of a checkpoint while
			 * scanning, we will change the full scan to start from
			 * there.
			 */
			r.ckpt_lsn = metafile->ckpt_lsn;
			WT_ERR(__wt_log_scan(session,
			    &metafile->ckpt_lsn, 0, __txn_log_recover, &r));
		}
	}

	/* Scan the metadata to find the live files and their IDs. */
	WT_ERR(__recovery_file_scan(&r));

	/*
	 * We no longer need the metadata cursor: close it to avoid pinning any
	 * resources that could block eviction during recovery.
	 */
	r.files[0].c = NULL;
	WT_ERR(metac->close(metac));

	/*
	 * Now, recover all the files apart from the metadata.
	 * Pass WT_LOGSCAN_RECOVER so that old logs get truncated.
	 */
	r.metadata_only = 0;
	WT_ERR(__wt_verbose(session, WT_VERB_RECOVERY,
	    "Main recovery loop: starting at %u/%" PRIuMAX,
	    r.ckpt_lsn.file, (uintmax_t)r.ckpt_lsn.offset));
	WT_ERR(__wt_log_needs_recovery(session, &r.ckpt_lsn, &needs_rec));
	/*
	 * Check if the database was shut down cleanly.  If not, return an
	 * error if the user does not want automatic recovery.
	 */
	if (needs_rec && FLD_ISSET(conn->log_flags, WT_CONN_LOG_RECOVER_ERR))
		WT_ERR(WT_RUN_RECOVERY);
	/*
	 * Always run recovery even if it was a clean shutdown.
	 * We can consider skipping it in the future.
	 */
	if (WT_IS_INIT_LSN(&r.ckpt_lsn))
		WT_ERR(__wt_log_scan(session, NULL,
		    WT_LOGSCAN_FIRST | WT_LOGSCAN_RECOVER,
		    __txn_log_recover, &r));
	else
		WT_ERR(__wt_log_scan(session, &r.ckpt_lsn,
		    WT_LOGSCAN_RECOVER, __txn_log_recover, &r));

	conn->next_file_id = r.max_fileid;

	/*
	 * If recovery ran successfully, forcibly log a checkpoint so the
	 * next open is fast, and keep the metadata up to date with the
	 * checkpoint LSN and archiving.
	 */
	WT_ERR(session->iface.checkpoint(&session->iface, "force=1"));

done:
err:	WT_TRET(__recovery_free(&r));
	__wt_free(session, config);
	WT_TRET(session->iface.close(&session->iface, NULL));

	return (ret);
}
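
Recovery is normally implicit: __wt_txn_recover runs inside wiredtiger_open when logging is configured. A minimal sketch (not from the original source), assuming the log=(enabled,recover=on) configuration syntax:

#include <wiredtiger.h>

/* Open an existing logged database, replaying the log if required. */
static int
open_with_recovery(const char *home, WT_CONNECTION **connp)
{
	return (wiredtiger_open(home, NULL,
	    "create,log=(enabled,recover=on)", connp));
}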
Example #7
/*
 * __lsm_bloom_create --
 *	Create a bloom filter for a chunk of the LSM tree that has been
 *	checkpointed but not yet been merged.
 */
static int
__lsm_bloom_create(WT_SESSION_IMPL *session,
    WT_LSM_TREE *lsm_tree, WT_LSM_CHUNK *chunk, u_int chunk_off)
{
	WT_BLOOM *bloom;
	WT_CURSOR *src;
	WT_DECL_RET;
	WT_ITEM buf, key;
	WT_SESSION *wt_session;
	uint64_t insert_count;
	int exist;

	/*
	 * Normally, the Bloom URI is populated when the chunk struct is
	 * allocated.  After an open, however, it may not have been.
	 * Deal with that here.
	 */
	if (chunk->bloom_uri == NULL) {
		WT_CLEAR(buf);
		WT_RET(__wt_lsm_tree_bloom_name(
		    session, lsm_tree, chunk->id, &buf));
		chunk->bloom_uri = __wt_buf_steal(session, &buf, NULL);
	}

	/*
	 * Drop the bloom filter first - there may be some content hanging over
	 * from an aborted merge or checkpoint.
	 */
	wt_session = &session->iface;
	WT_RET(__wt_exist(session, chunk->bloom_uri + strlen("file:"), &exist));
	if (exist)
		WT_RET(wt_session->drop(wt_session, chunk->bloom_uri, "force"));

	bloom = NULL;
	/*
	 * This is merge-like activity, and we don't want compacts to give up
	 * because we are creating a bunch of bloom filters before merging.
	 */
	++lsm_tree->merge_progressing;
	WT_RET(__wt_bloom_create(session, chunk->bloom_uri,
	    lsm_tree->bloom_config, chunk->count,
	    lsm_tree->bloom_bit_count, lsm_tree->bloom_hash_count, &bloom));

	/* Open a special merge cursor just on this chunk. */
	WT_ERR(__wt_open_cursor(session, lsm_tree->name, NULL, NULL, &src));
	F_SET(src, WT_CURSTD_RAW);
	WT_ERR(__wt_clsm_init_merge(src, chunk_off, chunk->id, 1));

	F_SET(session, WT_SESSION_NO_CACHE);
	for (insert_count = 0; (ret = src->next(src)) == 0; insert_count++) {
		WT_ERR(src->get_key(src, &key));
		WT_ERR(__wt_bloom_insert(bloom, &key));
	}
	WT_ERR_NOTFOUND_OK(ret);
	WT_TRET(src->close(src));

	WT_TRET(__wt_bloom_finalize(bloom));
	WT_ERR(ret);

	F_CLR(session, WT_SESSION_NO_CACHE);

	/* Load the new Bloom filter into cache. */
	WT_CLEAR(key);
	WT_ERR_NOTFOUND_OK(__wt_bloom_get(bloom, &key));

	WT_VERBOSE_ERR(session, lsm,
	    "LSM worker created bloom filter %s. "
	    "Expected %" PRIu64 " items, got %" PRIu64,
	    chunk->bloom_uri, chunk->count, insert_count);

	/* Ensure the bloom filter is in the metadata. */
	WT_ERR(__wt_lsm_tree_lock(session, lsm_tree, 1));
	F_SET_ATOMIC(chunk, WT_LSM_CHUNK_BLOOM);
	ret = __wt_lsm_meta_write(session, lsm_tree);
	++lsm_tree->dsk_gen;
	WT_TRET(__wt_lsm_tree_unlock(session, lsm_tree));

	if (ret != 0)
		WT_ERR_MSG(session, ret, "LSM bloom worker metadata write");

err:	if (bloom != NULL)
		WT_TRET(__wt_bloom_close(bloom));
	F_CLR(session, WT_SESSION_NO_CACHE);
	return (ret);
}
Example #8
/*
 * __wt_curds_open --
 *	Initialize a data-source cursor.
 */
int
__wt_curds_open(
    WT_SESSION_IMPL *session, const char *uri, WT_CURSOR *owner,
    const char *cfg[], WT_DATA_SOURCE *dsrc, WT_CURSOR **cursorp)
{
	WT_CURSOR_STATIC_INIT(iface,
	    NULL,			/* get-key */
	    NULL,			/* get-value */
	    NULL,			/* set-key */
	    NULL,			/* set-value */
	    __curds_compare,		/* compare */
	    __curds_next,		/* next */
	    __curds_prev,		/* prev */
	    __curds_reset,		/* reset */
	    __curds_search,		/* search */
	    __curds_search_near,	/* search-near */
	    __curds_insert,		/* insert */
	    __curds_update,		/* update */
	    __curds_remove,		/* remove */
	    __curds_close);		/* close */
	WT_CONFIG_ITEM cval;
	WT_CURSOR *cursor, *source;
	WT_CURSOR_DATA_SOURCE *data_source;
	WT_DECL_RET;
	const char *metaconf;

	STATIC_ASSERT(offsetof(WT_CURSOR_DATA_SOURCE, iface) == 0);

	data_source = NULL;
	metaconf = NULL;

	WT_RET(__wt_calloc_def(session, 1, &data_source));
	cursor = &data_source->iface;
	*cursor = iface;
	cursor->session = &session->iface;
	F_SET(cursor, WT_CURSTD_DATA_SOURCE);

	/*
	 * XXX
	 * The underlying data-source may require the object's key and value
	 * formats.  This isn't a particularly elegant way of getting that
	 * information to the data-source, this feels like a layering problem
	 * to me.
	 */
	WT_ERR(__wt_metadata_search(session, uri, &metaconf));
	WT_ERR(__wt_config_getones(session, metaconf, "key_format", &cval));
	WT_ERR(__wt_strndup(session, cval.str, cval.len, &cursor->key_format));
	WT_ERR(__wt_config_getones(session, metaconf, "value_format", &cval));
	WT_ERR(
	    __wt_strndup(session, cval.str, cval.len, &cursor->value_format));

	WT_ERR(__wt_cursor_init(cursor, uri, owner, cfg, cursorp));

	/* Data-source cursors have a collator reference. */
	WT_ERR(__wt_collator_config(session, cfg, &data_source->collator));

	WT_ERR(dsrc->open_cursor(dsrc,
	    &session->iface, uri, (WT_CONFIG_ARG *)cfg, &data_source->source));
	source = data_source->source;
	source->session = (WT_SESSION *)session;
	memset(&source->q, 0, sizeof(source->q));
	source->recno = 0;
	memset(source->raw_recno_buf, 0, sizeof(source->raw_recno_buf));
	memset(&source->key, 0, sizeof(source->key));
	memset(&source->value, 0, sizeof(source->value));
	source->saved_err = 0;
	source->flags = 0;

	if (0) {
err:		if (F_ISSET(cursor, WT_CURSTD_OPEN))
			WT_TRET(cursor->close(cursor));
		else
			__wt_free(session, data_source);
		*cursorp = NULL;
	}

	__wt_free(session, metaconf);
	return (ret);
}
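
Cursors reaching __wt_curds_open come from a WT_DATA_SOURCE registered on the connection under a URI prefix. A minimal sketch (not from the original source; the "dsrc:" prefix is hypothetical):

/* Route all "dsrc:" URIs to the callbacks in dsrc. */
static int
register_data_source(WT_CONNECTION *conn, WT_DATA_SOURCE *dsrc)
{
	return (conn->add_data_source(conn, "dsrc:", dsrc, NULL));
}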
Example #9
/*
 * __clsm_enter --
 *	Start an operation on an LSM cursor, update if the tree has changed.
 */
static inline int
__clsm_enter(WT_CURSOR_LSM *clsm, bool reset, bool update)
{
	WT_DECL_RET;
	WT_LSM_TREE *lsm_tree;
	WT_SESSION_IMPL *session;
	WT_TXN *txn;
	uint64_t *switch_txnp;
	uint64_t snap_min;

	lsm_tree = clsm->lsm_tree;
	session = (WT_SESSION_IMPL *)clsm->iface.session;
	txn = &session->txn;

	/* Merge cursors never update. */
	if (F_ISSET(clsm, WT_CLSM_MERGE))
		return (0);

	if (reset) {
		WT_ASSERT(session, !F_ISSET(&clsm->iface,
		   WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT));
		WT_RET(__clsm_reset_cursors(clsm, NULL));
	}

	for (;;) {
		/*
		 * If the cursor looks up-to-date, check if the cache is full.
		 * In case this call blocks, the check will be repeated before
		 * proceeding.
		 */
		if (clsm->dsk_gen != lsm_tree->dsk_gen &&
		    lsm_tree->nchunks != 0)
			goto open;

		/*
		 * Restored line (assumption): the blocking cache-full check
		 * the comment above refers to, after which the generation
		 * test is repeated.
		 */
		WT_RET(__wt_cache_eviction_check(session, false, NULL));

		if (clsm->dsk_gen != lsm_tree->dsk_gen &&
		    lsm_tree->nchunks != 0)
			goto open;

		/* Update the maximum transaction ID in the primary chunk. */
		if (update) {
			/*
			 * Ensure that there is a transaction snapshot active.
			 */
			WT_RET(__wt_txn_autocommit_check(session));
			WT_RET(__wt_txn_id_check(session));

			WT_RET(__clsm_enter_update(clsm));
			if (clsm->dsk_gen != clsm->lsm_tree->dsk_gen)
				goto open;

			if (txn->isolation == WT_ISO_SNAPSHOT)
				__wt_txn_cursor_op(session);

			/*
			 * Figure out how many updates are required for
			 * snapshot isolation.
			 *
			 * This is not a normal visibility check on the maximum
			 * transaction ID in each chunk: any transaction ID
			 * that overlaps with our snapshot is a potential
			 * conflict.
			 */
			clsm->nupdates = 1;
			if (txn->isolation == WT_ISO_SNAPSHOT &&
			    F_ISSET(clsm, WT_CLSM_OPEN_SNAPSHOT)) {
				WT_ASSERT(session,
				    F_ISSET(txn, WT_TXN_HAS_SNAPSHOT));
				snap_min = txn->snap_min;
				for (switch_txnp =
				    &clsm->switch_txn[clsm->nchunks - 2];
				    clsm->nupdates < clsm->nchunks;
				    clsm->nupdates++, switch_txnp--) {
					if (WT_TXNID_LT(*switch_txnp, snap_min))
						break;
					WT_ASSERT(session,
					    !__wt_txn_visible_all(
					    session, *switch_txnp));
				}
			}
		}

		/*
		 * Stop when we are up-to-date, as long as this is:
		 *   - a snapshot isolation update and the cursor is set up for
		 *     that;
		 *   - an update operation with a primary chunk, or
		 *   - a read operation and the cursor is open for reading.
		 */
		if ((!update ||
		    txn->isolation != WT_ISO_SNAPSHOT ||
		    F_ISSET(clsm, WT_CLSM_OPEN_SNAPSHOT)) &&
		    ((update && clsm->primary_chunk != NULL) ||
		    (!update && F_ISSET(clsm, WT_CLSM_OPEN_READ))))
			break;

open:		WT_WITH_SCHEMA_LOCK(session,
		    ret = __clsm_open_cursors(clsm, update, 0, 0));
		WT_RET(ret);
	}

	if (!F_ISSET(clsm, WT_CLSM_ACTIVE)) {
		WT_RET(__cursor_enter(session));
		F_SET(clsm, WT_CLSM_ACTIVE);
	}

	return (0);
}
Example #10
/*
 * v_motion --
 *	Get resulting motion mark.
 */
static int
v_motion(SCR *sp, VICMD *dm, VICMD *vp, int *mappedp)
{
	VICMD motion;
	gcret_t gcret;
	size_t len;
	u_long cnt;
	u_int flags;
	int tilde_reset, notused;

	/*
	 * If '.' command, use the dot motion, else get the motion command.
	 * Clear any line motion flags, the subsequent motion isn't always
	 * the same, i.e. "/aaa" may or may not be a line motion.
	 */
	if (F_ISSET(vp, VC_ISDOT)) {
		motion = *dm;
		F_SET(&motion, VC_ISDOT);
		F_CLR(&motion, VM_COMMASK);
		gcret = GC_OK;
	} else {
		memset(&motion, 0, sizeof(VICMD));
		gcret = v_cmd(sp, NULL, &motion, vp, &notused, mappedp);
		if (gcret != GC_OK && gcret != GC_EVENT)
			return (1);
	}

	/*
	 * A count may be provided both to the command and to the motion, in
	 * which case the count is multiplicative.  For example, "3y4y" is the
	 * same as "12yy".  This count is provided to the motion command and
	 * not to the regular function.
	 */
	cnt = motion.count = F_ISSET(&motion, VC_C1SET) ? motion.count : 1;
	if (F_ISSET(vp, VC_C1SET)) {
		motion.count *= vp->count;
		F_SET(&motion, VC_C1SET);

		/*
		 * Set flags to restore the original values of the command
		 * structure so dot commands can change the count values,
		 * e.g. "2dw" "3." deletes a total of five words.
		 */
		F_CLR(vp, VC_C1SET);
		F_SET(vp, VC_C1RESET);
	}

	/*
	 * Some commands can be repeated to indicate the current line.  In
	 * this case, or if the command is a "line command", set the flags
	 * appropriately.  If not a doubled command, run the function to get
	 * the resulting mark.
	 */
	if (gcret != GC_EVENT && vp->key == motion.key) {
		F_SET(vp, VM_LDOUBLE | VM_LMODE);

		/* Set the origin of the command. */
		vp->m_start.lno = sp->lno;
		vp->m_start.cno = 0;

		/*
		 * Set the end of the command.
		 *
		 * If the current line is missing, i.e. the file is empty,
		 * historic vi permitted a "cc" or "!!" command to insert
		 * text.
		 */
		vp->m_stop.lno = sp->lno + motion.count - 1;
		if (db_get(sp, vp->m_stop.lno, 0, NULL, &len)) {
			if (vp->m_stop.lno != 1 ||
			   (vp->key != 'c' && vp->key != '!')) {
				v_emsg(sp, NULL, VIM_EMPTY);
				return (1);
			}
			vp->m_stop.cno = 0;
		} else
			vp->m_stop.cno = len ? len - 1 : 0;
	} else {
		/*
		 * Motion commands change the underlying movement (*snarl*).
		 * For example, "l" is illegal at the end of a line, but "dl"
		 * is not.  Set flags so the function knows the situation.
		 */
		motion.rkp = vp->kp;

		/*
		 * XXX
		 * Use yank instead of creating a new motion command, it's a
		 * lot easier for now.
		 */
		if (vp->kp == &tmotion) {
			tilde_reset = 1;
			vp->kp = &vikeys['y'];
		} else
			tilde_reset = 0;

		/*
		 * Copy the key flags into the local structure, except for the
		 * RCM flags -- the motion command will set the RCM flags in
		 * the vp structure if necessary.  This means that the motion
		 * command is expected to determine where the cursor ends up!
		 * However, we save off the current RCM mask and restore it
		 * if no RCM flags are set by the motion command, with a small
		 * modification.
		 *
		 * We replace the VM_RCM_SET flag with the VM_RCM flag.  This
		 * is so that cursor movement doesn't set the relative position
		 * unless the motion command explicitly specified it.  This
		 * appears to match historic practice, but I've never been able
		 * to develop a hard-and-fast rule.
		 */
		flags = F_ISSET(vp, VM_RCM_MASK);
		if (LF_ISSET(VM_RCM_SET)) {
			LF_SET(VM_RCM);
			LF_CLR(VM_RCM_SET);
		}
		F_CLR(vp, VM_RCM_MASK);
		F_SET(&motion, motion.kp->flags & ~VM_RCM_MASK);

		/*
		 * Set the three cursor locations to the current cursor.  This
		 * permits commands like 'j' and 'k', that are line oriented
		 * motions and have special cursor suck semantics when they are
		 * used as standalone commands, to ignore column positioning.
		 */
		motion.m_final.lno =
		    motion.m_stop.lno = motion.m_start.lno = sp->lno;
		motion.m_final.cno =
		    motion.m_stop.cno = motion.m_start.cno = sp->cno;

		/* Run the function. */
		if ((motion.kp->func)(sp, &motion))
			return (1);

		/*
		 * If the current line is missing, i.e. the file is empty,
		 * historic vi allowed "c<motion>" or "!<motion>" to insert
		 * text.  Otherwise fail -- most motion commands will have
		 * already failed, but some, e.g. G, succeed in empty files.
		 */
		if (!db_exist(sp, vp->m_stop.lno)) {
			if (vp->m_stop.lno != 1 ||
			   (vp->key != 'c' && vp->key != '!')) {
				v_emsg(sp, NULL, VIM_EMPTY);
				return (1);
			}
			vp->m_stop.cno = 0;
		}

		/*
		 * XXX
		 * See above.
		 */
		if (tilde_reset)
			vp->kp = &tmotion;

		/*
		 * Copy cut buffer, line mode and cursor position information
		 * from the motion command structure, i.e. anything that the
		 * motion command can set for us.  The commands can flag the
		 * movement as a line motion (see v_sentence) as well as set
		 * the VM_RCM_* flags explicitly.
		 */
		F_SET(vp, F_ISSET(&motion, VM_COMMASK | VM_RCM_MASK));

		/*
		 * If the motion command set no relative motion flags, use
		 * the (slightly) modified previous values.
		 */
		if (!F_ISSET(vp, VM_RCM_MASK))
			F_SET(vp, flags);

		/*
		 * Commands can change behaviors based on the motion command
		 * used, for example, the ! command repeated the last bang
		 * command if N or n was used as the motion.
		 */
		vp->rkp = motion.kp;

		/*
		 * Motion commands can reset all of the cursor information.
		 * If the motion is in the reverse direction, switch the
		 * from and to MARK's so that it's in a forward direction.
		 * Motions are from the from MARK to the to MARK (inclusive).
		 */
		if (motion.m_start.lno > motion.m_stop.lno ||
		    (motion.m_start.lno == motion.m_stop.lno &&
		    motion.m_start.cno > motion.m_stop.cno)) {
			vp->m_start = motion.m_stop;
			vp->m_stop = motion.m_start;
		} else {
			vp->m_start = motion.m_start;
			vp->m_stop = motion.m_stop;
		}
		vp->m_final = motion.m_final;
	}

	/*
	 * If the command sets dot, save the motion structure.  The motion
	 * count was changed above and needs to be reset, that's why this
	 * is done here, and not in the calling routine.
	 */
	if (F_ISSET(vp->kp, V_DOT)) {
		*dm = motion;
		dm->count = cnt;
	}
	return (0);
}
Example #11
/*
 * v_init --
 *	Initialize the vi screen.
 */
static int
v_init(SCR *sp)
{
	GS *gp;
	VI_PRIVATE *vip;

	gp = sp->gp;
	vip = VIP(sp);

	/* Switch into vi. */
	if (gp->scr_screen(sp, SC_VI))
		return (1);
	(void)gp->scr_attr(sp, SA_ALTERNATE, 1);

	F_CLR(sp, SC_EX | SC_SCR_EX);
	F_SET(sp, SC_VI);

	/*
	 * Initialize screen values.
	 *
	 * Small windows: see vs_refresh(), section 6a.
	 *
	 * Setup:
	 *	t_minrows is the minimum rows to display
	 *	t_maxrows is the maximum rows to display (rows - 1)
	 *	t_rows is the rows currently being displayed
	 */
	sp->rows = vip->srows = O_VAL(sp, O_LINES);
	sp->cols = O_VAL(sp, O_COLUMNS);
	sp->t_rows = sp->t_minrows = O_VAL(sp, O_WINDOW);
	if (sp->rows != 1) {
		if (sp->t_rows > sp->rows - 1) {
			sp->t_minrows = sp->t_rows = sp->rows - 1;
			msgq(sp, M_INFO,
			    "214|Windows option value is too large, max is %zu",
			    sp->t_rows);
		}
		sp->t_maxrows = sp->rows - 1;
	} else
		sp->t_maxrows = 1;
	sp->roff = sp->coff = 0;

	/* Create a screen map. */
	CALLOC_RET(sp, HMAP, SMAP *, SIZE_HMAP(sp), sizeof(SMAP));
	TMAP = HMAP + (sp->t_rows - 1);
	HMAP->lno = sp->lno;
	HMAP->coff = 0;
	HMAP->soff = 1;

	/*
	 * Fill the screen map from scratch -- try and center the line.  That
	 * way if we're starting with a file we've seen before, we'll put the
	 * line in the middle, otherwise, it won't work and we'll end up with
	 * the line at the top.
	 */
	F_CLR(sp, SC_SCR_TOP);
	F_SET(sp, SC_SCR_REFORMAT | SC_SCR_CENTER);

	/* Invalidate the cursor. */
	F_SET(vip, VIP_CUR_INVALID);

	/* Paint the screen image from scratch. */
	F_SET(vip, VIP_N_EX_PAINT);

	return (0);
}
Example #12
/*
 * vi --
 * 	Main vi command loop.
 *
 * PUBLIC: int vi __P((SCR **));
 */
int
vi(SCR **spp)
{
	GS *gp;
	WIN *wp;
	MARK abst;
	SCR *next, *sp;
	VICMD cmd, *vp;
	VI_PRIVATE *vip;
	int comcount, mapped, rval;

	/* Get the first screen. */
	sp = *spp;
	wp = sp->wp;
	gp = sp->gp;

	/* Initialize the command structure. */
	vp = &cmd;
	memset(vp, 0, sizeof(VICMD));

	/* Reset strange attraction. */
	F_SET(vp, VM_RCM_SET);

	/* Initialize the vi screen. */
	if (v_init(sp))
		return (1);

	/* Set the focus. */
	(void)sp->gp->scr_rename(sp, sp->frp->name, 1);

	for (vip = VIP(sp), rval = 0;;) {
		/* Resolve messages. */
		if (!MAPPED_KEYS_WAITING(sp) && vs_resolve(sp, NULL, 0))
			goto ret;

		/*
		 * If not skipping a refresh, return to command mode and
		 * refresh the screen.
		 */
		if (F_ISSET(vip, VIP_S_REFRESH))
			F_CLR(vip, VIP_S_REFRESH);
		else {
			sp->showmode = SM_COMMAND;
			if (vs_refresh(sp, 0))
				goto ret;
		}

		/* Set the new favorite position. */
		if (F_ISSET(vp, VM_RCM_SET | VM_RCM_SETFNB | VM_RCM_SETNNB)) {
			F_CLR(vip, VIP_RCM_LAST);
			(void)vs_column(sp, &sp->rcm);
		}

		/*
		 * If not currently in a map, log the cursor position,
		 * and set a flag so that this command can become the
		 * DOT command.
		 */
		if (MAPPED_KEYS_WAITING(sp))
			mapped = 1;
		else {
			if (log_cursor(sp))
				goto err;
			mapped = 0;
		}

		/*
		 * There may be an ex command waiting, and we returned here
		 * only because we exited a screen or file.  In this case,
		 * we simply go back into the ex parser.
		 */
		if (EXCMD_RUNNING(wp)) {
			vp->kp = &vikeys[':'];
			goto ex_continue;
		}

		/* Refresh the command structure. */
		memset(vp, 0, sizeof(VICMD));

		/*
		 * We get a command, which may or may not have an associated
		 * motion.  If it does, we get it too, calling its underlying
		 * function to get the resulting mark.  We then call the
		 * command setting the cursor to the resulting mark.
		 *
		 * !!!
		 * Vi historically flushed mapped characters on error, but
		 * entering extra <escape> characters at the beginning of
		 * a map wasn't considered an error -- in fact, users would
		 * put leading <escape> characters in maps to clean up vi
		 * state before the map was interpreted.  Beauty!
		 */
		switch (v_cmd(sp, DOT, vp, NULL, &comcount, &mapped)) {
		case GC_ERR:
			goto err;
		case GC_ERR_NOFLUSH:
			goto gc_err_noflush;
		case GC_FATAL:
			goto ret;
		case GC_INTERRUPT:
			goto intr;
		case GC_EVENT:
		case GC_OK:
			break;
		}

		/* Check for security setting. */
		if (F_ISSET(vp->kp, V_SECURE) && O_ISSET(sp, O_SECURE)) {
			ex_emsg(sp, (const char *)KEY_NAME(sp, vp->key),
			    EXM_SECURE);
			goto err;
		}

		/*
		 * Historical practice: if a dot command gets a new count,
		 * any motion component goes away, i.e. "d3w2." deletes a
		 * total of 5 words.
		 */
		if (F_ISSET(vp, VC_ISDOT) && comcount)
			DOTMOTION->count = 1;

		/* Copy the key flags into the local structure. */
		F_SET(vp, vp->kp->flags);

		/* Prepare to set the previous context. */
		if (F_ISSET(vp, V_ABS | V_ABS_C | V_ABS_L)) {
			abst.lno = sp->lno;
			abst.cno = sp->cno;
		}

		/*
		 * Set the three cursor locations to the current cursor.  The
		 * underlying routines don't bother if the cursor doesn't move.
		 * This also handles line commands (e.g. Y) defaulting to the
		 * current line.
		 */
		vp->m_start.lno = vp->m_stop.lno = vp->m_final.lno = sp->lno;
		vp->m_start.cno = vp->m_stop.cno = vp->m_final.cno = sp->cno;

		/*
		 * Do any required motion; v_motion sets the from MARK and the
		 * line mode flag, as well as the VM_RCM flags.
		 */
		if (F_ISSET(vp, V_MOTION) &&
		    v_motion(sp, DOTMOTION, vp, &mapped)) {
			if (INTERRUPTED(sp))
				goto intr;
			goto err;
		}

		/*
		 * If a count is set and the command is line oriented, set the
		 * to MARK here relative to the cursor/from MARK.  This is for
		 * commands that take both counts and motions, i.e. "4yy" and
		 * "y%".  As there's no way the command can know which the user
		 * did, we have to do it here.  (There are commands that are
		 * line oriented and that take counts ("#G", "#H"), for which
		 * this calculation is either completely meaningless or wrong.
		 * Each command must validate the value for itself.
		 */
		if (F_ISSET(vp, VC_C1SET) && F_ISSET(vp, VM_LMODE))
			vp->m_stop.lno += vp->count - 1;

		/* Increment the command count. */
		++sp->ccnt;

#if defined(DEBUG) && defined(COMLOG)
		v_comlog(sp, vp);
#endif
		/* Call the function. */
ex_continue:	if (vp->kp->func(sp, vp))
			goto err;
#ifdef DEBUG
		/* Make sure no function left the temporary space locked. */
		if (F_ISSET(wp, W_TMP_INUSE)) {
			F_CLR(wp, W_TMP_INUSE);
			msgq(sp, M_ERR,
			    "232|vi: temporary buffer not released");
		}
#endif
		/*
		 * If we're exiting this screen, move to the next one, or, if
		 * there aren't any more, return to the main editor loop.  The
		 * ordering is careful, don't discard the contents of sp until
		 * the end.
		 */
		if (F_ISSET(sp, SC_EXIT | SC_EXIT_FORCE)) {
			if (file_end(sp, NULL, F_ISSET(sp, SC_EXIT_FORCE)))
				goto ret;
			if (vs_discard(sp, &next))
				goto ret;
			if (next == NULL && vs_swap(sp, &next, NULL))
				goto ret;
			*spp = next;
			if (screen_end(sp))
				goto ret;
			if (next == NULL)
				break;

			/* Switch screens, change focus. */
			sp = next;
			vip = VIP(sp);
			(void)sp->gp->scr_rename(sp, sp->frp->name, 1);

			/* Don't trust the cursor. */
			F_SET(vip, VIP_CUR_INVALID);

			continue;
		}

		/*
		 * Set the dot command structure.
		 *
		 * !!!
		 * Historically, commands which used mapped keys did not
		 * set the dot command, with the exception of the text
		 * input commands.
		 */
		if (F_ISSET(vp, V_DOT) && !mapped) {
			*DOT = cmd;
			F_SET(DOT, VC_ISDOT);

			/*
			 * If a count was supplied for both the command and
			 * its motion, the count was used only for the motion.
			 * Turn the count back on for the dot structure.
			 */
			if (F_ISSET(vp, VC_C1RESET))
				F_SET(DOT, VC_C1SET);

			/* VM flags aren't retained. */
			F_CLR(DOT, VM_COMMASK | VM_RCM_MASK);
		}

		/*
		 * Some vi row movements are "attracted" to the last position
		 * set, i.e. the VM_RCM commands are moths to the VM_RCM_SET
		 * commands' candle.  If the movement is to the EOL the vi
		 * command handles it.  If it's to the beginning, we handle it
		 * here.
		 *
		 * Note, some commands (e.g. _, ^) don't set the VM_RCM_SETFNB
		 * flag, but do the work themselves.  The reason is that they
		 * have to modify the column in case they're being used as a
		 * motion component.  Other similar commands (e.g. +, -) don't
		 * have to modify the column because they are always line mode
		 * operations when used as motions, so the column number isn't
		 * of any interest.
		 *
		 * Does this totally violate the screen and editor layering?
		 * You betcha.  As they say, if you think you understand it,
		 * you don't.
		 */
		switch (F_ISSET(vp, VM_RCM_MASK)) {
		case 0:
		case VM_RCM_SET:
			break;
		case VM_RCM:
			vp->m_final.cno = vs_rcm(sp,
			    vp->m_final.lno, F_ISSET(vip, VIP_RCM_LAST));
			break;
		case VM_RCM_SETLAST:
			F_SET(vip, VIP_RCM_LAST);
			break;
		case VM_RCM_SETFNB:
			vp->m_final.cno = 0;
			/* FALLTHROUGH */
		case VM_RCM_SETNNB:
			if (nonblank(sp, vp->m_final.lno, &vp->m_final.cno))
				goto err;
			break;
		default:
			abort();
		}

		/* Update the cursor. */
		sp->lno = vp->m_final.lno;
		sp->cno = vp->m_final.cno;

		/*
		 * Set the absolute mark -- set even if a tags or similar
		 * command, since the tag may be moving to the same file.
		 */
		if ((F_ISSET(vp, V_ABS) ||
		    (F_ISSET(vp, V_ABS_L) && sp->lno != abst.lno) ||
		    (F_ISSET(vp, V_ABS_C) &&
		    (sp->lno != abst.lno || sp->cno != abst.cno))) &&
		    mark_set(sp, ABSMARK1, &abst, 1))
			goto err;

		if (0) {
err:			if (v_event_flush(sp, CH_MAPPED))
				msgq(sp, M_BERR,
			    "110|Vi command failed: mapped keys discarded");
		}

		/*
		 * Check and clear interrupts.  There's an obvious race, but
		 * it's not worth fixing.
		 */
gc_err_noflush:	if (INTERRUPTED(sp)) {
intr:			CLR_INTERRUPT(sp);
			if (v_event_flush(sp, CH_MAPPED))
				msgq(sp, M_ERR,
				    "231|Interrupted: mapped keys discarded");
			else
				msgq(sp, M_ERR, "236|Interrupted");
		}

		/* If the last command switched screens, update. */
		if (F_ISSET(sp, SC_SSWITCH)) {
			F_CLR(sp, SC_SSWITCH);

			/*
			 * If the current screen is still displayed, it will
			 * need a new status line.
			 */
			F_SET(sp, SC_STATUS);

			/* Switch screens, change focus. */
			sp = sp->nextdisp;
			vip = VIP(sp);
			(void)sp->gp->scr_rename(sp, sp->frp->name, 1);

			/* Don't trust the cursor. */
			F_SET(vip, VIP_CUR_INVALID);

			/* Refresh so we can display messages. */
			if (vs_refresh(sp, 1))
				return (1);
		}

		/* If the last command switched files, change focus. */
		if (F_ISSET(sp, SC_FSWITCH)) {
			F_CLR(sp, SC_FSWITCH);
			(void)sp->gp->scr_rename(sp, sp->frp->name, 1);
		}

		/* If leaving vi, return to the main editor loop. */
		if (F_ISSET(gp, G_SRESTART) || F_ISSET(sp, SC_EX)) {
			*spp = sp;
			v_dtoh(sp);
			gp->scr_discard(sp, NULL);
			break;
		}
	}
	if (0)
ret:		rval = 1;
	return (rval);
}
Example #13
/*
 * v_cmd --
 *	Get a vi command.
 */
static gcret_t
v_cmd(SCR *sp, VICMD *dp, VICMD *vp,
    VICMD *ismotion,		/* Previous key if getting motion component. */
    int *comcountp, int *mappedp)
{
	enum { COMMANDMODE, ISPARTIAL, NOTPARTIAL } cpart;
	ARG_CHAR_T key;
	VIKEYS const *kp;
	gcret_t gcret;
	u_int flags;
	const char *s;

	/*
	 * Get an event command or a key.  Event commands are simple, and
	 * don't have any additional information.
	 */
	cpart = ismotion == NULL ? COMMANDMODE : ISPARTIAL;
	gcret = v_key(sp, vp, 1, EC_MAPCOMMAND);
	if (gcret != GC_OK) {
		if (gcret != GC_EVENT)
			return (gcret);
		if (v_event(sp, vp))
			return (GC_ERR);
		if (ismotion != NULL && !F_ISSET(vp->kp, V_MOVE))
			v_event_err(sp, &vp->ev);
		return (GC_EVENT);
	}

	/*
	 * Keys are not simple.  (Although vi's command structure is less
	 * complex than ex's, and don't think I'm not grateful!)  The command
	 * syntax is:
	 *
	 *	[count] [buffer] [count] key [[motion] | [buffer] [character]]
	 *
	 * and there are, of course, several special cases.  The motion value
	 * is itself a vi command, with the syntax:
	 *
	 *	[count] key [character]
	 *
	 * <escape> cancels partial commands, i.e. a command where at least
	 * one non-numeric character has been entered.  Otherwise, it beeps
	 * the terminal.
	 *
	 * !!!
	 * POSIX 1003.2-1992 explicitly disallows cancelling commands where
	 * all that's been entered is a number, requiring that the terminal
	 * be alerted.
	 */
	if (vp->ev.e_value == K_ESCAPE)
		goto esc;

	/*
	 * Commands that are mapped are treated differently (e.g., they
	 * don't set the dot command).  Pass that information back.
	 */
	if (FL_ISSET(vp->ev.e_flags, CH_MAPPED))
		*mappedp = 1;
	key = vp->ev.e_c;

	if (ismotion == NULL)
		cpart = NOTPARTIAL;

	/* Pick up an optional buffer. */
	if (key == '"') {
		cpart = ISPARTIAL;
		if (ismotion != NULL) {
			v_emsg(sp, NULL, VIM_COMBUF);
			return (GC_ERR);
		}
		KEY(vp->buffer, 0);
		F_SET(vp, VC_BUFFER);

		KEY(key, EC_MAPCOMMAND);
	}

	/*
	 * Pick up an optional count, where a leading 0 isn't a count, it's
	 * a command.  When a count is specified, the dot command behaves
	 * differently, pass the information back.
	 */
	if (ISDIGIT(key) && key != '0') {
		if (v_count(sp, vp, key, &vp->count))
			return (GC_ERR);

		F_SET(vp, VC_C1SET);
		*comcountp = 1;

		KEY(key, EC_MAPCOMMAND);
	} else
		*comcountp = 0;

	/* Pick up optional buffer. */
	if (key == '"') {
		cpart = ISPARTIAL;
		if (F_ISSET(vp, VC_BUFFER)) {
			msgq(sp, M_ERR, "234|Only one buffer may be specified");
			return (GC_ERR);
		}
		if (ismotion != NULL) {
			v_emsg(sp, NULL, VIM_COMBUF);
			return (GC_ERR);
		}
		KEY(vp->buffer, 0);
		F_SET(vp, VC_BUFFER);

		KEY(key, EC_MAPCOMMAND);
	}

	/* Check for an OOB command key. */
	cpart = ISPARTIAL;
	if (key > MAXVIKEY) {
		v_emsg(sp, (const char *)KEY_NAME(sp, key), VIM_NOCOM);
		return (GC_ERR);
	}
	kp = &vikeys[vp->key = key];

	/*
	 * !!!
	 * Historically, D accepted and then ignored a count.  Match it.
	 */
	if (vp->key == 'D' && F_ISSET(vp, VC_C1SET)) {
		*comcountp = 0;
		vp->count = 0;
		F_CLR(vp, VC_C1SET);
	}

	/*
	 * There are several commands that we implement as aliases, both
	 * to match historic practice and to ensure consistency.  Check
	 * for command aliases.
	 */
	if (kp->func == NULL && (kp = v_alias(sp, vp, kp)) == NULL)
		return (GC_ERR);

	/* The tildeop option makes the ~ command take a motion. */
	if (key == '~' && O_ISSET(sp, O_TILDEOP))
		kp = &tmotion;

	vp->kp = kp;

	/*
	 * Find the command.  The only legal command with no underlying
	 * function is dot.  It's historic practice that <escape> doesn't
	 * just erase the preceding number, it beeps the terminal as well.
	 * It's a common problem, so just beep the terminal unless verbose
	 * was set.
	 */
	if (kp->func == NULL) {
		if (key != '.') {
			v_emsg(sp, (const char *)KEY_NAME(sp, key),
			    vp->ev.e_value == K_ESCAPE ?
			    VIM_NOCOM_B : VIM_NOCOM);
			return (GC_ERR);
		}

		/* If called for a motion command, stop now. */
		if (dp == NULL)
			goto usage;

		/*
		 * !!!
		 * If a '.' is immediately entered after an undo command, we
		 * replay the log instead of redoing the last command.  This
		 * is necessary because 'u' can't set the dot command -- see
		 * vi/v_undo.c:v_undo for details.
		 */
		if (VIP(sp)->u_ccnt == sp->ccnt) {
			vp->kp = &vikeys['u'];
			F_SET(vp, VC_ISDOT);
			return (GC_OK);
		}

		/* Otherwise, a repeatable command must have been executed. */
		if (!F_ISSET(dp, VC_ISDOT)) {
			msgq(sp, M_ERR, "208|No command to repeat");
			return (GC_ERR);
		}

		/* Set new count/buffer, if any, and return. */
		if (F_ISSET(vp, VC_C1SET)) {
			F_SET(dp, VC_C1SET);
			dp->count = vp->count;
		}
		if (F_ISSET(vp, VC_BUFFER))
			dp->buffer = vp->buffer;

		*vp = *dp;
		return (GC_OK);
	}

	/* Set the flags based on the command flags. */
	flags = kp->flags;

	/* Check for illegal count. */
	if (F_ISSET(vp, VC_C1SET) && !LF_ISSET(V_CNT))
		goto usage;

	/* Illegal motion command. */
	if (ismotion == NULL) {
		/* Illegal buffer. */
		if (!LF_ISSET(V_OBUF) && F_ISSET(vp, VC_BUFFER))
			goto usage;

		/* Required buffer. */
		if (LF_ISSET(V_RBUF)) {
			KEY(vp->buffer, 0);
			F_SET(vp, VC_BUFFER);
		}
	}

	/*
	 * Special case: '[', ']' and 'Z' commands.  Doesn't the fact that
	 * the *single* characters don't mean anything but the *doubled*
	 * characters do, just frost your shorts?
	 */
	if (vp->key == '[' || vp->key == ']' || vp->key == 'Z') {
		/*
		 * Historically, half entered [[, ]] or Z commands weren't
		 * cancelled by <escape>, the terminal was beeped instead.
		 * POSIX.2-1992 probably didn't notice, and requires that
		 * they be cancelled instead of beeping.  Seems fine to me.
		 *
		 * Don't set the EC_MAPCOMMAND flag, apparently ] is a popular
		 * vi meta-character, and we don't want the user to wait while
		 * we time out a possible mapping.  This *appears* to match
		 * historic vi practice, but with mapping characters, You Just
		 * Never Know.
		 */
		KEY(key, 0);

		if (vp->key != key) {
usage:			if (ismotion == NULL)
				s = kp->usage;
			else if (ismotion->key == '~' && O_ISSET(sp, O_TILDEOP))
				s = tmotion.usage;
			else
				s = vikeys[ismotion->key].usage;
			v_emsg(sp, s, VIM_USAGE);
			return (GC_ERR);
		}
	}
	/* Special case: 'z' command. */
	if (vp->key == 'z') {
		KEY(vp->character, 0);
		if (ISDIGIT(vp->character)) {
			if (v_count(sp, vp, vp->character, &vp->count2))
				return (GC_ERR);
			F_SET(vp, VC_C2SET);
			KEY(vp->character, 0);
		}
	}

	/*
	 * Commands that have motion components can be doubled to imply the
	 * current line.
	 */
	if (ismotion != NULL && ismotion->key != key && !LF_ISSET(V_MOVE)) {
		msgq(sp, M_ERR, "210|%s may not be used as a motion command",
		    KEY_NAME(sp, key));
		return (GC_ERR);
	}

	/* Pick up required trailing character. */
	if (LF_ISSET(V_CHAR))
		KEY(vp->character, 0);

	/* Get any associated cursor word. */
	if (F_ISSET(kp, V_KEYW) && v_curword(sp))
		return (GC_ERR);

	return (GC_OK);

esc:	switch (cpart) {
	case COMMANDMODE:
		msgq(sp, M_BERR, "211|Already in command mode");
		return (GC_ERR_NOFLUSH);
	case ISPARTIAL:
		break;
	case NOTPARTIAL:
		(void)sp->gp->scr_bell(sp);
		break;
	}
	return (GC_ERR);
}
Example #14
DB *
__rec_open(const char *fname, int flags, mode_t mode, const RECNOINFO *openinfo,
    int dflags)
{
	BTREE *t;
	BTREEINFO btopeninfo;
	DB *dbp;
	PAGE *h;
	struct stat sb;
	int rfd = -1;	/* pacify gcc */
	int sverrno;

	dbp = NULL;
	/* Open the user's file -- if this fails, we're done. */
	if (fname != NULL) {
		if ((rfd = open(fname, flags | O_CLOEXEC, mode)) == -1)
			return (NULL);
	}

	/* Create a btree in memory (backed by disk). */
	if (openinfo) {
		if (openinfo->flags & ~(R_FIXEDLEN | R_NOKEY | R_SNAPSHOT))
			goto einval;
		btopeninfo.flags = 0;
		btopeninfo.cachesize = openinfo->cachesize;
		btopeninfo.maxkeypage = 0;
		btopeninfo.minkeypage = 0;
		btopeninfo.psize = openinfo->psize;
		btopeninfo.compare = NULL;
		btopeninfo.prefix = NULL;
		btopeninfo.lorder = openinfo->lorder;
		dbp = __bt_open(openinfo->bfname,
		    O_RDWR, S_IRUSR | S_IWUSR, &btopeninfo, dflags);
	} else
		dbp = __bt_open(NULL, O_RDWR, S_IRUSR | S_IWUSR, NULL, dflags);
	if (dbp == NULL)
		goto err;

	/*
	 * Some fields in the tree structure are recno specific.  Fill them
	 * in and make the btree structure look like a recno structure.  We
	 * don't change the bt_ovflsize value, it's close enough and slightly
	 * bigger.
	 */
	t = dbp->internal;
	if (openinfo) {
		if (openinfo->flags & R_FIXEDLEN) {
			F_SET(t, R_FIXLEN);
			t->bt_reclen = openinfo->reclen;
			if (t->bt_reclen == 0)
				goto einval;
		}
		t->bt_bval = openinfo->bval;
	} else
		t->bt_bval = '\n';

	F_SET(t, R_RECNO);
	if (fname == NULL)
		F_SET(t, R_EOF | R_INMEM);
	else
		t->bt_rfd = rfd;

	if (fname != NULL) {
		/*
		 * In 4.4BSD, stat(2) returns true for ISSOCK on pipes.
		 * Unfortunately, that's not portable, so we use lseek
		 * and check the errno values.
		 */
		errno = 0;
		if (lseek(rfd, (off_t)0, SEEK_CUR) == -1 && errno == ESPIPE) {
			switch (flags & O_ACCMODE) {
			case O_RDONLY:
				F_SET(t, R_RDONLY);
				break;
			default:
				goto einval;
			}
slow:			if ((t->bt_rfp = fdopen(rfd, "r")) == NULL)
				goto err;
			F_SET(t, R_CLOSEFP);
			t->bt_irec =
			    F_ISSET(t, R_FIXLEN) ? __rec_fpipe : __rec_vpipe;
		} else {
			switch (flags & O_ACCMODE) {
			case O_RDONLY:
				F_SET(t, R_RDONLY);
				break;
			case O_RDWR:
				break;
			default:
				goto einval;
			}

			if (fstat(rfd, &sb))
				goto err;
			/*
			 * Kluge -- we'd like to test to see if the file is too
			 * big to mmap.  Since, we don't know what size or type
			 * off_t's or size_t's are, what the largest unsigned
			 * integral type is, or what random insanity the local
			 * C compiler will perpetrate, doing the comparison in
			 * a portable way is flatly impossible.  Hope that mmap
			 * fails if the file is too large.
			 */
			if (sb.st_size == 0)
				F_SET(t, R_EOF);
			else {
#ifdef MMAP_NOT_AVAILABLE
				/*
				 * XXX
				 * Mmap doesn't work correctly on many current
				 * systems.  In particular, it can fail subtly,
				 * with cache coherency problems.  Don't use it
				 * for now.
				 */
				t->bt_msize = sb.st_size;
				if ((t->bt_smap = mmap(NULL, t->bt_msize,
				    PROT_READ, MAP_FILE | MAP_PRIVATE, rfd,
				    (off_t)0)) == (caddr_t)-1)
					goto slow;
				t->bt_cmap = t->bt_smap;
				t->bt_emap = t->bt_smap + sb.st_size;
				t->bt_irec = F_ISSET(t, R_FIXLEN) ?
				    __rec_fmap : __rec_vmap;
				F_SET(t, R_MEMMAPPED);
#else
				goto slow;
#endif
			}
		}
	}

	/* Use the recno routines. */
	dbp->close = __rec_close;
	dbp->del = __rec_delete;
	dbp->fd = __rec_fd;
	dbp->get = __rec_get;
	dbp->put = __rec_put;
	dbp->seq = __rec_seq;
	dbp->sync = __rec_sync;

	/* If the root page was created, reset the flags. */
	if ((h = mpool_get(t->bt_mp, P_ROOT, 0)) == NULL)
		goto err;
	if ((h->flags & P_TYPE) == P_BLEAF) {
		F_CLR(h, P_TYPE);
		F_SET(h, P_RLEAF);
		mpool_put(t->bt_mp, h, MPOOL_DIRTY);
	} else
		mpool_put(t->bt_mp, h, 0);

	if (openinfo && (openinfo->flags & R_SNAPSHOT) &&
	    !F_ISSET(t, R_EOF | R_INMEM) &&
	    t->bt_irec(t, MAX_REC_NUMBER) == RET_ERROR)
		goto err;
	return (dbp);

einval:	errno = EINVAL;
err:	sverrno = errno;
	if (dbp != NULL)
		(void)__bt_close(dbp);
	if (fname != NULL)
		(void)close(rfd);
	errno = sverrno;
	return (NULL);
}
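The lseek/ESPIPE check above is the portable way to detect a pipe. A minimal standalone sketch, with a helper name of our own choosing:

#include <errno.h>
#include <unistd.h>

/*
 * is_seekable --
 *	Return 1 if the descriptor supports seeking, 0 if it's a pipe.
 *	A pipe or FIFO fails lseek(2) with ESPIPE; a regular file
 *	succeeds, making it a candidate for mmap.
 */
static int
is_seekable(int fd)
{
	errno = 0;
	if (lseek(fd, (off_t)0, SEEK_CUR) == -1 && errno == ESPIPE)
		return (0);		/* pipe: fall back to stdio reads */
	return (1);
}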
Example #15
/*
 * cl_suspend --
 *	Suspend a screen.
 *
 * PUBLIC: int cl_suspend __P((SCR *, int *));
 */
int
cl_suspend(SCR *sp, int *allowedp)
{
	struct termios t;
	CL_PRIVATE *clp;
	WINDOW *win;
	GS *gp;
	size_t y, x;
	int changed;

	gp = sp->gp;
	clp = CLP(sp);
	win = CLSP(sp) ? CLSP(sp) : stdscr;
	*allowedp = 1;

	/*
	 * The ex implementation of this function isn't needed by screens not
	 * supporting ex commands that require full terminal canonical mode
	 * (e.g. :suspend).
	 *
	 * The vi implementation of this function isn't needed by screens not
	 * supporting vi process suspension, i.e. any screen that isn't backed
	 * by a UNIX shell.
	 *
	 * Setting allowedp to 0 will cause the editor to reject the command.
	 */
	if (F_ISSET(sp, SC_EX)) { 
		/* Save the terminal settings, and restore the original ones. */
		if (F_ISSET(clp, CL_STDIN_TTY)) {
			(void)tcgetattr(STDIN_FILENO, &t);
			(void)tcsetattr(STDIN_FILENO,
			    TCSASOFT | TCSADRAIN, &clp->orig);
		}

		/* Stop the process group. */
		(void)kill(0, SIGTSTP);

		/* Time passes ... */

		/* Restore terminal settings. */
		if (F_ISSET(clp, CL_STDIN_TTY))
			(void)tcsetattr(STDIN_FILENO, TCSASOFT | TCSADRAIN, &t);
		return (0);
	}

	/*
	 * Move to the lower left-hand corner of the screen.
	 *
	 * XXX
	 * Not sure this is necessary in System V implementations, but it
	 * shouldn't hurt.
	 */
	getyx(win, y, x);
	(void)wmove(win, LINES - 1, 0);
	(void)wrefresh(win);

	/*
	 * Temporarily end the screen.  System V introduced a semantic where
	 * endwin() could be restarted.  We use it because restarting curses
	 * from scratch often fails in System V.  4BSD curses didn't support
	 * restarting after endwin(), so we have to do what clean up we can
	 * without calling it.
	 */
	/* Save the terminal settings. */
	(void)tcgetattr(STDIN_FILENO, &t);

	/* Restore the cursor keys to normal mode. */
	(void)keypad(stdscr, FALSE);

	/* Restore the window name. */
	(void)cl_rename(sp, NULL, 0);

#ifdef HAVE_BSD_CURSES
	(void)cl_attr(sp, SA_ALTERNATE, 0);
#else
	(void)endwin();
#endif
	/*
	 * XXX
	 * Restore the original terminal settings.  This is bad -- the
	 * reset can cause character loss from the tty queue.  However,
	 * we can't call endwin() in BSD curses implementations, and too
	 * many System V curses implementations don't get it right.
	 */
	(void)tcsetattr(STDIN_FILENO, TCSADRAIN | TCSASOFT, &clp->orig);

	/* Stop the process group. */
	(void)kill(0, SIGTSTP);

	/* Time passes ... */

	/*
	 * If we received a killer signal, we're done.  Leave everything
	 * unchanged.  In addition, the terminal has already been reset
	 * correctly, so leave it alone.
	 */
	if (clp->killersig) {
		F_CLR(clp, CL_SCR_EX_INIT | CL_SCR_VI_INIT);
		return (0);
	}

	/* Restore terminal settings. */
	wrefresh(win);			    /* Needed on SunOS/Solaris? */
	if (F_ISSET(clp, CL_STDIN_TTY))
		(void)tcsetattr(STDIN_FILENO, TCSASOFT | TCSADRAIN, &t);

#ifdef HAVE_BSD_CURSES
	(void)cl_attr(sp, SA_ALTERNATE, 1);
#endif

	/* Set the window name. */
	(void)cl_rename(sp, sp->frp->name, 1);

	/* Put the cursor keys into application mode. */
	(void)keypad(stdscr, TRUE);

	/* Refresh and repaint the screen. */
	(void)wmove(win, y, x);
	(void)cl_refresh(sp, 1);

	/* If the screen changed size, set the SIGWINCH bit. */
	if (cl_ssize(sp, 1, NULL, NULL, &changed))
		return (1);
	if (changed)
		F_SET(CLP(sp), CL_SIGWINCH);

	return (0);
}
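The suspend protocol above (save the current modes, restore the startup modes, SIGTSTP, reinstate the saved modes) is reusable outside curses. A minimal sketch, assuming the caller captured the startup termios in "orig" and leaving out the BSD-only TCSASOFT flag:

#include <signal.h>
#include <termios.h>
#include <unistd.h>

/*
 * suspend_tty --
 *	Save the current terminal modes, restore the startup modes,
 *	stop the process group, and reinstate the saved modes when the
 *	job is continued.
 */
static int
suspend_tty(const struct termios *orig)
{
	struct termios cur;

	if (tcgetattr(STDIN_FILENO, &cur) == -1)
		return (-1);
	(void)tcsetattr(STDIN_FILENO, TCSADRAIN, orig);

	(void)kill(0, SIGTSTP);

	/* Time passes ... */

	return (tcsetattr(STDIN_FILENO, TCSADRAIN, &cur));
}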
Example #16
/*
 * __clsm_open_cursors --
 *	Open cursors for the current set of files.
 */
static int
__clsm_open_cursors(
    WT_CURSOR_LSM *clsm, bool update, u_int start_chunk, uint32_t start_id)
{
	WT_BTREE *btree;
	WT_CURSOR *c, **cp, *primary;
	WT_DECL_RET;
	WT_LSM_CHUNK *chunk;
	WT_LSM_TREE *lsm_tree;
	WT_SESSION_IMPL *session;
	WT_TXN *txn;
	const char *checkpoint, *ckpt_cfg[3];
	uint64_t saved_gen;
	u_int i, nchunks, ngood, nupdates;
	u_int close_range_end, close_range_start;
	bool locked;

	c = &clsm->iface;
	session = (WT_SESSION_IMPL *)c->session;
	txn = &session->txn;
	chunk = NULL;
	locked = false;
	lsm_tree = clsm->lsm_tree;

	/*
	 * Ensure that any snapshot update has cursors on the right set of
	 * chunks to guarantee visibility is correct.
	 */
	if (update && txn->isolation == WT_ISO_SNAPSHOT)
		F_SET(clsm, WT_CLSM_OPEN_SNAPSHOT);

	/*
	 * Query operations need a full set of cursors. Overwrite cursors
	 * do queries in service of updates.
	 */
	if (!update || !F_ISSET(c, WT_CURSTD_OVERWRITE))
		F_SET(clsm, WT_CLSM_OPEN_READ);

	if (lsm_tree->nchunks == 0)
		return (0);

	ckpt_cfg[0] = WT_CONFIG_BASE(session, WT_SESSION_open_cursor);
	ckpt_cfg[1] = "checkpoint=" WT_CHECKPOINT ",raw";
	ckpt_cfg[2] = NULL;

	/*
	 * If the key is pointing to memory that is pinned by a chunk
	 * cursor, take a copy before closing cursors.
	 */
	if (F_ISSET(c, WT_CURSTD_KEY_INT))
		WT_CURSOR_NEEDKEY(c);

	F_CLR(clsm, WT_CLSM_ITERATE_NEXT | WT_CLSM_ITERATE_PREV);

	WT_RET(__wt_lsm_tree_readlock(session, lsm_tree));
	locked = true;

	/* Merge cursors have already figured out how many chunks they need. */
retry:	if (F_ISSET(clsm, WT_CLSM_MERGE)) {
		nchunks = clsm->nchunks;
		ngood = 0;

		/*
		 * We may have raced with another merge completing.  Check that
		 * we're starting at the right offset in the chunk array.
		 */
		if (start_chunk >= lsm_tree->nchunks ||
		    lsm_tree->chunk[start_chunk]->id != start_id) {
			for (start_chunk = 0;
			    start_chunk < lsm_tree->nchunks;
			    start_chunk++) {
				chunk = lsm_tree->chunk[start_chunk];
				if (chunk->id == start_id)
					break;
			}
			/* We have to find the start chunk: merge locked it. */
			WT_ASSERT(session, start_chunk < lsm_tree->nchunks);
		}

		WT_ASSERT(session, start_chunk + nchunks <= lsm_tree->nchunks);
	} else {
		nchunks = lsm_tree->nchunks;

		/*
		 * If we are only opening the cursor for updates, only open the
		 * primary chunk, plus any other chunks that might be required
		 * to detect snapshot isolation conflicts.
		 */
		if (F_ISSET(clsm, WT_CLSM_OPEN_SNAPSHOT))
			WT_ERR(__wt_realloc_def(session,
			    &clsm->txnid_alloc, nchunks,
			    &clsm->switch_txn));
		if (F_ISSET(clsm, WT_CLSM_OPEN_READ))
			ngood = nupdates = 0;
		else if (F_ISSET(clsm, WT_CLSM_OPEN_SNAPSHOT)) {
			/*
			 * Keep going until all updates in the next
			 * chunk are globally visible.  Copy the maximum
			 * transaction IDs into the cursor as we go.
			 */
			for (ngood = nchunks - 1, nupdates = 1;
			    ngood > 0;
			    ngood--, nupdates++) {
				chunk = lsm_tree->chunk[ngood - 1];
				clsm->switch_txn[ngood - 1] = chunk->switch_txn;
				if (__wt_txn_visible_all(
				    session, chunk->switch_txn))
					break;
			}
		} else {
			nupdates = 1;
			ngood = nchunks - 1;
		}

		/* Check how many cursors are already open. */
		for (cp = clsm->cursors + ngood;
		    ngood < clsm->nchunks && ngood < nchunks;
		    cp++, ngood++) {
			chunk = lsm_tree->chunk[ngood];

			/* If the cursor isn't open yet, we're done. */
			if (*cp == NULL)
				break;

			/* Easy case: the URIs don't match. */
			if (strcmp((*cp)->uri, chunk->uri) != 0)
				break;

			/* Make sure the checkpoint config matches. */
			checkpoint = ((WT_CURSOR_BTREE *)*cp)->
			    btree->dhandle->checkpoint;
			if (checkpoint == NULL &&
			    F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) &&
			    !chunk->empty)
				break;

			/* Make sure the Bloom config matches. */
			if (clsm->blooms[ngood] == NULL &&
			    F_ISSET(chunk, WT_LSM_CHUNK_BLOOM))
				break;
		}

		/* Spurious generation bump? */
		if (ngood == clsm->nchunks && clsm->nchunks == nchunks) {
			clsm->dsk_gen = lsm_tree->dsk_gen;
			goto err;
		}

		/*
		 * Close any cursors we no longer need.
		 *
		 * Drop the LSM tree lock while we do this: if the cache is
		 * full, we may block while closing a cursor.  Save the
		 * generation number and retry if it has changed under us.
		 */
		if (clsm->cursors != NULL && ngood < clsm->nchunks) {
			close_range_start = ngood;
			close_range_end = clsm->nchunks;
		} else if (!F_ISSET(clsm, WT_CLSM_OPEN_READ) && nupdates > 0) {
			close_range_start = 0;
			close_range_end = WT_MIN(nchunks, clsm->nchunks);
			if (close_range_end > nupdates)
				close_range_end -= nupdates;
			else
				close_range_end = 0;
			WT_ASSERT(session, ngood >= close_range_end);
		} else {
			close_range_end = 0;
			close_range_start = 0;
		}
		if (close_range_end > close_range_start) {
			saved_gen = lsm_tree->dsk_gen;
			locked = false;
			WT_ERR(__wt_lsm_tree_readunlock(session, lsm_tree));
			WT_ERR(__clsm_close_cursors(
			    clsm, close_range_start, close_range_end));
			WT_ERR(__wt_lsm_tree_readlock(session, lsm_tree));
			locked = true;
			if (lsm_tree->dsk_gen != saved_gen)
				goto retry;
		}

		/* Detach from our old primary. */
		clsm->primary_chunk = NULL;
		clsm->current = NULL;
	}

	WT_ERR(__wt_realloc_def(session,
	    &clsm->bloom_alloc, nchunks, &clsm->blooms));
	WT_ERR(__wt_realloc_def(session,
	    &clsm->cursor_alloc, nchunks, &clsm->cursors));

	clsm->nchunks = nchunks;

	/* Open the cursors for chunks that have changed. */
	for (i = ngood, cp = clsm->cursors + i; i != nchunks; i++, cp++) {
		chunk = lsm_tree->chunk[i + start_chunk];
		/* Copy the maximum transaction ID. */
		if (F_ISSET(clsm, WT_CLSM_OPEN_SNAPSHOT))
			clsm->switch_txn[i] = chunk->switch_txn;

		/*
		 * Read from the checkpoint if the file has been written.
		 * Once all cursors switch, the in-memory tree can be evicted.
		 */
		WT_ASSERT(session, *cp == NULL);
		ret = __wt_open_cursor(session, chunk->uri, c,
		    (F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) && !chunk->empty) ?
			ckpt_cfg : NULL, cp);

		/*
		 * XXX kludge: we may have an empty chunk where no checkpoint
		 * was written.  If so, try to open the ordinary handle on that
		 * chunk instead.
		 */
		if (ret == WT_NOTFOUND && F_ISSET(chunk, WT_LSM_CHUNK_ONDISK)) {
			ret = __wt_open_cursor(
			    session, chunk->uri, c, NULL, cp);
			if (ret == 0)
				chunk->empty = 1;
		}
		WT_ERR(ret);

		/*
		 * Setup all cursors other than the primary to only do conflict
		 * checks on insert operations. This allows us to execute
		 * inserts on non-primary chunks as a way of checking for
		 * write conflicts with concurrent updates.
		 */
		if (i != nchunks - 1)
			(*cp)->insert = __wt_curfile_update_check;

		if (!F_ISSET(clsm, WT_CLSM_MERGE) &&
		    F_ISSET(chunk, WT_LSM_CHUNK_BLOOM))
			WT_ERR(__wt_bloom_open(session, chunk->bloom_uri,
			    lsm_tree->bloom_bit_count,
			    lsm_tree->bloom_hash_count,
			    c, &clsm->blooms[i]));

		/* Child cursors always use overwrite and raw mode. */
		F_SET(*cp, WT_CURSTD_OVERWRITE | WT_CURSTD_RAW);
	}

	/* The last chunk is our new primary. */
	if (chunk != NULL &&
	    !F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) &&
	    chunk->switch_txn == WT_TXN_NONE) {
		clsm->primary_chunk = chunk;
		primary = clsm->cursors[clsm->nchunks - 1];
		/*
		 * Disable eviction for the in-memory chunk.  Also clear the
		 * bulk load flag here, otherwise eviction will be enabled by
		 * the first update.
		 */
		btree = ((WT_CURSOR_BTREE *)(primary))->btree;
		if (btree->bulk_load_ok) {
			btree->bulk_load_ok = false;
			WT_WITH_BTREE(session, btree,
			    __wt_btree_evictable(session, false));
		}
	}

	clsm->dsk_gen = lsm_tree->dsk_gen;

err:
#ifdef HAVE_DIAGNOSTIC
	/* Check that all cursors are open as expected. */
	if (ret == 0 && F_ISSET(clsm, WT_CLSM_OPEN_READ)) {
		for (i = 0, cp = clsm->cursors; i != clsm->nchunks; cp++, i++) {
			chunk = lsm_tree->chunk[i + start_chunk];

			/* Make sure the cursor is open. */
			WT_ASSERT(session, *cp != NULL);

			/* Easy case: the URIs should match. */
			WT_ASSERT(session, strcmp((*cp)->uri, chunk->uri) == 0);

			/* Make sure the checkpoint config matches. */
			checkpoint = ((WT_CURSOR_BTREE *)*cp)->
			    btree->dhandle->checkpoint;
			WT_ASSERT(session,
			    (F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) &&
			    !chunk->empty) ?
			    checkpoint != NULL : checkpoint == NULL);

			/* Make sure the Bloom config matches. */
			WT_ASSERT(session,
			    (F_ISSET(chunk, WT_LSM_CHUNK_BLOOM) &&
			    !F_ISSET(clsm, WT_CLSM_MERGE)) ?
			    clsm->blooms[i] != NULL : clsm->blooms[i] == NULL);
		}
	}
#endif
	if (locked)
		WT_TRET(__wt_lsm_tree_readunlock(session, lsm_tree));
	return (ret);
}
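The close-cursors step above uses a classic lock-avoidance pattern: record a generation number, drop the lock for blocking work, retake the lock, and retry if the generation moved underneath us. A stripped-down sketch with illustrative types, not WiredTiger APIs:

#include <pthread.h>
#include <stdint.h>

struct gen_tree {
	pthread_mutex_t lock;
	uint64_t gen;			/* bumped when the chunk set changes */
};

/* Stand-in for work that may block and so must not hold the lock. */
static void slow_close_work(struct gen_tree *t) { (void)t; }

/*
 * replan_until_stable --
 *	Plan under the lock, drop the lock for blocking work, then
 *	verify the generation before trusting the plan; retry if a
 *	concurrent change moved it.
 */
static void
replan_until_stable(struct gen_tree *t)
{
	uint64_t saved_gen;

	for (;;) {
		pthread_mutex_lock(&t->lock);
		saved_gen = t->gen;
		pthread_mutex_unlock(&t->lock);

		slow_close_work(t);	/* lock not held: safe to block */

		pthread_mutex_lock(&t->lock);
		if (t->gen == saved_gen) {
			pthread_mutex_unlock(&t->lock);
			return;		/* no race: the plan is valid */
		}
		pthread_mutex_unlock(&t->lock);	/* raced: recompute */
	}
}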
Example #17
/*
 * __drop_table --
 *	WT_SESSION::drop for a table.
 */
static int
__drop_table(
    WT_SESSION_IMPL *session, const char *uri, const char *cfg[])
{
	WT_COLGROUP *colgroup;
	WT_DECL_RET;
	WT_INDEX *idx;
	WT_TABLE *table;
	u_int i;
	const char *name;
	bool tracked;

	WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_TABLE_WRITE));

	name = uri;
	WT_PREFIX_SKIP_REQUIRED(session, name, "table:");

	table = NULL;
	tracked = false;

	/*
	 * Open the table so we can drop its column groups and indexes.
	 *
	 * Ideally we would keep the table locked exclusive across the drop,
	 * but for now we rely on the global table lock to prevent the table
	 * being reopened while it is being dropped.  One issue is that the
	 * WT_WITHOUT_LOCKS macro can drop and reacquire the global table lock,
	 * avoiding deadlocks while waiting for LSM operation to quiesce.
	 *
	 * Temporarily getting the table exclusively serves the purpose
	 * of ensuring that cursors on the table that are already open
	 * must at least be closed before this call proceeds.
	 */
	WT_ERR(__wt_schema_get_table_uri(session, uri, true,
	    WT_DHANDLE_EXCLUSIVE, &table));
	WT_ERR(__wt_schema_release_table(session, table));
	WT_ERR(__wt_schema_get_table_uri(session, uri, true, 0, &table));

	/* Drop the column groups. */
	for (i = 0; i < WT_COLGROUPS(table); i++) {
		if ((colgroup = table->cgroups[i]) == NULL)
			continue;
		/*
		 * Drop the column group before updating the metadata to avoid
		 * the metadata for the table becoming inconsistent if we can't
		 * get exclusive access.
		 */
		WT_ERR(__wt_schema_drop(session, colgroup->source, cfg));
		WT_ERR(__wt_metadata_remove(session, colgroup->name));
	}

	/* Drop the indices. */
	WT_ERR(__wt_schema_open_indices(session, table));
	for (i = 0; i < table->nindices; i++) {
		if ((idx = table->indices[i]) == NULL)
			continue;
		/*
		 * Drop the index before updating the metadata to avoid
		 * the metadata for the table becoming inconsistent if we can't
		 * get exclusive access.
		 */
		WT_ERR(__wt_schema_drop(session, idx->source, cfg));
		WT_ERR(__wt_metadata_remove(session, idx->name));
	}

	/* Make sure the table data handle is closed. */
	WT_TRET(__wt_schema_release_table(session, table));
	WT_ERR(__wt_schema_get_table_uri(
	    session, uri, true, WT_DHANDLE_EXCLUSIVE, &table));
	F_SET(&table->iface, WT_DHANDLE_DISCARD);
	if (WT_META_TRACKING(session)) {
		WT_WITH_DHANDLE(session, &table->iface,
		    ret = __wt_meta_track_handle_lock(session, false));
		WT_ERR(ret);
		tracked = true;
	}

	/* Remove the metadata entry (ignore missing items). */
	WT_ERR(__wt_metadata_remove(session, uri));

err:	if (table != NULL && !tracked)
		WT_TRET(__wt_schema_release_table(session, table));
	return (ret);
}
Example #18
/*
 * __wt_evict_file --
 *	Discard pages for a specific file.
 */
int
__wt_evict_file(WT_SESSION_IMPL *session, int syncop)
{
	WT_DECL_RET;
	WT_PAGE *page;
	WT_REF *next_ref, *ref;
	int evict_reset;

	/*
	 * We need exclusive access to the file -- disable ordinary eviction
	 * and drain any blocks already queued.
	 */
	WT_RET(__wt_evict_file_exclusive_on(session, &evict_reset));

	/* Make sure the oldest transaction ID is up-to-date. */
	__wt_txn_update_oldest(session, 1);

	/* Walk the tree, discarding pages. */
	next_ref = NULL;
	WT_ERR(__wt_tree_walk(session, &next_ref, NULL,
	    WT_READ_CACHE | WT_READ_NO_EVICT));
	while ((ref = next_ref) != NULL) {
		page = ref->page;

		/*
		 * Eviction can fail when a page in the evicted page's subtree
		 * switches state.  For example, if we don't evict a page marked
		 * empty, because we expect it to be merged into its parent, it
		 * might no longer be empty after it's reconciled, in which case
		 * eviction of its parent would fail.  We can either walk the
		 * tree multiple times (until it's finally empty), or reconcile
		 * each page to get it to its final state before considering if
		 * it's an eviction target or will be merged into its parent.
		 *
		 * Don't limit this test to any particular page type, that tends
		 * to introduce bugs when the reconciliation of other page types
		 * changes, and there's no advantage to doing so.
		 *
		 * Eviction can also fail because an update cannot be written.
		 * If sessions have disjoint sets of files open, updates in a
		 * no-longer-referenced file may not yet be globally visible,
		 * and the write will fail with EBUSY.  Our caller handles that
		 * error, retrying later.
		 */
		if (syncop == WT_SYNC_CLOSE && __wt_page_is_modified(page))
			WT_ERR(__wt_reconcile(session, ref, NULL, WT_EVICTING));

		/*
		 * We can't evict the page just returned to us (it marks our
		 * place in the tree), so move the walk to one page ahead of
		 * the page being evicted.  Note, we reconciled the returned
		 * page first: if reconciliation of that page were to change
		 * the shape of the tree, and we did the next walk call before
		 * the reconciliation, the next walk call could miss a page in
		 * the tree.
		 */
		WT_ERR(__wt_tree_walk(session, &next_ref, NULL,
		    WT_READ_CACHE | WT_READ_NO_EVICT));

		switch (syncop) {
		case WT_SYNC_CLOSE:
			/*
			 * Evict the page.
			 */
			WT_ERR(__wt_evict(session, ref, WT_EVICT_EXCLUSIVE));
			break;
		case WT_SYNC_DISCARD:
			WT_ASSERT(session,
			    __wt_page_can_evict(session, page, 0, NULL));
			__wt_evict_page_clean_update(session, ref);
			break;
		case WT_SYNC_DISCARD_FORCE:
			/*
			 * Forced discard of the page, whether clean or dirty.
			 * If we see a dirty page in a forced discard, clean
			 * the page, both to keep statistics correct, and to
			 * let the page-discard function assert no dirty page
			 * is ever discarded.
			 */
			if (__wt_page_is_modified(page)) {
				page->modify->write_gen = 0;
				__wt_cache_dirty_decr(session, page);
			}

			F_SET(session, WT_SESSION_DISCARD_FORCE);
			__wt_evict_page_clean_update(session, ref);
			F_CLR(session, WT_SESSION_DISCARD_FORCE);
			break;
		WT_ILLEGAL_VALUE_ERR(session);
		}
	}

	if (0) {
err:		/* On error, clear any left-over tree walk. */
		if (next_ref != NULL)
			WT_TRET(__wt_page_release(
			    session, next_ref, WT_READ_NO_EVICT));
	}

	if (evict_reset)
		__wt_evict_file_exclusive_off(session);

	return (ret);
}
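The walk-ahead idiom above (advance the traversal past the page before evicting it, since the page marks the walk's place) is the same discipline as the textbook safe list-destruction loop. A minimal sketch, with a list type of our own:

#include <stdlib.h>

struct node {
	struct node *next;
};

/*
 * destroy_all --
 *	Step the traversal past the current node before freeing it, so
 *	the walk never depends on memory it is about to discard.
 */
static void
destroy_all(struct node *head)
{
	struct node *node, *next;

	for (node = head; node != NULL; node = next) {
		next = node->next;	/* advance first ... */
		free(node);		/* ... then discard safely */
	}
}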
Example #19
/*
 * __wt_btcur_prev --
 *	Move to the previous record in the tree.
 */
int
__wt_btcur_prev(WT_CURSOR_BTREE *cbt, int truncating)
{
	WT_DECL_RET;
	WT_PAGE *page;
	WT_SESSION_IMPL *session;
	uint32_t flags;
	int skipped, newpage;

	session = (WT_SESSION_IMPL *)cbt->iface.session;

	WT_STAT_FAST_CONN_INCR(session, cursor_prev);
	WT_STAT_FAST_DATA_INCR(session, cursor_prev);

	flags = WT_READ_PREV | WT_READ_SKIP_INTL;	/* Tree walk flags. */
	if (truncating)
		LF_SET(WT_READ_TRUNCATE);

	WT_RET(__cursor_func_init(cbt, 0));

	/*
	 * If we aren't already iterating in the right direction, there's
	 * some setup to do.
	 */
	if (!F_ISSET(cbt, WT_CBT_ITERATE_PREV))
		__wt_btcur_iterate_setup(cbt, 0);

	/*
	 * Walk any page we're holding until the underlying call returns not-
	 * found.  Then, move to the previous page, until we reach the start
	 * of the file.
	 */
	for (skipped = newpage = 0;; skipped = 0, newpage = 1) {
		page = cbt->ref == NULL ? NULL : cbt->ref->page;
		WT_ASSERT(session, page == NULL || !WT_PAGE_IS_INTERNAL(page));

		/*
		 * The last page in a column-store has appended entries.
		 * We handle it separately from the usual cursor code:
		 * it's only that one page and it's in a simple format.
		 */
		if (newpage && page != NULL && page->type != WT_PAGE_ROW_LEAF &&
		    (cbt->ins_head = WT_COL_APPEND(page)) != NULL)
			F_SET(cbt, WT_CBT_ITERATE_APPEND);

		if (F_ISSET(cbt, WT_CBT_ITERATE_APPEND)) {
			switch (page->type) {
			case WT_PAGE_COL_FIX:
				ret = __cursor_fix_append_prev(cbt, newpage);
				break;
			case WT_PAGE_COL_VAR:
				ret = __cursor_var_append_prev(
				    cbt, newpage, &skipped);
				break;
			WT_ILLEGAL_VALUE_ERR(session);
			}
			if (ret == 0)
				break;
			F_CLR(cbt, WT_CBT_ITERATE_APPEND);
			if (ret != WT_NOTFOUND)
				break;
			newpage = 1;
		}
		if (page != NULL) {
			switch (page->type) {
			case WT_PAGE_COL_FIX:
				ret = __cursor_fix_prev(cbt, newpage);
				break;
			case WT_PAGE_COL_VAR:
				ret = __cursor_var_prev(cbt, newpage, &skipped);
				break;
			case WT_PAGE_ROW_LEAF:
				ret = __cursor_row_prev(cbt, newpage, &skipped);
				break;
			WT_ILLEGAL_VALUE_ERR(session);
			}
			if (ret != WT_NOTFOUND)
				break;
		}

		if (newpage && skipped)
			page->read_gen = WT_READGEN_OLDEST;

		WT_ERR(__wt_tree_walk(session, &cbt->ref, flags));
		WT_ERR_TEST(cbt->ref == NULL, WT_NOTFOUND);
	}

err:	if (ret != 0)
		WT_TRET(__cursor_reset(cbt));
	return (ret);
}
Example #20
/*
 * ex_aci --
 *	Append, change, insert in ex.
 */
static int
ex_aci(SCR *sp, EXCMD *cmdp, enum which cmd)
{
	CHAR_T *p, *t;
	GS *gp;
	TEXT *tp;
	TEXTH tiq[] = {{ 0 }};
	recno_t cnt = 0, lno;
	size_t len;
	u_int32_t flags;
	int need_newline;

	gp = sp->gp;
	NEEDFILE(sp, cmdp);

	/*
	 * If doing a change, replace lines for as long as possible.  Then,
	 * append more lines or delete remaining lines.  Changes to an empty
	 * file are appends, inserts are the same as appends to the previous
	 * line.
	 *
	 * !!!
	 * Set the address to which we'll append.  We set sp->lno to this
	 * address as well so that autoindent works correctly when get text
	 * from the user.
	 */
	lno = cmdp->addr1.lno;
	sp->lno = lno;
	if ((cmd == CHANGE || cmd == INSERT) && lno != 0)
		--lno;

	/*
	 * !!!
	 * If the file isn't empty, cut changes into the unnamed buffer.
	 */
	if (cmd == CHANGE && cmdp->addr1.lno != 0 &&
	    (cut(sp, NULL, &cmdp->addr1, &cmdp->addr2, CUT_LINEMODE) ||
	    del(sp, &cmdp->addr1, &cmdp->addr2, 1)))
		return (1);

	/*
	 * !!!
	 * Anything that was left after the command separator becomes part
	 * of the inserted text.  Apparently, it was common usage to enter:
	 *
	 *	:g/pattern/append|stuff1
	 *
	 * and append the line of text "stuff1" to the lines containing the
	 * pattern.  It was also historically legal to enter:
	 *
	 *	:append|stuff1
	 *	stuff2
	 *	.
	 *
	 * and the text on the ex command line would be appended as well as
	 * the text inserted after it.  There was an historic bug however,
	 * that the user had to enter *two* terminating lines (the '.' lines)
	 * to terminate text input mode, in this case.  This whole thing
	 * could be taken too far, however.  Entering:
	 *
	 *	:append|stuff1\
	 *	stuff2
	 *	stuff3
	 *	.
	 *
	 * i.e. mixing and matching the forms confused the historic vi, and,
	 * not only did it take two terminating lines to terminate text input
	 * mode, but the trailing backslashes were retained on the input.  We
	 * match historic practice except that we discard the backslashes.
	 *
	 * Input lines specified on the ex command line are separated by
	 * <newline>s.  If there is a trailing delimiter, an empty line is
	 * inserted.  There may also be a leading delimiter, which is ignored
	 * unless it's also a trailing delimiter.  It is possible to encounter
	 * a termination line, i.e. a single '.', in a global command, but not
	 * necessary if the text insert command was the last of the global
	 * commands.
	 */
	if (cmdp->save_cmdlen != 0) {
		for (p = cmdp->save_cmd,
		    len = cmdp->save_cmdlen; len > 0; p = t) {
			for (t = p; len > 0 && t[0] != '\n'; ++t, --len);
			if (t != p || len == 0) {
				if (F_ISSET(sp, SC_EX_GLOBAL) &&
				    t - p == 1 && p[0] == '.') {
					++t;
					if (len > 0)
						--len;
					break;
				}
				if (db_append(sp, 1, lno++, p, t - p))
					return (1);
			}
			if (len != 0) {
				++t;
				if (--len == 0 &&
				    db_append(sp, 1, lno++, NULL, 0))
					return (1);
			}
		}
		/*
		 * If there's any remaining text, we're in a global, and
		 * there's more command to parse.
		 *
		 * !!!
		 * We depend on the fact that non-global commands will eat the
		 * rest of the command line as text input, and before getting
		 * any text input from the user.  Otherwise, we'd have to save
		 * off the command text before or during the call to the text
		 * input function below.
		 */
		if (len != 0)
			cmdp->save_cmd = t;
		cmdp->save_cmdlen = len;
	}

	if (F_ISSET(sp, SC_EX_GLOBAL)) {
		if ((sp->lno = lno) == 0 && db_exist(sp, 1))
			sp->lno = 1;
		return (0);
	}

	/*
	 * If not in a global command, read from the terminal.
	 *
	 * If this code is called by vi, we want to reset the terminal and use
	 * ex's line get routine.  It actually works fine if we use vi's get
	 * routine, but it doesn't look as nice.  Maybe if we had a separate
	 * window or something, but getting a line at a time looks awkward.
	 * However, depending on the screen that we're using, that may not
	 * be possible.
	 */
	if (F_ISSET(sp, SC_VI)) {
		if (gp->scr_screen(sp, SC_EX)) {
			ex_wemsg(sp, cmdp->cmd->name, EXM_NOCANON);
			return (1);
		}

		/* If we're still in the vi screen, move out explicitly. */
		need_newline = !F_ISSET(sp, SC_SCR_EXWROTE);
		F_SET(sp, SC_SCR_EX | SC_SCR_EXWROTE);
		if (need_newline)
			(void)ex_puts(sp, "\n");

		/*
		 * !!!
		 * Users of historical versions of vi sometimes get confused
		 * when they enter append mode, and can't seem to get out of
		 * it.  Give them an informational message.
		 */
		(void)ex_puts(sp,
		    msg_cat(sp, "273|Entering ex input mode.", NULL));
		(void)ex_puts(sp, "\n");
		(void)ex_fflush(sp);
	}

	/*
	 * Set input flags; the ! flag turns off autoindent for append,
	 * change and insert.
	 */
	LF_INIT(TXT_DOTTERM | TXT_NUMBER);
	if (!FL_ISSET(cmdp->iflags, E_C_FORCE) && O_ISSET(sp, O_AUTOINDENT))
		LF_SET(TXT_AUTOINDENT);
	if (O_ISSET(sp, O_BEAUTIFY))
		LF_SET(TXT_BEAUTIFY);

	/*
	 * This code can't use the common screen TEXTH structure (sp->tiq),
	 * as it may already be in use, e.g. ":append|s/abc/ABC/" would fail
	 * as we are only halfway through the text when the append code fires.
	 * Use a local structure instead.  (The ex code would have to use a
	 * local structure except that we're guaranteed to have finished with
	 * any remaining characters in the common TEXTH structure when they
	 * were inserted into the file, above.)
	 */
	TAILQ_INIT(tiq);

	if (ex_txt(sp, tiq, 0, flags))
		return (1);

	TAILQ_FOREACH(tp, tiq, q) {
		if (db_append(sp, 1, lno++, tp->lb, tp->len))
			return (1);
		++cnt;
	}

	/*
	 * Set sp->lno to the final line number value (correcting for a
	 * possible 0 value) as that's historically correct for the final
	 * line value, whether or not the user entered any text.
	 */
	if ((sp->lno = lno) == 0 && db_exist(sp, 1))
		sp->lno = 1;

	return (0);
}
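The save_cmd parsing loop above is dense. Restated as a standalone sketch (leaving out the SC_EX_GLOBAL '.' terminator, with types and names of our own): segments are split on '\n', empty segments from leading or doubled separators are skipped, and a trailing separator appends one empty line.

#include <stddef.h>

typedef int (*append_fn)(const char *text, size_t len);

/*
 * append_saved_text --
 *	Split p[0..len) on '\n': append each non-empty segment, skip
 *	empty segments, and append one empty line for a trailing
 *	separator.  Returns 1 on error, as the nvi routines do.
 */
static int
append_saved_text(const char *p, size_t len, append_fn append)
{
	const char *t;

	for (; len > 0; p = t) {
		for (t = p; len > 0 && *t != '\n'; ++t, --len)
			;
		if (t != p && append(p, (size_t)(t - p)))
			return (1);
		if (len != 0) {			/* consume the separator */
			++t;
			if (--len == 0 && append(NULL, 0))
				return (1);	/* trailing '\n' */
		}
	}
	return (0);
}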
Example #21
/*
 * __wt_bm_read --
 *	Map or read address cookie referenced block into a buffer.
 */
int
__wt_bm_read(WT_BM *bm, WT_SESSION_IMPL *session,
    WT_ITEM *buf, const uint8_t *addr, uint32_t addr_size)
{
	WT_BLOCK *block;
	off_t offset;
	uint32_t size, cksum;
	int mapped;

	WT_UNUSED(addr_size);
	block = bm->block;

	/* Crack the cookie. */
	WT_RET(__wt_block_buffer_to_addr(block, addr, &offset, &size, &cksum));

	/*
	 * Clear buffers previously used for mapped memory, we may be forced
	 * to read into this buffer.
	 */
	if (F_ISSET(buf, WT_ITEM_MAPPED))
		__wt_buf_free(session, buf);

	/*
	 * If we're going to be able to return mapped memory and the buffer
	 * has allocated memory, discard it.
	 */
	mapped = bm->map != NULL && offset + size <= (off_t)bm->maplen;
	if (buf->mem != NULL && mapped)
		__wt_buf_free(session, buf);

	/* Map the block if it's possible. */
	if (mapped) {
		buf->mem = (uint8_t *)bm->map + offset;
		buf->memsize = size;
		buf->data = buf->mem;
		buf->size = size;
		F_SET(buf, WT_ITEM_MAPPED);

		WT_RET(__wt_mmap_preload(session, buf->mem, buf->size));

		WT_CSTAT_INCR(session, block_map_read);
		WT_CSTAT_INCRV(session, block_byte_map_read, size);
		return (0);
	}

	/* Read the block. */
	WT_RET(__wt_block_read_off(session, block, buf, offset, size, cksum));

#ifdef HAVE_POSIX_FADVISE
	/* Optionally discard blocks from the system's buffer cache. */
	if (block->os_cache_max != 0 &&
	    (block->os_cache += size) > block->os_cache_max) {
		WT_DECL_RET;

		block->os_cache = 0;
		if ((ret = posix_fadvise(block->fh->fd,
		    (off_t)0, (off_t)0, POSIX_FADV_DONTNEED)) != 0)
			WT_RET_MSG(
			    session, ret, "%s: posix_fadvise", block->name);
	}
#endif
	return (0);
}
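The posix_fadvise call above caps how much of the file lingers in the OS buffer cache. A minimal sketch of the same trick; the counter handling here is ours, for illustration:

#include <fcntl.h>
#include <stddef.h>

/*
 * trim_os_cache --
 *	After roughly "max" bytes have been read through the descriptor,
 *	ask the kernel to drop the file's pages from the buffer cache.
 *	POSIX_FADV_DONTNEED is advisory only.
 */
static int
trim_os_cache(int fd, size_t *counter, size_t nread, size_t max)
{
	if (max == 0 || (*counter += nread) <= max)
		return (0);
	*counter = 0;
	return (posix_fadvise(fd, (off_t)0, (off_t)0, POSIX_FADV_DONTNEED));
}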
Example #22
/*
 * mark --
 *	Mark commands.
 */
static int
mark(SCR *sp, VICMD *vp, int getmark, enum which cmd)
{
	dir_t dir;
	MARK m;
	size_t len;

	if (getmark && mark_get(sp, vp->character, &vp->m_stop, M_BERR))
		return (1);

	/*
	 * !!!
	 * Historically, BQMARKS for character positions that no longer
	 * existed acted as FQMARKS.
	 *
	 * FQMARKS move to the first non-blank.
	 */
	switch (cmd) {
	case BQMARK:
		if (db_get(sp, vp->m_stop.lno, DBG_FATAL, NULL, &len))
			return (1);
		if (vp->m_stop.cno < len ||
		    (vp->m_stop.cno == len && len == 0))
			break;

		if (ISMOTION(vp))
			F_SET(vp, VM_LMODE);
		cmd = FQMARK;
		/* FALLTHROUGH */
	case FQMARK:
		vp->m_stop.cno = 0;
		if (nonblank(sp, vp->m_stop.lno, &vp->m_stop.cno))
			return (1);
		break;
	default:
		abort();
	}

	/* Non-motion commands move to the end of the range. */
	if (!ISMOTION(vp)) {
		vp->m_final = vp->m_stop;
		return (0);
	}

	/*
	 * !!!
	 * If a motion component to a BQMARK, the cursor has to move.
	 */
	if (cmd == BQMARK &&
	    vp->m_stop.lno == vp->m_start.lno &&
	    vp->m_stop.cno == vp->m_start.cno) {
		v_nomove(sp);
		return (1);
	}

	/*
	 * If the motion is in the reverse direction, switch the start and
	 * stop MARK's so that it's in a forward direction.  (There's no
	 * reason for this other than to make the tests below easier.  The
	 * code in vi.c:vi() would have done the switch.)  Both forward
	 * and backward motions can happen for any kind of search command.
	 */
	if (vp->m_start.lno > vp->m_stop.lno ||
	    (vp->m_start.lno == vp->m_stop.lno &&
	    vp->m_start.cno > vp->m_stop.cno)) {
		m = vp->m_start;
		vp->m_start = vp->m_stop;
		vp->m_stop = m;
		dir = BACKWARD;
	} else
		dir = FORWARD;

	/*
	 * Yank cursor motion, when associated with marks as motion commands,
	 * historically behaved as follows:
	 *
	 * ` motion			' motion
	 *		Line change?		Line change?
	 *		Y	N		Y	N
	 *	      --------------	      ---------------
	 * FORWARD:  |	NM	NM	      | NM	NM
	 *	     |			      |
	 * BACKWARD: |	M	M	      | M	NM(1)
	 *
	 * where NM means the cursor didn't move, and M means the cursor
	 * moved to the mark.
	 *
	 * As the cursor was usually moved for yank commands associated
	 * with backward motions, this implementation regularizes it by
	 * changing the NM at position (1) to be an M.  This makes mark
	 * motions match search motions, which is probably A Good Thing.
	 *
	 * Delete cursor motion was always to the start of the text region,
	 * regardless.  Ignore other motion commands.
	 */
#ifdef HISTORICAL_PRACTICE
	if (ISCMD(vp->rkp, 'y')) {
		if ((cmd == BQMARK ||
		    (cmd == FQMARK && vp->m_start.lno != vp->m_stop.lno)) &&
		    (vp->m_start.lno > vp->m_stop.lno ||
		    (vp->m_start.lno == vp->m_stop.lno &&
		    vp->m_start.cno > vp->m_stop.cno)))
			vp->m_final = vp->m_stop;
	} else if (ISCMD(vp->rkp, 'd'))
		if (vp->m_start.lno > vp->m_stop.lno ||
		    (vp->m_start.lno == vp->m_stop.lno &&
		    vp->m_start.cno > vp->m_stop.cno))
			vp->m_final = vp->m_stop;
#else
	vp->m_final = vp->m_start;
#endif

	/*
	 * Forward marks are always line oriented, and it's set in the
	 * vcmd.c table.
	 */
	if (cmd == FQMARK)
		return (0);

	/*
	 * BQMARKs moving backward and starting at column 0, and ones moving
	 * forward and ending at column 0 are corrected to the last column of
	 * the previous line.  Otherwise, adjust the starting/ending point to
	 * the character before the current one (this is safe because we know
	 * the search had to move to succeed).
	 *
	 * Mark motions become line mode operations if they start at the first
	 * nonblank and end at column 0 of another line.
	 */
	if (vp->m_start.lno < vp->m_stop.lno && vp->m_stop.cno == 0) {
		if (db_get(sp, --vp->m_stop.lno, DBG_FATAL, NULL, &len))
			return (1);
		vp->m_stop.cno = len ? len - 1 : 0;
		len = 0;
		if (nonblank(sp, vp->m_start.lno, &len))
			return (1);
		if (vp->m_start.cno <= len)
			F_SET(vp, VM_LMODE);
	} else
		--vp->m_stop.cno;

	return (0);
}
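The line/column ordering test appears three times in mark() above. Factored into a predicate, with types of our own for illustration:

#include <stdbool.h>
#include <stddef.h>

struct pos {
	unsigned long lno;		/* line */
	size_t cno;			/* column */
};

/* Return true if "a" sorts after "b": later line, or same line and later column. */
static bool
pos_after(const struct pos *a, const struct pos *b)
{
	return (a->lno > b->lno ||
	    (a->lno == b->lno && a->cno > b->cno));
}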
Example #23
/* maybe this could be simpler
 *
 * DB3 behaves differently from DB1:
 *
 * if lno != 0, just go to lno and put the new line after it
 * if lno == 0, then if there are any records, put the new line in front
 *		of the first one; otherwise just append to the end, thus
 *		creating the first line
 */
static int
append(SCR *sp, db_recno_t lno, const CHAR_T *p, size_t len, lnop_t op, int update)
{
	DBT data, key;
	DBC *dbcp_put;
	EXF *ep;
	const char *fp;
	size_t flen;
	int rval;

	/* Check for no underlying file. */
	if ((ep = sp->ep) == NULL) {
		ex_emsg(sp, NULL, EXM_NOFILEYET);
		return (1);
	}
	if (ep->l_win && ep->l_win != sp->wp) {
		ex_emsg(sp, NULL, EXM_LOCKED);
		return 1;
	}

	/* Log before change. */
	log_line(sp, lno + 1, LOG_LINE_APPEND_B);

	/* Update file. */
	memset(&key, 0, sizeof(key));
	key.data = &lno;
	key.size = sizeof(lno);
	memset(&data, 0, sizeof(data));

	if ((sp->db_error = ep->db->cursor(ep->db, NULL, &dbcp_put, 0)) != 0)
	    return 1;

	INT2FILE(sp, p, len, fp, flen);

	if (lno != 0) {
	    if ((sp->db_error = dbcp_put->c_get(dbcp_put, &key, &data, DB_SET)) != 0) 
		goto err2;

	    data.data = __UNCONST(fp);
	    data.size = flen;
	    if ((sp->db_error = dbcp_put->c_put(dbcp_put, &key, &data, DB_AFTER)) != 0) {
err2:
		(void)dbcp_put->c_close(dbcp_put);
		msgq(sp, M_DBERR, 
			(op == LINE_APPEND) 
			    ? "004|unable to append to line %lu" 
			    : "005|unable to insert at line %lu", 
			(u_long)lno);
		return (1);
	    }
	} else {
	    if ((sp->db_error = dbcp_put->c_get(dbcp_put, &key, &data, DB_FIRST)) != 0) {
		if (sp->db_error != DB_NOTFOUND)
		    goto err2;

		data.data = __UNCONST(fp);
		data.size = flen;
		if ((sp->db_error = ep->db->put(ep->db, NULL, &key, &data, DB_APPEND)) != 0) {
		    goto err2;
		}
	    } else {
		key.data = &lno;
		key.size = sizeof(lno);
		data.data = __UNCONST(fp);
		data.size = flen;
		if ((sp->db_error = dbcp_put->c_put(dbcp_put, &key, &data, DB_BEFORE)) != 0) {
		    goto err2;
		}
	    }
	}

	(void)dbcp_put->c_close(dbcp_put);

	/* Flush the cache, update line count, before screen update. */
	update_cache(sp, LINE_INSERT, lno);

	/* File now dirty. */
	if (F_ISSET(ep, F_FIRSTMODIFY))
		(void)rcv_init(sp);
	F_SET(ep, F_MODIFIED);

	/* Log after change. */
	log_line(sp, lno + 1, LOG_LINE_APPEND_F);

	/* Update marks, @ and global commands. */
	rval = line_insdel(sp, LINE_INSERT, lno + 1);

	/*
	 * Update screen.
	 *
	 * comment copied from db_append
	 * XXX
	 * Nasty hack.  If multiple lines are input by the user, they aren't
	 * committed until an <ESC> is entered.  The problem is the screen was
	 * updated/scrolled as each line was entered.  So, when this routine
	 * is called to copy the new lines from the cut buffer into the file,
	 * it has to know not to update the screen again.
	 */
	return (scr_update(sp, lno + 1, LINE_INSERT, update) || rval);
}
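A minimal sketch of the cursor positioning used in the lno != 0 branch above, against an already-open DB_RECNO database and using the same Berkeley DB 3-style calls the code relies on (db->cursor, c_get with DB_SET, c_put with DB_AFTER); error handling is reduced to the return code:

#include <stdint.h>
#include <string.h>
#include <db.h>

/*
 * recno_insert_after --
 *	Position a cursor on record "lno" with DB_SET, then insert the
 *	new record immediately after it with DB_AFTER.  Returns 0 or a
 *	Berkeley DB error code.
 */
static int
recno_insert_after(DB *db, db_recno_t lno, void *buf, uint32_t len)
{
	DBC *dbc;
	DBT key, data;
	int ret;

	memset(&key, 0, sizeof(key));
	key.data = &lno;
	key.size = sizeof(lno);
	memset(&data, 0, sizeof(data));

	if ((ret = db->cursor(db, NULL, &dbc, 0)) != 0)
		return (ret);
	if ((ret = dbc->c_get(dbc, &key, &data, DB_SET)) == 0) {
		data.data = buf;
		data.size = len;
		ret = dbc->c_put(dbc, &key, &data, DB_AFTER);
	}
	(void)dbc->c_close(dbc);
	return (ret);
}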
Example #24
/*
 * __ckpt_process --
 *	Process the list of checkpoints.
 */
static int
__ckpt_process(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_CKPT *ckptbase)
{
	WT_BLOCK_CKPT *a, *b, *ci;
	WT_CKPT *ckpt, *next_ckpt;
	WT_DECL_ITEM(tmp);
	WT_DECL_RET;
	uint64_t ckpt_size;
	bool deleting, fatal, locked;

	ci = &block->live;
	fatal = locked = false;

#ifdef HAVE_DIAGNOSTIC
	WT_RET(__ckpt_verify(session, ckptbase));
#endif

	/*
	 * Checkpoints are a two-step process: first, write a new checkpoint to
	 * disk (including all the new extent lists for modified checkpoints
	 * and the live system).  As part of this, create a list of file blocks
	 * newly available for reallocation, based on checkpoints being deleted.
	 * We then return the locations of the new checkpoint information to our
	 * caller.  Our caller has to write that information into some kind of
	 * stable storage, and once that's done, we can actually allocate from
	 * that list of newly available file blocks.  (We can't allocate from
	 * that list immediately because the allocation might happen before our
	 * caller saves the new checkpoint information, and if we crashed before
	 * the new checkpoint location was saved, we'd have overwritten blocks
	 * still referenced by checkpoints in the system.)  In summary, there is
	 * a second step: after our caller saves the checkpoint information, we
	 * are called to add the newly available blocks into the live system's
	 * available list.
	 *
	 * This function is the first step, the second step is in the resolve
	 * function.
	 *
	 * If we're called to checkpoint the same file twice (without the second
	 * resolution step), or re-entered for any reason, it's an error in our
	 * caller, and our choices are all bad: leak blocks or potentially crash
	 * with our caller not yet having saved previous checkpoint information
	 * to stable storage.
	 */
	__wt_spin_lock(session, &block->live_lock);
	switch (block->ckpt_state) {
	case WT_CKPT_INPROGRESS:
		block->ckpt_state = WT_CKPT_PANIC_ON_FAILURE;
		break;
	case WT_CKPT_NONE:
	case WT_CKPT_PANIC_ON_FAILURE:
		__wt_err(session, EINVAL,
		    "%s: an unexpected checkpoint attempt: the checkpoint "
		    "was never started or has already completed",
		    block->name);
		ret = __wt_block_panic(session);
		break;
	case WT_CKPT_SALVAGE:
		/* Salvage doesn't use the standard checkpoint APIs. */
		break;
	}
	__wt_spin_unlock(session, &block->live_lock);
	WT_RET(ret);

	/*
	 * Extents newly available as a result of deleting previous checkpoints
	 * are added to a list of extents.  The list should be empty, but as
	 * described above, there is no "free the checkpoint information" call
	 * into the block manager; if there was an error in an upper level that
	 * resulted in some previous checkpoint never being resolved, the list
	 * may not be empty.  We should have caught that with the "checkpoint
	 * in progress" test, but it doesn't cost us anything to be cautious.
	 *
	 * We free the checkpoint's allocation and discard extent lists as part
	 * of the resolution step, not because they're needed at that time, but
	 * because it's potentially a lot of work, and waiting allows the btree
	 * layer to continue eviction sooner.  As for the checkpoint-available
	 * list, make sure they get cleaned out.
	 */
	__wt_block_extlist_free(session, &ci->ckpt_avail);
	WT_RET(__wt_block_extlist_init(
	    session, &ci->ckpt_avail, "live", "ckpt_avail", true));
	__wt_block_extlist_free(session, &ci->ckpt_alloc);
	__wt_block_extlist_free(session, &ci->ckpt_discard);

	/*
	 * To delete a checkpoint, we'll need checkpoint information for it and
	 * the subsequent checkpoint into which it gets rolled; read them from
	 * disk before we lock things down.
	 */
	deleting = false;
	WT_CKPT_FOREACH(ckptbase, ckpt) {
		if (F_ISSET(ckpt, WT_CKPT_FAKE) ||
		    !F_ISSET(ckpt, WT_CKPT_DELETE))
			continue;
		deleting = true;

		/*
		 * Read the checkpoint and next checkpoint extent lists if we
		 * haven't already read them (we may have already read these
		 * extent blocks if there is more than one deleted checkpoint).
		 */
		if (ckpt->bpriv == NULL)
			WT_ERR(__ckpt_extlist_read(session, block, ckpt));

		for (next_ckpt = ckpt + 1;; ++next_ckpt)
			if (!F_ISSET(next_ckpt, WT_CKPT_FAKE))
				break;

		/*
		 * The "next" checkpoint may be the live tree which has no
		 * extent blocks to read.
		 */
		if (next_ckpt->bpriv == NULL &&
		    !F_ISSET(next_ckpt, WT_CKPT_ADD))
			WT_ERR(__ckpt_extlist_read(session, block, next_ckpt));
	}

	/*
	 * Failures are now fatal: we can't currently back out the merge of any
	 * deleted checkpoint extent lists into the live system's extent lists,
	 * so continuing after error would leave the live system's extent lists
	 * corrupted for any subsequent checkpoint (and potentially, should a
	 * subsequent checkpoint succeed, for recovery).
	 */
	fatal = true;

	/*
	 * Hold a lock so the live extent lists and the file size can't change
	 * underneath us.  I suspect we'll tighten this if checkpoints take too
	 * much time away from real work: we read the historic checkpoint
	 * information without a lock, but we could also merge and re-write the
	 * deleted and merged checkpoint information without a lock, except for
	 * the final merge of ranges into the live tree.
	 */
	__wt_spin_lock(session, &block->live_lock);
	locked = true;

	/*
	 * We've allocated our last page, update the checkpoint size.  We need
	 * to calculate the live system's checkpoint size before merging
	 * checkpoint allocation and discard information from the checkpoints
	 * we're deleting, those operations change the underlying byte counts.
	 */
	ckpt_size = ci->ckpt_size;
	ckpt_size += ci->alloc.bytes;
	ckpt_size -= ci->discard.bytes;

	/* Skip the additional processing if we aren't deleting checkpoints. */
	if (!deleting)
		goto live_update;

	/*
	 * Delete any no-longer-needed checkpoints: we do this first as it frees
	 * blocks to the live lists, and the freed blocks will then be included
	 * when writing the live extent lists.
	 */
	WT_CKPT_FOREACH(ckptbase, ckpt) {
		if (F_ISSET(ckpt, WT_CKPT_FAKE) ||
		    !F_ISSET(ckpt, WT_CKPT_DELETE))
			continue;

		if (WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT)) {
			if (tmp == NULL)
				WT_ERR(__wt_scr_alloc(session, 0, &tmp));
			WT_ERR(__ckpt_string(
			    session, block, ckpt->raw.data, tmp));
			__wt_verbose(session, WT_VERB_CHECKPOINT,
			    "%s: delete-checkpoint: %s: %s",
			    block->name, ckpt->name, (const char *)tmp->data);
		}
		/*
		 * Find the checkpoint into which we'll roll this checkpoint's
		 * blocks: it's the next real checkpoint in the list, and it
		 * better have been read in (if it's not the add slot).
		 */
		for (next_ckpt = ckpt + 1;; ++next_ckpt)
			if (!F_ISSET(next_ckpt, WT_CKPT_FAKE))
				break;

		/*
		 * Set the from/to checkpoint structures, where the "to" value
		 * may be the live tree.
		 */
		a = ckpt->bpriv;
		if (F_ISSET(next_ckpt, WT_CKPT_ADD))
			b = &block->live;
		else
			b = next_ckpt->bpriv;

		/*
		 * Free the root page: there's nothing special about this free,
		 * the root page is allocated using normal rules, that is, it
		 * may have been taken from the avail list, and was entered on
		 * the live system's alloc list at that time.  We free it into
		 * the checkpoint's discard list, however, not the live system's
		 * list because it appears on the checkpoint's alloc list and so
		 * must be paired in the checkpoint.
		 */
		if (a->root_offset != WT_BLOCK_INVALID_OFFSET)
			WT_ERR(__wt_block_insert_ext(session, block,
			    &a->discard, a->root_offset, a->root_size));

		/*
		 * Free the blocks used to hold the "from" checkpoint's extent
		 * lists, including the avail list.
		 */
		WT_ERR(__ckpt_extlist_fblocks(session, block, &a->alloc));
		WT_ERR(__ckpt_extlist_fblocks(session, block, &a->avail));
		WT_ERR(__ckpt_extlist_fblocks(session, block, &a->discard));

		/*
		 * Roll the "from" alloc and discard extent lists into the "to"
		 * checkpoint's lists.
		 */
		if (a->alloc.entries != 0)
			WT_ERR(__wt_block_extlist_merge(
			    session, block, &a->alloc, &b->alloc));
		if (a->discard.entries != 0)
			WT_ERR(__wt_block_extlist_merge(
			    session, block, &a->discard, &b->discard));

		/*
		 * If the "to" checkpoint is also being deleted, we're done with
		 * it, it's merged into some other checkpoint in the next loop.
		 * This means the extent lists may aggregate over a number of
		 * checkpoints, but that's OK, they're disjoint sets of ranges.
		 */
		if (F_ISSET(next_ckpt, WT_CKPT_DELETE))
			continue;

		/*
		 * Find blocks for re-use: wherever the "to" checkpoint's
		 * allocate and discard lists overlap, move the range to
		 * the live system's checkpoint available list.
		 */
		WT_ERR(__wt_block_extlist_overlap(session, block, b));

		/*
		 * If we're updating the live system's information, we're done.
		 */
		if (F_ISSET(next_ckpt, WT_CKPT_ADD))
			continue;

		/*
		 * We have to write the "to" checkpoint's extent lists out in
		 * new blocks, and update its cookie.
		 *
		 * Free the blocks used to hold the "to" checkpoint's extent
		 * lists; don't include the avail list, it's not changing.
		 */
		WT_ERR(__ckpt_extlist_fblocks(session, block, &b->alloc));
		WT_ERR(__ckpt_extlist_fblocks(session, block, &b->discard));

		F_SET(next_ckpt, WT_CKPT_UPDATE);
	}

	/* Update checkpoints marked for update. */
	WT_CKPT_FOREACH(ckptbase, ckpt)
		if (F_ISSET(ckpt, WT_CKPT_UPDATE))
			WT_ERR(__ckpt_update(
			    session, block, ckpt, ckpt->bpriv, false));

live_update:
	/* Truncate the file if that's possible. */
	WT_ERR(__wt_block_extlist_truncate(session, block, &ci->avail));

	/* Update the final, added checkpoint based on the live system. */
	WT_CKPT_FOREACH(ckptbase, ckpt)
		if (F_ISSET(ckpt, WT_CKPT_ADD)) {
			/*
			 * !!!
			 * Our caller wants the final checkpoint size.  Setting
			 * the size here violates layering, but the alternative
			 * is a call for the btree layer to crack the checkpoint
			 * cookie into its components, and that's a fair amount
			 * of work.
			 */
			ckpt->ckpt_size = ckpt_size;

			/*
			 * Set the rolling checkpoint size for the live system.
			 * The current size includes the current checkpoint's
			 * root page size (root pages are on the checkpoint's
			 * block allocation list as root pages are allocated
			 * with the usual block allocation functions). That's
			 * correct, but we don't want to include it in the size
			 * for the next checkpoint.
			 */
			ckpt_size -= ci->root_size;

			/*
			 * Additionally, we had a bug for a while where the live
			 * checkpoint size grew without bound. We can't sanity
			 * check the value, that would require walking the tree
			 * as part of the checkpoint. Bound any bug at the size
			 * of the file.
			 * It isn't practical to assert that the value is within
			 * bounds since databases created with older versions
			 * of WiredTiger (2.8.0) would likely see an error.
			 */
			ci->ckpt_size =
			    WT_MIN(ckpt_size, (uint64_t)block->size);

			WT_ERR(__ckpt_update(session, block, ckpt, ci, true));
		}

	/*
	 * Reset the live system's alloc and discard extent lists, leave the
	 * avail list alone.  This includes freeing a lot of extents, so do it
	 * outside of the system's lock by copying and resetting the original,
	 * then doing the work later.
	 */
	ci->ckpt_alloc = ci->alloc;
	WT_ERR(__wt_block_extlist_init(
	    session, &ci->alloc, "live", "alloc", false));
	ci->ckpt_discard = ci->discard;
	WT_ERR(__wt_block_extlist_init(
	    session, &ci->discard, "live", "discard", false));

#ifdef HAVE_DIAGNOSTIC
	/*
	 * The first checkpoint in the system should always have an empty
	 * discard list.  If we've read that checkpoint and/or created it,
	 * check.
	 */
	WT_CKPT_FOREACH(ckptbase, ckpt)
		if (!F_ISSET(ckpt, WT_CKPT_DELETE))
			break;
	if ((a = ckpt->bpriv) == NULL)
		a = &block->live;
	if (a->discard.entries != 0)
		WT_ERR_MSG(session, WT_ERROR,
		    "first checkpoint incorrectly has blocks on the discard "
		    "list");
#endif

err:	if (ret != 0 && fatal) {
		__wt_err(session, ret,
		    "%s: fatal checkpoint failure", block->name);
		ret = __wt_block_panic(session);
	}

	if (locked)
		__wt_spin_unlock(session, &block->live_lock);

	/* Discard any checkpoint information we loaded. */
	WT_CKPT_FOREACH(ckptbase, ckpt)
		if ((ci = ckpt->bpriv) != NULL)
			__wt_block_ckpt_destroy(session, ci);

	__wt_scr_free(session, &tmp);
	return (ret);
}
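The ckpt_state guard at the top of __ckpt_process reduces to a small state machine: the caller sets INPROGRESS, this function moves it to PANIC_ON_FAILURE, and anything else is a caller bug. A sketch with an illustrative enum (the WT_CKPT_SALVAGE case is omitted):

#include <errno.h>

enum ckpt_state { CKPT_NONE, CKPT_INPROGRESS, CKPT_PANIC_ON_FAILURE };

/*
 * ckpt_enter --
 *	A checkpoint may only move INPROGRESS -> PANIC_ON_FAILURE here;
 *	any other starting state means the checkpoint was never started
 *	or was re-entered.
 */
static int
ckpt_enter(enum ckpt_state *state)
{
	switch (*state) {
	case CKPT_INPROGRESS:
		*state = CKPT_PANIC_ON_FAILURE;	/* failures now fatal */
		return (0);
	case CKPT_NONE:
	case CKPT_PANIC_ON_FAILURE:
	default:
		return (EINVAL);
	}
}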
Example #25
/*
 * __snapshot_process --
 *	Process the list of snapshots.
 */
static int
__snapshot_process(
    WT_SESSION_IMPL *session, WT_BLOCK *block, WT_SNAPSHOT *snapbase)
{
	WT_BLOCK_SNAPSHOT *a, *b, *si;
	WT_DECL_ITEM(tmp);
	WT_DECL_RET;
	WT_SNAPSHOT *snap;
	uint64_t snapshot_size;
	int deleting, locked;

	si = &block->live;
	locked = 0;

	/*
	 * We've allocated our last page, update the snapshot size.  We need to
	 * calculate the live system's snapshot size before reading and merging
	 * snapshot allocation and discard information from the snapshots we're
	 * deleting, those operations will change the underlying byte counts.
	 */
	snapshot_size = si->snapshot_size;
	snapshot_size += si->alloc.bytes;
	snapshot_size -= si->discard.bytes;

	/*
	 * Extents that become newly available as a result of deleting previous
	 * snapshots are added to a list of extents.  The list should be empty,
	 * but there's no explicit "free the snapshot information" call into the
	 * block manager; if there was an error in an upper level resulting in
	 * the snapshot never being "resolved", the list might not be empty.
	 *
	 * XXX
	 * This isn't sufficient, actually: we're going to leak all the blocks
	 * that were written as part of the last snapshot because it was never
	 * resolved.
	 */
	__wt_block_extlist_free(session, &si->snapshot_avail);
	WT_RET(__wt_block_extlist_init(
	    session, &si->snapshot_avail, "live", "snapshot_avail"));

	/*
	 * To delete a snapshot, we'll need snapshot information for it, and we
	 * have to read that from the disk.
	 */
	deleting = 0;
	WT_SNAPSHOT_FOREACH(snapbase, snap) {
		/*
		 * To delete a snapshot, we'll need snapshot information for it
		 * and the subsequent snapshot.  The test is tricky, we have to
		 * load the current snapshot's information if it's marked for
		 * deletion, or if it follows a snapshot marked for deletion,
		 * where the boundary cases are the first snapshot in the list
		 * and the last snapshot in the list: if we're deleting the last
		 * snapshot in the list, there's no next snapshot, the snapshot
		 * will be merged into the live tree.
		 */
		if (!F_ISSET(snap, WT_SNAP_DELETE) &&
		    (snap == snapbase ||
		    F_ISSET(snap, WT_SNAP_ADD) ||
		    !F_ISSET(snap - 1, WT_SNAP_DELETE)))
			continue;
		deleting = 1;

		/*
		 * Allocate a snapshot structure, crack the cookie and read the
		 * snapshot's extent lists.
		 *
		 * Ignore the avail list: snapshot avail lists are only useful
		 * if we are rolling forward from the particular snapshot and
		 * they represent our best understanding of what blocks can be
		 * allocated.  If we are not operating on the live snapshot,
		 * subsequent snapshots might have allocated those blocks, and
		 * the avail list is useless.  We don't discard it, because it
		 * is useful as part of verification, but we don't re-write it
		 * either.
		 */
		WT_ERR(__wt_calloc(
		    session, 1, sizeof(WT_BLOCK_SNAPSHOT), &snap->bpriv));
		si = snap->bpriv;
		WT_ERR(__wt_block_snap_init(session, block, si, snap->name, 0));
		WT_ERR(__wt_block_buffer_to_snapshot(
		    session, block, snap->raw.data, si));
		WT_ERR(__wt_block_extlist_read(session, block, &si->alloc));
		WT_ERR(__wt_block_extlist_read(session, block, &si->discard));
	}

	/*
	 * Hold a lock so the live extent lists and the file size can't change
	 * underneath us.  I suspect we'll tighten this if snapshots take too
	 * much time away from real work: we read historic snapshot information
	 * without a lock, but we could also merge and re-write the delete
	 * snapshot information without a lock, except for ranges merged into
	 * the live tree.
	 */
	__wt_spin_lock(session, &block->live_lock);
	locked = 1;

	/* Skip the additional processing if we aren't deleting snapshots. */
	if (!deleting)
		goto live_update;

	/*
	 * Delete any no-longer-needed snapshots: we do this first as it frees
	 * blocks to the live lists, and the freed blocks will then be included
	 * when writing the live extent lists.
	 */
	WT_SNAPSHOT_FOREACH(snapbase, snap) {
		if (!F_ISSET(snap, WT_SNAP_DELETE))
			continue;

		if (WT_VERBOSE_ISSET(session, snapshot)) {
			if (tmp == NULL)
				WT_ERR(__wt_scr_alloc(session, 0, &tmp));
			WT_ERR(__snapshot_string(
			    session, block, snap->raw.data, tmp));
			WT_VERBOSE_ERR(session, snapshot,
			    "%s: delete-snapshot: %s: %s",
			    block->name, snap->name, (char *)tmp->data);
		}

		/*
		 * Set the from/to snapshot structures, where the "to" value
		 * may be the live tree.
		 */
		a = snap->bpriv;
		if (F_ISSET(snap + 1, WT_SNAP_ADD))
			b = &block->live;
		else
			b = (snap + 1)->bpriv;

		/*
		 * Free the root page: there's nothing special about this free,
		 * the root page is allocated using normal rules, that is, it
		 * may have been taken from the avail list, and was entered on
		 * the live system's alloc list at that time.  We free it into
		 * the snapshot's discard list, however, not the live system's
		 * list because it appears on the snapshot's alloc list and so
		 * must be paired in the snapshot.
		 */
		if (a->root_offset != WT_BLOCK_INVALID_OFFSET)
			WT_ERR(__wt_block_insert_ext(session,
			    &a->discard, a->root_offset, a->root_size));

		/*
		 * Free the blocks used to hold the "from" snapshot's extent
		 * lists directly to the live system's avail list, they were
		 * never on any alloc list.   Include the "from" snapshot's
		 * avail list, it's going away.
		 */
		WT_ERR(__snapshot_extlist_fblocks(session, block, &a->alloc));
		WT_ERR(__snapshot_extlist_fblocks(session, block, &a->avail));
		WT_ERR(__snapshot_extlist_fblocks(session, block, &a->discard));

		/*
		 * Roll the "from" alloc and discard extent lists into the "to"
		 * snapshot's lists.
		 */
		if (a->alloc.entries != 0)
			WT_ERR(__wt_block_extlist_merge(
			    session, &a->alloc, &b->alloc));
		if (a->discard.entries != 0)
			WT_ERR(__wt_block_extlist_merge(
			    session, &a->discard, &b->discard));

		/*
		 * If the "to" snapshot is also being deleted, we're done with
		 * it, it's merged into some other snapshot in the next loop.
		 * This means the extent lists may aggregate over a number of
		 * snapshots, but that's OK, they're disjoint sets of ranges.
		 */
		if (F_ISSET(snap + 1, WT_SNAP_DELETE))
			continue;

		/*
		 * Find blocks for re-use: wherever the "to" snapshot's allocate
		 * and discard lists overlap is fair game, move ranges appearing
		 * on both lists to the live snapshot's newly available list.
		 */
		WT_ERR(__wt_block_extlist_overlap(session, block, b));

		/*
		 * If we're updating the live system's information, we're done.
		 */
		if (F_ISSET(snap + 1, WT_SNAP_ADD))
			continue;

		/*
		 * We have to write the "to" snapshot's extent lists out in new
		 * blocks, and update its cookie.
		 *
		 * Free the blocks used to hold the "to" snapshot's extent lists
		 * directly to the live system's avail list, they were never on
		 * any alloc list.  Do not include the "to" snapshot's avail
		 * list, it's not changing.
		 */
		WT_ERR(__snapshot_extlist_fblocks(session, block, &b->alloc));
		WT_ERR(__snapshot_extlist_fblocks(session, block, &b->discard));

		F_SET(snap + 1, WT_SNAP_UPDATE);
	}

	/* Update snapshots marked for update. */
	WT_SNAPSHOT_FOREACH(snapbase, snap)
		if (F_ISSET(snap, WT_SNAP_UPDATE)) {
			WT_ASSERT(session, !F_ISSET(snap, WT_SNAP_ADD));
			WT_ERR(__snapshot_update(
			    session, block, snap, snap->bpriv, 0, 0));
		}

live_update:
	si = &block->live;

	/* Truncate the file if that's possible. */
	WT_ERR(__wt_block_extlist_truncate(session, block, &si->avail));

	/* Update the final, added snapshot based on the live system. */
	WT_SNAPSHOT_FOREACH(snapbase, snap)
		if (F_ISSET(snap, WT_SNAP_ADD)) {
			WT_ERR(__snapshot_update(
			    session, block, snap, si, snapshot_size, 1));

			/*
			 * XXX
			 * Our caller wants two pieces of information: the time
			 * the snapshot was taken and the final snapshot size.
			 * This violates layering but the alternative is a call
			 * for the btree layer to crack the snapshot cookie into
			 * its components, and that's a fair amount of work.
			 * (We could just read the system time in the session
			 * layer when updating the metadata file, but that won't
			 * work for the snapshot size, and so we do both here.)
			 */
			snap->snapshot_size = si->snapshot_size;
			WT_ERR(__wt_epoch(session, &snap->sec, NULL));
		}

	/*
	 * Reset the live system's alloc and discard extent lists, leave the
	 * avail list alone.
	 */
	__wt_block_extlist_free(session, &si->alloc);
	WT_ERR(__wt_block_extlist_init(session, &si->alloc, "live", "alloc"));
	__wt_block_extlist_free(session, &si->discard);
	WT_ERR(
	    __wt_block_extlist_init(session, &si->discard, "live", "discard"));

#ifdef HAVE_DIAGNOSTIC
	/*
	 * The first snapshot in the system should always have an empty discard
	 * list.  If we've read that snapshot and/or created it, check.
	 */
	WT_SNAPSHOT_FOREACH(snapbase, snap)
		if (!F_ISSET(snap, WT_SNAP_DELETE))
			break;
	if ((a = snap->bpriv) == NULL)
		a = &block->live;
	if (a->discard.entries != 0) {
		__wt_errx(session,
		    "snapshot incorrectly has blocks on the discard list");
		WT_ERR(WT_ERROR);
	}
#endif

err:	if (locked)
		__wt_spin_unlock(session, &block->live_lock);

	/* Discard any snapshot information we loaded, we no longer need it. */
	WT_SNAPSHOT_FOREACH(snapbase, snap)
		if ((si = snap->bpriv) != NULL) {
			__wt_block_extlist_free(session, &si->alloc);
			__wt_block_extlist_free(session, &si->avail);
			__wt_block_extlist_free(session, &si->discard);
		}

	__wt_scr_free(&tmp);
	return (ret);
}
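The merge step above depends on a simple invariant: each snapshot's alloc and discard extent lists describe disjoint ranges of the file, so rolling one snapshot's list into another is a plain two-way merge with no overlap handling. The following standalone sketch illustrates that idea; the struct ext type and the ext_merge function are hypothetical illustrations, not WiredTiger's actual extent-list code.

#include <stdint.h>
#include <stdlib.h>

/* Hypothetical extent: the file byte range [off, off + size). */
struct ext {
	int64_t off;
	int64_t size;
};

/*
 * Merge two extent arrays, each sorted by offset and pairwise disjoint,
 * into one newly allocated sorted array.  Because the inputs never
 * overlap, a two-way merge is all that's needed, which is the same
 * property the __wt_block_extlist_merge calls above rely on.
 */
static int
ext_merge(const struct ext *a, size_t an,
    const struct ext *b, size_t bn, struct ext **outp, size_t *outnp)
{
	struct ext *out;
	size_t i, j, k;

	if ((out = malloc((an + bn) * sizeof(*out))) == NULL)
		return (-1);
	for (i = j = k = 0; i < an || j < bn; ++k)
		if (j == bn || (i < an && a[i].off < b[j].off))
			out[k] = a[i++];
		else
			out[k] = b[j++];
	*outp = out;
	*outnp = k;
	return (0);
}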
Example #26
/*
 * __wt_btcur_prev --
 *	Move to the previous record in the tree.
 */
int
__wt_btcur_prev(WT_CURSOR_BTREE *cbt, bool truncating)
{
	WT_CURSOR *cursor;
	WT_DECL_RET;
	WT_PAGE *page;
	WT_SESSION_IMPL *session;
	uint32_t flags;
	bool newpage;

	cursor = &cbt->iface;
	session = (WT_SESSION_IMPL *)cbt->iface.session;

	WT_STAT_CONN_INCR(session, cursor_prev);
	WT_STAT_DATA_INCR(session, cursor_prev);

	F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);

	WT_RET(__cursor_func_init(cbt, false));

	/*
	 * If we aren't already iterating in the right direction, there's
	 * some setup to do.
	 */
	if (!F_ISSET(cbt, WT_CBT_ITERATE_PREV))
		__wt_btcur_iterate_setup(cbt);

	/*
	 * Walk any page we're holding until the underlying call returns not-
	 * found.  Then, move to the previous page, until we reach the start
	 * of the file.
	 */
	flags = WT_READ_PREV | WT_READ_SKIP_INTL;	/* tree walk flags */
	LF_SET(WT_READ_NO_SPLIT);			/* don't try to split */
	if (truncating)
		LF_SET(WT_READ_TRUNCATE);
	for (newpage = false;; newpage = true) {
		page = cbt->ref == NULL ? NULL : cbt->ref->page;

		/*
		 * Column-store pages may have appended entries.  Handle
		 * them separately from the usual cursor code: the append
		 * list is in a simple format.
		 */
		if (newpage && page != NULL && page->type != WT_PAGE_ROW_LEAF &&
		    (cbt->ins_head = WT_COL_APPEND(page)) != NULL)
			F_SET(cbt, WT_CBT_ITERATE_APPEND);

		if (F_ISSET(cbt, WT_CBT_ITERATE_APPEND)) {
			switch (page->type) {
			case WT_PAGE_COL_FIX:
				ret = __cursor_fix_append_prev(cbt, newpage);
				break;
			case WT_PAGE_COL_VAR:
				ret = __cursor_var_append_prev(cbt, newpage);
				break;
			WT_ILLEGAL_VALUE_ERR(session);
			}
			if (ret == 0)
				break;
			F_CLR(cbt, WT_CBT_ITERATE_APPEND);
			if (ret != WT_NOTFOUND)
				break;
			newpage = true;
		}
		if (page != NULL) {
			switch (page->type) {
			case WT_PAGE_COL_FIX:
				ret = __cursor_fix_prev(cbt, newpage);
				break;
			case WT_PAGE_COL_VAR:
				ret = __cursor_var_prev(cbt, newpage);
				break;
			case WT_PAGE_ROW_LEAF:
				ret = __cursor_row_prev(cbt, newpage);
				break;
			WT_ILLEGAL_VALUE_ERR(session);
			}
			if (ret != WT_NOTFOUND)
				break;
		}

		/*
		 * If we saw a lot of deleted records on this page, or we went
		 * all the way through a page and only saw deleted records, try
		 * to evict the page when we release it.  Otherwise repeatedly
		 * deleting from the beginning of a tree can have quadratic
		 * performance.  Take care not to force eviction of pages that
		 * are genuinely empty, in new trees.
		 */
		if (page != NULL &&
		    (cbt->page_deleted_count > WT_BTREE_DELETE_THRESHOLD ||
		    (newpage && cbt->page_deleted_count > 0)))
			__wt_page_evict_soon(session, cbt->ref);
		cbt->page_deleted_count = 0;

		WT_ERR(__wt_tree_walk(session, &cbt->ref, flags));
		WT_ERR_TEST(cbt->ref == NULL, WT_NOTFOUND);
	}
#ifdef HAVE_DIAGNOSTIC
	if (ret == 0)
		WT_ERR(__wt_cursor_key_order_check(session, cbt, false));
#endif
	if (ret == 0)
		F_SET(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT);

err:	if (ret != 0)
		WT_TRET(__cursor_reset(cbt));
	return (ret);
}
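__wt_btcur_prev is the engine-internal half of WT_CURSOR::prev. From the application side, a reverse scan is simply a prev() loop that treats WT_NOTFOUND as end-of-table. A minimal sketch against the public API, assuming a table named table:example with string keys and values:

#include <stdio.h>
#include <wiredtiger.h>

/* Reverse-scan a table of string keys/values until WT_NOTFOUND. */
static int
dump_backwards(WT_SESSION *session)
{
	WT_CURSOR *cursor;
	const char *key, *value;
	int ret;

	if ((ret = session->open_cursor(
	    session, "table:example", NULL, NULL, &cursor)) != 0)
		return (ret);
	while ((ret = cursor->prev(cursor)) == 0) {
		if ((ret = cursor->get_key(cursor, &key)) != 0 ||
		    (ret = cursor->get_value(cursor, &value)) != 0)
			break;
		printf("%s=%s\n", key, value);
	}
	if (ret == WT_NOTFOUND)		/* End of table: success. */
		ret = 0;
	(void)cursor->close(cursor);
	return (ret);
}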
Example #27
/*
 * __wt_lsm_checkpoint_chunk --
 *	Flush a single LSM chunk to disk.
 */
int
__wt_lsm_checkpoint_chunk(WT_SESSION_IMPL *session,
    WT_LSM_TREE *lsm_tree, WT_LSM_CHUNK *chunk)
{
	WT_DECL_RET;
	WT_TXN_ISOLATION saved_isolation;
	bool flush_set;

	flush_set = false;

	/*
	 * If the chunk is already checkpointed, make sure it is also evicted.
	 * Either way, there is no point trying to checkpoint it again.
	 */
	if (F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) &&
	    !F_ISSET(chunk, WT_LSM_CHUNK_STABLE) &&
	    !chunk->evicted) {
		WT_WITH_HANDLE_LIST_LOCK(session,
		    ret = __lsm_discard_handle(session, chunk->uri, NULL));
		if (ret == 0)
			chunk->evicted = 1;
		else if (ret == EBUSY)
			ret = 0;
		else
			WT_RET_MSG(session, ret, "discard handle");
	}
	if (F_ISSET(chunk, WT_LSM_CHUNK_ONDISK)) {
		__wt_verbose(session, WT_VERB_LSM,
		    "LSM worker %s already on disk",
		    chunk->uri);
		return (0);
	}

	/* Stop if a running transaction needs the chunk. */
	WT_RET(__wt_txn_update_oldest(
	    session, WT_TXN_OLDEST_STRICT | WT_TXN_OLDEST_WAIT));
	if (chunk->switch_txn == WT_TXN_NONE ||
	    !__wt_txn_visible_all(session, chunk->switch_txn)) {
		__wt_verbose(session, WT_VERB_LSM,
		    "LSM worker %s: running transaction, return",
		    chunk->uri);
		return (0);
	}

	if (!__wt_atomic_cas8(&chunk->flushing, 0, 1))
		return (0);
	flush_set = true;

	__wt_verbose(session, WT_VERB_LSM, "LSM worker flushing %s",
	    chunk->uri);

	/*
	 * Flush the file before checkpointing: this is the expensive part in
	 * terms of I/O.
	 *
	 * !!!
	 * We can wait here for checkpoints and fsyncs to complete, which can
	 * take a long time.
	 */
	if ((ret = __wt_session_get_btree(
	    session, chunk->uri, NULL, NULL, 0)) == 0) {
		/*
		 * Set read-uncommitted: we have already checked that all of the
		 * updates in this chunk are globally visible, use the cheapest
		 * possible check in reconciliation.
		 */
		saved_isolation = session->txn.isolation;
		session->txn.isolation = WT_ISO_READ_UNCOMMITTED;
		ret = __wt_cache_op(session, WT_SYNC_WRITE_LEAVES);
		session->txn.isolation = saved_isolation;
		WT_TRET(__wt_session_release_btree(session));
	}
	WT_ERR(ret);

	__wt_verbose(session, WT_VERB_LSM, "LSM worker checkpointing %s",
	    chunk->uri);

	/*
	 * Turn on metadata tracking to ensure the checkpoint gets the
	 * necessary handle locks.
	 *
	 * Ensure that we don't race with a running checkpoint: the checkpoint
	 * lock protects against us racing with an application checkpoint in
	 * this chunk.  Don't wait for it, though: checkpoints can take a long
	 * time, and our checkpoint operation should be very quick.
	 */
	WT_ERR(__wt_meta_track_on(session));
	WT_WITH_CHECKPOINT_LOCK(session, ret,
	    WT_WITH_SCHEMA_LOCK(session, ret,
		ret = __wt_schema_worker(
		session, chunk->uri, __wt_checkpoint, NULL, NULL, 0)));
	WT_TRET(__wt_meta_track_off(session, false, ret != 0));
	if (ret != 0)
		WT_ERR_MSG(session, ret, "LSM checkpoint");

	/* Now the file is written, get the chunk size. */
	WT_ERR(__wt_lsm_tree_set_chunk_size(session, chunk));

	/* Update the flush timestamp to help track ongoing progress. */
	__wt_epoch(session, &lsm_tree->last_flush_ts);
	++lsm_tree->chunks_flushed;

	/* Lock the tree, mark the chunk as on disk and update the metadata. */
	__wt_lsm_tree_writelock(session, lsm_tree);
	F_SET(chunk, WT_LSM_CHUNK_ONDISK);
	ret = __wt_lsm_meta_write(session, lsm_tree);
	++lsm_tree->dsk_gen;

	/* Update the throttle time. */
	__wt_lsm_tree_throttle(session, lsm_tree, true);
	__wt_lsm_tree_writeunlock(session, lsm_tree);
	if (ret != 0)
		WT_ERR_MSG(session, ret, "LSM metadata write");

	WT_PUBLISH(chunk->flushing, 0);
	flush_set = false;

	/*
	 * Clear the no-eviction flag so the primary can be evicted and
	 * eventually closed.  Only do this once the checkpoint has succeeded:
	 * otherwise, accessing the leaf page during the checkpoint can trigger
	 * forced eviction.
	 */
	WT_ERR(__wt_session_get_btree(session, chunk->uri, NULL, NULL, 0));
	__wt_btree_evictable(session, true);
	WT_ERR(__wt_session_release_btree(session));

	/* Make sure we aren't pinning a transaction ID. */
	__wt_txn_release_snapshot(session);

	__wt_verbose(session, WT_VERB_LSM, "LSM worker checkpointed %s",
	    chunk->uri);

	/* Schedule a bloom filter create for our newly flushed chunk. */
	if (!FLD_ISSET(lsm_tree->bloom, WT_LSM_BLOOM_OFF))
		WT_ERR(__wt_lsm_manager_push_entry(
		    session, WT_LSM_WORK_BLOOM, 0, lsm_tree));
	else
		WT_ERR(__wt_lsm_manager_push_entry(
		    session, WT_LSM_WORK_MERGE, 0, lsm_tree));

err:	if (flush_set)
		WT_PUBLISH(chunk->flushing, 0);

	return (ret);
}
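The chunk->flushing guard above is a classic single-winner pattern: one atomic compare-and-swap elects exactly one worker to flush, every other worker returns immediately, and the winner clears the flag on both the success and error paths. A standalone sketch of the same pattern using C11 atomics (WiredTiger's __wt_atomic_cas8 and WT_PUBLISH are internal primitives; this is only an approximation of their semantics):

#include <stdatomic.h>

/* 0 means no flush in progress, 1 means a worker is flushing. */
static _Atomic unsigned char flushing;

/*
 * Try to become the single flusher.  Returns nonzero if the caller won
 * the race (and must later call flush_done), zero if another worker is
 * already flushing.
 */
static int
flush_begin(void)
{
	unsigned char expected = 0;

	return (atomic_compare_exchange_strong(&flushing, &expected, 1));
}

/* Publish the end of the flush so other workers may try again. */
static void
flush_done(void)
{
	atomic_store(&flushing, 0);
}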
Example #28
/*
 * __wt_txn_config --
 *	Configure a transaction.
 */
int
__wt_txn_config(WT_SESSION_IMPL *session, const char *cfg[])
{
	WT_CONFIG_ITEM cval;
	WT_TXN *txn;

	txn = &session->txn;

	WT_RET(__wt_config_gets_def(session, cfg, "isolation", 0, &cval));
	if (cval.len != 0)
		txn->isolation =
		    WT_STRING_MATCH("snapshot", cval.str, cval.len) ?
		    WT_ISO_SNAPSHOT :
		    WT_STRING_MATCH("read-committed", cval.str, cval.len) ?
		    WT_ISO_READ_COMMITTED : WT_ISO_READ_UNCOMMITTED;

	/*
	 * The default sync setting is inherited from the connection, but can
	 * be overridden by an explicit "sync" setting for this transaction.
	 *
	 * We want to distinguish an implicit, inherited setting from an
	 * explicit one.
	 */
	F_CLR(txn, WT_TXN_SYNC_SET);
	WT_RET(__wt_config_gets_def(
	    session, cfg, "sync", (int)UINT_MAX, &cval));
	if (cval.val == 0 || cval.val == 1)
		/*
		 * This is an explicit setting of sync.  Set the flag so
		 * that we know not to overwrite it in commit_transaction.
		 */
		F_SET(txn, WT_TXN_SYNC_SET);

	/*
	 * If sync is turned off explicitly, clear the transaction's sync field.
	 */
	if (cval.val == 0)
		txn->txn_logsync = 0;

	WT_RET(__wt_config_gets_def(session, cfg, "snapshot", 0, &cval));
	if (cval.len > 0)
		/*
		 * The layering here isn't ideal - the named snapshot get
		 * function does both validation and setup. Otherwise we'd
		 * need to walk the list of named snapshots twice during
		 * transaction open.
		 */
		WT_RET(__wt_txn_named_snapshot_get(session, &cval));

	WT_RET(__wt_config_gets_def(session, cfg, "read_timestamp", 0, &cval));
	if (cval.len > 0) {
#ifdef HAVE_TIMESTAMPS
		wt_timestamp_t ts;
		WT_TXN_GLOBAL *txn_global;
		char timestamp_buf[2 * WT_TIMESTAMP_SIZE + 1];
		bool round_to_oldest;

		txn_global = &S2C(session)->txn_global;
		WT_RET(__wt_txn_parse_timestamp(session, "read", &ts, &cval));

		/*
		 * Read the configuration here, and convert the timestamp to
		 * its hex string form for any later messages, to reduce the
		 * span of the critical section.
		 */
		WT_RET(__wt_config_gets_def(session,
		    cfg, "round_to_oldest", 0, &cval));
		round_to_oldest = cval.val;
		WT_RET(__wt_timestamp_to_hex_string(
		    session, timestamp_buf, &ts));

		/*
		 * This code does not use the timestamp validation function,
		 * to avoid a race between checking and setting the
		 * transaction's read timestamp.
		 */
		__wt_readlock(session, &txn_global->rwlock);
		if (__wt_timestamp_cmp(
		    &ts, &txn_global->oldest_timestamp) < 0) {
			/*
			 * If the given read timestamp is earlier than the
			 * oldest timestamp, either round the read timestamp
			 * up to the oldest timestamp or fail.  Drop the lock
			 * before returning an error: a WT_RET_MSG while
			 * holding it would leak the lock.
			 */
			if (round_to_oldest)
				__wt_timestamp_set(&txn->read_timestamp,
				    &txn_global->oldest_timestamp);
			else {
				__wt_readunlock(session, &txn_global->rwlock);
				WT_RET_MSG(session, EINVAL, "read timestamp "
				    "%s older than oldest timestamp",
				    timestamp_buf);
			}
		} else {
			__wt_timestamp_set(&txn->read_timestamp, &ts);
			/*
			 * Reset the flag so we don't log a rounding message:
			 * the read timestamp was usable as given.
			 */
			round_to_oldest = false;
		}

		__wt_txn_set_read_timestamp(session);
		__wt_readunlock(session, &txn_global->rwlock);
		txn->isolation = WT_ISO_SNAPSHOT;
		if (round_to_oldest) {
			/*
			 * The message is generated here, after dropping the
			 * lock, to keep the critical section short.
			 */
			__wt_verbose(session, WT_VERB_TIMESTAMP, "Read "
			    "timestamp %s : Rounded to oldest timestamp",
			    timestamp_buf);
		}
#else
		WT_RET_MSG(session, EINVAL, "read_timestamp requires a "
		    "version of WiredTiger built with timestamp support");
#endif
	}

	return (0);
}
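Every key this function parses -- "isolation", "sync", "snapshot", "read_timestamp" and "round_to_oldest" -- arrives as WT_SESSION::begin_transaction configuration. A hedged usage sketch for the WiredTiger version this function is from (the hex timestamp value is an arbitrary example):

#include <wiredtiger.h>

/* Start a snapshot-isolation transaction with sync turned off. */
static int
txn_start(WT_SESSION *session)
{
	return (session->begin_transaction(
	    session, "isolation=snapshot,sync=false"));
}

/*
 * Start a transaction reading as of a timestamp; ask the engine to
 * round up to the oldest timestamp instead of failing if the requested
 * point is too old.  Requires a build with timestamp support.
 */
static int
txn_start_at(WT_SESSION *session)
{
	return (session->begin_transaction(session,
	    "read_timestamp=1a2b,round_to_oldest=true"));
}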
Example #29
/*
 * __wt_lsm_tree_create --
 *	Create an LSM tree structure for the given name.
 */
int
__wt_lsm_tree_create(WT_SESSION_IMPL *session,
    const char *uri, int exclusive, const char *config)
{
	WT_CONFIG_ITEM cval;
	WT_DECL_ITEM(buf);
	WT_DECL_RET;
	WT_LSM_TREE *lsm_tree;
	const char *cfg[] =
	    { WT_CONFIG_BASE(session, session_create), config, NULL };
	const char *tmpconfig;

	/* If the tree is open, it already exists. */
	if ((ret = __wt_lsm_tree_get(session, uri, 0, &lsm_tree)) == 0) {
		__wt_lsm_tree_release(session, lsm_tree);
		return (exclusive ? EEXIST : 0);
	}
	WT_RET_NOTFOUND_OK(ret);

	/*
	 * If the tree has metadata, it already exists.
	 *
	 * !!!
	 * Use a local variable: we don't care what the existing configuration
	 * is, but we don't want to overwrite the real config.
	 */
	if ((ret = __wt_metadata_search(session, uri, &tmpconfig)) == 0) {
		__wt_free(session, tmpconfig);
		return (exclusive ? EEXIST : 0);
	}
	WT_RET_NOTFOUND_OK(ret);

	WT_RET(__wt_config_gets(session, cfg, "key_format", &cval));
	if (WT_STRING_MATCH("r", cval.str, cval.len))
		WT_RET_MSG(session, EINVAL,
		    "LSM trees cannot be configured as column stores");

	WT_RET(__wt_calloc_def(session, 1, &lsm_tree));

	WT_ERR(__lsm_tree_set_name(session, lsm_tree, uri));

	WT_ERR(__wt_config_gets(session, cfg, "key_format", &cval));
	WT_ERR(__wt_strndup(session, cval.str, cval.len,
	    &lsm_tree->key_format));
	WT_ERR(__wt_config_gets(session, cfg, "value_format", &cval));
	WT_ERR(__wt_strndup(session, cval.str, cval.len,
	    &lsm_tree->value_format));

	WT_ERR(__wt_config_gets(session, cfg, "collator", &cval));
	WT_ERR(__wt_strndup(session, cval.str, cval.len,
	    &lsm_tree->collator_name));

	WT_ERR(__wt_config_gets(session, cfg, "lsm.auto_throttle", &cval));
	if (cval.val)
		F_SET(lsm_tree, WT_LSM_TREE_THROTTLE);
	else
		F_CLR(lsm_tree, WT_LSM_TREE_THROTTLE);
	WT_ERR(__wt_config_gets(session, cfg, "lsm.bloom", &cval));
	FLD_SET(lsm_tree->bloom,
	    (cval.val == 0 ? WT_LSM_BLOOM_OFF : WT_LSM_BLOOM_MERGED));
	WT_ERR(__wt_config_gets(session, cfg, "lsm.bloom_oldest", &cval));
	if (cval.val != 0)
		FLD_SET(lsm_tree->bloom, WT_LSM_BLOOM_OLDEST);

	if (FLD_ISSET(lsm_tree->bloom, WT_LSM_BLOOM_OFF) &&
	    FLD_ISSET(lsm_tree->bloom, WT_LSM_BLOOM_OLDEST))
		WT_ERR_MSG(session, EINVAL,
		    "Bloom filters can only be created on newest and oldest "
		    "chunks if bloom filters are enabled");

	WT_ERR(__wt_config_gets(session, cfg, "lsm.bloom_config", &cval));
	if (cval.type == WT_CONFIG_ITEM_STRUCT) {
		/* Strip the surrounding parentheses from the value. */
		cval.str++;
		cval.len -= 2;
	}
	WT_ERR(__wt_strndup(session, cval.str, cval.len,
	    &lsm_tree->bloom_config));

	WT_ERR(__wt_config_gets(session, cfg, "lsm.bloom_bit_count", &cval));
	lsm_tree->bloom_bit_count = (uint32_t)cval.val;
	WT_ERR(__wt_config_gets(session, cfg, "lsm.bloom_hash_count", &cval));
	lsm_tree->bloom_hash_count = (uint32_t)cval.val;
	WT_ERR(__wt_config_gets(session, cfg, "lsm.chunk_max", &cval));
	lsm_tree->chunk_max = (uint64_t)cval.val;
	WT_ERR(__wt_config_gets(session, cfg, "lsm.chunk_size", &cval));
	lsm_tree->chunk_size = (uint64_t)cval.val;
	if (lsm_tree->chunk_size > lsm_tree->chunk_max)
		WT_ERR_MSG(session, EINVAL,
		    "Chunk size (chunk_size) must be smaller than or equal to "
		    "the maximum chunk size (chunk_max)");
	WT_ERR(__wt_config_gets(session, cfg, "lsm.merge_max", &cval));
	lsm_tree->merge_max = (uint32_t)cval.val;
	lsm_tree->merge_min = lsm_tree->merge_max / 2;
	WT_ERR(__wt_config_gets(session, cfg, "lsm.merge_threads", &cval));
	lsm_tree->merge_threads = (uint32_t)cval.val;
	/* Sanity check that api_data.py is in sync with lsm.h */
	WT_ASSERT(session, lsm_tree->merge_threads <= WT_LSM_MAX_WORKERS);

	/*
	 * Set up the config for each chunk.  If possible, avoid high latencies
	 * from fsync by flushing the cache every 8MB (will be overridden by
	 * any application setting).
	 */
	tmpconfig = "";
#ifdef HAVE_SYNC_FILE_RANGE
	if (!S2C(session)->direct_io)
		tmpconfig = "os_cache_dirty_max=8MB,";
#endif
	WT_ERR(__wt_scr_alloc(session, 0, &buf));
	WT_ERR(__wt_buf_fmt(session, buf,
	    "%s%s,key_format=u,value_format=u", tmpconfig, config));
	lsm_tree->file_config = __wt_buf_steal(session, buf, NULL);

	/* Create the first chunk and flush the metadata. */
	WT_ERR(__wt_lsm_meta_write(session, lsm_tree));

	/* Discard our partially populated handle. */
	ret = __lsm_tree_discard(session, lsm_tree);
	lsm_tree = NULL;

	/*
	 * Open our new tree and add it to the handle cache. Don't discard on
	 * error: the returned handle is NULL on error, and the metadata
	 * tracking macros handle cleaning up on failure.
	 */
	if (ret == 0)
		ret = __lsm_tree_open(session, uri, &lsm_tree);
	if (ret == 0)
		__wt_lsm_tree_release(session, lsm_tree);

	if (0) {
err:		WT_TRET(__lsm_tree_discard(session, lsm_tree));
	}
	__wt_scr_free(&buf);
	return (ret);
}
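The "lsm." keys read by this function map directly onto the lsm=() category of WT_SESSION::create configuration. A hedged example of creating an LSM tree through the public API (the URI and the specific values are invented; note that chunk_size must not exceed chunk_max, per the check above):

#include <wiredtiger.h>

/* Create an LSM tree with explicit Bloom and chunk settings. */
static int
lsm_create(WT_SESSION *session)
{
	return (session->create(session, "lsm:example",
	    "key_format=S,value_format=S,"
	    "lsm=(bloom_bit_count=16,bloom_hash_count=8,"
	    "chunk_size=10MB,chunk_max=5GB)"));
}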
Example #30
/*
 * __curjoin_init_iter --
 *	Initialize before any iteration.
 */
static int
__curjoin_init_iter(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin)
{
	WT_BLOOM *bloom;
	WT_DECL_RET;
	WT_CURSOR_JOIN_ENTRY *je, *jeend, *je2;
	WT_CURSOR_JOIN_ENDPOINT *end;
	uint32_t f, k;

	if (cjoin->entries_next == 0)
		WT_RET_MSG(session, EINVAL,
		    "join cursor has not yet been joined with any other "
		    "cursors");

	je = &cjoin->entries[0];
	WT_RET(__curjoin_entry_iter_init(session, cjoin, je, &cjoin->iter));

	jeend = &cjoin->entries[cjoin->entries_next];
	for (je = cjoin->entries; je < jeend; je++) {
		__wt_stat_join_init_single(&je->stats);
		for (end = &je->ends[0]; end < &je->ends[je->ends_next];
		     end++)
			WT_RET(__curjoin_endpoint_init_key(session, je, end));

		/*
		 * The first entry is iterated as the 'outermost' cursor.
		 * For the common GE case, we don't have to test against
		 * the left reference key: the btree is ordered, so the
		 * comparison always holds.
		 */
		if (je == cjoin->entries && je->ends[0].flags ==
		    (WT_CURJOIN_END_GT | WT_CURJOIN_END_EQ))
			F_SET(cjoin, WT_CURJOIN_SKIP_FIRST_LEFT);

		if (F_ISSET(je, WT_CURJOIN_ENTRY_BLOOM)) {
			if (je->bloom == NULL) {
				/*
				 * Look for compatible filters to be shared,
				 * pick compatible numbers for bit counts
				 * and number of hashes.
				 */
				f = je->bloom_bit_count;
				k = je->bloom_hash_count;
				for (je2 = je + 1; je2 < jeend; je2++)
					if (F_ISSET(je2,
					    WT_CURJOIN_ENTRY_BLOOM) &&
					    je2->count == je->count) {
						f = WT_MAX(
						    je2->bloom_bit_count, f);
						k = WT_MAX(
						    je2->bloom_hash_count, k);
					}
				je->bloom_bit_count = f;
				je->bloom_hash_count = k;
				WT_RET(__wt_bloom_create(session, NULL,
				    NULL, je->count, f, k, &je->bloom));
				F_SET(je, WT_CURJOIN_ENTRY_OWN_BLOOM);
				WT_RET(__curjoin_init_bloom(session, cjoin,
				    je, je->bloom));
				/*
				 * Share the Bloom filter, making all
				 * config info consistent.
				 */
				for (je2 = je + 1; je2 < jeend; je2++)
					if (F_ISSET(je2,
					    WT_CURJOIN_ENTRY_BLOOM) &&
					    je2->count == je->count) {
						WT_ASSERT(session,
						    je2->bloom == NULL);
						je2->bloom = je->bloom;
						je2->bloom_bit_count = f;
						je2->bloom_hash_count = k;
					}
			} else {
				/*
				 * Create a temporary filter that we'll
				 * merge into the shared one.  The Bloom
				 * parameters of the two filters must match.
				 */
				WT_RET(__wt_bloom_create(session, NULL,
				    NULL, je->count, je->bloom_bit_count,
				    je->bloom_hash_count, &bloom));
				WT_RET(__curjoin_init_bloom(session, cjoin,
				    je, bloom));
				WT_RET(__wt_bloom_intersection(je->bloom,
				    bloom));
				WT_RET(__wt_bloom_close(bloom));
			}
		}
	}

	F_SET(cjoin, WT_CURJOIN_INITIALIZED);
	return (ret);
}
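All of the per-entry state initialized here -- endpoints, counts and Bloom parameters -- originates in WT_SESSION::join calls made before the join cursor is first iterated. A hedged sketch of that setup from the application side (the table, index, key value and count are invented for the example; the error paths are abbreviated and leak the cursors):

#include <wiredtiger.h>

/* Join an index cursor into a join cursor and scan the matches. */
static int
join_scan(WT_SESSION *session)
{
	WT_CURSOR *join_cursor, *age_cursor;
	int ret;

	if ((ret = session->open_cursor(session,
	    "join:table:people", NULL, NULL, &join_cursor)) != 0)
		return (ret);
	if ((ret = session->open_cursor(session,
	    "index:people:age", NULL, NULL, &age_cursor)) != 0)
		return (ret);

	/* Position the reference cursor at the endpoint: age >= 21. */
	age_cursor->set_key(age_cursor, 21);
	if ((ret = age_cursor->search(age_cursor)) != 0)
		return (ret);

	/* Ask for a Bloom filter, sized by the expected match count. */
	if ((ret = session->join(session, join_cursor, age_cursor,
	    "compare=ge,strategy=bloom,count=1000,"
	    "bloom_bit_count=16,bloom_hash_count=8")) != 0)
		return (ret);

	while ((ret = join_cursor->next(join_cursor)) == 0)
		;	/* get_key/get_value on join_cursor as needed. */
	return (ret == WT_NOTFOUND ? 0 : ret);
}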