Example #1
0
/*************************************************************//**
Inserts an entry into a hash table. If an entry with the same fold number
is found, its node is updated to point to the new data, and no new node
is inserted. If btr_search_enabled is set to FALSE, we will only allow
updating existing nodes, but no new node is allowed to be added.
@return	TRUE on success, FALSE if no more memory could be allocated */
UNIV_INTERN
ibool
ha_insert_for_fold_func(
/*====================*/
	hash_table_t*	table,	/*!< in: hash table */
	ulint		fold,	/*!< in: folded value of data; if a node with
				the same fold value already exists, it is
				updated to point to the same data, and no new
				node is created! */
#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
	buf_block_t*	block,	/*!< in: buffer block containing the data */
#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
	const rec_t*	data)	/*!< in: data, must not be NULL */
{
	hash_cell_t*	cell;
	ha_node_t*	node;
	ha_node_t*	prev_node;
	ulint		hash;

	ut_ad(data);
	ut_ad(table);
	ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
	ut_a(block->frame == page_align(data));
#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
#ifdef UNIV_SYNC_DEBUG
	ut_ad(rw_lock_own(block->btr_search_latch, RW_LOCK_EX));
#endif /* UNIV_SYNC_DEBUG */
	ASSERT_HASH_MUTEX_OWN(table, fold);
	ut_ad(btr_search_enabled);

	hash = hash_calc_hash(fold, table);

	cell = hash_get_nth_cell(table, hash);

	prev_node = cell->node;

	while (prev_node != NULL) {
		if (prev_node->fold == fold) {
#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
			if (table->adaptive) {
				buf_block_t* prev_block = prev_node->block;
				ut_a(prev_block->frame
				     == page_align(prev_node->data));
				ut_a(prev_block->n_pointers > 0);
				prev_block->n_pointers--;
				block->n_pointers++;
			}

			prev_node->block = block;
#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
			prev_node->data = data;

			return(TRUE);
		}

		prev_node = prev_node->next;
	}

	/* We have to allocate a new chain node */

	node = mem_heap_alloc(hash_get_heap(table, fold), sizeof(ha_node_t));

	if (node == NULL) {
		/* It was a btr search type memory heap and at the moment
		no more memory could be allocated: return */

		ut_ad(hash_get_heap(table, fold)->type & MEM_HEAP_BTR_SEARCH);

		return(FALSE);
	}

	ha_node_set_data(node, block, data);

#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
	if (table->adaptive) {
		block->n_pointers++;
	}
#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */

	node->fold = fold;

	node->next = NULL;

	prev_node = cell->node;

	if (prev_node == NULL) {

		cell->node = node;

		return(TRUE);
	}

	while (prev_node->next != NULL) {

		prev_node = prev_node->next;
	}

	prev_node->next = node;

	return(TRUE);
}
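
The insert-or-update-by-fold pattern above can be illustrated outside InnoDB. The following is a minimal, self-contained sketch (hypothetical types and names, not the InnoDB API): it walks the chain of the target cell, updates the data pointer if a node with the same fold already exists, and otherwise appends a new node at the end of the chain.

#include <stdlib.h>

typedef struct node_struct {
	unsigned long		fold;	/* folded value of the data */
	const void*		data;	/* payload pointer */
	struct node_struct*	next;	/* next node in the chain */
} node_t;

typedef struct {
	node_t**	cells;		/* array of chain heads */
	unsigned long	n_cells;	/* number of cells */
} table_t;

/* Inserts data under the given fold, or updates the node that already
carries that fold. Returns 1 on success, 0 if memory ran out. */
static int
table_insert_for_fold(table_t* table, unsigned long fold, const void* data)
{
	node_t**	head = &table->cells[fold % table->n_cells];
	node_t*		node;

	for (node = *head; node != NULL; node = node->next) {
		if (node->fold == fold) {
			node->data = data;	/* update in place */
			return(1);
		}
	}

	node = malloc(sizeof(node_t));

	if (node == NULL) {
		return(0);
	}

	node->fold = fold;
	node->data = data;
	node->next = NULL;

	/* append at the end of the chain, as the function above does */
	while (*head != NULL) {
		head = &(*head)->next;
	}

	*head = node;

	return(1);
}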
Example #2
0
ibool
ib_handle_errors(
/*=============*/
	enum db_err*	new_err,/*!< out: possible new error encountered in
				lock wait, or if no new error, the value
				of trx->error_state at the entry of this
				function */
	trx_t*		trx,	/*!< in: transaction */
	que_thr_t*	thr,	/*!< in: query thread */
	trx_savept_t*	savept)	/*!< in: savepoint or NULL */
{
	enum db_err	err;

handle_new_error:
	err = trx->error_state;

	ut_a(err != DB_SUCCESS);

	trx->error_state = DB_SUCCESS;

	switch (err) {
	case DB_LOCK_WAIT_TIMEOUT:
		if (ses_rollback_on_timeout) {
			trx_general_rollback(trx, FALSE, NULL);
			break;
		}
		/* fall through */
	case DB_DUPLICATE_KEY:
	case DB_FOREIGN_DUPLICATE_KEY:
	case DB_TOO_BIG_RECORD:
	case DB_ROW_IS_REFERENCED:
	case DB_NO_REFERENCED_ROW:
	case DB_CANNOT_ADD_CONSTRAINT:
	case DB_TOO_MANY_CONCURRENT_TRXS:
	case DB_OUT_OF_FILE_SPACE:
		if (savept) {
			/* Roll back the latest, possibly incomplete
			insertion or update */

			trx_general_rollback(trx, TRUE, savept);
		}
		break;
	case DB_LOCK_WAIT:
		srv_suspend_user_thread(thr);

		if (trx->error_state != DB_SUCCESS) {
			que_thr_stop_client(thr);

			goto handle_new_error;
		}

		*new_err = err;

		return(TRUE); /* Operation needs to be retried. */

	case DB_DEADLOCK:
	case DB_LOCK_TABLE_FULL:
		/* Roll back the whole transaction; this resolution was added
		in version 3.23.43 */

		trx_general_rollback(trx, FALSE, NULL);
		break;

	case DB_MUST_GET_MORE_FILE_SPACE:
		srv_panic(DB_ERROR,
		      "InnoDB: The database cannot continue"
		      " operation because of\n"
		      "InnoDB: lack of space. You must add"
		      " a new data file\n"
		      "InnoDB: and restart the database.\n");
		break;

	case DB_CORRUPTION:
		ib_logger(ib_stream,
		      "InnoDB: We detected index corruption"
		      " in an InnoDB type table.\n"
		      "InnoDB: You have to dump + drop + reimport"
		      " the table or, in\n"
		      "InnoDB: a case of widespread corruption,"
		      " dump all InnoDB\n"
		      "InnoDB: tables and recreate the"
		      " whole InnoDB tablespace.\n"
		      "InnoDB: If the server crashes"
		      " after the startup or when\n"
		      "InnoDB: you dump the tables, check the \n"
		      "InnoDB: InnoDB website for help.\n");
		break;
	default:
		ib_logger(ib_stream, "InnoDB: unknown error code %lu\n",
			(ulong) err);
		ut_error;
	}

	if (trx->error_state != DB_SUCCESS) {
		*new_err = trx->error_state;
	} else {
		*new_err = err;
	}

	trx->error_state = DB_SUCCESS;

	return(FALSE);
}
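
A sketch of how a caller might drive ib_handle_errors(): the loop and the run_row_op() helper below are hypothetical, shown only to illustrate the contract that TRUE means "retry the operation" (the lock wait was resolved) and FALSE means "report the error" (any necessary rollback has already been done).

static enum db_err run_row_op(trx_t* trx, que_thr_t* thr);	/* hypothetical */

/* Hypothetical caller: run one row operation, retrying after lock waits. */
static enum db_err
run_row_op_with_retries(trx_t* trx, que_thr_t* thr, trx_savept_t* savept)
{
	enum db_err	err;

	for (;;) {
		err = run_row_op(trx, thr);

		if (err == DB_SUCCESS) {
			return(err);
		}

		/* Row operations normally leave the error in
		trx->error_state; make sure it is set before handling it. */
		trx->error_state = err;

		if (!ib_handle_errors(&err, trx, thr, savept)) {
			/* The error was handled (possibly by a rollback),
			but the operation cannot simply be retried. */
			return(err);
		}

		/* A lock wait ended successfully: retry the operation. */
	}
}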
Example #3
0
/*******************************************************************//**
Builds a partial row from an update undo log record. It contains the
columns that occur as ordering fields in any index of the table.
@return	pointer to remaining part of undo record */
UNIV_INTERN
byte*
trx_undo_rec_get_partial_row(
/*=========================*/
	byte*		ptr,	/*!< in: remaining part in update undo log
				record of a suitable type, at the start of
				the stored index columns;
				NOTE that this copy of the undo log record must
				be preserved as long as the partial row is
				used, as we do NOT copy the data in the
				record! */
	dict_index_t*	index,	/*!< in: clustered index */
	dtuple_t**	row,	/*!< out, own: partial row */
	ibool		ignore_prefix, /*!< in: flag to indicate if we
				expect blob prefixes in undo. Used
				only in the assertion. */
	mem_heap_t*	heap)	/*!< in: memory heap from which the memory
				needed is allocated */
{
	const byte*	end_ptr;
	ulint		row_len;

	ut_ad(index);
	ut_ad(ptr);
	ut_ad(row);
	ut_ad(heap);
	ut_ad(dict_index_is_clust(index));

	row_len = dict_table_get_n_cols(index->table);

	*row = dtuple_create(heap, row_len);

	dict_table_copy_types(*row, index->table);

	end_ptr = ptr + mach_read_from_2(ptr);
	ptr += 2;

	while (ptr != end_ptr) {
		dfield_t*		dfield;
		byte*			field;
		ulint			field_no;
		const dict_col_t*	col;
		ulint			col_no;
		ulint			len;
		ulint			orig_len;

		ptr = trx_undo_update_rec_get_field_no(ptr, &field_no);

		col = dict_index_get_nth_col(index, field_no);
		col_no = dict_col_get_no(col);

		ptr = trx_undo_rec_get_col_val(ptr, &field, &len, &orig_len);

		dfield = dtuple_get_nth_field(*row, col_no);

		dfield_set_data(dfield, field, len);

		if (len != UNIV_SQL_NULL
		    && len >= UNIV_EXTERN_STORAGE_FIELD) {
			dfield_set_len(dfield,
				       len - UNIV_EXTERN_STORAGE_FIELD);
			dfield_set_ext(dfield);
			/* If the prefix of this column is indexed,
			ensure that enough prefix is stored in the
			undo log record. */
			ut_a(ignore_prefix
			     || !col->ord_part
			     || dfield_get_len(dfield)
			     >= REC_MAX_INDEX_COL_LEN
			     + BTR_EXTERN_FIELD_REF_SIZE);
		}
	}

	return(ptr);
}
Example #4
0
void
buf_LRU_invalidate_tablespace(
/*==========================*/
	ulint	id)	/* in: space id */
{
	buf_block_t*	block;
	ulint		page_no;
	ibool		all_freed;

	/* Before we attempt to drop pages one by one we first
	attempt to drop page hash index entries in batches to make
	it more efficient. The batching is a best-effort attempt
	and does not guarantee that all page hash entries will be
	dropped. We get rid of remaining page hash entries
	one by one below. */
	buf_LRU_drop_page_hash_for_tablespace(id);

scan_again:
	mutex_enter(&(buf_pool->mutex));

	all_freed = TRUE;

	block = UT_LIST_GET_LAST(buf_pool->LRU);

	while (block != NULL) {
		buf_block_t*	prev_block;

		mutex_enter(&block->mutex);
		prev_block = UT_LIST_GET_PREV(LRU, block);

		ut_a(block->state == BUF_BLOCK_FILE_PAGE);

		if (block->space == id
		    && (block->buf_fix_count > 0 || block->io_fix != 0)) {

			/* We cannot remove this page during this scan yet;
			maybe the system is currently reading it in, or
			flushing the modifications to the file */

			all_freed = FALSE;

			goto next_page;
		}

		if (block->space == id) {
#ifdef UNIV_DEBUG
			if (buf_debug_prints) {
				fprintf(stderr,
					"Dropping space %lu page %lu\n",
					(ulong) block->space,
					(ulong) block->offset);
			}
#endif
			if (block->is_hashed) {
				page_no = block->offset;

				mutex_exit(&block->mutex);

				mutex_exit(&(buf_pool->mutex));

				/* Note that the following call will acquire
				an S-latch on the page */

				btr_search_drop_page_hash_when_freed(id,
								     page_no);
				goto scan_again;
			}

			if (0 != ut_dulint_cmp(block->oldest_modification,
					       ut_dulint_zero)) {

				/* Remove from the flush list of modified
				blocks */
				block->oldest_modification = ut_dulint_zero;

				UT_LIST_REMOVE(flush_list,
					       buf_pool->flush_list, block);
			}

			/* Remove from the LRU list */
			buf_LRU_block_remove_hashed_page(block);
			buf_LRU_block_free_hashed_page(block);
		}
next_page:
		mutex_exit(&block->mutex);
		block = prev_block;
	}

	mutex_exit(&(buf_pool->mutex));

	if (!all_freed) {
		os_thread_sleep(20000);

		goto scan_again;
	}
}
Example #5
0
/**********************************************************************
Adds a block to the LRU list. */
UNIV_INLINE
void
buf_LRU_add_block_low(
/*==================*/
	buf_block_t*	block,	/* in: control block */
	ibool		old)	/* in: TRUE if should be put to the old blocks
				in the LRU list, else put to the start; if the
				LRU list is very short, the block is added to
				the start, regardless of this parameter */
{
	ulint	cl;

	ut_ad(buf_pool);
	ut_ad(block);
	ut_ad(mutex_own(&(buf_pool->mutex)));

	ut_a(block->state == BUF_BLOCK_FILE_PAGE);
	ut_a(!block->in_LRU_list);

	block->old = old;
	cl = buf_pool_clock_tic();

	if (srv_use_awe && block->frame) {
		/* Add to the list of mapped pages; for simplicity we always
		add to the start, even if the user would have set 'old'
		TRUE */

		UT_LIST_ADD_FIRST(awe_LRU_free_mapped,
				  buf_pool->awe_LRU_free_mapped, block);
	}

	if (!old || (UT_LIST_GET_LEN(buf_pool->LRU) < BUF_LRU_OLD_MIN_LEN)) {

		UT_LIST_ADD_FIRST(LRU, buf_pool->LRU, block);

		block->LRU_position = cl;
		block->freed_page_clock = buf_pool->freed_page_clock;
	} else {
		UT_LIST_INSERT_AFTER(LRU, buf_pool->LRU, buf_pool->LRU_old,
				     block);
		buf_pool->LRU_old_len++;

		/* We copy the LRU position field of the previous block
		to the new block */

		block->LRU_position = (buf_pool->LRU_old)->LRU_position;
	}

	block->in_LRU_list = TRUE;

	if (UT_LIST_GET_LEN(buf_pool->LRU) > BUF_LRU_OLD_MIN_LEN) {

		ut_ad(buf_pool->LRU_old);

		/* Adjust the length of the old block list if necessary */

		buf_LRU_old_adjust_len();

	} else if (UT_LIST_GET_LEN(buf_pool->LRU) == BUF_LRU_OLD_MIN_LEN) {

		/* The LRU list is now long enough for LRU_old to become
		defined: init it */

		buf_LRU_old_init();
	}
}
Example #6
0
/***************************************************************
Updates a clustered index record of a row when the ordering fields do
not change. */
static
ulint
row_upd_clust_rec(
/*==============*/
				/* out: DB_SUCCESS if operation successfully
				completed, else error code or DB_LOCK_WAIT */
	upd_node_t*	node,	/* in: row update node */
	dict_index_t*	index,	/* in: clustered index */
	que_thr_t*	thr,	/* in: query thread */
	mtr_t*		mtr)	/* in: mtr; gets committed here */
{
	big_rec_t*	big_rec	= NULL;
	btr_pcur_t*	pcur;
	btr_cur_t*	btr_cur;
	ulint		err;
	
	ut_ad(node);
	ut_ad(index->type & DICT_CLUSTERED);

	pcur = node->pcur;
	btr_cur = btr_pcur_get_btr_cur(pcur);

	ut_ad(FALSE == rec_get_deleted_flag(btr_pcur_get_rec(pcur)));
	
	/* Try optimistic updating of the record, keeping changes within
	the page; we do not check locks because we assume the x-lock on the
	record to update */

	if (node->cmpl_info & UPD_NODE_NO_SIZE_CHANGE) {
		err = btr_cur_update_in_place(BTR_NO_LOCKING_FLAG,
						btr_cur, node->update,
						node->cmpl_info, thr, mtr);
	} else {
		err = btr_cur_optimistic_update(BTR_NO_LOCKING_FLAG,
						btr_cur, node->update,
						node->cmpl_info, thr, mtr);
	}

	mtr_commit(mtr);
	
	if (err == DB_SUCCESS) {

		return(err);
	}

	/* We may have to modify the tree structure: do a pessimistic descent
	down the index tree */

	mtr_start(mtr);
	
	/* NOTE: this transaction has an s-lock or x-lock on the record and
	therefore other transactions cannot modify the record when we have no
	latch on the page. In addition, we assume that other query threads of
	the same transaction do not modify the record in the meantime.
	Therefore we can assert that the restoration of the cursor succeeds. */

	ut_a(btr_pcur_restore_position(BTR_MODIFY_TREE, pcur, mtr));

	ut_ad(FALSE == rec_get_deleted_flag(btr_pcur_get_rec(pcur)));
	
	err = btr_cur_pessimistic_update(BTR_NO_LOCKING_FLAG, btr_cur,
					&big_rec, node->update,
					node->cmpl_info, thr, mtr);
	mtr_commit(mtr);

	if (err == DB_SUCCESS && big_rec) {
		mtr_start(mtr);
		ut_a(btr_pcur_restore_position(BTR_MODIFY_TREE, pcur, mtr));
	
		err = btr_store_big_rec_extern_fields(index,
						btr_cur_get_rec(btr_cur),
						big_rec, mtr);		
		mtr_commit(mtr);
	}

	if (big_rec) {
		dtuple_big_rec_free(big_rec);
	}
		
	return(err);
}
Example #7
0
upd_t*
row_upd_build_difference_binary(
/*============================*/
				/* out, own: update vector of differing
				fields, excluding roll ptr and trx id */
	dict_index_t*	index,	/* in: clustered index */
	dtuple_t*	entry,	/* in: entry to insert */
	ulint*		ext_vec,/* in: array containing field numbers of
				externally stored fields in entry, or NULL */
	ulint		n_ext_vec,/* in: number of fields in ext_vec */
	rec_t*		rec,	/* in: clustered index record */
	mem_heap_t*	heap)	/* in: memory heap from which allocated */
{
	upd_field_t*	upd_field;
	dfield_t*	dfield;
	byte*		data;
	ulint		len;
	upd_t*		update;
	ulint		n_diff;
	ulint		roll_ptr_pos;
	ulint		trx_id_pos;
	ulint		i;

	/* This function is used only for a clustered index */
	ut_a(index->type & DICT_CLUSTERED);

	update = upd_create(dtuple_get_n_fields(entry), heap);

	n_diff = 0;

	roll_ptr_pos = dict_index_get_sys_col_pos(index, DATA_ROLL_PTR);
	trx_id_pos = dict_index_get_sys_col_pos(index, DATA_TRX_ID);

	for (i = 0; i < dtuple_get_n_fields(entry); i++) {

		data = rec_get_nth_field(rec, i, &len);

		dfield = dtuple_get_nth_field(entry, i);

		/* NOTE: we compare the fields as binary strings!
		(No collation) */

		if (i == trx_id_pos || i == roll_ptr_pos) {

			goto skip_compare;
		}
		
		if (rec_get_nth_field_extern_bit(rec, i)
		    != upd_ext_vec_contains(ext_vec, n_ext_vec, i)
		    || !dfield_data_is_binary_equal(dfield, len, data)) {

			upd_field = upd_get_nth_field(update, n_diff);

			dfield_copy(&(upd_field->new_val), dfield);

			upd_field_set_field_no(upd_field, i, index);

			if (upd_ext_vec_contains(ext_vec, n_ext_vec, i)) {
				upd_field->extern_storage = TRUE;
			} else {
				upd_field->extern_storage = FALSE;
			}
				
			n_diff++;
		}
skip_compare:
		;
	}

	update->n_fields = n_diff;

	return(update);
}
Example #8
0
/**************************************************************//**
Moves parts of long fields in entry to the big record vector so that
the size of tuple drops below the maximum record size allowed in the
database. Moves data only from those fields which are not necessary
to determine uniquely the insertion place of the tuple in the index.
@return own: created big record vector, NULL if we are not able to
shorten the entry enough, i.e., if there are too many fixed-length or
short fields in entry or the index is not clustered */
UNIV_INTERN
big_rec_t*
dtuple_convert_big_rec(
/*===================*/
	dict_index_t*	index,	/*!< in: index */
	dtuple_t*	entry,	/*!< in/out: index entry */
	ulint*		n_ext)	/*!< in/out: number of
				externally stored columns */
{
	mem_heap_t*	heap;
	big_rec_t*	vector;
	dfield_t*	dfield;
	dict_field_t*	ifield;
	ulint		size;
	ulint		n_fields;
	ulint		local_len;
	ulint		local_prefix_len;

	if (UNIV_UNLIKELY(!dict_index_is_clust(index))) {
		return(NULL);
	}

	if (dict_table_get_format(index->table) < DICT_TF_FORMAT_ZIP) {
		/* up to MySQL 5.1: store a 768-byte prefix locally */
		local_len = BTR_EXTERN_FIELD_REF_SIZE
			+ DICT_ANTELOPE_MAX_INDEX_COL_LEN;
	} else {
		/* new-format table: do not store any BLOB prefix locally */
		local_len = BTR_EXTERN_FIELD_REF_SIZE;
	}

	ut_a(dtuple_check_typed_no_assert(entry));

	size = rec_get_converted_size(index, entry, *n_ext);

	if (UNIV_UNLIKELY(size > 1000000000)) {
		fprintf(stderr,
			"InnoDB: Warning: tuple size very big: %lu\n",
			(ulong) size);
		fputs("InnoDB: Tuple contents: ", stderr);
		dtuple_print(stderr, entry);
		putc('\n', stderr);
	}

	heap = mem_heap_create(size + dtuple_get_n_fields(entry)
			       * sizeof(big_rec_field_t) + 1000);

	vector = mem_heap_alloc(heap, sizeof(big_rec_t));

	vector->heap = heap;
	vector->fields = mem_heap_alloc(heap, dtuple_get_n_fields(entry)
					* sizeof(big_rec_field_t));

	/* Decide which fields to shorten: the algorithm is to look for
	a variable-length field that yields the biggest savings when
	stored externally */

	n_fields = 0;

	while (page_zip_rec_needs_ext(rec_get_converted_size(index, entry,
							     *n_ext),
				      dict_table_is_comp(index->table),
				      dict_index_get_n_fields(index),
				      dict_table_zip_size(index->table))) {
		ulint			i;
		ulint			longest		= 0;
		ulint			longest_i	= ULINT_MAX;
		byte*			data;
		big_rec_field_t*	b;

		for (i = dict_index_get_n_unique_in_tree(index);
		     i < dtuple_get_n_fields(entry); i++) {
			ulint	savings;

			dfield = dtuple_get_nth_field(entry, i);
			ifield = dict_index_get_nth_field(index, i);

			/* Skip fixed-length, NULL, externally stored,
			or short columns */

			if (ifield->fixed_len
			    || dfield_is_null(dfield)
			    || dfield_is_ext(dfield)
			    || dfield_get_len(dfield) <= local_len
			    || dfield_get_len(dfield)
			    <= BTR_EXTERN_FIELD_REF_SIZE * 2) {
				goto skip_field;
			}

			savings = dfield_get_len(dfield) - local_len;

			/* Check that there would be savings */
			if (longest >= savings) {
				goto skip_field;
			}

			/* In DYNAMIC and COMPRESSED format, store
			locally any non-BLOB columns whose maximum
			length does not exceed 256 bytes.  This is
			because there is no room for the "external
			storage" flag when the maximum length is 255
			bytes or less. This restriction trivially
			holds in REDUNDANT and COMPACT format, because
			there we always store locally columns whose
			length is up to local_len == 788 bytes.
			@see rec_init_offsets_comp_ordinary */
			if (ifield->col->mtype != DATA_BLOB
			    && ifield->col->len < 256) {
				goto skip_field;
			}

			longest_i = i;
			longest = savings;

skip_field:
			continue;
		}

		if (!longest) {
			/* Cannot shorten more */

			mem_heap_free(heap);

			return(NULL);
		}

		/* Move data from field longest_i to big rec vector.

		We store the first bytes locally to the record. Then
		we can calculate all ordering fields in all indexes
		from locally stored data. */

		dfield = dtuple_get_nth_field(entry, longest_i);
		ifield = dict_index_get_nth_field(index, longest_i);
		local_prefix_len = local_len - BTR_EXTERN_FIELD_REF_SIZE;

		b = &vector->fields[n_fields];
		b->field_no = longest_i;
		b->len = dfield_get_len(dfield) - local_prefix_len;
		b->data = (char*) dfield_get_data(dfield) + local_prefix_len;

		/* Allocate the locally stored part of the column. */
		data = mem_heap_alloc(heap, local_len);

		/* Copy the local prefix. */
		memcpy(data, dfield_get_data(dfield), local_prefix_len);
		/* Clear the extern field reference (BLOB pointer). */
		memset(data + local_prefix_len, 0, BTR_EXTERN_FIELD_REF_SIZE);
#if 0
		/* The following would fail the Valgrind checks in
		page_cur_insert_rec_low() and page_cur_insert_rec_zip().
		The BLOB pointers in the record will be initialized after
		the record and the BLOBs have been written. */
		UNIV_MEM_ALLOC(data + local_prefix_len,
			       BTR_EXTERN_FIELD_REF_SIZE);
#endif

		dfield_set_data(dfield, data, local_len);
		dfield_set_ext(dfield);

		n_fields++;
		(*n_ext)++;
		ut_ad(n_fields < dtuple_get_n_fields(entry));
	}

	vector->n_fields = n_fields;
	return(vector);
}
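
To make the savings computation in the loop above concrete: in the pre-Barracuda ("Antelope") case, local_len = BTR_EXTERN_FIELD_REF_SIZE + DICT_ANTELOPE_MAX_INDEX_COL_LEN = 20 + 768 = 788 bytes (consistent with the 788-byte figure quoted in the comment inside the loop), so externalizing a 10000-byte column keeps 788 bytes in the record (the 768-byte prefix plus the 20-byte field reference) and saves 10000 - 788 = 9212 bytes; with DICT_TF_FORMAT_ZIP or newer, local_len = 20 and the same column saves 9980 bytes. The loop repeatedly externalizes the column with the largest such savings until page_zip_rec_needs_ext() reports that the converted record fits.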
Example #9
0
/******************************************************************//**
Reserves a wait array cell for waiting for an object.
The event of the cell is reset to nonsignalled state. */
UNIV_INTERN
void
sync_array_reserve_cell(
/*====================*/
	sync_array_t*	arr,	/*!< in: wait array */
	void*		object, /*!< in: pointer to the object to wait for */
	ulint		type,	/*!< in: lock request type */
	const char*	file,	/*!< in: file where requested */
	ulint		line,	/*!< in: line where requested */
	ulint*		index)	/*!< out: index of the reserved cell */
{
	sync_cell_t*	cell;
	os_event_t	event;
	ulint		i;

	ut_a(object);
	ut_a(index);

	sync_array_enter(arr);

	arr->res_count++;

	/* Reserve a new cell. */
	for (i = 0; i < arr->n_cells; i++) {
		cell = sync_array_get_nth_cell(arr, i);

		if (cell->wait_object == NULL) {

			cell->waiting = FALSE;
			cell->wait_object = object;

			if (type == SYNC_MUTEX) {
				cell->old_wait_mutex = object;
			} else {
				cell->old_wait_rw_lock = object;
			}

			cell->request_type = type;

			cell->file = file;
			cell->line = line;

			arr->n_reserved++;

			*index = i;

			sync_array_exit(arr);

			/* Make sure the event is reset and also store
			the value of signal_count at which the event
			was reset. */
			event = sync_cell_get_event(cell);
			cell->signal_count = os_event_reset(event);

			cell->reservation_time = time(NULL);

			cell->thread = os_thread_get_curr_id();

			return;
		}
	}

	ut_error; /* No free cell found */

	return;
}
Example #10
0
/*****************************************************************//**
Based on a table object, this function builds the entry to be inserted
in the SYS_TABLES system table.
@return	the tuple which should be inserted */
static
dtuple_t*
dict_create_sys_tables_tuple(
    /*=========================*/
    const dict_table_t*	table,	/*!< in: table */
    mem_heap_t*		heap)	/*!< in: memory heap from
					which the memory for the built
					tuple is allocated */
{
    dict_table_t*	sys_tables;
    dtuple_t*	entry;
    dfield_t*	dfield;
    byte*		ptr;

    ut_ad(table);
    ut_ad(heap);

    sys_tables = dict_sys->sys_tables;

    entry = dtuple_create(heap, 8 + DATA_N_SYS_COLS);

    dict_table_copy_types(entry, sys_tables);

    /* 0: NAME -----------------------------*/
    dfield = dtuple_get_nth_field(entry, 0/*NAME*/);

    dfield_set_data(dfield, table->name, ut_strlen(table->name));
    /* 3: ID -------------------------------*/
    dfield = dtuple_get_nth_field(entry, 1/*ID*/);

    ptr = mem_heap_alloc(heap, 8);
    mach_write_to_8(ptr, table->id);

    dfield_set_data(dfield, ptr, 8);
    /* 4: N_COLS ---------------------------*/
    dfield = dtuple_get_nth_field(entry, 2/*N_COLS*/);

#if DICT_TF_COMPACT != 1
#error
#endif

    ptr = mem_heap_alloc(heap, 4);
    if (dict_table_is_gcs(table)) {	/* table definition modified */
        ut_ad(dict_table_is_comp(table));

        mach_write_to_4(ptr, table->n_def
                        | (1 << 31) | (1 << 30));
    } else {
        mach_write_to_4(ptr, table->n_def
                        | ((table->flags & DICT_TF_COMPACT) << 31));
    }

    dfield_set_data(dfield, ptr, 4);
    /* 5: TYPE -----------------------------*/
    dfield = dtuple_get_nth_field(entry, 3/*TYPE*/);

    ptr = mem_heap_alloc(heap, 4);
    if (table->flags & (~DICT_TF_COMPACT & ~(~0 << DICT_TF_BITS))) {
        ut_a(table->flags & DICT_TF_COMPACT);
        ut_a(dict_table_get_format(table) >= DICT_TF_FORMAT_ZIP);
        ut_a((table->flags & DICT_TF_ZSSIZE_MASK)
             <= (DICT_TF_ZSSIZE_MAX << DICT_TF_ZSSIZE_SHIFT));
        ut_a(!(table->flags & (~0 << DICT_TF2_BITS)));
        mach_write_to_4(ptr, table->flags & ~(~0 << DICT_TF_BITS));
    } else {
        mach_write_to_4(ptr, DICT_TABLE_ORDINARY);
    }

    dfield_set_data(dfield, ptr, 4);
    /* 6: MIX_ID (obsolete) ---------------------------*/
    dfield = dtuple_get_nth_field(entry, 4/*MIX_ID*/);

    ptr = mem_heap_zalloc(heap, 8);

    dfield_set_data(dfield, ptr, 8);
    /* 7: MIX_LEN (additional flags) --------------------------*/

    dfield = dtuple_get_nth_field(entry, 5/*MIX_LEN*/);

    ptr = mem_heap_alloc(heap, 4);
    mach_write_to_4(ptr, table->flags >> DICT_TF2_SHIFT);

    ut_ad(table->n_cols_before_alter_table == 0);

    dfield_set_data(dfield, ptr, 4);
    /* 8: CLUSTER_NAME ---------------------*/
    dfield = dtuple_get_nth_field(entry, 6/*CLUSTER_NAME*/);
    dfield_set_null(dfield); /* not supported */

    /* 9: SPACE ----------------------------*/
    dfield = dtuple_get_nth_field(entry, 7/*SPACE*/);

    ptr = mem_heap_alloc(heap, 4);
    mach_write_to_4(ptr, table->space);

    dfield_set_data(dfield, ptr, 4);
    /*----------------------------------*/

    return(entry);
}
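
The N_COLS value built above packs format flags into the high bits of the user column count. Below is a small sketch of the reverse mapping a reader would apply; it is derived only from the writer code above (the helper name is hypothetical), not quoted from the InnoDB dictionary loader.

/* Decode an N_COLS value as written by dict_create_sys_tables_tuple():
bit 31 is set for COMPACT row format, bit 30 (together with bit 31) for
a GCS table; the low bits hold the number of user-defined columns. */
static void
decode_n_cols(unsigned long v, unsigned long* n_cols, int* is_compact, int* is_gcs)
{
    *is_compact = (v >> 31) & 1;
    *is_gcs = (v >> 30) & 1;
    *n_cols = v & ~(3UL << 30);
}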
Example #11
0
/*******************************************************************//**
Truncates the index tree associated with a row in SYS_INDEXES table.
@return	new root page number, or FIL_NULL on failure */
UNIV_INTERN
ulint
dict_truncate_index_tree(
    /*=====================*/
    dict_table_t*	table,	/*!< in: the table the index belongs to */
    ulint		space,	/*!< in: 0=truncate,
				nonzero=create the index tree in the
				given tablespace */
    btr_pcur_t*	pcur,	/*!< in/out: persistent cursor pointing to
				record in the clustered index of
				SYS_INDEXES table. The cursor may be
				repositioned in this call. */
    mtr_t*		mtr)	/*!< in: mtr having the latch
				on the record page. The mtr may be
				committed and restarted in this call. */
{
    ulint		root_page_no;
    ibool		drop = !space;
    ulint		zip_size;
    ulint		type;
    index_id_t	index_id;
    rec_t*		rec;
    const byte*	ptr;
    ulint		len;
    dict_index_t*	index;

    ut_ad(mutex_own(&(dict_sys->mutex)));
    ut_a(!dict_table_is_comp(dict_sys->sys_indexes));
    rec = btr_pcur_get_rec(pcur);
    ptr = rec_get_nth_field_old(rec, DICT_SYS_INDEXES_PAGE_NO_FIELD, &len);

    ut_ad(len == 4);

    root_page_no = mtr_read_ulint(ptr, MLOG_4BYTES, mtr);

    if (drop && root_page_no == FIL_NULL) {
        /* The tree has been freed. */

        ut_print_timestamp(stderr);
        fprintf(stderr, "  InnoDB: Trying to TRUNCATE"
                " a missing index of table %s!\n", table->name);
        drop = FALSE;
    }

    ptr = rec_get_nth_field_old(rec,
                                DICT_SYS_INDEXES_SPACE_NO_FIELD, &len);

    ut_ad(len == 4);

    if (drop) {
        space = mtr_read_ulint(ptr, MLOG_4BYTES, mtr);
    }

    zip_size = fil_space_get_zip_size(space);

    if (UNIV_UNLIKELY(zip_size == ULINT_UNDEFINED)) {
        /* It is a single table tablespace and the .ibd file is
        missing: do nothing */

        ut_print_timestamp(stderr);
        fprintf(stderr, "  InnoDB: Trying to TRUNCATE"
                " a missing .ibd file of table %s!\n", table->name);
        return(FIL_NULL);
    }

    ptr = rec_get_nth_field_old(rec,
                                DICT_SYS_INDEXES_TYPE_FIELD, &len);
    ut_ad(len == 4);
    type = mach_read_from_4(ptr);

    ptr = rec_get_nth_field_old(rec, 1, &len);
    ut_ad(len == 8);
    index_id = mach_read_from_8(ptr);

    if (!drop) {

        goto create;
    }

    /* We free all the pages but the root page first; this operation
    may span several mini-transactions */

    btr_free_but_not_root(space, zip_size, root_page_no);

    /* Then we free the root page in the same mini-transaction where
    we create the b-tree and write its new root page number to the
    appropriate field in the SYS_INDEXES record: this mini-transaction
    marks the B-tree totally truncated */

    btr_block_get(space, zip_size, root_page_no, RW_X_LATCH, NULL, mtr);

    btr_free_root(space, zip_size, root_page_no, mtr);
create:
    /* We will temporarily write FIL_NULL to the PAGE_NO field
    in SYS_INDEXES, so that the database will not get into an
    inconsistent state in case it crashes between the mtr_commit()
    below and the following mtr_commit() call. */
    page_rec_write_field(rec, DICT_SYS_INDEXES_PAGE_NO_FIELD,
                         FIL_NULL, mtr);

    /* We will need to commit the mini-transaction in order to avoid
    deadlocks in the btr_create() call, because otherwise we would
    be freeing and allocating pages in the same mini-transaction. */
    btr_pcur_store_position(pcur, mtr);
    mtr_commit(mtr);

    mtr_start(mtr);
    btr_pcur_restore_position(BTR_MODIFY_LEAF, pcur, mtr);

    /* Find the index corresponding to this SYS_INDEXES record. */
    for (index = UT_LIST_GET_FIRST(table->indexes);
            index;
            index = UT_LIST_GET_NEXT(indexes, index)) {
        if (index->id == index_id) {
            root_page_no = btr_create(type, space, zip_size,
                                      index_id, index, mtr);
            index->page = (unsigned int) root_page_no;
            return(root_page_no);
        }
    }

    ut_print_timestamp(stderr);
    fprintf(stderr,
            "  InnoDB: Index %llu of table %s is missing\n"
            "InnoDB: from the data dictionary during TRUNCATE!\n",
            (ullint) index_id,
            table->name);

    return(FIL_NULL);
}
Example #12
0
/****************************************************************//**
Creates the SYS_ADDED_COLS_DEFAULT table inside InnoDB
at database creation or database start if it is not found or is
not of the right form.
@return	DB_SUCCESS or error code */
UNIV_INTERN
ulint
dict_create_or_check_added_cols_default_tables(void)
/*================================================*/
{
    dict_table_t*	table1;

    ulint		error;
    trx_t*		trx;

    mutex_enter(&(dict_sys->mutex));

    table1 = dict_table_get_low("SYS_ADDED_COLS_DEFAULT");

    if (table1 && UT_LIST_GET_LEN(table1->indexes) == 1) {

        /* The SYS_ADDED_COLS_DEFAULT table has already been
        created, and it is OK */

        mutex_exit(&(dict_sys->mutex));

#ifdef DEBUG
        fprintf(stderr,
                "SYS_ADDED_COLS_DEFAULT table have created success!!!!\n");
#endif
        return(DB_SUCCESS);
    }

#ifdef DEBUG
    fprintf(stderr,
            "SYS_ADDED_COLS_DEFAULT table have not created yet!!!\n");
#endif

    mutex_exit(&(dict_sys->mutex));

    trx = trx_allocate_for_mysql();

    trx->op_info = "creating added cols default";

    row_mysql_lock_data_dictionary(trx);

    /* The SYS_ADDED_COLS_DEFAULT table has been created, but is not
    of the right form: drop it and then recreate it */
    if (table1) {
        fprintf(stderr,
                "InnoDB: dropping incompletely created"
                " SYS_ADDED_COLS_DEFAULT table\n");
        row_drop_table_for_mysql("SYS_ADDED_COLS_DEFAULT", trx, TRUE);
    }

    fprintf(stderr,
            "InnoDB: Creating sys_added_cols_default for fast add colums with default value\n");


    /*
        NOTE: We create the SYS_ADDED_COLS_DEFAULT table here:
        sys_added_cols_default(
        table_id    int,        // table id
        pos         int,        // column position
        def_val     varbinary,  // default value
        def_val_len int         // length of the default value
        )
    */

    error = que_eval_sql(NULL,
                         "PROCEDURE CREATE_ADDED_COLS_DEFAULT_SYS_TABLES_PROC () IS\n"
                         "BEGIN\n"
                         "CREATE TABLE\n"
                         "SYS_ADDED_COLS_DEFAULT(TABLE_ID BINARY(8), POS INT,"
                         " DEF_VAL BINARY(65535), DEF_VAL_LEN INT);\n"
                         "CREATE UNIQUE CLUSTERED INDEX TID_POS"
                         " ON SYS_ADDED_COLS_DEFAULT (TABLE_ID,POS);\n"
                         "END;\n"
                         , FALSE, trx);

    if (error != DB_SUCCESS) {
        fprintf(stderr, "InnoDB: error %lu in creation\n",
                (ulong) error);

        ut_a(error == DB_OUT_OF_FILE_SPACE
             || error == DB_TOO_MANY_CONCURRENT_TRXS);

        fprintf(stderr,
                "InnoDB: creation failed\n"
                "InnoDB: tablespace is full\n"
                "InnoDB: dropping incompletely created"
                " SYS_FOREIGN tables\n");

        row_drop_table_for_mysql("SYS_ADDED_COLS_DEFAULT", trx, TRUE);

        error = DB_MUST_GET_MORE_FILE_SPACE;
    }

    trx_commit_for_mysql(trx);

    row_mysql_unlock_data_dictionary(trx);

    trx_free_for_mysql(trx);

    if (error == DB_SUCCESS) {
        fprintf(stderr,
                "InnoDB: Added columns default system tables"
                " created\n");
    }

    return(error);
}
Example #13
0
/****************************************************************//**
Creates the foreign key constraints system tables inside InnoDB
at database creation or database start if they are not found or are
not of the right form.
@return	DB_SUCCESS or error code */
UNIV_INTERN
ulint
dict_create_or_check_foreign_constraint_tables(void)
/*================================================*/
{
    dict_table_t*	table1;
    dict_table_t*	table2;
    ulint		error;
    trx_t*		trx;

    mutex_enter(&(dict_sys->mutex));

    table1 = dict_table_get_low("SYS_FOREIGN");
    table2 = dict_table_get_low("SYS_FOREIGN_COLS");

    if (table1 && table2
            && UT_LIST_GET_LEN(table1->indexes) == 3
            && UT_LIST_GET_LEN(table2->indexes) == 1) {

        /* Foreign constraint system tables have already been
        created, and they are ok */

        mutex_exit(&(dict_sys->mutex));

        return(DB_SUCCESS);
    }

    mutex_exit(&(dict_sys->mutex));

    trx = trx_allocate_for_mysql();

    trx->op_info = "creating foreign key sys tables";

    row_mysql_lock_data_dictionary(trx);

    if (table1) {
        fprintf(stderr,
                "InnoDB: dropping incompletely created"
                " SYS_FOREIGN table\n");
        row_drop_table_for_mysql("SYS_FOREIGN", trx, TRUE);
    }

    if (table2) {
        fprintf(stderr,
                "InnoDB: dropping incompletely created"
                " SYS_FOREIGN_COLS table\n");
        row_drop_table_for_mysql("SYS_FOREIGN_COLS", trx, TRUE);
    }

    fprintf(stderr,
            "InnoDB: Creating foreign key constraint system tables\n");

    /* NOTE: in dict_load_foreigns we use the fact that
    there are 2 secondary indexes on SYS_FOREIGN, and they
    are defined just like below */

    /* NOTE: when designing InnoDB's foreign key support in 2001, we made
    an error and made the table names and the foreign key id of type
    'CHAR' (internally, really a VARCHAR). We should have made the type
    VARBINARY, like in other InnoDB system tables, to get a clean
    design. */

    error = que_eval_sql(NULL,
                         "PROCEDURE CREATE_FOREIGN_SYS_TABLES_PROC () IS\n"
                         "BEGIN\n"
                         "CREATE TABLE\n"
                         "SYS_FOREIGN(ID CHAR, FOR_NAME CHAR,"
                         " REF_NAME CHAR, N_COLS INT);\n"
                         "CREATE UNIQUE CLUSTERED INDEX ID_IND"
                         " ON SYS_FOREIGN (ID);\n"
                         "CREATE INDEX FOR_IND"
                         " ON SYS_FOREIGN (FOR_NAME);\n"
                         "CREATE INDEX REF_IND"
                         " ON SYS_FOREIGN (REF_NAME);\n"
                         "CREATE TABLE\n"
                         "SYS_FOREIGN_COLS(ID CHAR, POS INT,"
                         " FOR_COL_NAME CHAR, REF_COL_NAME CHAR);\n"
                         "CREATE UNIQUE CLUSTERED INDEX ID_IND"
                         " ON SYS_FOREIGN_COLS (ID, POS);\n"
                         "END;\n"
                         , FALSE, trx);

    if (error != DB_SUCCESS) {
        fprintf(stderr, "InnoDB: error %lu in creation\n",
                (ulong) error);

        ut_a(error == DB_OUT_OF_FILE_SPACE
             || error == DB_TOO_MANY_CONCURRENT_TRXS);

        fprintf(stderr,
                "InnoDB: creation failed\n"
                "InnoDB: tablespace is full\n"
                "InnoDB: dropping incompletely created"
                " SYS_FOREIGN tables\n");

        row_drop_table_for_mysql("SYS_FOREIGN", trx, TRUE);
        row_drop_table_for_mysql("SYS_FOREIGN_COLS", trx, TRUE);

        error = DB_MUST_GET_MORE_FILE_SPACE;
    }

    trx_commit_for_mysql(trx);

    row_mysql_unlock_data_dictionary(trx);

    trx_free_for_mysql(trx);

    if (error == DB_SUCCESS) {
        fprintf(stderr,
                "InnoDB: Foreign key constraint system tables"
                " created\n");
    }

    return(error);
}
Example #14
0
/***********************************************************//**
Creates an index. This is a high-level function used in SQL execution
graphs.
@return	query thread to run next or NULL */
UNIV_INTERN
que_thr_t*
dict_create_index_step(
    /*===================*/
    que_thr_t*	thr)	/*!< in: query thread */
{
    ind_node_t*	node;
    ulint		err	= DB_ERROR;
    trx_t*		trx;

    ut_ad(thr);
    ut_ad(mutex_own(&(dict_sys->mutex)));

    trx = thr_get_trx(thr);

    node = thr->run_node;

    ut_ad(que_node_get_type(node) == QUE_NODE_CREATE_INDEX);

    if (thr->prev_node == que_node_get_parent(node)) {
        node->state = INDEX_BUILD_INDEX_DEF;
    }

    if (node->state == INDEX_BUILD_INDEX_DEF) {
        /* DO THE CHECKS OF THE CONSISTENCY CONSTRAINTS HERE */
        err = dict_build_index_def_step(thr, node);

        if (err != DB_SUCCESS) {

            goto function_exit;
        }

        node->state = INDEX_BUILD_FIELD_DEF;
        node->field_no = 0;

        thr->run_node = node->ind_def;

        return(thr);
    }

    if (node->state == INDEX_BUILD_FIELD_DEF) {

        if (node->field_no < (node->index)->n_fields) {

            err = dict_build_field_def_step(node);

            if (err != DB_SUCCESS) {

                goto function_exit;
            }

            node->field_no++;

            thr->run_node = node->field_def;

            return(thr);
        } else {
            node->state = INDEX_ADD_TO_CACHE;
        }
    }

    if (node->state == INDEX_ADD_TO_CACHE) {

        index_id_t	index_id = node->index->id;

        err = dict_index_add_to_cache(
                  node->table, node->index, FIL_NULL,
                  trx_is_strict(trx)
                  || dict_table_get_format(node->table)
                  >= DICT_TF_FORMAT_ZIP);

        node->index = dict_index_get_if_in_cache_low(index_id);
        ut_a(!node->index == (err != DB_SUCCESS));

        if (err != DB_SUCCESS) {

            goto function_exit;
        }

        node->state = INDEX_CREATE_INDEX_TREE;
    }

    if (node->state == INDEX_CREATE_INDEX_TREE) {

        err = dict_create_index_tree_step(node);

        if (err != DB_SUCCESS) {
            dict_index_remove_from_cache(node->table, node->index);
            node->index = NULL;

            goto function_exit;
        }

        node->index->page = node->page_no;
        node->state = INDEX_COMMIT_WORK;
    }

    if (node->state == INDEX_COMMIT_WORK) {

        /* Index was correctly defined: do NOT commit the transaction
        (CREATE INDEX does NOT currently do an implicit commit of
        the current transaction) */

        node->state = INDEX_CREATE_INDEX_TREE;

        /* thr->run_node = node->commit_node;

        return(thr); */
    }

function_exit:
    trx->error_state = err;

    if (err == DB_SUCCESS) {
        /* Ok: do nothing */

    } else if (err == DB_LOCK_WAIT) {

        return(NULL);
    } else {
        /* SQL error detected */

        return(NULL);
    }

    thr->run_node = que_node_get_parent(node);

    return(thr);
}
Example #15
0
/****************************************************************//**
Creates a new thread of execution. The execution starts from
the function given. The start function takes a void* parameter
and returns an ulint.
@return	handle to the thread */
UNIV_INTERN
os_thread_t
os_thread_create(
/*=============*/
#ifndef __WIN__
	os_posix_f_t		start_f,
#else
	ulint (*start_f)(void*),		/*!< in: pointer to function
						from which to start */
#endif
	void*			arg,		/*!< in: argument to start
						function */
	os_thread_id_t*		thread_id)	/*!< out: id of the created
						thread, or NULL */
{
#ifdef __WIN__
	os_thread_t	thread;
	DWORD		win_thread_id;

	os_mutex_enter(os_sync_mutex);
	os_thread_count++;
	os_mutex_exit(os_sync_mutex);

	thread = CreateThread(NULL,	/* no security attributes */
			      0,	/* default size stack */
			      (LPTHREAD_START_ROUTINE)start_f,
			      arg,
			      0,	/* thread runs immediately */
			      &win_thread_id);

	if (srv_set_thread_priorities) {

		/* Set the created thread's priority to that of a normal query;
		we try to prevent starvation of threads by assigning the same
		priority QUERY_PRIOR to all */

		ut_a(SetThreadPriority(thread, srv_query_thread_priority));
	}

	if (thread_id) {
		*thread_id = win_thread_id;
	}

	return(thread);
#else
	int		ret;
	os_thread_t	pthread;
	pthread_attr_t	attr;

#ifndef UNIV_HPUX10
	pthread_attr_init(&attr);
#endif

#ifdef UNIV_AIX
	/* We must make sure a thread stack is at least 32 kB, otherwise
	InnoDB might crash; we do not know if the default stack size on
	AIX is always big enough. An empirical test on AIX-4.3 suggested
	the size was 96 kB, though. */

	ret = pthread_attr_setstacksize(&attr,
					(size_t)(PTHREAD_STACK_MIN
						 + 32 * 1024));
	if (ret) {
		srv_panic(ret,
			"InnoDB: Error: pthread_attr_setstacksize"
			" returned %d\n", ret);
	}
#endif
#ifdef __NETWARE__
	ret = pthread_attr_setstacksize(&attr,
					(size_t) NW_THD_STACKSIZE);
	if (ret) {
		srv_panic(ret,
			"InnoDB: Error: pthread_attr_setstacksize"
			" returned %d\n", ret);
	}
#endif
	os_mutex_enter(os_sync_mutex);
	os_thread_count++;
	os_mutex_exit(os_sync_mutex);

#ifdef UNIV_HPUX10
	ret = pthread_create(&pthread, pthread_attr_default, start_f, arg);
#else
	ret = pthread_create(&pthread, &attr, start_f, arg);
#endif
	if (ret) {
		srv_panic(ret,
			"InnoDB: Error: pthread_create returned %d\n", ret);
	}

#ifndef UNIV_HPUX10
	pthread_attr_destroy(&attr);
#endif
	if (srv_set_thread_priorities) {

#ifdef HAVE_PTHREAD_SETPRIO 
		pthread_setprio(pthread, srv_query_thread_priority);
#endif
	}

	if (thread_id) {
		*thread_id = pthread;
	}

	return(pthread);
#endif
}
Example #16
0
/*****************************************************************//**
Creates the file page for the dictionary header. This function is
called only at the database creation.
@return	TRUE if succeed */
static
ibool
dict_hdr_create(
/*============*/
	mtr_t*	mtr)	/*!< in: mtr */
{
	buf_block_t*	block;
	dict_hdr_t*	dict_header;
	ulint		root_page_no;

	ut_ad(mtr);

	/* Create the dictionary header file block in a new, allocated file
	segment in the system tablespace */
	block = fseg_create(DICT_HDR_SPACE, 0,
			    DICT_HDR + DICT_HDR_FSEG_HEADER, mtr);

	ut_a(DICT_HDR_PAGE_NO == buf_block_get_page_no(block));

	dict_header = dict_hdr_get(mtr);

	/* Start counting row, table, index, and tree ids from
	DICT_HDR_FIRST_ID */
	mlog_write_dulint(dict_header + DICT_HDR_ROW_ID,
			  ut_dulint_create(0, DICT_HDR_FIRST_ID), mtr);

	mlog_write_dulint(dict_header + DICT_HDR_TABLE_ID,
			  ut_dulint_create(0, DICT_HDR_FIRST_ID), mtr);

	mlog_write_dulint(dict_header + DICT_HDR_INDEX_ID,
			  ut_dulint_create(0, DICT_HDR_FIRST_ID), mtr);

	/* Obsolete, but we must initialize it to 0 anyway. */
	mlog_write_dulint(dict_header + DICT_HDR_MIX_ID,
			  ut_dulint_create(0, DICT_HDR_FIRST_ID), mtr);

	/* Create the B-tree roots for the clustered indexes of the basic
	system tables */

	/*--------------------------*/
	root_page_no = btr_create(DICT_CLUSTERED | DICT_UNIQUE,
				  DICT_HDR_SPACE, 0, DICT_TABLES_ID,
				  dict_ind_redundant, mtr);
	if (root_page_no == FIL_NULL) {

		return(FALSE);
	}

	mlog_write_ulint(dict_header + DICT_HDR_TABLES, root_page_no,
			 MLOG_4BYTES, mtr);
	/*--------------------------*/
	root_page_no = btr_create(DICT_UNIQUE, DICT_HDR_SPACE, 0,
				  DICT_TABLE_IDS_ID,
				  dict_ind_redundant, mtr);
	if (root_page_no == FIL_NULL) {

		return(FALSE);
	}

	mlog_write_ulint(dict_header + DICT_HDR_TABLE_IDS, root_page_no,
			 MLOG_4BYTES, mtr);
	/*--------------------------*/
	root_page_no = btr_create(DICT_CLUSTERED | DICT_UNIQUE,
				  DICT_HDR_SPACE, 0, DICT_COLUMNS_ID,
				  dict_ind_redundant, mtr);
	if (root_page_no == FIL_NULL) {

		return(FALSE);
	}

	mlog_write_ulint(dict_header + DICT_HDR_COLUMNS, root_page_no,
			 MLOG_4BYTES, mtr);
	/*--------------------------*/
	root_page_no = btr_create(DICT_CLUSTERED | DICT_UNIQUE,
				  DICT_HDR_SPACE, 0, DICT_INDEXES_ID,
				  dict_ind_redundant, mtr);
	if (root_page_no == FIL_NULL) {

		return(FALSE);
	}

	mlog_write_ulint(dict_header + DICT_HDR_INDEXES, root_page_no,
			 MLOG_4BYTES, mtr);
	/*--------------------------*/
	root_page_no = btr_create(DICT_CLUSTERED | DICT_UNIQUE,
				  DICT_HDR_SPACE, 0, DICT_FIELDS_ID,
				  dict_ind_redundant, mtr);
	if (root_page_no == FIL_NULL) {

		return(FALSE);
	}

	mlog_write_ulint(dict_header + DICT_HDR_FIELDS, root_page_no,
			 MLOG_4BYTES, mtr);
	/*--------------------------*/

	return(TRUE);
}
Example #17
0
/*********************************************************//**
Creates an event semaphore, i.e., a semaphore which may just have two
states: signaled and nonsignaled. The created event is manual reset: it
must be reset explicitly by calling os_event_reset().
@return	the event handle */
UNIV_INTERN
os_event_t
os_event_create(
/*============*/
	const char*	name)	/*!< in: the name of the event, if NULL
				the event is created without a name */
{
#ifdef __WIN__
	os_event_t event;

	event = ut_malloc(sizeof(struct os_event_struct));

	event->handle = CreateEvent(NULL, /* No security attributes */
				    TRUE, /* Manual reset */
				    FALSE, /* Initial state nonsignaled */
				    (LPCTSTR) name);
	if (!event->handle) {
		fprintf(stderr,
			"InnoDB: Could not create a Windows event semaphore;"
			" Windows error %lu\n",
			(ulong) GetLastError());
	}
#else /* Unix */
	os_event_t	event;

	UT_NOT_USED(name);

	event = ut_malloc(sizeof(struct os_event_struct));

	os_fast_mutex_init(&(event->os_mutex));

	ut_a(0 == pthread_cond_init(&(event->cond_var), NULL));

	event->is_set = FALSE;

	/* We return this value in os_event_reset(), which can then be
	passed to os_event_wait_low(). The value of zero
	is reserved in os_event_wait_low() for the case when the
	caller does not want to pass any signal_count value. To
	distinguish between the two cases we initialize signal_count
	to 1 here. */
	event->signal_count = 1;
#endif /* __WIN__ */

	/* The os_sync_mutex can be NULL because during startup an event
	can be created [ because it's embedded in the mutex/rwlock ] before
	this module has been initialized */
	if (os_sync_mutex != NULL) {
		os_mutex_enter(os_sync_mutex);
	}

	/* Put to the list of events */
	UT_LIST_ADD_FIRST(os_event_list, os_event_list, event);

	os_event_count++;

	if (os_sync_mutex != NULL) {
		os_mutex_exit(os_sync_mutex);
	}

	return(event);
}
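
A sketch of the reset-then-wait protocol that the signal_count initialization above supports. The call pattern mirrors sync_array_reserve_cell() in Example #9; the surrounding context here is illustrative only, and the integer type of the count is assumed to match event->signal_count.

	ib_int64_t	sig_count;

	/* Reset the event and remember the signal count at the reset. */
	sig_count = os_event_reset(event);

	/* ... re-check the condition we intend to wait for ... */

	/* Wait only if nobody has signalled the event since the reset;
	passing sig_count closes the race between the check and the wait. */
	os_event_wait_low(event, sig_count);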
Example #18
0
/*****************************************************************//**
Initializes the data dictionary memory structures when the database is
started. This function is also called when the data dictionary is created. */
UNIV_INTERN
void
dict_boot(void)
/*===========*/
{
	dict_table_t*	table;
	dict_index_t*	index;
	dict_hdr_t*	dict_hdr;
	mem_heap_t*	heap;
	mtr_t		mtr;
	ulint		error;

	mtr_start(&mtr);

	/* Create the hash tables etc. */
	dict_init();

	heap = mem_heap_create(450);

	mutex_enter(&(dict_sys->mutex));

	/* Get the dictionary header */
	dict_hdr = dict_hdr_get(&mtr);

	/* Because we only write new row ids to the disk-based data
	structure (dictionary header) when the id is divisible by
	DICT_HDR_ROW_ID_WRITE_MARGIN, in recovery we will not recover
	the latest value of the row id counter. Therefore we advance
	the counter at database startup to avoid overlapping values.
	Note that when a user first asks for a new row id after database
	startup, then because the counter is now divisible by
	..._MARGIN, it will immediately be written to the disk-based
	header. */
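
	/* Illustration with made-up numbers: if the header holds a row id
	of 1024 and DICT_HDR_ROW_ID_WRITE_MARGIN is 256, then ids up to
	1024 + 255 may already have been handed out since the last write
	to the header. Aligning 1024 up to the margin leaves 1024, and
	adding the margin restarts the counter at 1280, which is past any
	id possibly used before and is again divisible by the margin, so
	the very next allocation writes the new value back to the header. */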

	dict_sys->row_id = ut_dulint_add(
		ut_dulint_align_up(mtr_read_dulint(dict_hdr + DICT_HDR_ROW_ID,
						   &mtr),
				   DICT_HDR_ROW_ID_WRITE_MARGIN),
		DICT_HDR_ROW_ID_WRITE_MARGIN);

	/* Insert into the dictionary cache the descriptions of the basic
	system tables */
	/*-------------------------*/
	table = dict_mem_table_create("SYS_TABLES", DICT_HDR_SPACE, 8, 0);

	dict_mem_table_add_col(table, heap, "NAME", DATA_BINARY, 0, 0);
	dict_mem_table_add_col(table, heap, "ID", DATA_BINARY, 0, 0);
	/* ROW_FORMAT = (N_COLS >> 31) ? COMPACT : REDUNDANT */
	dict_mem_table_add_col(table, heap, "N_COLS", DATA_INT, 0, 4);
	/* TYPE is either DICT_TABLE_ORDINARY, or (TYPE & DICT_TF_COMPACT)
	and (TYPE & DICT_TF_FORMAT_MASK) are nonzero and TYPE = table->flags */
	dict_mem_table_add_col(table, heap, "TYPE", DATA_INT, 0, 4);
	dict_mem_table_add_col(table, heap, "MIX_ID", DATA_BINARY, 0, 0);
	dict_mem_table_add_col(table, heap, "MIX_LEN", DATA_INT, 0, 4);
	dict_mem_table_add_col(table, heap, "CLUSTER_NAME", DATA_BINARY, 0, 0);
	dict_mem_table_add_col(table, heap, "SPACE", DATA_INT, 0, 4);

	table->id = DICT_TABLES_ID;

	dict_table_add_to_cache(table, heap);
	dict_sys->sys_tables = table;
	mem_heap_empty(heap);

	index = dict_mem_index_create("SYS_TABLES", "CLUST_IND",
				      DICT_HDR_SPACE,
				      DICT_UNIQUE | DICT_CLUSTERED, 1);

	dict_mem_index_add_field(index, "NAME", 0);

	index->id = DICT_TABLES_ID;

	error = dict_index_add_to_cache(table, index,
					mtr_read_ulint(dict_hdr
						       + DICT_HDR_TABLES,
						       MLOG_4BYTES, &mtr),
					FALSE);
	ut_a(error == DB_SUCCESS);

	/*-------------------------*/
	index = dict_mem_index_create("SYS_TABLES", "ID_IND",
				      DICT_HDR_SPACE, DICT_UNIQUE, 1);
	dict_mem_index_add_field(index, "ID", 0);

	index->id = DICT_TABLE_IDS_ID;
	error = dict_index_add_to_cache(table, index,
					mtr_read_ulint(dict_hdr
						       + DICT_HDR_TABLE_IDS,
						       MLOG_4BYTES, &mtr),
					FALSE);
	ut_a(error == DB_SUCCESS);

	/*-------------------------*/
	table = dict_mem_table_create("SYS_COLUMNS", DICT_HDR_SPACE, 7, 0);

	dict_mem_table_add_col(table, heap, "TABLE_ID", DATA_BINARY, 0, 0);
	dict_mem_table_add_col(table, heap, "POS", DATA_INT, 0, 4);
	dict_mem_table_add_col(table, heap, "NAME", DATA_BINARY, 0, 0);
	dict_mem_table_add_col(table, heap, "MTYPE", DATA_INT, 0, 4);
	dict_mem_table_add_col(table, heap, "PRTYPE", DATA_INT, 0, 4);
	dict_mem_table_add_col(table, heap, "LEN", DATA_INT, 0, 4);
	dict_mem_table_add_col(table, heap, "PREC", DATA_INT, 0, 4);

	table->id = DICT_COLUMNS_ID;

	dict_table_add_to_cache(table, heap);
	dict_sys->sys_columns = table;
	mem_heap_empty(heap);

	index = dict_mem_index_create("SYS_COLUMNS", "CLUST_IND",
				      DICT_HDR_SPACE,
				      DICT_UNIQUE | DICT_CLUSTERED, 2);

	dict_mem_index_add_field(index, "TABLE_ID", 0);
	dict_mem_index_add_field(index, "POS", 0);

	index->id = DICT_COLUMNS_ID;
	error = dict_index_add_to_cache(table, index,
					mtr_read_ulint(dict_hdr
						       + DICT_HDR_COLUMNS,
						       MLOG_4BYTES, &mtr),
					FALSE);
	ut_a(error == DB_SUCCESS);

	/*-------------------------*/
	table = dict_mem_table_create("SYS_INDEXES", DICT_HDR_SPACE, 7, 0);

	dict_mem_table_add_col(table, heap, "TABLE_ID", DATA_BINARY, 0, 0);
	dict_mem_table_add_col(table, heap, "ID", DATA_BINARY, 0, 0);
	dict_mem_table_add_col(table, heap, "NAME", DATA_BINARY, 0, 0);
	dict_mem_table_add_col(table, heap, "N_FIELDS", DATA_INT, 0, 4);
	dict_mem_table_add_col(table, heap, "TYPE", DATA_INT, 0, 4);
	dict_mem_table_add_col(table, heap, "SPACE", DATA_INT, 0, 4);
	dict_mem_table_add_col(table, heap, "PAGE_NO", DATA_INT, 0, 4);

	/* The '+ 2' below comes from the 2 system fields */
#if DICT_SYS_INDEXES_PAGE_NO_FIELD != 6 + 2
#error "DICT_SYS_INDEXES_PAGE_NO_FIELD != 6 + 2"
#endif
#if DICT_SYS_INDEXES_SPACE_NO_FIELD != 5 + 2
#error "DICT_SYS_INDEXES_SPACE_NO_FIELD != 5 + 2"
#endif
#if DICT_SYS_INDEXES_TYPE_FIELD != 4 + 2
#error "DICT_SYS_INDEXES_TYPE_FIELD != 4 + 2"
#endif

	table->id = DICT_INDEXES_ID;
	dict_table_add_to_cache(table, heap);
	dict_sys->sys_indexes = table;
	mem_heap_empty(heap);

	index = dict_mem_index_create("SYS_INDEXES", "CLUST_IND",
				      DICT_HDR_SPACE,
				      DICT_UNIQUE | DICT_CLUSTERED, 2);

	dict_mem_index_add_field(index, "TABLE_ID", 0);
	dict_mem_index_add_field(index, "ID", 0);

	index->id = DICT_INDEXES_ID;
	error = dict_index_add_to_cache(table, index,
					mtr_read_ulint(dict_hdr
						       + DICT_HDR_INDEXES,
						       MLOG_4BYTES, &mtr),
					FALSE);
	ut_a(error == DB_SUCCESS);

	/*-------------------------*/
	table = dict_mem_table_create("SYS_FIELDS", DICT_HDR_SPACE, 3, 0);

	dict_mem_table_add_col(table, heap, "INDEX_ID", DATA_BINARY, 0, 0);
	dict_mem_table_add_col(table, heap, "POS", DATA_INT, 0, 4);
	dict_mem_table_add_col(table, heap, "COL_NAME", DATA_BINARY, 0, 0);

	table->id = DICT_FIELDS_ID;
	dict_table_add_to_cache(table, heap);
	dict_sys->sys_fields = table;
	mem_heap_free(heap);

	index = dict_mem_index_create("SYS_FIELDS", "CLUST_IND",
				      DICT_HDR_SPACE,
				      DICT_UNIQUE | DICT_CLUSTERED, 2);

	dict_mem_index_add_field(index, "INDEX_ID", 0);
	dict_mem_index_add_field(index, "POS", 0);

	index->id = DICT_FIELDS_ID;
	error = dict_index_add_to_cache(table, index,
					mtr_read_ulint(dict_hdr
						       + DICT_HDR_FIELDS,
						       MLOG_4BYTES, &mtr),
					FALSE);
	ut_a(error == DB_SUCCESS);

	mtr_commit(&mtr);
	/*-------------------------*/

	/* Initialize the insert buffer table and index for each tablespace */

	ibuf_init_at_db_start();

	/* Load definitions of other indexes on system tables */

	dict_load_sys_table(dict_sys->sys_tables);
	dict_load_sys_table(dict_sys->sys_columns);
	dict_load_sys_table(dict_sys->sys_indexes);
	dict_load_sys_table(dict_sys->sys_fields);

	mutex_exit(&(dict_sys->mutex));
}
Example #19
0
/***************************************************************
Updates the clustered index record. */
static
ulint
row_upd_clust_step(
/*===============*/
				/* out: DB_SUCCESS if operation successfully
				completed, DB_LOCK_WAIT in case of a lock wait,
				else error code */
	upd_node_t*	node,	/* in: row update node */
	que_thr_t*	thr)	/* in: query thread */
{
	dict_index_t*	index;
	btr_pcur_t*	pcur;
	ibool		success;
	ibool		check_ref;
	ulint		err;
	mtr_t*		mtr;
	mtr_t		mtr_buf;
	
	index = dict_table_get_first_index(node->table);

	check_ref = row_upd_index_is_referenced(index);

	pcur = node->pcur;

	/* We have to restore the cursor to its position */
	mtr = &mtr_buf;

	mtr_start(mtr);
	
	/* If the restoration does not succeed, then the same
	transaction has deleted the record on which the cursor was,
	and that is an SQL error. If the restoration succeeds, it may
	still be that the same transaction has successively deleted
	and inserted a record with the same ordering fields, but in
	that case we know that the transaction has at least an
	implicit x-lock on the record. */
	
	ut_a(pcur->rel_pos == BTR_PCUR_ON);

	success = btr_pcur_restore_position(BTR_MODIFY_LEAF, pcur, mtr);

	if (!success) {
		err = DB_RECORD_NOT_FOUND;

		mtr_commit(mtr);

		return(err);
	}

	/* If this is a row in SYS_INDEXES table of the data dictionary,
	then we have to free the file segments of the index tree associated
	with the index */

	if (ut_dulint_cmp(node->table->id, DICT_INDEXES_ID) == 0) {

		dict_drop_index_tree(btr_pcur_get_rec(pcur), mtr);

		mtr_commit(mtr);

		mtr_start(mtr);

		success = btr_pcur_restore_position(BTR_MODIFY_LEAF, pcur,
									mtr);
		if (!success) {
			err = DB_ERROR;

			mtr_commit(mtr);

			return(err);
		}
	} 

	if (!node->has_clust_rec_x_lock) {
		err = lock_clust_rec_modify_check_and_lock(0,
						btr_pcur_get_rec(pcur),
						index, thr);
		if (err != DB_SUCCESS) {
			mtr_commit(mtr);

			return(err);
		}
	}

	/* NOTE: the following function calls will also commit mtr */

	if (node->is_delete) {
		err = row_upd_del_mark_clust_rec(node, index, thr, check_ref,
									mtr);
		if (err != DB_SUCCESS) {

			return(err);
		}

		node->state = UPD_NODE_UPDATE_ALL_SEC;
		node->index = dict_table_get_next_index(index);

		return(err);
	}
	
	/* If the update is made for MySQL, we already have the update vector
	ready, else we have to do some evaluation: */
 
	if (!node->in_mysql_interface) {
		/* Copy the necessary columns from clust_rec and calculate the
		new values to set */

		row_upd_copy_columns(btr_pcur_get_rec(pcur),
					UT_LIST_GET_FIRST(node->columns));
		row_upd_eval_new_vals(node->update);
	}
		
	if (node->cmpl_info & UPD_NODE_NO_ORD_CHANGE) {

		err = row_upd_clust_rec(node, index, thr, mtr);

		return(err);
	}
	
	row_upd_store_row(node);

	if (row_upd_changes_ord_field_binary(node->row, index, node->update)) {

		/* The update causes an ordering field (an ordering field
		within the B-tree) of the clustered index record to change:
		perform the update by delete marking and inserting.

		TODO! What to do about the 'Halloween problem', where an
		update moves the record forward in the index so that it is
		updated again when the cursor arrives there? Solution: the
		read operation must check the undo number of the undo record
		when choosing records to update. MySQL currently solves the
		problem externally! */

		err = row_upd_clust_rec_by_insert(node, index, thr, check_ref,
									mtr);
		if (err != DB_SUCCESS) {

			return(err);
		}

		node->state = UPD_NODE_UPDATE_ALL_SEC;
	} else {
		err = row_upd_clust_rec(node, index, thr, mtr);

		if (err != DB_SUCCESS) {

			return(err);
		}

		node->state = UPD_NODE_UPDATE_SOME_SEC;
	}

	node->index = dict_table_get_next_index(index);

	return(err);
}
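The function above chooses between three update paths: plain delete marking, delete marking plus re-insert when an ordering field changes, and an in-place clustered record update. Below is a minimal standalone sketch of that dispatch; the enum and function names are hypothetical, not InnoDB identifiers.

/* Illustrative sketch only (not InnoDB code): the three-way dispatch that
row_upd_clust_step performs once the cursor has been positioned. */
enum upd_path { UPD_DELETE_MARK, UPD_DELETE_AND_INSERT, UPD_IN_PLACE };

static enum upd_path
choose_clust_update_path(int is_delete, int changes_ordering_field)
{
	if (is_delete) {
		/* delete: mark the clustered record, then all secondary indexes */
		return(UPD_DELETE_MARK);
	}
	if (changes_ordering_field) {
		/* ordering columns change: delete-mark the old record, insert a
		new one, then update all secondary indexes */
		return(UPD_DELETE_AND_INSERT);
	}
	/* otherwise update the record in place; only some secondary indexes
	may need maintenance */
	return(UPD_IN_PLACE);
}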
Beispiel #20
0
byte*
mlog_parse_nbytes(
/*==============*/
			/* out: parsed record end, NULL if not a complete
			record or a corrupt record */
	ulint	type,	/* in: log record type: MLOG_1BYTE, ... */
	byte*	ptr,	/* in: buffer */
	byte*	end_ptr,/* in: buffer end */
	byte*	page)	/* in: page where to apply the log record, or NULL */
{
	ulint	offset;
	ulint	val;
	dulint	dval;

	ut_a(type <= MLOG_8BYTES);

	if (end_ptr < ptr + 2) {

		return(NULL);
	}

	offset = mach_read_from_2(ptr);
	ptr += 2;
		
	if (offset >= UNIV_PAGE_SIZE) {
		recv_sys->found_corrupt_log = TRUE;

		return(NULL);
	}

	if (type == MLOG_8BYTES) {
		ptr = mach_dulint_parse_compressed(ptr, end_ptr, &dval);

		if (ptr == NULL) {

			return(NULL);
		}

		if (page) {
			mach_write_to_8(page + offset, dval);
		}

		return(ptr);
	}

	ptr = mach_parse_compressed(ptr, end_ptr, &val);

	if (ptr == NULL) {

		return(NULL);
	}

	if (type == MLOG_1BYTE) {
		if (val > 0xFFUL) {
			recv_sys->found_corrupt_log = TRUE;

			return(NULL);
		}
	} else if (type == MLOG_2BYTES) {
		if (val > 0xFFFFUL) {
			recv_sys->found_corrupt_log = TRUE;

			return(NULL);
		}
	} else {
		if (type != MLOG_4BYTES) {
			recv_sys->found_corrupt_log = TRUE;

			return(NULL);
		}
	}

	if (page) {
		if (type == MLOG_1BYTE) {
			mach_write_to_1(page + offset, val);
		} else if (type == MLOG_2BYTES) {
			mach_write_to_2(page + offset, val);
		} else {
			ut_a(type == MLOG_4BYTES);
			mach_write_to_4(page + offset, val);
		}
	}

	return(ptr);
}
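The parser above validates before it consumes: it requires two bytes for the page offset, rejects offsets at or past UNIV_PAGE_SIZE, and range-checks the value against the record type. The following is a small self-contained sketch of the same bounds-checked parsing, using plain big-endian reads instead of InnoDB's compressed encoding; all names are illustrative.

#include <stdint.h>
#include <stddef.h>

/* Standalone sketch (not the InnoDB routine): parse "2-byte offset +
4-byte value" from a buffer, returning the new read position or NULL if
the record is incomplete or the offset is out of range. page_size plays
the role of UNIV_PAGE_SIZE. */
static const uint8_t*
parse_offset_value(const uint8_t* ptr, const uint8_t* end,
		   size_t page_size, uint32_t* offset, uint32_t* val)
{
	if (end - ptr < 6) {
		return(NULL);		/* incomplete record */
	}

	*offset = (uint32_t) ptr[0] << 8 | ptr[1];
	ptr += 2;

	if (*offset >= page_size) {
		return(NULL);		/* corrupt record */
	}

	*val = (uint32_t) ptr[0] << 24 | (uint32_t) ptr[1] << 16
		| (uint32_t) ptr[2] << 8 | ptr[3];

	return(ptr + 4);
}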
Beispiel #21
0
ibool
buf_LRU_validate(void)
/*==================*/
{
	buf_block_t*	block;
	ulint		old_len;
	ulint		new_len;
	ulint		LRU_pos;

	ut_ad(buf_pool);
	mutex_enter(&(buf_pool->mutex));

	if (UT_LIST_GET_LEN(buf_pool->LRU) >= BUF_LRU_OLD_MIN_LEN) {

		ut_a(buf_pool->LRU_old);
		old_len = buf_pool->LRU_old_len;
		new_len = 3 * (UT_LIST_GET_LEN(buf_pool->LRU) / 8);
		ut_a(old_len >= new_len - BUF_LRU_OLD_TOLERANCE);
		ut_a(old_len <= new_len + BUF_LRU_OLD_TOLERANCE);
	}

	UT_LIST_VALIDATE(LRU, buf_block_t, buf_pool->LRU);

	block = UT_LIST_GET_FIRST(buf_pool->LRU);

	old_len = 0;

	while (block != NULL) {

		ut_a(block->state == BUF_BLOCK_FILE_PAGE);

		if (block->old) {
			old_len++;
		}

		if (buf_pool->LRU_old && (old_len == 1)) {
			ut_a(buf_pool->LRU_old == block);
		}

		LRU_pos	= block->LRU_position;

		block = UT_LIST_GET_NEXT(LRU, block);

		if (block) {
			/* If the following assert fails, it may
			not be an error: just the buf_pool clock
			has wrapped around */
			ut_a(LRU_pos >= block->LRU_position);
		}
	}

	if (buf_pool->LRU_old) {
		ut_a(buf_pool->LRU_old_len == old_len);
	}

	UT_LIST_VALIDATE(free, buf_block_t, buf_pool->free);

	block = UT_LIST_GET_FIRST(buf_pool->free);

	while (block != NULL) {
		ut_a(block->state == BUF_BLOCK_NOT_USED);

		block = UT_LIST_GET_NEXT(free, block);
	}

	mutex_exit(&(buf_pool->mutex));
	return(TRUE);
}
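A key invariant checked above is that the "old" sublist length stays within BUF_LRU_OLD_TOLERANCE of 3/8 of the total LRU length. The helper below restates that check in a standalone form, rearranged to avoid unsigned underflow; it is an illustrative sketch, not an InnoDB function.

/* Sketch of the old-sublist length invariant: old_len must stay within
the given tolerance of 3/8 of the LRU list length. */
static int
lru_old_len_ok(unsigned long lru_len, unsigned long old_len,
	       unsigned long tolerance)
{
	unsigned long	expected = 3 * (lru_len / 8);

	return(old_len + tolerance >= expected
	       && old_len <= expected + tolerance);
}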
Beispiel #22
0
byte*
mlog_open_and_write_index(
/*======================*/
				/* out: buffer, NULL if log mode
				MTR_LOG_NONE */
	mtr_t*		mtr,	/* in: mtr */
	byte*		rec,	/* in: index record or page */
	dict_index_t*	index,	/* in: record descriptor */
	byte		type,	/* in: log item type */
	ulint		size)	/* in: requested buffer size in bytes
				(if 0, calls mlog_close() and returns NULL) */
{
	byte*		log_ptr;
	const byte*	log_start;
	const byte*	log_end;

	ut_ad(!!page_rec_is_comp(rec) == index->table->comp);

	if (!page_rec_is_comp(rec)) {
		log_start = log_ptr = mlog_open(mtr, 11 + size);
		if (!log_ptr) {
			return(NULL); /* logging is disabled */
		}
		log_ptr = mlog_write_initial_log_record_fast(rec, type,
				log_ptr, mtr);
		log_end = log_ptr + 11 + size;
	} else {
		ulint	i;
		ulint	n	= dict_index_get_n_fields(index);
		/* total size needed */
		ulint	total	= 11 + size + (n + 2) * 2;
		ulint	alloc	= total;
		/* allocate at most DYN_ARRAY_DATA_SIZE at a time */
		if (alloc > DYN_ARRAY_DATA_SIZE) {
			alloc = DYN_ARRAY_DATA_SIZE;
		}
		log_start = log_ptr = mlog_open(mtr, alloc);
		if (!log_ptr) {
			return(NULL); /* logging is disabled */
		}
		log_end = log_ptr + alloc;
		log_ptr = mlog_write_initial_log_record_fast(rec, type,
				log_ptr, mtr);
		mach_write_to_2(log_ptr, n);
		log_ptr += 2;
		mach_write_to_2(log_ptr,
				dict_index_get_n_unique_in_tree(index));
		log_ptr += 2;
		for (i = 0; i < n; i++) {
			dict_field_t*	field;
			dtype_t*	type;
			ulint		len;
			field = dict_index_get_nth_field(index, i);
			type = dict_col_get_type(dict_field_get_col(field));
			len = field->fixed_len;
			ut_ad(len < 0x7fff);
			if (len == 0 && (dtype_get_len(type) > 255
				|| dtype_get_mtype(type) == DATA_BLOB)) {
				/* variable-length field
				with maximum length > 255 */
				len = 0x7fff;
			}
			if (dtype_get_prtype(type) & DATA_NOT_NULL) {
				len |= 0x8000;
			}
			if (log_ptr + 2 > log_end) {
				mlog_close(mtr, log_ptr);
				ut_a(total > (ulint) (log_ptr - log_start));
				total -= log_ptr - log_start;
				alloc = total;
				if (alloc > DYN_ARRAY_DATA_SIZE) {
					alloc = DYN_ARRAY_DATA_SIZE;
				}
				log_start = log_ptr = mlog_open(mtr, alloc);
				if (!log_ptr) {
					return(NULL); /* logging is disabled */
				}
				log_end = log_ptr + alloc;
			}
			mach_write_to_2(log_ptr, len);
			log_ptr += 2;
		}
	}
	if (size == 0) {
		mlog_close(mtr, log_ptr);
		log_ptr = NULL;
	} else if (log_ptr + size > log_end) {
		mlog_close(mtr, log_ptr);
		log_ptr = mlog_open(mtr, size);
	}
	return(log_ptr);
}
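In the compact-format branch above, the field lengths are written into a log buffer that is opened in chunks of at most DYN_ARRAY_DATA_SIZE bytes; whenever the next 2-byte item would not fit, the current chunk is closed and a new one is opened. A minimal standalone sketch of that chunked-append pattern follows; the struct and callback names are hypothetical.

#include <stdint.h>
#include <stddef.h>

#define CHUNK_SIZE 64

/* flush_chunk() stands in for the mlog_close()/mlog_open() pair. */
struct chunk_writer {
	uint8_t	buf[CHUNK_SIZE];
	size_t	used;
	void	(*flush_chunk)(const uint8_t* data, size_t len);
};

static void
chunk_write_2(struct chunk_writer* w, uint16_t v)
{
	if (w->used + 2 > sizeof(w->buf)) {
		w->flush_chunk(w->buf, w->used);	/* current chunk is full */
		w->used = 0;
	}
	w->buf[w->used++] = (uint8_t) (v >> 8);
	w->buf[w->used++] = (uint8_t) v;
}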
Beispiel #23
0
buf_block_t*
buf_LRU_get_free_block(void)
/*========================*/
				/* out: the free control block; also if AWE is
				used, it is guaranteed that the block has its
				page mapped to a frame when we return */
{
	buf_block_t*	block		= NULL;
	ibool		freed;
	ulint		n_iterations	= 1;
	ibool		mon_value_was	= FALSE;
	ibool		started_monitor	= FALSE;
loop:
	mutex_enter(&(buf_pool->mutex));

	if (!recv_recovery_on && UT_LIST_GET_LEN(buf_pool->free)
	    + UT_LIST_GET_LEN(buf_pool->LRU) < buf_pool->max_size / 20) {
		ut_print_timestamp(stderr);

		fprintf(stderr,
			"  InnoDB: ERROR: over 95 percent of the buffer pool"
			" is occupied by\n"
			"InnoDB: lock heaps or the adaptive hash index!"
			" Check that your\n"
			"InnoDB: transactions do not set too many row locks.\n"
			"InnoDB: Your buffer pool size is %lu MB."
			" Maybe you should make\n"
			"InnoDB: the buffer pool bigger?\n"
			"InnoDB: We intentionally generate a seg fault"
			" to print a stack trace\n"
			"InnoDB: on Linux!\n",
			(ulong) (buf_pool->curr_size
				 / (1024 * 1024 / UNIV_PAGE_SIZE)));

		ut_error;

	} else if (!recv_recovery_on && UT_LIST_GET_LEN(buf_pool->free)
		   + UT_LIST_GET_LEN(buf_pool->LRU) < buf_pool->max_size / 3) {

		if (!buf_lru_switched_on_innodb_mon) {

	   		/* Over 67 % of the buffer pool is occupied by lock
			heaps or the adaptive hash index. This may be a memory
			leak! */

			ut_print_timestamp(stderr);
			fprintf(stderr,
				"  InnoDB: WARNING: over 67 percent of"
				" the buffer pool is occupied by\n"
				"InnoDB: lock heaps or the adaptive"
				" hash index! Check that your\n"
				"InnoDB: transactions do not set too many"
				" row locks.\n"
				"InnoDB: Your buffer pool size is %lu MB."
				" Maybe you should make\n"
				"InnoDB: the buffer pool bigger?\n"
				"InnoDB: Starting the InnoDB Monitor to print"
				" diagnostics, including\n"
				"InnoDB: lock heap and hash index sizes.\n",
				(ulong) (buf_pool->curr_size
					 / (1024 * 1024 / UNIV_PAGE_SIZE)));

			buf_lru_switched_on_innodb_mon = TRUE;
			srv_print_innodb_monitor = TRUE;
			os_event_set(srv_lock_timeout_thread_event);
		}
	} else if (buf_lru_switched_on_innodb_mon) {

		/* Switch off the InnoDB Monitor; this is a simple way
		to stop the monitor if the situation becomes less urgent,
		but it may also surprise a user who switched the monitor
		on manually! */

		buf_lru_switched_on_innodb_mon = FALSE;
		srv_print_innodb_monitor = FALSE;
	}

	/* If there is a block in the free list, take it */
	if (UT_LIST_GET_LEN(buf_pool->free) > 0) {

		block = UT_LIST_GET_FIRST(buf_pool->free);
		ut_a(block->in_free_list);
		UT_LIST_REMOVE(free, buf_pool->free, block);
		block->in_free_list = FALSE;
		ut_a(block->state != BUF_BLOCK_FILE_PAGE);
		ut_a(!block->in_LRU_list);

		if (srv_use_awe) {
			if (block->frame) {
				/* Remove from the list of mapped pages */

				UT_LIST_REMOVE(awe_LRU_free_mapped,
					       buf_pool->awe_LRU_free_mapped,
					       block);
			} else {
				/* We map the page to a frame; second param
				FALSE below because we do not want it to be
				added to the awe_LRU_free_mapped list */

				buf_awe_map_page_to_frame(block, FALSE);
			}
		}

		mutex_enter(&block->mutex);

		block->state = BUF_BLOCK_READY_FOR_USE;
		UNIV_MEM_ALLOC(block->frame, UNIV_PAGE_SIZE);

		mutex_exit(&block->mutex);

		mutex_exit(&(buf_pool->mutex));

		if (started_monitor) {
			srv_print_innodb_monitor = mon_value_was;
		}

		return(block);
	}

	/* If no block was in the free list, search from the end of the LRU
	list and try to free a block there */

	mutex_exit(&(buf_pool->mutex));

	freed = buf_LRU_search_and_free_block(n_iterations);

	if (freed > 0) {
		goto loop;
	}

	if (n_iterations > 30) {
		ut_print_timestamp(stderr);
		fprintf(stderr,
			"InnoDB: Warning: difficult to find free blocks from\n"
			"InnoDB: the buffer pool (%lu search iterations)!"
			" Consider\n"
			"InnoDB: increasing the buffer pool size.\n"
			"InnoDB: It is also possible that"
			" in your Unix version\n"
			"InnoDB: fsync is very slow, or"
			" completely frozen inside\n"
			"InnoDB: the OS kernel. Then upgrading to"
			" a newer version\n"
			"InnoDB: of your operating system may help."
			" Look at the\n"
			"InnoDB: number of fsyncs in diagnostic info below.\n"
			"InnoDB: Pending flushes (fsync) log: %lu;"
			" buffer pool: %lu\n"
			"InnoDB: %lu OS file reads, %lu OS file writes,"
			" %lu OS fsyncs\n"
			"InnoDB: Starting InnoDB Monitor to print further\n"
			"InnoDB: diagnostics to the standard output.\n",
			(ulong) n_iterations,
			(ulong) fil_n_pending_log_flushes,
			(ulong) fil_n_pending_tablespace_flushes,
			(ulong) os_n_file_reads, (ulong) os_n_file_writes,
			(ulong) os_n_fsyncs);

		mon_value_was = srv_print_innodb_monitor;
		started_monitor = TRUE;
		srv_print_innodb_monitor = TRUE;
		os_event_set(srv_lock_timeout_thread_event);
	}

	/* No free block was found: try to flush the LRU list */

	buf_flush_free_margin();
	++srv_buf_pool_wait_free;

	os_aio_simulated_wake_handler_threads();

	mutex_enter(&(buf_pool->mutex));

	if (buf_pool->LRU_flush_ended > 0) {
		/* We have written pages in an LRU flush. To make the insert
		buffer more efficient, we try to move these pages to the free
		list. */

		mutex_exit(&(buf_pool->mutex));

		buf_LRU_try_free_flushed_blocks();
	} else {
		mutex_exit(&(buf_pool->mutex));
	}

	if (n_iterations > 10) {

		os_thread_sleep(500000);
	}

	n_iterations++;

	goto loop;
}
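The loop above follows a fixed escalation: take a block from the free list if one exists, otherwise try to evict from the LRU tail, otherwise trigger a flush and back off before retrying. The sketch below captures that control flow with placeholder callbacks; none of the names are InnoDB APIs.

#include <stddef.h>

/* Sketch of the retry loop in buf_LRU_get_free_block; the callbacks stand
in for the free-list take, the LRU-tail eviction, the background flush and
the sleep (all hypothetical). */
void*
get_free_block_sketch(void* (*take_free)(void),
		      int   (*evict_tail)(unsigned n_iter),
		      void  (*start_flush)(void),
		      void  (*backoff)(void))
{
	unsigned	n_iter = 1;

	for (;;) {
		void*	block = take_free();

		if (block != NULL) {
			return(block);	/* fast path: free list hit */
		}
		if (evict_tail(n_iter)) {
			continue;	/* a block was freed; retry the free list */
		}
		start_flush();		/* no luck: flush pages from the LRU tail */
		if (n_iter > 10) {
			backoff();	/* avoid spinning */
		}
		n_iter++;
	}
}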
Beispiel #24
0
/********************************************************************//**
Flush pages from flash cache.
@return	number of pages that have been flushed to the tablespace */
UNIV_INTERN
ulint	
fc_flush_to_disk(
/*==================*/
	ibool do_full_io)	/*!< in: whether to flush at full io capacity */
{
	ulint distance;
	byte* page;
	ulint ret;
	ulint space;
	ulint offset;
	ulint page_type;
	ulint i, j;
	ulint pos;
	ulint zip_size;
	ulint block_offset, byte_offset;
	ulint fc_size = fc_get_size();
	ulint fc_blk_size = fc_get_block_size_byte();
	ulint start_offset;
   	ulint data_size;
	fc_block_t *flush_block = NULL;
	ulint c_flush = 0;
    
	ut_ad(!mutex_own(&fc->mutex));
	ut_a(fc->flush_buf->free_pos == 0);

	/* step 1: get the number of blocks that need to be flushed to the tablespace */
	flash_cache_mutex_enter();

	distance = fc_get_distance();
	start_offset = fc->flush_off;
    
	if ( distance == 0 ) {
		flash_cache_mutex_exit();
		return 0;
	} else if ( recv_recovery_on ) {
		if ( distance < (( 1.0 * srv_flash_cache_write_cache_pct /100 ) * fc_size)) {
			fc->n_flush_cur = 0;
		} else if ( distance < ( ( 1.0*srv_flash_cache_do_full_io_pct /100 ) * fc_size)) {
			fc->n_flush_cur = ut_min(PCT_IO_FC(10), distance);
		} else {
			fc->n_flush_cur = ut_min(PCT_IO_FC(100), distance);
		}
	} else if ( distance < (( 1.0 * srv_flash_cache_write_cache_pct /100 ) * fc_size)
		&& !do_full_io ) {
		flash_cache_mutex_exit();
		return 0;
	} else if ( distance < (( 1.0 * srv_flash_cache_do_full_io_pct/100 ) * fc_size)
		&& !do_full_io ) {
		fc->n_flush_cur = PCT_IO_FC(srv_fc_write_cache_flush_pct);
	} else {
		ut_ad((distance > ( 1.0 * srv_flash_cache_do_full_io_pct/100 ) * fc_size) 
			|| do_full_io );
		fc->n_flush_cur = ut_min(PCT_IO_FC(srv_fc_full_flush_pct), distance);
	}

	flash_cache_mutex_exit();

	/* step 2: start to flush blocks using async io; set the block io_fix to IO_FIX_FLUSH */
	i = 0;
	while (i < fc->n_flush_cur) {
		ulint b_space;
		ulint b_offset;
		ulint raw_zip_size;
		ulint size;
		ulint fil_offset;
#ifdef UNIV_FLASH_CACHE_TRACE
		ulint is_v4_blk;
#endif
		byte* page_io;

		flash_cache_mutex_enter();
		pos = ( start_offset + i ) % fc_size;
		flush_block = fc_get_block(pos);

		if (flush_block == NULL) {
			i++;
			flash_cache_mutex_exit();
			continue;
		}

		/* we must take the mutex, as doublewrite may hit this block and invalidate it */
		flash_block_mutex_enter(flush_block->fil_offset);

		flash_cache_mutex_exit();
		
		data_size = fc_block_get_data_size(flush_block);

		if (flush_block->state != BLOCK_READY_FOR_FLUSH) {
			/* if read only, merged by a write, or already flushed */
			ut_a (flush_block->state == BLOCK_NOT_USED
				|| flush_block->state == BLOCK_READ_CACHE
				|| flush_block->state == BLOCK_FLUSHED);
			
			i += data_size;

			flash_block_mutex_exit(flush_block->fil_offset);
			if (flush_block->state == BLOCK_NOT_USED) {
				//fc_block_detach(FALSE, flush_block);
				fc_block_free(flush_block);
			}
			
			continue;
		}

		zip_size = fil_space_get_zip_size(flush_block->space);
		if (zip_size == ULINT_UNDEFINED) {
			/* the table has been dropped, just set it to BLOCK_FLUSHED */
#ifdef UNIV_FLASH_CACHE_TRACE
			ut_print_timestamp(fc->f_debug);
			fprintf(fc->f_debug, "space:%lu is dropped, the page (%lu, %lu) need not be flushed.\n",
			(ulong)flush_block->space, (ulong)flush_block->space, (ulong)flush_block->offset);
#endif
			flush_block->state = BLOCK_FLUSHED;
			i += data_size;
			c_flush += data_size;
			flash_block_mutex_exit(flush_block->fil_offset);
			continue;
		}

#ifdef UNIV_FLASH_CACHE_TRACE
		if (flush_block->state != BLOCK_READY_FOR_FLUSH) {
			fc_block_print(flush_block);
			ut_error;
		}
#endif

		flush_block->io_fix |= IO_FIX_FLUSH;

		/*
		 * We must set the block state to BLOCK_FLUSHED now. If we did not,
		 * doublewrite could hit this block, invalidate it and reduce the
		 * dirty count; when we finish the flush we would reduce the dirty
		 * count again, so it would be reduced twice.
		 */
		flush_block->state = BLOCK_FLUSHED;
		
		/* save the block info, as the block may be invalidated by doublewrite after we release the mutex */
		b_space = flush_block->space;
		b_offset = flush_block->offset;

		raw_zip_size = flush_block->raw_zip_size;
		size = flush_block->size;
		fil_offset = flush_block->fil_offset;
#ifdef UNIV_FLASH_CACHE_TRACE
		is_v4_blk = flush_block->is_v4_blk;
#endif
		/* release the block mutex now, so reads can hit this block and read the data */
		flash_block_mutex_exit(flush_block->fil_offset);
		
		/*
		 * Only the flush thread updates read_buf and flush_off/round.
		 * Since there is only a single flush thread, there is no need
		 * to lock read_buf.
		 */
		page = fc->flush_buf->buf + fc->flush_buf->free_pos * fc_blk_size;

		if (raw_zip_size > 0) {
			ut_a((size * fc_blk_size) == UNIV_PAGE_SIZE);
			page_io = fc->flush_zip_read_buf;
		} else {
			page_io = page;
		}

		fc_io_offset(fil_offset, &block_offset, &byte_offset);
		ret = fil_io(OS_FILE_READ, TRUE, FLASH_CACHE_SPACE, 0,
				block_offset, byte_offset, data_size * fc_blk_size,
				page_io, NULL);
	
		if (ret != DB_SUCCESS) {
			ut_print_timestamp(stderr);
			fprintf(stderr, " InnoDB: Flash cache [Error]: unable to read page from flash cache.\n"
				"flash cache flush offset is:%lu.\n", (ulong)(start_offset + i));
			ut_error;
		}		

		if ((flush_block != NULL) && (flush_block->state == BLOCK_NOT_USED)) {
			goto skip;
		}

		/* decompress the compress data */
		if (raw_zip_size > 0) {
#ifdef UNIV_FLASH_CACHE_TRACE
			ulint blk_zip_size_byte;
			if (is_v4_blk) {
				blk_zip_size_byte = raw_zip_size * fc_get_block_size_byte();
			} else {
				blk_zip_size_byte = fc_block_compress_align(raw_zip_size) * fc_get_block_size_byte();
				ut_a((ulint)mach_read_from_4(page_io + FC_ZIP_PAGE_ZIP_RAW_SIZE) == raw_zip_size);				
			} 

			ut_a(page_io);
			ut_a(page);
			ut_a((ulint)mach_read_from_4(page_io + FC_ZIP_PAGE_HEADER) == FC_ZIP_PAGE_CHECKSUM);
			ut_a((ulint)mach_read_from_4(page_io + blk_zip_size_byte - FC_ZIP_PAGE_TAILER)
				== FC_ZIP_PAGE_CHECKSUM);	
			ut_a((ulint)mach_read_from_4(page_io + FC_ZIP_PAGE_SIZE) == blk_zip_size_byte);
			ut_a((ulint)mach_read_from_4(page_io + FC_ZIP_PAGE_ORIG_SIZE) == UNIV_PAGE_SIZE);		
			ut_a((ulint)mach_read_from_4(page_io + FC_ZIP_PAGE_SPACE) == b_space);
			ut_a((ulint)mach_read_from_4(page_io + FC_ZIP_PAGE_OFFSET) == b_offset);	

			/* only qlz can do this check  */
			if (srv_flash_cache_compress_algorithm == FC_BLOCK_COMPRESS_QUICKLZ) {
				if (is_v4_blk) {
					ut_a(raw_zip_size * fc_get_block_size_byte()
						>= (ulint)fc_qlz_size_compressed((const char *)(page_io + FC_ZIP_PAGE_DATA)));
				} else {
					ut_a(raw_zip_size 
						== (ulint)fc_qlz_size_compressed((const char *)(page_io + FC_ZIP_PAGE_DATA)));
				}
				
				ut_a(UNIV_PAGE_SIZE == fc_qlz_size_decompressed((const char *)(page_io + FC_ZIP_PAGE_DATA)));
			}
#endif
			fc_block_do_decompress(DECOMPRESS_FLUSH, page_io, raw_zip_size, page);
		}

		space = mach_read_from_4(page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
		offset = mach_read_from_4(page + FIL_PAGE_OFFSET);

		if ((space != b_space) || (offset != b_offset)) {
			ut_print_timestamp(stderr); 
			fc_block_print(flush_block);
			ut_error;
		}

		if (buf_page_is_corrupted(page, zip_size)) {
			buf_page_print(page, zip_size, BUF_PAGE_PRINT_NO_CRASH);
			ut_error;
		}		
		
		page_type = fil_page_get_type(page);
		if (page_type == FIL_PAGE_INDEX) {
			page_type = 1;
		}
		srv_flash_cache_flush_detail[page_type]++;
		
		ret = fil_io(OS_FILE_WRITE | OS_AIO_SIMULATED_WAKE_LATER, FALSE, space, 
				zip_size, offset, 0, zip_size ? zip_size : UNIV_PAGE_SIZE, page, NULL);
		if (ret != DB_SUCCESS && ret != DB_TABLESPACE_DELETED) {
			ut_print_timestamp(stderr); 
			fc_block_print(flush_block);
			ut_error;
		}

		/* add UNIV_PAGE_SIZE / fc_blk_size to be safe */
		fc->flush_buf->free_pos += UNIV_PAGE_SIZE / fc_blk_size;	

skip:
		i += data_size;
		c_flush += data_size;	

		if ((fc->flush_buf->free_pos + UNIV_PAGE_SIZE / fc_blk_size) >= fc->flush_buf->size) {
			/* FIXME: is it safe to change n_flush, as step 3 will use n_flush */
			fc->n_flush_cur = i;
			break;
		}	
	}

	/* ok, now flush all async io to disk */
	fc_flush_sync_dbfile();

	/* step 3: all the flushed blocks have been synced to disk; update the state and io_fix */
	j = 0;
	while (j < fc->n_flush_cur) {

		flash_cache_mutex_enter();
		pos = (start_offset + j) % fc_size;
		flush_block = fc_get_block(pos);

		if (flush_block  == NULL) {
			j++;
			flash_cache_mutex_exit();
			continue;
		}
		/* the block state and io_fix may have been changed by doublewrite or an LRU move */
		flash_block_mutex_enter(flush_block->fil_offset);
		flash_cache_mutex_exit();
		if (flush_block->io_fix & IO_FIX_FLUSH) {
			/* the block is already in BLOCK_FLUSHED state */
			flush_block->io_fix &= ~IO_FIX_FLUSH;
		} 
		
		data_size = fc_block_get_data_size(flush_block);
		flash_block_mutex_exit(flush_block->fil_offset);	
		
		j += data_size;
	}

	
	/*
	 * i and j may differ: the last flushed block may have been invalidated
	 * by doublewrite, so i may be greater than j.
	 */
	
	/* add the number of actually flushed blocks */
	srv_flash_cache_flush = srv_flash_cache_flush + c_flush; 

	/* step 4: update fc status and flush_off, and wake up threads that are sleeping for space */
	if (i > 0) {
		ut_a(i >= c_flush);

		flash_cache_mutex_enter();
		
		/*
		 * It is safe to increment the flush offset and subtract the dirty
		 * blocks at this time, as fc_validate is not running.
		 */
		fc_inc_flush_off(i);
		flash_cache_log_mutex_enter();
		fc_log->current_stat->flush_offset = fc->flush_off;
		fc_log->current_stat->flush_round = fc->flush_round;	
		flash_cache_log_mutex_exit();		
		
		ut_a(srv_flash_cache_dirty >= c_flush);		
		srv_flash_cache_dirty -= c_flush;
		
		srv_fc_flush_should_commit_log_flush++;
		os_event_set(fc->wait_space_event);	

		fc->n_flush_cur = 0;
		
		flash_cache_mutex_exit();		
	}

	fc->flush_buf->free_pos = 0;
 
	return c_flush;
}
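Step 1 above sizes the flush batch from how far the write point has run ahead of the flush point ("distance") relative to the cache size and the two percentage thresholds. The following is a standalone sketch of that sizing rule; the io-capacity arguments stand in for PCT_IO_FC(...) and all names are illustrative.

/* Sketch of the batch-sizing decision: do nothing below the write-cache
threshold, flush a moderate batch between the two thresholds, and flush
aggressively above the full-io threshold; never flush more than distance. */
static unsigned long
flush_batch_size(unsigned long distance, unsigned long cache_size,
		 unsigned write_cache_pct, unsigned full_io_pct,
		 unsigned long moderate_io, unsigned long full_io)
{
	if (distance * 100 < (unsigned long) write_cache_pct * cache_size) {
		return(0);		/* not enough dirty blocks yet */
	}
	if (distance * 100 < (unsigned long) full_io_pct * cache_size) {
		return(moderate_io < distance ? moderate_io : distance);
	}
	return(full_io < distance ? full_io : distance);
}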
Beispiel #25
0
/**********************************************************************
When doing a DROP TABLE/DISCARD TABLESPACE we have to drop all page
hash index entries belonging to that table. This function tries to
do that in batch. Note that this is a 'best effort' attempt and does
not guarantee that ALL hash entries will be removed. */
static
void
buf_LRU_drop_page_hash_for_tablespace(
/*==================================*/
	ulint	id)	/* in: space id */
{
	buf_block_t*	block;
	ulint*		page_arr;
	ulint		num_entries;

	page_arr = ut_malloc(sizeof(ulint)
			     * BUF_LRU_DROP_SEARCH_HASH_SIZE);
	mutex_enter(&buf_pool->mutex);

scan_again:
	num_entries = 0;
	block = UT_LIST_GET_LAST(buf_pool->LRU);

	while (block != NULL) {
		buf_block_t*	prev_block;

		mutex_enter(&block->mutex);
		prev_block = UT_LIST_GET_PREV(LRU, block);

		ut_a(block->state == BUF_BLOCK_FILE_PAGE);

		if (block->space != id
		    || block->buf_fix_count > 0
		    || block->io_fix != 0) {
			/* We leave the fixed pages as is in this scan.
			To be dealt with later in the final scan. */
			mutex_exit(&block->mutex);
			goto next_page;
		}

		ut_ad(block->space == id);
		if (block->is_hashed) {

			/* Store the offset(i.e.: page_no) in the array
			so that we can drop hash index in a batch
			later. */
			page_arr[num_entries] = block->offset;
			mutex_exit(&block->mutex);
			ut_a(num_entries < BUF_LRU_DROP_SEARCH_HASH_SIZE);
			++num_entries;

			if (num_entries < BUF_LRU_DROP_SEARCH_HASH_SIZE) {
				goto next_page;
			}
			/* Array full. We release the buf_pool->mutex to
			obey the latching order. */
			mutex_exit(&buf_pool->mutex);

			buf_LRU_drop_page_hash_batch(id, page_arr,
						     num_entries);
			num_entries = 0;
			mutex_enter(&buf_pool->mutex);
		} else {
			mutex_exit(&block->mutex);
		}

next_page:
		/* Note that we may have released the buf_pool->mutex
		above after reading the prev_block during processing
		of a page_hash_batch (i.e.: when the array was full).
		This means that prev_block can change in LRU list.
		This is OK because this function is a 'best effort'
		to drop as many search hash entries as possible and
		it does not guarantee that ALL such entries will be
		dropped. */
		block = prev_block;

		/* If, however, block has been removed from LRU list
		to the free list then we should restart the scan.
		block->state is protected by buf_pool->mutex. */
		if (block && block->state != BUF_BLOCK_FILE_PAGE) {
			ut_a(num_entries == 0);
			goto scan_again;
		}
	}

	mutex_exit(&buf_pool->mutex);

	/* Drop any remaining batch of search hashed pages. */
	buf_LRU_drop_page_hash_batch(id, page_arr, num_entries);
	ut_free(page_arr);
}
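The scan above accumulates page numbers in a fixed-size array and drops the corresponding hash entries in batches of BUF_LRU_DROP_SEARCH_HASH_SIZE, plus a final partial batch after the scan. Below is a self-contained sketch of that batching pattern; the names are illustrative and the callback stands in for buf_LRU_drop_page_hash_batch().

#include <stddef.h>

#define BATCH_SIZE 1024

/* Collect items into a fixed array and hand them to process_batch()
whenever it fills, then once more for the remainder. */
static void
batch_items(const unsigned long* items, size_t n_items,
	    void (*process_batch)(const unsigned long*, size_t))
{
	unsigned long	batch[BATCH_SIZE];
	size_t		n = 0;
	size_t		i;

	for (i = 0; i < n_items; i++) {
		batch[n++] = items[i];

		if (n == BATCH_SIZE) {
			process_batch(batch, n);
			n = 0;
		}
	}

	process_batch(batch, n);	/* remaining partial batch */
}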
Beispiel #26
0
/********************************************************************//**
Issues read requests for pages which the ibuf module wants to read in, in
order to contract the insert buffer tree. Technically, this function is like
a read-ahead function. */
UNIV_INTERN
void
buf_read_ibuf_merge_pages(
/*======================*/
	ibool		sync,		/*!< in: TRUE if the caller
					wants this function to wait
					for the highest address page
					to get read in, before this
					function returns */
	const ulint*	space_ids,	/*!< in: array of space ids */
	const ib_int64_t* space_versions,/*!< in: the spaces must have
					this version number
					(timestamp), otherwise we
					discard the read; we use this
					to cancel reads if DISCARD +
					IMPORT may have changed the
					tablespace size */
	const ulint*	page_nos,	/*!< in: array of page numbers
					to read, with the highest page
					number the last in the
					array */
	ulint		n_stored)	/*!< in: number of elements
					in the arrays */
{
	ulint	i;

	ut_ad(!ibuf_inside());
#ifdef UNIV_IBUF_DEBUG
	ut_a(n_stored < UNIV_PAGE_SIZE);
#endif

	for (i = 0; i < n_stored; i++) {
		ulint		err;
		buf_pool_t*	buf_pool;
		ulint		zip_size = fil_space_get_zip_size(space_ids[i]);

		buf_pool = buf_pool_get(space_ids[i], space_versions[i]);

		while (buf_pool->n_pend_reads
		       > buf_pool->curr_size / BUF_READ_AHEAD_PEND_LIMIT) {
			os_thread_sleep(500000);
		}

		if (UNIV_UNLIKELY(zip_size == ULINT_UNDEFINED)) {

			goto tablespace_deleted;
		}

		buf_read_page_low(&err, sync && (i + 1 == n_stored),
				  BUF_READ_ANY_PAGE, space_ids[i],
				  zip_size, TRUE, space_versions[i],
				  page_nos[i]);

		if (UNIV_UNLIKELY(err == DB_TABLESPACE_DELETED)) {
tablespace_deleted:
			/* We have deleted or are deleting the single-table
			tablespace: remove the entries for that page */

			ibuf_merge_or_delete_for_page(NULL, space_ids[i],
						      page_nos[i],
						      zip_size, FALSE);
		}
	}

	os_aio_simulated_wake_handler_threads();

	/* Flush pages from the end of all the LRU lists if necessary */
	buf_flush_free_margins();

#ifdef UNIV_DEBUG
	if (buf_debug_prints) {
		fprintf(stderr,
			"Ibuf merge read-ahead space %lu pages %lu\n",
			(ulong) space_ids[0], (ulong) n_stored);
	}
#endif /* UNIV_DEBUG */
}
Beispiel #27
0
enum db_err
ib_trx_lock_table_with_retry(
/*=========================*/
	trx_t*		trx,		/*!< in/out: transaction */
	dict_table_t*	table,		/*!< in: table to lock */
	enum lock_mode	mode)		/*!< in: LOCK_X or LOCK_S */
{
	que_thr_t*	thr;
	enum db_err	err;
	mem_heap_t*	heap;
	sel_node_t*	node;

	ut_ad(trx->client_thread_id == os_thread_get_curr_id());

	heap = mem_heap_create(512);

	trx->op_info = "setting table lock";

	node = sel_node_create(heap);
	thr = pars_complete_graph_for_exec(node, trx, heap);
	thr->graph->state = QUE_FORK_ACTIVE;

	/* We use the select query graph as the dummy graph needed
	in the lock module call */

	thr = que_fork_get_first_thr(que_node_get_parent(thr));
	que_thr_move_to_run_state(thr);

run_again:
	thr->run_node = thr;
	thr->prev_node = thr->common.parent;

	err = lock_table(0, table, mode, thr);

	trx->error_state = err;

	if (UNIV_LIKELY(err == DB_SUCCESS)) {
		que_thr_stop_for_client_no_error(thr, trx);
	} else {
		que_thr_stop_client(thr);

		if (err != DB_QUE_THR_SUSPENDED) {
			ibool	was_lock_wait;

			was_lock_wait = ib_handle_errors(&err, trx, thr, NULL);

			if (was_lock_wait) {
				goto run_again;
			}
		} else {
			que_thr_t*	run_thr;
			que_node_t*	parent;

			parent = que_node_get_parent(thr);
			run_thr = que_fork_start_command(parent);

			ut_a(run_thr == thr);

			/* There was a lock wait but the thread was not
			in a ready to run or running state. */
			trx->error_state = DB_LOCK_WAIT;

			goto run_again;
		}
	}

	que_graph_free(thr->graph);
	trx->op_info = "";

	return(err);
}
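The retry loop above re-runs lock_table() whenever ib_handle_errors() reports that the failure was only a lock wait that has since been resolved. A compact standalone sketch of that contract, with placeholder callbacks rather than the embedded-InnoDB API:

/* Keep re-issuing the operation while the error handler classifies the
failure as a resolved lock wait; otherwise propagate the error. */
int
lock_with_retry_sketch(int (*do_lock)(void),
		       int (*handle_error)(int err))	/* nonzero = retry */
{
	int	err;

	for (;;) {
		err = do_lock();

		if (err == 0) {
			return(0);	/* success */
		}
		if (!handle_error(err)) {
			return(err);	/* give up; propagate the error */
		}
		/* the wait ended (or a rollback happened); try again */
	}
}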
Beispiel #28
0
/********************************************************************//**
Low-level function which reads a page asynchronously from a file to the
buffer buf_pool if it is not already there, in which case does nothing.
Sets the io_fix flag and sets an exclusive lock on the buffer frame. The
flag is cleared and the x-lock released by an i/o-handler thread.
@return 1 if a read request was queued, 0 if the page already resided
in buf_pool, or if the page is in the doublewrite buffer blocks in
which case it is never read into the pool, or if the tablespace does
not exist or is being dropped */
static
ulint
buf_read_page_low(
/*==============*/
	ulint*	err,	/*!< out: DB_SUCCESS or DB_TABLESPACE_DELETED if we are
			trying to read from a non-existent tablespace, or a
			tablespace which is just now being dropped */
	ibool	sync,	/*!< in: TRUE if synchronous aio is desired */
	ulint	mode,	/*!< in: BUF_READ_IBUF_PAGES_ONLY, ...,
			ORed to OS_AIO_SIMULATED_WAKE_LATER (see below
			at read-ahead functions) */
	ulint	space,	/*!< in: space id */
	ulint	zip_size,/*!< in: compressed page size, or 0 */
	ibool	unzip,	/*!< in: TRUE=request uncompressed page */
	ib_int64_t tablespace_version, /*!< in: if the space memory object has
			this timestamp different from what we are giving here,
			treat the tablespace as dropped; this is a timestamp we
			use to stop dangling page reads from a tablespace
			which we have DISCARDed + IMPORTed back */
	ulint	offset)	/*!< in: page number */
{
	buf_page_t*	bpage;
	ulint		wake_later;

	*err = DB_SUCCESS;

	wake_later = mode & OS_AIO_SIMULATED_WAKE_LATER;
	mode = mode & ~OS_AIO_SIMULATED_WAKE_LATER;

	if (trx_doublewrite && space == TRX_SYS_SPACE
	    && (   (offset >= trx_doublewrite->block1
		    && offset < trx_doublewrite->block1
		    + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE)
		   || (offset >= trx_doublewrite->block2
		       && offset < trx_doublewrite->block2
		       + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE))) {
		ut_print_timestamp(stderr);
		fprintf(stderr,
			"  InnoDB: Warning: trying to read"
			" doublewrite buffer page %lu\n",
			(ulong) offset);

		return(0);
	}

	if (ibuf_bitmap_page(zip_size, offset)
	    || trx_sys_hdr_page(space, offset)) {

		/* Trx sys header is so low in the latching order that we play
		safe and do not leave the i/o-completion to an asynchronous
		i/o-thread. Ibuf bitmap pages must always be read with
		synchronous i/o, to make sure they do not get involved in
		thread deadlocks. */

		sync = TRUE;
	}

	/* The following call will also check if the tablespace does not exist
	or is being dropped; if we succeed in initing the page in the buffer
	pool for read, then DISCARD cannot proceed until the read has
	completed */
	bpage = buf_page_init_for_read(err, mode, space, zip_size, unzip,
				       tablespace_version, offset);
	if (bpage == NULL) {

		return(0);
	}

#ifdef UNIV_DEBUG
	if (buf_debug_prints) {
		fprintf(stderr,
			"Posting read request for page %lu, sync %lu\n",
			(ulong) offset,
			(ulong) sync);
	}
#endif

	ut_ad(buf_page_in_file(bpage));

	if (zip_size) {
		*err = fil_io(OS_FILE_READ | wake_later,
			      sync, space, zip_size, offset, 0, zip_size,
			      bpage->zip.data, bpage);
	} else {
		ut_a(buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE);

		*err = fil_io(OS_FILE_READ | wake_later,
			      sync, space, 0, offset, 0, UNIV_PAGE_SIZE,
			      ((buf_block_t*) bpage)->frame, bpage);
	}
	ut_a(*err == DB_SUCCESS);

	if (sync) {
		/* The i/o is already completed when we arrive from
		fil_read */
		buf_page_io_complete(bpage);
	}

	return(1);
}
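The first check above rejects reads whose page number falls inside either of the two doublewrite block ranges in the system tablespace. A tiny standalone sketch of that range test; block_size stands in for TRX_SYS_DOUBLEWRITE_BLOCK_SIZE and the names are illustrative.

/* Return nonzero if the page offset lands in either doublewrite block. */
static int
in_doublewrite_area(unsigned long offset, unsigned long block1,
		    unsigned long block2, unsigned long block_size)
{
	return((offset >= block1 && offset < block1 + block_size)
	       || (offset >= block2 && offset < block2 + block_size));
}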
Beispiel #29
0
/**********************************************************************//**
Reports in the undo log of an update or delete marking of a clustered index
record.
@return byte offset of the inserted undo log entry on the page if
succeed, 0 if fail */
static
ulint
trx_undo_page_report_modify(
/*========================*/
	page_t*		undo_page,	/*!< in: undo log page */
	trx_t*		trx,		/*!< in: transaction */
	dict_index_t*	index,		/*!< in: clustered index where update or
					delete marking is done */
	const rec_t*	rec,		/*!< in: clustered index record which
					has NOT yet been modified */
	const ulint*	offsets,	/*!< in: rec_get_offsets(rec, index) */
	const upd_t*	update,		/*!< in: update vector which tells the
					columns to be updated; in the case of
					a delete, this should be set to NULL */
	ulint		cmpl_info,	/*!< in: compiler info on secondary
					index updates */
	mtr_t*		mtr)		/*!< in: mtr */
{
	dict_table_t*	table;
	ulint		first_free;
	byte*		ptr;
	const byte*	field;
	ulint		flen;
	ulint		col_no;
	ulint		type_cmpl;
	byte*		type_cmpl_ptr;
	ulint		i;
	trx_id_t	trx_id;
	ibool		ignore_prefix = FALSE;
	byte		ext_buf[REC_MAX_INDEX_COL_LEN
				+ BTR_EXTERN_FIELD_REF_SIZE];

	ut_a(dict_index_is_clust(index));
	ut_ad(rec_offs_validate(rec, index, offsets));
	ut_ad(mach_read_from_2(undo_page + TRX_UNDO_PAGE_HDR
			       + TRX_UNDO_PAGE_TYPE) == TRX_UNDO_UPDATE);
	table = index->table;

	first_free = mach_read_from_2(undo_page + TRX_UNDO_PAGE_HDR
				      + TRX_UNDO_PAGE_FREE);
	ptr = undo_page + first_free;

	ut_ad(first_free <= UNIV_PAGE_SIZE);

	if (trx_undo_left(undo_page, ptr) < 50) {

		/* NOTE: the value 50 must be big enough so that the general
		fields written below fit on the undo log page */

		return(0);
	}

	/* Reserve 2 bytes for the pointer to the next undo log record */
	ptr += 2;

	/* Store first some general parameters to the undo log */

	if (!update) {
		type_cmpl = TRX_UNDO_DEL_MARK_REC;
	} else if (rec_get_deleted_flag(rec, dict_table_is_comp(table))) {
		type_cmpl = TRX_UNDO_UPD_DEL_REC;
		/* We are about to update a delete marked record.
		We don't typically need the prefix in this case unless
		the delete marking is done by the same transaction
		(which we check below). */
		ignore_prefix = TRUE;
	} else {
		type_cmpl = TRX_UNDO_UPD_EXIST_REC;
	}

	type_cmpl |= cmpl_info * TRX_UNDO_CMPL_INFO_MULT;
	type_cmpl_ptr = ptr;

	*ptr++ = (byte) type_cmpl;
	ptr += mach_dulint_write_much_compressed(ptr, trx->undo_no);

	ptr += mach_dulint_write_much_compressed(ptr, table->id);

	/*----------------------------------------*/
	/* Store the state of the info bits */

	*ptr++ = (byte) rec_get_info_bits(rec, dict_table_is_comp(table));

	/* Store the values of the system columns */
	field = rec_get_nth_field(rec, offsets,
				  dict_index_get_sys_col_pos(
					  index, DATA_TRX_ID), &flen);
	ut_ad(flen == DATA_TRX_ID_LEN);

	trx_id = trx_read_trx_id(field);

	/* If it is an update of a delete marked record, then we are
	allowed to ignore blob prefixes if the delete marking was done
	by some other trx as it must have committed by now for us to
	allow an over-write. */
	if (ignore_prefix) {
		ignore_prefix = ut_dulint_cmp(trx_id, trx->id) != 0;
	}
	ptr += mach_dulint_write_compressed(ptr, trx_id);

	field = rec_get_nth_field(rec, offsets,
				  dict_index_get_sys_col_pos(
					  index, DATA_ROLL_PTR), &flen);
	ut_ad(flen == DATA_ROLL_PTR_LEN);

	ptr += mach_dulint_write_compressed(ptr, trx_read_roll_ptr(field));

	/*----------------------------------------*/
	/* Store then the fields required to uniquely determine the
	record which will be modified in the clustered index */

	for (i = 0; i < dict_index_get_n_unique(index); i++) {

		field = rec_get_nth_field(rec, offsets, i, &flen);

		/* The ordering columns must not be stored externally. */
		ut_ad(!rec_offs_nth_extern(offsets, i));
		ut_ad(dict_index_get_nth_col(index, i)->ord_part);

		if (trx_undo_left(undo_page, ptr) < 5) {

			return(0);
		}

		ptr += mach_write_compressed(ptr, flen);

		if (flen != UNIV_SQL_NULL) {
			if (trx_undo_left(undo_page, ptr) < flen) {

				return(0);
			}

			ut_memcpy(ptr, field, flen);
			ptr += flen;
		}
	}

	/*----------------------------------------*/
	/* Save to the undo log the old values of the columns to be updated. */

	if (update) {
		if (trx_undo_left(undo_page, ptr) < 5) {

			return(0);
		}

		ptr += mach_write_compressed(ptr, upd_get_n_fields(update));

		for (i = 0; i < upd_get_n_fields(update); i++) {

			ulint	pos = upd_get_nth_field(update, i)->field_no;

			/* Write field number to undo log */
			if (trx_undo_left(undo_page, ptr) < 5) {

				return(0);
			}

			ptr += mach_write_compressed(ptr, pos);

			/* Save the old value of field */
			field = rec_get_nth_field(rec, offsets, pos, &flen);

			if (trx_undo_left(undo_page, ptr) < 15) {

				return(0);
			}

			if (rec_offs_nth_extern(offsets, pos)) {
				ptr = trx_undo_page_report_modify_ext(
					ptr,
					dict_index_get_nth_col(index, pos)
					->ord_part
					&& !ignore_prefix
					&& flen < REC_MAX_INDEX_COL_LEN
					? ext_buf : NULL,
					dict_table_zip_size(table),
					&field, &flen);

				/* Notify purge that it eventually has to
				free the old externally stored field */

				trx->update_undo->del_marks = TRUE;

				*type_cmpl_ptr |= TRX_UNDO_UPD_EXTERN;
			} else {
				ptr += mach_write_compressed(ptr, flen);
			}

			if (flen != UNIV_SQL_NULL) {
				if (trx_undo_left(undo_page, ptr) < flen) {

					return(0);
				}

				ut_memcpy(ptr, field, flen);
				ptr += flen;
			}
		}
	}

	/*----------------------------------------*/
	/* In the case of a delete marking, and also in the case of an update
	where any ordering field of any index changes, store the values of all
	columns which occur as ordering fields in any index. This info is used
	in the purge of old versions where we use it to build and search the
	delete marked index records, to look if we can remove them from the
	index tree. Note that starting from 4.0.14 externally stored
	fields can also be ordering fields in some index. Starting from 5.2, we
	no longer store the first REC_MAX_INDEX_COL_LEN bytes in the undo log record,
	but we can construct the column prefix fields in the index by
	fetching the first page of the BLOB that is pointed to by the
	clustered index. This works also in crash recovery, because all pages
	(including BLOBs) are recovered before anything is rolled back. */

	if (!update || !(cmpl_info & UPD_NODE_NO_ORD_CHANGE)) {
		byte*	old_ptr = ptr;

		trx->update_undo->del_marks = TRUE;

		if (trx_undo_left(undo_page, ptr) < 5) {

			return(0);
		}

		/* Reserve 2 bytes to write the number of bytes the stored
		fields take in this undo record */

		ptr += 2;

		for (col_no = 0; col_no < dict_table_get_n_cols(table);
		     col_no++) {

			const dict_col_t*	col
				= dict_table_get_nth_col(table, col_no);

			if (col->ord_part) {
				ulint	pos;

				/* Write field number to undo log */
				if (trx_undo_left(undo_page, ptr) < 5 + 15) {

					return(0);
				}

				pos = dict_index_get_nth_col_pos(index,
								 col_no);
				ptr += mach_write_compressed(ptr, pos);

				/* Save the old value of field */
				field = rec_get_nth_field(rec, offsets, pos,
							  &flen);

				if (rec_offs_nth_extern(offsets, pos)) {
					ptr = trx_undo_page_report_modify_ext(
						ptr,
						flen < REC_MAX_INDEX_COL_LEN
						&& !ignore_prefix
						? ext_buf : NULL,
						dict_table_zip_size(table),
						&field, &flen);
				} else {
					ptr += mach_write_compressed(
						ptr, flen);
				}

				if (flen != UNIV_SQL_NULL) {
					if (trx_undo_left(undo_page, ptr)
					    < flen) {

						return(0);
					}

					ut_memcpy(ptr, field, flen);
					ptr += flen;
				}
			}
		}

		mach_write_to_2(old_ptr, ptr - old_ptr);
	}

	/*----------------------------------------*/
	/* Write pointers to the previous and the next undo log records */
	if (trx_undo_left(undo_page, ptr) < 2) {

		return(0);
	}

	mach_write_to_2(ptr, first_free);
	ptr += 2;
	mach_write_to_2(undo_page + first_free, ptr - undo_page);

	mach_write_to_2(undo_page + TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE,
			ptr - undo_page);

	/* Write to the REDO log about this change in the UNDO log */

	trx_undof_page_add_undo_rec_log(undo_page, first_free,
					ptr - undo_page, mtr);
	return(first_free);
}
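Throughout the function above, every write is preceded by a trx_undo_left() space check, and a return value of 0 tells the caller that the record did not fit and a fresh undo page is needed. The snippet below is a standalone sketch of that check-before-append pattern; it is not InnoDB code and the names are illustrative.

#include <stdint.h>
#include <string.h>

/* Append a field to a fixed-size page buffer only if it fits; return the
new used length, or 0 so the caller can switch to a new page. */
static size_t
append_field(uint8_t* page, size_t page_size, size_t used,
	     const uint8_t* field, size_t flen)
{
	if (page_size - used < flen) {
		return(0);		/* does not fit on this page */
	}

	memcpy(page + used, field, flen);

	return(used + flen);
}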
Beispiel #30
0
/********************************************************************//**
Low-level function which reads a page asynchronously from a file to the
buffer buf_pool if it is not already there, in which case does nothing.
Sets the io_fix flag and sets an exclusive lock on the buffer frame. The
flag is cleared and the x-lock released by an i/o-handler thread.
@return 1 if a read request was queued, 0 if the page already resided
in buf_pool, or if the page is in the doublewrite buffer blocks in
which case it is never read into the pool, or if the tablespace does
not exist or is being dropped */
UNIV_INTERN
ulint
buf_read_page_low(
/*==============*/
	ulint*	err,	/*!< out: DB_SUCCESS or DB_TABLESPACE_DELETED if we are
			trying to read from a non-existent tablespace, or a
			tablespace which is just now being dropped */
	ibool	sync,	/*!< in: TRUE if synchronous aio is desired */
	ulint	mode,	/*!< in: BUF_READ_IBUF_PAGES_ONLY, ...,
			ORed to OS_AIO_SIMULATED_WAKE_LATER (see below
			at read-ahead functions) */
	ulint	space,	/*!< in: space id */
	ulint	zip_size,/*!< in: compressed page size, or 0 */
	ibool	unzip,	/*!< in: TRUE=request uncompressed page */
	ib_int64_t tablespace_version, /*!< in: if the space memory object has
			this timestamp different from what we are giving here,
			treat the tablespace as dropped; this is a timestamp we
			use to stop dangling page reads from a tablespace
			which we have DISCARDed + IMPORTed back */
	ulint	offset,	/*!< in: page number */
	trx_t*	trx)	/*!< in: transaction */
{
	buf_page_t*	bpage;
	ulint		wake_later;

	*err = DB_SUCCESS;

	wake_later = mode & OS_AIO_SIMULATED_WAKE_LATER;
	mode = mode & ~OS_AIO_SIMULATED_WAKE_LATER;

	if (trx_doublewrite
	    && (space == TRX_SYS_SPACE
		|| (srv_doublewrite_file && space == TRX_DOUBLEWRITE_SPACE))
	    && (   (offset >= trx_doublewrite->block1
		    && offset < trx_doublewrite->block1
		    + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE)
		   || (offset >= trx_doublewrite->block2
		       && offset < trx_doublewrite->block2
		       + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE))) {
		ut_print_timestamp(stderr);
		fprintf(stderr,
			"  InnoDB: Warning: trying to read"
			" doublewrite buffer page %lu\n",
			(ulong) offset);

		return(0);
	}

	if (ibuf_bitmap_page(zip_size, offset)
	    || trx_sys_hdr_page(space, offset)) {

		/* Trx sys header is so low in the latching order that we play
		safe and do not leave the i/o-completion to an asynchronous
		i/o-thread. Ibuf bitmap pages must always be read with
		synchronous i/o, to make sure they do not get involved in
		thread deadlocks. */

		sync = TRUE;
	}

	/* The following call will also check if the tablespace does not exist
	or is being dropped; if we succeed in initing the page in the buffer
	pool for read, then DISCARD cannot proceed until the read has
	completed */
	bpage = buf_page_init_for_read(err, mode, space, zip_size, unzip,
				       tablespace_version, offset);
	if (bpage == NULL) {
		/* bugfix: http://bugs.mysql.com/bug.php?id=43948 */
		if (recv_recovery_is_on() && *err == DB_TABLESPACE_DELETED) {
			/* hashed log recs must be treated here */
			recv_addr_t*    recv_addr;

			mutex_enter(&(recv_sys->mutex));

			if (recv_sys->apply_log_recs == FALSE) {
				mutex_exit(&(recv_sys->mutex));
				goto not_to_recover;
			}

			/* recv_get_fil_addr_struct() */
			recv_addr = HASH_GET_FIRST(recv_sys->addr_hash,
					hash_calc_hash(ut_fold_ulint_pair(space, offset),
						recv_sys->addr_hash));
			while (recv_addr) {
				if ((recv_addr->space == space)
					&& (recv_addr->page_no == offset)) {
					break;
				}
				recv_addr = HASH_GET_NEXT(addr_hash, recv_addr);
			}

			if ((recv_addr == NULL)
			    || (recv_addr->state == RECV_BEING_PROCESSED)
			    || (recv_addr->state == RECV_PROCESSED)) {
				mutex_exit(&(recv_sys->mutex));
				goto not_to_recover;
			}

			fprintf(stderr, " (cannot find space: %lu)", space);
			recv_addr->state = RECV_PROCESSED;

			ut_a(recv_sys->n_addrs);
			recv_sys->n_addrs--;

			mutex_exit(&(recv_sys->mutex));
		}
not_to_recover:

		return(0);
	}

#ifdef UNIV_DEBUG
	if (buf_debug_prints) {
		fprintf(stderr,
			"Posting read request for page %lu, sync %lu\n",
			(ulong) offset,
			(ulong) sync);
	}
#endif

	ut_ad(buf_page_in_file(bpage));

	if (sync) {
		thd_wait_begin(NULL, THD_WAIT_DISKIO);
	}

	if (zip_size) {
		*err = _fil_io(OS_FILE_READ | wake_later,
			      sync, space, zip_size, offset, 0, zip_size,
			      bpage->zip.data, bpage, trx);
	} else {
		ut_a(buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE);

		*err = _fil_io(OS_FILE_READ | wake_later,
			      sync, space, 0, offset, 0, UNIV_PAGE_SIZE,
			      ((buf_block_t*) bpage)->frame, bpage, trx);
	}

	if (sync) {
		thd_wait_end(NULL);
	}

	if (*err == DB_TABLESPACE_DELETED) {
		buf_read_page_handle_error(bpage);
		return(0);
	}

	SRV_CORRUPT_TABLE_CHECK(*err == DB_SUCCESS, bpage->is_corrupt = TRUE;);