Example #1
/**********************************************************************//**
Adds a column definition to a table. */
UNIV_INTERN
void
dict_mem_table_add_col(
/*===================*/
	dict_table_t*	table,	/*!< in: table */
	mem_heap_t*	heap,	/*!< in: temporary memory heap, or NULL */
	const char*	name,	/*!< in: column name, or NULL */
	ulint		mtype,	/*!< in: main datatype */
	ulint		prtype,	/*!< in: precise type */
	ulint		len)	/*!< in: precision */
{
	dict_col_t*	col;
#ifndef UNIV_HOTBACKUP
	ulint		mbminlen;
	ulint		mbmaxlen;
#endif /* !UNIV_HOTBACKUP */
	ulint		i;

	ut_ad(table);
	ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
	ut_ad(!heap == !name);

	i = table->n_def++;

	if (name) {
		if (UNIV_UNLIKELY(table->n_def == table->n_cols)) {
			heap = table->heap;
		}
		if (UNIV_LIKELY(i) && UNIV_UNLIKELY(!table->col_names)) {
			/* All preceding column names are empty. */
			char* s = mem_heap_zalloc(heap, table->n_def);
			table->col_names = s;
		}

		table->col_names = dict_add_col_name(table->col_names,
						     i, name, heap);
	}

	col = dict_table_get_nth_col(table, i);

	col->ind = (unsigned int) i;
	col->ord_part = 0;

	col->mtype = (unsigned int) mtype;
	col->prtype = (unsigned int) prtype;
	col->len = (unsigned int) len;

#ifndef UNIV_HOTBACKUP
	dtype_get_mblen(mtype, prtype, &mbminlen, &mbmaxlen);

	col->mbminlen = (unsigned int) mbminlen;
	col->mbmaxlen = (unsigned int) mbmaxlen;
#endif /* !UNIV_HOTBACKUP */
}
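
The name bookkeeping above assumes that table->col_names is one contiguous buffer holding the column names back to back, each terminated by NUL, in column order (dict_add_col_name appends to that buffer, and the zero-filled allocation stands for a run of empty names). A minimal sketch of walking such a buffer under that assumption; nth_col_name() is a hypothetical helper, not an InnoDB function:

#include <string.h>

/* Hypothetical helper: returns the n-th name from a buffer of
NUL-terminated names stored back to back (the assumed layout of
table->col_names). */
static const char*
nth_col_name(const char* col_names, unsigned n)
{
	const char*	s = col_names;

	while (n--) {
		s += strlen(s) + 1;	/* skip one NUL-terminated name */
	}

	return(s);
}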
Example #2
/***********************************************************//**
Parses the row reference and other info in a fresh insert undo record. */
static
void
row_undo_ins_parse_undo_rec(
    /*========================*/
    ib_recovery_t	recovery,	/*!< in: recovery flag */
    undo_node_t*	node)		/*!< in/out: row undo node */
{
    dict_index_t*	clust_index;
    byte*		ptr;
    undo_no_t	undo_no;
    dulint		table_id;
    ulint		type;
    ulint		dummy;
    ibool		dummy_extern;

    ut_ad(node);

    ptr = trx_undo_rec_get_pars(node->undo_rec, &type, &dummy,
                                &dummy_extern, &undo_no, &table_id);
    ut_ad(type == TRX_UNDO_INSERT_REC);
    node->rec_type = type;

    node->update = NULL;
    node->table = dict_table_get_on_id(
                      recovery, table_id, node->trx);

    /* Skip the UNDO if we can't find the table or the .ibd file. */
    if (UNIV_UNLIKELY(node->table == NULL)) {
    } else if (UNIV_UNLIKELY(node->table->ibd_file_missing)) {
        node->table = NULL;
    } else {
        clust_index = dict_table_get_first_index(node->table);

        if (clust_index != NULL) {
            ptr = trx_undo_rec_get_row_ref(
                      ptr, clust_index, &node->ref, node->heap);
        } else {
            ut_print_timestamp(ib_stream);
            ib_logger(ib_stream, "  InnoDB: table ");
            ut_print_name(ib_stream, node->trx, TRUE,
                          node->table->name);
            ib_logger(ib_stream, " has no indexes, "
                      "ignoring the table\n");

            node->table = NULL;
        }
    }
}
Example #3
/********************************************************************//**
Fills the column prefix cache of an externally stored column. */
static
void
row_ext_cache_fill(
/*===============*/
	row_ext_t*	ext,	/*!< in/out: column prefix cache */
	ulint		i,	/*!< in: index of ext->ext[] */
	ulint		zip_size,/*!< compressed page size in bytes, or 0 */
	const dfield_t*	dfield)	/*!< in: data field */
{
	const byte*	field	= dfield_get_data(dfield);
	ulint		f_len	= dfield_get_len(dfield);
	byte*		buf	= ext->buf + i * ext->max_len;

	ut_ad(ext->max_len > 0);
	ut_ad(i < ext->n_ext);
	ut_ad(dfield_is_ext(dfield));
	ut_a(f_len >= BTR_EXTERN_FIELD_REF_SIZE);

	if (UNIV_UNLIKELY(!memcmp(field_ref_zero,
				  field + f_len - BTR_EXTERN_FIELD_REF_SIZE,
				  BTR_EXTERN_FIELD_REF_SIZE))) {
		/* The BLOB pointer is not set: we cannot fetch it */
		ext->len[i] = 0;
	} else {
		/* Fetch at most ext->max_len of the column.
		The column should be non-empty.  However,
		trx_rollback_or_clean_all_recovered() may try to
		access a half-deleted BLOB if the server previously
		crashed during the execution of
		btr_free_externally_stored_field(). */
		ext->len[i] = btr_copy_externally_stored_field_prefix(
			buf, ext->max_len, zip_size, field, f_len);
	}
}
Example #4
/*******************************************************************//**
Drops the index tree associated with a row in SYS_INDEXES table. */
UNIV_INTERN
void
dict_drop_index_tree(
/*=================*/
	rec_t*	rec,	/*!< in/out: record in the clustered index
			of SYS_INDEXES table */
	mtr_t*	mtr)	/*!< in: mtr having the latch on the record page */
{
	ulint		root_page_no;
	ulint		space;
	ulint		zip_size;
	const byte*	ptr;
	ulint		len;

	ut_ad(mutex_own(&(dict_sys->mutex)));
	ut_a(!dict_table_is_comp(dict_sys->sys_indexes));
	ptr = rec_get_nth_field_old(rec, DICT_SYS_INDEXES_PAGE_NO_FIELD, &len);

	ut_ad(len == 4);

	root_page_no = mtr_read_ulint(ptr, MLOG_4BYTES, mtr);

	if (root_page_no == FIL_NULL) {
		/* The tree has already been freed */

		return;
	}

	ptr = rec_get_nth_field_old(rec,
				    DICT_SYS_INDEXES_SPACE_NO_FIELD, &len);

	ut_ad(len == 4);

	space = mtr_read_ulint(ptr, MLOG_4BYTES, mtr);
	zip_size = fil_space_get_zip_size(space);

	if (UNIV_UNLIKELY(zip_size == ULINT_UNDEFINED)) {
		/* It is a single table tablespace and the .ibd file is
		missing: do nothing */

		return;
	}

	/* We free all the pages but the root page first; this operation
	may span several mini-transactions */

	btr_free_but_not_root(space, zip_size, root_page_no);

	/* Then we free the root page in the same mini-transaction where
	we write FIL_NULL to the appropriate field in the SYS_INDEXES
	record: this mini-transaction marks the B-tree totally freed */

	/* printf("Dropping index tree in space %lu root page %lu\n", space,
	root_page_no); */
	btr_free_root(space, zip_size, root_page_no, mtr);

	page_rec_write_index_page_no(rec,
				     DICT_SYS_INDEXES_PAGE_NO_FIELD,
				     FIL_NULL, mtr);
}
Example #5
/********************************************************//**
Parses a log record written by mlog_write_string.
@return	parsed record end, NULL if not a complete record */
UNIV_INTERN
byte*
mlog_parse_string(
/*==============*/
	byte*	ptr,	/*!< in: buffer */
	byte*	end_ptr,/*!< in: buffer end */
	byte*	page,	/*!< in: page where to apply the log record, or NULL */
	void*	page_zip)/*!< in/out: compressed page, or NULL */
{
	ulint	offset;
	ulint	len;

	ut_a(!page || !page_zip || fil_page_get_type(page) != FIL_PAGE_INDEX);

	if (end_ptr < ptr + 4) {

		return(NULL);
	}

	offset = mach_read_from_2(ptr);
	ptr += 2;
	len = mach_read_from_2(ptr);
	ptr += 2;

	if (UNIV_UNLIKELY(offset >= UNIV_PAGE_SIZE)
	    || UNIV_UNLIKELY(len + offset > UNIV_PAGE_SIZE)) {
		recv_sys->found_corrupt_log = TRUE;

		return(NULL);
	}

	if (end_ptr < ptr + len) {

		return(NULL);
	}

	if (page) {
		if (UNIV_LIKELY_NULL(page_zip)) {
			memcpy(((page_zip_des_t*) page_zip)->data
				+ offset, ptr, len);
		}
		memcpy(page + offset, ptr, len);
	}

	return(ptr + len);
}
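
For reference, a self-contained sketch of the record body this parser consumes: a 2-byte page offset, a 2-byte length, then that many payload bytes (the same fields mlog_write_string(), shown later, writes). read_2() and PAGE_SIZE are stand-ins for mach_read_from_2() and UNIV_PAGE_SIZE; the most-significant-byte-first encoding and the 16KB page size are assumptions of the sketch.

#include <stddef.h>

enum { PAGE_SIZE = 16384 };	/* assumed default page size */

static unsigned
read_2(const unsigned char* p)	/* most significant byte first */
{
	return((unsigned) ((p[0] << 8) | p[1]));
}

/* Returns a pointer just past the record body, or NULL if the buffer
does not hold a complete, sane record. */
static const unsigned char*
parse_write_string_body(const unsigned char* ptr, const unsigned char* end)
{
	unsigned	offset;
	unsigned	len;

	if (end < ptr + 4) {
		return(NULL);		/* header not fully buffered */
	}

	offset = read_2(ptr);
	ptr += 2;
	len = read_2(ptr);
	ptr += 2;

	if (offset >= PAGE_SIZE || offset + len > PAGE_SIZE) {
		return(NULL);		/* corrupt record */
	}

	if (end < ptr + len) {
		return(NULL);		/* payload not fully buffered */
	}

	return(ptr + len);
}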
Example #6
/***********************************************************//**
Undoes a modify in secondary indexes when undo record type is UPD_DEL.
@return	DB_SUCCESS or DB_OUT_OF_FILE_SPACE */
static
ulint
row_undo_mod_upd_del_sec(
/*=====================*/
	undo_node_t*	node,	/*!< in: row undo node */
	que_thr_t*	thr)	/*!< in: query thread */
{
	mem_heap_t*	heap;
	dtuple_t*	entry;
	dict_index_t*	index;
	ulint		err	= DB_SUCCESS;

	ut_ad(node->rec_type == TRX_UNDO_UPD_DEL_REC);
	heap = mem_heap_create(1024);

	while (node->index != NULL) {

		/* Skip all corrupted secondary index */
		dict_table_skip_corrupt_index(node->index);

		if (!node->index) {
			break;
		}

		index = node->index;

		entry = row_build_index_entry(node->row, node->ext,
					      index, heap);
		if (UNIV_UNLIKELY(!entry)) {
			/* The database must have crashed after
			inserting a clustered index record but before
			writing all the externally stored columns of
			that record.  Because secondary index entries
			are inserted after the clustered index record,
			we may assume that the secondary index record
			does not exist.  However, this situation may
			only occur during the rollback of incomplete
			transactions. */
			ut_a(thr_is_recv(thr));
		} else {
			err = row_undo_mod_del_mark_or_remove_sec(
				node, thr, index, entry);

			if (err != DB_SUCCESS) {

				break;
			}
		}

		mem_heap_empty(heap);

		node->index = dict_table_get_next_index(node->index);
	}

	mem_heap_free(heap);

	return(err);
}
Example #7
/***********************************************************//**
Undoes a fresh insert of a row to a table. A fresh insert means that
the same clustered index unique key did not have any record, even delete
marked, at the time of the insert.  InnoDB is eager in a rollback:
if it figures out that an index record will be removed in the purge
anyway, it will remove it in the rollback.
@return	DB_SUCCESS or DB_OUT_OF_FILE_SPACE */
UNIV_INTERN
ulint
row_undo_ins(
/*=========*/
	undo_node_t*	node)	/*!< in: row undo node */
{
	ut_ad(node);
	ut_ad(node->state == UNDO_NODE_INSERT);

	row_undo_ins_parse_undo_rec(node);

	if (!node->table || !row_undo_search_clust_to_pcur(node)) {
		trx_undo_rec_release(node->trx, node->undo_no);

		return(DB_SUCCESS);
	}

	/* Iterate over all the indexes and undo the insert.*/

	/* Skip the clustered index (the first index) */
	node->index = dict_table_get_next_index(
		dict_table_get_first_index(node->table));

	dict_table_skip_corrupt_index(node->index);

	while (node->index != NULL) {
		dtuple_t*	entry;
		ulint		err;

		entry = row_build_index_entry(node->row, node->ext,
					      node->index, node->heap);
		if (UNIV_UNLIKELY(!entry)) {
			/* The database must have crashed after
			inserting a clustered index record but before
			writing all the externally stored columns of
			that record.  Because secondary index entries
			are inserted after the clustered index record,
			we may assume that the secondary index record
			does not exist.  However, this situation may
			only occur during the rollback of incomplete
			transactions. */
			ut_a(trx_is_recv(node->trx));
		} else {
			log_free_check();
			err = row_undo_ins_remove_sec(node->index, entry);

			if (err != DB_SUCCESS) {

				return(err);
			}
		}

		dict_table_next_uncorrupted_index(node->index);
	}

	log_free_check();
	return(row_undo_ins_remove_clust_rec(node));
}
Example #8
/***********************************************************//**
Undoes a fresh insert of a row to a table. A fresh insert means that
the same clustered index unique key did not have any record, even delete
marked, at the time of the insert.  InnoDB is eager in a rollback:
if it figures out that an index record will be removed in the purge
anyway, it will remove it in the rollback.
@return	DB_SUCCESS or DB_OUT_OF_FILE_SPACE */
UNIV_INTERN
ulint
row_undo_ins(
/*=========*/
	undo_node_t*	node)	/*!< in: row undo node */
{
	ut_ad(node);
	ut_ad(node->state == UNDO_NODE_INSERT);

	row_undo_ins_parse_undo_rec(node);

	if (!node->table || !row_undo_search_clust_to_pcur(node)) {
		trx_undo_rec_release(node->trx, node->undo_no);

		return(DB_SUCCESS);
	}

	/* Iterate over all the indexes and undo the insert.*/

	/* Skip the clustered index (the first index) */
	node->index = dict_table_get_next_index(
		dict_table_get_first_index(node->table));

	while (node->index != NULL) {
		dtuple_t*	entry;
		ulint		err;

		entry = row_build_index_entry(node->row, node->ext,
					      node->index, node->heap);
		if (UNIV_UNLIKELY(!entry)) {
			/* The database must have crashed after
			inserting a clustered index record but before
			writing all the externally stored columns of
			that record, or a statement is being rolled
			back because an error occurred while storing
			off-page columns.

			Because secondary index entries are inserted
			after the clustered index record, we may
			assume that the secondary index record does
			not exist. */
		} else {
			log_free_check();
			err = row_undo_ins_remove_sec(node->index, entry);

			if (err != DB_SUCCESS) {

				return(err);
			}
		}

		node->index = dict_table_get_next_index(node->index);
	}

	log_free_check();
	return(row_undo_ins_remove_clust_rec(node));
}
Example #9
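/************************************************************
Copies a string to a file page in the buffer pool and writes the
corresponding MLOG_WRITE_STRING record to the mini-transaction log. */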
void
mlog_write_string(
/*==============*/
	byte*		ptr,	/* in: pointer where to write */
	const byte*	str,	/* in: string to write */
	ulint		len,	/* in: string length */
	mtr_t*		mtr)	/* in: mini-transaction handle */
{
	byte*	log_ptr;

	if (UNIV_UNLIKELY(ptr < buf_pool->frame_zero)
	    || UNIV_UNLIKELY(ptr >= buf_pool->high_end)) {
		fprintf(stderr,
			"InnoDB: Error: trying to write to"
			" a stray memory location %p\n", (void*) ptr);
		ut_error;
	}
	ut_ad(ptr && mtr);
	ut_a(len < UNIV_PAGE_SIZE);

	ut_memcpy(ptr, str, len);

	log_ptr = mlog_open(mtr, 30);

	/* If no logging is requested, we may return now */
	if (log_ptr == NULL) {

		return;
	}

	log_ptr = mlog_write_initial_log_record_fast(ptr, MLOG_WRITE_STRING,
						     log_ptr, mtr);
	mach_write_to_2(log_ptr, ptr - buf_frame_align(ptr));
	log_ptr += 2;

	mach_write_to_2(log_ptr, len);
	log_ptr += 2;

	mlog_close(mtr, log_ptr);

	mlog_catenate_string(mtr, str, len);
}
Example #10
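/************************************************************
Writes an 8-byte dulint value to a file page in the buffer pool and
writes the corresponding MLOG_8BYTES record to the mini-transaction
log. */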
void
mlog_write_dulint(
/*==============*/
	byte*	ptr,	/* in: pointer where to write */
	dulint	val,	/* in: value to write */
	mtr_t*	mtr)	/* in: mini-transaction handle */
{
	byte*	log_ptr;

	if (UNIV_UNLIKELY(ptr < buf_pool->frame_zero)
	    || UNIV_UNLIKELY(ptr >= buf_pool->high_end)) {
		fprintf(stderr,
			"InnoDB: Error: trying to write to"
			" a stray memory location %p\n", (void*) ptr);
		ut_error;
	}

	ut_ad(ptr && mtr);

	mach_write_to_8(ptr, val);

	log_ptr = mlog_open(mtr, 11 + 2 + 9);

	/* If no logging is requested, we may return now */
	if (log_ptr == NULL) {

		return;
	}

	log_ptr = mlog_write_initial_log_record_fast(ptr, MLOG_8BYTES,
						     log_ptr, mtr);

	mach_write_to_2(log_ptr, ptr - buf_frame_align(ptr));
	log_ptr += 2;

	log_ptr += mach_dulint_write_compressed(log_ptr, val);

	mlog_close(mtr, log_ptr);
}
Пример #11
0
/**********************************************************************//**
Adds a column definition to a table. */
UNIV_INTERN
void
dict_mem_table_add_col(
/*===================*/
	dict_table_t*	table,	/*!< in: table */
	mem_heap_t*	heap,	/*!< in: temporary memory heap, or NULL */
	const char*	name,	/*!< in: column name, or NULL */
	ulint		mtype,	/*!< in: main datatype */
	ulint		prtype,	/*!< in: precise type */
	ulint		len)	/*!< in: precision */
{
	dict_col_t*	col;
	ulint		i;

	ut_ad(table);
	ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
	ut_ad(!heap == !name);

	i = table->n_def++;

	if (name) {
		if (UNIV_UNLIKELY(table->n_def == table->n_cols)) {
			heap = table->heap;
		}
		if (UNIV_LIKELY(i) && UNIV_UNLIKELY(!table->col_names)) {
			/* All preceding column names are empty. */
			char* s = mem_heap_zalloc(heap, table->n_def);
			table->col_names = s;
		}

		table->col_names = dict_add_col_name(table->col_names,
						     i, name, heap);
	}

	col = dict_table_get_nth_col(table, i);

	dict_mem_fill_column_struct(col, i, mtype, prtype, len);
}
Example #12
/**********************************************************************//**
Set the next and previous pointers in the undo page for the undo record
that was written to ptr. Update the first free value by the number of bytes
written for this undo record.
@return	offset of the inserted entry on the page if succeeded, 0 if fail */
static
ulint
trx_undo_page_set_next_prev_and_add(
/*================================*/
	page_t*		undo_page,	/*!< in/out: undo log page */
	byte*		ptr,		/*!< in: ptr up to where data has been
					written on this undo page. */
	mtr_t*		mtr)		/*!< in: mtr */
{
	ulint		first_free;	/*!< offset within undo_page */
	ulint		end_of_rec;	/*!< offset within undo_page */
	byte*		ptr_to_first_free;
					/* pointer within undo_page
					that points to the next free
					offset value within undo_page.*/

	ut_ad(ptr > undo_page);
	ut_ad(ptr < undo_page + UNIV_PAGE_SIZE);

	if (UNIV_UNLIKELY(trx_undo_left(undo_page, ptr) < 2)) {

		return(0);
	}

	ptr_to_first_free = undo_page + TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE;

	first_free = mach_read_from_2(ptr_to_first_free);

	/* Write offset of the previous undo log record */
	mach_write_to_2(ptr, first_free);
	ptr += 2;

	end_of_rec = ptr - undo_page;

	/* Write offset of the next undo log record */
	mach_write_to_2(undo_page + first_free, end_of_rec);

	/* Update the offset to first free undo record */
	mach_write_to_2(ptr_to_first_free, end_of_rec);

	/* Write this log entry to the UNDO log */
	trx_undof_page_add_undo_rec_log(undo_page, first_free,
					end_of_rec, mtr);

	return(first_free);
}
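
A toy model (plain C, not InnoDB code) of the two 2-byte writes this routine performs, assuming the convention visible above: the record being added occupies the bytes from the old first-free offset up to rec_end, the trailing 2 bytes receive that start offset (read by the following record as its "previous record" pointer), the 2 bytes at the record's start receive the new end offset (the "next record" pointer), and the page's first-free offset advances. write_2() mimics mach_write_to_2().

static void
write_2(unsigned char* p, unsigned v)	/* most significant byte first */
{
	p[0] = (unsigned char) (v >> 8);
	p[1] = (unsigned char) v;
}

/* Links one undo record into the page and returns its start offset. */
static unsigned
link_undo_rec(
	unsigned char*	page,		/* in/out: undo page */
	unsigned*	first_free,	/* in/out: offset of first free byte */
	unsigned char*	rec_end)	/* in: just past the record data */
{
	unsigned	rec_start = *first_free;
	unsigned	end_of_rec;

	write_2(rec_end, rec_start);		/* trailing back pointer */
	end_of_rec = (unsigned) (rec_end + 2 - page);

	write_2(page + rec_start, end_of_rec);	/* leading next pointer */
	*first_free = end_of_rec;

	return(rec_start);
}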
Example #13
/**********************************************************//**
Frees a mutex object. */
UNIV_INTERN
void
os_fast_mutex_free(
/*===============*/
	os_fast_mutex_t*	fast_mutex)	/*!< in: mutex to free */
{
#ifdef __WIN__
	ut_a(fast_mutex);

	DeleteCriticalSection((LPCRITICAL_SECTION) fast_mutex);
#else
	int	ret;

	ret = pthread_mutex_destroy(fast_mutex);

	if (UNIV_UNLIKELY(ret != 0)) {
		ut_print_timestamp(stderr);
		fprintf(stderr,
			"  InnoDB: error: return value %lu when calling\n"
			"InnoDB: pthread_mutex_destroy().\n", (ulint)ret);
		fprintf(stderr,
			"InnoDB: Byte contents of the pthread mutex at %p:\n",
			(void*) fast_mutex);
		ut_print_buf(stderr, fast_mutex, sizeof(os_fast_mutex_t));
		putc('\n', stderr);
	}
#endif
	if (UNIV_LIKELY(os_sync_mutex_inited)) {
		/* When freeing the last mutexes, we have
		already freed os_sync_mutex */

		os_mutex_enter(os_sync_mutex);
	}

	ut_ad(os_fast_mutex_count > 0);
	os_fast_mutex_count--;

	if (UNIV_LIKELY(os_sync_mutex_inited)) {
		os_mutex_exit(os_sync_mutex);
	}
}
Example #14
/**************************************************************//**
The position of the cursor is stored by taking an initial segment of the
record the cursor is positioned on, before, or after, and copying it to the
cursor data structure, or just setting a flag if the cursor is before the
first in an EMPTY tree, or after the last in an EMPTY tree. NOTE that the
page where the cursor is positioned must not be empty if the index tree is
not totally empty! */
UNIV_INTERN
void
btr_pcur_store_position(
/*====================*/
	btr_pcur_t*	cursor, /*!< in: persistent cursor */
	mtr_t*		mtr)	/*!< in: mtr */
{
	page_cur_t*	page_cursor;
	buf_block_t*	block;
	rec_t*		rec;
	dict_index_t*	index;
	page_t*		page;
	ulint		offs;

	ut_a(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
	ut_ad(cursor->latch_mode != BTR_NO_LATCHES);

	block = btr_pcur_get_block(cursor);

	if (srv_pass_corrupt_table && !block) {
		return;
	}
	ut_a(block);

	index = btr_cur_get_index(btr_pcur_get_btr_cur(cursor));

	page_cursor = btr_pcur_get_page_cur(cursor);

	rec = page_cur_get_rec(page_cursor);
	page = page_align(rec);
	offs = page_offset(rec);

	ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_S_FIX)
	      || mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
	ut_a(cursor->latch_mode != BTR_NO_LATCHES);

	if (UNIV_UNLIKELY(page_get_n_recs(page) == 0)) {
		/* It must be an empty index tree; NOTE that in this case
		we do not store the modify_clock, but always do a search
		if we restore the cursor position */

		ut_a(btr_page_get_next(page, mtr) == FIL_NULL);
		ut_a(btr_page_get_prev(page, mtr) == FIL_NULL);
		ut_ad(page_is_leaf(page));
		ut_ad(page_get_page_no(page) == index->page);

		cursor->old_stored = BTR_PCUR_OLD_STORED;

		if (page_rec_is_supremum_low(offs)) {

			cursor->rel_pos = BTR_PCUR_AFTER_LAST_IN_TREE;
		} else {
			cursor->rel_pos = BTR_PCUR_BEFORE_FIRST_IN_TREE;
		}

		return;
	}

	if (page_rec_is_supremum_low(offs)) {

		rec = page_rec_get_prev(rec);

		cursor->rel_pos = BTR_PCUR_AFTER;

	} else if (page_rec_is_infimum_low(offs)) {

		rec = page_rec_get_next(rec);

		cursor->rel_pos = BTR_PCUR_BEFORE;
	} else {
		cursor->rel_pos = BTR_PCUR_ON;
	}

	cursor->old_stored = BTR_PCUR_OLD_STORED;
	cursor->old_rec = dict_index_copy_rec_order_prefix(
		index, rec, &cursor->old_n_fields,
		&cursor->old_rec_buf, &cursor->buf_size);

	cursor->block_when_stored = block;
	cursor->modify_clock = buf_block_get_modify_clock(block);
}
Example #15
/*******************************************************************//**
Fills the "lock_data" member of i_s_locks_row_t object.
If memory cannot be allocated then FALSE is returned.
@return	FALSE if allocation fails */
static
ibool
fill_lock_data(
/*===========*/
	const char**		lock_data,/*!< out: "lock_data" to fill */
	const lock_t*		lock,	/*!< in: lock used to find the data */
	ulint			heap_no,/*!< in: rec num used to find the data */
	trx_i_s_cache_t*	cache)	/*!< in/out: cache where to store
					volatile data */
{
	mtr_t			mtr;

	const buf_block_t*	block;
	const page_t*		page;
	const rec_t*		rec;

	ut_a(lock_get_type(lock) == LOCK_REC);

	mtr_start(&mtr);

	block = buf_page_try_get(lock_rec_get_space_id(lock),
				 lock_rec_get_page_no(lock),
				 &mtr);

	if (block == NULL) {

		*lock_data = NULL;

		mtr_commit(&mtr);

		return(TRUE);
	}

	page = (const page_t*) buf_block_get_frame(block);

	rec = page_find_rec_with_heap_no(page, heap_no);

	if (page_rec_is_infimum(rec)) {

		*lock_data = ha_storage_put_str_memlim(
			cache->storage, "infimum pseudo-record",
			MAX_ALLOWED_FOR_STORAGE(cache));
	} else if (page_rec_is_supremum(rec)) {

		*lock_data = ha_storage_put_str_memlim(
			cache->storage, "supremum pseudo-record",
			MAX_ALLOWED_FOR_STORAGE(cache));
	} else {

		const dict_index_t*	index;
		ulint			n_fields;
		mem_heap_t*		heap;
		ulint			offsets_onstack[REC_OFFS_NORMAL_SIZE];
		ulint*			offsets;
		char			buf[TRX_I_S_LOCK_DATA_MAX_LEN];
		ulint			buf_used;
		ulint			i;

		rec_offs_init(offsets_onstack);
		offsets = offsets_onstack;

		index = lock_rec_get_index(lock);

		n_fields = dict_index_get_n_unique(index);

		ut_a(n_fields > 0);

		heap = NULL;
		offsets = rec_get_offsets(rec, index, offsets, n_fields,
					  &heap);

		/* format and store the data */

		buf_used = 0;
		for (i = 0; i < n_fields; i++) {

			buf_used += put_nth_field(
				buf + buf_used, sizeof(buf) - buf_used,
				i, index, rec, offsets) - 1;
		}

		*lock_data = (const char*) ha_storage_put_memlim(
			cache->storage, buf, buf_used + 1,
			MAX_ALLOWED_FOR_STORAGE(cache));

		if (UNIV_UNLIKELY(heap != NULL)) {

			/* this means that rec_get_offsets() has created a new
			heap and has stored offsets in it; check that this is
			really the case and free the heap */
			ut_a(offsets != offsets_onstack);
			mem_heap_free(heap);
		}
	}

	mtr_commit(&mtr);

	if (*lock_data == NULL) {

		return(FALSE);
	}

	return(TRUE);
}
Example #16
/********************************************************************//**
Frees memory to a pool. */
UNIV_INTERN
void
mem_area_free(
/*==========*/
	void*		ptr,	/*!< in, own: pointer to allocated memory
				buffer */
	mem_pool_t*	pool)	/*!< in: memory pool */
{
	mem_area_t*	area;
	mem_area_t*	buddy;
	void*		new_ptr;
	ulint		size;
	ulint		n;

	if (UNIV_LIKELY(srv_use_sys_malloc)) {
		free(ptr);

		return;
	}

	/* It may be that the area was really allocated from the OS with
	regular malloc: check if ptr points within our memory pool */

	if ((byte*)ptr < pool->buf || (byte*)ptr >= pool->buf + pool->size) {
		ut_free(ptr);

		return;
	}

	area = (mem_area_t*) (((byte*)ptr) - MEM_AREA_EXTRA_SIZE);

	if (mem_area_get_free(area)) {
		fprintf(stderr,
			"InnoDB: Error: Freeing element to mem pool"
			" free list though the\n"
			"InnoDB: element is marked free!\n");

		mem_analyze_corruption(area);
		ut_error;
	}

	size = mem_area_get_size(area);
	UNIV_MEM_FREE(ptr, size - MEM_AREA_EXTRA_SIZE);

	if (size == 0) {
		fprintf(stderr,
			"InnoDB: Error: Mem area size is 0. Possibly a"
			" memory overrun of the\n"
			"InnoDB: previous allocated area!\n");

		mem_analyze_corruption(area);
		ut_error;
	}

#ifdef UNIV_LIGHT_MEM_DEBUG
	if (((byte*)area) + size < pool->buf + pool->size) {

		ulint	next_size;

		next_size = mem_area_get_size(
			(mem_area_t*)(((byte*)area) + size));
		if (UNIV_UNLIKELY(!next_size || !ut_is_2pow(next_size))) {
			fprintf(stderr,
				"InnoDB: Error: Memory area size %lu,"
				" next area size %lu not a power of 2!\n"
				"InnoDB: Possibly a memory overrun of"
				" the buffer being freed here.\n",
				(ulong) size, (ulong) next_size);
			mem_analyze_corruption(area);

			ut_error;
		}
	}
#endif
	buddy = mem_area_get_buddy(area, size, pool);

	n = ut_2_log(size);

	mem_pool_mutex_enter(pool);
	mem_n_threads_inside++;

	ut_a(mem_n_threads_inside == 1);

	if (buddy && mem_area_get_free(buddy)
	    && (size == mem_area_get_size(buddy))) {

		/* The buddy is in a free list */

		if ((byte*)buddy < (byte*)area) {
			new_ptr = ((byte*)buddy) + MEM_AREA_EXTRA_SIZE;

			mem_area_set_size(buddy, 2 * size);
			mem_area_set_free(buddy, FALSE);
		} else {
			new_ptr = ptr;

			mem_area_set_size(area, 2 * size);
		}

		/* Remove the buddy from its free list and merge it to area */

		UT_LIST_REMOVE(free_list, pool->free_list[n], buddy);

		pool->reserved += ut_2_exp(n);

		mem_n_threads_inside--;
		mem_pool_mutex_exit(pool);

		mem_area_free(new_ptr, pool);

		return;
	} else {
		UT_LIST_ADD_FIRST(free_list, pool->free_list[n], area);

		mem_area_set_free(area, TRUE);

		ut_ad(pool->reserved >= size);

		pool->reserved -= size;
	}

	mem_n_threads_inside--;
	mem_pool_mutex_exit(pool);

	ut_ad(mem_pool_validate(pool));
}
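
mem_area_get_buddy() is not shown in this listing; the merge step above relies on the classic binary-buddy property that an area of size 2^k sits at an offset that is a multiple of 2^k, so its buddy lives at that offset XOR 2^k. A minimal sketch of the computation under that assumption (buddy_of() is illustrative, not the InnoDB function):

#include <stddef.h>

/* Illustrative only: classic binary-buddy address computation. The
buddy of the power-of-two sized area at byte offset off from the pool
start is the area at offset off ^ size. */
static void*
buddy_of(void* pool_buf, void* area, size_t size)	/* size: a power of 2 */
{
	size_t	off = (size_t) ((char*) area - (char*) pool_buf);

	return((char*) pool_buf + (off ^ size));
}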
Example #17
/**************************************************************//**
Restores the stored position of a persistent cursor, buffer-fixing the page and
obtaining the specified latches. If the cursor position was saved when the
(1) cursor was positioned on a user record: this function restores the position
to the last record LESS OR EQUAL to the stored record;
(2) cursor was positioned on a page infimum record: restores the position to
the last record LESS than the user record which was the successor of the page
infimum;
(3) cursor was positioned on the page supremum: restores to the first record
GREATER than the user record which was the predecessor of the supremum.
(4) cursor was positioned before the first or after the last in an empty tree:
restores to before first or after the last in the tree.
@return TRUE if the cursor position was stored when it was on a user
record and it can be restored on a user record whose ordering fields
are identical to the ones of the original user record */
UNIV_INTERN
ibool
btr_pcur_restore_position_func(
/*===========================*/
	ulint		latch_mode,	/*!< in: BTR_SEARCH_LEAF, ... */
	btr_pcur_t*	cursor,		/*!< in: detached persistent cursor */
	const char*	file,		/*!< in: file name */
	ulint		line,		/*!< in: line where called */
	mtr_t*		mtr)		/*!< in: mtr */
{
	dict_index_t*	index;
	dtuple_t*	tuple;
	ulint		mode;
	ulint		old_mode;
	mem_heap_t*	heap;

	ut_ad(mtr);
	ut_ad(mtr->state == MTR_ACTIVE);

	index = btr_cur_get_index(btr_pcur_get_btr_cur(cursor));

	if (UNIV_UNLIKELY(cursor->old_stored != BTR_PCUR_OLD_STORED)
	    || UNIV_UNLIKELY(cursor->pos_state != BTR_PCUR_WAS_POSITIONED
			     && cursor->pos_state != BTR_PCUR_IS_POSITIONED)) {
		ut_print_buf(stderr, cursor, sizeof(btr_pcur_t));
		putc('\n', stderr);
		if (cursor->trx_if_known) {
			trx_print(stderr, cursor->trx_if_known, 0);
		}

		ut_error;
	}

	if (UNIV_UNLIKELY
	    (cursor->rel_pos == BTR_PCUR_AFTER_LAST_IN_TREE
	     || cursor->rel_pos == BTR_PCUR_BEFORE_FIRST_IN_TREE)) {

		/* In these cases we do not try an optimistic restoration,
		but always do a search */

		btr_cur_open_at_index_side(
			cursor->rel_pos == BTR_PCUR_BEFORE_FIRST_IN_TREE,
			index, latch_mode, btr_pcur_get_btr_cur(cursor), mtr);

		cursor->latch_mode = latch_mode;
		cursor->pos_state = BTR_PCUR_IS_POSITIONED;
		cursor->block_when_stored = btr_pcur_get_block(cursor);

		return(FALSE);
	}

	ut_a(cursor->old_rec);
	ut_a(cursor->old_n_fields);

	if (UNIV_LIKELY(latch_mode == BTR_SEARCH_LEAF)
	    || UNIV_LIKELY(latch_mode == BTR_MODIFY_LEAF)) {
		/* Try optimistic restoration */

		if (UNIV_LIKELY(buf_page_optimistic_get(
					latch_mode,
					cursor->block_when_stored,
					cursor->modify_clock,
					file, line, mtr))) {
			cursor->pos_state = BTR_PCUR_IS_POSITIONED;

			buf_block_dbg_add_level(
				btr_pcur_get_block(cursor),
				dict_index_is_ibuf(index)
				? SYNC_IBUF_TREE_NODE : SYNC_TREE_NODE);

			if (cursor->rel_pos == BTR_PCUR_ON) {
#ifdef UNIV_DEBUG
				const rec_t*	rec;
				const ulint*	offsets1;
				const ulint*	offsets2;
#endif /* UNIV_DEBUG */
				cursor->latch_mode = latch_mode;
#ifdef UNIV_DEBUG
				rec = btr_pcur_get_rec(cursor);

				heap = mem_heap_create(256);
				offsets1 = rec_get_offsets(
					cursor->old_rec, index, NULL,
					cursor->old_n_fields, &heap);
				offsets2 = rec_get_offsets(
					rec, index, NULL,
					cursor->old_n_fields, &heap);

				ut_ad(!cmp_rec_rec(cursor->old_rec,
						   rec, offsets1, offsets2,
						   index));
				mem_heap_free(heap);
#endif /* UNIV_DEBUG */
				return(TRUE);
			}

			return(FALSE);
		}
	}

	/* If optimistic restoration did not succeed, open the cursor anew */

	heap = mem_heap_create(256);

	tuple = dict_index_build_data_tuple(index, cursor->old_rec,
					    cursor->old_n_fields, heap);

	/* Save the old search mode of the cursor */
	old_mode = cursor->search_mode;

	switch (cursor->rel_pos) {
	case BTR_PCUR_ON:
		mode = PAGE_CUR_LE;
		break;
	case BTR_PCUR_AFTER:
		mode = PAGE_CUR_G;
		break;
	case BTR_PCUR_BEFORE:
		mode = PAGE_CUR_L;
		break;
	default:
		ut_error;
		mode = 0;
	}

	btr_pcur_open_with_no_init_func(index, tuple, mode, latch_mode,
					cursor, 0, file, line, mtr);

	/* Restore the old search mode */
	cursor->search_mode = old_mode;

	switch (cursor->rel_pos) {
	case BTR_PCUR_ON:
		if (btr_pcur_is_on_user_rec(cursor)
		    && !cmp_dtuple_rec(
			    tuple, btr_pcur_get_rec(cursor),
			    rec_get_offsets(btr_pcur_get_rec(cursor),
					    index, NULL,
					    ULINT_UNDEFINED, &heap))) {

			/* We have to store the NEW value for
			the modify clock, since the cursor can
			now be on a different page! But we can
			retain the value of old_rec */

			cursor->block_when_stored =
				btr_pcur_get_block(cursor);
			cursor->modify_clock =
				buf_block_get_modify_clock(
					cursor->block_when_stored);
			cursor->old_stored = BTR_PCUR_OLD_STORED;

			mem_heap_free(heap);

			return(TRUE);
		}
#ifdef UNIV_DEBUG
		/* fall through */
	case BTR_PCUR_BEFORE:
	case BTR_PCUR_AFTER:
		break;
	default:
		ut_error;
#endif /* UNIV_DEBUG */
	}

	mem_heap_free(heap);

	/* We have to store new position information, modify_clock etc.,
	to the cursor because it can now be on a different page, the record
	under it may have been removed, etc. */

	btr_pcur_store_position(cursor, mtr);

	return(FALSE);
}
Example #18
ibool
btr_pcur_restore_position(
/*======================*/
					/* out: TRUE if the cursor position
					was stored when it was on a user record
					and it can be restored on a user record
					whose ordering fields are identical to
					the ones of the original user record */
	ulint		latch_mode,	/* in: BTR_SEARCH_LEAF, ... */
	btr_pcur_t*	cursor,		/* in: detached persistent cursor */
	mtr_t*		mtr)		/* in: mtr */
{
	dict_index_t*	index;
	page_t*		page;
	dtuple_t*	tuple;
	ulint		mode;
	ulint		old_mode;
	mem_heap_t*	heap;

	index = btr_cur_get_index(btr_pcur_get_btr_cur(cursor));

	if (UNIV_UNLIKELY(cursor->old_stored != BTR_PCUR_OLD_STORED)
	    || UNIV_UNLIKELY(cursor->pos_state != BTR_PCUR_WAS_POSITIONED
			     && cursor->pos_state != BTR_PCUR_IS_POSITIONED)) {
		ut_print_buf(stderr, cursor, sizeof(btr_pcur_t));
		if (cursor->trx_if_known) {
			trx_print(stderr, cursor->trx_if_known, 0);
		}

		ut_error;
	}

	if (UNIV_UNLIKELY(
		    cursor->rel_pos == BTR_PCUR_AFTER_LAST_IN_TREE
		    || cursor->rel_pos == BTR_PCUR_BEFORE_FIRST_IN_TREE)) {

		/* In these cases we do not try an optimistic restoration,
		but always do a search */

		btr_cur_open_at_index_side(
			cursor->rel_pos == BTR_PCUR_BEFORE_FIRST_IN_TREE,
			index, latch_mode, btr_pcur_get_btr_cur(cursor), mtr);

		cursor->block_when_stored
			= buf_block_align(btr_pcur_get_page(cursor));

		return(FALSE);
	}

	ut_a(cursor->old_rec);
	ut_a(cursor->old_n_fields);

	page = btr_cur_get_page(btr_pcur_get_btr_cur(cursor));

	if (UNIV_LIKELY(latch_mode == BTR_SEARCH_LEAF)
	    || UNIV_LIKELY(latch_mode == BTR_MODIFY_LEAF)) {
		/* Try optimistic restoration */

		if (UNIV_LIKELY(buf_page_optimistic_get(
					latch_mode,
					cursor->block_when_stored, page,
					cursor->modify_clock, mtr))) {
			cursor->pos_state = BTR_PCUR_IS_POSITIONED;
#ifdef UNIV_SYNC_DEBUG
			buf_page_dbg_add_level(page, SYNC_TREE_NODE);
#endif /* UNIV_SYNC_DEBUG */
			if (cursor->rel_pos == BTR_PCUR_ON) {
#ifdef UNIV_DEBUG
				rec_t*		rec;
				ulint*		offsets1;
				ulint*		offsets2;
#endif /* UNIV_DEBUG */
				cursor->latch_mode = latch_mode;
#ifdef UNIV_DEBUG
				rec = btr_pcur_get_rec(cursor);

				heap = mem_heap_create(256);
				offsets1 = rec_get_offsets(
					cursor->old_rec, index, NULL,
					cursor->old_n_fields, &heap);
				offsets2 = rec_get_offsets(
					rec, index, NULL,
					cursor->old_n_fields, &heap);

				ut_ad(!cmp_rec_rec(cursor->old_rec,
						   rec, offsets1, offsets2,
						   index));
				mem_heap_free(heap);
#endif /* UNIV_DEBUG */
				return(TRUE);
			}

			return(FALSE);
		}
	}

	/* If optimistic restoration did not succeed, open the cursor anew */

	heap = mem_heap_create(256);

	tuple = dict_index_build_data_tuple(index, cursor->old_rec,
					    cursor->old_n_fields, heap);

	/* Save the old search mode of the cursor */
	old_mode = cursor->search_mode;

	switch (cursor->rel_pos) {
	case BTR_PCUR_ON:
		mode = PAGE_CUR_LE;
		break;
	case BTR_PCUR_AFTER:
		mode = PAGE_CUR_G;
		break;
	case BTR_PCUR_BEFORE:
		mode = PAGE_CUR_L;
		break;
	default:
		ut_error;
		mode = 0; /* silence a warning */
	}

	btr_pcur_open_with_no_init(index, tuple, mode, latch_mode,
				   cursor, 0, mtr);

	/* Restore the old search mode */
	cursor->search_mode = old_mode;

	if (btr_pcur_is_on_user_rec(cursor, mtr)) {
		switch (cursor->rel_pos) {
		case BTR_PCUR_ON:
			if (!cmp_dtuple_rec(
				    tuple, btr_pcur_get_rec(cursor),
				    rec_get_offsets(btr_pcur_get_rec(cursor),
						    index, NULL,
						    ULINT_UNDEFINED, &heap))) {

				/* We have to store the NEW value for
				the modify clock, since the cursor can
				now be on a different page! But we can
				retain the value of old_rec */

				cursor->block_when_stored =
					buf_block_align(
						btr_pcur_get_page(cursor));
				cursor->modify_clock =
					buf_block_get_modify_clock(
						cursor->block_when_stored);
				cursor->old_stored = BTR_PCUR_OLD_STORED;

				mem_heap_free(heap);

				return(TRUE);
			}

			break;
		case BTR_PCUR_BEFORE:
			page_cur_move_to_next(btr_pcur_get_page_cur(cursor));
			break;
		case BTR_PCUR_AFTER:
			page_cur_move_to_prev(btr_pcur_get_page_cur(cursor));
			break;
#ifdef UNIV_DEBUG
		default:
			ut_error;
#endif /* UNIV_DEBUG */
		}
	}

	mem_heap_free(heap);

	/* We have to store new position information, modify_clock etc.,
	to the cursor because it can now be on a different page, the record
	under it may have been removed, etc. */

	btr_pcur_store_position(cursor, mtr);

	return(FALSE);
}
Example #19
/********************************************************************//**
Fills the specified free list.
@return	TRUE if we were able to insert a block to the free list */
static
ibool
mem_pool_fill_free_list(
/*====================*/
	ulint		i,	/*!< in: free list index */
	mem_pool_t*	pool)	/*!< in: memory pool */
{
	mem_area_t*	area;
	mem_area_t*	area2;
	ibool		ret;

	ut_ad(mutex_own(&(pool->mutex)));

	if (UNIV_UNLIKELY(i >= 63)) {
		/* We come here when we have run out of space in the
		memory pool: */

		return(FALSE);
	}

	area = UT_LIST_GET_FIRST(pool->free_list[i + 1]);

	if (area == NULL) {
		if (UT_LIST_GET_LEN(pool->free_list[i + 1]) > 0) {
			ut_print_timestamp(stderr);

			fprintf(stderr,
				"  InnoDB: Error: mem pool free list %lu"
				" length is %lu\n"
				"InnoDB: though the list is empty!\n",
				(ulong) i + 1,
				(ulong)
				UT_LIST_GET_LEN(pool->free_list[i + 1]));
		}

		ret = mem_pool_fill_free_list(i + 1, pool);

		if (ret == FALSE) {

			return(FALSE);
		}

		area = UT_LIST_GET_FIRST(pool->free_list[i + 1]);
	}

	if (UNIV_UNLIKELY(UT_LIST_GET_LEN(pool->free_list[i + 1]) == 0)) {
		mem_analyze_corruption(area);

		ut_error;
	}

	UT_LIST_REMOVE(free_list, pool->free_list[i + 1], area);

	area2 = (mem_area_t*)(((byte*)area) + ut_2_exp(i));
	UNIV_MEM_ALLOC(area2, MEM_AREA_EXTRA_SIZE);

	mem_area_set_size(area2, ut_2_exp(i));
	mem_area_set_free(area2, TRUE);

	UT_LIST_ADD_FIRST(free_list, pool->free_list[i], area2);

	mem_area_set_size(area, ut_2_exp(i));

	UT_LIST_ADD_FIRST(free_list, pool->free_list[i], area);

	return(TRUE);
}
Example #20
/*****************************************************************//**
Constructs the last committed version of a clustered index record,
which should be seen by a semi-consistent read.
@return	DB_SUCCESS or DB_MISSING_HISTORY */
UNIV_INTERN
ulint
row_vers_build_for_semi_consistent_read(
/*====================================*/
	const rec_t*	rec,	/*!< in: record in a clustered index; the
				caller must have a latch on the page; this
				latch locks the top of the stack of versions
				of this records */
	mtr_t*		mtr,	/*!< in: mtr holding the latch on rec */
	dict_index_t*	index,	/*!< in: the clustered index */
	ulint**		offsets,/*!< in/out: offsets returned by
				rec_get_offsets(rec, index) */
	mem_heap_t**	offset_heap,/*!< in/out: memory heap from which
				the offsets are allocated */
	mem_heap_t*	in_heap,/*!< in: memory heap from which the memory for
				*old_vers is allocated; memory for possible
				intermediate versions is allocated and freed
				locally within the function */
	const rec_t**	old_vers)/*!< out: rec, old version, or NULL if the
				record does not exist in the view, that is,
				it was freshly inserted afterwards */
{
	const rec_t*	version;
	mem_heap_t*	heap		= NULL;
	byte*		buf;
	ulint		err;
	trx_id_t	rec_trx_id	= ut_dulint_zero;

	ut_ad(dict_index_is_clust(index));
	ut_ad(mtr_memo_contains_page(mtr, rec, MTR_MEMO_PAGE_X_FIX)
	      || mtr_memo_contains_page(mtr, rec, MTR_MEMO_PAGE_S_FIX));
#ifdef UNIV_SYNC_DEBUG
	ut_ad(!rw_lock_own(&(purge_sys->latch), RW_LOCK_SHARED));
#endif /* UNIV_SYNC_DEBUG */

	ut_ad(rec_offs_validate(rec, index, *offsets));

	rw_lock_s_lock(&(purge_sys->latch));
	/* The S-latch on purge_sys prevents the purge view from
	changing.  Thus, if we have an uncommitted transaction at
	this point, then purge cannot remove its undo log even if
	the transaction could commit now. */

	version = rec;

	for (;;) {
		trx_t*		version_trx;
		mem_heap_t*	heap2;
		rec_t*		prev_version;
		trx_id_t	version_trx_id;

		version_trx_id = row_get_rec_trx_id(version, index, *offsets);
		if (rec == version) {
			rec_trx_id = version_trx_id;
		}

		mutex_enter(&kernel_mutex);
		version_trx = trx_get_on_id(version_trx_id);
		if (version_trx
		    && (version_trx->conc_state == TRX_COMMITTED_IN_MEMORY
			|| version_trx->conc_state == TRX_NOT_STARTED)) {

			version_trx = NULL;
		}
		mutex_exit(&kernel_mutex);

		if (!version_trx) {

			/* We found a version that belongs to a
			committed transaction: return it. */

#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
			ut_a(!rec_offs_any_null_extern(version, *offsets));
#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */

			if (rec == version) {
				*old_vers = rec;
				err = DB_SUCCESS;
				break;
			}

			/* We assume that a rolled-back transaction stays in
			TRX_ACTIVE state until all the changes have been
			rolled back and the transaction is removed from
			the global list of transactions. */

			if (!ut_dulint_cmp(rec_trx_id, version_trx_id)) {
				/* The transaction was committed while
				we searched for earlier versions.
				Return the current version as a
				semi-consistent read. */

				version = rec;
				*offsets = rec_get_offsets(version,
							   index, *offsets,
							   ULINT_UNDEFINED,
							   offset_heap);
			}

			buf = mem_heap_alloc(in_heap, rec_offs_size(*offsets));
			*old_vers = rec_copy(buf, version, *offsets);
			rec_offs_make_valid(*old_vers, index, *offsets);
			err = DB_SUCCESS;

			break;
		}

		heap2 = heap;
		heap = mem_heap_create(1024);

		err = trx_undo_prev_version_build(rec, mtr, version, index,
						  *offsets, heap,
						  &prev_version);
		if (heap2) {
			mem_heap_free(heap2); /* free version */
		}

		if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
			break;
		}

		if (prev_version == NULL) {
			/* It was a freshly inserted version */
			*old_vers = NULL;
			err = DB_SUCCESS;

			break;
		}

		version = prev_version;
		*offsets = rec_get_offsets(version, index, *offsets,
					   ULINT_UNDEFINED, offset_heap);
#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
		ut_a(!rec_offs_any_null_extern(version, *offsets));
#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
	}/* for (;;) */

	if (heap) {
		mem_heap_free(heap);
	}
	rw_lock_s_unlock(&(purge_sys->latch));

	return(err);
}
Example #21
/********************************************************************//**
Issues read requests for pages which recovery wants to read in. */
UNIV_INTERN
void
buf_read_recv_pages(
/*================*/
	ibool		sync,		/*!< in: TRUE if the caller
					wants this function to wait
					for the highest address page
					to get read in, before this
					function returns */
	ulint		space,		/*!< in: space id */
	ulint		zip_size,	/*!< in: compressed page size in
					bytes, or 0 */
	const ulint*	page_nos,	/*!< in: array of page numbers
					to read, with the highest page
					number the last in the
					array */
	ulint		n_stored)	/*!< in: number of page numbers
					in the array */
{
	ib_int64_t	tablespace_version;
	ulint		count;
	ulint		err;
	ulint		i;

	zip_size = fil_space_get_zip_size(space);

	if (UNIV_UNLIKELY(zip_size == ULINT_UNDEFINED)) {
		/* It is a single table tablespace and the .ibd file is
		missing: do nothing */

		/* the log records for these pages must still be handled
		here, for the same reason as in
		http://bugs.mysql.com/bug.php?id=43948 */

		if (recv_recovery_is_on()) {
			recv_addr_t*    recv_addr;

			mutex_enter(&(recv_sys->mutex));

			if (recv_sys->apply_log_recs == FALSE) {
				mutex_exit(&(recv_sys->mutex));
				goto not_to_recover;
			}

			for (i = 0; i < n_stored; i++) {
				/* recv_get_fil_addr_struct() */
				recv_addr = HASH_GET_FIRST(recv_sys->addr_hash,
						hash_calc_hash(ut_fold_ulint_pair(space, page_nos[i]),
							recv_sys->addr_hash));
				while (recv_addr) {
					if ((recv_addr->space == space)
						&& (recv_addr->page_no == page_nos[i])) {
						break;
					}
					recv_addr = HASH_GET_NEXT(addr_hash, recv_addr);
				}

				if ((recv_addr == NULL)
				    || (recv_addr->state == RECV_BEING_PROCESSED)
				    || (recv_addr->state == RECV_PROCESSED)) {
					continue;
				}

				recv_addr->state = RECV_PROCESSED;

				ut_a(recv_sys->n_addrs);
				recv_sys->n_addrs--;
			}

			mutex_exit(&(recv_sys->mutex));

			fprintf(stderr, " (cannot find space: %lu)", space);
		}
not_to_recover:

		return;
	}

	tablespace_version = fil_space_get_version(space);

	for (i = 0; i < n_stored; i++) {
		buf_pool_t*	buf_pool;

		count = 0;

		os_aio_print_debug = FALSE;
		buf_pool = buf_pool_get(space, page_nos[i]);
		while (buf_pool->n_pend_reads >= recv_n_pool_free_frames / 2) {

			os_aio_simulated_wake_handler_threads();
			os_thread_sleep(10000);

			count++;

			if (count > 1000) {
				fprintf(stderr,
					"InnoDB: Error: InnoDB has waited for"
					" 10 seconds for pending\n"
					"InnoDB: reads to the buffer pool to"
					" be finished.\n"
					"InnoDB: Number of pending reads %lu,"
					" pending pread calls %lu\n",
					(ulong) buf_pool->n_pend_reads,
					(ulong)os_file_n_pending_preads);

				os_aio_print_debug = TRUE;
			}
		}

		os_aio_print_debug = FALSE;

		if ((i + 1 == n_stored) && sync) {
			buf_read_page_low(&err, TRUE, BUF_READ_ANY_PAGE, space,
					  zip_size, TRUE, tablespace_version,
					  page_nos[i], NULL);
		} else {
			buf_read_page_low(&err, FALSE, BUF_READ_ANY_PAGE
					  | OS_AIO_SIMULATED_WAKE_LATER,
					  space, zip_size, TRUE,
					  tablespace_version, page_nos[i], NULL);
		}
	}

	os_aio_simulated_wake_handler_threads();

	/* Flush pages from the end of all the LRU lists if necessary */
	buf_flush_free_margins(FALSE);

#ifdef UNIV_DEBUG
	if (buf_debug_prints) {
		fprintf(stderr,
			"Recovery applies read-ahead pages %lu\n",
			(ulong) n_stored);
	}
#endif /* UNIV_DEBUG */
}
Example #22
/********************************************************//**
Parses a log record written by mlog_write_ulint or mlog_write_dulint.
@return	parsed record end, NULL if not a complete record or a corrupt record */
UNIV_INTERN
byte*
mlog_parse_nbytes(
/*==============*/
	ulint	type,	/*!< in: log record type: MLOG_1BYTE, ... */
	byte*	ptr,	/*!< in: buffer */
	byte*	end_ptr,/*!< in: buffer end */
	byte*	page,	/*!< in: page where to apply the log record, or NULL */
	void*	page_zip)/*!< in/out: compressed page, or NULL */
{
	ulint	offset;
	ulint	val;
	dulint	dval;

	ut_a(type <= MLOG_8BYTES);
	ut_a(!page || !page_zip || fil_page_get_type(page) != FIL_PAGE_INDEX);

	if (end_ptr < ptr + 2) {

		return(NULL);
	}

	offset = mach_read_from_2(ptr);
	ptr += 2;

	if (offset >= UNIV_PAGE_SIZE) {
		recv_sys->found_corrupt_log = TRUE;

		return(NULL);
	}

	if (type == MLOG_8BYTES) {
		ptr = mach_dulint_parse_compressed(ptr, end_ptr, &dval);

		if (ptr == NULL) {

			return(NULL);
		}

		if (page) {
			if (UNIV_LIKELY_NULL(page_zip)) {
				mach_write_to_8
					(((page_zip_des_t*) page_zip)->data
					 + offset, dval);
			}
			mach_write_to_8(page + offset, dval);
		}

		return(ptr);
	}

	ptr = mach_parse_compressed(ptr, end_ptr, &val);

	if (ptr == NULL) {

		return(NULL);
	}

	switch (type) {
	case MLOG_1BYTE:
		if (UNIV_UNLIKELY(val > 0xFFUL)) {
			goto corrupt;
		}
		if (page) {
			if (UNIV_LIKELY_NULL(page_zip)) {
				mach_write_to_1
					(((page_zip_des_t*) page_zip)->data
					 + offset, val);
			}
			mach_write_to_1(page + offset, val);
		}
		break;
	case MLOG_2BYTES:
		if (UNIV_UNLIKELY(val > 0xFFFFUL)) {
			goto corrupt;
		}
		if (page) {
			if (UNIV_LIKELY_NULL(page_zip)) {
				mach_write_to_2
					(((page_zip_des_t*) page_zip)->data
					 + offset, val);
			}
			mach_write_to_2(page + offset, val);
		}
		break;
	case MLOG_4BYTES:
		if (page) {
			if (UNIV_LIKELY_NULL(page_zip)) {
				mach_write_to_4
					(((page_zip_des_t*) page_zip)->data
					 + offset, val);
			}
			mach_write_to_4(page + offset, val);
		}
		break;
	default:
	corrupt:
		recv_sys->found_corrupt_log = TRUE;
		ptr = NULL;
	}

	return(ptr);
}
Example #23
/****************************************************************//**
Allocates large pages memory.
@return	allocated memory */
UNIV_INTERN
void*
os_mem_alloc_large(
/*===============*/
	ulint*	n)			/*!< in/out: number of bytes */
{
	void*	ptr;
	ulint	size;
#if defined HAVE_LARGE_PAGES && defined UNIV_LINUX
	int shmid;
	struct shmid_ds buf;

	if (!os_use_large_pages || !os_large_page_size) {
		goto skip;
	}

	/* Align block size to os_large_page_size */
	ut_ad(ut_is_2pow(os_large_page_size));
	size = ut_2pow_round(*n + (os_large_page_size - 1),
			     os_large_page_size);

	shmid = shmget(IPC_PRIVATE, (size_t)size, SHM_HUGETLB | SHM_R | SHM_W);
	if (shmid < 0) {
		fprintf(stderr, "InnoDB: HugeTLB: Warning: Failed to allocate"
			" %lu bytes. errno %d\n", size, errno);
		ptr = NULL;
	} else {
		ptr = shmat(shmid, NULL, 0);
		if (ptr == (void *)-1) {
			fprintf(stderr, "InnoDB: HugeTLB: Warning: Failed to"
				" attach shared memory segment, errno %d\n",
				errno);
			ptr = NULL;
		}

		/* Remove the shared memory segment so that it will be
		automatically freed after the memory is detached or the
		process exits */
		shmctl(shmid, IPC_RMID, &buf);
	}

	if (ptr) {
		*n = size;
		os_fast_mutex_lock(&ut_list_mutex);
		ut_total_allocated_memory += size;
		os_fast_mutex_unlock(&ut_list_mutex);
		UNIV_MEM_ALLOC(ptr, size);
		return(ptr);
	}

	fprintf(stderr, "InnoDB HugeTLB: Warning: Using conventional"
		" memory pool\n");
skip:
#endif /* HAVE_LARGE_PAGES && UNIV_LINUX */

#ifdef __WIN__
	SYSTEM_INFO	system_info;
	GetSystemInfo(&system_info);

	/* Align block size to system page size */
	ut_ad(ut_is_2pow(system_info.dwPageSize));
	/* system_info.dwPageSize is only 32-bit. Casting to ulint is required
	on 64-bit Windows. */
	size = *n = ut_2pow_round(*n + (system_info.dwPageSize - 1),
				  (ulint) system_info.dwPageSize);
	ptr = VirtualAlloc(NULL, size, MEM_COMMIT | MEM_RESERVE,
			   PAGE_READWRITE);
	if (!ptr) {
		fprintf(stderr, "InnoDB: VirtualAlloc(%lu bytes) failed;"
			" Windows error %lu\n",
			(ulong) size, (ulong) GetLastError());
	} else {
		os_fast_mutex_lock(&ut_list_mutex);
		ut_total_allocated_memory += size;
		os_fast_mutex_unlock(&ut_list_mutex);
		UNIV_MEM_ALLOC(ptr, size);
	}
#elif !defined OS_MAP_ANON
	size = *n;
	ptr = ut_malloc_low(size, TRUE, FALSE);
#else
# ifdef HAVE_GETPAGESIZE
	size = getpagesize();
# else
	size = UNIV_PAGE_SIZE;
# endif
	/* Align block size to system page size */
	ut_ad(ut_is_2pow(size));
	size = *n = ut_2pow_round(*n + (size - 1), size);
	ptr = mmap(NULL, size, PROT_READ | PROT_WRITE,
		   MAP_PRIVATE | OS_MAP_ANON, -1, 0);
	if (UNIV_UNLIKELY(ptr == (void*) -1)) {
		fprintf(stderr, "InnoDB: mmap(%lu bytes) failed;"
			" errno %lu\n",
			(ulong) size, (ulong) errno);
		ptr = NULL;
	} else {
		os_fast_mutex_lock(&ut_list_mutex);
		ut_total_allocated_memory += size;
		os_fast_mutex_unlock(&ut_list_mutex);
		UNIV_MEM_ALLOC(ptr, size);
	}
#endif
	return(ptr);
}
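
Each branch above rounds the requested size up to a multiple of a power-of-two page size before allocating. A small sketch of that arithmetic, assuming ut_2pow_round(n, m) rounds n down to a multiple of the power of two m (n & ~(m - 1)), so adding m - 1 first rounds the request up:

#include <stdint.h>

/* Rounds n up to the next multiple of align, where align is a power
of two; mirrors ut_2pow_round(n + (align - 1), align) as used above. */
static uint64_t
round_up_pow2(uint64_t n, uint64_t align)
{
	return((n + (align - 1)) & ~(align - 1));
}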
Example #24
/*****************************************************************//**
When an insert or purge to a table is performed, this function builds
the entry to be inserted into or purged from an index on the table.
@return index entry which should be inserted or purged, or NULL if the
externally stored columns in the clustered index record are
unavailable and ext != NULL */
UNIV_INTERN
dtuple_t*
row_build_index_entry(
/*==================*/
	const dtuple_t*	row,	/*!< in: row which should be
				inserted or purged */
	row_ext_t*	ext,	/*!< in: externally stored column prefixes,
				or NULL */
	dict_index_t*	index,	/*!< in: index on the table */
	mem_heap_t*	heap)	/*!< in: memory heap from which the memory for
				the index entry is allocated */
{
	dtuple_t*	entry;
	ulint		entry_len;
	ulint		i;

	ut_ad(row && index && heap);
	ut_ad(dtuple_check_typed(row));

	entry_len = dict_index_get_n_fields(index);
	entry = dtuple_create(heap, entry_len);

	if (UNIV_UNLIKELY(index->type & DICT_UNIVERSAL)) {
		dtuple_set_n_fields_cmp(entry, entry_len);
		/* There may only be externally stored columns
		in a clustered index B-tree of a user table. */
		ut_a(!ext);
	} else {
		dtuple_set_n_fields_cmp(
			entry, dict_index_get_n_unique_in_tree(index));
	}

	for (i = 0; i < entry_len; i++) {
		const dict_field_t*	ind_field
			= dict_index_get_nth_field(index, i);
		const dict_col_t*	col
			= ind_field->col;
		ulint			col_no
			= dict_col_get_no(col);
		dfield_t*		dfield
			= dtuple_get_nth_field(entry, i);
		const dfield_t*		dfield2
			= dtuple_get_nth_field(row, col_no);
		ulint			len
			= dfield_get_len(dfield2);

		dfield_copy(dfield, dfield2);

		if (dfield_is_null(dfield)) {
			continue;
		}

		if (ind_field->prefix_len == 0
		    && (!dfield_is_ext(dfield)
			|| dict_index_is_clust(index))) {
			/* The dfield_copy() above suffices for
			columns that are stored in-page, or for
			clustered index record columns that are not
			part of a column prefix in the PRIMARY KEY. */
			continue;
		}

		/* If the column is stored externally (off-page) in
		the clustered index, it must be an ordering field in
		the secondary index.  In the Antelope format, only
		prefix-indexed columns may be stored off-page in the
		clustered index record. In the Barracuda format, also
		fully indexed long CHAR or VARCHAR columns may be
		stored off-page. */
		ut_ad(col->ord_part);

		if (UNIV_LIKELY_NULL(ext)) {
			/* See if the column is stored externally. */
			const byte*	buf = row_ext_lookup(ext, col_no,
							     &len);
			if (UNIV_LIKELY_NULL(buf)) {
				if (UNIV_UNLIKELY(buf == field_ref_zero)) {
					return(NULL);
				}
				dfield_set_data(dfield, buf, len);
			}

			if (ind_field->prefix_len == 0) {
				/* In the Barracuda format
				(ROW_FORMAT=DYNAMIC or
				ROW_FORMAT=COMPRESSED), we can have a
				secondary index on an entire column
				that is stored off-page in the
				clustered index. As this is not a
				prefix index (prefix_len == 0),
				include the entire off-page column in
				the secondary index record. */
				continue;
			}
		} else if (dfield_is_ext(dfield)) {
			/* This table is either in Antelope format
			(ROW_FORMAT=REDUNDANT or ROW_FORMAT=COMPACT)
			or a purge record where the ordered part of
			the field is not external.
			In Antelope, the maximum column prefix
			index length is 767 bytes, and the clustered
			index record contains a 768-byte prefix of
			each off-page column. */
			ut_a(len >= BTR_EXTERN_FIELD_REF_SIZE);
			len -= BTR_EXTERN_FIELD_REF_SIZE;
			dfield_set_len(dfield, len);
		}

		/* If a column prefix index, take only the prefix. */
		if (ind_field->prefix_len) {
			len = dtype_get_at_most_n_mbchars(
				col->prtype, col->mbminlen, col->mbmaxlen,
				ind_field->prefix_len, len,
				dfield_get_data(dfield));
			dfield_set_len(dfield, len);
		}
	}

	ut_ad(dtuple_check_typed(entry));

	return(entry);
}
Example #25
/*******************************************************************//**
Builds from a secondary index record a row reference with which we can
search the clustered index record. */
UNIV_INTERN
void
row_build_row_ref_in_tuple(
/*=======================*/
	dtuple_t*		ref,	/*!< in/out: row reference built;
					see the NOTE below! */
	const rec_t*		rec,	/*!< in: record in the index;
					NOTE: the data fields in ref
					will point directly into this
					record, therefore, the buffer
					page of this record must be at
					least s-latched and the latch
					held as long as the row
					reference is used! */
	const dict_index_t*	index,	/*!< in: secondary index */
	ulint*			offsets,/*!< in: rec_get_offsets(rec, index)
					or NULL */
	trx_t*			trx)	/*!< in: transaction */
{
	const dict_index_t*	clust_index;
	dfield_t*		dfield;
	const byte*		field;
	ulint			len;
	ulint			ref_len;
	ulint			pos;
	ulint			clust_col_prefix_len;
	ulint			i;
	mem_heap_t*		heap		= NULL;
	ulint			offsets_[REC_OFFS_NORMAL_SIZE];
	rec_offs_init(offsets_);

	ut_a(ref);
	ut_a(index);
	ut_a(rec);
	ut_ad(!dict_index_is_clust(index));

	if (UNIV_UNLIKELY(!index->table)) {
		fputs("InnoDB: table ", stderr);
notfound:
		ut_print_name(stderr, trx, TRUE, index->table_name);
		fputs(" for index ", stderr);
		ut_print_name(stderr, trx, FALSE, index->name);
		fputs(" not found\n", stderr);
		ut_error;
	}

	clust_index = dict_table_get_first_index(index->table);

	if (UNIV_UNLIKELY(!clust_index)) {
		fputs("InnoDB: clust index for table ", stderr);
		goto notfound;
	}

	if (!offsets) {
		offsets = rec_get_offsets(rec, index, offsets_,
					  ULINT_UNDEFINED, &heap);
	} else {
		ut_ad(rec_offs_validate(rec, index, offsets));
	}

	/* Secondary indexes must not contain externally stored columns. */
	ut_ad(!rec_offs_any_extern(offsets));
	ref_len = dict_index_get_n_unique(clust_index);

	ut_ad(ref_len == dtuple_get_n_fields(ref));

	dict_index_copy_types(ref, clust_index, ref_len);

	for (i = 0; i < ref_len; i++) {
		dfield = dtuple_get_nth_field(ref, i);

		pos = dict_index_get_nth_field_pos(index, clust_index, i);

		ut_a(pos != ULINT_UNDEFINED);

		field = rec_get_nth_field(rec, offsets, pos, &len);

		dfield_set_data(dfield, field, len);

		/* If the primary key contains a column prefix, then the
		secondary index may contain a longer prefix of the same
		column, or the full column, and we must adjust the length
		accordingly. */

		clust_col_prefix_len = dict_index_get_nth_field(
			clust_index, i)->prefix_len;

		if (clust_col_prefix_len > 0) {
			if (len != UNIV_SQL_NULL) {

				const dtype_t*	dtype
					= dfield_get_type(dfield);

				dfield_set_len(dfield,
					       dtype_get_at_most_n_mbchars(
						       dtype->prtype,
						       dtype->mbminlen,
						       dtype->mbmaxlen,
						       clust_col_prefix_len,
						       len, (char*) field));
			}
		}
	}

	ut_ad(dtuple_check_typed(ref));
	if (UNIV_LIKELY_NULL(heap)) {
		mem_heap_free(heap);
	}
}
Example #26
/**************************************************************//**
Moves parts of long fields in entry to the big record vector so that
the size of tuple drops below the maximum record size allowed in the
database. Moves data only from those fields which are not necessary
to determine uniquely the insertion place of the tuple in the index.
@return own: created big record vector, NULL if we are not able to
shorten the entry enough, i.e., if there are too many fixed-length or
short fields in entry, or if the index is not clustered */
UNIV_INTERN
big_rec_t*
dtuple_convert_big_rec(
/*===================*/
	dict_index_t*	index,	/*!< in: index */
	dtuple_t*	entry,	/*!< in/out: index entry */
	ulint*		n_ext)	/*!< in/out: number of
				externally stored columns */
{
	mem_heap_t*	heap;
	big_rec_t*	vector;
	dfield_t*	dfield;
	dict_field_t*	ifield;
	ulint		size;
	ulint		n_fields;
	ulint		local_len;
	ulint		local_prefix_len;

	if (UNIV_UNLIKELY(!dict_index_is_clust(index))) {
		return(NULL);
	}

	if (dict_table_get_format(index->table) < DICT_TF_FORMAT_ZIP) {
		/* up to MySQL 5.1: store a 768-byte prefix locally */
		local_len = BTR_EXTERN_FIELD_REF_SIZE + DICT_MAX_INDEX_COL_LEN;
	} else {
		/* new-format table: do not store any BLOB prefix locally */
		local_len = BTR_EXTERN_FIELD_REF_SIZE;
	}

	ut_a(dtuple_check_typed_no_assert(entry));

	size = rec_get_converted_size(index, entry, *n_ext);

	if (UNIV_UNLIKELY(size > 1000000000)) {
		fprintf(stderr,
			"InnoDB: Warning: tuple size very big: %lu\n",
			(ulong) size);
		fputs("InnoDB: Tuple contents: ", stderr);
		dtuple_print(stderr, entry);
		putc('\n', stderr);
	}

	heap = mem_heap_create(size + dtuple_get_n_fields(entry)
			       * sizeof(big_rec_field_t) + 1000);

	vector = mem_heap_alloc(heap, sizeof(big_rec_t));

	vector->heap = heap;
	vector->fields = mem_heap_alloc(heap, dtuple_get_n_fields(entry)
					* sizeof(big_rec_field_t));

	/* Decide which fields to shorten: the algorithm is to look for
	a variable-length field that yields the biggest savings when
	stored externally */

	n_fields = 0;

	while (page_zip_rec_needs_ext(rec_get_converted_size(index, entry,
							     *n_ext),
				      dict_table_is_comp(index->table),
				      dict_index_get_n_fields(index),
				      dict_table_zip_size(index->table))) {
		ulint			i;
		ulint			longest		= 0;
		ulint			longest_i	= ULINT_MAX;
		byte*			data;
		big_rec_field_t*	b;

		for (i = dict_index_get_n_unique_in_tree(index);
		     i < dtuple_get_n_fields(entry); i++) {
			ulint	savings;

			dfield = dtuple_get_nth_field(entry, i);
			ifield = dict_index_get_nth_field(index, i);

			/* Skip fixed-length, NULL, externally stored,
			or short columns */

			if (ifield->fixed_len
			    || dfield_is_null(dfield)
			    || dfield_is_ext(dfield)
			    || dfield_get_len(dfield) <= local_len
			    || dfield_get_len(dfield)
			    <= BTR_EXTERN_FIELD_REF_SIZE * 2) {
				goto skip_field;
			}

			savings = dfield_get_len(dfield) - local_len;

			/* Check that there would be savings */
			if (longest >= savings) {
				goto skip_field;
			}

			longest_i = i;
			longest = savings;

skip_field:
			continue;
		}

		if (!longest) {
			/* Cannot shorten more */

			mem_heap_free(heap);

			return(NULL);
		}

		/* Move data from field longest_i to big rec vector.

		We store the first bytes locally to the record. Then
		we can calculate all ordering fields in all indexes
		from locally stored data. */

		dfield = dtuple_get_nth_field(entry, longest_i);
		ifield = dict_index_get_nth_field(index, longest_i);
		local_prefix_len = local_len - BTR_EXTERN_FIELD_REF_SIZE;

		b = &vector->fields[n_fields];
		b->field_no = longest_i;
		b->len = dfield_get_len(dfield) - local_prefix_len;
		b->data = (char*) dfield_get_data(dfield) + local_prefix_len;

		/* Allocate the locally stored part of the column. */
		data = mem_heap_alloc(heap, local_len);

		/* Copy the local prefix. */
		memcpy(data, dfield_get_data(dfield), local_prefix_len);
		/* Clear the extern field reference (BLOB pointer). */
		memset(data + local_prefix_len, 0, BTR_EXTERN_FIELD_REF_SIZE);
#if 0
		/* The following would fail the Valgrind checks in
		page_cur_insert_rec_low() and page_cur_insert_rec_zip().
		The BLOB pointers in the record will be initialized after
		the record and the BLOBs have been written. */
		UNIV_MEM_ALLOC(data + local_prefix_len,
			       BTR_EXTERN_FIELD_REF_SIZE);
#endif

		dfield_set_data(dfield, data, local_len);
		dfield_set_ext(dfield);

		n_fields++;
		(*n_ext)++;
		ut_ad(n_fields < dtuple_get_n_fields(entry));
	}

	vector->n_fields = n_fields;
	return(vector);
}
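
The selection step above ("look for a variable-length field that yields the biggest savings when stored externally") can be read in isolation as follows. This is a minimal sketch with a made-up field descriptor, not the InnoDB dfield_t/dict_field_t types:

#include <stddef.h>

struct toy_field {
	size_t	len;		/* current data length in bytes */
	int	fixed_len;	/* nonzero for fixed-length columns */
	int	is_null;
	int	is_ext;		/* already stored externally */
};

/* Returns the index of the field with the biggest savings, or n if no
field qualifies for external storage. */
static size_t
pick_longest_savings(const struct toy_field* fields, size_t n,
		     size_t n_unique, size_t local_len,
		     size_t field_ref_size)
{
	size_t	best = n;
	size_t	longest = 0;
	size_t	i;

	/* Never externalize the fields that determine the insertion
	place in the index (the first n_unique fields). */
	for (i = n_unique; i < n; i++) {
		size_t	savings;

		/* Skip fixed-length, NULL, already external, or short
		fields, as in the loop above. */
		if (fields[i].fixed_len || fields[i].is_null
		    || fields[i].is_ext
		    || fields[i].len <= local_len
		    || fields[i].len <= field_ref_size * 2) {
			continue;
		}

		savings = fields[i].len - local_len;

		if (savings > longest) {
			longest = savings;
			best = i;
		}
	}

	return(best);
}
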
Example #27
/********************************************************************//**
Adds the update undo log as the first log in the history list. Removes the
update undo log segment from the rseg slot if it is too big for reuse. */
UNIV_INTERN
void
trx_purge_add_update_undo_to_history(
    /*=================================*/
    trx_t*	trx,		/*!< in: transaction */
    page_t*	undo_page,	/*!< in: update undo log header page,
				x-latched */
    mtr_t*	mtr)		/*!< in: mtr */
{
    trx_undo_t*	undo;
    trx_rsegf_t*	rseg_header;
    trx_ulogf_t*	undo_header;

    undo = trx->update_undo;

    ut_ad(undo);

    ut_ad(mutex_own(&undo->rseg->mutex));

    rseg_header = trx_rsegf_get(
                      undo->rseg->space, undo->rseg->zip_size, undo->rseg->page_no,
                      mtr);

    undo_header = undo_page + undo->hdr_offset;
    /* Add the log as the first in the history list */

    if (undo->state != TRX_UNDO_CACHED) {
        ulint		hist_size;
#ifdef UNIV_DEBUG
        trx_usegf_t*	seg_header = undo_page + TRX_UNDO_SEG_HDR;
#endif /* UNIV_DEBUG */

        /* The undo log segment will not be reused */

        if (UNIV_UNLIKELY(undo->id >= TRX_RSEG_N_SLOTS)) {
            fprintf(stderr,
                    "InnoDB: Error: undo->id is %lu\n",
                    (ulong) undo->id);
            ut_error;
        }

        trx_rsegf_set_nth_undo(rseg_header, undo->id, FIL_NULL, mtr);

        hist_size = mtr_read_ulint(
                        rseg_header + TRX_RSEG_HISTORY_SIZE, MLOG_4BYTES, mtr);

        ut_ad(undo->size == flst_get_len(
                  seg_header + TRX_UNDO_PAGE_LIST, mtr));

        mlog_write_ulint(
            rseg_header + TRX_RSEG_HISTORY_SIZE,
            hist_size + undo->size, MLOG_4BYTES, mtr);
    }

    flst_add_first(
        rseg_header + TRX_RSEG_HISTORY,
        undo_header + TRX_UNDO_HISTORY_NODE, mtr);

    /* Write the trx number to the undo log header */

    mlog_write_ull(undo_header + TRX_UNDO_TRX_NO, trx->no, mtr);

    /* Write information about delete markings to the undo log header */

    if (!undo->del_marks) {
        mlog_write_ulint(
            undo_header + TRX_UNDO_DEL_MARKS, FALSE,
            MLOG_2BYTES, mtr);
    }

    if (undo->rseg->last_page_no == FIL_NULL) {
        undo->rseg->last_trx_no = trx->no;
        undo->rseg->last_offset = undo->hdr_offset;
        undo->rseg->last_page_no = undo->hdr_page_no;
        undo->rseg->last_del_marks = undo->del_marks;

        /* FIXME: Add a bin heap validate function to check that
        the rseg exists. */
    }

    mutex_enter(&kernel_mutex);
    trx_sys->rseg_history_len++;
    mutex_exit(&kernel_mutex);

//	if (!(trx_sys->rseg_history_len % srv_purge_batch_size)) { /*should wake up always*/
    /* Inform the purge thread that there is work to do. */
    srv_wake_purge_thread_if_not_active();
//	}
}
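
The tail of the function increments the shared history length under kernel_mutex and then wakes the purge thread. The sketch below shows the same counter-plus-wake-up pattern using POSIX threads; it only illustrates the pattern and is not InnoDB's kernel mutex or event primitives:

#include <pthread.h>

/* Illustrative only: a shared history-length counter protected by a
mutex, and a condition variable used to wake a sleeping purge worker
when new work arrives.  The worker side, waiting on hist_cond, is
omitted. */
static pthread_mutex_t	hist_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t	hist_cond  = PTHREAD_COND_INITIALIZER;
static unsigned long	hist_len   = 0;

static void
add_to_history(void)
{
	pthread_mutex_lock(&hist_mutex);
	hist_len++;				/* one more undo log to purge */
	pthread_cond_signal(&hist_cond);	/* wake the purge worker */
	pthread_mutex_unlock(&hist_mutex);
}
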
Example #28
/********************************************************************//**
Applies linear read-ahead if in the buf_pool the page is a border page of
a linear read-ahead area and all the pages in the area have been accessed.
Does not read any page if the read-ahead mechanism is not activated. Note
that the algorithm looks at the 'natural' adjacent successor and
predecessor of the page, which on the leaf level of a B-tree are the next
and previous page in the chain of leaves. To know these, the page specified
in (space, offset) must already be present in the buf_pool. Thus, the
natural way to use this function is to call it when a page in the buf_pool
is accessed the first time, calling this function just after it has been
bufferfixed.
NOTE 1: as this function looks at the natural predecessor and successor
fields on the page, what happens, if these are not initialized to any
sensible value? No problem, before applying read-ahead we check that the
area to read is within the span of the space, if not, read-ahead is not
applied. An uninitialized value may result in a useless read operation, but
only very improbably.
NOTE 2: the calling thread may own latches on pages: to avoid deadlocks this
function must be written such that it cannot end up waiting for these
latches!
NOTE 3: the calling thread must want access to the page given: this rule is
set to prevent unintended read-aheads performed by ibuf routines, a situation
which could result in a deadlock if the OS does not support asynchronous io.
@return	number of page read requests issued */
UNIV_INTERN
ulint
buf_read_ahead_linear(
/*==================*/
	ulint	space,		/*!< in: space id */
	ulint	zip_size,	/*!< in: compressed page size in bytes, or 0 */
	ulint	offset,		/*!< in: page number; see NOTE 3 above */
	ibool	inside_ibuf,	/*!< in: TRUE if we are inside ibuf routine */
	trx_t*	trx)
{
	buf_pool_t*	buf_pool = buf_pool_get(space, offset);
	ib_int64_t	tablespace_version;
	buf_page_t*	bpage;
	buf_frame_t*	frame;
	buf_page_t*	pred_bpage	= NULL;
	ulint		pred_offset;
	ulint		succ_offset;
	ulint		count;
	int		asc_or_desc;
	ulint		new_offset;
	ulint		fail_count;
	ulint		ibuf_mode;
	ulint		low, high;
	ulint		err;
	ulint		i;
	const ulint	buf_read_ahead_linear_area
		= BUF_READ_AHEAD_AREA(buf_pool);
	ulint		threshold;

	if (!(srv_read_ahead & 2)) {
		return(0);
	}

	if (UNIV_UNLIKELY(srv_startup_is_before_trx_rollback_phase)) {
		/* No read-ahead to avoid thread deadlocks */
		return(0);
	}

	low  = (offset / buf_read_ahead_linear_area)
		* buf_read_ahead_linear_area;
	high = (offset / buf_read_ahead_linear_area + 1)
		* buf_read_ahead_linear_area;

	if ((offset != low) && (offset != high - 1)) {
		/* This is not a border page of the area: return */

		return(0);
	}

	if (ibuf_bitmap_page(zip_size, offset)
	    || trx_sys_hdr_page(space, offset)) {

		/* If it is an ibuf bitmap page or trx sys hdr, we do
		no read-ahead, as that could break the ibuf page access
		order */

		return(0);
	}

	/* Remember the tablespace version before we ask the tablespace size
	below: if DISCARD + IMPORT changes the actual .ibd file meanwhile, we
	do not try to read outside the bounds of the tablespace! */

	tablespace_version = fil_space_get_version(space);

	buf_pool_mutex_enter(buf_pool);

	if (high > fil_space_get_size(space)) {
		buf_pool_mutex_exit(buf_pool);
		/* The area is not whole, return */

		return(0);
	}

	if (buf_pool->n_pend_reads
	    > buf_pool->curr_size / BUF_READ_AHEAD_PEND_LIMIT) {
		buf_pool_mutex_exit(buf_pool);

		return(0);
	}
	buf_pool_mutex_exit(buf_pool);

	/* Check that almost all pages in the area have been accessed; if
	offset == low, the accesses must be in a descending order, otherwise,
	in an ascending order. */

	asc_or_desc = 1;

	if (offset == low) {
		asc_or_desc = -1;
	}

	/* How many out of order accessed pages can we ignore
	when working out the access pattern for linear readahead */
	threshold = ut_min((64 - srv_read_ahead_threshold),
			   BUF_READ_AHEAD_AREA(buf_pool));

	fail_count = 0;

	rw_lock_s_lock(&buf_pool->page_hash_latch);
	for (i = low; i < high; i++) {
		bpage = buf_page_hash_get(buf_pool, space, i);

		if (bpage == NULL || !buf_page_is_accessed(bpage)) {
			/* Not accessed */
			fail_count++;

		} else if (pred_bpage) {
			/* Note that buf_page_is_accessed() returns
			the time of the first access.  If some blocks
			of the extent existed in the buffer pool at
			the time of a linear access pattern, the first
			access times may be nonmonotonic, even though
			the latest access times were linear.  The
			threshold (srv_read_ahead_threshold) should help
			a little against this. */
			int res = ut_ulint_cmp(
				buf_page_is_accessed(bpage),
				buf_page_is_accessed(pred_bpage));
			/* Accesses not in the right order */
			if (res != 0 && res != asc_or_desc) {
				fail_count++;
			}
		}

		if (fail_count > threshold) {
			/* Too many failures: return */
			//buf_pool_mutex_exit(buf_pool);
			rw_lock_s_unlock(&buf_pool->page_hash_latch);
			return(0);
		}

		if (bpage && buf_page_is_accessed(bpage)) {
			pred_bpage = bpage;
		}
	}

	/* If we got this far, we know that enough pages in the area have
	been accessed in the right order: linear read-ahead can be sensible */

	bpage = buf_page_hash_get(buf_pool, space, offset);

	if (bpage == NULL) {
		//buf_pool_mutex_exit(buf_pool);
		rw_lock_s_unlock(&buf_pool->page_hash_latch);

		return(0);
	}

	switch (buf_page_get_state(bpage)) {
	case BUF_BLOCK_ZIP_PAGE:
		frame = bpage->zip.data;
		break;
	case BUF_BLOCK_FILE_PAGE:
		frame = ((buf_block_t*) bpage)->frame;
		break;
	default:
		ut_error;
		break;
	}

	/* Read the natural predecessor and successor page addresses from
	the page; NOTE that because the calling thread may have an x-latch
	on the page, we do not acquire an s-latch on the page, this is to
	prevent deadlocks. Even if we read values which are nonsense, the
	algorithm will work. */

	pred_offset = fil_page_get_prev(frame);
	succ_offset = fil_page_get_next(frame);

	//buf_pool_mutex_exit(buf_pool);
	rw_lock_s_unlock(&buf_pool->page_hash_latch);

	if ((offset == low) && (succ_offset == offset + 1)) {

		/* This is ok, we can continue */
		new_offset = pred_offset;

	} else if ((offset == high - 1) && (pred_offset == offset - 1)) {

		/* This is ok, we can continue */
		new_offset = succ_offset;
	} else {
		/* Successor or predecessor not in the right order */

		return(0);
	}

	low  = (new_offset / buf_read_ahead_linear_area)
		* buf_read_ahead_linear_area;
	high = (new_offset / buf_read_ahead_linear_area + 1)
		* buf_read_ahead_linear_area;

	if ((new_offset != low) && (new_offset != high - 1)) {
		/* This is not a border page of the area: return */

		return(0);
	}

	if (high > fil_space_get_size(space)) {
		/* The area is not whole, return */

		return(0);
	}

	/* If we got this far, read-ahead can be sensible: do it */

	ibuf_mode = inside_ibuf
		? BUF_READ_IBUF_PAGES_ONLY | OS_AIO_SIMULATED_WAKE_LATER
		: BUF_READ_ANY_PAGE | OS_AIO_SIMULATED_WAKE_LATER;

	count = 0;

	/* Since Windows XP seems to schedule the i/o handler thread
	very eagerly, and consequently it does not wait for the
	full read batch to be posted, we use special heuristics here */

	os_aio_simulated_put_read_threads_to_sleep();

	for (i = low; i < high; i++) {
		/* It is only sensible to do read-ahead in the non-sync
		aio mode: hence FALSE as the first parameter */

		if (!ibuf_bitmap_page(zip_size, i)) {
			count += buf_read_page_low(
				&err, FALSE,
				ibuf_mode,
				space, zip_size, FALSE, tablespace_version, i, trx);
			if (err == DB_TABLESPACE_DELETED) {
				ut_print_timestamp(stderr);
				fprintf(stderr,
					"  InnoDB: Warning: in"
					" linear readahead trying to access\n"
					"InnoDB: tablespace %lu page %lu,\n"
					"InnoDB: but the tablespace does not"
					" exist or is just being dropped.\n",
					(ulong) space, (ulong) i);
			}
		}
	}

	/* In simulated aio we wake the aio handler threads only after
	queuing all aio requests, in native aio the following call does
	nothing: */

	os_aio_simulated_wake_handler_threads();

	/* Flush pages from the end of the LRU list if necessary */
	buf_flush_free_margin(buf_pool, TRUE);

#ifdef UNIV_DEBUG
	if (buf_debug_prints && (count > 0)) {
		fprintf(stderr,
			"LINEAR read-ahead space %lu offset %lu pages %lu\n",
			(ulong) space, (ulong) offset, (ulong) count);
	}
#endif /* UNIV_DEBUG */

	/* Read ahead is considered one I/O operation for the purpose of
	LRU policy decision. */
	buf_LRU_stat_inc_io();

	buf_pool->stat.n_ra_pages_read += count;
	return(count);
}
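
The border-page test at the start of the function is plain integer arithmetic: the tablespace is divided into areas of BUF_READ_AHEAD_AREA pages, and read-ahead is only considered when the accessed page is the first or the last page of its area. A minimal sketch, assuming an area size of 64 pages (the actual value depends on the buffer pool):

#include <stdio.h>

int
main(void)
{
	unsigned long	area = 64;	/* assumed read-ahead area size */
	unsigned long	offset = 127;	/* page number being accessed */

	unsigned long	low  = (offset / area) * area;		/* 64 */
	unsigned long	high = (offset / area + 1) * area;	/* 128 */

	if (offset == low || offset == high - 1) {
		printf("page %lu is a border page of [%lu, %lu)\n",
		       offset, low, high);
	} else {
		printf("page %lu is not a border page: no read-ahead\n",
		       offset);
	}

	return(0);
}
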
Example #29
big_rec_t*
dtuple_convert_big_rec(
/*===================*/
				/* out, own: created big record vector,
				NULL if we are not able to shorten
				the entry enough, i.e., if there are
				too many short fields in entry */
	dict_index_t*	index,	/* in: index */
	dtuple_t*	entry,	/* in: index entry */
	ulint*		ext_vec,/* in: array of externally stored fields,
				or NULL: if a field already is externally
				stored, then we cannot move it to the vector
				this function returns */
	ulint		n_ext_vec)/* in: number of elements in ext_vec */
{
	mem_heap_t*	heap;
	big_rec_t*	vector;
	dfield_t*	dfield;
	ulint		size;
	ulint		n_fields;
	ulint		longest;
	ulint		longest_i		= ULINT_MAX;
	ibool		is_externally_stored;
	ulint		i;
	ulint		j;
	
	ut_a(dtuple_check_typed_no_assert(entry));

	size = rec_get_converted_size(index, entry);

	if (UNIV_UNLIKELY(size > 1000000000)) {
		fprintf(stderr,
"InnoDB: Warning: tuple size very big: %lu\n", (ulong) size);
		fputs("InnoDB: Tuple contents: ", stderr);
		dtuple_print(stderr, entry);
		putc('\n', stderr);
	}

	heap = mem_heap_create(size + dtuple_get_n_fields(entry)
					* sizeof(big_rec_field_t) + 1000);

	vector = mem_heap_alloc(heap, sizeof(big_rec_t));

	vector->heap = heap;
	vector->fields = mem_heap_alloc(heap, dtuple_get_n_fields(entry)
					* sizeof(big_rec_field_t));

	/* Decide which fields to shorten: the algorithm is to look for
	the longest field whose type is DATA_BLOB */

	n_fields = 0;

	while (rec_get_converted_size(index, entry)
			>= ut_min(page_get_free_space_of_empty(
					index->table->comp) / 2,
					REC_MAX_DATA_SIZE)) {

		longest = 0;
		for (i = dict_index_get_n_unique_in_tree(index);
				i < dtuple_get_n_fields(entry); i++) {

			/* Skip over fields which already are externally
			stored */

			is_externally_stored = FALSE;

			if (ext_vec) {
				for (j = 0; j < n_ext_vec; j++) {
					if (ext_vec[j] == i) {
						is_externally_stored = TRUE;
					}
				}
			}
				
			if (!is_externally_stored) {

				dfield = dtuple_get_nth_field(entry, i);

				if (dfield->len != UNIV_SQL_NULL
				    && dfield->len > longest) {

					longest = dfield->len;

					longest_i = i;
				}
			}
		}
	
		/* We do not store externally fields which are smaller than
		DICT_MAX_INDEX_COL_LEN */

		ut_a(DICT_MAX_INDEX_COL_LEN > REC_1BYTE_OFFS_LIMIT);

		if (longest < BTR_EXTERN_FIELD_REF_SIZE + 10
						+ DICT_MAX_INDEX_COL_LEN) {
			/* Cannot shorten more */

			mem_heap_free(heap);

			return(NULL);
		}

		/* Move data from field longest_i to big rec vector;
		we do not let data size of the remaining entry
		drop below 128 which is the limit for the 2-byte
		offset storage format in a physical record. This
		we accomplish by storing 128 bytes of data in entry
		itself, and only the remaining part to big rec vec.

		We store the first bytes locally to the record. Then
		we can calculate all ordering fields in all indexes
		from locally stored data. */

		dfield = dtuple_get_nth_field(entry, longest_i);
		vector->fields[n_fields].field_no = longest_i;

		ut_a(dfield->len > DICT_MAX_INDEX_COL_LEN);
		
		vector->fields[n_fields].len = dfield->len
						- DICT_MAX_INDEX_COL_LEN;

		vector->fields[n_fields].data = mem_heap_alloc(heap,
						vector->fields[n_fields].len);

		/* Copy data (from the end of field) to big rec vector */

		ut_memcpy(vector->fields[n_fields].data,
				((byte*)dfield->data) + dfield->len
						- vector->fields[n_fields].len,
				vector->fields[n_fields].len);
		dfield->len = dfield->len - vector->fields[n_fields].len
						+ BTR_EXTERN_FIELD_REF_SIZE;

		/* Set the extern field reference in dfield to zero */
		memset(((byte*)dfield->data)
			+ dfield->len - BTR_EXTERN_FIELD_REF_SIZE,
					0, BTR_EXTERN_FIELD_REF_SIZE);
		n_fields++;
	}	

	vector->n_fields = n_fields;
	return(vector);
}
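
A small worked example of the length bookkeeping done in the loop above, assuming DICT_MAX_INDEX_COL_LEN = 768 and BTR_EXTERN_FIELD_REF_SIZE = 20 (values assumed for the illustration): a 5000-byte column loses its tail to the big record vector and keeps a 768-byte prefix plus a 20-byte external reference in the entry itself.

#include <stdio.h>

int
main(void)
{
	/* Assumed constants for the illustration. */
	unsigned long	max_col_len   = 768;	/* DICT_MAX_INDEX_COL_LEN */
	unsigned long	field_ref_len = 20;	/* BTR_EXTERN_FIELD_REF_SIZE */
	unsigned long	field_len     = 5000;	/* original column length */

	unsigned long	moved = field_len - max_col_len;
	unsigned long	local = field_len - moved + field_ref_len;

	printf("moved to big rec vector: %lu bytes\n", moved);	/* 4232 */
	printf("kept locally in entry:   %lu bytes\n", local);	/* 788  */

	return(0);
}
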
Example #30
/********************************************************************//**
Issues read requests for pages which the ibuf module wants to read in, in
order to contract the insert buffer tree. Technically, this function is like
a read-ahead function. */
UNIV_INTERN
void
buf_read_ibuf_merge_pages(
/*======================*/
	ibool		sync,		/*!< in: TRUE if the caller
					wants this function to wait
					for the highest address page
					to get read in, before this
					function returns */
	const ulint*	space_ids,	/*!< in: array of space ids */
	const ib_int64_t* space_versions,/*!< in: the spaces must have
					this version number
					(timestamp), otherwise we
					discard the read; we use this
					to cancel reads if DISCARD +
					IMPORT may have changed the
					tablespace size */
	const ulint*	page_nos,	/*!< in: array of page numbers
					to read, with the highest page
					number the last in the
					array */
	ulint		n_stored)	/*!< in: number of elements
					in the arrays */
{
	ulint	i;

#ifdef UNIV_IBUF_DEBUG
	ut_a(n_stored < UNIV_PAGE_SIZE);
#endif

	for (i = 0; i < n_stored; i++) {
		ulint		err;
		buf_pool_t*	buf_pool;
		ulint		zip_size = fil_space_get_zip_size(space_ids[i]);

		buf_pool = buf_pool_get(space_ids[i], page_nos[i]);

		while (buf_pool->n_pend_reads
		       > buf_pool->curr_size / BUF_READ_AHEAD_PEND_LIMIT) {
			os_thread_sleep(500000);
		}

		if (UNIV_UNLIKELY(zip_size == ULINT_UNDEFINED)) {

			goto tablespace_deleted;
		}

		buf_read_page_low(&err, sync && (i + 1 == n_stored),
				  BUF_READ_ANY_PAGE, space_ids[i],
				  zip_size, TRUE, space_versions[i],
				  page_nos[i], NULL);

		if (UNIV_UNLIKELY(err == DB_TABLESPACE_DELETED)) {
tablespace_deleted:
			/* We have deleted or are deleting the single-table
			tablespace: remove the entries for that page */

			ibuf_merge_or_delete_for_page(NULL, space_ids[i],
						      page_nos[i],
						      zip_size, FALSE);
		}
	}

	os_aio_simulated_wake_handler_threads();

	/* Flush pages from the end of all the LRU lists if necessary */
	buf_flush_free_margins(FALSE);

#ifdef UNIV_DEBUG
	if (buf_debug_prints) {
		fprintf(stderr,
			"Ibuf merge read-ahead space %lu pages %lu\n",
			(ulong) space_ids[0], (ulong) n_stored);
	}
#endif /* UNIV_DEBUG */
}