예제 #1
0
파일: ut0ut.c 프로젝트: AllenWeb/mariadb
/**********************************************************//**
Returns system time.
Upon successful completion, the value 0 is returned; otherwise the
value -1 is returned and the global variable errno is set to indicate the
error.
@return	0 on success, -1 otherwise */
UNIV_INTERN
int
ut_usectime(
/*========*/
	ulint*	sec,	/*!< out: seconds since the Epoch */
	ulint*	ms)	/*!< out: microseconds since the Epoch+*sec */
{
	struct timeval	tv;
	int		ret;
	int		errno_gettimeofday;
	int		i;

	for (i = 0; i < 10; i++) {

		ret = ut_gettimeofday(&tv, NULL);

		if (ret == -1) {
			errno_gettimeofday = errno;
			ut_print_timestamp(stderr);
			fprintf(stderr, "  InnoDB: gettimeofday(): %s\n",
				strerror(errno_gettimeofday));
			os_thread_sleep(100000);  /* 0.1 sec */
			errno = errno_gettimeofday;
		} else {
			break;
		}
	}

	if (ret != -1) {
		*sec = (ulint) tv.tv_sec;
		*ms  = (ulint) tv.tv_usec;
	}

	return(ret);
}
예제 #2
0
/**********************************************************************//**
Tests if malloc of n bytes would succeed. ut_malloc() asserts if memory runs
out. It cannot be used if we want to return an error message. Prints to
stderr a message if fails.
@return	TRUE if succeeded */
UNIV_INTERN
ibool
ut_test_malloc(
/*===========*/
	ulint	n)	/*!< in: try to allocate this many bytes */
{
	void*	ret;

	ret = malloc(n);

	if (ret == NULL) {
		ut_print_timestamp(stderr);
		fprintf(stderr,
			"  InnoDB: Error: cannot allocate"
			" %lu bytes of memory for\n"
			"InnoDB: a BLOB with malloc! Total allocated memory\n"
			"InnoDB: by InnoDB %lu bytes."
			" Operating system errno: %d\n"
			"InnoDB: Check if you should increase"
			" the swap file or\n"
			"InnoDB: ulimits of your operating system.\n"
			"InnoDB: On FreeBSD check you have"
			" compiled the OS with\n"
			"InnoDB: a big enough maximum process size.\n",
			(ulong) n,
			(ulong) ut_total_allocated_memory,
			(int) errno);
		return(FALSE);
	}

	free(ret);

	return(TRUE);
}
예제 #3
0
파일: ut0dbg.c 프로젝트: Tusamarco/drtools
void
ut_dbg_assertion_failed(
/*====================*/
	const char* expr,	/* in: the failed assertion (optional) */
	const char* file,	/* in: source file containing the assertion */
	ulint line)		/* in: line number of the assertion */
{
	ut_print_timestamp(stderr);
	fprintf(stderr,
		"InnoDB: Assertion failure in thread %lu"
		" in file %s line %lu\n",
		os_thread_pf(os_thread_get_curr_id()), file, line);
	if (expr) {
		fprintf(stderr,
			"InnoDB: Failing assertion: %s\n", expr);
	}

	fputs(
"InnoDB: We intentionally generate a memory trap.\n"
"InnoDB: Submit a detailed bug report to http://bugs.mysql.com.\n"
"InnoDB: If you get repeated assertion failures or crashes, even\n"
"InnoDB: immediately after the mysqld startup, there may be\n"
"InnoDB: corruption in the InnoDB tablespace. Please refer to\n"
"InnoDB: http://dev.mysql.com/doc/refman/5.0/en/forcing-recovery.html\n"
"InnoDB: about forcing recovery.\n", stderr);
	ut_dbg_stop_threads = TRUE;
}
예제 #4
0
파일: ut0dbg.c 프로젝트: AllenWeb/mariadb
/*************************************************************//**
Report a failed assertion. */
UNIV_INTERN
void
ut_dbg_assertion_failed(
/*====================*/
	const char* expr,	/*!< in: the failed assertion (optional) */
	const char* file,	/*!< in: source file containing the assertion */
	ulint line)		/*!< in: line number of the assertion */
{
	ut_print_timestamp(stderr);
#ifdef UNIV_HOTBACKUP
	fprintf(stderr, "  InnoDB: Assertion failure in file %s line %lu\n",
		file, line);
#else /* UNIV_HOTBACKUP */
	fprintf(stderr,
		"  InnoDB: Assertion failure in thread %lu"
		" in file %s line %lu\n",
		os_thread_pf(os_thread_get_curr_id()),
		innobase_basename(file), line);
#endif /* UNIV_HOTBACKUP */
	if (expr) {
		fprintf(stderr,
			"InnoDB: Failing assertion: %s\n", expr);
	}

	fputs("InnoDB: We intentionally generate a memory trap.\n"
	      "InnoDB: Submit a detailed bug report"
	      " to http://bugs.mysql.com.\n"
	      "InnoDB: If you get repeated assertion failures"
	      " or crashes, even\n"
	      "InnoDB: immediately after the mysqld startup, there may be\n"
	      "InnoDB: corruption in the InnoDB tablespace. Please refer to\n"
	      "InnoDB: " REFMAN "forcing-innodb-recovery.html\n"
	      "InnoDB: about forcing recovery.\n", stderr);
}
예제 #5
0
ibool
buf_flush_ready_for_replace(
/*========================*/
				/* out: TRUE if can replace immediately */
	buf_block_t*	block)	/* in: buffer control block, must be in state
				BUF_BLOCK_FILE_PAGE and in the LRU list */
{
	ut_ad(mutex_own(&(buf_pool->mutex)));
	ut_ad(mutex_own(&block->mutex));
	if (block->state != BUF_BLOCK_FILE_PAGE) {
		ut_print_timestamp(stderr);
		fprintf(stderr,
			"  InnoDB: Error: buffer block state %lu"
			" in the LRU list!\n",
			(ulong)block->state);
		ut_print_buf(stderr, block, sizeof(buf_block_t));

		return(FALSE);
	}

	if ((ut_dulint_cmp(block->oldest_modification, ut_dulint_zero) > 0)
	    || (block->buf_fix_count != 0)
	    || (block->io_fix != 0)) {

		return(FALSE);
	}

	return(TRUE);
}
예제 #6
0
/*******************************************************************//**
Rolls back a transaction back to a named savepoint. Modifications after the
savepoint are undone but InnoDB does NOT release the corresponding locks
which are stored in memory. If a lock is 'implicit', that is, a new inserted
row holds a lock where the lock information is carried by the trx id stored in
the row, these locks are naturally released in the rollback. Savepoints which
were set after this savepoint are deleted.
@return if no savepoint of the name found then DB_NO_SAVEPOINT,
otherwise DB_SUCCESS */
UNIV_INTERN
ulint
trx_rollback_to_savepoint_for_mysql(
/*================================*/
	trx_t*		trx,			/*!< in: transaction handle */
	const char*	savepoint_name,		/*!< in: savepoint name */
	ib_int64_t*	mysql_binlog_cache_pos)	/*!< out: the MySQL binlog cache
						position corresponding to this
						savepoint; MySQL needs this
						information to remove the
						binlog entries of the queries
						executed after the savepoint */
{
	trx_named_savept_t*	savep;
	ulint			err;

	savep = UT_LIST_GET_FIRST(trx->trx_savepoints);

	while (savep != NULL) {
		if (0 == ut_strcmp(savep->name, savepoint_name)) {
			/* Found */
			break;
		}
		savep = UT_LIST_GET_NEXT(trx_savepoints, savep);
	}

	if (savep == NULL) {

		return(DB_NO_SAVEPOINT);
	}

	if (trx->conc_state == TRX_NOT_STARTED) {
		ut_print_timestamp(stderr);
		fputs("  InnoDB: Error: transaction has a savepoint ", stderr);
		ut_print_name(stderr, trx, FALSE, savep->name);
		fputs(" though it is not started\n", stderr);
		return(DB_ERROR);
	}

	/* We can now free all savepoints strictly later than this one */

	trx_roll_savepoints_free(trx, savep);

	*mysql_binlog_cache_pos = savep->mysql_binlog_cache_pos;

	trx->op_info = "rollback to a savepoint";

	err = trx_general_rollback_for_mysql(trx, &savep->savept);

	/* Store the current undo_no of the transaction so that we know where
	to roll back if we have to roll back the next SQL statement: */

	trx_mark_sql_stat_end(trx);

	trx->op_info = "";

	return(err);
}
예제 #7
0
파일: row0uins.c 프로젝트: zhangchuhu/s2s
/***********************************************************//**
Parses the row reference and other info in a fresh insert undo record. */
static
void
row_undo_ins_parse_undo_rec(
    /*========================*/
    ib_recovery_t	recovery,	/*!< in: recovery flag */
    undo_node_t*	node)		/*!< in/out: row undo node */
{
    dict_index_t*	clust_index;
    byte*		ptr;
    undo_no_t	undo_no;
    dulint		table_id;
    ulint		type;
    ulint		dummy;
    ibool		dummy_extern;

    ut_ad(node);

    ptr = trx_undo_rec_get_pars(node->undo_rec, &type, &dummy,
                                &dummy_extern, &undo_no, &table_id);
    ut_ad(type == TRX_UNDO_INSERT_REC);
    node->rec_type = type;

    node->update = NULL;
    node->table = dict_table_get_on_id(
                      srv_force_recovery, table_id, node->trx);

    /* Skip the UNDO if we can't find the table or the .ibd file. */
    if (UNIV_UNLIKELY(node->table == NULL)) {
    } else if (UNIV_UNLIKELY(node->table->ibd_file_missing)) {
        node->table = NULL;
    } else {
        clust_index = dict_table_get_first_index(node->table);

        if (clust_index != NULL) {
            ptr = trx_undo_rec_get_row_ref(
                      ptr, clust_index, &node->ref, node->heap);
        } else {
            ut_print_timestamp(ib_stream);
            ib_logger(ib_stream, "  InnoDB: table ");
            ut_print_name(ib_stream, node->trx, TRUE,
                          node->table->name);
            ib_logger(ib_stream, " has no indexes, "
                      "ignoring the table\n");

            node->table = NULL;
        }
    }
}
예제 #8
0
파일: ha0ha.c 프로젝트: A-eolus/mysql
/*************************************************************//**
Validates a given range of the cells in hash table.
@return	TRUE if ok */
UNIV_INTERN
ibool
ha_validate(
/*========*/
	hash_table_t*	table,		/*!< in: hash table */
	ulint		start_index,	/*!< in: start index */
	ulint		end_index)	/*!< in: end index */
{
	hash_cell_t*	cell;
	ha_node_t*	node;
	ibool		ok	= TRUE;
	ulint		i;

	ut_ad(table);
	ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
	ut_a(start_index <= end_index);
	ut_a(start_index < hash_get_n_cells(table));
	ut_a(end_index < hash_get_n_cells(table));

	for (i = start_index; i <= end_index; i++) {

		cell = hash_get_nth_cell(table, i);

		node = cell->node;

		while (node) {
			if (hash_calc_hash(node->fold, table) != i) {
				ut_print_timestamp(stderr);
				fprintf(stderr,
					"InnoDB: Error: hash table node"
					" fold value %lu does not\n"
					"InnoDB: match the cell number %lu.\n",
					(ulong) node->fold, (ulong) i);

				ok = FALSE;
			}

			node = node->next;
		}
	}

	return(ok);
}
예제 #9
0
/**********************************************************//**
Frees a mutex object. */
UNIV_INTERN
void
os_fast_mutex_free(
/*===============*/
	os_fast_mutex_t*	fast_mutex)	/*!< in: mutex to free */
{
#ifdef __WIN__
	ut_a(fast_mutex);

	DeleteCriticalSection((LPCRITICAL_SECTION) fast_mutex);
#else
	int	ret;

	ret = pthread_mutex_destroy(fast_mutex);

	if (UNIV_UNLIKELY(ret != 0)) {
		ut_print_timestamp(stderr);
		fprintf(stderr,
			"  InnoDB: error: return value %lu when calling\n"
			"InnoDB: pthread_mutex_destroy().\n", (ulint)ret);
		fprintf(stderr,
			"InnoDB: Byte contents of the pthread mutex at %p:\n",
			(void*) fast_mutex);
		ut_print_buf(stderr, fast_mutex, sizeof(os_fast_mutex_t));
		putc('\n', stderr);
	}
#endif
	if (UNIV_LIKELY(os_sync_mutex_inited)) {
		/* When freeing the last mutexes, we have
		already freed os_sync_mutex */

		os_mutex_enter(os_sync_mutex);
	}

	ut_ad(os_fast_mutex_count > 0);
	os_fast_mutex_count--;

	if (UNIV_LIKELY(os_sync_mutex_inited)) {
		os_mutex_exit(os_sync_mutex);
	}
}
예제 #10
0
파일: buf0rea.c 프로젝트: MPlatform/mariadb
/********************************************************************//**
High-level function which reads a page asynchronously from a file to the
buffer buf_pool if it is not already there. Sets the io_fix flag and sets
an exclusive lock on the buffer frame. The flag is cleared and the x-lock
released by the i/o-handler thread.
@return TRUE if page has been read in, FALSE in case of failure */
UNIV_INTERN
ibool
buf_read_page(
/*==========*/
	ulint	space,	/*!< in: space id */
	ulint	zip_size,/*!< in: compressed page size in bytes, or 0 */
	ulint	offset,	/*!< in: page number */
	trx_t*	trx)
{
	buf_pool_t*	buf_pool = buf_pool_get(space, offset);
	ib_int64_t	tablespace_version;
	ulint		count;
	ulint		err;

	tablespace_version = fil_space_get_version(space);

	/* We do the i/o in the synchronous aio mode to save thread
	switches: hence TRUE */

	count = buf_read_page_low(&err, TRUE, BUF_READ_ANY_PAGE, space,
				  zip_size, FALSE,
				  tablespace_version, offset, trx);
	srv_buf_pool_reads += count;
	if (err == DB_TABLESPACE_DELETED) {
		ut_print_timestamp(stderr);
		fprintf(stderr,
			"  InnoDB: Error: trying to access"
			" tablespace %lu page no. %lu,\n"
			"InnoDB: but the tablespace does not exist"
			" or is just being dropped.\n",
			(ulong) space, (ulong) offset);
	}

	/* Flush pages from the end of the LRU list if necessary */
	buf_flush_free_margin(buf_pool, TRUE);

	/* Increment number of I/O operations used for LRU policy. */
	buf_LRU_stat_inc_io();

	return(count > 0);
}
예제 #11
0
/************************************************************************
High-level function which reads a page asynchronously from a file to the
buffer buf_pool if it is not already there. Sets the io_fix flag and sets
an exclusive lock on the buffer frame. The flag is cleared and the x-lock
released by the i/o-handler thread. Does a random read-ahead if it seems
sensible. */
ibool
buf_read_page(
/*==========*/
			/* out: TRUE if success, FALSE otherwise */
	ulint	space,	/* in: space id */
	ulint	offset)	/* in: page number */
{
	ib_longlong	tablespace_version;
	ulint		count;
	ulint		err;

	tablespace_version = fil_space_get_version(space);

	buf_read_ahead_random(space, offset);

	/* We do the i/o in the synchronous aio mode to save thread
	switches: hence TRUE */

	count = buf_read_page_low(&err, TRUE, BUF_READ_ANY_PAGE, space,
				   tablespace_version, offset);
	srv_buf_pool_reads+= count;
	if (err == DB_TABLESPACE_DELETED) {
		ut_print_timestamp(stderr);
		fprintf(stderr,
			"  InnoDB: Error: trying to access"
			" tablespace %lu page no. %lu,\n"
			"InnoDB: but the tablespace does not exist"
			" or is just being dropped.\n",
			(ulong) space, (ulong) offset);
	}

	/* Flush pages from the end of the LRU list if necessary */
	buf_flush_free_margin();

	return(count > 0);
}
예제 #12
0
파일: dict0crea.c 프로젝트: Coco-wan/git-1
ulint
dict_foreign_eval_sql(
/*==================*/
				/* out: error code or DB_SUCCESS */
	pars_info_t*	info,	/* in: info struct, or NULL */
	const char*	sql,	/* in: SQL string to evaluate */
	dict_table_t*	table,	/* in: table */
	dict_foreign_t*	foreign,/* in: foreign */
	trx_t*		trx)	/* in: transaction */
{
	ulint		error;
	FILE*		ef	= dict_foreign_err_file;

	error = que_eval_sql(info, sql, FALSE, trx);

	if (error == DB_DUPLICATE_KEY) {
		mutex_enter(&dict_foreign_err_mutex);
		rewind(ef);
		ut_print_timestamp(ef);
		fputs(" Error in foreign key constraint creation for table ",
		      ef);
		ut_print_name(ef, trx, TRUE, table->name);
		fputs(".\nA foreign key constraint of name ", ef);
		ut_print_name(ef, trx, FALSE, foreign->id);
		fputs("\nalready exists."
		      " (Note that internally InnoDB adds 'databasename/'\n"
		      "in front of the user-defined constraint name).\n",
		      ef);
		fputs("Note that InnoDB's FOREIGN KEY system tables store\n"
		      "constraint names as case-insensitive, with the\n"
		      "MySQL standard latin1_swedish_ci collation. If you\n"
		      "create tables or databases whose names differ only in\n"
		      "the character case, then collisions in constraint\n"
		      "names can occur. Workaround: name your constraints\n"
		      "explicitly with unique names.\n",
		      ef);

		mutex_exit(&dict_foreign_err_mutex);

		return(error);
	}

	if (error != DB_SUCCESS) {
		fprintf(stderr,
			"InnoDB: Foreign key constraint creation failed:\n"
			"InnoDB: internal error number %lu\n", (ulong) error);

		mutex_enter(&dict_foreign_err_mutex);
		ut_print_timestamp(ef);
		fputs(" Internal error in foreign key constraint creation"
		      " for table ", ef);
		ut_print_name(ef, trx, TRUE, table->name);
		fputs(".\n"
		      "See the MySQL .err log in the datadir"
		      " for more information.\n", ef);
		mutex_exit(&dict_foreign_err_mutex);

		return(error);
	}

	return(DB_SUCCESS);
}
예제 #13
0
파일: dict0crea.c 프로젝트: Coco-wan/git-1
ulint
dict_truncate_index_tree(
/*=====================*/
				/* out: new root page number, or
				FIL_NULL on failure */
	dict_table_t*	table,	/* in: the table the index belongs to */
	btr_pcur_t*	pcur,	/* in/out: persistent cursor pointing to
				record in the clustered index of
				SYS_INDEXES table. The cursor may be
				repositioned in this call. */
	mtr_t*		mtr)	/* in: mtr having the latch
				on the record page. The mtr may be
				committed and restarted in this call. */
{
	ulint		root_page_no;
	ulint		space;
	ulint		type;
	dulint		index_id;
	rec_t*		rec;
	byte*		ptr;
	ulint		len;
	ulint		comp;
	dict_index_t*	index;

	ut_ad(mutex_own(&(dict_sys->mutex)));
	ut_a(!dict_table_is_comp(dict_sys->sys_indexes));
	rec = btr_pcur_get_rec(pcur);
	ptr = rec_get_nth_field_old(rec, DICT_SYS_INDEXES_PAGE_NO_FIELD, &len);

	ut_ad(len == 4);

	root_page_no = mtr_read_ulint(ptr, MLOG_4BYTES, mtr);

	if (root_page_no == FIL_NULL) {
		/* The tree has been freed. */

		ut_print_timestamp(stderr);
		fprintf(stderr, "  InnoDB: Trying to TRUNCATE"
			" a missing index of table %s!\n", table->name);
		return(FIL_NULL);
	}

	ptr = rec_get_nth_field_old(rec,
				    DICT_SYS_INDEXES_SPACE_NO_FIELD, &len);

	ut_ad(len == 4);

	space = mtr_read_ulint(ptr, MLOG_4BYTES, mtr);

	if (!fil_tablespace_exists_in_mem(space)) {
		/* It is a single table tablespace and the .ibd file is
		missing: do nothing */

		ut_print_timestamp(stderr);
		fprintf(stderr, "  InnoDB: Trying to TRUNCATE"
			" a missing .ibd file of table %s!\n", table->name);
		return(FIL_NULL);
	}

	ptr = rec_get_nth_field_old(rec,
				    DICT_SYS_INDEXES_TYPE_FIELD, &len);
	ut_ad(len == 4);
	type = mach_read_from_4(ptr);

	ptr = rec_get_nth_field_old(rec, 1, &len);
	ut_ad(len == 8);
	index_id = mach_read_from_8(ptr);

	/* We free all the pages but the root page first; this operation
	may span several mini-transactions */

	btr_free_but_not_root(space, root_page_no);

	/* Then we free the root page in the same mini-transaction where
	we create the b-tree and write its new root page number to the
	appropriate field in the SYS_INDEXES record: this mini-transaction
	marks the B-tree totally truncated */

	comp = page_is_comp(btr_page_get(space, root_page_no, RW_X_LATCH,
					 mtr));

	btr_free_root(space, root_page_no, mtr);
	/* We will temporarily write FIL_NULL to the PAGE_NO field
	in SYS_INDEXES, so that the database will not get into an
	inconsistent state in case it crashes between the mtr_commit()
	below and the following mtr_commit() call. */
	page_rec_write_index_page_no(rec, DICT_SYS_INDEXES_PAGE_NO_FIELD,
				     FIL_NULL, mtr);

	/* We will need to commit the mini-transaction in order to avoid
	deadlocks in the btr_create() call, because otherwise we would
	be freeing and allocating pages in the same mini-transaction. */
	btr_pcur_store_position(pcur, mtr);
	mtr_commit(mtr);

	mtr_start(mtr);
	btr_pcur_restore_position(BTR_MODIFY_LEAF, pcur, mtr);

	/* Find the index corresponding to this SYS_INDEXES record. */
	for (index = UT_LIST_GET_FIRST(table->indexes);
	     index;
	     index = UT_LIST_GET_NEXT(indexes, index)) {
		if (!ut_dulint_cmp(index->id, index_id)) {
			break;
		}
	}

	root_page_no = btr_create(type, space, index_id, comp, mtr);
	if (index) {
		index->page = (unsigned int) root_page_no;
	} else {
		ut_print_timestamp(stderr);
		fprintf(stderr,
			"  InnoDB: Index %lu %lu of table %s is missing\n"
			"InnoDB: from the data dictionary during TRUNCATE!\n",
			ut_dulint_get_high(index_id),
			ut_dulint_get_low(index_id),
			table->name);
	}

	return(root_page_no);
}
예제 #14
0
파일: buf0rea.c 프로젝트: MPlatform/mariadb
/********************************************************************//**
Applies linear read-ahead if in the buf_pool the page is a border page of
a linear read-ahead area and all the pages in the area have been accessed.
Does not read any page if the read-ahead mechanism is not activated. Note
that the algorithm looks at the 'natural' adjacent successor and
predecessor of the page, which on the leaf level of a B-tree are the next
and previous page in the chain of leaves. To know these, the page specified
in (space, offset) must already be present in the buf_pool. Thus, the
natural way to use this function is to call it when a page in the buf_pool
is accessed the first time, calling this function just after it has been
bufferfixed.
NOTE 1: as this function looks at the natural predecessor and successor
fields on the page, what happens, if these are not initialized to any
sensible value? No problem, before applying read-ahead we check that the
area to read is within the span of the space, if not, read-ahead is not
applied. An uninitialized value may result in a useless read operation, but
only very improbably.
NOTE 2: the calling thread may own latches on pages: to avoid deadlocks this
function must be written such that it cannot end up waiting for these
latches!
NOTE 3: the calling thread must want access to the page given: this rule is
set to prevent unintended read-aheads performed by ibuf routines, a situation
which could result in a deadlock if the OS does not support asynchronous io.
@return	number of page read requests issued */
UNIV_INTERN
ulint
buf_read_ahead_linear(
/*==================*/
	ulint	space,		/*!< in: space id */
	ulint	zip_size,	/*!< in: compressed page size in bytes, or 0 */
	ulint	offset,		/*!< in: page number; see NOTE 3 above */
	ibool	inside_ibuf,	/*!< in: TRUE if we are inside ibuf routine */
	trx_t*	trx)
{
	buf_pool_t*	buf_pool = buf_pool_get(space, offset);
	ib_int64_t	tablespace_version;
	buf_page_t*	bpage;
	buf_frame_t*	frame;
	buf_page_t*	pred_bpage	= NULL;
	ulint		pred_offset;
	ulint		succ_offset;
	ulint		count;
	int		asc_or_desc;
	ulint		new_offset;
	ulint		fail_count;
	ulint		ibuf_mode;
	ulint		low, high;
	ulint		err;
	ulint		i;
	const ulint	buf_read_ahead_linear_area
		= BUF_READ_AHEAD_AREA(buf_pool);
	ulint		threshold;

	if (!(srv_read_ahead & 2)) {
		return(0);
	}

	if (UNIV_UNLIKELY(srv_startup_is_before_trx_rollback_phase)) {
		/* No read-ahead to avoid thread deadlocks */
		return(0);
	}

	low  = (offset / buf_read_ahead_linear_area)
		* buf_read_ahead_linear_area;
	high = (offset / buf_read_ahead_linear_area + 1)
		* buf_read_ahead_linear_area;

	if ((offset != low) && (offset != high - 1)) {
		/* This is not a border page of the area: return */

		return(0);
	}

	if (ibuf_bitmap_page(zip_size, offset)
	    || trx_sys_hdr_page(space, offset)) {

		/* If it is an ibuf bitmap page or trx sys hdr, we do
		no read-ahead, as that could break the ibuf page access
		order */

		return(0);
	}

	/* Remember the tablespace version before we ask te tablespace size
	below: if DISCARD + IMPORT changes the actual .ibd file meanwhile, we
	do not try to read outside the bounds of the tablespace! */

	tablespace_version = fil_space_get_version(space);

	buf_pool_mutex_enter(buf_pool);

	if (high > fil_space_get_size(space)) {
		buf_pool_mutex_exit(buf_pool);
		/* The area is not whole, return */

		return(0);
	}

	if (buf_pool->n_pend_reads
	    > buf_pool->curr_size / BUF_READ_AHEAD_PEND_LIMIT) {
		buf_pool_mutex_exit(buf_pool);

		return(0);
	}
	buf_pool_mutex_exit(buf_pool);

	/* Check that almost all pages in the area have been accessed; if
	offset == low, the accesses must be in a descending order, otherwise,
	in an ascending order. */

	asc_or_desc = 1;

	if (offset == low) {
		asc_or_desc = -1;
	}

	/* How many out of order accessed pages can we ignore
	when working out the access pattern for linear readahead */
	threshold = ut_min((64 - srv_read_ahead_threshold),
			   BUF_READ_AHEAD_AREA(buf_pool));

	fail_count = 0;

	rw_lock_s_lock(&buf_pool->page_hash_latch);
	for (i = low; i < high; i++) {
		bpage = buf_page_hash_get(buf_pool, space, i);

		if (bpage == NULL || !buf_page_is_accessed(bpage)) {
			/* Not accessed */
			fail_count++;

		} else if (pred_bpage) {
			/* Note that buf_page_is_accessed() returns
			the time of the first access.  If some blocks
			of the extent existed in the buffer pool at
			the time of a linear access pattern, the first
			access times may be nonmonotonic, even though
			the latest access times were linear.  The
			threshold (srv_read_ahead_factor) should help
			a little against this. */
			int res = ut_ulint_cmp(
				buf_page_is_accessed(bpage),
				buf_page_is_accessed(pred_bpage));
			/* Accesses not in the right order */
			if (res != 0 && res != asc_or_desc) {
				fail_count++;
			}
		}

		if (fail_count > threshold) {
			/* Too many failures: return */
			//buf_pool_mutex_exit(buf_pool);
			rw_lock_s_unlock(&buf_pool->page_hash_latch);
			return(0);
		}

		if (bpage && buf_page_is_accessed(bpage)) {
			pred_bpage = bpage;
		}
	}

	/* If we got this far, we know that enough pages in the area have
	been accessed in the right order: linear read-ahead can be sensible */

	bpage = buf_page_hash_get(buf_pool, space, offset);

	if (bpage == NULL) {
		//buf_pool_mutex_exit(buf_pool);
		rw_lock_s_unlock(&buf_pool->page_hash_latch);

		return(0);
	}

	switch (buf_page_get_state(bpage)) {
	case BUF_BLOCK_ZIP_PAGE:
		frame = bpage->zip.data;
		break;
	case BUF_BLOCK_FILE_PAGE:
		frame = ((buf_block_t*) bpage)->frame;
		break;
	default:
		ut_error;
		break;
	}

	/* Read the natural predecessor and successor page addresses from
	the page; NOTE that because the calling thread may have an x-latch
	on the page, we do not acquire an s-latch on the page, this is to
	prevent deadlocks. Even if we read values which are nonsense, the
	algorithm will work. */

	pred_offset = fil_page_get_prev(frame);
	succ_offset = fil_page_get_next(frame);

	//buf_pool_mutex_exit(buf_pool);
	rw_lock_s_unlock(&buf_pool->page_hash_latch);

	if ((offset == low) && (succ_offset == offset + 1)) {

		/* This is ok, we can continue */
		new_offset = pred_offset;

	} else if ((offset == high - 1) && (pred_offset == offset - 1)) {

		/* This is ok, we can continue */
		new_offset = succ_offset;
	} else {
		/* Successor or predecessor not in the right order */

		return(0);
	}

	low  = (new_offset / buf_read_ahead_linear_area)
		* buf_read_ahead_linear_area;
	high = (new_offset / buf_read_ahead_linear_area + 1)
		* buf_read_ahead_linear_area;

	if ((new_offset != low) && (new_offset != high - 1)) {
		/* This is not a border page of the area: return */

		return(0);
	}

	if (high > fil_space_get_size(space)) {
		/* The area is not whole, return */

		return(0);
	}

	/* If we got this far, read-ahead can be sensible: do it */

	ibuf_mode = inside_ibuf
		? BUF_READ_IBUF_PAGES_ONLY | OS_AIO_SIMULATED_WAKE_LATER
		: BUF_READ_ANY_PAGE | OS_AIO_SIMULATED_WAKE_LATER;

	count = 0;

	/* Since Windows XP seems to schedule the i/o handler thread
	very eagerly, and consequently it does not wait for the
	full read batch to be posted, we use special heuristics here */

	os_aio_simulated_put_read_threads_to_sleep();

	for (i = low; i < high; i++) {
		/* It is only sensible to do read-ahead in the non-sync
		aio mode: hence FALSE as the first parameter */

		if (!ibuf_bitmap_page(zip_size, i)) {
			count += buf_read_page_low(
				&err, FALSE,
				ibuf_mode,
				space, zip_size, FALSE, tablespace_version, i, trx);
			if (err == DB_TABLESPACE_DELETED) {
				ut_print_timestamp(stderr);
				fprintf(stderr,
					"  InnoDB: Warning: in"
					" linear readahead trying to access\n"
					"InnoDB: tablespace %lu page %lu,\n"
					"InnoDB: but the tablespace does not"
					" exist or is just being dropped.\n",
					(ulong) space, (ulong) i);
			}
		}
	}

	/* In simulated aio we wake the aio handler threads only after
	queuing all aio requests, in native aio the following call does
	nothing: */

	os_aio_simulated_wake_handler_threads();

	/* Flush pages from the end of the LRU list if necessary */
	buf_flush_free_margin(buf_pool, TRUE);

#ifdef UNIV_DEBUG
	if (buf_debug_prints && (count > 0)) {
		fprintf(stderr,
			"LINEAR read-ahead space %lu offset %lu pages %lu\n",
			(ulong) space, (ulong) offset, (ulong) count);
	}
#endif /* UNIV_DEBUG */

	/* Read ahead is considered one I/O operation for the purpose of
	LRU policy decision. */
	buf_LRU_stat_inc_io();

	buf_pool->stat.n_ra_pages_read += count;
	return(count);
}
예제 #15
0
파일: mem0pool.c 프로젝트: Coco-wan/git-1
/********************************************************************//**
Fills the specified free list.
@return	TRUE if we were able to insert a block to the free list */
static
ibool
mem_pool_fill_free_list(
/*====================*/
	ulint		i,	/*!< in: free list index */
	mem_pool_t*	pool)	/*!< in: memory pool */
{
	mem_area_t*	area;
	mem_area_t*	area2;
	ibool		ret;

	ut_ad(mutex_own(&(pool->mutex)));

	if (UNIV_UNLIKELY(i >= 63)) {
		/* We come here when we have run out of space in the
		memory pool: */

		return(FALSE);
	}

	area = UT_LIST_GET_FIRST(pool->free_list[i + 1]);

	if (area == NULL) {
		if (UT_LIST_GET_LEN(pool->free_list[i + 1]) > 0) {
			ut_print_timestamp(stderr);

			fprintf(stderr,
				"  InnoDB: Error: mem pool free list %lu"
				" length is %lu\n"
				"InnoDB: though the list is empty!\n",
				(ulong) i + 1,
				(ulong)
				UT_LIST_GET_LEN(pool->free_list[i + 1]));
		}

		ret = mem_pool_fill_free_list(i + 1, pool);

		if (ret == FALSE) {

			return(FALSE);
		}

		area = UT_LIST_GET_FIRST(pool->free_list[i + 1]);
	}

	if (UNIV_UNLIKELY(UT_LIST_GET_LEN(pool->free_list[i + 1]) == 0)) {
		mem_analyze_corruption(area);

		ut_error;
	}

	UT_LIST_REMOVE(free_list, pool->free_list[i + 1], area);

	area2 = (mem_area_t*)(((byte*)area) + ut_2_exp(i));
	UNIV_MEM_ALLOC(area2, MEM_AREA_EXTRA_SIZE);

	mem_area_set_size(area2, ut_2_exp(i));
	mem_area_set_free(area2, TRUE);

	UT_LIST_ADD_FIRST(free_list, pool->free_list[i], area2);

	mem_area_set_size(area, ut_2_exp(i));

	UT_LIST_ADD_FIRST(free_list, pool->free_list[i], area);

	return(TRUE);
}
예제 #16
0
파일: buf0rea.c 프로젝트: MPlatform/mariadb
/********************************************************************//**
Applies a random read-ahead in buf_pool if there are at least a threshold
value of accessed pages from the random read-ahead area. Does not read any
page, not even the one at the position (space, offset), if the read-ahead
mechanism is not activated. NOTE 1: the calling thread may own latches on
pages: to avoid deadlocks this function must be written such that it cannot
end up waiting for these latches! NOTE 2: the calling thread must want
access to the page given: this rule is set to prevent unintended read-aheads
performed by ibuf routines, a situation which could result in a deadlock if
the OS does not support asynchronous i/o.
@return number of page read requests issued; NOTE that if we read ibuf
pages, it may happen that the page at the given page number does not
get read even if we return a positive value!
@return	number of page read requests issued */
UNIV_INTERN
ulint
buf_read_ahead_random(
/*==================*/
	ulint	space,		/*!< in: space id */
	ulint	zip_size,	/*!< in: compressed page size in bytes,
				or 0 */
	ulint	offset,		/*!< in: page number of a page which
				the current thread wants to access */
	ibool	inside_ibuf,	/*!< in: TRUE if we are inside ibuf
				routine */
	trx_t*	trx)
{
	buf_pool_t*	buf_pool = buf_pool_get(space, offset);
	ib_int64_t	tablespace_version;
	ulint		recent_blocks	= 0;
	ulint		ibuf_mode;
	ulint		count;
	ulint		low, high;
	ulint		err;
	ulint		i;
	const ulint	buf_read_ahead_random_area
				= BUF_READ_AHEAD_AREA(buf_pool);

	if (!srv_random_read_ahead) {
		/* Disabled by user */
		return(0);
	}

	if (srv_startup_is_before_trx_rollback_phase) {
		/* No read-ahead to avoid thread deadlocks */
		return(0);
	}

	if (ibuf_bitmap_page(zip_size, offset)
	    || trx_sys_hdr_page(space, offset)) {

		/* If it is an ibuf bitmap page or trx sys hdr, we do
		no read-ahead, as that could break the ibuf page access
		order */

		return(0);
	}

	/* Remember the tablespace version before we ask te tablespace size
	below: if DISCARD + IMPORT changes the actual .ibd file meanwhile, we
	do not try to read outside the bounds of the tablespace! */

	tablespace_version = fil_space_get_version(space);

	low  = (offset / buf_read_ahead_random_area)
		* buf_read_ahead_random_area;
	high = (offset / buf_read_ahead_random_area + 1)
		* buf_read_ahead_random_area;
	if (high > fil_space_get_size(space)) {

		high = fil_space_get_size(space);
	}

	buf_pool_mutex_enter(buf_pool);

	if (buf_pool->n_pend_reads
	    > buf_pool->curr_size / BUF_READ_AHEAD_PEND_LIMIT) {
		buf_pool_mutex_exit(buf_pool);

		return(0);
	}

	/* Count how many blocks in the area have been recently accessed,
	that is, reside near the start of the LRU list. */

	for (i = low; i < high; i++) {
		const buf_page_t* bpage =
			buf_page_hash_get(buf_pool, space, i);

		if (bpage
		    && buf_page_is_accessed(bpage)
		    && buf_page_peek_if_young(bpage)) {

			recent_blocks++;

			if (recent_blocks
			    >= BUF_READ_AHEAD_RANDOM_THRESHOLD(buf_pool)) {

				buf_pool_mutex_exit(buf_pool);
				goto read_ahead;
			}
		}
	}

	buf_pool_mutex_exit(buf_pool);
	/* Do nothing */
	return(0);

read_ahead:
	/* Read all the suitable blocks within the area */

	if (inside_ibuf) {
		ibuf_mode = BUF_READ_IBUF_PAGES_ONLY;
	} else {
		ibuf_mode = BUF_READ_ANY_PAGE;
	}

	count = 0;

	for (i = low; i < high; i++) {
		/* It is only sensible to do read-ahead in the non-sync aio
		mode: hence FALSE as the first parameter */

		if (!ibuf_bitmap_page(zip_size, i)) {
			count += buf_read_page_low(
				&err, FALSE,
				ibuf_mode | OS_AIO_SIMULATED_WAKE_LATER,
				space, zip_size, FALSE,
				tablespace_version, i, trx);
			if (err == DB_TABLESPACE_DELETED) {
				ut_print_timestamp(stderr);
				fprintf(stderr,
					"  InnoDB: Warning: in random"
					" readahead trying to access\n"
					"InnoDB: tablespace %lu page %lu,\n"
					"InnoDB: but the tablespace does not"
					" exist or is just being dropped.\n",
					(ulong) space, (ulong) i);
			}
		}
	}

	/* In simulated aio we wake the aio handler threads only after
	queuing all aio requests, in native aio the following call does
	nothing: */

	os_aio_simulated_wake_handler_threads();

#ifdef UNIV_DEBUG
	if (buf_debug_prints && (count > 0)) {
		fprintf(stderr,
			"Random read-ahead space %lu offset %lu pages %lu\n",
			(ulong) space, (ulong) offset,
			(ulong) count);
	}
#endif /* UNIV_DEBUG */

	/* Read ahead is considered one I/O operation for the purpose of
	LRU policy decision. */
	buf_LRU_stat_inc_io();

	buf_pool->stat.n_ra_pages_read_rnd += count;
	srv_buf_pool_reads += count;
	return(count);
}
예제 #17
0
/*******************************************************************//**
Truncates the index tree associated with a row in SYS_INDEXES table.
@return	new root page number, or FIL_NULL on failure */
UNIV_INTERN
ulint
dict_truncate_index_tree(
    /*=====================*/
    dict_table_t*	table,	/*!< in: the table the index belongs to */
    ulint		space,	/*!< in: 0=truncate,
				nonzero=create the index tree in the
				given tablespace */
    btr_pcur_t*	pcur,	/*!< in/out: persistent cursor pointing to
				record in the clustered index of
				SYS_INDEXES table. The cursor may be
				repositioned in this call. */
    mtr_t*		mtr)	/*!< in: mtr having the latch
				on the record page. The mtr may be
				committed and restarted in this call. */
{
    ulint		root_page_no;
    ibool		drop = !space;
    ulint		zip_size;
    ulint		type;
    index_id_t	index_id;
    rec_t*		rec;
    const byte*	ptr;
    ulint		len;
    dict_index_t*	index;

    ut_ad(mutex_own(&(dict_sys->mutex)));
    ut_a(!dict_table_is_comp(dict_sys->sys_indexes));
    rec = btr_pcur_get_rec(pcur);
    ptr = rec_get_nth_field_old(rec, DICT_SYS_INDEXES_PAGE_NO_FIELD, &len);

    ut_ad(len == 4);

    root_page_no = mtr_read_ulint(ptr, MLOG_4BYTES, mtr);

    if (drop && root_page_no == FIL_NULL) {
        /* The tree has been freed. */

        ut_print_timestamp(stderr);
        fprintf(stderr, "  InnoDB: Trying to TRUNCATE"
                " a missing index of table %s!\n", table->name);
        drop = FALSE;
    }

    ptr = rec_get_nth_field_old(rec,
                                DICT_SYS_INDEXES_SPACE_NO_FIELD, &len);

    ut_ad(len == 4);

    if (drop) {
        space = mtr_read_ulint(ptr, MLOG_4BYTES, mtr);
    }

    zip_size = fil_space_get_zip_size(space);

    if (UNIV_UNLIKELY(zip_size == ULINT_UNDEFINED)) {
        /* It is a single table tablespace and the .ibd file is
        missing: do nothing */

        ut_print_timestamp(stderr);
        fprintf(stderr, "  InnoDB: Trying to TRUNCATE"
                " a missing .ibd file of table %s!\n", table->name);
        return(FIL_NULL);
    }

    ptr = rec_get_nth_field_old(rec,
                                DICT_SYS_INDEXES_TYPE_FIELD, &len);
    ut_ad(len == 4);
    type = mach_read_from_4(ptr);

    ptr = rec_get_nth_field_old(rec, 1, &len);
    ut_ad(len == 8);
    index_id = mach_read_from_8(ptr);

    if (!drop) {

        goto create;
    }

    /* We free all the pages but the root page first; this operation
    may span several mini-transactions */

    btr_free_but_not_root(space, zip_size, root_page_no);

    /* Then we free the root page in the same mini-transaction where
    we create the b-tree and write its new root page number to the
    appropriate field in the SYS_INDEXES record: this mini-transaction
    marks the B-tree totally truncated */

    btr_block_get(space, zip_size, root_page_no, RW_X_LATCH, NULL, mtr);

    btr_free_root(space, zip_size, root_page_no, mtr);
create:
    /* We will temporarily write FIL_NULL to the PAGE_NO field
    in SYS_INDEXES, so that the database will not get into an
    inconsistent state in case it crashes between the mtr_commit()
    below and the following mtr_commit() call. */
    page_rec_write_field(rec, DICT_SYS_INDEXES_PAGE_NO_FIELD,
                         FIL_NULL, mtr);

    /* We will need to commit the mini-transaction in order to avoid
    deadlocks in the btr_create() call, because otherwise we would
    be freeing and allocating pages in the same mini-transaction. */
    btr_pcur_store_position(pcur, mtr);
    mtr_commit(mtr);

    mtr_start(mtr);
    btr_pcur_restore_position(BTR_MODIFY_LEAF, pcur, mtr);

    /* Find the index corresponding to this SYS_INDEXES record. */
    for (index = UT_LIST_GET_FIRST(table->indexes);
            index;
            index = UT_LIST_GET_NEXT(indexes, index)) {
        if (index->id == index_id) {
            root_page_no = btr_create(type, space, zip_size,
                                      index_id, index, mtr);
            index->page = (unsigned int) root_page_no;
            return(root_page_no);
        }
    }

    ut_print_timestamp(stderr);
    fprintf(stderr,
            "  InnoDB: Index %llu of table %s is missing\n"
            "InnoDB: from the data dictionary during TRUNCATE!\n",
            (ullint) index_id,
            table->name);

    return(FIL_NULL);
}
예제 #18
0
/*************************************************************//**
Innobase uses this function to compare two data fields for which the data type
is such that we must compare whole fields or call MySQL to do the comparison
@return	1, 0, -1, if a is greater, equal, less than b, respectively */
static
int
cmp_whole_field(
/*============*/
	ulint		mtype,		/*!< in: main type */
	ulint		prtype,		/*!< in: precise type */
	const byte*	a,		/*!< in: data field */
	unsigned int	a_length,	/*!< in: data field length,
					not UNIV_SQL_NULL */
	const byte*	b,		/*!< in: data field */
	unsigned int	b_length)	/*!< in: data field length,
					not UNIV_SQL_NULL */
{
	float		f_1;
	float		f_2;
	double		d_1;
	double		d_2;
	int		swap_flag	= 1;

	switch (mtype) {

	case DATA_DECIMAL:
		/* Remove preceding spaces */
		for (; a_length && *a == ' '; a++, a_length--);
		for (; b_length && *b == ' '; b++, b_length--);

		if (*a == '-') {
			if (*b != '-') {
				return(-1);
			}

			a++; b++;
			a_length--;
			b_length--;

			swap_flag = -1;

		} else if (*b == '-') {

			return(1);
		}

		while (a_length > 0 && (*a == '+' || *a == '0')) {
			a++; a_length--;
		}

		while (b_length > 0 && (*b == '+' || *b == '0')) {
			b++; b_length--;
		}

		if (a_length != b_length) {
			if (a_length < b_length) {
				return(-swap_flag);
			}

			return(swap_flag);
		}

		while (a_length > 0 && *a == *b) {

			a++; b++; a_length--;
		}

		if (a_length == 0) {

			return(0);
		}

		if (*a > *b) {
			return(swap_flag);
		}

		return(-swap_flag);
	case DATA_DOUBLE:
		d_1 = mach_double_read(a);
		d_2 = mach_double_read(b);

		if (d_1 > d_2) {
			return(1);
		} else if (d_2 > d_1) {
			return(-1);
		}

		return(0);

	case DATA_FLOAT:
		f_1 = mach_float_read(a);
		f_2 = mach_float_read(b);

		if (f_1 > f_2) {
			return(1);
		} else if (f_2 > f_1) {
			return(-1);
		}

		return(0);
	case DATA_BLOB:
		if (prtype & DATA_BINARY_TYPE) {

			ut_print_timestamp(stderr);
			fprintf(stderr,
				"  InnoDB: Error: comparing a binary BLOB"
				" with a character set sensitive\n"
				"InnoDB: comparison!\n");
		}
		/* fall through */
	case DATA_VARMYSQL:
	case DATA_MYSQL:
		return(innobase_mysql_cmp(
			       (int)(prtype & DATA_MYSQL_TYPE_MASK),
			       (uint)dtype_get_charset_coll(prtype),
			       a, a_length, b, b_length));
	default:
		fprintf(stderr,
			"InnoDB: unknown type number %lu\n",
			(ulong) mtype);
		ut_error;
	}

	return(0);
}
예제 #19
0
/*******************************************************************//**
Roll back an active transaction. */
static
void
trx_rollback_active(
/*================*/
	trx_t*	trx)	/*!< in/out: transaction */
{
	mem_heap_t*	heap;
	que_fork_t*	fork;
	que_thr_t*	thr;
	roll_node_t*	roll_node;
	dict_table_t*	table;
	ib_int64_t	rows_to_undo;
	const char*	unit		= "";
	ibool		dictionary_locked = FALSE;

	heap = mem_heap_create(512);

	fork = que_fork_create(NULL, NULL, QUE_FORK_RECOVERY, heap);
	fork->trx = trx;

	thr = que_thr_create(fork, heap);

	roll_node = roll_node_create(heap);

	thr->child = roll_node;
	roll_node->common.parent = thr;

	mutex_enter(&kernel_mutex);

	trx->graph = fork;

	ut_a(thr == que_fork_start_command(fork));

	trx_roll_crash_recv_trx	= trx;
	trx_roll_max_undo_no = trx->undo_no;
	trx_roll_progress_printed_pct = 0;
	rows_to_undo = trx_roll_max_undo_no;

	if (rows_to_undo > 1000000000) {
		rows_to_undo = rows_to_undo / 1000000;
		unit = "M";
	}

	ut_print_timestamp(stderr);
	fprintf(stderr,
		"  InnoDB: Rolling back trx with id " TRX_ID_FMT ", %lu%s"
		" rows to undo\n",
		(ullint) trx->id,
		(ulong) rows_to_undo, unit);
	mutex_exit(&kernel_mutex);

	trx->mysql_thread_id = os_thread_get_curr_id();

	trx->mysql_process_no = os_proc_get_number();

	if (trx_get_dict_operation(trx) != TRX_DICT_OP_NONE) {
		row_mysql_lock_data_dictionary(trx);
		dictionary_locked = TRUE;
	}

	que_run_threads(thr);

	mutex_enter(&kernel_mutex);

	while (trx->que_state != TRX_QUE_RUNNING) {

		mutex_exit(&kernel_mutex);

		fprintf(stderr,
			"InnoDB: Waiting for rollback of trx id "
			TRX_ID_FMT " to end\n",
			(ullint) trx->id);
		os_thread_sleep(100000);

		mutex_enter(&kernel_mutex);
	}

	mutex_exit(&kernel_mutex);

	if (trx_get_dict_operation(trx) != TRX_DICT_OP_NONE
	    && trx->table_id != 0) {

		/* If the transaction was for a dictionary operation, we
		drop the relevant table, if it still exists */

		fprintf(stderr,
			"InnoDB: Dropping table with id %llu"
			" in recovery if it exists\n",
			(ullint) trx->table_id);

		table = dict_table_get_on_id_low(trx->table_id);

		if (table) {
			ulint	err;

			fputs("InnoDB: Table found: dropping table ", stderr);
			ut_print_name(stderr, trx, TRUE, table->name);
			fputs(" in recovery\n", stderr);

			err = row_drop_table_for_mysql(table->name, trx, TRUE);
			trx_commit_for_mysql(trx);

			ut_a(err == (int) DB_SUCCESS);
		}
	}

	if (dictionary_locked) {
		row_mysql_unlock_data_dictionary(trx);
	}

	fprintf(stderr, "\nInnoDB: Rolling back of trx id " TRX_ID_FMT
		" completed\n",
		(ullint) trx->id);
	mem_heap_free(heap);

	trx_roll_crash_recv_trx	= NULL;
}
예제 #20
0
/********************************************************************//**
Flush pages from flash cache.
@return	number of pages have been flushed to tablespace */
UNIV_INTERN
ulint	
fc_flush_to_disk(
/*==================*/
	ibool do_full_io)	/*!< in: whether do full io capacity */
{
	ulint distance;
	byte* page;
	ulint ret;
	ulint space;
	ulint offset;
	ulint page_type;
	ulint i, j;
	ulint pos;
	ulint zip_size;
	ulint block_offset, byte_offset;
	ulint fc_size = fc_get_size();
	ulint fc_blk_size = fc_get_block_size_byte();
	ulint start_offset;
   	ulint data_size;
	fc_block_t *flush_block = NULL;
	ulint c_flush = 0;
    
	ut_ad(!mutex_own(&fc->mutex));
	ut_a(fc->flush_buf->free_pos == 0);

	/* step 1: get the number of blocks need to flush to tablespace */
	flash_cache_mutex_enter();

	distance = fc_get_distance();
	start_offset = fc->flush_off;
    
	if ( distance == 0 ) {
		flash_cache_mutex_exit();
		return 0;
	} else if ( recv_recovery_on ) {
		if ( distance < (( 1.0 * srv_flash_cache_write_cache_pct /100 ) * fc_size)) {
			fc->n_flush_cur = 0;
		} else if ( distance < ( ( 1.0*srv_flash_cache_do_full_io_pct /100 ) * fc_size)) {
			fc->n_flush_cur = ut_min(PCT_IO_FC(10), distance);
		} else {
			fc->n_flush_cur = ut_min(PCT_IO_FC(100), distance);
		}
	} else if ( distance < (( 1.0 * srv_flash_cache_write_cache_pct /100 ) * fc_size)
		&& !do_full_io ) {
		flash_cache_mutex_exit();
		return 0;
	} else if ( distance < (( 1.0 * srv_flash_cache_do_full_io_pct/100 ) * fc_size)
		&& !do_full_io ) {
		fc->n_flush_cur = PCT_IO_FC(srv_fc_write_cache_flush_pct);
	} else {
		ut_ad((distance > ( 1.0 * srv_flash_cache_do_full_io_pct/100 ) * fc_size) 
			|| do_full_io );
		fc->n_flush_cur = ut_min(PCT_IO_FC(srv_fc_full_flush_pct), distance);
	}

	flash_cache_mutex_exit();

	/* step 2: start to flush blocks use async io, set block io_fix IO_FIX_FLUSH */
	i = 0;
	while (i < fc->n_flush_cur) {
		ulint b_space;
		ulint b_offset;
		ulint raw_zip_size;
		ulint size;
		ulint fil_offset;
#ifdef UNIV_FLASH_CACHE_TRACE
		ulint is_v4_blk;
#endif
		byte* page_io;

		flash_cache_mutex_enter();
		pos = ( start_offset + i ) % fc_size;
		flush_block = fc_get_block(pos);

		if (flush_block == NULL) {
			i++;
			flash_cache_mutex_exit();
			continue;
		}

		/* we should get the mutex, as doublewrite may hit this block and invalid the block */
		flash_block_mutex_enter(flush_block->fil_offset);

		flash_cache_mutex_exit();
		
		data_size = fc_block_get_data_size(flush_block);

		if (flush_block->state != BLOCK_READY_FOR_FLUSH) {
			/* if readonly or merge write or already flushed*/
			ut_a (flush_block->state == BLOCK_NOT_USED
				|| flush_block->state == BLOCK_READ_CACHE
				|| flush_block->state == BLOCK_FLUSHED);
			
			i += data_size;

			flash_block_mutex_exit(flush_block->fil_offset);
			if (flush_block->state == BLOCK_NOT_USED) {
				//fc_block_detach(FALSE, flush_block);
				fc_block_free(flush_block);
			}
			
			continue;
		}

		zip_size = fil_space_get_zip_size(flush_block->space);
		if (zip_size == ULINT_UNDEFINED) {
			/* table has been droped, just set it BLOCK_FLUSHED */
#ifdef UNIV_FLASH_CACHE_TRACE
			ut_print_timestamp(fc->f_debug);
			fprintf(fc->f_debug, "space:%lu is droped, the page(%lu, %lu) need not to be flushed.\n",
			(ulong)flush_block->space, (ulong)flush_block->space, (ulong)flush_block->offset);
#endif
			flush_block->state = BLOCK_FLUSHED;
			i += data_size;
			c_flush += data_size;
			flash_block_mutex_exit(flush_block->fil_offset);
			continue;
		}

#ifdef UNIV_FLASH_CACHE_TRACE
		if (flush_block->state != BLOCK_READY_FOR_FLUSH) {
			fc_block_print(flush_block);
			ut_error;
		}
#endif

		flush_block->io_fix |= IO_FIX_FLUSH;

		/* 
		 * we should set block state BLOCK_FLUSHED,  if not, doublewrite may hit this block 
		 * and invalid this block and reduce the dirty count, but when finish flush ,we will 
		 * reduce the dirty count too, so it may reduce twice.
		 */
		flush_block->state = BLOCK_FLUSHED;
		
		/* save the block info, as the block may be invalided by doublewrite after release mutex */
		b_space = flush_block->space;
		b_offset = flush_block->offset;

		raw_zip_size = flush_block->raw_zip_size;
		size = flush_block->size;
		fil_offset = flush_block->fil_offset;
#ifdef UNIV_FLASH_CACHE_TRACE
		is_v4_blk = flush_block->is_v4_blk;
#endif
		/* release the block now, so read can hit in this blocks and read the data */
		flash_block_mutex_exit(flush_block->fil_offset);
		
		/*
		 * Only flush thread will update read_buf and flush_off/round. 
		 * there only single flush thread no need to lock read_buf
		 */
		page = fc->flush_buf->buf + fc->flush_buf->free_pos * fc_blk_size;

		if (raw_zip_size > 0) {
			ut_a((size * fc_blk_size) == UNIV_PAGE_SIZE);
			page_io = fc->flush_zip_read_buf;
		} else {
			page_io = page;
		}

		fc_io_offset(fil_offset, &block_offset, &byte_offset);
		ret = fil_io(OS_FILE_READ, TRUE, FLASH_CACHE_SPACE, 0,
				block_offset, byte_offset, data_size * fc_blk_size,
				page_io, NULL);
	
		if (ret != DB_SUCCESS) {
			ut_print_timestamp(stderr);
			fprintf(stderr, " InnoDB: Flash cache [Error]: unable to read page from flash cache.\n"
				"flash cache flush offset is:%lu.\n", (ulong)(start_offset + i));
			ut_error;
		}		

		if ((flush_block != NULL) && (flush_block->state == BLOCK_NOT_USED)) {
			goto skip;
		}

		/* decompress the compress data */
		if (raw_zip_size > 0) {
#ifdef UNIV_FLASH_CACHE_TRACE
			ulint blk_zip_size_byte;
			if (is_v4_blk) {
				blk_zip_size_byte = raw_zip_size * fc_get_block_size_byte();
			} else {
				blk_zip_size_byte = fc_block_compress_align(raw_zip_size) * fc_get_block_size_byte();
				ut_a((ulint)mach_read_from_4(page_io + FC_ZIP_PAGE_ZIP_RAW_SIZE) == raw_zip_size);				
			} 

			ut_a(page_io);
			ut_a(page);
			ut_a((ulint)mach_read_from_4(page_io + FC_ZIP_PAGE_HEADER) == FC_ZIP_PAGE_CHECKSUM);
			ut_a((ulint)mach_read_from_4(page_io + blk_zip_size_byte - FC_ZIP_PAGE_TAILER)
				== FC_ZIP_PAGE_CHECKSUM);	
			ut_a((ulint)mach_read_from_4(page_io + FC_ZIP_PAGE_SIZE) == blk_zip_size_byte);
			ut_a((ulint)mach_read_from_4(page_io + FC_ZIP_PAGE_ORIG_SIZE) == UNIV_PAGE_SIZE);		
			ut_a((ulint)mach_read_from_4(page_io + FC_ZIP_PAGE_SPACE) == b_space);
			ut_a((ulint)mach_read_from_4(page_io + FC_ZIP_PAGE_OFFSET) == b_offset);	

			/* only qlz can do this check  */
			if (srv_flash_cache_compress_algorithm == FC_BLOCK_COMPRESS_QUICKLZ) {
				if (is_v4_blk) {
					ut_a(raw_zip_size * fc_get_block_size_byte()
						>= (ulint)fc_qlz_size_compressed((const char *)(page_io + FC_ZIP_PAGE_DATA)));
				} else {
					ut_a(raw_zip_size 
						== (ulint)fc_qlz_size_compressed((const char *)(page_io + FC_ZIP_PAGE_DATA)));
				}
				
				ut_a(UNIV_PAGE_SIZE == fc_qlz_size_decompressed((const char *)(page_io + FC_ZIP_PAGE_DATA)));
			}
#endif
			fc_block_do_decompress(DECOMPRESS_FLUSH, page_io, raw_zip_size, page);
		}

		space = mach_read_from_4(page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
		offset = mach_read_from_4(page + FIL_PAGE_OFFSET);

		if ((space != b_space) || (offset != b_offset)) {
			ut_print_timestamp(stderr); 
			fc_block_print(flush_block);
			ut_error;
		}

		if (buf_page_is_corrupted(page, zip_size)) {
			buf_page_print(page, zip_size, BUF_PAGE_PRINT_NO_CRASH);
			ut_error;
		}		
		
		page_type = fil_page_get_type(page);
		if (page_type == FIL_PAGE_INDEX) {
			page_type = 1;
		}
		srv_flash_cache_flush_detail[page_type]++;
		
		ret = fil_io(OS_FILE_WRITE | OS_AIO_SIMULATED_WAKE_LATER, FALSE, space, 
				zip_size, offset, 0, zip_size ? zip_size : UNIV_PAGE_SIZE, page, NULL);
		if (ret != DB_SUCCESS && ret != DB_TABLESPACE_DELETED) {
			ut_print_timestamp(stderr); 
			fc_block_print(flush_block);
			ut_error;
		}

		/* add  UNIV_PAGE_SIZE / fc_blk_size for safe */
		fc->flush_buf->free_pos += UNIV_PAGE_SIZE / fc_blk_size;	

skip:
		i += data_size;
		c_flush += data_size;	

		if ((fc->flush_buf->free_pos + UNIV_PAGE_SIZE / fc_blk_size) >= fc->flush_buf->size) {
			/* FIXME: is it safe to change n_flush, as step 3 will use n_flush */
			fc->n_flush_cur = i;
			break;
		}	
	}

	/* ok, now flush all async io to disk */
	fc_flush_sync_dbfile();

	/* step 3: all the flush blocks have sync to disk,  update the state and io_fix */
	j = 0;
	while (j < fc->n_flush_cur) {

		flash_cache_mutex_enter();
		pos = (start_offset + j) % fc_size;
		flush_block = fc_get_block(pos);

		if (flush_block  == NULL) {
			j++;
			flash_cache_mutex_exit();
			continue;
		}
		/* block state and io_fix may be changed by doublewrite and lru move */
		flash_block_mutex_enter(flush_block->fil_offset);
		flash_cache_mutex_exit();
		if (flush_block->io_fix & IO_FIX_FLUSH) {
			/* the block is already in BLOCK_FLUSHED state */
			flush_block->io_fix &= ~IO_FIX_FLUSH;
		} 
		
		data_size = fc_block_get_data_size(flush_block);
		flash_block_mutex_exit(flush_block->fil_offset);	
		
		j += data_size;
	}

	
	/*
	 * i and j may be different, as the last been flushed block may be invalid by doublewrite,
	 * so maybe i > j
	 */
	
	/* add the actual flushed blocks */
	srv_flash_cache_flush = srv_flash_cache_flush + c_flush; 

	/* step 4: update fc status and flush_off, and wake up threads that are sleep for space  */
	if (i > 0) {
		ut_a(i >= c_flush);

		flash_cache_mutex_enter();
		
		/*
		 * it is safe to inc flush off and sub dirty blocks at this time,
		 * as fc_validate is not work
		 */
		fc_inc_flush_off(i);
		flash_cache_log_mutex_enter();
		fc_log->current_stat->flush_offset = fc->flush_off;
		fc_log->current_stat->flush_round = fc->flush_round;	
		flash_cache_log_mutex_exit();		
		
		ut_a(srv_flash_cache_dirty >= c_flush);		
		srv_flash_cache_dirty -= c_flush;
		
		srv_fc_flush_should_commit_log_flush++;
		os_event_set(fc->wait_space_event);	

		fc->n_flush_cur = 0;
		
		flash_cache_mutex_exit();		
	}

	fc->flush_buf->free_pos = 0;
 
	return c_flush;
}
예제 #21
0
void
trx_free(
/*=====*/
	trx_t*	trx)	/* in, own: trx object */
{
	ut_ad(mutex_own(&kernel_mutex));

	if (trx->declared_to_be_inside_innodb) {
		ut_print_timestamp(stderr);
		fputs("  InnoDB: Error: Freeing a trx which is declared"
		      " to be processing\n"
		      "InnoDB: inside InnoDB.\n", stderr);
		trx_print(stderr, trx, 600);
		putc('\n', stderr);
	}

	if (trx->n_mysql_tables_in_use != 0
	    || trx->mysql_n_tables_locked != 0) {

		ut_print_timestamp(stderr);
		fprintf(stderr,
			"  InnoDB: Error: MySQL is freeing a thd\n"
			"InnoDB: though trx->n_mysql_tables_in_use is %lu\n"
			"InnoDB: and trx->mysql_n_tables_locked is %lu.\n",
			(ulong)trx->n_mysql_tables_in_use,
			(ulong)trx->mysql_n_tables_locked);

		trx_print(stderr, trx, 600);

		ut_print_buf(stderr, trx, sizeof(trx_t));
	}

	ut_a(trx->magic_n == TRX_MAGIC_N);

	trx->magic_n = 11112222;

	ut_a(trx->conc_state == TRX_NOT_STARTED);

	mutex_free(&(trx->undo_mutex));

	ut_a(trx->insert_undo == NULL);
	ut_a(trx->update_undo == NULL);

	if (trx->undo_no_arr) {
		trx_undo_arr_free(trx->undo_no_arr);
	}

	ut_a(UT_LIST_GET_LEN(trx->signals) == 0);
	ut_a(UT_LIST_GET_LEN(trx->reply_signals) == 0);

	ut_a(trx->wait_lock == NULL);
	ut_a(UT_LIST_GET_LEN(trx->wait_thrs) == 0);

	ut_a(!trx->has_search_latch);
	ut_a(!trx->auto_inc_lock);

	ut_a(trx->dict_operation_lock_mode == 0);

	if (trx->lock_heap) {
		mem_heap_free(trx->lock_heap);
	}

	ut_a(UT_LIST_GET_LEN(trx->trx_locks) == 0);

	if (trx->global_read_view_heap) {
		mem_heap_free(trx->global_read_view_heap);
	}

	trx->global_read_view = NULL;

	ut_a(trx->read_view == NULL);

	mem_free(trx);
}
예제 #22
0
/********************************************************************//**
Low-level function which reads a page asynchronously from a file to the
buffer buf_pool if it is not already there, in which case does nothing.
Sets the io_fix flag and sets an exclusive lock on the buffer frame. The
flag is cleared and the x-lock released by an i/o-handler thread.
@return 1 if a read request was queued, 0 if the page already resided
in buf_pool, or if the page is in the doublewrite buffer blocks in
which case it is never read into the pool, or if the tablespace does
not exist or is being dropped 
@return 1 if read request is issued. 0 if it is not */
static
ulint
buf_read_page_low(
/*==============*/
	ulint*	err,	/*!< out: DB_SUCCESS or DB_TABLESPACE_DELETED if we are
			trying to read from a non-existent tablespace, or a
			tablespace which is just now being dropped */
	ibool	sync,	/*!< in: TRUE if synchronous aio is desired */
	ulint	mode,	/*!< in: BUF_READ_IBUF_PAGES_ONLY, ...,
			ORed to OS_AIO_SIMULATED_WAKE_LATER (see below
			at read-ahead functions) */
	ulint	space,	/*!< in: space id */
	ulint	zip_size,/*!< in: compressed page size, or 0 */
	ibool	unzip,	/*!< in: TRUE=request uncompressed page */
	ib_int64_t tablespace_version, /*!< in: if the space memory object has
			this timestamp different from what we are giving here,
			treat the tablespace as dropped; this is a timestamp we
			use to stop dangling page reads from a tablespace
			which we have DISCARDed + IMPORTed back */
	ulint	offset)	/*!< in: page number */
{
	buf_page_t*	bpage;
	ulint		wake_later;

	*err = DB_SUCCESS;

	wake_later = mode & OS_AIO_SIMULATED_WAKE_LATER;
	mode = mode & ~OS_AIO_SIMULATED_WAKE_LATER;

	if (trx_doublewrite && space == TRX_SYS_SPACE
	    && (   (offset >= trx_doublewrite->block1
		    && offset < trx_doublewrite->block1
		    + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE)
		   || (offset >= trx_doublewrite->block2
		       && offset < trx_doublewrite->block2
		       + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE))) {
		ut_print_timestamp(stderr);
		fprintf(stderr,
			"  InnoDB: Warning: trying to read"
			" doublewrite buffer page %lu\n",
			(ulong) offset);

		return(0);
	}

	if (ibuf_bitmap_page(zip_size, offset)
	    || trx_sys_hdr_page(space, offset)) {

		/* Trx sys header is so low in the latching order that we play
		safe and do not leave the i/o-completion to an asynchronous
		i/o-thread. Ibuf bitmap pages must always be read with
		syncronous i/o, to make sure they do not get involved in
		thread deadlocks. */

		sync = TRUE;
	}

	/* The following call will also check if the tablespace does not exist
	or is being dropped; if we succeed in initing the page in the buffer
	pool for read, then DISCARD cannot proceed until the read has
	completed */
	bpage = buf_page_init_for_read(err, mode, space, zip_size, unzip,
				       tablespace_version, offset);
	if (bpage == NULL) {

		return(0);
	}

#ifdef UNIV_DEBUG
	if (buf_debug_prints) {
		fprintf(stderr,
			"Posting read request for page %lu, sync %lu\n",
			(ulong) offset,
			(ulong) sync);
	}
#endif

	ut_ad(buf_page_in_file(bpage));

	if (zip_size) {
		*err = fil_io(OS_FILE_READ | wake_later,
			      sync, space, zip_size, offset, 0, zip_size,
			      bpage->zip.data, bpage);
	} else {
		ut_a(buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE);

		*err = fil_io(OS_FILE_READ | wake_later,
			      sync, space, 0, offset, 0, UNIV_PAGE_SIZE,
			      ((buf_block_t*) bpage)->frame, bpage);
	}
	ut_a(*err == DB_SUCCESS);

	if (sync) {
		/* The i/o is already completed when we arrive from
		fil_read */
		buf_page_io_complete(bpage);
	}

	return(1);
}
예제 #23
0
파일: trx0trx.c 프로젝트: huahuaxu/MySQL5.1
/********************************************************************//**
Frees a transaction object. */
UNIV_INTERN
void
trx_free(
/*=====*/
	trx_t*	trx)	/*!< in, own: trx object */
{
	ut_ad(mutex_own(&kernel_mutex));

	if (trx->declared_to_be_inside_innodb) {
		ut_print_timestamp(stderr);
		fputs("  InnoDB: Error: Freeing a trx which is declared"
		      " to be processing\n"
		      "InnoDB: inside InnoDB.\n", stderr);
		trx_print(stderr, trx, 600);
		putc('\n', stderr);

		/* This is an error but not a fatal error. We must keep
		the counters like srv_conc_n_threads accurate. */
		srv_conc_force_exit_innodb(trx);
	}

	if (trx->n_mysql_tables_in_use != 0
	    || trx->mysql_n_tables_locked != 0) {

		ut_print_timestamp(stderr);
		fprintf(stderr,
			"  InnoDB: Error: MySQL is freeing a thd\n"
			"InnoDB: though trx->n_mysql_tables_in_use is %lu\n"
			"InnoDB: and trx->mysql_n_tables_locked is %lu.\n",
			(ulong)trx->n_mysql_tables_in_use,
			(ulong)trx->mysql_n_tables_locked);

		trx_print(stderr, trx, 600);

		ut_print_buf(stderr, trx, sizeof(trx_t));
		putc('\n', stderr);
	}

	ut_a(trx->magic_n == TRX_MAGIC_N);

	trx->magic_n = 11112222;

	ut_a(trx->conc_state == TRX_NOT_STARTED);

	mutex_free(&(trx->undo_mutex));

	ut_a(trx->insert_undo == NULL);
	ut_a(trx->update_undo == NULL);

	if (trx->undo_no_arr) {
		trx_undo_arr_free(trx->undo_no_arr);
	}

	ut_a(UT_LIST_GET_LEN(trx->signals) == 0);
	ut_a(UT_LIST_GET_LEN(trx->reply_signals) == 0);

	ut_a(trx->wait_lock == NULL);
	ut_a(UT_LIST_GET_LEN(trx->wait_thrs) == 0);

	ut_a(!trx->has_search_latch);

	ut_a(trx->dict_operation_lock_mode == 0);

	if (trx->lock_heap) {
		mem_heap_free(trx->lock_heap);
	}

	ut_a(UT_LIST_GET_LEN(trx->trx_locks) == 0);

	if (trx->global_read_view_heap) {
		mem_heap_free(trx->global_read_view_heap);
	}

	trx->global_read_view = NULL;

	ut_a(trx->read_view == NULL);

	ut_a(ib_vector_is_empty(trx->autoinc_locks));
	/* We allocated a dedicated heap for the vector. */
	ib_vector_free(trx->autoinc_locks);

	mem_free(trx);
}
예제 #24
0
/*******************************************************************//**
Rollback or clean up any incomplete transactions which were
encountered in crash recovery.  If the transaction already was
committed, then we clean up a possible insert undo log. If the
transaction was not yet committed, then we roll it back. */
UNIV_INTERN
void
trx_rollback_or_clean_recovered(
/*============================*/
	ibool	all)	/*!< in: FALSE=roll back dictionary transactions;
			TRUE=roll back all non-PREPARED transactions */
{
	trx_t*	trx;

	mutex_enter(&kernel_mutex);

	if (!UT_LIST_GET_FIRST(trx_sys->trx_list)) {
		goto leave_function;
	}

	if (all) {
		fprintf(stderr,
			"InnoDB: Starting in background the rollback"
			" of uncommitted transactions\n");
	}

	mutex_exit(&kernel_mutex);

loop:
	mutex_enter(&kernel_mutex);

	for (trx = UT_LIST_GET_FIRST(trx_sys->trx_list); trx;
	     trx = UT_LIST_GET_NEXT(trx_list, trx)) {
		if (!trx->is_recovered) {
			continue;
		}

		switch (trx->conc_state) {
		case TRX_NOT_STARTED:
		case TRX_PREPARED:
			continue;

		case TRX_COMMITTED_IN_MEMORY:
			mutex_exit(&kernel_mutex);
			fprintf(stderr,
				"InnoDB: Cleaning up trx with id "
				TRX_ID_FMT "\n",
				(ullint) trx->id);
			trx_cleanup_at_db_startup(trx);
			goto loop;

		case TRX_ACTIVE:
			if (all || trx_get_dict_operation(trx)
			    != TRX_DICT_OP_NONE) {
				mutex_exit(&kernel_mutex);
				trx_rollback_active(trx);
				goto loop;
			}
		}
	}

	if (all) {
		ut_print_timestamp(stderr);
		fprintf(stderr,
			"  InnoDB: Rollback of non-prepared"
			" transactions completed\n");
	}

leave_function:
	mutex_exit(&kernel_mutex);
}
예제 #25
0
파일: ut0mem.c 프로젝트: H0bby/GCS-SQL
/**********************************************************************//**
Allocates memory.
@return	own: allocated memory */
UNIV_INTERN
void*
ut_malloc_low(
/*==========*/
	ulint	n,		/*!< in: number of bytes to allocate */
	ibool	assert_on_error)/*!< in: if TRUE, we crash mysqld if the
				memory cannot be allocated */
{
#ifndef UNIV_HOTBACKUP
	ulint	retry_count;
	void*	ret;

	if (UNIV_LIKELY(srv_use_sys_malloc)) {
		ret = malloc(n);
		ut_a(ret || !assert_on_error);

		return(ret);
	}

	ut_ad((sizeof(ut_mem_block_t) % 8) == 0); /* check alignment ok */
	ut_a(ut_mem_block_list_inited);

	retry_count = 0;
retry:
	os_fast_mutex_lock(&ut_list_mutex);

	ret = malloc(n + sizeof(ut_mem_block_t));

	if (ret == NULL && retry_count < 60) {
		if (retry_count == 0) {
			ut_print_timestamp(stderr);

			fprintf(stderr,
				"  InnoDB: Error: cannot allocate"
				" %lu bytes of\n"
				"InnoDB: memory with malloc!"
				" Total allocated memory\n"
				"InnoDB: by InnoDB %lu bytes."
				" Operating system errno: %lu\n"
				"InnoDB: Check if you should"
				" increase the swap file or\n"
				"InnoDB: ulimits of your operating system.\n"
				"InnoDB: On FreeBSD check you"
				" have compiled the OS with\n"
				"InnoDB: a big enough maximum process size.\n"
				"InnoDB: Note that in most 32-bit"
				" computers the process\n"
				"InnoDB: memory space is limited"
				" to 2 GB or 4 GB.\n"
				"InnoDB: We keep retrying"
				" the allocation for 60 seconds...\n",
				(ulong) n, (ulong) ut_total_allocated_memory,
#ifdef __WIN__
				(ulong) GetLastError()
#else
				(ulong) errno
#endif
				);
		}

		os_fast_mutex_unlock(&ut_list_mutex);

		/* Sleep for a second and retry the allocation; maybe this is
		just a temporary shortage of memory */

		os_thread_sleep(1000000);

		retry_count++;

		goto retry;
	}

	if (ret == NULL) {
		/* Flush stderr to make more probable that the error
		message gets in the error file before we generate a seg
		fault */

		fflush(stderr);

		os_fast_mutex_unlock(&ut_list_mutex);

		/* Make an intentional seg fault so that we get a stack
		trace */
		if (assert_on_error) {
			ut_print_timestamp(stderr);

			fprintf(stderr,
				"  InnoDB: We now intentionally"
				" generate a seg fault so that\n"
				"InnoDB: on Linux we get a stack trace.\n");

			if (*ut_mem_null_ptr) ut_mem_null_ptr = 0;
		} else {
			return(NULL);
		}
	}

	UNIV_MEM_ALLOC(ret, n + sizeof(ut_mem_block_t));

	((ut_mem_block_t*)ret)->size = n + sizeof(ut_mem_block_t);
	((ut_mem_block_t*)ret)->magic_n = UT_MEM_MAGIC_N;

	ut_total_allocated_memory += n + sizeof(ut_mem_block_t);

	UT_LIST_ADD_FIRST(mem_block_list, ut_mem_block_list,
			  ((ut_mem_block_t*)ret));
	os_fast_mutex_unlock(&ut_list_mutex);

	return((void*)((byte*)ret + sizeof(ut_mem_block_t)));
#else /* !UNIV_HOTBACKUP */
	void*	ret = malloc(n);
	ut_a(ret || !assert_on_error);

	return(ret);
#endif /* !UNIV_HOTBACKUP */
}
예제 #26
0
파일: buf0rea.c 프로젝트: MPlatform/mariadb
/********************************************************************//**
Low-level function which reads a page asynchronously from a file to the
buffer buf_pool if it is not already there, in which case does nothing.
Sets the io_fix flag and sets an exclusive lock on the buffer frame. The
flag is cleared and the x-lock released by an i/o-handler thread.
@return 1 if a read request was queued, 0 if the page already resided
in buf_pool, or if the page is in the doublewrite buffer blocks in
which case it is never read into the pool, or if the tablespace does
not exist or is being dropped 
@return 1 if read request is issued. 0 if it is not */
UNIV_INTERN
ulint
buf_read_page_low(
/*==============*/
	ulint*	err,	/*!< out: DB_SUCCESS or DB_TABLESPACE_DELETED if we are
			trying to read from a non-existent tablespace, or a
			tablespace which is just now being dropped */
	ibool	sync,	/*!< in: TRUE if synchronous aio is desired */
	ulint	mode,	/*!< in: BUF_READ_IBUF_PAGES_ONLY, ...,
			ORed to OS_AIO_SIMULATED_WAKE_LATER (see below
			at read-ahead functions) */
	ulint	space,	/*!< in: space id */
	ulint	zip_size,/*!< in: compressed page size, or 0 */
	ibool	unzip,	/*!< in: TRUE=request uncompressed page */
	ib_int64_t tablespace_version, /*!< in: if the space memory object has
			this timestamp different from what we are giving here,
			treat the tablespace as dropped; this is a timestamp we
			use to stop dangling page reads from a tablespace
			which we have DISCARDed + IMPORTed back */
	ulint	offset,	/*!< in: page number */
	trx_t*	trx)
{
	buf_page_t*	bpage;
	ulint		wake_later;

	*err = DB_SUCCESS;

	wake_later = mode & OS_AIO_SIMULATED_WAKE_LATER;
	mode = mode & ~OS_AIO_SIMULATED_WAKE_LATER;

	if (trx_doublewrite
	    && (space == TRX_SYS_SPACE
		|| (srv_doublewrite_file && space == TRX_DOUBLEWRITE_SPACE))
	    && (   (offset >= trx_doublewrite->block1
		    && offset < trx_doublewrite->block1
		    + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE)
		   || (offset >= trx_doublewrite->block2
		       && offset < trx_doublewrite->block2
		       + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE))) {
		ut_print_timestamp(stderr);
		fprintf(stderr,
			"  InnoDB: Warning: trying to read"
			" doublewrite buffer page %lu\n",
			(ulong) offset);

		return(0);
	}

	if (ibuf_bitmap_page(zip_size, offset)
	    || trx_sys_hdr_page(space, offset)) {

		/* Trx sys header is so low in the latching order that we play
		safe and do not leave the i/o-completion to an asynchronous
		i/o-thread. Ibuf bitmap pages must always be read with
		syncronous i/o, to make sure they do not get involved in
		thread deadlocks. */

		sync = TRUE;
	}

	/* The following call will also check if the tablespace does not exist
	or is being dropped; if we succeed in initing the page in the buffer
	pool for read, then DISCARD cannot proceed until the read has
	completed */
	bpage = buf_page_init_for_read(err, mode, space, zip_size, unzip,
				       tablespace_version, offset);
	if (bpage == NULL) {
		/* bugfix: http://bugs.mysql.com/bug.php?id=43948 */
		if (recv_recovery_is_on() && *err == DB_TABLESPACE_DELETED) {
			/* hashed log recs must be treated here */
			recv_addr_t*    recv_addr;

			mutex_enter(&(recv_sys->mutex));

			if (recv_sys->apply_log_recs == FALSE) {
				mutex_exit(&(recv_sys->mutex));
				goto not_to_recover;
			}

			/* recv_get_fil_addr_struct() */
			recv_addr = HASH_GET_FIRST(recv_sys->addr_hash,
					hash_calc_hash(ut_fold_ulint_pair(space, offset),
						recv_sys->addr_hash));
			while (recv_addr) {
				if ((recv_addr->space == space)
					&& (recv_addr->page_no == offset)) {
					break;
				}
				recv_addr = HASH_GET_NEXT(addr_hash, recv_addr);
			}

			if ((recv_addr == NULL)
			    || (recv_addr->state == RECV_BEING_PROCESSED)
			    || (recv_addr->state == RECV_PROCESSED)) {
				mutex_exit(&(recv_sys->mutex));
				goto not_to_recover;
			}

			fprintf(stderr, " (cannot find space: %lu)", space);
			recv_addr->state = RECV_PROCESSED;

			ut_a(recv_sys->n_addrs);
			recv_sys->n_addrs--;

			mutex_exit(&(recv_sys->mutex));
		}
not_to_recover:

		return(0);
	}

#ifdef UNIV_DEBUG
	if (buf_debug_prints) {
		fprintf(stderr,
			"Posting read request for page %lu, sync %lu\n",
			(ulong) offset,
			(ulong) sync);
	}
#endif

	ut_ad(buf_page_in_file(bpage));

	if(sync) {
		thd_wait_begin(NULL, THD_WAIT_DISKIO);
	}

	if (zip_size) {
		*err = _fil_io(OS_FILE_READ | wake_later,
			      sync, space, zip_size, offset, 0, zip_size,
			      bpage->zip.data, bpage, trx);
	} else {
		ut_a(buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE);

		*err = _fil_io(OS_FILE_READ | wake_later,
			      sync, space, 0, offset, 0, UNIV_PAGE_SIZE,
			      ((buf_block_t*) bpage)->frame, bpage, trx);
	}

	if (sync) {
		thd_wait_end(NULL);
	}

	if (*err == DB_TABLESPACE_DELETED) {
		buf_read_page_handle_error(bpage);
		return(0);
	}

	if (srv_pass_corrupt_table) {
		if (*err != DB_SUCCESS) {
			bpage->is_corrupt = TRUE;
		}
	} else {
	ut_a(*err == DB_SUCCESS);
	}

	if (sync) {
		/* The i/o is already completed when we arrive from
		fil_read */
		if (!buf_page_io_complete(bpage)) {
			return(0);
		}
	}

	return(1);
}
예제 #27
0
파일: trx0trx.c 프로젝트: huahuaxu/MySQL5.1
/**********************************************************************//**
This function is used to find number of prepared transactions and
their transaction objects for a recovery.
@return	number of prepared transactions stored in xid_list */
UNIV_INTERN
int
trx_recover_for_mysql(
/*==================*/
	XID*	xid_list,	/*!< in/out: prepared transactions */
	ulint	len)		/*!< in: number of slots in xid_list */
{
	trx_t*	trx;
	ulint	count = 0;

	ut_ad(xid_list);
	ut_ad(len);

	/* We should set those transactions which are in the prepared state
	to the xid_list */

	mutex_enter(&kernel_mutex);

	trx = UT_LIST_GET_FIRST(trx_sys->trx_list);

	while (trx) {
		if (trx->conc_state == TRX_PREPARED) {
			xid_list[count] = trx->xid;

			if (count == 0) {
				ut_print_timestamp(stderr);
				fprintf(stderr,
					"  InnoDB: Starting recovery for"
					" XA transactions...\n");
			}

			ut_print_timestamp(stderr);
			fprintf(stderr,
				"  InnoDB: Transaction " TRX_ID_FMT " in"
				" prepared state after recovery\n",
				TRX_ID_PREP_PRINTF(trx->id));

			ut_print_timestamp(stderr);
			fprintf(stderr,
				"  InnoDB: Transaction contains changes"
				" to %lu rows\n",
				(ulong) ut_conv_dulint_to_longlong(
					trx->undo_no));

			count++;

			if (count == len) {
				break;
			}
		}

		trx = UT_LIST_GET_NEXT(trx_list, trx);
	}

	mutex_exit(&kernel_mutex);

	if (count > 0){
		ut_print_timestamp(stderr);
		fprintf(stderr,
			"  InnoDB: %lu transactions in prepared state"
			" after recovery\n",
			(ulong) count);
	}

	return ((int) count);
}
예제 #28
0
buf_block_t*
buf_LRU_get_free_block(void)
/*========================*/
				/* out: the free control block; also if AWE is
				used, it is guaranteed that the block has its
				page mapped to a frame when we return */
{
	buf_block_t*	block		= NULL;
	ibool		freed;
	ulint		n_iterations	= 1;
	ibool		mon_value_was	= FALSE;
	ibool		started_monitor	= FALSE;
loop:
	mutex_enter(&(buf_pool->mutex));

	if (!recv_recovery_on && UT_LIST_GET_LEN(buf_pool->free)
	    + UT_LIST_GET_LEN(buf_pool->LRU) < buf_pool->max_size / 20) {
		ut_print_timestamp(stderr);

		fprintf(stderr,
			"  InnoDB: ERROR: over 95 percent of the buffer pool"
			" is occupied by\n"
			"InnoDB: lock heaps or the adaptive hash index!"
			" Check that your\n"
			"InnoDB: transactions do not set too many row locks.\n"
			"InnoDB: Your buffer pool size is %lu MB."
			" Maybe you should make\n"
			"InnoDB: the buffer pool bigger?\n"
			"InnoDB: We intentionally generate a seg fault"
			" to print a stack trace\n"
			"InnoDB: on Linux!\n",
			(ulong) (buf_pool->curr_size
				 / (1024 * 1024 / UNIV_PAGE_SIZE)));

		ut_error;

	} else if (!recv_recovery_on && UT_LIST_GET_LEN(buf_pool->free)
		   + UT_LIST_GET_LEN(buf_pool->LRU) < buf_pool->max_size / 3) {

		if (!buf_lru_switched_on_innodb_mon) {

	   		/* Over 67 % of the buffer pool is occupied by lock
			heaps or the adaptive hash index. This may be a memory
			leak! */

			ut_print_timestamp(stderr);
			fprintf(stderr,
				"  InnoDB: WARNING: over 67 percent of"
				" the buffer pool is occupied by\n"
				"InnoDB: lock heaps or the adaptive"
				" hash index! Check that your\n"
				"InnoDB: transactions do not set too many"
				" row locks.\n"
				"InnoDB: Your buffer pool size is %lu MB."
				" Maybe you should make\n"
				"InnoDB: the buffer pool bigger?\n"
				"InnoDB: Starting the InnoDB Monitor to print"
				" diagnostics, including\n"
				"InnoDB: lock heap and hash index sizes.\n",
				(ulong) (buf_pool->curr_size
					 / (1024 * 1024 / UNIV_PAGE_SIZE)));

			buf_lru_switched_on_innodb_mon = TRUE;
			srv_print_innodb_monitor = TRUE;
			os_event_set(srv_lock_timeout_thread_event);
		}
	} else if (buf_lru_switched_on_innodb_mon) {

		/* Switch off the InnoDB Monitor; this is a simple way
		to stop the monitor if the situation becomes less urgent,
		but may also surprise users if the user also switched on the
		monitor! */

		buf_lru_switched_on_innodb_mon = FALSE;
		srv_print_innodb_monitor = FALSE;
	}

	/* If there is a block in the free list, take it */
	if (UT_LIST_GET_LEN(buf_pool->free) > 0) {

		block = UT_LIST_GET_FIRST(buf_pool->free);
		ut_a(block->in_free_list);
		UT_LIST_REMOVE(free, buf_pool->free, block);
		block->in_free_list = FALSE;
		ut_a(block->state != BUF_BLOCK_FILE_PAGE);
		ut_a(!block->in_LRU_list);

		if (srv_use_awe) {
			if (block->frame) {
				/* Remove from the list of mapped pages */

				UT_LIST_REMOVE(awe_LRU_free_mapped,
					       buf_pool->awe_LRU_free_mapped,
					       block);
			} else {
				/* We map the page to a frame; second param
				FALSE below because we do not want it to be
				added to the awe_LRU_free_mapped list */

				buf_awe_map_page_to_frame(block, FALSE);
			}
		}

		mutex_enter(&block->mutex);

		block->state = BUF_BLOCK_READY_FOR_USE;
		UNIV_MEM_ALLOC(block->frame, UNIV_PAGE_SIZE);

		mutex_exit(&block->mutex);

		mutex_exit(&(buf_pool->mutex));

		if (started_monitor) {
			srv_print_innodb_monitor = mon_value_was;
		}

		return(block);
	}

	/* If no block was in the free list, search from the end of the LRU
	list and try to free a block there */

	mutex_exit(&(buf_pool->mutex));

	freed = buf_LRU_search_and_free_block(n_iterations);

	if (freed > 0) {
		goto loop;
	}

	if (n_iterations > 30) {
		ut_print_timestamp(stderr);
		fprintf(stderr,
			"InnoDB: Warning: difficult to find free blocks from\n"
			"InnoDB: the buffer pool (%lu search iterations)!"
			" Consider\n"
			"InnoDB: increasing the buffer pool size.\n"
			"InnoDB: It is also possible that"
			" in your Unix version\n"
			"InnoDB: fsync is very slow, or"
			" completely frozen inside\n"
			"InnoDB: the OS kernel. Then upgrading to"
			" a newer version\n"
			"InnoDB: of your operating system may help."
			" Look at the\n"
			"InnoDB: number of fsyncs in diagnostic info below.\n"
			"InnoDB: Pending flushes (fsync) log: %lu;"
			" buffer pool: %lu\n"
			"InnoDB: %lu OS file reads, %lu OS file writes,"
			" %lu OS fsyncs\n"
			"InnoDB: Starting InnoDB Monitor to print further\n"
			"InnoDB: diagnostics to the standard output.\n",
			(ulong) n_iterations,
			(ulong) fil_n_pending_log_flushes,
			(ulong) fil_n_pending_tablespace_flushes,
			(ulong) os_n_file_reads, (ulong) os_n_file_writes,
			(ulong) os_n_fsyncs);

		mon_value_was = srv_print_innodb_monitor;
		started_monitor = TRUE;
		srv_print_innodb_monitor = TRUE;
		os_event_set(srv_lock_timeout_thread_event);
	}

	/* No free block was found: try to flush the LRU list */

	buf_flush_free_margin();
	++srv_buf_pool_wait_free;

	os_aio_simulated_wake_handler_threads();

	mutex_enter(&(buf_pool->mutex));

	if (buf_pool->LRU_flush_ended > 0) {
		/* We have written pages in an LRU flush. To make the insert
		buffer more efficient, we try to move these pages to the free
		list. */

		mutex_exit(&(buf_pool->mutex));

		buf_LRU_try_free_flushed_blocks();
	} else {
		mutex_exit(&(buf_pool->mutex));
	}

	if (n_iterations > 10) {

		os_thread_sleep(500000);
	}

	n_iterations++;

	goto loop;
}
예제 #29
0
/***********************************************************************//**
Updates the last not yet purged history log info in rseg when we have purged
a whole undo log. Advances also purge_sys->purge_trx_no past the purged log. */
static
void
trx_purge_rseg_get_next_history_log(
    /*================================*/
    trx_rseg_t*	rseg)	/*!< in: rollback segment */
{
    page_t*		undo_page;
    trx_ulogf_t*	log_hdr;
    fil_addr_t	prev_log_addr;
    trx_id_t	trx_no;
    ibool		del_marks;
    mtr_t		mtr;
    rseg_queue_t	rseg_queue;
    const void*	ptr;

    mutex_enter(&(rseg->mutex));

    ut_a(rseg->last_page_no != FIL_NULL);

    purge_sys->purge_trx_no = rseg->last_trx_no + 1;
    purge_sys->purge_undo_no = 0;
    purge_sys->next_stored = FALSE;

    mtr_start(&mtr);

    undo_page = trx_undo_page_get_s_latched(
                    rseg->space, rseg->zip_size, rseg->last_page_no, &mtr);

    log_hdr = undo_page + rseg->last_offset;

    /* Increase the purge page count by one for every handled log */

    purge_sys->n_pages_handled++;

    prev_log_addr = trx_purge_get_log_from_hist(
                        flst_get_prev_addr(log_hdr + TRX_UNDO_HISTORY_NODE, &mtr));

    if (prev_log_addr.page == FIL_NULL) {
        /* No logs left in the history list */

        rseg->last_page_no = FIL_NULL;

        mutex_exit(&(rseg->mutex));
        mtr_commit(&mtr);

        mutex_enter(&kernel_mutex);

        /* Add debug code to track history list corruption reported
        on the MySQL mailing list on Nov 9, 2004. The fut0lst.c
        file-based list was corrupt. The prev node pointer was
        FIL_NULL, even though the list length was over 8 million nodes!
        We assume that purge truncates the history list in large
        size pieces, and if we here reach the head of the list, the
        list cannot be longer than 2000 000 undo logs now. */

        if (trx_sys->rseg_history_len > 2000000) {
            ut_print_timestamp(stderr);
            fprintf(stderr,
                    "  InnoDB: Warning: purge reached the"
                    " head of the history list,\n"
                    "InnoDB: but its length is still"
                    " reported as %lu! Make a detailed bug\n"
                    "InnoDB: report, and submit it"
                    " to http://bugs.mysql.com\n",
                    (ulong) trx_sys->rseg_history_len);
        }

        mutex_exit(&kernel_mutex);

        return;
    }

    mutex_exit(&(rseg->mutex));
    mtr_commit(&mtr);

    /* Read the trx number and del marks from the previous log header */
    mtr_start(&mtr);

    log_hdr = trx_undo_page_get_s_latched(rseg->space, rseg->zip_size,
                                          prev_log_addr.page, &mtr)
              + prev_log_addr.boffset;

    trx_no = mach_read_from_8(log_hdr + TRX_UNDO_TRX_NO);

    del_marks = mach_read_from_2(log_hdr + TRX_UNDO_DEL_MARKS);

    mtr_commit(&mtr);

    mutex_enter(&(rseg->mutex));

    rseg->last_page_no = prev_log_addr.page;
    rseg->last_offset = prev_log_addr.boffset;
    rseg->last_trx_no = trx_no;
    rseg->last_del_marks = del_marks;

    rseg_queue.rseg = rseg;
    rseg_queue.trx_no = rseg->last_trx_no;

    /* Purge can also produce events, however these are already ordered
    in the rollback segment and any user generated event will be greater
    than the events that Purge produces. ie. Purge can never produce
    events from an empty rollback segment. */

    mutex_enter(&purge_sys->bh_mutex);

    ptr = ib_bh_push(purge_sys->ib_bh, &rseg_queue);
    ut_a(ptr != NULL);

    mutex_exit(&purge_sys->bh_mutex);

    mutex_exit(&(rseg->mutex));
}
예제 #30
0
/************************************************************************
Flushes possible buffered writes from the doublewrite memory buffer to disk,
and also wakes up the aio thread if simulated aio is used. It is very
important to call this function after a batch of writes has been posted,
and also when we may have to wait for a page latch! Otherwise a deadlock
of threads can occur. */
static
void
buf_flush_buffered_writes(void)
/*===========================*/
{
	buf_block_t*	block;
	byte*		write_buf;
	ulint		len;
	ulint		len2;
	ulint		i;

	if (!srv_use_doublewrite_buf || trx_doublewrite == NULL) {
		os_aio_simulated_wake_handler_threads();

		return;
	}

	mutex_enter(&(trx_doublewrite->mutex));

	/* Write first to doublewrite buffer blocks. We use synchronous
	aio and thus know that file write has been completed when the
	control returns. */

	if (trx_doublewrite->first_free == 0) {

		mutex_exit(&(trx_doublewrite->mutex));

		return;
	}

	for (i = 0; i < trx_doublewrite->first_free; i++) {

		block = trx_doublewrite->buf_block_arr[i];
		ut_a(block->state == BUF_BLOCK_FILE_PAGE);

		if (mach_read_from_4(block->frame + FIL_PAGE_LSN + 4)
		    != mach_read_from_4(block->frame + UNIV_PAGE_SIZE
					- FIL_PAGE_END_LSN_OLD_CHKSUM + 4)) {
			ut_print_timestamp(stderr);
			fprintf(stderr,
				"  InnoDB: ERROR: The page to be written"
				" seems corrupt!\n"
				"InnoDB: The lsn fields do not match!"
				" Noticed in the buffer pool\n"
				"InnoDB: before posting to the"
				" doublewrite buffer.\n");
		}

		if (block->check_index_page_at_flush
		    && !page_simple_validate(block->frame)) {

			buf_page_print(block->frame);

			ut_print_timestamp(stderr);
			fprintf(stderr,
				"  InnoDB: Apparent corruption of an"
				" index page n:o %lu in space %lu\n"
				"InnoDB: to be written to data file."
				" We intentionally crash server\n"
				"InnoDB: to prevent corrupt data"
				" from ending up in data\n"
				"InnoDB: files.\n",
				(ulong) block->offset, (ulong) block->space);

			ut_error;
		}
	}

	/* increment the doublewrite flushed pages counter */
	srv_dblwr_pages_written+= trx_doublewrite->first_free;
	srv_dblwr_writes++;

	if (trx_doublewrite->first_free > TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
		len = TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE;
	} else {
		len = trx_doublewrite->first_free * UNIV_PAGE_SIZE;
	}

	fil_io(OS_FILE_WRITE,
	       TRUE, TRX_SYS_SPACE,
	       trx_doublewrite->block1, 0, len,
	       (void*)trx_doublewrite->write_buf, NULL);

	write_buf = trx_doublewrite->write_buf;

	for (len2 = 0; len2 + UNIV_PAGE_SIZE <= len; len2 += UNIV_PAGE_SIZE) {
		if (mach_read_from_4(write_buf + len2 + FIL_PAGE_LSN + 4)
		    != mach_read_from_4(write_buf + len2 + UNIV_PAGE_SIZE
					- FIL_PAGE_END_LSN_OLD_CHKSUM + 4)) {
			ut_print_timestamp(stderr);
			fprintf(stderr,
				"  InnoDB: ERROR: The page to be written"
				" seems corrupt!\n"
				"InnoDB: The lsn fields do not match!"
				" Noticed in the doublewrite block1.\n");
		}
	}

	if (trx_doublewrite->first_free > TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
		len = (trx_doublewrite->first_free
		       - TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) * UNIV_PAGE_SIZE;

		fil_io(OS_FILE_WRITE,
		       TRUE, TRX_SYS_SPACE,
		       trx_doublewrite->block2, 0, len,
		       (void*)(trx_doublewrite->write_buf
			       + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE
			       * UNIV_PAGE_SIZE),
		       NULL);

		write_buf = trx_doublewrite->write_buf
			+ TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE;
		for (len2 = 0; len2 + UNIV_PAGE_SIZE <= len;
		     len2 += UNIV_PAGE_SIZE) {
			if (mach_read_from_4(write_buf + len2
					     + FIL_PAGE_LSN + 4)
			    != mach_read_from_4(write_buf + len2
						+ UNIV_PAGE_SIZE
						- FIL_PAGE_END_LSN_OLD_CHKSUM
						+ 4)) {
				ut_print_timestamp(stderr);
				fprintf(stderr,
					"  InnoDB: ERROR: The page to be"
					" written seems corrupt!\n"
					"InnoDB: The lsn fields do not match!"
					" Noticed in"
					" the doublewrite block2.\n");
			}
		}
	}

	/* Now flush the doublewrite buffer data to disk */

	fil_flush(TRX_SYS_SPACE);

	/* We know that the writes have been flushed to disk now
	and in recovery we will find them in the doublewrite buffer
	blocks. Next do the writes to the intended positions. */

	for (i = 0; i < trx_doublewrite->first_free; i++) {
		block = trx_doublewrite->buf_block_arr[i];

		if (mach_read_from_4(block->frame + FIL_PAGE_LSN + 4)
		    != mach_read_from_4(block->frame + UNIV_PAGE_SIZE
					- FIL_PAGE_END_LSN_OLD_CHKSUM + 4)) {
			ut_print_timestamp(stderr);
			fprintf(stderr,
				"  InnoDB: ERROR: The page to be written"
				" seems corrupt!\n"
				"InnoDB: The lsn fields do not match!"
				" Noticed in the buffer pool\n"
				"InnoDB: after posting and flushing"
				" the doublewrite buffer.\n"
				"InnoDB: Page buf fix count %lu,"
				" io fix %lu, state %lu\n",
				(ulong)block->buf_fix_count,
				(ulong)block->io_fix,
				(ulong)block->state);
		}
		ut_a(block->state == BUF_BLOCK_FILE_PAGE);

		fil_io(OS_FILE_WRITE | OS_AIO_SIMULATED_WAKE_LATER,
		       FALSE, block->space, block->offset, 0, UNIV_PAGE_SIZE,
		       (void*)block->frame, (void*)block);
	}

	/* Wake possible simulated aio thread to actually post the
	writes to the operating system */

	os_aio_simulated_wake_handler_threads();

	/* Wait that all async writes to tablespaces have been posted to
	the OS */

	os_aio_wait_until_no_pending_writes();

	/* Now we flush the data to disk (for example, with fsync) */

	fil_flush_file_spaces(FIL_TABLESPACE);

	/* We can now reuse the doublewrite memory buffer: */

	trx_doublewrite->first_free = 0;

	mutex_exit(&(trx_doublewrite->mutex));
}