/*********************************************************************//**
Creates the memory copies for rollback segments and initializes the
rseg list and array in trx_sys at a database startup. */
UNIV_INTERN
void
trx_rseg_list_and_array_init(
/*=========================*/
	trx_sysf_t*	sys_header,	/*!< in: trx system header */
	mtr_t*		mtr)		/*!< in: mtr */
{
	ulint	i;
	ulint	page_no;
	ulint	space;

	UT_LIST_INIT(trx_sys->rseg_list);

	trx_sys->rseg_history_len = 0;

	for (i = 0; i < TRX_SYS_N_RSEGS; i++) {

		page_no = trx_sysf_rseg_get_page_no(sys_header, i, mtr);

		if (page_no == FIL_NULL) {

			trx_sys_set_nth_rseg(trx_sys, i, NULL);
		} else {
			ulint	zip_size;

			space = trx_sysf_rseg_get_space(sys_header, i, mtr);

			zip_size = space ? fil_space_get_zip_size(space) : 0;

			trx_rseg_mem_create(i, space, zip_size, page_no, mtr);
		}
	}
}
/********************************************************************
Creates the memory copies for the rollback segments and initializes the
rseg list and array in trx_sys at a database startup. */
static
void
trx_rseg_create_instance(
/*=====================*/
	trx_sysf_t*	sys_header,	/*!< in: trx system header */
	mtr_t*		mtr)		/*!< in: mtr */
{
	ulint		i;

	for (i = 0; i < TRX_SYS_N_RSEGS; i++) {
		ulint	page_no;

		page_no = trx_sysf_rseg_get_page_no(sys_header, i, mtr);

		if (page_no == FIL_NULL) {
			trx_sys_set_nth_rseg(trx_sys, i, NULL);
		} else {
			ulint		space;
			ulint		zip_size;
			trx_rseg_t*	rseg = NULL;

			ut_a(!trx_rseg_get_on_id(i));

			space = trx_sysf_rseg_get_space(sys_header, i, mtr);

			zip_size = space ? fil_space_get_zip_size(space) : 0;

			rseg = trx_rseg_mem_create(
				i, space, zip_size, page_no, mtr);

			ut_a(rseg->id == i);
		}
	}
}
Exemple #3
0
/*******************************************************************//**
Drops the index tree associated with a row in SYS_INDEXES table. */
UNIV_INTERN
void
dict_drop_index_tree(
/*=================*/
	rec_t*	rec,	/*!< in/out: record in the clustered index
			of SYS_INDEXES table */
	mtr_t*	mtr)	/*!< in: mtr having the latch on the record page */
{
	ulint		root_page_no;
	ulint		space;
	ulint		zip_size;
	const byte*	ptr;
	ulint		len;

	ut_ad(mutex_own(&(dict_sys->mutex)));
	ut_a(!dict_table_is_comp(dict_sys->sys_indexes));
	ptr = rec_get_nth_field_old(rec, DICT_SYS_INDEXES_PAGE_NO_FIELD, &len);

	ut_ad(len == 4);

	root_page_no = mtr_read_ulint(ptr, MLOG_4BYTES, mtr);

	if (root_page_no == FIL_NULL) {
		/* The tree has already been freed */

		return;
	}

	ptr = rec_get_nth_field_old(rec,
				    DICT_SYS_INDEXES_SPACE_NO_FIELD, &len);

	ut_ad(len == 4);

	space = mtr_read_ulint(ptr, MLOG_4BYTES, mtr);
	zip_size = fil_space_get_zip_size(space);

	if (UNIV_UNLIKELY(zip_size == ULINT_UNDEFINED)) {
		/* It is a single table tablespace and the .ibd file is
		missing: do nothing */

		return;
	}

	/* We free all the pages but the root page first; this operation
	may span several mini-transactions */

	btr_free_but_not_root(space, zip_size, root_page_no);

	/* Then we free the root page in the same mini-transaction where
	we write FIL_NULL to the appropriate field in the SYS_INDEXES
	record: this mini-transaction marks the B-tree totally freed */

	/* printf("Dropping index tree in space %lu root page %lu\n", space,
	root_page_no); */
	btr_free_root(space, zip_size, root_page_no, mtr);

	page_rec_write_index_page_no(rec,
				     DICT_SYS_INDEXES_PAGE_NO_FIELD,
				     FIL_NULL, mtr);
}
Exemple #4
0
/********************************************************************//**
Cuts off the tail of the list, including the node given. The number of
nodes which will be removed must be provided by the caller, as this function
does not measure the length of the tail. */
UNIV_INTERN
void
flst_cut_end(
/*=========*/
	flst_base_node_t*	base,	/*!< in: pointer to base node of list */
	flst_node_t*		node2,	/*!< in: first node to remove */
	ulint			n_nodes,/*!< in: number of nodes to remove,
					must be >= 1 */
	mtr_t*			mtr)	/*!< in: mini-transaction handle */
{
	ulint		space;
	flst_node_t*	node1;
	fil_addr_t	node1_addr;
	fil_addr_t	node2_addr;
	ulint		len;

	ut_ad(mtr && node2 && base);
	ut_ad(mtr_memo_contains_page(mtr, base, MTR_MEMO_PAGE_X_FIX));
	ut_ad(mtr_memo_contains_page(mtr, node2, MTR_MEMO_PAGE_X_FIX));
	ut_ad(n_nodes > 0);

	buf_ptr_get_fsp_addr(node2, &space, &node2_addr);

	node1_addr = flst_get_prev_addr(node2, mtr);

	if (!fil_addr_is_null(node1_addr)) {

		/* Update next field of node1 */

		if (node1_addr.page == node2_addr.page) {

			node1 = page_align(node2) + node1_addr.boffset;
		} else {
			node1 = fut_get_ptr(space,
					    fil_space_get_zip_size(space),
					    node1_addr, RW_X_LATCH, mtr);
		}

		flst_write_addr(node1 + FLST_NEXT, fil_addr_null, mtr);
	} else {
		/* node2 was first in list: update the field in base */
		flst_write_addr(base + FLST_FIRST, fil_addr_null, mtr);
	}

	flst_write_addr(base + FLST_LAST, node1_addr, mtr);

	/* Update len of base node */
	len = flst_get_len(base, mtr);
	ut_ad(len >= n_nodes);

	mlog_write_ulint(base + FLST_LEN, len - n_nodes, MLOG_4BYTES, mtr);
}
Exemple #5
0
/********************************************************************//**
Inserts a node after another in a list. */
UNIV_INTERN
void
flst_insert_after(
/*==============*/
	flst_base_node_t*	base,	/*!< in: pointer to base node of list */
	flst_node_t*		node1,	/*!< in: node to insert after */
	flst_node_t*		node2,	/*!< in: node to add */
	mtr_t*			mtr)	/*!< in: mini-transaction handle */
{
	ulint		space;
	fil_addr_t	node1_addr;
	fil_addr_t	node2_addr;
	flst_node_t*	node3;
	fil_addr_t	node3_addr;
	ulint		len;

	ut_ad(mtr && node1 && node2 && base);
	ut_ad(base != node1);
	ut_ad(base != node2);
	ut_ad(node2 != node1);
	ut_ad(mtr_memo_contains_page(mtr, base, MTR_MEMO_PAGE_X_FIX));
	ut_ad(mtr_memo_contains_page(mtr, node1, MTR_MEMO_PAGE_X_FIX));
	ut_ad(mtr_memo_contains_page(mtr, node2, MTR_MEMO_PAGE_X_FIX));

	buf_ptr_get_fsp_addr(node1, &space, &node1_addr);
	buf_ptr_get_fsp_addr(node2, &space, &node2_addr);

	node3_addr = flst_get_next_addr(node1, mtr);

	/* Set prev and next fields of node2 */
	flst_write_addr(node2 + FLST_PREV, node1_addr, mtr);
	flst_write_addr(node2 + FLST_NEXT, node3_addr, mtr);

	if (!fil_addr_is_null(node3_addr)) {
		/* Update prev field of node3 */
		ulint	zip_size = fil_space_get_zip_size(space);

		node3 = fut_get_ptr(space, zip_size,
				    node3_addr, RW_X_LATCH, mtr);
		flst_write_addr(node3 + FLST_PREV, node2_addr, mtr);
	} else {
		/* node1 was last in list: update last field in base */
		flst_write_addr(base + FLST_LAST, node2_addr, mtr);
	}

	/* Set next field of node1 */
	flst_write_addr(node1 + FLST_NEXT, node2_addr, mtr);

	/* Update len of base node */
	len = flst_get_len(base, mtr);
	mlog_write_ulint(base + FLST_LEN, len + 1, MLOG_4BYTES, mtr);
}
/*********************************************************************
Creates a rollback segment.
@return pointer to new rollback segment if create successful */
UNIV_INTERN
trx_rseg_t*
trx_rseg_create(void)
/*=================*/
{
	mtr_t		mtr;
	ulint		slot_no;
	trx_rseg_t*	rseg = NULL;

	mtr_start(&mtr);

	/* To obey the latching order, acquire the file space
	x-latch before the kernel mutex. */
	mtr_x_lock(fil_space_get_latch(TRX_SYS_SPACE, NULL), &mtr);

	mutex_enter(&kernel_mutex);

	slot_no = trx_sysf_rseg_find_free(&mtr);

	if (slot_no != ULINT_UNDEFINED) {
		ulint		space;
		ulint		page_no;
		ulint		zip_size;
		trx_sysf_t*	sys_header;

		page_no = trx_rseg_header_create(
			TRX_SYS_SPACE, 0, ULINT_MAX, slot_no, &mtr);

		ut_a(page_no != FIL_NULL);

		ut_ad(!trx_rseg_get_on_id(slot_no));

		sys_header = trx_sysf_get(&mtr);

		space = trx_sysf_rseg_get_space(sys_header, slot_no, &mtr);

		zip_size = space ? fil_space_get_zip_size(space) : 0;

		rseg = trx_rseg_mem_create(
			slot_no, space, zip_size, page_no, &mtr);
	}

	mutex_exit(&kernel_mutex);
	mtr_commit(&mtr);

	return(rseg);
}
Exemple #7
0
/********************************************************************//**
Adds a node as the last node in a list. */
UNIV_INTERN
void
flst_add_last(
/*==========*/
	flst_base_node_t*	base,	/*!< in: pointer to base node of list */
	flst_node_t*		node,	/*!< in: node to add */
	mtr_t*			mtr)	/*!< in: mini-transaction handle */
{
	ulint		space;
	fil_addr_t	node_addr;
	ulint		len;
	fil_addr_t	last_addr;
	flst_node_t*	last_node;

	ut_ad(mtr && base && node);
	ut_ad(base != node);
	ut_ad(mtr_memo_contains_page(mtr, base, MTR_MEMO_PAGE_X_FIX));
	ut_ad(mtr_memo_contains_page(mtr, node, MTR_MEMO_PAGE_X_FIX));
	len = flst_get_len(base, mtr);
	last_addr = flst_get_last(base, mtr);

	buf_ptr_get_fsp_addr(node, &space, &node_addr);

	/* If the list is not empty, call flst_insert_after */
	if (len != 0) {
		if (last_addr.page == node_addr.page) {
			last_node = page_align(node) + last_addr.boffset;
		} else {
			ulint	zip_size = fil_space_get_zip_size(space);

			last_node = fut_get_ptr(space, zip_size, last_addr,
						RW_X_LATCH, mtr);
		}

		flst_insert_after(base, last_node, node, mtr);
	} else {
		/* else call flst_add_to_empty */
		flst_add_to_empty(base, node, mtr);
	}
}
Exemple #8
0
/********************************************************************//**
Issues read requests for pages which recovery wants to read in. */
UNIV_INTERN
void
buf_read_recv_pages(
/*================*/
	ibool		sync,		/*!< in: TRUE if the caller
					wants this function to wait
					for the highest address page
					to get read in, before this
					function returns */
	ulint		space,		/*!< in: space id */
	ulint		zip_size,	/*!< in: compressed page size in
					bytes, or 0 */
	const ulint*	page_nos,	/*!< in: array of page numbers
					to read, with the highest page
					number the last in the
					array */
	ulint		n_stored)	/*!< in: number of page numbers
					in the array */
{
	ib_int64_t	tablespace_version;
	ulint		count;
	ulint		err;
	ulint		i;

	zip_size = fil_space_get_zip_size(space);

	if (UNIV_UNLIKELY(zip_size == ULINT_UNDEFINED)) {
		/* It is a single table tablespace and the .ibd file is
		missing: do nothing */

		/* the log records should be treated here same reason
		for http://bugs.mysql.com/bug.php?id=43948 */

		if (recv_recovery_is_on()) {
			recv_addr_t*    recv_addr;

			mutex_enter(&(recv_sys->mutex));

			if (recv_sys->apply_log_recs == FALSE) {
				mutex_exit(&(recv_sys->mutex));
				goto not_to_recover;
			}

			for (i = 0; i < n_stored; i++) {
				/* recv_get_fil_addr_struct() */
				recv_addr = HASH_GET_FIRST(recv_sys->addr_hash,
						hash_calc_hash(ut_fold_ulint_pair(space, page_nos[i]),
							recv_sys->addr_hash));
				while (recv_addr) {
					if ((recv_addr->space == space)
						&& (recv_addr->page_no == page_nos[i])) {
						break;
					}
					recv_addr = HASH_GET_NEXT(addr_hash, recv_addr);
				}

				if ((recv_addr == NULL)
				    || (recv_addr->state == RECV_BEING_PROCESSED)
				    || (recv_addr->state == RECV_PROCESSED)) {
					continue;
				}

				recv_addr->state = RECV_PROCESSED;

				ut_a(recv_sys->n_addrs);
				recv_sys->n_addrs--;
			}

			mutex_exit(&(recv_sys->mutex));

			fprintf(stderr, " (cannot find space: %lu)", space);
		}
not_to_recover:

		return;
	}

	tablespace_version = fil_space_get_version(space);

	for (i = 0; i < n_stored; i++) {
		buf_pool_t*	buf_pool;

		count = 0;

		os_aio_print_debug = FALSE;
		buf_pool = buf_pool_get(space, page_nos[i]);
		while (buf_pool->n_pend_reads >= recv_n_pool_free_frames / 2) {

			os_aio_simulated_wake_handler_threads();
			os_thread_sleep(10000);

			count++;

			if (count > 1000) {
				fprintf(stderr,
					"InnoDB: Error: InnoDB has waited for"
					" 10 seconds for pending\n"
					"InnoDB: reads to the buffer pool to"
					" be finished.\n"
					"InnoDB: Number of pending reads %lu,"
					" pending pread calls %lu\n",
					(ulong) buf_pool->n_pend_reads,
					(ulong)os_file_n_pending_preads);

				os_aio_print_debug = TRUE;
			}
		}

		os_aio_print_debug = FALSE;

		if ((i + 1 == n_stored) && sync) {
			buf_read_page_low(&err, TRUE, BUF_READ_ANY_PAGE, space,
					  zip_size, TRUE, tablespace_version,
					  page_nos[i], NULL);
		} else {
			buf_read_page_low(&err, FALSE, BUF_READ_ANY_PAGE
					  | OS_AIO_SIMULATED_WAKE_LATER,
					  space, zip_size, TRUE,
					  tablespace_version, page_nos[i], NULL);
		}
	}

	os_aio_simulated_wake_handler_threads();

	/* Flush pages from the end of all the LRU lists if necessary */
	buf_flush_free_margins(FALSE);

#ifdef UNIV_DEBUG
	if (buf_debug_prints) {
		fprintf(stderr,
			"Recovery applies read-ahead pages %lu\n",
			(ulong) n_stored);
	}
#endif /* UNIV_DEBUG */
}
Exemple #9
0
/********************************************************************//**
Issues read requests for pages which the ibuf module wants to read in, in
order to contract the insert buffer tree. Technically, this function is like
a read-ahead function. */
UNIV_INTERN
void
buf_read_ibuf_merge_pages(
/*======================*/
	ibool		sync,		/*!< in: TRUE if the caller
					wants this function to wait
					for the highest address page
					to get read in, before this
					function returns */
	const ulint*	space_ids,	/*!< in: array of space ids */
	const ib_int64_t* space_versions,/*!< in: the spaces must have
					this version number
					(timestamp), otherwise we
					discard the read; we use this
					to cancel reads if DISCARD +
					IMPORT may have changed the
					tablespace size */
	const ulint*	page_nos,	/*!< in: array of page numbers
					to read, with the highest page
					number the last in the
					array */
	ulint		n_stored)	/*!< in: number of elements
					in the arrays */
{
	ulint	i;

#ifdef UNIV_IBUF_DEBUG
	ut_a(n_stored < UNIV_PAGE_SIZE);
#endif

	for (i = 0; i < n_stored; i++) {
		ulint		err;
		buf_pool_t*	buf_pool;
		ulint		zip_size = fil_space_get_zip_size(space_ids[i]);

		buf_pool = buf_pool_get(space_ids[i], page_nos[i]);

		while (buf_pool->n_pend_reads
		       > buf_pool->curr_size / BUF_READ_AHEAD_PEND_LIMIT) {
			os_thread_sleep(500000);
		}

		if (UNIV_UNLIKELY(zip_size == ULINT_UNDEFINED)) {

			goto tablespace_deleted;
		}

		buf_read_page_low(&err, sync && (i + 1 == n_stored),
				  BUF_READ_ANY_PAGE, space_ids[i],
				  zip_size, TRUE, space_versions[i],
				  page_nos[i], NULL);

		if (UNIV_UNLIKELY(err == DB_TABLESPACE_DELETED)) {
tablespace_deleted:
			/* We have deleted or are deleting the single-table
			tablespace: remove the entries for that page */

			ibuf_merge_or_delete_for_page(NULL, space_ids[i],
						      page_nos[i],
						      zip_size, FALSE);
		}
	}

	os_aio_simulated_wake_handler_threads();

	/* Flush pages from the end of all the LRU lists if necessary */
	buf_flush_free_margins(FALSE);

#ifdef UNIV_DEBUG
	if (buf_debug_prints) {
		fprintf(stderr,
			"Ibuf merge read-ahead space %lu pages %lu\n",
			(ulong) space_ids[0], (ulong) n_stored);
	}
#endif /* UNIV_DEBUG */
}
Exemple #10
0
/********************************************************************//**
Issues read requests for pages which recovery wants to read in. */
UNIV_INTERN
void
buf_read_recv_pages(
/*================*/
	ibool		sync,		/*!< in: TRUE if the caller
					wants this function to wait
					for the highest address page
					to get read in, before this
					function returns */
	ulint		space,		/*!< in: space id */
	ulint		zip_size,	/*!< in: compressed page size in
					bytes, or 0 */
	const ulint*	page_nos,	/*!< in: array of page numbers
					to read, with the highest page
					number the last in the
					array */
	ulint		n_stored)	/*!< in: number of page numbers
					in the array */
{
	ib_int64_t	tablespace_version;
	ulint		count;
	ulint		err;
	ulint		i;

	zip_size = fil_space_get_zip_size(space);

	if (UNIV_UNLIKELY(zip_size == ULINT_UNDEFINED)) {
		/* It is a single table tablespace and the .ibd file is
		missing: do nothing */

		return;
	}

	tablespace_version = fil_space_get_version(space);

	for (i = 0; i < n_stored; i++) {
		buf_pool_t*	buf_pool;

		count = 0;

		os_aio_print_debug = FALSE;
		buf_pool = buf_pool_get(space, page_nos[i]);
		while (buf_pool->n_pend_reads >= recv_n_pool_free_frames / 2) {

			os_aio_simulated_wake_handler_threads();
			os_thread_sleep(10000);

			count++;

			if (count > 1000) {
				fprintf(stderr,
					"InnoDB: Error: InnoDB has waited for"
					" 10 seconds for pending\n"
					"InnoDB: reads to the buffer pool to"
					" be finished.\n"
					"InnoDB: Number of pending reads %lu,"
					" pending pread calls %lu\n",
					(ulong) buf_pool->n_pend_reads,
					(ulong)os_file_n_pending_preads);

				os_aio_print_debug = TRUE;
			}
		}

		os_aio_print_debug = FALSE;

		if ((i + 1 == n_stored) && sync) {
			buf_read_page_low(&err, TRUE, BUF_READ_ANY_PAGE, space,
					  zip_size, TRUE, tablespace_version,
					  page_nos[i]);
		} else {
			buf_read_page_low(&err, FALSE, BUF_READ_ANY_PAGE
					  | OS_AIO_SIMULATED_WAKE_LATER,
					  space, zip_size, TRUE,
					  tablespace_version, page_nos[i]);
		}
	}

	os_aio_simulated_wake_handler_threads();

	/* Flush pages from the end of all the LRU lists if necessary */
	buf_flush_free_margins();

#ifdef UNIV_DEBUG
	if (buf_debug_prints) {
		fprintf(stderr,
			"Recovery applies read-ahead pages %lu\n",
			(ulong) n_stored);
	}
#endif /* UNIV_DEBUG */
}
Exemple #11
0
/********************************************************************//**
Flush pages from flash cache.
@return	number of pages have been flushed to tablespace */
UNIV_INTERN
ulint	
fc_flush_to_disk(
/*==================*/
	ibool do_full_io)	/*!< in: whether do full io capacity */
{
	ulint distance;
	byte* page;
	ulint ret;
	ulint space;
	ulint offset;
	ulint page_type;
	ulint i, j;
	ulint pos;
	ulint zip_size;
	ulint block_offset, byte_offset;
	ulint fc_size = fc_get_size();
	ulint fc_blk_size = fc_get_block_size_byte();
	ulint start_offset;
   	ulint data_size;
	fc_block_t *flush_block = NULL;
	ulint c_flush = 0;
    
	ut_ad(!mutex_own(&fc->mutex));
	ut_a(fc->flush_buf->free_pos == 0);

	/* step 1: get the number of blocks need to flush to tablespace */
	flash_cache_mutex_enter();

	distance = fc_get_distance();
	start_offset = fc->flush_off;
    
	if ( distance == 0 ) {
		flash_cache_mutex_exit();
		return 0;
	} else if ( recv_recovery_on ) {
		if ( distance < (( 1.0 * srv_flash_cache_write_cache_pct /100 ) * fc_size)) {
			fc->n_flush_cur = 0;
		} else if ( distance < ( ( 1.0*srv_flash_cache_do_full_io_pct /100 ) * fc_size)) {
			fc->n_flush_cur = ut_min(PCT_IO_FC(10), distance);
		} else {
			fc->n_flush_cur = ut_min(PCT_IO_FC(100), distance);
		}
	} else if ( distance < (( 1.0 * srv_flash_cache_write_cache_pct /100 ) * fc_size)
		&& !do_full_io ) {
		flash_cache_mutex_exit();
		return 0;
	} else if ( distance < (( 1.0 * srv_flash_cache_do_full_io_pct/100 ) * fc_size)
		&& !do_full_io ) {
		fc->n_flush_cur = PCT_IO_FC(srv_fc_write_cache_flush_pct);
	} else {
		ut_ad((distance > ( 1.0 * srv_flash_cache_do_full_io_pct/100 ) * fc_size) 
			|| do_full_io );
		fc->n_flush_cur = ut_min(PCT_IO_FC(srv_fc_full_flush_pct), distance);
	}

	flash_cache_mutex_exit();

	/* step 2: start to flush blocks use async io, set block io_fix IO_FIX_FLUSH */
	i = 0;
	while (i < fc->n_flush_cur) {
		ulint b_space;
		ulint b_offset;
		ulint raw_zip_size;
		ulint size;
		ulint fil_offset;
#ifdef UNIV_FLASH_CACHE_TRACE
		ulint is_v4_blk;
#endif
		byte* page_io;

		flash_cache_mutex_enter();
		pos = ( start_offset + i ) % fc_size;
		flush_block = fc_get_block(pos);

		if (flush_block == NULL) {
			i++;
			flash_cache_mutex_exit();
			continue;
		}

		/* we should get the mutex, as doublewrite may hit this block and invalid the block */
		flash_block_mutex_enter(flush_block->fil_offset);

		flash_cache_mutex_exit();
		
		data_size = fc_block_get_data_size(flush_block);

		if (flush_block->state != BLOCK_READY_FOR_FLUSH) {
			/* if readonly or merge write or already flushed*/
			ut_a (flush_block->state == BLOCK_NOT_USED
				|| flush_block->state == BLOCK_READ_CACHE
				|| flush_block->state == BLOCK_FLUSHED);
			
			i += data_size;

			flash_block_mutex_exit(flush_block->fil_offset);
			if (flush_block->state == BLOCK_NOT_USED) {
				//fc_block_detach(FALSE, flush_block);
				fc_block_free(flush_block);
			}
			
			continue;
		}

		zip_size = fil_space_get_zip_size(flush_block->space);
		if (zip_size == ULINT_UNDEFINED) {
			/* table has been droped, just set it BLOCK_FLUSHED */
#ifdef UNIV_FLASH_CACHE_TRACE
			ut_print_timestamp(fc->f_debug);
			fprintf(fc->f_debug, "space:%lu is droped, the page(%lu, %lu) need not to be flushed.\n",
			(ulong)flush_block->space, (ulong)flush_block->space, (ulong)flush_block->offset);
#endif
			flush_block->state = BLOCK_FLUSHED;
			i += data_size;
			c_flush += data_size;
			flash_block_mutex_exit(flush_block->fil_offset);
			continue;
		}

#ifdef UNIV_FLASH_CACHE_TRACE
		if (flush_block->state != BLOCK_READY_FOR_FLUSH) {
			fc_block_print(flush_block);
			ut_error;
		}
#endif

		flush_block->io_fix |= IO_FIX_FLUSH;

		/* 
		 * we should set block state BLOCK_FLUSHED,  if not, doublewrite may hit this block 
		 * and invalid this block and reduce the dirty count, but when finish flush ,we will 
		 * reduce the dirty count too, so it may reduce twice.
		 */
		flush_block->state = BLOCK_FLUSHED;
		
		/* save the block info, as the block may be invalided by doublewrite after release mutex */
		b_space = flush_block->space;
		b_offset = flush_block->offset;

		raw_zip_size = flush_block->raw_zip_size;
		size = flush_block->size;
		fil_offset = flush_block->fil_offset;
#ifdef UNIV_FLASH_CACHE_TRACE
		is_v4_blk = flush_block->is_v4_blk;
#endif
		/* release the block now, so read can hit in this blocks and read the data */
		flash_block_mutex_exit(flush_block->fil_offset);
		
		/*
		 * Only flush thread will update read_buf and flush_off/round. 
		 * there only single flush thread no need to lock read_buf
		 */
		page = fc->flush_buf->buf + fc->flush_buf->free_pos * fc_blk_size;

		if (raw_zip_size > 0) {
			ut_a((size * fc_blk_size) == UNIV_PAGE_SIZE);
			page_io = fc->flush_zip_read_buf;
		} else {
			page_io = page;
		}

		fc_io_offset(fil_offset, &block_offset, &byte_offset);
		ret = fil_io(OS_FILE_READ, TRUE, FLASH_CACHE_SPACE, 0,
				block_offset, byte_offset, data_size * fc_blk_size,
				page_io, NULL);
	
		if (ret != DB_SUCCESS) {
			ut_print_timestamp(stderr);
			fprintf(stderr, " InnoDB: Flash cache [Error]: unable to read page from flash cache.\n"
				"flash cache flush offset is:%lu.\n", (ulong)(start_offset + i));
			ut_error;
		}		

		if ((flush_block != NULL) && (flush_block->state == BLOCK_NOT_USED)) {
			goto skip;
		}

		/* decompress the compress data */
		if (raw_zip_size > 0) {
#ifdef UNIV_FLASH_CACHE_TRACE
			ulint blk_zip_size_byte;
			if (is_v4_blk) {
				blk_zip_size_byte = raw_zip_size * fc_get_block_size_byte();
			} else {
				blk_zip_size_byte = fc_block_compress_align(raw_zip_size) * fc_get_block_size_byte();
				ut_a((ulint)mach_read_from_4(page_io + FC_ZIP_PAGE_ZIP_RAW_SIZE) == raw_zip_size);				
			} 

			ut_a(page_io);
			ut_a(page);
			ut_a((ulint)mach_read_from_4(page_io + FC_ZIP_PAGE_HEADER) == FC_ZIP_PAGE_CHECKSUM);
			ut_a((ulint)mach_read_from_4(page_io + blk_zip_size_byte - FC_ZIP_PAGE_TAILER)
				== FC_ZIP_PAGE_CHECKSUM);	
			ut_a((ulint)mach_read_from_4(page_io + FC_ZIP_PAGE_SIZE) == blk_zip_size_byte);
			ut_a((ulint)mach_read_from_4(page_io + FC_ZIP_PAGE_ORIG_SIZE) == UNIV_PAGE_SIZE);		
			ut_a((ulint)mach_read_from_4(page_io + FC_ZIP_PAGE_SPACE) == b_space);
			ut_a((ulint)mach_read_from_4(page_io + FC_ZIP_PAGE_OFFSET) == b_offset);	

			/* only qlz can do this check  */
			if (srv_flash_cache_compress_algorithm == FC_BLOCK_COMPRESS_QUICKLZ) {
				if (is_v4_blk) {
					ut_a(raw_zip_size * fc_get_block_size_byte()
						>= (ulint)fc_qlz_size_compressed((const char *)(page_io + FC_ZIP_PAGE_DATA)));
				} else {
					ut_a(raw_zip_size 
						== (ulint)fc_qlz_size_compressed((const char *)(page_io + FC_ZIP_PAGE_DATA)));
				}
				
				ut_a(UNIV_PAGE_SIZE == fc_qlz_size_decompressed((const char *)(page_io + FC_ZIP_PAGE_DATA)));
			}
#endif
			fc_block_do_decompress(DECOMPRESS_FLUSH, page_io, raw_zip_size, page);
		}

		space = mach_read_from_4(page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
		offset = mach_read_from_4(page + FIL_PAGE_OFFSET);

		if ((space != b_space) || (offset != b_offset)) {
			ut_print_timestamp(stderr); 
			fc_block_print(flush_block);
			ut_error;
		}

		if (buf_page_is_corrupted(page, zip_size)) {
			buf_page_print(page, zip_size, BUF_PAGE_PRINT_NO_CRASH);
			ut_error;
		}		
		
		page_type = fil_page_get_type(page);
		if (page_type == FIL_PAGE_INDEX) {
			page_type = 1;
		}
		srv_flash_cache_flush_detail[page_type]++;
		
		ret = fil_io(OS_FILE_WRITE | OS_AIO_SIMULATED_WAKE_LATER, FALSE, space, 
				zip_size, offset, 0, zip_size ? zip_size : UNIV_PAGE_SIZE, page, NULL);
		if (ret != DB_SUCCESS && ret != DB_TABLESPACE_DELETED) {
			ut_print_timestamp(stderr); 
			fc_block_print(flush_block);
			ut_error;
		}

		/* add  UNIV_PAGE_SIZE / fc_blk_size for safe */
		fc->flush_buf->free_pos += UNIV_PAGE_SIZE / fc_blk_size;	

skip:
		i += data_size;
		c_flush += data_size;	

		if ((fc->flush_buf->free_pos + UNIV_PAGE_SIZE / fc_blk_size) >= fc->flush_buf->size) {
			/* FIXME: is it safe to change n_flush, as step 3 will use n_flush */
			fc->n_flush_cur = i;
			break;
		}	
	}

	/* ok, now flush all async io to disk */
	fc_flush_sync_dbfile();

	/* step 3: all the flush blocks have sync to disk,  update the state and io_fix */
	j = 0;
	while (j < fc->n_flush_cur) {

		flash_cache_mutex_enter();
		pos = (start_offset + j) % fc_size;
		flush_block = fc_get_block(pos);

		if (flush_block  == NULL) {
			j++;
			flash_cache_mutex_exit();
			continue;
		}
		/* block state and io_fix may be changed by doublewrite and lru move */
		flash_block_mutex_enter(flush_block->fil_offset);
		flash_cache_mutex_exit();
		if (flush_block->io_fix & IO_FIX_FLUSH) {
			/* the block is already in BLOCK_FLUSHED state */
			flush_block->io_fix &= ~IO_FIX_FLUSH;
		} 
		
		data_size = fc_block_get_data_size(flush_block);
		flash_block_mutex_exit(flush_block->fil_offset);	
		
		j += data_size;
	}

	
	/*
	 * i and j may be different, as the last been flushed block may be invalid by doublewrite,
	 * so maybe i > j
	 */
	
	/* add the actual flushed blocks */
	srv_flash_cache_flush = srv_flash_cache_flush + c_flush; 

	/* step 4: update fc status and flush_off, and wake up threads that are sleep for space  */
	if (i > 0) {
		ut_a(i >= c_flush);

		flash_cache_mutex_enter();
		
		/*
		 * it is safe to inc flush off and sub dirty blocks at this time,
		 * as fc_validate is not work
		 */
		fc_inc_flush_off(i);
		flash_cache_log_mutex_enter();
		fc_log->current_stat->flush_offset = fc->flush_off;
		fc_log->current_stat->flush_round = fc->flush_round;	
		flash_cache_log_mutex_exit();		
		
		ut_a(srv_flash_cache_dirty >= c_flush);		
		srv_flash_cache_dirty -= c_flush;
		
		srv_fc_flush_should_commit_log_flush++;
		os_event_set(fc->wait_space_event);	

		fc->n_flush_cur = 0;
		
		flash_cache_mutex_exit();		
	}

	fc->flush_buf->free_pos = 0;
 
	return c_flush;
}
Exemple #12
0
/*******************************************************************//**
Truncates the index tree associated with a row in SYS_INDEXES table.
@return	new root page number, or FIL_NULL on failure */
UNIV_INTERN
ulint
dict_truncate_index_tree(
    /*=====================*/
    dict_table_t*	table,	/*!< in: the table the index belongs to */
    ulint		space,	/*!< in: 0=truncate,
				nonzero=create the index tree in the
				given tablespace */
    btr_pcur_t*	pcur,	/*!< in/out: persistent cursor pointing to
				record in the clustered index of
				SYS_INDEXES table. The cursor may be
				repositioned in this call. */
    mtr_t*		mtr)	/*!< in: mtr having the latch
				on the record page. The mtr may be
				committed and restarted in this call. */
{
    ulint		root_page_no;
    ibool		drop = !space;
    ulint		zip_size;
    ulint		type;
    index_id_t	index_id;
    rec_t*		rec;
    const byte*	ptr;
    ulint		len;
    dict_index_t*	index;

    ut_ad(mutex_own(&(dict_sys->mutex)));
    ut_a(!dict_table_is_comp(dict_sys->sys_indexes));
    rec = btr_pcur_get_rec(pcur);
    ptr = rec_get_nth_field_old(rec, DICT_SYS_INDEXES_PAGE_NO_FIELD, &len);

    ut_ad(len == 4);

    root_page_no = mtr_read_ulint(ptr, MLOG_4BYTES, mtr);

    if (drop && root_page_no == FIL_NULL) {
        /* The tree has been freed. */

        ut_print_timestamp(stderr);
        fprintf(stderr, "  InnoDB: Trying to TRUNCATE"
                " a missing index of table %s!\n", table->name);
        drop = FALSE;
    }

    ptr = rec_get_nth_field_old(rec,
                                DICT_SYS_INDEXES_SPACE_NO_FIELD, &len);

    ut_ad(len == 4);

    if (drop) {
        space = mtr_read_ulint(ptr, MLOG_4BYTES, mtr);
    }

    zip_size = fil_space_get_zip_size(space);

    if (UNIV_UNLIKELY(zip_size == ULINT_UNDEFINED)) {
        /* It is a single table tablespace and the .ibd file is
        missing: do nothing */

        ut_print_timestamp(stderr);
        fprintf(stderr, "  InnoDB: Trying to TRUNCATE"
                " a missing .ibd file of table %s!\n", table->name);
        return(FIL_NULL);
    }

    ptr = rec_get_nth_field_old(rec,
                                DICT_SYS_INDEXES_TYPE_FIELD, &len);
    ut_ad(len == 4);
    type = mach_read_from_4(ptr);

    ptr = rec_get_nth_field_old(rec, 1, &len);
    ut_ad(len == 8);
    index_id = mach_read_from_8(ptr);

    if (!drop) {

        goto create;
    }

    /* We free all the pages but the root page first; this operation
    may span several mini-transactions */

    btr_free_but_not_root(space, zip_size, root_page_no);

    /* Then we free the root page in the same mini-transaction where
    we create the b-tree and write its new root page number to the
    appropriate field in the SYS_INDEXES record: this mini-transaction
    marks the B-tree totally truncated */

    btr_block_get(space, zip_size, root_page_no, RW_X_LATCH, NULL, mtr);

    btr_free_root(space, zip_size, root_page_no, mtr);
create:
    /* We will temporarily write FIL_NULL to the PAGE_NO field
    in SYS_INDEXES, so that the database will not get into an
    inconsistent state in case it crashes between the mtr_commit()
    below and the following mtr_commit() call. */
    page_rec_write_field(rec, DICT_SYS_INDEXES_PAGE_NO_FIELD,
                         FIL_NULL, mtr);

    /* We will need to commit the mini-transaction in order to avoid
    deadlocks in the btr_create() call, because otherwise we would
    be freeing and allocating pages in the same mini-transaction. */
    btr_pcur_store_position(pcur, mtr);
    mtr_commit(mtr);

    mtr_start(mtr);
    btr_pcur_restore_position(BTR_MODIFY_LEAF, pcur, mtr);

    /* Find the index corresponding to this SYS_INDEXES record. */
    for (index = UT_LIST_GET_FIRST(table->indexes);
            index;
            index = UT_LIST_GET_NEXT(indexes, index)) {
        if (index->id == index_id) {
            root_page_no = btr_create(type, space, zip_size,
                                      index_id, index, mtr);
            index->page = (unsigned int) root_page_no;
            return(root_page_no);
        }
    }

    ut_print_timestamp(stderr);
    fprintf(stderr,
            "  InnoDB: Index %llu of table %s is missing\n"
            "InnoDB: from the data dictionary during TRUNCATE!\n",
            (ullint) index_id,
            table->name);

    return(FIL_NULL);
}
Exemple #13
0
/********************************************************************//**
Validates a file-based list.
@return	TRUE if ok */
UNIV_INTERN
ibool
flst_validate(
/*==========*/
	const flst_base_node_t*	base,	/*!< in: pointer to base node of list */
	mtr_t*			mtr1)	/*!< in: mtr */
{
	ulint			space;
	ulint			zip_size;
	const flst_node_t*	node;
	fil_addr_t		node_addr;
	fil_addr_t		base_addr;
	ulint			len;
	ulint			i;
	mtr_t			mtr2;

	ut_ad(base);
	ut_ad(mtr_memo_contains_page(mtr1, base, MTR_MEMO_PAGE_X_FIX));

	/* We use two mini-transaction handles: the first is used to
	lock the base node, and prevent other threads from modifying the
	list. The second is used to traverse the list. We cannot run the
	second mtr without committing it at times, because if the list
	is long, then the x-locked pages could fill the buffer resulting
	in a deadlock. */

	/* Find out the space id */
	buf_ptr_get_fsp_addr(base, &space, &base_addr);
	zip_size = fil_space_get_zip_size(space);

	len = flst_get_len(base, mtr1);
	node_addr = flst_get_first(base, mtr1);

	for (i = 0; i < len; i++) {
		mtr_start(&mtr2);

		node = fut_get_ptr(space, zip_size,
				   node_addr, RW_X_LATCH, &mtr2);
		node_addr = flst_get_next_addr(node, &mtr2);

		mtr_commit(&mtr2); /* Commit mtr2 each round to prevent buffer
				   becoming full */
	}

	ut_a(fil_addr_is_null(node_addr));

	node_addr = flst_get_last(base, mtr1);

	for (i = 0; i < len; i++) {
		mtr_start(&mtr2);

		node = fut_get_ptr(space, zip_size,
				   node_addr, RW_X_LATCH, &mtr2);
		node_addr = flst_get_prev_addr(node, &mtr2);

		mtr_commit(&mtr2); /* Commit mtr2 each round to prevent buffer
				   becoming full */
	}

	ut_a(fil_addr_is_null(node_addr));

	return(TRUE);
}
Exemple #14
0
/********************************************************************//**
Removes a node. */
UNIV_INTERN
void
flst_remove(
/*========*/
	flst_base_node_t*	base,	/*!< in: pointer to base node of list */
	flst_node_t*		node2,	/*!< in: node to remove */
	mtr_t*			mtr)	/*!< in: mini-transaction handle */
{
	ulint		space;
	ulint		zip_size;
	flst_node_t*	node1;
	fil_addr_t	node1_addr;
	fil_addr_t	node2_addr;
	flst_node_t*	node3;
	fil_addr_t	node3_addr;
	ulint		len;

	ut_ad(mtr && node2 && base);
	ut_ad(mtr_memo_contains_page(mtr, base, MTR_MEMO_PAGE_X_FIX));
	ut_ad(mtr_memo_contains_page(mtr, node2, MTR_MEMO_PAGE_X_FIX));

	buf_ptr_get_fsp_addr(node2, &space, &node2_addr);
	zip_size = fil_space_get_zip_size(space);

	node1_addr = flst_get_prev_addr(node2, mtr);
	node3_addr = flst_get_next_addr(node2, mtr);

	if (!fil_addr_is_null(node1_addr)) {

		/* Update next field of node1 */

		if (node1_addr.page == node2_addr.page) {

			node1 = page_align(node2) + node1_addr.boffset;
		} else {
			node1 = fut_get_ptr(space, zip_size,
					    node1_addr, RW_X_LATCH, mtr);
		}

		ut_ad(node1 != node2);

		flst_write_addr(node1 + FLST_NEXT, node3_addr, mtr);
	} else {
		/* node2 was first in list: update first field in base */
		flst_write_addr(base + FLST_FIRST, node3_addr, mtr);
	}

	if (!fil_addr_is_null(node3_addr)) {
		/* Update prev field of node3 */

		if (node3_addr.page == node2_addr.page) {

			node3 = page_align(node2) + node3_addr.boffset;
		} else {
			node3 = fut_get_ptr(space, zip_size,
					    node3_addr, RW_X_LATCH, mtr);
		}

		ut_ad(node2 != node3);

		flst_write_addr(node3 + FLST_PREV, node1_addr, mtr);
	} else {
		/* node2 was last in list: update last field in base */
		flst_write_addr(base + FLST_LAST, node1_addr, mtr);
	}

	/* Update len of base node */
	len = flst_get_len(base, mtr);
	ut_ad(len > 0);

	mlog_write_ulint(base + FLST_LEN, len - 1, MLOG_4BYTES, mtr);
}