/*********************************************************************//**
Resets the rollback segment list and history length in trx_sys and then,
for every slot of the trx system header, either clears the slot or builds
the in-memory rollback segment object for it. Called at database startup. */
UNIV_INTERN
void
trx_rseg_list_and_array_init(
/*=========================*/
	trx_sysf_t*	sys_header,	/*!< in: trx system header */
	mtr_t*		mtr)		/*!< in: mtr */
{
	ulint	slot;

	UT_LIST_INIT(trx_sys->rseg_list);

	trx_sys->rseg_history_len = 0;

	for (slot = 0; slot < TRX_SYS_N_RSEGS; slot++) {
		ulint	hdr_page_no;
		ulint	space_id;
		ulint	zip_size;

		hdr_page_no = trx_sysf_rseg_get_page_no(sys_header, slot, mtr);

		if (hdr_page_no == FIL_NULL) {
			/* No rollback segment header page is recorded
			for this slot */
			trx_sys_set_nth_rseg(trx_sys, slot, NULL);

			continue;
		}

		space_id = trx_sysf_rseg_get_space(sys_header, slot, mtr);

		/* The zip size is looked up only for a nonzero space id;
		space 0 always gets zip size 0 */
		zip_size = 0;

		if (space_id != 0) {
			zip_size = fil_space_get_zip_size(space_id);
		}

		trx_rseg_mem_create(slot, space_id, zip_size,
				    hdr_page_no, mtr);
	}
}
/********************************************************************
Walks all rollback segment slots of the trx system header. A slot with no
header page is cleared in trx_sys; for every other slot the in-memory
rollback segment object is created. */
static
void
trx_rseg_create_instance(
/*=====================*/
	trx_sysf_t*	sys_header,	/*!< in: trx system header */
	mtr_t*		mtr)		/*!< in: mtr */
{
	ulint	slot;

	for (slot = 0; slot < TRX_SYS_N_RSEGS; slot++) {
		ulint	hdr_page_no;

		hdr_page_no = trx_sysf_rseg_get_page_no(sys_header, slot, mtr);

		if (hdr_page_no != FIL_NULL) {
			ulint		space_id;
			ulint		zip_size;
			trx_rseg_t*	new_rseg = NULL;

			/* The slot must not have an in-memory object yet */
			ut_a(!trx_rseg_get_on_id(slot));

			space_id = trx_sysf_rseg_get_space(
				sys_header, slot, mtr);

			/* Space 0 always gets zip size 0 */
			if (space_id) {
				zip_size = fil_space_get_zip_size(space_id);
			} else {
				zip_size = 0;
			}

			new_rseg = trx_rseg_mem_create(
				slot, space_id, zip_size, hdr_page_no, mtr);

			ut_a(new_rseg->id == slot);
		} else {
			trx_sys_set_nth_rseg(trx_sys, slot, NULL);
		}
	}
}
/*******************************************************************//**
Frees the B-tree whose root page number and space id are stored in the
given SYS_INDEXES record, and then writes FIL_NULL into the page number
field of the record. Returns without doing anything if the stored root
page number is already FIL_NULL or if no zip size can be obtained for
the tablespace. */
UNIV_INTERN
void
dict_drop_index_tree(
/*=================*/
	rec_t*	rec,	/*!< in/out: record in the clustered index
			of SYS_INDEXES table */
	mtr_t*	mtr)	/*!< in: mtr having the latch on the record page */
{
	const byte*	field;
	ulint		field_len;
	ulint		tree_root;
	ulint		space_id;
	ulint		zip_size;

	ut_ad(mutex_own(&(dict_sys->mutex)));
	ut_a(!dict_table_is_comp(dict_sys->sys_indexes));

	/* Read the root page number of the tree */
	field = rec_get_nth_field_old(rec, DICT_SYS_INDEXES_PAGE_NO_FIELD,
				      &field_len);
	ut_ad(field_len == 4);

	tree_root = mtr_read_ulint(field, MLOG_4BYTES, mtr);

	if (tree_root == FIL_NULL) {
		/* Nothing left to free: the tree is already gone */

		return;
	}

	/* Read the id of the tablespace holding the tree */
	field = rec_get_nth_field_old(rec, DICT_SYS_INDEXES_SPACE_NO_FIELD,
				      &field_len);
	ut_ad(field_len == 4);

	space_id = mtr_read_ulint(field, MLOG_4BYTES, mtr);

	zip_size = fil_space_get_zip_size(space_id);

	if (UNIV_UNLIKELY(zip_size == ULINT_UNDEFINED)) {
		/* Single-table tablespace whose .ibd file is missing:
		there is nothing to free */

		return;
	}

	/* First release every page except the root; this part may use
	several mini-transactions of its own */
	btr_free_but_not_root(space_id, zip_size, tree_root);

	/* The root page is released inside the caller's mini-transaction,
	the same one that stores FIL_NULL into the SYS_INDEXES record, so
	that this mini-transaction marks the tree as completely freed */
	btr_free_root(space_id, zip_size, tree_root, mtr);

	page_rec_write_index_page_no(rec, DICT_SYS_INDEXES_PAGE_NO_FIELD,
				     FIL_NULL, mtr);
}
/********************************************************************//** Cuts off the tail of the list, including the node given. The number of nodes which will be removed must be provided by the caller, as this function does not measure the length of the tail. */ UNIV_INTERN void flst_cut_end( /*=========*/ flst_base_node_t* base, /*!< in: pointer to base node of list */ flst_node_t* node2, /*!< in: first node to remove */ ulint n_nodes,/*!< in: number of nodes to remove, must be >= 1 */ mtr_t* mtr) /*!< in: mini-transaction handle */ { ulint space; flst_node_t* node1; fil_addr_t node1_addr; fil_addr_t node2_addr; ulint len; ut_ad(mtr && node2 && base); ut_ad(mtr_memo_contains_page(mtr, base, MTR_MEMO_PAGE_X_FIX)); ut_ad(mtr_memo_contains_page(mtr, node2, MTR_MEMO_PAGE_X_FIX)); ut_ad(n_nodes > 0); buf_ptr_get_fsp_addr(node2, &space, &node2_addr); node1_addr = flst_get_prev_addr(node2, mtr); if (!fil_addr_is_null(node1_addr)) { /* Update next field of node1 */ if (node1_addr.page == node2_addr.page) { node1 = page_align(node2) + node1_addr.boffset; } else { node1 = fut_get_ptr(space, fil_space_get_zip_size(space), node1_addr, RW_X_LATCH, mtr); } flst_write_addr(node1 + FLST_NEXT, fil_addr_null, mtr); } else { /* node2 was first in list: update the field in base */ flst_write_addr(base + FLST_FIRST, fil_addr_null, mtr); } flst_write_addr(base + FLST_LAST, node1_addr, mtr); /* Update len of base node */ len = flst_get_len(base, mtr); ut_ad(len >= n_nodes); mlog_write_ulint(base + FLST_LEN, len - n_nodes, MLOG_4BYTES, mtr); }
/********************************************************************//**
Links node2 into a list directly behind node1 and increments the length
stored in the base node by one. */
UNIV_INTERN
void
flst_insert_after(
/*==============*/
	flst_base_node_t*	base,	/*!< in: pointer to base node of list */
	flst_node_t*		node1,	/*!< in: node to insert after */
	flst_node_t*		node2,	/*!< in: node to add */
	mtr_t*			mtr)	/*!< in: mini-transaction handle */
{
	ulint		space_id;
	ulint		old_len;
	fil_addr_t	prev_addr;	/* file address of node1 */
	fil_addr_t	new_addr;	/* file address of node2 */
	fil_addr_t	next_addr;	/* file address of node1's old
					successor, possibly null */

	ut_ad(mtr && node1 && node2 && base);
	ut_ad(base != node1);
	ut_ad(base != node2);
	ut_ad(node2 != node1);
	ut_ad(mtr_memo_contains_page(mtr, base, MTR_MEMO_PAGE_X_FIX));
	ut_ad(mtr_memo_contains_page(mtr, node1, MTR_MEMO_PAGE_X_FIX));
	ut_ad(mtr_memo_contains_page(mtr, node2, MTR_MEMO_PAGE_X_FIX));

	/* Both calls store into space_id; the value from node2 is the one
	that remains */
	buf_ptr_get_fsp_addr(node1, &space_id, &prev_addr);
	buf_ptr_get_fsp_addr(node2, &space_id, &new_addr);

	next_addr = flst_get_next_addr(node1, mtr);

	/* Point the new node at its two neighbours */
	flst_write_addr(node2 + FLST_PREV, prev_addr, mtr);
	flst_write_addr(node2 + FLST_NEXT, next_addr, mtr);

	if (fil_addr_is_null(next_addr)) {
		/* node1 used to be the last node: node2 takes over that
		role in the base node */
		flst_write_addr(base + FLST_LAST, new_addr, mtr);
	} else {
		/* Make the old successor point back at node2 */
		ulint		zip_size = fil_space_get_zip_size(space_id);
		flst_node_t*	next_node;

		next_node = fut_get_ptr(space_id, zip_size,
					next_addr, RW_X_LATCH, mtr);

		flst_write_addr(next_node + FLST_PREV, new_addr, mtr);
	}

	/* Make node1 point forward at node2 */
	flst_write_addr(node1 + FLST_NEXT, new_addr, mtr);

	/* Store the increased length in the base node */
	old_len = flst_get_len(base, mtr);

	mlog_write_ulint(base + FLST_LEN, old_len + 1, MLOG_4BYTES, mtr);
}
/*********************************************************************
Creates a rollback segment in the first free slot of the trx system
header, if there is one.
@return pointer to the new rollback segment, or NULL if no free slot
was found */
UNIV_INTERN
trx_rseg_t*
trx_rseg_create(void)
/*=================*/
{
	mtr_t		mtr;
	ulint		free_slot;
	trx_rseg_t*	new_rseg = NULL;

	mtr_start(&mtr);

	/* The latching order requires the file space x-latch to be taken
	before the kernel mutex */
	mtr_x_lock(fil_space_get_latch(TRX_SYS_SPACE, NULL), &mtr);
	mutex_enter(&kernel_mutex);

	free_slot = trx_sysf_rseg_find_free(&mtr);

	if (free_slot != ULINT_UNDEFINED) {
		trx_sysf_t*	sys_header;
		ulint		hdr_page_no;
		ulint		space_id;
		ulint		zip_size;

		hdr_page_no = trx_rseg_header_create(
			TRX_SYS_SPACE, 0, ULINT_MAX, free_slot, &mtr);

		ut_a(hdr_page_no != FIL_NULL);

		ut_ad(!trx_rseg_get_on_id(free_slot));

		sys_header = trx_sysf_get(&mtr);

		space_id = trx_sysf_rseg_get_space(
			sys_header, free_slot, &mtr);

		/* Space 0 always gets zip size 0 */
		if (space_id) {
			zip_size = fil_space_get_zip_size(space_id);
		} else {
			zip_size = 0;
		}

		new_rseg = trx_rseg_mem_create(
			free_slot, space_id, zip_size, hdr_page_no, &mtr);
	}

	mutex_exit(&kernel_mutex);
	mtr_commit(&mtr);

	return(new_rseg);
}
/********************************************************************//**
Appends a node to the end of a list. An empty list is handled by
flst_add_to_empty(); otherwise the node is inserted after the current
last node with flst_insert_after(). */
UNIV_INTERN
void
flst_add_last(
/*==========*/
	flst_base_node_t*	base,	/*!< in: pointer to base node of list */
	flst_node_t*		node,	/*!< in: node to add */
	mtr_t*			mtr)	/*!< in: mini-transaction handle */
{
	ulint		space_id;
	ulint		cur_len;
	fil_addr_t	new_addr;	/* file address of node */
	fil_addr_t	tail_addr;	/* file address of the current
					last node */
	flst_node_t*	tail;

	ut_ad(mtr && base && node);
	ut_ad(base != node);
	ut_ad(mtr_memo_contains_page(mtr, base, MTR_MEMO_PAGE_X_FIX));
	ut_ad(mtr_memo_contains_page(mtr, node, MTR_MEMO_PAGE_X_FIX));

	cur_len = flst_get_len(base, mtr);
	tail_addr = flst_get_last(base, mtr);

	buf_ptr_get_fsp_addr(node, &space_id, &new_addr);

	if (cur_len == 0) {
		/* Empty list: the node becomes its only member */
		flst_add_to_empty(base, node, mtr);

		return;
	}

	/* Non-empty list: locate the current last node ... */
	if (tail_addr.page != new_addr.page) {
		ulint	zip_size = fil_space_get_zip_size(space_id);

		tail = fut_get_ptr(space_id, zip_size, tail_addr,
				   RW_X_LATCH, mtr);
	} else {
		/* ... which is on the same page as the new node */
		tail = page_align(node) + tail_addr.boffset;
	}

	flst_insert_after(base, tail, node, mtr);
}
/********************************************************************//** Issues read requests for pages which recovery wants to read in. */ UNIV_INTERN void buf_read_recv_pages( /*================*/ ibool sync, /*!< in: TRUE if the caller wants this function to wait for the highest address page to get read in, before this function returns */ ulint space, /*!< in: space id */ ulint zip_size, /*!< in: compressed page size in bytes, or 0 */ const ulint* page_nos, /*!< in: array of page numbers to read, with the highest page number the last in the array */ ulint n_stored) /*!< in: number of page numbers in the array */ { ib_int64_t tablespace_version; ulint count; ulint err; ulint i; zip_size = fil_space_get_zip_size(space); if (UNIV_UNLIKELY(zip_size == ULINT_UNDEFINED)) { /* It is a single table tablespace and the .ibd file is missing: do nothing */ /* the log records should be treated here same reason for http://bugs.mysql.com/bug.php?id=43948 */ if (recv_recovery_is_on()) { recv_addr_t* recv_addr; mutex_enter(&(recv_sys->mutex)); if (recv_sys->apply_log_recs == FALSE) { mutex_exit(&(recv_sys->mutex)); goto not_to_recover; } for (i = 0; i < n_stored; i++) { /* recv_get_fil_addr_struct() */ recv_addr = HASH_GET_FIRST(recv_sys->addr_hash, hash_calc_hash(ut_fold_ulint_pair(space, page_nos[i]), recv_sys->addr_hash)); while (recv_addr) { if ((recv_addr->space == space) && (recv_addr->page_no == page_nos[i])) { break; } recv_addr = HASH_GET_NEXT(addr_hash, recv_addr); } if ((recv_addr == NULL) || (recv_addr->state == RECV_BEING_PROCESSED) || (recv_addr->state == RECV_PROCESSED)) { continue; } recv_addr->state = RECV_PROCESSED; ut_a(recv_sys->n_addrs); recv_sys->n_addrs--; } mutex_exit(&(recv_sys->mutex)); fprintf(stderr, " (cannot find space: %lu)", space); } not_to_recover: return; } tablespace_version = fil_space_get_version(space); for (i = 0; i < n_stored; i++) { buf_pool_t* buf_pool; count = 0; os_aio_print_debug = FALSE; buf_pool = buf_pool_get(space, page_nos[i]); 
while (buf_pool->n_pend_reads >= recv_n_pool_free_frames / 2) { os_aio_simulated_wake_handler_threads(); os_thread_sleep(10000); count++; if (count > 1000) { fprintf(stderr, "InnoDB: Error: InnoDB has waited for" " 10 seconds for pending\n" "InnoDB: reads to the buffer pool to" " be finished.\n" "InnoDB: Number of pending reads %lu," " pending pread calls %lu\n", (ulong) buf_pool->n_pend_reads, (ulong)os_file_n_pending_preads); os_aio_print_debug = TRUE; } } os_aio_print_debug = FALSE; if ((i + 1 == n_stored) && sync) { buf_read_page_low(&err, TRUE, BUF_READ_ANY_PAGE, space, zip_size, TRUE, tablespace_version, page_nos[i], NULL); } else { buf_read_page_low(&err, FALSE, BUF_READ_ANY_PAGE | OS_AIO_SIMULATED_WAKE_LATER, space, zip_size, TRUE, tablespace_version, page_nos[i], NULL); } } os_aio_simulated_wake_handler_threads(); /* Flush pages from the end of all the LRU lists if necessary */ buf_flush_free_margins(FALSE); #ifdef UNIV_DEBUG if (buf_debug_prints) { fprintf(stderr, "Recovery applies read-ahead pages %lu\n", (ulong) n_stored); } #endif /* UNIV_DEBUG */ }
/********************************************************************//** Issues read requests for pages which the ibuf module wants to read in, in order to contract the insert buffer tree. Technically, this function is like a read-ahead function. */ UNIV_INTERN void buf_read_ibuf_merge_pages( /*======================*/ ibool sync, /*!< in: TRUE if the caller wants this function to wait for the highest address page to get read in, before this function returns */ const ulint* space_ids, /*!< in: array of space ids */ const ib_int64_t* space_versions,/*!< in: the spaces must have this version number (timestamp), otherwise we discard the read; we use this to cancel reads if DISCARD + IMPORT may have changed the tablespace size */ const ulint* page_nos, /*!< in: array of page numbers to read, with the highest page number the last in the array */ ulint n_stored) /*!< in: number of elements in the arrays */ { ulint i; #ifdef UNIV_IBUF_DEBUG ut_a(n_stored < UNIV_PAGE_SIZE); #endif for (i = 0; i < n_stored; i++) { ulint err; buf_pool_t* buf_pool; ulint zip_size = fil_space_get_zip_size(space_ids[i]); buf_pool = buf_pool_get(space_ids[i], page_nos[i]); while (buf_pool->n_pend_reads > buf_pool->curr_size / BUF_READ_AHEAD_PEND_LIMIT) { os_thread_sleep(500000); } if (UNIV_UNLIKELY(zip_size == ULINT_UNDEFINED)) { goto tablespace_deleted; } buf_read_page_low(&err, sync && (i + 1 == n_stored), BUF_READ_ANY_PAGE, space_ids[i], zip_size, TRUE, space_versions[i], page_nos[i], NULL); if (UNIV_UNLIKELY(err == DB_TABLESPACE_DELETED)) { tablespace_deleted: /* We have deleted or are deleting the single-table tablespace: remove the entries for that page */ ibuf_merge_or_delete_for_page(NULL, space_ids[i], page_nos[i], zip_size, FALSE); } } os_aio_simulated_wake_handler_threads(); /* Flush pages from the end of all the LRU lists if necessary */ buf_flush_free_margins(FALSE); #ifdef UNIV_DEBUG if (buf_debug_prints) { fprintf(stderr, "Ibuf merge read-ahead space %lu pages %lu\n", (ulong) 
space_ids[0], (ulong) n_stored); } #endif /* UNIV_DEBUG */ }
/********************************************************************//** Issues read requests for pages which recovery wants to read in. */ UNIV_INTERN void buf_read_recv_pages( /*================*/ ibool sync, /*!< in: TRUE if the caller wants this function to wait for the highest address page to get read in, before this function returns */ ulint space, /*!< in: space id */ ulint zip_size, /*!< in: compressed page size in bytes, or 0 */ const ulint* page_nos, /*!< in: array of page numbers to read, with the highest page number the last in the array */ ulint n_stored) /*!< in: number of page numbers in the array */ { ib_int64_t tablespace_version; ulint count; ulint err; ulint i; zip_size = fil_space_get_zip_size(space); if (UNIV_UNLIKELY(zip_size == ULINT_UNDEFINED)) { /* It is a single table tablespace and the .ibd file is missing: do nothing */ return; } tablespace_version = fil_space_get_version(space); for (i = 0; i < n_stored; i++) { buf_pool_t* buf_pool; count = 0; os_aio_print_debug = FALSE; buf_pool = buf_pool_get(space, page_nos[i]); while (buf_pool->n_pend_reads >= recv_n_pool_free_frames / 2) { os_aio_simulated_wake_handler_threads(); os_thread_sleep(10000); count++; if (count > 1000) { fprintf(stderr, "InnoDB: Error: InnoDB has waited for" " 10 seconds for pending\n" "InnoDB: reads to the buffer pool to" " be finished.\n" "InnoDB: Number of pending reads %lu," " pending pread calls %lu\n", (ulong) buf_pool->n_pend_reads, (ulong)os_file_n_pending_preads); os_aio_print_debug = TRUE; } } os_aio_print_debug = FALSE; if ((i + 1 == n_stored) && sync) { buf_read_page_low(&err, TRUE, BUF_READ_ANY_PAGE, space, zip_size, TRUE, tablespace_version, page_nos[i]); } else { buf_read_page_low(&err, FALSE, BUF_READ_ANY_PAGE | OS_AIO_SIMULATED_WAKE_LATER, space, zip_size, TRUE, tablespace_version, page_nos[i]); } } os_aio_simulated_wake_handler_threads(); /* Flush pages from the end of all the LRU lists if necessary */ buf_flush_free_margins(); #ifdef UNIV_DEBUG if 
(buf_debug_prints) { fprintf(stderr, "Recovery applies read-ahead pages %lu\n", (ulong) n_stored); } #endif /* UNIV_DEBUG */ }
/********************************************************************//**
Flushes blocks from the flash cache to their tablespaces.

The work is done in four steps: (1) decide how many cache positions to
process, (2) read each flushable block from the flash cache and submit an
asynchronous write to its tablespace, (3) after the writes are synced,
clear the IO_FIX_FLUSH flag of the processed blocks, (4) advance the flush
offset and update the dirty counters.
@return the accumulated data size of the blocks accounted as flushed
(the value added to srv_flash_cache_flush) */
UNIV_INTERN
ulint
fc_flush_to_disk(
/*==================*/
	ibool	do_full_io)	/*!< in: whether do full io capacity */
{
	ulint		flush_distance;
	byte*		page_buf;
	ulint		io_err;
	ulint		page_space;
	ulint		page_offset;
	ulint		ptype;
	ulint		n_scanned;
	ulint		n_unfixed;
	ulint		slot;
	ulint		tbl_zip_size;
	ulint		io_block_off;
	ulint		io_byte_off;
	ulint		cache_blocks = fc_get_size();
	ulint		blk_bytes = fc_get_block_size_byte();
	ulint		flush_start;
	ulint		blk_span;
	fc_block_t*	blk = NULL;
	ulint		n_flushed = 0;

	ut_ad(!mutex_own(&fc->mutex));
	ut_a(fc->flush_buf->free_pos == 0);

	/* step 1: work out how many positions have to be flushed to the
	tablespaces in this call */
	flash_cache_mutex_enter();

	flush_distance = fc_get_distance();
	flush_start = fc->flush_off;

	if (flush_distance == 0) {
		/* Nothing to flush */
		flash_cache_mutex_exit();

		return 0;
	}

	if (recv_recovery_on) {
		if (flush_distance
		    < ((1.0 * srv_flash_cache_write_cache_pct / 100)
		       * cache_blocks)) {
			fc->n_flush_cur = 0;
		} else if (flush_distance
			   < ((1.0 * srv_flash_cache_do_full_io_pct / 100)
			      * cache_blocks)) {
			fc->n_flush_cur = ut_min(PCT_IO_FC(10),
						 flush_distance);
		} else {
			fc->n_flush_cur = ut_min(PCT_IO_FC(100),
						 flush_distance);
		}
	} else if (flush_distance
		   < ((1.0 * srv_flash_cache_write_cache_pct / 100)
		      * cache_blocks)
		   && !do_full_io) {
		/* Below the write cache threshold and no full i/o was
		requested: do not flush now */
		flash_cache_mutex_exit();

		return 0;
	} else if (flush_distance
		   < ((1.0 * srv_flash_cache_do_full_io_pct / 100)
		      * cache_blocks)
		   && !do_full_io) {
		fc->n_flush_cur = PCT_IO_FC(srv_fc_write_cache_flush_pct);
	} else {
		ut_ad((flush_distance
		       > (1.0 * srv_flash_cache_do_full_io_pct / 100)
		       * cache_blocks)
		      || do_full_io);

		fc->n_flush_cur = ut_min(PCT_IO_FC(srv_fc_full_flush_pct),
					 flush_distance);
	}

	flash_cache_mutex_exit();

	/* step 2: submit the block writes with async i/o; each block that
	is written gets IO_FIX_FLUSH set in its io_fix */
	n_scanned = 0;

	while (n_scanned < fc->n_flush_cur) {
		ulint	saved_space;
		ulint	saved_offset;
		ulint	saved_raw_zip;
		ulint	saved_size;
		ulint	saved_fil_off;
#ifdef UNIV_FLASH_CACHE_TRACE
		ulint	saved_is_v4;
#endif
		byte*	io_buf;

		flash_cache_mutex_enter();

		slot = (flush_start + n_scanned) % cache_blocks;
		blk = fc_get_block(slot);

		if (blk == NULL) {
			/* No block at this position */
			n_scanned++;
			flash_cache_mutex_exit();

			continue;
		}

		/* The block mutex is needed because doublewrite may hit
		this block and invalidate it */
		flash_block_mutex_enter(blk->fil_offset);
		flash_cache_mutex_exit();

		blk_span = fc_block_get_data_size(blk);

		if (blk->state != BLOCK_READY_FOR_FLUSH) {
			/* Read-only, merge-written or already flushed
			block */
			ut_a (blk->state == BLOCK_NOT_USED
			      || blk->state == BLOCK_READ_CACHE
			      || blk->state == BLOCK_FLUSHED);

			n_scanned += blk_span;

			flash_block_mutex_exit(blk->fil_offset);

			if (blk->state == BLOCK_NOT_USED) {
				/* fc_block_detach(FALSE, blk) is disabled
				in the original code */
				fc_block_free(blk);
			}

			continue;
		}

		tbl_zip_size = fil_space_get_zip_size(blk->space);

		if (tbl_zip_size == ULINT_UNDEFINED) {
			/* The table has been dropped: just mark the block
			BLOCK_FLUSHED */
#ifdef UNIV_FLASH_CACHE_TRACE
			ut_print_timestamp(fc->f_debug);
			fprintf(fc->f_debug, "space:%lu is droped, the page(%lu, %lu) need not to be flushed.\n",
				(ulong)blk->space, (ulong)blk->space,
				(ulong)blk->offset);
#endif
			blk->state = BLOCK_FLUSHED;

			n_scanned += blk_span;
			n_flushed += blk_span;

			flash_block_mutex_exit(blk->fil_offset);

			continue;
		}

#ifdef UNIV_FLASH_CACHE_TRACE
		if (blk->state != BLOCK_READY_FOR_FLUSH) {
			fc_block_print(blk);
			ut_error;
		}
#endif

		blk->io_fix |= IO_FIX_FLUSH;

		/* The state is set to BLOCK_FLUSHED already here. If it
		were not, doublewrite might hit and invalidate the block
		and reduce the dirty count, and the dirty count would be
		reduced a second time when the flush finishes. */
		blk->state = BLOCK_FLUSHED;

		/* Remember the block attributes: the block may be
		invalidated by doublewrite once the mutex is released */
		saved_space = blk->space;
		saved_offset = blk->offset;
		saved_raw_zip = blk->raw_zip_size;
		saved_size = blk->size;
		saved_fil_off = blk->fil_offset;
#ifdef UNIV_FLASH_CACHE_TRACE
		saved_is_v4 = blk->is_v4_blk;
#endif

		/* Release the block now so that reads can hit it and read
		the data */
		flash_block_mutex_exit(blk->fil_offset);

		/* Only the flush thread updates read_buf and
		flush_off/round; there is a single flush thread, so
		read_buf needs no lock */
		page_buf = fc->flush_buf->buf
			+ fc->flush_buf->free_pos * blk_bytes;

		if (saved_raw_zip > 0) {
			/* Compressed block: read into the separate zip
			buffer and decompress into page_buf below */
			ut_a((saved_size * blk_bytes) == UNIV_PAGE_SIZE);
			io_buf = fc->flush_zip_read_buf;
		} else {
			io_buf = page_buf;
		}

		fc_io_offset(saved_fil_off, &io_block_off, &io_byte_off);

		io_err = fil_io(OS_FILE_READ, TRUE, FLASH_CACHE_SPACE, 0,
				io_block_off, io_byte_off,
				blk_span * blk_bytes, io_buf, NULL);

		if (io_err != DB_SUCCESS) {
			ut_print_timestamp(stderr);
			fprintf(stderr, " InnoDB: Flash cache [Error]: unable to read page from flash cache.\n"
				"flash cache flush offset is:%lu.\n",
				(ulong)(flush_start + n_scanned));
			ut_error;
		}

		/* A block that became BLOCK_NOT_USED in the meantime is
		not written; it is only counted below */
		if ((blk == NULL) || (blk->state != BLOCK_NOT_USED)) {

			/* Decompress the compressed data */
			if (saved_raw_zip > 0) {
#ifdef UNIV_FLASH_CACHE_TRACE
				ulint	blk_zip_size_byte;

				if (saved_is_v4) {
					blk_zip_size_byte = saved_raw_zip
						* fc_get_block_size_byte();
				} else {
					blk_zip_size_byte
						= fc_block_compress_align(
							saved_raw_zip)
						* fc_get_block_size_byte();

					ut_a((ulint)mach_read_from_4(io_buf + FC_ZIP_PAGE_ZIP_RAW_SIZE) == saved_raw_zip);
				}

				ut_a(io_buf);
				ut_a(page_buf);
				ut_a((ulint)mach_read_from_4(io_buf + FC_ZIP_PAGE_HEADER) == FC_ZIP_PAGE_CHECKSUM);
				ut_a((ulint)mach_read_from_4(io_buf + blk_zip_size_byte - FC_ZIP_PAGE_TAILER) == FC_ZIP_PAGE_CHECKSUM);
				ut_a((ulint)mach_read_from_4(io_buf + FC_ZIP_PAGE_SIZE) == blk_zip_size_byte);
				ut_a((ulint)mach_read_from_4(io_buf + FC_ZIP_PAGE_ORIG_SIZE) == UNIV_PAGE_SIZE);
				ut_a((ulint)mach_read_from_4(io_buf + FC_ZIP_PAGE_SPACE) == saved_space);
				ut_a((ulint)mach_read_from_4(io_buf + FC_ZIP_PAGE_OFFSET) == saved_offset);

				/* Only quicklz supports this check */
				if (srv_flash_cache_compress_algorithm == FC_BLOCK_COMPRESS_QUICKLZ) {
					if (saved_is_v4) {
						ut_a(saved_raw_zip * fc_get_block_size_byte()
						     >= (ulint)fc_qlz_size_compressed((const char *)(io_buf + FC_ZIP_PAGE_DATA)));
					} else {
						ut_a(saved_raw_zip
						     == (ulint)fc_qlz_size_compressed((const char *)(io_buf + FC_ZIP_PAGE_DATA)));
					}

					ut_a(UNIV_PAGE_SIZE == fc_qlz_size_decompressed((const char *)(io_buf + FC_ZIP_PAGE_DATA)));
				}
#endif
				fc_block_do_decompress(DECOMPRESS_FLUSH, io_buf,
						       saved_raw_zip, page_buf);
			}

			/* The page header must agree with the block
			attributes saved above */
			page_space = mach_read_from_4(
				page_buf + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
			page_offset = mach_read_from_4(
				page_buf + FIL_PAGE_OFFSET);

			if ((page_space != saved_space)
			    || (page_offset != saved_offset)) {
				ut_print_timestamp(stderr);
				fc_block_print(blk);
				ut_error;
			}

			if (buf_page_is_corrupted(page_buf, tbl_zip_size)) {
				buf_page_print(page_buf, tbl_zip_size,
					       BUF_PAGE_PRINT_NO_CRASH);
				ut_error;
			}

			/* Per-page-type statistics; FIL_PAGE_INDEX is
			counted in slot 1 */
			ptype = fil_page_get_type(page_buf);

			if (ptype == FIL_PAGE_INDEX) {
				ptype = 1;
			}

			srv_flash_cache_flush_detail[ptype]++;

			io_err = fil_io(OS_FILE_WRITE
					| OS_AIO_SIMULATED_WAKE_LATER,
					FALSE, page_space, tbl_zip_size,
					page_offset, 0,
					tbl_zip_size
					? tbl_zip_size : UNIV_PAGE_SIZE,
					page_buf, NULL);

			if (io_err != DB_SUCCESS
			    && io_err != DB_TABLESPACE_DELETED) {
				ut_print_timestamp(stderr);
				fc_block_print(blk);
				ut_error;
			}

			/* Advance by UNIV_PAGE_SIZE / blk_bytes to be
			safe */
			fc->flush_buf->free_pos += UNIV_PAGE_SIZE / blk_bytes;
		}

		n_scanned += blk_span;
		n_flushed += blk_span;

		if ((fc->flush_buf->free_pos + UNIV_PAGE_SIZE / blk_bytes)
		    >= fc->flush_buf->size) {
			/* The flush buffer cannot take another page.
			FIXME: is it safe to change n_flush, as step 3
			will use n_flush */
			fc->n_flush_cur = n_scanned;

			break;
		}
	}

	/* Now flush all the async i/o to disk */
	fc_flush_sync_dbfile();

	/* step 3: every flushed block has been synced to disk; update
	io_fix */
	n_unfixed = 0;

	while (n_unfixed < fc->n_flush_cur) {

		flash_cache_mutex_enter();

		slot = (flush_start + n_unfixed) % cache_blocks;
		blk = fc_get_block(slot);

		if (blk == NULL) {
			n_unfixed++;
			flash_cache_mutex_exit();

			continue;
		}

		/* Block state and io_fix may have been changed by
		doublewrite and lru move */
		flash_block_mutex_enter(blk->fil_offset);
		flash_cache_mutex_exit();

		if (blk->io_fix & IO_FIX_FLUSH) {
			/* The block is already in BLOCK_FLUSHED state */
			blk->io_fix &= ~IO_FIX_FLUSH;
		}

		blk_span = fc_block_get_data_size(blk);

		flash_block_mutex_exit(blk->fil_offset);

		n_unfixed += blk_span;
	}

	/* n_scanned and n_unfixed may differ: the last flushed block may
	have been invalidated by doublewrite, so n_scanned may be the
	larger one */

	/* Add the actually flushed blocks */
	srv_flash_cache_flush += n_flushed;

	/* step 4: update the flash cache status and flush_off, and wake up
	the threads that sleep waiting for space */
	if (n_scanned > 0) {
		ut_a(n_scanned >= n_flushed);

		flash_cache_mutex_enter();

		/* It is safe to advance the flush offset and reduce the
		dirty block count here, as fc_validate does not run */
		fc_inc_flush_off(n_scanned);

		flash_cache_log_mutex_enter();
		fc_log->current_stat->flush_offset = fc->flush_off;
		fc_log->current_stat->flush_round = fc->flush_round;
		flash_cache_log_mutex_exit();

		ut_a(srv_flash_cache_dirty >= n_flushed);
		srv_flash_cache_dirty -= n_flushed;

		srv_fc_flush_should_commit_log_flush++;

		os_event_set(fc->wait_space_event);

		fc->n_flush_cur = 0;

		flash_cache_mutex_exit();
	}

	fc->flush_buf->free_pos = 0;

	return n_flushed;
}
/*******************************************************************//**
Truncates the index tree associated with a row in SYS_INDEXES table: the
old tree, if there is one to drop, is freed and a new tree is created for
the matching index of the table.
@return new root page number, or FIL_NULL on failure */
UNIV_INTERN
ulint
dict_truncate_index_tree(
/*=====================*/
	dict_table_t*	table,	/*!< in: the table the index belongs to */
	ulint		space,	/*!< in: 0=truncate,
				nonzero=create the index tree in the
				given tablespace */
	btr_pcur_t*	pcur,	/*!< in/out: persistent cursor pointing to
				record in the clustered index of
				SYS_INDEXES table. The cursor may be
				repositioned in this call. */
	mtr_t*		mtr)	/*!< in: mtr having the latch
				on the record page. The mtr may be
				committed and restarted in this call. */
{
	ibool		free_old_tree = !space;
	rec_t*		sys_rec;
	const byte*	field;
	ulint		field_len;
	ulint		tree_root;
	ulint		zip_size;
	ulint		index_type;
	index_id_t	wanted_id;
	dict_index_t*	match;

	ut_ad(mutex_own(&(dict_sys->mutex)));
	ut_a(!dict_table_is_comp(dict_sys->sys_indexes));

	sys_rec = btr_pcur_get_rec(pcur);

	/* Root page number currently stored in the record */
	field = rec_get_nth_field_old(sys_rec,
				      DICT_SYS_INDEXES_PAGE_NO_FIELD,
				      &field_len);
	ut_ad(field_len == 4);

	tree_root = mtr_read_ulint(field, MLOG_4BYTES, mtr);

	if (free_old_tree && tree_root == FIL_NULL) {
		/* The tree has been freed: report it and only create */

		ut_print_timestamp(stderr);
		fprintf(stderr, "  InnoDB: Trying to TRUNCATE"
			" a missing index of table %s!\n", table->name);

		free_old_tree = FALSE;
	}

	field = rec_get_nth_field_old(sys_rec,
				      DICT_SYS_INDEXES_SPACE_NO_FIELD,
				      &field_len);
	ut_ad(field_len == 4);

	if (free_old_tree) {
		/* When truncating, the space id comes from the record */
		space = mtr_read_ulint(field, MLOG_4BYTES, mtr);
	}

	zip_size = fil_space_get_zip_size(space);

	if (UNIV_UNLIKELY(zip_size == ULINT_UNDEFINED)) {
		/* Single-table tablespace whose .ibd file is missing:
		nothing can be done */

		ut_print_timestamp(stderr);
		fprintf(stderr, "  InnoDB: Trying to TRUNCATE"
			" a missing .ibd file of table %s!\n", table->name);

		return(FIL_NULL);
	}

	field = rec_get_nth_field_old(sys_rec, DICT_SYS_INDEXES_TYPE_FIELD,
				      &field_len);
	ut_ad(field_len == 4);
	index_type = mach_read_from_4(field);

	/* Field 1 of the record holds the 8-byte index id */
	field = rec_get_nth_field_old(sys_rec, 1, &field_len);
	ut_ad(field_len == 8);
	wanted_id = mach_read_from_8(field);

	if (free_old_tree) {
		/* First release every page except the root; this part may
		use several mini-transactions */
		btr_free_but_not_root(space, zip_size, tree_root);

		/* The root page is released in the same mini-transaction
		that writes FIL_NULL into the SYS_INDEXES record below */
		btr_block_get(space, zip_size, tree_root, RW_X_LATCH,
			      NULL, mtr);

		btr_free_root(space, zip_size, tree_root, mtr);
	}

	/* FIL_NULL is written to the PAGE_NO field temporarily, so that
	the database does not get into an inconsistent state if it crashes
	between the mtr_commit() below and the following mtr_commit()
	call */
	page_rec_write_field(sys_rec, DICT_SYS_INDEXES_PAGE_NO_FIELD,
			     FIL_NULL, mtr);

	/* The mini-transaction has to be committed to avoid deadlocks in
	the btr_create() call; otherwise pages would be freed and
	allocated in the same mini-transaction */
	btr_pcur_store_position(pcur, mtr);
	mtr_commit(mtr);

	mtr_start(mtr);
	btr_pcur_restore_position(BTR_MODIFY_LEAF, pcur, mtr);

	/* Look for the index of the table that this SYS_INDEXES record
	describes */
	match = UT_LIST_GET_FIRST(table->indexes);

	while (match && !(match->id == wanted_id)) {
		match = UT_LIST_GET_NEXT(indexes, match);
	}

	if (match) {
		tree_root = btr_create(index_type, space, zip_size,
				       wanted_id, match, mtr);
		match->page = (unsigned int) tree_root;

		return(tree_root);
	}

	ut_print_timestamp(stderr);
	fprintf(stderr,
		"  InnoDB: Index %llu of table %s is missing\n"
		"InnoDB: from the data dictionary during TRUNCATE!\n",
		(ullint) wanted_id,
		table->name);

	return(FIL_NULL);
}
/********************************************************************//**
Validates a file-based list by walking it once forwards and once
backwards for the number of steps stored in the base node, and asserting
that each walk ends on a null address.
@return TRUE if ok */
UNIV_INTERN
ibool
flst_validate(
/*==========*/
	const flst_base_node_t*	base,	/*!< in: pointer to base node of list */
	mtr_t*			mtr1)	/*!< in: mtr */
{
	ulint			space_id;
	ulint			zip_size;
	ulint			list_len;
	ulint			n_left;
	fil_addr_t		base_addr;
	fil_addr_t		cur_addr;
	const flst_node_t*	cur;
	mtr_t			mtr2;

	ut_ad(base);
	ut_ad(mtr_memo_contains_page(mtr1, base, MTR_MEMO_PAGE_X_FIX));

	/* Two mini-transaction handles are in play. mtr1 holds the latch
	on the base node and so keeps other threads from modifying the
	list. mtr2 is used for the traversal and is committed after every
	node: on a long list the x-latched pages could otherwise fill the
	buffer and cause a deadlock. */

	/* Find out the space id */
	buf_ptr_get_fsp_addr(base, &space_id, &base_addr);

	zip_size = fil_space_get_zip_size(space_id);

	list_len = flst_get_len(base, mtr1);

	/* Forward walk from the first node */
	cur_addr = flst_get_first(base, mtr1);

	for (n_left = list_len; n_left > 0; n_left--) {
		mtr_start(&mtr2);

		cur = fut_get_ptr(space_id, zip_size,
				  cur_addr, RW_X_LATCH, &mtr2);
		cur_addr = flst_get_next_addr(cur, &mtr2);

		mtr_commit(&mtr2);
	}

	ut_a(fil_addr_is_null(cur_addr));

	/* Backward walk from the last node */
	cur_addr = flst_get_last(base, mtr1);

	for (n_left = list_len; n_left > 0; n_left--) {
		mtr_start(&mtr2);

		cur = fut_get_ptr(space_id, zip_size,
				  cur_addr, RW_X_LATCH, &mtr2);
		cur_addr = flst_get_prev_addr(cur, &mtr2);

		mtr_commit(&mtr2);
	}

	ut_a(fil_addr_is_null(cur_addr));

	return(TRUE);
}
/********************************************************************//**
Unlinks node2 from a list: its neighbours (or the first/last fields of
the base node, where a neighbour does not exist) are made to skip it, and
the length stored in the base node is decremented by one. */
UNIV_INTERN
void
flst_remove(
/*========*/
	flst_base_node_t*	base,	/*!< in: pointer to base node of list */
	flst_node_t*		node2,	/*!< in: node to remove */
	mtr_t*			mtr)	/*!< in: mini-transaction handle */
{
	ulint		space_id;
	ulint		zip_size;
	ulint		old_len;
	fil_addr_t	prev_addr;	/* file address of the predecessor */
	fil_addr_t	self_addr;	/* file address of node2 */
	fil_addr_t	next_addr;	/* file address of the successor */

	ut_ad(mtr && node2 && base);
	ut_ad(mtr_memo_contains_page(mtr, base, MTR_MEMO_PAGE_X_FIX));
	ut_ad(mtr_memo_contains_page(mtr, node2, MTR_MEMO_PAGE_X_FIX));

	buf_ptr_get_fsp_addr(node2, &space_id, &self_addr);
	zip_size = fil_space_get_zip_size(space_id);

	prev_addr = flst_get_prev_addr(node2, mtr);
	next_addr = flst_get_next_addr(node2, mtr);

	if (fil_addr_is_null(prev_addr)) {
		/* node2 was the first node: the base node now starts at
		the successor */
		flst_write_addr(base + FLST_FIRST, next_addr, mtr);
	} else {
		flst_node_t*	prev_node;

		if (prev_addr.page != self_addr.page) {
			prev_node = fut_get_ptr(space_id, zip_size,
						prev_addr, RW_X_LATCH, mtr);
		} else {
			/* Predecessor is on the same page as node2 */
			prev_node = page_align(node2) + prev_addr.boffset;
		}

		ut_ad(prev_node != node2);

		/* The predecessor now points forward past node2 */
		flst_write_addr(prev_node + FLST_NEXT, next_addr, mtr);
	}

	if (fil_addr_is_null(next_addr)) {
		/* node2 was the last node: the base node now ends at the
		predecessor */
		flst_write_addr(base + FLST_LAST, prev_addr, mtr);
	} else {
		flst_node_t*	next_node;

		if (next_addr.page != self_addr.page) {
			next_node = fut_get_ptr(space_id, zip_size,
						next_addr, RW_X_LATCH, mtr);
		} else {
			/* Successor is on the same page as node2 */
			next_node = page_align(node2) + next_addr.boffset;
		}

		ut_ad(node2 != next_node);

		/* The successor now points back past node2 */
		flst_write_addr(next_node + FLST_PREV, prev_addr, mtr);
	}

	/* Store the decreased length in the base node */
	old_len = flst_get_len(base, mtr);
	ut_ad(old_len > 0);

	mlog_write_ulint(base + FLST_LEN, old_len - 1, MLOG_4BYTES, mtr);
}