/**********************************************************************
Releases rw_lock_debug_mutex. If the rw_lock_debug_waiters flag is set
at that point, the flag is cleared and rw_lock_debug_event is set. */
void
rw_lock_debug_mutex_exit(void)
/*==========================*/
{
	mutex_exit(&rw_lock_debug_mutex);

	if (!rw_lock_debug_waiters) {
		/* Waiters flag is not set: there is nothing to signal. */
		return;
	}

	rw_lock_debug_waiters = FALSE;
	os_event_set(rw_lock_debug_event);
}
/****************************************************************//**
Appends a work item to the end of the queue's item list and then sets
the queue's event. Both steps run while wq->mutex is held. */
UNIV_INTERN
void
ib_wqueue_add(
/*==========*/
	ib_wqueue_t*	wq,	/*!< in: work queue */
	void*		item,	/*!< in: work item */
	mem_heap_t*	heap)	/*!< in: memory heap to use for
				allocating the list node */
{
	mutex_enter(&(wq->mutex));

	/* Enqueue first, signal second: the event is set only once the
	item is already in the list. */
	ib_list_add_last(wq->items, item, heap);
	os_event_set(wq->event);

	mutex_exit(&(wq->mutex));
}
/**********************************************************************
Updates the flush bookkeeping of the buffer pool for a block: resets the
block's oldest_modification, unlinks the block from the flush list,
decrements the pending counter of the block's flush type and, when that
counter reaches zero while no batch of the type is being initialized,
sets the corresponding no_flush event. In UNIV_SYNC_DEBUG builds it is
asserted that the caller owns the buffer pool mutex. */
void
buf_flush_write_complete(
/*=====================*/
	buf_block_t*	block)	/* in: pointer to the block in question */
{
	ut_ad(block);
#ifdef UNIV_SYNC_DEBUG
	ut_ad(mutex_own(&(buf_pool->mutex)));
#endif /* UNIV_SYNC_DEBUG */
	ut_a(block->state == BUF_BLOCK_FILE_PAGE);

	/* The block no longer carries an unflushed modification */
	block->oldest_modification = ut_dulint_zero;

	UT_LIST_REMOVE(flush_list, buf_pool->flush_list, block);

	ut_d(UT_LIST_VALIDATE(flush_list, buf_block_t, buf_pool->flush_list));

	buf_pool->n_flush[block->flush_type]--;

	if (block->flush_type == BUF_FLUSH_LRU) {
		/* Put the block to the end of the LRU list to wait to be
		moved to the free list */

		buf_LRU_make_block_old(block);

		buf_pool->LRU_flush_ended++;
	}

	if (buf_pool->n_flush[block->flush_type] != 0) {
		/* Writes of this flush type are still pending */
		return;
	}

	if (buf_pool->init_flush[block->flush_type] != FALSE) {
		/* A batch of this type is still being initialized */
		return;
	}

	/* The running flush batch has ended */
	os_event_set(buf_pool->no_flush[block->flush_type]);
}
/**********************************************************************
Runs one flush batch of the given type. Repeatedly scans the LRU list or
the flush list from its end for a block that buf_flush_ready_for_flush()
accepts, and passes the page together with its neighbors to
buf_flush_try_neighbors(). The batch stops when at least min_n pages
have been counted, when no flushable block is found, or (flush list
only) when the last block of the list is missing or has an
oldest_modification not smaller than lsn_limit. Only one batch per flush
type may run at a time. */
ulint
buf_flush_batch(
/*============*/
				/* out: number of blocks for which the write
				request was queued; ULINT_UNDEFINED if there
				was a flush of the same type already running */
	ulint	flush_type,	/* in: BUF_FLUSH_LRU or BUF_FLUSH_LIST; if
				BUF_FLUSH_LIST, then the caller must not own
				any latches on pages */
	ulint	min_n,		/* in: wished minimum number of blocks flushed
				(it is not guaranteed that the actual number
				is that big, though) */
	dulint	lsn_limit)	/* in: in the case BUF_FLUSH_LIST all blocks
				whose oldest_modification is smaller than
				this should be flushed (if their number does
				not exceed min_n), otherwise ignored */
{
	buf_block_t*	block;
	ulint		page_count	= 0;
	ulint		count_before;
	ulint		page_space;
	ulint		page_no;
	ibool		flushed_one;

	ut_ad((flush_type == BUF_FLUSH_LRU)
	      || (flush_type == BUF_FLUSH_LIST));
#ifdef UNIV_SYNC_DEBUG
	ut_ad((flush_type != BUF_FLUSH_LIST)
	      || sync_thread_levels_empty_gen(TRUE));
#endif /* UNIV_SYNC_DEBUG */
	mutex_enter(&(buf_pool->mutex));

	if ((buf_pool->n_flush[flush_type] > 0)
	    || (buf_pool->init_flush[flush_type] == TRUE)) {

		/* There is already a flush batch of the same type running */

		mutex_exit(&(buf_pool->mutex));

		return(ULINT_UNDEFINED);
	}

	buf_pool->init_flush[flush_type] = TRUE;

	/* Keep going until enough pages have been counted */
	while (page_count < min_n) {

		/* Every pass restarts from the END of the list: the list
		may change while the buffer pool mutex is released during
		flushing, so no pointer into the list can be safely carried
		over from one pass to the next. */

		if (flush_type == BUF_FLUSH_LRU) {
			block = UT_LIST_GET_LAST(buf_pool->LRU);
		} else {
			ut_ad(flush_type == BUF_FLUSH_LIST);

			block = UT_LIST_GET_LAST(buf_pool->flush_list);

			if (block == NULL
			    || (ut_dulint_cmp(block->oldest_modification,
					      lsn_limit) >= 0)) {
				/* We have flushed enough */

				break;
			}
		}

		flushed_one = FALSE;

		/* Walk towards the head of the list until one flushable
		block has been handled */
		while (block != NULL) {
			ut_a(block->state == BUF_BLOCK_FILE_PAGE);

			mutex_enter(&block->mutex);

			if (buf_flush_ready_for_flush(block, flush_type)) {

				/* Copy the page identity while the block
				mutex is still held, then drop both
				mutexes before flushing */
				page_space = block->space;
				page_no = block->offset;

				mutex_exit(&block->mutex);
				mutex_exit(&(buf_pool->mutex));

				/* Kept from the original; its only reader
				was a disabled debug trace */
				count_before = page_count;

				/* Try to flush also all the neighbors */
				page_count += buf_flush_try_neighbors(
					page_space, page_no, flush_type);

				mutex_enter(&(buf_pool->mutex));

				flushed_one = TRUE;

				break;
			}

			mutex_exit(&block->mutex);

			if (flush_type == BUF_FLUSH_LRU) {
				block = UT_LIST_GET_PREV(LRU, block);
			} else {
				ut_ad(flush_type == BUF_FLUSH_LIST);

				block = UT_LIST_GET_PREV(flush_list, block);
			}
		}

		/* If we could not find anything to flush, leave the loop */

		if (!flushed_one) {
			break;
		}
	}

	buf_pool->init_flush[flush_type] = FALSE;

	if ((buf_pool->n_flush[flush_type] == 0)
	    && (buf_pool->init_flush[flush_type] == FALSE)) {

		/* The running flush batch has ended */

		os_event_set(buf_pool->no_flush[flush_type]);
	}

	mutex_exit(&(buf_pool->mutex));

	buf_flush_buffered_writes();

#ifdef UNIV_DEBUG
	if (buf_debug_prints && page_count > 0) {
		ut_a(flush_type == BUF_FLUSH_LRU
		     || flush_type == BUF_FLUSH_LIST);
		fprintf(stderr, flush_type == BUF_FLUSH_LRU
			? "Flushed %lu pages in LRU flush\n"
			: "Flushed %lu pages in flush list flush\n",
			(ulong) page_count);
	}
#endif /* UNIV_DEBUG */

	srv_buf_pool_flushed += page_count;

	return(page_count);
}
buf_block_t* buf_LRU_get_free_block(void) /*========================*/ /* out: the free control block; also if AWE is used, it is guaranteed that the block has its page mapped to a frame when we return */ { buf_block_t* block = NULL; ibool freed; ulint n_iterations = 1; ibool mon_value_was = FALSE; ibool started_monitor = FALSE; loop: mutex_enter(&(buf_pool->mutex)); if (!recv_recovery_on && UT_LIST_GET_LEN(buf_pool->free) + UT_LIST_GET_LEN(buf_pool->LRU) < buf_pool->max_size / 20) { ut_print_timestamp(stderr); fprintf(stderr, " InnoDB: ERROR: over 95 percent of the buffer pool" " is occupied by\n" "InnoDB: lock heaps or the adaptive hash index!" " Check that your\n" "InnoDB: transactions do not set too many row locks.\n" "InnoDB: Your buffer pool size is %lu MB." " Maybe you should make\n" "InnoDB: the buffer pool bigger?\n" "InnoDB: We intentionally generate a seg fault" " to print a stack trace\n" "InnoDB: on Linux!\n", (ulong) (buf_pool->curr_size / (1024 * 1024 / UNIV_PAGE_SIZE))); ut_error; } else if (!recv_recovery_on && UT_LIST_GET_LEN(buf_pool->free) + UT_LIST_GET_LEN(buf_pool->LRU) < buf_pool->max_size / 3) { if (!buf_lru_switched_on_innodb_mon) { /* Over 67 % of the buffer pool is occupied by lock heaps or the adaptive hash index. This may be a memory leak! */ ut_print_timestamp(stderr); fprintf(stderr, " InnoDB: WARNING: over 67 percent of" " the buffer pool is occupied by\n" "InnoDB: lock heaps or the adaptive" " hash index! Check that your\n" "InnoDB: transactions do not set too many" " row locks.\n" "InnoDB: Your buffer pool size is %lu MB." 
" Maybe you should make\n" "InnoDB: the buffer pool bigger?\n" "InnoDB: Starting the InnoDB Monitor to print" " diagnostics, including\n" "InnoDB: lock heap and hash index sizes.\n", (ulong) (buf_pool->curr_size / (1024 * 1024 / UNIV_PAGE_SIZE))); buf_lru_switched_on_innodb_mon = TRUE; srv_print_innodb_monitor = TRUE; os_event_set(srv_lock_timeout_thread_event); } } else if (buf_lru_switched_on_innodb_mon) { /* Switch off the InnoDB Monitor; this is a simple way to stop the monitor if the situation becomes less urgent, but may also surprise users if the user also switched on the monitor! */ buf_lru_switched_on_innodb_mon = FALSE; srv_print_innodb_monitor = FALSE; } /* If there is a block in the free list, take it */ if (UT_LIST_GET_LEN(buf_pool->free) > 0) { block = UT_LIST_GET_FIRST(buf_pool->free); ut_a(block->in_free_list); UT_LIST_REMOVE(free, buf_pool->free, block); block->in_free_list = FALSE; ut_a(block->state != BUF_BLOCK_FILE_PAGE); ut_a(!block->in_LRU_list); if (srv_use_awe) { if (block->frame) { /* Remove from the list of mapped pages */ UT_LIST_REMOVE(awe_LRU_free_mapped, buf_pool->awe_LRU_free_mapped, block); } else { /* We map the page to a frame; second param FALSE below because we do not want it to be added to the awe_LRU_free_mapped list */ buf_awe_map_page_to_frame(block, FALSE); } } mutex_enter(&block->mutex); block->state = BUF_BLOCK_READY_FOR_USE; UNIV_MEM_ALLOC(block->frame, UNIV_PAGE_SIZE); mutex_exit(&block->mutex); mutex_exit(&(buf_pool->mutex)); if (started_monitor) { srv_print_innodb_monitor = mon_value_was; } return(block); } /* If no block was in the free list, search from the end of the LRU list and try to free a block there */ mutex_exit(&(buf_pool->mutex)); freed = buf_LRU_search_and_free_block(n_iterations); if (freed > 0) { goto loop; } if (n_iterations > 30) { ut_print_timestamp(stderr); fprintf(stderr, "InnoDB: Warning: difficult to find free blocks from\n" "InnoDB: the buffer pool (%lu search iterations)!" 
" Consider\n" "InnoDB: increasing the buffer pool size.\n" "InnoDB: It is also possible that" " in your Unix version\n" "InnoDB: fsync is very slow, or" " completely frozen inside\n" "InnoDB: the OS kernel. Then upgrading to" " a newer version\n" "InnoDB: of your operating system may help." " Look at the\n" "InnoDB: number of fsyncs in diagnostic info below.\n" "InnoDB: Pending flushes (fsync) log: %lu;" " buffer pool: %lu\n" "InnoDB: %lu OS file reads, %lu OS file writes," " %lu OS fsyncs\n" "InnoDB: Starting InnoDB Monitor to print further\n" "InnoDB: diagnostics to the standard output.\n", (ulong) n_iterations, (ulong) fil_n_pending_log_flushes, (ulong) fil_n_pending_tablespace_flushes, (ulong) os_n_file_reads, (ulong) os_n_file_writes, (ulong) os_n_fsyncs); mon_value_was = srv_print_innodb_monitor; started_monitor = TRUE; srv_print_innodb_monitor = TRUE; os_event_set(srv_lock_timeout_thread_event); } /* No free block was found: try to flush the LRU list */ buf_flush_free_margin(); ++srv_buf_pool_wait_free; os_aio_simulated_wake_handler_threads(); mutex_enter(&(buf_pool->mutex)); if (buf_pool->LRU_flush_ended > 0) { /* We have written pages in an LRU flush. To make the insert buffer more efficient, we try to move these pages to the free list. */ mutex_exit(&(buf_pool->mutex)); buf_LRU_try_free_flushed_blocks(); } else { mutex_exit(&(buf_pool->mutex)); } if (n_iterations > 10) { os_thread_sleep(500000); } n_iterations++; goto loop; }
/********************************************************************//**
Flush pages from the flash cache to their tablespaces.

The function works in four steps:
 1. under the flash cache mutex, decide how many cache blocks to cover in
    this call (fc->n_flush_cur), based on the distance reported by
    fc_get_distance(), the configured percentage thresholds, do_full_io
    and whether recovery is running;
 2. walk the cache starting at fc->flush_off, read each block that is in
    state BLOCK_READY_FOR_FLUSH back from the flash cache (decompressing
    it when raw_zip_size > 0), verify it, and queue a write of the page to
    its tablespace;
 3. after fc_flush_sync_dbfile(), clear IO_FIX_FLUSH on the covered blocks;
 4. advance the flush offset, update the log statistics and the dirty
    counter, and set fc->wait_space_event.

@return	c_flush, i.e. the sum of fc_block_get_data_size() over the blocks
that were counted as flushed in step 2 (this includes blocks whose
tablespace no longer exists); 0 if nothing needed to be flushed */
UNIV_INTERN
ulint
fc_flush_to_disk(
/*==================*/
	ibool do_full_io)	/*!< in: whether do full io capacity */
{
	ulint distance;
	byte* page;
	ulint ret;
	ulint space;
	ulint offset;
	ulint page_type;
	ulint i, j;
	ulint pos;
	ulint zip_size;
	ulint block_offset, byte_offset;
	ulint fc_size = fc_get_size();
	ulint fc_blk_size = fc_get_block_size_byte();
	ulint start_offset;
	ulint data_size;
	fc_block_t *flush_block = NULL;
	ulint c_flush = 0;

	ut_ad(!mutex_own(&fc->mutex));
	/* the flush buffer must be empty on entry; it is reset to empty
	again before returning at the end of this function */
	ut_a(fc->flush_buf->free_pos == 0);

	/* step 1: get the number of blocks that need to be flushed to the
	tablespace */
	flash_cache_mutex_enter();

	distance = fc_get_distance();
	start_offset = fc->flush_off;

	if ( distance == 0 ) {
		/* nothing to flush */
		flash_cache_mutex_exit();
		return 0;
	} else if ( recv_recovery_on ) {
		/* during recovery fixed percentages (0, 10 or 100) are passed
		to PCT_IO_FC() instead of the srv_fc_*_flush_pct settings, and
		do_full_io is not consulted */
		if ( distance < (( 1.0 * srv_flash_cache_write_cache_pct /100 ) * fc_size)) {
			fc->n_flush_cur = 0;
		} else if ( distance < ( ( 1.0*srv_flash_cache_do_full_io_pct /100 ) * fc_size)) {
			fc->n_flush_cur = ut_min(PCT_IO_FC(10), distance);
		} else {
			fc->n_flush_cur = ut_min(PCT_IO_FC(100), distance);
		}
	} else if ( distance < (( 1.0 * srv_flash_cache_write_cache_pct /100 ) * fc_size) && !do_full_io ) {
		/* below the write cache threshold and full io was not
		requested: do nothing in this call */
		flash_cache_mutex_exit();
		return 0;
	} else if ( distance < (( 1.0 * srv_flash_cache_do_full_io_pct/100 ) * fc_size) && !do_full_io ) {
		/* NOTE(review): unlike the other branches this value is not
		capped by distance with ut_min() -- confirm that this is
		intended */
		fc->n_flush_cur = PCT_IO_FC(srv_fc_write_cache_flush_pct);
	} else {
		ut_ad((distance > ( 1.0 * srv_flash_cache_do_full_io_pct/100 ) * fc_size) || do_full_io );
		fc->n_flush_cur = ut_min(PCT_IO_FC(srv_fc_full_flush_pct), distance);
	}

	flash_cache_mutex_exit();

	/* step 2: start to flush blocks using async io, set block io_fix
	IO_FIX_FLUSH */
	i = 0;
	while (i < fc->n_flush_cur) {
		/* copies of the block fields, taken while the block mutex is
		held, so that they can be used after the mutex is released */
		ulint b_space;
		ulint b_offset;
		ulint raw_zip_size;
		ulint size;
		ulint fil_offset;
#ifdef UNIV_FLASH_CACHE_TRACE
		ulint is_v4_blk;
#endif
		/* destination of the read from the flash cache: either the
		flush buffer slot itself or, for compressed blocks,
		fc->flush_zip_read_buf */
		byte* page_io;

		flash_cache_mutex_enter();
		pos = ( start_offset + i ) % fc_size;
		flush_block = fc_get_block(pos);

		if (flush_block == NULL) {
			/* no block at this position: advance by one */
			i++;
			flash_cache_mutex_exit();
			continue;
		}

		/* we should get the block mutex, as doublewrite may hit this
		block and invalidate it; the block mutex is taken before the
		cache mutex is released */
		flash_block_mutex_enter(flush_block->fil_offset);

		flash_cache_mutex_exit();

		data_size = fc_block_get_data_size(flush_block);

		if (flush_block->state != BLOCK_READY_FOR_FLUSH) {
			/* if readonly or merge write or already flushed */
			ut_a (flush_block->state == BLOCK_NOT_USED
				|| flush_block->state == BLOCK_READ_CACHE
				|| flush_block->state == BLOCK_FLUSHED);

			/* the block is skipped: i advances, c_flush does not */
			i += data_size;

			flash_block_mutex_exit(flush_block->fil_offset);
			/* NOTE(review): the state is read and the block freed
			after the block mutex has been released -- confirm
			that no other thread can touch the block here */
			if (flush_block->state == BLOCK_NOT_USED) {
				//fc_block_detach(FALSE, flush_block);
				fc_block_free(flush_block);
			}

			continue;
		}

		zip_size = fil_space_get_zip_size(flush_block->space);
		if (zip_size == ULINT_UNDEFINED) {
			/* table has been dropped, just set it BLOCK_FLUSHED;
			the block is still counted in c_flush */
#ifdef UNIV_FLASH_CACHE_TRACE
			ut_print_timestamp(fc->f_debug);
			fprintf(fc->f_debug, "space:%lu is droped, the page(%lu, %lu) need not to be flushed.\n",
			(ulong)flush_block->space, (ulong)flush_block->space, (ulong)flush_block->offset);
#endif
			flush_block->state = BLOCK_FLUSHED;
			i += data_size;
			c_flush += data_size;
			flash_block_mutex_exit(flush_block->fil_offset);
			continue;
		}

#ifdef UNIV_FLASH_CACHE_TRACE
		/* trace builds only: re-check the state that was already
		tested above */
		if (flush_block->state != BLOCK_READY_FOR_FLUSH) {
			fc_block_print(flush_block);
			ut_error;
		}
#endif

		flush_block->io_fix |= IO_FIX_FLUSH;

		/*
		 * we should set the block state to BLOCK_FLUSHED; if not,
		 * doublewrite may hit this block, invalidate it and reduce
		 * the dirty count, but when the flush finishes we will reduce
		 * the dirty count too, so it may be reduced twice.
		 */
		flush_block->state = BLOCK_FLUSHED;

		/* save the block info, as the block may be invalidated by
		doublewrite after the mutex is released */
		b_space = flush_block->space;
		b_offset = flush_block->offset;

		raw_zip_size = flush_block->raw_zip_size;
		size = flush_block->size;
		fil_offset = flush_block->fil_offset;
#ifdef UNIV_FLASH_CACHE_TRACE
		is_v4_blk = flush_block->is_v4_blk;
#endif

		/* release the block now, so reads can hit this block and
		read the data */
		flash_block_mutex_exit(flush_block->fil_offset);

		/*
		 * Only the flush thread will update read_buf and
		 * flush_off/round. There is only a single flush thread, so
		 * there is no need to lock read_buf.
		 */
		page = fc->flush_buf->buf + fc->flush_buf->free_pos * fc_blk_size;

		if (raw_zip_size > 0) {
			/* compressed block: read into the separate zip buffer
			and decompress into page below */
			ut_a((size * fc_blk_size) == UNIV_PAGE_SIZE);
			page_io = fc->flush_zip_read_buf;
		} else {
			page_io = page;
		}

		/* read the block data back from the flash cache file */
		fc_io_offset(fil_offset, &block_offset, &byte_offset);
		ret = fil_io(OS_FILE_READ, TRUE, FLASH_CACHE_SPACE, 0,
				block_offset, byte_offset, data_size * fc_blk_size,
				page_io, NULL);

		if (ret != DB_SUCCESS) {
			ut_print_timestamp(stderr);
			fprintf(stderr, " InnoDB: Flash cache [Error]: unable to read page from flash cache.\n"
				"flash cache flush offset is:%lu.\n", (ulong)(start_offset + i));
			ut_error;
		}

		/* the block became BLOCK_NOT_USED while the block mutex was
		released: skip verification and the tablespace write, but
		still count the block at the skip label */
		if ((flush_block != NULL) && (flush_block->state == BLOCK_NOT_USED)) {
			goto skip;
		}

		/* decompress the compressed data */
		if (raw_zip_size > 0) {
#ifdef UNIV_FLASH_CACHE_TRACE
			/* trace builds only: check the header, tailer, sizes
			and page identity stored in the compressed block */
			ulint blk_zip_size_byte;
			if (is_v4_blk) {
				blk_zip_size_byte = raw_zip_size * fc_get_block_size_byte();
			} else {
				blk_zip_size_byte = fc_block_compress_align(raw_zip_size) * fc_get_block_size_byte();
				ut_a((ulint)mach_read_from_4(page_io + FC_ZIP_PAGE_ZIP_RAW_SIZE) == raw_zip_size);
			}

			ut_a(page_io);
			ut_a(page);
			ut_a((ulint)mach_read_from_4(page_io + FC_ZIP_PAGE_HEADER) == FC_ZIP_PAGE_CHECKSUM);
			ut_a((ulint)mach_read_from_4(page_io + blk_zip_size_byte - FC_ZIP_PAGE_TAILER) == FC_ZIP_PAGE_CHECKSUM);
			ut_a((ulint)mach_read_from_4(page_io + FC_ZIP_PAGE_SIZE) == blk_zip_size_byte);
			ut_a((ulint)mach_read_from_4(page_io + FC_ZIP_PAGE_ORIG_SIZE) == UNIV_PAGE_SIZE);
			ut_a((ulint)mach_read_from_4(page_io + FC_ZIP_PAGE_SPACE) == b_space);
			ut_a((ulint)mach_read_from_4(page_io + FC_ZIP_PAGE_OFFSET) == b_offset);

			/* only qlz can do this check */
			if (srv_flash_cache_compress_algorithm == FC_BLOCK_COMPRESS_QUICKLZ) {
				if (is_v4_blk) {
					ut_a(raw_zip_size * fc_get_block_size_byte()
						>= (ulint)fc_qlz_size_compressed((const char *)(page_io + FC_ZIP_PAGE_DATA)));
				} else {
					ut_a(raw_zip_size
						== (ulint)fc_qlz_size_compressed((const char *)(page_io + FC_ZIP_PAGE_DATA)));
				}

				ut_a(UNIV_PAGE_SIZE == fc_qlz_size_decompressed((const char *)(page_io + FC_ZIP_PAGE_DATA)));
			}
#endif
			fc_block_do_decompress(DECOMPRESS_FLUSH, page_io, raw_zip_size, page);
		}

		/* the page header must identify the same page as the block
		info saved above */
		space = mach_read_from_4(page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
		offset = mach_read_from_4(page + FIL_PAGE_OFFSET);

		if ((space != b_space) || (offset != b_offset)) {
			ut_print_timestamp(stderr);
			fc_block_print(flush_block);
			ut_error;
		}

		if (buf_page_is_corrupted(page, zip_size)) {
			buf_page_print(page, zip_size, BUF_PAGE_PRINT_NO_CRASH);
			ut_error;
		}

		/* per page type flush statistics; FIL_PAGE_INDEX is counted
		in slot 1 */
		page_type = fil_page_get_type(page);
		if (page_type == FIL_PAGE_INDEX) {
			page_type = 1;
		}
		srv_flash_cache_flush_detail[page_type]++;

		/* queue the write of the page to its tablespace; a deleted
		tablespace is tolerated */
		ret = fil_io(OS_FILE_WRITE | OS_AIO_SIMULATED_WAKE_LATER, FALSE, space,
				zip_size, offset, 0, zip_size ? zip_size : UNIV_PAGE_SIZE, page, NULL);
		if (ret != DB_SUCCESS && ret != DB_TABLESPACE_DELETED) {
			ut_print_timestamp(stderr);
			fc_block_print(flush_block);
			ut_error;
		}

		/* add UNIV_PAGE_SIZE / fc_blk_size to be safe */
		fc->flush_buf->free_pos += UNIV_PAGE_SIZE / fc_blk_size;

skip:
		i += data_size;
		c_flush += data_size;

		/* stop when the flush buffer has no room for one more page */
		if ((fc->flush_buf->free_pos + UNIV_PAGE_SIZE / fc_blk_size) >= fc->flush_buf->size) {
			/* FIXME: is it safe to change n_flush, as step 3 will
			use n_flush */
			fc->n_flush_cur = i;
			break;
		}
	}

	/* ok, now flush all async io to disk */
	fc_flush_sync_dbfile();

	/* step 3: all the flushed blocks have been synced to disk, update
	the state and io_fix */
	j = 0;
	while (j < fc->n_flush_cur) {

		flash_cache_mutex_enter();
		pos = (start_offset + j) % fc_size;
		flush_block = fc_get_block(pos);

		if (flush_block == NULL) {
			j++;
			flash_cache_mutex_exit();
			continue;
		}

		/* block state and io_fix may be changed by doublewrite and
		lru move */
		flash_block_mutex_enter(flush_block->fil_offset);
		flash_cache_mutex_exit();

		if (flush_block->io_fix & IO_FIX_FLUSH) {
			/* the block is already in BLOCK_FLUSHED state */
			flush_block->io_fix &= ~IO_FIX_FLUSH;
		}

		data_size = fc_block_get_data_size(flush_block);
		flash_block_mutex_exit(flush_block->fil_offset);

		j += data_size;
	}

	/*
	 * i and j may be different, as the last flushed block may have been
	 * invalidated by doublewrite, so maybe i > j
	 */

	/* add the actual flushed blocks */
	srv_flash_cache_flush = srv_flash_cache_flush + c_flush;

	/* step 4: update fc status and flush_off, and wake up threads that
	are sleeping for space */
	if (i > 0) {
		/* i also counts blocks that were skipped because they were
		not in state BLOCK_READY_FOR_FLUSH, so it can exceed c_flush */
		ut_a(i >= c_flush);

		flash_cache_mutex_enter();

		/*
		 * it is safe to inc flush off and sub dirty blocks at this
		 * time, as fc_validate is not working
		 */
		fc_inc_flush_off(i);

		/* publish the new flush position in the log statistics */
		flash_cache_log_mutex_enter();
		fc_log->current_stat->flush_offset = fc->flush_off;
		fc_log->current_stat->flush_round = fc->flush_round;
		flash_cache_log_mutex_exit();

		ut_a(srv_flash_cache_dirty >= c_flush);
		srv_flash_cache_dirty -= c_flush;

		srv_fc_flush_should_commit_log_flush++;
		os_event_set(fc->wait_space_event);

		fc->n_flush_cur = 0;

		flash_cache_mutex_exit();
	}

	/* leave the flush buffer empty for the next call */
	fc->flush_buf->free_pos = 0;

	return c_flush;
}