void buf_read_ibuf_merge_pages( /*======================*/ ibool sync, /* in: TRUE if the caller wants this function to wait for the highest address page to get read in, before this function returns */ ulint* space_ids, /* in: array of space ids */ ib_longlong* space_versions,/* in: the spaces must have this version number (timestamp), otherwise we discard the read; we use this to cancel reads if DISCARD + IMPORT may have changed the tablespace size */ ulint* page_nos, /* in: array of page numbers to read, with the highest page number the last in the array */ ulint n_stored) /* in: number of page numbers in the array */ { ulint err; ulint i; ut_ad(!ibuf_inside()); #ifdef UNIV_IBUF_DEBUG ut_a(n_stored < UNIV_PAGE_SIZE); #endif while (buf_pool->n_pend_reads > buf_pool->curr_size / BUF_READ_AHEAD_PEND_LIMIT) { os_thread_sleep(500000); } for (i = 0; i < n_stored; i++) { buf_read_page_low(&err, (i + 1 == n_stored) && sync, BUF_READ_ANY_PAGE, space_ids[i], space_versions[i], page_nos[i]); if (err == DB_TABLESPACE_DELETED) { /* We have deleted or are deleting the single-table tablespace: remove the entries for that page */ ibuf_merge_or_delete_for_page(NULL, space_ids[i], page_nos[i], FALSE); } } os_aio_simulated_wake_handler_threads(); /* Flush pages from the end of the LRU list if necessary */ buf_flush_free_margin(); #ifdef UNIV_DEBUG if (buf_debug_prints) { fprintf(stderr, "Ibuf merge read-ahead space %lu pages %lu\n", (ulong) space_ids[0], (ulong) n_stored); } #endif /* UNIV_DEBUG */ }
/********************************************************************//** Flush a batch of writes to the datafiles that have already been written by the OS. */ UNIV_INTERN void fc_flush_sync_dbfile(void) /*==========================*/ { /* Wake possible simulated aio thread to actually post the writes to the operating system */ os_aio_simulated_wake_handler_threads(); /* Wait that all async writes to tablespaces have been posted to the OS */ os_aio_wait_until_no_pending_writes(); /* Now we flush the data to disk (for example, with fsync) */ fil_flush_file_spaces(FIL_TABLESPACE); return; }
/********************************************************************//** Issues read requests for pages which recovery wants to read in. */ UNIV_INTERN void buf_read_recv_pages( /*================*/ ibool sync, /*!< in: TRUE if the caller wants this function to wait for the highest address page to get read in, before this function returns */ ulint space, /*!< in: space id */ ulint zip_size, /*!< in: compressed page size in bytes, or 0 */ const ulint* page_nos, /*!< in: array of page numbers to read, with the highest page number the last in the array */ ulint n_stored) /*!< in: number of page numbers in the array */ { ib_int64_t tablespace_version; ulint count; ulint err; ulint i; zip_size = fil_space_get_zip_size(space); if (UNIV_UNLIKELY(zip_size == ULINT_UNDEFINED)) { /* It is a single table tablespace and the .ibd file is missing: do nothing */ /* the log records should be treated here same reason for http://bugs.mysql.com/bug.php?id=43948 */ if (recv_recovery_is_on()) { recv_addr_t* recv_addr; mutex_enter(&(recv_sys->mutex)); if (recv_sys->apply_log_recs == FALSE) { mutex_exit(&(recv_sys->mutex)); goto not_to_recover; } for (i = 0; i < n_stored; i++) { /* recv_get_fil_addr_struct() */ recv_addr = HASH_GET_FIRST(recv_sys->addr_hash, hash_calc_hash(ut_fold_ulint_pair(space, page_nos[i]), recv_sys->addr_hash)); while (recv_addr) { if ((recv_addr->space == space) && (recv_addr->page_no == page_nos[i])) { break; } recv_addr = HASH_GET_NEXT(addr_hash, recv_addr); } if ((recv_addr == NULL) || (recv_addr->state == RECV_BEING_PROCESSED) || (recv_addr->state == RECV_PROCESSED)) { continue; } recv_addr->state = RECV_PROCESSED; ut_a(recv_sys->n_addrs); recv_sys->n_addrs--; } mutex_exit(&(recv_sys->mutex)); fprintf(stderr, " (cannot find space: %lu)", space); } not_to_recover: return; } tablespace_version = fil_space_get_version(space); for (i = 0; i < n_stored; i++) { buf_pool_t* buf_pool; count = 0; os_aio_print_debug = FALSE; buf_pool = buf_pool_get(space, page_nos[i]); while (buf_pool->n_pend_reads >= recv_n_pool_free_frames / 2) { os_aio_simulated_wake_handler_threads(); os_thread_sleep(10000); count++; if (count > 1000) { fprintf(stderr, "InnoDB: Error: InnoDB has waited for" " 10 seconds for pending\n" "InnoDB: reads to the buffer pool to" " be finished.\n" "InnoDB: Number of pending reads %lu," " pending pread calls %lu\n", (ulong) buf_pool->n_pend_reads, (ulong)os_file_n_pending_preads); os_aio_print_debug = TRUE; } } os_aio_print_debug = FALSE; if ((i + 1 == n_stored) && sync) { buf_read_page_low(&err, TRUE, BUF_READ_ANY_PAGE, space, zip_size, TRUE, tablespace_version, page_nos[i], NULL); } else { buf_read_page_low(&err, FALSE, BUF_READ_ANY_PAGE | OS_AIO_SIMULATED_WAKE_LATER, space, zip_size, TRUE, tablespace_version, page_nos[i], NULL); } } os_aio_simulated_wake_handler_threads(); /* Flush pages from the end of all the LRU lists if necessary */ buf_flush_free_margins(FALSE); #ifdef UNIV_DEBUG if (buf_debug_prints) { fprintf(stderr, "Recovery applies read-ahead pages %lu\n", (ulong) n_stored); } #endif /* UNIV_DEBUG */ }
/********************************************************************//** Issues read requests for pages which the ibuf module wants to read in, in order to contract the insert buffer tree. Technically, this function is like a read-ahead function. */ UNIV_INTERN void buf_read_ibuf_merge_pages( /*======================*/ ibool sync, /*!< in: TRUE if the caller wants this function to wait for the highest address page to get read in, before this function returns */ const ulint* space_ids, /*!< in: array of space ids */ const ib_int64_t* space_versions,/*!< in: the spaces must have this version number (timestamp), otherwise we discard the read; we use this to cancel reads if DISCARD + IMPORT may have changed the tablespace size */ const ulint* page_nos, /*!< in: array of page numbers to read, with the highest page number the last in the array */ ulint n_stored) /*!< in: number of elements in the arrays */ { ulint i; #ifdef UNIV_IBUF_DEBUG ut_a(n_stored < UNIV_PAGE_SIZE); #endif for (i = 0; i < n_stored; i++) { ulint err; buf_pool_t* buf_pool; ulint zip_size = fil_space_get_zip_size(space_ids[i]); buf_pool = buf_pool_get(space_ids[i], page_nos[i]); while (buf_pool->n_pend_reads > buf_pool->curr_size / BUF_READ_AHEAD_PEND_LIMIT) { os_thread_sleep(500000); } if (UNIV_UNLIKELY(zip_size == ULINT_UNDEFINED)) { goto tablespace_deleted; } buf_read_page_low(&err, sync && (i + 1 == n_stored), BUF_READ_ANY_PAGE, space_ids[i], zip_size, TRUE, space_versions[i], page_nos[i], NULL); if (UNIV_UNLIKELY(err == DB_TABLESPACE_DELETED)) { tablespace_deleted: /* We have deleted or are deleting the single-table tablespace: remove the entries for that page */ ibuf_merge_or_delete_for_page(NULL, space_ids[i], page_nos[i], zip_size, FALSE); } } os_aio_simulated_wake_handler_threads(); /* Flush pages from the end of all the LRU lists if necessary */ buf_flush_free_margins(FALSE); #ifdef UNIV_DEBUG if (buf_debug_prints) { fprintf(stderr, "Ibuf merge read-ahead space %lu pages %lu\n", (ulong) space_ids[0], (ulong) n_stored); } #endif /* UNIV_DEBUG */ }
/********************************************************************//** Applies linear read-ahead if in the buf_pool the page is a border page of a linear read-ahead area and all the pages in the area have been accessed. Does not read any page if the read-ahead mechanism is not activated. Note that the algorithm looks at the 'natural' adjacent successor and predecessor of the page, which on the leaf level of a B-tree are the next and previous page in the chain of leaves. To know these, the page specified in (space, offset) must already be present in the buf_pool. Thus, the natural way to use this function is to call it when a page in the buf_pool is accessed the first time, calling this function just after it has been bufferfixed. NOTE 1: as this function looks at the natural predecessor and successor fields on the page, what happens, if these are not initialized to any sensible value? No problem, before applying read-ahead we check that the area to read is within the span of the space, if not, read-ahead is not applied. An uninitialized value may result in a useless read operation, but only very improbably. NOTE 2: the calling thread may own latches on pages: to avoid deadlocks this function must be written such that it cannot end up waiting for these latches! NOTE 3: the calling thread must want access to the page given: this rule is set to prevent unintended read-aheads performed by ibuf routines, a situation which could result in a deadlock if the OS does not support asynchronous io. @return number of page read requests issued */ UNIV_INTERN ulint buf_read_ahead_linear( /*==================*/ ulint space, /*!< in: space id */ ulint zip_size, /*!< in: compressed page size in bytes, or 0 */ ulint offset, /*!< in: page number; see NOTE 3 above */ ibool inside_ibuf, /*!< in: TRUE if we are inside ibuf routine */ trx_t* trx) { buf_pool_t* buf_pool = buf_pool_get(space, offset); ib_int64_t tablespace_version; buf_page_t* bpage; buf_frame_t* frame; buf_page_t* pred_bpage = NULL; ulint pred_offset; ulint succ_offset; ulint count; int asc_or_desc; ulint new_offset; ulint fail_count; ulint ibuf_mode; ulint low, high; ulint err; ulint i; const ulint buf_read_ahead_linear_area = BUF_READ_AHEAD_AREA(buf_pool); ulint threshold; if (!(srv_read_ahead & 2)) { return(0); } if (UNIV_UNLIKELY(srv_startup_is_before_trx_rollback_phase)) { /* No read-ahead to avoid thread deadlocks */ return(0); } low = (offset / buf_read_ahead_linear_area) * buf_read_ahead_linear_area; high = (offset / buf_read_ahead_linear_area + 1) * buf_read_ahead_linear_area; if ((offset != low) && (offset != high - 1)) { /* This is not a border page of the area: return */ return(0); } if (ibuf_bitmap_page(zip_size, offset) || trx_sys_hdr_page(space, offset)) { /* If it is an ibuf bitmap page or trx sys hdr, we do no read-ahead, as that could break the ibuf page access order */ return(0); } /* Remember the tablespace version before we ask te tablespace size below: if DISCARD + IMPORT changes the actual .ibd file meanwhile, we do not try to read outside the bounds of the tablespace! */ tablespace_version = fil_space_get_version(space); buf_pool_mutex_enter(buf_pool); if (high > fil_space_get_size(space)) { buf_pool_mutex_exit(buf_pool); /* The area is not whole, return */ return(0); } if (buf_pool->n_pend_reads > buf_pool->curr_size / BUF_READ_AHEAD_PEND_LIMIT) { buf_pool_mutex_exit(buf_pool); return(0); } buf_pool_mutex_exit(buf_pool); /* Check that almost all pages in the area have been accessed; if offset == low, the accesses must be in a descending order, otherwise, in an ascending order. */ asc_or_desc = 1; if (offset == low) { asc_or_desc = -1; } /* How many out of order accessed pages can we ignore when working out the access pattern for linear readahead */ threshold = ut_min((64 - srv_read_ahead_threshold), BUF_READ_AHEAD_AREA(buf_pool)); fail_count = 0; rw_lock_s_lock(&buf_pool->page_hash_latch); for (i = low; i < high; i++) { bpage = buf_page_hash_get(buf_pool, space, i); if (bpage == NULL || !buf_page_is_accessed(bpage)) { /* Not accessed */ fail_count++; } else if (pred_bpage) { /* Note that buf_page_is_accessed() returns the time of the first access. If some blocks of the extent existed in the buffer pool at the time of a linear access pattern, the first access times may be nonmonotonic, even though the latest access times were linear. The threshold (srv_read_ahead_factor) should help a little against this. */ int res = ut_ulint_cmp( buf_page_is_accessed(bpage), buf_page_is_accessed(pred_bpage)); /* Accesses not in the right order */ if (res != 0 && res != asc_or_desc) { fail_count++; } } if (fail_count > threshold) { /* Too many failures: return */ //buf_pool_mutex_exit(buf_pool); rw_lock_s_unlock(&buf_pool->page_hash_latch); return(0); } if (bpage && buf_page_is_accessed(bpage)) { pred_bpage = bpage; } } /* If we got this far, we know that enough pages in the area have been accessed in the right order: linear read-ahead can be sensible */ bpage = buf_page_hash_get(buf_pool, space, offset); if (bpage == NULL) { //buf_pool_mutex_exit(buf_pool); rw_lock_s_unlock(&buf_pool->page_hash_latch); return(0); } switch (buf_page_get_state(bpage)) { case BUF_BLOCK_ZIP_PAGE: frame = bpage->zip.data; break; case BUF_BLOCK_FILE_PAGE: frame = ((buf_block_t*) bpage)->frame; break; default: ut_error; break; } /* Read the natural predecessor and successor page addresses from the page; NOTE that because the calling thread may have an x-latch on the page, we do not acquire an s-latch on the page, this is to prevent deadlocks. Even if we read values which are nonsense, the algorithm will work. */ pred_offset = fil_page_get_prev(frame); succ_offset = fil_page_get_next(frame); //buf_pool_mutex_exit(buf_pool); rw_lock_s_unlock(&buf_pool->page_hash_latch); if ((offset == low) && (succ_offset == offset + 1)) { /* This is ok, we can continue */ new_offset = pred_offset; } else if ((offset == high - 1) && (pred_offset == offset - 1)) { /* This is ok, we can continue */ new_offset = succ_offset; } else { /* Successor or predecessor not in the right order */ return(0); } low = (new_offset / buf_read_ahead_linear_area) * buf_read_ahead_linear_area; high = (new_offset / buf_read_ahead_linear_area + 1) * buf_read_ahead_linear_area; if ((new_offset != low) && (new_offset != high - 1)) { /* This is not a border page of the area: return */ return(0); } if (high > fil_space_get_size(space)) { /* The area is not whole, return */ return(0); } /* If we got this far, read-ahead can be sensible: do it */ ibuf_mode = inside_ibuf ? BUF_READ_IBUF_PAGES_ONLY | OS_AIO_SIMULATED_WAKE_LATER : BUF_READ_ANY_PAGE | OS_AIO_SIMULATED_WAKE_LATER; count = 0; /* Since Windows XP seems to schedule the i/o handler thread very eagerly, and consequently it does not wait for the full read batch to be posted, we use special heuristics here */ os_aio_simulated_put_read_threads_to_sleep(); for (i = low; i < high; i++) { /* It is only sensible to do read-ahead in the non-sync aio mode: hence FALSE as the first parameter */ if (!ibuf_bitmap_page(zip_size, i)) { count += buf_read_page_low( &err, FALSE, ibuf_mode, space, zip_size, FALSE, tablespace_version, i, trx); if (err == DB_TABLESPACE_DELETED) { ut_print_timestamp(stderr); fprintf(stderr, " InnoDB: Warning: in" " linear readahead trying to access\n" "InnoDB: tablespace %lu page %lu,\n" "InnoDB: but the tablespace does not" " exist or is just being dropped.\n", (ulong) space, (ulong) i); } } } /* In simulated aio we wake the aio handler threads only after queuing all aio requests, in native aio the following call does nothing: */ os_aio_simulated_wake_handler_threads(); /* Flush pages from the end of the LRU list if necessary */ buf_flush_free_margin(buf_pool, TRUE); #ifdef UNIV_DEBUG if (buf_debug_prints && (count > 0)) { fprintf(stderr, "LINEAR read-ahead space %lu offset %lu pages %lu\n", (ulong) space, (ulong) offset, (ulong) count); } #endif /* UNIV_DEBUG */ /* Read ahead is considered one I/O operation for the purpose of LRU policy decision. */ buf_LRU_stat_inc_io(); buf_pool->stat.n_ra_pages_read += count; return(count); }
/********************************************************************//** Applies a random read-ahead in buf_pool if there are at least a threshold value of accessed pages from the random read-ahead area. Does not read any page, not even the one at the position (space, offset), if the read-ahead mechanism is not activated. NOTE 1: the calling thread may own latches on pages: to avoid deadlocks this function must be written such that it cannot end up waiting for these latches! NOTE 2: the calling thread must want access to the page given: this rule is set to prevent unintended read-aheads performed by ibuf routines, a situation which could result in a deadlock if the OS does not support asynchronous i/o. @return number of page read requests issued; NOTE that if we read ibuf pages, it may happen that the page at the given page number does not get read even if we return a positive value! @return number of page read requests issued */ UNIV_INTERN ulint buf_read_ahead_random( /*==================*/ ulint space, /*!< in: space id */ ulint zip_size, /*!< in: compressed page size in bytes, or 0 */ ulint offset, /*!< in: page number of a page which the current thread wants to access */ ibool inside_ibuf, /*!< in: TRUE if we are inside ibuf routine */ trx_t* trx) { buf_pool_t* buf_pool = buf_pool_get(space, offset); ib_int64_t tablespace_version; ulint recent_blocks = 0; ulint ibuf_mode; ulint count; ulint low, high; ulint err; ulint i; const ulint buf_read_ahead_random_area = BUF_READ_AHEAD_AREA(buf_pool); if (!srv_random_read_ahead) { /* Disabled by user */ return(0); } if (srv_startup_is_before_trx_rollback_phase) { /* No read-ahead to avoid thread deadlocks */ return(0); } if (ibuf_bitmap_page(zip_size, offset) || trx_sys_hdr_page(space, offset)) { /* If it is an ibuf bitmap page or trx sys hdr, we do no read-ahead, as that could break the ibuf page access order */ return(0); } /* Remember the tablespace version before we ask te tablespace size below: if DISCARD + IMPORT changes the actual .ibd file meanwhile, we do not try to read outside the bounds of the tablespace! */ tablespace_version = fil_space_get_version(space); low = (offset / buf_read_ahead_random_area) * buf_read_ahead_random_area; high = (offset / buf_read_ahead_random_area + 1) * buf_read_ahead_random_area; if (high > fil_space_get_size(space)) { high = fil_space_get_size(space); } buf_pool_mutex_enter(buf_pool); if (buf_pool->n_pend_reads > buf_pool->curr_size / BUF_READ_AHEAD_PEND_LIMIT) { buf_pool_mutex_exit(buf_pool); return(0); } /* Count how many blocks in the area have been recently accessed, that is, reside near the start of the LRU list. */ for (i = low; i < high; i++) { const buf_page_t* bpage = buf_page_hash_get(buf_pool, space, i); if (bpage && buf_page_is_accessed(bpage) && buf_page_peek_if_young(bpage)) { recent_blocks++; if (recent_blocks >= BUF_READ_AHEAD_RANDOM_THRESHOLD(buf_pool)) { buf_pool_mutex_exit(buf_pool); goto read_ahead; } } } buf_pool_mutex_exit(buf_pool); /* Do nothing */ return(0); read_ahead: /* Read all the suitable blocks within the area */ if (inside_ibuf) { ibuf_mode = BUF_READ_IBUF_PAGES_ONLY; } else { ibuf_mode = BUF_READ_ANY_PAGE; } count = 0; for (i = low; i < high; i++) { /* It is only sensible to do read-ahead in the non-sync aio mode: hence FALSE as the first parameter */ if (!ibuf_bitmap_page(zip_size, i)) { count += buf_read_page_low( &err, FALSE, ibuf_mode | OS_AIO_SIMULATED_WAKE_LATER, space, zip_size, FALSE, tablespace_version, i, trx); if (err == DB_TABLESPACE_DELETED) { ut_print_timestamp(stderr); fprintf(stderr, " InnoDB: Warning: in random" " readahead trying to access\n" "InnoDB: tablespace %lu page %lu,\n" "InnoDB: but the tablespace does not" " exist or is just being dropped.\n", (ulong) space, (ulong) i); } } } /* In simulated aio we wake the aio handler threads only after queuing all aio requests, in native aio the following call does nothing: */ os_aio_simulated_wake_handler_threads(); #ifdef UNIV_DEBUG if (buf_debug_prints && (count > 0)) { fprintf(stderr, "Random read-ahead space %lu offset %lu pages %lu\n", (ulong) space, (ulong) offset, (ulong) count); } #endif /* UNIV_DEBUG */ /* Read ahead is considered one I/O operation for the purpose of LRU policy decision. */ buf_LRU_stat_inc_io(); buf_pool->stat.n_ra_pages_read_rnd += count; srv_buf_pool_reads += count; return(count); }
/************************************************************************ Flushes possible buffered writes from the doublewrite memory buffer to disk, and also wakes up the aio thread if simulated aio is used. It is very important to call this function after a batch of writes has been posted, and also when we may have to wait for a page latch! Otherwise a deadlock of threads can occur. */ static void buf_flush_buffered_writes(void) /*===========================*/ { buf_block_t* block; byte* write_buf; ulint len; ulint len2; ulint i; if (!srv_use_doublewrite_buf || trx_doublewrite == NULL) { os_aio_simulated_wake_handler_threads(); return; } mutex_enter(&(trx_doublewrite->mutex)); /* Write first to doublewrite buffer blocks. We use synchronous aio and thus know that file write has been completed when the control returns. */ if (trx_doublewrite->first_free == 0) { mutex_exit(&(trx_doublewrite->mutex)); return; } for (i = 0; i < trx_doublewrite->first_free; i++) { block = trx_doublewrite->buf_block_arr[i]; ut_a(block->state == BUF_BLOCK_FILE_PAGE); if (mach_read_from_4(block->frame + FIL_PAGE_LSN + 4) != mach_read_from_4(block->frame + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM + 4)) { ut_print_timestamp(stderr); fprintf(stderr, " InnoDB: ERROR: The page to be written" " seems corrupt!\n" "InnoDB: The lsn fields do not match!" " Noticed in the buffer pool\n" "InnoDB: before posting to the" " doublewrite buffer.\n"); } if (block->check_index_page_at_flush && !page_simple_validate(block->frame)) { buf_page_print(block->frame); ut_print_timestamp(stderr); fprintf(stderr, " InnoDB: Apparent corruption of an" " index page n:o %lu in space %lu\n" "InnoDB: to be written to data file." " We intentionally crash server\n" "InnoDB: to prevent corrupt data" " from ending up in data\n" "InnoDB: files.\n", (ulong) block->offset, (ulong) block->space); ut_error; } } /* increment the doublewrite flushed pages counter */ srv_dblwr_pages_written+= trx_doublewrite->first_free; srv_dblwr_writes++; if (trx_doublewrite->first_free > TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) { len = TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE; } else { len = trx_doublewrite->first_free * UNIV_PAGE_SIZE; } fil_io(OS_FILE_WRITE, TRUE, TRX_SYS_SPACE, trx_doublewrite->block1, 0, len, (void*)trx_doublewrite->write_buf, NULL); write_buf = trx_doublewrite->write_buf; for (len2 = 0; len2 + UNIV_PAGE_SIZE <= len; len2 += UNIV_PAGE_SIZE) { if (mach_read_from_4(write_buf + len2 + FIL_PAGE_LSN + 4) != mach_read_from_4(write_buf + len2 + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM + 4)) { ut_print_timestamp(stderr); fprintf(stderr, " InnoDB: ERROR: The page to be written" " seems corrupt!\n" "InnoDB: The lsn fields do not match!" " Noticed in the doublewrite block1.\n"); } } if (trx_doublewrite->first_free > TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) { len = (trx_doublewrite->first_free - TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) * UNIV_PAGE_SIZE; fil_io(OS_FILE_WRITE, TRUE, TRX_SYS_SPACE, trx_doublewrite->block2, 0, len, (void*)(trx_doublewrite->write_buf + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE), NULL); write_buf = trx_doublewrite->write_buf + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE; for (len2 = 0; len2 + UNIV_PAGE_SIZE <= len; len2 += UNIV_PAGE_SIZE) { if (mach_read_from_4(write_buf + len2 + FIL_PAGE_LSN + 4) != mach_read_from_4(write_buf + len2 + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM + 4)) { ut_print_timestamp(stderr); fprintf(stderr, " InnoDB: ERROR: The page to be" " written seems corrupt!\n" "InnoDB: The lsn fields do not match!" " Noticed in" " the doublewrite block2.\n"); } } } /* Now flush the doublewrite buffer data to disk */ fil_flush(TRX_SYS_SPACE); /* We know that the writes have been flushed to disk now and in recovery we will find them in the doublewrite buffer blocks. Next do the writes to the intended positions. */ for (i = 0; i < trx_doublewrite->first_free; i++) { block = trx_doublewrite->buf_block_arr[i]; if (mach_read_from_4(block->frame + FIL_PAGE_LSN + 4) != mach_read_from_4(block->frame + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM + 4)) { ut_print_timestamp(stderr); fprintf(stderr, " InnoDB: ERROR: The page to be written" " seems corrupt!\n" "InnoDB: The lsn fields do not match!" " Noticed in the buffer pool\n" "InnoDB: after posting and flushing" " the doublewrite buffer.\n" "InnoDB: Page buf fix count %lu," " io fix %lu, state %lu\n", (ulong)block->buf_fix_count, (ulong)block->io_fix, (ulong)block->state); } ut_a(block->state == BUF_BLOCK_FILE_PAGE); fil_io(OS_FILE_WRITE | OS_AIO_SIMULATED_WAKE_LATER, FALSE, block->space, block->offset, 0, UNIV_PAGE_SIZE, (void*)block->frame, (void*)block); } /* Wake possible simulated aio thread to actually post the writes to the operating system */ os_aio_simulated_wake_handler_threads(); /* Wait that all async writes to tablespaces have been posted to the OS */ os_aio_wait_until_no_pending_writes(); /* Now we flush the data to disk (for example, with fsync) */ fil_flush_file_spaces(FIL_TABLESPACE); /* We can now reuse the doublewrite memory buffer: */ trx_doublewrite->first_free = 0; mutex_exit(&(trx_doublewrite->mutex)); }
buf_block_t* buf_LRU_get_free_block(void) /*========================*/ /* out: the free control block; also if AWE is used, it is guaranteed that the block has its page mapped to a frame when we return */ { buf_block_t* block = NULL; ibool freed; ulint n_iterations = 1; ibool mon_value_was = FALSE; ibool started_monitor = FALSE; loop: mutex_enter(&(buf_pool->mutex)); if (!recv_recovery_on && UT_LIST_GET_LEN(buf_pool->free) + UT_LIST_GET_LEN(buf_pool->LRU) < buf_pool->max_size / 20) { ut_print_timestamp(stderr); fprintf(stderr, " InnoDB: ERROR: over 95 percent of the buffer pool" " is occupied by\n" "InnoDB: lock heaps or the adaptive hash index!" " Check that your\n" "InnoDB: transactions do not set too many row locks.\n" "InnoDB: Your buffer pool size is %lu MB." " Maybe you should make\n" "InnoDB: the buffer pool bigger?\n" "InnoDB: We intentionally generate a seg fault" " to print a stack trace\n" "InnoDB: on Linux!\n", (ulong) (buf_pool->curr_size / (1024 * 1024 / UNIV_PAGE_SIZE))); ut_error; } else if (!recv_recovery_on && UT_LIST_GET_LEN(buf_pool->free) + UT_LIST_GET_LEN(buf_pool->LRU) < buf_pool->max_size / 3) { if (!buf_lru_switched_on_innodb_mon) { /* Over 67 % of the buffer pool is occupied by lock heaps or the adaptive hash index. This may be a memory leak! */ ut_print_timestamp(stderr); fprintf(stderr, " InnoDB: WARNING: over 67 percent of" " the buffer pool is occupied by\n" "InnoDB: lock heaps or the adaptive" " hash index! Check that your\n" "InnoDB: transactions do not set too many" " row locks.\n" "InnoDB: Your buffer pool size is %lu MB." " Maybe you should make\n" "InnoDB: the buffer pool bigger?\n" "InnoDB: Starting the InnoDB Monitor to print" " diagnostics, including\n" "InnoDB: lock heap and hash index sizes.\n", (ulong) (buf_pool->curr_size / (1024 * 1024 / UNIV_PAGE_SIZE))); buf_lru_switched_on_innodb_mon = TRUE; srv_print_innodb_monitor = TRUE; os_event_set(srv_lock_timeout_thread_event); } } else if (buf_lru_switched_on_innodb_mon) { /* Switch off the InnoDB Monitor; this is a simple way to stop the monitor if the situation becomes less urgent, but may also surprise users if the user also switched on the monitor! */ buf_lru_switched_on_innodb_mon = FALSE; srv_print_innodb_monitor = FALSE; } /* If there is a block in the free list, take it */ if (UT_LIST_GET_LEN(buf_pool->free) > 0) { block = UT_LIST_GET_FIRST(buf_pool->free); ut_a(block->in_free_list); UT_LIST_REMOVE(free, buf_pool->free, block); block->in_free_list = FALSE; ut_a(block->state != BUF_BLOCK_FILE_PAGE); ut_a(!block->in_LRU_list); if (srv_use_awe) { if (block->frame) { /* Remove from the list of mapped pages */ UT_LIST_REMOVE(awe_LRU_free_mapped, buf_pool->awe_LRU_free_mapped, block); } else { /* We map the page to a frame; second param FALSE below because we do not want it to be added to the awe_LRU_free_mapped list */ buf_awe_map_page_to_frame(block, FALSE); } } mutex_enter(&block->mutex); block->state = BUF_BLOCK_READY_FOR_USE; UNIV_MEM_ALLOC(block->frame, UNIV_PAGE_SIZE); mutex_exit(&block->mutex); mutex_exit(&(buf_pool->mutex)); if (started_monitor) { srv_print_innodb_monitor = mon_value_was; } return(block); } /* If no block was in the free list, search from the end of the LRU list and try to free a block there */ mutex_exit(&(buf_pool->mutex)); freed = buf_LRU_search_and_free_block(n_iterations); if (freed > 0) { goto loop; } if (n_iterations > 30) { ut_print_timestamp(stderr); fprintf(stderr, "InnoDB: Warning: difficult to find free blocks from\n" "InnoDB: the buffer pool (%lu search iterations)!" " Consider\n" "InnoDB: increasing the buffer pool size.\n" "InnoDB: It is also possible that" " in your Unix version\n" "InnoDB: fsync is very slow, or" " completely frozen inside\n" "InnoDB: the OS kernel. Then upgrading to" " a newer version\n" "InnoDB: of your operating system may help." " Look at the\n" "InnoDB: number of fsyncs in diagnostic info below.\n" "InnoDB: Pending flushes (fsync) log: %lu;" " buffer pool: %lu\n" "InnoDB: %lu OS file reads, %lu OS file writes," " %lu OS fsyncs\n" "InnoDB: Starting InnoDB Monitor to print further\n" "InnoDB: diagnostics to the standard output.\n", (ulong) n_iterations, (ulong) fil_n_pending_log_flushes, (ulong) fil_n_pending_tablespace_flushes, (ulong) os_n_file_reads, (ulong) os_n_file_writes, (ulong) os_n_fsyncs); mon_value_was = srv_print_innodb_monitor; started_monitor = TRUE; srv_print_innodb_monitor = TRUE; os_event_set(srv_lock_timeout_thread_event); } /* No free block was found: try to flush the LRU list */ buf_flush_free_margin(); ++srv_buf_pool_wait_free; os_aio_simulated_wake_handler_threads(); mutex_enter(&(buf_pool->mutex)); if (buf_pool->LRU_flush_ended > 0) { /* We have written pages in an LRU flush. To make the insert buffer more efficient, we try to move these pages to the free list. */ mutex_exit(&(buf_pool->mutex)); buf_LRU_try_free_flushed_blocks(); } else { mutex_exit(&(buf_pool->mutex)); } if (n_iterations > 10) { os_thread_sleep(500000); } n_iterations++; goto loop; }
/********************************************************************//** Issues read requests for pages which recovery wants to read in. */ UNIV_INTERN void buf_read_recv_pages( /*================*/ ibool sync, /*!< in: TRUE if the caller wants this function to wait for the highest address page to get read in, before this function returns */ ulint space, /*!< in: space id */ ulint zip_size, /*!< in: compressed page size in bytes, or 0 */ const ulint* page_nos, /*!< in: array of page numbers to read, with the highest page number the last in the array */ ulint n_stored) /*!< in: number of page numbers in the array */ { ib_int64_t tablespace_version; ulint count; ulint err; ulint i; zip_size = fil_space_get_zip_size(space); if (UNIV_UNLIKELY(zip_size == ULINT_UNDEFINED)) { /* It is a single table tablespace and the .ibd file is missing: do nothing */ return; } tablespace_version = fil_space_get_version(space); for (i = 0; i < n_stored; i++) { buf_pool_t* buf_pool; count = 0; os_aio_print_debug = FALSE; buf_pool = buf_pool_get(space, page_nos[i]); while (buf_pool->n_pend_reads >= recv_n_pool_free_frames / 2) { os_aio_simulated_wake_handler_threads(); os_thread_sleep(10000); count++; if (count > 1000) { fprintf(stderr, "InnoDB: Error: InnoDB has waited for" " 10 seconds for pending\n" "InnoDB: reads to the buffer pool to" " be finished.\n" "InnoDB: Number of pending reads %lu," " pending pread calls %lu\n", (ulong) buf_pool->n_pend_reads, (ulong)os_file_n_pending_preads); os_aio_print_debug = TRUE; } } os_aio_print_debug = FALSE; if ((i + 1 == n_stored) && sync) { buf_read_page_low(&err, TRUE, BUF_READ_ANY_PAGE, space, zip_size, TRUE, tablespace_version, page_nos[i]); } else { buf_read_page_low(&err, FALSE, BUF_READ_ANY_PAGE | OS_AIO_SIMULATED_WAKE_LATER, space, zip_size, TRUE, tablespace_version, page_nos[i]); } } os_aio_simulated_wake_handler_threads(); /* Flush pages from the end of all the LRU lists if necessary */ buf_flush_free_margins(); #ifdef UNIV_DEBUG if (buf_debug_prints) { fprintf(stderr, "Recovery applies read-ahead pages %lu\n", (ulong) n_stored); } #endif /* UNIV_DEBUG */ }
void buf_read_recv_pages( /*================*/ ibool sync, /* in: TRUE if the caller wants this function to wait for the highest address page to get read in, before this function returns */ ulint space, /* in: space id */ ulint* page_nos, /* in: array of page numbers to read, with the highest page number the last in the array */ ulint n_stored) /* in: number of page numbers in the array */ { ib_longlong tablespace_version; ulint count; ulint err; ulint i; tablespace_version = fil_space_get_version(space); for (i = 0; i < n_stored; i++) { count = 0; os_aio_print_debug = FALSE; while (buf_pool->n_pend_reads >= recv_n_pool_free_frames / 2) { os_aio_simulated_wake_handler_threads(); os_thread_sleep(500000); count++; if (count > 100) { fprintf(stderr, "InnoDB: Error: InnoDB has waited for" " 50 seconds for pending\n" "InnoDB: reads to the buffer pool to" " be finished.\n" "InnoDB: Number of pending reads %lu," " pending pread calls %lu\n", (ulong) buf_pool->n_pend_reads, (ulong)os_file_n_pending_preads); os_aio_print_debug = TRUE; } } os_aio_print_debug = FALSE; if ((i + 1 == n_stored) && sync) { buf_read_page_low(&err, TRUE, BUF_READ_ANY_PAGE, space, tablespace_version, page_nos[i]); } else { buf_read_page_low(&err, FALSE, BUF_READ_ANY_PAGE | OS_AIO_SIMULATED_WAKE_LATER, space, tablespace_version, page_nos[i]); } } os_aio_simulated_wake_handler_threads(); /* Flush pages from the end of the LRU list if necessary */ buf_flush_free_margin(); #ifdef UNIV_DEBUG if (buf_debug_prints) { fprintf(stderr, "Recovery applies read-ahead pages %lu\n", (ulong) n_stored); } #endif /* UNIV_DEBUG */ }
ulint buf_read_ahead_linear( /*==================*/ /* out: number of page read requests issued */ ulint space, /* in: space id */ ulint offset) /* in: page number of a page; NOTE: the current thread must want access to this page (see NOTE 3 above) */ { ib_longlong tablespace_version; buf_block_t* block; buf_frame_t* frame; buf_block_t* pred_block = NULL; ulint pred_offset; ulint succ_offset; ulint count; int asc_or_desc; ulint new_offset; ulint fail_count; ulint ibuf_mode; ulint low, high; ulint err; ulint i; if (srv_startup_is_before_trx_rollback_phase) { /* No read-ahead to avoid thread deadlocks */ return(0); } if (ibuf_bitmap_page(offset) || trx_sys_hdr_page(space, offset)) { /* If it is an ibuf bitmap page or trx sys hdr, we do no read-ahead, as that could break the ibuf page access order */ return(0); } low = (offset / BUF_READ_AHEAD_LINEAR_AREA) * BUF_READ_AHEAD_LINEAR_AREA; high = (offset / BUF_READ_AHEAD_LINEAR_AREA + 1) * BUF_READ_AHEAD_LINEAR_AREA; if ((offset != low) && (offset != high - 1)) { /* This is not a border page of the area: return */ return(0); } /* Remember the tablespace version before we ask te tablespace size below: if DISCARD + IMPORT changes the actual .ibd file meanwhile, we do not try to read outside the bounds of the tablespace! */ tablespace_version = fil_space_get_version(space); mutex_enter(&(buf_pool->mutex)); if (high > fil_space_get_size(space)) { mutex_exit(&(buf_pool->mutex)); /* The area is not whole, return */ return(0); } if (buf_pool->n_pend_reads > buf_pool->curr_size / BUF_READ_AHEAD_PEND_LIMIT) { mutex_exit(&(buf_pool->mutex)); return(0); } /* Check that almost all pages in the area have been accessed; if offset == low, the accesses must be in a descending order, otherwise, in an ascending order. */ asc_or_desc = 1; if (offset == low) { asc_or_desc = -1; } fail_count = 0; for (i = low; i < high; i++) { block = buf_page_hash_get(space, i); if ((block == NULL) || !block->accessed) { /* Not accessed */ fail_count++; } else if (pred_block && (ut_ulint_cmp(block->LRU_position, pred_block->LRU_position) != asc_or_desc)) { /* Accesses not in the right order */ fail_count++; pred_block = block; } } if (fail_count > BUF_READ_AHEAD_LINEAR_AREA - BUF_READ_AHEAD_LINEAR_THRESHOLD) { /* Too many failures: return */ mutex_exit(&(buf_pool->mutex)); return(0); } /* If we got this far, we know that enough pages in the area have been accessed in the right order: linear read-ahead can be sensible */ block = buf_page_hash_get(space, offset); if (block == NULL) { mutex_exit(&(buf_pool->mutex)); return(0); } frame = block->frame; /* Read the natural predecessor and successor page addresses from the page; NOTE that because the calling thread may have an x-latch on the page, we do not acquire an s-latch on the page, this is to prevent deadlocks. Even if we read values which are nonsense, the algorithm will work. */ pred_offset = fil_page_get_prev(frame); succ_offset = fil_page_get_next(frame); mutex_exit(&(buf_pool->mutex)); if ((offset == low) && (succ_offset == offset + 1)) { /* This is ok, we can continue */ new_offset = pred_offset; } else if ((offset == high - 1) && (pred_offset == offset - 1)) { /* This is ok, we can continue */ new_offset = succ_offset; } else { /* Successor or predecessor not in the right order */ return(0); } low = (new_offset / BUF_READ_AHEAD_LINEAR_AREA) * BUF_READ_AHEAD_LINEAR_AREA; high = (new_offset / BUF_READ_AHEAD_LINEAR_AREA + 1) * BUF_READ_AHEAD_LINEAR_AREA; if ((new_offset != low) && (new_offset != high - 1)) { /* This is not a border page of the area: return */ return(0); } if (high > fil_space_get_size(space)) { /* The area is not whole, return */ return(0); } /* If we got this far, read-ahead can be sensible: do it */ if (ibuf_inside()) { ibuf_mode = BUF_READ_IBUF_PAGES_ONLY; } else { ibuf_mode = BUF_READ_ANY_PAGE; } count = 0; /* Since Windows XP seems to schedule the i/o handler thread very eagerly, and consequently it does not wait for the full read batch to be posted, we use special heuristics here */ os_aio_simulated_put_read_threads_to_sleep(); for (i = low; i < high; i++) { /* It is only sensible to do read-ahead in the non-sync aio mode: hence FALSE as the first parameter */ if (!ibuf_bitmap_page(i)) { count += buf_read_page_low( &err, FALSE, ibuf_mode | OS_AIO_SIMULATED_WAKE_LATER, space, tablespace_version, i); if (err == DB_TABLESPACE_DELETED) { ut_print_timestamp(stderr); fprintf(stderr, " InnoDB: Warning: in" " linear readahead trying to access\n" "InnoDB: tablespace %lu page %lu,\n" "InnoDB: but the tablespace does not" " exist or is just being dropped.\n", (ulong) space, (ulong) i); } } } /* In simulated aio we wake the aio handler threads only after queuing all aio requests, in native aio the following call does nothing: */ os_aio_simulated_wake_handler_threads(); /* Flush pages from the end of the LRU list if necessary */ buf_flush_free_margin(); #ifdef UNIV_DEBUG if (buf_debug_prints && (count > 0)) { fprintf(stderr, "LINEAR read-ahead space %lu offset %lu pages %lu\n", (ulong) space, (ulong) offset, (ulong) count); } #endif /* UNIV_DEBUG */ ++srv_read_ahead_seq; return(count); }
/************************************************************************ Applies a random read-ahead in buf_pool if there are at least a threshold value of accessed pages from the random read-ahead area. Does not read any page, not even the one at the position (space, offset), if the read-ahead mechanism is not activated. NOTE 1: the calling thread may own latches on pages: to avoid deadlocks this function must be written such that it cannot end up waiting for these latches! NOTE 2: the calling thread must want access to the page given: this rule is set to prevent unintended read-aheads performed by ibuf routines, a situation which could result in a deadlock if the OS does not support asynchronous i/o. */ static ulint buf_read_ahead_random( /*==================*/ /* out: number of page read requests issued; NOTE that if we read ibuf pages, it may happen that the page at the given page number does not get read even if we return a value > 0! */ ulint space, /* in: space id */ ulint offset) /* in: page number of a page which the current thread wants to access */ { ib_longlong tablespace_version; buf_block_t* block; ulint recent_blocks = 0; ulint count; ulint LRU_recent_limit; ulint ibuf_mode; ulint low, high; ulint err; ulint i; if (srv_startup_is_before_trx_rollback_phase) { /* No read-ahead to avoid thread deadlocks */ return(0); } if (ibuf_bitmap_page(offset) || trx_sys_hdr_page(space, offset)) { /* If it is an ibuf bitmap page or trx sys hdr, we do no read-ahead, as that could break the ibuf page access order */ return(0); } /* Remember the tablespace version before we ask te tablespace size below: if DISCARD + IMPORT changes the actual .ibd file meanwhile, we do not try to read outside the bounds of the tablespace! */ tablespace_version = fil_space_get_version(space); low = (offset / BUF_READ_AHEAD_RANDOM_AREA) * BUF_READ_AHEAD_RANDOM_AREA; high = (offset / BUF_READ_AHEAD_RANDOM_AREA + 1) * BUF_READ_AHEAD_RANDOM_AREA; if (high > fil_space_get_size(space)) { high = fil_space_get_size(space); } /* Get the minimum LRU_position field value for an initial segment of the LRU list, to determine which blocks have recently been added to the start of the list. */ LRU_recent_limit = buf_LRU_get_recent_limit(); mutex_enter(&(buf_pool->mutex)); if (buf_pool->n_pend_reads > buf_pool->curr_size / BUF_READ_AHEAD_PEND_LIMIT) { mutex_exit(&(buf_pool->mutex)); return(0); } /* Count how many blocks in the area have been recently accessed, that is, reside near the start of the LRU list. */ for (i = low; i < high; i++) { block = buf_page_hash_get(space, i); if ((block) && (block->LRU_position > LRU_recent_limit) && block->accessed) { recent_blocks++; } } mutex_exit(&(buf_pool->mutex)); if (recent_blocks < BUF_READ_AHEAD_RANDOM_THRESHOLD) { /* Do nothing */ return(0); } /* Read all the suitable blocks within the area */ if (ibuf_inside()) { ibuf_mode = BUF_READ_IBUF_PAGES_ONLY; } else { ibuf_mode = BUF_READ_ANY_PAGE; } count = 0; for (i = low; i < high; i++) { /* It is only sensible to do read-ahead in the non-sync aio mode: hence FALSE as the first parameter */ if (!ibuf_bitmap_page(i)) { count += buf_read_page_low( &err, FALSE, ibuf_mode | OS_AIO_SIMULATED_WAKE_LATER, space, tablespace_version, i); if (err == DB_TABLESPACE_DELETED) { ut_print_timestamp(stderr); fprintf(stderr, " InnoDB: Warning: in random" " readahead trying to access\n" "InnoDB: tablespace %lu page %lu,\n" "InnoDB: but the tablespace does not" " exist or is just being dropped.\n", (ulong) space, (ulong) i); } } } /* In simulated aio we wake the aio handler threads only after queuing all aio requests, in native aio the following call does nothing: */ os_aio_simulated_wake_handler_threads(); #ifdef UNIV_DEBUG if (buf_debug_prints && (count > 0)) { fprintf(stderr, "Random read-ahead space %lu offset %lu pages %lu\n", (ulong) space, (ulong) offset, (ulong) count); } #endif /* UNIV_DEBUG */ ++srv_read_ahead_rnd; return(count); }