int lwip_sock_read(void *conn, unsigned char *buff, unsigned long len)
{
    struct netbuf *new_buf = 0;
    unsigned char *data;
    int data_len = 0;
    int ret;

    SYSCALL_DEBUG(" SOCK read :%x len:%d \n", buff, len);

    mutexLock(g_netBH_lock);
    ret = netconn_recv(conn, &new_buf);
    mutexUnLock(g_netBH_lock);
    if (ret != ERR_OK) {
        SYSCALL_DEBUG(" Fail to recv data: %x newret:%x(%d) \n", ret, -ret, -ret);
        return 0;
    }

    netbuf_data(new_buf, &data, &data_len);
    SYSCALL_DEBUG(" SUCCESS to recv data:%d ret:%d\n", data_len, ret);
    if (data_len > 0) {
        ut_memcpy(buff, data, ut_min(data_len, len));
        ret = ut_min(data_len, len);
    } else {
        ret = 0;
    }

    mutexLock(g_netBH_lock);
    netbuf_delete(new_buf);
    mutexUnLock(g_netBH_lock);
    return ret;
}
int lwip_sock_read_from(void *conn, unsigned char *buff, unsigned long len,
        struct sockaddr *sockaddr, int addr_len)
{
    struct netbuf *new_buf = 0;
    unsigned char *data;
    int data_len = 0;
    int ret = 0;

    SYSCALL_DEBUG(" SOCK recvfrom :%x len:%d \n", buff, len);

    mutexLock(g_netBH_lock);
    ret = netconn_recv(conn, &new_buf);
    mutexUnLock(g_netBH_lock);
    if (ret == ERR_TIMEOUT) {
        if (g_current_task->killed == 1) {
            return 0;
        }
    }
    if (ret != ERR_OK) {
        SYSCALL_DEBUG(" Fail to recvfrom data: %x newret:%x(%d) \n", ret, -ret, -ret);
        return 0;
    }
    SYSCALL_DEBUG(" SUCCESS to recv data:%d \n", ret);

    netbuf_data(new_buf, &data, &data_len);
    if (data_len > 0) {
        if (sockaddr != 0) {
            sockaddr->addr = new_buf->addr.addr;
            sockaddr->sin_port = new_buf->port;
        }
        ut_memcpy(buff, data, ut_min(data_len, len));
        ret = ut_min(data_len, len);
    } else {
        ret = 0;
    }

    mutexLock(g_netBH_lock);
    netbuf_delete(new_buf);
    mutexUnLock(g_netBH_lock);
    return ret;
}
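/* Both read paths above share the same truncation rule: whatever arrives in
the netbuf is copied into the caller's buffer, capped at the caller's length,
and the number of bytes actually kept is returned. The standalone sketch below
illustrates just that rule with standard C; copy_capped() is a hypothetical
helper standing in for the ut_min()/ut_memcpy() pair, not part of the kernel
code. */

#include <stdio.h>
#include <string.h>

/* Copy at most dst_len bytes of the received payload and report how many
   were kept; bytes beyond dst_len are silently dropped, mirroring
   lwip_sock_read()/lwip_sock_read_from(). */
static size_t copy_capped(unsigned char *dst, size_t dst_len,
                          const unsigned char *src, size_t src_len)
{
    size_t n = (src_len < dst_len) ? src_len : dst_len;

    if (n > 0) {
        memcpy(dst, src, n);
    }
    return n;
}

int main(void)
{
    unsigned char datagram[8] = "ABCDEFG";  /* pretend netbuf payload */
    unsigned char small[4];

    size_t copied = copy_capped(small, sizeof(small), datagram, 7);
    printf("kept %zu of 7 bytes\n", copied); /* kept 4 of 7 bytes */
    return 0;
}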
/*************************************************************//**
Creates a hash table with at least n array cells. The actual number of
cells is chosen to be a prime number slightly bigger than n.
@return own: created table */
UNIV_INTERN
hash_table_t*
ha_create_func(
/*===========*/
    ulint n,            /*!< in: number of array cells */
#ifdef UNIV_SYNC_DEBUG
    ulint mutex_level,  /*!< in: level of the mutexes in the latching
                        order: this is used in the debug version */
#endif /* UNIV_SYNC_DEBUG */
    ulint n_mutexes)    /*!< in: number of mutexes to protect the hash
                        table: must be a power of 2, or 0 */
{
    hash_table_t*   table;
#ifndef UNIV_HOTBACKUP
    ulint           i;
#endif /* !UNIV_HOTBACKUP */

    ut_ad(ut_is_2pow(n_mutexes));
    table = hash_create(n);

#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
# ifndef UNIV_HOTBACKUP
    table->adaptive = TRUE;
# endif /* !UNIV_HOTBACKUP */
#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */

    /* Creating MEM_HEAP_BTR_SEARCH type heaps can potentially fail,
    but in practice it never should in this case, hence the asserts. */

    if (n_mutexes == 0) {
        table->heap = mem_heap_create_in_btr_search(
            ut_min(4096, MEM_MAX_ALLOC_IN_BUF));
        ut_a(table->heap);

        return(table);
    }

#ifndef UNIV_HOTBACKUP
    hash_create_mutexes(table, n_mutexes, mutex_level);

    table->heaps = mem_alloc(n_mutexes * sizeof(void*));

    for (i = 0; i < n_mutexes; i++) {
        table->heaps[i] = mem_heap_create_in_btr_search(4096);
        ut_a(table->heaps[i]);
    }
#endif /* !UNIV_HOTBACKUP */

    return(table);
}
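/* The power-of-2 requirement on n_mutexes exists so that a cell can be
mapped to its protecting mutex with a cheap mask instead of a division.
The sketch below shows that idea in isolation; mutex_no_for_cell() is a
hypothetical illustration, not InnoDB's actual mapping function. */

#include <assert.h>
#include <stdio.h>

/* Map a cell number to one of n_mutexes partitions, assuming n_mutexes
   is a nonzero power of 2 (same idea as the ut_is_2pow() assertion). */
static unsigned long mutex_no_for_cell(unsigned long cell,
                                       unsigned long n_mutexes)
{
    assert(n_mutexes != 0 && (n_mutexes & (n_mutexes - 1)) == 0);
    return cell & (n_mutexes - 1);
}

int main(void)
{
    printf("cell 37 -> mutex %lu of 8\n", mutex_no_for_cell(37, 8)); /* 5 */
    return 0;
}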
/**********************************************************************//**
Like ut_strlcpy, but if src doesn't fit in dst completely, copies the last
(size - 1) bytes of src, not the first.
@return strlen(src) */
UNIV_INTERN
ulint
ut_strlcpy_rev(
/*===========*/
    char*       dst,    /*!< in: destination buffer */
    const char* src,    /*!< in: source buffer */
    ulint       size)   /*!< in: size of destination buffer */
{
    ulint   src_size = strlen(src);

    if (size != 0) {
        ulint   n = ut_min(src_size, size - 1);

        memcpy(dst, src + src_size - n, n + 1);
    }

    return(src_size);
}
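/* A quick way to see the "keep the tail" behaviour is a standalone
re-implementation in plain C. strlcpy_rev_demo() below is written for this
illustration only; the real function is the one above. */

#include <stdio.h>
#include <string.h>

static size_t strlcpy_rev_demo(char *dst, const char *src, size_t size)
{
    size_t src_size = strlen(src);

    if (size != 0) {
        size_t n = (src_size < size - 1) ? src_size : size - 1;

        /* copy the LAST n bytes of src plus the terminating '\0' */
        memcpy(dst, src + src_size - n, n + 1);
    }
    return src_size;
}

int main(void)
{
    char buf[6];

    strlcpy_rev_demo(buf, "/long/path/file.ibd", sizeof(buf));
    printf("%s\n", buf);    /* prints "e.ibd": the tail, not the head */
    return 0;
}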
/*******************************************************************//**
Formats the raw data in "data" (in InnoDB on-disk format) that is of type
DATA_INT using "prtype" and writes the result to "buf". If the data is in
unknown format, then nothing is written to "buf", 0 is returned and
"format_in_hex" is set to TRUE, otherwise "format_in_hex" is left untouched.
Not more than "buf_size" bytes are written to "buf". The result is always
'\0'-terminated (provided buf_size > 0) and the number of bytes that were
written to "buf" is returned (including the terminating '\0').
@return number of bytes that were written */
static
ulint
row_raw_format_int(
/*===============*/
    const char* data,           /*!< in: raw data */
    ulint       data_len,       /*!< in: raw data length in bytes */
    ulint       prtype,         /*!< in: precise type */
    char*       buf,            /*!< out: output buffer */
    ulint       buf_size,       /*!< in: output buffer size in bytes */
    ibool*      format_in_hex)  /*!< out: should the data be
                                formatted in hex */
{
    ulint   ret;

    if (data_len <= sizeof(ullint)) {

        ullint  value;
        ibool   unsigned_type = prtype & DATA_UNSIGNED;

        value = mach_read_int_type(
            (const byte*) data, data_len, unsigned_type);

        if (unsigned_type) {

            ret = ut_snprintf(buf, buf_size, "%llu", value) + 1;

        } else {

            ret = ut_snprintf(buf, buf_size, "%lld",
                              (long long) value) + 1;
        }

    } else {

        *format_in_hex = TRUE;
        ret = 0;
    }

    return(ut_min(ret, buf_size));
}
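/* The return-value convention above (printf result plus one for the NUL,
then capped at the buffer size) is easy to misread. The sketch below shows
it with standard snprintf(); all names are local to the sketch, not InnoDB
symbols. */

#include <stdio.h>

static unsigned long format_int_demo(long long value, char *buf,
                                     unsigned long buf_size)
{
    unsigned long ret;

    /* snprintf() reports the length it *wanted* to write, which exceeds
       buf_size when the output was truncated; +1 accounts for the NUL. */
    ret = (unsigned long) snprintf(buf, buf_size, "%lld", value) + 1;

    /* equivalent of ut_min(ret, buf_size) */
    return (ret < buf_size) ? ret : buf_size;
}

int main(void)
{
    char small[4];

    printf("wrote %lu bytes: \"%s\"\n",
           format_int_demo(123456, small, sizeof(small)), small);
    /* wrote 4 bytes: "123" */
    return 0;
}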
/*************************************************************//**
Print a dfield value using ut_print_buf. */
static
void
dfield_print_raw(
/*=============*/
    FILE*           f,      /*!< in: output stream */
    const dfield_t* dfield) /*!< in: dfield */
{
    ulint   len = dfield_get_len(dfield);

    if (!dfield_is_null(dfield)) {
        ulint   print_len = ut_min(len, 1000);

        ut_print_buf(f, dfield_get_data(dfield), print_len);

        if (len != print_len) {
            fprintf(f, "(total %lu bytes%s)",
                    (ulong) len,
                    dfield_is_ext(dfield) ? ", external" : "");
        }
    } else {
        fputs(" SQL NULL", f);
    }
}
/********************************************************************//**
Applies linear read-ahead if in the buf_pool the page is a border page of
a linear read-ahead area and all the pages in the area have been accessed.
Does not read any page if the read-ahead mechanism is not activated. Note
that the algorithm looks at the 'natural' adjacent successor and
predecessor of the page, which on the leaf level of a B-tree are the next
and previous page in the chain of leaves. To know these, the page specified
in (space, offset) must already be present in the buf_pool. Thus, the
natural way to use this function is to call it when a page in the buf_pool
is accessed the first time, calling this function just after it has been
bufferfixed.
NOTE 1: as this function looks at the natural predecessor and successor
fields on the page, what happens, if these are not initialized to any
sensible value? No problem, before applying read-ahead we check that the
area to read is within the span of the space, if not, read-ahead is not
applied. An uninitialized value may result in a useless read operation, but
only very improbably.
NOTE 2: the calling thread may own latches on pages: to avoid deadlocks this
function must be written such that it cannot end up waiting for these
latches!
NOTE 3: the calling thread must want access to the page given: this rule is
set to prevent unintended read-aheads performed by ibuf routines, a situation
which could result in a deadlock if the OS does not support asynchronous io.
@return number of page read requests issued */
UNIV_INTERN
ulint
buf_read_ahead_linear(
/*==================*/
    ulint   space,      /*!< in: space id */
    ulint   zip_size,   /*!< in: compressed page size in bytes, or 0 */
    ulint   offset,     /*!< in: page number; see NOTE 3 above */
    ibool   inside_ibuf,/*!< in: TRUE if we are inside ibuf routine */
    trx_t*  trx)
{
    buf_pool_t* buf_pool = buf_pool_get(space, offset);
    ib_int64_t  tablespace_version;
    buf_page_t* bpage;
    buf_frame_t*    frame;
    buf_page_t* pred_bpage = NULL;
    ulint       pred_offset;
    ulint       succ_offset;
    ulint       count;
    int         asc_or_desc;
    ulint       new_offset;
    ulint       fail_count;
    ulint       ibuf_mode;
    ulint       low, high;
    ulint       err;
    ulint       i;
    const ulint buf_read_ahead_linear_area
        = BUF_READ_AHEAD_AREA(buf_pool);
    ulint       threshold;

    if (!(srv_read_ahead & 2)) {
        return(0);
    }

    if (UNIV_UNLIKELY(srv_startup_is_before_trx_rollback_phase)) {
        /* No read-ahead to avoid thread deadlocks */
        return(0);
    }

    low = (offset / buf_read_ahead_linear_area)
        * buf_read_ahead_linear_area;
    high = (offset / buf_read_ahead_linear_area + 1)
        * buf_read_ahead_linear_area;

    if ((offset != low) && (offset != high - 1)) {
        /* This is not a border page of the area: return */

        return(0);
    }

    if (ibuf_bitmap_page(zip_size, offset)
        || trx_sys_hdr_page(space, offset)) {

        /* If it is an ibuf bitmap page or trx sys hdr, we do
        no read-ahead, as that could break the ibuf page access
        order */

        return(0);
    }

    /* Remember the tablespace version before we ask the tablespace size
    below: if DISCARD + IMPORT changes the actual .ibd file meanwhile, we
    do not try to read outside the bounds of the tablespace! */

    tablespace_version = fil_space_get_version(space);

    buf_pool_mutex_enter(buf_pool);

    if (high > fil_space_get_size(space)) {
        buf_pool_mutex_exit(buf_pool);
        /* The area is not whole, return */

        return(0);
    }

    if (buf_pool->n_pend_reads
        > buf_pool->curr_size / BUF_READ_AHEAD_PEND_LIMIT) {
        buf_pool_mutex_exit(buf_pool);

        return(0);
    }
    buf_pool_mutex_exit(buf_pool);

    /* Check that almost all pages in the area have been accessed; if
    offset == low, the accesses must be in a descending order, otherwise,
    in an ascending order. */

    asc_or_desc = 1;

    if (offset == low) {
        asc_or_desc = -1;
    }

    /* How many out of order accessed pages can we ignore
    when working out the access pattern for linear readahead */
    threshold = ut_min((64 - srv_read_ahead_threshold),
                       BUF_READ_AHEAD_AREA(buf_pool));

    fail_count = 0;

    rw_lock_s_lock(&buf_pool->page_hash_latch);
    for (i = low; i < high; i++) {
        bpage = buf_page_hash_get(buf_pool, space, i);

        if (bpage == NULL || !buf_page_is_accessed(bpage)) {
            /* Not accessed */
            fail_count++;

        } else if (pred_bpage) {
            /* Note that buf_page_is_accessed() returns
            the time of the first access. If some blocks
            of the extent existed in the buffer pool at
            the time of a linear access pattern, the first
            access times may be nonmonotonic, even though
            the latest access times were linear. The
            threshold (srv_read_ahead_factor) should help
            a little against this. */
            int res = ut_ulint_cmp(
                buf_page_is_accessed(bpage),
                buf_page_is_accessed(pred_bpage));
            /* Accesses not in the right order */
            if (res != 0 && res != asc_or_desc) {
                fail_count++;
            }
        }

        if (fail_count > threshold) {
            /* Too many failures: return */
            //buf_pool_mutex_exit(buf_pool);
            rw_lock_s_unlock(&buf_pool->page_hash_latch);
            return(0);
        }

        if (bpage && buf_page_is_accessed(bpage)) {
            pred_bpage = bpage;
        }
    }

    /* If we got this far, we know that enough pages in the area have
    been accessed in the right order: linear read-ahead can be sensible */

    bpage = buf_page_hash_get(buf_pool, space, offset);

    if (bpage == NULL) {
        //buf_pool_mutex_exit(buf_pool);
        rw_lock_s_unlock(&buf_pool->page_hash_latch);

        return(0);
    }

    switch (buf_page_get_state(bpage)) {
    case BUF_BLOCK_ZIP_PAGE:
        frame = bpage->zip.data;
        break;
    case BUF_BLOCK_FILE_PAGE:
        frame = ((buf_block_t*) bpage)->frame;
        break;
    default:
        ut_error;
        break;
    }

    /* Read the natural predecessor and successor page addresses from
    the page; NOTE that because the calling thread may have an x-latch
    on the page, we do not acquire an s-latch on the page, this is to
    prevent deadlocks. Even if we read values which are nonsense, the
    algorithm will work. */

    pred_offset = fil_page_get_prev(frame);
    succ_offset = fil_page_get_next(frame);

    //buf_pool_mutex_exit(buf_pool);
    rw_lock_s_unlock(&buf_pool->page_hash_latch);

    if ((offset == low) && (succ_offset == offset + 1)) {

        /* This is ok, we can continue */
        new_offset = pred_offset;

    } else if ((offset == high - 1) && (pred_offset == offset - 1)) {

        /* This is ok, we can continue */
        new_offset = succ_offset;
    } else {
        /* Successor or predecessor not in the right order */

        return(0);
    }

    low = (new_offset / buf_read_ahead_linear_area)
        * buf_read_ahead_linear_area;
    high = (new_offset / buf_read_ahead_linear_area + 1)
        * buf_read_ahead_linear_area;

    if ((new_offset != low) && (new_offset != high - 1)) {
        /* This is not a border page of the area: return */

        return(0);
    }

    if (high > fil_space_get_size(space)) {
        /* The area is not whole, return */

        return(0);
    }

    /* If we got this far, read-ahead can be sensible: do it */

    ibuf_mode = inside_ibuf
        ? BUF_READ_IBUF_PAGES_ONLY | OS_AIO_SIMULATED_WAKE_LATER
        : BUF_READ_ANY_PAGE | OS_AIO_SIMULATED_WAKE_LATER;

    count = 0;

    /* Since Windows XP seems to schedule the i/o handler thread
    very eagerly, and consequently it does not wait for the
    full read batch to be posted, we use special heuristics here */

    os_aio_simulated_put_read_threads_to_sleep();

    for (i = low; i < high; i++) {
        /* It is only sensible to do read-ahead in the non-sync
        aio mode: hence FALSE as the first parameter */

        if (!ibuf_bitmap_page(zip_size, i)) {
            count += buf_read_page_low(
                &err, FALSE, ibuf_mode,
                space, zip_size, FALSE, tablespace_version,
                i, trx);
            if (err == DB_TABLESPACE_DELETED) {
                ut_print_timestamp(stderr);
                fprintf(stderr,
                    " InnoDB: Warning: in"
                    " linear readahead trying to access\n"
                    "InnoDB: tablespace %lu page %lu,\n"
                    "InnoDB: but the tablespace does not"
                    " exist or is just being dropped.\n",
                    (ulong) space, (ulong) i);
            }
        }
    }

    /* In simulated aio we wake the aio handler threads only after
    queuing all aio requests, in native aio the following call does
    nothing: */

    os_aio_simulated_wake_handler_threads();

    /* Flush pages from the end of the LRU list if necessary */
    buf_flush_free_margin(buf_pool, TRUE);

#ifdef UNIV_DEBUG
    if (buf_debug_prints && (count > 0)) {
        fprintf(stderr,
            "LINEAR read-ahead space %lu offset %lu pages %lu\n",
            (ulong) space, (ulong) offset, (ulong) count);
    }
#endif /* UNIV_DEBUG */

    /* Read ahead is considered one I/O operation for the purpose of
    LRU policy decision. */
    buf_LRU_stat_inc_io();

    buf_pool->stat.n_ra_pages_read += count;
    return(count);
}
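/* The low/high "border page" arithmetic used twice above is the heart of the
trigger condition. The standalone sketch below reproduces it with a fixed
64-page area; in the real code BUF_READ_AHEAD_AREA(buf_pool) is computed at
runtime, so the constant here is only an assumption for illustration. */

#include <stdio.h>

enum { READ_AHEAD_AREA = 64 };

/* A page triggers linear read-ahead only if it is the first or the last
   page of its aligned read-ahead area. */
static int is_border_page(unsigned long offset)
{
    unsigned long low  = (offset / READ_AHEAD_AREA) * READ_AHEAD_AREA;
    unsigned long high = low + READ_AHEAD_AREA;

    return offset == low || offset == high - 1;
}

int main(void)
{
    printf("page 128: %d\n", is_border_page(128)); /* 1: first page of area */
    printf("page 191: %d\n", is_border_page(191)); /* 1: last page of area  */
    printf("page 150: %d\n", is_border_page(150)); /* 0: interior page      */
    return 0;
}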
big_rec_t*
dtuple_convert_big_rec(
/*===================*/
                /* out, own: created big record vector,
                NULL if we are not able to shorten
                the entry enough, i.e., if there are
                too many short fields in entry */
    dict_index_t*   index,  /* in: index */
    dtuple_t*   entry,  /* in: index entry */
    ulint*      ext_vec,/* in: array of externally stored fields,
                or NULL: if a field already is externally
                stored, then we cannot move it to the vector
                this function returns */
    ulint       n_ext_vec)/* in: number of elements in ext_vec */
{
    mem_heap_t* heap;
    big_rec_t*  vector;
    dfield_t*   dfield;
    ulint       size;
    ulint       n_fields;
    ulint       longest;
    ulint       longest_i = ULINT_MAX;
    ibool       is_externally_stored;
    ulint       i;
    ulint       j;

    ut_a(dtuple_check_typed_no_assert(entry));

    size = rec_get_converted_size(index, entry);

    if (UNIV_UNLIKELY(size > 1000000000)) {
        fprintf(stderr,
            "InnoDB: Warning: tuple size very big: %lu\n",
            (ulong) size);
        fputs("InnoDB: Tuple contents: ", stderr);
        dtuple_print(stderr, entry);
        putc('\n', stderr);
    }

    heap = mem_heap_create(size + dtuple_get_n_fields(entry)
                           * sizeof(big_rec_field_t) + 1000);

    vector = mem_heap_alloc(heap, sizeof(big_rec_t));

    vector->heap = heap;
    vector->fields = mem_heap_alloc(heap, dtuple_get_n_fields(entry)
                                    * sizeof(big_rec_field_t));

    /* Decide which fields to shorten: the algorithm is to look for
    the longest field whose type is DATA_BLOB */

    n_fields = 0;

    while (rec_get_converted_size(index, entry)
           >= ut_min(page_get_free_space_of_empty(
                         index->table->comp) / 2,
                     REC_MAX_DATA_SIZE)) {

        longest = 0;
        for (i = dict_index_get_n_unique_in_tree(index);
             i < dtuple_get_n_fields(entry); i++) {

            /* Skip over fields which already are externally
            stored */

            is_externally_stored = FALSE;

            if (ext_vec) {
                for (j = 0; j < n_ext_vec; j++) {

                    if (ext_vec[j] == i) {
                        is_externally_stored = TRUE;
                    }
                }
            }

            if (!is_externally_stored) {

                dfield = dtuple_get_nth_field(entry, i);

                if (dfield->len != UNIV_SQL_NULL
                    && dfield->len > longest) {

                    longest = dfield->len;
                    longest_i = i;
                }
            }
        }

        /* We do not store externally fields which are smaller than
        DICT_MAX_INDEX_COL_LEN */

        ut_a(DICT_MAX_INDEX_COL_LEN > REC_1BYTE_OFFS_LIMIT);

        if (longest < BTR_EXTERN_FIELD_REF_SIZE + 10
            + DICT_MAX_INDEX_COL_LEN) {

            /* Cannot shorten more */

            mem_heap_free(heap);

            return(NULL);
        }

        /* Move data from field longest_i to big rec vector;
        we do not let data size of the remaining entry
        drop below 128 which is the limit for the 2-byte
        offset storage format in a physical record. This
        we accomplish by storing 128 bytes of data in entry
        itself, and only the remaining part to big rec vec.

        We store the first bytes locally to the record. Then
        we can calculate all ordering fields in all indexes
        from locally stored data. */

        dfield = dtuple_get_nth_field(entry, longest_i);
        vector->fields[n_fields].field_no = longest_i;

        ut_a(dfield->len > DICT_MAX_INDEX_COL_LEN);

        vector->fields[n_fields].len = dfield->len
            - DICT_MAX_INDEX_COL_LEN;

        vector->fields[n_fields].data
            = mem_heap_alloc(heap, vector->fields[n_fields].len);

        /* Copy data (from the end of field) to big rec vector */

        ut_memcpy(vector->fields[n_fields].data,
                  ((byte*) dfield->data) + dfield->len
                  - vector->fields[n_fields].len,
                  vector->fields[n_fields].len);
        dfield->len = dfield->len - vector->fields[n_fields].len
            + BTR_EXTERN_FIELD_REF_SIZE;

        /* Set the extern field reference in dfield to zero */
        memset(((byte*) dfield->data)
               + dfield->len - BTR_EXTERN_FIELD_REF_SIZE,
               0, BTR_EXTERN_FIELD_REF_SIZE);
        n_fields++;
    }

    vector->n_fields = n_fields;
    return(vector);
}
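/* The shortening loop above is easier to follow in a toy model: keep picking
the longest remaining field, pretend to move everything beyond a fixed local
prefix to external storage, and stop when the record fits or nothing else can
be shortened. The field lengths, LOCAL_PREFIX and LIMIT below are invented for
the illustration and are not InnoDB constants. */

#include <stdio.h>

enum { LOCAL_PREFIX = 128, LIMIT = 400 };

int main(void)
{
    unsigned long len[4] = { 50, 900, 300, 700 };
    unsigned long total = 50 + 900 + 300 + 700;
    int moved = 1;

    while (total > LIMIT && moved) {
        int i, longest_i = -1;
        unsigned long longest = LOCAL_PREFIX; /* cannot shorten below this */

        moved = 0;
        for (i = 0; i < 4; i++) {
            if (len[i] > longest) {
                longest = len[i];
                longest_i = i;
            }
        }
        if (longest_i >= 0) {
            total -= len[longest_i] - LOCAL_PREFIX; /* tail goes to big rec */
            len[longest_i] = LOCAL_PREFIX;
            moved = 1;
        }
    }

    /* here total is 434, still above LIMIT, and no field can be shortened
       further: this is the case where the real function returns NULL */
    printf("remaining in-record size: %lu\n", total);
    return 0;
}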
/*******************************************************************//**
Formats the raw data in "data" (in InnoDB on-disk format) using
"dict_field" and writes the result to "buf".
Not more than "buf_size" bytes are written to "buf".
The result is always NUL-terminated (provided buf_size is positive) and the
number of bytes that were written to "buf" is returned (including the
terminating NUL).
@return number of bytes that were written */
UNIV_INTERN
ulint
row_raw_format(
/*===========*/
    const char*         data,       /*!< in: raw data */
    ulint               data_len,   /*!< in: raw data length in bytes */
    const dict_field_t* dict_field, /*!< in: index field */
    char*               buf,        /*!< out: output buffer */
    ulint               buf_size)   /*!< in: output buffer size
                                    in bytes */
{
    ulint   mtype;
    ulint   prtype;
    ulint   ret;
    ibool   format_in_hex;

    if (buf_size == 0) {

        return(0);
    }

    if (data_len == UNIV_SQL_NULL) {

        ret = ut_snprintf((char*) buf, buf_size, "NULL") + 1;

        return(ut_min(ret, buf_size));
    }

    mtype = dict_field->col->mtype;
    prtype = dict_field->col->prtype;

    format_in_hex = FALSE;

    switch (mtype) {
    case DATA_INT:

        ret = row_raw_format_int(data, data_len, prtype,
                                 buf, buf_size, &format_in_hex);
        if (format_in_hex) {

            goto format_in_hex;
        }
        break;
    case DATA_CHAR:
    case DATA_VARCHAR:
    case DATA_MYSQL:
    case DATA_VARMYSQL:

        ret = row_raw_format_str(data, data_len, prtype,
                                 buf, buf_size, &format_in_hex);
        if (format_in_hex) {

            goto format_in_hex;
        }

        break;
    /* XXX support more data types */
    default:
    format_in_hex:
        if (UNIV_LIKELY(buf_size > 2)) {

            memcpy(buf, "0x", 2);
            buf += 2;
            buf_size -= 2;
            ret = 2 + ut_raw_to_hex(data, data_len,
                                    buf, buf_size);
        } else {

            buf[0] = '\0';
            ret = 1;
        }
    }

    return(ret);
}
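/* The hex fallback path ("0x" prefix, then as many hex digit pairs as fit,
always NUL-terminated) can be sketched in plain C. raw_to_hex_demo() below is
a hypothetical stand-in written for this illustration, not the real
ut_raw_to_hex(). */

#include <stdio.h>

static unsigned long raw_to_hex_demo(const unsigned char *data,
                                     unsigned long data_len,
                                     char *buf, unsigned long buf_size)
{
    static const char digits[] = "0123456789ABCDEF";
    unsigned long i, out = 0;

    /* emit whole byte pairs only while there is room for them plus '\0' */
    for (i = 0; i < data_len && out + 2 < buf_size; i++) {
        buf[out++] = digits[data[i] >> 4];
        buf[out++] = digits[data[i] & 0x0F];
    }
    buf[out++] = '\0';
    return out;             /* bytes written, including the '\0' */
}

int main(void)
{
    const unsigned char data[] = { 0xDE, 0xAD, 0xBE, 0xEF };
    char buf[16] = "0x";

    raw_to_hex_demo(data, sizeof(data), buf + 2, sizeof(buf) - 2);
    printf("%s\n", buf);    /* 0xDEADBEEF */
    return 0;
}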
/********************************************************************//**
Flush pages from flash cache.
@return number of pages that have been flushed to the tablespace */
UNIV_INTERN
ulint
fc_flush_to_disk(
/*==================*/
    ibool   do_full_io) /*!< in: whether do full io capacity */
{
    ulint   distance;
    byte*   page;
    ulint   ret;
    ulint   space;
    ulint   offset;
    ulint   page_type;
    ulint   i, j;
    ulint   pos;
    ulint   zip_size;
    ulint   block_offset, byte_offset;
    ulint   fc_size = fc_get_size();
    ulint   fc_blk_size = fc_get_block_size_byte();
    ulint   start_offset;
    ulint   data_size;
    fc_block_t* flush_block = NULL;
    ulint   c_flush = 0;

    ut_ad(!mutex_own(&fc->mutex));
    ut_a(fc->flush_buf->free_pos == 0);

    /* step 1: get the number of blocks need to flush to tablespace */
    flash_cache_mutex_enter();

    distance = fc_get_distance();
    start_offset = fc->flush_off;

    if (distance == 0) {
        flash_cache_mutex_exit();
        return 0;
    } else if (recv_recovery_on) {
        if (distance < ((1.0 * srv_flash_cache_write_cache_pct / 100) * fc_size)) {
            fc->n_flush_cur = 0;
        } else if (distance < ((1.0 * srv_flash_cache_do_full_io_pct / 100) * fc_size)) {
            fc->n_flush_cur = ut_min(PCT_IO_FC(10), distance);
        } else {
            fc->n_flush_cur = ut_min(PCT_IO_FC(100), distance);
        }
    } else if (distance < ((1.0 * srv_flash_cache_write_cache_pct / 100) * fc_size)
               && !do_full_io) {
        flash_cache_mutex_exit();
        return 0;
    } else if (distance < ((1.0 * srv_flash_cache_do_full_io_pct / 100) * fc_size)
               && !do_full_io) {
        fc->n_flush_cur = PCT_IO_FC(srv_fc_write_cache_flush_pct);
    } else {
        ut_ad((distance > (1.0 * srv_flash_cache_do_full_io_pct / 100) * fc_size)
              || do_full_io);
        fc->n_flush_cur = ut_min(PCT_IO_FC(srv_fc_full_flush_pct), distance);
    }

    flash_cache_mutex_exit();

    /* step 2: start to flush blocks using async io, set block io_fix
    IO_FIX_FLUSH */
    i = 0;
    while (i < fc->n_flush_cur) {
        ulint   b_space;
        ulint   b_offset;
        ulint   raw_zip_size;
        ulint   size;
        ulint   fil_offset;
#ifdef UNIV_FLASH_CACHE_TRACE
        ulint   is_v4_blk;
#endif
        byte*   page_io;

        flash_cache_mutex_enter();
        pos = (start_offset + i) % fc_size;
        flush_block = fc_get_block(pos);

        if (flush_block == NULL) {
            i++;
            flash_cache_mutex_exit();
            continue;
        }

        /* we should get the mutex, as doublewrite may hit this block
        and invalidate the block */
        flash_block_mutex_enter(flush_block->fil_offset);
        flash_cache_mutex_exit();

        data_size = fc_block_get_data_size(flush_block);

        if (flush_block->state != BLOCK_READY_FOR_FLUSH) {
            /* if readonly or merge write or already flushed */
            ut_a(flush_block->state == BLOCK_NOT_USED
                 || flush_block->state == BLOCK_READ_CACHE
                 || flush_block->state == BLOCK_FLUSHED);

            i += data_size;

            flash_block_mutex_exit(flush_block->fil_offset);
            if (flush_block->state == BLOCK_NOT_USED) {
                //fc_block_detach(FALSE, flush_block);
                fc_block_free(flush_block);
            }

            continue;
        }

        zip_size = fil_space_get_zip_size(flush_block->space);
        if (zip_size == ULINT_UNDEFINED) {
            /* table has been dropped, just set it BLOCK_FLUSHED */
#ifdef UNIV_FLASH_CACHE_TRACE
            ut_print_timestamp(fc->f_debug);
            fprintf(fc->f_debug,
                "space:%lu is dropped, the page(%lu, %lu)"
                " does not need to be flushed.\n",
                (ulong) flush_block->space,
                (ulong) flush_block->space,
                (ulong) flush_block->offset);
#endif
            flush_block->state = BLOCK_FLUSHED;
            i += data_size;
            c_flush += data_size;

            flash_block_mutex_exit(flush_block->fil_offset);
            continue;
        }

#ifdef UNIV_FLASH_CACHE_TRACE
        if (flush_block->state != BLOCK_READY_FOR_FLUSH) {
            fc_block_print(flush_block);
            ut_error;
        }
#endif

        flush_block->io_fix |= IO_FIX_FLUSH;

        /*
         * we should set block state BLOCK_FLUSHED; if not, doublewrite
         * may hit this block, invalidate it and reduce the dirty count,
         * but when we finish the flush we will reduce the dirty count
         * too, so it may be reduced twice.
         */
        flush_block->state = BLOCK_FLUSHED;

        /* save the block info, as the block may be invalidated by
        doublewrite after we release the mutex */
        b_space = flush_block->space;
        b_offset = flush_block->offset;

        raw_zip_size = flush_block->raw_zip_size;
        size = flush_block->size;
        fil_offset = flush_block->fil_offset;
#ifdef UNIV_FLASH_CACHE_TRACE
        is_v4_blk = flush_block->is_v4_blk;
#endif
        /* release the block now, so reads can hit in this block and
        read the data */
        flash_block_mutex_exit(flush_block->fil_offset);

        /*
         * Only the flush thread will update read_buf and flush_off/round.
         * As there is only a single flush thread, no need to lock read_buf.
         */
        page = fc->flush_buf->buf + fc->flush_buf->free_pos * fc_blk_size;

        if (raw_zip_size > 0) {
            ut_a((size * fc_blk_size) == UNIV_PAGE_SIZE);
            page_io = fc->flush_zip_read_buf;
        } else {
            page_io = page;
        }

        fc_io_offset(fil_offset, &block_offset, &byte_offset);
        ret = fil_io(OS_FILE_READ, TRUE, FLASH_CACHE_SPACE, 0,
                     block_offset, byte_offset,
                     data_size * fc_blk_size,
                     page_io, NULL);

        if (ret != DB_SUCCESS) {
            ut_print_timestamp(stderr);
            fprintf(stderr,
                " InnoDB: Flash cache [Error]: unable to read page"
                " from flash cache.\n"
                "flash cache flush offset is:%lu.\n",
                (ulong) (start_offset + i));
            ut_error;
        }

        if ((flush_block != NULL)
            && (flush_block->state == BLOCK_NOT_USED)) {
            goto skip;
        }

        /* decompress the compressed data */
        if (raw_zip_size > 0) {
#ifdef UNIV_FLASH_CACHE_TRACE
            ulint   blk_zip_size_byte;

            if (is_v4_blk) {
                blk_zip_size_byte
                    = raw_zip_size * fc_get_block_size_byte();
            } else {
                blk_zip_size_byte
                    = fc_block_compress_align(raw_zip_size)
                    * fc_get_block_size_byte();
                ut_a((ulint) mach_read_from_4(page_io + FC_ZIP_PAGE_ZIP_RAW_SIZE)
                     == raw_zip_size);
            }

            ut_a(page_io);
            ut_a(page);
            ut_a((ulint) mach_read_from_4(page_io + FC_ZIP_PAGE_HEADER)
                 == FC_ZIP_PAGE_CHECKSUM);
            ut_a((ulint) mach_read_from_4(page_io + blk_zip_size_byte
                                          - FC_ZIP_PAGE_TAILER)
                 == FC_ZIP_PAGE_CHECKSUM);
            ut_a((ulint) mach_read_from_4(page_io + FC_ZIP_PAGE_SIZE)
                 == blk_zip_size_byte);
            ut_a((ulint) mach_read_from_4(page_io + FC_ZIP_PAGE_ORIG_SIZE)
                 == UNIV_PAGE_SIZE);
            ut_a((ulint) mach_read_from_4(page_io + FC_ZIP_PAGE_SPACE)
                 == b_space);
            ut_a((ulint) mach_read_from_4(page_io + FC_ZIP_PAGE_OFFSET)
                 == b_offset);

            /* only qlz can do this check */
            if (srv_flash_cache_compress_algorithm == FC_BLOCK_COMPRESS_QUICKLZ) {
                if (is_v4_blk) {
                    ut_a(raw_zip_size * fc_get_block_size_byte()
                         >= (ulint) fc_qlz_size_compressed(
                            (const char*) (page_io + FC_ZIP_PAGE_DATA)));
                } else {
                    ut_a(raw_zip_size
                         == (ulint) fc_qlz_size_compressed(
                            (const char*) (page_io + FC_ZIP_PAGE_DATA)));
                }

                ut_a(UNIV_PAGE_SIZE == fc_qlz_size_decompressed(
                        (const char*) (page_io + FC_ZIP_PAGE_DATA)));
            }
#endif
            fc_block_do_decompress(DECOMPRESS_FLUSH, page_io, raw_zip_size, page);
        }

        space = mach_read_from_4(page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
        offset = mach_read_from_4(page + FIL_PAGE_OFFSET);

        if ((space != b_space) || (offset != b_offset)) {
            ut_print_timestamp(stderr);
            fc_block_print(flush_block);
            ut_error;
        }

        if (buf_page_is_corrupted(page, zip_size)) {
            buf_page_print(page, zip_size, BUF_PAGE_PRINT_NO_CRASH);
            ut_error;
        }

        page_type = fil_page_get_type(page);
        if (page_type == FIL_PAGE_INDEX) {
            page_type = 1;
        }
        srv_flash_cache_flush_detail[page_type]++;

        ret = fil_io(OS_FILE_WRITE | OS_AIO_SIMULATED_WAKE_LATER,
                     FALSE, space, zip_size, offset, 0,
                     zip_size ? zip_size : UNIV_PAGE_SIZE,
                     page, NULL);
        if (ret != DB_SUCCESS && ret != DB_TABLESPACE_DELETED) {
            ut_print_timestamp(stderr);
            fc_block_print(flush_block);
            ut_error;
        }

        /* add UNIV_PAGE_SIZE / fc_blk_size for safety */
        fc->flush_buf->free_pos += UNIV_PAGE_SIZE / fc_blk_size;

skip:
        i += data_size;
        c_flush += data_size;

        if ((fc->flush_buf->free_pos + UNIV_PAGE_SIZE / fc_blk_size)
            >= fc->flush_buf->size) {
            /* FIXME: is it safe to change n_flush,
            as step 3 will use n_flush */
            fc->n_flush_cur = i;
            break;
        }
    }

    /* ok, now flush all async io to disk */
    fc_flush_sync_dbfile();

    /* step 3: all the flush blocks have been synced to disk, update the
    state and io_fix */
    j = 0;
    while (j < fc->n_flush_cur) {

        flash_cache_mutex_enter();
        pos = (start_offset + j) % fc_size;
        flush_block = fc_get_block(pos);

        if (flush_block == NULL) {
            j++;
            flash_cache_mutex_exit();
            continue;
        }

        /* block state and io_fix may be changed by doublewrite and
        lru move */
        flash_block_mutex_enter(flush_block->fil_offset);
        flash_cache_mutex_exit();
        if (flush_block->io_fix & IO_FIX_FLUSH) {
            /* the block is already in BLOCK_FLUSHED state */
            flush_block->io_fix &= ~IO_FIX_FLUSH;
        }

        data_size = fc_block_get_data_size(flush_block);
        flash_block_mutex_exit(flush_block->fil_offset);

        j += data_size;
    }

    /*
     * i and j may be different, as the last flushed block may be
     * invalidated by doublewrite, so possibly i > j
     */

    /* add the actual flushed blocks */
    srv_flash_cache_flush = srv_flash_cache_flush + c_flush;

    /* step 4: update fc status and flush_off, and wake up threads that
    are sleeping for space */
    if (i > 0) {
        ut_a(i >= c_flush);

        flash_cache_mutex_enter();

        /*
         * it is safe to inc flush_off and sub dirty blocks at this
         * time, as fc_validate is not running
         */
        fc_inc_flush_off(i);
        flash_cache_log_mutex_enter();
        fc_log->current_stat->flush_offset = fc->flush_off;
        fc_log->current_stat->flush_round = fc->flush_round;
        flash_cache_log_mutex_exit();

        ut_a(srv_flash_cache_dirty >= c_flush);
        srv_flash_cache_dirty -= c_flush;

        srv_fc_flush_should_commit_log_flush++;
        os_event_set(fc->wait_space_event);

        fc->n_flush_cur = 0;

        flash_cache_mutex_exit();
    }

    fc->flush_buf->free_pos = 0;

    return c_flush;
}
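/* The flush cursor above walks a ring: blocks are visited at
pos = (start_offset + i) % fc_size and, after a successful pass,
fc_inc_flush_off() advances flush_off by the number of blocks covered.
The toy model below shows only that wrap-around arithmetic; the cache
size and offsets are invented, and round handling is ignored. */

#include <stdio.h>

enum { FC_SIZE = 8 };

int main(void)
{
    unsigned long flush_off = 6;    /* pretend current flush offset */
    unsigned long n_flush = 5;      /* blocks chosen in step 1       */
    unsigned long i;

    for (i = 0; i < n_flush; i++) {
        printf("flush slot %lu\n", (flush_off + i) % FC_SIZE);
    }
    /* slots visited: 6 7 0 1 2 */

    /* cursor advance after the pass (the real fc_inc_flush_off() also
       maintains a round counter, which is omitted here) */
    flush_off = (flush_off + n_flush) % FC_SIZE;
    printf("new flush_off = %lu\n", flush_off);     /* 3 */
    return 0;
}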