/*************************************************************//** Inserts an entry into a hash table. If an entry with the same fold number is found, its node is updated to point to the new data, and no new node is inserted. If btr_search_enabled is set to FALSE, only updates of existing nodes are allowed; no new node may be added. @return TRUE on success, FALSE if no more memory could be allocated */ UNIV_INTERN ibool ha_insert_for_fold_func( /*====================*/ hash_table_t* table, /*!< in: hash table */ ulint fold, /*!< in: folded value of data; if a node with the same fold value already exists, it is updated to point to the same data, and no new node is created! */ #if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG buf_block_t* block, /*!< in: buffer block containing the data */ #endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */ const rec_t* data) /*!< in: data, must not be NULL */ { hash_cell_t* cell; ha_node_t* node; ha_node_t* prev_node; ulint hash; ut_ad(data); ut_ad(table); ut_ad(table->magic_n == HASH_TABLE_MAGIC_N); #if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG ut_a(block->frame == page_align(data)); #endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */ #ifdef UNIV_SYNC_DEBUG ut_ad(rw_lock_own(block->btr_search_latch, RW_LOCK_EX)); #endif /* UNIV_SYNC_DEBUG */ ASSERT_HASH_MUTEX_OWN(table, fold); ut_ad(btr_search_enabled); hash = hash_calc_hash(fold, table); cell = hash_get_nth_cell(table, hash); prev_node = cell->node; while (prev_node != NULL) { if (prev_node->fold == fold) { #if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG if (table->adaptive) { buf_block_t* prev_block = prev_node->block; ut_a(prev_block->frame == page_align(prev_node->data)); ut_a(prev_block->n_pointers > 0); prev_block->n_pointers--; block->n_pointers++; } prev_node->block = block; #endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */ prev_node->data = data; return(TRUE); } prev_node = prev_node->next; } /* We have to allocate a new chain node */ node = mem_heap_alloc(hash_get_heap(table, fold), sizeof(ha_node_t)); if (node == NULL) { /* It was a btr search type memory heap and at the moment no more memory could be allocated: return */ ut_ad(hash_get_heap(table, fold)->type & MEM_HEAP_BTR_SEARCH); return(FALSE); } ha_node_set_data(node, block, data); #if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG if (table->adaptive) { block->n_pointers++; } #endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */ node->fold = fold; node->next = NULL; prev_node = cell->node; if (prev_node == NULL) { cell->node = node; return(TRUE); } while (prev_node->next != NULL) { prev_node = prev_node->next; } prev_node->next = node; return(TRUE); }
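/* A minimal, self-contained sketch of the insert-or-update semantics documented
above, using a toy chained hash table in place of InnoDB's hash_table_t and
ha_node_t. All names here (toy_node_t, toy_insert, N_CELLS) are hypothetical;
the point is only the "update the node in place if the fold already exists,
otherwise append a new node to the chain" behavior and the FALSE-on-allocation-
failure contract. */
#include <stdio.h>
#include <stdlib.h>

#define N_CELLS 8

typedef struct toy_node_t {
	unsigned long		fold;	/* folded key value */
	const void*		data;	/* user data pointer */
	struct toy_node_t*	next;	/* next node in the chain */
} toy_node_t;

static toy_node_t* cells[N_CELLS];

/* Returns 1 on success, 0 if malloc() fails (mirroring the FALSE return
when the btr search memory heap is exhausted). */
static int
toy_insert(unsigned long fold, const void* data)
{
	toy_node_t**	prev = &cells[fold % N_CELLS];
	toy_node_t*	node;

	for (node = *prev; node != NULL; prev = &node->next, node = *prev) {
		if (node->fold == fold) {
			/* Same fold: update in place, no new node. */
			node->data = data;
			return(1);
		}
	}

	node = malloc(sizeof(*node));
	if (node == NULL) {
		return(0);
	}

	node->fold = fold;
	node->data = data;
	node->next = NULL;
	*prev = node;	/* append at the end of the chain */
	return(1);
}

int
main(void)
{
	toy_insert(42, "first");
	toy_insert(42, "second");	/* updates the existing node */
	printf("fold 42 -> %s\n", (const char*) cells[42 % N_CELLS]->data);
	return(0);
}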
ibool ib_handle_errors( /*=============*/ enum db_err* new_err,/*!< out: possible new error encountered in lock wait, or if no new error, the value of trx->error_state at the entry of this function */ trx_t* trx, /*!< in: transaction */ que_thr_t* thr, /*!< in: query thread */ trx_savept_t* savept) /*!< in: savepoint or NULL */ { enum db_err err; handle_new_error: err = trx->error_state; ut_a(err != DB_SUCCESS); trx->error_state = DB_SUCCESS; switch (err) { case DB_LOCK_WAIT_TIMEOUT: if (ses_rollback_on_timeout) { trx_general_rollback(trx, FALSE, NULL); break; } /* fall through */ case DB_DUPLICATE_KEY: case DB_FOREIGN_DUPLICATE_KEY: case DB_TOO_BIG_RECORD: case DB_ROW_IS_REFERENCED: case DB_NO_REFERENCED_ROW: case DB_CANNOT_ADD_CONSTRAINT: case DB_TOO_MANY_CONCURRENT_TRXS: case DB_OUT_OF_FILE_SPACE: if (savept) { /* Roll back the latest, possibly incomplete insertion or update */ trx_general_rollback(trx, TRUE, savept); } break; case DB_LOCK_WAIT: srv_suspend_user_thread(thr); if (trx->error_state != DB_SUCCESS) { que_thr_stop_client(thr); goto handle_new_error; } *new_err = err; return(TRUE); /* Operation needs to be retried. */ case DB_DEADLOCK: case DB_LOCK_TABLE_FULL: /* Roll back the whole transaction; this resolution was added to version 3.23.43 */ trx_general_rollback(trx, FALSE, NULL); break; case DB_MUST_GET_MORE_FILE_SPACE: srv_panic(DB_ERROR, "InnoDB: The database cannot continue" " operation because of\n" "InnoDB: lack of space. You must add" " a new data file\n" "InnoDB: and restart the database.\n"); break; case DB_CORRUPTION: ib_logger(ib_stream, "InnoDB: We detected index corruption" " in an InnoDB type table.\n" "InnoDB: You have to dump + drop + reimport" " the table or, in\n" "InnoDB: a case of widespread corruption," " dump all InnoDB\n" "InnoDB: tables and recreate the" " whole InnoDB tablespace.\n" "InnoDB: If the server crashes" " after the startup or when\n" "InnoDB: you dump the tables, check the \n" "InnoDB: InnoDB website for help.\n"); break; default: ib_logger(ib_stream, "InnoDB: unknown error code %lu\n", (ulong) err); ut_error; } if (trx->error_state != DB_SUCCESS) { *new_err = trx->error_state; } else { *new_err = err; } trx->error_state = DB_SUCCESS; return(FALSE); }
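/* A hedged sketch of the retry contract above: ib_handle_errors() returns
TRUE only for the resolved-lock-wait case, meaning the row operation should
be retried; every other error path rolls back (partially or fully) and
returns FALSE. The toy_* names below are invented stand-ins for illustration,
not the real API. */
#include <stdio.h>

enum toy_err { TOY_SUCCESS, TOY_LOCK_WAIT, TOY_DEADLOCK };

/* Returns 1 ("retry") only after a lock wait has been resolved. */
static int
toy_handle_errors(enum toy_err err)
{
	switch (err) {
	case TOY_LOCK_WAIT:
		return(1);	/* wait resolved: retry the operation */
	default:
		return(0);	/* rolled back: report to the caller */
	}
}

int
main(void)
{
	/* Simulated outcomes of successive attempts at one row operation. */
	enum toy_err	errs[] = { TOY_LOCK_WAIT, TOY_LOCK_WAIT, TOY_SUCCESS };
	int		i = 0;

	while (errs[i] != TOY_SUCCESS && toy_handle_errors(errs[i])) {
		printf("retrying after lock wait %d\n", i);
		i++;
	}
	return(0);
}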
/*******************************************************************//** Builds a partial row from an update undo log record. It contains the columns which occur as ordering in any index of the table. @return pointer to remaining part of undo record */ UNIV_INTERN byte* trx_undo_rec_get_partial_row( /*=========================*/ byte* ptr, /*!< in: remaining part in update undo log record of a suitable type, at the start of the stored index columns; NOTE that this copy of the undo log record must be preserved as long as the partial row is used, as we do NOT copy the data in the record! */ dict_index_t* index, /*!< in: clustered index */ dtuple_t** row, /*!< out, own: partial row */ ibool ignore_prefix, /*!< in: flag to indicate if we expect blob prefixes in undo. Used only in the assertion. */ mem_heap_t* heap) /*!< in: memory heap from which the memory needed is allocated */ { const byte* end_ptr; ulint row_len; ut_ad(index); ut_ad(ptr); ut_ad(row); ut_ad(heap); ut_ad(dict_index_is_clust(index)); row_len = dict_table_get_n_cols(index->table); *row = dtuple_create(heap, row_len); dict_table_copy_types(*row, index->table); end_ptr = ptr + mach_read_from_2(ptr); ptr += 2; while (ptr != end_ptr) { dfield_t* dfield; byte* field; ulint field_no; const dict_col_t* col; ulint col_no; ulint len; ulint orig_len; ptr = trx_undo_update_rec_get_field_no(ptr, &field_no); col = dict_index_get_nth_col(index, field_no); col_no = dict_col_get_no(col); ptr = trx_undo_rec_get_col_val(ptr, &field, &len, &orig_len); dfield = dtuple_get_nth_field(*row, col_no); dfield_set_data(dfield, field, len); if (len != UNIV_SQL_NULL && len >= UNIV_EXTERN_STORAGE_FIELD) { dfield_set_len(dfield, len - UNIV_EXTERN_STORAGE_FIELD); dfield_set_ext(dfield); /* If the prefix of this column is indexed, ensure that enough prefix is stored in the undo log record. */ ut_a(ignore_prefix || !col->ord_part || dfield_get_len(dfield) >= REC_MAX_INDEX_COL_LEN + BTR_EXTERN_FIELD_REF_SIZE); } } return(ptr); }
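/* A simplified model of the framing used above: the first two bytes of the
stored-columns area hold a big-endian offset to the end of the area, and
fields are consumed until ptr reaches end_ptr. The per-field layout below
(2-byte field number, 2-byte length) is invented for illustration; the real
undo log format stores these as compressed integers. */
#include <stdio.h>

static unsigned
read_2(const unsigned char* p)
{
	return(((unsigned) p[0] << 8) | p[1]);
}

int
main(void)
{
	/* 2-byte total-size prefix (10) followed by two zero-length fields. */
	static const unsigned char	rec[] = {
		0x00, 0x0A,		/* offset to end of stored columns */
		0x00, 0x01, 0x00, 0x00,	/* field_no 1, len 0 */
		0x00, 0x03, 0x00, 0x00	/* field_no 3, len 0 */
	};
	const unsigned char*	ptr = rec;
	const unsigned char*	end_ptr = rec + read_2(rec);

	ptr += 2;
	while (ptr != end_ptr) {
		unsigned	field_no = read_2(ptr); ptr += 2;
		unsigned	len = read_2(ptr); ptr += 2;

		printf("field %u, len %u\n", field_no, len);
		ptr += len;	/* skip the field data */
	}
	return(0);
}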
void buf_LRU_invalidate_tablespace( /*==========================*/ ulint id) /* in: space id */ { buf_block_t* block; ulint page_no; ibool all_freed; /* Before we attempt to drop pages one by one we first attempt to drop page hash index entries in batches to make it more efficient. The batching attempt is a best-effort attempt and does not guarantee that all page hash entries will be dropped. We get rid of remaining page hash entries one by one below. */ buf_LRU_drop_page_hash_for_tablespace(id); scan_again: mutex_enter(&(buf_pool->mutex)); all_freed = TRUE; block = UT_LIST_GET_LAST(buf_pool->LRU); while (block != NULL) { buf_block_t* prev_block; mutex_enter(&block->mutex); prev_block = UT_LIST_GET_PREV(LRU, block); ut_a(block->state == BUF_BLOCK_FILE_PAGE); if (block->space == id && (block->buf_fix_count > 0 || block->io_fix != 0)) { /* We cannot remove this page during this scan yet; maybe the system is currently reading it in, or flushing the modifications to the file */ all_freed = FALSE; goto next_page; } if (block->space == id) { #ifdef UNIV_DEBUG if (buf_debug_prints) { fprintf(stderr, "Dropping space %lu page %lu\n", (ulong) block->space, (ulong) block->offset); } #endif if (block->is_hashed) { page_no = block->offset; mutex_exit(&block->mutex); mutex_exit(&(buf_pool->mutex)); /* Note that the following call will acquire an S-latch on the page */ btr_search_drop_page_hash_when_freed(id, page_no); goto scan_again; } if (0 != ut_dulint_cmp(block->oldest_modification, ut_dulint_zero)) { /* Remove from the flush list of modified blocks */ block->oldest_modification = ut_dulint_zero; UT_LIST_REMOVE(flush_list, buf_pool->flush_list, block); } /* Remove from the LRU list */ buf_LRU_block_remove_hashed_page(block); buf_LRU_block_free_hashed_page(block); } next_page: mutex_exit(&block->mutex); block = prev_block; } mutex_exit(&(buf_pool->mutex)); if (!all_freed) { os_thread_sleep(20000); goto scan_again; } }
/********************************************************************** Adds a block to the LRU list. */ UNIV_INLINE void buf_LRU_add_block_low( /*==================*/ buf_block_t* block, /* in: control block */ ibool old) /* in: TRUE if should be put to the old blocks in the LRU list, else put to the start; if the LRU list is very short, the block is added to the start, regardless of this parameter */ { ulint cl; ut_ad(buf_pool); ut_ad(block); ut_ad(mutex_own(&(buf_pool->mutex))); ut_a(block->state == BUF_BLOCK_FILE_PAGE); ut_a(!block->in_LRU_list); block->old = old; cl = buf_pool_clock_tic(); if (srv_use_awe && block->frame) { /* Add to the list of mapped pages; for simplicity we always add to the start, even if the user would have set 'old' TRUE */ UT_LIST_ADD_FIRST(awe_LRU_free_mapped, buf_pool->awe_LRU_free_mapped, block); } if (!old || (UT_LIST_GET_LEN(buf_pool->LRU) < BUF_LRU_OLD_MIN_LEN)) { UT_LIST_ADD_FIRST(LRU, buf_pool->LRU, block); block->LRU_position = cl; block->freed_page_clock = buf_pool->freed_page_clock; } else { UT_LIST_INSERT_AFTER(LRU, buf_pool->LRU, buf_pool->LRU_old, block); buf_pool->LRU_old_len++; /* We copy the LRU position field of the previous block to the new block */ block->LRU_position = (buf_pool->LRU_old)->LRU_position; } block->in_LRU_list = TRUE; if (UT_LIST_GET_LEN(buf_pool->LRU) > BUF_LRU_OLD_MIN_LEN) { ut_ad(buf_pool->LRU_old); /* Adjust the length of the old block list if necessary */ buf_LRU_old_adjust_len(); } else if (UT_LIST_GET_LEN(buf_pool->LRU) == BUF_LRU_OLD_MIN_LEN) { /* The LRU list is now long enough for LRU_old to become defined: init it */ buf_LRU_old_init(); } }
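/* A toy doubly-linked LRU list illustrating the two insertion points above:
a new block goes to the head (the "new" end) unless it is flagged old and the
list is already long enough, in which case it is inserted directly behind the
LRU_old midpoint pointer. OLD_MIN_LEN and node_t are simplified stand-ins for
BUF_LRU_OLD_MIN_LEN and buf_block_t; the real code also adjusts the length of
the old sublist after every insertion, which is omitted here. */
#include <stdio.h>
#include <stdlib.h>

#define OLD_MIN_LEN 4	/* stand-in for BUF_LRU_OLD_MIN_LEN */

typedef struct node_t {
	int		id;
	struct node_t*	prev;
	struct node_t*	next;
} node_t;

static node_t*	head;		/* most recently used end */
static node_t*	lru_old;	/* midpoint: first block of the old sublist */
static int	len;

static void
lru_add(int id, int old)
{
	node_t*	n = calloc(1, sizeof(*n));

	n->id = id;
	if (!old || len < OLD_MIN_LEN || lru_old == NULL) {
		/* Add to the head of the list. */
		n->next = head;
		if (head) head->prev = n;
		head = n;
	} else {
		/* Insert directly behind the midpoint pointer. */
		n->prev = lru_old;
		n->next = lru_old->next;
		if (lru_old->next) lru_old->next->prev = n;
		lru_old->next = n;
	}
	len++;
	if (len == OLD_MIN_LEN) {
		/* The list just became long enough for the old sublist to
		be defined. InnoDB positions LRU_old by walking the list;
		for brevity we simply pick the current head. */
		lru_old = head;
	}
}

int
main(void)
{
	node_t*	p;
	int	i;

	for (i = 0; i < 6; i++) {
		lru_add(i, i % 2);	/* alternate new/old insertions */
	}
	for (p = head; p != NULL; p = p->next) {
		printf("%d%s ", p->id, p == lru_old ? "*" : "");
	}
	printf("\n");
	return(0);
}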
/*************************************************************** Updates a clustered index record of a row when the ordering fields do not change. */ static ulint row_upd_clust_rec( /*==============*/ /* out: DB_SUCCESS if operation successfully completed, else error code or DB_LOCK_WAIT */ upd_node_t* node, /* in: row update node */ dict_index_t* index, /* in: clustered index */ que_thr_t* thr, /* in: query thread */ mtr_t* mtr) /* in: mtr; gets committed here */ { big_rec_t* big_rec = NULL; btr_pcur_t* pcur; btr_cur_t* btr_cur; ulint err; ut_ad(node); ut_ad(index->type & DICT_CLUSTERED); pcur = node->pcur; btr_cur = btr_pcur_get_btr_cur(pcur); ut_ad(FALSE == rec_get_deleted_flag(btr_pcur_get_rec(pcur))); /* Try optimistic updating of the record, keeping changes within the page; we do not check locks because we assume the x-lock on the record to update */ if (node->cmpl_info & UPD_NODE_NO_SIZE_CHANGE) { err = btr_cur_update_in_place(BTR_NO_LOCKING_FLAG, btr_cur, node->update, node->cmpl_info, thr, mtr); } else { err = btr_cur_optimistic_update(BTR_NO_LOCKING_FLAG, btr_cur, node->update, node->cmpl_info, thr, mtr); } mtr_commit(mtr); if (err == DB_SUCCESS) { return(err); } /* We may have to modify the tree structure: do a pessimistic descent down the index tree */ mtr_start(mtr); /* NOTE: this transaction has an s-lock or x-lock on the record and therefore other transactions cannot modify the record when we have no latch on the page. In addition, we assume that other query threads of the same transaction do not modify the record in the meantime. Therefore we can assert that the restoration of the cursor succeeds. */ ut_a(btr_pcur_restore_position(BTR_MODIFY_TREE, pcur, mtr)); ut_ad(FALSE == rec_get_deleted_flag(btr_pcur_get_rec(pcur))); err = btr_cur_pessimistic_update(BTR_NO_LOCKING_FLAG, btr_cur, &big_rec, node->update, node->cmpl_info, thr, mtr); mtr_commit(mtr); if (err == DB_SUCCESS && big_rec) { mtr_start(mtr); ut_a(btr_pcur_restore_position(BTR_MODIFY_TREE, pcur, mtr)); err = btr_store_big_rec_extern_fields(index, btr_cur_get_rec(btr_cur), big_rec, mtr); mtr_commit(mtr); } if (big_rec) { dtuple_big_rec_free(big_rec); } return(err); }
upd_t* row_upd_build_difference_binary( /*============================*/ /* out, own: update vector of differing fields, excluding roll ptr and trx id */ dict_index_t* index, /* in: clustered index */ dtuple_t* entry, /* in: entry to insert */ ulint* ext_vec,/* in: array containing field numbers of externally stored fields in entry, or NULL */ ulint n_ext_vec,/* in: number of fields in ext_vec */ rec_t* rec, /* in: clustered index record */ mem_heap_t* heap) /* in: memory heap from which allocated */ { upd_field_t* upd_field; dfield_t* dfield; byte* data; ulint len; upd_t* update; ulint n_diff; ulint roll_ptr_pos; ulint trx_id_pos; ulint i; /* This function is used only for a clustered index */ ut_a(index->type & DICT_CLUSTERED); update = upd_create(dtuple_get_n_fields(entry), heap); n_diff = 0; roll_ptr_pos = dict_index_get_sys_col_pos(index, DATA_ROLL_PTR); trx_id_pos = dict_index_get_sys_col_pos(index, DATA_TRX_ID); for (i = 0; i < dtuple_get_n_fields(entry); i++) { data = rec_get_nth_field(rec, i, &len); dfield = dtuple_get_nth_field(entry, i); /* NOTE: we compare the fields as binary strings! (No collation) */ if (i == trx_id_pos || i == roll_ptr_pos) { goto skip_compare; } if (rec_get_nth_field_extern_bit(rec, i) != upd_ext_vec_contains(ext_vec, n_ext_vec, i) || !dfield_data_is_binary_equal(dfield, len, data)) { upd_field = upd_get_nth_field(update, n_diff); dfield_copy(&(upd_field->new_val), dfield); upd_field_set_field_no(upd_field, i, index); if (upd_ext_vec_contains(ext_vec, n_ext_vec, i)) { upd_field->extern_storage = TRUE; } else { upd_field->extern_storage = FALSE; } n_diff++; } skip_compare: ; } update->n_fields = n_diff; return(update); }
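/* A sketch of the "compare the fields as binary strings" rule above: a field
enters the update vector exactly when its length or its bytes differ. The two
rows here are plain byte-string arrays; update vector construction, the system
columns (trx id, roll ptr), and the extern-bit check are omitted. */
#include <stdio.h>
#include <string.h>

int
main(void)
{
	const char*	rec[]   = { "alice", "25", "oslo" };	/* stored row */
	const char*	entry[] = { "alice", "26", "oslo" };	/* new row */
	size_t		i;

	for (i = 0; i < sizeof(rec) / sizeof(rec[0]); i++) {
		size_t	len = strlen(rec[i]);

		/* Binary comparison: no collation is consulted. */
		if (len != strlen(entry[i])
		    || memcmp(rec[i], entry[i], len) != 0) {
			printf("field %lu differs: %s -> %s\n",
			       (unsigned long) i, rec[i], entry[i]);
		}
	}
	return(0);
}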
/**************************************************************//** Moves parts of long fields in entry to the big record vector so that the size of tuple drops below the maximum record size allowed in the database. Moves data only from those fields which are not necessary to determine uniquely the insertion place of the tuple in the index. @return own: created big record vector, NULL if we are not able to shorten the entry enough, i.e., if there are too many fixed-length or short fields in entry or the index is not clustered */ UNIV_INTERN big_rec_t* dtuple_convert_big_rec( /*===================*/ dict_index_t* index, /*!< in: index */ dtuple_t* entry, /*!< in/out: index entry */ ulint* n_ext) /*!< in/out: number of externally stored columns */ { mem_heap_t* heap; big_rec_t* vector; dfield_t* dfield; dict_field_t* ifield; ulint size; ulint n_fields; ulint local_len; ulint local_prefix_len; if (UNIV_UNLIKELY(!dict_index_is_clust(index))) { return(NULL); } if (dict_table_get_format(index->table) < DICT_TF_FORMAT_ZIP) { /* up to MySQL 5.1: store a 768-byte prefix locally */ local_len = BTR_EXTERN_FIELD_REF_SIZE + DICT_ANTELOPE_MAX_INDEX_COL_LEN; } else { /* new-format table: do not store any BLOB prefix locally */ local_len = BTR_EXTERN_FIELD_REF_SIZE; } ut_a(dtuple_check_typed_no_assert(entry)); size = rec_get_converted_size(index, entry, *n_ext); if (UNIV_UNLIKELY(size > 1000000000)) { fprintf(stderr, "InnoDB: Warning: tuple size very big: %lu\n", (ulong) size); fputs("InnoDB: Tuple contents: ", stderr); dtuple_print(stderr, entry); putc('\n', stderr); } heap = mem_heap_create(size + dtuple_get_n_fields(entry) * sizeof(big_rec_field_t) + 1000); vector = mem_heap_alloc(heap, sizeof(big_rec_t)); vector->heap = heap; vector->fields = mem_heap_alloc(heap, dtuple_get_n_fields(entry) * sizeof(big_rec_field_t)); /* Decide which fields to shorten: the algorithm is to look for a variable-length field that yields the biggest savings when stored externally */ n_fields = 0; while (page_zip_rec_needs_ext(rec_get_converted_size(index, entry, *n_ext), dict_table_is_comp(index->table), dict_index_get_n_fields(index), dict_table_zip_size(index->table))) { ulint i; ulint longest = 0; ulint longest_i = ULINT_MAX; byte* data; big_rec_field_t* b; for (i = dict_index_get_n_unique_in_tree(index); i < dtuple_get_n_fields(entry); i++) { ulint savings; dfield = dtuple_get_nth_field(entry, i); ifield = dict_index_get_nth_field(index, i); /* Skip fixed-length, NULL, externally stored, or short columns */ if (ifield->fixed_len || dfield_is_null(dfield) || dfield_is_ext(dfield) || dfield_get_len(dfield) <= local_len || dfield_get_len(dfield) <= BTR_EXTERN_FIELD_REF_SIZE * 2) { goto skip_field; } savings = dfield_get_len(dfield) - local_len; /* Check that there would be savings */ if (longest >= savings) { goto skip_field; } /* In DYNAMIC and COMPRESSED format, store locally any non-BLOB columns whose maximum length does not exceed 256 bytes. This is because there is no room for the "external storage" flag when the maximum length is 255 bytes or less. This restriction trivially holds in REDUNDANT and COMPACT format, because there we always store locally columns whose length is up to local_len == 788 bytes. @see rec_init_offsets_comp_ordinary */ if (ifield->col->mtype != DATA_BLOB && ifield->col->len < 256) { goto skip_field; } longest_i = i; longest = savings; skip_field: continue; } if (!longest) { /* Cannot shorten more */ mem_heap_free(heap); return(NULL); } /* Move data from field longest_i to big rec vector. 
We store the first bytes locally to the record. Then we can calculate all ordering fields in all indexes from locally stored data. */ dfield = dtuple_get_nth_field(entry, longest_i); ifield = dict_index_get_nth_field(index, longest_i); local_prefix_len = local_len - BTR_EXTERN_FIELD_REF_SIZE; b = &vector->fields[n_fields]; b->field_no = longest_i; b->len = dfield_get_len(dfield) - local_prefix_len; b->data = (char*) dfield_get_data(dfield) + local_prefix_len; /* Allocate the locally stored part of the column. */ data = mem_heap_alloc(heap, local_len); /* Copy the local prefix. */ memcpy(data, dfield_get_data(dfield), local_prefix_len); /* Clear the extern field reference (BLOB pointer). */ memset(data + local_prefix_len, 0, BTR_EXTERN_FIELD_REF_SIZE); #if 0 /* The following would fail the Valgrind checks in page_cur_insert_rec_low() and page_cur_insert_rec_zip(). The BLOB pointers in the record will be initialized after the record and the BLOBs have been written. */ UNIV_MEM_ALLOC(data + local_prefix_len, BTR_EXTERN_FIELD_REF_SIZE); #endif dfield_set_data(dfield, data, local_len); dfield_set_ext(dfield); n_fields++; (*n_ext)++; ut_ad(n_fields < dtuple_get_n_fields(entry)); } vector->n_fields = n_fields; return(vector); }
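/* A self-contained sketch of the local-prefix layout built above: the locally
stored part of a long column is local_prefix_len bytes of column data followed
by a BTR_EXTERN_FIELD_REF_SIZE (20-byte) field reference that is zero-filled
at conversion time and only initialized after the BLOB pages have been
written. The sizes mirror the Antelope case (768-byte prefix). */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define FIELD_REF_SIZE	20	/* BTR_EXTERN_FIELD_REF_SIZE */
#define PREFIX_LEN	768	/* DICT_ANTELOPE_MAX_INDEX_COL_LEN */

int
main(void)
{
	size_t		col_len = 10000;	/* a long column */
	unsigned char*	col = malloc(col_len);
	size_t		local_len = PREFIX_LEN + FIELD_REF_SIZE;
	unsigned char*	local = malloc(local_len);
	size_t		ext_len;

	memset(col, 'x', col_len);

	/* Local part: the prefix bytes plus a zeroed BLOB pointer. */
	memcpy(local, col, PREFIX_LEN);
	memset(local + PREFIX_LEN, 0, FIELD_REF_SIZE);

	/* Externally stored part: everything after the prefix. */
	ext_len = col_len - PREFIX_LEN;

	printf("local %lu bytes, external %lu bytes\n",
	       (unsigned long) local_len, (unsigned long) ext_len);
	free(col);
	free(local);
	return(0);
}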
/******************************************************************//** Reserves a wait array cell for waiting for an object. The event of the cell is reset to nonsignalled state. */ UNIV_INTERN void sync_array_reserve_cell( /*====================*/ sync_array_t* arr, /*!< in: wait array */ void* object, /*!< in: pointer to the object to wait for */ ulint type, /*!< in: lock request type */ const char* file, /*!< in: file where requested */ ulint line, /*!< in: line where requested */ ulint* index) /*!< out: index of the reserved cell */ { sync_cell_t* cell; os_event_t event; ulint i; ut_a(object); ut_a(index); sync_array_enter(arr); arr->res_count++; /* Reserve a new cell. */ for (i = 0; i < arr->n_cells; i++) { cell = sync_array_get_nth_cell(arr, i); if (cell->wait_object == NULL) { cell->waiting = FALSE; cell->wait_object = object; if (type == SYNC_MUTEX) { cell->old_wait_mutex = object; } else { cell->old_wait_rw_lock = object; } cell->request_type = type; cell->file = file; cell->line = line; arr->n_reserved++; *index = i; sync_array_exit(arr); /* Make sure the event is reset and also store the value of signal_count at which the event was reset. */ event = sync_cell_get_event(cell); cell->signal_count = os_event_reset(event); cell->reservation_time = time(NULL); cell->thread = os_thread_get_curr_id(); return; } } ut_error; /* No free cell found */ return; }
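/* A toy version of the reservation loop above: scan a fixed array under a
mutex and claim the first free cell, a cell being free when its wait_object
is NULL. The pthread calls are real; cell_t is a simplified stand-in for
sync_cell_t, and exhaustion returns -1 here where the real code treats it
as fatal (ut_error). */
#include <pthread.h>
#include <stdio.h>

#define N_CELLS 4

typedef struct {
	void*	wait_object;	/* NULL means the cell is free */
	int	line;		/* line where the wait was requested */
} cell_t;

static cell_t		cells[N_CELLS];
static pthread_mutex_t	arr_mutex = PTHREAD_MUTEX_INITIALIZER;

static int
reserve_cell(void* object, int line)
{
	int	i;

	pthread_mutex_lock(&arr_mutex);
	for (i = 0; i < N_CELLS; i++) {
		if (cells[i].wait_object == NULL) {
			cells[i].wait_object = object;
			cells[i].line = line;
			pthread_mutex_unlock(&arr_mutex);
			return(i);
		}
	}
	pthread_mutex_unlock(&arr_mutex);
	return(-1);	/* no free cell found */
}

int
main(void)
{
	int	dummy;

	printf("reserved cell %d\n", reserve_cell(&dummy, __LINE__));
	return(0);
}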
/*****************************************************************//** Based on a table object, this function builds the entry to be inserted in the SYS_TABLES system table. @return the tuple which should be inserted */ static dtuple_t* dict_create_sys_tables_tuple( /*=========================*/ const dict_table_t* table, /*!< in: table */ mem_heap_t* heap) /*!< in: memory heap from which the memory for the built tuple is allocated */ { dict_table_t* sys_tables; dtuple_t* entry; dfield_t* dfield; byte* ptr; ut_ad(table); ut_ad(heap); sys_tables = dict_sys->sys_tables; entry = dtuple_create(heap, 8 + DATA_N_SYS_COLS); dict_table_copy_types(entry, sys_tables); /* 0: NAME -----------------------------*/ dfield = dtuple_get_nth_field(entry, 0/*NAME*/); dfield_set_data(dfield, table->name, ut_strlen(table->name)); /* 3: ID -------------------------------*/ dfield = dtuple_get_nth_field(entry, 1/*ID*/); ptr = mem_heap_alloc(heap, 8); mach_write_to_8(ptr, table->id); dfield_set_data(dfield, ptr, 8); /* 4: N_COLS ---------------------------*/ dfield = dtuple_get_nth_field(entry, 2/*N_COLS*/); #if DICT_TF_COMPACT != 1 #error #endif ptr = mem_heap_alloc(heap, 4); if (dict_table_is_gcs(table)) /* table definition modified */ { ut_ad(dict_table_is_comp(table)); mach_write_to_4(ptr, table->n_def | (1 << 31) | (1 << 30)); } else { mach_write_to_4(ptr, table->n_def | ((table->flags & DICT_TF_COMPACT) << 31)); } dfield_set_data(dfield, ptr, 4); /* 5: TYPE -----------------------------*/ dfield = dtuple_get_nth_field(entry, 3/*TYPE*/); ptr = mem_heap_alloc(heap, 4); if (table->flags & (~DICT_TF_COMPACT & ~(~0 << DICT_TF_BITS))) { ut_a(table->flags & DICT_TF_COMPACT); ut_a(dict_table_get_format(table) >= DICT_TF_FORMAT_ZIP); ut_a((table->flags & DICT_TF_ZSSIZE_MASK) <= (DICT_TF_ZSSIZE_MAX << DICT_TF_ZSSIZE_SHIFT)); ut_a(!(table->flags & (~0 << DICT_TF2_BITS))); mach_write_to_4(ptr, table->flags & ~(~0 << DICT_TF_BITS)); } else { mach_write_to_4(ptr, DICT_TABLE_ORDINARY); } dfield_set_data(dfield, ptr, 4); /* 6: MIX_ID (obsolete) ---------------------------*/ dfield = dtuple_get_nth_field(entry, 4/*MIX_ID*/); ptr = mem_heap_zalloc(heap, 8); dfield_set_data(dfield, ptr, 8); /* 7: MIX_LEN (additional flags) --------------------------*/ dfield = dtuple_get_nth_field(entry, 5/*MIX_LEN*/); ptr = mem_heap_alloc(heap, 4); mach_write_to_4(ptr, table->flags >> DICT_TF2_SHIFT); ut_ad(table->n_cols_before_alter_table == 0); dfield_set_data(dfield, ptr, 4); /* 8: CLUSTER_NAME ---------------------*/ dfield = dtuple_get_nth_field(entry, 6/*CLUSTER_NAME*/); dfield_set_null(dfield); /* not supported */ /* 9: SPACE ----------------------------*/ dfield = dtuple_get_nth_field(entry, 7/*SPACE*/); ptr = mem_heap_alloc(heap, 4); mach_write_to_4(ptr, table->space); dfield_set_data(dfield, ptr, 4); /*----------------------------------*/ return(entry); }
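/* A sketch of the N_COLS encoding written above: bit 31 of the stored word
marks the COMPACT row format and, in this patched tree, bit 30 additionally
marks a GCS table, so the column count must be recovered by masking both flag
bits off. Plain C with explicitly unsigned arithmetic; note that the
(1 << 31) in the original relies on implementation-defined signed shifts,
whereas 1U << 31 below is well defined. */
#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	uint32_t	n_cols = 17;
	uint32_t	encoded = n_cols | (1U << 31) | (1U << 30);

	printf("compact=%u gcs=%u n_cols=%u\n",
	       (unsigned) (encoded >> 31),		/* row format flag */
	       (unsigned) ((encoded >> 30) & 1),	/* GCS flag */
	       (unsigned) (encoded & 0x3FFFFFFF));	/* column count */
	return(0);
}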
/*******************************************************************//** Truncates the index tree associated with a row in SYS_INDEXES table. @return new root page number, or FIL_NULL on failure */ UNIV_INTERN ulint dict_truncate_index_tree( /*=====================*/ dict_table_t* table, /*!< in: the table the index belongs to */ ulint space, /*!< in: 0=truncate, nonzero=create the index tree in the given tablespace */ btr_pcur_t* pcur, /*!< in/out: persistent cursor pointing to record in the clustered index of SYS_INDEXES table. The cursor may be repositioned in this call. */ mtr_t* mtr) /*!< in: mtr having the latch on the record page. The mtr may be committed and restarted in this call. */ { ulint root_page_no; ibool drop = !space; ulint zip_size; ulint type; index_id_t index_id; rec_t* rec; const byte* ptr; ulint len; dict_index_t* index; ut_ad(mutex_own(&(dict_sys->mutex))); ut_a(!dict_table_is_comp(dict_sys->sys_indexes)); rec = btr_pcur_get_rec(pcur); ptr = rec_get_nth_field_old(rec, DICT_SYS_INDEXES_PAGE_NO_FIELD, &len); ut_ad(len == 4); root_page_no = mtr_read_ulint(ptr, MLOG_4BYTES, mtr); if (drop && root_page_no == FIL_NULL) { /* The tree has been freed. */ ut_print_timestamp(stderr); fprintf(stderr, " InnoDB: Trying to TRUNCATE" " a missing index of table %s!\n", table->name); drop = FALSE; } ptr = rec_get_nth_field_old(rec, DICT_SYS_INDEXES_SPACE_NO_FIELD, &len); ut_ad(len == 4); if (drop) { space = mtr_read_ulint(ptr, MLOG_4BYTES, mtr); } zip_size = fil_space_get_zip_size(space); if (UNIV_UNLIKELY(zip_size == ULINT_UNDEFINED)) { /* It is a single table tablespace and the .ibd file is missing: do nothing */ ut_print_timestamp(stderr); fprintf(stderr, " InnoDB: Trying to TRUNCATE" " a missing .ibd file of table %s!\n", table->name); return(FIL_NULL); } ptr = rec_get_nth_field_old(rec, DICT_SYS_INDEXES_TYPE_FIELD, &len); ut_ad(len == 4); type = mach_read_from_4(ptr); ptr = rec_get_nth_field_old(rec, 1, &len); ut_ad(len == 8); index_id = mach_read_from_8(ptr); if (!drop) { goto create; } /* We free all the pages but the root page first; this operation may span several mini-transactions */ btr_free_but_not_root(space, zip_size, root_page_no); /* Then we free the root page in the same mini-transaction where we create the b-tree and write its new root page number to the appropriate field in the SYS_INDEXES record: this mini-transaction marks the B-tree totally truncated */ btr_block_get(space, zip_size, root_page_no, RW_X_LATCH, NULL, mtr); btr_free_root(space, zip_size, root_page_no, mtr); create: /* We will temporarily write FIL_NULL to the PAGE_NO field in SYS_INDEXES, so that the database will not get into an inconsistent state in case it crashes between the mtr_commit() below and the following mtr_commit() call. */ page_rec_write_field(rec, DICT_SYS_INDEXES_PAGE_NO_FIELD, FIL_NULL, mtr); /* We will need to commit the mini-transaction in order to avoid deadlocks in the btr_create() call, because otherwise we would be freeing and allocating pages in the same mini-transaction. */ btr_pcur_store_position(pcur, mtr); mtr_commit(mtr); mtr_start(mtr); btr_pcur_restore_position(BTR_MODIFY_LEAF, pcur, mtr); /* Find the index corresponding to this SYS_INDEXES record. 
*/ for (index = UT_LIST_GET_FIRST(table->indexes); index; index = UT_LIST_GET_NEXT(indexes, index)) { if (index->id == index_id) { root_page_no = btr_create(type, space, zip_size, index_id, index, mtr); index->page = (unsigned int) root_page_no; return(root_page_no); } } ut_print_timestamp(stderr); fprintf(stderr, " InnoDB: Index %llu of table %s is missing\n" "InnoDB: from the data dictionary during TRUNCATE!\n", (ullint) index_id, table->name); return(FIL_NULL); }
/****************************************************************//** Creates the SYS_ADDED_COLS_DEFAULT table inside InnoDB at database creation or database start if it is not found or is not of the right form. @return DB_SUCCESS or error code */ UNIV_INTERN ulint dict_create_or_check_added_cols_default_tables(void) /*================================================*/ { dict_table_t* table1; ulint error; trx_t* trx; mutex_enter(&(dict_sys->mutex)); table1 = dict_table_get_low("SYS_ADDED_COLS_DEFAULT"); if (table1 && UT_LIST_GET_LEN(table1->indexes) == 1) { /* The SYS_ADDED_COLS_DEFAULT table has already been created, and it is OK */ mutex_exit(&(dict_sys->mutex)); #ifdef DEBUG fprintf(stderr, "SYS_ADDED_COLS_DEFAULT table has already been created\n"); #endif return(DB_SUCCESS); } #ifdef DEBUG fprintf(stderr, "SYS_ADDED_COLS_DEFAULT table has not been created yet\n"); #endif mutex_exit(&(dict_sys->mutex)); trx = trx_allocate_for_mysql(); trx->op_info = "creating added cols default"; row_mysql_lock_data_dictionary(trx); /* The SYS_ADDED_COLS_DEFAULT table was created, but is not of the right form: drop it and then recreate it */ if (table1) { fprintf(stderr, "InnoDB: dropping incompletely created" " SYS_ADDED_COLS_DEFAULT table\n"); row_drop_table_for_mysql("SYS_ADDED_COLS_DEFAULT", trx, TRUE); } fprintf(stderr, "InnoDB: Creating SYS_ADDED_COLS_DEFAULT for fast add columns with default values\n"); /* NOTE: We create the added cols default table here. sys_added_cols_default( table_id int, //table id pos int, // column id def_val varbinary, //default value def_val_len int //default value's length ) */ error = que_eval_sql(NULL, "PROCEDURE CREATE_ADDED_COLS_DEFAULT_SYS_TABLES_PROC () IS\n" "BEGIN\n" "CREATE TABLE\n" "SYS_ADDED_COLS_DEFAULT(TABLE_ID BINARY(8), POS INT," " DEF_VAL BINARY(65535), DEF_VAL_LEN INT);\n" "CREATE UNIQUE CLUSTERED INDEX TID_POS" " ON SYS_ADDED_COLS_DEFAULT (TABLE_ID,POS);\n" "END;\n" , FALSE, trx); if (error != DB_SUCCESS) { fprintf(stderr, "InnoDB: error %lu in creation\n", (ulong) error); ut_a(error == DB_OUT_OF_FILE_SPACE || error == DB_TOO_MANY_CONCURRENT_TRXS); fprintf(stderr, "InnoDB: creation failed\n" "InnoDB: tablespace is full\n" "InnoDB: dropping incompletely created" " SYS_ADDED_COLS_DEFAULT table\n"); row_drop_table_for_mysql("SYS_ADDED_COLS_DEFAULT", trx, TRUE); error = DB_MUST_GET_MORE_FILE_SPACE; } trx_commit_for_mysql(trx); row_mysql_unlock_data_dictionary(trx); trx_free_for_mysql(trx); if (error == DB_SUCCESS) { fprintf(stderr, "InnoDB: Added columns default system table" " created\n"); } return(error); }
/****************************************************************//** Creates the foreign key constraints system tables inside InnoDB at database creation or database start if they are not found or are not of the right form. @return DB_SUCCESS or error code */ UNIV_INTERN ulint dict_create_or_check_foreign_constraint_tables(void) /*================================================*/ { dict_table_t* table1; dict_table_t* table2; ulint error; trx_t* trx; mutex_enter(&(dict_sys->mutex)); table1 = dict_table_get_low("SYS_FOREIGN"); table2 = dict_table_get_low("SYS_FOREIGN_COLS"); if (table1 && table2 && UT_LIST_GET_LEN(table1->indexes) == 3 && UT_LIST_GET_LEN(table2->indexes) == 1) { /* Foreign constraint system tables have already been created, and they are ok */ mutex_exit(&(dict_sys->mutex)); return(DB_SUCCESS); } mutex_exit(&(dict_sys->mutex)); trx = trx_allocate_for_mysql(); trx->op_info = "creating foreign key sys tables"; row_mysql_lock_data_dictionary(trx); if (table1) { fprintf(stderr, "InnoDB: dropping incompletely created" " SYS_FOREIGN table\n"); row_drop_table_for_mysql("SYS_FOREIGN", trx, TRUE); } if (table2) { fprintf(stderr, "InnoDB: dropping incompletely created" " SYS_FOREIGN_COLS table\n"); row_drop_table_for_mysql("SYS_FOREIGN_COLS", trx, TRUE); } fprintf(stderr, "InnoDB: Creating foreign key constraint system tables\n"); /* NOTE: in dict_load_foreigns we use the fact that there are 2 secondary indexes on SYS_FOREIGN, and they are defined just like below */ /* NOTE: when designing InnoDB's foreign key support in 2001, we made an error and made the table names and the foreign key id of type 'CHAR' (internally, really a VARCHAR). We should have made the type VARBINARY, like in other InnoDB system tables, to get a clean design. */ error = que_eval_sql(NULL, "PROCEDURE CREATE_FOREIGN_SYS_TABLES_PROC () IS\n" "BEGIN\n" "CREATE TABLE\n" "SYS_FOREIGN(ID CHAR, FOR_NAME CHAR," " REF_NAME CHAR, N_COLS INT);\n" "CREATE UNIQUE CLUSTERED INDEX ID_IND" " ON SYS_FOREIGN (ID);\n" "CREATE INDEX FOR_IND" " ON SYS_FOREIGN (FOR_NAME);\n" "CREATE INDEX REF_IND" " ON SYS_FOREIGN (REF_NAME);\n" "CREATE TABLE\n" "SYS_FOREIGN_COLS(ID CHAR, POS INT," " FOR_COL_NAME CHAR, REF_COL_NAME CHAR);\n" "CREATE UNIQUE CLUSTERED INDEX ID_IND" " ON SYS_FOREIGN_COLS (ID, POS);\n" "END;\n" , FALSE, trx); if (error != DB_SUCCESS) { fprintf(stderr, "InnoDB: error %lu in creation\n", (ulong) error); ut_a(error == DB_OUT_OF_FILE_SPACE || error == DB_TOO_MANY_CONCURRENT_TRXS); fprintf(stderr, "InnoDB: creation failed\n" "InnoDB: tablespace is full\n" "InnoDB: dropping incompletely created" " SYS_FOREIGN tables\n"); row_drop_table_for_mysql("SYS_FOREIGN", trx, TRUE); row_drop_table_for_mysql("SYS_FOREIGN_COLS", trx, TRUE); error = DB_MUST_GET_MORE_FILE_SPACE; } trx_commit_for_mysql(trx); row_mysql_unlock_data_dictionary(trx); trx_free_for_mysql(trx); if (error == DB_SUCCESS) { fprintf(stderr, "InnoDB: Foreign key constraint system tables" " created\n"); } return(error); }
/***********************************************************//** Creates an index. This is a high-level function used in SQL execution graphs. @return query thread to run next or NULL */ UNIV_INTERN que_thr_t* dict_create_index_step( /*===================*/ que_thr_t* thr) /*!< in: query thread */ { ind_node_t* node; ulint err = DB_ERROR; trx_t* trx; ut_ad(thr); ut_ad(mutex_own(&(dict_sys->mutex))); trx = thr_get_trx(thr); node = thr->run_node; ut_ad(que_node_get_type(node) == QUE_NODE_CREATE_INDEX); if (thr->prev_node == que_node_get_parent(node)) { node->state = INDEX_BUILD_INDEX_DEF; } if (node->state == INDEX_BUILD_INDEX_DEF) { /* DO THE CHECKS OF THE CONSISTENCY CONSTRAINTS HERE */ err = dict_build_index_def_step(thr, node); if (err != DB_SUCCESS) { goto function_exit; } node->state = INDEX_BUILD_FIELD_DEF; node->field_no = 0; thr->run_node = node->ind_def; return(thr); } if (node->state == INDEX_BUILD_FIELD_DEF) { if (node->field_no < (node->index)->n_fields) { err = dict_build_field_def_step(node); if (err != DB_SUCCESS) { goto function_exit; } node->field_no++; thr->run_node = node->field_def; return(thr); } else { node->state = INDEX_ADD_TO_CACHE; } } if (node->state == INDEX_ADD_TO_CACHE) { index_id_t index_id = node->index->id; err = dict_index_add_to_cache( node->table, node->index, FIL_NULL, trx_is_strict(trx) || dict_table_get_format(node->table) >= DICT_TF_FORMAT_ZIP); node->index = dict_index_get_if_in_cache_low(index_id); ut_a(!node->index == (err != DB_SUCCESS)); if (err != DB_SUCCESS) { goto function_exit; } node->state = INDEX_CREATE_INDEX_TREE; } if (node->state == INDEX_CREATE_INDEX_TREE) { err = dict_create_index_tree_step(node); if (err != DB_SUCCESS) { dict_index_remove_from_cache(node->table, node->index); node->index = NULL; goto function_exit; } node->index->page = node->page_no; node->state = INDEX_COMMIT_WORK; } if (node->state == INDEX_COMMIT_WORK) { /* Index was correctly defined: do NOT commit the transaction (CREATE INDEX does NOT currently do an implicit commit of the current transaction) */ node->state = INDEX_CREATE_INDEX_TREE; /* thr->run_node = node->commit_node; return(thr); */ } function_exit: trx->error_state = err; if (err == DB_SUCCESS) { /* Ok: do nothing */ } else if (err == DB_LOCK_WAIT) { return(NULL); } else { /* SQL error detected */ return(NULL); } thr->run_node = que_node_get_parent(node); return(thr); }
/****************************************************************//** Creates a new thread of execution. The execution starts from the function given. The start function takes a void* parameter and returns an ulint. @return handle to the thread */ UNIV_INTERN os_thread_t os_thread_create( /*=============*/ #ifndef __WIN__ os_posix_f_t start_f, #else ulint (*start_f)(void*), /*!< in: pointer to function from which to start */ #endif void* arg, /*!< in: argument to start function */ os_thread_id_t* thread_id) /*!< out: id of the created thread, or NULL */ { #ifdef __WIN__ os_thread_t thread; DWORD win_thread_id; os_mutex_enter(os_sync_mutex); os_thread_count++; os_mutex_exit(os_sync_mutex); thread = CreateThread(NULL, /* no security attributes */ 0, /* default size stack */ (LPTHREAD_START_ROUTINE)start_f, arg, 0, /* thread runs immediately */ &win_thread_id); if (srv_set_thread_priorities) { /* Set created thread priority the same as a normal query, we try to prevent starvation of threads by assigning same priority QUERY_PRIOR to all */ ut_a(SetThreadPriority(thread, srv_query_thread_priority)); } if (thread_id) { *thread_id = win_thread_id; } return(thread); #else int ret; os_thread_t pthread; pthread_attr_t attr; #ifndef UNIV_HPUX10 pthread_attr_init(&attr); #endif #ifdef UNIV_AIX /* We must make sure a thread stack is at least 32 kB, otherwise InnoDB might crash; we do not know if the default stack size on AIX is always big enough. An empirical test on AIX-4.3 suggested the size was 96 kB, though. */ ret = pthread_attr_setstacksize(&attr, (size_t)(PTHREAD_STACK_MIN + 32 * 1024)); if (ret) { srv_panic(ret, "InnoDB: Error: pthread_attr_setstacksize" " returned %d\n", ret); } #endif #ifdef __NETWARE__ ret = pthread_attr_setstacksize(&attr, (size_t) NW_THD_STACKSIZE); if (ret) { srv_panic(ret, "InnoDB: Error: pthread_attr_setstacksize" " returned %d\n", ret); } #endif os_mutex_enter(os_sync_mutex); os_thread_count++; os_mutex_exit(os_sync_mutex); #ifdef UNIV_HPUX10 ret = pthread_create(&pthread, pthread_attr_default, start_f, arg); #else ret = pthread_create(&pthread, &attr, start_f, arg); #endif if (ret) { srv_panic(ret, "InnoDB: Error: pthread_create returned %d\n", ret); } #ifndef UNIV_HPUX10 pthread_attr_destroy(&attr); #endif if (srv_set_thread_priorities) { #ifdef HAVE_PTHREAD_SETPRIO pthread_setprio(pthread, srv_query_thread_priority); #endif } if (thread_id) { *thread_id = pthread; } return(pthread); #endif }
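/* A minimal runnable example of the POSIX branch above: create a thread with
pthread_create() and hand it an argument. The InnoDB wrapper never joins (its
threads exit via os_thread_exit()); the pthread_join() here just keeps the
demo deterministic. */
#include <pthread.h>
#include <stdio.h>

/* The start function takes a void* argument, as in the wrapper above. */
static void*
start_f(void* arg)
{
	printf("worker got %d\n", *(int*) arg);
	return(NULL);
}

int
main(void)
{
	pthread_t	t;
	int		arg = 7;

	if (pthread_create(&t, NULL, start_f, &arg) != 0) {
		perror("pthread_create");
		return(1);
	}
	pthread_join(t, NULL);
	return(0);
}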
/*****************************************************************//** Creates the file page for the dictionary header. This function is called only at the database creation. @return TRUE if succeed */ static ibool dict_hdr_create( /*============*/ mtr_t* mtr) /*!< in: mtr */ { buf_block_t* block; dict_hdr_t* dict_header; ulint root_page_no; ut_ad(mtr); /* Create the dictionary header file block in a new, allocated file segment in the system tablespace */ block = fseg_create(DICT_HDR_SPACE, 0, DICT_HDR + DICT_HDR_FSEG_HEADER, mtr); ut_a(DICT_HDR_PAGE_NO == buf_block_get_page_no(block)); dict_header = dict_hdr_get(mtr); /* Start counting row, table, index, and tree ids from DICT_HDR_FIRST_ID */ mlog_write_dulint(dict_header + DICT_HDR_ROW_ID, ut_dulint_create(0, DICT_HDR_FIRST_ID), mtr); mlog_write_dulint(dict_header + DICT_HDR_TABLE_ID, ut_dulint_create(0, DICT_HDR_FIRST_ID), mtr); mlog_write_dulint(dict_header + DICT_HDR_INDEX_ID, ut_dulint_create(0, DICT_HDR_FIRST_ID), mtr); /* Obsolete, but we must initialize it to 0 anyway. */ mlog_write_dulint(dict_header + DICT_HDR_MIX_ID, ut_dulint_create(0, DICT_HDR_FIRST_ID), mtr); /* Create the B-tree roots for the clustered indexes of the basic system tables */ /*--------------------------*/ root_page_no = btr_create(DICT_CLUSTERED | DICT_UNIQUE, DICT_HDR_SPACE, 0, DICT_TABLES_ID, dict_ind_redundant, mtr); if (root_page_no == FIL_NULL) { return(FALSE); } mlog_write_ulint(dict_header + DICT_HDR_TABLES, root_page_no, MLOG_4BYTES, mtr); /*--------------------------*/ root_page_no = btr_create(DICT_UNIQUE, DICT_HDR_SPACE, 0, DICT_TABLE_IDS_ID, dict_ind_redundant, mtr); if (root_page_no == FIL_NULL) { return(FALSE); } mlog_write_ulint(dict_header + DICT_HDR_TABLE_IDS, root_page_no, MLOG_4BYTES, mtr); /*--------------------------*/ root_page_no = btr_create(DICT_CLUSTERED | DICT_UNIQUE, DICT_HDR_SPACE, 0, DICT_COLUMNS_ID, dict_ind_redundant, mtr); if (root_page_no == FIL_NULL) { return(FALSE); } mlog_write_ulint(dict_header + DICT_HDR_COLUMNS, root_page_no, MLOG_4BYTES, mtr); /*--------------------------*/ root_page_no = btr_create(DICT_CLUSTERED | DICT_UNIQUE, DICT_HDR_SPACE, 0, DICT_INDEXES_ID, dict_ind_redundant, mtr); if (root_page_no == FIL_NULL) { return(FALSE); } mlog_write_ulint(dict_header + DICT_HDR_INDEXES, root_page_no, MLOG_4BYTES, mtr); /*--------------------------*/ root_page_no = btr_create(DICT_CLUSTERED | DICT_UNIQUE, DICT_HDR_SPACE, 0, DICT_FIELDS_ID, dict_ind_redundant, mtr); if (root_page_no == FIL_NULL) { return(FALSE); } mlog_write_ulint(dict_header + DICT_HDR_FIELDS, root_page_no, MLOG_4BYTES, mtr); /*--------------------------*/ return(TRUE); }
/*********************************************************//** Creates an event semaphore, i.e., a semaphore which may just have two states: signaled and nonsignaled. The created event is manual reset: it must be reset explicitly by calling sync_os_reset_event. @return the event handle */ UNIV_INTERN os_event_t os_event_create( /*============*/ const char* name) /*!< in: the name of the event, if NULL the event is created without a name */ { #ifdef __WIN__ os_event_t event; event = ut_malloc(sizeof(struct os_event_struct)); event->handle = CreateEvent(NULL, /* No security attributes */ TRUE, /* Manual reset */ FALSE, /* Initial state nonsignaled */ (LPCTSTR) name); if (!event->handle) { fprintf(stderr, "InnoDB: Could not create a Windows event semaphore;" " Windows error %lu\n", (ulong) GetLastError()); } #else /* Unix */ os_event_t event; UT_NOT_USED(name); event = ut_malloc(sizeof(struct os_event_struct)); os_fast_mutex_init(&(event->os_mutex)); ut_a(0 == pthread_cond_init(&(event->cond_var), NULL)); event->is_set = FALSE; /* We return this value in os_event_reset(), which can then be passed to os_event_wait_low(). The value of zero is reserved in os_event_wait_low() for the case when the caller does not want to pass any signal_count value. To distinguish between the two cases we initialize signal_count to 1 here. */ event->signal_count = 1; #endif /* __WIN__ */ /* The os_sync_mutex can be NULL because during startup an event can be created [ because it's embedded in the mutex/rwlock ] before this module has been initialized */ if (os_sync_mutex != NULL) { os_mutex_enter(os_sync_mutex); } /* Put into the list of events */ UT_LIST_ADD_FIRST(os_event_list, os_event_list, event); os_event_count++; if (os_sync_mutex != NULL) { os_mutex_exit(os_sync_mutex); } return(event); }
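/* A simplified model of the Unix event built above: a manual-reset event is a
boolean guarded by a mutex and a condition variable, plus a signal count so
that a waiter holding a previously observed count can detect a set()/reset()
pair it would otherwise sleep through. This sketches the idea behind
os_event_set()/os_event_wait_low(); it is not the real API. */
#include <pthread.h>

typedef struct {
	pthread_mutex_t	mutex;
	pthread_cond_t	cond;
	int		is_set;
	long		signal_count;
} event_t;

static void
event_init(event_t* e)
{
	pthread_mutex_init(&e->mutex, NULL);
	pthread_cond_init(&e->cond, NULL);
	e->is_set = 0;
	e->signal_count = 1;	/* nonzero, as in the code above */
}

static void
event_set(event_t* e)
{
	pthread_mutex_lock(&e->mutex);
	if (!e->is_set) {
		e->is_set = 1;
		e->signal_count++;
		pthread_cond_broadcast(&e->cond);
	}
	pthread_mutex_unlock(&e->mutex);
}

/* Wait until the event is set, or until signal_count has moved past the
value the caller observed when it last reset the event. */
static void
event_wait(event_t* e, long observed_count)
{
	pthread_mutex_lock(&e->mutex);
	while (!e->is_set && e->signal_count == observed_count) {
		pthread_cond_wait(&e->cond, &e->mutex);
	}
	pthread_mutex_unlock(&e->mutex);
}

int
main(void)
{
	event_t	e;

	event_init(&e);
	event_set(&e);
	event_wait(&e, 1);	/* returns immediately: the event is set */
	return(0);
}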
/*****************************************************************//** Initializes the data dictionary memory structures when the database is started. This function is also called when the data dictionary is created. */ UNIV_INTERN void dict_boot(void) /*===========*/ { dict_table_t* table; dict_index_t* index; dict_hdr_t* dict_hdr; mem_heap_t* heap; mtr_t mtr; ulint error; mtr_start(&mtr); /* Create the hash tables etc. */ dict_init(); heap = mem_heap_create(450); mutex_enter(&(dict_sys->mutex)); /* Get the dictionary header */ dict_hdr = dict_hdr_get(&mtr); /* Because we only write new row ids to disk-based data structure (dictionary header) when it is divisible by DICT_HDR_ROW_ID_WRITE_MARGIN, in recovery we will not recover the latest value of the row id counter. Therefore we advance the counter at the database startup to avoid overlapping values. Note that when a user after database startup first time asks for a new row id, then because the counter is now divisible by ..._MARGIN, it will immediately be updated to the disk-based header. */ dict_sys->row_id = ut_dulint_add( ut_dulint_align_up(mtr_read_dulint(dict_hdr + DICT_HDR_ROW_ID, &mtr), DICT_HDR_ROW_ID_WRITE_MARGIN), DICT_HDR_ROW_ID_WRITE_MARGIN); /* Insert into the dictionary cache the descriptions of the basic system tables */ /*-------------------------*/ table = dict_mem_table_create("SYS_TABLES", DICT_HDR_SPACE, 8, 0); dict_mem_table_add_col(table, heap, "NAME", DATA_BINARY, 0, 0); dict_mem_table_add_col(table, heap, "ID", DATA_BINARY, 0, 0); /* ROW_FORMAT = (N_COLS >> 31) ? COMPACT : REDUNDANT */ dict_mem_table_add_col(table, heap, "N_COLS", DATA_INT, 0, 4); /* TYPE is either DICT_TABLE_ORDINARY, or (TYPE & DICT_TF_COMPACT) and (TYPE & DICT_TF_FORMAT_MASK) are nonzero and TYPE = table->flags */ dict_mem_table_add_col(table, heap, "TYPE", DATA_INT, 0, 4); dict_mem_table_add_col(table, heap, "MIX_ID", DATA_BINARY, 0, 0); dict_mem_table_add_col(table, heap, "MIX_LEN", DATA_INT, 0, 4); dict_mem_table_add_col(table, heap, "CLUSTER_NAME", DATA_BINARY, 0, 0); dict_mem_table_add_col(table, heap, "SPACE", DATA_INT, 0, 4); table->id = DICT_TABLES_ID; dict_table_add_to_cache(table, heap); dict_sys->sys_tables = table; mem_heap_empty(heap); index = dict_mem_index_create("SYS_TABLES", "CLUST_IND", DICT_HDR_SPACE, DICT_UNIQUE | DICT_CLUSTERED, 1); dict_mem_index_add_field(index, "NAME", 0); index->id = DICT_TABLES_ID; error = dict_index_add_to_cache(table, index, mtr_read_ulint(dict_hdr + DICT_HDR_TABLES, MLOG_4BYTES, &mtr), FALSE); ut_a(error == DB_SUCCESS); /*-------------------------*/ index = dict_mem_index_create("SYS_TABLES", "ID_IND", DICT_HDR_SPACE, DICT_UNIQUE, 1); dict_mem_index_add_field(index, "ID", 0); index->id = DICT_TABLE_IDS_ID; error = dict_index_add_to_cache(table, index, mtr_read_ulint(dict_hdr + DICT_HDR_TABLE_IDS, MLOG_4BYTES, &mtr), FALSE); ut_a(error == DB_SUCCESS); /*-------------------------*/ table = dict_mem_table_create("SYS_COLUMNS", DICT_HDR_SPACE, 7, 0); dict_mem_table_add_col(table, heap, "TABLE_ID", DATA_BINARY, 0, 0); dict_mem_table_add_col(table, heap, "POS", DATA_INT, 0, 4); dict_mem_table_add_col(table, heap, "NAME", DATA_BINARY, 0, 0); dict_mem_table_add_col(table, heap, "MTYPE", DATA_INT, 0, 4); dict_mem_table_add_col(table, heap, "PRTYPE", DATA_INT, 0, 4); dict_mem_table_add_col(table, heap, "LEN", DATA_INT, 0, 4); dict_mem_table_add_col(table, heap, "PREC", DATA_INT, 0, 4); table->id = DICT_COLUMNS_ID; dict_table_add_to_cache(table, heap); dict_sys->sys_columns = table; mem_heap_empty(heap); index 
= dict_mem_index_create("SYS_COLUMNS", "CLUST_IND", DICT_HDR_SPACE, DICT_UNIQUE | DICT_CLUSTERED, 2); dict_mem_index_add_field(index, "TABLE_ID", 0); dict_mem_index_add_field(index, "POS", 0); index->id = DICT_COLUMNS_ID; error = dict_index_add_to_cache(table, index, mtr_read_ulint(dict_hdr + DICT_HDR_COLUMNS, MLOG_4BYTES, &mtr), FALSE); ut_a(error == DB_SUCCESS); /*-------------------------*/ table = dict_mem_table_create("SYS_INDEXES", DICT_HDR_SPACE, 7, 0); dict_mem_table_add_col(table, heap, "TABLE_ID", DATA_BINARY, 0, 0); dict_mem_table_add_col(table, heap, "ID", DATA_BINARY, 0, 0); dict_mem_table_add_col(table, heap, "NAME", DATA_BINARY, 0, 0); dict_mem_table_add_col(table, heap, "N_FIELDS", DATA_INT, 0, 4); dict_mem_table_add_col(table, heap, "TYPE", DATA_INT, 0, 4); dict_mem_table_add_col(table, heap, "SPACE", DATA_INT, 0, 4); dict_mem_table_add_col(table, heap, "PAGE_NO", DATA_INT, 0, 4); /* The '+ 2' below comes from the 2 system fields */ #if DICT_SYS_INDEXES_PAGE_NO_FIELD != 6 + 2 #error "DICT_SYS_INDEXES_PAGE_NO_FIELD != 6 + 2" #endif #if DICT_SYS_INDEXES_SPACE_NO_FIELD != 5 + 2 #error "DICT_SYS_INDEXES_SPACE_NO_FIELD != 5 + 2" #endif #if DICT_SYS_INDEXES_TYPE_FIELD != 4 + 2 #error "DICT_SYS_INDEXES_TYPE_FIELD != 4 + 2" #endif table->id = DICT_INDEXES_ID; dict_table_add_to_cache(table, heap); dict_sys->sys_indexes = table; mem_heap_empty(heap); index = dict_mem_index_create("SYS_INDEXES", "CLUST_IND", DICT_HDR_SPACE, DICT_UNIQUE | DICT_CLUSTERED, 2); dict_mem_index_add_field(index, "TABLE_ID", 0); dict_mem_index_add_field(index, "ID", 0); index->id = DICT_INDEXES_ID; error = dict_index_add_to_cache(table, index, mtr_read_ulint(dict_hdr + DICT_HDR_INDEXES, MLOG_4BYTES, &mtr), FALSE); ut_a(error == DB_SUCCESS); /*-------------------------*/ table = dict_mem_table_create("SYS_FIELDS", DICT_HDR_SPACE, 3, 0); dict_mem_table_add_col(table, heap, "INDEX_ID", DATA_BINARY, 0, 0); dict_mem_table_add_col(table, heap, "POS", DATA_INT, 0, 4); dict_mem_table_add_col(table, heap, "COL_NAME", DATA_BINARY, 0, 0); table->id = DICT_FIELDS_ID; dict_table_add_to_cache(table, heap); dict_sys->sys_fields = table; mem_heap_free(heap); index = dict_mem_index_create("SYS_FIELDS", "CLUST_IND", DICT_HDR_SPACE, DICT_UNIQUE | DICT_CLUSTERED, 2); dict_mem_index_add_field(index, "INDEX_ID", 0); dict_mem_index_add_field(index, "POS", 0); index->id = DICT_FIELDS_ID; error = dict_index_add_to_cache(table, index, mtr_read_ulint(dict_hdr + DICT_HDR_FIELDS, MLOG_4BYTES, &mtr), FALSE); ut_a(error == DB_SUCCESS); mtr_commit(&mtr); /*-------------------------*/ /* Initialize the insert buffer table and index for each tablespace */ ibuf_init_at_db_start(); /* Load definitions of other indexes on system tables */ dict_load_sys_table(dict_sys->sys_tables); dict_load_sys_table(dict_sys->sys_columns); dict_load_sys_table(dict_sys->sys_indexes); dict_load_sys_table(dict_sys->sys_fields); mutex_exit(&(dict_sys->mutex)); }
/*************************************************************** Updates the clustered index record. */ static ulint row_upd_clust_step( /*===============*/ /* out: DB_SUCCESS if operation successfully completed, DB_LOCK_WAIT in case of a lock wait, else error code */ upd_node_t* node, /* in: row update node */ que_thr_t* thr) /* in: query thread */ { dict_index_t* index; btr_pcur_t* pcur; ibool success; ibool check_ref; ulint err; mtr_t* mtr; mtr_t mtr_buf; index = dict_table_get_first_index(node->table); check_ref = row_upd_index_is_referenced(index); pcur = node->pcur; /* We have to restore the cursor to its position */ mtr = &mtr_buf; mtr_start(mtr); /* If the restoration does not succeed, then the same transaction has deleted the record on which the cursor was, and that is an SQL error. If the restoration succeeds, it may still be that the same transaction has successively deleted and inserted a record with the same ordering fields, but in that case we know that the transaction has at least an implicit x-lock on the record. */ ut_a(pcur->rel_pos == BTR_PCUR_ON); success = btr_pcur_restore_position(BTR_MODIFY_LEAF, pcur, mtr); if (!success) { err = DB_RECORD_NOT_FOUND; mtr_commit(mtr); return(err); } /* If this is a row in SYS_INDEXES table of the data dictionary, then we have to free the file segments of the index tree associated with the index */ if (ut_dulint_cmp(node->table->id, DICT_INDEXES_ID) == 0) { dict_drop_index_tree(btr_pcur_get_rec(pcur), mtr); mtr_commit(mtr); mtr_start(mtr); success = btr_pcur_restore_position(BTR_MODIFY_LEAF, pcur, mtr); if (!success) { err = DB_ERROR; mtr_commit(mtr); return(err); } } if (!node->has_clust_rec_x_lock) { err = lock_clust_rec_modify_check_and_lock(0, btr_pcur_get_rec(pcur), index, thr); if (err != DB_SUCCESS) { mtr_commit(mtr); return(err); } } /* NOTE: the following function calls will also commit mtr */ if (node->is_delete) { err = row_upd_del_mark_clust_rec(node, index, thr, check_ref, mtr); if (err != DB_SUCCESS) { return(err); } node->state = UPD_NODE_UPDATE_ALL_SEC; node->index = dict_table_get_next_index(index); return(err); } /* If the update is made for MySQL, we already have the update vector ready, else we have to do some evaluation: */ if (!node->in_mysql_interface) { /* Copy the necessary columns from clust_rec and calculate the new values to set */ row_upd_copy_columns(btr_pcur_get_rec(pcur), UT_LIST_GET_FIRST(node->columns)); row_upd_eval_new_vals(node->update); } if (node->cmpl_info & UPD_NODE_NO_ORD_CHANGE) { err = row_upd_clust_rec(node, index, thr, mtr); return(err); } row_upd_store_row(node); if (row_upd_changes_ord_field_binary(node->row, index, node->update)) { /* Update causes an ordering field (ordering fields within the B-tree) of the clustered index record to change: perform the update by delete marking and inserting. TODO! What to do to the 'Halloween problem', where an update moves the record forward in index so that it is again updated when the cursor arrives there? Solution: the read operation must check the undo record undo number when choosing records to update. MySQL solves now the problem externally! */ err = row_upd_clust_rec_by_insert(node, index, thr, check_ref, mtr); if (err != DB_SUCCESS) { return(err); } node->state = UPD_NODE_UPDATE_ALL_SEC; } else { err = row_upd_clust_rec(node, index, thr, mtr); if (err != DB_SUCCESS) { return(err); } node->state = UPD_NODE_UPDATE_SOME_SEC; } node->index = dict_table_get_next_index(index); return(err); }
byte* mlog_parse_nbytes( /*==============*/ /* out: parsed record end, NULL if not a complete record or a corrupt record */ ulint type, /* in: log record type: MLOG_1BYTE, ... */ byte* ptr, /* in: buffer */ byte* end_ptr,/* in: buffer end */ byte* page) /* in: page where to apply the log record, or NULL */ { ulint offset; ulint val; dulint dval; ut_a(type <= MLOG_8BYTES); if (end_ptr < ptr + 2) { return(NULL); } offset = mach_read_from_2(ptr); ptr += 2; if (offset >= UNIV_PAGE_SIZE) { recv_sys->found_corrupt_log = TRUE; return(NULL); } if (type == MLOG_8BYTES) { ptr = mach_dulint_parse_compressed(ptr, end_ptr, &dval); if (ptr == NULL) { return(NULL); } if (page) { mach_write_to_8(page + offset, dval); } return(ptr); } ptr = mach_parse_compressed(ptr, end_ptr, &val); if (ptr == NULL) { return(NULL); } if (type == MLOG_1BYTE) { if (val > 0xFFUL) { recv_sys->found_corrupt_log = TRUE; return(NULL); } } else if (type == MLOG_2BYTES) { if (val > 0xFFFFUL) { recv_sys->found_corrupt_log = TRUE; return(NULL); } } else { if (type != MLOG_4BYTES) { recv_sys->found_corrupt_log = TRUE; return(NULL); } } if (page) { if (type == MLOG_1BYTE) { mach_write_to_1(page + offset, val); } else if (type == MLOG_2BYTES) { mach_write_to_2(page + offset, val); } else { ut_a(type == MLOG_4BYTES); mach_write_to_4(page + offset, val); } } return(ptr); }
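/* The mach_* helpers used above store integers big-endian regardless of host
byte order. Below is a self-contained sketch of the 2- and 4-byte variants;
the names mirror the real functions, but these are local reimplementations
for illustration. */
#include <stdio.h>

static void
mach_write_to_4(unsigned char* b, unsigned long n)
{
	b[0] = (unsigned char) (n >> 24);	/* most significant byte first */
	b[1] = (unsigned char) (n >> 16);
	b[2] = (unsigned char) (n >> 8);
	b[3] = (unsigned char) n;
}

static unsigned long
mach_read_from_2(const unsigned char* b)
{
	return(((unsigned long) b[0] << 8) | b[1]);
}

int
main(void)
{
	unsigned char	buf[4];

	mach_write_to_4(buf, 0x12345678UL);
	printf("first two bytes read back: 0x%04lx\n",
	       mach_read_from_2(buf));	/* prints 0x1234 */
	return(0);
}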
ibool buf_LRU_validate(void) /*==================*/ { buf_block_t* block; ulint old_len; ulint new_len; ulint LRU_pos; ut_ad(buf_pool); mutex_enter(&(buf_pool->mutex)); if (UT_LIST_GET_LEN(buf_pool->LRU) >= BUF_LRU_OLD_MIN_LEN) { ut_a(buf_pool->LRU_old); old_len = buf_pool->LRU_old_len; new_len = 3 * (UT_LIST_GET_LEN(buf_pool->LRU) / 8); ut_a(old_len >= new_len - BUF_LRU_OLD_TOLERANCE); ut_a(old_len <= new_len + BUF_LRU_OLD_TOLERANCE); } UT_LIST_VALIDATE(LRU, buf_block_t, buf_pool->LRU); block = UT_LIST_GET_FIRST(buf_pool->LRU); old_len = 0; while (block != NULL) { ut_a(block->state == BUF_BLOCK_FILE_PAGE); if (block->old) { old_len++; } if (buf_pool->LRU_old && (old_len == 1)) { ut_a(buf_pool->LRU_old == block); } LRU_pos = block->LRU_position; block = UT_LIST_GET_NEXT(LRU, block); if (block) { /* If the following assert fails, it may not be an error: just the buf_pool clock has wrapped around */ ut_a(LRU_pos >= block->LRU_position); } } if (buf_pool->LRU_old) { ut_a(buf_pool->LRU_old_len == old_len); } UT_LIST_VALIDATE(free, buf_block_t, buf_pool->free); block = UT_LIST_GET_FIRST(buf_pool->free); while (block != NULL) { ut_a(block->state == BUF_BLOCK_NOT_USED); block = UT_LIST_GET_NEXT(free, block); } mutex_exit(&(buf_pool->mutex)); return(TRUE); }
byte* mlog_open_and_write_index( /*======================*/ /* out: buffer, NULL if log mode MTR_LOG_NONE */ mtr_t* mtr, /* in: mtr */ byte* rec, /* in: index record or page */ dict_index_t* index, /* in: record descriptor */ byte type, /* in: log item type */ ulint size) /* in: requested buffer size in bytes (if 0, calls mlog_close() and returns NULL) */ { byte* log_ptr; const byte* log_start; const byte* log_end; ut_ad(!!page_rec_is_comp(rec) == index->table->comp); if (!page_rec_is_comp(rec)) { log_start = log_ptr = mlog_open(mtr, 11 + size); if (!log_ptr) { return(NULL); /* logging is disabled */ } log_ptr = mlog_write_initial_log_record_fast(rec, type, log_ptr, mtr); log_end = log_ptr + 11 + size; } else { ulint i; ulint n = dict_index_get_n_fields(index); /* total size needed */ ulint total = 11 + size + (n + 2) * 2; ulint alloc = total; /* allocate at most DYN_ARRAY_DATA_SIZE at a time */ if (alloc > DYN_ARRAY_DATA_SIZE) { alloc = DYN_ARRAY_DATA_SIZE; } log_start = log_ptr = mlog_open(mtr, alloc); if (!log_ptr) { return(NULL); /* logging is disabled */ } log_end = log_ptr + alloc; log_ptr = mlog_write_initial_log_record_fast(rec, type, log_ptr, mtr); mach_write_to_2(log_ptr, n); log_ptr += 2; mach_write_to_2(log_ptr, dict_index_get_n_unique_in_tree(index)); log_ptr += 2; for (i = 0; i < n; i++) { dict_field_t* field; dtype_t* type; ulint len; field = dict_index_get_nth_field(index, i); type = dict_col_get_type(dict_field_get_col(field)); len = field->fixed_len; ut_ad(len < 0x7fff); if (len == 0 && (dtype_get_len(type) > 255 || dtype_get_mtype(type) == DATA_BLOB)) { /* variable-length field with maximum length > 255 */ len = 0x7fff; } if (dtype_get_prtype(type) & DATA_NOT_NULL) { len |= 0x8000; } if (log_ptr + 2 > log_end) { mlog_close(mtr, log_ptr); ut_a(total > (ulint) (log_ptr - log_start)); total -= log_ptr - log_start; alloc = total; if (alloc > DYN_ARRAY_DATA_SIZE) { alloc = DYN_ARRAY_DATA_SIZE; } log_start = log_ptr = mlog_open(mtr, alloc); if (!log_ptr) { return(NULL); /* logging is disabled */ } log_end = log_ptr + alloc; } mach_write_to_2(log_ptr, len); log_ptr += 2; } } if (size == 0) { mlog_close(mtr, log_ptr); log_ptr = NULL; } else if (log_ptr + size > log_end) { mlog_close(mtr, log_ptr); log_ptr = mlog_open(mtr, size); } return(log_ptr); }
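/* A sketch of the 2-byte per-field descriptor written above: the low 15 bits
carry the fixed length, with 0x7fff standing for "variable-length field with
maximum length > 255, or BLOB", and the top bit (0x8000) is set for NOT NULL
columns. encode_field() below is a hypothetical helper for illustration. */
#include <stdio.h>

static unsigned
encode_field(unsigned fixed_len, int max_len_over_255, int not_null)
{
	unsigned	len = fixed_len;

	if (len == 0 && max_len_over_255) {
		len = 0x7fff;	/* long variable-length field */
	}
	if (not_null) {
		len |= 0x8000;	/* NOT NULL flag */
	}
	return(len);
}

int
main(void)
{
	unsigned	d = encode_field(0, 1, 1);

	printf("descriptor 0x%04x: not_null=%u len=0x%04x\n",
	       d, d >> 15, d & 0x7fff);
	return(0);
}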
buf_block_t* buf_LRU_get_free_block(void) /*========================*/ /* out: the free control block; also if AWE is used, it is guaranteed that the block has its page mapped to a frame when we return */ { buf_block_t* block = NULL; ibool freed; ulint n_iterations = 1; ibool mon_value_was = FALSE; ibool started_monitor = FALSE; loop: mutex_enter(&(buf_pool->mutex)); if (!recv_recovery_on && UT_LIST_GET_LEN(buf_pool->free) + UT_LIST_GET_LEN(buf_pool->LRU) < buf_pool->max_size / 20) { ut_print_timestamp(stderr); fprintf(stderr, " InnoDB: ERROR: over 95 percent of the buffer pool" " is occupied by\n" "InnoDB: lock heaps or the adaptive hash index!" " Check that your\n" "InnoDB: transactions do not set too many row locks.\n" "InnoDB: Your buffer pool size is %lu MB." " Maybe you should make\n" "InnoDB: the buffer pool bigger?\n" "InnoDB: We intentionally generate a seg fault" " to print a stack trace\n" "InnoDB: on Linux!\n", (ulong) (buf_pool->curr_size / (1024 * 1024 / UNIV_PAGE_SIZE))); ut_error; } else if (!recv_recovery_on && UT_LIST_GET_LEN(buf_pool->free) + UT_LIST_GET_LEN(buf_pool->LRU) < buf_pool->max_size / 3) { if (!buf_lru_switched_on_innodb_mon) { /* Over 67 % of the buffer pool is occupied by lock heaps or the adaptive hash index. This may be a memory leak! */ ut_print_timestamp(stderr); fprintf(stderr, " InnoDB: WARNING: over 67 percent of" " the buffer pool is occupied by\n" "InnoDB: lock heaps or the adaptive" " hash index! Check that your\n" "InnoDB: transactions do not set too many" " row locks.\n" "InnoDB: Your buffer pool size is %lu MB." " Maybe you should make\n" "InnoDB: the buffer pool bigger?\n" "InnoDB: Starting the InnoDB Monitor to print" " diagnostics, including\n" "InnoDB: lock heap and hash index sizes.\n", (ulong) (buf_pool->curr_size / (1024 * 1024 / UNIV_PAGE_SIZE))); buf_lru_switched_on_innodb_mon = TRUE; srv_print_innodb_monitor = TRUE; os_event_set(srv_lock_timeout_thread_event); } } else if (buf_lru_switched_on_innodb_mon) { /* Switch off the InnoDB Monitor; this is a simple way to stop the monitor if the situation becomes less urgent, but may also surprise users if the user also switched on the monitor! 
*/ buf_lru_switched_on_innodb_mon = FALSE; srv_print_innodb_monitor = FALSE; } /* If there is a block in the free list, take it */ if (UT_LIST_GET_LEN(buf_pool->free) > 0) { block = UT_LIST_GET_FIRST(buf_pool->free); ut_a(block->in_free_list); UT_LIST_REMOVE(free, buf_pool->free, block); block->in_free_list = FALSE; ut_a(block->state != BUF_BLOCK_FILE_PAGE); ut_a(!block->in_LRU_list); if (srv_use_awe) { if (block->frame) { /* Remove from the list of mapped pages */ UT_LIST_REMOVE(awe_LRU_free_mapped, buf_pool->awe_LRU_free_mapped, block); } else { /* We map the page to a frame; second param FALSE below because we do not want it to be added to the awe_LRU_free_mapped list */ buf_awe_map_page_to_frame(block, FALSE); } } mutex_enter(&block->mutex); block->state = BUF_BLOCK_READY_FOR_USE; UNIV_MEM_ALLOC(block->frame, UNIV_PAGE_SIZE); mutex_exit(&block->mutex); mutex_exit(&(buf_pool->mutex)); if (started_monitor) { srv_print_innodb_monitor = mon_value_was; } return(block); } /* If no block was in the free list, search from the end of the LRU list and try to free a block there */ mutex_exit(&(buf_pool->mutex)); freed = buf_LRU_search_and_free_block(n_iterations); if (freed > 0) { goto loop; } if (n_iterations > 30) { ut_print_timestamp(stderr); fprintf(stderr, "InnoDB: Warning: difficult to find free blocks from\n" "InnoDB: the buffer pool (%lu search iterations)!" " Consider\n" "InnoDB: increasing the buffer pool size.\n" "InnoDB: It is also possible that" " in your Unix version\n" "InnoDB: fsync is very slow, or" " completely frozen inside\n" "InnoDB: the OS kernel. Then upgrading to" " a newer version\n" "InnoDB: of your operating system may help." " Look at the\n" "InnoDB: number of fsyncs in diagnostic info below.\n" "InnoDB: Pending flushes (fsync) log: %lu;" " buffer pool: %lu\n" "InnoDB: %lu OS file reads, %lu OS file writes," " %lu OS fsyncs\n" "InnoDB: Starting InnoDB Monitor to print further\n" "InnoDB: diagnostics to the standard output.\n", (ulong) n_iterations, (ulong) fil_n_pending_log_flushes, (ulong) fil_n_pending_tablespace_flushes, (ulong) os_n_file_reads, (ulong) os_n_file_writes, (ulong) os_n_fsyncs); mon_value_was = srv_print_innodb_monitor; started_monitor = TRUE; srv_print_innodb_monitor = TRUE; os_event_set(srv_lock_timeout_thread_event); } /* No free block was found: try to flush the LRU list */ buf_flush_free_margin(); ++srv_buf_pool_wait_free; os_aio_simulated_wake_handler_threads(); mutex_enter(&(buf_pool->mutex)); if (buf_pool->LRU_flush_ended > 0) { /* We have written pages in an LRU flush. To make the insert buffer more efficient, we try to move these pages to the free list. */ mutex_exit(&(buf_pool->mutex)); buf_LRU_try_free_flushed_blocks(); } else { mutex_exit(&(buf_pool->mutex)); } if (n_iterations > 10) { os_thread_sleep(500000); } n_iterations++; goto loop; }
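/* Editor's note (illustrative, not from the sources): the diagnostics above convert the pool size from pages to megabytes with curr_size / (1024 * 1024 / UNIV_PAGE_SIZE), and the two occupancy checks compare the free+LRU list length against max_size / 20 (over 95 percent of the pool pinned elsewhere) and max_size / 3 (over 67 percent). A standalone check of that arithmetic, assuming 16 KiB pages: */
#include <assert.h>
#include <stdio.h>

enum { TOY_PAGE_SIZE = 16384 };	/* UNIV_PAGE_SIZE in the common build */

int main(void)
{
	unsigned long	curr_size = 8192;	/* pool size in pages */
	unsigned long	max_size = 8192;
	unsigned long	mb = curr_size / (1024 * 1024 / TOY_PAGE_SIZE);

	assert(mb == 128);	/* 8192 pages * 16 KiB = 128 MiB */

	/* free + LRU lengths below these trigger the warnings above */
	printf("95%% threshold: %lu pages, 67%% threshold: %lu pages\n",
	       max_size / 20, max_size / 3);
	return(0);
}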
/********************************************************************//** Flush pages from flash cache. @return number of pages that have been flushed to the tablespace */ UNIV_INTERN ulint fc_flush_to_disk( /*==================*/ ibool do_full_io) /*!< in: whether to flush at full io capacity */ { ulint distance; byte* page; ulint ret; ulint space; ulint offset; ulint page_type; ulint i, j; ulint pos; ulint zip_size; ulint block_offset, byte_offset; ulint fc_size = fc_get_size(); ulint fc_blk_size = fc_get_block_size_byte(); ulint start_offset; ulint data_size; fc_block_t *flush_block = NULL; ulint c_flush = 0; ut_ad(!mutex_own(&fc->mutex)); ut_a(fc->flush_buf->free_pos == 0); /* step 1: get the number of blocks that need to be flushed to the tablespace */ flash_cache_mutex_enter(); distance = fc_get_distance(); start_offset = fc->flush_off; if ( distance == 0 ) { flash_cache_mutex_exit(); return 0; } else if ( recv_recovery_on ) { if ( distance < (( 1.0 * srv_flash_cache_write_cache_pct /100 ) * fc_size)) { fc->n_flush_cur = 0; } else if ( distance < ( ( 1.0*srv_flash_cache_do_full_io_pct /100 ) * fc_size)) { fc->n_flush_cur = ut_min(PCT_IO_FC(10), distance); } else { fc->n_flush_cur = ut_min(PCT_IO_FC(100), distance); } } else if ( distance < (( 1.0 * srv_flash_cache_write_cache_pct /100 ) * fc_size) && !do_full_io ) { flash_cache_mutex_exit(); return 0; } else if ( distance < (( 1.0 * srv_flash_cache_do_full_io_pct/100 ) * fc_size) && !do_full_io ) { fc->n_flush_cur = PCT_IO_FC(srv_fc_write_cache_flush_pct); } else { ut_ad((distance > ( 1.0 * srv_flash_cache_do_full_io_pct/100 ) * fc_size) || do_full_io ); fc->n_flush_cur = ut_min(PCT_IO_FC(srv_fc_full_flush_pct), distance); } flash_cache_mutex_exit(); /* step 2: start to flush blocks using async io, set block io_fix IO_FIX_FLUSH */ i = 0; while (i < fc->n_flush_cur) { ulint b_space; ulint b_offset; ulint raw_zip_size; ulint size; ulint fil_offset; #ifdef UNIV_FLASH_CACHE_TRACE ulint is_v4_blk; #endif byte* page_io; flash_cache_mutex_enter(); pos = ( start_offset + i ) % fc_size; flush_block = fc_get_block(pos); if (flush_block == NULL) { i++; flash_cache_mutex_exit(); continue; } /* we must take the block mutex, as doublewrite may hit this block and invalidate it */ flash_block_mutex_enter(flush_block->fil_offset); flash_cache_mutex_exit(); data_size = fc_block_get_data_size(flush_block); if (flush_block->state != BLOCK_READY_FOR_FLUSH) { /* if read-only or merge write or already flushed */ ut_a (flush_block->state == BLOCK_NOT_USED || flush_block->state == BLOCK_READ_CACHE || flush_block->state == BLOCK_FLUSHED); i += data_size; flash_block_mutex_exit(flush_block->fil_offset); if (flush_block->state == BLOCK_NOT_USED) { //fc_block_detach(FALSE, flush_block); fc_block_free(flush_block); } continue; } zip_size = fil_space_get_zip_size(flush_block->space); if (zip_size == ULINT_UNDEFINED) { /* the table has been dropped, just set the block to BLOCK_FLUSHED */ #ifdef UNIV_FLASH_CACHE_TRACE ut_print_timestamp(fc->f_debug); fprintf(fc->f_debug, "space:%lu is dropped, the page(%lu, %lu) need not be flushed.\n", (ulong)flush_block->space, (ulong)flush_block->space, (ulong)flush_block->offset); #endif flush_block->state = BLOCK_FLUSHED; i += data_size; c_flush += data_size; flash_block_mutex_exit(flush_block->fil_offset); continue; } #ifdef UNIV_FLASH_CACHE_TRACE if (flush_block->state != BLOCK_READY_FOR_FLUSH) { fc_block_print(flush_block); ut_error; } #endif flush_block->io_fix |= IO_FIX_FLUSH; /* we must set the block state to BLOCK_FLUSHED now; if we did not, doublewrite could hit this block,
invalidate it and reduce the dirty count, and when we finish the flush we would reduce the dirty count again, so it could be reduced twice. */ flush_block->state = BLOCK_FLUSHED; /* save the block info, as the block may be invalidated by doublewrite after we release the mutex */ b_space = flush_block->space; b_offset = flush_block->offset; raw_zip_size = flush_block->raw_zip_size; size = flush_block->size; fil_offset = flush_block->fil_offset; #ifdef UNIV_FLASH_CACHE_TRACE is_v4_blk = flush_block->is_v4_blk; #endif /* release the block now, so reads can hit the block and read the data */ flash_block_mutex_exit(flush_block->fil_offset); /* Only the flush thread updates read_buf and flush_off/round; as there is only a single flush thread, there is no need to lock read_buf */ page = fc->flush_buf->buf + fc->flush_buf->free_pos * fc_blk_size; if (raw_zip_size > 0) { ut_a((size * fc_blk_size) == UNIV_PAGE_SIZE); page_io = fc->flush_zip_read_buf; } else { page_io = page; } fc_io_offset(fil_offset, &block_offset, &byte_offset); ret = fil_io(OS_FILE_READ, TRUE, FLASH_CACHE_SPACE, 0, block_offset, byte_offset, data_size * fc_blk_size, page_io, NULL); if (ret != DB_SUCCESS) { ut_print_timestamp(stderr); fprintf(stderr, " InnoDB: Flash cache [Error]: unable to read page from flash cache.\n" "flash cache flush offset is:%lu.\n", (ulong)(start_offset + i)); ut_error; } if ((flush_block != NULL) && (flush_block->state == BLOCK_NOT_USED)) { goto skip; } /* decompress the compressed data */ if (raw_zip_size > 0) { #ifdef UNIV_FLASH_CACHE_TRACE ulint blk_zip_size_byte; if (is_v4_blk) { blk_zip_size_byte = raw_zip_size * fc_get_block_size_byte(); } else { blk_zip_size_byte = fc_block_compress_align(raw_zip_size) * fc_get_block_size_byte(); ut_a((ulint)mach_read_from_4(page_io + FC_ZIP_PAGE_ZIP_RAW_SIZE) == raw_zip_size); } ut_a(page_io); ut_a(page); ut_a((ulint)mach_read_from_4(page_io + FC_ZIP_PAGE_HEADER) == FC_ZIP_PAGE_CHECKSUM); ut_a((ulint)mach_read_from_4(page_io + blk_zip_size_byte - FC_ZIP_PAGE_TAILER) == FC_ZIP_PAGE_CHECKSUM); ut_a((ulint)mach_read_from_4(page_io + FC_ZIP_PAGE_SIZE) == blk_zip_size_byte); ut_a((ulint)mach_read_from_4(page_io + FC_ZIP_PAGE_ORIG_SIZE) == UNIV_PAGE_SIZE); ut_a((ulint)mach_read_from_4(page_io + FC_ZIP_PAGE_SPACE) == b_space); ut_a((ulint)mach_read_from_4(page_io + FC_ZIP_PAGE_OFFSET) == b_offset); /* only qlz can do this check */ if (srv_flash_cache_compress_algorithm == FC_BLOCK_COMPRESS_QUICKLZ) { if (is_v4_blk) { ut_a(raw_zip_size * fc_get_block_size_byte() >= (ulint)fc_qlz_size_compressed((const char *)(page_io + FC_ZIP_PAGE_DATA))); } else { ut_a(raw_zip_size == (ulint)fc_qlz_size_compressed((const char *)(page_io + FC_ZIP_PAGE_DATA))); } ut_a(UNIV_PAGE_SIZE == fc_qlz_size_decompressed((const char *)(page_io + FC_ZIP_PAGE_DATA))); } #endif fc_block_do_decompress(DECOMPRESS_FLUSH, page_io, raw_zip_size, page); } space = mach_read_from_4(page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID); offset = mach_read_from_4(page + FIL_PAGE_OFFSET); if ((space != b_space) || (offset != b_offset)) { ut_print_timestamp(stderr); fc_block_print(flush_block); ut_error; } if (buf_page_is_corrupted(page, zip_size)) { buf_page_print(page, zip_size, BUF_PAGE_PRINT_NO_CRASH); ut_error; } page_type = fil_page_get_type(page); if (page_type == FIL_PAGE_INDEX) { page_type = 1; } srv_flash_cache_flush_detail[page_type]++; ret = fil_io(OS_FILE_WRITE | OS_AIO_SIMULATED_WAKE_LATER, FALSE, space, zip_size, offset, 0, zip_size ?
zip_size : UNIV_PAGE_SIZE, page, NULL); if (ret != DB_SUCCESS && ret != DB_TABLESPACE_DELETED) { ut_print_timestamp(stderr); fc_block_print(flush_block); ut_error; } /* add UNIV_PAGE_SIZE / fc_blk_size to be safe */ fc->flush_buf->free_pos += UNIV_PAGE_SIZE / fc_blk_size; skip: i += data_size; c_flush += data_size; if ((fc->flush_buf->free_pos + UNIV_PAGE_SIZE / fc_blk_size) >= fc->flush_buf->size) { /* FIXME: is it safe to change n_flush, as step 3 will use n_flush */ fc->n_flush_cur = i; break; } } /* ok, now flush all async io to disk */ fc_flush_sync_dbfile(); /* step 3: all the flushed blocks have been synced to disk, update the state and io_fix */ j = 0; while (j < fc->n_flush_cur) { flash_cache_mutex_enter(); pos = (start_offset + j) % fc_size; flush_block = fc_get_block(pos); if (flush_block == NULL) { j++; flash_cache_mutex_exit(); continue; } /* block state and io_fix may be changed by doublewrite and lru move */ flash_block_mutex_enter(flush_block->fil_offset); flash_cache_mutex_exit(); if (flush_block->io_fix & IO_FIX_FLUSH) { /* the block is already in BLOCK_FLUSHED state */ flush_block->io_fix &= ~IO_FIX_FLUSH; } data_size = fc_block_get_data_size(flush_block); flash_block_mutex_exit(flush_block->fil_offset); j += data_size; } /* i and j may differ: the last flushed block may have been invalidated by doublewrite, so possibly i > j */ /* add the actually flushed blocks */ srv_flash_cache_flush = srv_flash_cache_flush + c_flush; /* step 4: update fc status and flush_off, and wake up threads that are sleeping for space */ if (i > 0) { ut_a(i >= c_flush); flash_cache_mutex_enter(); /* it is safe to increase flush_off and decrease the dirty count at this time, as fc_validate is not running */ fc_inc_flush_off(i); flash_cache_log_mutex_enter(); fc_log->current_stat->flush_offset = fc->flush_off; fc_log->current_stat->flush_round = fc->flush_round; flash_cache_log_mutex_exit(); ut_a(srv_flash_cache_dirty >= c_flush); srv_flash_cache_dirty -= c_flush; srv_fc_flush_should_commit_log_flush++; os_event_set(fc->wait_space_event); fc->n_flush_cur = 0; flash_cache_mutex_exit(); } fc->flush_buf->free_pos = 0; return c_flush; }
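/* Editor's example (not from the sources): step 1 above chooses a flush batch from how far the write offset has run ahead of the flush offset ("distance") relative to the cache size. The sketch below mirrors that banded decision; WRITE_CACHE_PCT, DO_FULL_IO_PCT, IO_CAPACITY and pct_io() are invented stand-ins for srv_flash_cache_write_cache_pct, srv_flash_cache_do_full_io_pct, the server io capacity and PCT_IO_FC(). */
#include <stdio.h>

enum { WRITE_CACHE_PCT = 30, DO_FULL_IO_PCT = 90, IO_CAPACITY = 200 };

static unsigned long pct_io(unsigned long pct)	/* PCT_IO_FC() stand-in */
{
	return(IO_CAPACITY * pct / 100);
}

static unsigned long min_ul(unsigned long a, unsigned long b)
{
	return(a < b ? a : b);
}

/* Pick the number of blocks to flush for a given occupancy. */
static unsigned long flush_batch(unsigned long distance, unsigned long fc_size,
				 int do_full_io)
{
	if (distance == 0) {
		return(0);	/* nothing dirty */
	}
	if (distance < WRITE_CACHE_PCT * fc_size / 100 && !do_full_io) {
		return(0);	/* still inside the write cache band */
	}
	if (distance < DO_FULL_IO_PCT * fc_size / 100 && !do_full_io) {
		return(pct_io(10));	/* moderate band; a fixed 10 percent
					here purely for illustration */
	}
	return(min_ul(pct_io(100), distance));	/* nearly full: flush hard */
}

int main(void)
{
	printf("%lu\n", flush_batch(5000, 10000, 0));	/* prints 20 */
	printf("%lu\n", flush_batch(9500, 10000, 0));	/* prints 200 */
	return(0);
}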
/********************************************************************** When doing a DROP TABLE/DISCARD TABLESPACE we have to drop all page hash index entries belonging to that table. This function tries to do that in batch. Note that this is a 'best effort' attempt and does not guarantee that ALL hash entries will be removed. */ static void buf_LRU_drop_page_hash_for_tablespace( /*==================================*/ ulint id) /* in: space id */ { buf_block_t* block; ulint* page_arr; ulint num_entries; page_arr = ut_malloc(sizeof(ulint) * BUF_LRU_DROP_SEARCH_HASH_SIZE); mutex_enter(&buf_pool->mutex); scan_again: num_entries = 0; block = UT_LIST_GET_LAST(buf_pool->LRU); while (block != NULL) { buf_block_t* prev_block; mutex_enter(&block->mutex); prev_block = UT_LIST_GET_PREV(LRU, block); ut_a(block->state == BUF_BLOCK_FILE_PAGE); if (block->space != id || block->buf_fix_count > 0 || block->io_fix != 0) { /* We leave the fixed pages as is in this scan. To be dealt with later in the final scan. */ mutex_exit(&block->mutex); goto next_page; } ut_ad(block->space == id); if (block->is_hashed) { /* Store the offset(i.e.: page_no) in the array so that we can drop hash index in a batch later. */ page_arr[num_entries] = block->offset; mutex_exit(&block->mutex); ut_a(num_entries < BUF_LRU_DROP_SEARCH_HASH_SIZE); ++num_entries; if (num_entries < BUF_LRU_DROP_SEARCH_HASH_SIZE) { goto next_page; } /* Array full. We release the buf_pool->mutex to obey the latching order. */ mutex_exit(&buf_pool->mutex); buf_LRU_drop_page_hash_batch(id, page_arr, num_entries); num_entries = 0; mutex_enter(&buf_pool->mutex); } else { mutex_exit(&block->mutex); } next_page: /* Note that we may have released the buf_pool->mutex above after reading the prev_block during processing of a page_hash_batch (i.e.: when the array was full). This means that prev_block can change in LRU list. This is OK because this function is a 'best effort' to drop as many search hash entries as possible and it does not guarantee that ALL such entries will be dropped. */ block = prev_block; /* If, however, block has been removed from LRU list to the free list then we should restart the scan. block->state is protected by buf_pool->mutex. */ if (block && block->state != BUF_BLOCK_FILE_PAGE) { ut_a(num_entries == 0); goto scan_again; } } mutex_exit(&buf_pool->mutex); /* Drop any remaining batch of search hashed pages. */ buf_LRU_drop_page_hash_batch(id, page_arr, num_entries); ut_free(page_arr); }
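/* Editor's example (not from the sources): the function above gathers page numbers into a fixed-size array while holding buf_pool->mutex, and releases the mutex only to process a full batch, which keeps the hot mutex hold time bounded. A standalone sketch of that batch-and-release pattern; drop_batch() and BATCH_SIZE are invented stand-ins for buf_LRU_drop_page_hash_batch() and BUF_LRU_DROP_SEARCH_HASH_SIZE. */
#include <stdio.h>

enum { BATCH_SIZE = 4 };

/* Process a batch outside the hot mutex (stand-in for
buf_LRU_drop_page_hash_batch()). */
static void drop_batch(const unsigned long* pages, unsigned long n)
{
	unsigned long k;
	for (k = 0; k < n; k++) {
		printf("dropping hash entries for page %lu\n", pages[k]);
	}
}

int main(void)
{
	unsigned long	batch[BATCH_SIZE];
	unsigned long	n = 0;
	unsigned long	page_no;

	/* Imagine this scan runs under buf_pool->mutex. */
	for (page_no = 0; page_no < 10; page_no++) {
		batch[n++] = page_no;
		if (n == BATCH_SIZE) {
			/* mutex_exit(&buf_pool->mutex) in the real code */
			drop_batch(batch, n);
			n = 0;
			/* mutex_enter(&buf_pool->mutex) again */
		}
	}
	drop_batch(batch, n);	/* remaining partial batch */
	return(0);
}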
/********************************************************************//** Issues read requests for pages which the ibuf module wants to read in, in order to contract the insert buffer tree. Technically, this function is like a read-ahead function. */ UNIV_INTERN void buf_read_ibuf_merge_pages( /*======================*/ ibool sync, /*!< in: TRUE if the caller wants this function to wait for the highest address page to get read in, before this function returns */ const ulint* space_ids, /*!< in: array of space ids */ const ib_int64_t* space_versions,/*!< in: the spaces must have this version number (timestamp), otherwise we discard the read; we use this to cancel reads if DISCARD + IMPORT may have changed the tablespace size */ const ulint* page_nos, /*!< in: array of page numbers to read, with the highest page number the last in the array */ ulint n_stored) /*!< in: number of elements in the arrays */ { ulint i; ut_ad(!ibuf_inside()); #ifdef UNIV_IBUF_DEBUG ut_a(n_stored < UNIV_PAGE_SIZE); #endif for (i = 0; i < n_stored; i++) { ulint err; buf_pool_t* buf_pool; ulint zip_size = fil_space_get_zip_size(space_ids[i]); buf_pool = buf_pool_get(space_ids[i], space_versions[i]); while (buf_pool->n_pend_reads > buf_pool->curr_size / BUF_READ_AHEAD_PEND_LIMIT) { os_thread_sleep(500000); } if (UNIV_UNLIKELY(zip_size == ULINT_UNDEFINED)) { goto tablespace_deleted; } buf_read_page_low(&err, sync && (i + 1 == n_stored), BUF_READ_ANY_PAGE, space_ids[i], zip_size, TRUE, space_versions[i], page_nos[i]); if (UNIV_UNLIKELY(err == DB_TABLESPACE_DELETED)) { tablespace_deleted: /* We have deleted or are deleting the single-table tablespace: remove the entries for that page */ ibuf_merge_or_delete_for_page(NULL, space_ids[i], page_nos[i], zip_size, FALSE); } } os_aio_simulated_wake_handler_threads(); /* Flush pages from the end of all the LRU lists if necessary */ buf_flush_free_margins(); #ifdef UNIV_DEBUG if (buf_debug_prints) { fprintf(stderr, "Ibuf merge read-ahead space %lu pages %lu\n", (ulong) space_ids[0], (ulong) n_stored); } #endif /* UNIV_DEBUG */ }
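/* Editor's note (illustrative): in the loop above, only the read of the highest page number is issued synchronously, via the expression sync && (i + 1 == n_stored), so the caller can wait on the last page while the earlier reads proceed asynchronously. A minimal standalone demonstration of that idiom: */
#include <stdio.h>

int main(void)
{
	int	sync = 1;
	int	n_stored = 4;
	int	i;

	for (i = 0; i < n_stored; i++) {
		/* Only the final, highest page gets a synchronous read. */
		int	this_sync = sync && (i + 1 == n_stored);
		printf("page %d: sync=%d\n", i, this_sync);
	}
	return(0);
}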
enum db_err ib_trx_lock_table_with_retry( /*=========================*/ trx_t* trx, /*!< in/out: transaction */ dict_table_t* table, /*!< in: table to lock */ enum lock_mode mode) /*!< in: LOCK_X or LOCK_S */ { que_thr_t* thr; enum db_err err; mem_heap_t* heap; sel_node_t* node; ut_ad(trx->client_thread_id == os_thread_get_curr_id()); heap = mem_heap_create(512); trx->op_info = "setting table lock"; node = sel_node_create(heap); thr = pars_complete_graph_for_exec(node, trx, heap); thr->graph->state = QUE_FORK_ACTIVE; /* We use the select query graph as the dummy graph needed in the lock module call */ thr = que_fork_get_first_thr(que_node_get_parent(thr)); que_thr_move_to_run_state(thr); run_again: thr->run_node = thr; thr->prev_node = thr->common.parent; err = lock_table(0, table, mode, thr); trx->error_state = err; if (UNIV_LIKELY(err == DB_SUCCESS)) { que_thr_stop_for_client_no_error(thr, trx); } else { que_thr_stop_client(thr); if (err != DB_QUE_THR_SUSPENDED) { ibool was_lock_wait; was_lock_wait = ib_handle_errors(&err, trx, thr, NULL); if (was_lock_wait) { goto run_again; } } else { que_thr_t* run_thr; que_node_t* parent; parent = que_node_get_parent(thr); run_thr = que_fork_start_command(parent); ut_a(run_thr == thr); /* There was a lock wait but the thread was not in a ready to run or running state. */ trx->error_state = DB_LOCK_WAIT; goto run_again; } } que_graph_free(thr->graph); trx->op_info = ""; return(err); }
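/* Editor's example (not from the sources): the run_again label above implements a lock-wait retry loop in which the error handler decides whether a failed lock attempt should be repeated. A self-contained sketch of that control flow, with an invented toy_err enumeration and a try_lock() that succeeds on the third attempt: */
#include <stdio.h>

enum toy_err { TOY_SUCCESS, TOY_LOCK_WAIT, TOY_DEADLOCK };

/* Toy lock attempt: reports a lock wait twice, then succeeds. */
static enum toy_err try_lock(void)
{
	static int attempts = 0;
	return(++attempts < 3 ? TOY_LOCK_WAIT : TOY_SUCCESS);
}

/* Returns nonzero if the operation should be retried, in the spirit of
the error handling used above. */
static int should_retry(enum toy_err err)
{
	return(err == TOY_LOCK_WAIT);
}

int main(void)
{
	enum toy_err	err;
run_again:
	err = try_lock();
	if (err != TOY_SUCCESS && should_retry(err)) {
		printf("lock wait; retrying\n");
		goto run_again;
	}
	printf(err == TOY_SUCCESS ? "locked\n" : "failed\n");
	return(0);
}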
/********************************************************************//** Low-level function which reads a page asynchronously from a file to the buffer buf_pool if it is not already there, in which case does nothing. Sets the io_fix flag and sets an exclusive lock on the buffer frame. The flag is cleared and the x-lock released by an i/o-handler thread. @return 1 if a read request was queued, 0 if the page already resided in buf_pool, or if the page is in the doublewrite buffer blocks in which case it is never read into the pool, or if the tablespace does not exist or is being dropped */ static ulint buf_read_page_low( /*==============*/ ulint* err, /*!< out: DB_SUCCESS or DB_TABLESPACE_DELETED if we are trying to read from a non-existent tablespace, or a tablespace which is just now being dropped */ ibool sync, /*!< in: TRUE if synchronous aio is desired */ ulint mode, /*!< in: BUF_READ_IBUF_PAGES_ONLY, ..., ORed to OS_AIO_SIMULATED_WAKE_LATER (see below at read-ahead functions) */ ulint space, /*!< in: space id */ ulint zip_size,/*!< in: compressed page size, or 0 */ ibool unzip, /*!< in: TRUE=request uncompressed page */ ib_int64_t tablespace_version, /*!< in: if the space memory object has this timestamp different from what we are giving here, treat the tablespace as dropped; this is a timestamp we use to stop dangling page reads from a tablespace which we have DISCARDed + IMPORTed back */ ulint offset) /*!< in: page number */ { buf_page_t* bpage; ulint wake_later; *err = DB_SUCCESS; wake_later = mode & OS_AIO_SIMULATED_WAKE_LATER; mode = mode & ~OS_AIO_SIMULATED_WAKE_LATER; if (trx_doublewrite && space == TRX_SYS_SPACE && ( (offset >= trx_doublewrite->block1 && offset < trx_doublewrite->block1 + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) || (offset >= trx_doublewrite->block2 && offset < trx_doublewrite->block2 + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE))) { ut_print_timestamp(stderr); fprintf(stderr, " InnoDB: Warning: trying to read" " doublewrite buffer page %lu\n", (ulong) offset); return(0); } if (ibuf_bitmap_page(zip_size, offset) || trx_sys_hdr_page(space, offset)) { /* Trx sys header is so low in the latching order that we play safe and do not leave the i/o-completion to an asynchronous i/o-thread. Ibuf bitmap pages must always be read with synchronous i/o, to make sure they do not get involved in thread deadlocks. */ sync = TRUE; } /* The following call will also check if the tablespace does not exist or is being dropped; if we succeed in initing the page in the buffer pool for read, then DISCARD cannot proceed until the read has completed */ bpage = buf_page_init_for_read(err, mode, space, zip_size, unzip, tablespace_version, offset); if (bpage == NULL) { return(0); } #ifdef UNIV_DEBUG if (buf_debug_prints) { fprintf(stderr, "Posting read request for page %lu, sync %lu\n", (ulong) offset, (ulong) sync); } #endif ut_ad(buf_page_in_file(bpage)); if (zip_size) { *err = fil_io(OS_FILE_READ | wake_later, sync, space, zip_size, offset, 0, zip_size, bpage->zip.data, bpage); } else { ut_a(buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE); *err = fil_io(OS_FILE_READ | wake_later, sync, space, 0, offset, 0, UNIV_PAGE_SIZE, ((buf_block_t*) bpage)->frame, bpage); } ut_a(*err == DB_SUCCESS); if (sync) { /* The i/o is already completed when we arrive from fil_read */ buf_page_io_complete(bpage); } return(1); }
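/* Editor's note (illustrative): callers pack the OS_AIO_SIMULATED_WAKE_LATER hint into the mode argument with a bitwise OR; the function above remembers the hint in wake_later and strips it before using mode as a plain enumeration. A standalone check of that masking, with invented flag values standing in for the real constants: */
#include <assert.h>
#include <stdio.h>

enum { TOY_READ_ANY_PAGE = 1, TOY_WAKE_LATER = 0x100 };

int main(void)
{
	unsigned	mode = TOY_READ_ANY_PAGE | TOY_WAKE_LATER;
	unsigned	wake_later = mode & TOY_WAKE_LATER;	/* keep hint */

	mode &= ~TOY_WAKE_LATER;	/* plain mode for dispatch logic */

	assert(mode == TOY_READ_ANY_PAGE);
	assert(wake_later == TOY_WAKE_LATER);
	printf("mode=%u wake_later=%u\n", mode, wake_later);
	return(0);
}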
/**********************************************************************//** Reports in the undo log of an update or delete marking of a clustered index record. @return byte offset of the inserted undo log entry on the page if succeed, 0 if fail */ static ulint trx_undo_page_report_modify( /*========================*/ page_t* undo_page, /*!< in: undo log page */ trx_t* trx, /*!< in: transaction */ dict_index_t* index, /*!< in: clustered index where update or delete marking is done */ const rec_t* rec, /*!< in: clustered index record which has NOT yet been modified */ const ulint* offsets, /*!< in: rec_get_offsets(rec, index) */ const upd_t* update, /*!< in: update vector which tells the columns to be updated; in the case of a delete, this should be set to NULL */ ulint cmpl_info, /*!< in: compiler info on secondary index updates */ mtr_t* mtr) /*!< in: mtr */ { dict_table_t* table; ulint first_free; byte* ptr; const byte* field; ulint flen; ulint col_no; ulint type_cmpl; byte* type_cmpl_ptr; ulint i; trx_id_t trx_id; ibool ignore_prefix = FALSE; byte ext_buf[REC_MAX_INDEX_COL_LEN + BTR_EXTERN_FIELD_REF_SIZE]; ut_a(dict_index_is_clust(index)); ut_ad(rec_offs_validate(rec, index, offsets)); ut_ad(mach_read_from_2(undo_page + TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_TYPE) == TRX_UNDO_UPDATE); table = index->table; first_free = mach_read_from_2(undo_page + TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE); ptr = undo_page + first_free; ut_ad(first_free <= UNIV_PAGE_SIZE); if (trx_undo_left(undo_page, ptr) < 50) { /* NOTE: the value 50 must be big enough so that the general fields written below fit on the undo log page */ return(0); } /* Reserve 2 bytes for the pointer to the next undo log record */ ptr += 2; /* Store first some general parameters to the undo log */ if (!update) { type_cmpl = TRX_UNDO_DEL_MARK_REC; } else if (rec_get_deleted_flag(rec, dict_table_is_comp(table))) { type_cmpl = TRX_UNDO_UPD_DEL_REC; /* We are about to update a delete marked record. We don't typically need the prefix in this case unless the delete marking is done by the same transaction (which we check below). */ ignore_prefix = TRUE; } else { type_cmpl = TRX_UNDO_UPD_EXIST_REC; } type_cmpl |= cmpl_info * TRX_UNDO_CMPL_INFO_MULT; type_cmpl_ptr = ptr; *ptr++ = (byte) type_cmpl; ptr += mach_dulint_write_much_compressed(ptr, trx->undo_no); ptr += mach_dulint_write_much_compressed(ptr, table->id); /*----------------------------------------*/ /* Store the state of the info bits */ *ptr++ = (byte) rec_get_info_bits(rec, dict_table_is_comp(table)); /* Store the values of the system columns */ field = rec_get_nth_field(rec, offsets, dict_index_get_sys_col_pos( index, DATA_TRX_ID), &flen); ut_ad(flen == DATA_TRX_ID_LEN); trx_id = trx_read_trx_id(field); /* If it is an update of a delete marked record, then we are allowed to ignore blob prefixes if the delete marking was done by some other trx as it must have committed by now for us to allow an over-write. 
*/ if (ignore_prefix) { ignore_prefix = ut_dulint_cmp(trx_id, trx->id) != 0; } ptr += mach_dulint_write_compressed(ptr, trx_id); field = rec_get_nth_field(rec, offsets, dict_index_get_sys_col_pos( index, DATA_ROLL_PTR), &flen); ut_ad(flen == DATA_ROLL_PTR_LEN); ptr += mach_dulint_write_compressed(ptr, trx_read_roll_ptr(field)); /*----------------------------------------*/ /* Store then the fields required to uniquely determine the record which will be modified in the clustered index */ for (i = 0; i < dict_index_get_n_unique(index); i++) { field = rec_get_nth_field(rec, offsets, i, &flen); /* The ordering columns must not be stored externally. */ ut_ad(!rec_offs_nth_extern(offsets, i)); ut_ad(dict_index_get_nth_col(index, i)->ord_part); if (trx_undo_left(undo_page, ptr) < 5) { return(0); } ptr += mach_write_compressed(ptr, flen); if (flen != UNIV_SQL_NULL) { if (trx_undo_left(undo_page, ptr) < flen) { return(0); } ut_memcpy(ptr, field, flen); ptr += flen; } } /*----------------------------------------*/ /* Save to the undo log the old values of the columns to be updated. */ if (update) { if (trx_undo_left(undo_page, ptr) < 5) { return(0); } ptr += mach_write_compressed(ptr, upd_get_n_fields(update)); for (i = 0; i < upd_get_n_fields(update); i++) { ulint pos = upd_get_nth_field(update, i)->field_no; /* Write field number to undo log */ if (trx_undo_left(undo_page, ptr) < 5) { return(0); } ptr += mach_write_compressed(ptr, pos); /* Save the old value of field */ field = rec_get_nth_field(rec, offsets, pos, &flen); if (trx_undo_left(undo_page, ptr) < 15) { return(0); } if (rec_offs_nth_extern(offsets, pos)) { ptr = trx_undo_page_report_modify_ext( ptr, dict_index_get_nth_col(index, pos) ->ord_part && !ignore_prefix && flen < REC_MAX_INDEX_COL_LEN ? ext_buf : NULL, dict_table_zip_size(table), &field, &flen); /* Notify purge that it eventually has to free the old externally stored field */ trx->update_undo->del_marks = TRUE; *type_cmpl_ptr |= TRX_UNDO_UPD_EXTERN; } else { ptr += mach_write_compressed(ptr, flen); } if (flen != UNIV_SQL_NULL) { if (trx_undo_left(undo_page, ptr) < flen) { return(0); } ut_memcpy(ptr, field, flen); ptr += flen; } } } /*----------------------------------------*/ /* In the case of a delete marking, and also in the case of an update where any ordering field of any index changes, store the values of all columns which occur as ordering fields in any index. This info is used in the purge of old versions where we use it to build and search the delete marked index records, to look if we can remove them from the index tree. Note that starting from 4.0.14 also externally stored fields can be ordering in some index. Starting from 5.2, we no longer store REC_MAX_INDEX_COL_LEN first bytes to the undo log record, but we can construct the column prefix fields in the index by fetching the first page of the BLOB that is pointed to by the clustered index. This works also in crash recovery, because all pages (including BLOBs) are recovered before anything is rolled back. 
*/ if (!update || !(cmpl_info & UPD_NODE_NO_ORD_CHANGE)) { byte* old_ptr = ptr; trx->update_undo->del_marks = TRUE; if (trx_undo_left(undo_page, ptr) < 5) { return(0); } /* Reserve 2 bytes to write the number of bytes the stored fields take in this undo record */ ptr += 2; for (col_no = 0; col_no < dict_table_get_n_cols(table); col_no++) { const dict_col_t* col = dict_table_get_nth_col(table, col_no); if (col->ord_part) { ulint pos; /* Write field number to undo log */ if (trx_undo_left(undo_page, ptr) < 5 + 15) { return(0); } pos = dict_index_get_nth_col_pos(index, col_no); ptr += mach_write_compressed(ptr, pos); /* Save the old value of field */ field = rec_get_nth_field(rec, offsets, pos, &flen); if (rec_offs_nth_extern(offsets, pos)) { ptr = trx_undo_page_report_modify_ext( ptr, flen < REC_MAX_INDEX_COL_LEN && !ignore_prefix ? ext_buf : NULL, dict_table_zip_size(table), &field, &flen); } else { ptr += mach_write_compressed( ptr, flen); } if (flen != UNIV_SQL_NULL) { if (trx_undo_left(undo_page, ptr) < flen) { return(0); } ut_memcpy(ptr, field, flen); ptr += flen; } } } mach_write_to_2(old_ptr, ptr - old_ptr); } /*----------------------------------------*/ /* Write pointers to the previous and the next undo log records */ if (trx_undo_left(undo_page, ptr) < 2) { return(0); } mach_write_to_2(ptr, first_free); ptr += 2; mach_write_to_2(undo_page + first_free, ptr - undo_page); mach_write_to_2(undo_page + TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE, ptr - undo_page); /* Write to the REDO log about this change in the UNDO log */ trx_undof_page_add_undo_rec_log(undo_page, first_free, ptr - undo_page, mtr); return(first_free); }
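/* Editor's example (not from the sources): the ordering-field segment above reserves 2 bytes first, writes the fields, and then backpatches the segment size with mach_write_to_2(old_ptr, ptr - old_ptr). A standalone sketch of that reserve-and-backpatch pattern; write_2() is a stand-in for mach_write_to_2(). */
#include <stdio.h>
#include <string.h>

/* Big-endian 2-byte write, like mach_write_to_2(). */
static void write_2(unsigned char* p, unsigned v)
{
	p[0] = (unsigned char) (v >> 8);
	p[1] = (unsigned char) (v & 0xff);
}

int main(void)
{
	unsigned char	buf[64];
	unsigned char*	ptr = buf;
	unsigned char*	old_ptr = ptr;	/* remember where the size goes */
	const char	payload[] = "field data";

	ptr += 2;	/* reserve 2 bytes for the segment size */

	memcpy(ptr, payload, sizeof(payload));
	ptr += sizeof(payload);

	write_2(old_ptr, (unsigned) (ptr - old_ptr));	/* backpatch size,
							including the 2
							reserved bytes */

	printf("segment of %u bytes\n",
	       (unsigned) ((old_ptr[0] << 8) | old_ptr[1]));
	return(0);
}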
/********************************************************************//** Low-level function which reads a page asynchronously from a file to the buffer buf_pool if it is not already there, in which case does nothing. Sets the io_fix flag and sets an exclusive lock on the buffer frame. The flag is cleared and the x-lock released by an i/o-handler thread. @return 1 if a read request was queued, 0 if the page already resided in buf_pool, or if the page is in the doublewrite buffer blocks in which case it is never read into the pool, or if the tablespace does not exist or is being dropped */ UNIV_INTERN ulint buf_read_page_low( /*==============*/ ulint* err, /*!< out: DB_SUCCESS or DB_TABLESPACE_DELETED if we are trying to read from a non-existent tablespace, or a tablespace which is just now being dropped */ ibool sync, /*!< in: TRUE if synchronous aio is desired */ ulint mode, /*!< in: BUF_READ_IBUF_PAGES_ONLY, ..., ORed to OS_AIO_SIMULATED_WAKE_LATER (see below at read-ahead functions) */ ulint space, /*!< in: space id */ ulint zip_size,/*!< in: compressed page size, or 0 */ ibool unzip, /*!< in: TRUE=request uncompressed page */ ib_int64_t tablespace_version, /*!< in: if the space memory object has this timestamp different from what we are giving here, treat the tablespace as dropped; this is a timestamp we use to stop dangling page reads from a tablespace which we have DISCARDed + IMPORTed back */ ulint offset, /*!< in: page number */ trx_t* trx) { buf_page_t* bpage; ulint wake_later; *err = DB_SUCCESS; wake_later = mode & OS_AIO_SIMULATED_WAKE_LATER; mode = mode & ~OS_AIO_SIMULATED_WAKE_LATER; if (trx_doublewrite && (space == TRX_SYS_SPACE || (srv_doublewrite_file && space == TRX_DOUBLEWRITE_SPACE)) && ( (offset >= trx_doublewrite->block1 && offset < trx_doublewrite->block1 + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) || (offset >= trx_doublewrite->block2 && offset < trx_doublewrite->block2 + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE))) { ut_print_timestamp(stderr); fprintf(stderr, " InnoDB: Warning: trying to read" " doublewrite buffer page %lu\n", (ulong) offset); return(0); } if (ibuf_bitmap_page(zip_size, offset) || trx_sys_hdr_page(space, offset)) { /* Trx sys header is so low in the latching order that we play safe and do not leave the i/o-completion to an asynchronous i/o-thread. Ibuf bitmap pages must always be read with synchronous i/o, to make sure they do not get involved in thread deadlocks. 
*/ sync = TRUE; } /* The following call will also check if the tablespace does not exist or is being dropped; if we succeed in initing the page in the buffer pool for read, then DISCARD cannot proceed until the read has completed */ bpage = buf_page_init_for_read(err, mode, space, zip_size, unzip, tablespace_version, offset); if (bpage == NULL) { /* bugfix: http://bugs.mysql.com/bug.php?id=43948 */ if (recv_recovery_is_on() && *err == DB_TABLESPACE_DELETED) { /* hashed log recs must be treated here */ recv_addr_t* recv_addr; mutex_enter(&(recv_sys->mutex)); if (recv_sys->apply_log_recs == FALSE) { mutex_exit(&(recv_sys->mutex)); goto not_to_recover; } /* recv_get_fil_addr_struct() */ recv_addr = HASH_GET_FIRST(recv_sys->addr_hash, hash_calc_hash(ut_fold_ulint_pair(space, offset), recv_sys->addr_hash)); while (recv_addr) { if ((recv_addr->space == space) && (recv_addr->page_no == offset)) { break; } recv_addr = HASH_GET_NEXT(addr_hash, recv_addr); } if ((recv_addr == NULL) || (recv_addr->state == RECV_BEING_PROCESSED) || (recv_addr->state == RECV_PROCESSED)) { mutex_exit(&(recv_sys->mutex)); goto not_to_recover; } fprintf(stderr, " (cannot find space: %lu)", space); recv_addr->state = RECV_PROCESSED; ut_a(recv_sys->n_addrs); recv_sys->n_addrs--; mutex_exit(&(recv_sys->mutex)); } not_to_recover: return(0); } #ifdef UNIV_DEBUG if (buf_debug_prints) { fprintf(stderr, "Posting read request for page %lu, sync %lu\n", (ulong) offset, (ulong) sync); } #endif ut_ad(buf_page_in_file(bpage)); if (sync) { thd_wait_begin(NULL, THD_WAIT_DISKIO); } if (zip_size) { *err = _fil_io(OS_FILE_READ | wake_later, sync, space, zip_size, offset, 0, zip_size, bpage->zip.data, bpage, trx); } else { ut_a(buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE); *err = _fil_io(OS_FILE_READ | wake_later, sync, space, 0, offset, 0, UNIV_PAGE_SIZE, ((buf_block_t*) bpage)->frame, bpage, trx); } if (sync) { thd_wait_end(NULL); } if (*err == DB_TABLESPACE_DELETED) { buf_read_page_handle_error(bpage); return(0); } SRV_CORRUPT_TABLE_CHECK(*err == DB_SUCCESS, bpage->is_corrupt = TRUE;);