/***************************************************************
Stores to the heap the row on which the node->pcur is positioned. */
static
void
row_upd_store_row(
/*==============*/
	upd_node_t*	node)	/* in: row update node */
{
	dict_index_t*	clust_index;
	upd_t*		update;
	rec_t*		rec;

	ut_ad(node->pcur->latch_mode != BTR_NO_LATCHES);

	if (node->row != NULL) {
		mem_heap_empty(node->heap);
		node->row = NULL;
	}

	clust_index = dict_table_get_first_index(node->table);

	rec = btr_pcur_get_rec(node->pcur);

	node->row = row_build(ROW_COPY_DATA, clust_index, rec, node->heap);

	node->ext_vec = mem_heap_alloc(node->heap,
				       sizeof(ulint) * rec_get_n_fields(rec));
	if (node->is_delete) {
		update = NULL;
	} else {
		update = node->update;
	}

	node->n_ext_vec = btr_push_update_extern_fields(node->ext_vec,
							rec, update);
}
/***********************************************************//**
Undoes a modify in secondary indexes when undo record type is UPD_DEL.
@return	DB_SUCCESS or DB_OUT_OF_FILE_SPACE */
static
ulint
row_undo_mod_upd_del_sec(
/*=====================*/
	undo_node_t*	node,	/*!< in: row undo node */
	que_thr_t*	thr)	/*!< in: query thread */
{
	mem_heap_t*	heap;
	dtuple_t*	entry;
	dict_index_t*	index;
	ulint		err	= DB_SUCCESS;

	ut_ad(node->rec_type == TRX_UNDO_UPD_DEL_REC);

	heap = mem_heap_create(1024);

	while (node->index != NULL) {

		/* Skip all corrupted secondary indexes. */
		dict_table_skip_corrupt_index(node->index);

		if (!node->index) {
			break;
		}

		index = node->index;

		entry = row_build_index_entry(node->row, node->ext,
					      index, heap);
		if (UNIV_UNLIKELY(!entry)) {
			/* The database must have crashed after
			inserting a clustered index record but before
			writing all the externally stored columns of
			that record.  Because secondary index entries
			are inserted after the clustered index record,
			we may assume that the secondary index record
			does not exist.  However, this situation may
			only occur during the rollback of incomplete
			transactions. */
			ut_a(thr_is_recv(thr));
		} else {
			err = row_undo_mod_del_mark_or_remove_sec(
				node, thr, index, entry);

			if (err != DB_SUCCESS) {

				break;
			}
		}

		mem_heap_empty(heap);

		node->index = dict_table_get_next_index(node->index);
	}

	mem_heap_free(heap);

	return(err);
}
/***********************************************************//**
Fetches an undo log record and does the purge for the recorded operation.
If none left, returns control to the parent node, which is always a query
thread node. */
void
row_purge(
/*======*/
	purge_node_t*	node,	/*!< in: row purge node */
	que_thr_t*	thr)	/*!< in: query thread */
{
	ibool	updated_extern;

	ut_ad(node);
	ut_ad(thr);

	node->undo_rec = trx_purge_fetch_next_rec(&node->roll_ptr,
						  &node->reservation,
						  node->heap);
	if (!node->undo_rec) {
		/* Purge completed for this query thread */

		thr->run_node = que_node_get_parent(node);

		return;
	}

	if (node->undo_rec != &trx_purge_dummy_rec
	    && row_purge_parse_undo_rec(node, &updated_extern, thr)) {

		node->found_clust = FALSE;

		node->index = dict_table_get_next_index(
			dict_table_get_first_index(node->table));

		if (node->rec_type == TRX_UNDO_DEL_MARK_REC) {
			row_purge_del_mark(node);
		} else if (updated_extern
			   || node->rec_type == TRX_UNDO_UPD_EXIST_REC) {
			row_purge_upd_exist_or_extern(thr, node);
		}

		if (node->found_clust) {
			btr_pcur_close(&(node->pcur));
		}

		row_mysql_unfreeze_data_dictionary(thr_get_trx(thr));
	}

	/* Do some cleanup */
	trx_purge_rec_release(node->reservation);
	mem_heap_empty(node->heap);

	thr->run_node = node;
}
/*************************************************************
Closes the global consistent read view of a transaction. */
void
read_view_close_for_mysql(
/*======================*/
	trx_t*	trx)	/* in: trx which has a read view */
{
	ut_a(trx->global_read_view);

	mutex_enter(&kernel_mutex);

	read_view_close(trx->global_read_view);

	mem_heap_empty(trx->global_read_view_heap);

	trx->read_view = NULL;
	trx->global_read_view = NULL;

	mutex_exit(&kernel_mutex);
}
/********************************************************************//**
Disable the adaptive hash search system and empty the index. */
UNIV_INTERN
void
btr_search_disable(void)
/*====================*/
{
	dict_table_t*	table;

	mutex_enter(&dict_sys->mutex);
	rw_lock_x_lock(&btr_search_latch);

	btr_search_enabled = FALSE;

	/* Clear the index->search_info->ref_count of every index in
	the data dictionary cache. */
	for (table = UT_LIST_GET_FIRST(dict_sys->table_LRU); table;
	     table = UT_LIST_GET_NEXT(table_LRU, table)) {

		dict_index_t*	index;

		for (index = dict_table_get_first_index(table); index;
		     index = dict_table_get_next_index(index)) {

			index->search_info->ref_count = 0;
		}
	}

	mutex_exit(&dict_sys->mutex);

	/* Set all block->index = NULL. */
	buf_pool_clear_hash_index();

	/* Clear the adaptive hash index. */
	hash_table_clear(btr_search_sys->hash_index);
	mem_heap_empty(btr_search_sys->hash_index->heap);

	rw_lock_x_unlock(&btr_search_latch);
}
/*******************************************************************//**
This function runs a purge batch.
@return	number of undo log pages handled in the batch */
UNIV_INTERN
ulint
trx_purge(
/*======*/
	ulint	limit)	/*!< in: the maximum number of records to
			purge in one batch */
{
	que_thr_t*	thr;
	ulint		old_pages_handled;

	if (srv_fake_write) {

		return(0);
	}

	ut_a(purge_sys->trx->n_active_thrs == 0);

	rw_lock_x_lock(&purge_sys->latch);

	mutex_enter(&kernel_mutex);

	/* Close and free the old purge view */

	read_view_close(purge_sys->view);
	purge_sys->view = NULL;
	mem_heap_empty(purge_sys->heap);

	/* Determine how much data manipulation language (DML) statements
	need to be delayed in order to reduce the lagging of the purge
	thread. */
	srv_dml_needed_delay = 0; /* in microseconds; default: no delay */

	/* If we cannot advance the 'purge view' because of an old
	'consistent read view', then the DML statements cannot be delayed.
	Also, srv_max_purge_lag <= 0 means 'infinity'. */
	if (srv_max_purge_lag > 0) {
		float	ratio = (float) trx_sys->rseg_history_len
			/ srv_max_purge_lag;
		if (ratio > ULINT_MAX / 10000) {
			/* Avoid overflow: maximum delay is 4295 seconds */
			srv_dml_needed_delay = ULINT_MAX;
		} else if (ratio > 1) {
			/* If the history list length exceeds the
			innodb_max_purge_lag, the data manipulation
			statements are delayed by at least 5000
			microseconds. */
			srv_dml_needed_delay = (ulint) ((ratio - .5) * 10000);
		}
	}

	purge_sys->view = read_view_oldest_copy_or_open_new(
		0, purge_sys->heap);

	mutex_exit(&kernel_mutex);

	rw_lock_x_unlock(&(purge_sys->latch));

	purge_sys->state = TRX_PURGE_ON;

	purge_sys->handle_limit = purge_sys->n_pages_handled + limit;

	old_pages_handled = purge_sys->n_pages_handled;

	mutex_enter(&kernel_mutex);

	thr = que_fork_start_command(purge_sys->query);

	ut_ad(thr);

	mutex_exit(&kernel_mutex);

	if (srv_print_thread_releases) {
		fputs("Starting purge\n", stderr);
	}

	que_run_threads(thr);

	if (srv_print_thread_releases) {

		fprintf(stderr,
			"Purge ends; pages handled %lu\n",
			(ulong) purge_sys->n_pages_handled);
	}

	return(purge_sys->n_pages_handled - old_pages_handled);
}
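The purge-lag throttle computed above is plain arithmetic on the history list
length. The following is a minimal standalone sketch, not InnoDB code: the
helper name purge_lag_delay_usec is hypothetical, and unsigned long stands in
for ulint.

#include <limits.h>

/* Hedged sketch of the srv_dml_needed_delay computation above. Once the
history list grows past max_purge_lag, each DML statement is delayed by
(ratio - 0.5) * 10000 microseconds, capped to avoid overflow. */
static unsigned long
purge_lag_delay_usec(
	unsigned long	history_len,	/* trx_sys->rseg_history_len */
	unsigned long	max_purge_lag)	/* srv_max_purge_lag; 0 = infinity */
{
	float	ratio;

	if (max_purge_lag == 0) {

		return(0);		/* no throttling configured */
	}

	ratio = (float) history_len / max_purge_lag;

	if (ratio > ULONG_MAX / 10000) {

		return(ULONG_MAX);	/* cap the delay */
	} else if (ratio > 1) {
		/* e.g. history_len = 150000, max_purge_lag = 100000:
		ratio = 1.5, delay = 10000 microseconds */

		return((unsigned long) ((ratio - .5) * 10000));
	}

	return(0);
}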
/****************************************************************//**
Commits a transaction. */
UNIV_INTERN
void
trx_commit_off_kernel(
/*==================*/
	trx_t*	trx)	/*!< in: transaction */
{
	page_t*		update_hdr_page;
	ib_uint64_t	lsn		= 0;
	trx_rseg_t*	rseg;
	trx_undo_t*	undo;
	mtr_t		mtr;

	ut_ad(mutex_own(&kernel_mutex));

	trx->must_flush_log_later = FALSE;

	rseg = trx->rseg;

	if (trx->insert_undo != NULL || trx->update_undo != NULL) {

		mutex_exit(&kernel_mutex);

		mtr_start(&mtr);

		/* Change the undo log segment states from TRX_UNDO_ACTIVE
		to some other state: these modifications to the file data
		structure define the transaction as committed in the file
		based world, at the serialization point of the log sequence
		number lsn obtained below. */

		mutex_enter(&(rseg->mutex));

		if (trx->insert_undo != NULL) {
			trx_undo_set_state_at_finish(
				rseg, trx, trx->insert_undo, &mtr);
		}

		undo = trx->update_undo;

		if (undo) {
			mutex_enter(&kernel_mutex);
			trx->no = trx_sys_get_new_trx_no();

			mutex_exit(&kernel_mutex);

			/* It is not necessary to obtain trx->undo_mutex here
			because only a single OS thread is allowed to do the
			transaction commit for this transaction. */

			update_hdr_page = trx_undo_set_state_at_finish(
				rseg, trx, undo, &mtr);

			/* We have to do the cleanup for the update log while
			holding the rseg mutex because update log headers
			have to be put to the history list in the order of
			the trx number. */

			trx_undo_update_cleanup(trx, update_hdr_page, &mtr);
		}

		mutex_exit(&(rseg->mutex));

		/* Update the latest MySQL binlog name and offset info
		in trx sys header if MySQL binlogging is on or the database
		server is a MySQL replication slave */

		if (trx->mysql_log_file_name
		    && trx->mysql_log_file_name[0] != '\0') {

			trx_sys_update_mysql_binlog_offset(
				trx->mysql_log_file_name,
				trx->mysql_log_offset,
				TRX_SYS_MYSQL_LOG_INFO, &mtr);

			trx->mysql_log_file_name = NULL;
		}

		/* The following call commits the mini-transaction, making
		the whole transaction committed in the file-based world, at
		this log sequence number. The transaction becomes 'durable'
		when we write the log to disk, but in the logical sense the
		commit in the file-based data structures (undo logs etc.)
		happens here.

		NOTE that transaction numbers, which are assigned only to
		transactions with an update undo log, do not necessarily
		come in exactly the same order as commit lsn's, if the
		transactions have different rollback segments. To get
		exactly the same order we should hold the kernel mutex up to
		this point, adding to the contention of the kernel mutex.
		However, if a transaction T2 is able to see modifications
		made by a transaction T1, T2 will always get a bigger
		transaction number and a bigger commit lsn than T1. */

		/*--------------*/
		mtr_commit(&mtr);
		/*--------------*/
		lsn = mtr.end_lsn;

		mutex_enter(&kernel_mutex);
	}

	ut_ad(trx->conc_state == TRX_ACTIVE
	      || trx->conc_state == TRX_PREPARED);
	ut_ad(mutex_own(&kernel_mutex));

	/* The following assignment makes the transaction committed in memory
	and makes its changes to data visible to other transactions.
	NOTE that there is a small discrepancy from the strict formal
	visibility rules here: a human user of the database can see
	modifications made by another transaction T even before the necessary
	log segment has been flushed to the disk. If the database happens to
	crash before the flush, the user has seen modifications from T which
	will never be a committed transaction.

	However, any transaction T2 which sees the modifications of the
	committing transaction T, and which also itself makes modifications
	to the database, will get an lsn larger than the committing
	transaction T. In the case where the log flush fails, and T never
	gets committed, also T2 will never get committed. */

	/*--------------------------------------*/
	trx->conc_state = TRX_COMMITTED_IN_MEMORY;
	/*--------------------------------------*/

	/* If we release kernel_mutex below and we are still doing recovery,
	i.e. the background rollback thread is still active, then there is a
	chance that the rollback thread may see this trx as
	COMMITTED_IN_MEMORY and go ahead to clean it up by calling
	trx_cleanup_at_db_startup(). This can happen when we are committing
	a trx here that was left in PREPARED state during the crash. Note
	that the commit of the rollback of a PREPARED trx happens in the
	recovery thread, while the rollback of other transactions happens in
	the background thread. To avoid this race we unconditionally unset
	the is_recovered flag from the trx. */

	trx->is_recovered = FALSE;

	lock_release_off_kernel(trx);

	if (trx->global_read_view) {
		read_view_close(trx->global_read_view);
		mem_heap_empty(trx->global_read_view_heap);
		trx->global_read_view = NULL;
	}

	trx->read_view = NULL;

	if (lsn) {

		mutex_exit(&kernel_mutex);

		if (trx->insert_undo != NULL) {

			trx_undo_insert_cleanup(trx);
		}

		/* NOTE that we could possibly make a group commit more
		efficient here: call os_thread_yield here to allow also other
		trxs to come to commit! */

		/*-------------------------------------*/

		/* Depending on the my.cnf options, we may now write the log
		buffer to the log files, making the transaction durable if
		the OS does not crash. We may also flush the log files to
		disk, making the transaction durable also at an OS crash or a
		power outage.

		The idea in InnoDB's group commit is that a group of
		transactions gather behind a trx doing a physical disk write
		to log files, and when that physical write has been completed,
		one of those transactions does a write which commits the whole
		group. Note that this group commit will only bring benefit if
		there are > 2 users in the database. Then at least 2 users can
		gather behind one doing the physical log write to disk.

		If we are calling trx_commit() under prepare_commit_mutex, we
		will delay the possible log write and flush to a separate
		function trx_commit_complete_for_mysql(), which is only
		called when the thread has released the mutex. This is to
		make the group commit algorithm work. Otherwise, the
		prepare_commit mutex would serialize all commits and prevent
		a group of transactions from gathering. */

		if (trx->flush_log_later) {
			/* Do nothing yet */
			trx->must_flush_log_later = TRUE;
		} else if (srv_flush_log_at_trx_commit == 0) {
			/* Do nothing */
		} else if (srv_flush_log_at_trx_commit == 1) {
			if (srv_unix_file_flush_method == SRV_UNIX_NOSYNC) {
				/* Write the log but do not flush it to disk */

				log_write_up_to(lsn, LOG_WAIT_ONE_GROUP,
						FALSE);
			} else {
				/* Write the log to the log files AND flush
				them to disk */

				log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, TRUE);
			}
		} else if (srv_flush_log_at_trx_commit == 2) {

			/* Write the log but do not flush it to disk */

			log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, FALSE);
		} else {
			ut_error;
		}

		trx->commit_lsn = lsn;

		/*-------------------------------------*/
		mutex_enter(&kernel_mutex);
	}

	/* Free all savepoints */
	trx_roll_free_all_savepoints(trx);

	trx->conc_state = TRX_NOT_STARTED;
	trx->rseg = NULL;
	trx->undo_no = ut_dulint_zero;
	trx->last_sql_stat_start.least_undo_no = ut_dulint_zero;

	ut_ad(UT_LIST_GET_LEN(trx->wait_thrs) == 0);
	ut_ad(UT_LIST_GET_LEN(trx->trx_locks) == 0);

	UT_LIST_REMOVE(trx_list, trx_sys->trx_list, trx);
}
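For reference, the branch on srv_flush_log_at_trx_commit above reduces to a
small write/flush decision. The following is a hedged sketch with a
hypothetical helper name (decide_commit_log_action); the real code calls
log_write_up_to() directly as shown in the function above.

#include <stdbool.h>

/* Sketch of the commit-time log policy above; not InnoDB code.
0: neither write nor flush at commit; 1: write and flush (write only when
the NOSYNC flush method is in use); 2: write but do not flush. */
struct commit_log_action {
	bool	write_log;	/* write the log buffer to the log files */
	bool	flush_log;	/* flush (fsync) the log files to disk */
};

static struct commit_log_action
decide_commit_log_action(int flush_log_at_trx_commit, bool unix_nosync)
{
	struct commit_log_action	action = {false, false};

	switch (flush_log_at_trx_commit) {
	case 1:
		action.write_log = true;
		action.flush_log = !unix_nosync;
		break;
	case 2:
		action.write_log = true;
		break;
	default:
		break;			/* 0: do nothing at commit time */
	}

	return(action);
}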
/***********************************************************//**
Fetches an undo log record and does the undo for the recorded operation.
If none left, or a partial rollback completed, returns control to the
parent node, which is always a query thread node.
@return	DB_SUCCESS if operation successfully completed, else error code */
static
ulint
row_undo(
/*=====*/
	undo_node_t*	node,	/*!< in: row undo node */
	que_thr_t*	thr)	/*!< in: query thread */
{
	ulint		err;
	trx_t*		trx;
	roll_ptr_t	roll_ptr;
	ibool		locked_data_dict;

	ut_ad(node && thr);

	trx = node->trx;

	if (node->state == UNDO_NODE_FETCH_NEXT) {

		node->undo_rec = trx_roll_pop_top_rec_of_trx(trx,
							     trx->roll_limit,
							     &roll_ptr,
							     node->heap);
		if (!node->undo_rec) {
			/* Rollback completed for this query thread */

			thr->run_node = que_node_get_parent(node);

			return(DB_SUCCESS);
		}

		node->roll_ptr = roll_ptr;
		node->undo_no = trx_undo_rec_get_undo_no(node->undo_rec);

		if (trx_undo_roll_ptr_is_insert(roll_ptr)) {

			node->state = UNDO_NODE_INSERT;
		} else {
			node->state = UNDO_NODE_MODIFY;
		}

	} else if (node->state == UNDO_NODE_PREV_VERS) {

		/* Undo should be done to the same clustered index record
		again in this same rollback, restoring the previous version */

		roll_ptr = node->new_roll_ptr;

		node->undo_rec = trx_undo_get_undo_rec_low(roll_ptr,
							   node->heap);
		node->roll_ptr = roll_ptr;
		node->undo_no = trx_undo_rec_get_undo_no(node->undo_rec);

		if (trx_undo_roll_ptr_is_insert(roll_ptr)) {

			node->state = UNDO_NODE_INSERT;
		} else {
			node->state = UNDO_NODE_MODIFY;
		}
	}

	/* Prevent DROP TABLE etc. while we are rolling back this row.
	If we are doing a TABLE CREATE or some other dictionary operation,
	then we already have dict_operation_lock locked in x-mode. Do not
	try to lock again, because that would cause a hang. */

	locked_data_dict = (trx->dict_operation_lock_mode == 0);

	if (locked_data_dict) {

		row_mysql_lock_data_dictionary(trx);
	}

	if (node->state == UNDO_NODE_INSERT) {

		err = row_undo_ins(node);

		node->state = UNDO_NODE_FETCH_NEXT;
	} else {
		ut_ad(node->state == UNDO_NODE_MODIFY);
		err = row_undo_mod(node, thr);
	}

	if (locked_data_dict) {

		row_mysql_unlock_data_dictionary(trx);
	}

	/* Do some cleanup */
	btr_pcur_close(&(node->pcur));

	mem_heap_empty(node->heap);

	thr->run_node = node;

	return(err);
}
/*******************************************************************//**
This function runs a purge batch.
@return	number of undo log pages handled in the batch */
UNIV_INTERN
ulint
trx_purge(void)
/*===========*/
{
	que_thr_t*	thr;
	/*	que_thr_t*	thr2; */
	ulonglong	old_pages_handled;

	mutex_enter(&(purge_sys->mutex));

	if (purge_sys->trx->n_active_thrs > 0) {

		mutex_exit(&(purge_sys->mutex));

		/* Should not happen */

		ut_error;

		return(0);
	}

	rw_lock_x_lock(&(purge_sys->latch));

	mutex_enter(&kernel_mutex);

	/* Close and free the old purge view */

	read_view_close(purge_sys->view);
	purge_sys->view = NULL;
	mem_heap_empty(purge_sys->heap);

	/* Determine how much data manipulation language (DML) statements
	need to be delayed in order to reduce the lagging of the purge
	thread. */
	srv_dml_needed_delay = 0; /* in microseconds; default: no delay */

	/* If we cannot advance the 'purge view' because of an old
	'consistent read view', then the DML statements cannot be delayed.
	Also, srv_max_purge_lag <= 0 means 'infinity'. */
	if (srv_max_purge_lag > 0
	    && !UT_LIST_GET_LAST(trx_sys->view_list)) {
		float	ratio = (float) trx_sys->rseg_history_len
			/ srv_max_purge_lag;
		if (ratio > ULINT_MAX / 10000) {
			/* Avoid overflow: maximum delay is 4295 seconds */
			srv_dml_needed_delay = ULINT_MAX;
		} else if (ratio > 1) {
			/* If the history list length exceeds the
			innodb_max_purge_lag, the data manipulation
			statements are delayed by at least 5000
			microseconds. */
			srv_dml_needed_delay = (ulint) ((ratio - .5) * 10000);
		}
	}

	purge_sys->view = read_view_oldest_copy_or_open_new(ut_dulint_zero,
							    purge_sys->heap);
	mutex_exit(&kernel_mutex);

	rw_lock_x_unlock(&(purge_sys->latch));

#ifdef UNIV_DEBUG
	if (srv_purge_view_update_only_debug) {
		mutex_exit(&(purge_sys->mutex));
		return(0);
	}
#endif

	purge_sys->state = TRX_PURGE_ON;

	/* Handle at most 20 undo log pages in one purge batch */

	purge_sys->handle_limit = purge_sys->n_pages_handled + 20;

	old_pages_handled = purge_sys->n_pages_handled;

	mutex_exit(&(purge_sys->mutex));

	mutex_enter(&kernel_mutex);

	thr = que_fork_start_command(purge_sys->query);

	ut_ad(thr);

	/*	thr2 = que_fork_start_command(purge_sys->query);

	ut_ad(thr2); */

	mutex_exit(&kernel_mutex);

	/*	srv_que_task_enqueue(thr2); */

	if (srv_print_thread_releases) {

		fputs("Starting purge\n", stderr);
	}

	que_run_threads(thr);

	if (srv_print_thread_releases) {

		fprintf(stderr,
			"Purge ends; pages handled %lu\n",
			(ulong) purge_sys->n_pages_handled);
	}

	return((ulint) (purge_sys->n_pages_handled - old_pages_handled));
}
/*****************************************************************//**
Constructs the version of a clustered index record which a consistent read
should see. We assume that the trx id stored in rec is such that the
consistent read should not see rec in its present version.
@return	DB_SUCCESS or DB_MISSING_HISTORY */
UNIV_INTERN
ulint
row_vers_build_for_consistent_read(
/*===============================*/
	const rec_t*	rec,	/*!< in: record in a clustered index; the
				caller must have a latch on the page; this
				latch locks the top of the stack of versions
				of this record */
	mtr_t*		mtr,	/*!< in: mtr holding the latch on rec */
	dict_index_t*	index,	/*!< in: the clustered index */
	ulint**		offsets,/*!< in/out: offsets returned by
				rec_get_offsets(rec, index) */
	read_view_t*	view,	/*!< in: the consistent read view */
	mem_heap_t**	offset_heap,/*!< in/out: memory heap from which
				the offsets are allocated */
	mem_heap_t*	in_heap,/*!< in: memory heap from which the memory for
				*old_vers is allocated; memory for possible
				intermediate versions is allocated and freed
				locally within the function */
	rec_t**		old_vers)/*!< out, own: old version, or NULL if the
				record does not exist in the view, that is,
				it was freshly inserted afterwards */
{
	const rec_t*	version;
	rec_t*		prev_version;
	trx_id_t	trx_id;
	mem_heap_t*	heap		= NULL;
	byte*		buf;
	ulint		err;

	ut_ad(dict_index_is_clust(index));
	ut_ad(mtr_memo_contains_page(mtr, rec, MTR_MEMO_PAGE_X_FIX)
	      || mtr_memo_contains_page(mtr, rec, MTR_MEMO_PAGE_S_FIX));
#ifdef UNIV_SYNC_DEBUG
	ut_ad(!rw_lock_own(&(purge_sys->latch), RW_LOCK_SHARED));
#endif /* UNIV_SYNC_DEBUG */

	ut_ad(rec_offs_validate(rec, index, *offsets));

	trx_id = row_get_rec_trx_id(rec, index, *offsets);

	ut_ad(!read_view_sees_trx_id(view, trx_id));

	rw_lock_s_lock(&(purge_sys->latch));
	version = rec;

	for (;;) {
		mem_heap_t*	heap2	= heap;
		trx_undo_rec_t* undo_rec;
		roll_ptr_t	roll_ptr;
		undo_no_t	undo_no;
		heap = mem_heap_create(1024);

		/* If we have a high-granularity consistent read view and
		the creating transaction of the view is the same as the
		trx_id in the record, we see this record only when the
		undo_no of the record is < the undo_no in the view. */

		if (view->type == VIEW_HIGH_GRANULARITY
		    && ut_dulint_cmp(view->creator_trx_id, trx_id) == 0) {

			roll_ptr = row_get_rec_roll_ptr(version, index,
							*offsets);
			undo_rec = trx_undo_get_undo_rec_low(roll_ptr, heap);
			undo_no = trx_undo_rec_get_undo_no(undo_rec);
			mem_heap_empty(heap);

			if (ut_dulint_cmp(view->undo_no, undo_no) > 0) {
				/* The view already sees this version: we can
				copy it to in_heap and return */

#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
				ut_a(!rec_offs_any_null_extern(
					     version, *offsets));
#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */

				buf = mem_heap_alloc(in_heap,
						     rec_offs_size(*offsets));
				*old_vers = rec_copy(buf, version, *offsets);
				rec_offs_make_valid(*old_vers, index,
						    *offsets);
				err = DB_SUCCESS;

				break;
			}
		}

		err = trx_undo_prev_version_build(rec, mtr, version, index,
						  *offsets, heap,
						  &prev_version);
		if (heap2) {
			mem_heap_free(heap2); /* free version */
		}

		if (err != DB_SUCCESS) {
			break;
		}

		if (prev_version == NULL) {
			/* It was a freshly inserted version */
			*old_vers = NULL;
			err = DB_SUCCESS;

			break;
		}

		*offsets = rec_get_offsets(prev_version, index, *offsets,
					   ULINT_UNDEFINED, offset_heap);

#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
		ut_a(!rec_offs_any_null_extern(prev_version, *offsets));
#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */

		trx_id = row_get_rec_trx_id(prev_version, index, *offsets);

		if (read_view_sees_trx_id(view, trx_id)) {

			/* The view already sees this version: we can copy
			it to in_heap and return */

			buf = mem_heap_alloc(in_heap,
					     rec_offs_size(*offsets));
			*old_vers = rec_copy(buf, prev_version, *offsets);
			rec_offs_make_valid(*old_vers, index, *offsets);
			err = DB_SUCCESS;

			break;
		}

		version = prev_version;
	}/* for (;;) */

	mem_heap_free(heap);
	rw_lock_s_unlock(&(purge_sys->latch));

	return(err);
}
/***************************************************************
Updates the affected index records of a row. When the control is transferred
to this node, we assume that we have a persistent cursor which was on a
record, and the position of the cursor is stored in the cursor. */
static
ulint
row_upd(
/*====*/
				/* out: DB_SUCCESS if operation successfully
				completed, else error code or DB_LOCK_WAIT */
	upd_node_t*	node,	/* in: row update node */
	que_thr_t*	thr)	/* in: query thread */
{
	ulint	err	= DB_SUCCESS;

	ut_ad(node && thr);

	if (node->in_mysql_interface) {

		/* We do not get the cmpl_info value from the MySQL
		interpreter: we must calculate it on the fly: */

		if (node->is_delete
		    || row_upd_changes_some_index_ord_field_binary(
			    node->table, node->update)) {
			node->cmpl_info = 0;
		} else {
			node->cmpl_info = UPD_NODE_NO_ORD_CHANGE;
		}
	}

	if (node->state == UPD_NODE_UPDATE_CLUSTERED
	    || node->state == UPD_NODE_INSERT_CLUSTERED) {

		err = row_upd_clust_step(node, thr);

		if (err != DB_SUCCESS) {

			goto function_exit;
		}
	}

	if (!node->is_delete && (node->cmpl_info & UPD_NODE_NO_ORD_CHANGE)) {

		goto function_exit;
	}

	while (node->index != NULL) {
		err = row_upd_sec_step(node, thr);

		if (err != DB_SUCCESS) {

			goto function_exit;
		}

		node->index = dict_table_get_next_index(node->index);
	}

function_exit:
	if (err == DB_SUCCESS) {
		/* Do some cleanup */

		if (node->row != NULL) {
			mem_heap_empty(node->heap);
			node->row = NULL;
			node->n_ext_vec = 0;
		}

		node->state = UPD_NODE_UPDATE_CLUSTERED;
	}

	return(err);
}
/*****************************************************************//**
Initializes the data dictionary memory structures when the database is
started. This function is also called when the data dictionary is created. */
UNIV_INTERN
void
dict_boot(void)
/*===========*/
{
	dict_table_t*	table;
	dict_index_t*	index;
	dict_hdr_t*	dict_hdr;
	mem_heap_t*	heap;
	mtr_t		mtr;
	ulint		error;

	mtr_start(&mtr);

	/* Create the hash tables etc. */

	dict_init();

	heap = mem_heap_create(450);

	mutex_enter(&(dict_sys->mutex));

	/* Get the dictionary header */
	dict_hdr = dict_hdr_get(&mtr);

	/* Because we only write new row ids to disk-based data structure
	(dictionary header) when it is divisible by
	DICT_HDR_ROW_ID_WRITE_MARGIN, in recovery we will not recover
	the latest value of the row id counter. Therefore we advance
	the counter at the database startup to avoid overlapping values.
	Note that when a user after database startup first time asks for
	a new row id, then because the counter is now divisible by
	..._MARGIN, it will immediately be updated to the disk-based
	header. */

	dict_sys->row_id = ut_dulint_add(
		ut_dulint_align_up(mtr_read_dulint(dict_hdr + DICT_HDR_ROW_ID,
						   &mtr),
				   DICT_HDR_ROW_ID_WRITE_MARGIN),
		DICT_HDR_ROW_ID_WRITE_MARGIN);

	/* Insert into the dictionary cache the descriptions of the basic
	system tables */
	/*-------------------------*/
	table = dict_mem_table_create("SYS_TABLES", DICT_HDR_SPACE, 8, 0);

	dict_mem_table_add_col(table, heap, "NAME", DATA_BINARY, 0, 0);
	dict_mem_table_add_col(table, heap, "ID", DATA_BINARY, 0, 0);
	/* ROW_FORMAT = (N_COLS >> 31) ? COMPACT : REDUNDANT */
	dict_mem_table_add_col(table, heap, "N_COLS", DATA_INT, 0, 4);
	/* TYPE is either DICT_TABLE_ORDINARY, or (TYPE & DICT_TF_COMPACT)
	and (TYPE & DICT_TF_FORMAT_MASK) are nonzero and TYPE = table->flags */
	dict_mem_table_add_col(table, heap, "TYPE", DATA_INT, 0, 4);
	dict_mem_table_add_col(table, heap, "MIX_ID", DATA_BINARY, 0, 0);
	dict_mem_table_add_col(table, heap, "MIX_LEN", DATA_INT, 0, 4);
	dict_mem_table_add_col(table, heap, "CLUSTER_NAME", DATA_BINARY, 0, 0);
	dict_mem_table_add_col(table, heap, "SPACE", DATA_INT, 0, 4);

	table->id = DICT_TABLES_ID;

	dict_table_add_to_cache(table, heap);
	dict_sys->sys_tables = table;
	mem_heap_empty(heap);

	index = dict_mem_index_create("SYS_TABLES", "CLUST_IND",
				      DICT_HDR_SPACE,
				      DICT_UNIQUE | DICT_CLUSTERED, 1);

	dict_mem_index_add_field(index, "NAME", 0);

	index->id = DICT_TABLES_ID;

	error = dict_index_add_to_cache(table, index,
					mtr_read_ulint(dict_hdr
						       + DICT_HDR_TABLES,
						       MLOG_4BYTES, &mtr),
					FALSE);
	ut_a(error == DB_SUCCESS);

	/*-------------------------*/
	index = dict_mem_index_create("SYS_TABLES", "ID_IND",
				      DICT_HDR_SPACE, DICT_UNIQUE, 1);
	dict_mem_index_add_field(index, "ID", 0);

	index->id = DICT_TABLE_IDS_ID;
	error = dict_index_add_to_cache(table, index,
					mtr_read_ulint(dict_hdr
						       + DICT_HDR_TABLE_IDS,
						       MLOG_4BYTES, &mtr),
					FALSE);
	ut_a(error == DB_SUCCESS);

	/*-------------------------*/
	table = dict_mem_table_create("SYS_COLUMNS", DICT_HDR_SPACE, 7, 0);

	dict_mem_table_add_col(table, heap, "TABLE_ID", DATA_BINARY, 0, 0);
	dict_mem_table_add_col(table, heap, "POS", DATA_INT, 0, 4);
	dict_mem_table_add_col(table, heap, "NAME", DATA_BINARY, 0, 0);
	dict_mem_table_add_col(table, heap, "MTYPE", DATA_INT, 0, 4);
	dict_mem_table_add_col(table, heap, "PRTYPE", DATA_INT, 0, 4);
	dict_mem_table_add_col(table, heap, "LEN", DATA_INT, 0, 4);
	dict_mem_table_add_col(table, heap, "PREC", DATA_INT, 0, 4);

	table->id = DICT_COLUMNS_ID;

	dict_table_add_to_cache(table, heap);
	dict_sys->sys_columns = table;
	mem_heap_empty(heap);

	index = dict_mem_index_create("SYS_COLUMNS", "CLUST_IND",
				      DICT_HDR_SPACE,
				      DICT_UNIQUE | DICT_CLUSTERED, 2);

	dict_mem_index_add_field(index, "TABLE_ID", 0);
	dict_mem_index_add_field(index, "POS", 0);

	index->id = DICT_COLUMNS_ID;
	error = dict_index_add_to_cache(table, index,
					mtr_read_ulint(dict_hdr
						       + DICT_HDR_COLUMNS,
						       MLOG_4BYTES, &mtr),
					FALSE);
	ut_a(error == DB_SUCCESS);

	/*-------------------------*/
	table = dict_mem_table_create("SYS_INDEXES", DICT_HDR_SPACE, 7, 0);

	dict_mem_table_add_col(table, heap, "TABLE_ID", DATA_BINARY, 0, 0);
	dict_mem_table_add_col(table, heap, "ID", DATA_BINARY, 0, 0);
	dict_mem_table_add_col(table, heap, "NAME", DATA_BINARY, 0, 0);
	dict_mem_table_add_col(table, heap, "N_FIELDS", DATA_INT, 0, 4);
	dict_mem_table_add_col(table, heap, "TYPE", DATA_INT, 0, 4);
	dict_mem_table_add_col(table, heap, "SPACE", DATA_INT, 0, 4);
	dict_mem_table_add_col(table, heap, "PAGE_NO", DATA_INT, 0, 4);

	/* The '+ 2' below comes from the 2 system fields */
#if DICT_SYS_INDEXES_PAGE_NO_FIELD != 6 + 2
#error "DICT_SYS_INDEXES_PAGE_NO_FIELD != 6 + 2"
#endif
#if DICT_SYS_INDEXES_SPACE_NO_FIELD != 5 + 2
#error "DICT_SYS_INDEXES_SPACE_NO_FIELD != 5 + 2"
#endif
#if DICT_SYS_INDEXES_TYPE_FIELD != 4 + 2
#error "DICT_SYS_INDEXES_TYPE_FIELD != 4 + 2"
#endif

	table->id = DICT_INDEXES_ID;
	dict_table_add_to_cache(table, heap);
	dict_sys->sys_indexes = table;
	mem_heap_empty(heap);

	index = dict_mem_index_create("SYS_INDEXES", "CLUST_IND",
				      DICT_HDR_SPACE,
				      DICT_UNIQUE | DICT_CLUSTERED, 2);

	dict_mem_index_add_field(index, "TABLE_ID", 0);
	dict_mem_index_add_field(index, "ID", 0);

	index->id = DICT_INDEXES_ID;
	error = dict_index_add_to_cache(table, index,
					mtr_read_ulint(dict_hdr
						       + DICT_HDR_INDEXES,
						       MLOG_4BYTES, &mtr),
					FALSE);
	ut_a(error == DB_SUCCESS);

	/*-------------------------*/
	table = dict_mem_table_create("SYS_FIELDS", DICT_HDR_SPACE, 3, 0);

	dict_mem_table_add_col(table, heap, "INDEX_ID", DATA_BINARY, 0, 0);
	dict_mem_table_add_col(table, heap, "POS", DATA_INT, 0, 4);
	dict_mem_table_add_col(table, heap, "COL_NAME", DATA_BINARY, 0, 0);

	table->id = DICT_FIELDS_ID;
	dict_table_add_to_cache(table, heap);
	dict_sys->sys_fields = table;
	mem_heap_free(heap);

	index = dict_mem_index_create("SYS_FIELDS", "CLUST_IND",
				      DICT_HDR_SPACE,
				      DICT_UNIQUE | DICT_CLUSTERED, 2);

	dict_mem_index_add_field(index, "INDEX_ID", 0);
	dict_mem_index_add_field(index, "POS", 0);

	index->id = DICT_FIELDS_ID;
	error = dict_index_add_to_cache(table, index,
					mtr_read_ulint(dict_hdr
						       + DICT_HDR_FIELDS,
						       MLOG_4BYTES, &mtr),
					FALSE);
	ut_a(error == DB_SUCCESS);

	mtr_commit(&mtr);
	/*-------------------------*/

	/* Initialize the insert buffer table and index for each tablespace */

	ibuf_init_at_db_start();

	/* Load definitions of other indexes on system tables */

	dict_load_sys_table(dict_sys->sys_tables);
	dict_load_sys_table(dict_sys->sys_columns);
	dict_load_sys_table(dict_sys->sys_indexes);
	dict_load_sys_table(dict_sys->sys_fields);

	mutex_exit(&(dict_sys->mutex));
}
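The row-id adjustment at the start of dict_boot() is plain margin arithmetic.
Below is a hedged sketch, not InnoDB code: the helper name
row_id_after_restart is hypothetical, 64-bit integers stand in for dulint,
and the margin value is an assumption standing in for
DICT_HDR_ROW_ID_WRITE_MARGIN.

#include <stdint.h>

/* Sketch of the row-id recovery margin above. The persistent counter is
only written when the in-memory counter crosses a multiple of the write
margin, so after a crash it may lag by up to one margin. Rounding up to the
next multiple and adding one more margin guarantees the restarted counter
exceeds any row id already handed out. */
#define ROW_ID_WRITE_MARGIN	256ULL	/* assumed value, standing in for
					DICT_HDR_ROW_ID_WRITE_MARGIN */

static uint64_t
row_id_after_restart(uint64_t stored_row_id)
{
	uint64_t	aligned;

	aligned = ((stored_row_id + ROW_ID_WRITE_MARGIN - 1)
		   / ROW_ID_WRITE_MARGIN) * ROW_ID_WRITE_MARGIN;

	/* e.g. stored_row_id = 1000: aligned = 1024, result = 1280 */
	return(aligned + ROW_ID_WRITE_MARGIN);
}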
/***************************************************************
Fetches an undo log record and does the purge for the recorded operation.
If none left, or the current purge completed, returns the control to the
parent node, which is always a query thread node. */
static
ulint
row_purge(
/*======*/
				/* out: DB_SUCCESS if operation successfully
				completed, else error code */
	purge_node_t*	node,	/* in: row purge node */
	que_thr_t*	thr)	/* in: query thread */
{
	dulint	roll_ptr;
	ibool	purge_needed;
	ibool	updated_extern;

	ut_ad(node && thr);

	node->undo_rec = trx_purge_fetch_next_rec(&roll_ptr,
						  &(node->reservation),
						  node->heap);
	if (!node->undo_rec) {
		/* Purge completed for this query thread */

		thr->run_node = que_node_get_parent(node);

		return(DB_SUCCESS);
	}

	node->roll_ptr = roll_ptr;

	if (node->undo_rec == &trx_purge_dummy_rec) {
		purge_needed = FALSE;
	} else {
		purge_needed = row_purge_parse_undo_rec(node, &updated_extern,
							thr);
	}

	if (purge_needed) {
		node->found_clust = FALSE;

		node->index = dict_table_get_next_index(
			dict_table_get_first_index(node->table));

		if (node->rec_type == TRX_UNDO_DEL_MARK_REC) {
			row_purge_del_mark(node, thr);
		} else if (updated_extern
			   || node->rec_type == TRX_UNDO_UPD_EXIST_REC) {
			row_purge_upd_exist_or_extern(node, thr);
		}

		if (node->found_clust) {
			btr_pcur_close(&(node->pcur));
		}

		rw_lock_x_unlock(&(purge_sys->purge_is_running));
	}

	/* Do some cleanup */
	trx_purge_rec_release(node->reservation);
	mem_heap_empty(node->heap);

	thr->run_node = node;

	return(DB_SUCCESS);
}
/***********************************************************//**
Undoes a modify in secondary indexes when undo record type is UPD_EXIST.
@return	DB_SUCCESS or DB_OUT_OF_FILE_SPACE */
static
ulint
row_undo_mod_upd_exist_sec(
/*=======================*/
	undo_node_t*	node,	/*!< in: row undo node */
	que_thr_t*	thr)	/*!< in: query thread */
{
	mem_heap_t*	heap;
	dtuple_t*	entry;
	dict_index_t*	index;
	ulint		err;

	if (node->cmpl_info & UPD_NODE_NO_ORD_CHANGE) {
		/* No change in secondary indexes */

		return(DB_SUCCESS);
	}

	heap = mem_heap_create(1024);

	while (node->index != NULL) {

		/* Skip all corrupted secondary indexes. */
		dict_table_skip_corrupt_index(node->index);

		if (!node->index) {
			break;
		}

		index = node->index;

		if (row_upd_changes_ord_field_binary(node->index, node->update,
						     thr,
						     node->row, node->ext)) {

			/* Build the newest version of the index entry */
			entry = row_build_index_entry(node->row, node->ext,
						      index, heap);
			if (UNIV_UNLIKELY(!entry)) {
				/* The server must have crashed in
				row_upd_clust_rec_by_insert() before
				the updated externally stored columns (BLOBs)
				of the new clustered index entry were
				written. */

				/* The table must be in DYNAMIC or COMPRESSED
				format.  REDUNDANT and COMPACT formats
				store a local 768-byte prefix of each
				externally stored column. */
				ut_a(dict_table_get_format(index->table)
				     >= DICT_TF_FORMAT_ZIP);

				/* This is only legitimate when
				rolling back an incomplete
				transaction after crash recovery. */
				ut_a(thr_get_trx(thr)->is_recovered);

				/* The server must have crashed before
				completing the insert of the new
				clustered index entry and before
				inserting to the secondary indexes.
				Because node->row was not yet written
				to this index, we can ignore it.  But
				we must restore node->undo_row. */
			} else {
				/* NOTE that if we updated the fields of a
				delete-marked secondary index record so that
				alphabetically they stayed the same, e.g.,
				'abc' -> 'aBc', we cannot return to the
				original values because we do not know them.
				But this should not cause problems because
				in row0sel.c, in queries we always retrieve
				the clustered index record or an earlier
				version of it, if the secondary index record
				through which we do the search is
				delete-marked. */

				err = row_undo_mod_del_mark_or_remove_sec(
					node, thr, index, entry);
				if (err != DB_SUCCESS) {
					mem_heap_free(heap);

					return(err);
				}

				mem_heap_empty(heap);
			}

			/* We may have to update the delete mark in the
			secondary index record of the previous version of
			the row. We also need to update the fields of
			the secondary index record if we updated its fields
			but alphabetically they stayed the same, e.g.,
			'abc' -> 'aBc'. */
			entry = row_build_index_entry(node->undo_row,
						      node->undo_ext,
						      index, heap);
			ut_a(entry);

			err = row_undo_mod_del_unmark_sec_and_undo_update(
				BTR_MODIFY_LEAF, thr, index, entry);
			if (err == DB_FAIL) {
				err = row_undo_mod_del_unmark_sec_and_undo_update(
					BTR_MODIFY_TREE, thr, index, entry);
			}

			if (err != DB_SUCCESS) {
				mem_heap_free(heap);

				return(err);
			}
		}

		node->index = dict_table_get_next_index(node->index);
	}

	mem_heap_free(heap);

	return(DB_SUCCESS);
}