ibool
flst_validate(
/*==========*/
				/* out: TRUE if ok */
	flst_base_node_t*	base,	/* in: pointer to base node of list */
	mtr_t*			mtr1)	/* in: mtr */
{
	ulint		space;
	flst_node_t*	node;
	fil_addr_t	node_addr;
	fil_addr_t	base_addr;
	ulint		len;
	ulint		i;
	mtr_t		mtr2;

	ut_ad(base);
	ut_ad(mtr_memo_contains(mtr1, buf_block_align(base),
				MTR_MEMO_PAGE_X_FIX));

	/* We use two mini-transaction handles: the first is used to
	lock the base node, and prevent other threads from modifying the
	list. The second is used to traverse the list. We cannot run the
	second mtr without committing it at times, because if the list is
	long, then the x-locked pages could fill the buffer resulting in
	a deadlock. */

	/* Find out the space id */
	buf_ptr_get_fsp_addr(base, &space, &base_addr);

	len = flst_get_len(base, mtr1);
	node_addr = flst_get_first(base, mtr1);

	for (i = 0; i < len; i++) {
		mtr_start(&mtr2);

		node = fut_get_ptr(space, node_addr, RW_X_LATCH, &mtr2);
		node_addr = flst_get_next_addr(node, &mtr2);

		mtr_commit(&mtr2); /* Commit mtr2 each round to prevent
				   buffer becoming full */
	}

	ut_a(fil_addr_is_null(node_addr));

	node_addr = flst_get_last(base, mtr1);

	for (i = 0; i < len; i++) {
		mtr_start(&mtr2);

		node = fut_get_ptr(space, node_addr, RW_X_LATCH, &mtr2);
		node_addr = flst_get_prev_addr(node, &mtr2);

		mtr_commit(&mtr2); /* Commit mtr2 each round to prevent
				   buffer becoming full */
	}

	ut_a(fil_addr_is_null(node_addr));

	return(TRUE);
}
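/* A standalone sketch of the validation pattern above: walk a doubly
linked list forward and backward with a short-lived "handle" opened and
closed per node, mirroring how mtr2 is started and committed each round
so that latched pages never accumulate. All names below are
illustrative stand-ins, not InnoDB API. */

#include <assert.h>
#include <stddef.h>

struct list_node {
	struct list_node*	next;
	struct list_node*	prev;
};

static void
list_validate(const struct list_node* first, const struct list_node* last,
	      size_t len)
{
	const struct list_node*	node;
	size_t			i;

	node = first;
	for (i = 0; i < len; i++) {
		/* InnoDB starts a fresh mtr here ... */
		node = node->next;
		/* ... and commits it here, releasing the page latch */
	}
	/* the forward walk must end at the null address */
	assert(node == NULL);

	node = last;
	for (i = 0; i < len; i++) {
		node = node->prev;
	}
	/* the backward walk must end at the null address, too */
	assert(node == NULL);
}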
/**********************************************************************//**
Returns a new table or index id.
@return	the new id */
UNIV_INTERN
dulint
dict_hdr_get_new_id(
/*================*/
	ulint	type)	/*!< in: DICT_HDR_TABLE_ID or DICT_HDR_INDEX_ID */
{
	dict_hdr_t*	dict_hdr;
	dulint		id;
	mtr_t		mtr;

	ut_ad((type == DICT_HDR_TABLE_ID) || (type == DICT_HDR_INDEX_ID));

	mtr_start(&mtr);

	dict_hdr = dict_hdr_get(&mtr);

	id = mtr_read_dulint(dict_hdr + type, &mtr);
	id = ut_dulint_add(id, 1);

	mlog_write_dulint(dict_hdr + type, id, &mtr);

	mtr_commit(&mtr);

	return(id);
}
/***************************************************************//**
Removes a secondary index entry if found.
@return	DB_SUCCESS, DB_FAIL, or DB_OUT_OF_FILE_SPACE */
static
ulint
row_undo_ins_remove_sec_low(
/*========================*/
	ulint		mode,	/*!< in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE,
				depending on whether we wish optimistic or
				pessimistic descent down the index tree */
	dict_index_t*	index,	/*!< in: index */
	dtuple_t*	entry)	/*!< in: index entry to remove */
{
	btr_pcur_t		pcur;
	btr_cur_t*		btr_cur;
	ulint			err;
	mtr_t			mtr;
	enum row_search_result	search_result;

	mtr_start(&mtr);

	btr_cur = btr_pcur_get_btr_cur(&pcur);

	ut_ad(mode == BTR_MODIFY_TREE || mode == BTR_MODIFY_LEAF);

	search_result = row_search_index_entry(index, entry, mode,
					       &pcur, &mtr);

	switch (search_result) {
	case ROW_NOT_FOUND:
		err = DB_SUCCESS;
		goto func_exit;
	case ROW_FOUND:
		break;
	case ROW_BUFFERED:
	case ROW_NOT_DELETED_REF:
		/* These are invalid outcomes, because the mode passed
		to row_search_index_entry() did not include any of the
		flags BTR_INSERT, BTR_DELETE, or BTR_DELETE_MARK. */
		ut_error;
	}

	if (mode == BTR_MODIFY_LEAF) {
		err = btr_cur_optimistic_delete(btr_cur, &mtr)
			? DB_SUCCESS : DB_FAIL;
	} else {
		ut_ad(mode == BTR_MODIFY_TREE);

		/* No need to distinguish RB_RECOVERY here, because we
		are deleting a secondary index record: the distinction
		between RB_NORMAL and RB_RECOVERY only matters when
		deleting a record that contains externally stored
		columns. */
		ut_ad(!dict_index_is_clust(index));
		btr_cur_pessimistic_delete(&err, FALSE, btr_cur,
					   RB_NORMAL, &mtr);
	}
func_exit:
	btr_pcur_close(&pcur);
	mtr_commit(&mtr);

	return(err);
}
/***************************************************************//**
Removes a secondary index entry if found.
@return	DB_SUCCESS, DB_FAIL, or DB_OUT_OF_FILE_SPACE */
static
ulint
row_undo_ins_remove_sec_low(
/*========================*/
	ulint		mode,	/*!< in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE,
				depending on whether we wish optimistic or
				pessimistic descent down the index tree */
	dict_index_t*	index,	/*!< in: index */
	dtuple_t*	entry)	/*!< in: index entry to remove */
{
	btr_pcur_t	pcur;
	btr_cur_t*	btr_cur;
	ibool		found;
	ibool		success;
	ulint		err;
	mtr_t		mtr;

	log_free_check();
	mtr_start(&mtr);

	found = row_search_index_entry(index, entry, mode, &pcur, &mtr);

	btr_cur = btr_pcur_get_btr_cur(&pcur);

	if (!found) {
		/* Not found */

		btr_pcur_close(&pcur);
		mtr_commit(&mtr);

		return(DB_SUCCESS);
	}

	if (mode == BTR_MODIFY_LEAF) {
		success = btr_cur_optimistic_delete(btr_cur, &mtr);

		if (success) {
			err = DB_SUCCESS;
		} else {
			err = DB_FAIL;
		}
	} else {
		ut_ad(mode == BTR_MODIFY_TREE);

		/* No need to distinguish RB_RECOVERY here, because we
		are deleting a secondary index record: the distinction
		between RB_NORMAL and RB_RECOVERY only matters when
		deleting a record that contains externally stored
		columns. */
		ut_ad(!dict_index_is_clust(index));
		btr_cur_pessimistic_delete(&err, FALSE, btr_cur,
					   RB_NORMAL, &mtr);
	}

	btr_pcur_close(&pcur);
	mtr_commit(&mtr);

	return(err);
}
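/* Callers pair the function above with a fallback: try the cheap
BTR_MODIFY_LEAF path first and redo the work with BTR_MODIFY_TREE only
if the optimistic delete fails. A sketch of that caller, modeled on
InnoDB's row_undo_ins_remove_sec(); treat it as an outline that
compiles only within the InnoDB tree, not a verbatim copy. */

static
ulint
row_undo_ins_remove_sec(
/*====================*/
				/* out: DB_SUCCESS or error code */
	dict_index_t*	index,	/* in: index */
	dtuple_t*	entry)	/* in: index entry to remove */
{
	ulint	err;
	ulint	n_tries	= 0;

	/* Try first optimistic descent to the B-tree */

	err = row_undo_ins_remove_sec_low(BTR_MODIFY_LEAF, index, entry);

	if (err == DB_SUCCESS) {

		return(err);
	}
retry:
	/* Try then pessimistic descent to the B-tree */

	err = row_undo_ins_remove_sec_low(BTR_MODIFY_TREE, index, entry);

	/* The delete operation may fail if we have little file space
	left: retry a few times before giving up */

	if (err != DB_SUCCESS && n_tries < BTR_CUR_RETRY_DELETE_N_TIMES) {

		n_tries++;

		os_thread_sleep(BTR_CUR_RETRY_SLEEP_TIME);

		goto retry;
	}

	return(err);
}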
ibool
row_undo_search_clust_to_pcur(
/*==========================*/
				/* out: TRUE if found; NOTE the node->pcur
				must be closed by the caller, regardless of
				the return value */
	undo_node_t*	node)	/* in: row undo node */
{
	dict_index_t*	clust_index;
	ibool		found;
	mtr_t		mtr;
	ibool		ret;
	rec_t*		rec;
	mem_heap_t*	heap		= NULL;
	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
	ulint*		offsets		= offsets_;
	*offsets_ = (sizeof offsets_) / sizeof *offsets_;

	mtr_start(&mtr);

	clust_index = dict_table_get_first_index(node->table);

	found = row_search_on_row_ref(&(node->pcur), BTR_MODIFY_LEAF,
				      node->table, node->ref, &mtr);

	rec = btr_pcur_get_rec(&(node->pcur));

	offsets = rec_get_offsets(rec, clust_index, offsets,
				  ULINT_UNDEFINED, &heap);

	if (!found || 0 != ut_dulint_cmp(node->roll_ptr,
					 row_get_rec_roll_ptr(
						 rec, clust_index, offsets))) {

		/* We must remove the reservation on the undo log record
		BEFORE releasing the latch on the clustered index page: this
		is to make sure that some thread will eventually undo the
		modification corresponding to node->roll_ptr. */

		/* fputs("--------------------undoing a previous version\n",
		stderr); */

		ret = FALSE;
	} else {
		node->row = row_build(ROW_COPY_DATA, clust_index, rec,
				      offsets, node->heap);
		btr_pcur_store_position(&(node->pcur), &mtr);

		ret = TRUE;
	}

	btr_pcur_commit_specify_mtr(&(node->pcur), &mtr);

	if (UNIV_LIKELY_NULL(heap)) {
		mem_heap_free(heap);
	}
	return(ret);
}
mtr_t*
mtr_start_noninline(
/*================*/
			/* out: mtr buffer which also acts as the mtr
			handle */
	mtr_t*	mtr)	/* in: memory buffer for the mtr buffer */
{
	return(mtr_start(mtr));
}
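/* mtr_start() itself is declared UNIV_INLINE and defined in an .ic
file, so the wrapper above exists to give callers that cannot see the
inline definition an ordinary linkable symbol. The idiom in a generic,
self-contained form (the widget names are illustrative): */

/* in a header: inline definition visible to most callers */
static inline int
widget_init(int* w)
{
	*w = 0;
	return(*w);
}

/* in one .c file: out-of-line alias for translation units that must
link against the function instead of inlining it */
int
widget_init_noninline(int* w)
{
	return(widget_init(w));
}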
/***************************************************************//**
Creates an index tree for the index if it is not a member of a cluster.
@return	DB_SUCCESS or DB_OUT_OF_FILE_SPACE */
static
ulint
dict_create_index_tree_step(
/*========================*/
	ind_node_t*	node)	/*!< in: index create node */
{
	dict_index_t*	index;
	dict_table_t*	sys_indexes;
	dict_table_t*	table;
	dtuple_t*	search_tuple;
	ulint		zip_size;
	btr_pcur_t	pcur;
	mtr_t		mtr;

	ut_ad(mutex_own(&(dict_sys->mutex)));

	index = node->index;
	table = node->table;

	sys_indexes = dict_sys->sys_indexes;

	/* Run a mini-transaction in which the index tree is allocated for
	the index and its root address is written to the index entry in
	sys_indexes */

	mtr_start(&mtr);

	search_tuple = dict_create_search_tuple(node->ind_row, node->heap);

	btr_pcur_open(UT_LIST_GET_FIRST(sys_indexes->indexes),
		      search_tuple, PAGE_CUR_L, BTR_MODIFY_LEAF,
		      &pcur, &mtr);

	btr_pcur_move_to_next_user_rec(&pcur, &mtr);

	zip_size = dict_table_zip_size(index->table);

	node->page_no = btr_create(index->type, index->space, zip_size,
				   index->id, index, &mtr);
	/* printf("Created a new index tree in space %lu root page %lu\n",
	index->space, index->page_no); */

	page_rec_write_index_page_no(btr_pcur_get_rec(&pcur),
				     DICT_SYS_INDEXES_PAGE_NO_FIELD,
				     node->page_no, &mtr);

	btr_pcur_close(&pcur);

	mtr_commit(&mtr);

	if (node->page_no == FIL_NULL) {

		return(DB_OUT_OF_FILE_SPACE);
	}

	return(DB_SUCCESS);
}
/*************************************************************************
Checks if possible foreign key constraints hold after a delete of the record
under pcur. NOTE that this function will temporarily commit mtr and lose
pcur position! */
static
ulint
row_upd_check_references_constraints(
/*=================================*/
				/* out: DB_SUCCESS, DB_LOCK_WAIT, or an
				error code */
	btr_pcur_t*	pcur,	/* in: cursor positioned on a record; NOTE:
				the cursor position is lost in this
				function! */
	dict_table_t*	table,	/* in: table in question */
	dict_index_t*	index,	/* in: index of the cursor */
	que_thr_t*	thr,	/* in: query thread */
	mtr_t*		mtr)	/* in: mtr */
{
	dict_foreign_t*	foreign;
	mem_heap_t*	heap;
	dtuple_t*	entry;
	rec_t*		rec;
	ulint		err;

	rec = btr_pcur_get_rec(pcur);

	heap = mem_heap_create(500);

	entry = row_rec_to_index_entry(ROW_COPY_DATA, index, rec, heap);

	mtr_commit(mtr);

	mtr_start(mtr);

	rw_lock_s_lock(&dict_foreign_key_check_lock);

	foreign = UT_LIST_GET_FIRST(table->referenced_list);

	while (foreign) {
		if (foreign->referenced_index == index) {

			err = row_ins_check_foreign_constraint(FALSE, foreign,
							       table, index,
							       entry, thr);
			if (err != DB_SUCCESS) {
				rw_lock_s_unlock(
					&dict_foreign_key_check_lock);
				mem_heap_free(heap);

				return(err);
			}
		}

		foreign = UT_LIST_GET_NEXT(referenced_list, foreign);
	}

	rw_lock_s_unlock(&dict_foreign_key_check_lock);

	mem_heap_free(heap);

	return(DB_SUCCESS);
}
/*******************************************************************
Removes a secondary index entry if found. */
static
ulint
row_undo_ins_remove_sec_low(
/*========================*/
				/* out: DB_SUCCESS, DB_FAIL, or
				DB_OUT_OF_FILE_SPACE */
	ulint		mode,	/* in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE,
				depending on whether we wish optimistic or
				pessimistic descent down the index tree */
	dict_index_t*	index,	/* in: index */
	dtuple_t*	entry)	/* in: index entry to remove */
{
	btr_pcur_t	pcur;
	btr_cur_t*	btr_cur;
	ibool		found;
	ibool		success;
	ulint		err;
	mtr_t		mtr;

	log_free_check();
	mtr_start(&mtr);

	found = row_search_index_entry(index, entry, mode, &pcur, &mtr);

	btr_cur = btr_pcur_get_btr_cur(&pcur);

	if (!found) {
		/* Not found */

		btr_pcur_close(&pcur);
		mtr_commit(&mtr);

		return(DB_SUCCESS);
	}

	if (mode == BTR_MODIFY_LEAF) {
		success = btr_cur_optimistic_delete(btr_cur, &mtr);

		if (success) {
			err = DB_SUCCESS;
		} else {
			err = DB_FAIL;
		}
	} else {
		ut_ad(mode == BTR_MODIFY_TREE);

		btr_cur_pessimistic_delete(&err, FALSE, btr_cur, TRUE, &mtr);
	}

	btr_pcur_close(&pcur);
	mtr_commit(&mtr);

	return(err);
}
ibool
row_undo_search_clust_to_pcur(
/*==========================*/
				/* out: TRUE if found; NOTE the node->pcur
				must be closed by the caller, regardless of
				the return value */
	undo_node_t*	node,	/* in: row undo node */
	que_thr_t*	thr)	/* in: query thread */
{
	dict_index_t*	clust_index;
	ibool		found;
	mtr_t		mtr;
	ibool		ret;
	rec_t*		rec;

	UT_NOT_USED(thr);

	mtr_start(&mtr);

	clust_index = dict_table_get_first_index(node->table);

	found = row_search_on_row_ref(&(node->pcur), BTR_MODIFY_LEAF,
				      node->table, node->ref, &mtr);

	rec = btr_pcur_get_rec(&(node->pcur));

	if (!found || 0 != ut_dulint_cmp(node->roll_ptr,
					 row_get_rec_roll_ptr(
						 rec, clust_index))) {

		/* We must remove the reservation on the undo log record
		BEFORE releasing the latch on the clustered index page: this
		is to make sure that some thread will eventually undo the
		modification corresponding to node->roll_ptr. */

		/* printf("--------------------undoing a previous version\n");
		*/

		ret = FALSE;
	} else {
		node->row = row_build(ROW_COPY_DATA, clust_index, rec,
				      node->heap);
		btr_pcur_store_position(&(node->pcur), &mtr);

		ret = TRUE;
	}

	btr_pcur_commit_specify_mtr(&(node->pcur), &mtr);

	return(ret);
}
/*********************************************************************
Creates a rollback segment.
@return	pointer to new rollback segment if create successful */
UNIV_INTERN
trx_rseg_t*
trx_rseg_create(void)
/*=================*/
{
	mtr_t		mtr;
	ulint		slot_no;
	trx_rseg_t*	rseg = NULL;

	mtr_start(&mtr);

	/* To obey the latching order, acquire the file space
	x-latch before the kernel mutex. */
	mtr_x_lock(fil_space_get_latch(TRX_SYS_SPACE, NULL), &mtr);

	mutex_enter(&kernel_mutex);

	slot_no = trx_sysf_rseg_find_free(&mtr);

	if (slot_no != ULINT_UNDEFINED) {
		ulint		space;
		ulint		page_no;
		ulint		zip_size;
		trx_sysf_t*	sys_header;

		page_no = trx_rseg_header_create(
			TRX_SYS_SPACE, 0, ULINT_MAX, slot_no, &mtr);

		ut_a(page_no != FIL_NULL);
		ut_ad(!trx_rseg_get_on_id(slot_no));

		sys_header = trx_sysf_get(&mtr);

		space = trx_sysf_rseg_get_space(sys_header, slot_no, &mtr);

		zip_size = space ? fil_space_get_zip_size(space) : 0;

		rseg = trx_rseg_mem_create(
			slot_no, space, zip_size, page_no, &mtr);
	}

	mutex_exit(&kernel_mutex);
	mtr_commit(&mtr);

	return(rseg);
}
/*****************************************************************//**
Creates and initializes the data dictionary at the database creation. */
UNIV_INTERN
void
dict_create(void)
/*=============*/
{
	mtr_t	mtr;

	mtr_start(&mtr);

	dict_hdr_create(&mtr);

	mtr_commit(&mtr);

	dict_boot();

	dict_insert_initial_data();
}
/**********************************************************************//**
Returns a new table, index, or space id. */
UNIV_INTERN
void
dict_hdr_get_new_id(
/*================*/
	table_id_t*	table_id,	/*!< out: table id
					(not assigned if NULL) */
	index_id_t*	index_id,	/*!< out: index id
					(not assigned if NULL) */
	ulint*		space_id)	/*!< out: space id
					(not assigned if NULL) */
{
	dict_hdr_t*	dict_hdr;
	ib_id_t		id;
	mtr_t		mtr;

	mtr_start(&mtr);

	dict_hdr = dict_hdr_get(&mtr);

	if (table_id) {
		id = mach_read_from_8(dict_hdr + DICT_HDR_TABLE_ID);
		id++;
		mlog_write_ull(dict_hdr + DICT_HDR_TABLE_ID, id, &mtr);
		*table_id = id;
	}

	if (index_id) {
		id = mach_read_from_8(dict_hdr + DICT_HDR_INDEX_ID);
		id++;
		mlog_write_ull(dict_hdr + DICT_HDR_INDEX_ID, id, &mtr);
		*index_id = id;
	}

	if (space_id) {
		*space_id = mtr_read_ulint(dict_hdr + DICT_HDR_MAX_SPACE_ID,
					   MLOG_4BYTES, &mtr);
		if (fil_assign_new_space_id(space_id)) {
			mlog_write_ulint(dict_hdr + DICT_HDR_MAX_SPACE_ID,
					 *space_id, MLOG_4BYTES, &mtr);
		}
	}

	mtr_commit(&mtr);
}
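/* Both dict_hdr_get_new_id() variants above implement one crash-safe
counter idiom: read the counter from its slot on the dictionary header
page under a mini-transaction, increment it, and write it back through
the redo log (mlog_write_*) before committing. A minimal stand-in model
using plain memory instead of a buffer-pool page; all names here are
illustrative: */

#include <stdint.h>

struct mini_txn { int started; };	/* stand-in for mtr_t */

static void mini_txn_start(struct mini_txn* m)	{ m->started = 1; }
static void mini_txn_commit(struct mini_txn* m)	{ m->started = 0; }

/* stand-in for the DICT_HDR_TABLE_ID field on the header page */
static uint64_t	dict_hdr_table_id;

static uint64_t
get_new_table_id(void)
{
	struct mini_txn	mtr;
	uint64_t	id;

	mini_txn_start(&mtr);

	id = dict_hdr_table_id;		/* mach_read_from_8() in InnoDB */
	id++;
	dict_hdr_table_id = id;		/* mlog_write_ull(): a logged write */

	mini_txn_commit(&mtr);		/* durable once the redo is flushed */

	return(id);
}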
/**********************************************************************//**
Writes the current value of the row id counter to the dictionary header file
page. */
UNIV_INTERN
void
dict_hdr_flush_row_id(void)
/*=======================*/
{
	dict_hdr_t*	dict_hdr;
	dulint		id;
	mtr_t		mtr;

	ut_ad(mutex_own(&(dict_sys->mutex)));

	id = dict_sys->row_id;

	mtr_start(&mtr);

	dict_hdr = dict_hdr_get(&mtr);

	mlog_write_dulint(dict_hdr + DICT_HDR_ROW_ID, id, &mtr);

	mtr_commit(&mtr);
}
/***********************************************************************//**
Position the purge sys "iterator" on the undo record to use for purging. */
static
void
trx_purge_read_undo_rec(
/*====================*/
	trx_purge_t*	purge_sys,	/*!< in/out: purge instance */
	ulint		zip_size)	/*!< in: block size or 0 */
{
	ulint		page_no;
	ulint		offset = 0;
	ib_uint64_t	undo_no = 0;

	purge_sys->hdr_offset = purge_sys->rseg->last_offset;
	page_no = purge_sys->hdr_page_no = purge_sys->rseg->last_page_no;

	if (purge_sys->rseg->last_del_marks) {
		mtr_t		mtr;
		trx_undo_rec_t*	undo_rec;

		mtr_start(&mtr);

		undo_rec = trx_undo_get_first_rec(
			0 /* System space id */, zip_size,
			purge_sys->hdr_page_no,
			purge_sys->hdr_offset, RW_S_LATCH, &mtr);

		if (undo_rec != NULL) {
			offset = page_offset(undo_rec);
			undo_no = trx_undo_rec_get_undo_no(undo_rec);
			page_no = page_get_page_no(page_align(undo_rec));
		}

		mtr_commit(&mtr);
	}

	purge_sys->offset = offset;
	purge_sys->page_no = page_no;
	purge_sys->purge_undo_no = undo_no;

	purge_sys->next_stored = TRUE;
}
/***********************************************************//**
Determines if it is possible to remove a secondary index entry.
Removal is possible if the secondary index entry does not refer to any
not delete marked version of a clustered index record where DB_TRX_ID
is newer than the purge view.

NOTE: This function should only be called by the purge thread, only
while holding a latch on the leaf page of the secondary index entry
(or keeping the buffer pool watch on the page).  It is possible that
this function first returns TRUE and then FALSE, if a user transaction
inserts a record that the secondary index entry would refer to.
However, in that case, the user transaction would also re-insert the
secondary index entry after purge has removed it and released the leaf
page latch.
@return	TRUE if the secondary index record can be purged */
UNIV_INTERN
ibool
row_purge_poss_sec(
/*===============*/
	purge_node_t*	node,	/*!< in/out: row purge node */
	dict_index_t*	index,	/*!< in: secondary index */
	const dtuple_t*	entry)	/*!< in: secondary index entry */
{
	ibool	can_delete;
	mtr_t	mtr;

	ut_ad(!dict_index_is_clust(index));
	mtr_start(&mtr);

	can_delete = !row_purge_reposition_pcur(BTR_SEARCH_LEAF, node, &mtr)
		|| !row_vers_old_has_index_entry(TRUE,
						 btr_pcur_get_rec(&node->pcur),
						 &mtr, index, entry);

	btr_pcur_commit_specify_mtr(&node->pcur, &mtr);

	return(can_delete);
}
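/* A purge caller is expected to position a cursor on the entry and
re-check purgability immediately before deleting, since the answer can
flip while no latch is held. The sketch below is loosely modeled on
InnoDB's row_purge_remove_sec_if_poss_leaf(), simplified (no BTR_DELETE
buffering) and compilable only within the InnoDB tree: */

static
ibool
row_purge_remove_sec_if_poss_leaf_sketch(
/*=====================================*/
	purge_node_t*	node,	/* in/out: row purge node */
	dict_index_t*	index,	/* in: secondary index */
	const dtuple_t*	entry)	/* in: index entry */
{
	mtr_t		mtr;
	btr_pcur_t	pcur;
	ibool		success	= TRUE;

	log_free_check();
	mtr_start(&mtr);

	if (row_search_index_entry(index, entry, BTR_MODIFY_LEAF,
				   &pcur, &mtr) == ROW_FOUND
	    && row_purge_poss_sec(node, index, entry)) {

		/* Delete only while it is still purgable: a user
		transaction may have re-inserted the record meanwhile. */

		success = btr_cur_optimistic_delete(
			btr_pcur_get_btr_cur(&pcur), &mtr);
	}

	btr_pcur_close(&pcur);
	mtr_commit(&mtr);

	return(success);
}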
/***********************************************************************//**
Updates the last not yet purged history log info in rseg when we have purged
a whole undo log. Advances also purge_sys->purge_trx_no past the purged log. */
static
void
trx_purge_rseg_get_next_history_log(
/*================================*/
	trx_rseg_t*	rseg)	/*!< in: rollback segment */
{
	page_t*		undo_page;
	trx_ulogf_t*	log_hdr;
	fil_addr_t	prev_log_addr;
	trx_id_t	trx_no;
	ibool		del_marks;
	mtr_t		mtr;
	rseg_queue_t	rseg_queue;
	const void*	ptr;

	mutex_enter(&(rseg->mutex));

	ut_a(rseg->last_page_no != FIL_NULL);

	purge_sys->purge_trx_no = rseg->last_trx_no + 1;
	purge_sys->purge_undo_no = 0;
	purge_sys->next_stored = FALSE;

	mtr_start(&mtr);

	undo_page = trx_undo_page_get_s_latched(
		rseg->space, rseg->zip_size, rseg->last_page_no, &mtr);

	log_hdr = undo_page + rseg->last_offset;

	/* Increase the purge page count by one for every handled log */

	purge_sys->n_pages_handled++;

	prev_log_addr = trx_purge_get_log_from_hist(
		flst_get_prev_addr(log_hdr + TRX_UNDO_HISTORY_NODE, &mtr));

	if (prev_log_addr.page == FIL_NULL) {
		/* No logs left in the history list */

		rseg->last_page_no = FIL_NULL;

		mutex_exit(&(rseg->mutex));
		mtr_commit(&mtr);

		mutex_enter(&kernel_mutex);

		/* Add debug code to track history list corruption reported
		on the MySQL mailing list on Nov 9, 2004. The fut0lst.c
		file-based list was corrupt. The prev node pointer was
		FIL_NULL, even though the list length was over 8 million
		nodes! We assume that purge truncates the history list in
		large size pieces, and if we here reach the head of the list,
		the list cannot be longer than 2,000,000 undo logs now. */

		if (trx_sys->rseg_history_len > 2000000) {
			ut_print_timestamp(stderr);
			fprintf(stderr,
				"  InnoDB: Warning: purge reached the"
				" head of the history list,\n"
				"InnoDB: but its length is still"
				" reported as %lu! Make a detailed bug\n"
				"InnoDB: report, and submit it"
				" to http://bugs.mysql.com\n",
				(ulong) trx_sys->rseg_history_len);
		}

		mutex_exit(&kernel_mutex);

		return;
	}

	mutex_exit(&(rseg->mutex));
	mtr_commit(&mtr);

	/* Read the trx number and del marks from the previous log header */
	mtr_start(&mtr);

	log_hdr = trx_undo_page_get_s_latched(rseg->space, rseg->zip_size,
					      prev_log_addr.page, &mtr)
		+ prev_log_addr.boffset;

	trx_no = mach_read_from_8(log_hdr + TRX_UNDO_TRX_NO);

	del_marks = mach_read_from_2(log_hdr + TRX_UNDO_DEL_MARKS);

	mtr_commit(&mtr);

	mutex_enter(&(rseg->mutex));

	rseg->last_page_no = prev_log_addr.page;
	rseg->last_offset = prev_log_addr.boffset;
	rseg->last_trx_no = trx_no;
	rseg->last_del_marks = del_marks;

	rseg_queue.rseg = rseg;
	rseg_queue.trx_no = rseg->last_trx_no;

	/* Purge can also produce events, but these are already ordered
	in the rollback segment and any user-generated event will be greater
	than the events that purge produces, i.e. purge can never produce
	events from an empty rollback segment. */

	mutex_enter(&purge_sys->bh_mutex);

	ptr = ib_bh_push(purge_sys->ib_bh, &rseg_queue);
	ut_a(ptr != NULL);

	mutex_exit(&purge_sys->bh_mutex);

	mutex_exit(&(rseg->mutex));
}
/********************************************************************//**
Removes unnecessary history data from a rollback segment. */
static
void
trx_purge_truncate_rseg_history(
/*============================*/
	trx_rseg_t*	rseg,		/*!< in: rollback segment */
	trx_id_t	limit_trx_no,	/*!< in: remove update undo logs whose
					trx number is < limit_trx_no */
	undo_no_t	limit_undo_no)	/*!< in: if transaction number is equal
					to limit_trx_no, truncate undo records
					with undo number < limit_undo_no */
{
	fil_addr_t	hdr_addr;
	fil_addr_t	prev_hdr_addr;
	trx_rsegf_t*	rseg_hdr;
	page_t*		undo_page;
	trx_ulogf_t*	log_hdr;
	trx_usegf_t*	seg_hdr;
	ulint		n_removed_logs	= 0;
	mtr_t		mtr;
	trx_id_t	undo_trx_no;

	mtr_start(&mtr);
	mutex_enter(&(rseg->mutex));

	rseg_hdr = trx_rsegf_get(rseg->space, rseg->zip_size,
				 rseg->page_no, &mtr);

	hdr_addr = trx_purge_get_log_from_hist(
		flst_get_last(rseg_hdr + TRX_RSEG_HISTORY, &mtr));
loop:
	if (hdr_addr.page == FIL_NULL) {

		mutex_exit(&(rseg->mutex));
		mtr_commit(&mtr);

		return;
	}

	undo_page = trx_undo_page_get(rseg->space, rseg->zip_size,
				      hdr_addr.page, &mtr);

	log_hdr = undo_page + hdr_addr.boffset;

	undo_trx_no = mach_read_from_8(log_hdr + TRX_UNDO_TRX_NO);

	if (undo_trx_no >= limit_trx_no) {
		if (undo_trx_no == limit_trx_no) {
			trx_undo_truncate_start(rseg, rseg->space,
						hdr_addr.page,
						hdr_addr.boffset,
						limit_undo_no);
		}

		mutex_enter(&kernel_mutex);
		ut_a(trx_sys->rseg_history_len >= n_removed_logs);
		trx_sys->rseg_history_len -= n_removed_logs;
		mutex_exit(&kernel_mutex);

		flst_truncate_end(rseg_hdr + TRX_RSEG_HISTORY,
				  log_hdr + TRX_UNDO_HISTORY_NODE,
				  n_removed_logs, &mtr);

		mutex_exit(&(rseg->mutex));
		mtr_commit(&mtr);

		return;
	}

	prev_hdr_addr = trx_purge_get_log_from_hist(
		flst_get_prev_addr(log_hdr + TRX_UNDO_HISTORY_NODE, &mtr));
	n_removed_logs++;

	seg_hdr = undo_page + TRX_UNDO_SEG_HDR;

	if ((mach_read_from_2(seg_hdr + TRX_UNDO_STATE) == TRX_UNDO_TO_PURGE)
	    && (mach_read_from_2(log_hdr + TRX_UNDO_NEXT_LOG) == 0)) {

		/* We can free the whole log segment */

		mutex_exit(&(rseg->mutex));
		mtr_commit(&mtr);

		trx_purge_free_segment(rseg, hdr_addr, n_removed_logs);

		n_removed_logs = 0;
	} else {
		mutex_exit(&(rseg->mutex));
		mtr_commit(&mtr);
	}

	mtr_start(&mtr);
	mutex_enter(&(rseg->mutex));

	rseg_hdr = trx_rsegf_get(rseg->space, rseg->zip_size,
				 rseg->page_no, &mtr);

	hdr_addr = prev_hdr_addr;

	goto loop;
}
/**********************************************************************//**
Frees an undo log segment which is in the history list. Cuts the end of the
history list at the youngest undo log in this segment. */
static
void
trx_purge_free_segment(
/*===================*/
	trx_rseg_t*	rseg,		/*!< in: rollback segment */
	fil_addr_t	hdr_addr,	/*!< in: the file address of log_hdr */
	ulint		n_removed_logs)	/*!< in: count of how many undo logs
					we will cut off from the end of the
					history list */
{
	page_t*		undo_page;
	trx_rsegf_t*	rseg_hdr;
	trx_ulogf_t*	log_hdr;
	trx_usegf_t*	seg_hdr;
	ibool		freed;
	ulint		seg_size;
	ulint		hist_size;
	ibool		marked		= FALSE;
	mtr_t		mtr;

	/*	fputs("Freeing an update undo log segment\n", stderr); */

loop:
	mtr_start(&mtr);
	mutex_enter(&(rseg->mutex));

	rseg_hdr = trx_rsegf_get(rseg->space, rseg->zip_size,
				 rseg->page_no, &mtr);

	undo_page = trx_undo_page_get(rseg->space, rseg->zip_size,
				      hdr_addr.page, &mtr);
	seg_hdr = undo_page + TRX_UNDO_SEG_HDR;
	log_hdr = undo_page + hdr_addr.boffset;

	/* Mark the last undo log totally purged, so that if the system
	crashes, the tail of the undo log will not get accessed again. The
	list of pages in the undo log tail gets inconsistent during the
	freeing of the segment, and therefore purge should not try to access
	them again. */

	if (!marked) {
		mlog_write_ulint(log_hdr + TRX_UNDO_DEL_MARKS, FALSE,
				 MLOG_2BYTES, &mtr);
		marked = TRUE;
	}

	freed = fseg_free_step_not_header(seg_hdr + TRX_UNDO_FSEG_HEADER,
					  &mtr);
	if (!freed) {
		mutex_exit(&(rseg->mutex));
		mtr_commit(&mtr);

		goto loop;
	}

	/* The page list may now be inconsistent, but the length field
	stored in the list base node tells us how big it was before we
	started the freeing. */

	seg_size = flst_get_len(seg_hdr + TRX_UNDO_PAGE_LIST, &mtr);

	/* We may free the undo log segment header page; it must be freed
	within the same mtr as the undo log header is removed from the
	history list: otherwise, in case of a database crash, the segment
	could become inaccessible garbage in the file space. */

	flst_cut_end(rseg_hdr + TRX_RSEG_HISTORY,
		     log_hdr + TRX_UNDO_HISTORY_NODE, n_removed_logs, &mtr);

	mutex_enter(&kernel_mutex);
	ut_ad(trx_sys->rseg_history_len >= n_removed_logs);
	trx_sys->rseg_history_len -= n_removed_logs;
	mutex_exit(&kernel_mutex);

	freed = FALSE;

	while (!freed) {
		/* Here we assume that a file segment with just the header
		page can be freed in a few steps, so that the buffer pool
		is not flooded with bufferfixed pages: see the note in
		fsp0fsp.c. */

		freed = fseg_free_step(seg_hdr + TRX_UNDO_FSEG_HEADER,
				       &mtr);
	}

	hist_size = mtr_read_ulint(rseg_hdr + TRX_RSEG_HISTORY_SIZE,
				   MLOG_4BYTES, &mtr);
	ut_ad(hist_size >= seg_size);

	mlog_write_ulint(rseg_hdr + TRX_RSEG_HISTORY_SIZE,
			 hist_size - seg_size, MLOG_4BYTES, &mtr);

	ut_ad(rseg->curr_size >= seg_size);

	rseg->curr_size -= seg_size;

	mutex_exit(&(rseg->mutex));

	mtr_commit(&mtr);
}
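/* The freeing idiom above, in a self-contained model: release a
resource one bounded step at a time, ending and restarting the
surrounding scope (the mtr) whenever a step reports it is not finished,
so that no single scope pins an unbounded number of pages. Names are
illustrative stand-ins for fseg_free_step() and friends: */

#include <stdbool.h>

struct segment { int pages_left; };	/* a segment with N pages to free */

/* Frees at most one page; returns true when the segment is empty. */
static bool
segment_free_step(struct segment* seg)
{
	if (seg->pages_left > 0) {
		seg->pages_left--;
	}
	return(seg->pages_left == 0);
}

static void
segment_free_all(struct segment* seg)
{
	bool	finished;

	do {
		/* mtr_start() would go here */
		finished = segment_free_step(seg);
		/* mtr_commit() here releases whatever the step latched */
	} while (!finished);
}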
/****************************************************************//**
Commits a transaction. */
UNIV_INTERN
void
trx_commit_off_kernel(
/*==================*/
	trx_t*	trx)	/*!< in: transaction */
{
	page_t*		update_hdr_page;
	ib_uint64_t	lsn		= 0;
	trx_rseg_t*	rseg;
	trx_undo_t*	undo;
	mtr_t		mtr;

	ut_ad(mutex_own(&kernel_mutex));

	trx->must_flush_log_later = FALSE;

	rseg = trx->rseg;

	if (trx->insert_undo != NULL || trx->update_undo != NULL) {
		mutex_exit(&kernel_mutex);

		mtr_start(&mtr);

		/* Change the undo log segment states from TRX_UNDO_ACTIVE
		to some other state: these modifications to the file data
		structure define the transaction as committed in the file
		based world, at the serialization point of the log sequence
		number lsn obtained below. */

		mutex_enter(&(rseg->mutex));

		if (trx->insert_undo != NULL) {
			trx_undo_set_state_at_finish(
				rseg, trx, trx->insert_undo, &mtr);
		}

		undo = trx->update_undo;

		if (undo) {
			mutex_enter(&kernel_mutex);
			trx->no = trx_sys_get_new_trx_no();

			mutex_exit(&kernel_mutex);

			/* It is not necessary to obtain trx->undo_mutex here
			because only a single OS thread is allowed to do the
			transaction commit for this transaction. */

			update_hdr_page = trx_undo_set_state_at_finish(
				rseg, trx, undo, &mtr);

			/* We have to do the cleanup for the update log while
			holding the rseg mutex because update log headers
			have to be put to the history list in the order of
			the trx number. */

			trx_undo_update_cleanup(trx, update_hdr_page, &mtr);
		}

		mutex_exit(&(rseg->mutex));

		/* Update the latest MySQL binlog name and offset info
		in trx sys header if MySQL binlogging is on or the database
		server is a MySQL replication slave */

		if (trx->mysql_log_file_name
		    && trx->mysql_log_file_name[0] != '\0') {
			trx_sys_update_mysql_binlog_offset(
				trx->mysql_log_file_name,
				trx->mysql_log_offset,
				TRX_SYS_MYSQL_LOG_INFO, &mtr);
			trx->mysql_log_file_name = NULL;
		}

		/* The following call commits the mini-transaction, making the
		whole transaction committed in the file-based world, at this
		log sequence number. The transaction becomes 'durable' when
		we write the log to disk, but in the logical sense the commit
		in the file-based data structures (undo logs etc.) happens
		here.

		NOTE that transaction numbers, which are assigned only to
		transactions with an update undo log, do not necessarily come
		in exactly the same order as commit lsn's, if the transactions
		have different rollback segments. To get exactly the same
		order we should hold the kernel mutex up to this point,
		adding to the contention of the kernel mutex. However, if
		a transaction T2 is able to see modifications made by
		a transaction T1, T2 will always get a bigger transaction
		number and a bigger commit lsn than T1. */

		/*--------------*/
		mtr_commit(&mtr);
		/*--------------*/
		lsn = mtr.end_lsn;

		mutex_enter(&kernel_mutex);
	}

	ut_ad(trx->conc_state == TRX_ACTIVE
	      || trx->conc_state == TRX_PREPARED);
	ut_ad(mutex_own(&kernel_mutex));

	/* The following assignment makes the transaction committed in memory
	and makes its changes to data visible to other transactions.
	NOTE that there is a small discrepancy from the strict formal
	visibility rules here: a human user of the database can see
	modifications made by another transaction T even before the necessary
	log segment has been flushed to the disk. If the database happens to
	crash before the flush, the user has seen modifications from T which
	will never be a committed transaction. However, any transaction T2
	which sees the modifications of the committing transaction T, and
	which also itself makes modifications to the database, will get an
	lsn larger than the committing transaction T. In the case where the
	log flush fails, and T never gets committed, also T2 will never get
	committed. */

	/*--------------------------------------*/
	trx->conc_state = TRX_COMMITTED_IN_MEMORY;
	/*--------------------------------------*/

	/* If we release kernel_mutex below and we are still doing
	recovery, i.e. the background rollback thread is still active,
	then there is a chance that the rollback thread may see
	this trx as COMMITTED_IN_MEMORY and goes ahead to clean it
	up calling trx_cleanup_at_db_startup(). This can happen
	in the case where we are committing a trx here that is left in
	PREPARED state during the crash. Note that commit of the
	rollback of a PREPARED trx happens in the recovery thread
	while the rollback of other transactions happen in the
	background thread. To avoid this race we unconditionally
	unset the is_recovered flag from the trx. */

	trx->is_recovered = FALSE;

	lock_release_off_kernel(trx);

	if (trx->global_read_view) {
		read_view_close(trx->global_read_view);
		mem_heap_empty(trx->global_read_view_heap);
		trx->global_read_view = NULL;
	}

	trx->read_view = NULL;

	if (lsn) {

		mutex_exit(&kernel_mutex);

		if (trx->insert_undo != NULL) {

			trx_undo_insert_cleanup(trx);
		}

		/* NOTE that we could possibly make a group commit more
		efficient here: call os_thread_yield here to allow also other
		trxs to come to commit! */

		/*-------------------------------------*/

		/* Depending on the my.cnf options, we may now write the log
		buffer to the log files, making the transaction durable if
		the OS does not crash. We may also flush the log files to
		disk, making the transaction durable also at an OS crash or a
		power outage.

		The idea in InnoDB's group commit is that a group of
		transactions gather behind a trx doing a physical disk write
		to log files, and when that physical write has been completed,
		one of those transactions does a write which commits the whole
		group. Note that this group commit will only bring benefit if
		there are > 2 users in the database. Then at least 2 users can
		gather behind one doing the physical log write to disk.

		If we are calling trx_commit() under prepare_commit_mutex, we
		will delay possible log write and flush to a separate function
		trx_commit_complete_for_mysql(), which is only called when the
		thread has released the mutex. This is to make the group
		commit algorithm work. Otherwise, the prepare_commit mutex
		would serialize all commits and prevent a group of
		transactions from gathering. */

		if (trx->flush_log_later) {
			/* Do nothing yet */
			trx->must_flush_log_later = TRUE;
		} else if (srv_flush_log_at_trx_commit == 0) {
			/* Do nothing */
		} else if (srv_flush_log_at_trx_commit == 1) {
			if (srv_unix_file_flush_method == SRV_UNIX_NOSYNC) {
				/* Write the log but do not flush it to
				disk */

				log_write_up_to(lsn, LOG_WAIT_ONE_GROUP,
						FALSE);
			} else {
				/* Write the log to the log files AND flush
				them to disk */

				log_write_up_to(lsn, LOG_WAIT_ONE_GROUP,
						TRUE);
			}
		} else if (srv_flush_log_at_trx_commit == 2) {

			/* Write the log but do not flush it to disk */

			log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, FALSE);
		} else {
			ut_error;
		}

		trx->commit_lsn = lsn;

		/*-------------------------------------*/

		mutex_enter(&kernel_mutex);
	}

	/* Free all savepoints */
	trx_roll_free_all_savepoints(trx);

	trx->conc_state = TRX_NOT_STARTED;
	trx->rseg = NULL;
	trx->undo_no = ut_dulint_zero;
	trx->last_sql_stat_start.least_undo_no = ut_dulint_zero;

	ut_ad(UT_LIST_GET_LEN(trx->wait_thrs) == 0);
	ut_ad(UT_LIST_GET_LEN(trx->trx_locks) == 0);

	UT_LIST_REMOVE(trx_list, trx_sys->trx_list, trx);
}
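/* The commit-time flush policy at the end of trx_commit_off_kernel()
reduces to a small decision table. A standalone restatement of just
that policy; log_write_up_to() and the srv_* variables are as in the
code above, while the wrapper name itself is illustrative: */

static void
trx_flush_log_if_needed(ib_uint64_t lsn, ibool flush_log_later)
{
	if (flush_log_later) {
		/* caller holds prepare_commit_mutex: defer the write to
		trx_commit_complete_for_mysql() so group commit still
		works */
		return;
	}

	switch (srv_flush_log_at_trx_commit) {
	case 0:
		/* neither write nor flush at commit */
		break;
	case 1:
		/* write, and also flush unless fsync is disabled */
		log_write_up_to(lsn, LOG_WAIT_ONE_GROUP,
				srv_unix_file_flush_method
				!= SRV_UNIX_NOSYNC);
		break;
	case 2:
		/* write the log but do not flush it to disk */
		log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, FALSE);
		break;
	default:
		ut_error;
	}
}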
/****************************************************************//**
Prepares a transaction. */
UNIV_INTERN
void
trx_prepare_off_kernel(
/*===================*/
	trx_t*	trx)	/*!< in: transaction */
{
	trx_rseg_t*	rseg;
	ib_uint64_t	lsn	= 0;
	mtr_t		mtr;

	ut_ad(mutex_own(&kernel_mutex));

	rseg = trx->rseg;

	if (trx->insert_undo != NULL || trx->update_undo != NULL) {

		mutex_exit(&kernel_mutex);

		mtr_start(&mtr);

		/* Change the undo log segment states from TRX_UNDO_ACTIVE
		to TRX_UNDO_PREPARED: these modifications to the file data
		structure define the transaction as prepared in the
		file-based world, at the serialization point of lsn. */

		mutex_enter(&(rseg->mutex));

		if (trx->insert_undo != NULL) {

			/* It is not necessary to obtain trx->undo_mutex here
			because only a single OS thread is allowed to do the
			transaction prepare for this transaction. */

			trx_undo_set_state_at_prepare(trx, trx->insert_undo,
						      &mtr);
		}

		if (trx->update_undo) {
			trx_undo_set_state_at_prepare(
				trx, trx->update_undo, &mtr);
		}

		mutex_exit(&(rseg->mutex));

		/*--------------*/
		mtr_commit(&mtr);	/* This mtr commit makes the
					transaction prepared in the
					file-based world */
		/*--------------*/
		lsn = mtr.end_lsn;

		mutex_enter(&kernel_mutex);
	}

	ut_ad(mutex_own(&kernel_mutex));

	/*--------------------------------------*/
	trx->conc_state = TRX_PREPARED;
	/*--------------------------------------*/

	if (lsn) {
		/* Depending on the my.cnf options, we may now write the log
		buffer to the log files, making the prepared state of the
		transaction durable if the OS does not crash. We may also
		flush the log files to disk, making the prepared state of the
		transaction durable also at an OS crash or a power outage.

		The idea in InnoDB's group prepare is that a group of
		transactions gather behind a trx doing a physical disk write
		to log files, and when that physical write has been completed,
		one of those transactions does a write which prepares the
		whole group. Note that this group prepare will only bring
		benefit if there are > 2 users in the database. Then at least
		2 users can gather behind one doing the physical log write to
		disk.

		TODO: find out if MySQL holds some mutex when calling this.
		That would spoil our group prepare algorithm. */

		mutex_exit(&kernel_mutex);

		if (srv_flush_log_at_trx_commit == 0) {
			/* Do nothing */
		} else if (srv_flush_log_at_trx_commit == 1) {
			if (srv_unix_file_flush_method == SRV_UNIX_NOSYNC) {
				/* Write the log but do not flush it to
				disk */

				log_write_up_to(lsn, LOG_WAIT_ONE_GROUP,
						FALSE);
			} else {
				/* Write the log to the log files AND flush
				them to disk */

				log_write_up_to(lsn, LOG_WAIT_ONE_GROUP,
						TRUE);
			}
		} else if (srv_flush_log_at_trx_commit == 2) {

			/* Write the log but do not flush it to disk */

			log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, FALSE);
		} else {
			ut_error;
		}

		mutex_enter(&kernel_mutex);
	}
}
/*******************************************************************//**
Fills the "lock_data" member of i_s_locks_row_t object.
If memory can not be allocated then FALSE is returned.
@return	FALSE if allocation fails */
static
ibool
fill_lock_data(
/*===========*/
	const char**		lock_data,/*!< out: "lock_data" to fill */
	const lock_t*		lock,	/*!< in: lock used to find the data */
	ulint			heap_no,/*!< in: rec num used to find the
					data */
	trx_i_s_cache_t*	cache)	/*!< in/out: cache where to store
					volatile data */
{
	mtr_t			mtr;

	const buf_block_t*	block;
	const page_t*		page;
	const rec_t*		rec;

	ut_a(lock_get_type(lock) == LOCK_REC);

	mtr_start(&mtr);

	block = buf_page_try_get(lock_rec_get_space_id(lock),
				 lock_rec_get_page_no(lock),
				 &mtr);

	if (block == NULL) {

		*lock_data = NULL;

		mtr_commit(&mtr);

		return(TRUE);
	}

	page = (const page_t*) buf_block_get_frame(block);

	rec = page_find_rec_with_heap_no(page, heap_no);

	if (page_rec_is_infimum(rec)) {

		*lock_data = ha_storage_put_str_memlim(
			cache->storage, "infimum pseudo-record",
			MAX_ALLOWED_FOR_STORAGE(cache));
	} else if (page_rec_is_supremum(rec)) {

		*lock_data = ha_storage_put_str_memlim(
			cache->storage, "supremum pseudo-record",
			MAX_ALLOWED_FOR_STORAGE(cache));
	} else {

		const dict_index_t*	index;
		ulint			n_fields;
		mem_heap_t*		heap;
		ulint			offsets_onstack[REC_OFFS_NORMAL_SIZE];
		ulint*			offsets;
		char			buf[TRX_I_S_LOCK_DATA_MAX_LEN];
		ulint			buf_used;
		ulint			i;

		rec_offs_init(offsets_onstack);
		offsets = offsets_onstack;

		index = lock_rec_get_index(lock);

		n_fields = dict_index_get_n_unique(index);

		ut_a(n_fields > 0);

		heap = NULL;
		offsets = rec_get_offsets(rec, index, offsets, n_fields,
					  &heap);

		/* format and store the data */

		buf_used = 0;
		for (i = 0; i < n_fields; i++) {

			buf_used += put_nth_field(
				buf + buf_used, sizeof(buf) - buf_used,
				i, index, rec, offsets) - 1;
		}

		*lock_data = (const char*) ha_storage_put_memlim(
			cache->storage, buf, buf_used + 1,
			MAX_ALLOWED_FOR_STORAGE(cache));

		if (UNIV_UNLIKELY(heap != NULL)) {

			/* this means that rec_get_offsets() has created a new
			heap and has stored offsets in it; check that this is
			really the case and free the heap */

			ut_a(offsets != offsets_onstack);

			mem_heap_free(heap);
		}
	}

	mtr_commit(&mtr);

	if (*lock_data == NULL) {

		return(FALSE);
	}

	return(TRUE);
}
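/* fill_lock_data() uses InnoDB's common "offsets on the stack unless
the record is too wide" idiom: rec_get_offsets() fills the caller's
array when it fits, and otherwise allocates a heap that the caller must
free afterwards. A generic model of that contract (the names and the
malloc backing are illustrative): */

#include <stdlib.h>
#include <stddef.h>

enum { ON_STACK_CAP = 16 };

/* Returns buf if n fits, else a freshly allocated array; *heap_out is
set to the allocation (or NULL) so the caller knows what to free. */
static int*
get_offsets(int* buf, size_t cap, size_t n, int** heap_out)
{
	*heap_out = NULL;

	if (n > cap) {
		*heap_out = malloc(n * sizeof(int));
		return(*heap_out);
	}

	return(buf);
}

static void
use_offsets(size_t n_fields)
{
	int	offsets_onstack[ON_STACK_CAP];
	int*	heap;
	int*	offsets = get_offsets(offsets_onstack, ON_STACK_CAP,
				      n_fields, &heap);

	(void) offsets;		/* ... format the fields here ... */

	if (heap != NULL) {
		/* as in fill_lock_data(): free only if a heap was made */
		free(heap);
	}
}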
ulint
dict_truncate_index_tree(
/*=====================*/
				/* out: new root page number, or
				FIL_NULL on failure */
	dict_table_t*	table,	/* in: the table the index belongs to */
	btr_pcur_t*	pcur,	/* in/out: persistent cursor pointing to
				record in the clustered index of
				SYS_INDEXES table. The cursor may be
				repositioned in this call. */
	mtr_t*		mtr)	/* in: mtr having the latch
				on the record page. The mtr may be
				committed and restarted in this call. */
{
	ulint		root_page_no;
	ulint		space;
	ulint		type;
	dulint		index_id;
	rec_t*		rec;
	byte*		ptr;
	ulint		len;
	ulint		comp;
	dict_index_t*	index;

	ut_ad(mutex_own(&(dict_sys->mutex)));
	ut_a(!dict_table_is_comp(dict_sys->sys_indexes));
	rec = btr_pcur_get_rec(pcur);
	ptr = rec_get_nth_field_old(rec, DICT_SYS_INDEXES_PAGE_NO_FIELD,
				    &len);

	ut_ad(len == 4);

	root_page_no = mtr_read_ulint(ptr, MLOG_4BYTES, mtr);

	if (root_page_no == FIL_NULL) {
		/* The tree has been freed. */

		ut_print_timestamp(stderr);
		fprintf(stderr, "  InnoDB: Trying to TRUNCATE"
			" a missing index of table %s!\n", table->name);
		return(FIL_NULL);
	}

	ptr = rec_get_nth_field_old(rec,
				    DICT_SYS_INDEXES_SPACE_NO_FIELD, &len);

	ut_ad(len == 4);

	space = mtr_read_ulint(ptr, MLOG_4BYTES, mtr);

	if (!fil_tablespace_exists_in_mem(space)) {
		/* It is a single table tablespace and the .ibd file is
		missing: do nothing */

		ut_print_timestamp(stderr);
		fprintf(stderr, "  InnoDB: Trying to TRUNCATE"
			" a missing .ibd file of table %s!\n", table->name);
		return(FIL_NULL);
	}

	ptr = rec_get_nth_field_old(rec,
				    DICT_SYS_INDEXES_TYPE_FIELD, &len);
	ut_ad(len == 4);
	type = mach_read_from_4(ptr);

	ptr = rec_get_nth_field_old(rec, 1, &len);
	ut_ad(len == 8);
	index_id = mach_read_from_8(ptr);

	/* We free all the pages but the root page first; this operation
	may span several mini-transactions */

	btr_free_but_not_root(space, root_page_no);

	/* Then we free the root page in the same mini-transaction where
	we create the b-tree and write its new root page number to the
	appropriate field in the SYS_INDEXES record: this mini-transaction
	marks the B-tree totally truncated */

	comp = page_is_comp(btr_page_get(space, root_page_no, RW_X_LATCH,
					 mtr));

	btr_free_root(space, root_page_no, mtr);

	/* We will temporarily write FIL_NULL to the PAGE_NO field
	in SYS_INDEXES, so that the database will not get into an
	inconsistent state in case it crashes between the mtr_commit()
	below and the following mtr_commit() call. */

	page_rec_write_index_page_no(rec, DICT_SYS_INDEXES_PAGE_NO_FIELD,
				     FIL_NULL, mtr);

	/* We will need to commit the mini-transaction in order to avoid
	deadlocks in the btr_create() call, because otherwise we would
	be freeing and allocating pages in the same mini-transaction. */

	btr_pcur_store_position(pcur, mtr);
	mtr_commit(mtr);

	mtr_start(mtr);
	btr_pcur_restore_position(BTR_MODIFY_LEAF, pcur, mtr);

	/* Find the index corresponding to this SYS_INDEXES record. */
	for (index = UT_LIST_GET_FIRST(table->indexes);
	     index;
	     index = UT_LIST_GET_NEXT(indexes, index)) {
		if (!ut_dulint_cmp(index->id, index_id)) {
			break;
		}
	}

	root_page_no = btr_create(type, space, index_id, comp, mtr);

	if (index) {
		index->page = (unsigned int) root_page_no;
	} else {
		ut_print_timestamp(stderr);
		fprintf(stderr,
			"  InnoDB: Index %lu %lu of table %s is missing\n"
			"InnoDB: from the data dictionary during TRUNCATE!\n",
			ut_dulint_get_high(index_id),
			ut_dulint_get_low(index_id),
			table->name);
	}

	return(root_page_no);
}
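/* The store/commit/restart/restore sequence above is a general InnoDB
idiom for spanning one logical operation across two mtrs, for example
to avoid freeing and allocating pages inside the same mtr. A
self-contained model with stand-in types; in InnoDB the four steps are
btr_pcur_store_position(), mtr_commit(), mtr_start() and
btr_pcur_restore_position(): */

struct cursor {
	long	stored_key;	/* logical position remembered */
	long	current_key;	/* position under the current latches */
};

static void
cursor_store_position(struct cursor* c)
{
	c->stored_key = c->current_key;
}

static void
cursor_restore_position(struct cursor* c)
{
	/* re-search by key: the record may have moved while no latches
	were held, but the logical position is found again */
	c->current_key = c->stored_key;
}

static void
do_work_in_two_scopes(struct cursor* c)
{
	cursor_store_position(c);
	/* mtr_commit(): all page latches released; the tree may change */

	/* mtr_start(): a fresh mtr */
	cursor_restore_position(c);
	/* safe to continue: latches are held again from here on */
}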
/*******************************************************************
Builds a table definition to insert. */
static
ulint
dict_build_table_def_step(
/*======================*/
				/* out: DB_SUCCESS or error code */
	que_thr_t*	thr,	/* in: query thread */
	tab_node_t*	node)	/* in: table create node */
{
	dict_table_t*	table;
	dtuple_t*	row;
	ulint		error;
	const char*	path_or_name;
	ibool		is_path;
	mtr_t		mtr;
	ulint		i;
	ulint		row_len;

	ut_ad(mutex_own(&(dict_sys->mutex)));

	table = node->table;

	table->id = dict_hdr_get_new_id(DICT_HDR_TABLE_ID);

	thr_get_trx(thr)->table_id = table->id;

	row_len = 0;
	for (i = 0; i < table->n_def; i++) {
		row_len += dict_col_get_min_size(&table->cols[i]);
	}

	if (row_len > BTR_PAGE_MAX_REC_SIZE) {
		return(DB_TOO_BIG_RECORD);
	}

	if (srv_file_per_table) {
		/* We create a new single-table tablespace for the table.
		We initially let it be 4 pages:
		- page 0 is the fsp header and an extent descriptor page,
		- page 1 is an ibuf bitmap page,
		- page 2 is the first inode page,
		- page 3 will contain the root of the clustered index of the
		table we create here. */

		ulint	space = 0;	/* reset to zero for the call below */

		if (table->dir_path_of_temp_table) {
			/* We place tables created with CREATE TEMPORARY
			TABLE in the tmp dir of mysqld server */

			path_or_name = table->dir_path_of_temp_table;
			is_path = TRUE;
		} else {
			path_or_name = table->name;
			is_path = FALSE;
		}

		error = fil_create_new_single_table_tablespace(
			&space, path_or_name, is_path,
			FIL_IBD_FILE_INITIAL_SIZE);
		table->space = (unsigned int) space;

		if (error != DB_SUCCESS) {

			return(error);
		}

		mtr_start(&mtr);

		fsp_header_init(table->space, FIL_IBD_FILE_INITIAL_SIZE,
				&mtr);

		mtr_commit(&mtr);
	}

	row = dict_create_sys_tables_tuple(table, node->heap);

	ins_node_set_new_row(node->tab_def, row);

	return(DB_SUCCESS);
}
/*******************************************************************
Removes a clustered index record. The pcur in node was positioned on the
record, now it is detached. */
static
ulint
row_undo_ins_remove_clust_rec(
/*==========================*/
				/* out: DB_SUCCESS or DB_OUT_OF_FILE_SPACE */
	undo_node_t*	node,	/* in: undo node */
	que_thr_t*	thr)	/* in: query thread */
{
	btr_cur_t*	btr_cur;
	ibool		success;
	ulint		err;
	ulint		n_tries		= 0;
	mtr_t		mtr;

	UT_NOT_USED(thr);

	mtr_start(&mtr);

	success = btr_pcur_restore_position(BTR_MODIFY_LEAF, &(node->pcur),
					    &mtr);
	ut_a(success);

	if (ut_dulint_cmp(node->table->id, DICT_INDEXES_ID) == 0) {

		/* Drop the index tree associated with the row in
		SYS_INDEXES table: */

		dict_drop_index_tree(btr_pcur_get_rec(&(node->pcur)), &mtr);

		mtr_commit(&mtr);

		mtr_start(&mtr);

		success = btr_pcur_restore_position(BTR_MODIFY_LEAF,
						    &(node->pcur), &mtr);
		ut_a(success);
	}

	btr_cur = btr_pcur_get_btr_cur(&(node->pcur));

	success = btr_cur_optimistic_delete(btr_cur, &mtr);

	btr_pcur_commit_specify_mtr(&(node->pcur), &mtr);

	if (success) {
		trx_undo_rec_release(node->trx, node->undo_no);

		return(DB_SUCCESS);
	}
retry:
	/* If did not succeed, try pessimistic descent to tree */
	mtr_start(&mtr);

	success = btr_pcur_restore_position(BTR_MODIFY_TREE,
					    &(node->pcur), &mtr);
	ut_a(success);

	btr_cur_pessimistic_delete(&err, FALSE, btr_cur, TRUE, &mtr);

	/* The delete operation may fail if we have little
	file space left: TODO: easiest to crash the database
	and restart with more file space */

	if (err == DB_OUT_OF_FILE_SPACE
	    && n_tries < BTR_CUR_RETRY_DELETE_N_TIMES) {

		btr_pcur_commit_specify_mtr(&(node->pcur), &mtr);

		n_tries++;

		os_thread_sleep(BTR_CUR_RETRY_SLEEP_TIME);

		goto retry;
	}

	btr_pcur_commit_specify_mtr(&(node->pcur), &mtr);

	trx_undo_rec_release(node->trx, node->undo_no);

	return(err);
}
/*******************************************************************
Removes a secondary index entry if found. */
static
ulint
row_undo_ins_remove_sec_low(
/*========================*/
				/* out: DB_SUCCESS, DB_FAIL, or
				DB_OUT_OF_FILE_SPACE */
	ulint		mode,	/* in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE,
				depending on whether we wish optimistic or
				pessimistic descent down the index tree */
	dict_index_t*	index,	/* in: index */
	dtuple_t*	entry,	/* in: index entry to remove */
	que_thr_t*	thr)	/* in: query thread */
{
	btr_pcur_t	pcur;
	btr_cur_t*	btr_cur;
	ibool		found;
	ibool		success;
	ulint		err;
	mtr_t		mtr;

	UT_NOT_USED(thr);

	log_free_check();
	mtr_start(&mtr);

	found = row_search_index_entry(index, entry, mode, &pcur, &mtr);

	btr_cur = btr_pcur_get_btr_cur(&pcur);

	if (!found) {
		/* Not found */

		/* FIXME: remove printfs in the final version */

		/* printf(
		"--UNDO INS: Record not found from page %lu index %s\n",
		buf_frame_get_page_no(btr_cur_get_rec(btr_cur)),
		index->name); */

		/* ibuf_print(); */

		btr_pcur_close(&pcur);
		mtr_commit(&mtr);

		return(DB_SUCCESS);
	}

	if (mode == BTR_MODIFY_LEAF) {
		success = btr_cur_optimistic_delete(btr_cur, &mtr);

		if (success) {
			err = DB_SUCCESS;
		} else {
			err = DB_FAIL;
		}
	} else {
		ut_ad(mode == BTR_MODIFY_TREE);

		btr_cur_pessimistic_delete(&err, FALSE, btr_cur, TRUE, &mtr);
	}

	btr_pcur_close(&pcur);
	mtr_commit(&mtr);

	return(err);
}
/*****************************************************************//**
Finds out if an active transaction has inserted or modified a secondary
index record. NOTE: the kernel mutex is temporarily released in this
function!
@return	NULL if committed, else the active transaction */
UNIV_INTERN
trx_t*
row_vers_impl_x_locked_off_kernel(
/*==============================*/
	const rec_t*	rec,	/*!< in: record in a secondary index */
	dict_index_t*	index,	/*!< in: the secondary index */
	const ulint*	offsets)/*!< in: rec_get_offsets(rec, index) */
{
	dict_index_t*	clust_index;
	rec_t*		clust_rec;
	ulint*		clust_offsets;
	rec_t*		version;
	trx_id_t	trx_id;
	mem_heap_t*	heap;
	mem_heap_t*	heap2;
	dtuple_t*	row;
	dtuple_t*	entry	= NULL; /* assignment to eliminate compiler
					warning */
	trx_t*		trx;
	ulint		rec_del;
#ifdef UNIV_DEBUG
	ulint		err;
#endif /* UNIV_DEBUG */
	mtr_t		mtr;
	ulint		comp;

	ut_ad(mutex_own(&kernel_mutex));
#ifdef UNIV_SYNC_DEBUG
	ut_ad(!rw_lock_own(&(purge_sys->latch), RW_LOCK_SHARED));
#endif /* UNIV_SYNC_DEBUG */

	mutex_exit(&kernel_mutex);

	mtr_start(&mtr);

	/* Search for the clustered index record: this is a time-consuming
	operation: therefore we release the kernel mutex; also, the release
	is required by the latching order convention. The latch on the
	clustered index locks the top of the stack of versions. We also
	reserve purge_latch to lock the bottom of the version stack. */

	clust_rec = row_get_clust_rec(BTR_SEARCH_LEAF, rec, index,
				      &clust_index, &mtr);
	if (!clust_rec) {
		/* In a rare case it is possible that no clust rec is found
		for a secondary index record: if in row0umod.c
		row_undo_mod_remove_clust_low() we have already removed the
		clust rec, while purge is still cleaning and removing
		secondary index records associated with earlier versions of
		the clustered index record. In that case there cannot be
		any implicit lock on the secondary index record, because
		an active transaction which has modified the secondary index
		record has also modified the clustered index record. And in
		a rollback we always undo the modifications to secondary index
		records before the clustered index record. */

		mutex_enter(&kernel_mutex);
		mtr_commit(&mtr);

		return(NULL);
	}

	heap = mem_heap_create(1024);
	clust_offsets = rec_get_offsets(clust_rec, clust_index, NULL,
					ULINT_UNDEFINED, &heap);
	trx_id = row_get_rec_trx_id(clust_rec, clust_index, clust_offsets);

	mtr_s_lock(&(purge_sys->latch), &mtr);

	mutex_enter(&kernel_mutex);

	trx = NULL;
	if (!trx_is_active(trx_id)) {
		/* The transaction that modified or inserted clust_rec is no
		longer active: no implicit lock on rec */
		goto exit_func;
	}

	if (!lock_check_trx_id_sanity(trx_id, clust_rec, clust_index,
				      clust_offsets, TRUE)) {
		/* Corruption noticed: try to avoid a crash by returning */
		goto exit_func;
	}

	comp = page_rec_is_comp(rec);
	ut_ad(index->table == clust_index->table);
	ut_ad(!!comp == dict_table_is_comp(index->table));
	ut_ad(!comp == !page_rec_is_comp(clust_rec));

	/* We look up if some earlier version, which was modified by the
	trx_id transaction, of the clustered index record would require rec
	to be in a different state (delete marked or unmarked, or have
	different field values, or not existing). If there is such a version,
	then rec was modified by the trx_id transaction, and it has an
	implicit x-lock on rec. Note that if clust_rec itself would require
	rec to be in a different state, then the trx_id transaction has not
	yet had time to modify rec, and does not necessarily have an implicit
	x-lock on rec. */

	rec_del = rec_get_deleted_flag(rec, comp);
	trx = NULL;
	version = clust_rec;

	for (;;) {
		rec_t*		prev_version;
		ulint		vers_del;
		row_ext_t*	ext;
		trx_id_t	prev_trx_id;

		mutex_exit(&kernel_mutex);

		/* While we retrieve an earlier version of clust_rec, we
		release the kernel mutex, because it may take time to access
		the disk. After the release, we have to check if the trx_id
		transaction is still active. We keep the semaphore in mtr on
		the clust_rec page, so that no other transaction can update
		it and get an implicit x-lock on rec. */

		heap2 = heap;
		heap = mem_heap_create(1024);
#ifdef UNIV_DEBUG
		err =
#endif /* UNIV_DEBUG */
		trx_undo_prev_version_build(clust_rec, &mtr, version,
					    clust_index, clust_offsets,
					    heap, &prev_version);
		mem_heap_free(heap2); /* free version and clust_offsets */

		if (prev_version == NULL) {
			mutex_enter(&kernel_mutex);

			if (!trx_is_active(trx_id)) {
				/* Transaction no longer active: no
				implicit x-lock */

				break;
			}

			/* If the transaction is still active,
			clust_rec must be a fresh insert, because no
			previous version was found. */
			ut_ad(err == DB_SUCCESS);

			/* It was a freshly inserted version: there is an
			implicit x-lock on rec */

			trx = trx_get_on_id(trx_id);

			break;
		}

		clust_offsets = rec_get_offsets(prev_version, clust_index,
						NULL, ULINT_UNDEFINED, &heap);

		vers_del = rec_get_deleted_flag(prev_version, comp);
		prev_trx_id = row_get_rec_trx_id(prev_version, clust_index,
						 clust_offsets);

		/* The stack of versions is locked by mtr. Thus, it
		is safe to fetch the prefixes for externally stored
		columns. */
		row = row_build(ROW_COPY_POINTERS, clust_index, prev_version,
				clust_offsets, NULL, &ext, heap);
		entry = row_build_index_entry(row, ext, index, heap);
		/* entry may be NULL if a record was inserted in place
		of a deleted record, and the BLOB pointers of the new
		record were not initialized yet.  But in that case,
		prev_version should be NULL. */
		ut_a(entry);

		mutex_enter(&kernel_mutex);

		if (!trx_is_active(trx_id)) {
			/* Transaction no longer active: no implicit
			x-lock */

			break;
		}

		/* If we get here, we know that the trx_id transaction is
		still active and it has modified prev_version. Let us check
		if prev_version would require rec to be in a different
		state. */

		/* The previous version of clust_rec must be
		accessible, because the transaction is still active
		and clust_rec was not a fresh insert. */
		ut_ad(err == DB_SUCCESS);

		/* We check if entry and rec are identical in the
		alphabetical ordering */
		if (0 == cmp_dtuple_rec(entry, rec, offsets)) {
			/* The delete marks of rec and prev_version should be
			equal for rec to be in the state required by
			prev_version */

			if (rec_del != vers_del) {
				trx = trx_get_on_id(trx_id);

				break;
			}

			/* It is possible that the row was updated so that the
			secondary index record remained the same in
			alphabetical ordering, but the field values changed
			still. For example, 'abc' -> 'ABC'. Check also that. */

			dtuple_set_types_binary(entry,
						dtuple_get_n_fields(entry));
			if (0 != cmp_dtuple_rec(entry, rec, offsets)) {

				trx = trx_get_on_id(trx_id);

				break;
			}
		} else if (!rec_del) {
			/* The delete mark should be set in rec for it to be
			in the state required by prev_version */

			trx = trx_get_on_id(trx_id);

			break;
		}

		if (0 != ut_dulint_cmp(trx_id, prev_trx_id)) {
			/* The versions modified by the trx_id transaction
			end at prev_version: no implicit x-lock */

			break;
		}

		version = prev_version;
	}/* for (;;) */

exit_func:
	mtr_commit(&mtr);
	mem_heap_free(heap);

	return(trx);
}
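/* The control flow of the version loop above, stripped to a skeleton:
walk older versions of the clustered index record for as long as they
were written by the same transaction, and report an implicit lock as
soon as some version written by it disagrees with the observed
secondary index record. This is a deliberately coarse model with
stand-in types; the real checks compare entry fields and delete marks
as in the code above: */

#include <stdbool.h>
#include <stdint.h>
#include <stddef.h>

struct rec_version {
	const struct rec_version*	prev;	/* older version, or NULL */
	uint64_t			trx_id;	/* writer of this version */
	int				state;	/* abstract record state */
};

static bool
has_implicit_lock(const struct rec_version* newest, uint64_t trx_id,
		  int observed_state)
{
	const struct rec_version*	version = newest;

	for (;;) {
		const struct rec_version*	prev = version->prev;

		if (prev == NULL) {
			/* a fresh insert by trx_id: implicit lock */
			return(true);
		}

		if (prev->state != observed_state) {
			/* trx_id changed the record into its observed
			state: implicit lock */
			return(true);
		}

		if (prev->trx_id != trx_id) {
			/* the versions written by trx_id end here:
			no implicit lock */
			return(false);
		}

		version = prev;
	}
}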
/*********************************************************//**
Moves the persistent cursor backward if it is on the first record of the
page. Commits mtr. Note that to prevent a possible deadlock, the operation
first stores the position of the cursor, commits mtr, acquires the necessary
latches and restores the cursor position again before returning. The
alphabetical position of the cursor is guaranteed to be sensible on return,
but it may happen that the cursor is not positioned on the last record of
any page, because the structure of the tree may have changed during the
time when the cursor had no latches. */
UNIV_INTERN
void
btr_pcur_move_backward_from_page(
/*=============================*/
	btr_pcur_t*	cursor,	/*!< in: persistent cursor, must be on the
				first record of the current page */
	mtr_t*		mtr)	/*!< in: mtr */
{
	ulint		prev_page_no;
	page_t*		page;
	buf_block_t*	prev_block;
	ulint		latch_mode;
	ulint		latch_mode2;

	ut_a(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
	ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
	ut_ad(btr_pcur_is_before_first_on_page(cursor));
	ut_ad(!btr_pcur_is_before_first_in_tree(cursor, mtr));

	latch_mode = cursor->latch_mode;

	if (latch_mode == BTR_SEARCH_LEAF) {

		latch_mode2 = BTR_SEARCH_PREV;

	} else if (latch_mode == BTR_MODIFY_LEAF) {

		latch_mode2 = BTR_MODIFY_PREV;
	} else {
		latch_mode2 = 0; /* To eliminate compiler warning */
		ut_error;
	}

	btr_pcur_store_position(cursor, mtr);

	mtr_commit(mtr);

	mtr_start(mtr);

	btr_pcur_restore_position(latch_mode2, cursor, mtr);

	page = btr_pcur_get_page(cursor);

	prev_page_no = btr_page_get_prev(page, mtr);

	if (prev_page_no == FIL_NULL) {
	} else if (btr_pcur_is_before_first_on_page(cursor)) {

		prev_block = btr_pcur_get_btr_cur(cursor)->left_block;

		btr_leaf_page_release(btr_pcur_get_block(cursor),
				      latch_mode, mtr);

		page_cur_set_after_last(prev_block,
					btr_pcur_get_page_cur(cursor));
	} else {

		/* The repositioned cursor did not end on an infimum record on
		a page. Cursor repositioning acquired a latch also on the
		previous page, but we do not need the latch: release it. */

		prev_block = btr_pcur_get_btr_cur(cursor)->left_block;

		btr_leaf_page_release(prev_block, latch_mode, mtr);
	}

	cursor->latch_mode = latch_mode;

	cursor->old_stored = BTR_PCUR_OLD_NOT_STORED;
}
/***********************************************************************//**
Gets the next record to purge and updates the info in the purge system.
@return	copy of an undo log record or pointer to the dummy undo log
record */
static
trx_undo_rec_t*
trx_purge_get_next_rec(
/*===================*/
	mem_heap_t*	heap)	/*!< in: memory heap where copied */
{
	trx_undo_rec_t*	rec;
	trx_undo_rec_t*	rec_copy;
	trx_undo_rec_t*	rec2;
	trx_undo_rec_t*	next_rec;
	page_t*		undo_page;
	page_t*		page;
	ulint		offset;
	ulint		page_no;
	ulint		space;
	ulint		zip_size;
	ulint		type;
	ulint		cmpl_info;
	mtr_t		mtr;

	ut_ad(purge_sys->next_stored);

	space = purge_sys->rseg->space;
	zip_size = purge_sys->rseg->zip_size;
	page_no = purge_sys->page_no;
	offset = purge_sys->offset;

	if (offset == 0) {
		/* It is the dummy undo log record, which means that there is
		no need to purge this undo log */

		trx_purge_rseg_get_next_history_log(purge_sys->rseg);

		/* Look for the next undo log and record to purge */

		trx_purge_choose_next_log();

		return(&trx_purge_dummy_rec);
	}

	mtr_start(&mtr);

	undo_page = trx_undo_page_get_s_latched(space, zip_size,
						page_no, &mtr);

	rec = undo_page + offset;

	rec2 = rec;

	for (;;) {
		/* Try first to find the next record which requires a purge
		operation from the same page of the same undo log */

		next_rec = trx_undo_page_get_next_rec(
			rec2, purge_sys->hdr_page_no, purge_sys->hdr_offset);

		if (next_rec == NULL) {
			rec2 = trx_undo_get_next_rec(
				rec2, purge_sys->hdr_page_no,
				purge_sys->hdr_offset, &mtr);
			break;
		}

		rec2 = next_rec;

		type = trx_undo_rec_get_type(rec2);

		if (type == TRX_UNDO_DEL_MARK_REC) {

			break;
		}

		cmpl_info = trx_undo_rec_get_cmpl_info(rec2);

		if (trx_undo_rec_get_extern_storage(rec2)) {
			break;
		}

		if ((type == TRX_UNDO_UPD_EXIST_REC)
		    && !(cmpl_info & UPD_NODE_NO_ORD_CHANGE)) {
			break;
		}
	}

	if (rec2 == NULL) {
		mtr_commit(&mtr);

		trx_purge_rseg_get_next_history_log(purge_sys->rseg);

		/* Look for the next undo log and record to purge */

		trx_purge_choose_next_log();

		mtr_start(&mtr);

		undo_page = trx_undo_page_get_s_latched(space, zip_size,
							page_no, &mtr);

		rec = undo_page + offset;
	} else {
		page = page_align(rec2);

		purge_sys->purge_undo_no = trx_undo_rec_get_undo_no(rec2);
		purge_sys->page_no = page_get_page_no(page);
		purge_sys->offset = rec2 - page;

		if (undo_page != page) {
			/* We advance to a new page of the undo log: */
			purge_sys->n_pages_handled++;
		}
	}

	rec_copy = trx_undo_rec_copy(rec, heap);

	mtr_commit(&mtr);

	return(rec_copy);
}
/***********************************************************************//**
Chooses the next undo log to purge and updates the info in purge_sys. This
function is used to initialize purge_sys when the next record to purge is
not known, and also to update the purge system info on the next record when
purge has handled the whole undo log for a transaction. */
static
void
trx_purge_choose_next_log(void)
/*===========================*/
{
	trx_undo_rec_t*	rec;
	trx_rseg_t*	rseg;
	trx_rseg_t*	min_rseg;
	trx_id_t	min_trx_no;
	ulint		space = 0;	/* remove warning (??? bug ???) */
	ulint		zip_size = 0;
	ulint		page_no = 0;	/* remove warning (??? bug ???) */
	ulint		offset = 0;	/* remove warning (??? bug ???) */
	mtr_t		mtr;

	ut_ad(mutex_own(&(purge_sys->mutex)));
	ut_ad(purge_sys->next_stored == FALSE);

	rseg = UT_LIST_GET_FIRST(trx_sys->rseg_list);

	min_trx_no = ut_dulint_max;

	min_rseg = NULL;

	while (rseg) {
		mutex_enter(&(rseg->mutex));

		if (rseg->last_page_no != FIL_NULL) {

			if ((min_rseg == NULL)
			    || (ut_dulint_cmp(min_trx_no,
					      rseg->last_trx_no) > 0)) {

				min_rseg = rseg;
				min_trx_no = rseg->last_trx_no;

				space = rseg->space;
				zip_size = rseg->zip_size;
				ut_a(space == 0); /* We assume in purge of
						  externally stored fields
						  that space id == 0 */
				page_no = rseg->last_page_no;
				offset = rseg->last_offset;
			}
		}

		mutex_exit(&(rseg->mutex));

		rseg = UT_LIST_GET_NEXT(rseg_list, rseg);
	}

	if (min_rseg == NULL) {

		return;
	}

	mtr_start(&mtr);

	if (!min_rseg->last_del_marks) {
		/* No need to purge this log */

		rec = &trx_purge_dummy_rec;
	} else {
		rec = trx_undo_get_first_rec(space, zip_size, page_no, offset,
					     RW_S_LATCH, &mtr);
		if (rec == NULL) {
			/* Undo log empty */

			rec = &trx_purge_dummy_rec;
		}
	}

	purge_sys->next_stored = TRUE;
	purge_sys->rseg = min_rseg;

	purge_sys->hdr_page_no = page_no;
	purge_sys->hdr_offset = offset;

	purge_sys->purge_trx_no = min_trx_no;

	if (rec == &trx_purge_dummy_rec) {

		purge_sys->purge_undo_no = ut_dulint_zero;
		purge_sys->page_no = page_no;
		purge_sys->offset = 0;
	} else {
		purge_sys->purge_undo_no = trx_undo_rec_get_undo_no(rec);

		purge_sys->page_no = page_get_page_no(page_align(rec));
		purge_sys->offset = page_offset(rec);
	}

	mtr_commit(&mtr);
}
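/* trx_purge_choose_next_log() must pick, over all rollback segments,
the pending log with the smallest transaction number, because purge has
to process logs in trx_no order. The selection itself is a plain
minimum scan; a self-contained model with illustrative names: */

#include <stdint.h>
#include <stddef.h>

struct rseg_info {
	uint64_t	last_trx_no;	/* trx_no of the oldest pending log */
	int		has_log;	/* nonzero if something to purge */
};

/* Returns the index of the segment with the smallest pending trx_no,
or -1 if no segment has anything to purge. */
static int
choose_next_rseg(const struct rseg_info* rsegs, size_t n)
{
	uint64_t	min_trx_no = UINT64_MAX;
	int		min_idx = -1;
	size_t		i;

	for (i = 0; i < n; i++) {
		if (rsegs[i].has_log
		    && rsegs[i].last_trx_no < min_trx_no) {

			min_trx_no = rsegs[i].last_trx_no;
			min_idx = (int) i;
		}
	}

	return(min_idx);
}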