/********************************************************************** Validates the flush list. */ static ibool buf_flush_validate_low(void) /*========================*/ /* out: TRUE if ok */ { buf_block_t* block; dulint om; UT_LIST_VALIDATE(flush_list, buf_block_t, buf_pool->flush_list); block = UT_LIST_GET_FIRST(buf_pool->flush_list); while (block != NULL) { om = block->oldest_modification; ut_a(block->state == BUF_BLOCK_FILE_PAGE); ut_a(ut_dulint_cmp(om, ut_dulint_zero) > 0); block = UT_LIST_GET_NEXT(flush_list, block); if (block) { ut_a(ut_dulint_cmp(om, block->oldest_modification) >= 0); } } return(TRUE); }
/***************************************************************
Checks if also the previous version of the clustered index record was
modified or inserted by the same transaction, and its undo number is such
that it should be undone in the same rollback. */
UNIV_INLINE
ibool
row_undo_mod_undo_also_prev_vers(
/*=============================*/
				/* out: TRUE if also previous modify or
				insert of this row should be undone */
	undo_node_t*	node,	/* in: row undo node */
	que_thr_t*	thr,	/* in: query thread; not used */
	dulint*		undo_no)/* out: the undo number of the previous
				version; ut_dulint_zero if the previous
				version was not written by node->trx */
{
	trx_undo_rec_t*	undo_rec;
	trx_t*		trx;

	UT_NOT_USED(thr);

	trx = node->trx;

	if (0 != ut_dulint_cmp(node->new_trx_id, trx->id)) {
		/* The previous version belongs to some other transaction.
		Give the out parameter a defined value on this path too, so
		that a caller which reads it never sees an indeterminate
		value. */

		*undo_no = ut_dulint_zero;

		return(FALSE);
	}

	undo_rec = trx_undo_get_undo_rec_low(node->new_roll_ptr, node->heap);

	*undo_no = trx_undo_rec_get_undo_no(undo_rec);

	/* Undo the previous version as well only if its undo number is not
	below the rollback limit of the transaction */

	return(ut_dulint_cmp(trx->roll_limit, *undo_no) <= 0);
}
/***************************************************************
Tells whether the previous version of the clustered index record was
written by the transaction being rolled back and has an undo number that
falls inside the current rollback, so that it has to be undone too. */
UNIV_INLINE
ibool
row_undo_mod_undo_also_prev_vers(
/*=============================*/
				/* out: TRUE if also previous modify or
				insert of this row should be undone */
	undo_node_t*	node,	/* in: row undo node */
	dulint*		undo_no)/* out: the undo number; ut_dulint_zero
				if the previous version was written by
				another transaction */
{
	trx_t*		own_trx		= node->trx;
	trx_undo_rec_t*	prev_undo_rec;
	ibool		undo_prev	= FALSE;

	if (ut_dulint_cmp(node->new_trx_id, own_trx->id) == 0) {

		prev_undo_rec = trx_undo_get_undo_rec_low(
			node->new_roll_ptr, node->heap);

		*undo_no = trx_undo_rec_get_undo_no(prev_undo_rec);

		undo_prev = (ut_dulint_cmp(own_trx->roll_limit,
					   *undo_no) <= 0);
	} else {
		/* Written by someone else: nothing more to undo here */
		*undo_no = ut_dulint_zero;
	}

	return(undo_prev);
}
/****************************************************************//**
Inserts the trx handle in the trx system trx list in the right position.
The list is sorted on the trx id so that the biggest id is at the list
start. This function is used at the database startup to insert incomplete
transactions to the list. */
static
void
trx_list_insert_ordered(
/*====================*/
	trx_t*	trx)	/*!< in: trx handle */
{
	trx_t*	successor;
	trx_t*	predecessor;

	ut_ad(mutex_own(&kernel_mutex));

	/* Look for the first list member whose id is not bigger than the
	id of trx: trx has to be placed immediately before it. */

	for (successor = UT_LIST_GET_FIRST(trx_sys->trx_list);
	     successor != NULL;
	     successor = UT_LIST_GET_NEXT(trx_list, successor)) {

		if (ut_dulint_cmp(trx->id, successor->id) >= 0) {

			/* Two list members must never share an id */
			ut_ad(ut_dulint_cmp(trx->id, successor->id) == 1);

			break;
		}
	}

	if (successor == NULL) {
		/* Every member has a bigger id (or the list is empty) */
		UT_LIST_ADD_LAST(trx_list, trx_sys->trx_list, trx);

		return;
	}

	predecessor = UT_LIST_GET_PREV(trx_list, successor);

	if (predecessor != NULL) {
		UT_LIST_INSERT_AFTER(trx_list, trx_sys->trx_list,
				     predecessor, trx);
	} else {
		UT_LIST_ADD_FIRST(trx_list, trx_sys->trx_list, trx);
	}
}
/**********************************************************************
Tells whether a block can be replaced right away: it must be a file
page that is not modified, not buffer-fixed and not io-fixed. A block in
any other state is reported on stderr and rejected. */
ibool
buf_flush_ready_for_replace(
/*========================*/
				/* out: TRUE if can replace immediately */
	buf_block_t*	block)	/* in: buffer control block, must be in
				state BUF_BLOCK_FILE_PAGE and in the LRU
				list */
{
	ut_ad(mutex_own(&(buf_pool->mutex)));
	ut_ad(mutex_own(&block->mutex));

	if (block->state != BUF_BLOCK_FILE_PAGE) {
		/* Unexpected state: dump the control block for diagnosis */
		ut_print_timestamp(stderr);
		fprintf(stderr,
			"  InnoDB: Error: buffer block state %lu"
			" in the LRU list!\n",
			(ulong)block->state);
		ut_print_buf(stderr, block, sizeof(buf_block_t));

		return(FALSE);
	}

	if (ut_dulint_cmp(block->oldest_modification, ut_dulint_zero) > 0) {
		/* The block is modified */
		return(FALSE);
	}

	if (block->buf_fix_count != 0) {
		return(FALSE);
	}

	if (block->io_fix != 0) {
		return(FALSE);
	}

	return(TRUE);
}
/**********************************************************************
Takes a block out of the LRU list and page hash table and sets the block
state to BUF_BLOCK_REMOVE_HASH. If the page hash does not map the page
address of the block to this very block, diagnostics are printed to
stderr and the function asserts. */
static
void
buf_LRU_block_remove_hashed_page(
/*=============================*/
	buf_block_t*	block)	/* in: block, must contain a file page and
				be in a state where it can be freed; there
				may or may not be a hash index to the page */
{
	buf_block_t*	hashed_block;

	/* Check the pointer before it is used for the first time */
	ut_ad(block);
	ut_ad(mutex_own(&(buf_pool->mutex)));
	ut_ad(mutex_own(&block->mutex));

	ut_a(block->state == BUF_BLOCK_FILE_PAGE);
	ut_a(block->io_fix == 0);
	ut_a(block->buf_fix_count == 0);
	ut_a(ut_dulint_cmp(block->oldest_modification, ut_dulint_zero) == 0);

	buf_LRU_remove_block(block);

	buf_pool->freed_page_clock += 1;

	/* Note that if AWE is enabled the block may not have a frame at all */

	buf_block_modify_clock_inc(block);

	/* Look the page up once; the result is reused in the error report */
	hashed_block = buf_page_hash_get(block->space, block->offset);

	if (block != hashed_block) {
		fprintf(stderr,
			"InnoDB: Error: page %lu %lu not found"
			" in the hash table\n",
			(ulong) block->space,
			(ulong) block->offset);

		if (hashed_block) {
			fprintf(stderr,
				"InnoDB: In hash table we find block"
				" %p of %lu %lu which is not %p\n",
				(void*) hashed_block,
				(ulong) hashed_block->space,
				(ulong) hashed_block->offset,
				(void*) block);
		}

#ifdef UNIV_DEBUG
		/* buf_LRU_print() acquires buf_pool->mutex itself, so the
		mutexes held here must be released before the diagnostics
		are run; otherwise this thread would wait for itself. They
		are not re-acquired because the function asserts right
		below. */
		mutex_exit(&block->mutex);
		mutex_exit(&(buf_pool->mutex));

		buf_print();
		buf_LRU_print();
		buf_validate();
		buf_LRU_validate();
#endif
		ut_a(0);
	}

	HASH_DELETE(buf_block_t, hash, buf_pool->page_hash,
		    buf_page_address_fold(block->space, block->offset),
		    block);

	UNIV_MEM_INVALID(block->frame, UNIV_PAGE_SIZE);
	block->state = BUF_BLOCK_REMOVE_HASH;
}
/************************************************************************
Returns TRUE if the block is modified and ready for flushing: its
oldest_modification is nonzero, it is not io-fixed and, for an LRU flush
only, it is not buffer-fixed either. */
UNIV_INLINE
ibool
buf_flush_ready_for_flush(
/*======================*/
				/* out: TRUE if can flush immediately */
	buf_block_t*	block,	/* in: buffer control block, must be in
				state BUF_BLOCK_FILE_PAGE */
	ulint		flush_type)/* in: BUF_FLUSH_LRU or BUF_FLUSH_LIST */
{
	ut_ad(mutex_own(&(buf_pool->mutex)));
	ut_ad(mutex_own(&(block->mutex)));
	ut_a(block->state == BUF_BLOCK_FILE_PAGE);

	if (ut_dulint_cmp(block->oldest_modification, ut_dulint_zero) <= 0
	    || block->io_fix != 0) {
		/* Either nothing to write or an i/o is already pending */

		return(FALSE);
	}

	if (flush_type == BUF_FLUSH_LRU && block->buf_fix_count != 0) {
		/* If we are flushing the LRU list, to avoid deadlocks
		we require the block not to be bufferfixed, and hence
		not latched. */

		return(FALSE);
	}

	return(TRUE);
}
/************************************************************************
Inserts a modified block into the flush list at the position determined
by its oldest_modification: the block goes after every list member whose
oldest_modification is strictly bigger. */
void
buf_flush_insert_sorted_into_flush_list(
/*====================================*/
	buf_block_t*	block)	/* in: block which is modified */
{
	buf_block_t*	insert_after	= NULL;
	buf_block_t*	scan;

	ut_ad(mutex_own(&(buf_pool->mutex)));

	/* Skip over all members that are newer than the block */
	for (scan = UT_LIST_GET_FIRST(buf_pool->flush_list);
	     scan != NULL
	     && ut_dulint_cmp(scan->oldest_modification,
			      block->oldest_modification) > 0;
	     scan = UT_LIST_GET_NEXT(flush_list, scan)) {

		insert_after = scan;
	}

	if (insert_after != NULL) {
		UT_LIST_INSERT_AFTER(flush_list, buf_pool->flush_list,
				     insert_after, block);
	} else {
		UT_LIST_ADD_FIRST(flush_list, buf_pool->flush_list, block);
	}

	ut_ad(buf_flush_validate_low());
}
/*******************************************************************//**
Gets the biggest pair of a trx number and an undo number in a purge array.
Pairs are ordered by trx number first and by undo number second. The
cells are scanned from index 0 upwards until arr->n_used cells that are
in use have been seen. */
static
void
trx_purge_arr_get_biggest(
/*======================*/
	trx_undo_arr_t*	arr,	/*!< in: purge array */
	trx_id_t*	trx_no,	/*!< out: transaction number: ut_dulint_zero
				if array is empty */
	undo_no_t*	undo_no)/*!< out: undo number */
{
	trx_undo_inf_t*	slot;
	trx_id_t	max_trx_no	= ut_dulint_zero;
	undo_no_t	max_undo_no	= ut_dulint_zero;
	ulint		n_total		= arr->n_used;
	ulint		n_seen		= 0;
	ulint		idx		= 0;
	int		cmp_trx;

	do {
		slot = trx_undo_arr_get_nth_info(arr, idx);
		idx++;

		if (slot->in_use) {
			n_seen++;

			cmp_trx = ut_dulint_cmp(slot->trx_no, max_trx_no);

			if (cmp_trx > 0
			    || (cmp_trx == 0
				&& ut_dulint_cmp(slot->undo_no,
						 max_undo_no) >= 0)) {

				/* A new maximum was found */
				max_trx_no = slot->trx_no;
				max_undo_no = slot->undo_no;
			}
		}
	} while (n_seen != n_total);

	*trx_no = max_trx_no;
	*undo_no = max_undo_no;
}
/***************************************************************
Positions node->pcur on the clustered index record identified by
node->ref. If the record is found and its roll pointer equals
node->roll_ptr, a copy of the row is built into node->row and the cursor
position is stored. */
ibool
row_undo_search_clust_to_pcur(
/*==========================*/
				/* out: TRUE if found; NOTE the node->pcur
				must be closed by the caller, regardless of
				the return value */
	undo_node_t*	node)	/* in: row undo node */
{
	dict_index_t*	clust_index;
	rec_t*		clust_rec;
	ibool		rec_found;
	ibool		success		= FALSE;
	mtr_t		mtr;
	mem_heap_t*	offsets_heap	= NULL;
	ulint		offsets_buf[REC_OFFS_NORMAL_SIZE];
	ulint*		offsets		= offsets_buf;

	/* The first element of the offsets array carries its capacity */
	offsets_buf[0] = sizeof(offsets_buf) / sizeof(offsets_buf[0]);

	mtr_start(&mtr);

	clust_index = dict_table_get_first_index(node->table);

	rec_found = row_search_on_row_ref(&(node->pcur), BTR_MODIFY_LEAF,
					  node->table, node->ref, &mtr);

	clust_rec = btr_pcur_get_rec(&(node->pcur));

	offsets = rec_get_offsets(clust_rec, clust_index, offsets,
				  ULINT_UNDEFINED, &offsets_heap);

	if (rec_found
	    && 0 == ut_dulint_cmp(node->roll_ptr,
				  row_get_rec_roll_ptr(clust_rec,
						       clust_index,
						       offsets))) {

		node->row = row_build(ROW_COPY_DATA, clust_index, clust_rec,
				      offsets, node->heap);

		btr_pcur_store_position(&(node->pcur), &mtr);

		success = TRUE;
	}

	/* In the failure case: We must remove the reservation on the undo
	log record BEFORE releasing the latch on the clustered index page:
	this is to make sure that some thread will eventually undo the
	modification corresponding to node->roll_ptr. */

	btr_pcur_commit_specify_mtr(&(node->pcur), &mtr);

	if (UNIV_LIKELY_NULL(offsets_heap)) {
		mem_heap_free(offsets_heap);
	}

	return(success);
}
void buf_LRU_print(void) /*===============*/ { buf_block_t* block; buf_frame_t* frame; ulint len; ut_ad(buf_pool); mutex_enter(&(buf_pool->mutex)); fprintf(stderr, "Pool ulint clock %lu\n", (ulong) buf_pool->ulint_clock); block = UT_LIST_GET_FIRST(buf_pool->LRU); len = 0; while (block != NULL) { fprintf(stderr, "BLOCK %lu ", (ulong) block->offset); if (block->old) { fputs("old ", stderr); } if (block->buf_fix_count) { fprintf(stderr, "buffix count %lu ", (ulong) block->buf_fix_count); } if (block->io_fix) { fprintf(stderr, "io_fix %lu ", (ulong) block->io_fix); } if (ut_dulint_cmp(block->oldest_modification, ut_dulint_zero) > 0) { fputs("modif. ", stderr); } frame = buf_block_get_frame(block); fprintf(stderr, "LRU pos %lu type %lu index id %lu ", (ulong) block->LRU_position, (ulong) fil_page_get_type(frame), (ulong) ut_dulint_get_low (btr_page_get_index_id(frame))); block = UT_LIST_GET_NEXT(LRU, block); if (++len == 10) { len = 0; putc('\n', stderr); } } mutex_exit(&(buf_pool->mutex)); }
/********************************************************************//** Removes unnecessary history data from rollback segments. NOTE that when this function is called, the caller must not have any latches on undo log pages! */ static void trx_purge_truncate_history(void) /*============================*/ { trx_rseg_t* rseg; trx_id_t limit_trx_no; undo_no_t limit_undo_no; ut_ad(mutex_own(&(purge_sys->mutex))); trx_purge_arr_get_biggest(purge_sys->arr, &limit_trx_no, &limit_undo_no); if (ut_dulint_is_zero(limit_trx_no)) { limit_trx_no = purge_sys->purge_trx_no; limit_undo_no = purge_sys->purge_undo_no; } /* We play safe and set the truncate limit at most to the purge view low_limit number, though this is not necessary */ if (ut_dulint_cmp(limit_trx_no, purge_sys->view->low_limit_no) >= 0) { limit_trx_no = purge_sys->view->low_limit_no; limit_undo_no = ut_dulint_zero; } ut_ad((ut_dulint_cmp(limit_trx_no, purge_sys->view->low_limit_no) <= 0)); rseg = UT_LIST_GET_FIRST(trx_sys->rseg_list); while (rseg) { trx_purge_truncate_rseg_history(rseg, limit_trx_no, limit_undo_no); rseg = UT_LIST_GET_NEXT(rseg_list, rseg); } }
/***************************************************************
Positions node->pcur on the clustered index record identified by
node->ref. If the record is found and its roll pointer equals
node->roll_ptr, a copy of the row is built into node->row and the cursor
position is stored. */
ibool
row_undo_search_clust_to_pcur(
/*==========================*/
				/* out: TRUE if found; NOTE the node->pcur
				must be closed by the caller, regardless of
				the return value */
	undo_node_t*	node,	/* in: row undo node */
	que_thr_t*	thr)	/* in: query thread; not used */
{
	dict_index_t*	clust_index;
	rec_t*		clust_rec;
	ibool		rec_found;
	ibool		success		= FALSE;
	mtr_t		mtr;

	UT_NOT_USED(thr);

	mtr_start(&mtr);

	clust_index = dict_table_get_first_index(node->table);

	rec_found = row_search_on_row_ref(&(node->pcur), BTR_MODIFY_LEAF,
					  node->table, node->ref, &mtr);

	clust_rec = btr_pcur_get_rec(&(node->pcur));

	if (rec_found
	    && 0 == ut_dulint_cmp(node->roll_ptr,
				  row_get_rec_roll_ptr(clust_rec,
						       clust_index))) {

		node->row = row_build(ROW_COPY_DATA, clust_index, clust_rec,
				      node->heap);

		btr_pcur_store_position(&(node->pcur), &mtr);

		success = TRUE;
	}

	/* In the failure case: We must remove the reservation on the undo
	log record BEFORE releasing the latch on the clustered index page:
	this is to make sure that some thread will eventually undo the
	modification corresponding to node->roll_ptr. */

	btr_pcur_commit_specify_mtr(&(node->pcur), &mtr);

	return(success);
}
/***********************************************************//**
Checks if also the previous version of the clustered index record was
modified or inserted by the same transaction, and its undo number is such
that it should be undone in the same rollback.
@return TRUE if also previous modify or insert of this row should be undone */
static
ibool
row_undo_mod_undo_also_prev_vers(
/*=============================*/
	undo_node_t*	node,	/*!< in: row undo node */
	undo_no_t*	undo_no)/*!< out: the undo number; ut_dulint_zero
				if the previous version was written by
				another transaction */
{
	trx_t*		rollback_trx	= node->trx;
	trx_undo_rec_t*	prev_undo_rec;
	ibool		same_trx;

	same_trx = (ut_dulint_cmp(node->new_trx_id, rollback_trx->id) == 0);

	if (!same_trx) {
		*undo_no = ut_dulint_zero;

		return(FALSE);
	}

	prev_undo_rec = trx_undo_get_undo_rec_low(node->new_roll_ptr,
						  node->heap);

	*undo_no = trx_undo_rec_get_undo_no(prev_undo_rec);

	/* TRUE when the undo number is not below the rollback limit */
	return(ut_dulint_cmp(rollback_trx->roll_limit, *undo_no) <= 0);
}
/***********************************************************************
Compares the weight of two transactions. A transaction that has edited
non-transactional tables outweighs one that has not; when both or neither
did, the comparison is on undo_no plus the number of entries in
trx_locks. */
int
trx_weight_cmp(
/*===========*/
			/* out: <0, 0 or >0; similar to strcmp(3) */
	trx_t*	a,	/* in: the first transaction to be compared */
	trx_t*	b)	/* in: the second transaction to be compared */
{
	ibool	a_edited_nontrans;
	ibool	b_edited_nontrans;

	/* If mysql_thd is NULL for a transaction we assume that it has
	not edited non-transactional tables. */

	a_edited_nontrans = a->mysql_thd != NULL
		&& thd_has_edited_nontrans_tables(a->mysql_thd);

	b_edited_nontrans = b->mysql_thd != NULL
		&& thd_has_edited_nontrans_tables(b->mysql_thd);

	if (a_edited_nontrans != b_edited_nontrans) {
		/* Exactly one of them edited non-transactional tables:
		that one is the heavier */

		return(a_edited_nontrans ? 1 : -1);
	}

	/* Either both had edited non-transactional tables or both had
	not, we fall back to comparing the number of altered/locked
	rows. */

#if 0
	fprintf(stderr,
		"%s TRX_WEIGHT(a): %lld+%lu, TRX_WEIGHT(b): %lld+%lu\n",
		__func__,
		ut_conv_dulint_to_longlong(a->undo_no),
		UT_LIST_GET_LEN(a->trx_locks),
		ut_conv_dulint_to_longlong(b->undo_no),
		UT_LIST_GET_LEN(b->trx_locks));
#endif

#define TRX_WEIGHT(t)	\
	ut_dulint_add((t)->undo_no, UT_LIST_GET_LEN((t)->trx_locks))

	return(ut_dulint_cmp(TRX_WEIGHT(a), TRX_WEIGHT(b)));
}
/************************************************************************
Inserts a modified block at the start of the flush list. In debug builds
it is asserted that the current first block of the list, if any, does not
have a bigger oldest_modification than the new block. */
void
buf_flush_insert_into_flush_list(
/*=============================*/
	buf_block_t*	block)	/* in: block which is modified */
{
	ut_ad(mutex_own(&(buf_pool->mutex)));

	ut_a(block->state == BUF_BLOCK_FILE_PAGE);

	/* The block may become the new head only if it is at least as new
	as the present head */
	ut_ad(!(UT_LIST_GET_FIRST(buf_pool->flush_list) != NULL
		&& ut_dulint_cmp(block->oldest_modification,
				 (UT_LIST_GET_FIRST(buf_pool->flush_list))
				 ->oldest_modification) < 0));

	UT_LIST_ADD_FIRST(flush_list, buf_pool->flush_list, block);

	ut_ad(buf_flush_validate_low());
}
/*****************************************************************//**
Finds out if an active transaction has inserted or modified a secondary
index record. NOTE: the kernel mutex is temporarily released in this
function! The caller must own the kernel mutex on entry, and the mutex is
owned again on every return path.
@return NULL if committed, else the active transaction */
UNIV_INTERN
trx_t*
row_vers_impl_x_locked_off_kernel(
/*==============================*/
	const rec_t*	rec,	/*!< in: record in a secondary index */
	dict_index_t*	index,	/*!< in: the secondary index */
	const ulint*	offsets)/*!< in: rec_get_offsets(rec, index) */
{
	dict_index_t*	clust_index;
	rec_t*		clust_rec;
	ulint*		clust_offsets;
	rec_t*		version;
	trx_id_t	trx_id;
	mem_heap_t*	heap;
	mem_heap_t*	heap2;
	dtuple_t*	row;
	dtuple_t*	entry	= NULL; /* assignment to eliminate compiler
					warning */
	trx_t*		trx;
	ulint		rec_del;
#ifdef UNIV_DEBUG
	ulint		err;	/* only inspected by ut_ad() checks */
#endif /* UNIV_DEBUG */
	mtr_t		mtr;
	ulint		comp;

	ut_ad(mutex_own(&kernel_mutex));
#ifdef UNIV_SYNC_DEBUG
	ut_ad(!rw_lock_own(&(purge_sys->latch), RW_LOCK_SHARED));
#endif /* UNIV_SYNC_DEBUG */

	mutex_exit(&kernel_mutex);

	mtr_start(&mtr);

	/* Search for the clustered index record: this is a time-consuming
	operation: therefore we release the kernel mutex; also, the release
	is required by the latching order convention. The latch on the
	clustered index locks the top of the stack of versions. We also
	reserve purge_latch to lock the bottom of the version stack. */

	clust_rec = row_get_clust_rec(BTR_SEARCH_LEAF, rec, index,
				      &clust_index, &mtr);
	if (!clust_rec) {
		/* In a rare case it is possible that no clust rec is found
		for a secondary index record: if in row0umod.c
		row_undo_mod_remove_clust_low() we have already removed the
		clust rec, while purge is still cleaning and removing
		secondary index records associated with earlier versions of
		the clustered index record. In that case there cannot be
		any implicit lock on the secondary index record, because
		an active transaction which has modified the secondary index
		record has also modified the clustered index record. And in
		a rollback we always undo the modifications to secondary index
		records before the clustered index record. */

		/* The kernel mutex is re-acquired before returning. No heap
		has been created yet on this path, so there is nothing to
		free. */
		mutex_enter(&kernel_mutex);
		mtr_commit(&mtr);

		return(NULL);
	}

	heap = mem_heap_create(1024);
	clust_offsets = rec_get_offsets(clust_rec, clust_index, NULL,
					ULINT_UNDEFINED, &heap);
	trx_id = row_get_rec_trx_id(clust_rec, clust_index, clust_offsets);

	mtr_s_lock(&(purge_sys->latch), &mtr);

	mutex_enter(&kernel_mutex);

	trx = NULL;
	if (!trx_is_active(trx_id)) {
		/* The transaction that modified or inserted clust_rec is no
		longer active: no implicit lock on rec */
		goto exit_func;
	}

	if (!lock_check_trx_id_sanity(trx_id, clust_rec, clust_index,
				      clust_offsets, TRUE)) {
		/* Corruption noticed: try to avoid a crash by returning */
		goto exit_func;
	}

	comp = page_rec_is_comp(rec);
	ut_ad(index->table == clust_index->table);
	ut_ad(!!comp == dict_table_is_comp(index->table));
	ut_ad(!comp == !page_rec_is_comp(clust_rec));

	/* We look up if some earlier version, which was modified by the trx_id
	transaction, of the clustered index record would require rec to be in
	a different state (delete marked or unmarked, or have different field
	values, or not existing). If there is such a version, then rec was
	modified by the trx_id transaction, and it has an implicit x-lock on
	rec. Note that if clust_rec itself would require rec to be in a
	different state, then the trx_id transaction has not yet had time to
	modify rec, and does not necessarily have an implicit x-lock on rec. */

	rec_del = rec_get_deleted_flag(rec, comp);
	trx = NULL;

	version = clust_rec;

	/* Every 'break' out of this loop happens with the kernel mutex
	owned and with 'heap' pointing to the most recently created heap,
	which is freed at exit_func. */
	for (;;) {
		rec_t*		prev_version;
		ulint		vers_del;
		row_ext_t*	ext;
		trx_id_t	prev_trx_id;

		mutex_exit(&kernel_mutex);

		/* While we retrieve an earlier version of clust_rec, we
		release the kernel mutex, because it may take time to access
		the disk. After the release, we have to check if the trx_id
		transaction is still active. We keep the semaphore in mtr on
		the clust_rec page, so that no other transaction can update
		it and get an implicit x-lock on rec. */

		/* The previous version is built into a fresh heap; the old
		heap may be freed only after the build, because 'version' and
		clust_offsets, which are inputs to the build, live in it. */
		heap2 = heap;
		heap = mem_heap_create(1024);
#ifdef UNIV_DEBUG
		err =
#endif /* UNIV_DEBUG */
		trx_undo_prev_version_build(clust_rec, &mtr, version,
					    clust_index, clust_offsets,
					    heap, &prev_version);
		mem_heap_free(heap2); /* free version and clust_offsets */

		if (prev_version == NULL) {
			mutex_enter(&kernel_mutex);

			if (!trx_is_active(trx_id)) {
				/* Transaction no longer active: no
				implicit x-lock */

				break;
			}

			/* If the transaction is still active,
			clust_rec must be a fresh insert, because no
			previous version was found. */
			ut_ad(err == DB_SUCCESS);

			/* It was a freshly inserted version: there is an
			implicit x-lock on rec */

			trx = trx_get_on_id(trx_id);

			break;
		}

		clust_offsets = rec_get_offsets(prev_version, clust_index,
						NULL, ULINT_UNDEFINED, &heap);

		vers_del = rec_get_deleted_flag(prev_version, comp);
		prev_trx_id = row_get_rec_trx_id(prev_version, clust_index,
						 clust_offsets);

		/* The stack of versions is locked by mtr.  Thus, it
		is safe to fetch the prefixes for externally stored
		columns. */
		row = row_build(ROW_COPY_POINTERS, clust_index, prev_version,
				clust_offsets, NULL, &ext, heap);
		entry = row_build_index_entry(row, ext, index, heap);
		/* entry may be NULL if a record was inserted in place
		of a deleted record, and the BLOB pointers of the new
		record were not initialized yet.  But in that case,
		prev_version should be NULL. */
		ut_a(entry);

		mutex_enter(&kernel_mutex);

		if (!trx_is_active(trx_id)) {
			/* Transaction no longer active: no implicit x-lock */

			break;
		}

		/* If we get here, we know that the trx_id transaction is
		still active and it has modified prev_version. Let us check
		if prev_version would require rec to be in a different
		state. */

		/* The previous version of clust_rec must be
		accessible, because the transaction is still active
		and clust_rec was not a fresh insert. */
		ut_ad(err == DB_SUCCESS);

		/* We check if entry and rec compare equal in the
		alphabetical ordering */
		if (0 == cmp_dtuple_rec(entry, rec, offsets)) {
			/* The delete marks of rec and prev_version should be
			equal for rec to be in the state required by
			prev_version */

			if (rec_del != vers_del) {
				trx = trx_get_on_id(trx_id);

				break;
			}

			/* It is possible that the row was updated so that the
			secondary index record remained the same in
			alphabetical ordering, but the field values changed
			still. For example, 'abc' -> 'ABC'. Check also that. */

			dtuple_set_types_binary(entry,
						dtuple_get_n_fields(entry));
			if (0 != cmp_dtuple_rec(entry, rec, offsets)) {

				trx = trx_get_on_id(trx_id);

				break;
			}
		} else if (!rec_del) {
			/* The delete mark should be set in rec for it to be
			in the state required by prev_version */

			trx = trx_get_on_id(trx_id);

			break;
		}

		if (0 != ut_dulint_cmp(trx_id, prev_trx_id)) {
			/* The versions modified by the trx_id transaction end
			to prev_version: no implicit x-lock */

			break;
		}

		/* prev_version was also written by trx_id: continue with
		the next older version */
		version = prev_version;
	}/* for (;;) */

exit_func:
	/* Reached with the kernel mutex owned; trx is NULL unless one of
	the branches above assigned it from trx_get_on_id(). */
	mtr_commit(&mtr);
	mem_heap_free(heap);

	return(trx);
}
/*****************************************************************//**
Constructs the last committed version of a clustered index record,
which should be seen by a semi-consistent read. The version stack of rec
is walked backwards until a version is found whose writing transaction is
not found by trx_get_on_id(), or is found in state
TRX_COMMITTED_IN_MEMORY or TRX_NOT_STARTED.
@return DB_SUCCESS or DB_MISSING_HISTORY */
UNIV_INTERN
ulint
row_vers_build_for_semi_consistent_read(
/*====================================*/
	const rec_t*	rec,	/*!< in: record in a clustered index; the
				caller must have a latch on the page; this
				latch locks the top of the stack of versions
				of this records */
	mtr_t*		mtr,	/*!< in: mtr holding the latch on rec */
	dict_index_t*	index,	/*!< in: the clustered index */
	ulint**		offsets,/*!< in/out: offsets returned by
				rec_get_offsets(rec, index) */
	mem_heap_t**	offset_heap,/*!< in/out: memory heap from which
				the offsets are allocated */
	mem_heap_t*	in_heap,/*!< in: memory heap from which the memory for
				*old_vers is allocated; memory for possible
				intermediate versions is allocated and freed
				locally within the function */
	const rec_t**	old_vers)/*!< out: rec, old version, or NULL if the
				record does not exist in the view, that is,
				it was freshly inserted afterwards; not
				assigned when an error is returned */
{
	const rec_t*	version;
	mem_heap_t*	heap		= NULL;
	byte*		buf;
	ulint		err;
	trx_id_t	rec_trx_id	= ut_dulint_zero;

	ut_ad(dict_index_is_clust(index));
	ut_ad(mtr_memo_contains_page(mtr, rec, MTR_MEMO_PAGE_X_FIX)
	      || mtr_memo_contains_page(mtr, rec, MTR_MEMO_PAGE_S_FIX));
#ifdef UNIV_SYNC_DEBUG
	ut_ad(!rw_lock_own(&(purge_sys->latch), RW_LOCK_SHARED));
#endif /* UNIV_SYNC_DEBUG */

	ut_ad(rec_offs_validate(rec, index, *offsets));

	rw_lock_s_lock(&(purge_sys->latch));
	/* The S-latch on purge_sys prevents the purge view from
	changing.  Thus, if we have an uncommitted transaction at
	this point, then purge cannot remove its undo log even if
	the transaction could commit now. */

	version = rec;

	for (;;) {
		trx_t*		version_trx;
		mem_heap_t*	heap2;
		rec_t*		prev_version;
		trx_id_t	version_trx_id;

		version_trx_id = row_get_rec_trx_id(version, index, *offsets);
		if (rec == version) {
			/* Remember the trx id of the newest version; it is
			compared against older versions below */
			rec_trx_id = version_trx_id;
		}

		/* The transaction lookup and the state check are done
		under the kernel mutex */
		mutex_enter(&kernel_mutex);
		version_trx = trx_get_on_id(version_trx_id);
		if (version_trx
		    && (version_trx->conc_state == TRX_COMMITTED_IN_MEMORY
			|| version_trx->conc_state == TRX_NOT_STARTED)) {

			/* Treat such a transaction like one that was not
			found at all */
			version_trx = NULL;
		}
		mutex_exit(&kernel_mutex);

		if (!version_trx) {

			/* We found a version that belongs to a
			committed transaction: return it. */

#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
			ut_a(!rec_offs_any_null_extern(version, *offsets));
#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */

			if (rec == version) {
				/* The newest version qualifies: hand back
				rec itself, no copy is made */
				*old_vers = rec;
				err = DB_SUCCESS;
				break;
			}

			/* We assume that a rolled-back transaction stays in
			TRX_ACTIVE state until all the changes have been
			rolled back and the transaction is removed from
			the global list of transactions. */

			if (!ut_dulint_cmp(rec_trx_id, version_trx_id)) {
				/* The transaction was committed while
				we searched for earlier versions.
				Return the current version as a
				semi-consistent read. */

				version = rec;
				*offsets = rec_get_offsets(version,
							   index, *offsets,
							   ULINT_UNDEFINED,
							   offset_heap);
			}

			/* Copy the chosen version into in_heap, so that it
			does not depend on the local heap freed below */
			buf = mem_heap_alloc(in_heap, rec_offs_size(*offsets));
			*old_vers = rec_copy(buf, version, *offsets);
			rec_offs_make_valid(*old_vers, index, *offsets);
			err = DB_SUCCESS;

			break;
		}

		/* Build the next older version into a fresh heap; the heap
		of the previous iteration (NULL on the first one) is freed
		only after the build, because 'version' may live in it. */
		heap2 = heap;
		heap = mem_heap_create(1024);

		err = trx_undo_prev_version_build(rec, mtr, version, index,
						  *offsets, heap,
						  &prev_version);
		if (heap2) {
			mem_heap_free(heap2); /* free version */
		}

		if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
			/* The error code is returned as such */
			break;
		}

		if (prev_version == NULL) {
			/* It was a freshly inserted version */
			*old_vers = NULL;
			err = DB_SUCCESS;

			break;
		}

		version = prev_version;
		*offsets = rec_get_offsets(version, index, *offsets,
					   ULINT_UNDEFINED, offset_heap);
#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
		ut_a(!rec_offs_any_null_extern(version, *offsets));
#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
	}/* for (;;) */

	if (heap) {
		mem_heap_free(heap);
	}
	rw_lock_s_unlock(&(purge_sys->latch));

	return(err);
}
/*************************************************************************
Creates a high-granularity consistent cursor view for a transaction. The
view and the cursor view object are allocated from a heap of their own,
and the view is added to the start of trx_sys->view_list. */
cursor_view_t*
read_cursor_view_create_for_mysql(
/*==============================*/
				/* out: the newly created cursor view */
	trx_t*	cr_trx)		/* in: trx where cursor view is created */
{
	cursor_view_t*	curview;
	read_view_t*	view;
	mem_heap_t*	view_heap;
	trx_t*		other_trx;
	ulint		n_listed	= 0;

	ut_a(cr_trx);

	/* Use larger heap than in trx_create when creating a read_view
	because cursors are quite long. */

	view_heap = mem_heap_create(512);

	curview = (cursor_view_t*) mem_heap_alloc(view_heap,
						  sizeof(cursor_view_t));
	curview->heap = view_heap;

	/* Drop cursor tables from consideration when evaluating the need of
	auto-commit */
	curview->n_mysql_tables_in_use = cr_trx->n_mysql_tables_in_use;
	cr_trx->n_mysql_tables_in_use = 0;

	mutex_enter(&kernel_mutex);

	view = read_view_create_low(UT_LIST_GET_LEN(trx_sys->trx_list),
				    curview->heap);
	curview->read_view = view;

	view->creator_trx_id = cr_trx->id;
	view->type = VIEW_HIGH_GRANULARITY;
	view->undo_no = cr_trx->undo_no;

	/* No future transactions should be visible in the view */

	view->low_limit_no = trx_sys->max_trx_id;
	view->low_limit_id = view->low_limit_no;

	/* No active transaction should be visible */

	for (other_trx = UT_LIST_GET_FIRST(trx_sys->trx_list);
	     other_trx != NULL;
	     other_trx = UT_LIST_GET_NEXT(trx_list, other_trx)) {

		if (other_trx->conc_state != TRX_ACTIVE
		    && other_trx->conc_state != TRX_PREPARED) {

			continue;
		}

		read_view_set_nth_trx_id(view, n_listed, other_trx->id);

		n_listed++;

		/* NOTE that a transaction whose trx number is <
		trx_sys->max_trx_id can still be active, if it is
		in the middle of its commit! Note that when a
		transaction starts, we initialize trx->no to
		ut_dulint_max. */

		if (ut_dulint_cmp(view->low_limit_no, other_trx->no) > 0) {

			view->low_limit_no = other_trx->no;
		}
	}

	view->n_trx_ids = n_listed;

	/* The last active transaction has the smallest id; with no listed
	transactions the upper limit equals the lower limit */
	view->up_limit_id = (n_listed > 0)
		? read_view_get_nth_trx_id(view, n_listed - 1)
		: view->low_limit_id;

	UT_LIST_ADD_FIRST(view_list, trx_sys->view_list, view);

	mutex_exit(&kernel_mutex);

	return(curview);
}
/********************************************************************//**
Removes unnecessary history data from a rollback segment. The history
list is walked starting from its last node; the mini-transaction and the
rseg mutex are released and re-acquired between the logs. */
static
void
trx_purge_truncate_rseg_history(
/*============================*/
	trx_rseg_t*	rseg,		/*!< in: rollback segment */
	trx_id_t	limit_trx_no,	/*!< in: remove update undo logs whose
					trx number is < limit_trx_no */
	undo_no_t	limit_undo_no)	/*!< in: if transaction number is equal
					to limit_trx_no, truncate undo records
					with undo number < limit_undo_no */
{
	fil_addr_t	log_addr;
	fil_addr_t	older_log_addr;
	trx_rsegf_t*	rseg_hdr;
	page_t*		undo_page;
	trx_ulogf_t*	log_hdr;
	trx_usegf_t*	seg_hdr;
	int		trx_no_cmp;
	ibool		free_whole_segment;
	ulint		n_pending_logs	= 0;
	mtr_t		mtr;

	ut_ad(mutex_own(&(purge_sys->mutex)));

	mtr_start(&mtr);
	mutex_enter(&(rseg->mutex));

	rseg_hdr = trx_rsegf_get(rseg->space, rseg->zip_size,
				 rseg->page_no, &mtr);

	log_addr = trx_purge_get_log_from_hist(
		flst_get_last(rseg_hdr + TRX_RSEG_HISTORY, &mtr));

	while (log_addr.page != FIL_NULL) {

		undo_page = trx_undo_page_get(rseg->space, rseg->zip_size,
					      log_addr.page, &mtr);

		log_hdr = undo_page + log_addr.boffset;

		trx_no_cmp = ut_dulint_cmp(
			mach_read_from_8(log_hdr + TRX_UNDO_TRX_NO),
			limit_trx_no);

		if (trx_no_cmp == 0) {
			/* The log of the limit transaction itself: only its
			start is truncated */
			trx_undo_truncate_start(rseg, rseg->space,
						log_addr.page,
						log_addr.boffset,
						limit_undo_no);
		}

		if (trx_no_cmp >= 0) {
			/* The limit was reached: account for the logs
			counted so far, cut them off the history list and
			stop */
			mutex_enter(&kernel_mutex);
			ut_a(trx_sys->rseg_history_len >= n_pending_logs);
			trx_sys->rseg_history_len -= n_pending_logs;
			mutex_exit(&kernel_mutex);

			flst_truncate_end(rseg_hdr + TRX_RSEG_HISTORY,
					  log_hdr + TRX_UNDO_HISTORY_NODE,
					  n_pending_logs, &mtr);

			break;
		}

		older_log_addr = trx_purge_get_log_from_hist(
			flst_get_prev_addr(log_hdr + TRX_UNDO_HISTORY_NODE,
					   &mtr));
		n_pending_logs++;

		seg_hdr = undo_page + TRX_UNDO_SEG_HDR;

		/* The page has to be inspected before the mtr is
		committed */
		free_whole_segment
			= (mach_read_from_2(seg_hdr + TRX_UNDO_STATE)
			   == TRX_UNDO_TO_PURGE)
			&& (mach_read_from_2(log_hdr + TRX_UNDO_NEXT_LOG)
			    == 0);

		mutex_exit(&(rseg->mutex));
		mtr_commit(&mtr);

		if (free_whole_segment) {
			/* We can free the whole log segment */

			trx_purge_free_segment(rseg, log_addr,
					       n_pending_logs);

			n_pending_logs = 0;
		}

		/* Start over with a fresh mtr for the next older log */
		mtr_start(&mtr);
		mutex_enter(&(rseg->mutex));

		rseg_hdr = trx_rsegf_get(rseg->space, rseg->zip_size,
					 rseg->page_no, &mtr);

		log_addr = older_log_addr;
	}

	mutex_exit(&(rseg->mutex));
	mtr_commit(&mtr);
}
/***********************************************************************
Flushes a batch of blocks, picked from the end of the LRU list or of the
flush list depending on flush_type. Only one batch of a given type runs
at a time: buf_pool->init_flush[flush_type] is set for the duration of the
block selection, and the call is refused if a batch of the same type is
already in progress. After the selection loop, buf_pool->no_flush is
signalled if no flush of the type remains pending, the buffered writes
are flushed and srv_buf_pool_flushed is increased by the page count. */
ulint
buf_flush_batch(
/*============*/
				/* out: number of blocks for which the write
				request was queued; ULINT_UNDEFINED if there
				was a flush of the same type already running */
	ulint	flush_type,	/* in: BUF_FLUSH_LRU or BUF_FLUSH_LIST; if
				BUF_FLUSH_LIST, then the caller must not own
				any latches on pages */
	ulint	min_n,		/* in: wished minimum number of blocks flushed
				(it is not guaranteed that the actual number
				is that big, though) */
	dulint	lsn_limit)	/* in the case BUF_FLUSH_LIST all blocks whose
				oldest_modification is smaller than this
				should be flushed (if their number does not
				exceed min_n), otherwise ignored */
{
	buf_block_t*	block;
	ulint		page_count	= 0;
	/* NOTE(review): old_page_count is assigned below but only read by
	the commented-out debug print; it looks removable - confirm. */
	ulint		old_page_count;
	ulint		space;
	ulint		offset;
	ibool		found;

	ut_ad((flush_type == BUF_FLUSH_LRU)
	      || (flush_type == BUF_FLUSH_LIST));
#ifdef UNIV_SYNC_DEBUG
	ut_ad((flush_type != BUF_FLUSH_LIST)
	      || sync_thread_levels_empty_gen(TRUE));
#endif /* UNIV_SYNC_DEBUG */
	mutex_enter(&(buf_pool->mutex));

	if ((buf_pool->n_flush[flush_type] > 0)
	    || (buf_pool->init_flush[flush_type] == TRUE)) {

		/* There is already a flush batch of the same type running */

		mutex_exit(&(buf_pool->mutex));

		return(ULINT_UNDEFINED);
	}

	/* Mark the batch as being set up; cleared after the loop below */
	(buf_pool->init_flush)[flush_type] = TRUE;

	for (;;) {
		/* If we have flushed enough, leave the loop */
		if (page_count >= min_n) {

			break;
		}

		/* Start from the end of the list looking for a suitable
		block to be flushed. */

		if (flush_type == BUF_FLUSH_LRU) {
			block = UT_LIST_GET_LAST(buf_pool->LRU);
		} else {
			ut_ad(flush_type == BUF_FLUSH_LIST);

			block = UT_LIST_GET_LAST(buf_pool->flush_list);
			if (!block
			    || (ut_dulint_cmp(block->oldest_modification,
					      lsn_limit) >= 0)) {
				/* We have flushed enough */

				break;
			}
		}

		found = FALSE;

		/* Note that after finding a single flushable page, we try to
		flush also all its neighbors, and after that start from the
		END of the LRU list or flush list again: the list may change
		during the flushing and we cannot safely preserve within this
		function a pointer to a block in the list! */

		while ((block != NULL) && !found) {
			ut_a(block->state == BUF_BLOCK_FILE_PAGE);

			mutex_enter(&block->mutex);

			if (buf_flush_ready_for_flush(block, flush_type)) {

				found = TRUE;
				/* Copy the page address while the block
				mutex is still held; the block pointer is
				not used after the mutexes are released */
				space = block->space;
				offset = block->offset;

				mutex_exit(&block->mutex);
				mutex_exit(&(buf_pool->mutex));

				old_page_count = page_count;

				/* Try to flush also all the neighbors */
				page_count += buf_flush_try_neighbors(
					space, offset, flush_type);
				/* fprintf(stderr,
				"Flush type %lu, page no %lu, neighb %lu\n",
				flush_type, offset,
				page_count - old_page_count); */

				/* The pool mutex is needed again for the
				next pass over the list */
				mutex_enter(&(buf_pool->mutex));

			} else if (flush_type == BUF_FLUSH_LRU) {

				mutex_exit(&block->mutex);

				block = UT_LIST_GET_PREV(LRU, block);
			} else {
				ut_ad(flush_type == BUF_FLUSH_LIST);

				mutex_exit(&block->mutex);

				block = UT_LIST_GET_PREV(flush_list, block);
			}
		}

		/* If we could not find anything to flush, leave the loop */

		if (!found) {
			break;
		}
	}

	(buf_pool->init_flush)[flush_type] = FALSE;

	if ((buf_pool->n_flush[flush_type] == 0)
	    && (buf_pool->init_flush[flush_type] == FALSE)) {

		/* The running flush batch has ended */

		os_event_set(buf_pool->no_flush[flush_type]);
	}

	mutex_exit(&(buf_pool->mutex));

	buf_flush_buffered_writes();

#ifdef UNIV_DEBUG
	if (buf_debug_prints && page_count > 0) {
		ut_a(flush_type == BUF_FLUSH_LRU
		     || flush_type == BUF_FLUSH_LIST);
		fprintf(stderr, flush_type == BUF_FLUSH_LRU
			? "Flushed %lu pages in LRU flush\n"
			: "Flushed %lu pages in flush list flush\n",
			(ulong) page_count);
	}
#endif /* UNIV_DEBUG */

	srv_buf_pool_flushed += page_count;

	return(page_count);
}
/**********************************************************************//**
Reports in the undo log of an update or delete marking of a clustered index
record.

The record is appended at the first free byte of the undo page. Before each
piece is written, the space left on the page is checked; if it is too small
the function returns 0 immediately. The page header field
TRX_UNDO_PAGE_FREE and the next-record pointer are written only at the very
end, after every piece has fitted. Note, however, that
trx->update_undo->del_marks may already have been set to TRUE when 0 is
returned.
@return byte offset of the inserted undo log entry on the page if
succeed, 0 if fail */
static
ulint
trx_undo_page_report_modify(
/*========================*/
	page_t*		undo_page,	/*!< in: undo log page */
	trx_t*		trx,		/*!< in: transaction */
	dict_index_t*	index,		/*!< in: clustered index where update or
					delete marking is done */
	const rec_t*	rec,		/*!< in: clustered index record which
					has NOT yet been modified */
	const ulint*	offsets,	/*!< in: rec_get_offsets(rec, index) */
	const upd_t*	update,		/*!< in: update vector which tells the
					columns to be updated; in the case of
					a delete, this should be set to NULL */
	ulint		cmpl_info,	/*!< in: compiler info on secondary
					index updates */
	mtr_t*		mtr)		/*!< in: mtr */
{
	dict_table_t*	table;
	ulint		first_free;
	byte*		ptr;
	const byte*	field;
	ulint		flen;
	ulint		col_no;
	ulint		type_cmpl;
	byte*		type_cmpl_ptr;
	ulint		i;
	trx_id_t	trx_id;
	ibool		ignore_prefix = FALSE;
	/* Scratch buffer handed to trx_undo_page_report_modify_ext() when
	the prefix of an externally stored column is wanted */
	byte		ext_buf[REC_MAX_INDEX_COL_LEN
				+ BTR_EXTERN_FIELD_REF_SIZE];

	ut_a(dict_index_is_clust(index));
	ut_ad(rec_offs_validate(rec, index, offsets));
	ut_ad(mach_read_from_2(undo_page + TRX_UNDO_PAGE_HDR
			       + TRX_UNDO_PAGE_TYPE) == TRX_UNDO_UPDATE);
	table = index->table;

	/* The new record starts at the first free byte of the page */
	first_free = mach_read_from_2(undo_page + TRX_UNDO_PAGE_HDR
				      + TRX_UNDO_PAGE_FREE);
	ptr = undo_page + first_free;

	ut_ad(first_free <= UNIV_PAGE_SIZE);

	if (trx_undo_left(undo_page, ptr) < 50) {

		/* NOTE: the value 50 must be big enough so that the general
		fields written below fit on the undo log page */

		return(0);
	}

	/* Reserve 2 bytes for the pointer to the next undo log record;
	it is filled in at the end of this function */

	ptr += 2;

	/* Store first some general parameters to the undo log */

	if (!update) {
		type_cmpl = TRX_UNDO_DEL_MARK_REC;
	} else if (rec_get_deleted_flag(rec, dict_table_is_comp(table))) {
		type_cmpl = TRX_UNDO_UPD_DEL_REC;
		/* We are about to update a delete marked record.
		We don't typically need the prefix in this case unless
		the delete marking is done by the same transaction
		(which we check below). */
		ignore_prefix = TRUE;
	} else {
		type_cmpl = TRX_UNDO_UPD_EXIST_REC;
	}

	type_cmpl |= cmpl_info * TRX_UNDO_CMPL_INFO_MULT;
	/* Remember where the type byte is: TRX_UNDO_UPD_EXTERN may be
	or'ed into it later */
	type_cmpl_ptr = ptr;

	*ptr++ = (byte) type_cmpl;
	ptr += mach_dulint_write_much_compressed(ptr, trx->undo_no);

	ptr += mach_dulint_write_much_compressed(ptr, table->id);

	/*----------------------------------------*/
	/* Store the state of the info bits */

	*ptr++ = (byte) rec_get_info_bits(rec, dict_table_is_comp(table));

	/* Store the values of the system columns */
	field = rec_get_nth_field(rec, offsets,
				  dict_index_get_sys_col_pos(
					  index, DATA_TRX_ID), &flen);
	ut_ad(flen == DATA_TRX_ID_LEN);

	trx_id = trx_read_trx_id(field);

	/* If it is an update of a delete marked record, then we are
	allowed to ignore blob prefixes if the delete marking was done
	by some other trx as it must have committed by now for us to
	allow an over-write. */
	if (ignore_prefix) {
		ignore_prefix = ut_dulint_cmp(trx_id, trx->id) != 0;
	}
	ptr += mach_dulint_write_compressed(ptr, trx_id);

	field = rec_get_nth_field(rec, offsets,
				  dict_index_get_sys_col_pos(
					  index, DATA_ROLL_PTR), &flen);
	ut_ad(flen == DATA_ROLL_PTR_LEN);

	ptr += mach_dulint_write_compressed(ptr, trx_read_roll_ptr(field));

	/*----------------------------------------*/
	/* Store then the fields required to uniquely determine the
	record which will be modified in the clustered index */

	for (i = 0; i < dict_index_get_n_unique(index); i++) {

		field = rec_get_nth_field(rec, offsets, i, &flen);

		/* The ordering columns must not be stored externally. */
		ut_ad(!rec_offs_nth_extern(offsets, i));
		ut_ad(dict_index_get_nth_col(index, i)->ord_part);

		/* Room for the compressed length */
		if (trx_undo_left(undo_page, ptr) < 5) {

			return(0);
		}

		ptr += mach_write_compressed(ptr, flen);

		/* An SQL NULL value has a length marker only, no data */
		if (flen != UNIV_SQL_NULL) {
			if (trx_undo_left(undo_page, ptr) < flen) {

				return(0);
			}

			ut_memcpy(ptr, field, flen);
			ptr += flen;
		}
	}

	/*----------------------------------------*/
	/* Save to the undo log the old values of the columns to be updated. */

	if (update) {
		if (trx_undo_left(undo_page, ptr) < 5) {

			return(0);
		}

		ptr += mach_write_compressed(ptr, upd_get_n_fields(update));

		for (i = 0; i < upd_get_n_fields(update); i++) {

			ulint	pos = upd_get_nth_field(update, i)->field_no;

			/* Write field number to undo log */
			if (trx_undo_left(undo_page, ptr) < 5) {

				return(0);
			}

			ptr += mach_write_compressed(ptr, pos);

			/* Save the old value of field */
			field = rec_get_nth_field(rec, offsets, pos, &flen);

			if (trx_undo_left(undo_page, ptr) < 15) {

				return(0);
			}

			if (rec_offs_nth_extern(offsets, pos)) {
				/* ext_buf is passed only for an ordering
				column whose prefix is not ignored and whose
				local length is below REC_MAX_INDEX_COL_LEN;
				otherwise NULL is passed. field and flen may
				be changed by the call. */
				ptr = trx_undo_page_report_modify_ext(
					ptr,
					dict_index_get_nth_col(index, pos)
					->ord_part
					&& !ignore_prefix
					&& flen < REC_MAX_INDEX_COL_LEN
					? ext_buf : NULL,
					dict_table_zip_size(table),
					&field, &flen);

				/* Notify purge that it eventually has
				to free the old externally stored field */

				trx->update_undo->del_marks = TRUE;

				*type_cmpl_ptr |= TRX_UNDO_UPD_EXTERN;
			} else {
				ptr += mach_write_compressed(ptr, flen);
			}

			if (flen != UNIV_SQL_NULL) {
				if (trx_undo_left(undo_page, ptr) < flen) {

					return(0);
				}

				ut_memcpy(ptr, field, flen);
				ptr += flen;
			}
		}
	}

	/*----------------------------------------*/
	/* In the case of a delete marking, and also in the case of an update
	where any ordering field of any index changes, store the values of all
	columns which occur as ordering fields in any index. This info is used
	in the purge of old versions where we use it to build and search the
	delete marked index records, to look if we can remove them from the
	index tree.
	Note that starting from 4.0.14 also externally stored fields can
	be ordering in some index. Starting from 5.2, we no longer store
	REC_MAX_INDEX_COL_LEN first bytes to the undo log record,
	but we can construct the column prefix fields in the index by
	fetching the first page of the BLOB that is pointed to by the
	clustered index. This works also in crash recovery, because all pages
	(including BLOBs) are recovered before anything is rolled back. */

	if (!update || !(cmpl_info & UPD_NODE_NO_ORD_CHANGE)) {
		/* Start of this section; its 2-byte length is written
		here once the section is complete */
		byte*	old_ptr = ptr;

		trx->update_undo->del_marks = TRUE;

		if (trx_undo_left(undo_page, ptr) < 5) {

			return(0);
		}

		/* Reserve 2 bytes to write the number of bytes the stored
		fields take in this undo record */

		ptr += 2;

		for (col_no = 0; col_no < dict_table_get_n_cols(table);
		     col_no++) {

			const dict_col_t*	col
				= dict_table_get_nth_col(table, col_no);

			if (col->ord_part) {
				ulint	pos;

				/* Write field number to undo log */
				if (trx_undo_left(undo_page, ptr) < 5 + 15) {

					return(0);
				}

				pos = dict_index_get_nth_col_pos(index,
								 col_no);
				ptr += mach_write_compressed(ptr, pos);

				/* Save the old value of field */
				field = rec_get_nth_field(rec, offsets, pos,
							  &flen);

				if (rec_offs_nth_extern(offsets, pos)) {
					ptr = trx_undo_page_report_modify_ext(
						ptr,
						flen < REC_MAX_INDEX_COL_LEN
						&& !ignore_prefix
						? ext_buf : NULL,
						dict_table_zip_size(table),
						&field, &flen);
				} else {
					ptr += mach_write_compressed(
						ptr, flen);
				}

				if (flen != UNIV_SQL_NULL) {
					if (trx_undo_left(undo_page, ptr)
					    < flen) {

						return(0);
					}

					ut_memcpy(ptr, field, flen);
					ptr += flen;
				}
			}
		}

		/* The stored length includes the 2 reserved bytes */
		mach_write_to_2(old_ptr, ptr - old_ptr);
	}

	/*----------------------------------------*/
	/* Write pointers to the previous and the next undo log records */
	if (trx_undo_left(undo_page, ptr) < 2) {

		return(0);
	}

	/* The record ends with the page offset of its own start */
	mach_write_to_2(ptr, first_free);
	ptr += 2;
	/* The 2 bytes reserved at the start of the record get the page
	offset of the byte following this record */
	mach_write_to_2(undo_page + first_free, ptr - undo_page);

	/* The same offset becomes the new first free byte of the page */
	mach_write_to_2(undo_page + TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE,
			ptr - undo_page);

	/* Write to the REDO log about this change in the UNDO log */

	trx_undof_page_add_undo_rec_log(undo_page, first_free,
					ptr - undo_page, mtr);
	return(first_free);
}
/**************************************************************************
Prints a human-readable description of a transaction to a stream: its id
and state, the owning thread, lock and undo log statistics and, when a
MySQL thread handle is attached, the information printed by
innobase_mysql_print_thd(). */

void
trx_print(
/*======*/
	FILE*	f,		/* in: output stream */
	trx_t*	trx,		/* in: transaction */
	ulint	max_query_len)	/* in: max query length to print, or 0 to
				use the default max length */
{
	ibool	end_line;
	ulint	n_lock_structs;
	ulint	lock_heap_size;

	/* First line: identity and state */

	fprintf(f, "TRANSACTION %lu %lu",
		(ulong) ut_dulint_get_high(trx->id),
		(ulong) ut_dulint_get_low(trx->id));

	switch (trx->conc_state) {
	case TRX_NOT_STARTED:
		fputs(", not started", f);
		break;
	case TRX_ACTIVE:
		fprintf(f, ", ACTIVE %lu sec",
			(ulong)difftime(time(NULL), trx->start_time));
		break;
	case TRX_PREPARED:
		fprintf(f, ", ACTIVE (PREPARED) %lu sec",
			(ulong)difftime(time(NULL), trx->start_time));
		break;
	case TRX_COMMITTED_IN_MEMORY:
		fputs(", COMMITTED IN MEMORY", f);
		break;
	default:
		fprintf(f, " state %lu", (ulong) trx->conc_state);
		break;
	}

#ifdef UNIV_LINUX
	fprintf(f, ", process no %lu", trx->mysql_process_no);
#endif
	fprintf(f, ", OS thread id %lu",
		(ulong) os_thread_pf(trx->mysql_thread_id));

	if (trx->op_info[0] != '\0') {
		fprintf(f, " %s", trx->op_info);
	}

	if (trx->is_purge) {
		fputs(" purge trx", f);
	}

	if (trx->declared_to_be_inside_innodb) {
		fprintf(f, ", thread declared inside InnoDB %lu",
			(ulong) trx->n_tickets_to_enter_innodb);
	}

	putc('\n', f);

	/* Optional line: MySQL table usage */

	if (trx->n_mysql_tables_in_use > 0
	    || trx->mysql_n_tables_locked > 0) {
		fprintf(f, "mysql tables in use %lu, locked %lu\n",
			(ulong) trx->n_mysql_tables_in_use,
			(ulong) trx->mysql_n_tables_locked);
	}

	/* Optional line: query state, locks, latch and undo info. A
	running transaction prints nothing for its state, so the line
	is ended only if one of the later items is printed. */

	end_line = (trx->que_state != TRX_QUE_RUNNING);

	switch (trx->que_state) {
	case TRX_QUE_RUNNING:
		break;
	case TRX_QUE_LOCK_WAIT:
		fputs("LOCK WAIT ", f);
		break;
	case TRX_QUE_ROLLING_BACK:
		fputs("ROLLING BACK ", f);
		break;
	case TRX_QUE_COMMITTING:
		fputs("COMMITTING ", f);
		break;
	default:
		fprintf(f, "que state %lu ", (ulong) trx->que_state);
		break;
	}

	n_lock_structs = UT_LIST_GET_LEN(trx->trx_locks);
	lock_heap_size = mem_heap_get_size(trx->lock_heap);

	if (n_lock_structs > 0 || lock_heap_size > 400) {
		end_line = TRUE;

		fprintf(f, "%lu lock struct(s), heap size %lu,"
			" %lu row lock(s)",
			(ulong) n_lock_structs,
			(ulong) lock_heap_size,
			(ulong) lock_number_of_rows_locked(trx));
	}

	if (trx->has_search_latch) {
		end_line = TRUE;
		fputs(", holds adaptive hash latch", f);
	}

	if (ut_dulint_cmp(trx->undo_no, ut_dulint_zero) != 0) {
		end_line = TRUE;
		fprintf(f, ", undo log entries %lu",
			(ulong) ut_dulint_get_low(trx->undo_no));
	}

	if (end_line) {
		putc('\n', f);
	}

	if (trx->mysql_thd != NULL) {
		innobase_mysql_print_thd(f, trx->mysql_thd, max_query_len);
	}
}
/***********************************************************************//** Chooses the next undo log to purge and updates the info in purge_sys. This function is used to initialize purge_sys when the next record to purge is not known, and also to update the purge system info on the next record when purge has handled the whole undo log for a transaction. */ static void trx_purge_choose_next_log(void) /*===========================*/ { trx_undo_rec_t* rec; trx_rseg_t* rseg; trx_rseg_t* min_rseg; trx_id_t min_trx_no; ulint space = 0; /* remove warning (??? bug ???) */ ulint zip_size = 0; ulint page_no = 0; /* remove warning (??? bug ???) */ ulint offset = 0; /* remove warning (??? bug ???) */ mtr_t mtr; ut_ad(mutex_own(&(purge_sys->mutex))); ut_ad(purge_sys->next_stored == FALSE); rseg = UT_LIST_GET_FIRST(trx_sys->rseg_list); min_trx_no = ut_dulint_max; min_rseg = NULL; while (rseg) { mutex_enter(&(rseg->mutex)); if (rseg->last_page_no != FIL_NULL) { if ((min_rseg == NULL) || (ut_dulint_cmp(min_trx_no, rseg->last_trx_no) > 0)) { min_rseg = rseg; min_trx_no = rseg->last_trx_no; space = rseg->space; zip_size = rseg->zip_size; ut_a(space == 0); /* We assume in purge of externally stored fields that space id == 0 */ page_no = rseg->last_page_no; offset = rseg->last_offset; } } mutex_exit(&(rseg->mutex)); rseg = UT_LIST_GET_NEXT(rseg_list, rseg); } if (min_rseg == NULL) { return; } mtr_start(&mtr); if (!min_rseg->last_del_marks) { /* No need to purge this log */ rec = &trx_purge_dummy_rec; } else { rec = trx_undo_get_first_rec(space, zip_size, page_no, offset, RW_S_LATCH, &mtr); if (rec == NULL) { /* Undo log empty */ rec = &trx_purge_dummy_rec; } } purge_sys->next_stored = TRUE; purge_sys->rseg = min_rseg; purge_sys->hdr_page_no = page_no; purge_sys->hdr_offset = offset; purge_sys->purge_trx_no = min_trx_no; if (rec == &trx_purge_dummy_rec) { purge_sys->purge_undo_no = ut_dulint_zero; purge_sys->page_no = page_no; purge_sys->offset = 0; } else { purge_sys->purge_undo_no 
= trx_undo_rec_get_undo_no(rec); purge_sys->page_no = page_get_page_no(page_align(rec)); purge_sys->offset = page_offset(rec); } mtr_commit(&mtr); }
/*******************************************************************
Deletes the clustered index record the undo node's pcur is positioned on.
An optimistic delete is tried first; if it fails, a pessimistic delete is
retried a limited number of times while it reports lack of file space.
When the record belongs to SYS_INDEXES, the index tree it describes is
dropped first. On return the pcur is detached. */
static
ulint
row_undo_ins_remove_clust_rec(
/*==========================*/
				/* out: DB_SUCCESS or DB_OUT_OF_FILE_SPACE */
	undo_node_t*	node,	/* in: undo node */
	que_thr_t*	thr)	/* in: query thread */
{
	btr_cur_t*	btr_cur;
	ibool		ok;
	ulint		err;
	ulint		n_retries	= 0;
	mtr_t		mtr;

	UT_NOT_USED(thr);

	mtr_start(&mtr);

	ok = btr_pcur_restore_position(BTR_MODIFY_LEAF, &(node->pcur),
				       &mtr);
	ut_a(ok);

	if (ut_dulint_cmp(node->table->id, DICT_INDEXES_ID) == 0) {

		/* The row lives in SYS_INDEXES: drop the index tree it
		refers to, then reposition in a fresh mini-transaction */

		dict_drop_index_tree(btr_pcur_get_rec(&(node->pcur)), &mtr);

		mtr_commit(&mtr);

		mtr_start(&mtr);

		ok = btr_pcur_restore_position(BTR_MODIFY_LEAF,
					       &(node->pcur), &mtr);
		ut_a(ok);
	}

	btr_cur = btr_pcur_get_btr_cur(&(node->pcur));

	ok = btr_cur_optimistic_delete(btr_cur, &mtr);

	btr_pcur_commit_specify_mtr(&(node->pcur), &mtr);

	if (ok) {
		trx_undo_rec_release(node->trx, node->undo_no);

		return(DB_SUCCESS);
	}

	/* The optimistic attempt failed: fall back to a pessimistic
	descent to the tree */

	for (;;) {
		mtr_start(&mtr);

		ok = btr_pcur_restore_position(BTR_MODIFY_TREE,
					       &(node->pcur), &mtr);
		ut_a(ok);

		btr_cur_pessimistic_delete(&err, FALSE, btr_cur, TRUE, &mtr);

		/* The delete operation may fail if we have little
		file space left: TODO: easiest to crash the database
		and restart with more file space */

		if (err != DB_OUT_OF_FILE_SPACE
		    || n_retries >= BTR_CUR_RETRY_DELETE_N_TIMES) {

			break;
		}

		btr_pcur_commit_specify_mtr(&(node->pcur), &mtr);

		n_retries++;

		os_thread_sleep(BTR_CUR_RETRY_SLEEP_TIME);
	}

	btr_pcur_commit_specify_mtr(&(node->pcur), &mtr);

	trx_undo_rec_release(node->trx, node->undo_no);

	return(err);
}
/********************************************************************//**
Hands out the next undo log record of the history list to be purged. The
record must later be released with the corresponding release function.
Purge is put into the TRX_STOP_PURGE state, and NULL is returned, when no
logs are left, when the page handling limit has been reached, or when the
next transaction number is not below the low limit of the purge view.
@return copy of an undo log record or pointer to trx_purge_dummy_rec,
if the whole undo log can skipped in purge; NULL if none left */
UNIV_INTERN
trx_undo_rec_t*
trx_purge_fetch_next_rec(
/*=====================*/
	roll_ptr_t*	roll_ptr,/*!< out: roll pointer to undo record */
	trx_undo_inf_t** cell,	/*!< out: storage cell for the record in the
				purge array */
	mem_heap_t*	heap)	/*!< in: memory heap where copied */
{
	trx_undo_rec_t*	next_rec;

	mutex_enter(&(purge_sys->mutex));

	if (purge_sys->state == TRX_STOP_PURGE) {
		/* Purge has already been stopped */

		goto truncate_and_exit;
	}

	if (!purge_sys->next_stored) {
		trx_purge_choose_next_log();

		if (!purge_sys->next_stored) {
			/* The history list is exhausted */

			purge_sys->state = TRX_STOP_PURGE;

			trx_purge_truncate_if_arr_empty();

			if (srv_print_thread_releases) {
				fprintf(stderr,
					"Purge: No logs left in the"
					" history list; pages handled %lu\n",
					(ulong) purge_sys->n_pages_handled);
			}

			mutex_exit(&(purge_sys->mutex));

			return(NULL);
		}
	}

	if (purge_sys->n_pages_handled >= purge_sys->handle_limit
	    || ut_dulint_cmp(purge_sys->purge_trx_no,
			     purge_sys->view->low_limit_no) >= 0) {

		/* Either enough pages were handled in this batch or
		the next log is not yet old enough for the purge view */

		purge_sys->state = TRX_STOP_PURGE;

		goto truncate_and_exit;
	}

	*roll_ptr = trx_undo_build_roll_ptr(FALSE, (purge_sys->rseg)->id,
					    purge_sys->page_no,
					    purge_sys->offset);

	*cell = trx_purge_arr_store_info(purge_sys->purge_trx_no,
					 purge_sys->purge_undo_no);

	ut_ad(ut_dulint_cmp(purge_sys->purge_trx_no,
			    (purge_sys->view)->low_limit_no) < 0);

	/* purge_trx_no and purge_undo_no had to be stored above,
	because the call below advances them */

	next_rec = trx_purge_get_next_rec(heap);

	mutex_exit(&(purge_sys->mutex));

	return(next_rec);

truncate_and_exit:
	trx_purge_truncate_if_arr_empty();

	mutex_exit(&(purge_sys->mutex));

	return(NULL);
}
/***********************************************************************
Truncates the index tree described by a record of the SYS_INDEXES table.
All pages of the tree except the root are freed first. The root is then
freed and FIL_NULL is written to the PAGE_NO field of the record, the
mini-transaction is committed and restarted, and a new tree is created
with btr_create(). The root page number returned by btr_create() is
stored in the in-memory index object, if one with the same id is found
in the table, and is returned to the caller. */

ulint
dict_truncate_index_tree(
/*=====================*/
				/* out: new root page number, or
				FIL_NULL on failure */
	dict_table_t*	table,	/* in: the table the index belongs to */
	btr_pcur_t*	pcur,	/* in/out: persistent cursor pointing to
				record in the clustered index of
				SYS_INDEXES table. The cursor may be
				repositioned in this call. */
	mtr_t*		mtr)	/* in: mtr having the latch
				on the record page. The mtr may be
				committed and restarted in this call. */
{
	ulint		root_page_no;
	ulint		space;
	ulint		type;
	dulint		index_id;
	rec_t*		rec;
	byte*		ptr;
	ulint		len;
	ulint		comp;
	dict_index_t*	index;

	ut_ad(mutex_own(&(dict_sys->mutex)));

	ut_a(!dict_table_is_comp(dict_sys->sys_indexes));
	rec = btr_pcur_get_rec(pcur);
	ptr = rec_get_nth_field_old(rec, DICT_SYS_INDEXES_PAGE_NO_FIELD, &len);

	ut_ad(len == 4);

	root_page_no = mtr_read_ulint(ptr, MLOG_4BYTES, mtr);

	if (root_page_no == FIL_NULL) {
		/* The tree has been freed. */

		ut_print_timestamp(stderr);
		fprintf(stderr, "  InnoDB: Trying to TRUNCATE"
			" a missing index of table %s!\n", table->name);
		return(FIL_NULL);
	}

	ptr = rec_get_nth_field_old(rec,
				    DICT_SYS_INDEXES_SPACE_NO_FIELD, &len);

	ut_ad(len == 4);

	space = mtr_read_ulint(ptr, MLOG_4BYTES, mtr);

	if (!fil_tablespace_exists_in_mem(space)) {
		/* It is a single table tablespace and the .ibd file is
		missing: do nothing */

		ut_print_timestamp(stderr);
		fprintf(stderr, "  InnoDB: Trying to TRUNCATE"
			" a missing .ibd file of table %s!\n", table->name);
		return(FIL_NULL);
	}

	ptr = rec_get_nth_field_old(rec,
				    DICT_SYS_INDEXES_TYPE_FIELD, &len);
	ut_ad(len == 4);
	type = mach_read_from_4(ptr);

	/* Field 1 of the record holds the 8-byte index id */
	ptr = rec_get_nth_field_old(rec, 1, &len);
	ut_ad(len == 8);
	index_id = mach_read_from_8(ptr);

	/* We free all the pages but the root page first; this operation
	may span several mini-transactions */

	btr_free_but_not_root(space, root_page_no);

	/* Then we free the root page in the same mini-transaction where
	we create the b-tree and write its new root page number to the
	appropriate field in the SYS_INDEXES record: this mini-transaction
	marks the B-tree totally truncated */

	/* The page format flag of the old root is read before the root
	is freed; it is passed on to btr_create() below */
	comp = page_is_comp(btr_page_get(space, root_page_no, RW_X_LATCH,
					 mtr));

	btr_free_root(space, root_page_no, mtr);

	/* We will temporarily write FIL_NULL to the PAGE_NO field
	in SYS_INDEXES, so that the database will not get into an
	inconsistent state in case it crashes between the mtr_commit()
	below and the following mtr_commit() call. */
	page_rec_write_index_page_no(rec, DICT_SYS_INDEXES_PAGE_NO_FIELD,
				     FIL_NULL, mtr);

	/* We will need to commit the mini-transaction in order to avoid
	deadlocks in the btr_create() call, because otherwise we would
	be freeing and allocating pages in the same mini-transaction. */
	btr_pcur_store_position(pcur, mtr);
	mtr_commit(mtr);

	mtr_start(mtr);
	btr_pcur_restore_position(BTR_MODIFY_LEAF, pcur, mtr);

	/* Find the index corresponding to this SYS_INDEXES record. */
	for (index = UT_LIST_GET_FIRST(table->indexes);
	     index;
	     index = UT_LIST_GET_NEXT(indexes, index)) {
		if (!ut_dulint_cmp(index->id, index_id)) {
			break;
		}
	}

	root_page_no = btr_create(type, space, index_id, comp, mtr);

	if (index) {
		index->page = (unsigned int) root_page_no;
	} else {
		/* The tree was created, but there is no in-memory index
		object to update. The values are cast to ulong because
		they are printed with %lu. */
		ut_print_timestamp(stderr);
		fprintf(stderr,
			"  InnoDB: Index %lu %lu of table %s is missing\n"
			"InnoDB: from the data dictionary during TRUNCATE!\n",
			(ulong) ut_dulint_get_high(index_id),
			(ulong) ut_dulint_get_low(index_id),
			table->name);
	}

	return(root_page_no);
}
/*************************************************************************
Makes a copy of the last read view in the trx_sys view list, or opens a
new view with read_view_open_now() if the list is empty. In the copy, the
creator trx id of the old view, unless it is (0, 0), is inserted into the
descending array of trx ids. The copy is appended to the view list. */

read_view_t*
read_view_oldest_copy_or_open_new(
/*==============================*/
					/* out, own: read view struct */
	dulint		cr_trx_id,	/* in: trx_id of creating
					transaction, or (0, 0) used in
					purge */
	mem_heap_t*	heap)		/* in: memory heap from which
					allocated */
{
	read_view_t*	old_view;
	read_view_t*	view_copy;
	ibool		creator_pending;
	ulint		n_ids;
	ulint		src;
	ulint		dst;

	ut_ad(mutex_own(&kernel_mutex));

	old_view = UT_LIST_GET_LAST(trx_sys->view_list);

	if (old_view == NULL) {
		/* There is nothing to copy */

		return(read_view_open_now(cr_trx_id, heap));
	}

	/* The creator of the old view takes one extra slot, unless the
	creator id is (0, 0) */

	creator_pending = (ut_dulint_cmp(old_view->creator_trx_id,
					 ut_dulint_create(0,0)) != 0);

	n_ids = old_view->n_trx_ids;

	if (creator_pending) {
		n_ids++;
	}

	view_copy = read_view_create_low(n_ids, heap);

	/* Copy the descending id array, slipping the creator id into
	the first slot where it is larger than the id at the read
	position, or into the last slot if no such position exists */

	src = 0;

	for (dst = 0; dst < n_ids; dst++) {

		if (creator_pending
		    && (src >= old_view->n_trx_ids
			|| ut_dulint_cmp(old_view->creator_trx_id,
					 read_view_get_nth_trx_id(
						 old_view, src)) > 0)) {

			read_view_set_nth_trx_id(view_copy, dst,
						 old_view->creator_trx_id);
			creator_pending = FALSE;
		} else {
			read_view_set_nth_trx_id(view_copy, dst,
						 read_view_get_nth_trx_id(
							 old_view, src));
			src++;
		}
	}

	view_copy->creator_trx_id = cr_trx_id;

	view_copy->low_limit_no = old_view->low_limit_no;
	view_copy->low_limit_id = old_view->low_limit_id;

	if (n_ids == 0) {
		view_copy->up_limit_id = old_view->up_limit_id;
	} else {
		/* The last active transaction has the smallest id: */
		view_copy->up_limit_id = read_view_get_nth_trx_id(
			view_copy, n_ids - 1);
	}

	UT_LIST_ADD_LAST(view_list, trx_sys->view_list, view_copy);

	return(view_copy);
}
/****************************************************************//** Creates trx objects for transactions and initializes the trx list of trx_sys at database start. Rollback segment and undo log lists must already exist when this function is called, because the lists of transactions to be rolled back or cleaned up are built based on the undo log lists. */ UNIV_INTERN void trx_lists_init_at_db_start(void) /*============================*/ { trx_rseg_t* rseg; trx_undo_t* undo; trx_t* trx; ut_ad(mutex_own(&kernel_mutex)); UT_LIST_INIT(trx_sys->trx_list); /* Look from the rollback segments if there exist undo logs for transactions */ rseg = UT_LIST_GET_FIRST(trx_sys->rseg_list); while (rseg != NULL) { undo = UT_LIST_GET_FIRST(rseg->insert_undo_list); while (undo != NULL) { trx = trx_create(trx_dummy_sess); trx->is_recovered = TRUE; trx->id = undo->trx_id; trx->xid = undo->xid; trx->insert_undo = undo; trx->rseg = rseg; if (undo->state != TRX_UNDO_ACTIVE) { /* Prepared transactions are left in the prepared state waiting for a commit or abort decision from MySQL */ if (undo->state == TRX_UNDO_PREPARED) { fprintf(stderr, "InnoDB: Transaction " TRX_ID_FMT " was in the" " XA prepared state.\n", TRX_ID_PREP_PRINTF(trx->id)); if (srv_force_recovery == 0) { trx->conc_state = TRX_PREPARED; } else { fprintf(stderr, "InnoDB: Since" " innodb_force_recovery" " > 0, we will" " rollback it" " anyway.\n"); trx->conc_state = TRX_ACTIVE; } } else { trx->conc_state = TRX_COMMITTED_IN_MEMORY; } /* We give a dummy value for the trx no; this should have no relevance since purge is not interested in committed transaction numbers, unless they are in the history list, in which case it looks the number from the disk based undo log structure */ trx->no = trx->id; } else { trx->conc_state = TRX_ACTIVE; /* A running transaction always has the number field inited to ut_dulint_max */ trx->no = ut_dulint_max; } if (undo->dict_operation) { trx_set_dict_operation( trx, TRX_DICT_OP_TABLE); trx->table_id 
= undo->table_id; } if (!undo->empty) { trx->undo_no = ut_dulint_add(undo->top_undo_no, 1); } trx_list_insert_ordered(trx); undo = UT_LIST_GET_NEXT(undo_list, undo); } undo = UT_LIST_GET_FIRST(rseg->update_undo_list); while (undo != NULL) { trx = trx_get_on_id(undo->trx_id); if (NULL == trx) { trx = trx_create(trx_dummy_sess); trx->is_recovered = TRUE; trx->id = undo->trx_id; trx->xid = undo->xid; if (undo->state != TRX_UNDO_ACTIVE) { /* Prepared transactions are left in the prepared state waiting for a commit or abort decision from MySQL */ if (undo->state == TRX_UNDO_PREPARED) { fprintf(stderr, "InnoDB: Transaction " TRX_ID_FMT " was in the" " XA prepared state.\n", TRX_ID_PREP_PRINTF( trx->id)); if (srv_force_recovery == 0) { trx->conc_state = TRX_PREPARED; } else { fprintf(stderr, "InnoDB: Since" " innodb_force_recovery" " > 0, we will" " rollback it" " anyway.\n"); trx->conc_state = TRX_ACTIVE; } } else { trx->conc_state = TRX_COMMITTED_IN_MEMORY; } /* We give a dummy value for the trx number */ trx->no = trx->id; } else { trx->conc_state = TRX_ACTIVE; /* A running transaction always has the number field inited to ut_dulint_max */ trx->no = ut_dulint_max; } trx->rseg = rseg; trx_list_insert_ordered(trx); if (undo->dict_operation) { trx_set_dict_operation( trx, TRX_DICT_OP_TABLE); trx->table_id = undo->table_id; } } trx->update_undo = undo; if ((!undo->empty) && (ut_dulint_cmp(undo->top_undo_no, trx->undo_no) >= 0)) { trx->undo_no = ut_dulint_add(undo->top_undo_no, 1); } undo = UT_LIST_GET_NEXT(undo_list, undo); } rseg = UT_LIST_GET_NEXT(rseg_list, rseg); } }
/*************************************************************************
Opens a read view in which the ids of the transactions on the trx list
that are in the active or prepared state, other than the creating
transaction, are recorded. The new view is put first in the view list of
trx_sys. */

read_view_t*
read_view_open_now(
/*===============*/
					/* out, own: read view struct */
	dulint		cr_trx_id,	/* in: trx_id of creating
					transaction, or (0, 0) used in
					purge */
	mem_heap_t*	heap)		/* in: memory heap from which
					allocated */
{
	read_view_t*	view;
	trx_t*		trx;
	ulint		n_ids	= 0;

	ut_ad(mutex_own(&kernel_mutex));

	view = read_view_create_low(UT_LIST_GET_LEN(trx_sys->trx_list), heap);

	view->creator_trx_id = cr_trx_id;
	view->type = VIEW_NORMAL;
	view->undo_no = ut_dulint_create(0, 0);

	/* No future transactions should be visible in the view */

	view->low_limit_no = trx_sys->max_trx_id;
	view->low_limit_id = view->low_limit_no;

	/* No active transaction should be visible, except cr_trx */

	for (trx = UT_LIST_GET_FIRST(trx_sys->trx_list);
	     trx != NULL;
	     trx = UT_LIST_GET_NEXT(trx_list, trx)) {

		if (ut_dulint_cmp(trx->id, cr_trx_id) == 0) {
			/* The creator sees its own changes */

			continue;
		}

		if (trx->conc_state != TRX_ACTIVE
		    && trx->conc_state != TRX_PREPARED) {

			continue;
		}

		read_view_set_nth_trx_id(view, n_ids, trx->id);

		n_ids++;

		/* NOTE that a transaction whose trx number is <
		trx_sys->max_trx_id can still be active, if it is
		in the middle of its commit! Note that when a
		transaction starts, we initialize trx->no to
		ut_dulint_max. */

		if (ut_dulint_cmp(view->low_limit_no, trx->no) > 0) {

			view->low_limit_no = trx->no;
		}
	}

	view->n_trx_ids = n_ids;

	if (n_ids == 0) {
		view->up_limit_id = view->low_limit_id;
	} else {
		/* The last active transaction has the smallest id: */
		view->up_limit_id = read_view_get_nth_trx_id(view, n_ids - 1);
	}

	UT_LIST_ADD_FIRST(view_list, trx_sys->view_list, view);

	return(view);
}