/*******************************************************************//**
Drops the index tree associated with a row in SYS_INDEXES table.
Returns without modifying anything when the PAGE_NO field of the record
already contains FIL_NULL, or when fil_space_get_zip_size() returns
ULINT_UNDEFINED for the tablespace id stored in the record. */
UNIV_INTERN
void
dict_drop_index_tree(
/*=================*/
	rec_t*	rec,	/*!< in/out: record in the clustered index
			of SYS_INDEXES table */
	mtr_t*	mtr)	/*!< in: mtr having the latch on the record page */
{
	const byte*	field;
	ulint		field_len;
	ulint		root;
	ulint		space_id;
	ulint		zip;

	ut_ad(mutex_own(&(dict_sys->mutex)));
	ut_a(!dict_table_is_comp(dict_sys->sys_indexes));

	/* Fetch the root page number of the tree from the record. */
	field = rec_get_nth_field_old(rec, DICT_SYS_INDEXES_PAGE_NO_FIELD,
				      &field_len);
	ut_ad(field_len == 4);
	root = mtr_read_ulint(field, MLOG_4BYTES, mtr);

	if (root == FIL_NULL) {
		/* Nothing to drop: the tree has already been freed */
		return;
	}

	/* Fetch the id of the tablespace that contains the tree. */
	field = rec_get_nth_field_old(rec, DICT_SYS_INDEXES_SPACE_NO_FIELD,
				      &field_len);
	ut_ad(field_len == 4);
	space_id = mtr_read_ulint(field, MLOG_4BYTES, mtr);

	zip = fil_space_get_zip_size(space_id);

	if (UNIV_UNLIKELY(zip == ULINT_UNDEFINED)) {
		/* It is a single table tablespace and the .ibd file is
		missing: do nothing */
		return;
	}

	/* Step 1: release every page except the root. This step may
	span several mini-transactions. */
	btr_free_but_not_root(space_id, zip, root);

	/* Step 2: release the root page and write FIL_NULL to the PAGE_NO
	field of the SYS_INDEXES record within the caller's
	mini-transaction: this mini-transaction marks the B-tree totally
	freed. */
	btr_free_root(space_id, zip, root, mtr);

	page_rec_write_index_page_no(rec, DICT_SYS_INDEXES_PAGE_NO_FIELD,
				     FIL_NULL, mtr);
}
/*******************************************************************
Creates an index tree for the index if it is not a member of a cluster.
The root page number returned by btr_create() is stored in node->page_no
and written to the PAGE_NO field of the matching SYS_INDEXES record, all
within one mini-transaction. */
static
ulint
dict_create_index_tree_step(
/*========================*/
				/* out: DB_SUCCESS or DB_OUT_OF_FILE_SPACE */
	ind_node_t*	node)	/* in: index create node */
{
	dict_index_t*	new_index	= node->index;
	dict_table_t*	user_table	= node->table;
	dict_table_t*	sys_indexes	= dict_sys->sys_indexes;
	dtuple_t*	key;
	btr_pcur_t	cursor;
	mtr_t		mtr;

	ut_ad(mutex_own(&(dict_sys->mutex)));

	/* Everything below happens in a single mini-transaction: the tree
	is allocated and its root address is recorded in sys_indexes. */

	mtr_start(&mtr);

	/* Position the cursor on the SYS_INDEXES entry of the index. */
	key = dict_create_search_tuple(node->ind_row, node->heap);

	btr_pcur_open(UT_LIST_GET_FIRST(sys_indexes->indexes),
		      key, PAGE_CUR_L, BTR_MODIFY_LEAF,
		      &cursor, &mtr);

	btr_pcur_move_to_next_user_rec(&cursor, &mtr);

	/* Allocate the tree and remember its root page number. */
	node->page_no = btr_create(new_index->type, new_index->space,
				   new_index->id,
				   dict_table_is_comp(user_table), &mtr);

	/* The page number is written to the record even when it is
	FIL_NULL; the failure is reported through the return value. */
	page_rec_write_index_page_no(btr_pcur_get_rec(&cursor),
				     DICT_SYS_INDEXES_PAGE_NO_FIELD,
				     node->page_no, &mtr);

	btr_pcur_close(&cursor);
	mtr_commit(&mtr);

	return(node->page_no == FIL_NULL
	       ? DB_OUT_OF_FILE_SPACE
	       : DB_SUCCESS);
}
/********************************************************//**
Opens a buffer for mlog, writes the initial log record and,
if needed (the record is in the compact format), the field lengths
of an index.
@return	buffer, NULL if log mode MTR_LOG_NONE */
UNIV_INTERN
byte*
mlog_open_and_write_index(
/*======================*/
	mtr_t*		mtr,	/*!< in: mtr */
	const byte*	rec,	/*!< in: index record or page */
	dict_index_t*	index,	/*!< in: record descriptor */
	byte		type,	/*!< in: log item type */
	ulint		size)	/*!< in: requested buffer size in bytes
				(if 0, calls mlog_close() and returns NULL) */
{
	byte*		cursor;		/* next byte to be written */
	const byte*	chunk_start;	/* start of the open log chunk */
	const byte*	chunk_end;	/* limit used for overflow checks */

	ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table));

	if (page_rec_is_comp(rec)) {
		ulint	field_no;
		ulint	n_fields = dict_index_get_n_fields(index);
		/* Bytes still to be reserved: 11 for the initial log
		record, the caller's payload, and 2 bytes each for the
		field count, the unique-in-tree count and every field. */
		ulint	remaining = 11 + size + (n_fields + 2) * 2;
		/* A single mlog_open() request is capped at
		DYN_ARRAY_DATA_SIZE bytes. */
		ulint	chunk = remaining > DYN_ARRAY_DATA_SIZE
			? DYN_ARRAY_DATA_SIZE
			: remaining;

		chunk_start = cursor = mlog_open(mtr, chunk);

		if (!cursor) {
			return(NULL); /* logging is disabled */
		}

		chunk_end = cursor + chunk;

		cursor = mlog_write_initial_log_record_fast(rec, type,
							    cursor, mtr);

		mach_write_to_2(cursor, n_fields);
		cursor += 2;

		mach_write_to_2(cursor,
				dict_index_get_n_unique_in_tree(index));
		cursor += 2;

		for (field_no = 0; field_no < n_fields; field_no++) {
			dict_field_t*		field;
			const dict_col_t*	col;
			ulint			len;

			field = dict_index_get_nth_field(index, field_no);
			col = dict_field_get_col(field);
			len = field->fixed_len;
			ut_ad(len < 0x7fff);

			if (len == 0
			    && (col->len > 255 || col->mtype == DATA_BLOB)) {
				/* variable-length field
				with maximum length > 255 */
				len = 0x7fff;
			}

			/* The top bit flags a NOT NULL column. */
			if (col->prtype & DATA_NOT_NULL) {
				len |= 0x8000;
			}

			if (cursor + 2 > chunk_end) {
				/* The current chunk is full: close it and
				open another one for what is left. */
				mlog_close(mtr, cursor);

				ut_a(remaining
				     > (ulint) (cursor - chunk_start));
				remaining -= cursor - chunk_start;

				chunk = remaining > DYN_ARRAY_DATA_SIZE
					? DYN_ARRAY_DATA_SIZE
					: remaining;

				chunk_start = cursor = mlog_open(mtr, chunk);

				if (!cursor) {
					return(NULL); /* logging is disabled */
				}

				chunk_end = cursor + chunk;
			}

			mach_write_to_2(cursor, len);
			cursor += 2;
		}
	} else {
		/* Old-style record: only the initial log record
		precedes the caller's payload. */
		chunk_start = cursor = mlog_open(mtr, 11 + size);

		if (!cursor) {
			return(NULL); /* logging is disabled */
		}

		cursor = mlog_write_initial_log_record_fast(rec, type,
							    cursor, mtr);
		chunk_end = cursor + 11 + size;
	}

	if (size == 0) {
		/* The caller has nothing more to write. */
		mlog_close(mtr, cursor);
		return(NULL);
	}

	if (cursor + size > chunk_end) {
		/* The payload does not fit in the open chunk:
		hand out a fresh one of the requested size. */
		mlog_close(mtr, cursor);
		cursor = mlog_open(mtr, size);
	}

	return(cursor);
}
/*******************************************************************
Truncates the index tree associated with a row in SYS_INDEXES table:
frees all pages of the old tree and creates a new, empty tree with the
same type, space and index id. */
ulint
dict_truncate_index_tree(
/*=====================*/
				/* out: new root page number, or
				FIL_NULL on failure */
	dict_table_t*	table,	/* in: the table the index belongs to */
	btr_pcur_t*	pcur,	/* in/out: persistent cursor pointing to
				record in the clustered index of
				SYS_INDEXES table. The cursor may be
				repositioned in this call. */
	mtr_t*		mtr)	/* in: mtr having the latch
				on the record page. The mtr may be
				committed and restarted in this call. */
{
	rec_t*		sys_rec;
	byte*		field;
	ulint		field_len;
	ulint		root;
	ulint		space_id;
	ulint		index_type;
	dulint		index_id;
	ulint		is_comp;
	dict_index_t*	cached_index;

	ut_ad(mutex_own(&(dict_sys->mutex)));
	ut_a(!dict_table_is_comp(dict_sys->sys_indexes));

	sys_rec = btr_pcur_get_rec(pcur);

	/* Current root page number of the tree */
	field = rec_get_nth_field_old(sys_rec,
				      DICT_SYS_INDEXES_PAGE_NO_FIELD,
				      &field_len);
	ut_ad(field_len == 4);
	root = mtr_read_ulint(field, MLOG_4BYTES, mtr);

	if (root == FIL_NULL) {
		/* The tree has been freed. */

		ut_print_timestamp(stderr);
		fprintf(stderr, " InnoDB: Trying to TRUNCATE"
			" a missing index of table %s!\n", table->name);
		return(FIL_NULL);
	}

	/* Tablespace holding the tree */
	field = rec_get_nth_field_old(sys_rec,
				      DICT_SYS_INDEXES_SPACE_NO_FIELD,
				      &field_len);
	ut_ad(field_len == 4);
	space_id = mtr_read_ulint(field, MLOG_4BYTES, mtr);

	if (!fil_tablespace_exists_in_mem(space_id)) {
		/* It is a single table tablespace and the .ibd file is
		missing: do nothing */

		ut_print_timestamp(stderr);
		fprintf(stderr, " InnoDB: Trying to TRUNCATE"
			" a missing .ibd file of table %s!\n", table->name);
		return(FIL_NULL);
	}

	/* Index type and index id are needed to re-create the tree. */
	field = rec_get_nth_field_old(sys_rec,
				      DICT_SYS_INDEXES_TYPE_FIELD,
				      &field_len);
	ut_ad(field_len == 4);
	index_type = mach_read_from_4(field);

	field = rec_get_nth_field_old(sys_rec, 1, &field_len);
	ut_ad(field_len == 8);
	index_id = mach_read_from_8(field);

	/* Free everything except the root page first; this operation
	may span several mini-transactions. */
	btr_free_but_not_root(space_id, root);

	/* Remember the row format of the old root before freeing it,
	so that the new tree is created in the same format. */
	is_comp = page_is_comp(btr_page_get(space_id, root,
					    RW_X_LATCH, mtr));

	btr_free_root(space_id, root, mtr);

	/* We will temporarily write FIL_NULL to the PAGE_NO field
	in SYS_INDEXES, so that the database will not get into an
	inconsistent state in case it crashes between the mtr_commit()
	below and the following mtr_commit() call. */
	page_rec_write_index_page_no(sys_rec,
				     DICT_SYS_INDEXES_PAGE_NO_FIELD,
				     FIL_NULL, mtr);

	/* We will need to commit the mini-transaction in order to avoid
	deadlocks in the btr_create() call, because otherwise we would
	be freeing and allocating pages in the same mini-transaction. */
	btr_pcur_store_position(pcur, mtr);
	mtr_commit(mtr);

	mtr_start(mtr);
	btr_pcur_restore_position(BTR_MODIFY_LEAF, pcur, mtr);

	/* Find the index corresponding to this SYS_INDEXES record. */
	cached_index = UT_LIST_GET_FIRST(table->indexes);

	while (cached_index
	       && ut_dulint_cmp(cached_index->id, index_id)) {
		cached_index = UT_LIST_GET_NEXT(indexes, cached_index);
	}

	root = btr_create(index_type, space_id, index_id, is_comp, mtr);

	if (cached_index) {
		cached_index->page = (unsigned int) root;
	} else {
		ut_print_timestamp(stderr);
		fprintf(stderr,
			" InnoDB: Index %lu %lu of table %s is missing\n"
			"InnoDB: from the data dictionary during TRUNCATE!\n",
			ut_dulint_get_high(index_id),
			ut_dulint_get_low(index_id),
			table->name);
	}

	return(root);
}
/**********************************************************************//**
Reports in the undo log of an update or delete marking of a clustered index
record. The record is appended at the first free position of undo_page.
The page header field TRX_UNDO_PAGE_FREE and the previous/next record
pointers are written only at the very end, after all space checks have
passed; when 0 is returned because the page ran out of space, bytes may
already have been written past the old free position but the page header
has not been changed. Note that trx->update_undo->del_marks may have
been set to TRUE even when 0 is returned.
@return byte offset of the inserted undo log entry on the page if
succeed, 0 if fail */
static
ulint
trx_undo_page_report_modify(
/*========================*/
	page_t*		undo_page,	/*!< in: undo log page */
	trx_t*		trx,		/*!< in: transaction */
	dict_index_t*	index,		/*!< in: clustered index where update or
					delete marking is done */
	const rec_t*	rec,		/*!< in: clustered index record which
					has NOT yet been modified */
	const ulint*	offsets,	/*!< in: rec_get_offsets(rec, index) */
	const upd_t*	update,		/*!< in: update vector which tells the
					columns to be updated; in the case of
					a delete, this should be set to NULL */
	ulint		cmpl_info,	/*!< in: compiler info on secondary
					index updates */
	mtr_t*		mtr)		/*!< in: mtr */
{
	dict_table_t*	table;
	ulint		first_free;
	byte*		ptr;
	const byte*	field;
	ulint		flen;
	ulint		col_no;
	ulint		type_cmpl;
	byte*		type_cmpl_ptr;
	ulint		i;
	trx_id_t	trx_id;
	ibool		ignore_prefix = FALSE;
	/* Scratch buffer handed to trx_undo_page_report_modify_ext()
	when a prefix of an externally stored column is wanted */
	byte		ext_buf[REC_MAX_INDEX_COL_LEN
				+ BTR_EXTERN_FIELD_REF_SIZE];

	ut_a(dict_index_is_clust(index));
	ut_ad(rec_offs_validate(rec, index, offsets));
	ut_ad(mach_read_from_2(undo_page + TRX_UNDO_PAGE_HDR
			       + TRX_UNDO_PAGE_TYPE) == TRX_UNDO_UPDATE);
	table = index->table;

	/* The new record starts at the first free byte of the page. */
	first_free = mach_read_from_2(undo_page + TRX_UNDO_PAGE_HDR
				      + TRX_UNDO_PAGE_FREE);
	ptr = undo_page + first_free;

	ut_ad(first_free <= UNIV_PAGE_SIZE);

	if (trx_undo_left(undo_page, ptr) < 50) {

		/* NOTE: the value 50 must be big enough so that the general
		fields written below fit on the undo log page */

		return(0);
	}

	/* Reserve 2 bytes for the pointer to the next undo log record */
	ptr += 2;

	/* Store first some general parameters to the undo log */

	if (!update) {
		type_cmpl = TRX_UNDO_DEL_MARK_REC;
	} else if (rec_get_deleted_flag(rec, dict_table_is_comp(table))) {
		type_cmpl = TRX_UNDO_UPD_DEL_REC;
		/* We are about to update a delete marked record.
		We don't typically need the prefix in this case unless
		the delete marking is done by the same transaction
		(which we check below). */
		ignore_prefix = TRUE;
	} else {
		type_cmpl = TRX_UNDO_UPD_EXIST_REC;
	}

	/* Combine the record type and cmpl_info into one byte. The address
	of that byte is remembered because TRX_UNDO_UPD_EXTERN may have to
	be or'ed into it later. */
	type_cmpl |= cmpl_info * TRX_UNDO_CMPL_INFO_MULT;
	type_cmpl_ptr = ptr;

	*ptr++ = (byte) type_cmpl;
	ptr += mach_dulint_write_much_compressed(ptr, trx->undo_no);

	ptr += mach_dulint_write_much_compressed(ptr, table->id);

	/*----------------------------------------*/
	/* Store the state of the info bits */

	*ptr++ = (byte) rec_get_info_bits(rec, dict_table_is_comp(table));

	/* Store the values of the system columns */
	field = rec_get_nth_field(rec, offsets,
				  dict_index_get_sys_col_pos(
					  index, DATA_TRX_ID), &flen);
	ut_ad(flen == DATA_TRX_ID_LEN);

	trx_id = trx_read_trx_id(field);

	/* If it is an update of a delete marked record, then we are
	allowed to ignore blob prefixes if the delete marking was done
	by some other trx as it must have committed by now for us to
	allow an over-write. */
	if (ignore_prefix) {
		ignore_prefix = ut_dulint_cmp(trx_id, trx->id) != 0;
	}
	ptr += mach_dulint_write_compressed(ptr, trx_id);

	field = rec_get_nth_field(rec, offsets,
				  dict_index_get_sys_col_pos(
					  index, DATA_ROLL_PTR), &flen);
	ut_ad(flen == DATA_ROLL_PTR_LEN);

	ptr += mach_dulint_write_compressed(ptr, trx_read_roll_ptr(field));

	/*----------------------------------------*/
	/* Store then the fields required to uniquely determine the
	record which will be modified in the clustered index */

	for (i = 0; i < dict_index_get_n_unique(index); i++) {

		field = rec_get_nth_field(rec, offsets, i, &flen);

		/* The ordering columns must not be stored externally. */
		ut_ad(!rec_offs_nth_extern(offsets, i));
		ut_ad(dict_index_get_nth_col(index, i)->ord_part);

		/* Room for the compressed length */
		if (trx_undo_left(undo_page, ptr) < 5) {

			return(0);
		}

		ptr += mach_write_compressed(ptr, flen);

		/* An SQL NULL is represented by its length marker alone;
		no data bytes follow. */
		if (flen != UNIV_SQL_NULL) {
			if (trx_undo_left(undo_page, ptr) < flen) {

				return(0);
			}

			ut_memcpy(ptr, field, flen);
			ptr += flen;
		}
	}

	/*----------------------------------------*/
	/* Save to the undo log the old values of the columns to be updated. */

	if (update) {
		if (trx_undo_left(undo_page, ptr) < 5) {

			return(0);
		}

		/* Number of updated fields, then for each field its
		position, its length and its old value */
		ptr += mach_write_compressed(ptr, upd_get_n_fields(update));

		for (i = 0; i < upd_get_n_fields(update); i++) {

			ulint	pos = upd_get_nth_field(update, i)->field_no;

			/* Write field number to undo log */
			if (trx_undo_left(undo_page, ptr) < 5) {

				return(0);
			}

			ptr += mach_write_compressed(ptr, pos);

			/* Save the old value of field */
			field = rec_get_nth_field(rec, offsets, pos, &flen);

			if (trx_undo_left(undo_page, ptr) < 15) {

				return(0);
			}

			if (rec_offs_nth_extern(offsets, pos)) {
				/* Externally stored column: the helper
				writes the length information and may
				adjust field and flen. ext_buf is passed
				only for an ordering column whose prefix
				is not being ignored and whose local
				length is below REC_MAX_INDEX_COL_LEN. */
				ptr = trx_undo_page_report_modify_ext(
					ptr,
					dict_index_get_nth_col(index, pos)
					->ord_part
					&& !ignore_prefix
					&& flen < REC_MAX_INDEX_COL_LEN
					? ext_buf : NULL,
					dict_table_zip_size(table),
					&field, &flen);

				/* Notify purge that it eventually has to
				free the old externally stored field */

				trx->update_undo->del_marks = TRUE;

				*type_cmpl_ptr |= TRX_UNDO_UPD_EXTERN;
			} else {
				ptr += mach_write_compressed(ptr, flen);
			}

			if (flen != UNIV_SQL_NULL) {
				if (trx_undo_left(undo_page, ptr) < flen) {

					return(0);
				}

				ut_memcpy(ptr, field, flen);
				ptr += flen;
			}
		}
	}

	/*----------------------------------------*/
	/* In the case of a delete marking, and also in the case of an update
	where any ordering field of any index changes, store the values of all
	columns which occur as ordering fields in any index. This info is used
	in the purge of old versions where we use it to build and search the
	delete marked index records, to look if we can remove them from the
	index tree. Note that starting from 4.0.14 also externally stored
	fields can be ordering in some index. Starting from 5.2, we no longer
	store REC_MAX_INDEX_COL_LEN first bytes to the undo log record,
	but we can construct the column prefix fields in the index by
	fetching the first page of the BLOB that is pointed to by the
	clustered index. This works also in crash recovery, because all pages
	(including BLOBs) are recovered before anything is rolled back. */

	if (!update || !(cmpl_info & UPD_NODE_NO_ORD_CHANGE)) {
		/* Start of this section; its total length is written
		here once all the fields have been stored. */
		byte*	old_ptr = ptr;

		trx->update_undo->del_marks = TRUE;

		if (trx_undo_left(undo_page, ptr) < 5) {

			return(0);
		}

		/* Reserve 2 bytes to write the number of bytes the stored
		fields take in this undo record */

		ptr += 2;

		for (col_no = 0; col_no < dict_table_get_n_cols(table);
		     col_no++) {

			const dict_col_t*	col
				= dict_table_get_nth_col(table, col_no);

			if (col->ord_part) {
				ulint	pos;

				/* Write field number to undo log */
				if (trx_undo_left(undo_page, ptr) < 5 + 15) {

					return(0);
				}

				/* Translate the table column number to
				the field position in the clustered
				index. */
				pos = dict_index_get_nth_col_pos(index,
								 col_no);
				ptr += mach_write_compressed(ptr, pos);

				/* Save the old value of field */
				field = rec_get_nth_field(rec, offsets, pos,
							  &flen);

				if (rec_offs_nth_extern(offsets, pos)) {
					ptr = trx_undo_page_report_modify_ext(
						ptr,
						flen < REC_MAX_INDEX_COL_LEN
						&& !ignore_prefix
						? ext_buf : NULL,
						dict_table_zip_size(table),
						&field, &flen);
				} else {
					ptr += mach_write_compressed(
						ptr, flen);
				}

				if (flen != UNIV_SQL_NULL) {
					if (trx_undo_left(undo_page, ptr)
					    < flen) {

						return(0);
					}

					ut_memcpy(ptr, field, flen);
					ptr += flen;
				}
			}
		}

		/* Fill in the 2 bytes reserved above; the stored length
		includes those 2 bytes. */
		mach_write_to_2(old_ptr, ptr - old_ptr);
	}

	/*----------------------------------------*/
	/* Write pointers to the previous and the next undo log records */
	if (trx_undo_left(undo_page, ptr) < 2) {

		return(0);
	}

	/* Trailing 2 bytes: offset of the start of this record */
	mach_write_to_2(ptr, first_free);
	ptr += 2;
	/* Leading 2 bytes (reserved at the top): offset of the end of this
	record, which is where the next record will start */
	mach_write_to_2(undo_page + first_free, ptr - undo_page);

	/* Publish the record by advancing the free pointer of the page. */
	mach_write_to_2(undo_page + TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE,
			ptr - undo_page);

	/* Write to the REDO log about this change in the UNDO log */

	trx_undof_page_add_undo_rec_log(undo_page, first_free,
					ptr - undo_page, mtr);
	return(first_free);
}
/**************************************************************//**
Moves parts of long fields in entry to the big record vector so that
the size of tuple drops below the maximum record size allowed in the
database. Moves data only from those fields which are not necessary
to determine uniquely the insertion place of the tuple in the index.
On each round the variable-length field that yields the biggest savings
is moved; entry and *n_ext are updated in place.
@return own: created big record vector, NULL if we are not able to
shorten the entry enough, i.e., if there are too many fixed-length or
short fields in entry or the index is not clustered */
UNIV_INTERN
big_rec_t*
dtuple_convert_big_rec(
/*===================*/
	dict_index_t*	index,	/*!< in: index */
	dtuple_t*	entry,	/*!< in/out: index entry */
	ulint*		n_ext)	/*!< in/out: number of
				externally stored columns */
{
	mem_heap_t*	heap;
	big_rec_t*	vector;
	dfield_t*	dfield;
	dict_field_t*	ifield;
	ulint		size;
	ulint		n_moved = 0;
	ulint		local_len;
	ulint		prefix_len;

	/* Only clustered index records may store columns externally. */
	if (UNIV_UNLIKELY(!dict_index_is_clust(index))) {
		return(NULL);
	}

	/* Every externally stored column keeps a BLOB pointer locally.
	Tables in a format older than DICT_TF_FORMAT_ZIP (up to MySQL 5.1)
	additionally keep a 768-byte prefix; new-format tables do not
	store any BLOB prefix locally. */
	local_len = BTR_EXTERN_FIELD_REF_SIZE;

	if (dict_table_get_format(index->table) < DICT_TF_FORMAT_ZIP) {
		local_len += DICT_MAX_INDEX_COL_LEN;
	}

	/* Number of data bytes that remain in the record itself */
	prefix_len = local_len - BTR_EXTERN_FIELD_REF_SIZE;

	ut_a(dtuple_check_typed_no_assert(entry));

	size = rec_get_converted_size(index, entry, *n_ext);

	if (UNIV_UNLIKELY(size > 1000000000)) {
		fprintf(stderr,
			"InnoDB: Warning: tuple size very big: %lu\n",
			(ulong) size);
		fputs("InnoDB: Tuple contents: ", stderr);
		dtuple_print(stderr, entry);
		putc('\n', stderr);
	}

	/* The vector, its field array and the local parts of the moved
	columns are all allocated from this heap. */
	heap = mem_heap_create(size + dtuple_get_n_fields(entry)
			       * sizeof(big_rec_field_t) + 1000);

	vector = mem_heap_alloc(heap, sizeof(big_rec_t));

	vector->heap = heap;
	vector->fields = mem_heap_alloc(heap, dtuple_get_n_fields(entry)
					* sizeof(big_rec_field_t));

	/* Keep moving the most profitable field until the record no
	longer needs external storage. */

	while (page_zip_rec_needs_ext(rec_get_converted_size(index, entry,
							     *n_ext),
				      dict_table_is_comp(index->table),
				      dict_index_get_n_fields(index),
				      dict_table_zip_size(index->table))) {
		ulint			i;
		ulint			best_savings	= 0;
		ulint			best_i		= ULINT_MAX;
		byte*			local_part;
		big_rec_field_t*	moved;

		for (i = dict_index_get_n_unique_in_tree(index);
		     i < dtuple_get_n_fields(entry); i++) {
			ulint	savings;

			dfield = dtuple_get_nth_field(entry, i);
			ifield = dict_index_get_nth_field(index, i);

			/* Skip fixed-length, NULL, externally stored,
			or short columns */

			if (ifield->fixed_len
			    || dfield_is_null(dfield)
			    || dfield_is_ext(dfield)
			    || dfield_get_len(dfield) <= local_len
			    || dfield_get_len(dfield)
			    <= BTR_EXTERN_FIELD_REF_SIZE * 2) {
				continue;
			}

			savings = dfield_get_len(dfield) - local_len;

			/* Remember the field only if it beats the best
			candidate found so far. */
			if (savings > best_savings) {
				best_i = i;
				best_savings = savings;
			}
		}

		if (!best_savings) {
			/* Cannot shorten more */

			mem_heap_free(heap);

			return(NULL);
		}

		/* Move data from field best_i to big rec vector.

		We store the first bytes locally to the record. Then
		we can calculate all ordering fields in all indexes
		from locally stored data. */

		dfield = dtuple_get_nth_field(entry, best_i);
		ifield = dict_index_get_nth_field(index, best_i);

		moved = &vector->fields[n_moved];
		moved->field_no = best_i;
		moved->len = dfield_get_len(dfield) - prefix_len;
		moved->data = (char*) dfield_get_data(dfield) + prefix_len;

		/* Allocate the locally stored part of the column. */
		local_part = mem_heap_alloc(heap, local_len);

		/* Copy the local prefix. */
		memcpy(local_part, dfield_get_data(dfield), prefix_len);

		/* Clear the extern field reference (BLOB pointer).
		The BLOB pointers in the record will be initialized
		after the record and the BLOBs have been written. */
		memset(local_part + prefix_len, 0,
		       BTR_EXTERN_FIELD_REF_SIZE);

		dfield_set_data(dfield, local_part, local_len);
		dfield_set_ext(dfield);

		n_moved++;
		(*n_ext)++;
		ut_ad(n_moved < dtuple_get_n_fields(entry));
	}

	vector->n_fields = n_moved;
	return(vector);
}
/*******************************************************************//**
An inverse function to row_build_index_entry. Builds a row from a
record in a clustered index. The row has one field for every column of
index->table; fields are filled in from those index fields that are not
column prefixes.
@return own: row built; see the NOTE on the rec parameter! */
UNIV_INTERN
dtuple_t*
row_build(
/*======*/
	ulint			type,	/*!< in: ROW_COPY_POINTERS or
					ROW_COPY_DATA; the latter
					copies also the data fields to
					heap while the first only
					places pointers to data fields
					on the index page, and thus is
					more efficient */
	const dict_index_t*	index,	/*!< in: clustered index */
	const rec_t*		rec,	/*!< in: record in the clustered
					index; NOTE: in the case
					ROW_COPY_POINTERS the data
					fields in the row will point
					directly into this record,
					therefore, the buffer page of
					this record must be at least
					s-latched and the latch held
					as long as the row dtuple is used! */
	const ulint*		offsets,/*!< in: rec_get_offsets(rec,index)
					or NULL, in which case this function
					will invoke rec_get_offsets() */
	const dict_table_t*	col_table,
					/*!< in: table, to check which
					externally stored columns
					occur in the ordering columns
					of an index, or NULL if
					index->table should be
					consulted instead */
	row_ext_t**		ext,	/*!< out, own: cache of
					externally stored column
					prefixes, or NULL */
	mem_heap_t*		heap)	/*!< in: memory heap from which
					the memory needed is allocated */
{
	const dict_table_t*	table;
	dtuple_t*		row;
	ulint			n_rec_fields;
	ulint			n_rec_ext;
	ulint*			ord_ext_cols = NULL; /* remove warning */
	ulint			n_ord_ext;
	ulint			field_no;
	mem_heap_t*		offsets_heap = NULL;
	ulint			offsets_[REC_OFFS_NORMAL_SIZE];
	rec_offs_init(offsets_);

	ut_ad(index && rec && heap);
	ut_ad(dict_index_is_clust(index));
	ut_ad(!mutex_own(&kernel_mutex));

	if (offsets) {
		ut_ad(rec_offs_validate(rec, index, offsets));
	} else {
		/* The caller did not supply offsets: compute them. */
		offsets = rec_get_offsets(rec, index, offsets_,
					  ULINT_UNDEFINED, &offsets_heap);
	}

#if defined UNIV_DEBUG || defined UNIV_BLOB_LIGHT_DEBUG
	if (rec_offs_any_null_extern(rec, offsets)) {
		/* This condition can occur during crash recovery
		before trx_rollback_active() has completed execution,
		or when a concurrently executing
		row_ins_index_entry_low() has committed the B-tree
		mini-transaction but has not yet managed to restore
		the cursor position for writing the big_rec. */
		ut_a(trx_undo_roll_ptr_is_insert(
			     row_get_rec_roll_ptr(rec, index, offsets)));
	}
#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */

	if (type != ROW_COPY_POINTERS) {
		/* Take a copy of rec to heap, and use the copy
		from here on. */
		byte*	copy = mem_heap_alloc(heap, rec_offs_size(offsets));

		rec = rec_copy(copy, rec, offsets);
		/* Avoid a debug assertion in rec_offs_validate(). */
		rec_offs_make_valid(rec, index, (ulint*) offsets);
	}

	table = index->table;

	/* One row field per table column, typed after the table */
	row = dtuple_create(heap, dict_table_get_n_cols(table));

	dict_table_copy_types(row, table);

	dtuple_set_info_bits(row, rec_get_info_bits(
				     rec, dict_table_is_comp(table)));

	n_rec_fields = rec_offs_n_fields(offsets);
	n_rec_ext = rec_offs_n_extern(offsets);

	if (n_rec_ext) {
		ord_ext_cols = mem_heap_alloc(heap,
					      n_rec_ext
					      * sizeof *ord_ext_cols);
	}

	n_ord_ext = 0;

	for (field_no = 0; field_no < n_rec_fields; field_no++) {
		dict_field_t*		ind_field
			= dict_index_get_nth_field(index, field_no);
		const dict_col_t*	col
			= dict_field_get_col(ind_field);
		ulint			col_no
			= dict_col_get_no(col);
		dfield_t*		dfield
			= dtuple_get_nth_field(row, col_no);

		/* Index fields that are column prefixes are not copied
		to the row. */
		if (ind_field->prefix_len == 0) {
			ulint		field_len;
			const byte*	field_data = rec_get_nth_field(
				rec, offsets, field_no, &field_len);

			dfield_set_data(dfield, field_data, field_len);
		}

		if (!rec_offs_nth_extern(offsets, field_no)) {
			continue;
		}

		dfield_set_ext(dfield);

		if (UNIV_LIKELY_NULL(col_table)) {
			/* Consult col_table instead of index->table
			for the ord_part flag. */
			ut_a(col_no
			     < dict_table_get_n_cols(col_table));
			col = dict_table_get_nth_col(col_table, col_no);
		}

		if (col->ord_part) {
			/* We will have to fetch prefixes of
			externally stored columns that are
			referenced by column prefixes. */
			ord_ext_cols[n_ord_ext++] = col_no;
		}
	}

	ut_ad(dtuple_check_typed(row));

	if (ext) {
		/* Build the prefix cache only if some externally stored
		column is part of an ordering. */
		*ext = n_ord_ext
			? row_ext_create(n_ord_ext, ord_ext_cols, row,
					 dict_table_zip_size(index->table),
					 heap)
			: NULL;
	} else {
		/* REDUNDANT and COMPACT formats store a local
		768-byte prefix of each externally stored
		column. No cache is needed. */
		ut_ad(dict_table_get_format(index->table)
		      < DICT_TF_FORMAT_ZIP);
	}

	if (offsets_heap) {
		mem_heap_free(offsets_heap);
	}

	return(row);
}
/*****************************************************************//**
Finds out if an active transaction has inserted or modified a secondary
index record. NOTE: the kernel mutex is temporarily released in this
function! The caller must own the kernel mutex on entry, and the mutex
is owned again on every return path.
@return NULL if committed, else the active transaction */
UNIV_INTERN
trx_t*
row_vers_impl_x_locked_off_kernel(
/*==============================*/
	const rec_t*	rec,	/*!< in: record in a secondary index */
	dict_index_t*	index,	/*!< in: the secondary index */
	const ulint*	offsets)/*!< in: rec_get_offsets(rec, index) */
{
	dict_index_t*	clust_index;
	rec_t*		clust_rec;
	ulint*		clust_offsets;
	rec_t*		version;
	trx_id_t	trx_id;
	mem_heap_t*	heap;
	mem_heap_t*	heap2;
	dtuple_t*	row;
	dtuple_t*	entry	= NULL; /* assignment to eliminate compiler
					warning */
	trx_t*		trx;
	ulint		rec_del;
#ifdef UNIV_DEBUG
	ulint		err;
#endif /* UNIV_DEBUG */
	mtr_t		mtr;
	ulint		comp;

	ut_ad(mutex_own(&kernel_mutex));
#ifdef UNIV_SYNC_DEBUG
	ut_ad(!rw_lock_own(&(purge_sys->latch), RW_LOCK_SHARED));
#endif /* UNIV_SYNC_DEBUG */

	mutex_exit(&kernel_mutex);

	mtr_start(&mtr);

	/* Search for the clustered index record: this is a time-consuming
	operation: therefore we release the kernel mutex; also, the release
	is required by the latching order convention. The latch on the
	clustered index locks the top of the stack of versions. We also
	reserve purge_latch to lock the bottom of the version stack. */

	clust_rec = row_get_clust_rec(BTR_SEARCH_LEAF, rec, index,
				      &clust_index, &mtr);
	if (!clust_rec) {
		/* In a rare case it is possible that no clust rec is found
		for a secondary index record: if in row0umod.c
		row_undo_mod_remove_clust_low() we have already removed the
		clust rec, while purge is still cleaning and removing
		secondary index records associated with earlier versions of
		the clustered index record. In that case there cannot be
		any implicit lock on the secondary index record, because
		an active transaction which has modified the secondary index
		record has also modified the clustered index record. And in
		a rollback we always undo the modifications to secondary index
		records before the clustered index record. */

		/* Re-acquire the kernel mutex before returning; no heap
		has been created yet on this path. */
		mutex_enter(&kernel_mutex);
		mtr_commit(&mtr);

		return(NULL);
	}

	heap = mem_heap_create(1024);
	clust_offsets = rec_get_offsets(clust_rec, clust_index, NULL,
					ULINT_UNDEFINED, &heap);
	trx_id = row_get_rec_trx_id(clust_rec, clust_index, clust_offsets);

	/* The purge latch is taken before the kernel mutex is
	re-acquired. */
	mtr_s_lock(&(purge_sys->latch), &mtr);

	mutex_enter(&kernel_mutex);

	trx = NULL;
	if (!trx_is_active(trx_id)) {
		/* The transaction that modified or inserted clust_rec is no
		longer active: no implicit lock on rec */
		goto exit_func;
	}

	if (!lock_check_trx_id_sanity(trx_id, clust_rec, clust_index,
				      clust_offsets, TRUE)) {
		/* Corruption noticed: try to avoid a crash by returning */
		goto exit_func;
	}

	comp = page_rec_is_comp(rec);
	ut_ad(index->table == clust_index->table);
	ut_ad(!!comp == dict_table_is_comp(index->table));
	ut_ad(!comp == !page_rec_is_comp(clust_rec));

	/* We look up if some earlier version, which was modified by the trx_id
	transaction, of the clustered index record would require rec to be in
	a different state (delete marked or unmarked, or have different field
	values, or not existing). If there is such a version, then rec was
	modified by the trx_id transaction, and it has an implicit x-lock on
	rec. Note that if clust_rec itself would require rec to be in a
	different state, then the trx_id transaction has not yet had time to
	modify rec, and does not necessarily have an implicit x-lock on rec. */

	rec_del = rec_get_deleted_flag(rec, comp);
	/* (trx is already NULL at this point; the assignment is repeated) */
	trx = NULL;

	version = clust_rec;

	/* Each iteration starts and ends (via break) with the kernel
	mutex owned. */
	for (;;) {
		rec_t*		prev_version;
		ulint		vers_del;
		row_ext_t*	ext;
		trx_id_t	prev_trx_id;

		mutex_exit(&kernel_mutex);

		/* While we retrieve an earlier version of clust_rec, we
		release the kernel mutex, because it may take time to access
		the disk. After the release, we have to check if the trx_id
		transaction is still active. We keep the semaphore in mtr on
		the clust_rec page, so that no other transaction can update
		it and get an implicit x-lock on rec. */

		/* The previous version is built into a fresh heap; the
		old heap, which holds the current version and
		clust_offsets, is freed only after they have been used
		by the build call. */
		heap2 = heap;
		heap = mem_heap_create(1024);
#ifdef UNIV_DEBUG
		err =
#endif /* UNIV_DEBUG */
		trx_undo_prev_version_build(clust_rec, &mtr, version,
					    clust_index, clust_offsets,
					    heap, &prev_version);
		mem_heap_free(heap2); /* free version and clust_offsets */

		if (prev_version == NULL) {
			mutex_enter(&kernel_mutex);

			if (!trx_is_active(trx_id)) {
				/* Transaction no longer active: no
				implicit x-lock */

				break;
			}

			/* If the transaction is still active,
			clust_rec must be a fresh insert, because no
			previous version was found. */
			ut_ad(err == DB_SUCCESS);

			/* It was a freshly inserted version: there is an
			implicit x-lock on rec */

			trx = trx_get_on_id(trx_id);

			break;
		}

		clust_offsets = rec_get_offsets(prev_version, clust_index,
						NULL, ULINT_UNDEFINED, &heap);

		vers_del = rec_get_deleted_flag(prev_version, comp);
		prev_trx_id = row_get_rec_trx_id(prev_version, clust_index,
						 clust_offsets);

		/* The stack of versions is locked by mtr.  Thus, it
		is safe to fetch the prefixes for externally stored
		columns. */
		row = row_build(ROW_COPY_POINTERS, clust_index, prev_version,
				clust_offsets, NULL, &ext, heap);
		entry = row_build_index_entry(row, ext, index, heap);
		/* entry may be NULL if a record was inserted in place
		of a deleted record, and the BLOB pointers of the new
		record were not initialized yet.  But in that case,
		prev_version should be NULL. */
		ut_a(entry);

		mutex_enter(&kernel_mutex);

		if (!trx_is_active(trx_id)) {
			/* Transaction no longer active: no implicit x-lock */

			break;
		}

		/* If we get here, we know that the trx_id transaction is
		still active and it has modified prev_version. Let us check
		if prev_version would require rec to be in a different
		state. */

		/* The previous version of clust_rec must be
		accessible, because the transaction is still active
		and clust_rec was not a fresh insert. */
		ut_ad(err == DB_SUCCESS);

		/* We check if entry and rec are identified in the alphabetical
		ordering */
		if (0 == cmp_dtuple_rec(entry, rec, offsets)) {
			/* The delete marks of rec and prev_version should be
			equal for rec to be in the state required by
			prev_version */

			if (rec_del != vers_del) {
				trx = trx_get_on_id(trx_id);

				break;
			}

			/* It is possible that the row was updated so that the
			secondary index record remained the same in
			alphabetical ordering, but the field values changed
			still. For example, 'abc' -> 'ABC'. Check also that. */

			dtuple_set_types_binary(entry,
						dtuple_get_n_fields(entry));
			if (0 != cmp_dtuple_rec(entry, rec, offsets)) {

				trx = trx_get_on_id(trx_id);

				break;
			}
		} else if (!rec_del) {
			/* The delete mark should be set in rec for it to be
			in the state required by prev_version */

			trx = trx_get_on_id(trx_id);

			break;
		}

		if (0 != ut_dulint_cmp(trx_id, prev_trx_id)) {
			/* The versions modified by the trx_id transaction end
			to prev_version: no implicit x-lock */

			break;
		}

		/* prev_version was also written by trx_id: continue
		with the next older version. */
		version = prev_version;
	}/* for (;;) */

exit_func:
	/* The mini-transaction is committed and the last heap freed
	while the kernel mutex is owned. */
	mtr_commit(&mtr);
	mem_heap_free(heap);

	return(trx);
}
/*****************************************************************//**
Decides whether some version of a clustered index record, where the version
>= the current purge view, should have ientry as its secondary index entry.
A version qualifies when it is not delete marked and the secondary index
entry built from it is identified with ientry in the alphabetical
(collation) ordering; exactly in that case TRUE is returned.
@return TRUE if such a version exists, FALSE otherwise */
UNIV_INTERN
ibool
row_vers_old_has_index_entry(
/*=========================*/
	ibool		also_curr,/*!< in: TRUE if also rec is included in the
				versions to search; otherwise only versions
				prior to it are searched */
	const rec_t*	rec,	/*!< in: record in the clustered index; the
				caller must have a latch on the page */
	mtr_t*		mtr,	/*!< in: mtr holding the latch on rec; it will
				also hold the latch on purge_view */
	dict_index_t*	index,	/*!< in: the secondary index */
	const dtuple_t*	ientry)	/*!< in: the secondary index entry */
{
	dict_index_t*	clust_index;
	ulint*		clust_offsets;
	const rec_t*	version;
	rec_t*		prev_version;
	mem_heap_t*	heap;
	mem_heap_t*	old_heap;
	row_ext_t*	ext;
	const dtuple_t*	row;
	const dtuple_t*	entry;
	ulint		comp;
	ulint		err;
	ibool		found	= FALSE;

	ut_ad(mtr_memo_contains_page(mtr, rec, MTR_MEMO_PAGE_X_FIX)
	      || mtr_memo_contains_page(mtr, rec, MTR_MEMO_PAGE_S_FIX));
#ifdef UNIV_SYNC_DEBUG
	ut_ad(!rw_lock_own(&(purge_sys->latch), RW_LOCK_SHARED));
#endif /* UNIV_SYNC_DEBUG */
	mtr_s_lock(&(purge_sys->latch), mtr);

	clust_index = dict_table_get_first_index(index->table);

	comp = page_rec_is_comp(rec);
	ut_ad(!dict_table_is_comp(index->table) == !comp);

	heap = mem_heap_create(1024);
	clust_offsets = rec_get_offsets(rec, clust_index, NULL,
					ULINT_UNDEFINED, &heap);

	if (also_curr && !rec_get_deleted_flag(rec, comp)) {
		/* The stack of versions is locked by mtr, so the prefixes
		of externally stored columns can be fetched safely. */
		row = row_build(ROW_COPY_POINTERS, clust_index,
				rec, clust_offsets, NULL, &ext, heap);
		entry = row_build_index_entry(row, ext, index, heap);

		/* entry == NULL means the record contains unset BLOB
		pointers, i.e. it must be a freshly inserted record. When
		called from row_purge_remove_sec_if_poss_low(), the thread
		holds latches on the clustered and the secondary index.
		An insert works in three steps:

		(1) insert the record to the clustered index
		(2) store the BLOBs and update the BLOB pointers
		(3) insert records to the secondary indexes

		so purge can ignore such a record and delete the secondary
		index record: the inserting thread will add the secondary
		index records itself. */

		/* The comparison cannot be binary: the row may be in the
		middle of a modification where the clustered index record
		already carries a different binary value in a char field,
		although the collation identifies the old and the new
		value. */
		if (entry != NULL && dtuple_coll_cmp(ientry, entry) == 0) {
			found = TRUE;

			goto func_exit;
		}
	}

	/* Walk back through the version chain. Each step builds the previous
	version in a fresh heap and then releases the heap that held the
	version (and offsets) we just stepped away from. */
	for (version = rec;; version = prev_version) {
		old_heap = heap;
		heap = mem_heap_create(1024);

		err = trx_undo_prev_version_build(rec, mtr, version,
						  clust_index, clust_offsets,
						  heap, &prev_version);

		/* frees version and clust_offsets */
		mem_heap_free(old_heap);

		if (err != DB_SUCCESS || prev_version == NULL) {
			/* Versions end here */
			break;
		}

		clust_offsets = rec_get_offsets(prev_version, clust_index,
						NULL, ULINT_UNDEFINED, &heap);

		if (rec_get_deleted_flag(prev_version, comp)) {
			/* Delete marked versions never require ientry */
			continue;
		}

		/* The stack of versions is locked by mtr, so the prefixes
		of externally stored columns can be fetched safely. */
		row = row_build(ROW_COPY_POINTERS, clust_index,
				prev_version, clust_offsets,
				NULL, &ext, heap);
		entry = row_build_index_entry(row, ext, index, heap);

		/* entry == NULL: unset BLOB pointers, a freshly inserted
		record that can be ignored for the same reason as explained
		for the current version above. */

		/* Again the comparison must use the collation rather than
		binary equality, because the secondary index record may
		already have been updated to a different binary value that
		the collation identifies with the old one. */
		if (entry != NULL && dtuple_coll_cmp(ientry, entry) == 0) {
			found = TRUE;

			break;
		}
	}

func_exit:
	mem_heap_free(heap);

	return(found);
}
/*******************************************************************//**
Truncates the index tree associated with a row in SYS_INDEXES table:
optionally frees the existing B-tree, writes FIL_NULL to the PAGE_NO field,
restarts the mini-transaction and creates a new tree for the index.
@return new root page number, or FIL_NULL on failure */
UNIV_INTERN
ulint
dict_truncate_index_tree(
/*=====================*/
	dict_table_t*	table,	/*!< in: the table the index belongs to */
	ulint		space,	/*!< in: 0=truncate,
				nonzero=create the index tree in the
				given tablespace */
	btr_pcur_t*	pcur,	/*!< in/out: persistent cursor pointing to
				record in the clustered index of
				SYS_INDEXES table. The cursor may be
				repositioned in this call. */
	mtr_t*		mtr)	/*!< in: mtr having the latch
				on the record page. The mtr may be
				committed and restarted in this call. */
{
	rec_t*		sys_rec;
	const byte*	field;
	ulint		field_len;
	ulint		root_page_no;
	ulint		zip_size;
	ulint		type;
	index_id_t	index_id;
	dict_index_t*	index;
	ibool		drop	= !space;

	ut_ad(mutex_own(&(dict_sys->mutex)));
	ut_a(!dict_table_is_comp(dict_sys->sys_indexes));

	sys_rec = btr_pcur_get_rec(pcur);

	field = rec_get_nth_field_old(sys_rec,
				      DICT_SYS_INDEXES_PAGE_NO_FIELD,
				      &field_len);
	ut_ad(field_len == 4);

	root_page_no = mtr_read_ulint(field, MLOG_4BYTES, mtr);

	if (drop && root_page_no == FIL_NULL) {
		/* The tree has been freed already: there is nothing to
		drop, only a new tree to create. */
		/* NOTE(review): space stays 0 on this path, so the tree is
		recreated in tablespace 0 - confirm this is intended. */
		ut_print_timestamp(stderr);
		fprintf(stderr, "  InnoDB: Trying to TRUNCATE"
			" a missing index of table %s!\n", table->name);
		drop = FALSE;
	}

	field = rec_get_nth_field_old(sys_rec,
				      DICT_SYS_INDEXES_SPACE_NO_FIELD,
				      &field_len);
	ut_ad(field_len == 4);

	if (drop) {
		/* Truncate in place: use the tablespace recorded in
		SYS_INDEXES instead of the (zero) argument. */
		space = mtr_read_ulint(field, MLOG_4BYTES, mtr);
	}

	zip_size = fil_space_get_zip_size(space);

	if (UNIV_UNLIKELY(zip_size == ULINT_UNDEFINED)) {
		/* It is a single table tablespace and the .ibd file is
		missing: do nothing */
		ut_print_timestamp(stderr);
		fprintf(stderr, "  InnoDB: Trying to TRUNCATE"
			" a missing .ibd file of table %s!\n", table->name);

		return(FIL_NULL);
	}

	field = rec_get_nth_field_old(sys_rec,
				      DICT_SYS_INDEXES_TYPE_FIELD,
				      &field_len);
	ut_ad(field_len == 4);
	type = mach_read_from_4(field);

	/* Field 1 of the SYS_INDEXES record is read as the 8-byte index id */
	field = rec_get_nth_field_old(sys_rec, 1, &field_len);
	ut_ad(field_len == 8);
	index_id = mach_read_from_8(field);

	if (drop) {
		/* We free all the pages but the root page first; this
		operation may span several mini-transactions */
		btr_free_but_not_root(space, zip_size, root_page_no);

		/* Then we free the root page in the same mini-transaction
		where we create the b-tree and write its new root page
		number to the appropriate field in the SYS_INDEXES record:
		this mini-transaction marks the B-tree totally truncated */
		btr_block_get(space, zip_size, root_page_no, RW_X_LATCH,
			      NULL, mtr);

		btr_free_root(space, zip_size, root_page_no, mtr);
	}

	/* We will temporarily write FIL_NULL to the PAGE_NO field
	in SYS_INDEXES, so that the database will not get into an
	inconsistent state in case it crashes between the mtr_commit()
	below and the following mtr_commit() call. */
	page_rec_write_field(sys_rec, DICT_SYS_INDEXES_PAGE_NO_FIELD,
			     FIL_NULL, mtr);

	/* We will need to commit the mini-transaction in order to avoid
	deadlocks in the btr_create() call, because otherwise we would
	be freeing and allocating pages in the same mini-transaction. */
	btr_pcur_store_position(pcur, mtr);
	mtr_commit(mtr);

	mtr_start(mtr);
	btr_pcur_restore_position(BTR_MODIFY_LEAF, pcur, mtr);

	/* Find the index corresponding to this SYS_INDEXES record. */
	index = UT_LIST_GET_FIRST(table->indexes);

	while (index != NULL && index->id != index_id) {
		index = UT_LIST_GET_NEXT(indexes, index);
	}

	if (index != NULL) {
		root_page_no = btr_create(type, space, zip_size,
					  index_id, index, mtr);
		index->page = (unsigned int) root_page_no;

		return(root_page_no);
	}

	ut_print_timestamp(stderr);
	fprintf(stderr,
		"  InnoDB: Index %llu of table %s is missing\n"
		"InnoDB: from the data dictionary during TRUNCATE!\n",
		(ullint) index_id,
		table->name);

	return(FIL_NULL);
}
/*****************************************************************//** Based on a table object, this function builds the entry to be inserted in the SYS_TABLES system table. @return the tuple which should be inserted */ static dtuple_t* dict_create_sys_tables_tuple( /*=========================*/ const dict_table_t* table, /*!< in: table */ mem_heap_t* heap) /*!< in: memory heap from which the memory for the built tuple is allocated */ { dict_table_t* sys_tables; dtuple_t* entry; dfield_t* dfield; byte* ptr; ut_ad(table); ut_ad(heap); sys_tables = dict_sys->sys_tables; entry = dtuple_create(heap, 8 + DATA_N_SYS_COLS); dict_table_copy_types(entry, sys_tables); /* 0: NAME -----------------------------*/ dfield = dtuple_get_nth_field(entry, 0/*NAME*/); dfield_set_data(dfield, table->name, ut_strlen(table->name)); /* 3: ID -------------------------------*/ dfield = dtuple_get_nth_field(entry, 1/*ID*/); ptr = mem_heap_alloc(heap, 8); mach_write_to_8(ptr, table->id); dfield_set_data(dfield, ptr, 8); /* 4: N_COLS ---------------------------*/ dfield = dtuple_get_nth_field(entry, 2/*N_COLS*/); #if DICT_TF_COMPACT != 1 #error #endif ptr = mem_heap_alloc(heap, 4); if (dict_table_is_gcs(table)) /* ±í¶¨ÒåÐÞ¸Ä */ { ut_ad(dict_table_is_comp(table)); mach_write_to_4(ptr, table->n_def | (1 << 31) | (1 << 30)); } else { mach_write_to_4(ptr, table->n_def | ((table->flags & DICT_TF_COMPACT) << 31)); } dfield_set_data(dfield, ptr, 4); /* 5: TYPE -----------------------------*/ dfield = dtuple_get_nth_field(entry, 3/*TYPE*/); ptr = mem_heap_alloc(heap, 4); if (table->flags & (~DICT_TF_COMPACT & ~(~0 << DICT_TF_BITS))) { ut_a(table->flags & DICT_TF_COMPACT); ut_a(dict_table_get_format(table) >= DICT_TF_FORMAT_ZIP); ut_a((table->flags & DICT_TF_ZSSIZE_MASK) <= (DICT_TF_ZSSIZE_MAX << DICT_TF_ZSSIZE_SHIFT)); ut_a(!(table->flags & (~0 << DICT_TF2_BITS))); mach_write_to_4(ptr, table->flags & ~(~0 << DICT_TF_BITS)); } else { mach_write_to_4(ptr, DICT_TABLE_ORDINARY); } dfield_set_data(dfield, 
ptr, 4); /* 6: MIX_ID (obsolete) ---------------------------*/ dfield = dtuple_get_nth_field(entry, 4/*MIX_ID*/); ptr = mem_heap_zalloc(heap, 8); dfield_set_data(dfield, ptr, 8); /* 7: MIX_LEN (additional flags) --------------------------*/ dfield = dtuple_get_nth_field(entry, 5/*MIX_LEN*/); ptr = mem_heap_alloc(heap, 4); mach_write_to_4(ptr, table->flags >> DICT_TF2_SHIFT); ut_ad(table->n_cols_before_alter_table == 0); dfield_set_data(dfield, ptr, 4); /* 8: CLUSTER_NAME ---------------------*/ dfield = dtuple_get_nth_field(entry, 6/*CLUSTER_NAME*/); dfield_set_null(dfield); /* not supported */ /* 9: SPACE ----------------------------*/ dfield = dtuple_get_nth_field(entry, 7/*SPACE*/); ptr = mem_heap_alloc(heap, 4); mach_write_to_4(ptr, table->space); dfield_set_data(dfield, ptr, 4); /*----------------------------------*/ return(entry); }
/************************************************************************** Reports in the undo log of an update or delete marking of a clustered index record. */ static ulint trx_undo_page_report_modify( /*========================*/ /* out: byte offset of the inserted undo log entry on the page if succeed, 0 if fail */ page_t* undo_page, /* in: undo log page */ trx_t* trx, /* in: transaction */ dict_index_t* index, /* in: clustered index where update or delete marking is done */ rec_t* rec, /* in: clustered index record which has NOT yet been modified */ const ulint* offsets, /* in: rec_get_offsets(rec, index) */ upd_t* update, /* in: update vector which tells the columns to be updated; in the case of a delete, this should be set to NULL */ ulint cmpl_info, /* in: compiler info on secondary index updates */ mtr_t* mtr) /* in: mtr */ { dict_table_t* table; upd_field_t* upd_field; ulint first_free; byte* ptr; ulint len; byte* field; ulint flen; ulint pos; dulint roll_ptr; dulint trx_id; ulint bits; ulint col_no; byte* old_ptr; ulint type_cmpl; byte* type_cmpl_ptr; ulint i; ut_a(index->type & DICT_CLUSTERED); ut_ad(rec_offs_validate(rec, index, offsets)); ut_ad(mach_read_from_2(undo_page + TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_TYPE) == TRX_UNDO_UPDATE); table = index->table; first_free = mach_read_from_2(undo_page + TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE); ptr = undo_page + first_free; ut_ad(first_free <= UNIV_PAGE_SIZE); if (trx_undo_left(undo_page, ptr) < 50) { /* NOTE: the value 50 must be big enough so that the general fields written below fit on the undo log page */ return(0); } /* Reserve 2 bytes for the pointer to the next undo log record */ ptr += 2; /* Store first some general parameters to the undo log */ if (update) { if (rec_get_deleted_flag(rec, dict_table_is_comp(table))) { type_cmpl = TRX_UNDO_UPD_DEL_REC; } else { type_cmpl = TRX_UNDO_UPD_EXIST_REC; } } else { type_cmpl = TRX_UNDO_DEL_MARK_REC; } type_cmpl = type_cmpl | (cmpl_info * 
TRX_UNDO_CMPL_INFO_MULT); mach_write_to_1(ptr, type_cmpl); type_cmpl_ptr = ptr; ptr++; len = mach_dulint_write_much_compressed(ptr, trx->undo_no); ptr += len; len = mach_dulint_write_much_compressed(ptr, table->id); ptr += len; /*----------------------------------------*/ /* Store the state of the info bits */ bits = rec_get_info_bits(rec, dict_table_is_comp(table)); mach_write_to_1(ptr, bits); ptr += 1; /* Store the values of the system columns */ field = rec_get_nth_field(rec, offsets, dict_index_get_sys_col_pos( index, DATA_TRX_ID), &len); ut_ad(len == DATA_TRX_ID_LEN); trx_id = trx_read_trx_id(field); field = rec_get_nth_field(rec, offsets, dict_index_get_sys_col_pos( index, DATA_ROLL_PTR), &len); ut_ad(len == DATA_ROLL_PTR_LEN); roll_ptr = trx_read_roll_ptr(field); len = mach_dulint_write_compressed(ptr, trx_id); ptr += len; len = mach_dulint_write_compressed(ptr, roll_ptr); ptr += len; /*----------------------------------------*/ /* Store then the fields required to uniquely determine the record which will be modified in the clustered index */ for (i = 0; i < dict_index_get_n_unique(index); i++) { field = rec_get_nth_field(rec, offsets, i, &flen); if (trx_undo_left(undo_page, ptr) < 4) { return(0); } len = mach_write_compressed(ptr, flen); ptr += len; if (flen != UNIV_SQL_NULL) { if (trx_undo_left(undo_page, ptr) < flen) { return(0); } ut_memcpy(ptr, field, flen); ptr += flen; } } /*----------------------------------------*/ /* Save to the undo log the old values of the columns to be updated. 
*/ if (update) { if (trx_undo_left(undo_page, ptr) < 5) { return(0); } len = mach_write_compressed(ptr, upd_get_n_fields(update)); ptr += len; for (i = 0; i < upd_get_n_fields(update); i++) { upd_field = upd_get_nth_field(update, i); pos = upd_field->field_no; /* Write field number to undo log */ if (trx_undo_left(undo_page, ptr) < 5) { return(0); } len = mach_write_compressed(ptr, pos); ptr += len; /* Save the old value of field */ field = rec_get_nth_field(rec, offsets, pos, &flen); if (trx_undo_left(undo_page, ptr) < 5) { return(0); } if (rec_offs_nth_extern(offsets, pos)) { /* If a field has external storage, we add to flen the flag */ len = mach_write_compressed( ptr, UNIV_EXTERN_STORAGE_FIELD + flen); /* Notify purge that it eventually has to free the old externally stored field */ trx->update_undo->del_marks = TRUE; *type_cmpl_ptr = *type_cmpl_ptr | TRX_UNDO_UPD_EXTERN; } else { len = mach_write_compressed(ptr, flen); } ptr += len; if (flen != UNIV_SQL_NULL) { if (trx_undo_left(undo_page, ptr) < flen) { return(0); } ut_memcpy(ptr, field, flen); ptr += flen; } } } /*----------------------------------------*/ /* In the case of a delete marking, and also in the case of an update where any ordering field of any index changes, store the values of all columns which occur as ordering fields in any index. This info is used in the purge of old versions where we use it to build and search the delete marked index records, to look if we can remove them from the index tree. Note that starting from 4.0.14 also externally stored fields can be ordering in some index. But we always store at least 384 first bytes locally to the clustered index record, which means we can construct the column prefix fields in the index from the stored data. 
*/ if (!update || !(cmpl_info & UPD_NODE_NO_ORD_CHANGE)) { trx->update_undo->del_marks = TRUE; if (trx_undo_left(undo_page, ptr) < 5) { return(0); } old_ptr = ptr; /* Reserve 2 bytes to write the number of bytes the stored fields take in this undo record */ ptr += 2; for (col_no = 0; col_no < dict_table_get_n_cols(table); col_no++) { const dict_col_t* col = dict_table_get_nth_col(table, col_no); if (col->ord_part > 0) { pos = dict_index_get_nth_col_pos(index, col_no); /* Write field number to undo log */ if (trx_undo_left(undo_page, ptr) < 5) { return(0); } len = mach_write_compressed(ptr, pos); ptr += len; /* Save the old value of field */ field = rec_get_nth_field(rec, offsets, pos, &flen); if (trx_undo_left(undo_page, ptr) < 5) { return(0); } len = mach_write_compressed(ptr, flen); ptr += len; if (flen != UNIV_SQL_NULL) { if (trx_undo_left(undo_page, ptr) < flen) { return(0); } ut_memcpy(ptr, field, flen); ptr += flen; } } } mach_write_to_2(old_ptr, ptr - old_ptr); } /*----------------------------------------*/ /* Write pointers to the previous and the next undo log records */ if (trx_undo_left(undo_page, ptr) < 2) { return(0); } mach_write_to_2(ptr, first_free); ptr += 2; mach_write_to_2(undo_page + first_free, ptr - undo_page); mach_write_to_2(undo_page + TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE, ptr - undo_page); /* Write to the REDO log about this change in the UNDO log */ trx_undof_page_add_undo_rec_log(undo_page, first_free, ptr - undo_page, mtr); return(first_free); }
/***************************************************************
Tries to remove a secondary index entry with a leaf-only operation,
i.e. without modifying the index tree structure. The delete may also be
buffered by the search itself.
@return	TRUE if success or if not found; FALSE if the record was found
and may be purged but the optimistic delete failed */
static
ibool
row_purge_remove_sec_if_poss_leaf(
/*==============================*/
	purge_node_t*	node,	/*!< in: row purge node */
	dict_index_t*	index,	/*!< in: index */
	const dtuple_t*	entry)	/*!< in: index entry */
{
	btr_pcur_t		pcur;
	mtr_t			mtr;
	enum row_search_result	outcome;
	ibool			success	= TRUE;

	log_free_check();

	mtr_start(&mtr);

	/* The cursor carries the purge node for the call to
	row_purge_poss_sec(), and the query thread so that
	ibuf_insert_low() will be able to invoke thd_get_trx(). */
	pcur.btr_cur.purge_node = node;
	pcur.btr_cur.thr = que_node_get_parent(node);

	outcome = row_search_index_entry(index, entry,
					 BTR_MODIFY_LEAF | BTR_DELETE,
					 &pcur, &mtr);

	switch (outcome) {
	case ROW_FOUND:
		/* Purge the record only when no version of the row still
		needs it. */
		if (row_purge_poss_sec(node, index, entry)) {
			btr_cur_t*	cursor = btr_pcur_get_btr_cur(&pcur);

			/* Only delete-marked records should be purged. */
			ut_ad(REC_INFO_DELETED_FLAG
			      & rec_get_info_bits(
				      btr_cur_get_rec(cursor),
				      dict_table_is_comp(index->table)));

			if (!btr_cur_optimistic_delete(cursor, &mtr)) {
				/* The index entry could not be deleted. */
				success = FALSE;
			}
		}
		/* fall through to the common exit */
	case ROW_NOT_DELETED_REF:
		/* The index entry is still needed. */
	case ROW_BUFFERED:
		/* The deletion was buffered. */
	case ROW_NOT_FOUND:
		/* The index entry does not exist, nothing to do. */
		btr_pcur_close(&pcur);
		mtr_commit(&mtr);

		return(success);
	}

	/* Not reached for any valid search result */
	ut_error;
	return(FALSE);
}
/***************************************************************
Removes a secondary index entry if possible, using an operation that is
allowed to modify the index tree. Does not try to buffer the delete.
@return	TRUE if success or if not found; FALSE if the delete ran out of
file space */
static
ibool
row_purge_remove_sec_if_poss_tree(
/*==============================*/
	purge_node_t*	node,	/*!< in: row purge node */
	dict_index_t*	index,	/*!< in: index */
	const dtuple_t*	entry)	/*!< in: index entry */
{
	mtr_t			mtr;
	btr_pcur_t		pcur;
	btr_cur_t*		cursor;
	enum row_search_result	outcome;
	ulint			err;
	ibool			success	= TRUE;

	log_free_check();
	mtr_start(&mtr);

	outcome = row_search_index_entry(index, entry, BTR_MODIFY_TREE,
					 &pcur, &mtr);

	switch (outcome) {
	case ROW_FOUND:
		break;
	case ROW_NOT_FOUND:
		/* Not found. This is a legitimate condition. In a
		rollback, InnoDB will remove secondary recs that would
		be purged anyway. Then the actual purge will not find
		the secondary index record. Also, the purge itself is
		eager: if it comes to consider a secondary index
		record, and notices it does not need to exist in the
		index, it will remove it. Then if/when the purge
		comes to consider the secondary index record a second
		time, it will not exist any more in the index. */
		goto func_exit;
	case ROW_BUFFERED:
	case ROW_NOT_DELETED_REF:
		/* These are invalid outcomes, because the mode passed
		to row_search_index_entry() did not include any of the
		flags BTR_INSERT, BTR_DELETE, or BTR_DELETE_MARK. */
		ut_error;
	}

	cursor = btr_pcur_get_btr_cur(&pcur);

	/* The index record must stay if some later version of the row,
	which cannot be purged yet, requires its existence. */
	if (!row_purge_poss_sec(node, index, entry)) {

		goto func_exit;
	}

	/* Remove the index record, which should have been marked for
	deletion. */
	ut_ad(REC_INFO_DELETED_FLAG
	      & rec_get_info_bits(btr_cur_get_rec(cursor),
				  dict_table_is_comp(index->table)));

	btr_cur_pessimistic_delete(&err, FALSE, cursor, RB_NONE, &mtr);

	if (err == DB_OUT_OF_FILE_SPACE) {
		success = FALSE;
	} else if (err != DB_SUCCESS) {
		/* No other error is expected from the delete */
		ut_error;
	}

func_exit:
	btr_pcur_close(&pcur);
	mtr_commit(&mtr);

	return(success);
}
/********************************************************//**
Opens a buffer for mlog, writes the initial log record and,
if needed, the field lengths of an index.

For a record in the old (non-compact) format only the initial log record is
written. For a compact-format record the following 2-byte values are
appended after the initial log record:
  - the number of index fields n (with bit 0x8000 set for a GCS clustered
    index, as reported by dict_index_is_gcs_clust_after_alter_table()),
  - for such a GCS clustered index only: index->n_fields_before_alter,
  - dict_index_get_n_unique_in_tree(index),
  - one value per field: the fixed length, or 0x7fff for a variable-length
    field whose maximum length exceeds 255 (or a BLOB), with bit 0x8000 set
    when the column is NOT NULL.
The mlog buffer is opened in pieces of at most DYN_ARRAY_DATA_SIZE bytes.
@return	buffer, NULL if log mode MTR_LOG_NONE */
UNIV_INTERN
byte*
mlog_open_and_write_index(
/*======================*/
	mtr_t*		mtr,	/*!< in: mtr */
	const byte*	rec,	/*!< in: index record or page */
	dict_index_t*	index,	/*!< in: record descriptor */
	byte		type,	/*!< in: log item type */
	ulint		size)	/*!< in: requested buffer size in bytes
				(if 0, calls mlog_close() and returns NULL) */
{
	byte*		log_ptr;
	const byte*	log_start;
	const byte*	log_end;

	ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table));

	if (!page_rec_is_comp(rec)) {
		log_start = log_ptr = mlog_open(mtr, 11 + size);
		if (!log_ptr) {
			return(NULL); /* logging is disabled */
		}
		log_ptr = mlog_write_initial_log_record_fast(rec, type,
							     log_ptr, mtr);
		/* NOTE(review): log_end is computed from the already
		advanced log_ptr, so on this path the final
		"log_ptr + size > log_end" check below can never be true -
		confirm this is intended. */
		log_end = log_ptr + 11 + size;
	} else {
		ulint	i;
		ulint	n	= dict_index_get_n_fields(index);
		ibool	is_gcs_cluster = dict_index_is_gcs_clust_after_alter_table(index);
		/* total size needed */
		/* The redo log record may need two extra bytes (for the GCS
		field count), so total must be sized according to the actual
		case! */
		ulint	total	= 11 + size + (n + 2 + (is_gcs_cluster ? 1 : 0)) * 2;
		ulint	alloc	= total;
		/* allocate at most DYN_ARRAY_DATA_SIZE at a time */
		if (alloc > DYN_ARRAY_DATA_SIZE) {
			alloc = DYN_ARRAY_DATA_SIZE;
		}
		log_start = log_ptr = mlog_open(mtr, alloc);
		if (!log_ptr) {
			return(NULL); /* logging is disabled */
		}
		log_end = log_ptr + alloc;
		log_ptr = mlog_write_initial_log_record_fast(rec, type,
							     log_ptr, mtr);

		/* Before the first ALTER TABLE, every GCS table can be
		treated as a compact table, even during redo, so the redo
		log stays compatible! */
		if (is_gcs_cluster) {
			/* Disabled assertion, kept for reference:
			ut_ad(rec_is_gcs(rec) || rec == page_align(rec)
			      || rec_get_status(rec) & REC_STATUS_NODE_PTR);
			rec may be the page header itself, e.g. for
			MLOG_COMP_LIST_END_COPY_CREATED log records */
			mach_write_to_2(log_ptr, n | 0x8000); /* mark as a GCS table */
		} else {
			ut_ad(rec == page_align(rec) || !rec_is_gcs(rec));
			mach_write_to_2(log_ptr, n);
		}

		/* For a GCS clustered index, also log the number of
		clustered index fields before the first ALTER TABLE */
		if (is_gcs_cluster) {
			log_ptr += 2;
			mach_write_to_2(log_ptr, (ulint)index->n_fields_before_alter);
			ut_ad(!index->n_fields_before_alter == !dict_index_is_gcs_clust_after_alter_table(index));
		}

		/* Step over the last 2-byte value written above */
		log_ptr += 2;
		mach_write_to_2(log_ptr,
				dict_index_get_n_unique_in_tree(index));
		log_ptr += 2;
		for (i = 0; i < n; i++) {
			dict_field_t*		field;
			const dict_col_t*	col;
			ulint			len;

			field = dict_index_get_nth_field(index, i);
			col = dict_field_get_col(field);
			len = field->fixed_len;
			ut_ad(len < 0x7fff);
			if (len == 0
			    && (col->len > 255 || col->mtype == DATA_BLOB)) {
				/* variable-length field
				with maximum length > 255 */
				len = 0x7fff;
			}
			if (col->prtype & DATA_NOT_NULL) {
				/* The top bit flags a NOT NULL column */
				len |= 0x8000;
			}
			if (log_ptr + 2 > log_end) {
				/* The current piece of the buffer is full:
				close it and open a new one for (at most
				DYN_ARRAY_DATA_SIZE bytes of) the remainder */
				mlog_close(mtr, log_ptr);
				ut_a(total > (ulint) (log_ptr - log_start));
				total -= log_ptr - log_start;
				alloc = total;
				if (alloc > DYN_ARRAY_DATA_SIZE) {
					alloc = DYN_ARRAY_DATA_SIZE;
				}
				log_start = log_ptr = mlog_open(mtr, alloc);
				if (!log_ptr) {
					return(NULL); /* logging is disabled */
				}
				log_end = log_ptr + alloc;
			}
			mach_write_to_2(log_ptr, len);
			log_ptr += 2;
		}
	}
	if (size == 0) {
		/* The caller needs no further space: finish the record */
		mlog_close(mtr, log_ptr);
		log_ptr = NULL;
	} else if (log_ptr + size > log_end) {
		/* The requested size does not fit in what is left of the
		current piece: close it and open a new buffer of size bytes */
		mlog_close(mtr, log_ptr);
		log_ptr = mlog_open(mtr, size);
	}
	return(log_ptr);
}