cache_rec_ptr_t db_csh_getn(block_id block)
{
	cache_rec_ptr_t		hdr, q0, start_cr, cr;
	bt_rec_ptr_t		bt;
	unsigned int		lcnt, ocnt;
	int			rip, max_ent, pass1, pass2, pass3;
	int4			flsh_trigger;
	uint4			r_epid, dummy;
	sgmnt_addrs		*csa;
	sgmnt_data_ptr_t	csd;
	srch_blk_status		*tp_srch_status;

	error_def(ERR_BUFRDTIMEOUT);
	error_def(ERR_INVALIDRIP);

	csa = cs_addrs;
	csd = csa->hdr;
	assert(csa->now_crit);
	assert(csa == &FILE_INFO(gv_cur_region)->s_addrs);
	max_ent = csd->n_bts;
	cr = (cache_rec_ptr_t)GDS_REL2ABS(csa->nl->cur_lru_cache_rec_off);
	hdr = csa->acc_meth.bg.cache_state->cache_array + (block % csd->bt_buckets);
	start_cr = csa->acc_meth.bg.cache_state->cache_array + csd->bt_buckets;
	pass1 = max_ent;	/* skip referred or dirty or read-into cache records */
	pass2 = 2 * max_ent;	/* skip referred cache records */
	pass3 = 3 * max_ent;	/* skip nothing */
	INCR_DB_CSH_COUNTER(csa, n_db_csh_getns, 1);
	for (lcnt = 0; ; lcnt++)
	{
		if (lcnt > pass3)
		{
			BG_TRACE_PRO(wc_blocked_db_csh_getn_loopexceed);
			assert(FALSE);
			break;
		}
		cr++;
		if (cr == start_cr + max_ent)
			cr = start_cr;
		VMS_ONLY(
			if ((lcnt == pass1) || (lcnt == pass2))
				wcs_wtfini(gv_cur_region);
		)
		if (TRUE == cr->refer && lcnt < pass2)
		{	/* in passes 1 & 2, set refer to FALSE and skip; in the third pass attempt reuse even if TRUE == refer */
			cr->refer = FALSE;
			continue;
		}
		if (TRUE == cr->in_cw_set)
		{	/* this process already owns it - skip it */
			cr->refer = TRUE;
			continue;
		}
		if (CDB_STAGNATE <= t_tries || mu_reorg_process)
		{	/* Prevent stepping on self when crit for entire transaction.
			 * This is done by looking up in sgm_info_ptr->blks_in_use and cw_stagnate for presence of the block.
			 * The following two hashtable lookups are not similar, since in TP, sgm_info_ptr->blks_in_use
			 * is updated to the latest cw_stagnate list of blocks only in tp_hist().
			 * Also note that the lookup in sgm_info_ptr->blks_in_use reuses blocks that don't have cse's.
			 * This is to allow big-read TP transactions which may use up more than the available global buffers.
			 * There is one issue here in that a block that has been only read till now may be stepped upon here
			 * but may later be needed for update. It is handled by updating the block's corresponding
			 * entry in the set of histories (sgm_info_ptr->first_tp_hist[index] structure) to hold the
			 * "cr" and "cycle" of the t_qread done for the block when it was intended to be changed for the
			 * first time within the transaction since otherwise the transaction would restart due to a
			 * cdb_sc_lostcr status. Note that "tn" (read_tn of the block) in the first_tp_hist will still
			 * remain the "tn" when the block was first read within this transaction to ensure the block
			 * hasn't been modified since the start of the transaction. Once we intend on changing the
			 * block i.e. srch_blk_status->ptr is non-NULL, we ensure in the code below not to step on it.
			 * [tp_hist() is the routine that updates the "cr", "cycle" and "tn" of the block].
			 * Note that usually in a transaction the first_tp_hist[] structure holds the "cr", "cycle", and "tn"
			 * of the first t_qread of the block within that transaction. The above is the only exception.
			 * Also note that for blocks in cw_stagnate (i.e. current TP mini-action), we don't reuse any of
			 * them even if they don't have a cse. This is to ensure that the current action doesn't
			 * encounter a restart due to cdb_sc_lostcr in tp_hist() even in the fourth retry.
			 */
			if (dollar_tlevel
			    && (tp_srch_status = (srch_blk_status *)lookup_hashtab_ent(sgm_info_ptr->blks_in_use,
											(void *)cr->blk, &dummy))
			    && tp_srch_status->ptr)
			{	/* this process is already using the block - skip it */
				cr->refer = TRUE;
				continue;
			}
			if (NULL != lookup_hashtab_ent(cw_stagnate, (void *)cr->blk, &dummy))
			{
				cr->refer = TRUE;
				continue;
			}
		}
		if (cr->dirty)
		{	/* Note that in Unix, it is possible that we see a stale value of cr->dirty (possible if a
			 * concurrent wcs_wtstart() has reset dirty to 0 but that update did not reach us yet). In this
			 * case the call to wcs_get_space() below will do the necessary memory barrier instructions
			 * (through calls to aswp()) which will allow us to see the non-stale value of cr->dirty.
			 *
			 * It is also possible that cr->dirty is non-zero but < cr->flushed_dirty_tn. In this case, wcs_get_space
			 * done below will return FALSE forcing a cache-rebuild which will fix this situation.
			 *
			 * In VMS, another process cannot be concurrently resetting cr->dirty to 0 as the resetting routine
			 * is wcs_wtfini(), which executes in crit, and no other process can be in crit since we hold it now.
			 */
			if (gv_cur_region->read_only)
				continue;
			if (lcnt < pass1)
			{
				if (!csa->timer && (csa->nl->wcs_timers < 1))
					wcs_timer_start(gv_cur_region, FALSE);
				continue;
			}
			BG_TRACE_PRO(db_csh_getn_flush_dirty);
			if (FALSE == wcs_get_space(gv_cur_region, 0, cr))
			{	/* failed to flush it out - force a rebuild */
				BG_TRACE_PRO(wc_blocked_db_csh_getn_wcsstarvewrt);
				assert(FALSE);
				break;
			}
			assert(0 == cr->dirty);
		}
		UNIX_ONLY(
			/* the cache-record is not free for reuse until the write-latch value becomes LATCH_CLEAR.
			 * In VMS, resetting the write-latch value occurs in wcs_wtfini(), which runs in crit, so we are fine.
			 * In Unix, this resetting is done by wcs_wtstart() which is out-of-crit. Therefore, we need to
			 * wait for this value to be LATCH_CLEAR before reusing this cache-record.
			 * Note that we are examining the write-latch value without holding the interlock. It is ok to do
			 * this because the only two routines that modify the latch value are bg_update() and
			 * wcs_wtstart(). The former cannot be concurrently executing because we are in crit.
			 * The latter will not update the latch value unless this cache-record is dirty. But in this
			 * case we would have most likely gone through the if (cr->dirty) check above. Most likely
			 * because there is one rare possibility where a concurrent wcs_wtstart() has set cr->dirty
			 * to 0 but not yet cleared the latch. In that case we wait for the latch to be cleared.
			 * In all other cases, nobody has been modifying the latch since we got crit and therefore
			 * it is safe to observe the value of the latch without holding the interlock.
			 */
			if (LATCH_CLEAR != WRITE_LATCH_VAL(cr))
			{	/* possible if a concurrent wcs_wtstart() has set cr->dirty to 0 but not yet
				 * cleared the latch. this should be very rare though.
				 */
				if (lcnt < pass2)
					continue;	/* try to find some other cache-record to reuse until the 3rd pass */
				for (ocnt = 1; (MAXWRTLATCHWAIT >= ocnt) && (LATCH_CLEAR != WRITE_LATCH_VAL(cr)); ocnt++)
					wcs_sleep(SLEEP_WRTLATCHWAIT);	/* since it is a short lock, sleep the minimum */
				if (MAXWRTLATCHWAIT <= ocnt)
				{
					BG_TRACE_PRO(db_csh_getn_wrt_latch_stuck);
					assert(FALSE);
					continue;
				}
			}
		)
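/* Illustrative sketch (not part of the original source): a minimal, self-contained model of the
 * pass1/pass2/pass3 reclamation loop in db_csh_getn above.  Pass 1 skips records that are referred,
 * dirty, or in use by this transaction; pass 2 skips only referred records, clearing the refer bit as
 * it goes (second-chance style); pass 3 takes whatever it finds.  All names here (toy_cache_rec,
 * toy_getn, N_RECS) are hypothetical and model only the control flow, not the real cache-record
 * layout, flushing, or interlocks.
 */
#include <stdbool.h>

#define N_RECS	8

typedef struct
{
	bool	refer;		/* recently referenced (second-chance bit) */
	bool	dirty;		/* would need to be flushed before reuse */
	bool	in_use;		/* owned by the current transaction */
} toy_cache_rec;

/* Returns the index of a reclaimable record, or -1 if three full passes find nothing. */
static int toy_getn(toy_cache_rec recs[N_RECS], int start)
{
	int	idx, lcnt;
	int	pass1 = N_RECS, pass2 = 2 * N_RECS, pass3 = 3 * N_RECS;

	idx = start;
	for (lcnt = 0; lcnt <= pass3; lcnt++)
	{
		idx = (idx + 1) % N_RECS;		/* circular scan, like cr++ with wraparound to start_cr */
		if (recs[idx].refer && (lcnt < pass2))
		{	/* passes 1 & 2: clear the refer bit and move on */
			recs[idx].refer = false;
			continue;
		}
		if (recs[idx].in_use)
			continue;			/* never steal a record this transaction already owns */
		if (recs[idx].dirty && (lcnt < pass1))
			continue;			/* pass 1: prefer clean records; later passes would flush first */
		return idx;
	}
	return -1;					/* analogous to the loopexceed trace + break above */
}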
sm_uc_ptr_t t_qread(block_id blk, sm_int_ptr_t cycle, cache_rec_ptr_ptr_t cr_out) /* cycle is used in t_end to detect if the buffer has been refreshed since the t_qread */ { int4 status; uint4 blocking_pid; cache_rec_ptr_t cr; bt_rec_ptr_t bt; boolean_t clustered, hold_onto_crit, was_crit; int dummy, lcnt, ocnt; cw_set_element *cse; off_chain chain1; register sgmnt_addrs *csa; register sgmnt_data_ptr_t csd; enum db_ver ondsk_blkver; int4 dummy_errno; boolean_t already_built, is_mm, reset_first_tp_srch_status, set_wc_blocked, sleep_invoked; ht_ent_int4 *tabent; srch_blk_status *blkhist; trans_num dirty, blkhdrtn; sm_uc_ptr_t buffaddr; uint4 stuck_cnt = 0; boolean_t lcl_blk_free; node_local_ptr_t cnl; lcl_blk_free = block_is_free; block_is_free = FALSE; /* Reset to FALSE so that if t_qread fails below, we don't have an incorrect state of this var */ first_tp_srch_status = NULL; reset_first_tp_srch_status = FALSE; csa = cs_addrs; csd = csa->hdr; INCR_DB_CSH_COUNTER(csa, n_t_qreads, 1); is_mm = (dba_mm == csd->acc_meth); /* We better hold crit in the final retry (TP & non-TP). Only exception is journal recovery */ assert((t_tries < CDB_STAGNATE) || csa->now_crit || mupip_jnl_recover); if (dollar_tlevel) { assert(sgm_info_ptr); if (0 != sgm_info_ptr->cw_set_depth) { chain1 = *(off_chain *)&blk; if (1 == chain1.flag) { assert(sgm_info_ptr->cw_set_depth); if ((int)chain1.cw_index < sgm_info_ptr->cw_set_depth) tp_get_cw(sgm_info_ptr->first_cw_set, (int)chain1.cw_index, &cse); else { assert(FALSE == csa->now_crit); rdfail_detail = cdb_sc_blknumerr; return (sm_uc_ptr_t)NULL; } } else { if (NULL != (tabent = lookup_hashtab_int4(sgm_info_ptr->blks_in_use, (uint4 *)&blk))) first_tp_srch_status = tabent->value; else first_tp_srch_status = NULL; ASSERT_IS_WITHIN_TP_HIST_ARRAY_BOUNDS(first_tp_srch_status, sgm_info_ptr); cse = first_tp_srch_status ? first_tp_srch_status->cse : NULL; } assert(!cse || !cse->high_tlevel); assert(!chain1.flag || cse); if (cse) { /* transaction has modified the sought after block */ if ((gds_t_committed != cse->mode) || (n_gds_t_op < cse->old_mode)) { /* Changes have not been committed to shared memory, i.e. still in private memory. * Build block in private buffer if not already done and return the same. */ assert(gds_t_writemap != cse->mode); if (FALSE == cse->done) { /* out of date, so make it current */ assert(gds_t_committed != cse->mode); already_built = (NULL != cse->new_buff); /* Validate the block's search history right after building a private copy. * This is not needed in case gvcst_search is going to reuse the clue's search * history and return (because tp_hist will do the validation of this block). * But if gvcst_search decides to do a fresh traversal (because the clue does not * cover the path of the current input key etc.) the block build that happened now * will not get validated in tp_hist since it will instead be given the current * key's search history path (a totally new path) for validation. Since a private * copy of the block has been built, tp_tend would also skip validating this block * so it is necessary that we validate the block right here. Since it is tricky to * accurately differentiate between the two cases, we do the validation * unconditionally here (besides it is only a few if checks done per block build * so it is considered okay performance-wise). 
*/ gvcst_blk_build(cse, (uchar_ptr_t)cse->new_buff, 0); assert(NULL != cse->blk_target); if (!already_built && !chain1.flag) { buffaddr = first_tp_srch_status->buffaddr; cr = first_tp_srch_status->cr; assert((is_mm || cr) && buffaddr); blkhdrtn = ((blk_hdr_ptr_t)buffaddr)->tn; if (TP_IS_CDB_SC_BLKMOD3(cr, first_tp_srch_status, blkhdrtn)) { assert(CDB_STAGNATE > t_tries); rdfail_detail = cdb_sc_blkmod; /* should this be something else */ TP_TRACE_HIST_MOD(blk, gv_target, tp_blkmod_t_qread, cs_data, first_tp_srch_status->tn, blkhdrtn, ((blk_hdr_ptr_t)buffaddr)->levl); return (sm_uc_ptr_t)NULL; } if (!is_mm && ((first_tp_srch_status->cycle != cr->cycle) || (first_tp_srch_status->blk_num != cr->blk))) { assert(CDB_STAGNATE > t_tries); rdfail_detail = cdb_sc_lostcr; /* should this be something else */ return (sm_uc_ptr_t)NULL; } } cse->done = TRUE; } *cycle = CYCLE_PVT_COPY; *cr_out = 0; return (sm_uc_ptr_t)cse->new_buff; } else { /* Block changes are already committed to shared memory (possible if we are in TP * in the 2nd phase of M-Kill in gvcst_expand_free_subtree.c). In this case, read * block from shared memory; do not look at private memory (i.e. cse) as that might * not be as uptodate as shared memory. */ assert(csa->now_crit); /* gvcst_expand_free_subtree does t_qread in crit */ /* If this block was newly created as part of the TP transaction, it should not be killed * as part of the 2nd phase of M-kill. This is because otherwise the block's cse would * have had an old_mode of kill_t_create in which case we would not have come into this * else block. Assert accordingly. */ assert(!chain1.flag); first_tp_srch_status = NULL; /* do not use any previous srch_hist information */ } } } else { if (NULL != (tabent = lookup_hashtab_int4(sgm_info_ptr->blks_in_use, (uint4 *)&blk))) first_tp_srch_status = tabent->value; else first_tp_srch_status = NULL; } ASSERT_IS_WITHIN_TP_HIST_ARRAY_BOUNDS(first_tp_srch_status, sgm_info_ptr); if (!is_mm && first_tp_srch_status) { cr = first_tp_srch_status->cr; assert(cr && !first_tp_srch_status->cse); if (first_tp_srch_status->cycle == cr->cycle) { *cycle = first_tp_srch_status->cycle; *cr_out = cr; cr->refer = TRUE; if (CDB_STAGNATE <= t_tries) /* mu_reorg doesn't use TP else should have an || for that */ CWS_INSERT(blk); return (sm_uc_ptr_t)first_tp_srch_status->buffaddr; } else { /* Block was already part of the read-set of this transaction, but got recycled in the cache. * Allow block recycling by resetting first_tp_srch_status for this blk to reflect the new * buffer, cycle and cache-record. tp_hist (invoked much later) has validation checks to detect * if block recycling happened within the same mini-action and restart in that case. * Updating first_tp_srch_status has to wait until the end of t_qread since only then do we know * the values to update to. Set a variable that will enable the updation before returning. * Also assert that if we are in the final retry, we are never in a situation where we have a * block that got recycled since the start of the current mini-action. This is easily detected since * as part of the final retry we maintain a hash-table "cw_stagnate" that holds the blocks that * have been read as part of the current mini-action until now. 
*/ assert(CDB_STAGNATE > t_tries || (NULL == lookup_hashtab_int4(&cw_stagnate, (uint4 *)&blk))); reset_first_tp_srch_status = TRUE; } } } if ((blk >= csa->ti->total_blks) || (blk < 0)) { /* requested block out of range; could occur because of a concurrency conflict */ if ((&FILE_INFO(gv_cur_region)->s_addrs != csa) || (csd != cs_data)) GTMASSERT; assert(FALSE == csa->now_crit); rdfail_detail = cdb_sc_blknumerr; return (sm_uc_ptr_t)NULL; } if (is_mm) { *cycle = CYCLE_SHRD_COPY; *cr_out = 0; return (sm_uc_ptr_t)(mm_read(blk)); } # ifdef GTM_CRYPT /* If database is encrypted, check if encryption initialization went fine for this database. If not, * do not let process proceed as it could now potentially get a peek at the desired data from the * decrypted shared memory global buffers (read in from disk by other processes) without having to go to disk. * If DSE, allow for a special case where it is trying to dump a local bitmap block. In this case, DSE * can continue to run fine (even if encryption initialization failed) since bitmap blocks are unencrypted. */ if (csa->encrypt_init_status && (!dse_running || !IS_BITMAP_BLK(blk))) GC_RTS_ERROR(csa->encrypt_init_status, gv_cur_region->dyn.addr->fname); # endif assert(dba_bg == csd->acc_meth); assert(!first_tp_srch_status || !first_tp_srch_status->cr || first_tp_srch_status->cycle != first_tp_srch_status->cr->cycle); if (FALSE == (clustered = csd->clustered)) bt = NULL; was_crit = csa->now_crit; ocnt = 0; cnl = csa->nl; set_wc_blocked = FALSE; /* to indicate whether cnl->wc_blocked was set to TRUE by us */ hold_onto_crit = csa->hold_onto_crit; /* note down in local to avoid csa-> dereference in multiple usages below */ do { if (NULL == (cr = db_csh_get(blk))) { /* not in memory */ if (clustered && (NULL != (bt = bt_get(blk))) && (FALSE == bt->flushing)) bt = NULL; if (!csa->now_crit) { assert(!hold_onto_crit); if (NULL != bt) { /* at this point, bt is not NULL only if clustered and flushing - wait no crit */ assert(clustered); wait_for_block_flush(bt, blk); /* try for no other node currently writing the block */ } if ((csd->flush_trigger <= cnl->wcs_active_lvl) && (FALSE == gv_cur_region->read_only)) JNL_ENSURE_OPEN_WCS_WTSTART(csa, gv_cur_region, 0, dummy_errno); /* a macro that dclast's "wcs_wtstart" and checks for errors etc. */ grab_crit(gv_cur_region); cr = db_csh_get(blk); /* in case blk arrived before crit */ } if (clustered && (NULL != (bt = bt_get(blk))) && (TRUE == bt->flushing)) { /* Once crit, need to assure that if clustered, that flushing is [still] complete * If it isn't, we missed an entire WM cycle and have to wait for another node to finish */ wait_for_block_flush(bt, blk); /* ensure no other node currently writing the block */ } if (NULL == cr) { /* really not in memory - must get a new buffer */ assert(csa->now_crit); cr = db_csh_getn(blk); if (CR_NOTVALID == (sm_long_t)cr) { assert(cnl->wc_blocked); /* only reason we currently know wcs_get_space could fail */ assert(gtm_white_box_test_case_enabled); SET_TRACEABLE_VAR(cnl->wc_blocked, TRUE); BG_TRACE_PRO_ANY(csa, wc_blocked_t_qread_db_csh_getn_invalid_blk); set_wc_blocked = TRUE; break; } assert(0 <= cr->read_in_progress); *cycle = cr->cycle; cr->tn = csd->trans_hist.curr_tn; /* Record history of most recent disk reads only in dbg builds for now. Although the macro * is just a couple dozen instructions, it is done while holding crit so we want to avoid * delaying crit unless really necessary. 
Whoever wants this information can enable it * by a build change to remove the DEBUG_ONLY part below. */ DEBUG_ONLY(DSKREAD_TRACE(csa, GDS_ANY_ABS2REL(csa,cr), cr->tn, process_id, blk, cr->cycle);) if (!was_crit && !hold_onto_crit) rel_crit(gv_cur_region); /* read outside of crit may be of a stale block but should be detected by t_end or tp_tend */ assert(0 == cr->dirty); assert(cr->read_in_progress >= 0); CR_BUFFER_CHECK(gv_cur_region, csa, csd, cr); if (SS_NORMAL != (status = dsk_read(blk, GDS_REL2ABS(cr->buffaddr), &ondsk_blkver, lcl_blk_free))) { /* buffer does not contain valid data, so reset blk to be empty */ cr->cycle++; /* increment cycle for blk number changes (for tp_hist and others) */ cr->blk = CR_BLKEMPTY; cr->r_epid = 0; RELEASE_BUFF_READ_LOCK(cr); assert(-1 <= cr->read_in_progress); assert(was_crit == csa->now_crit); if (FUTURE_READ == status) { /* in cluster, block can be in the "future" with respect to the local history */ assert(TRUE == clustered); assert(FALSE == csa->now_crit); rdfail_detail = cdb_sc_future_read; /* t_retry forces the history up to date */ return (sm_uc_ptr_t)NULL; } if (ERR_DYNUPGRDFAIL == status) { /* if we dont hold crit on the region, it is possible due to concurrency conflicts * that this block is unused (i.e. marked free/recycled in bitmap, see comments in * gds_blk_upgrade.h). in this case we should not error out but instead restart. */ if (was_crit) { assert(FALSE); rts_error(VARLSTCNT(5) status, 3, blk, DB_LEN_STR(gv_cur_region)); } else { rdfail_detail = cdb_sc_lostcr; return (sm_uc_ptr_t)NULL; } } if (-1 == status) { /* could have been concurrent truncate, and we read a blk >= csa->ti->total_blks */ /* restart */ rdfail_detail = cdb_sc_truncate; return (sm_uc_ptr_t)NULL; } else rts_error(VARLSTCNT(5) ERR_DBFILERR, 2, DB_LEN_STR(gv_cur_region), status); } disk_blk_read = TRUE; assert(0 <= cr->read_in_progress); assert(0 == cr->dirty); /* Only set in cache if read was success */ cr->ondsk_blkver = (lcl_blk_free ? GDSVCURR : ondsk_blkver); cr->r_epid = 0; RELEASE_BUFF_READ_LOCK(cr); assert(-1 <= cr->read_in_progress); *cr_out = cr; assert(was_crit == csa->now_crit); if (reset_first_tp_srch_status) { /* keep the parantheses for the if (although single line) since the following is a macro */ RESET_FIRST_TP_SRCH_STATUS(first_tp_srch_status, cr, *cycle); } return (sm_uc_ptr_t)GDS_REL2ABS(cr->buffaddr); } else if (!was_crit && (BAD_LUCK_ABOUNDS > ocnt)) { assert(!hold_onto_crit); assert(TRUE == csa->now_crit); assert(cnl->in_crit == process_id); rel_crit(gv_cur_region); } }
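/* Illustrative sketch (not part of the original source): how the <cr, cycle> pair returned by t_qread
 * lets later validation (t_end/tp_hist) detect that a buffer was recycled for a different block.
 * Whenever a cache record is reassigned to a new block number its cycle is incremented, so a stale
 * history entry fails the compare below.  toy_cache_rec, toy_hist_ent and the two functions are
 * hypothetical and ignore all locking; they only model the cycle-based validation idea.
 */
#include <stdbool.h>

typedef struct
{
	int		blk;	/* block currently held in this buffer (stand-in for cr->blk) */
	unsigned int	cycle;	/* bumped every time blk changes (stand-in for cr->cycle) */
} toy_cache_rec;

typedef struct
{
	toy_cache_rec	*cr;	/* cache record the block was read from */
	unsigned int	cycle;	/* cr->cycle captured at read time */
	int		blk;	/* block number that was read */
} toy_hist_ent;

/* Reassign a buffer to a new block: the cycle must be bumped so old histories are invalidated. */
static void toy_recycle(toy_cache_rec *cr, int new_blk)
{
	cr->cycle++;
	cr->blk = new_blk;
}

/* Validation analogous to the first_tp_srch_status->cycle == cr->cycle checks above:
 * true means the buffer still holds the same block it held when the history entry was made. */
static bool toy_hist_still_valid(const toy_hist_ent *hist)
{
	return (hist->cycle == hist->cr->cycle) && (hist->blk == hist->cr->blk);
}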
sm_uc_ptr_t t_qread(block_id blk, sm_int_ptr_t cycle, cache_rec_ptr_ptr_t cr_out) /* cycle is used in t_end to detect if the buffer has been refreshed since the t_qread */ { uint4 status, duint4, blocking_pid; cache_rec_ptr_t cr; bt_rec_ptr_t bt; bool clustered, was_crit; int dummy, lcnt, ocnt; cw_set_element *cse; off_chain chain1; register sgmnt_addrs *csa; register sgmnt_data_ptr_t csd; int4 dummy_errno; boolean_t already_built, is_mm, reset_first_tp_srch_status, set_wc_blocked; error_def(ERR_DBFILERR); error_def(ERR_BUFOWNERSTUCK); first_tp_srch_status = NULL; reset_first_tp_srch_status = FALSE; csa = cs_addrs; csd = csa->hdr; INCR_DB_CSH_COUNTER(csa, n_t_qreads, 1); is_mm = (dba_mm == csd->acc_meth); assert((t_tries < CDB_STAGNATE) || csa->now_crit); if (0 < dollar_tlevel) { assert(sgm_info_ptr); if (0 != sgm_info_ptr->cw_set_depth) { chain1 = *(off_chain *)&blk; if (1 == chain1.flag) { assert(sgm_info_ptr->cw_set_depth); if ((int)chain1.cw_index < sgm_info_ptr->cw_set_depth) tp_get_cw(sgm_info_ptr->first_cw_set, (int)chain1.cw_index, &cse); else { assert(FALSE == csa->now_crit); rdfail_detail = cdb_sc_blknumerr; return (sm_uc_ptr_t)NULL; } } else { first_tp_srch_status = (srch_blk_status *)lookup_hashtab_ent(sgm_info_ptr->blks_in_use, (void *)blk, &duint4); ASSERT_IS_WITHIN_TP_HIST_ARRAY_BOUNDS(first_tp_srch_status, sgm_info_ptr); cse = first_tp_srch_status ? first_tp_srch_status->ptr : NULL; } assert(!cse || !cse->high_tlevel); if (cse) { /* transaction has modified the sought after block */ assert(gds_t_writemap != cse->mode); if (FALSE == cse->done) { /* out of date, so make it current */ already_built = (NULL != cse->new_buff); gvcst_blk_build(cse, (uchar_ptr_t)cse->new_buff, 0); assert(cse->blk_target); if (!already_built && !chain1.flag) { assert(first_tp_srch_status && (is_mm || first_tp_srch_status->cr) && first_tp_srch_status->buffaddr); if (first_tp_srch_status->tn <= ((blk_hdr_ptr_t)(first_tp_srch_status->buffaddr))->tn) { assert(CDB_STAGNATE > t_tries); rdfail_detail = cdb_sc_blkmod; /* should this be something else */ TP_TRACE_HIST_MOD(blk, gv_target, tp_blkmod_t_qread, cs_data, first_tp_srch_status->tn, ((blk_hdr_ptr_t)(first_tp_srch_status->buffaddr))->tn, ((blk_hdr_ptr_t)(first_tp_srch_status->buffaddr))->levl); return (sm_uc_ptr_t)NULL; } if ((!is_mm) && (first_tp_srch_status->cycle != first_tp_srch_status->cr->cycle || first_tp_srch_status->blk_num != first_tp_srch_status->cr->blk)) { assert(CDB_STAGNATE > t_tries); rdfail_detail = cdb_sc_lostcr; /* should this be something else */ return (sm_uc_ptr_t)NULL; } if (certify_all_blocks && FALSE == cert_blk(gv_cur_region, blk, (blk_hdr_ptr_t)cse->new_buff, cse->blk_target->root)) GTMASSERT; } cse->done = TRUE; } *cycle = CYCLE_PVT_COPY; *cr_out = 0; return (sm_uc_ptr_t)cse->new_buff; } assert(!chain1.flag); } else first_tp_srch_status = (srch_blk_status *)lookup_hashtab_ent(sgm_info_ptr->blks_in_use, (void *)blk, &duint4); ASSERT_IS_WITHIN_TP_HIST_ARRAY_BOUNDS(first_tp_srch_status, sgm_info_ptr); if (!is_mm && first_tp_srch_status) { assert(first_tp_srch_status->cr && !first_tp_srch_status->ptr); if (first_tp_srch_status->cycle == first_tp_srch_status->cr->cycle) { *cycle = first_tp_srch_status->cycle; *cr_out = first_tp_srch_status->cr; first_tp_srch_status->cr->refer = TRUE; if (CDB_STAGNATE <= t_tries) /* mu_reorg doesn't use TP else should have an || for that */ CWS_INSERT(blk); return (sm_uc_ptr_t)first_tp_srch_status->buffaddr; } else { /* Block was already part of the read-set of this transaction, but got 
recycled. Allow for * recycling. But update the first_tp_srch_status (for this blk) in the si->first_tp_hist * array to reflect the new buffer, cycle and cache-record. Since we know those only at the end of * t_qread, set a variable here that will enable the updation before returning from t_qread(). */ reset_first_tp_srch_status = TRUE; } } } if ((blk >= csa->ti->total_blks) || (blk < 0)) { /* requested block out of range; could occur because of a concurrency conflict */ if ((&FILE_INFO(gv_cur_region)->s_addrs != csa) || (csd != cs_data)) GTMASSERT; assert(FALSE == csa->now_crit); rdfail_detail = cdb_sc_blknumerr; return (sm_uc_ptr_t)NULL; } if (is_mm) { *cycle = CYCLE_SHRD_COPY; *cr_out = 0; return (sm_uc_ptr_t)(mm_read(blk)); } assert(dba_bg == csd->acc_meth); assert(!first_tp_srch_status || !first_tp_srch_status->cr || first_tp_srch_status->cycle != first_tp_srch_status->cr->cycle); if (FALSE == (clustered = csd->clustered)) bt = NULL; was_crit = csa->now_crit; ocnt = 0; set_wc_blocked = FALSE; /* to indicate whether csd->wc_blocked was set to TRUE by us */ do { if (NULL == (cr = db_csh_get(blk))) { /* not in memory */ if (clustered && (NULL != (bt = bt_get(blk))) && (FALSE == bt->flushing)) bt = NULL; if (FALSE == csa->now_crit) { if (NULL != bt) { /* at this point, bt is not NULL only if clustered and flushing - wait no crit */ assert(clustered); wait_for_block_flush(bt, blk); /* try for no other node currently writing the block */ } if (csd->flush_trigger <= csa->nl->wcs_active_lvl && FALSE == gv_cur_region->read_only) JNL_ENSURE_OPEN_WCS_WTSTART(csa, gv_cur_region, 0, dummy_errno); /* a macro that dclast's wcs_wtstart() and checks for errors etc. */ grab_crit(gv_cur_region); cr = db_csh_get(blk); /* in case blk arrived before crit */ } if (clustered && (NULL != (bt = bt_get(blk))) && (TRUE == bt->flushing)) { /* Once crit, need to assure that if clustered, that flushing is [still] complete * If it isn't, we missed an entire WM cycle and have to wait for another node to finish */ wait_for_block_flush(bt, blk); /* ensure no other node currently writing the block */ } if (NULL == cr) { /* really not in memory - must get a new buffer */ assert(csa->now_crit); cr = db_csh_getn(blk); if (CR_NOTVALID == (sm_long_t)cr) { SET_TRACEABLE_VAR(cs_data->wc_blocked, TRUE); BG_TRACE_PRO_ANY(csa, wc_blocked_t_qread_db_csh_getn_invalid_blk); set_wc_blocked = TRUE; break; } assert(0 <= cr->read_in_progress); *cycle = cr->cycle; cr->tn = csa->ti->curr_tn; if (FALSE == was_crit) rel_crit(gv_cur_region); /* read outside of crit may be of a stale block but should be detected by t_end or tp_tend */ assert(0 == cr->dirty); assert(cr->read_in_progress >= 0); INCR_DB_CSH_COUNTER(csa, n_dsk_reads, 1); if (SS_NORMAL != (status = dsk_read(blk, GDS_REL2ABS(cr->buffaddr)))) { RELEASE_BUFF_READ_LOCK(cr); assert(was_crit == csa->now_crit); if (FUTURE_READ == status) { /* in cluster, block can be in the "future" with respect to the local history */ assert(TRUE == clustered); assert(FALSE == csa->now_crit); rdfail_detail = cdb_sc_future_read; /* t_retry forces the history up to date */ return (sm_uc_ptr_t)NULL; } rts_error(VARLSTCNT(5) ERR_DBFILERR, 2, DB_LEN_STR(gv_cur_region), status); } assert(0 <= cr->read_in_progress); assert(0 == cr->dirty); cr->r_epid = 0; RELEASE_BUFF_READ_LOCK(cr); assert(-1 <= cr->read_in_progress); *cr_out = cr; assert(was_crit == csa->now_crit); if (reset_first_tp_srch_status) { /* keep the parantheses for the if (although single line) since the following is a macro */ 
RESET_FIRST_TP_SRCH_STATUS(first_tp_srch_status, cr, *cycle); } return (sm_uc_ptr_t)GDS_REL2ABS(cr->buffaddr); } else if ((FALSE == was_crit) && (BAD_LUCK_ABOUNDS > ocnt)) { assert(TRUE == csa->now_crit); assert(csa->nl->in_crit == process_id); rel_crit(gv_cur_region); } } if (CR_NOTVALID == (sm_long_t)cr) { SET_TRACEABLE_VAR(cs_data->wc_blocked, TRUE); BG_TRACE_PRO_ANY(csa, wc_blocked_t_qread_db_csh_get_invalid_blk); set_wc_blocked = TRUE; break; } for (lcnt = 1; ; lcnt++) { if (0 > cr->read_in_progress) { /* it's not being read */ if (clustered && (0 == cr->bt_index) && (cr->tn < ((th_rec *)((uchar_ptr_t)csa->th_base + csa->th_base->tnque.fl))->tn)) { /* can't rely on the buffer */ cr->cycle++; /* increment cycle whenever blk number changes (tp_hist depends on this) */ cr->blk = CR_BLKEMPTY; break; } *cycle = cr->cycle; *cr_out = cr; VMS_ONLY( /* If we were doing the db_csh_get() above (in t_qread itself) and located the cache-record * which, before coming here and taking a copy of cr->cycle a few lines above, was made an * older twin by another process in bg_update (note this can happen in VMS only) which has * already incremented the cycle, we will end up having a copy of the old cache-record with * its incremented cycle number and hence will succeed in tp_hist validation if we return * this <cr,cycle> combination although we don't want to since this "cr" is not current for * the given block as of now. Note that the "indexmod" optimization in tp_tend() relies on * an accurate intermediate validation by tp_hist() which in turn relies on the <cr,cycle> * value returned by t_qread() to be accurate for a given blk at the current point in time. * We detect the older-twin case by the following check. Note that here we depend on the * the fact that bg_update() sets cr->bt_index to 0 before incrementing cr->cycle. * Given that order, cr->bt_index can be guaranteed to be 0 if we read the incremented cycle */ if (cr->twin && (0 == cr->bt_index)) break; ) if (cr->blk != blk) break; if (was_crit != csa->now_crit) rel_crit(gv_cur_region); assert(was_crit == csa->now_crit); if (reset_first_tp_srch_status) { /* keep the parantheses for the if (although single line) since the following is a macro */ RESET_FIRST_TP_SRCH_STATUS(first_tp_srch_status, cr, *cycle); } /* Note that at this point we expect t_qread() to return a <cr,cycle> combination that * corresponds to "blk" passed in. It is crucial to get an accurate value for both the fields * since tp_hist() relies on this for its intermediate validation. 
*/ return (sm_uc_ptr_t)GDS_ANY_REL2ABS(csa, cr->buffaddr); } if (blk != cr->blk) break; if (lcnt >= BUF_OWNER_STUCK && (0 == (lcnt % BUF_OWNER_STUCK))) { if (FALSE == csa->now_crit) grab_crit(gv_cur_region); if (cr->read_in_progress < -1) { /* outside of design; clear to known state */ BG_TRACE_PRO(t_qread_out_of_design); INTERLOCK_INIT(cr); assert(0 == cr->r_epid); cr->r_epid = 0; } else if (cr->read_in_progress >= 0) { BG_TRACE_PRO(t_qread_buf_owner_stuck); if (0 != (blocking_pid = cr->r_epid)) { if (FALSE == is_proc_alive(blocking_pid, cr->image_count)) { /* process gone: release that process's lock */ assert(0 == cr->bt_index); if (cr->bt_index) { SET_TRACEABLE_VAR(csd->wc_blocked, TRUE); BG_TRACE_PRO_ANY(csa, wc_blocked_t_qread_bad_bt_index1); set_wc_blocked = TRUE; break; } cr->cycle++; /* increment cycle for blk number changes (for tp_hist) */ cr->blk = CR_BLKEMPTY; RELEASE_BUFF_READ_LOCK(cr); } else { rel_crit(gv_cur_region); send_msg(VARLSTCNT(4) ERR_DBFILERR, 2, DB_LEN_STR(gv_cur_region)); send_msg(VARLSTCNT(9) ERR_BUFOWNERSTUCK, 7, process_id, blocking_pid, cr->blk, cr->blk, (lcnt / BUF_OWNER_STUCK), cr->read_in_progress, cr->rip_latch.latch_pid); if ((4 * BUF_OWNER_STUCK) <= lcnt) GTMASSERT; /* Kickstart the process taking a long time in case it was suspended */ UNIX_ONLY(continue_proc(blocking_pid)); } } else { /* process stopped before could set r_epid */ assert(0 == cr->bt_index); if (cr->bt_index) { SET_TRACEABLE_VAR(csd->wc_blocked, TRUE); BG_TRACE_PRO_ANY(csa, wc_blocked_t_qread_bad_bt_index2); set_wc_blocked = TRUE; break; } cr->cycle++; /* increment cycle for blk number changes (for tp_hist) */ cr->blk = CR_BLKEMPTY; RELEASE_BUFF_READ_LOCK(cr); if (cr->read_in_progress < -1) /* race: process released since if r_epid */ LOCK_BUFF_FOR_READ(cr, dummy); } } if (was_crit != csa->now_crit) rel_crit(gv_cur_region); } else wcs_sleep(lcnt); } if (set_wc_blocked) /* cannot use csd->wc_blocked here as we might not necessarily have crit */ break; ocnt++; if (BAD_LUCK_ABOUNDS <= ocnt) { if (BAD_LUCK_ABOUNDS < ocnt || csa->now_crit) { rel_crit(gv_cur_region); GTMASSERT; } if (FALSE == csa->now_crit) grab_crit(gv_cur_region); } } while (TRUE);
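/* Illustrative sketch (not part of the original source): the shape of the "buffer owner stuck" wait
 * loop above -- poll while another process has the block read in progress, and only after a bounded
 * number of sleeps escalate (is the reader still alive? should we give up?).  toy_wait_for_reader,
 * toy_is_alive, toy_sleep, TOY_OWNER_STUCK and TOY_MAX_ROUNDS are hypothetical; the real code
 * additionally deals with crit, interlocks, bt_index checks and wc_blocked recovery.
 */
#include <stdbool.h>
#include <errno.h>
#include <signal.h>
#include <unistd.h>

#define TOY_OWNER_STUCK		64	/* polls before escalating (stands in for BUF_OWNER_STUCK) */
#define TOY_MAX_ROUNDS		4	/* give up after this many escalations (real code GTMASSERTs) */

static bool toy_is_alive(pid_t pid)
{	/* POSIX liveness probe: signal 0 performs error checking only (cf. is_proc_alive) */
	return (0 == kill(pid, 0)) || (ESRCH != errno);
}

static void toy_sleep(int lcnt)
{	/* crude stand-in for wcs_sleep's scaled backoff */
	usleep(1000 * (unsigned int)((lcnt < 50) ? lcnt : 50));
}

/* Returns true if the read completed (or the dead reader's lock was reclaimed), false if we gave up. */
static bool toy_wait_for_reader(volatile int *read_in_progress, volatile pid_t *reader_pid)
{
	int	lcnt;
	pid_t	pid;

	for (lcnt = 1; ; lcnt++)
	{
		if (0 > *read_in_progress)
			return true;			/* nobody is reading the buffer any more */
		if (0 == (lcnt % TOY_OWNER_STUCK))
		{	/* escalation point: decide whether the reader is stuck or dead */
			pid = *reader_pid;
			if ((0 != pid) && !toy_is_alive(pid))
			{	/* reader died mid-read: reclaim the buffer (real code resets the rip lock and cr) */
				*read_in_progress = -1;
				*reader_pid = 0;
				return true;
			}
			if ((TOY_MAX_ROUNDS * TOY_OWNER_STUCK) <= lcnt)
				return false;		/* real code asserts / forces a cache rebuild here */
		}
		toy_sleep(lcnt);			/* backoff grows with the wait, as in wcs_sleep(lcnt) */
	}
}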
enum cdb_sc gvcst_search(gv_key *pKey,		/* Key to search for */
			 srch_hist *pHist)	/* History to fill in */
{
	unsigned char		nLevl;
	enum cdb_sc		status;
	register int		n1;
	register uchar_ptr_t	c1, c2;
	register sm_uc_ptr_t	pRec, pBlkBase;
	register gv_namehead	*pTarg;		/* Local copy of gv_target; hope it gets put into register */
	register srch_blk_status *pCurr;
	register srch_blk_status *pNonStar;
	register srch_hist	*pTargHist;
	int			tmp_cmpc;
	block_id		nBlkId;
	cache_rec_ptr_t		cr;
	int			cycle;
	unsigned short		n0, nKeyLen;
	trans_num		tn;
	cw_set_element		*cse;
	off_chain		chain1, chain2;
	srch_blk_status		*tp_srch_status, *srch_status, *leaf_blk_hist;
	boolean_t		already_built, is_mm;
	ht_ent_int4		*tabent;
	sm_uc_ptr_t		buffaddr;
	trans_num		blkhdrtn, oldest_hist_tn;
	int			hist_size;
#	ifdef DEBUG
	boolean_t		save_donot_commit;
#	endif
	DCL_THREADGBL_ACCESS;

	SETUP_THREADGBL_ACCESS;
	pTarg = gv_target;
	assert(NULL != pTarg);
	assert(pTarg->root);
	assert(pKey != &pTarg->clue);
	nKeyLen = pKey->end + 1;
	assert(!dollar_tlevel || ((NULL != sgm_info_ptr) && (cs_addrs->sgm_info_ptr == sgm_info_ptr)));
	SET_GVCST_SEARCH_CLUE(0);
	INCR_DB_CSH_COUNTER(cs_addrs, n_gvcst_srches, 1);
	pTargHist = ((NULL == pHist) ? &pTarg->hist : pHist);
	/* If FINAL RETRY and TP, then we can safely use clues of gv_targets that have been referenced in this
	 * TP transaction (read_local_tn == local_tn). While that is guaranteed to be true for all updates, it
	 * does not hold good for READs since we allow a lot more reads to be done inside a transaction compared
	 * to the # of updates allowed. We allow the same global to be read multiple times inside the same transaction
	 * using different global buffers for each read. This means that we need to validate any clue from the first
	 * read before using it for the second read even if it is in the final retry. This validation is done inside
	 * the below IF block. As for gv_targets which are referenced for the very first time in this TP transaction,
	 * we have no easy way of determining if their clues are still up to date (i.e. using the clue will guarantee us
	 * no restart) and, since we are in the final retry, we don't want to take a risk. So don't use the clue in that case.
	 *
	 * If FINAL RETRY and Non-TP, we will be dealing with only ONE gv_target, so its clue would have been reset as
	 * part of the penultimate restart and we don't have any of the above issues in the non-TP case. The only exception
	 * is if we are in gvcst_kill, in which case gvcst_search will be called twice and the clue could be non-zero
	 * for the second invocation. In this case, the clue is guaranteed to be up to date since it was set just now
	 * as part of the first invocation. So no need to do anything about the clue in the final retry for Non-TP.
	 */
	if ((0 != pTarg->clue.end)
	    && ((CDB_STAGNATE > t_tries) || !dollar_tlevel || (pTarg->read_local_tn == local_tn)))
	{	/* Have a non-zero clue. Check if it is usable for the current search key. If so, validate the clue and then use it. */
		/* In t_end, we skipped validating the clue in case of reorg due to the assumption that reorg never uses the clue,
		 * i.e. it nullifies the clue before calling gvcst_search. However, it doesn't reset the clue for the directory tree
		 * and so it continues using the clue if called for root search. Assert accordingly.
		 */
		assert(!mu_reorg_process UNIX_ONLY(|| (pTarg->gd_csa->dir_tree == pTarg)));
		INCR_DB_CSH_COUNTER(cs_addrs, n_gvcst_srch_clues, 1);
		status = cdb_sc_normal;	/* clue is usable unless proved otherwise */
		DEBUG_ONLY(save_donot_commit = TREF(donot_commit);)
		if (NULL != pHist)
sm_uc_ptr_t t_qread(block_id blk, sm_int_ptr_t cycle, cache_rec_ptr_ptr_t cr_out) /* cycle is used in t_end to detect if the buffer has been refreshed since the t_qread */ { int4 status; uint4 blocking_pid; cache_rec_ptr_t cr; bt_rec_ptr_t bt; boolean_t clustered, hold_onto_crit, was_crit, issued_db_init_crypt_warning, sync_needed; int dummy, lcnt, ocnt; cw_set_element *cse; off_chain chain1; register sgmnt_addrs *csa; register sgmnt_data_ptr_t csd; enum db_ver ondsk_blkver; int4 dummy_errno, gtmcrypt_errno; boolean_t already_built, is_mm, reset_first_tp_srch_status, set_wc_blocked, sleep_invoked; ht_ent_int4 *tabent; srch_blk_status *blkhist; trans_num dirty, blkhdrtn; sm_uc_ptr_t buffaddr; uint4 stuck_cnt = 0; boolean_t lcl_blk_free; node_local_ptr_t cnl; gd_segment *seg; uint4 buffs_per_flush, flush_target; enc_info_t *encr_ptr; DCL_THREADGBL_ACCESS; SETUP_THREADGBL_ACCESS; lcl_blk_free = block_is_free; block_is_free = FALSE; /* Reset to FALSE so that if t_qread fails below, we don't have an incorrect state of this var */ first_tp_srch_status = NULL; reset_first_tp_srch_status = FALSE; csa = cs_addrs; csd = csa->hdr; INCR_DB_CSH_COUNTER(csa, n_t_qreads, 1); is_mm = (dba_mm == csd->acc_meth); /* We better hold crit in the final retry (TP & non-TP). Only exception is journal recovery */ assert((t_tries < CDB_STAGNATE) || csa->now_crit || mupip_jnl_recover); if (dollar_tlevel) { assert(sgm_info_ptr); if (0 != sgm_info_ptr->cw_set_depth) { chain1 = *(off_chain *)&blk; if (1 == chain1.flag) { assert(sgm_info_ptr->cw_set_depth); if ((int)chain1.cw_index < sgm_info_ptr->cw_set_depth) tp_get_cw(sgm_info_ptr->first_cw_set, (int)chain1.cw_index, &cse); else { assert(FALSE == csa->now_crit); rdfail_detail = cdb_sc_blknumerr; return (sm_uc_ptr_t)NULL; } } else { if (NULL != (tabent = lookup_hashtab_int4(sgm_info_ptr->blks_in_use, (uint4 *)&blk))) first_tp_srch_status = tabent->value; else first_tp_srch_status = NULL; ASSERT_IS_WITHIN_TP_HIST_ARRAY_BOUNDS(first_tp_srch_status, sgm_info_ptr); cse = first_tp_srch_status ? first_tp_srch_status->cse : NULL; } assert(!cse || !cse->high_tlevel); assert(!chain1.flag || cse); if (cse) { /* transaction has modified the sought after block */ if ((gds_t_committed != cse->mode) || (n_gds_t_op < cse->old_mode)) { /* Changes have not been committed to shared memory, i.e. still in private memory. * Build block in private buffer if not already done and return the same. */ assert(gds_t_writemap != cse->mode); if (FALSE == cse->done) { /* out of date, so make it current */ assert(gds_t_committed != cse->mode); already_built = (NULL != cse->new_buff); /* Validate the block's search history right after building a private copy. * This is not needed in case gvcst_search is going to reuse the clue's search * history and return (because tp_hist will do the validation of this block). * But if gvcst_search decides to do a fresh traversal (because the clue does not * cover the path of the current input key etc.) the block build that happened now * will not get validated in tp_hist since it will instead be given the current * key's search history path (a totally new path) for validation. Since a private * copy of the block has been built, tp_tend would also skip validating this block * so it is necessary that we validate the block right here. Since it is tricky to * accurately differentiate between the two cases, we do the validation * unconditionally here (besides it is only a few if checks done per block build * so it is considered okay performance-wise). 
*/ gvcst_blk_build(cse, (uchar_ptr_t)cse->new_buff, 0); assert(NULL != cse->blk_target); if (!already_built && !chain1.flag) { buffaddr = first_tp_srch_status->buffaddr; cr = first_tp_srch_status->cr; assert((is_mm || cr) && buffaddr); blkhdrtn = ((blk_hdr_ptr_t)buffaddr)->tn; if (TP_IS_CDB_SC_BLKMOD3(cr, first_tp_srch_status, blkhdrtn)) { assert(CDB_STAGNATE > t_tries); rdfail_detail = cdb_sc_blkmod; /* should this be something else */ TP_TRACE_HIST_MOD(blk, gv_target, tp_blkmod_t_qread, cs_data, first_tp_srch_status->tn, blkhdrtn, ((blk_hdr_ptr_t)buffaddr)->levl); return (sm_uc_ptr_t)NULL; } if (!is_mm && ((first_tp_srch_status->cycle != cr->cycle) || (first_tp_srch_status->blk_num != cr->blk))) { assert(CDB_STAGNATE > t_tries); rdfail_detail = cdb_sc_lostcr; /* should this be something else */ return (sm_uc_ptr_t)NULL; } } cse->done = TRUE; } *cycle = CYCLE_PVT_COPY; *cr_out = 0; return (sm_uc_ptr_t)cse->new_buff; } else { /* Block changes are already committed to shared memory (possible if we are in TP * in the 2nd phase of M-Kill in gvcst_expand_free_subtree.c). In this case, read * block from shared memory; do not look at private memory (i.e. cse) as that might * not be as uptodate as shared memory. */ assert(csa->now_crit); /* gvcst_expand_free_subtree does t_qread in crit */ /* If this block was newly created as part of the TP transaction, it should not be killed * as part of the 2nd phase of M-kill. This is because otherwise the block's cse would * have had an old_mode of kill_t_create in which case we would not have come into this * else block. Assert accordingly. */ assert(!chain1.flag); first_tp_srch_status = NULL; /* do not use any previous srch_hist information */ } } } else { if (NULL != (tabent = lookup_hashtab_int4(sgm_info_ptr->blks_in_use, (uint4 *)&blk))) first_tp_srch_status = tabent->value; else first_tp_srch_status = NULL; } ASSERT_IS_WITHIN_TP_HIST_ARRAY_BOUNDS(first_tp_srch_status, sgm_info_ptr); if (!is_mm && first_tp_srch_status) { cr = first_tp_srch_status->cr; assert(cr && !first_tp_srch_status->cse); if (first_tp_srch_status->cycle == cr->cycle) { *cycle = first_tp_srch_status->cycle; *cr_out = cr; cr->refer = TRUE; if (CDB_STAGNATE <= t_tries) /* mu_reorg doesn't use TP else should have an || for that */ CWS_INSERT(blk); return (sm_uc_ptr_t)first_tp_srch_status->buffaddr; } else { /* Block was already part of the read-set of this transaction, but got recycled in the cache. * Allow block recycling by resetting first_tp_srch_status for this blk to reflect the new * buffer, cycle and cache-record. tp_hist (invoked much later) has validation checks to detect * if block recycling happened within the same mini-action and restart in that case. * Updating first_tp_srch_status has to wait until the end of t_qread since only then do we know * the values to update to. Set a variable that will enable the updation before returning. * Also assert that if we are in the final retry, we are never in a situation where we have a * block that got recycled since the start of the current mini-action. This is easily detected since * as part of the final retry we maintain a hash-table "cw_stagnate" that holds the blocks that * have been read as part of the current mini-action until now. */ assert(CDB_STAGNATE > t_tries || (NULL == lookup_hashtab_int4(&cw_stagnate, (uint4 *)&blk))); reset_first_tp_srch_status = TRUE; } } } if ((uint4)blk >= (uint4)csa->ti->total_blks) { /* Requested block out of range; could occur because of a concurrency conflict. 
mm_read and dsk_read assume blk is * never negative or greater than the maximum possible file size. If a concurrent REORG truncates the file, t_qread * can proceed despite blk being greater than total_blks. But dsk_read handles this fine; see comments below. */ assert((&FILE_INFO(gv_cur_region)->s_addrs == csa) && (csd == cs_data)); assert(!csa->now_crit); rdfail_detail = cdb_sc_blknumerr; return (sm_uc_ptr_t)NULL; } if (is_mm) { *cycle = CYCLE_SHRD_COPY; *cr_out = 0; return (sm_uc_ptr_t)(mm_read(blk)); } was_crit = csa->now_crit; cnl = csa->nl; encr_ptr = csa->encr_ptr; if (NULL != encr_ptr) { /* If this is an encrypted database and we hold crit, make sure our private cycle matches the shared cycle. * Or else we would need to call "process_reorg_encrypt_restart" below (a heavyweight operation) holding crit. */ assert(!was_crit || (cnl->reorg_encrypt_cycle == encr_ptr->reorg_encrypt_cycle)); seg = gv_cur_region->dyn.addr; issued_db_init_crypt_warning = encr_ptr->issued_db_init_crypt_warning; if (!IS_BITMAP_BLK(blk) && issued_db_init_crypt_warning) { /* A non-GT.M process is attempting to read a non-bitmap block, yet it has previously encountered an error * during db_init (because it did not have access to the encryption keys) and reported it with a -W- * severity. Since the block it is attempting to read can be in the unencrypted shared memory (read from * disk by another process with access to the encryption keys), we cannot let it access it without a valid * handle, so issue an rts_error. * * TODO: DSE and LKE could bypass getting the ftok semaphore. LKE is not an issue, but DSE does care about * the csa->reorg_encrypt_cycle. So it means DSE could get an inconsistent copy of reorg_encrypt_cycle * and associated hashes if it had done a bypass and a concurrent REORG -ENCRYPT is holding the ftok * semaphore and changing these values at the same time. */ assert(!IS_GTM_IMAGE); /* GT.M would have error'ed out in db_init */ gtmcrypt_errno = SET_REPEAT_MSG_MASK(SET_CRYPTERR_MASK(ERR_CRYPTBADCONFIG)); GTMCRYPT_REPORT_ERROR(gtmcrypt_errno, rts_error, seg->fname_len, seg->fname); } else if (cnl->reorg_encrypt_cycle != encr_ptr->reorg_encrypt_cycle) { /* A concurrent MUPIP REORG ENCRYPT occurred. Cannot proceed with the read even if the block is * already loaded from disk into the unencrypted global buffers (security issue). Need to load the * new encryption keys and only let those processes which are able to successfully do this proceed * with the read. First, copy the key hashes from csd into csa->encr_ptr. That needs crit * to ensure a concurrent MUPIP REORG ENCRYPT does not sneak in. * * Note: Even though we asserted a few lines above that if "was_crit" is TRUE, then we expect * the encryption cycles to be in sync, we handle this out-of-design situation in "pro" by fixing * the cycles while holding crit (hopefully rare case so it is okay to hold crit for a heavyweight call). */ if (!was_crit) grab_crit(gv_cur_region); /* Now that we have crit, sync them up by copying the new keys inside crit and opening the key handles * outside crit (a potentially long running operation). */ SIGNAL_REORG_ENCRYPT_RESTART(mu_reorg_encrypt_in_prog, reorg_encrypt_restart_csa, cnl, csa, csd, rdfail_detail, process_id); assert(csa == reorg_encrypt_restart_csa); if (!was_crit) rel_crit(gv_cur_region); /* If we are inside a TP read-write transaction, it is possible we already used the old keys for * prior calls to "jnl_format" so we have to restart (cannot sync up cycles). 
Do the same for * TP read-only transaction as well as NON-TP read-write transaction. In all these cases we know * the caller is capable of restarting. All other cases we dont know if the caller is capable so * sync up the cycles and proceed using the new keys for the read. * * But since it is possible the caller does not call t_retry right away (e.g. mupip reorg which can * choose to abandone this tree path and move on to another block without aborting this transaction) * it is better we finish the pending call to "process_reorg_encrypt_restart" right here before returning. */ process_reorg_encrypt_restart(); assert(NULL == reorg_encrypt_restart_csa); if (IS_NOT_SAFE_TO_SYNC_NEW_KEYS(dollar_tlevel, update_trans)) { assert(cdb_sc_reorg_encrypt == rdfail_detail); /* set by SIGNAL_REORG_ENCRYPT_RESTART macro */ return (sm_uc_ptr_t)NULL; } } } assert(dba_bg == csd->acc_meth); assert(!first_tp_srch_status || !first_tp_srch_status->cr || first_tp_srch_status->cycle != first_tp_srch_status->cr->cycle); if (FALSE == (clustered = csd->clustered)) bt = NULL; ocnt = 0; set_wc_blocked = FALSE; /* to indicate whether cnl->wc_blocked was set to TRUE by us */ hold_onto_crit = csa->hold_onto_crit; /* note down in local to avoid csa-> dereference in multiple usages below */ do { if (NULL == (cr = db_csh_get(blk))) { /* not in memory */ if (clustered && (NULL != (bt = bt_get(blk))) && (FALSE == bt->flushing)) bt = NULL; if (!csa->now_crit) { assert(!hold_onto_crit); if (NULL != bt) { /* at this point, bt is not NULL only if clustered and flushing - wait no crit */ assert(clustered); wait_for_block_flush(bt, blk); /* try for no other node currently writing the block */ } /* assume defaults for flush_target and buffs_per_flush */ flush_target = csd->flush_trigger; buffs_per_flush = 0; if ((0 != csd->epoch_taper) && (FALSE == gv_cur_region->read_only) && JNL_ENABLED(csd) && (0 != cnl->wcs_active_lvl) && (NOJNL != csa->jnl->channel) && (0 != cnl->jnl_file.u.inode) && csd->jnl_before_image) { EPOCH_TAPER_IF_NEEDED(csa, csd, cnl, (gd_region *) 0, FALSE, buffs_per_flush, flush_target); } if ((flush_target <= cnl->wcs_active_lvl) && (FALSE == gv_cur_region->read_only)) JNL_ENSURE_OPEN_WCS_WTSTART(csa, gv_cur_region, buffs_per_flush, dummy_errno); /* a macro that dclast's "wcs_wtstart" and checks for errors etc. */ /* Get crit but also ensure encryption cycles are in sync ("dsk_read" relies on this). * Note: "sync_needed" should be TRUE very rarely since we synced the cycles just a few lines * above. But in case a MUPIP REORG ENCRYPT concurrently sneaked in between these lines we * need to resync. 
*/ sync_needed = grab_crit_encr_cycle_sync(gv_cur_region); assert(NULL == reorg_encrypt_restart_csa); assert(!sync_needed || (NULL != encr_ptr)); if (sync_needed && IS_NOT_SAFE_TO_SYNC_NEW_KEYS(dollar_tlevel, update_trans)) { assert(cnl->reorg_encrypt_cycle == encr_ptr->reorg_encrypt_cycle); rel_crit(gv_cur_region); rdfail_detail = cdb_sc_reorg_encrypt; /* set by SIGNAL_REORG_ENCRYPT_RESTART macro */ return (sm_uc_ptr_t)NULL; } cr = db_csh_get(blk); /* in case blk arrived before crit */ } if (clustered && (NULL != (bt = bt_get(blk))) && (TRUE == bt->flushing)) { /* Once crit, need to assure that if clustered, that flushing is [still] complete * If it isn't, we missed an entire WM cycle and have to wait for another node to finish */ wait_for_block_flush(bt, blk); /* ensure no other node currently writing the block */ } if (NULL == cr) { /* really not in memory - must get a new buffer */ assert(csa->now_crit); cr = db_csh_getn(blk); if (CR_NOTVALID == (sm_long_t)cr) { assert(cnl->wc_blocked); /* only reason we currently know wcs_get_space could fail */ assert(gtm_white_box_test_case_enabled); SET_TRACEABLE_VAR(cnl->wc_blocked, TRUE); BG_TRACE_PRO_ANY(csa, wc_blocked_t_qread_db_csh_getn_invalid_blk); set_wc_blocked = TRUE; break; } assert(0 <= cr->read_in_progress); *cycle = cr->cycle; cr->tn = csd->trans_hist.curr_tn; /* Record history of most recent disk reads only in dbg builds for now. Although the macro * is just a couple dozen instructions, it is done while holding crit so we want to avoid * delaying crit unless really necessary. Whoever wants this information can enable it * by a build change to remove the DEBUG_ONLY part below. */ DEBUG_ONLY(DSKREAD_TRACE(csa, GDS_ANY_ABS2REL(csa,cr), cr->tn, process_id, blk, cr->cycle);) if (!was_crit && !hold_onto_crit) rel_crit(gv_cur_region); /* read outside of crit may be of a stale block but should be detected by t_end or tp_tend */ assert(0 == cr->dirty); assert(cr->read_in_progress >= 0); CR_BUFFER_CHECK(gv_cur_region, csa, csd, cr); buffaddr = (sm_uc_ptr_t)GDS_REL2ABS(cr->buffaddr); # ifdef DEBUG /* stop self to test sechshr_db_clnup clears the read state */ if (gtm_white_box_test_case_enabled && (WBTEST_SIGTSTP_IN_T_QREAD == gtm_white_box_test_case_number)) { /* this should never fail, but because of the way we developed the test we got paranoid */ dummy = kill(process_id, SIGTERM); assert(0 == dummy); for (dummy = 10; dummy; dummy--) LONG_SLEEP(10); /* time for sigterm to take hit before we clear block_now_locked */ } # endif if (SS_NORMAL != (status = dsk_read(blk, buffaddr, &ondsk_blkver, lcl_blk_free))) { /* buffer does not contain valid data, so reset blk to be empty */ cr->cycle++; /* increment cycle for blk number changes (for tp_hist and others) */ cr->blk = CR_BLKEMPTY; cr->r_epid = 0; RELEASE_BUFF_READ_LOCK(cr); TREF(block_now_locked) = NULL; assert(-1 <= cr->read_in_progress); assert(was_crit == csa->now_crit); if (ERR_DYNUPGRDFAIL == status) { /* if we dont hold crit on the region, it is possible due to concurrency conflicts * that this block is unused (i.e. marked free/recycled in bitmap, see comments in * gds_blk_upgrade.h). in this case we should not error out but instead restart. */ if (was_crit) { assert(FALSE); rts_error_csa(CSA_ARG(csa) VARLSTCNT(5) status, 3, blk, DB_LEN_STR(gv_cur_region)); } else { rdfail_detail = cdb_sc_lostcr; return (sm_uc_ptr_t)NULL; } } if ((-1 == status) && !was_crit) { /* LSEEKREAD and, consequently, dsk_read return -1 in case pread is unable to fetch * a full database block's length of data. 
This can happen if the requested read is * past the end of the file, which can happen if a concurrent truncate occurred * after the blk >= csa->ti->total_blks comparison above. Allow for this scenario * by restarting. However, if we've had crit the whole time, no truncate could have * happened. -1 indicates a problem with the file, so fall through to DBFILERR. */ rdfail_detail = cdb_sc_truncate; return (sm_uc_ptr_t)NULL; } else if (IS_CRYPTERR_MASK(status)) { seg = gv_cur_region->dyn.addr; GTMCRYPT_REPORT_ERROR(status, rts_error, seg->fname_len, seg->fname); } else { /* A DBFILERR can be thrown for two possible reasons: * (1) LSEEKREAD returned an unexpected error due to a filesystem problem; or * (2) csa/cs_addrs/csd/cs_data are out of sync, and we're trying to read a block * number for one region from another region with fewer total_blks. * We suspect the former is what happened in GTM-7623. Apparently the latter * has been an issue before, too. If either occurs again in pro, this assertpro * distinguishes the two possibilities. */ assertpro((&FILE_INFO(gv_cur_region)->s_addrs == csa) && (csd == cs_data)); rts_error_csa(CSA_ARG(csa) VARLSTCNT(5) ERR_DBFILERR, 2, DB_LEN_STR(gv_cur_region), status); } } disk_blk_read = TRUE; assert(0 <= cr->read_in_progress); assert(0 == cr->dirty); /* Only set in cache if read was success */ cr->ondsk_blkver = (lcl_blk_free ? GDSVCURR : ondsk_blkver); cr->r_epid = 0; RELEASE_BUFF_READ_LOCK(cr); TREF(block_now_locked) = NULL; assert(-1 <= cr->read_in_progress); *cr_out = cr; assert(was_crit == csa->now_crit); if (reset_first_tp_srch_status) RESET_FIRST_TP_SRCH_STATUS(first_tp_srch_status, cr, *cycle); return buffaddr; } else if (!was_crit && (BAD_LUCK_ABOUNDS > ocnt)) { assert(!hold_onto_crit); assert(TRUE == csa->now_crit); assert(cnl->in_crit == process_id); rel_crit(gv_cur_region); } }
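/* Illustrative sketch (not part of the original source): the reorg_encrypt_cycle handshake above in
 * miniature.  A shared cycle counter is bumped whenever the database encryption keys are reorganized;
 * a reader whose private copy is behind must either resynchronize (reload keys and copy the new cycle
 * under the crit lock) or, if it has already done work under the old keys, give up and restart.  The
 * toy_* names and the pthread mutex standing in for region crit are hypothetical simplifications.
 */
#include <stdbool.h>
#include <pthread.h>

typedef struct
{
	pthread_mutex_t	crit;			/* stand-in for the region crit lock */
	unsigned int	reorg_encrypt_cycle;	/* bumped by (the analog of) MUPIP REORG -ENCRYPT */
} toy_shared;

typedef struct
{
	unsigned int	reorg_encrypt_cycle;	/* private copy, usable only while it matches shared */
} toy_private;

/* Returns true if the caller may go on to read with its (now current) keys,
 * false if it must restart because it already did work under the old keys. */
static bool toy_sync_encr_cycle(toy_shared *shr, toy_private *prv, bool did_work_with_old_keys)
{
	if (prv->reorg_encrypt_cycle == shr->reorg_encrypt_cycle)
		return true;			/* already in sync: nothing to do */
	if (did_work_with_old_keys)
		return false;			/* cf. IS_NOT_SAFE_TO_SYNC_NEW_KEYS: caller must restart */
	pthread_mutex_lock(&shr->crit);		/* copy the new cycle under the lock, like grab_crit */
	prv->reorg_encrypt_cycle = shr->reorg_encrypt_cycle;
	/* the real code also reloads the key handles here (cf. process_reorg_encrypt_restart) */
	pthread_mutex_unlock(&shr->crit);
	return true;
}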