/* bt_put: return the block-table (bt) record for "block" in region "reg", creating one if none exists.
 * On entry the caller must hold crit on the region (or the db is clustered - see assert below), and the
 * region must not be MM access method. On success the returned bt has bt->tn set to the region's current
 * transaction number and its th (transaction history) record is moved to the tail of the tn queue.
 * Returns NULL only if the oldest bt's dirty cache record could not be flushed (wcs_get_space failure),
 * in which case an ERR_WCBLOCKED message has been sent and the caller is expected to trigger cache recovery.
 */
bt_rec_ptr_t bt_put(gd_region *reg, int4 block)
{
	bt_rec_ptr_t		bt, q0, q1, hdr;
	sgmnt_addrs		*csa;
	sgmnt_data_ptr_t	csd;
	cache_rec_ptr_t		cr;
	th_rec_ptr_t		th;
	trans_num		lcl_tn;
	uint4			lcnt;

	csa = (sgmnt_addrs *)&FILE_INFO(reg)->s_addrs;
	csd = csa->hdr;
	assert(csa->now_crit || csd->clustered);
	assert(dba_mm != csa->hdr->acc_meth);	/* MM regions do not use the bt mechanism this way */
	lcl_tn = csa->ti->curr_tn;
	/* Hash the block number into one of bt_buckets queue headers and search that queue. All queue links
	 * (blkque.fl etc.) are relative offsets within shared memory, hence the sm_uc_ptr_t arithmetic.
	 */
	hdr = csa->bt_header + (block % csd->bt_buckets);
	assert(BT_QUEHEAD == hdr->blk);
	for (lcnt = 0, bt = (bt_rec_ptr_t)((sm_uc_ptr_t)hdr + hdr->blkque.fl);
		;
		bt = (bt_rec_ptr_t)((sm_uc_ptr_t)bt + bt->blkque.fl), lcnt++)
	{
		if (BT_QUEHEAD == bt->blk)
		{	/* there is no matching bt: we wrapped back to the queue header without finding "block".
			 * Recycle the oldest bt, i.e. the one at the head of the tn queue (th_base->tnque.fl),
			 * backing up by SIZEOF(th->tnque) to get from the embedded tnque field to the record start.
			 */
			assert(bt == hdr);
			bt = (bt_rec_ptr_t)((sm_uc_ptr_t)(csa->th_base) + csa->th_base->tnque.fl - SIZEOF(th->tnque));
			if (CR_NOTVALID != bt->cache_index)
			{	/* the oldest bt is still valid */
				assert(!in_wcs_recover);
				cr = (cache_rec_ptr_t)GDS_ANY_REL2ABS(csa, bt->cache_index);
				if (cr->dirty)
				{	/* get it written so it can be reused */
					BG_TRACE_PRO_ANY(csa, bt_put_flush_dirty);
					if (FALSE == wcs_get_space(reg, 0, cr))
					{
						assert(csa->nl->wc_blocked);	/* only reason we currently know
										 * why wcs_get_space could fail */
						assert(gtm_white_box_test_case_enabled);
						BG_TRACE_PRO_ANY(csa, wcb_bt_put);
						send_msg(VARLSTCNT(8) ERR_WCBLOCKED, 6, LEN_AND_LIT("wcb_bt_put"),
							process_id, &lcl_tn, DB_LEN_STR(reg));
						return NULL;
					}
				}
				/* sever the bt <-> cache-record association before reusing the bt */
				bt->cache_index = CR_NOTVALID;
				cr->bt_index = 0;
			}
			/* remove the recycled bt from whatever blk queue it was on, then re-insert it on the
			 * queue for the new block's hash bucket
			 */
			q0 = (bt_rec_ptr_t)((sm_uc_ptr_t)bt + bt->blkque.fl);
			q1 = (bt_rec_ptr_t)remqt((que_ent_ptr_t)q0);
			if (EMPTY_QUEUE == (sm_long_t)q1)
				rts_error(VARLSTCNT(3) ERR_BTFAIL, 1, 1);
			bt->blk = block;
			bt->killtn = lcl_tn;
			insqt((que_ent_ptr_t)bt, (que_ent_ptr_t)hdr);
			/* detach the oldest th record from the head of the tn queue; re-queued at the tail below */
			th = (th_rec_ptr_t)remqh((que_ent_ptr_t)csa->th_base);
			if (EMPTY_QUEUE == (sm_long_t)th)
				GTMASSERT;
			break;
		}
		if (bt->blk == block)
		{	/* bt_put should never be called twice for the same block with the same lcl_tn. This is because
			 * t_end/tp_tend update every block only once as part of each update transaction. Assert this.
			 * The two exceptions are
			 * a) Forward journal recovery which simulates a 2-phase M-kill where the same block
			 *    could get updated in both phases (example bitmap block gets updated for blocks created
			 *    within the TP transaction as well as for blocks that are freed up in the 2nd phase of
			 *    the M-kill) with the same transaction number. This is because although GT.M would have
			 *    updated the same block with different transaction numbers in the two phases, forward
			 *    recovery will update it with the same tn and instead increment the db tn on seeing the
			 *    following INCTN journal record(s).
			 * b) Cache recovery (wcs_recover). It could call bt_put more than once for the same block
			 *    and potentially with the same tn. This is because the state of the queues is questionable
			 *    and there could be more than one cache record for a given block number.
			 */
			assert(in_wcs_recover || (bt->tn < lcl_tn) || (jgbl.forw_phase_recovery && !JNL_ENABLED(csa)));
			/* found the matching bt: detach its th record from the tn queue (note the tnque link points
			 * past the tnque field, hence the + SIZEOF(th->tnque) adjustment); re-queued at the tail below
			 */
			q0 = (bt_rec_ptr_t)((sm_uc_ptr_t)bt + bt->tnque.fl);
			th = (th_rec_ptr_t)remqt((que_ent_ptr_t)((sm_uc_ptr_t)q0 + SIZEOF(th->tnque)));
			if (EMPTY_QUEUE == (sm_long_t)th)
				GTMASSERT;
			break;
		}
		/* queue-integrity guards: a zero forward link or scanning more entries than exist means
		 * the shared-memory queues are corrupt
		 */
		if (0 == bt->blkque.fl)
			rts_error(VARLSTCNT(3) ERR_BTFAIL, 1, 2);
		if (lcnt >= csd->n_bts)
			rts_error(VARLSTCNT(3) ERR_BTFAIL, 1, 3);
	}
	/* move this bt's th record to the tail of the tn queue (most recently used) and stamp the current tn */
	insqt((que_ent_ptr_t)th, (que_ent_ptr_t)csa->th_base);
	bt->tn = lcl_tn;
	return bt;
}
/* db_csh_getn: obtain a free cache record in which to read in (or build) "block".
 * Must be called holding crit (asserted below). Scans the cache-record array round-robin starting just
 * past the last-recycled record (csa->nl->cur_lru_cache_rec_off), in up to three passes of n_bts records
 * each with progressively weaker skip criteria (see pass1/pass2/pass3 below).
 * NOTE(review): this block is truncated in the visible source - the scan loop and function continue
 * beyond the last line shown here.
 */
cache_rec_ptr_t db_csh_getn(block_id block)
{
	cache_rec_ptr_t		hdr, q0, start_cr, cr;
	bt_rec_ptr_t		bt;
	unsigned int		lcnt, ocnt;
	int			rip, max_ent, pass1, pass2, pass3;
	int4			flsh_trigger;
	uint4			r_epid, dummy;
	sgmnt_addrs		*csa;
	sgmnt_data_ptr_t	csd;
	srch_blk_status		*tp_srch_status;
	error_def(ERR_BUFRDTIMEOUT);
	error_def(ERR_INVALIDRIP);

	csa = cs_addrs;
	csd = csa->hdr;
	assert(csa->now_crit);
	assert(csa == &FILE_INFO(gv_cur_region)->s_addrs);
	max_ent = csd->n_bts;
	/* start the scan at the cache record following the last one handed out (round-robin LRU approximation) */
	cr = (cache_rec_ptr_t)GDS_REL2ABS(csa->nl->cur_lru_cache_rec_off);
	hdr = csa->acc_meth.bg.cache_state->cache_array + (block % csd->bt_buckets);
	/* the first bt_buckets entries of cache_array are queue headers; real cache records start after them */
	start_cr = csa->acc_meth.bg.cache_state->cache_array + csd->bt_buckets;
	pass1 = max_ent;	/* skip referred or dirty or read-into cache records */
	pass2 = 2 * max_ent;	/* skip referred cache records */
	pass3 = 3 * max_ent;	/* skip nothing */
	INCR_DB_CSH_COUNTER(csa, n_db_csh_getns, 1);
	for (lcnt = 0; ; lcnt++)
	{
		if (lcnt > pass3)
		{	/* three full passes failed to find a reusable record - give up the scan */
			BG_TRACE_PRO(wc_blocked_db_csh_getn_loopexceed);
			assert(FALSE);
			break;
		}
		cr++;
		if (cr == start_cr + max_ent)	/* wrap around the circular cache-record array */
			cr = start_cr;
		VMS_ONLY(
			/* on VMS, harvest completed writes at each pass boundary to free up records */
			if ((lcnt == pass1) || (lcnt == pass2))
				wcs_wtfini(gv_cur_region);
		)
		if (TRUE == cr->refer && lcnt < pass2)
		{	/* in passes 1 & 2, set refer to FALSE and skip; in the third pass attempt reuse
			 * even if TRUE == refer
			 */
			cr->refer = FALSE;
			continue;
		}
		if (TRUE == cr->in_cw_set)
		{	/* this process already owns it - skip it */
			cr->refer = TRUE;
			continue;
		}
		if (CDB_STAGNATE <= t_tries || mu_reorg_process)
		{	/* Prevent stepping on self when crit for entire transaction.
			 * This is done by looking up in sgm_info_ptr->blk_in_use and cw_stagnate for presence of the block.
			 * The following two hashtable lookups are not similar, since in TP, sgm_info_ptr->blks_in_use
			 * is updated to the latest cw_stagnate list of blocks only in tp_hist().
			 * Also note that the lookup in sgm_info_ptr->blks_in_use reuses blocks that don't have cse's.
			 * This is to allow big-read TP transactions which may use up more than the available global buffers.
			 * There is one issue here in that a block that has been only read till now may be stepped upon here
			 * but may later be needed for update. It is handled by updating the block's corresponding
			 * entry in the set of histories (sgm_info_ptr->first_tp_hist[index] structure) to hold the
			 * "cr" and "cycle" of the t_qread done for the block when it was intended to be changed for the
			 * first time within the transaction since otherwise the transaction would restart due to a
			 * cdb_sc_lostcr status. Note that "tn" (read_tn of the block) in the first_tp_hist will still
			 * remain the "tn" when the block was first read within this transaction to ensure the block
			 * hasn't been modified since the start of the transaction. Once we intend on changing the
			 * block i.e. srch_blk_status->ptr is non-NULL, we ensure in the code below not to step on it.
			 * [tp_hist() is the routine that updates the "cr", "cycle" and "tn" of the block].
			 * Note that usually in a transaction the first_tp_hist[] structure holds the "cr", "cycle", and "tn"
			 * of the first t_qread of the block within that transaction. The above is the only exception.
			 * Also note that for blocks in cw_stagnate (i.e. current TP mini-action), we don't reuse any of
			 * them even if they don't have a cse. This is to ensure that the current action doesn't
			 * encounter a restart due to cdb_sc_lostcr in tp_hist() even in the fourth-retry.
			 */
			if (dollar_tlevel
				&& (tp_srch_status =
					(srch_blk_status *)lookup_hashtab_ent(sgm_info_ptr->blks_in_use, (void *)cr->blk, &dummy))
				&& tp_srch_status->ptr)
			{	/* this process is already using the block - skip it */
				cr->refer = TRUE;
				continue;
			}
			if (NULL != lookup_hashtab_ent(cw_stagnate, (void *)cr->blk, &dummy))
			{	/* block is part of the current TP mini-action - skip it (see comment above) */
				cr->refer = TRUE;
				continue;
			}
		}
		if (cr->dirty)
		{	/* Note that in Unix, it is possible that we see a stale value of cr->dirty (possible if a
			 * concurrent wcs_wtstart() has reset dirty to 0 but that update did not reach us yet). In this
			 * case the call to wcs_get_space() below will do the necessary memory barrier instructions
			 * (through calls to aswp()) which will allow us to see the non-stale value of cr->dirty.
			 *
			 * It is also possible that cr->dirty is non-zero but < cr->flushed_dirty_tn. In this case, wcs_get_space
			 * done below will return FALSE forcing a cache-rebuild which will fix this situation.
			 *
			 * In VMS, another process cannot be concurrently resetting cr->dirty to 0 as the resetting routine
			 * is wcs_wtfini() which is executed in crit which another process cannot be in as we are in crit now.
			 */
			if (gv_cur_region->read_only)
				continue;	/* read-only process cannot flush; skip dirty records entirely */
			if (lcnt < pass1)
			{	/* pass 1: don't flush synchronously; kick off a flush timer if none running and move on */
				if (!csa->timer && (csa->nl->wcs_timers < 1))
					wcs_timer_start(gv_cur_region, FALSE);
				continue;
			}
			BG_TRACE_PRO(db_csh_getn_flush_dirty);
			if (FALSE == wcs_get_space(gv_cur_region, 0, cr))
			{	/* failed to flush it out - force a rebuild */
				BG_TRACE_PRO(wc_blocked_db_csh_getn_wcsstarvewrt);
				assert(FALSE);
				break;
			}
			assert(0 == cr->dirty);
		}
		UNIX_ONLY(
			/* the cache-record is not free for reuse until the write-latch value becomes LATCH_CLEAR.
			 * In VMS, resetting the write-latch value occurs in wcs_wtfini() which is in CRIT, we are fine.
			 * In Unix, this resetting is done by wcs_wtstart() which is out-of-crit. Therefore, we need to
			 * wait for this value to be LATCH_CLEAR before reusing this cache-record.
			 * Note that we are examining the write-latch-value without holding the interlock. It is ok to do
			 * this because the only two routines that modify the latch value are bg_update() and
			 * wcs_wtstart(). The former cannot be concurrently executing because we are in crit.
			 * The latter will not update the latch value unless this cache-record is dirty. But in this
			 * case we would have most likely gone through the if (cr->dirty) check above. Most likely
			 * because there is one rare possibility where a concurrent wcs_wtstart() has set cr->dirty
			 * to 0 but not yet cleared the latch. In that case we wait for the latch to be cleared.
			 * In all other cases, nobody is modifying the latch since when we got crit and therefore
			 * it is safe to observe the value of the latch without holding the interlock.
			 */
			if (LATCH_CLEAR != WRITE_LATCH_VAL(cr))
			{	/* possible if a concurrent wcs_wtstart() has set cr->dirty to 0 but not yet
				 * cleared the latch. this should be very rare though.
				 */
				if (lcnt < pass2)
					continue;	/* try to find some other cache-record to reuse until the 3rd pass */
				for (ocnt = 1; (MAXWRTLATCHWAIT >= ocnt) && (LATCH_CLEAR != WRITE_LATCH_VAL(cr)); ocnt++)
					wcs_sleep(SLEEP_WRTLATCHWAIT);	/* since it is a short lock, sleep the minimum */
				if (MAXWRTLATCHWAIT <= ocnt)
				{	/* latch never cleared within the wait budget - skip this record */
					BG_TRACE_PRO(db_csh_getn_wrt_latch_stuck);
					assert(FALSE);
					continue;
				}
			}
		)
/* tp_tend: commit processing for a TP transaction. The portion visible here is the pre-crit prologue:
 * when crit_only is FALSE it walks the per-region sgm_info list to (a) bail out early (return FALSE,
 * forcing a repair/retry) if a region's cache is blocked or an MM file was extended, (b) decide whether
 * any validation is needed at all (do_validation), (c) pre-reserve free cache blocks for updates while
 * still out of crit, and (d) pre-compute journal space needs, erroring out (ERR_JNLTRANS2BIG) if the
 * transaction cannot fit in one journal file. A read-only transaction whose history is unchanged
 * returns TRUE immediately without further validation.
 * NOTE(review): this block is truncated in the visible source - the crit-acquisition and actual commit
 * logic continue beyond the last line shown here.
 */
boolean_t tp_tend(boolean_t crit_only)
{
	block_id		tp_blk;
	boolean_t		history_validated, is_mm, was_crit, x_lock, do_validation;
	boolean_t		do_deferred_writes = FALSE, replication = FALSE;
	bt_rec_ptr_t		bt;
	cache_rec_ptr_t		cr;
	cw_set_element		*cse;
	file_control		*fc;
	jnl_private_control	*jpc;
	jnl_buffer_ptr_t	jbp;
	jnl_format_buffer	*jfb;
	sgm_info		*si, *tmpsi;
	tp_region		*tr, *tr_last;
	sgmnt_addrs		*csa, *tmpcsa;
	sgmnt_data_ptr_t	csd;
	srch_blk_status		*t1;
	trans_num		ctn, tnque_earliest_tn;
	trans_num		valid_thru;	/* buffers touched by this transaction will be valid thru this tn */
	enum cdb_sc		status;
	gd_region		*save_gv_cur_region;
	int			lcnt, participants;
	jnldata_hdr_ptr_t	jnl_header;
	int			repl_tp_region_count = 0;
	boolean_t		first_time = TRUE, release_crit, yes_jnl_no_repl, retvalue;
	uint4			jnl_status, leafmods, indexmods;
	uint4			total_jnl_rec_size;
	jnlpool_ctl_ptr_t	jpl, tjpl;
	error_def(ERR_DLCKAVOIDANCE);
	error_def(ERR_JNLTRANS2BIG);

	assert(dollar_tlevel > 0);	/* must be inside a TP transaction */
	assert(0 == jnl_fence_ctl.level);
	participants = 0;
	status = cdb_sc_normal;
	/* if the transaction does no updates and the transaction history has not changed, we do not need any more validation */
	do_validation = FALSE;	/* initially set to FALSE, but set to TRUE below */
	jnl_status = 0;
	if (FALSE == crit_only)
	{
		for (si = first_sgm_info; (NULL != si); si = si->next_sgm_info)
		{	/* one sgm_info per region touched by this transaction */
			sgm_info_ptr = si;
			TP_CHANGE_REG_IF_NEEDED(si->gv_cur_region);
			csa = cs_addrs;
			csd = cs_data;
			if ((csd->wc_blocked) ||			/* If blocked, or.. */
				((dba_mm == csa->hdr->acc_meth) &&	/* we have MM and.. */
				(csa->total_blks != csa->ti->total_blks)))	/* and file has been extended */
			{	/* Force repair */
				t_fail_hist[t_tries] = cdb_sc_helpedout;	/* special status to prevent punishing altruism */
				TP_TRACE_HIST(CR_BLKEMPTY, NULL);
				return FALSE;
			}
			/* whenever si->first_cw_set is non-NULL, ensure that si->update_trans is TRUE */
			assert((NULL == si->first_cw_set) || si->update_trans);
			/* whenever si->first_cw_set is NULL, ensure that si->update_trans is FALSE
			 * except when the set noop optimization is enabled
			 */
			assert((NULL != si->first_cw_set) || !si->update_trans || gvdupsetnoop);
			if (!si->update_trans)
			{
				if (si->start_tn == csa->ti->early_tn)
				{	/* read with no change to the transaction history. ensure we haven't overrun
					 * our history buffer and we have reasonable values for first and last
					 */
					assert(si->last_tp_hist - si->first_tp_hist <= si->tp_hist_size);
					continue;
				} else
					do_validation = TRUE;
			} else
			{
				do_validation = TRUE;
				is_mm = (dba_mm == cs_data->acc_meth);
				/* We are still out of crit if this is not our last attempt. If so, run the region list and check
				 * that we have sufficient free blocks for our update. If not, get them now while we can.
				 * We will repeat this check later in crit but it will hopefully have little or nothing to do.
				 * bypass 1st check if already in crit -- check later
				 */
				if (!csa->now_crit && !is_mm && (csa->nl->wc_in_free < si->cw_set_depth + 1)
					&& !wcs_get_space(si->gv_cur_region, si->cw_set_depth + 1, NULL))
					assert(FALSE);	/* wcs_get_space() should have returned TRUE unconditionally
							 * in this case */
			}
			if (si->update_trans && JNL_ENABLED(csa))
			{	/* compute the total journal record size requirements before grab_crit().
				 * there is code later that will check for state changes from now to then
				 */
				TOTAL_TPJNL_REC_SIZE(total_jnl_rec_size, si, csa);
				/* compute current transaction's maximum journal space needs in number of disk blocks */
				si->tot_jrec_size = MAX_REQD_JNL_FILE_SIZE(total_jnl_rec_size);
				/* check if current TP transaction's journal size needs are greater than max jnl file size */
				if (si->tot_jrec_size > csd->autoswitchlimit)
					/* can't fit in current transaction's journal records into one journal file */
					rts_error(VARLSTCNT(6) ERR_JNLTRANS2BIG, 4, si->tot_jrec_size,
						JNL_LEN_STR(csd), csd->autoswitchlimit);
			}
		}	/* for (si ... ) */
		if (!do_validation)
		{	/* pure read transaction with unchanged history: nothing to commit or validate */
			if (CDB_STAGNATE <= t_tries)
			{	/* in the final-retry we hold crit on all regions; release it before returning */
				for (tr = tp_reg_list; NULL != tr; tr = tr->fPtr)
					rel_crit(tr->reg);
			}
			UNIX_ONLY(
				/* Must be done after REVERT since we are no longer in crit */
				if (unhandled_stale_timer_pop)
					process_deferred_stale();
			)
			return TRUE;
		}
	}