Example #1
0
cache_rec_ptr_t	db_csh_getn(block_id block)
{
	cache_rec_ptr_t		hdr, q0, start_cr, cr;
	bt_rec_ptr_t		bt;
	unsigned int		lcnt, ocnt;
	int			rip, max_ent, pass1, pass2, pass3;
	int4			flsh_trigger;
	uint4			r_epid, dummy;
	sgmnt_addrs		*csa;
	sgmnt_data_ptr_t	csd;
	srch_blk_status		*tp_srch_status;

	error_def(ERR_BUFRDTIMEOUT);
	error_def(ERR_INVALIDRIP);

	csa = cs_addrs;
	csd = csa->hdr;
	assert(csa->now_crit);
	assert(csa == &FILE_INFO(gv_cur_region)->s_addrs);
	max_ent = csd->n_bts;
	cr = (cache_rec_ptr_t)GDS_REL2ABS(csa->nl->cur_lru_cache_rec_off);
	hdr = csa->acc_meth.bg.cache_state->cache_array + (block % csd->bt_buckets);
	start_cr = csa->acc_meth.bg.cache_state->cache_array + csd->bt_buckets;
	pass1 = max_ent;	/* skip referred or dirty or read-into cache records */
	pass2 = 2 * max_ent;	/* skip referred cache records */
	pass3 = 3 * max_ent;	/* skip nothing */
	INCR_DB_CSH_COUNTER(csa, n_db_csh_getns, 1);
	for (lcnt = 0;  ; lcnt++)
	{
		if (lcnt > pass3)
		{
			BG_TRACE_PRO(wc_blocked_db_csh_getn_loopexceed);
			assert(FALSE);
			break;
		}
		cr++;
		if (cr == start_cr + max_ent)
			cr = start_cr;
		VMS_ONLY(
			if ((lcnt == pass1) || (lcnt == pass2))
				wcs_wtfini(gv_cur_region);
		)
		if (TRUE == cr->refer && lcnt < pass2)
		{	/* in passes 1 & 2, set refer to FALSE and skip; in the third pass attempt reuse even if TRUE == refer */
			cr->refer = FALSE;
			continue;
		}
		if (TRUE == cr->in_cw_set)
		{	/* this process already owns it - skip it */
			cr->refer = TRUE;
			continue;
		}
		if (CDB_STAGNATE <= t_tries || mu_reorg_process)
		{
			/* Prevent stepping on self when crit for entire transaction.
			 * This is done by looking up in sgm_info_ptr->blk_in_use and cw_stagnate for presence of the block.
			 * The following two hashtable lookups are not similar, since in TP, sgm_info_ptr->blks_in_use
			 * 	is updated to the latest cw_stagnate list of blocks only in tp_hist().
			 * Also note that the lookup in sgm_info_ptr->blks_in_use reuses blocks that don't have cse's.
			 * This is to allow big-read TP transactions which may use up more than the available global buffers.
			 * There is one issue here in that a block that has been only read till now may be stepped upon here
			 *	but may later be needed for update. It is handled by updating the block's corresponding
			 *	entry in the set of histories (sgm_info_ptr->first_tp_hist[index] structure) to hold the
			 *	"cr" and "cycle" of the t_qread done for the block when it was intended to be changed for the
			 *	first time within the transaction since otherwise the transaction would restart due to a
			 *	cdb_sc_lostcr status. Note that "tn" (read_tn of the block) in the first_tp_hist will still
			 *	remain the "tn" when the block was first read within this transaction to ensure the block
			 *	hasn't been modified since the start of the transaction. Once we intend on changing the
			 *	block i.e. srch_blk_status->ptr is non-NULL, we ensure in the code below not to step on it.
			 *	[tp_hist() is the routine that updates the "cr", "cycle" and "tn" of the block].
			 * Note that usually in a transaction the first_tp_hist[] structure holds the "cr", "cycle", and "tn"
			 *	of the first t_qread of the block within that transaction. The above is the only exception.
			 * Also note that for blocks in cw_stagnate (i.e. current TP mini-action), we don't reuse any of
			 *	them even if they don't have a cse. This is to ensure that the current action doesn't
			 *	encounter a restart due to cdb_sc_lostcr in tp_hist() even in the fourth-retry.
			 */
			if (dollar_tlevel
				&& (tp_srch_status =
					(srch_blk_status *)lookup_hashtab_ent(sgm_info_ptr->blks_in_use, (void *)cr->blk, &dummy))
				&& tp_srch_status->ptr)
			{	/* this process is already using the block - skip it */
				cr->refer = TRUE;
				continue;
			}
			if (NULL != lookup_hashtab_ent(cw_stagnate, (void *)cr->blk, &dummy))
			{
				cr->refer = TRUE;
				continue;
			}
		}
		if (cr->dirty)
		{	/* Note that in Unix, it is possible that we see a stale value of cr->dirty (possible if a
			 * concurrent wcs_wtstart() has reset dirty to 0 but that update did not reach us yet). In this
			 * case the call to wcs_get_space() below will do the necessary memory barrier instructions
			 * (through calls to aswp()) which will allow us to see the non-stale value of cr->dirty.
			 *
			 * It is also possible that cr->dirty is non-zero but < cr->flushed_dirty_tn. In this case, wcs_get_space
			 * done below will return FALSE forcing a cache-rebuild which will fix this situation.
			 *
			 * In VMS, another process cannot be concurrently resetting cr->dirty to 0 as the resetting routine
			 * is wcs_wtfini() which is executed in crit which another process cannot be in as we are in crit now.
			 */
			if (gv_cur_region->read_only)
				continue;
			if (lcnt < pass1)
			{
				if (!csa->timer && (csa->nl->wcs_timers < 1))
					wcs_timer_start(gv_cur_region, FALSE);
				continue;
			}
			BG_TRACE_PRO(db_csh_getn_flush_dirty);
			if (FALSE == wcs_get_space(gv_cur_region, 0, cr))
			{	/* failed to flush it out - force a rebuild */
				BG_TRACE_PRO(wc_blocked_db_csh_getn_wcsstarvewrt);
				assert(FALSE);
				break;
			}
			assert(0 == cr->dirty);
		}
		UNIX_ONLY(
			/* the cache-record is not free for reuse until the write-latch value becomes LATCH_CLEAR.
			 * In VMS, resetting the write-latch value occurs in wcs_wtfini() which is in CRIT, we are fine.
			 * In Unix, this resetting is done by wcs_wtstart() which is out-of-crit. Therefore, we need to
			 * 	wait for this value to be LATCH_CLEAR before reusing this cache-record.
			 * Note that we are examining the write-latch-value without holding the interlock. It is ok to do
			 * 	this because the only two routines that modify the latch value are bg_update() and
			 * 	wcs_wtstart(). The former cannot be concurrently executing because we are in crit.
			 * 	The latter will not update the latch value unless this cache-record is dirty. But in this
			 * 	case we would have most likely gone through the if (cr->dirty) check above. Most likely
			 * 	because there is one rare possibility where a concurrent wcs_wtstart() has set cr->dirty
			 * 	to 0 but not yet cleared the latch. In that case we wait for the latch to be cleared.
			 * 	In all other cases, nobody is modifying the latch since when we got crit and therefore
			 * 	it is safe to observe the value of the latch without holding the interlock.
			 */
			if (LATCH_CLEAR != WRITE_LATCH_VAL(cr))
			{	/* possible if a concurrent wcs_wtstart() has set cr->dirty to 0 but not yet
				 * cleared the latch. this should be very rare though.
				 */
				if (lcnt < pass2)
					continue; /* try to find some other cache-record to reuse until the 3rd pass */
				for (ocnt = 1; (MAXWRTLATCHWAIT >= ocnt) && (LATCH_CLEAR != WRITE_LATCH_VAL(cr)); ocnt++)
					wcs_sleep(SLEEP_WRTLATCHWAIT);	/* since it is a short lock, sleep the minimum */
				if (MAXWRTLATCHWAIT <= ocnt)
				{
					BG_TRACE_PRO(db_csh_getn_wrt_latch_stuck);
					assert(FALSE);
					continue;
				}
			}
		)
Example #2
0
sm_uc_ptr_t t_qread(block_id blk, sm_int_ptr_t cycle, cache_rec_ptr_ptr_t cr_out)
	/* cycle is used in t_end to detect if the buffer has been refreshed since the t_qread */
{
	int4			status;
	uint4			blocking_pid;
	cache_rec_ptr_t		cr;
	bt_rec_ptr_t		bt;
	boolean_t		clustered, hold_onto_crit, was_crit;
	int			dummy, lcnt, ocnt;
	cw_set_element		*cse;
	off_chain		chain1;
	register sgmnt_addrs	*csa;
	register sgmnt_data_ptr_t	csd;
	enum db_ver		ondsk_blkver;
	int4			dummy_errno;
	boolean_t		already_built, is_mm, reset_first_tp_srch_status, set_wc_blocked, sleep_invoked;
	ht_ent_int4		*tabent;
	srch_blk_status		*blkhist;
	trans_num		dirty, blkhdrtn;
	sm_uc_ptr_t		buffaddr;
	uint4			stuck_cnt = 0;
	boolean_t		lcl_blk_free;
	node_local_ptr_t	cnl;

	lcl_blk_free = block_is_free;
	block_is_free = FALSE;	/* Reset to FALSE so that if t_qread fails below, we don't have an incorrect state of this var */
	first_tp_srch_status = NULL;
	reset_first_tp_srch_status = FALSE;
	csa = cs_addrs;
	csd = csa->hdr;
	INCR_DB_CSH_COUNTER(csa, n_t_qreads, 1);
	is_mm = (dba_mm == csd->acc_meth);
	/* We better hold crit in the final retry (TP & non-TP). Only exception is journal recovery */
	assert((t_tries < CDB_STAGNATE) || csa->now_crit || mupip_jnl_recover);
	if (dollar_tlevel)
	{
		assert(sgm_info_ptr);
		if (0 != sgm_info_ptr->cw_set_depth)
		{
			chain1 = *(off_chain *)&blk;
			if (1 == chain1.flag)
			{
				assert(sgm_info_ptr->cw_set_depth);
				if ((int)chain1.cw_index < sgm_info_ptr->cw_set_depth)
					tp_get_cw(sgm_info_ptr->first_cw_set, (int)chain1.cw_index, &cse);
				else
				{
					assert(FALSE == csa->now_crit);
					rdfail_detail = cdb_sc_blknumerr;
					return (sm_uc_ptr_t)NULL;
				}
			} else
			{
				if (NULL != (tabent = lookup_hashtab_int4(sgm_info_ptr->blks_in_use, (uint4 *)&blk)))
					first_tp_srch_status = tabent->value;
				else
					first_tp_srch_status = NULL;
				ASSERT_IS_WITHIN_TP_HIST_ARRAY_BOUNDS(first_tp_srch_status, sgm_info_ptr);
				cse = first_tp_srch_status ? first_tp_srch_status->cse : NULL;
			}
			assert(!cse || !cse->high_tlevel);
			assert(!chain1.flag || cse);
			if (cse)
			{	/* transaction has modified the sought after block  */
				if ((gds_t_committed != cse->mode) || (n_gds_t_op < cse->old_mode))
				{	/* Changes have not been committed to shared memory, i.e. still in private memory.
					 * Build block in private buffer if not already done and return the same.
					 */
					assert(gds_t_writemap != cse->mode);
					if (FALSE == cse->done)
					{	/* out of date, so make it current */
						assert(gds_t_committed != cse->mode);
						already_built = (NULL != cse->new_buff);
						/* Validate the block's search history right after building a private copy.
						 * This is not needed in case gvcst_search is going to reuse the clue's search
						 * history and return (because tp_hist will do the validation of this block).
						 * But if gvcst_search decides to do a fresh traversal (because the clue does not
						 * cover the path of the current input key etc.) the block build that happened now
						 * will not get validated in tp_hist since it will instead be given the current
						 * key's search history path (a totally new path) for validation. Since a private
						 * copy of the block has been built, tp_tend would also skip validating this block
						 * so it is necessary that we validate the block right here. Since it is tricky to
						 * accurately differentiate between the two cases, we do the validation
						 * unconditionally here (besides it is only a few if checks done per block build
						 * so it is considered okay performance-wise).
						 */
						gvcst_blk_build(cse, (uchar_ptr_t)cse->new_buff, 0);
						assert(NULL != cse->blk_target);
						if (!already_built && !chain1.flag)
						{
							buffaddr = first_tp_srch_status->buffaddr;
							cr = first_tp_srch_status->cr;
							assert((is_mm || cr) && buffaddr);
							blkhdrtn = ((blk_hdr_ptr_t)buffaddr)->tn;
							if (TP_IS_CDB_SC_BLKMOD3(cr, first_tp_srch_status, blkhdrtn))
							{
								assert(CDB_STAGNATE > t_tries);
								rdfail_detail = cdb_sc_blkmod;	/* should this be something else */
								TP_TRACE_HIST_MOD(blk, gv_target, tp_blkmod_t_qread, cs_data,
									first_tp_srch_status->tn, blkhdrtn,
									((blk_hdr_ptr_t)buffaddr)->levl);
								return (sm_uc_ptr_t)NULL;
							}
							if (!is_mm && ((first_tp_srch_status->cycle != cr->cycle)
										|| (first_tp_srch_status->blk_num != cr->blk)))
							{
								assert(CDB_STAGNATE > t_tries);
								rdfail_detail = cdb_sc_lostcr; /* should this be something else */
								return (sm_uc_ptr_t)NULL;
							}
						}
						cse->done = TRUE;
					}
					*cycle = CYCLE_PVT_COPY;
					*cr_out = 0;
					return (sm_uc_ptr_t)cse->new_buff;
				} else
				{	/* Block changes are already committed to shared memory (possible if we are in TP
					 * in the 2nd phase of M-Kill in gvcst_expand_free_subtree.c). In this case, read
					 * block from shared memory; do not look at private memory (i.e. cse) as that might
					 * not be as uptodate as shared memory.
					 */
					assert(csa->now_crit);	/* gvcst_expand_free_subtree does t_qread in crit */
					/* If this block was newly created as part of the TP transaction, it should not be killed
					 * as part of the 2nd phase of M-kill. This is because otherwise the block's cse would
					 * have had an old_mode of kill_t_create in which case we would not have come into this
					 * else block. Assert accordingly.
					 */
					assert(!chain1.flag);
					first_tp_srch_status = NULL;	/* do not use any previous srch_hist information */
				}
			}
		} else
		{
			if (NULL != (tabent = lookup_hashtab_int4(sgm_info_ptr->blks_in_use, (uint4 *)&blk)))
				first_tp_srch_status = tabent->value;
			else
				first_tp_srch_status = NULL;
		}
		ASSERT_IS_WITHIN_TP_HIST_ARRAY_BOUNDS(first_tp_srch_status, sgm_info_ptr);
		if (!is_mm && first_tp_srch_status)
		{
			cr = first_tp_srch_status->cr;
			assert(cr && !first_tp_srch_status->cse);
			if (first_tp_srch_status->cycle == cr->cycle)
			{
				*cycle = first_tp_srch_status->cycle;
				*cr_out = cr;
				cr->refer = TRUE;
				if (CDB_STAGNATE <= t_tries)	/* mu_reorg doesn't use TP else should have an || for that */
					CWS_INSERT(blk);
				return (sm_uc_ptr_t)first_tp_srch_status->buffaddr;
			} else
			{	/* Block was already part of the read-set of this transaction, but got recycled in the cache.
				 * Allow block recycling by resetting first_tp_srch_status for this blk to reflect the new
				 * buffer, cycle and cache-record. tp_hist (invoked much later) has validation checks to detect
				 * if block recycling happened within the same mini-action and restart in that case.
				 * Updating first_tp_srch_status has to wait until the end of t_qread since only then do we know
				 * the values to update to. Set a variable that will enable the updation before returning.
				 * Also assert that if we are in the final retry, we are never in a situation where we have a
				 * block that got recycled since the start of the current mini-action. This is easily detected since
				 * as part of the final retry we maintain a hash-table "cw_stagnate" that holds the blocks that
				 * have been read as part of the current mini-action until now.
				 */
				assert(CDB_STAGNATE > t_tries || (NULL == lookup_hashtab_int4(&cw_stagnate, (uint4 *)&blk)));
				reset_first_tp_srch_status = TRUE;
			}
		}
	}
	if ((blk >= csa->ti->total_blks) || (blk < 0))
	{	/* requested block out of range; could occur because of a concurrency conflict */
		if ((&FILE_INFO(gv_cur_region)->s_addrs != csa) || (csd != cs_data))
			GTMASSERT;
		assert(FALSE == csa->now_crit);
		rdfail_detail = cdb_sc_blknumerr;
		return (sm_uc_ptr_t)NULL;
	}
	if (is_mm)
	{
		*cycle = CYCLE_SHRD_COPY;
		*cr_out = 0;
		return (sm_uc_ptr_t)(mm_read(blk));
	}
#	ifdef GTM_CRYPT
	/* If database is encrypted, check if encryption initialization went fine for this database. If not,
	 * do not let process proceed as it could now potentially get a peek at the desired data from the
	 * decrypted shared memory global buffers (read in from disk by other processes) without having to go to disk.
	 * If DSE, allow for a special case where it is trying to dump a local bitmap block. In this case, DSE
	 * can continue to run fine (even if encryption initialization failed) since bitmap blocks are unencrypted.
	 */
	if (csa->encrypt_init_status && (!dse_running || !IS_BITMAP_BLK(blk)))
		GC_RTS_ERROR(csa->encrypt_init_status, gv_cur_region->dyn.addr->fname);
#	endif
	assert(dba_bg == csd->acc_meth);
	assert(!first_tp_srch_status || !first_tp_srch_status->cr
					|| first_tp_srch_status->cycle != first_tp_srch_status->cr->cycle);
	if (FALSE == (clustered = csd->clustered))
		bt = NULL;
	was_crit = csa->now_crit;
	ocnt = 0;
	cnl = csa->nl;
	set_wc_blocked = FALSE;	/* to indicate whether cnl->wc_blocked was set to TRUE by us */
	hold_onto_crit = csa->hold_onto_crit;	/* note down in local to avoid csa-> dereference in multiple usages below */
	do
	{
		if (NULL == (cr = db_csh_get(blk)))
		{	/* not in memory */
			if (clustered && (NULL != (bt = bt_get(blk))) && (FALSE == bt->flushing))
				bt = NULL;
			if (!csa->now_crit)
			{
				assert(!hold_onto_crit);
				if (NULL != bt)
				{	/* at this point, bt is not NULL only if clustered and flushing - wait no crit */
					assert(clustered);
					wait_for_block_flush(bt, blk);	/* try for no other node currently writing the block */
				}
				if ((csd->flush_trigger <= cnl->wcs_active_lvl) && (FALSE == gv_cur_region->read_only))
					JNL_ENSURE_OPEN_WCS_WTSTART(csa, gv_cur_region, 0, dummy_errno);
						/* a macro that dclast's "wcs_wtstart" and checks for errors etc. */
				grab_crit(gv_cur_region);
				cr = db_csh_get(blk);			/* in case blk arrived before crit */
			}
			if (clustered && (NULL != (bt = bt_get(blk))) && (TRUE == bt->flushing))
			{	/* Once crit, need to assure that if clustered, that flushing is [still] complete
				 * If it isn't, we missed an entire WM cycle and have to wait for another node to finish */
				wait_for_block_flush(bt, blk);	/* ensure no other node currently writing the block */
			}
			if (NULL == cr)
			{	/* really not in memory - must get a new buffer */
				assert(csa->now_crit);
				cr = db_csh_getn(blk);
				if (CR_NOTVALID == (sm_long_t)cr)
				{
					assert(cnl->wc_blocked); /* only reason we currently know wcs_get_space could fail */
					assert(gtm_white_box_test_case_enabled);
					SET_TRACEABLE_VAR(cnl->wc_blocked, TRUE);
					BG_TRACE_PRO_ANY(csa, wc_blocked_t_qread_db_csh_getn_invalid_blk);
					set_wc_blocked = TRUE;
					break;
				}
				assert(0 <= cr->read_in_progress);
				*cycle = cr->cycle;
				cr->tn = csd->trans_hist.curr_tn;
				/* Record history of most recent disk reads only in dbg builds for now. Although the macro
				 * is just a couple dozen instructions, it is done while holding crit so we want to avoid
				 * delaying crit unless really necessary. Whoever wants this information can enable it
				 * by a build change to remove the DEBUG_ONLY part below.
				 */
				DEBUG_ONLY(DSKREAD_TRACE(csa, GDS_ANY_ABS2REL(csa,cr), cr->tn, process_id, blk, cr->cycle);)
				if (!was_crit && !hold_onto_crit)
					rel_crit(gv_cur_region);
				/* read outside of crit may be of a stale block but should be detected by t_end or tp_tend */
				assert(0 == cr->dirty);
				assert(cr->read_in_progress >= 0);
				CR_BUFFER_CHECK(gv_cur_region, csa, csd, cr);
				if (SS_NORMAL != (status = dsk_read(blk, GDS_REL2ABS(cr->buffaddr), &ondsk_blkver, lcl_blk_free)))
				{	/* buffer does not contain valid data, so reset blk to be empty */
					cr->cycle++;	/* increment cycle for blk number changes (for tp_hist and others) */
					cr->blk = CR_BLKEMPTY;
					cr->r_epid = 0;
					RELEASE_BUFF_READ_LOCK(cr);
					assert(-1 <= cr->read_in_progress);
					assert(was_crit == csa->now_crit);
					if (FUTURE_READ == status)
					{	/* in cluster, block can be in the "future" with respect to the local history */
						assert(TRUE == clustered);
						assert(FALSE == csa->now_crit);
						rdfail_detail = cdb_sc_future_read;	/* t_retry forces the history up to date */
						return (sm_uc_ptr_t)NULL;
					}
					if (ERR_DYNUPGRDFAIL == status)
					{	/* if we dont hold crit on the region, it is possible due to concurrency conflicts
						 * that this block is unused (i.e. marked free/recycled in bitmap, see comments in
						 * gds_blk_upgrade.h). in this case we should not error out but instead restart.
						 */
						if (was_crit)
						{
							assert(FALSE);
							rts_error(VARLSTCNT(5) status, 3, blk, DB_LEN_STR(gv_cur_region));
						} else
						{
							rdfail_detail = cdb_sc_lostcr;
							return (sm_uc_ptr_t)NULL;
						}
					}
					if (-1 == status)
					{
						/* could have been concurrent truncate, and we read a blk >= csa->ti->total_blks */
						/* restart */
						rdfail_detail = cdb_sc_truncate;
						return (sm_uc_ptr_t)NULL;
					} else
						rts_error(VARLSTCNT(5) ERR_DBFILERR, 2, DB_LEN_STR(gv_cur_region), status);
				}
				disk_blk_read = TRUE;
				assert(0 <= cr->read_in_progress);
				assert(0 == cr->dirty);
				/* Only set in cache if read was success */
				cr->ondsk_blkver = (lcl_blk_free ? GDSVCURR : ondsk_blkver);
				cr->r_epid = 0;
				RELEASE_BUFF_READ_LOCK(cr);
				assert(-1 <= cr->read_in_progress);
				*cr_out = cr;
				assert(was_crit == csa->now_crit);
				if (reset_first_tp_srch_status)
				{	/* keep the parantheses for the if (although single line) since the following is a macro */
					RESET_FIRST_TP_SRCH_STATUS(first_tp_srch_status, cr, *cycle);
				}
				return (sm_uc_ptr_t)GDS_REL2ABS(cr->buffaddr);
			} else  if (!was_crit && (BAD_LUCK_ABOUNDS > ocnt))
			{
				assert(!hold_onto_crit);
				assert(TRUE == csa->now_crit);
				assert(cnl->in_crit == process_id);
				rel_crit(gv_cur_region);
			}
		}
Example #3
0
sm_uc_ptr_t t_qread(block_id blk, sm_int_ptr_t cycle, cache_rec_ptr_ptr_t cr_out)
/* cycle is used in t_end to detect if the buffer has been refreshed since the t_qread */
{
    uint4			status, duint4, blocking_pid;
    cache_rec_ptr_t		cr;
    bt_rec_ptr_t		bt;
    bool			clustered, was_crit;
    int			dummy, lcnt, ocnt;
    cw_set_element		*cse;
    off_chain		chain1;
    register sgmnt_addrs	*csa;
    register sgmnt_data_ptr_t	csd;
    int4			dummy_errno;
    boolean_t		already_built, is_mm, reset_first_tp_srch_status, set_wc_blocked;

    error_def(ERR_DBFILERR);
    error_def(ERR_BUFOWNERSTUCK);

    first_tp_srch_status = NULL;
    reset_first_tp_srch_status = FALSE;
    csa = cs_addrs;
    csd = csa->hdr;
    INCR_DB_CSH_COUNTER(csa, n_t_qreads, 1);
    is_mm = (dba_mm == csd->acc_meth);
    assert((t_tries < CDB_STAGNATE) || csa->now_crit);
    if (0 < dollar_tlevel)
    {
        assert(sgm_info_ptr);
        if (0 != sgm_info_ptr->cw_set_depth)
        {
            chain1 = *(off_chain *)&blk;
            if (1 == chain1.flag)
            {
                assert(sgm_info_ptr->cw_set_depth);
                if ((int)chain1.cw_index < sgm_info_ptr->cw_set_depth)
                    tp_get_cw(sgm_info_ptr->first_cw_set, (int)chain1.cw_index, &cse);
                else
                {
                    assert(FALSE == csa->now_crit);
                    rdfail_detail = cdb_sc_blknumerr;
                    return (sm_uc_ptr_t)NULL;
                }
            } else
            {
                first_tp_srch_status = (srch_blk_status *)lookup_hashtab_ent(sgm_info_ptr->blks_in_use,
                                       (void *)blk, &duint4);
                ASSERT_IS_WITHIN_TP_HIST_ARRAY_BOUNDS(first_tp_srch_status, sgm_info_ptr);
                cse = first_tp_srch_status ? first_tp_srch_status->ptr : NULL;
            }
            assert(!cse || !cse->high_tlevel);
            if (cse)
            {   /* transaction has modified the sought after block  */
                assert(gds_t_writemap != cse->mode);
                if (FALSE == cse->done)
                {   /* out of date, so make it current */
                    already_built = (NULL != cse->new_buff);
                    gvcst_blk_build(cse, (uchar_ptr_t)cse->new_buff, 0);
                    assert(cse->blk_target);
                    if (!already_built && !chain1.flag)
                    {
                        assert(first_tp_srch_status && (is_mm || first_tp_srch_status->cr)
                               && first_tp_srch_status->buffaddr);
                        if (first_tp_srch_status->tn <=
                                ((blk_hdr_ptr_t)(first_tp_srch_status->buffaddr))->tn)
                        {
                            assert(CDB_STAGNATE > t_tries);
                            rdfail_detail = cdb_sc_blkmod;	/* should this be something else */
                            TP_TRACE_HIST_MOD(blk, gv_target, tp_blkmod_t_qread, cs_data,
                                              first_tp_srch_status->tn,
                                              ((blk_hdr_ptr_t)(first_tp_srch_status->buffaddr))->tn,
                                              ((blk_hdr_ptr_t)(first_tp_srch_status->buffaddr))->levl);
                            return (sm_uc_ptr_t)NULL;
                        }
                        if ((!is_mm) && (first_tp_srch_status->cycle != first_tp_srch_status->cr->cycle
                                         || first_tp_srch_status->blk_num != first_tp_srch_status->cr->blk))
                        {
                            assert(CDB_STAGNATE > t_tries);
                            rdfail_detail = cdb_sc_lostcr;	/* should this be something else */
                            return (sm_uc_ptr_t)NULL;
                        }
                        if (certify_all_blocks &&
                                FALSE == cert_blk(gv_cur_region, blk, (blk_hdr_ptr_t)cse->new_buff,
                                                  cse->blk_target->root))
                            GTMASSERT;
                    }
                    cse->done = TRUE;
                }
                *cycle = CYCLE_PVT_COPY;
                *cr_out = 0;
                return (sm_uc_ptr_t)cse->new_buff;
            }
            assert(!chain1.flag);
        } else
            first_tp_srch_status =
                (srch_blk_status *)lookup_hashtab_ent(sgm_info_ptr->blks_in_use, (void *)blk, &duint4);
        ASSERT_IS_WITHIN_TP_HIST_ARRAY_BOUNDS(first_tp_srch_status, sgm_info_ptr);
        if (!is_mm && first_tp_srch_status)
        {
            assert(first_tp_srch_status->cr && !first_tp_srch_status->ptr);
            if (first_tp_srch_status->cycle == first_tp_srch_status->cr->cycle)
            {
                *cycle = first_tp_srch_status->cycle;
                *cr_out = first_tp_srch_status->cr;
                first_tp_srch_status->cr->refer = TRUE;
                if (CDB_STAGNATE <= t_tries)	/* mu_reorg doesn't use TP else should have an || for that */
                    CWS_INSERT(blk);
                return (sm_uc_ptr_t)first_tp_srch_status->buffaddr;
            } else
            {   /* Block was already part of the read-set of this transaction, but got recycled. Allow for
                 * recycling. But update the first_tp_srch_status (for this blk) in the si->first_tp_hist
                 * array to reflect the new buffer, cycle and cache-record. Since we know those only at the end of
                 * t_qread, set a variable here that will enable the updation before returning from t_qread().
                 */
                reset_first_tp_srch_status = TRUE;
            }
        }
    }
    if ((blk >= csa->ti->total_blks) || (blk < 0))
    {   /* requested block out of range; could occur because of a concurrency conflict */
        if ((&FILE_INFO(gv_cur_region)->s_addrs != csa) || (csd != cs_data))
            GTMASSERT;
        assert(FALSE == csa->now_crit);
        rdfail_detail = cdb_sc_blknumerr;
        return (sm_uc_ptr_t)NULL;
    }
    if (is_mm)
    {
        *cycle = CYCLE_SHRD_COPY;
        *cr_out = 0;
        return (sm_uc_ptr_t)(mm_read(blk));
    }
    assert(dba_bg == csd->acc_meth);
    assert(!first_tp_srch_status || !first_tp_srch_status->cr
           || first_tp_srch_status->cycle != first_tp_srch_status->cr->cycle);
    if (FALSE == (clustered = csd->clustered))
        bt = NULL;
    was_crit = csa->now_crit;
    ocnt = 0;
    set_wc_blocked = FALSE;	/* to indicate whether csd->wc_blocked was set to TRUE by us */
    do
    {
        if (NULL == (cr = db_csh_get(blk)))
        {   /* not in memory */
            if (clustered && (NULL != (bt = bt_get(blk))) && (FALSE == bt->flushing))
                bt = NULL;
            if (FALSE == csa->now_crit)
            {
                if (NULL != bt)
                {   /* at this point, bt is not NULL only if clustered and flushing - wait no crit */
                    assert(clustered);
                    wait_for_block_flush(bt, blk);	/* try for no other node currently writing the block */
                }
                if (csd->flush_trigger <= csa->nl->wcs_active_lvl  &&  FALSE == gv_cur_region->read_only)
                    JNL_ENSURE_OPEN_WCS_WTSTART(csa, gv_cur_region, 0, dummy_errno);
                /* a macro that dclast's wcs_wtstart() and checks for errors etc. */
                grab_crit(gv_cur_region);
                cr = db_csh_get(blk);			/* in case blk arrived before crit */
            }
            if (clustered && (NULL != (bt = bt_get(blk))) && (TRUE == bt->flushing))
            {   /* Once crit, need to assure that if clustered, that flushing is [still] complete
                 * If it isn't, we missed an entire WM cycle and have to wait for another node to finish */
                wait_for_block_flush(bt, blk);	/* ensure no other node currently writing the block */
            }
            if (NULL == cr)
            {   /* really not in memory - must get a new buffer */
                assert(csa->now_crit);
                cr = db_csh_getn(blk);
                if (CR_NOTVALID == (sm_long_t)cr)
                {
                    SET_TRACEABLE_VAR(cs_data->wc_blocked, TRUE);
                    BG_TRACE_PRO_ANY(csa, wc_blocked_t_qread_db_csh_getn_invalid_blk);
                    set_wc_blocked = TRUE;
                    break;
                }
                assert(0 <= cr->read_in_progress);
                *cycle = cr->cycle;
                cr->tn = csa->ti->curr_tn;
                if (FALSE == was_crit)
                    rel_crit(gv_cur_region);
                /* read outside of crit may be of a stale block but should be detected by t_end or tp_tend */
                assert(0 == cr->dirty);
                assert(cr->read_in_progress >= 0);
                INCR_DB_CSH_COUNTER(csa, n_dsk_reads, 1);
                if (SS_NORMAL != (status = dsk_read(blk, GDS_REL2ABS(cr->buffaddr))))
                {
                    RELEASE_BUFF_READ_LOCK(cr);
                    assert(was_crit == csa->now_crit);
                    if (FUTURE_READ == status)
                    {   /* in cluster, block can be in the "future" with respect to the local history */
                        assert(TRUE == clustered);
                        assert(FALSE == csa->now_crit);
                        rdfail_detail = cdb_sc_future_read;	/* t_retry forces the history up to date */
                        return (sm_uc_ptr_t)NULL;
                    }
                    rts_error(VARLSTCNT(5) ERR_DBFILERR, 2, DB_LEN_STR(gv_cur_region), status);
                }
                assert(0 <= cr->read_in_progress);
                assert(0 == cr->dirty);
                cr->r_epid = 0;
                RELEASE_BUFF_READ_LOCK(cr);
                assert(-1 <= cr->read_in_progress);
                *cr_out = cr;
                assert(was_crit == csa->now_crit);
                if (reset_first_tp_srch_status)
                {   /* keep the parantheses for the if (although single line) since the following is a macro */
                    RESET_FIRST_TP_SRCH_STATUS(first_tp_srch_status, cr, *cycle);
                }
                return (sm_uc_ptr_t)GDS_REL2ABS(cr->buffaddr);
            } else  if ((FALSE == was_crit) && (BAD_LUCK_ABOUNDS > ocnt))
            {
                assert(TRUE == csa->now_crit);
                assert(csa->nl->in_crit == process_id);
                rel_crit(gv_cur_region);
            }
        }
        if (CR_NOTVALID == (sm_long_t)cr)
        {
            SET_TRACEABLE_VAR(cs_data->wc_blocked, TRUE);
            BG_TRACE_PRO_ANY(csa, wc_blocked_t_qread_db_csh_get_invalid_blk);
            set_wc_blocked = TRUE;
            break;
        }
        for (lcnt = 1;  ; lcnt++)
        {
            if (0 > cr->read_in_progress)
            {   /* it's not being read */
                if (clustered && (0 == cr->bt_index) &&
                        (cr->tn < ((th_rec *)((uchar_ptr_t)csa->th_base + csa->th_base->tnque.fl))->tn))
                {   /* can't rely on the buffer */
                    cr->cycle++;	/* increment cycle whenever blk number changes (tp_hist depends on this) */
                    cr->blk = CR_BLKEMPTY;
                    break;
                }
                *cycle = cr->cycle;
                *cr_out = cr;
                VMS_ONLY(
                    /* If we were doing the db_csh_get() above (in t_qread itself) and located the cache-record
                     * which, before coming here and taking a copy of cr->cycle a few lines above, was made an
                     * older twin by another process in bg_update (note this can happen in VMS only) which has
                     * already incremented the cycle, we will end up having a copy of the old cache-record with
                     * its incremented cycle number and hence will succeed in tp_hist validation if we return
                     * this <cr,cycle> combination although we don't want to since this "cr" is not current for
                     * the given block as of now. Note that the "indexmod" optimization in tp_tend() relies on
                     * an accurate intermediate validation by tp_hist() which in turn relies on the <cr,cycle>
                     * value returned by t_qread() to be accurate for a given blk at the current point in time.
                     * We detect the older-twin case by the following check. Note that here we depend on the
                     * the fact that bg_update() sets cr->bt_index to 0 before incrementing cr->cycle.
                     * Given that order, cr->bt_index can be guaranteed to be 0 if we read the incremented cycle
                     */
                    if (cr->twin && (0 == cr->bt_index))
                    break;
                )
                    if (cr->blk != blk)
                        break;
                if (was_crit != csa->now_crit)
                    rel_crit(gv_cur_region);
                assert(was_crit == csa->now_crit);
                if (reset_first_tp_srch_status)
                {   /* keep the parantheses for the if (although single line) since the following is a macro */
                    RESET_FIRST_TP_SRCH_STATUS(first_tp_srch_status, cr, *cycle);
                }
                /* Note that at this point we expect t_qread() to return a <cr,cycle> combination that
                 * corresponds to "blk" passed in. It is crucial to get an accurate value for both the fields
                 * since tp_hist() relies on this for its intermediate validation.
                 */
                return (sm_uc_ptr_t)GDS_ANY_REL2ABS(csa, cr->buffaddr);
            }
            if (blk != cr->blk)
                break;
            if (lcnt >= BUF_OWNER_STUCK && (0 == (lcnt % BUF_OWNER_STUCK)))
            {
                if (FALSE == csa->now_crit)
                    grab_crit(gv_cur_region);
                if (cr->read_in_progress < -1)
                {   /* outside of design; clear to known state */
                    BG_TRACE_PRO(t_qread_out_of_design);
                    INTERLOCK_INIT(cr);
                    assert(0 == cr->r_epid);
                    cr->r_epid = 0;
                } else  if (cr->read_in_progress >= 0)
                {
                    BG_TRACE_PRO(t_qread_buf_owner_stuck);
                    if (0 != (blocking_pid = cr->r_epid))
                    {
                        if (FALSE == is_proc_alive(blocking_pid, cr->image_count))
                        {   /* process gone: release that process's lock */
                            assert(0 == cr->bt_index);
                            if (cr->bt_index)
                            {
                                SET_TRACEABLE_VAR(csd->wc_blocked, TRUE);
                                BG_TRACE_PRO_ANY(csa, wc_blocked_t_qread_bad_bt_index1);
                                set_wc_blocked = TRUE;
                                break;
                            }
                            cr->cycle++;	/* increment cycle for blk number changes (for tp_hist) */
                            cr->blk = CR_BLKEMPTY;
                            RELEASE_BUFF_READ_LOCK(cr);
                        } else
                        {
                            rel_crit(gv_cur_region);
                            send_msg(VARLSTCNT(4) ERR_DBFILERR, 2, DB_LEN_STR(gv_cur_region));
                            send_msg(VARLSTCNT(9) ERR_BUFOWNERSTUCK, 7, process_id, blocking_pid,
                                     cr->blk, cr->blk, (lcnt / BUF_OWNER_STUCK),
                                     cr->read_in_progress, cr->rip_latch.latch_pid);
                            if ((4 * BUF_OWNER_STUCK) <= lcnt)
                                GTMASSERT;
                            /* Kickstart the process taking a long time in case it was suspended */
                            UNIX_ONLY(continue_proc(blocking_pid));
                        }
                    } else
                    {   /* process stopped before could set r_epid */
                        assert(0 == cr->bt_index);
                        if (cr->bt_index)
                        {
                            SET_TRACEABLE_VAR(csd->wc_blocked, TRUE);
                            BG_TRACE_PRO_ANY(csa, wc_blocked_t_qread_bad_bt_index2);
                            set_wc_blocked = TRUE;
                            break;
                        }
                        cr->cycle++;	/* increment cycle for blk number changes (for tp_hist) */
                        cr->blk = CR_BLKEMPTY;
                        RELEASE_BUFF_READ_LOCK(cr);
                        if (cr->read_in_progress < -1)	/* race: process released since if r_epid */
                            LOCK_BUFF_FOR_READ(cr, dummy);
                    }
                }
                if (was_crit != csa->now_crit)
                    rel_crit(gv_cur_region);
            } else
                wcs_sleep(lcnt);
        }
        if (set_wc_blocked)	/* cannot use csd->wc_blocked here as we might not necessarily have crit */
            break;
        ocnt++;
        if (BAD_LUCK_ABOUNDS <= ocnt)
        {
            if (BAD_LUCK_ABOUNDS < ocnt || csa->now_crit)
            {
                rel_crit(gv_cur_region);
                GTMASSERT;
            }
            if (FALSE == csa->now_crit)
                grab_crit(gv_cur_region);
        }
    } while (TRUE);
Example #4
0
enum cdb_sc 	gvcst_search(gv_key *pKey,		/* Key to search for */
			     srch_hist *pHist)		/* History to fill in*/
{
	unsigned char		nLevl;
	enum cdb_sc		status;
	register int		n1;
	register uchar_ptr_t	c1, c2;
	register sm_uc_ptr_t	pRec, pBlkBase;
	register gv_namehead	*pTarg;	/* Local copy of gv_target;  hope it gets put into register */
	register srch_blk_status *pCurr;
	register srch_blk_status *pNonStar;
	register srch_hist	*pTargHist;
	int			tmp_cmpc;
	block_id		nBlkId;
	cache_rec_ptr_t		cr;
	int			cycle;
	unsigned short		n0, nKeyLen;
	trans_num		tn;
	cw_set_element		*cse;
	off_chain		chain1, chain2;
	srch_blk_status		*tp_srch_status, *srch_status, *leaf_blk_hist;
	boolean_t		already_built, is_mm;
	ht_ent_int4		*tabent;
	sm_uc_ptr_t		buffaddr;
	trans_num		blkhdrtn, oldest_hist_tn;
	int			hist_size;
#	ifdef DEBUG
	boolean_t		save_donot_commit;
#	endif
	DCL_THREADGBL_ACCESS;

	SETUP_THREADGBL_ACCESS;
	pTarg = gv_target;
	assert(NULL != pTarg);
	assert(pTarg->root);
	assert(pKey != &pTarg->clue);
	nKeyLen = pKey->end + 1;

	assert(!dollar_tlevel || ((NULL != sgm_info_ptr) && (cs_addrs->sgm_info_ptr == sgm_info_ptr)));
	SET_GVCST_SEARCH_CLUE(0);
	INCR_DB_CSH_COUNTER(cs_addrs, n_gvcst_srches, 1);
	pTargHist = ((NULL == pHist) ? &pTarg->hist : pHist);
	/* If FINAL RETRY and TP then we can safely use clues of gv_targets that have been referenced in this
	 * TP transaction (read_local_tn == local_tn). While that is guaranteed to be true for all updates, it
	 * does not hold good for READs since we allow a lot more reads to be done inside a transaction compared
	 * to the # of updates allowed. We allow the same global to be read multiple times inside the same transaction
	 * using different global buffers for each read. This means that we need to validate any clues from the first
	 * read before using it for the second read even if it is in the final retry. This validation is done inside
	 * the below IF block. As for gv_targets which are referenced for the very first time in this TP transaction,
	 * we have no easy way of determining if their clues are still uptodate (i.e. using the clue will guarantee us
	 * no restart) and since we are in the final retry, we dont want to take a risk. So dont use the clue in that case.
	 *
	 * If FINAL RETRY and Non-TP, we will be dealing with only ONE gv_target so its clue would have been reset as
	 * part of the penultimate restart so we dont have any of the above issue in the non-tp case. The only exception
	 * is if we are in gvcst_kill in which case, gvcst_search will be called twice and the clue could be non-zero
	 * for the second invocation. In this case, the clue is guaranteed to be uptodate since it was set just now
	 * as part of the first invocation. So no need to do anything about clue in final retry for Non-TP.
	 */
	if ((0 != pTarg->clue.end) && ((CDB_STAGNATE > t_tries) || !dollar_tlevel || (pTarg->read_local_tn == local_tn)))
	{	/* Have non-zero clue. Check if it is usable for the current search key. If so validate clue then and use it. */
		/* In t_end, we skipped validating the clue in case of reorg due to the assumption that reorg never uses the clue
		 * i.e. it nullifies the clue before calling gvcst_search. However, it doesn't reset the clue for directory tree
		 * and so continue using the clue if called for root search. Assert accordingly.
		 */
		assert(!mu_reorg_process UNIX_ONLY(|| (pTarg->gd_csa->dir_tree == pTarg)));
		INCR_DB_CSH_COUNTER(cs_addrs, n_gvcst_srch_clues, 1);
		status = cdb_sc_normal;	/* clue is usable unless proved otherwise */
		DEBUG_ONLY(save_donot_commit = TREF(donot_commit);)
		if (NULL != pHist)
Example #5
0
sm_uc_ptr_t t_qread(block_id blk, sm_int_ptr_t cycle, cache_rec_ptr_ptr_t cr_out)
	/* cycle is used in t_end to detect if the buffer has been refreshed since the t_qread */
{
	int4			status;
	uint4			blocking_pid;
	cache_rec_ptr_t		cr;
	bt_rec_ptr_t		bt;
	boolean_t		clustered, hold_onto_crit, was_crit, issued_db_init_crypt_warning, sync_needed;
	int			dummy, lcnt, ocnt;
	cw_set_element		*cse;
	off_chain		chain1;
	register sgmnt_addrs	*csa;
	register sgmnt_data_ptr_t	csd;
	enum db_ver		ondsk_blkver;
	int4			dummy_errno, gtmcrypt_errno;
	boolean_t		already_built, is_mm, reset_first_tp_srch_status, set_wc_blocked, sleep_invoked;
	ht_ent_int4		*tabent;
	srch_blk_status		*blkhist;
	trans_num		dirty, blkhdrtn;
	sm_uc_ptr_t		buffaddr;
	uint4			stuck_cnt = 0;
	boolean_t		lcl_blk_free;
	node_local_ptr_t	cnl;
	gd_segment		*seg;
	uint4			buffs_per_flush, flush_target;
	enc_info_t		*encr_ptr;
	DCL_THREADGBL_ACCESS;

	SETUP_THREADGBL_ACCESS;
	lcl_blk_free = block_is_free;
	block_is_free = FALSE;	/* Reset to FALSE so that if t_qread fails below, we don't have an incorrect state of this var */
	first_tp_srch_status = NULL;
	reset_first_tp_srch_status = FALSE;
	csa = cs_addrs;
	csd = csa->hdr;
	INCR_DB_CSH_COUNTER(csa, n_t_qreads, 1);
	is_mm = (dba_mm == csd->acc_meth);
	/* We better hold crit in the final retry (TP & non-TP). Only exception is journal recovery */
	assert((t_tries < CDB_STAGNATE) || csa->now_crit || mupip_jnl_recover);
	if (dollar_tlevel)
	{
		assert(sgm_info_ptr);
		if (0 != sgm_info_ptr->cw_set_depth)
		{
			chain1 = *(off_chain *)&blk;
			if (1 == chain1.flag)
			{
				assert(sgm_info_ptr->cw_set_depth);
				if ((int)chain1.cw_index < sgm_info_ptr->cw_set_depth)
					tp_get_cw(sgm_info_ptr->first_cw_set, (int)chain1.cw_index, &cse);
				else
				{
					assert(FALSE == csa->now_crit);
					rdfail_detail = cdb_sc_blknumerr;
					return (sm_uc_ptr_t)NULL;
				}
			} else
			{
				if (NULL != (tabent = lookup_hashtab_int4(sgm_info_ptr->blks_in_use, (uint4 *)&blk)))
					first_tp_srch_status = tabent->value;
				else
					first_tp_srch_status = NULL;
				ASSERT_IS_WITHIN_TP_HIST_ARRAY_BOUNDS(first_tp_srch_status, sgm_info_ptr);
				cse = first_tp_srch_status ? first_tp_srch_status->cse : NULL;
			}
			assert(!cse || !cse->high_tlevel);
			assert(!chain1.flag || cse);
			if (cse)
			{	/* transaction has modified the sought after block  */
				if ((gds_t_committed != cse->mode) || (n_gds_t_op < cse->old_mode))
				{	/* Changes have not been committed to shared memory, i.e. still in private memory.
					 * Build block in private buffer if not already done and return the same.
					 */
					assert(gds_t_writemap != cse->mode);
					if (FALSE == cse->done)
					{	/* out of date, so make it current */
						assert(gds_t_committed != cse->mode);
						already_built = (NULL != cse->new_buff);
						/* Validate the block's search history right after building a private copy.
						 * This is not needed in case gvcst_search is going to reuse the clue's search
						 * history and return (because tp_hist will do the validation of this block).
						 * But if gvcst_search decides to do a fresh traversal (because the clue does not
						 * cover the path of the current input key etc.) the block build that happened now
						 * will not get validated in tp_hist since it will instead be given the current
						 * key's search history path (a totally new path) for validation. Since a private
						 * copy of the block has been built, tp_tend would also skip validating this block
						 * so it is necessary that we validate the block right here. Since it is tricky to
						 * accurately differentiate between the two cases, we do the validation
						 * unconditionally here (besides it is only a few if checks done per block build
						 * so it is considered okay performance-wise).
						 */
						gvcst_blk_build(cse, (uchar_ptr_t)cse->new_buff, 0);
						assert(NULL != cse->blk_target);
						if (!already_built && !chain1.flag)
						{
							buffaddr = first_tp_srch_status->buffaddr;
							cr = first_tp_srch_status->cr;
							assert((is_mm || cr) && buffaddr);
							blkhdrtn = ((blk_hdr_ptr_t)buffaddr)->tn;
							if (TP_IS_CDB_SC_BLKMOD3(cr, first_tp_srch_status, blkhdrtn))
							{
								assert(CDB_STAGNATE > t_tries);
								rdfail_detail = cdb_sc_blkmod;	/* should this be something else */
								TP_TRACE_HIST_MOD(blk, gv_target, tp_blkmod_t_qread, cs_data,
									first_tp_srch_status->tn, blkhdrtn,
									((blk_hdr_ptr_t)buffaddr)->levl);
								return (sm_uc_ptr_t)NULL;
							}
							if (!is_mm && ((first_tp_srch_status->cycle != cr->cycle)
										|| (first_tp_srch_status->blk_num != cr->blk)))
							{
								assert(CDB_STAGNATE > t_tries);
								rdfail_detail = cdb_sc_lostcr; /* should this be something else */
								return (sm_uc_ptr_t)NULL;
							}
						}
						cse->done = TRUE;
					}
					*cycle = CYCLE_PVT_COPY;
					*cr_out = 0;
					return (sm_uc_ptr_t)cse->new_buff;
				} else
				{	/* Block changes are already committed to shared memory (possible if we are in TP
					 * in the 2nd phase of M-Kill in gvcst_expand_free_subtree.c). In this case, read
					 * block from shared memory; do not look at private memory (i.e. cse) as that might
					 * not be as uptodate as shared memory.
					 */
					assert(csa->now_crit);	/* gvcst_expand_free_subtree does t_qread in crit */
					/* If this block was newly created as part of the TP transaction, it should not be killed
					 * as part of the 2nd phase of M-kill. This is because otherwise the block's cse would
					 * have had an old_mode of kill_t_create in which case we would not have come into this
					 * else block. Assert accordingly.
					 */
					assert(!chain1.flag);
					first_tp_srch_status = NULL;	/* do not use any previous srch_hist information */
				}
			}
		} else
		{
			if (NULL != (tabent = lookup_hashtab_int4(sgm_info_ptr->blks_in_use, (uint4 *)&blk)))
				first_tp_srch_status = tabent->value;
			else
				first_tp_srch_status = NULL;
		}
		ASSERT_IS_WITHIN_TP_HIST_ARRAY_BOUNDS(first_tp_srch_status, sgm_info_ptr);
		if (!is_mm && first_tp_srch_status)
		{
			cr = first_tp_srch_status->cr;
			assert(cr && !first_tp_srch_status->cse);
			if (first_tp_srch_status->cycle == cr->cycle)
			{
				*cycle = first_tp_srch_status->cycle;
				*cr_out = cr;
				cr->refer = TRUE;
				if (CDB_STAGNATE <= t_tries)	/* mu_reorg doesn't use TP else should have an || for that */
					CWS_INSERT(blk);
				return (sm_uc_ptr_t)first_tp_srch_status->buffaddr;
			} else
			{	/* Block was already part of the read-set of this transaction, but got recycled in the cache.
				 * Allow block recycling by resetting first_tp_srch_status for this blk to reflect the new
				 * buffer, cycle and cache-record. tp_hist (invoked much later) has validation checks to detect
				 * if block recycling happened within the same mini-action and restart in that case.
				 * Updating first_tp_srch_status has to wait until the end of t_qread since only then do we know
				 * the values to update to. Set a variable that will enable the updation before returning.
				 * Also assert that if we are in the final retry, we are never in a situation where we have a
				 * block that got recycled since the start of the current mini-action. This is easily detected since
				 * as part of the final retry we maintain a hash-table "cw_stagnate" that holds the blocks that
				 * have been read as part of the current mini-action until now.
				 */
				assert(CDB_STAGNATE > t_tries || (NULL == lookup_hashtab_int4(&cw_stagnate, (uint4 *)&blk)));
				reset_first_tp_srch_status = TRUE;
			}
		}
	}
	if ((uint4)blk >= (uint4)csa->ti->total_blks)
	{	/* Requested block out of range; could occur because of a concurrency conflict. mm_read and dsk_read assume blk is
		 * never negative or greater than the maximum possible file size. If a concurrent REORG truncates the file, t_qread
		 * can proceed despite blk being greater than total_blks. But dsk_read handles this fine; see comments below.
		 */
		assert((&FILE_INFO(gv_cur_region)->s_addrs == csa) && (csd == cs_data));
		assert(!csa->now_crit);
		rdfail_detail = cdb_sc_blknumerr;
		return (sm_uc_ptr_t)NULL;
	}
	if (is_mm)
	{
		*cycle = CYCLE_SHRD_COPY;
		*cr_out = 0;
		return (sm_uc_ptr_t)(mm_read(blk));
	}
	was_crit = csa->now_crit;
	cnl = csa->nl;
	encr_ptr = csa->encr_ptr;
	if (NULL != encr_ptr)
	{
		/* If this is an encrypted database and we hold crit, make sure our private cycle matches the shared cycle.
		 * Or else we would need to call "process_reorg_encrypt_restart" below (a heavyweight operation) holding crit.
		 */
		assert(!was_crit || (cnl->reorg_encrypt_cycle == encr_ptr->reorg_encrypt_cycle));
		seg = gv_cur_region->dyn.addr;
		issued_db_init_crypt_warning = encr_ptr->issued_db_init_crypt_warning;
		if (!IS_BITMAP_BLK(blk) && issued_db_init_crypt_warning)
		{	/* A non-GT.M process is attempting to read a non-bitmap block, yet it has previously encountered an error
			 * during db_init (because it did not have access to the encryption keys) and reported it with a -W-
			 * severity. Since the block it is attempting to read can be in the unencrypted shared memory (read from
			 * disk by another process with access to the encryption keys), we cannot let it access it without a valid
			 * handle, so issue an rts_error.
			 *
			 * TODO: DSE and LKE could bypass getting the ftok semaphore. LKE is not an issue, but DSE does care about
			 *       the csa->reorg_encrypt_cycle. So it means DSE could get an inconsistent copy of reorg_encrypt_cycle
			 *       and associated hashes if it had done a bypass and a concurrent REORG -ENCRYPT is holding the ftok
			 *       semaphore and changing these values at the same time.
			 */
			assert(!IS_GTM_IMAGE);	/* GT.M would have error'ed out in db_init */
			gtmcrypt_errno = SET_REPEAT_MSG_MASK(SET_CRYPTERR_MASK(ERR_CRYPTBADCONFIG));
			GTMCRYPT_REPORT_ERROR(gtmcrypt_errno, rts_error, seg->fname_len, seg->fname);
		} else if (cnl->reorg_encrypt_cycle != encr_ptr->reorg_encrypt_cycle)
		{	/* A concurrent MUPIP REORG ENCRYPT occurred. Cannot proceed with the read even if the block is
			 * already loaded from disk into the unencrypted global buffers (security issue). Need to load the
			 * new encryption keys and only let those processes which are able to successfully do this proceed
			 * with the read. First, copy the key hashes from csd into csa->encr_ptr. That needs crit
			 * to ensure a concurrent MUPIP REORG ENCRYPT does not sneak in.
			 *
			 * Note: Even though we asserted a few lines above that if "was_crit" is TRUE, then we expect
			 * the encryption cycles to be in sync, we handle this out-of-design situation in "pro" by fixing
			 * the cycles while holding crit (hopefully rare case so it is okay to hold crit for a heavyweight call).
			 */
			if (!was_crit)
				grab_crit(gv_cur_region);
			/* Now that we have crit, sync them up by copying the new keys inside crit and opening the key handles
			 * outside crit (a potentially long running operation).
			 */
			SIGNAL_REORG_ENCRYPT_RESTART(mu_reorg_encrypt_in_prog, reorg_encrypt_restart_csa,
					cnl, csa, csd, rdfail_detail, process_id);
			assert(csa == reorg_encrypt_restart_csa);
			if (!was_crit)
				rel_crit(gv_cur_region);
			/* If we are inside a TP read-write transaction, it is possible we already used the old keys for
			 * prior calls to "jnl_format" so we have to restart (cannot sync up cycles). Do the same for
			 * TP read-only transaction as well as NON-TP read-write transaction. In all these cases we know
			 * the caller is capable of restarting. All other cases we dont know if the caller is capable so
			 * sync up the cycles and proceed using the new keys for the read.
			 *
			 * But since it is possible the caller does not call t_retry right away (e.g. mupip reorg which can
			 * choose to abandone this tree path and move on to another block without aborting this transaction)
			 * it is better we finish the pending call to "process_reorg_encrypt_restart" right here before returning.
			 */
			process_reorg_encrypt_restart();
			assert(NULL == reorg_encrypt_restart_csa);
			if (IS_NOT_SAFE_TO_SYNC_NEW_KEYS(dollar_tlevel, update_trans))
			{
				assert(cdb_sc_reorg_encrypt == rdfail_detail);	/* set by SIGNAL_REORG_ENCRYPT_RESTART macro */
				return (sm_uc_ptr_t)NULL;
			}
		}
	}
	assert(dba_bg == csd->acc_meth);
	assert(!first_tp_srch_status || !first_tp_srch_status->cr
					|| first_tp_srch_status->cycle != first_tp_srch_status->cr->cycle);
	if (FALSE == (clustered = csd->clustered))
		bt = NULL;
	ocnt = 0;
	set_wc_blocked = FALSE;	/* to indicate whether cnl->wc_blocked was set to TRUE by us */
	hold_onto_crit = csa->hold_onto_crit;	/* note down in local to avoid csa-> dereference in multiple usages below */
	do
	{
		if (NULL == (cr = db_csh_get(blk)))
		{	/* not in memory */
			if (clustered && (NULL != (bt = bt_get(blk))) && (FALSE == bt->flushing))
				bt = NULL;
			if (!csa->now_crit)
			{
				assert(!hold_onto_crit);
				if (NULL != bt)
				{	/* at this point, bt is not NULL only if clustered and flushing - wait no crit */
					assert(clustered);
					wait_for_block_flush(bt, blk);	/* try for no other node currently writing the block */
				}
				/* assume defaults for flush_target and buffs_per_flush */
				flush_target = csd->flush_trigger;
				buffs_per_flush = 0;
				if ((0 != csd->epoch_taper) && (FALSE == gv_cur_region->read_only) && JNL_ENABLED(csd) &&
						(0 != cnl->wcs_active_lvl) && (NOJNL != csa->jnl->channel) &&
						(0 != cnl->jnl_file.u.inode) && csd->jnl_before_image)
				{
					EPOCH_TAPER_IF_NEEDED(csa, csd, cnl, (gd_region *) 0, FALSE, buffs_per_flush, flush_target);
				}
				if ((flush_target <= cnl->wcs_active_lvl) && (FALSE == gv_cur_region->read_only))
					JNL_ENSURE_OPEN_WCS_WTSTART(csa, gv_cur_region, buffs_per_flush, dummy_errno);
						/* a macro that dclast's "wcs_wtstart" and checks for errors etc. */
				/* Get crit but also ensure encryption cycles are in sync ("dsk_read" relies on this).
				 * Note: "sync_needed" should be TRUE very rarely since we synced the cycles just a few lines
				 * above. But in case a MUPIP REORG ENCRYPT concurrently sneaked in between these lines we
				 * need to resync.
				 */
				sync_needed = grab_crit_encr_cycle_sync(gv_cur_region);
				assert(NULL == reorg_encrypt_restart_csa);
				assert(!sync_needed || (NULL != encr_ptr));
				if (sync_needed && IS_NOT_SAFE_TO_SYNC_NEW_KEYS(dollar_tlevel, update_trans))
				{
					assert(cnl->reorg_encrypt_cycle == encr_ptr->reorg_encrypt_cycle);
					rel_crit(gv_cur_region);
					rdfail_detail = cdb_sc_reorg_encrypt;	/* set by SIGNAL_REORG_ENCRYPT_RESTART macro */
					return (sm_uc_ptr_t)NULL;
				}
				cr = db_csh_get(blk);			/* in case blk arrived before crit */
			}
			if (clustered && (NULL != (bt = bt_get(blk))) && (TRUE == bt->flushing))
			{	/* Once crit, need to assure that if clustered, that flushing is [still] complete
				 * If it isn't, we missed an entire WM cycle and have to wait for another node to finish */
				wait_for_block_flush(bt, blk);	/* ensure no other node currently writing the block */
			}
			if (NULL == cr)
			{	/* really not in memory - must get a new buffer */
				assert(csa->now_crit);
				cr = db_csh_getn(blk);
				if (CR_NOTVALID == (sm_long_t)cr)
				{
					assert(cnl->wc_blocked); /* only reason we currently know wcs_get_space could fail */
					assert(gtm_white_box_test_case_enabled);
					SET_TRACEABLE_VAR(cnl->wc_blocked, TRUE);
					BG_TRACE_PRO_ANY(csa, wc_blocked_t_qread_db_csh_getn_invalid_blk);
					set_wc_blocked = TRUE;
					break;
				}
				assert(0 <= cr->read_in_progress);
				*cycle = cr->cycle;
				cr->tn = csd->trans_hist.curr_tn;
				/* Record history of most recent disk reads only in dbg builds for now. Although the macro
				 * is just a couple dozen instructions, it is done while holding crit so we want to avoid
				 * delaying crit unless really necessary. Whoever wants this information can enable it
				 * by a build change to remove the DEBUG_ONLY part below.
				 */
				DEBUG_ONLY(DSKREAD_TRACE(csa, GDS_ANY_ABS2REL(csa,cr), cr->tn, process_id, blk, cr->cycle);)
				if (!was_crit && !hold_onto_crit)
					rel_crit(gv_cur_region);
				/* read outside of crit may be of a stale block but should be detected by t_end or tp_tend */
				assert(0 == cr->dirty);
				assert(cr->read_in_progress >= 0);
				CR_BUFFER_CHECK(gv_cur_region, csa, csd, cr);
				buffaddr = (sm_uc_ptr_t)GDS_REL2ABS(cr->buffaddr);
#				ifdef DEBUG
				/* stop self to test sechshr_db_clnup clears the read state */
				if (gtm_white_box_test_case_enabled
					&& (WBTEST_SIGTSTP_IN_T_QREAD == gtm_white_box_test_case_number))
				{	/* this should never fail, but because of the way we developed the test we got paranoid */
					dummy = kill(process_id, SIGTERM);
					assert(0 == dummy);
					for (dummy = 10; dummy; dummy--)
						LONG_SLEEP(10); /* time for sigterm to take hit before we clear block_now_locked */
				}
#				endif
				if (SS_NORMAL != (status = dsk_read(blk, buffaddr, &ondsk_blkver, lcl_blk_free)))
				{
					/* buffer does not contain valid data, so reset blk to be empty */
					cr->cycle++;	/* increment cycle for blk number changes (for tp_hist and others) */
					cr->blk = CR_BLKEMPTY;
					cr->r_epid = 0;
					RELEASE_BUFF_READ_LOCK(cr);
					TREF(block_now_locked) = NULL;
					assert(-1 <= cr->read_in_progress);
					assert(was_crit == csa->now_crit);
					if (ERR_DYNUPGRDFAIL == status)
					{	/* if we dont hold crit on the region, it is possible due to concurrency conflicts
						 * that this block is unused (i.e. marked free/recycled in bitmap, see comments in
						 * gds_blk_upgrade.h). in this case we should not error out but instead restart.
						 */
						if (was_crit)
						{
							assert(FALSE);
							rts_error_csa(CSA_ARG(csa) VARLSTCNT(5) status, 3, blk,
									DB_LEN_STR(gv_cur_region));
						} else
						{
							rdfail_detail = cdb_sc_lostcr;
							return (sm_uc_ptr_t)NULL;
						}
					}
					if ((-1 == status) && !was_crit)
					{	/* LSEEKREAD and, consequently, dsk_read return -1 in case pread is unable to fetch
						 * a full database block's length of data. This can happen if the requested read is
						 * past the end of the file, which can happen if a concurrent truncate occurred
						 * after the blk >= csa->ti->total_blks comparison above. Allow for this scenario
						 * by restarting. However, if we've had crit the whole time, no truncate could have
						 * happened. -1 indicates a problem with the file, so fall through to DBFILERR.
						 */
						rdfail_detail = cdb_sc_truncate;
						return (sm_uc_ptr_t)NULL;
					} else if (IS_CRYPTERR_MASK(status))
					{
						seg = gv_cur_region->dyn.addr;
						GTMCRYPT_REPORT_ERROR(status, rts_error, seg->fname_len, seg->fname);
					}
					else
					{	/* A DBFILERR can be thrown for two possible reasons:
						 * (1) LSEEKREAD returned an unexpected error due to a filesystem problem; or
						 * (2) csa/cs_addrs/csd/cs_data are out of sync, and we're trying to read a block
						 * number for one region from another region with fewer total_blks.
						 *    We suspect the former is what happened in GTM-7623. Apparently the latter
						 * has been an issue before, too. If either occurs again in pro, this assertpro
						 * distinguishes the two possibilities.
						 */
						assertpro((&FILE_INFO(gv_cur_region)->s_addrs == csa) && (csd == cs_data));
						rts_error_csa(CSA_ARG(csa) VARLSTCNT(5) ERR_DBFILERR, 2, DB_LEN_STR(gv_cur_region),
								status);
					}
				}
				disk_blk_read = TRUE;
				assert(0 <= cr->read_in_progress);
				assert(0 == cr->dirty);
				/* Only set in cache if read was success */
				cr->ondsk_blkver = (lcl_blk_free ? GDSVCURR : ondsk_blkver);
				cr->r_epid = 0;
				RELEASE_BUFF_READ_LOCK(cr);
				TREF(block_now_locked) = NULL;
				assert(-1 <= cr->read_in_progress);
				*cr_out = cr;
				assert(was_crit == csa->now_crit);
				if (reset_first_tp_srch_status)
					RESET_FIRST_TP_SRCH_STATUS(first_tp_srch_status, cr, *cycle);
				return buffaddr;
			} else  if (!was_crit && (BAD_LUCK_ABOUNDS > ocnt))
			{
				assert(!hold_onto_crit);
				assert(TRUE == csa->now_crit);
				assert(cnl->in_crit == process_id);
				rel_crit(gv_cur_region);
			}
		}