Exemplo n.º 1
0
sm_uc_ptr_t t_qread(block_id blk, sm_int_ptr_t cycle, cache_rec_ptr_ptr_t cr_out)
	/* cycle is used in t_end to detect if the buffer has been refreshed since the t_qread */
{
	int4			status;
	uint4			blocking_pid;
	cache_rec_ptr_t		cr;
	bt_rec_ptr_t		bt;
	boolean_t		clustered, hold_onto_crit, was_crit;
	int			dummy, lcnt, ocnt;
	cw_set_element		*cse;
	off_chain		chain1;
	register sgmnt_addrs	*csa;
	register sgmnt_data_ptr_t	csd;
	enum db_ver		ondsk_blkver;
	int4			dummy_errno;
	boolean_t		already_built, is_mm, reset_first_tp_srch_status, set_wc_blocked, sleep_invoked;
	ht_ent_int4		*tabent;
	srch_blk_status		*blkhist;
	trans_num		dirty, blkhdrtn;
	sm_uc_ptr_t		buffaddr;
	uint4			stuck_cnt = 0;
	boolean_t		lcl_blk_free;
	node_local_ptr_t	cnl;

	lcl_blk_free = block_is_free;
	block_is_free = FALSE;	/* Reset to FALSE so that if t_qread fails below, we don't have an incorrect state of this var */
	first_tp_srch_status = NULL;
	reset_first_tp_srch_status = FALSE;
	csa = cs_addrs;
	csd = csa->hdr;
	INCR_DB_CSH_COUNTER(csa, n_t_qreads, 1);
	is_mm = (dba_mm == csd->acc_meth);
	/* We better hold crit in the final retry (TP & non-TP). Only exception is journal recovery */
	assert((t_tries < CDB_STAGNATE) || csa->now_crit || mupip_jnl_recover);
	if (dollar_tlevel)
	{
		assert(sgm_info_ptr);
		if (0 != sgm_info_ptr->cw_set_depth)
		{
			chain1 = *(off_chain *)&blk;
			if (1 == chain1.flag)
			{
				assert(sgm_info_ptr->cw_set_depth);
				if ((int)chain1.cw_index < sgm_info_ptr->cw_set_depth)
					tp_get_cw(sgm_info_ptr->first_cw_set, (int)chain1.cw_index, &cse);
				else
				{
					assert(FALSE == csa->now_crit);
					rdfail_detail = cdb_sc_blknumerr;
					return (sm_uc_ptr_t)NULL;
				}
			} else
			{
				if (NULL != (tabent = lookup_hashtab_int4(sgm_info_ptr->blks_in_use, (uint4 *)&blk)))
					first_tp_srch_status = tabent->value;
				else
					first_tp_srch_status = NULL;
				ASSERT_IS_WITHIN_TP_HIST_ARRAY_BOUNDS(first_tp_srch_status, sgm_info_ptr);
				cse = first_tp_srch_status ? first_tp_srch_status->cse : NULL;
			}
			assert(!cse || !cse->high_tlevel);
			assert(!chain1.flag || cse);
			if (cse)
			{	/* transaction has modified the sought after block  */
				if ((gds_t_committed != cse->mode) || (n_gds_t_op < cse->old_mode))
				{	/* Changes have not been committed to shared memory, i.e. still in private memory.
					 * Build block in private buffer if not already done and return the same.
					 */
					assert(gds_t_writemap != cse->mode);
					if (FALSE == cse->done)
					{	/* out of date, so make it current */
						assert(gds_t_committed != cse->mode);
						already_built = (NULL != cse->new_buff);
						/* Validate the block's search history right after building a private copy.
						 * This is not needed in case gvcst_search is going to reuse the clue's search
						 * history and return (because tp_hist will do the validation of this block).
						 * But if gvcst_search decides to do a fresh traversal (because the clue does not
						 * cover the path of the current input key etc.) the block build that happened now
						 * will not get validated in tp_hist since it will instead be given the current
						 * key's search history path (a totally new path) for validation. Since a private
						 * copy of the block has been built, tp_tend would also skip validating this block
						 * so it is necessary that we validate the block right here. Since it is tricky to
						 * accurately differentiate between the two cases, we do the validation
						 * unconditionally here (besides it is only a few if checks done per block build
						 * so it is considered okay performance-wise).
						 */
						gvcst_blk_build(cse, (uchar_ptr_t)cse->new_buff, 0);
						assert(NULL != cse->blk_target);
						if (!already_built && !chain1.flag)
						{
							buffaddr = first_tp_srch_status->buffaddr;
							cr = first_tp_srch_status->cr;
							assert((is_mm || cr) && buffaddr);
							blkhdrtn = ((blk_hdr_ptr_t)buffaddr)->tn;
							if (TP_IS_CDB_SC_BLKMOD3(cr, first_tp_srch_status, blkhdrtn))
							{
								assert(CDB_STAGNATE > t_tries);
								rdfail_detail = cdb_sc_blkmod;	/* should this be something else */
								TP_TRACE_HIST_MOD(blk, gv_target, tp_blkmod_t_qread, cs_data,
									first_tp_srch_status->tn, blkhdrtn,
									((blk_hdr_ptr_t)buffaddr)->levl);
								return (sm_uc_ptr_t)NULL;
							}
							if (!is_mm && ((first_tp_srch_status->cycle != cr->cycle)
										|| (first_tp_srch_status->blk_num != cr->blk)))
							{
								assert(CDB_STAGNATE > t_tries);
								rdfail_detail = cdb_sc_lostcr; /* should this be something else */
								return (sm_uc_ptr_t)NULL;
							}
						}
						cse->done = TRUE;
					}
					*cycle = CYCLE_PVT_COPY;
					*cr_out = 0;
					return (sm_uc_ptr_t)cse->new_buff;
				} else
				{	/* Block changes are already committed to shared memory (possible if we are in TP
					 * in the 2nd phase of M-Kill in gvcst_expand_free_subtree.c). In this case, read
					 * block from shared memory; do not look at private memory (i.e. cse) as that might
					 * not be as uptodate as shared memory.
					 */
					assert(csa->now_crit);	/* gvcst_expand_free_subtree does t_qread in crit */
					/* If this block was newly created as part of the TP transaction, it should not be killed
					 * as part of the 2nd phase of M-kill. This is because otherwise the block's cse would
					 * have had an old_mode of kill_t_create in which case we would not have come into this
					 * else block. Assert accordingly.
					 */
					assert(!chain1.flag);
					first_tp_srch_status = NULL;	/* do not use any previous srch_hist information */
				}
			}
		} else
		{
			if (NULL != (tabent = lookup_hashtab_int4(sgm_info_ptr->blks_in_use, (uint4 *)&blk)))
				first_tp_srch_status = tabent->value;
			else
				first_tp_srch_status = NULL;
		}
		ASSERT_IS_WITHIN_TP_HIST_ARRAY_BOUNDS(first_tp_srch_status, sgm_info_ptr);
		if (!is_mm && first_tp_srch_status)
		{
			cr = first_tp_srch_status->cr;
			assert(cr && !first_tp_srch_status->cse);
			if (first_tp_srch_status->cycle == cr->cycle)
			{
				*cycle = first_tp_srch_status->cycle;
				*cr_out = cr;
				cr->refer = TRUE;
				if (CDB_STAGNATE <= t_tries)	/* mu_reorg doesn't use TP else should have an || for that */
					CWS_INSERT(blk);
				return (sm_uc_ptr_t)first_tp_srch_status->buffaddr;
			} else
			{	/* Block was already part of the read-set of this transaction, but got recycled in the cache.
				 * Allow block recycling by resetting first_tp_srch_status for this blk to reflect the new
				 * buffer, cycle and cache-record. tp_hist (invoked much later) has validation checks to detect
				 * if block recycling happened within the same mini-action and restart in that case.
				 * Updating first_tp_srch_status has to wait until the end of t_qread since only then do we know
				 * the values to update to. Set a variable that will enable the updation before returning.
				 * Also assert that if we are in the final retry, we are never in a situation where we have a
				 * block that got recycled since the start of the current mini-action. This is easily detected since
				 * as part of the final retry we maintain a hash-table "cw_stagnate" that holds the blocks that
				 * have been read as part of the current mini-action until now.
				 */
				assert(CDB_STAGNATE > t_tries || (NULL == lookup_hashtab_int4(&cw_stagnate, (uint4 *)&blk)));
				reset_first_tp_srch_status = TRUE;
			}
		}
	}
	if ((blk >= csa->ti->total_blks) || (blk < 0))
	{	/* requested block out of range; could occur because of a concurrency conflict */
		if ((&FILE_INFO(gv_cur_region)->s_addrs != csa) || (csd != cs_data))
			GTMASSERT;
		assert(FALSE == csa->now_crit);
		rdfail_detail = cdb_sc_blknumerr;
		return (sm_uc_ptr_t)NULL;
	}
	if (is_mm)
	{
		*cycle = CYCLE_SHRD_COPY;
		*cr_out = 0;
		return (sm_uc_ptr_t)(mm_read(blk));
	}
#	ifdef GTM_CRYPT
	/* If database is encrypted, check if encryption initialization went fine for this database. If not,
	 * do not let process proceed as it could now potentially get a peek at the desired data from the
	 * decrypted shared memory global buffers (read in from disk by other processes) without having to go to disk.
	 * If DSE, allow for a special case where it is trying to dump a local bitmap block. In this case, DSE
	 * can continue to run fine (even if encryption initialization failed) since bitmap blocks are unencrypted.
	 */
	if (csa->encrypt_init_status && (!dse_running || !IS_BITMAP_BLK(blk)))
		GC_RTS_ERROR(csa->encrypt_init_status, gv_cur_region->dyn.addr->fname);
#	endif
	assert(dba_bg == csd->acc_meth);
	assert(!first_tp_srch_status || !first_tp_srch_status->cr
					|| first_tp_srch_status->cycle != first_tp_srch_status->cr->cycle);
	if (FALSE == (clustered = csd->clustered))
		bt = NULL;
	was_crit = csa->now_crit;
	ocnt = 0;
	cnl = csa->nl;
	set_wc_blocked = FALSE;	/* to indicate whether cnl->wc_blocked was set to TRUE by us */
	hold_onto_crit = csa->hold_onto_crit;	/* note down in local to avoid csa-> dereference in multiple usages below */
	do
	{
		if (NULL == (cr = db_csh_get(blk)))
		{	/* not in memory */
			if (clustered && (NULL != (bt = bt_get(blk))) && (FALSE == bt->flushing))
				bt = NULL;
			if (!csa->now_crit)
			{
				assert(!hold_onto_crit);
				if (NULL != bt)
				{	/* at this point, bt is not NULL only if clustered and flushing - wait no crit */
					assert(clustered);
					wait_for_block_flush(bt, blk);	/* try for no other node currently writing the block */
				}
				if ((csd->flush_trigger <= cnl->wcs_active_lvl) && (FALSE == gv_cur_region->read_only))
					JNL_ENSURE_OPEN_WCS_WTSTART(csa, gv_cur_region, 0, dummy_errno);
						/* a macro that dclast's "wcs_wtstart" and checks for errors etc. */
				grab_crit(gv_cur_region);
				cr = db_csh_get(blk);			/* in case blk arrived before crit */
			}
			if (clustered && (NULL != (bt = bt_get(blk))) && (TRUE == bt->flushing))
			{	/* Once crit, need to assure that if clustered, that flushing is [still] complete
				 * If it isn't, we missed an entire WM cycle and have to wait for another node to finish */
				wait_for_block_flush(bt, blk);	/* ensure no other node currently writing the block */
			}
			if (NULL == cr)
			{	/* really not in memory - must get a new buffer */
				assert(csa->now_crit);
				cr = db_csh_getn(blk);
				if (CR_NOTVALID == (sm_long_t)cr)
				{
					assert(cnl->wc_blocked); /* only reason we currently know wcs_get_space could fail */
					assert(gtm_white_box_test_case_enabled);
					SET_TRACEABLE_VAR(cnl->wc_blocked, TRUE);
					BG_TRACE_PRO_ANY(csa, wc_blocked_t_qread_db_csh_getn_invalid_blk);
					set_wc_blocked = TRUE;
					break;
				}
				assert(0 <= cr->read_in_progress);
				*cycle = cr->cycle;
				cr->tn = csd->trans_hist.curr_tn;
				/* Record history of most recent disk reads only in dbg builds for now. Although the macro
				 * is just a couple dozen instructions, it is done while holding crit so we want to avoid
				 * delaying crit unless really necessary. Whoever wants this information can enable it
				 * by a build change to remove the DEBUG_ONLY part below.
				 */
				DEBUG_ONLY(DSKREAD_TRACE(csa, GDS_ANY_ABS2REL(csa,cr), cr->tn, process_id, blk, cr->cycle);)
				if (!was_crit && !hold_onto_crit)
					rel_crit(gv_cur_region);
				/* read outside of crit may be of a stale block but should be detected by t_end or tp_tend */
				assert(0 == cr->dirty);
				assert(cr->read_in_progress >= 0);
				CR_BUFFER_CHECK(gv_cur_region, csa, csd, cr);
				if (SS_NORMAL != (status = dsk_read(blk, GDS_REL2ABS(cr->buffaddr), &ondsk_blkver, lcl_blk_free)))
				{	/* buffer does not contain valid data, so reset blk to be empty */
					cr->cycle++;	/* increment cycle for blk number changes (for tp_hist and others) */
					cr->blk = CR_BLKEMPTY;
					cr->r_epid = 0;
					RELEASE_BUFF_READ_LOCK(cr);
					assert(-1 <= cr->read_in_progress);
					assert(was_crit == csa->now_crit);
					if (FUTURE_READ == status)
					{	/* in cluster, block can be in the "future" with respect to the local history */
						assert(TRUE == clustered);
						assert(FALSE == csa->now_crit);
						rdfail_detail = cdb_sc_future_read;	/* t_retry forces the history up to date */
						return (sm_uc_ptr_t)NULL;
					}
					if (ERR_DYNUPGRDFAIL == status)
					{	/* if we dont hold crit on the region, it is possible due to concurrency conflicts
						 * that this block is unused (i.e. marked free/recycled in bitmap, see comments in
						 * gds_blk_upgrade.h). in this case we should not error out but instead restart.
						 */
						if (was_crit)
						{
							assert(FALSE);
							rts_error(VARLSTCNT(5) status, 3, blk, DB_LEN_STR(gv_cur_region));
						} else
						{
							rdfail_detail = cdb_sc_lostcr;
							return (sm_uc_ptr_t)NULL;
						}
					}
					if (-1 == status)
					{
						/* could have been concurrent truncate, and we read a blk >= csa->ti->total_blks */
						/* restart */
						rdfail_detail = cdb_sc_truncate;
						return (sm_uc_ptr_t)NULL;
					} else
						rts_error(VARLSTCNT(5) ERR_DBFILERR, 2, DB_LEN_STR(gv_cur_region), status);
				}
				disk_blk_read = TRUE;
				assert(0 <= cr->read_in_progress);
				assert(0 == cr->dirty);
				/* Only set in cache if read was success */
				cr->ondsk_blkver = (lcl_blk_free ? GDSVCURR : ondsk_blkver);
				cr->r_epid = 0;
				RELEASE_BUFF_READ_LOCK(cr);
				assert(-1 <= cr->read_in_progress);
				*cr_out = cr;
				assert(was_crit == csa->now_crit);
				if (reset_first_tp_srch_status)
				{	/* keep the parantheses for the if (although single line) since the following is a macro */
					RESET_FIRST_TP_SRCH_STATUS(first_tp_srch_status, cr, *cycle);
				}
				return (sm_uc_ptr_t)GDS_REL2ABS(cr->buffaddr);
			} else  if (!was_crit && (BAD_LUCK_ABOUNDS > ocnt))
			{
				assert(!hold_onto_crit);
				assert(TRUE == csa->now_crit);
				assert(cnl->in_crit == process_id);
				rel_crit(gv_cur_region);
			}
		}
Exemplo n.º 2
0
sm_uc_ptr_t t_qread(block_id blk, sm_int_ptr_t cycle, cache_rec_ptr_ptr_t cr_out)
/* cycle is used in t_end to detect if the buffer has been refreshed since the t_qread */
{
    uint4			status, duint4, blocking_pid;
    cache_rec_ptr_t		cr;
    bt_rec_ptr_t		bt;
    bool			clustered, was_crit;
    int			dummy, lcnt, ocnt;
    cw_set_element		*cse;
    off_chain		chain1;
    register sgmnt_addrs	*csa;
    register sgmnt_data_ptr_t	csd;
    int4			dummy_errno;
    boolean_t		already_built, is_mm, reset_first_tp_srch_status, set_wc_blocked;

    error_def(ERR_DBFILERR);
    error_def(ERR_BUFOWNERSTUCK);

    first_tp_srch_status = NULL;
    reset_first_tp_srch_status = FALSE;
    csa = cs_addrs;
    csd = csa->hdr;
    INCR_DB_CSH_COUNTER(csa, n_t_qreads, 1);
    is_mm = (dba_mm == csd->acc_meth);
    assert((t_tries < CDB_STAGNATE) || csa->now_crit);
    if (0 < dollar_tlevel)
    {
        assert(sgm_info_ptr);
        if (0 != sgm_info_ptr->cw_set_depth)
        {
            chain1 = *(off_chain *)&blk;
            if (1 == chain1.flag)
            {
                assert(sgm_info_ptr->cw_set_depth);
                if ((int)chain1.cw_index < sgm_info_ptr->cw_set_depth)
                    tp_get_cw(sgm_info_ptr->first_cw_set, (int)chain1.cw_index, &cse);
                else
                {
                    assert(FALSE == csa->now_crit);
                    rdfail_detail = cdb_sc_blknumerr;
                    return (sm_uc_ptr_t)NULL;
                }
            } else
            {
                first_tp_srch_status = (srch_blk_status *)lookup_hashtab_ent(sgm_info_ptr->blks_in_use,
                                       (void *)blk, &duint4);
                ASSERT_IS_WITHIN_TP_HIST_ARRAY_BOUNDS(first_tp_srch_status, sgm_info_ptr);
                cse = first_tp_srch_status ? first_tp_srch_status->ptr : NULL;
            }
            assert(!cse || !cse->high_tlevel);
            if (cse)
            {   /* transaction has modified the sought after block  */
                assert(gds_t_writemap != cse->mode);
                if (FALSE == cse->done)
                {   /* out of date, so make it current */
                    already_built = (NULL != cse->new_buff);
                    gvcst_blk_build(cse, (uchar_ptr_t)cse->new_buff, 0);
                    assert(cse->blk_target);
                    if (!already_built && !chain1.flag)
                    {
                        assert(first_tp_srch_status && (is_mm || first_tp_srch_status->cr)
                               && first_tp_srch_status->buffaddr);
                        if (first_tp_srch_status->tn <=
                                ((blk_hdr_ptr_t)(first_tp_srch_status->buffaddr))->tn)
                        {
                            assert(CDB_STAGNATE > t_tries);
                            rdfail_detail = cdb_sc_blkmod;	/* should this be something else */
                            TP_TRACE_HIST_MOD(blk, gv_target, tp_blkmod_t_qread, cs_data,
                                              first_tp_srch_status->tn,
                                              ((blk_hdr_ptr_t)(first_tp_srch_status->buffaddr))->tn,
                                              ((blk_hdr_ptr_t)(first_tp_srch_status->buffaddr))->levl);
                            return (sm_uc_ptr_t)NULL;
                        }
                        if ((!is_mm) && (first_tp_srch_status->cycle != first_tp_srch_status->cr->cycle
                                         || first_tp_srch_status->blk_num != first_tp_srch_status->cr->blk))
                        {
                            assert(CDB_STAGNATE > t_tries);
                            rdfail_detail = cdb_sc_lostcr;	/* should this be something else */
                            return (sm_uc_ptr_t)NULL;
                        }
                        if (certify_all_blocks &&
                                FALSE == cert_blk(gv_cur_region, blk, (blk_hdr_ptr_t)cse->new_buff,
                                                  cse->blk_target->root))
                            GTMASSERT;
                    }
                    cse->done = TRUE;
                }
                *cycle = CYCLE_PVT_COPY;
                *cr_out = 0;
                return (sm_uc_ptr_t)cse->new_buff;
            }
            assert(!chain1.flag);
        } else
            first_tp_srch_status =
                (srch_blk_status *)lookup_hashtab_ent(sgm_info_ptr->blks_in_use, (void *)blk, &duint4);
        ASSERT_IS_WITHIN_TP_HIST_ARRAY_BOUNDS(first_tp_srch_status, sgm_info_ptr);
        if (!is_mm && first_tp_srch_status)
        {
            assert(first_tp_srch_status->cr && !first_tp_srch_status->ptr);
            if (first_tp_srch_status->cycle == first_tp_srch_status->cr->cycle)
            {
                *cycle = first_tp_srch_status->cycle;
                *cr_out = first_tp_srch_status->cr;
                first_tp_srch_status->cr->refer = TRUE;
                if (CDB_STAGNATE <= t_tries)	/* mu_reorg doesn't use TP else should have an || for that */
                    CWS_INSERT(blk);
                return (sm_uc_ptr_t)first_tp_srch_status->buffaddr;
            } else
            {   /* Block was already part of the read-set of this transaction, but got recycled. Allow for
                 * recycling. But update the first_tp_srch_status (for this blk) in the si->first_tp_hist
                 * array to reflect the new buffer, cycle and cache-record. Since we know those only at the end of
                 * t_qread, set a variable here that will enable the updation before returning from t_qread().
                 */
                reset_first_tp_srch_status = TRUE;
            }
        }
    }
    if ((blk >= csa->ti->total_blks) || (blk < 0))
    {   /* requested block out of range; could occur because of a concurrency conflict */
        if ((&FILE_INFO(gv_cur_region)->s_addrs != csa) || (csd != cs_data))
            GTMASSERT;
        assert(FALSE == csa->now_crit);
        rdfail_detail = cdb_sc_blknumerr;
        return (sm_uc_ptr_t)NULL;
    }
    if (is_mm)
    {
        *cycle = CYCLE_SHRD_COPY;
        *cr_out = 0;
        return (sm_uc_ptr_t)(mm_read(blk));
    }
    assert(dba_bg == csd->acc_meth);
    assert(!first_tp_srch_status || !first_tp_srch_status->cr
           || first_tp_srch_status->cycle != first_tp_srch_status->cr->cycle);
    if (FALSE == (clustered = csd->clustered))
        bt = NULL;
    was_crit = csa->now_crit;
    ocnt = 0;
    set_wc_blocked = FALSE;	/* to indicate whether csd->wc_blocked was set to TRUE by us */
    do
    {
        if (NULL == (cr = db_csh_get(blk)))
        {   /* not in memory */
            if (clustered && (NULL != (bt = bt_get(blk))) && (FALSE == bt->flushing))
                bt = NULL;
            if (FALSE == csa->now_crit)
            {
                if (NULL != bt)
                {   /* at this point, bt is not NULL only if clustered and flushing - wait no crit */
                    assert(clustered);
                    wait_for_block_flush(bt, blk);	/* try for no other node currently writing the block */
                }
                if (csd->flush_trigger <= csa->nl->wcs_active_lvl  &&  FALSE == gv_cur_region->read_only)
                    JNL_ENSURE_OPEN_WCS_WTSTART(csa, gv_cur_region, 0, dummy_errno);
                /* a macro that dclast's wcs_wtstart() and checks for errors etc. */
                grab_crit(gv_cur_region);
                cr = db_csh_get(blk);			/* in case blk arrived before crit */
            }
            if (clustered && (NULL != (bt = bt_get(blk))) && (TRUE == bt->flushing))
            {   /* Once crit, need to assure that if clustered, that flushing is [still] complete
                 * If it isn't, we missed an entire WM cycle and have to wait for another node to finish */
                wait_for_block_flush(bt, blk);	/* ensure no other node currently writing the block */
            }
            if (NULL == cr)
            {   /* really not in memory - must get a new buffer */
                assert(csa->now_crit);
                cr = db_csh_getn(blk);
                if (CR_NOTVALID == (sm_long_t)cr)
                {
                    SET_TRACEABLE_VAR(cs_data->wc_blocked, TRUE);
                    BG_TRACE_PRO_ANY(csa, wc_blocked_t_qread_db_csh_getn_invalid_blk);
                    set_wc_blocked = TRUE;
                    break;
                }
                assert(0 <= cr->read_in_progress);
                *cycle = cr->cycle;
                cr->tn = csa->ti->curr_tn;
                if (FALSE == was_crit)
                    rel_crit(gv_cur_region);
                /* read outside of crit may be of a stale block but should be detected by t_end or tp_tend */
                assert(0 == cr->dirty);
                assert(cr->read_in_progress >= 0);
                INCR_DB_CSH_COUNTER(csa, n_dsk_reads, 1);
                if (SS_NORMAL != (status = dsk_read(blk, GDS_REL2ABS(cr->buffaddr))))
                {
                    RELEASE_BUFF_READ_LOCK(cr);
                    assert(was_crit == csa->now_crit);
                    if (FUTURE_READ == status)
                    {   /* in cluster, block can be in the "future" with respect to the local history */
                        assert(TRUE == clustered);
                        assert(FALSE == csa->now_crit);
                        rdfail_detail = cdb_sc_future_read;	/* t_retry forces the history up to date */
                        return (sm_uc_ptr_t)NULL;
                    }
                    rts_error(VARLSTCNT(5) ERR_DBFILERR, 2, DB_LEN_STR(gv_cur_region), status);
                }
                assert(0 <= cr->read_in_progress);
                assert(0 == cr->dirty);
                cr->r_epid = 0;
                RELEASE_BUFF_READ_LOCK(cr);
                assert(-1 <= cr->read_in_progress);
                *cr_out = cr;
                assert(was_crit == csa->now_crit);
                if (reset_first_tp_srch_status)
                {   /* keep the parantheses for the if (although single line) since the following is a macro */
                    RESET_FIRST_TP_SRCH_STATUS(first_tp_srch_status, cr, *cycle);
                }
                return (sm_uc_ptr_t)GDS_REL2ABS(cr->buffaddr);
            } else  if ((FALSE == was_crit) && (BAD_LUCK_ABOUNDS > ocnt))
            {
                assert(TRUE == csa->now_crit);
                assert(csa->nl->in_crit == process_id);
                rel_crit(gv_cur_region);
            }
        }
        if (CR_NOTVALID == (sm_long_t)cr)
        {
            SET_TRACEABLE_VAR(cs_data->wc_blocked, TRUE);
            BG_TRACE_PRO_ANY(csa, wc_blocked_t_qread_db_csh_get_invalid_blk);
            set_wc_blocked = TRUE;
            break;
        }
        for (lcnt = 1;  ; lcnt++)
        {
            if (0 > cr->read_in_progress)
            {   /* it's not being read */
                if (clustered && (0 == cr->bt_index) &&
                        (cr->tn < ((th_rec *)((uchar_ptr_t)csa->th_base + csa->th_base->tnque.fl))->tn))
                {   /* can't rely on the buffer */
                    cr->cycle++;	/* increment cycle whenever blk number changes (tp_hist depends on this) */
                    cr->blk = CR_BLKEMPTY;
                    break;
                }
                *cycle = cr->cycle;
                *cr_out = cr;
                VMS_ONLY(
                    /* If we were doing the db_csh_get() above (in t_qread itself) and located the cache-record
                     * which, before coming here and taking a copy of cr->cycle a few lines above, was made an
                     * older twin by another process in bg_update (note this can happen in VMS only) which has
                     * already incremented the cycle, we will end up having a copy of the old cache-record with
                     * its incremented cycle number and hence will succeed in tp_hist validation if we return
                     * this <cr,cycle> combination although we don't want to since this "cr" is not current for
                     * the given block as of now. Note that the "indexmod" optimization in tp_tend() relies on
                     * an accurate intermediate validation by tp_hist() which in turn relies on the <cr,cycle>
                     * value returned by t_qread() to be accurate for a given blk at the current point in time.
                     * We detect the older-twin case by the following check. Note that here we depend on the
                     * the fact that bg_update() sets cr->bt_index to 0 before incrementing cr->cycle.
                     * Given that order, cr->bt_index can be guaranteed to be 0 if we read the incremented cycle
                     */
                    if (cr->twin && (0 == cr->bt_index))
                    break;
                )
                    if (cr->blk != blk)
                        break;
                if (was_crit != csa->now_crit)
                    rel_crit(gv_cur_region);
                assert(was_crit == csa->now_crit);
                if (reset_first_tp_srch_status)
                {   /* keep the parantheses for the if (although single line) since the following is a macro */
                    RESET_FIRST_TP_SRCH_STATUS(first_tp_srch_status, cr, *cycle);
                }
                /* Note that at this point we expect t_qread() to return a <cr,cycle> combination that
                 * corresponds to "blk" passed in. It is crucial to get an accurate value for both the fields
                 * since tp_hist() relies on this for its intermediate validation.
                 */
                return (sm_uc_ptr_t)GDS_ANY_REL2ABS(csa, cr->buffaddr);
            }
            if (blk != cr->blk)
                break;
            if (lcnt >= BUF_OWNER_STUCK && (0 == (lcnt % BUF_OWNER_STUCK)))
            {
                if (FALSE == csa->now_crit)
                    grab_crit(gv_cur_region);
                if (cr->read_in_progress < -1)
                {   /* outside of design; clear to known state */
                    BG_TRACE_PRO(t_qread_out_of_design);
                    INTERLOCK_INIT(cr);
                    assert(0 == cr->r_epid);
                    cr->r_epid = 0;
                } else  if (cr->read_in_progress >= 0)
                {
                    BG_TRACE_PRO(t_qread_buf_owner_stuck);
                    if (0 != (blocking_pid = cr->r_epid))
                    {
                        if (FALSE == is_proc_alive(blocking_pid, cr->image_count))
                        {   /* process gone: release that process's lock */
                            assert(0 == cr->bt_index);
                            if (cr->bt_index)
                            {
                                SET_TRACEABLE_VAR(csd->wc_blocked, TRUE);
                                BG_TRACE_PRO_ANY(csa, wc_blocked_t_qread_bad_bt_index1);
                                set_wc_blocked = TRUE;
                                break;
                            }
                            cr->cycle++;	/* increment cycle for blk number changes (for tp_hist) */
                            cr->blk = CR_BLKEMPTY;
                            RELEASE_BUFF_READ_LOCK(cr);
                        } else
                        {
                            rel_crit(gv_cur_region);
                            send_msg(VARLSTCNT(4) ERR_DBFILERR, 2, DB_LEN_STR(gv_cur_region));
                            send_msg(VARLSTCNT(9) ERR_BUFOWNERSTUCK, 7, process_id, blocking_pid,
                                     cr->blk, cr->blk, (lcnt / BUF_OWNER_STUCK),
                                     cr->read_in_progress, cr->rip_latch.latch_pid);
                            if ((4 * BUF_OWNER_STUCK) <= lcnt)
                                GTMASSERT;
                            /* Kickstart the process taking a long time in case it was suspended */
                            UNIX_ONLY(continue_proc(blocking_pid));
                        }
                    } else
                    {   /* process stopped before could set r_epid */
                        assert(0 == cr->bt_index);
                        if (cr->bt_index)
                        {
                            SET_TRACEABLE_VAR(csd->wc_blocked, TRUE);
                            BG_TRACE_PRO_ANY(csa, wc_blocked_t_qread_bad_bt_index2);
                            set_wc_blocked = TRUE;
                            break;
                        }
                        cr->cycle++;	/* increment cycle for blk number changes (for tp_hist) */
                        cr->blk = CR_BLKEMPTY;
                        RELEASE_BUFF_READ_LOCK(cr);
                        if (cr->read_in_progress < -1)	/* race: process released since if r_epid */
                            LOCK_BUFF_FOR_READ(cr, dummy);
                    }
                }
                if (was_crit != csa->now_crit)
                    rel_crit(gv_cur_region);
            } else
                wcs_sleep(lcnt);
        }
        if (set_wc_blocked)	/* cannot use csd->wc_blocked here as we might not necessarily have crit */
            break;
        ocnt++;
        if (BAD_LUCK_ABOUNDS <= ocnt)
        {
            if (BAD_LUCK_ABOUNDS < ocnt || csa->now_crit)
            {
                rel_crit(gv_cur_region);
                GTMASSERT;
            }
            if (FALSE == csa->now_crit)
                grab_crit(gv_cur_region);
        }
    } while (TRUE);
Exemplo n.º 3
0
void	mu_reorg_upgrd_dwngrd(void)
{
	blk_hdr			new_hdr;
	blk_segment		*bs1, *bs_ptr;
	block_id		*blkid_ptr, curblk, curbmp, start_blk, stop_blk, start_bmp, last_bmp;
	block_id		startblk_input, stopblk_input;
	boolean_t		upgrade, downgrade, safejnl, nosafejnl, region, first_reorg_in_this_db_fmt, reorg_entiredb;
	boolean_t		startblk_specified, stopblk_specified, set_fully_upgraded, db_got_to_v5_once, mark_blk_free;
	cache_rec_ptr_t		cr;
	char			*bml_lcl_buff = NULL, *command, *reorg_command;
	sm_uc_ptr_t		bptr = NULL;
	cw_set_element		*cse;
	enum cdb_sc		cdb_status;
	enum db_ver		new_db_format, ondsk_blkver;
	gd_region		*reg;
	int			cycle;
	int4			blk_seg_cnt, blk_size;	/* needed for BLK_INIT,BLK_SEG and BLK_FINI macros */
	int4			blocks_left, expected_blks2upgrd, actual_blks2upgrd, total_blks, free_blks;
	int4			status, status1, mapsize, lcnt, bml_status;
	reorg_stats_t		reorg_stats;
	sgmnt_addrs		*csa;
	sgmnt_data_ptr_t	csd;
	sm_uc_ptr_t		blkBase, bml_sm_buff;	/* shared memory pointer to the bitmap global buffer */
	srch_hist		alt_hist;
	srch_blk_status		*blkhist, bmlhist;
	tp_region		*rptr;
	trans_num		curr_tn;
	unsigned char    	save_cw_set_depth;
	uint4			lcl_update_trans;

	region    = (CLI_PRESENT == cli_present("REGION"));
	upgrade   = (CLI_PRESENT == cli_present("UPGRADE"));
	downgrade = (CLI_PRESENT == cli_present("DOWNGRADE"));
	assert(upgrade && !downgrade || !upgrade && downgrade);
	command = upgrade ? "UPGRADE" : "DOWNGRADE";
	reorg_command = upgrade ? "MUPIP REORG UPGRADE" : "MUPIP REORG DOWNGRADE";
	reorg_entiredb = TRUE;	/* unless STARTBLK or STOPBLK is specified we are going to {up,down}grade the entire database */
	startblk_specified = FALSE;
	assert(SIZEOF(block_id) == SIZEOF(uint4));
	if ((CLI_PRESENT == cli_present("STARTBLK")) && (cli_get_hex("STARTBLK", (uint4 *)&startblk_input)))
	{
		reorg_entiredb = FALSE;
		startblk_specified = TRUE;
	}
	stopblk_specified = FALSE;
	assert(SIZEOF(block_id) == SIZEOF(uint4));
	if ((CLI_PRESENT == cli_present("STOPBLK")) && (cli_get_hex("STOPBLK", (uint4 *)&stopblk_input)))
	{
		reorg_entiredb = FALSE;
		stopblk_specified = TRUE;
	}
	mu_reorg_upgrd_dwngrd_in_prog = TRUE;
	mu_reorg_nosafejnl = (CLI_NEGATED == cli_present("SAFEJNL")) ? TRUE : FALSE;

	assert(region);
	status = SS_NORMAL;
	error_mupip = FALSE;
	gvinit();	/* initialize gd_header (needed by the later call to mu_getlst) */
	mu_getlst("REG_NAME", SIZEOF(tp_region));	/* get the parameter corresponding to REGION qualifier */
	if (error_mupip)
	{
		util_out_print("!/MUPIP REORG !AD cannot proceed with above errors!/", TRUE, LEN_AND_STR(command));
		mupip_exit(ERR_MUNOACTION);
	}
	assert(DBKEYSIZE(MAX_KEY_SZ) == gv_keysize);	/* no need to invoke GVKEYSIZE_INIT_IF_NEEDED macro */
	gv_target = targ_alloc(gv_keysize, NULL, NULL);	/* t_begin needs this initialized */
	gv_target_list = NULL;
	memset(&alt_hist, 0, SIZEOF(alt_hist));	/* null-initialize history */
	blkhist = &alt_hist.h[0];
	for (rptr = grlist;  NULL != rptr;  rptr = rptr->fPtr)
	{
		if (mu_ctrly_occurred || mu_ctrlc_occurred)
			break;
		reg = rptr->reg;
		util_out_print("!/Region !AD : MUPIP REORG !AD started", TRUE, REG_LEN_STR(reg), LEN_AND_STR(command));
		if (reg_cmcheck(reg))
		{
			util_out_print("Region !AD : MUPIP REORG !AD cannot run across network",
				TRUE, REG_LEN_STR(reg), LEN_AND_STR(command));
			status = ERR_MUNOFINISH;
			continue;
		}
		mu_reorg_process = TRUE;	/* gvcst_init will use this value to use gtm_poollimit settings. */
		gvcst_init(reg);
		mu_reorg_process = FALSE;
		assert(update_array != NULL);
		/* access method stored in global directory and database file header might be different in which case
		 * the database setting prevails. therefore, the access method check can be done only after opening
		 * the database (i.e. after the gvcst_init)
		 */
		if (dba_bg != REG_ACC_METH(reg))
		{
			util_out_print("Region !AD : MUPIP REORG !AD cannot continue as access method is not BG",
				TRUE, REG_LEN_STR(reg), LEN_AND_STR(command));
			status = ERR_MUNOFINISH;
			continue;
		}
		/* The mu_getlst call above uses insert_region to create the grlist, which ensures that duplicate regions mapping to
		 * the same db file correspond to only one grlist entry.
		 */
		assert(FALSE == reg->was_open);
		TP_CHANGE_REG(reg);	/* sets gv_cur_region, cs_addrs, cs_data */
		csa = cs_addrs;
		csd = cs_data;
		blk_size = csd->blk_size;	/* "blk_size" is used by the BLK_FINI macro */
		if (reg->read_only)
		{
			gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(4) ERR_DBRDONLY, 2, DB_LEN_STR(reg));
			status = ERR_MUNOFINISH;
			continue;
		}
		assert(GDSVCURR == GDSV6); /* so we trip this assert in case GDSVCURR changes without a change to this module */
		new_db_format = (upgrade ? GDSV6 : GDSV4);
		grab_crit(reg);
		curr_tn = csd->trans_hist.curr_tn;
		/* set the desired db format in the file header to the appropriate version, increment transaction number */
		status1 = desired_db_format_set(reg, new_db_format, reorg_command);
		assert(csa->now_crit);	/* desired_db_format_set() should not have released crit */
		first_reorg_in_this_db_fmt = TRUE;	/* with the current desired_db_format, this is the first reorg */
		if (SS_NORMAL != status1)
		{	/* "desired_db_format_set" would have printed appropriate error messages */
			if (ERR_MUNOACTION != status1)
			{	/* real error occurred while setting the db format. skip to next region */
				status = ERR_MUNOFINISH;
				rel_crit(reg);
				continue;
			}
			util_out_print("Region !AD : Desired DB Format remains at !AD after !AD", TRUE, REG_LEN_STR(reg),
				LEN_AND_STR(gtm_dbversion_table[new_db_format]), LEN_AND_STR(reorg_command));
			if (csd->reorg_db_fmt_start_tn == csd->desired_db_format_tn)
				first_reorg_in_this_db_fmt = FALSE;
		} else
			util_out_print("Region !AD : Desired DB Format set to !AD by !AD", TRUE, REG_LEN_STR(reg),
				LEN_AND_STR(gtm_dbversion_table[new_db_format]), LEN_AND_STR(reorg_command));
		assert(dba_bg == csd->acc_meth);
		/* Check blks_to_upgrd counter to see if upgrade/downgrade is complete */
		total_blks = csd->trans_hist.total_blks;
		free_blks = csd->trans_hist.free_blocks;
		actual_blks2upgrd = csd->blks_to_upgrd;
		/* If MUPIP REORG UPGRADE and there is no block to upgrade in the database as indicated by BOTH
		 * 	"csd->blks_to_upgrd" and "csd->fully_upgraded", then we can skip processing.
		 * If MUPIP REORG UPGRADE and all non-free blocks need to be upgraded then again we can skip processing.
		 */
		if ((upgrade && (0 == actual_blks2upgrd) && csd->fully_upgraded)
			|| (!upgrade && ((total_blks - free_blks) == actual_blks2upgrd)))
		{
			util_out_print("Region !AD : Blocks to Upgrade counter indicates no action needed for MUPIP REORG !AD",
				       TRUE, REG_LEN_STR(reg), LEN_AND_STR(command));
			util_out_print("Region !AD : Total Blocks = [0x!XL] : Free Blocks = [0x!XL] : "
				       "Blocks to upgrade = [0x!XL]",
				       TRUE, REG_LEN_STR(reg), total_blks, free_blks, actual_blks2upgrd);
			util_out_print("Region !AD : MUPIP REORG !AD finished!/", TRUE, REG_LEN_STR(reg), LEN_AND_STR(command));
			rel_crit(reg);
			continue;
		}
		stop_blk = total_blks;
		if (stopblk_specified && stopblk_input <= stop_blk)
			stop_blk = stopblk_input;
		if (first_reorg_in_this_db_fmt)
		{	/* Note down reorg start tn (in case we are interrupted, future reorg will know to resume) */
			csd->reorg_db_fmt_start_tn = csd->desired_db_format_tn;
			csd->reorg_upgrd_dwngrd_restart_block = 0;
			start_blk = (startblk_specified ? startblk_input : 0);
		} else
		{	/* Either a concurrent MUPIP REORG of the same type ({up,down}grade) is currently running
			 * or a previously running REORG of the same type was interrupted (Ctrl-Ced).
			 * In either case resume processing from whatever restart block number is stored in fileheader
			 * the only exception is if "STARTBLK" was specified in the input in which use that unconditionally.
			 */
			start_blk = (startblk_specified ? startblk_input : csd->reorg_upgrd_dwngrd_restart_block);
		}
		if (start_blk > stop_blk)
			start_blk = stop_blk;
		mu_reorg_upgrd_dwngrd_start_tn = csd->reorg_db_fmt_start_tn;
		/* Before releasing crit, flush the file-header and dirty buffers in cache to disk. This is because we are now
		 * going to read each GDS block directly from disk to determine if it needs to be upgraded/downgraded or not.
		 */
		if (!wcs_flu(WCSFLU_FLUSH_HDR))	/* wcs_flu assumes gv_cur_region is set (which it is in this routine) */
		{
			rel_crit(reg);
			gtm_putmsg_csa(CSA_ARG(csa)
				VARLSTCNT(6) ERR_BUFFLUFAILED, 4, LEN_AND_LIT("MUPIP REORG UPGRADE/DOWNGRADE"), DB_LEN_STR(reg));
			status = ERR_MUNOFINISH;
			continue;
		}
		rel_crit(reg);
		/* Loop through entire database one GDS block at a time and upgrade/downgrade each of them */
		status1 = SS_NORMAL;
		start_bmp = ROUND_DOWN2(start_blk, BLKS_PER_LMAP);
		last_bmp  = ROUND_DOWN2(stop_blk - 1, BLKS_PER_LMAP);
		curblk = start_blk;	/* curblk is the block to be upgraded/downgraded */
		util_out_print("Region !AD : Started processing from block number [0x!XL]", TRUE, REG_LEN_STR(reg), curblk);
		if (NULL != bptr)
		{	/* malloc/free "bptr" for each region as GDS block-size can be different */
			free(bptr);
			bptr = NULL;
		}
		memset(&reorg_stats, 0, SIZEOF(reorg_stats));	/* initialize statistics for this region */
		for (curbmp = start_bmp; curbmp <= last_bmp; curbmp += BLKS_PER_LMAP)
		{
			if (mu_ctrly_occurred || mu_ctrlc_occurred)
			{
				status1 = ERR_MUNOFINISH;
				break;
			}
			/* --------------------------------------------------------------
			 *             Read in current bitmap block
			 * --------------------------------------------------------------
			 */
			assert(!csa->now_crit);
			bml_sm_buff = t_qread(curbmp, (sm_int_ptr_t)&cycle, &cr); /* bring block into the cache outside of crit */
			reorg_stats.blks_read_from_disk_bmp++;
			grab_crit_encr_cycle_sync(reg); /* needed so t_qread does not return NULL below */
			if (mu_reorg_upgrd_dwngrd_start_tn != csd->desired_db_format_tn)
			{	/* csd->desired_db_format changed since reorg started. discontinue the reorg */
				/* see later comment on "csd->reorg_upgrd_dwngrd_restart_block" for why the assignment
				 * of this field should be done only if a db format change did not occur.
				 */
				rel_crit(reg);
				status1 = ERR_MUNOFINISH;
				/* This "start_tn" check is redone after the for-loop and an error message is printed there */
				break;
			} else if (reorg_entiredb)
			{	/* Change "csd->reorg_upgrd_dwngrd_restart_block" only if STARTBLK or STOPBLK was NOT specified */
				assert(csd->reorg_upgrd_dwngrd_restart_block <= MAX(start_blk, curbmp));
				csd->reorg_upgrd_dwngrd_restart_block = curbmp;	/* previous blocks have been upgraded/downgraded */
			}
			/* Check blks_to_upgrd counter to see if upgrade/downgrade is complete.
			 * Repeat check done a few steps earlier outside of this for loop.
			 */
			total_blks = csd->trans_hist.total_blks;
			free_blks = csd->trans_hist.free_blocks;
			actual_blks2upgrd = csd->blks_to_upgrd;
			if ((upgrade && (0 == actual_blks2upgrd) && csd->fully_upgraded)
				|| (!upgrade && ((total_blks - free_blks) == actual_blks2upgrd)))
			{
				rel_crit(reg);
				break;
			}
			bml_sm_buff = t_qread(curbmp, (sm_int_ptr_t)&cycle, &cr); /* now that in crit, note down stable buffer */
			if (NULL == bml_sm_buff)
				rts_error_csa(CSA_ARG(csa) VARLSTCNT(1) ERR_DSEBLKRDFAIL);
			ondsk_blkver = cr->ondsk_blkver;	/* note down db fmt on disk for bitmap block */
			/* Take a copy of the shared memory bitmap buffer into process-private memory before releasing crit.
			 * We are interested in those blocks that are currently marked as USED in the bitmap.
			 * It is possible that once we release crit, concurrent updates change the bitmap state of those blocks.
			 * In that case, those updates will take care of doing the upgrade/downgrade of those blocks in the
			 * format currently set in csd->desired_db_format i.e. accomplishing MUPIP REORG UPGRADE/DOWNGRADE's job.
			 * If the desired_db_format changes concurrently, we will stop doing REORG UPGRADE/DOWNGRADE processing.
			 */
			if (NULL == bml_lcl_buff)
				bml_lcl_buff = malloc(BM_SIZE(BLKS_PER_LMAP));
			memcpy(bml_lcl_buff, (blk_hdr_ptr_t)bml_sm_buff, BM_SIZE(BLKS_PER_LMAP));
			if (FALSE == cert_blk(reg, curbmp, (blk_hdr_ptr_t)bml_lcl_buff, 0, FALSE))
			{	/* certify the block while holding crit as cert_blk uses fields from file-header (shared memory) */
				assert(FALSE);	/* in pro, skip ugprading/downgarding all blks in this unreliable local bitmap */
				rel_crit(reg);
				util_out_print("Region !AD : Bitmap Block [0x!XL] has integrity errors. Skipping this bitmap.",
					TRUE, REG_LEN_STR(reg), curbmp);
				status1 = ERR_MUNOFINISH;
				continue;
			}
			rel_crit(reg);
			/* ------------------------------------------------------------------------
			 *         Upgrade/Downgrade all BUSY blocks in the current bitmap
			 * ------------------------------------------------------------------------
			 */
			curblk = (curbmp == start_bmp) ? start_blk : curbmp;
			mapsize = (curbmp == last_bmp) ? (stop_blk - curbmp) : BLKS_PER_LMAP;
			assert(0 != mapsize);
			assert(mapsize <= BLKS_PER_LMAP);
			db_got_to_v5_once = csd->db_got_to_v5_once;
			for (lcnt = curblk - curbmp; lcnt < mapsize; lcnt++, curblk++)
			{
				if (mu_ctrly_occurred || mu_ctrlc_occurred)
				{
					status1 = ERR_MUNOFINISH;
					goto stop_reorg_on_this_reg;	/* goto needed because of nested FOR Loop */
				}
				GET_BM_STATUS(bml_lcl_buff, lcnt, bml_status);
				assert(BLK_MAPINVALID != bml_status); /* cert_blk ran clean so we dont expect invalid entries */
				if (BLK_FREE == bml_status)
				{
					reorg_stats.blks_skipped_free++;
					continue;
				}
				/* MUPIP REORG UPGRADE/DOWNGRADE will convert USED & RECYCLED blocks */
				if (db_got_to_v5_once || (BLK_RECYCLED != bml_status))
				{	/* Do NOT read recycled V4 block from disk unless it is guaranteed NOT to be too full */
					if (lcnt)
					{	/* non-bitmap block */
						/* read in block from disk into private buffer. dont pollute the cache yet */
						if (NULL == bptr)
							bptr = (sm_uc_ptr_t)malloc(blk_size);
						status1 = dsk_read(curblk, bptr, &ondsk_blkver, FALSE);
						/* dsk_read on curblk could return an error (DYNUPGRDFAIL) if curblk needs to be
						 * upgraded and if its block size was too big to allow the extra block-header space
						 * requirements for a dynamic upgrade. a MUPIP REORG DOWNGRADE should not error out
						 * in that case as the block is already in the downgraded format.
						 */
						if (SS_NORMAL != status1)
						{
							if (!upgrade && (ERR_DYNUPGRDFAIL == status1))
							{
								assert(GDSV4 == new_db_format);
								ondsk_blkver = new_db_format;
							} else
							{
								gtm_putmsg_csa(CSA_ARG(csa)
									VARLSTCNT(5) ERR_DBFILERR, 2, DB_LEN_STR(reg), status1);
								util_out_print("Region !AD : Error occurred while reading block "
									"[0x!XL]", TRUE, REG_LEN_STR(reg), curblk);
								status1 = ERR_MUNOFINISH;
								goto stop_reorg_on_this_reg;/* goto needed due to nested FOR Loop */
							}
						}
						reorg_stats.blks_read_from_disk_nonbmp++;
					} /* else bitmap block has been read in crit earlier and ondsk_blkver appropriately set */
					if (new_db_format == ondsk_blkver)
					{
						assert((SS_NORMAL == status1) || (!upgrade && (ERR_DYNUPGRDFAIL == status1)));
						status1 = SS_NORMAL;	/* treat DYNUPGRDFAIL as no error in case of downgrade */
						reorg_stats.blks_skipped_newfmtindisk++;
						continue;	/* current disk version is identical to what is desired */
					}
					assert(SS_NORMAL == status1);
				}
				/* Begin non-TP transaction to upgrade/downgrade the block.
				 * The way we do that is by updating the block using a null update array.
				 * Any update to a block will trigger an automatic upgrade/downgrade of the block based on
				 * 	the current fileheader desired_db_format setting and we use that here.
				 */
				t_begin(ERR_MUREORGFAIL, UPDTRNS_DB_UPDATED_MASK);
				for (; ;)
				{
					CHECK_AND_RESET_UPDATE_ARRAY;	/* reset update_array_ptr to update_array */
					curr_tn = csd->trans_hist.curr_tn;
					db_got_to_v5_once = csd->db_got_to_v5_once;
					if (db_got_to_v5_once || (BLK_RECYCLED != bml_status))
					{
						blkhist->cse = NULL;	/* start afresh (do not use value from previous retry) */
						blkBase = t_qread(curblk, (sm_int_ptr_t)&blkhist->cycle, &blkhist->cr);
						if (NULL == blkBase)
						{
							t_retry((enum cdb_sc)rdfail_detail);
							continue;
						}
						blkhist->blk_num = curblk;
						blkhist->buffaddr = blkBase;
						ondsk_blkver = blkhist->cr->ondsk_blkver;
						new_hdr = *(blk_hdr_ptr_t)blkBase;
						mu_reorg_upgrd_dwngrd_blktn = new_hdr.tn;
						mark_blk_free = FALSE;
						inctn_opcode = upgrade ? inctn_blkupgrd : inctn_blkdwngrd;
					} else
					{
						mark_blk_free = TRUE;
						inctn_opcode = inctn_blkmarkfree;
					}
					inctn_detail.blknum_struct.blknum = curblk;
					/* t_end assumes that the history it is passed does not contain a bitmap block.
					 * for bitmap block, the history validation information is passed through cse instead.
					 * therefore we need to handle bitmap and non-bitmap cases separately.
					 */
					if (!lcnt)
					{	/* Means a bitmap block.
						 * At this point we can do a "new_db_format != ondsk_blkver" check to determine
						 * if the block got converted since we did the dsk_read (see the non-bitmap case
						 * for a similar check done there), but in that case we will have a transaction
						 * which has read 1 bitmap block and is updating no block. "t_end" currently cannot
						 * handle this case as it expects any bitmap block that needs validation to also
						 * have a corresponding cse which will hold its history. Hence we avoid doing the
						 * new_db_format check. The only disadvantage of this is that we will end up
						 * modifying the bitmap block as part of this transaction (in an attempt to convert
						 * its ondsk_blkver) even though it is already in the right format. Since this
						 * overhead is going to be one per bitmap block and since the block is in the cache
						 * at this point, we should not lose much.
						 */
						assert(!mark_blk_free);
						BLK_ADDR(blkid_ptr, SIZEOF(block_id), block_id);
						*blkid_ptr = 0;
						t_write_map(blkhist, (unsigned char *)blkid_ptr, curr_tn, 0);
						assert(&alt_hist.h[0] == blkhist);
						alt_hist.h[0].blk_num = 0; /* create empty history for bitmap block */
						assert(update_trans);
					} else
					{	/* non-bitmap block. fill in history for validation in t_end */
						assert(curblk);	/* we should never come here for block 0 (bitmap) */
						if (!mark_blk_free)
						{
							assert(blkhist->blk_num == curblk);
							assert(blkhist->buffaddr == blkBase);
							blkhist->tn      = curr_tn;
							alt_hist.h[1].blk_num = 0;
						}
						/* Also need to pass the bitmap as history to detect if any concurrent M-kill
						 * is freeing up the same USED block that we are trying to convert OR if any
						 * concurrent M-set is reusing the same RECYCLED block that we are trying to
						 * convert. Because of t_end currently not being able to validate a bitmap
						 * without that simultaneously having a cse, we need to create a cse for the
						 * bitmap that is used only for bitmap history validation, but should not be
						 * used to update the contents of the bitmap block in bg_update.
						 */
						bmlhist.buffaddr = t_qread(curbmp, (sm_int_ptr_t)&bmlhist.cycle, &bmlhist.cr);
						if (NULL == bmlhist.buffaddr)
						{
							t_retry((enum cdb_sc)rdfail_detail);
							continue;
						}
						bmlhist.blk_num = curbmp;
						bmlhist.tn = curr_tn;
						GET_BM_STATUS(bmlhist.buffaddr, lcnt, bml_status);
						if (BLK_MAPINVALID == bml_status)
						{
							t_retry(cdb_sc_lostbmlcr);
							continue;
						}
						if (!mark_blk_free)
						{
							if ((new_db_format != ondsk_blkver) && (BLK_FREE != bml_status))
							{	/* block still needs to be converted. create cse */
								BLK_INIT(bs_ptr, bs1);
								BLK_SEG(bs_ptr, blkBase + SIZEOF(new_hdr),
									new_hdr.bsiz - SIZEOF(new_hdr));
								BLK_FINI(bs_ptr, bs1);
								t_write(blkhist, (unsigned char *)bs1, 0, 0,
									((blk_hdr_ptr_t)blkBase)->levl, FALSE,
									FALSE, GDS_WRITE_PLAIN);
								/* The directory tree status for now is only used to determine
								 * whether writing the block to snapshot file (see t_end_sysops.c).
 								 * For reorg upgrade/downgrade process, the block is updated in a
								 * sequential way without changing the gv_target. In this case, we
								 * assume the block is in directory tree so as to have it written to
								 * the snapshot file.
			 					 */
								BIT_SET_DIR_TREE(cw_set[cw_set_depth-1].blk_prior_state);
								/* reset update_trans in case previous retry had set it to 0 */
								update_trans = UPDTRNS_DB_UPDATED_MASK;
								if (BLK_RECYCLED == bml_status)
								{	/* If block that we are upgarding is RECYCLED, indicate to
									 * bg_update that blks_to_upgrd counter should NOT be
									 * touched in this case by setting "mode" to a special value
									 */
									assert(cw_set[cw_set_depth-1].mode == gds_t_write);
									cw_set[cw_set_depth-1].mode = gds_t_write_recycled;
									/* we SET block as NOT RECYCLED, otherwise, the mm_update()
									 * or bg_update_phase2 may skip writing it to snapshot file
									 * when its level is 0
									 */
									BIT_CLEAR_RECYCLED(cw_set[cw_set_depth-1].blk_prior_state);
								}
							} else
							{	/* Block got converted by another process since we did the dsk_read.
								 * 	or this block became marked free in the bitmap.
								 * No need to update this block. just call t_end for validation of
								 * 	both the non-bitmap block as well as the bitmap block.
								 * Note down that this transaction is no longer updating any blocks.
								 */
								update_trans = 0;
							}
							/* Need to put bit maps on the end of the cw set for concurrency checking.
							 * We want to simulate t_write_map, except we want to update "cw_map_depth"
							 * instead of "cw_set_depth". Hence the save and restore logic below.
							 * This part of the code is similar to the one in mu_swap_blk.c
							 */
							save_cw_set_depth = cw_set_depth;
							assert(!cw_map_depth);
							t_write_map(&bmlhist, NULL, curr_tn, 0); /* will increment cw_set_depth */
							cw_map_depth = cw_set_depth; /* set cw_map_depth to latest cw_set_depth */
							cw_set_depth = save_cw_set_depth;/* restore cw_set_depth */
							/* t_write_map simulation end */
						} else
						{
							if (BLK_RECYCLED != bml_status)
							{	/* Block was RECYCLED at beginning but no longer so. Retry */
								t_retry(cdb_sc_bmlmod);
								continue;
							}
							/* Mark recycled block as FREE in bitmap */
							assert(lcnt == (curblk - curbmp));
							assert(update_array_ptr == update_array);
							*((block_id *)update_array_ptr) = lcnt;
							update_array_ptr += SIZEOF(block_id);
							/* the following assumes SIZEOF(block_id) == SIZEOF(int) */
							assert(SIZEOF(block_id) == SIZEOF(int));
							*(int *)update_array_ptr = 0;
							t_write_map(&bmlhist, (unsigned char *)update_array, curr_tn, 0);
							update_trans = UPDTRNS_DB_UPDATED_MASK;
						}
					}
					assert(SIZEOF(lcl_update_trans) == SIZEOF(update_trans));
					lcl_update_trans = update_trans;	/* take a copy before t_end modifies it */
					if ((trans_num)0 != t_end(&alt_hist, NULL, TN_NOT_SPECIFIED))
					{	/* In case this is MM and t_end() remapped an extended database, reset csd */
						assert(csd == cs_data);
						if (!lcl_update_trans)
						{
							assert(lcnt);
							assert(!mark_blk_free);
							assert((new_db_format == ondsk_blkver) || (BLK_BUSY != bml_status));
							if (BLK_BUSY != bml_status)
								reorg_stats.blks_skipped_free++;
							else
								reorg_stats.blks_skipped_newfmtincache++;
						} else if (!lcnt)
							reorg_stats.blks_converted_bmp++;
						else
							reorg_stats.blks_converted_nonbmp++;
						break;
					}
					assert(csd == cs_data);
				}
			}
		}
	stop_reorg_on_this_reg:
		/* even though ctrl-c occurred, update file-header fields to store reorg's progress before exiting */
		grab_crit(reg);
		blocks_left = 0;
		assert(csd->trans_hist.total_blks >= csd->blks_to_upgrd);
		actual_blks2upgrd = csd->blks_to_upgrd;
		total_blks = csd->trans_hist.total_blks;
		free_blks = csd->trans_hist.free_blocks;
		/* Care should be taken not to set "csd->reorg_upgrd_dwngrd_restart_block" in case of a concurrent db fmt
		 * change. This is because let us say we are doing REORG UPGRADE. A concurrent REORG DOWNGRADE would
		 * have reset "csd->reorg_upgrd_dwngrd_restart_block" field to 0 and if that reorg is interrupted by a
		 * Ctrl-C (before this reorg came here) it would have updated "csd->reorg_upgrd_dwngrd_restart_block" to
		 * a non-zero value indicating how many blocks from 0 have been downgraded. We should not reset this
		 * field to "curblk" as it will be mis-interpreted as the number of blocks that have been DOWNgraded.
		 */
		set_fully_upgraded = FALSE;
		if (mu_reorg_upgrd_dwngrd_start_tn != csd->desired_db_format_tn)
		{	/* csd->desired_db_format changed since reorg started. discontinue the reorg */
			util_out_print("Region !AD : Desired DB Format changed during REORG. Stopping REORG.",
				TRUE, REG_LEN_STR(reg));
			status1 = ERR_MUNOFINISH;
		} else if (reorg_entiredb)
		{	/* Change "csd->reorg_upgrd_dwngrd_restart_block" only if STARTBLK or STOPBLK was NOT specified */
			assert(csd->reorg_upgrd_dwngrd_restart_block <= curblk);
			csd->reorg_upgrd_dwngrd_restart_block = curblk;	/* blocks lesser than this have been upgraded/downgraded */
			expected_blks2upgrd = upgrade ? 0 : (total_blks - free_blks);
			blocks_left = upgrade ? actual_blks2upgrd : (expected_blks2upgrd - actual_blks2upgrd);
			/* If this reorg command went through all blocks in the database, then it should have
			 * 	correctly concluded at this point whether the reorg is complete or not.
			 * If this reorg command started from where a previous incomplete reorg left
			 *	(i.e. first_reorg_in_this_db_fmt is FALSE), it cannot determine if the initial
			 *	GDS blocks that it skipped are completely {up,down}graded or not.
			 */
			assert((0 == blocks_left) || (SS_NORMAL != status1) || !first_reorg_in_this_db_fmt);
			/* If this is a MUPIP REORG UPGRADE that did go through every block in the database (indicated by
			 * "reorg_entiredb" && "first_reorg_in_this_db_fmt") and the current count of "blks_to_upgrd" is
			 * 0 in the file-header and the desired_db_format did not change since the start of the REORG,
			 * we can be sure that the entire database has been upgraded. Set "csd->fully_upgraded" to TRUE.
			 */
			if ((SS_NORMAL == status1) && first_reorg_in_this_db_fmt && upgrade && (0 == actual_blks2upgrd))
			{
				csd->fully_upgraded = TRUE;
				csd->db_got_to_v5_once = TRUE;
				set_fully_upgraded = TRUE;
			}
			/* flush all changes noted down in the file-header */
			if (!wcs_flu(WCSFLU_FLUSH_HDR))	/* wcs_flu assumes gv_cur_region is set (which it is in this routine) */
			{
				gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(6) ERR_BUFFLUFAILED, 4,
					LEN_AND_LIT("MUPIP REORG UPGRADE/DOWNGRADE"), DB_LEN_STR(reg));
				status = ERR_MUNOFINISH;
				rel_crit(reg);
				continue;
			}
		}
		curr_tn = csd->trans_hist.curr_tn;
		rel_crit(reg);
		util_out_print("Region !AD : Stopped processing at block number [0x!XL]", TRUE, REG_LEN_STR(reg), curblk);
		/* Print statistics */
		util_out_print("Region !AD : Statistics : Blocks Read From Disk (Bitmap)     : 0x!XL",
			TRUE, REG_LEN_STR(reg), reorg_stats.blks_read_from_disk_bmp);
		util_out_print("Region !AD : Statistics : Blocks Skipped (Free)              : 0x!XL",
			TRUE, REG_LEN_STR(reg), reorg_stats.blks_skipped_free);
		util_out_print("Region !AD : Statistics : Blocks Read From Disk (Non-Bitmap) : 0x!XL",
			TRUE, REG_LEN_STR(reg), reorg_stats.blks_read_from_disk_nonbmp);
		util_out_print("Region !AD : Statistics : Blocks Skipped (new fmt in disk)   : 0x!XL",
			TRUE, REG_LEN_STR(reg), reorg_stats.blks_skipped_newfmtindisk);
		util_out_print("Region !AD : Statistics : Blocks Skipped (new fmt in cache)  : 0x!XL",
			TRUE, REG_LEN_STR(reg), reorg_stats.blks_skipped_newfmtincache);
		util_out_print("Region !AD : Statistics : Blocks Converted (Bitmap)          : 0x!XL",
			TRUE, REG_LEN_STR(reg), reorg_stats.blks_converted_bmp);
		util_out_print("Region !AD : Statistics : Blocks Converted (Non-Bitmap)      : 0x!XL",
			TRUE, REG_LEN_STR(reg), reorg_stats.blks_converted_nonbmp);
		if (reorg_entiredb && (SS_NORMAL == status1) && (0 != blocks_left))
		{	/* file-header counter does not match what reorg on the entire database expected to see */
			gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(4) ERR_DBBTUWRNG, 2, expected_blks2upgrd, actual_blks2upgrd);
			util_out_print("Region !AD : Run MUPIP INTEG (without FAST qualifier) to fix the counter",
				TRUE, REG_LEN_STR(reg));
			status1 = ERR_MUNOFINISH;
		} else
			util_out_print("Region !AD : Total Blocks = [0x!XL] : Free Blocks = [0x!XL] : "
				       "Blocks to upgrade = [0x!XL]",
				       TRUE, REG_LEN_STR(reg), total_blks, free_blks, actual_blks2upgrd);
		/* Issue success or failure message for this region */
		if (SS_NORMAL == status1)
		{	/* issue success only if REORG did not encounter any error in its processing */
			if (set_fully_upgraded)
				util_out_print("Region !AD : Database is now FULLY UPGRADED", TRUE, REG_LEN_STR(reg));
			util_out_print("Region !AD : MUPIP REORG !AD finished!/", TRUE, REG_LEN_STR(reg), LEN_AND_STR(command));
			send_msg_csa(CSA_ARG(csa) VARLSTCNT(7) ERR_MUREUPDWNGRDEND, 5, REG_LEN_STR(reg),
										process_id, process_id, &curr_tn);
		} else
		{
			assert(ERR_MUNOFINISH == status1);
			assert((SS_NORMAL == status) || (ERR_MUNOFINISH == status));
			util_out_print("Region !AD : MUPIP REORG !AD incomplete. See above messages.!/",
					TRUE, REG_LEN_STR(reg), LEN_AND_STR(command));
			status = status1;
		}
	}
	if (NULL != bptr)
		free(bptr);
	if (NULL != bml_lcl_buff)
		free(bml_lcl_buff);
	if (mu_ctrly_occurred || mu_ctrlc_occurred)
	{
		gtm_putmsg_csa(CSA_ARG(NULL) VARLSTCNT(1) ERR_REORGCTRLY);
		status = ERR_MUNOFINISH;
	}
	mupip_exit(status);
}
Exemplo n.º 4
0
sm_uc_ptr_t t_qread(block_id blk, sm_int_ptr_t cycle, cache_rec_ptr_ptr_t cr_out)
	/* cycle is used in t_end to detect if the buffer has been refreshed since the t_qread */
{
	int4			status;
	uint4			blocking_pid;
	cache_rec_ptr_t		cr;
	bt_rec_ptr_t		bt;
	boolean_t		clustered, hold_onto_crit, was_crit, issued_db_init_crypt_warning, sync_needed;
	int			dummy, lcnt, ocnt;
	cw_set_element		*cse;
	off_chain		chain1;
	register sgmnt_addrs	*csa;
	register sgmnt_data_ptr_t	csd;
	enum db_ver		ondsk_blkver;
	int4			dummy_errno, gtmcrypt_errno;
	boolean_t		already_built, is_mm, reset_first_tp_srch_status, set_wc_blocked, sleep_invoked;
	ht_ent_int4		*tabent;
	srch_blk_status		*blkhist;
	trans_num		dirty, blkhdrtn;
	sm_uc_ptr_t		buffaddr;
	uint4			stuck_cnt = 0;
	boolean_t		lcl_blk_free;
	node_local_ptr_t	cnl;
	gd_segment		*seg;
	uint4			buffs_per_flush, flush_target;
	enc_info_t		*encr_ptr;
	DCL_THREADGBL_ACCESS;

	SETUP_THREADGBL_ACCESS;
	lcl_blk_free = block_is_free;
	block_is_free = FALSE;	/* Reset to FALSE so that if t_qread fails below, we don't have an incorrect state of this var */
	first_tp_srch_status = NULL;
	reset_first_tp_srch_status = FALSE;
	csa = cs_addrs;
	csd = csa->hdr;
	INCR_DB_CSH_COUNTER(csa, n_t_qreads, 1);
	is_mm = (dba_mm == csd->acc_meth);
	/* We better hold crit in the final retry (TP & non-TP). Only exception is journal recovery */
	assert((t_tries < CDB_STAGNATE) || csa->now_crit || mupip_jnl_recover);
	if (dollar_tlevel)
	{
		assert(sgm_info_ptr);
		if (0 != sgm_info_ptr->cw_set_depth)
		{
			chain1 = *(off_chain *)&blk;
			if (1 == chain1.flag)
			{
				assert(sgm_info_ptr->cw_set_depth);
				if ((int)chain1.cw_index < sgm_info_ptr->cw_set_depth)
					tp_get_cw(sgm_info_ptr->first_cw_set, (int)chain1.cw_index, &cse);
				else
				{
					assert(FALSE == csa->now_crit);
					rdfail_detail = cdb_sc_blknumerr;
					return (sm_uc_ptr_t)NULL;
				}
			} else
			{
				if (NULL != (tabent = lookup_hashtab_int4(sgm_info_ptr->blks_in_use, (uint4 *)&blk)))
					first_tp_srch_status = tabent->value;
				else
					first_tp_srch_status = NULL;
				ASSERT_IS_WITHIN_TP_HIST_ARRAY_BOUNDS(first_tp_srch_status, sgm_info_ptr);
				cse = first_tp_srch_status ? first_tp_srch_status->cse : NULL;
			}
			assert(!cse || !cse->high_tlevel);
			assert(!chain1.flag || cse);
			if (cse)
			{	/* transaction has modified the sought after block  */
				if ((gds_t_committed != cse->mode) || (n_gds_t_op < cse->old_mode))
				{	/* Changes have not been committed to shared memory, i.e. still in private memory.
					 * Build block in private buffer if not already done and return the same.
					 */
					assert(gds_t_writemap != cse->mode);
					if (FALSE == cse->done)
					{	/* out of date, so make it current */
						assert(gds_t_committed != cse->mode);
						already_built = (NULL != cse->new_buff);
						/* Validate the block's search history right after building a private copy.
						 * This is not needed in case gvcst_search is going to reuse the clue's search
						 * history and return (because tp_hist will do the validation of this block).
						 * But if gvcst_search decides to do a fresh traversal (because the clue does not
						 * cover the path of the current input key etc.) the block build that happened now
						 * will not get validated in tp_hist since it will instead be given the current
						 * key's search history path (a totally new path) for validation. Since a private
						 * copy of the block has been built, tp_tend would also skip validating this block
						 * so it is necessary that we validate the block right here. Since it is tricky to
						 * accurately differentiate between the two cases, we do the validation
						 * unconditionally here (besides it is only a few if checks done per block build
						 * so it is considered okay performance-wise).
						 */
						gvcst_blk_build(cse, (uchar_ptr_t)cse->new_buff, 0);
						assert(NULL != cse->blk_target);
						if (!already_built && !chain1.flag)
						{
							buffaddr = first_tp_srch_status->buffaddr;
							cr = first_tp_srch_status->cr;
							assert((is_mm || cr) && buffaddr);
							blkhdrtn = ((blk_hdr_ptr_t)buffaddr)->tn;
							if (TP_IS_CDB_SC_BLKMOD3(cr, first_tp_srch_status, blkhdrtn))
							{
								assert(CDB_STAGNATE > t_tries);
								rdfail_detail = cdb_sc_blkmod;	/* should this be something else */
								TP_TRACE_HIST_MOD(blk, gv_target, tp_blkmod_t_qread, cs_data,
									first_tp_srch_status->tn, blkhdrtn,
									((blk_hdr_ptr_t)buffaddr)->levl);
								return (sm_uc_ptr_t)NULL;
							}
							if (!is_mm && ((first_tp_srch_status->cycle != cr->cycle)
										|| (first_tp_srch_status->blk_num != cr->blk)))
							{
								assert(CDB_STAGNATE > t_tries);
								rdfail_detail = cdb_sc_lostcr; /* should this be something else */
								return (sm_uc_ptr_t)NULL;
							}
						}
						cse->done = TRUE;
					}
					*cycle = CYCLE_PVT_COPY;
					*cr_out = 0;
					return (sm_uc_ptr_t)cse->new_buff;
				} else
				{	/* Block changes are already committed to shared memory (possible if we are in TP
					 * in the 2nd phase of M-Kill in gvcst_expand_free_subtree.c). In this case, read
					 * block from shared memory; do not look at private memory (i.e. cse) as that might
					 * not be as uptodate as shared memory.
					 */
					assert(csa->now_crit);	/* gvcst_expand_free_subtree does t_qread in crit */
					/* If this block was newly created as part of the TP transaction, it should not be killed
					 * as part of the 2nd phase of M-kill. This is because otherwise the block's cse would
					 * have had an old_mode of kill_t_create in which case we would not have come into this
					 * else block. Assert accordingly.
					 */
					assert(!chain1.flag);
					first_tp_srch_status = NULL;	/* do not use any previous srch_hist information */
				}
			}
		} else
		{
			if (NULL != (tabent = lookup_hashtab_int4(sgm_info_ptr->blks_in_use, (uint4 *)&blk)))
				first_tp_srch_status = tabent->value;
			else
				first_tp_srch_status = NULL;
		}
		ASSERT_IS_WITHIN_TP_HIST_ARRAY_BOUNDS(first_tp_srch_status, sgm_info_ptr);
		if (!is_mm && first_tp_srch_status)
		{
			cr = first_tp_srch_status->cr;
			assert(cr && !first_tp_srch_status->cse);
			if (first_tp_srch_status->cycle == cr->cycle)
			{
				*cycle = first_tp_srch_status->cycle;
				*cr_out = cr;
				cr->refer = TRUE;
				if (CDB_STAGNATE <= t_tries)	/* mu_reorg doesn't use TP else should have an || for that */
					CWS_INSERT(blk);
				return (sm_uc_ptr_t)first_tp_srch_status->buffaddr;
			} else
			{	/* Block was already part of the read-set of this transaction, but got recycled in the cache.
				 * Allow block recycling by resetting first_tp_srch_status for this blk to reflect the new
				 * buffer, cycle and cache-record. tp_hist (invoked much later) has validation checks to detect
				 * if block recycling happened within the same mini-action and restart in that case.
				 * Updating first_tp_srch_status has to wait until the end of t_qread since only then do we know
				 * the values to update to. Set a variable that will enable the updation before returning.
				 * Also assert that if we are in the final retry, we are never in a situation where we have a
				 * block that got recycled since the start of the current mini-action. This is easily detected since
				 * as part of the final retry we maintain a hash-table "cw_stagnate" that holds the blocks that
				 * have been read as part of the current mini-action until now.
				 */
				assert(CDB_STAGNATE > t_tries || (NULL == lookup_hashtab_int4(&cw_stagnate, (uint4 *)&blk)));
				reset_first_tp_srch_status = TRUE;
			}
		}
	}
	if ((uint4)blk >= (uint4)csa->ti->total_blks)
	{	/* Requested block out of range; could occur because of a concurrency conflict. mm_read and dsk_read assume blk is
		 * never negative or greater than the maximum possible file size. If a concurrent REORG truncates the file, t_qread
		 * can proceed despite blk being greater than total_blks. But dsk_read handles this fine; see comments below.
		 */
		assert((&FILE_INFO(gv_cur_region)->s_addrs == csa) && (csd == cs_data));
		assert(!csa->now_crit);
		rdfail_detail = cdb_sc_blknumerr;
		return (sm_uc_ptr_t)NULL;
	}
	if (is_mm)
	{
		*cycle = CYCLE_SHRD_COPY;
		*cr_out = 0;
		return (sm_uc_ptr_t)(mm_read(blk));
	}
	was_crit = csa->now_crit;
	cnl = csa->nl;
	encr_ptr = csa->encr_ptr;
	if (NULL != encr_ptr)
	{
		/* If this is an encrypted database and we hold crit, make sure our private cycle matches the shared cycle.
		 * Or else we would need to call "process_reorg_encrypt_restart" below (a heavyweight operation) holding crit.
		 */
		assert(!was_crit || (cnl->reorg_encrypt_cycle == encr_ptr->reorg_encrypt_cycle));
		seg = gv_cur_region->dyn.addr;
		issued_db_init_crypt_warning = encr_ptr->issued_db_init_crypt_warning;
		if (!IS_BITMAP_BLK(blk) && issued_db_init_crypt_warning)
		{	/* A non-GT.M process is attempting to read a non-bitmap block, yet it has previously encountered an error
			 * during db_init (because it did not have access to the encryption keys) and reported it with a -W-
			 * severity. Since the block it is attempting to read can be in the unencrypted shared memory (read from
			 * disk by another process with access to the encryption keys), we cannot let it access it without a valid
			 * handle, so issue an rts_error.
			 *
			 * TODO: DSE and LKE could bypass getting the ftok semaphore. LKE is not an issue, but DSE does care about
			 *       the csa->reorg_encrypt_cycle. So it means DSE could get an inconsistent copy of reorg_encrypt_cycle
			 *       and associated hashes if it had done a bypass and a concurrent REORG -ENCRYPT is holding the ftok
			 *       semaphore and changing these values at the same time.
			 */
			assert(!IS_GTM_IMAGE);	/* GT.M would have error'ed out in db_init */
			gtmcrypt_errno = SET_REPEAT_MSG_MASK(SET_CRYPTERR_MASK(ERR_CRYPTBADCONFIG));
			GTMCRYPT_REPORT_ERROR(gtmcrypt_errno, rts_error, seg->fname_len, seg->fname);
		} else if (cnl->reorg_encrypt_cycle != encr_ptr->reorg_encrypt_cycle)
		{	/* A concurrent MUPIP REORG ENCRYPT occurred. Cannot proceed with the read even if the block is
			 * already loaded from disk into the unencrypted global buffers (security issue). Need to load the
			 * new encryption keys and only let those processes which are able to successfully do this proceed
			 * with the read. First, copy the key hashes from csd into csa->encr_ptr. That needs crit
			 * to ensure a concurrent MUPIP REORG ENCRYPT does not sneak in.
			 *
			 * Note: Even though we asserted a few lines above that if "was_crit" is TRUE, then we expect
			 * the encryption cycles to be in sync, we handle this out-of-design situation in "pro" by fixing
			 * the cycles while holding crit (hopefully rare case so it is okay to hold crit for a heavyweight call).
			 */
			if (!was_crit)
				grab_crit(gv_cur_region);
			/* Now that we have crit, sync them up by copying the new keys inside crit and opening the key handles
			 * outside crit (a potentially long running operation).
			 */
			SIGNAL_REORG_ENCRYPT_RESTART(mu_reorg_encrypt_in_prog, reorg_encrypt_restart_csa,
					cnl, csa, csd, rdfail_detail, process_id);
			assert(csa == reorg_encrypt_restart_csa);
			if (!was_crit)
				rel_crit(gv_cur_region);
			/* If we are inside a TP read-write transaction, it is possible we already used the old keys for
			 * prior calls to "jnl_format" so we have to restart (cannot sync up cycles). Do the same for
			 * TP read-only transaction as well as NON-TP read-write transaction. In all these cases we know
			 * the caller is capable of restarting. All other cases we dont know if the caller is capable so
			 * sync up the cycles and proceed using the new keys for the read.
			 *
			 * But since it is possible the caller does not call t_retry right away (e.g. mupip reorg which can
			 * choose to abandone this tree path and move on to another block without aborting this transaction)
			 * it is better we finish the pending call to "process_reorg_encrypt_restart" right here before returning.
			 */
			process_reorg_encrypt_restart();
			assert(NULL == reorg_encrypt_restart_csa);
			if (IS_NOT_SAFE_TO_SYNC_NEW_KEYS(dollar_tlevel, update_trans))
			{
				assert(cdb_sc_reorg_encrypt == rdfail_detail);	/* set by SIGNAL_REORG_ENCRYPT_RESTART macro */
				return (sm_uc_ptr_t)NULL;
			}
		}
	}
	assert(dba_bg == csd->acc_meth);
	assert(!first_tp_srch_status || !first_tp_srch_status->cr
					|| first_tp_srch_status->cycle != first_tp_srch_status->cr->cycle);
	if (FALSE == (clustered = csd->clustered))
		bt = NULL;
	ocnt = 0;
	set_wc_blocked = FALSE;	/* to indicate whether cnl->wc_blocked was set to TRUE by us */
	hold_onto_crit = csa->hold_onto_crit;	/* note down in local to avoid csa-> dereference in multiple usages below */
	do
	{
		if (NULL == (cr = db_csh_get(blk)))
		{	/* not in memory */
			if (clustered && (NULL != (bt = bt_get(blk))) && (FALSE == bt->flushing))
				bt = NULL;
			if (!csa->now_crit)
			{
				assert(!hold_onto_crit);
				if (NULL != bt)
				{	/* at this point, bt is not NULL only if clustered and flushing - wait no crit */
					assert(clustered);
					wait_for_block_flush(bt, blk);	/* try for no other node currently writing the block */
				}
				/* assume defaults for flush_target and buffs_per_flush */
				flush_target = csd->flush_trigger;
				buffs_per_flush = 0;
				if ((0 != csd->epoch_taper) && (FALSE == gv_cur_region->read_only) && JNL_ENABLED(csd) &&
						(0 != cnl->wcs_active_lvl) && (NOJNL != csa->jnl->channel) &&
						(0 != cnl->jnl_file.u.inode) && csd->jnl_before_image)
				{
					EPOCH_TAPER_IF_NEEDED(csa, csd, cnl, (gd_region *) 0, FALSE, buffs_per_flush, flush_target);
				}
				if ((flush_target <= cnl->wcs_active_lvl) && (FALSE == gv_cur_region->read_only))
					JNL_ENSURE_OPEN_WCS_WTSTART(csa, gv_cur_region, buffs_per_flush, dummy_errno);
						/* a macro that dclast's "wcs_wtstart" and checks for errors etc. */
				/* Get crit but also ensure encryption cycles are in sync ("dsk_read" relies on this).
				 * Note: "sync_needed" should be TRUE very rarely since we synced the cycles just a few lines
				 * above. But in case a MUPIP REORG ENCRYPT concurrently sneaked in between these lines we
				 * need to resync.
				 */
				sync_needed = grab_crit_encr_cycle_sync(gv_cur_region);
				assert(NULL == reorg_encrypt_restart_csa);
				assert(!sync_needed || (NULL != encr_ptr));
				if (sync_needed && IS_NOT_SAFE_TO_SYNC_NEW_KEYS(dollar_tlevel, update_trans))
				{
					assert(cnl->reorg_encrypt_cycle == encr_ptr->reorg_encrypt_cycle);
					rel_crit(gv_cur_region);
					rdfail_detail = cdb_sc_reorg_encrypt;	/* set by SIGNAL_REORG_ENCRYPT_RESTART macro */
					return (sm_uc_ptr_t)NULL;
				}
				cr = db_csh_get(blk);			/* in case blk arrived before crit */
			}
			if (clustered && (NULL != (bt = bt_get(blk))) && (TRUE == bt->flushing))
			{	/* Once crit, need to assure that if clustered, that flushing is [still] complete
				 * If it isn't, we missed an entire WM cycle and have to wait for another node to finish */
				wait_for_block_flush(bt, blk);	/* ensure no other node currently writing the block */
			}
			if (NULL == cr)
			{	/* really not in memory - must get a new buffer */
				assert(csa->now_crit);
				cr = db_csh_getn(blk);
				if (CR_NOTVALID == (sm_long_t)cr)
				{
					assert(cnl->wc_blocked); /* only reason we currently know wcs_get_space could fail */
					assert(gtm_white_box_test_case_enabled);
					SET_TRACEABLE_VAR(cnl->wc_blocked, TRUE);
					BG_TRACE_PRO_ANY(csa, wc_blocked_t_qread_db_csh_getn_invalid_blk);
					set_wc_blocked = TRUE;
					break;
				}
				assert(0 <= cr->read_in_progress);
				*cycle = cr->cycle;
				cr->tn = csd->trans_hist.curr_tn;
				/* Record history of most recent disk reads only in dbg builds for now. Although the macro
				 * is just a couple dozen instructions, it is done while holding crit so we want to avoid
				 * delaying crit unless really necessary. Whoever wants this information can enable it
				 * by a build change to remove the DEBUG_ONLY part below.
				 */
				DEBUG_ONLY(DSKREAD_TRACE(csa, GDS_ANY_ABS2REL(csa,cr), cr->tn, process_id, blk, cr->cycle);)
				if (!was_crit && !hold_onto_crit)
					rel_crit(gv_cur_region);
				/* read outside of crit may be of a stale block but should be detected by t_end or tp_tend */
				assert(0 == cr->dirty);
				assert(cr->read_in_progress >= 0);
				CR_BUFFER_CHECK(gv_cur_region, csa, csd, cr);
				buffaddr = (sm_uc_ptr_t)GDS_REL2ABS(cr->buffaddr);
#				ifdef DEBUG
				/* stop self to test sechshr_db_clnup clears the read state */
				if (gtm_white_box_test_case_enabled
					&& (WBTEST_SIGTSTP_IN_T_QREAD == gtm_white_box_test_case_number))
				{	/* this should never fail, but because of the way we developed the test we got paranoid */
					dummy = kill(process_id, SIGTERM);
					assert(0 == dummy);
					for (dummy = 10; dummy; dummy--)
						LONG_SLEEP(10); /* time for sigterm to take hit before we clear block_now_locked */
				}
#				endif
				if (SS_NORMAL != (status = dsk_read(blk, buffaddr, &ondsk_blkver, lcl_blk_free)))
				{
					/* buffer does not contain valid data, so reset blk to be empty */
					cr->cycle++;	/* increment cycle for blk number changes (for tp_hist and others) */
					cr->blk = CR_BLKEMPTY;
					cr->r_epid = 0;
					RELEASE_BUFF_READ_LOCK(cr);
					TREF(block_now_locked) = NULL;
					assert(-1 <= cr->read_in_progress);
					assert(was_crit == csa->now_crit);
					if (ERR_DYNUPGRDFAIL == status)
					{	/* if we dont hold crit on the region, it is possible due to concurrency conflicts
						 * that this block is unused (i.e. marked free/recycled in bitmap, see comments in
						 * gds_blk_upgrade.h). in this case we should not error out but instead restart.
						 */
						if (was_crit)
						{
							assert(FALSE);
							rts_error_csa(CSA_ARG(csa) VARLSTCNT(5) status, 3, blk,
									DB_LEN_STR(gv_cur_region));
						} else
						{
							rdfail_detail = cdb_sc_lostcr;
							return (sm_uc_ptr_t)NULL;
						}
					}
					if ((-1 == status) && !was_crit)
					{	/* LSEEKREAD and, consequently, dsk_read return -1 in case pread is unable to fetch
						 * a full database block's length of data. This can happen if the requested read is
						 * past the end of the file, which can happen if a concurrent truncate occurred
						 * after the blk >= csa->ti->total_blks comparison above. Allow for this scenario
						 * by restarting. However, if we've had crit the whole time, no truncate could have
						 * happened. -1 indicates a problem with the file, so fall through to DBFILERR.
						 */
						rdfail_detail = cdb_sc_truncate;
						return (sm_uc_ptr_t)NULL;
					} else if (IS_CRYPTERR_MASK(status))
					{
						seg = gv_cur_region->dyn.addr;
						GTMCRYPT_REPORT_ERROR(status, rts_error, seg->fname_len, seg->fname);
					}
					else
					{	/* A DBFILERR can be thrown for two possible reasons:
						 * (1) LSEEKREAD returned an unexpected error due to a filesystem problem; or
						 * (2) csa/cs_addrs/csd/cs_data are out of sync, and we're trying to read a block
						 * number for one region from another region with fewer total_blks.
						 *    We suspect the former is what happened in GTM-7623. Apparently the latter
						 * has been an issue before, too. If either occurs again in pro, this assertpro
						 * distinguishes the two possibilities.
						 */
						assertpro((&FILE_INFO(gv_cur_region)->s_addrs == csa) && (csd == cs_data));
						rts_error_csa(CSA_ARG(csa) VARLSTCNT(5) ERR_DBFILERR, 2, DB_LEN_STR(gv_cur_region),
								status);
					}
				}
				disk_blk_read = TRUE;
				assert(0 <= cr->read_in_progress);
				assert(0 == cr->dirty);
				/* Only set in cache if read was success */
				cr->ondsk_blkver = (lcl_blk_free ? GDSVCURR : ondsk_blkver);
				cr->r_epid = 0;
				RELEASE_BUFF_READ_LOCK(cr);
				TREF(block_now_locked) = NULL;
				assert(-1 <= cr->read_in_progress);
				*cr_out = cr;
				assert(was_crit == csa->now_crit);
				if (reset_first_tp_srch_status)
					RESET_FIRST_TP_SRCH_STATUS(first_tp_srch_status, cr, *cycle);
				return buffaddr;
			} else  if (!was_crit && (BAD_LUCK_ABOUNDS > ocnt))
			{
				assert(!hold_onto_crit);
				assert(TRUE == csa->now_crit);
				assert(cnl->in_crit == process_id);
				rel_crit(gv_cur_region);
			}
		}