Example no. 1
bt_rec_ptr_t bt_put(gd_region *reg, int4 block)
{
	bt_rec_ptr_t		bt, q0, q1, hdr;
	sgmnt_addrs		*csa;
	sgmnt_data_ptr_t	csd;
	cache_rec_ptr_t		cr;
	th_rec_ptr_t		th;
	trans_num		lcl_tn;
	uint4			lcnt;

	csa = (sgmnt_addrs *)&FILE_INFO(reg)->s_addrs;
	csd = csa->hdr;
	assert(csa->now_crit || csd->clustered);
	assert(dba_mm != csa->hdr->acc_meth);
	lcl_tn = csa->ti->curr_tn;
	hdr = csa->bt_header + (block % csd->bt_buckets);
	assert(BT_QUEHEAD == hdr->blk);
	for (lcnt = 0, bt = (bt_rec_ptr_t)((sm_uc_ptr_t)hdr + hdr->blkque.fl);  ;
		bt = (bt_rec_ptr_t)((sm_uc_ptr_t)bt + bt->blkque.fl), lcnt++)
	{
		if (BT_QUEHEAD == bt->blk)
		{	/* there is no matching bt */
			assert(bt == hdr);
			bt = (bt_rec_ptr_t)((sm_uc_ptr_t)(csa->th_base) + csa->th_base->tnque.fl - SIZEOF(th->tnque));
			if (CR_NOTVALID != bt->cache_index)
			{	/* the oldest bt is still valid */
				assert(!in_wcs_recover);
				cr = (cache_rec_ptr_t)GDS_ANY_REL2ABS(csa, bt->cache_index);
				if (cr->dirty)
				{	/* get it written so it can be reused */
					BG_TRACE_PRO_ANY(csa, bt_put_flush_dirty);
					if (FALSE == wcs_get_space(reg, 0, cr))
					{
						assert(csa->nl->wc_blocked);	/* only reason we currently know
										 * why wcs_get_space could fail */
						assert(gtm_white_box_test_case_enabled);
						BG_TRACE_PRO_ANY(csa, wcb_bt_put);
						send_msg(VARLSTCNT(8) ERR_WCBLOCKED, 6, LEN_AND_LIT("wcb_bt_put"),
							process_id, &lcl_tn, DB_LEN_STR(reg));
						return NULL;
					}
				}
				bt->cache_index = CR_NOTVALID;
				cr->bt_index = 0;
			}
			q0 = (bt_rec_ptr_t)((sm_uc_ptr_t)bt + bt->blkque.fl);
			q1 = (bt_rec_ptr_t)remqt((que_ent_ptr_t)q0);
			if (EMPTY_QUEUE == (sm_long_t)q1)
				rts_error(VARLSTCNT(3) ERR_BTFAIL, 1, 1);
			bt->blk = block;
			bt->killtn = lcl_tn;
			insqt((que_ent_ptr_t)bt, (que_ent_ptr_t)hdr);
			th = (th_rec_ptr_t)remqh((que_ent_ptr_t)csa->th_base);
			if (EMPTY_QUEUE == (sm_long_t)th)
				GTMASSERT;
			break;
		}
		if (bt->blk == block)
		{	/* bt_put should never be called twice for the same block with the same lcl_tn. This is because
			 * t_end/tp_tend update every block only once as part of each update transaction. Assert this.
			 * The two exceptions are
			 *   a) Forward journal recovery which simulates a 2-phase M-kill where the same block
			 *	could get updated in both phases (e.g. a bitmap block gets updated for blocks created
			 *	within the TP transaction as well as for blocks that are freed up in the 2nd phase of
			 *	the M-kill) with the same transaction number. This is because although GT.M would have
			 *	updated the same block with different transaction numbers in the two phases, forward
			 *	recovery will update it with the same tn and instead increment the db tn on seeing the
			 *	following INCTN journal record(s).
			 *   b) Cache recovery (wcs_recover). It could call bt_put more than once for the same block
			 *	and potentially with the same tn. This is because the state of the queues is questionable
			 *	and there could be more than one cache record for a given block number.
			 */
			assert(in_wcs_recover || (bt->tn < lcl_tn) || (jgbl.forw_phase_recovery && !JNL_ENABLED(csa)));
			q0 = (bt_rec_ptr_t)((sm_uc_ptr_t)bt + bt->tnque.fl);
			th = (th_rec_ptr_t)remqt((que_ent_ptr_t)((sm_uc_ptr_t)q0 + SIZEOF(th->tnque)));
			if (EMPTY_QUEUE == (sm_long_t)th)
				GTMASSERT;
			break;
		}
		if (0 == bt->blkque.fl)
			rts_error(VARLSTCNT(3) ERR_BTFAIL, 1, 2);
		if (lcnt >= csd->n_bts)
			rts_error(VARLSTCNT(3) ERR_BTFAIL, 1, 3);
	}
	insqt((que_ent_ptr_t)th, (que_ent_ptr_t)csa->th_base);
	bt->tn = lcl_tn;
	return bt;
}
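
The queue discipline above reduces to a classic pattern: a hash-bucketed doubly linked chain for lookup by block number, plus a transaction-number (recency) queue threaded through the same entries so the oldest one can be repurposed on a miss. Below is a minimal, self-contained sketch of just that pattern. It is not the GT.M code (every name in it is invented); it uses ordinary pointers where the real block table stores shared-memory-relative offsets, and it omits the flushing and detaching of the evicted entry's cache record that bt_put performs above.

/* Minimal sketch of a hash-bucketed lookup chain plus a recency queue.
 * All names (bt_entry, bt_put_sketch, ...) are invented for illustration. */
#include <stddef.h>
#include <stdio.h>

#define NBUCKETS	4
#define NENTRIES	8

typedef struct link { struct link *fl, *bl; } link_t;	/* fl/bl as in blkque.fl */

typedef struct bt_entry
{
	link_t		blkque;	/* chain within one hash bucket */
	link_t		tnque;	/* position in the global recency order */
	int		blk;	/* block number this entry describes */
	unsigned long	tn;	/* transaction number of the last update */
} bt_entry;

#define ENTRY_OF(ptr, member)	((bt_entry *)((char *)(ptr) - offsetof(bt_entry, member)))

static link_t	bucket[NBUCKETS];	/* queue heads, like csa->bt_header */
static link_t	tnque_head;		/* like csa->th_base; .fl is the oldest entry */
static bt_entry	pool[NENTRIES];

static void que_init(link_t *q) { q->fl = q->bl = q; }

static void remq(link_t *e)		/* unlink from whichever queue holds it */
{
	e->bl->fl = e->fl;
	e->fl->bl = e->bl;
}

static void insqt(link_t *e, link_t *head)	/* append at the tail, like insqt() */
{
	e->fl = head;
	e->bl = head->bl;
	head->bl->fl = e;
	head->bl = e;
}

/* Return the entry for blk, repurposing the least recently updated one on a miss. */
static bt_entry *bt_put_sketch(int blk, unsigned long tn)
{
	link_t		*hdr = &bucket[blk % NBUCKETS], *l;
	bt_entry	*bt;

	for (l = hdr->fl; l != hdr; l = l->fl)	/* search the hash chain */
	{
		bt = ENTRY_OF(l, blkque);
		if (bt->blk == blk)
			goto found;
	}
	bt = ENTRY_OF(tnque_head.fl, tnque);	/* miss: evict the oldest entry */
	remq(&bt->blkque);			/* leave its old hash chain ... */
	insqt(&bt->blkque, hdr);		/* ... and join the right one */
	bt->blk = blk;
found:
	remq(&bt->tnque);			/* either way, this entry is now */
	insqt(&bt->tnque, &tnque_head);		/* the most recently updated one */
	bt->tn = tn;
	return bt;
}

int main(void)
{
	int	i;

	que_init(&tnque_head);
	for (i = 0; i < NBUCKETS; i++)
		que_init(&bucket[i]);
	for (i = 0; i < NENTRIES; i++)
	{	/* seed every entry as free; blk -1 matches no real block */
		pool[i].blk = -1;
		insqt(&pool[i].blkque, &bucket[0]);
		insqt(&pool[i].tnque, &tnque_head);
	}
	printf("blk 42 in slot %d\n", (int)(bt_put_sketch(42, 1) - pool));
	printf("blk 42 in slot %d\n", (int)(bt_put_sketch(42, 2) - pool));	/* hit: same slot */
	return 0;
}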
Example no. 2
cache_rec_ptr_t	db_csh_getn(block_id block)
{
	cache_rec_ptr_t		hdr, q0, start_cr, cr;
	bt_rec_ptr_t		bt;
	unsigned int		lcnt, ocnt;
	int			rip, max_ent, pass1, pass2, pass3;
	int4			flsh_trigger;
	uint4			r_epid, dummy;
	sgmnt_addrs		*csa;
	sgmnt_data_ptr_t	csd;
	srch_blk_status		*tp_srch_status;

	error_def(ERR_BUFRDTIMEOUT);
	error_def(ERR_INVALIDRIP);

	csa = cs_addrs;
	csd = csa->hdr;
	assert(csa->now_crit);
	assert(csa == &FILE_INFO(gv_cur_region)->s_addrs);
	max_ent = csd->n_bts;
	cr = (cache_rec_ptr_t)GDS_REL2ABS(csa->nl->cur_lru_cache_rec_off);
	hdr = csa->acc_meth.bg.cache_state->cache_array + (block % csd->bt_buckets);
	start_cr = csa->acc_meth.bg.cache_state->cache_array + csd->bt_buckets;
	pass1 = max_ent;	/* skip referred or dirty or read-into cache records */
	pass2 = 2 * max_ent;	/* skip referred cache records */
	pass3 = 3 * max_ent;	/* skip nothing */
	INCR_DB_CSH_COUNTER(csa, n_db_csh_getns, 1);
	for (lcnt = 0;  ; lcnt++)
	{
		if (lcnt > pass3)
		{
			BG_TRACE_PRO(wc_blocked_db_csh_getn_loopexceed);
			assert(FALSE);
			break;
		}
		cr++;
		if (cr == start_cr + max_ent)
			cr = start_cr;
		VMS_ONLY(
			if ((lcnt == pass1) || (lcnt == pass2))
				wcs_wtfini(gv_cur_region);
		)
		if (TRUE == cr->refer && lcnt < pass2)
		{	/* in passes 1 & 2, set refer to FALSE and skip; in the third pass attempt reuse even if TRUE == refer */
			cr->refer = FALSE;
			continue;
		}
		if (TRUE == cr->in_cw_set)
		{	/* this process already owns it - skip it */
			cr->refer = TRUE;
			continue;
		}
		if (CDB_STAGNATE <= t_tries || mu_reorg_process)
		{
			/* Prevent stepping on self when crit for entire transaction.
			 * This is done by looking up in sgm_info_ptr->blks_in_use and cw_stagnate for presence of the block.
			 * The following two hashtable lookups are not redundant, since in TP, sgm_info_ptr->blks_in_use
			 * 	is updated to the latest cw_stagnate list of blocks only in tp_hist().
			 * Also note that the lookup in sgm_info_ptr->blks_in_use reuses blocks that don't have cse's.
			 * This is to allow big-read TP transactions which may use up more than the available global buffers.
			 * There is one issue here in that a block that has been only read till now may be stepped upon here
			 *	but may later be needed for update. It is handled by updating the block's corresponding
			 *	entry in the set of histories (sgm_info_ptr->first_tp_hist[index] structure) to hold the
			 *	"cr" and "cycle" of the t_qread done for the block when it was intended to be changed for the
			 *	first time within the transaction since otherwise the transaction would restart due to a
			 *	cdb_sc_lostcr status. Note that "tn" (read_tn of the block) in the first_tp_hist will still
			 *	remain the "tn" when the block was first read within this transaction to ensure the block
			 *	hasn't been modified since the start of the transaction. Once we intend on changing the
			 *	block i.e. srch_blk_status->ptr is non-NULL, we ensure in the code below not to step on it.
			 *	[tp_hist() is the routine that updates the "cr", "cycle" and "tn" of the block].
			 * Note that usually in a transaction the first_tp_hist[] structure holds the "cr", "cycle", and "tn"
			 *	of the first t_qread of the block within that transaction. The above is the only exception.
			 * Also note that for blocks in cw_stagnate (i.e. current TP mini-action), we don't reuse any of
			 *	them even if they don't have a cse. This is to ensure that the current action doesn't
			 *	encounter a restart due to cdb_sc_lostcr in tp_hist() even in the fourth-retry.
			 */
			if (dollar_tlevel
				&& (tp_srch_status =
					(srch_blk_status *)lookup_hashtab_ent(sgm_info_ptr->blks_in_use, (void *)cr->blk, &dummy))
				&& tp_srch_status->ptr)
			{	/* this process is already using the block - skip it */
				cr->refer = TRUE;
				continue;
			}
			if (NULL != lookup_hashtab_ent(cw_stagnate, (void *)cr->blk, &dummy))
			{
				cr->refer = TRUE;
				continue;
			}
		}
		if (cr->dirty)
		{	/* Note that in Unix, it is possible that we see a stale value of cr->dirty (possible if a
			 * concurrent wcs_wtstart() has reset dirty to 0 but that update did not reach us yet). In this
			 * case the call to wcs_get_space() below will do the necessary memory barrier instructions
			 * (through calls to aswp()) which will allow us to see the non-stale value of cr->dirty.
			 *
			 * It is also possible that cr->dirty is non-zero but < cr->flushed_dirty_tn. In this case, wcs_get_space
			 * done below will return FALSE forcing a cache-rebuild which will fix this situation.
			 *
			 * In VMS, another process cannot be concurrently resetting cr->dirty to 0 as the resetting routine
			 * is wcs_wtfini(), which executes in crit, and no other process can be in crit while we hold it now.
			 */
			if (gv_cur_region->read_only)
				continue;
			if (lcnt < pass1)
			{
				if (!csa->timer && (csa->nl->wcs_timers < 1))
					wcs_timer_start(gv_cur_region, FALSE);
				continue;
			}
			BG_TRACE_PRO(db_csh_getn_flush_dirty);
			if (FALSE == wcs_get_space(gv_cur_region, 0, cr))
			{	/* failed to flush it out - force a rebuild */
				BG_TRACE_PRO(wc_blocked_db_csh_getn_wcsstarvewrt);
				assert(FALSE);
				break;
			}
			assert(0 == cr->dirty);
		}
		UNIX_ONLY(
			/* the cache-record is not free for reuse until the write-latch value becomes LATCH_CLEAR.
			 * In VMS, resetting the write-latch value occurs in wcs_wtfini(), which executes in crit, so we are fine.
			 * In Unix, this resetting is done by wcs_wtstart() which is out-of-crit. Therefore, we need to
			 * 	wait for this value to be LATCH_CLEAR before reusing this cache-record.
			 * Note that we are examining the write-latch-value without holding the interlock. It is ok to do
			 * 	this because the only two routines that modify the latch value are bg_update() and
			 * 	wcs_wtstart(). The former cannot be concurrently executing because we are in crit.
			 * 	The latter will not update the latch value unless this cache-record is dirty. But in this
			 * 	case we would have most likely gone through the if (cr->dirty) check above. Most likely
			 * 	because there is one rare possibility where a concurrent wcs_wtstart() has set cr->dirty
			 * 	to 0 but not yet cleared the latch. In that case we wait for the latch to be cleared.
			 * 	In all other cases, nobody has modified the latch since we got crit, and therefore
			 * 	it is safe to observe the value of the latch without holding the interlock.
			 */
			if (LATCH_CLEAR != WRITE_LATCH_VAL(cr))
			{	/* possible if a concurrent wcs_wtstart() has set cr->dirty to 0 but not yet
				 * cleared the latch. this should be very rare though.
				 */
				if (lcnt < pass2)
					continue; /* try to find some other cache-record to reuse until the 3rd pass */
				for (ocnt = 1; (MAXWRTLATCHWAIT >= ocnt) && (LATCH_CLEAR != WRITE_LATCH_VAL(cr)); ocnt++)
					wcs_sleep(SLEEP_WRTLATCHWAIT);	/* since it is a short lock, sleep the minimum */
				if (MAXWRTLATCHWAIT <= ocnt)
				{
					BG_TRACE_PRO(db_csh_getn_wrt_latch_stuck);
					assert(FALSE);
					continue;
				}
			}
		)
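
The sweep above is, at its core, a clock (second-chance) replacement policy whose skip criteria relax on each pass over the cache array: pass 1 skips referred, owned and dirty records, pass 2 still gives referred records a second chance, and pass 3 takes whatever it can get. Here is a minimal sketch of that policy alone; it is not the GT.M code (all names are invented), and the flushing, latching and TP-conflict checks of the real routine are reduced to a single flag.

/* Minimal sketch of a multi-pass clock / second-chance sweep.
 * All names (cache_rec, csh_getn_sketch, ...) are invented for illustration. */
#include <stdbool.h>
#include <stdio.h>

#define NRECS	4

typedef struct
{
	int	blk;	/* block cached in this slot */
	bool	refer;	/* touched since the hand last passed, like cr->refer */
	bool	dirty;	/* must be flushed before reuse, like cr->dirty */
} cache_rec;

static cache_rec	recs[NRECS];
static int		hand;	/* clock hand, like cur_lru_cache_rec_off */

/* Pick a slot to reuse for new_blk; NULL only if three full sweeps fail. */
static cache_rec *csh_getn_sketch(int new_blk)
{
	int		pass1 = NRECS, pass2 = 2 * NRECS, pass3 = 3 * NRECS;
	int		lcnt;
	cache_rec	*cr;

	for (lcnt = 0; lcnt <= pass3; lcnt++)
	{
		cr = &recs[hand];
		hand = (hand + 1) % NRECS;
		if (cr->refer && (lcnt < pass2))
		{	/* second chance: clear the bit, reconsider next time round */
			cr->refer = false;
			continue;
		}
		if (cr->dirty)
		{
			if (lcnt < pass1)
				continue;	/* early passes avoid the flush cost */
			cr->dirty = false;	/* stands in for the wcs_get_space() flush */
		}
		cr->blk = new_blk;
		cr->refer = true;	/* a newly loaded record starts out referred */
		return cr;
	}
	return NULL;	/* the real routine traces the failure and forces a cache rebuild */
}

int main(void)
{
	int	b;

	for (b = 0; b < 6; b++)	/* more blocks than slots forces reuse */
		printf("blk %d -> slot %d\n", b, (int)(csh_getn_sketch(b) - recs));
	return 0;
}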
Example no. 3
boolean_t	tp_tend(boolean_t crit_only)
{
	block_id		tp_blk;
	boolean_t		history_validated, is_mm, was_crit, x_lock, do_validation;
	boolean_t		do_deferred_writes = FALSE, replication = FALSE;
	bt_rec_ptr_t		bt;
	cache_rec_ptr_t		cr;
	cw_set_element		*cse;
	file_control		*fc;
	jnl_private_control	*jpc;
	jnl_buffer_ptr_t	jbp;
	jnl_format_buffer	*jfb;
	sgm_info		*si, *tmpsi;
	tp_region		*tr, *tr_last;
	sgmnt_addrs		*csa, *tmpcsa;
	sgmnt_data_ptr_t	csd;
	srch_blk_status		*t1;
	trans_num		ctn, tnque_earliest_tn;
	trans_num		valid_thru;	/* buffers touched by this transaction will be valid thru this tn */
	enum cdb_sc		status;
	gd_region		*save_gv_cur_region;
	int			lcnt, participants;
	jnldata_hdr_ptr_t	jnl_header;
	int			repl_tp_region_count = 0;
	boolean_t		first_time = TRUE, release_crit, yes_jnl_no_repl, retvalue;
	uint4			jnl_status, leafmods, indexmods;
	uint4			total_jnl_rec_size;
	jnlpool_ctl_ptr_t	jpl, tjpl;

	error_def(ERR_DLCKAVOIDANCE);
	error_def(ERR_JNLTRANS2BIG);

	assert(dollar_tlevel > 0);
	assert(0 == jnl_fence_ctl.level);
	participants = 0;
	status = cdb_sc_normal;
	/* if the transaction does no updates and the transaction history has not changed, we do not need any more validation */
	do_validation = FALSE;	/* initially set to FALSE, but set to TRUE below */
	jnl_status = 0;
	if (FALSE == crit_only)
	{
		for (si = first_sgm_info;  (NULL != si); si = si->next_sgm_info)
		{
			sgm_info_ptr = si;
			TP_CHANGE_REG_IF_NEEDED(si->gv_cur_region);
			csa = cs_addrs;
			csd = cs_data;
			if ((csd->wc_blocked) ||			/* If blocked, or.. */
				((dba_mm == csa->hdr->acc_meth) &&	/* we have MM and.. */
				(csa->total_blks != csa->ti->total_blks)))	/* and file has been extended */
			{	/* Force repair */
				t_fail_hist[t_tries] = cdb_sc_helpedout; /* special status to prevent punishing altruism */
				TP_TRACE_HIST(CR_BLKEMPTY, NULL);
				return FALSE;
			}
			/* whenever si->first_cw_set is non-NULL, ensure that si->update_trans is TRUE */
			assert((NULL == si->first_cw_set) || si->update_trans);
			/* whenever si->first_cw_set is NULL, ensure that si->update_trans is FALSE
			 * except when the set noop optimization is enabled */
			assert((NULL != si->first_cw_set) || !si->update_trans || gvdupsetnoop);
			if (!si->update_trans)
			{
				if (si->start_tn == csa->ti->early_tn)
				{	/* read with no change to the transaction history. ensure we haven't overrun
					 * our history buffer and we have reasonable values for first and last */
					assert(si->last_tp_hist - si->first_tp_hist <= si->tp_hist_size);
					continue;
				} else
					do_validation = TRUE;
			} else
			{
				do_validation = TRUE;
				is_mm = (dba_mm == cs_data->acc_meth);
				/* We are still out of crit if this is not our last attempt. If so, run the region list and check
				 * that we have sufficient free blocks for our update. If not, get them now while we can.
				 * We will repeat this check later in crit but it will hopefully have little or nothing to do.
				 * bypass 1st check if already in crit -- check later
				 */
				if (!csa->now_crit && !is_mm && (csa->nl->wc_in_free < si->cw_set_depth + 1)
						&& !wcs_get_space(si->gv_cur_region, si->cw_set_depth + 1, NULL))
					assert(FALSE);	/* wcs_get_space() should have returned TRUE unconditionally in this case */
			}
			if (si->update_trans && JNL_ENABLED(csa))
			{	/* compute the total journal record size requirements before grab_crit().
				 * there is code later that will check for state changes from now to then
				 */
				TOTAL_TPJNL_REC_SIZE(total_jnl_rec_size, si, csa);
				/* compute current transaction's maximum journal space needs in number of disk blocks */
				si->tot_jrec_size = MAX_REQD_JNL_FILE_SIZE(total_jnl_rec_size);
				/* check if current TP transaction's journal size needs are greater than max jnl file size */
				if (si->tot_jrec_size > csd->autoswitchlimit)
					/* can't fit the current transaction's journal records into one journal file */
					rts_error(VARLSTCNT(6) ERR_JNLTRANS2BIG, 4, si->tot_jrec_size,
						JNL_LEN_STR(csd), csd->autoswitchlimit);
			}
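			/* A worked example with invented numbers for the check above: if the
			 * records this TP transaction will format sum to a total_jnl_rec_size
			 * of ~3MB, and MAX_REQD_JNL_FILE_SIZE() translates that into a
			 * journal-file space requirement of ~4MB, then an autoswitchlimit of
			 * 2MB means no single generation of the journal file can hold the
			 * transaction, so ERR_JNLTRANS2BIG is raised before crit is ever
			 * grabbed.
			 */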
		}	/* for (si ... ) */
		if (!do_validation)
		{
			if (CDB_STAGNATE <= t_tries)
			{
				for (tr = tp_reg_list; NULL != tr; tr = tr->fPtr)
					rel_crit(tr->reg);
			}
			UNIX_ONLY(
				/* Must be done after REVERT since we are no longer in crit */
				if (unhandled_stale_timer_pop)
					process_deferred_stale();
			)
			return TRUE;
		}
	}