void process_reorg_encrypt_restart(void)
{
	intrpt_state_t	prev_intrpt_state;
	enc_info_t	*encr_ptr;
	int		gtmcrypt_errno;
	gd_segment	*seg;
	sgmnt_addrs	*csa;

	csa = reorg_encrypt_restart_csa;
	assert(NULL != csa);	/* caller should have ensured this */
	/* Opening handles for encryption is a heavyweight operation. Caller should have ensured we are not in crit for
	 * any region when the new key handles are opened for any one region. Assert that.
	 */
	assert(0 == have_crit(CRIT_HAVE_ANY_REG));
	DEFER_INTERRUPTS(INTRPT_IN_CRYPT_RECONFIG, prev_intrpt_state);
	encr_ptr = csa->encr_ptr;
	assert(NULL != encr_ptr);
	DBG_RECORD_CRYPT_RECEIVE(csa->hdr, csa, csa->nl, process_id, encr_ptr);
	seg = csa->region->dyn.addr;
	INIT_DB_OR_JNL_ENCRYPTION(csa, encr_ptr, seg->fname_len, seg->fname, gtmcrypt_errno);
	if (0 != gtmcrypt_errno)
	{
		ENABLE_INTERRUPTS(INTRPT_IN_CRYPT_RECONFIG, prev_intrpt_state);
		GTMCRYPT_REPORT_ERROR(gtmcrypt_errno, rts_error, seg->fname_len, seg->fname);
	}
	reorg_encrypt_restart_csa = NULL;
	ENABLE_INTERRUPTS(INTRPT_IN_CRYPT_RECONFIG, prev_intrpt_state);
}
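/* A minimal, self-contained sketch (not GT.M source) of the defer/re-enable interrupt bracket used
 * above.  All names here (defer_interrupts, enable_interrupts, reopen_key_handles, report_error) are
 * illustrative stand-ins, not GT.M APIs.  The point it demonstrates: every exit path, including the
 * error path, restores the interrupt state before leaving, because the error routine may not return.
 */
#include <stdio.h>

static int	interrupts_deferred = 0;

static void	defer_interrupts(int *prev)	{ *prev = interrupts_deferred; interrupts_deferred = 1; }
static void	enable_interrupts(int prev)	{ interrupts_deferred = prev; }
static int	reopen_key_handles(void)	{ return 0; }	/* pretend the heavyweight call succeeded */
static void	report_error(int err)		{ fprintf(stderr, "crypt error %d\n", err); }

static void	reinit_keys_sketch(void)
{
	int	prev_state, err;

	defer_interrupts(&prev_state);		/* block asynchronous handlers around the heavyweight call */
	err = reopen_key_handles();		/* e.g. re-open database/journal encryption key handles */
	if (0 != err)
	{
		enable_interrupts(prev_state);	/* restore state BEFORE reporting; the report may not return */
		report_error(err);
		return;
	}
	enable_interrupts(prev_state);		/* normal path */
}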
Example #2
/* Given a file (journal or database), this function extracts the buffer of the given length at the given offset and displays
 * it on STDOUT. Note that the offset and length should match the values used at encryption time. In the case of journal files,
 * this offset can be obtained for every record from a detailed journal extract. */
int	mu_decrypt(char *fname, uint4 off, uint4 len)
{
#	ifdef GTM_CRYPT
	int				fd, n_len, save_errno, gtmcrypt_errno, i;
	char				hash[GTMCRYPT_HASH_LEN], *buff;
	boolean_t			is_encrypted;
	gtmcrypt_key_t			key_handle;

	assert(fname);
	assert(STRLEN(fname));
	n_len = STRLEN(fname);
	GET_FD_HASH(fname, n_len, fd, hash, is_encrypted);
	buff = (char *)malloc(len);
	LSEEKREAD(fd, off, buff, len, save_errno);
	if (0 != save_errno)
	{
		free(buff);	/* release the read buffer before returning on error */
		close(fd);
		GC_DISPLAY_FILE_ERROR_AND_RETURN("Error reading from file !AD", fname, n_len, save_errno);
	}
	if (is_encrypted)
	{
		INIT_PROC_ENCRYPTION(NULL, gtmcrypt_errno);
		if (0 == gtmcrypt_errno)
			GTMCRYPT_GETKEY(NULL, hash, key_handle, gtmcrypt_errno);
		if (0 == gtmcrypt_errno)
			GTMCRYPT_DECRYPT(NULL, key_handle, buff, len, NULL, gtmcrypt_errno);
		if (0 != gtmcrypt_errno)
		{
			close(fd);
			free(buff);
			GTMCRYPT_REPORT_ERROR(gtmcrypt_errno, rts_error, n_len, fname);
		}
	}
	for (i = 0; i < len; i++)
	{
		if (WITHIN_PRINTABLE_RANGE(buff[i]))
			PRINTF("%c", buff[i]);
		else
			PRINTF("%c", '.');
	}
	free(buff);
	close(fd);
#	endif
	return SS_NORMAL;
}
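/* A self-contained sketch (not GT.M source) of the display loop above: bytes in the printable ASCII
 * range are echoed as-is and everything else shows as '.', the usual hexdump-style convention.
 * within_printable_range() is an illustrative stand-in for the WITHIN_PRINTABLE_RANGE macro.
 */
#include <stdio.h>

static int	within_printable_range(unsigned char c)
{
	return ((c >= 0x20) && (c <= 0x7e));	/* space through tilde */
}

static void	dump_printable(const unsigned char *buff, unsigned int len)
{
	unsigned int	i;

	for (i = 0; i < len; i++)
		putchar(within_printable_range(buff[i]) ? buff[i] : '.');
	putchar('\n');
}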
void jnl_write_aimg_rec(sgmnt_addrs *csa, cw_set_element *cse, uint4 com_csum)
{
	struct_jrec_blk		aimg_record;
	int			tmp_jrec_size, jrec_size, zero_len;
	jnl_format_buffer 	blk_trailer;	/* partial record after the aimg block */
	char			local_buff[JNL_REC_START_BNDRY + JREC_SUFFIX_SIZE];
	jrec_suffix		*suffix;
	blk_hdr_ptr_t		buffer, save_buffer;
	jnl_private_control	*jpc;
	sgmnt_data_ptr_t	csd;
	uint4			cursum;
#	ifdef GTM_CRYPT
	char			*in, *out;
	int			in_len, gtmcrypt_errno;
	gd_segment		*seg;
#	endif

	csd = csa->hdr;
	assert(csa->now_crit);
	jpc = csa->jnl;
	assert(0 != jpc->pini_addr);
	aimg_record.prefix.jrec_type = JRT_AIMG;
	aimg_record.prefix.pini_addr = (0 == jpc->pini_addr) ? JNL_HDR_LEN : jpc->pini_addr;
	aimg_record.prefix.tn = csa->ti->curr_tn;
	/* At this point jgbl.gbl_jrec_time should be set by the caller */
	assert(jgbl.gbl_jrec_time);
	aimg_record.prefix.time = jgbl.gbl_jrec_time;
	aimg_record.prefix.checksum = INIT_CHECKSUM_SEED;
	aimg_record.blknum = cse->blk;
	/* In case we have a bad block size, we don't want to write an AIMG record larger than the GDS block size (the maximum) */
	buffer = (blk_hdr_ptr_t)cse->new_buff;
	assert(buffer->bsiz <= csd->blk_size);
	assert(buffer->bsiz >= SIZEOF(blk_hdr));
	aimg_record.bsiz = MIN(csd->blk_size, buffer->bsiz);
	aimg_record.ondsk_blkver = cse->ondsk_blkver;
	tmp_jrec_size = (int)FIXED_AIMG_RECLEN + aimg_record.bsiz + JREC_SUFFIX_SIZE;
	jrec_size = ROUND_UP2(tmp_jrec_size, JNL_REC_START_BNDRY);
	zero_len = jrec_size - tmp_jrec_size;
	blk_trailer.buff = local_buff + (JNL_REC_START_BNDRY - zero_len);
	memset(blk_trailer.buff, 0, zero_len);
	blk_trailer.record_size = zero_len + JREC_SUFFIX_SIZE;
	suffix = (jrec_suffix *)&local_buff[JNL_REC_START_BNDRY];
	aimg_record.prefix.forwptr = suffix->backptr = jrec_size;
	suffix->suffix_code = JNL_REC_SUFFIX_CODE;
	assert(SIZEOF(uint4) == SIZEOF(jrec_suffix));
	save_buffer = buffer;
#	ifdef GTM_CRYPT
	in_len = aimg_record.bsiz - SIZEOF(*buffer);
	if (BLK_NEEDS_ENCRYPTION3(csd->is_encrypted, buffer->levl, in_len))
	{
		ASSERT_ENCRYPTION_INITIALIZED;
		assert(aimg_record.bsiz <= csa->hdr->blk_size);
		REALLOC_CRYPTBUF_IF_NEEDED(csa->hdr->blk_size);
		memcpy(pvt_crypt_buf.addr, buffer, SIZEOF(blk_hdr));	/* copy the block header */
		in = (char *)(buffer + 1);	/* + 1 because `buffer' is of type blk_hdr_ptr_t */
		out = pvt_crypt_buf.addr + SIZEOF(blk_hdr);
		GTMCRYPT_ENCRYPT(csa, csa->encr_key_handle, in, in_len, out, gtmcrypt_errno);
		if (0 != gtmcrypt_errno)
		{
			seg = csa->region->dyn.addr;
			GTMCRYPT_REPORT_ERROR(gtmcrypt_errno, rts_error, seg->fname_len, seg->fname);
		}
		buffer = (blk_hdr_ptr_t)pvt_crypt_buf.addr;
	}
#	endif
	cursum = jnl_get_checksum((uint4 *)buffer, NULL, aimg_record.bsiz);
	COMPUTE_AIMG_CHECKSUM(cursum, &aimg_record, com_csum, aimg_record.prefix.checksum);
	jnl_write(jpc, JRT_AIMG, (jnl_record *)&aimg_record, buffer, &blk_trailer);
	buffer = save_buffer;
}
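/* A self-contained sketch (not GT.M source) of the suffix-padding arithmetic above.  The journal
 * record is padded up to a multiple of the record start boundary; the pad bytes plus the 4-byte
 * suffix form the small trailer written after the block image.  The constants here (BNDRY,
 * SUFFIX_SIZE, FIXED_LEN) are illustrative stand-ins, not the real GT.M values.
 */
#include <stdio.h>

#define BNDRY		8U				/* stand-in for JNL_REC_START_BNDRY */
#define SUFFIX_SIZE	4U				/* stand-in for JREC_SUFFIX_SIZE */
#define FIXED_LEN	44U				/* stand-in for FIXED_AIMG_RECLEN */
#define ROUND_UP2(x, m)	(((x) + (m) - 1) & ~((m) - 1))	/* m must be a power of two */

int main(void)
{
	unsigned int	bsiz = 1011;			/* example block image size in bytes */
	unsigned int	tmp_jrec_size = FIXED_LEN + bsiz + SUFFIX_SIZE;
	unsigned int	jrec_size = ROUND_UP2(tmp_jrec_size, BNDRY);
	unsigned int	zero_len = jrec_size - tmp_jrec_size;

	printf("unpadded=%u padded=%u zero_len=%u trailer=%u\n",
		tmp_jrec_size, jrec_size, zero_len, zero_len + SUFFIX_SIZE);
	return 0;
}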
Example #4
void mu_int_reg(gd_region *reg, boolean_t *return_value, boolean_t return_after_open)
{
	boolean_t		read_only, was_crit;
	freeze_status		status;
	node_local_ptr_t	cnl;
	sgmnt_addrs     	*csa;
	sgmnt_data_ptr_t	csd;
	sgmnt_data		*csd_copy_ptr;
	gd_segment		*seg;
	int			gtmcrypt_errno;
#	ifdef DEBUG
	boolean_t		need_to_wait = FALSE;
	int			trynum;
	uint4			curr_wbox_seq_num;
#	endif

	*return_value = FALSE;
	jnlpool_init_needed = TRUE;
	ESTABLISH(mu_int_reg_ch);
	if (dba_usr == reg->dyn.addr->acc_meth)
	{
		util_out_print("!/Can't integ region !AD; not GDS format", TRUE,  REG_LEN_STR(reg));
		mu_int_skipreg_cnt++;
		return;
	}
	gv_cur_region = reg;
	if (reg_cmcheck(reg))
	{
		util_out_print("!/Can't integ region across network", TRUE);
		mu_int_skipreg_cnt++;
		return;
	}
	gvcst_init(gv_cur_region);
	if (gv_cur_region->was_open)
	{	/* already open under another name */
		gv_cur_region->open = FALSE;
		return;
	}
	if (return_after_open)
	{
		*return_value = TRUE;
		return;
	}
	change_reg();
	csa = &FILE_INFO(gv_cur_region)->s_addrs;
	cnl = csa->nl;
	csd = csa->hdr;
	read_only = gv_cur_region->read_only;
	assert(NULL != mu_int_master);
	/* Ensure that we don't see an increase in the file header and master map sizes compared to their maximum values */
	assert(SGMNT_HDR_LEN >= SIZEOF(sgmnt_data) && (MASTER_MAP_SIZE_MAX >= MASTER_MAP_SIZE(csd)));
	/* ONLINE INTEG if asked for explicitly by specifying -ONLINE is an error if the db has partial V4 blocks.
	 * However, if -ONLINE is not explicitly specified but rather assumed implicitly (as default for -REG)
	 * then turn off ONLINE INTEG for this region and continue as if -NOONLINE was specified
	 */
	if (!csd->fully_upgraded)
	{
		ointeg_this_reg = FALSE; /* Turn off ONLINE INTEG for this region */
		if (online_specified)
		{
			gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(4) ERR_SSV4NOALLOW, 2, DB_LEN_STR(gv_cur_region));
			util_out_print(NO_ONLINE_ERR_MSG, TRUE);
			mu_int_skipreg_cnt++;
			return;
		}
	}
	if (!ointeg_this_reg || read_only)
	{
		status = region_freeze(gv_cur_region, TRUE, FALSE, TRUE, FALSE, !read_only);
		switch (status)
		{
			case REG_ALREADY_FROZEN:
				if (csa->read_only_fs)
					break;
				util_out_print("!/Database for region !AD is already frozen, not integing",
					TRUE, REG_LEN_STR(gv_cur_region));
				mu_int_skipreg_cnt++;
				return;
			case REG_FLUSH_ERROR:
				gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(6) ERR_BUFFLUFAILED, 4, LEN_AND_LIT(MUPIP_INTEG),
					DB_LEN_STR(gv_cur_region));
				mu_int_skipreg_cnt++;
				return;
			case REG_HAS_KIP:
				/* We have already waited for KIP to reset. This time do not wait for KIP */
				status = region_freeze(gv_cur_region, TRUE, FALSE, FALSE, FALSE, !read_only);
				if (REG_ALREADY_FROZEN == status)
				{
					if (csa->read_only_fs)
						break;
					util_out_print("!/Database for region !AD is already frozen, not integing",
						TRUE, REG_LEN_STR(gv_cur_region));
					mu_int_skipreg_cnt++;
					return;
				} else if (REG_FLUSH_ERROR == status)
				{
					gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(6) ERR_BUFFLUFAILED, 4, LEN_AND_LIT(MUPIP_INTEG),
						DB_LEN_STR(gv_cur_region));
					mu_int_skipreg_cnt++;
					return;
				}
				assert(REG_FREEZE_SUCCESS == status);
				/* no break */
			case REG_FREEZE_SUCCESS:
				break;
			default:
				assert(FALSE);
				/* no break */
		}
		if (read_only && (dba_bg == csa->hdr->acc_meth) && !mu_int_wait_rdonly(csa, MUPIP_INTEG))
		{
			mu_int_skipreg_cnt++;
			return;
		}
	}
	if (!ointeg_this_reg)
	{	/* Take a copy of the file-header. To ensure it is consistent, do it while holding crit. */
		was_crit = csa->now_crit;
		if (!was_crit)
			grab_crit(gv_cur_region);
		memcpy((uchar_ptr_t)&mu_int_data, (uchar_ptr_t)csd, SIZEOF(sgmnt_data));
		if (!was_crit)
			rel_crit(gv_cur_region);
		memcpy(mu_int_master, MM_ADDR(csd), MASTER_MAP_SIZE(csd));
		csd_copy_ptr = &mu_int_data;
	} else
	{
		if (!ss_initiate(gv_cur_region, util_ss_ptr, &csa->ss_ctx, preserve_snapshot, MUPIP_INTEG))
		{
			mu_int_skipreg_cnt++;
			assert(NULL != csa->ss_ctx);
			ss_release(&csa->ss_ctx);
			ointeg_this_reg = FALSE; /* Turn off ONLINE INTEG for this region */
			assert(process_id != cnl->in_crit); /* Ensure ss_initiate released the crit before returning */
			assert(!FROZEN_HARD(csd)); /* Ensure region is unfrozen before returning from ss_initiate */
			assert(INTRPT_IN_SS_INITIATE != intrpt_ok_state); /* Ensure ss_initiate released intrpt_ok_state */
			return;
		}
		assert(process_id != cnl->in_crit); /* Ensure ss_initiate released the crit before returning */
		assert(INTRPT_IN_SS_INITIATE != intrpt_ok_state); /* Ensure ss_initiate released intrpt_ok_state */
		csd_copy_ptr = &csa->ss_ctx->ss_shm_ptr->shadow_file_header;
#		if defined(DEBUG)
		curr_wbox_seq_num = 1;
		cnl->wbox_test_seq_num = curr_wbox_seq_num; /* indicate we took the next step */
		GTM_WHITE_BOX_TEST(WBTEST_OINTEG_WAIT_ON_START, need_to_wait, TRUE);
		if (need_to_wait) /* wait for them to take next step */
		{
			trynum = 30; /* given 30 cycles to tell you to go */
			while ((curr_wbox_seq_num == cnl->wbox_test_seq_num) && trynum--)
				LONG_SLEEP(1);
			cnl->wbox_test_seq_num++; /* let them know we took the next step */
			assert(trynum);
		}
#		endif
	}
	if (USES_ANY_KEY(csd_copy_ptr))
	{ 	/* Initialize mu_int_encrypt_key_handle to be used in mu_int_read */
		seg = gv_cur_region->dyn.addr;
		INIT_DB_OR_JNL_ENCRYPTION(&mu_int_encr_handles, csd_copy_ptr, seg->fname_len, (char *)seg->fname, gtmcrypt_errno);
		if (0 != gtmcrypt_errno)
		{
			GTMCRYPT_REPORT_ERROR(gtmcrypt_errno, gtm_putmsg, seg->fname_len, seg->fname);
			mu_int_skipreg_cnt++;
			return;
		}
	}
	*return_value = mu_int_fhead();
	REVERT;
	return;
}
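/* A minimal sketch (not GT.M source) of the copy-under-crit idiom used above for the -NOONLINE case:
 * acquire the lock only if we do not already own it, memcpy the shared header into a private copy,
 * release, then work from the private copy.  The mutex and hdr_t type are illustrative stand-ins for
 * crit and the file header; grab_crit()/rel_crit() are the GT.M analogues being imitated.
 */
#include <pthread.h>
#include <string.h>

typedef struct
{
	unsigned long	curr_tn;
	unsigned int	blk_size;
} hdr_t;

static pthread_mutex_t	crit = PTHREAD_MUTEX_INITIALIZER;
static hdr_t		shared_hdr;		/* stands in for the file header in shared memory */

static void	copy_header_consistently(hdr_t *private_copy, int was_crit)
{
	if (!was_crit)
		pthread_mutex_lock(&crit);	/* grab_crit() analogue */
	memcpy(private_copy, &shared_hdr, sizeof(hdr_t));
	if (!was_crit)
		pthread_mutex_unlock(&crit);	/* rel_crit() analogue */
}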
uint4	mur_forward_play_cur_jrec(reg_ctl_list *rctl)
{
	boolean_t		process_losttn;
	boolean_t		is_set_kill_zkill_ztworm_lgtrig_ztrig, is_set_kill_zkill_ztrig;
	trans_num		curr_tn;
	enum jnl_record_type	rectype;
	enum rec_fence_type	rec_fence;
	enum broken_type	recstat;
	jnl_tm_t		rec_time;
	uint4			status;
	mval			mv;
	seq_num 		rec_token_seq, rec_strm_seqno, resync_strm_seqno;
	jnl_record		*rec;
	jnl_string		*keystr;
	multi_struct 		*multi;
	jnl_ctl_list		*jctl;
	ht_ent_mname		*tabent;
	mname_entry	 	gvent;
	gvnh_reg_t		*gvnh_reg;
	pini_list_struct	*plst;
	int4			gtmcrypt_errno;
	boolean_t		use_new_key;
	forw_multi_struct	*forw_multi;
#	if (defined(DEBUG) && defined(UNIX))
	int4			strm_idx;
#	endif
	DCL_THREADGBL_ACCESS;

	SETUP_THREADGBL_ACCESS;
	assert(!rctl->forw_eof_seen);
	if (multi_proc_in_use)
	{	/* Set key to print this rctl's region-name as prefix in case this forked off process prints any output.
		 * e.g. If this function ends up calling t_end/op_tcommit which in turn needs to do a jnl autoswitch
		 * inside jnl_file_extend and prints a GTM-I-FILERENAME message.
		 */
		MUR_SET_MULTI_PROC_KEY(rctl, multi_proc_key);
	}
	jctl = rctl->jctl;
	/* Ensure we never DOUBLE process the same journal record in the forward phase */
	assert((jctl != rctl->last_processed_jctl) || (jctl->rec_offset != rctl->last_processed_rec_offset));
#	ifdef DEBUG
	rctl->last_processed_jctl = jctl;
	rctl->last_processed_rec_offset = jctl->rec_offset;
#	endif
	rec = rctl->mur_desc->jnlrec;
	rectype = (enum jnl_record_type)rec->prefix.jrec_type;
	rec_time = rec->prefix.time;
	assert(rec_time <= mur_options.before_time);
	assert(rec_time >= mur_options.after_time);
	assert((0 == mur_options.after_time) || (mur_options.forward && !rctl->db_updated));
	is_set_kill_zkill_ztworm_lgtrig_ztrig = (boolean_t)(IS_SET_KILL_ZKILL_ZTWORM_LGTRIG_ZTRIG(rectype));
	if (is_set_kill_zkill_ztworm_lgtrig_ztrig)
	{
		keystr = (jnl_string *)&rec->jrec_set_kill.mumps_node;
		if (USES_ANY_KEY(jctl->jfh))
		{
			use_new_key = USES_NEW_KEY(jctl->jfh);
			assert(NEEDS_NEW_KEY(jctl->jfh, rec->prefix.tn) == use_new_key);
			MUR_DECRYPT_LOGICAL_RECS(
					keystr,
					(use_new_key ? TRUE : jctl->jfh->non_null_iv),
					rec->prefix.forwptr,
					(use_new_key ? jctl->encr_key_handle2 : jctl->encr_key_handle),
					gtmcrypt_errno);
			if (0 != gtmcrypt_errno)
			{
				GTMCRYPT_REPORT_ERROR(gtmcrypt_errno, gtm_putmsg, jctl->jnl_fn_len, jctl->jnl_fn);
				return gtmcrypt_errno;
			}
		}
	}
	if (mur_options.selection && !mur_select_rec(jctl))
		return SS_NORMAL;
	rec_token_seq = (REC_HAS_TOKEN_SEQ(rectype)) ? GET_JNL_SEQNO(rec) : 0;
	process_losttn = rctl->process_losttn;
	if (!process_losttn && mur_options.rollback)
	{
		if (IS_REPLICATED(rectype) && (rec_token_seq >= murgbl.losttn_seqno))
			process_losttn = rctl->process_losttn = TRUE;
#		if (defined(UNIX) && defined(DEBUG))
		if ((rec_token_seq < murgbl.losttn_seqno) && murgbl.resync_strm_seqno_nonzero && IS_REPLICATED(rectype))
		{
			assert(IS_SET_KILL_ZKILL_ZTWORM_LGTRIG_ZTRIG(rectype) || IS_COM(rectype) || (JRT_NULL == (rectype)));
			assert(&rec->jrec_set_kill.strm_seqno == &rec->jrec_null.strm_seqno);
			assert(&rec->jrec_set_kill.strm_seqno == &rec->jrec_tcom.strm_seqno);
			rec_strm_seqno = GET_STRM_SEQNO(rec);
			if (rec_strm_seqno)
			{
				strm_idx = GET_STRM_INDEX(rec_strm_seqno);
				rec_strm_seqno = GET_STRM_SEQ60(rec_strm_seqno);
				resync_strm_seqno = murgbl.resync_strm_seqno[strm_idx];
				assert(!resync_strm_seqno || (rec_strm_seqno < resync_strm_seqno));
			}
		}
#		endif
	}
	/* Note: Broken transaction determination is done below only based on the records that got selected as
	 * part of the mur_options.selection criteria. Therefore depending on whether a broken transaction gets
	 * selected or not, future complete transactions might either go to the lost transaction or extract file.
	 */
	recstat = process_losttn ? LOST_TN : GOOD_TN;
	status = SS_NORMAL;
	if (FENCE_NONE != mur_options.fences)
	{
		if (IS_FENCED(rectype))
		{
			assert(rec_token_seq);
#			ifdef DEBUG
			/* assert that all TP records before min_broken_time are not broken */
			if (IS_TP(rectype) && ((!mur_options.rollback && rec_time < murgbl.min_broken_time)
						|| (mur_options.rollback && rec_token_seq < murgbl.min_broken_seqno)))
			{
				rec_fence = GET_REC_FENCE_TYPE(rectype);
				if (NULL != (multi = MUR_TOKEN_LOOKUP(rec_token_seq, rec_time, rec_fence)))
				{
					assert(0 == multi->partner);
					assert(FALSE == multi->this_is_broken);
				}
			}
#			endif
			/* In most cases, whether a TP tn is broken or not would already have been determined in
			 * mur_forward. In this case, rctl->forw_multi would be set appropriately. So use that to get to
			 * "multi" and avoid a hashtable lookup. If forw_multi is NULL (e.g. for ZTP or single-region TP),
			 * the hash-table lookup cannot be avoided.
			 */
			multi = NULL;
			forw_multi = rctl->forw_multi;
			if (NULL != forw_multi)
			{
				multi = forw_multi->multi;
				/* Always honor the "recstat" from the forw_multi since that has been determined taking into
				 * consideration the BROKEN_TN status of ALL participating regions.
				 */
				assert((GOOD_TN != forw_multi->recstat) || (GOOD_TN == recstat));
				recstat = forw_multi->recstat;
			} else if (IS_REC_POSSIBLY_BROKEN(rec_time, rec_token_seq))
			{
				assert(!mur_options.rollback || process_losttn);
				rec_fence = GET_REC_FENCE_TYPE(rectype);
				assert(rec_token_seq == ((struct_jrec_upd *)rec)->token_seq.token);
				multi = MUR_TOKEN_LOOKUP(rec_token_seq, rec_time, rec_fence);
				if ((NULL != multi) && (0 < multi->partner))
				{
					process_losttn = rctl->process_losttn = TRUE;
					recstat = BROKEN_TN;
				}
			}
			/* Check that if the hashtable reports a tn as GOOD, it better have had the same
			 * # of participants in the TCOM records across all the participating regions.
			 */
			assert((NULL == multi) || (BROKEN_TN == recstat) || (FALSE == multi->this_is_broken));
		} else if ((FENCE_ALWAYS == mur_options.fences) && is_set_kill_zkill_ztworm_lgtrig_ztrig)
		{
			process_losttn = rctl->process_losttn = TRUE;
			recstat = BROKEN_TN;
		}
	} else
		forw_multi = NULL;
	if (mur_options.show)
	{
		assert(SS_NORMAL == status);
		if (BROKEN_TN != recstat)
		{
			if (JRT_PFIN == rectype)
				status = mur_pini_state(jctl, rec->prefix.pini_addr, FINISHED_PROC);
			else if ((JRT_EOF != rectype)
					&& ((JRT_ALIGN != rectype) || (JNL_HDR_LEN != rec->prefix.pini_addr)))
			{	/* Note that it is possible that we have a PINI record followed by a PFIN record
				 * and later an ALIGN record with the pini_addr pointing to the original PINI
				 * record (see comment in jnl_write.c where pini_addr gets assigned to JNL_HDR_LEN)
				 * In this case we do not want the ALIGN record to cause the process to become
				 * ACTIVE although it has written a PFIN record. Hence the check above.
				 */
				status = mur_pini_state(jctl, rec->prefix.pini_addr, ACTIVE_PROC);
			}
		} else
			status = mur_pini_state(jctl, rec->prefix.pini_addr, BROKEN_PROC);
		if (SS_NORMAL != status)
			return status;	/* "mur_pini_state" failed due to bad pini_addr */
		++jctl->jnlrec_cnt[rectype];	/* for -show=STATISTICS */
	}
	if (!mur_options.update && !jgbl.mur_extract)
		return SS_NORMAL;
	if (murgbl.ok_to_update_db && IS_TUPD(rectype) && (GOOD_TN == recstat))
	{	/* Even for FENCE_NONE we apply fences. Otherwise a TUPD becomes UPD etc.
		 * If forw_multi is non-NULL, a multi-region TP transaction is being played as a SINGLE
		 * TP transaction across all the involved regions. Therefore only ONE op_tstart is done
		 * even though more than one TSET might be encountered. In this case, do not issue JNLTPNEST error.
		 */
		if (dollar_tlevel && (NULL == forw_multi))
		{
			assert(FALSE);
			murgbl.wrn_count++;
			gtm_putmsg_csa(CSA_ARG(rctl->csa) VARLSTCNT(6) ERR_JNLTPNEST, 4, jctl->jnl_fn_len,
				jctl->jnl_fn, jctl->rec_offset, &rec->prefix.tn);
			OP_TROLLBACK(0);
		}
		if (!dollar_tlevel)
		{	/* Note: op_tstart resets gv_currkey. So set gv_currkey later. */
			/* mv is used to determine transaction id. But it is ignored by recover/rollback */
			mv.mvtype = MV_STR;
			mv.str.len = 0;
			mv.str.addr = NULL;
			op_tstart(IMPLICIT_TSTART, TRUE, &mv, -1);
			DEBUG_ONLY(jgbl.max_tp_ztp_jnl_upd_num = 0;)
		}
		tp_set_sgm();	/* needed to set "sgm_info_ptr" to correspond to "rctl" */
	}
Example #6
int gtm_main (int argc, char **argv, char **envp)
#ifdef __osf__
# pragma pointer_size (restore)
#endif
{
	char			*ptr, *eq, **p;
	int             	eof, parse_ret;
	int			gtmcrypt_errno;
#	ifdef GTM_SOCKET_SSL_SUPPORT
	int			status;
	char			tlsid_env_name[MAX_TLSID_LEN * 2];
#	endif
	DCL_THREADGBL_ACCESS;

	GTM_THREADGBL_INIT;
	gtmenvp = envp;
	gtm_dist_ok_to_use = TRUE;
	common_startup_init(GTM_IMAGE);
	GTMTRIG_DBG_ONLY(ch_at_trigger_init = &mdb_condition_handler);
	err_init(stop_image_conditional_core);
	UNICODE_ONLY(gtm_strToTitle_ptr = &gtm_strToTitle);
	GTM_ICU_INIT_IF_NEEDED;	/* Note: should be invoked after err_init (since it may error out) and before CLI parsing */
	cli_lex_setup(argc, argv);
	/* Put the arguments into the parse buffer, then clean up the token buffer.
	 * cli_gettoken() copies all arguments except the first one (argv[0]) into the
	 * buffer (cli_lex_in_ptr->in_str).
	 * e.g. for the command line "/usr/library/V990/mumps -run somefile",
	 * the buffer becomes: cli_lex_in_ptr->in_str == "-run somefile"
	 */
	if (1 < argc)
		cli_gettoken(&eof);
	/* cli_gettoken() extracts the first token into cli_token_buf (in tok_extract()),
	 * which should instead be done in parse_cmd(). So reset the token buffer here so that
	 * parse_cmd() starts from the first token.
	 */
	cli_token_buf[0] = '\0';
	/* insert the "MUMPS " in the parsing buffer the buffer is now:
	 * cli_lex_in_ptr->in_str == "MUMPS -run somefile"
	 * we didnot change argv[0]
	*/
	ptr = cli_lex_in_ptr->in_str;
	memmove(strlen("MUMPS ") + ptr, ptr, strlen(ptr) + 1);	/* BYPASSOK */
	MEMCPY_LIT(ptr, "MUMPS ");
	/* Reset the argument buffer pointer, which was changed by the cli_gettoken() call above.
	 * Do NOT reset it to 0 (NULL), to avoid fetching the command line arguments into the buffer again.
	 * cli_lex_in_ptr->tp indicates the current position in the buffer cli_lex_in_ptr->in_str.
	 */
	cli_lex_in_ptr->tp = cli_lex_in_ptr->in_str;
	parse_ret = parse_cmd();
	if (parse_ret && (EOF != parse_ret))
		rts_error_csa(CSA_ARG(NULL) VARLSTCNT(4) parse_ret, 2, LEN_AND_STR(cli_err_str));
	if (cli_present("DIRECT_MODE"))
		invocation_mode = MUMPS_DIRECT;
	else if (cli_present("RUN"))
		invocation_mode = MUMPS_RUN;
	gtm_chk_dist(argv[0]);
	/* this should be after cli_lex_setup() due to S390 A/E conversion in cli_lex_setup   */
	init_gtm();
#	ifdef GTM_TLS
	if (MUMPS_COMPILE != invocation_mode)
	{
		if ((NULL != (ptr = (char *)getenv(GTM_PASSWD_ENV))) && (0 == strlen(ptr)))
		{
			INIT_PROC_ENCRYPTION(NULL, gtmcrypt_errno);
			if (0 != gtmcrypt_errno)
			{
				CLEAR_CRYPTERR_MASK(gtmcrypt_errno);
				assert(!IS_REPEAT_MSG_MASK(gtmcrypt_errno));
				assert((ERR_CRYPTDLNOOPEN == gtmcrypt_errno) || (ERR_CRYPTINIT == gtmcrypt_errno));
				if (ERR_CRYPTDLNOOPEN == gtmcrypt_errno)
					gtmcrypt_errno = ERR_CRYPTDLNOOPEN2;
				else if (ERR_CRYPTINIT == gtmcrypt_errno)
					gtmcrypt_errno = ERR_CRYPTINIT2;
				gtmcrypt_errno = SET_CRYPTERR_MASK(gtmcrypt_errno);
				GTMCRYPT_REPORT_ERROR(gtmcrypt_errno, rts_error, SIZEOF(GTMCRYPT_ERRLIT) - 1, GTMCRYPT_ERRLIT); /* BYPASSOK */
			}
		}
#		ifdef GTM_SOCKET_SSL_SUPPORT
		/* The below logic is for prefetching the password for TLS identifiers that may have been set in the environment.
		 * But, since SSL support for Socket devices is not yet implemented, this logic need not be enabled as of this
		 * writing. When SSL support for socket devices is implemented, the surrounding #ifdef can be removed.
		 */
		if (NULL != getenv("gtmcrypt_config"))
		{	/* Environment is configured for SSL/TLS (and/or encryption). Check if any environment variable of the form
			 * `gtmtls_passwd_*' is set to the empty string. If so, nudge the SSL/TLS library to read password(s) from the
			 * user.
			 */
			for (p = envp; *p; p++)
			{
				ptr = *p;
				if (0 == MEMCMP_LIT(ptr, GTMTLS_PASSWD_ENV_PREFIX))
				{	/* At least one environment variable of $gtmtls_passwd_* is found. */
					eq = strchr(ptr, '=');
					if (0 != strlen(eq + 1))
						break; /* Set to non-empty string. No need to initialize the library now. */
					/* Set to empty string. */
					if (NULL == tls_ctx)
					{
						if (SS_NORMAL != (status = gtm_tls_loadlibrary()))
						{
							rts_error_csa(CSA_ARG(NULL) VARLSTCNT(6) ERR_TLSDLLNOOPEN, 0,
									ERR_TEXT, 2, LEN_AND_STR(dl_err));
						}
						if (NULL == (tls_ctx = gtm_tls_init(GTM_TLS_API_VERSION,
											GTMTLS_OP_INTERACTIVE_MODE)))
						{
							rts_error_csa(CSA_ARG(NULL) VARLSTCNT(6) ERR_TLSINIT, 0,
									ERR_TEXT, 2, LEN_AND_STR(gtm_tls_get_error()));
						}
					}
					assert(NULL != tls_ctx);
					assert((MAX_TLSID_LEN * 2) > (int)(eq - ptr));
					memcpy(tlsid_env_name, ptr, (int)(eq - ptr));
					tlsid_env_name[(int)(eq - ptr)] = '\0';
					gtm_tls_prefetch_passwd(tls_ctx, tlsid_env_name);
				}
			}
		}
#		endif
	}
#	endif
	dm_start();
	return 0;
}
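/* A self-contained sketch (not GT.M source) of the environment scan above: walk envp looking for
 * variables whose names begin with the gtmtls_passwd_ prefix and whose values are empty, and hand
 * each such name to a password-prefetch routine.  prefetch_passwd() is an illustrative stand-in for
 * gtm_tls_prefetch_passwd(); unlike the original, this sketch keeps scanning instead of stopping at
 * the first non-empty value.
 */
#include <stdio.h>
#include <string.h>

static void	prefetch_passwd(const char *name)
{
	printf("would prefetch password for %s\n", name);
}

static void	scan_empty_tls_passwds(char **envp)
{
	static const char	prefix[] = "gtmtls_passwd_";
	char			**p, *eq, name[256];
	size_t			namelen;

	for (p = envp; *p; p++)
	{
		if (0 != strncmp(*p, prefix, sizeof(prefix) - 1))
			continue;			/* not a $gtmtls_passwd_* variable */
		eq = strchr(*p, '=');
		if ((NULL == eq) || ('\0' != eq[1]))
			continue;			/* unset form or non-empty value: nothing to do here */
		namelen = (size_t)(eq - *p);
		if (namelen >= sizeof(name))
			continue;			/* defensively skip absurdly long names */
		memcpy(name, *p, namelen);
		name[namelen] = '\0';
		prefetch_passwd(name);			/* analogue of gtm_tls_prefetch_passwd(tls_ctx, id) */
	}
}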
Example #7
sm_uc_ptr_t t_qread(block_id blk, sm_int_ptr_t cycle, cache_rec_ptr_ptr_t cr_out)
	/* cycle is used in t_end to detect if the buffer has been refreshed since the t_qread */
{
	int4			status;
	uint4			blocking_pid;
	cache_rec_ptr_t		cr;
	bt_rec_ptr_t		bt;
	boolean_t		clustered, hold_onto_crit, was_crit, issued_db_init_crypt_warning, sync_needed;
	int			dummy, lcnt, ocnt;
	cw_set_element		*cse;
	off_chain		chain1;
	register sgmnt_addrs	*csa;
	register sgmnt_data_ptr_t	csd;
	enum db_ver		ondsk_blkver;
	int4			dummy_errno, gtmcrypt_errno;
	boolean_t		already_built, is_mm, reset_first_tp_srch_status, set_wc_blocked, sleep_invoked;
	ht_ent_int4		*tabent;
	srch_blk_status		*blkhist;
	trans_num		dirty, blkhdrtn;
	sm_uc_ptr_t		buffaddr;
	uint4			stuck_cnt = 0;
	boolean_t		lcl_blk_free;
	node_local_ptr_t	cnl;
	gd_segment		*seg;
	uint4			buffs_per_flush, flush_target;
	enc_info_t		*encr_ptr;
	DCL_THREADGBL_ACCESS;

	SETUP_THREADGBL_ACCESS;
	lcl_blk_free = block_is_free;
	block_is_free = FALSE;	/* Reset to FALSE so that if t_qread fails below, we don't have an incorrect state of this var */
	first_tp_srch_status = NULL;
	reset_first_tp_srch_status = FALSE;
	csa = cs_addrs;
	csd = csa->hdr;
	INCR_DB_CSH_COUNTER(csa, n_t_qreads, 1);
	is_mm = (dba_mm == csd->acc_meth);
	/* We better hold crit in the final retry (TP & non-TP). Only exception is journal recovery */
	assert((t_tries < CDB_STAGNATE) || csa->now_crit || mupip_jnl_recover);
	if (dollar_tlevel)
	{
		assert(sgm_info_ptr);
		if (0 != sgm_info_ptr->cw_set_depth)
		{
			chain1 = *(off_chain *)&blk;
			if (1 == chain1.flag)
			{
				assert(sgm_info_ptr->cw_set_depth);
				if ((int)chain1.cw_index < sgm_info_ptr->cw_set_depth)
					tp_get_cw(sgm_info_ptr->first_cw_set, (int)chain1.cw_index, &cse);
				else
				{
					assert(FALSE == csa->now_crit);
					rdfail_detail = cdb_sc_blknumerr;
					return (sm_uc_ptr_t)NULL;
				}
			} else
			{
				if (NULL != (tabent = lookup_hashtab_int4(sgm_info_ptr->blks_in_use, (uint4 *)&blk)))
					first_tp_srch_status = tabent->value;
				else
					first_tp_srch_status = NULL;
				ASSERT_IS_WITHIN_TP_HIST_ARRAY_BOUNDS(first_tp_srch_status, sgm_info_ptr);
				cse = first_tp_srch_status ? first_tp_srch_status->cse : NULL;
			}
			assert(!cse || !cse->high_tlevel);
			assert(!chain1.flag || cse);
			if (cse)
			{	/* transaction has modified the sought after block  */
				if ((gds_t_committed != cse->mode) || (n_gds_t_op < cse->old_mode))
				{	/* Changes have not been committed to shared memory, i.e. still in private memory.
					 * Build block in private buffer if not already done and return the same.
					 */
					assert(gds_t_writemap != cse->mode);
					if (FALSE == cse->done)
					{	/* out of date, so make it current */
						assert(gds_t_committed != cse->mode);
						already_built = (NULL != cse->new_buff);
						/* Validate the block's search history right after building a private copy.
						 * This is not needed in case gvcst_search is going to reuse the clue's search
						 * history and return (because tp_hist will do the validation of this block).
						 * But if gvcst_search decides to do a fresh traversal (because the clue does not
						 * cover the path of the current input key etc.) the block build that happened now
						 * will not get validated in tp_hist since it will instead be given the current
						 * key's search history path (a totally new path) for validation. Since a private
						 * copy of the block has been built, tp_tend would also skip validating this block
						 * so it is necessary that we validate the block right here. Since it is tricky to
						 * accurately differentiate between the two cases, we do the validation
						 * unconditionally here (besides it is only a few if checks done per block build
						 * so it is considered okay performance-wise).
						 */
						gvcst_blk_build(cse, (uchar_ptr_t)cse->new_buff, 0);
						assert(NULL != cse->blk_target);
						if (!already_built && !chain1.flag)
						{
							buffaddr = first_tp_srch_status->buffaddr;
							cr = first_tp_srch_status->cr;
							assert((is_mm || cr) && buffaddr);
							blkhdrtn = ((blk_hdr_ptr_t)buffaddr)->tn;
							if (TP_IS_CDB_SC_BLKMOD3(cr, first_tp_srch_status, blkhdrtn))
							{
								assert(CDB_STAGNATE > t_tries);
								rdfail_detail = cdb_sc_blkmod;	/* should this be something else */
								TP_TRACE_HIST_MOD(blk, gv_target, tp_blkmod_t_qread, cs_data,
									first_tp_srch_status->tn, blkhdrtn,
									((blk_hdr_ptr_t)buffaddr)->levl);
								return (sm_uc_ptr_t)NULL;
							}
							if (!is_mm && ((first_tp_srch_status->cycle != cr->cycle)
										|| (first_tp_srch_status->blk_num != cr->blk)))
							{
								assert(CDB_STAGNATE > t_tries);
								rdfail_detail = cdb_sc_lostcr; /* should this be something else */
								return (sm_uc_ptr_t)NULL;
							}
						}
						cse->done = TRUE;
					}
					*cycle = CYCLE_PVT_COPY;
					*cr_out = 0;
					return (sm_uc_ptr_t)cse->new_buff;
				} else
				{	/* Block changes are already committed to shared memory (possible if we are in TP
					 * in the 2nd phase of M-Kill in gvcst_expand_free_subtree.c). In this case, read
					 * block from shared memory; do not look at private memory (i.e. cse) as that might
					 * not be as up to date as shared memory.
					 */
					assert(csa->now_crit);	/* gvcst_expand_free_subtree does t_qread in crit */
					/* If this block was newly created as part of the TP transaction, it should not be killed
					 * as part of the 2nd phase of M-kill. This is because otherwise the block's cse would
					 * have had an old_mode of kill_t_create in which case we would not have come into this
					 * else block. Assert accordingly.
					 */
					assert(!chain1.flag);
					first_tp_srch_status = NULL;	/* do not use any previous srch_hist information */
				}
			}
		} else
		{
			if (NULL != (tabent = lookup_hashtab_int4(sgm_info_ptr->blks_in_use, (uint4 *)&blk)))
				first_tp_srch_status = tabent->value;
			else
				first_tp_srch_status = NULL;
		}
		ASSERT_IS_WITHIN_TP_HIST_ARRAY_BOUNDS(first_tp_srch_status, sgm_info_ptr);
		if (!is_mm && first_tp_srch_status)
		{
			cr = first_tp_srch_status->cr;
			assert(cr && !first_tp_srch_status->cse);
			if (first_tp_srch_status->cycle == cr->cycle)
			{
				*cycle = first_tp_srch_status->cycle;
				*cr_out = cr;
				cr->refer = TRUE;
				if (CDB_STAGNATE <= t_tries)	/* mu_reorg doesn't use TP else should have an || for that */
					CWS_INSERT(blk);
				return (sm_uc_ptr_t)first_tp_srch_status->buffaddr;
			} else
			{	/* Block was already part of the read-set of this transaction, but got recycled in the cache.
				 * Allow block recycling by resetting first_tp_srch_status for this blk to reflect the new
				 * buffer, cycle and cache-record. tp_hist (invoked much later) has validation checks to detect
				 * if block recycling happened within the same mini-action and restart in that case.
				 * Updating first_tp_srch_status has to wait until the end of t_qread since only then do we know
				 * the values to update to. Set a variable that will enable the update before returning.
				 * Also assert that if we are in the final retry, we are never in a situation where we have a
				 * block that got recycled since the start of the current mini-action. This is easily detected since
				 * as part of the final retry we maintain a hash-table "cw_stagnate" that holds the blocks that
				 * have been read as part of the current mini-action until now.
				 */
				assert(CDB_STAGNATE > t_tries || (NULL == lookup_hashtab_int4(&cw_stagnate, (uint4 *)&blk)));
				reset_first_tp_srch_status = TRUE;
			}
		}
	}
	if ((uint4)blk >= (uint4)csa->ti->total_blks)
	{	/* Requested block out of range; could occur because of a concurrency conflict. mm_read and dsk_read assume blk is
		 * never negative or greater than the maximum possible file size. If a concurrent REORG truncates the file, t_qread
		 * can proceed despite blk being greater than total_blks. But dsk_read handles this fine; see comments below.
		 */
		assert((&FILE_INFO(gv_cur_region)->s_addrs == csa) && (csd == cs_data));
		assert(!csa->now_crit);
		rdfail_detail = cdb_sc_blknumerr;
		return (sm_uc_ptr_t)NULL;
	}
	if (is_mm)
	{
		*cycle = CYCLE_SHRD_COPY;
		*cr_out = 0;
		return (sm_uc_ptr_t)(mm_read(blk));
	}
	was_crit = csa->now_crit;
	cnl = csa->nl;
	encr_ptr = csa->encr_ptr;
	if (NULL != encr_ptr)
	{
		/* If this is an encrypted database and we hold crit, make sure our private cycle matches the shared cycle.
		 * Or else we would need to call "process_reorg_encrypt_restart" below (a heavyweight operation) holding crit.
		 */
		assert(!was_crit || (cnl->reorg_encrypt_cycle == encr_ptr->reorg_encrypt_cycle));
		seg = gv_cur_region->dyn.addr;
		issued_db_init_crypt_warning = encr_ptr->issued_db_init_crypt_warning;
		if (!IS_BITMAP_BLK(blk) && issued_db_init_crypt_warning)
		{	/* A non-GT.M process is attempting to read a non-bitmap block, yet it has previously encountered an error
			 * during db_init (because it did not have access to the encryption keys) and reported it with a -W-
			 * severity. Since the block it is attempting to read can be in the unencrypted shared memory (read from
			 * disk by another process with access to the encryption keys), we cannot let it access it without a valid
			 * handle, so issue an rts_error.
			 *
			 * TODO: DSE and LKE could bypass getting the ftok semaphore. LKE is not an issue, but DSE does care about
			 *       the csa->reorg_encrypt_cycle. So it means DSE could get an inconsistent copy of reorg_encrypt_cycle
			 *       and associated hashes if it had done a bypass and a concurrent REORG -ENCRYPT is holding the ftok
			 *       semaphore and changing these values at the same time.
			 */
			assert(!IS_GTM_IMAGE);	/* GT.M would have error'ed out in db_init */
			gtmcrypt_errno = SET_REPEAT_MSG_MASK(SET_CRYPTERR_MASK(ERR_CRYPTBADCONFIG));
			GTMCRYPT_REPORT_ERROR(gtmcrypt_errno, rts_error, seg->fname_len, seg->fname);
		} else if (cnl->reorg_encrypt_cycle != encr_ptr->reorg_encrypt_cycle)
		{	/* A concurrent MUPIP REORG ENCRYPT occurred. Cannot proceed with the read even if the block is
			 * already loaded from disk into the unencrypted global buffers (security issue). Need to load the
			 * new encryption keys and only let those processes which are able to successfully do this proceed
			 * with the read. First, copy the key hashes from csd into csa->encr_ptr. That needs crit
			 * to ensure a concurrent MUPIP REORG ENCRYPT does not sneak in.
			 *
			 * Note: Even though we asserted a few lines above that if "was_crit" is TRUE, then we expect
			 * the encryption cycles to be in sync, we handle this out-of-design situation in "pro" by fixing
			 * the cycles while holding crit (hopefully rare case so it is okay to hold crit for a heavyweight call).
			 */
			if (!was_crit)
				grab_crit(gv_cur_region);
			/* Now that we have crit, sync them up by copying the new keys inside crit and opening the key handles
			 * outside crit (a potentially long running operation).
			 */
			SIGNAL_REORG_ENCRYPT_RESTART(mu_reorg_encrypt_in_prog, reorg_encrypt_restart_csa,
					cnl, csa, csd, rdfail_detail, process_id);
			assert(csa == reorg_encrypt_restart_csa);
			if (!was_crit)
				rel_crit(gv_cur_region);
			/* If we are inside a TP read-write transaction, it is possible we already used the old keys for
			 * prior calls to "jnl_format" so we have to restart (cannot sync up cycles). Do the same for
			 * TP read-only transaction as well as NON-TP read-write transaction. In all these cases we know
			 * the caller is capable of restarting. In all other cases we don't know if the caller is capable, so
			 * sync up the cycles and proceed using the new keys for the read.
			 *
			 * But since it is possible the caller does not call t_retry right away (e.g. mupip reorg which can
			 * choose to abandon this tree path and move on to another block without aborting this transaction)
			 * it is better we finish the pending call to "process_reorg_encrypt_restart" right here before returning.
			 */
			process_reorg_encrypt_restart();
			assert(NULL == reorg_encrypt_restart_csa);
			if (IS_NOT_SAFE_TO_SYNC_NEW_KEYS(dollar_tlevel, update_trans))
			{
				assert(cdb_sc_reorg_encrypt == rdfail_detail);	/* set by SIGNAL_REORG_ENCRYPT_RESTART macro */
				return (sm_uc_ptr_t)NULL;
			}
		}
	}
	assert(dba_bg == csd->acc_meth);
	assert(!first_tp_srch_status || !first_tp_srch_status->cr
					|| first_tp_srch_status->cycle != first_tp_srch_status->cr->cycle);
	if (FALSE == (clustered = csd->clustered))
		bt = NULL;
	ocnt = 0;
	set_wc_blocked = FALSE;	/* to indicate whether cnl->wc_blocked was set to TRUE by us */
	hold_onto_crit = csa->hold_onto_crit;	/* note down in local to avoid csa-> dereference in multiple usages below */
	do
	{
		if (NULL == (cr = db_csh_get(blk)))
		{	/* not in memory */
			if (clustered && (NULL != (bt = bt_get(blk))) && (FALSE == bt->flushing))
				bt = NULL;
			if (!csa->now_crit)
			{
				assert(!hold_onto_crit);
				if (NULL != bt)
				{	/* at this point, bt is not NULL only if clustered and flushing - wait no crit */
					assert(clustered);
					wait_for_block_flush(bt, blk);	/* try for no other node currently writing the block */
				}
				/* assume defaults for flush_target and buffs_per_flush */
				flush_target = csd->flush_trigger;
				buffs_per_flush = 0;
				if ((0 != csd->epoch_taper) && (FALSE == gv_cur_region->read_only) && JNL_ENABLED(csd) &&
						(0 != cnl->wcs_active_lvl) && (NOJNL != csa->jnl->channel) &&
						(0 != cnl->jnl_file.u.inode) && csd->jnl_before_image)
				{
					EPOCH_TAPER_IF_NEEDED(csa, csd, cnl, (gd_region *) 0, FALSE, buffs_per_flush, flush_target);
				}
				if ((flush_target <= cnl->wcs_active_lvl) && (FALSE == gv_cur_region->read_only))
					JNL_ENSURE_OPEN_WCS_WTSTART(csa, gv_cur_region, buffs_per_flush, dummy_errno);
						/* a macro that dclast's "wcs_wtstart" and checks for errors etc. */
				/* Get crit but also ensure encryption cycles are in sync ("dsk_read" relies on this).
				 * Note: "sync_needed" should be TRUE very rarely since we synced the cycles just a few lines
				 * above. But in case a MUPIP REORG ENCRYPT concurrently sneaked in between these lines we
				 * need to resync.
				 */
				sync_needed = grab_crit_encr_cycle_sync(gv_cur_region);
				assert(NULL == reorg_encrypt_restart_csa);
				assert(!sync_needed || (NULL != encr_ptr));
				if (sync_needed && IS_NOT_SAFE_TO_SYNC_NEW_KEYS(dollar_tlevel, update_trans))
				{
					assert(cnl->reorg_encrypt_cycle == encr_ptr->reorg_encrypt_cycle);
					rel_crit(gv_cur_region);
					rdfail_detail = cdb_sc_reorg_encrypt;	/* set by SIGNAL_REORG_ENCRYPT_RESTART macro */
					return (sm_uc_ptr_t)NULL;
				}
				cr = db_csh_get(blk);			/* in case blk arrived before crit */
			}
			if (clustered && (NULL != (bt = bt_get(blk))) && (TRUE == bt->flushing))
			{	/* Once crit, need to assure that if clustered, that flushing is [still] complete
				 * If it isn't, we missed an entire WM cycle and have to wait for another node to finish */
				wait_for_block_flush(bt, blk);	/* ensure no other node currently writing the block */
			}
			if (NULL == cr)
			{	/* really not in memory - must get a new buffer */
				assert(csa->now_crit);
				cr = db_csh_getn(blk);
				if (CR_NOTVALID == (sm_long_t)cr)
				{
					assert(cnl->wc_blocked); /* only reason we currently know wcs_get_space could fail */
					assert(gtm_white_box_test_case_enabled);
					SET_TRACEABLE_VAR(cnl->wc_blocked, TRUE);
					BG_TRACE_PRO_ANY(csa, wc_blocked_t_qread_db_csh_getn_invalid_blk);
					set_wc_blocked = TRUE;
					break;
				}
				assert(0 <= cr->read_in_progress);
				*cycle = cr->cycle;
				cr->tn = csd->trans_hist.curr_tn;
				/* Record history of most recent disk reads only in dbg builds for now. Although the macro
				 * is just a couple dozen instructions, it is done while holding crit so we want to avoid
				 * delaying crit unless really necessary. Whoever wants this information can enable it
				 * by a build change to remove the DEBUG_ONLY part below.
				 */
				DEBUG_ONLY(DSKREAD_TRACE(csa, GDS_ANY_ABS2REL(csa,cr), cr->tn, process_id, blk, cr->cycle);)
				if (!was_crit && !hold_onto_crit)
					rel_crit(gv_cur_region);
				/* read outside of crit may be of a stale block but should be detected by t_end or tp_tend */
				assert(0 == cr->dirty);
				assert(cr->read_in_progress >= 0);
				CR_BUFFER_CHECK(gv_cur_region, csa, csd, cr);
				buffaddr = (sm_uc_ptr_t)GDS_REL2ABS(cr->buffaddr);
#				ifdef DEBUG
				/* stop self to test sechshr_db_clnup clears the read state */
				if (gtm_white_box_test_case_enabled
					&& (WBTEST_SIGTSTP_IN_T_QREAD == gtm_white_box_test_case_number))
				{	/* this should never fail, but because of the way we developed the test we got paranoid */
					dummy = kill(process_id, SIGTERM);
					assert(0 == dummy);
					for (dummy = 10; dummy; dummy--)
						LONG_SLEEP(10); /* time for sigterm to take hit before we clear block_now_locked */
				}
#				endif
				if (SS_NORMAL != (status = dsk_read(blk, buffaddr, &ondsk_blkver, lcl_blk_free)))
				{
					/* buffer does not contain valid data, so reset blk to be empty */
					cr->cycle++;	/* increment cycle for blk number changes (for tp_hist and others) */
					cr->blk = CR_BLKEMPTY;
					cr->r_epid = 0;
					RELEASE_BUFF_READ_LOCK(cr);
					TREF(block_now_locked) = NULL;
					assert(-1 <= cr->read_in_progress);
					assert(was_crit == csa->now_crit);
					if (ERR_DYNUPGRDFAIL == status)
					{	/* if we don't hold crit on the region, it is possible due to concurrency conflicts
						 * that this block is unused (i.e. marked free/recycled in bitmap, see comments in
						 * gds_blk_upgrade.h). in this case we should not error out but instead restart.
						 */
						if (was_crit)
						{
							assert(FALSE);
							rts_error_csa(CSA_ARG(csa) VARLSTCNT(5) status, 3, blk,
									DB_LEN_STR(gv_cur_region));
						} else
						{
							rdfail_detail = cdb_sc_lostcr;
							return (sm_uc_ptr_t)NULL;
						}
					}
					if ((-1 == status) && !was_crit)
					{	/* LSEEKREAD and, consequently, dsk_read return -1 in case pread is unable to fetch
						 * a full database block's length of data. This can happen if the requested read is
						 * past the end of the file, which can happen if a concurrent truncate occurred
						 * after the blk >= csa->ti->total_blks comparison above. Allow for this scenario
						 * by restarting. However, if we've had crit the whole time, no truncate could have
						 * happened. -1 indicates a problem with the file, so fall through to DBFILERR.
						 */
						rdfail_detail = cdb_sc_truncate;
						return (sm_uc_ptr_t)NULL;
					} else if (IS_CRYPTERR_MASK(status))
					{
						seg = gv_cur_region->dyn.addr;
						GTMCRYPT_REPORT_ERROR(status, rts_error, seg->fname_len, seg->fname);
					}
					else
					{	/* A DBFILERR can be thrown for two possible reasons:
						 * (1) LSEEKREAD returned an unexpected error due to a filesystem problem; or
						 * (2) csa/cs_addrs/csd/cs_data are out of sync, and we're trying to read a block
						 * number for one region from another region with fewer total_blks.
						 *    We suspect the former is what happened in GTM-7623. Apparently the latter
						 * has been an issue before, too. If either occurs again in pro, this assertpro
						 * distinguishes the two possibilities.
						 */
						assertpro((&FILE_INFO(gv_cur_region)->s_addrs == csa) && (csd == cs_data));
						rts_error_csa(CSA_ARG(csa) VARLSTCNT(5) ERR_DBFILERR, 2, DB_LEN_STR(gv_cur_region),
								status);
					}
				}
				disk_blk_read = TRUE;
				assert(0 <= cr->read_in_progress);
				assert(0 == cr->dirty);
				/* Only set in cache if read was success */
				cr->ondsk_blkver = (lcl_blk_free ? GDSVCURR : ondsk_blkver);
				cr->r_epid = 0;
				RELEASE_BUFF_READ_LOCK(cr);
				TREF(block_now_locked) = NULL;
				assert(-1 <= cr->read_in_progress);
				*cr_out = cr;
				assert(was_crit == csa->now_crit);
				if (reset_first_tp_srch_status)
					RESET_FIRST_TP_SRCH_STATUS(first_tp_srch_status, cr, *cycle);
				return buffaddr;
			} else  if (!was_crit && (BAD_LUCK_ABOUNDS > ocnt))
			{
				assert(!hold_onto_crit);
				assert(TRUE == csa->now_crit);
				assert(cnl->in_crit == process_id);
				rel_crit(gv_cur_region);
			}
		}