Exemple #1
0
int jnl_v11tov15(uchar_ptr_t jnl_buff, uint4 *jnl_len, uchar_ptr_t conv_buff, uint4 *conv_len, uint4 conv_bufsiz)
{
	/* Convert a transaction from jnl version 11 (V4.2-002) to 15 (V.4.4-002)  */

	unsigned char		*jb, *cb, *cstart, *jstart, rectype;
	int			status, reclen;
	unsigned short		key_len;
	unsigned int		long_data_len, jlen, total_data, nzeros, conv_reclen, clen_without_sfx, total_key;
	jrec_prefix		prefix;
	jrec_suffix		suffix;
	seq_num			jsno;

	jb = jnl_buff;
	cb = conv_buff;
	status = SS_NORMAL;
	jlen = *jnl_len;
	while (0 < jlen)
	{
		if (0 < (reclen = v11_jnl_record_length((jnl_record *)jb, jlen)))
		{
			if (reclen <= jlen)
			{
				rectype = REF_CHAR(jb + V11_JREC_TYPE_OFFSET);
				total_key = total_data = 0;
				assert(IS_REPLICATED(rectype));
				if (IS_ZTP(rectype))
					GTMASSERT;	/* ZTP not supported */
				if (IS_SET_KILL_ZKILL(rectype))
				{
					GET_USHORT(key_len, jb + V11_JREC_PREFIX_SIZE + v11_jnl_fixed_size[rectype]);
					total_key = key_len + sizeof(unsigned short);
					if (IS_SET(rectype))
					{
						GET_MSTR_LEN(long_data_len, jb + V11_JREC_PREFIX_SIZE +
								   v11_jnl_fixed_size[rectype] + total_key);
						total_data = long_data_len + sizeof(mstr_len_t);
					}
					conv_reclen = JREC_PREFIX_SIZE + FIXED_UPD_RECLEN +
						total_key + total_data + JREC_SUFFIX_SIZE;
					conv_reclen = ROUND_UP2(conv_reclen, JNL_REC_START_BNDRY);
				} else if (IS_COM(rectype))
					conv_reclen = JREC_PREFIX_SIZE + TCOM_RECLEN + JREC_SUFFIX_SIZE;
				clen_without_sfx = conv_reclen - JREC_SUFFIX_SIZE;
				if (cb - conv_buff + conv_reclen > conv_bufsiz)
				{
					repl_errno = EREPL_INTLFILTER_NOSPC;
					status = -1;
					break;
				}
				cstart = cb;
				jstart = jb;
				prefix.jrec_type = rectype;
				suffix.backptr = prefix.forwptr = conv_reclen;
				prefix.pini_addr = 0;
				prefix.time = 0;
				prefix.tn = 0;
				suffix.suffix_code = JNL_REC_SUFFIX_CODE;
				memcpy(cb, (unsigned char*)&prefix, JREC_PREFIX_SIZE);
				cb += JREC_PREFIX_SIZE;
				memcpy(cb, jb + V11_JREC_PREFIX_SIZE + V11_JNL_SEQNO_OFFSET, sizeof(seq_num));
				cb += sizeof(seq_num);
				if (IS_SET_KILL_ZKILL(rectype))
				{
					PUT_JNL_STR_LEN(cb, key_len);
					jb += (V11_JREC_PREFIX_SIZE + V11_MUMPS_NODE_OFFSET + sizeof(unsigned short));
					if (IS_FENCED(rectype))
						jb += TP_TOKEN_TID_SIZE;
					cb += sizeof(jnl_str_len_t);
					memcpy(cb, jb, key_len);
					cb += key_len;
					jb += key_len;
					if (IS_SET(rectype))
					{
						PUT_MSTR_LEN(cb, long_data_len);
						cb += sizeof(mstr_len_t);
						jb += sizeof(mstr_len_t);
						memcpy(cb, jb, long_data_len);
						cb += long_data_len;
					}
				} else if (IS_COM(rectype))
				{
					assert(JRT_TCOM == rectype);
					memset(cb, 0, TID_STR_SIZE);
					cb += TID_STR_SIZE;
					memcpy(cb, jb + V11_JREC_PREFIX_SIZE + V11_TCOM_PARTICIPANTS_OFFSET, sizeof(uint4));
					cb += sizeof(uint4);
				} else
					assert(FALSE);
				nzeros = (cstart + clen_without_sfx - cb);
				if (nzeros > 0)
				{
					memset(cb, 0, nzeros);
					cb += nzeros;
				}
				jb = jstart + reclen;
				memcpy(cb, (unsigned char*)&suffix, JREC_SUFFIX_SIZE);
				cb += JREC_SUFFIX_SIZE;
				assert(cb == cstart + conv_reclen);
				jlen -= reclen;
				continue;
			}
			repl_errno = EREPL_INTLFILTER_INCMPLREC;
			status = -1;
			break;
		}
		repl_errno = EREPL_INTLFILTER_BADREC;
		status = -1;
		break;
	}
	assert(0 == jlen || -1 == status);
	*jnl_len = jb - jnl_buff;
	*conv_len = cb - conv_buff;
	return(status);
}
uint4	mur_forward_play_cur_jrec(reg_ctl_list *rctl)
{
	boolean_t		process_losttn;
	boolean_t		is_set_kill_zkill_ztworm_lgtrig_ztrig, is_set_kill_zkill_ztrig;
	trans_num		curr_tn;
	enum jnl_record_type	rectype;
	enum rec_fence_type	rec_fence;
	enum broken_type	recstat;
	jnl_tm_t		rec_time;
	uint4			status;
	mval			mv;
	seq_num 		rec_token_seq, rec_strm_seqno, resync_strm_seqno;
	jnl_record		*rec;
	jnl_string		*keystr;
	multi_struct 		*multi;
	jnl_ctl_list		*jctl;
	ht_ent_mname		*tabent;
	mname_entry	 	gvent;
	gvnh_reg_t		*gvnh_reg;
	pini_list_struct	*plst;
	int4			gtmcrypt_errno;
	boolean_t		use_new_key;
	forw_multi_struct	*forw_multi;
#	if (defined(DEBUG) && defined(UNIX))
	int4			strm_idx;
#	endif
	DCL_THREADGBL_ACCESS;

	SETUP_THREADGBL_ACCESS;
	assert(!rctl->forw_eof_seen);
	if (multi_proc_in_use)
	{	/* Set key to print this rctl's region-name as prefix in case this forked off process prints any output.
		 * e.g. If this function ends up calling t_end/op_tcommit which in turn needs to do a jnl autoswitch
		 * inside jnl_file_extend and prints a GTM-I-FILERENAME message.
		 */
		MUR_SET_MULTI_PROC_KEY(rctl, multi_proc_key);
	}
	jctl = rctl->jctl;
	/* Ensure we never DOUBLE process the same journal record in the forward phase */
	assert((jctl != rctl->last_processed_jctl) || (jctl->rec_offset != rctl->last_processed_rec_offset));
#	ifdef DEBUG
	rctl->last_processed_jctl = jctl;
	rctl->last_processed_rec_offset = jctl->rec_offset;
#	endif
	rec = rctl->mur_desc->jnlrec;
	rectype = (enum jnl_record_type)rec->prefix.jrec_type;
	rec_time = rec->prefix.time;
	assert(rec_time <= mur_options.before_time);
	assert(rec_time >= mur_options.after_time);
	assert((0 == mur_options.after_time) || (mur_options.forward && !rctl->db_updated));
	is_set_kill_zkill_ztworm_lgtrig_ztrig = (boolean_t)(IS_SET_KILL_ZKILL_ZTWORM_LGTRIG_ZTRIG(rectype));
	if (is_set_kill_zkill_ztworm_lgtrig_ztrig)
	{
		keystr = (jnl_string *)&rec->jrec_set_kill.mumps_node;
		if (USES_ANY_KEY(jctl->jfh))
		{
			use_new_key = USES_NEW_KEY(jctl->jfh);
			assert(NEEDS_NEW_KEY(jctl->jfh, rec->prefix.tn) == use_new_key);
			MUR_DECRYPT_LOGICAL_RECS(
					keystr,
					(use_new_key ? TRUE : jctl->jfh->non_null_iv),
					rec->prefix.forwptr,
					(use_new_key ? jctl->encr_key_handle2 : jctl->encr_key_handle),
					gtmcrypt_errno);
			if (0 != gtmcrypt_errno)
			{
				GTMCRYPT_REPORT_ERROR(gtmcrypt_errno, gtm_putmsg, jctl->jnl_fn_len, jctl->jnl_fn);
				return gtmcrypt_errno;
			}
		}
	}
	if (mur_options.selection && !mur_select_rec(jctl))
		return SS_NORMAL;
	rec_token_seq = (REC_HAS_TOKEN_SEQ(rectype)) ? GET_JNL_SEQNO(rec) : 0;
	process_losttn = rctl->process_losttn;
	if (!process_losttn && mur_options.rollback)
	{
		if (IS_REPLICATED(rectype) && (rec_token_seq >= murgbl.losttn_seqno))
			process_losttn = rctl->process_losttn = TRUE;
#		if (defined(UNIX) && defined(DEBUG))
		if ((rec_token_seq < murgbl.losttn_seqno) && murgbl.resync_strm_seqno_nonzero && IS_REPLICATED(rectype))
		{
			assert(IS_SET_KILL_ZKILL_ZTWORM_LGTRIG_ZTRIG(rectype) || IS_COM(rectype) || (JRT_NULL == (rectype)));
			assert(&rec->jrec_set_kill.strm_seqno == &rec->jrec_null.strm_seqno);
			assert(&rec->jrec_set_kill.strm_seqno == &rec->jrec_tcom.strm_seqno);
			rec_strm_seqno = GET_STRM_SEQNO(rec);
			if (rec_strm_seqno)
			{
				strm_idx = GET_STRM_INDEX(rec_strm_seqno);
				rec_strm_seqno = GET_STRM_SEQ60(rec_strm_seqno);
				resync_strm_seqno = murgbl.resync_strm_seqno[strm_idx];
				assert(!resync_strm_seqno || (rec_strm_seqno < resync_strm_seqno));
			}
		}
#		endif
	}
	/* Note: Broken transaction determination is done below only based on the records that got selected as
	 * part of the mur_options.selection criteria. Therefore depending on whether a broken transaction gets
	 * selected or not, future complete transactions might either go to the lost transaction or extract file.
	 */
	recstat = process_losttn ? LOST_TN : GOOD_TN;
	status = SS_NORMAL;
	if (FENCE_NONE != mur_options.fences)
	{
		if (IS_FENCED(rectype))
		{
			assert(rec_token_seq);
#			ifdef DEBUG
			/* assert that all TP records before min_broken_time are not broken */
			if (IS_TP(rectype) && ((!mur_options.rollback && rec_time < murgbl.min_broken_time)
						|| (mur_options.rollback && rec_token_seq < murgbl.min_broken_seqno)))
			{
				rec_fence = GET_REC_FENCE_TYPE(rectype);
				if (NULL != (multi = MUR_TOKEN_LOOKUP(rec_token_seq, rec_time, rec_fence)))
				{
					assert(0 == multi->partner);
					assert(FALSE == multi->this_is_broken);
				}
			}
#			endif
			/* In most cases, the fact whether a TP tn is broken or not would have been determined already in
			 * mur_forward. In this case, rctl->forw_multi would be set appropriately. So use that to get to
			 * "multi" and avoid a hashtable lookup. If forw_multi is NULL (e.g. for ZTP or single-region TP),
			 * the hash-table lookup cannot be avoided.
			 */
			multi = NULL;
			forw_multi = rctl->forw_multi;
			if (NULL != forw_multi)
			{
				multi = forw_multi->multi;
				/* Always honor the "recstat" from the forw_multi since that has been determined taking into
				 * consideration the BROKEN_TN status of ALL participating regions.
				 */
				assert((GOOD_TN != forw_multi->recstat) || (GOOD_TN == recstat));
				recstat = forw_multi->recstat;
			} else if (IS_REC_POSSIBLY_BROKEN(rec_time, rec_token_seq))
			{
				assert(!mur_options.rollback || process_losttn);
				rec_fence = GET_REC_FENCE_TYPE(rectype);
				assert(rec_token_seq == ((struct_jrec_upd *)rec)->token_seq.token);
				multi = MUR_TOKEN_LOOKUP(rec_token_seq, rec_time, rec_fence);
				if ((NULL != multi) && (0 < multi->partner))
				{
					process_losttn = rctl->process_losttn = TRUE;
					recstat = BROKEN_TN;
				}
			}
			/* Check that if the hashtable reports a tn as GOOD, it better have had the same
			 * # of participants in the TCOM records across all the participating regions.
			 */
			assert((NULL == multi) || (BROKEN_TN == recstat) || (FALSE == multi->this_is_broken));
		} else if ((FENCE_ALWAYS == mur_options.fences) && is_set_kill_zkill_ztworm_lgtrig_ztrig)
		{
			process_losttn = rctl->process_losttn = TRUE;
			recstat = BROKEN_TN;
		}
	} else
		forw_multi = NULL;
	if (mur_options.show)
	{
		assert(SS_NORMAL == status);
		if (BROKEN_TN != recstat)
		{
			if (JRT_PFIN == rectype)
				status = mur_pini_state(jctl, rec->prefix.pini_addr, FINISHED_PROC);
			else if ((JRT_EOF != rectype)
					&& ((JRT_ALIGN != rectype) || (JNL_HDR_LEN != rec->prefix.pini_addr)))
			{	/* Note that it is possible that we have a PINI record followed by a PFIN record
				 * and later an ALIGN record with the pini_addr pointing to the original PINI
				 * record (see comment in jnl_write.c where pini_addr gets assigned to JNL_HDR_LEN)
				 * In this case we do not want the ALIGN record to cause the process to become
				 * ACTIVE although it has written a PFIN record. Hence the check above.
				 */
				status = mur_pini_state(jctl, rec->prefix.pini_addr, ACTIVE_PROC);
			}
		} else
			status = mur_pini_state(jctl, rec->prefix.pini_addr, BROKEN_PROC);
		if (SS_NORMAL != status)
			return status;	/* "mur_pini_state" failed due to bad pini_addr */
		++jctl->jnlrec_cnt[rectype];	/* for -show=STATISTICS */
	}
	if (!mur_options.update && !jgbl.mur_extract)
		return SS_NORMAL;
	if (murgbl.ok_to_update_db && IS_TUPD(rectype) && (GOOD_TN == recstat))
	{	/* Even for FENCE_NONE we apply fences. Otherwise a TUPD becomes UPD etc.
		 * If forw_multi is non-NULL, a multi-region TP transaction is being played as a SINGLE
		 * TP transaction across all the involved regions. Therefore only ONE op_tstart is done
		 * even though more than one TSET might be encountered. In this case, do not issue JNLTPNEST error.
		 */
		if (dollar_tlevel && (NULL == forw_multi))
		{
			assert(FALSE);
			murgbl.wrn_count++;
			gtm_putmsg_csa(CSA_ARG(rctl->csa) VARLSTCNT(6) ERR_JNLTPNEST, 4, jctl->jnl_fn_len,
				jctl->jnl_fn, jctl->rec_offset, &rec->prefix.tn);
			OP_TROLLBACK(0);
		}
		if (!dollar_tlevel)
		{	/* Note: op_tstart resets gv_currkey. So set gv_currkey later. */
			/* mv is used to determine transaction id. But it is ignored by recover/rollback */
			mv.mvtype = MV_STR;
			mv.str.len = 0;
			mv.str.addr = NULL;
			op_tstart(IMPLICIT_TSTART, TRUE, &mv, -1);
			DEBUG_ONLY(jgbl.max_tp_ztp_jnl_upd_num = 0;)
		}
		tp_set_sgm();	/* needed to set "sgm_info_ptr" to correspond to "rctl" */
	}
Exemple #3
0
/* This routine is called only for recover and rollback (that is, mur_options.update).
 * It applies the set/kill/zkill, tcom, inctn, and aimg records during forward processing.
 * Some fields like jnl_seqno, rec_seqno and prefix.time are saved here from original journal files.
 * Later jnl_write routines copies them to journal records instead of generating them like the runtime system */
uint4	mur_output_record(reg_ctl_list *rctl)
{
	mval			mv;
	jnl_record		*rec;
	char			*val_ptr;
	int			strm_num;
	uint4			dummy;
	off_jnl_t		pini_addr;
	jnl_string		*keystr;
	enum jnl_record_type 	rectype;
	uint4			jnl_status, status;
	pini_list_struct	*plst;
	boolean_t		jnl_enabled, was_crit;
	struct_jrec_null	null_record;
	gd_region		*reg;
	seq_num			strm_seqno;
	sgmnt_addrs		*csa;
	sgmnt_data_ptr_t	csd;
	jnl_ctl_list		*jctl;
	jnl_format_buffer	*ztworm_jfb;
	blk_hdr_ptr_t		aimg_blk_ptr;
	int			in_len, gtmcrypt_errno;
	boolean_t		use_new_key;
	DCL_THREADGBL_ACCESS;

	SETUP_THREADGBL_ACCESS;
	assert(mur_options.update);
	rec = rctl->mur_desc->jnlrec;
	rectype = (enum jnl_record_type)rec->prefix.jrec_type;
	switch (rectype)
	{
		case JRT_ALIGN:
		case JRT_EOF:
		case JRT_EPOCH:
		case JRT_PBLK:
		case JRT_PINI:
		case JRT_TRUNC:
			return SS_NORMAL;
			break;
		default:
			break;
	}
	jgbl.gbl_jrec_time = rec->prefix.time;
	pini_addr = rec->prefix.pini_addr;
	reg = rctl->gd;
	jctl = rctl->jctl;
	assert(jctl->reg_ctl == rctl);
	assert(gv_cur_region == reg);
	csa = rctl->csa;
	assert(cs_addrs == csa);
	csd = csa->hdr;
	assert(cs_data == csd);
	jnl_enabled = JNL_ENABLED(csa);
	if (jnl_enabled)
	{
		status = mur_get_pini(jctl, pini_addr, &plst);
		if (SS_NORMAL != status)
			return status;
		prc_vec = &plst->jpv;
		csa->jnl->pini_addr = plst->new_pini_addr;
		rctl->mur_plst = plst;
	}
	if (mur_options.rollback && IS_REPLICATED(rectype))
	{
		jgbl.mur_jrec_seqno = GET_JNL_SEQNO(rec);
		if (jgbl.mur_jrec_seqno >= murgbl.consist_jnl_seqno)
		{
			assert(murgbl.losttn_seqno >= (jgbl.mur_jrec_seqno + 1));
			murgbl.consist_jnl_seqno = jgbl.mur_jrec_seqno + 1;
		}
		jgbl.mur_jrec_strm_seqno = GET_STRM_SEQNO(rec);
		strm_seqno = jgbl.mur_jrec_strm_seqno;
		if (strm_seqno)
		{	/* maintain csd->strm_reg_seqno */
			strm_num = GET_STRM_INDEX(strm_seqno);
			strm_seqno = GET_STRM_SEQ60(strm_seqno);
			assert(csd->strm_reg_seqno[strm_num] <= (strm_seqno + 1));
			csd->strm_reg_seqno[strm_num] = strm_seqno + 1;
		}
	}
	/* Assert that TREF(gd_targ_gvnh_reg) is NULL for every update that journal recovery/rollback plays forward;
	 * This is necessary to ensure every update is played in only the database file where the journal record is seen
	 * instead of across all regions that span the particular global reference. For example if ^a(1) spans db files
	 * a.dat and b.dat, and a KILL ^a(1) is done at the user level, we would see KILL ^a(1) journal records in a.mjl
	 * and b.mjl. When journal recovery processes the journal record in a.mjl, it should do the kill only in a.dat
	 * When it gets to the same journal record in b.mjl, it would do the same kill in b.dat and effectively complete
	 * the user level KILL ^a(1). If instead recovery does the KILL across all spanned regions, we would be basically
	 * doing duplicate work let alone do it out-of-order since recovery goes region by region for the most part.
	 */
	assert(NULL == TREF(gd_targ_gvnh_reg));
	if (IS_SET_KILL_ZKILL_ZTRIG(rectype))
	{	/* TP and non-TP has same format */
		keystr = (jnl_string *)&rec->jrec_set_kill.mumps_node;
		if (jnl_enabled)
		{
			MUR_SET_JNL_FENCE_CTL_TOKEN(rec->jrec_set_kill.token_seq.token, rctl);
			jnl_fence_ctl.strm_seqno = rec->jrec_set_kill.strm_seqno;
			jgbl.tp_ztp_jnl_upd_num = rec->jrec_set_kill.update_num;
			DEBUG_ONLY(jgbl.max_tp_ztp_jnl_upd_num = MAX(jgbl.max_tp_ztp_jnl_upd_num, jgbl.tp_ztp_jnl_upd_num);)
			jgbl.mur_jrec_nodeflags = keystr->nodeflags;
		}
		if (IS_FENCED(rectype))
		{	/* Even for FENCE_NONE we apply fences. Otherwise an [F/G/T/U]UPD becomes UPD etc. */
			/* op_tstart is called in "mur_forward_play_cur_jrec" already */
			if (IS_FUPD(rectype))
			{
				jnl_fence_ctl.level = 1;
				if (jnl_enabled)
				{
					jnl_fence_ctl.fence_list = JNL_FENCE_LIST_END;
					csa->next_fenced = NULL;
				}
			} else if (IS_GUPD(rectype))
			{
				jnl_fence_ctl.level = 1;
				if (jnl_enabled)
				{
					jnl_fence_ctl.fence_list = csa;
					csa->next_fenced = JNL_FENCE_LIST_END;
				}
			} else if (IS_TP(rectype))
				tp_set_sgm();
		}
#		ifdef GTM_TRIGGER
		/* Check if ^#t and if so need to increment trigger cycle in file header. Note that the below 'if' check could cause
		 * csd->db_trigger_cycle to be incremented even for the region that actually did NOT get any trigger updates. This
		 * is because some of the ^#t subscripts (like ^#t(#TNAME)) go to the DEFAULT region. So, even though a trigger was
		 * loaded only for ^a (corresponding to AREG), csd->db_trigger_cycle will be incremented for DEFAULT region as well.
		 * To avoid this, the below check should be modified to set csa->incr_db_trigger_cycle only if the ^#t subscript
		 * does not begin with '#' (similar to what is done in UPD_GV_BIND_NAME_APPROPRIATE). However, since journal
		 * recovery operates in standalone mode, the db_trigger_cycle increment to DEFAULT region should be okay since it
		 * will NOT cause any restarts
		 */
		if (IS_GVKEY_HASHT_GBLNAME(keystr->length, keystr->text))
		{
			assert(cs_addrs == csa);
			csa->incr_db_trigger_cycle = TRUE;
		}
#		endif
		if (IS_SET(rectype))
		{
			val_ptr = &keystr->text[keystr->length];
			GET_MSTR_LEN(mv.str.len, val_ptr);
			mv.str.addr = val_ptr + SIZEOF(mstr_len_t);
			mv.mvtype = MV_STR;
			op_gvput(&mv);
		} else if (IS_KILL(rectype))
		{
			if (IS_TP(rectype))
				tp_set_sgm();
			op_gvkill();
#		ifdef GTM_TRIGGER
		} else if (IS_ZTRIG(rectype))
		{
			if (IS_TP(rectype))
				tp_set_sgm();
			op_ztrigger();
#		endif
		} else
		{
			assert(IS_ZKILL(rectype));
			if (IS_TP(rectype))
				tp_set_sgm();
			op_gvzwithdraw();
		}
		if (IS_ZTP(rectype))
		{	/* Even for FENCE_NONE we apply fences. Otherwise an FUPD/GUPD becomes UPD etc. */
			assert(jnl_enabled || (JNL_FENCE_LIST_END == jnl_fence_ctl.fence_list && NULL == csa->next_fenced));
			jnl_fence_ctl.level = 0;
			if (jnl_enabled)
			{
				jnl_fence_ctl.fence_list = JNL_FENCE_LIST_END;
				csa->next_fenced = NULL;
			}
		}
		return SS_NORMAL;
	}