Example #1
/* Completes an already-formatted logical journal record held in jfb->buff and writes it out.
 * This is called for TP and non-TP, but not for ZTP.
 *
 * The record prefix (pini_addr, tn, time), the token/seqno and the stream seqno are filled in, the record
 * checksum is computed, the same header fields are mirrored into jfb->alt_buff when
 * REPL_ALLOWED(csa) && USES_ANY_KEY(csa->hdr), and the record is handed to JNL_WRITE_APPROPRIATE.
 *
 * Parameters:
 *	csa	 - supplies csa->jnl (journal private control), csa->ti->curr_tn and csa->hdr
 *	jfb	 - format buffer; jfb->buff holds the record, jfb->rectype must be a
 *		   SET/KILL/ZKILL/ZTWORM/LGTRIG/ZTRIG type or JRT_NULL (asserted below)
 *	com_csum - passed to COMPUTE_LOGICAL_REC_CHECKSUM together with jfb->checksum
 *	jplctx	 - passed through unchanged to JNL_WRITE_APPROPRIATE
 */
void	jnl_write_logical(sgmnt_addrs *csa, jnl_format_buffer *jfb, uint4 com_csum, jnlpool_write_ctx_t *jplctx)
{
	struct_jrec_upd		*jrec;
	struct_jrec_null	*jrec_null;	/* never assigned; only named inside SIZEOF() in the layout asserts */
	struct_jrec_upd		*jrec_alt;
	jnl_private_control	*jpc;

	/* If REPL_WAS_ENABLED(csa) is TRUE, then we would not have gone through the code that initializes
	 * jgbl.gbl_jrec_time or jpc->pini_addr. But in this case, we are not writing the journal record
	 * to the journal buffer or journal file but write it only to the journal pool from where it gets
	 * sent across to the update process that does not care about these fields so it is ok to leave them as is.
	 */
	jpc = csa->jnl;
	assert((0 != jpc->pini_addr) || REPL_WAS_ENABLED(csa));
	assert(jgbl.gbl_jrec_time || REPL_WAS_ENABLED(csa));
	assert(IS_SET_KILL_ZKILL_ZTWORM_LGTRIG_ZTRIG(jfb->rectype) || (JRT_NULL == jfb->rectype));
	jrec = (struct_jrec_upd *)jfb->buff;
	/* The record is accessed through the "struct_jrec_upd" layout even when it is a NULL record, so the
	 * fields written below must sit at the same offset and have the same size in both layouts.
	 */
	assert(OFFSETOF(struct_jrec_null, prefix) == OFFSETOF(struct_jrec_upd, prefix));
	assert(SIZEOF(jrec_null->prefix) == SIZEOF(jrec->prefix));
	/* Fall back to JNL_HDR_LEN when no pini_addr has been set up (see REPL_WAS_ENABLED comment above) */
	jrec->prefix.pini_addr = (0 == jpc->pini_addr) ? JNL_HDR_LEN : jpc->pini_addr;
	jrec->prefix.tn = csa->ti->curr_tn;
	jrec->prefix.time = jgbl.gbl_jrec_time;
	/* t_end/tp_tend/mur_output_record has already set token/jnl_seqno into jnl_fence_ctl.token */
	assert((0 != jnl_fence_ctl.token) || (!dollar_tlevel && !jgbl.forw_phase_recovery && !REPL_ENABLED(csa))
		|| (!dollar_tlevel && jgbl.forw_phase_recovery && (repl_open != csa->hdr->intrpt_recov_repl_state)));
	assert(OFFSETOF(struct_jrec_null, jnl_seqno) == OFFSETOF(struct_jrec_upd, token_seq));
	assert(SIZEOF(jrec_null->jnl_seqno) == SIZEOF(jrec->token_seq));
	jrec->token_seq.token = jnl_fence_ctl.token;
	assert(OFFSETOF(struct_jrec_null, strm_seqno) == OFFSETOF(struct_jrec_upd, strm_seqno));
	assert(SIZEOF(jrec_null->strm_seqno) == SIZEOF(jrec->strm_seqno));
	jrec->strm_seqno = jnl_fence_ctl.strm_seqno;
	/* update checksum below */
	if (JRT_NULL != jrec->prefix.jrec_type)
	{
		COMPUTE_LOGICAL_REC_CHECKSUM(jfb->checksum, jrec, com_csum, jrec->prefix.checksum);
	} else
		jrec->prefix.checksum = compute_checksum(INIT_CHECKSUM_SEED, (unsigned char *)jrec, SIZEOF(struct_jrec_null));
	if (REPL_ALLOWED(csa) && USES_ANY_KEY(csa->hdr))
	{	/* Keep the header fields of the alternate copy of the record in sync with the primary copy */
		jrec_alt = (struct_jrec_upd *)jfb->alt_buff;
		jrec_alt->prefix = jrec->prefix;
		jrec_alt->token_seq = jrec->token_seq;
		jrec_alt->strm_seqno = jrec->strm_seqno;
		jrec_alt->num_participants = jrec->num_participants;
	}
	JNL_WRITE_APPROPRIATE(csa, jpc, jfb->rectype, (jnl_record *)jrec, NULL, jfb, jplctx);
}
Example #2
/* Forward phase of journal recovery for the regions handled by this process.
 *
 * Phase1: for every region in mur_ctl[] (in a multi-process run, only a region this process claims through
 * shm_rctl->owning_pid) the journal position is set to JNL_HDR_LEN (mur_options.forward) or to the turn-around
 * offset, and records older than murgbl.adjusted_resolve_time are played with "mur_forward_play_cur_jrec".
 * Regions that still have records left are linked into a circular list headed by "rctl_start".
 * Phase2: the circular list is walked until it is empty; multi-region TP records are coordinated through
 * "forw_multi" and played by "mur_forward_play_multireg_tp", everything else by "mur_forward_play_cur_jrec".
 * Afterwards "mur_block_count_correct" is called for owned regions when murgbl.ok_to_update_db is set, counters
 * and extract-list data are copied to shared memory when a shm_rctl was claimed, and extract files are flushed
 * through "mur_close_file_extfmt".
 *
 * rctl    : value passed in is not used (see comment in the body); the variable is reused as a loop cursor.
 * returns : 0 if "mur_forward_multi_proc_done" was already set by an earlier invocation, else (int)status.
 *
 * NOTE(review): this listing appears to have lost brace-only lines, multi-line comment terminators and some
 * statements (for instance "goto finish" has no visible "finish" label, the "while" loop tails have no visible
 * "do", and "release_latch" is declared but never referenced). Confirm against the upstream file before relying
 * on the control flow as shown here.
 */
int	mur_forward_multi_proc(reg_ctl_list *rctl)
	boolean_t		multi_proc, this_reg_stuck, release_latch, ok_to_play;
	boolean_t		cancelled_dbsync_timer, cancelled_timer;
	reg_ctl_list		*rctl_top, *prev_rctl;
	jnl_ctl_list		*jctl;
	gd_region		*reg;
	sgmnt_addrs		*csa;
	seq_num 		rec_token_seq;
	jnl_tm_t		rec_time;
	enum broken_type	recstat;
	jnl_record		*rec;
	enum jnl_record_type	rectype;
	char			errstr[256];
	int			i, rctl_index, save_errno, num_procs_stuck, num_reg_stuck;
	uint4			status, regcnt_stuck, num_partners, start_hrtbt_cntr;
	forw_multi_struct	*forw_multi;
	shm_forw_multi_t	*sfm;
	multi_struct 		*multi;
	jnl_tm_t		adjusted_resolve_time;
	shm_reg_ctl_t		*shm_rctl_start, *shm_rctl, *first_shm_rctl;
	size_t			shm_size, reccnt, copy_size;
	int4			*size_ptr;
	char			*shmPtr; /* not using "shm_ptr" since it is already used in an AIX include file */
	int			shmid;
	multi_proc_shm_hdr_t	*mp_hdr;	/* Pointer to "multi_proc_shm_hdr_t" structure in shared memory */

	status = 0;
	/* Although we made sure the # of tasks is the same as the # of processes forked off (in the "gtm_multi_proc"
	 * invocation in "mur_forward"), it is possible one of the forked process finishes one invocation of
	 * "mur_forward_multi_proc" before even another forked process gets assigned one task in "gtm_multi_proc_helper".
	 * In this case, we would be invoked more than once. But the first invocation would have done all the needed stuff
	 * so return for later invocations.
	if (mur_forward_multi_proc_done)
		return 0;
	mur_forward_multi_proc_done = TRUE;
	/* Note: "rctl" is unused. But cannot avoid passing it since "gtm_multi_proc" expects something */
	prev_rctl = NULL;
	rctl_start = NULL;
	adjusted_resolve_time = murgbl.adjusted_resolve_time;
	assert(0 == murgbl.regcnt_remaining);
	multi_proc = multi_proc_in_use;	/* cache value in "local" to speed up access inside loops below */
	if (multi_proc)
		mp_hdr = multi_proc_shm_hdr;
		shm_rctl_start = mur_shm_hdr->shm_rctl_start;
		if (jgbl.onlnrlbk)
			for (rctl = mur_ctl, rctl_top = mur_ctl + murgbl.reg_total; rctl < rctl_top; rctl++)
				assert(rctl->csa->hold_onto_crit);	/* would have been set in parent process */
				rctl->csa->hold_onto_crit = FALSE;	/* reset since we dont own this region */
				assert(rctl->csa->now_crit);		/* would have been set in parent process */
				rctl->csa->now_crit = FALSE;		/* reset since we dont own this region */
		START_HEARTBEAT_IF_NEEDED; /* heartbeat timer needed later (in case not already started by "gtm_multi_proc") */
	first_shm_rctl = NULL;
	/* Phase1 of forward recovery starts */
	for (rctl = mur_ctl, rctl_top = mur_ctl + murgbl.reg_total; rctl < rctl_top; rctl++)
		/* Check if "rctl" is available for us or if some other concurrent process has taken it */
		if (multi_proc)
			rctl_index = rctl - &mur_ctl[0];
			shm_rctl = &shm_rctl_start[rctl_index];
			if (shm_rctl->owning_pid)
				assert(process_id != shm_rctl->owning_pid);
			for ( ; rctl < rctl_top; rctl++, shm_rctl++)
				if (shm_rctl->owning_pid)
					assert(process_id != shm_rctl->owning_pid);
				shm_rctl->owning_pid = process_id;	/* Declare ownership */
				rctl->this_pid_is_owner = TRUE;
				if (jgbl.onlnrlbk)
				{	/* This is an online rollback and crit was grabbed on all regions by the parent rollback
					 * process. But this child process now owns this region and does the actual rollback on
					 * this region so borrow crit for the duration of this child process.
					csa = rctl->csa;
					csa->hold_onto_crit = TRUE;
					csa->now_crit = TRUE;
					assert(csa->nl->in_crit == mp_hdr->parent_pid);
					csa->nl->in_crit = process_id;
					assert(csa->nl->onln_rlbk_pid == mp_hdr->parent_pid);
					csa->nl->onln_rlbk_pid = process_id;
				if (NULL == first_shm_rctl)
					first_shm_rctl = shm_rctl;
			if (rctl >= rctl_top)
				assert(rctl == rctl_top);
			/* Set key to print this rctl's region-name as prefix in case this forked off process prints any output */
			MUR_SET_MULTI_PROC_KEY(rctl, multi_proc_key);
#			ifdef MUR_DEBUG
			fprintf(stderr, "pid = %d : Owns region %s\n", process_id, multi_proc_key);
#			endif
		} else
			rctl->this_pid_is_owner = TRUE;
		if (mur_options.forward)
			assert(NULL == rctl->jctl_turn_around);
			jctl = rctl->jctl = rctl->jctl_head;
			assert(jctl->reg_ctl == rctl);
			jctl->rec_offset = JNL_HDR_LEN;
			jnl_fence_ctl.fence_list = JNL_FENCE_LIST_END; /* initialized to reflect journaling is not enabled */
			if (mur_options.rollback)
				jgbl.mur_jrec_seqno = jctl->jfh->start_seqno;
		} else
			jctl = rctl->jctl = (NULL == rctl->jctl_turn_around) ? rctl->jctl_head : rctl->jctl_turn_around;
			assert(jctl->reg_ctl == rctl);
			jctl->rec_offset = jctl->turn_around_offset;
			jgbl.mur_jrec_seqno = jctl->turn_around_seqno;
			assert((NULL != rctl->jctl_turn_around) || (0 == jctl->rec_offset));
		if (mur_options.rollback)
			if (murgbl.consist_jnl_seqno < jgbl.mur_jrec_seqno)
				/* Assert that murgbl.losttn_seqno is never lesser than jgbl.mur_jrec_seqno (the turnaround
				 * point seqno) as this is what murgbl.consist_jnl_seqno is going to be set to and will
				 * eventually be the post-rollback seqno. If this condition is violated, the result of the
				 * recovery is a compromised database (the file header will indicate a Region Seqno which
				 * is not necessarily correct since seqnos prior to it might be absent in the database).
				 * Therefore, this is an out-of-design situation with respect to rollback and so stop it.
				assert(murgbl.losttn_seqno >= jgbl.mur_jrec_seqno);
				murgbl.consist_jnl_seqno = jgbl.mur_jrec_seqno;
			assert(murgbl.consist_jnl_seqno <= murgbl.losttn_seqno);
		if (mur_options.update || mur_options.extr[GOOD_TN])
			reg = rctl->gd;
			gv_cur_region = reg;
			tp_change_reg();	/* note : sets cs_addrs to non-NULL value even if gv_cur_region->open is FALSE
						 * (cs_data could still be NULL). */
			rctl->csa = cs_addrs;
			cs_addrs->miscptr = (void *)rctl;
			rctl->csd = cs_data;
			rctl->sgm_info_ptr = cs_addrs->sgm_info_ptr;
			assert(!reg->open || (NULL != cs_addrs->dir_tree));
			gv_target = cs_addrs->dir_tree;
		jctl->after_end_of_data = FALSE;
		status = mur_next(jctl, jctl->rec_offset);
		assert(ERR_JNLREADEOF != status);	/* cannot get EOF at start of forward processing */
		if (SS_NORMAL != status)
			goto finish;
		PRINT_VERBOSE_STAT(jctl, "mur_forward:at the start");
		rctl->process_losttn = FALSE;
		/* Any multi-region TP transaction will be processed as multiple single-region TP transactions up
		 * until the tp-resolve-time is reached. From then on, they will be treated as one multi-region TP
		 * transaction. This is needed for proper lost-tn determination (any multi-region transaction that
		 * gets played in a region AFTER it has already encountered a broken tn should treat this as a lost tn).
			if (multi_proc && IS_FORCED_MULTI_PROC_EXIT(mp_hdr))
			{	/* We are at a logical point. So exit if signaled by parent */
				status = ERR_FORCEDHALT;
				goto finish;
			assert(jctl == rctl->jctl);
			rec = rctl->mur_desc->jnlrec;
			rec_time = rec->prefix.time;
			if (rec_time > mur_options.before_time)
				break;	/* Records after -BEFORE_TIME do not go to extract or losttrans or brkntrans files */
			if (rec_time < mur_options.after_time)
				status = mur_next_rec(&jctl);
				continue; /* Records before -AFTER_TIME do not go to extract or losttrans or brkntrans files */
			if (rec_time >= adjusted_resolve_time)
				break;	/* Records after this adjusted resolve_time will be processed below in phase2 */
			/* Note: Since we do hashtable token processing only for records from tp_resolve_time onwards,
			 * it is possible that if we encounter any broken transactions here we wont know they are broken
			 * but will play them as is. That is unavoidable. Specify -SINCE_TIME (for -BACKWARD rollback/recover)
			 * and -VERIFY (for -FORWARD rollback/recover) to control tp_resolve_time (and in turn more
			 * effective broken tn determination).
			status = mur_forward_play_cur_jrec(rctl);
			if (SS_NORMAL != status)
			status = mur_next_rec(&jctl);
		} while (SS_NORMAL == status);
		CHECK_IF_EOF_REACHED(rctl, status); /* sets rctl->forw_eof_seen if needed; resets "status" to SS_NORMAL */
		if (SS_NORMAL != status)
		{	/* ERR_FILENOTCREATE is possible from "mur_cre_file_extfmt" OR	ERR_FORCEDHALT is possible
			 * from "mur_forward_play_cur_jrec". No other errors are known to occur here. Assert accordingly.
			assert((ERR_FILENOTCREATE == status) || (ERR_FORCEDHALT == status));
			goto finish;
		if (rctl->forw_eof_seen)
			PRINT_VERBOSE_STAT(jctl, "mur_forward:Reached EOF before tp_resolve_time");
			continue;	/* Reached EOF before even getting to tp_resolve_time.
					 * Do not even consider region for next processing loop */
		rctl->last_tn = 0;
		murgbl.regcnt_remaining++;	/* # of regions participating in recovery at this point */
		if (NULL == rctl_start)
			rctl_start = rctl;
		if (NULL != prev_rctl)
			prev_rctl->next_rctl = rctl;
			rctl->prev_rctl = prev_rctl;
		prev_rctl = rctl;
		assert(murgbl.ok_to_update_db || !rctl->db_updated);
		PRINT_VERBOSE_STAT(jctl, "mur_forward:at tp_resolve_time");
	if (multi_proc)
		multi_proc_key = NULL;	/* reset key until it can be set to rctl's region-name again */
	/* Note that it is possible for rctl_start to be NULL at this point. That is there is no journal record in any region
	 * AFTER the calculated tp-resolve-time. This is possible if for example -AFTER_TIME was used and has a time later
	 * than any journal record in all journal files. If rctl_start is NULL, prev_rctl should also be NULL and vice versa.
	if (NULL != rctl_start)
		assert(NULL != prev_rctl);
		prev_rctl->next_rctl = rctl_start;
		rctl_start->prev_rctl = prev_rctl;
	rctl = rctl_start;
	regcnt_stuck = 0; /* # of regions we are stuck in waiting for other regions to resolve a multi-region TP transaction */
	assert((NULL == rctl) || (NULL == rctl->forw_multi));
	gv_cur_region = NULL;	/* clear out any previous value to ensure gv_cur_region/cs_addrs/cs_data
				 * all get set in sync by the MUR_CHANGE_REG macro below.
	/* Phase2 of forward recovery starts */
	while (NULL != rctl)
	{	/* while there is at least one region remaining with unprocessed journal records */
		assert(NULL != rctl_start);
		assert(0 < murgbl.regcnt_remaining);
		if (NULL != rctl->forw_multi)
		{	/* This region's current journal record is part of a TP transaction waiting for other regions */
			assert(regcnt_stuck <= murgbl.regcnt_remaining);
			if (regcnt_stuck == murgbl.regcnt_remaining)
				assertpro(multi_proc_in_use); /* Else : Out-of-design situation. Stuck in ALL regions. */
				/* Check one last time if all regions are stuck waiting for another process to resolve the
				 * multi-region TP transaction. If so, wait in a sleep loop. If not, we can proceed.
				rctl = rctl_start;
				start_hrtbt_cntr = heartbeat_counter;
					if (IS_FORCED_MULTI_PROC_EXIT(mp_hdr))
					{	/* We are at a logical point. So exit if signaled by parent */
						status = ERR_FORCEDHALT;
						goto finish;
					forw_multi = rctl->forw_multi;
					assert(NULL != forw_multi);
					sfm = forw_multi->shm_forw_multi;
					assert(NULL != sfm);
					assert(sfm->num_reg_seen_forward <= sfm->num_reg_seen_backward);
#					ifdef MUR_DEBUG
					fprintf(stderr, "Pid = %d : Line %d : token = %llu : forward = %d : backward = %d\n",
						process_id, __LINE__, (long long int)sfm->token,
						sfm->num_reg_seen_forward, sfm->num_reg_seen_backward);
#					endif
					if (sfm->num_reg_seen_forward == sfm->num_reg_seen_backward)
					{	/* We are no longer stuck in this region */
						forw_multi->no_longer_stuck = TRUE;
					rctl = rctl->next_rctl;	/* Move on to the next available region */
					assert(NULL != rctl);
					if (rctl == rctl_start)
					{	/* We went through all regions once and are still stuck.
						 * Sleep until at least TWO heartbeats have elapsed after which check for deadlock.
						 * Do this only in the child process that owns the FIRST region in the region list.
						 * This way we dont have contention for the GRAB_MULTI_PROC_LATCH from
						 * all children at more or less the same time.
						if ((rctl == mur_ctl) && (heartbeat_counter > (start_hrtbt_cntr + 2)))
						{	/* Check if all processes are stuck for a while. If so assertpro */
							shm_rctl_start = mur_shm_hdr->shm_rctl_start;
							num_reg_stuck = 0;
							for (i = 0; i < murgbl.reg_total; i++)
								shm_rctl = &shm_rctl_start[i];
								sfm = shm_rctl->shm_forw_multi;
								if (NULL != sfm)
									if (sfm->num_reg_seen_forward != sfm->num_reg_seen_backward)
							/* If everyone is stuck at this point, it is an out-of-design situation */
							assertpro(num_reg_stuck < murgbl.reg_total);
							start_hrtbt_cntr = heartbeat_counter;
						} else
						{	/* Sleep and recheck if any region we are stuck in got resolved.
							 * To minimize time spent sleeping, we just yield our timeslice.
				} while (TRUE);
			} else
				rctl = rctl->next_rctl;	/* Move on to the next available region */
				assert(NULL != rctl);
		regcnt_stuck = 0;	/* restart the counter now that we found at least one non-stuck region */
		jctl = rctl->jctl;
		this_reg_stuck = FALSE;
		for ( status = SS_NORMAL; SS_NORMAL == status; )
			if (multi_proc && IS_FORCED_MULTI_PROC_EXIT(mp_hdr))
			{	/* We are at a logical point. So exit if signaled by parent */
				status = ERR_FORCEDHALT;
				goto finish;
			assert(jctl == rctl->jctl);
			rec = rctl->mur_desc->jnlrec;
			rec_time = rec->prefix.time;
			if (rec_time > mur_options.before_time)
				break;	/* Records after -BEFORE_TIME do not go to extract or losttrans or brkntrans files */
			assert((rec_time >= adjusted_resolve_time) || (mur_options.notncheck && !mur_options.verify));
			assert((0 == mur_options.after_time) || (mur_options.forward && !rctl->db_updated));
			if (rec_time < mur_options.after_time)
				status = mur_next_rec(&jctl);
				continue; /* Records before -AFTER_TIME do not go to extract or losttrans or brkntrans files */
			/* Check if current journal record can be played right away or need to wait for corresponding journal
			 * records from other participating TP regions to be reached. A non-TP or ZTP transaction can be played
			 * without issues (i.e. has no dependencies with any other regions). A single-region TP transaction too
			 * falls in the same category. A multi-region TP transaction needs to wait until all participating regions
			 * have played all journal records BEFORE this TP in order to ensure recover plays records in the exact
			 * same order that GT.M performed them in.
			/* If FENCE_NONE is specified, we would not have maintained any multi hashtable in mur_back_process for
			 * broken transaction processing. So we process multi-region TP transactions as multiple single-region
			 * TP transactions in forward phase.
			if (FENCE_NONE != mur_options.fences)
				rectype = (enum jnl_record_type)rec->prefix.jrec_type;
				if (IS_TP(rectype) && IS_TUPD(rectype))
					assert(&rec->jrec_set_kill.num_participants == &rec->jrec_ztworm.num_participants);
					assert(&rec->jrec_set_kill.num_participants == &rec->jrec_lgtrig.num_participants);
					num_partners = rec->jrec_set_kill.num_participants;
					assert(0 < num_partners);
					if (1 < num_partners)
						this_reg_stuck = TRUE;
						assert(&rec->jrec_set_kill.update_num == &rec->jrec_ztworm.update_num);
						assert(&rec->jrec_set_kill.update_num == &rec->jrec_lgtrig.update_num);
			if (this_reg_stuck)
				rec_token_seq = GET_JNL_SEQNO(rec);
				MUR_FORW_TOKEN_LOOKUP(forw_multi, rec_token_seq, rec_time);
				if (NULL != forw_multi)
				{	/* This token has already been seen in another region in forward processing.
					 * Add current region as well. If all regions have been resolved, then play
					 * the entire transaction maintaining the exact same order of updates within.
					if (!forw_multi->no_longer_stuck)
						MUR_FORW_TOKEN_ONE_MORE_REG(forw_multi, rctl);
				} else
				{	/* First time we are seeing this token in forward processing. Check if this
					 * has already been determined to be a broken transaction.
					recstat = GOOD_TN;
					multi = NULL;
					if (IS_REC_POSSIBLY_BROKEN(rec_time, rec_token_seq))
						multi = MUR_TOKEN_LOOKUP(rec_token_seq, rec_time, TPFENCE);
						if ((NULL != multi) && (0 < multi->partner))
							recstat = BROKEN_TN;
					MUR_FORW_TOKEN_ADD(forw_multi, rec_token_seq, rec_time, rctl, num_partners,
								recstat, multi);
				/* Check that "tabent" field has been initialized above (by either the MUR_FORW_TOKEN_LOOKUP
				 * or MUR_FORW_TOKEN_ADD macros). This is relied upon by "mur_forward_play_multireg_tp" below.
				assert(NULL != forw_multi->u.tabent);
				assert(forw_multi->num_reg_seen_forward <= forw_multi->num_reg_seen_backward);
				if (multi_proc)
					sfm = forw_multi->shm_forw_multi;
					ok_to_play = (NULL == sfm) || (sfm->num_reg_seen_forward == sfm->num_reg_seen_backward);
				} else
					ok_to_play = (forw_multi->num_reg_seen_forward == forw_multi->num_reg_seen_backward);
				assert(ok_to_play || !forw_multi->no_longer_stuck);
				if (ok_to_play )
				{	/* We have enough information to proceed with playing this multi-region TP in
					 * forward processing (even if we might not have seen all needed regions). Now play it.
					 * Note that the TP could be BROKEN_TN or GOOD_TN. The callee handles it.
					assert(forw_multi == rctl->forw_multi);
					status = mur_forward_play_multireg_tp(forw_multi, rctl);
					this_reg_stuck = FALSE;
					/* Note that as part of playing the TP transaction, we could have reached
					 * the EOF of rctl. In this case, we need to break out of the loop.
					if ((SS_NORMAL != status) || rctl->forw_eof_seen)
					assert(NULL == rctl->forw_multi);
					jctl = rctl->jctl;	/* In case the first record after the most recently processed
								 * TP transaction is in the next generation journal file */
			} else
				status = mur_forward_play_cur_jrec(rctl);
				if (SS_NORMAL != status)
			status = mur_next_rec(&jctl);
		assert((NULL == rctl->forw_multi) || this_reg_stuck);
		assert((NULL != rctl->forw_multi) || !this_reg_stuck);
		if (!this_reg_stuck)
		{	/* We are not stuck in this region (to resolve a multi-region TP).
			 * This means we are done processing all the records of this region.
			assert(NULL == rctl->forw_multi);
			if (!rctl->forw_eof_seen)
				CHECK_IF_EOF_REACHED(rctl, status);
					/* sets rctl->forw_eof_seen if needed; resets "status" to SS_NORMAL */
				if (SS_NORMAL != status)
					assert(ERR_FILENOTCREATE == status);
					goto finish;
				DELETE_RCTL_FROM_UNPROCESSED_LIST(rctl); /* since all of its records should have been processed */
			} else
			{	/* EOF was seen in rctl inside "mur_forward_play_multireg_tp" and it was removed
				 * from the unprocessed list of rctls. At the time rctl was removed, its "next_rctl"
				 * field could have been pointing to another <rctl> that has since then also been
				 * removed inside the same function. Therefore the "next_rctl" field is not reliable
				 * in this case but instead we should rely on the global variable "rctl_start" which
				 * points to the list of unprocessed rctls. Set "next_rctl" accordingly.
				rctl->next_rctl = rctl_start;
				if (ERR_JNLREADEOF == status)
					status = SS_NORMAL;
		assert(SS_NORMAL == status);
		assert(!this_reg_stuck || !rctl->forw_eof_seen);
		assert((NULL == rctl->next_rctl) || (NULL != rctl_start));
		assert((NULL == rctl->next_rctl) || (0 < murgbl.regcnt_remaining));
		rctl = rctl->next_rctl;	/* Note : even though "rctl" could have been deleted from the doubly linked list above,
					 * rctl->next_rctl is not touched so we can still use it to get to the next element. */
	assert(0 == murgbl.regcnt_remaining);
	jgbl.mur_pini_addr_reset_fnptr = NULL;	/* No more simulation of GT.M activity for any region */
	prc_vec = murgbl.prc_vec;	/* Use process-vector of MUPIP RECOVER (not any simulating GT.M process) now onwards */
	assert(0 == dollar_tlevel);
	for (rctl = mur_ctl, rctl_top = mur_ctl + murgbl.reg_total; rctl < rctl_top; rctl++)
		if (!rctl->this_pid_is_owner)
			continue;	/* in a parallel processing environment, process only regions we own */
		if (multi_proc)
		{	/* Set key to print this rctl's region-name as prefix in case this forked off process prints any output */
			MUR_SET_MULTI_PROC_KEY(rctl, multi_proc_key);
		PRINT_VERBOSE_STAT(rctl->jctl, "mur_forward:at the end");
		assert(!mur_options.rollback || (0 != murgbl.consist_jnl_seqno));
		assert(mur_options.rollback || (0 == murgbl.consist_jnl_seqno));
		assert(!dollar_tlevel);	/* In case it applied a broken TUPD */
		assert(murgbl.ok_to_update_db || !rctl->db_updated);
		rctl->mur_plst = NULL;	/* reset now that simulation of GT.M updates is done */
		/* Ensure mur_block_count_correct is called if updates allowed */
		if (murgbl.ok_to_update_db && (SS_NORMAL != mur_block_count_correct(rctl)))
			gtm_putmsg_csa(CSA_ARG(rctl->csa) VARLSTCNT(4) ERR_BLKCNTEDITFAIL, 2, DB_LEN_STR(rctl->gd));
	if (multi_proc)
		multi_proc_key = NULL;	/* reset key until it can be set to rctl's region-name again */
	if ((SS_NORMAL == status) && mur_options.show)
	if (NULL != first_shm_rctl)
	{	/* Transfer needed process-private information to shared memory so parent process can later inherit this. */
		first_shm_rctl->err_cnt = murgbl.err_cnt;
		first_shm_rctl->wrn_count = murgbl.wrn_count;
		first_shm_rctl->consist_jnl_seqno = murgbl.consist_jnl_seqno;
		/* If extract files were created by this process for one or more regions, then copy that information to
		 * shared memory so parent process can use this information to do a merge sort.
		shm_rctl = mur_shm_hdr->shm_rctl_start;
		for (rctl = mur_ctl, rctl_top = mur_ctl + murgbl.reg_total; rctl < rctl_top; rctl++, shm_rctl++)
			if (!rctl->this_pid_is_owner)
				continue;	/* in a parallel processing environment, process only regions we own */
			/* Cancel any flush/dbsync timers by this child process for this region. This is because the
			 * child is not going to go through exit handling code (no gds_rundown etc.). And we need to
			 * clear up csa->nl->wcs_timers. (normally done by gds_rundown).
			if (NULL != rctl->csa)	/* rctl->csa can be NULL in case of "mupip journal -extract" etc. */
				CANCEL_DB_TIMERS(rctl->gd, rctl->csa, cancelled_timer, cancelled_dbsync_timer);
			reccnt = 0;
			for (size_ptr = &rctl->jnlext_multi_list_size[0], recstat = 0;
								recstat < TOT_EXTR_TYPES;
									recstat++, size_ptr++)
			{	/* Assert "extr_file_created" information is in sync between rctl and shm_rctl.
				 * This was done at the end of "mur_cre_file_extfmt".
				assert(shm_rctl->extr_file_created[recstat] == rctl->extr_file_created[recstat]);
				/* Assert that if *size_ptr is non-zero, then we better have created an extract file.
				 * Note that the converse is not true. It is possible we created a file for example to
				 * write an INCTN record but decided to not write anything because it was not a -detail
				 * type of extract. So *sizeptr could be 0 even though we created the extract file.
				assert(!*size_ptr || rctl->extr_file_created[recstat]);
				shm_rctl->jnlext_list_size[recstat] = *size_ptr;
				reccnt += *size_ptr;
			assert(INVALID_SHMID == shm_rctl->jnlext_shmid);
			shm_size = reccnt * SIZEOF(jnlext_multi_t);
			/* If we are quitting because of an abnormal status OR a forced signal to terminate
			 * OR if the parent is dead (kill -9) dont bother creating shmid to communicate back with parent.
			if (mp_hdr->parent_pid != getppid())
				SET_FORCED_MULTI_PROC_EXIT;	/* Also signal sibling children to stop processing */
				if (SS_NORMAL != status)
					status = ERR_FORCEDHALT;
			if ((SS_NORMAL == status) && shm_size)
				shmid = shmget(IPC_PRIVATE, shm_size, 0600 | IPC_CREAT);
				if (-1 == shmid)
					save_errno = errno;
					SNPRINTF(errstr, SIZEOF(errstr),
						"shmget() : shmsize=0x%llx", shm_size);
					MUR_SET_MULTI_PROC_KEY(rctl, multi_proc_key);	/* to print region name prefix */
					rts_error_csa(CSA_ARG(NULL) VARLSTCNT(8)
								ERR_SYSCALL, 5, LEN_AND_STR(errstr), CALLFROM, save_errno);
				shmPtr = (char *)do_shmat(shmid, 0, 0);
				if (-1 == (sm_long_t)shmPtr)
					save_errno = errno;
					SNPRINTF(errstr, SIZEOF(errstr),
						"shmat() : shmid=%d shmsize=0x%llx", shmid, shm_size);
					MUR_SET_MULTI_PROC_KEY(rctl, multi_proc_key);	/* to print region name prefix */
					rts_error_csa(CSA_ARG(NULL) VARLSTCNT(8)
								ERR_SYSCALL, 5, LEN_AND_STR(errstr), CALLFROM, save_errno);
				shm_rctl->jnlext_shmid = shmid;
				shm_rctl->jnlext_shm_size = shm_size;
				for (size_ptr = &rctl->jnlext_multi_list_size[0], recstat = 0;
									recstat < TOT_EXTR_TYPES;
										recstat++, size_ptr++)
					shm_size = *size_ptr;
					if (shm_size)
						copy_size = copy_list_to_buf(rctl->jnlext_multi_list[recstat],
												(int4)shm_size, shmPtr);
						assert(copy_size == (shm_size * SIZEOF(jnlext_multi_t)));
						shmPtr += copy_size;
	mur_close_file_extfmt(IN_MUR_CLOSE_FILES_FALSE);	/* Need to flush buffered extract/losttrans/brokentrans files */
	return (int)status;
uint4	mur_forward_play_cur_jrec(reg_ctl_list *rctl)
	boolean_t		process_losttn;
	boolean_t		is_set_kill_zkill_ztworm_lgtrig_ztrig, is_set_kill_zkill_ztrig;
	trans_num		curr_tn;
	enum jnl_record_type	rectype;
	enum rec_fence_type	rec_fence;
	enum broken_type	recstat;
	jnl_tm_t		rec_time;
	uint4			status;
	mval			mv;
	seq_num 		rec_token_seq, rec_strm_seqno, resync_strm_seqno;
	jnl_record		*rec;
	jnl_string		*keystr;
	multi_struct 		*multi;
	jnl_ctl_list		*jctl;
	ht_ent_mname		*tabent;
	mname_entry	 	gvent;
	gvnh_reg_t		*gvnh_reg;
	pini_list_struct	*plst;
	int4			gtmcrypt_errno;
	boolean_t		use_new_key;
	forw_multi_struct	*forw_multi;
#	if (defined(DEBUG) && defined(UNIX))
	int4			strm_idx;
#	endif

	if (multi_proc_in_use)
	{	/* Set key to print this rctl's region-name as prefix in case this forked off process prints any output.
		 * e.g. If this function ends up calling t_end/op_tcommit which in turn needs to do a jnl autoswitch
		 * inside jnl_file_extend and prints a GTM-I-FILERENAME message.
		MUR_SET_MULTI_PROC_KEY(rctl, multi_proc_key);
	jctl = rctl->jctl;
	/* Ensure we never DOUBLE process the same journal record in the forward phase */
	assert((jctl != rctl->last_processed_jctl) || (jctl->rec_offset != rctl->last_processed_rec_offset));
#	ifdef DEBUG
	rctl->last_processed_jctl = jctl;
	rctl->last_processed_rec_offset = jctl->rec_offset;
#	endif
	rec = rctl->mur_desc->jnlrec;
	rectype = (enum jnl_record_type)rec->prefix.jrec_type;
	rec_time = rec->prefix.time;
	assert(rec_time <= mur_options.before_time);
	assert(rec_time >= mur_options.after_time);
	assert((0 == mur_options.after_time) || (mur_options.forward && !rctl->db_updated));
	is_set_kill_zkill_ztworm_lgtrig_ztrig = (boolean_t)(IS_SET_KILL_ZKILL_ZTWORM_LGTRIG_ZTRIG(rectype));
	if (is_set_kill_zkill_ztworm_lgtrig_ztrig)
		keystr = (jnl_string *)&rec->jrec_set_kill.mumps_node;
		if (USES_ANY_KEY(jctl->jfh))
			use_new_key = USES_NEW_KEY(jctl->jfh);
			assert(NEEDS_NEW_KEY(jctl->jfh, rec->prefix.tn) == use_new_key);
					(use_new_key ? TRUE : jctl->jfh->non_null_iv),
					(use_new_key ? jctl->encr_key_handle2 : jctl->encr_key_handle),
			if (0 != gtmcrypt_errno)
				GTMCRYPT_REPORT_ERROR(gtmcrypt_errno, gtm_putmsg, jctl->jnl_fn_len, jctl->jnl_fn);
				return gtmcrypt_errno;
	if (mur_options.selection && !mur_select_rec(jctl))
		return SS_NORMAL;
	rec_token_seq = (REC_HAS_TOKEN_SEQ(rectype)) ? GET_JNL_SEQNO(rec) : 0;
	process_losttn = rctl->process_losttn;
	if (!process_losttn && mur_options.rollback)
		if (IS_REPLICATED(rectype) && (rec_token_seq >= murgbl.losttn_seqno))
			process_losttn = rctl->process_losttn = TRUE;
#		if (defined(UNIX) && defined(DEBUG))
		if ((rec_token_seq < murgbl.losttn_seqno) && murgbl.resync_strm_seqno_nonzero && IS_REPLICATED(rectype))
			assert(IS_SET_KILL_ZKILL_ZTWORM_LGTRIG_ZTRIG(rectype) || IS_COM(rectype) || (JRT_NULL == (rectype)));
			assert(&rec->jrec_set_kill.strm_seqno == &rec->jrec_null.strm_seqno);
			assert(&rec->jrec_set_kill.strm_seqno == &rec->jrec_tcom.strm_seqno);
			rec_strm_seqno = GET_STRM_SEQNO(rec);
			if (rec_strm_seqno)
				strm_idx = GET_STRM_INDEX(rec_strm_seqno);
				rec_strm_seqno = GET_STRM_SEQ60(rec_strm_seqno);
				resync_strm_seqno = murgbl.resync_strm_seqno[strm_idx];
				assert(!resync_strm_seqno || (rec_strm_seqno < resync_strm_seqno));
#		endif
	/* Note: Broken transaction determination is done below only based on the records that got selected as
	 * part of the mur_options.selection criteria. Therefore depending on whether a broken transaction gets
	 * selected or not, future complete transactions might either go to the lost transaction or extract file.
	recstat = process_losttn ? LOST_TN : GOOD_TN;
	status = SS_NORMAL;
	if (FENCE_NONE != mur_options.fences)
		if (IS_FENCED(rectype))
#			ifdef DEBUG
			/* assert that all TP records before min_broken_time are not broken */
			if (IS_TP(rectype) && ((!mur_options.rollback && rec_time < murgbl.min_broken_time)
						|| (mur_options.rollback && rec_token_seq < murgbl.min_broken_seqno)))
				rec_fence = GET_REC_FENCE_TYPE(rectype);
				if (NULL != (multi = MUR_TOKEN_LOOKUP(rec_token_seq, rec_time, rec_fence)))
					assert(0 == multi->partner);
					assert(FALSE == multi->this_is_broken);
#			endif
			/* In most cases, the fact whether a TP tn is broken or not would have been determined already in
			 * mur_forward. In this case, rctl->forw_multi would be set appropriately. So use that to get to
			 * "multi" and avoid a hashtable lookup. If forw_multi is NULL (e.g. for ZTP or single-region TP),
			 * the hash-table lookup cannot be avoided.
			multi = NULL;
			forw_multi = rctl->forw_multi;
			if (NULL != forw_multi)
				multi = forw_multi->multi;
				/* Always honor the "recstat" from the forw_multi since that has been determined taking into
				 * consideration the BROKEN_TN status of ALL participating regions.
				assert((GOOD_TN != forw_multi->recstat) || (GOOD_TN == recstat));
				recstat = forw_multi->recstat;
			} else if (IS_REC_POSSIBLY_BROKEN(rec_time, rec_token_seq))
				assert(!mur_options.rollback || process_losttn);
				rec_fence = GET_REC_FENCE_TYPE(rectype);
				assert(rec_token_seq == ((struct_jrec_upd *)rec)->token_seq.token);
				multi = MUR_TOKEN_LOOKUP(rec_token_seq, rec_time, rec_fence);
				if ((NULL != multi) && (0 < multi->partner))
					process_losttn = rctl->process_losttn = TRUE;
					recstat = BROKEN_TN;
			/* Check that if the hashtable reports a tn as GOOD, it better have had the same
			 * # of participants in the TCOM records across all the participating regions.
			assert((NULL == multi) || (BROKEN_TN == recstat) || (FALSE == multi->this_is_broken));
		} else if ((FENCE_ALWAYS == mur_options.fences) && is_set_kill_zkill_ztworm_lgtrig_ztrig)
			process_losttn = rctl->process_losttn = TRUE;
			recstat = BROKEN_TN;
	} else
		forw_multi = NULL;
	if (mur_options.show)
		assert(SS_NORMAL == status);
		if (BROKEN_TN != recstat)
			if (JRT_PFIN == rectype)
				status = mur_pini_state(jctl, rec->prefix.pini_addr, FINISHED_PROC);
			else if ((JRT_EOF != rectype)
					&& ((JRT_ALIGN != rectype) || (JNL_HDR_LEN != rec->prefix.pini_addr)))
			{	/* Note that it is possible that we have a PINI record followed by a PFIN record
				 * and later an ALIGN record with the pini_addr pointing to the original PINI
				 * record (see comment in jnl_write.c where pini_addr gets assigned to JNL_HDR_LEN)
				 * In this case we do not want the ALIGN record to cause the process to become
				 * ACTIVE although it has written a PFIN record. Hence the check above.
				status = mur_pini_state(jctl, rec->prefix.pini_addr, ACTIVE_PROC);
		} else
			status = mur_pini_state(jctl, rec->prefix.pini_addr, BROKEN_PROC);
		if (SS_NORMAL != status)
			return status;	/* "mur_pini_state" failed due to bad pini_addr */
		++jctl->jnlrec_cnt[rectype];	/* for -show=STATISTICS */
	if (!mur_options.update && !jgbl.mur_extract)
		return SS_NORMAL;
	if (murgbl.ok_to_update_db && IS_TUPD(rectype) && (GOOD_TN == recstat))
	{	/* Even for FENCE_NONE we apply fences. Otherwise a TUPD becomes UPD etc.
		 * If forw_multi is non-NULL, a multi-region TP transaction is being played as a SINGLE
		 * TP transaction across all the involved regions. Therefore only ONE op_tstart is done
		 * even though more than one TSET might be encountered. In this case, do not issue JNLTPNEST error.
		if (dollar_tlevel && (NULL == forw_multi))
			gtm_putmsg_csa(CSA_ARG(rctl->csa) VARLSTCNT(6) ERR_JNLTPNEST, 4, jctl->jnl_fn_len,
				jctl->jnl_fn, jctl->rec_offset, &rec->prefix.tn);
		if (!dollar_tlevel)
		{	/* Note: op_tstart resets gv_currkey. So set gv_currkey later. */
			/* mv is used to determine transaction id. But it is ignored by recover/rollback */
			mv.mvtype = MV_STR;
			mv.str.len = 0;
			mv.str.addr = NULL;
			op_tstart(IMPLICIT_TSTART, TRUE, &mv, -1);
			DEBUG_ONLY(jgbl.max_tp_ztp_jnl_upd_num = 0;)
		tp_set_sgm();	/* needed to set "sgm_info_ptr" to correspond to "rctl" */
Example #4
/* This routine formats and outputs journal extract records
 * corresponding to M SET, KILL, ZKILL, TSTART, ZTSTART, and ZTRIGGER commands, $ZTRIGGER function (LGTRIG) and $ZTWORMHOLE.
 *
 * Parameters:
 *	jctl - journal control structure; in the lines visible here only jnl_fn_len, jnl_fn and rec_offset are read,
 *	       as arguments of the "record too long" report.
 *	fi   - passed unchanged to "jnlext_write" together with the formatted text.
 *	rec  - journal record to format; rec->prefix.jrec_type selects the record type.
 *	plst - not referenced in the lines visible here.
 *
 * The output text is assembled in murgbl.extr_buff, with "extract_len" tracking how many bytes are in use,
 * and is handed to "jnlext_write" once a line is complete.
 *
 * NOTE(review): this listing appears to have lost its brace-only lines and the bodies of several conditionals.
 * The comments below describe only what the visible statements do.
 */
void	mur_extract_set(jnl_ctl_list *jctl, fi_type *fi, jnl_record *rec, pini_list_struct *plst)
	enum jnl_record_type	rectype;
	int			max_blen, actual, extract_len, val_extr_len, val_len;
	char			*val_ptr, *ptr, *buff;
	jnl_string		*keystr;
	boolean_t		do_format2zwr, is_ztstart;

	/* Non-detail output starts at the beginning of the extract buffer.
	 * NOTE(review): in detail mode "extract_len" is not assigned in the visible lines before it is first used
	 * below - confirm that it is initialized by code missing from this listing.
	 */
	if (!mur_options.detail)
		extract_len = 0;
	rectype = (enum jnl_record_type)rec->prefix.jrec_type;
	/* For FUPD/TUPD record types a TSTART or ZTSTART line is produced ahead of the update itself */
	if (IS_FUPD_TUPD(rectype))
		if (!mur_options.detail)
			/* Non-detail: use the muext_code entry for TSTART/ZTSTART */
			if (IS_TUPD(rectype))
				EXT2BYTES(&muext_code[MUEXT_TSTART][0]); /* TSTART */
				is_ztstart = FALSE;
			} else /* if (IS_FUPD(rectype)) */
				EXT2BYTES(&muext_code[MUEXT_ZTSTART][0]); /* ZTSTART */
				is_ztstart = TRUE;
		} else
			/* Detail: append a textual label (terminated by a backslash) at the current offset */
			if (IS_TUPD(rectype))
				strcpy(murgbl.extr_buff + extract_len, "TSTART \\");
				is_ztstart = FALSE;
			} else /* if (IS_FUPD(rectype)) */
				strcpy(murgbl.extr_buff + extract_len, "ZTSTART\\");
				is_ztstart = TRUE;
			/* strcpy left the buffer null-terminated, so the new length can be taken from the string */
			extract_len = STRLEN(murgbl.extr_buff);
		/* NOTE(review): the two conditionals below have no visible bodies in this listing */
		if (mur_options.detail)
		if (!is_ztstart)
		/* Write out the TSTART/ZTSTART text assembled above */
		jnlext_write(fi, murgbl.extr_buff, extract_len);
	/* Output the SET or KILL or ZKILL or ZTWORMHOLE or LGTRIG or ZTRIG record */
	if (!mur_options.detail)
		/* Non-detail: restart at offset 0 and dispatch on record type (branch bodies are not visible here) */
		extract_len = 0;
		if (IS_SET(rectype))
		} else if (IS_KILL(rectype))
		} else if (IS_ZKILL(rectype))
		} else if (IS_ZTWORM(rectype))
		} else if (IS_LGTRIG(rectype))
		} else if (IS_ZTRIG(rectype))
		} else
			assert(FALSE);	/* The assert will disappear in pro but not the ";" to properly terminate the else */
	} else
		/* Detail: FUPD/TUPD records get 23 blanks at the start of the buffer;
		 * all other record types continue after whatever null-terminated text the buffer already holds.
		 */
		if (IS_FUPD_TUPD(rectype))
			memcpy(murgbl.extr_buff, "                       ", 23);
			extract_len = 23;
		} else
			extract_len = STRLEN(murgbl.extr_buff);
		/* Lay down a blank field followed by a backslash, then overwrite its first LAB_LEN bytes with the
		 * label for this record type and follow the label with LAB_TERM.
		 */
		strcpy(murgbl.extr_buff + extract_len, "       \\");
		memcpy(murgbl.extr_buff + extract_len, jrt_label[rectype], LAB_LEN);
		extract_len += LAB_LEN;
		memcpy(murgbl.extr_buff + extract_len, LAB_TERM, LAB_TERM_SZ);
		extract_len += LAB_TERM_SZ;
	/* NOTE(review): the bodies of the next two conditionals are not visible in this listing */
	if (mur_options.detail)
	if (IS_ZTP(rectype))
	} else
	/* strm_seqno and update_num must sit at the same address in the set_kill, ztworm and lgtrig record layouts */
	assert(&rec->jrec_set_kill.strm_seqno == &rec->jrec_ztworm.strm_seqno);
	assert(&rec->jrec_set_kill.strm_seqno == &rec->jrec_lgtrig.strm_seqno);
	assert(&rec->jrec_set_kill.update_num == &rec->jrec_ztworm.update_num);
	assert(&rec->jrec_set_kill.update_num == &rec->jrec_lgtrig.update_num);
	/* do_format2zwr is set only for record types that carry a value to be appended in ZWR format */
	do_format2zwr = FALSE;
	if (IS_SET_KILL_ZKILL_ZTRIG(rectype))
		/* Records that carry a global node: append the formatted key at the current offset.
		 * NOTE(review): the key is formatted from gv_currkey, not from "keystr"; the step that loads
		 * gv_currkey is not visible here - confirm it is set before this point.
		 */
		keystr = (jnl_string *)&rec->jrec_set_kill.mumps_node;
		buff = &murgbl.extr_buff[extract_len];
		max_blen = MIN(MAX_ZWR_KEY_SZ, murgbl.max_extr_record_length - extract_len);
		assert(MAX_ZWR_KEY_SZ == max_blen);	/* We allocated enough for key and data expansion for ZWR format */
		ptr = (char *)format_targ_key((uchar_ptr_t)buff, max_blen, gv_currkey, TRUE);
		assert(NULL != ptr);
		/* Advance by the distance between the returned pointer and the start of the key text,
		 * but only when a non-NULL pointer came back.
		 */
		if (NULL != ptr)
			extract_len += (int)(ptr - &murgbl.extr_buff[extract_len]);
		if (IS_SET(rectype))
			/* SET: emit '=' and locate the value, which is stored right after the key text as an
			 * mstr_len_t length followed by the value bytes.
			 */
			murgbl.extr_buff[extract_len++] = '=';
			val_ptr = &keystr->text[keystr->length];
			GET_MSTR_LEN(val_len, val_ptr);
			val_ptr += SIZEOF(mstr_len_t);
			do_format2zwr = TRUE;
	} else if (IS_ZTWORM(rectype) || IS_LGTRIG(rectype))
		/* ZTWORM/LGTRIG: no key; the record's string itself is the value (same address in both layouts) */
		assert(&rec->jrec_ztworm.ztworm_str == &rec->jrec_lgtrig.lgtrig_str);
		keystr = (jnl_string *)&rec->jrec_ztworm.ztworm_str;
		val_len = keystr->length;
		val_ptr = &keystr->text[0];
		do_format2zwr = TRUE;
	if (do_format2zwr)
		/* Convert the value to ZWR format only if ZWR_EXP_RATIO(val_len) fits in what is left of the buffer */
		if (ZWR_EXP_RATIO(val_len) <= murgbl.max_extr_record_length - extract_len)
			ptr = &murgbl.extr_buff[extract_len];
			format2zwr((sm_uc_ptr_t)val_ptr, val_len, (unsigned char *)ptr, &val_extr_len);
			extract_len += val_extr_len;
		} else
			/* Value does not fit: report it with the journal file name and record offset.
			 * NOTE(review): the head of the call that owns the next two argument lines is not
			 * visible in this listing.
			 */
				3, jctl->jnl_fn_len, jctl->jnl_fn, jctl->rec_offset,
				ERR_TEXT, 2, LEN_AND_LIT("Length of the record is too high for zwr format"));
			/* With verbose or detail set, also report the required and the available sizes */
			if (mur_options.verbose || mur_options.detail)
				gtm_putmsg_csa(CSA_ARG(NULL) VARLSTCNT(6) ERR_MUINFOUINT4, 4,
					LEN_AND_LIT("After max expansion record length"),
					ZWR_EXP_RATIO(val_len), ZWR_EXP_RATIO(val_len));
				gtm_putmsg_csa(CSA_ARG(NULL) VARLSTCNT(6) ERR_MUINFOUINT4, 4, LEN_AND_LIT("Buffer size"),
					murgbl.max_extr_record_length - extract_len,
					murgbl.max_extr_record_length - extract_len);
	/* Terminate the record text with a backslash and write it out */
	murgbl.extr_buff[extract_len++] = '\\';
	jnlext_write(fi, murgbl.extr_buff, extract_len);