コード例 #1
0
ファイル: mur_forward.c プロジェクト: mihawk/fis-gtm
int	mur_forward_multi_proc(reg_ctl_list *rctl)
{
	boolean_t		multi_proc, this_reg_stuck, release_latch, ok_to_play;
	boolean_t		cancelled_dbsync_timer, cancelled_timer;
	reg_ctl_list		*rctl_top, *prev_rctl;
	jnl_ctl_list		*jctl;
	gd_region		*reg;
	sgmnt_addrs		*csa;
	seq_num 		rec_token_seq;
	jnl_tm_t		rec_time;
	enum broken_type	recstat;
	jnl_record		*rec;
	enum jnl_record_type	rectype;
	char			errstr[256];
	int			i, rctl_index, save_errno, num_procs_stuck, num_reg_stuck;
	uint4			status, regcnt_stuck, num_partners, start_hrtbt_cntr;
	forw_multi_struct	*forw_multi;
	shm_forw_multi_t	*sfm;
	multi_struct 		*multi;
	jnl_tm_t		adjusted_resolve_time;
	shm_reg_ctl_t		*shm_rctl_start, *shm_rctl, *first_shm_rctl;
	size_t			shm_size, reccnt, copy_size;
	int4			*size_ptr;
	char			*shmPtr; /* not using "shm_ptr" since it is already used in an AIX include file */
	int			shmid;
	multi_proc_shm_hdr_t	*mp_hdr;	/* Pointer to "multi_proc_shm_hdr_t" structure in shared memory */

	status = 0;
	/* Although we made sure the # of tasks is the same as the # of processes forked off (in the "gtm_multi_proc"
	 * invocation in "mur_forward"), it is possible one of the forked process finishes one invocation of
	 * "mur_forward_multi_proc" before even another forked process gets assigned one task in "gtm_multi_proc_helper".
	 * In this case, we would be invoked more than once. But the first invocation would have done all the needed stuff
	 * so return for later invocations.
	 */
	if (mur_forward_multi_proc_done)
		return 0;
	mur_forward_multi_proc_done = TRUE;
	/* Note: "rctl" is unused. But cannot avoid passing it since "gtm_multi_proc" expects something */
	prev_rctl = NULL;
	rctl_start = NULL;
	adjusted_resolve_time = murgbl.adjusted_resolve_time;
	assert(0 == murgbl.regcnt_remaining);
	multi_proc = multi_proc_in_use;	/* cache value in "local" to speed up access inside loops below */
	if (multi_proc)
	{
		mp_hdr = multi_proc_shm_hdr;
		shm_rctl_start = mur_shm_hdr->shm_rctl_start;
		if (jgbl.onlnrlbk)
		{
			for (rctl = mur_ctl, rctl_top = mur_ctl + murgbl.reg_total; rctl < rctl_top; rctl++)
			{
				assert(rctl->csa->hold_onto_crit);	/* would have been set in parent process */
				rctl->csa->hold_onto_crit = FALSE;	/* reset since we dont own this region */
				assert(rctl->csa->now_crit);		/* would have been set in parent process */
				rctl->csa->now_crit = FALSE;		/* reset since we dont own this region */
			}
		}
		START_HEARTBEAT_IF_NEEDED; /* heartbeat timer needed later (in case not already started by "gtm_multi_proc") */
	}
	first_shm_rctl = NULL;
	/* Phase1 of forward recovery starts */
	for (rctl = mur_ctl, rctl_top = mur_ctl + murgbl.reg_total; rctl < rctl_top; rctl++)
	{
		/* Check if "rctl" is available for us or if some other concurrent process has taken it */
		if (multi_proc)
		{
			rctl_index = rctl - &mur_ctl[0];
			shm_rctl = &shm_rctl_start[rctl_index];
			if (shm_rctl->owning_pid)
			{
				assert(process_id != shm_rctl->owning_pid);
				continue;
			}
			GRAB_MULTI_PROC_LATCH_IF_NEEDED(release_latch);
			assert(release_latch);
			for ( ; rctl < rctl_top; rctl++, shm_rctl++)
			{
				if (shm_rctl->owning_pid)
				{
					assert(process_id != shm_rctl->owning_pid);
					continue;
				}
				shm_rctl->owning_pid = process_id;	/* Declare ownership */
				rctl->this_pid_is_owner = TRUE;
				if (jgbl.onlnrlbk)
				{	/* This is an online rollback and crit was grabbed on all regions by the parent rollback
					 * process. But this child process now owns this region and does the actual rollback on
					 * this region so borrow crit for the duration of this child process.
					 */
					csa = rctl->csa;
					csa->hold_onto_crit = TRUE;
					csa->now_crit = TRUE;
					assert(csa->nl->in_crit == mp_hdr->parent_pid);
					csa->nl->in_crit = process_id;
					assert(csa->nl->onln_rlbk_pid == mp_hdr->parent_pid);
					csa->nl->onln_rlbk_pid = process_id;
				}
				if (NULL == first_shm_rctl)
					first_shm_rctl = shm_rctl;
				break;
			}
			REL_MULTI_PROC_LATCH_IF_NEEDED(release_latch);
			if (rctl >= rctl_top)
			{
				assert(rctl == rctl_top);
				break;
			}
			/* Set key to print this rctl'ss region-name as prefix in case this forked off process prints any output */
			MUR_SET_MULTI_PROC_KEY(rctl, multi_proc_key);
#			ifdef MUR_DEBUG
			fprintf(stderr, "pid = %d : Owns region %s\n", process_id, multi_proc_key);
#			endif
		} else
			rctl->this_pid_is_owner = TRUE;
		if (mur_options.forward)
		{
			assert(NULL == rctl->jctl_turn_around);
			jctl = rctl->jctl = rctl->jctl_head;
			assert(jctl->reg_ctl == rctl);
			jctl->rec_offset = JNL_HDR_LEN;
			jnl_fence_ctl.fence_list = JNL_FENCE_LIST_END; /* initialized to reflect journaling is not enabled */
			if (mur_options.rollback)
				jgbl.mur_jrec_seqno = jctl->jfh->start_seqno;
		} else
		{
			jctl = rctl->jctl = (NULL == rctl->jctl_turn_around) ? rctl->jctl_head : rctl->jctl_turn_around;
			assert(jctl->reg_ctl == rctl);
			jctl->rec_offset = jctl->turn_around_offset;
			jgbl.mur_jrec_seqno = jctl->turn_around_seqno;
			assert((NULL != rctl->jctl_turn_around) || (0 == jctl->rec_offset));
		}
		if (mur_options.rollback)
		{
			if (murgbl.consist_jnl_seqno < jgbl.mur_jrec_seqno)
			{
				/* Assert that murgbl.losttn_seqno is never lesser than jgbl.mur_jrec_seqno (the turnaround
				 * point seqno) as this is what murgbl.consist_jnl_seqno is going to be set to and will
				 * eventually be the post-rollback seqno. If this condition is violated, the result of the
				 * recovery is a compromised database (the file header will indicate a Region Seqno which
				 * is not necessarily correct since seqnos prior to it might be absent in the database).
				 * Therefore, this is an out-of-design situation with respect to rollback and so stop it.
				 */
				assert(murgbl.losttn_seqno >= jgbl.mur_jrec_seqno);
				murgbl.consist_jnl_seqno = jgbl.mur_jrec_seqno;
			}
			assert(murgbl.consist_jnl_seqno <= murgbl.losttn_seqno);
		}
		if (mur_options.update || mur_options.extr[GOOD_TN])
		{
			reg = rctl->gd;
			gv_cur_region = reg;
			tp_change_reg();	/* note : sets cs_addrs to non-NULL value even if gv_cur_region->open is FALSE
						 * (cs_data could still be NULL). */
			rctl->csa = cs_addrs;
			cs_addrs->miscptr = (void *)rctl;
			rctl->csd = cs_data;
			rctl->sgm_info_ptr = cs_addrs->sgm_info_ptr;
			assert(!reg->open || (NULL != cs_addrs->dir_tree));
			gv_target = cs_addrs->dir_tree;
		}
		jctl->after_end_of_data = FALSE;
		status = mur_next(jctl, jctl->rec_offset);
		assert(ERR_JNLREADEOF != status);	/* cannot get EOF at start of forward processing */
		if (SS_NORMAL != status)
			goto finish;
		PRINT_VERBOSE_STAT(jctl, "mur_forward:at the start");
		rctl->process_losttn = FALSE;
		/* Any multi-region TP transaction will be processed as multiple single-region TP transactions up
		 * until the tp-resolve-time is reached. From then on, they will be treated as one multi-region TP
		 * transaction. This is needed for proper lost-tn determination (any multi-region transaction that
		 * gets played in a region AFTER it has already encountered a broken tn should treat this as a lost tn).
		 */
		do
		{
			if (multi_proc && IS_FORCED_MULTI_PROC_EXIT(mp_hdr))
			{	/* We are at a logical point. So exit if signaled by parent */
				status = ERR_FORCEDHALT;
				goto finish;
			}
			assert(jctl == rctl->jctl);
			rec = rctl->mur_desc->jnlrec;
			rec_time = rec->prefix.time;
			if (rec_time > mur_options.before_time)
				break;	/* Records after -BEFORE_TIME do not go to extract or losttrans or brkntrans files */
			if (rec_time < mur_options.after_time)
			{
				status = mur_next_rec(&jctl);
				continue; /* Records before -AFTER_TIME do not go to extract or losttrans or brkntrans files */
			}
			if (rec_time >= adjusted_resolve_time)
				break;	/* Records after this adjusted resolve_time will be processed below in phase2 */
			/* Note: Since we do hashtable token processing only for records from tp_resolve_time onwards,
			 * it is possible that if we encounter any broken transactions here we wont know they are broken
			 * but will play them as is. That is unavoidable. Specify -SINCE_TIME (for -BACKWARD rollback/recover)
			 * and -VERIFY (for -FORWARD rollback/recover) to control tp_resolve_time (and in turn more
			 * effective broken tn determination).
			 */
			status = mur_forward_play_cur_jrec(rctl);
			if (SS_NORMAL != status)
				break;
			status = mur_next_rec(&jctl);
		} while (SS_NORMAL == status);
		CHECK_IF_EOF_REACHED(rctl, status); /* sets rctl->forw_eof_seen if needed; resets "status" to SS_NORMAL */
		if (SS_NORMAL != status)
		{	/* ERR_FILENOTCREATE is possible from "mur_cre_file_extfmt" OR	ERR_FORCEDHALT is possible
			 * from "mur_forward_play_cur_jrec". No other errors are known to occur here. Assert accordingly.
			 */
			assert((ERR_FILENOTCREATE == status) || (ERR_FORCEDHALT == status));
			goto finish;
		}
		if (rctl->forw_eof_seen)
		{
			PRINT_VERBOSE_STAT(jctl, "mur_forward:Reached EOF before tp_resolve_time");
			continue;	/* Reached EOF before even getting to tp_resolve_time.
					 * Do not even consider region for next processing loop */
		}
		rctl->last_tn = 0;
		murgbl.regcnt_remaining++;	/* # of regions participating in recovery at this point */
		if (NULL == rctl_start)
			rctl_start = rctl;
		if (NULL != prev_rctl)
		{
			prev_rctl->next_rctl = rctl;
			rctl->prev_rctl = prev_rctl;
		}
		prev_rctl = rctl;
		assert(murgbl.ok_to_update_db || !rctl->db_updated);
		PRINT_VERBOSE_STAT(jctl, "mur_forward:at tp_resolve_time");
	}
	if (multi_proc)
		multi_proc_key = NULL;	/* reset key until it can be set to rctl's region-name again */
	/* Note that it is possible for rctl_start to be NULL at this point. That is there is no journal record in any region
	 * AFTER the calculated tp-resolve-time. This is possible if for example -AFTER_TIME was used and has a time later
	 * than any journal record in all journal files. If rctl_start is NULL, prev_rctl should also be NULL and vice versa.
	 */
	if (NULL != rctl_start)
	{
		assert(NULL != prev_rctl);
		prev_rctl->next_rctl = rctl_start;
		rctl_start->prev_rctl = prev_rctl;
	}
	rctl = rctl_start;
	regcnt_stuck = 0; /* # of regions we are stuck in waiting for other regions to resolve a multi-region TP transaction */
	assert((NULL == rctl) || (NULL == rctl->forw_multi));
	gv_cur_region = NULL;	/* clear out any previous value to ensure gv_cur_region/cs_addrs/cs_data
				 * all get set in sync by the MUR_CHANGE_REG macro below.
				 */
	/* Phase2 of forward recovery starts */
	while (NULL != rctl)
	{	/* while there is at least one region remaining with unprocessed journal records */
		assert(NULL != rctl_start);
		assert(0 < murgbl.regcnt_remaining);
		if (NULL != rctl->forw_multi)
		{	/* This region's current journal record is part of a TP transaction waiting for other regions */
			regcnt_stuck++;
			assert(regcnt_stuck <= murgbl.regcnt_remaining);
			if (regcnt_stuck == murgbl.regcnt_remaining)
			{
				assertpro(multi_proc_in_use); /* Else : Out-of-design situation. Stuck in ALL regions. */
				/* Check one last time if all regions are stuck waiting for another process to resolve the
				 * multi-region TP transaction. If so, wait in a sleep loop. If not, we can proceed.
				 */
				rctl = rctl_start;
				start_hrtbt_cntr = heartbeat_counter;
				do
				{
					if (IS_FORCED_MULTI_PROC_EXIT(mp_hdr))
					{	/* We are at a logical point. So exit if signaled by parent */
						status = ERR_FORCEDHALT;
						goto finish;
					}
					forw_multi = rctl->forw_multi;
					assert(NULL != forw_multi);
					sfm = forw_multi->shm_forw_multi;
					assert(NULL != sfm);
					assert(sfm->num_reg_seen_forward <= sfm->num_reg_seen_backward);
#					ifdef MUR_DEBUG
					fprintf(stderr, "Pid = %d : Line %d : token = %llu : forward = %d : backward = %d\n",
						process_id, __LINE__, (long long int)sfm->token,
						sfm->num_reg_seen_forward, sfm->num_reg_seen_backward);
#					endif
					if (sfm->num_reg_seen_forward == sfm->num_reg_seen_backward)
					{	/* We are no longer stuck in this region */
						assert(!forw_multi->no_longer_stuck);
						forw_multi->no_longer_stuck = TRUE;
						break;
					}
					rctl = rctl->next_rctl;	/* Move on to the next available region */
					assert(NULL != rctl);
					if (rctl == rctl_start)
					{	/* We went through all regions once and are still stuck.
						 * Sleep until at leat TWO heartbeats have elapsed after which check for deadlock.
						 * Do this only in the child process that owns the FIRST region in the region list.
						 * This way we dont have contention for the GRAB_MULTI_PROC_LATCH from
						 * all children at more or less the same time.
						 */
						if ((rctl == mur_ctl) && (heartbeat_counter > (start_hrtbt_cntr + 2)))
						{	/* Check if all processes are stuck for a while. If so assertpro */
							GRAB_MULTI_PROC_LATCH_IF_NEEDED(release_latch);
							assert(release_latch);
							shm_rctl_start = mur_shm_hdr->shm_rctl_start;
							num_reg_stuck = 0;
							for (i = 0; i < murgbl.reg_total; i++)
							{
								shm_rctl = &shm_rctl_start[i];
								sfm = shm_rctl->shm_forw_multi;
								if (NULL != sfm)
								{
									if (sfm->num_reg_seen_forward != sfm->num_reg_seen_backward)
										num_reg_stuck++;
								}
							}
							REL_MULTI_PROC_LATCH_IF_NEEDED(release_latch);
							/* If everyone is stuck at this point, it is an out-of-design situation */
							assertpro(num_reg_stuck < murgbl.reg_total);
							start_hrtbt_cntr = heartbeat_counter;
						} else
						{	/* Sleep and recheck if any region we are stuck in got resolved.
							 * To minimize time spent sleeping, we just yield our timeslice.
							 */
							rel_quant();
							continue;
						}
					}
				} while (TRUE);
			} else
			{
				rctl = rctl->next_rctl;	/* Move on to the next available region */
				assert(NULL != rctl);
				continue;
			}
		}
		regcnt_stuck = 0;	/* restart the counter now that we found at least one non-stuck region */
		MUR_CHANGE_REG(rctl);
		jctl = rctl->jctl;
		this_reg_stuck = FALSE;
		for ( status = SS_NORMAL; SS_NORMAL == status; )
		{
			if (multi_proc && IS_FORCED_MULTI_PROC_EXIT(mp_hdr))
			{	/* We are at a logical point. So exit if signaled by parent */
				status = ERR_FORCEDHALT;
				goto finish;
			}
			assert(jctl == rctl->jctl);
			rec = rctl->mur_desc->jnlrec;
			rec_time = rec->prefix.time;
			if (rec_time > mur_options.before_time)
				break;	/* Records after -BEFORE_TIME do not go to extract or losttrans or brkntrans files */
			assert((rec_time >= adjusted_resolve_time) || (mur_options.notncheck && !mur_options.verify));
			assert((0 == mur_options.after_time) || (mur_options.forward && !rctl->db_updated));
			if (rec_time < mur_options.after_time)
			{
				status = mur_next_rec(&jctl);
				continue; /* Records before -AFTER_TIME do not go to extract or losttrans or brkntrans files */
			}
			/* Check if current journal record can be played right away or need to wait for corresponding journal
			 * records from other participating TP regions to be reached. A non-TP or ZTP transaction can be played
			 * without issues (i.e. has no dependencies with any other regions). A single-region TP transaction too
			 * falls in the same category. A multi-region TP transaction needs to wait until all participating regions
			 * have played all journal records BEFORE this TP in order to ensure recover plays records in the exact
			 * same order that GT.M performed them in.
			 */
			/* If FENCE_NONE is specified, we would not have maintained any multi hashtable in mur_back_process for
			 * broken transaction processing. So we process multi-region TP transactions as multiple single-region
			 * TP transactions in forward phase.
			 */
			if (FENCE_NONE != mur_options.fences)
			{
				rectype = (enum jnl_record_type)rec->prefix.jrec_type;
				if (IS_TP(rectype) && IS_TUPD(rectype))
				{
					assert(IS_SET_KILL_ZKILL_ZTWORM_LGTRIG_ZTRIG(rectype));
					assert(&rec->jrec_set_kill.num_participants == &rec->jrec_ztworm.num_participants);
					assert(&rec->jrec_set_kill.num_participants == &rec->jrec_lgtrig.num_participants);
					num_partners = rec->jrec_set_kill.num_participants;
					assert(0 < num_partners);
					if (1 < num_partners)
					{
						this_reg_stuck = TRUE;
						assert(&rec->jrec_set_kill.update_num == &rec->jrec_ztworm.update_num);
						assert(&rec->jrec_set_kill.update_num == &rec->jrec_lgtrig.update_num);
					}
				}
			}
			if (this_reg_stuck)
			{
				rec_token_seq = GET_JNL_SEQNO(rec);
				MUR_FORW_TOKEN_LOOKUP(forw_multi, rec_token_seq, rec_time);
				if (NULL != forw_multi)
				{	/* This token has already been seen in another region in forward processing.
					 * Add current region as well. If all regions have been resolved, then play
					 * the entire transaction maintaining the exact same order of updates within.
					 */
					if (!forw_multi->no_longer_stuck)
						MUR_FORW_TOKEN_ONE_MORE_REG(forw_multi, rctl);
				} else
				{	/* First time we are seeing this token in forward processing. Check if this
					 * has already been determined to be a broken transaction.
					 */
					recstat = GOOD_TN;
					multi = NULL;
					if (IS_REC_POSSIBLY_BROKEN(rec_time, rec_token_seq))
					{
						multi = MUR_TOKEN_LOOKUP(rec_token_seq, rec_time, TPFENCE);
						if ((NULL != multi) && (0 < multi->partner))
							recstat = BROKEN_TN;
					}
					MUR_FORW_TOKEN_ADD(forw_multi, rec_token_seq, rec_time, rctl, num_partners,
								recstat, multi);
				}
				/* Check that "tabent" field has been initialized above (by either the MUR_FORW_TOKEN_LOOKUP
				 * or MUR_FORW_TOKEN_ADD macros). This is relied upon by "mur_forward_play_multireg_tp" below.
				 */
				assert(NULL != forw_multi->u.tabent);
				assert(forw_multi->num_reg_seen_forward <= forw_multi->num_reg_seen_backward);
				if (multi_proc)
				{
					sfm = forw_multi->shm_forw_multi;
					ok_to_play = (NULL == sfm) || (sfm->num_reg_seen_forward == sfm->num_reg_seen_backward);
				} else
					ok_to_play = (forw_multi->num_reg_seen_forward == forw_multi->num_reg_seen_backward);
				assert(ok_to_play || !forw_multi->no_longer_stuck);
				if (ok_to_play )
				{	/* We have enough information to proceed with playing this multi-region TP in
					 * forward processing (even if we might not have seen all needed regions). Now play it.
					 * Note that the TP could be BROKEN_TN or GOOD_TN. The callee handles it.
					 */
					assert(forw_multi == rctl->forw_multi);
					status = mur_forward_play_multireg_tp(forw_multi, rctl);
					this_reg_stuck = FALSE;
					/* Note that as part of playing the TP transaction, we could have reached
					 * the EOF of rctl. In this case, we need to break out of the loop.
					 */
					if ((SS_NORMAL != status) || rctl->forw_eof_seen)
						break;
					assert(NULL == rctl->forw_multi);
					assert(!dollar_tlevel);
					jctl = rctl->jctl;	/* In case the first record after the most recently processed
								 * TP transaction is in the next generation journal file */
					continue;
				}
				break;
			} else
			{
				status = mur_forward_play_cur_jrec(rctl);
				if (SS_NORMAL != status)
					break;
			}
			assert(!this_reg_stuck);
			status = mur_next_rec(&jctl);
		}
		assert((NULL == rctl->forw_multi) || this_reg_stuck);
		assert((NULL != rctl->forw_multi) || !this_reg_stuck);
		if (!this_reg_stuck)
		{	/* We are not stuck in this region (to resolve a multi-region TP).
			 * This means we are done processing all the records of this region.
			 */
			assert(NULL == rctl->forw_multi);
			if (!rctl->forw_eof_seen)
			{
				CHECK_IF_EOF_REACHED(rctl, status);
					/* sets rctl->forw_eof_seen if needed; resets "status" to SS_NORMAL */
				if (SS_NORMAL != status)
				{
					assert(ERR_FILENOTCREATE == status);
					goto finish;
				}
				assert(!dollar_tlevel);
				DELETE_RCTL_FROM_UNPROCESSED_LIST(rctl); /* since all of its records should have been processed */
			} else
			{	/* EOF was seen in rctl inside "mur_forward_play_multireg_tp" and it was removed
				 * from the unprocessed list of rctls. At the time rctl was removed, its "next_rctl"
				 * field could have been pointing to another <rctl> that has since then also been
				 * removed inside the same function. Therefore the "next_rctl" field is not reliable
				 * in this case but instead we should rely on the global variable "rctl_start" which
				 * points to the list of unprocessed rctls. Set "next_rctl" accordingly.
				 */
				rctl->next_rctl = rctl_start;
				if (ERR_JNLREADEOF == status)
					status = SS_NORMAL;
			}
			assert(rctl->deleted_from_unprocessed_list);
		}
		assert(SS_NORMAL == status);
		assert(!this_reg_stuck || !rctl->forw_eof_seen);
		assert((NULL == rctl->next_rctl) || (NULL != rctl_start));
		assert((NULL == rctl->next_rctl) || (0 < murgbl.regcnt_remaining));
		rctl = rctl->next_rctl;	/* Note : even though "rctl" could have been deleted from the doubly linked list above,
					 * rctl->next_rctl is not touched so we can still use it to get to the next element. */
	}
	assert(0 == murgbl.regcnt_remaining);
	jgbl.mur_pini_addr_reset_fnptr = NULL;	/* No more simulation of GT.M activity for any region */
	prc_vec = murgbl.prc_vec;	/* Use process-vector of MUPIP RECOVER (not any simulating GT.M process) now onwards */
	assert(0 == dollar_tlevel);
	for (rctl = mur_ctl, rctl_top = mur_ctl + murgbl.reg_total; rctl < rctl_top; rctl++)
	{
		if (!rctl->this_pid_is_owner)
		{
			assert(multi_proc_in_use);
			continue;	/* in a parallel processing environment, process only regions we own */
		}
		if (multi_proc)
		{	/* Set key to print this rctl's region-name as prefix in case this forked off process prints any output */
			MUR_SET_MULTI_PROC_KEY(rctl, multi_proc_key);
		}
		PRINT_VERBOSE_STAT(rctl->jctl, "mur_forward:at the end");
		assert(!mur_options.rollback || (0 != murgbl.consist_jnl_seqno));
		assert(mur_options.rollback || (0 == murgbl.consist_jnl_seqno));
		assert(!dollar_tlevel);	/* In case it applied a broken TUPD */
		assert(murgbl.ok_to_update_db || !rctl->db_updated);
		rctl->mur_plst = NULL;	/* reset now that simulation of GT.M updates is done */
		/* Ensure mur_block_count_correct is called if updates allowed */
		if (murgbl.ok_to_update_db && (SS_NORMAL != mur_block_count_correct(rctl)))
		{
			gtm_putmsg_csa(CSA_ARG(rctl->csa) VARLSTCNT(4) ERR_BLKCNTEDITFAIL, 2, DB_LEN_STR(rctl->gd));
			murgbl.wrn_count++;
		}
	}
finish:
	if (multi_proc)
		multi_proc_key = NULL;	/* reset key until it can be set to rctl's region-name again */
	if ((SS_NORMAL == status) && mur_options.show)
		mur_output_show();
	if (NULL != first_shm_rctl)
	{	/* Transfer needed process-private information to shared memory so parent process can later inherit this. */
		first_shm_rctl->err_cnt = murgbl.err_cnt;
		first_shm_rctl->wrn_count = murgbl.wrn_count;
		first_shm_rctl->consist_jnl_seqno = murgbl.consist_jnl_seqno;
		/* If extract files were created by this process for one or more regions, then copy that information to
		 * shared memory so parent process can use this information to do a merge sort.
		 */
		shm_rctl = mur_shm_hdr->shm_rctl_start;
		for (rctl = mur_ctl, rctl_top = mur_ctl + murgbl.reg_total; rctl < rctl_top; rctl++, shm_rctl++)
		{
			assert(multi_proc_in_use);
			if (!rctl->this_pid_is_owner)
				continue;	/* in a parallel processing environment, process only regions we own */
			/* Cancel any flush/dbsync timers by this child process for this region. This is because the
			 * child is not going to go through exit handling code (no gds_rundown etc.). And we need to
			 * clear up csa->nl->wcs_timers. (normally done by gds_rundown).
			 */
			if (NULL != rctl->csa)	/* rctl->csa can be NULL in case of "mupip journal -extract" etc. */
				CANCEL_DB_TIMERS(rctl->gd, rctl->csa, cancelled_timer, cancelled_dbsync_timer);
			reccnt = 0;
			for (size_ptr = &rctl->jnlext_multi_list_size[0], recstat = 0;
								recstat < TOT_EXTR_TYPES;
									recstat++, size_ptr++)
			{	/* Assert "extr_file_created" information is in sync between rctl and shm_rctl.
				 * This was done at the end of "mur_cre_file_extfmt".
				 */
				assert(shm_rctl->extr_file_created[recstat] == rctl->extr_file_created[recstat]);
				/* Assert that if *size_ptr is non-zero, then we better have created an extract file.
				 * Note that the converse is not true. It is possible we created a file for example to
				 * write an INCTN record but decided to not write anything because it was not a -detail
				 * type of extract. So *sizeptr could be 0 even though we created the extract file.
				 */
				assert(!*size_ptr || rctl->extr_file_created[recstat]);
				shm_rctl->jnlext_list_size[recstat] = *size_ptr;
				reccnt += *size_ptr;
			}
			assert(INVALID_SHMID == shm_rctl->jnlext_shmid);
			shm_size = reccnt * SIZEOF(jnlext_multi_t);
			/* If we are quitting because of an abnormal status OR a forced signal to terminate
			 * OR if the parent is dead (kill -9) dont bother creating shmid to communicate back with parent.
			 */
			if (mp_hdr->parent_pid != getppid())
			{
				SET_FORCED_MULTI_PROC_EXIT;	/* Also signal sibling children to stop processing */
				if (SS_NORMAL != status)
					status = ERR_FORCEDHALT;
			}
			if ((SS_NORMAL == status) && shm_size)
			{
				shmid = shmget(IPC_PRIVATE, shm_size, 0600 | IPC_CREAT);
				if (-1 == shmid)
				{
					save_errno = errno;
					SNPRINTF(errstr, SIZEOF(errstr),
						"shmget() : shmsize=0x%llx", shm_size);
					MUR_SET_MULTI_PROC_KEY(rctl, multi_proc_key);	/* to print region name prefix */
					rts_error_csa(CSA_ARG(NULL) VARLSTCNT(8)
								ERR_SYSCALL, 5, LEN_AND_STR(errstr), CALLFROM, save_errno);
				}
				shmPtr = (char *)do_shmat(shmid, 0, 0);
				if (-1 == (sm_long_t)shmPtr)
				{
					save_errno = errno;
					SNPRINTF(errstr, SIZEOF(errstr),
						"shmat() : shmid=%d shmsize=0x%llx", shmid, shm_size);
					MUR_SET_MULTI_PROC_KEY(rctl, multi_proc_key);	/* to print region name prefix */
					rts_error_csa(CSA_ARG(NULL) VARLSTCNT(8)
								ERR_SYSCALL, 5, LEN_AND_STR(errstr), CALLFROM, save_errno);
				}
				shm_rctl->jnlext_shmid = shmid;
				shm_rctl->jnlext_shm_size = shm_size;
				for (size_ptr = &rctl->jnlext_multi_list_size[0], recstat = 0;
									recstat < TOT_EXTR_TYPES;
										recstat++, size_ptr++)
				{
					shm_size = *size_ptr;
					if (shm_size)
					{
						copy_size = copy_list_to_buf(rctl->jnlext_multi_list[recstat],
												(int4)shm_size, shmPtr);
						assert(copy_size == (shm_size * SIZEOF(jnlext_multi_t)));
						shmPtr += copy_size;
					}
				}
			}
		}
	}
	mur_close_file_extfmt(IN_MUR_CLOSE_FILES_FALSE);	/* Need to flush buffered extract/losttrans/brokentrans files */
	return (int)status;
}
コード例 #2
0
ファイル: mur_forward.c プロジェクト: h4ck3rm1k3/fis-gtm
uint4	mur_forward(jnl_tm_t min_broken_time, seq_num min_broken_seqno, seq_num losttn_seqno)
{
	boolean_t		added, this_reg_stuck;
	boolean_t		is_set_kill_zkill_ztworm, is_set_kill_zkill;
	jnl_record		*rec;
	enum jnl_record_type	rectype;
	enum rec_fence_type	rec_fence;
	enum broken_type	recstat;
	jnl_tm_t		rec_time;
	int4			rec_image_count = 0;	/* This is a dummy variable for UNIX */
	uint4			status, regcnt_stuck, num_partners;
	mval			mv;
	reg_ctl_list		*rctl, *rctl_top, *prev_rctl;
	jnl_ctl_list		*jctl;
	gd_region		*reg;
	sgmnt_addrs		*csa;
	seq_num 		rec_token_seq;
	forw_multi_struct	*forw_multi;
	multi_struct 		*multi;

	error_def(ERR_JNLREADEOF);
	error_def(ERR_BLKCNTEDITFAIL);

	skip_dbtriggers = TRUE;	/* do not want to invoke any triggers for updates done by journal recovery */
	murgbl.extr_buff = (char *)malloc(murgbl.max_extr_record_length);
	for (recstat = (enum broken_type)0; recstat < TOT_EXTR_TYPES; recstat++)
		murgbl.extr_file_create[recstat] = TRUE;
	jgbl.dont_reset_gbl_jrec_time = jgbl.forw_phase_recovery = TRUE;
	assert(NULL == jgbl.mur_pini_addr_reset_fnptr);
	jgbl.mur_pini_addr_reset_fnptr = (pini_addr_reset_fnptr)mur_pini_addr_reset;
	gv_keysize = DBKEYSIZE(MAX_KEY_SZ);
	mu_gv_stack_init();
	murgbl.consist_jnl_seqno = 0;
	/* Note down passed in values in murgbl global so "mur_forward_play_cur_jrec" function can see it as well */
	murgbl.min_broken_time = min_broken_time;
	murgbl.min_broken_seqno = min_broken_seqno;
	murgbl.losttn_seqno = losttn_seqno;
	assert(!mur_options.rollback || (murgbl.losttn_seqno <= murgbl.min_broken_seqno));
	prev_rctl = NULL;
	rctl_start = NULL;
	assert(0 == murgbl.regcnt_remaining);
	for (rctl = mur_ctl, rctl_top = mur_ctl + murgbl.reg_total; rctl < rctl_top; rctl++)
	{
		if (mur_options.forward)
		{
			assert(NULL == rctl->jctl_turn_around);
			jctl = rctl->jctl = rctl->jctl_head;
			assert(jctl->reg_ctl == rctl);
			jctl->rec_offset = JNL_HDR_LEN;
			jnl_fence_ctl.fence_list = JNL_FENCE_LIST_END; /* initialized to reflect journaling is not enabled */
		} else
		{
			jctl = rctl->jctl = (NULL == rctl->jctl_turn_around) ? rctl->jctl_head : rctl->jctl_turn_around;
			assert(jctl->reg_ctl == rctl);
			jctl->rec_offset = jctl->turn_around_offset;
			jgbl.mur_jrec_seqno = jctl->turn_around_seqno;
			if (mur_options.rollback && murgbl.consist_jnl_seqno < jgbl.mur_jrec_seqno)
				murgbl.consist_jnl_seqno = jgbl.mur_jrec_seqno;
			assert(murgbl.consist_jnl_seqno <= murgbl.losttn_seqno);
			assert((NULL != rctl->jctl_turn_around) || (0 == jctl->rec_offset));
		}
		if (mur_options.update || mur_options.extr[GOOD_TN])
		{
			reg = rctl->gd;
			gv_cur_region = reg;
			tp_change_reg();	/* note : sets cs_addrs to non-NULL value even if gv_cur_region->open is FALSE
						 * (cs_data could still be NULL). */
			rctl->csa = cs_addrs;
			cs_addrs->rctl = rctl;
			rctl->csd = cs_data;
			rctl->sgm_info_ptr = cs_addrs->sgm_info_ptr;
			SET_CSA_DIR_TREE(cs_addrs, MAX_KEY_SZ, reg);
			gv_target = cs_addrs->dir_tree;
		}
		jctl->after_end_of_data = FALSE;
		status = mur_next(jctl, jctl->rec_offset);
		assert(ERR_JNLREADEOF != status);	/* cannot get EOF at start of forward processing */
		if (SS_NORMAL != status)
			return status;
		PRINT_VERBOSE_STAT(jctl, "mur_forward:at the start");
		/* Any multi-region TP transaction will be processed as multiple single-region TP transactions up
		 * until the tp-resolve-time is reached. From then on, they will be treated as one multi-region TP
		 * transaction. This is needed for proper lost-tn determination (any multi-region transaction that
		 * gets played in a region AFTER it has already encountered a broken tn should treat this as a lost tn).
		 */
		do
		{
			assert(jctl == rctl->jctl);
			rec = rctl->mur_desc->jnlrec;
			rec_time = rec->prefix.time;
			if (rec_time > mur_options.before_time)
				break;	/* Records after -BEFORE_TIME do not go to extract or losttrans or brkntrans files */
			if (rec_time < mur_options.after_time)
			{
				status = mur_next_rec(&jctl);
				continue; /* Records before -AFTER_TIME do not go to extract or losttrans or brkntrans files */
			}
			if (rec_time >= jgbl.mur_tp_resolve_time)
				break;	/* Records after tp-resolve-time will be processed below */
			/* TODO: what do we do if we find a BROKEN tn here? */
			status = mur_forward_play_cur_jrec(rctl);
			if (SS_NORMAL != status)
				break;
			status = mur_next_rec(&jctl);
		} while (SS_NORMAL == status);
		CHECK_IF_EOF_REACHED(rctl, status); /* sets rctl->forw_eof_seen if needed; resets "status" to SS_NORMAL */
		if (SS_NORMAL != status)
			return status;
		if (rctl->forw_eof_seen)
		{
			PRINT_VERBOSE_STAT(jctl, "mur_forward:Reached EOF before tp_resolve_time");
			continue;	/* Reached EOF before even getting to tp_resolve_time.
					 * Do not even consider region for next processing loop */
		}
		rctl->last_tn = 0;
		rctl->process_losttn = FALSE;
		murgbl.regcnt_remaining++;	/* # of regions participating in recovery at this point */
		if (NULL == rctl_start)
			rctl_start = rctl;
		if (NULL != prev_rctl)
		{
			prev_rctl->next_rctl = rctl;
			rctl->prev_rctl = prev_rctl;
		}
		prev_rctl = rctl;
		assert(murgbl.ok_to_update_db || !rctl->db_updated);
		PRINT_VERBOSE_STAT(jctl, "mur_forward:at tp_resolve_time");
	}
	/* Note that it is possible for rctl_start to be NULL at this point. That is there is no journal record in any region
	 * AFTER the calculated tp-resolve-time. This is possible if for example -AFTER_TIME was used and has a time later
	 * than any journal record in all journal files. If rctl_start is NULL, prev_rctl should also be NULL and vice versa.
	 */
	if (prev_rctl != rctl_start)
	{
		assert(NULL != prev_rctl);
		assert(NULL != rctl_start);
		prev_rctl->next_rctl = rctl_start;
		rctl_start->prev_rctl = prev_rctl;
	} else
	{	/* prev_rctl & rctl_start are identical. They both should be NULL or should point to a single element linked list */
		assert((NULL == rctl_start) || (NULL == rctl_start->next_rctl) && (NULL == rctl_start->prev_rctl));
	}
	rctl = rctl_start;
	regcnt_stuck = 0; /* # of regions we are stuck in waiting for other regions to resolve a multi-region TP transaction */
	assert((NULL == rctl) || (NULL == rctl->forw_multi));
	gv_cur_region = NULL;	/* clear out any previous value to ensure gv_cur_region/cs_addrs/cs_data
				 * all get set in sync by the MUR_CHANGE_REG macro below.
				 */
	while (NULL != rctl)
	{	/* while there is at least one region remaining with unprocessed journal records */
		assert(NULL != rctl_start);
		assert(0 < murgbl.regcnt_remaining);
		if (NULL != rctl->forw_multi)
		{	/* This region's current journal record is part of a TP transaction waiting for other regions */
			regcnt_stuck++;
			if (regcnt_stuck >= murgbl.regcnt_remaining)
				GTMASSERT;	/* Out-of-design situation. Stuck in ALL regions. */
			rctl = rctl->next_rctl;	/* Move on to the next available region */
			assert(NULL != rctl);
			continue;
		}
		regcnt_stuck = 0;	/* restart the counter now that we found at least one non-stuck region */
		MUR_CHANGE_REG(rctl);
		jctl = rctl->jctl;
		this_reg_stuck = FALSE;
		for ( status = SS_NORMAL; SS_NORMAL == status; )
		{
			assert(jctl == rctl->jctl);
			rec = rctl->mur_desc->jnlrec;
			rec_time = rec->prefix.time;
			assert(rec_time >= jgbl.mur_tp_resolve_time);
			if (rec_time > mur_options.before_time)
				break;	/* Records after -BEFORE_TIME do not go to extract or losttrans or brkntrans files */
			assert((0 == mur_options.after_time) || mur_options.forward && !rctl->db_updated);
			if (rec_time < mur_options.after_time)
			{
				status = mur_next_rec(&jctl);
				continue; /* Records before -AFTER_TIME do not go to extract or losttrans or brkntrans files */
			}
			/* Check if current journal record can be played right away or need to wait for corresponding journal
			 * records from other participating TP regions to be reached. A non-TP or ZTP transaction can be played
			 * without issues (i.e. has no dependencies with any other regions). A single-region TP transaction too
			 * falls in the same category. A multi-region TP transaction needs to wait until all participating regions
			 * have played all journal records BEFORE this TP in order to ensure recover plays records in the exact
			 * same order that GT.M performed them in.
			 */
			/* If FENCE_NONE is specified, we would not have maintained any multi hashtable in mur_back_process for
			 * broken transaction processing. So we process multi-region TP transactions as multiple single-region
			 * TP transactions in forward phase.
			 */
			if (FENCE_NONE != mur_options.fences)
			{
				rectype = (enum jnl_record_type)rec->prefix.jrec_type;
				if (IS_TP(rectype) && IS_TUPD(rectype))
				{
					assert(IS_SET_KILL_ZKILL_ZTRIG_ZTWORM(rectype));
					assert(&rec->jrec_set_kill.num_participants == &rec->jrec_ztworm.num_participants);
					num_partners = rec->jrec_set_kill.num_participants;
					assert(0 < num_partners);
					if (1 < num_partners)
					{
						this_reg_stuck = TRUE;
						assert(&rec->jrec_set_kill.update_num == &rec->jrec_ztworm.update_num);
					}
				}
			}
			if (this_reg_stuck)
			{
				rec_token_seq = GET_JNL_SEQNO(rec);
				VMS_ONLY(
					/* In VMS, pid is not unique. We need "image_count" as well. But this is not needed
					 * in case of rollback as the token is guaranteed to be unique in that case.
					 */
					if (!mur_options.rollback)
					{
						MUR_GET_IMAGE_COUNT(jctl, rec, rec_image_count, status);
						if (SS_NORMAL != status)
						{
							this_reg_stuck = FALSE;	/* so abnormal "status" gets checked below */
							break;
						}
					}
				)
				/* In Unix, "rec_image_count" is ignored by the MUR_FORW* macros */
				MUR_FORW_TOKEN_LOOKUP(forw_multi, rec_token_seq, rec_time, rec_image_count);
				if (NULL != forw_multi)
				{	/* This token has already been seen in another region in forward processing.
					 * Add current region as well. If all regions have been resolved, then play
					 * the entire transaction maintaining the exact same order of updates within.
					 */
					MUR_FORW_TOKEN_ONE_MORE_REG(forw_multi, rctl);
				} else
				{	/* First time we are seeing this token in forward processing. Check if this
					 * has already been determined to be a broken transaction.
					 */
					recstat = GOOD_TN;
					multi = NULL;
					if (IS_REC_POSSIBLY_BROKEN(rec_time, rec_token_seq))
					{
						multi = MUR_TOKEN_LOOKUP(rec_token_seq, rec_image_count, rec_time, TPFENCE);
						if ((NULL != multi) && (0 < multi->partner))
							recstat = BROKEN_TN;
					}
					MUR_FORW_TOKEN_ADD(forw_multi, rec_token_seq, rec_time, rctl, num_partners,
								recstat, multi, rec_image_count);
				}
				/* Check that "tabent" field has been initialized above (by either the MUR_FORW_TOKEN_LOOKUP
				 * or MUR_FORW_TOKEN_ADD macros). This is relied upon by "mur_forward_play_multireg_tp" below.
				 */
				assert(NULL != forw_multi->u.tabent);
				assert(forw_multi->num_reg_seen_forward <= forw_multi->num_reg_seen_backward);
				if (forw_multi->num_reg_seen_forward == forw_multi->num_reg_seen_backward)
				{	/* All regions have been seen in forward processing. Now play it.
					 * Note that the TP could be BROKEN_TN or GOOD_TN. The callee handles it.
					 */
					assert(forw_multi == rctl->forw_multi);
					status = mur_forward_play_multireg_tp(forw_multi, rctl);
					this_reg_stuck = FALSE;
					/* Note that as part of playing the TP transaction, we could have reached
					 * the EOF of rctl. In this case, we need to break out of the loop.
					 */
					if ((SS_NORMAL != status) || rctl->forw_eof_seen)
						break;
					assert(NULL == rctl->forw_multi);
					assert(!dollar_tlevel);
					jctl = rctl->jctl;	/* In case the first record after the most recently processed
								 * TP transaction is in the next generation journal file */
					continue;
				}
				break;
			} else
			{
				status = mur_forward_play_cur_jrec(rctl);
				if (SS_NORMAL != status)
					break;
			}
			assert(!this_reg_stuck);
			status = mur_next_rec(&jctl);
		}
コード例 #3
0
uint4	mur_forward_play_cur_jrec(reg_ctl_list *rctl)
{
	boolean_t		process_losttn;
	boolean_t		is_set_kill_zkill_ztworm_lgtrig_ztrig, is_set_kill_zkill_ztrig;
	trans_num		curr_tn;
	enum jnl_record_type	rectype;
	enum rec_fence_type	rec_fence;
	enum broken_type	recstat;
	jnl_tm_t		rec_time;
	uint4			status;
	mval			mv;
	seq_num 		rec_token_seq, rec_strm_seqno, resync_strm_seqno;
	jnl_record		*rec;
	jnl_string		*keystr;
	multi_struct 		*multi;
	jnl_ctl_list		*jctl;
	ht_ent_mname		*tabent;
	mname_entry	 	gvent;
	gvnh_reg_t		*gvnh_reg;
	pini_list_struct	*plst;
	int4			gtmcrypt_errno;
	boolean_t		use_new_key;
	forw_multi_struct	*forw_multi;
#	if (defined(DEBUG) && defined(UNIX))
	int4			strm_idx;
#	endif
	DCL_THREADGBL_ACCESS;

	SETUP_THREADGBL_ACCESS;
	assert(!rctl->forw_eof_seen);
	if (multi_proc_in_use)
	{	/* Set key to print this rctl's region-name as prefix in case this forked off process prints any output.
		 * e.g. If this function ends up calling t_end/op_tcommit which in turn needs to do a jnl autoswitch
		 * inside jnl_file_extend and prints a GTM-I-FILERENAME message.
		 */
		MUR_SET_MULTI_PROC_KEY(rctl, multi_proc_key);
	}
	jctl = rctl->jctl;
	/* Ensure we never DOUBLE process the same journal record in the forward phase */
	assert((jctl != rctl->last_processed_jctl) || (jctl->rec_offset != rctl->last_processed_rec_offset));
#	ifdef DEBUG
	rctl->last_processed_jctl = jctl;
	rctl->last_processed_rec_offset = jctl->rec_offset;
#	endif
	rec = rctl->mur_desc->jnlrec;
	rectype = (enum jnl_record_type)rec->prefix.jrec_type;
	rec_time = rec->prefix.time;
	assert(rec_time <= mur_options.before_time);
	assert(rec_time >= mur_options.after_time);
	assert((0 == mur_options.after_time) || (mur_options.forward && !rctl->db_updated));
	is_set_kill_zkill_ztworm_lgtrig_ztrig = (boolean_t)(IS_SET_KILL_ZKILL_ZTWORM_LGTRIG_ZTRIG(rectype));
	if (is_set_kill_zkill_ztworm_lgtrig_ztrig)
	{
		keystr = (jnl_string *)&rec->jrec_set_kill.mumps_node;
		if (USES_ANY_KEY(jctl->jfh))
		{
			use_new_key = USES_NEW_KEY(jctl->jfh);
			assert(NEEDS_NEW_KEY(jctl->jfh, rec->prefix.tn) == use_new_key);
			MUR_DECRYPT_LOGICAL_RECS(
					keystr,
					(use_new_key ? TRUE : jctl->jfh->non_null_iv),
					rec->prefix.forwptr,
					(use_new_key ? jctl->encr_key_handle2 : jctl->encr_key_handle),
					gtmcrypt_errno);
			if (0 != gtmcrypt_errno)
			{
				GTMCRYPT_REPORT_ERROR(gtmcrypt_errno, gtm_putmsg, jctl->jnl_fn_len, jctl->jnl_fn);
				return gtmcrypt_errno;
			}
		}
	}
	if (mur_options.selection && !mur_select_rec(jctl))
		return SS_NORMAL;
	rec_token_seq = (REC_HAS_TOKEN_SEQ(rectype)) ? GET_JNL_SEQNO(rec) : 0;
	process_losttn = rctl->process_losttn;
	if (!process_losttn && mur_options.rollback)
	{
		if (IS_REPLICATED(rectype) && (rec_token_seq >= murgbl.losttn_seqno))
			process_losttn = rctl->process_losttn = TRUE;
#		if (defined(UNIX) && defined(DEBUG))
		if ((rec_token_seq < murgbl.losttn_seqno) && murgbl.resync_strm_seqno_nonzero && IS_REPLICATED(rectype))
		{
			assert(IS_SET_KILL_ZKILL_ZTWORM_LGTRIG_ZTRIG(rectype) || IS_COM(rectype) || (JRT_NULL == (rectype)));
			assert(&rec->jrec_set_kill.strm_seqno == &rec->jrec_null.strm_seqno);
			assert(&rec->jrec_set_kill.strm_seqno == &rec->jrec_tcom.strm_seqno);
			rec_strm_seqno = GET_STRM_SEQNO(rec);
			if (rec_strm_seqno)
			{
				strm_idx = GET_STRM_INDEX(rec_strm_seqno);
				rec_strm_seqno = GET_STRM_SEQ60(rec_strm_seqno);
				resync_strm_seqno = murgbl.resync_strm_seqno[strm_idx];
				assert(!resync_strm_seqno || (rec_strm_seqno < resync_strm_seqno));
			}
		}
#		endif
	}
	/* Note: Broken transaction determination is done below only based on the records that got selected as
	 * part of the mur_options.selection criteria. Therefore depending on whether a broken transaction gets
	 * selected or not, future complete transactions might either go to the lost transaction or extract file.
	 */
	recstat = process_losttn ? LOST_TN : GOOD_TN;
	status = SS_NORMAL;
	if (FENCE_NONE != mur_options.fences)
	{
		if (IS_FENCED(rectype))
		{
			assert(rec_token_seq);
#			ifdef DEBUG
			/* assert that all TP records before min_broken_time are not broken */
			if (IS_TP(rectype) && ((!mur_options.rollback && rec_time < murgbl.min_broken_time)
						|| (mur_options.rollback && rec_token_seq < murgbl.min_broken_seqno)))
			{
				rec_fence = GET_REC_FENCE_TYPE(rectype);
				if (NULL != (multi = MUR_TOKEN_LOOKUP(rec_token_seq, rec_time, rec_fence)))
				{
					assert(0 == multi->partner);
					assert(FALSE == multi->this_is_broken);
				}
			}
#			endif
			/* In most cases, the fact whether a TP tn is broken or not would have been determined already in
			 * mur_forward. In this case, rctl->forw_multi would be set appropriately. So use that to get to
			 * "multi" and avoid a hashtable lookup. If forw_multi is NULL (e.g. for ZTP or single-region TP),
			 * the hash-table lookup cannot be avoided.
			 */
			multi = NULL;
			forw_multi = rctl->forw_multi;
			if (NULL != forw_multi)
			{
				multi = forw_multi->multi;
				/* Always honor the "recstat" from the forw_multi since that has been determined taking into
				 * consideration the BROKEN_TN status of ALL participating regions.
				 */
				assert((GOOD_TN != forw_multi->recstat) || (GOOD_TN == recstat));
				recstat = forw_multi->recstat;
			} else if (IS_REC_POSSIBLY_BROKEN(rec_time, rec_token_seq))
			{
				assert(!mur_options.rollback || process_losttn);
				rec_fence = GET_REC_FENCE_TYPE(rectype);
				assert(rec_token_seq == ((struct_jrec_upd *)rec)->token_seq.token);
				multi = MUR_TOKEN_LOOKUP(rec_token_seq, rec_time, rec_fence);
				if ((NULL != multi) && (0 < multi->partner))
				{
					process_losttn = rctl->process_losttn = TRUE;
					recstat = BROKEN_TN;
				}
			}
			/* Check that if the hashtable reports a tn as GOOD, it better have had the same
			 * # of participants in the TCOM records across all the participating regions.
			 */
			assert((NULL == multi) || (BROKEN_TN == recstat) || (FALSE == multi->this_is_broken));
		} else if ((FENCE_ALWAYS == mur_options.fences) && is_set_kill_zkill_ztworm_lgtrig_ztrig)
		{
			process_losttn = rctl->process_losttn = TRUE;
			recstat = BROKEN_TN;
		}
	} else
		forw_multi = NULL;
	if (mur_options.show)
	{
		assert(SS_NORMAL == status);
		if (BROKEN_TN != recstat)
		{
			if (JRT_PFIN == rectype)
				status = mur_pini_state(jctl, rec->prefix.pini_addr, FINISHED_PROC);
			else if ((JRT_EOF != rectype)
					&& ((JRT_ALIGN != rectype) || (JNL_HDR_LEN != rec->prefix.pini_addr)))
			{	/* Note that it is possible that we have a PINI record followed by a PFIN record
				 * and later an ALIGN record with the pini_addr pointing to the original PINI
				 * record (see comment in jnl_write.c where pini_addr gets assigned to JNL_HDR_LEN)
				 * In this case we do not want the ALIGN record to cause the process to become
				 * ACTIVE although it has written a PFIN record. Hence the check above.
				 */
				status = mur_pini_state(jctl, rec->prefix.pini_addr, ACTIVE_PROC);
			}
		} else
			status = mur_pini_state(jctl, rec->prefix.pini_addr, BROKEN_PROC);
		if (SS_NORMAL != status)
			return status;	/* "mur_pini_state" failed due to bad pini_addr */
		++jctl->jnlrec_cnt[rectype];	/* for -show=STATISTICS */
	}
	if (!mur_options.update && !jgbl.mur_extract)
		return SS_NORMAL;
	if (murgbl.ok_to_update_db && IS_TUPD(rectype) && (GOOD_TN == recstat))
	{	/* Even for FENCE_NONE we apply fences. Otherwise a TUPD becomes UPD etc.
		 * If forw_multi is non-NULL, a multi-region TP transaction is being played as a SINGLE
		 * TP transaction across all the involved regions. Therefore only ONE op_tstart is done
		 * even though more than one TSET might be encountered. In this case, do not issue JNLTPNEST error.
		 */
		if (dollar_tlevel && (NULL == forw_multi))
		{
			assert(FALSE);
			murgbl.wrn_count++;
			gtm_putmsg_csa(CSA_ARG(rctl->csa) VARLSTCNT(6) ERR_JNLTPNEST, 4, jctl->jnl_fn_len,
				jctl->jnl_fn, jctl->rec_offset, &rec->prefix.tn);
			OP_TROLLBACK(0);
		}
		if (!dollar_tlevel)
		{	/* Note: op_tstart resets gv_currkey. So set gv_currkey later. */
			/* mv is used to determine transaction id. But it is ignored by recover/rollback */
			mv.mvtype = MV_STR;
			mv.str.len = 0;
			mv.str.addr = NULL;
			op_tstart(IMPLICIT_TSTART, TRUE, &mv, -1);
			DEBUG_ONLY(jgbl.max_tp_ztp_jnl_upd_num = 0;)
		}
		tp_set_sgm();	/* needed to set "sgm_info_ptr" to correspond to "rctl" */
	}
コード例 #4
0
ファイル: mur_output_record.c プロジェクト: shabiel/fis-gtm
/* This routine is called only for recover and rollback (that is, mur_options.update).
 * It applies the set/kill/zkill, tcom, inctn, and aimg records during forward processing.
 * Some fields like jnl_seqno, rec_seqno and prefix.time are saved here from original journal files.
 * Later jnl_write routines copies them to journal records instead of generating them like the runtime system */
uint4	mur_output_record(reg_ctl_list *rctl)
{
	mval			mv;
	jnl_record		*rec;
	char			*val_ptr;
	int			strm_num;
	uint4			dummy;
	off_jnl_t		pini_addr;
	jnl_string		*keystr;
	enum jnl_record_type 	rectype;
	uint4			jnl_status, status;
	pini_list_struct	*plst;
	boolean_t		jnl_enabled, was_crit;
	struct_jrec_null	null_record;
	gd_region		*reg;
	seq_num			strm_seqno;
	sgmnt_addrs		*csa;
	sgmnt_data_ptr_t	csd;
	jnl_ctl_list		*jctl;
	jnl_format_buffer	*ztworm_jfb;
	blk_hdr_ptr_t		aimg_blk_ptr;
	int			in_len, gtmcrypt_errno;
	boolean_t		use_new_key;
	DCL_THREADGBL_ACCESS;

	SETUP_THREADGBL_ACCESS;
	assert(mur_options.update);
	rec = rctl->mur_desc->jnlrec;
	rectype = (enum jnl_record_type)rec->prefix.jrec_type;
	switch (rectype)
	{
		case JRT_ALIGN:
		case JRT_EOF:
		case JRT_EPOCH:
		case JRT_PBLK:
		case JRT_PINI:
		case JRT_TRUNC:
			return SS_NORMAL;
			break;
		default:
			break;
	}
	jgbl.gbl_jrec_time = rec->prefix.time;
	pini_addr = rec->prefix.pini_addr;
	reg = rctl->gd;
	jctl = rctl->jctl;
	assert(jctl->reg_ctl == rctl);
	assert(gv_cur_region == reg);
	csa = rctl->csa;
	assert(cs_addrs == csa);
	csd = csa->hdr;
	assert(cs_data == csd);
	jnl_enabled = JNL_ENABLED(csa);
	if (jnl_enabled)
	{
		status = mur_get_pini(jctl, pini_addr, &plst);
		if (SS_NORMAL != status)
			return status;
		prc_vec = &plst->jpv;
		csa->jnl->pini_addr = plst->new_pini_addr;
		rctl->mur_plst = plst;
	}
	if (mur_options.rollback && IS_REPLICATED(rectype))
	{
		jgbl.mur_jrec_seqno = GET_JNL_SEQNO(rec);
		if (jgbl.mur_jrec_seqno >= murgbl.consist_jnl_seqno)
		{
			assert(murgbl.losttn_seqno >= (jgbl.mur_jrec_seqno + 1));
			murgbl.consist_jnl_seqno = jgbl.mur_jrec_seqno + 1;
		}
		jgbl.mur_jrec_strm_seqno = GET_STRM_SEQNO(rec);
		strm_seqno = jgbl.mur_jrec_strm_seqno;
		if (strm_seqno)
		{	/* maintain csd->strm_reg_seqno */
			strm_num = GET_STRM_INDEX(strm_seqno);
			strm_seqno = GET_STRM_SEQ60(strm_seqno);
			assert(csd->strm_reg_seqno[strm_num] <= (strm_seqno + 1));
			csd->strm_reg_seqno[strm_num] = strm_seqno + 1;
		}
	}
	/* Assert that TREF(gd_targ_gvnh_reg) is NULL for every update that journal recovery/rollback plays forward;
	 * This is necessary to ensure every update is played in only the database file where the journal record is seen
	 * instead of across all regions that span the particular global reference. For example if ^a(1) spans db files
	 * a.dat and b.dat, and a KILL ^a(1) is done at the user level, we would see KILL ^a(1) journal records in a.mjl
	 * and b.mjl. When journal recovery processes the journal record in a.mjl, it should do the kill only in a.dat
	 * When it gets to the same journal record in b.mjl, it would do the same kill in b.dat and effectively complete
	 * the user level KILL ^a(1). If instead recovery does the KILL across all spanned regions, we would be basically
	 * doing duplicate work let alone do it out-of-order since recovery goes region by region for the most part.
	 */
	assert(NULL == TREF(gd_targ_gvnh_reg));
	if (IS_SET_KILL_ZKILL_ZTRIG(rectype))
	{	/* TP and non-TP has same format */
		keystr = (jnl_string *)&rec->jrec_set_kill.mumps_node;
		if (jnl_enabled)
		{
			MUR_SET_JNL_FENCE_CTL_TOKEN(rec->jrec_set_kill.token_seq.token, rctl);
			jnl_fence_ctl.strm_seqno = rec->jrec_set_kill.strm_seqno;
			jgbl.tp_ztp_jnl_upd_num = rec->jrec_set_kill.update_num;
			DEBUG_ONLY(jgbl.max_tp_ztp_jnl_upd_num = MAX(jgbl.max_tp_ztp_jnl_upd_num, jgbl.tp_ztp_jnl_upd_num);)
			jgbl.mur_jrec_nodeflags = keystr->nodeflags;
		}
		if (IS_FENCED(rectype))
		{	/* Even for FENCE_NONE we apply fences. Otherwise an [F/G/T/U]UPD becomes UPD etc. */
			/* op_tstart is called in "mur_forward_play_cur_jrec" already */
			if (IS_FUPD(rectype))
			{
				jnl_fence_ctl.level = 1;
				if (jnl_enabled)
				{
					jnl_fence_ctl.fence_list = JNL_FENCE_LIST_END;
					csa->next_fenced = NULL;
				}
			} else if (IS_GUPD(rectype))
			{
				jnl_fence_ctl.level = 1;
				if (jnl_enabled)
				{
					jnl_fence_ctl.fence_list = csa;
					csa->next_fenced = JNL_FENCE_LIST_END;
				}
			} else if (IS_TP(rectype))
				tp_set_sgm();
		}
#		ifdef GTM_TRIGGER
		/* Check if ^#t and if so need to increment trigger cycle in file header. Note that the below 'if' check could cause
		 * csd->db_trigger_cycle to be incremented even for the region that actually did NOT get any trigger updates. This
		 * is because some of the ^#t subscripts (like ^#t(#TNAME)) go to the DEFAULT region. So, even though a trigger was
		 * loaded only for ^a (corresponding to AREG), csd->db_trigger_cycle will be incremented for DEFAULT region as well.
		 * To avoid this, the below check should be modified to set csa->incr_db_trigger_cycle only if the ^#t subscript
		 * does not begin with '#' (similar to what is done in UPD_GV_BIND_NAME_APPROPRIATE). However, since journal
		 * recovery operates in standalone mode, the db_trigger_cycle increment to DEFAULT region should be okay since it
		 * will NOT cause any restarts
		 */
		if (IS_GVKEY_HASHT_GBLNAME(keystr->length, keystr->text))
		{
			assert(cs_addrs == csa);
			csa->incr_db_trigger_cycle = TRUE;
		}
#		endif
		if (IS_SET(rectype))
		{
			val_ptr = &keystr->text[keystr->length];
			GET_MSTR_LEN(mv.str.len, val_ptr);
			mv.str.addr = val_ptr + SIZEOF(mstr_len_t);
			mv.mvtype = MV_STR;
			op_gvput(&mv);
		} else if (IS_KILL(rectype))
		{
			if (IS_TP(rectype))
				tp_set_sgm();
			op_gvkill();
#		ifdef GTM_TRIGGER
		} else if (IS_ZTRIG(rectype))
		{
			if (IS_TP(rectype))
				tp_set_sgm();
			op_ztrigger();
#		endif
		} else
		{
			assert(IS_ZKILL(rectype));
			if (IS_TP(rectype))
				tp_set_sgm();
			op_gvzwithdraw();
		}
		if (IS_ZTP(rectype))
		{	/* Even for FENCE_NONE we apply fences. Otherwise an FUPD/GUPD becomes UPD etc. */
			assert(jnl_enabled || (JNL_FENCE_LIST_END == jnl_fence_ctl.fence_list && NULL == csa->next_fenced));
			jnl_fence_ctl.level = 0;
			if (jnl_enabled)
			{
				jnl_fence_ctl.fence_list = JNL_FENCE_LIST_END;
				csa->next_fenced = NULL;
			}
		}
		return SS_NORMAL;
	}