void process_reorg_encrypt_restart(void)
{
	intrpt_state_t	prev_intrpt_state;
	enc_info_t	*encr_ptr;
	int		gtmcrypt_errno;
	gd_segment	*seg;
	sgmnt_addrs	*csa;

	csa = reorg_encrypt_restart_csa;
	assert(NULL != csa);	/* caller should have ensured this */
	/* Opening handles for encryption is a heavyweight operation. Caller should have ensured we are not in crit for
	 * any region when the new key handles are opened for any one region. Assert that.
	 */
	assert(0 == have_crit(CRIT_HAVE_ANY_REG));
	DEFER_INTERRUPTS(INTRPT_IN_CRYPT_RECONFIG, prev_intrpt_state);
	encr_ptr = csa->encr_ptr;
	assert(NULL != encr_ptr);
	DBG_RECORD_CRYPT_RECEIVE(csa->hdr, csa, csa->nl, process_id, encr_ptr);
	seg = csa->region->dyn.addr;
	INIT_DB_OR_JNL_ENCRYPTION(csa, encr_ptr, seg->fname_len, seg->fname, gtmcrypt_errno);
	if (0 != gtmcrypt_errno)
	{
		ENABLE_INTERRUPTS(INTRPT_IN_CRYPT_RECONFIG, prev_intrpt_state);
		GTMCRYPT_REPORT_ERROR(gtmcrypt_errno, rts_error, seg->fname_len, seg->fname);
	}
	reorg_encrypt_restart_csa = NULL;
	ENABLE_INTERRUPTS(INTRPT_IN_CRYPT_RECONFIG, prev_intrpt_state);
}
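
/* Illustrative sketch (not part of the original source): one way a caller could hand off to
 * process_reorg_encrypt_restart(). The trigger condition and the reorg_encrypt_cycle field names
 * on the node-local and on csa->encr_ptr are assumptions for exposition, not the exact mainline macros.
 *
 *	if (csa->nl->reorg_encrypt_cycle != csa->encr_ptr->reorg_encrypt_cycle)
 *	{	// Key handles are stale. Record the region, get out of crit (opening key
 *		// handles is heavyweight), then reopen the handles and restart.
 *		reorg_encrypt_restart_csa = csa;
 *		rel_crit(gv_cur_region);
 *		process_reorg_encrypt_restart();
 *	}
 */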
void	gtmsource_onln_rlbk_clnup()
{
	gtmsource_local_ptr_t	gtmsource_local;
	boolean_t		was_crit;
	sgmnt_addrs		*repl_csa;

	gtmsource_local = jnlpool.gtmsource_local;
	repl_csa = &FILE_INFO(jnlpool.jnlpool_dummy_reg)->s_addrs;
	was_crit = repl_csa->now_crit;
	assert(!repl_csa->hold_onto_crit);
	assert(was_crit || (process_id == gtmsource_local->gtmsource_srv_latch.u.parts.latch_pid)
		|| (0 != have_crit(CRIT_HAVE_ANY_REG)));
	/* Reset source server context to indicate a fresh connection that is about to take place */
	assert(NULL != gtmsource_local);
	if (NULL != gtmsource_local)
	{
		/* If ROLLBACK has not taken the instance past the source server's read_jnl_seqno, then the source server should
		 * just continue from where it currently is and start sending the journal records from that point onwards. But, this
		 * is non-trivial. The reason is that, when the source server detected the online rollback, it could be in the
		 * READ_POOL state. But, since the instance has been rolled back, the journal pool cannot be relied upon in its
		 * entirety. To illustrate this -- consider that the journal pool contains the data from 1-100 and the source server
		 * is currently sending sequence number 30 and is reading from the pool. Assume an online rollback happens that
		 * takes the instance from sequence number 100 to sequence number 80 and leaves the journal pool write_addr and
		 * early_write_addr untouched. Now, let's say a GT.M process comes in after this and does a few more updates. All of
		 * these updates will be written in the journal pool right after the "old-rolled-back" sequence number 100. If the
		 * source server continues to read from the pool, it will send the valid data until sequence number 80. After that,
		 * it will start sending the "old-rolled-back" sequence numbers 81-100 which is not right. To avoid this, rollback
		 * should set the write_addr and early_write_addr by searching in the journal pool for sequence number 81. This is
		 * currently not done, but is something that we can think about when it comes to optimization. Until then, force
		 * rollback to reset the jnlpool's write_addr, write and early_write_addr to 0 and let the source server be forced into
		 * READ_FILE mode.
		 */
		gtmsource_local->read_state = READ_FILE;
		/* Set the state which gets bubbled up the call chain to gtmsource_process at which point we will close and
		 * re-establish the connection with the other end.
		 */
		gtmsource_local->gtmsource_state = gtmsource_state = GTMSOURCE_HANDLE_ONLN_RLBK;
		if (!was_crit)
			grab_lock(jnlpool.jnlpool_dummy_reg, TRUE, ASSERT_NO_ONLINE_ROLLBACK);
		/* We have to let the read files logic know that, until we have sent data up to the journal sequence number
		 * current as of this point, we cannot rely on the journal pool. Indicate this through the
		 * gtmsource_save_read_jnl_seqno global variable.
		 */
		gtmsource_save_read_jnl_seqno = jnlpool.jnlpool_ctl->jnl_seqno;
		gtmsource_local->read = jnlpool.jnlpool_ctl->write;
		gtmsource_local->read_addr = jnlpool.jnlpool_ctl->write_addr;
		if (!was_crit)
			rel_lock(jnlpool.jnlpool_dummy_reg);
	}
	return;
}
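
/* Illustrative sketch (hypothetical call site, not original source): how the state set above is
 * consumed. A read loop in the source server would notice the state change and unwind so
 * gtmsource_process() can close and re-establish the connection with the other end.
 *
 *	if (GTMSOURCE_HANDLE_ONLN_RLBK == gtmsource_state)
 *		return (SS_NORMAL);	// unwind to gtmsource_process(), which reconnects
 */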
void deferred_signal_handler(void)
{
	void (*signal_routine)();

	error_def(ERR_KILLBYSIG);
	error_def(ERR_KILLBYSIGUINFO);
	error_def(ERR_KILLBYSIGSINFO1);
	error_def(ERR_KILLBYSIGSINFO2);

	/* To avoid nested calls to this routine, we set forced_exit to FALSE at the very beginning */
	forced_exit = FALSE;

	if (exit_handler_active)
	{
		assert(FALSE);	/* at this point in time (June 2003) there is no way we know of to get here, hence the assert */
		return;	/* since anyway we are exiting currently, resume exit handling instead of reissuing another one */
	}
	/* For signals that get a delayed response so we can get out of crit, we also delay the messages.
	 * This routine will output those delayed messages from the appropriate structures to both the
	 * user and the system console.
	 */
	/* Note: we can't use a switch here because the ERR_xxx codes are not compile-time constants */
	if (ERR_KILLBYSIG == forced_exit_err)
	{
		send_msg(VARLSTCNT(6) ERR_KILLBYSIG, 4, GTMIMAGENAMETXT(image_type), process_id, signal_info.signal);
		gtm_putmsg(VARLSTCNT(6) ERR_KILLBYSIG, 4, GTMIMAGENAMETXT(image_type), process_id, signal_info.signal);
	} else if (ERR_KILLBYSIGUINFO == forced_exit_err)
	{
		send_msg(VARLSTCNT(8) ERR_KILLBYSIGUINFO, 6, GTMIMAGENAMETXT(image_type), process_id,
						signal_info.signal, signal_info.send_pid, signal_info.send_uid);
		gtm_putmsg(VARLSTCNT(8) ERR_KILLBYSIGUINFO, 6, GTMIMAGENAMETXT(image_type), process_id,
						signal_info.signal, signal_info.send_pid, signal_info.send_uid);
	} else if (ERR_KILLBYSIGSINFO1 == forced_exit_err)
	{
		send_msg(VARLSTCNT(8) ERR_KILLBYSIGSINFO1, 6, GTMIMAGENAMETXT(image_type),
			 process_id, signal_info.signal, signal_info.int_iadr, signal_info.bad_vadr);
		gtm_putmsg(VARLSTCNT(8) ERR_KILLBYSIGSINFO1, 6, GTMIMAGENAMETXT(image_type),
			   process_id, signal_info.signal, signal_info.int_iadr, signal_info.bad_vadr);
	} else if (ERR_KILLBYSIGSINFO2 == forced_exit_err)
	{
		send_msg(VARLSTCNT(7) ERR_KILLBYSIGSINFO2, 5, GTMIMAGENAMETXT(image_type),
			 process_id, signal_info.signal, signal_info.int_iadr);
		gtm_putmsg(VARLSTCNT(7) ERR_KILLBYSIGSINFO2, 5, GTMIMAGENAMETXT(image_type),
			   process_id, signal_info.signal, signal_info.int_iadr);
	} else
	{
		send_msg(VARLSTCNT(1) forced_exit_err);
		gtm_putmsg(VARLSTCNT(1) forced_exit_err);
	}
	/* As on VMS, a mupip stop does not drive the condition handlers unless we are in crit */
	if ((0 != have_crit(CRIT_HAVE_ANY_REG) || SIGTERM != exi_condition) && CHANDLER_EXISTS)
		DRIVECH(0);
	/* If a special routine was registered to be driven on a signal, drive it now */
	if (0 != exi_condition && call_on_signal)
	{
		signal_routine = call_on_signal;
		call_on_signal = NULL;		/* So we don't recursively call ourselves */
		(*signal_routine)();
	}
	/* If the condition handler didn't cause an exit, drive the defined exit handler */
	exit(-exi_condition);
}
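
/* Illustrative sketch (simplified, with assumed handler/mainline bodies): the deferral protocol this
 * routine completes. A signal handler that finds the process inside crit records the exit request and
 * returns; the mainline drives deferred_signal_handler() once it is safe to act on the request.
 *
 *	// In the signal handler, when an immediate exit is unsafe:
 *	forced_exit = TRUE;
 *	forced_exit_err = ERR_KILLBYSIG;	// message to be reported later
 *
 *	// In the mainline, e.g. right after releasing crit:
 *	if (forced_exit && (0 == have_crit(CRIT_HAVE_ANY_REG)))
 *		deferred_signal_handler();
 */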
void mutex_deadlock_check(mutex_struct_ptr_t criticalPtr)
{
	tp_region	*tr;
	sgmnt_addrs	*csa;
	int4		save_crit_count;

	if (in_mutex_deadlock_check)
		return;
	in_mutex_deadlock_check = TRUE;
	/* A zero value of "crit_count" implies asynchronous activities can occur (e.g. db flush timer, periodic epoch timers etc.).
	 * At this point, although we are here through grab_crit()/grab_lock() (which would have incremented "crit_count"), we are
	 * 	in a safe and consistent state as far as the mutex structures go so it is ok to set "crit_count" to 0 implying we
	 * 	are now in an interruptible state (of course, we need to restore "crit_count" to what it was before returning).
	 * The other alternative of not changing "crit_count" presents us with complex situations wherein recursion
	 * 	of grab_crit/rel_crit might occur (through direct or indirect calls from mutex_deadlock_check())
	 * 	causing crit_count to be > 1 and in turn causing the crit_count-reset-logic in grab_crit/rel_crit to
	 * 	do a "crit_count--" (instead of "crit_count = 0"). This suffers from the problem that in case of an error code path
	 * 	crit_count might not get decremented appropriately and hence become out-of-sync (i.e. a positive value instead
	 * 	of zero) and a non-zero value might cause indefinite deferrals of asynchronous events.
	 */
	assert(1 == crit_count);
	save_crit_count = crit_count;
	crit_count = 0;

	/* Need to determine who should and should not go through the deadlock checker.
	 *
	 * List of who needs to be considered
	 * ------------------------------------
	 * -> GT.M, Update process, MUPIP LOAD and GT.CM GNP/OMI server : since they go through t_end() to update the database.
	 * 	Note that all of the above (and only those) have the "is_replicator" flag set to TRUE.
	 * -> MUPIP REORG, since it does non-TP transactions and goes through t_end() (has "mu_reorg_process" flag set).
	 *
	 * List of who does not need to be considered (with reasons)
	 * -----------------------------------------------------------
	 * -> MUPIP RECOVER can hold crit on several regions (through TP or non-TP transactions).
	 * 	But it has standalone access and hence no possibility of a deadlock.
	 * -> MUPIP RESTORE too holds standalone access so does not need to be considered.
	 * -> Source Server, Receiver Server etc. can hold only one CRIT resource at any point of time.
	 * -> DSE, MUPIP BACKUP, MUPIP SET JOURNAL etc. can legitimately hold crit on several regions though in non-TP.
	 */
	if (is_replicator || mu_reorg_process)
	{
		if (0 == dollar_tlevel)
		{
			if ((NULL != jnlpool.jnlpool_dummy_reg) && jnlpool.jnlpool_dummy_reg->open)
			{
				++crit_deadlock_check_cycle;
				if (FILE_INFO(jnlpool.jnlpool_dummy_reg)->s_addrs.critical == criticalPtr)
				{	/* grab_lock going for crit on the jnlpool region. gv_cur_region points to the
					 * current region of interest, which better have replication enabled, and be now crit
					 */
					assert(cs_addrs == &FILE_INFO(gv_cur_region)->s_addrs);
					csa = &FILE_INFO(gv_cur_region)->s_addrs;
					if (FALSE == csa->now_crit || !REPL_ENABLED(csa->hdr))
						GTMASSERT;	/* should have crit on gv_cur_region before asking for jnlpool */
					csa->crit_check_cycle = crit_deadlock_check_cycle; /* allow for crit in gv_cur_region */
				}
			}
		} else
		{	/* Need to mark the regions allowed to have crit as follows:
			 * Place the current cycle into the csa's of regions allowed to have crit so have_crit() can easily test.
			 * Note that should the system be up long enough for the 2**32 cycle value to wrap and a region be
			 * unused for most of that time, such a region might not be entitled to crit but have an old
			 * csa->crit_check_cycle matching the current crit_deadlock_check_cycle - that case would not
			 * trigger have_crit() to release crit on that region; however, the next call to this routine
			 * increments crit_deadlock_check_cycle and so crit on that region gets released after two calls
			 * instead of (the usual) one.
			 */
			++crit_deadlock_check_cycle;
			for (tr = tp_reg_list;  NULL != tr;  tr = tr->fPtr)
			{
				if (!tr->reg->open)
					continue;
				csa = &FILE_INFO(tr->reg)->s_addrs;
				if (csa->now_crit)
					csa->crit_check_cycle = crit_deadlock_check_cycle;
				else
				{	/* Seen first non-crit region. Make sure either of the following is true.
					 *	 (i) this is the region we are currently grabbing crit on
					 *	(ii) we do not hold crit on any region in the tp_reg_list.
					 * If neither of the above, we have an out of design condition that can only
					 * 	warrant blowing the process up.
					 */
					if ((csa->critical != criticalPtr) && (tr != tp_reg_list))
						GTMASSERT;
					break;
				}
			}
		}
		/* Release crit in regions not legitimately part of this TP/non-TP transaction */
		have_crit(CRIT_RELEASE | CRIT_NOT_TRANS_REG);
	}
	crit_count = save_crit_count;
	in_mutex_deadlock_check = FALSE;
}
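
/* Illustrative sketch (an assumption about have_crit() internals, for exposition only): the cycle
 * marking above lets the CRIT_RELEASE | CRIT_NOT_TRANS_REG call separate legitimate crit holds from
 * stale ones, roughly as follows.
 *
 *	for (addr_ptr = get_next_gdr(NULL); NULL != addr_ptr; addr_ptr = get_next_gdr(addr_ptr))
 *		for (reg = addr_ptr->regions, r_top = reg + addr_ptr->n_regions; reg < r_top; reg++)
 *		{
 *			if (!reg->open)
 *				continue;
 *			csa = &FILE_INFO(reg)->s_addrs;
 *			if (csa->now_crit && (crit_deadlock_check_cycle != csa->crit_check_cycle))
 *				rel_crit(reg);	// held, but not marked as part of this transaction
 *		}
 */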
/* Note: we do not increment fast_lock_count as part of getting this latch (and decrement it when releasing it) because ROLLBACK
 * can hold onto this latch for a long while and can do updates in this duration, and we should NOT have a non-zero fast_lock_count
 * as many places like t_begin/dsk_read have asserts to this effect. It is okay NOT to increment fast_lock_count because ROLLBACK
 * anyway has logic to disable interrupts the moment it starts doing database updates.
 */
boolean_t	grab_gtmsource_srv_latch(sm_global_latch_ptr_t latch, uint4 max_timeout_in_secs, uint4 onln_rlbk_action)
{
	int			spins, maxspins, retries, max_retries;
	unix_db_info		*udi;
	sgmnt_addrs		*repl_csa;
	boolean_t		cycle_mismatch;

	assert(!have_crit(CRIT_HAVE_ANY_REG));
	udi = FILE_INFO(jnlpool.jnlpool_dummy_reg);
	repl_csa = &udi->s_addrs;
	maxspins = num_additional_processors ? MAX_LOCK_SPINS(LOCK_SPINS, num_additional_processors) : 1;
	max_retries = max_timeout_in_secs * 4 * 1000; /* outer loop spans max_timeout_in_secs seconds : 1 iteration in 4 sleeps 1 msec */
	for (retries = max_retries - 1; 0 < retries; retries--)
	{
		for (spins = maxspins; 0 < spins; spins--)
		{
			assert(latch->u.parts.latch_pid != process_id); /* We better not hold it if trying to get it */
			if (GET_SWAPLOCK(latch))
			{
				DEBUG_ONLY(locknl = repl_csa->nl); /* Use the journal pool to maintain lock history */
				LOCK_HIST("OBTN", latch, process_id, retries);
				DEBUG_ONLY(locknl = NULL);
				if (jnlpool.repl_inst_filehdr->file_corrupt && !jgbl.onlnrlbk)
				{
					/* Journal pool indicates an abnormally terminated online rollback. Cannot continue until
					 * the rollback command is re-run to bring the journal pool/file and instance file to a
					 * consistent state.
					 */
					/* No need to release the latch before rts_error (mupip_exit_handler will do it for us) */
					rts_error(VARLSTCNT(8) ERR_REPLREQROLLBACK, 2, LEN_AND_STR(udi->fn),
						ERR_TEXT, 2, LEN_AND_LIT("file_corrupt field in instance file header is set to"
										" TRUE"));
				}
				cycle_mismatch = (repl_csa->onln_rlbk_cycle != jnlpool.jnlpool_ctl->onln_rlbk_cycle);
				assert((ASSERT_NO_ONLINE_ROLLBACK != onln_rlbk_action) || !cycle_mismatch);
				if ((HANDLE_CONCUR_ONLINE_ROLLBACK == onln_rlbk_action) && cycle_mismatch)
				{
					assert(is_src_server);
					SYNC_ONLN_RLBK_CYCLES;
					gtmsource_onln_rlbk_clnup(); /* side-effect : sets gtmsource_state */
					rel_gtmsource_srv_latch(latch);
				}
				return TRUE;
			}
		}
		if (retries & 0x3)
		{	/* On all but every 4th pass, do a simple rel_quant */
			rel_quant();
		} else
		{
			/* On every 4th pass, we bide a while */
			wcs_sleep(LOCK_SLEEP);
			if (RETRY_CASLATCH_CUTOFF == (retries % LOCK_TRIES))
				performCASLatchCheck(latch, TRUE);
		}
	}
	DUMP_LOCKHIST();
	assert(FALSE);
	assert(jnlpool.gtmsource_local && jnlpool.gtmsource_local->gtmsource_pid);
	rts_error(VARLSTCNT(5) ERR_SRVLCKWT2LNG, 2, max_timeout_in_secs, jnlpool.gtmsource_local->gtmsource_pid);
	return FALSE; /* to keep the compiler happy */
}
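
/* Illustrative usage sketch (hypothetical call site and timeout value): a source server path that can
 * tolerate a concurrent online rollback grabs the latch with HANDLE_CONCUR_ONLINE_ROLLBACK and must
 * re-check gtmsource_state afterwards, because grab_gtmsource_srv_latch() may have run the rollback
 * cleanup (which also releases the latch) before returning TRUE.
 *
 *	grab_gtmsource_srv_latch(&gtmsource_local->gtmsource_srv_latch, 60, HANDLE_CONCUR_ONLINE_ROLLBACK);
 *	if (GTMSOURCE_HANDLE_ONLN_RLBK == gtmsource_state)
 *		return;	// rollback handled; the cleanup above already released the latch
 *	// latch held here: safe to update the protected gtmsource_local fields
 *	rel_gtmsource_srv_latch(&gtmsource_local->gtmsource_srv_latch);
 */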