Beispiel #1
0
int mu_rndwn_sem_all(void)
{
	int 			save_errno, exit_status = SS_NORMAL, semid;
	char			entry[MAX_ENTRY_LEN];
	FILE			*pf;
	char			fname[MAX_FN_LEN + 1], *fgets_res;
	boolean_t 		rem_sem;
	shm_parms		*parm_buff;

	if (NULL == (pf = POPEN(IPCS_SEM_CMD_STR ,"r")))
        {
		save_errno = errno;
		gtm_putmsg_csa(CSA_ARG(NULL) VARLSTCNT(8) ERR_SYSCALL, 5, RTS_ERROR_LITERAL("POPEN()"), CALLFROM, save_errno);
                return ERR_MUNOTALLSEC;
        }
	while (NULL != (FGETS(entry, SIZEOF(entry), pf, fgets_res)) && entry[0] != '\n')
	{
		if (-1 != (semid = parse_sem_id(entry)))
		{
			if (is_orphaned_gtm_semaphore(semid))
			{	/* semval == 0 and corresponding shared memory has been removed */
				if (-1 != semctl(semid, 0, IPC_RMID))
				{
					gtm_putmsg_csa(CSA_ARG(NULL) VARLSTCNT(3) ERR_SEMREMOVED, 1, semid);
					send_msg_csa(CSA_ARG(NULL) VARLSTCNT(3) ERR_SEMREMOVED, 1, semid);
				}
			}
		}
	}
	pclose(pf);
	return exit_status;
}
int continue_proc(pid_t pid)
{
	DCL_THREADGBL_ACCESS;

	SETUP_THREADGBL_ACCESS;
	DEBUG_ONLY(if (!TREF(gtm_usesecshr)))	/* Cause debug builds to talk to gtmsecshr more often */
	{
		if (WBTEST_ENABLED(WBTEST_HOLD_GTMSOURCE_SRV_LATCH))
		{
			/* Simulate the kill below, but ignore its return status so that we end up invoking gtmsecshr */
			kill(pid, SIGCONT);
			/* Wait until the target quits so that kill() call by gtmsecshr fails with ESRCH */
			while (is_proc_alive(pid, 0))
				LONG_SLEEP(1);
		}
		else if (0 == kill(pid, SIGCONT))
			return 0;
		else if (ESRCH == errno)
		{
			send_msg_csa(CSA_ARG(NULL) VARLSTCNT(5) ERR_NOSUCHPROC, 3, pid, RTS_ERROR_LITERAL("continue"));
			return ESRCH;
		} else
			assert(EINVAL != errno);
	}
	return send_mesg2gtmsecshr(CONTINUE_PROCESS, pid, NULL, 0);
}
Beispiel #3
0
int create_server(void)
{
	int		child_pid, done_pid, status = 0;
#	ifdef _BSD
	union	wait	chld_status;
#	define CSTAT	chld_status
#	else
#	define CSTAT	status
#	endif
	int		save_errno;

	FORK(child_pid);
	if (0 == child_pid)
	{
		process_id = getpid();
		/* Do exec using gtmsecshr_path, which was initialize in file check code - send_mesg2gtmsecshr */
		if (WBTEST_ENABLED(WBTEST_BADEXEC_SECSHR_PROCESS))
			STRCPY(gtmsecshr_path, "");
		status = EXECL(gtmsecshr_path, gtmsecshr_path, 0);
		if (-1 == status)
		{
			send_msg_csa(CSA_ARG(NULL) VARLSTCNT(9) ERR_GTMSECSHRSTART, 3, RTS_ERROR_TEXT("Client"), process_id,
				ERR_TEXT, 2, RTS_ERROR_STRING(secshrstart_error_code[UNABLETOEXECGTMSECSHR]));
			UNDERSCORE_EXIT(UNABLETOEXECGTMSECSHR);
		}
        } else
	{
		if (-1 == child_pid)
		{
			status = GNDCHLDFORKFLD;
			gtm_putmsg_csa(CSA_ARG(NULL) VARLSTCNT(10) ERR_GTMSECSHRSTART, 3, RTS_ERROR_TEXT("Client"), process_id,
				   ERR_TEXT,  2, RTS_ERROR_TEXT("Failed to fork off gtmsecshr"), errno);
			/* Sleep for a while and hope a subsequent fork will succeed */
			hiber_start(1000);
		}
		for (; !status ;)
		{
			/* To prevent a warning message that the compiler issues */
			done_pid = wait(&CSTAT);
			if (done_pid == child_pid)
			{
				status = WEXITSTATUS(CSTAT);
				break;
			} else if (-1 == done_pid)
			{
				if (ECHILD == errno) /* Assume normal exit status */
					break;
				else if (EINTR != errno)
				{
					status = GNDCHLDFORKFLD;
					gtm_putmsg_csa(CSA_ARG(NULL) VARLSTCNT(10) ERR_GTMSECSHRSTART, 3,
							RTS_ERROR_TEXT("Client"), process_id,
							ERR_TEXT, 2, RTS_ERROR_TEXT("Error spawning gtmsecshr"), errno);
				}
			}
		}
	}
	return status;
}
Beispiel #4
0
/* --------------------------------------------------------------------------------
	This function  renames a file, if exists. Otherwise do nothing.
  --------------------------------------------------------------------------------- */
int rename_file_if_exists(char *org_fn, int org_fn_len, char *rename_fn, int *rename_fn_len, uint4 *ustatus)
{
	mstr 		orgfile;
	int		status;
	jnl_tm_t	now;

	memcpy(rename_fn, org_fn, org_fn_len + 1); /* Ensure it to be NULL terminated */
	*rename_fn_len = org_fn_len;
	orgfile.addr = org_fn;
	orgfile.len = org_fn_len;
	if (FILE_NOT_FOUND == (status = gtm_file_stat(&orgfile, NULL, NULL, FALSE, ustatus)))
		return RENAME_NOT_REQD;
	else if (FILE_STAT_ERROR == status)
	{
		assert(SS_NORMAL != *ustatus);
		return RENAME_FAILED;
	}
	/* File is present in the system */
	assert(0 <  MAX_FN_LEN - org_fn_len - 1);
	JNL_SHORT_TIME(now);
	if (SS_NORMAL != (status = prepare_unique_name(org_fn, org_fn_len, "", "", rename_fn, rename_fn_len, now, ustatus)))
	{	/* "prepare_unique_name" would not have set "ustatus" to the error code. So set it here and return */
		assert(SS_NORMAL == *ustatus);
		*ustatus = status;
		assert(SS_NORMAL != *ustatus);
		return RENAME_FAILED;
	}
	assert(0 == rename_fn[*rename_fn_len]);
	if (SS_NORMAL != (status = gtm_rename(org_fn, org_fn_len, rename_fn, *rename_fn_len, ustatus)))
	{
		*ustatus = status;
		assert(SS_NORMAL != *ustatus);
		if (IS_GTM_IMAGE)
			send_msg_csa(CSA_ARG(NULL) VARLSTCNT(9) ERR_RENAMEFAIL, 4, org_fn_len, org_fn, *rename_fn_len, rename_fn,
				status, 0, *ustatus);
		else
			gtm_putmsg_csa(CSA_ARG(NULL) VARLSTCNT1(8) ERR_RENAMEFAIL, 4, org_fn_len, org_fn, *rename_fn_len, rename_fn,
				status, PUT_SYS_ERRNO(*ustatus));
		return RENAME_FAILED;
	}
	if (IS_GTM_IMAGE)
		send_msg_csa(CSA_ARG(NULL) VARLSTCNT (6) ERR_FILERENAME, 4, org_fn_len, org_fn, *rename_fn_len, rename_fn);
	else
		gtm_putmsg_csa(CSA_ARG(NULL) VARLSTCNT (6) ERR_FILERENAME, 4, org_fn_len, org_fn, *rename_fn_len, rename_fn);
	return RENAME_SUCCESS;
}
Beispiel #5
0
void set_enospc_flags(gd_addr *addr_ptr, char enospc_enable_list[], boolean_t ok_to_interrupt)
{
	gd_region	*r_local, *r_top;
	int		i;
	sgmnt_addrs	*csa;
	const char	*syslog_msg;
	DCL_THREADGBL_ACCESS;

	SETUP_THREADGBL_ACCESS;

	for (r_local = addr_ptr->regions, r_top = r_local + addr_ptr->n_regions, i = 0;
	     r_local < r_top; r_local++, i++)
	{
		if (!r_local->open || r_local->was_open)
			continue;
		if ((dba_bg != r_local->dyn.addr->acc_meth) && (dba_mm != r_local->dyn.addr->acc_meth))
			continue;
		csa = &FILE_INFO(r_local)->s_addrs;
		if (ANTICIPATORY_FREEZE_ENABLED(csa))
		{
			switch(enospc_enable_list[i])
			{
			case NONE:
				syslog_msg = "Turning off fake ENOSPC for both database and journal file.";
				csa->nl->fake_db_enospc = FALSE;
				csa->nl->fake_jnl_enospc = FALSE;
				break;
			case DB_ON:
				syslog_msg = "Turning on fake ENOSPC only for database file.";
				csa->nl->fake_db_enospc = TRUE;
				csa->nl->fake_jnl_enospc = FALSE;
				break;
			case JNL_ON:
				syslog_msg = "Turning on fake ENOSPC only for journal file.";
				csa->nl->fake_db_enospc = FALSE;
				csa->nl->fake_jnl_enospc = TRUE;
				break;
			case DB_AND_JNL_ON:
				syslog_msg = "Turning on fake ENOSPC for both database and journal file.";
				csa->nl->fake_db_enospc = TRUE;
				csa->nl->fake_jnl_enospc = TRUE;
				break;
			default:
				assert(FALSE);
			}
			if (ok_to_interrupt)
				send_msg_csa(CSA_ARG(NULL) VARLSTCNT(8) ERR_TEXT, 2, DB_LEN_STR(r_local), ERR_TEXT, 2,
					     LEN_AND_STR(syslog_msg));
		}
	}
}
Beispiel #6
0
void set_enospc_if_needed()
{
	gd_addr		*addr_ptr;
	char		enospc_enable_list[MAX_REGIONS];
	boolean_t	ok_to_interrupt, is_time_to_act;
	DCL_THREADGBL_ACCESS;

	SETUP_THREADGBL_ACCESS;

	if (TREF(gtm_test_fake_enospc) && is_jnlpool_creator && ANTICIPATORY_FREEZE_AVAILABLE)
	{
		ok_to_interrupt = (INTRPT_OK_TO_INTERRUPT == intrpt_ok_state) && (0 == gtmMallocDepth);
		is_time_to_act = (next_heartbeat_counter == heartbeat_counter);
		if (syslog_deferred && ok_to_interrupt)
		{
			send_msg_csa(CSA_ARG(NULL) VARLSTCNT(3) ERR_FAKENOSPCLEARED, 1, (heartbeat_counter - syslog_deferred));
			syslog_deferred = 0;
		}
		if (!is_time_to_act || syslog_deferred || (!ok_to_interrupt && !IS_REPL_INST_FROZEN))
		{
			/* We have to skip this because we have just fallen into deferred zone or we are currently in it */
			if (is_time_to_act)
				next_heartbeat_counter++;  /* Try again in the next heartbeat */
			return;
		}
		assert(0 == syslog_deferred);
		srand(time(NULL));
		addr_ptr = get_next_gdr(NULL);
		if (NULL == addr_ptr) /* Ensure that there is a global directory to operate on. */
			return;
		assert(NULL == get_next_gdr(addr_ptr));
		/* Randomly simulate ENOSPC or free space. NO more than 50 regions are allowed to avoid unnecessary
		 * malloc/frees in debug-only code
		 */
		assert(MAX_REGIONS >= addr_ptr->n_regions);
		if (!IS_REPL_INST_FROZEN)
		{	/* We are in an UNFROZEN state, and about to be FROZEN due to ENOSPC */
			choose_random_reg_list(enospc_enable_list, addr_ptr->n_regions);
			next_heartbeat_counter = heartbeat_counter + ENOSPC_FROZEN_DURATION;
		} else
		{	/* We are in a FROZEN state, and about to be UNFROZEN due to free space */
			memset(enospc_enable_list, 0, MAX_REGIONS);
			next_heartbeat_counter = heartbeat_counter + ENOSPC_UNFROZEN_DURATION;
			if (!ok_to_interrupt)
				syslog_deferred = heartbeat_counter;
		}
		set_enospc_flags(addr_ptr, enospc_enable_list, ok_to_interrupt);
	}
}
Beispiel #7
0
void ch_overrun(void)
{
	PRN_ERROR;
	gtm_putmsg_csa(CSA_ARG(NULL) VARLSTCNT(1) ERR_NOCHLEFT);
	send_msg_csa(CSA_ARG(NULL) VARLSTCNT(1) ERR_NOCHLEFT);

	/* If exit handler is already active, we will just core and die */
	if (exit_handler_active)
		gtm_dump_core();
	else
	{	/* Otherwise, we generate a core and exit to drive the condition handler */
		gtm_fork_n_core();
		MUMPS_EXIT;
	}
}
/* This function sets the "ftok_counter_halted" field to TRUE in the instance file header and flushes it to disk.
 * Caller could be attached to the journal pool or not. If not, update file header directly. If yes, go through locks.
 */
void	repl_inst_ftok_counter_halted(unix_db_info *udi, char *file_type, repl_inst_hdr *repl_instance)
{
	assert(udi->grabbed_ftok_sem);	/* this ensures we have a lock before we modify the instance file header */
	if (NULL != jnlpool.repl_inst_filehdr)
	{
		assert(!jnlpool.repl_inst_filehdr->ftok_counter_halted);
		jnlpool.repl_inst_filehdr->ftok_counter_halted = TRUE;
		grab_lock(jnlpool.jnlpool_dummy_reg, TRUE, ASSERT_NO_ONLINE_ROLLBACK);
		repl_inst_flush_filehdr();
		rel_lock(jnlpool.jnlpool_dummy_reg);
	} else
	{
		assert(!repl_instance->ftok_counter_halted);
		repl_instance->ftok_counter_halted = TRUE;
		repl_inst_write(udi->fn, (off_t)0, (sm_uc_ptr_t)repl_instance, SIZEOF(repl_inst_hdr));
	}
	/* Ignore any errors while flushing the "halted" value to the file header. The only consequence is other processes
	 * will incur a performance overhead trying to unnecessarily bump the semaphore counter when it is already ERANGE.
	 */
	send_msg_csa(CSA_ARG(NULL) VARLSTCNT(7) ERR_NOMORESEMCNT, 5, LEN_AND_LIT("ftok"), file_type, LEN_AND_STR(udi->fn));
}
void deferred_signal_handler(void)
{
	void (*signal_routine)();
	DCL_THREADGBL_ACCESS;

	SETUP_THREADGBL_ACCESS;
	/* To avoid nested calls to this routine, progress the forced_exit state. */
	SET_FORCED_EXIT_STATE_ALREADY_EXITING;

	if (exit_handler_active)
	{
		assert(FALSE);	/* at this point in time (June 2003) there is no way we know of to get here, hence the assert */
		return;	/* since anyway we are exiting currently, resume exit handling instead of reissuing another one */
	}
	/* For signals that get a delayed response so we can get out of crit, we also delay the messages.
	 * This routine will output those delayed messages from the appropriate structures to both the
	 * user and the system console.
	 */
	/* note can't use switch here because ERR_xxx are not defined as constants */
	if (ERR_KILLBYSIG == forced_exit_err)
	{
		send_msg_csa(CSA_ARG(NULL) VARLSTCNT(6) ERR_KILLBYSIG, 4, GTMIMAGENAMETXT(image_type),
			process_id, signal_info.signal);
		gtm_putmsg_csa(CSA_ARG(NULL) VARLSTCNT(6) ERR_KILLBYSIG, 4, GTMIMAGENAMETXT(image_type),
			process_id, signal_info.signal);
	} else if (ERR_KILLBYSIGUINFO == forced_exit_err)
	{
		send_msg_csa(CSA_ARG(NULL) VARLSTCNT(8) ERR_KILLBYSIGUINFO, 6, GTMIMAGENAMETXT(image_type), process_id,
						signal_info.signal, signal_info.send_pid, signal_info.send_uid);
		gtm_putmsg_csa(CSA_ARG(NULL) VARLSTCNT(8) ERR_KILLBYSIGUINFO, 6, GTMIMAGENAMETXT(image_type), process_id,
						signal_info.signal, signal_info.send_pid, signal_info.send_uid);
	} else if (ERR_KILLBYSIGSINFO1 == forced_exit_err)
	{
		send_msg_csa(CSA_ARG(NULL) VARLSTCNT(8) ERR_KILLBYSIGSINFO1, 6, GTMIMAGENAMETXT(image_type),
			 process_id, signal_info.signal, signal_info.int_iadr, signal_info.bad_vadr);
		gtm_putmsg_csa(CSA_ARG(NULL) VARLSTCNT(8) ERR_KILLBYSIGSINFO1, 6, GTMIMAGENAMETXT(image_type),
			   process_id, signal_info.signal, signal_info.int_iadr, signal_info.bad_vadr);
	} else if (ERR_KILLBYSIGSINFO2 == forced_exit_err)
	{
		send_msg_csa(CSA_ARG(NULL) VARLSTCNT(7) ERR_KILLBYSIGSINFO2, 5, GTMIMAGENAMETXT(image_type),
			 process_id, signal_info.signal, signal_info.int_iadr);
		gtm_putmsg_csa(CSA_ARG(NULL) VARLSTCNT(7) ERR_KILLBYSIGSINFO2, 5, GTMIMAGENAMETXT(image_type),
			   process_id, signal_info.signal, signal_info.int_iadr);
	} else if (ERR_KILLBYSIGSINFO3 == forced_exit_err)
	{
		send_msg_csa(CSA_ARG(NULL) VARLSTCNT(7) ERR_KILLBYSIGSINFO3, 5, GTMIMAGENAMETXT(image_type),
			 process_id, signal_info.signal, signal_info.bad_vadr);
		gtm_putmsg_csa(CSA_ARG(NULL) VARLSTCNT(7) ERR_KILLBYSIGSINFO3, 5, GTMIMAGENAMETXT(image_type),
			   process_id, signal_info.signal, signal_info.bad_vadr);
	} else if (ERR_FORCEDHALT != forced_exit_err || !gtm_quiet_halt)
	{	/* No HALT messages if quiet halt is requested */
		send_msg_csa(CSA_ARG(NULL) VARLSTCNT(1) forced_exit_err);
		gtm_putmsg_csa(CSA_ARG(NULL) VARLSTCNT(1) forced_exit_err);
	}
	assert(OK_TO_INTERRUPT);
	/* Signal intent to exit BEFORE driving condition handlers. This avoids checks that will otherwise fail (for example
	 * if mdb_condition_handler/preemptive_db_clnup gets called below, that could invoke the RESET_GV_TARGET macro which in turn
	 * would assert that gv_target->gd_csa is equal to cs_addrs. This could not be true in case we were in mainline code
	 * that was interrupted by the flush timer for a different region which in turn was interrupted by an external signal
	 * that would drive us to exit. Setting the "process_exiting" variable causes those csa checks to pass.
	 */
	SET_PROCESS_EXITING_TRUE;
#	ifdef DEBUG
	if (gtm_white_box_test_case_enabled && (WBTEST_DEFERRED_TIMERS == gtm_white_box_test_case_number)
		&& (2 == gtm_white_box_test_case_count))
	{
		DEFER_INTERRUPTS(INTRPT_NO_TIMER_EVENTS);
		DBGFPF((stderr, "DEFERRED_SIGNAL_HANDLER: will sleep for 20 seconds\n"));
		LONG_SLEEP(20);
		DBGFPF((stderr, "DEFERRED_SIGNAL_HANDLER: done sleeping\n"));
		ENABLE_INTERRUPTS(INTRPT_NO_TIMER_EVENTS);
	}
#	endif
	/* If any special routines are registered to be driven on a signal, drive them now */
	if ((0 != exi_condition) && (NULL != call_on_signal))
	{
		signal_routine = call_on_signal;
		call_on_signal = NULL;		/* So we don't recursively call ourselves */
		(*signal_routine)();
	}
	/* Note, we do not drive create_fatal_error zshow_dmp() in this routine since any deferrable signals are
	 * by definition not fatal.
	 */
	exit(-exi_condition);
}
Beispiel #10
0
void	mu_reorg_upgrd_dwngrd(void)
{
	blk_hdr			new_hdr;
	blk_segment		*bs1, *bs_ptr;
	block_id		*blkid_ptr, curblk, curbmp, start_blk, stop_blk, start_bmp, last_bmp;
	block_id		startblk_input, stopblk_input;
	boolean_t		upgrade, downgrade, safejnl, nosafejnl, region, first_reorg_in_this_db_fmt, reorg_entiredb;
	boolean_t		startblk_specified, stopblk_specified, set_fully_upgraded, db_got_to_v5_once, mark_blk_free;
	cache_rec_ptr_t		cr;
	char			*bml_lcl_buff = NULL, *command, *reorg_command;
	sm_uc_ptr_t		bptr = NULL;
	cw_set_element		*cse;
	enum cdb_sc		cdb_status;
	enum db_ver		new_db_format, ondsk_blkver;
	gd_region		*reg;
	int			cycle;
	int4			blk_seg_cnt, blk_size;	/* needed for BLK_INIT,BLK_SEG and BLK_FINI macros */
	int4			blocks_left, expected_blks2upgrd, actual_blks2upgrd, total_blks, free_blks;
	int4			status, status1, mapsize, lcnt, bml_status;
	reorg_stats_t		reorg_stats;
	sgmnt_addrs		*csa;
	sgmnt_data_ptr_t	csd;
	sm_uc_ptr_t		blkBase, bml_sm_buff;	/* shared memory pointer to the bitmap global buffer */
	srch_hist		alt_hist;
	srch_blk_status		*blkhist, bmlhist;
	tp_region		*rptr;
	trans_num		curr_tn;
	unsigned char    	save_cw_set_depth;
	uint4			lcl_update_trans;

	region    = (CLI_PRESENT == cli_present("REGION"));
	upgrade   = (CLI_PRESENT == cli_present("UPGRADE"));
	downgrade = (CLI_PRESENT == cli_present("DOWNGRADE"));
	assert(upgrade && !downgrade || !upgrade && downgrade);
	command = upgrade ? "UPGRADE" : "DOWNGRADE";
	reorg_command = upgrade ? "MUPIP REORG UPGRADE" : "MUPIP REORG DOWNGRADE";
	reorg_entiredb = TRUE;	/* unless STARTBLK or STOPBLK is specified we are going to {up,down}grade the entire database */
	startblk_specified = FALSE;
	assert(SIZEOF(block_id) == SIZEOF(uint4));
	if ((CLI_PRESENT == cli_present("STARTBLK")) && (cli_get_hex("STARTBLK", (uint4 *)&startblk_input)))
	{
		reorg_entiredb = FALSE;
		startblk_specified = TRUE;
	}
	stopblk_specified = FALSE;
	assert(SIZEOF(block_id) == SIZEOF(uint4));
	if ((CLI_PRESENT == cli_present("STOPBLK")) && (cli_get_hex("STOPBLK", (uint4 *)&stopblk_input)))
	{
		reorg_entiredb = FALSE;
		stopblk_specified = TRUE;
	}
	mu_reorg_upgrd_dwngrd_in_prog = TRUE;
	mu_reorg_nosafejnl = (CLI_NEGATED == cli_present("SAFEJNL")) ? TRUE : FALSE;

	assert(region);
	status = SS_NORMAL;
	error_mupip = FALSE;
	gvinit();	/* initialize gd_header (needed by the later call to mu_getlst) */
	mu_getlst("REG_NAME", SIZEOF(tp_region));	/* get the parameter corresponding to REGION qualifier */
	if (error_mupip)
	{
		util_out_print("!/MUPIP REORG !AD cannot proceed with above errors!/", TRUE, LEN_AND_STR(command));
		mupip_exit(ERR_MUNOACTION);
	}
	assert(DBKEYSIZE(MAX_KEY_SZ) == gv_keysize);	/* no need to invoke GVKEYSIZE_INIT_IF_NEEDED macro */
	gv_target = targ_alloc(gv_keysize, NULL, NULL);	/* t_begin needs this initialized */
	gv_target_list = NULL;
	memset(&alt_hist, 0, SIZEOF(alt_hist));	/* null-initialize history */
	blkhist = &alt_hist.h[0];
	for (rptr = grlist;  NULL != rptr;  rptr = rptr->fPtr)
	{
		if (mu_ctrly_occurred || mu_ctrlc_occurred)
			break;
		reg = rptr->reg;
		util_out_print("!/Region !AD : MUPIP REORG !AD started", TRUE, REG_LEN_STR(reg), LEN_AND_STR(command));
		if (reg_cmcheck(reg))
		{
			util_out_print("Region !AD : MUPIP REORG !AD cannot run across network",
				TRUE, REG_LEN_STR(reg), LEN_AND_STR(command));
			status = ERR_MUNOFINISH;
			continue;
		}
		mu_reorg_process = TRUE;	/* gvcst_init will use this value to use gtm_poollimit settings. */
		gvcst_init(reg);
		mu_reorg_process = FALSE;
		assert(update_array != NULL);
		/* access method stored in global directory and database file header might be different in which case
		 * the database setting prevails. therefore, the access method check can be done only after opening
		 * the database (i.e. after the gvcst_init)
		 */
		if (dba_bg != REG_ACC_METH(reg))
		{
			util_out_print("Region !AD : MUPIP REORG !AD cannot continue as access method is not BG",
				TRUE, REG_LEN_STR(reg), LEN_AND_STR(command));
			status = ERR_MUNOFINISH;
			continue;
		}
		/* The mu_getlst call above uses insert_region to create the grlist, which ensures that duplicate regions mapping to
		 * the same db file correspond to only one grlist entry.
		 */
		assert(FALSE == reg->was_open);
		TP_CHANGE_REG(reg);	/* sets gv_cur_region, cs_addrs, cs_data */
		csa = cs_addrs;
		csd = cs_data;
		blk_size = csd->blk_size;	/* "blk_size" is used by the BLK_FINI macro */
		if (reg->read_only)
		{
			gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(4) ERR_DBRDONLY, 2, DB_LEN_STR(reg));
			status = ERR_MUNOFINISH;
			continue;
		}
		assert(GDSVCURR == GDSV6); /* so we trip this assert in case GDSVCURR changes without a change to this module */
		new_db_format = (upgrade ? GDSV6 : GDSV4);
		grab_crit(reg);
		curr_tn = csd->trans_hist.curr_tn;
		/* set the desired db format in the file header to the appropriate version, increment transaction number */
		status1 = desired_db_format_set(reg, new_db_format, reorg_command);
		assert(csa->now_crit);	/* desired_db_format_set() should not have released crit */
		first_reorg_in_this_db_fmt = TRUE;	/* with the current desired_db_format, this is the first reorg */
		if (SS_NORMAL != status1)
		{	/* "desired_db_format_set" would have printed appropriate error messages */
			if (ERR_MUNOACTION != status1)
			{	/* real error occurred while setting the db format. skip to next region */
				status = ERR_MUNOFINISH;
				rel_crit(reg);
				continue;
			}
			util_out_print("Region !AD : Desired DB Format remains at !AD after !AD", TRUE, REG_LEN_STR(reg),
				LEN_AND_STR(gtm_dbversion_table[new_db_format]), LEN_AND_STR(reorg_command));
			if (csd->reorg_db_fmt_start_tn == csd->desired_db_format_tn)
				first_reorg_in_this_db_fmt = FALSE;
		} else
			util_out_print("Region !AD : Desired DB Format set to !AD by !AD", TRUE, REG_LEN_STR(reg),
				LEN_AND_STR(gtm_dbversion_table[new_db_format]), LEN_AND_STR(reorg_command));
		assert(dba_bg == csd->acc_meth);
		/* Check blks_to_upgrd counter to see if upgrade/downgrade is complete */
		total_blks = csd->trans_hist.total_blks;
		free_blks = csd->trans_hist.free_blocks;
		actual_blks2upgrd = csd->blks_to_upgrd;
		/* If MUPIP REORG UPGRADE and there is no block to upgrade in the database as indicated by BOTH
		 * 	"csd->blks_to_upgrd" and "csd->fully_upgraded", then we can skip processing.
		 * If MUPIP REORG UPGRADE and all non-free blocks need to be upgraded then again we can skip processing.
		 */
		if ((upgrade && (0 == actual_blks2upgrd) && csd->fully_upgraded)
			|| (!upgrade && ((total_blks - free_blks) == actual_blks2upgrd)))
		{
			util_out_print("Region !AD : Blocks to Upgrade counter indicates no action needed for MUPIP REORG !AD",
				       TRUE, REG_LEN_STR(reg), LEN_AND_STR(command));
			util_out_print("Region !AD : Total Blocks = [0x!XL] : Free Blocks = [0x!XL] : "
				       "Blocks to upgrade = [0x!XL]",
				       TRUE, REG_LEN_STR(reg), total_blks, free_blks, actual_blks2upgrd);
			util_out_print("Region !AD : MUPIP REORG !AD finished!/", TRUE, REG_LEN_STR(reg), LEN_AND_STR(command));
			rel_crit(reg);
			continue;
		}
		stop_blk = total_blks;
		if (stopblk_specified && stopblk_input <= stop_blk)
			stop_blk = stopblk_input;
		if (first_reorg_in_this_db_fmt)
		{	/* Note down reorg start tn (in case we are interrupted, future reorg will know to resume) */
			csd->reorg_db_fmt_start_tn = csd->desired_db_format_tn;
			csd->reorg_upgrd_dwngrd_restart_block = 0;
			start_blk = (startblk_specified ? startblk_input : 0);
		} else
		{	/* Either a concurrent MUPIP REORG of the same type ({up,down}grade) is currently running
			 * or a previously running REORG of the same type was interrupted (Ctrl-Ced).
			 * In either case resume processing from whatever restart block number is stored in fileheader
			 * the only exception is if "STARTBLK" was specified in the input in which use that unconditionally.
			 */
			start_blk = (startblk_specified ? startblk_input : csd->reorg_upgrd_dwngrd_restart_block);
		}
		if (start_blk > stop_blk)
			start_blk = stop_blk;
		mu_reorg_upgrd_dwngrd_start_tn = csd->reorg_db_fmt_start_tn;
		/* Before releasing crit, flush the file-header and dirty buffers in cache to disk. This is because we are now
		 * going to read each GDS block directly from disk to determine if it needs to be upgraded/downgraded or not.
		 */
		if (!wcs_flu(WCSFLU_FLUSH_HDR))	/* wcs_flu assumes gv_cur_region is set (which it is in this routine) */
		{
			rel_crit(reg);
			gtm_putmsg_csa(CSA_ARG(csa)
				VARLSTCNT(6) ERR_BUFFLUFAILED, 4, LEN_AND_LIT("MUPIP REORG UPGRADE/DOWNGRADE"), DB_LEN_STR(reg));
			status = ERR_MUNOFINISH;
			continue;
		}
		rel_crit(reg);
		/* Loop through entire database one GDS block at a time and upgrade/downgrade each of them */
		status1 = SS_NORMAL;
		start_bmp = ROUND_DOWN2(start_blk, BLKS_PER_LMAP);
		last_bmp  = ROUND_DOWN2(stop_blk - 1, BLKS_PER_LMAP);
		curblk = start_blk;	/* curblk is the block to be upgraded/downgraded */
		util_out_print("Region !AD : Started processing from block number [0x!XL]", TRUE, REG_LEN_STR(reg), curblk);
		if (NULL != bptr)
		{	/* malloc/free "bptr" for each region as GDS block-size can be different */
			free(bptr);
			bptr = NULL;
		}
		memset(&reorg_stats, 0, SIZEOF(reorg_stats));	/* initialize statistics for this region */
		for (curbmp = start_bmp; curbmp <= last_bmp; curbmp += BLKS_PER_LMAP)
		{
			if (mu_ctrly_occurred || mu_ctrlc_occurred)
			{
				status1 = ERR_MUNOFINISH;
				break;
			}
			/* --------------------------------------------------------------
			 *             Read in current bitmap block
			 * --------------------------------------------------------------
			 */
			assert(!csa->now_crit);
			bml_sm_buff = t_qread(curbmp, (sm_int_ptr_t)&cycle, &cr); /* bring block into the cache outside of crit */
			reorg_stats.blks_read_from_disk_bmp++;
			grab_crit_encr_cycle_sync(reg); /* needed so t_qread does not return NULL below */
			if (mu_reorg_upgrd_dwngrd_start_tn != csd->desired_db_format_tn)
			{	/* csd->desired_db_format changed since reorg started. discontinue the reorg */
				/* see later comment on "csd->reorg_upgrd_dwngrd_restart_block" for why the assignment
				 * of this field should be done only if a db format change did not occur.
				 */
				rel_crit(reg);
				status1 = ERR_MUNOFINISH;
				/* This "start_tn" check is redone after the for-loop and an error message is printed there */
				break;
			} else if (reorg_entiredb)
			{	/* Change "csd->reorg_upgrd_dwngrd_restart_block" only if STARTBLK or STOPBLK was NOT specified */
				assert(csd->reorg_upgrd_dwngrd_restart_block <= MAX(start_blk, curbmp));
				csd->reorg_upgrd_dwngrd_restart_block = curbmp;	/* previous blocks have been upgraded/downgraded */
			}
			/* Check blks_to_upgrd counter to see if upgrade/downgrade is complete.
			 * Repeat check done a few steps earlier outside of this for loop.
			 */
			total_blks = csd->trans_hist.total_blks;
			free_blks = csd->trans_hist.free_blocks;
			actual_blks2upgrd = csd->blks_to_upgrd;
			if ((upgrade && (0 == actual_blks2upgrd) && csd->fully_upgraded)
				|| (!upgrade && ((total_blks - free_blks) == actual_blks2upgrd)))
			{
				rel_crit(reg);
				break;
			}
			bml_sm_buff = t_qread(curbmp, (sm_int_ptr_t)&cycle, &cr); /* now that in crit, note down stable buffer */
			if (NULL == bml_sm_buff)
				rts_error_csa(CSA_ARG(csa) VARLSTCNT(1) ERR_DSEBLKRDFAIL);
			ondsk_blkver = cr->ondsk_blkver;	/* note down db fmt on disk for bitmap block */
			/* Take a copy of the shared memory bitmap buffer into process-private memory before releasing crit.
			 * We are interested in those blocks that are currently marked as USED in the bitmap.
			 * It is possible that once we release crit, concurrent updates change the bitmap state of those blocks.
			 * In that case, those updates will take care of doing the upgrade/downgrade of those blocks in the
			 * format currently set in csd->desired_db_format i.e. accomplishing MUPIP REORG UPGRADE/DOWNGRADE's job.
			 * If the desired_db_format changes concurrently, we will stop doing REORG UPGRADE/DOWNGRADE processing.
			 */
			if (NULL == bml_lcl_buff)
				bml_lcl_buff = malloc(BM_SIZE(BLKS_PER_LMAP));
			memcpy(bml_lcl_buff, (blk_hdr_ptr_t)bml_sm_buff, BM_SIZE(BLKS_PER_LMAP));
			if (FALSE == cert_blk(reg, curbmp, (blk_hdr_ptr_t)bml_lcl_buff, 0, FALSE))
			{	/* certify the block while holding crit as cert_blk uses fields from file-header (shared memory) */
				assert(FALSE);	/* in pro, skip ugprading/downgarding all blks in this unreliable local bitmap */
				rel_crit(reg);
				util_out_print("Region !AD : Bitmap Block [0x!XL] has integrity errors. Skipping this bitmap.",
					TRUE, REG_LEN_STR(reg), curbmp);
				status1 = ERR_MUNOFINISH;
				continue;
			}
			rel_crit(reg);
			/* ------------------------------------------------------------------------
			 *         Upgrade/Downgrade all BUSY blocks in the current bitmap
			 * ------------------------------------------------------------------------
			 */
			curblk = (curbmp == start_bmp) ? start_blk : curbmp;
			mapsize = (curbmp == last_bmp) ? (stop_blk - curbmp) : BLKS_PER_LMAP;
			assert(0 != mapsize);
			assert(mapsize <= BLKS_PER_LMAP);
			db_got_to_v5_once = csd->db_got_to_v5_once;
			for (lcnt = curblk - curbmp; lcnt < mapsize; lcnt++, curblk++)
			{
				if (mu_ctrly_occurred || mu_ctrlc_occurred)
				{
					status1 = ERR_MUNOFINISH;
					goto stop_reorg_on_this_reg;	/* goto needed because of nested FOR Loop */
				}
				GET_BM_STATUS(bml_lcl_buff, lcnt, bml_status);
				assert(BLK_MAPINVALID != bml_status); /* cert_blk ran clean so we dont expect invalid entries */
				if (BLK_FREE == bml_status)
				{
					reorg_stats.blks_skipped_free++;
					continue;
				}
				/* MUPIP REORG UPGRADE/DOWNGRADE will convert USED & RECYCLED blocks */
				if (db_got_to_v5_once || (BLK_RECYCLED != bml_status))
				{	/* Do NOT read recycled V4 block from disk unless it is guaranteed NOT to be too full */
					if (lcnt)
					{	/* non-bitmap block */
						/* read in block from disk into private buffer. dont pollute the cache yet */
						if (NULL == bptr)
							bptr = (sm_uc_ptr_t)malloc(blk_size);
						status1 = dsk_read(curblk, bptr, &ondsk_blkver, FALSE);
						/* dsk_read on curblk could return an error (DYNUPGRDFAIL) if curblk needs to be
						 * upgraded and if its block size was too big to allow the extra block-header space
						 * requirements for a dynamic upgrade. a MUPIP REORG DOWNGRADE should not error out
						 * in that case as the block is already in the downgraded format.
						 */
						if (SS_NORMAL != status1)
						{
							if (!upgrade && (ERR_DYNUPGRDFAIL == status1))
							{
								assert(GDSV4 == new_db_format);
								ondsk_blkver = new_db_format;
							} else
							{
								gtm_putmsg_csa(CSA_ARG(csa)
									VARLSTCNT(5) ERR_DBFILERR, 2, DB_LEN_STR(reg), status1);
								util_out_print("Region !AD : Error occurred while reading block "
									"[0x!XL]", TRUE, REG_LEN_STR(reg), curblk);
								status1 = ERR_MUNOFINISH;
								goto stop_reorg_on_this_reg;/* goto needed due to nested FOR Loop */
							}
						}
						reorg_stats.blks_read_from_disk_nonbmp++;
					} /* else bitmap block has been read in crit earlier and ondsk_blkver appropriately set */
					if (new_db_format == ondsk_blkver)
					{
						assert((SS_NORMAL == status1) || (!upgrade && (ERR_DYNUPGRDFAIL == status1)));
						status1 = SS_NORMAL;	/* treat DYNUPGRDFAIL as no error in case of downgrade */
						reorg_stats.blks_skipped_newfmtindisk++;
						continue;	/* current disk version is identical to what is desired */
					}
					assert(SS_NORMAL == status1);
				}
				/* Begin non-TP transaction to upgrade/downgrade the block.
				 * The way we do that is by updating the block using a null update array.
				 * Any update to a block will trigger an automatic upgrade/downgrade of the block based on
				 * 	the current fileheader desired_db_format setting and we use that here.
				 */
				t_begin(ERR_MUREORGFAIL, UPDTRNS_DB_UPDATED_MASK);
				for (; ;)
				{
					CHECK_AND_RESET_UPDATE_ARRAY;	/* reset update_array_ptr to update_array */
					curr_tn = csd->trans_hist.curr_tn;
					db_got_to_v5_once = csd->db_got_to_v5_once;
					if (db_got_to_v5_once || (BLK_RECYCLED != bml_status))
					{
						blkhist->cse = NULL;	/* start afresh (do not use value from previous retry) */
						blkBase = t_qread(curblk, (sm_int_ptr_t)&blkhist->cycle, &blkhist->cr);
						if (NULL == blkBase)
						{
							t_retry((enum cdb_sc)rdfail_detail);
							continue;
						}
						blkhist->blk_num = curblk;
						blkhist->buffaddr = blkBase;
						ondsk_blkver = blkhist->cr->ondsk_blkver;
						new_hdr = *(blk_hdr_ptr_t)blkBase;
						mu_reorg_upgrd_dwngrd_blktn = new_hdr.tn;
						mark_blk_free = FALSE;
						inctn_opcode = upgrade ? inctn_blkupgrd : inctn_blkdwngrd;
					} else
					{
						mark_blk_free = TRUE;
						inctn_opcode = inctn_blkmarkfree;
					}
					inctn_detail.blknum_struct.blknum = curblk;
					/* t_end assumes that the history it is passed does not contain a bitmap block.
					 * for bitmap block, the history validation information is passed through cse instead.
					 * therefore we need to handle bitmap and non-bitmap cases separately.
					 */
					if (!lcnt)
					{	/* Means a bitmap block.
						 * At this point we can do a "new_db_format != ondsk_blkver" check to determine
						 * if the block got converted since we did the dsk_read (see the non-bitmap case
						 * for a similar check done there), but in that case we will have a transaction
						 * which has read 1 bitmap block and is updating no block. "t_end" currently cannot
						 * handle this case as it expects any bitmap block that needs validation to also
						 * have a corresponding cse which will hold its history. Hence we avoid doing the
						 * new_db_format check. The only disadvantage of this is that we will end up
						 * modifying the bitmap block as part of this transaction (in an attempt to convert
						 * its ondsk_blkver) even though it is already in the right format. Since this
						 * overhead is going to be one per bitmap block and since the block is in the cache
						 * at this point, we should not lose much.
						 */
						assert(!mark_blk_free);
						BLK_ADDR(blkid_ptr, SIZEOF(block_id), block_id);
						*blkid_ptr = 0;
						t_write_map(blkhist, (unsigned char *)blkid_ptr, curr_tn, 0);
						assert(&alt_hist.h[0] == blkhist);
						alt_hist.h[0].blk_num = 0; /* create empty history for bitmap block */
						assert(update_trans);
					} else
					{	/* non-bitmap block. fill in history for validation in t_end */
						assert(curblk);	/* we should never come here for block 0 (bitmap) */
						if (!mark_blk_free)
						{
							assert(blkhist->blk_num == curblk);
							assert(blkhist->buffaddr == blkBase);
							blkhist->tn      = curr_tn;
							alt_hist.h[1].blk_num = 0;
						}
						/* Also need to pass the bitmap as history to detect if any concurrent M-kill
						 * is freeing up the same USED block that we are trying to convert OR if any
						 * concurrent M-set is reusing the same RECYCLED block that we are trying to
						 * convert. Because of t_end currently not being able to validate a bitmap
						 * without that simultaneously having a cse, we need to create a cse for the
						 * bitmap that is used only for bitmap history validation, but should not be
						 * used to update the contents of the bitmap block in bg_update.
						 */
						bmlhist.buffaddr = t_qread(curbmp, (sm_int_ptr_t)&bmlhist.cycle, &bmlhist.cr);
						if (NULL == bmlhist.buffaddr)
						{
							t_retry((enum cdb_sc)rdfail_detail);
							continue;
						}
						bmlhist.blk_num = curbmp;
						bmlhist.tn = curr_tn;
						GET_BM_STATUS(bmlhist.buffaddr, lcnt, bml_status);
						if (BLK_MAPINVALID == bml_status)
						{
							t_retry(cdb_sc_lostbmlcr);
							continue;
						}
						if (!mark_blk_free)
						{
							if ((new_db_format != ondsk_blkver) && (BLK_FREE != bml_status))
							{	/* block still needs to be converted. create cse */
								BLK_INIT(bs_ptr, bs1);
								BLK_SEG(bs_ptr, blkBase + SIZEOF(new_hdr),
									new_hdr.bsiz - SIZEOF(new_hdr));
								BLK_FINI(bs_ptr, bs1);
								t_write(blkhist, (unsigned char *)bs1, 0, 0,
									((blk_hdr_ptr_t)blkBase)->levl, FALSE,
									FALSE, GDS_WRITE_PLAIN);
								/* The directory tree status for now is only used to determine
								 * whether writing the block to snapshot file (see t_end_sysops.c).
 								 * For reorg upgrade/downgrade process, the block is updated in a
								 * sequential way without changing the gv_target. In this case, we
								 * assume the block is in directory tree so as to have it written to
								 * the snapshot file.
			 					 */
								BIT_SET_DIR_TREE(cw_set[cw_set_depth-1].blk_prior_state);
								/* reset update_trans in case previous retry had set it to 0 */
								update_trans = UPDTRNS_DB_UPDATED_MASK;
								if (BLK_RECYCLED == bml_status)
								{	/* If block that we are upgarding is RECYCLED, indicate to
									 * bg_update that blks_to_upgrd counter should NOT be
									 * touched in this case by setting "mode" to a special value
									 */
									assert(cw_set[cw_set_depth-1].mode == gds_t_write);
									cw_set[cw_set_depth-1].mode = gds_t_write_recycled;
									/* we SET block as NOT RECYCLED, otherwise, the mm_update()
									 * or bg_update_phase2 may skip writing it to snapshot file
									 * when its level is 0
									 */
									BIT_CLEAR_RECYCLED(cw_set[cw_set_depth-1].blk_prior_state);
								}
							} else
							{	/* Block got converted by another process since we did the dsk_read.
								 * 	or this block became marked free in the bitmap.
								 * No need to update this block. just call t_end for validation of
								 * 	both the non-bitmap block as well as the bitmap block.
								 * Note down that this transaction is no longer updating any blocks.
								 */
								update_trans = 0;
							}
							/* Need to put bit maps on the end of the cw set for concurrency checking.
							 * We want to simulate t_write_map, except we want to update "cw_map_depth"
							 * instead of "cw_set_depth". Hence the save and restore logic below.
							 * This part of the code is similar to the one in mu_swap_blk.c
							 */
							save_cw_set_depth = cw_set_depth;
							assert(!cw_map_depth);
							t_write_map(&bmlhist, NULL, curr_tn, 0); /* will increment cw_set_depth */
							cw_map_depth = cw_set_depth; /* set cw_map_depth to latest cw_set_depth */
							cw_set_depth = save_cw_set_depth;/* restore cw_set_depth */
							/* t_write_map simulation end */
						} else
						{
							if (BLK_RECYCLED != bml_status)
							{	/* Block was RECYCLED at beginning but no longer so. Retry */
								t_retry(cdb_sc_bmlmod);
								continue;
							}
							/* Mark recycled block as FREE in bitmap */
							assert(lcnt == (curblk - curbmp));
							assert(update_array_ptr == update_array);
							*((block_id *)update_array_ptr) = lcnt;
							update_array_ptr += SIZEOF(block_id);
							/* the following assumes SIZEOF(block_id) == SIZEOF(int) */
							assert(SIZEOF(block_id) == SIZEOF(int));
							*(int *)update_array_ptr = 0;
							t_write_map(&bmlhist, (unsigned char *)update_array, curr_tn, 0);
							update_trans = UPDTRNS_DB_UPDATED_MASK;
						}
					}
					assert(SIZEOF(lcl_update_trans) == SIZEOF(update_trans));
					lcl_update_trans = update_trans;	/* take a copy before t_end modifies it */
					if ((trans_num)0 != t_end(&alt_hist, NULL, TN_NOT_SPECIFIED))
					{	/* In case this is MM and t_end() remapped an extended database, reset csd */
						assert(csd == cs_data);
						if (!lcl_update_trans)
						{
							assert(lcnt);
							assert(!mark_blk_free);
							assert((new_db_format == ondsk_blkver) || (BLK_BUSY != bml_status));
							if (BLK_BUSY != bml_status)
								reorg_stats.blks_skipped_free++;
							else
								reorg_stats.blks_skipped_newfmtincache++;
						} else if (!lcnt)
							reorg_stats.blks_converted_bmp++;
						else
							reorg_stats.blks_converted_nonbmp++;
						break;
					}
					assert(csd == cs_data);
				}
			}
		}
	stop_reorg_on_this_reg:
		/* even though ctrl-c occurred, update file-header fields to store reorg's progress before exiting */
		grab_crit(reg);
		blocks_left = 0;
		assert(csd->trans_hist.total_blks >= csd->blks_to_upgrd);
		actual_blks2upgrd = csd->blks_to_upgrd;
		total_blks = csd->trans_hist.total_blks;
		free_blks = csd->trans_hist.free_blocks;
		/* Care should be taken not to set "csd->reorg_upgrd_dwngrd_restart_block" in case of a concurrent db fmt
		 * change. This is because let us say we are doing REORG UPGRADE. A concurrent REORG DOWNGRADE would
		 * have reset "csd->reorg_upgrd_dwngrd_restart_block" field to 0 and if that reorg is interrupted by a
		 * Ctrl-C (before this reorg came here) it would have updated "csd->reorg_upgrd_dwngrd_restart_block" to
		 * a non-zero value indicating how many blocks from 0 have been downgraded. We should not reset this
		 * field to "curblk" as it will be mis-interpreted as the number of blocks that have been DOWNgraded.
		 */
		set_fully_upgraded = FALSE;
		if (mu_reorg_upgrd_dwngrd_start_tn != csd->desired_db_format_tn)
		{	/* csd->desired_db_format changed since reorg started. discontinue the reorg */
			util_out_print("Region !AD : Desired DB Format changed during REORG. Stopping REORG.",
				TRUE, REG_LEN_STR(reg));
			status1 = ERR_MUNOFINISH;
		} else if (reorg_entiredb)
		{	/* Change "csd->reorg_upgrd_dwngrd_restart_block" only if STARTBLK or STOPBLK was NOT specified */
			assert(csd->reorg_upgrd_dwngrd_restart_block <= curblk);
			csd->reorg_upgrd_dwngrd_restart_block = curblk;	/* blocks lesser than this have been upgraded/downgraded */
			expected_blks2upgrd = upgrade ? 0 : (total_blks - free_blks);
			blocks_left = upgrade ? actual_blks2upgrd : (expected_blks2upgrd - actual_blks2upgrd);
			/* If this reorg command went through all blocks in the database, then it should have
			 * 	correctly concluded at this point whether the reorg is complete or not.
			 * If this reorg command started from where a previous incomplete reorg left
			 *	(i.e. first_reorg_in_this_db_fmt is FALSE), it cannot determine if the initial
			 *	GDS blocks that it skipped are completely {up,down}graded or not.
			 */
			assert((0 == blocks_left) || (SS_NORMAL != status1) || !first_reorg_in_this_db_fmt);
			/* If this is a MUPIP REORG UPGRADE that did go through every block in the database (indicated by
			 * "reorg_entiredb" && "first_reorg_in_this_db_fmt") and the current count of "blks_to_upgrd" is
			 * 0 in the file-header and the desired_db_format did not change since the start of the REORG,
			 * we can be sure that the entire database has been upgraded. Set "csd->fully_upgraded" to TRUE.
			 */
			if ((SS_NORMAL == status1) && first_reorg_in_this_db_fmt && upgrade && (0 == actual_blks2upgrd))
			{
				csd->fully_upgraded = TRUE;
				csd->db_got_to_v5_once = TRUE;
				set_fully_upgraded = TRUE;
			}
			/* flush all changes noted down in the file-header */
			if (!wcs_flu(WCSFLU_FLUSH_HDR))	/* wcs_flu assumes gv_cur_region is set (which it is in this routine) */
			{
				gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(6) ERR_BUFFLUFAILED, 4,
					LEN_AND_LIT("MUPIP REORG UPGRADE/DOWNGRADE"), DB_LEN_STR(reg));
				status = ERR_MUNOFINISH;
				rel_crit(reg);
				continue;
			}
		}
		curr_tn = csd->trans_hist.curr_tn;
		rel_crit(reg);
		util_out_print("Region !AD : Stopped processing at block number [0x!XL]", TRUE, REG_LEN_STR(reg), curblk);
		/* Print statistics */
		util_out_print("Region !AD : Statistics : Blocks Read From Disk (Bitmap)     : 0x!XL",
			TRUE, REG_LEN_STR(reg), reorg_stats.blks_read_from_disk_bmp);
		util_out_print("Region !AD : Statistics : Blocks Skipped (Free)              : 0x!XL",
			TRUE, REG_LEN_STR(reg), reorg_stats.blks_skipped_free);
		util_out_print("Region !AD : Statistics : Blocks Read From Disk (Non-Bitmap) : 0x!XL",
			TRUE, REG_LEN_STR(reg), reorg_stats.blks_read_from_disk_nonbmp);
		util_out_print("Region !AD : Statistics : Blocks Skipped (new fmt in disk)   : 0x!XL",
			TRUE, REG_LEN_STR(reg), reorg_stats.blks_skipped_newfmtindisk);
		util_out_print("Region !AD : Statistics : Blocks Skipped (new fmt in cache)  : 0x!XL",
			TRUE, REG_LEN_STR(reg), reorg_stats.blks_skipped_newfmtincache);
		util_out_print("Region !AD : Statistics : Blocks Converted (Bitmap)          : 0x!XL",
			TRUE, REG_LEN_STR(reg), reorg_stats.blks_converted_bmp);
		util_out_print("Region !AD : Statistics : Blocks Converted (Non-Bitmap)      : 0x!XL",
			TRUE, REG_LEN_STR(reg), reorg_stats.blks_converted_nonbmp);
		if (reorg_entiredb && (SS_NORMAL == status1) && (0 != blocks_left))
		{	/* file-header counter does not match what reorg on the entire database expected to see */
			gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(4) ERR_DBBTUWRNG, 2, expected_blks2upgrd, actual_blks2upgrd);
			util_out_print("Region !AD : Run MUPIP INTEG (without FAST qualifier) to fix the counter",
				TRUE, REG_LEN_STR(reg));
			status1 = ERR_MUNOFINISH;
		} else
			util_out_print("Region !AD : Total Blocks = [0x!XL] : Free Blocks = [0x!XL] : "
				       "Blocks to upgrade = [0x!XL]",
				       TRUE, REG_LEN_STR(reg), total_blks, free_blks, actual_blks2upgrd);
		/* Issue success or failure message for this region */
		if (SS_NORMAL == status1)
		{	/* issue success only if REORG did not encounter any error in its processing */
			if (set_fully_upgraded)
				util_out_print("Region !AD : Database is now FULLY UPGRADED", TRUE, REG_LEN_STR(reg));
			util_out_print("Region !AD : MUPIP REORG !AD finished!/", TRUE, REG_LEN_STR(reg), LEN_AND_STR(command));
			send_msg_csa(CSA_ARG(csa) VARLSTCNT(7) ERR_MUREUPDWNGRDEND, 5, REG_LEN_STR(reg),
										process_id, process_id, &curr_tn);
		} else
		{
			assert(ERR_MUNOFINISH == status1);
			assert((SS_NORMAL == status) || (ERR_MUNOFINISH == status));
			util_out_print("Region !AD : MUPIP REORG !AD incomplete. See above messages.!/",
					TRUE, REG_LEN_STR(reg), LEN_AND_STR(command));
			status = status1;
		}
	}
	if (NULL != bptr)
		free(bptr);
	if (NULL != bml_lcl_buff)
		free(bml_lcl_buff);
	if (mu_ctrly_occurred || mu_ctrlc_occurred)
	{
		gtm_putmsg_csa(CSA_ARG(NULL) VARLSTCNT(1) ERR_REORGCTRLY);
		status = ERR_MUNOFINISH;
	}
	mupip_exit(status);
}
Beispiel #11
0
void trigger_delete_all(char *trigger_rec, uint4 len, uint4 *trig_stats)
{
	int			count;
	sgmnt_addrs		*csa;
	mval			curr_gbl_name;
	mval			after_hash_cycle;
	int			cycle;
	mval			*mv_count_ptr;
	mval			*mv_cycle_ptr;
	gd_region		*reg, *reg_top;
	int4			result;
	gd_region		*lgtrig_reg;
	mval			trigger_cycle;
	mval			trigger_count;
	boolean_t		this_db_updated;
	uint4			triggers_deleted;
	mval			trigjrec;
	boolean_t		jnl_format_done;
	boolean_t		delete_required;
	DCL_THREADGBL_ACCESS;

	SETUP_THREADGBL_ACCESS;
	assert(0 < dollar_tlevel);
	jnl_format_done = FALSE;
	lgtrig_reg = NULL;
	trigjrec.mvtype = MV_STR;
	trigjrec.str.len = len;
	trigjrec.str.addr = trigger_rec;
	triggers_deleted = 0;
	for (reg = gd_header->regions, reg_top = reg + gd_header->n_regions; reg < reg_top; reg++)
	{
		GVTR_SWITCH_REG_AND_HASHT_BIND_NAME(reg);
		csa = cs_addrs;
		if (NULL == csa)	/* not BG or MM access method */
			continue;
		/* gv_target now points to ^#t in region "reg" */
		/* To write the LGTRIG logical jnl record, choose some region that has journaling enabled */
		if (!reg->read_only && !jnl_format_done && JNL_WRITE_LOGICAL_RECS(csa))
			lgtrig_reg = reg;
		if (!gv_target->root)
			continue;
		/* kill ^#t("#TNAME") */
		BUILD_HASHT_SUB_CURRKEY(LITERAL_HASHTNAME, STRLEN(LITERAL_HASHTNAME));
		if (0 != gvcst_data())
		{	/* Issue error if we dont have permissions to touch ^#t global */
			if (reg->read_only)
				rts_error_csa(CSA_ARG(csa) VARLSTCNT(4) ERR_TRIGMODREGNOTRW, 2, REG_LEN_STR(reg));
			gvcst_kill(TRUE);
		}
		/* Kill all descendents of ^#t(trigvn, ...) where trigvn is any global with a trigger,
		 * but skip the ^#t("#...",...) entries. Setup ^#t("$") as the key for op_gvorder
		 */
		BUILD_HASHT_SUB_CURRKEY(LITERAL_MAXHASHVAL, STRLEN(LITERAL_MAXHASHVAL));
		TREF(gv_last_subsc_null) = FALSE; /* We know its not null, but prior state is unreliable */
		this_db_updated = FALSE;
		while (TRUE)
		{
			op_gvorder(&curr_gbl_name);
			/* quit:$length(curr_gbl_name)=0 */
			if (0 == curr_gbl_name.str.len)
				break;
			count = cycle = 0;
			/* $get(^#t(curr_gbl_name,#COUNT)) */
			BUILD_HASHT_SUB_SUB_CURRKEY(curr_gbl_name.str.addr, curr_gbl_name.str.len,
							LITERAL_HASHCOUNT, STRLEN(LITERAL_HASHCOUNT));
			if (TRUE == (delete_required = gvcst_get(&trigger_count))) /* inline assignment */
			{
				mv_count_ptr = &trigger_count;
				count = MV_FORCE_UINT(mv_count_ptr);
			}
			/* $get(^#t(curr_gbl_name,#CYCLE)) */
			BUILD_HASHT_SUB_SUB_CURRKEY(curr_gbl_name.str.addr, curr_gbl_name.str.len,
				LITERAL_HASHCYCLE, STRLEN(LITERAL_HASHCYCLE));
			if (!gvcst_get(&trigger_cycle))
			{	/* Found hasht record, there must be #CYCLE */
				if (CDB_STAGNATE > t_tries)
					t_retry(cdb_sc_triggermod);
				gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(12) MAKE_MSG_WARNING(ERR_TRIGDEFBAD), 6,
						curr_gbl_name.str.len, curr_gbl_name.str.addr,
						curr_gbl_name.str.len, curr_gbl_name.str.addr, LEN_AND_LIT("\"#CYCLE\""),
						ERR_TEXT, 2, RTS_ERROR_TEXT("#CYCLE field is missing"));
				send_msg_csa(CSA_ARG(csa) VARLSTCNT(12) MAKE_MSG_WARNING(ERR_TRIGDEFBAD), 6,
						curr_gbl_name.str.len, curr_gbl_name.str.addr,
						curr_gbl_name.str.len, curr_gbl_name.str.addr, LEN_AND_LIT("\"#CYCLE\""),
						ERR_TEXT, 2, RTS_ERROR_TEXT("#CYCLE field is missing"));
				assert(WBTEST_HELPOUT_TRIGDEFBAD == gtm_white_box_test_case_number);
			} else {
				mv_cycle_ptr = &trigger_cycle;
				cycle = MV_FORCE_UINT(mv_cycle_ptr);
				cycle++;
				MV_FORCE_MVAL(&trigger_cycle, cycle);
			}
			if (!delete_required)
			{	/* $order(^#t(curr_gbl_name,#LABEL)); should be the NULL string if #COUNT not found
				 * Use #LABEL vs #CYCLE because MUPIP TRIGGER -UPGRADE unconditionally inserts it */
				BUILD_HASHT_SUB_SUB_CURRKEY(curr_gbl_name.str.addr, curr_gbl_name.str.len,
					LITERAL_HASHLABEL, STRLEN(LITERAL_HASHLABEL));
				op_gvorder(&after_hash_cycle);
				/* quit:$length(after_hash_cycle)=0 */
				if (0 != after_hash_cycle.str.len)
				{	/* Found hasht record after #LABEL, but #COUNT is not defined; Force removal */
					if (CDB_STAGNATE > t_tries)
						t_retry(cdb_sc_triggermod);
					gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(12) MAKE_MSG_WARNING(ERR_TRIGDEFBAD), 6,
							curr_gbl_name.str.len, curr_gbl_name.str.addr,
							curr_gbl_name.str.len, curr_gbl_name.str.addr, LEN_AND_LIT("\"#COUNT\""),
							ERR_TEXT, 2, RTS_ERROR_TEXT("#COUNT field is missing. Skipped in results"));
					send_msg_csa(CSA_ARG(csa) VARLSTCNT(12) MAKE_MSG_WARNING(ERR_TRIGDEFBAD), 6,
							curr_gbl_name.str.len, curr_gbl_name.str.addr,
							curr_gbl_name.str.len, curr_gbl_name.str.addr, LEN_AND_LIT("\"#COUNT\""),
							ERR_TEXT, 2, RTS_ERROR_TEXT("#COUNT field is missing. Skipped in results"));
					assert(WBTEST_HELPOUT_TRIGDEFBAD == gtm_white_box_test_case_number);
					delete_required = TRUE;
				}
			}
			if (delete_required)
			{
				/* Now that we know there is something to kill, check if we have permissions to touch ^#t global */
				if (reg->read_only)
					rts_error_csa(CSA_ARG(csa) VARLSTCNT(4) ERR_TRIGMODREGNOTRW, 2, REG_LEN_STR(reg));
				if (!jnl_format_done && JNL_WRITE_LOGICAL_RECS(csa))
				{
					jnl_format(JNL_LGTRIG, NULL, &trigjrec, 0);
					jnl_format_done = TRUE;
				}
				/* kill ^#t(curr_gbl_name); kills ^#t(curr_gbl_name,"#TRHASH") as well */
				BUILD_HASHT_SUB_CURRKEY(curr_gbl_name.str.addr, curr_gbl_name.str.len);
				gvcst_kill(TRUE);
				if (0 < cycle)
				{
					/* set ^#t(curr_gbl_name,#CYCLE)=trigger_cycle */
					SET_TRIGGER_GLOBAL_SUB_SUB_MVAL(curr_gbl_name.str.addr, curr_gbl_name.str.len,
						LITERAL_HASHCYCLE, STRLEN(LITERAL_HASHCYCLE), trigger_cycle, result);
					assert(PUT_SUCCESS == result);
				}
				this_db_updated = TRUE;
				triggers_deleted += count;
			} /* else there is nothing under the hasht record, leave #CYCLE alone */
			/* get ready for op_gvorder() call for next trigger under ^#t */
			BUILD_HASHT_SUB_CURRKEY(curr_gbl_name.str.addr, curr_gbl_name.str.len);
		}
		if (this_db_updated)
		{
			csa->incr_db_trigger_cycle = TRUE;
			if (dollar_ztrigger_invoked)
			{	/* increment db_dztrigger_cycle so that next gvcst_put/gvcst_kill in this transaction,
				 * on this region, will re-read. See trigger_update.c for a comment on why it is okay
				 * for db_dztrigger_cycle to be incremented more than once in the same transaction
				 */
				csa->db_dztrigger_cycle++;
			}
		}
	}
	if (!jnl_format_done && (NULL != lgtrig_reg))
	{	/* There was no journaled region that had a ^#t update, but found at least one journaled region
		 * so write a LGTRIG logical jnl record there.
		 */
		GVTR_SWITCH_REG_AND_HASHT_BIND_NAME(lgtrig_reg);
		csa = cs_addrs;
		JNLPOOL_INIT_IF_NEEDED(csa, csa->hdr, csa->nl);	/* see previous usage for comment on why it is needed */
		assert(dollar_tlevel);
		T_BEGIN_SETORKILL_NONTP_OR_TP(ERR_TRIGLOADFAIL);	/* needed to set update_trans TRUE on this region
									 * even if NO db updates happen to ^#t nodes. */
		jnl_format(JNL_LGTRIG, NULL, &trigjrec, 0);
		jnl_format_done = TRUE;
	}
	if (triggers_deleted)
	{
		util_out_print_gtmio("All existing triggers (count = !UL) deleted", FLUSH, triggers_deleted);
		trig_stats[STATS_DELETED] += triggers_deleted;
		trig_stats[STATS_NOERROR_TRIGFILE]++;
	} else
	{
		util_out_print_gtmio("No matching triggers found for deletion", FLUSH);
		trig_stats[STATS_UNCHANGED_TRIGFILE]++;
	}
}
Beispiel #12
0
int gtmsource()
{
	int			status, log_init_status, waitpid_res, save_errno;
	char			print_msg[1024], tmpmsg[1024];
	gd_region		*reg, *region_top;
	sgmnt_addrs		*csa, *repl_csa;
	boolean_t		all_files_open, isalive;
	pid_t			pid, ppid, procgp;
	seq_num			read_jnl_seqno, jnl_seqno;
	unix_db_info		*udi;
	gtmsource_local_ptr_t	gtmsource_local;
	boolean_t		this_side_std_null_coll;
	int			null_fd, rc;

	memset((uchar_ptr_t)&jnlpool, 0, SIZEOF(jnlpool_addrs));
	call_on_signal = gtmsource_sigstop;
	ESTABLISH_RET(gtmsource_ch, SS_NORMAL);
	if (-1 == gtmsource_get_opt())
		rts_error_csa(CSA_ARG(NULL) VARLSTCNT(1) ERR_MUPCLIERR);
	if (gtmsource_options.shut_down)
	{	/* Wait till shutdown time nears even before going to "jnlpool_init". This is because the latter will return
		 * with the ftok semaphore and access semaphore held and we do not want to be holding those locks (while
		 * waiting for the user specified timeout to expire) as that will affect new GTM processes and/or other
		 * MUPIP REPLIC commands that need these locks for their function.
		 */
		if (0 < gtmsource_options.shutdown_time)
		{
			repl_log(stdout, TRUE, TRUE, "Waiting for %d seconds before signalling shutdown\n",
												gtmsource_options.shutdown_time);
			LONG_SLEEP(gtmsource_options.shutdown_time);
		} else
			repl_log(stdout, TRUE, TRUE, "Signalling shutdown immediate\n");
	} else if (gtmsource_options.start)
	{
		repl_log(stdout, TRUE, TRUE, "Initiating START of source server for secondary instance [%s]\n",
			gtmsource_options.secondary_instname);
	}
	if (gtmsource_options.activate && (ROOTPRIMARY_SPECIFIED == gtmsource_options.rootprimary))
	{	/* MUPIP REPLIC -SOURCE -ACTIVATE -UPDOK has been specified. We need to open the gld and db regions now
		 * in case this is a secondary -> primary transition. This is so we can later switch journal files in all
		 * journaled regions when the transition actually happens inside "gtmsource_rootprimary_init". But since
		 * we have not yet done a "jnlpool_init", we dont know if updates are disabled in it or not. Although we
		 * need to do the gld/db open only if updates are currently disabled in the jnlpool, we do this always
		 * because once we do a jnlpool_init, we will come back with the ftok on the jnlpool held and that has
		 * issues with later db open since we will try to hold the db ftok as part of db open and the ftok logic
		 * currently has assumptions that a process holds only one ftok at any point in time.
		 */
		assert(NULL == gd_header);
		gvinit();
		all_files_open = region_init(FALSE);
		if (!all_files_open)
		{
			gtm_putmsg_csa(CSA_ARG(NULL) VARLSTCNT(1) ERR_NOTALLDBOPN);
			gtmsource_exit(ABNORMAL_SHUTDOWN);
		}
	}
	jnlpool_init(GTMSOURCE, gtmsource_options.start, &is_jnlpool_creator);
	/* is_jnlpool_creator == TRUE ==> this process created the journal pool
	 * is_jnlpool_creator == FALSE ==> journal pool already existed and this process simply attached to it.
	 */
	if (gtmsource_options.shut_down)
		gtmsource_exit(gtmsource_shutdown(FALSE, NORMAL_SHUTDOWN) - NORMAL_SHUTDOWN);
	else if (gtmsource_options.activate)
		gtmsource_exit(gtmsource_mode_change(GTMSOURCE_MODE_ACTIVE_REQUESTED) - NORMAL_SHUTDOWN);
	else if (gtmsource_options.deactivate)
		gtmsource_exit(gtmsource_mode_change(GTMSOURCE_MODE_PASSIVE_REQUESTED) - NORMAL_SHUTDOWN);
	else if (gtmsource_options.checkhealth)
		gtmsource_exit(gtmsource_checkhealth() - NORMAL_SHUTDOWN);
	else if (gtmsource_options.changelog)
		 gtmsource_exit(gtmsource_changelog() - NORMAL_SHUTDOWN);
	else if (gtmsource_options.showbacklog)
		gtmsource_exit(gtmsource_showbacklog() - NORMAL_SHUTDOWN);
	else if (gtmsource_options.stopsourcefilter)
		gtmsource_exit(gtmsource_stopfilter() - NORMAL_SHUTDOWN);
	else if (gtmsource_options.jnlpool)
		gtmsource_exit(gtmsource_jnlpool() - NORMAL_SHUTDOWN);
	else if (gtmsource_options.losttncomplete)
		gtmsource_exit(gtmsource_losttncomplete() - NORMAL_SHUTDOWN);
	else if (gtmsource_options.needrestart)
		gtmsource_exit(gtmsource_needrestart() - NORMAL_SHUTDOWN);
	else if (gtmsource_options.showfreeze)
		gtmsource_exit(gtmsource_showfreeze() - NORMAL_SHUTDOWN);
	else if (gtmsource_options.setfreeze)
		gtmsource_exit(gtmsource_setfreeze() - NORMAL_SHUTDOWN);
	else if (!gtmsource_options.start)
	{
		assert(CLI_PRESENT == cli_present("STATSLOG"));
		gtmsource_exit(gtmsource_statslog() - NORMAL_SHUTDOWN);
	}
	assert(gtmsource_options.start);
#	ifndef REPL_DEBUG_NOBACKGROUND
	/* Set "child_server_running" to FALSE before forking off child. Wait for it to be set to TRUE by the child. */
	gtmsource_local = jnlpool.gtmsource_local;
	gtmsource_local->child_server_running = FALSE;
	FORK(pid);
	if (0 > pid)
	{
		save_errno = errno;
		rts_error_csa(CSA_ARG(NULL) VARLSTCNT(7) ERR_JNLPOOLSETUP, 0,
			ERR_TEXT, 2, RTS_ERROR_LITERAL("Could not fork source server"), save_errno);
	} else if (0 < pid)
	{	/* Parent. Wait until child sets "child_server_running" to FALSE. That is an indication that the child
		 * source server has completed its initialization phase and is all set so the parent command can return.
		 */
		while (isalive = is_proc_alive(pid, 0))	/* note : intended assignment */
		{
			if (gtmsource_local->child_server_running)
				break;
			/* To take care of reassignment of PIDs, the while condition should be && with the condition
			 * (PPID of pid == process_id)
			 */
			SHORT_SLEEP(GTMSOURCE_WAIT_FOR_SRV_START);
			WAITPID(pid, &status, WNOHANG, waitpid_res); /* Release defunct child if dead */
		}
		if (isalive)
		{	/* Child process is alive and started with no issues */
			if (0 != (save_errno = rel_sem(SOURCE, JNL_POOL_ACCESS_SEM)))
				rts_error_csa(CSA_ARG(NULL) VARLSTCNT(7) ERR_JNLPOOLSETUP, 0,
					ERR_TEXT, 2, RTS_ERROR_LITERAL("Error in rel_sem"), save_errno);
			ftok_sem_release(jnlpool.jnlpool_dummy_reg, TRUE, TRUE);
		} else
		{	/* Child source server process errored out at startup and is no longer alive.
			 * If we were the one who created the journal pool, let us clean it up.
			 */
			repl_log(stdout, TRUE, TRUE, "Source server startup failed. See source server log file\n");
			if (is_jnlpool_creator)
				status = gtmsource_shutdown(TRUE, NORMAL_SHUTDOWN);
		}
		/* If the parent is killed (or crashes) between the fork and exit, checkhealth may not detect that startup
		 * is in progress - parent forks and dies, the system will release sem 0 and 1, checkhealth might test the
		 * value of sem 1 before the child grabs sem 1.
		 */
		gtmsource_exit(isalive ? SRV_ALIVE : SRV_ERR);
	}
	/* Point stdin to /dev/null */
	OPENFILE("/dev/null", O_RDONLY, null_fd);
	if (0 > null_fd)
		rts_error_csa(CSA_ARG(NULL) ERR_REPLERR, RTS_ERROR_LITERAL("Failed to open /dev/null for read"), errno, 0);
	FCNTL3(null_fd, F_DUPFD, 0, rc);
	if (0 > rc)
		rts_error_csa(CSA_ARG(NULL) ERR_REPLERR, RTS_ERROR_LITERAL("Failed to set stdin to /dev/null"), errno, 0);
	CLOSEFILE(null_fd, rc);
	if (0 > rc)
		rts_error_csa(CSA_ARG(NULL) ERR_REPLERR, RTS_ERROR_LITERAL("Failed to close /dev/null"), errno, 0);
	/* The parent process (source server startup command) will be holding the ftok semaphore and jnlpool access semaphore
	 * at this point. The variables that indicate this would have been copied over to the child during the fork. This will
	 * make the child think it is actually holding them as well when actually it is not. Reset those variables in the child
	 * to ensure they do not misrepresent the holder of those semaphores.
	 */
	ftok_sem_reg = NULL;
	udi = FILE_INFO(jnlpool.jnlpool_dummy_reg);
	assert(udi->grabbed_ftok_sem);
	udi->grabbed_ftok_sem = FALSE;
	assert(holds_sem[SOURCE][JNL_POOL_ACCESS_SEM]);
	holds_sem[SOURCE][JNL_POOL_ACCESS_SEM] = FALSE;
	assert(!holds_sem[SOURCE][SRC_SERV_COUNT_SEM]);
	/* Start child source server initialization */
	is_src_server = TRUE;
	OPERATOR_LOG_MSG;
	process_id = getpid();
	/* Reinvoke secshr related initialization with the child's pid */
	INVOKE_INIT_SECSHR_ADDRS;
	/* Initialize mutex socket, memory semaphore etc. before any "grab_lock" is done by this process on the journal pool.
	 * Note that the initialization would already have been done by the parent receiver startup command but we need to
	 * redo the initialization with the child process id.
	 */
	assert(mutex_per_process_init_pid && (mutex_per_process_init_pid != process_id));
	mutex_per_process_init();
	START_HEARTBEAT_IF_NEEDED;
	ppid = getppid();
	log_init_status = repl_log_init(REPL_GENERAL_LOG, &gtmsource_log_fd, gtmsource_options.log_file);
	assert(SS_NORMAL == log_init_status);
	repl_log_fd2fp(&gtmsource_log_fp, gtmsource_log_fd);
	if (-1 == (procgp = setsid()))
		send_msg_csa(CSA_ARG(NULL) VARLSTCNT(7) ERR_JNLPOOLSETUP, 0, ERR_TEXT, 2,
				RTS_ERROR_LITERAL("Source server error in setsid"), errno);
#	endif /* REPL_DEBUG_NOBACKGROUND */
	if (ZLIB_CMPLVL_NONE != gtm_zlib_cmp_level)
		gtm_zlib_init();	/* Open zlib shared library for compression/decompression */
	REPL_DPRINT1("Setting up regions\n");
	gvinit();

	/* We use the same code dse uses to open all regions but we must make sure they are all open before proceeding. */
	all_files_open = region_init(FALSE);
	if (!all_files_open)
	{
		gtm_putmsg_csa(CSA_ARG(NULL) VARLSTCNT(1) ERR_NOTALLDBOPN);
		gtmsource_exit(ABNORMAL_SHUTDOWN);
	}
	/* Determine primary side null subscripts collation order */
	/* Also check whether all regions have same null collation order */
	this_side_std_null_coll = -1;
	for (reg = gd_header->regions, region_top = gd_header->regions + gd_header->n_regions; reg < region_top; reg++)
	{
		csa = &FILE_INFO(reg)->s_addrs;
		if (this_side_std_null_coll != csa->hdr->std_null_coll)
		{
			if (-1 == this_side_std_null_coll)
				this_side_std_null_coll = csa->hdr->std_null_coll;
			else
			{
				gtm_putmsg_csa(CSA_ARG(NULL) VARLSTCNT(1) ERR_NULLCOLLDIFF);
				gtmsource_exit(ABNORMAL_SHUTDOWN);
			}
		}
		if (!REPL_ALLOWED(csa) && JNL_ALLOWED(csa))
		{
			gtm_putmsg_csa(CSA_ARG(NULL) VARLSTCNT(4) ERR_REPLOFFJNLON, 2, DB_LEN_STR(reg));
			gtmsource_exit(ABNORMAL_SHUTDOWN);
		}
		if (reg->read_only && REPL_ALLOWED(csa))
		{
			gtm_putmsg_csa(CSA_ARG(NULL) VARLSTCNT(6) ERR_JNLPOOLSETUP, 0, ERR_TEXT, 2,
				   RTS_ERROR_LITERAL("Source Server does not have write permissions to one or "
					             "more database files that are replicated"));
			gtmsource_exit(ABNORMAL_SHUTDOWN);
		}
	}
	/* Initialize source server alive/dead state related fields in "gtmsource_local" before the ftok semaphore is released */
	gtmsource_local->gtmsource_pid = process_id;
	gtmsource_local->gtmsource_state = GTMSOURCE_START;
	if (is_jnlpool_creator)
	{
		DEBUG_ONLY(jnlpool.jnlpool_ctl->jnlpool_creator_pid = process_id);
		gtmsource_seqno_init(this_side_std_null_coll);
		if (ROOTPRIMARY_SPECIFIED == gtmsource_options.rootprimary)
		{	/* Created the journal pool as a root primary. Append a history record to the replication instance file.
			 * Invoke the function "gtmsource_rootprimary_init" to do that.
			 */
			gtmsource_rootprimary_init(jnlpool.jnlpool_ctl->jnl_seqno);
		}
	}
	/* after this point we can no longer have the case where all the regions are unreplicated/non-journaled. */
#	ifndef REPL_DEBUG_NOBACKGROUND
	/* It is necessary for every process that is using the ftok semaphore to increment the counter by 1. This is used
	 * by the last process that shuts down to delete the ftok semaphore when it notices the counter to be 0.
	 * Note that the parent source server startup command would have done an increment of the ftok counter semaphore
	 * for the replication instance file. But the source server process (the child) that comes here would not have done
	 * that. Do that while the parent is still holding on to the ftok semaphore waiting for our okay.
	 */
	if (!ftok_sem_incrcnt(jnlpool.jnlpool_dummy_reg))
		rts_error_csa(CSA_ARG(NULL) VARLSTCNT(1) ERR_JNLPOOLSETUP);
	/* Increment the source server count semaphore */
	status = incr_sem(SOURCE, SRC_SERV_COUNT_SEM);
	if (0 != status)
	{
		save_errno = errno;
		rts_error_csa(CSA_ARG(NULL) VARLSTCNT(7) ERR_JNLPOOLSETUP, 0, ERR_TEXT, 2,
			RTS_ERROR_LITERAL("Counter semaphore increment failure in child source server"), save_errno);
	}
#	else
	if (0 != (save_errno = rel_sem_immediate(SOURCE, JNL_POOL_ACCESS_SEM)))
	{
		rts_error_csa(CSA_ARG(NULL) VARLSTCNT(7) ERR_JNLPOOLSETUP, 0, ERR_TEXT, 2,
			RTS_ERROR_LITERAL("Error in rel_sem_immediate"), save_errno);
	}
#	endif /* REPL_DEBUG_NOBACKGROUND */

	gtmsource_srv_count++;
	gtmsource_local->child_server_running = TRUE;	/* At this point, the parent startup command will stop waiting for child */
	gtm_event_log_init();
	/* Log source server startup command line first */
	SPRINTF(tmpmsg, "%s %s\n", cli_lex_in_ptr->argv[0], cli_lex_in_ptr->in_str);
	repl_log(gtmsource_log_fp, TRUE, TRUE, tmpmsg);

	SPRINTF(tmpmsg, "GTM Replication Source Server with Pid [%d] started for Secondary Instance [%s]",
		process_id, gtmsource_local->secondary_instname);
	sgtm_putmsg(print_msg, VARLSTCNT(4) ERR_REPLINFO, 2, LEN_AND_STR(tmpmsg));
	repl_log(gtmsource_log_fp, TRUE, TRUE, print_msg);
	if (is_jnlpool_creator)
	{
		repl_log(gtmsource_log_fp, TRUE, TRUE, "Created jnlpool with shmid = [%d] and semid = [%d]\n",
			jnlpool.repl_inst_filehdr->jnlpool_shmid, jnlpool.repl_inst_filehdr->jnlpool_semid);
	} else
		repl_log(gtmsource_log_fp, TRUE, TRUE, "Attached to existing jnlpool with shmid = [%d] and semid = [%d]\n",
			jnlpool.repl_inst_filehdr->jnlpool_shmid, jnlpool.repl_inst_filehdr->jnlpool_semid);
	gtm_event_log(GTM_EVENT_LOG_ARGC, "MUPIP", "REPLINFO", print_msg);
#	ifdef GTM_TLS
	if (REPL_TLS_REQUESTED)
	{
		repl_do_tls_init(gtmsource_log_fp);
		assert(REPL_TLS_REQUESTED || PLAINTEXT_FALLBACK);
	}
#	endif
	if (jnlpool.jnlpool_ctl->freeze)
	{
		last_seen_freeze_flag = jnlpool.jnlpool_ctl->freeze;
		sgtm_putmsg(print_msg, VARLSTCNT(3) ERR_REPLINSTFROZEN, 1, jnlpool.repl_inst_filehdr->inst_info.this_instname);
		repl_log(gtmsource_log_fp, TRUE, FALSE, print_msg);
		sgtm_putmsg(print_msg, VARLSTCNT(3) ERR_REPLINSTFREEZECOMMENT, 1, jnlpool.jnlpool_ctl->freeze_comment);
		repl_log(gtmsource_log_fp, TRUE, TRUE, print_msg);
	}
	gtmsource_local->jnlfileonly = gtmsource_options.jnlfileonly;
	do
	{ 	/* If mode is passive, go to sleep. Wakeup every now and then and check to see if I have to become active. */
		gtmsource_state = gtmsource_local->gtmsource_state = GTMSOURCE_START;
		if ((gtmsource_local->mode == GTMSOURCE_MODE_PASSIVE) && (gtmsource_local->shutdown == NO_SHUTDOWN))
		{
			gtmsource_poll_actions(FALSE);
			SHORT_SLEEP(GTMSOURCE_WAIT_FOR_MODE_CHANGE);
			continue;
		}
		if (GTMSOURCE_MODE_PASSIVE == gtmsource_local->mode)
		{	/* Shutdown initiated */
			assert(gtmsource_local->shutdown == SHUTDOWN);
			sgtm_putmsg(print_msg, VARLSTCNT(4) ERR_REPLINFO, 2,
				    RTS_ERROR_LITERAL("GTM Replication Source Server Shutdown signalled"));
			repl_log(gtmsource_log_fp, TRUE, TRUE, print_msg);
			gtm_event_log(GTM_EVENT_LOG_ARGC, "MUPIP", "REPLINFO", print_msg);
			break;
		}
		gtmsource_poll_actions(FALSE);
		if (GTMSOURCE_CHANGING_MODE == gtmsource_state)
			continue;
		if (GTMSOURCE_MODE_ACTIVE_REQUESTED == gtmsource_local->mode)
			gtmsource_local->mode = GTMSOURCE_MODE_ACTIVE;
		SPRINTF(tmpmsg, "GTM Replication Source Server now in ACTIVE mode using port %d", gtmsource_local->secondary_port);
		sgtm_putmsg(print_msg, VARLSTCNT(4) ERR_REPLINFO, 2, LEN_AND_STR(tmpmsg));
		repl_log(gtmsource_log_fp, TRUE, TRUE, print_msg);
		gtm_event_log(GTM_EVENT_LOG_ARGC, "MUPIP", "REPLINFO", print_msg);
		DEBUG_ONLY(repl_csa = &FILE_INFO(jnlpool.jnlpool_dummy_reg)->s_addrs;)
		assert(!repl_csa->hold_onto_crit);	/* so it is ok to invoke "grab_lock" and "rel_lock" unconditionally */
		grab_lock(jnlpool.jnlpool_dummy_reg, TRUE, HANDLE_CONCUR_ONLINE_ROLLBACK);
		if (GTMSOURCE_HANDLE_ONLN_RLBK == gtmsource_state)
		{
			repl_log(gtmsource_log_fp, TRUE, TRUE, "Starting afresh due to ONLINE ROLLBACK\n");
			repl_log(gtmsource_log_fp, TRUE, TRUE, "REPL INFO - Current Jnlpool Seqno : %llu\n",
					jnlpool.jnlpool_ctl->jnl_seqno);
			continue;
		}
		QWASSIGN(gtmsource_local->read_addr, jnlpool.jnlpool_ctl->write_addr);
		gtmsource_local->read = jnlpool.jnlpool_ctl->write;
		gtmsource_local->read_state = gtmsource_local->jnlfileonly ? READ_FILE : READ_POOL;
		read_jnl_seqno = gtmsource_local->read_jnl_seqno;
		assert(read_jnl_seqno <= jnlpool.jnlpool_ctl->jnl_seqno);
		if (read_jnl_seqno < jnlpool.jnlpool_ctl->jnl_seqno)
		{
			gtmsource_local->read_state = READ_FILE;
			QWASSIGN(gtmsource_save_read_jnl_seqno, jnlpool.jnlpool_ctl->jnl_seqno);
			gtmsource_pool2file_transition = TRUE; /* so that we read the latest gener jnl files */
		}
		rel_lock(jnlpool.jnlpool_dummy_reg);
		if (SS_NORMAL != (status = gtmsource_alloc_tcombuff()))
			rts_error_csa(CSA_ARG(NULL) VARLSTCNT(7) ERR_REPLCOMM, 0, ERR_TEXT, 2,
				  RTS_ERROR_LITERAL("Error allocating initial tcom buffer space. Malloc error"), status);
		gtmsource_filter = NO_FILTER;
		if ('\0' != gtmsource_local->filter_cmd[0])
		{
			if (SS_NORMAL == (status = repl_filter_init(gtmsource_local->filter_cmd)))
				gtmsource_filter |= EXTERNAL_FILTER;
			else
				gtmsource_exit(ABNORMAL_SHUTDOWN);
		}
		gtmsource_process();
		/* gtmsource_process returns only when mode needs to be changed to PASSIVE */
		assert(gtmsource_state == GTMSOURCE_CHANGING_MODE);
		gtmsource_ctl_close();
		gtmsource_free_msgbuff();
		gtmsource_free_tcombuff();
		gtmsource_free_filter_buff();
		gtmsource_stop_heartbeat();
		if (FD_INVALID != gtmsource_sock_fd)
			repl_close(&gtmsource_sock_fd);
		if (gtmsource_filter & EXTERNAL_FILTER)
			repl_stop_filter();
	} while (TRUE);
Beispiel #13
0
/* 1. Allocate a basic initial condition handler stack that can be expanded later if necessary.
 * 2. On Linux, make sure bits 0,1, 4, 5, and 6 are set in /proc/PID/coredump_filter so dumps the sections that GT.M
 *    cores need to have in them.
 */
void err_init(void (*x)())
{
	chnd = (condition_handler *)malloc((CONDSTK_INITIAL_INCR + CONDSTK_RESERVE) * SIZEOF(condition_handler));
	chnd[0].ch_active = FALSE;
	chnd[0].save_active_ch = NULL;
	active_ch = ctxt = &chnd[0];
	ctxt->ch = x;
	chnd_end = &chnd[CONDSTK_INITIAL_INCR]; /* chnd_end is the end of the condition handler stack */
	chnd_incr = CONDSTK_INITIAL_INCR * 2;
#	if defined(__linux__) || defined(__NetBSD__)
	/* Read the coredump_filter value from /proc for this process, update the value if necessary so we have the proper
	 * flags set to get the info we (and gtmpcat) need to properly process a core file. Note any errors we encounter just
	 * send a message to the operator log and return as nothing here should prevent GT.M from running.
	 *
	 * Note "man 5 core" on x86-64 Linux (Ubuntu 12.04) notes that the /proc/PID/coredump_filter file is only provided when
	 * the Linux kernel is built with the CONFIG_ELF_CORE configuration option. This *seems* to control whether or not the
	 * kernel supports the ELF loader or not. To date, all Linux flavors GT.M supports use ELF so we regard this as largely
	 * mandatory though in the future it may happen that GT.M works yet runs with something other than ELF. In that case,
	 * we'd need to change the below to avoid the operator log messages every time GT.M initializes.
	 */
	{
		int 		rc;
		unsigned int	filterbits;
		char		procfn[SIZEOF(COREDUMPFILTERFN) + MAX_DIGITS_IN_INT];	/* File name of file to update */
		char		filter[FILTERPARMSIZE], *filterend;			/* Value read in & written out */
		char		*rcc;
		FILE		*filterstrm;						/* filter stream file block */

		/* Note use simple basic methods since this early in initialization not everything is necessarily setup to
		 * be able to properly use the *print*() wrapper functions.
		 */
		rc = snprintf(procfn, SIZEOF(procfn), COREDUMPFILTERFN, getpid());
		if (0 > rc)
		{
			send_msg_csa(CSA_ARG(NULL) VARLSTCNT(8) ERR_SYSCALL, 5, RTS_ERROR_LITERAL("sprintf()"), CALLFROM, rc);
			return;
		}
		filterstrm = fopen(procfn, "r");
		if (NULL == filterstrm)
		{
			rc = errno;
			send_msg_csa(CSA_ARG(NULL) VARLSTCNT(8) ERR_SYSCALL, 5, RTS_ERROR_LITERAL("fopen()"), CALLFROM, rc);
			return;
		}
		rcc = fgets(filter, SIZEOF(filter), filterstrm);
		if (NULL == rcc)
		{
			send_msg_csa(CSA_ARG(NULL) VARLSTCNT(8) ERR_SYSCALL, 5, RTS_ERROR_LITERAL("fgets()"), CALLFROM, rc);
			return;
		}
		rc = fclose(filterstrm);
		if (0 > rc)
		{
			send_msg_csa(CSA_ARG(NULL) VARLSTCNT(8) ERR_SYSCALL, 5, RTS_ERROR_LITERAL("fclose()"), CALLFROM, rc);
			return;
		}
		filterend = filter + SIZEOF(filter);
		filterbits = (unsigned int)strtol(filter, &filterend, 16);
		if (FILTERENABLEBITS != (filterbits & FILTERENABLEBITS))
		{	/* At least one flag was missing - reset them */
			filterbits = filterbits | FILTERENABLEBITS;
			filterstrm = fopen(procfn, "w");
			if (NULL == filterstrm)
			{
				rc = errno;
				send_msg_csa(CSA_ARG(NULL) VARLSTCNT(8) ERR_SYSCALL, 5, RTS_ERROR_LITERAL("fopen()"),
					     CALLFROM, rc);
				return;
			}
			rc = fprintf(filterstrm, "0x%08x", filterbits);
			if (0 > rc)
			{
				send_msg_csa(CSA_ARG(NULL) VARLSTCNT(8) ERR_SYSCALL, 5, RTS_ERROR_LITERAL("fprintf"),
					     CALLFROM, rc);
				return;
			}
			fclose(filterstrm);
			if (0 > rc)
			{
				send_msg_csa(CSA_ARG(NULL) VARLSTCNT(8) ERR_SYSCALL, 5, RTS_ERROR_LITERAL("fclose()"),
					     CALLFROM, rc);
				return;
			}
		}
	}
#	endif
}
Beispiel #14
0
OS_PAGE_SIZE_DECLARE

uint4	 gdsfilext(uint4 blocks, uint4 filesize, boolean_t trans_in_prog)
{
	sm_uc_ptr_t		old_base[2], mmap_retaddr;
	boolean_t		was_crit, is_mm;
	char			buff[DISK_BLOCK_SIZE];
	int			result, save_errno, status;
	uint4			new_bit_maps, bplmap, map, new_blocks, new_total, max_tot_blks, old_total;
	uint4			jnl_status, to_wait, to_msg, wait_period;
	gtm_uint64_t		avail_blocks, mmap_sz;
	off_t			new_eof;
	trans_num		curr_tn;
	unix_db_info		*udi;
	inctn_opcode_t		save_inctn_opcode;
	int4			prev_extend_blks_to_upgrd;
	jnl_private_control	*jpc;
	jnl_buffer_ptr_t	jbp;
	cache_rec_ptr_t         cr;
	DCL_THREADGBL_ACCESS;

	assert(!IS_DSE_IMAGE);
	assert((cs_addrs->nl == NULL) || (process_id != cs_addrs->nl->trunc_pid)); /* mu_truncate shouldn't extend file... */
	assert(!process_exiting);
	DEBUG_ONLY(old_base[0] = old_base[1] = NULL);
	assert(!gv_cur_region->read_only);
	udi = FILE_INFO(gv_cur_region);
	is_mm = (dba_mm == cs_addrs->hdr->acc_meth);
#	if !defined(MM_FILE_EXT_OK)
	if (!udi->grabbed_access_sem && is_mm)
		return (uint4)(NO_FREE_SPACE); /* should this be changed to show extension not allowed ? */
#	endif
	/* Both blocks and total blocks are unsigned ints so make sure we aren't asking for huge numbers that will
	   overflow and end up doing silly things.
	*/
	assert((blocks <= (MAXTOTALBLKS(cs_data) - cs_data->trans_hist.total_blks)) || WBTEST_ENABLED(WBTEST_FILE_EXTEND_ERROR));
	if (!blocks)
		return (uint4)(NO_FREE_SPACE); /* should this be changed to show extension not enabled ? */
	bplmap = cs_data->bplmap;
	/* New total of non-bitmap blocks will be number of current, non-bitmap blocks, plus new blocks desired
	 * There are (bplmap - 1) non-bitmap blocks per bitmap, so add (bplmap - 2) to number of non-bitmap blocks
	 *      and divide by (bplmap - 1) to get total number of bitmaps for expanded database. (must round up in this
	 *      manner as every non-bitmap block must have an associated bitmap)
	 * Current number of bitmaps is (total number of current blocks + bplmap - 1) / bplmap.
	 * Subtract current number of bitmaps from number needed for expanded database to get number of new bitmaps needed.
	 */
	new_bit_maps = DIVIDE_ROUND_UP(cs_data->trans_hist.total_blks
			- DIVIDE_ROUND_UP(cs_data->trans_hist.total_blks, bplmap) + blocks, bplmap - 1)
			- DIVIDE_ROUND_UP(cs_data->trans_hist.total_blks, bplmap);
	new_blocks = blocks + new_bit_maps;
	assert(0 < (int)new_blocks);
	if (new_blocks + cs_data->trans_hist.total_blks > MAXTOTALBLKS(cs_data))
	{
		assert(FALSE);
		send_msg_csa(CSA_ARG(cs_addrs) VARLSTCNT(1) ERR_TOTALBLKMAX);
		return (uint4)(NO_FREE_SPACE);
	}
	if (0 != (save_errno = disk_block_available(udi->fd, &avail_blocks, FALSE)))
	{
		send_msg_csa(CSA_ARG(cs_addrs) VARLSTCNT(5) ERR_DBFILERR, 2, DB_LEN_STR(gv_cur_region), save_errno);
		rts_error_csa(CSA_ARG(cs_addrs) VARLSTCNT(5) ERR_DBFILERR, 2, DB_LEN_STR(gv_cur_region), save_errno);
	} else
	{
		if (!(gtmDebugLevel & GDL_IgnoreAvailSpace))
		{	/* Bypass this space check if debug flag above is on. Allows us to create a large sparce DB
			 * in space it could never fit it if wasn't sparse. Needed for some tests.
			 */
			avail_blocks = avail_blocks / (cs_data->blk_size / DISK_BLOCK_SIZE);
			if ((blocks * EXTEND_WARNING_FACTOR) > avail_blocks)
			{
				if (blocks > (uint4)avail_blocks)
				{
					SETUP_THREADGBL_ACCESS;
					if (!INST_FREEZE_ON_NOSPC_ENABLED(cs_addrs))
						return (uint4)(NO_FREE_SPACE);
					else
						send_msg_csa(CSA_ARG(cs_addrs) VARLSTCNT(6) MAKE_MSG_WARNING(ERR_NOSPACEEXT), 4,
							DB_LEN_STR(gv_cur_region), new_blocks, (uint4)avail_blocks);
				} else
					send_msg_csa(CSA_ARG(cs_addrs) VARLSTCNT(5) ERR_DSKSPACEFLOW, 3, DB_LEN_STR(gv_cur_region),
						 (uint4)(avail_blocks - ((new_blocks <= avail_blocks) ? new_blocks : 0)));
			}
		}
	}
	/* From here on, we need to use GDSFILEXT_CLNUP before returning to the caller */
	was_crit = cs_addrs->now_crit;
	assert(!cs_addrs->hold_onto_crit || was_crit);
	/* If we are coming from mupip_extend (which gets crit itself) we better have waited for any unfreezes to occur.
	 * If we are coming from online rollback (when that feature is available), we will come in holding crit and in
	 * 	the final retry. In that case too, we expect to have waited for unfreezes to occur in the caller itself.
	 * Therefore if we are coming in holding crit from MUPIP, we expect the db to be unfrozen so no need to wait for
	 * freeze.
	 * If we are coming from GT.M and final retry (in which case we come in holding crit) we expect to have waited
	 * 	for any unfreezes (by invoking tp_crit_all_regions) to occur (TP or non-TP) before coming into this
	 *	function. However, there is one exception. In the final retry, if tp_crit_all_regions notices that
	 *	at least one of the participating regions did ONLY READs, it will not wait for any freeze on THAT region
	 *	to complete before grabbing crit. Later, in the final retry, if THAT region did an update which caused
	 *	op_tcommit to invoke bm_getfree->gdsfilext, then we would have come here with a frozen region on which
	 *	we hold crit.
	 */
	assert(!was_crit || !cs_data->freeze || (dollar_tlevel && (CDB_STAGNATE <= t_tries)));
	/*
	 * If we are in the final retry and already hold crit, it is possible that csa->nl->wc_blocked is also set to TRUE
	 * (by a concurrent process in phase2 which encountered an error in the midst of commit and secshr_db_clnup
	 * finished the job for it). In this case we do NOT want to invoke wcs_recover as that will update the "bt"
	 * transaction numbers without correspondingly updating the history transaction numbers (effectively causing
	 * a cdb_sc_blkmod type of restart). Therefore do NOT call grab_crit (which unconditionally invokes wcs_recover)
	 * if we already hold crit.
	 */
	if (!was_crit)
	{
		for ( ; ; )
		{
			grab_crit(gv_cur_region);
			if (!cs_data->freeze && !IS_REPL_INST_FROZEN)
				break;
			rel_crit(gv_cur_region);
			while (cs_data->freeze || IS_REPL_INST_FROZEN)
				hiber_start(1000);
		}
	} else if (cs_data->freeze && dollar_tlevel)
	{	/* We don't want to continue with file extension as explained above. Hence return with an error code which
		 * op_tcommit will recognize (as a cdb_sc_needcrit/cdb_sc_instancefreeze type of restart) and restart accordingly.
		 */
		assert(CDB_STAGNATE <= t_tries);
		GDSFILEXT_CLNUP;
		return (uint4)FINAL_RETRY_FREEZE_PROG;
	}
	if (IS_REPL_INST_FROZEN && trans_in_prog)
	{
		assert(CDB_STAGNATE <= t_tries);
		GDSFILEXT_CLNUP;
		return (uint4)FINAL_RETRY_INST_FREEZE;
	}
	assert(cs_addrs->ti->total_blks == cs_data->trans_hist.total_blks);
	old_total = cs_data->trans_hist.total_blks;
	if (old_total != filesize)
	{	/* Somebody else has already extended it, since we are in crit, this is trust-worthy. However, in case of MM,
		 * we still need to remap the database
		 */
		assert((old_total > filesize) GTM_TRUNCATE_ONLY( || !is_mm));
		/* For BG, someone else could have truncated or extended - we have no idea */
		GDSFILEXT_CLNUP;
		return (SS_NORMAL);
	}
Beispiel #15
0
int send_mesg2gtmsecshr(unsigned int code, unsigned int id, char *path, int path_len)
{
	int                     client_sockfd, create_server_status, fcntl_res;
	int			req_code, wait_count = 0;
	int			recv_len, send_len;
	ssize_t			num_chars_recvd, num_chars_sent;
	int 			save_errno, ret_code = 0, init_ret_code = 0;
	int			loop_count = 0;
	int			recv_complete, send_complete;
	boolean_t		retry = FALSE;
	size_t			server_proc_len;
	int			semop_res;
	int			selstat, status;
	char			*recv_ptr, *send_ptr;
	struct sockaddr_un	server_proc;
	struct sembuf		sop[4];
	fd_set			wait_on_fd;
	gtmsecshr_mesg		mesg;
	TID			timer_id;
	int4			msec_timeout;
	char			*gtm_tmp_ptr;
	struct stat		stat_buf;
	struct shmid_ds		shm_info;
	int			len;
	DCL_THREADGBL_ACCESS;

	SETUP_THREADGBL_ACCESS;
	DBGGSSHR((LOGFLAGS, "secshr_client: New send request\n"));
	if (!gtm_dist_ok_to_use)
		rts_error_csa(CSA_ARG(NULL) VARLSTCNT(6) ERR_GTMDISTUNVERIF, 4, STRLEN(gtm_dist), gtm_dist,
				gtmImageNames[image_type].imageNameLen, gtmImageNames[image_type].imageName);
	/* Create communication key (hash of release name) if it has not already been done */
	if (0 == TREF(gtmsecshr_comkey))
	{
		STR_HASH((char *)gtm_release_name, gtm_release_name_len, TREF(gtmsecshr_comkey), 0);
	}
	timer_id = (TID)send_mesg2gtmsecshr;
	if (!gtmsecshr_file_check_done)
	{
		len = STRLEN(gtm_dist);
		memcpy(gtmsecshr_path, gtm_dist, len);
		gtmsecshr_path[len] =  '/';
		memcpy(gtmsecshr_path + len + 1, GTMSECSHR_EXECUTABLE, STRLEN(GTMSECSHR_EXECUTABLE));
		gtmsecshr_pathname.addr = gtmsecshr_path;
		gtmsecshr_pathname.len = len + 1 + STRLEN(GTMSECSHR_EXECUTABLE);
		assertpro(GTM_PATH_MAX > gtmsecshr_pathname.len);
		gtmsecshr_pathname.addr[gtmsecshr_pathname.len] = '\0';
		if (-1 == Stat(gtmsecshr_pathname.addr, &stat_buf))
			rts_error_csa(CSA_ARG(NULL) VARLSTCNT(8) ERR_SYSCALL, 5,
					LEN_AND_LIT("stat"), CALLFROM, errno);
		if ((ROOTUID != stat_buf.st_uid)
			|| !(stat_buf.st_mode & S_ISUID)
			|| (0 != ACCESS(gtmsecshr_pathname.addr, (X_OK))))
			rts_error_csa(CSA_ARG(NULL) VARLSTCNT(1) ERR_GTMSECSHRPERM);
		gtmsecshr_file_check_done = TRUE;
	}
	if (!gtmsecshr_sock_init_done && (0 < (init_ret_code = gtmsecshr_sock_init(CLIENT))))	/* Note assignment */
		return init_ret_code;
	DEBUG_ONLY(mesg.usesecshr = TREF(gtm_usesecshr));	/* Flag ignored in PRO build */
	while (MAX_COMM_ATTEMPTS >= loop_count)
	{	/* first, try the sendto */
		req_code = mesg.code = code;
		send_len = (int4)(GTM_MESG_HDR_SIZE);
  		if (REMOVE_FILE == code)
		{
			assert(GTM_PATH_MAX > path_len);	/* Name is not user supplied so simple check */
			memcpy(mesg.mesg.path, path, path_len);
			send_len += path_len;
		} else if (FLUSH_DB_IPCS_INFO == code)
		{
			assert(GTM_PATH_MAX > db_ipcs.fn_len);
			memcpy(&mesg.mesg.db_ipcs, &db_ipcs, (offsetof(ipcs_mesg, fn[0]) + db_ipcs.fn_len + 1));
			/* Most of the time file length is much smaller than GTM_PATH_MAX */
			send_len += offsetof(ipcs_mesg, fn[0]);
			send_len += mesg.mesg.db_ipcs.fn_len + 1;
		} else
		{
			mesg.mesg.id = id;
			send_len += SIZEOF(mesg.mesg.id);
		}
		DBGGSSHR((LOGFLAGS, "secshr_client: loop %d  frm-pid: %d  to-pid: %d  send_len: %d  code: %d\n", loop_count,
			  process_id, id, send_len, code));
		mesg.comkey = TREF(gtmsecshr_comkey);	/* Version communication key */
		mesg.pid = process_id;			/* Process id of client */
		mesg.seqno = ++cur_seqno;
		send_ptr = (char *)&mesg;
		send_complete = FALSE;
		SENDTO_SOCK(gtmsecshr_sockfd, send_ptr, send_len, 0, (struct sockaddr *)&gtmsecshr_sock_name,
			    (GTM_SOCKLEN_TYPE)gtmsecshr_sockpath_len, num_chars_sent);	/* This form handles EINTR internally */
		save_errno = errno;
		DBGGSSHR((LOGFLAGS, "secshr_client: sendto rc:    %d  errno: %d (only important if rc=-1)\n", (int)num_chars_sent,
			  save_errno));
		if (0 >= num_chars_sent)
		{	/* SENDTO_SOCK failed - start server and attempt to resend */
			if ((EISCONN == save_errno) || (EBADF == save_errno))
			{
				gtmsecshr_sock_cleanup(CLIENT);
				gtmsecshr_sock_init(CLIENT);
				wcs_backoff(loop_count + 1);
				DBGGSSHR((LOGFLAGS, "secshr_client: Connection error, reset socket\n"));
			} else
			{
				if (0 < loop_count)
					/* No message unless attempted server start at least once */
					send_msg_csa(CSA_ARG(NULL) VARLSTCNT(11) ERR_GTMSECSHRSRVF, 4,
							RTS_ERROR_TEXT("Client"), process_id,
							loop_count - 1, ERR_TEXT, 2,
							RTS_ERROR_TEXT("sendto to gtmsecshr failed"), save_errno);
				START_SERVER;
				DBGGSSHR((LOGFLAGS, "secshr_client: sendto() failed - restarting server\n"));
			}
			loop_count++;
			continue;
		}
		SETUP_FOR_RECV;		/* Sets timer, recvcomplete = FALSE */
		do
		{	/* Note RECVFROM does not loop on EINTR return codes so must be handled. Note also we only expect
			 * to receive the message header back as an acknowledgement.
			 */
			num_chars_recvd = RECVFROM(gtmsecshr_sockfd, recv_ptr, GTM_MESG_HDR_SIZE, 0, (struct sockaddr *)0,
						   (GTM_SOCKLEN_TYPE *)0);
			save_errno = errno;
			DBGGSSHR((LOGFLAGS, "secshr_client: recvfrom rc: %d  errno: %d (only important if rc=-1)\n",
				  (int)num_chars_recvd, save_errno));
			if (0 <= num_chars_recvd)
			{	/* Message received - make sure it is large enough to have set seqno before we do anything
				 * to rely on it.
				 */
				if ((GTM_MESG_HDR_SIZE <= num_chars_recvd) && (mesg.seqno == cur_seqno)
				    && (TREF(gtmsecshr_comkey) == mesg.comkey))
					recv_complete = TRUE;
				else
				{	/* Message too short or not correct sequence */
					cancel_timer(timer_id);
					/* Print True/False for the possibilities we failed */
					DBGGSSHR((LOGFLAGS, "secshr_client: Message incorrect - chars: %d, seq: %d\n",
						  (GTM_MESG_HDR_SIZE <= num_chars_recvd), (mesg.seqno == cur_seqno)));
					SETUP_FOR_RECV;
					continue;
				}
			} else
			{	/* Something untoward happened */
				if (client_timer_popped)
					break;
				if (EINTR == save_errno)	/* Had an irrelevant interrupt - ignore */
					continue;
				if (EBADF == save_errno)
					break;
				send_msg_csa(CSA_ARG(NULL) VARLSTCNT(11) ERR_GTMSECSHRSRVF, 4,
						RTS_ERROR_TEXT("Client"), process_id, loop_count - 1, ERR_TEXT, 2,
						RTS_ERROR_TEXT("recvfrom from gtmsecshr failed"), save_errno);
				if ((ECONNRESET == save_errno) || (ENOTCONN == save_errno))
				{
					num_chars_recvd = 0;
					break;
				}
				gtmsecshr_sock_cleanup(CLIENT);
				return save_errno;
			}
		} while (!recv_complete);
		cancel_timer(timer_id);
		if (client_timer_popped || (EBADF == save_errno) || (0 == num_chars_recvd))
		{	/* Timeout, connection issues, bad descriptor block - retry */
			gtmsecshr_sock_cleanup(CLIENT);
			gtmsecshr_sock_init(CLIENT);
			retry = TRUE;
			if (client_timer_popped)
			{
				START_SERVER;
				DBGGSSHR((LOGFLAGS, "secshr_client: Read timer popped - restarting server\n"));
			} else
				DBGGSSHR((LOGFLAGS, "secshr_client: Read error - socket reset, retrying\n"));
			loop_count++;
			continue;
		}
		/* Response to *our* latest message available */
		assert(recv_complete);
		if (ret_code = mesg.code)		/* Warning - assignment */
		{
			DBGGSSHR((LOGFLAGS, "secshr_client: non-zero response from gtmsecshr - request: %d  retcode: %d\n",
				  req_code, ret_code));
			if (INVALID_COMKEY == ret_code)
			{	/* Comkey mismatch means for a different version of GT.M - we will not handle it */
				send_msg_csa(CSA_ARG(NULL) VARLSTCNT(13) ERR_GTMSECSHRSRVFIL, 7, RTS_ERROR_TEXT("Client"),
					 process_id, mesg.pid, req_code, RTS_ERROR_TEXT(mesg.mesg.path),
					 ERR_TEXT, 2, RTS_ERROR_STRING("Communicating with wrong GT.M version"));
				rts_error_csa(CSA_ARG(NULL) VARLSTCNT(13) ERR_GTMSECSHRSRVFIL, 7, RTS_ERROR_TEXT("Client"),
					  process_id, mesg.pid, req_code, RTS_ERROR_TEXT(mesg.mesg.path),
					  ERR_TEXT, 2, RTS_ERROR_STRING("Communicating with wrong GT.M version"));
				break;	/* rts_error should not return */
			}
			switch(req_code)
			{
				case REMOVE_FILE:
					/* Called from mutex_sock_init(). Path (and length) contain null terminator byte.
					 * See if file still exists (may have been deleted by earlier attempt). Caller
					 * handles actual error.
					 */
					if ((-1 != Stat(path, &stat_buf)) || (ENOENT != ret_code))
						send_msg_csa(CSA_ARG(NULL) VARLSTCNT(14) ERR_GTMSECSHRSRVFIL, 7,
								RTS_ERROR_TEXT("Client"),
							 	process_id, mesg.pid, req_code, RTS_ERROR_TEXT(mesg.mesg.path),
								ERR_TEXT, 2, RTS_ERROR_STRING(secshr_fail_mesg_code[req_code]),
								mesg.code);
					else
						ret_code = 0;	/* File is gone so this or a previous try actually worked */
					break;
				case REMOVE_SEM:
					/* See if semaphore still eixsts (may have been removed by earlier attempt that
					 * got a reply confused or lost). If not there, no error. Else error to op-log.
					 */
					if ((-1 != semctl(id, 0, GETVAL)) && !SEM_REMOVED(errno))
						send_msg_csa(CSA_ARG(NULL) VARLSTCNT(13) ERR_GTMSECSHRSRVFID, 6,
								RTS_ERROR_TEXT("Client"),
							 	process_id, mesg.pid, req_code, mesg.mesg.id, ERR_TEXT, 2,
								RTS_ERROR_STRING(secshr_fail_mesg_code[req_code]),
								mesg.code);
					else
						ret_code = 0;	/* File is gone so this or a previous try actually worked */
				case REMOVE_SHM:
					/* See if shmem still eixsts (may have been removed by earlier attempt that
					 * got a reply confused or lost). If not there, no error. Else error to op-log.
					 * Note -
					 */
					if ((-1 != shmctl(id, IPC_STAT, &shm_info)) && !SEM_REMOVED(errno))
						send_msg_csa(CSA_ARG(NULL) VARLSTCNT(13) ERR_GTMSECSHRSRVFID, 6,
								RTS_ERROR_TEXT("Client"),
							 	process_id, mesg.pid, req_code, mesg.mesg.id, ERR_TEXT, 2,
								RTS_ERROR_STRING(secshr_fail_mesg_code[req_code]),
								mesg.code);
					else
						ret_code = 0;	/* File is gone so this or a previous try actually worked */
					break;
				case FLUSH_DB_IPCS_INFO:	/* Errors handled by caller */
					break;
				default:
					if (EPERM != mesg.code && EACCES != mesg.code)
						send_msg_csa(CSA_ARG(NULL) VARLSTCNT(13) ERR_GTMSECSHRSRVFID, 6,
								RTS_ERROR_TEXT("Client"),
								process_id, mesg.pid, req_code, mesg.mesg.id, ERR_TEXT, 2,
							 	RTS_ERROR_STRING(secshr_fail_mesg_code[req_code]),
							 	mesg.code);
					break;
			}
		}
		break;
	}
	if (MAX_COMM_ATTEMPTS < loop_count)
	{
		ret_code = -1;
		gtm_putmsg_csa(CSA_ARG(NULL) VARLSTCNT(10) ERR_GTMSECSHRSRVF, 4,
				RTS_ERROR_TEXT("Client"), process_id, loop_count - 1,
			   	ERR_TEXT, 2, RTS_ERROR_TEXT("Unable to communicate with gtmsecshr"));
		/* If gtm_tmp is not defined, show default path */
		if (gtm_tmp_ptr = GETENV("gtm_tmp"))
			send_msg_csa(CSA_ARG(NULL) VARLSTCNT(8) ERR_GTMSECSHRTMPPATH, 2,
				RTS_ERROR_TEXT(gtm_tmp_ptr),
				ERR_TEXT, 2, RTS_ERROR_TEXT("(from $gtm_tmp)"));
		else
			send_msg_csa(CSA_ARG(NULL) VARLSTCNT(4)
					ERR_GTMSECSHRTMPPATH, 2, RTS_ERROR_TEXT("/tmp"));
	}
	if (ONETIMESOCKET == init_ret_code)
		gtmsecshr_sock_cleanup(CLIENT);
	return ret_code;
}
void generic_signal_handler(int sig, siginfo_t *info, void *context)
{
	gtm_sigcontext_t	*context_ptr;
	void			(*signal_routine)();
	DCL_THREADGBL_ACCESS;

	SETUP_THREADGBL_ACCESS;
	FORWARD_SIG_TO_MAIN_THREAD_IF_NEEDED(sig);
	/* Save parameter value in global variables for easy access in core */
	dont_want_core = FALSE;		/* (re)set in case we recurse */
	created_core = FALSE;		/* we can deal with a second core if needbe */
	exi_condition = sig;
	if (NULL != info)
		exi_siginfo = *info;
	else
		memset(&exi_siginfo, 0, SIZEOF(*info));
#	if defined(__ia64) && defined(__hpux)
	context_ptr = (gtm_sigcontext_t *)context;	/* no way to make a copy of the context */
	memset(&exi_context, 0, SIZEOF(exi_context));
#	else
	if (NULL != context)
		exi_context = *(gtm_sigcontext_t *)context;
	else
		memset(&exi_context, 0, SIZEOF(exi_context));
	context_ptr = &exi_context;
#	endif
	/* Check if we are fielding nested immediate shutdown signals */
	if (EXIT_IMMED <= exit_state)
	{
		switch(sig)
		{	/* If we are dealing with one of these three dangerous signals which we have
			 * already hit while attempting to shutdown once, die with core now.
			 */
			case SIGSEGV:
			case SIGBUS:
			case SIGILL:
				if (core_in_progress)
				{
					if (exit_handler_active)
						_exit(sig);
					else
						exit(sig);
				}
				++core_in_progress;
				DUMP_CORE;
				GTMASSERT;
			default:
				;
		}
	}
	switch(sig)
	{
		case SIGTERM:
			if (!IS_GTMSECSHR_IMAGE)
			{
				forced_exit_err = ERR_FORCEDHALT;
				/* If nothing pending AND we have crit or in wcs_wtstart() or already in exit processing, wait to
				 * invoke shutdown. wcs_wtstart() manipulates the active queue that a concurrent process in crit
				 * in bt_put() might be waiting for. interrupting it can cause deadlocks (see C9C11-002178).
				 */
				if (DEFER_EXIT_PROCESSING)
				{
					SET_FORCED_EXIT_STATE;
					exit_state++;		/* Make exit pending, may still be tolerant though */
					assert(!IS_GTMSECSHR_IMAGE);
					if (exit_handler_active && !gtm_quiet_halt)
						SEND_AND_PUT_MSG(VARLSTCNT(1) forced_exit_err);
					return;
				}
				exit_state = EXIT_IMMED;
				SET_PROCESS_EXITING_TRUE; 	/* Set this BEFORE cancelling timers as wcs_phase2_commit_wait
								 * relies on this.
								 */
				if (ERR_FORCEDHALT != forced_exit_err || !gtm_quiet_halt)
					SEND_AND_PUT_MSG(VARLSTCNT(1) forced_exit_err);
			} else
			{	/* Special case for gtmsecshr - no deferral just exit */
				forced_exit_err = ERR_GTMSECSHRSHUTDN;
				if (OK_TO_SEND_MSG)
					send_msg_csa(CSA_ARG(NULL) VARLSTCNT(1) forced_exit_err);
			}
			dont_want_core = TRUE;
			break;
		case SIGQUIT:	/* Handle SIGQUIT specially which we ALWAYS want to defer if possible as it is always sent */
			dont_want_core = TRUE;
			extract_signal_info(sig, &exi_siginfo, context_ptr, &signal_info);
			switch(signal_info.infotype)
			{
				case GTMSIGINFO_NONE:
					forced_exit_err = ERR_KILLBYSIG;
					break;
				case GTMSIGINFO_USER:
					forced_exit_err = ERR_KILLBYSIGUINFO;
					break;
				case GTMSIGINFO_ILOC + GTMSIGINFO_BADR:
					forced_exit_err = ERR_KILLBYSIGSINFO1;
					break;
				case GTMSIGINFO_ILOC:
					forced_exit_err = ERR_KILLBYSIGSINFO2;
					break;
				case GTMSIGINFO_BADR:
					forced_exit_err = ERR_KILLBYSIGSINFO3;
					break;
				default:
					exit_state = EXIT_IMMED;
					GTMASSERT;
			}
			/* If nothing pending AND we have crit or already in exit processing, wait to invoke shutdown */
			if (DEFER_EXIT_PROCESSING)
			{
				SET_FORCED_EXIT_STATE;
				exit_state++;		/* Make exit pending, may still be tolerant though */
				assert(!IS_GTMSECSHR_IMAGE);
				return;
			}
			exit_state = EXIT_IMMED;
			SET_PROCESS_EXITING_TRUE;
			switch(signal_info.infotype)
			{
				case GTMSIGINFO_NONE:
					SEND_AND_PUT_MSG(VARLSTCNT(6) ERR_KILLBYSIG, 4, GTMIMAGENAMETXT(image_type),
						process_id, sig);
					break;
				case GTMSIGINFO_USER:
					SEND_AND_PUT_MSG(VARLSTCNT(8) ERR_KILLBYSIGUINFO, 6, GTMIMAGENAMETXT(image_type),
						process_id, sig, signal_info.send_pid, signal_info.send_uid);
					break;
				case GTMSIGINFO_ILOC + GTMSIGINFO_BADR:
					SEND_AND_PUT_MSG(VARLSTCNT(8) ERR_KILLBYSIGSINFO1, 6, GTMIMAGENAMETXT(image_type),
						process_id, sig, signal_info.int_iadr, signal_info.bad_vadr);
					break;
				case GTMSIGINFO_ILOC:
					SEND_AND_PUT_MSG(VARLSTCNT(7) ERR_KILLBYSIGSINFO2, 5, GTMIMAGENAMETXT(image_type),
						process_id, sig, signal_info.int_iadr);
					break;
				case GTMSIGINFO_BADR:
					SEND_AND_PUT_MSG(VARLSTCNT(7) ERR_KILLBYSIGSINFO3, 5, GTMIMAGENAMETXT(image_type),
						process_id, sig, signal_info.bad_vadr);
					break;
			}
			break;
#		ifdef _AIX
		case SIGDANGER:
			forced_exit_err = ERR_KRNLKILL;
			/* If nothing pending AND we have crit or already in exit processing, wait to invoke shutdown */
			if (DEFER_EXIT_PROCESSING)
			{
				SET_FORCED_EXIT_STATE;
				exit_state++;		/* Make exit pending, may still be tolerant though */
				assert(!IS_GTMSECSHR_IMAGE);
				return;
			}
			exit_state = EXIT_IMMED;
			SET_PROCESS_EXITING_TRUE;
			SEND_AND_PUT_MSG(VARLSTCNT(1) forced_exit_err);
			dont_want_core = TRUE;
			break;
#		endif
		default:
			extract_signal_info(sig, &exi_siginfo, context_ptr, &signal_info);
			switch(signal_info.infotype)
			{
				case GTMSIGINFO_NONE:
					exit_state = EXIT_IMMED;
					SET_PROCESS_EXITING_TRUE;
					SEND_AND_PUT_MSG(VARLSTCNT(6) ERR_KILLBYSIG, 4, GTMIMAGENAMETXT(image_type),
						process_id, sig);
					break;
				case GTMSIGINFO_USER:
					/* This signal was SENT to us so it can wait until we are out of crit to cause an exit */
					forced_exit_err = ERR_KILLBYSIGUINFO;
					/* If nothing pending AND we have crit or already exiting, wait to invoke shutdown */
					if (DEFER_EXIT_PROCESSING)
					{
						assert(!IS_GTMSECSHR_IMAGE);
						SET_FORCED_EXIT_STATE;
						exit_state++;		/* Make exit pending, may still be tolerant though */
						need_core = TRUE;
						gtm_fork_n_core();	/* Generate "virgin" core while we can */
						return;
					}
					exit_state = EXIT_IMMED;
					SET_PROCESS_EXITING_TRUE;
					SEND_AND_PUT_MSG(VARLSTCNT(8) ERR_KILLBYSIGUINFO, 6, GTMIMAGENAMETXT(image_type),
						process_id, sig, signal_info.send_pid, signal_info.send_uid);
					break;
				case GTMSIGINFO_ILOC + GTMSIGINFO_BADR:
					exit_state = EXIT_IMMED;
					SET_PROCESS_EXITING_TRUE;
					SEND_AND_PUT_MSG(VARLSTCNT(8) ERR_KILLBYSIGSINFO1, 6, GTMIMAGENAMETXT(image_type),
						process_id, sig, signal_info.int_iadr, signal_info.bad_vadr);
					break;
				case GTMSIGINFO_ILOC:
					exit_state = EXIT_IMMED;
					SET_PROCESS_EXITING_TRUE;
					SEND_AND_PUT_MSG(VARLSTCNT(7) ERR_KILLBYSIGSINFO2, 5, GTMIMAGENAMETXT(image_type),
						process_id, sig, signal_info.int_iadr);
					break;
				case GTMSIGINFO_BADR:
					exit_state = EXIT_IMMED;
					SET_PROCESS_EXITING_TRUE;
					SEND_AND_PUT_MSG(VARLSTCNT(7) ERR_KILLBYSIGSINFO3, 5, GTMIMAGENAMETXT(image_type),
						process_id, sig, signal_info.bad_vadr);
					break;
				default:
					exit_state = EXIT_IMMED;
					SET_PROCESS_EXITING_TRUE;
					GTMASSERT;
			}
			if (0 != signal_info.sig_err)
			{
				SEND_AND_PUT_MSG(VARLSTCNT(1) signal_info.sig_err);
			}
			break;
	} /* switch (sig) */
	/* Stop the timers but do not cancel them. This allows the timer structures to appear in the core where gtmpcat can
	 * extract them allowing us to see what was going on.
	 */
	sys_canc_timer();
	FFLUSH(stdout);
	if (!dont_want_core)
	{
		need_core = TRUE;
		gtm_fork_n_core();
	}
	/* If any special routines are registered to be driven on a signal, drive them now */
	if (0 != exi_condition)
	{
		/* If this is a GTM-ish process in runtime mode, call the routine to generate the zshow dump. This separate
		 * routine necessary because (1) generic_signal_handler() no longer calls condition handlers and (2) we couldn't
		 * use call_on_signal because it would already be in use in updproc() where it is also possible this routine
		 * needs to be called. Bypass this code if we didn't create a core since that means it is not a GTM
		 * issue that forced its demise (and since this is an uncaring interrupt, we could be in any number of
		 * situations that would cause a ZSHOW dump to explode). Better for user to use jobexam to cause a dump prior
		 * to terminating the process in a deferrable fashion.
		 */
		if (!dont_want_core && IS_MCODE_RUNNING && (NULL != (signal_routine = RFPTR(create_fatal_error_zshow_dmp_fptr))))
		{	/* note assignment of signal_routine above */
			SFPTR(create_fatal_error_zshow_dmp_fptr, NULL);
			(*signal_routine)(exi_condition, FALSE);
		}
		/* Some mupip functions define an entry point to drive on signals. Make sure to do this AFTER we create the
		 * dump file above as it may detach things (like the recvpool) we need to create the above dump.
		 */
		if (NULL != (signal_routine = call_on_signal))	/* Note assignment */
		{
			call_on_signal = NULL;		/* So we don't recursively call ourselves */
			(*signal_routine)();
		}
	}
	if (!IS_GTMSECSHR_IMAGE)
	{
		assert((EXIT_IMMED <= exit_state) || !exit_handler_active);
		exit(-exi_condition);
	} else
		return;
}
Beispiel #17
0
boolean_t mu_truncate(int4 truncate_percent)
{
	sgmnt_addrs		*csa;
	sgmnt_data_ptr_t 	csd;
	int			num_local_maps;
	int 			lmap_num, lmap_blk_num;
	int			bml_status, sigkill;
	int			save_errno;
	int			ftrunc_status;
	uint4			jnl_status;
	uint4			old_total, new_total;
	uint4			old_free, new_free;
	uint4			end_blocks;
	int4			blks_in_lmap, blk;
	gtm_uint64_t		before_trunc_file_size;
	off_t			trunc_file_size;
	off_t			padding;
	uchar_ptr_t		lmap_addr;
	boolean_t		was_crit;
	uint4			found_busy_blk;
	srch_blk_status		bmphist;
	srch_blk_status 	*blkhist;
	srch_hist		alt_hist;
	trans_num		curr_tn;
	blk_hdr_ptr_t		lmap_blk_hdr;
	block_id		*blkid_ptr;
	unix_db_info    	*udi;
	jnl_private_control	*jpc;
	jnl_buffer_ptr_t	jbp;
	char			*err_msg;
	intrpt_state_t		prev_intrpt_state;
	off_t			offset;
	DCL_THREADGBL_ACCESS;

	SETUP_THREADGBL_ACCESS;
	csa = cs_addrs;
	csd = cs_data;
	if (dba_mm == csd->acc_meth)
	{
		gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(4) ERR_MUTRUNCNOTBG, 2, REG_LEN_STR(gv_cur_region));
		return TRUE;
	}
	if ((GDSVCURR != csd->desired_db_format) || (csd->blks_to_upgrd != 0))
	{
		gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(4) ERR_MUTRUNCNOV4, 2, REG_LEN_STR(gv_cur_region));
		return TRUE;
	}
	if (csa->ti->free_blocks < (truncate_percent * csa->ti->total_blks / 100))
	{
		gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(5) ERR_MUTRUNCNOSPACE, 3, REG_LEN_STR(gv_cur_region), truncate_percent);
		return TRUE;
	}
	/* already checked for parallel truncates on this region --- see mupip_reorg.c */
	gv_target = NULL;
	assert(csa->nl->trunc_pid == process_id);
	assert(dba_mm != csd->acc_meth);
	old_total = csa->ti->total_blks;
	old_free = csa->ti->free_blocks;
	sigkill = 0;
	found_busy_blk = 0;
	memset(&alt_hist, 0, SIZEOF(alt_hist)); /* null-initialize history */
	assert(csd->bplmap == BLKS_PER_LMAP);
	end_blocks = old_total % BLKS_PER_LMAP; /* blocks in the last lmap (first one we start scanning) */
	if (0 == end_blocks)
		end_blocks = BLKS_PER_LMAP;
	num_local_maps = DIVIDE_ROUND_UP(old_total, BLKS_PER_LMAP);
	/* ======================================== PHASE 1 ======================================== */
	for (lmap_num = num_local_maps - 1; (lmap_num > 0 && !found_busy_blk); lmap_num--)
	{
		if (mu_ctrly_occurred || mu_ctrlc_occurred)
			return TRUE;
		assert(csa->ti->total_blks >= old_total); /* otherwise, a concurrent truncate happened... */
		if (csa->ti->total_blks != old_total) /* Extend (likely called by mupip extend) -- don't truncate */
		{
			gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(5) ERR_MUTRUNCNOSPACE, 3, REG_LEN_STR(gv_cur_region),
					truncate_percent);
			return TRUE;
		}
		lmap_blk_num = lmap_num * BLKS_PER_LMAP;
		if (csa->nl->highest_lbm_with_busy_blk >= lmap_blk_num)
		{
			found_busy_blk = lmap_blk_num;
			break;
		}
		blks_in_lmap = (lmap_num == num_local_maps - 1) ? end_blocks : BLKS_PER_LMAP;
		/* Loop through non-bitmap blocks of this lmap, do recycled2free */
		DBGEHND((stdout, "DBG:: lmap_num = [%lu], lmap_blk_num = [%lu], blks_in_lmap = [%lu]\n",
			lmap_num, lmap_blk_num, blks_in_lmap));
		for (blk = 1; blk < blks_in_lmap && blk != -1 && !found_busy_blk;)
		{
			t_begin(ERR_MUTRUNCFAIL, UPDTRNS_DB_UPDATED_MASK);
			for (;;) /* retry loop for recycled to free transactions */
			{
				curr_tn = csd->trans_hist.curr_tn;
				/* Read the nth local bitmap into memory */
				bmphist.blk_num = lmap_blk_num;
				bmphist.buffaddr = t_qread(bmphist.blk_num, &bmphist.cycle, &bmphist.cr);
				lmap_blk_hdr = (blk_hdr_ptr_t)bmphist.buffaddr;
				if (!(bmphist.buffaddr) || (BM_SIZE(BLKS_PER_LMAP) != lmap_blk_hdr->bsiz))
				{ /* Could not read the block successfully. Retry. */
					t_retry((enum cdb_sc)rdfail_detail);
					continue;
				}
				lmap_addr = bmphist.buffaddr + SIZEOF(blk_hdr);
				/* starting from the hint (blk itself), find the first busy or recycled block */
				blk = bml_find_busy_recycled(blk, lmap_addr, blks_in_lmap, &bml_status);
				assert(blk < BLKS_PER_LMAP);
				if (blk == -1 || blk >= blks_in_lmap)
				{ /* done with this lmap, continue to next */
					t_abort(gv_cur_region, csa);
					break;
				}
				else if (BLK_BUSY == bml_status || csa->nl->highest_lbm_with_busy_blk >= lmap_blk_num)
				{ /* stop processing blocks... skip ahead to phase 2 */
					found_busy_blk = lmap_blk_num;
					t_abort(gv_cur_region, csa);
					break;
				}
				else if (BLK_RECYCLED == bml_status)
				{ /* Write PBLK records for recycled blocks only if before_image journaling is
				   * enabled. t_end() takes care of checking if journaling is enabled and
				   * writing PBLK record. We have to at least mark the recycled block as free.
				   */
					RESET_UPDATE_ARRAY;
					update_trans = UPDTRNS_DB_UPDATED_MASK;
					*((block_id *)update_array_ptr) = blk;
					update_array_ptr += SIZEOF(block_id);
					*(int *)update_array_ptr = 0;
					alt_hist.h[1].blk_num = 0;
					alt_hist.h[0].level = 0;
					alt_hist.h[0].cse = NULL;
					alt_hist.h[0].tn = curr_tn;
					alt_hist.h[0].blk_num = lmap_blk_num + blk;
					alt_hist.h[0].buffaddr = t_qread(alt_hist.h[0].blk_num,
							&alt_hist.h[0].cycle, &alt_hist.h[0].cr);
					if (!alt_hist.h[0].buffaddr)
					{
						t_retry((enum cdb_sc)rdfail_detail);
						continue;
					}
					if (!t_recycled2free(&alt_hist.h[0]))
					{
						t_retry(cdb_sc_lostbmlcr);
						continue;
					}
					t_write_map(&bmphist, (unsigned char *)update_array, curr_tn, 0);
					/* Set the opcode for INCTN record written by t_end() */
					inctn_opcode = inctn_blkmarkfree;
					if ((trans_num)0 == t_end(&alt_hist, NULL, TN_NOT_SPECIFIED))
						continue;
					/* block processed, scan from the next one */
					blk++;
					break;
				} else
				{
					assert(t_tries < CDB_STAGNATE);
					t_retry(cdb_sc_badbitmap);
					continue;
				}
			} /* END recycled2free retry loop */
		} /* END scanning blocks of this particular lmap */
		/* Write PBLK for the bitmap block, in case it hasn't been written i.e. t_end() was never called above */
		/* Do a transaction that just increments the bitmap block's tn so that t_end() can do its thing */
		DBGEHND((stdout, "DBG:: bitmap block inctn -- lmap_blk_num = [%lu]\n", lmap_blk_num));
		t_begin(ERR_MUTRUNCFAIL, UPDTRNS_DB_UPDATED_MASK);
		for (;;)
		{
			RESET_UPDATE_ARRAY;
			BLK_ADDR(blkid_ptr, SIZEOF(block_id), block_id);
			*blkid_ptr = 0;
			update_trans = UPDTRNS_DB_UPDATED_MASK;
			inctn_opcode = inctn_mu_reorg; /* inctn_mu_truncate */
			curr_tn = csd->trans_hist.curr_tn;
			blkhist = &alt_hist.h[0];
			blkhist->blk_num = lmap_blk_num;
			blkhist->tn = curr_tn;
			blkhist->cse = NULL; /* start afresh (do not use value from previous retry) */
			/* Read the nth local bitmap into memory */
			blkhist->buffaddr = t_qread(lmap_blk_num, (sm_int_ptr_t)&blkhist->cycle, &blkhist->cr);
			lmap_blk_hdr = (blk_hdr_ptr_t)blkhist->buffaddr;
			if (!(blkhist->buffaddr) || (BM_SIZE(BLKS_PER_LMAP) != lmap_blk_hdr->bsiz))
			{ /* Could not read the block successfully. Retry. */
				t_retry((enum cdb_sc)rdfail_detail);
				continue;
			}
			t_write_map(blkhist, (unsigned char *)blkid_ptr, curr_tn, 0);
			blkhist->blk_num = 0; /* create empty history for bitmap block */
			if ((trans_num)0 == t_end(&alt_hist, NULL, TN_NOT_SPECIFIED))
				continue;
			break;
		}
	} /* END scanning lmaps */
	/* ======================================== PHASE 2 ======================================== */
	assert(!csa->now_crit);
	for (;;)
	{ /* wait for FREEZE, we don't want to truncate a frozen database */
		grab_crit(gv_cur_region);
		if (FROZEN_CHILLED(cs_data))
			DO_CHILLED_AUTORELEASE(csa, cs_data);
		if (!FROZEN(cs_data) && !IS_REPL_INST_FROZEN)
			break;
		rel_crit(gv_cur_region);
		while (FROZEN(cs_data) || IS_REPL_INST_FROZEN)
		{
			hiber_start(1000);
			if (FROZEN_CHILLED(cs_data) && CHILLED_AUTORELEASE(cs_data))
				break;
		}
	}
	assert(csa->nl->trunc_pid == process_id);
	/* Flush pending updates to disk. If this is not done, old updates can be flushed AFTER ftruncate, extending the file. */
	if (!wcs_flu(WCSFLU_FLUSH_HDR | WCSFLU_WRITE_EPOCH | WCSFLU_MSYNC_DB))
	{
		assert(FALSE);
		gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(6) ERR_BUFFLUFAILED, 4, LEN_AND_LIT("MUPIP REORG TRUNCATE"),
				DB_LEN_STR(gv_cur_region));
		rel_crit(gv_cur_region);
		return FALSE;
	}
	csa->nl->highest_lbm_with_busy_blk = MAX(found_busy_blk, csa->nl->highest_lbm_with_busy_blk);
	assert(IS_BITMAP_BLK(csa->nl->highest_lbm_with_busy_blk));
	new_total = MIN(old_total, csa->nl->highest_lbm_with_busy_blk + BLKS_PER_LMAP);
	if (mu_ctrly_occurred || mu_ctrlc_occurred)
	{
		rel_crit(gv_cur_region);
		return TRUE;
	} else if (csa->ti->total_blks != old_total || new_total == old_total)
	{
		assert(csa->ti->total_blks >= old_total); /* Better have been an extend, not a truncate... */
		gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(5) ERR_MUTRUNCNOSPACE, 3, REG_LEN_STR(gv_cur_region), truncate_percent);
		rel_crit(gv_cur_region);
		return TRUE;
	} else if (GDSVCURR != csd->desired_db_format || csd->blks_to_upgrd != 0 || !csd->fully_upgraded)
	{
		gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(4) ERR_MUTRUNCNOV4, 2, REG_LEN_STR(gv_cur_region));
		rel_crit(gv_cur_region);
		return TRUE;
	} else if (SNAPSHOTS_IN_PROG(csa->nl))
	{
		gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(4) ERR_MUTRUNCSSINPROG, 2, REG_LEN_STR(gv_cur_region));
		rel_crit(gv_cur_region);
		return TRUE;
	} else if (BACKUP_NOT_IN_PROGRESS != cs_addrs->nl->nbb)
	{
		gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(4) ERR_MUTRUNCBACKINPROG, 2, REG_LEN_STR(gv_cur_region));
		rel_crit(gv_cur_region);
		return TRUE;
	}
	DEFER_INTERRUPTS(INTRPT_IN_TRUNC, prev_intrpt_state);
	if (JNL_ENABLED(csa))
	{ /* Write JRT_TRUNC and INCTN records */
		if (!jgbl.dont_reset_gbl_jrec_time)
		SET_GBL_JREC_TIME;	/* needed before jnl_ensure_open as that can write jnl records */
		jpc = csa->jnl;
		jbp = jpc->jnl_buff;
		/* Before writing to jnlfile, adjust jgbl.gbl_jrec_time if needed to maintain time order
		 * of jnl records. This needs to be done BEFORE the jnl_ensure_open as that could write
		 * journal records (if it decides to switch to a new journal file).
		 */
		ADJUST_GBL_JREC_TIME(jgbl, jbp);
		jnl_status = jnl_ensure_open(gv_cur_region, csa);
		if (SS_NORMAL != jnl_status)
			send_msg_csa(CSA_ARG(csa) VARLSTCNT(6) jnl_status, 4, JNL_LEN_STR(csd), DB_LEN_STR(gv_cur_region));
		else
		{
			if (0 == jpc->pini_addr)
				jnl_put_jrt_pini(csa);
			jnl_write_trunc_rec(csa, old_total, csa->ti->free_blocks, new_total);
			inctn_opcode = inctn_mu_reorg;
			jnl_write_inctn_rec(csa);
			jnl_status = jnl_flush(gv_cur_region);
			if (SS_NORMAL != jnl_status)
			{
				send_msg_csa(CSA_ARG(csa) VARLSTCNT(9) ERR_JNLFLUSH, 2, JNL_LEN_STR(csd),
					ERR_TEXT, 2, RTS_ERROR_TEXT("Error with journal flush during mu_truncate"),
					jnl_status);
				assert(NOJNL == jpc->channel); /* jnl file lost has been triggered */
			}
		}
	}
	/* Good to go ahead and REALLY truncate (reduce total_blks, clear cache_array, FTRUNCATE) */
	curr_tn = csa->ti->curr_tn;
	CHECK_TN(csa, csd, curr_tn);
	udi = FILE_INFO(gv_cur_region);
	/* Information used by recover_truncate to check if the file size and csa->ti->total_blks are INCONSISTENT */
	trunc_file_size = BLK_ZERO_OFF(csd->start_vbn) + ((off_t)csd->blk_size * (new_total + 1));
	csd->after_trunc_total_blks = new_total;
	csd->before_trunc_free_blocks = csa->ti->free_blocks;
	csd->before_trunc_total_blks = old_total; /* Flags interrupted truncate for recover_truncate */
	/* file size and total blocks: INCONSISTENT */
	csa->ti->total_blks = new_total;
	/* past the point of no return -- shared memory intact */
	assert(csa->ti->free_blocks >= DELTA_FREE_BLOCKS(old_total, new_total));
	csa->ti->free_blocks -= DELTA_FREE_BLOCKS(old_total, new_total);
	new_free = csa->ti->free_blocks;
	KILL_TRUNC_TEST(WBTEST_CRASH_TRUNCATE_1); /* 55 : Issue a kill -9 before 1st fsync */
	fileheader_sync(gv_cur_region);
	DB_FSYNC(gv_cur_region, udi, csa, db_fsync_in_prog, save_errno);
	CHECK_DBSYNC(gv_cur_region, save_errno);
	/* past the point of no return -- shared memory deleted */
	KILL_TRUNC_TEST(WBTEST_CRASH_TRUNCATE_2); /* 56 : Issue a kill -9 after 1st fsync */
	clear_cache_array(csa, csd, gv_cur_region, new_total, old_total);
	offset = (off_t)BLK_ZERO_OFF(csd->start_vbn) + (off_t)new_total * csd->blk_size;
	save_errno = db_write_eof_block(udi, udi->fd, csd->blk_size, offset, &(TREF(dio_buff)));
	if (0 != save_errno)
	{
		err_msg = (char *)STRERROR(errno);
		rts_error_csa(CSA_ARG(csa) VARLSTCNT(6) ERR_MUTRUNCERROR, 4, REG_LEN_STR(gv_cur_region), LEN_AND_STR(err_msg));
		return FALSE;
	}
	KILL_TRUNC_TEST(WBTEST_CRASH_TRUNCATE_3); /* 57 : Issue a kill -9 after reducing csa->ti->total_blks, before FTRUNCATE */
	/* Execute an ftruncate() and truncate the DB file
	 * ftruncate() is a SYSTEM CALL on almost all platforms (except SunOS)
	 * It ignores kill -9 signal till its operation is completed.
	 * So we can safely assume that the result of ftruncate() will be complete.
	 */
	FTRUNCATE(FILE_INFO(gv_cur_region)->fd, trunc_file_size, ftrunc_status);
	if (0 != ftrunc_status)
	{
		err_msg = (char *)STRERROR(errno);
		rts_error_csa(CSA_ARG(csa) VARLSTCNT(6) ERR_MUTRUNCERROR, 4, REG_LEN_STR(gv_cur_region), LEN_AND_STR(err_msg));
		/* should go through recover_truncate now, which will again try to FTRUNCATE */
		return FALSE;
	}
	/* file size and total blocks: CONSISTENT (shrunk) */
	KILL_TRUNC_TEST(WBTEST_CRASH_TRUNCATE_4); /* 58 : Issue a kill -9 after FTRUNCATE, before 2nd fsync */
	csa->nl->root_search_cycle++;	/* Force concurrent processes to restart in t_end/tp_tend to make sure no one
					 * tries to commit updates past the end of the file. Bitmap validations together
					 * with highest_lbm_with_busy_blk should actually be sufficient, so this is
					 * just to be safe.
					 */
	csd->before_trunc_total_blks = 0; /* indicate CONSISTENT */
	/* Increment TN */
	assert(csa->ti->early_tn == csa->ti->curr_tn);
	csd->trans_hist.early_tn = csd->trans_hist.curr_tn + 1;
	INCREMENT_CURR_TN(csd);
	fileheader_sync(gv_cur_region);
	DB_FSYNC(gv_cur_region, udi, csa, db_fsync_in_prog, save_errno);
	KILL_TRUNC_TEST(WBTEST_CRASH_TRUNCATE_5); /* 58 : Issue a kill -9 after after 2nd fsync */
	CHECK_DBSYNC(gv_cur_region, save_errno);
	ENABLE_INTERRUPTS(INTRPT_IN_TRUNC, prev_intrpt_state);
	curr_tn = csa->ti->curr_tn;
	rel_crit(gv_cur_region);
	send_msg_csa(CSA_ARG(csa) VARLSTCNT(7) ERR_MUTRUNCSUCCESS, 5, DB_LEN_STR(gv_cur_region), old_total, new_total, &curr_tn);
	util_out_print("Truncated region: !AD. Reduced total blocks from [!UL] to [!UL]. Reduced free blocks from [!UL] to [!UL].",
					FLUSH, REG_LEN_STR(gv_cur_region), old_total, new_total, old_free, new_free);
	return TRUE;
} /* END of mu_truncate() */
uint4 jnl_file_lost(jnl_private_control *jpc, uint4 jnl_stat)
{	/* Notify operator and terminate journaling */
	unsigned int	status;
	sgmnt_addrs	*csa;
	seq_num		reg_seqno, jnlseqno;
	boolean_t	was_lockid = FALSE, instfreeze_environ;

	DCL_THREADGBL_ACCESS;

	SETUP_THREADGBL_ACCESS;
	switch(jpc->region->dyn.addr->acc_meth)
	{
	case dba_mm:
	case dba_bg:
		csa = &FILE_INFO(jpc->region)->s_addrs;
		break;
	default:
		assertpro(FALSE && jpc->region->dyn.addr->acc_meth);
	}
#	ifdef VMS
	/* The following assert has been removed as it could be FALSE if the caller is "jnl_file_extend"
	 *	assert(0 != memcmp(csa->nl->jnl_file.jnl_file_id.fid, zero_fid, SIZEOF(zero_fid)));
	 */
#	endif
	assert(csa->now_crit);
	/* We issue an rts_error (instead of shutting off journaling) in the following cases :					{BYPASSOK}
	 * 1) $gtm_error_on_jnl_file_lost is set to issue runtime error (if not already issued) in case of journaling issues.
	 * 2) The process has the given message set in $gtm_custom_errors (indicative of instance freeze on error setup)
	 *    in which case the goal is to never shut-off journaling
	 */
	UNIX_ONLY(assert(jnlpool.jnlpool_ctl == jnlpool_ctl));
	UNIX_ONLY(instfreeze_environ = INST_FREEZE_ON_MSG_ENABLED(csa, jnl_stat));
	VMS_ONLY(instfreeze_environ = FALSE);
	if ((JNL_FILE_LOST_ERRORS == TREF(error_on_jnl_file_lost)) || instfreeze_environ)
	{
		VMS_ONLY(assert(FALSE)); /* Not fully implemented / supported on VMS. */
		if (!process_exiting || instfreeze_environ || !csa->jnl->error_reported)
		{
			csa->jnl->error_reported = TRUE;
			in_wcs_recover = FALSE;	/* in case we're called in wcs_recover() */
			if (SS_NORMAL != jpc->status)
				rts_error_csa(CSA_ARG(csa) VARLSTCNT(7) jnl_stat, 4, JNL_LEN_STR(csa->hdr),
						DB_LEN_STR(gv_cur_region), jpc->status);
			else
				rts_error_csa(CSA_ARG(csa) VARLSTCNT(6) jnl_stat, 4, JNL_LEN_STR(csa->hdr),
						DB_LEN_STR(gv_cur_region));
		}
		return jnl_stat;
	}
	if (0 != jnl_stat)
		jnl_send_oper(jpc, jnl_stat);
	csa->hdr->jnl_state = jnl_closed;
	jpc->jnl_buff->cycle++; /* increment shared cycle so all future callers of jnl_ensure_open recognize journal switch */
	assert(jpc->cycle < jpc->jnl_buff->cycle);
	if (REPL_ENABLED(csa->hdr))
	{
		csa->hdr->repl_state = repl_was_open;
		reg_seqno = csa->hdr->reg_seqno;
		jnlseqno = (NULL != jnlpool.jnlpool_ctl) ? jnlpool.jnlpool_ctl->jnl_seqno : MAX_SEQNO;
		send_msg_csa(CSA_ARG(csa) VARLSTCNT(8) ERR_REPLJNLCLOSED, 6, DB_LEN_STR(jpc->region), &reg_seqno, &reg_seqno,
				&jnlseqno, &jnlseqno);
	} else
		send_msg_csa(CSA_ARG(csa) VARLSTCNT(5) ERR_JNLCLOSED, 3, DB_LEN_STR(jpc->region), &csa->ti->curr_tn);
#ifdef VMS
	/* We can get a jnl_file_lost before the file is even created, so locking is done only if the lock exist */
	if (0 != csa->jnl->jnllsb->lockid)
	{
		was_lockid = TRUE;
		status = gtm_enqw(EFN$C_ENF, LCK$K_EXMODE, csa->jnl->jnllsb, LCK$M_CONVERT | LCK$M_NODLCKBLK,
				NULL, 0, NULL, 0, NULL, PSL$C_USER, 0);
		if (SS$_NORMAL == status)
			status = csa->jnl->jnllsb->cond;
	}
	jnl_file_close(jpc->region, FALSE, FALSE);
	if (was_lockid)
	{
		if (SS$_NORMAL == status)
			status = gtm_deq(csa->jnl->jnllsb->lockid, NULL, PSL$C_USER, 0);
		assertpro(SS$_NORMAL == status);
	}
# else
	jnl_file_close(jpc->region, FALSE, FALSE);
#endif
	return EXIT_NRM;
}
Beispiel #19
0
void gv_rundown(void)
{
	gd_region	*r_top, *r_save, *r_local;
	gd_addr		*addr_ptr;
	sgm_info	*si;
	int4		rundown_status = EXIT_NRM;			/* if gds_rundown went smoothly */
#	ifdef VMS
	vms_gds_info	*gds_info;
#	elif UNIX
	unix_db_info	*udi;
#	endif
#if defined(DEBUG) && defined(UNIX)
	sgmnt_addrs		*csa;
#	endif
	DCL_THREADGBL_ACCESS;

	SETUP_THREADGBL_ACCESS;

	r_save = gv_cur_region;		/* Save for possible core dump */
	gvcmy_rundown();
	ENABLE_AST

	if (pool_init)
		rel_lock(jnlpool.jnlpool_dummy_reg);
	for (addr_ptr = get_next_gdr(NULL); addr_ptr; addr_ptr = get_next_gdr(addr_ptr))
	{
		for (r_local = addr_ptr->regions, r_top = r_local + addr_ptr->n_regions; r_local < r_top; r_local++)
		{
			if (r_local->open && !r_local->was_open && dba_cm != r_local->dyn.addr->acc_meth)
			{	/* Rundown has already occurred for GT.CM client regions through gvcmy_rundown() above.
			 	 * Hence the (dba_cm != ...) check in the if above. Note that for GT.CM client regions,
				 * region->open is TRUE although cs_addrs is NULL.
			 	 */
#				if defined(DEBUG) && defined(UNIX)
				if (is_jnlpool_creator && ANTICIPATORY_FREEZE_AVAILABLE && TREF(gtm_test_fake_enospc))
				{	/* Clear ENOSPC faking now that we are running down */
					csa = REG2CSA(r_local);
					if (csa->nl->fake_db_enospc || csa->nl->fake_jnl_enospc)
					{
						send_msg_csa(CSA_ARG(NULL) VARLSTCNT(8) ERR_TEXT, 2, DB_LEN_STR(r_local), ERR_TEXT,
							     2, LEN_AND_LIT("Resetting fake_db_enospc and fake_jnl_enospc"));
						csa->nl->fake_db_enospc = FALSE;
						csa->nl->fake_jnl_enospc = FALSE;
					}
				}
#				endif
				gv_cur_region = r_local;
			        tp_change_reg();
				UNIX_ONLY(rundown_status |=) gds_rundown();

				/* Now that gds_rundown is done, free up the memory associated with the region.
				 * Ideally the following memory freeing code should go to gds_rundown, but
				 * GT.CM calls gds_rundown() and we want to reuse memory for GT.CM.
				 */
				if (NULL != cs_addrs)
				{
					if (NULL != cs_addrs->dir_tree)
						FREE_CSA_DIR_TREE(cs_addrs);
					if (cs_addrs->sgm_info_ptr)
					{
						si = cs_addrs->sgm_info_ptr;
						/* It is possible we got interrupted before initializing all fields of "si"
						 * completely so account for NULL values while freeing/releasing those fields.
						 */
						assert((si->tp_csa == cs_addrs) || (NULL == si->tp_csa));
						if (si->jnl_tail)
						{
							CAREFUL_FREEUP_BUDDY_LIST(si->format_buff_list);
							CAREFUL_FREEUP_BUDDY_LIST(si->jnl_list);
						}
						CAREFUL_FREEUP_BUDDY_LIST(si->recompute_list);
						CAREFUL_FREEUP_BUDDY_LIST(si->new_buff_list);
						CAREFUL_FREEUP_BUDDY_LIST(si->tlvl_info_list);
						CAREFUL_FREEUP_BUDDY_LIST(si->tlvl_cw_set_list);
						CAREFUL_FREEUP_BUDDY_LIST(si->cw_set_list);
						if (NULL != si->blks_in_use)
						{
							free_hashtab_int4(si->blks_in_use);
							free(si->blks_in_use);
							si->blks_in_use = NULL;
						}
						if (si->cr_array_size)
						{
							assert(NULL != si->cr_array);
							if (NULL != si->cr_array)
								free(si->cr_array);
						}
						if (NULL != si->first_tp_hist)
							free(si->first_tp_hist);
						free(si);
					}
					if (cs_addrs->jnl)
					{
						assert(&FILE_INFO(cs_addrs->jnl->region)->s_addrs == cs_addrs);
						if (cs_addrs->jnl->jnllsb)
						{
							UNIX_ONLY(assert(FALSE));
							free(cs_addrs->jnl->jnllsb);
						}
						free(cs_addrs->jnl);
					}
					GTMCRYPT_ONLY(
						if (cs_addrs->encrypted_blk_contents)
							free(cs_addrs->encrypted_blk_contents);
					)
				}
				assert(gv_cur_region->dyn.addr->file_cntl->file_info);
				VMS_ONLY(
					gds_info = (vms_gds_info *)gv_cur_region->dyn.addr->file_cntl->file_info;
					if (gds_info->xabpro)
						free(gds_info->xabpro);
					if (gds_info->xabfhc)
						free(gds_info->xabfhc);
					if (gds_info->nam)
					{
						free(gds_info->nam->nam$l_esa);
						free(gds_info->nam);
					}
					if (gds_info->fab)
						free(gds_info->fab);
				)
				free(gv_cur_region->dyn.addr->file_cntl->file_info);
				free(gv_cur_region->dyn.addr->file_cntl);
			}
			r_local->open = r_local->was_open = FALSE;
		}
	}
Beispiel #20
0
uint4	 gdsfilext(uint4 blocks, uint4 filesize, boolean_t trans_in_prog)
{
	sm_uc_ptr_t		old_base[2], mmap_retaddr;
	boolean_t		was_crit, is_mm;
	int			result, save_errno, status;
	DEBUG_ONLY(int		first_save_errno);
	uint4			new_bit_maps, bplmap, map, new_blocks, new_total, max_tot_blks, old_total;
	uint4			jnl_status;
	gtm_uint64_t		avail_blocks, mmap_sz;
	off_t			new_eof, new_size;
	trans_num		curr_tn;
	unix_db_info		*udi;
	inctn_opcode_t		save_inctn_opcode;
	int4			prev_extend_blks_to_upgrd;
	jnl_private_control	*jpc;
	jnl_buffer_ptr_t	jbp;
	cache_rec_ptr_t         cr;
	DCL_THREADGBL_ACCESS;

	SETUP_THREADGBL_ACCESS;
	assert(!IS_DSE_IMAGE);
	assert((cs_addrs->nl == NULL) || (process_id != cs_addrs->nl->trunc_pid)); /* mu_truncate shouldn't extend file... */
	assert(!process_exiting);
	DEBUG_ONLY(old_base[0] = old_base[1] = NULL);
	assert(!gv_cur_region->read_only);
	udi = FILE_INFO(gv_cur_region);
	is_mm = (dba_mm == cs_addrs->hdr->acc_meth);
#	if !defined(MM_FILE_EXT_OK)
	if (!udi->grabbed_access_sem && is_mm)
		return (uint4)(NO_FREE_SPACE); /* should this be changed to show extension not allowed ? */
#	endif
	/* Both blocks and total blocks are unsigned ints so make sure we aren't asking for huge numbers that will
	   overflow and end up doing silly things.
	*/
	assert((blocks <= (MAXTOTALBLKS(cs_data) - cs_data->trans_hist.total_blks)) || WBTEST_ENABLED(WBTEST_FILE_EXTEND_ERROR));
#	if defined(__sun) || defined(__hpux)
	cs_data->defer_allocate = TRUE;
#	endif
	if (!blocks && (cs_data->defer_allocate || (TRANS_IN_PROG_TRUE == trans_in_prog)))
		return (uint4)(NO_FREE_SPACE); /* should this be changed to show extension not enabled ? */
	bplmap = cs_data->bplmap;
	/* New total of non-bitmap blocks will be number of current, non-bitmap blocks, plus new blocks desired
	 * There are (bplmap - 1) non-bitmap blocks per bitmap, so add (bplmap - 2) to number of non-bitmap blocks
	 *      and divide by (bplmap - 1) to get total number of bitmaps for expanded database. (must round up in this
	 *      manner as every non-bitmap block must have an associated bitmap)
	 * Current number of bitmaps is (total number of current blocks + bplmap - 1) / bplmap.
	 * Subtract current number of bitmaps from number needed for expanded database to get number of new bitmaps needed.
	 */
	new_bit_maps = DIVIDE_ROUND_UP(cs_data->trans_hist.total_blks
			- DIVIDE_ROUND_UP(cs_data->trans_hist.total_blks, bplmap) + blocks, bplmap - 1)
			- DIVIDE_ROUND_UP(cs_data->trans_hist.total_blks, bplmap);
	new_blocks = blocks + new_bit_maps;
	assert((0 < (int)new_blocks) || (!cs_data->defer_allocate && (0 == new_blocks)));
	if (new_blocks + cs_data->trans_hist.total_blks > MAXTOTALBLKS(cs_data))
	{
		assert(WBTEST_ENABLED(WBTEST_FILE_EXTEND_ERROR));
		send_msg_csa(CSA_ARG(cs_addrs) VARLSTCNT(1) ERR_TOTALBLKMAX);
		return (uint4)(NO_FREE_SPACE);
	}
	if (0 != (save_errno = disk_block_available(udi->fd, &avail_blocks, FALSE)))
	{
		send_msg_csa(CSA_ARG(cs_addrs) VARLSTCNT(5) ERR_DBFILERR, 2, DB_LEN_STR(gv_cur_region), save_errno);
		rts_error_csa(CSA_ARG(cs_addrs) VARLSTCNT(5) ERR_DBFILERR, 2, DB_LEN_STR(gv_cur_region), save_errno);
	} else
	{
		if (!(gtmDebugLevel & GDL_IgnoreAvailSpace))
		{	/* Bypass this space check if debug flag above is on. Allows us to create a large sparce DB
			 * in space it could never fit it if wasn't sparse. Needed for some tests.
			 */
			avail_blocks = avail_blocks / (cs_data->blk_size / DISK_BLOCK_SIZE);
			if ((blocks * EXTEND_WARNING_FACTOR) > avail_blocks)
			{
				if (blocks > (uint4)avail_blocks)
				{
					if (!INST_FREEZE_ON_NOSPC_ENABLED(cs_addrs))
						return (uint4)(NO_FREE_SPACE);
					else
						send_msg_csa(CSA_ARG(cs_addrs) VARLSTCNT(6) MAKE_MSG_WARNING(ERR_NOSPACEEXT), 4,
							DB_LEN_STR(gv_cur_region), new_blocks, (uint4)avail_blocks);
				} else
					send_msg_csa(CSA_ARG(cs_addrs) VARLSTCNT(5) ERR_DSKSPACEFLOW, 3, DB_LEN_STR(gv_cur_region),
						 (uint4)(avail_blocks - ((new_blocks <= avail_blocks) ? new_blocks : 0)));
			}
		}
	}
#	ifdef DEBUG
	if (WBTEST_ENABLED(WBTEST_MM_CONCURRENT_FILE_EXTEND) && dollar_tlevel && !MEMCMP_LIT(gv_cur_region->rname, "DEFAULT"))
	{
		SYSTEM("$gtm_dist/mumps -run $gtm_wbox_mrtn");
		assert(1 == cs_addrs->nl->wbox_test_seq_num);	/* should have been set by mubfilcpy */
		cs_addrs->nl->wbox_test_seq_num = 2;	/* signal mupip backup to stop sleeping in mubfilcpy */
	}
#	endif
	/* From here on, we need to use GDSFILEXT_CLNUP before returning to the caller */
	was_crit = cs_addrs->now_crit;
	assert(!cs_addrs->hold_onto_crit || was_crit);
	/* If we are coming from mupip_extend (which gets crit itself) we better have waited for any unfreezes to occur.
	 * If we are coming from online rollback (when that feature is available), we will come in holding crit and in
	 * 	the final retry. In that case too, we expect to have waited for unfreezes to occur in the caller itself.
	 * Therefore if we are coming in holding crit from MUPIP, we expect the db to be unfrozen so no need to wait for
	 * freeze.
	 * If we are coming from GT.M and final retry (in which case we come in holding crit) we expect to have waited
	 * 	for any unfreezes (by invoking tp_crit_all_regions) to occur (TP or non-TP) before coming into this
	 *	function. However, there is one exception. In the final retry, if tp_crit_all_regions notices that
	 *	at least one of the participating regions did ONLY READs, it will not wait for any freeze on THAT region
	 *	to complete before grabbing crit. Later, in the final retry, if THAT region did an update which caused
	 *	op_tcommit to invoke bm_getfree->gdsfilext, then we would have come here with a frozen region on which
	 *	we hold crit.
	 */
	assert(!was_crit || !FROZEN_HARD(cs_data) || (dollar_tlevel && (CDB_STAGNATE <= t_tries)));
	/*
	 * If we are in the final retry and already hold crit, it is possible that csa->nl->wc_blocked is also set to TRUE
	 * (by a concurrent process in phase2 which encountered an error in the midst of commit and secshr_db_clnup
	 * finished the job for it). In this case we do NOT want to invoke wcs_recover as that will update the "bt"
	 * transaction numbers without correspondingly updating the history transaction numbers (effectively causing
	 * a cdb_sc_blkmod type of restart). Therefore do NOT call grab_crit (which unconditionally invokes wcs_recover)
	 * if we already hold crit.
	 */
	if (!was_crit)
	{
		for ( ; ; )
		{
			grab_crit(gv_cur_region);
			if (FROZEN_CHILLED(cs_data))
				DO_CHILLED_AUTORELEASE(cs_addrs, cs_data);
			if (!FROZEN(cs_data) && !IS_REPL_INST_FROZEN)
				break;
			rel_crit(gv_cur_region);
			while (FROZEN(cs_data) || IS_REPL_INST_FROZEN)
			{
				hiber_start(1000);
				if (FROZEN_CHILLED(cs_data) && CHILLED_AUTORELEASE(cs_data))
					break;
			}
		}
	} else if (FROZEN_HARD(cs_data) && dollar_tlevel)
	{	/* We don't want to continue with file extension as explained above. Hence return with an error code which
		 * op_tcommit will recognize (as a cdb_sc_needcrit/cdb_sc_instancefreeze type of restart) and restart accordingly.
		 */
		assert(CDB_STAGNATE <= t_tries);
		GDSFILEXT_CLNUP;
		return (uint4)FINAL_RETRY_FREEZE_PROG;
	} else
		WAIT_FOR_REGION_TO_UNCHILL(cs_addrs, cs_data);
	if (IS_REPL_INST_FROZEN && trans_in_prog)
	{
		assert(CDB_STAGNATE <= t_tries);
		GDSFILEXT_CLNUP;
		return (uint4)FINAL_RETRY_INST_FREEZE;
	}
	assert(cs_addrs->ti->total_blks == cs_data->trans_hist.total_blks);
	old_total = cs_data->trans_hist.total_blks;
	if (old_total != filesize)
	{	/* Somebody else has already extended it, since we are in crit, this is trust-worthy. However, in case of MM,
		 * we still need to remap the database
		 */
		assert((old_total > filesize) || !is_mm);
		/* For BG, someone else could have truncated or extended - we have no idea */
		GDSFILEXT_CLNUP;
		return (SS_NORMAL);
	}
	if (trans_in_prog && SUSPICIOUS_EXTEND)
	{
		if (!was_crit)
		{
			GDSFILEXT_CLNUP;
			return (uint4)(EXTEND_SUSPECT);
		}
		/* If free_blocks counter is not ok, then correct it. Do the check again. If still fails, then it means we held
		 * crit through bm_getfree into gdsfilext and still didn't get it right.
		 */
		assertpro(!is_free_blks_ctr_ok() && !SUSPICIOUS_EXTEND);
	}
	if (JNL_ENABLED(cs_data))
	{
		if (!jgbl.dont_reset_gbl_jrec_time)
			SET_GBL_JREC_TIME;	/* needed before jnl_ensure_open as that can write jnl records */
		jpc = cs_addrs->jnl;
		jbp = jpc->jnl_buff;
		/* Before writing to jnlfile, adjust jgbl.gbl_jrec_time if needed to maintain time order
		 * of jnl records. This needs to be done BEFORE the jnl_ensure_open as that could write
		 * journal records (if it decides to switch to a new journal file).
		 */
		ADJUST_GBL_JREC_TIME(jgbl, jbp);
		jnl_status = jnl_ensure_open(gv_cur_region, cs_addrs);
		if (jnl_status)
		{
			GDSFILEXT_CLNUP;
			send_msg_csa(CSA_ARG(cs_addrs) VARLSTCNT(6) jnl_status, 4, JNL_LEN_STR(cs_data), DB_LEN_STR(gv_cur_region));
			return (uint4)(NO_FREE_SPACE);	/* should have better return status */
		}
	}
	if (is_mm)
	{
		cs_addrs->nl->mm_extender_pid = process_id;
		status = wcs_wtstart(gv_cur_region, 0, NULL, NULL);
		cs_addrs->nl->mm_extender_pid = 0;
		assertpro(SS_NORMAL == status);
		old_base[0] = cs_addrs->db_addrs[0];
		old_base[1] = cs_addrs->db_addrs[1];
		cs_addrs->db_addrs[0] = NULL; /* don't rely on it until the mmap below */
#		ifdef _AIX
		status = shmdt(old_base[0] - BLK_ZERO_OFF(cs_data->start_vbn));
#		else
		status = munmap((caddr_t)old_base[0], (size_t)(old_base[1] - old_base[0]));
#		endif
		if (0 != status)
		{
			save_errno = errno;
			GDSFILEXT_CLNUP;
			send_msg_csa(CSA_ARG(cs_addrs) VARLSTCNT(12) ERR_DBFILERR, 2, DB_LEN_STR(gv_cur_region),
					ERR_SYSCALL, 5, LEN_AND_STR(MEM_UNMAP_SYSCALL), CALLFROM, save_errno);
			return (uint4)(NO_FREE_SPACE);
		}
	} else
	{	/* Due to concurrency issues, it is possible some process had issued a disk read of the GDS block# corresponding
		 * to "old_total" right after a truncate wrote a GDS-block of zeros on disk (to signal end of the db file).
		 * If so, the global buffer containing this block needs to be invalidated now as part of the extend. If not, it is
		 * possible the EOF block on disk is now going to be overwritten by a properly initialized bitmap block (as part
		 * of the gdsfilext below) while the global buffer continues to have an incorrect copy of that bitmap block and
		 * this in turn would cause XXXX failures due to a bad bitmap block in shared memory. (GTM-7519)
		 */
		cr = db_csh_get((block_id)old_total);
		if ((NULL != cr) && ((cache_rec_ptr_t)CR_NOTVALID != cr))
		{
			assert((0 == cr->dirty) && (0 == cr->bt_index) && !cr->stopped);
			cr->cycle++;
			cr->blk = CR_BLKEMPTY;
		}
	}
	CHECK_TN(cs_addrs, cs_data, cs_data->trans_hist.curr_tn);	/* can issue rts_error TNTOOLARGE */
	new_total = old_total + new_blocks;
	new_eof = BLK_ZERO_OFF(cs_data->start_vbn) + ((off_t)new_total * cs_data->blk_size);
#	if !defined(__sun) && !defined(__hpux)
	if (!cs_data->defer_allocate)
	{
		new_size = new_eof + cs_data->blk_size;
		save_errno = posix_fallocate(udi->fd, 0, new_size);
		DEBUG_ONLY(first_save_errno = save_errno);
		if ((ENOSPC == save_errno) && IS_GTM_IMAGE)
			save_errno = extend_wait_for_fallocate(udi, new_size);
		if (0 != save_errno)
		{
			GDSFILEXT_CLNUP;
			assert(ENOSPC == save_errno);
			if (ENOSPC != save_errno)
				send_msg_csa(CSA_ARG(cs_addrs) VARLSTCNT(5) ERR_PREALLOCATEFAIL, 2, DB_LEN_STR(gv_cur_region),
					     save_errno);
			return (uint4)(NO_FREE_SPACE);
		}
	}
#	endif
	save_errno = db_write_eof_block(udi, udi->fd, cs_data->blk_size, new_eof, &(TREF(dio_buff)));
	if ((ENOSPC == save_errno) && IS_GTM_IMAGE)
		save_errno = extend_wait_for_write(udi, cs_data->blk_size, new_eof);
	if (0 != save_errno)
	{
		GDSFILEXT_CLNUP;
		if (ENOSPC != save_errno)
			send_msg_csa(CSA_ARG(cs_addrs) VARLSTCNT(5) ERR_DBFILERR, 2, DB_LEN_STR(gv_cur_region), save_errno);
		return (uint4)(NO_FREE_SPACE);
	}
	if (WBTEST_ENABLED(WBTEST_FILE_EXTEND_INTERRUPT_1))
	{
		LONG_SLEEP(600);
		assert(FALSE);
	}
	/* Ensure the EOF and metadata get to disk BEFORE any bitmap writes. Otherwise, the file size could no longer reflect
	 * a proper extent and subsequent invocations of gdsfilext could corrupt the database.
	 */
	if (!IS_STATSDB_CSA(cs_addrs))
	{
		GTM_DB_FSYNC(cs_addrs, udi->fd, status);
		assert(0 == status);
		if (0 != status)
		{
			GDSFILEXT_CLNUP;
			send_msg_csa(CSA_ARG(cs_addrs) VARLSTCNT(8) ERR_DBFILERR, 5,
						RTS_ERROR_LITERAL("fsync1()"), CALLFROM, status);
			return (uint4)(NO_FREE_SPACE);
		}
	}
	if (WBTEST_ENABLED(WBTEST_FILE_EXTEND_INTERRUPT_2))
	{
		LONG_SLEEP(600);
		assert(FALSE); /* Should be killed before that */
	}
	DEBUG_ONLY(prev_extend_blks_to_upgrd = cs_data->blks_to_upgrd;)
Beispiel #21
0
int4 gds_rundown(void)
{
	boolean_t		canceled_dbsync_timer, canceled_flush_timer, ok_to_write_pfin;
	boolean_t		have_standalone_access, ipc_deleted, err_caught;
	boolean_t		is_cur_process_ss_initiator, remove_shm, vermismatch, we_are_last_user, we_are_last_writer, is_mm;
	boolean_t		unsafe_last_writer;
	char			time_str[CTIME_BEFORE_NL + 2]; /* for GET_CUR_TIME macro */
	gd_region		*reg;
	int			save_errno, status, rc;
	int4			semval, ftok_semval, sopcnt, ftok_sopcnt;
	short			crash_count;
	sm_long_t		munmap_len;
	sgmnt_addrs		*csa;
	sgmnt_data_ptr_t	csd;
	node_local_ptr_t	cnl;
	struct shmid_ds		shm_buf;
	struct sembuf		sop[2], ftok_sop[2];
	uint4           	jnl_status;
	unix_db_info		*udi;
	jnl_private_control	*jpc;
	jnl_buffer_ptr_t	jbp;
	shm_snapshot_t		*ss_shm_ptr;
	uint4			ss_pid, onln_rlbk_pid, holder_pid;
	boolean_t		was_crit;
	boolean_t		safe_mode; /* Do not flush or take down shared memory. */
	boolean_t		bypassed_ftok = FALSE, bypassed_access = FALSE, may_bypass_ftok, inst_is_frozen,
				ftok_counter_halted,
				access_counter_halted;
	int			secshrstat;
	intrpt_state_t		prev_intrpt_state;
	DCL_THREADGBL_ACCESS;

	SETUP_THREADGBL_ACCESS;
	jnl_status = 0;
	reg = gv_cur_region;			/* Local copy */

	/* early out for cluster regions
	 * to avoid tripping the assert below.
	 * Note:
	 *	This early out is consistent with VMS.  It has been
	 *	noted that all of the gtcm assignments
	 *      to gv_cur_region should use the TP_CHANGE_REG
	 *	macro.  This would also avoid the assert problem
	 *	and should be done eventually.
	 */
	if (dba_cm == reg->dyn.addr->acc_meth)
		return EXIT_NRM;

	udi = FILE_INFO(reg);
	csa = &udi->s_addrs;
	csd = csa->hdr;
	assert(csa == cs_addrs && csd == cs_data);
	if ((reg->open) && (dba_usr == csd->acc_meth))
	{
		change_reg();
		gvusr_rundown();
		return EXIT_NRM;
	}
	/* If the process has standalone access, it has udi->grabbed_access_sem set to TRUE at this point. Note that down in a local
	 * variable as the udi->grabbed_access_sem is set to TRUE even for non-standalone access below and hence we can't rely on
	 * that later to determine if the process had standalone access or not when it entered this function.  We need to guarantee
	 * that none else access database file header when semid/shmid fields are reset.  We already have created ftok semaphore in
	 * db_init or, mu_rndwn_file and did not remove it.  So just lock it. We do it in blocking mode.
	 */
	have_standalone_access = udi->grabbed_access_sem; /* process holds standalone access */
	DEFER_INTERRUPTS(INTRPT_IN_GDS_RUNDOWN, prev_intrpt_state);
	ESTABLISH_NORET(gds_rundown_ch, err_caught);
	if (err_caught)
	{
		REVERT;
		WITH_CH(gds_rundown_ch, gds_rundown_err_cleanup(have_standalone_access), 0);
		ENABLE_INTERRUPTS(INTRPT_IN_GDS_RUNDOWN, prev_intrpt_state);
		DEBUG_ONLY(ok_to_UNWIND_in_exit_handling = FALSE);
		return EXIT_ERR;
	}
	assert(reg->open);			/* if we failed to open, dbinit_ch should have taken care of proper clean up */
	assert(!reg->opening);			/* see comment above */
	assert((dba_bg == csd->acc_meth) || (dba_mm == csd->acc_meth));
	is_mm = (dba_bg != csd->acc_meth);
	assert(!csa->hold_onto_crit || (csa->now_crit && jgbl.onlnrlbk));
	/* If we are online rollback, we should already be holding crit and should release it only at the end of this module. This
	 * is usually done by noting down csa->now_crit in a local variable (was_crit) and using it whenever we are about to
	 * grab_crit. But, there are instances (like mupip_set_journal.c) where we grab_crit but invoke gds_rundown without any
	 * preceeding rel_crit. Such code relies on the fact that gds_rundown does rel_crit unconditionally (to get locks to a known
	 * state). So, augment csa->now_crit with jgbl.onlnrlbk to track if we can rel_crit unconditionally or not in gds_rundown.
	 */
	was_crit = (csa->now_crit && jgbl.onlnrlbk);
	/* Cancel any pending flush timer for this region by this task */
	canceled_flush_timer = FALSE;
	canceled_dbsync_timer = FALSE;
	CANCEL_DB_TIMERS(reg, csa, canceled_flush_timer, canceled_dbsync_timer);
	we_are_last_user = FALSE;
	inst_is_frozen = IS_REPL_INST_FROZEN && REPL_ALLOWED(csa->hdr);
	if (!csa->persistent_freeze)
		region_freeze(reg, FALSE, FALSE, FALSE);
	if (!was_crit)
	{
		rel_crit(reg);		/* get locks to known state */
		mutex_cleanup(reg);
	}
	/* The only process that can invoke gds_rundown while holding access control semaphore is RECOVER/ROLLBACK. All the others
	 * (like MUPIP SET -FILE/MUPIP EXTEND would have invoked db_ipcs_reset() before invoking gds_rundown (from
	 * mupip_exit_handler). The only exception is when these processes encounter a terminate signal and they reach
	 * mupip_exit_handler while holding access control semaphore. Assert accordingly.
	 */
	assert(!have_standalone_access || mupip_jnl_recover || process_exiting);
	/* If we have standalone access, then ensure that a concurrent online rollback cannot be running at the same time as it
	 * needs the access control lock as well. The only expection is we are online rollback and currently running down.
	 */
	cnl = csa->nl;
	onln_rlbk_pid = cnl->onln_rlbk_pid;
	assert(!have_standalone_access || mupip_jnl_recover || !onln_rlbk_pid || !is_proc_alive(onln_rlbk_pid, 0));
	if (!have_standalone_access)
	{
		if (-1 == (ftok_semval = semctl(udi->ftok_semid, DB_COUNTER_SEM, GETVAL))) /* Check # of procs counted on FTOK */
		{
			save_errno = errno;
			assert(FALSE);
			rts_error_csa(CSA_ARG(csa) VARLSTCNT(12) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg), ERR_SYSCALL, 5,
				  RTS_ERROR_TEXT("gds_rundown SEMCTL failed to get ftok_semval"), CALLFROM, errno);
		}
		may_bypass_ftok = CAN_BYPASS(ftok_semval, csd, inst_is_frozen); /* Do we need a blocking wait? */
		/* We need to guarantee that no one else access database file header when semid/shmid fields are reset.
		 * We already have created ftok semaphore in db_init or mu_rndwn_file and did not remove it. So just lock it.
		 */
		if (!ftok_sem_lock(reg, may_bypass_ftok))
		{
			if (may_bypass_ftok)
			{	/* We did a non-blocking wait. It's ok to proceed without locking */
				bypassed_ftok = TRUE;
				holder_pid = semctl(udi->ftok_semid, DB_CONTROL_SEM, GETPID);
				if ((uint4)-1 == holder_pid)
					rts_error_csa(CSA_ARG(csa) VARLSTCNT(12) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg),
							ERR_SYSCALL, 5,
							RTS_ERROR_TEXT("gds_rundown SEMCTL failed to get holder_pid"),
							CALLFROM, errno);
				if (!IS_GTM_IMAGE) /* MUMPS processes should not flood syslog with bypass messages. */
				{
					send_msg_csa(CSA_ARG(csa) VARLSTCNT(12) ERR_RESRCINTRLCKBYPAS, 10,
						 LEN_AND_STR(gtmImageNames[image_type].imageName), process_id, LEN_AND_LIT("FTOK"),
						 REG_LEN_STR(reg), DB_LEN_STR(reg), holder_pid);
					send_msg_csa(CSA_ARG(NULL) VARLSTCNT(4) ERR_TEXT, 2,
							LEN_AND_LIT("FTOK bypassed at rundown"));
				}
			} else
			{	/* We did a blocking wait but something bad happened. */
				FTOK_TRACE(csa, csa->ti->curr_tn, ftok_ops_lock, process_id);
				rts_error_csa(CSA_ARG(csa) VARLSTCNT(4) ERR_DBFILERR, 2, DB_LEN_STR(reg));
			}
		}
		sop[0].sem_num = DB_CONTROL_SEM; sop[0].sem_op = 0;	/* Wait for 0 */
		sop[1].sem_num = DB_CONTROL_SEM; sop[1].sem_op = 1;	/* Lock */
		sopcnt = 2;
		sop[0].sem_flg = sop[1].sem_flg = SEM_UNDO | IPC_NOWAIT; /* Don't wait the first time thru */
		SEMOP(udi->semid, sop, sopcnt, status, NO_WAIT);
		if (0 != status)
		{
			save_errno = errno;
			/* Check # of processes counted on access sem. */
			if (-1 == (semval = semctl(udi->semid, DB_COUNTER_SEM, GETVAL)))
			{
				assert(FALSE);
				rts_error_csa(CSA_ARG(csa) VARLSTCNT(12) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg), ERR_SYSCALL, 5,
					  RTS_ERROR_TEXT("gds_rundown SEMCTL failed to get semval"), CALLFROM, errno);
			}
			bypassed_access = CAN_BYPASS(semval, csd, inst_is_frozen) || onln_rlbk_pid || csd->file_corrupt;
			/* Before attempting again in the blocking mode, see if the holding process is an online rollback.
			 * If so, it is likely we won't get the access control semaphore anytime soon. In that case, we
			 * are better off skipping rundown and continuing with sanity cleanup and exit.
			 */
			holder_pid = semctl(udi->semid, DB_CONTROL_SEM, GETPID);
			if ((uint4)-1 == holder_pid)
				rts_error_csa(CSA_ARG(csa) VARLSTCNT(12) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg), ERR_SYSCALL, 5,
					  RTS_ERROR_TEXT("gds_rundown SEMCTL failed to get holder_pid"), CALLFROM, errno);
			if (!bypassed_access)
			{	/* We couldn't get it in one shot-- see if we already have it */
				if (holder_pid == process_id)
				{
					send_msg_csa(CSA_ARG(csa) VARLSTCNT(5) MAKE_MSG_INFO(ERR_CRITSEMFAIL), 2, DB_LEN_STR(reg),
							ERR_RNDWNSEMFAIL);
					REVERT;
					ENABLE_INTERRUPTS(INTRPT_IN_GDS_RUNDOWN, prev_intrpt_state);
					assert(FALSE);
					return EXIT_ERR;
				}
				if (EAGAIN != save_errno)
				{
					assert(FALSE);
					rts_error_csa(CSA_ARG(csa) VARLSTCNT(12) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg),
							ERR_SYSCALL, 5,
							RTS_ERROR_TEXT("gds_rundown SEMOP on access control semaphore"),
							CALLFROM, save_errno);
				}
				sop[0].sem_flg = sop[1].sem_flg = SEM_UNDO;	/* Try again - blocking this time */
				SEMOP(udi->semid, sop, 2, status, FORCED_WAIT);
				if (-1 == status)			/* We couldn't get it at all.. */
					rts_error_csa(CSA_ARG(csa) VARLSTCNT(12) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg),
							ERR_SYSCALL, 5,
							RTS_ERROR_TEXT("gds_rundown SEMOP on access control semaphore"),
							CALLFROM, errno);
			} else if (!IS_GTM_IMAGE)
			{
				send_msg_csa(CSA_ARG(csa) VARLSTCNT(12) ERR_RESRCINTRLCKBYPAS, 10,
						LEN_AND_STR(gtmImageNames[image_type].imageName), process_id,
						LEN_AND_LIT("access control"), REG_LEN_STR(reg), DB_LEN_STR(reg), holder_pid);
				send_msg_csa(CSA_ARG(NULL) VARLSTCNT(4) ERR_TEXT, 2,
						LEN_AND_LIT("Access control bypassed at rundown"));
			}
			udi->grabbed_access_sem = !bypassed_access;
		}
	} /* else we we hold the access control semaphore and therefore have standalone access. We do not release it now - we
	   * release it later in mupip_exit_handler.c. Since we already hold the access control semaphore, we don't need the
	   * ftok semaphore and trying it could cause deadlock
	   */
	/* Note that in the case of online rollback, "udi->grabbed_access_sem" (and in turn "have_standalone_access") is TRUE.
	 * But there could be other processes still having the database open so we cannot safely reset the halted fields.
	 */
	if (have_standalone_access && !jgbl.onlnrlbk)
		csd->ftok_counter_halted = csd->access_counter_halted = FALSE;
	ftok_counter_halted = csd->ftok_counter_halted;
	access_counter_halted = csd->access_counter_halted;
	/* If we bypassed any of the semaphores, activate safe mode.
	 * Also, if the replication instance is frozen and this db has replication turned on (which means
	 * no flushes of dirty buffers to this db can happen while the instance is frozen) activate safe mode.
	 */
	ok_to_write_pfin = !(bypassed_access || bypassed_ftok || inst_is_frozen);
	safe_mode = !ok_to_write_pfin || ftok_counter_halted || access_counter_halted;
	/* At this point we are guaranteed no one else is doing a db_init/rundown as we hold the access control semaphore */
	assert(csa->ref_cnt);	/* decrement private ref_cnt before shared ref_cnt decrement. */
	csa->ref_cnt--;		/* Currently journaling logic in gds_rundown() in VMS relies on this order to detect last writer */
	assert(!csa->ref_cnt);
	--cnl->ref_cnt;
	if (memcmp(cnl->now_running, gtm_release_name, gtm_release_name_len + 1))
	{	/* VERMISMATCH condition. Possible only if DSE */
		assert(dse_running);
		vermismatch = TRUE;
	} else
		vermismatch = FALSE;
	if (-1 == shmctl(udi->shmid, IPC_STAT, &shm_buf))
	{
		save_errno = errno;
		rts_error_csa(CSA_ARG(csa) VARLSTCNT(12) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg), ERR_SYSCALL, 5,
				RTS_ERROR_TEXT("gds_rundown shmctl"), CALLFROM, save_errno);
	} else
		we_are_last_user =  (1 == shm_buf.shm_nattch) && !vermismatch && !safe_mode;
	/* recover => one user except ONLINE ROLLBACK, or standalone with frozen instance */
	assert(!have_standalone_access || we_are_last_user || jgbl.onlnrlbk || inst_is_frozen);
	if (-1 == (semval = semctl(udi->semid, DB_COUNTER_SEM, GETVAL)))
		rts_error_csa(CSA_ARG(csa) VARLSTCNT(12) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg), ERR_SYSCALL, 5,
			  RTS_ERROR_TEXT("gds_rundown SEMCTL failed to get semval"), CALLFROM, errno);
	/* There's one writer left and I am it */
	assert(reg->read_only || semval >= 0);
	unsafe_last_writer = (DB_COUNTER_SEM_INCR == semval) && (FALSE == reg->read_only) && !vermismatch;
	we_are_last_writer = unsafe_last_writer && !safe_mode;
	assert(!we_are_last_writer || !safe_mode);
	assert(!we_are_last_user || !safe_mode);
	/* recover + R/W region => one writer except ONLINE ROLLBACK, or standalone with frozen instance, leading to safe_mode */
	assert(!(have_standalone_access && !reg->read_only) || we_are_last_writer || jgbl.onlnrlbk || inst_is_frozen);
	GTM_WHITE_BOX_TEST(WBTEST_ANTIFREEZE_JNLCLOSE, we_are_last_writer, 1); /* Assume we are the last writer to invoke wcs_flu */
	if (!have_standalone_access && (-1 == (ftok_semval = semctl(udi->ftok_semid, DB_COUNTER_SEM, GETVAL))))
		rts_error_csa(CSA_ARG(csa) VARLSTCNT(12) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg), ERR_SYSCALL, 5,
			  RTS_ERROR_TEXT("gds_rundown SEMCTL failed to get ftok_semval"), CALLFROM, errno);
	if (NULL != csa->ss_ctx)
		ss_destroy_context(csa->ss_ctx);
	/* SS_MULTI: If multiple snapshots are supported, then we have to run through each of the snapshots */
	assert(1 == MAX_SNAPSHOTS);
	ss_shm_ptr = (shm_snapshot_ptr_t)SS_GETSTARTPTR(csa);
	ss_pid = ss_shm_ptr->ss_info.ss_pid;
	is_cur_process_ss_initiator = (process_id == ss_pid);
	if (ss_pid && (is_cur_process_ss_initiator || we_are_last_user))
	{
		/* Try getting snapshot crit latch. If we don't get latch, we won't hang for eternity and will skip
		 * doing the orphaned snapshot cleanup. It will be cleaned up eventually either by subsequent MUPIP
		 * INTEG or by a MUPIP RUNDOWN.
		 */
		if (ss_get_lock_nowait(reg) && (ss_pid == ss_shm_ptr->ss_info.ss_pid)
			&& (is_cur_process_ss_initiator || !is_proc_alive(ss_pid, 0)))
		{
			ss_release(NULL);
			ss_release_lock(reg);
		}
	}
	/* If cnl->donotflush_dbjnl is set, it means mupip recover/rollback was interrupted and therefore we need not flush
	 * shared memory contents to disk as they might be in an inconsistent state. Moreover, any more flushing will only cause
	 * future rollback to undo more journal records (PBLKs). In this case, we will go ahead and remove shared memory (without
	 * flushing the contents) in this routine. A reissue of the recover/rollback command will restore the database to a
	 * consistent state.
	 */
	if (!cnl->donotflush_dbjnl && !reg->read_only && !vermismatch)
	{	/* If we had an orphaned block and were interrupted, set wc_blocked so we can invoke wcs_recover. Do it ONLY
		 * if there is NO concurrent online rollback running (as we need crit to set wc_blocked)
		 */
		if (csa->wbuf_dqd && !is_mm)
		{	/* If we had an orphaned block and were interrupted, mupip_exit_handler will invoke secshr_db_clnup which
			 * will clear this field and so we should never come to gds_rundown with a non-zero wbuf_dqd. The only
			 * exception is if we are recover/rollback in which case gds_rundown (from mur_close_files) is invoked
			 * BEFORE secshr_db_clnup in mur_close_files.
			 * Note: It is NOT possible for online rollback to reach here with wbuf_dqd being non-zero. This is because
			 * the moment we apply the first PBLK, we stop all interrupts and hence can never be interrupted in
			 * wcs_wtstart or wcs_get_space. Assert accordingly.
			 */
			assert(mupip_jnl_recover && !jgbl.onlnrlbk && !safe_mode);
			if (!was_crit)
				grab_crit(reg);
			SET_TRACEABLE_VAR(cnl->wc_blocked, TRUE);
			BG_TRACE_PRO_ANY(csa, wcb_gds_rundown);
                        send_msg_csa(CSA_ARG(csa) VARLSTCNT(8) ERR_WCBLOCKED, 6, LEN_AND_LIT("wcb_gds_rundown"),
                                process_id, &csa->ti->curr_tn, DB_LEN_STR(reg));
			csa->wbuf_dqd = 0;
			wcs_recover(reg);
			BG_TRACE_PRO_ANY(csa, lost_block_recovery);
			if (!was_crit)
				rel_crit(reg);
		}
		if (JNL_ENABLED(csd) && IS_GTCM_GNP_SERVER_IMAGE)
			originator_prc_vec = NULL;
		/* If we are the last writing user, then everything must be flushed */
		if (we_are_last_writer)
		{	/* Time to flush out all of our buffers */
			assert(!safe_mode);
			if (is_mm)
			{
				MM_DBFILEXT_REMAP_IF_NEEDED(csa, reg);
				cnl->remove_shm = TRUE;
			}
			if (cnl->wc_blocked && jgbl.onlnrlbk)
			{	/* if the last update done by online rollback was not committed in the normal code-path but was
				 * completed by secshr_db_clnup, wc_blocked will be set to TRUE. But, since online rollback never
				 * invokes grab_crit (since csa->hold_onto_crit is set to TRUE), wcs_recover is never invoked. This
				 * could result in the last update never getting flushed to the disk and if online rollback happened
				 * to be the last writer then the shared memory will be flushed and removed and the last update will
				 * be lost. So, force wcs_recover if we find ourselves in such a situation. But, wc_blocked is
				 * possible only if phase1 or phase2 errors are induced using white box test cases
				 */
				assert(WB_COMMIT_ERR_ENABLED);
				wcs_recover(reg);
			}
			/* Note WCSFLU_SYNC_EPOCH ensures the epoch is synced to the journal and indirectly
			 * also ensures that the db is fsynced. We don't want to use it in the calls to
			 * wcs_flu() from t_end() and tp_tend() since we can defer it to out-of-crit there.
			 * In this case, since we are running down, we don't have any such option.
			 */
			cnl->remove_shm = wcs_flu(WCSFLU_FLUSH_HDR | WCSFLU_WRITE_EPOCH | WCSFLU_SYNC_EPOCH);
			/* Since we_are_last_writer, we should be guaranteed that wcs_flu() did not change csd, (in
			 * case of MM for potential file extension), even if it did a grab_crit().  Therefore, make
			 * sure that's true.
			 */
			assert(csd == csa->hdr);
			assert(0 == memcmp(csd->label, GDS_LABEL, GDS_LABEL_SZ - 1));
		} else if (((canceled_flush_timer && (0 > cnl->wcs_timers)) || canceled_dbsync_timer) && !inst_is_frozen)
		{	/* canceled pending db or jnl flush timers - flush database and journal buffers to disk */
			if (!was_crit)
				grab_crit(reg);
			/* we need to sync the epoch as the fact that there is no active pending flush timer implies
			 * there will be noone else who will flush the dirty buffers and EPOCH to disk in a timely fashion
			 */
			wcs_flu(WCSFLU_FLUSH_HDR | WCSFLU_WRITE_EPOCH | WCSFLU_SYNC_EPOCH);
			if (!was_crit)
				rel_crit(reg);
			assert((dba_mm == cs_data->acc_meth) || (csd == cs_data));
			csd = cs_data;	/* In case this is MM and wcs_flu() remapped an extended database, reset csd */
		}
		/* Do rundown journal processing after buffer flushes since they require jnl to be open */
		if (JNL_ENABLED(csd))
		{	/* the following tp_change_reg() is not needed due to the assert csa == cs_addrs at the beginning
			 * of gds_rundown(), but just to be safe. To be removed by 2002!! --- nars -- 2001/04/25.
			 */
			tp_change_reg();	/* call this because jnl_ensure_open checks cs_addrs rather than gv_cur_region */
			jpc = csa->jnl;
			jbp = jpc->jnl_buff;
			if (jbp->fsync_in_prog_latch.u.parts.latch_pid == process_id)
                        {
                                assert(FALSE);
                                COMPSWAP_UNLOCK(&jbp->fsync_in_prog_latch, process_id, 0, LOCK_AVAILABLE, 0);
                        }
                        if (jbp->io_in_prog_latch.u.parts.latch_pid == process_id)
                        {
                                assert(FALSE);
                                COMPSWAP_UNLOCK(&jbp->io_in_prog_latch, process_id, 0, LOCK_AVAILABLE, 0);
                        }
			if ((((NOJNL != jpc->channel) && !JNL_FILE_SWITCHED(jpc))
				|| we_are_last_writer && (0 != cnl->jnl_file.u.inode)) && ok_to_write_pfin)
			{	/* We need to close the journal file cleanly if we have the latest generation journal file open
				 *	or if we are the last writer and the journal file is open in shared memory (not necessarily
				 *	by ourselves e.g. the only process that opened the journal got shot abnormally)
				 * Note: we should not infer anything from the shared memory value of cnl->jnl_file.u.inode
				 * 	if we are not the last writer as it can be concurrently updated.
				 */
				if (!was_crit)
					grab_crit(reg);
				if (JNL_ENABLED(csd))
				{
					SET_GBL_JREC_TIME; /* jnl_ensure_open/jnl_put_jrt_pini/pfin/jnl_file_close all need it */
					/* Before writing to jnlfile, adjust jgbl.gbl_jrec_time if needed to maintain time order
					 * of jnl records. This needs to be done BEFORE the jnl_ensure_open as that could write
					 * journal records (if it decides to switch to a new journal file).
					 */
					ADJUST_GBL_JREC_TIME(jgbl, jbp);
					jnl_status = jnl_ensure_open();
					if (0 == jnl_status)
					{	/* If we_are_last_writer, we would have already done a wcs_flu() which would
						 * have written an epoch record and we are guaranteed no further updates
						 * since we are the last writer. So, just close the journal.
						 * If the freeaddr == post_epoch_freeaddr, wcs_flu may have skipped writing
						 * a pini, so allow for that.
						 */
						assert(!jbp->before_images || is_mm
						    || !we_are_last_writer || (0 != jpc->pini_addr) || jgbl.mur_extract
						    || (jpc->jnl_buff->freeaddr == jpc->jnl_buff->post_epoch_freeaddr));
						/* If we haven't written a pini, let jnl_file_close write the pini/pfin. */
						if (!jgbl.mur_extract && (0 != jpc->pini_addr))
							jnl_put_jrt_pfin(csa);
						/* If not the last writer and no pending flush timer left, do jnl flush now */
						if (!we_are_last_writer && (0 > cnl->wcs_timers))
						{
							if (SS_NORMAL == (jnl_status = jnl_flush(reg)))
							{
								assert(jbp->freeaddr == jbp->dskaddr);
								jnl_fsync(reg, jbp->dskaddr);
								assert(jbp->fsync_dskaddr == jbp->dskaddr);
							} else
							{
								send_msg_csa(CSA_ARG(csa) VARLSTCNT(9) ERR_JNLFLUSH, 2,
									JNL_LEN_STR(csd), ERR_TEXT, 2,
									RTS_ERROR_TEXT("Error with journal flush in gds_rundown"),
									jnl_status);
								assert(NOJNL == jpc->channel);/* jnl file lost has been triggered */
								/* In this routine, all code that follows from here on does not
								 * assume anything about the journaling characteristics of this
								 * database so it is safe to continue execution even though
								 * journaling got closed in the middle.
								 */
							}
						}
						jnl_file_close(reg, we_are_last_writer, FALSE);
					} else
						send_msg_csa(CSA_ARG(csa) VARLSTCNT(6) jnl_status, 4, JNL_LEN_STR(csd),
								DB_LEN_STR(reg));
				}
				if (!was_crit)
					rel_crit(reg);
			}
		}
		if (we_are_last_writer)			/* Flush the fileheader last and harden the file to disk */
		{
			if (!was_crit)
				grab_crit(reg);			/* To satisfy crit requirement in fileheader_sync() */
			memset(csd->machine_name, 0, MAX_MCNAMELEN); /* clear the machine_name field */
			if (!have_standalone_access && we_are_last_user)
			{	/* mupip_exit_handler will do this after mur_close_file */
				csd->semid = INVALID_SEMID;
				csd->shmid = INVALID_SHMID;
				csd->gt_sem_ctime.ctime = 0;
				csd->gt_shm_ctime.ctime = 0;
			}
			fileheader_sync(reg);
			if (!was_crit)
				rel_crit(reg);
			if (!is_mm)
			{
				GTM_DB_FSYNC(csa, udi->fd, rc);		/* Sync it all */
				if (-1 == rc)
				{
					rts_error_csa(CSA_ARG(csa) VARLSTCNT(9) ERR_DBFILERR, 2, DB_LEN_STR(reg),
						  ERR_TEXT, 2, RTS_ERROR_TEXT("Error during file sync at close"), errno);
				}
			} else
			{	/* Now do final MM file sync before exit */
				assert(csa->ti->total_blks == csa->total_blks);
				#ifdef _AIX
				GTM_DB_FSYNC(csa, udi->fd, rc);
				if (-1 == rc)
				#else
				if (-1 == MSYNC((caddr_t)csa->db_addrs[0], (caddr_t)csa->db_addrs[1]))
				#endif
				{
					rts_error_csa(CSA_ARG(csa) VARLSTCNT(9) ERR_DBFILERR, 2, DB_LEN_STR(reg),
						  ERR_TEXT, 2, RTS_ERROR_TEXT("Error during file sync at close"), errno);
				}
			}
		} else if (unsafe_last_writer && !cnl->lastwriterbypas_msg_issued)
		{
			send_msg_csa(CSA_ARG(csa) VARLSTCNT(4) ERR_LASTWRITERBYPAS, 2, DB_LEN_STR(reg));
			cnl->lastwriterbypas_msg_issued = TRUE;
		}
	} /* end if (!reg->read_only && !cnl->donotflush_dbjnl) */
	/* We had canceled all db timers at start of rundown. In case as part of rundown (wcs_flu above), we had started
	 * any timers, cancel them BEFORE setting reg->open to FALSE (assert in wcs_clean_dbsync relies on this).
	 */
	CANCEL_DB_TIMERS(reg, csa, canceled_flush_timer, canceled_dbsync_timer);
	if (reg->read_only && we_are_last_user && !have_standalone_access && cnl->remove_shm)
	{	/* mupip_exit_handler will do this after mur_close_file */
		db_ipcs.semid = INVALID_SEMID;
		db_ipcs.shmid = INVALID_SHMID;
		db_ipcs.gt_sem_ctime = 0;
		db_ipcs.gt_shm_ctime = 0;
		db_ipcs.fn_len = reg->dyn.addr->fname_len;
		memcpy(db_ipcs.fn, reg->dyn.addr->fname, reg->dyn.addr->fname_len);
		db_ipcs.fn[reg->dyn.addr->fname_len] = 0;
 		/* request gtmsecshr to flush. read_only cannot flush itself */
		WAIT_FOR_REPL_INST_UNFREEZE_SAFE(csa);
		if (!csa->read_only_fs)
		{
			secshrstat = send_mesg2gtmsecshr(FLUSH_DB_IPCS_INFO, 0, (char *)NULL, 0);
			if (0 != secshrstat)
				rts_error_csa(CSA_ARG(csa) VARLSTCNT(8) ERR_DBFILERR, 2, DB_LEN_STR(reg),
					  ERR_TEXT, 2, RTS_ERROR_TEXT("gtmsecshr failed to update database file header"));
		}
	}
	/* Done with file now, close it */
	CLOSEFILE_RESET(udi->fd, rc);	/* resets "udi->fd" to FD_INVALID */
	if (-1 == rc)
	{
		rts_error_csa(CSA_ARG(csa) VARLSTCNT(9) ERR_DBFILERR, 2, DB_LEN_STR(reg),
			  ERR_TEXT, 2, LEN_AND_LIT("Error during file close"), errno);
	}
	/* Unmap storage if mm mode but only the part that is not the fileheader (so shows up in dumps) */
#	if !defined(_AIX)
	if (is_mm && (NULL != csa->db_addrs[0]))
	{
		assert(csa->db_addrs[1] > csa->db_addrs[0]);
		munmap_len = (sm_long_t)(csa->db_addrs[1] - csa->db_addrs[0]);
		if (0 < munmap_len)
			munmap((caddr_t)(csa->db_addrs[0]), (size_t)(munmap_len));
	}
#	endif
	/* Detach our shared memory while still under lock so reference counts will be correct for the next process to run down
	 * this region. In the process also get the remove_shm status from node_local before detaching.
	 * If cnl->donotflush_dbjnl is TRUE, it means we can safely remove shared memory without compromising data
	 * integrity as a reissue of recover will restore the database to a consistent state.
	 */
	remove_shm = !vermismatch && (cnl->remove_shm || cnl->donotflush_dbjnl);
	/* We are done with online rollback on this region. Indicate to other processes by setting the onln_rlbk_pid to 0.
	 * Do it before releasing crit (t_end relies on this ordering when accessing cnl->onln_rlbk_pid).
	 */
	if (jgbl.onlnrlbk)
		cnl->onln_rlbk_pid = 0;
	rel_crit(reg); /* Since we are about to detach from the shared memory, release crit and reset onln_rlbk_pid */
	/* If we had skipped flushing journal and database buffers due to a concurrent online rollback, increment the counter
	 * indicating that in the shared memory so that online rollback can report the # of such processes when it shuts down.
	 * The same thing is done for both FTOK and access control semaphores when there are too many MUMPS processes.
	 */
	if (safe_mode) /* indicates flushing was skipped */
	{
		if (bypassed_access)
			cnl->dbrndwn_access_skip++; /* Access semaphore can be bypassed during online rollback */
		if (bypassed_ftok)
			cnl->dbrndwn_ftok_skip++;
	}
	if (jgbl.onlnrlbk)
		csa->hold_onto_crit = FALSE;
	GTM_WHITE_BOX_TEST(WBTEST_HOLD_SEM_BYPASS, cnl->wbox_test_seq_num, 0);
	status = shmdt((caddr_t)cnl);
	csa->nl = NULL; /* dereferencing nl after detach is not right, so we set it to NULL so that we can test before dereference*/
	/* Note that although csa->nl is NULL, we use CSA_ARG(csa) below (not CSA_ARG(NULL)) to be consistent with similar
	 * usages before csa->nl became NULL. The "is_anticipatory_freeze_needed" function (which is in turn called by the
	 * CHECK_IF_FREEZE_ON_ERROR_NEEDED macro) does a check of csa->nl before dereferencing shared memory contents so
	 * we are safe passing "csa".
	 */
	if (-1 == status)
		send_msg_csa(CSA_ARG(csa) VARLSTCNT(9) ERR_DBFILERR, 2, DB_LEN_STR(reg), ERR_TEXT, 2,
				LEN_AND_LIT("Error during shmdt"), errno);
	REMOVE_CSA_FROM_CSADDRSLIST(csa);	/* remove "csa" from list of open regions (cs_addrs_list) */
	reg->open = FALSE;
	/* If file is still not in good shape, die here and now before we get rid of our storage */
	assertpro(0 == csa->wbuf_dqd);
	ipc_deleted = FALSE;
	/* If we are the very last user, remove shared storage id and the semaphores */
	if (we_are_last_user)
	{	/* remove shared storage, only if last writer to rundown did a successful wcs_flu() */
		assert(!vermismatch);
		if (remove_shm)
		{
			ipc_deleted = TRUE;
			if (0 != shm_rmid(udi->shmid))
				rts_error_csa(CSA_ARG(csa) VARLSTCNT(8) ERR_DBFILERR, 2, DB_LEN_STR(reg),
					ERR_TEXT, 2, RTS_ERROR_TEXT("Unable to remove shared memory"));
			/* Note that we no longer have a new shared memory. Currently only used/usable for standalone rollback. */
			udi->new_shm = FALSE;
			/* mupip recover/rollback don't release the semaphore here, but do it later in db_ipcs_reset (invoked from
			 * mur_close_files())
			 */
			if (!have_standalone_access)
			{
				if (0 != sem_rmid(udi->semid))
					rts_error_csa(CSA_ARG(csa) VARLSTCNT(8) ERR_DBFILERR, 2, DB_LEN_STR(reg),
						      ERR_TEXT, 2, RTS_ERROR_TEXT("Unable to remove semaphore"));
				udi->new_sem = FALSE;			/* Note that we no longer have a new semaphore */
				udi->grabbed_access_sem = FALSE;
				udi->counter_acc_incremented = FALSE;
			}
		} else if (is_src_server || is_updproc)
		{
			gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(6) ERR_DBRNDWNWRN, 4, DB_LEN_STR(reg), process_id, process_id);
			send_msg_csa(CSA_ARG(csa) VARLSTCNT(6) ERR_DBRNDWNWRN, 4, DB_LEN_STR(reg), process_id, process_id);
		} else
			send_msg_csa(CSA_ARG(csa) VARLSTCNT(6) ERR_DBRNDWNWRN, 4, DB_LEN_STR(reg), process_id, process_id);
	} else
	{
		assert(!have_standalone_access || jgbl.onlnrlbk || safe_mode);
		if (!jgbl.onlnrlbk && !have_standalone_access)
		{ 	/* If we were writing, get rid of our writer access count semaphore */
			if (!reg->read_only)
			{
				if (!access_counter_halted)
				{
					save_errno = do_semop(udi->semid, DB_COUNTER_SEM, -DB_COUNTER_SEM_INCR, SEM_UNDO);
					if (0 != save_errno)
						rts_error_csa(CSA_ARG(csa) VARLSTCNT(12) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg),
								ERR_SYSCALL, 5,
								RTS_ERROR_TEXT("gds_rundown access control semaphore decrement"),
								CALLFROM, save_errno);
				}
				udi->counter_acc_incremented = FALSE;
			}
			assert(safe_mode || !bypassed_access);
			/* Now remove the rundown lock */
			if (!bypassed_access)
			{
				if (0 != (save_errno = do_semop(udi->semid, DB_CONTROL_SEM, -1, SEM_UNDO)))
					rts_error_csa(CSA_ARG(csa) VARLSTCNT(12) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg),
							ERR_SYSCALL, 5,
							RTS_ERROR_TEXT("gds_rundown access control semaphore release"),
							CALLFROM, save_errno);
				udi->grabbed_access_sem = FALSE;
			}
		} /* else access control semaphore will be released in db_ipcs_reset */
	}
	if (!have_standalone_access)
	{
		if (bypassed_ftok)
		{
			if (!ftok_counter_halted)
				if (0 != (save_errno = do_semop(udi->ftok_semid, DB_COUNTER_SEM, -DB_COUNTER_SEM_INCR, SEM_UNDO)))
					rts_error_csa(CSA_ARG(csa) VARLSTCNT(4) ERR_DBFILERR, 2, DB_LEN_STR(reg));
		} else if (!ftok_sem_release(reg, !ftok_counter_halted, FALSE))
		{
			FTOK_TRACE(csa, csa->ti->curr_tn, ftok_ops_release, process_id);
			rts_error_csa(CSA_ARG(csa) VARLSTCNT(4) ERR_DBFILERR, 2, DB_LEN_STR(reg));
		}
		udi->grabbed_ftok_sem = FALSE;
		udi->counter_ftok_incremented = FALSE;
	}
	ENABLE_INTERRUPTS(INTRPT_IN_GDS_RUNDOWN, prev_intrpt_state);
	if (!ipc_deleted)
	{
		GET_CUR_TIME(time_str);
		if (is_src_server)
			gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(8) ERR_IPCNOTDEL, 6, CTIME_BEFORE_NL, time_str,
				LEN_AND_LIT("Source server"), REG_LEN_STR(reg));
		if (is_updproc)
			gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(8) ERR_IPCNOTDEL, 6, CTIME_BEFORE_NL, time_str,
				LEN_AND_LIT("Update process"), REG_LEN_STR(reg));
		if (mupip_jnl_recover && (!jgbl.onlnrlbk || !we_are_last_user))
		{
			gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(8) ERR_IPCNOTDEL, 6, CTIME_BEFORE_NL, time_str,
				LEN_AND_LIT("Mupip journal process"), REG_LEN_STR(reg));
			send_msg_csa(CSA_ARG(csa) VARLSTCNT(8) ERR_IPCNOTDEL, 6, CTIME_BEFORE_NL, time_str,
				LEN_AND_LIT("Mupip journal process"), REG_LEN_STR(reg));
		}
	}
	REVERT;
	return EXIT_NRM;
}