Example #1
void dse_maps(void)
{
	block_id		blk, bml_blk;
	blk_segment		*bs1, *bs_ptr;
	int4			blk_seg_cnt, blk_size;		/* needed for BLK_INIT, BLK_SEG and BLK_FINI macros */
	sm_uc_ptr_t		bp;
	char			util_buff[MAX_UTIL_LEN];
	int4			bml_size, bml_list_size, blk_index, bml_index;
	int4			total_blks, blks_in_bitmap;
	int4			bplmap, dummy_int;
	unsigned char		*bml_list;
	cache_rec_ptr_t		cr, dummy_cr;
	bt_rec_ptr_t		btr;
	int			util_len;
	uchar_ptr_t		blk_ptr;
	boolean_t		was_crit;
	uint4			jnl_status;
	srch_blk_status		blkhist;
	jnl_private_control	*jpc;
	jnl_buffer_ptr_t	jbp;
	sgmnt_addrs		*csa;
	sgmnt_data_ptr_t	csd;

	if (CLI_PRESENT == cli_present("BUSY") || CLI_PRESENT == cli_present("FREE") ||
		CLI_PRESENT == cli_present("MASTER") || CLI_PRESENT == cli_present("RESTORE_ALL"))
	{
		if (gv_cur_region->read_only)
			rts_error(VARLSTCNT(4) ERR_DBRDONLY, 2, DB_LEN_STR(gv_cur_region));
	}
	CHECK_AND_RESET_UPDATE_ARRAY;	/* reset update_array_ptr to update_array */
	csa = cs_addrs;
	assert(&FILE_INFO(gv_cur_region)->s_addrs == csa);
	was_crit = csa->now_crit;
	if (csa->critical)
		crash_count = csa->critical->crashcnt;
	csd = csa->hdr;
	bplmap = csd->bplmap;
	if (CLI_PRESENT == cli_present("BLOCK"))
	{
		if (!cli_get_hex("BLOCK", (uint4 *)&blk))
			return;
		if (blk < 0 || blk >= csa->ti->total_blks)
		{
			util_out_print("Error: invalid block number.", TRUE);
			return;
		}
		patch_curr_blk = blk;
	}
	else
		blk = patch_curr_blk;
	if (CLI_PRESENT == cli_present("FREE"))
	{
		if (0 == bplmap)
		{
			util_out_print("Cannot perform map updates:  bplmap field of file header is zero.", TRUE);
			return;
		}
		if (blk / bplmap * bplmap == blk)
		{
			util_out_print("Cannot perform action on a map block.", TRUE);
			return;
		}
		bml_blk = blk / bplmap * bplmap;
		bm_setmap(bml_blk, blk, FALSE);
		return;
	}
	if (CLI_PRESENT == cli_present("BUSY"))
	{
		if (0 == bplmap)
		{
			util_out_print("Cannot perform map updates:  bplmap field of file header is zero.", TRUE);
			return;
		}
		if (blk / bplmap * bplmap == blk)
		{
			util_out_print("Cannot perform action on a map block.", TRUE);
			return;
		}
		bml_blk = blk / bplmap * bplmap;
		bm_setmap(bml_blk, blk, TRUE);
		return;
	}
	blk_size = csd->blk_size;
	if (CLI_PRESENT == cli_present("MASTER"))
	{
		if (0 == bplmap)
		{
			util_out_print("Cannot perform maps updates:  bplmap field of file header is zero.", TRUE);
			return;
		}
		if (!was_crit)
			grab_crit(gv_cur_region);
		bml_blk = blk / bplmap * bplmap;
		if (dba_mm == csd->acc_meth)
			bp = MM_BASE_ADDR(csa) + (off_t)bml_blk * blk_size;
		else
		{
			assert(dba_bg == csd->acc_meth);
			if (!(bp = t_qread(bml_blk, &dummy_int, &dummy_cr)))
				rts_error(VARLSTCNT(1) ERR_DSEBLKRDFAIL);
		}
		if ((csa->ti->total_blks / bplmap) * bplmap == bml_blk)
			total_blks = (csa->ti->total_blks - bml_blk);
		else
			total_blks = bplmap;
		if (NO_FREE_SPACE == bml_find_free(0, bp + SIZEOF(blk_hdr), total_blks))
			bit_clear(bml_blk / bplmap, csa->bmm);
		else
			bit_set(bml_blk / bplmap, csa->bmm);
		if (bml_blk > csa->nl->highest_lbm_blk_changed)
			csa->nl->highest_lbm_blk_changed = bml_blk;
		if (!was_crit)
			rel_crit(gv_cur_region);
		return;
	}
	if (CLI_PRESENT == cli_present("RESTORE_ALL"))
	{
		if (0 == bplmap)
		{
			util_out_print("Cannot perform maps updates:  bplmap field of file header is zero.", TRUE);
			return;
		}
		total_blks = csa->ti->total_blks;
		assert(ROUND_DOWN2(blk_size, 2 * SIZEOF(int4)) == blk_size);
		bml_size = BM_SIZE(bplmap);
		bml_list_size = (total_blks + bplmap - 1) / bplmap * bml_size;
		bml_list = (unsigned char *)malloc(bml_list_size);
		for (blk_index = 0, bml_index = 0;  blk_index < total_blks; blk_index += bplmap, bml_index++)
			bml_newmap((blk_hdr_ptr_t)(bml_list + bml_index * bml_size), bml_size, csa->ti->curr_tn);
		if (!was_crit)
		{
			grab_crit(gv_cur_region);
			csa->hold_onto_crit = TRUE;	/* need to do this AFTER grab_crit */
		}
		blk = get_dir_root();
		assert(blk < bplmap);
		csa->ti->free_blocks = total_blks - DIVIDE_ROUND_UP(total_blks, bplmap);
		bml_busy(blk, bml_list + SIZEOF(blk_hdr));
		csa->ti->free_blocks = csa->ti->free_blocks - 1;
		dse_m_rest(blk, bml_list, bml_size, &csa->ti->free_blocks, TRUE);
		for (blk_index = 0, bml_index = 0;  blk_index < total_blks; blk_index += bplmap, bml_index++)
		{
			t_begin_crit(ERR_DSEFAIL);
			CHECK_TN(csa, csd, csd->trans_hist.curr_tn);	/* can issue rts_error TNTOOLARGE */
			CWS_RESET;
			CHECK_AND_RESET_UPDATE_ARRAY;	/* reset update_array_ptr to update_array */
			assert(csa->ti->early_tn == csa->ti->curr_tn);
			blk_ptr = bml_list + bml_index * bml_size;
			blkhist.blk_num = blk_index;
			if (!(blkhist.buffaddr = t_qread(blkhist.blk_num, &blkhist.cycle, &blkhist.cr)))
				rts_error(VARLSTCNT(1) ERR_DSEBLKRDFAIL);
			BLK_INIT(bs_ptr, bs1);
			BLK_SEG(bs_ptr, blk_ptr + SIZEOF(blk_hdr), bml_size - SIZEOF(blk_hdr));
			BLK_FINI(bs_ptr, bs1);
			t_write(&blkhist, (unsigned char *)bs1, 0, 0, LCL_MAP_LEVL, TRUE, FALSE, GDS_WRITE_KILLTN);
			BUILD_AIMG_IF_JNL_ENABLED(csd, csa->ti->curr_tn);
			t_end(&dummy_hist, NULL, csa->ti->curr_tn);
		}
		/* Fill in master map */
		for (blk_index = 0, bml_index = 0;  blk_index < total_blks; blk_index += bplmap, bml_index++)
		{
			blks_in_bitmap = (blk_index + bplmap <= total_blks) ? bplmap : total_blks - blk_index;
			assert(1 < blks_in_bitmap);	/* the last valid block in the database should never be a bitmap block */
			if (NO_FREE_SPACE != bml_find_free(0, (bml_list + bml_index * bml_size) + SIZEOF(blk_hdr), blks_in_bitmap))
				bit_set(blk_index / bplmap, csa->bmm);
			else
				bit_clear(blk_index / bplmap, csa->bmm);
			if (blk_index > csa->nl->highest_lbm_blk_changed)
				csa->nl->highest_lbm_blk_changed = blk_index;
		}
		if (!was_crit)
		{
			csa->hold_onto_crit = FALSE;	/* need to do this before the rel_crit */
			rel_crit(gv_cur_region);
		}
		if (unhandled_stale_timer_pop)
			process_deferred_stale();
		free(bml_list);
		csd->kill_in_prog = csd->abandoned_kills = 0;
		return;
	}
	MEMCPY_LIT(util_buff, "!/Block ");
	util_len = SIZEOF("!/Block ") - 1;
	util_len += i2hex_nofill(blk, (uchar_ptr_t)&util_buff[util_len], 8);
	memcpy(&util_buff[util_len], " is marked !AD in its local bit map.!/",
		SIZEOF(" is marked !AD in its local bit map.!/") - 1);
	util_len += SIZEOF(" is marked !AD in its local bit map.!/") - 1;
	util_buff[util_len] = 0;
	if (!was_crit)
		grab_crit(gv_cur_region);
	util_out_print(util_buff, TRUE, 4, dse_is_blk_free(blk, &dummy_int, &dummy_cr) ? "free" : "busy");
	if (!was_crit)
		rel_crit(gv_cur_region);
	return;
}
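
A minimal standalone sketch (not GT.M code) of the local-bitmap arithmetic the BUSY/FREE/MASTER paths above rely on: the integer division in blk / bplmap * bplmap yields the bitmap block covering blk, and a block is itself a bitmap block exactly when that expression equals the block number. The blocks-per-local-map value of 512 is only an illustrative assumption; the real value comes from the file header (csd->bplmap).

#include <stdio.h>

/* Hypothetical blocks-per-local-map value; GT.M reads the real one from the file header (csd->bplmap). */
#define EXAMPLE_BPLMAP 512

/* Bitmap block that covers a given database block. */
static int owning_bitmap_block(int blk, int bplmap)
{
	return blk / bplmap * bplmap;	/* integer division truncates, same idiom as dse_maps() */
}

/* A block is a local bitmap block iff it is the first block of its bplmap-sized group. */
static int is_bitmap_block(int blk, int bplmap)
{
	return owning_bitmap_block(blk, bplmap) == blk;
}

int main(void)
{
	int blk = 1000;

	printf("block %d: bitmap block = %d, is itself a bitmap block: %s\n",
		blk, owning_bitmap_block(blk, EXAMPLE_BPLMAP),
		is_bitmap_block(blk, EXAMPLE_BPLMAP) ? "yes" : "no");
	return 0;
}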
Example #2
void gv_rundown(void)
{
	gd_region	*r_top, *r_save, *r_local;
	gd_addr		*addr_ptr;
	sgm_info	*si;
#	ifdef VMS
	vms_gds_info	*gds_info;
#	elif UNIX
	unix_db_info	*udi;
#	endif

	r_save = gv_cur_region;		/* Save for possible core dump */
	gvcmy_rundown();
	ENABLE_AST

	if (pool_init)
		rel_lock(jnlpool.jnlpool_dummy_reg);
	for (addr_ptr = get_next_gdr(NULL); addr_ptr; addr_ptr = get_next_gdr(addr_ptr))
	{
		for (r_local = addr_ptr->regions, r_top = r_local + addr_ptr->n_regions; r_local < r_top; r_local++)
		{
			if (r_local->open && !r_local->was_open && dba_cm != r_local->dyn.addr->acc_meth)
			{	/* Rundown has already occurred for GT.CM client regions through gvcmy_rundown() above.
			 	 * Hence the (dba_cm != ...) check in the if above. Note that for GT.CM client regions,
				 * region->open is TRUE although cs_addrs is NULL.
			 	 */
				gv_cur_region = r_local;
				tp_change_reg();
				gds_rundown();
				/* Now that gds_rundown is done, free up the memory associated with the region.
				 * Ideally the following memory freeing code should go to gds_rundown, but
				 * GT.CM calls gds_rundown() and we want to reuse memory for GT.CM.
				 */
				if (NULL != cs_addrs)
				{
					if (NULL != cs_addrs->dir_tree)
						FREE_CSA_DIR_TREE(cs_addrs);
					if (cs_addrs->sgm_info_ptr)
					{
						si = cs_addrs->sgm_info_ptr;
						/* It is possible we got interrupted before initializing all fields of "si"
						 * completely so account for NULL values while freeing/releasing those fields.
						 */
						assert((si->tp_csa == cs_addrs) || (NULL == si->tp_csa));
						if (si->jnl_tail)
						{
							CAREFUL_FREEUP_BUDDY_LIST(si->format_buff_list);
							CAREFUL_FREEUP_BUDDY_LIST(si->jnl_list);
						}
						CAREFUL_FREEUP_BUDDY_LIST(si->recompute_list);
						CAREFUL_FREEUP_BUDDY_LIST(si->new_buff_list);
						CAREFUL_FREEUP_BUDDY_LIST(si->tlvl_info_list);
						CAREFUL_FREEUP_BUDDY_LIST(si->tlvl_cw_set_list);
						CAREFUL_FREEUP_BUDDY_LIST(si->cw_set_list);
						if (NULL != si->blks_in_use)
						{
							free_hashtab_int4(si->blks_in_use);
							free(si->blks_in_use);
							si->blks_in_use = NULL;
						}
						if (si->cr_array_size)
						{
							assert(NULL != si->cr_array);
							if (NULL != si->cr_array)
								free(si->cr_array);
						}
						if (NULL != si->first_tp_hist)
							free(si->first_tp_hist);
						free(si);
					}
					if (cs_addrs->jnl)
					{
						assert(&FILE_INFO(cs_addrs->jnl->region)->s_addrs == cs_addrs);
						if (cs_addrs->jnl->jnllsb)
						{
							UNIX_ONLY(assert(FALSE));
							free(cs_addrs->jnl->jnllsb);
						}
						free(cs_addrs->jnl);
					}
					GTMCRYPT_ONLY(
						if (cs_addrs->encrypted_blk_contents)
							free(cs_addrs->encrypted_blk_contents);
					)
				}
				assert(gv_cur_region->dyn.addr->file_cntl->file_info);
				VMS_ONLY(
					gds_info = (vms_gds_info *)gv_cur_region->dyn.addr->file_cntl->file_info;
					if (gds_info->xabpro)
						free(gds_info->xabpro);
					if (gds_info->xabfhc)
						free(gds_info->xabfhc);
					if (gds_info->nam)
					{
						free(gds_info->nam->nam$l_esa);
						free(gds_info->nam);
					}
					if (gds_info->fab)
						free(gds_info->fab);
				)
				free(gv_cur_region->dyn.addr->file_cntl->file_info);
				free(gv_cur_region->dyn.addr->file_cntl);
			}
			r_local->open = r_local->was_open = FALSE;
		}
Example #3
void mubclnup(backup_reg_list *curr_ptr, clnup_stage stage)
{
	sgmnt_addrs	*csa;
	backup_reg_list *ptr, *next;
	uint4		status;
	boolean_t	had_lock;
	unix_db_info	*udi;
	int		rc;

	assert(stage >= need_to_free_space && stage < num_of_clnup_stage);

	free(stringpool.base);

	switch(stage)
	{
	case need_to_rel_crit:
		for (ptr = (backup_reg_list *)grlist; ptr != NULL && ptr != curr_ptr && ptr != (backup_reg_list *)halt_ptr;)
		{
			if (keep_going == ptr->not_this_time)
			{
				csa = &FILE_INFO(ptr->reg)->s_addrs;
				DECR_INHIBIT_KILLS(csa->nl);
				rel_crit(ptr->reg);
			}
			ptr = ptr->fPtr;
		}
		curr_ptr = (backup_reg_list *)halt_ptr;
		/* Intentional Fall Through */
	case need_to_del_tempfile:
		for (ptr = (backup_reg_list *)grlist; ptr != NULL && ptr != curr_ptr;)
		{
			assert(3 == num_backup_proc_status);   /* Ensure there are only 3 possible values for "ptr->not_this_time".
								* The assert below and the following if check rely on this. */
			assert((keep_going == ptr->not_this_time)
				|| (give_up_before_create_tempfile == ptr->not_this_time)
				|| (give_up_after_create_tempfile == ptr->not_this_time));
			if (give_up_before_create_tempfile != ptr->not_this_time)
			{
				free(ptr->backup_hdr);
				if (online)
				{	/* Stop temporary file from growing if we made it active */
					if (keep_going == ptr->not_this_time)
					{
						csa = &FILE_INFO(ptr->reg)->s_addrs;
						csa->nl->nbb = BACKUP_NOT_IN_PROGRESS;
						/* Make sure all running processes have a chance to see this backup
						   state change so they won't be trying to flush when we go to delete
						   the temporary files (mostly an issue on VMS).

						   This operation notifies other processes by:
						   1) Using a compswap lock with builtin memory barriers so other
						      processors know the memory state change.
						   2) Processes obtaining the lock after we release it will do their
						      own memory barrier operation and see the change.
						   3) By grabbing the lock, we are assured that anyone else getting the
						      lock after us will also be checking the errno flag AFTER getting the
						      lock (see backup_buffer_flush()) and see no flush is necessary.
						*/
						if (!(had_lock = shmpool_lock_held_by_us(ptr->reg)))
							shmpool_lock_hdr(ptr->reg);

						if (backup_interrupted && 0 == csa->shmpool_buffer->backup_errno)
							/* Needs a non-zero value to stop the backup */
							csa->shmpool_buffer->backup_errno = ERR_FORCEDHALT;
						if (!had_lock)
							shmpool_unlock_hdr(ptr->reg);
					}
					/* get rid of the temporary file */
					if (ptr->backup_fd > 2)
					{
						CLOSEFILE_RESET(ptr->backup_fd, rc);	/* resets "ptr" to FD_INVALID */
						UNLINK(ptr->backup_tempfile);
					}
				} else	/* defreeze the databases */
					region_freeze(ptr->reg, FALSE, FALSE, FALSE);
			}
			ptr = ptr->fPtr;
		}

		/* Intentional fall through */
	case need_to_free_space:
		for (ptr = (backup_reg_list *)grlist; ptr != NULL;)
		{
			next = ptr->fPtr;
			if (keep_going != ptr->not_this_time)
				error_mupip = TRUE;
			if (NULL != ptr->backup_file.addr)
				free(ptr->backup_file.addr);
			free(ptr);
			ptr = next;
		}
	}
	/* Release FTOK lock on the replication instance file if holding it */
	assert((NULL == jnlpool.jnlpool_dummy_reg) || (NULL != mu_repl_inst_reg_list) || jnlpool_init_needed);
	if ((NULL != mu_repl_inst_reg_list) && (NULL != jnlpool.jnlpool_dummy_reg) && jnlpool.jnlpool_dummy_reg->open)
	{
		udi = FILE_INFO(jnlpool.jnlpool_dummy_reg);
		assert(NULL != udi);
		if (NULL != udi)
		{	/* See gv_rundown.c comment for why ftok_sem_release 2nd parameter is FALSE below */
			if (udi->grabbed_ftok_sem)
				ftok_sem_release(jnlpool.jnlpool_dummy_reg, FALSE, TRUE);
			assert(!udi->grabbed_ftok_sem);
		}
	}
	return;
}
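
The switch in mubclnup() depends on deliberate fall-through so that entering at a later stage also performs every earlier stage's cleanup. A minimal sketch of that idiom with hypothetical stage names and cleanup actions (not the actual MUPIP stages):

#include <stdio.h>

/* Hypothetical cleanup stages, most-recently-acquired resource first (mirrors mubclnup's ordering). */
typedef enum { NEED_TO_REL_LOCK, NEED_TO_DEL_TEMPFILE, NEED_TO_FREE_SPACE } clnup_stage;

static void cleanup(clnup_stage stage)
{
	switch (stage)
	{
	case NEED_TO_REL_LOCK:
		printf("releasing lock\n");
		/* intentional fall-through */
	case NEED_TO_DEL_TEMPFILE:
		printf("deleting temporary file\n");
		/* intentional fall-through */
	case NEED_TO_FREE_SPACE:
		printf("freeing allocated memory\n");
	}
}

int main(void)
{
	cleanup(NEED_TO_DEL_TEMPFILE);	/* runs the tempfile and memory steps, but not the lock step */
	return 0;
}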
Example #4
void	lke_show(void)
{
	bool			locks, all = TRUE, wait = TRUE, interactive = FALSE, match = FALSE, memory = TRUE, nocrit = TRUE;
	boolean_t		exact = FALSE, was_crit;
	int4			pid;
	size_t			ls_len;
	int			n;
	char 			regbuf[MAX_RN_LEN], nodebuf[32], one_lockbuf[MAX_KEY_SZ];
	mlk_ctldata_ptr_t	ctl;
	mstr			reg, node, one_lock;

	error_def(ERR_UNIMPLOP);
	error_def(ERR_TEXT);

	/* Get all command parameters */
	reg.addr = regbuf;
	reg.len = SIZEOF(regbuf);
	node.addr = nodebuf;
	node.len = SIZEOF(nodebuf);
	one_lock.addr = one_lockbuf;
	one_lock.len = SIZEOF(one_lockbuf);

	if (lke_getcli(&all, &wait, &interactive, &pid, &reg, &node, &one_lock, &memory, &nocrit, &exact) == 0)
		return;

	/* Search all regions specified on the command line */
	for (gv_cur_region = gd_header->regions, n = 0; n != gd_header->n_regions; ++gv_cur_region, ++n)
	{
		/* If region matches and is open */
		if ((reg.len == 0  ||
		     gv_cur_region->rname_len == reg.len  &&  memcmp(gv_cur_region->rname, reg.addr, reg.len) == 0)  &&
		    gv_cur_region->open)
		{
			match = TRUE;
			util_out_print("!/!AD!/", NOFLUSH, REG_LEN_STR(gv_cur_region));

			/* If distributed database, the region is located on another node */
			if (gv_cur_region->dyn.addr->acc_meth == dba_cm)
			{
#				if defined(LKE_WORKS_OK_WITH_CM)
				/* Obtain lock info from the remote node */
				locks = gtcmtr_lke_showreq(gv_cur_region->dyn.addr->cm_blk, gv_cur_region->cmx_regnum,
							   all, wait, pid, &node);
#				else
				gtm_putmsg(VARLSTCNT(10) ERR_UNIMPLOP, 0, ERR_TEXT, 2,
						LEN_AND_LIT("GT.CM region - locks must be displayed on the local node"),
						ERR_TEXT, 2, REG_LEN_STR(gv_cur_region));
				continue;
#				endif
			} else if (gv_cur_region->dyn.addr->acc_meth == dba_bg  || gv_cur_region->dyn.addr->acc_meth == dba_mm)
			{	/* Local region */
				cs_addrs = &FILE_INFO(gv_cur_region)->s_addrs;
				ls_len = (size_t)(cs_addrs->lock_addrs[1] - cs_addrs->lock_addrs[0]);
				ctl = (mlk_ctldata_ptr_t)malloc(ls_len);
				/* Prevent any modification of the lock space while we make a local copy of it */
				if (cs_addrs->critical != NULL)
					crash_count = cs_addrs->critical->crashcnt;
				was_crit = cs_addrs->now_crit;
				if (!nocrit && !was_crit)
					grab_crit(gv_cur_region);
				longcpy((uchar_ptr_t)ctl, (uchar_ptr_t)cs_addrs->lock_addrs[0], ls_len);
				if (!nocrit && !was_crit)
					rel_crit(gv_cur_region);
				locks = ctl->blkroot == 0 ?
						FALSE:
						lke_showtree(NULL, (mlk_shrblk_ptr_t)R2A(ctl->blkroot), all, wait, pid,
												one_lock, memory);
				free(ctl);
			} else
			{
				util_out_print(NULL, RESET);
				util_out_print("Region is not BG, MM, or CM", FLUSH);
				locks = TRUE;
			}
			if (!locks)
			{
				util_out_print(NULL, RESET);
				util_out_print("No locks were found in !AD", FLUSH, REG_LEN_STR(gv_cur_region));
			}
		}
	}

	if (!match  &&  reg.len != 0)
		rts_error(VARLSTCNT(4) ERR_NOREGION, 2, reg.len, reg.addr);

}
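
lke_show() holds crit only long enough to copy the lock space into a private buffer and inspects the copy after releasing crit. A small sketch of the same snapshot-under-lock pattern using a POSIX mutex; the shared structure and field names here are purely illustrative:

#include <pthread.h>
#include <stdio.h>
#include <string.h>

/* Purely illustrative shared state; GT.M copies the mlk_* lock space instead. */
typedef struct { int nlocks; char owner[16]; } lockspace_t;

static lockspace_t	shared_ls = { 3, "proc-1234" };
static pthread_mutex_t	ls_mutex = PTHREAD_MUTEX_INITIALIZER;

int main(void)
{
	lockspace_t snapshot;

	pthread_mutex_lock(&ls_mutex);		/* analogous to grab_crit() */
	memcpy(&snapshot, &shared_ls, sizeof(snapshot));
	pthread_mutex_unlock(&ls_mutex);	/* analogous to rel_crit() */

	/* All further inspection uses the private copy, so the lock is held only briefly. */
	printf("%d locks held, most recent owner %s\n", snapshot.nlocks, snapshot.owner);
	return 0;
}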
Example #5
cache_rec_ptr_t	db_csh_getn(block_id block)
{
	cache_rec_ptr_t		hdr, q0, start_cr, cr;
	bt_rec_ptr_t		bt;
	unsigned int		lcnt, ocnt;
	int			rip, max_ent, pass1, pass2, pass3;
	int4			flsh_trigger;
	uint4			first_r_epid, latest_r_epid;
	sgmnt_addrs		*csa;
	sgmnt_data_ptr_t	csd;
	srch_blk_status		*tp_srch_status;
	ht_ent_int4		*tabent;

	csa = cs_addrs;
	csd = csa->hdr;
	assert(csa->now_crit);
	assert(csa == &FILE_INFO(gv_cur_region)->s_addrs);
	max_ent = csd->n_bts;
	cr = (cache_rec_ptr_t)GDS_REL2ABS(csa->nl->cur_lru_cache_rec_off);
	hdr = csa->acc_meth.bg.cache_state->cache_array + (block % csd->bt_buckets);
	start_cr = csa->acc_meth.bg.cache_state->cache_array + csd->bt_buckets;
	pass1 = max_ent;	/* skip referred or dirty or read-into cache records */
	pass2 = 2 * max_ent;	/* skip referred cache records */
	pass3 = 3 * max_ent;	/* skip nothing */
	INCR_DB_CSH_COUNTER(csa, n_db_csh_getns, 1);
	DEFER_INTERRUPTS(INTRPT_IN_DB_CSH_GETN);
	for (lcnt = 0;  ; lcnt++)
	{
		if (lcnt > pass3)
		{
			BG_TRACE_PRO(wc_blocked_db_csh_getn_loopexceed);
			assert(FALSE);
			break;
		}
		cr++;
		if (cr == start_cr + max_ent)
			cr = start_cr;
		VMS_ONLY(
			if ((lcnt == pass1) || (lcnt == pass2))
				wcs_wtfini(gv_cur_region);
		)
		if (cr->refer && (lcnt < pass2))
		{	/* in passes 1 & 2, set refer to FALSE and skip; in the third pass attempt reuse even if TRUE == refer */
			cr->refer = FALSE;
			continue;
		}
		if (cr->in_cw_set || cr->in_tend)
		{	/* some process already has this pinned for reading and/or updating. skip it. */
			cr->refer = TRUE;
			continue;
		}
		if (CDB_STAGNATE <= t_tries || mu_reorg_process)
		{
			/* Prevent stepping on self when crit for entire transaction.
			 * This is done by looking up in sgm_info_ptr->blk_in_use and cw_stagnate for presence of the block.
			 * The following two hashtable lookups are not similar, since in TP, sgm_info_ptr->blks_in_use
			 * 	is updated to the latest cw_stagnate list of blocks only in "tp_hist".
			 * Also note that the lookup in sgm_info_ptr->blks_in_use reuses blocks that don't have cse's.
			 * This is to allow big-read TP transactions which may use up more than the available global buffers.
			 * There is one issue here in that a block that has been only read till now may be stepped upon here
			 *	but may later be needed for update. It is handled by updating the block's corresponding
			 *	entry in the set of histories (sgm_info_ptr->first_tp_hist[index] structure) to hold the
			 *	"cr" and "cycle" of the t_qread done for the block when it was intended to be changed for the
			 *	first time within the transaction since otherwise the transaction would restart due to a
			 *	cdb_sc_lostcr status. Note that "tn" (read_tn of the block) in the first_tp_hist will still
			 *	remain the "tn" when the block was first read within this transaction to ensure the block
			 *	hasn't been modified since the start of the transaction. Once we intend on changing the
			 *	block i.e. srch_blk_status->cse is non-NULL, we ensure in the code below not to step on it.
			 *	["tp_hist" is the routine that updates the "cr", "cycle" and "tn" of the block].
			 * Note that usually in a transaction the first_tp_hist[] structure holds the "cr", "cycle", and "tn"
			 *	of the first t_qread of the block within that transaction. The above is the only exception.
			 * Also note that for blocks in cw_stagnate (i.e. current TP mini-action), we don't reuse any of
			 *	them even if they don't have a cse. This is to ensure that the current action doesn't
			 *	encounter a restart due to cdb_sc_lostcr in "tp_hist" even in the fourth-retry.
			 */
			tp_srch_status = NULL;
			if (dollar_tlevel && (NULL != (tabent = lookup_hashtab_int4(sgm_info_ptr->blks_in_use, (uint4 *)&cr->blk)))
					&& (tp_srch_status = (srch_blk_status *)tabent->value) && (tp_srch_status->cse))
			{	/* this process is already using the block - skip it */
				cr->refer = TRUE;
				continue;
			}
			if (NULL != lookup_hashtab_int4(&cw_stagnate, (uint4 *)&cr->blk))
			{	/* this process is already using the block for the current gvcst_search - skip it */
				cr->refer = TRUE;
				continue;
			}
			if (NULL != tp_srch_status)
			{	/* About to reuse a buffer that is part of the read-set of the current TP transaction.
				 * Reset clue as otherwise the next global reference of that global will use an outofdate clue.
				 * Even though tp_srch_status is available after the sgm_info_ptr->blks_in_use hashtable check,
				 * we dont want to reset the clue in case the cw_stagnate hashtable check causes the same cr
				 * to be skipped from reuse. Hence the placement of this reset logic AFTER the cw_stagnate check.
				 */
				tp_srch_status->blk_target->clue.end = 0;
			}
		}
		if (cr->dirty)
		{	/* Note that in Unix, it is possible that we see a stale value of cr->dirty (possible if a
			 * concurrent "wcs_wtstart" has reset dirty to 0 but that update did not reach us yet). In this
			 * case the call to "wcs_get_space" below will do the necessary memory barrier instructions
			 * (through calls to "aswp") which will allow us to see the non-stale value of cr->dirty.
			 *
			 * It is also possible that cr->dirty is non-zero but < cr->flushed_dirty_tn. In this case, wcs_get_space
			 * done below will return FALSE forcing a cache-rebuild which will fix this situation.
			 *
			 * In VMS, another process cannot be concurrently resetting cr->dirty to 0 as the resetting routine
			 * is "wcs_wtfini" which is executed in crit which another process cannot be in as we are in crit now.
			 */
			if (gv_cur_region->read_only)
				continue;
			if (lcnt < pass1)
			{
				if (!csa->timer && (csa->nl->wcs_timers < 1))
					wcs_timer_start(gv_cur_region, FALSE);
				continue;
			}
			BG_TRACE_PRO(db_csh_getn_flush_dirty);
			if (FALSE == wcs_get_space(gv_cur_region, 0, cr))
			{	/* failed to flush it out - force a rebuild */
				BG_TRACE_PRO(wc_blocked_db_csh_getn_wcsstarvewrt);
				assert(csa->nl->wc_blocked); /* only reason we currently know why wcs_get_space could fail */
				assert(gtm_white_box_test_case_enabled);
				break;
			}
			assert(0 == cr->dirty);
		}
		UNIX_ONLY(
			/* the cache-record is not free for reuse until the write-latch value becomes LATCH_CLEAR.
			 * In VMS, resetting the write-latch value occurs in "wcs_wtfini" which is in CRIT, we are fine.
			 * In Unix, this resetting is done by "wcs_wtstart" which is out-of-crit. Therefore, we need to
			 * 	wait for this value to be LATCH_CLEAR before reusing this cache-record.
			 * Note that we are examining the write-latch-value without holding the interlock. It is ok to do
			 * 	this because the only two routines that modify the latch value are "bg_update" and
			 * 	"wcs_wtstart". The former cannot be concurrently executing because we are in crit.
			 * 	The latter will not update the latch value unless this cache-record is dirty. But in this
			 * 	case we would have most likely gone through the if (cr->dirty) check above. Most likely
			 * 	because there is one rare possibility where a concurrent "wcs_wtstart" has set cr->dirty
			 * 	to 0 but not yet cleared the latch. In that case we wait for the latch to be cleared.
			 * 	In all other cases, nobody is modifying the latch since when we got crit and therefore
			 * 	it is safe to observe the value of the latch without holding the interlock.
			 */
			if (LATCH_CLEAR != WRITE_LATCH_VAL(cr))
			{	/* possible if a concurrent "wcs_wtstart" has set cr->dirty to 0 but not yet
				 * cleared the latch. this should be very rare though.
				 */
				if (lcnt < pass2)
					continue; /* try to find some other cache-record to reuse until the 3rd pass */
				for (ocnt = 1; (MAXWRTLATCHWAIT >= ocnt) && (LATCH_CLEAR != WRITE_LATCH_VAL(cr)); ocnt++)
					wcs_sleep(SLEEP_WRTLATCHWAIT);	/* since it is a short lock, sleep the minimum */
				if (MAXWRTLATCHWAIT <= ocnt)
				{
					BG_TRACE_PRO(db_csh_getn_wrt_latch_stuck);
					assert(FALSE);
					continue;
				}
			}
		)
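
db_csh_getn() is essentially a second-chance (clock) replacement scan: the refer bit grants a record a reprieve during the first two passes, and the criteria relax as lcnt crosses pass1, pass2 and pass3. A toy sketch of such a relaxing multi-pass scan; the record fields are only loosely modeled on cache_rec and none of the real dirty-flush or latch handling is attempted:

#include <stdio.h>

#define NRECS 4

/* Toy cache record: only the fields needed to show the pass logic. */
typedef struct { int refer; int dirty; int blk; } rec_t;

/* Find a record to reuse: early iterations skip referred and dirty records (clearing the
 * refer bit as they go), later iterations skip only referred records, and the final pass
 * accepts anything.
 */
static int find_victim(rec_t *recs, int nrecs)
{
	int lcnt, idx, pass1 = nrecs, pass2 = 2 * nrecs, pass3 = 3 * nrecs;

	for (lcnt = 0; lcnt <= pass3; lcnt++)
	{
		idx = lcnt % nrecs;
		if (recs[idx].refer && (lcnt < pass2))
		{	/* give it a second chance; clear the bit so it is eligible next time around */
			recs[idx].refer = 0;
			continue;
		}
		if (recs[idx].dirty && (lcnt < pass1))
			continue;	/* prefer clean records early on */
		return idx;
	}
	return -1;	/* nothing reusable found */
}

int main(void)
{
	rec_t recs[NRECS] = { {1, 0, 10}, {1, 1, 20}, {1, 0, 30}, {1, 1, 40} };
	int victim = find_victim(recs, NRECS);

	printf("reuse slot %d (block %d)\n", victim, victim < 0 ? -1 : recs[victim].blk);
	return 0;
}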
Example #6
void db_auto_upgrade(gd_region *reg)
{
	/* detect uninitialized file header fields for this version of GT.M and do a mini auto-upgrade, initializing such fields
	 * to default values in the new GT.M version
	 */

	sgmnt_addrs		*csa;
	sgmnt_data_ptr_t	csd;

	assert(NULL != reg);
	if (NULL == reg)
		return;
	csa = &FILE_INFO(reg)->s_addrs;
	csd = csa->hdr;
	assert(NULL != csd);
	if (NULL == csd)
		return;

	if (0 == csd->mutex_spin_parms.mutex_hard_spin_count)
		csd->mutex_spin_parms.mutex_hard_spin_count = MUTEX_HARD_SPIN_COUNT;
	if (0 == csd->mutex_spin_parms.mutex_sleep_spin_count)
		csd->mutex_spin_parms.mutex_sleep_spin_count = MUTEX_SLEEP_SPIN_COUNT;
	/* zero is a legitimate value for csd->mutex_spin_parms.mutex_spin_sleep_mask; so can't detect if need re-initialization */
	INIT_NUM_CRIT_ENTRY_IF_NEEDED(csd);

	/* Auto upgrade based on minor database version number. This code currently only does auto upgrade and does not
	 * do auto downgrade although that certainly is possible to implement if necessary. For now, if the current version
	 * is at a lower level than the minor db version, we do nothing.
	 *
	 * Note the purpose of the minor_dbver field is so that some part of gtm (either runtime, or conversion utility) some
	 * time and several versions down the road from now knows by looking at this field what fields in the fileheader are
	 * valid so it is important that the minor db version be updated each time the fileheader is updated and this routine
	 * correspondingly updated. SE 5/2006.
	 */
	if (csd->minor_dbver < GDSMVCURR)
	{	/* In general, the method for adding new versions is:
		 * 1) If there are no automatic updates for this version, it is optional to add the version to the switch
		 *    statement below. Most versions to date (through V53000) are of this type.
		 * 2) Update (or add) a case for the previous version to update any necessary fields.
		 */
		if (!csd->opened_by_gtmv53 && !csd->db_got_to_v5_once)
		{
			csd->opened_by_gtmv53 = TRUE;
			/* This is a case of a database that has been used by a pre-V53 version of GT.M that did not contain
			 * the fix (C9H07-002873). At this point, the database might contain RECYCLED blocks that are a mix of
			 *	a) Those blocks that were RECYCLED at the time of the MUPIP UPGRADE from V4 to V5.
			 *	b) Those blocks that became RECYCLED due to M-kills in V5.
			 * It is only (a) that we have to mark as FREE as it might contain too-full v4 format blocks. But there
			 * is no way to distinguish the two. So we mark both (a) and (b) as FREE. This will mean no PBLKs written
			 * for (b) and hence no backward journal recovery possible to a point before the start of the REORG UPGRADE.
			 * We force a MUPIP REORG UPGRADE rerun (to mark RECYCLED blocks FREE) by setting fully_upgraded to FALSE.
			 * Note that this does not need to be done for databases created by a V5 version (C9I05-002987).
			 */
			if (MASTER_MAP_SIZE_V4 == csd->master_map_len)
			{
				csd->fully_upgraded = FALSE;
				csd->reorg_upgrd_dwngrd_restart_block = 0;	/* reorg upgrade should restart from block 0 */
				/* Ensure reorg_db_fmt_start_tn and desired_db_format_tn are set to different
				 * values so fresh reorg upgrade can set fully_upgraded to TRUE once it is done.
				 */
				csd->reorg_db_fmt_start_tn = 0;
				csd->desired_db_format_tn = 1;
			} else
				csd->db_got_to_v5_once = TRUE;	/* db was created by V5 so safe to set this */
		}
		/* When adding a new minor version, the following template should be maintained
		 * a) Remove the penultimate 'break'
		 * b) Remove the assert(FALSE) in the last case (most recent minor version)
		 * c) If there are any file header fields added in the new minor version, initialize the fields to default values
		 *    in the last case
		 * d) Add a new case with the new minor version
		 * e) Add assert(FALSE) and break (like it was before)
		 */
		switch(csd->minor_dbver)
		{	/* Note that handling for any fields introduced in a version will not go in the "switch-case" block
			 * of code introduced for the new version but will go in the PREVIOUS "switch-case" block.
			 */
			case GDSMV51000:		/* Multi-site replication available */
			case GDSMV52000:		/* Unicode */
			case GDSMV53000:		/* M-Itanium release */
				gvstats_rec_upgrade(csa); /* Move GVSTATS information to new place in file header */
			case GDSMV54002:
				/* GT.M V54002B introduced jnl_eov_tn for backward recovery */
				csd->jnl_eovtn = csd->trans_hist.curr_tn;
			case GDSMV54002B:
				/* GT.M V55000 introduced strm_reg_seqno, save_strm_reg_seqno, intrpt_recov_resync_strm_seqno
				 * AND obsoleted dualsite_resync_seqno. For new fields, we are guaranteed they are
				 * zero (in formerly unused sections of the file header) so no need for any initialization.
				 * For obsoleted fields, it would be good to clear them here so we dont run into issues later.
				 */
				UNIX_ONLY(csd->filler_seqno = 0;)	/* was "dualsite_resync_seqno" in pre-V55000 versions */
				/* In addition, V55000 introduced before_trunc_total_blks for MUPIP REORG -TRUNCATE.
				 * Since it is a new field no initialization necessary.
				 */
			case GDSMV55000:
				UNIX_ONLY(csd->freeze_on_fail = FALSE;)
				UNIX_ONLY(csd->span_node_absent = TRUE;)
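
The minor_dbver switch above relies on fall-through so a header at any older version picks up every later version's field initializations in order. A compact sketch of that upgrade ladder with hypothetical version numbers and fields (the real routine works on sgmnt_data and GDSMV* constants):

#include <stdio.h>

/* Hypothetical minor versions and header; db_auto_upgrade() works on sgmnt_data instead. */
enum minor_ver { V1 = 1, V2, V3, V_CURR = V3 };

typedef struct { enum minor_ver minor_dbver; int field_added_in_v2; int field_added_in_v3; } hdr_t;

static void auto_upgrade(hdr_t *hdr)
{
	if (hdr->minor_dbver >= V_CURR)
		return;		/* nothing to do; downgrades are not handled here */
	switch (hdr->minor_dbver)
	{
	case V1:
		hdr->field_added_in_v2 = 42;	/* default for the field V2 introduced */
		/* fall through */
	case V2:
		hdr->field_added_in_v3 = 7;	/* default for the field V3 introduced */
		/* fall through */
	default:
		break;
	}
	hdr->minor_dbver = V_CURR;	/* header now reflects the current layout */
}

int main(void)
{
	hdr_t hdr = { V1, 0, 0 };

	auto_upgrade(&hdr);
	printf("upgraded to minor version %d: v2 field %d, v3 field %d\n",
		hdr.minor_dbver, hdr.field_added_in_v2, hdr.field_added_in_v3);
	return 0;
}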
Example #7
error_def(ERR_FREEZEID);

freeze_status	region_freeze(gd_region *region, boolean_t freeze, boolean_t override, boolean_t wait_for_kip)
{
	uint4			freeze_id, sleep_counter;
	sgmnt_addrs		*csa;
	sgmnt_data_ptr_t	csd;
	now_t			now;                                            /* for GET_CUR_TIME macro */
	char			*time_ptr, time_str[CTIME_BEFORE_NL + 2];       /* for GET_CUR_TIME macro */
	boolean_t		was_crit;
	DCL_THREADGBL_ACCESS;

	SETUP_THREADGBL_ACCESS;
	freeze_id = FREEZE_ID;
	csa = &FILE_INFO(region)->s_addrs;
	csd = csa->hdr;
	if (freeze)
	{
		was_crit = csa->now_crit;
		if (!was_crit)
			grab_crit(region);	/* really need this to be sure in UNIX, shouldn't be frequent anyway */
		INCR_INHIBIT_KILLS(csa->nl);
		if (OWNERSHIP)
		{
			DECR_INHIBIT_KILLS(csa->nl);
			if (!was_crit)
				rel_crit(region);
			return REG_FREEZE_SUCCESS;
		}
		if (!override && csd->freeze)
Example #8
/* go after a specific number of buffers or a particular buffer */
bool	wcs_get_space(gd_region *reg, int needed, cache_rec *cr)
{
	unsigned int		lcnt, ocnt, status;
	sgmnt_addrs		*csa;
	sgmnt_data_ptr_t	csd;
	node_local_ptr_t        cnl;
	que_ent_ptr_t		base, q0;
	int4			dummy_errno;
	boolean_t		is_mm;

	assert((0 != needed) || (NULL != cr));
	csa = &(FILE_INFO(reg)->s_addrs);
	assert(csa == cs_addrs);
	csd = csa->hdr;
	is_mm = (dba_mm == csd->acc_meth);
	assert(is_mm || (dba_bg == csd->acc_meth));
	cnl = csa->nl;
	if (FALSE == csa->now_crit)
	{
		assert(0 != needed);	/* if needed == 0, then we should be in crit */
		for (lcnt = DIVIDE_ROUND_UP(needed, csd->n_wrt_per_flu);  0 < lcnt;  lcnt--)
			JNL_ENSURE_OPEN_WCS_WTSTART(csa, reg, 0, dummy_errno);
					/* a macro that ensures jnl is open, dclast's wcs_wtstart and checks for errors etc. */
		return TRUE;
	}
	if (FALSE == wcs_wtfini(reg))
		return FALSE;
	/* while calculating flush_trigger, the decrement should be at least 1 if the minimum allowed has not yet been reached */
	csd->flush_trigger = MAX(csd->flush_trigger - MAX(csd->flush_trigger/STEP_FACTOR, 1), MIN_FLUSH_TRIGGER(csd->n_bts));
	if (0 == needed)
	{
		if (!is_mm)
		{	/* If another process is concurrently finishing up phase2 of commit, wait for that to complete first. */
			if (cr->in_tend && !wcs_phase2_commit_wait(csa, cr))
				return FALSE;	/* assumption is that caller will set wc_blocked and trigger cache recovery */
		}
		for (lcnt = 1; (MAXGETSPACEWAIT > lcnt) && (0 != cr->dirty); lcnt++)
		{	/* We want to flush a specific cache-record. We speed up the wait by moving the dirty cache-record
			 * to the head of the active queue. But to do this, we need exclusive access to the active queue.
			 * The only other processes outside of crit that can be touching this concurrently are wcs_wtstart
			 * (which can remove entries from the queue) and bg_update_phase2 (which can add entries to the queue).
			 * In the case of writers, we can wait for those to complete (by setting cnl->wc_blocked to TRUE)
			 * and then play with the queue. But in the case of bg_update_phase2, it is not easily possible to
			 * do a similar wait so in this case we choose to do plain wcs_wtstart (which uses interlocked
			 * queue operations and hence can work well with concurrent bg_update_phase2) and wait until the
			 * cache record of interest becomes non-dirty. The consequence is we might wait a little longer than
			 * necessary but that is considered acceptable for now.
			 */
			/* Check if cache recovery is needed (could be set by another process in
			 * secshr_db_clnup finishing off a phase2 commit). If so, no point invoking
			 * wcs_wtstart as it will return right away. Instead return FALSE so
			 * cache-recovery can be triggered by the caller.
			 */
			if (cnl->wc_blocked)
			{
				assert(gtm_white_box_test_case_enabled);
				return FALSE;
			}
			if (!is_mm && cnl->wcs_phase2_commit_pidcnt)
			{
				JNL_ENSURE_OPEN_WCS_WTSTART(csa, reg, 0, dummy_errno);
					/* a macro that ensures jnl is open, dclast's wcs_wtstart and checks for errors etc. */
				wcs_sleep(lcnt);
			} else if (LATCH_CLEAR == WRITE_LATCH_VAL(cr))
			{
				SIGNAL_WRITERS_TO_STOP(cnl);	/* to stop all active writers */
				WAIT_FOR_WRITERS_TO_STOP(cnl, ocnt, MAXGETSPACEWAIT);
				if (MAXGETSPACEWAIT <= ocnt)
				{
					assert(FALSE);
					return FALSE;
				}
				if (LATCH_CLEAR == WRITE_LATCH_VAL(cr))
				{	/* Check if cache-record is part of the active queue. If so, then remove it from the
					 * tail of the active queue and move it to the head to try and speed up the flush.
					 * If not and if cr->dirty is non-zero, then the only way this is possible we know
					 * of is if a concurrent process encountered an error in the midst of commit in phase2
					 * of bg_update and finished the update but did not reinsert the cache-record in the
					 * active queue (see comment in secshr_db_clnup about why INSQ*I macros are not used
					 * in VMS). In this case, return FALSE as wcs_get_space cannot flush this cache-record.
					 * The caller will trigger appropriate error handling. We are guaranteed that cr cannot
					 * be part of the wip queue because WRITE_LATCH_VAL(cr) is LATCH_CLEAR (in wip queue it
					 * will be > LATCH_CLEAR).
					 */
					if (0 != cr->state_que.fl)
					{	/* We are about to play with the queues without using interlocks.
						 * Assert no one else could be concurrently playing with the queue.
						 */
						assert(!cnl->wcs_phase2_commit_pidcnt && !cnl->in_wtstart);
						base = &csa->acc_meth.bg.cache_state->cacheq_active;
						q0 = (que_ent_ptr_t)((sm_uc_ptr_t)&cr->state_que + cr->state_que.fl);
						shuffqth((que_ent_ptr_t)q0, (que_ent_ptr_t)base);
					} else if (cr->dirty)
					{
						assert(gtm_white_box_test_case_enabled);
						return FALSE;
					}
				}
				SIGNAL_WRITERS_TO_RESUME(cnl);
				JNL_ENSURE_OPEN_WCS_WTSTART(csa, reg, 0, dummy_errno);
					/* a macro that ensures jnl is open, dclast's wcs_wtstart and checks for errors etc. */
				wcs_sleep(lcnt);
			} else if ((0 == cr->iosb.cond) || (WRT_STRT_PNDNG == cr->iosb.cond))
			{
				JNL_ENSURE_OPEN_WCS_WTSTART(csa, reg, 0, dummy_errno);
					/* a macro that ensures jnl is open, dclast's wcs_wtstart and checks for errors etc. */
				wcs_sleep(lcnt);
			}
			if (FALSE == wcs_wtfini(reg))
				return FALSE;
		}
		if (0 == cr->dirty)
			return TRUE;
		assert(FALSE);
		return FALSE;
	}
	for (lcnt = 1; ((cnl->wc_in_free < needed) && (MAXGETSPACEWAIT > lcnt)); lcnt++)
	{
		DCLAST_WCS_WTSTART(reg, 0, dummy_errno); /* a macro that dclast's wcs_wtstart and checks for errors etc. */
		wcs_sleep(lcnt);
		if (FALSE == wcs_wtfini(reg))
			return FALSE;
	}
	if (cnl->wc_in_free < needed)
	{
		assert(FALSE);
		return FALSE;
	}
	return TRUE;
}
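
wcs_get_space() illustrates a recurring GT.M waiting idiom: retry a bounded number of times, sleep lcnt units per iteration so the wait grows with each attempt, and give up (returning FALSE so the caller can trigger recovery) once the cap is reached. A trimmed-down sketch of that bounded backoff loop with an arbitrary cap and a stubbed availability check:

#include <stdio.h>
#include <unistd.h>

#define MAX_WAIT 10	/* arbitrary cap for the sketch; GT.M uses MAXGETSPACEWAIT */

/* Stub for "is the resource available yet?"; the real code checks cnl->wc_in_free or cr->dirty. */
static int space_available(int attempt)
{
	return attempt >= 4;	/* pretend space frees up on the 4th try */
}

static int get_space(void)
{
	int lcnt;

	for (lcnt = 1; lcnt < MAX_WAIT; lcnt++)
	{
		if (space_available(lcnt))
			return 1;		/* success */
		usleep(lcnt * 1000);		/* linearly growing sleep, like wcs_sleep(lcnt) */
	}
	return 0;	/* give up; the caller decides how to recover (wcs_get_space returns FALSE here) */
}

int main(void)
{
	printf(get_space() ? "got space\n" : "timed out\n");
	return 0;
}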
Example #9
void preemptive_db_clnup(int preemptive_severe)
{
	sgmnt_addrs	*csa;
	sgm_info	*si;
	gd_region	*r_top, *reg;
	gd_addr		*addr_ptr;

	if (!dollar_tlevel && update_trans)
	{	/* It's possible we hit an error in the middle of an update, at which point we have
		 * a valid clue and non-NULL cse. However, this causes problems for subsequent
		 * transactions (see comment in t_begin). In particular we could end up pinning buffers
		 * unnecessarily. So clear the cse of any histories that may have been active during the update.
		 */
		CLEAR_CSE(gv_target);
		if ((NULL != gv_target) && (NULL != gv_target->gd_csa))
		{
			CLEAR_CSE(gv_target->gd_csa->dir_tree);
			GTMTRIG_ONLY(CLEAR_CSE(gv_target->gd_csa->hasht_tree));
		}
	}
	if (INVALID_GV_TARGET != reset_gv_target)
	{
		if (SUCCESS != preemptive_severe && INFO != preemptive_severe)
		{
			/* We know of a few cases in Unix where gv_target and gv_currkey could be out of sync at this point.
			 *   a) If we are inside trigger code which in turn does an update that does
			 *	reads of ^#t global and ends up in a restart. This restart would
			 *	in turn do a rts_error(TPRETRY) which would invoke mdb_condition_handler
			 *	that would in turn invoke preemptive_db_clnup which invokes this macro.
			 *	In this tp restart case though, it is ok for gv_target and gv_currkey
			 *	to be out of sync because they are going to be reset by tp_clean_up anyways.
			 *	So skip the dbg-only in-sync check.
			 *   b) If we are in gvtr_init reading the ^#t global and detect an error (e.g. TRIGINVCHSET)
			 *	gv_target after the reset would be pointing to a regular global whereas gv_currkey
			 *	would be pointing to ^#t. It is ok to be out-of-sync since in this case, we expect
			 *	mdb_condition_handler to be calling us. That has code to reset gv_currkey (and
			 *	cs_addrs/cs_data etc.) to reflect gv_target (i.e. get them back in sync).
			 * Therefore in Unix we pass SKIP_GVT_GVKEY_CHECK to skip the gvtarget/gvcurrkey out-of-sync check
			 * in RESET_GV_TARGET. In VMS we pass DO_GVT_GVKEY_CHECK as we dont yet know of an out-of-sync situation.
			 */
			RESET_GV_TARGET(UNIX_ONLY(SKIP_GVT_GVKEY_CHECK) VMS_ONLY(DO_GVT_GVKEY_CHECK));
		}
	}
	need_kip_incr = FALSE;	/* in case we got an error in t_end (e.g. GBLOFLOW), dont want this global variable to get
				 * carried over to the next non-TP transaction that this process does (e.g. inside an error trap).
				 */
	if (dollar_tlevel)
	{
		for (si = first_sgm_info;  si != NULL; si = si->next_sgm_info)
		{
			if (NULL != si->kip_csa)
			{
				csa = si->tp_csa;
				assert(si->tp_csa == si->kip_csa);
				CAREFUL_DECR_KIP(csa->hdr, csa, si->kip_csa);
			}
		}
	} else if (NULL != kip_csa && (NULL != kip_csa->hdr) && (NULL != kip_csa->nl))
		CAREFUL_DECR_KIP(kip_csa->hdr, kip_csa, kip_csa);
	if (IS_DSE_IMAGE)
	{	/* Release crit on any region that was obtained for the current erroring DSE operation.
		 * Take care NOT to release crits obtained by a previous CRIT -SEIZE command.
		 */
		for (addr_ptr = get_next_gdr(NULL); addr_ptr; addr_ptr = get_next_gdr(addr_ptr))
		{
			for (reg = addr_ptr->regions, r_top = reg + addr_ptr->n_regions; reg < r_top; reg++)
			{
				if (reg->open && !reg->was_open)
				{
					csa = &FILE_INFO(reg)->s_addrs;
					assert(csa->hold_onto_crit || !csa->dse_crit_seize_done);
					assert(!csa->hold_onto_crit || csa->now_crit);
					if (csa->now_crit && (!csa->hold_onto_crit || !csa->dse_crit_seize_done))
					{
						rel_crit(reg);
						csa->hold_onto_crit = FALSE;
						t_abort(reg, csa);	/* cancel mini-transaction if any in progress */
					}
				}
			}
		}
	}
}
Example #10
boolean_t mu_truncate(int4 truncate_percent)
{
	sgmnt_addrs		*csa;
	sgmnt_data_ptr_t 	csd;
	int			num_local_maps;
	int 			lmap_num, lmap_blk_num;
	int			bml_status, sigkill;
	int			save_errno;
	int			ftrunc_status;
	uint4			jnl_status;
	uint4			old_total, new_total;
	uint4			old_free, new_free;
	uint4			end_blocks;
	int4			blks_in_lmap, blk;
	gtm_uint64_t		before_trunc_file_size;
	off_t			trunc_file_size;
	off_t			padding;
	uchar_ptr_t		lmap_addr;
	boolean_t		was_crit;
	uint4			found_busy_blk;
	srch_blk_status		bmphist;
	srch_blk_status 	*blkhist;
	srch_hist		alt_hist;
	trans_num		curr_tn;
	blk_hdr_ptr_t		lmap_blk_hdr;
	block_id		*blkid_ptr;
	unix_db_info    	*udi;
	jnl_private_control	*jpc;
	jnl_buffer_ptr_t	jbp;
	char			*err_msg;
	intrpt_state_t		prev_intrpt_state;
	DCL_THREADGBL_ACCESS;

	SETUP_THREADGBL_ACCESS;
	csa = cs_addrs;
	csd = cs_data;
	if (dba_mm == csd->acc_meth)
	{
		gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(4) ERR_MUTRUNCNOTBG, 2, REG_LEN_STR(gv_cur_region));
		return TRUE;
	}
	if ((GDSVCURR != csd->desired_db_format) || (csd->blks_to_upgrd != 0))
	{
		gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(4) ERR_MUTRUNCNOV4, 2, REG_LEN_STR(gv_cur_region));
		return TRUE;
	}
	if (csa->ti->free_blocks < (truncate_percent * csa->ti->total_blks / 100))
	{
		gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(5) ERR_MUTRUNCNOSPACE, 3, REG_LEN_STR(gv_cur_region), truncate_percent);
		return TRUE;
	}
	/* already checked for parallel truncates on this region --- see mupip_reorg.c */
	gv_target = NULL;
	assert(csa->nl->trunc_pid == process_id);
	assert(dba_mm != csd->acc_meth);
	old_total = csa->ti->total_blks;
	old_free = csa->ti->free_blocks;
	sigkill = 0;
	found_busy_blk = 0;
	memset(&alt_hist, 0, SIZEOF(alt_hist)); /* null-initialize history */
	assert(csd->bplmap == BLKS_PER_LMAP);
	end_blocks = old_total % BLKS_PER_LMAP; /* blocks in the last lmap (first one we start scanning) */
	if (0 == end_blocks)
		end_blocks = BLKS_PER_LMAP;
	num_local_maps = DIVIDE_ROUND_UP(old_total, BLKS_PER_LMAP);
	/* ======================================== PHASE 1 ======================================== */
	for (lmap_num = num_local_maps - 1; (lmap_num > 0 && !found_busy_blk); lmap_num--)
	{
		if (mu_ctrly_occurred || mu_ctrlc_occurred)
			return TRUE;
		assert(csa->ti->total_blks >= old_total); /* otherwise, a concurrent truncate happened... */
		if (csa->ti->total_blks != old_total) /* Extend (likely called by mupip extend) -- don't truncate */
		{
			gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(5) ERR_MUTRUNCNOSPACE, 3, REG_LEN_STR(gv_cur_region),
					truncate_percent);
			return TRUE;
		}
		lmap_blk_num = lmap_num * BLKS_PER_LMAP;
		if (csa->nl->highest_lbm_with_busy_blk >= lmap_blk_num)
		{
			found_busy_blk = lmap_blk_num;
			break;
		}
		blks_in_lmap = (lmap_num == num_local_maps - 1) ? end_blocks : BLKS_PER_LMAP;
		/* Loop through non-bitmap blocks of this lmap, do recycled2free */
		DBGEHND((stdout, "DBG:: lmap_num = [%lu], lmap_blk_num = [%lu], blks_in_lmap = [%lu]\n",
			lmap_num, lmap_blk_num, blks_in_lmap));
		for (blk = 1; blk < blks_in_lmap && blk != -1 && !found_busy_blk;)
		{
			t_begin(ERR_MUTRUNCFAIL, UPDTRNS_DB_UPDATED_MASK);
			for (;;) /* retry loop for recycled to free transactions */
			{
				curr_tn = csd->trans_hist.curr_tn;
				/* Read the nth local bitmap into memory */
				bmphist.blk_num = lmap_blk_num;
				bmphist.buffaddr = t_qread(bmphist.blk_num, &bmphist.cycle, &bmphist.cr);
				lmap_blk_hdr = (blk_hdr_ptr_t)bmphist.buffaddr;
				if (!(bmphist.buffaddr) || (BM_SIZE(BLKS_PER_LMAP) != lmap_blk_hdr->bsiz))
				{ /* Could not read the block successfully. Retry. */
					t_retry((enum cdb_sc)rdfail_detail);
					continue;
				}
				lmap_addr = bmphist.buffaddr + SIZEOF(blk_hdr);
				/* starting from the hint (blk itself), find the first busy or recycled block */
				blk = bml_find_busy_recycled(blk, lmap_addr, blks_in_lmap, &bml_status);
				assert(blk < BLKS_PER_LMAP);
				if (blk == -1 || blk >= blks_in_lmap)
				{ /* done with this lmap, continue to next */
					t_abort(gv_cur_region, csa);
					break;
				}
				else if (BLK_BUSY == bml_status || csa->nl->highest_lbm_with_busy_blk >= lmap_blk_num)
				{ /* stop processing blocks... skip ahead to phase 2 */
					found_busy_blk = lmap_blk_num;
					t_abort(gv_cur_region, csa);
					break;
				}
				else if (BLK_RECYCLED == bml_status)
				{ /* Write PBLK records for recycled blocks only if before_image journaling is
				   * enabled. t_end() takes care of checking if journaling is enabled and
				   * writing PBLK record. We have to at least mark the recycled block as free.
				   */
					RESET_UPDATE_ARRAY;
					update_trans = UPDTRNS_DB_UPDATED_MASK;
					*((block_id *)update_array_ptr) = blk;
					update_array_ptr += SIZEOF(block_id);
					*(int *)update_array_ptr = 0;
					alt_hist.h[1].blk_num = 0;
					alt_hist.h[0].level = 0;
					alt_hist.h[0].cse = NULL;
					alt_hist.h[0].tn = curr_tn;
					alt_hist.h[0].blk_num = lmap_blk_num + blk;
					alt_hist.h[0].buffaddr = t_qread(alt_hist.h[0].blk_num,
							&alt_hist.h[0].cycle, &alt_hist.h[0].cr);
					if (!alt_hist.h[0].buffaddr)
					{
						t_retry((enum cdb_sc)rdfail_detail);
						continue;
					}
					if (!t_recycled2free(&alt_hist.h[0]))
					{
						t_retry(cdb_sc_lostbmlcr);
						continue;
					}
					t_write_map(&bmphist, (unsigned char *)update_array, curr_tn, 0);
					/* Set the opcode for INCTN record written by t_end() */
					inctn_opcode = inctn_blkmarkfree;
					if ((trans_num)0 == t_end(&alt_hist, NULL, TN_NOT_SPECIFIED))
						continue;
					/* block processed, scan from the next one */
					blk++;
					break;
				} else
				{
					assert(t_tries < CDB_STAGNATE);
					t_retry(cdb_sc_badbitmap);
					continue;
				}
			} /* END recycled2free retry loop */
		} /* END scanning blocks of this particular lmap */
		/* Write PBLK for the bitmap block, in case it hasn't been written i.e. t_end() was never called above */
		/* Do a transaction that just increments the bitmap block's tn so that t_end() can do its thing */
		DBGEHND((stdout, "DBG:: bitmap block inctn -- lmap_blk_num = [%lu]\n", lmap_blk_num));
		t_begin(ERR_MUTRUNCFAIL, UPDTRNS_DB_UPDATED_MASK);
		for (;;)
		{
			RESET_UPDATE_ARRAY;
			BLK_ADDR(blkid_ptr, SIZEOF(block_id), block_id);
			*blkid_ptr = 0;
			update_trans = UPDTRNS_DB_UPDATED_MASK;
			inctn_opcode = inctn_mu_reorg; /* inctn_mu_truncate */
			curr_tn = csd->trans_hist.curr_tn;
			blkhist = &alt_hist.h[0];
			blkhist->blk_num = lmap_blk_num;
			blkhist->tn = curr_tn;
			blkhist->cse = NULL; /* start afresh (do not use value from previous retry) */
			/* Read the nth local bitmap into memory */
			blkhist->buffaddr = t_qread(lmap_blk_num, (sm_int_ptr_t)&blkhist->cycle, &blkhist->cr);
			lmap_blk_hdr = (blk_hdr_ptr_t)blkhist->buffaddr;
			if (!(blkhist->buffaddr) || (BM_SIZE(BLKS_PER_LMAP) != lmap_blk_hdr->bsiz))
			{ /* Could not read the block successfully. Retry. */
				t_retry((enum cdb_sc)rdfail_detail);
				continue;
			}
			t_write_map(blkhist, (unsigned char *)blkid_ptr, curr_tn, 0);
			blkhist->blk_num = 0; /* create empty history for bitmap block */
			if ((trans_num)0 == t_end(&alt_hist, NULL, TN_NOT_SPECIFIED))
				continue;
			break;
		}
	} /* END scanning lmaps */
	/* ======================================== PHASE 2 ======================================== */
	assert(!csa->now_crit);
	for (;;)
	{ /* wait for FREEZE, we don't want to truncate a frozen database */
		grab_crit(gv_cur_region);
		if (!cs_data->freeze && !IS_REPL_INST_FROZEN)
			break;
		rel_crit(gv_cur_region);
		while (cs_data->freeze || IS_REPL_INST_FROZEN)
			hiber_start(1000);
	}
	assert(csa->nl->trunc_pid == process_id);
	/* Flush pending updates to disk. If this is not done, old updates can be flushed AFTER ftruncate, extending the file. */
	if (!wcs_flu(WCSFLU_FLUSH_HDR | WCSFLU_WRITE_EPOCH | WCSFLU_MSYNC_DB))
	{
		assert(FALSE);
		gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(6) ERR_BUFFLUFAILED, 4, LEN_AND_LIT("MUPIP REORG TRUNCATE"),
				DB_LEN_STR(gv_cur_region));
		rel_crit(gv_cur_region);
		return FALSE;
	}
	csa->nl->highest_lbm_with_busy_blk = MAX(found_busy_blk, csa->nl->highest_lbm_with_busy_blk);
	assert(IS_BITMAP_BLK(csa->nl->highest_lbm_with_busy_blk));
	new_total = MIN(old_total, csa->nl->highest_lbm_with_busy_blk + BLKS_PER_LMAP);
	if (mu_ctrly_occurred || mu_ctrlc_occurred)
	{
		rel_crit(gv_cur_region);
		return TRUE;
	} else if (csa->ti->total_blks != old_total || new_total == old_total)
	{
		assert(csa->ti->total_blks >= old_total); /* Better have been an extend, not a truncate... */
		gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(5) ERR_MUTRUNCNOSPACE, 3, REG_LEN_STR(gv_cur_region), truncate_percent);
		rel_crit(gv_cur_region);
		return TRUE;
	} else if (GDSVCURR != csd->desired_db_format || csd->blks_to_upgrd != 0 || !csd->fully_upgraded)
	{
		gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(4) ERR_MUTRUNCNOV4, 2, REG_LEN_STR(gv_cur_region));
		rel_crit(gv_cur_region);
		return TRUE;
	} else if (SNAPSHOTS_IN_PROG(csa->nl))
	{
		gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(4) ERR_MUTRUNCSSINPROG, 2, REG_LEN_STR(gv_cur_region));
		rel_crit(gv_cur_region);
		return TRUE;
	} else if (BACKUP_NOT_IN_PROGRESS != cs_addrs->nl->nbb)
	{
		gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(4) ERR_MUTRUNCBACKINPROG, 2, REG_LEN_STR(gv_cur_region));
		rel_crit(gv_cur_region);
		return TRUE;
	}
	DEFER_INTERRUPTS(INTRPT_IN_TRUNC, prev_intrpt_state);
	if (JNL_ENABLED(csa))
	{ /* Write JRT_TRUNC and INCTN records */
		if (!jgbl.dont_reset_gbl_jrec_time)
			SET_GBL_JREC_TIME;	/* needed before jnl_ensure_open as that can write jnl records */
		jpc = csa->jnl;
		jbp = jpc->jnl_buff;
		/* Before writing to jnlfile, adjust jgbl.gbl_jrec_time if needed to maintain time order
		 * of jnl records. This needs to be done BEFORE the jnl_ensure_open as that could write
		 * journal records (if it decides to switch to a new journal file).
		 */
		ADJUST_GBL_JREC_TIME(jgbl, jbp);
		jnl_status = jnl_ensure_open();
		if (SS_NORMAL != jnl_status)
			send_msg_csa(CSA_ARG(csa) VARLSTCNT(6) jnl_status, 4, JNL_LEN_STR(csd), DB_LEN_STR(gv_cur_region));
		else
		{
			if (0 == jpc->pini_addr)
				jnl_put_jrt_pini(csa);
			jnl_write_trunc_rec(csa, old_total, csa->ti->free_blocks, new_total);
			inctn_opcode = inctn_mu_reorg;
			jnl_write_inctn_rec(csa);
			jnl_status = jnl_flush(gv_cur_region);
			if (SS_NORMAL != jnl_status)
			{
				send_msg_csa(CSA_ARG(csa) VARLSTCNT(9) ERR_JNLFLUSH, 2, JNL_LEN_STR(csd),
					ERR_TEXT, 2, RTS_ERROR_TEXT("Error with journal flush during mu_truncate"),
					jnl_status);
				assert(NOJNL == jpc->channel); /* jnl file lost has been triggered */
			}
		}
	}
	/* Good to go ahead and REALLY truncate (reduce total_blks, clear cache_array, FTRUNCATE) */
	curr_tn = csa->ti->curr_tn;
	CHECK_TN(csa, csd, curr_tn);
	udi = FILE_INFO(gv_cur_region);
	/* Information used by recover_truncate to check if the file size and csa->ti->total_blks are INCONSISTENT */
	trunc_file_size = BLK_ZERO_OFF(csd) + ((off_t)csd->blk_size * new_total) + DISK_BLOCK_SIZE;
	csd->after_trunc_total_blks = new_total;
	csd->before_trunc_free_blocks = csa->ti->free_blocks;
	csd->before_trunc_total_blks = old_total; /* Flags interrupted truncate for recover_truncate */
	/* file size and total blocks: INCONSISTENT */
	csa->ti->total_blks = new_total;
	/* past the point of no return -- shared memory intact */
	assert(csa->ti->free_blocks >= DELTA_FREE_BLOCKS(old_total, new_total));
	csa->ti->free_blocks -= DELTA_FREE_BLOCKS(old_total, new_total);
	new_free = csa->ti->free_blocks;
	KILL_TRUNC_TEST(WBTEST_CRASH_TRUNCATE_1); /* 55 : Issue a kill -9 before 1st fsync */
	fileheader_sync(gv_cur_region);
	DB_FSYNC(gv_cur_region, udi, csa, db_fsync_in_prog, save_errno);
	CHECK_DBSYNC(gv_cur_region, save_errno);
	/* past the point of no return -- shared memory deleted */
	KILL_TRUNC_TEST(WBTEST_CRASH_TRUNCATE_2); /* 56 : Issue a kill -9 after 1st fsync */
	clear_cache_array(csa, csd, gv_cur_region, new_total, old_total);
	WRITE_EOF_BLOCK(gv_cur_region, csd, new_total, save_errno);
	if (0 != save_errno)
	{
		err_msg = (char *)STRERROR(errno);
		rts_error_csa(CSA_ARG(csa) VARLSTCNT(6) ERR_MUTRUNCERROR, 4, REG_LEN_STR(gv_cur_region), LEN_AND_STR(err_msg));
		return FALSE;
	}
	KILL_TRUNC_TEST(WBTEST_CRASH_TRUNCATE_3); /* 57 : Issue a kill -9 after reducing csa->ti->total_blks, before FTRUNCATE */
	/* Execute an ftruncate() and truncate the DB file
	 * ftruncate() is a SYSTEM CALL on almost all platforms (except SunOS)
	 * It ignores kill -9 signal till its operation is completed.
	 * So we can safely assume that the result of ftruncate() will be complete.
	 */
	FTRUNCATE(FILE_INFO(gv_cur_region)->fd, trunc_file_size, ftrunc_status);
	if (0 != ftrunc_status)
	{
		err_msg = (char *)STRERROR(errno);
		rts_error_csa(CSA_ARG(csa) VARLSTCNT(6) ERR_MUTRUNCERROR, 4, REG_LEN_STR(gv_cur_region), LEN_AND_STR(err_msg));
		/* should go through recover_truncate now, which will again try to FTRUNCATE */
		return FALSE;
	}
	/* file size and total blocks: CONSISTENT (shrunk) */
	KILL_TRUNC_TEST(WBTEST_CRASH_TRUNCATE_4); /* 58 : Issue a kill -9 after FTRUNCATE, before 2nd fsync */
	csa->nl->root_search_cycle++;	/* Force concurrent processes to restart in t_end/tp_tend to make sure no one
					 * tries to commit updates past the end of the file. Bitmap validations together
					 * with highest_lbm_with_busy_blk should actually be sufficient, so this is
					 * just to be safe.
					 */
	csd->before_trunc_total_blks = 0; /* indicate CONSISTENT */
	/* Increment TN */
	assert(csa->ti->early_tn == csa->ti->curr_tn);
	csd->trans_hist.early_tn = csd->trans_hist.curr_tn + 1;
	INCREMENT_CURR_TN(csd);
	fileheader_sync(gv_cur_region);
	DB_FSYNC(gv_cur_region, udi, csa, db_fsync_in_prog, save_errno);
	KILL_TRUNC_TEST(WBTEST_CRASH_TRUNCATE_5); /* 58 : Issue a kill -9 after 2nd fsync */
	CHECK_DBSYNC(gv_cur_region, save_errno);
	ENABLE_INTERRUPTS(INTRPT_IN_TRUNC, prev_intrpt_state);
	curr_tn = csa->ti->curr_tn;
	rel_crit(gv_cur_region);
	send_msg_csa(CSA_ARG(csa) VARLSTCNT(7) ERR_MUTRUNCSUCCESS, 5, DB_LEN_STR(gv_cur_region), old_total, new_total, &curr_tn);
	util_out_print("Truncated region: !AD. Reduced total blocks from [!UL] to [!UL]. Reduced free blocks from [!UL] to [!UL].",
					FLUSH, REG_LEN_STR(gv_cur_region), old_total, new_total, old_free, new_free);
	return TRUE;
} /* END of mu_truncate() */
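
The trunc_file_size computed before the FTRUNCATE above is plain arithmetic over the block layout: the fixed area before block 0, blk_size bytes per retained block, plus one trailing disk block for the EOF block. A sketch of that calculation with clearly hypothetical layout constants; GT.M derives the real offsets from the file header via BLK_ZERO_OFF and DISK_BLOCK_SIZE:

#include <stdio.h>
#include <sys/types.h>

/* Hypothetical layout values for illustration only. */
#define EXAMPLE_START_VBN_OFFSET	(512 * 1024)	/* bytes before block 0 (file header + master map) */
#define EXAMPLE_DISK_BLOCK_SIZE		512		/* trailing EOF block */

static off_t truncated_file_size(off_t blk_size, off_t new_total_blks)
{
	return EXAMPLE_START_VBN_OFFSET + blk_size * new_total_blks + EXAMPLE_DISK_BLOCK_SIZE;
}

int main(void)
{
	off_t size = truncated_file_size(4096, 100000);	/* 4KiB blocks, keep 100,000 blocks */

	printf("truncate to %lld bytes\n", (long long)size);
	return 0;
}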
Example #11
void dse_f_reg(void)
{
	char rn[MAX_RN_LEN];
	unsigned short rnlen;
	int i;
	bool found;
	gd_region *ptr;
	gd_addr *temp_gdaddr;
	gd_binding *map;

	temp_gdaddr = gd_header;
	gd_header = original_header;
	rnlen = SIZEOF(rn);
	if (!cli_get_str("REGION",rn,&rnlen))
	{
		gd_header = temp_gdaddr;
		return;
	}
	if (rn[0] == '*' && rnlen == 1)
	{
		util_out_print("List of global directory:!_!AD!/",TRUE,dollar_zgbldir.str.len,dollar_zgbldir.str.addr);
		for (i = 0, ptr = gd_header->regions; i < gd_header->n_regions; i++, ptr++)
		{
			util_out_print("!/File  !_!AD", TRUE, ptr->dyn.addr->fname_len, &ptr->dyn.addr->fname[0]);
			util_out_print("Region!_!AD", TRUE, REG_LEN_STR(ptr));
		}
		gd_header = temp_gdaddr;
		return;
	}
	assert(rn[0]);
	found = FALSE;
	for (i = 0, ptr = gd_header->regions; i < gd_header->n_regions; i++, ptr++)
	{
		if (found = !memcmp(&ptr->rname[0],&rn[0],MAX_RN_LEN))
			break;
	}
	if (!found)
	{
		util_out_print("Error:  region not found.",TRUE);
		gd_header = temp_gdaddr;
		return;
	}
	if (ptr == gv_cur_region)
	{
		util_out_print("Error:  already in region: !AD",TRUE,REG_LEN_STR(gv_cur_region));
		gd_header = temp_gdaddr;
		return;
	}
	if (ptr->dyn.addr->acc_meth == dba_cm)
	{
		util_out_print("Error:  Cannot edit an GT.CM database file.",TRUE);
		gd_header = temp_gdaddr;
		return;
	}
	if (ptr->dyn.addr->acc_meth == dba_usr)
	{
		util_out_print("Error:  Cannot edit a non-GDS format database file.",TRUE);
		gd_header = temp_gdaddr;
		return;
	}
	if (!ptr->open)
	{
		util_out_print("Error:  that region was not opened because it is not bound to any namespace.",TRUE);
		gd_header = temp_gdaddr;
		return;
	}
	if (TRUE == cs_addrs->now_crit)
	{
		util_out_print("Warning:  now leaving region in critical section: !AD",TRUE, gv_cur_region->rname_len,
				gv_cur_region->rname);
	}
	gv_cur_region = ptr;
	gv_target = NULL;	/* to prevent out-of-sync situations between gv_target and cs_addrs */
	gv_currkey->base[0] = '\0';	/* prevent fast-path from op_gvname from being taken as region has been switched
					 * and gv_target has been reset to NULL.
					 */
	gv_currkey->end = 0;	/* clear end so it is in sync with base[0] */
	switch (gv_cur_region->dyn.addr->acc_meth)
	{
	case dba_mm:
	case dba_bg:
		cs_addrs = &FILE_INFO(gv_cur_region)->s_addrs;
		cs_data = cs_addrs->hdr;
		break;
	default:
		GTMASSERT;
	}
	if (cs_addrs && cs_addrs->critical)
		crash_count = cs_addrs->critical->crashcnt;
	util_out_print("!/File  !_!AD",TRUE, DB_LEN_STR(gv_cur_region));
	util_out_print("Region!_!AD!/",TRUE, REG_LEN_STR(gv_cur_region));
	patch_curr_blk = get_dir_root();
	gv_init_reg(gv_cur_region);
	GET_SAVED_GDADDR(gd_header, temp_gdaddr, map, gv_cur_region);
	return;
}
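dse_f_reg() above finds the target region with a plain linear scan over gd_header->regions, comparing the full fixed-length name buffer, and only switches to regions that are open and locally accessible. A stripped-down sketch of that lookup pattern follows; the struct, the MAX_RN_LEN value, and the field names are invented for illustration.

#include <string.h>

#define MAX_RN_LEN 16	/* assumed fixed length of a (padded) region name */

struct region
{
	char	rname[MAX_RN_LEN];	/* blank/NUL padded to MAX_RN_LEN */
	int	open;
};

/* Return the first open region whose padded name matches rn exactly, else NULL. */
static struct region *find_region(struct region *regs, int n_regions, const char rn[MAX_RN_LEN])
{
	int	i;

	for (i = 0; i < n_regions; i++)
		if (0 == memcmp(regs[i].rname, rn, MAX_RN_LEN))
			return regs[i].open ? &regs[i] : NULL;
	return NULL;
}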
Beispiel #12
0
int main(int argc, char *argv[])
{
	DCL_THREADGBL_ACCESS;

	GTM_THREADGBL_INIT;
	common_startup_init(DSE_IMAGE);
	licensed = TRUE;
	TREF(transform) = TRUE;
	TREF(no_spangbls) = TRUE;	/* dse operates on a per-region basis irrespective of global mapping in gld */
	TREF(skip_file_corrupt_check) = TRUE;	/* do not let csd->file_corrupt flag cause errors in dse */
	op_open_ptr = op_open;
	patch_curr_blk = get_dir_root();
	err_init(util_base_ch);
	UNICODE_ONLY(gtm_strToTitle_ptr = &gtm_strToTitle);
	GTM_ICU_INIT_IF_NEEDED;	/* Note: should be invoked after err_init (since it may error out) and before CLI parsing */
	sig_init(generic_signal_handler, dse_ctrlc_handler, suspsigs_handler, continue_handler);
	atexit(util_exit_handler);
	SET_LATCH_GLOBAL(&defer_latch, LOCK_AVAILABLE);
	stp_init(STP_INITSIZE);
	rts_stringpool = stringpool;
	getjobname();
	INVOKE_INIT_SECSHR_ADDRS;
	io_init(TRUE);
	getzdir();
	gtm_chk_dist(argv[0]);
	prealloc_gt_timers();
	gt_timers_add_safe_hndlrs();
	initialize_pattern_table();
	gvinit();
	region_init(FALSE);
	util_out_print("!/File  !_!AD", TRUE, DB_LEN_STR(gv_cur_region));
	util_out_print("Region!_!AD!/", TRUE, REG_LEN_STR(gv_cur_region));
	cli_lex_setup(argc, argv);
	/* Since DSE operates on a region-by-region basis (for the most part), do not use a global directory at all from now on */
	original_header = gd_header;
	gd_header = NULL;
	OPERATOR_LOG_MSG;
#	ifdef DEBUG
	if (gtm_white_box_test_case_enabled && (WBTEST_SEMTOOLONG_STACK_TRACE == gtm_white_box_test_case_number))
	{
		sgmnt_addrs     * csa;
		node_local_ptr_t cnl;
		csa = &FILE_INFO(gv_cur_region)->s_addrs;
		cnl = csa->nl;
		cnl->wbox_test_seq_num  = 1; /*Signal the first step and wait here*/
		/* The signal to the shell. MUPIP must not start BEFORE DSE */
		util_out_print("DSE is ready. MUPIP can start. Note: This message is a part of WBTEST_SEMTOOLONG_STACK_TRACE test. "
			       "It will not appear in PRO version.", TRUE);
		while (2 != cnl->wbox_test_seq_num) /*Wait for another process to get hold of the semaphore and signal next step*/
			LONG_SLEEP(1);
	}
#	endif
	if (argc < 2)
		display_prompt();
	while (1)
	{
		if (!dse_process(argc))
			break;
		display_prompt();
	}
	dse_exit();
	REVERT;
	return 0;
}
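The command loop at the end of main() above is a conventional prompt/dispatch loop: show the prompt only when interactive, process one command per iteration, and stop when the processor asks to. A self-contained sketch of that shape is shown below; the prompt text and the command handling are placeholders, not DSE's parser.

#include <stdio.h>
#include <string.h>

/* Placeholder command processor: returns 0 to request exit, nonzero to continue. */
static int process_line(const char *line)
{
	return (0 != strncmp(line, "quit", 4));
}

int main(int argc, char *argv[])
{
	char	line[256];
	int	interactive = (argc < 2);

	while (1)
	{
		if (interactive)
		{
			fputs("CMD> ", stdout);
			fflush(stdout);
		}
		if (NULL == fgets(line, sizeof(line), stdin))
			break;		/* EOF ends the loop like an explicit exit command */
		if (!process_line(line))
			break;
	}
	return 0;
}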
Beispiel #13
0
void gv_rundown(void)
{
	gd_region	*r_top, *r_save, *r_local;
	gd_addr		*addr_ptr;
	sgm_info	*si;
	int4		rundown_status = EXIT_NRM;			/* if gds_rundown went smoothly */
#	ifdef VMS
	vms_gds_info	*gds_info;
#	elif UNIX
	unix_db_info	*udi;
#	endif
#	if defined(DEBUG) && defined(UNIX)
	sgmnt_addrs		*csa;
#	endif
	DCL_THREADGBL_ACCESS;

	SETUP_THREADGBL_ACCESS;

	r_save = gv_cur_region;		/* Save for possible core dump */
	gvcmy_rundown();
	ENABLE_AST

	if (pool_init)
		rel_lock(jnlpool.jnlpool_dummy_reg);
	for (addr_ptr = get_next_gdr(NULL); addr_ptr; addr_ptr = get_next_gdr(addr_ptr))
	{
		for (r_local = addr_ptr->regions, r_top = r_local + addr_ptr->n_regions; r_local < r_top; r_local++)
		{
			if (r_local->open && !r_local->was_open && dba_cm != r_local->dyn.addr->acc_meth)
			{	/* Rundown has already occurred for GT.CM client regions through gvcmy_rundown() above.
			 	 * Hence the (dba_cm != ...) check in the if above. Note that for GT.CM client regions,
				 * region->open is TRUE although cs_addrs is NULL.
			 	 */
#				if defined(DEBUG) && defined(UNIX)
				if (is_jnlpool_creator && ANTICIPATORY_FREEZE_AVAILABLE && TREF(gtm_test_fake_enospc))
				{	/* Clear ENOSPC faking now that we are running down */
					csa = REG2CSA(r_local);
					if (csa->nl->fake_db_enospc || csa->nl->fake_jnl_enospc)
					{
						send_msg_csa(CSA_ARG(NULL) VARLSTCNT(8) ERR_TEXT, 2, DB_LEN_STR(r_local), ERR_TEXT,
							     2, LEN_AND_LIT("Resetting fake_db_enospc and fake_jnl_enospc"));
						csa->nl->fake_db_enospc = FALSE;
						csa->nl->fake_jnl_enospc = FALSE;
					}
				}
#				endif
				gv_cur_region = r_local;
				tp_change_reg();
				UNIX_ONLY(rundown_status |=) gds_rundown();

				/* Now that gds_rundown is done, free up the memory associated with the region.
				 * Ideally the following memory freeing code should go to gds_rundown, but
				 * GT.CM calls gds_rundown() and we want to reuse memory for GT.CM.
				 */
				if (NULL != cs_addrs)
				{
					if (NULL != cs_addrs->dir_tree)
						FREE_CSA_DIR_TREE(cs_addrs);
					if (cs_addrs->sgm_info_ptr)
					{
						si = cs_addrs->sgm_info_ptr;
						/* It is possible we got interrupted before initializing all fields of "si"
						 * completely so account for NULL values while freeing/releasing those fields.
						 */
						assert((si->tp_csa == cs_addrs) || (NULL == si->tp_csa));
						if (si->jnl_tail)
						{
							CAREFUL_FREEUP_BUDDY_LIST(si->format_buff_list);
							CAREFUL_FREEUP_BUDDY_LIST(si->jnl_list);
						}
						CAREFUL_FREEUP_BUDDY_LIST(si->recompute_list);
						CAREFUL_FREEUP_BUDDY_LIST(si->new_buff_list);
						CAREFUL_FREEUP_BUDDY_LIST(si->tlvl_info_list);
						CAREFUL_FREEUP_BUDDY_LIST(si->tlvl_cw_set_list);
						CAREFUL_FREEUP_BUDDY_LIST(si->cw_set_list);
						if (NULL != si->blks_in_use)
						{
							free_hashtab_int4(si->blks_in_use);
							free(si->blks_in_use);
							si->blks_in_use = NULL;
						}
						if (si->cr_array_size)
						{
							assert(NULL != si->cr_array);
							if (NULL != si->cr_array)
								free(si->cr_array);
						}
						if (NULL != si->first_tp_hist)
							free(si->first_tp_hist);
						free(si);
					}
					if (cs_addrs->jnl)
					{
						assert(&FILE_INFO(cs_addrs->jnl->region)->s_addrs == cs_addrs);
						if (cs_addrs->jnl->jnllsb)
						{
							UNIX_ONLY(assert(FALSE));
							free(cs_addrs->jnl->jnllsb);
						}
						free(cs_addrs->jnl);
					}
					GTMCRYPT_ONLY(
						if (cs_addrs->encrypted_blk_contents)
							free(cs_addrs->encrypted_blk_contents);
					)
				}
				assert(gv_cur_region->dyn.addr->file_cntl->file_info);
				VMS_ONLY(
					gds_info = (vms_gds_info *)gv_cur_region->dyn.addr->file_cntl->file_info;
					if (gds_info->xabpro)
						free(gds_info->xabpro);
					if (gds_info->xabfhc)
						free(gds_info->xabfhc);
					if (gds_info->nam)
					{
						free(gds_info->nam->nam$l_esa);
						free(gds_info->nam);
					}
					if (gds_info->fab)
						free(gds_info->fab);
				)
				free(gv_cur_region->dyn.addr->file_cntl->file_info);
				free(gv_cur_region->dyn.addr->file_cntl);
			}
			r_local->open = r_local->was_open = FALSE;
		}
	}
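The cleanup block above frees a structure whose members may only be partially initialized (the code notes it can be interrupted mid-init), so each member is tested before being released and only then is the container freed. A compact sketch of that defensive teardown follows; the struct and field names are invented, and note that free(NULL) is a no-op in ISO C, so the explicit checks mirror the original's style rather than being strictly required.

#include <stdlib.h>

/* Hypothetical per-region bookkeeping with optionally-allocated members. */
struct sgm_info_sketch
{
	int	*blks_in_use;
	char	*cr_array;
	char	*first_tp_hist;
};

/* Free whatever was actually allocated; tolerate interruption mid-initialization. */
static void free_sgm_info(struct sgm_info_sketch *si)
{
	if (NULL == si)
		return;
	if (NULL != si->blks_in_use)
		free(si->blks_in_use);
	if (NULL != si->cr_array)
		free(si->cr_array);
	if (NULL != si->first_tp_hist)
		free(si->first_tp_hist);
	free(si);
}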
Beispiel #14
0
void jnl_fsync(gd_region *reg, uint4 fsync_addr)
{
	jnl_private_control	*jpc;
	jnl_buffer_ptr_t	jb;
	uint4			lcnt, saved_dsk_addr, saved_status;
	sgmnt_addrs		*csa;
	sgmnt_data_ptr_t	csd;
	int4			lck_state;
	int			fsync_ret, save_errno;

	error_def(ERR_JNLFSYNCERR);
	error_def(ERR_FSYNCTIMOUT);
	error_def(ERR_TEXT);
	error_def(ERR_JNLFRCDTERM);
	error_def(ERR_JNLFSYNCLSTCK);

	csa = &FILE_INFO(reg)->s_addrs;
	jpc = csa->jnl;
	jb  = jpc->jnl_buff;

	if ((NOJNL != jpc->channel) && !JNL_FILE_SWITCHED(jpc))
	{
		csd = csa->hdr;
		for (lcnt = 1; fsync_addr > jb->fsync_dskaddr && !JNL_FILE_SWITCHED(jpc); lcnt++)
		{
			if (MAX_FSYNC_WAIT_CNT / 2 == lcnt)	/* halfway into max patience */
			{
				saved_status = jpc->status;
				jpc->status = SS_NORMAL;
				jnl_send_oper(jpc, ERR_JNLFSYNCLSTCK);
				jpc->status = saved_status ;
			}
			if (MAX_FSYNC_WAIT_CNT == lcnt)	/* tried a long time */
			{
				saved_status = jpc->status;
				jpc->status = SS_NORMAL;
				jnl_send_oper(jpc, ERR_JNLFSYNCLSTCK);
				jpc->status = saved_status ;
				send_msg(VARLSTCNT(4) ERR_FSYNCTIMOUT, 2, JNL_LEN_STR(csd));
				GTMASSERT;
			}
			BG_TRACE_PRO_ANY(csa, n_jnl_fsync_tries);
			if (GET_SWAPLOCK(&jb->fsync_in_prog_latch))
				break;
			wcs_sleep(lcnt);
			performCASLatchCheck(&jb->fsync_in_prog_latch, lcnt);
		}
		if (fsync_addr > jb->fsync_dskaddr && !JNL_FILE_SWITCHED(jpc))
		{
			assert(process_id == jb->fsync_in_prog_latch.u.parts.latch_pid);  /* assert we have the lock */
			saved_dsk_addr = jb->dskaddr;
			if (jpc->sync_io)
			{
				/* We need to maintain the fsync control fields irrespective of the type of IO, because we might
				 * switch between these at any time.
				 */
				jb->fsync_dskaddr = saved_dsk_addr;
			} else
			{
				GTM_FSYNC(jpc->channel, fsync_ret);
				if (-1 == fsync_ret)
				{
					save_errno = errno;
					assert(FALSE);
					send_msg(VARLSTCNT(9) ERR_JNLFSYNCERR, 2, JNL_LEN_STR(csd),
						ERR_TEXT, 2, RTS_ERROR_TEXT("Error with fsync"), save_errno);
					rts_error(VARLSTCNT(9) ERR_JNLFSYNCERR, 2, JNL_LEN_STR(csd),
						ERR_TEXT, 2, RTS_ERROR_TEXT("Error with fsync"), save_errno);
				} else
				{
					jb->fsync_dskaddr = saved_dsk_addr;
					BG_TRACE_PRO_ANY(csa, n_jnl_fsyncs);
				}
			}
		}
		if (process_id == jb->fsync_in_prog_latch.u.parts.latch_pid)
			RELEASE_SWAPLOCK(&jb->fsync_in_prog_latch);
	}
	return;
}
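jnl_fsync() above serializes the fsync through a latch: each failed acquisition attempt sleeps a little longer, a diagnostic is emitted at the halfway point, and exhausting the limit is fatal. A small stand-alone sketch of that spin-with-backoff shape using C11 atomics follows; the wait limit, the latch type, and the backoff step are assumptions rather than the GT.M values.

#include <stdatomic.h>
#include <stdio.h>
#include <time.h>
#include <unistd.h>

#define MAX_WAIT_CNT 50		/* assumed patience limit */

/* Become the single process doing fsync on fd; others back off and retry. */
static int fsync_with_latch(int fd, atomic_int *latch)
{
	int		expected, lcnt, ret;
	struct timespec	ts;

	for (lcnt = 1; ; lcnt++)
	{
		expected = 0;
		if (atomic_compare_exchange_strong(latch, &expected, 1))
			break;			/* we now own the latch */
		if (MAX_WAIT_CNT / 2 == lcnt)
			fprintf(stderr, "still waiting for fsync latch\n");
		if (MAX_WAIT_CNT == lcnt)
			return -1;		/* give up; the caller decides how fatal that is */
		ts.tv_sec = 0;
		ts.tv_nsec = lcnt * 1000000L;	/* back off: sleep lcnt milliseconds */
		nanosleep(&ts, NULL);
	}
	ret = fsync(fd);
	atomic_store(latch, 0);			/* release the latch */
	return ret;
}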
Beispiel #15
0
void	lke_clear(void)
{
	bool		locks, all = TRUE, wait = FALSE, interactive = TRUE, match = FALSE, memory = FALSE, nocrit = FALSE;
	boolean_t	exact = TRUE, was_crit;
	int4		pid;
	int		n;
	char		regbuf[MAX_RN_LEN], nodebuf[32], one_lockbuf[MAX_KEY_SZ];
	mlk_ctldata_ptr_t	ctl;
	mstr		reg, node, one_lock;

	/* Get all command parameters */
	reg.addr = regbuf;
	reg.len = SIZEOF(regbuf);
	node.addr = nodebuf;
	node.len = SIZEOF(nodebuf);
	one_lock.addr = one_lockbuf;
	one_lock.len = SIZEOF(one_lockbuf);

	if (lke_getcli(&all, &wait, &interactive, &pid, &reg, &node, &one_lock, &memory, &nocrit, &exact) == 0)
		return;

	/* Search all regions specified on the command line */
	for (gv_cur_region = gd_header->regions, n = 0;
	     n != gd_header->n_regions;
	     ++gv_cur_region, ++n)
	{	/* If region matches and is open */
		if ((reg.len == 0  ||
		     gv_cur_region->rname_len == reg.len  &&  memcmp(gv_cur_region->rname, reg.addr, reg.len) == 0)  &&
		    gv_cur_region->open)
		{
			match = TRUE;
			util_out_print("!/!AD!/", NOFLUSH, REG_LEN_STR(gv_cur_region));
			/* If distributed database, the region is located on another node */
			if (gv_cur_region->dyn.addr->acc_meth == dba_cm)
			{
#				if defined(LKE_WORKS_OK_WITH_CM)
				/* Remote lock clears are not supported, so LKE CLEAR -EXACT qualifier
				 * will not be supported on GT.CM.*/
				locks = gtcmtr_lke_clearreq(gv_cur_region->dyn.addr->cm_blk, gv_cur_region->cmx_regnum,
							    all, interactive, pid, &node);
#				else
				gtm_putmsg(VARLSTCNT(10) ERR_UNIMPLOP, 0, ERR_TEXT, 2,
						LEN_AND_LIT("GT.CM region - locks must be cleared on the local node"),
						ERR_TEXT, 2, REG_LEN_STR(gv_cur_region));
				continue;
#				endif
			} else if ((dba_bg == gv_cur_region->dyn.addr->acc_meth) || (dba_mm == gv_cur_region->dyn.addr->acc_meth))
			{	/* Local region */
				cs_addrs = &FILE_INFO(gv_cur_region)->s_addrs;
				ctl = (mlk_ctldata_ptr_t)cs_addrs->lock_addrs[0];
				/* Prevent any modifications of locks while we are clearing */
				if (cs_addrs->critical != NULL)
					crash_count = cs_addrs->critical->crashcnt;
				was_crit = cs_addrs->now_crit;
				if (!was_crit)
					grab_crit(gv_cur_region);
				locks = ctl->blkroot == 0 ? FALSE
							  : lke_cleartree(gv_cur_region, NULL, ctl,
									 (mlk_shrblk_ptr_t)R2A(ctl->blkroot),
									  all, interactive, pid, one_lock, exact);
				if (!was_crit)
					rel_crit(gv_cur_region);
			} else
			{
				gtm_putmsg(VARLSTCNT(2) ERR_BADREGION, 0);
				locks = TRUE;
			}

			if (!locks)
			{
				gtm_putmsg(VARLSTCNT(4) ERR_NOLOCKMATCH, 2, REG_LEN_STR(gv_cur_region));
			}
		}
	}

	if (!match  &&  reg.len != 0)
		rts_error(VARLSTCNT(4) ERR_NOREGION, 2, reg.len, reg.addr);

}
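Several of the examples here, including lke_clear() above, share the same crit convention: remember whether the process already holds crit, grab it only if not, and release it only if this call was the one that grabbed it. A generic sketch of that conditional-acquire pattern is shown below with a pthread mutex standing in for crit; the real crit latch is of course not a pthread mutex, so this is purely illustrative.

#include <pthread.h>
#include <stdbool.h>

/* Hypothetical per-region state: a lock plus a flag tracking our ownership. */
struct region_state
{
	pthread_mutex_t	crit;
	bool		now_crit;	/* does this process currently hold crit? */
};

/* Run fn() under crit without double-locking when the caller already holds it. */
static void with_crit(struct region_state *r, void (*fn)(struct region_state *))
{
	bool	was_crit;

	was_crit = r->now_crit;
	if (!was_crit)
	{
		pthread_mutex_lock(&r->crit);
		r->now_crit = true;
	}
	fn(r);
	if (!was_crit)
	{
		r->now_crit = false;
		pthread_mutex_unlock(&r->crit);
	}
}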
Beispiel #16
0
int4 gds_rundown(void)
{
	boolean_t		canceled_dbsync_timer, canceled_flush_timer, ok_to_write_pfin;
	boolean_t		have_standalone_access, ipc_deleted, err_caught;
	boolean_t		is_cur_process_ss_initiator, remove_shm, vermismatch, we_are_last_user, we_are_last_writer, is_mm;
	boolean_t		unsafe_last_writer;
	char			time_str[CTIME_BEFORE_NL + 2]; /* for GET_CUR_TIME macro */
	gd_region		*reg;
	int			save_errno, status, rc;
	int4			semval, ftok_semval, sopcnt, ftok_sopcnt;
	short			crash_count;
	sm_long_t		munmap_len;
	sgmnt_addrs		*csa;
	sgmnt_data_ptr_t	csd;
	node_local_ptr_t	cnl;
	struct shmid_ds		shm_buf;
	struct sembuf		sop[2], ftok_sop[2];
	uint4           	jnl_status;
	unix_db_info		*udi;
	jnl_private_control	*jpc;
	jnl_buffer_ptr_t	jbp;
	shm_snapshot_t		*ss_shm_ptr;
	uint4			ss_pid, onln_rlbk_pid, holder_pid;
	boolean_t		was_crit;
	boolean_t		safe_mode; /* Do not flush or take down shared memory. */
	boolean_t		bypassed_ftok = FALSE, bypassed_access = FALSE, may_bypass_ftok, inst_is_frozen,
				ftok_counter_halted,
				access_counter_halted;
	int			secshrstat;
	intrpt_state_t		prev_intrpt_state;
	DCL_THREADGBL_ACCESS;

	SETUP_THREADGBL_ACCESS;
	jnl_status = 0;
	reg = gv_cur_region;			/* Local copy */

	/* early out for cluster regions
	 * to avoid tripping the assert below.
	 * Note:
	 *	This early out is consistent with VMS.  It has been
	 *	noted that all of the gtcm assignments
	 *	to gv_cur_region should use the TP_CHANGE_REG
	 *	macro.  This would also avoid the assert problem
	 *	and should be done eventually.
	 */
	if (dba_cm == reg->dyn.addr->acc_meth)
		return EXIT_NRM;

	udi = FILE_INFO(reg);
	csa = &udi->s_addrs;
	csd = csa->hdr;
	assert(csa == cs_addrs && csd == cs_data);
	if ((reg->open) && (dba_usr == csd->acc_meth))
	{
		change_reg();
		gvusr_rundown();
		return EXIT_NRM;
	}
	/* If the process has standalone access, it has udi->grabbed_access_sem set to TRUE at this point. Note that down in a local
	 * variable as the udi->grabbed_access_sem is set to TRUE even for non-standalone access below and hence we can't rely on
	 * that later to determine if the process had standalone access or not when it entered this function.  We need to guarantee
	 * that no one else accesses the database file header when semid/shmid fields are reset.  We already have created the
	 * ftok semaphore in db_init or mu_rndwn_file and did not remove it.  So just lock it. We do it in blocking mode.
	 */
	have_standalone_access = udi->grabbed_access_sem; /* process holds standalone access */
	DEFER_INTERRUPTS(INTRPT_IN_GDS_RUNDOWN, prev_intrpt_state);
	ESTABLISH_NORET(gds_rundown_ch, err_caught);
	if (err_caught)
	{
		REVERT;
		WITH_CH(gds_rundown_ch, gds_rundown_err_cleanup(have_standalone_access), 0);
		ENABLE_INTERRUPTS(INTRPT_IN_GDS_RUNDOWN, prev_intrpt_state);
		DEBUG_ONLY(ok_to_UNWIND_in_exit_handling = FALSE);
		return EXIT_ERR;
	}
	assert(reg->open);			/* if we failed to open, dbinit_ch should have taken care of proper clean up */
	assert(!reg->opening);			/* see comment above */
	assert((dba_bg == csd->acc_meth) || (dba_mm == csd->acc_meth));
	is_mm = (dba_bg != csd->acc_meth);
	assert(!csa->hold_onto_crit || (csa->now_crit && jgbl.onlnrlbk));
	/* If we are online rollback, we should already be holding crit and should release it only at the end of this module. This
	 * is usually done by noting down csa->now_crit in a local variable (was_crit) and using it whenever we are about to
	 * grab_crit. But, there are instances (like mupip_set_journal.c) where we grab_crit but invoke gds_rundown without any
	 * preceeding rel_crit. Such code relies on the fact that gds_rundown does rel_crit unconditionally (to get locks to a known
	 * state). So, augment csa->now_crit with jgbl.onlnrlbk to track if we can rel_crit unconditionally or not in gds_rundown.
	 */
	was_crit = (csa->now_crit && jgbl.onlnrlbk);
	/* Cancel any pending flush timer for this region by this task */
	canceled_flush_timer = FALSE;
	canceled_dbsync_timer = FALSE;
	CANCEL_DB_TIMERS(reg, csa, canceled_flush_timer, canceled_dbsync_timer);
	we_are_last_user = FALSE;
	inst_is_frozen = IS_REPL_INST_FROZEN && REPL_ALLOWED(csa->hdr);
	if (!csa->persistent_freeze)
		region_freeze(reg, FALSE, FALSE, FALSE);
	if (!was_crit)
	{
		rel_crit(reg);		/* get locks to known state */
		mutex_cleanup(reg);
	}
	/* The only process that can invoke gds_rundown while holding access control semaphore is RECOVER/ROLLBACK. All the others
	 * (like MUPIP SET -FILE/MUPIP EXTEND would have invoked db_ipcs_reset() before invoking gds_rundown (from
	 * mupip_exit_handler). The only exception is when these processes encounter a terminate signal and they reach
	 * mupip_exit_handler while holding access control semaphore. Assert accordingly.
	 */
	assert(!have_standalone_access || mupip_jnl_recover || process_exiting);
	/* If we have standalone access, then ensure that a concurrent online rollback cannot be running at the same time as it
	 * needs the access control lock as well. The only exception is when we are online rollback and currently running down.
	 */
	cnl = csa->nl;
	onln_rlbk_pid = cnl->onln_rlbk_pid;
	assert(!have_standalone_access || mupip_jnl_recover || !onln_rlbk_pid || !is_proc_alive(onln_rlbk_pid, 0));
	if (!have_standalone_access)
	{
		if (-1 == (ftok_semval = semctl(udi->ftok_semid, DB_COUNTER_SEM, GETVAL))) /* Check # of procs counted on FTOK */
		{
			save_errno = errno;
			assert(FALSE);
			rts_error_csa(CSA_ARG(csa) VARLSTCNT(12) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg), ERR_SYSCALL, 5,
				  RTS_ERROR_TEXT("gds_rundown SEMCTL failed to get ftok_semval"), CALLFROM, errno);
		}
		may_bypass_ftok = CAN_BYPASS(ftok_semval, csd, inst_is_frozen); /* Do we need a blocking wait? */
		/* We need to guarantee that no one else accesses the database file header when semid/shmid fields are reset.
		 * We already have created ftok semaphore in db_init or mu_rndwn_file and did not remove it. So just lock it.
		 */
		if (!ftok_sem_lock(reg, may_bypass_ftok))
		{
			if (may_bypass_ftok)
			{	/* We did a non-blocking wait. It's ok to proceed without locking */
				bypassed_ftok = TRUE;
				holder_pid = semctl(udi->ftok_semid, DB_CONTROL_SEM, GETPID);
				if ((uint4)-1 == holder_pid)
					rts_error_csa(CSA_ARG(csa) VARLSTCNT(12) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg),
							ERR_SYSCALL, 5,
							RTS_ERROR_TEXT("gds_rundown SEMCTL failed to get holder_pid"),
							CALLFROM, errno);
				if (!IS_GTM_IMAGE) /* MUMPS processes should not flood syslog with bypass messages. */
				{
					send_msg_csa(CSA_ARG(csa) VARLSTCNT(12) ERR_RESRCINTRLCKBYPAS, 10,
						 LEN_AND_STR(gtmImageNames[image_type].imageName), process_id, LEN_AND_LIT("FTOK"),
						 REG_LEN_STR(reg), DB_LEN_STR(reg), holder_pid);
					send_msg_csa(CSA_ARG(NULL) VARLSTCNT(4) ERR_TEXT, 2,
							LEN_AND_LIT("FTOK bypassed at rundown"));
				}
			} else
			{	/* We did a blocking wait but something bad happened. */
				FTOK_TRACE(csa, csa->ti->curr_tn, ftok_ops_lock, process_id);
				rts_error_csa(CSA_ARG(csa) VARLSTCNT(4) ERR_DBFILERR, 2, DB_LEN_STR(reg));
			}
		}
		sop[0].sem_num = DB_CONTROL_SEM; sop[0].sem_op = 0;	/* Wait for 0 */
		sop[1].sem_num = DB_CONTROL_SEM; sop[1].sem_op = 1;	/* Lock */
		sopcnt = 2;
		sop[0].sem_flg = sop[1].sem_flg = SEM_UNDO | IPC_NOWAIT; /* Don't wait the first time thru */
		SEMOP(udi->semid, sop, sopcnt, status, NO_WAIT);
		if (0 != status)
		{
			save_errno = errno;
			/* Check # of processes counted on access sem. */
			if (-1 == (semval = semctl(udi->semid, DB_COUNTER_SEM, GETVAL)))
			{
				assert(FALSE);
				rts_error_csa(CSA_ARG(csa) VARLSTCNT(12) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg), ERR_SYSCALL, 5,
					  RTS_ERROR_TEXT("gds_rundown SEMCTL failed to get semval"), CALLFROM, errno);
			}
			bypassed_access = CAN_BYPASS(semval, csd, inst_is_frozen) || onln_rlbk_pid || csd->file_corrupt;
			/* Before attempting again in the blocking mode, see if the holding process is an online rollback.
			 * If so, it is likely we won't get the access control semaphore anytime soon. In that case, we
			 * are better off skipping rundown and continuing with sanity cleanup and exit.
			 */
			holder_pid = semctl(udi->semid, DB_CONTROL_SEM, GETPID);
			if ((uint4)-1 == holder_pid)
				rts_error_csa(CSA_ARG(csa) VARLSTCNT(12) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg), ERR_SYSCALL, 5,
					  RTS_ERROR_TEXT("gds_rundown SEMCTL failed to get holder_pid"), CALLFROM, errno);
			if (!bypassed_access)
			{	/* We couldn't get it in one shot-- see if we already have it */
				if (holder_pid == process_id)
				{
					send_msg_csa(CSA_ARG(csa) VARLSTCNT(5) MAKE_MSG_INFO(ERR_CRITSEMFAIL), 2, DB_LEN_STR(reg),
							ERR_RNDWNSEMFAIL);
					REVERT;
					ENABLE_INTERRUPTS(INTRPT_IN_GDS_RUNDOWN, prev_intrpt_state);
					assert(FALSE);
					return EXIT_ERR;
				}
				if (EAGAIN != save_errno)
				{
					assert(FALSE);
					rts_error_csa(CSA_ARG(csa) VARLSTCNT(12) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg),
							ERR_SYSCALL, 5,
							RTS_ERROR_TEXT("gds_rundown SEMOP on access control semaphore"),
							CALLFROM, save_errno);
				}
				sop[0].sem_flg = sop[1].sem_flg = SEM_UNDO;	/* Try again - blocking this time */
				SEMOP(udi->semid, sop, 2, status, FORCED_WAIT);
				if (-1 == status)			/* We couldn't get it at all.. */
					rts_error_csa(CSA_ARG(csa) VARLSTCNT(12) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg),
							ERR_SYSCALL, 5,
							RTS_ERROR_TEXT("gds_rundown SEMOP on access control semaphore"),
							CALLFROM, errno);
			} else if (!IS_GTM_IMAGE)
			{
				send_msg_csa(CSA_ARG(csa) VARLSTCNT(12) ERR_RESRCINTRLCKBYPAS, 10,
						LEN_AND_STR(gtmImageNames[image_type].imageName), process_id,
						LEN_AND_LIT("access control"), REG_LEN_STR(reg), DB_LEN_STR(reg), holder_pid);
				send_msg_csa(CSA_ARG(NULL) VARLSTCNT(4) ERR_TEXT, 2,
						LEN_AND_LIT("Access control bypassed at rundown"));
			}
			udi->grabbed_access_sem = !bypassed_access;
		}
	} /* else we hold the access control semaphore and therefore have standalone access. We do not release it now - we
	   * release it later in mupip_exit_handler.c. Since we already hold the access control semaphore, we don't need the
	   * ftok semaphore and trying it could cause deadlock
	   */
	/* Note that in the case of online rollback, "udi->grabbed_access_sem" (and in turn "have_standalone_access") is TRUE.
	 * But there could be other processes still having the database open so we cannot safely reset the halted fields.
	 */
	if (have_standalone_access && !jgbl.onlnrlbk)
		csd->ftok_counter_halted = csd->access_counter_halted = FALSE;
	ftok_counter_halted = csd->ftok_counter_halted;
	access_counter_halted = csd->access_counter_halted;
	/* If we bypassed any of the semaphores, activate safe mode.
	 * Also, if the replication instance is frozen and this db has replication turned on (which means
	 * no flushes of dirty buffers to this db can happen while the instance is frozen) activate safe mode.
	 */
	ok_to_write_pfin = !(bypassed_access || bypassed_ftok || inst_is_frozen);
	safe_mode = !ok_to_write_pfin || ftok_counter_halted || access_counter_halted;
	/* At this point we are guaranteed no one else is doing a db_init/rundown as we hold the access control semaphore */
	assert(csa->ref_cnt);	/* decrement private ref_cnt before shared ref_cnt decrement. */
	csa->ref_cnt--;		/* Currently journaling logic in gds_rundown() in VMS relies on this order to detect last writer */
	assert(!csa->ref_cnt);
	--cnl->ref_cnt;
	if (memcmp(cnl->now_running, gtm_release_name, gtm_release_name_len + 1))
	{	/* VERMISMATCH condition. Possible only if DSE */
		assert(dse_running);
		vermismatch = TRUE;
	} else
		vermismatch = FALSE;
	if (-1 == shmctl(udi->shmid, IPC_STAT, &shm_buf))
	{
		save_errno = errno;
		rts_error_csa(CSA_ARG(csa) VARLSTCNT(12) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg), ERR_SYSCALL, 5,
				RTS_ERROR_TEXT("gds_rundown shmctl"), CALLFROM, save_errno);
	} else
		we_are_last_user =  (1 == shm_buf.shm_nattch) && !vermismatch && !safe_mode;
	/* recover => one user except ONLINE ROLLBACK, or standalone with frozen instance */
	assert(!have_standalone_access || we_are_last_user || jgbl.onlnrlbk || inst_is_frozen);
	if (-1 == (semval = semctl(udi->semid, DB_COUNTER_SEM, GETVAL)))
		rts_error_csa(CSA_ARG(csa) VARLSTCNT(12) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg), ERR_SYSCALL, 5,
			  RTS_ERROR_TEXT("gds_rundown SEMCTL failed to get semval"), CALLFROM, errno);
	/* There's one writer left and I am it */
	assert(reg->read_only || semval >= 0);
	unsafe_last_writer = (DB_COUNTER_SEM_INCR == semval) && (FALSE == reg->read_only) && !vermismatch;
	we_are_last_writer = unsafe_last_writer && !safe_mode;
	assert(!we_are_last_writer || !safe_mode);
	assert(!we_are_last_user || !safe_mode);
	/* recover + R/W region => one writer except ONLINE ROLLBACK, or standalone with frozen instance, leading to safe_mode */
	assert(!(have_standalone_access && !reg->read_only) || we_are_last_writer || jgbl.onlnrlbk || inst_is_frozen);
	GTM_WHITE_BOX_TEST(WBTEST_ANTIFREEZE_JNLCLOSE, we_are_last_writer, 1); /* Assume we are the last writer to invoke wcs_flu */
	if (!have_standalone_access && (-1 == (ftok_semval = semctl(udi->ftok_semid, DB_COUNTER_SEM, GETVAL))))
		rts_error_csa(CSA_ARG(csa) VARLSTCNT(12) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg), ERR_SYSCALL, 5,
			  RTS_ERROR_TEXT("gds_rundown SEMCTL failed to get ftok_semval"), CALLFROM, errno);
	if (NULL != csa->ss_ctx)
		ss_destroy_context(csa->ss_ctx);
	/* SS_MULTI: If multiple snapshots are supported, then we have to run through each of the snapshots */
	assert(1 == MAX_SNAPSHOTS);
	ss_shm_ptr = (shm_snapshot_ptr_t)SS_GETSTARTPTR(csa);
	ss_pid = ss_shm_ptr->ss_info.ss_pid;
	is_cur_process_ss_initiator = (process_id == ss_pid);
	if (ss_pid && (is_cur_process_ss_initiator || we_are_last_user))
	{
		/* Try getting snapshot crit latch. If we don't get latch, we won't hang for eternity and will skip
		 * doing the orphaned snapshot cleanup. It will be cleaned up eventually either by subsequent MUPIP
		 * INTEG or by a MUPIP RUNDOWN.
		 */
		if (ss_get_lock_nowait(reg) && (ss_pid == ss_shm_ptr->ss_info.ss_pid)
			&& (is_cur_process_ss_initiator || !is_proc_alive(ss_pid, 0)))
		{
			ss_release(NULL);
			ss_release_lock(reg);
		}
	}
	/* If cnl->donotflush_dbjnl is set, it means mupip recover/rollback was interrupted and therefore we need not flush
	 * shared memory contents to disk as they might be in an inconsistent state. Moreover, any more flushing will only cause
	 * future rollback to undo more journal records (PBLKs). In this case, we will go ahead and remove shared memory (without
	 * flushing the contents) in this routine. A reissue of the recover/rollback command will restore the database to a
	 * consistent state.
	 */
	if (!cnl->donotflush_dbjnl && !reg->read_only && !vermismatch)
	{	/* If we had an orphaned block and were interrupted, set wc_blocked so we can invoke wcs_recover. Do it ONLY
		 * if there is NO concurrent online rollback running (as we need crit to set wc_blocked)
		 */
		if (csa->wbuf_dqd && !is_mm)
		{	/* If we had an orphaned block and were interrupted, mupip_exit_handler will invoke secshr_db_clnup which
			 * will clear this field and so we should never come to gds_rundown with a non-zero wbuf_dqd. The only
			 * exception is if we are recover/rollback in which case gds_rundown (from mur_close_files) is invoked
			 * BEFORE secshr_db_clnup in mur_close_files.
			 * Note: It is NOT possible for online rollback to reach here with wbuf_dqd being non-zero. This is because
			 * the moment we apply the first PBLK, we stop all interrupts and hence can never be interrupted in
			 * wcs_wtstart or wcs_get_space. Assert accordingly.
			 */
			assert(mupip_jnl_recover && !jgbl.onlnrlbk && !safe_mode);
			if (!was_crit)
				grab_crit(reg);
			SET_TRACEABLE_VAR(cnl->wc_blocked, TRUE);
			BG_TRACE_PRO_ANY(csa, wcb_gds_rundown);
			send_msg_csa(CSA_ARG(csa) VARLSTCNT(8) ERR_WCBLOCKED, 6, LEN_AND_LIT("wcb_gds_rundown"),
				process_id, &csa->ti->curr_tn, DB_LEN_STR(reg));
			csa->wbuf_dqd = 0;
			wcs_recover(reg);
			BG_TRACE_PRO_ANY(csa, lost_block_recovery);
			if (!was_crit)
				rel_crit(reg);
		}
		if (JNL_ENABLED(csd) && IS_GTCM_GNP_SERVER_IMAGE)
			originator_prc_vec = NULL;
		/* If we are the last writing user, then everything must be flushed */
		if (we_are_last_writer)
		{	/* Time to flush out all of our buffers */
			assert(!safe_mode);
			if (is_mm)
			{
				MM_DBFILEXT_REMAP_IF_NEEDED(csa, reg);
				cnl->remove_shm = TRUE;
			}
			if (cnl->wc_blocked && jgbl.onlnrlbk)
			{	/* if the last update done by online rollback was not committed in the normal code-path but was
				 * completed by secshr_db_clnup, wc_blocked will be set to TRUE. But, since online rollback never
				 * invokes grab_crit (since csa->hold_onto_crit is set to TRUE), wcs_recover is never invoked. This
				 * could result in the last update never getting flushed to the disk and if online rollback happened
				 * to be the last writer then the shared memory will be flushed and removed and the last update will
				 * be lost. So, force wcs_recover if we find ourselves in such a situation. But, wc_blocked is
				 * possible only if phase1 or phase2 errors are induced using white box test cases
				 */
				assert(WB_COMMIT_ERR_ENABLED);
				wcs_recover(reg);
			}
			/* Note WCSFLU_SYNC_EPOCH ensures the epoch is synced to the journal and indirectly
			 * also ensures that the db is fsynced. We don't want to use it in the calls to
			 * wcs_flu() from t_end() and tp_tend() since we can defer it to out-of-crit there.
			 * In this case, since we are running down, we don't have any such option.
			 */
			cnl->remove_shm = wcs_flu(WCSFLU_FLUSH_HDR | WCSFLU_WRITE_EPOCH | WCSFLU_SYNC_EPOCH);
			/* Since we_are_last_writer, we should be guaranteed that wcs_flu() did not change csd, (in
			 * case of MM for potential file extension), even if it did a grab_crit().  Therefore, make
			 * sure that's true.
			 */
			assert(csd == csa->hdr);
			assert(0 == memcmp(csd->label, GDS_LABEL, GDS_LABEL_SZ - 1));
		} else if (((canceled_flush_timer && (0 > cnl->wcs_timers)) || canceled_dbsync_timer) && !inst_is_frozen)
		{	/* canceled pending db or jnl flush timers - flush database and journal buffers to disk */
			if (!was_crit)
				grab_crit(reg);
			/* we need to sync the epoch as the fact that there is no active pending flush timer implies
			 * there will be no one else who will flush the dirty buffers and EPOCH to disk in a timely fashion
			 */
			wcs_flu(WCSFLU_FLUSH_HDR | WCSFLU_WRITE_EPOCH | WCSFLU_SYNC_EPOCH);
			if (!was_crit)
				rel_crit(reg);
			assert((dba_mm == cs_data->acc_meth) || (csd == cs_data));
			csd = cs_data;	/* In case this is MM and wcs_flu() remapped an extended database, reset csd */
		}
		/* Do rundown journal processing after buffer flushes since they require jnl to be open */
		if (JNL_ENABLED(csd))
		{	/* the following tp_change_reg() is not needed due to the assert csa == cs_addrs at the beginning
			 * of gds_rundown(), but just to be safe. To be removed by 2002!! --- nars -- 2001/04/25.
			 */
			tp_change_reg();	/* call this because jnl_ensure_open checks cs_addrs rather than gv_cur_region */
			jpc = csa->jnl;
			jbp = jpc->jnl_buff;
			if (jbp->fsync_in_prog_latch.u.parts.latch_pid == process_id)
			{
				assert(FALSE);
				COMPSWAP_UNLOCK(&jbp->fsync_in_prog_latch, process_id, 0, LOCK_AVAILABLE, 0);
			}
			if (jbp->io_in_prog_latch.u.parts.latch_pid == process_id)
			{
				assert(FALSE);
				COMPSWAP_UNLOCK(&jbp->io_in_prog_latch, process_id, 0, LOCK_AVAILABLE, 0);
			}
			if ((((NOJNL != jpc->channel) && !JNL_FILE_SWITCHED(jpc))
				|| we_are_last_writer && (0 != cnl->jnl_file.u.inode)) && ok_to_write_pfin)
			{	/* We need to close the journal file cleanly if we have the latest generation journal file open
				 *	or if we are the last writer and the journal file is open in shared memory (not necessarily
				 *	by ourselves e.g. the only process that opened the journal got shot abnormally)
				 * Note: we should not infer anything from the shared memory value of cnl->jnl_file.u.inode
				 * 	if we are not the last writer as it can be concurrently updated.
				 */
				if (!was_crit)
					grab_crit(reg);
				if (JNL_ENABLED(csd))
				{
					SET_GBL_JREC_TIME; /* jnl_ensure_open/jnl_put_jrt_pini/pfin/jnl_file_close all need it */
					/* Before writing to jnlfile, adjust jgbl.gbl_jrec_time if needed to maintain time order
					 * of jnl records. This needs to be done BEFORE the jnl_ensure_open as that could write
					 * journal records (if it decides to switch to a new journal file).
					 */
					ADJUST_GBL_JREC_TIME(jgbl, jbp);
					jnl_status = jnl_ensure_open();
					if (0 == jnl_status)
					{	/* If we_are_last_writer, we would have already done a wcs_flu() which would
						 * have written an epoch record and we are guaranteed no further updates
						 * since we are the last writer. So, just close the journal.
						 * If the freeaddr == post_epoch_freeaddr, wcs_flu may have skipped writing
						 * a pini, so allow for that.
						 */
						assert(!jbp->before_images || is_mm
						    || !we_are_last_writer || (0 != jpc->pini_addr) || jgbl.mur_extract
						    || (jpc->jnl_buff->freeaddr == jpc->jnl_buff->post_epoch_freeaddr));
						/* If we haven't written a pini, let jnl_file_close write the pini/pfin. */
						if (!jgbl.mur_extract && (0 != jpc->pini_addr))
							jnl_put_jrt_pfin(csa);
						/* If not the last writer and no pending flush timer left, do jnl flush now */
						if (!we_are_last_writer && (0 > cnl->wcs_timers))
						{
							if (SS_NORMAL == (jnl_status = jnl_flush(reg)))
							{
								assert(jbp->freeaddr == jbp->dskaddr);
								jnl_fsync(reg, jbp->dskaddr);
								assert(jbp->fsync_dskaddr == jbp->dskaddr);
							} else
							{
								send_msg_csa(CSA_ARG(csa) VARLSTCNT(9) ERR_JNLFLUSH, 2,
									JNL_LEN_STR(csd), ERR_TEXT, 2,
									RTS_ERROR_TEXT("Error with journal flush in gds_rundown"),
									jnl_status);
								assert(NOJNL == jpc->channel);/* jnl file lost has been triggered */
								/* In this routine, all code that follows from here on does not
								 * assume anything about the journaling characteristics of this
								 * database so it is safe to continue execution even though
								 * journaling got closed in the middle.
								 */
							}
						}
						jnl_file_close(reg, we_are_last_writer, FALSE);
					} else
						send_msg_csa(CSA_ARG(csa) VARLSTCNT(6) jnl_status, 4, JNL_LEN_STR(csd),
								DB_LEN_STR(reg));
				}
				if (!was_crit)
					rel_crit(reg);
			}
		}
		if (we_are_last_writer)			/* Flush the fileheader last and harden the file to disk */
		{
			if (!was_crit)
				grab_crit(reg);			/* To satisfy crit requirement in fileheader_sync() */
			memset(csd->machine_name, 0, MAX_MCNAMELEN); /* clear the machine_name field */
			if (!have_standalone_access && we_are_last_user)
			{	/* mupip_exit_handler will do this after mur_close_file */
				csd->semid = INVALID_SEMID;
				csd->shmid = INVALID_SHMID;
				csd->gt_sem_ctime.ctime = 0;
				csd->gt_shm_ctime.ctime = 0;
			}
			fileheader_sync(reg);
			if (!was_crit)
				rel_crit(reg);
			if (!is_mm)
			{
				GTM_DB_FSYNC(csa, udi->fd, rc);		/* Sync it all */
				if (-1 == rc)
				{
					rts_error_csa(CSA_ARG(csa) VARLSTCNT(9) ERR_DBFILERR, 2, DB_LEN_STR(reg),
						  ERR_TEXT, 2, RTS_ERROR_TEXT("Error during file sync at close"), errno);
				}
			} else
			{	/* Now do final MM file sync before exit */
				assert(csa->ti->total_blks == csa->total_blks);
				#ifdef _AIX
				GTM_DB_FSYNC(csa, udi->fd, rc);
				if (-1 == rc)
				#else
				if (-1 == MSYNC((caddr_t)csa->db_addrs[0], (caddr_t)csa->db_addrs[1]))
				#endif
				{
					rts_error_csa(CSA_ARG(csa) VARLSTCNT(9) ERR_DBFILERR, 2, DB_LEN_STR(reg),
						  ERR_TEXT, 2, RTS_ERROR_TEXT("Error during file sync at close"), errno);
				}
			}
		} else if (unsafe_last_writer && !cnl->lastwriterbypas_msg_issued)
		{
			send_msg_csa(CSA_ARG(csa) VARLSTCNT(4) ERR_LASTWRITERBYPAS, 2, DB_LEN_STR(reg));
			cnl->lastwriterbypas_msg_issued = TRUE;
		}
	} /* end if (!reg->read_only && !cnl->donotflush_dbjnl) */
	/* We had canceled all db timers at start of rundown. In case as part of rundown (wcs_flu above), we had started
	 * any timers, cancel them BEFORE setting reg->open to FALSE (assert in wcs_clean_dbsync relies on this).
	 */
	CANCEL_DB_TIMERS(reg, csa, canceled_flush_timer, canceled_dbsync_timer);
	if (reg->read_only && we_are_last_user && !have_standalone_access && cnl->remove_shm)
	{	/* mupip_exit_handler will do this after mur_close_file */
		db_ipcs.semid = INVALID_SEMID;
		db_ipcs.shmid = INVALID_SHMID;
		db_ipcs.gt_sem_ctime = 0;
		db_ipcs.gt_shm_ctime = 0;
		db_ipcs.fn_len = reg->dyn.addr->fname_len;
		memcpy(db_ipcs.fn, reg->dyn.addr->fname, reg->dyn.addr->fname_len);
		db_ipcs.fn[reg->dyn.addr->fname_len] = 0;
		/* request gtmsecshr to flush. read_only cannot flush itself */
		WAIT_FOR_REPL_INST_UNFREEZE_SAFE(csa);
		if (!csa->read_only_fs)
		{
			secshrstat = send_mesg2gtmsecshr(FLUSH_DB_IPCS_INFO, 0, (char *)NULL, 0);
			if (0 != secshrstat)
				rts_error_csa(CSA_ARG(csa) VARLSTCNT(8) ERR_DBFILERR, 2, DB_LEN_STR(reg),
					  ERR_TEXT, 2, RTS_ERROR_TEXT("gtmsecshr failed to update database file header"));
		}
	}
	/* Done with file now, close it */
	CLOSEFILE_RESET(udi->fd, rc);	/* resets "udi->fd" to FD_INVALID */
	if (-1 == rc)
	{
		rts_error_csa(CSA_ARG(csa) VARLSTCNT(9) ERR_DBFILERR, 2, DB_LEN_STR(reg),
			  ERR_TEXT, 2, LEN_AND_LIT("Error during file close"), errno);
	}
	/* Unmap storage if mm mode but only the part that is not the fileheader (so shows up in dumps) */
#	if !defined(_AIX)
	if (is_mm && (NULL != csa->db_addrs[0]))
	{
		assert(csa->db_addrs[1] > csa->db_addrs[0]);
		munmap_len = (sm_long_t)(csa->db_addrs[1] - csa->db_addrs[0]);
		if (0 < munmap_len)
			munmap((caddr_t)(csa->db_addrs[0]), (size_t)(munmap_len));
	}
#	endif
	/* Detach our shared memory while still under lock so reference counts will be correct for the next process to run down
	 * this region. In the process also get the remove_shm status from node_local before detaching.
	 * If cnl->donotflush_dbjnl is TRUE, it means we can safely remove shared memory without compromising data
	 * integrity as a reissue of recover will restore the database to a consistent state.
	 */
	remove_shm = !vermismatch && (cnl->remove_shm || cnl->donotflush_dbjnl);
	/* We are done with online rollback on this region. Indicate to other processes by setting the onln_rlbk_pid to 0.
	 * Do it before releasing crit (t_end relies on this ordering when accessing cnl->onln_rlbk_pid).
	 */
	if (jgbl.onlnrlbk)
		cnl->onln_rlbk_pid = 0;
	rel_crit(reg); /* Since we are about to detach from the shared memory, release crit and reset onln_rlbk_pid */
	/* If we had skipped flushing journal and database buffers due to a concurrent online rollback, increment the counter
	 * indicating that in the shared memory so that online rollback can report the # of such processes when it shuts down.
	 * The same thing is done for both FTOK and access control semaphores when there are too many MUMPS processes.
	 */
	if (safe_mode) /* indicates flushing was skipped */
	{
		if (bypassed_access)
			cnl->dbrndwn_access_skip++; /* Access semaphore can be bypassed during online rollback */
		if (bypassed_ftok)
			cnl->dbrndwn_ftok_skip++;
	}
	if (jgbl.onlnrlbk)
		csa->hold_onto_crit = FALSE;
	GTM_WHITE_BOX_TEST(WBTEST_HOLD_SEM_BYPASS, cnl->wbox_test_seq_num, 0);
	status = shmdt((caddr_t)cnl);
	csa->nl = NULL; /* dereferencing nl after detach is not right, so we set it to NULL so that we can test before dereference*/
	/* Note that although csa->nl is NULL, we use CSA_ARG(csa) below (not CSA_ARG(NULL)) to be consistent with similar
	 * usages before csa->nl became NULL. The "is_anticipatory_freeze_needed" function (which is in turn called by the
	 * CHECK_IF_FREEZE_ON_ERROR_NEEDED macro) does a check of csa->nl before dereferencing shared memory contents so
	 * we are safe passing "csa".
	 */
	if (-1 == status)
		send_msg_csa(CSA_ARG(csa) VARLSTCNT(9) ERR_DBFILERR, 2, DB_LEN_STR(reg), ERR_TEXT, 2,
				LEN_AND_LIT("Error during shmdt"), errno);
	REMOVE_CSA_FROM_CSADDRSLIST(csa);	/* remove "csa" from list of open regions (cs_addrs_list) */
	reg->open = FALSE;
	/* If file is still not in good shape, die here and now before we get rid of our storage */
	assertpro(0 == csa->wbuf_dqd);
	ipc_deleted = FALSE;
	/* If we are the very last user, remove shared storage id and the semaphores */
	if (we_are_last_user)
	{	/* remove shared storage, only if last writer to rundown did a successful wcs_flu() */
		assert(!vermismatch);
		if (remove_shm)
		{
			ipc_deleted = TRUE;
			if (0 != shm_rmid(udi->shmid))
				rts_error_csa(CSA_ARG(csa) VARLSTCNT(8) ERR_DBFILERR, 2, DB_LEN_STR(reg),
					ERR_TEXT, 2, RTS_ERROR_TEXT("Unable to remove shared memory"));
			/* Note that we no longer have a new shared memory. Currently only used/usable for standalone rollback. */
			udi->new_shm = FALSE;
			/* mupip recover/rollback don't release the semaphore here, but do it later in db_ipcs_reset (invoked from
			 * mur_close_files())
			 */
			if (!have_standalone_access)
			{
				if (0 != sem_rmid(udi->semid))
					rts_error_csa(CSA_ARG(csa) VARLSTCNT(8) ERR_DBFILERR, 2, DB_LEN_STR(reg),
						      ERR_TEXT, 2, RTS_ERROR_TEXT("Unable to remove semaphore"));
				udi->new_sem = FALSE;			/* Note that we no longer have a new semaphore */
				udi->grabbed_access_sem = FALSE;
				udi->counter_acc_incremented = FALSE;
			}
		} else if (is_src_server || is_updproc)
		{
			gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(6) ERR_DBRNDWNWRN, 4, DB_LEN_STR(reg), process_id, process_id);
			send_msg_csa(CSA_ARG(csa) VARLSTCNT(6) ERR_DBRNDWNWRN, 4, DB_LEN_STR(reg), process_id, process_id);
		} else
			send_msg_csa(CSA_ARG(csa) VARLSTCNT(6) ERR_DBRNDWNWRN, 4, DB_LEN_STR(reg), process_id, process_id);
	} else
	{
		assert(!have_standalone_access || jgbl.onlnrlbk || safe_mode);
		if (!jgbl.onlnrlbk && !have_standalone_access)
		{ 	/* If we were writing, get rid of our writer access count semaphore */
			if (!reg->read_only)
			{
				if (!access_counter_halted)
				{
					save_errno = do_semop(udi->semid, DB_COUNTER_SEM, -DB_COUNTER_SEM_INCR, SEM_UNDO);
					if (0 != save_errno)
						rts_error_csa(CSA_ARG(csa) VARLSTCNT(12) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg),
								ERR_SYSCALL, 5,
								RTS_ERROR_TEXT("gds_rundown access control semaphore decrement"),
								CALLFROM, save_errno);
				}
				udi->counter_acc_incremented = FALSE;
			}
			assert(safe_mode || !bypassed_access);
			/* Now remove the rundown lock */
			if (!bypassed_access)
			{
				if (0 != (save_errno = do_semop(udi->semid, DB_CONTROL_SEM, -1, SEM_UNDO)))
					rts_error_csa(CSA_ARG(csa) VARLSTCNT(12) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg),
							ERR_SYSCALL, 5,
							RTS_ERROR_TEXT("gds_rundown access control semaphore release"),
							CALLFROM, save_errno);
				udi->grabbed_access_sem = FALSE;
			}
		} /* else access control semaphore will be released in db_ipcs_reset */
	}
	if (!have_standalone_access)
	{
		if (bypassed_ftok)
		{
			if (!ftok_counter_halted)
				if (0 != (save_errno = do_semop(udi->ftok_semid, DB_COUNTER_SEM, -DB_COUNTER_SEM_INCR, SEM_UNDO)))
					rts_error_csa(CSA_ARG(csa) VARLSTCNT(4) ERR_DBFILERR, 2, DB_LEN_STR(reg));
		} else if (!ftok_sem_release(reg, !ftok_counter_halted, FALSE))
		{
			FTOK_TRACE(csa, csa->ti->curr_tn, ftok_ops_release, process_id);
			rts_error_csa(CSA_ARG(csa) VARLSTCNT(4) ERR_DBFILERR, 2, DB_LEN_STR(reg));
		}
		udi->grabbed_ftok_sem = FALSE;
		udi->counter_ftok_incremented = FALSE;
	}
	ENABLE_INTERRUPTS(INTRPT_IN_GDS_RUNDOWN, prev_intrpt_state);
	if (!ipc_deleted)
	{
		GET_CUR_TIME(time_str);
		if (is_src_server)
			gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(8) ERR_IPCNOTDEL, 6, CTIME_BEFORE_NL, time_str,
				LEN_AND_LIT("Source server"), REG_LEN_STR(reg));
		if (is_updproc)
			gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(8) ERR_IPCNOTDEL, 6, CTIME_BEFORE_NL, time_str,
				LEN_AND_LIT("Update process"), REG_LEN_STR(reg));
		if (mupip_jnl_recover && (!jgbl.onlnrlbk || !we_are_last_user))
		{
			gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(8) ERR_IPCNOTDEL, 6, CTIME_BEFORE_NL, time_str,
				LEN_AND_LIT("Mupip journal process"), REG_LEN_STR(reg));
			send_msg_csa(CSA_ARG(csa) VARLSTCNT(8) ERR_IPCNOTDEL, 6, CTIME_BEFORE_NL, time_str,
				LEN_AND_LIT("Mupip journal process"), REG_LEN_STR(reg));
		}
	}
	REVERT;
	return EXIT_NRM;
}
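Two of the pivotal decisions in gds_rundown() above, we_are_last_user and we_are_last_writer, come from standard System V IPC queries: the shared memory attach count from shmctl(IPC_STAT) and the writer count from semctl(GETVAL) on a counter semaphore. The bare-bones sketch below shows just those two probes; the semaphore index and the per-writer increment are assumptions for illustration.

#include <sys/ipc.h>
#include <sys/sem.h>
#include <sys/shm.h>

#define COUNTER_SEM	1	/* assumed index of the counter semaphore in the set */
#define COUNTER_INCR	2	/* assumed increment each writer adds to that semaphore */

/* Report whether we appear to be the last attached process / the last writer.
 * Returns -1 if either IPC query fails. */
static int probe_last_user(int shmid, int semid, int *last_user, int *last_writer)
{
	struct shmid_ds	shm_buf;
	int		semval;

	if (-1 == shmctl(shmid, IPC_STAT, &shm_buf))
		return -1;
	*last_user = (1 == shm_buf.shm_nattch);		/* only our own attach remains */
	semval = semctl(semid, COUNTER_SEM, GETVAL);
	if (-1 == semval)
		return -1;
	*last_writer = (COUNTER_INCR == semval);	/* only our own increment remains */
	return 0;
}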
Beispiel #17
0
void mlk_wake_pending(mlk_ctldata_ptr_t ctl,
		      mlk_shrblk_ptr_t d,
		      gd_region *reg)
{
	mlk_prcblk_ptr_t	next, pr;
	sm_uint_ptr_t 		empty_slot, ctop;
	sgmnt_addrs		*csa;
	boolean_t		remote_pid;
	int 			crit_wake_res; /* also used in macro DO_CRIT_WAKE */
	int 			lcnt;

	csa = &FILE_INFO(reg)->s_addrs;
	if (!d->pending)
		return;
	ctl->wakeups++;
	/* Before updating d->sequence ensure there is no process owning this lock, since otherwise when the owner process attempts
	 * to release the lock it will fail as its private copy of "p->sequence" will not match the shared memory "d->sequence".
	 */
	assert(!d->owner);
	d->sequence = csa->hdr->trans_hist.lock_sequence++;	/* This node is being awakened (GTCM) */
	BG_TRACE_PRO_ANY(csa, mlock_wakeups);			/* Record halted slumbers */
	if (reg->dyn.addr->acc_meth == dba_bg &&
		csa->hdr->clustered)
	{
		remote_pid = FALSE;
		for (empty_slot = ctl->clus_pids,
			ctop = &ctl->clus_pids[NUM_CLST_LCKS-1];
			*empty_slot && empty_slot <= ctop; empty_slot++)
			;
		for (pr = (mlk_prcblk_ptr_t)R2A(d->pending), lcnt = csa->hdr->lock_space_size / PRC_FACTOR; lcnt; lcnt--)
		{
			next = (pr->next) ? (mlk_prcblk_ptr_t)R2A(pr->next) : 0;	/* in case it's deleted */
			if ((pr->process_id & NODENUMBER)  ==  (process_id & NODENUMBER))
			{
				DO_CRIT_WAKE;
			} else if (empty_slot <= ctop)
			{
				remote_pid = TRUE;
				*empty_slot = pr->process_id;
				empty_slot++;
			}
			if (next)
				pr = next;
			else
				break;
		}
		if (remote_pid)
			ccp_cluster_lock_wake(reg);
	} else
	{
		for (pr = (mlk_prcblk_ptr_t)R2A(d->pending), lcnt = csa->hdr->lock_space_size / PRC_FACTOR; lcnt; lcnt--)
		{
			next = (pr->next) ? (mlk_prcblk_ptr_t)R2A(pr->next) : 0;	/* in case it's deleted */
			DO_CRIT_WAKE;

			/* Wake one process to keep things orderly; if it loses its way, others
			 * will jump in after a timeout */
			if (GONE == crit_wake_res && next)
				pr = next;
			else
				break;
		}
	}
	if (!lcnt)
		GTMASSERT;
	return;
}
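mlk_wake_pending() above walks the pending queue through the R2A() macro: lock control blocks in shared memory link to each other with relative offsets rather than absolute pointers, since the segment may be mapped at a different address in every process. A toy sketch of that offset-based linking follows; the node layout and field names are invented, and the real R2A() resolves offsets stored in individual fields rather than a per-node next offset.

#include <stddef.h>
#include <stdio.h>

/* Node in a shared-memory list: 'next' is an offset from this node, never an
 * absolute pointer, so the list stays valid at any mapping address. */
struct shr_node
{
	ptrdiff_t	next;	/* 0 terminates the list */
	int		pid;
};

#define R2A(node)	((struct shr_node *)((char *)(node) + (node)->next))	/* relative to absolute */

/* Visit every node on a non-empty pending list. */
static void walk_pending(struct shr_node *head)
{
	struct shr_node	*pr;

	for (pr = head; ; pr = R2A(pr))
	{
		printf("pending pid %d\n", pr->pid);
		if (0 == pr->next)
			break;
	}
}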
Beispiel #18
0
void mu_int_reg(gd_region *reg, boolean_t *return_value)
{
	boolean_t		read_only, was_crit;
	freeze_status		status;
	node_local_ptr_t	cnl;
	sgmnt_addrs     	*csa;
	sgmnt_data_ptr_t	csd;
#	ifdef DEBUG
	boolean_t		need_to_wait = FALSE;
	int			trynum;
	uint4			curr_wbox_seq_num;
#	endif
	sgmnt_data		*csd_copy_ptr;
	gd_segment		*seg;
	int			gtmcrypt_errno;
	*return_value = FALSE;
	UNIX_ONLY(jnlpool_init_needed = TRUE);
	ESTABLISH(mu_int_reg_ch);
	if (dba_usr == reg->dyn.addr->acc_meth)
	{
		util_out_print("!/Can't integ region !AD; not GDS format", TRUE,  REG_LEN_STR(reg));
		mu_int_skipreg_cnt++;
		return;
	}
	gv_cur_region = reg;
	if (reg_cmcheck(reg))
	{
		util_out_print("!/Can't integ region across network", TRUE);
		mu_int_skipreg_cnt++;
		return;
	}
	gvcst_init(gv_cur_region);
	if (gv_cur_region->was_open)
	{	/* already open under another name */
		gv_cur_region->open = FALSE;
		return;
	}
	change_reg();
	csa = &FILE_INFO(gv_cur_region)->s_addrs;
	cnl = csa->nl;
	csd = csa->hdr;
	read_only = gv_cur_region->read_only;
	assert(NULL != mu_int_master);
	/* Ensure that we don't see an increase in the file header and master map sizes compared to their maximum values */
	assert(SGMNT_HDR_LEN >= SIZEOF(sgmnt_data) && (MASTER_MAP_SIZE_MAX >= MASTER_MAP_SIZE(csd)));
	/* ONLINE INTEG if asked for explicitly by specifying -ONLINE is an error if the db has partial V4 blocks.
	 * However, if -ONLINE is not explicitly specified but rather assumed implicitly (as default for -REG)
	 * then turn off ONLINE INTEG for this region and continue as if -NOONLINE was specified
	 */
#	ifdef GTM_SNAPSHOT
	if (!csd->fully_upgraded)
	{
		ointeg_this_reg = FALSE; /* Turn off ONLINE INTEG for this region */
		if (online_specified)
		{
			gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(4) ERR_SSV4NOALLOW, 2, DB_LEN_STR(gv_cur_region));
			util_out_print(NO_ONLINE_ERR_MSG, TRUE);
			mu_int_skipreg_cnt++;
			return;
		}
	}
#	endif
	if (!ointeg_this_reg || read_only)
	{
		status = region_freeze(gv_cur_region, TRUE, FALSE, TRUE);
		switch (status)
		{
			case REG_ALREADY_FROZEN:
				UNIX_ONLY(if (csa->read_only_fs) break);
				util_out_print("!/Database for region !AD is already frozen, not integing",
					TRUE, REG_LEN_STR(gv_cur_region));
				mu_int_skipreg_cnt++;
				return;
			case REG_HAS_KIP:
				/* We have already waited for KIP to reset. This time do not wait for KIP */
				status = region_freeze(gv_cur_region, TRUE, FALSE, FALSE);
				if (REG_ALREADY_FROZEN == status)
				{
					UNIX_ONLY(if (csa->read_only_fs) break);
					util_out_print("!/Database for region !AD is already frozen, not integing",
						TRUE, REG_LEN_STR(gv_cur_region));
					mu_int_skipreg_cnt++;
					return;
				}
				break;
			case REG_FREEZE_SUCCESS:
				break;
			default:
				assert(FALSE);
		}
Beispiel #19
0
uint4 jnl_file_open(gd_region *reg, bool init, void *dummy)	/* third argument for compatibility with VMS version */
{
	sgmnt_addrs		*csa;
	sgmnt_data_ptr_t	csd;
	jnl_private_control	*jpc;
	jnl_buffer_ptr_t     	jb;
	struct stat		stat_buf;
	uint4			sts;
	sm_uc_ptr_t		nameptr;
	int			fstat_res;
	int			close_res;
	boolean_t		retry;
	ZOS_ONLY(int		realfiletag;)

	csa = &FILE_INFO(reg)->s_addrs;
	csd = csa->hdr;
	jpc = csa->jnl;
	jb = jpc->jnl_buff;
	assert(NOJNL == jpc->channel);
	sts = 0;
	jpc->status = jpc->status2 = SS_NORMAL;
	nameptr = csd->jnl_file_name;
	assert('/' == csd->jnl_file_name[0]);
	if (init)
	{
		assert(csd->jnl_file_len < JNL_NAME_SIZE);
		nameptr[csd->jnl_file_len] = 0;
		cre_jnl_file_intrpt_rename(((int)csd->jnl_file_len), csd->jnl_file_name);
		/* Although jnl_file_close() would have reset jnl_file.u.inode and device to 0 and incremented cycle, it
		 * might have got shot in the middle of executing those instructions. We redo it here just to be safe.
Beispiel #20
0
sm_uc_ptr_t t_qread(block_id blk, sm_int_ptr_t cycle, cache_rec_ptr_ptr_t cr_out)
	/* cycle is used in t_end to detect if the buffer has been refreshed since the t_qread */
{
	int4			status;
	uint4			blocking_pid;
	cache_rec_ptr_t		cr;
	bt_rec_ptr_t		bt;
	boolean_t		clustered, hold_onto_crit, was_crit;
	int			dummy, lcnt, ocnt;
	cw_set_element		*cse;
	off_chain		chain1;
	register sgmnt_addrs	*csa;
	register sgmnt_data_ptr_t	csd;
	enum db_ver		ondsk_blkver;
	int4			dummy_errno;
	boolean_t		already_built, is_mm, reset_first_tp_srch_status, set_wc_blocked, sleep_invoked;
	ht_ent_int4		*tabent;
	srch_blk_status		*blkhist;
	trans_num		dirty, blkhdrtn;
	sm_uc_ptr_t		buffaddr;
	uint4			stuck_cnt = 0;
	boolean_t		lcl_blk_free;
	node_local_ptr_t	cnl;

	lcl_blk_free = block_is_free;
	block_is_free = FALSE;	/* Reset to FALSE so that if t_qread fails below, we don't have an incorrect state of this var */
	first_tp_srch_status = NULL;
	reset_first_tp_srch_status = FALSE;
	csa = cs_addrs;
	csd = csa->hdr;
	INCR_DB_CSH_COUNTER(csa, n_t_qreads, 1);
	is_mm = (dba_mm == csd->acc_meth);
	/* We better hold crit in the final retry (TP & non-TP). Only exception is journal recovery */
	assert((t_tries < CDB_STAGNATE) || csa->now_crit || mupip_jnl_recover);
	if (dollar_tlevel)
	{
		assert(sgm_info_ptr);
		if (0 != sgm_info_ptr->cw_set_depth)
		{
			chain1 = *(off_chain *)&blk;
			if (1 == chain1.flag)
			{
				assert(sgm_info_ptr->cw_set_depth);
				if ((int)chain1.cw_index < sgm_info_ptr->cw_set_depth)
					tp_get_cw(sgm_info_ptr->first_cw_set, (int)chain1.cw_index, &cse);
				else
				{
					assert(FALSE == csa->now_crit);
					rdfail_detail = cdb_sc_blknumerr;
					return (sm_uc_ptr_t)NULL;
				}
			} else
			{
				if (NULL != (tabent = lookup_hashtab_int4(sgm_info_ptr->blks_in_use, (uint4 *)&blk)))
					first_tp_srch_status = tabent->value;
				else
					first_tp_srch_status = NULL;
				ASSERT_IS_WITHIN_TP_HIST_ARRAY_BOUNDS(first_tp_srch_status, sgm_info_ptr);
				cse = first_tp_srch_status ? first_tp_srch_status->cse : NULL;
			}
			assert(!cse || !cse->high_tlevel);
			assert(!chain1.flag || cse);
			if (cse)
			{	/* transaction has modified the sought after block  */
				if ((gds_t_committed != cse->mode) || (n_gds_t_op < cse->old_mode))
				{	/* Changes have not been committed to shared memory, i.e. still in private memory.
					 * Build block in private buffer if not already done and return the same.
					 */
					assert(gds_t_writemap != cse->mode);
					if (FALSE == cse->done)
					{	/* out of date, so make it current */
						assert(gds_t_committed != cse->mode);
						already_built = (NULL != cse->new_buff);
						/* Validate the block's search history right after building a private copy.
						 * This is not needed in case gvcst_search is going to reuse the clue's search
						 * history and return (because tp_hist will do the validation of this block).
						 * But if gvcst_search decides to do a fresh traversal (because the clue does not
						 * cover the path of the current input key etc.) the block build that happened now
						 * will not get validated in tp_hist since it will instead be given the current
						 * key's search history path (a totally new path) for validation. Since a private
						 * copy of the block has been built, tp_tend would also skip validating this block
						 * so it is necessary that we validate the block right here. Since it is tricky to
						 * accurately differentiate between the two cases, we do the validation
						 * unconditionally here (besides it is only a few if checks done per block build
						 * so it is considered okay performance-wise).
						 */
						gvcst_blk_build(cse, (uchar_ptr_t)cse->new_buff, 0);
						assert(NULL != cse->blk_target);
						if (!already_built && !chain1.flag)
						{
							buffaddr = first_tp_srch_status->buffaddr;
							cr = first_tp_srch_status->cr;
							assert((is_mm || cr) && buffaddr);
							blkhdrtn = ((blk_hdr_ptr_t)buffaddr)->tn;
							if (TP_IS_CDB_SC_BLKMOD3(cr, first_tp_srch_status, blkhdrtn))
							{
								assert(CDB_STAGNATE > t_tries);
								rdfail_detail = cdb_sc_blkmod;	/* should this be something else */
								TP_TRACE_HIST_MOD(blk, gv_target, tp_blkmod_t_qread, cs_data,
									first_tp_srch_status->tn, blkhdrtn,
									((blk_hdr_ptr_t)buffaddr)->levl);
								return (sm_uc_ptr_t)NULL;
							}
							if (!is_mm && ((first_tp_srch_status->cycle != cr->cycle)
										|| (first_tp_srch_status->blk_num != cr->blk)))
							{
								assert(CDB_STAGNATE > t_tries);
								rdfail_detail = cdb_sc_lostcr; /* should this be something else */
								return (sm_uc_ptr_t)NULL;
							}
						}
						cse->done = TRUE;
					}
					*cycle = CYCLE_PVT_COPY;
					*cr_out = 0;
					return (sm_uc_ptr_t)cse->new_buff;
				} else
				{	/* Block changes are already committed to shared memory (possible if we are in TP
					 * in the 2nd phase of M-Kill in gvcst_expand_free_subtree.c). In this case, read
					 * block from shared memory; do not look at private memory (i.e. cse) as that might
					 * not be as uptodate as shared memory.
					 */
					assert(csa->now_crit);	/* gvcst_expand_free_subtree does t_qread in crit */
					/* If this block was newly created as part of the TP transaction, it should not be killed
					 * as part of the 2nd phase of M-kill. This is because otherwise the block's cse would
					 * have had an old_mode of kill_t_create in which case we would not have come into this
					 * else block. Assert accordingly.
					 */
					assert(!chain1.flag);
					first_tp_srch_status = NULL;	/* do not use any previous srch_hist information */
				}
			}
		} else
		{
			if (NULL != (tabent = lookup_hashtab_int4(sgm_info_ptr->blks_in_use, (uint4 *)&blk)))
				first_tp_srch_status = tabent->value;
			else
				first_tp_srch_status = NULL;
		}
		ASSERT_IS_WITHIN_TP_HIST_ARRAY_BOUNDS(first_tp_srch_status, sgm_info_ptr);
		if (!is_mm && first_tp_srch_status)
		{
			cr = first_tp_srch_status->cr;
			assert(cr && !first_tp_srch_status->cse);
			if (first_tp_srch_status->cycle == cr->cycle)
			{
				*cycle = first_tp_srch_status->cycle;
				*cr_out = cr;
				cr->refer = TRUE;
				if (CDB_STAGNATE <= t_tries)	/* mu_reorg doesn't use TP else should have an || for that */
					CWS_INSERT(blk);
				return (sm_uc_ptr_t)first_tp_srch_status->buffaddr;
			} else
			{	/* Block was already part of the read-set of this transaction, but got recycled in the cache.
				 * Allow block recycling by resetting first_tp_srch_status for this blk to reflect the new
				 * buffer, cycle and cache-record. tp_hist (invoked much later) has validation checks to detect
				 * if block recycling happened within the same mini-action and restart in that case.
				 * Updating first_tp_srch_status has to wait until the end of t_qread since only then do we know
				 * the values to update to. Set a variable that will enable the update before returning.
				 * Also assert that if we are in the final retry, we are never in a situation where we have a
				 * block that got recycled since the start of the current mini-action. This is easily detected since
				 * as part of the final retry we maintain a hash-table "cw_stagnate" that holds the blocks that
				 * have been read as part of the current mini-action until now.
				 */
				assert(CDB_STAGNATE > t_tries || (NULL == lookup_hashtab_int4(&cw_stagnate, (uint4 *)&blk)));
				reset_first_tp_srch_status = TRUE;
			}
		}
	}
	if ((blk >= csa->ti->total_blks) || (blk < 0))
	{	/* requested block out of range; could occur because of a concurrency conflict */
		if ((&FILE_INFO(gv_cur_region)->s_addrs != csa) || (csd != cs_data))
			GTMASSERT;
		assert(FALSE == csa->now_crit);
		rdfail_detail = cdb_sc_blknumerr;
		return (sm_uc_ptr_t)NULL;
	}
	if (is_mm)
	{
		*cycle = CYCLE_SHRD_COPY;
		*cr_out = 0;
		return (sm_uc_ptr_t)(mm_read(blk));
	}
#	ifdef GTM_CRYPT
	/* If database is encrypted, check if encryption initialization went fine for this database. If not,
	 * do not let process proceed as it could now potentially get a peek at the desired data from the
	 * decrypted shared memory global buffers (read in from disk by other processes) without having to go to disk.
	 * If DSE, allow for a special case where it is trying to dump a local bitmap block. In this case, DSE
	 * can continue to run fine (even if encryption initialization failed) since bitmap blocks are unencrypted.
	 */
	if (csa->encrypt_init_status && (!dse_running || !IS_BITMAP_BLK(blk)))
		GC_RTS_ERROR(csa->encrypt_init_status, gv_cur_region->dyn.addr->fname);
#	endif
	assert(dba_bg == csd->acc_meth);
	assert(!first_tp_srch_status || !first_tp_srch_status->cr
					|| first_tp_srch_status->cycle != first_tp_srch_status->cr->cycle);
	if (FALSE == (clustered = csd->clustered))
		bt = NULL;
	was_crit = csa->now_crit;
	ocnt = 0;
	cnl = csa->nl;
	set_wc_blocked = FALSE;	/* to indicate whether cnl->wc_blocked was set to TRUE by us */
	hold_onto_crit = csa->hold_onto_crit;	/* note down in local to avoid csa-> dereference in multiple usages below */
	do
	{
		if (NULL == (cr = db_csh_get(blk)))
		{	/* not in memory */
			if (clustered && (NULL != (bt = bt_get(blk))) && (FALSE == bt->flushing))
				bt = NULL;
			if (!csa->now_crit)
			{
				assert(!hold_onto_crit);
				if (NULL != bt)
				{	/* at this point, bt is not NULL only if clustered and flushing - wait no crit */
					assert(clustered);
					wait_for_block_flush(bt, blk);	/* try for no other node currently writing the block */
				}
				if ((csd->flush_trigger <= cnl->wcs_active_lvl) && (FALSE == gv_cur_region->read_only))
					JNL_ENSURE_OPEN_WCS_WTSTART(csa, gv_cur_region, 0, dummy_errno);
						/* a macro that dclast's "wcs_wtstart" and checks for errors etc. */
				grab_crit(gv_cur_region);
				cr = db_csh_get(blk);			/* in case blk arrived before crit */
			}
			if (clustered && (NULL != (bt = bt_get(blk))) && (TRUE == bt->flushing))
			{	/* Once crit, need to assure that if clustered, that flushing is [still] complete
				 * If it isn't, we missed an entire WM cycle and have to wait for another node to finish */
				wait_for_block_flush(bt, blk);	/* ensure no other node currently writing the block */
			}
			if (NULL == cr)
			{	/* really not in memory - must get a new buffer */
				assert(csa->now_crit);
				cr = db_csh_getn(blk);
				if (CR_NOTVALID == (sm_long_t)cr)
				{
					assert(cnl->wc_blocked); /* only reason we currently know wcs_get_space could fail */
					assert(gtm_white_box_test_case_enabled);
					SET_TRACEABLE_VAR(cnl->wc_blocked, TRUE);
					BG_TRACE_PRO_ANY(csa, wc_blocked_t_qread_db_csh_getn_invalid_blk);
					set_wc_blocked = TRUE;
					break;
				}
				assert(0 <= cr->read_in_progress);
				*cycle = cr->cycle;
				cr->tn = csd->trans_hist.curr_tn;
				/* Record history of most recent disk reads only in dbg builds for now. Although the macro
				 * is just a couple dozen instructions, it is done while holding crit so we want to avoid
				 * delaying crit unless really necessary. Whoever wants this information can enable it
				 * by a build change to remove the DEBUG_ONLY part below.
				 */
				DEBUG_ONLY(DSKREAD_TRACE(csa, GDS_ANY_ABS2REL(csa,cr), cr->tn, process_id, blk, cr->cycle);)
				if (!was_crit && !hold_onto_crit)
					rel_crit(gv_cur_region);
				/* read outside of crit may be of a stale block but should be detected by t_end or tp_tend */
				assert(0 == cr->dirty);
				assert(cr->read_in_progress >= 0);
				CR_BUFFER_CHECK(gv_cur_region, csa, csd, cr);
				if (SS_NORMAL != (status = dsk_read(blk, GDS_REL2ABS(cr->buffaddr), &ondsk_blkver, lcl_blk_free)))
				{	/* buffer does not contain valid data, so reset blk to be empty */
					cr->cycle++;	/* increment cycle for blk number changes (for tp_hist and others) */
					cr->blk = CR_BLKEMPTY;
					cr->r_epid = 0;
					RELEASE_BUFF_READ_LOCK(cr);
					assert(-1 <= cr->read_in_progress);
					assert(was_crit == csa->now_crit);
					if (FUTURE_READ == status)
					{	/* in cluster, block can be in the "future" with respect to the local history */
						assert(TRUE == clustered);
						assert(FALSE == csa->now_crit);
						rdfail_detail = cdb_sc_future_read;	/* t_retry forces the history up to date */
						return (sm_uc_ptr_t)NULL;
					}
					if (ERR_DYNUPGRDFAIL == status)
					{	/* if we don't hold crit on the region, it is possible due to concurrency conflicts
						 * that this block is unused (i.e. marked free/recycled in bitmap, see comments in
					 * gds_blk_upgrade.h). In this case we should not error out but instead restart.
						 */
						if (was_crit)
						{
							assert(FALSE);
							rts_error(VARLSTCNT(5) status, 3, blk, DB_LEN_STR(gv_cur_region));
						} else
						{
							rdfail_detail = cdb_sc_lostcr;
							return (sm_uc_ptr_t)NULL;
						}
					}
					if (-1 == status)
					{
						/* could have been concurrent truncate, and we read a blk >= csa->ti->total_blks */
						/* restart */
						rdfail_detail = cdb_sc_truncate;
						return (sm_uc_ptr_t)NULL;
					} else
						rts_error(VARLSTCNT(5) ERR_DBFILERR, 2, DB_LEN_STR(gv_cur_region), status);
				}
				disk_blk_read = TRUE;
				assert(0 <= cr->read_in_progress);
				assert(0 == cr->dirty);
				/* Only set in cache if read was success */
				cr->ondsk_blkver = (lcl_blk_free ? GDSVCURR : ondsk_blkver);
				cr->r_epid = 0;
				RELEASE_BUFF_READ_LOCK(cr);
				assert(-1 <= cr->read_in_progress);
				*cr_out = cr;
				assert(was_crit == csa->now_crit);
				if (reset_first_tp_srch_status)
				{	/* keep the parentheses for the if (although single line) since the following is a macro */
					RESET_FIRST_TP_SRCH_STATUS(first_tp_srch_status, cr, *cycle);
				}
				return (sm_uc_ptr_t)GDS_REL2ABS(cr->buffaddr);
			} else  if (!was_crit && (BAD_LUCK_ABOUNDS > ocnt))
			{
				assert(!hold_onto_crit);
				assert(TRUE == csa->now_crit);
				assert(cnl->in_crit == process_id);
				rel_crit(gv_cur_region);
			}
		}
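The fragment above hinges on each cache record carrying a cycle counter that is bumped whenever the buffer is reassigned to a different block; a reader notes the counter when it picks up the buffer and revalidates it later (as in the first_tp_srch_status checks). Below is a minimal, self-contained sketch of that revalidation idea; the structure and field names are illustrative only, not the actual GT.M cache layout.

#include <stdbool.h>
#include <stdio.h>

typedef struct {			/* illustrative stand-in for a cache record */
	int		blk;		/* block number the buffer currently holds */
	unsigned int	cycle;		/* bumped every time blk changes */
} cache_rec;

typedef struct {			/* what a reader remembers about its read */
	int		blk;
	unsigned int	cycle;
	cache_rec	*cr;
} read_snapshot;

/* TRUE only if the cache record still holds the same incarnation of the block
 * that the reader saw, i.e. neither the block number nor the cycle moved. */
static bool still_valid(const read_snapshot *snap)
{
	return (snap->cr->blk == snap->blk) && (snap->cr->cycle == snap->cycle);
}

int main(void)
{
	cache_rec	cr = { 100, 7 };
	read_snapshot	snap = { 100, 7, &cr };

	printf("before recycle: %s\n", still_valid(&snap) ? "valid" : "restart");
	cr.blk = 205;		/* buffer reused for another block ... */
	cr.cycle++;		/* ... so its cycle is bumped */
	printf("after recycle:  %s\n", still_valid(&snap) ? "valid" : "restart");
	return 0;
}
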
int gtmsource_checkhealth(void)
{
	uint4			gtmsource_pid;
	int			status, semval, save_errno;
	boolean_t		srv_alive, all_files_open;
	gtmsource_local_ptr_t	gtmsourcelocal_ptr;
	int4			index, num_servers;
	seq_num			reg_seqno, jnlseqno;
	gd_region		*reg, *region_top;
	sgmnt_addrs		*csa;
	sgmnt_data_ptr_t	csd;
	char			errtxt[OUT_BUFF_SIZE];
	char			*modestr;

	assert(holds_sem[SOURCE][JNL_POOL_ACCESS_SEM]);
	if (NULL != jnlpool.gtmsource_local)	/* Check health of a specific source server */
		gtmsourcelocal_ptr = jnlpool.gtmsource_local;
	else
		gtmsourcelocal_ptr = &jnlpool.gtmsource_local_array[0];
	num_servers = 0;
	status = SRV_ALIVE;
	for (index = 0; index < NUM_GTMSRC_LCL; index++, gtmsourcelocal_ptr++)
	{
		if ('\0' == gtmsourcelocal_ptr->secondary_instname[0])
		{
			assert(NULL == jnlpool.gtmsource_local);
			continue;
		}
		gtmsource_pid = gtmsourcelocal_ptr->gtmsource_pid;
		/* If CHECKHEALTH on a specific secondary instance is requested, print the health information irrespective
		 * of whether a source server for that instance is alive or not. For CHECKHEALTH on ALL secondary instances
		 * print health information only for those instances that have an active or passive source server alive.
		 */
		if ((NULL == jnlpool.gtmsource_local) && (0 == gtmsource_pid))
			continue;
		repl_log(stdout, TRUE, TRUE, "Initiating CHECKHEALTH operation on source server pid [%d] for secondary instance"
			" name [%s]\n", gtmsource_pid, gtmsourcelocal_ptr->secondary_instname);
		srv_alive = (0 == gtmsource_pid) ? FALSE : is_proc_alive(gtmsource_pid, 0);
		if (srv_alive)
		{
			if (GTMSOURCE_MODE_ACTIVE == gtmsourcelocal_ptr->mode)
				modestr = "ACTIVE";
			else if (GTMSOURCE_MODE_ACTIVE_REQUESTED == gtmsourcelocal_ptr->mode)
				modestr = "ACTIVE REQUESTED";
			else if (GTMSOURCE_MODE_PASSIVE == gtmsourcelocal_ptr->mode)
				modestr = "PASSIVE";
			else if (GTMSOURCE_MODE_PASSIVE_REQUESTED == gtmsourcelocal_ptr->mode)
				modestr = "PASSIVE REQUESTED";
			else
			{
				assert(gtmsourcelocal_ptr->mode != gtmsourcelocal_ptr->mode);
				modestr = "UNKNOWN";
			}
			repl_log(stderr, FALSE, TRUE, FORMAT_STR1, gtmsource_pid, "Source server", "", modestr);
			status |= SRV_ALIVE;
			num_servers++;
		} else
		{
			repl_log(stderr, FALSE, TRUE, FORMAT_STR, gtmsource_pid, "Source server", " NOT");
			gtm_putmsg_csa(CSA_ARG(NULL) VARLSTCNT(4) ERR_SRCSRVNOTEXIST, 2,
					LEN_AND_STR(gtmsourcelocal_ptr->secondary_instname));
			status |= SRV_DEAD;
		}
		if (NULL != jnlpool.gtmsource_local)
			break;
	}
	if (NULL == jnlpool.gtmsource_local)
	{	/* Compare number of servers that were found alive with the current value of the COUNT semaphore.
		 * If they are not equal, report the discrepancy.
		 */
		semval = get_sem_info(SOURCE, SRC_SERV_COUNT_SEM, SEM_INFO_VAL);
		if (-1 == semval)
		{
			save_errno = errno;
			repl_log(stderr, FALSE, TRUE,
				"Error fetching source server count semaphore value : %s\n", STRERROR(save_errno));
			status |= SRV_ERR;
		} else if (semval != num_servers)
		{
			repl_log(stderr, FALSE, FALSE,
				"Error : Expected %d source server(s) to be alive but found %d actually alive\n",
				semval, num_servers);
			repl_log(stderr, FALSE, TRUE, "Error : Check if any pid reported above is NOT a source server process\n");
			status |= SRV_ERR;
		}
	}
	/* Check that there are no regions with replication state = WAS_ON (i.e. repl_was_open). If so report that.
	 * But to determine that, we need to attach to all the database regions.
	 */
	gvinit();
	/* We use the same code dse uses to open all regions but we must make sure they are all open before proceeding. */
	all_files_open = region_init(FALSE);
	if (!all_files_open)
	{
		gtm_putmsg_csa(CSA_ARG(NULL) VARLSTCNT(1) ERR_NOTALLDBOPN);
		status |= SRV_ERR;
	} else
	{
		for (reg = gd_header->regions, region_top = gd_header->regions + gd_header->n_regions; reg < region_top; reg++)
		{
			csa = &FILE_INFO(reg)->s_addrs;
			csd = csa->hdr;
			if (REPL_WAS_ENABLED(csd))
			{
				assert(!JNL_ENABLED(csd) || REPL_ENABLED(csd));	/* || is for turning replication on concurrently */
				reg_seqno = csd->reg_seqno;
				jnlseqno = (NULL != jnlpool.jnlpool_ctl) ? jnlpool.jnlpool_ctl->jnl_seqno : MAX_SEQNO;
				sgtm_putmsg(errtxt, VARLSTCNT(8) ERR_REPLJNLCLOSED, 6, DB_LEN_STR(reg),
					&reg_seqno, &reg_seqno, &jnlseqno, &jnlseqno);
				repl_log(stderr, FALSE, TRUE, errtxt);
				status |= SRV_ERR;
			}
		}
	}
	if (jnlpool.jnlpool_ctl->freeze)
	{
		repl_log(stderr, FALSE, FALSE, "Warning: Instance Freeze is ON\n");
		repl_log(stderr, FALSE, TRUE, "   Freeze Comment: %s\n", jnlpool.jnlpool_ctl->freeze_comment);
		status |= SRV_ERR;
	}
	return (status + NORMAL_SHUTDOWN);
}
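gtmsource_checkhealth folds per-server conditions into a bit mask and adds NORMAL_SHUTDOWN to form its return value. The sketch below shows how a caller might unpack such a composite code; the numeric values given here to NORMAL_SHUTDOWN and the SRV_* flags are placeholders, not the definitions from the GT.M replication headers.

#include <stdio.h>

/* Illustrative values only; the real constants come from the GT.M headers. */
#define NORMAL_SHUTDOWN	0
#define SRV_ALIVE	0x1
#define SRV_DEAD	0x2
#define SRV_ERR		0x4

static void report_checkhealth(int retval)
{
	int	status = retval - NORMAL_SHUTDOWN;	/* undo the additive offset */

	if (status & SRV_DEAD)
		printf("at least one configured source server is not running\n");
	if (status & SRV_ERR)
		printf("health check reported errors (semaphores, regions or freeze)\n");
	if (!(status & (SRV_DEAD | SRV_ERR)))
		printf("all checked source servers look healthy\n");
}

int main(void)
{
	report_checkhealth(NORMAL_SHUTDOWN + (SRV_ALIVE | SRV_ERR));
	return 0;
}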
Example #22
0
void ccp_reqwm_interrupt(ccp_db_header **pdb)
{
	ccp_db_header	*db;
	sgmnt_addrs	*csa;
	uint4	status;


	assert(lib$ast_in_prog());

	db = *pdb;

	csa = db->segment;
	if (csa == NULL  ||  csa->nl->ccp_state == CCST_CLOSED)
		return;

	switch (db->wm_iosb.cond)
	{
	case SS$_DEADLOCK:
		ccp_signal_cont(SS$_DEADLOCK);
		/* Just try again */
		ccp_request_write_mode(db);
		return;

	case SS$_CANCEL:
		/* Lock cancelled by close */
		return;

	case SS$_VALNOTVALID:
		/* Force reads from disk */
		db->wm_iosb.valblk[CCP_VALBLK_TRANS_HIST] = 0;
		db->last_lk_sequence = db->master_map_start_tn
				     = 0;
		/* Drop through ... */

	case SS$_NORMAL:
		if (db->wm_iosb.valblk[CCP_VALBLK_TRANS_HIST] == csa->ti->curr_tn + csa->ti->lock_sequence)
		{
			/* No change to current tn, do not need to update header */
			if (csa->now_crit)
			{
				assert (csa->nl->in_crit == process_id);
				csa->nl->in_crit = 0;
				(void)mutex_unlockw(csa->critical, csa->critical->crashcnt, &csa->now_crit);
				/***** Check error status here? *****/
			}
			ccp_writedb5(db);
		}
		else
		{
			if (csa->nl->in_crit == 0)
			{
				if (mutex_lockwim(csa->critical, csa->critical->crashcnt, &csa->now_crit) == cdb_sc_normal)
					csa->nl->in_crit = process_id;		/* now_crit was set by mutex_lockwim */
				else
					if (csa->nl->in_crit == 0)		/***** Why is this re-tested? *****/
					{
						status = sys$setimr(0, delta_100_msec, ccp_reqwm_interrupt, &db->wmcrit_timer_id,
								    0);
						if (status != SS$_NORMAL)
							ccp_signal_cont(status);	/***** Is this reasonable? *****/
						return;
					}
			}
			status = sys$qio(0, FILE_INFO(db->greg)->fab->fab$l_stv, IO$_READVBLK, &db->qio_iosb, ccp_writedb2, db,
					 &db->glob_sec->trans_hist, BT_SIZE(csa->hdr) + SIZEOF(th_index), TH_BLOCK, 0, 0, 0);
			if (status != SS$_NORMAL)
				ccp_signal_cont(status);	/***** Is this reasonable? *****/
		}
		return;

	default:
		ccp_signal_cont(db->wm_iosb.cond);		/***** Is this reasonable? *****/
		return;
	}
}
Example #23
0
error_def(ERR_DBCCERR);
error_def(ERR_DBFLCORRP);

void	grab_crit(gd_region *reg)
{
	unix_db_info		*udi;
	sgmnt_addrs		*csa;
	node_local_ptr_t        cnl;
	sgmnt_data_ptr_t	csd;
	enum cdb_sc		status;
	mutex_spin_parms_ptr_t	mutex_spin_parms;
	DEBUG_ONLY(sgmnt_addrs	*jnlpool_csa;)
	DCL_THREADGBL_ACCESS;

	SETUP_THREADGBL_ACCESS;
	udi = FILE_INFO(reg);
	csa = &udi->s_addrs;
	csd = csa->hdr;
	cnl = csa->nl;
#	ifdef DEBUG
	if (gtm_white_box_test_case_enabled
		&& (WBTEST_SENDTO_EPERM == gtm_white_box_test_case_number)
		&& (0 == cnl->wbox_test_seq_num))
	{
		FPRINTF(stderr, "MUPIP BACKUP entered grab_crit\n");
		cnl->wbox_test_seq_num = 1;
		while (2 != cnl->wbox_test_seq_num)
			LONG_SLEEP(1);
		FPRINTF(stderr, "MUPIP BACKUP resumed in grab_crit\n");
		cnl->wbox_test_seq_num = 3;
	}
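The WBTEST_SENDTO_EPERM block above is a white-box test handshake: the process announces it reached a sync point by writing 1 into a shared sequence number, waits until the test driver advances it to 2, then writes 3 so the driver knows it has moved on. Below is a toy two-thread rendering of that rendezvous, using a C11 atomic in place of the shared-memory field; all names are illustrative (compile with -pthread).

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <unistd.h>

/* Stand-in for the shared-memory field cnl->wbox_test_seq_num */
static atomic_int seq_num = 0;

static void *worker(void *arg)		/* plays the process that entered grab_crit */
{
	(void)arg;
	fprintf(stderr, "worker reached the sync point\n");
	atomic_store(&seq_num, 1);		/* "I am here" */
	while (2 != atomic_load(&seq_num))	/* wait for the test driver */
		usleep(1000);
	fprintf(stderr, "worker resumed\n");
	atomic_store(&seq_num, 3);		/* "I have moved on" */
	return NULL;
}

int main(void)				/* plays the test driver */
{
	pthread_t	tid;

	pthread_create(&tid, NULL, worker, NULL);
	while (1 != atomic_load(&seq_num))	/* wait for the worker to arrive */
		usleep(1000);
	/* ... the driver would run its concurrent scenario here ... */
	atomic_store(&seq_num, 2);		/* let the worker continue */
	pthread_join(tid, NULL);
	printf("final seq_num = %d\n", atomic_load(&seq_num));
	return 0;
}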
Example #24
0
bt_rec_ptr_t bt_put(gd_region *reg, int4 block)
{
	bt_rec_ptr_t		bt, q0, q1, hdr;
	sgmnt_addrs		*csa;
	sgmnt_data_ptr_t	csd;
	cache_rec_ptr_t		cr;
	th_rec_ptr_t		th;
	trans_num		lcl_tn;
	uint4			lcnt;

	csa = (sgmnt_addrs *)&FILE_INFO(reg)->s_addrs;
	csd = csa->hdr;
	assert(csa->now_crit || csd->clustered);
	assert(dba_mm != csa->hdr->acc_meth);
	lcl_tn = csa->ti->curr_tn;
	hdr = csa->bt_header + (block % csd->bt_buckets);
	assert(BT_QUEHEAD == hdr->blk);
	for (lcnt = 0, bt = (bt_rec_ptr_t)((sm_uc_ptr_t)hdr + hdr->blkque.fl);  ;
		bt = (bt_rec_ptr_t)((sm_uc_ptr_t)bt + bt->blkque.fl), lcnt++)
	{
		if (BT_QUEHEAD == bt->blk)
		{	/* there is no matching bt */
			assert(bt == hdr);
			bt = (bt_rec_ptr_t)((sm_uc_ptr_t)(csa->th_base) + csa->th_base->tnque.fl - SIZEOF(th->tnque));
			if (CR_NOTVALID != bt->cache_index)
			{	/* the oldest bt is still valid */
				assert(!in_wcs_recover);
				cr = (cache_rec_ptr_t)GDS_ANY_REL2ABS(csa, bt->cache_index);
				if (cr->dirty)
				{	/* get it written so it can be reused */
					BG_TRACE_PRO_ANY(csa, bt_put_flush_dirty);
					if (FALSE == wcs_get_space(reg, 0, cr))
					{
						assert(csa->nl->wc_blocked);	/* only reason we currently know
										 * why wcs_get_space could fail */
						assert(gtm_white_box_test_case_enabled);
						BG_TRACE_PRO_ANY(csa, wcb_bt_put);
						send_msg(VARLSTCNT(8) ERR_WCBLOCKED, 6, LEN_AND_LIT("wcb_bt_put"),
							process_id, &lcl_tn, DB_LEN_STR(reg));
						return NULL;
					}
				}
				bt->cache_index = CR_NOTVALID;
				cr->bt_index = 0;
			}
			q0 = (bt_rec_ptr_t)((sm_uc_ptr_t)bt + bt->blkque.fl);
			q1 = (bt_rec_ptr_t)remqt((que_ent_ptr_t)q0);
			if (EMPTY_QUEUE == (sm_long_t)q1)
				rts_error(VARLSTCNT(3) ERR_BTFAIL, 1, 1);
			bt->blk = block;
			bt->killtn = lcl_tn;
			insqt((que_ent_ptr_t)bt, (que_ent_ptr_t)hdr);
			th = (th_rec_ptr_t)remqh((que_ent_ptr_t)csa->th_base);
			if (EMPTY_QUEUE == (sm_long_t)th)
				GTMASSERT;
			break;
		}
		if (bt->blk == block)
		{	/* bt_put should never be called twice for the same block with the same lcl_tn. This is because
			 * t_end/tp_tend update every block only once as part of each update transaction. Assert this.
			 * The two exceptions are
			 *   a) Forward journal recovery which simulates a 2-phase M-kill where the same block
			 *	could get updated in both phases (example bitmap block gets updated for blocks created
			 *	within the TP transaction as well as for blocks that are freed up in the 2nd phase of
			 *	the M-kill) with the same transaction number. This is because although GT.M would have
			 *	updated the same block with different transaction numbers in the two phases, forward
			 *	recovery will update it with the same tn and instead increment the db tn on seeing the
			 *	following INCTN journal record(s).
			 *   b) Cache recovery (wcs_recover). It could call bt_put more than once for the same block
			 *	and potentially with the same tn. This is because the state of the queues is questionable
			 *	and there could be more than one cache record for a given block number.
			 */
			assert(in_wcs_recover || (bt->tn < lcl_tn) || (jgbl.forw_phase_recovery && !JNL_ENABLED(csa)));
			q0 = (bt_rec_ptr_t)((sm_uc_ptr_t)bt + bt->tnque.fl);
			th = (th_rec_ptr_t)remqt((que_ent_ptr_t)((sm_uc_ptr_t)q0 + SIZEOF(th->tnque)));
			if (EMPTY_QUEUE == (sm_long_t)th)
				GTMASSERT;
			break;
		}
		if (0 == bt->blkque.fl)
			rts_error(VARLSTCNT(3) ERR_BTFAIL, 1, 2);
		if (lcnt >= csd->n_bts)
			rts_error(VARLSTCNT(3) ERR_BTFAIL, 1, 3);
	}
	insqt((que_ent_ptr_t)th, (que_ent_ptr_t)csa->th_base);
	bt->tn = lcl_tn;
	return bt;
}
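bt_put locates a block's transaction record by hashing the block number into one of csd->bt_buckets queue heads and then walking that queue, bounding the walk with the lcnt check so a corrupted chain is reported (ERR_BTFAIL) rather than looped over forever. Here is a much-simplified, user-space rendering of that lookup pattern with ordinary pointers instead of shared-memory relative queues; every name in it is illustrative.

#include <stdio.h>
#include <stdlib.h>

#define N_BUCKETS	8
#define MAX_RECS	64	/* traversal guard, like csd->n_bts in the example */

typedef struct rec {
	int		blk;
	struct rec	*next;	/* singly linked per-bucket chain */
} rec;

static rec *buckets[N_BUCKETS];

/* Find the record for "blk", following at most MAX_RECS links so a broken
 * chain cannot spin forever (analogous to the ERR_BTFAIL checks above). */
static rec *find_rec(int blk)
{
	int	lcnt = 0;

	for (rec *r = buckets[blk % N_BUCKETS]; NULL != r; r = r->next)
	{
		if (r->blk == blk)
			return r;
		if (++lcnt >= MAX_RECS)
		{
			fprintf(stderr, "bucket chain too long: treating as corrupt\n");
			return NULL;
		}
	}
	return NULL;	/* not present: the caller would allocate or reuse one here */
}

int main(void)
{
	rec	a = { 17, NULL };

	buckets[17 % N_BUCKETS] = &a;
	printf("found blk 17: %s\n", find_rec(17) ? "yes" : "no");
	return 0;
}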
/*
 * Description:
 * 	Grab ftok semaphore on replication instance file
 *	Grab all replication semaphores for the instance (both jnlpool and recvpool)
 * 	Release ftok semaphore
 * Parameters:
 * Return Value: TRUE, if successful
 *	         FALSE, if it fails.
 */
boolean_t mu_replpool_grab_sem(boolean_t immediate)
{
	char			instfilename[MAX_FN_LEN + 1];
	gd_region		*r_save;
	static gd_region 	*replreg;
	int			status, save_errno;
	union semun		semarg;
	struct semid_ds		semstat;
	repl_inst_hdr		repl_instance;
	unix_db_info		*udi;
	unsigned int		full_len;

	error_def(ERR_RECVPOOLSETUP);
	error_def(ERR_JNLPOOLSETUP);
	error_def(ERR_REPLFTOKSEM);
	error_def(ERR_TEXT);

	if (NULL == replreg)
	{
		r_save = gv_cur_region;
		mu_gv_cur_reg_init();
		replreg = gv_cur_region;
		gv_cur_region = r_save;
	}
	jnlpool.jnlpool_dummy_reg = replreg;
	recvpool.recvpool_dummy_reg = replreg;
	if (!repl_inst_get_name(instfilename, &full_len, MAX_FN_LEN + 1, issue_rts_error))
		GTMASSERT;	/* rts_error should have been issued by repl_inst_get_name */
	assert(full_len);
	memcpy((char *)replreg->dyn.addr->fname, instfilename, full_len);
	replreg->dyn.addr->fname_len = full_len;
	udi = FILE_INFO(replreg);
	udi->fn = (char *)replreg->dyn.addr->fname;
	if (!ftok_sem_get(replreg, TRUE, REPLPOOL_ID, immediate))
		rts_error(VARLSTCNT(4) ERR_REPLFTOKSEM, 2, full_len, instfilename);
	repl_inst_read(instfilename, (off_t)0, (sm_uc_ptr_t)&repl_instance, SIZEOF(repl_inst_hdr));
	/*
	 * --------------------------
	 * First semaphores of jnlpool
	 * --------------------------
	 */
	if (-1 == (udi->semid = init_sem_set_source(IPC_PRIVATE, NUM_SRC_SEMS, RWDALL | IPC_CREAT)))
	{
		ftok_sem_release(replreg, TRUE, TRUE);
		rts_error(VARLSTCNT(7) ERR_JNLPOOLSETUP, 0, ERR_TEXT, 2,
			  RTS_ERROR_LITERAL("Error creating journal pool"), REPL_SEM_ERRNO);
	}
	semarg.val = GTM_ID;
	if (-1 == semctl(udi->semid, SOURCE_ID_SEM, SETVAL, semarg))
	{
		save_errno = errno;
		remove_sem_set(SOURCE);		/* Remove what we created */
		ftok_sem_release(replreg, TRUE, TRUE);
		rts_error(VARLSTCNT(7) ERR_JNLPOOLSETUP, 0, ERR_TEXT, 2,
			 RTS_ERROR_LITERAL("Error with jnlpool semctl"), save_errno);
	}
	semarg.buf = &semstat;
	if (-1 == semctl(udi->semid, 0, IPC_STAT, semarg))
	{
		save_errno = errno;
		remove_sem_set(SOURCE);		/* Remove what we created */
		ftok_sem_release(replreg, TRUE, TRUE);
		rts_error(VARLSTCNT(7) ERR_JNLPOOLSETUP, 0, ERR_TEXT, 2,
			 RTS_ERROR_LITERAL("Error with jnlpool semctl"), save_errno);
	}
	udi->gt_sem_ctime = semarg.buf->sem_ctime;
	status = grab_sem_all_source();
	if (0 != status)
	{
		remove_sem_set(SOURCE);		/* Remove what we created */
		ftok_sem_release(replreg, TRUE, TRUE);
		rts_error(VARLSTCNT(1) ERR_JNLPOOLSETUP);
	}
	repl_instance.jnlpool_semid = udi->semid;
	repl_instance.jnlpool_semid_ctime = udi->gt_sem_ctime;
	/*
	 * --------------------------
	 * Now semaphores of recvpool
	 * --------------------------
	 */
	assert(NUM_SRC_SEMS == NUM_RECV_SEMS);
	if (-1 == (udi->semid = init_sem_set_recvr(IPC_PRIVATE, NUM_RECV_SEMS, RWDALL | IPC_CREAT)))
	{
		remove_sem_set(SOURCE);
		ftok_sem_release(replreg, TRUE, TRUE);
		rts_error(VARLSTCNT(7) ERR_RECVPOOLSETUP, 0,
			  ERR_TEXT, 2,
			  RTS_ERROR_LITERAL("Error creating recv pool"), REPL_SEM_ERRNO);
	}
	semarg.val = GTM_ID;
	if (-1 == semctl(udi->semid, RECV_ID_SEM, SETVAL, semarg))
	{
		save_errno = errno;
		remove_sem_set(SOURCE);
		remove_sem_set(RECV);
		ftok_sem_release(replreg, TRUE, TRUE);
		rts_error(VARLSTCNT(7) ERR_RECVPOOLSETUP, 0, ERR_TEXT, 2,
			 RTS_ERROR_LITERAL("Error with recvpool semctl"), save_errno);
	}
	semarg.buf = &semstat;
	if (-1 == semctl(udi->semid, 0, IPC_STAT, semarg)) /* For creation time */
	{
		save_errno = errno;
		remove_sem_set(SOURCE);
		remove_sem_set(RECV);
		ftok_sem_release(replreg, TRUE, TRUE);
		rts_error(VARLSTCNT(7) ERR_RECVPOOLSETUP, 0, ERR_TEXT, 2,
			 RTS_ERROR_LITERAL("Error with recvpool semctl"), save_errno);
	}
	udi->gt_sem_ctime = semarg.buf->sem_ctime;
	status = grab_sem_all_receive();
	if (0 != status)
	{
		remove_sem_set(SOURCE);
		remove_sem_set(RECV);
		ftok_sem_release(replreg, TRUE, TRUE);
		rts_error(VARLSTCNT(1) ERR_RECVPOOLSETUP);
	}
	repl_instance.recvpool_semid = udi->semid;
	repl_instance.recvpool_semid_ctime = udi->gt_sem_ctime;
	/* Initialize jnlpool.repl_inst_filehdr as it is used later by gtmrecv_fetchresync() */
	assert(NULL == jnlpool.repl_inst_filehdr);
	jnlpool.repl_inst_filehdr = (repl_inst_hdr_ptr_t)malloc(SIZEOF(repl_inst_hdr));
	memcpy(jnlpool.repl_inst_filehdr, &repl_instance, SIZEOF(repl_inst_hdr));
	/* Flush changes to the replication instance file header to disk */
	repl_inst_write(instfilename, (off_t)0, (sm_uc_ptr_t)&repl_instance, SIZEOF(repl_inst_hdr));
	/* Now release jnlpool/recvpool ftok semaphore */
	if (!ftok_sem_release(replreg, FALSE, immediate))
	{
		remove_sem_set(SOURCE);
		remove_sem_set(RECV);
		rts_error(VARLSTCNT(4) ERR_REPLFTOKSEM, 2, full_len, instfilename);
	}
	return TRUE;
}
Example #26
0
OS_PAGE_SIZE_DECLARE

uint4	 gdsfilext(uint4 blocks, uint4 filesize, boolean_t trans_in_prog)
{
	sm_uc_ptr_t		old_base[2], mmap_retaddr;
	boolean_t		was_crit, is_mm;
	char			buff[DISK_BLOCK_SIZE];
	int			result, save_errno, status;
	uint4			new_bit_maps, bplmap, map, new_blocks, new_total, max_tot_blks, old_total;
	uint4			jnl_status, to_wait, to_msg, wait_period;
	gtm_uint64_t		avail_blocks, mmap_sz;
	off_t			new_eof;
	trans_num		curr_tn;
	unix_db_info		*udi;
	inctn_opcode_t		save_inctn_opcode;
	int4			prev_extend_blks_to_upgrd;
	jnl_private_control	*jpc;
	jnl_buffer_ptr_t	jbp;
	cache_rec_ptr_t         cr;
	DCL_THREADGBL_ACCESS;

	assert(!IS_DSE_IMAGE);
	assert((cs_addrs->nl == NULL) || (process_id != cs_addrs->nl->trunc_pid)); /* mu_truncate shouldn't extend file... */
	assert(!process_exiting);
	DEBUG_ONLY(old_base[0] = old_base[1] = NULL);
	assert(!gv_cur_region->read_only);
	udi = FILE_INFO(gv_cur_region);
	is_mm = (dba_mm == cs_addrs->hdr->acc_meth);
#	if !defined(MM_FILE_EXT_OK)
	if (!udi->grabbed_access_sem && is_mm)
		return (uint4)(NO_FREE_SPACE); /* should this be changed to show extension not allowed ? */
#	endif
	/* Both blocks and total blocks are unsigned ints so make sure we aren't asking for huge numbers that will
	   overflow and end up doing silly things.
	*/
	assert((blocks <= (MAXTOTALBLKS(cs_data) - cs_data->trans_hist.total_blks)) || WBTEST_ENABLED(WBTEST_FILE_EXTEND_ERROR));
	if (!blocks)
		return (uint4)(NO_FREE_SPACE); /* should this be changed to show extension not enabled ? */
	bplmap = cs_data->bplmap;
	/* New total of non-bitmap blocks will be number of current, non-bitmap blocks, plus new blocks desired
	 * There are (bplmap - 1) non-bitmap blocks per bitmap, so add (bplmap - 2) to number of non-bitmap blocks
	 *      and divide by (bplmap - 1) to get total number of bitmaps for expanded database. (must round up in this
	 *      manner as every non-bitmap block must have an associated bitmap)
	 * Current number of bitmaps is (total number of current blocks + bplmap - 1) / bplmap.
	 * Subtract current number of bitmaps from number needed for expanded database to get number of new bitmaps needed.
	 */
	new_bit_maps = DIVIDE_ROUND_UP(cs_data->trans_hist.total_blks
			- DIVIDE_ROUND_UP(cs_data->trans_hist.total_blks, bplmap) + blocks, bplmap - 1)
			- DIVIDE_ROUND_UP(cs_data->trans_hist.total_blks, bplmap);
	new_blocks = blocks + new_bit_maps;
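	/* Worked example of the arithmetic above (illustrative numbers only, not from any particular database):
	 * with bplmap = 512, a current total_blks of 1000 and a request for blocks = 2000,
	 *	current bitmaps          = DIVIDE_ROUND_UP(1000, 512)           = 2
	 *	current non-bitmap blocks = 1000 - 2                            = 998
	 *	bitmaps after extension  = DIVIDE_ROUND_UP(998 + 2000, 511)     = 6
	 *	new_bit_maps             = 6 - 2                                = 4
	 *	new_blocks               = 2000 + 4                             = 2004
	 */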
	assert(0 < (int)new_blocks);
	if (new_blocks + cs_data->trans_hist.total_blks > MAXTOTALBLKS(cs_data))
	{
		assert(FALSE);
		send_msg_csa(CSA_ARG(cs_addrs) VARLSTCNT(1) ERR_TOTALBLKMAX);
		return (uint4)(NO_FREE_SPACE);
	}
	if (0 != (save_errno = disk_block_available(udi->fd, &avail_blocks, FALSE)))
	{
		send_msg_csa(CSA_ARG(cs_addrs) VARLSTCNT(5) ERR_DBFILERR, 2, DB_LEN_STR(gv_cur_region), save_errno);
		rts_error_csa(CSA_ARG(cs_addrs) VARLSTCNT(5) ERR_DBFILERR, 2, DB_LEN_STR(gv_cur_region), save_errno);
	} else
	{
		if (!(gtmDebugLevel & GDL_IgnoreAvailSpace))
		{	/* Bypass this space check if the debug flag above is on. Allows us to create a large sparse DB
			 * in space it could never fit in if it weren't sparse. Needed for some tests.
			 */
			avail_blocks = avail_blocks / (cs_data->blk_size / DISK_BLOCK_SIZE);
			if ((blocks * EXTEND_WARNING_FACTOR) > avail_blocks)
			{
				if (blocks > (uint4)avail_blocks)
				{
					SETUP_THREADGBL_ACCESS;
					if (!INST_FREEZE_ON_NOSPC_ENABLED(cs_addrs))
						return (uint4)(NO_FREE_SPACE);
					else
						send_msg_csa(CSA_ARG(cs_addrs) VARLSTCNT(6) MAKE_MSG_WARNING(ERR_NOSPACEEXT), 4,
							DB_LEN_STR(gv_cur_region), new_blocks, (uint4)avail_blocks);
				} else
					send_msg_csa(CSA_ARG(cs_addrs) VARLSTCNT(5) ERR_DSKSPACEFLOW, 3, DB_LEN_STR(gv_cur_region),
						 (uint4)(avail_blocks - ((new_blocks <= avail_blocks) ? new_blocks : 0)));
			}
		}
	}
	/* From here on, we need to use GDSFILEXT_CLNUP before returning to the caller */
	was_crit = cs_addrs->now_crit;
	assert(!cs_addrs->hold_onto_crit || was_crit);
	/* If we are coming from mupip_extend (which gets crit itself) we better have waited for any unfreezes to occur.
	 * If we are coming from online rollback (when that feature is available), we will come in holding crit and in
	 * 	the final retry. In that case too, we expect to have waited for unfreezes to occur in the caller itself.
	 * Therefore if we are coming in holding crit from MUPIP, we expect the db to be unfrozen so no need to wait for
	 * freeze.
	 * If we are coming from GT.M and final retry (in which case we come in holding crit) we expect to have waited
	 * 	for any unfreezes (by invoking tp_crit_all_regions) to occur (TP or non-TP) before coming into this
	 *	function. However, there is one exception. In the final retry, if tp_crit_all_regions notices that
	 *	at least one of the participating regions did ONLY READs, it will not wait for any freeze on THAT region
	 *	to complete before grabbing crit. Later, in the final retry, if THAT region did an update which caused
	 *	op_tcommit to invoke bm_getfree->gdsfilext, then we would have come here with a frozen region on which
	 *	we hold crit.
	 */
	assert(!was_crit || !cs_data->freeze || (dollar_tlevel && (CDB_STAGNATE <= t_tries)));
	/*
	 * If we are in the final retry and already hold crit, it is possible that csa->nl->wc_blocked is also set to TRUE
	 * (by a concurrent process in phase2 which encountered an error in the midst of commit and secshr_db_clnup
	 * finished the job for it). In this case we do NOT want to invoke wcs_recover as that will update the "bt"
	 * transaction numbers without correspondingly updating the history transaction numbers (effectively causing
	 * a cdb_sc_blkmod type of restart). Therefore do NOT call grab_crit (which unconditionally invokes wcs_recover)
	 * if we already hold crit.
	 */
	if (!was_crit)
	{
		for ( ; ; )
		{
			grab_crit(gv_cur_region);
			if (!cs_data->freeze && !IS_REPL_INST_FROZEN)
				break;
			rel_crit(gv_cur_region);
			while (cs_data->freeze || IS_REPL_INST_FROZEN)
				hiber_start(1000);
		}
	} else if (cs_data->freeze && dollar_tlevel)
	{	/* We don't want to continue with file extension as explained above. Hence return with an error code which
		 * op_tcommit will recognize (as a cdb_sc_needcrit/cdb_sc_instancefreeze type of restart) and restart accordingly.
		 */
		assert(CDB_STAGNATE <= t_tries);
		GDSFILEXT_CLNUP;
		return (uint4)FINAL_RETRY_FREEZE_PROG;
	}
	if (IS_REPL_INST_FROZEN && trans_in_prog)
	{
		assert(CDB_STAGNATE <= t_tries);
		GDSFILEXT_CLNUP;
		return (uint4)FINAL_RETRY_INST_FREEZE;
	}
	assert(cs_addrs->ti->total_blks == cs_data->trans_hist.total_blks);
	old_total = cs_data->trans_hist.total_blks;
	if (old_total != filesize)
	{	/* Somebody else has already extended it, since we are in crit, this is trust-worthy. However, in case of MM,
		 * we still need to remap the database
		 */
		assert((old_total > filesize) GTM_TRUNCATE_ONLY( || !is_mm));
		/* For BG, someone else could have truncated or extended - we have no idea */
		GDSFILEXT_CLNUP;
		return (SS_NORMAL);
	}
Example #27
0
/* Upgrade ^#t global in "reg" region */
void	trigger_upgrade(gd_region *reg)
{
	boolean_t		est_first_pass, do_upgrade, is_defined;
	boolean_t		was_null = FALSE, is_null = FALSE;
	int			seq_num, trig_seq_num;
	int			currlabel;
	mval			tmpmval, xecuteimval, *gvname, *tmpmv, *tmpmv2;
	int4			result, tmpint4;
	uint4			curend, gvname_prev, xecute_curend;
	uint4			hash_code, kill_hash_code;
	int			count, i, xecutei, tncount;
	char			*trigname, *trigindex, *ptr;
	char			name_and_index[MAX_MIDENT_LEN + 1 + MAX_DIGITS_IN_INT];
	char			trigvn[MAX_MIDENT_LEN + 1 + MAX_DIGITS_IN_INT], nullbyte[1];
	uint4			trigname_len, name_index_len;
	int			ilen;
	sgmnt_addrs		*csa;
	jnl_private_control	*jpc;
	uint4			sts;
	int			close_res;
	hash128_state_t		hash_state, kill_hash_state;
	uint4			hash_totlen, kill_hash_totlen;
	int			trig_protected_mval_push_count;
#	ifdef DEBUG
	int			save_dollar_tlevel;
#	endif
	DCL_THREADGBL_ACCESS;

	SETUP_THREADGBL_ACCESS;
	assert(gv_cur_region == reg);
	assert(!dollar_tlevel);	/* caller should have ensured this. this is needed as otherwise things get complicated. */
	assert(!is_replicator);	/* caller should have ensured this. this is needed so we don't bump jnl_seqno (if replicating) */
	csa = &FILE_INFO(reg)->s_addrs;
	assert(csa->hdr->hasht_upgrade_needed);
	/* If before-image journaling is turned on in this region (does not matter if replication is turned on or not),
	 * once this transaction is done, we need to switch to new journal file and cut the back link because
	 * otherwise it is possible for backward journal recovery (or rollback) or source server to encounter
	 * the journal records generated in this ^#t-upgrade-transaction in which case they don't know how to handle
	 * it properly (e.g. rollback or backward recovery does not know to restore csa->hdr->hasht_upgrade_needed
	 * if it rolls back this transaction). To achieve this, we set hold_onto_crit to TRUE and do the jnl link
	 * cut AFTER the transaction commits but before anyone else can sneak in to do any more updates.
	 * Since most often we expect databases to be journaled, we do this hold_onto_crit even for the non-journaled case.
	 */
	grab_crit(reg);
	csa->hold_onto_crit = TRUE;
	DEBUG_ONLY(save_dollar_tlevel = dollar_tlevel);
	assert(!donot_INVOKE_MUMTSTART);
	DEBUG_ONLY(donot_INVOKE_MUMTSTART = TRUE);
	op_tstart(IMPLICIT_TSTART, TRUE, &literal_batch, 0); /* 0 ==> save no locals but RESTART OK */
	ESTABLISH_NORET(trigger_upgrade_ch, est_first_pass);
	/* On a TP restart anywhere down below, this line is where the restart resumes execution from */
	assert(donot_INVOKE_MUMTSTART);	/* Make sure still set for every try/retry of TP transaction */
	change_reg(); /* TP_CHANGE_REG wont work as we need to set sgm_info_ptr */
	assert(NULL != cs_addrs);
	assert(csa == cs_addrs);
	SET_GVTARGET_TO_HASHT_GBL(csa);	/* sets up gv_target */
	assert(NULL != gv_target);
	INITIAL_HASHT_ROOT_SEARCH_IF_NEEDED;	/* Needed to do every retry in case restart was due to an online rollback.
						 * This also sets up gv_currkey */
	/* Do actual upgrade of ^#t global.
	 *
	 * Below is a sample layout of the label 2 ^#t global
	 * -------------------------------------------------------
	 * ^#t("#TNAME","x")="a"_$C(0)_"1"		(present in DEFAULT only)
	 * ^#t("#TRHASH",89771515,1)="a"_$C(0)_"1"	(present in DEFAULT only)
	 * ^#t("#TRHASH",106937755,1)="a"_$C(0)_"1"	(present in DEFAULT only)
	 * ^#t("a",1,"BHASH")="106937755"
	 * ^#t("a",1,"CHSET")="M"
	 * ^#t("a",1,"CMD")="S"
	 * ^#t("a",1,"LHASH")="89771515"
	 * ^#t("a",1,"TRIGNAME")="x#"
	 * ^#t("a",1,"XECUTE")=" do ^twork"
	 * ^#t("a","#COUNT")="1"
	 * ^#t("a","#CYCLE")="1"
	 * ^#t("a","#LABEL")="2"
	 *
	 * Below is a sample layout of the label 3 ^#t global
	 * -------------------------------------------------------
	 * ^#t("#LABEL")="3"				(present only after upgrade, not regular trigger load)
	 * ^#t("#TNAME","x")="a"_$C(0)_"1"		(present in CURRENT region)
	 * ^#t("a",1,"BHASH")="71945627"
	 * ^#t("a",1,"CHSET")="M"
	 * ^#t("a",1,"CMD")="S"
	 * ^#t("a",1,"LHASH")="71945627"
	 * ^#t("a",1,"TRIGNAME")="x#"
	 * ^#t("a",1,"XECUTE")=" do ^twork"
	 * ^#t("a","#COUNT")="1"
	 * ^#t("a","#CYCLE")="2"
	 * ^#t("a","#LABEL")="3"
	 * ^#t("a","#TRHASH",71945627,1)="a"_$C(0)_"1"
	 *
	 * Key aspects of the format change
	 * ----------------------------------
	 * 1) New ^#t("#LABEL")="3" to indicate the format of the ^#t global. This is in addition to
	 * 	^#t("a","#LABEL") etc. which is already there. This way we have a #LABEL for not just the installed
	 * 	triggers but also for the name information stored in the #TNAME nodes.
	 * 2) In the BHASH and LHASH fields. The hash computation is different so there are more chances of BHASH and LHASH
	 * 	matching in which case we store only one #TRHASH entry (instead of two). So there are fewer ^#t records in the new
	 * 	format in most cases.
	 * 3) ^#t("a","#LABEL") bumps from 2 to 3. Similarly ^#t("a","#CYCLE") bumps by one (to make sure triggers for this
	 *	global get re-read if and when we implement an -ONLINE upgrade).
	 * 4) DEFAULT used to have ^#t("#TNAME",...) nodes corresponding to triggers across ALL regions in the gbldir and
	 * 	other regions used to have NO ^#t("#TNAME",...) nodes whereas after the upgrade every region have
	 * 	other regions used to have NO ^#t("#TNAME",...) nodes whereas after the upgrade every region has
	 *	nodes and add them as needed.
	 * 5) #TRHASH has moved from ^#t() to ^#t(<gbl>). So it is safer to kill ^#t("#TRHASH")	nodes and add them as needed.
	 *
	 * Below is a sample layout of the label 4 ^#t global
	 * -------------------------------------------------------
	 * ^#t("#TNAME","x")="a"_$C(0)_"1"		(present in CURRENT region)
	 * ^#t("a",1,"BHASH")="71945627"
	 * ^#t("a",1,"CHSET")="M"
	 * ^#t("a",1,"CMD")="S"
	 * ^#t("a",1,"LHASH")="71945627"
	 * ^#t("a",1,"TRIGNAME")="x#"
	 * ^#t("a",1,"XECUTE")=" do ^twork"
	 * ^#t("a","#COUNT")="1"
	 * ^#t("a","#CYCLE")="2"
	 * ^#t("a","#LABEL")="4"
	 * ^#t("a","#TRHASH",71945627,1)="a"_$C(0)_"1"
	 *
	 * Key aspects of the format change
	 * ----------------------------------
	 * 1) Removed ^#t("#LABEL") as it is redundant information and trigger load does not include it
	 * 2) Multiline triggers were incorrectly processed resulting in incorrect BHASH and LHASH values. Upgrade fixes this
	 * 3) ^#t("a","#LABEL") bumps from 3 to 4. Similarly ^#t("a","#CYCLE") bumps by one (to make sure
	 * 	triggers for this global get re-read if and when we implement an -ONLINE upgrade).
	 */
	tmpmv = &tmpmval;	/* At all points maintain this relationship. The two are used interchangeably below */
	if (gv_target->root)
		do_upgrade = TRUE;
	/* The below logic assumes ^#t global does not have any integrity errors */
	assert(do_upgrade);	/* caller should have not invoked us otherwise */
	if (do_upgrade)
	{	/* kill ^#t("#TRHASH"), ^#t("#TNAME") and ^#t("#LABEL") first. Regenerate each again as we process ^#t(<gbl>,...) */
		csa->incr_db_trigger_cycle = TRUE; /* so that we increment csd->db_trigger_cycle at commit time.
							 * this forces concurrent processes to read upgraded triggers.
							 */
		if (JNL_WRITE_LOGICAL_RECS(csa))
		{	/* Note that the ^#t upgrade is a physical layout change. But it has no logical change (i.e. users
			 * see the same MUPIP TRIGGER -SELECT output as before). So write only a dummy LGTRIG journal
			 * record for this operation. Hence write a string that starts with a trigger comment character ";".
			 */
			assert(!gv_cur_region->read_only);
			jnl_format(JNL_LGTRIG, NULL, (mval *)&literal_trigjnlrec, 0);
		}
		/* KILL ^#t("#LABEL") unconditionally */
		BUILD_HASHT_SUB_CURRKEY(LITERAL_HASHLABEL, STRLEN(LITERAL_HASHLABEL));
		if (0 != gvcst_data())
			gvcst_kill(TRUE);
		/* KILL ^#t("#TNAME") unconditionally and regenerate */
		BUILD_HASHT_SUB_CURRKEY(LITERAL_HASHTNAME, STRLEN(LITERAL_HASHTNAME));
		if (0 != gvcst_data())
			gvcst_kill(TRUE);
		/* KILL ^#t("#TRHASH") unconditionally and regenerate */
		BUILD_HASHT_SUB_CURRKEY(LITERAL_HASHTRHASH, STRLEN(LITERAL_HASHTRHASH));
		if (0 != gvcst_data())
			gvcst_kill(TRUE);
		/* Loop through all global names for which ^#t(<gvn>) exists. The only first-level subscripts of ^#t starting
		 * with # are #TNAME and #TRHASH in collation order. So after #TRHASH we expect to find subscripts that are
		 * global names. Hence the HASHTRHASH code is placed AFTER the HASHTNAME code above.
		 */
		TREF(gd_targ_gvnh_reg) = NULL;	/* needed so op_gvorder below goes through gvcst_order (i.e. focuses only
						 * on the current region) and NOT through gvcst_spr_order (which does not
						 * apply anyways in the case of ^#t).
						 */
		nullbyte[0] = '\0';
		trig_protected_mval_push_count = 0;
		INCR_AND_PUSH_MV_STENT(gvname); /* Protect gvname from garbage collection */
		do
		{
			op_gvorder(gvname);
			if (0 == gvname->str.len)
				break;
			assert(ARRAYSIZE(trigvn) > gvname->str.len);
			memcpy(&trigvn[0], gvname->str.addr, gvname->str.len);
			gvname->str.addr = &trigvn[0];	/* point away from stringpool to avoid stp_gcol issues */
			/* Save gv_currkey->prev so it is restored before next call to op_gvorder (which cares about this field).
			 * gv_currkey->prev gets tampered with in the for loop below (e.g. BUILD_HASHT_SUB_CURRKEY macro).
			 * No need to do this for gv_currkey->end since the body of the for loop takes care of restoring it.
			 */
			gvname_prev = gv_currkey->prev;
			BUILD_HASHT_SUB_CURRKEY(gvname->str.addr, gvname->str.len);
			/* At this point, gv_currkey is ^#t(<gvn>) */
			/* Increment ^#t(<gvn>,"#CYCLE") */
			is_defined = gvtr_get_hasht_gblsubs((mval *)&literal_hashcycle, tmpmv);
			assert(is_defined);
			tmpint4 = mval2i(tmpmv);
			tmpint4++;
			i2mval(tmpmv, tmpint4);
			gvtr_set_hasht_gblsubs((mval *)&literal_hashcycle, tmpmv);
			/* Read ^#t(<gvn>,"#COUNT") */
			is_defined = gvtr_get_hasht_gblsubs((mval *)&literal_hashcount, tmpmv);
			if (is_defined)
			{
				tmpint4 = mval2i(tmpmv);
				count = tmpint4;
				/* Get ^#t(<gvn>,"#LABEL"), error out for invalid values. Upgrade disallowed for label 1 triggers */
				is_defined = gvtr_get_hasht_gblsubs((mval *)&literal_hashlabel, tmpmv);
				assert(is_defined);
				currlabel = mval2i(tmpmv);
				if ((V19_HASHT_GBL_LABEL_INT >= currlabel) || (HASHT_GBL_CURLABEL_INT <= currlabel))
					rts_error_csa(CSA_ARG(csa) VARLSTCNT(8) ERR_TRIGUPBADLABEL, 6, currlabel,
							HASHT_GBL_CURLABEL_INT, gvname->str.len, gvname->str.addr,
							REG_LEN_STR(reg));
				/* Set ^#t(<gvn>,"#LABEL")=HASHT_GBL_CURLABEL */
				gvtr_set_hasht_gblsubs((mval *)&literal_hashlabel, (mval *)&literal_curlabel);
			} else
				count = 0;
			/* Kill ^#t(<gvn>,"#TRHASH") unconditionally and regenerate */
			gvtr_kill_hasht_gblsubs((mval *)&literal_hashtrhash, TRUE);
			/* At this point, gv_currkey is ^#t(<gvn>) */
			for (i = 1; i <= count; i++)
			{
				/* At this point, gv_currkey is ^#t(<gvn>) */
				curend = gv_currkey->end; /* note gv_currkey->end before changing it so we can restore it later */
				assert(KEY_DELIMITER == gv_currkey->base[curend]);
				assert(gv_target->gd_csa == cs_addrs);
				i2mval(tmpmv, i);
				COPY_SUBS_TO_GVCURRKEY(tmpmv, gv_cur_region, gv_currkey, was_null, is_null);
				/* At this point, gv_currkey is ^#t(<gvn>,i) */
				/* Compute new LHASH and BHASH hash values.
				 *	LHASH uses : GVSUBS,                        XECUTE
				 *	BHASH uses : GVSUBS, DELIM, ZDELIM, PIECES, XECUTE
				 * So read each of these pieces and compute the hash along the way.
				 */
				STR_PHASH_INIT(hash_state, hash_totlen);
				STR_PHASH_PROCESS(hash_state, hash_totlen, gvname->str.addr, gvname->str.len);
				STR_PHASH_PROCESS(hash_state, hash_totlen, nullbyte, 1);
				/* Read in ^#t(<gvn>,i,"GVSUBS") */
				is_defined = gvtr_get_hasht_gblsubs((mval *)&literal_gvsubs, tmpmv);
				if (is_defined)
				{
					STR_PHASH_PROCESS(hash_state, hash_totlen, tmpmval.str.addr, tmpmval.str.len);
					STR_PHASH_PROCESS(hash_state, hash_totlen, nullbyte, 1);
				}
				/* Copy over SET hash state (2-tuple <state,totlen>) to KILL hash state before adding
				 * the PIECES, DELIM, ZDELIM portions (those are only part of the SET hash).
				 */
				kill_hash_state = hash_state;
				kill_hash_totlen = hash_totlen;
				/* Read in ^#t(<gvn>,i,"PIECES") */
				is_defined = gvtr_get_hasht_gblsubs((mval *)&literal_pieces, tmpmv);
				if (is_defined)
				{
					STR_PHASH_PROCESS(hash_state, hash_totlen, tmpmval.str.addr, tmpmval.str.len);
					STR_PHASH_PROCESS(hash_state, hash_totlen, nullbyte, 1);
				}
				/* Read in ^#t(<gvn>,i,"DELIM") */
				is_defined = gvtr_get_hasht_gblsubs((mval *)&literal_delim, tmpmv);
				if (is_defined)
				{
					STR_PHASH_PROCESS(hash_state, hash_totlen, tmpmval.str.addr, tmpmval.str.len);
					STR_PHASH_PROCESS(hash_state, hash_totlen, nullbyte, 1);
				}
				/* Read in ^#t(<gvn>,i,"ZDELIM") */
				is_defined = gvtr_get_hasht_gblsubs((mval *)&literal_zdelim, tmpmv);
				if (is_defined)
				{
					STR_PHASH_PROCESS(hash_state, hash_totlen, tmpmval.str.addr, tmpmval.str.len);
					STR_PHASH_PROCESS(hash_state, hash_totlen, nullbyte, 1);
				}
				/* Read in ^#t(<gvn>,i,"XECUTE").
				 * Note: The XECUTE portion of the trigger definition is used in SET and KILL hash.
				 * But since we have started maintaining "hash_state" and "kill_hash_state" separately
				 * (due to PIECES, DELIM, ZDELIM) we need to update the hash for both using same input string.
				 */
				is_defined = gvtr_get_hasht_gblsubs((mval *)&literal_xecute, tmpmv);
				if (is_defined)
				{
					STR_PHASH_PROCESS(hash_state, hash_totlen, tmpmval.str.addr, tmpmval.str.len);
					STR_PHASH_PROCESS(kill_hash_state, kill_hash_totlen, tmpmval.str.addr, tmpmval.str.len);
				} else
				{	/* Multi-record XECUTE string */
					/* At this point, gv_currkey is ^#t(<gvn>,i) */
					xecute_curend = gv_currkey->end; /* note gv_currkey->end so we can restore it later */
					assert(KEY_DELIMITER == gv_currkey->base[xecute_curend]);
					tmpmv2 = (mval *)&literal_xecute;
					COPY_SUBS_TO_GVCURRKEY(tmpmv2, gv_cur_region, gv_currkey, was_null, is_null);
					xecutei = 1;
					do
					{
						i2mval(&xecuteimval, xecutei);
						is_defined = gvtr_get_hasht_gblsubs(&xecuteimval, tmpmv);
						if (!is_defined)
							break;
						STR_PHASH_PROCESS(hash_state, hash_totlen, tmpmval.str.addr, tmpmval.str.len);
						STR_PHASH_PROCESS(kill_hash_state, kill_hash_totlen,
									tmpmval.str.addr, tmpmval.str.len);
						xecutei++;
					} while (TRUE);
					/* Restore gv_currkey to ^#t(<gvn>,i) */
					gv_currkey->end = xecute_curend;
					gv_currkey->base[xecute_curend] = KEY_DELIMITER;
				}
				STR_PHASH_RESULT(hash_state, hash_totlen, hash_code);
				STR_PHASH_RESULT(kill_hash_state, kill_hash_totlen, kill_hash_code);
				/* Set ^#t(<gvn>,i,"LHASH") */
				MV_FORCE_UMVAL(tmpmv, kill_hash_code);
				gvtr_set_hasht_gblsubs((mval *)&literal_lhash, tmpmv);
				/* Set ^#t(<gvn>,i,"BHASH") */
				MV_FORCE_UMVAL(tmpmv, hash_code);
				gvtr_set_hasht_gblsubs((mval *)&literal_bhash, tmpmv);
				/* Read in ^#t(<gvn>,i,"TRIGNAME") to determine if #SEQNUM/#TNCOUNT needs to be maintained */
				is_defined = gvtr_get_hasht_gblsubs((mval *)&literal_trigname, tmpmv);
				assert(is_defined);
				assert('#' == tmpmval.str.addr[tmpmval.str.len - 1]);
				tmpmval.str.len--;
				if ((tmpmval.str.len <= ARRAYSIZE(name_and_index)) &&
						(NULL != (ptr = memchr(tmpmval.str.addr, '#', tmpmval.str.len))))
				{	/* Auto-generated name. Need to maintain #SEQNUM/#TNCOUNT */
					/* Take copy of trigger name into non-stringpool location to avoid stp_gcol issues */
					trigname_len = ptr - tmpmval.str.addr;
					ptr++;
					name_index_len = (tmpmval.str.addr + tmpmval.str.len) - ptr;
					assert(ARRAYSIZE(name_and_index) >= (trigname_len + 1 + name_index_len));
					trigname = &name_and_index[0];
					trigindex = ptr;
					memcpy(trigname, tmpmval.str.addr, tmpmval.str.len);
					A2I(ptr, ptr + name_index_len, trig_seq_num);
					/* At this point, gv_currkey is ^#t(<gvn>,i) */
					/* $get(^#t("#TNAME",<trigger name>,"#SEQNUM")) */
					BUILD_HASHT_SUB_SUB_SUB_CURRKEY(LITERAL_HASHTNAME, STR_LIT_LEN(LITERAL_HASHTNAME),
						trigname, trigname_len, LITERAL_HASHSEQNUM, STR_LIT_LEN(LITERAL_HASHSEQNUM));
					seq_num = gvcst_get(tmpmv) ? mval2i(tmpmv) : 0;
					if (trig_seq_num > seq_num)
					{	/* Set ^#t("#TNAME",<trigger name>,"#SEQNUM") = trig_seq_num */
						SET_TRIGGER_GLOBAL_SUB_SUB_SUB_STR(LITERAL_HASHTNAME,
							STR_LIT_LEN(LITERAL_HASHTNAME), trigname, trigname_len,
							LITERAL_HASHSEQNUM, STR_LIT_LEN(LITERAL_HASHSEQNUM),
							trigindex, name_index_len, result);
						assert(PUT_SUCCESS == result);
					}
					/* set ^#t("#TNAME",<trigger name>,"#TNCOUNT")++ */
					BUILD_HASHT_SUB_SUB_SUB_CURRKEY(LITERAL_HASHTNAME, STR_LIT_LEN(LITERAL_HASHTNAME),
						trigname, trigname_len, LITERAL_HASHTNCOUNT, STR_LIT_LEN(LITERAL_HASHTNCOUNT));
					tncount = gvcst_get(tmpmv) ? mval2i(tmpmv) + 1 : 1;
					i2mval(tmpmv, tncount);
					SET_TRIGGER_GLOBAL_SUB_SUB_SUB_MVAL(LITERAL_HASHTNAME, STR_LIT_LEN(LITERAL_HASHTNAME),
						trigname, trigname_len, LITERAL_HASHTNCOUNT, STR_LIT_LEN(LITERAL_HASHTNCOUNT),
						tmpmval, result);
					trigname_len += 1 + name_index_len; /* in preparation for ^#t("#TNAME") set below */
					assert(PUT_SUCCESS == result);
					BUILD_HASHT_SUB_CURRKEY(gvname->str.addr, gvname->str.len);
					/* At this point, gv_currkey is ^#t(<gvn>) */
				} else
				{
					/* Take copy of trigger name into non-stringpool location to avoid stp_gcol issues */
					trigname = &name_and_index[0];  /* in preparation for ^#t("#TNAME") set below */
					trigname_len = MIN(tmpmval.str.len, ARRAYSIZE(name_and_index));
					assert(ARRAYSIZE(name_and_index) >= trigname_len);
					memcpy(trigname, tmpmval.str.addr, trigname_len);
					/* Restore gv_currkey to what it was at beginning of for loop iteration */
					gv_currkey->end = curend;
					gv_currkey->base[curend] = KEY_DELIMITER;
				}
				/* At this point, gv_currkey is ^#t(<gvn>) */
				if (kill_hash_code != hash_code)
					gvtr_set_hashtrhash(gvname->str.addr, gvname->str.len, kill_hash_code, i);
				/* Set ^#t(<gvn>,"#TRHASH",hash_code,i) */
				gvtr_set_hashtrhash(gvname->str.addr, gvname->str.len, hash_code, i);
				/* Set ^#t("#TNAME",<trigname>)=<gvn>_$c(0)_<trigindx> */
				/* The upgrade assumes that the region does not contain two triggers with the same name.
				 * V62000 and before could potentially have this out-of-design case. Once implemented,
				 * the trigger integrity check will warn users of this edge case */
				ptr = &trigvn[gvname->str.len];
				*ptr++ = '\0';
				ilen = 0;
				I2A(ptr, ilen, i);
				ptr += ilen;
				assert(ptr <= ARRAYTOP(trigvn));
				SET_TRIGGER_GLOBAL_SUB_SUB_STR(LITERAL_HASHTNAME, STR_LIT_LEN(LITERAL_HASHTNAME),
					trigname, trigname_len, trigvn, ptr - gvname->str.addr, result);
				assert(PUT_SUCCESS == result);
				BUILD_HASHT_SUB_CURRKEY(gvname->str.addr, gvname->str.len);
				/* At this point, gv_currkey is ^#t(<gvn>) */
			}
			/* At this point, gv_currkey is ^#t(<gvn>) i.e. gv_currkey->end is correct but gv_currkey->prev
			 * might have been tampered with. Restore it to proper value first.
			 */
			 gv_currkey->prev = gvname_prev;
			gvname->mvtype = 0; /* can now be garbage collected in the next iteration */
		} while (TRUE);
	}
	op_tcommit();
	REVERT; /* remove our condition handler */
	DEBUG_ONLY(donot_INVOKE_MUMTSTART = FALSE;)
	if (csa->hold_onto_crit)
/*
 * This will rundown a replication instance journal (and receiver) pool.
 *	Input Parameter:
 *		replpool_id of the instance. Instaname must be null terminated in replpool_id.
 * Returns :
 *	TRUE,  if successful.
 *	FALSE, otherwise.
 */
boolean_t mu_rndwn_repl_instance(replpool_identifier *replpool_id, boolean_t immediate)
{
	boolean_t		jnlpool_stat = TRUE, recvpool_stat = TRUE;
	char			*instname, shmid_buff[TMP_BUF_LEN];
	gd_region		*r_save;
	repl_inst_fmt		repl_instance;
	static	gd_region	*reg = NULL;
	struct semid_ds		semstat;
	struct shmid_ds		shmstat;
	union semun		semarg;
	uchar_ptr_t		ret_ptr;
	unix_db_info		*udi;
	int			save_errno;

	error_def(ERR_MUJPOOLRNDWNSUC);
	error_def(ERR_MURPOOLRNDWNSUC);
	error_def(ERR_MUJPOOLRNDWNFL);
	error_def(ERR_MURPOOLRNDWNFL);
	error_def(ERR_SEMREMOVED);
	error_def(ERR_REPLACCSEM);
	error_def(ERR_SYSCALL);

	if (NULL == reg)
	{
		r_save = gv_cur_region;
		mu_gv_cur_reg_init();
		reg = gv_cur_region;
		gv_cur_region = r_save;
	}
	jnlpool.jnlpool_dummy_reg = reg;
	instname = replpool_id->instname;
	reg->dyn.addr->fname_len = strlen(instname);
	assert(0 == instname[reg->dyn.addr->fname_len]);
	memcpy((char *)reg->dyn.addr->fname, instname, reg->dyn.addr->fname_len + 1);
	udi = FILE_INFO(reg);
	udi->fn = (char *)reg->dyn.addr->fname;
	/* Lock replication instance using ftok semaphore */
	if (!ftok_sem_get(reg, TRUE, REPLPOOL_ID, immediate))
		return FALSE;
	repl_inst_get((char *)instname, &repl_instance);
	semarg.buf = &semstat;
	/*
	 * --------------------------
	 * First rundown Journal pool
	 * --------------------------
	 */
	if (INVALID_SEMID != repl_instance.jnlpool_semid)
		if ((-1 == semctl(repl_instance.jnlpool_semid, 0, IPC_STAT, semarg)) ||
	 			(semarg.buf->sem_ctime != repl_instance.jnlpool_semid_ctime))
			repl_instance.jnlpool_semid = INVALID_SEMID;
	if (INVALID_SHMID != repl_instance.jnlpool_shmid)
		if ((-1 == shmctl(repl_instance.jnlpool_shmid, IPC_STAT, &shmstat)) ||
	 			(shmstat.shm_ctime != repl_instance.jnlpool_shmid_ctime))
			repl_instance.jnlpool_shmid = INVALID_SHMID;
	if (INVALID_SHMID != repl_instance.jnlpool_shmid)
	{
		jnlpool_stat = mu_rndwn_replpool(replpool_id, repl_instance.jnlpool_semid, repl_instance.jnlpool_shmid);
		ret_ptr = i2asc((uchar_ptr_t)shmid_buff, repl_instance.jnlpool_shmid);
		*ret_ptr = '\0';
		gtm_putmsg(VARLSTCNT(6) (jnlpool_stat ? ERR_MUJPOOLRNDWNSUC : ERR_MUJPOOLRNDWNFL),
			4, LEN_AND_STR(shmid_buff), LEN_AND_STR(replpool_id->instname));
	} else if (INVALID_SEMID != repl_instance.jnlpool_semid)
	{
		if (0 == sem_rmid(repl_instance.jnlpool_semid))
		{	/* note that shmid_buff used here is actually a buffer to hold semid (not shmid) */
			ret_ptr = i2asc((uchar_ptr_t)shmid_buff, repl_instance.jnlpool_semid);
			*ret_ptr = '\0';
			gtm_putmsg(VARLSTCNT(9) ERR_MUJPOOLRNDWNSUC, 4, LEN_AND_STR(shmid_buff), LEN_AND_STR(replpool_id->instname),
						ERR_SEMREMOVED, 1, repl_instance.jnlpool_semid);
		} else
		{
			save_errno = errno;
			gtm_putmsg(VARLSTCNT(13) ERR_REPLACCSEM, 3, repl_instance.jnlpool_semid, RTS_ERROR_STRING(instname),
						ERR_SYSCALL, 5, RTS_ERROR_LITERAL("jnlpool sem_rmid()"), CALLFROM, save_errno);
		}
		/* Note that jnlpool_stat is not set to FALSE in case sem_rmid() fails above. This is because the journal pool is
		 * anyway not present and it is safer to reset the sem/shmids in the instance file. The only thing this might cause
		 * is a stranded semaphore but that is considered better than getting errors due to not resetting instance file.
		 */
	}
	if (jnlpool_stat)	/* Reset instance file for jnlpool info */
		repl_inst_jnlpool_reset();
	/*
	 * --------------------------
	 * Now rundown Receive pool
	 * --------------------------
	 */
	recvpool.recvpool_dummy_reg = reg;
	if (INVALID_SEMID != repl_instance.recvpool_semid)
		if ((-1 == semctl(repl_instance.recvpool_semid, 0, IPC_STAT, semarg)) ||
	 			(semarg.buf->sem_ctime != repl_instance.recvpool_semid_ctime))
			repl_instance.recvpool_semid = INVALID_SEMID;
	if (INVALID_SHMID != repl_instance.recvpool_shmid)
		if ((-1 == shmctl(repl_instance.recvpool_shmid, IPC_STAT, &shmstat)) ||
	 			(shmstat.shm_ctime != repl_instance.recvpool_shmid_ctime))
			repl_instance.recvpool_shmid = INVALID_SHMID;
	if (INVALID_SHMID != repl_instance.recvpool_shmid)
	{
		recvpool_stat = mu_rndwn_replpool(replpool_id, repl_instance.recvpool_semid, repl_instance.recvpool_shmid);
		ret_ptr = i2asc((uchar_ptr_t)shmid_buff, repl_instance.recvpool_shmid);
		*ret_ptr = '\0';
		gtm_putmsg(VARLSTCNT(6) (recvpool_stat ? ERR_MURPOOLRNDWNSUC : ERR_MURPOOLRNDWNFL),
			4, LEN_AND_STR(shmid_buff), LEN_AND_STR(replpool_id->instname));
	} else if (INVALID_SEMID != repl_instance.recvpool_semid)
	{
		if (0 == sem_rmid(repl_instance.recvpool_semid))
		{	/* note that shmid_buff used here is actually a buffer to hold semid (not shmid) */
			ret_ptr = i2asc((uchar_ptr_t)shmid_buff, repl_instance.recvpool_semid);
			*ret_ptr = '\0';
			gtm_putmsg(VARLSTCNT(9) ERR_MURPOOLRNDWNSUC, 4, LEN_AND_STR(shmid_buff), LEN_AND_STR(replpool_id->instname),
						ERR_SEMREMOVED, 1, repl_instance.recvpool_semid);
		} else
		{
			save_errno = errno;
			gtm_putmsg(VARLSTCNT(13) ERR_REPLACCSEM, 3, repl_instance.recvpool_semid, RTS_ERROR_STRING(instname),
						ERR_SYSCALL, 5, RTS_ERROR_LITERAL("recvpool sem_rmid()"), CALLFROM, save_errno);
		}
		/* Note that recvpool_stat is not set to FALSE in case sem_rmid() fails above. This is because the receive pool is
		 * anyway not present and it is safer to reset the sem/shmids in the instance file. The only thing this might cause
		 * is a stranded semaphore but that is considered better than getting errors due to not resetting instance file.
		 */
	}
	if (recvpool_stat)	/* Reset instance file for recvpool info */
		repl_inst_recvpool_reset();

	/* Release replication instance ftok semaphore lock */
	if (!ftok_sem_release(reg, TRUE, immediate))
		return FALSE;
	return (jnlpool_stat && recvpool_stat);
}
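mu_rndwn_repl_instance only trusts a semid or shmid recorded in the instance file if the id's creation time still matches the recorded ctime, because System V ids can be deleted and reissued to unrelated processes. A minimal sketch of that staleness test for a semaphore set follows; the union semun definition is the usual application-supplied one, and the helper name is made up for illustration.

#include <stdio.h>
#include <time.h>
#include <sys/types.h>
#include <sys/ipc.h>
#include <sys/sem.h>

#if defined(__GNU_LIBRARY__) && !defined(_SEM_SEMUN_UNDEFINED)
/* union semun is already defined by <sys/sem.h> */
#else
union semun {
	int			val;
	struct semid_ds		*buf;
	unsigned short		*array;
};
#endif

/* Returns 1 if the semaphore set "semid" still exists and its creation time
 * matches the value recorded earlier (e.g. in an instance file header);
 * 0 if the id has vanished or been recycled and must be treated as stale. */
static int semid_still_ours(int semid, time_t recorded_ctime)
{
	union semun	arg;
	struct semid_ds	stat_buf;

	arg.buf = &stat_buf;
	if (-1 == semctl(semid, 0, IPC_STAT, arg))
		return 0;
	return (stat_buf.sem_ctime == recorded_ctime);
}

int main(void)
{
	/* -1 is never a valid semid, so this reports the id as stale */
	printf("stale? %s\n", semid_still_ours(-1, 0) ? "no" : "yes");
	return 0;
}
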
int mu_replpool_grab_sem(repl_inst_hdr_ptr_t repl_inst_filehdr, char pool_type, boolean_t *sem_created_ptr)
{
	int			status, save_errno, sem_id, semval, semnum, instfilelen;
	time_t			sem_ctime;
	boolean_t		sem_created;
	char			*instfilename;
	union semun		semarg;
	struct semid_ds		semstat;
	gd_region		*replreg;
	DEBUG_ONLY(unix_db_info	*udi;)

	*sem_created_ptr = sem_created = FALSE; /* assume semaphore not created by default */
	/* First ensure that the caller has grabbed the ftok semaphore on the replication instance file */
	assert((NULL != jnlpool.jnlpool_dummy_reg) && (jnlpool.jnlpool_dummy_reg == recvpool.recvpool_dummy_reg));
	replreg = jnlpool.jnlpool_dummy_reg;
	DEBUG_ONLY(udi = FILE_INFO(jnlpool.jnlpool_dummy_reg));
	assert(udi->grabbed_ftok_sem); /* the caller should have grabbed ftok semaphore */
	instfilename = (char *)replreg->dyn.addr->fname;
	instfilelen = replreg->dyn.addr->fname_len;
	assert((NULL != instfilename) && (0 != instfilelen) && ('\0' == instfilename[instfilelen]));
	assert((JNLPOOL_SEGMENT == pool_type) || (RECVPOOL_SEGMENT == pool_type));
	if (JNLPOOL_SEGMENT == pool_type)
	{
		sem_id = repl_inst_filehdr->jnlpool_semid;
		sem_ctime = repl_inst_filehdr->jnlpool_semid_ctime;
	}
	else
	{
		sem_id = repl_inst_filehdr->recvpool_semid;
		sem_ctime = repl_inst_filehdr->recvpool_semid_ctime;
	}
Example #30
0
void gds_rundown(void)
{
	bool			is_mm, we_are_last_user, we_are_last_writer;
	boolean_t		ipc_deleted, remove_shm, cancelled_timer, cancelled_dbsync_timer, vermismatch;
	now_t			now;	/* for GET_CUR_TIME macro */
	char			*time_ptr, time_str[CTIME_BEFORE_NL + 2]; /* for GET_CUR_TIME macro */
	gd_region		*reg;
	int			save_errno, status;
	int4			semval, ftok_semval, sopcnt, ftok_sopcnt;
	short			crash_count;
	sm_long_t		munmap_len;
	sgmnt_addrs		*csa;
	sgmnt_data_ptr_t	csd;
	struct shmid_ds		shm_buf;
	struct sembuf		sop[2], ftok_sop[2];
	uint4           	jnl_status;
	unix_db_info		*udi;
	jnl_private_control	*jpc;
	jnl_buffer_ptr_t	jbp;

	error_def(ERR_CRITSEMFAIL);
	error_def(ERR_DBCCERR);
	error_def(ERR_DBFILERR);
	error_def(ERR_DBRNDWNWRN);
	error_def(ERR_ERRCALL);
	error_def(ERR_GBLOFLOW);
	error_def(ERR_GTMASSERT);
	error_def(ERR_IPCNOTDEL);
	error_def(ERR_JNLFLUSH);
	error_def(ERR_RNDWNSEMFAIL);
	error_def(ERR_TEXT);
	error_def(ERR_WCBLOCKED);

	forced_exit = FALSE;		/* Okay, we're dying already -- let rel_crit live in peace now.
					 * If coming through a DAL, we are not necessarily dying; what to do then? -- nars -- 8/15/2001
					 */
	grabbed_access_sem = FALSE;
	jnl_status = 0;
	reg = gv_cur_region;			/* Local copy */

	/* Early out for cluster regions to avoid tripping the assert below.
	 * Note: This early out is consistent with VMS.  It has been noted that all of the gtcm assignments to
	 * gv_cur_region should use the TP_CHANGE_REG macro.  This would also avoid the assert problem and should
	 * be done eventually.
	 */
	if (dba_cm == reg->dyn.addr->acc_meth)
		return;

	udi = FILE_INFO(reg);
	csa = &udi->s_addrs;
	csd = csa->hdr;
	assert(csa == cs_addrs && csd == cs_data);
	if ((reg->open) && (dba_usr == csd->acc_meth))
	{
		change_reg();
		gvusr_rundown();
		return;
	}
	ESTABLISH(gds_rundown_ch);
	if (!reg->open)				/* Not open, no point to rundown */
	{
		if (reg->opening)		/* Died partway open, kill rest of way */
		{
			rel_crit(reg);
			mutex_cleanup(reg);
/* revisit this to handle MM properly  SMW 98/12/16
                        if (NULL != csa->nl)
                        {
                                status = shmdt((caddr_t)csa->nl);
                                if (-1 == status)
                                        send_msg(VARLSTCNT(9) ERR_DBFILERR, 2, DB_LEN_STR(reg),
                                                ERR_TEXT, 2, LEN_AND_LIT("Error during shmdt"), errno);
                        }
*/
			shmdt((caddr_t)csa->nl);
			csa->nl = NULL;
		}
		REVERT;
		return;
	}
	switch(csd->acc_meth)
	{	/* Pass mm and bg through */
	    case dba_bg:
		is_mm = FALSE;
		break;
	    case dba_mm:
		is_mm = TRUE;
		break;
	    case dba_usr:
		assert(FALSE);
	    default:
		REVERT;
		return;
	}
	/* Cancel any pending flush timer for this region by this task */
	CANCEL_DB_TIMERS(reg, cancelled_timer, cancelled_dbsync_timer);
	we_are_last_user = FALSE;
	if (!csa->persistent_freeze)
		region_freeze(reg, FALSE, FALSE, FALSE);
	assert(!csa->read_lock);
	rel_crit(reg);		/* get locks to known state */
	mutex_cleanup(reg);
	/* We need to guarantee that no one else accesses the database file header while the semid/shmid fields are reset.
	 * We already created the ftok semaphore in db_init or mu_rndwn_file and did not remove it, so just lock it.
	 * We do it in blocking mode.
	 */
	if (!ftok_sem_lock(reg, FALSE, FALSE))
		rts_error(VARLSTCNT(4) ERR_DBFILERR, 2, DB_LEN_STR(reg));
	/* For mupip_jnl_recover we already have the database access control semaphore.
	 * We do not release it here; it is released from mur_close_files.
	 */
	if (!mupip_jnl_recover)
	{
		sop[0].sem_num = 0; sop[0].sem_op = 0;	/* Wait for 0 */
		sop[1].sem_num = 0; sop[1].sem_op = 1;	/* Lock */
		sopcnt = 2;
		sop[0].sem_flg = sop[1].sem_flg = SEM_UNDO | IPC_NOWAIT; /* Don't wait the first time thru */
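		/* Together, the two operations above form an atomic "wait for zero, then increment" on sem_num 0:
		 * sem_op 0 succeeds only when no other process holds the rundown/access control lock, and sem_op 1
		 * then claims it.  SEM_UNDO makes the kernel release the lock if this process dies, and IPC_NOWAIT
		 * turns a would-block into an EAGAIN failure so we can first check whether we already hold the lock.
		 */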
		SEMOP(udi->semid, sop, sopcnt, status);
		if (-1 == status)			/* We couldn't get it in one shot -- see if we already have it */
		{
			save_errno = errno;
			/* see comment about Linux specific difference in behaviour of semctl() with GETPID in gds_rundown_ch() */
			if (semctl(udi->semid, 0, GETPID) == process_id)
			{
				send_msg(VARLSTCNT(5) MAKE_MSG_INFO(ERR_CRITSEMFAIL), 2,
					DB_LEN_STR(reg),
					ERR_RNDWNSEMFAIL);
				REVERT;
				return;			/* Already in rundown for this region */
			}
			if (EAGAIN != save_errno)
			{
				assert(FALSE);
				rts_error(VARLSTCNT(9) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg),
					ERR_TEXT, 2, RTS_ERROR_TEXT("gds_rundown first semop/semctl"), save_errno);
			}
			sop[0].sem_flg = sop[1].sem_flg = SEM_UNDO;	/* Try again - blocking this time */
			SEMOP(udi->semid, sop, 2, status);
			if (-1 == status)			/* We couldn't get it at all.. */
				rts_error(VARLSTCNT(5) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg), errno);
		}
	}
	grabbed_access_sem = TRUE;
	/*
	 * We now have the dbinit/rundown lock, so we are alone in this code for this region
	 * and nobody else can attach.
	 * See if we are all alone in accessing this database shared memory.
	 */
	assert(csa->ref_cnt);	/* decrement private ref_cnt before shared ref_cnt decrement. */
	csa->ref_cnt--;		/* Currently journaling logic in gds_rundown() in VMS relies on this order to detect last writer */
	assert(!csa->ref_cnt);
	--csa->nl->ref_cnt;
	if (memcmp(csa->nl->now_running, gtm_release_name, gtm_release_name_len + 1))
	{	/* VERMISMATCH condition. Possible only if DSE */
		assert(dse_running);
		vermismatch = TRUE;
	} else
		vermismatch = FALSE;
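	/* csa->nl->now_running holds the gtm_release_name of the version that set up this shared memory; a mismatch
	 * means we (necessarily DSE, per the assert above) attached to shared memory initialized by a different
	 * version, so the code below neither flushes through it nor removes it.
	 */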
	if (-1 == shmctl(udi->shmid, IPC_STAT, &shm_buf))
	{
		save_errno = errno;
		rts_error(VARLSTCNT(9) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg),
			ERR_TEXT, 2, RTS_ERROR_TEXT("gds_rundown shmctl"), save_errno);
	} else
		we_are_last_user =  (1 == shm_buf.shm_nattch) && !vermismatch;
	assert(!mupip_jnl_recover || we_are_last_user); /* recover => one user */
	if (-1 == (semval = semctl(udi->semid, 1, GETVAL)))
		rts_error(VARLSTCNT(5) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg), errno);
	we_are_last_writer = (1 == semval) && (FALSE == reg->read_only) && !vermismatch;/* There's one writer left and I am it */
	assert(!(mupip_jnl_recover && !reg->read_only) || we_are_last_writer); /* recover + R/W region => one writer */
	if (-1 == (ftok_semval = semctl(udi->ftok_semid, 1, GETVAL)))
		rts_error(VARLSTCNT(5) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg), errno);
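	/* semval is the count on sem_num 1, the writer access count that each read-write opener holds and that is
	 * released below via do_semop(udi->semid, 1, -1, SEM_UNDO); semval == 1 therefore means we are the only
	 * writer left.  ftok_semval is the corresponding value read above from sem_num 1 of the region's ftok set.
	 */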
	/* If csa->nl->donotflush_dbjnl is set, it means mupip recover/rollback was interrupted and therefore we should
	 * 	not flush shared memory contents to disk as they might be in an inconsistent state.
	 * In this case, we will go ahead and remove shared memory (without flushing the contents) in this routine.
	 * A reissue of the recover/rollback command will restore the database to a consistent state.
	 * Otherwise, if we have write access to this region, let us perform a few writing tasks.
	 */
	if (csa->nl->donotflush_dbjnl)
		csa->wbuf_dqd = 0;	/* ignore csa->wbuf_dqd status as we do not care about the cache contents */
	else if (!reg->read_only && !vermismatch)
	{	/* If we had an orphaned block and were interrupted, set wc_blocked so we can invoke wcs_recover */
		if (csa->wbuf_dqd)
		{
			grab_crit(reg);
			SET_TRACEABLE_VAR(csd->wc_blocked, TRUE);
			BG_TRACE_PRO_ANY(csa, wcb_gds_rundown);
			send_msg(VARLSTCNT(8) ERR_WCBLOCKED, 6, LEN_AND_LIT("wcb_gds_rundown"),
				process_id, &csa->ti->curr_tn, DB_LEN_STR(reg));
			csa->wbuf_dqd = 0;
			wcs_recover(reg);
			if (is_mm)
			{
				assert(FALSE);
				csd = csa->hdr;
			}
			BG_TRACE_PRO_ANY(csa, lost_block_recovery);
			rel_crit(reg);
		}
		if (JNL_ENABLED(csd) && (GTCM_GNP_SERVER_IMAGE == image_type))
			originator_prc_vec = NULL;
		/* If we are the last writing user, then everything must be flushed */
		if (we_are_last_writer)
		{	/* Time to flush out all of our buffers */
			if (is_mm)
			{
				if (csa->total_blks != csa->ti->total_blks)	/* do remap if file had been extended */
				{
					grab_crit(reg);
					wcs_mm_recover(reg);
					csd = csa->hdr;
					rel_crit(reg);
				}
				csa->nl->remove_shm = TRUE;
			}
			/* Note WCSFLU_SYNC_EPOCH ensures the epoch is synced to the journal and indirectly
			 * also ensures that the db is fsynced. We don't want to use it in the calls to
			 * wcs_flu() from t_end() and tp_tend() since we can defer it to out-of-crit there.
			 * In this case, since we are running down, we don't have any such option.
			 */
			csa->nl->remove_shm = wcs_flu(WCSFLU_FLUSH_HDR | WCSFLU_WRITE_EPOCH | WCSFLU_SYNC_EPOCH);
			/* Since we_are_last_writer, we should be guaranteed that wcs_flu() did not change csd (in the
			 * MM case, due to a potential file extension), even if it did a grab_crit().  Therefore, make
			 * sure that's true.
			 */
			assert(csd == csa->hdr);
			assert(0 == memcmp(csd->label, GDS_LABEL, GDS_LABEL_SZ - 1));
			csd->trans_hist.header_open_tn = csd->trans_hist.curr_tn;
		} else if ((cancelled_timer && (0 > csa->nl->wcs_timers)) || cancelled_dbsync_timer)
		{	/* cancelled pending db or jnl flush timers - flush database and journal buffers to disk */
			grab_crit(reg);
			/* We need to sync the epoch because the absence of an active pending flush timer implies
			 * there is no one else who will flush the dirty buffers and EPOCH to disk in a timely fashion.
			 */
			wcs_flu(WCSFLU_FLUSH_HDR | WCSFLU_WRITE_EPOCH | WCSFLU_SYNC_EPOCH);
			rel_crit(reg);
			assert((dba_mm == cs_data->acc_meth) || (csd == cs_data));
			csd = cs_data;	/* In case this is MM and wcs_flu() remapped an extended database, reset csd */
		}
		/* Do rundown journal processing after buffer flushes since they require jnl to be open */
		if (JNL_ENABLED(csd))
		{	/* the following tp_change_reg() is not needed due to the assert csa == cs_addrs at the beginning
			 * of gds_rundown(), but just to be safe. To be removed by 2002!! --- nars -- 2001/04/25.
			 */
			tp_change_reg();	/* call this because jnl_ensure_open checks cs_addrs rather than gv_cur_region */
			jpc = csa->jnl;
			jbp = jpc->jnl_buff;
			if (jbp->fsync_in_prog_latch.u.parts.latch_pid == process_id)
			{
				assert(FALSE);
				COMPSWAP_UNLOCK(&jbp->fsync_in_prog_latch, process_id, 0, LOCK_AVAILABLE, 0);
			}
			if (jbp->io_in_prog_latch.u.parts.latch_pid == process_id)
			{
				assert(FALSE);
				COMPSWAP_UNLOCK(&jbp->io_in_prog_latch, process_id, 0, LOCK_AVAILABLE, 0);
			}
			if (((NOJNL != jpc->channel) && !JNL_FILE_SWITCHED(jpc))
				|| we_are_last_writer && (0 != csa->nl->jnl_file.u.inode))
			{	/* We need to close the journal file cleanly if we have the latest generation journal file open,
				 *	or if we are the last writer and the journal file is open in shared memory (not necessarily
				 *	by ourselves, e.g. the only process that opened the journal got shot abnormally).
				 * Note: we should not infer anything from the shared memory value of csa->nl->jnl_file.u.inode
				 * 	if we are not the last writer, as it can be concurrently updated.
				 */
				grab_crit(reg);
				if (JNL_ENABLED(csd))
				{
					SET_GBL_JREC_TIME; /* jnl_ensure_open/jnl_put_jrt_pini/pfin/jnl_file_close all need it */
					/* Before writing to jnlfile, adjust jgbl.gbl_jrec_time if needed to maintain time order
					 * of jnl records. This needs to be done BEFORE the jnl_ensure_open as that could write
					 * journal records (if it decides to switch to a new journal file).
					 */
					ADJUST_GBL_JREC_TIME(jgbl, jbp);
					jnl_status = jnl_ensure_open();
					if (0 == jnl_status)
					{	/* If we_are_last_writer, we would have already done a wcs_flu() which would
						 * have written an epoch record and we are guaranteed no further updates
						 * since we are the last writer. So, just close the journal.
						 * Although we assert pini_addr should be non-zero for last_writer, we
						 * play it safe in PRO and write a PINI record if not written already.
						 */
						assert(!jbp->before_images || is_mm
								|| !we_are_last_writer || 0 != jpc->pini_addr);
						if (we_are_last_writer && 0 == jpc->pini_addr)
							jnl_put_jrt_pini(csa);
						if (0 != jpc->pini_addr)
							jnl_put_jrt_pfin(csa);
						/* If not the last writer and no pending flush timer left, do jnl flush now */
						if (!we_are_last_writer && (0 > csa->nl->wcs_timers))
						{
							if (SS_NORMAL == (jnl_status = jnl_flush(reg)))
							{
								assert(jbp->freeaddr == jbp->dskaddr);
								jnl_fsync(reg, jbp->dskaddr);
								assert(jbp->fsync_dskaddr == jbp->dskaddr);
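								/* jnl_flush() writes all buffered journal records to disk and
								 * jnl_fsync() hardens them, hence the asserts that dskaddr has
								 * caught up with freeaddr and fsync_dskaddr with dskaddr.
								 */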
							} else
							{
								send_msg(VARLSTCNT(9) ERR_JNLFLUSH, 2, JNL_LEN_STR(csd),
									ERR_TEXT, 2,
									RTS_ERROR_TEXT("Error with journal flush in gds_rundown"),
									jnl_status);
								assert(NOJNL == jpc->channel);/* jnl file lost has been triggered */
								/* In this routine, all code that follows from here on does not
								 * assume anything about the journaling characteristics of this
								 * database so it is safe to continue execution even though
								 * journaling got closed in the middle.
								 */
							}
						}
						jnl_file_close(reg, we_are_last_writer, FALSE);
					} else
						send_msg(VARLSTCNT(6) jnl_status, 4, JNL_LEN_STR(csd), DB_LEN_STR(reg));
				}
				rel_crit(reg);
			}
		}
		if (we_are_last_writer)			/* Flush the fileheader last and harden the file to disk */
		{
			grab_crit(reg);			/* To satisfy crit requirement in fileheader_sync() */
			memset(csd->machine_name, 0, MAX_MCNAMELEN); /* clear the machine_name field */
			if (!mupip_jnl_recover && we_are_last_user)
			{	/* mupip_jnl_recover will do this after mur_close_file */
				csd->semid = INVALID_SEMID;
				csd->shmid = INVALID_SHMID;
				csd->gt_sem_ctime.ctime = 0;
				csd->gt_shm_ctime.ctime = 0;
			}
			fileheader_sync(reg);
			rel_crit(reg);
			if (FALSE == is_mm)
			{
				if (-1 == fsync(udi->fd))		/* Sync it all */
				{
					rts_error(VARLSTCNT(9) ERR_DBFILERR, 2, DB_LEN_STR(reg),
						  ERR_TEXT, 2, RTS_ERROR_TEXT("Error during file sync at close"), errno);
				}
			} else
			{	/* Now do final MM file sync before exit */
#if !defined(TARGETED_MSYNC) && !defined(NO_MSYNC)
				if (-1 == fsync(udi->fd))		/* Sync it all */
				{
					rts_error(VARLSTCNT(9) ERR_DBFILERR, 2, DB_LEN_STR(reg),
						  ERR_TEXT, 2, RTS_ERROR_TEXT("Error during file sync at close"), errno);
				}
#else
				if (-1 == msync((caddr_t)csa->db_addrs[0], (size_t)(csa->db_addrs[1] - csa->db_addrs[0]), MS_SYNC))
				{
					rts_error(VARLSTCNT(9) ERR_DBFILERR, 2, DB_LEN_STR(reg),
						  ERR_TEXT, 2, RTS_ERROR_TEXT("Error during file msync at close"), errno);
				}
#endif
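				/* When neither TARGETED_MSYNC nor NO_MSYNC is defined the file descriptor is fsync()'d;
				 * otherwise the mapped region itself is flushed with msync(MS_SYNC), which blocks until
				 * the dirty pages have been written.
				 */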
			}
		}
	} /* end if (!reg->read_only && !csa->nl->donotflush_dbjnl) */
	if (reg->read_only && we_are_last_user && !mupip_jnl_recover)
	{	/* mupip_jnl_recover will do this after mur_close_file */
		db_ipcs.semid = INVALID_SEMID;
		db_ipcs.shmid = INVALID_SHMID;
		db_ipcs.gt_sem_ctime = 0;
		db_ipcs.gt_shm_ctime = 0;
		db_ipcs.fn_len = reg->dyn.addr->fname_len;
		memcpy(db_ipcs.fn, reg->dyn.addr->fname, reg->dyn.addr->fname_len);
		db_ipcs.fn[reg->dyn.addr->fname_len] = 0;
		/* Request gtmsecshr to flush; a read-only process cannot flush the file header itself */
		if (0 != send_mesg2gtmsecshr(FLUSH_DB_IPCS_INFO, 0, (char *)NULL, 0))
			rts_error(VARLSTCNT(8) ERR_DBFILERR, 2, DB_LEN_STR(reg),
				  ERR_TEXT, 2, RTS_ERROR_TEXT("gtmsecshr failed to update database file header"));
	}
	/* Done with file now, close it */
	if (-1 == close(udi->fd))
	{
		rts_error(VARLSTCNT(9) ERR_DBFILERR, 2, DB_LEN_STR(reg),
			  ERR_TEXT, 2, LEN_AND_LIT("Error during file close"), errno);
	}
	/* Unmap storage if mm mode but only the part that is not the fileheader (so shows up in dumps) */
	if (is_mm)
	{
		munmap_len = (sm_long_t)((csa->db_addrs[1] - csa->db_addrs[0]) - ROUND_UP(SIZEOF_FILE_HDR(csa->hdr),
											 MSYNC_ADDR_INCS));
		if (munmap_len > 0)
		{
			munmap((caddr_t)(csa->db_addrs[0] + ROUND_UP(SIZEOF_FILE_HDR(csa->hdr), MSYNC_ADDR_INCS)),
			       (size_t)(munmap_len));
#ifdef DEBUG_DB64
			rel_mmseg((caddr_t)csa->db_addrs[0]);
#endif
		}
	}
	/* Detach our shared memory while still under lock so reference counts will be
	 * correct for the next process to run down this region.
	 * In the process also get the remove_shm status from node_local before detaching.
	 * If csa->nl->donotflush_dbjnl is TRUE, it means we can safely remove shared memory without compromising data
	 * 	integrity as a reissue of recover will restore the database to a consistent state.
	 */
	remove_shm = !vermismatch && (csa->nl->remove_shm || csa->nl->donotflush_dbjnl);
	status = shmdt((caddr_t)csa->nl);
	csa->nl = NULL; /* dereferencing nl after detach is not valid, so set it to NULL so that we can test it before dereferencing */
	if (-1 == status)
		send_msg(VARLSTCNT(9) ERR_DBFILERR, 2, DB_LEN_STR(reg), ERR_TEXT, 2, LEN_AND_LIT("Error during shmdt"), errno);
	reg->open = FALSE;

	/* If file is still not in good shape, die here and now before we get rid of our storage */
	if (csa->wbuf_dqd)
		GTMASSERT;
	ipc_deleted = FALSE;
	/* If we are the very last user, remove shared storage id and the semaphores */
	if (we_are_last_user)
	{	/* remove shared storage, only if last writer to rundown did a successful wcs_flu() */
		assert(!vermismatch);
		if (remove_shm)
		{
			ipc_deleted = TRUE;
			if (0 != shm_rmid(udi->shmid))
				rts_error(VARLSTCNT(8) ERR_DBFILERR, 2, DB_LEN_STR(reg),
					ERR_TEXT, 2, RTS_ERROR_TEXT("Unable to remove shared memory"));
		} else if (is_src_server || is_updproc)
		{
			gtm_putmsg(VARLSTCNT(6) ERR_DBRNDWNWRN, 4, DB_LEN_STR(reg), process_id, process_id);
			send_msg(VARLSTCNT(6) ERR_DBRNDWNWRN, 4, DB_LEN_STR(reg), process_id, process_id);
		} else
			send_msg(VARLSTCNT(6) ERR_DBRNDWNWRN, 4, DB_LEN_STR(reg), process_id, process_id);
		/*
		 * Don't release semaphore in case of mupip recover/rollback; since it has standalone access.
		 * It will release the semaphore in mur_close_files.
		 */
		if (!mupip_jnl_recover)
		{
			if (0 != sem_rmid(udi->semid))
				rts_error(VARLSTCNT(8) ERR_DBFILERR, 2, DB_LEN_STR(reg),
					ERR_TEXT, 2, RTS_ERROR_TEXT("Unable to remove semaphore"));
			grabbed_access_sem = FALSE;
		}
	} else
	{
		assert(!mupip_jnl_recover);
		/* If we were writing, get rid of our writer access count semaphore */
		if (!reg->read_only)
			if (0 != (save_errno = do_semop(udi->semid, 1, -1, SEM_UNDO)))
				rts_error(VARLSTCNT(9) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg),
					ERR_TEXT, 2, RTS_ERROR_TEXT("gds_rundown write semaphore release"), save_errno);
		/* Now remove the rundown lock */
		if (0 != (save_errno = do_semop(udi->semid, 0, -1, SEM_UNDO)))
			rts_error(VARLSTCNT(9) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg),
				ERR_TEXT, 2, RTS_ERROR_TEXT("gds_rundown rundown semaphore release"), save_errno);
		grabbed_access_sem = FALSE;
	}
	if (!ftok_sem_release(reg, !mupip_jnl_recover, FALSE))
		rts_error(VARLSTCNT(4) ERR_DBFILERR, 2, DB_LEN_STR(reg));
	if (!ipc_deleted)
	{
		GET_CUR_TIME;
		if (is_src_server)
			gtm_putmsg(VARLSTCNT(8) ERR_IPCNOTDEL, 6, CTIME_BEFORE_NL, time_ptr,
				LEN_AND_LIT("Source server"), REG_LEN_STR(reg));
		if (is_updproc)
			gtm_putmsg(VARLSTCNT(8) ERR_IPCNOTDEL, 6, CTIME_BEFORE_NL, time_ptr,
				LEN_AND_LIT("Update process"), REG_LEN_STR(reg));
		if (mupip_jnl_recover)
		{
			gtm_putmsg(VARLSTCNT(8) ERR_IPCNOTDEL, 6, CTIME_BEFORE_NL, time_ptr,
				LEN_AND_LIT("Mupip journal process"), REG_LEN_STR(reg));
			send_msg(VARLSTCNT(8) ERR_IPCNOTDEL, 6, CTIME_BEFORE_NL, time_ptr,
				LEN_AND_LIT("Mupip journal process"), REG_LEN_STR(reg));
		}
	}
	REVERT;
}
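
/* Illustrative sketch only (not part of the original sources): the "last user removes the IPCs" decision made
 * above.  shm_nattch from IPC_STAT counts the processes currently attached, so a value of 1 while we are still
 * attached means nobody else is using the database shared memory and it is safe to remove both the shared memory
 * segment and the access control semaphore set (via the same shm_rmid/sem_rmid wrappers used above).
 */
static boolean_t sketch_remove_ipcs_if_last_user(int shmid, int semid)
{
	struct shmid_ds	shm_buf;

	if (-1 == shmctl(shmid, IPC_STAT, &shm_buf))
		return FALSE;		/* cannot stat the segment: leave the IPCs alone */
	if (1 != shm_buf.shm_nattch)
		return FALSE;		/* other users are still attached */
	if (0 != shm_rmid(shmid))	/* marked for removal; destroyed once the last attachment detaches */
		return FALSE;
	if (0 != sem_rmid(semid))
		return FALSE;
	return TRUE;
}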