Example #1
void	t_busy2free(srch_blk_status *blkhist)
{
	cw_set_element		*cse;
	sgmnt_addrs		*csa;
	blk_hdr_ptr_t		old_block;
	unsigned int		bsiz;
	jnl_buffer_ptr_t	jbbp;		/* jbbp is non-NULL only if before-image journaling */

	assert(!dollar_tlevel);
	assert(cw_set_depth < CDB_CW_SET_SIZE);
	cse = &cw_set[cw_set_depth];
	cse->mode = gds_t_busy2free;
	cse->blk = blkhist->blk_num;
	cse->old_block = blkhist->buffaddr;
	old_block = (blk_hdr_ptr_t)cse->old_block;
	cse->was_free = FALSE; /* t_busy2free operates on BUSY blocks and hence cse->was_free is set to FALSE unconditionally */
	csa = cs_addrs;
	assert(NULL != old_block);
	jbbp = (JNL_ENABLED(csa) && csa->jnl_before_image) ? csa->jnl->jnl_buff : NULL;
	if ((NULL != jbbp) && (old_block->tn < jbbp->epoch_tn))
	{	/* Pre-compute CHECKSUM. Since we don't necessarily hold crit at this point, ensure we never try to
		 * access the buffer beyond the db blk_size.
		 */
		bsiz = MIN(old_block->bsiz, csa->hdr->blk_size);
		cse->blk_checksum = jnl_get_checksum((uint4*)old_block, csa, bsiz);
	}
	cse->upd_addr = NULL;
	cse->jnl_freeaddr = 0;		/* reset jnl_freeaddr that previous transaction might have filled in */
	cse->done = FALSE;
	blkhist->cse = cse;	/* indicate to t_end/tp_tend that this block is part of the write-set */
	cw_set_depth++;
}
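
The defensive move worth noting in this example is the checksum pre-computation: because crit is not necessarily held, the block's self-reported size (old_block->bsiz) cannot be trusted and is clamped to the database block size before the buffer is read. A minimal standalone sketch of that clamp-then-checksum pattern (block_hdr_t, db_blk_size and compute_checksum are illustrative stand-ins, not GT.M's actual types or routines):

/* Clamp the self-reported block size before checksumming, as t_busy2free does above. */
typedef struct {
	unsigned int	bsiz;	/* self-reported size; may be stale or garbage without crit */
} block_hdr_t;

extern unsigned int compute_checksum(const unsigned char *buf, unsigned int len);	/* hypothetical */

unsigned int checksum_before_image(const block_hdr_t *blk, unsigned int db_blk_size)
{
	/* Without crit, blk->bsiz can exceed the real buffer; never read past db_blk_size. */
	unsigned int len = (blk->bsiz < db_blk_size) ? blk->bsiz : db_blk_size;
	return compute_checksum((const unsigned char *)blk, len);
}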
Example #2
/* make sure that the journal file is available if appropriate */
uint4   jnl_ensure_open(gd_region *reg, sgmnt_addrs *csa)
{
	uint4			jnl_status;
	jnl_private_control	*jpc;
	sgmnt_data_ptr_t	csd;
	boolean_t		first_open_of_jnl, need_to_open_jnl;
	int			close_res;

	csd = csa->hdr;
	assert(csa->now_crit);
	jpc = csa->jnl;
	assert(NULL != jpc);
	assert(&FILE_INFO(jpc->region)->s_addrs == csa);
	assert(&FILE_INFO(reg)->s_addrs == csa);
	assert(JNL_ENABLED(csa->hdr));
	/* The goal is to change the code below to do only one JNL_FILE_SWITCHED(jpc) check instead of the additional
	 * (NOJNL == jpc->channel) check done below. The assert below ensures that the NOJNL check can indeed
	 * be subsumed by the JNL_FILE_SWITCHED check (with the exception of the source-server which has a special case that
	 * needs to be fixed in C9D02-002241). Over time, this has to be changed to one check.
	 */
	assert((NOJNL != jpc->channel) || JNL_FILE_SWITCHED(jpc) || is_src_server);
	need_to_open_jnl = FALSE;
	jnl_status = 0;
	if (NOJNL == jpc->channel)
		need_to_open_jnl = TRUE;
	else if (JNL_FILE_SWITCHED(jpc))
	{       /* The journal file has been changed "on the fly"; close the old one and open the new one */
		JNL_FD_CLOSE(jpc->channel, close_res);  /* sets jpc->channel to NOJNL */
		need_to_open_jnl = TRUE;
	}
	if (need_to_open_jnl)
	{
		/* Whenever the journal file gets switched, reset the pini_addr and new_freeaddr. */
		jpc->pini_addr = 0;
		jpc->new_freeaddr = 0;
		if (IS_GTCM_GNP_SERVER_IMAGE)
			gtcm_jnl_switched(reg); /* Reset pini_addr of all clients that had any older journal file open */
		first_open_of_jnl = (0 == csa->nl->jnl_file.u.inode);
		jnl_status = jnl_file_open(reg, first_open_of_jnl);
	}
#	ifdef DEBUG
	else
		GTM_WHITE_BOX_TEST(WBTEST_JNL_FILE_OPEN_FAIL, jnl_status, ERR_JNLFILOPN);
#	endif
	assert((0 != jnl_status) || !JNL_FILE_SWITCHED(jpc) || (is_src_server && !JNL_ENABLED(csa) && REPL_WAS_ENABLED(csa)));
	return jnl_status;
}
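
Callers are expected to do some setup before invoking jnl_ensure_open, since it can itself write journal records when it switches files. The calling pattern, as it appears in Examples 11 and 13 below (error reporting is schematic; the argument lists vary by caller):

	if (JNL_ENABLED(csa))
	{
		if (!jgbl.dont_reset_gbl_jrec_time)
			SET_GBL_JREC_TIME;		/* needed before jnl_ensure_open as that can write jnl records */
		jpc = csa->jnl;
		jbp = jpc->jnl_buff;
		ADJUST_GBL_JREC_TIME(jgbl, jbp);	/* keep jnl records in time order */
		jnl_status = jnl_ensure_open(gv_cur_region, csa);
		if (0 != jnl_status)
		{	/* report jnl_status with JNL_LEN_STR()/DB_LEN_STR() details and bail out */
		}
	}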
Example #3
void	ccp_exitwm3(ccp_db_header *db)
{
	sgmnt_addrs		*csa;
	bt_rec			*que_base, *que_top, *p;
	ccp_action_record	request;
	uint4		status;


	assert(lib$ast_in_prog());

	csa = db->segment;
	assert(csa->nl->ccp_state == CCST_WMXREQ);
	assert(csa->ti->curr_tn == csa->ti->early_tn);

	db->wm_iosb.valblk[CCP_VALBLK_TRANS_HIST] = csa->ti->curr_tn + csa->ti->lock_sequence;
	if (JNL_ENABLED(csa->hdr)  &&  csa->jnl != NULL)
	{
		assert(csa->jnl->channel != 0);
		db->wm_iosb.valblk[CCP_VALBLK_JNL_ADDR] = csa->jnl->jnl_buff->freeaddr;
		db->wm_iosb.valblk[CCP_VALBLK_EPOCH_TN] = csa->jnl->jnl_buff->epoch_tn;
		/* lastaddr is no longer a field in jnl_buff
		 *	db->wm_iosb.valblk[CCP_VALBLK_LST_ADDR] = csa->jnl->jnl_buff->lastaddr;
		 */
	}

	/* Convert Write-mode lock from Protected Write to Concurrent Read, writing the lock value block */
	status = ccp_enqw(EFN$C_ENF, LCK$K_CRMODE, &db->wm_iosb, LCK$M_CONVERT | LCK$M_VALBLK, NULL, 0,
			  NULL, 0, NULL, PSL$C_USER, 0);
	/***** Check error status here? *****/

	for (que_base = csa->bt_header, que_top = que_base + csa->hdr->bt_buckets;
	     que_base < que_top;
	     ++que_base)
	{
		assert(que_base->blk == BT_QUEHEAD);

		for (p = (bt_rec *)((char *)que_base + que_base->blkque.fl);
		     p != que_base;
		     p = (bt_rec *)((char *)p + p->blkque.fl))
		{
			if (((int4)p & 3) != 0)
				ccp_signal_cont(ERR_GTMCHECK);	/***** Is this reasonable? *****/
			p->flushing = FALSE;
		}
	}

	db->blocking_ast_received = FALSE;
	db->wmexit_requested = FALSE;
	csa->nl->ccp_state = CCST_WMXGNT;
	db->wc_rover = 0;

	request.action = CCTR_EWMWTBF;
	request.pid = 0;
	request.v.h = db;
	ccp_act_request(&request);

	return;
}
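
The inner loop above walks each block-table hash bucket as a relative queue: blkque.fl holds a byte offset from the element itself rather than an absolute pointer, so the structure stays valid even though each process may map the shared segment at a different address. A minimal sketch of traversing such a queue (que_ent_t and its fields are illustrative, not GT.M's actual definitions):

typedef struct {
	int	fl;	/* byte offset from this element to the next */
	int	bl;	/* byte offset from this element to the previous */
} que_ent_t;

#define QUE_NEXT(q)	((que_ent_t *)((char *)(q) + (q)->fl))

void visit_all(que_ent_t *head)
{
	que_ent_t	*p;

	for (p = QUE_NEXT(head); p != head; p = QUE_NEXT(p))
	{	/* process *p here; ccp_exitwm3 clears p->flushing at this point */
	}
}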
Example #4
/* jpc 	   : Journal private control
 * rectype : Record type
 * jnl_rec : This contains the fixed part of a variable-size record, or a complete fixed-size record.
 * blk_ptr : For JRT_PBLK and JRT_AIMG this has the block image
 * jfb     : For SET/KILL/ZKILL/ZTWORM records, the entire record is formatted in this.
 * 	     For JRT_PBLK and JRT_AIMG it contains partial records
 */
void	jnl_write(jnl_private_control *jpc, enum jnl_record_type rectype, jnl_record *jnl_rec, blk_hdr_ptr_t blk_ptr,
	jnl_format_buffer *jfb)
{
	int4			align_rec_len, rlen, rlen_with_align, dstlen, lcl_size, lcl_free, lcl_orig_free;
	jnl_buffer_ptr_t	jb;
	sgmnt_addrs		*csa;
	sgmnt_data_ptr_t	csd;
	node_local_ptr_t	cnl;
	struct_jrec_align	align_rec;
	uint4 			status;
	jrec_suffix		suffix;
	boolean_t		nowrap, is_replicated;
	struct_jrec_blk		*jrec_blk;
	uint4			checksum, jnlpool_size, lcl_freeaddr;
	sm_uc_ptr_t		lcl_buff;
	gd_region		*reg;
	char			*ptr;
	int			jnl_wrt_start_modulus, jnl_wrt_start_mask;
	uint4			jnl_fs_block_size, aligned_lcl_free, padding_size;
	uint4			tmp_csum1, tmp_csum2;
#	ifdef DEBUG
	uint4			lcl_dskaddr, mumps_node_sz;
	char			*mumps_node_ptr;
#	endif

	assert(jnl_write_recursion_depth++ < MAX_JNL_WRITE_RECURSION_DEPTH);
	reg = jpc->region;
	csa = &FILE_INFO(reg)->s_addrs;
	csd = csa->hdr;
	is_replicated = jrt_is_replicated[rectype];
	/* Ensure that no replicated journal record is written by this routine if REPL_WAS_ENABLED(csa) is TRUE */
	assert((JNL_ENABLED(csa) && !REPL_WAS_ENABLED(csa)) || !is_replicated);
	/* Assert that the only journal records that the source server ever writes are PINI/PFIN/EPOCH/EOF
	 * which it does at the very end when the database is about to be shut down
	 */
	assert(!is_src_server || (JRT_EOF == rectype) || (JRT_PINI == rectype) || (JRT_EPOCH == rectype) || (JRT_PFIN == rectype));
	assert(csa->now_crit  ||  (csd->clustered  &&  csa->nl->ccp_state == CCST_CLOSED));
	assert(rectype > JRT_BAD  &&  rectype < JRT_RECTYPES && JRT_ALIGN != rectype);
	jb = jpc->jnl_buff;
	/* Before taking a copy of jb->freeaddr, determine if both free and freeaddr are in sync. If not fix that first. */
	if (jb->free_update_pid)
	{
		FIX_NONZERO_FREE_UPDATE_PID(csa, jb);
	}
	lcl_freeaddr = jb->freeaddr;
	lcl_free = jb->free;
	lcl_size = jb->size;
	lcl_buff = &jb->buff[jb->buff_off];
	DBG_CHECK_JNL_BUFF_FREEADDR(jb);
	++jb->reccnt[rectype];
	assert(NULL != jnl_rec);
	rlen = jnl_rec->prefix.forwptr;
	/* Do high-level check on rlen */
	assert(rlen <= jb->max_jrec_len);
	/* Do fine-grained checks on rlen */
	GTMTRIG_ONLY(assert(!IS_ZTWORM(rectype) || (MAX_ZTWORM_JREC_LEN >= rlen));)	/* ZTWORMHOLE */
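
The snippet is cut off here, but the prologue comment and the rlen checks already imply the record framing: every journal record begins with a fixed prefix whose forwptr field carries the total record length and ends with a suffix, so a scanner can walk records forwards (and backwards) without knowing their types. A purely schematic layout, with illustrative field names and no claim to match GT.M's real struct definitions:

/* Schematic only: [prefix][type-specific fixed part][variable part][suffix] */
typedef struct {
	unsigned int	forwptr;	/* total record length, prefix through suffix */
	/* ... record type, transaction number, timestamp ... */
} jrec_prefix_sketch;

typedef struct {
	unsigned int	backptr;	/* mirrors forwptr so records can be walked backwards */
	/* ... suffix code ... */
} jrec_suffix_sketch;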
Example #5
/* make sure that the journal file is available if appropriate */
uint4   jnl_ensure_open(void)
{
	uint4			jnl_status;
	jnl_private_control	*jpc;
	sgmnt_addrs		*csa;
	sgmnt_data_ptr_t	csd;
	boolean_t		first_open_of_jnl, need_to_open_jnl;
	int			close_res;
#       if defined(VMS)
	static const gds_file_id	file;
	uint4				status;
#       endif

	error_def(ERR_JNLFILOPN);

	csa = cs_addrs;
	csd = csa->hdr;
	assert(csa->now_crit);
	jpc = csa->jnl;
	assert(NULL != jpc);
	assert(JNL_ENABLED(csa->hdr));
	/* The goal is to change the code below to do only one JNL_FILE_SWITCHED(jpc) check instead of the additional
	 * (NOJNL == jpc->channel) check done below. The assert below ensures that the NOJNL check can indeed
	 * be subsumed by the JNL_FILE_SWITCHED check (with the exception of the source-server which has a special case that
	 * needs to be fixed in C9D02-002241). Over time, this has to be changed to one check.
	 */
	assert((NOJNL != jpc->channel) || JNL_FILE_SWITCHED(jpc) || is_src_server);
	need_to_open_jnl = FALSE;
	jnl_status = 0;
	if (NOJNL == jpc->channel)
	{
#               ifdef VMS
		if (NOJNL != jpc->old_channel)
		{
			if (lib$ast_in_prog())          /* called from wcs_wipchk_ast */
				jnl_oper_user_ast(gv_cur_region);
			else
			{
				status = sys$setast(DISABLE);
				jnl_oper_user_ast(gv_cur_region);
				if (SS$_WASSET == status)
					ENABLE_AST;
			}
		}
#               endif
		need_to_open_jnl = TRUE;
	} else if (JNL_FILE_SWITCHED(jpc))
	{       /* The journal file has been changed "on the fly"; close the old one and open the new one */
		VMS_ONLY(assert(FALSE);)        /* everyone having older jnl open should have closed it at time of switch in VMS */
		JNL_FD_CLOSE(jpc->channel, close_res);  /* sets jpc->channel to NOJNL */
		need_to_open_jnl = TRUE;
	}
Example #6
void jnl_wait(gd_region *reg)
{
	jnl_private_control	*jpc;
	sgmnt_addrs		*csa;
	uint4			status;

	if ((NULL != reg) && (TRUE == reg->open))
	{
		csa = &FILE_INFO(reg)->s_addrs;
		jpc = csa->jnl;
		if ((TRUE == JNL_ENABLED(csa->hdr)) && (NULL != jpc))
		{	/* wait on jnl writes for region */
			status = jnl_write_attempt(jpc, jpc->new_freeaddr);
			if (SS_NORMAL == status && !JNL_FILE_SWITCHED(jpc))
				jnl_fsync(reg, jpc->new_freeaddr);
		}
	}
	return;
}
Example #7
void jnl_flush(gd_region *reg)
{
	sgmnt_addrs		*csa;
	jnl_private_control	*jpc;
	jnl_buffer_ptr_t	jb;

	if (!reg || !reg->open)
		return;
	csa = &FILE_INFO(reg)->s_addrs;
	assert(csa->now_crit);
	jpc = csa->jnl;
	if (!JNL_ENABLED(csa->hdr) || (NULL == jpc) || (NOJNL == jpc->channel))
		return;
	jb = jpc->jnl_buff;
	jb->blocked = process_id;
	if (jb->freeaddr != jb->dskaddr && SS_NORMAL != jnl_write_attempt(jpc, jb->freeaddr))
		return;
	assert(jb->dskaddr == jb->freeaddr);
	jb->blocked = 0;
	return;
}
Example #8
int jnl_file_extend(jnl_private_control *jpc, uint4 total_jnl_rec_size)
{
	file_control		*fc;
	boolean_t		need_extend;
	jnl_buffer_ptr_t     	jb;
	jnl_create_info 	jnl_info;
	jnl_file_header		header;
	uint4			new_alq;
	sgmnt_addrs		*csa;
	sgmnt_data_ptr_t	csd;
	char			prev_jnl_fn[JNL_NAME_SIZE];
	uint4			jnl_status = 0, status;
	int			new_blocks, result;
	GTM_BAVAIL_TYPE		avail_blocks;
	uint4			aligned_tot_jrec_size, count;

	switch(jpc->region->dyn.addr->acc_meth)
	{
	case dba_mm:
	case dba_bg:
		csa = &FILE_INFO(jpc->region)->s_addrs;
		break;
	default:
		GTMASSERT;
	}
	csd = csa->hdr;
	assert(csa == cs_addrs && csd == cs_data);
	assert(csa->now_crit || (csd->clustered && (CCST_CLOSED == csa->nl->ccp_state)));
	assert(jpc->region == gv_cur_region);
	assert(csa->jnl_state == csd->jnl_state);
	if (!JNL_ENABLED(csa) || (NOJNL == jpc->channel) || (JNL_FILE_SWITCHED(jpc)))
		GTMASSERT;	/* crit and messing with the journal file - how could it have vanished? */
	if (!csd->jnl_deq)
	{
		assert(DIVIDE_ROUND_UP(total_jnl_rec_size, DISK_BLOCK_SIZE) <= csd->jnl_alq);
		assert(csd->jnl_alq == csd->autoswitchlimit);
		new_blocks = csd->jnl_alq;
	} else
		/* May cause extension of  csd->jnl_deq * n blocks where n > 0 */
		new_blocks = ROUND_UP(DIVIDE_ROUND_UP(total_jnl_rec_size, DISK_BLOCK_SIZE), csd->jnl_deq);
	jpc->status = SS_NORMAL;
	jb = jpc->jnl_buff;
	assert(0 <= new_blocks);
	DEBUG_ONLY(count = 0);
	for (need_extend = (0 != new_blocks); need_extend; )
	{
		DEBUG_ONLY(count++);
		/* Usually we will do the loop just once, where we do the file extension.
		 * Rarely we might need to do an autoswitch instead, after which, again rarely,
		 * we might need to do an extension on the new journal to fit the transaction's journal requirements.
		 * Therefore we should do this loop a maximum of twice; hence the assert below.
		 */
		assert(count <= 2);
		need_extend = FALSE;
		if (SS_NORMAL == (status = disk_block_available(jpc->channel, &avail_blocks, TRUE)))
		{
			if ((new_blocks * EXTEND_WARNING_FACTOR) > avail_blocks)
			{
				if (new_blocks > avail_blocks)
				{	/* if we cannot satisfy the request, it is an error */
					send_msg(VARLSTCNT(6) ERR_NOSPACEEXT, 4, JNL_LEN_STR(csd),
						new_blocks, avail_blocks);
					new_blocks = 0;
					jpc->status = SS_NORMAL;
					break;
				} else
					send_msg(VARLSTCNT(5) ERR_DSKSPACEFLOW, 3, JNL_LEN_STR(csd),
						(avail_blocks - new_blocks));
			}
		} else
			send_msg(VARLSTCNT(5) ERR_JNLFILEXTERR, 2, JNL_LEN_STR(csd), status);
		new_alq = jb->filesize + new_blocks;
		/* ensure current journal file size is well within autoswitchlimit --> design constraint */
		assert(csd->autoswitchlimit >= jb->filesize);
		if (csd->autoswitchlimit < (jb->filesize + (EXTEND_WARNING_FACTOR * new_blocks)))	/* close to max */
			send_msg(VARLSTCNT(5) ERR_JNLSPACELOW, 3, JNL_LEN_STR(csd), csd->autoswitchlimit - jb->filesize);
		if (csd->autoswitchlimit < new_alq)
		{	/* Reached max, need to autoswitch */
			/* Ensure new journal file can hold the entire current transaction's journal record requirements */
			assert(csd->autoswitchlimit >= MAX_REQD_JNL_FILE_SIZE(total_jnl_rec_size));
			memset(&jnl_info, 0, sizeof(jnl_info));
			jnl_info.prev_jnl = &prev_jnl_fn[0];
			set_jnl_info(gv_cur_region, &jnl_info);
			assert(JNL_ENABLED(csa) && (NOJNL != jpc->channel) && !(JNL_FILE_SWITCHED(jpc)));
			jnl_status = jnl_ensure_open();
			if (0 == jnl_status)
			{	/* flush the cache and jnl-buffer-contents to current journal file before
				 * switching to a new journal. */
				wcs_flu(WCSFLU_FLUSH_HDR | WCSFLU_WRITE_EPOCH);
				jnl_file_close(gv_cur_region, TRUE, TRUE);
			} else
				rts_error(VARLSTCNT(7) jnl_status, 4, JNL_LEN_STR(csd), DB_LEN_STR(gv_cur_region), jpc->status);
			assert(!jgbl.forw_phase_recovery || (NULL != jgbl.mur_pini_addr_reset_fnptr));
			if (jgbl.forw_phase_recovery && (NULL != jgbl.mur_pini_addr_reset_fnptr))
				(*jgbl.mur_pini_addr_reset_fnptr)();
			assert(!jnl_info.no_rename);
			assert(!jnl_info.no_prev_link);
			if (EXIT_NRM == cre_jnl_file(&jnl_info))
			{
				assert(0 == memcmp(csd->jnl_file_name, jnl_info.jnl, jnl_info.jnl_len));
				assert(csd->jnl_file_name[jnl_info.jnl_len] == '\0');
				assert(csd->jnl_file_len == jnl_info.jnl_len);
				assert(csd->jnl_buffer_size == jnl_info.buffer);
				assert(csd->jnl_alq == jnl_info.alloc);
				assert(csd->jnl_deq == jnl_info.extend);
				assert(csd->jnl_before_image == jnl_info.before_images);
				csd->trans_hist.header_open_tn = jnl_info.tn;	/* needed for successful jnl_file_open() */
				send_msg(VARLSTCNT(4) ERR_NEWJNLFILECREATE, 2, JNL_LEN_STR(csd));
				fc = gv_cur_region->dyn.addr->file_cntl;
				fc->op = FC_WRITE;
				fc->op_buff = (sm_uc_ptr_t)csd;
				status = dbfilop(fc);
				if (SS_NORMAL != status)
					send_msg(VARLSTCNT(5) ERR_DBFILERR, 2, DB_LEN_STR(gv_cur_region), status);
				assert(JNL_ENABLED(csa));
				/* call jnl_ensure_open instead of jnl_file_open to make sure jpc->pini_addr is set to 0 */
				jnl_status = jnl_ensure_open();	/* sets jpc->status */
				if (0 != jnl_status)
					rts_error(VARLSTCNT(6) jnl_status, 4, JNL_LEN_STR(csd), DB_LEN_STR(gv_cur_region));
				assert(jb->filesize == csd->jnl_alq);
				aligned_tot_jrec_size = ALIGNED_ROUND_UP(MAX_REQD_JNL_FILE_SIZE(total_jnl_rec_size),
										csd->jnl_alq, csd->jnl_deq);
				if (aligned_tot_jrec_size > csd->jnl_alq)
				{	/* need to extend more than initial allocation in the new journal file
					 * to accommodate the current transaction.
					 */
					new_blocks = aligned_tot_jrec_size - csd->jnl_alq;
					assert(new_blocks);
					assert(0 == new_blocks % csd->jnl_deq);
					need_extend = TRUE;
				}
			} else
			{
				send_msg(VARLSTCNT(4) ERR_JNLCREATERR, 2, JNL_LEN_STR(csd));
				jpc->status = ERR_JNLNOCREATE;
				new_blocks = -1;
			}
		} else
		{
			assert(!need_extend);	/* ensure we won't go through the for loop again */
			/* Virtually extend currently used journal file */
			jb->filesize = new_alq;	/* Actually this is virtual file size blocks */
			DO_FILE_READ(jpc->channel, 0, &header, JNL_HDR_LEN, jpc->status, jpc->status2);
			if (SS_NORMAL != jpc->status)
				rts_error(VARLSTCNT(5) ERR_JNLRDERR, 2, JNL_LEN_STR(csd), jpc->status);
			assert((header.virtual_size + new_blocks) == new_alq);
			header.virtual_size = new_alq;
			DO_FILE_WRITE(jpc->channel, 0, &header, JNL_HDR_LEN, jpc->status, jpc->status2);
			if (SS_NORMAL != jpc->status)
				rts_error(VARLSTCNT(5) ERR_JNLWRERR, 2, JNL_LEN_STR(csd), jpc->status);
		}
		if (0 >= new_blocks)
			break;
	}
	if (0 >= new_blocks)
	{
		jpc->status = ERR_JNLREADEOF;
		jnl_file_lost(jpc, ERR_JNLEXTEND);
		new_blocks = -1;
	}
	return new_blocks;
}
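
The size computation at the top of this function rounds the byte requirement up twice: first to whole disk blocks, then to a multiple of the extension quantum jnl_deq. A worked sketch, assuming DIVIDE_ROUND_UP and ROUND_UP are the usual ceiling operations and DISK_BLOCK_SIZE is 512 bytes:

/* Stand-in macros; GT.M's DIVIDE_ROUND_UP/ROUND_UP are assumed to behave like these. */
#define DIVIDE_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))
#define ROUND_UP(n, m)		(DIVIDE_ROUND_UP(n, m) * (m))

unsigned int extension_blocks(unsigned int total_jnl_rec_size, unsigned int jnl_deq)
{
	/* e.g. total_jnl_rec_size = 10000, jnl_deq = 16:
	 *   DIVIDE_ROUND_UP(10000, 512) = 20 disk blocks
	 *   ROUND_UP(20, 16)            = 32, i.e. two 16-block extension quanta
	 */
	return ROUND_UP(DIVIDE_ROUND_UP(total_jnl_rec_size, 512), jnl_deq);
}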
Example #9
uint4 jnl_file_extend(jnl_private_control *jpc, uint4 total_jnl_rec_size)
{
	file_control		*fc;
	boolean_t		need_extend;
	jnl_buffer_ptr_t     	jb;
	jnl_create_info 	jnl_info;
	jnl_file_header		*header;
	unsigned char		hdr_buff[REAL_JNL_HDR_LEN + MAX_IO_BLOCK_SIZE];
	uint4			new_alq;
	sgmnt_addrs		*csa;
	sgmnt_data_ptr_t	csd;
	char			prev_jnl_fn[JNL_NAME_SIZE];
	uint4			jnl_status = 0, status;
	int			new_blocks, warn_blocks, result;
	gtm_uint64_t		avail_blocks;
	uint4			aligned_tot_jrec_size, count;
	uint4			jnl_fs_block_size, read_write_size;
	DCL_THREADGBL_ACCESS;

	switch(jpc->region->dyn.addr->acc_meth)
	{
	case dba_mm:
	case dba_bg:
		csa = &FILE_INFO(jpc->region)->s_addrs;
		break;
	default:
		GTMASSERT;
	}
	csd = csa->hdr;
	assert(csa == cs_addrs && csd == cs_data);
	assert(csa->now_crit || (csd->clustered && (CCST_CLOSED == csa->nl->ccp_state)));
	assert(&FILE_INFO(jpc->region)->s_addrs == csa);
	assert(csa->jnl_state == csd->jnl_state);
	assertpro(JNL_ENABLED(csa) && (NOJNL != jpc->channel) && (!JNL_FILE_SWITCHED(jpc)));
		/* crit and messing with the journal file - how could it have vanished? */
	if (!csd->jnl_deq || (csd->jnl_alq + csd->jnl_deq > csd->autoswitchlimit))
	{
		assert(DIVIDE_ROUND_UP(total_jnl_rec_size, DISK_BLOCK_SIZE) <= csd->jnl_alq);
		assert(csd->jnl_alq == csd->autoswitchlimit);
		new_blocks = csd->jnl_alq;
	} else
		/* May cause extension of  csd->jnl_deq * n blocks where n > 0 */
		new_blocks = ROUND_UP(DIVIDE_ROUND_UP(total_jnl_rec_size, DISK_BLOCK_SIZE), csd->jnl_deq);
	jpc->status = SS_NORMAL;
	jb = jpc->jnl_buff;
	assert(0 <= new_blocks);
	DEBUG_ONLY(count = 0);
	for (need_extend = (0 != new_blocks); need_extend; )
	{
		DEBUG_ONLY(count++);
		/* Usually we will do the loop just once, where we do the file extension.
		 * Rarely we might need to do an autoswitch instead, after which, again rarely,
		 * we might need to do an extension on the new journal to fit the transaction's journal requirements.
		 * Therefore we should do this loop a maximum of twice; hence the assert below.
		 */
		assert(count <= 2);
		need_extend = FALSE;
		if (SS_NORMAL == (status = disk_block_available(jpc->channel, &avail_blocks, TRUE)))
		{
			warn_blocks = (csd->jnl_alq + csd->jnl_deq > csd->autoswitchlimit)
					? ((csd->jnl_deq > csd->autoswitchlimit) ? csd->jnl_deq : csd->autoswitchlimit)
					: new_blocks;

			if ((warn_blocks * EXTEND_WARNING_FACTOR) > avail_blocks)
			{
				if (new_blocks > avail_blocks)
				{	/* If we cannot satisfy the request, it is an error, unless the anticipatory freeze
					 * scheme is in effect in which case, we will assume space is available even if
					 * it is not and go ahead with writes to the disk. If the writes fail with ENOSPC
					 * we will freeze the instance and wait for space to become available and keep
					 * retrying the writes. Therefore, we make the NOSPACEEXT a warning in this case.
					 */
					SETUP_THREADGBL_ACCESS;
					if (!ANTICIPATORY_FREEZE_ENABLED(csa))
					{
						send_msg(VARLSTCNT(6) ERR_NOSPACEEXT, 4,
							JNL_LEN_STR(csd), new_blocks, avail_blocks);
						new_blocks = 0;
						jpc->status = SS_NORMAL;
						break;
					} else
						send_msg(VARLSTCNT(6) MAKE_MSG_WARNING(ERR_NOSPACEEXT), 4,
							JNL_LEN_STR(csd), new_blocks, avail_blocks);
				} else
					send_msg(VARLSTCNT(5) ERR_DSKSPACEFLOW, 3, JNL_LEN_STR(csd), (avail_blocks - warn_blocks));
			}
		} else
			send_msg(VARLSTCNT(5) ERR_JNLFILEXTERR, 2, JNL_LEN_STR(csd), status);
		new_alq = jb->filesize + new_blocks;
		/* ensure current journal file size is well within autoswitchlimit --> design constraint */
		assert(csd->autoswitchlimit >= jb->filesize);
		if (csd->autoswitchlimit < (jb->filesize + (EXTEND_WARNING_FACTOR * new_blocks)))	/* close to max */
			send_msg(VARLSTCNT(5) ERR_JNLSPACELOW, 3, JNL_LEN_STR(csd), csd->autoswitchlimit - jb->filesize);
		if (csd->autoswitchlimit < new_alq)
		{	/* Reached max, need to autoswitch */
			/* Ensure new journal file can hold the entire current transaction's journal record requirements */
			assert(csd->autoswitchlimit >= MAX_REQD_JNL_FILE_SIZE(total_jnl_rec_size));
			memset(&jnl_info, 0, SIZEOF(jnl_info));
			jnl_info.prev_jnl = &prev_jnl_fn[0];
			set_jnl_info(gv_cur_region, &jnl_info);
			assert(JNL_ENABLED(csa) && (NOJNL != jpc->channel) && !(JNL_FILE_SWITCHED(jpc)));
			jnl_status = jnl_ensure_open();
			if (0 == jnl_status)
			{	/* flush the cache and jnl-buffer-contents to current journal file before
				 * switching to a new journal. Set a global variable in_jnl_file_autoswitch
				 * so jnl_write can know not to do the padding check. But because this is a global
				 * variable, we also need to make sure it is reset in case of errors during the
				 * autoswitch (or else calls to jnl_write after we are out of the autoswitch logic
				 * will continue to incorrectly not do the padding check). Hence a condition handler.
				 */
				assert(!in_jnl_file_autoswitch);
				in_jnl_file_autoswitch = TRUE;
				/* Also make sure time is not changed. This way if "jnl_write" as part of writing a
				 * journal record invokes jnl_file_extend, when the autoswitch is done and writing
				 * of the parent jnl_write resumes, we want it to continue with the same timestamp
				 * and not have to reset its time (non-trivial task) to reflect any changes since then.
				 */
				assert(!jgbl.save_dont_reset_gbl_jrec_time);
				jgbl.save_dont_reset_gbl_jrec_time = jgbl.dont_reset_gbl_jrec_time;
				jgbl.dont_reset_gbl_jrec_time = TRUE;
				/* Establish a condition handler so we reset a few global variables that have
				 * temporarily been modified in case of errors inside wcs_flu/jnl_file_close.
				 */
				ESTABLISH_RET(jnl_file_autoswitch_ch, EXIT_ERR);
				/* It is possible we still have not written a PINI record in this journal file
				 * (e.g. mupip extend saw the need to do jnl_file_extend inside jnl_write while
				 * trying to write a PINI record). Write a PINI record in that case before closing
				 * the journal file that way the EOF record will have a non-zero pini_addr.
				 */
				if (0 == jpc->pini_addr)
					jnl_put_jrt_pini(csa);
				wcs_flu(WCSFLU_FLUSH_HDR | WCSFLU_WRITE_EPOCH | WCSFLU_SPEEDUP_NOBEFORE);
				jnl_file_close(gv_cur_region, TRUE, TRUE);
				REVERT;
				in_jnl_file_autoswitch = FALSE;
				jgbl.dont_reset_gbl_jrec_time = jgbl.save_dont_reset_gbl_jrec_time;
				DEBUG_ONLY(jgbl.save_dont_reset_gbl_jrec_time = FALSE);
				assert((dba_mm == cs_data->acc_meth) || (csd == cs_data));
				csd = cs_data;	/* In MM, wcs_flu() can remap an extended DB, so reset csd to be sure */
			} else
			{
				if (SS_NORMAL != jpc->status)
					rts_error(VARLSTCNT(7) jnl_status, 4, JNL_LEN_STR(csd), DB_LEN_STR(gv_cur_region),
						jpc->status);
				else
					rts_error(VARLSTCNT(6) jnl_status, 4, JNL_LEN_STR(csd), DB_LEN_STR(gv_cur_region));
			}
			assert(!jgbl.forw_phase_recovery || (NULL != jgbl.mur_pini_addr_reset_fnptr));
			assert(jgbl.forw_phase_recovery || (NULL == jgbl.mur_pini_addr_reset_fnptr));
			if (NULL != jgbl.mur_pini_addr_reset_fnptr)
				(*jgbl.mur_pini_addr_reset_fnptr)(csa);
			assert(!jnl_info.no_rename);
			assert(!jnl_info.no_prev_link);
			if (EXIT_NRM == cre_jnl_file(&jnl_info))
			{
				assert(0 == memcmp(csd->jnl_file_name, jnl_info.jnl, jnl_info.jnl_len));
				assert(csd->jnl_file_name[jnl_info.jnl_len] == '\0');
				assert(csd->jnl_file_len == jnl_info.jnl_len);
				assert(csd->jnl_buffer_size == jnl_info.buffer);
				assert(csd->jnl_alq == jnl_info.alloc);
				assert(csd->jnl_deq == jnl_info.extend);
				assert(csd->jnl_before_image == jnl_info.before_images);
				csd->jnl_checksum = jnl_info.checksum;
				csd->jnl_eovtn = csd->trans_hist.curr_tn;
				send_msg(VARLSTCNT(4) ERR_NEWJNLFILECREAT, 2, JNL_LEN_STR(csd));
				fc = gv_cur_region->dyn.addr->file_cntl;
				fc->op = FC_WRITE;
				fc->op_buff = (sm_uc_ptr_t)csd;
				fc->op_len = SGMNT_HDR_LEN;
				fc->op_pos = 1;
				status = dbfilop(fc);
				if (SS_NORMAL != status)
					send_msg(VARLSTCNT(5) ERR_DBFILERR, 2, DB_LEN_STR(gv_cur_region), status);
				assert(JNL_ENABLED(csa));
				/* call jnl_ensure_open instead of jnl_file_open to make sure jpc->pini_addr is set to 0 */
				jnl_status = jnl_ensure_open();	/* sets jpc->status */
				if (0 != jnl_status)
				{
					if (jpc->status)
						rts_error(VARLSTCNT(7) jnl_status, 4, JNL_LEN_STR(csd), DB_LEN_STR(gv_cur_region),
							jpc->status);
					else
						rts_error(VARLSTCNT(6) jnl_status, 4, JNL_LEN_STR(csd), DB_LEN_STR(gv_cur_region));
				}
				assert(jb->filesize == csd->jnl_alq);
				if (csd->jnl_alq + csd->jnl_deq <= csd->autoswitchlimit)
				{
					aligned_tot_jrec_size = ALIGNED_ROUND_UP(MAX_REQD_JNL_FILE_SIZE(total_jnl_rec_size),
											csd->jnl_alq, csd->jnl_deq);
					if (aligned_tot_jrec_size > csd->jnl_alq)
					{	/* need to extend more than initial allocation in the new journal file
						 * to accommodate the current transaction.
						 */
						new_blocks = aligned_tot_jrec_size - csd->jnl_alq;
						assert(new_blocks);
						assert(0 == new_blocks % csd->jnl_deq);
						need_extend = TRUE;
					}
				}
			} else
			{
				send_msg(VARLSTCNT(4) ERR_JNLNOCREATE, 2, JNL_LEN_STR(csd));
				jpc->status = ERR_JNLNOCREATE;
				new_blocks = -1;
			}
		} else
		{
			assert(!need_extend);	/* ensure we won't go through the for loop again */
			/* Virtually extend currently used journal file */
			jnl_fs_block_size = jb->fs_block_size;
			header = (jnl_file_header *)(ROUND_UP2((uintszofptr_t)hdr_buff, jnl_fs_block_size));
			read_write_size = ROUND_UP2(REAL_JNL_HDR_LEN, jnl_fs_block_size);
			assert((unsigned char *)header + read_write_size <= ARRAYTOP(hdr_buff));
			DO_FILE_READ(jpc->channel, 0, header, read_write_size, jpc->status, jpc->status2);
			if (SS_NORMAL != jpc->status)
			{
				assert(FALSE);
				rts_error(VARLSTCNT(5) ERR_JNLRDERR, 2, JNL_LEN_STR(csd), jpc->status);
			}
			assert((header->virtual_size + new_blocks) == new_alq);
			jb->filesize = new_alq;	/* Actually this is virtual file size blocks */
			header->virtual_size = new_alq;
			JNL_DO_FILE_WRITE(csa, csd->jnl_file_name, jpc->channel, 0,
					header, read_write_size, jpc->status, jpc->status2);
			if (SS_NORMAL != jpc->status)
			{
				assert(FALSE);
				rts_error(VARLSTCNT(5) ERR_JNLWRERR, 2, JNL_LEN_STR(csd), jpc->status);
			}
		}
		if (0 >= new_blocks)
			break;
	}
	if (0 < new_blocks)
	{
		INCR_GVSTATS_COUNTER(csa, csa->nl, n_jnl_extends, 1);
		return EXIT_NRM;
	}
	jpc->status = ERR_JNLREADEOF;
	jnl_file_lost(jpc, ERR_JNLEXTEND);
	return EXIT_ERR;
}
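
One notable difference from Example 8 is the header I/O: the header is read and written through a pointer aligned to the file system block size (jb->fs_block_size), as direct I/O typically requires, by over-allocating a stack buffer and rounding the pointer up. A minimal sketch of the idiom (ROUND_UP2 is assumed to round up to a power-of-two multiple, as its use above implies):

#include <stdint.h>

#define ROUND_UP2(x, align)	(((uintptr_t)(x) + (align) - 1) & ~((uintptr_t)(align) - 1))

void aligned_header_buffer(unsigned int fs_block_size)	/* must be a power of two */
{
	unsigned char	raw[4096 + 4096];	/* header size + worst-case alignment slack */
	unsigned char	*hdr;

	hdr = (unsigned char *)ROUND_UP2(raw, fs_block_size);
	/* hdr now sits on an fs_block_size boundary with 4096 usable bytes still inside
	 * raw[], mirroring hdr_buff/REAL_JNL_HDR_LEN above.
	 */
	(void)hdr;
}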
Example #10
void gtcmd_rundown(connection_struct *cnx, bool clean_exit)
{
	int4			link;
	cm_region_list		*ptr, *last, *que_next, *que_last;
	cm_region_head		*region;
	uint4			jnl_status;
	jnl_private_control	*jpc;
	jnl_buffer_ptr_t	jbp;
	int			refcnt;
	boolean_t		was_crit;
	int4			rundown_status = EXIT_NRM;			/* if gds_rundown went smoothly */

	for (ptr = cnx->region_root;  ptr;)
	{
		region = ptr->reghead;
		TP_CHANGE_REG(region->reg);
		jpc = cs_addrs->jnl;
		if (ptr->pini_addr && clean_exit && JNL_ENABLED(cs_data) && (NOJNL != jpc->channel))
		{
			was_crit = cs_addrs->now_crit;
			if (!was_crit)
				grab_crit(gv_cur_region);
			if (JNL_ENABLED(cs_data))
			{
				jpc->pini_addr = ptr->pini_addr;
				SET_GBL_JREC_TIME; /* jnl_ensure_open/jnl_put_jrt_pfin needs this to be set */
				jbp = jpc->jnl_buff;
				/* Before writing to jnlfile, adjust jgbl.gbl_jrec_time if needed to maintain time order
				 * of jnl records.  This needs to be done BEFORE the jnl_ensure_open as that could write
				 * journal records (if it decides to switch to a new journal file).
				 */
				ADJUST_GBL_JREC_TIME(jgbl, jbp);
				jnl_status = jnl_ensure_open();
				if (0 == jnl_status)
				{
					if (0 != jpc->pini_addr)
						jnl_put_jrt_pfin(cs_addrs);
				} else
					send_msg(VARLSTCNT(6) jnl_status, 4, JNL_LEN_STR(cs_data), DB_LEN_STR(gv_cur_region));
			}
			if (!was_crit)
				rel_crit(gv_cur_region);
		}
		refcnt = --region->refcnt;
		/* Don't know how refcnt can become negative, but in pro handle it by bypassing this region. The reason is the
		 * following. refcnt should have originally been a positive value. Every time this function is invoked, it would
		 * be decremented by one. There should have been one invocation that saw refcnt to be zero. That would have
		 * done the rundown of the region or if it is still in the stack the rundown is still in progress. Therefore
		 * it is not a good idea to try running down this region when we see refcnt to be negative (as otherwise we
		 * will get confused and could potentially end up with SIG-11 or ACCVIO errors). The worst case is that we
		 * would not have run down the region, in which case an externally issued MUPIP RUNDOWN would be enough.
		 */
		assert(0 <= refcnt);
		if (0 == refcnt)
		{	/* free up only as little as needed to facilitate structure reuse when the region is opened again */
			assert(region->head.fl == region->head.bl);
			VMS_ONLY(gtcm_ast_avail++);
			if (JNL_ALLOWED(cs_data))
				jpc->pini_addr = 0;
			UNIX_ONLY(rundown_status |=) gds_rundown();
			gd_ht_kill(region->reg_hash, TRUE);	/* TRUE to free up the table and the gv_targets it holds too */
			FREE_CSA_DIR_TREE(cs_addrs);
			cm_del_gdr_ptr(gv_cur_region);
		}
		que_next = (cm_region_list *)((unsigned char *)ptr + ptr->regque.fl);
		que_last = (cm_region_list *)((unsigned char *)ptr + ptr->regque.bl);
		link = (int4)((unsigned char *)que_next - (unsigned char *)que_last);
		que_last->regque.fl = link;
		que_next->regque.bl = -link;
		last = ptr;
		ptr = ptr->next;
		free(last);
	}
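
The queue manipulation at the bottom of the loop removes ptr from the relative-offset region list: because regque.fl/regque.bl store byte offsets rather than pointers, unlinking an element means pointing its two neighbours at each other by the byte distance between them (link forward, -link backward). A sketch with illustrative types:

typedef struct {
	int	fl;	/* byte offset to next element */
	int	bl;	/* byte offset to previous element */
} relque_sketch;

void relque_remove(relque_sketch *elem)
{
	relque_sketch	*next = (relque_sketch *)((char *)elem + elem->fl);
	relque_sketch	*prev = (relque_sketch *)((char *)elem + elem->bl);
	int		link = (int)((char *)next - (char *)prev);	/* byte distance prev -> next */

	prev->fl = link;	/* prev now skips elem going forward */
	next->bl = -link;	/* next now skips elem going backward */
}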
Example #11
uint4	 gdsfilext(uint4 blocks, uint4 filesize, boolean_t trans_in_prog)
{
	sm_uc_ptr_t		old_base[2], mmap_retaddr;
	boolean_t		was_crit, is_mm;
	int			result, save_errno, status;
	DEBUG_ONLY(int		first_save_errno);
	uint4			new_bit_maps, bplmap, map, new_blocks, new_total, max_tot_blks, old_total;
	uint4			jnl_status;
	gtm_uint64_t		avail_blocks, mmap_sz;
	off_t			new_eof, new_size;
	trans_num		curr_tn;
	unix_db_info		*udi;
	inctn_opcode_t		save_inctn_opcode;
	int4			prev_extend_blks_to_upgrd;
	jnl_private_control	*jpc;
	jnl_buffer_ptr_t	jbp;
	cache_rec_ptr_t         cr;
	DCL_THREADGBL_ACCESS;

	SETUP_THREADGBL_ACCESS;
	assert(!IS_DSE_IMAGE);
	assert((cs_addrs->nl == NULL) || (process_id != cs_addrs->nl->trunc_pid)); /* mu_truncate shouldn't extend file... */
	assert(!process_exiting);
	DEBUG_ONLY(old_base[0] = old_base[1] = NULL);
	assert(!gv_cur_region->read_only);
	udi = FILE_INFO(gv_cur_region);
	is_mm = (dba_mm == cs_addrs->hdr->acc_meth);
#	if !defined(MM_FILE_EXT_OK)
	if (!udi->grabbed_access_sem && is_mm)
		return (uint4)(NO_FREE_SPACE); /* should this be changed to show extension not allowed ? */
#	endif
	/* Both blocks and total blocks are unsigned ints so make sure we aren't asking for huge numbers that will
	   overflow and end up doing silly things.
	*/
	assert((blocks <= (MAXTOTALBLKS(cs_data) - cs_data->trans_hist.total_blks)) || WBTEST_ENABLED(WBTEST_FILE_EXTEND_ERROR));
#	if defined(__sun) || defined(__hpux)
	cs_data->defer_allocate = TRUE;
#	endif
	if (!blocks && (cs_data->defer_allocate || (TRANS_IN_PROG_TRUE == trans_in_prog)))
		return (uint4)(NO_FREE_SPACE); /* should this be changed to show extension not enabled ? */
	bplmap = cs_data->bplmap;
	/* New total of non-bitmap blocks will be number of current, non-bitmap blocks, plus new blocks desired
	 * There are (bplmap - 1) non-bitmap blocks per bitmap, so add (bplmap - 2) to number of non-bitmap blocks
	 *      and divide by (bplmap - 1) to get total number of bitmaps for expanded database. (must round up in this
	 *      manner as every non-bitmap block must have an associated bitmap)
	 * Current number of bitmaps is (total number of current blocks + bplmap - 1) / bplmap.
	 * Subtract current number of bitmaps from number needed for expanded database to get number of new bitmaps needed.
	 */
	new_bit_maps = DIVIDE_ROUND_UP(cs_data->trans_hist.total_blks
			- DIVIDE_ROUND_UP(cs_data->trans_hist.total_blks, bplmap) + blocks, bplmap - 1)
			- DIVIDE_ROUND_UP(cs_data->trans_hist.total_blks, bplmap);
	new_blocks = blocks + new_bit_maps;
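	/* Worked example with illustrative numbers, assuming bplmap = 512 (so 511 non-bitmap
	 * blocks per bitmap): with total_blks = 10240 and blocks = 2000,
	 *   current bitmaps   = DIVIDE_ROUND_UP(10240, 512) = 20
	 *   non-bitmap blocks = 10240 - 20 + 2000 = 12220
	 *   bitmaps needed    = DIVIDE_ROUND_UP(12220, 511) = 24
	 * so new_bit_maps = 24 - 20 = 4 and new_blocks = 2004; the extended file's
	 * 10240 + 2004 = 12244 blocks hold exactly DIVIDE_ROUND_UP(12244, 512) = 24 bitmaps.
	 */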
	assert((0 < (int)new_blocks) || (!cs_data->defer_allocate && (0 == new_blocks)));
	if (new_blocks + cs_data->trans_hist.total_blks > MAXTOTALBLKS(cs_data))
	{
		assert(WBTEST_ENABLED(WBTEST_FILE_EXTEND_ERROR));
		send_msg_csa(CSA_ARG(cs_addrs) VARLSTCNT(1) ERR_TOTALBLKMAX);
		return (uint4)(NO_FREE_SPACE);
	}
	if (0 != (save_errno = disk_block_available(udi->fd, &avail_blocks, FALSE)))
	{
		send_msg_csa(CSA_ARG(cs_addrs) VARLSTCNT(5) ERR_DBFILERR, 2, DB_LEN_STR(gv_cur_region), save_errno);
		rts_error_csa(CSA_ARG(cs_addrs) VARLSTCNT(5) ERR_DBFILERR, 2, DB_LEN_STR(gv_cur_region), save_errno);
	} else
	{
		if (!(gtmDebugLevel & GDL_IgnoreAvailSpace))
		{	/* Bypass this space check if debug flag above is on. Allows us to create a large sparse DB
			 * in space it could never fit in if it weren't sparse. Needed for some tests.
			 */
			avail_blocks = avail_blocks / (cs_data->blk_size / DISK_BLOCK_SIZE);
			if ((blocks * EXTEND_WARNING_FACTOR) > avail_blocks)
			{
				if (blocks > (uint4)avail_blocks)
				{
					if (!INST_FREEZE_ON_NOSPC_ENABLED(cs_addrs))
						return (uint4)(NO_FREE_SPACE);
					else
						send_msg_csa(CSA_ARG(cs_addrs) VARLSTCNT(6) MAKE_MSG_WARNING(ERR_NOSPACEEXT), 4,
							DB_LEN_STR(gv_cur_region), new_blocks, (uint4)avail_blocks);
				} else
					send_msg_csa(CSA_ARG(cs_addrs) VARLSTCNT(5) ERR_DSKSPACEFLOW, 3, DB_LEN_STR(gv_cur_region),
						 (uint4)(avail_blocks - ((new_blocks <= avail_blocks) ? new_blocks : 0)));
			}
		}
	}
#	ifdef DEBUG
	if (WBTEST_ENABLED(WBTEST_MM_CONCURRENT_FILE_EXTEND) && dollar_tlevel && !MEMCMP_LIT(gv_cur_region->rname, "DEFAULT"))
	{
		SYSTEM("$gtm_dist/mumps -run $gtm_wbox_mrtn");
		assert(1 == cs_addrs->nl->wbox_test_seq_num);	/* should have been set by mubfilcpy */
		cs_addrs->nl->wbox_test_seq_num = 2;	/* signal mupip backup to stop sleeping in mubfilcpy */
	}
#	endif
	/* From here on, we need to use GDSFILEXT_CLNUP before returning to the caller */
	was_crit = cs_addrs->now_crit;
	assert(!cs_addrs->hold_onto_crit || was_crit);
	/* If we are coming from mupip_extend (which gets crit itself) we better have waited for any unfreezes to occur.
	 * If we are coming from online rollback (when that feature is available), we will come in holding crit and in
	 * 	the final retry. In that case too, we expect to have waited for unfreezes to occur in the caller itself.
	 * Therefore if we are coming in holding crit from MUPIP, we expect the db to be unfrozen so no need to wait for
	 * freeze.
	 * If we are coming from GT.M and final retry (in which case we come in holding crit) we expect to have waited
	 * 	for any unfreezes (by invoking tp_crit_all_regions) to occur (TP or non-TP) before coming into this
	 *	function. However, there is one exception. In the final retry, if tp_crit_all_regions notices that
	 *	at least one of the participating regions did ONLY READs, it will not wait for any freeze on THAT region
	 *	to complete before grabbing crit. Later, in the final retry, if THAT region did an update which caused
	 *	op_tcommit to invoke bm_getfree->gdsfilext, then we would have come here with a frozen region on which
	 *	we hold crit.
	 */
	assert(!was_crit || !FROZEN_HARD(cs_data) || (dollar_tlevel && (CDB_STAGNATE <= t_tries)));
	/*
	 * If we are in the final retry and already hold crit, it is possible that csa->nl->wc_blocked is also set to TRUE
	 * (by a concurrent process in phase2 which encountered an error in the midst of commit and secshr_db_clnup
	 * finished the job for it). In this case we do NOT want to invoke wcs_recover as that will update the "bt"
	 * transaction numbers without correspondingly updating the history transaction numbers (effectively causing
	 * a cdb_sc_blkmod type of restart). Therefore do NOT call grab_crit (which unconditionally invokes wcs_recover)
	 * if we already hold crit.
	 */
	if (!was_crit)
	{
		for ( ; ; )
		{
			grab_crit(gv_cur_region);
			if (FROZEN_CHILLED(cs_data))
				DO_CHILLED_AUTORELEASE(cs_addrs, cs_data);
			if (!FROZEN(cs_data) && !IS_REPL_INST_FROZEN)
				break;
			rel_crit(gv_cur_region);
			while (FROZEN(cs_data) || IS_REPL_INST_FROZEN)
			{
				hiber_start(1000);
				if (FROZEN_CHILLED(cs_data) && CHILLED_AUTORELEASE(cs_data))
					break;
			}
		}
	} else if (FROZEN_HARD(cs_data) && dollar_tlevel)
	{	/* We don't want to continue with file extension as explained above. Hence return with an error code which
		 * op_tcommit will recognize (as a cdb_sc_needcrit/cdb_sc_instancefreeze type of restart) and restart accordingly.
		 */
		assert(CDB_STAGNATE <= t_tries);
		GDSFILEXT_CLNUP;
		return (uint4)FINAL_RETRY_FREEZE_PROG;
	} else
		WAIT_FOR_REGION_TO_UNCHILL(cs_addrs, cs_data);
	if (IS_REPL_INST_FROZEN && trans_in_prog)
	{
		assert(CDB_STAGNATE <= t_tries);
		GDSFILEXT_CLNUP;
		return (uint4)FINAL_RETRY_INST_FREEZE;
	}
	assert(cs_addrs->ti->total_blks == cs_data->trans_hist.total_blks);
	old_total = cs_data->trans_hist.total_blks;
	if (old_total != filesize)
	{	/* Somebody else has already extended it; since we are in crit, this is trustworthy. However, in case of MM,
		 * we still need to remap the database
		 */
		assert((old_total > filesize) || !is_mm);
		/* For BG, someone else could have truncated or extended - we have no idea */
		GDSFILEXT_CLNUP;
		return (SS_NORMAL);
	}
	if (trans_in_prog && SUSPICIOUS_EXTEND)
	{
		if (!was_crit)
		{
			GDSFILEXT_CLNUP;
			return (uint4)(EXTEND_SUSPECT);
		}
		/* If free_blocks counter is not ok, then correct it. Do the check again. If still fails, then it means we held
		 * crit through bm_getfree into gdsfilext and still didn't get it right.
		 */
		assertpro(!is_free_blks_ctr_ok() && !SUSPICIOUS_EXTEND);
	}
	if (JNL_ENABLED(cs_data))
	{
		if (!jgbl.dont_reset_gbl_jrec_time)
			SET_GBL_JREC_TIME;	/* needed before jnl_ensure_open as that can write jnl records */
		jpc = cs_addrs->jnl;
		jbp = jpc->jnl_buff;
		/* Before writing to jnlfile, adjust jgbl.gbl_jrec_time if needed to maintain time order
		 * of jnl records. This needs to be done BEFORE the jnl_ensure_open as that could write
		 * journal records (if it decides to switch to a new journal file).
		 */
		ADJUST_GBL_JREC_TIME(jgbl, jbp);
		jnl_status = jnl_ensure_open(gv_cur_region, cs_addrs);
		if (jnl_status)
		{
			GDSFILEXT_CLNUP;
			send_msg_csa(CSA_ARG(cs_addrs) VARLSTCNT(6) jnl_status, 4, JNL_LEN_STR(cs_data), DB_LEN_STR(gv_cur_region));
			return (uint4)(NO_FREE_SPACE);	/* should have better return status */
		}
	}
	if (is_mm)
	{
		cs_addrs->nl->mm_extender_pid = process_id;
		status = wcs_wtstart(gv_cur_region, 0, NULL, NULL);
		cs_addrs->nl->mm_extender_pid = 0;
		assertpro(SS_NORMAL == status);
		old_base[0] = cs_addrs->db_addrs[0];
		old_base[1] = cs_addrs->db_addrs[1];
		cs_addrs->db_addrs[0] = NULL; /* don't rely on it until the mmap below */
#		ifdef _AIX
		status = shmdt(old_base[0] - BLK_ZERO_OFF(cs_data->start_vbn));
#		else
		status = munmap((caddr_t)old_base[0], (size_t)(old_base[1] - old_base[0]));
#		endif
		if (0 != status)
		{
			save_errno = errno;
			GDSFILEXT_CLNUP;
			send_msg_csa(CSA_ARG(cs_addrs) VARLSTCNT(12) ERR_DBFILERR, 2, DB_LEN_STR(gv_cur_region),
					ERR_SYSCALL, 5, LEN_AND_STR(MEM_UNMAP_SYSCALL), CALLFROM, save_errno);
			return (uint4)(NO_FREE_SPACE);
		}
	} else
	{	/* Due to concurrency issues, it is possible some process had issued a disk read of the GDS block# corresponding
		 * to "old_total" right after a truncate wrote a GDS-block of zeros on disk (to signal end of the db file).
		 * If so, the global buffer containing this block needs to be invalidated now as part of the extend. If not, it is
		 * possible the EOF block on disk is now going to be overwritten by a properly initialized bitmap block (as part
		 * of the gdsfilext below) while the global buffer continues to have an incorrect copy of that bitmap block and
		 * this in turn would cause XXXX failures due to a bad bitmap block in shared memory. (GTM-7519)
		 */
		cr = db_csh_get((block_id)old_total);
		if ((NULL != cr) && ((cache_rec_ptr_t)CR_NOTVALID != cr))
		{
			assert((0 == cr->dirty) && (0 == cr->bt_index) && !cr->stopped);
			cr->cycle++;
			cr->blk = CR_BLKEMPTY;
		}
	}
	CHECK_TN(cs_addrs, cs_data, cs_data->trans_hist.curr_tn);	/* can issue rts_error TNTOOLARGE */
	new_total = old_total + new_blocks;
	new_eof = BLK_ZERO_OFF(cs_data->start_vbn) + ((off_t)new_total * cs_data->blk_size);
#	if !defined(__sun) && !defined(__hpux)
	if (!cs_data->defer_allocate)
	{
		new_size = new_eof + cs_data->blk_size;
		save_errno = posix_fallocate(udi->fd, 0, new_size);
		DEBUG_ONLY(first_save_errno = save_errno);
		if ((ENOSPC == save_errno) && IS_GTM_IMAGE)
			save_errno = extend_wait_for_fallocate(udi, new_size);
		if (0 != save_errno)
		{
			GDSFILEXT_CLNUP;
			assert(ENOSPC == save_errno);
			if (ENOSPC != save_errno)
				send_msg_csa(CSA_ARG(cs_addrs) VARLSTCNT(5) ERR_PREALLOCATEFAIL, 2, DB_LEN_STR(gv_cur_region),
					     save_errno);
			return (uint4)(NO_FREE_SPACE);
		}
	}
#	endif
	save_errno = db_write_eof_block(udi, udi->fd, cs_data->blk_size, new_eof, &(TREF(dio_buff)));
	if ((ENOSPC == save_errno) && IS_GTM_IMAGE)
		save_errno = extend_wait_for_write(udi, cs_data->blk_size, new_eof);
	if (0 != save_errno)
	{
		GDSFILEXT_CLNUP;
		if (ENOSPC != save_errno)
			send_msg_csa(CSA_ARG(cs_addrs) VARLSTCNT(5) ERR_DBFILERR, 2, DB_LEN_STR(gv_cur_region), save_errno);
		return (uint4)(NO_FREE_SPACE);
	}
	if (WBTEST_ENABLED(WBTEST_FILE_EXTEND_INTERRUPT_1))
	{
		LONG_SLEEP(600);
		assert(FALSE);
	}
	/* Ensure the EOF and metadata get to disk BEFORE any bitmap writes. Otherwise, the file size could no longer reflect
	 * a proper extent and subsequent invocations of gdsfilext could corrupt the database.
	 */
	if (!IS_STATSDB_CSA(cs_addrs))
	{
		GTM_DB_FSYNC(cs_addrs, udi->fd, status);
		assert(0 == status);
		if (0 != status)
		{
			GDSFILEXT_CLNUP;
			send_msg_csa(CSA_ARG(cs_addrs) VARLSTCNT(8) ERR_DBFILERR, 5,
						RTS_ERROR_LITERAL("fsync1()"), CALLFROM, status);
			return (uint4)(NO_FREE_SPACE);
		}
	}
	if (WBTEST_ENABLED(WBTEST_FILE_EXTEND_INTERRUPT_2))
	{
		LONG_SLEEP(600);
		assert(FALSE); /* Should be killed before that */
	}
	DEBUG_ONLY(prev_extend_blks_to_upgrd = cs_data->blks_to_upgrd;)
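
Although the example is cut off here, the extension tail above follows a deliberate ordering: reserve the space (posix_fallocate, where supported), stamp the new EOF block, then fsync before any bitmap writes, so the on-disk file size never lags blocks that reference it. A condensed sketch of that ordering (write_eof_block is a hypothetical helper; GT.M's db_write_eof_block takes different arguments):

#include <sys/types.h>
#include <fcntl.h>
#include <unistd.h>

extern int write_eof_block(int fd, off_t eof_offset, unsigned int blk_size);	/* hypothetical */

int extend_and_sync(int fd, off_t new_eof, unsigned int blk_size)
{
	if (0 != posix_fallocate(fd, 0, new_eof + blk_size))	/* reserve space up front */
		return -1;
	if (0 != write_eof_block(fd, new_eof, blk_size))	/* mark the new end of file */
		return -1;
	/* The EOF and metadata must reach disk BEFORE any bitmap writes; otherwise a
	 * crash could leave a file size that no longer reflects a proper extent.
	 */
	return fsync(fd);
}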
Example #12
void t_write_map (
    srch_blk_status	*blkhist,	/* Search History of the block to be written. Currently the
						 *	following members in this structure are used by "t_write_map"
						 *	    "blk_num"		--> Block number being modified
						 *	    "buffaddr"		--> Address of before image of the block
						 *	    "cr"		--> cache-record that holds the block (BG only)
						 *	    "cycle"		--> cycle when block was read by t_qread (BG only)
						 *	    "cr->ondsk_blkver"	--> Actual block version on disk
						 */
    unsigned char 	*upd_addr,	/* Address of the update array containing list of blocks to be cleared in bitmap */
    trans_num	tn,		/* Transaction Number when this block was read. Used for cdb_sc_blkmod validation */
    int4		reference_cnt)	/* Same meaning as cse->reference_cnt (see gdscc.h for comments) */
{
    cw_set_element		*cs;
    cache_rec_ptr_t		cr;
    jnl_buffer_ptr_t	jbbp;		/* jbbp is non-NULL only if before-image journaling */
    sgmnt_addrs		*csa;
    blk_hdr_ptr_t		old_block;
    unsigned int		bsiz;
    block_id		blkid;
    uint4			*updptr;

    csa = cs_addrs;
    if (!dollar_tlevel)
    {
        assert(cw_set_depth < CDB_CW_SET_SIZE);
        cs = &cw_set[cw_set_depth];
        cs->mode = gds_t_noop;	/* initialize it to a value that is not "gds_t_committed" before incrementing
					 * cw_set_depth as secshr_db_clnup relies on it */
        cw_set_depth++;
    } else
    {
        tp_cw_list(&cs);
        sgm_info_ptr->cw_set_depth++;
    }
    cs->mode = gds_t_writemap;
    cs->blk_checksum = 0;
    cs->blk = blkhist->blk_num;
    assert((cs->blk < csa->ti->total_blks) GTM_TRUNCATE_ONLY(|| (CDB_STAGNATE > t_tries)));
    cs->old_block = blkhist->buffaddr;
    /* t_write_map operates on BUSY blocks and hence cs->blk_prior_state's free_status is set to FALSE unconditionally */
    BIT_CLEAR_FREE(cs->blk_prior_state);
    BIT_CLEAR_RECYCLED(cs->blk_prior_state);
    old_block = (blk_hdr_ptr_t)cs->old_block;
    assert(NULL != old_block);
    jbbp = (JNL_ENABLED(csa) && csa->jnl_before_image) ? csa->jnl->jnl_buff : NULL;
    if ((NULL != jbbp) && (old_block->tn < jbbp->epoch_tn))
    {   /* Pre-compute CHECKSUM. Since we don't necessarily hold crit at this point, ensure we never try to
         * access the buffer beyond the db blk_size.
         */
        bsiz = MIN(old_block->bsiz, csa->hdr->blk_size);
        cs->blk_checksum = jnl_get_checksum((uint4*)old_block, csa, bsiz);
    }
    cs->cycle = blkhist->cycle;
    cr = blkhist->cr;
    cs->cr = cr;
    /* the buffer in shared memory holding the GDS block contents currently does not have in its block header the
     * on-disk format of that block. if it had, we could have easily copied that over to the cw-set-element.
     * until then, we have to use the cache-record's field "ondsk_blkver". but the cache-record is available only in BG.
     * thankfully, in MM, we do not allow GDSV4 type blocks, so we can safely assign GDSV6 (or GDSVCURR) to this field.
     */
    assert((NULL != cr) || (dba_mm == csa->hdr->acc_meth));
    cs->ondsk_blkver = (NULL == cr) ? GDSVCURR : cr->ondsk_blkver;
    cs->ins_off = 0;
    cs->index = 0;
    assert(reference_cnt < csa->hdr->bplmap);	/* Cannot allocate more blocks than a bitmap holds */
    assert(reference_cnt > -csa->hdr->bplmap);	/* Cannot free more blocks than a bitmap holds */
    cs->reference_cnt = reference_cnt;
    cs->jnl_freeaddr = 0;		/* reset jnl_freeaddr that previous transaction might have filled in */
    cs->upd_addr = upd_addr;
    DEBUG_ONLY(
        if (reference_cnt < 0)
            reference_cnt = -reference_cnt;
        /* Check that all block numbers are relative to the bitmap block number (i.e. bit number) */
        updptr = (uint4 *)upd_addr;
        while (reference_cnt--)
        {
            blkid = *updptr;
            assert((int4)blkid < csa->hdr->bplmap);
            updptr++;
        }
    )
Example #13
boolean_t mu_truncate(int4 truncate_percent)
{
	sgmnt_addrs		*csa;
	sgmnt_data_ptr_t 	csd;
	int			num_local_maps;
	int 			lmap_num, lmap_blk_num;
	int			bml_status, sigkill;
	int			save_errno;
	int			ftrunc_status;
	uint4			jnl_status;
	uint4			old_total, new_total;
	uint4			old_free, new_free;
	uint4			end_blocks;
	int4			blks_in_lmap, blk;
	gtm_uint64_t		before_trunc_file_size;
	off_t			trunc_file_size;
	off_t			padding;
	uchar_ptr_t		lmap_addr;
	boolean_t		was_crit;
	uint4			found_busy_blk;
	srch_blk_status		bmphist;
	srch_blk_status 	*blkhist;
	srch_hist		alt_hist;
	trans_num		curr_tn;
	blk_hdr_ptr_t		lmap_blk_hdr;
	block_id		*blkid_ptr;
	unix_db_info    	*udi;
	jnl_private_control	*jpc;
	jnl_buffer_ptr_t	jbp;
	char			*err_msg;
	intrpt_state_t		prev_intrpt_state;
	off_t			offset;
	DCL_THREADGBL_ACCESS;

	SETUP_THREADGBL_ACCESS;
	csa = cs_addrs;
	csd = cs_data;
	if (dba_mm == csd->acc_meth)
	{
		gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(4) ERR_MUTRUNCNOTBG, 2, REG_LEN_STR(gv_cur_region));
		return TRUE;
	}
	if ((GDSVCURR != csd->desired_db_format) || (csd->blks_to_upgrd != 0))
	{
		gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(4) ERR_MUTRUNCNOV4, 2, REG_LEN_STR(gv_cur_region));
		return TRUE;
	}
	if (csa->ti->free_blocks < (truncate_percent * csa->ti->total_blks / 100))
	{
		gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(5) ERR_MUTRUNCNOSPACE, 3, REG_LEN_STR(gv_cur_region), truncate_percent);
		return TRUE;
	}
	/* already checked for parallel truncates on this region --- see mupip_reorg.c */
	gv_target = NULL;
	assert(csa->nl->trunc_pid == process_id);
	assert(dba_mm != csd->acc_meth);
	old_total = csa->ti->total_blks;
	old_free = csa->ti->free_blocks;
	sigkill = 0;
	found_busy_blk = 0;
	memset(&alt_hist, 0, SIZEOF(alt_hist)); /* null-initialize history */
	assert(csd->bplmap == BLKS_PER_LMAP);
	end_blocks = old_total % BLKS_PER_LMAP; /* blocks in the last lmap (first one we start scanning) */
	if (0 == end_blocks)
		end_blocks = BLKS_PER_LMAP;
	num_local_maps = DIVIDE_ROUND_UP(old_total, BLKS_PER_LMAP);
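	/* Illustrative numbers, assuming BLKS_PER_LMAP = 512: old_total = 10340 gives
	 * end_blocks = 10340 % 512 = 100 and num_local_maps = 21; an exact multiple such as
	 * old_total = 10240 gives end_blocks = 512 and num_local_maps = 20.
	 */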
	/* ======================================== PHASE 1 ======================================== */
	for (lmap_num = num_local_maps - 1; (lmap_num > 0 && !found_busy_blk); lmap_num--)
	{
		if (mu_ctrly_occurred || mu_ctrlc_occurred)
			return TRUE;
		assert(csa->ti->total_blks >= old_total); /* otherwise, a concurrent truncate happened... */
		if (csa->ti->total_blks != old_total) /* Extend (likely called by mupip extend) -- don't truncate */
		{
			gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(5) ERR_MUTRUNCNOSPACE, 3, REG_LEN_STR(gv_cur_region),
					truncate_percent);
			return TRUE;
		}
		lmap_blk_num = lmap_num * BLKS_PER_LMAP;
		if (csa->nl->highest_lbm_with_busy_blk >= lmap_blk_num)
		{
			found_busy_blk = lmap_blk_num;
			break;
		}
		blks_in_lmap = (lmap_num == num_local_maps - 1) ? end_blocks : BLKS_PER_LMAP;
		/* Loop through non-bitmap blocks of this lmap, do recycled2free */
		DBGEHND((stdout, "DBG:: lmap_num = [%lu], lmap_blk_num = [%lu], blks_in_lmap = [%lu]\n",
			lmap_num, lmap_blk_num, blks_in_lmap));
		for (blk = 1; blk < blks_in_lmap && blk != -1 && !found_busy_blk;)
		{
			t_begin(ERR_MUTRUNCFAIL, UPDTRNS_DB_UPDATED_MASK);
			for (;;) /* retry loop for recycled to free transactions */
			{
				curr_tn = csd->trans_hist.curr_tn;
				/* Read the nth local bitmap into memory */
				bmphist.blk_num = lmap_blk_num;
				bmphist.buffaddr = t_qread(bmphist.blk_num, &bmphist.cycle, &bmphist.cr);
				lmap_blk_hdr = (blk_hdr_ptr_t)bmphist.buffaddr;
				if (!(bmphist.buffaddr) || (BM_SIZE(BLKS_PER_LMAP) != lmap_blk_hdr->bsiz))
				{ /* Could not read the block successfully. Retry. */
					t_retry((enum cdb_sc)rdfail_detail);
					continue;
				}
				lmap_addr = bmphist.buffaddr + SIZEOF(blk_hdr);
				/* starting from the hint (blk itself), find the first busy or recycled block */
				blk = bml_find_busy_recycled(blk, lmap_addr, blks_in_lmap, &bml_status);
				assert(blk < BLKS_PER_LMAP);
				if (blk == -1 || blk >= blks_in_lmap)
				{ /* done with this lmap, continue to next */
					t_abort(gv_cur_region, csa);
					break;
				}
				else if (BLK_BUSY == bml_status || csa->nl->highest_lbm_with_busy_blk >= lmap_blk_num)
				{ /* stop processing blocks... skip ahead to phase 2 */
					found_busy_blk = lmap_blk_num;
					t_abort(gv_cur_region, csa);
					break;
				}
				else if (BLK_RECYCLED == bml_status)
				{ /* Write PBLK records for recycled blocks only if before_image journaling is
				   * enabled. t_end() takes care of checking if journaling is enabled and
				   * writing PBLK record. We have to at least mark the recycled block as free.
				   */
					RESET_UPDATE_ARRAY;
					update_trans = UPDTRNS_DB_UPDATED_MASK;
					*((block_id *)update_array_ptr) = blk;
					update_array_ptr += SIZEOF(block_id);
					*(int *)update_array_ptr = 0;
					alt_hist.h[1].blk_num = 0;
					alt_hist.h[0].level = 0;
					alt_hist.h[0].cse = NULL;
					alt_hist.h[0].tn = curr_tn;
					alt_hist.h[0].blk_num = lmap_blk_num + blk;
					alt_hist.h[0].buffaddr = t_qread(alt_hist.h[0].blk_num,
							&alt_hist.h[0].cycle, &alt_hist.h[0].cr);
					if (!alt_hist.h[0].buffaddr)
					{
						t_retry((enum cdb_sc)rdfail_detail);
						continue;
					}
					if (!t_recycled2free(&alt_hist.h[0]))
					{
						t_retry(cdb_sc_lostbmlcr);
						continue;
					}
					t_write_map(&bmphist, (unsigned char *)update_array, curr_tn, 0);
					/* Set the opcode for INCTN record written by t_end() */
					inctn_opcode = inctn_blkmarkfree;
					if ((trans_num)0 == t_end(&alt_hist, NULL, TN_NOT_SPECIFIED))
						continue;
					/* block processed, scan from the next one */
					blk++;
					break;
				} else
				{
					assert(t_tries < CDB_STAGNATE);
					t_retry(cdb_sc_badbitmap);
					continue;
				}
			} /* END recycled2free retry loop */
		} /* END scanning blocks of this particular lmap */
		/* Write PBLK for the bitmap block, in case it hasn't been written i.e. t_end() was never called above */
		/* Do a transaction that just increments the bitmap block's tn so that t_end() can do its thing */
		DBGEHND((stdout, "DBG:: bitmap block inctn -- lmap_blk_num = [%lu]\n", lmap_blk_num));
		t_begin(ERR_MUTRUNCFAIL, UPDTRNS_DB_UPDATED_MASK);
		for (;;)
		{
			RESET_UPDATE_ARRAY;
			BLK_ADDR(blkid_ptr, SIZEOF(block_id), block_id);
			*blkid_ptr = 0;
			update_trans = UPDTRNS_DB_UPDATED_MASK;
			inctn_opcode = inctn_mu_reorg; /* inctn_mu_truncate */
			curr_tn = csd->trans_hist.curr_tn;
			blkhist = &alt_hist.h[0];
			blkhist->blk_num = lmap_blk_num;
			blkhist->tn = curr_tn;
			blkhist->cse = NULL; /* start afresh (do not use value from previous retry) */
			/* Read the nth local bitmap into memory */
			blkhist->buffaddr = t_qread(lmap_blk_num, (sm_int_ptr_t)&blkhist->cycle, &blkhist->cr);
			lmap_blk_hdr = (blk_hdr_ptr_t)blkhist->buffaddr;
			if (!(blkhist->buffaddr) || (BM_SIZE(BLKS_PER_LMAP) != lmap_blk_hdr->bsiz))
			{ /* Could not read the block successfully. Retry. */
				t_retry((enum cdb_sc)rdfail_detail);
				continue;
			}
			t_write_map(blkhist, (unsigned char *)blkid_ptr, curr_tn, 0);
			blkhist->blk_num = 0; /* create empty history for bitmap block */
			if ((trans_num)0 == t_end(&alt_hist, NULL, TN_NOT_SPECIFIED))
				continue;
			break;
		}
	} /* END scanning lmaps */
	/* ======================================== PHASE 2 ======================================== */
	assert(!csa->now_crit);
	for (;;)
	{ /* wait for FREEZE, we don't want to truncate a frozen database */
		grab_crit(gv_cur_region);
		if (FROZEN_CHILLED(cs_data))
			DO_CHILLED_AUTORELEASE(csa, cs_data);
		if (!FROZEN(cs_data) && !IS_REPL_INST_FROZEN)
			break;
		rel_crit(gv_cur_region);
		while (FROZEN(cs_data) || IS_REPL_INST_FROZEN)
		{
			hiber_start(1000);
			if (FROZEN_CHILLED(cs_data) && CHILLED_AUTORELEASE(cs_data))
				break;
		}
	}
	assert(csa->nl->trunc_pid == process_id);
	/* Flush pending updates to disk. If this is not done, old updates can be flushed AFTER ftruncate, extending the file. */
	if (!wcs_flu(WCSFLU_FLUSH_HDR | WCSFLU_WRITE_EPOCH | WCSFLU_MSYNC_DB))
	{
		assert(FALSE);
		gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(6) ERR_BUFFLUFAILED, 4, LEN_AND_LIT("MUPIP REORG TRUNCATE"),
				DB_LEN_STR(gv_cur_region));
		rel_crit(gv_cur_region);
		return FALSE;
	}
	csa->nl->highest_lbm_with_busy_blk = MAX(found_busy_blk, csa->nl->highest_lbm_with_busy_blk);
	assert(IS_BITMAP_BLK(csa->nl->highest_lbm_with_busy_blk));
	new_total = MIN(old_total, csa->nl->highest_lbm_with_busy_blk + BLKS_PER_LMAP);
	if (mu_ctrly_occurred || mu_ctrlc_occurred)
	{
		rel_crit(gv_cur_region);
		return TRUE;
	} else if (csa->ti->total_blks != old_total || new_total == old_total)
	{
		assert(csa->ti->total_blks >= old_total); /* Better have been an extend, not a truncate... */
		gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(5) ERR_MUTRUNCNOSPACE, 3, REG_LEN_STR(gv_cur_region), truncate_percent);
		rel_crit(gv_cur_region);
		return TRUE;
	} else if (GDSVCURR != csd->desired_db_format || csd->blks_to_upgrd != 0 || !csd->fully_upgraded)
	{
		gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(4) ERR_MUTRUNCNOV4, 2, REG_LEN_STR(gv_cur_region));
		rel_crit(gv_cur_region);
		return TRUE;
	} else if (SNAPSHOTS_IN_PROG(csa->nl))
	{
		gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(4) ERR_MUTRUNCSSINPROG, 2, REG_LEN_STR(gv_cur_region));
		rel_crit(gv_cur_region);
		return TRUE;
	} else if (BACKUP_NOT_IN_PROGRESS != cs_addrs->nl->nbb)
	{
		gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(4) ERR_MUTRUNCBACKINPROG, 2, REG_LEN_STR(gv_cur_region));
		rel_crit(gv_cur_region);
		return TRUE;
	}
	DEFER_INTERRUPTS(INTRPT_IN_TRUNC, prev_intrpt_state);
	if (JNL_ENABLED(csa))
	{ /* Write JRT_TRUNC and INCTN records */
		if (!jgbl.dont_reset_gbl_jrec_time)
			SET_GBL_JREC_TIME;	/* needed before jnl_ensure_open as that can write jnl records */
		jpc = csa->jnl;
		jbp = jpc->jnl_buff;
		/* Before writing to jnlfile, adjust jgbl.gbl_jrec_time if needed to maintain time order
		 * of jnl records. This needs to be done BEFORE the jnl_ensure_open as that could write
		 * journal records (if it decides to switch to a new journal file).
		 */
		ADJUST_GBL_JREC_TIME(jgbl, jbp);
		jnl_status = jnl_ensure_open(gv_cur_region, csa);
		if (SS_NORMAL != jnl_status)
			send_msg_csa(CSA_ARG(csa) VARLSTCNT(6) jnl_status, 4, JNL_LEN_STR(csd), DB_LEN_STR(gv_cur_region));
		else
		{
			if (0 == jpc->pini_addr)
				jnl_put_jrt_pini(csa);
			jnl_write_trunc_rec(csa, old_total, csa->ti->free_blocks, new_total);
			inctn_opcode = inctn_mu_reorg;
			jnl_write_inctn_rec(csa);
			jnl_status = jnl_flush(gv_cur_region);
			if (SS_NORMAL != jnl_status)
			{
				send_msg_csa(CSA_ARG(csa) VARLSTCNT(9) ERR_JNLFLUSH, 2, JNL_LEN_STR(csd),
					ERR_TEXT, 2, RTS_ERROR_TEXT("Error with journal flush during mu_truncate"),
					jnl_status);
				assert(NOJNL == jpc->channel); /* jnl file lost has been triggered */
			}
		}
	}
	/* Good to go ahead and REALLY truncate (reduce total_blks, clear cache_array, FTRUNCATE) */
	curr_tn = csa->ti->curr_tn;
	CHECK_TN(csa, csd, curr_tn);
	udi = FILE_INFO(gv_cur_region);
	/* Information used by recover_truncate to check if the file size and csa->ti->total_blks are INCONSISTENT */
	trunc_file_size = BLK_ZERO_OFF(csd->start_vbn) + ((off_t)csd->blk_size * (new_total + 1));
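	/* new_total + 1: the extra block keeps room for the EOF block written at offset new_total (see db_write_eof_block below) */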
	csd->after_trunc_total_blks = new_total;
	csd->before_trunc_free_blocks = csa->ti->free_blocks;
	csd->before_trunc_total_blks = old_total; /* Flags interrupted truncate for recover_truncate */
	/* file size and total blocks: INCONSISTENT */
	csa->ti->total_blks = new_total;
	/* past the point of no return -- shared memory intact */
	assert(csa->ti->free_blocks >= DELTA_FREE_BLOCKS(old_total, new_total));
	csa->ti->free_blocks -= DELTA_FREE_BLOCKS(old_total, new_total);
	new_free = csa->ti->free_blocks;
	KILL_TRUNC_TEST(WBTEST_CRASH_TRUNCATE_1); /* 55 : Issue a kill -9 before 1st fsync */
	fileheader_sync(gv_cur_region);
	DB_FSYNC(gv_cur_region, udi, csa, db_fsync_in_prog, save_errno);
	CHECK_DBSYNC(gv_cur_region, save_errno);
	/* past the point of no return -- shared memory deleted */
	KILL_TRUNC_TEST(WBTEST_CRASH_TRUNCATE_2); /* 56 : Issue a kill -9 after 1st fsync */
	clear_cache_array(csa, csd, gv_cur_region, new_total, old_total);
	offset = (off_t)BLK_ZERO_OFF(csd->start_vbn) + (off_t)new_total * csd->blk_size;
	save_errno = db_write_eof_block(udi, udi->fd, csd->blk_size, offset, &(TREF(dio_buff)));
	if (0 != save_errno)
	{
		err_msg = (char *)STRERROR(save_errno);	/* use save_errno: errno may have been clobbered since the failed write */
		rts_error_csa(CSA_ARG(csa) VARLSTCNT(6) ERR_MUTRUNCERROR, 4, REG_LEN_STR(gv_cur_region), LEN_AND_STR(err_msg));
		return FALSE;
	}
	KILL_TRUNC_TEST(WBTEST_CRASH_TRUNCATE_3); /* 57 : Issue a kill -9 after reducing csa->ti->total_blks, before FTRUNCATE */
	/* Execute an ftruncate() to truncate the DB file.
	 * ftruncate() is a system call on almost all platforms (except SunOS), so it cannot be
	 * interrupted by a kill -9 until its operation completes; we can therefore safely assume
	 * that the result of ftruncate() will be complete.
	 */
	FTRUNCATE(FILE_INFO(gv_cur_region)->fd, trunc_file_size, ftrunc_status);
	if (0 != ftrunc_status)
	{
		err_msg = (char *)STRERROR(errno);
		rts_error_csa(CSA_ARG(csa) VARLSTCNT(6) ERR_MUTRUNCERROR, 4, REG_LEN_STR(gv_cur_region), LEN_AND_STR(err_msg));
		/* should go through recover_truncate now, which will again try to FTRUNCATE */
		return FALSE;
	}
	/* file size and total blocks: CONSISTENT (shrunk) */
	KILL_TRUNC_TEST(WBTEST_CRASH_TRUNCATE_4); /* 58 : Issue a kill -9 after FTRUNCATE, before 2nd fsync */
	csa->nl->root_search_cycle++;	/* Force concurrent processes to restart in t_end/tp_tend to make sure no one
					 * tries to commit updates past the end of the file. Bitmap validations together
					 * with highest_lbm_with_busy_blk should actually be sufficient, so this is
					 * just to be safe.
					 */
	csd->before_trunc_total_blks = 0; /* indicate CONSISTENT */
	/* Increment TN */
	assert(csa->ti->early_tn == csa->ti->curr_tn);
	csd->trans_hist.early_tn = csd->trans_hist.curr_tn + 1;
	INCREMENT_CURR_TN(csd);
	fileheader_sync(gv_cur_region);
	DB_FSYNC(gv_cur_region, udi, csa, db_fsync_in_prog, save_errno);
	KILL_TRUNC_TEST(WBTEST_CRASH_TRUNCATE_5); /* 59 : Issue a kill -9 after 2nd fsync */
	CHECK_DBSYNC(gv_cur_region, save_errno);
	ENABLE_INTERRUPTS(INTRPT_IN_TRUNC, prev_intrpt_state);
	curr_tn = csa->ti->curr_tn;
	rel_crit(gv_cur_region);
	send_msg_csa(CSA_ARG(csa) VARLSTCNT(7) ERR_MUTRUNCSUCCESS, 5, DB_LEN_STR(gv_cur_region), old_total, new_total, &curr_tn);
	util_out_print("Truncated region: !AD. Reduced total blocks from [!UL] to [!UL]. Reduced free blocks from [!UL] to [!UL].",
					FLUSH, REG_LEN_STR(gv_cur_region), old_total, new_total, old_free, new_free);
	return TRUE;
} /* END of mu_truncate() */
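A condensed sketch of the shrink ordering used above may help: flush and harden the header first, then cut the file, then harden the new size. The helper below is hypothetical (shrink_db_file, hdr_bytes and the block layout are stand-ins, not the GT.M implementation) and assumes the caller already validated new_total and wrote an EOF block at offset new_total, mirroring db_write_eof_block above.

#include <stdint.h>
#include <sys/types.h>
#include <unistd.h>

/* Hypothetical sketch, not GT.M code: shrink a file of fixed-size blocks
 * that follow hdr_bytes of header. Returns 0 on success, -1 on error.
 */
static int shrink_db_file(int fd, off_t hdr_bytes, uint32_t blk_size, uint32_t new_total)
{
	off_t	new_size;

	new_size = hdr_bytes + ((off_t)blk_size * (new_total + 1));	/* +1 keeps the trailing EOF block */
	if (0 != fsync(fd))		/* harden header and EOF block BEFORE shrinking */
		return -1;
	if (0 != ftruncate(fd, new_size))
		return -1;		/* a recovery pass (like recover_truncate above) can re-attempt this */
	if (0 != fsync(fd))		/* harden the new file size */
		return -1;
	return 0;
}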
Exemplo n.º 14
0
/******************************************************************************************
Input Parameters:
	level: level of working block
	dest_blk_id: last destination used for swap
Output Parameters:
	kill_set_ptr: Kill set to be freed
	*exclude_glist_ptr: List of globals not to be moved for a swap destination
Input/Output Parameters:
	gv_target : as working block's history
	reorg_gv_target->hist : as destination block's history
 ******************************************************************************************/
enum cdb_sc mu_swap_blk(int level, block_id *pdest_blk_id, kill_set *kill_set_ptr, glist *exclude_glist_ptr)
{
	unsigned char		x_blk_lmap;
	unsigned short		temp_ushort;
	int			rec_size1, rec_size2;
	int			wlevel, nslevel, dest_blk_level;
	int			piece_len1, piece_len2, first_offset, second_offset,
				work_blk_size, work_parent_size, dest_blk_size, dest_parent_size;
	int			dest_child_cycle;
	int			blk_seg_cnt, blk_size;
	trans_num		ctn;
	int			key_len, key_len_dir;
	block_id		dest_blk_id, work_blk_id, child1, child2;
	enum cdb_sc		status;
	srch_hist 		*dest_hist_ptr, *dir_hist_ptr;
	cache_rec_ptr_t		dest_child_cr;
	blk_segment		*bs1, *bs_ptr;
	sm_uc_ptr_t		saved_blk, work_blk_ptr, work_parent_ptr, dest_parent_ptr, dest_blk_ptr,
				bn_ptr, bmp_buff, tblk_ptr, rec_base, rPtr1;
	boolean_t		gbl_target_was_set, blk_was_free, deleted;
	gv_namehead		*save_targ;
	srch_blk_status		bmlhist, destblkhist, *hist_ptr;
	unsigned char    	save_cw_set_depth;
	cw_set_element		*tmpcse;
	jnl_buffer_ptr_t	jbbp; /* jbbp is non-NULL only if before-image journaling */
	unsigned int		bsiz;
	DCL_THREADGBL_ACCESS;

	SETUP_THREADGBL_ACCESS;
	dest_blk_id = *pdest_blk_id;
	CHECK_AND_RESET_UPDATE_ARRAY;	/* reset update_array_ptr to update_array */
	if (NULL == TREF(gv_reorgkey))
		GVKEY_INIT(TREF(gv_reorgkey), DBKEYSIZE(MAX_KEY_SZ));
	dest_hist_ptr = &(reorg_gv_target->hist);
	dir_hist_ptr = reorg_gv_target->alt_hist;
	blk_size = cs_data->blk_size;
	work_parent_ptr = gv_target->hist.h[level+1].buffaddr;
	work_parent_size = ((blk_hdr_ptr_t)work_parent_ptr)->bsiz;
	work_blk_ptr = gv_target->hist.h[level].buffaddr;
	work_blk_size = ((blk_hdr_ptr_t)work_blk_ptr)->bsiz;
	work_blk_id = gv_target->hist.h[level].blk_num;
	if (blk_size < work_blk_size)
	{
		assert(t_tries < CDB_STAGNATE);
		return cdb_sc_blkmod;
	}
	cws_reorg_remove_index = 0;
	/*===== Infinite loop to find the destination block =====*/
	for ( ; ; )
	{
		blk_was_free = FALSE;
		INCR_BLK_NUM(dest_blk_id);
		/* A pre-order traversal should not cause a child block to go to its parent.
		 * However, in case it happens (because the organization was already like that, or for any other reason), skip the swap.
		 * If we ever decide to allow such a swap, the code below must be changed to handle that special case.
		 * A grand-child can still go to its grand-parent; this is rare and the following code handles it.
		 */
		if (dest_blk_id == gv_target->hist.h[level+1].blk_num)
			continue;
		if (cs_data->trans_hist.total_blks <= dest_blk_id || dest_blk_id == work_blk_id)
		{
			*pdest_blk_id = dest_blk_id;
			return cdb_sc_oprnotneeded;
		}
		ctn = cs_addrs->ti->curr_tn;
		/* We need to save the block numbers that were NEWLY ADDED (since entering this function "mu_swap_blk")
		 * through the CWS_INSERT macro (in db_csh_get/db_csh_getn which can be called by t_qread or gvcst_search below).
		 * This is so that we can delete these blocks from the "cw_stagnate" hashtable in case we determine the need to
		 * choose a different "dest_blk_id" in this for loop (i.e. come to the next iteration). If these blocks are not
		 * deleted, then the hashtable will keep growing (a good example will be if -EXCLUDE qualifier is specified and
		 * a lot of prospective dest_blk_ids get skipped because they contain EXCLUDEd global variables) and very soon
		 * the hashtable will contain more entries than there are global buffers and at that point db_csh_getn will not
		 * be able to get a free global buffer for a new block (since it checks the "cw_stagnate" hashtable before reusing
		 * a buffer in case of MUPIP REORG). To delete these previous iteration blocks, we use the "cws_reorg_remove_array"
		 * variable. This array should have enough entries to accommodate the maximum number of blocks that can be t_qread
		 * in one iteration down below. And that number is the sum of
		 *	+     MAX_BT_DEPTH : for the t_qread while loop down the tree done below
		 *	+ 2 * MAX_BT_DEPTH : for the two calls to gvcst_search done below
		 *	+ 2                : 1 for the t_qread of dest_blk_id and 1 more for the t_qread of a
		 *			     bitmap block done inside the call to get_lmap below
		 *	= 3 * MAX_BT_DEPTH + 2
		 * To be safe, we give a buffer of MAX_BT_DEPTH elements i.e. (4 * MAX_BT_DEPTH) + 2.
		 * This is defined in the macro CWS_REMOVE_ARRAYSIZE in cws_insert.h
		 */
		/* reset whatever blocks the previous iteration of this for loop had filled in the cw_stagnate hashtable */
		for ( ; cws_reorg_remove_index > 0; cws_reorg_remove_index--)
		{
			deleted = delete_hashtab_int4(&cw_stagnate, (uint4 *)&cws_reorg_remove_array[cws_reorg_remove_index]);
			assert(deleted);
		}
		/* Read the corresponding bitmap block before attempting to read the destination block.
		 * If the bitmap indicates the block is free, we will not read the destination block.
		 */
		bmp_buff = get_lmap(dest_blk_id, &x_blk_lmap, (sm_int_ptr_t)&bmlhist.cycle, &bmlhist.cr);
		if (!bmp_buff || BLK_MAPINVALID == x_blk_lmap ||
			((blk_hdr_ptr_t)bmp_buff)->bsiz != BM_SIZE(BLKS_PER_LMAP) ||
			((blk_hdr_ptr_t)bmp_buff)->levl != LCL_MAP_LEVL)
		{
			assert(CDB_STAGNATE > t_tries);
			return cdb_sc_badbitmap;
		}
		if (BLK_FREE != x_blk_lmap)
		{	/* x_blk_lmap is either BLK_BUSY or BLK_RECYCLED. In either case, we need to read destination block
			 * in case we later detect that the before-image needs to be written.
			 */
			if (!(dest_blk_ptr = t_qread(dest_blk_id, (sm_int_ptr_t)&destblkhist.cycle, &destblkhist.cr)))
			{
				assert(t_tries < CDB_STAGNATE);
				return (enum cdb_sc)rdfail_detail;
			}
			destblkhist.blk_num = dest_blk_id;
			destblkhist.buffaddr = dest_blk_ptr;
			destblkhist.level = dest_blk_level = ((blk_hdr_ptr_t)dest_blk_ptr)->levl;
		}
		if (BLK_BUSY != x_blk_lmap)
		{	/* x_blk_lmap is either BLK_FREE or BLK_RECYCLED, both of which mean the block is not used in the bitmap */
			blk_was_free = TRUE;
			break;
		}
		/* dest_blk_id might contain a *-record only.
		 * So follow the pointer to go to the data/index block, which has a non-* key to search.
		 */
		nslevel = dest_blk_level;
		if (MAX_BT_DEPTH <= nslevel)
		{
			assert(CDB_STAGNATE > t_tries);
			return cdb_sc_maxlvl;
		}
		rec_base = dest_blk_ptr + SIZEOF(blk_hdr);
		GET_RSIZ(rec_size1, rec_base);
		tblk_ptr = dest_blk_ptr;
		while ((BSTAR_REC_SIZE == rec_size1) && (0 != nslevel))
		{
			GET_LONG(child1, (rec_base + SIZEOF(rec_hdr)));
			if (0 == child1 || child1 > cs_data->trans_hist.total_blks - 1)
			{
				assert(t_tries < CDB_STAGNATE);
				return cdb_sc_rdfail;
			}
			if (!(tblk_ptr = t_qread(child1, (sm_int_ptr_t)&dest_child_cycle, &dest_child_cr)))
			{
				assert(t_tries < CDB_STAGNATE);
				return (enum cdb_sc)rdfail_detail;
			}
			/* A leaf of a killed GVT can consist of a block header only; skip those blocks */
			if (SIZEOF(blk_hdr) >= ((blk_hdr_ptr_t)tblk_ptr)->bsiz)
				break;
			nslevel--;
			rec_base = tblk_ptr + SIZEOF(blk_hdr);
			GET_RSIZ(rec_size1, rec_base);
		}
		/* A leaf of a killed GVT can consist of a block header only; skip those blocks */
		if (SIZEOF(blk_hdr) >= ((blk_hdr_ptr_t)tblk_ptr)->bsiz)
			continue;
		/* get length of global variable name (do not read subscript) for dest_blk_id */
		GET_GBLNAME_LEN(key_len_dir, rec_base + SIZEOF(rec_hdr));
		/* key_len = length of 1st key value (including subscript) for dest_blk_id */
		GET_KEY_LEN(key_len, rec_base + SIZEOF(rec_hdr));
		if ((1 >= key_len_dir || MAX_MIDENT_LEN + 1 < key_len_dir) || (2 >= key_len || MAX_KEY_SZ < key_len))
		{	/* Earlier we used to restart here unconditionally. But dest_blk_id can be a block
			 * that was just killed and is still marked busy. Skip it if we are in the last retry.
			 */
			if (CDB_STAGNATE <= t_tries)
				continue;
			else
				return cdb_sc_blkmod;
		}
		memcpy(&((TREF(gv_reorgkey))->base[0]), rec_base + SIZEOF(rec_hdr), key_len_dir);
		(TREF(gv_reorgkey))->base[key_len_dir] = 0;
		(TREF(gv_reorgkey))->end = key_len_dir;
		if (exclude_glist_ptr->next)
		{	/* exclude blocks for globals in the list of EXCLUDE option */
			if  (in_exclude_list(&((TREF(gv_reorgkey))->base[0]), key_len_dir - 1, exclude_glist_ptr))
				continue;
		}
		save_targ = gv_target;
		if (INVALID_GV_TARGET != reset_gv_target)
			gbl_target_was_set = TRUE;
		else
		{
			gbl_target_was_set = FALSE;
			reset_gv_target = save_targ;
		}
		gv_target = reorg_gv_target;
		gv_target->root = cs_addrs->dir_tree->root;
		gv_target->clue.end = 0;
		/* assign Directory tree path to find dest_blk_id in dir_hist_ptr */
		status = gvcst_search(TREF(gv_reorgkey), dir_hist_ptr);
		if (cdb_sc_normal != status)
		{
			assert(t_tries < CDB_STAGNATE);
			RESET_GV_TARGET_LCL_AND_CLR_GBL(save_targ, DO_GVT_GVKEY_CHECK);
			return status;
		}
		if (dir_hist_ptr->h[0].curr_rec.match != (TREF(gv_reorgkey))->end + 1)
		{	/* may be in a kill_set of another process */
			RESET_GV_TARGET_LCL_AND_CLR_GBL(save_targ, DO_GVT_GVKEY_CHECK);
			continue;
		}
		for (wlevel = 0; wlevel <= dir_hist_ptr->depth &&
			dir_hist_ptr->h[wlevel].blk_num != dest_blk_id; wlevel++);
		if (dir_hist_ptr->h[wlevel].blk_num == dest_blk_id)
		{	/* do not swap a dir_tree block */
			RESET_GV_TARGET_LCL_AND_CLR_GBL(save_targ, DO_GVT_GVKEY_CHECK);
			continue;
		}
		/* gv_reorgkey will now have the first key from dest_blk_id,
		 * or, from a descendant of dest_blk_id (in case it had a *-key only).
		 */
		memcpy(&((TREF(gv_reorgkey))->base[0]), rec_base + SIZEOF(rec_hdr), key_len);
		(TREF(gv_reorgkey))->end = key_len - 1;
		GET_KEY_LEN(key_len_dir, dir_hist_ptr->h[0].buffaddr + dir_hist_ptr->h[0].curr_rec.offset + SIZEOF(rec_hdr));
		/* Get root of GVT for dest_blk_id */
		GET_LONG(gv_target->root,
			dir_hist_ptr->h[0].buffaddr + dir_hist_ptr->h[0].curr_rec.offset + SIZEOF(rec_hdr) + key_len_dir);
		if ((0 == gv_target->root) || (gv_target->root > (cs_data->trans_hist.total_blks - 1)))
		{
			assert(t_tries < CDB_STAGNATE);
			RESET_GV_TARGET_LCL_AND_CLR_GBL(save_targ, DO_GVT_GVKEY_CHECK);
			return cdb_sc_blkmod;
		}
		/* Assign Global Variable Tree path to find dest_blk_id in dest_hist_ptr */
		gv_target->clue.end = 0;
		status = gvcst_search(TREF(gv_reorgkey), dest_hist_ptr);
		RESET_GV_TARGET_LCL_AND_CLR_GBL(save_targ, DO_GVT_GVKEY_CHECK);
		if (dest_blk_level >= dest_hist_ptr->depth || /* do not swap in root level */
			dest_hist_ptr->h[dest_blk_level].blk_num != dest_blk_id) /* must be in a kill set of another process. */
			continue;
		if ((cdb_sc_normal != status) || (dest_hist_ptr->h[nslevel].curr_rec.match != ((TREF(gv_reorgkey))->end + 1)))
		{
			assert(t_tries < CDB_STAGNATE);
			return (cdb_sc_normal != status ? status : cdb_sc_blkmod);
		}
		for (wlevel = nslevel; wlevel <= dest_blk_level; wlevel++)
			dest_hist_ptr->h[wlevel].tn = ctn;
		dest_blk_ptr = dest_hist_ptr->h[dest_blk_level].buffaddr;
		dest_blk_size = ((blk_hdr_ptr_t)dest_blk_ptr)->bsiz;
		dest_parent_ptr = dest_hist_ptr->h[dest_blk_level+1].buffaddr;
		dest_parent_size = ((blk_hdr_ptr_t)dest_parent_ptr)->bsiz;
		break;
	}
	/*===== End of infinite loop to find the destination block =====*/
	/*-----------------------------------------------------
	   Now modify blocks for swapping. Maximum of 4 blocks.
	   -----------------------------------------------------*/
	if (!blk_was_free)
	{	/* 1: dest_blk_id into work_blk_id */
		BLK_INIT(bs_ptr, bs1);
		BLK_SEG(bs_ptr, dest_blk_ptr + SIZEOF(blk_hdr), dest_blk_size - SIZEOF(blk_hdr));
		if (!BLK_FINI (bs_ptr,bs1))
		{
			assert(t_tries < CDB_STAGNATE);
			return cdb_sc_blkmod;
		}
		assert(gv_target->hist.h[level].blk_num == work_blk_id);
		assert(gv_target->hist.h[level].buffaddr == work_blk_ptr);
		t_write(&gv_target->hist.h[level], (unsigned char *)bs1, 0, 0, dest_blk_level, TRUE, TRUE, GDS_WRITE_KILLTN);
	}
	/* 2: work_blk_id into dest_blk_id */
	if (!blk_was_free && work_blk_id == dest_hist_ptr->h[dest_blk_level+1].blk_num)
	{	/* work_blk_id will be swapped with its child.
		 * This is the only vertical swap: here the working block goes to its child.
		 * The working block cannot go to its parent because of the traversal order.
		 */
		if (dest_blk_level + 1 != level || dest_parent_size != work_blk_size)
		{
			assert(t_tries < CDB_STAGNATE);
			return cdb_sc_blkmod;
		}
		BLK_INIT(bs_ptr, bs1);
		BLK_ADDR(saved_blk, dest_parent_size, unsigned char);
		memcpy(saved_blk, dest_parent_ptr, dest_parent_size);
		first_offset = dest_hist_ptr->h[dest_blk_level+1].curr_rec.offset;
		GET_RSIZ(rec_size1, saved_blk + first_offset);
		if (work_blk_size < first_offset + rec_size1)
		{
			assert(t_tries < CDB_STAGNATE);
			return cdb_sc_blkmod;
		}
		piece_len1 =  first_offset + rec_size1;
		BLK_SEG(bs_ptr, saved_blk + SIZEOF(blk_hdr), piece_len1 - SIZEOF(block_id) - SIZEOF(blk_hdr));
		BLK_ADDR(bn_ptr, SIZEOF(block_id), unsigned char);
		PUT_LONG(bn_ptr, work_blk_id); /* since work_blk_id will now be the child of dest_blk_id */
		BLK_SEG(bs_ptr, bn_ptr, SIZEOF(block_id));
		BLK_SEG(bs_ptr, saved_blk + piece_len1, dest_parent_size - piece_len1);
		if (!BLK_FINI(bs_ptr, bs1))
		{
			assert(t_tries < CDB_STAGNATE);
			return cdb_sc_blkmod;
		}
		assert(dest_blk_id == dest_hist_ptr->h[dest_blk_level].blk_num);
		assert(dest_blk_ptr == dest_hist_ptr->h[dest_blk_level].buffaddr);
		t_write(&dest_hist_ptr->h[dest_blk_level], (unsigned char *)bs1, 0, 0, level, TRUE, TRUE, GDS_WRITE_KILLTN);
	} else /* free block, or the working block does not move vertically (no swap with parent/child) */
	{
		BLK_INIT(bs_ptr, bs1);
		BLK_ADDR(saved_blk, work_blk_size, unsigned char);
		memcpy(saved_blk, work_blk_ptr, work_blk_size);
		BLK_SEG(bs_ptr, saved_blk + SIZEOF(blk_hdr), work_blk_size - SIZEOF(blk_hdr));
		if (!BLK_FINI(bs_ptr, bs1))
		{
			assert(t_tries < CDB_STAGNATE);
			return cdb_sc_blkmod;
		}
		if (blk_was_free)
		{
			tmpcse = &cw_set[cw_set_depth];
			t_create(dest_blk_id, (unsigned char *)bs1, 0, 0, level);
			/* Although we invoked t_create, we do not want t_end to allocate the block (i.e. change mode
			 * from gds_t_create to gds_t_acquired). Instead we do that and a little more (that t_end does) all here.
			 */
			assert(dest_blk_id == tmpcse->blk);
			tmpcse->mode = gds_t_acquired;
			/* If snapshots are in progress, we might want to read the before images of the FREE blocks also.
			 * Since mu_swap_blk mimics a small part of t_end, it sets cse->mode to gds_t_acquired and hence
			 * will not read the before images of the FREE blocks in t_end. To workaround this, set
			 * cse->was_free to TRUE so that in t_end, this condition can be used to read the before images of
			 * the FREE blocks if needed.
			 */
			(BLK_FREE == x_blk_lmap) ? SET_FREE(tmpcse) : SET_NFREE(tmpcse);
			/* No need to write before-image in case the block is FREE. In case the database had never been fully
			 * upgraded from V4 to V5 format (after the MUPIP UPGRADE), all RECYCLED blocks can basically be considered
			 * FREE (i.e. no need to write before-images since backward journal recovery will never be expected
			 * to take the database to a point BEFORE the mupip upgrade).
			 */
			if ((BLK_FREE == x_blk_lmap) || !cs_data->db_got_to_v5_once)
				tmpcse->old_block = NULL;
			else
			{	/* Destination is a recycled block that needs a before image */
				tmpcse->old_block = destblkhist.buffaddr;
				/* Record cr,cycle. This is used later in t_end to determine if checksums need to be recomputed */
				tmpcse->cr = destblkhist.cr;
				tmpcse->cycle = destblkhist.cycle;
				jbbp = (JNL_ENABLED(cs_addrs) && cs_addrs->jnl_before_image) ? cs_addrs->jnl->jnl_buff : NULL;
				if ((NULL != jbbp) && (((blk_hdr_ptr_t)tmpcse->old_block)->tn < jbbp->epoch_tn))
				{	/* Compute CHECKSUM for writing PBLK record before getting crit.
					 * It is possible that we are reading a block that is actually marked free in
					 * the bitmap (due to concurrency issues at this point). Therefore we might be
					 * actually reading uninitialized block headers and in turn a bad value of
					 * "old_block->bsiz". Restart if we ever access a buffer whose size is greater
					 * than the db block size.
					 */
					bsiz = ((blk_hdr_ptr_t)(tmpcse->old_block))->bsiz;
					if (bsiz > blk_size)
					{
						assert(CDB_STAGNATE > t_tries);
						return cdb_sc_lostbmlcr;
					}
					JNL_GET_CHECKSUM_ACQUIRED_BLK(tmpcse, cs_data, cs_addrs, tmpcse->old_block, bsiz);
				}
			}
			assert(GDSVCURR == tmpcse->ondsk_blkver);	/* should have been set by t_create above */
		} else
		{
			hist_ptr = &dest_hist_ptr->h[dest_blk_level];
			assert(dest_blk_id == hist_ptr->blk_num);
			assert(dest_blk_ptr == hist_ptr->buffaddr);
			t_write(hist_ptr, (unsigned char *)bs1, 0, 0, level, TRUE, TRUE, GDS_WRITE_KILLTN);
		}
	}
	if (!blk_was_free)
	{	/* 3: Parent of destination block (may be parent of working block too) */
		if (gv_target->hist.h[level+1].blk_num == dest_hist_ptr->h[dest_blk_level+1].blk_num)
		{	/* dest parent == work_blk parent */
			BLK_INIT(bs_ptr, bs1);
			/* Interchange pointer to dest_blk_id and work_blk_id */
			if (level != dest_blk_level ||
				gv_target->hist.h[level+1].curr_rec.offset == dest_hist_ptr->h[level+1].curr_rec.offset)
			{
				assert(t_tries < CDB_STAGNATE);
				return cdb_sc_blkmod;
			}
			if (gv_target->hist.h[level+1].curr_rec.offset < dest_hist_ptr->h[level+1].curr_rec.offset)
			{
				first_offset = gv_target->hist.h[level+1].curr_rec.offset;
				second_offset = dest_hist_ptr->h[level+1].curr_rec.offset;
			} else
			{
				first_offset = dest_hist_ptr->h[level+1].curr_rec.offset;
				second_offset = gv_target->hist.h[level+1].curr_rec.offset;
			}
			GET_RSIZ(rec_size1, dest_parent_ptr + first_offset);
			GET_RSIZ(rec_size2, dest_parent_ptr + second_offset);
			if (dest_parent_size < first_offset + rec_size1 ||
				dest_parent_size < second_offset + rec_size2 ||
				BSTAR_REC_SIZE >= rec_size1 || BSTAR_REC_SIZE > rec_size2)
			{
				assert(t_tries < CDB_STAGNATE);
				return cdb_sc_blkmod;
			}
			piece_len1 =  first_offset + rec_size1 - SIZEOF(block_id);
			piece_len2 =  second_offset + rec_size2 - SIZEOF(block_id);
			GET_LONG(child1, dest_parent_ptr + piece_len1);
			GET_LONG(child2, dest_parent_ptr + piece_len2);
			BLK_SEG(bs_ptr, dest_parent_ptr + SIZEOF(blk_hdr), piece_len1 - SIZEOF(blk_hdr));
			BLK_ADDR(bn_ptr, SIZEOF(block_id), unsigned char);
			PUT_LONG(bn_ptr, child2);
			BLK_SEG(bs_ptr, bn_ptr, SIZEOF(block_id));
			BLK_SEG(bs_ptr, dest_parent_ptr + first_offset + rec_size1,
				second_offset + rec_size2 - SIZEOF(block_id) - first_offset - rec_size1);
			BLK_ADDR(bn_ptr, SIZEOF(block_id), unsigned char);
			PUT_LONG(bn_ptr, child1);
			BLK_SEG(bs_ptr, bn_ptr, SIZEOF(block_id));
			BLK_SEG(bs_ptr, dest_parent_ptr + second_offset + rec_size2,
				dest_parent_size - second_offset - rec_size2);
			if (!BLK_FINI(bs_ptr,bs1))
			{
				assert(t_tries < CDB_STAGNATE);
				return cdb_sc_blkmod;
			}
			assert(level == dest_blk_level);
			assert(dest_parent_ptr == dest_hist_ptr->h[level+1].buffaddr);
			t_write(&dest_hist_ptr->h[level+1], (unsigned char *)bs1, 0, 0, level+1, FALSE, TRUE, GDS_WRITE_KILLTN);
		} else if (work_blk_id != dest_hist_ptr->h[dest_blk_level+1].blk_num)
		{	/* Destination block moved in the position of working block.
			 * So destination block's parent's pointer should be changed to work_blk_id
			 */
			BLK_INIT(bs_ptr, bs1);
			GET_RSIZ(rec_size1, dest_parent_ptr + dest_hist_ptr->h[dest_blk_level+1].curr_rec.offset);
			if (dest_parent_size < rec_size1 +  dest_hist_ptr->h[dest_blk_level+1].curr_rec.offset ||
				BSTAR_REC_SIZE > rec_size1)
			{
				assert(t_tries < CDB_STAGNATE);
				return cdb_sc_blkmod;
			}
			BLK_SEG (bs_ptr, dest_parent_ptr + SIZEOF(blk_hdr),
			    dest_hist_ptr->h[dest_blk_level+1].curr_rec.offset + rec_size1 - SIZEOF(blk_hdr) - SIZEOF(block_id));
			BLK_ADDR(bn_ptr, SIZEOF(block_id), unsigned char);
			PUT_LONG(bn_ptr, work_blk_id);
			BLK_SEG(bs_ptr, bn_ptr, SIZEOF(block_id));
			BLK_SEG(bs_ptr, dest_parent_ptr + dest_hist_ptr->h[dest_blk_level+1].curr_rec.offset + rec_size1,
				dest_parent_size - dest_hist_ptr->h[dest_blk_level+1].curr_rec.offset  - rec_size1);
			if (!BLK_FINI(bs_ptr,bs1))
			{
				assert(t_tries < CDB_STAGNATE);
				return cdb_sc_blkmod;
			}
			assert(dest_parent_ptr == dest_hist_ptr->h[dest_blk_level+1].buffaddr);
			t_write(&dest_hist_ptr->h[dest_blk_level+1], (unsigned char *)bs1, 0, 0, dest_blk_level+1,
				FALSE, TRUE, GDS_WRITE_KILLTN);
		}
	}
	/* 4: Parent of working block, if different than destination's parent or, destination was a free block */
	if (blk_was_free || gv_target->hist.h[level+1].blk_num != dest_hist_ptr->h[dest_blk_level+1].blk_num)
	{	/* The parent of the working block should correctly point to it, since the working block went to dest_blk_id */
		GET_RSIZ(rec_size1, (work_parent_ptr + gv_target->hist.h[level+1].curr_rec.offset));
		if (work_parent_size < rec_size1 +  gv_target->hist.h[level+1].curr_rec.offset || BSTAR_REC_SIZE > rec_size1)
		{
			assert(t_tries < CDB_STAGNATE);
			return cdb_sc_blkmod;
		}
		BLK_INIT(bs_ptr, bs1);
		BLK_SEG(bs_ptr, work_parent_ptr + SIZEOF(blk_hdr),
			gv_target->hist.h[level+1].curr_rec.offset + rec_size1 - SIZEOF(blk_hdr) - SIZEOF(block_id));
		BLK_ADDR(bn_ptr, SIZEOF(block_id), unsigned char);
		PUT_LONG(bn_ptr, dest_blk_id);
		BLK_SEG(bs_ptr, bn_ptr, SIZEOF(block_id));
		BLK_SEG(bs_ptr, work_parent_ptr + gv_target->hist.h[level+1].curr_rec.offset + rec_size1,
			work_parent_size - gv_target->hist.h[level+1].curr_rec.offset - rec_size1);
		if (!BLK_FINI(bs_ptr, bs1))
		{
			assert(t_tries < CDB_STAGNATE);
			return cdb_sc_blkmod;
		}
		assert(gv_target->hist.h[level+1].buffaddr == work_parent_ptr);
		t_write(&gv_target->hist.h[level+1], (unsigned char *)bs1, 0, 0, level+1, FALSE, TRUE, GDS_WRITE_KILLTN);
	}
	/* else already taken care of, when dest_blk_id moved */
	if (blk_was_free)
	{	/* A free/recycled block will become a busy block, so the local bitmap must be updated.
		 * The local bitmap block is added to the update array for concurrency checking, and a
		 * cw_set element is created to mark the free/recycled block as busy in the bitmap.
		 * kill_set_ptr records the working block, which will become free.
		 */
		child1 = ROUND_DOWN2(dest_blk_id, BLKS_PER_LMAP); /* bit map block */
		bmlhist.buffaddr = bmp_buff;
		bmlhist.blk_num = child1;
		child1 = dest_blk_id - child1;
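		/* child1 now holds dest_blk_id's offset within its local bitmap; it cannot be 0 because
		 * block 0 of every local map is the bitmap block itself (hence the assert below)
		 */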
		assert(child1);
		PUT_LONG(update_array_ptr, child1);
		/* Need to put bit maps on the end of the cw set for concurrency checking.
		 * We want to simulate t_write_map, except we want to update "cw_map_depth" instead of "cw_set_depth".
		 * Hence the save and restore logic (for "cw_set_depth") below.
		 */
		save_cw_set_depth = cw_set_depth;
		assert(!cw_map_depth);
		t_write_map(&bmlhist, (uchar_ptr_t)update_array_ptr, ctn, 1);	/* will increment cw_set_depth */
		cw_map_depth = cw_set_depth;		/* set cw_map_depth to the latest cw_set_depth */
		cw_set_depth = save_cw_set_depth;	/* restore cw_set_depth */
		/* t_write_map simulation end */
		update_array_ptr += SIZEOF(block_id);
		child1 = 0;
		PUT_LONG(update_array_ptr, child1);
		update_array_ptr += SIZEOF(block_id);
		assert(1 == cw_set[cw_map_depth - 1].reference_cnt);	/* 1 free block is now becoming BLK_USED in the bitmap */
		/* working block will be removed */
		kill_set_ptr->blk[kill_set_ptr->used].flag = 0;
		kill_set_ptr->blk[kill_set_ptr->used].level = 0;
		kill_set_ptr->blk[kill_set_ptr->used++].block = work_blk_id;
	}
	*pdest_blk_id = dest_blk_id;
	return cdb_sc_normal;
}
Exemplo n.º 15
0
bt_rec_ptr_t bt_put(gd_region *reg, int4 block)
{
	bt_rec_ptr_t		bt, q0, q1, hdr;
	sgmnt_addrs		*csa;
	sgmnt_data_ptr_t	csd;
	cache_rec_ptr_t		cr;
	th_rec_ptr_t		th;
	trans_num		lcl_tn;
	uint4			lcnt;

	csa = (sgmnt_addrs *)&FILE_INFO(reg)->s_addrs;
	csd = csa->hdr;
	assert(csa->now_crit || csd->clustered);
	assert(dba_mm != csa->hdr->acc_meth);
	lcl_tn = csa->ti->curr_tn;
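	/* hash the block number into one of the bt_buckets queue heads */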
	hdr = csa->bt_header + (block % csd->bt_buckets);
	assert(BT_QUEHEAD == hdr->blk);
	for (lcnt = 0, bt = (bt_rec_ptr_t)((sm_uc_ptr_t)hdr + hdr->blkque.fl);  ;
		bt = (bt_rec_ptr_t)((sm_uc_ptr_t)bt + bt->blkque.fl), lcnt++)
	{
		if (BT_QUEHEAD == bt->blk)
		{	/* there is no matching bt */
			assert(bt == hdr);
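			/* No bt exists for this block: reuse the least recently used one, i.e. the bt at the
			 * head of the tn queue. The tnque links are self-relative offsets and tnque does not
			 * sit at the start of the record, so subtract SIZEOF(th->tnque) to get the bt_rec base.
			 */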
			bt = (bt_rec_ptr_t)((sm_uc_ptr_t)(csa->th_base) + csa->th_base->tnque.fl - SIZEOF(th->tnque));
			if (CR_NOTVALID != bt->cache_index)
			{	/* the oldest bt is still valid */
				assert(!in_wcs_recover);
				cr = (cache_rec_ptr_t)GDS_ANY_REL2ABS(csa, bt->cache_index);
				if (cr->dirty)
				{	/* get it written so it can be reused */
					BG_TRACE_PRO_ANY(csa, bt_put_flush_dirty);
					if (FALSE == wcs_get_space(reg, 0, cr))
					{
						assert(csa->nl->wc_blocked);	/* only reason we currently know
										 * why wcs_get_space could fail */
						assert(gtm_white_box_test_case_enabled);
						BG_TRACE_PRO_ANY(csa, wcb_bt_put);
						send_msg(VARLSTCNT(8) ERR_WCBLOCKED, 6, LEN_AND_LIT("wcb_bt_put"),
							process_id, &lcl_tn, DB_LEN_STR(reg));
						return NULL;
					}
				}
				bt->cache_index = CR_NOTVALID;
				cr->bt_index = 0;
			}
			q0 = (bt_rec_ptr_t)((sm_uc_ptr_t)bt + bt->blkque.fl);
			q1 = (bt_rec_ptr_t)remqt((que_ent_ptr_t)q0);
			if (EMPTY_QUEUE == (sm_long_t)q1)
				rts_error(VARLSTCNT(3) ERR_BTFAIL, 1, 1);
			bt->blk = block;
			bt->killtn = lcl_tn;
			insqt((que_ent_ptr_t)bt, (que_ent_ptr_t)hdr);
			th = (th_rec_ptr_t)remqh((que_ent_ptr_t)csa->th_base);
			if (EMPTY_QUEUE == (sm_long_t)th)
				GTMASSERT;
			break;
		}
		if (bt->blk == block)
		{	/* bt_put should never be called twice for the same block with the same lcl_tn. This is because
			 * t_end/tp_tend update every block only once as part of each update transaction. Assert this.
			 * The two exceptions are
			 *   a) Forward journal recovery which simulates a 2-phase M-kill where the same block
			 *	could get updated in both phases (example bitmap block gets updated for blocks created
			 *	within the TP transaction as well as for blocks that are freed up in the 2nd phase of
			 *	the M-kill) with the same transaction number. This is because although GT.M would have
			 *	updated the same block with different transaction numbers in the two phases, forward
			 *	recovery will update it with the same tn and instead increment the db tn on seeing the
			 *	following INCTN journal record(s).
			 *   b) Cache recovery (wcs_recover). It could call bt_put more than once for the same block
			 *	and potentially with the same tn. This is because the state of the queues is questionable
			 *	and there could be more than one cache record for a given block number.
			 */
			assert(in_wcs_recover || (bt->tn < lcl_tn) || (jgbl.forw_phase_recovery && !JNL_ENABLED(csa)));
			q0 = (bt_rec_ptr_t)((sm_uc_ptr_t)bt + bt->tnque.fl);
			th = (th_rec_ptr_t)remqt((que_ent_ptr_t)((sm_uc_ptr_t)q0 + SIZEOF(th->tnque)));
			if (EMPTY_QUEUE == (sm_long_t)th)
				GTMASSERT;
			break;
		}
		if (0 == bt->blkque.fl)
			rts_error(VARLSTCNT(3) ERR_BTFAIL, 1, 2);
		if (lcnt >= csd->n_bts)
			rts_error(VARLSTCNT(3) ERR_BTFAIL, 1, 3);
	}
	insqt((que_ent_ptr_t)th, (que_ent_ptr_t)csa->th_base);
	bt->tn = lcl_tn;
	return bt;
}
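The queue walks in bt_put above use links (blkque.fl, tnque.fl) that are byte offsets relative to the link's own address rather than absolute pointers, which keeps the shared-memory structures valid no matter where each process maps them. Below is a self-contained sketch of that idiom with made-up types (forward links only; not the real que_ent/bt_rec layout).

#include <stddef.h>
#include <stdio.h>

typedef struct
{
	ptrdiff_t	fl;	/* forward link: byte offset from this link to the next one */
} rel_que;

typedef struct
{
	rel_que	blkque;	/* embedded queue link; first member, so link address == record address */
	int	blk;
} bt_like;

#define QUE_NEXT(q)	((rel_que *)((char *)(q) + (q)->fl))

int main(void)
{
	bt_like	hdr = { {0}, -1 }, a = { {0}, 1 }, b = { {0}, 2 };
	rel_que	*q;

	/* hand-build the circular queue hdr -> a -> b -> hdr */
	hdr.blkque.fl = (char *)&a - (char *)&hdr;
	a.blkque.fl = (char *)&b - (char *)&a;
	b.blkque.fl = (char *)&hdr - (char *)&b;
	for (q = QUE_NEXT(&hdr.blkque); q != &hdr.blkque; q = QUE_NEXT(q))
		printf("blk %d\n", ((bt_like *)q)->blk);
	return 0;
}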
Exemplo n.º 16
0
bool gtcmtr_bufflush(void)
{
	cm_region_list	*reg_ref;
	mval		v;
	short		n;
	unsigned short	num_trans, data_len;
	unsigned char	buff[MAX_ZWR_KEY_SZ], *end;
	unsigned char	*ptr, regnum, len, cc, prv;
	static readonly gds_file_id file;

	error_def(ERR_KEY2BIG);
	error_def(ERR_REC2BIG);
	error_def(ERR_GVIS);

	ptr = curr_entry->clb_ptr->mbf;
	assert(*ptr == CMMS_B_BUFFLUSH);
	ptr++;
	v.mvtype = MV_STR;
	GET_USHORT(num_trans, ptr);
	ptr += sizeof (short);
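	/* Each transaction in the message is packed as: 1-byte region number, 1-byte key length,
	 * 1-byte compression count, 1-byte previous-key length, <len> key bytes, then a 2-byte
	 * data length followed by the data bytes (see the pointer walk below)
	 */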
	for (; num_trans-- > 0;)
	{
		regnum = *ptr++;
		reg_ref = gtcm_find_region(curr_entry, regnum);
		len = *ptr++;
		cc = *ptr++;
		prv = *ptr++;
		assert (len + cc - 1 < gv_currkey->top);
		memcpy(&gv_currkey->base[cc], ptr, len);
		ptr += len;
		gv_currkey->end = len + cc - 1;
		gv_currkey->prev = prv;
		assert(prv < gv_currkey->end);
		if ((n = gv_currkey->end + 1) > gv_cur_region->max_key_size)
		{
			if ((end = format_targ_key(&buff[0], MAX_ZWR_KEY_SZ, gv_currkey, TRUE)) == 0)
				end = &buff[MAX_ZWR_KEY_SZ - 1];
			rts_error(VARLSTCNT(11) ERR_KEY2BIG, 4, n, (int4)gv_cur_region->max_key_size,
				REG_LEN_STR(gv_cur_region), 0, ERR_GVIS, 2, end - buff, buff);
		}
		gtcm_bind_name(reg_ref->reghead, TRUE);
		if (JNL_ENABLED(cs_addrs->hdr))
		{
			cs_addrs->jnl->pini_addr = reg_ref->pini_addr;
			originator_prc_vec = curr_entry->pvec;
		}
		GET_USHORT(data_len, ptr);
		ptr += sizeof(short);
		v.str.len = data_len;
		v.str.addr = (char *)ptr;
		if (n + v.str.len + sizeof(rec_hdr) > gv_cur_region->max_rec_size)
		{
			if ((end = format_targ_key(&buff[0], MAX_ZWR_KEY_SZ, gv_currkey, TRUE)) == 0)
				end = &buff[MAX_ZWR_KEY_SZ - 1];
			rts_error(VARLSTCNT(11) ERR_REC2BIG, 4, n + v.str.len + sizeof(rec_hdr), (int4)gv_cur_region->max_rec_size,
				REG_LEN_STR(gv_cur_region), 0, ERR_GVIS, 2, end - buff, buff);
		}
		gvcst_put(&v);
		if (JNL_ENABLED(cs_addrs->hdr))
			reg_ref->pini_addr = cs_addrs->jnl->pini_addr; /* In case  journal switch occurred */
		ptr += data_len;
	}
	ptr = curr_entry->clb_ptr->mbf;
	*ptr++ = CMMS_C_BUFFLUSH;
	curr_entry->clb_ptr->cbl = S_HDRSIZE;
	return TRUE;
}
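The routine above is a typical cursor-style parse: a pointer walks fixed header bytes, then per-record variable-length pieces, with GET_USHORT handling 2-byte fields that may be unaligned. A minimal sketch of the same style follows; the record layout (1-byte tag, 2-byte native-endian length, payload) is invented for the example and is not the GNP wire format.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint16_t get_ushort(const unsigned char *p)
{
	uint16_t	v;

	memcpy(&v, p, sizeof(v));	/* memcpy makes the unaligned read safe */
	return v;
}

int main(void)
{
	unsigned char		buf[16], *w, tag;
	const unsigned char	*ptr, *top;
	uint16_t		len;

	w = buf;	/* build two records in the invented format */
	*w++ = 'A'; len = 2; memcpy(w, &len, sizeof(len)); w += sizeof(len); memcpy(w, "hi", 2); w += 2;
	*w++ = 'B'; len = 3; memcpy(w, &len, sizeof(len)); w += sizeof(len); memcpy(w, "xyz", 3); w += 3;
	for (ptr = buf, top = w; ptr < top; )
	{
		tag = *ptr++;
		len = get_ushort(ptr);
		ptr += sizeof(uint16_t);
		printf("tag %c: %.*s\n", tag, (int)len, (const char *)ptr);
		ptr += len;
	}
	return 0;
}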
Exemplo n.º 17
0
int4 gds_rundown(void)
{
	boolean_t		canceled_dbsync_timer, canceled_flush_timer, ok_to_write_pfin;
	boolean_t		have_standalone_access, ipc_deleted, err_caught;
	boolean_t		is_cur_process_ss_initiator, remove_shm, vermismatch, we_are_last_user, we_are_last_writer, is_mm;
	boolean_t		unsafe_last_writer;
	char			time_str[CTIME_BEFORE_NL + 2]; /* for GET_CUR_TIME macro */
	gd_region		*reg;
	int			save_errno, status, rc;
	int4			semval, ftok_semval, sopcnt, ftok_sopcnt;
	short			crash_count;
	sm_long_t		munmap_len;
	sgmnt_addrs		*csa;
	sgmnt_data_ptr_t	csd;
	node_local_ptr_t	cnl;
	struct shmid_ds		shm_buf;
	struct sembuf		sop[2], ftok_sop[2];
	uint4           	jnl_status;
	unix_db_info		*udi;
	jnl_private_control	*jpc;
	jnl_buffer_ptr_t	jbp;
	shm_snapshot_t		*ss_shm_ptr;
	uint4			ss_pid, onln_rlbk_pid, holder_pid;
	boolean_t		was_crit;
	boolean_t		safe_mode; /* Do not flush or take down shared memory. */
	boolean_t		bypassed_ftok = FALSE, bypassed_access = FALSE, may_bypass_ftok, inst_is_frozen,
				ftok_counter_halted,
				access_counter_halted;
	int			secshrstat;
	intrpt_state_t		prev_intrpt_state;
	DCL_THREADGBL_ACCESS;

	SETUP_THREADGBL_ACCESS;
	jnl_status = 0;
	reg = gv_cur_region;			/* Local copy */

	/* Early out for cluster regions to avoid tripping the assert below.
	 * Note: this early out is consistent with VMS. It has been noted that all of the gtcm
	 * assignments to gv_cur_region should use the TP_CHANGE_REG macro. This would also avoid
	 * the assert problem and should be done eventually.
	 */
	if (dba_cm == reg->dyn.addr->acc_meth)
		return EXIT_NRM;

	udi = FILE_INFO(reg);
	csa = &udi->s_addrs;
	csd = csa->hdr;
	assert(csa == cs_addrs && csd == cs_data);
	if ((reg->open) && (dba_usr == csd->acc_meth))
	{
		change_reg();
		gvusr_rundown();
		return EXIT_NRM;
	}
	/* If the process has standalone access, it has udi->grabbed_access_sem set to TRUE at this point. Note that down in a
	 * local variable since udi->grabbed_access_sem is set to TRUE even for non-standalone access below, so we cannot rely
	 * on it later to determine whether the process had standalone access when it entered this function. We need to
	 * guarantee that no one else accesses the database file header while the semid/shmid fields are reset. We have already
	 * created the ftok semaphore in db_init or mu_rndwn_file and did not remove it, so just lock it, in blocking mode.
	 */
	have_standalone_access = udi->grabbed_access_sem; /* process holds standalone access */
	DEFER_INTERRUPTS(INTRPT_IN_GDS_RUNDOWN, prev_intrpt_state);
	ESTABLISH_NORET(gds_rundown_ch, err_caught);
	if (err_caught)
	{
		REVERT;
		WITH_CH(gds_rundown_ch, gds_rundown_err_cleanup(have_standalone_access), 0);
		ENABLE_INTERRUPTS(INTRPT_IN_GDS_RUNDOWN, prev_intrpt_state);
		DEBUG_ONLY(ok_to_UNWIND_in_exit_handling = FALSE);
		return EXIT_ERR;
	}
	assert(reg->open);			/* if we failed to open, dbinit_ch should have taken care of proper clean up */
	assert(!reg->opening);			/* see comment above */
	assert((dba_bg == csd->acc_meth) || (dba_mm == csd->acc_meth));
	is_mm = (dba_bg != csd->acc_meth);
	assert(!csa->hold_onto_crit || (csa->now_crit && jgbl.onlnrlbk));
	/* If we are online rollback, we should already be holding crit and should release it only at the end of this module. This
	 * is usually done by noting down csa->now_crit in a local variable (was_crit) and using it whenever we are about to
	 * grab_crit. But, there are instances (like mupip_set_journal.c) where we grab_crit but invoke gds_rundown without any
	 * preceeding rel_crit. Such code relies on the fact that gds_rundown does rel_crit unconditionally (to get locks to a known
	 * state). So, augment csa->now_crit with jgbl.onlnrlbk to track if we can rel_crit unconditionally or not in gds_rundown.
	 */
	was_crit = (csa->now_crit && jgbl.onlnrlbk);
	/* Cancel any pending flush timer for this region by this task */
	canceled_flush_timer = FALSE;
	canceled_dbsync_timer = FALSE;
	CANCEL_DB_TIMERS(reg, csa, canceled_flush_timer, canceled_dbsync_timer);
	we_are_last_user = FALSE;
	inst_is_frozen = IS_REPL_INST_FROZEN && REPL_ALLOWED(csa->hdr);
	if (!csa->persistent_freeze)
		region_freeze(reg, FALSE, FALSE, FALSE);
	if (!was_crit)
	{
		rel_crit(reg);		/* get locks to known state */
		mutex_cleanup(reg);
	}
	/* The only process that can invoke gds_rundown while holding access control semaphore is RECOVER/ROLLBACK. All the others
	 * (like MUPIP SET -FILE/MUPIP EXTEND) would have invoked db_ipcs_reset() before invoking gds_rundown (from
	 * mupip_exit_handler). The only exception is when these processes encounter a terminate signal and they reach
	 * mupip_exit_handler while holding access control semaphore. Assert accordingly.
	 */
	assert(!have_standalone_access || mupip_jnl_recover || process_exiting);
	/* If we have standalone access, then ensure that a concurrent online rollback cannot be running at the same time as it
	 * needs the access control lock as well. The only exception is when we are online rollback and are currently running down.
	 */
	cnl = csa->nl;
	onln_rlbk_pid = cnl->onln_rlbk_pid;
	assert(!have_standalone_access || mupip_jnl_recover || !onln_rlbk_pid || !is_proc_alive(onln_rlbk_pid, 0));
	if (!have_standalone_access)
	{
		if (-1 == (ftok_semval = semctl(udi->ftok_semid, DB_COUNTER_SEM, GETVAL))) /* Check # of procs counted on FTOK */
		{
			save_errno = errno;
			assert(FALSE);
			rts_error_csa(CSA_ARG(csa) VARLSTCNT(12) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg), ERR_SYSCALL, 5,
				  RTS_ERROR_TEXT("gds_rundown SEMCTL failed to get ftok_semval"), CALLFROM, errno);
		}
		may_bypass_ftok = CAN_BYPASS(ftok_semval, csd, inst_is_frozen); /* Do we need a blocking wait? */
		/* We need to guarantee that no one else access database file header when semid/shmid fields are reset.
		 * We already have created ftok semaphore in db_init or mu_rndwn_file and did not remove it. So just lock it.
		 */
		if (!ftok_sem_lock(reg, may_bypass_ftok))
		{
			if (may_bypass_ftok)
			{	/* We did a non-blocking wait. It's ok to proceed without locking */
				bypassed_ftok = TRUE;
				holder_pid = semctl(udi->ftok_semid, DB_CONTROL_SEM, GETPID);
				if ((uint4)-1 == holder_pid)
					rts_error_csa(CSA_ARG(csa) VARLSTCNT(12) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg),
							ERR_SYSCALL, 5,
							RTS_ERROR_TEXT("gds_rundown SEMCTL failed to get holder_pid"),
							CALLFROM, errno);
				if (!IS_GTM_IMAGE) /* MUMPS processes should not flood syslog with bypass messages. */
				{
					send_msg_csa(CSA_ARG(csa) VARLSTCNT(12) ERR_RESRCINTRLCKBYPAS, 10,
						 LEN_AND_STR(gtmImageNames[image_type].imageName), process_id, LEN_AND_LIT("FTOK"),
						 REG_LEN_STR(reg), DB_LEN_STR(reg), holder_pid);
					send_msg_csa(CSA_ARG(NULL) VARLSTCNT(4) ERR_TEXT, 2,
							LEN_AND_LIT("FTOK bypassed at rundown"));
				}
			} else
			{	/* We did a blocking wait but something bad happened. */
				FTOK_TRACE(csa, csa->ti->curr_tn, ftok_ops_lock, process_id);
				rts_error_csa(CSA_ARG(csa) VARLSTCNT(4) ERR_DBFILERR, 2, DB_LEN_STR(reg));
			}
		}
		sop[0].sem_num = DB_CONTROL_SEM; sop[0].sem_op = 0;	/* Wait for 0 */
		sop[1].sem_num = DB_CONTROL_SEM; sop[1].sem_op = 1;	/* Lock */
		sopcnt = 2;
		sop[0].sem_flg = sop[1].sem_flg = SEM_UNDO | IPC_NOWAIT; /* Don't wait the first time thru */
		SEMOP(udi->semid, sop, sopcnt, status, NO_WAIT);
		if (0 != status)
		{
			save_errno = errno;
			/* Check # of processes counted on access sem. */
			if (-1 == (semval = semctl(udi->semid, DB_COUNTER_SEM, GETVAL)))
			{
				assert(FALSE);
				rts_error_csa(CSA_ARG(csa) VARLSTCNT(12) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg), ERR_SYSCALL, 5,
					  RTS_ERROR_TEXT("gds_rundown SEMCTL failed to get semval"), CALLFROM, errno);
			}
			bypassed_access = CAN_BYPASS(semval, csd, inst_is_frozen) || onln_rlbk_pid || csd->file_corrupt;
			/* Before attempting again in the blocking mode, see if the holding process is an online rollback.
			 * If so, it is likely we won't get the access control semaphore anytime soon. In that case, we
			 * are better off skipping rundown and continuing with sanity cleanup and exit.
			 */
			holder_pid = semctl(udi->semid, DB_CONTROL_SEM, GETPID);
			if ((uint4)-1 == holder_pid)
				rts_error_csa(CSA_ARG(csa) VARLSTCNT(12) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg), ERR_SYSCALL, 5,
					  RTS_ERROR_TEXT("gds_rundown SEMCTL failed to get holder_pid"), CALLFROM, errno);
			if (!bypassed_access)
			{	/* We couldn't get it in one shot-- see if we already have it */
				if (holder_pid == process_id)
				{
					send_msg_csa(CSA_ARG(csa) VARLSTCNT(5) MAKE_MSG_INFO(ERR_CRITSEMFAIL), 2, DB_LEN_STR(reg),
							ERR_RNDWNSEMFAIL);
					REVERT;
					ENABLE_INTERRUPTS(INTRPT_IN_GDS_RUNDOWN, prev_intrpt_state);
					assert(FALSE);
					return EXIT_ERR;
				}
				if (EAGAIN != save_errno)
				{
					assert(FALSE);
					rts_error_csa(CSA_ARG(csa) VARLSTCNT(12) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg),
							ERR_SYSCALL, 5,
							RTS_ERROR_TEXT("gds_rundown SEMOP on access control semaphore"),
							CALLFROM, save_errno);
				}
				sop[0].sem_flg = sop[1].sem_flg = SEM_UNDO;	/* Try again - blocking this time */
				SEMOP(udi->semid, sop, 2, status, FORCED_WAIT);
				if (-1 == status)			/* We couldn't get it at all.. */
					rts_error_csa(CSA_ARG(csa) VARLSTCNT(12) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg),
							ERR_SYSCALL, 5,
							RTS_ERROR_TEXT("gds_rundown SEMOP on access control semaphore"),
							CALLFROM, errno);
			} else if (!IS_GTM_IMAGE)
			{
				send_msg_csa(CSA_ARG(csa) VARLSTCNT(12) ERR_RESRCINTRLCKBYPAS, 10,
						LEN_AND_STR(gtmImageNames[image_type].imageName), process_id,
						LEN_AND_LIT("access control"), REG_LEN_STR(reg), DB_LEN_STR(reg), holder_pid);
				send_msg_csa(CSA_ARG(NULL) VARLSTCNT(4) ERR_TEXT, 2,
						LEN_AND_LIT("Access control bypassed at rundown"));
			}
			udi->grabbed_access_sem = !bypassed_access;
		}
	} /* else we hold the access control semaphore and therefore have standalone access. We do not release it now - we
	   * release it later in mupip_exit_handler.c. Since we already hold the access control semaphore, we don't need the
	   * ftok semaphore and trying it could cause deadlock
	   */
	/* Note that in the case of online rollback, "udi->grabbed_access_sem" (and in turn "have_standalone_access") is TRUE.
	 * But there could be other processes still having the database open so we cannot safely reset the halted fields.
	 */
	if (have_standalone_access && !jgbl.onlnrlbk)
		csd->ftok_counter_halted = csd->access_counter_halted = FALSE;
	ftok_counter_halted = csd->ftok_counter_halted;
	access_counter_halted = csd->access_counter_halted;
	/* If we bypassed any of the semaphores, activate safe mode.
	 * Also, if the replication instance is frozen and this db has replication turned on (which means
	 * no flushes of dirty buffers to this db can happen while the instance is frozen) activate safe mode.
	 */
	ok_to_write_pfin = !(bypassed_access || bypassed_ftok || inst_is_frozen);
	safe_mode = !ok_to_write_pfin || ftok_counter_halted || access_counter_halted;
	/* At this point we are guaranteed no one else is doing a db_init/rundown as we hold the access control semaphore */
	assert(csa->ref_cnt);	/* decrement private ref_cnt before shared ref_cnt decrement. */
	csa->ref_cnt--;		/* Currently journaling logic in gds_rundown() in VMS relies on this order to detect last writer */
	assert(!csa->ref_cnt);
	--cnl->ref_cnt;
	if (memcmp(cnl->now_running, gtm_release_name, gtm_release_name_len + 1))
	{	/* VERMISMATCH condition. Possible only if DSE */
		assert(dse_running);
		vermismatch = TRUE;
	} else
		vermismatch = FALSE;
	if (-1 == shmctl(udi->shmid, IPC_STAT, &shm_buf))
	{
		save_errno = errno;
		rts_error_csa(CSA_ARG(csa) VARLSTCNT(12) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg), ERR_SYSCALL, 5,
				RTS_ERROR_TEXT("gds_rundown shmctl"), CALLFROM, save_errno);
	} else
		we_are_last_user =  (1 == shm_buf.shm_nattch) && !vermismatch && !safe_mode;
	/* recover => one user except ONLINE ROLLBACK, or standalone with frozen instance */
	assert(!have_standalone_access || we_are_last_user || jgbl.onlnrlbk || inst_is_frozen);
	if (-1 == (semval = semctl(udi->semid, DB_COUNTER_SEM, GETVAL)))
		rts_error_csa(CSA_ARG(csa) VARLSTCNT(12) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg), ERR_SYSCALL, 5,
			  RTS_ERROR_TEXT("gds_rundown SEMCTL failed to get semval"), CALLFROM, errno);
	/* There's one writer left and I am it */
	assert(reg->read_only || semval >= 0);
	unsafe_last_writer = (DB_COUNTER_SEM_INCR == semval) && (FALSE == reg->read_only) && !vermismatch;
	we_are_last_writer = unsafe_last_writer && !safe_mode;
	assert(!we_are_last_writer || !safe_mode);
	assert(!we_are_last_user || !safe_mode);
	/* recover + R/W region => one writer except ONLINE ROLLBACK, or standalone with frozen instance, leading to safe_mode */
	assert(!(have_standalone_access && !reg->read_only) || we_are_last_writer || jgbl.onlnrlbk || inst_is_frozen);
	GTM_WHITE_BOX_TEST(WBTEST_ANTIFREEZE_JNLCLOSE, we_are_last_writer, 1); /* Assume we are the last writer to invoke wcs_flu */
	if (!have_standalone_access && (-1 == (ftok_semval = semctl(udi->ftok_semid, DB_COUNTER_SEM, GETVAL))))
		rts_error_csa(CSA_ARG(csa) VARLSTCNT(12) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg), ERR_SYSCALL, 5,
			  RTS_ERROR_TEXT("gds_rundown SEMCTL failed to get ftok_semval"), CALLFROM, errno);
	if (NULL != csa->ss_ctx)
		ss_destroy_context(csa->ss_ctx);
	/* SS_MULTI: If multiple snapshots are supported, then we have to run through each of the snapshots */
	assert(1 == MAX_SNAPSHOTS);
	ss_shm_ptr = (shm_snapshot_ptr_t)SS_GETSTARTPTR(csa);
	ss_pid = ss_shm_ptr->ss_info.ss_pid;
	is_cur_process_ss_initiator = (process_id == ss_pid);
	if (ss_pid && (is_cur_process_ss_initiator || we_are_last_user))
	{
		/* Try getting snapshot crit latch. If we don't get latch, we won't hang for eternity and will skip
		 * doing the orphaned snapshot cleanup. It will be cleaned up eventually either by subsequent MUPIP
		 * INTEG or by a MUPIP RUNDOWN.
		 */
		if (ss_get_lock_nowait(reg) && (ss_pid == ss_shm_ptr->ss_info.ss_pid)
			&& (is_cur_process_ss_initiator || !is_proc_alive(ss_pid, 0)))
		{
			ss_release(NULL);
			ss_release_lock(reg);
		}
	}
	/* If cnl->donotflush_dbjnl is set, it means mupip recover/rollback was interrupted and therefore we need not flush
	 * shared memory contents to disk as they might be in an inconsistent state. Moreover, any more flushing will only cause
	 * future rollback to undo more journal records (PBLKs). In this case, we will go ahead and remove shared memory (without
	 * flushing the contents) in this routine. A reissue of the recover/rollback command will restore the database to a
	 * consistent state.
	 */
	if (!cnl->donotflush_dbjnl && !reg->read_only && !vermismatch)
	{	/* If we had an orphaned block and were interrupted, set wc_blocked so we can invoke wcs_recover. Do it ONLY
		 * if there is NO concurrent online rollback running (as we need crit to set wc_blocked)
		 */
		if (csa->wbuf_dqd && !is_mm)
		{	/* If we had an orphaned block and were interrupted, mupip_exit_handler will invoke secshr_db_clnup which
			 * will clear this field and so we should never come to gds_rundown with a non-zero wbuf_dqd. The only
			 * exception is if we are recover/rollback in which case gds_rundown (from mur_close_files) is invoked
			 * BEFORE secshr_db_clnup in mur_close_files.
			 * Note: It is NOT possible for online rollback to reach here with wbuf_dqd being non-zero. This is because
			 * the moment we apply the first PBLK, we stop all interrupts and hence can never be interrupted in
			 * wcs_wtstart or wcs_get_space. Assert accordingly.
			 */
			assert(mupip_jnl_recover && !jgbl.onlnrlbk && !safe_mode);
			if (!was_crit)
				grab_crit(reg);
			SET_TRACEABLE_VAR(cnl->wc_blocked, TRUE);
			BG_TRACE_PRO_ANY(csa, wcb_gds_rundown);
			send_msg_csa(CSA_ARG(csa) VARLSTCNT(8) ERR_WCBLOCKED, 6, LEN_AND_LIT("wcb_gds_rundown"),
				process_id, &csa->ti->curr_tn, DB_LEN_STR(reg));
			csa->wbuf_dqd = 0;
			wcs_recover(reg);
			BG_TRACE_PRO_ANY(csa, lost_block_recovery);
			if (!was_crit)
				rel_crit(reg);
		}
		if (JNL_ENABLED(csd) && IS_GTCM_GNP_SERVER_IMAGE)
			originator_prc_vec = NULL;
		/* If we are the last writing user, then everything must be flushed */
		if (we_are_last_writer)
		{	/* Time to flush out all of our buffers */
			assert(!safe_mode);
			if (is_mm)
			{
				MM_DBFILEXT_REMAP_IF_NEEDED(csa, reg);
				cnl->remove_shm = TRUE;
			}
			if (cnl->wc_blocked && jgbl.onlnrlbk)
			{	/* if the last update done by online rollback was not committed in the normal code-path but was
				 * completed by secshr_db_clnup, wc_blocked will be set to TRUE. But, since online rollback never
				 * invokes grab_crit (since csa->hold_onto_crit is set to TRUE), wcs_recover is never invoked. This
				 * could result in the last update never getting flushed to the disk and if online rollback happened
				 * to be the last writer then the shared memory will be flushed and removed and the last update will
				 * be lost. So, force wcs_recover if we find ourselves in such a situation. But, wc_blocked is
				 * possible only if phase1 or phase2 errors are induced using white box test cases
				 */
				assert(WB_COMMIT_ERR_ENABLED);
				wcs_recover(reg);
			}
			/* Note WCSFLU_SYNC_EPOCH ensures the epoch is synced to the journal and indirectly
			 * also ensures that the db is fsynced. We don't want to use it in the calls to
			 * wcs_flu() from t_end() and tp_tend() since we can defer it to out-of-crit there.
			 * In this case, since we are running down, we don't have any such option.
			 */
			cnl->remove_shm = wcs_flu(WCSFLU_FLUSH_HDR | WCSFLU_WRITE_EPOCH | WCSFLU_SYNC_EPOCH);
			/* Since we_are_last_writer, we should be guaranteed that wcs_flu() did not change csd, (in
			 * case of MM for potential file extension), even if it did a grab_crit().  Therefore, make
			 * sure that's true.
			 */
			assert(csd == csa->hdr);
			assert(0 == memcmp(csd->label, GDS_LABEL, GDS_LABEL_SZ - 1));
		} else if (((canceled_flush_timer && (0 > cnl->wcs_timers)) || canceled_dbsync_timer) && !inst_is_frozen)
		{	/* canceled pending db or jnl flush timers - flush database and journal buffers to disk */
			if (!was_crit)
				grab_crit(reg);
			/* we need to sync the epoch as the fact that there is no active pending flush timer implies
			 * there will be no one else who will flush the dirty buffers and EPOCH to disk in a timely fashion
			 */
			wcs_flu(WCSFLU_FLUSH_HDR | WCSFLU_WRITE_EPOCH | WCSFLU_SYNC_EPOCH);
			if (!was_crit)
				rel_crit(reg);
			assert((dba_mm == cs_data->acc_meth) || (csd == cs_data));
			csd = cs_data;	/* In case this is MM and wcs_flu() remapped an extended database, reset csd */
		}
		/* Do rundown journal processing after buffer flushes since they require jnl to be open */
		if (JNL_ENABLED(csd))
		{	/* the following tp_change_reg() is not needed due to the assert csa == cs_addrs at the beginning
			 * of gds_rundown(), but just to be safe. To be removed by 2002!! --- nars -- 2001/04/25.
			 */
			tp_change_reg();	/* call this because jnl_ensure_open checks cs_addrs rather than gv_cur_region */
			jpc = csa->jnl;
			jbp = jpc->jnl_buff;
			if (jbp->fsync_in_prog_latch.u.parts.latch_pid == process_id)
			{
				assert(FALSE);
				COMPSWAP_UNLOCK(&jbp->fsync_in_prog_latch, process_id, 0, LOCK_AVAILABLE, 0);
			}
			if (jbp->io_in_prog_latch.u.parts.latch_pid == process_id)
			{
				assert(FALSE);
				COMPSWAP_UNLOCK(&jbp->io_in_prog_latch, process_id, 0, LOCK_AVAILABLE, 0);
			}
			if ((((NOJNL != jpc->channel) && !JNL_FILE_SWITCHED(jpc))
				|| we_are_last_writer && (0 != cnl->jnl_file.u.inode)) && ok_to_write_pfin)
			{	/* We need to close the journal file cleanly if we have the latest generation journal file open
				 *	or if we are the last writer and the journal file is open in shared memory (not necessarily
				 *	by ourselves e.g. the only process that opened the journal got shot abnormally)
				 * Note: we should not infer anything from the shared memory value of cnl->jnl_file.u.inode
				 * 	if we are not the last writer as it can be concurrently updated.
				 */
				if (!was_crit)
					grab_crit(reg);
				if (JNL_ENABLED(csd))
				{
					SET_GBL_JREC_TIME; /* jnl_ensure_open/jnl_put_jrt_pini/pfin/jnl_file_close all need it */
					/* Before writing to jnlfile, adjust jgbl.gbl_jrec_time if needed to maintain time order
					 * of jnl records. This needs to be done BEFORE the jnl_ensure_open as that could write
					 * journal records (if it decides to switch to a new journal file).
					 */
					ADJUST_GBL_JREC_TIME(jgbl, jbp);
					jnl_status = jnl_ensure_open();
					if (0 == jnl_status)
					{	/* If we_are_last_writer, we would have already done a wcs_flu() which would
						 * have written an epoch record and we are guaranteed no further updates
						 * since we are the last writer. So, just close the journal.
						 * If the freeaddr == post_epoch_freeaddr, wcs_flu may have skipped writing
						 * a pini, so allow for that.
						 */
						assert(!jbp->before_images || is_mm
						    || !we_are_last_writer || (0 != jpc->pini_addr) || jgbl.mur_extract
						    || (jpc->jnl_buff->freeaddr == jpc->jnl_buff->post_epoch_freeaddr));
						/* If we haven't written a pini, let jnl_file_close write the pini/pfin. */
						if (!jgbl.mur_extract && (0 != jpc->pini_addr))
							jnl_put_jrt_pfin(csa);
						/* If not the last writer and no pending flush timer left, do jnl flush now */
						if (!we_are_last_writer && (0 > cnl->wcs_timers))
						{
							if (SS_NORMAL == (jnl_status = jnl_flush(reg)))
							{
								assert(jbp->freeaddr == jbp->dskaddr);
								jnl_fsync(reg, jbp->dskaddr);
								assert(jbp->fsync_dskaddr == jbp->dskaddr);
							} else
							{
								send_msg_csa(CSA_ARG(csa) VARLSTCNT(9) ERR_JNLFLUSH, 2,
									JNL_LEN_STR(csd), ERR_TEXT, 2,
									RTS_ERROR_TEXT("Error with journal flush in gds_rundown"),
									jnl_status);
								assert(NOJNL == jpc->channel);/* jnl file lost has been triggered */
								/* In this routine, all code that follows from here on does not
								 * assume anything about the journaling characteristics of this
								 * database so it is safe to continue execution even though
								 * journaling got closed in the middle.
								 */
							}
						}
						jnl_file_close(reg, we_are_last_writer, FALSE);
					} else
						send_msg_csa(CSA_ARG(csa) VARLSTCNT(6) jnl_status, 4, JNL_LEN_STR(csd),
								DB_LEN_STR(reg));
				}
				if (!was_crit)
					rel_crit(reg);
			}
		}
		if (we_are_last_writer)			/* Flush the fileheader last and harden the file to disk */
		{
			if (!was_crit)
				grab_crit(reg);			/* To satisfy crit requirement in fileheader_sync() */
			memset(csd->machine_name, 0, MAX_MCNAMELEN); /* clear the machine_name field */
			if (!have_standalone_access && we_are_last_user)
			{	/* mupip_exit_handler will do this after mur_close_file */
				csd->semid = INVALID_SEMID;
				csd->shmid = INVALID_SHMID;
				csd->gt_sem_ctime.ctime = 0;
				csd->gt_shm_ctime.ctime = 0;
			}
			fileheader_sync(reg);
			if (!was_crit)
				rel_crit(reg);
			if (!is_mm)
			{
				GTM_DB_FSYNC(csa, udi->fd, rc);		/* Sync it all */
				if (-1 == rc)
				{
					rts_error_csa(CSA_ARG(csa) VARLSTCNT(9) ERR_DBFILERR, 2, DB_LEN_STR(reg),
						  ERR_TEXT, 2, RTS_ERROR_TEXT("Error during file sync at close"), errno);
				}
			} else
			{	/* Now do final MM file sync before exit */
				assert(csa->ti->total_blks == csa->total_blks);
				#ifdef _AIX
				GTM_DB_FSYNC(csa, udi->fd, rc);
				if (-1 == rc)
				#else
				if (-1 == MSYNC((caddr_t)csa->db_addrs[0], (caddr_t)csa->db_addrs[1]))
				#endif
				{
					rts_error_csa(CSA_ARG(csa) VARLSTCNT(9) ERR_DBFILERR, 2, DB_LEN_STR(reg),
						  ERR_TEXT, 2, RTS_ERROR_TEXT("Error during file sync at close"), errno);
				}
			}
		} else if (unsafe_last_writer && !cnl->lastwriterbypas_msg_issued)
		{
			send_msg_csa(CSA_ARG(csa) VARLSTCNT(4) ERR_LASTWRITERBYPAS, 2, DB_LEN_STR(reg));
			cnl->lastwriterbypas_msg_issued = TRUE;
		}
	} /* end if (!reg->read_only && !cnl->donotflush_dbjnl) */
	/* We had canceled all db timers at start of rundown. In case as part of rundown (wcs_flu above), we had started
	 * any timers, cancel them BEFORE setting reg->open to FALSE (assert in wcs_clean_dbsync relies on this).
	 */
	CANCEL_DB_TIMERS(reg, csa, canceled_flush_timer, canceled_dbsync_timer);
	if (reg->read_only && we_are_last_user && !have_standalone_access && cnl->remove_shm)
	{	/* mupip_exit_handler will do this after mur_close_file */
		db_ipcs.semid = INVALID_SEMID;
		db_ipcs.shmid = INVALID_SHMID;
		db_ipcs.gt_sem_ctime = 0;
		db_ipcs.gt_shm_ctime = 0;
		db_ipcs.fn_len = reg->dyn.addr->fname_len;
		memcpy(db_ipcs.fn, reg->dyn.addr->fname, reg->dyn.addr->fname_len);
		db_ipcs.fn[reg->dyn.addr->fname_len] = 0;
		/* request gtmsecshr to flush; a read_only process cannot flush the header itself */
		WAIT_FOR_REPL_INST_UNFREEZE_SAFE(csa);
		if (!csa->read_only_fs)
		{
			secshrstat = send_mesg2gtmsecshr(FLUSH_DB_IPCS_INFO, 0, (char *)NULL, 0);
			if (0 != secshrstat)
				rts_error_csa(CSA_ARG(csa) VARLSTCNT(8) ERR_DBFILERR, 2, DB_LEN_STR(reg),
					  ERR_TEXT, 2, RTS_ERROR_TEXT("gtmsecshr failed to update database file header"));
		}
	}
	/* Done with file now, close it */
	CLOSEFILE_RESET(udi->fd, rc);	/* resets "udi->fd" to FD_INVALID */
	if (-1 == rc)
	{
		rts_error_csa(CSA_ARG(csa) VARLSTCNT(9) ERR_DBFILERR, 2, DB_LEN_STR(reg),
			  ERR_TEXT, 2, LEN_AND_LIT("Error during file close"), errno);
	}
	/* Unmap storage if mm mode but only the part that is not the fileheader (so shows up in dumps) */
#	if !defined(_AIX)
	if (is_mm && (NULL != csa->db_addrs[0]))
	{
		assert(csa->db_addrs[1] > csa->db_addrs[0]);
		munmap_len = (sm_long_t)(csa->db_addrs[1] - csa->db_addrs[0]);
		if (0 < munmap_len)
			munmap((caddr_t)(csa->db_addrs[0]), (size_t)(munmap_len));
	}
#	endif
	/* Detach our shared memory while still under lock so reference counts will be correct for the next process to run down
	 * this region. In the process also get the remove_shm status from node_local before detaching.
	 * If cnl->donotflush_dbjnl is TRUE, it means we can safely remove shared memory without compromising data
	 * integrity as a reissue of recover will restore the database to a consistent state.
	 */
	remove_shm = !vermismatch && (cnl->remove_shm || cnl->donotflush_dbjnl);
	/* We are done with online rollback on this region. Indicate to other processes by setting the onln_rlbk_pid to 0.
	 * Do it before releasing crit (t_end relies on this ordering when accessing cnl->onln_rlbk_pid).
	 */
	if (jgbl.onlnrlbk)
		cnl->onln_rlbk_pid = 0;
	rel_crit(reg); /* Since we are about to detach from the shared memory, release crit and reset onln_rlbk_pid */
	/* If we had skipped flushing journal and database buffers due to a concurrent online rollback, increment the counter
	 * indicating that in the shared memory so that online rollback can report the # of such processes when it shuts down.
	 * The same thing is done for both FTOK and access control semaphores when there are too many MUMPS processes.
	 */
	if (safe_mode) /* indicates flushing was skipped */
	{
		if (bypassed_access)
			cnl->dbrndwn_access_skip++; /* Access semaphore can be bypassed during online rollback */
		if (bypassed_ftok)
			cnl->dbrndwn_ftok_skip++;
	}
	if (jgbl.onlnrlbk)
		csa->hold_onto_crit = FALSE;
	GTM_WHITE_BOX_TEST(WBTEST_HOLD_SEM_BYPASS, cnl->wbox_test_seq_num, 0);
	status = shmdt((caddr_t)cnl);
	csa->nl = NULL; /* dereferencing nl after detach is not right, so we set it to NULL so that we can test before dereference */
	/* Note that although csa->nl is NULL, we use CSA_ARG(csa) below (not CSA_ARG(NULL)) to be consistent with similar
	 * usages before csa->nl became NULL. The "is_anticipatory_freeze_needed" function (which is in turn called by the
	 * CHECK_IF_FREEZE_ON_ERROR_NEEDED macro) does a check of csa->nl before dereferencing shared memory contents so
	 * we are safe passing "csa".
	 */
	if (-1 == status)
		send_msg_csa(CSA_ARG(csa) VARLSTCNT(9) ERR_DBFILERR, 2, DB_LEN_STR(reg), ERR_TEXT, 2,
				LEN_AND_LIT("Error during shmdt"), errno);
	REMOVE_CSA_FROM_CSADDRSLIST(csa);	/* remove "csa" from list of open regions (cs_addrs_list) */
	reg->open = FALSE;
	/* If file is still not in good shape, die here and now before we get rid of our storage */
	assertpro(0 == csa->wbuf_dqd);
	ipc_deleted = FALSE;
	/* If we are the very last user, remove shared storage id and the semaphores */
	if (we_are_last_user)
	{	/* remove shared storage, only if last writer to rundown did a successful wcs_flu() */
		assert(!vermismatch);
		if (remove_shm)
		{
			ipc_deleted = TRUE;
			if (0 != shm_rmid(udi->shmid))
				rts_error_csa(CSA_ARG(csa) VARLSTCNT(8) ERR_DBFILERR, 2, DB_LEN_STR(reg),
					ERR_TEXT, 2, RTS_ERROR_TEXT("Unable to remove shared memory"));
			/* Note that we no longer have a new shared memory. Currently only used/usable for standalone rollback. */
			udi->new_shm = FALSE;
			/* mupip recover/rollback don't release the semaphore here, but do it later in db_ipcs_reset (invoked from
			 * mur_close_files())
			 */
			if (!have_standalone_access)
			{
				if (0 != sem_rmid(udi->semid))
					rts_error_csa(CSA_ARG(csa) VARLSTCNT(8) ERR_DBFILERR, 2, DB_LEN_STR(reg),
						      ERR_TEXT, 2, RTS_ERROR_TEXT("Unable to remove semaphore"));
				udi->new_sem = FALSE;			/* Note that we no longer have a new semaphore */
				udi->grabbed_access_sem = FALSE;
				udi->counter_acc_incremented = FALSE;
			}
		} else if (is_src_server || is_updproc)
		{
			gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(6) ERR_DBRNDWNWRN, 4, DB_LEN_STR(reg), process_id, process_id);
			send_msg_csa(CSA_ARG(csa) VARLSTCNT(6) ERR_DBRNDWNWRN, 4, DB_LEN_STR(reg), process_id, process_id);
		} else
			send_msg_csa(CSA_ARG(csa) VARLSTCNT(6) ERR_DBRNDWNWRN, 4, DB_LEN_STR(reg), process_id, process_id);
	} else
	{
		assert(!have_standalone_access || jgbl.onlnrlbk || safe_mode);
		if (!jgbl.onlnrlbk && !have_standalone_access)
		{ 	/* If we were writing, get rid of our writer access count semaphore */
			if (!reg->read_only)
			{
				if (!access_counter_halted)
				{
					save_errno = do_semop(udi->semid, DB_COUNTER_SEM, -DB_COUNTER_SEM_INCR, SEM_UNDO);
					if (0 != save_errno)
						rts_error_csa(CSA_ARG(csa) VARLSTCNT(12) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg),
								ERR_SYSCALL, 5,
								RTS_ERROR_TEXT("gds_rundown access control semaphore decrement"),
								CALLFROM, save_errno);
				}
				udi->counter_acc_incremented = FALSE;
			}
			assert(safe_mode || !bypassed_access);
			/* Now remove the rundown lock */
			if (!bypassed_access)
			{
				if (0 != (save_errno = do_semop(udi->semid, DB_CONTROL_SEM, -1, SEM_UNDO)))
					rts_error_csa(CSA_ARG(csa) VARLSTCNT(12) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg),
							ERR_SYSCALL, 5,
							RTS_ERROR_TEXT("gds_rundown access control semaphore release"),
							CALLFROM, save_errno);
				udi->grabbed_access_sem = FALSE;
			}
		} /* else access control semaphore will be released in db_ipcs_reset */
	}
	if (!have_standalone_access)
	{
		if (bypassed_ftok)
		{
			if (!ftok_counter_halted)
				if (0 != (save_errno = do_semop(udi->ftok_semid, DB_COUNTER_SEM, -DB_COUNTER_SEM_INCR, SEM_UNDO)))
					rts_error_csa(CSA_ARG(csa) VARLSTCNT(4) ERR_DBFILERR, 2, DB_LEN_STR(reg));
		} else if (!ftok_sem_release(reg, !ftok_counter_halted, FALSE))
		{
			FTOK_TRACE(csa, csa->ti->curr_tn, ftok_ops_release, process_id);
			rts_error_csa(CSA_ARG(csa) VARLSTCNT(4) ERR_DBFILERR, 2, DB_LEN_STR(reg));
		}
		udi->grabbed_ftok_sem = FALSE;
		udi->counter_ftok_incremented = FALSE;
	}
	ENABLE_INTERRUPTS(INTRPT_IN_GDS_RUNDOWN, prev_intrpt_state);
	if (!ipc_deleted)
	{
		GET_CUR_TIME(time_str);
		if (is_src_server)
			gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(8) ERR_IPCNOTDEL, 6, CTIME_BEFORE_NL, time_str,
				LEN_AND_LIT("Source server"), REG_LEN_STR(reg));
		if (is_updproc)
			gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(8) ERR_IPCNOTDEL, 6, CTIME_BEFORE_NL, time_str,
				LEN_AND_LIT("Update process"), REG_LEN_STR(reg));
		if (mupip_jnl_recover && (!jgbl.onlnrlbk || !we_are_last_user))
		{
			gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(8) ERR_IPCNOTDEL, 6, CTIME_BEFORE_NL, time_str,
				LEN_AND_LIT("Mupip journal process"), REG_LEN_STR(reg));
			send_msg_csa(CSA_ARG(csa) VARLSTCNT(8) ERR_IPCNOTDEL, 6, CTIME_BEFORE_NL, time_str,
				LEN_AND_LIT("Mupip journal process"), REG_LEN_STR(reg));
		}
	}
	REVERT;
	return EXIT_NRM;
}
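The non-last-writer path above drops its count on the access-control semaphore with SEM_UNDO so the kernel reverses the decrement if the process dies mid-rundown. Below is a minimal standalone sketch of that SysV pattern, assuming a semaphore set created elsewhere; the name counter_sem_decr and the semid/sem_num parameters are illustrative, not GT.M APIs.

#include <errno.h>
#include <sys/types.h>
#include <sys/ipc.h>
#include <sys/sem.h>

/* Decrement one counter in a SysV semaphore set with SEM_UNDO so the kernel
 * reverses the decrement automatically if the process exits abnormally.
 * Returns 0 on success, otherwise the errno from semop().
 */
static int counter_sem_decr(int semid, unsigned short sem_num, short decr)
{
	struct sembuf	sop;

	sop.sem_num = sem_num;
	sop.sem_op = (short)-decr;	/* negative op: decrement */
	sop.sem_flg = SEM_UNDO;		/* kernel-side undo on process exit */
	return (-1 == semop(semid, &sop, 1)) ? errno : 0;
}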
Exemplo n.º 18
0
uint4 mur_block_count_correct(reg_ctl_list *rctl)
{
    unsigned int		native_size, size;
    sgmnt_data_ptr_t 	mu_data;
    int4			mu_int_ovrhd;
    uint4			total_blks;
    uint4			status;
    uint4                   new_bit_maps, bplmap, new_blocks;

    MUR_CHANGE_REG(rctl);
    mu_data = cs_data;
    switch (mu_data->acc_meth)
    {
    default:
        GTMASSERT;
        break;
#if defined(VMS) && defined(GT_CX_DEF)
    case dba_bg:	/* necessary to do calculation in this manner to prevent double rounding causing an error */
        if (mu_data->unbacked_cache)
            mu_int_ovrhd = DIVIDE_ROUND_UP(SIZEOF_FILE_HDR(mu_data) + mu_data->free_space +
                                           mu_data->lock_space_size, DISK_BLOCK_SIZE);
        else
            mu_int_ovrhd = DIVIDE_ROUND_UP(SIZEOF_FILE_HDR(mu_data) + BT_SIZE(mu_data)
                                           + mu_data->free_space + mu_data->lock_space_size, DISK_BLOCK_SIZE);
        break;
#else
    case dba_bg:
#endif
    case dba_mm:
        mu_int_ovrhd = (int4)DIVIDE_ROUND_UP(SIZEOF_FILE_HDR(mu_data) + mu_data->free_space, DISK_BLOCK_SIZE);
        break;
    }
    mu_int_ovrhd += 1;
    assert(mu_int_ovrhd == mu_data->start_vbn);
    size = mu_int_ovrhd + (mu_data->blk_size / DISK_BLOCK_SIZE) * mu_data->trans_hist.total_blks;
    native_size = gds_file_size(gv_cur_region->dyn.addr->file_cntl);
    /* In the following tests, the EOF block should always be 1 greater than the actual size of the file.
     * This is due to the GDS being allocated in even DISK_BLOCK_SIZE-byte blocks.
     */
    if (native_size && (size < native_size))
    {
        total_blks = (dba_mm == mu_data->acc_meth) ? cs_addrs->total_blks : cs_addrs->ti->total_blks;
        if (JNL_ENABLED(cs_addrs))
            cs_addrs->jnl->pini_addr = 0; /* Stop simulation of GTM process journal record writing (if any active)*/
        /* If journaling, gdsfilext will need to write an inctn record. The timestamp of that journal record will
         * need to be adjusted to the current system time to reflect that it is recovery itself writing that record
         * instead of simulating GT.M activity. Since the variable jgbl.dont_reset_gbl_jrec_time is still set, gdsfilext
         * will NOT modify jgbl.gbl_jrec_time. Temporarily reset it to allow for adjustments to gbl_jrec_time.
         */
        assert(jgbl.dont_reset_gbl_jrec_time);
        jgbl.dont_reset_gbl_jrec_time = FALSE;
        /* Calculate the number of blocks to add based on the difference between the real file size and the file size
         * computed from the header->total_blks.  Takes into account that gdsfilext() will automatically add new_bit_maps
         * to the amount of blocks we request.
         */
        bplmap = cs_data->bplmap;
        new_blocks = (native_size - size)/(mu_data->blk_size / DISK_BLOCK_SIZE);
        new_bit_maps = DIVIDE_ROUND_UP(total_blks + new_blocks, bplmap) - DIVIDE_ROUND_UP(total_blks, bplmap);
        if (SS_NORMAL != (status = gdsfilext(new_blocks - new_bit_maps, total_blks)))
        {
            jgbl.dont_reset_gbl_jrec_time = TRUE;
            return (status);
        }
        jgbl.dont_reset_gbl_jrec_time = TRUE;
        DEBUG_ONLY(
            /* Check that the filesize and blockcount in the fileheader match now after the extend */
            size = mu_int_ovrhd + (mu_data->blk_size / DISK_BLOCK_SIZE) * mu_data->trans_hist.total_blks;
            native_size = gds_file_size(gv_cur_region->dyn.addr->file_cntl);
            assert(size == native_size);
        )
    }
    return SS_NORMAL;
}
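For reference, here is a worked example of the extension arithmetic used above, with DIVIDE_ROUND_UP expanded inline; the bplmap, total_blks and new_blocks values are purely illustrative, not taken from a real database.

#include <stdio.h>

#define DIVIDE_ROUND_UP(x, y)	(((x) + (y) - 1) / (y))

int main(void)
{
	unsigned int	bplmap = 512;		/* blocks per local map (illustrative) */
	unsigned int	total_blks = 1000;	/* current total blocks (illustrative) */
	unsigned int	new_blocks = 300;	/* blocks needed to reach the real file size */
	unsigned int	new_bit_maps;

	/* Extra bitmaps the extension consumes: maps needed after, minus maps needed before */
	new_bit_maps = DIVIDE_ROUND_UP(total_blks + new_blocks, bplmap) - DIVIDE_ROUND_UP(total_blks, bplmap);
	/* gdsfilext() adds the bitmaps itself, so request only the data blocks: 300 - 1 = 299 here */
	printf("request gdsfilext(%u, %u)\n", new_blocks - new_bit_maps, total_blks);
	return 0;
}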
Exemplo n.º 19
0
boolean_t	tp_tend(boolean_t crit_only)
{
	block_id		tp_blk;
	boolean_t		history_validated, is_mm, was_crit, x_lock, do_validation;
	boolean_t		do_deferred_writes = FALSE, replication = FALSE;
	bt_rec_ptr_t		bt;
	cache_rec_ptr_t		cr;
	cw_set_element		*cse;
	file_control		*fc;
	jnl_private_control	*jpc;
	jnl_buffer_ptr_t	jbp;
	jnl_format_buffer	*jfb;
	sgm_info		*si, *tmpsi;
	tp_region		*tr, *tr_last;
	sgmnt_addrs		*csa, *tmpcsa;
	sgmnt_data_ptr_t	csd;
	srch_blk_status		*t1;
	trans_num		ctn, tnque_earliest_tn;
	trans_num		valid_thru;	/* buffers touched by this transaction will be valid thru this tn */
	enum cdb_sc		status;
	gd_region		*save_gv_cur_region;
	int			lcnt, participants;
	jnldata_hdr_ptr_t	jnl_header;
	int			repl_tp_region_count = 0;
	boolean_t		first_time = TRUE, release_crit, yes_jnl_no_repl, retvalue;
	uint4			jnl_status, leafmods, indexmods;
	uint4			total_jnl_rec_size;
	jnlpool_ctl_ptr_t	jpl, tjpl;

	error_def(ERR_DLCKAVOIDANCE);
	error_def(ERR_JNLTRANS2BIG);

	assert(dollar_tlevel > 0);
	assert(0 == jnl_fence_ctl.level);
	participants = 0;
	status = cdb_sc_normal;
	/* if the transaction does no updates and the transaction history has not changed, we do not need any more validation */
	do_validation = FALSE;	/* initially set to FALSE, but set to TRUE below */
	jnl_status = 0;
	if (FALSE == crit_only)
	{
		for (si = first_sgm_info;  (NULL != si); si = si->next_sgm_info)
		{
			sgm_info_ptr = si;
			TP_CHANGE_REG_IF_NEEDED(si->gv_cur_region);
			csa = cs_addrs;
			csd = cs_data;
			if ((csd->wc_blocked) ||			/* If blocked, or.. */
				((dba_mm == csa->hdr->acc_meth) &&	/* we have MM and.. */
				(csa->total_blks != csa->ti->total_blks)))	/* the file has been extended */
			{	/* Force repair */
				t_fail_hist[t_tries] = cdb_sc_helpedout; /* special status to prevent punishing altruism */
				TP_TRACE_HIST(CR_BLKEMPTY, NULL);
				return FALSE;
			}
			/* whenever si->first_cw_set is non-NULL, ensure that si->update_trans is TRUE */
			assert((NULL == si->first_cw_set) || si->update_trans);
			/* whenever si->first_cw_set is NULL, ensure that si->update_trans is FALSE
			 * except when the set noop optimization is enabled */
			assert((NULL != si->first_cw_set) || !si->update_trans || gvdupsetnoop);
			if (!si->update_trans)
			{
				if (si->start_tn == csa->ti->early_tn)
				{	/* Read with no change to the transaction history; ensure we haven't overrun
					 * our history buffer and that we have reasonable values for first and last */
					assert(si->last_tp_hist - si->first_tp_hist <= si->tp_hist_size);
					continue;
				} else
					do_validation = TRUE;
			} else
			{
				do_validation = TRUE;
				is_mm = (dba_mm == cs_data->acc_meth);
				/* We are still out of crit if this is not our last attempt. If so, run the region list and check
				 * that we have sufficient free blocks for our update. If not, get them now while we can.
				 * We will repeat this check later in crit but it will hopefully have little or nothing to do.
				 * bypass 1st check if already in crit -- check later
				 */
				if (!csa->now_crit && !is_mm && (csa->nl->wc_in_free < si->cw_set_depth + 1)
						&& !wcs_get_space(si->gv_cur_region, si->cw_set_depth + 1, NULL))
					assert(FALSE);	/* wcs_get_space() should have returned TRUE unconditionally in this case */
			}
			if (si->update_trans && JNL_ENABLED(csa))
			{	/* Compute the total journal record size requirements before grab_crit().
				 * There is code later that will check for state changes from now to then.
				 */
				TOTAL_TPJNL_REC_SIZE(total_jnl_rec_size, si, csa);
				/* compute current transaction's maximum journal space needs in number of disk blocks */
				si->tot_jrec_size = MAX_REQD_JNL_FILE_SIZE(total_jnl_rec_size);
				/* check if current TP transaction's journal size needs are greater than max jnl file size */
				if (si->tot_jrec_size > csd->autoswitchlimit)
					/* can't fit in current transaction's journal records into one journal file */
					rts_error(VARLSTCNT(6) ERR_JNLTRANS2BIG, 4, si->tot_jrec_size,
						JNL_LEN_STR(csd), csd->autoswitchlimit);
			}
		}	/* for (si ... ) */
		if (!do_validation)
		{
			if (CDB_STAGNATE <= t_tries)
			{
				for (tr = tp_reg_list; NULL != tr; tr = tr->fPtr)
					rel_crit(tr->reg);
			}
			UNIX_ONLY(
				/* Must be done after REVERT since we are no longer in crit */
				if (unhandled_stale_timer_pop)
					process_deferred_stale();
			)
			return TRUE;
		}
	}
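The read-only fast path in the excerpt above reduces to one comparison: a region this transaction never updated, whose early_tn still equals the transaction's start_tn, needs no commit-time validation. A simplified restatement of that check under those assumptions — plain types and illustrative names, not the actual tp_tend interface:

typedef unsigned long long trans_num_t;	/* stand-in for GT.M's trans_num */

/* A region needs commit-time validation only if this transaction updated it,
 * or if some other process moved the region's early_tn since we started.
 */
static int region_needs_validation(int update_trans, trans_num_t start_tn, trans_num_t early_tn)
{
	if (update_trans)
		return 1;			/* we wrote to it: always validate */
	return (start_tn != early_tn);		/* a concurrent update happened after our start */
}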
Exemplo n.º 20
0
void	ccp_tr_writedb1(ccp_action_record *rec)
{
	ccp_action_record	request;
	ccp_db_header		*db;
	jnl_buffer		*jb;
	sgmnt_addrs		*csa;
	int4			curr_time[2];
	uint4		status, *p1, *p2, *top;


	if ((db = rec->v.h) == NULL)
		return;

	csa = db->segment;
	assert(csa->nl->ccp_state == CCST_RDMODE  ||  csa->nl->ccp_state == CCST_CLOSED  ||  csa->nl->ccp_state == CCST_OPNREQ);

	if (JNL_ENABLED(csa->hdr))
	{
		assert(csa->jnl->channel != 0);

		jb = csa->jnl->jnl_buff;

		if (jb->blocked != 0)
		{
			/* jnl writes from a previous write mode are not done yet;  try again later */
			if ((status = sys$setimr(0, delta_100_msec, ccp_writedb5, db, 0)) != SS$_NORMAL)
				ccp_signal_cont(status);	/***** Is this reasonable? *****/
			return;
		}

		jb->epoch_tn = db->wm_iosb.valblk[CCP_VALBLK_EPOCH_TN];
		jb->freeaddr = jb->dskaddr
			     = ROUND_UP(db->wm_iosb.valblk[CCP_VALBLK_JNL_ADDR], DISK_BLOCK_SIZE);
		/* lastaddr is no longer a field in jnl_buff
		 *	jb->lastaddr = db->wm_iosb.valblk[CCP_VALBLK_LST_ADDR];
		 */
		jb->free = jb->dsk
			 = 0;
	}

	/* Note:  We must clear these flags prior to changing state or a set from ccp_exitwm_blkast may be missed */
	db->quantum_expired = FALSE;
	db->tick_in_progress = FALSE;

	if (db->remote_wakeup)
	{
		for (p2 = NULL, p1 = csa->lock_addrs[0], top = p1 + SIZEOF(int4) * NUM_CLST_LCKS;
		     *p1 != 0  &&  p1 <= top;
		     ++p1)
		{
			if ((*p1 & NODENUMBER) == (process_id & NODENUMBER))
			{
				crit_wake(p1);
				if (p2 == NULL)
					p2 = p1;
				*p1 = 0;
			}
			else
				if (p2 != NULL)
				{
					*p2++ = *p1;
					*p1 = 0;
				}
		}

		db->remote_wakeup = FALSE;

		status = ccp_enq(0, LCK$K_CRMODE, &db->lock_iosb, LCK$M_CONVERT | LCK$M_SYNCSTS, NULL, 0,
				 ccp_lkrqwake1, db, ccp_lkdowake_blkast, PSL$C_USER, 0);
		/***** Check error status here? *****/
	}

	db->glob_sec->freeze = 0;
	/* db->glob_sec->wcs_active_lvl = db->glob_sec->n_bts / 2; */
	db->drop_lvl = 0;

	sys$gettim(curr_time);
	lib$add_times(curr_time, db->glob_sec->ccp_quantum_interval, &db->start_wm_time);

	csa->nl->ccp_state = CCST_WRTGNT;

	if (db->blocking_ast_received)
	{
		status = sys$dclast(ccp_exitwm_blkast, &db->exitwm_timer_id, PSL$C_USER);
		if (status != SS$_NORMAL)
			ccp_signal_cont(status);	/***** Is this reasonable? *****/
	}

	csa->nl->ccp_crit_blocked = FALSE;
	++csa->nl->ccp_cycle;

	status = sys$setimr(0, &db->glob_sec->ccp_quantum_interval, ccp_quantum_interrupt, &db->quantum_timer_id, 0);
	if (status != SS$_NORMAL)
		ccp_signal_cont(status);	/***** Is this reasonable? *****/

	db->dirty_buffers_acquired = FALSE;
	ccp_pndg_proc_wake(&db->write_wait);

	status = ccp_enq(0, LCK$K_EXMODE, &db->flush_iosb, LCK$M_CONVERT | LCK$M_SYNCSTS, NULL, 0,
			 ccp_reqdrtbuf_interrupt, db, NULL, PSL$C_USER, 0);
	if (status == SS$_SYNCH)
	{
		request.action = CCTR_GOTDRT;
		request.pid = 0;
		request.v.h = db;
		ccp_act_request(&request);
	}
	/***** Check error status here? *****/

}
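The remote-wakeup loop above wakes matching lock entries and slides the survivors toward the front of the array in a single pass. A minimal sketch of that compaction idiom on a zero-terminated array follows; the NODENUMBER masking and the crit_wake call are elided, and the names are illustrative.

/* "Wake" (drop) entries that match mask, sliding the remaining entries left
 * so the used portion of the zero-terminated array stays dense.
 */
static void wake_and_compact(unsigned int *arr, unsigned int n, unsigned int mask)
{
	unsigned int	*dst = arr, *src;

	for (src = arr; (src < arr + n) && (0 != *src); ++src)
	{
		if ((*src & mask) == mask)
			continue;	/* matched: woken and dropped */
		*dst++ = *src;		/* survivor: compact toward the front */
	}
	while (dst < src)
		*dst++ = 0;		/* clear the tail left behind by compaction */
}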
Exemplo n.º 21
0
uint4 mur_block_count_correct(reg_ctl_list *rctl)
{
	gtm_uint64_t		native_size, size;
	sgmnt_data_ptr_t 	mu_data;
	int4			mu_int_ovrhd;
	uint4			total_blks;
	uint4			status;
	uint4                   new_bit_maps, bplmap, new_blocks, tmpcnt;
	enum db_acc_method      acc_meth;

	MUR_CHANGE_REG(rctl);
	mu_data = cs_data;
	acc_meth = mu_data->acc_meth;
	switch (acc_meth)
	{
		case dba_bg:
		case dba_mm:
			mu_int_ovrhd = (int4)DIVIDE_ROUND_UP(SIZEOF_FILE_HDR(mu_data) + mu_data->free_space, DISK_BLOCK_SIZE);
			break;
		default:
			assertpro(FALSE && acc_meth);
	}
	mu_int_ovrhd += 1;
	assert(mu_int_ovrhd == mu_data->start_vbn);
	size = mu_int_ovrhd + (off_t)(mu_data->blk_size / DISK_BLOCK_SIZE) * mu_data->trans_hist.total_blks;
	native_size = gds_file_size(gv_cur_region->dyn.addr->file_cntl);
	/* In the following tests, the EOF block should always be 1 greater than the actual size of the file.
	 * This is due to the GDS being allocated in even DISK_BLOCK_SIZE-byte blocks.
	 */
	if (native_size && (size < native_size))
	{
		total_blks = (dba_mm == acc_meth) ? cs_addrs->total_blks : cs_addrs->ti->total_blks;
		if (JNL_ENABLED(cs_addrs))
			cs_addrs->jnl->pini_addr = 0; /* Stop simulation of GTM process journal record writing (if any active)*/
		/* If journaling, gdsfilext will need to write an inctn record. The timestamp of that journal record will
		 * need to be adjusted to the current system time to reflect that it is recovery itself writing that record
		 * instead of simulating GT.M activity. Since the variable jgbl.dont_reset_gbl_jrec_time is still set, gdsfilext
		 * will NOT modify jgbl.gbl_jrec_time. Temporarily reset it to allow for adjustments to gbl_jrec_time.
		 */
		assert(jgbl.dont_reset_gbl_jrec_time);
		jgbl.dont_reset_gbl_jrec_time = FALSE;
		/* Calculate the number of blocks to add based on the difference between the real file size and the file size
		 * computed from the header->total_blks.  Takes into account that gdsfilext() will automatically add new_bit_maps
		 * to the amount of blocks we request.
		 */
		bplmap = cs_data->bplmap;
		new_blocks = (native_size - size)/(mu_data->blk_size / DISK_BLOCK_SIZE);
		new_bit_maps = DIVIDE_ROUND_UP(total_blks + new_blocks, bplmap) - DIVIDE_ROUND_UP(total_blks, bplmap);
		tmpcnt = new_blocks - new_bit_maps;
		/* Call GDSFILEXT only if the number of blocks by which the DB needs to be extended is not '0', since
		 * GDSFILEXT() treats an extension by count 0 as unavailability of space (NO_FREE_SPACE error). In the
		 * following case, tmpcnt could be '0' on AIX because in MM mode AIX increases native_size to the nearest
		 * multiple of OS_PAGE_SIZE, and this increase could be less than the GT.M block size. */
		if (tmpcnt && SS_NORMAL != (status = GDSFILEXT(new_blocks - new_bit_maps, total_blks, TRANS_IN_PROG_FALSE)))
		{
			jgbl.dont_reset_gbl_jrec_time = TRUE;
			return (status);
		}
		jgbl.dont_reset_gbl_jrec_time = TRUE;
#		ifdef DEBUG
		/* Check that the filesize and blockcount in the fileheader match now after the extend */
		size = mu_int_ovrhd + (off_t)(mu_data->blk_size / DISK_BLOCK_SIZE) * mu_data->trans_hist.total_blks;
		native_size = gds_file_size(gv_cur_region->dyn.addr->file_cntl);
		ALIGN_DBFILE_SIZE_IF_NEEDED(size, native_size);
		assert(size == native_size);
#		endif
	}
	return SS_NORMAL;
}
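Both copies of mur_block_count_correct derive the expected file size from the header the same way; a standalone sketch of that computation is below. A DISK_BLOCK_SIZE of 512 and the values in main() are assumed for illustration only.

#include <stdio.h>

#define DISK_BLOCK_SIZE	512

/* Expected file size in DISK_BLOCK_SIZE units: overhead blocks plus
 * (blk_size / DISK_BLOCK_SIZE) disk blocks per GDS block. The on-disk EOF
 * block makes the native size one disk block larger than this figure.
 */
static unsigned long long expected_db_size(unsigned int ovrhd, unsigned int blk_size, unsigned int total_blks)
{
	return (unsigned long long)ovrhd + ((unsigned long long)(blk_size / DISK_BLOCK_SIZE)) * total_blks;
}

int main(void)
{
	/* e.g. 513 overhead blocks, 4KiB GDS blocks, 1000 blocks -> 8513 disk blocks */
	printf("%llu disk blocks\n", expected_db_size(513, 4096, 1000));
	return 0;
}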
Exemplo n.º 22
0
/* This function is called primarily to append a new histinfo record to the replication instance file by one of the following
 *	1) MUPIP REPLIC -SOURCE -START -ROOTPRIMARY command (after forking the child source server) if it created the journal pool.
 *	2) MUPIP REPLIC -SOURCE -ACTIVATE -ROOTPRIMARY command if this is a propagating primary to root primary transition.
 * In addition, this function also initializes the "lms_group_info" field in the instance file (from the "inst_info" field)
 *	if the current value is NULL.
 */
void	gtmsource_rootprimary_init(seq_num start_seqno)
{
	unix_db_info		*udi;
	repl_histinfo		histinfo;
	boolean_t		was_crit, switch_jnl;
	gd_region		*reg, *region_top;
	jnl_private_control	*jpc;
	jnl_buffer_ptr_t	jbp;
	uint4			jnl_status;

	udi = FILE_INFO(jnlpool.jnlpool_dummy_reg);
	assert(NULL != jnlpool.repl_inst_filehdr);
	/* Update journal pool fields to reflect this is a root primary startup and updates are enabled */
	assert(!udi->s_addrs.hold_onto_crit || jgbl.onlnrlbk);
	was_crit = udi->s_addrs.now_crit;
	if (!was_crit)
		grab_lock(jnlpool.jnlpool_dummy_reg, TRUE, ASSERT_NO_ONLINE_ROLLBACK);
	jnlpool.repl_inst_filehdr->root_primary_cycle++;
	/* If this instance is transitioning from a non-rootprimary to rootprimary, switch journal files.
	 * This helps with maintaining accurate value of csd->zqgblmod_tn when the former primary connects
	 * to the current primary through a fetchresync-rollback or receiver-server-autorollback.
	 */
	switch_jnl = (!jnlpool.repl_inst_filehdr->was_rootprimary && (0 < jnlpool.repl_inst_filehdr->num_histinfo));
	jnlpool.repl_inst_filehdr->was_rootprimary = TRUE;
	assert(start_seqno >= jnlpool.jnlpool_ctl->start_jnl_seqno);
	assert(start_seqno == jnlpool.jnlpool_ctl->jnl_seqno);
	jnlpool.repl_inst_filehdr->jnl_seqno = start_seqno;
	assert(jgbl.onlnrlbk || jnlpool.jnlpool_ctl->upd_disabled);
	if (!jgbl.onlnrlbk)
		jnlpool.jnlpool_ctl->upd_disabled = FALSE;
	if (IS_REPL_INST_UUID_NULL(jnlpool.repl_inst_filehdr->lms_group_info))
	{	/* This is the first time this instance is being brought up either as a root primary or as a propagating
		 * primary. Initialize the "lms_group_info" fields in the instance file header in journal pool shared memory.
		 * They will be flushed to the instance file as part of the "repl_inst_histinfo_add -> repl_inst_flush_filehdr"
		 * function invocation below.
		 */
		assert('\0' == jnlpool.repl_inst_filehdr->lms_group_info.created_nodename[0]);
		assert('\0' == jnlpool.repl_inst_filehdr->lms_group_info.this_instname[0]);
		assert(!jnlpool.repl_inst_filehdr->lms_group_info.creator_pid);
		jnlpool.repl_inst_filehdr->lms_group_info = jnlpool.repl_inst_filehdr->inst_info;
		assert('\0' != jnlpool.repl_inst_filehdr->lms_group_info.created_nodename[0]);
		DBG_CHECK_CREATED_NODENAME(jnlpool.repl_inst_filehdr->lms_group_info.created_nodename);
		assert('\0' != jnlpool.repl_inst_filehdr->lms_group_info.this_instname[0]);
		assert(jnlpool.repl_inst_filehdr->lms_group_info.created_time);
		assert(jnlpool.repl_inst_filehdr->lms_group_info.creator_pid);
	}
	/* Initialize histinfo fields */
	memcpy(histinfo.root_primary_instname, jnlpool.repl_inst_filehdr->inst_info.this_instname, MAX_INSTNAME_LEN - 1);
	histinfo.root_primary_instname[MAX_INSTNAME_LEN - 1] = '\0';
	assert('\0' != histinfo.root_primary_instname[0]);
	histinfo.start_seqno = start_seqno;
	assert(jnlpool.jnlpool_ctl->strm_seqno[0] == jnlpool.repl_inst_filehdr->strm_seqno[0]);
	assert(jnlpool.repl_inst_filehdr->is_supplementary || (0 == jnlpool.jnlpool_ctl->strm_seqno[0]));
	histinfo.strm_seqno = (!jnlpool.repl_inst_filehdr->is_supplementary) ? 0 : jnlpool.jnlpool_ctl->strm_seqno[0];
	histinfo.root_primary_cycle = jnlpool.repl_inst_filehdr->root_primary_cycle;
	assert(process_id == getpid());
	histinfo.creator_pid = process_id;
	JNL_SHORT_TIME(histinfo.created_time);
	histinfo.strm_index = 0;
	histinfo.history_type = HISTINFO_TYPE_NORMAL;
	NULL_INITIALIZE_REPL_INST_UUID(histinfo.lms_group);
	/* The following fields will be initialized in the "repl_inst_histinfo_add" function call below.
	 *	histinfo.histinfo_num
	 *	histinfo.prev_histinfo_num
	 *	histinfo.last_histinfo_num[]
	 */
	/* Add the histinfo record to the instance file and flush the changes in the journal pool to the file header */
	repl_inst_histinfo_add(&histinfo);
	if (!was_crit)
		rel_lock(jnlpool.jnlpool_dummy_reg);
	if (switch_jnl)
	{
		SET_GBL_JREC_TIME; /* jnl_ensure_open/jnl_file_extend and its callees assume jgbl.gbl_jrec_time is set */
		for (reg = gd_header->regions, region_top = gd_header->regions + gd_header->n_regions; reg < region_top; reg++)
		{
			gv_cur_region = reg;
			change_reg();		/* sets cs_addrs/cs_data (needed by jnl_ensure_open) */
			if (!JNL_ENABLED(cs_addrs))
				continue;
			grab_crit(gv_cur_region);
			jpc = cs_addrs->jnl;
			/* Before writing to jnlfile, adjust jgbl.gbl_jrec_time if needed to maintain time order of jnl
			 * records. This needs to be done BEFORE the jnl_ensure_open as that could write journal records
			 * (if it decides to switch to a new journal file)
			 */
			jbp = jpc->jnl_buff;
			ADJUST_GBL_JREC_TIME(jgbl, jbp);
			jnl_status = jnl_ensure_open();
			if (0 == jnl_status)
			{
				if (EXIT_ERR == SWITCH_JNL_FILE(jpc))
					rts_error_csa(CSA_ARG(cs_addrs) VARLSTCNT(4) ERR_JNLEXTEND, 2, JNL_LEN_STR(cs_data));
			} else
			{
				if (SS_NORMAL != jpc->status)
					rts_error_csa(CSA_ARG(cs_addrs) VARLSTCNT(7) jnl_status, 4, JNL_LEN_STR(cs_data),
							DB_LEN_STR(gv_cur_region), jpc->status);
				else
					rts_error_csa(CSA_ARG(cs_addrs) VARLSTCNT(6) jnl_status, 4, JNL_LEN_STR(cs_data),
							DB_LEN_STR(gv_cur_region));
			}
			rel_crit(gv_cur_region);
		}
	}
}
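As the comments above describe, ADJUST_GBL_JREC_TIME exists to keep journal record timestamps non-decreasing even if the system clock steps backwards. A hedged sketch of that invariant follows; the real macro operates on jgbl/jbp fields, and the names here are illustrative.

typedef unsigned int jnl_time_t;	/* stand-in for the journal timestamp type */

/* Never let a new journal record carry a timestamp older than the last one
 * already written; clamp forward instead of trusting a stepped-back clock.
 */
static jnl_time_t adjust_jrec_time(jnl_time_t new_time, jnl_time_t last_written_time)
{
	return (new_time < last_written_time) ? last_written_time : new_time;
}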
Exemplo n.º 23
0
void	ccp_tr_opendb3(ccp_action_record *rec)
{
	ccp_db_header		*db;
	sgmnt_addrs		*csa;
	sgmnt_data		*csd;
	jnl_private_control	*jpc;
	uint4		status;


	db = rec->v.h;
	db->master_map_start_tn = 0;

	csa = db->segment;
	csd = db->glob_sec
	    = csa->hdr
	    = csa->db_addrs[0];

	adawi(1, &csa->nl->ref_cnt);	/* GT.M process that sent open request */
	csa->lock_addrs[0] = (unsigned char *)csd + (LOCK_BLOCK(csd) * DISK_BLOCK_SIZE);
	csa->lock_addrs[1] = csa->lock_addrs[0] + csd->lock_space_size;

	if (JNL_ENABLED(csd))
	{
		jpc = csa->jnl
		    = malloc(SIZEOF(jnl_private_control));
		memset(jpc, 0, SIZEOF(jnl_private_control));

		jpc->region = db->greg;
		jpc->jnl_buff = csa->lock_addrs[1] + CACHE_CONTROL_SIZE(csd) + JNL_NAME_EXP_SIZE;

		FILE_INFO(db->greg)->s_addrs.jnl->jnllsb->lockid = 0;

		if (db->wm_iosb.valblk[CCP_VALBLK_JNL_ADDR] == 0)
		{
			/* Open jnl file based on data in file header, first machine to open */
			jnl_file_open(db->greg, FALSE, ccp_closejnl_ast);
			if (jpc->channel == 0)
			{
				/* Open failed */
				ccp_close1(db);
				ccp_signal_cont(ERR_CCPJNLOPNERR);	/***** Is this reasonable? *****/
				return;
			}

			/* Write out file id for other machines */
			csd->trans_hist.ccp_jnl_filesize = jpc->jnl_buff->filesize;
			csd->ccp_jnl_before = jpc->jnl_buff->before_images;

			status = sys$qio(0, FILE_INFO(db->greg)->fab->fab$l_stv, IO$_WRITEVBLK, &db->qio_iosb, ccp_opendb3a, db,
					 csd, (MM_BLOCK - 1) * OS_PAGELET_SIZE, 1, 0, 0, 0);
		}
		else
		{
/* ??? --> */		/* Open jnl file based on id in header.  Read in header to make sure get file id from first machine */

			status = sys$qio(0, FILE_INFO(db->greg)->fab->fab$l_stv, IO$_READVBLK, &db->qio_iosb, ccp_opendb3a, db,
					 csd, (MM_BLOCK - 1) * OS_PAGELET_SIZE, 1, 0, 0, 0);
		}

		if (status != SS$_NORMAL)
		{
			ccp_signal_cont(status);	/***** Is this reasonable? *****/
			ccp_close1(db);
		}
	}
	else
		ccp_opendb3b(db);

}
Exemplo n.º 24
0
uint4 gdsfilext(uint4 blocks, uint4 filesize)
{
	sm_uc_ptr_t		old_base[2];
	boolean_t		was_crit, need_to_restore_mask = FALSE;
	char			*buff;
	int			mm_prot, result, save_errno, status;
	uint4			new_bit_maps, bplmap, map, new_blocks, new_total, max_tot_blks;
	uint4			jnl_status, to_wait, to_msg, wait_period;
	GTM_BAVAIL_TYPE		avail_blocks;
	sgmnt_data_ptr_t	tmp_csd;
	off_t			new_eof;
	trans_num		curr_tn;
	unix_db_info		*udi;
	sigset_t		savemask;
	inctn_opcode_t		save_inctn_opcode;
	int4			prev_extend_blks_to_upgrd;
	jnl_private_control	*jpc;
	jnl_buffer_ptr_t	jbp;

	error_def(ERR_DBFILERR);
	error_def(ERR_DBFILEXT);
	error_def(ERR_DSKSPACEFLOW);
	error_def(ERR_JNLFLUSH);
	error_def(ERR_TEXT);
	error_def(ERR_TOTALBLKMAX);
	error_def(ERR_WAITDSKSPACE);

#ifdef __hppa
	if (dba_mm == cs_addrs->hdr->acc_meth)
		return (uint4)(NO_FREE_SPACE); /* should this be changed to show extension not allowed ? */
#endif

	/* Both blocks and total blocks are unsigned ints so make sure we aren't asking for huge numbers that will
	   overflow and end up doing silly things.
	*/
	assert(blocks <= (MAXTOTALBLKS(cs_data) - cs_data->trans_hist.total_blks));

	if (!blocks)
		return (uint4)(NO_FREE_SPACE); /* should this be changed to show extension not enabled ? */
	bplmap = cs_data->bplmap;
	/* new total of non-bitmap blocks will be number of current, non-bitmap blocks, plus new blocks desired
	   There are (bplmap - 1) non-bitmap blocks per bitmap, so add (bplmap - 2) to number of non-bitmap blocks
		and divide by (bplmap - 1) to get total number of bitmaps for expanded database. (must round up in this
		manner as every non-bitmap block must have an associated bitmap)
	   Current number of bitmaps is (total number of current blocks + bplmap - 1) / bplmap.
	   Subtract current number of bitmaps from number needed for expanded database to get number of new bitmaps needed.
	*/
	new_bit_maps = DIVIDE_ROUND_UP(cs_data->trans_hist.total_blks
			- DIVIDE_ROUND_UP(cs_data->trans_hist.total_blks, bplmap) + blocks, bplmap - 1)
			- DIVIDE_ROUND_UP(cs_data->trans_hist.total_blks, bplmap);
	new_blocks = blocks + new_bit_maps;
	assert(0 < (int)new_blocks);
	udi = FILE_INFO(gv_cur_region);
	if (0 != (save_errno = disk_block_available(udi->fd, &avail_blocks, FALSE)))
	{
		send_msg(VARLSTCNT(5) ERR_DBFILERR, 2, DB_LEN_STR(gv_cur_region), save_errno);
		rts_error(VARLSTCNT(5) ERR_DBFILERR, 2, DB_LEN_STR(gv_cur_region), save_errno);
	} else
	{
		avail_blocks = avail_blocks / (cs_data->blk_size / DISK_BLOCK_SIZE);
		if ((blocks * EXTEND_WARNING_FACTOR) > avail_blocks)
		{
			send_msg(VARLSTCNT(5) ERR_DSKSPACEFLOW, 3, DB_LEN_STR(gv_cur_region),
					(uint4)(avail_blocks - ((new_blocks <= avail_blocks) ? new_blocks : 0)));
#ifndef __MVS__
			if (blocks > (uint4)avail_blocks)
				return (uint4)(NO_FREE_SPACE);
#endif
		}
	}
	cs_addrs->extending = TRUE;
	was_crit = cs_addrs->now_crit;
	/* If we are coming from mupip_extend (which gets crit itself) we better have waited for any unfreezes to occur */
	assert(!was_crit || CDB_STAGNATE == t_tries || FALSE == cs_data->freeze);
	for ( ; ; )
	{	/* If we are in the final retry and already hold crit, it is possible that csd->wc_blocked is also set to TRUE
		 * (by a concurrent process in phase2 which encountered an error in the midst of commit and secshr_db_clnup
		 * finished the job for it). In this case we do NOT want to invoke wcs_recover as that will update the "bt"
		 * transaction numbers without correspondingly updating the history transaction numbers (effectively causing
		 * a cdb_sc_blkmod type of restart). Therefore do NOT call grab_crit (which unconditionally invokes wcs_recover)
		 * if we already hold crit.
		 */
		if (!was_crit)
			grab_crit(gv_cur_region);
		if (FALSE == cs_data->freeze)
			break;
		rel_crit(gv_cur_region);
		if (was_crit)
		{	/* Two cases.
			 * (i)  Final retry and in TP. We might be holding crit in other regions too.
			 *	We can't do a grab_crit() on this region again unless it is deadlock-safe.
			 *      To be on the safer side, we do a restart. The tp_restart() logic will wait
			 *	for this region's freeze to be removed before grabbing crit.
			 * (ii) Final retry and not in TP. In that case too, it is better to restart in case there is
			 *	some validation code that shortcuts the checking for the final retry assuming we were
			 *	in crit from t_begin() to t_end(). t_retry() has logic that will wait for unfreeze.
			 * In either case, we need to restart. Returning EXTEND_UNFREEZECRIT will cause one in t_end/tp_tend.
			 */
			return (uint4)(EXTEND_UNFREEZECRIT);
		}
		while (cs_data->freeze)
			hiber_start(1000);
	}
	assert(cs_addrs->ti->total_blks == cs_data->trans_hist.total_blks);
	if (cs_data->trans_hist.total_blks != filesize)
	{
		/* Somebody else has already extended it; since we are in crit, this is trustworthy.
		 * However, in case of MM, we still need to remap the database. */
		assert(cs_data->trans_hist.total_blks > filesize);
		GDSFILEXT_CLNUP;
		return (SS_NORMAL);
	}
	if (run_time && (2 * ((0 < dollar_tlevel) ? sgm_info_ptr->cw_set_depth : cw_set_depth) < cs_addrs->ti->free_blocks))
	{
		if (FALSE == was_crit)
		{
			rel_crit(gv_cur_region);
			return (uint4)(EXTEND_SUSPECT);
		}
		/* If free_blocks counter is not ok, then correct it. Do the check again. If still fails, then GTMASSERT. */
		if (is_free_blks_ctr_ok() ||
				(2 * ((0 < dollar_tlevel) ? sgm_info_ptr->cw_set_depth : cw_set_depth) < cs_addrs->ti->free_blocks))
			GTMASSERT;	/* held crit through bm_getfree into gdsfilext and still didn't get it right */
	}
	if (JNL_ENABLED(cs_data))
	{
		if (!jgbl.dont_reset_gbl_jrec_time)
			SET_GBL_JREC_TIME;	/* needed before jnl_ensure_open as that can write jnl records */
		jpc = cs_addrs->jnl;
		jbp = jpc->jnl_buff;
		/* Before writing to jnlfile, adjust jgbl.gbl_jrec_time if needed to maintain time order
		 * of jnl records. This needs to be done BEFORE the jnl_ensure_open as that could write
		 * journal records (if it decides to switch to a new journal file).
		 */
		ADJUST_GBL_JREC_TIME(jgbl, jbp);
		jnl_status = jnl_ensure_open();
		if (jnl_status)
		{
			GDSFILEXT_CLNUP;
			send_msg(VARLSTCNT(6) jnl_status, 4, JNL_LEN_STR(cs_data), DB_LEN_STR(gv_cur_region));
			return (uint4)(NO_FREE_SPACE);	/* should have better return status */
		}
	}
	if (dba_mm == cs_addrs->hdr->acc_meth)
	{
#if defined(UNTARGETED_MSYNC)
		status = msync((caddr_t)cs_addrs->db_addrs[0], (size_t)(cs_addrs->db_addrs[1] - cs_addrs->db_addrs[0]), MS_SYNC);
#else
		cs_addrs->nl->mm_extender_pid = process_id;
		status = wcs_wtstart(gv_cur_region, 0);
		cs_addrs->nl->mm_extender_pid = 0;
		if (0 != cs_addrs->acc_meth.mm.mmblk_state->mmblkq_active.fl)
			GTMASSERT;
		status = 0;
#endif
		if (0 == status)
		{
			/* Block SIGALRM for the duration when cs_data and cs_addrs are out of sync */
			sigprocmask(SIG_BLOCK, &blockalrm, &savemask);
			need_to_restore_mask = TRUE;
			tmp_csd = cs_data;
			cs_data = (sgmnt_data_ptr_t)malloc(sizeof(*cs_data));
			memcpy((sm_uc_ptr_t)cs_data, (uchar_ptr_t)tmp_csd, sizeof(*cs_data));
			status = munmap((caddr_t)cs_addrs->db_addrs[0],
					     (size_t)(cs_addrs->db_addrs[1] - cs_addrs->db_addrs[0]));
#ifdef DEBUG_DB64
			if (-1 != status)
				rel_mmseg((caddr_t)cs_addrs->db_addrs[0]);
#endif
		} else
			tmp_csd = NULL;
		if (0 != status)
		{
			if (tmp_csd)
			{
				free(cs_data);
				cs_data = tmp_csd;
			}
			GDSFILEXT_CLNUP;
			send_msg(VARLSTCNT(5) ERR_DBFILERR, 2, DB_LEN_STR(gv_cur_region), status);
			return (uint4)(NO_FREE_SPACE);
		}
		cs_addrs->hdr = cs_data;
		cs_addrs->ti = &cs_data->trans_hist;
	}
	if (new_blocks + cs_data->trans_hist.total_blks > MAXTOTALBLKS(cs_data))
	{
		GDSFILEXT_CLNUP;
		send_msg(VARLSTCNT(1) ERR_TOTALBLKMAX);
		return (uint4)(NO_FREE_SPACE);
	}
	CHECK_TN(cs_addrs, cs_data, cs_data->trans_hist.curr_tn);	/* can issue rts_error TNTOOLARGE */
	assert(0 < (int)new_blocks);
	new_total = cs_data->trans_hist.total_blks + new_blocks;
	new_eof = ((off_t)(cs_data->start_vbn - 1) * DISK_BLOCK_SIZE) + ((off_t)new_total * cs_data->blk_size);
	buff = (char *)malloc(DISK_BLOCK_SIZE);
	memset(buff, 0, DISK_BLOCK_SIZE);
	LSEEKWRITE(udi->fd, new_eof, buff, DISK_BLOCK_SIZE, save_errno);
	if ((ENOSPC == save_errno) && run_time)
	{
		/* Try to write it every second, and send a message to the operator log
		 * periodically (around 8 messages during each wait period, per to_msg below).
		 */
		wait_period = to_wait = DIVIDE_ROUND_UP(cs_data->wait_disk_space, CDB_STAGNATE + 1);
		to_msg = (to_wait / 8) ? (to_wait / 8) : 1;		/* send around 8 messages during 1 wait_period */
		while ((to_wait > 0) && (ENOSPC == save_errno))
		{
			if ((to_wait == cs_data->wait_disk_space) || (to_wait % to_msg == 0))
			{
				send_msg(VARLSTCNT(11) ERR_WAITDSKSPACE, 4, process_id,
					to_wait + (CDB_STAGNATE - t_tries) * wait_period, DB_LEN_STR(gv_cur_region),
					ERR_TEXT, 2,
					RTS_ERROR_TEXT("Please make more disk space available or shutdown GT.M to avoid data loss"),
					save_errno);
				gtm_putmsg(VARLSTCNT(11) ERR_WAITDSKSPACE, 4, process_id,
					to_wait + (CDB_STAGNATE - t_tries) * wait_period, DB_LEN_STR(gv_cur_region),
					ERR_TEXT, 2,
					RTS_ERROR_TEXT("Please make more disk space available or shutdown GT.M to avoid data loss"),
					save_errno);
			}
			if (!was_crit)
				rel_crit(gv_cur_region);
			hiber_start(1000);
			to_wait--;
			if (!was_crit)
				grab_crit(gv_cur_region);
			LSEEKWRITE(udi->fd, new_eof, buff, DISK_BLOCK_SIZE, save_errno);
		}
	}
	free(buff);
	if (0 != save_errno)
	{
		GDSFILEXT_CLNUP;
		if (ENOSPC == save_errno)
			return (uint4)(NO_FREE_SPACE);
		send_msg(VARLSTCNT(5) ERR_DBFILERR, 2, DB_LEN_STR(gv_cur_region), save_errno);
		return (uint4)(NO_FREE_SPACE);
	}
	DEBUG_ONLY(prev_extend_blks_to_upgrd = cs_data->blks_to_upgrd;)
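The listing above breaks off after the EOF write, but the ENOSPC handling it contains re-attempts the write once a second while periodically nagging the operator log. A compact sketch of the same retry shape using plain POSIX calls, with wait_secs and the message cadence as illustrative parameters:

#include <errno.h>
#include <stdio.h>
#include <sys/types.h>
#include <unistd.h>

/* Retry a write at "off" once per second for up to wait_secs seconds while the
 * filesystem is full, logging a reminder every msg_every attempts (msg_every
 * must be > 0). A short write is treated as a failure; errno may then be stale,
 * which is acceptable for a sketch. Returns 0 on success, else the last errno.
 */
static int write_with_enospc_retry(int fd, off_t off, const void *buf, size_t len,
				   int wait_secs, int msg_every)
{
	int	to_wait, save_errno = 0;

	for (to_wait = wait_secs; 0 <= to_wait; to_wait--)
	{
		if ((ssize_t)len == pwrite(fd, buf, len, off))
			return 0;			/* full write succeeded */
		save_errno = errno;
		if (ENOSPC != save_errno)
			break;				/* an error we cannot wait out */
		if (0 == (to_wait % msg_every))
			fprintf(stderr, "waiting for disk space (%d seconds left)\n", to_wait);
		sleep(1);
	}
	return save_errno;
}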
Exemplo n.º 25
0
/* Finds a free block and adds information to update array and cw_set */
block_id swap_root_or_directory_block(int parent_blk_lvl, int child_blk_lvl, srch_hist *dir_hist_ptr, block_id child_blk_id,
		sm_uc_ptr_t child_blk_ptr, kill_set *kill_set_list, trans_num curr_tn)
{
	sgmnt_data_ptr_t	csd;
	sgmnt_addrs		*csa;
	node_local_ptr_t	cnl;
	srch_blk_status		bmlhist, freeblkhist;
	block_id		hint_blk_num, free_blk_id, parent_blk_id;
	boolean_t		free_blk_recycled;
	int4			master_bit, num_local_maps, free_bit, hint_bit, maxbitsthismap;
	uint4			total_blks;
	int			blk_seg_cnt, blk_size;
	sm_uc_ptr_t		parent_blk_ptr, bn_ptr, saved_blk;
	blk_segment		*bs1, *bs_ptr;
	int			parent_blk_size, child_blk_size, bsiz;
	int			rec_size1, curr_offset, bpntr_end, hdr_len;
	int			tmp_cmpc;
	cw_set_element		*tmpcse;
	jnl_buffer_ptr_t	jbbp; /* jbbp is non-NULL only if before-image journaling */
	unsigned short		temp_ushort;
	unsigned long		temp_long;
	unsigned char		save_cw_set_depth;
	DCL_THREADGBL_ACCESS;

	SETUP_THREADGBL_ACCESS;
	csd = cs_data;
	csa = cs_addrs;
	cnl = csa->nl;
	blk_size = csd->blk_size;
	/* Find a free/recycled block for new block location. */
	hint_blk_num = 0;
	total_blks = csa->ti->total_blks;
	num_local_maps = DIVIDE_ROUND_UP(total_blks, BLKS_PER_LMAP);
	master_bit = bmm_find_free((hint_blk_num / BLKS_PER_LMAP), csa->bmm, num_local_maps);
	if (NO_FREE_SPACE == master_bit)
	{
		t_abort(gv_cur_region, csa);
		return ABORT_SWAP;
	}
	bmlhist.blk_num = (block_id)master_bit * BLKS_PER_LMAP;
	if (NULL == (bmlhist.buffaddr = t_qread(bmlhist.blk_num, (sm_int_ptr_t)&bmlhist.cycle, &bmlhist.cr)))
	{
		assert(t_tries < CDB_STAGNATE);
		t_retry((enum cdb_sc)rdfail_detail);
		return RETRY_SWAP;
	}
	hint_bit = 0;
	maxbitsthismap = (master_bit != (num_local_maps - 1)) ? BLKS_PER_LMAP : total_blks - bmlhist.blk_num;
	free_bit = bm_find_blk(hint_bit, bmlhist.buffaddr + SIZEOF(blk_hdr), maxbitsthismap, &free_blk_recycled);
	free_blk_id = bmlhist.blk_num + free_bit;
	if (DIR_ROOT >= free_blk_id)
	{	/* Bitmap block 0 and directory tree root block 1 should always be marked busy. */
		assert(t_tries < CDB_STAGNATE);
		t_retry(cdb_sc_badbitmap);
		return RETRY_SWAP;
	}
	if (child_blk_id <= free_blk_id)
	{	/* Stop swapping root or DT blocks once the database is truncated well enough. A good heuristic for this
		 * is to check whether the block would be swapped into a higher block number; if so, do not swap.
		 */
		t_abort(gv_cur_region, csa);
		return ABORT_SWAP;
	}
	/* ====== begin update array ======
	 * Four blocks get changed.
	 * 	1. Free block becomes busy and gains the contents of child (root block/directory tree block)
	 * 	2. Parent block in directory tree remains busy, but points to new root block location.
	 *	3. Free block's corresponding bitmap reflects above change.
	 * 	4. Child block gets marked recycled in bitmap. (GVCST_BMP_MARK_FREE)
	 */
	parent_blk_ptr = dir_hist_ptr->h[parent_blk_lvl].buffaddr; /* parent_blk_lvl is 0 iff we're moving a gvt root block */
	parent_blk_id = dir_hist_ptr->h[parent_blk_lvl].blk_num;
	CHECK_AND_RESET_UPDATE_ARRAY;
	if (free_blk_recycled)
	{	/* Otherwise, it's a completely free block, in which case no need to read. */
		freeblkhist.blk_num = (block_id)free_blk_id;
		if (NULL == (freeblkhist.buffaddr = t_qread(free_blk_id, (sm_int_ptr_t)&freeblkhist.cycle, &freeblkhist.cr)))
		{
			assert(t_tries < CDB_STAGNATE);
			t_retry((enum cdb_sc)rdfail_detail);
			return RETRY_SWAP;
		}
	}
	child_blk_size = ((blk_hdr_ptr_t)child_blk_ptr)->bsiz;
	BLK_INIT(bs_ptr, bs1);
	BLK_ADDR(saved_blk, child_blk_size, unsigned char);
	memcpy(saved_blk, child_blk_ptr, child_blk_size);
	BLK_SEG(bs_ptr, saved_blk + SIZEOF(blk_hdr), child_blk_size - SIZEOF(blk_hdr));
	assert(blk_seg_cnt == child_blk_size);
	if (!BLK_FINI(bs_ptr, bs1))
	{
		assert(t_tries < CDB_STAGNATE);
		t_retry(cdb_sc_blkmod);
		return RETRY_SWAP;
	}
	tmpcse = &cw_set[cw_set_depth];
	(free_blk_recycled) ? BIT_SET_RECYCLED_AND_CLEAR_FREE(tmpcse->blk_prior_state)
			    : BIT_CLEAR_RECYCLED_AND_SET_FREE(tmpcse->blk_prior_state);
	t_create(free_blk_id, (unsigned char *)bs1, 0, 0, child_blk_lvl);
	tmpcse->mode = gds_t_acquired;
	if (!free_blk_recycled || !cs_data->db_got_to_v5_once)
		tmpcse->old_block = NULL;
	else
	{
		tmpcse->old_block = freeblkhist.buffaddr;
		tmpcse->cr = freeblkhist.cr;
		tmpcse->cycle = freeblkhist.cycle;
		jbbp = (JNL_ENABLED(csa) && csa->jnl_before_image) ? csa->jnl->jnl_buff : NULL;
		if ((NULL != jbbp) && (((blk_hdr_ptr_t)tmpcse->old_block)->tn < jbbp->epoch_tn))
		{
			bsiz = ((blk_hdr_ptr_t)(tmpcse->old_block))->bsiz;
			if (bsiz > blk_size)
			{
				assert(CDB_STAGNATE > t_tries);
				t_retry(cdb_sc_lostbmlcr);
				return RETRY_SWAP;
			}
			JNL_GET_CHECKSUM_ACQUIRED_BLK(tmpcse, csd, csa, tmpcse->old_block, bsiz);
		}
	}
	/* 2. Parent block in directory tree remains busy, but points to new child block location. */
	curr_offset = dir_hist_ptr->h[parent_blk_lvl].curr_rec.offset;
	parent_blk_size = ((blk_hdr_ptr_t)parent_blk_ptr)->bsiz;
	GET_RSIZ(rec_size1, (parent_blk_ptr + curr_offset));
	if ((parent_blk_size < rec_size1 + curr_offset) || (BSTAR_REC_SIZE > rec_size1))
	{
		assert(t_tries < CDB_STAGNATE);
		t_retry(cdb_sc_blkmod);
		return RETRY_SWAP;
	}
	BLK_INIT(bs_ptr, bs1);
	if (0 == parent_blk_lvl)
		/* There can be collation stuff in the record value after the block pointer. See gvcst_root_search. */
		hdr_len = SIZEOF(rec_hdr) + gv_altkey->end + 1 - EVAL_CMPC((rec_hdr_ptr_t)(parent_blk_ptr + curr_offset));
	else
		hdr_len = rec_size1 - SIZEOF(block_id);
	bpntr_end = curr_offset + hdr_len + SIZEOF(block_id);
	BLK_SEG(bs_ptr, parent_blk_ptr + SIZEOF(blk_hdr), curr_offset + hdr_len - SIZEOF(blk_hdr));
	BLK_ADDR(bn_ptr, SIZEOF(block_id), unsigned char);
	PUT_LONG(bn_ptr, free_blk_id);
	BLK_SEG(bs_ptr, bn_ptr, SIZEOF(block_id));
	BLK_SEG(bs_ptr, parent_blk_ptr + bpntr_end, parent_blk_size - bpntr_end);
	assert(blk_seg_cnt == parent_blk_size);
	if (!BLK_FINI(bs_ptr, bs1))
	{
		assert(t_tries < CDB_STAGNATE);
		t_retry(cdb_sc_blkmod);
		return RETRY_SWAP;
	}
	t_write(&dir_hist_ptr->h[parent_blk_lvl], (unsigned char *)bs1, 0, 0, parent_blk_lvl, FALSE, TRUE, GDS_WRITE_KILLTN);
	/* Indicate to the later snapshot-file writing process (during fast_integ) that it must not skip writing this block */
	BIT_SET_DIR_TREE(cw_set[cw_set_depth-1].blk_prior_state);
	/* 3. Free block's corresponding bitmap reflects above change. */
	PUT_LONG(update_array_ptr, free_bit);
	save_cw_set_depth = cw_set_depth; /* Bit maps go on end of cw_set (more fake acquired) */
	assert(!cw_map_depth);
	t_write_map(&bmlhist, (uchar_ptr_t)update_array_ptr, curr_tn, 1);
	cw_map_depth = cw_set_depth;
	cw_set_depth = save_cw_set_depth;
	update_array_ptr += SIZEOF(block_id);
	temp_long = 0;
	PUT_LONG(update_array_ptr, temp_long);
	update_array_ptr += SIZEOF(block_id);
	assert(1 == cw_set[cw_map_depth - 1].reference_cnt);
	/* 4. Child block gets marked recycled in bitmap. (GVCST_BMP_MARK_FREE) */
	kill_set_list->blk[kill_set_list->used].flag = 0;
	kill_set_list->blk[kill_set_list->used].level = 0;
	kill_set_list->blk[kill_set_list->used++].block = child_blk_id;
	return free_blk_id;
}
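swap_root_or_directory_block leans on bmm_find_free/bm_find_blk to locate a free or recycled block starting from a hint. The toy byte-per-block scan below illustrates the hint-then-wrap search shape only; the real GT.M bitmaps pack two state bits per block, so this is a simplification, not the actual bitmap format.

/* Scan "nblks" one-byte block states for the first free slot, starting at
 * "hint" and wrapping around; returns the index or -1 if everything is busy.
 */
static int find_free_blk(const unsigned char *states, int nblks, int hint)
{
	int	i, idx;

	for (i = 0; i < nblks; i++)
	{
		idx = (hint + i) % nblks;
		if (0 == states[idx])	/* 0 == free in this toy encoding */
			return idx;
	}
	return -1;
}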
Exemplo n.º 26
0
void bm_setmap(block_id bml, block_id blk, int4 busy)
{
	sm_uc_ptr_t	bmp;
	trans_num	ctn;
	srch_hist	alt_hist;
	srch_blk_status	blkhist; /* block-history to fill in for t_write_map which uses "blk_num", "buffaddr", "cr", "cycle" */
	cw_set_element  *cse;
	int		lbm_status;	/* local bitmap status of input "blk" i.e. BUSY or FREE or RECYCLED  */
	int4		reference_cnt;
	uint4		bitnum;

	error_def(ERR_DSEFAIL);

	t_begin_crit(ERR_DSEFAIL);
	ctn = cs_addrs->ti->curr_tn;
	if (!(bmp = t_qread(bml, &blkhist.cycle, &blkhist.cr)))
		t_retry((enum cdb_sc)rdfail_detail);
	blkhist.blk_num = bml;
	blkhist.buffaddr = bmp;
	alt_hist.h[0].blk_num = 0;	/* Need for calls to T_END for bitmaps */
	CHECK_AND_RESET_UPDATE_ARRAY;	/* reset update_array_ptr to update_array */
	bitnum = blk - bml;
	/* Find out current status in order to determine if there is going to be a state transition */
	assert(ROUND_DOWN2(blk, cs_data->bplmap) == bml);
	GET_BM_STATUS(bmp, bitnum, lbm_status);
	switch(lbm_status)
	{
		case BLK_BUSY:
			reference_cnt = busy ? 0 : -1;
			break;
		case BLK_FREE:
		case BLK_MAPINVALID:
		case BLK_RECYCLED:
			assert(BLK_MAPINVALID != lbm_status);
			reference_cnt = busy ? 1 : 0;
			break;
		default:
			assert(FALSE);
			break;
	}
	if (reference_cnt)
	{	/* Initialize update array with non-zero bitnum only if reference_cnt is non-zero. */
		assert(bitnum);
		*((block_id_ptr_t)update_array_ptr) = bitnum;
		update_array_ptr += sizeof(block_id);
	}
	/* Terminate update array unconditionally with zero bitnum. */
	*((block_id_ptr_t)update_array_ptr) = 0;
	update_array_ptr += sizeof(block_id);
	t_write_map(&blkhist, (uchar_ptr_t)update_array, ctn, reference_cnt);
	if (JNL_ENABLED(cs_data))
	{
		cse = (cw_set_element *)(&cw_set[0]);
		cse->new_buff = non_tp_jfb_buff_ptr;
		memcpy(non_tp_jfb_buff_ptr, bmp, ((blk_hdr_ptr_t)bmp)->bsiz);
		gvcst_map_build((uint4 *)cse->upd_addr, (uchar_ptr_t)cse->new_buff, cse, cs_addrs->ti->curr_tn);
		cse->done = TRUE;
	}
	/* Call t_end till it succeeds or aborts (error will be reported) */
	while ((trans_num)0 == t_end(&alt_hist, 0))
		;
	return;
}
Exemplo n.º 27
0
int gtmsource_checkhealth(void)
{
	uint4			gtmsource_pid;
	int			status, semval, save_errno;
	boolean_t		srv_alive, all_files_open;
	gtmsource_local_ptr_t	gtmsourcelocal_ptr;
	int4			index, num_servers;
	seq_num			reg_seqno, jnlseqno;
	gd_region		*reg, *region_top;
	sgmnt_addrs		*csa;
	sgmnt_data_ptr_t	csd;
	char			errtxt[OUT_BUFF_SIZE];
	char			*modestr;

	assert(holds_sem[SOURCE][JNL_POOL_ACCESS_SEM]);
	if (NULL != jnlpool.gtmsource_local)	/* Check health of a specific source server */
		gtmsourcelocal_ptr = jnlpool.gtmsource_local;
	else
		gtmsourcelocal_ptr = &jnlpool.gtmsource_local_array[0];
	num_servers = 0;
	status = SRV_ALIVE;
	for (index = 0; index < NUM_GTMSRC_LCL; index++, gtmsourcelocal_ptr++)
	{
		if ('\0' == gtmsourcelocal_ptr->secondary_instname[0])
		{
			assert(NULL == jnlpool.gtmsource_local);
			continue;
		}
		gtmsource_pid = gtmsourcelocal_ptr->gtmsource_pid;
		/* If CHECKHEALTH on a specific secondary instance is requested, print the health information irrespective
		 * of whether a source server for that instance is alive or not. For CHECKHEALTH on ALL secondary instances
		 * print health information only for those instances that have an active or passive source server alive.
		 */
		if ((NULL == jnlpool.gtmsource_local) && (0 == gtmsource_pid))
			continue;
		repl_log(stdout, TRUE, TRUE, "Initiating CHECKHEALTH operation on source server pid [%d] for secondary instance"
			" name [%s]\n", gtmsource_pid, gtmsourcelocal_ptr->secondary_instname);
		srv_alive = (0 == gtmsource_pid) ? FALSE : is_proc_alive(gtmsource_pid, 0);
		if (srv_alive)
		{
			if (GTMSOURCE_MODE_ACTIVE == gtmsourcelocal_ptr->mode)
				modestr = "ACTIVE";
			else if (GTMSOURCE_MODE_ACTIVE_REQUESTED == gtmsourcelocal_ptr->mode)
				modestr = "ACTIVE REQUESTED";
			else if (GTMSOURCE_MODE_PASSIVE == gtmsourcelocal_ptr->mode)
				modestr = "PASSIVE";
			else if (GTMSOURCE_MODE_PASSIVE_REQUESTED == gtmsourcelocal_ptr->mode)
				modestr = "PASSIVE REQUESTED";
			else
			{
				assert(gtmsourcelocal_ptr->mode != gtmsourcelocal_ptr->mode);
				modestr = "UNKNOWN";
			}
			repl_log(stderr, FALSE, TRUE, FORMAT_STR1, gtmsource_pid, "Source server", "", modestr);
			status |= SRV_ALIVE;
			num_servers++;
		} else
		{
			repl_log(stderr, FALSE, TRUE, FORMAT_STR, gtmsource_pid, "Source server", " NOT");
			gtm_putmsg_csa(CSA_ARG(NULL) VARLSTCNT(4) ERR_SRCSRVNOTEXIST, 2,
					LEN_AND_STR(gtmsourcelocal_ptr->secondary_instname));
			status |= SRV_DEAD;
		}
		if (NULL != jnlpool.gtmsource_local)
			break;
	}
	if (NULL == jnlpool.gtmsource_local)
	{	/* Compare number of servers that were found alive with the current value of the COUNT semaphore.
		 * If they are not equal, report the discrepancy.
		 */
		semval = get_sem_info(SOURCE, SRC_SERV_COUNT_SEM, SEM_INFO_VAL);
		if (-1 == semval)
		{
			save_errno = errno;
			repl_log(stderr, FALSE, TRUE,
				"Error fetching source server count semaphore value : %s\n", STRERROR(save_errno));
			status |= SRV_ERR;
		} else if (semval != num_servers)
		{
			repl_log(stderr, FALSE, FALSE,
				"Error : Expected %d source server(s) to be alive but found %d actually alive\n",
				semval, num_servers);
			repl_log(stderr, FALSE, TRUE, "Error : Check if any pid reported above is NOT a source server process\n");
			status |= SRV_ERR;
		}
	}
	/* Check that there are no regions with replication state = WAS_ON (i.e. repl_was_open). If so report that.
	 * But to determine that, we need to attach to all the database regions.
	 */
	gvinit();
	/* We use the same code dse uses to open all regions but we must make sure they are all open before proceeding. */
	all_files_open = region_init(FALSE);
	if (!all_files_open)
	{
		gtm_putmsg_csa(CSA_ARG(NULL) VARLSTCNT(1) ERR_NOTALLDBOPN);
		status |= SRV_ERR;
	} else
	{
		for (reg = gd_header->regions, region_top = gd_header->regions + gd_header->n_regions; reg < region_top; reg++)
		{
			csa = &FILE_INFO(reg)->s_addrs;
			csd = csa->hdr;
			if (REPL_WAS_ENABLED(csd))
			{
				assert(!JNL_ENABLED(csd) || REPL_ENABLED(csd));	/* || is for turning replication on concurrently */
				reg_seqno = csd->reg_seqno;
				jnlseqno = (NULL != jnlpool.jnlpool_ctl) ? jnlpool.jnlpool_ctl->jnl_seqno : MAX_SEQNO;
				sgtm_putmsg(errtxt, VARLSTCNT(8) ERR_REPLJNLCLOSED, 6, DB_LEN_STR(reg),
					&reg_seqno, &reg_seqno, &jnlseqno, &jnlseqno);
				repl_log(stderr, FALSE, TRUE, errtxt);
				status |= SRV_ERR;
			}
		}
	}
	if (jnlpool.jnlpool_ctl->freeze)
	{
		repl_log(stderr, FALSE, FALSE, "Warning: Instance Freeze is ON\n");
		repl_log(stderr, FALSE, TRUE, "   Freeze Comment: %s\n", jnlpool.jnlpool_ctl->freeze_comment);
		status |= SRV_ERR;
	}
	return (status + NORMAL_SHUTDOWN);
}
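gtmsource_checkhealth compares the number of live servers it counted against the source-server COUNT semaphore's value. Reading such a counter boils down to a semctl(GETVAL) call, sketched below; semid/semnum are caller-supplied and the function name is illustrative, though GETVAL itself is standard SysV.

#include <errno.h>
#include <stdio.h>
#include <sys/types.h>
#include <sys/ipc.h>
#include <sys/sem.h>

/* Read the current value of one semaphore in a SysV set; returns the value,
 * or -1 with errno set on failure (matching the get_sem_info usage above).
 */
static int read_sem_count(int semid, int semnum)
{
	int	semval;

	semval = semctl(semid, semnum, GETVAL);
	if (-1 == semval)
		fprintf(stderr, "semctl(GETVAL) failed: %d\n", errno);
	return semval;
}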
Exemplo n.º 28
0
void gds_rundown(void)
{
	bool			is_mm, we_are_last_user, we_are_last_writer;
	boolean_t		ipc_deleted, remove_shm, cancelled_timer, cancelled_dbsync_timer, vermismatch;
	now_t			now;	/* for GET_CUR_TIME macro */
	char			*time_ptr, time_str[CTIME_BEFORE_NL + 2]; /* for GET_CUR_TIME macro */
	gd_region		*reg;
	int			save_errno, status;
	int4			semval, ftok_semval, sopcnt, ftok_sopcnt;
	short			crash_count;
	sm_long_t		munmap_len;
	sgmnt_addrs		*csa;
	sgmnt_data_ptr_t	csd;
	struct shmid_ds		shm_buf;
	struct sembuf		sop[2], ftok_sop[2];
	uint4           	jnl_status;
	unix_db_info		*udi;
	jnl_private_control	*jpc;
	jnl_buffer_ptr_t	jbp;

	error_def(ERR_CRITSEMFAIL);
	error_def(ERR_DBCCERR);
	error_def(ERR_DBFILERR);
	error_def(ERR_DBRNDWNWRN);
	error_def(ERR_ERRCALL);
	error_def(ERR_GBLOFLOW);
	error_def(ERR_GTMASSERT);
	error_def(ERR_IPCNOTDEL);
	error_def(ERR_JNLFLUSH);
	error_def(ERR_RNDWNSEMFAIL);
	error_def(ERR_TEXT);
	error_def(ERR_WCBLOCKED);

	forced_exit = FALSE;		/* Okay, we're dying already -- let rel_crit live in peace now.
					 * If coming through a DAL, not necessarily dying. what to do then? -- nars -- 8/15/2001
					 */
	grabbed_access_sem = FALSE;
	jnl_status = 0;
	reg = gv_cur_region;			/* Local copy */

	/* Early out for cluster regions to avoid tripping the assert below.
	 * Note: this early out is consistent with VMS. It has been noted that all of the gtcm
	 * assignments to gv_cur_region should use the TP_CHANGE_REG macro. This would also
	 * avoid the assert problem and should be done eventually.
	 */
	if (dba_cm == reg->dyn.addr->acc_meth)
		return;

	udi = FILE_INFO(reg);
	csa = &udi->s_addrs;
	csd = csa->hdr;
	assert(csa == cs_addrs && csd == cs_data);
	if ((reg->open) && (dba_usr == csd->acc_meth))
	{
		change_reg();
		gvusr_rundown();
		return;
	}
	ESTABLISH(gds_rundown_ch);
	if (!reg->open)				/* Not open, no point to rundown */
	{
		if (reg->opening)		/* Died partway open, kill rest of way */
		{
			rel_crit(reg);
			mutex_cleanup(reg);
/* revisit this to handle MM properly  SMW 98/12/16
                        if (NULL != csa->nl)
                        {
                                status = shmdt((caddr_t)csa->nl);
                                if (-1 == status)
                                        send_msg(VARLSTCNT(9) ERR_DBFILERR, 2, DB_LEN_STR(reg),
                                                ERR_TEXT, 2, LEN_AND_LIT("Error during shmdt"), errno);
                        }
*/
			shmdt((caddr_t)csa->nl);
			csa->nl = NULL;
		}
		REVERT;
		return;
	}
	switch(csd->acc_meth)
	{	/* Pass mm and bg through */
	    case dba_bg:
		is_mm = FALSE;
		break;
	    case dba_mm:
		is_mm = TRUE;
		break;
	    case dba_usr:
		assert(FALSE);
	    default:
		REVERT;
		return;
	}
	/* Cancel any pending flush timer for this region by this task */
	CANCEL_DB_TIMERS(reg, cancelled_timer, cancelled_dbsync_timer);
	we_are_last_user = FALSE;
	if (!csa->persistent_freeze)
		region_freeze(reg, FALSE, FALSE, FALSE);
	assert(!csa->read_lock);
	rel_crit(reg);		/* get locks to known state */
	mutex_cleanup(reg);
	/* We need to guarantee that no one else accesses the database file header while the semid/shmid
	 * fields are reset. We already created the ftok semaphore in db_init or mu_rndwn_file and did
	 * not remove it, so just lock it. We do this in blocking mode.
	 */
	if (!ftok_sem_lock(reg, FALSE, FALSE))
		rts_error(VARLSTCNT(4) ERR_DBFILERR, 2, DB_LEN_STR(reg));
	/* For mupip_jnl_recover we already hold the database access control semaphore.
	 * We do not release it here; it is released from mur_close_files.
	 */
	if (!mupip_jnl_recover)
	{
		sop[0].sem_num = 0; sop[0].sem_op = 0;	/* Wait for 0 */
		sop[1].sem_num = 0; sop[1].sem_op = 1;	/* Lock */
		sopcnt = 2;
		sop[0].sem_flg = sop[1].sem_flg = SEM_UNDO | IPC_NOWAIT; /* Don't wait the first time thru */
		SEMOP(udi->semid, sop, sopcnt, status);
		if (-1 == status)			/* We couldn't get it in one shot -- see if we already have it */
		{
			save_errno = errno;
			/* see comment about the Linux-specific difference in behaviour of semctl() with GETPID in gds_rundown_ch() */
			if (semctl(udi->semid, 0, GETPID) == process_id)
			{
				send_msg(VARLSTCNT(5) MAKE_MSG_INFO(ERR_CRITSEMFAIL), 2,
					DB_LEN_STR(reg),
					ERR_RNDWNSEMFAIL);
				REVERT;
				return;			/* Already in rundown for this region */
			}
			if (EAGAIN != save_errno)
			{
				assert(FALSE);
				rts_error(VARLSTCNT(9) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg),
					ERR_TEXT, 2, RTS_ERROR_TEXT("gds_rundown first semop/semctl"), save_errno);
			}
			sop[0].sem_flg = sop[1].sem_flg = SEM_UNDO;	/* Try again - blocking this time */
			SEMOP(udi->semid, sop, 2, status);
			if (-1 == status)			/* We couldn't get it at all.. */
				rts_error(VARLSTCNT(5) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg), errno);
		}
	}
	grabbed_access_sem = TRUE;
	/*
	 * We now have the dbinit/rundown lock, so we are alone in this code for this region
	 * and nobody else can attach.
	 * See if we are all alone in accessing this database shared memory.
	 */
	assert(csa->ref_cnt);	/* decrement private ref_cnt before shared ref_cnt decrement. */
	csa->ref_cnt--;		/* Currently journaling logic in gds_rundown() in VMS relies on this order to detect last writer */
	assert(!csa->ref_cnt);
	--csa->nl->ref_cnt;
	if (memcmp(csa->nl->now_running, gtm_release_name, gtm_release_name_len + 1))
	{	/* VERMISMATCH condition. Possible only if DSE */
		assert(dse_running);
		vermismatch = TRUE;
	} else
		vermismatch = FALSE;
	if (-1 == shmctl(udi->shmid, IPC_STAT, &shm_buf))
	{
		save_errno = errno;
		rts_error(VARLSTCNT(9) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg),
			ERR_TEXT, 2, RTS_ERROR_TEXT("gds_rundown shmctl"), save_errno);
	} else
		we_are_last_user = (1 == shm_buf.shm_nattch) && !vermismatch;
	assert(!mupip_jnl_recover || we_are_last_user); /* recover => one user */
	if (-1 == (semval = semctl(udi->semid, 1, GETVAL)))
		rts_error(VARLSTCNT(5) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg), errno);
	we_are_last_writer = (1 == semval) && (FALSE == reg->read_only) && !vermismatch;/* There's one writer left and I am it */
	assert(!(mupip_jnl_recover && !reg->read_only) || we_are_last_writer); /* recover + R/W region => one writer */
	if (-1 == (ftok_semval = semctl(udi->ftok_semid, 1, GETVAL)))
		rts_error(VARLSTCNT(5) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg), errno);
	/* If csa->nl->donotflush_dbjnl is set, it means mupip recover/rollback was interrupted and therefore we should
	 * 	not flush shared memory contents to disk as they might be in an inconsistent state.
	 * In this case, we will go ahead and remove shared memory (without flushing the contents) in this routine.
	 * A reissue of the recover/rollback command will restore the database to a consistent state.
	 * Otherwise, if we have write access to this region, let us perform a few writing tasks.
	 */
	if (csa->nl->donotflush_dbjnl)
		csa->wbuf_dqd = 0;	/* ignore csa->wbuf_dqd status as we do not care about the cache contents */
	else if (!reg->read_only && !vermismatch)
	{	/* If we had an orphaned block and were interrupted, set wc_blocked so we can invoke wcs_recover */
		if (csa->wbuf_dqd)
		{
			grab_crit(reg);
			SET_TRACEABLE_VAR(csd->wc_blocked, TRUE);
			BG_TRACE_PRO_ANY(csa, wcb_gds_rundown);
			send_msg(VARLSTCNT(8) ERR_WCBLOCKED, 6, LEN_AND_LIT("wcb_gds_rundown"),
				process_id, &csa->ti->curr_tn, DB_LEN_STR(reg));
			csa->wbuf_dqd = 0;
			wcs_recover(reg);
			if (is_mm)
			{
				assert(FALSE);
				csd = csa->hdr;
			}
			BG_TRACE_PRO_ANY(csa, lost_block_recovery);
			rel_crit(reg);
		}
		if (JNL_ENABLED(csd) && (GTCM_GNP_SERVER_IMAGE == image_type))
			originator_prc_vec = NULL;
		/* If we are the last writing user, then everything must be flushed */
		if (we_are_last_writer)
		{	/* Time to flush out all of our buffers */
			if (is_mm)
			{
				if (csa->total_blks != csa->ti->total_blks)	/* do remap if file had been extended */
				{
					grab_crit(reg);
					wcs_mm_recover(reg);
					csd = csa->hdr;
					rel_crit(reg);
				}
				csa->nl->remove_shm = TRUE;
			}
			/* Note WCSFLU_SYNC_EPOCH ensures the epoch is synced to the journal and indirectly
			 * also ensures that the db is fsynced. We don't want to use it in the calls to
			 * wcs_flu() from t_end() and tp_tend() since we can defer it to out-of-crit there.
			 * In this case, since we are running down, we don't have any such option.
			 */
			csa->nl->remove_shm = wcs_flu(WCSFLU_FLUSH_HDR | WCSFLU_WRITE_EPOCH | WCSFLU_SYNC_EPOCH);
			/* Since we_are_last_writer, we should be guaranteed that wcs_flu() did not change csd
			 * (in the case of MM, due to a potential file extension), even if it did a grab_crit().
			 * Therefore, make sure that's true.
			 */
			assert(csd == csa->hdr);
			assert(0 == memcmp(csd->label, GDS_LABEL, GDS_LABEL_SZ - 1));
			csd->trans_hist.header_open_tn = csd->trans_hist.curr_tn;
		} else if ((cancelled_timer && (0 > csa->nl->wcs_timers)) || cancelled_dbsync_timer)
		{	/* cancelled pending db or jnl flush timers - flush database and journal buffers to disk */
			grab_crit(reg);
			/* We need to sync the epoch because the absence of an active pending flush timer implies
			 * there will be no one else to flush the dirty buffers and EPOCH to disk in a timely fashion.
			 */
			wcs_flu(WCSFLU_FLUSH_HDR | WCSFLU_WRITE_EPOCH | WCSFLU_SYNC_EPOCH);
			rel_crit(reg);
			assert((dba_mm == cs_data->acc_meth) || (csd == cs_data));
			csd = cs_data;	/* In case this is MM and wcs_flu() remapped an extended database, reset csd */
		}
		/* Do rundown journal processing after buffer flushes since they require jnl to be open */
		if (JNL_ENABLED(csd))
		{	/* the following tp_change_reg() is not needed due to the assert csa == cs_addrs at the beginning
			 * of gds_rundown(), but just to be safe. To be removed by 2002!! --- nars -- 2001/04/25.
			 */
			tp_change_reg();	/* call this because jnl_ensure_open checks cs_addrs rather than gv_cur_region */
			jpc = csa->jnl;
			jbp = jpc->jnl_buff;
			if (jbp->fsync_in_prog_latch.u.parts.latch_pid == process_id)
			{
				assert(FALSE);
				COMPSWAP_UNLOCK(&jbp->fsync_in_prog_latch, process_id, 0, LOCK_AVAILABLE, 0);
			}
			if (jbp->io_in_prog_latch.u.parts.latch_pid == process_id)
			{
				assert(FALSE);
				COMPSWAP_UNLOCK(&jbp->io_in_prog_latch, process_id, 0, LOCK_AVAILABLE, 0);
			}
			if (((NOJNL != jpc->channel) && !JNL_FILE_SWITCHED(jpc))
				|| (we_are_last_writer && (0 != csa->nl->jnl_file.u.inode)))
			{	/* We need to close the journal file cleanly if we have the latest generation journal file open
				 *	or if we are the last writer and the journal file is open in shared memory (not necessarily
				 *	by ourselves e.g. the only process that opened the journal got shot abnormally)
				 * Note: we should not infer anything from the shared memory value of csa->nl->jnl_file.u.inode
				 * 	if we are not the last writer as it can be concurrently updated.
				 */
				grab_crit(reg);
				if (JNL_ENABLED(csd))
				{
					SET_GBL_JREC_TIME; /* jnl_ensure_open/jnl_put_jrt_pini/pfin/jnl_file_close all need it */
					/* Before writing to jnlfile, adjust jgbl.gbl_jrec_time if needed to maintain time order
					 * of jnl records. This needs to be done BEFORE the jnl_ensure_open as that could write
					 * journal records (if it decides to switch to a new journal file).
					 */
					ADJUST_GBL_JREC_TIME(jgbl, jbp);
					jnl_status = jnl_ensure_open();
					if (0 == jnl_status)
					{	/* If we_are_last_writer, we would have already done a wcs_flu() which would
						 * have written an epoch record and we are guaranteed no further updates
						 * since we are the last writer. So, just close the journal.
						 * Although we assert pini_addr should be non-zero for last_writer, we
						 * play it safe in PRO and write a PINI record if not written already.
						 */
						assert(!jbp->before_images || is_mm
								|| !we_are_last_writer || 0 != jpc->pini_addr);
						if (we_are_last_writer && 0 == jpc->pini_addr)
							jnl_put_jrt_pini(csa);
						if (0 != jpc->pini_addr)
							jnl_put_jrt_pfin(csa);
						/* If not the last writer and no pending flush timer left, do jnl flush now */
						if (!we_are_last_writer && (0 > csa->nl->wcs_timers))
						{
							if (SS_NORMAL == (jnl_status = jnl_flush(reg)))
							{
								assert(jbp->freeaddr == jbp->dskaddr);
								jnl_fsync(reg, jbp->dskaddr);
								assert(jbp->fsync_dskaddr == jbp->dskaddr);
							} else
							{
								send_msg(VARLSTCNT(9) ERR_JNLFLUSH, 2, JNL_LEN_STR(csd),
									ERR_TEXT, 2,
									RTS_ERROR_TEXT("Error with journal flush in gds_rundown"),
									jnl_status);
								assert(NOJNL == jpc->channel);/* jnl file lost has been triggered */
								/* In this routine, all code that follows from here on does not
								 * assume anything about the journaling characteristics of this
								 * database so it is safe to continue execution even though
								 * journaling got closed in the middle.
								 */
							}
						}
						jnl_file_close(reg, we_are_last_writer, FALSE);
					} else
						send_msg(VARLSTCNT(6) jnl_status, 4, JNL_LEN_STR(csd), DB_LEN_STR(reg));
				}
				rel_crit(reg);
			}
		}
		if (we_are_last_writer)			/* Flush the fileheader last and harden the file to disk */
		{
			grab_crit(reg);			/* To satisfy crit requirement in fileheader_sync() */
			memset(csd->machine_name, 0, MAX_MCNAMELEN); /* clear the machine_name field */
			if (!mupip_jnl_recover && we_are_last_user)
			{	/* mupip_jnl_recover will do this after mur_close_file */
				csd->semid = INVALID_SEMID;
				csd->shmid = INVALID_SHMID;
				csd->gt_sem_ctime.ctime = 0;
				csd->gt_shm_ctime.ctime = 0;
			}
			fileheader_sync(reg);
			rel_crit(reg);
			if (FALSE == is_mm)
			{
				if (-1 == fsync(udi->fd))		/* Sync it all */
				{
					rts_error(VARLSTCNT(9) ERR_DBFILERR, 2, DB_LEN_STR(reg),
						  ERR_TEXT, 2, RTS_ERROR_TEXT("Error during file sync at close"), errno);
				}
			} else
			{	/* Now do final MM file sync before exit */
#if !defined(TARGETED_MSYNC) && !defined(NO_MSYNC)
				if (-1 == fsync(udi->fd))		/* Sync it all */
				{
					rts_error(VARLSTCNT(9) ERR_DBFILERR, 2, DB_LEN_STR(reg),
						  ERR_TEXT, 2, RTS_ERROR_TEXT("Error during file sync at close"), errno);
				}
#else
				if (-1 == msync((caddr_t)csa->db_addrs[0], (size_t)(csa->db_addrs[1] - csa->db_addrs[0]), MS_SYNC))
				{
					rts_error(VARLSTCNT(9) ERR_DBFILERR, 2, DB_LEN_STR(reg),
						  ERR_TEXT, 2, RTS_ERROR_TEXT("Error during file msync at close"), errno);
				}
#endif
			}
		}
	} /* end if (!reg->read_only && !csa->nl->donotflush_dbjnl) */
	if (reg->read_only && we_are_last_user && !mupip_jnl_recover)
	{	/* mupip_jnl_recover will do this after mur_close_file */
		db_ipcs.semid = INVALID_SEMID;
		db_ipcs.shmid = INVALID_SHMID;
		db_ipcs.gt_sem_ctime = 0;
		db_ipcs.gt_shm_ctime = 0;
		db_ipcs.fn_len = reg->dyn.addr->fname_len;
		memcpy(db_ipcs.fn, reg->dyn.addr->fname, reg->dyn.addr->fname_len);
		db_ipcs.fn[reg->dyn.addr->fname_len] = 0;
		/* Request gtmsecshr to flush; a read-only process cannot flush the file header itself. */
		if (0 != send_mesg2gtmsecshr(FLUSH_DB_IPCS_INFO, 0, (char *)NULL, 0))
			rts_error(VARLSTCNT(8) ERR_DBFILERR, 2, DB_LEN_STR(reg),
				  ERR_TEXT, 2, RTS_ERROR_TEXT("gtmsecshr failed to update database file header"));
	}
	/* Done with file now, close it */
	if (-1 == close(udi->fd))
	{
		rts_error(VARLSTCNT(9) ERR_DBFILERR, 2, DB_LEN_STR(reg),
			  ERR_TEXT, 2, LEN_AND_LIT("Error during file close"), errno);
	}
	/* Unmap storage if mm mode but only the part that is not the fileheader (so shows up in dumps) */
	if (is_mm)
	{
		munmap_len = (sm_long_t)((csa->db_addrs[1] - csa->db_addrs[0]) - ROUND_UP(SIZEOF_FILE_HDR(csa->hdr),
											 MSYNC_ADDR_INCS));
		if (munmap_len > 0)
		{
			munmap((caddr_t)(csa->db_addrs[0] + ROUND_UP(SIZEOF_FILE_HDR(csa->hdr), MSYNC_ADDR_INCS)),
			       (size_t)(munmap_len));
#ifdef DEBUG_DB64
			rel_mmseg((caddr_t)csa->db_addrs[0]);
#endif
		}
	}
	/* Detach our shared memory while still under lock so reference counts will be
	 * correct for the next process to run down this region.
	 * In the process also get the remove_shm status from node_local before detaching.
	 * If csa->nl->donotflush_dbjnl is TRUE, it means we can safely remove shared memory without compromising data
	 * 	integrity as a reissue of recover will restore the database to a consistent state.
	 */
	remove_shm = !vermismatch && (csa->nl->remove_shm || csa->nl->donotflush_dbjnl);
	status = shmdt((caddr_t)csa->nl);
	csa->nl = NULL; /* dereferencing nl after detach is not right, so we set it to NULL so that we can test before dereference*/
	if (-1 == status)
		send_msg(VARLSTCNT(9) ERR_DBFILERR, 2, DB_LEN_STR(reg), ERR_TEXT, 2, LEN_AND_LIT("Error during shmdt"), errno);
	reg->open = FALSE;

	/* If file is still not in good shape, die here and now before we get rid of our storage */
	if (csa->wbuf_dqd)
		GTMASSERT;
	ipc_deleted = FALSE;
	/* If we are the very last user, remove shared storage id and the semaphores */
	if (we_are_last_user)
	{	/* remove shared storage, only if last writer to rundown did a successful wcs_flu() */
		assert(!vermismatch);
		if (remove_shm)
		{
			ipc_deleted = TRUE;
			if (0 != shm_rmid(udi->shmid))
				rts_error(VARLSTCNT(8) ERR_DBFILERR, 2, DB_LEN_STR(reg),
					ERR_TEXT, 2, RTS_ERROR_TEXT("Unable to remove shared memory"));
		} else if (is_src_server || is_updproc)
		{
			gtm_putmsg(VARLSTCNT(6) ERR_DBRNDWNWRN, 4, DB_LEN_STR(reg), process_id, process_id);
			send_msg(VARLSTCNT(6) ERR_DBRNDWNWRN, 4, DB_LEN_STR(reg), process_id, process_id);
		} else
			send_msg(VARLSTCNT(6) ERR_DBRNDWNWRN, 4, DB_LEN_STR(reg), process_id, process_id);
		/*
		 * Don't release semaphore in case of mupip recover/rollback; since it has standalone access.
		 * It will release the semaphore in mur_close_files.
		 */
		if (!mupip_jnl_recover)
		{
			if (0 != sem_rmid(udi->semid))
				rts_error(VARLSTCNT(8) ERR_DBFILERR, 2, DB_LEN_STR(reg),
					ERR_TEXT, 2, RTS_ERROR_TEXT("Unable to remove semaphore"));
			grabbed_access_sem = FALSE;
		}
	} else
	{
		assert(!mupip_jnl_recover);
		/* If we were writing, get rid of our writer access count semaphore */
		if (!reg->read_only)
			if (0 != (save_errno = do_semop(udi->semid, 1, -1, SEM_UNDO)))
				rts_error(VARLSTCNT(9) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg),
					ERR_TEXT, 2, RTS_ERROR_TEXT("gds_rundown write semaphore release"), save_errno);
		/* Now remove the rundown lock */
		if (0 != (save_errno = do_semop(udi->semid, 0, -1, SEM_UNDO)))
			rts_error(VARLSTCNT(9) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg),
				ERR_TEXT, 2, RTS_ERROR_TEXT("gds_rundown rundown semaphore release"), save_errno);
		grabbed_access_sem = FALSE;
	}
	if (!ftok_sem_release(reg, !mupip_jnl_recover, FALSE))
		rts_error(VARLSTCNT(4) ERR_DBFILERR, 2, DB_LEN_STR(reg));
	if (!ipc_deleted)
	{
		GET_CUR_TIME;
		if (is_src_server)
			gtm_putmsg(VARLSTCNT(8) ERR_IPCNOTDEL, 6, CTIME_BEFORE_NL, time_ptr,
				LEN_AND_LIT("Source server"), REG_LEN_STR(reg));
		if (is_updproc)
			gtm_putmsg(VARLSTCNT(8) ERR_IPCNOTDEL, 6, CTIME_BEFORE_NL, time_ptr,
				LEN_AND_LIT("Update process"), REG_LEN_STR(reg));
		if (mupip_jnl_recover)
		{
			gtm_putmsg(VARLSTCNT(8) ERR_IPCNOTDEL, 6, CTIME_BEFORE_NL, time_ptr,
				LEN_AND_LIT("Mupip journal process"), REG_LEN_STR(reg));
			send_msg(VARLSTCNT(8) ERR_IPCNOTDEL, 6, CTIME_BEFORE_NL, time_ptr,
				LEN_AND_LIT("Mupip journal process"), REG_LEN_STR(reg));
		}
	}
	REVERT;
}
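/* A minimal sketch of the "last user" probe gds_rundown() relies on: shmctl(IPC_STAT)
 * reports how many processes are attached to the shared memory segment, and exactly one
 * attachment means the caller is alone. The helper below is an illustrative assumption;
 * the real code additionally requires !vermismatch and holds the access control semaphore
 * so that no new process can attach while the decision is made.
 */
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/ipc.h>
#include <sys/shm.h>

/* Returns 1 if the caller is the only process attached to shmid, 0 if others
 * are attached, -1 on shmctl() failure. */
static int we_are_last_attached(int shmid)
{
	struct shmid_ds	shm_buf;

	if (-1 == shmctl(shmid, IPC_STAT, &shm_buf))
	{
		fprintf(stderr, "shmctl(IPC_STAT) failed : %s\n", strerror(errno));
		return -1;
	}
	return (1 == shm_buf.shm_nattch) ? 1 : 0;	/* shm_nattch counts current attachments */
}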
Example #29
int4 mupip_set_file(int db_fn_len, char *db_fn)
{
	bool			got_standalone;
	boolean_t		bypass_partial_recov, need_standalone = FALSE;
	char			acc_spec[MAX_ACC_METH_LEN], ver_spec[MAX_DB_VER_LEN], exit_stat, *fn;
	unsigned short		acc_spec_len = MAX_ACC_METH_LEN, ver_spec_len = MAX_DB_VER_LEN;
	int			fd, fn_len;
	int4			status;
	int4			status1;
	int			glbl_buff_status, defer_status, rsrvd_bytes_status,
				extn_count_status, lock_space_status, disk_wait_status;
	int4			new_disk_wait, new_cache_size, new_extn_count, new_lock_space, reserved_bytes, defer_time;
	sgmnt_data_ptr_t	csd;
	tp_region		*rptr, single;
	enum db_acc_method	access, access_new;
	enum db_ver		desired_dbver;
	gd_region		*temp_cur_region;
	char			*errptr, *command = "MUPIP SET VERSION";
	int			save_errno;

	error_def(ERR_DBPREMATEOF);
	error_def(ERR_DBRDERR);
	error_def(ERR_DBRDONLY);
	error_def(ERR_INVACCMETHOD);
	error_def(ERR_MUNOACTION);
	error_def(ERR_RBWRNNOTCHG);
	error_def(ERR_WCERRNOTCHG);
	error_def(ERR_WCWRNNOTCHG);
	error_def(ERR_MMNODYNDWNGRD);

	exit_stat = EXIT_NRM;
	defer_status = cli_present("DEFER_TIME");
	if (defer_status)
		need_standalone = TRUE;
	bypass_partial_recov = cli_present("PARTIAL_RECOV_BYPASS") == CLI_PRESENT;
	if (bypass_partial_recov)
		need_standalone = TRUE;
	if (disk_wait_status = cli_present("WAIT_DISK"))
	{
		if (cli_get_int("WAIT_DISK", &new_disk_wait))
		{
			if (new_disk_wait < 0)
			{
				util_out_print("!UL negative, minimum WAIT_DISK allowed is 0.", TRUE, new_disk_wait);
				return (int4)ERR_WCWRNNOTCHG;
			}
			need_standalone = TRUE;
		} else
		{
			util_out_print("Error getting WAIT_DISK qualifier value", TRUE);
			return (int4)ERR_WCWRNNOTCHG;
		}
	}
	if (glbl_buff_status = cli_present("GLOBAL_BUFFERS"))
	{
		if (cli_get_int("GLOBAL_BUFFERS", &new_cache_size))
		{
			if (new_cache_size > WC_MAX_BUFFS)
			{
				util_out_print("!UL too large, maximum write cache buffers allowed is !UL", TRUE, new_cache_size,
						WC_MAX_BUFFS);
				return (int4)ERR_WCWRNNOTCHG;
			}
			if (new_cache_size < WC_MIN_BUFFS)
			{
				util_out_print("!UL too small, minimum cache buffers allowed is !UL", TRUE, new_cache_size,
						WC_MIN_BUFFS);
				return (int4)ERR_WCWRNNOTCHG;
			}
		} else
		{
			util_out_print("Error getting GLOBAL BUFFER qualifier value", TRUE);
			return (int4)ERR_WCWRNNOTCHG;
		}
		need_standalone = TRUE;
	}
	/* EXTENSION_COUNT does not require standalone access and hence need_standalone will not be set to TRUE for this. */
	if (extn_count_status = cli_present("EXTENSION_COUNT"))
	{
		if (cli_get_int("EXTENSION_COUNT", &new_extn_count))
		{
			if (new_extn_count > MAX_EXTN_COUNT)
			{
				util_out_print("!UL too large, maximum extension count allowed is !UL", TRUE, new_extn_count,
						MAX_EXTN_COUNT);
				return (int4)ERR_WCWRNNOTCHG;
			}
			if (new_extn_count < MIN_EXTN_COUNT)
			{
				util_out_print("!UL too small, minimum extension count allowed is !UL", TRUE, new_extn_count,
						MIN_EXTN_COUNT);
				return (int4)ERR_WCWRNNOTCHG;
			}
		} else
		{
			util_out_print("Error getting EXTENSION COUNT qualifier value", TRUE);
			return (int4)ERR_WCWRNNOTCHG;
		}
	}
	if (lock_space_status = cli_present("LOCK_SPACE"))
	{
		if (cli_get_int("LOCK_SPACE", &new_lock_space))
		{
			if (new_lock_space > MAX_LOCK_SPACE)
			{
				util_out_print("!UL too large, maximum lock space allowed is !UL", TRUE,
						new_lock_space, MAX_LOCK_SPACE);
				return (int4)ERR_WCWRNNOTCHG;
			} else if (new_lock_space < MIN_LOCK_SPACE)
			{
				util_out_print("!UL too small, minimum lock space allowed is !UL", TRUE,
						new_lock_space, MIN_LOCK_SPACE);
				return (int4)ERR_WCWRNNOTCHG;
			}
		} else
		{
			util_out_print("Error getting LOCK_SPACE qualifier value", TRUE);
			return (int4)ERR_WCWRNNOTCHG;
		}
		need_standalone = TRUE;
	}
	if (rsrvd_bytes_status = cli_present("RESERVED_BYTES"))
	{
		if (!cli_get_int("RESERVED_BYTES", &reserved_bytes))
		{
			util_out_print("Error getting RESERVED BYTES qualifier value", TRUE);
			return (int4)ERR_RBWRNNOTCHG;
		}
		need_standalone = TRUE;
	}
	if (cli_present("ACCESS_METHOD"))
	{
		cli_get_str("ACCESS_METHOD", acc_spec, &acc_spec_len);
		cli_strupper(acc_spec);
		if (0 == memcmp(acc_spec, "MM", acc_spec_len))
			access = dba_mm;
		else  if (0 == memcmp(acc_spec, "BG", acc_spec_len))
			access = dba_bg;
		else
			mupip_exit(ERR_INVACCMETHOD);
		need_standalone = TRUE;
	} else
		access = n_dba;		/* really want to keep current method, which has not yet been read */
	if (cli_present("VERSION"))
	{
		assert(!need_standalone);
		cli_get_str("VERSION", ver_spec, &ver_spec_len);
		cli_strupper(ver_spec);
		if (0 == memcmp(ver_spec, "V4", ver_spec_len))
			desired_dbver = GDSV4;
		else  if (0 == memcmp(ver_spec, "V5", ver_spec_len))
			desired_dbver = GDSV5;
		else
			GTMASSERT;		/* CLI should prevent us ever getting here */
	} else
		desired_dbver = GDSVLAST;	/* really want to keep version, which has not yet been read */
	if (region)
		rptr = grlist;
	else
	{
		rptr = &single;
		memset(&single, 0, sizeof(single));
	}

	csd = (sgmnt_data *)malloc(ROUND_UP(sizeof(sgmnt_data), DISK_BLOCK_SIZE));
	in_backup = FALSE;		/* Only want yes/no from mupfndfil, not an address */
	for (;  rptr != NULL;  rptr = rptr->fPtr)
	{
		if (region)
		{
			if (dba_usr == rptr->reg->dyn.addr->acc_meth)
			{
				util_out_print("!/Region !AD is not a GDS access type", TRUE, REG_LEN_STR(rptr->reg));
				exit_stat |= EXIT_WRN;
				continue;
			}
			if (!mupfndfil(rptr->reg, NULL))
				continue;
			fn = (char *)rptr->reg->dyn.addr->fname;
			fn_len = rptr->reg->dyn.addr->fname_len;
		} else
		{
			fn = db_fn;
			fn_len = db_fn_len;
		}
		mu_gv_cur_reg_init();
		strcpy((char *)gv_cur_region->dyn.addr->fname, fn);
		gv_cur_region->dyn.addr->fname_len = fn_len;
		if (!need_standalone)
		{
			gvcst_init(gv_cur_region);
			change_reg();	/* sets cs_addrs and cs_data */
			if (gv_cur_region->read_only)
			{
				gtm_putmsg(VARLSTCNT(4) ERR_DBRDONLY, 2, DB_LEN_STR(gv_cur_region));
				exit_stat |= EXIT_ERR;
				gds_rundown();
				mu_gv_cur_reg_free();
				continue;
			}
			grab_crit(gv_cur_region);
			status = EXIT_NRM;
			access_new = (n_dba == access ? cs_data->acc_meth : access);
							/* recalculate; n_dba is a proxy for no change */
			change_fhead_timer("FLUSH_TIME", cs_data->flush_time,
					   (dba_bg == access_new ? TIM_FLU_MOD_BG : TIM_FLU_MOD_MM),
					   FALSE);
			if (GDSVLAST != desired_dbver)
			{
				if ((dba_mm != access_new) || (GDSV4 != desired_dbver))
					status1 = desired_db_format_set(gv_cur_region, desired_dbver, command);
				else
				{
					status1 = ERR_MMNODYNDWNGRD;
					gtm_putmsg(VARLSTCNT(4) status1, 2, REG_LEN_STR(gv_cur_region));
				}
				if (SS_NORMAL != status1)
				{	/* "desired_db_format_set" would have printed appropriate error messages */
					if (ERR_MUNOACTION != status1)
					{	/* real error occurred while setting the db format. skip to next region */
						status = EXIT_ERR;
					}
				}
			}
			if (EXIT_NRM == status)
			{
				if (extn_count_status)
					cs_data->extension_size = (uint4)new_extn_count;
				wcs_flu(WCSFLU_FLUSH_HDR);
				if (extn_count_status)
					util_out_print("Database file !AD now has extension count !UL",
						TRUE, fn_len, fn, cs_data->extension_size);
				if (GDSVLAST != desired_dbver)
					util_out_print("Database file !AD now has desired DB format !AD", TRUE,
						fn_len, fn, LEN_AND_STR(gtm_dbversion_table[cs_data->desired_db_format]));
			} else
				exit_stat |= status;
			rel_crit(gv_cur_region);
			gds_rundown();
		} else
		{	/* Following part needs standalone access */
			assert(GDSVLAST == desired_dbver);
			got_standalone = mu_rndwn_file(gv_cur_region, TRUE);
			if (FALSE == got_standalone)
				return (int4)ERR_WCERRNOTCHG;
			/* we should open it (for changing) after mu_rndwn_file, since mu_rndwn_file changes the file header too */
			if (-1 == (fd = OPEN(fn, O_RDWR)))
			{
				save_errno = errno;
				errptr = (char *)STRERROR(save_errno);
				util_out_print("open : !AZ", TRUE, errptr);
				exit_stat |= EXIT_ERR;
				db_ipcs_reset(gv_cur_region, FALSE);
				mu_gv_cur_reg_free();
				continue;
			}
			LSEEKREAD(fd, 0, csd, sizeof(sgmnt_data), status);
			if (0 != status)
			{
				save_errno = errno;
				PERROR("Error reading header of file");
				errptr = (char *)STRERROR(save_errno);
				util_out_print("read : !AZ", TRUE, errptr);
				util_out_print("Error reading header of file", TRUE);
				util_out_print("Database file !AD not changed:  ", TRUE, fn_len, fn);
				if (-1 != status)
					rts_error(VARLSTCNT(4) ERR_DBRDERR, 2, fn_len, fn);
				else
					rts_error(VARLSTCNT(4) ERR_DBPREMATEOF, 2, fn_len, fn);
			}
			if (rsrvd_bytes_status)
			{
				if (reserved_bytes > MAX_RESERVE_B(csd))
				{
					util_out_print("!UL too large, maximum reserved bytes allowed is !UL for database file !AD",
							TRUE, reserved_bytes, MAX_RESERVE_B(csd), fn_len, fn);
					close(fd);
					db_ipcs_reset(gv_cur_region, FALSE);
					return (int4)ERR_RBWRNNOTCHG;
				}
				csd->reserved_bytes = reserved_bytes;
			}
			access_new = (n_dba == access ? csd->acc_meth : access);
							/* recalculate; n_dba is a proxy for no change */
			change_fhead_timer("FLUSH_TIME", csd->flush_time,
					   (dba_bg == access_new ? TIM_FLU_MOD_BG : TIM_FLU_MOD_MM),
					   FALSE);
			if ((n_dba != access) && (csd->acc_meth != access))	/* n_dba is a proxy for no change */
			{
				if (dba_mm == access)
					csd->defer_time = 1;			/* defer defaults to 1 */
				csd->acc_meth = access;
				if (0 == csd->n_bts)
				{
					csd->n_bts = WC_DEF_BUFFS;
					csd->bt_buckets = getprime(csd->n_bts);
				}
			}
			if (glbl_buff_status)
			{
				csd->n_bts = BT_FACTOR(new_cache_size);
				csd->bt_buckets = getprime(csd->n_bts);
				csd->n_wrt_per_flu = 7;
				csd->flush_trigger = FLUSH_FACTOR(csd->n_bts);
			}
			if (disk_wait_status)
				csd->wait_disk_space = new_disk_wait;
			if (extn_count_status)
				csd->extension_size = (uint4)new_extn_count;
			if (lock_space_status)
				csd->lock_space_size = (uint4)new_lock_space * OS_PAGELET_SIZE;
			if (bypass_partial_recov)
			{
				csd->file_corrupt = FALSE;
				util_out_print("Database file !AD now has partial recovery flag set to  !UL(FALSE) ",
						TRUE, fn_len, fn, csd->file_corrupt);
			}
			if (dba_mm == access_new)
			{
				if (CLI_NEGATED == defer_status)
					csd->defer_time = 0;
				else  if (CLI_PRESENT == defer_status)
				{
					if (!cli_get_num("DEFER_TIME", &defer_time))
					{
						util_out_print("Error getting DEFER_TIME qualifier value", TRUE);
						db_ipcs_reset(gv_cur_region, FALSE);
						return (int4)ERR_RBWRNNOTCHG;
					}
					if (-1 > defer_time)
					{
						util_out_print("DEFER_TIME cannot take negative values less than -1", TRUE);
						util_out_print("Database file !AD not changed", TRUE, fn_len, fn);
						exit_stat |= EXIT_WRN;
						db_ipcs_reset(gv_cur_region, FALSE);
						mu_gv_cur_reg_free();
						continue;
					}
					csd->defer_time = defer_time;
				}
				if (csd->blks_to_upgrd)
				{
					util_out_print("MM access method cannot be set if there are blocks to upgrade",	TRUE);
					util_out_print("Database file !AD not changed", TRUE, fn_len, fn);
					exit_stat |= EXIT_WRN;
					db_ipcs_reset(gv_cur_region, FALSE);
					mu_gv_cur_reg_free();
					continue;
				}
				if (GDSVCURR != csd->desired_db_format)
				{
					util_out_print("MM access method cannot be set in DB compatibility mode",
						TRUE);
					util_out_print("Database file !AD not changed", TRUE, fn_len, fn);
					exit_stat |= EXIT_WRN;
					db_ipcs_reset(gv_cur_region, FALSE);
					mu_gv_cur_reg_free();
					continue;
				}
				if (JNL_ENABLED(csd) && csd->jnl_before_image)
				{
					util_out_print("MM access method cannot be set with BEFORE image journaling", TRUE);
					util_out_print("Database file !AD not changed", TRUE, fn_len, fn);
					exit_stat |= EXIT_WRN;
					db_ipcs_reset(gv_cur_region, FALSE);
					mu_gv_cur_reg_free();
					continue;
				}
				csd->jnl_before_image = FALSE;
			} else
			{
				if (defer_status)
				{
					util_out_print("DEFER cannot be specified with BG access method.", TRUE);
					util_out_print("Database file !AD not changed", TRUE, fn_len, fn);
					exit_stat |= EXIT_WRN;
					db_ipcs_reset(gv_cur_region, FALSE);
					mu_gv_cur_reg_free();
					continue;
				}
			}
			LSEEKWRITE(fd, 0, csd, sizeof(sgmnt_data), status);
			if (0 != status)
			{
				save_errno = errno;
				errptr = (char *)STRERROR(save_errno);
				util_out_print("write : !AZ", TRUE, errptr);
				util_out_print("Error writing header of file", TRUE);
				util_out_print("Database file !AD not changed: ", TRUE, fn_len, fn);
				rts_error(VARLSTCNT(4) ERR_DBRDERR, 2, fn_len, fn);
			}
			close(fd);
			/* --------------------- report results ------------------------- */
			if (glbl_buff_status)
				util_out_print("Database file !AD now has !UL global buffers",
						TRUE, fn_len, fn, csd->n_bts);
			if (defer_status && (dba_mm == csd->acc_meth))
				util_out_print("Database file !AD now has defer_time set to !SL",
						TRUE, fn_len, fn, csd->defer_time);
			if (rsrvd_bytes_status)
				util_out_print("Database file !AD now has !UL reserved bytes",
						TRUE, fn_len, fn, csd->reserved_bytes);
			if (extn_count_status)
				util_out_print("Database file !AD now has extension count !UL",
						TRUE, fn_len, fn, csd->extension_size);
			if (lock_space_status)
				util_out_print("Database file !AD now has lock space !UL pages",
						TRUE, fn_len, fn, csd->lock_space_size/OS_PAGELET_SIZE);
			if (disk_wait_status)
				util_out_print("Database file !AD now has wait disk set to !UL seconds",
						TRUE, fn_len, fn, csd->wait_disk_space);
			db_ipcs_reset(gv_cur_region, FALSE);
		} /* end of else part if (!need_standalone) */
		mu_gv_cur_reg_free();
	}
	free(csd);
	assert(!(exit_stat & EXIT_INF));
	return (exit_stat & EXIT_ERR ? (int4)ERR_WCERRNOTCHG :
		(exit_stat & EXIT_WRN ? (int4)ERR_WCWRNNOTCHG : SS_NORMAL));
}
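/* A rough sketch of the standalone header update pattern used above, with plain
 * pread()/pwrite() standing in for the LSEEKREAD/LSEEKWRITE macros. The function and
 * parameter names here are illustrative assumptions; the real code reads a GT.M
 * sgmnt_data header and holds standalone access (mu_rndwn_file) before touching the file.
 */
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

/* Read hdr_size bytes at offset 0, let patch() edit the buffer in place, then
 * write it back. Returns 0 on success, -1 on any failure. */
static int update_file_header(int fd, size_t hdr_size, void (*patch)(void *hdr))
{
	char	*hdr;
	ssize_t	n;

	if (NULL == (hdr = malloc(hdr_size)))
		return -1;
	n = pread(fd, hdr, hdr_size, 0);
	if ((ssize_t)hdr_size != n)
	{	/* a short read means premature EOF; a negative return sets errno */
		fprintf(stderr, "header read failed : %s\n", (0 > n) ? strerror(errno) : "premature EOF");
		free(hdr);
		return -1;
	}
	patch(hdr);			/* edit fields in the buffered header */
	n = pwrite(fd, hdr, hdr_size, 0);
	free(hdr);
	return ((ssize_t)hdr_size == n) ? 0 : -1;
}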
Example #30
void dse_chng_bhead(void)
{
	block_id	blk;
	block_id	*blkid_ptr;
	sgm_info	*dummysi = NULL;
	int4		x;
	cache_rec_ptr_t	cr;
	uchar_ptr_t	bp;
	sm_uc_ptr_t	blkBase;
	blk_hdr		new_hdr;
	blk_segment	*bs1, *bs_ptr;
	cw_set_element  *cse;
	int4		blk_seg_cnt, blk_size;	/* needed for BLK_INIT,BLK_SEG and BLK_FINI macros */
	bool		ismap;
	bool		chng_blk;
	uint4		mapsize;
	uint4           jnl_status;

	error_def(ERR_DSEBLKRDFAIL);
	error_def(ERR_DSEFAIL);
	error_def(ERR_DBRDONLY);

	if (gv_cur_region->read_only)
		rts_error(VARLSTCNT(4) ERR_DBRDONLY, 2, DB_LEN_STR(gv_cur_region));
	assert(update_array);
	/* reset new block mechanism */
	update_array_ptr = update_array;
	chng_blk = FALSE;
	if (cli_present("BLOCK") == CLI_PRESENT)
	{
		if (!cli_get_hex("BLOCK",&blk))
			return;
		if (blk < 0 || blk > cs_addrs->ti->total_blks)
		{
			util_out_print("Error: invalid block number.", TRUE);
			return;
		}
		patch_curr_blk = blk;
	}
	blk_size = cs_addrs->hdr->blk_size;
	ismap = (patch_curr_blk / cs_addrs->hdr->bplmap * cs_addrs->hdr->bplmap == patch_curr_blk);
	mapsize = BM_SIZE(cs_addrs->hdr->bplmap);

	t_begin_crit (ERR_DSEFAIL);
	if (!(bp = t_qread (patch_curr_blk,&dummy_hist.h[0].cycle,&dummy_hist.h[0].cr)))
		rts_error(VARLSTCNT(1) ERR_DSEBLKRDFAIL);
	new_hdr = *(blk_hdr_ptr_t)bp;

	if (cli_present("LEVEL") == CLI_PRESENT)
	{
		if (!cli_get_num("LEVEL",&x))
		{
			t_abort(gv_cur_region, cs_addrs);
			return;
		}
		if (ismap && (unsigned char)x != LCL_MAP_LEVL)
		{
			util_out_print("Error: invalid level for a bit map block.",TRUE);
			t_abort(gv_cur_region, cs_addrs);
			return;
		}
		if (!ismap && (x < 0 || x > MAX_BT_DEPTH + 1))
		{
			util_out_print("Error: invalid level.",TRUE);
			t_abort(gv_cur_region, cs_addrs);
			return;
		}
		new_hdr.levl = (unsigned char)x;
		chng_blk = TRUE;
		if (new_hdr.bsiz < sizeof(blk_hdr))
			new_hdr.bsiz = sizeof(blk_hdr);
		if (new_hdr.bsiz > blk_size)
			new_hdr.bsiz = blk_size;
	}
	if (cli_present("BSIZ") == CLI_PRESENT)
	{
		if (!cli_get_hex("BSIZ",&x))
		{
			t_abort(gv_cur_region, cs_addrs);
			return;
		}
		if (ismap && x != mapsize)
		{
			util_out_print("Error: invalid bsiz.",TRUE);
			t_abort(gv_cur_region, cs_addrs);
			return;
		} else if (x < sizeof(blk_hdr) || x > blk_size)
		{
			util_out_print("Error: invalid bsiz.",TRUE);
			t_abort(gv_cur_region, cs_addrs);
			return;
		}
		chng_blk = TRUE;
		new_hdr.bsiz = x;
	}
	if (!chng_blk)
		t_abort(gv_cur_region, cs_addrs);
	else
	{
		BLK_INIT(bs_ptr, bs1);
		BLK_SEG(bs_ptr, bp + sizeof(new_hdr), new_hdr.bsiz - sizeof(new_hdr));
		if (!BLK_FINI(bs_ptr, bs1))
		{
			util_out_print("Error: bad block build.",TRUE);
			t_abort(gv_cur_region, cs_addrs);
			return;
		}
		t_write (patch_curr_blk, (unsigned char *)bs1, 0, 0, bp, new_hdr.levl, TRUE, FALSE);
		BUILD_AIMG_IF_JNL_ENABLED(cs_addrs, cs_data, non_tp_jfb_buff_ptr, cse);
		t_end(&dummy_hist, 0);
	}
	if (cli_present("TN") == CLI_PRESENT)
	{
		if (!cli_get_hex("TN",&x))
			return;
		t_begin_crit(ERR_DSEFAIL);
		assert(cs_addrs->ti->early_tn == cs_addrs->ti->curr_tn);
		cs_addrs->ti->early_tn++;
		blkBase = t_qread(patch_curr_blk, &dummy_hist.h[0].cycle, &dummy_hist.h[0].cr);
		if (NULL == blkBase)
		{
			rel_crit(gv_cur_region);
			util_out_print("Error: Unable to read buffer.", TRUE);
			t_abort(gv_cur_region, cs_addrs);
			return;
		}
		/* Create a null update array for a block */
		if (ismap)
		{
			BLK_ADDR(blkid_ptr, sizeof(block_id), block_id);
			*blkid_ptr = 0;
			t_write_map(patch_curr_blk, blkBase, (unsigned char *)blkid_ptr, cs_addrs->ti->curr_tn);
			cr_array_index = 0;
			block_saved = FALSE;
		} else
		{
			BLK_INIT(bs_ptr, bs1);
			BLK_SEG(bs_ptr, bp + sizeof(new_hdr), new_hdr.bsiz - sizeof(new_hdr));
			BLK_FINI(bs_ptr, bs1);
			t_write(patch_curr_blk, (unsigned char *)bs1, 0, 0, blkBase,
						((blk_hdr_ptr_t)blkBase)->levl, TRUE, FALSE);
			cr_array_index = 0;
			block_saved = FALSE;
			if (JNL_ENABLED(cs_data))
			{
				JNL_SHORT_TIME(jgbl.gbl_jrec_time);	/* needed for jnl_put_jrt_pini() and jnl_write_aimg_rec() */
				jnl_status = jnl_ensure_open();
				if (0 == jnl_status)
				{
					cse = (cw_set_element *)(&cw_set[0]);
					cse->new_buff = non_tp_jfb_buff_ptr;
					gvcst_blk_build(cse, (uchar_ptr_t)cse->new_buff, x);
					cse->done = TRUE;
					if (0 == cs_addrs->jnl->pini_addr)
						jnl_put_jrt_pini(cs_addrs);
					jnl_write_aimg_rec(cs_addrs, cse->blk, (blk_hdr_ptr_t)cse->new_buff);
				} else
					rts_error(VARLSTCNT(6) jnl_status, 4, JNL_LEN_STR(cs_data), DB_LEN_STR(gv_cur_region));
			}
		}
		/* Pass the desired tn "x" as argument to bg_update or mm_update */
		if (dba_bg == cs_addrs->hdr->acc_meth)
			bg_update(cw_set, cw_set + cw_set_depth, cs_addrs->ti->curr_tn, x, dummysi);
		else
			mm_update(cw_set, cw_set + cw_set_depth, cs_addrs->ti->curr_tn, x, dummysi);
		cs_addrs->ti->curr_tn++;
		assert(cs_addrs->ti->early_tn == cs_addrs->ti->curr_tn);
		/* the following code is analogous to that in t_end and should be maintained in a similar fashion */
		while (cr_array_index)
			cr_array[--cr_array_index]->in_cw_set = FALSE;
		rel_crit(gv_cur_region);
		if (block_saved)
			backup_buffer_flush(gv_cur_region);
		UNIX_ONLY(
			if (unhandled_stale_timer_pop)
				process_deferred_stale();
		)
		wcs_timer_start(gv_cur_region, TRUE);
	}