Example #1
/**********************************************
 Statistics of reorg for current global.
 Also update dest_blklist_ptr for next globals
***********************************************/
void reorg_finish(block_id dest_blk_id, int blks_processed, int blks_killed,
	int blks_reused, int file_extended, int lvls_reduced,
	int blks_coalesced, int blks_split, int blks_swapped)
{
	t_abort(gv_cur_region, cs_addrs);
	file_extended = cs_data->trans_hist.total_blks - file_extended;
	util_out_print("Blocks processed    : !SL ", FLUSH, blks_processed);
	util_out_print("Blocks coalesced    : !SL ", FLUSH, blks_coalesced);
	util_out_print("Blocks split        : !SL ", FLUSH, blks_split);
	util_out_print("Blocks swapped      : !SL ", FLUSH, blks_swapped);
	util_out_print("Blocks freed        : !SL ", FLUSH, blks_killed);
	util_out_print("Blocks reused       : !SL ", FLUSH, blks_reused);
	if (0 > lvls_reduced)
		util_out_print("Levels Increased    : !SL ", FLUSH, -lvls_reduced);
	else if (0 < lvls_reduced)
		util_out_print("Levels Eliminated   : !SL ", FLUSH, lvls_reduced);
	util_out_print("Blocks extended     : !SL ", FLUSH, file_extended);
	cs_addrs->reorg_last_dest = dest_blk_id;

	/* next attempt for this global will start from the beginning, if RESUME option is present */
	cs_data->reorg_restart_block = 0;
	cs_data->reorg_restart_key[0] = 0;
	cs_data->reorg_restart_key[1] = 0;
}
trans_num gvcst_bmp_mark_free(kill_set *ks)
{
	block_id		bit_map, next_bm, *updptr;
	blk_ident		*blk, *blk_top, *nextblk;
	trans_num		ctn, start_db_fmt_tn;
	unsigned int		len;
#	if defined(UNIX) && defined(DEBUG)
	unsigned int		lcl_t_tries;
#	endif
	int4			blk_prev_version;
	srch_hist		alt_hist;
	trans_num		ret_tn = 0;
	boolean_t		visit_blks;
	srch_blk_status		bmphist;
	cache_rec_ptr_t		cr;
	enum db_ver		ondsk_blkver;
	enum cdb_sc		status;
	DCL_THREADGBL_ACCESS;

	SETUP_THREADGBL_ACCESS;
	TREF(in_gvcst_bmp_mark_free) = TRUE;
	assert(inctn_bmp_mark_free_gtm == inctn_opcode || inctn_bmp_mark_free_mu_reorg == inctn_opcode);
	/* Note down the desired_db_format_tn before you start relying on cs_data->fully_upgraded.
	 * If the db is fully_upgraded, take the optimal path that does not need to read each block being freed.
	 * But in order to detect concurrent desired_db_format changes, note down the tn (when the last format change occurred)
	 * 	before the fully_upgraded check and after having noted down the database current_tn.
	 * If they are the same, then we are guaranteed no concurrent desired_db_format change occurred.
	 * If they are not, then fall through to the non-optimal path where each to-be-killed block has to be visited.
	 * The reason we need to visit every block in case desired_db_format changes is to take care of the case where
	 *	MUPIP REORG DOWNGRADE concurrently changes a block that we are about to free.
	 */
	start_db_fmt_tn = cs_data->desired_db_format_tn;
	visit_blks = (!cs_data->fully_upgraded);	/* Local evaluation */
	assert(!visit_blks || (visit_blks && dba_bg == cs_addrs->hdr->acc_meth)); /* must have blks_to_upgrd == 0 for non-BG */
	assert(!dollar_tlevel); 			/* Should NOT be in TP now */
	blk = &ks->blk[0];
	blk_top = &ks->blk[ks->used];
	if (!visit_blks)
	{	/* Database has been completely upgraded. Free all blocks in one bitmap as part of one transaction. */
		assert(cs_data->db_got_to_v5_once); /* assert all V4 fmt blocks (including RECYCLED) have space for V5 upgrade */
		inctn_detail.blknum_struct.blknum = 0; /* to indicate no adjustment to "blks_to_upgrd" necessary */
		/* If any of the mini transactions below restarts because of an online rollback, we don't want the application
		 * refresh to happen (like $ZONLNRLBK++ or rts_error(DBROLLEDBACK)). This is because, although we are currently
		 * in non-tp (dollar_tlevel = 0), we could actually be in a TP transaction and have merely faked dollar_tlevel.
		 * In such a case, we should NOT be issuing a DBROLLEDBACK error as TP transactions are supposed to just restart
		 * in case of an online rollback. So, set the global variable that gtm_onln_rlbk_clnup can check and skip doing
		 * the application refresh, but will reset the clues. The next update will see the cycle mismatch and will
		 * accordingly take the right action.
		 */
		for ( ; blk < blk_top;  blk = nextblk)
		{
			if (0 != blk->flag)
			{
				nextblk = blk + 1;
				continue;
			}
			assert(0 < blk->block);
			assert((int4)blk->block < cs_addrs->ti->total_blks);
			bit_map = ROUND_DOWN2((int)blk->block, BLKS_PER_LMAP);
			next_bm = bit_map + BLKS_PER_LMAP;
			CHECK_AND_RESET_UPDATE_ARRAY;	/* reset update_array_ptr to update_array */
			/* Scan for the next local bitmap */
			updptr = (block_id *)update_array_ptr;
			for (nextblk = blk;
				(nextblk < blk_top) && (0 == nextblk->flag) && ((block_id)nextblk->block < next_bm);
				++nextblk)
			{
				assert((block_id)nextblk->block - bit_map);
				*updptr++ = (block_id)nextblk->block - bit_map;
			}
			len = (unsigned int)((char *)nextblk - (char *)blk);
			update_array_ptr = (char *)updptr;
			alt_hist.h[0].blk_num = 0;			/* need for calls to T_END for bitmaps */
			alt_hist.h[0].blk_target = NULL;		/* need to initialize for calls to T_END */
			/* the following assumes SIZEOF(blk_ident) == SIZEOF(int) */
			assert(SIZEOF(blk_ident) == SIZEOF(int));
			*(int *)update_array_ptr = 0;
			t_begin(ERR_GVKILLFAIL, UPDTRNS_DB_UPDATED_MASK);
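			/* Retry loop: commit the free-up of all kill-set blocks under this local bitmap as one
			 * mini-transaction, restarting until t_end succeeds or a concurrent event forces a fallback.
			 */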
			for (;;)
			{
				ctn = cs_addrs->ti->curr_tn;
				/* Need a read fence before reading fields from cs_data as we are reading outside
				 * of crit and relying on this value to detect desired db format state change.
				 */
				SHM_READ_MEMORY_BARRIER;
				if (start_db_fmt_tn != cs_data->desired_db_format_tn)
				{	/* Concurrent db format change has occurred. Need to visit every block to be killed
					 * to determine its block format. Fall through to the non-optimal path below
					 */
					ret_tn = 0;
					break;
				}
				bmphist.blk_num = bit_map;
				if (NULL == (bmphist.buffaddr = t_qread(bmphist.blk_num, (sm_int_ptr_t)&bmphist.cycle,
									&bmphist.cr)))
				{
					t_retry((enum cdb_sc)rdfail_detail);
					continue;
				}
				t_write_map(&bmphist, (uchar_ptr_t)update_array, ctn, -(int4)(nextblk - blk));
				UNIX_ONLY(DEBUG_ONLY(lcl_t_tries = t_tries));
				if ((trans_num)0 == (ret_tn = t_end(&alt_hist, NULL, TN_NOT_SPECIFIED)))
				{
#					ifdef UNIX
					assert((CDB_STAGNATE == t_tries) || (lcl_t_tries == t_tries - 1));
					status = LAST_RESTART_CODE;
					if ((cdb_sc_onln_rlbk1 == status) || (cdb_sc_onln_rlbk2 == status)
						|| TREF(rlbk_during_redo_root))
					{	/* t_end restarted due to online rollback. Discard bitmap free-up and return control
						 * to the application. But, before that reset only_reset_clues_if_onln_rlbk to FALSE
						 */
						TREF(in_gvcst_bmp_mark_free) = FALSE;
						send_msg(VARLSTCNT(6) ERR_IGNBMPMRKFREE, 4, REG_LEN_STR(gv_cur_region),
								DB_LEN_STR(gv_cur_region));
						t_abort(gv_cur_region, cs_addrs);
						return ret_tn; /* actually 0 */
					}
#					endif
					continue;
				}
				break;
			}
			if (0 == ret_tn) /* db format change occurred. Fall through to below for loop to visit each block */
			{
				/* Abort any active transaction to get rid of lingering Non-TP artifacts */
				t_abort(gv_cur_region, cs_addrs);
				break;
			}
		}	/* for all blocks in the kill_set */
	}
Example #3
void	mu_swap_root(glist *gl_ptr, int *root_swap_statistic_ptr)
{
	sgmnt_data_ptr_t	csd;
	sgmnt_addrs		*csa;
	node_local_ptr_t	cnl;
	srch_hist		*dir_hist_ptr, *gvt_hist_ptr;
	gv_namehead		*save_targ;
	block_id		root_blk_id, child_blk_id, free_blk_id;
	sm_uc_ptr_t		root_blk_ptr, child_blk_ptr;
	kill_set		kill_set_list;
	trans_num		curr_tn, ret_tn;
	int			level, root_blk_lvl;
	block_id		save_root;
	boolean_t		tn_aborted;
	unsigned int		lcl_t_tries;
	enum cdb_sc		status;
	DCL_THREADGBL_ACCESS;

	SETUP_THREADGBL_ACCESS;
	assert(mu_reorg_process);
	gv_target = gl_ptr->gvt;
	gv_target->root = 0;		/* reset root so we recompute it in DO_OP_GVNAME below */
	gv_target->clue.end = 0;	/* reset clue since reorg action on later globals might have invalidated it */
	reorg_gv_target->gvname.var_name = gv_target->gvname.var_name;	/* needed by SAVE_ROOTSRCH_ENTRY_STATE */
	dir_hist_ptr = gv_target->alt_hist;
	gvt_hist_ptr = &(gv_target->hist);
	inctn_opcode = inctn_invalid_op;
	DO_OP_GVNAME(gl_ptr);
		/* sets gv_target/gv_currkey/gv_cur_region/cs_addrs/cs_data to correspond to <globalname,reg> in gl_ptr */
	csa = cs_addrs;
	cnl = csa->nl;
	csd = cs_data;	/* Be careful to keep csd up to date. With MM, cs_data can change, and
			 * dereferencing an older copy can result in a SIG-11.
			 */
	if (gv_cur_region->read_only)
		return;	/* Cannot proceed for read-only data files */
	if (0 == gv_target->root)
	{	/* Global does not exist (online rollback). No problem. */
		gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(4) ERR_GBLNOEXIST, 2, GNAME(gl_ptr).len, GNAME(gl_ptr).addr);
		return;
	}
	if (dba_mm == csd->acc_meth)
		/* return for now without doing any swapping operation because later mu_truncate
		 * is going to issue the MUTRUNCNOTBG message.
		 */
		return;
	SET_GV_ALTKEY_TO_GBLNAME_FROM_GV_CURRKEY;		/* set up gv_altkey to be just the gblname */
	/* ------------ Swap root block of global variable tree --------- */
	t_begin(ERR_MUREORGFAIL, UPDTRNS_DB_UPDATED_MASK);
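	/* Retry loop: locate the gvt root block via the directory tree and swap it with a lower-numbered
	 * free block, restarting on any concurrent change until t_end commits.
	 */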
	for (;;)
	{
		curr_tn = csa->ti->curr_tn;
		kill_set_list.used = 0;
		save_root = gv_target->root;
		gv_target->root = csa->dir_tree->root;
		gv_target->clue.end = 0;
		if (cdb_sc_normal != (status = gvcst_search(gv_altkey, dir_hist_ptr)))
		{	/* Assign directory tree path to dir_hist_ptr */
			assert(t_tries < CDB_STAGNATE);
			gv_target->root = save_root;
			t_retry(status);
			continue;
		}
		gv_target->root = save_root;
		gv_target->clue.end = 0;
		if (cdb_sc_normal != (status = gvcst_search(gv_currkey, NULL)))
		{	/* Assign global variable tree path to gvt_hist_ptr */
			assert(t_tries < CDB_STAGNATE);
			t_retry(status);
			continue;
		}
		/* We've already searched the directory tree in op_gvname/t_retry and obtained gv_target->root.
		 * Should restart with gvtrootmod2 if they don't agree. gvcst_root_search is the final arbiter.
		 * Really need that for debug info and also should assert(gv_currkey is global name).
		 */
		root_blk_lvl = gvt_hist_ptr->depth;
		assert(root_blk_lvl > 0);
		root_blk_ptr = gvt_hist_ptr->h[root_blk_lvl].buffaddr;
		root_blk_id = gvt_hist_ptr->h[root_blk_lvl].blk_num;
		assert((CDB_STAGNATE > t_tries) || (gv_target->root == gvt_hist_ptr->h[root_blk_lvl].blk_num));
		free_blk_id = swap_root_or_directory_block(0, root_blk_lvl, dir_hist_ptr, root_blk_id,
				root_blk_ptr, &kill_set_list, curr_tn);
		if (RETRY_SWAP == free_blk_id)
			continue;
		else if (ABORT_SWAP == free_blk_id)
			break;
		update_trans = UPDTRNS_DB_UPDATED_MASK;
		inctn_opcode = inctn_mu_reorg;
		assert(1 == kill_set_list.used);
		need_kip_incr = TRUE;
		if (!csa->now_crit)
			WAIT_ON_INHIBIT_KILLS(cnl, MAXWAIT2KILL);
		DEBUG_ONLY(lcl_t_tries = t_tries);
		TREF(in_mu_swap_root_state) = MUSWP_INCR_ROOT_CYCLE;
		assert(!TREF(in_gvcst_redo_root_search));
		if ((trans_num)0 == (ret_tn = t_end(gvt_hist_ptr, dir_hist_ptr, TN_NOT_SPECIFIED)))
		{
			TREF(in_mu_swap_root_state) = MUSWP_NONE;
			need_kip_incr = FALSE;
			assert(NULL == kip_csa);
			ABORT_TRANS_IF_GBL_EXIST_NOMORE(lcl_t_tries, tn_aborted);
			if (tn_aborted)
			{	/* It is not an error if the global (that once existed) doesn't exist anymore (due to ROLLBACK) */
				gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(4) ERR_GBLNOEXIST, 2, GNAME(gl_ptr).len, GNAME(gl_ptr).addr);
				return;
			}
			continue;
		}
		TREF(in_mu_swap_root_state) = MUSWP_NONE;
		/* Note that this particular process's csa->root_search_cycle is now behind cnl->root_search_cycle.
		 * This forces a cdb_sc_gvtrootmod2 restart in gvcst_bmp_mark_free below.
		 */
		assert(cnl->root_search_cycle > csa->root_search_cycle);
		gvcst_kill_sort(&kill_set_list);
		GVCST_BMP_MARK_FREE(&kill_set_list, ret_tn, inctn_mu_reorg, inctn_bmp_mark_free_mu_reorg, inctn_opcode, csa);
		DECR_KIP(csd, csa, kip_csa);
		*root_swap_statistic_ptr += 1;
		break;
	}
	/* ------------ Swap blocks in branch of directory tree --------- */
	for (level = 0; level <= MAX_BT_DEPTH; level++)
	{
		t_begin(ERR_MUREORGFAIL, UPDTRNS_DB_UPDATED_MASK);
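		/* Retry loop: swap the directory tree block at this level with a lower-numbered free block */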
		for (;;)
		{
			curr_tn = csa->ti->curr_tn;
			kill_set_list.used = 0;
			save_root = gv_target->root;
			gv_target->root = csa->dir_tree->root;
			gv_target->clue.end = 0;
			if (cdb_sc_normal != (status = gvcst_search(gv_altkey, dir_hist_ptr)))
			{	/* assign branch path of directory tree into dir_hist_ptr */
				assert(t_tries < CDB_STAGNATE);
				gv_target->root = save_root;
				t_retry(status);
				continue;
			}
			gv_target->root = save_root;
			gv_target->clue.end = 0;
			if (level >= dir_hist_ptr->depth)
			{	/* done */
				t_abort(gv_cur_region, csa);
				return;
			}
			child_blk_ptr = dir_hist_ptr->h[level].buffaddr;
			child_blk_id = dir_hist_ptr->h[level].blk_num;
			assert(csa->dir_tree->root != child_blk_id);
			free_blk_id = swap_root_or_directory_block(level + 1, level, dir_hist_ptr, child_blk_id,
					child_blk_ptr, &kill_set_list, curr_tn);
			if (level == 0)
				/* set level to 1 to mark that this kill set is for a level-0 block in the directory tree.
				 * The kill-set level later will be used in gvcst_bmp_mark_free to assign a special value to
				 * cw_set_element, which will be eventually used by t_end to write the block to snapshot
				 */
				kill_set_list.blk[kill_set_list.used - 1].level = 1;
			if (RETRY_SWAP == free_blk_id)
				continue;
			else if (ABORT_SWAP == free_blk_id)
				break;
			update_trans = UPDTRNS_DB_UPDATED_MASK;
			inctn_opcode = inctn_mu_reorg;
			assert(1 == kill_set_list.used);
			need_kip_incr = TRUE;
			if (!csa->now_crit)
				WAIT_ON_INHIBIT_KILLS(cnl, MAXWAIT2KILL);
			DEBUG_ONLY(lcl_t_tries = t_tries);
			TREF(in_mu_swap_root_state) = MUSWP_DIRECTORY_SWAP;
			if ((trans_num)0 == (ret_tn = t_end(dir_hist_ptr, NULL, TN_NOT_SPECIFIED)))
			{
				TREF(in_mu_swap_root_state) = MUSWP_NONE;
				need_kip_incr = FALSE;
				assert(NULL == kip_csa);
				continue;
			}
			TREF(in_mu_swap_root_state) = MUSWP_NONE;
			gvcst_kill_sort(&kill_set_list);
			TREF(in_mu_swap_root_state) = MUSWP_FREE_BLK;
			GVCST_BMP_MARK_FREE(&kill_set_list, ret_tn, inctn_mu_reorg, inctn_bmp_mark_free_mu_reorg,
					inctn_opcode, csa);
			TREF(in_mu_swap_root_state) = MUSWP_NONE;
			DECR_KIP(csd, csa, kip_csa);
			break;
		}
	}
	return;
}
Example #4
/* Finds a free block and adds information to update array and cw_set */
block_id swap_root_or_directory_block(int parent_blk_lvl, int child_blk_lvl, srch_hist *dir_hist_ptr, block_id child_blk_id,
		sm_uc_ptr_t child_blk_ptr, kill_set *kill_set_list, trans_num curr_tn)
{
	sgmnt_data_ptr_t	csd;
	sgmnt_addrs		*csa;
	node_local_ptr_t	cnl;
	srch_blk_status		bmlhist, freeblkhist;
	block_id		hint_blk_num, free_blk_id, parent_blk_id;
	boolean_t		free_blk_recycled;
	int4			master_bit, num_local_maps, free_bit, hint_bit, maxbitsthismap;
	uint4			total_blks;
	int			blk_seg_cnt, blk_size;
	sm_uc_ptr_t		parent_blk_ptr, bn_ptr, saved_blk;
	blk_segment		*bs1, *bs_ptr;
	int			parent_blk_size, child_blk_size, bsiz;
	int			rec_size1, curr_offset, bpntr_end, hdr_len;
	int			tmp_cmpc;
	cw_set_element		*tmpcse;
	jnl_buffer_ptr_t	jbbp; /* jbbp is non-NULL only if before-image journaling */
	unsigned short		temp_ushort;
	unsigned long		temp_long;
	unsigned char		save_cw_set_depth;
	DCL_THREADGBL_ACCESS;

	SETUP_THREADGBL_ACCESS;
	csd = cs_data;
	csa = cs_addrs;
	cnl = csa->nl;
	blk_size = csd->blk_size;
	/* Find a free/recycled block for new block location. */
	hint_blk_num = 0;
	total_blks = csa->ti->total_blks;
	num_local_maps = DIVIDE_ROUND_UP(total_blks, BLKS_PER_LMAP);
	master_bit = bmm_find_free((hint_blk_num / BLKS_PER_LMAP), csa->bmm, num_local_maps);
	if (NO_FREE_SPACE == master_bit)
	{
		t_abort(gv_cur_region, csa);
		return ABORT_SWAP;
	}
	bmlhist.blk_num = (block_id)master_bit * BLKS_PER_LMAP;
	if (NULL == (bmlhist.buffaddr = t_qread(bmlhist.blk_num, (sm_int_ptr_t)&bmlhist.cycle, &bmlhist.cr)))
	{
		assert(t_tries < CDB_STAGNATE);
		t_retry((enum cdb_sc)rdfail_detail);
		return RETRY_SWAP;
	}
	hint_bit = 0;
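	/* Every local map covers BLKS_PER_LMAP blocks except possibly the last, which covers only the
	 * remainder of the database.
	 */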
	maxbitsthismap = (master_bit != (num_local_maps - 1)) ? BLKS_PER_LMAP : total_blks - bmlhist.blk_num;
	free_bit = bm_find_blk(hint_bit, bmlhist.buffaddr + SIZEOF(blk_hdr), maxbitsthismap, &free_blk_recycled);
	free_blk_id = bmlhist.blk_num + free_bit;
	if (DIR_ROOT >= free_blk_id)
	{	/* Bitmap block 0 and directory tree root block 1 should always be marked busy. */
		assert(t_tries < CDB_STAGNATE);
		t_retry(cdb_sc_badbitmap);
		return RETRY_SWAP;
	}
	if (child_blk_id <= free_blk_id)
	{	/* stop swapping root or DT blocks once the database is truncated well enough. A good heuristic for this is to check
		 * if the block is to be swapped into a higher block number and if so do not swap
		 */
		t_abort(gv_cur_region, csa);
		return ABORT_SWAP;
	}
	/* ====== begin update array ======
	 * Four blocks get changed.
	 * 	1. Free block becomes busy and gains the contents of child (root block/directory tree block)
	 * 	2. Parent block in directory tree remains busy, but points to new root block location.
	 *	3. Free block's corresponding bitmap reflects above change.
	 * 	4. Child block gets marked recycled in bitmap. (GVCST_BMP_MARK_FREE)
	 */
	parent_blk_ptr = dir_hist_ptr->h[parent_blk_lvl].buffaddr; /* parent_blk_lvl is 0 iff we're moving a gvt root block */
	parent_blk_id = dir_hist_ptr->h[parent_blk_lvl].blk_num;
	CHECK_AND_RESET_UPDATE_ARRAY;
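	/* 1. Free block becomes busy and gains the contents of the child (root block/directory tree block). */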
	if (free_blk_recycled)
	{	/* Otherwise, it's a completely free block, in which case no need to read. */
		freeblkhist.blk_num = (block_id)free_blk_id;
		if (NULL == (freeblkhist.buffaddr = t_qread(free_blk_id, (sm_int_ptr_t)&freeblkhist.cycle, &freeblkhist.cr)))
		{
			assert(t_tries < CDB_STAGNATE);
			t_retry((enum cdb_sc)rdfail_detail);
			return RETRY_SWAP;
		}
	}
	child_blk_size = ((blk_hdr_ptr_t)child_blk_ptr)->bsiz;
	BLK_INIT(bs_ptr, bs1);
	BLK_ADDR(saved_blk, child_blk_size, unsigned char);
	memcpy(saved_blk, child_blk_ptr, child_blk_size);
	BLK_SEG(bs_ptr, saved_blk + SIZEOF(blk_hdr), child_blk_size - SIZEOF(blk_hdr));
	assert(blk_seg_cnt == child_blk_size);
	if (!BLK_FINI(bs_ptr, bs1))
	{
		assert(t_tries < CDB_STAGNATE);
		t_retry(cdb_sc_blkmod);
		return RETRY_SWAP;
	}
	tmpcse = &cw_set[cw_set_depth];
	(free_blk_recycled) ? BIT_SET_RECYCLED_AND_CLEAR_FREE(tmpcse->blk_prior_state)
			    : BIT_CLEAR_RECYCLED_AND_SET_FREE(tmpcse->blk_prior_state);
	t_create(free_blk_id, (unsigned char *)bs1, 0, 0, child_blk_lvl);
	tmpcse->mode = gds_t_acquired;
	if (!free_blk_recycled || !cs_data->db_got_to_v5_once)
		tmpcse->old_block = NULL;
	else
	{
		tmpcse->old_block = freeblkhist.buffaddr;
		tmpcse->cr = freeblkhist.cr;
		tmpcse->cycle = freeblkhist.cycle;
		jbbp = (JNL_ENABLED(csa) && csa->jnl_before_image) ? csa->jnl->jnl_buff : NULL;
		if ((NULL != jbbp) && (((blk_hdr_ptr_t)tmpcse->old_block)->tn < jbbp->epoch_tn))
		{
			bsiz = ((blk_hdr_ptr_t)(tmpcse->old_block))->bsiz;
			if (bsiz > blk_size)
			{
				assert(CDB_STAGNATE > t_tries);
				t_retry(cdb_sc_lostbmlcr);
				return RETRY_SWAP;
			}
			JNL_GET_CHECKSUM_ACQUIRED_BLK(tmpcse, csd, csa, tmpcse->old_block, bsiz);
		}
	}
	/* 2. Parent block in directory tree remains busy, but points to new child block location. */
	curr_offset = dir_hist_ptr->h[parent_blk_lvl].curr_rec.offset;
	parent_blk_size = ((blk_hdr_ptr_t)parent_blk_ptr)->bsiz;
	GET_RSIZ(rec_size1, (parent_blk_ptr + curr_offset));
	if ((parent_blk_size < rec_size1 + curr_offset) || (BSTAR_REC_SIZE > rec_size1))
	{
		assert(t_tries < CDB_STAGNATE);
		t_retry(cdb_sc_blkmod);
		return RETRY_SWAP;
	}
	BLK_INIT(bs_ptr, bs1);
	if (0 == parent_blk_lvl)
		/* There can be collation stuff in the record value after the block pointer. See gvcst_root_search. */
		hdr_len = SIZEOF(rec_hdr) + gv_altkey->end + 1 - EVAL_CMPC((rec_hdr_ptr_t)(parent_blk_ptr + curr_offset));
	else
		hdr_len = rec_size1 - SIZEOF(block_id);
	bpntr_end = curr_offset + hdr_len + SIZEOF(block_id);
	BLK_SEG(bs_ptr, parent_blk_ptr + SIZEOF(blk_hdr), curr_offset + hdr_len - SIZEOF(blk_hdr));
	BLK_ADDR(bn_ptr, SIZEOF(block_id), unsigned char);
	PUT_LONG(bn_ptr, free_blk_id);
	BLK_SEG(bs_ptr, bn_ptr, SIZEOF(block_id));
	BLK_SEG(bs_ptr, parent_blk_ptr + bpntr_end, parent_blk_size - bpntr_end);
	assert(blk_seg_cnt == parent_blk_size);
	if (!BLK_FINI(bs_ptr, bs1))
	{
		assert(t_tries < CDB_STAGNATE);
		t_retry(cdb_sc_blkmod);
		return RETRY_SWAP;
	}
	t_write(&dir_hist_ptr->h[parent_blk_lvl], (unsigned char *)bs1, 0, 0, parent_blk_lvl, FALSE, TRUE, GDS_WRITE_KILLTN);
	/* To indicate later snapshot file writing process during fast_integ not to skip writing the block to snapshot file */
	BIT_SET_DIR_TREE(cw_set[cw_set_depth-1].blk_prior_state);
	/* 3. Free block's corresponding bitmap reflects above change. */
	PUT_LONG(update_array_ptr, free_bit);
	save_cw_set_depth = cw_set_depth; /* Bit maps go on end of cw_set (more fake acquired) */
	assert(!cw_map_depth);
	t_write_map(&bmlhist, (uchar_ptr_t)update_array_ptr, curr_tn, 1);
	cw_map_depth = cw_set_depth;
	cw_set_depth = save_cw_set_depth;
	update_array_ptr += SIZEOF(block_id);
	temp_long = 0;
	PUT_LONG(update_array_ptr, temp_long);
	update_array_ptr += SIZEOF(block_id);
	assert(1 == cw_set[cw_map_depth - 1].reference_cnt);
	/* 4. Child block gets marked recycled in bitmap. (GVCST_BMP_MARK_FREE) */
	kill_set_list->blk[kill_set_list->used].flag = 0;
	kill_set_list->blk[kill_set_list->used].level = 0;
	kill_set_list->blk[kill_set_list->used++].block = child_blk_id;
	return free_blk_id;
}
Example #5
boolean_t mu_truncate(int4 truncate_percent)
{
	sgmnt_addrs		*csa;
	sgmnt_data_ptr_t 	csd;
	int			num_local_maps;
	int 			lmap_num, lmap_blk_num;
	int			bml_status, sigkill;
	int			save_errno;
	int			ftrunc_status;
	uint4			jnl_status;
	uint4			old_total, new_total;
	uint4			old_free, new_free;
	uint4			end_blocks;
	int4			blks_in_lmap, blk;
	gtm_uint64_t		before_trunc_file_size;
	off_t			trunc_file_size;
	off_t			padding;
	uchar_ptr_t		lmap_addr;
	boolean_t		was_crit;
	uint4			found_busy_blk;
	srch_blk_status		bmphist;
	srch_blk_status 	*blkhist;
	srch_hist		alt_hist;
	trans_num		curr_tn;
	blk_hdr_ptr_t		lmap_blk_hdr;
	block_id		*blkid_ptr;
	unix_db_info    	*udi;
	jnl_private_control	*jpc;
	jnl_buffer_ptr_t	jbp;
	char			*err_msg;
	intrpt_state_t		prev_intrpt_state;
	off_t			offset;
	DCL_THREADGBL_ACCESS;

	SETUP_THREADGBL_ACCESS;
	csa = cs_addrs;
	csd = cs_data;
	if (dba_mm == csd->acc_meth)
	{
		gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(4) ERR_MUTRUNCNOTBG, 2, REG_LEN_STR(gv_cur_region));
		return TRUE;
	}
	if ((GDSVCURR != csd->desired_db_format) || (csd->blks_to_upgrd != 0))
	{
		gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(4) ERR_MUTRUNCNOV4, 2, REG_LEN_STR(gv_cur_region));
		return TRUE;
	}
	if (csa->ti->free_blocks < (truncate_percent * csa->ti->total_blks / 100))
	{
		gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(5) ERR_MUTRUNCNOSPACE, 3, REG_LEN_STR(gv_cur_region), truncate_percent);
		return TRUE;
	}
	/* already checked for parallel truncates on this region --- see mupip_reorg.c */
	gv_target = NULL;
	assert(csa->nl->trunc_pid == process_id);
	assert(dba_mm != csd->acc_meth);
	old_total = csa->ti->total_blks;
	old_free = csa->ti->free_blocks;
	sigkill = 0;
	found_busy_blk = 0;
	memset(&alt_hist, 0, SIZEOF(alt_hist)); /* null-initialize history */
	assert(csd->bplmap == BLKS_PER_LMAP);
	end_blocks = old_total % BLKS_PER_LMAP; /* blocks in the last lmap (first one we start scanning) */
	if (0 == end_blocks)
		end_blocks = BLKS_PER_LMAP;
	num_local_maps = DIVIDE_ROUND_UP(old_total, BLKS_PER_LMAP);
	/* ======================================== PHASE 1 ======================================== */
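	/* Scan local bitmaps from the end of the file backwards, converting RECYCLED blocks to FREE,
	 * and stop at the first bitmap at or below which a BUSY block is known to exist.
	 */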
	for (lmap_num = num_local_maps - 1; (lmap_num > 0 && !found_busy_blk); lmap_num--)
	{
		if (mu_ctrly_occurred || mu_ctrlc_occurred)
			return TRUE;
		assert(csa->ti->total_blks >= old_total); /* otherwise, a concurrent truncate happened... */
		if (csa->ti->total_blks != old_total) /* Extend (likely called by mupip extend) -- don't truncate */
		{
			gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(5) ERR_MUTRUNCNOSPACE, 3, REG_LEN_STR(gv_cur_region),
					truncate_percent);
			return TRUE;
		}
		lmap_blk_num = lmap_num * BLKS_PER_LMAP;
		if (csa->nl->highest_lbm_with_busy_blk >= lmap_blk_num)
		{
			found_busy_blk = lmap_blk_num;
			break;
		}
		blks_in_lmap = (lmap_num == num_local_maps - 1) ? end_blocks : BLKS_PER_LMAP;
		/* Loop through non-bitmap blocks of this lmap, do recycled2free */
		DBGEHND((stdout, "DBG:: lmap_num = [%lu], lmap_blk_num = [%lu], blks_in_lmap = [%lu]\n",
			lmap_num, lmap_blk_num, blks_in_lmap));
		for (blk = 1; blk < blks_in_lmap && blk != -1 && !found_busy_blk;)
		{
			t_begin(ERR_MUTRUNCFAIL, UPDTRNS_DB_UPDATED_MASK);
			for (;;) /* retry loop for recycled to free transactions */
			{
				curr_tn = csd->trans_hist.curr_tn;
				/* Read the nth local bitmap into memory */
				bmphist.blk_num = lmap_blk_num;
				bmphist.buffaddr = t_qread(bmphist.blk_num, &bmphist.cycle, &bmphist.cr);
				lmap_blk_hdr = (blk_hdr_ptr_t)bmphist.buffaddr;
				if (!(bmphist.buffaddr) || (BM_SIZE(BLKS_PER_LMAP) != lmap_blk_hdr->bsiz))
				{ /* Could not read the block successfully. Retry. */
					t_retry((enum cdb_sc)rdfail_detail);
					continue;
				}
				lmap_addr = bmphist.buffaddr + SIZEOF(blk_hdr);
				/* starting from the hint (blk itself), find the first busy or recycled block */
				blk = bml_find_busy_recycled(blk, lmap_addr, blks_in_lmap, &bml_status);
				assert(blk < BLKS_PER_LMAP);
				if (blk == -1 || blk >= blks_in_lmap)
				{ /* done with this lmap, continue to next */
					t_abort(gv_cur_region, csa);
					break;
				}
				else if (BLK_BUSY == bml_status || csa->nl->highest_lbm_with_busy_blk >= lmap_blk_num)
				{ /* stop processing blocks... skip ahead to phase 2 */
					found_busy_blk = lmap_blk_num;
					t_abort(gv_cur_region, csa);
					break;
				}
				else if (BLK_RECYCLED == bml_status)
				{ /* Write PBLK records for recycled blocks only if before_image journaling is
				   * enabled. t_end() takes care of checking if journaling is enabled and
				   * writing PBLK record. We have to at least mark the recycled block as free.
				   */
					RESET_UPDATE_ARRAY;
					update_trans = UPDTRNS_DB_UPDATED_MASK;
					*((block_id *)update_array_ptr) = blk;
					update_array_ptr += SIZEOF(block_id);
					*(int *)update_array_ptr = 0;
					alt_hist.h[1].blk_num = 0;
					alt_hist.h[0].level = 0;
					alt_hist.h[0].cse = NULL;
					alt_hist.h[0].tn = curr_tn;
					alt_hist.h[0].blk_num = lmap_blk_num + blk;
					alt_hist.h[0].buffaddr = t_qread(alt_hist.h[0].blk_num,
							&alt_hist.h[0].cycle, &alt_hist.h[0].cr);
					if (!alt_hist.h[0].buffaddr)
					{
						t_retry((enum cdb_sc)rdfail_detail);
						continue;
					}
					if (!t_recycled2free(&alt_hist.h[0]))
					{
						t_retry(cdb_sc_lostbmlcr);
						continue;
					}
					t_write_map(&bmphist, (unsigned char *)update_array, curr_tn, 0);
					/* Set the opcode for INCTN record written by t_end() */
					inctn_opcode = inctn_blkmarkfree;
					if ((trans_num)0 == t_end(&alt_hist, NULL, TN_NOT_SPECIFIED))
						continue;
					/* block processed, scan from the next one */
					blk++;
					break;
				} else
				{
					assert(t_tries < CDB_STAGNATE);
					t_retry(cdb_sc_badbitmap);
					continue;
				}
			} /* END recycled2free retry loop */
		} /* END scanning blocks of this particular lmap */
		/* Write PBLK for the bitmap block, in case it hasn't been written i.e. t_end() was never called above */
		/* Do a transaction that just increments the bitmap block's tn so that t_end() can do its thing */
		DBGEHND((stdout, "DBG:: bitmap block inctn -- lmap_blk_num = [%lu]\n", lmap_blk_num));
		t_begin(ERR_MUTRUNCFAIL, UPDTRNS_DB_UPDATED_MASK);
		for (;;)
		{
			RESET_UPDATE_ARRAY;
			BLK_ADDR(blkid_ptr, SIZEOF(block_id), block_id);
			*blkid_ptr = 0;
			update_trans = UPDTRNS_DB_UPDATED_MASK;
			inctn_opcode = inctn_mu_reorg; /* inctn_mu_truncate */
			curr_tn = csd->trans_hist.curr_tn;
			blkhist = &alt_hist.h[0];
			blkhist->blk_num = lmap_blk_num;
			blkhist->tn = curr_tn;
			blkhist->cse = NULL; /* start afresh (do not use value from previous retry) */
			/* Read the nth local bitmap into memory */
			blkhist->buffaddr = t_qread(lmap_blk_num, (sm_int_ptr_t)&blkhist->cycle, &blkhist->cr);
			lmap_blk_hdr = (blk_hdr_ptr_t)blkhist->buffaddr;
			if (!(blkhist->buffaddr) || (BM_SIZE(BLKS_PER_LMAP) != lmap_blk_hdr->bsiz))
			{ /* Could not read the block successfully. Retry. */
				t_retry((enum cdb_sc)rdfail_detail);
				continue;
			}
			t_write_map(blkhist, (unsigned char *)blkid_ptr, curr_tn, 0);
			blkhist->blk_num = 0; /* create empty history for bitmap block */
			if ((trans_num)0 == t_end(&alt_hist, NULL, TN_NOT_SPECIFIED))
				continue;
			break;
		}
	} /* END scanning lmaps */
	/* ======================================== PHASE 2 ======================================== */
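	/* With no freeze in effect and crit held, re-validate the preconditions, write the journal
	 * records, update the file header and finally ftruncate the database file.
	 */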
	assert(!csa->now_crit);
	for (;;)
	{ /* wait for FREEZE, we don't want to truncate a frozen database */
		grab_crit(gv_cur_region);
		if (FROZEN_CHILLED(cs_data))
			DO_CHILLED_AUTORELEASE(csa, cs_data);
		if (!FROZEN(cs_data) && !IS_REPL_INST_FROZEN)
			break;
		rel_crit(gv_cur_region);
		while (FROZEN(cs_data) || IS_REPL_INST_FROZEN)
		{
			hiber_start(1000);
			if (FROZEN_CHILLED(cs_data) && CHILLED_AUTORELEASE(cs_data))
				break;
		}
	}
	assert(csa->nl->trunc_pid == process_id);
	/* Flush pending updates to disk. If this is not done, old updates can be flushed AFTER ftruncate, extending the file. */
	if (!wcs_flu(WCSFLU_FLUSH_HDR | WCSFLU_WRITE_EPOCH | WCSFLU_MSYNC_DB))
	{
		assert(FALSE);
		gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(6) ERR_BUFFLUFAILED, 4, LEN_AND_LIT("MUPIP REORG TRUNCATE"),
				DB_LEN_STR(gv_cur_region));
		rel_crit(gv_cur_region);
		return FALSE;
	}
	csa->nl->highest_lbm_with_busy_blk = MAX(found_busy_blk, csa->nl->highest_lbm_with_busy_blk);
	assert(IS_BITMAP_BLK(csa->nl->highest_lbm_with_busy_blk));
	new_total = MIN(old_total, csa->nl->highest_lbm_with_busy_blk + BLKS_PER_LMAP);
	if (mu_ctrly_occurred || mu_ctrlc_occurred)
	{
		rel_crit(gv_cur_region);
		return TRUE;
	} else if (csa->ti->total_blks != old_total || new_total == old_total)
	{
		assert(csa->ti->total_blks >= old_total); /* Better have been an extend, not a truncate... */
		gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(5) ERR_MUTRUNCNOSPACE, 3, REG_LEN_STR(gv_cur_region), truncate_percent);
		rel_crit(gv_cur_region);
		return TRUE;
	} else if (GDSVCURR != csd->desired_db_format || csd->blks_to_upgrd != 0 || !csd->fully_upgraded)
	{
		gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(4) ERR_MUTRUNCNOV4, 2, REG_LEN_STR(gv_cur_region));
		rel_crit(gv_cur_region);
		return TRUE;
	} else if (SNAPSHOTS_IN_PROG(csa->nl))
	{
		gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(4) ERR_MUTRUNCSSINPROG, 2, REG_LEN_STR(gv_cur_region));
		rel_crit(gv_cur_region);
		return TRUE;
	} else if (BACKUP_NOT_IN_PROGRESS != cs_addrs->nl->nbb)
	{
		gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(4) ERR_MUTRUNCBACKINPROG, 2, REG_LEN_STR(gv_cur_region));
		rel_crit(gv_cur_region);
		return TRUE;
	}
	DEFER_INTERRUPTS(INTRPT_IN_TRUNC, prev_intrpt_state);
	if (JNL_ENABLED(csa))
	{ /* Write JRT_TRUNC and INCTN records */
		if (!jgbl.dont_reset_gbl_jrec_time)
			SET_GBL_JREC_TIME;	/* needed before jnl_ensure_open as that can write jnl records */
		jpc = csa->jnl;
		jbp = jpc->jnl_buff;
		/* Before writing to jnlfile, adjust jgbl.gbl_jrec_time if needed to maintain time order
		 * of jnl records. This needs to be done BEFORE the jnl_ensure_open as that could write
		 * journal records (if it decides to switch to a new journal file).
		 */
		ADJUST_GBL_JREC_TIME(jgbl, jbp);
		jnl_status = jnl_ensure_open(gv_cur_region, csa);
		if (SS_NORMAL != jnl_status)
			send_msg_csa(CSA_ARG(csa) VARLSTCNT(6) jnl_status, 4, JNL_LEN_STR(csd), DB_LEN_STR(gv_cur_region));
		else
		{
			if (0 == jpc->pini_addr)
				jnl_put_jrt_pini(csa);
			jnl_write_trunc_rec(csa, old_total, csa->ti->free_blocks, new_total);
			inctn_opcode = inctn_mu_reorg;
			jnl_write_inctn_rec(csa);
			jnl_status = jnl_flush(gv_cur_region);
			if (SS_NORMAL != jnl_status)
			{
				send_msg_csa(CSA_ARG(csa) VARLSTCNT(9) ERR_JNLFLUSH, 2, JNL_LEN_STR(csd),
					ERR_TEXT, 2, RTS_ERROR_TEXT("Error with journal flush during mu_truncate"),
					jnl_status);
				assert(NOJNL == jpc->channel); /* jnl file lost has been triggered */
			}
		}
	}
	/* Good to go ahead and REALLY truncate (reduce total_blks, clear cache_array, FTRUNCATE) */
	curr_tn = csa->ti->curr_tn;
	CHECK_TN(csa, csd, curr_tn);
	udi = FILE_INFO(gv_cur_region);
	/* Information used by recover_truncate to check if the file size and csa->ti->total_blks are INCONSISTENT */
	trunc_file_size = BLK_ZERO_OFF(csd->start_vbn) + ((off_t)csd->blk_size * (new_total + 1));
	csd->after_trunc_total_blks = new_total;
	csd->before_trunc_free_blocks = csa->ti->free_blocks;
	csd->before_trunc_total_blks = old_total; /* Flags interrupted truncate for recover_truncate */
	/* file size and total blocks: INCONSISTENT */
	csa->ti->total_blks = new_total;
	/* past the point of no return -- shared memory intact */
	assert(csa->ti->free_blocks >= DELTA_FREE_BLOCKS(old_total, new_total));
	csa->ti->free_blocks -= DELTA_FREE_BLOCKS(old_total, new_total);
	new_free = csa->ti->free_blocks;
	KILL_TRUNC_TEST(WBTEST_CRASH_TRUNCATE_1); /* 55 : Issue a kill -9 before 1st fsync */
	fileheader_sync(gv_cur_region);
	DB_FSYNC(gv_cur_region, udi, csa, db_fsync_in_prog, save_errno);
	CHECK_DBSYNC(gv_cur_region, save_errno);
	/* past the point of no return -- shared memory deleted */
	KILL_TRUNC_TEST(WBTEST_CRASH_TRUNCATE_2); /* 56 : Issue a kill -9 after 1st fsync */
	clear_cache_array(csa, csd, gv_cur_region, new_total, old_total);
	offset = (off_t)BLK_ZERO_OFF(csd->start_vbn) + (off_t)new_total * csd->blk_size;
	save_errno = db_write_eof_block(udi, udi->fd, csd->blk_size, offset, &(TREF(dio_buff)));
	if (0 != save_errno)
	{
		err_msg = (char *)STRERROR(save_errno);
		rts_error_csa(CSA_ARG(csa) VARLSTCNT(6) ERR_MUTRUNCERROR, 4, REG_LEN_STR(gv_cur_region), LEN_AND_STR(err_msg));
		return FALSE;
	}
	KILL_TRUNC_TEST(WBTEST_CRASH_TRUNCATE_3); /* 57 : Issue a kill -9 after reducing csa->ti->total_blks, before FTRUNCATE */
	/* Execute an ftruncate() and truncate the DB file
	 * ftruncate() is a SYSTEM CALL on almost all platforms (except SunOS)
	 * It ignores kill -9 signal till its operation is completed.
	 * So we can safely assume that the result of ftruncate() will be complete.
	 */
	FTRUNCATE(FILE_INFO(gv_cur_region)->fd, trunc_file_size, ftrunc_status);
	if (0 != ftrunc_status)
	{
		err_msg = (char *)STRERROR(errno);
		rts_error_csa(CSA_ARG(csa) VARLSTCNT(6) ERR_MUTRUNCERROR, 4, REG_LEN_STR(gv_cur_region), LEN_AND_STR(err_msg));
		/* should go through recover_truncate now, which will again try to FTRUNCATE */
		return FALSE;
	}
	/* file size and total blocks: CONSISTENT (shrunk) */
	KILL_TRUNC_TEST(WBTEST_CRASH_TRUNCATE_4); /* 58 : Issue a kill -9 after FTRUNCATE, before 2nd fsync */
	csa->nl->root_search_cycle++;	/* Force concurrent processes to restart in t_end/tp_tend to make sure no one
					 * tries to commit updates past the end of the file. Bitmap validations together
					 * with highest_lbm_with_busy_blk should actually be sufficient, so this is
					 * just to be safe.
					 */
	csd->before_trunc_total_blks = 0; /* indicate CONSISTENT */
	/* Increment TN */
	assert(csa->ti->early_tn == csa->ti->curr_tn);
	csd->trans_hist.early_tn = csd->trans_hist.curr_tn + 1;
	INCREMENT_CURR_TN(csd);
	fileheader_sync(gv_cur_region);
	DB_FSYNC(gv_cur_region, udi, csa, db_fsync_in_prog, save_errno);
	KILL_TRUNC_TEST(WBTEST_CRASH_TRUNCATE_5); /* 59 : Issue a kill -9 after 2nd fsync */
	CHECK_DBSYNC(gv_cur_region, save_errno);
	ENABLE_INTERRUPTS(INTRPT_IN_TRUNC, prev_intrpt_state);
	curr_tn = csa->ti->curr_tn;
	rel_crit(gv_cur_region);
	send_msg_csa(CSA_ARG(csa) VARLSTCNT(7) ERR_MUTRUNCSUCCESS, 5, DB_LEN_STR(gv_cur_region), old_total, new_total, &curr_tn);
	util_out_print("Truncated region: !AD. Reduced total blocks from [!UL] to [!UL]. Reduced free blocks from [!UL] to [!UL].",
					FLUSH, REG_LEN_STR(gv_cur_region), old_total, new_total, old_free, new_free);
	return TRUE;
} /* END of mu_truncate() */
Example #6
void dse_chng_bhead(void)
{
	blk_hdr			new_hdr;
	blk_segment		*bs1, *bs_ptr;
	block_id		blk;
	boolean_t		chng_blk, ismap, was_hold_onto_crit;
	int4			blk_seg_cnt, blk_size;	/* needed for BLK_INIT,BLK_SEG and BLK_FINI macros */
	int4			x;
	sgmnt_addrs		*csa;
	sgmnt_data_ptr_t	csd;
	srch_blk_status		blkhist;
	trans_num		tn;
	uint4			mapsize;

	csa = cs_addrs;
	if (gv_cur_region->read_only)
		rts_error_csa(CSA_ARG(csa) VARLSTCNT(4) ERR_DBRDONLY, 2, DB_LEN_STR(gv_cur_region));
	CHECK_AND_RESET_UPDATE_ARRAY;	/* reset update_array_ptr to update_array */
	chng_blk = FALSE;
	if (BADDSEBLK == (blk = dse_getblk("BLOCK", DSEBMLOK, DSEBLKCUR)))		/* WARNING: assignment */
		return;
	csd = csa->hdr;
	assert(csd == cs_data);
	blk_size = csd->blk_size;
	ismap = IS_BITMAP_BLK(blk);
	mapsize = BM_SIZE(csd->bplmap);
	t_begin_crit(ERR_DSEFAIL);
	blkhist.blk_num = blk;
	if (!(blkhist.buffaddr = t_qread(blkhist.blk_num, &blkhist.cycle, &blkhist.cr)))
		rts_error_csa(CSA_ARG(csa) VARLSTCNT(1) ERR_DSEBLKRDFAIL);
	new_hdr = *(blk_hdr_ptr_t)blkhist.buffaddr;
	if (CLI_PRESENT == cli_present("LEVEL"))
	{
		if (!cli_get_hex("LEVEL", (uint4 *)&x))
		{
			t_abort(gv_cur_region, csa);
			return;
		}
		if (ismap && (unsigned char)x != LCL_MAP_LEVL)
		{
			util_out_print("Error: invalid level for a bit map block.", TRUE);
			t_abort(gv_cur_region, csa);
			return;
		}
		if (!ismap && (x < 0 || x > MAX_BT_DEPTH + 1))
		{
			util_out_print("Error: invalid level.", TRUE);
			t_abort(gv_cur_region, csa);
			return;
		}
		new_hdr.levl = (unsigned char)x;
		chng_blk = TRUE;
		if (new_hdr.bsiz < SIZEOF(blk_hdr))
			new_hdr.bsiz = SIZEOF(blk_hdr);
		if (new_hdr.bsiz  > blk_size)
			new_hdr.bsiz = blk_size;
	}
	if (CLI_PRESENT == cli_present("BSIZ"))
	{
		if (!cli_get_hex("BSIZ", (uint4 *)&x))
		{
			t_abort(gv_cur_region, csa);
			return;
		}
		if (ismap && x != mapsize)
		{
			util_out_print("Error: invalid bsiz.", TRUE);
			t_abort(gv_cur_region, csa);
			return;
		} else if (x < SIZEOF(blk_hdr) || x > blk_size)
		{
			util_out_print("Error: invalid bsiz.", TRUE);
			t_abort(gv_cur_region, csa);
			return;
		}
		chng_blk = TRUE;
		new_hdr.bsiz = x;
	}
	if (!chng_blk)
		t_abort(gv_cur_region, csa);
	else
	{
		BLK_INIT(bs_ptr, bs1);
		BLK_SEG(bs_ptr, blkhist.buffaddr + SIZEOF(new_hdr), new_hdr.bsiz - SIZEOF(new_hdr));
		if (!BLK_FINI(bs_ptr, bs1))
		{
			gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(5) ERR_AIMGBLKFAIL, 3, blk, DB_LEN_STR(gv_cur_region));
			t_abort(gv_cur_region, csa);
			return;
		}
		t_write(&blkhist, (unsigned char *)bs1, 0, 0, new_hdr.levl, TRUE, FALSE, GDS_WRITE_KILLTN);
		BUILD_AIMG_IF_JNL_ENABLED(csd, csa->ti->curr_tn);
		t_end(&dummy_hist, NULL, TN_NOT_SPECIFIED);
	}
	if (CLI_PRESENT == cli_present("TN"))
	{
		if (!cli_get_hex64("TN", &tn))
			return;
		t_begin_crit(ERR_DSEFAIL);
		CHECK_TN(csa, csd, csd->trans_hist.curr_tn);	/* can issue rts_error TNTOOLARGE */
		assert(csa->ti->early_tn == csa->ti->curr_tn);
		if (NULL == (blkhist.buffaddr = t_qread(blkhist.blk_num, &blkhist.cycle, &blkhist.cr)))
		{
			gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(1) ERR_DSEBLKRDFAIL);
			t_abort(gv_cur_region, csa);
			return;
		}
		if (new_hdr.bsiz < SIZEOF(blk_hdr))
			new_hdr.bsiz = SIZEOF(blk_hdr);
		if (new_hdr.bsiz  > blk_size)
			new_hdr.bsiz = blk_size;
		BLK_INIT(bs_ptr, bs1);
		BLK_SEG(bs_ptr, blkhist.buffaddr + SIZEOF(new_hdr), new_hdr.bsiz - SIZEOF(new_hdr));
		BLK_FINI(bs_ptr, bs1);
		t_write(&blkhist, (unsigned char *)bs1, 0, 0,
			((blk_hdr_ptr_t)blkhist.buffaddr)->levl, TRUE, FALSE, GDS_WRITE_KILLTN);
		/* Pass the desired tn as argument to bg_update/mm_update below */
		BUILD_AIMG_IF_JNL_ENABLED_AND_T_END_WITH_EFFECTIVE_TN(csa, csd, tn, &dummy_hist);
	}
	return;
}
Example #7
void preemptive_db_clnup(int preemptive_severe)
{
	sgmnt_addrs	*csa;
	sgm_info	*si;
	gd_region	*r_top, *reg;
	gd_addr		*addr_ptr;
	DCL_THREADGBL_ACCESS;

	SETUP_THREADGBL_ACCESS;
	/* Clear "inctn_opcode" global variable now that any in-progress transaction is aborted at this point.
	 * Not doing so would cause future calls to "t_end" to get confused and skip writing logical jnl recs
	 * and instead incorrectly write an INCTN record (GTM-8425).
	 */
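	/* Restore dollar_tlevel if the bitmap-free code path temporarily cleared it (saving the value in
	 * bml_save_dollar_tlevel), so the TP/non-TP cleanup below sees the real transaction state.
	 */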
	if (bml_save_dollar_tlevel)
	{
		assert(!dollar_tlevel);
		dollar_tlevel = bml_save_dollar_tlevel;
		bml_save_dollar_tlevel = 0;
	}
	assert(!dollar_tlevel || (inctn_invalid_op == inctn_opcode) || (inctn_bmp_mark_free_gtm == inctn_opcode));
	assert(dollar_tlevel || update_trans || (inctn_invalid_op == inctn_opcode));
	inctn_opcode = inctn_invalid_op;
	if (!dollar_tlevel && update_trans)
	{	/* It's possible we hit an error in the middle of an update, at which point we have
		 * a valid clue and non-NULL cse. However, this causes problems for subsequent
		 * transactions (see comment in t_begin). In particular we could end up pinning buffers
		 * unnecessarily. So clear the cse of any histories that may have been active during the update.
		 */
		CLEAR_CSE(gv_target);
		if ((NULL != gv_target) && (NULL != gv_target->gd_csa))
		{
			CLEAR_CSE(gv_target->gd_csa->dir_tree);
			GTMTRIG_ONLY(CLEAR_CSE(gv_target->gd_csa->hasht_tree));
		}
		/* Resetting this is necessary to avoid blowing an assert in t_begin that it is 0 at the start of a transaction. */
		update_trans = 0;
	}
	if (INVALID_GV_TARGET != reset_gv_target)
	{
		if (SUCCESS != preemptive_severe && INFO != preemptive_severe)
		{
			/* We know of a few cases in Unix where gv_target and gv_currkey could be out of sync at this point.
			 *   a) If we are inside trigger code which in turn does an update that does
			 *	reads of ^#t global and ends up in a restart. This restart would
			 *	in turn do a rts_error_csa(TPRETRY) which would invoke mdb_condition_handler
			 *	that would in turn invoke preemptive_db_clnup which invokes this macro.
			 *	In this tp restart case though, it is ok for gv_target and gv_currkey
			 *	to be out of sync because they are going to be reset by tp_clean_up anyways.
			 *	So skip the dbg-only in-sync check.
			 *   b) If we are in gvtr_init reading the ^#t global and detect an error (e.g. TRIGINVCHSET)
			 *	gv_target after the reset would be pointing to a regular global whereas gv_currkey
			 *	would be pointing to ^#t. It is ok to be out-of-sync since in this case, we expect
			 *	mdb_condition_handler to be calling us. That has code to reset gv_currkey (and
			 *	cs_addrs/cs_data etc.) to reflect gv_target (i.e. get them back in sync).
			 * Therefore in Unix we pass SKIP_GVT_GVKEY_CHECK to skip the gvtarget/gvcurrkey out-of-sync check
			 * in RESET_GV_TARGET. In VMS we pass DO_GVT_GVKEY_CHECK as we dont yet know of an out-of-sync situation.
			 */
			RESET_GV_TARGET(UNIX_ONLY(SKIP_GVT_GVKEY_CHECK) VMS_ONLY(DO_GVT_GVKEY_CHECK));
		}
	}
	need_kip_incr = FALSE;	/* in case we got an error in t_end (e.g. GBLOFLOW), dont want this global variable to get
				 * carried over to the next non-TP transaction that this process does (e.g. inside an error trap).
				 */
	TREF(expand_prev_key) = FALSE;	/* reset global (in case it is TRUE) so it does not get carried over to future operations */
	if (dollar_tlevel)
	{
		for (si = first_sgm_info;  si != NULL; si = si->next_sgm_info)
		{
			if (NULL != si->kip_csa)
			{
				csa = si->tp_csa;
				assert(si->tp_csa == si->kip_csa);
				PROBE_DECR_KIP(csa->hdr, csa, si->kip_csa);
			}
		}
	} else if (NULL != kip_csa && (NULL != kip_csa->hdr) && (NULL != kip_csa->nl))
		PROBE_DECR_KIP(kip_csa->hdr, kip_csa, kip_csa);
	if (IS_DSE_IMAGE)
	{	/* Release crit on any region that was obtained for the current erroring DSE operation.
		 * Take care NOT to release crits obtained by a previous CRIT -SEIZE command.
		 */
		for (addr_ptr = get_next_gdr(NULL); addr_ptr; addr_ptr = get_next_gdr(addr_ptr))
		{
			for (reg = addr_ptr->regions, r_top = reg + addr_ptr->n_regions; reg < r_top; reg++)
			{
				if (reg->open && !reg->was_open)
				{
					csa = &FILE_INFO(reg)->s_addrs;
					assert(csa->hold_onto_crit || !csa->dse_crit_seize_done);
					assert(!csa->hold_onto_crit || csa->now_crit);
					if (csa->now_crit && (!csa->hold_onto_crit || !csa->dse_crit_seize_done))
					{
						rel_crit(reg);
						csa->hold_onto_crit = FALSE;
						t_abort(reg, csa);	/* cancel mini-transaction if any in progress */
					}
				}
			}
		}
	}
}
Example #8
void dse_chng_rhead(void)
{
	block_id	blk;
	sm_uc_ptr_t	bp, b_top, cp, rp;
	boolean_t	chng_rec;
	rec_hdr		new_rec;
	uint4		x;
	blk_segment	*bs1, *bs_ptr;
	int4		blk_seg_cnt, blk_size;
	srch_blk_status	blkhist;

	error_def(ERR_DBRDONLY);
	error_def(ERR_DSEBLKRDFAIL);
	error_def(ERR_DSEFAIL);

	if (gv_cur_region->read_only)
		rts_error(VARLSTCNT(4) ERR_DBRDONLY, 2, DB_LEN_STR(gv_cur_region));
	CHECK_AND_RESET_UPDATE_ARRAY;	/* reset update_array_ptr to update_array */
	if (cli_present("BLOCK") == CLI_PRESENT)
	{
		if (!cli_get_hex("BLOCK", (uint4 *)&blk))
			return;
		patch_curr_blk = blk;
	}
	if (patch_curr_blk < 0 || patch_curr_blk >= cs_addrs->ti->total_blks || !(patch_curr_blk % cs_addrs->hdr->bplmap))
	{
		util_out_print("Error: invalid block number.", TRUE);
		return;
	}

	t_begin_crit(ERR_DSEFAIL);
	blkhist.blk_num = patch_curr_blk;
	if (!(blkhist.buffaddr = t_qread(blkhist.blk_num, &blkhist.cycle, &blkhist.cr)))
		rts_error(VARLSTCNT(1) ERR_DSEBLKRDFAIL);
	bp = blkhist.buffaddr;
	blk_size = cs_addrs->hdr->blk_size;
	chng_rec = FALSE;
	if (((blk_hdr_ptr_t)bp)->bsiz > blk_size)
		b_top = bp + blk_size;		/* clamp: a corrupt bsiz must not push b_top past the block */
	else if (((blk_hdr_ptr_t)bp)->bsiz < SIZEOF(blk_hdr))
		b_top = bp + SIZEOF(blk_hdr);
	else
		b_top = bp + ((blk_hdr_ptr_t)bp)->bsiz;
	if (((blk_hdr_ptr_t)bp)->bsiz > blk_size || ((blk_hdr_ptr_t)bp)->bsiz < SIZEOF(blk_hdr))
		chng_rec = TRUE;	/* force rewrite to correct size */
	if (cli_present("RECORD") == CLI_PRESENT)
	{
		if (!(rp = skan_rnum(bp, FALSE)))
		{
			t_abort(gv_cur_region, cs_addrs);
			return;
		}
	} else if (!(rp = skan_offset(bp, FALSE)))
	{
		t_abort(gv_cur_region, cs_addrs);
		return;
	}
	GET_SHORT(new_rec.rsiz, &((rec_hdr_ptr_t)rp)->rsiz);
	new_rec.cmpc = ((rec_hdr_ptr_t)rp)->cmpc;
	if (cli_present("CMPC") == CLI_PRESENT)
	{
		if (!cli_get_hex("CMPC", &x))
		{
			t_abort(gv_cur_region, cs_addrs);
			return;
		}
		if (x > 0x7f)
		{
			util_out_print("Error: invalid cmpc.",TRUE);
			t_abort(gv_cur_region, cs_addrs);
			return;
		}
		if (x > patch_comp_count)
			util_out_print("Warning:  specified compression count is larger than the current expanded key size.", TRUE);
		new_rec.cmpc = x;
		chng_rec = TRUE;
	}
	if (cli_present("RSIZ") == CLI_PRESENT)
	{
		if (!cli_get_hex("RSIZ", &x))
		{
			t_abort(gv_cur_region, cs_addrs);
			return;
		}
		if (x < SIZEOF(rec_hdr) || x > blk_size)
		{
			util_out_print("Error: invalid rsiz.", TRUE);
			t_abort(gv_cur_region, cs_addrs);
			return;
		}
		new_rec.rsiz = x;
		chng_rec = TRUE;
	}
	if (chng_rec)
	{
		BLK_INIT(bs_ptr, bs1);
		cp = bp + SIZEOF(blk_hdr);
		BLK_SEG(bs_ptr, cp, rp - cp);
		BLK_SEG(bs_ptr, (uchar_ptr_t)&new_rec, SIZEOF(rec_hdr));
		cp = rp + SIZEOF(rec_hdr);
		if (b_top - cp)
			BLK_SEG(bs_ptr, cp, b_top - cp);
		if (!BLK_FINI(bs_ptr, bs1))
		{
			util_out_print("Error: bad blk build.", TRUE);
			t_abort(gv_cur_region, cs_addrs);
			return;
		}
		t_write(&blkhist, (unsigned char *)bs1, 0, 0, ((blk_hdr_ptr_t)bp)->levl, TRUE, FALSE, GDS_WRITE_KILLTN);
		BUILD_AIMG_IF_JNL_ENABLED(cs_data, non_tp_jfb_buff_ptr, cs_addrs->ti->curr_tn);
		t_end(&dummy_hist, NULL, TN_NOT_SPECIFIED);
	}
	return;
}
Example #9
void dse_chng_bhead(void)
{
	block_id		blk;
	int4			x;
	trans_num		tn;
	cache_rec_ptr_t		cr;
	blk_hdr			new_hdr;
	blk_segment		*bs1, *bs_ptr;
	int4			blk_seg_cnt, blk_size;	/* needed for BLK_INIT,BLK_SEG and BLK_FINI macros */
	boolean_t		ismap;
	boolean_t		chng_blk;
	boolean_t		was_crit;
	boolean_t		was_hold_onto_crit;
	uint4			mapsize;
	srch_blk_status		blkhist;
	sgmnt_addrs		*csa;
	sgmnt_data_ptr_t	csd;
#	ifdef GTM_CRYPT
	int			req_enc_blk_size;
	int			crypt_status;
	blk_hdr_ptr_t		bp, save_bp, save_old_block;
#	endif

	error_def(ERR_DSEBLKRDFAIL);
	error_def(ERR_DSEFAIL);
	error_def(ERR_DBRDONLY);

	if (gv_cur_region->read_only)
		rts_error(VARLSTCNT(4) ERR_DBRDONLY, 2, DB_LEN_STR(gv_cur_region));
	CHECK_AND_RESET_UPDATE_ARRAY;	/* reset update_array_ptr to update_array */
	chng_blk = FALSE;
	csa = cs_addrs;
	if (cli_present("BLOCK") == CLI_PRESENT)
	{
		if (!cli_get_hex("BLOCK", (uint4 *)&blk))
			return;
		if (blk < 0 || blk > csa->ti->total_blks)
		{
			util_out_print("Error: invalid block number.", TRUE);
			return;
		}
		patch_curr_blk = blk;
	}
	csd = csa->hdr;
	assert(csd == cs_data);
	blk_size = csd->blk_size;
	ismap = (patch_curr_blk / csd->bplmap * csd->bplmap == patch_curr_blk);
	mapsize = BM_SIZE(csd->bplmap);

	t_begin_crit(ERR_DSEFAIL);
	blkhist.blk_num = patch_curr_blk;
	if (!(blkhist.buffaddr = t_qread(blkhist.blk_num, &blkhist.cycle, &blkhist.cr)))
		rts_error(VARLSTCNT(1) ERR_DSEBLKRDFAIL);
	new_hdr = *(blk_hdr_ptr_t)blkhist.buffaddr;

	if (cli_present("LEVEL") == CLI_PRESENT)
	{
		if (!cli_get_hex("LEVEL", (uint4 *)&x))
		{
			t_abort(gv_cur_region, csa);
			return;
		}
		if (ismap && (unsigned char)x != LCL_MAP_LEVL)
		{
			util_out_print("Error: invalid level for a bit map block.", TRUE);
			t_abort(gv_cur_region, csa);
			return;
		}
		if (!ismap && (x < 0 || x > MAX_BT_DEPTH + 1))
		{
			util_out_print("Error: invalid level.", TRUE);
			t_abort(gv_cur_region, csa);
			return;
		}
		new_hdr.levl = (unsigned char)x;
		chng_blk = TRUE;
		if (new_hdr.bsiz < SIZEOF(blk_hdr))
			new_hdr.bsiz = SIZEOF(blk_hdr);
		if (new_hdr.bsiz  > blk_size)
			new_hdr.bsiz = blk_size;
	}
	if (cli_present("BSIZ") == CLI_PRESENT)
	{
		if (!cli_get_hex("BSIZ", (uint4 *)&x))
		{
			t_abort(gv_cur_region, csa);
			return;
		}
		if (ismap && x != mapsize)
		{
			util_out_print("Error: invalid bsiz.", TRUE);
			t_abort(gv_cur_region, csa);
			return;
		} else if (x < SIZEOF(blk_hdr) || x > blk_size)
		{
			util_out_print("Error: invalid bsiz.", TRUE);
			t_abort(gv_cur_region, csa);
			return;
		}
		chng_blk = TRUE;
		new_hdr.bsiz = x;
	}
	if (!chng_blk)
		t_abort(gv_cur_region, csa);
	else
	{
		BLK_INIT(bs_ptr, bs1);
		BLK_SEG(bs_ptr, blkhist.buffaddr + SIZEOF(new_hdr), new_hdr.bsiz - SIZEOF(new_hdr));
		if (!BLK_FINI(bs_ptr, bs1))
		{
			util_out_print("Error: bad block build.", TRUE);
			t_abort(gv_cur_region, csa);
			return;
		}
		t_write(&blkhist, (unsigned char *)bs1, 0, 0, new_hdr.levl, TRUE, FALSE, GDS_WRITE_KILLTN);
		BUILD_AIMG_IF_JNL_ENABLED(csd, non_tp_jfb_buff_ptr, csa->ti->curr_tn);
		t_end(&dummy_hist, NULL, TN_NOT_SPECIFIED);
	}
	if (cli_present("TN") == CLI_PRESENT)
	{
		if (!cli_get_hex64("TN", &tn))
			return;
		was_crit = csa->now_crit;
		t_begin_crit(ERR_DSEFAIL);
		CHECK_TN(csa, csd, csd->trans_hist.curr_tn);	/* can issue rts_error TNTOOLARGE */
		assert(csa->ti->early_tn == csa->ti->curr_tn);
		if (NULL == (blkhist.buffaddr = t_qread(blkhist.blk_num, &blkhist.cycle, &blkhist.cr)))
		{
			util_out_print("Error: Unable to read buffer.", TRUE);
			t_abort(gv_cur_region, csa);
			return;
		}
		if (new_hdr.bsiz < SIZEOF(blk_hdr))
			new_hdr.bsiz = SIZEOF(blk_hdr);
		if (new_hdr.bsiz  > blk_size)
			new_hdr.bsiz = blk_size;
		BLK_INIT(bs_ptr, bs1);
		BLK_SEG(bs_ptr, blkhist.buffaddr + SIZEOF(new_hdr), new_hdr.bsiz - SIZEOF(new_hdr));
		BLK_FINI(bs_ptr, bs1);
		t_write(&blkhist, (unsigned char *)bs1, 0, 0,
			((blk_hdr_ptr_t)blkhist.buffaddr)->levl, TRUE, FALSE, GDS_WRITE_KILLTN);
		/* Pass the desired tn as argument to bg_update/mm_update below */
		BUILD_AIMG_IF_JNL_ENABLED(csd, non_tp_jfb_buff_ptr, tn);
		was_hold_onto_crit = csa->hold_onto_crit;
		csa->hold_onto_crit = TRUE;
		t_end(&dummy_hist, NULL, tn);
#		ifdef GTM_CRYPT
		if (csd->is_encrypted && (tn < csa->ti->curr_tn))
		{	/* BG and db encryption is enabled and the DSE update caused the block-header to potentially have a tn
			 * that is LESS than what it had before. At this point, the global buffer (corresponding to blkhist.blk_num)
			 * reflects the contents of the block AFTER the dse update (bg_update would have touched this) whereas
			 * the corresponding encryption global buffer reflects the contents of the block BEFORE the update.
			 * Normally wcs_wtstart takes care of propagating the tn update from the regular global buffer to the
			 * corresponding encryption buffer. But if before it gets a chance, let us say a process goes to t_end
			 * as part of a subsequent transaction and updates this same block. Since the  blk-hdr-tn potentially
			 * decreased, it is possible that the PBLK writing check (comparing blk-hdr-tn with the epoch_tn) decides
			 * to write a PBLK for this block (even though a PBLK was already written for this block as part of a
			 * previous DSE CHANGE -BL -TN in the same epoch). In this case, since the db is encrypted, the logic
			 * will assume there were no updates to this block since the last time wcs_wtstart updated the encryption
			 * buffer and therefore use that to write the pblk, which is incorrect since it does not yet contain the
			 * tn update. The consequence of this would be writing an older before-image (PBLK) record to the
			 * journal file. To prevent this situation, we update the encryption buffer here (before releasing crit)
			 * using logic like that in wcs_wtstart to ensure it is in sync with the regular global buffer.
			 * Note:
			 * Although we use cw_set[0] to access the global buffer corresponding to the block number being updated,
			 * cw_set_depth at this point is 0 because t_end resets it. This is considered safe since cw_set is a
			 * static array (as opposed to malloc'ed memory) and hence is always available and valid until it gets
			 * overwritten by subsequent updates.
			 */
			bp = (blk_hdr_ptr_t)GDS_ANY_REL2ABS(csa, cw_set[0].cr->buffaddr);
			DBG_ENSURE_PTR_IS_VALID_GLOBUFF(csa, csd, (sm_uc_ptr_t)bp);
			save_bp = (blk_hdr_ptr_t)GDS_ANY_ENCRYPTGLOBUF(bp, csa);
			DBG_ENSURE_PTR_IS_VALID_ENCTWINGLOBUFF(csa, csd, (sm_uc_ptr_t)save_bp);
			assert((bp->bsiz <= csd->blk_size) && (bp->bsiz >= SIZEOF(*bp)));
			req_enc_blk_size = MIN(csd->blk_size, bp->bsiz) - SIZEOF(*bp);
			if (BLK_NEEDS_ENCRYPTION(bp->levl, req_enc_blk_size))
			{
				ASSERT_ENCRYPTION_INITIALIZED;
				memcpy(save_bp, bp, SIZEOF(blk_hdr));
				GTMCRYPT_ENCODE_FAST(csa->encr_key_handle, (char *)(bp + 1), req_enc_blk_size,
					(char *)(save_bp + 1), crypt_status);
				if (0 != crypt_status)
					GC_GTM_PUTMSG(crypt_status, gv_cur_region->dyn.addr->fname);
			} else
				memcpy(save_bp, bp, bp->bsiz);
		}
#		endif
		if (!was_hold_onto_crit)
			csa->hold_onto_crit = FALSE;
		if (!was_crit)
			rel_crit(gv_cur_region);
		if (unhandled_stale_timer_pop)
			process_deferred_stale();
	}
	return;
}
Example #10
void dse_rmrec(void)
{
	block_id	blk;
	blk_segment	*bs1, *bs_ptr;
	int4		blk_seg_cnt, blk_size, count;
	sm_uc_ptr_t	bp;
	uchar_ptr_t	lbp, b_top, rp, r_top, key_top, rp_base;
	char		cc, comp_key[256], cc_base;
	short int	size, i, rsize;
	cw_set_element	*cse;
	error_def(ERR_DSEFAIL);
	error_def(ERR_DSEBLKRDFAIL);
	error_def(ERR_DBRDONLY);

        if (gv_cur_region->read_only)
                rts_error(VARLSTCNT(4) ERR_DBRDONLY, 2, DB_LEN_STR(gv_cur_region));
	assert(update_array);
	/* reset new block mechanism */
	update_array_ptr = update_array;
	if (cli_present("BLOCK") == CLI_PRESENT)
	{
		if (!cli_get_hex("BLOCK", &blk))
			return;
		if (blk < 0 || blk >= cs_addrs->ti->total_blks || !(blk % cs_addrs->hdr->bplmap))
		{
			util_out_print("Error: invalid block number.", TRUE);
			return;
		}
		patch_curr_blk = blk;
	}
	if (cli_present("COUNT") == CLI_PRESENT)
	{
		if (!cli_get_hex("COUNT", &count) || count < 1)
			return;
	} else
		count = 1;
	t_begin_crit(ERR_DSEFAIL);
	blk_size = cs_addrs->hdr->blk_size;
	if (!(bp = t_qread(patch_curr_blk, &dummy_hist.h[0].cycle, &dummy_hist.h[0].cr)))
		rts_error(VARLSTCNT(1) ERR_DSEBLKRDFAIL);
	lbp = (uchar_ptr_t)malloc(blk_size);
	memcpy(lbp, bp, blk_size);

	if (((blk_hdr_ptr_t)lbp)->bsiz > cs_addrs->hdr->blk_size)
		b_top = lbp + cs_addrs->hdr->blk_size;
	else if (((blk_hdr_ptr_t)lbp)->bsiz < sizeof(blk_hdr))
		b_top = lbp + sizeof(blk_hdr);
	else
		b_top = lbp + ((blk_hdr_ptr_t)lbp)->bsiz;
	if (cli_present("RECORD") == CLI_PRESENT)
	{
		if (!(rp = rp_base = skan_rnum(lbp, FALSE)))
		{
			free(lbp);
			t_abort(gv_cur_region, cs_addrs);
			return;
		}
	} else if (!(rp = rp_base = skan_offset(lbp, FALSE)))
	{
		free(lbp);
		t_abort(gv_cur_region, cs_addrs);
		return;
	}
	memcpy(&comp_key[0], &patch_comp_key[0], sizeof(patch_comp_key));
	cc_base = patch_comp_count;
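	/* Walk the records from the target record onward, maintaining the expanded key as we go; once
	 * COUNT records have been skipped, rebuild the following record with a recomputed compression
	 * count and write the modified block.
	 */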
	for ( ; ; )
	{
		GET_SHORT(rsize, &((rec_hdr_ptr_t)rp)->rsiz);
		if (rsize < sizeof(rec_hdr))
			r_top = rp + sizeof(rec_hdr);
		else
			r_top = rp + rsize;
		if (r_top >= b_top)
		{
			if (count)
			{
				if (((blk_hdr_ptr_t)lbp)->levl)
					util_out_print("Warning:  removed a star record from the end of this block.", TRUE);
				((blk_hdr_ptr_t)lbp)->bsiz = rp_base - lbp;
				BLK_INIT(bs_ptr, bs1);
				BLK_SEG(bs_ptr, (uchar_ptr_t)lbp + sizeof(blk_hdr),
					(int)((blk_hdr_ptr_t)lbp)->bsiz - sizeof(blk_hdr));
				if (!BLK_FINI(bs_ptr, bs1))
				{
					util_out_print("Error: bad blk build.",TRUE);
					free(lbp);
					t_abort(gv_cur_region, cs_addrs);
					return;
				}
				t_write(patch_curr_blk, (unsigned char *)bs1, 0, 0, bp, ((blk_hdr_ptr_t)lbp)->levl, TRUE, FALSE);
				BUILD_AIMG_IF_JNL_ENABLED(cs_addrs, cs_data, non_tp_jfb_buff_ptr, cse);
				t_end(&dummy_hist, 0);
				free(lbp);
				return;
			}
			r_top = b_top;
		}
		if (((blk_hdr_ptr_t)lbp)->levl)
			key_top = r_top - sizeof(block_id);
		else
		{
			for (key_top = rp + sizeof(rec_hdr); key_top < r_top; )
				if (!*key_top++ && !*key_top++)
					break;
		}
		if (((rec_hdr_ptr_t)rp)->cmpc > patch_comp_count)
			cc = patch_comp_count;
		else
			cc = ((rec_hdr_ptr_t)rp)->cmpc;
		size = key_top - rp - sizeof(rec_hdr);
		if (size < 0)
			size = 0;
		else if (size > sizeof(patch_comp_key) - 2)
			size = sizeof(patch_comp_key) - 2;
		memcpy(&patch_comp_key[cc], rp + sizeof(rec_hdr), size);
		patch_comp_count = cc + size;
		if (--count >= 0)
		{
			rp = r_top;
			continue;
		}
		size = (patch_comp_count < cc_base) ? patch_comp_count : cc_base;
		for (i = 0; i < size && patch_comp_key[i] == comp_key[i]; i++)
			;
		((rec_hdr_ptr_t)rp_base)->cmpc = i;
		rsize = r_top - key_top + sizeof(rec_hdr) + patch_comp_count - i;
		PUT_SHORT(&((rec_hdr_ptr_t)rp_base)->rsiz, rsize);
		memcpy(rp_base + sizeof(rec_hdr), &patch_comp_key[i], patch_comp_count - i);
		memcpy(rp_base + sizeof(rec_hdr) + patch_comp_count - i, key_top, b_top - key_top);
		((blk_hdr_ptr_t)lbp)->bsiz = rp_base + rsize - lbp + b_top - r_top;
		BLK_INIT(bs_ptr, bs1);
		BLK_SEG(bs_ptr, (uchar_ptr_t)lbp + sizeof(blk_hdr), ((blk_hdr_ptr_t)lbp)->bsiz - sizeof(blk_hdr));
		if (!BLK_FINI(bs_ptr, bs1))
		{
			util_out_print("Error: bad blk build.", TRUE);
			free(lbp);
			t_abort(gv_cur_region, cs_addrs);
			return;
		}
		t_write(patch_curr_blk, (unsigned char *)bs1, 0, 0, bp, ((blk_hdr_ptr_t)lbp)->levl, TRUE, FALSE);
		BUILD_AIMG_IF_JNL_ENABLED(cs_addrs, cs_data, non_tp_jfb_buff_ptr, cse);
		t_end(&dummy_hist, 0);
		free(lbp);
		return;
	}
}
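/* A stand-alone sketch of the key compression scheme dse_rmrec manipulates above:
 * each record stores a compression count "cmpc" (how many leading bytes it shares
 * with the previous key) followed only by the differing suffix, so a reader keeps a
 * rolling expansion buffer, exactly the role patch_comp_key/patch_comp_count play.
 * The record layout here is a hypothetical simplification (no block header, no star
 * records), not the actual GDS record format.
 */
#include <stdio.h>
#include <string.h>

typedef struct
{
	unsigned char	cmpc;		/* bytes shared with the previous key */
	unsigned char	suffix_len;	/* bytes of key material that follow */
	const char	*suffix;	/* the differing key bytes */
} demo_rec;

int main(void)
{
	/* keys "apple", "apply", "banana" compressed against one another */
	demo_rec	recs[] = {{0, 5, "apple"}, {4, 1, "y"}, {0, 6, "banana"}};
	char		expanded[256];
	int		expanded_len = 0;
	unsigned int	i;

	for (i = 0; i < sizeof(recs) / sizeof(recs[0]); i++)
	{	/* keep the first cmpc bytes of the previous key, splice in the new suffix */
		memcpy(&expanded[recs[i].cmpc], recs[i].suffix, recs[i].suffix_len);
		expanded_len = recs[i].cmpc + recs[i].suffix_len;
		printf("record %u expands to %.*s\n", i, expanded_len, expanded);
	}
	return 0;
}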
Example #11
void dse_chng_bhead(void)
{
	block_id	blk;
	block_id	*blkid_ptr;
	sgm_info	*dummysi = NULL;
	int4		x;
	cache_rec_ptr_t	cr;
	uchar_ptr_t	bp;
	sm_uc_ptr_t	blkBase;
	blk_hdr		new_hdr;
	blk_segment	*bs1, *bs_ptr;
	cw_set_element  *cse;
	int4		blk_seg_cnt, blk_size;	/* needed for BLK_INIT,BLK_SEG and BLK_FINI macros */
	bool		ismap;
	bool		chng_blk;
	uint4		mapsize;
	uint4           jnl_status;

	error_def(ERR_DSEBLKRDFAIL);
	error_def(ERR_DSEFAIL);
	error_def(ERR_DBRDONLY);

	if (gv_cur_region->read_only)
		rts_error(VARLSTCNT(4) ERR_DBRDONLY, 2, DB_LEN_STR(gv_cur_region));
	assert(update_array);
	/* reset new block mechanism */
	update_array_ptr = update_array;
	chng_blk = FALSE;
	if (cli_present("BLOCK") == CLI_PRESENT)
	{
		if (!cli_get_hex("BLOCK",&blk))
			return;
		if (blk < 0 || blk >= cs_addrs->ti->total_blks)
		{
			util_out_print("Error: invalid block number.", TRUE);
			return;
		}
		patch_curr_blk = blk;
	}
	blk_size = cs_addrs->hdr->blk_size;
	ismap = (patch_curr_blk / cs_addrs->hdr->bplmap * cs_addrs->hdr->bplmap == patch_curr_blk);
	mapsize = BM_SIZE(cs_addrs->hdr->bplmap);

	t_begin_crit(ERR_DSEFAIL);
	if (!(bp = t_qread(patch_curr_blk, &dummy_hist.h[0].cycle, &dummy_hist.h[0].cr)))
		rts_error(VARLSTCNT(1) ERR_DSEBLKRDFAIL);
	new_hdr = *(blk_hdr_ptr_t)bp;

	if (cli_present("LEVEL") == CLI_PRESENT)
	{
		if (!cli_get_num("LEVEL",&x))
		{
			t_abort(gv_cur_region, cs_addrs);
			return;
		}
		if (ismap && (unsigned char)x != LCL_MAP_LEVL)
		{
			util_out_print("Error: invalid level for a bit map block.",TRUE);
			t_abort(gv_cur_region, cs_addrs);
			return;
		}
		if (!ismap && (x < 0 || x > MAX_BT_DEPTH + 1))
		{
			util_out_print("Error: invalid level.",TRUE);
			t_abort(gv_cur_region, cs_addrs);
			return;
		}
		new_hdr.levl = (unsigned char)x;
		chng_blk = TRUE;
		if (new_hdr.bsiz < sizeof(blk_hdr))
			new_hdr.bsiz = sizeof(blk_hdr);
		if (new_hdr.bsiz  > blk_size)
			new_hdr.bsiz = blk_size;
	}
	if (cli_present("BSIZ") == CLI_PRESENT)
	{
		if (!cli_get_hex("BSIZ",&x))
		{
			t_abort(gv_cur_region, cs_addrs);
			return;
		}
		if (ismap && x != mapsize)
		{
			util_out_print("Error: invalid bsiz.",TRUE);
			t_abort(gv_cur_region, cs_addrs);
			return;
		}
		else if (x < sizeof(blk_hdr) || x > blk_size)
		{
			util_out_print("Error: invalid bsiz.",TRUE);
			t_abort(gv_cur_region, cs_addrs);
			return;
		}
		chng_blk = TRUE;
		new_hdr.bsiz = x;
	}
	if (!chng_blk)
		t_abort(gv_cur_region, cs_addrs);
	else
	{
		BLK_INIT(bs_ptr, bs1);
		BLK_SEG(bs_ptr, bp + sizeof(new_hdr), new_hdr.bsiz - sizeof(new_hdr));
		if (!BLK_FINI(bs_ptr, bs1))
		{
			util_out_print("Error: bad block build.",TRUE);
			t_abort(gv_cur_region, cs_addrs);
			return;
		}
		t_write (patch_curr_blk, (unsigned char *)bs1, 0, 0, bp, new_hdr.levl, TRUE, FALSE);
		BUILD_AIMG_IF_JNL_ENABLED(cs_addrs, cs_data, non_tp_jfb_buff_ptr, cse);
		t_end(&dummy_hist, 0);
	}
	if (cli_present("TN") == CLI_PRESENT)
	{
		if (!cli_get_hex("TN",&x))
			return;
		t_begin_crit(ERR_DSEFAIL);
		assert(cs_addrs->ti->early_tn == cs_addrs->ti->curr_tn);
		cs_addrs->ti->early_tn++;
		blkBase = t_qread(patch_curr_blk, &dummy_hist.h[0].cycle, &dummy_hist.h[0].cr);
		if (NULL == blkBase)
		{
			rel_crit(gv_cur_region);
			util_out_print("Error: Unable to read buffer.", TRUE);
			t_abort(gv_cur_region, cs_addrs);
			return;
		}
		/* Create a null update array for a block */
		if (ismap)
		{
			BLK_ADDR(blkid_ptr, sizeof(block_id), block_id);
			*blkid_ptr = 0;
			t_write_map(patch_curr_blk, blkBase, (unsigned char *)blkid_ptr, cs_addrs->ti->curr_tn);
			cr_array_index = 0;
			block_saved = FALSE;
		} else
		{
			BLK_INIT(bs_ptr, bs1);
			BLK_SEG(bs_ptr, bp + sizeof(new_hdr), new_hdr.bsiz - sizeof(new_hdr));
			BLK_FINI(bs_ptr, bs1);
			t_write(patch_curr_blk, (unsigned char *)bs1, 0, 0, blkBase,
						((blk_hdr_ptr_t)blkBase)->levl, TRUE, FALSE);
			cr_array_index = 0;
			block_saved = FALSE;
			if (JNL_ENABLED(cs_data))
			{
				JNL_SHORT_TIME(jgbl.gbl_jrec_time);	/* needed for jnl_put_jrt_pini() and jnl_write_aimg_rec() */
				jnl_status = jnl_ensure_open();
				if (0 == jnl_status)
				{
					cse = (cw_set_element *)(&cw_set[0]);
					cse->new_buff = non_tp_jfb_buff_ptr;
					gvcst_blk_build(cse, (uchar_ptr_t)cse->new_buff, x);
					cse->done = TRUE;
					if (0 == cs_addrs->jnl->pini_addr)
						jnl_put_jrt_pini(cs_addrs);
					jnl_write_aimg_rec(cs_addrs, cse->blk, (blk_hdr_ptr_t)cse->new_buff);
				} else
					rts_error(VARLSTCNT(6) jnl_status, 4, JNL_LEN_STR(cs_data), DB_LEN_STR(gv_cur_region));
			}
		}
		/* Pass the desired tn "x" as argument to bg_update or mm_update */
		if (dba_bg == cs_addrs->hdr->acc_meth)
			bg_update(cw_set, cw_set + cw_set_depth, cs_addrs->ti->curr_tn, x, dummysi);
		else
			mm_update(cw_set, cw_set + cw_set_depth, cs_addrs->ti->curr_tn, x, dummysi);
		cs_addrs->ti->curr_tn++;
		assert(cs_addrs->ti->early_tn == cs_addrs->ti->curr_tn);
		/* the following code is analogous to that in t_end and should be maintained in a similar fashion */
		while (cr_array_index)
			cr_array[--cr_array_index]->in_cw_set = FALSE;
		rel_crit(gv_cur_region);
		if (block_saved)
			backup_buffer_flush(gv_cur_region);
		UNIX_ONLY(
			if (unhandled_stale_timer_pop)
				process_deferred_stale();
		)
		wcs_timer_start(gv_cur_region, TRUE);
	}
}
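/* A minimal stand-alone sketch of the transaction-number choreography the TN branch
 * above follows: bump early_tn while holding crit, apply the block update stamped
 * with the user-supplied tn, then bump curr_tn so the two counters agree again.
 * demo_ti and demo_apply_update are hypothetical stand-ins, not GT.M APIs.
 */
#include <assert.h>

typedef struct
{
	unsigned long long	early_tn;
	unsigned long long	curr_tn;
} demo_ti;

static void demo_apply_update(demo_ti *ti, unsigned long long desired_tn)
{	/* a real update (bg_update/mm_update) would stamp the block with desired_tn */
	(void)ti;
	(void)desired_tn;
}

static void demo_change_tn(demo_ti *ti, unsigned long long desired_tn)
{
	assert(ti->early_tn == ti->curr_tn);	/* quiescent on entry */
	ti->early_tn++;				/* announce an update is in flight */
	demo_apply_update(ti, desired_tn);
	ti->curr_tn++;				/* commit: catch curr_tn up */
	assert(ti->early_tn == ti->curr_tn);	/* quiescent again on exit */
}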
Example #12
/****************************************************************
Input Parameter:
	gn = Global name
	exclude_glist_ptr = list of globals in EXCLUDE option
	index_fill_factor = index blocks' fill factor
	data_fill_factor = data blocks' fill factor
Input/Output Parameter:
	resume = resume flag
	reorg_op = which operations to perform (coalesce, swap, and/or split) [default is all]
			[Only for debugging]
 ****************************************************************/
boolean_t mu_reorg(mval *gn, glist *exclude_glist_ptr, boolean_t *resume, int index_fill_factor, int data_fill_factor, int reorg_op)
{
	boolean_t		end_of_tree = FALSE, complete_merge, detailed_log;
	int			rec_size;
	/*
	 * "level" is the level of the working block.
	 * "pre_order_successor_level" is the level of the pre-order successor, except during
	 * a left-most descent of the tree, in which case it holds the maximum height of that
	 * subtree until the leaf level block is reached.
	 * In other words, pre_order_successor_level and level together control the iterative
	 * pre-order traversal.
	 * We start reorg at (root_level - 1) and work down to 0; that is,
	 * level = pre_order_successor_level:-1:0.
	 */
	int			pre_order_successor_level, level;
	static block_id		dest_blk_id = 0;
	int			tkeysize;
	int			blks_killed, blks_processed, blks_reused, blks_coalesced, blks_split, blks_swapped,
				count, file_extended, lvls_reduced;
	int			d_max_fill, i_max_fill, blk_size, cur_blk_size, max_fill, toler, d_toler, i_toler;
	int			cnt1, cnt2;
	kill_set		kill_set_list;
	sm_uc_ptr_t		rPtr1;
	enum cdb_sc		status;
	srch_hist		*rtsib_hist;
	jnl_buffer_ptr_t	jbp;
	trans_num		ret_tn;

	error_def(ERR_MUREORGFAIL);
	error_def(ERR_DBRDONLY);
	error_def(ERR_GBLNOEXIST);
	error_def(ERR_MAXBTLEVEL);

	t_err = ERR_MUREORGFAIL;
	kill_set_tail = &kill_set_list;
	/* Initialization for current global */
	op_gvname(VARLSTCNT(1) gn);
	/* Cannot proceed for read-only data files */
	if (gv_cur_region->read_only)
	{
		gtm_putmsg(VARLSTCNT(4) ERR_DBRDONLY, 2, DB_LEN_STR(gv_cur_region));
		return FALSE;
	}
	dest_blk_id = cs_addrs->reorg_last_dest;
	inctn_opcode = inctn_mu_reorg;

	/* If resume option is present, then reorg_restart_key should be not null.
	 * Skip all globals until we are in the region for that global.
	 * Get the reorg_restart_key and reorg_restart_block from database header and restart from there.
	 */
	if (*resume && 0 != cs_data->reorg_restart_key[0])
	{
		/* resume from last key reorged in GVT */
		GET_KEY_LEN(tkeysize, &cs_data->reorg_restart_key[0]);
		memcpy(gv_currkey->base, cs_data->reorg_restart_key, tkeysize);
		gv_currkey->end = tkeysize - 1;
		dest_blk_id = cs_data->reorg_restart_block;
		if (0 == memcmp(cs_data->reorg_restart_key, gn->str.addr, gn->str.len))
			/* We are resuming from the current global, so clear the resume flag */
			*resume = FALSE;
	} else
	{
		/* start from the left most leaf */
		memcpy(&gv_currkey->base[0], gn->str.addr, gn->str.len);
		gv_currkey->base[gn->str.len] = gv_currkey->base[gn->str.len + 1] = 0;
		gv_currkey->end = gn->str.len + 1;
	}
	if (*resume)
	{
		util_out_print("REORG cannot be resumed from this point, Skipping this global...", FLUSH);
		memcpy(&gv_currkey->base[0], gn->str.addr, gn->str.len);
		gv_currkey->base[gn->str.len] = gv_currkey->base[gn->str.len + 1] = 0;
		gv_currkey->end = gn->str.len + 1;
		return TRUE;
	}
	memcpy(&gv_currkey_next_reorg->base[0], &gv_currkey->base[0], gv_currkey->end + 1);
	gv_currkey_next_reorg->end = gv_currkey->end;
	if (2 > dest_blk_id)
		dest_blk_id = 2; /* we know that first block is bitmap and next one is directory tree root */
	file_extended = cs_data->trans_hist.total_blks;
	blk_size = cs_data->blk_size;
	d_max_fill = (double)data_fill_factor * blk_size / 100.0 - cs_data->reserved_bytes;
	i_max_fill = (double)index_fill_factor * blk_size / 100.0 - cs_data->reserved_bytes;
	d_toler = (double) DATA_FILL_TOLERANCE * blk_size / 100.0;
	i_toler = (double) INDEX_FILL_TOLERANCE * blk_size / 100.0;
	blks_killed = blks_processed = blks_reused = lvls_reduced = blks_coalesced = blks_split = blks_swapped = 0;
	pre_order_successor_level = level = MAX_BT_DEPTH + 1; /* Just some high value to initialize */

	/* --- more detailed debugging information --- */
	if (detailed_log = (reorg_op & DETAIL))
		util_out_print("STARTING to work on global ^!AD from region !AD", TRUE,
			gn->str.len, gn->str.addr, REG_LEN_STR(gv_cur_region));

	/* In each iteration of MAIN loop, a working block is processed for a GVT */
	for (; ;)	/* ================ START MAIN LOOP ================ */
	{
		/* If right sibling is completely merged with the working block, do not swap the working block
		 * with its final destination block. Continue trying next right sibling. Swap only at the end.
		 */
		complete_merge = TRUE;
		while (complete_merge)	/* === START WHILE COMPLETE_MERGE === */
		{
			if (mu_ctrlc_occurred || mu_ctrly_occurred)
			{
				cs_data->reorg_restart_block = dest_blk_id;
				memcpy(&cs_data->reorg_restart_key[0], &gv_currkey->base[0], gv_currkey->end + 1);
				return FALSE;
			}
			complete_merge = FALSE;
			blks_processed++;
			t_begin(ERR_MUREORGFAIL, UPDTRNS_DB_UPDATED_MASK);
			/* The following for loop handles concurrency retries for split/coalesce */
			for (; ;)		/* === SPLIT-COALESCE LOOP STARTS === */
			{
				gv_target->clue.end = 0;
				/* search gv_currkey and get the result in gv_target */
				if ((status = gvcst_search(gv_currkey, NULL)) != cdb_sc_normal)
				{
					assert(CDB_STAGNATE > t_tries);
					t_retry(status);
					continue;
				} else if (gv_currkey->end + 1 != gv_target->hist.h[0].curr_rec.match)
				{
					if (SIZEOF(blk_hdr) == ((blk_hdr_ptr_t)gv_target->hist.h[0].buffaddr)->bsiz
						&& 1 == gv_target->hist.depth)
					{
						if (cs_addrs->now_crit)
						{
							t_abort(gv_cur_region, cs_addrs); /* do crit and other cleanup */
							gtm_putmsg(VARLSTCNT(4) ERR_GBLNOEXIST, 2, gn->str.len, gn->str.addr);
							reorg_finish(dest_blk_id, blks_processed, blks_killed, blks_reused,
								file_extended, lvls_reduced,
								blks_coalesced, blks_split, blks_swapped);
							return TRUE; /* It is not an error that global was killed */
						} else
						{
							assert(CDB_STAGNATE > t_tries);
							t_retry(status);
							continue;
						}
					}
				}
				if (gv_target->hist.depth <= level)
				{
					/* We come here on
					 *	1) the first iteration of the for loop (since level == MAX_BT_DEPTH + 1), or
					 *	2) a decrease in tree depth due to mu_reduce_level or an M-kill
					 */
					pre_order_successor_level = gv_target->hist.depth - 1;
					if (MAX_BT_DEPTH + 1 != level)
					{
						/* break the loop when tree depth decreased (case 2) */
						level = pre_order_successor_level;
						break;
					}
					level = pre_order_successor_level;
				}
				max_fill = (0 == level) ? d_max_fill : i_max_fill;
				toler = (0 == level) ? d_toler : i_toler;
				cur_blk_size = ((blk_hdr_ptr_t)(gv_target->hist.h[level].buffaddr))->bsiz;
				if (cur_blk_size > max_fill + toler && 0 == (reorg_op & NOSPLIT)) /* SPLIT BLOCK */
				{
					cnt1 = cnt2 = 0;
					/* history of current working block is in gv_target */
					status = mu_split(level, i_max_fill, d_max_fill, &cnt1, &cnt2);
					if (cdb_sc_maxlvl == status)
					{
						gtm_putmsg(VARLSTCNT(4) ERR_MAXBTLEVEL, 2, gn->str.len, gn->str.addr);
						reorg_finish(dest_blk_id, blks_processed, blks_killed, blks_reused,
							file_extended, lvls_reduced, blks_coalesced, blks_split, blks_swapped);
						return FALSE;
					} else if (cdb_sc_normal == status)
					{
						if ((trans_num)0 == (ret_tn = t_end(&(gv_target->hist), NULL, TN_NOT_SPECIFIED)))
						{
							need_kip_incr = FALSE;
							continue;
						}
						if (detailed_log)
							log_detailed_log("SPL", &(gv_target->hist), NULL, level, NULL, ret_tn);
						blks_reused += cnt1;
						lvls_reduced -= cnt2;
						blks_split++;
						break;
					} else if (cdb_sc_oprnotneeded == status)
					{	/* undo any update_array/cw_set changes and DROP THRU to mu_clsce */
						cw_set_depth = 0;
						CHECK_AND_RESET_UPDATE_ARRAY;	/* reset update_array_ptr to update_array */
						assert(0 == cw_map_depth); /* mu_swap_blk (that changes cw_map_depth) comes later */
					} else
					{
						assert(CDB_STAGNATE > t_tries);
						t_retry(status);
						continue;
					}
				} /* end if SPLIT BLOCK */
				/* We are here because mu_split() was not called, or the split was not done or not required */
				rtsib_hist = gv_target->alt_hist;
				status = gvcst_rtsib(rtsib_hist, level);
				if (cdb_sc_normal != status && cdb_sc_endtree != status)
				{
					assert(CDB_STAGNATE > t_tries);
					t_retry(status);
					continue;
				}
				if (cdb_sc_endtree == status)
				{
					if (0 == level)
						end_of_tree = TRUE;
					break;
				} else if (0 == level)
					pre_order_successor_level = rtsib_hist->depth - 1;
				/* COALESCE WITH RTSIB */
				kill_set_list.used = 0;
				if (cur_blk_size < max_fill - toler && 0 == (reorg_op & NOCOALESCE))
				{
					/* histories are sent in &gv_target->hist and gv_target->alt_hist */
					status = mu_clsce(level, i_max_fill, d_max_fill, &kill_set_list, &complete_merge);
					if (cdb_sc_normal == status)
					{
						if (level) /* delete lower elements of the array, which might confuse t_end */
						{
							memmove(&rtsib_hist->h[0], &rtsib_hist->h[level],
								SIZEOF(srch_blk_status)*(rtsib_hist->depth - level + 2));
							rtsib_hist->depth = rtsib_hist->depth - level;
						}
						if (0 < kill_set_list.used)     /* increase kill_in_prog */
						{
							need_kip_incr = TRUE;
							if (!cs_addrs->now_crit)	/* Do not sleep while holding crit */
								WAIT_ON_INHIBIT_KILLS(cs_addrs->nl, MAXWAIT2KILL);
						}
						if ((trans_num)0 == (ret_tn = t_end(&(gv_target->hist), rtsib_hist,
							TN_NOT_SPECIFIED)))
						{
							need_kip_incr = FALSE;
							assert(NULL == kip_csa);
							if (level)
							{	/* reinitialize level member in rtsib_hist srch_blk_status' */
								for (count = 0; count < MAX_BT_DEPTH; count++)
									rtsib_hist->h[count].level = count;
							}
							continue;
						}
						if (level)
						{	/* reinitialize level member in rtsib_hist srch_blk_status' */
							for (count = 0; count < MAX_BT_DEPTH; count++)
								rtsib_hist->h[count].level = count;
						}
						if (detailed_log)
							log_detailed_log("CLS", &(gv_target->hist), rtsib_hist, level,
								NULL, ret_tn);
						assert(0 < kill_set_list.used || (NULL == kip_csa));
						if (0 < kill_set_list.used)     /* decrease kill_in_prog */
						{
							gvcst_kill_sort(&kill_set_list);
							GVCST_BMP_MARK_FREE(&kill_set_list, ret_tn, inctn_mu_reorg,
									inctn_bmp_mark_free_mu_reorg, inctn_opcode, cs_addrs)
							DECR_KIP(cs_data, cs_addrs, kip_csa);
							if (detailed_log)
								log_detailed_log("KIL", &(gv_target->hist), NULL, level,
									&kill_set_list, ret_tn);
							blks_killed += kill_set_list.used;
						}
						blks_coalesced++;
						break;
					} else if (cdb_sc_oprnotneeded == status)
					{	/* undo any update_array/cw_set changes and DROP THRU to t_end */
						cw_set_depth = 0;
						CHECK_AND_RESET_UPDATE_ARRAY;	/* reset update_array_ptr to update_array */
						assert(0 == cw_map_depth); /* mu_swap_blk (that changes cw_map_depth) comes later */
					} else
					{
						assert(CDB_STAGNATE > t_tries);
						t_retry(status);
						continue;
					}
				} /* end if try coalesce */
				if (0 == level)
				{
					/* Note: at the data block level:
					 *	if the split is successful, or
					 *	if the coalesce is successful without a complete merge of rtsib,
					 *	then gv_currkey_next_reorg was already set by the called function;
					 *	if the split or coalesce does a retry, or
					 *	if the coalesce is successful with a complete merge, then
					 *	gv_currkey is not changed.
					 * If the split or coalesce is unsuccessful or not needed, then
					 *	gv_currkey_next_reorg is set here from the right sibling.
					 */
					cw_set_depth = cw_map_depth = 0;
					GET_KEY_LEN(tkeysize, rtsib_hist->h[0].buffaddr + SIZEOF(blk_hdr) + SIZEOF(rec_hdr));
					if (2 < tkeysize && MAX_KEY_SZ >= tkeysize)
					{
						memcpy(&(gv_currkey_next_reorg->base[0]), rtsib_hist->h[0].buffaddr
							+ SIZEOF(blk_hdr) + SIZEOF(rec_hdr), tkeysize);
						gv_currkey_next_reorg->end = tkeysize - 1;
						inctn_opcode = inctn_invalid_op; /* temporary reset; satisfy an assert in t_end() */
						assert(UPDTRNS_DB_UPDATED_MASK == update_trans);
						update_trans = 0; /* tell t_end, this is no longer an update transaction */
						if ((trans_num)0 == (ret_tn = t_end(rtsib_hist, NULL, TN_NOT_SPECIFIED)))
						{
							need_kip_incr = FALSE;
							inctn_opcode = inctn_mu_reorg;	/* reset inctn_opcode to its default */
							update_trans = UPDTRNS_DB_UPDATED_MASK;/* reset update_trans to old value */
							assert(NULL == kip_csa);
							continue;
						}
						/* There is no need to reset update_trans in case of a successful "t_end" call.
						 * This is because before the next call to "t_end" we should have a call to
						 * "t_begin" which will reset update_trans anyways.
						 */
						inctn_opcode = inctn_mu_reorg;	/* reset inctn_opcode to its default */
						if (detailed_log)
							log_detailed_log("NOU", rtsib_hist, NULL, level, NULL, ret_tn);
					} else
					{
						assert(CDB_STAGNATE > t_tries);
						t_retry(status);
						continue;
					}
				} /* end if (0 == level) */
				break;
			}/* === SPLIT-COALESCE LOOP END === */
			t_abort(gv_cur_region, cs_addrs);	/* do crit and other cleanup */
		}/* === START WHILE COMPLETE_MERGE === */

		if (mu_ctrlc_occurred || mu_ctrly_occurred)
		{
			cs_data->reorg_restart_block = dest_blk_id;
			memcpy(&cs_data->reorg_restart_key[0], &gv_currkey->base[0], gv_currkey->end + 1);
			return FALSE;
		}
		/* Now swap the working block */
		if (0 == (reorg_op & NOSWAP))
		{
			t_begin(ERR_MUREORGFAIL, UPDTRNS_DB_UPDATED_MASK);
			/* Following loop is to handle concurrency retry for swap */
			for (; ;)	/* === START OF SWAP LOOP === */
			{
				kill_set_list.used = 0;
				gv_target->clue.end = 0;
				/* search gv_currkey and get the result in gv_target */
				if ((status = gvcst_search(gv_currkey, NULL)) != cdb_sc_normal)
				{
					assert(CDB_STAGNATE > t_tries);
					t_retry(status);
					continue;
				} else if (gv_currkey->end + 1 != gv_target->hist.h[0].curr_rec.match)
				{
					if (SIZEOF(blk_hdr) == ((blk_hdr_ptr_t)gv_target->hist.h[0].buffaddr)->bsiz
						&& 1 == gv_target->hist.depth)
					{
						if (cs_addrs->now_crit)
						{
							t_abort(gv_cur_region, cs_addrs); /* do crit and other cleanup */
							gtm_putmsg(VARLSTCNT(4) ERR_GBLNOEXIST, 2, gn->str.len, gn->str.addr);
							reorg_finish(dest_blk_id, blks_processed, blks_killed, blks_reused,
								file_extended, lvls_reduced,
								blks_coalesced, blks_split, blks_swapped);
							return TRUE; /* It is not an error that global was killed */
						} else
						{
							assert(CDB_STAGNATE > t_tries);
							t_retry(status);
							continue;
						}
					}
				}
				if (gv_target->hist.depth <= level)
					break;
				/* Swap the working block with the appropriate dest_blk_id block.
				   Histories are passed as gv_target->hist and reorg_gv_target->hist. */
				mu_reorg_in_swap_blk = TRUE;
				status = mu_swap_blk(level, &dest_blk_id, &kill_set_list, exclude_glist_ptr);
				mu_reorg_in_swap_blk = FALSE;
				if (cdb_sc_oprnotneeded == status)
				{
					if (cs_data->trans_hist.total_blks <= dest_blk_id)
					{
						util_out_print("REORG may be incomplete for this global.", TRUE);
						reorg_finish(dest_blk_id, blks_processed, blks_killed, blks_reused,
							file_extended, lvls_reduced, blks_coalesced, blks_split, blks_swapped);
						return TRUE;
					}
				} else if (cdb_sc_normal == status)
				{
					if (0 < kill_set_list.used)
					{
						need_kip_incr = TRUE;
						if (!cs_addrs->now_crit)	/* Do not sleep while holding crit */
							WAIT_ON_INHIBIT_KILLS(cs_addrs->nl, MAXWAIT2KILL);
						/* A second history is not needed because we are reusing a
						   free block, which has no history of its own. */
						if ((trans_num)0 == (ret_tn = t_end(&(gv_target->hist), NULL, TN_NOT_SPECIFIED)))
						{
							need_kip_incr = FALSE;
							assert(NULL == kip_csa);
							DECR_BLK_NUM(dest_blk_id);
							continue;
						}
						if (detailed_log)
							log_detailed_log("SWA", &(gv_target->hist), NULL, level, NULL, ret_tn);
						gvcst_kill_sort(&kill_set_list);
						GVCST_BMP_MARK_FREE(&kill_set_list, ret_tn, inctn_mu_reorg,
								inctn_bmp_mark_free_mu_reorg, inctn_opcode, cs_addrs)
						DECR_KIP(cs_data, cs_addrs, kip_csa);
						if (detailed_log)
							log_detailed_log("KIL", &(gv_target->hist), NULL, level,
								&kill_set_list, ret_tn);
						blks_reused += kill_set_list.used;
						blks_killed += kill_set_list.used;
					}
					/* gv_target->hist is the working block's history, and
					   reorg_gv_target->hist is the destination block's history.
					   Note: gv_target and reorg_gv_target can belong to different GVTs. */
					else if ((trans_num)0 == (ret_tn = t_end(&(gv_target->hist), &(reorg_gv_target->hist),
						TN_NOT_SPECIFIED)))
					{
						need_kip_incr = FALSE;
						assert(NULL == kip_csa);
						DECR_BLK_NUM(dest_blk_id);
						continue;
					}
					if ((0 >= kill_set_list.used) && detailed_log)
						log_detailed_log("SWA", &(gv_target->hist), &(reorg_gv_target->hist),
							level, NULL, ret_tn);
					blks_swapped++;
					if (reorg_op & SWAPHIST)
						util_out_print("Dest !SL From !SL", TRUE, dest_blk_id,
							gv_target->hist.h[level].blk_num);
				} else
				{
					assert(CDB_STAGNATE > t_tries);
					t_retry(status);
					continue;
				}
				break;
			}	/* === END OF SWAP LOOP === */
			t_abort(gv_cur_region, cs_addrs);	/* do crit and other cleanup */
		}
		if (mu_ctrlc_occurred || mu_ctrly_occurred)
		{
			cs_data->reorg_restart_block = dest_blk_id;
			memcpy(&cs_data->reorg_restart_key[0], &gv_currkey->base[0], gv_currkey->end + 1);
			return FALSE;
		}
		if (end_of_tree)
			break;
		if (0 < level)
			level--; /* Order of reorg is root towards leaf */
		else
		{
			level = pre_order_successor_level;
			memcpy(&gv_currkey->base[0], &gv_currkey_next_reorg->base[0], gv_currkey_next_reorg->end + 1);
			gv_currkey->end = gv_currkey_next_reorg->end;
			cs_data->reorg_restart_block = dest_blk_id;
			memcpy(&cs_data->reorg_restart_key[0], &gv_currkey->base[0], gv_currkey->end + 1);
		}
	}		/* ================ END MAIN LOOP ================ */

	/* =========== START REDUCE LEVEL ============== */
	memcpy(&gv_currkey->base[0], gn->str.addr, gn->str.len);
	gv_currkey->base[gn->str.len] = gv_currkey->base[gn->str.len + 1] = 0;
	gv_currkey->end = gn->str.len + 1;
	for (;;)	/* Reduce level continues until it fails to reduce */
	{
		t_begin(ERR_MUREORGFAIL, UPDTRNS_DB_UPDATED_MASK);
		cnt1 = 0;
		for (; ;) 	/* main reduce level loop starts */
		{
			kill_set_list.used = 0;
			gv_target->clue.end = 0;
			/* search gv_currkey and get the result in gv_target */
			if ((status = gvcst_search(gv_currkey, NULL)) != cdb_sc_normal)
			{
				assert(CDB_STAGNATE > t_tries);
				t_retry(status);
				continue;
			} else if (gv_currkey->end + 1 != gv_target->hist.h[0].curr_rec.match)
			{
				if (SIZEOF(blk_hdr) == ((blk_hdr_ptr_t)gv_target->hist.h[0].buffaddr)->bsiz
					&& 1 == gv_target->hist.depth)
				{
					if (cs_addrs->now_crit)
					{
						t_abort(gv_cur_region, cs_addrs);	/* do crit and other cleanup */
						gtm_putmsg(VARLSTCNT(4) ERR_GBLNOEXIST, 2, gn->str.len, gn->str.addr);
						reorg_finish(dest_blk_id, blks_processed, blks_killed, blks_reused,
							file_extended, lvls_reduced, blks_coalesced, blks_split, blks_swapped);
						return TRUE; /* It is not an error that global was killed */
					} else
					{
						assert(CDB_STAGNATE > t_tries);
						t_retry(status);
						continue;
					}
				}
			}
			if (gv_target->hist.depth <= level)
				break;
			/* History is passed in gv_target->hist */
			status = mu_reduce_level(&kill_set_list);
			if (cdb_sc_oprnotneeded != status && cdb_sc_normal != status)
			{
				assert(CDB_STAGNATE > t_tries);
				t_retry(status);
				continue;
			} else if (cdb_sc_normal == status)
			{
				assert(0 < kill_set_list.used);
				need_kip_incr = TRUE;
				if (!cs_addrs->now_crit)	/* Do not sleep while holding crit */
					WAIT_ON_INHIBIT_KILLS(cs_addrs->nl, MAXWAIT2KILL);
				if ((trans_num)0 == (ret_tn = t_end(&(gv_target->hist), NULL, TN_NOT_SPECIFIED)))
				{
					need_kip_incr = FALSE;
					assert(NULL == kip_csa);
					continue;
				}
				if (detailed_log)
					log_detailed_log("RDL", &(gv_target->hist), NULL, level, NULL, ret_tn);
				gvcst_kill_sort(&kill_set_list);
				GVCST_BMP_MARK_FREE(&kill_set_list, ret_tn, inctn_mu_reorg,
						inctn_bmp_mark_free_mu_reorg, inctn_opcode, cs_addrs)
				DECR_KIP(cs_data, cs_addrs, kip_csa);
				if (detailed_log)
					log_detailed_log("KIL", &(gv_target->hist), NULL, level, &kill_set_list, ret_tn);
				blks_reused += kill_set_list.used;
				blks_killed += kill_set_list.used;
				cnt1 = 1;
				lvls_reduced++;
			}
			break;
		} 		/* main reduce level loop ends */
		t_abort(gv_cur_region, cs_addrs); /* do crit and other cleanup */
		if (0 == cnt1)
			break;
	}
	/* =========== END REDUCE LEVEL ===========*/

	reorg_finish(dest_blk_id, blks_processed, blks_killed, blks_reused,
		file_extended, lvls_reduced, blks_coalesced, blks_split, blks_swapped);
	return TRUE;

} /* end mu_reorg() */
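/* mu_reorg's split/coalesce decision above reduces to band arithmetic: a block whose
 * size exceeds max_fill + tolerance is split, one below max_fill - tolerance is a
 * coalesce candidate, and anything inside the band is left alone. A self-contained
 * sketch of that classification follows; the 80% fill factor and 5% tolerance used
 * in main() are illustrative assumptions, not the DATA_FILL_TOLERANCE /
 * INDEX_FILL_TOLERANCE values themselves.
 */
#include <stdio.h>

enum reorg_action { REORG_SPLIT, REORG_COALESCE, REORG_LEAVE };

static enum reorg_action classify(int cur_blk_size, int blk_size,
	int fill_factor_pct, int toler_pct, int reserved_bytes)
{
	/* same arithmetic as the d_max_fill/d_toler computations in mu_reorg above */
	int	max_fill = (int)((double)fill_factor_pct * blk_size / 100.0) - reserved_bytes;
	int	toler = (int)((double)toler_pct * blk_size / 100.0);

	if (cur_blk_size > max_fill + toler)
		return REORG_SPLIT;
	if (cur_blk_size < max_fill - toler)
		return REORG_COALESCE;
	return REORG_LEAVE;
}

int main(void)
{
	/* 4096-byte blocks, 80% target fill, 5% tolerance, no reserved bytes:
	 * the band is roughly [3072, 3480] bytes */
	printf("%d\n", classify(3900, 4096, 80, 5, 0));	/* above the band -> REORG_SPLIT (0) */
	printf("%d\n", classify(1024, 4096, 80, 5, 0));	/* below the band -> REORG_COALESCE (1) */
	printf("%d\n", classify(3300, 4096, 80, 5, 0));	/* inside the band -> REORG_LEAVE (2) */
	return 0;
}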