Ejemplo n.º 1
0
/* Finds a free block and adds information to update array and cw_set */
block_id swap_root_or_directory_block(int parent_blk_lvl, int child_blk_lvl, srch_hist *dir_hist_ptr, block_id child_blk_id,
		sm_uc_ptr_t child_blk_ptr, kill_set *kill_set_list, trans_num curr_tn)
{
	sgmnt_data_ptr_t	csd;
	sgmnt_addrs		*csa;
	node_local_ptr_t	cnl;
	srch_blk_status		bmlhist, freeblkhist;
	block_id		hint_blk_num, free_blk_id, parent_blk_id;
	boolean_t		free_blk_recycled;
	int4			master_bit, num_local_maps, free_bit, hint_bit, maxbitsthismap;
	uint4			total_blks;
	int			blk_seg_cnt, blk_size;
	sm_uc_ptr_t		parent_blk_ptr, bn_ptr, saved_blk;
	blk_segment		*bs1, *bs_ptr;
	int			parent_blk_size, child_blk_size, bsiz;
	int			rec_size1, curr_offset, bpntr_end, hdr_len;
	int			tmp_cmpc;
	cw_set_element		*tmpcse;
	jnl_buffer_ptr_t	jbbp; /* jbbp is non-NULL only if before-image journaling */
	unsigned short		temp_ushort;
	unsigned long		temp_long;
	unsigned char		save_cw_set_depth;
	DCL_THREADGBL_ACCESS;

	SETUP_THREADGBL_ACCESS;
	csd = cs_data;
	csa = cs_addrs;
	cnl = csa->nl;
	blk_size = csd->blk_size;
	/* Find a free/recycled block for new block location. */
	hint_blk_num = 0;
	total_blks = csa->ti->total_blks;
	num_local_maps = DIVIDE_ROUND_UP(total_blks, BLKS_PER_LMAP);
	master_bit = bmm_find_free((hint_blk_num / BLKS_PER_LMAP), csa->bmm, num_local_maps);
	if ((NO_FREE_SPACE == master_bit))
	{
		t_abort(gv_cur_region, csa);
		return ABORT_SWAP;
	}
	bmlhist.blk_num = (block_id)master_bit * BLKS_PER_LMAP;
	if (NULL == (bmlhist.buffaddr = t_qread(bmlhist.blk_num, (sm_int_ptr_t)&bmlhist.cycle, &bmlhist.cr)))
	{
		assert(t_tries < CDB_STAGNATE);
		t_retry((enum cdb_sc)rdfail_detail);
		return RETRY_SWAP;
	}
	hint_bit = 0;
	maxbitsthismap = (master_bit != (num_local_maps - 1)) ? BLKS_PER_LMAP : total_blks - bmlhist.blk_num;
	free_bit = bm_find_blk(hint_bit, bmlhist.buffaddr + SIZEOF(blk_hdr), maxbitsthismap, &free_blk_recycled);
	free_blk_id = bmlhist.blk_num + free_bit;
	if (DIR_ROOT >= free_blk_id)
	{	/* Bitmap block 0 and directory tree root block 1 should always be marked busy. */
		assert(t_tries < CDB_STAGNATE);
		t_retry(cdb_sc_badbitmap);
		return RETRY_SWAP;
	}
	if (child_blk_id <= free_blk_id)
	{	/* stop swapping root or DT blocks once the database is truncated well enough. A good heuristic for this is to check
		 * if the block is to be swapped into a higher block number and if so do not swap
		 */
		t_abort(gv_cur_region, csa);
		return ABORT_SWAP;
	}
	/* ====== begin update array ======
	 * Four blocks get changed.
	 * 	1. Free block becomes busy and gains the contents of child (root block/directory tree block)
	 * 	2. Parent block in directory tree remains busy, but points to new root block location.
	 *	3. Free block's corresponding bitmap reflects above change.
	 * 	4. Child block gets marked recycled in bitmap. (GVCST_BMP_MARK_FREE)
	 */
	parent_blk_ptr = dir_hist_ptr->h[parent_blk_lvl].buffaddr; /* parent_blk_lvl is 0 iff we're moving a gvt root block */
	parent_blk_id = dir_hist_ptr->h[parent_blk_lvl].blk_num;
	CHECK_AND_RESET_UPDATE_ARRAY;
	if (free_blk_recycled)
	{	/* Otherwise, it's a completely free block, in which case no need to read. */
		freeblkhist.blk_num = (block_id)free_blk_id;
		if (NULL == (freeblkhist.buffaddr = t_qread(free_blk_id, (sm_int_ptr_t)&freeblkhist.cycle, &freeblkhist.cr)))
		{
			assert(t_tries < CDB_STAGNATE);
			t_retry((enum cdb_sc)rdfail_detail);
			return RETRY_SWAP;
		}
	}
	child_blk_size = ((blk_hdr_ptr_t)child_blk_ptr)->bsiz;
	BLK_INIT(bs_ptr, bs1);
	BLK_ADDR(saved_blk, child_blk_size, unsigned char);
	memcpy(saved_blk, child_blk_ptr, child_blk_size);
	BLK_SEG(bs_ptr, saved_blk + SIZEOF(blk_hdr), child_blk_size - SIZEOF(blk_hdr));
	assert(blk_seg_cnt == child_blk_size);
	if (!BLK_FINI(bs_ptr, bs1))
	{
		assert(t_tries < CDB_STAGNATE);
		t_retry(cdb_sc_blkmod);
		return RETRY_SWAP;
	}
	tmpcse = &cw_set[cw_set_depth];
	(free_blk_recycled) ? BIT_SET_RECYCLED_AND_CLEAR_FREE(tmpcse->blk_prior_state)
			    : BIT_CLEAR_RECYCLED_AND_SET_FREE(tmpcse->blk_prior_state);
	t_create(free_blk_id, (unsigned char *)bs1, 0, 0, child_blk_lvl);
	tmpcse->mode = gds_t_acquired;
	if (!free_blk_recycled || !cs_data->db_got_to_v5_once)
		tmpcse->old_block = NULL;
	else
	{
		tmpcse->old_block = freeblkhist.buffaddr;
		tmpcse->cr = freeblkhist.cr;
		tmpcse->cycle = freeblkhist.cycle;
		jbbp = (JNL_ENABLED(csa) && csa->jnl_before_image) ? csa->jnl->jnl_buff : NULL;
		if ((NULL != jbbp) && (((blk_hdr_ptr_t)tmpcse->old_block)->tn < jbbp->epoch_tn))
		{
			bsiz = ((blk_hdr_ptr_t)(tmpcse->old_block))->bsiz;
			if (bsiz > blk_size)
			{
				assert(CDB_STAGNATE > t_tries);
				t_retry(cdb_sc_lostbmlcr);
				return RETRY_SWAP;
			}
			JNL_GET_CHECKSUM_ACQUIRED_BLK(tmpcse, csd, csa, tmpcse->old_block, bsiz);
		}
	}
	/* 2. Parent block in directory tree remains busy, but points to new child block location. */
	curr_offset = dir_hist_ptr->h[parent_blk_lvl].curr_rec.offset;
	parent_blk_size = ((blk_hdr_ptr_t)parent_blk_ptr)->bsiz;
	GET_RSIZ(rec_size1, (parent_blk_ptr + curr_offset));
	if ((parent_blk_size < rec_size1 + curr_offset) || (BSTAR_REC_SIZE > rec_size1))
	{
		assert(t_tries < CDB_STAGNATE);
		t_retry(cdb_sc_blkmod);
		return RETRY_SWAP;
	}
	BLK_INIT(bs_ptr, bs1);
	if (0 == parent_blk_lvl)
		/* There can be collation stuff in the record value after the block pointer. See gvcst_root_search. */
		hdr_len = SIZEOF(rec_hdr) + gv_altkey->end + 1 - EVAL_CMPC((rec_hdr_ptr_t)(parent_blk_ptr + curr_offset));
	else
		hdr_len = rec_size1 - SIZEOF(block_id);
	bpntr_end = curr_offset + hdr_len + SIZEOF(block_id);
	BLK_SEG(bs_ptr, parent_blk_ptr + SIZEOF(blk_hdr), curr_offset + hdr_len - SIZEOF(blk_hdr));
	BLK_ADDR(bn_ptr, SIZEOF(block_id), unsigned char);
	PUT_LONG(bn_ptr, free_blk_id);
	BLK_SEG(bs_ptr, bn_ptr, SIZEOF(block_id));
	BLK_SEG(bs_ptr, parent_blk_ptr + bpntr_end, parent_blk_size - bpntr_end);
	assert(blk_seg_cnt == parent_blk_size);
	if (!BLK_FINI(bs_ptr, bs1))
	{
		assert(t_tries < CDB_STAGNATE);
		t_retry(cdb_sc_blkmod);
		return RETRY_SWAP;
	}
	t_write(&dir_hist_ptr->h[parent_blk_lvl], (unsigned char *)bs1, 0, 0, parent_blk_lvl, FALSE, TRUE, GDS_WRITE_KILLTN);
	/* To indicate later snapshot file writing process during fast_integ not to skip writing the block to snapshot file */
	BIT_SET_DIR_TREE(cw_set[cw_set_depth-1].blk_prior_state);
	/* 3. Free block's corresponding bitmap reflects above change. */
	PUT_LONG(update_array_ptr, free_bit);
	save_cw_set_depth = cw_set_depth; /* Bit maps go on end of cw_set (more fake acquired) */
	assert(!cw_map_depth);
	t_write_map(&bmlhist, (uchar_ptr_t)update_array_ptr, curr_tn, 1);
	cw_map_depth = cw_set_depth;
	cw_set_depth = save_cw_set_depth;
	update_array_ptr += SIZEOF(block_id);
	temp_long = 0;
	PUT_LONG(update_array_ptr, temp_long);
	update_array_ptr += SIZEOF(block_id);
	assert(1 == cw_set[cw_map_depth - 1].reference_cnt);
	/* 4. Child block gets marked recycled in bitmap. (GVCST_BMP_MARK_FREE) */
	kill_set_list->blk[kill_set_list->used].flag = 0;
	kill_set_list->blk[kill_set_list->used].level = 0;
	kill_set_list->blk[kill_set_list->used++].block = child_blk_id;
	return free_blk_id;
}
Ejemplo n.º 2
0
/***********************************************************************************************
	Input Parameters:
		cur_level: Working block's level
		d_max_fill: Database fill factor
		i_max_fill: Index fill factor
	Output Parameters:
		blks_created: how many new blocks are created
		lvls_increased : How much level is increased
	Input/Output Parameters:
		gv_target: History of working block
	Here it is assumed that i_max_fill or, d_max_fill is strictly less than block size.
	Returns:
		cdb_sc_normal: if successful
		cdb_sc status otherwise
 ************************************************************************************************/
enum cdb_sc mu_split(int cur_level, int i_max_fill, int d_max_fill, int *blks_created, int *lvls_increased)
{
    boolean_t	first_copy, new_rtblk_star_only, create_root = FALSE, split_required, insert_in_left;
    unsigned char	curr_prev_key[MAX_KEY_SZ+1], new_blk1_last_key[MAX_KEY_SZ+1];
    unsigned short  temp_ushort;
    int		rec_size, new_ins_keycmpc, tkeycmpc, new_ances_currkeycmpc, old_ances_currkeycmpc;
    int		tmp_cmpc;
    block_index	left_index, right_index;
    block_offset 	ins_off, ins_off2;
    int		level;
    int		new_ins_keysz, new_ances_currkeysz, new_blk1_last_keysz, newblk2_first_keysz, next_gv_currkeysz;
    int		old_ances_currkeylen, new_ins_keylen, new_ances_currkeylen, tkeylen, newblk2_first_keylen;
    int		old_blk1_last_rec_size, old_blk1_sz, save_blk_piece_len, old_right_piece_len;
    int		delta, max_fill;
    enum cdb_sc	status;
    int		blk_seg_cnt, blk_size, new_leftblk_top_off;
    block_id	allocation_clue;
    sm_uc_ptr_t 	rPtr1, rPtr2, rec_base, key_base, next_gv_currkey,
                    bn_ptr1, bn_ptr2, save_blk_piece,
                    old_blk_after_currec, ances_currkey,
                    old_blk1_base,
                    new_blk1_top, new_blk2_top,
                    new_blk2_frec_base, new_blk2_rem,
                    newblk2_first_key, new_ins_key;
    blk_segment     *bs_ptr1, *bs_ptr2;
    cw_set_element  *cse;
    rec_hdr_ptr_t	star_rec_hdr, new_rec_hdr1a, new_rec_hdr1b, new_rec_hdr2, root_hdr;
    blk_hdr_ptr_t	blk_hdr_ptr;

    blk_size = cs_data->blk_size;
    CHECK_AND_RESET_UPDATE_ARRAY;	/* reset update_array_ptr to update_array */

    BLK_ADDR(star_rec_hdr, SIZEOF(rec_hdr), rec_hdr);
    star_rec_hdr->rsiz = BSTAR_REC_SIZE;
    SET_CMPC(star_rec_hdr, 0);
    level = cur_level;
    max_fill = (0 == level)? d_max_fill : i_max_fill;

    /*  -------------------
     *  Split working block.
     *  -------------------
     *  new_blk1_last_key = last key of the new working block after split
     *  new_blk1_last_keysz = size of new_blk1_last_key
     *  old_blk1_last_rec_size = last record size of the new working block after split (for old block)
     *  new_blk2_frec_base = base of first record of right block created after split
     *  newblk2_first_key = first key of new block created after split
     *  newblk2_first_keysz = size of newblk2_first_key
     *  new_blk2_rem = pointer to new block to be created after split exclude 1st record header + key
     */
    blk_hdr_ptr = (blk_hdr_ptr_t)(gv_target->hist.h[level].buffaddr);
    old_blk1_base = (sm_uc_ptr_t)blk_hdr_ptr;
    old_blk1_sz = blk_hdr_ptr->bsiz;
    new_blk2_top = old_blk1_base + old_blk1_sz;
    if (cdb_sc_normal != (status = locate_block_split_point (old_blk1_base, level, old_blk1_sz, max_fill,
                                   &old_blk1_last_rec_size, new_blk1_last_key, &new_blk1_last_keysz, &new_leftblk_top_off)))
    {
        assert(t_tries < CDB_STAGNATE);
        return cdb_sc_blkmod;
    }
    if (new_leftblk_top_off + BSTAR_REC_SIZE >= old_blk1_sz)
        /* Avoid split to create a small right sibling. Note this should not happen often when tolerance is high */
        return cdb_sc_oprnotneeded;
    old_right_piece_len = old_blk1_sz - new_leftblk_top_off;
    new_blk2_frec_base = old_blk1_base + new_leftblk_top_off;
    BLK_ADDR(newblk2_first_key, gv_cur_region->max_rec_size + 1, unsigned char);
    READ_RECORD(level, new_blk2_frec_base, tkeycmpc, rec_size, newblk2_first_key, newblk2_first_keylen, status);
    if (cdb_sc_normal != status) /* restart for cdb_sc_starrecord too, because we eliminated the possibility already */
    {
        assert(t_tries < CDB_STAGNATE);
        return cdb_sc_blkmod;
    }
    memcpy(newblk2_first_key, &new_blk1_last_key[0], tkeycmpc); /* copy the compressed key piece */
    new_blk2_rem = new_blk2_frec_base + SIZEOF(rec_hdr) + newblk2_first_keylen;
    newblk2_first_keysz = newblk2_first_keylen + tkeycmpc;

    /* gv_currkey_next_reorg will be saved for next iteration in mu_reorg */
    next_gv_currkey = newblk2_first_key;
    next_gv_currkeysz = newblk2_first_keysz;

    BLK_ADDR(new_rec_hdr1b, SIZEOF(rec_hdr), rec_hdr);
    new_rec_hdr1b->rsiz = rec_size + tkeycmpc;
    SET_CMPC(new_rec_hdr1b, 0);

    /* Create new split piece, we already know that this will not be *-rec only.
     * Note that this has to be done BEFORE modifying working block as building this buffer relies on the
     * working block to be pinned which is possible only if this cw-set-element is created ahead of that
     * of the working block (since order in which blocks are built is the order in which cses are created).
     */
    BLK_INIT(bs_ptr2, bs_ptr1);
    BLK_SEG(bs_ptr2, (sm_uc_ptr_t)new_rec_hdr1b, SIZEOF(rec_hdr));
    BLK_SEG(bs_ptr2, newblk2_first_key, newblk2_first_keysz);
    BLK_SEG(bs_ptr2, new_blk2_rem, new_blk2_top - new_blk2_rem);
    if (!BLK_FINI(bs_ptr2, bs_ptr1))
    {
        assert(t_tries < CDB_STAGNATE);
        return cdb_sc_blkmod;
    }
    allocation_clue = ALLOCATION_CLUE(cs_data->trans_hist.total_blks);
    right_index = t_create(allocation_clue++, (unsigned char *)bs_ptr1, 0, 0, level);
    (*blks_created)++;

    /* Modify working block removing split piece */
    BLK_INIT(bs_ptr2, bs_ptr1);
    if (0 == level)
    {
        BLK_SEG(bs_ptr2, old_blk1_base + SIZEOF(blk_hdr), new_leftblk_top_off - SIZEOF(blk_hdr));
    }
    else
    {
        BLK_SEG(bs_ptr2, old_blk1_base + SIZEOF(blk_hdr),
                new_leftblk_top_off - SIZEOF(blk_hdr) - old_blk1_last_rec_size);
        BLK_SEG(bs_ptr2, (sm_uc_ptr_t)star_rec_hdr, SIZEOF(rec_hdr) );
        BLK_ADDR(bn_ptr1, SIZEOF(block_id), unsigned char);
        memcpy(bn_ptr1, old_blk1_base + new_leftblk_top_off - SIZEOF(block_id), SIZEOF(block_id));
        BLK_SEG(bs_ptr2, bn_ptr1, SIZEOF(block_id));
    }
    if ( !BLK_FINI(bs_ptr2, bs_ptr1))
    {
        assert(t_tries < CDB_STAGNATE);
        return cdb_sc_blkmod;
    }
    t_write(&gv_target->hist.h[level], (unsigned char *)bs_ptr1, 0, 0, level, FALSE, TRUE, GDS_WRITE_KILLTN);

    /*
    ----------------------------------------------------------------------------
    Modify ancestor block for the split in current level.
    new_ins_key = new key to be inserted in parent because of split in child
    new_ins_key will be inserted after gv_target->hist.h[level].prev_rec and
                                before gv_target->hist.h[level].curr_rec
        new_ins_keysz = size of new_ins_key
        Note: A restriction of the algorithm is to have current key and new_ins_key
    	in the same block, either left or, new right block
    ----------------------------------------------------------------------------
    */
    BLK_ADDR(new_ins_key, new_blk1_last_keysz, unsigned char);
    memcpy(new_ins_key, &new_blk1_last_key[0], new_blk1_last_keysz);
    new_ins_keysz = new_blk1_last_keysz;
    for(;;) 	/* ========== loop through ancestors as necessary ======= */
    {
        level ++;
        max_fill = i_max_fill;
        /*
        old_blk_after_currec = remaining of current block after currec
        ances_currkey = old real value of currkey in ancestor block
        */
        blk_hdr_ptr = (blk_hdr_ptr_t)(gv_target->hist.h[level].buffaddr);
        old_blk1_base = (sm_uc_ptr_t)blk_hdr_ptr;
        old_blk1_sz = blk_hdr_ptr->bsiz;
        new_blk2_top = old_blk1_base + old_blk1_sz;
        rec_base = old_blk1_base + gv_target->hist.h[level].curr_rec.offset;
        GET_RSIZ(rec_size, rec_base);
        old_blk_after_currec = rec_base + rec_size;
        old_ances_currkeycmpc = EVAL_CMPC((rec_hdr_ptr_t)rec_base);
        old_ances_currkeylen = rec_size - BSTAR_REC_SIZE;
        if (INVALID_RECORD(level, rec_size,  old_ances_currkeylen, old_ances_currkeycmpc))
        {
            assert(t_tries < CDB_STAGNATE);
            return cdb_sc_blkmod;
        }
        if (0 == old_ances_currkeylen)
        {
            if (0 != old_ances_currkeycmpc)
            {
                assert(t_tries < CDB_STAGNATE);
                return cdb_sc_blkmod;
            }
            new_ances_currkeycmpc = new_ances_currkeylen = 0;
        }
        else
        {
            BLK_ADDR(ances_currkey, gv_cur_region->max_rec_size + 1, unsigned char);
            key_base = rec_base +  SIZEOF(rec_hdr);
        }
        new_ances_currkeysz = old_ances_currkeycmpc + old_ances_currkeylen;
        if (SIZEOF(blk_hdr) != gv_target->hist.h[level].curr_rec.offset) /* cur_rec is not first key */
        {
            if (cdb_sc_normal != (status = gvcst_expand_any_key(old_blk1_base,
                                           old_blk1_base + gv_target->hist.h[level].curr_rec.offset,
                                           &curr_prev_key[0], &rec_size, &tkeylen, &tkeycmpc, NULL)))
            {
                assert(t_tries < CDB_STAGNATE);
                return cdb_sc_blkmod;
            }
            if (old_ances_currkeycmpc)
                memcpy(ances_currkey, &curr_prev_key[0], old_ances_currkeycmpc);
        }
        if (old_ances_currkeylen)
        {
            memcpy(ances_currkey + old_ances_currkeycmpc, key_base, old_ances_currkeylen);
            GET_CMPC(new_ances_currkeycmpc, new_ins_key, ances_currkey);
            new_ances_currkeylen = new_ances_currkeysz - new_ances_currkeycmpc;
        }
        if (SIZEOF(blk_hdr) != gv_target->hist.h[level].curr_rec.offset)
        {
            /* new_ins_key will be inseted after curr_prev_key */
            GET_CMPC(new_ins_keycmpc, &curr_prev_key[0], new_ins_key);
        }
        else
            new_ins_keycmpc = 0; /* new_ins_key will be the 1st key */
        new_ins_keylen = new_ins_keysz - new_ins_keycmpc ;

        delta = BSTAR_REC_SIZE + new_ins_keylen - old_ances_currkeylen + new_ances_currkeylen;
        if (old_blk1_sz + delta > blk_size - cs_data->reserved_bytes) /* split required */
        {
            split_required = TRUE;
            if (level == gv_target->hist.depth)
            {
                create_root = TRUE;
                if (MAX_BT_DEPTH - 1 <= level)  /* maximum level reached */
                    return cdb_sc_maxlvl;
            }
            if (max_fill + BSTAR_REC_SIZE > old_blk1_sz)
            {
                if (SIZEOF(blk_hdr) + BSTAR_REC_SIZE == old_blk1_sz)
                    return cdb_sc_oprnotneeded; /* Improve code to avoid this */
                max_fill = old_blk1_sz - BSTAR_REC_SIZE;
            }
            status = locate_block_split_point(old_blk1_base, level, old_blk1_sz, max_fill,
                                              &old_blk1_last_rec_size, new_blk1_last_key, &new_blk1_last_keysz, &new_leftblk_top_off);
            if (cdb_sc_normal != status || new_leftblk_top_off >= old_blk1_sz
                    || 0 == new_blk1_last_keysz)
            {
                assert(t_tries < CDB_STAGNATE);
                return cdb_sc_blkmod;
            }
            assert(BSTAR_REC_SIZE != old_blk1_last_rec_size);
            old_right_piece_len = old_blk1_sz - new_leftblk_top_off;
            new_blk2_frec_base = new_blk1_top = old_blk1_base + new_leftblk_top_off;
            if (BSTAR_REC_SIZE == old_right_piece_len)
                new_rtblk_star_only = TRUE;
            else
                new_rtblk_star_only = FALSE;
            if (new_leftblk_top_off == gv_target->hist.h[level].curr_rec.offset)
            {
                /* inserted key will be the first record of new right block */
                new_ins_keylen = new_ins_keysz;
                new_ins_keycmpc = 0;
            }
            else
                /* process 1st record of new right block */
            {
                BLK_ADDR(newblk2_first_key, gv_cur_region->max_rec_size + 1, unsigned char);
                READ_RECORD(level, new_blk2_frec_base, tkeycmpc, rec_size,
                            newblk2_first_key, newblk2_first_keylen, status);
                if (cdb_sc_normal == status)
                {
                    memcpy(newblk2_first_key, &new_blk1_last_key[0], tkeycmpc); /* compressed piece */
                    new_blk2_rem =  new_blk2_frec_base + SIZEOF(rec_hdr) + newblk2_first_keylen;
                    newblk2_first_keysz = newblk2_first_keylen + tkeycmpc;
                    BLK_ADDR(new_rec_hdr2, SIZEOF(rec_hdr), rec_hdr);
                    new_rec_hdr2->rsiz = newblk2_first_keysz + BSTAR_REC_SIZE;
                    SET_CMPC(new_rec_hdr2, 0);
                }
                else if (cdb_sc_starrecord != status || !new_rtblk_star_only)
                {
                    assert(t_tries < CDB_STAGNATE);
                    return cdb_sc_blkmod;
                }
            }
            /* else gv_target->hist.h[level].curr_rec will be newblk2_first_key */

            if (new_leftblk_top_off >  gv_target->hist.h[level].curr_rec.offset +
                    old_ances_currkeylen + BSTAR_REC_SIZE)
            {
                /* in this case prev_rec (if exists), new key and curr_rec should go into left block */
                if (new_leftblk_top_off + delta - old_blk1_last_rec_size + BSTAR_REC_SIZE
                        <= blk_size - cs_data->reserved_bytes)
                    insert_in_left = TRUE;
                else
                {
                    /* cannot handle it now */
                    return cdb_sc_oprnotneeded;
                }
            }
            else if (new_leftblk_top_off <  gv_target->hist.h[level].curr_rec.offset +
                     old_ances_currkeylen + BSTAR_REC_SIZE)
            {
                /* if gv_target->hist.h[level].curr_rec is the first key in old_blk1
                   then in new right block,
                   	new_ins_key will be the 1st record key and
                	curr_rec will be 2nd record and
                	there will be no prev_rec in right block.
                   Else (if curr_rec is not first key)
                	there will be some records before new_ins_key, at least prev_rec */
                delta = (int)(BSTAR_REC_SIZE + new_ins_keylen
                              - old_ances_currkeylen + new_ances_currkeylen
                              + ((0 == new_ins_keycmpc) ? 0 : (EVAL_CMPC((rec_hdr_ptr_t)new_blk2_frec_base))));
                if (SIZEOF(blk_hdr) + old_right_piece_len + delta <= blk_size - cs_data->reserved_bytes)
                {
                    insert_in_left = FALSE;
                    if (new_leftblk_top_off + BSTAR_REC_SIZE >= old_blk1_sz)
                    {
                        /* cannot handle it now */
                        return cdb_sc_oprnotneeded;
                    }
                }
                else
                {
                    /* cannot handle it now */
                    return cdb_sc_oprnotneeded;
                }
            }
            else
            {
                /* in this case prev_rec (if exists), new key and curr_rec should go into left block
                	and curr_rec will be the last record (*-key) of left new block */
                delta = BSTAR_REC_SIZE + new_ins_keylen;
                if (new_leftblk_top_off + delta <= blk_size - cs_data->reserved_bytes)
                    insert_in_left = TRUE;
                else
                {
                    /* cannot handle it now */
                    return cdb_sc_oprnotneeded;
                }
            }
        } /* end if split required */
        else
Ejemplo n.º 3
0
/*************************************************************************************************
Input Parameters:
	gv_target: working block's history
	level : Level of working block and its right sibling
	d_blk_fill_size : Maximum fill allowed in a data block
	i_blk_fill_size : Maximum fill allowed in an index block
Output Parameters:
	kill_set_ptr : List of blocks to be freed from LBM (already killed in mu_clsce)
	remove_rtsib : if right sibling was completely merged with working
Returns:
	cdb_sc_normal on success
	Other wise error status
 *************************************************************************************************/
enum cdb_sc mu_clsce(int level, int i_max_fill, int d_max_fill, kill_set *kill_set_ptr,
	boolean_t *remove_rtsib)
{
	boolean_t	complete_merge = FALSE,
			old_ref_star_only = FALSE,
			new_rtsib_star_only = FALSE,
			star_only_merge = FALSE,
			blk2_ances_star_only = FALSE,
			delete_all_blk2_ances = TRUE,
			levelp_next_is_star, forward_process;
	unsigned char	oldblk1_prev_key[MAX_KEY_SZ+1],
			old_levelp_cur_prev_key[MAX_KEY_SZ+1],
			old_levelp_cur_key[MAX_KEY_SZ+1]; /* keys in private memory */
	unsigned short	temp_ushort;
	int		new_levelp_cur_cmpc, new_levelp_cur_next_cmpc, tkeycmpc,
			oldblk1_last_cmpc, newblk1_mid_cmpc, newblk1_last_cmpc;
	int		levelp, level2;
	int		old_blk1_sz, old_blk2_sz;
	int		old_levelp_cur_prev_keysz,
			old_levelp_cur_keysz,
			old_levelp_cur_next_keysz,
			newblk1_last_keysz,
			newblk2_first_keysz,
			new_blk2_ances_first_keysz;
	int		old_levelp_cur_keylen,
			new_levelp_cur_keylen,
			old_levelp_cur_next_keylen,
			new_levelp_cur_next_keylen,
			oldblk1_last_keylen,
			newblk1_last_keylen,
			newblk2_first_keylen;
	int		rec_size, piece_len, tkeylen, old_levelp_rec_offset;
	int		blk_seg_cnt, blk_size;
	uint4		save_t_err;
	enum cdb_sc	status;
	sm_uc_ptr_t 	oldblk1_last_key, old_levelp_cur_next_key,
			newblk1_last_key, newblk2_first_key, new_blk2_ances_first_key; /* shared memory keys */
	sm_uc_ptr_t 	rec_base, old_levelp_blk_base,
			bn_ptr1, bn_ptr2, blk2_ances_remain, old_blk1_base, old_blk2_base,
			new_blk1_top, new_blk2_first_rec_base, new_blk2_remain; /* shared memory pointers */
	sm_uc_ptr_t 	rPtr1, rPtr2;
	rec_hdr_ptr_t	star_rec_hdr, old_last_rec_hdr1, new_rec_hdr1, new_rec_hdr2,
			blk2_ances_hdr, new_levelp_cur_hdr, new_levelp_cur_next_hdr;
	blk_segment	*bs_ptr1, *bs_ptr2;
	srch_hist	*blk1ptr, *blk2ptr; /* blk2ptr is for right sibling's hist from a minimum sub-tree containing both blocks */
	error_def(ERR_GVKILLFAIL);

	blk_size = cs_data->blk_size;
	assert(update_array != NULL);
	update_array_ptr = update_array;

	blk1ptr = &(gv_target->hist);
	blk2ptr = gv_target->alt_hist;
	old_blk1_base = blk1ptr->h[level].buffaddr;
	old_blk2_base = blk2ptr->h[level].buffaddr;
	old_blk1_sz = ((blk_hdr_ptr_t)old_blk1_base)->bsiz;
	old_blk2_sz = ((blk_hdr_ptr_t)old_blk2_base)->bsiz;
	if (0 != level && sizeof(blk_hdr) + BSTAR_REC_SIZE == old_blk1_sz)
		old_ref_star_only = TRUE;
	/* Search an ancestor block at levelp >= level+1,
	which has a real key value corresponding to the working block.
	This key value will be changed after coalesce.  */
	levelp = level;
	do
	{
		if (++levelp > blk1ptr->depth ||  levelp > blk2ptr->depth)
		{
			assert(t_tries < CDB_STAGNATE);
			return cdb_sc_blkmod;
		}
		old_levelp_blk_base = blk1ptr->h[levelp].buffaddr;
		old_levelp_rec_offset = blk1ptr->h[levelp].curr_rec.offset;
		rec_base = old_levelp_blk_base + old_levelp_rec_offset;
		GET_RSIZ(rec_size, rec_base);
	} while (BSTAR_REC_SIZE == rec_size); /* search ancestors to get a real value */

	/*
	old_levelp_cur_prev_key = real value of the key before the curr_key at levelp
	old_levelp_cur_prev_keysz = uncompressed size of the key
	Note: we may not have a previous key (old_levelp_cur_prev_keysz = 0)
	*/
	if (sizeof(blk_hdr) == old_levelp_rec_offset)
		old_levelp_cur_prev_keysz = 0;
	else
	{
		if (cdb_sc_normal != (status = gvcst_expand_any_key (old_levelp_blk_base, rec_base,
			&old_levelp_cur_prev_key[0], &rec_size, &tkeylen, &tkeycmpc, NULL)))
		{
			assert(t_tries < CDB_STAGNATE);
			return cdb_sc_blkmod;
		}
		old_levelp_cur_prev_keysz = tkeylen + tkeycmpc;
	}

	/*
	old_levelp_cur_key = real value of the curr_key at levelp
	old_levelp_cur_keysz = uncompressed size of the key
	old_levelp_cur_keylen = compressed size of the key
	*/
	READ_RECORD(levelp, rec_base, tkeycmpc, rec_size,  &old_levelp_cur_key[0], old_levelp_cur_keylen, status);
	if (cdb_sc_normal != status)
	{
		assert(t_tries < CDB_STAGNATE);
		return cdb_sc_blkmod;
	}
	if (old_levelp_cur_prev_keysz)
		memcpy(&old_levelp_cur_key[0], &old_levelp_cur_prev_key[0], tkeycmpc);
	rec_base += rec_size;
	old_levelp_cur_keysz = old_levelp_cur_keylen + tkeycmpc;

	/*
	old_levelp_cur_next_key = uncompressed value of the next right key of old_levelp_cur_key
	old_levelp_cur_next_keysz = uncomressed size of the key
	old_levelp_cur_next_keylen = comressed size of the key
		Note: we may not have a next key (old_levelp_cur_next_keysz = 0)
	*/
	BLK_ADDR(old_levelp_cur_next_key, gv_cur_region->max_key_size + 1, unsigned char);
	READ_RECORD(levelp, rec_base, tkeycmpc, rec_size, old_levelp_cur_next_key, old_levelp_cur_next_keylen, status);
	if (cdb_sc_starrecord == status)
		levelp_next_is_star = TRUE;
	else if (cdb_sc_normal != status)
	{
		assert(t_tries < CDB_STAGNATE);
		return cdb_sc_blkmod;
	}
	else
	{
		memcpy(old_levelp_cur_next_key, &old_levelp_cur_key[0], tkeycmpc);
		old_levelp_cur_next_keysz = old_levelp_cur_next_keylen + tkeycmpc;
		levelp_next_is_star = FALSE;
	}


	/*
	Now process the actual working block at current level
		oldblk1_last_key = real value of last key of the working block
			For index block decompress *-key
		oldblk1_last_keylen = compressed size of the last key
		oldblk1_last_cmpc = compression count of last key of working block
		old_last_rec_hdr1 = New working index block's last record header
	*/
	BLK_ADDR(oldblk1_last_key, gv_cur_region->max_key_size + 1, unsigned char);
	if (0 == level) /* data block */
	{
		if (cdb_sc_normal != (status = gvcst_expand_any_key (old_blk1_base, old_blk1_base + old_blk1_sz,
			oldblk1_last_key, &rec_size, &oldblk1_last_keylen, &oldblk1_last_cmpc, NULL)))
		{
			assert(t_tries < CDB_STAGNATE);
			return cdb_sc_blkmod;
		}
		rec_base = old_blk1_base + old_blk1_sz;
	}
	else  /* Index blocks */
	{
		/* Since we will join this working block with the right sibling,
		we need to remove the *-key at the end of working block
		and replace with actual key value (with required compression).
		We will get the real value of *-rec from its ancestor at levelp */
                memcpy (oldblk1_last_key, &old_levelp_cur_key[0], old_levelp_cur_keysz);
		if (!old_ref_star_only) /* if the index block is not a *-key only block) */
		{
			if (cdb_sc_normal != (status = gvcst_expand_any_key (old_blk1_base,
				old_blk1_base + old_blk1_sz - BSTAR_REC_SIZE, &oldblk1_prev_key[0],
				&rec_size, &tkeylen, &tkeycmpc, NULL)))
			{
				assert(t_tries < CDB_STAGNATE);
				return cdb_sc_blkmod;
			}
			GET_CMPC(oldblk1_last_cmpc, &oldblk1_prev_key[0], &old_levelp_cur_key[0]);
			oldblk1_last_keylen = old_levelp_cur_keysz - oldblk1_last_cmpc;
		}
		else /* working block has a *-key record only */
		{
			/* get key value from ancestor blocks key */
			oldblk1_last_keylen = old_levelp_cur_keysz;
			oldblk1_last_cmpc = 0;
        	}
                BLK_ADDR(old_last_rec_hdr1, sizeof(rec_hdr), rec_hdr);
                old_last_rec_hdr1->rsiz = BSTAR_REC_SIZE + oldblk1_last_keylen;
                old_last_rec_hdr1->cmpc = oldblk1_last_cmpc;
	}

	/*
	newblk1_last_key = new working blocks final appended key
	newblk1_mid_cmpc = new working blocks firstly appended key's cmpc
	newblk1_last_keysz = new working blocks lastly appended key's size
	star_only_merge = TRUE, we can append only a *-key record into the working block
				(decompressing current *-key)
	complete_merge = TRUE, rtsib can be completely merged with working block
	piece_len = Size of data from old rtsibling to be merged into working block (includes rec_hdr size)
	*/
	BLK_ADDR(newblk1_last_key, gv_cur_region->max_key_size + 1, unsigned char);
	rec_base = old_blk2_base + sizeof(blk_hdr);
	READ_RECORD(level, rec_base, newblk1_last_cmpc, rec_size, newblk1_last_key, newblk1_last_keylen, status);
	if (cdb_sc_starrecord == status) /* rtsib index block has *-record only */
	{
		if (old_blk1_sz + oldblk1_last_keylen + BSTAR_REC_SIZE > i_max_fill ) /* cannot fit even one record */
			return cdb_sc_oprnotneeded;
		star_only_merge = TRUE;
		complete_merge = TRUE;
		rec_base = old_blk2_base + sizeof(blk_hdr) + BSTAR_REC_SIZE;
	}
	else if (cdb_sc_normal != status)
	{
		assert(t_tries < CDB_STAGNATE);;
		return cdb_sc_blkmod;
	}
	else /* for both data and non-* index block */
	{
		newblk1_last_keysz = newblk1_last_keylen; /* first key has uncompressed real value */
		GET_CMPC(newblk1_mid_cmpc, oldblk1_last_key, newblk1_last_key);
		piece_len = rec_size - newblk1_mid_cmpc;
		if (level == 0) /* data block */
		{
			if (old_blk1_sz + piece_len > d_max_fill ) /* cannot fit even one record */
				return cdb_sc_oprnotneeded;
		}
		else /* else an index block */
		{
			if (old_blk1_sz + oldblk1_last_keylen + BSTAR_REC_SIZE > i_max_fill ) /* cannot fit even one record */
				return cdb_sc_oprnotneeded;
			if (old_blk1_sz + oldblk1_last_keylen + piece_len + BSTAR_REC_SIZE > i_max_fill )
					star_only_merge = TRUE; /* can fit only a *-record */
		}
		rec_base += rec_size;
	}

	/* new_blk2_first_rec_base and new_blk1_top is set with final value for star_only_merge  for index block */
	new_blk2_first_rec_base = new_blk1_top = rec_base;
	if (!star_only_merge)
	{
		BLK_ADDR(new_rec_hdr1, sizeof(rec_hdr), rec_hdr);
		new_rec_hdr1->rsiz = piece_len;
		new_rec_hdr1->cmpc = newblk1_mid_cmpc;
	}
	/* else only new_blk1_last_key will be appeneded in working block */


	/* find a piece of the right sibling to be copied into the working block.
	Note: rec_base points to 2nd record of old rtsib */
	if (0 == level) /* if data block */
	{
		complete_merge = TRUE;
		while (rec_base < old_blk2_base + old_blk2_sz)
		{
			GET_RSIZ(rec_size, rec_base);
			if (old_blk1_sz + piece_len + rec_size > d_max_fill )
			{
				complete_merge = FALSE;
				break;
			}
			READ_RECORD(level, rec_base, newblk1_last_cmpc, rec_size, newblk1_last_key, newblk1_last_keylen, status);
			if (cdb_sc_normal != status)
			{
				assert(t_tries < CDB_STAGNATE);;
				return cdb_sc_blkmod;
			}
			newblk1_last_keysz = newblk1_last_keylen + newblk1_last_cmpc;
			rec_base += rec_size;
			piece_len += rec_size;
		}/* end of "while" loop */
		new_blk1_top = new_blk2_first_rec_base = rec_base;
	}
	else /* index block */
	{
		if (!star_only_merge)
		{
			/* we know we can fit more record in working block and rtsibling has more records */
			complete_merge = TRUE;
			while (rec_base < old_blk2_base + old_blk2_sz)
			{
				GET_RSIZ(rec_size, rec_base);
				if (BSTAR_REC_SIZE == rec_size)
				{
					rec_base += rec_size;
					piece_len += rec_size;
					break; /* already we know we can fit this *-record in working block */
				}
				READ_RECORD(level, rec_base, newblk1_last_cmpc, rec_size,
					newblk1_last_key, newblk1_last_keylen, status);
				if (cdb_sc_normal != status)
				{
					assert(t_tries < CDB_STAGNATE);;
					return cdb_sc_blkmod;
				}
				newblk1_last_keysz = newblk1_last_keylen + newblk1_last_cmpc;
				rec_base += rec_size;
				piece_len += rec_size;
				if (old_blk1_sz + oldblk1_last_keylen + piece_len + BSTAR_REC_SIZE > i_max_fill )
				{
					complete_merge = FALSE;
					break;
				}
			}/* end of "while" loop */
			new_blk1_top = new_blk2_first_rec_base = rec_base;
		} /* end else  *-only merge */
	} /* end else index block */




	if (!complete_merge)
	{
		/*
		Adjust new right sibling's buffer
		if new_rtsib_star_only == TRUE then
			new right sibling will have a *-key record only
		else
			new_blk2_remain = base pointer of buffer including 1st record but exclude rec_header and key
			new_blk2_first_keysz = size of new rtsib block's first key
		*/
		BLK_ADDR(newblk2_first_key, gv_cur_region->max_key_size + 1, unsigned char);
		READ_RECORD(level, new_blk2_first_rec_base, tkeycmpc, rec_size,
			newblk2_first_key, newblk2_first_keylen, status);
		if (cdb_sc_starrecord == status) /* new rtsib will have a *-record only */
			new_rtsib_star_only = TRUE;
		else if (cdb_sc_normal != status)
		{
			assert(t_tries < CDB_STAGNATE);;
			return cdb_sc_blkmod;
		}
		else
		{
			memcpy(newblk2_first_key, newblk1_last_key, tkeycmpc); /* copy the compressed piece */
			newblk2_first_keysz = newblk2_first_keylen + tkeycmpc;
			new_blk2_remain = new_blk2_first_rec_base + sizeof(rec_hdr) + newblk2_first_keylen;
			BLK_ADDR(new_rec_hdr2, sizeof(rec_hdr), rec_hdr);
			new_rec_hdr2->rsiz = rec_size + tkeycmpc;
			new_rec_hdr2->cmpc = 0;
		}
	}
Ejemplo n.º 4
0
boolean_t gvcst_queryget2(mval *val, unsigned char *sn_ptr)
{
	blk_hdr_ptr_t	bp;
	boolean_t	found, two_histories;
	enum cdb_sc	status;
	int		rsiz, key_size, data_len;
	rec_hdr_ptr_t	rp;
	srch_blk_status	*bh;
	srch_hist	*rt_history;
	unsigned short	temp_ushort;
	int		tmp_cmpc;
	DEBUG_ONLY(unsigned char *save_strp = NULL);

	T_BEGIN_READ_NONTP_OR_TP(ERR_GVQUERYGETFAIL);
	assert((CDB_STAGNATE > t_tries) || cs_addrs->now_crit);	/* we better hold crit in the final retry (TP & non-TP) */
	for (;;)
	{
		two_histories = FALSE;
#if defined(DEBUG) && defined(UNIX)
			if (gtm_white_box_test_case_enabled && (WBTEST_ANTIFREEZE_GVQUERYGETFAIL == gtm_white_box_test_case_number))
			{
				status = cdb_sc_blknumerr;
				t_retry(status);
				continue;
			}
#endif
		if (cdb_sc_normal == (status = gvcst_search(gv_currkey, 0)))
		{
			found = TRUE;
			bh = &gv_target->hist.h[0];
			rp = (rec_hdr_ptr_t)(bh->buffaddr + bh->curr_rec.offset);
			bp = (blk_hdr_ptr_t)bh->buffaddr;
			if (rp >= (rec_hdr_ptr_t)CST_TOB(bp))
			{
				two_histories = TRUE;
				rt_history = gv_target->alt_hist;
				status = gvcst_rtsib(rt_history, 0);
				if (cdb_sc_endtree == status)		/* end of tree */
				{
					found = FALSE;
					two_histories = FALSE;		/* second history not valid */
				} else  if (cdb_sc_normal != status)
				{
					t_retry(status);
					continue;
				} else
				{
					bh = &rt_history->h[0];
					if (cdb_sc_normal != (status = gvcst_search_blk(gv_currkey, bh)))
					{
						t_retry(status);
						continue;
					}
					rp = (rec_hdr_ptr_t)(bh->buffaddr + bh->curr_rec.offset);
					bp = (blk_hdr_ptr_t)bh->buffaddr;
				}
			}
			/* !found indicates that the end of tree has been reached (see call to
			 *  gvcst_rtsib).  If there is no more tree, don't bother doing expansion.
			 */
			if (found)
			{
				status = gvcst_expand_key((blk_hdr_ptr_t)bh->buffaddr, (int4)((sm_uc_ptr_t)rp - bh->buffaddr),
						gv_altkey);
				if (cdb_sc_normal != status)
				{
					t_retry(status);
					continue;
				}
				key_size = gv_altkey->end + 1;
				GET_RSIZ(rsiz, rp);
				data_len = rsiz + EVAL_CMPC(rp) - SIZEOF(rec_hdr) - key_size;
				if (data_len < 0  || (sm_uc_ptr_t)rp + rsiz > (sm_uc_ptr_t)bp + ((blk_hdr_ptr_t)bp)->bsiz)
				{
					assert(CDB_STAGNATE > t_tries);
					t_retry(cdb_sc_rmisalign1);
					continue;
				}
				ENSURE_STP_FREE_SPACE(data_len);
				DEBUG_ONLY (
				if (!save_strp)
					save_strp = stringpool.free);
				assert(stringpool.top - stringpool.free >= data_len);
				memcpy(stringpool.free, (sm_uc_ptr_t)rp + rsiz - data_len, data_len);
				/* Assumption: t_end/tp_hist will never cause stp_gcol() call BYPASSOK */
			}
			if (!dollar_tlevel)
			{
				if ((trans_num)0 == t_end(&gv_target->hist, !two_histories ? NULL : rt_history, TN_NOT_SPECIFIED))
					continue;
			} else
			{
				status = tp_hist(!two_histories ? NULL : rt_history);
				if (cdb_sc_normal != status)
				{
					t_retry(status);
					continue;
				}
			}
			if (found)
			{
				DEBUG_ONLY(assert(save_strp == stringpool.free));
				/* Process val first. Already copied to string pool. */
				val->mvtype = MV_STR;
				val->str.addr = (char *)stringpool.free;
				val->str.len = data_len;
				stringpool.free += data_len;
				INCR_GVSTATS_COUNTER(cs_addrs, cs_addrs->nl, n_get, 1);
			}
			return found;
		}
		t_retry(status);
	}
Ejemplo n.º 5
0
/*******************************************************************************************
Input Parameter:
	blk_base = Block's base which has the key
	rec_top = record top of the record which will be expanded
Output Parameter:
	expanded_key = expanded key
	rec_size = last record size whic has the key
	keylen = key size
	keycmpc = key compression cound
	hist_ptr = history of blocks read, while expanding a *-key
		History excludes the working block from which key is expanded and
		includes the blocks read below the current block to expand a *-key
	NOTE: hist_ptr.depth will be unchanged
Return:
	cdb_sc_normal on success
	failure code on concurrency failure
 *******************************************************************************************/
enum cdb_sc gvcst_expand_any_key (sm_uc_ptr_t blk_base, sm_uc_ptr_t rec_top, sm_uc_ptr_t expanded_key,
	int *rec_size, int *keylen, int *keycmpc, srch_hist *hist_ptr)
{
	enum cdb_sc	 	status;
	unsigned char		expanded_star_key[MAX_KEY_SZ];
	unsigned short		temp_ushort;
	int			cur_level;
	int			star_keycmpc;
	int			star_keylen;
	int			star_rec_size;
	int			tblk_size;
	block_id		tblk_num;
	sm_uc_ptr_t 		rPtr1, rPtr2, curptr;


	cur_level = ((blk_hdr_ptr_t)blk_base)->levl;
	curptr = blk_base + sizeof(blk_hdr);
	*rec_size = *keycmpc = *keylen = 0;
	while (curptr < rec_top)
	{
		GET_RSIZ(*rec_size, curptr);
		if (0 == cur_level || BSTAR_REC_SIZE != *rec_size)
		{
			READ_RECORD(cur_level, curptr, *keycmpc, *rec_size, expanded_key, *keylen, status);
			if (cdb_sc_normal != status)
			{
				assert(t_tries < CDB_STAGNATE);
				return status;
			}
			else
			{
				curptr += *rec_size;
				if (curptr >= rec_top)
					break;
			}
		}
		else /* a star record in index block */
		{
			if (curptr + *rec_size != rec_top || NULL == hist_ptr)
			{
				assert(t_tries < CDB_STAGNATE);
				return cdb_sc_rmisalign;
			}
			while (0 != cur_level)
			{
				tblk_size = ((blk_hdr_ptr_t)blk_base)->bsiz;
				GET_LONG(tblk_num, blk_base + tblk_size - sizeof(block_id));
				if (0 == tblk_num  || cs_data->trans_hist.total_blks - 1 < tblk_num)
				{
					assert(t_tries < CDB_STAGNATE);
					return cdb_sc_badlvl;
				}
				cur_level--;
				hist_ptr->h[cur_level].tn =  cs_addrs->ti->curr_tn;
				if (!(blk_base = t_qread(tblk_num, (sm_int_ptr_t)(&(hist_ptr->h[cur_level].cycle)),
					&(hist_ptr->h[cur_level].cr) )))
				{
					assert(t_tries < CDB_STAGNATE);
					return rdfail_detail;
				}
				if (((blk_hdr_ptr_t)blk_base)->levl != cur_level)
				{
					assert(t_tries < CDB_STAGNATE);
					return cdb_sc_badlvl;
				}
				hist_ptr->h[cur_level].buffaddr = blk_base;
				hist_ptr->h[cur_level].blk_num = tblk_num;
				hist_ptr->h[cur_level].prev_rec.match = 0;
				hist_ptr->h[cur_level].prev_rec.offset = 0;
				hist_ptr->h[cur_level].curr_rec.match = 0;
				hist_ptr->h[cur_level].curr_rec.offset = 0;
			}
			tblk_size = ((blk_hdr_ptr_t)blk_base)->bsiz;
			/* expand *-key from right most leaf level block of the
			   sub-tree, of which, the original block is root  */
			if (cdb_sc_normal != (status = (gvcst_expand_any_key(blk_base, blk_base + tblk_size,
				expanded_star_key, &star_rec_size, &star_keylen, &star_keycmpc, hist_ptr))))
				return status;
			if (*keylen + *keycmpc) /* Previous key exists */
			{
				GET_CMPC(*keycmpc, expanded_key, &expanded_star_key[0]);
			}
			memcpy(expanded_key, expanded_star_key, star_keylen + star_keycmpc);
			*keylen = star_keylen + star_keycmpc - *keycmpc;
			*rec_size  = *keylen + *keycmpc + BSTAR_REC_SIZE;
			return cdb_sc_normal;
		} /* end else if *-record */
	}/* end of "while" loop */
	if (curptr == rec_top)
	{
		return cdb_sc_normal;
	}
	else
	{
		assert(t_tries < CDB_STAGNATE);
		return cdb_sc_rmisalign;
	}
}
Ejemplo n.º 6
0
/******************************************************************************************
Input Parameters:
	level: level of working block
	dest_blk_id: last destination used for swap
Output Parameters:
	kill_set_ptr: Kill set to be freed
	*exclude_glist_ptr: List of globals not to be moved for a swap destination
Input/Output Parameters:
	gv_target : as working block's history
	reorg_gv_target->hist : as desitnitions block's history
 ******************************************************************************************/
enum cdb_sc mu_swap_blk(int level, block_id *pdest_blk_id, kill_set *kill_set_ptr, glist *exclude_glist_ptr)
{
	unsigned char		x_blk_lmap;
	unsigned short		temp_ushort;
	int			rec_size1, rec_size2;
	int			wlevel, nslevel, dest_blk_level;
	int			piece_len1, piece_len2, first_offset, second_offset,
				work_blk_size, work_parent_size, dest_blk_size, dest_parent_size;
	int			dest_child_cycle;
	int			blk_seg_cnt, blk_size;
	trans_num		ctn;
	int			key_len, key_len_dir;
	block_id		dest_blk_id, work_blk_id, child1, child2;
	enum cdb_sc		status;
	srch_hist 		*dest_hist_ptr, *dir_hist_ptr;
	cache_rec_ptr_t		dest_child_cr;
	blk_segment		*bs1, *bs_ptr;
	sm_uc_ptr_t		saved_blk, work_blk_ptr, work_parent_ptr, dest_parent_ptr, dest_blk_ptr,
				bn_ptr, bmp_buff, tblk_ptr, rec_base, rPtr1;
	boolean_t		gbl_target_was_set, blk_was_free, deleted;
	gv_namehead		*save_targ;
	srch_blk_status		bmlhist, destblkhist, *hist_ptr;
	unsigned char    	save_cw_set_depth;
	cw_set_element		*tmpcse;
	jnl_buffer_ptr_t	jbbp; /* jbbp is non-NULL only if before-image journaling */
	unsigned int		bsiz;
	DCL_THREADGBL_ACCESS;

	SETUP_THREADGBL_ACCESS;
	dest_blk_id = *pdest_blk_id;
	CHECK_AND_RESET_UPDATE_ARRAY;	/* reset update_array_ptr to update_array */
	if (NULL == TREF(gv_reorgkey))
		GVKEY_INIT(TREF(gv_reorgkey), DBKEYSIZE(MAX_KEY_SZ));
	dest_hist_ptr = &(reorg_gv_target->hist);
	dir_hist_ptr = reorg_gv_target->alt_hist;
	blk_size = cs_data->blk_size;
	work_parent_ptr = gv_target->hist.h[level+1].buffaddr;
	work_parent_size = ((blk_hdr_ptr_t)work_parent_ptr)->bsiz;
	work_blk_ptr = gv_target->hist.h[level].buffaddr;
	work_blk_size = ((blk_hdr_ptr_t)work_blk_ptr)->bsiz;
	work_blk_id = gv_target->hist.h[level].blk_num;
	if (blk_size < work_blk_size)
	{
		assert(t_tries < CDB_STAGNATE);
		return cdb_sc_blkmod;
	}
	cws_reorg_remove_index = 0;
	/*===== Infinite loop to find the destination block =====*/
	for ( ; ; )
	{
		blk_was_free = FALSE;
		INCR_BLK_NUM(dest_blk_id);
		/* A Pre-order traversal should not cause a child block to go to its parent.
		 * However, in case it happens because already the organization was like that or for any other reason, skip swap.
		 * If we decide to swap, code below should be changed to take care of the special case.
		 * Still a grand-child can go to its grand-parent. This is rare and following code can handle it.
		 */
		if (dest_blk_id == gv_target->hist.h[level+1].blk_num)
			continue;
		if (cs_data->trans_hist.total_blks <= dest_blk_id || dest_blk_id == work_blk_id)
		{
			*pdest_blk_id = dest_blk_id;
			return cdb_sc_oprnotneeded;
		}
		ctn = cs_addrs->ti->curr_tn;
		/* We need to save the block numbers that were NEWLY ADDED (since entering this function "mu_swap_blk")
		 * through the CWS_INSERT macro (in db_csh_get/db_csh_getn which can be called by t_qread or gvcst_search below).
		 * This is so that we can delete these blocks from the "cw_stagnate" hashtable in case we determine the need to
		 * choose a different "dest_blk_id" in this for loop (i.e. come to the next iteration). If these blocks are not
		 * deleted, then the hashtable will keep growing (a good example will be if -EXCLUDE qualifier is specified and
		 * a lot of prospective dest_blk_ids get skipped because they contain EXCLUDEd global variables) and very soon
		 * the hashtable will contain more entries than there are global buffers and at that point db_csh_getn will not
		 * be able to get a free global buffer for a new block (since it checks the "cw_stagnate" hashtable before reusing
		 * a buffer in case of MUPIP REORG). To delete these previous iteration blocks, we use the "cws_reorg_remove_array"
		 * variable. This array should have enough entries to accommodate the maximum number of blocks that can be t_qread
		 * in one iteration down below. And that number is the sum of
		 *	+     MAX_BT_DEPTH : for the t_qread while loop down the tree done below
		 *	+ 2 * MAX_BT_DEPTH : for the two calls to gvcst_search done below
		 *	+ 2                : 1 for the t_qread of dest_blk_id and 1 more for the t_qread of a
		 *			     bitmap block done inside the call to get_lmap below
		 *	= 3 * MAX_BT_DEPTH + 2
		 * To be safe, we give a buffer of MAX_BT_DEPTH elements i.e. (4 * MAX_BT_DEPTH) + 2.
		 * This is defined in the macro CWS_REMOVE_ARRAYSIZE in cws_insert.h
		 */
		/* reset whatever blocks the previous iteration of this for loop had filled in the cw_stagnate hashtable */
		for ( ; cws_reorg_remove_index > 0; cws_reorg_remove_index--)
		{
			deleted = delete_hashtab_int4(&cw_stagnate, (uint4 *)&cws_reorg_remove_array[cws_reorg_remove_index]);
			assert(deleted);
		}
		/* read corresponding bitmap block before attempting to read destination  block.
		 * if bitmap indicates block is free, we will not read the destination block
		 */
		bmp_buff = get_lmap(dest_blk_id, &x_blk_lmap, (sm_int_ptr_t)&bmlhist.cycle, &bmlhist.cr);
		if (!bmp_buff || BLK_MAPINVALID == x_blk_lmap ||
			((blk_hdr_ptr_t)bmp_buff)->bsiz != BM_SIZE(BLKS_PER_LMAP) ||
			((blk_hdr_ptr_t)bmp_buff)->levl != LCL_MAP_LEVL)
		{
			assert(CDB_STAGNATE > t_tries);
			return cdb_sc_badbitmap;
		}
		if (BLK_FREE != x_blk_lmap)
		{	/* x_blk_lmap is either BLK_BUSY or BLK_RECYCLED. In either case, we need to read destination block
			 * in case we later detect that the before-image needs to be written.
			 */
			if (!(dest_blk_ptr = t_qread(dest_blk_id, (sm_int_ptr_t)&destblkhist.cycle, &destblkhist.cr)))
			{
				assert(t_tries < CDB_STAGNATE);
				return (enum cdb_sc)rdfail_detail;
			}
			destblkhist.blk_num = dest_blk_id;
			destblkhist.buffaddr = dest_blk_ptr;
			destblkhist.level = dest_blk_level = ((blk_hdr_ptr_t)dest_blk_ptr)->levl;
		}
		if (BLK_BUSY != x_blk_lmap)
		{	/* x_blk_map is either BLK_FREE or BLK_RECYCLED both of which mean the block is not used in the bitmap */
			blk_was_free = TRUE;
			break;
		}
		/* dest_blk_id might contain a *-record only.
		 * So follow the pointer to go to the data/index block, which has a non-* key to search.
		 */
		nslevel = dest_blk_level;
		if (MAX_BT_DEPTH <= nslevel)
		{
			assert(CDB_STAGNATE > t_tries);
			return cdb_sc_maxlvl;
		}
		rec_base = dest_blk_ptr + SIZEOF(blk_hdr);
		GET_RSIZ(rec_size1, rec_base);
		tblk_ptr = dest_blk_ptr;
		while ((BSTAR_REC_SIZE == rec_size1) && (0 != nslevel))
		{
			GET_LONG(child1, (rec_base + SIZEOF(rec_hdr)));
			if (0 == child1 || child1 > cs_data->trans_hist.total_blks - 1)
			{
				assert(t_tries < CDB_STAGNATE);
				return cdb_sc_rdfail;
			}
			if (!(tblk_ptr = t_qread(child1, (sm_int_ptr_t)&dest_child_cycle, &dest_child_cr)))
			{
				assert(t_tries < CDB_STAGNATE);
				return (enum cdb_sc)rdfail_detail;
			}
			/* leaf of a killed GVT can have block header only.   Skip those blocks */
			if (SIZEOF(blk_hdr) >= ((blk_hdr_ptr_t)tblk_ptr)->bsiz)
				break;
			nslevel--;
			rec_base = tblk_ptr + SIZEOF(blk_hdr);
			GET_RSIZ(rec_size1, rec_base);
		}
		/* leaf of a killed GVT can have block header only.   Skip those blocks */
		if (SIZEOF(blk_hdr) >= ((blk_hdr_ptr_t)tblk_ptr)->bsiz)
			continue;
		/* get length of global variable name (do not read subscript) for dest_blk_id */
		GET_GBLNAME_LEN(key_len_dir, rec_base + SIZEOF(rec_hdr));
		/* key_len = length of 1st key value (including subscript) for dest_blk_id */
		GET_KEY_LEN(key_len, rec_base + SIZEOF(rec_hdr));
		if ((1 >= key_len_dir || MAX_MIDENT_LEN + 1 < key_len_dir) || (2 >= key_len || MAX_KEY_SZ < key_len))
		{	/* Earlier used to restart here always. But dest_blk_id can be a block,
			 * which is just killed and still marked busy.  Skip it, if we are in last retry.
			 */
			if (CDB_STAGNATE <= t_tries)
				continue;
			else
				return cdb_sc_blkmod;
		}
		memcpy(&((TREF(gv_reorgkey))->base[0]), rec_base + SIZEOF(rec_hdr), key_len_dir);
		(TREF(gv_reorgkey))->base[key_len_dir] = 0;
		(TREF(gv_reorgkey))->end = key_len_dir;
		if (exclude_glist_ptr->next)
		{	/* exclude blocks for globals in the list of EXCLUDE option */
			if  (in_exclude_list(&((TREF(gv_reorgkey))->base[0]), key_len_dir - 1, exclude_glist_ptr))
				continue;
		}
		save_targ = gv_target;
		if (INVALID_GV_TARGET != reset_gv_target)
			gbl_target_was_set = TRUE;
		else
		{
			gbl_target_was_set = FALSE;
			reset_gv_target = save_targ;
		}
		gv_target = reorg_gv_target;
		gv_target->root = cs_addrs->dir_tree->root;
		gv_target->clue.end = 0;
		/* assign Directory tree path to find dest_blk_id in dir_hist_ptr */
		status = gvcst_search(TREF(gv_reorgkey), dir_hist_ptr);
		if (cdb_sc_normal != status)
		{
			assert(t_tries < CDB_STAGNATE);
			RESET_GV_TARGET_LCL_AND_CLR_GBL(save_targ, DO_GVT_GVKEY_CHECK);
			return status;
		}
		if (dir_hist_ptr->h[0].curr_rec.match != (TREF(gv_reorgkey))->end + 1)
		{	/* may be in a kill_set of another process */
			RESET_GV_TARGET_LCL_AND_CLR_GBL(save_targ, DO_GVT_GVKEY_CHECK);
			continue;
		}
		for (wlevel = 0; wlevel <= dir_hist_ptr->depth &&
			dir_hist_ptr->h[wlevel].blk_num != dest_blk_id; wlevel++);
		if (dir_hist_ptr->h[wlevel].blk_num == dest_blk_id)
		{	/* do not swap a dir_tree block */
			RESET_GV_TARGET_LCL_AND_CLR_GBL(save_targ, DO_GVT_GVKEY_CHECK);
			continue;
		}
		/* gv_reorgkey will now have the first key from dest_blk_id,
		 * or, from a descendant of dest_blk_id (in case it had a *-key only).
		 */
		memcpy(&((TREF(gv_reorgkey))->base[0]), rec_base + SIZEOF(rec_hdr), key_len);
		(TREF(gv_reorgkey))->end = key_len - 1;
		GET_KEY_LEN(key_len_dir, dir_hist_ptr->h[0].buffaddr + dir_hist_ptr->h[0].curr_rec.offset + SIZEOF(rec_hdr));
		/* Get root of GVT for dest_blk_id */
		GET_LONG(gv_target->root,
			dir_hist_ptr->h[0].buffaddr + dir_hist_ptr->h[0].curr_rec.offset + SIZEOF(rec_hdr) + key_len_dir);
		if ((0 == gv_target->root) || (gv_target->root > (cs_data->trans_hist.total_blks - 1)))
		{
			assert(t_tries < CDB_STAGNATE);
			RESET_GV_TARGET_LCL_AND_CLR_GBL(save_targ, DO_GVT_GVKEY_CHECK);
			return cdb_sc_blkmod;
		}
		/* Assign Global Variable Tree path to find dest_blk_id in dest_hist_ptr */
		gv_target->clue.end = 0;
		status = gvcst_search(TREF(gv_reorgkey), dest_hist_ptr);
		RESET_GV_TARGET_LCL_AND_CLR_GBL(save_targ, DO_GVT_GVKEY_CHECK);
		if (dest_blk_level >= dest_hist_ptr->depth || /* do not swap in root level */
			dest_hist_ptr->h[dest_blk_level].blk_num != dest_blk_id) /* must be in a kill set of another process. */
			continue;
		if ((cdb_sc_normal != status) || (dest_hist_ptr->h[nslevel].curr_rec.match != ((TREF(gv_reorgkey))->end + 1)))
		{
			assert(t_tries < CDB_STAGNATE);
			return (cdb_sc_normal != status ? status : cdb_sc_blkmod);
		}
		for (wlevel = nslevel; wlevel <= dest_blk_level; wlevel++)
			dest_hist_ptr->h[wlevel].tn = ctn;
		dest_blk_ptr = dest_hist_ptr->h[dest_blk_level].buffaddr;
		dest_blk_size = ((blk_hdr_ptr_t)dest_blk_ptr)->bsiz;
		dest_parent_ptr = dest_hist_ptr->h[dest_blk_level+1].buffaddr;
		dest_parent_size = ((blk_hdr_ptr_t)dest_parent_ptr)->bsiz;
		break;
	}
	/*===== End of infinite loop to find the destination block =====*/
	/*-----------------------------------------------------
	   Now modify blocks for swapping. Maximum of 4 blocks.
	   -----------------------------------------------------*/
	if (!blk_was_free)
	{	/* 1: dest_blk_id into work_blk_id */
		BLK_INIT(bs_ptr, bs1);
		BLK_SEG(bs_ptr, dest_blk_ptr + SIZEOF(blk_hdr), dest_blk_size - SIZEOF(blk_hdr));
		if (!BLK_FINI (bs_ptr,bs1))
		{
			assert(t_tries < CDB_STAGNATE);
			return cdb_sc_blkmod;
		}
		assert(gv_target->hist.h[level].blk_num == work_blk_id);
		assert(gv_target->hist.h[level].buffaddr == work_blk_ptr);
		t_write(&gv_target->hist.h[level], (unsigned char *)bs1, 0, 0, dest_blk_level, TRUE, TRUE, GDS_WRITE_KILLTN);
	}
	/* 2: work_blk_id into dest_blk_id */
	if (!blk_was_free && work_blk_id == dest_hist_ptr->h[dest_blk_level+1].blk_num)
	{	/* work_blk_id will be swapped with its child.
		 * This is the only vertical swap.  Here working block goes to its child.
		 * Working block cannot goto its parent because of traversal
		 */
		if (dest_blk_level + 1 != level || dest_parent_size != work_blk_size)
		{
			assert(t_tries < CDB_STAGNATE);
			return cdb_sc_blkmod;
		}
		BLK_INIT(bs_ptr, bs1);
		BLK_ADDR(saved_blk, dest_parent_size, unsigned char);
		memcpy(saved_blk, dest_parent_ptr, dest_parent_size);
		first_offset = dest_hist_ptr->h[dest_blk_level+1].curr_rec.offset;
		GET_RSIZ(rec_size1, saved_blk + first_offset);
		if (work_blk_size < first_offset + rec_size1)
		{
			assert(t_tries < CDB_STAGNATE);
			return cdb_sc_blkmod;
		}
		piece_len1 =  first_offset + rec_size1;
		BLK_SEG(bs_ptr, saved_blk + SIZEOF(blk_hdr), piece_len1 - SIZEOF(block_id) - SIZEOF(blk_hdr));
		BLK_ADDR(bn_ptr, SIZEOF(block_id), unsigned char);
		PUT_LONG(bn_ptr, work_blk_id); /* since work_blk_id will now be the child of dest_blk_id */
		BLK_SEG(bs_ptr, bn_ptr, SIZEOF(block_id));
		BLK_SEG(bs_ptr, saved_blk + piece_len1, dest_parent_size - piece_len1);
		if (!BLK_FINI(bs_ptr, bs1))
		{
			assert(t_tries < CDB_STAGNATE);
			return cdb_sc_blkmod;
		}
		assert(dest_blk_id == dest_hist_ptr->h[dest_blk_level].blk_num);
		assert(dest_blk_ptr == dest_hist_ptr->h[dest_blk_level].buffaddr);
		t_write(&dest_hist_ptr->h[dest_blk_level], (unsigned char *)bs1, 0, 0, level, TRUE, TRUE, GDS_WRITE_KILLTN);
	} else /* free block or, when working block does not move vertically (swap with parent/child) */
	{
		BLK_INIT(bs_ptr, bs1);
		BLK_ADDR(saved_blk, work_blk_size, unsigned char);
		memcpy(saved_blk, work_blk_ptr, work_blk_size);
		BLK_SEG(bs_ptr, saved_blk + SIZEOF(blk_hdr), work_blk_size - SIZEOF(blk_hdr));
		if (!BLK_FINI(bs_ptr, bs1))
		{
			assert(t_tries < CDB_STAGNATE);
			return cdb_sc_blkmod;
		}
		if (blk_was_free)
		{
			tmpcse = &cw_set[cw_set_depth];
			t_create(dest_blk_id, (unsigned char *)bs1, 0, 0, level);
			/* Although we invoked t_create, we do not want t_end to allocate the block (i.e. change mode
			 * from gds_t_create to gds_t_acquired). Instead we do that and a little more (that t_end does) all here.
			 */
			assert(dest_blk_id == tmpcse->blk);
			tmpcse->mode = gds_t_acquired;
			/* If snapshots are in progress, we might want to read the before images of the FREE blocks also.
			 * Since mu_swap_blk mimics a small part of t_end, it sets cse->mode to gds_t_acquired and hence
			 * will not read the before images of the FREE blocks in t_end. To workaround this, set
			 * cse->was_free to TRUE so that in t_end, this condition can be used to read the before images of
			 * the FREE blocks if needed.
			 */
			(BLK_FREE == x_blk_lmap) ? SET_FREE(tmpcse) : SET_NFREE(tmpcse);
			/* No need to write before-image in case the block is FREE. In case the database had never been fully
			 * upgraded from V4 to V5 format (after the MUPIP UPGRADE), all RECYCLED blocks can basically be considered
			 * FREE (i.e. no need to write before-images since backward journal recovery will never be expected
			 * to take the database to a point BEFORE the mupip upgrade).
			 */
			if ((BLK_FREE == x_blk_lmap) || !cs_data->db_got_to_v5_once)
				tmpcse->old_block = NULL;
			else
			{	/* Destination is a recycled block that needs a before image */
				tmpcse->old_block = destblkhist.buffaddr;
				/* Record cr,cycle. This is used later in t_end to determine if checksums need to be recomputed */
				tmpcse->cr = destblkhist.cr;
				tmpcse->cycle = destblkhist.cycle;
				jbbp = (JNL_ENABLED(cs_addrs) && cs_addrs->jnl_before_image) ? cs_addrs->jnl->jnl_buff : NULL;
				if ((NULL != jbbp) && (((blk_hdr_ptr_t)tmpcse->old_block)->tn < jbbp->epoch_tn))
				{	/* Compute CHECKSUM for writing PBLK record before getting crit.
					 * It is possible that we are reading a block that is actually marked free in
					 * the bitmap (due to concurrency issues at this point). Therefore we might be
					 * actually reading uninitialized block headers and in turn a bad value of
					 * "old_block->bsiz". Restart if we ever access a buffer whose size is greater
					 * than the db block size.
					 */
					bsiz = ((blk_hdr_ptr_t)(tmpcse->old_block))->bsiz;
					if (bsiz > blk_size)
					{
						assert(CDB_STAGNATE > t_tries);
						return cdb_sc_lostbmlcr;
					}
					JNL_GET_CHECKSUM_ACQUIRED_BLK(tmpcse, cs_data, cs_addrs, tmpcse->old_block, bsiz);
				}
			}
			assert(GDSVCURR == tmpcse->ondsk_blkver);	/* should have been set by t_create above */
		} else
		{
			hist_ptr = &dest_hist_ptr->h[dest_blk_level];
			assert(dest_blk_id == hist_ptr->blk_num);
			assert(dest_blk_ptr == hist_ptr->buffaddr);
			t_write(hist_ptr, (unsigned char *)bs1, 0, 0, level, TRUE, TRUE, GDS_WRITE_KILLTN);
		}
	}
	if (!blk_was_free)
	{	/* 3: Parent of destination block (may be parent of working block too) */
		if (gv_target->hist.h[level+1].blk_num == dest_hist_ptr->h[dest_blk_level+1].blk_num)
		{	/* dest parent == work_blk parent */
			BLK_INIT(bs_ptr, bs1);
			/* Interchange pointer to dest_blk_id and work_blk_id */
			if (level != dest_blk_level ||
				gv_target->hist.h[level+1].curr_rec.offset == dest_hist_ptr->h[level+1].curr_rec.offset)
			{
				assert(t_tries < CDB_STAGNATE);
				return cdb_sc_blkmod;
			}
			if (gv_target->hist.h[level+1].curr_rec.offset < dest_hist_ptr->h[level+1].curr_rec.offset)
			{
				first_offset = gv_target->hist.h[level+1].curr_rec.offset;
				second_offset = dest_hist_ptr->h[level+1].curr_rec.offset;
			} else
			{
				first_offset = dest_hist_ptr->h[level+1].curr_rec.offset;
				second_offset = gv_target->hist.h[level+1].curr_rec.offset;
			}
			GET_RSIZ(rec_size1, dest_parent_ptr + first_offset);
			GET_RSIZ(rec_size2, dest_parent_ptr + second_offset);
			if (dest_parent_size < first_offset + rec_size1 ||
				dest_parent_size < second_offset + rec_size2 ||
				BSTAR_REC_SIZE >= rec_size1 || BSTAR_REC_SIZE > rec_size2)
			{
				assert(t_tries < CDB_STAGNATE);
				return cdb_sc_blkmod;
			}
			piece_len1 =  first_offset + rec_size1 - SIZEOF(block_id);
			piece_len2 =  second_offset + rec_size2 - SIZEOF(block_id);
			GET_LONG(child1, dest_parent_ptr + piece_len1);
			GET_LONG(child2, dest_parent_ptr + piece_len2);
			BLK_SEG(bs_ptr, dest_parent_ptr + SIZEOF(blk_hdr), piece_len1 - SIZEOF(blk_hdr));
			BLK_ADDR(bn_ptr, SIZEOF(block_id), unsigned char);
			PUT_LONG(bn_ptr, child2);
			BLK_SEG(bs_ptr, bn_ptr, SIZEOF(block_id));
			BLK_SEG(bs_ptr, dest_parent_ptr + first_offset + rec_size1,
				second_offset + rec_size2 - SIZEOF(block_id) - first_offset - rec_size1);
			BLK_ADDR(bn_ptr, SIZEOF(block_id), unsigned char);
			PUT_LONG(bn_ptr, child1);
			BLK_SEG(bs_ptr, bn_ptr, SIZEOF(block_id));
			BLK_SEG(bs_ptr, dest_parent_ptr + second_offset + rec_size2,
				dest_parent_size - second_offset - rec_size2);
			if (!BLK_FINI(bs_ptr,bs1))
			{
				assert(t_tries < CDB_STAGNATE);
				return cdb_sc_blkmod;
			}
			assert(level == dest_blk_level);
			assert(dest_parent_ptr == dest_hist_ptr->h[level+1].buffaddr);
			t_write(&dest_hist_ptr->h[level+1], (unsigned char *)bs1, 0, 0, level+1, FALSE, TRUE, GDS_WRITE_KILLTN);
		} else if (work_blk_id != dest_hist_ptr->h[dest_blk_level+1].blk_num)
		{	/* Destination block moved in the position of working block.
			 * So destination block's parent's pointer should be changed to work_blk_id
			 */
			BLK_INIT(bs_ptr, bs1);
			GET_RSIZ(rec_size1, dest_parent_ptr + dest_hist_ptr->h[dest_blk_level+1].curr_rec.offset);
			if (dest_parent_size < rec_size1 +  dest_hist_ptr->h[dest_blk_level+1].curr_rec.offset ||
				BSTAR_REC_SIZE > rec_size1)
			{
				assert(t_tries < CDB_STAGNATE);
				return cdb_sc_blkmod;
			}
			BLK_SEG (bs_ptr, dest_parent_ptr + SIZEOF(blk_hdr),
			    dest_hist_ptr->h[dest_blk_level+1].curr_rec.offset + rec_size1 - SIZEOF(blk_hdr) - SIZEOF(block_id));
			BLK_ADDR(bn_ptr, SIZEOF(block_id), unsigned char);
			PUT_LONG(bn_ptr, work_blk_id);
			BLK_SEG(bs_ptr, bn_ptr, SIZEOF(block_id));
			BLK_SEG(bs_ptr, dest_parent_ptr + dest_hist_ptr->h[dest_blk_level+1].curr_rec.offset + rec_size1,
				dest_parent_size - dest_hist_ptr->h[dest_blk_level+1].curr_rec.offset  - rec_size1);
			if (!BLK_FINI(bs_ptr,bs1))
			{
				assert(t_tries < CDB_STAGNATE);
				return cdb_sc_blkmod;
			}
			assert(dest_parent_ptr == dest_hist_ptr->h[dest_blk_level+1].buffaddr);
			t_write(&dest_hist_ptr->h[dest_blk_level+1], (unsigned char *)bs1, 0, 0, dest_blk_level+1,
				FALSE, TRUE, GDS_WRITE_KILLTN);
		}
	}
	/* 4: Parent of working block, if different than destination's parent or, destination was a free block */
	if (blk_was_free || gv_target->hist.h[level+1].blk_num != dest_hist_ptr->h[dest_blk_level+1].blk_num)
	{	/* Parent block of working blk should correctly point the working block. Working block went to dest_blk_id  */
		GET_RSIZ(rec_size1, (work_parent_ptr + gv_target->hist.h[level+1].curr_rec.offset));
		if (work_parent_size < rec_size1 +  gv_target->hist.h[level+1].curr_rec.offset || BSTAR_REC_SIZE > rec_size1)
		{
			assert(t_tries < CDB_STAGNATE);
			return cdb_sc_blkmod;
		}
		BLK_INIT(bs_ptr, bs1);
		BLK_SEG(bs_ptr, work_parent_ptr + SIZEOF(blk_hdr),
			gv_target->hist.h[level+1].curr_rec.offset + rec_size1 - SIZEOF(blk_hdr) - SIZEOF(block_id));
		BLK_ADDR(bn_ptr, SIZEOF(block_id), unsigned char);
		PUT_LONG(bn_ptr, dest_blk_id);
		BLK_SEG(bs_ptr, bn_ptr, SIZEOF(block_id));
		BLK_SEG(bs_ptr, work_parent_ptr + gv_target->hist.h[level+1].curr_rec.offset + rec_size1,
			work_parent_size - gv_target->hist.h[level+1].curr_rec.offset - rec_size1);
		if (!BLK_FINI(bs_ptr, bs1))
		{
			assert(t_tries < CDB_STAGNATE);
			return cdb_sc_blkmod;
		}
		assert(gv_target->hist.h[level+1].buffaddr == work_parent_ptr);
		t_write(&gv_target->hist.h[level+1], (unsigned char *)bs1, 0, 0, level+1, FALSE, TRUE, GDS_WRITE_KILLTN);
	}
	/* else already taken care of, when dest_blk_id moved */
	if (blk_was_free)
	{	/* A free/recycled block will become busy block.
		 * So the local bitmap must be updated.
		 * Local bit map block will be added in the list of update arrray for concurrency check and
		 * 	also the cw_set element will be created to mark the free/recycled block as free.
		 * kill_set_ptr will save the block which will become free.
		 */
		child1 = ROUND_DOWN2(dest_blk_id, BLKS_PER_LMAP); /* bit map block */
		bmlhist.buffaddr = bmp_buff;
		bmlhist.blk_num = child1;
		child1 = dest_blk_id - child1;
		assert(child1);
		PUT_LONG(update_array_ptr, child1);
		/* Need to put bit maps on the end of the cw set for concurrency checking.
		 * We want to simulate t_write_map, except we want to update "cw_map_depth" instead of "cw_set_depth".
		 * Hence the save and restore logic (for "cw_set_depth") below.
		 */
		save_cw_set_depth = cw_set_depth;
		assert(!cw_map_depth);
		t_write_map(&bmlhist, (uchar_ptr_t)update_array_ptr, ctn, 1);	/* will increment cw_set_depth */
		cw_map_depth = cw_set_depth;		/* set cw_map_depth to the latest cw_set_depth */
		cw_set_depth = save_cw_set_depth;	/* restore cw_set_depth */
		/* t_write_map simulation end */
		update_array_ptr += SIZEOF(block_id);
		child1 = 0;
		PUT_LONG(update_array_ptr, child1);
		update_array_ptr += SIZEOF(block_id);
		assert(1 == cw_set[cw_map_depth - 1].reference_cnt);	/* 1 free block is now becoming BLK_USED in the bitmap */
		/* working block will be removed */
		kill_set_ptr->blk[kill_set_ptr->used].flag = 0;
		kill_set_ptr->blk[kill_set_ptr->used].level = 0;
		kill_set_ptr->blk[kill_set_ptr->used++].block = work_blk_id;
	}
	*pdest_blk_id = dest_blk_id;
	return cdb_sc_normal;
}