int mu_extr_getblk(unsigned char *ptr) { error_def(ERR_GVGETFAIL); enum cdb_sc status; rec_hdr_ptr_t rp; bool two_histories, end_of_tree; blk_hdr_ptr_t bp; srch_blk_status *bh; srch_hist *rt_history; t_begin(ERR_GVGETFAIL, FALSE); for (;;) { if (cdb_sc_normal != (status = gvcst_search(gv_currkey, NULL))) { t_retry(status); continue; } end_of_tree = two_histories = FALSE; bh = gv_target->hist.h; rp = (rec_hdr_ptr_t)(bh->buffaddr + bh->curr_rec.offset); bp = (blk_hdr_ptr_t)bh->buffaddr; if (rp >= (rec_hdr_ptr_t)CST_TOB(bp)) { rt_history = gv_target->alt_hist; if (cdb_sc_normal == (status = gvcst_rtsib(rt_history, 0))) { two_histories = TRUE; if (cdb_sc_normal != (status = gvcst_search_blk(gv_currkey, rt_history->h))) { t_retry(status); continue; } bp = (blk_hdr_ptr_t)rt_history->h[0].buffaddr; } else if (cdb_sc_endtree == status) end_of_tree = TRUE; else { t_retry(status); continue; } } memcpy(ptr, bp, bp->bsiz); if (t_end(&gv_target->hist, two_histories ? rt_history : NULL) != 0) { if (two_histories) memcpy(gv_target->hist.h, rt_history->h, sizeof(srch_blk_status) * (rt_history->depth + 1)); return !end_of_tree; } } }
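/* Illustrative sketch (not part of the original source): the non-TP optimistic read loop used by
 * mu_extr_getblk above. t_begin starts the transaction once; each iteration performs the read and then
 * validates it with t_end, which returns a non-zero transaction number only if no concurrent update
 * invalidated the blocks read. The demo_* names below are hypothetical stand-ins for gvcst_search and
 * t_end, not real interfaces. */
#include <stdbool.h>

static int demo_attempts;
static bool demo_try_read(void) { return true; }                 /* stands in for gvcst_search + block copy */
static bool demo_validate(void) { return ++demo_attempts > 1; }  /* stands in for t_end: non-zero tn == success */

static bool demo_read_loop(void)
{
	for (;;)
	{
		if (!demo_try_read())
			continue;	/* real code calls t_retry(status) to adjust state before looping */
		if (demo_validate())
			return true;	/* validation passed: the copied block is consistent */
		/* validation failed: some block in the history changed underneath us; redo the read */
	}
}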
void gvcst_root_search(void) { srch_blk_status *h0; uchar_ptr_t c, c1; sm_uc_ptr_t rp; unsigned short rlen, hdr_len; uchar_ptr_t subrec_ptr; enum cdb_sc status; boolean_t gbl_target_was_set; gv_namehead *save_targ; mname_entry *gvent; int altkeylen; assert((dba_bg == gv_cur_region->dyn.addr->acc_meth) || (dba_mm == gv_cur_region->dyn.addr->acc_meth)); assert(gv_altkey->top == gv_currkey->top); assert(gv_altkey->top == gv_keysize); assert(gv_currkey->end < gv_currkey->top); for (c = gv_altkey->base, c1 = gv_currkey->base; *c1;) *c++ = *c1++; *c++ = 0; *c = 0; gv_altkey->end = c - gv_altkey->base; assert(gv_altkey->end < gv_altkey->top); assert(gv_target != cs_addrs->dir_tree); save_targ = gv_target; /* Check if "gv_target->gvname" matches "gv_altkey->base". If not, there is a name mismatch (out-of-design situation). * This check is temporary until we catch the situation that caused D9H02-002641 */ /* --- Check BEGIN --- */ gvent = &save_targ->gvname; altkeylen = gv_altkey->end - 1; if (!altkeylen || (altkeylen != gvent->var_name.len) || memcmp(gv_altkey->base, gvent->var_name.addr, gvent->var_name.len)) GTMASSERT; /* --- Check END --- */ if (INVALID_GV_TARGET != reset_gv_target) gbl_target_was_set = TRUE; else { gbl_target_was_set = FALSE; reset_gv_target = save_targ; } gv_target = cs_addrs->dir_tree; if (is_standalone) /* *&& (0 != gv_target->clue.end) && (FALSE == is_valid_hist(&gv_target->hist))) */ gv_target->clue.end = 0; T_BEGIN_READ_NONTP_OR_TP(ERR_GVGETFAIL); assert(t_tries < CDB_STAGNATE || cs_addrs->now_crit); /* we better hold crit in the final retry (TP & non-TP) */ for (;;) { hdr_len = rlen = 0; gv_target = cs_addrs->dir_tree; if (dollar_trestart) gv_target->clue.end = 0; if (cdb_sc_normal == (status = gvcst_search(gv_altkey, 0))) { if (gv_altkey->end + 1 == gv_target->hist.h[0].curr_rec.match) { h0 = gv_target->hist.h; rp = (h0->buffaddr + h0->curr_rec.offset); hdr_len = sizeof(rec_hdr) + gv_altkey->end + 1 - ((rec_hdr_ptr_t)rp)->cmpc; GET_USHORT(rlen, rp); if (FALSE == (CHKRECLEN(rp, h0->buffaddr, rlen)) || (rlen < hdr_len + sizeof(block_id))) { gv_target->clue.end = 0; RESET_GV_TARGET_LCL(save_targ); t_retry(cdb_sc_rmisalign); continue; } GET_LONG(save_targ->root, (rp + hdr_len)); if (rlen > hdr_len + sizeof(block_id)) { assert(NULL != global_collation_mstr.addr || 0 == global_collation_mstr.len); if (global_collation_mstr.len < rlen - (hdr_len + sizeof(block_id))) { if (NULL != global_collation_mstr.addr) free(global_collation_mstr.addr); global_collation_mstr.len = rlen - (hdr_len + sizeof(block_id)); global_collation_mstr.addr = (char *)malloc(global_collation_mstr.len); } /* The memcpy needs to be done here, inside the for loop rather than after it, for * concurrency reasons.
We don't use s2pool because the pointer rp is 64 bits */ memcpy(global_collation_mstr.addr, rp + hdr_len + sizeof(block_id), rlen - (hdr_len + sizeof(block_id))); } if (0 != dollar_tlevel) { status = tp_hist(NULL); if (cdb_sc_normal != status) { gv_target->clue.end = 0; RESET_GV_TARGET_LCL(save_targ); gv_target->root = 0; t_retry(status); continue; } break; } } if (0 == dollar_tlevel) { if ((trans_num)0 != t_end(&gv_target->hist, 0)) break; } else { status = tp_hist(NULL); if (cdb_sc_normal == status) break; gv_target->clue.end = 0; RESET_GV_TARGET_LCL(save_targ); gv_target->root = 0; t_retry(status); continue; } save_targ->root = 0; } else { gv_target->clue.end = 0; RESET_GV_TARGET_LCL(save_targ); t_retry(status); continue; } } RESET_GV_TARGET_LCL_AND_CLR_GBL(save_targ); if (rlen > hdr_len + sizeof(block_id)) { assert(NULL != global_collation_mstr.addr); subrec_ptr = get_spec((uchar_ptr_t)global_collation_mstr.addr, (int)(rlen - (hdr_len + sizeof(block_id))), COLL_SPEC); if (subrec_ptr) { gv_target->nct = *(subrec_ptr + COLL_NCT_OFFSET); gv_target->act = *(subrec_ptr + COLL_ACT_OFFSET); gv_target->ver = *(subrec_ptr + COLL_VER_OFFSET); } else { gv_target->nct = 0; gv_target->act = 0; gv_target->ver = 0; } } else { gv_target->nct = 0; gv_target->act = cs_addrs->hdr->def_coll; gv_target->ver = cs_addrs->hdr->def_coll_ver; } if (gv_target->act) act_in_gvt(); assert(gv_target->act || NULL == gv_target->collseq); return; }
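/* Illustrative sketch (assumption-laden, simplified): how gvcst_root_search above carves up a
 * directory-tree record. hdr_len = record header size plus the key bytes not covered by compression
 * (cmpc); the 4-byte root block number follows the key, and any remaining bytes are the collation
 * spec. The struct layout below is a hypothetical stand-in, not the real rec_hdr. */
#include <string.h>
#include <stdint.h>

typedef struct { uint16_t rsiz; uint8_t cmpc; } demo_rec_hdr;	/* stand-in for rec_hdr */

static int demo_parse_dir_rec(const unsigned char *rp, int key_end,
	uint32_t *root, const unsigned char **coll, int *coll_len)
{
	demo_rec_hdr hdr;
	int hdr_len;
	memcpy(&hdr, rp, sizeof hdr);
	hdr_len = (int)sizeof(demo_rec_hdr) + key_end + 1 - hdr.cmpc;
	if (hdr.rsiz < hdr_len + (int)sizeof(uint32_t))
		return -1;				/* record too short to hold a root pointer */
	memcpy(root, rp + hdr_len, sizeof *root);	/* mirrors GET_LONG(save_targ->root, ...) */
	*coll = rp + hdr_len + sizeof(uint32_t);	/* optional collation spec bytes, if any */
	*coll_len = hdr.rsiz - hdr_len - (int)sizeof(uint32_t);
	return 0;
}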
void mu_swap_root(glist *gl_ptr, int *root_swap_statistic_ptr) { sgmnt_data_ptr_t csd; sgmnt_addrs *csa; node_local_ptr_t cnl; srch_hist *dir_hist_ptr, *gvt_hist_ptr; gv_namehead *save_targ; block_id root_blk_id, child_blk_id, free_blk_id; sm_uc_ptr_t root_blk_ptr, child_blk_ptr; kill_set kill_set_list; trans_num curr_tn, ret_tn; int level, root_blk_lvl; block_id save_root; boolean_t tn_aborted; unsigned int lcl_t_tries; enum cdb_sc status; DCL_THREADGBL_ACCESS; SETUP_THREADGBL_ACCESS; assert(mu_reorg_process); gv_target = gl_ptr->gvt; gv_target->root = 0; /* reset root so we recompute it in DO_OP_GVNAME below */ gv_target->clue.end = 0; /* reset clue since reorg action on later globals might have invalidated it */ reorg_gv_target->gvname.var_name = gv_target->gvname.var_name; /* needed by SAVE_ROOTSRCH_ENTRY_STATE */ dir_hist_ptr = gv_target->alt_hist; gvt_hist_ptr = &(gv_target->hist); inctn_opcode = inctn_invalid_op; DO_OP_GVNAME(gl_ptr); /* sets gv_target/gv_currkey/gv_cur_region/cs_addrs/cs_data to correspond to <globalname,reg> in gl_ptr */ csa = cs_addrs; cnl = csa->nl; csd = cs_data; /* Be careful to keep csd up to date. With MM, cs_data can change, and * dereferencing an older copy can result in a SIG-11. */ if (gv_cur_region->read_only) return; /* Cannot proceed for read-only data files */ if (0 == gv_target->root) { /* Global does not exist (online rollback). No problem. */ gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(4) ERR_GBLNOEXIST, 2, GNAME(gl_ptr).len, GNAME(gl_ptr).addr); return; } if (dba_mm == csd->acc_meth) /* return for now without doing any swapping operation because later mu_truncate * is going to issue the MUTRUNCNOTBG message. */ return; SET_GV_ALTKEY_TO_GBLNAME_FROM_GV_CURRKEY; /* set up gv_altkey to be just the gblname */ /* ------------ Swap root block of global variable tree --------- */ t_begin(ERR_MUREORGFAIL, UPDTRNS_DB_UPDATED_MASK); for (;;) { curr_tn = csa->ti->curr_tn; kill_set_list.used = 0; save_root = gv_target->root; gv_target->root = csa->dir_tree->root; gv_target->clue.end = 0; if (cdb_sc_normal != (status = gvcst_search(gv_altkey, dir_hist_ptr))) { /* Assign directory tree path to dir_hist_ptr */ assert(t_tries < CDB_STAGNATE); gv_target->root = save_root; t_retry(status); continue; } gv_target->root = save_root; gv_target->clue.end = 0; if (cdb_sc_normal != (status = gvcst_search(gv_currkey, NULL))) { /* Assign global variable tree path to gvt_hist_ptr */ assert(t_tries < CDB_STAGNATE); t_retry(status); continue; } /* We've already searched the directory tree in op_gvname/t_retry and obtained gv_target->root. * Should restart with gvtrootmod2 if they don't agree. gvcst_root_search is the final arbiter. * Really need that for debug info and also should assert(gv_currkey is global name).
*/ root_blk_lvl = gvt_hist_ptr->depth; assert(root_blk_lvl > 0); root_blk_ptr = gvt_hist_ptr->h[root_blk_lvl].buffaddr; root_blk_id = gvt_hist_ptr->h[root_blk_lvl].blk_num; assert((CDB_STAGNATE > t_tries) || (gv_target->root == gvt_hist_ptr->h[root_blk_lvl].blk_num)); free_blk_id = swap_root_or_directory_block(0, root_blk_lvl, dir_hist_ptr, root_blk_id, root_blk_ptr, &kill_set_list, curr_tn); if (RETRY_SWAP == free_blk_id) continue; else if (ABORT_SWAP == free_blk_id) break; update_trans = UPDTRNS_DB_UPDATED_MASK; inctn_opcode = inctn_mu_reorg; assert(1 == kill_set_list.used); need_kip_incr = TRUE; if (!csa->now_crit) WAIT_ON_INHIBIT_KILLS(cnl, MAXWAIT2KILL); DEBUG_ONLY(lcl_t_tries = t_tries); TREF(in_mu_swap_root_state) = MUSWP_INCR_ROOT_CYCLE; assert(!TREF(in_gvcst_redo_root_search)); if ((trans_num)0 == (ret_tn = t_end(gvt_hist_ptr, dir_hist_ptr, TN_NOT_SPECIFIED))) { TREF(in_mu_swap_root_state) = MUSWP_NONE; need_kip_incr = FALSE; assert(NULL == kip_csa); ABORT_TRANS_IF_GBL_EXIST_NOMORE(lcl_t_tries, tn_aborted); if (tn_aborted) { /* It is not an error if the global (that once existed) doesn't exist anymore (due to ROLLBACK) */ gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(4) ERR_GBLNOEXIST, 2, GNAME(gl_ptr).len, GNAME(gl_ptr).addr); return; } continue; } TREF(in_mu_swap_root_state) = MUSWP_NONE; /* Note that this particular process's csa->root_search_cycle is now behind cnl->root_search_cycle. * This forces a cdb_sc_gvtrootmod2 restart in gvcst_bmp_mark_free below. */ assert(cnl->root_search_cycle > csa->root_search_cycle); gvcst_kill_sort(&kill_set_list); GVCST_BMP_MARK_FREE(&kill_set_list, ret_tn, inctn_mu_reorg, inctn_bmp_mark_free_mu_reorg, inctn_opcode, csa); DECR_KIP(csd, csa, kip_csa); *root_swap_statistic_ptr += 1; break; } /* ------------ Swap blocks in branch of directory tree --------- */ for (level = 0; level <= MAX_BT_DEPTH; level++) { t_begin(ERR_MUREORGFAIL, UPDTRNS_DB_UPDATED_MASK); for (;;) { curr_tn = csa->ti->curr_tn; kill_set_list.used = 0; save_root = gv_target->root; gv_target->root = csa->dir_tree->root; gv_target->clue.end = 0; if (cdb_sc_normal != (status = gvcst_search(gv_altkey, dir_hist_ptr))) { /* assign branch path of directory tree into dir_hist_ptr */ assert(t_tries < CDB_STAGNATE); gv_target->root = save_root; t_retry(status); continue; } gv_target->root = save_root; gv_target->clue.end = 0; if (level >= dir_hist_ptr->depth) { /* done */ t_abort(gv_cur_region, csa); return; } child_blk_ptr = dir_hist_ptr->h[level].buffaddr; child_blk_id = dir_hist_ptr->h[level].blk_num; assert(csa->dir_tree->root != child_blk_id); free_blk_id = swap_root_or_directory_block(level + 1, level, dir_hist_ptr, child_blk_id, child_blk_ptr, &kill_set_list, curr_tn); if (level == 0) /* set level to 1 to mark that this kill set is for a level-0 block in the directory tree.
* The kill-set level will later be used in gvcst_bmp_mark_free to assign a special value to the * cw_set_element, which will eventually be used by t_end to write the block to the snapshot file */ kill_set_list.blk[kill_set_list.used - 1].level = 1; if (RETRY_SWAP == free_blk_id) continue; else if (ABORT_SWAP == free_blk_id) break; update_trans = UPDTRNS_DB_UPDATED_MASK; inctn_opcode = inctn_mu_reorg; assert(1 == kill_set_list.used); need_kip_incr = TRUE; if (!csa->now_crit) WAIT_ON_INHIBIT_KILLS(cnl, MAXWAIT2KILL); DEBUG_ONLY(lcl_t_tries = t_tries); TREF(in_mu_swap_root_state) = MUSWP_DIRECTORY_SWAP; if ((trans_num)0 == (ret_tn = t_end(dir_hist_ptr, NULL, TN_NOT_SPECIFIED))) { TREF(in_mu_swap_root_state) = MUSWP_NONE; need_kip_incr = FALSE; assert(NULL == kip_csa); continue; } TREF(in_mu_swap_root_state) = MUSWP_NONE; gvcst_kill_sort(&kill_set_list); TREF(in_mu_swap_root_state) = MUSWP_FREE_BLK; GVCST_BMP_MARK_FREE(&kill_set_list, ret_tn, inctn_mu_reorg, inctn_bmp_mark_free_mu_reorg, inctn_opcode, csa); TREF(in_mu_swap_root_state) = MUSWP_NONE; DECR_KIP(csd, csa, kip_csa); break; } } return; }
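/* Illustrative sketch (hypothetical stand-ins): the two-phase shape of mu_swap_root above. Phase 1
 * swaps the root block of the global variable tree toward the front of the file; phase 2 walks the
 * directory-tree path one level per transaction and swaps those blocks too, stopping once the level
 * passes the bottom of the path. demo_swap_one stands in for swap_root_or_directory_block. */
enum demo_swap { DEMO_SWAPPED, DEMO_RETRY, DEMO_ABORT };

static int demo_calls;
static enum demo_swap demo_swap_one(int first_arg)
{
	(void)first_arg;
	return (++demo_calls % 2) ? DEMO_RETRY : DEMO_SWAPPED;	/* alternate retry/success for the demo */
}

static void demo_swap_root_flow(int path_depth, int max_depth)
{
	int level;
	while (DEMO_RETRY == demo_swap_one(0))		/* phase 1: the GVT root itself */
		;
	for (level = 0; level <= max_depth; level++)	/* phase 2: directory-tree branch blocks */
	{
		if (level >= path_depth)
			return;				/* past the bottom of the directory path: done */
		while (DEMO_RETRY == demo_swap_one(level + 1))
			;
	}
}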
boolean_t gvcst_query2(void) { boolean_t found, two_histories; enum cdb_sc status; blk_hdr_ptr_t bp; rec_hdr_ptr_t rp; unsigned char *c1, *c2; srch_blk_status *bh; srch_hist *rt_history; T_BEGIN_READ_NONTP_OR_TP(ERR_GVQUERYFAIL); assert(t_tries < CDB_STAGNATE || cs_addrs->now_crit); /* we better hold crit in the final retry (TP & non-TP) */ for (;;) { two_histories = FALSE; # if defined(DEBUG) && defined(UNIX) if (gtm_white_box_test_case_enabled && (WBTEST_ANTIFREEZE_GVQUERYFAIL == gtm_white_box_test_case_number)) { t_retry(cdb_sc_blknumerr); continue; } # endif if (cdb_sc_normal == (status = gvcst_search(gv_currkey, 0))) { found = TRUE; bh = &gv_target->hist.h[0]; rp = (rec_hdr_ptr_t)(bh->buffaddr + bh->curr_rec.offset); bp = (blk_hdr_ptr_t)bh->buffaddr; if (rp >= (rec_hdr_ptr_t)CST_TOB(bp)) { two_histories = TRUE; rt_history = gv_target->alt_hist; status = gvcst_rtsib(rt_history, 0); if (cdb_sc_endtree == status) /* end of tree */ { found = FALSE; two_histories = FALSE; /* second history not valid */ } else if (cdb_sc_normal != status) { t_retry(status); continue; } else { bh = &rt_history->h[0]; if (cdb_sc_normal != (status = gvcst_search_blk(gv_currkey, bh))) { t_retry(status); continue; } rp = (rec_hdr_ptr_t)(bh->buffaddr + bh->curr_rec.offset); bp = (blk_hdr_ptr_t)bh->buffaddr; } } if (found) { /* !found indicates that the end of tree has been reached (see call to * gvcst_rtsib). If there is no more tree, don't bother doing expansion. */ status = gvcst_expand_curr_key(bh, gv_currkey, gv_altkey); if (cdb_sc_normal != status) { t_retry(status); continue; } } if (!dollar_tlevel) { if ((trans_num)0 == t_end(&gv_target->hist, !two_histories ? NULL : rt_history, TN_NOT_SPECIFIED)) continue; } else { status = tp_hist(!two_histories ? NULL : rt_history); if (cdb_sc_normal != status) { t_retry(status); continue; } } assert(cs_data == cs_addrs->hdr); INCR_GVSTATS_COUNTER(cs_addrs, cs_addrs->nl, n_query, 1); if (found) { c1 = &gv_altkey->base[0]; c2 = &gv_currkey->base[0]; for ( ; *c2; ) { if (*c2++ != *c1++) break; } if (!*c2 && !*c1) return TRUE; } return FALSE; } t_retry(status); } }
mint gvcst_data(void) { blk_hdr_ptr_t bp; enum cdb_sc status; mint val; rec_hdr_ptr_t rp; unsigned short rec_size; srch_blk_status *bh; srch_hist *rt_history; sm_uc_ptr_t b_top; assert((gv_target->root < cs_addrs->ti->total_blks) || (0 < dollar_tlevel)); T_BEGIN_READ_NONTP_OR_TP(ERR_GVDATAFAIL); assert(t_tries < CDB_STAGNATE || cs_addrs->now_crit); /* we better hold crit in the final retry (TP & non-TP) */ for (;;) { rt_history = gv_target->alt_hist; rt_history->h[0].blk_num = 0; if ((status = gvcst_search(gv_currkey, NULL)) != cdb_sc_normal) { t_retry(status); continue; } bh = gv_target->hist.h; bp = (blk_hdr_ptr_t)bh->buffaddr; rp = (rec_hdr_ptr_t)(bh->buffaddr + bh->curr_rec.offset); b_top = bh->buffaddr + bp->bsiz; val = 0; if (gv_currkey->end + 1 == bh->curr_rec.match) val = 1; else if (bh->curr_rec.match >= gv_currkey->end) val = 10; if (1 == val || rp == (rec_hdr_ptr_t)b_top) { GET_USHORT(rec_size, &rp->rsiz); if (rp == (rec_hdr_ptr_t)b_top || (sm_uc_ptr_t)rp + rec_size == b_top) { if (cdb_sc_endtree != (status = gvcst_rtsib(rt_history, 0))) { if ((cdb_sc_normal != status) || (cdb_sc_normal != (status = gvcst_search_blk(gv_currkey, rt_history->h)))) { t_retry(status); continue; } if (rt_history->h[0].curr_rec.match >= gv_currkey->end) val += 10; } } else { if ((sm_uc_ptr_t)rp + rec_size > b_top) { t_retry(cdb_sc_rmisalign); continue; } rp = (rec_hdr_ptr_t)((sm_uc_ptr_t)rp + rec_size); if (rp->cmpc >= gv_currkey->end) val += 10; } } if (0 == dollar_tlevel) { if ((trans_num)0 == t_end(&gv_target->hist, 0 == rt_history->h[0].blk_num ? NULL : rt_history)) continue; } else { status = tp_hist(0 == rt_history->h[0].blk_num ? NULL : rt_history); if (cdb_sc_normal != status) { t_retry(status); continue; } } INCR_GVSTATS_COUNTER(cs_addrs, cs_addrs->nl, n_data, 1); return val; } }
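/* Illustrative sketch: the $DATA value assembled by gvcst_data above is pure arithmetic. val begins
 * as 1 when the key itself has a value (full key match) and 0 otherwise, and 10 is added when a
 * descendant subscript exists, yielding the familiar 0, 1, 10, and 11. */
static int demo_dollar_data(int has_value, int has_descendants)
{
	return (has_value ? 1 : 0) + (has_descendants ? 10 : 0);
}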
/* This function is the equivalent of invoking gvcst_data & gvcst_get at the same time. * One crucial difference is that this function does NOT handle restarts by automatically invoking t_retry. * Instead, it returns the restart code to the caller so that it can handle the restart accordingly. * This is important in the case of triggers because we do NOT want to call t_retry in case of an implicit tstart- * wrapped gvcst_put or gvcst_kill trigger-invoking update transaction. Additionally, this function assumes * that it is always called inside TP (i.e. dollar_tlevel is non-zero). */ enum cdb_sc gvcst_dataget(mint *dollar_data, mval *val) { blk_hdr_ptr_t bp; boolean_t do_rtsib; enum cdb_sc status; mint dlr_data; rec_hdr_ptr_t rp; unsigned short match, rsiz; srch_blk_status *bh; srch_hist *rt_history; sm_uc_ptr_t b_top; int key_size, data_len; uint4 save_t_err; error_def(ERR_GVDATAGETFAIL); error_def(ERR_GVKILLFAIL); /* The following code is lifted from gvcst_data. Any changes here might need to be reflected there as well */ assert(dollar_tlevel); assert((CDB_STAGNATE > t_tries) || cs_addrs->now_crit); /* we better hold crit in the final retry (TP & non-TP) */ save_t_err = t_err; assert(ERR_GVKILLFAIL == save_t_err); /* this function should currently be called only from gvcst_kill */ t_err = ERR_GVDATAGETFAIL; /* switch t_err to reflect dataget sub-operation (under the KILL operation) */ /* In case of a failure return, it is ok to return with t_err set to ERR_GVDATAGETFAIL as that gives a better * picture of exactly where in the transaction the failure occurred. */ rt_history = gv_target->alt_hist; rt_history->h[0].blk_num = 0; if (cdb_sc_normal != (status = gvcst_search(gv_currkey, NULL))) return status; bh = gv_target->hist.h; bp = (blk_hdr_ptr_t)bh->buffaddr; rp = (rec_hdr_ptr_t)(bh->buffaddr + bh->curr_rec.offset); b_top = bh->buffaddr + bp->bsiz; match = bh->curr_rec.match; key_size = gv_currkey->end + 1; do_rtsib = FALSE; /* Even if key does not exist, return null string in "val". Caller can use dollar_data to distinguish * whether the key is undefined or defined and set to the null string. */ val->mvtype = MV_STR; val->str.len = 0; if (key_size == match) { dlr_data = 1; /* The following code is lifted from gvcst_get. Any changes here might need to be reflected there as well */ GET_USHORT(rsiz, &rp->rsiz); data_len = rsiz + rp->cmpc - SIZEOF(rec_hdr) - key_size; if ((0 > data_len) || ((sm_uc_ptr_t)rp + rsiz > b_top)) { assert(CDB_STAGNATE > t_tries); status = cdb_sc_rmisalign1; return status; } else { ENSURE_STP_FREE_SPACE(data_len); memcpy(stringpool.free, (sm_uc_ptr_t)rp + rsiz - data_len, data_len); val->str.addr = (char *)stringpool.free; val->str.len = data_len; stringpool.free += data_len; } /* --------------------- end code lifted from gvcst_get ---------------------------- */ rp = (rec_hdr_ptr_t)((sm_uc_ptr_t)rp + rsiz); if ((sm_uc_ptr_t)rp > b_top) { status = cdb_sc_rmisalign; return status; } else if ((sm_uc_ptr_t)rp == b_top) do_rtsib = TRUE; else if (rp->cmpc >= gv_currkey->end) dlr_data += 10; } else if (match >= gv_currkey->end) dlr_data = 10; else { dlr_data = 0; if (rp == (rec_hdr_ptr_t)b_top) do_rtsib = TRUE; } if (do_rtsib && (cdb_sc_endtree != (status = gvcst_rtsib(rt_history, 0)))) { if ((cdb_sc_normal != status) || (cdb_sc_normal != (status = gvcst_search_blk(gv_currkey, rt_history->h)))) return status; if (rt_history->h[0].curr_rec.match >= gv_currkey->end) { assert(1 >= dlr_data); dlr_data += 10; } } status = tp_hist(0 == rt_history->h[0].blk_num ?
NULL : rt_history); if (cdb_sc_normal != status) return status; *dollar_data = dlr_data; t_err = save_t_err; /* restore t_err to what it was at function entry */ return status; }
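/* Illustrative sketch (hypothetical caller): unlike gvcst_data, gvcst_dataget above hands the restart
 * code back instead of calling t_retry itself, so a trigger-invoking caller such as gvcst_kill can
 * decide whether to retry or to unwind an implicit TP wrapper first. demo_dataget and
 * demo_handle_trigger_restart are stand-ins, not real interfaces. */
enum demo_cdb { DEMO_CDB_NORMAL, DEMO_CDB_RESTART };

static enum demo_cdb demo_dataget(int *dollar_data) { *dollar_data = 0; return DEMO_CDB_NORMAL; }
static void demo_handle_trigger_restart(enum demo_cdb status) { (void)status; }

static int demo_kill_prologue(void)
{
	int dd;
	enum demo_cdb status = demo_dataget(&dd);
	if (DEMO_CDB_NORMAL != status)
	{
		demo_handle_trigger_restart(status);	/* caller-controlled restart, no automatic t_retry */
		return -1;
	}
	return dd;	/* the $DATA value, with the node's value (if any) already copied out */
}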
boolean_t gvcst_order2(void) { blk_hdr_ptr_t bp; boolean_t found, two_histories; enum cdb_sc status; rec_hdr_ptr_t rp; unsigned short rec_size; srch_blk_status *bh; srch_hist *rt_history; sm_uc_ptr_t c1, c2, ctop, alt_top; int tmp_cmpc; T_BEGIN_READ_NONTP_OR_TP(ERR_GVORDERFAIL); for (;;) { assert(t_tries < CDB_STAGNATE || cs_addrs->now_crit); /* we better hold crit in the final retry (TP & non-TP) */ two_histories = FALSE; #if defined(DEBUG) && defined(UNIX) if (gtm_white_box_test_case_enabled && (WBTEST_ANTIFREEZE_GVORDERFAIL == gtm_white_box_test_case_number)) { status = cdb_sc_blknumerr; t_retry(status); continue; } #endif if (cdb_sc_normal == (status = gvcst_search(gv_currkey, NULL))) { found = TRUE; bh = gv_target->hist.h; rp = (rec_hdr_ptr_t)(bh->buffaddr + bh->curr_rec.offset); bp = (blk_hdr_ptr_t)bh->buffaddr; if ((rec_hdr_ptr_t)CST_TOB(bp) <= rp) { two_histories = TRUE; rt_history = gv_target->alt_hist; status = gvcst_rtsib(rt_history, 0); if (cdb_sc_normal == status) { bh = rt_history->h; if (cdb_sc_normal != (status = gvcst_search_blk(gv_currkey, bh))) { t_retry(status); continue; } rp = (rec_hdr_ptr_t)(bh->buffaddr + bh->curr_rec.offset); bp = (blk_hdr_ptr_t)bh->buffaddr; } else { if (cdb_sc_endtree == status) { found = FALSE; two_histories = FALSE; /* second history not valid */ } else { t_retry(status); continue; } } } if (found) { assert(gv_altkey->top == gv_currkey->top); assert(gv_altkey->top == gv_keysize); assert(gv_altkey->end < gv_altkey->top); /* store new subscript */ c1 = gv_altkey->base; alt_top = gv_altkey->base + gv_altkey->top - 1; /* Make alt_top one less than gv_altkey->top to allow double-null at end of a key-name */ /* 4/17/96 * HP compiler bug work-around. The original statement was * c2 = (unsigned char *)CST_BOK(rp) + bh->curr_rec.match - rp->cmpc; * * ...but this was sometimes compiled incorrectly (the lower 4 bits * of rp->cmpc, sign extended, were subtracted from bh->curr_rec.match). * I separated out the subtraction of rp->cmpc. * * -VTF. */ c2 = (sm_uc_ptr_t)CST_BOK(rp) + bh->curr_rec.match; memcpy(c1, gv_currkey->base, bh->curr_rec.match); c1 += bh->curr_rec.match; c2 -= EVAL_CMPC(rp); GET_USHORT(rec_size, &rp->rsiz); ctop = (sm_uc_ptr_t)rp + rec_size; for (;;) { if (c2 >= ctop || c1 >= alt_top) { assert(CDB_STAGNATE > t_tries); status = cdb_sc_rmisalign; goto restart; /* goto needed because of nested FOR loop */ } if (0 == (*c1++ = *c2++)) { *c1 = 0; break; } } gv_altkey->end = c1 - gv_altkey->base; assert(gv_altkey->end < gv_altkey->top); } if (!dollar_tlevel) { if ((trans_num)0 == t_end(&gv_target->hist, two_histories ? rt_history : NULL, TN_NOT_SPECIFIED)) continue; } else { status = tp_hist(two_histories ? rt_history : NULL); if (cdb_sc_normal != status) { t_retry(status); continue; } } assert(cs_data == cs_addrs->hdr); INCR_GVSTATS_COUNTER(cs_addrs, cs_addrs->nl, n_order, 1); return (found && (bh->curr_rec.match >= gv_currkey->prev)); } restart: t_retry(status); } }
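/* Illustrative sketch (hypothetical buffers): rebuilding the successor key the way gvcst_order2 does
 * above. The first 'match' bytes come from the search key (the shared prefix); the remainder comes
 * from the record's stored key text, which begins cmpc bytes into the logical key. Keys are
 * null-terminated byte strings and the full key gets a double-null terminator. */
#include <string.h>

static int demo_expand_key(const unsigned char *search_key, int match,
	const unsigned char *rec_text, int rec_len, unsigned char *out, int out_max)
{
	int n;
	if (match >= out_max)
		return -1;
	memcpy(out, search_key, match);			/* shared prefix, taken from the search key */
	n = match;
	while ((rec_len-- > 0) && (n < out_max - 1))
	{
		if (0 == (out[n++] = *rec_text++))
		{
			out[n] = 0;			/* double-null key terminator */
			return n;			/* length up to and including the first null */
		}
	}
	return -1;					/* ran off the record or the output buffer */
}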
boolean_t gvcst_gblmod(mval *v) { boolean_t gblmod, is_dummy; enum cdb_sc status; int key_size, key_size2, data_len; srch_hist *alt_history; blk_hdr_ptr_t bp; rec_hdr_ptr_t rp; unsigned short match, match2, rsiz, offset_to_value, oldend; srch_blk_status *bh; sm_uc_ptr_t b_top; trans_num tn_to_compare; T_BEGIN_READ_NONTP_OR_TP(ERR_GBLMODFAIL); assert(t_tries < CDB_STAGNATE || cs_addrs->now_crit); /* we better hold crit in the final retry (TP & non-TP) */ for (;;) { gblmod = TRUE; if (cdb_sc_normal == (status = gvcst_search(gv_currkey, NULL))) { alt_history = gv_target->alt_hist; alt_history->h[0].blk_num = 0; VMS_ONLY( if (cs_addrs->hdr->resync_tn >= ((blk_hdr_ptr_t)gv_target->hist.h[0].buffaddr)->tn) gblmod = FALSE; ) # ifdef UNIX tn_to_compare = ((blk_hdr_ptr_t)gv_target->hist.h[0].buffaddr)->tn; bh = gv_target->hist.h; bp = (blk_hdr_ptr_t) bh->buffaddr; rp = (rec_hdr_ptr_t) (bh->buffaddr + bh->curr_rec.offset); b_top = bh->buffaddr + bp->bsiz; GET_USHORT(rsiz, &rp->rsiz); key_size = gv_currkey->end + 1; data_len = rsiz + EVAL_CMPC(rp) - SIZEOF(rec_hdr) - key_size; match = bh->curr_rec.match; if (key_size == match) { if ((0 > data_len) || ((sm_uc_ptr_t)rp + rsiz > b_top)) { status = cdb_sc_rmisalign1; t_retry(status); continue; } offset_to_value = SIZEOF(rec_hdr) + key_size - EVAL_CMPC(rp); /* If it could be a spanning node, i.e., has a special value, then try to get the tn from the * block that contains the first special subscript. Since dummy nodes always have the * same value, the tn is not updated. It is enough to check only the first piece, * since all pieces of a spanning node are killed before an update is applied. */ if (IS_SN_DUMMY(data_len, (sm_uc_ptr_t)rp + offset_to_value)) { oldend = gv_currkey->end; APPEND_HIDDEN_SUB(gv_currkey); if (cdb_sc_normal == (status = gvcst_search(gv_currkey, alt_history))) { key_size2 = gv_currkey->end + 1; match = alt_history->h[0].curr_rec.match; if (key_size2 == match) tn_to_compare = ((blk_hdr_ptr_t)alt_history->h[0].buffaddr)->tn; } else { gv_currkey->end = oldend; gv_currkey->base[gv_currkey->end - 1] = KEY_DELIMITER; gv_currkey->base[gv_currkey->end] = KEY_DELIMITER; t_retry(status); continue; } gv_currkey->end = oldend; gv_currkey->base[gv_currkey->end - 1] = KEY_DELIMITER; gv_currkey->base[gv_currkey->end] = KEY_DELIMITER; } } if (cs_addrs->hdr->zqgblmod_tn > tn_to_compare) gblmod = FALSE; # endif if (!dollar_tlevel) { if ((trans_num)0 == t_end(&gv_target->hist, 0 == alt_history->h[0].blk_num ? NULL : alt_history, TN_NOT_SPECIFIED)) continue; } else { status = tp_hist(0 == alt_history->h[0].blk_num ? NULL : alt_history); if (cdb_sc_normal != status) { t_retry(status); continue; } } return gblmod; } t_retry(status); } }
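/* Illustrative sketch: at its core the $ZQGBLMOD decision above is a transaction-number comparison.
 * The node counts as possibly modified unless the last update to its block (or, for a spanning node,
 * to the block holding its first special subscript) predates zqgblmod_tn. The unsigned long long
 * parameters are stand-ins for trans_num. */
static int demo_gblmod(unsigned long long blk_tn, unsigned long long zqgblmod_tn)
{
	return !(zqgblmod_tn > blk_tn);	/* mirrors: gblmod = FALSE when zqgblmod_tn > tn_to_compare */
}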
bool gvcst_get(mval *v) { srch_blk_status *s; enum cdb_sc status; int key_size, data_len; unsigned short rsiz; rec_hdr_ptr_t rp; T_BEGIN_READ_NONTP_OR_TP(ERR_GVGETFAIL); assert(t_tries < CDB_STAGNATE || cs_addrs->now_crit); /* we better hold crit in the final retry (TP & non-TP) */ for (;;) { if (cdb_sc_normal == (status = gvcst_search(gv_currkey, NULL))) { if ((key_size = gv_currkey->end + 1) == gv_target->hist.h[0].curr_rec.match) { rp = (rec_hdr_ptr_t)(gv_target->hist.h[0].buffaddr + gv_target->hist.h[0].curr_rec.offset); GET_USHORT(rsiz, &rp->rsiz); data_len = rsiz + rp->cmpc - sizeof(rec_hdr) - key_size; if (data_len < 0 || (sm_uc_ptr_t)rp + rsiz > gv_target->hist.h[0].buffaddr + ((blk_hdr_ptr_t)gv_target->hist.h[0].buffaddr)->bsiz) { assert(CDB_STAGNATE > t_tries); status = cdb_sc_rmisalign1; } else { if (stringpool.top - stringpool.free < data_len) stp_gcol(data_len); assert(stringpool.top - stringpool.free >= data_len); memcpy(stringpool.free, (sm_uc_ptr_t)rp + rsiz - data_len, data_len); if (0 == dollar_tlevel) { if (0 == t_end(&gv_target->hist, NULL)) continue; } else { status = tp_hist(NULL); if (cdb_sc_normal != status) { t_retry(status); continue; } } v->mvtype = MV_STR; v->str.addr = (char *)stringpool.free; v->str.len = data_len; stringpool.free += data_len; if (cs_addrs->read_write) cs_addrs->hdr->n_gets++; return TRUE; } } else { if (0 == dollar_tlevel) { if (0 == t_end(&gv_target->hist, NULL)) continue; } else { status = tp_hist(NULL); if (cdb_sc_normal != status) { t_retry(status); continue; } } cs_addrs->hdr->n_gets++; return FALSE; } } t_retry(status); } }
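/* Illustrative sketch (hypothetical pool struct): the stringpool discipline used by gvcst_get above.
 * Guarantee capacity first (stp_gcol / ENSURE_STP_FREE_SPACE), then copy, then advance the free
 * pointer so the mval can point directly into the pool without a second copy. */
#include <string.h>

typedef struct { char *base, *free, *top; } demo_pool;

static int demo_pool_copy(demo_pool *p, const char *src, size_t len, char **out)
{
	if ((size_t)(p->top - p->free) < len)
		return -1;		/* real code garbage-collects or expands the pool here */
	memcpy(p->free, src, len);
	*out = p->free;			/* caller keeps a pointer into the pool */
	p->free += len;
	return 0;
}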
mint gvcst_data(void) { blk_hdr_ptr_t bp; boolean_t do_rtsib; enum cdb_sc status; mint val; rec_hdr_ptr_t rp; unsigned short match, rsiz; srch_blk_status *bh; srch_hist *rt_history; sm_uc_ptr_t b_top; assert((gv_target->root < cs_addrs->ti->total_blks) || dollar_tlevel); T_BEGIN_READ_NONTP_OR_TP(ERR_GVDATAFAIL); assert(t_tries < CDB_STAGNATE || cs_addrs->now_crit); /* we better hold crit in the final retry (TP & non-TP) */ for (;;) { /* The following code is duplicated in gvcst_dataget. Any changes here might need to be reflected there as well */ rt_history = gv_target->alt_hist; rt_history->h[0].blk_num = 0; if (cdb_sc_normal != (status = gvcst_search(gv_currkey, NULL))) { t_retry(status); continue; } bh = gv_target->hist.h; bp = (blk_hdr_ptr_t)bh->buffaddr; rp = (rec_hdr_ptr_t)(bh->buffaddr + bh->curr_rec.offset); b_top = bh->buffaddr + bp->bsiz; match = bh->curr_rec.match; do_rtsib = FALSE; if (gv_currkey->end + 1 == match) { val = 1; GET_USHORT(rsiz, &rp->rsiz); rp = (rec_hdr_ptr_t)((sm_uc_ptr_t)rp + rsiz); if ((sm_uc_ptr_t)rp > b_top) { t_retry(cdb_sc_rmisalign); continue; } else if ((sm_uc_ptr_t)rp == b_top) do_rtsib = TRUE; else if (rp->cmpc >= gv_currkey->end) val += 10; } else if (match >= gv_currkey->end) val = 10; else { val = 0; if (rp == (rec_hdr_ptr_t)b_top) do_rtsib = TRUE; } if (do_rtsib && (cdb_sc_endtree != (status = gvcst_rtsib(rt_history, 0)))) { if ((cdb_sc_normal != status) || (cdb_sc_normal != (status = gvcst_search_blk(gv_currkey, rt_history->h)))) { t_retry(status); continue; } if (rt_history->h[0].curr_rec.match >= gv_currkey->end) { assert(1 >= val); val += 10; } } if (!dollar_tlevel) { if ((trans_num)0 == t_end(&gv_target->hist, 0 == rt_history->h[0].blk_num ? NULL : rt_history, TN_NOT_SPECIFIED)) continue; } else { status = tp_hist(0 == rt_history->h[0].blk_num ? NULL : rt_history); if (cdb_sc_normal != status) { t_retry(status); continue; } } INCR_GVSTATS_COUNTER(cs_addrs, cs_addrs->nl, n_data, 1); return val; } }
boolean_t gvcst_queryget2(mval *val, unsigned char *sn_ptr) { blk_hdr_ptr_t bp; boolean_t found, two_histories; enum cdb_sc status; int rsiz, key_size, data_len; rec_hdr_ptr_t rp; srch_blk_status *bh; srch_hist *rt_history; unsigned short temp_ushort; int tmp_cmpc; DEBUG_ONLY(unsigned char *save_strp = NULL); T_BEGIN_READ_NONTP_OR_TP(ERR_GVQUERYGETFAIL); assert((CDB_STAGNATE > t_tries) || cs_addrs->now_crit); /* we better hold crit in the final retry (TP & non-TP) */ for (;;) { two_histories = FALSE; #if defined(DEBUG) && defined(UNIX) if (gtm_white_box_test_case_enabled && (WBTEST_ANTIFREEZE_GVQUERYGETFAIL == gtm_white_box_test_case_number)) { status = cdb_sc_blknumerr; t_retry(status); continue; } #endif if (cdb_sc_normal == (status = gvcst_search(gv_currkey, 0))) { found = TRUE; bh = &gv_target->hist.h[0]; rp = (rec_hdr_ptr_t)(bh->buffaddr + bh->curr_rec.offset); bp = (blk_hdr_ptr_t)bh->buffaddr; if (rp >= (rec_hdr_ptr_t)CST_TOB(bp)) { two_histories = TRUE; rt_history = gv_target->alt_hist; status = gvcst_rtsib(rt_history, 0); if (cdb_sc_endtree == status) /* end of tree */ { found = FALSE; two_histories = FALSE; /* second history not valid */ } else if (cdb_sc_normal != status) { t_retry(status); continue; } else { bh = &rt_history->h[0]; if (cdb_sc_normal != (status = gvcst_search_blk(gv_currkey, bh))) { t_retry(status); continue; } rp = (rec_hdr_ptr_t)(bh->buffaddr + bh->curr_rec.offset); bp = (blk_hdr_ptr_t)bh->buffaddr; } } /* !found indicates that the end of tree has been reached (see call to * gvcst_rtsib). If there is no more tree, don't bother doing expansion. */ if (found) { status = gvcst_expand_key((blk_hdr_ptr_t)bh->buffaddr, (int4)((sm_uc_ptr_t)rp - bh->buffaddr), gv_altkey); if (cdb_sc_normal != status) { t_retry(status); continue; } key_size = gv_altkey->end + 1; GET_RSIZ(rsiz, rp); data_len = rsiz + EVAL_CMPC(rp) - SIZEOF(rec_hdr) - key_size; if (data_len < 0 || (sm_uc_ptr_t)rp + rsiz > (sm_uc_ptr_t)bp + ((blk_hdr_ptr_t)bp)->bsiz) { assert(CDB_STAGNATE > t_tries); t_retry(cdb_sc_rmisalign1); continue; } ENSURE_STP_FREE_SPACE(data_len); DEBUG_ONLY ( if (!save_strp) save_strp = stringpool.free); assert(stringpool.top - stringpool.free >= data_len); memcpy(stringpool.free, (sm_uc_ptr_t)rp + rsiz - data_len, data_len); /* Assumption: t_end/tp_hist will never cause stp_gcol() call BYPASSOK */ } if (!dollar_tlevel) { if ((trans_num)0 == t_end(&gv_target->hist, !two_histories ? NULL : rt_history, TN_NOT_SPECIFIED)) continue; } else { status = tp_hist(!two_histories ? NULL : rt_history); if (cdb_sc_normal != status) { t_retry(status); continue; } } if (found) { DEBUG_ONLY(assert(save_strp == stringpool.free)); /* Process val first. Already copied to string pool. */ val->mvtype = MV_STR; val->str.addr = (char *)stringpool.free; val->str.len = data_len; stringpool.free += data_len; INCR_GVSTATS_COUNTER(cs_addrs, cs_addrs->nl, n_get, 1); } return found; } t_retry(status); } }
bool gvcst_zprevious(void) { static gv_key *zprev_temp_key; static int4 zprev_temp_keysize = 0; blk_hdr_ptr_t bp; bool found, two_histories; enum cdb_sc status; rec_hdr_ptr_t rp; unsigned char *c1, *c2, *ctop; srch_blk_status *bh; srch_hist *lft_history; T_BEGIN_READ_NONTP_OR_TP(ERR_GVORDERFAIL); for (;;) { assert(t_tries < CDB_STAGNATE || cs_addrs->now_crit); /* we better hold crit in the final retry (TP & non-TP) */ two_histories = FALSE; if (cdb_sc_normal == (status = gvcst_search(gv_currkey, NULL))) { found = TRUE; bh = gv_target->hist.h; if (0 == bh->prev_rec.offset) { two_histories = TRUE; lft_history = gv_target->alt_hist; status = gvcst_lftsib(lft_history); if (cdb_sc_normal == status) { bh = lft_history->h; if (cdb_sc_normal != (status = gvcst_search_blk(gv_currkey, bh))) { t_retry(status); continue; } } else if (cdb_sc_endtree == status) { found = FALSE; two_histories = FALSE; /* second history not valid */ } else { t_retry(status); continue; } } if (found) { /* store new subscript */ assert(gv_altkey->top == gv_currkey->top); assert(gv_altkey->top == gv_keysize); assert(gv_currkey->end < gv_currkey->top); rp = (rec_hdr_ptr_t)(bh->buffaddr + bh->prev_rec.offset); bp = (blk_hdr_ptr_t)bh->buffaddr; c1 = gv_altkey->base; memcpy(c1, gv_currkey->base, bh->prev_rec.match); c1 += bh->prev_rec.match; assert(zprev_temp_keysize <= gv_keysize); if (zprev_temp_keysize < gv_keysize) { zprev_temp_keysize = gv_keysize; GVKEY_INIT(zprev_temp_key, zprev_temp_keysize); } assert(zprev_temp_key->top >= gv_currkey->top); if (cdb_sc_normal != (status = gvcst_expand_key((blk_hdr_ptr_t)bh->buffaddr, bh->prev_rec.offset, zprev_temp_key))) { t_retry(status); continue; } if ((zprev_temp_key->end < gv_currkey->end) && (zprev_temp_key->end <= gv_currkey->prev)) found = FALSE; else { c2 = zprev_temp_key->base + bh->prev_rec.match; ctop = zprev_temp_key->base + zprev_temp_key->end; for (;;) { if (c2 >= ctop) { assert(CDB_STAGNATE > t_tries); status = cdb_sc_rmisalign; goto restart; /* goto needed because of nested FOR loop */ } if (0 == (*c1++ = *c2++)) { *c1 = 0; break; } } } gv_altkey->end = c1 - gv_altkey->base; assert(gv_altkey->end < gv_altkey->top); } if (!dollar_tlevel) { if ((trans_num)0 == t_end(&gv_target->hist, two_histories ? lft_history : NULL, TN_NOT_SPECIFIED)) continue; } else { status = tp_hist(two_histories ? lft_history : NULL); if (cdb_sc_normal != status) { t_retry(status); continue; } } assert(cs_data == cs_addrs->hdr); INCR_GVSTATS_COUNTER(cs_addrs, cs_addrs->nl, n_zprev, 1); return (found && (bh->prev_rec.match >= gv_currkey->prev)); } restart: t_retry(status); } }
/****************************************************************************************** Input Parameters: level: level of working block dest_blk_id: last destination used for swap Output Parameters: kill_set_ptr: Kill set to be freed *exclude_glist_ptr: List of globals not to be moved for a swap destination Input/Output Parameters: gv_target : working block's history reorg_gv_target->hist : destination block's history ******************************************************************************************/ enum cdb_sc mu_swap_blk(int level, block_id *pdest_blk_id, kill_set *kill_set_ptr, glist *exclude_glist_ptr) { unsigned char x_blk_lmap; unsigned short temp_ushort; int rec_size1, rec_size2; int wlevel, nslevel, dest_blk_level; int piece_len1, piece_len2, first_offset, second_offset, work_blk_size, work_parent_size, dest_blk_size, dest_parent_size; int dest_child_cycle; int blk_seg_cnt, blk_size; trans_num ctn; int key_len, key_len_dir; block_id dest_blk_id, work_blk_id, child1, child2; enum cdb_sc status; srch_hist *dest_hist_ptr, *dir_hist_ptr; cache_rec_ptr_t dest_child_cr; blk_segment *bs1, *bs_ptr; sm_uc_ptr_t saved_blk, work_blk_ptr, work_parent_ptr, dest_parent_ptr, dest_blk_ptr, bn_ptr, bmp_buff, tblk_ptr, rec_base, rPtr1; boolean_t gbl_target_was_set, blk_was_free, deleted; gv_namehead *save_targ; srch_blk_status bmlhist, destblkhist, *hist_ptr; unsigned char save_cw_set_depth; cw_set_element *tmpcse; jnl_buffer_ptr_t jbbp; /* jbbp is non-NULL only if before-image journaling */ unsigned int bsiz; DCL_THREADGBL_ACCESS; SETUP_THREADGBL_ACCESS; dest_blk_id = *pdest_blk_id; CHECK_AND_RESET_UPDATE_ARRAY; /* reset update_array_ptr to update_array */ if (NULL == TREF(gv_reorgkey)) GVKEY_INIT(TREF(gv_reorgkey), DBKEYSIZE(MAX_KEY_SZ)); dest_hist_ptr = &(reorg_gv_target->hist); dir_hist_ptr = reorg_gv_target->alt_hist; blk_size = cs_data->blk_size; work_parent_ptr = gv_target->hist.h[level+1].buffaddr; work_parent_size = ((blk_hdr_ptr_t)work_parent_ptr)->bsiz; work_blk_ptr = gv_target->hist.h[level].buffaddr; work_blk_size = ((blk_hdr_ptr_t)work_blk_ptr)->bsiz; work_blk_id = gv_target->hist.h[level].blk_num; if (blk_size < work_blk_size) { assert(t_tries < CDB_STAGNATE); return cdb_sc_blkmod; } cws_reorg_remove_index = 0; /*===== Infinite loop to find the destination block =====*/ for ( ; ; ) { blk_was_free = FALSE; INCR_BLK_NUM(dest_blk_id); /* A pre-order traversal should not cause a child block to go to its parent. * However, in case it happens (because the organization was already like that, or for any other reason), skip the swap. * If we decide to swap, the code below should be changed to take care of the special case. * Still, a grand-child can go to its grand-parent. This is rare and the following code can handle it. */ if (dest_blk_id == gv_target->hist.h[level+1].blk_num) continue; if (cs_data->trans_hist.total_blks <= dest_blk_id || dest_blk_id == work_blk_id) { *pdest_blk_id = dest_blk_id; return cdb_sc_oprnotneeded; } ctn = cs_addrs->ti->curr_tn; /* We need to save the block numbers that were NEWLY ADDED (since entering this function "mu_swap_blk") * through the CWS_INSERT macro (in db_csh_get/db_csh_getn which can be called by t_qread or gvcst_search below). * This is so that we can delete these blocks from the "cw_stagnate" hashtable in case we determine the need to * choose a different "dest_blk_id" in this for loop (i.e. come to the next iteration).
If these blocks are not * deleted, then the hashtable will keep growing (a good example is when the -EXCLUDE qualifier is specified and * many prospective dest_blk_ids get skipped because they contain EXCLUDEd global variables) and very soon * the hashtable will contain more entries than there are global buffers and at that point db_csh_getn will not * be able to get a free global buffer for a new block (since it checks the "cw_stagnate" hashtable before reusing * a buffer in case of MUPIP REORG). To delete these previous iteration blocks, we use the "cws_reorg_remove_array" * variable. This array should have enough entries to accommodate the maximum number of blocks that can be t_qread * in one iteration down below. And that number is the sum of * + MAX_BT_DEPTH : for the t_qread while loop down the tree done below * + 2 * MAX_BT_DEPTH : for the two calls to gvcst_search done below * + 2 : 1 for the t_qread of dest_blk_id and 1 more for the t_qread of a * bitmap block done inside the call to get_lmap below * = 3 * MAX_BT_DEPTH + 2 * To be safe, we give a buffer of MAX_BT_DEPTH elements i.e. (4 * MAX_BT_DEPTH) + 2. * This is defined in the macro CWS_REMOVE_ARRAYSIZE in cws_insert.h */ /* reset whatever blocks the previous iteration of this for loop had filled in the cw_stagnate hashtable */ for ( ; cws_reorg_remove_index > 0; cws_reorg_remove_index--) { deleted = delete_hashtab_int4(&cw_stagnate, (uint4 *)&cws_reorg_remove_array[cws_reorg_remove_index]); assert(deleted); } /* read corresponding bitmap block before attempting to read destination block. * if bitmap indicates block is free, we will not read the destination block */ bmp_buff = get_lmap(dest_blk_id, &x_blk_lmap, (sm_int_ptr_t)&bmlhist.cycle, &bmlhist.cr); if (!bmp_buff || BLK_MAPINVALID == x_blk_lmap || ((blk_hdr_ptr_t)bmp_buff)->bsiz != BM_SIZE(BLKS_PER_LMAP) || ((blk_hdr_ptr_t)bmp_buff)->levl != LCL_MAP_LEVL) { assert(CDB_STAGNATE > t_tries); return cdb_sc_badbitmap; } if (BLK_FREE != x_blk_lmap) { /* x_blk_lmap is either BLK_BUSY or BLK_RECYCLED. In either case, we need to read destination block * in case we later detect that the before-image needs to be written. */ if (!(dest_blk_ptr = t_qread(dest_blk_id, (sm_int_ptr_t)&destblkhist.cycle, &destblkhist.cr))) { assert(t_tries < CDB_STAGNATE); return (enum cdb_sc)rdfail_detail; } destblkhist.blk_num = dest_blk_id; destblkhist.buffaddr = dest_blk_ptr; destblkhist.level = dest_blk_level = ((blk_hdr_ptr_t)dest_blk_ptr)->levl; } if (BLK_BUSY != x_blk_lmap) { /* x_blk_lmap is either BLK_FREE or BLK_RECYCLED, both of which mean the block is not used in the bitmap */ blk_was_free = TRUE; break; } /* dest_blk_id might contain a *-record only. * So follow the pointer to go to the data/index block, which has a non-* key to search. */ nslevel = dest_blk_level; if (MAX_BT_DEPTH <= nslevel) { assert(CDB_STAGNATE > t_tries); return cdb_sc_maxlvl; } rec_base = dest_blk_ptr + SIZEOF(blk_hdr); GET_RSIZ(rec_size1, rec_base); tblk_ptr = dest_blk_ptr; while ((BSTAR_REC_SIZE == rec_size1) && (0 != nslevel)) { GET_LONG(child1, (rec_base + SIZEOF(rec_hdr))); if (0 == child1 || child1 > cs_data->trans_hist.total_blks - 1) { assert(t_tries < CDB_STAGNATE); return cdb_sc_rdfail; } if (!(tblk_ptr = t_qread(child1, (sm_int_ptr_t)&dest_child_cycle, &dest_child_cr))) { assert(t_tries < CDB_STAGNATE); return (enum cdb_sc)rdfail_detail; } /* leaf of a killed GVT can have block header only.
Skip those blocks */ if (SIZEOF(blk_hdr) >= ((blk_hdr_ptr_t)tblk_ptr)->bsiz) break; nslevel--; rec_base = tblk_ptr + SIZEOF(blk_hdr); GET_RSIZ(rec_size1, rec_base); } /* leaf of a killed GVT can have block header only. Skip those blocks */ if (SIZEOF(blk_hdr) >= ((blk_hdr_ptr_t)tblk_ptr)->bsiz) continue; /* get length of global variable name (do not read subscript) for dest_blk_id */ GET_GBLNAME_LEN(key_len_dir, rec_base + SIZEOF(rec_hdr)); /* key_len = length of 1st key value (including subscript) for dest_blk_id */ GET_KEY_LEN(key_len, rec_base + SIZEOF(rec_hdr)); if ((1 >= key_len_dir || MAX_MIDENT_LEN + 1 < key_len_dir) || (2 >= key_len || MAX_KEY_SZ < key_len)) { /* Earlier we used to always restart here. But dest_blk_id can be a block * which was just killed and is still marked busy. Skip it if we are in the last retry. */ if (CDB_STAGNATE <= t_tries) continue; else return cdb_sc_blkmod; } memcpy(&((TREF(gv_reorgkey))->base[0]), rec_base + SIZEOF(rec_hdr), key_len_dir); (TREF(gv_reorgkey))->base[key_len_dir] = 0; (TREF(gv_reorgkey))->end = key_len_dir; if (exclude_glist_ptr->next) { /* exclude blocks for globals in the list of EXCLUDE option */ if (in_exclude_list(&((TREF(gv_reorgkey))->base[0]), key_len_dir - 1, exclude_glist_ptr)) continue; } save_targ = gv_target; if (INVALID_GV_TARGET != reset_gv_target) gbl_target_was_set = TRUE; else { gbl_target_was_set = FALSE; reset_gv_target = save_targ; } gv_target = reorg_gv_target; gv_target->root = cs_addrs->dir_tree->root; gv_target->clue.end = 0; /* assign Directory tree path to find dest_blk_id in dir_hist_ptr */ status = gvcst_search(TREF(gv_reorgkey), dir_hist_ptr); if (cdb_sc_normal != status) { assert(t_tries < CDB_STAGNATE); RESET_GV_TARGET_LCL_AND_CLR_GBL(save_targ, DO_GVT_GVKEY_CHECK); return status; } if (dir_hist_ptr->h[0].curr_rec.match != (TREF(gv_reorgkey))->end + 1) { /* may be in a kill_set of another process */ RESET_GV_TARGET_LCL_AND_CLR_GBL(save_targ, DO_GVT_GVKEY_CHECK); continue; } for (wlevel = 0; wlevel <= dir_hist_ptr->depth && dir_hist_ptr->h[wlevel].blk_num != dest_blk_id; wlevel++); if (dir_hist_ptr->h[wlevel].blk_num == dest_blk_id) { /* do not swap a dir_tree block */ RESET_GV_TARGET_LCL_AND_CLR_GBL(save_targ, DO_GVT_GVKEY_CHECK); continue; } /* gv_reorgkey will now have the first key from dest_blk_id, * or from a descendant of dest_blk_id (in case it had a *-key only). */ memcpy(&((TREF(gv_reorgkey))->base[0]), rec_base + SIZEOF(rec_hdr), key_len); (TREF(gv_reorgkey))->end = key_len - 1; GET_KEY_LEN(key_len_dir, dir_hist_ptr->h[0].buffaddr + dir_hist_ptr->h[0].curr_rec.offset + SIZEOF(rec_hdr)); /* Get root of GVT for dest_blk_id */ GET_LONG(gv_target->root, dir_hist_ptr->h[0].buffaddr + dir_hist_ptr->h[0].curr_rec.offset + SIZEOF(rec_hdr) + key_len_dir); if ((0 == gv_target->root) || (gv_target->root > (cs_data->trans_hist.total_blks - 1))) { assert(t_tries < CDB_STAGNATE); RESET_GV_TARGET_LCL_AND_CLR_GBL(save_targ, DO_GVT_GVKEY_CHECK); return cdb_sc_blkmod; } /* Assign Global Variable Tree path to find dest_blk_id in dest_hist_ptr */ gv_target->clue.end = 0; status = gvcst_search(TREF(gv_reorgkey), dest_hist_ptr); RESET_GV_TARGET_LCL_AND_CLR_GBL(save_targ, DO_GVT_GVKEY_CHECK); if (dest_blk_level >= dest_hist_ptr->depth || /* do not swap in root level */ dest_hist_ptr->h[dest_blk_level].blk_num != dest_blk_id) /* must be in a kill set of another process.
*/ continue; if ((cdb_sc_normal != status) || (dest_hist_ptr->h[nslevel].curr_rec.match != ((TREF(gv_reorgkey))->end + 1))) { assert(t_tries < CDB_STAGNATE); return (cdb_sc_normal != status ? status : cdb_sc_blkmod); } for (wlevel = nslevel; wlevel <= dest_blk_level; wlevel++) dest_hist_ptr->h[wlevel].tn = ctn; dest_blk_ptr = dest_hist_ptr->h[dest_blk_level].buffaddr; dest_blk_size = ((blk_hdr_ptr_t)dest_blk_ptr)->bsiz; dest_parent_ptr = dest_hist_ptr->h[dest_blk_level+1].buffaddr; dest_parent_size = ((blk_hdr_ptr_t)dest_parent_ptr)->bsiz; break; } /*===== End of infinite loop to find the destination block =====*/ /*----------------------------------------------------- Now modify blocks for swapping. Maximum of 4 blocks. -----------------------------------------------------*/ if (!blk_was_free) { /* 1: dest_blk_id into work_blk_id */ BLK_INIT(bs_ptr, bs1); BLK_SEG(bs_ptr, dest_blk_ptr + SIZEOF(blk_hdr), dest_blk_size - SIZEOF(blk_hdr)); if (!BLK_FINI (bs_ptr,bs1)) { assert(t_tries < CDB_STAGNATE); return cdb_sc_blkmod; } assert(gv_target->hist.h[level].blk_num == work_blk_id); assert(gv_target->hist.h[level].buffaddr == work_blk_ptr); t_write(&gv_target->hist.h[level], (unsigned char *)bs1, 0, 0, dest_blk_level, TRUE, TRUE, GDS_WRITE_KILLTN); } /* 2: work_blk_id into dest_blk_id */ if (!blk_was_free && work_blk_id == dest_hist_ptr->h[dest_blk_level+1].blk_num) { /* work_blk_id will be swapped with its child. * This is the only vertical swap. Here the working block goes to its child. * The working block cannot go to its parent because of the traversal order */ if (dest_blk_level + 1 != level || dest_parent_size != work_blk_size) { assert(t_tries < CDB_STAGNATE); return cdb_sc_blkmod; } BLK_INIT(bs_ptr, bs1); BLK_ADDR(saved_blk, dest_parent_size, unsigned char); memcpy(saved_blk, dest_parent_ptr, dest_parent_size); first_offset = dest_hist_ptr->h[dest_blk_level+1].curr_rec.offset; GET_RSIZ(rec_size1, saved_blk + first_offset); if (work_blk_size < first_offset + rec_size1) { assert(t_tries < CDB_STAGNATE); return cdb_sc_blkmod; } piece_len1 = first_offset + rec_size1; BLK_SEG(bs_ptr, saved_blk + SIZEOF(blk_hdr), piece_len1 - SIZEOF(block_id) - SIZEOF(blk_hdr)); BLK_ADDR(bn_ptr, SIZEOF(block_id), unsigned char); PUT_LONG(bn_ptr, work_blk_id); /* since work_blk_id will now be the child of dest_blk_id */ BLK_SEG(bs_ptr, bn_ptr, SIZEOF(block_id)); BLK_SEG(bs_ptr, saved_blk + piece_len1, dest_parent_size - piece_len1); if (!BLK_FINI(bs_ptr, bs1)) { assert(t_tries < CDB_STAGNATE); return cdb_sc_blkmod; } assert(dest_blk_id == dest_hist_ptr->h[dest_blk_level].blk_num); assert(dest_blk_ptr == dest_hist_ptr->h[dest_blk_level].buffaddr); t_write(&dest_hist_ptr->h[dest_blk_level], (unsigned char *)bs1, 0, 0, level, TRUE, TRUE, GDS_WRITE_KILLTN); } else /* free block, or working block does not move vertically (swap with parent/child) */ { BLK_INIT(bs_ptr, bs1); BLK_ADDR(saved_blk, work_blk_size, unsigned char); memcpy(saved_blk, work_blk_ptr, work_blk_size); BLK_SEG(bs_ptr, saved_blk + SIZEOF(blk_hdr), work_blk_size - SIZEOF(blk_hdr)); if (!BLK_FINI(bs_ptr, bs1)) { assert(t_tries < CDB_STAGNATE); return cdb_sc_blkmod; } if (blk_was_free) { tmpcse = &cw_set[cw_set_depth]; t_create(dest_blk_id, (unsigned char *)bs1, 0, 0, level); /* Although we invoked t_create, we do not want t_end to allocate the block (i.e. change mode * from gds_t_create to gds_t_acquired). Instead we do that and a little more (that t_end does) all here.
*/ assert(dest_blk_id == tmpcse->blk); tmpcse->mode = gds_t_acquired; /* If snapshots are in progress, we might want to read the before images of the FREE blocks also. * Since mu_swap_blk mimics a small part of t_end, it sets cse->mode to gds_t_acquired and hence * will not read the before images of the FREE blocks in t_end. To work around this, set * cse->was_free to TRUE so that in t_end, this condition can be used to read the before images of * the FREE blocks if needed. */ (BLK_FREE == x_blk_lmap) ? SET_FREE(tmpcse) : SET_NFREE(tmpcse); /* No need to write before-image in case the block is FREE. In case the database had never been fully * upgraded from V4 to V5 format (after the MUPIP UPGRADE), all RECYCLED blocks can basically be considered * FREE (i.e. no need to write before-images since backward journal recovery will never be expected * to take the database to a point BEFORE the mupip upgrade). */ if ((BLK_FREE == x_blk_lmap) || !cs_data->db_got_to_v5_once) tmpcse->old_block = NULL; else { /* Destination is a recycled block that needs a before image */ tmpcse->old_block = destblkhist.buffaddr; /* Record cr, cycle. This is used later in t_end to determine if checksums need to be recomputed */ tmpcse->cr = destblkhist.cr; tmpcse->cycle = destblkhist.cycle; jbbp = (JNL_ENABLED(cs_addrs) && cs_addrs->jnl_before_image) ? cs_addrs->jnl->jnl_buff : NULL; if ((NULL != jbbp) && (((blk_hdr_ptr_t)tmpcse->old_block)->tn < jbbp->epoch_tn)) { /* Compute CHECKSUM for writing PBLK record before getting crit. * It is possible that we are reading a block that is actually marked free in * the bitmap (due to concurrency issues at this point). Therefore we might be * actually reading uninitialized block headers and in turn a bad value of * "old_block->bsiz". Restart if we ever access a buffer whose size is greater * than the db block size.
*/ bsiz = ((blk_hdr_ptr_t)(tmpcse->old_block))->bsiz; if (bsiz > blk_size) { assert(CDB_STAGNATE > t_tries); return cdb_sc_lostbmlcr; } JNL_GET_CHECKSUM_ACQUIRED_BLK(tmpcse, cs_data, cs_addrs, tmpcse->old_block, bsiz); } } assert(GDSVCURR == tmpcse->ondsk_blkver); /* should have been set by t_create above */ } else { hist_ptr = &dest_hist_ptr->h[dest_blk_level]; assert(dest_blk_id == hist_ptr->blk_num); assert(dest_blk_ptr == hist_ptr->buffaddr); t_write(hist_ptr, (unsigned char *)bs1, 0, 0, level, TRUE, TRUE, GDS_WRITE_KILLTN); } } if (!blk_was_free) { /* 3: Parent of destination block (may be parent of working block too) */ if (gv_target->hist.h[level+1].blk_num == dest_hist_ptr->h[dest_blk_level+1].blk_num) { /* dest parent == work_blk parent */ BLK_INIT(bs_ptr, bs1); /* Interchange pointer to dest_blk_id and work_blk_id */ if (level != dest_blk_level || gv_target->hist.h[level+1].curr_rec.offset == dest_hist_ptr->h[level+1].curr_rec.offset) { assert(t_tries < CDB_STAGNATE); return cdb_sc_blkmod; } if (gv_target->hist.h[level+1].curr_rec.offset < dest_hist_ptr->h[level+1].curr_rec.offset) { first_offset = gv_target->hist.h[level+1].curr_rec.offset; second_offset = dest_hist_ptr->h[level+1].curr_rec.offset; } else { first_offset = dest_hist_ptr->h[level+1].curr_rec.offset; second_offset = gv_target->hist.h[level+1].curr_rec.offset; } GET_RSIZ(rec_size1, dest_parent_ptr + first_offset); GET_RSIZ(rec_size2, dest_parent_ptr + second_offset); if (dest_parent_size < first_offset + rec_size1 || dest_parent_size < second_offset + rec_size2 || BSTAR_REC_SIZE >= rec_size1 || BSTAR_REC_SIZE > rec_size2) { assert(t_tries < CDB_STAGNATE); return cdb_sc_blkmod; } piece_len1 = first_offset + rec_size1 - SIZEOF(block_id); piece_len2 = second_offset + rec_size2 - SIZEOF(block_id); GET_LONG(child1, dest_parent_ptr + piece_len1); GET_LONG(child2, dest_parent_ptr + piece_len2); BLK_SEG(bs_ptr, dest_parent_ptr + SIZEOF(blk_hdr), piece_len1 - SIZEOF(blk_hdr)); BLK_ADDR(bn_ptr, SIZEOF(block_id), unsigned char); PUT_LONG(bn_ptr, child2); BLK_SEG(bs_ptr, bn_ptr, SIZEOF(block_id)); BLK_SEG(bs_ptr, dest_parent_ptr + first_offset + rec_size1, second_offset + rec_size2 - SIZEOF(block_id) - first_offset - rec_size1); BLK_ADDR(bn_ptr, SIZEOF(block_id), unsigned char); PUT_LONG(bn_ptr, child1); BLK_SEG(bs_ptr, bn_ptr, SIZEOF(block_id)); BLK_SEG(bs_ptr, dest_parent_ptr + second_offset + rec_size2, dest_parent_size - second_offset - rec_size2); if (!BLK_FINI(bs_ptr,bs1)) { assert(t_tries < CDB_STAGNATE); return cdb_sc_blkmod; } assert(level == dest_blk_level); assert(dest_parent_ptr == dest_hist_ptr->h[level+1].buffaddr); t_write(&dest_hist_ptr->h[level+1], (unsigned char *)bs1, 0, 0, level+1, FALSE, TRUE, GDS_WRITE_KILLTN); } else if (work_blk_id != dest_hist_ptr->h[dest_blk_level+1].blk_num) { /* Destination block moved into the position of the working block.
* So destination block's parent's pointer should be changed to work_blk_id */ BLK_INIT(bs_ptr, bs1); GET_RSIZ(rec_size1, dest_parent_ptr + dest_hist_ptr->h[dest_blk_level+1].curr_rec.offset); if (dest_parent_size < rec_size1 + dest_hist_ptr->h[dest_blk_level+1].curr_rec.offset || BSTAR_REC_SIZE > rec_size1) { assert(t_tries < CDB_STAGNATE); return cdb_sc_blkmod; } BLK_SEG (bs_ptr, dest_parent_ptr + SIZEOF(blk_hdr), dest_hist_ptr->h[dest_blk_level+1].curr_rec.offset + rec_size1 - SIZEOF(blk_hdr) - SIZEOF(block_id)); BLK_ADDR(bn_ptr, SIZEOF(block_id), unsigned char); PUT_LONG(bn_ptr, work_blk_id); BLK_SEG(bs_ptr, bn_ptr, SIZEOF(block_id)); BLK_SEG(bs_ptr, dest_parent_ptr + dest_hist_ptr->h[dest_blk_level+1].curr_rec.offset + rec_size1, dest_parent_size - dest_hist_ptr->h[dest_blk_level+1].curr_rec.offset - rec_size1); if (!BLK_FINI(bs_ptr,bs1)) { assert(t_tries < CDB_STAGNATE); return cdb_sc_blkmod; } assert(dest_parent_ptr == dest_hist_ptr->h[dest_blk_level+1].buffaddr); t_write(&dest_hist_ptr->h[dest_blk_level+1], (unsigned char *)bs1, 0, 0, dest_blk_level+1, FALSE, TRUE, GDS_WRITE_KILLTN); } } /* 4: Parent of working block, if different from the destination's parent, or if the destination was a free block */ if (blk_was_free || gv_target->hist.h[level+1].blk_num != dest_hist_ptr->h[dest_blk_level+1].blk_num) { /* The parent block of the working block should correctly point to it; the working block went to dest_blk_id */ GET_RSIZ(rec_size1, (work_parent_ptr + gv_target->hist.h[level+1].curr_rec.offset)); if (work_parent_size < rec_size1 + gv_target->hist.h[level+1].curr_rec.offset || BSTAR_REC_SIZE > rec_size1) { assert(t_tries < CDB_STAGNATE); return cdb_sc_blkmod; } BLK_INIT(bs_ptr, bs1); BLK_SEG(bs_ptr, work_parent_ptr + SIZEOF(blk_hdr), gv_target->hist.h[level+1].curr_rec.offset + rec_size1 - SIZEOF(blk_hdr) - SIZEOF(block_id)); BLK_ADDR(bn_ptr, SIZEOF(block_id), unsigned char); PUT_LONG(bn_ptr, dest_blk_id); BLK_SEG(bs_ptr, bn_ptr, SIZEOF(block_id)); BLK_SEG(bs_ptr, work_parent_ptr + gv_target->hist.h[level+1].curr_rec.offset + rec_size1, work_parent_size - gv_target->hist.h[level+1].curr_rec.offset - rec_size1); if (!BLK_FINI(bs_ptr, bs1)) { assert(t_tries < CDB_STAGNATE); return cdb_sc_blkmod; } assert(gv_target->hist.h[level+1].buffaddr == work_parent_ptr); t_write(&gv_target->hist.h[level+1], (unsigned char *)bs1, 0, 0, level+1, FALSE, TRUE, GDS_WRITE_KILLTN); } /* else already taken care of, when dest_blk_id moved */ if (blk_was_free) { /* A free/recycled block will become a busy block, * so the local bitmap must be updated. * The local bit map block will be added to the update array list for concurrency checking, and * the cw_set element will be created to mark the free/recycled block as free. * kill_set_ptr will save the block which will become free. */ child1 = ROUND_DOWN2(dest_blk_id, BLKS_PER_LMAP); /* bit map block */ bmlhist.buffaddr = bmp_buff; bmlhist.blk_num = child1; child1 = dest_blk_id - child1; assert(child1); PUT_LONG(update_array_ptr, child1); /* Need to put bit maps on the end of the cw set for concurrency checking. * We want to simulate t_write_map, except we want to update "cw_map_depth" instead of "cw_set_depth". * Hence the save and restore logic (for "cw_set_depth") below.
*/ save_cw_set_depth = cw_set_depth; assert(!cw_map_depth); t_write_map(&bmlhist, (uchar_ptr_t)update_array_ptr, ctn, 1); /* will increment cw_set_depth */ cw_map_depth = cw_set_depth; /* set cw_map_depth to the latest cw_set_depth */ cw_set_depth = save_cw_set_depth; /* restore cw_set_depth */ /* t_write_map simulation end */ update_array_ptr += SIZEOF(block_id); child1 = 0; PUT_LONG(update_array_ptr, child1); update_array_ptr += SIZEOF(block_id); assert(1 == cw_set[cw_map_depth - 1].reference_cnt); /* 1 free block is now becoming BLK_USED in the bitmap */ /* working block will be removed */ kill_set_ptr->blk[kill_set_ptr->used].flag = 0; kill_set_ptr->blk[kill_set_ptr->used].level = 0; kill_set_ptr->blk[kill_set_ptr->used++].block = work_blk_id; } *pdest_blk_id = dest_blk_id; return cdb_sc_normal; }
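/* Illustrative sketch: the local-bitmap arithmetic used at the end of mu_swap_blk above. Each local
 * map covers BLKS_PER_LMAP blocks; rounding the block number down to that boundary yields the bitmap
 * block, and the remainder is the offset within it. 512 is an assumed stand-in for BLKS_PER_LMAP;
 * the macro ROUND_DOWN2 relies on it being a power of two. */
static void demo_lmap_coords(unsigned blk, unsigned *bml_blk, unsigned *offset)
{
	const unsigned blks_per_lmap = 512;		/* assumed value for the sketch */
	*bml_blk = blk & ~(blks_per_lmap - 1);		/* mirrors ROUND_DOWN2(blk, BLKS_PER_LMAP) */
	*offset = blk - *bml_blk;			/* bit position of blk within its local map */
}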
/****************************************************************
	Input Parameters:
		gn = global name
		exclude_glist_ptr = list of globals in the EXCLUDE option
		index_fill_factor = index blocks' fill factor
		data_fill_factor = data blocks' fill factor
	Input/Output Parameters:
		resume = resume flag
		reorg_op = which operations to do (coalesce, swap, or split)
			[default is all; only for debugging]
 ****************************************************************/
boolean_t mu_reorg(mval *gn, glist *exclude_glist_ptr, boolean_t *resume, int index_fill_factor,
		   int data_fill_factor, int reorg_op)
{
	boolean_t	end_of_tree = FALSE, complete_merge, detailed_log;
	int		rec_size;
	/* "level" is the level of the working block.
	 * "pre_order_successor_level" is the level of the pre-order successor, except during a left-most
	 * descent of the tree, where it is the maximum height of that subtree until the leaf level block
	 * is reached. In other words, "pre_order_successor_level" and "level" together control the
	 * iterative pre-order traversal. We start reorg from (root_level - 1) down to 0, that is,
	 * level = pre_order_successor_level:-1:0.
	 */
	int		pre_order_successor_level, level;
	static block_id	dest_blk_id = 0;
	int		tkeysize;
	int		blks_killed, blks_processed, blks_reused, blks_coalesced, blks_split, blks_swapped,
			count, file_extended, lvls_reduced;
	int		d_max_fill, i_max_fill, blk_size, cur_blk_size, max_fill, toler, d_toler, i_toler;
	int		cnt1, cnt2;
	kill_set	kill_set_list;
	sm_uc_ptr_t	rPtr1;
	enum cdb_sc	status;
	srch_hist	*rtsib_hist;
	jnl_buffer_ptr_t	jbp;
	trans_num	ret_tn;

	error_def(ERR_MUREORGFAIL);
	error_def(ERR_DBRDONLY);
	error_def(ERR_GBLNOEXIST);
	error_def(ERR_MAXBTLEVEL);

	t_err = ERR_MUREORGFAIL;
	kill_set_tail = &kill_set_list;
	/* Initialization for current global */
	op_gvname(VARLSTCNT(1) gn);
	/* Cannot proceed for read-only data files */
	if (gv_cur_region->read_only)
	{
		gtm_putmsg(VARLSTCNT(4) ERR_DBRDONLY, 2, DB_LEN_STR(gv_cur_region));
		return FALSE;
	}
	dest_blk_id = cs_addrs->reorg_last_dest;
	inctn_opcode = inctn_mu_reorg;
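	/* Key buffer layout used throughout this routine (illustrative note): for an unsubscripted global
	 * name the key is the name bytes followed by two null terminators, e.g. for ^ABC
	 *	base: 'A' 'B' 'C' 0x00 0x00	end = name length + 1 (index of the second null)
	 * The resume logic below either restores such a key from the file header or builds it from "gn".
	 */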
	/* If the resume option is present, reorg_restart_key should be non-null.
	 * Skip all globals until we are in the region for that global.
	 * Get reorg_restart_key and reorg_restart_block from the database header and restart from there.
	 */
	if (*resume && 0 != cs_data->reorg_restart_key[0])
	{	/* resume from the last key reorged in the GVT */
		GET_KEY_LEN(tkeysize, &cs_data->reorg_restart_key[0]);
		memcpy(gv_currkey->base, cs_data->reorg_restart_key, tkeysize);
		gv_currkey->end = tkeysize - 1;
		dest_blk_id = cs_data->reorg_restart_block;
		if (0 == memcmp(cs_data->reorg_restart_key, gn->str.addr, gn->str.len))
			*resume = FALSE;	/* resuming from the current global, so clear the resume flag */
	} else
	{	/* start from the left-most leaf */
		memcpy(&gv_currkey->base[0], gn->str.addr, gn->str.len);
		gv_currkey->base[gn->str.len] = gv_currkey->base[gn->str.len + 1] = 0;
		gv_currkey->end = gn->str.len + 1;
	}
	if (*resume)
	{	/* the restart key belongs to a different global, so this one cannot be the resume point */
		util_out_print("REORG cannot be resumed from this point, skipping this global...", FLUSH);
		memcpy(&gv_currkey->base[0], gn->str.addr, gn->str.len);
		gv_currkey->base[gn->str.len] = gv_currkey->base[gn->str.len + 1] = 0;
		gv_currkey->end = gn->str.len + 1;
		return TRUE;
	}
	memcpy(&gv_currkey_next_reorg->base[0], &gv_currkey->base[0], gv_currkey->end + 1);
	gv_currkey_next_reorg->end = gv_currkey->end;
	if (2 > dest_blk_id)
		dest_blk_id = 2;	/* we know the first block is a bitmap and the next one is the directory tree root */
	file_extended = cs_data->trans_hist.total_blks;
	blk_size = cs_data->blk_size;
	d_max_fill = (double)data_fill_factor * blk_size / 100.0 - cs_data->reserved_bytes;
	i_max_fill = (double)index_fill_factor * blk_size / 100.0 - cs_data->reserved_bytes;
	d_toler = (double)DATA_FILL_TOLERANCE * blk_size / 100.0;
	i_toler = (double)INDEX_FILL_TOLERANCE * blk_size / 100.0;
	blks_killed = blks_processed = blks_reused = lvls_reduced = blks_coalesced = blks_split = blks_swapped = 0;
	pre_order_successor_level = level = MAX_BT_DEPTH + 1;	/* just some high value to initialize */
	/* --- more detailed debugging information --- */
	if (detailed_log = (reorg_op & DETAIL))
		util_out_print("STARTING to work on global ^!AD from region !AD", TRUE,
			gn->str.len, gn->str.addr, REG_LEN_STR(gv_cur_region));
	/* In each iteration of the MAIN loop, a working block is processed for a GVT */
	for (; ;)	/* ================ START MAIN LOOP ================ */
	{
		/* If the right sibling is completely merged with the working block, do not swap the working
		 * block with its final destination block. Continue trying the next right sibling. Swap only
		 * at the end.
		 */
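		/* Illustration (added note): if blocks [A][B][C] sit at the same level and B merges completely
		 * into A, complete_merge remains TRUE and the while loop below immediately retries with C as
		 * A's new right sibling; the swap of A into dest_blk_id is deferred until no complete merge
		 * happens.
		 */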
		complete_merge = TRUE;
		while (complete_merge)	/* === START WHILE COMPLETE_MERGE === */
		{
			if (mu_ctrlc_occurred || mu_ctrly_occurred)
			{
				cs_data->reorg_restart_block = dest_blk_id;
				memcpy(&cs_data->reorg_restart_key[0], &gv_currkey->base[0], gv_currkey->end + 1);
				return FALSE;
			}
			complete_merge = FALSE;
			blks_processed++;
			t_begin(ERR_MUREORGFAIL, UPDTRNS_DB_UPDATED_MASK);
			/* Following for loop is to handle concurrency retries for split/coalesce */
			for (; ;)	/* === SPLIT-COALESCE LOOP STARTS === */
			{
				gv_target->clue.end = 0;
				/* search gv_currkey and get the result in gv_target */
				if ((status = gvcst_search(gv_currkey, NULL)) != cdb_sc_normal)
				{
					assert(CDB_STAGNATE > t_tries);
					t_retry(status);
					continue;
				} else if (gv_currkey->end + 1 != gv_target->hist.h[0].curr_rec.match)
				{
					if (SIZEOF(blk_hdr) == ((blk_hdr_ptr_t)gv_target->hist.h[0].buffaddr)->bsiz
						&& 1 == gv_target->hist.depth)
					{
						if (cs_addrs->now_crit)
						{
							t_abort(gv_cur_region, cs_addrs);	/* do crit and other cleanup */
							gtm_putmsg(VARLSTCNT(4) ERR_GBLNOEXIST, 2, gn->str.len, gn->str.addr);
							reorg_finish(dest_blk_id, blks_processed, blks_killed, blks_reused,
								file_extended, lvls_reduced, blks_coalesced, blks_split,
								blks_swapped);
							return TRUE;	/* it is not an error that the global was killed */
						} else
						{
							assert(CDB_STAGNATE > t_tries);
							t_retry(status);
							continue;
						}
					}
				}
				if (gv_target->hist.depth <= level)
				{	/* Will come here
					 * 1) on the first iteration of the for loop (since level == MAX_BT_DEPTH + 1), or
					 * 2) when the tree depth decreased due to mu_reduce_level or an M-kill
					 */
					pre_order_successor_level = gv_target->hist.depth - 1;
					if (MAX_BT_DEPTH + 1 != level)
					{	/* break the loop when tree depth decreased (case 2) */
						level = pre_order_successor_level;
						break;
					}
					level = pre_order_successor_level;
				}
				max_fill = (0 == level) ? d_max_fill : i_max_fill;
				toler = (0 == level) ? d_toler : i_toler;
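				/* Worked example (illustrative, assuming a 4096-byte block and an 80% data fill
				 * factor): d_max_fill = 0.80 * 4096 - reserved_bytes, about 3276 bytes less
				 * reserved_bytes, and toler is DATA_FILL_TOLERANCE percent of the block size.
				 * The block is split when bsiz > max_fill + toler and considered for coalescing
				 * when bsiz < max_fill - toler, so the tolerance band keeps blocks already near
				 * the target fill from being churned.
				 */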
				cur_blk_size = ((blk_hdr_ptr_t)(gv_target->hist.h[level].buffaddr))->bsiz;
				if (cur_blk_size > max_fill + toler && 0 == (reorg_op & NOSPLIT))
				{	/* SPLIT BLOCK */
					cnt1 = cnt2 = 0;
					/* the history of the current working block is in gv_target */
					status = mu_split(level, i_max_fill, d_max_fill, &cnt1, &cnt2);
					if (cdb_sc_maxlvl == status)
					{
						gtm_putmsg(VARLSTCNT(4) ERR_MAXBTLEVEL, 2, gn->str.len, gn->str.addr);
						reorg_finish(dest_blk_id, blks_processed, blks_killed, blks_reused,
							file_extended, lvls_reduced, blks_coalesced, blks_split, blks_swapped);
						return FALSE;
					} else if (cdb_sc_normal == status)
					{
						if ((trans_num)0 == (ret_tn = t_end(&(gv_target->hist), NULL, TN_NOT_SPECIFIED)))
						{
							need_kip_incr = FALSE;
							continue;
						}
						if (detailed_log)
							log_detailed_log("SPL", &(gv_target->hist), NULL, level, NULL, ret_tn);
						blks_reused += cnt1;
						lvls_reduced -= cnt2;
						blks_split++;
						break;
					} else if (cdb_sc_oprnotneeded == status)
					{	/* undo any update_array/cw_set changes and DROP THRU to mu_clsce */
						cw_set_depth = 0;
						CHECK_AND_RESET_UPDATE_ARRAY;	/* reset update_array_ptr to update_array */
						assert(0 == cw_map_depth);	/* mu_swap_blk (which changes cw_map_depth) comes later */
					} else
					{
						assert(CDB_STAGNATE > t_tries);
						t_retry(status);
						continue;
					}
				}	/* end if SPLIT BLOCK */
				/* We are here because mu_split() was not called, or the split was not done or not required */
				rtsib_hist = gv_target->alt_hist;
				status = gvcst_rtsib(rtsib_hist, level);
				if (cdb_sc_normal != status && cdb_sc_endtree != status)
				{
					assert(CDB_STAGNATE > t_tries);
					t_retry(status);
					continue;
				}
				if (cdb_sc_endtree == status)
				{
					if (0 == level)
						end_of_tree = TRUE;
					break;
				} else if (0 == level)
					pre_order_successor_level = rtsib_hist->depth - 1;
				/* COALESCE WITH RTSIB */
				kill_set_list.used = 0;
				if (cur_blk_size < max_fill - toler && 0 == (reorg_op & NOCOALESCE))
				{	/* histories are sent in &gv_target->hist and gv_target->alt_hist */
					status = mu_clsce(level, i_max_fill, d_max_fill, &kill_set_list, &complete_merge);
					if (cdb_sc_normal == status)
					{
						if (level)	/* remove the lower elements of the array, which might confuse t_end */
						{
							memmove(&rtsib_hist->h[0], &rtsib_hist->h[level],
								SIZEOF(srch_blk_status) * (rtsib_hist->depth - level + 2));
							rtsib_hist->depth = rtsib_hist->depth - level;
						}
						if (0 < kill_set_list.used)	/* increase kill_in_prog */
						{
							need_kip_incr = TRUE;
							if (!cs_addrs->now_crit)	/* do not sleep while holding crit */
								WAIT_ON_INHIBIT_KILLS(cs_addrs->nl, MAXWAIT2KILL);
						}
						if ((trans_num)0 == (ret_tn = t_end(&(gv_target->hist), rtsib_hist,
								TN_NOT_SPECIFIED)))
						{
							need_kip_incr = FALSE;
							assert(NULL == kip_csa);
							if (level)
							{	/* reinitialize the level member in rtsib_hist srch_blk_status' */
								for (count = 0; count < MAX_BT_DEPTH; count++)
									rtsib_hist->h[count].level = count;
							}
							continue;
						}
						if (level)
						{	/* reinitialize the level member in rtsib_hist srch_blk_status' */
							for (count = 0; count < MAX_BT_DEPTH; count++)
								rtsib_hist->h[count].level = count;
						}
						if (detailed_log)
							log_detailed_log("CLS", &(gv_target->hist), rtsib_hist, level, NULL, ret_tn);
						assert(0 < kill_set_list.used || (NULL == kip_csa));
						if (0 < kill_set_list.used)	/* decrease kill_in_prog */
						{
							gvcst_kill_sort(&kill_set_list);
							GVCST_BMP_MARK_FREE(&kill_set_list, ret_tn, inctn_mu_reorg,
								inctn_bmp_mark_free_mu_reorg, inctn_opcode, cs_addrs)
							DECR_KIP(cs_data, cs_addrs, kip_csa);
							if (detailed_log)
								log_detailed_log("KIL", &(gv_target->hist), NULL, level,
									&kill_set_list, ret_tn);
							blks_killed += kill_set_list.used;
						}
						blks_coalesced++;
						break;
					} else if (cdb_sc_oprnotneeded == status)
					{	/* undo any update_array/cw_set changes and DROP THRU to t_end */
						cw_set_depth = 0;
						CHECK_AND_RESET_UPDATE_ARRAY;	/* reset update_array_ptr to update_array */
						assert(0 == cw_map_depth);	/* mu_swap_blk (which changes cw_map_depth) comes later */
					} else
					{
						assert(CDB_STAGNATE > t_tries);
						t_retry(status);
						continue;
					}
				}	/* end if try coalesce */
				if (0 == level)
				{	/* Note, at data block level:
					 * if the split was successful, or
					 * if the coalesce was successful without a complete merge of rtsib,
					 *	then gv_currkey_next_reorg was already set by the called function;
					 * if the split or coalesce did a retry, or
					 * if the coalesce was successful with a complete merge,
					 *	then gv_currkey is not changed;
					 * if the split or coalesce was not successful or not needed,
					 *	then gv_currkey_next_reorg is set here from the right sibling.
					 */
					cw_set_depth = cw_map_depth = 0;
					GET_KEY_LEN(tkeysize, rtsib_hist->h[0].buffaddr + SIZEOF(blk_hdr) + SIZEOF(rec_hdr));
					if (2 < tkeysize && MAX_KEY_SZ >= tkeysize)
					{
						memcpy(&(gv_currkey_next_reorg->base[0]),
							rtsib_hist->h[0].buffaddr + SIZEOF(blk_hdr) + SIZEOF(rec_hdr), tkeysize);
						gv_currkey_next_reorg->end = tkeysize - 1;
						inctn_opcode = inctn_invalid_op;	/* temporary reset to satisfy an assert in t_end() */
						assert(UPDTRNS_DB_UPDATED_MASK == update_trans);
						update_trans = 0;	/* tell t_end this is no longer an update transaction */
						if ((trans_num)0 == (ret_tn = t_end(rtsib_hist, NULL, TN_NOT_SPECIFIED)))
						{
							need_kip_incr = FALSE;
							inctn_opcode = inctn_mu_reorg;	/* reset inctn_opcode to its default */
							update_trans = UPDTRNS_DB_UPDATED_MASK;	/* reset update_trans to its old value */
							assert(NULL == kip_csa);
							continue;
						}
						/* There is no need to reset update_trans after a successful "t_end" call,
						 * because before the next call to "t_end" there will be a call to
						 * "t_begin", which resets update_trans anyway.
						 */
						inctn_opcode = inctn_mu_reorg;	/* reset inctn_opcode to its default */
						if (detailed_log)
							log_detailed_log("NOU", rtsib_hist, NULL, level, NULL, ret_tn);
					} else
					{
						assert(CDB_STAGNATE > t_tries);
						t_retry(status);
						continue;
					}
				}	/* end if (0 == level) */
				break;
			}	/* === SPLIT-COALESCE LOOP END === */
			t_abort(gv_cur_region, cs_addrs);	/* do crit and other cleanup */
		}	/* === END WHILE COMPLETE_MERGE === */
		if (mu_ctrlc_occurred || mu_ctrly_occurred)
		{
			cs_data->reorg_restart_block = dest_blk_id;
			memcpy(&cs_data->reorg_restart_key[0], &gv_currkey->base[0], gv_currkey->end + 1);
			return FALSE;
		}
		/* Now swap the working block */
		if (0 == (reorg_op & NOSWAP))
		{
			t_begin(ERR_MUREORGFAIL, UPDTRNS_DB_UPDATED_MASK);
			/* Following loop is to handle concurrency retries for swap */
			for (; ;)	/* === START OF SWAP LOOP === */
			{
				kill_set_list.used = 0;
				gv_target->clue.end = 0;
				/* search gv_currkey and get the result in gv_target */
				if ((status = gvcst_search(gv_currkey, NULL)) != cdb_sc_normal)
				{
					assert(CDB_STAGNATE > t_tries);
					t_retry(status);
					continue;
				} else if (gv_currkey->end + 1 != gv_target->hist.h[0].curr_rec.match)
				{
					if (SIZEOF(blk_hdr) == ((blk_hdr_ptr_t)gv_target->hist.h[0].buffaddr)->bsiz
						&& 1 == gv_target->hist.depth)
					{
						if (cs_addrs->now_crit)
						{
							t_abort(gv_cur_region, cs_addrs);	/* do crit and other cleanup */
							gtm_putmsg(VARLSTCNT(4) ERR_GBLNOEXIST, 2, gn->str.len, gn->str.addr);
							reorg_finish(dest_blk_id, blks_processed, blks_killed, blks_reused,
								file_extended, lvls_reduced, blks_coalesced, blks_split,
								blks_swapped);
							return TRUE;	/* it is not an error that the global was killed */
						} else
						{
							assert(CDB_STAGNATE > t_tries);
							t_retry(status);
							continue;
						}
					}
				}
				if (gv_target->hist.depth <= level)
					break;
				/* Swap the working block with an appropriate dest_blk_id block.
				 * Histories are sent as gv_target->hist and reorg_gv_target->hist.
				 */
				mu_reorg_in_swap_blk = TRUE;
				status = mu_swap_blk(level, &dest_blk_id, &kill_set_list, exclude_glist_ptr);
				mu_reorg_in_swap_blk = FALSE;
				if (cdb_sc_oprnotneeded == status)
				{
					if (cs_data->trans_hist.total_blks <= dest_blk_id)
					{
						util_out_print("REORG may be incomplete for this global.", TRUE);
						reorg_finish(dest_blk_id, blks_processed, blks_killed, blks_reused,
							file_extended, lvls_reduced, blks_coalesced, blks_split, blks_swapped);
						return TRUE;
					}
				} else if (cdb_sc_normal == status)
				{
					if (0 < kill_set_list.used)
					{
						need_kip_incr = TRUE;
						if (!cs_addrs->now_crit)	/* do not sleep while holding crit */
							WAIT_ON_INHIBIT_KILLS(cs_addrs->nl, MAXWAIT2KILL);
						/* a second history is not needed, because we are reusing a free block,
						 * which does not need a history */
						if ((trans_num)0 == (ret_tn = t_end(&(gv_target->hist), NULL, TN_NOT_SPECIFIED)))
						{
							need_kip_incr = FALSE;
							assert(NULL == kip_csa);
							DECR_BLK_NUM(dest_blk_id);
							continue;
						}
						if (detailed_log)
							log_detailed_log("SWA", &(gv_target->hist), NULL, level, NULL, ret_tn);
						gvcst_kill_sort(&kill_set_list);
						GVCST_BMP_MARK_FREE(&kill_set_list, ret_tn, inctn_mu_reorg,
							inctn_bmp_mark_free_mu_reorg, inctn_opcode, cs_addrs)
						DECR_KIP(cs_data, cs_addrs, kip_csa);
						if (detailed_log)
							log_detailed_log("KIL", &(gv_target->hist), NULL, level,
								&kill_set_list, ret_tn);
						blks_reused += kill_set_list.used;
						blks_killed += kill_set_list.used;
					}
					/* gv_target->hist is the working block's history, and reorg_gv_target->hist is the
					 * destination block's history. Note: gv_target and reorg_gv_target can be part of
					 * different GVTs. */
					else if ((trans_num)0 == (ret_tn = t_end(&(gv_target->hist), &(reorg_gv_target->hist),
							TN_NOT_SPECIFIED)))
					{
						need_kip_incr = FALSE;
						assert(NULL == kip_csa);
						DECR_BLK_NUM(dest_blk_id);
						continue;
					}
					if ((0 >= kill_set_list.used) && detailed_log)
						log_detailed_log("SWA", &(gv_target->hist), &(reorg_gv_target->hist),
							level, NULL, ret_tn);
					blks_swapped++;
					if (reorg_op & SWAPHIST)
						util_out_print("Dest !SL From !SL", TRUE, dest_blk_id,
							gv_target->hist.h[level].blk_num);
				} else
				{
					assert(CDB_STAGNATE > t_tries);
					t_retry(status);
					continue;
				}
				break;
			}	/* === END OF SWAP LOOP === */
			t_abort(gv_cur_region, cs_addrs);	/* do crit and other cleanup */
		}
		if (mu_ctrlc_occurred || mu_ctrly_occurred)
		{
			cs_data->reorg_restart_block = dest_blk_id;
			memcpy(&cs_data->reorg_restart_key[0], &gv_currkey->base[0], gv_currkey->end + 1);
			return FALSE;
		}
		if (end_of_tree)
			break;
		if (0 < level)
			level--;	/* order of reorg is root towards leaf */
		else
		{
			level = pre_order_successor_level;
			memcpy(&gv_currkey->base[0], &gv_currkey_next_reorg->base[0], gv_currkey_next_reorg->end + 1);
			gv_currkey->end = gv_currkey_next_reorg->end;
			cs_data->reorg_restart_block = dest_blk_id;
			memcpy(&cs_data->reorg_restart_key[0], &gv_currkey->base[0], gv_currkey->end + 1);
		}
	}	/* ================ END MAIN LOOP ================ */
	/* =========== START REDUCE LEVEL =========== */
	memcpy(&gv_currkey->base[0], gn->str.addr, gn->str.len);
	gv_currkey->base[gn->str.len] = gv_currkey->base[gn->str.len + 1] = 0;
	gv_currkey->end = gn->str.len + 1;
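	/* Note (added): coalescing can leave an index level, including the root, with a single record. The
	 * loop below repeatedly invokes mu_reduce_level() to collapse such levels and shorten the tree; cnt1
	 * is set only when a reduction commits, so the outer loop exits the first time no level is reduced.
	 */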
	for (;;)	/* reduce level continues until it fails to reduce */
	{
		t_begin(ERR_MUREORGFAIL, UPDTRNS_DB_UPDATED_MASK);
		cnt1 = 0;
		for (; ;)	/* main reduce level loop starts */
		{
			kill_set_list.used = 0;
			gv_target->clue.end = 0;
			/* search gv_currkey and get the result in gv_target */
			if ((status = gvcst_search(gv_currkey, NULL)) != cdb_sc_normal)
			{
				assert(CDB_STAGNATE > t_tries);
				t_retry(status);
				continue;
			} else if (gv_currkey->end + 1 != gv_target->hist.h[0].curr_rec.match)
			{
				if (SIZEOF(blk_hdr) == ((blk_hdr_ptr_t)gv_target->hist.h[0].buffaddr)->bsiz
					&& 1 == gv_target->hist.depth)
				{
					if (cs_addrs->now_crit)
					{
						t_abort(gv_cur_region, cs_addrs);	/* do crit and other cleanup */
						gtm_putmsg(VARLSTCNT(4) ERR_GBLNOEXIST, 2, gn->str.len, gn->str.addr);
						reorg_finish(dest_blk_id, blks_processed, blks_killed, blks_reused,
							file_extended, lvls_reduced, blks_coalesced, blks_split, blks_swapped);
						return TRUE;	/* it is not an error that the global was killed */
					} else
					{
						assert(CDB_STAGNATE > t_tries);
						t_retry(status);
						continue;
					}
				}
			}
			if (gv_target->hist.depth <= level)
				break;
			/* History is passed in gv_target->hist */
			status = mu_reduce_level(&kill_set_list);
			if (cdb_sc_oprnotneeded != status && cdb_sc_normal != status)
			{
				assert(CDB_STAGNATE > t_tries);
				t_retry(status);
				continue;
			} else if (cdb_sc_normal == status)
			{
				assert(0 < kill_set_list.used);
				need_kip_incr = TRUE;
				if (!cs_addrs->now_crit)	/* do not sleep while holding crit */
					WAIT_ON_INHIBIT_KILLS(cs_addrs->nl, MAXWAIT2KILL);
				if ((trans_num)0 == (ret_tn = t_end(&(gv_target->hist), NULL, TN_NOT_SPECIFIED)))
				{
					need_kip_incr = FALSE;
					assert(NULL == kip_csa);
					continue;
				}
				if (detailed_log)
					log_detailed_log("RDL", &(gv_target->hist), NULL, level, NULL, ret_tn);
				gvcst_kill_sort(&kill_set_list);
				GVCST_BMP_MARK_FREE(&kill_set_list, ret_tn, inctn_mu_reorg,
					inctn_bmp_mark_free_mu_reorg, inctn_opcode, cs_addrs)
				DECR_KIP(cs_data, cs_addrs, kip_csa);
				if (detailed_log)
					log_detailed_log("KIL", &(gv_target->hist), NULL, level, &kill_set_list, ret_tn);
				blks_reused += kill_set_list.used;
				blks_killed += kill_set_list.used;
				cnt1 = 1;
				lvls_reduced++;
			}
			break;
		}	/* main reduce level loop ends */
		t_abort(gv_cur_region, cs_addrs);	/* do crit and other cleanup */
		if (0 == cnt1)
			break;
	}	/* =========== END REDUCE LEVEL =========== */
	reorg_finish(dest_blk_id, blks_processed, blks_killed, blks_reused, file_extended, lvls_reduced,
		blks_coalesced, blks_split, blks_swapped);
	return TRUE;
}	/* end mu_reorg() */
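/* Return convention for mu_reorg() (summary added for clarity): TRUE when the global was fully processed
 * or no longer exists; FALSE on errors (read-only region, tree exceeding MAXBTLEVEL during a split) or when
 * a ctrl-c/ctrl-y interrupt occurred, in which case reorg_restart_block and reorg_restart_key saved in the
 * file header allow a later resume.
 */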