/* This routine reads a PINI record from a Journal file. * If it is not already in the hash table, it saves it in the hash table, * For success pplst = (pointer to the pini_list_struct structure) is updated. * For success it returns SS_NORMAL. Else error code is returned. */ uint4 mur_get_pini(jnl_ctl_list *jctl, off_jnl_t pini_addr, pini_list_struct **pplst) { pini_list_struct *plst; struct_jrec_pini *pini_rec; uint4 status; ht_ent_int4 *tabent; struct_jrec_pini *pinirec; reg_ctl_list *rctl; mur_read_desc_t *mur_desc; if (NULL != (tabent = lookup_hashtab_int4(&jctl->pini_list, (uint4 *)&pini_addr))) plst = tabent->value; else plst = NULL; if (NULL != plst) { *pplst = plst; return SS_NORMAL; } rctl = jctl->reg_ctl; mur_desc = rctl->mur_desc; mur_desc->random_buff.dskaddr = ROUND_DOWN2(pini_addr, DISK_BLOCK_SIZE); mur_desc->random_buff.blen = pini_addr - mur_desc->random_buff.dskaddr + PINI_RECLEN; if (mur_desc->random_buff.dskaddr > jctl->eof_addr - mur_desc->random_buff.blen || (SS_NORMAL != (status = mur_read(jctl)))) { if (mur_options.update && jctl->after_end_of_data && !jgbl.forw_phase_recovery) return ERR_JNLBADRECFMT; gtm_putmsg_csa(CSA_ARG(rctl->csa) VARLSTCNT(5) ERR_JNLBADRECFMT, 3, jctl->jnl_fn_len, jctl->jnl_fn, jctl->rec_offset); gtm_putmsg_csa(CSA_ARG(rctl->csa) VARLSTCNT(5) ERR_JNLREAD, 3, jctl->jnl_fn_len, jctl->jnl_fn, pini_addr); assert(FALSE); murgbl.wrn_count++; PROCEED_IF_EXTRACT_SHOW_VERIFY(jctl, pini_addr, plst, pplst); } pinirec = (struct_jrec_pini *)(mur_desc->random_buff.base + (pini_addr - mur_desc->random_buff.dskaddr)); /* Verify that it's actually a PINI record */ if (JRT_PINI != pinirec->prefix.jrec_type || PINI_RECLEN != pinirec->prefix.forwptr || !IS_VALID_JNLREC((jnl_record *)pinirec, jctl->jfh)) { if (mur_options.update && jctl->after_end_of_data && !jgbl.forw_phase_recovery) return ERR_JNLBADRECFMT; gtm_putmsg_csa(CSA_ARG(rctl->csa) VARLSTCNT(5) ERR_JNLBADRECFMT, 3, jctl->jnl_fn_len, jctl->jnl_fn, jctl->rec_offset); if (JRT_PINI 
!= pinirec->prefix.jrec_type) gtm_putmsg_csa(CSA_ARG(rctl->csa) VARLSTCNT(5) ERR_NOPINI, 3, jctl->jnl_fn_len, jctl->jnl_fn, pini_addr); assert(FALSE); murgbl.wrn_count++; PROCEED_IF_EXTRACT_SHOW_VERIFY(jctl, pini_addr, plst, pplst); } /* Insert it into the list */ plst = (pini_list_struct *)get_new_element(murgbl.pini_buddy_list, 1); plst->pini_addr = pini_addr; plst->new_pini_addr = 0; plst->state = IGNORE_PROC; memcpy(&plst->jpv, &pinirec->process_vector[CURR_JPV], SIZEOF(jnl_process_vector)); memcpy(&plst->origjpv, &pinirec->process_vector[ORIG_JPV], SIZEOF(jnl_process_vector)); plst->pini_jpv_time = plst->jpv.jpv_time; /* save copy just in case jpv->jpv_time gets * changed in forward phase of journal recovery */ NON_GTM64_ONLY(assert(SIZEOF(void *) == SIZEOF(pini_addr));)
/* get_lmap: read the local bitmap block that covers "blk" and extract the map entry for "blk".
 *
 * Input Parameter:
 *	blk:	block id whose local bitmap entry is wanted
 * Output Parameters:
 *	bits:	two-bit local bitmap value for "blk" (not written if the read fails)
 *	cycle:	passed through to t_qread, which fills it in
 *	cr:	passed through to t_qread, which fills it in
 * Returns:
 *	the buffer address t_qread returned for the local bitmap block;
 *	NULL if t_qread failed
 */
sm_uc_ptr_t get_lmap (block_id blk, unsigned char *bits, sm_int_ptr_t cycle, cache_rec_ptr_ptr_t cr)
{
	sm_uc_ptr_t	lmap_buff, entry_byte;
	block_id	lmap_blk, blk_in_map;
	unsigned char	map_val;
	int		slot;

	error_def(ERR_DSEBLKRDFAIL);

	lmap_blk = ROUND_DOWN2(blk, BLKS_PER_LMAP);	/* the bitmap block is the first block of its BLKS_PER_LMAP group */
	blk_in_map = blk - lmap_blk;
	lmap_buff = t_qread(lmap_blk, cycle, cr);
	if (!lmap_buff)
		return lmap_buff;			/* read failed: leave *bits alone */
	/* Locate the byte, past the block header, that holds the entry for "blk" */
	entry_byte = lmap_buff + SIZEOF(blk_hdr) + (blk_in_map * BML_BITS_PER_BLK) / 8;
	map_val = *entry_byte;
	/* Position of the entry within its byte; only positions 1 through 3 need shifting down */
	slot = blk % (8 / BML_BITS_PER_BLK);
	if ((1 <= slot) && (3 >= slot))
		map_val = map_val >> (slot * BML_BITS_PER_BLK);
	*bits = map_val & 3;				/* keep only the low two bits */
	return lmap_buff;
}
trans_num gvcst_bmp_mark_free(kill_set *ks) { block_id bit_map, next_bm, *updptr; blk_ident *blk, *blk_top, *nextblk; trans_num ctn, start_db_fmt_tn; unsigned int len; # if defined(UNIX) && defined(DEBUG) unsigned int lcl_t_tries; # endif int4 blk_prev_version; srch_hist alt_hist; trans_num ret_tn = 0; boolean_t visit_blks; srch_blk_status bmphist; cache_rec_ptr_t cr; enum db_ver ondsk_blkver; enum cdb_sc status; DCL_THREADGBL_ACCESS; SETUP_THREADGBL_ACCESS; TREF(in_gvcst_bmp_mark_free) = TRUE; assert(inctn_bmp_mark_free_gtm == inctn_opcode || inctn_bmp_mark_free_mu_reorg == inctn_opcode); /* Note down the desired_db_format_tn before you start relying on cs_data->fully_upgraded. * If the db is fully_upgraded, take the optimal path that does not need to read each block being freed. * But in order to detect concurrent desired_db_format changes, note down the tn (when the last format change occurred) * before the fully_upgraded check and after having noted down the database current_tn. * If they are the same, then we are guaranteed no concurrent desired_db_format change occurred. * If they are not, then fall through to the non-optimal path where each to-be-killed block has to be visited. * The reason we need to visit every block in case desired_db_format changes is to take care of the case where * MUPIP REORG DOWNGRADE concurrently changes a block that we are about to free. */ start_db_fmt_tn = cs_data->desired_db_format_tn; visit_blks = (!cs_data->fully_upgraded); /* Local evaluation */ assert(!visit_blks || (visit_blks && dba_bg == cs_addrs->hdr->acc_meth)); /* must have blks_to_upgrd == 0 for non-BG */ assert(!dollar_tlevel); /* Should NOT be in TP now */ blk = &ks->blk[0]; blk_top = &ks->blk[ks->used]; if (!visit_blks) { /* Database has been completely upgraded. Free all blocks in one bitmap as part of one transaction. 
*/ assert(cs_data->db_got_to_v5_once); /* assert all V4 fmt blocks (including RECYCLED) have space for V5 upgrade */ inctn_detail.blknum_struct.blknum = 0; /* to indicate no adjustment to "blks_to_upgrd" necessary */ /* If any of the mini transaction below restarts because of an online rollback, we don't want the application * refresh to happen (like $ZONLNRLBK++ or rts_error(DBROLLEDBACK). This is because, although we are currently in * non-tp (dollar_tleve = 0), we could actually be in a TP transaction and have actually faked dollar_tlevel. In * such a case, we should NOT * be issuing a DBROLLEDBACK error as TP transactions are supposed to just restart in * case of an online rollback. So, set the global variable that gtm_onln_rlbk_clnup can check and skip doing the * application refresh, but will reset the clues. The next update will see the cycle mismatch and will accordingly * take the right action. */ for ( ; blk < blk_top; blk = nextblk) { if (0 != blk->flag) { nextblk = blk + 1; continue; } assert(0 < blk->block); assert((int4)blk->block < cs_addrs->ti->total_blks); bit_map = ROUND_DOWN2((int)blk->block, BLKS_PER_LMAP); next_bm = bit_map + BLKS_PER_LMAP; CHECK_AND_RESET_UPDATE_ARRAY; /* reset update_array_ptr to update_array */ /* Scan for the next local bitmap */ updptr = (block_id *)update_array_ptr; for (nextblk = blk; (0 == nextblk->flag) && (nextblk < blk_top) && ((block_id)nextblk->block < next_bm); ++nextblk) { assert((block_id)nextblk->block - bit_map); *updptr++ = (block_id)nextblk->block - bit_map; } len = (unsigned int)((char *)nextblk - (char *)blk); update_array_ptr = (char *)updptr; alt_hist.h[0].blk_num = 0; /* need for calls to T_END for bitmaps */ alt_hist.h[0].blk_target = NULL; /* need to initialize for calls to T_END */ /* the following assumes SIZEOF(blk_ident) == SIZEOF(int) */ assert(SIZEOF(blk_ident) == SIZEOF(int)); *(int *)update_array_ptr = 0; t_begin(ERR_GVKILLFAIL, UPDTRNS_DB_UPDATED_MASK); for (;;) { ctn = cs_addrs->ti->curr_tn; 
/* Need a read fence before reading fields from cs_data as we are reading outside * of crit and relying on this value to detect desired db format state change. */ SHM_READ_MEMORY_BARRIER; if (start_db_fmt_tn != cs_data->desired_db_format_tn) { /* Concurrent db format change has occurred. Need to visit every block to be killed * to determine its block format. Fall through to the non-optimal path below */ ret_tn = 0; break; } bmphist.blk_num = bit_map; if (NULL == (bmphist.buffaddr = t_qread(bmphist.blk_num, (sm_int_ptr_t)&bmphist.cycle, &bmphist.cr))) { t_retry((enum cdb_sc)rdfail_detail); continue; } t_write_map(&bmphist, (uchar_ptr_t)update_array, ctn, -(int4)(nextblk - blk)); UNIX_ONLY(DEBUG_ONLY(lcl_t_tries = t_tries)); if ((trans_num)0 == (ret_tn = t_end(&alt_hist, NULL, TN_NOT_SPECIFIED))) { # ifdef UNIX assert((CDB_STAGNATE == t_tries) || (lcl_t_tries == t_tries - 1)); status = LAST_RESTART_CODE; if ((cdb_sc_onln_rlbk1 == status) || (cdb_sc_onln_rlbk2 == status) || TREF(rlbk_during_redo_root)) { /* t_end restarted due to online rollback. Discard bitmap free-up and return control * to the application. But, before that reset only_reset_clues_if_onln_rlbk to FALSE */ TREF(in_gvcst_bmp_mark_free) = FALSE; send_msg(VARLSTCNT(6) ERR_IGNBMPMRKFREE, 4, REG_LEN_STR(gv_cur_region), DB_LEN_STR(gv_cur_region)); t_abort(gv_cur_region, cs_addrs); return ret_tn; /* actually 0 */ } # endif continue; } break; } if (0 == ret_tn) /* db format change occurred. Fall through to below for loop to visit each block */ { /* Abort any active transaction to get rid of lingering Non-TP artifacts */ t_abort(gv_cur_region, cs_addrs); break; } } } /* for all blocks in the kill_set */
trans_num gvcst_bmp_mark_free(kill_set *ks) { block_id bit_map, next_bm, *updptr; blk_ident *blk, *blk_top, *nextblk; trans_num ctn, start_db_fmt_tn; unsigned int len; int4 blk_prev_version; srch_hist alt_hist; trans_num ret_tn = 0; boolean_t visit_blks; srch_blk_status bmphist; cache_rec_ptr_t cr; enum db_ver ondsk_blkver; error_def(ERR_GVKILLFAIL); assert(inctn_bmp_mark_free_gtm == inctn_opcode || inctn_bmp_mark_free_mu_reorg == inctn_opcode); /* Note down the desired_db_format_tn before you start relying on cs_data->fully_upgraded. * If the db is fully_upgraded, take the optimal path that does not need to read each block being freed. * But in order to detect concurrent desired_db_format changes, note down the tn (when the last format change occurred) * before the fully_upgraded check and after having noted down the database current_tn. * If they are the same, then we are guaranteed no concurrent desired_db_format change occurred. * If they are not, then fall through to the non-optimal path where each to-be-killed block has to be visited. * The reason we need to visit every block in case desired_db_format changes is to take care of the case where * MUPIP REORG DOWNGRADE concurrently changes a block that we are about to free. */ start_db_fmt_tn = cs_data->desired_db_format_tn; visit_blks = (!cs_data->fully_upgraded); /* Local evaluation */ assert(!visit_blks || (visit_blks && dba_bg == cs_addrs->hdr->acc_meth)); /* must have blks_to_upgrd == 0 for non-BG */ assert(!dollar_tlevel); /* Should NOT be in TP now */ blk = &ks->blk[0]; blk_top = &ks->blk[ks->used]; if (!visit_blks) { /* Database has been completely upgraded. Free all blocks in one bitmap as part of one transaction. 
*/ assert(cs_data->db_got_to_v5_once); /* assert all V4 fmt blocks (including RECYCLED) have space for V5 upgrade */ inctn_detail.blknum_struct.blknum = 0; /* to indicate no adjustment to "blks_to_upgrd" necessary */ for ( ; blk < blk_top; blk = nextblk) { if (0 != blk->flag) { nextblk = blk + 1; continue; } assert(0 < blk->block); assert((int4)blk->block < cs_addrs->ti->total_blks); bit_map = ROUND_DOWN2((int)blk->block, BLKS_PER_LMAP); next_bm = bit_map + BLKS_PER_LMAP; CHECK_AND_RESET_UPDATE_ARRAY; /* reset update_array_ptr to update_array */ /* Scan for the next local bitmap */ updptr = (block_id *)update_array_ptr; for (nextblk = blk; (0 == nextblk->flag) && (nextblk < blk_top) && ((block_id)nextblk->block < next_bm); ++nextblk) { assert((block_id)nextblk->block - bit_map); *updptr++ = (block_id)nextblk->block - bit_map; } len = (unsigned int)((char *)nextblk - (char *)blk); update_array_ptr = (char *)updptr; alt_hist.h[0].blk_num = 0; /* need for calls to T_END for bitmaps */ /* the following assumes SIZEOF(blk_ident) == SIZEOF(int) */ assert(SIZEOF(blk_ident) == SIZEOF(int)); *(int *)update_array_ptr = 0; t_begin(ERR_GVKILLFAIL, UPDTRNS_DB_UPDATED_MASK); for (;;) { ctn = cs_addrs->ti->curr_tn; /* Need a read fence before reading fields from cs_data as we are reading outside * of crit and relying on this value to detect desired db format state change. */ SHM_READ_MEMORY_BARRIER; if (start_db_fmt_tn != cs_data->desired_db_format_tn) { /* Concurrent db format change has occurred. Need to visit every block to be killed * to determine its block format. 
Fall through to the non-optimal path below */ ret_tn = 0; break; } bmphist.blk_num = bit_map; if (NULL == (bmphist.buffaddr = t_qread(bmphist.blk_num, (sm_int_ptr_t)&bmphist.cycle, &bmphist.cr))) { t_retry((enum cdb_sc)rdfail_detail); continue; } t_write_map(&bmphist, (uchar_ptr_t)update_array, ctn, -(int4)(nextblk - blk)); if ((trans_num)0 == (ret_tn = t_end(&alt_hist, NULL, TN_NOT_SPECIFIED))) continue; break; } if (0 == ret_tn) /* db format change occurred. Fall through to below for loop to visit each block */ break; } } /* for all blocks in the kill_set */
/* bm_setmap: set the local bitmap entry of one block to busy or free in a transaction of its own.
 *
 * Parameters:
 *	bml:	block number of the local bitmap that covers "blk" (asserted to be ROUND_DOWN2(blk, bplmap))
 *	blk:	block whose bitmap entry is to be changed
 *	busy:	non-zero to mark "blk" busy, zero to mark it free
 *
 * The transaction is started with t_begin_crit(ERR_DSEFAIL) and t_end is retried until it returns
 * a non-zero transaction number. Nothing is returned to the caller.
 */
void bm_setmap(block_id bml, block_id blk, int4 busy)
{
	sm_uc_ptr_t	bmp;
	trans_num	ctn;
	srch_hist	alt_hist;
	srch_blk_status	blkhist; /* block-history to fill in for t_write_map which uses "blk_num", "buffaddr", "cr", "cycle" */
	cw_set_element	*cse;
	int		lbm_status;	/* local bitmap status of input "blk" i.e. BUSY or FREE or RECYCLED */
	int4		reference_cnt;	/* -1, 0 or +1 : handed to t_write_map; zero means no state transition */
	uint4		bitnum;

	error_def(ERR_DSEFAIL);

	t_begin_crit(ERR_DSEFAIL);
	ctn = cs_addrs->ti->curr_tn;
	/* NOTE(review): if t_retry returns here, "bmp" is NULL and is dereferenced below -- confirm t_retry
	 * does not return to this point when t_qread fails.
	 */
	if (!(bmp = t_qread(bml, &blkhist.cycle, &blkhist.cr)))
		t_retry((enum cdb_sc)rdfail_detail);
	blkhist.blk_num = bml;
	blkhist.buffaddr = bmp;
	alt_hist.h[0].blk_num = 0;	/* Need for calls to T_END for bitmaps */
	CHECK_AND_RESET_UPDATE_ARRAY;	/* reset update_array_ptr to update_array */
	bitnum = blk - bml;
	/* Find out current status in order to determine if there is going to be a state transition */
	assert(ROUND_DOWN2(blk, cs_data->bplmap) == bml);
	GET_BM_STATUS(bmp, bitnum, lbm_status);
	switch(lbm_status)
	{
		case BLK_BUSY:
			reference_cnt = busy ? 0 : -1;	/* busy -> free is the only transition from BUSY */
			break;
		case BLK_FREE:
		case BLK_MAPINVALID:
		case BLK_RECYCLED:
			assert(BLK_MAPINVALID != lbm_status);
			reference_cnt = busy ? 1 : 0;	/* not-busy -> busy is the only transition from here */
			break;
		default:
			assert(FALSE);
			/* Without this assignment "reference_cnt" would be read uninitialized below whenever
			 * assert compiles to nothing. Treat an unrecognized status as "no state transition".
			 */
			reference_cnt = 0;
			break;
	}
	if (reference_cnt)
	{	/* Initialize update array with non-zero bitnum only if reference_cnt is non-zero. */
		assert(bitnum);
		*((block_id_ptr_t)update_array_ptr) = bitnum;
		update_array_ptr += sizeof(block_id);
	}
	/* Terminate update array unconditionally with zero bitnum. */
	*((block_id_ptr_t)update_array_ptr) = 0;
	update_array_ptr += sizeof(block_id);
	t_write_map(&blkhist, (uchar_ptr_t)update_array, ctn, reference_cnt);
	if (JNL_ENABLED(cs_data))
	{	/* Build the updated bitmap image up front in the non-TP journal buffer and flag the
		 * cw_set element as already built.
		 */
		cse = (cw_set_element *)(&cw_set[0]);
		cse->new_buff = non_tp_jfb_buff_ptr;
		memcpy(non_tp_jfb_buff_ptr, bmp, ((blk_hdr_ptr_t)bmp)->bsiz);
		gvcst_map_build((uint4 *)cse->upd_addr, (uchar_ptr_t)cse->new_buff, cse, cs_addrs->ti->curr_tn);
		cse->done = TRUE;
	}
	/* Call t_end till it succeeds or aborts (error will be reported) */
	while ((trans_num)0 == t_end(&alt_hist, 0))
		;
	return;
}
/* bm_getfree: find a free block, starting the search near a hint, and record its allocation in a local bitmap update.
 *
 * Parameters:
 *	orig_hint:	block number to start searching from; reset to 1 if it is >= the current total block count
 *	blk_used:	passed through to bm_find_blk
 *	cw_work:	index of the first cw_set element to examine when looking for a bitmap already in the cw_set
 *	cs:		base of the cw_set (indexed directly in non-TP, handed to tp_get_cw in TP)
 *	cw_depth_ptr:	points to the number of cw_set elements currently in use
 *
 * Returns:
 *	bml + free_bit (the allocated block number) on success;
 *	the status from gdsfilext if a file extension attempt does not return SS_NORMAL;
 *	FILE_EXTENDED after a successful extension when the access method is MM;
 *	MAP_RD_FAIL if the bitmap cannot be read, its header is not that of a local bitmap,
 *		or bm_find_blk returns MAP_RD_FAIL.
 *
 * NOTE(review): "depth" is int while "cw_work" is unsigned int, so the "depth >= cw_work" / "depth < cw_work"
 * tests in the non-TP search compare as unsigned. This looks safe only if cw_work is never 0 -- confirm with callers.
 * NOTE(review): depth == 0 doubles as the "bitmap not found in cw_set" marker.
 */
block_id bm_getfree(block_id orig_hint, boolean_t *blk_used, unsigned int cw_work, cw_set_element *cs, int *cw_depth_ptr)
{
	cw_set_element	*cs1;
	sm_uc_ptr_t	bmp;
	block_id	bml, hint, hint_cycled, hint_limit;
	block_id_ptr_t	b_ptr;
	int		cw_set_top, depth, lcnt;
	unsigned int	local_maps, map_size, n_decrements = 0, total_blks;
	trans_num	ctn;
	int4		free_bit, offset;
	uint4		space_needed;
	uint4		status;
	srch_blk_status	blkhist;

	total_blks = (dba_mm == cs_data->acc_meth) ? cs_addrs->total_blks : cs_addrs->ti->total_blks;
	if (orig_hint >= total_blks)		/* for TP, hint can be > total_blks */
		orig_hint = 1;
	hint = orig_hint;
	hint_cycled = DIVIDE_ROUND_UP(total_blks, BLKS_PER_LMAP);	/* number of local maps; lowered once a wrap is seen */
	hint_limit = DIVIDE_ROUND_DOWN(orig_hint, BLKS_PER_LMAP);	/* index of the local map holding the original hint */
	local_maps = hint_cycled + 2;	/* for (up to) 2 wraps */
	for (lcnt = 0; lcnt <= local_maps; lcnt++)
	{
		bml = bmm_find_free(hint / BLKS_PER_LMAP, (sm_uc_ptr_t)MM_ADDR(cs_data), local_maps);
		if ((NO_FREE_SPACE == bml) || (bml >= hint_cycled))
		{	/* if no free space or might have looped to original map, extend */
			if ((NO_FREE_SPACE != bml) && (hint_limit < hint_cycled))
			{	/* first wrap: restart from block 1 and stop once the original hint's map is reached */
				hint_cycled = hint_limit;
				hint = 1;
				continue;
			}
			if (SS_NORMAL != (status = gdsfilext(cs_data->extension_size, total_blks)))
				return (status);
			if (dba_mm == cs_data->acc_meth)
				return (FILE_EXTENDED);
			hint = total_blks;	/* resume the search at the start of the newly added area */
			total_blks = cs_addrs->ti->total_blks;
			hint_cycled = DIVIDE_ROUND_UP(total_blks, BLKS_PER_LMAP);
			local_maps = hint_cycled + 2;	/* for (up to) 2 wraps */
			/*
			 * note that you can make an optimization of not going back over the whole database and going over
			 * only the extended section. but since it is very unlikely that a free block won't be found
			 * in the extended section and the fact that we are starting from the extended section in either
			 * approach and the fact that we have a GTMASSERT to check that we don't have a lot of
			 * free blocks while doing an extend and the fact that it is very easy to make the change to do
			 * a full-pass, the full-pass solution is currently being implemented
			 */
			lcnt = -1;	/* allow it one extra pass to ensure that it can take advantage of the extension */
			n_decrements++;	/* used only for debugging purposes */
			continue;
		}
		bml *= BLKS_PER_LMAP;	/* convert local map index into the block number of that local bitmap */
		if (ROUND_DOWN2(hint, BLKS_PER_LMAP) != bml)
		{	/* not within requested map */
			if ((bml < hint) && (hint_cycled))	/* wrap? - second one should force an extend for sure */
				hint_cycled = (hint_limit < hint_cycled) ? hint_limit: 0;
			hint = bml + 1;				/* start at beginning */
		}
		/* The last local map may cover fewer than BLKS_PER_LMAP blocks */
		if (ROUND_DOWN2(total_blks, BLKS_PER_LMAP) == bml)
			map_size = (total_blks - bml);
		else
			map_size = BLKS_PER_LMAP;
		/* Look for this bitmap among the cw_set elements of the current transaction */
		if (0 != dollar_tlevel)
		{
			depth = cw_work;
			cw_set_top = *cw_depth_ptr;
			if (depth < cw_set_top)
				tp_get_cw(cs, cw_work, &cs1);
			for (; depth < cw_set_top; depth++, cs1 = cs1->next_cw_set)
			{	/* do tp front to back because list is more efficient than tp_get_cw and forward pointers exist */
				if (bml == cs1->blk)
				{
					TRAVERSE_TO_LATEST_CSE(cs1);
					break;
				}
			}
			if (depth >= cw_set_top)
			{	/* not found */
				assert(cw_set_top == depth);
				depth = 0;
			}
		} else
		{
			for (depth = *cw_depth_ptr - 1; depth >= cw_work; depth--)
			{	/* do non-tp back to front, because of adjacency */
				if (bml == (cs + depth)->blk)
				{
					cs1 = cs + depth;
					break;
				}
			}
			if (depth < cw_work)
			{	/* not found */
				assert(cw_work - 1 == depth);
				depth = 0;
			}
		}
		if (0 == depth)
		{	/* bitmap is not in the cw_set: read it and sanity check its header */
			ctn = cs_addrs->ti->curr_tn;
			if (!(bmp = t_qread(bml, (sm_int_ptr_t)&blkhist.cycle, &blkhist.cr)))
				return MAP_RD_FAIL;
			if ((BM_SIZE(BLKS_PER_LMAP) != ((blk_hdr_ptr_t)bmp)->bsiz) || (LCL_MAP_LEVL != ((blk_hdr_ptr_t)bmp)->levl))
			{
				assert(CDB_STAGNATE > t_tries);
				rdfail_detail = cdb_sc_badbitmap;
				return MAP_RD_FAIL;
			}
			offset = 0;
		} else
		{	/* bitmap is already in the cw_set: search past the last bit recorded in its update array */
			bmp = cs1->old_block;
			b_ptr = (block_id_ptr_t)(cs1->upd_addr);
			b_ptr += cs1->reference_cnt - 1;
			offset = *b_ptr + 1;
		}
		if (offset < map_size)
		{
			free_bit = bm_find_blk(offset, (sm_uc_ptr_t)bmp + sizeof(blk_hdr), map_size, blk_used);
			if (MAP_RD_FAIL == free_bit)
				return MAP_RD_FAIL;
		} else
			free_bit = NO_FREE_SPACE;
		if (NO_FREE_SPACE != free_bit)
			break;
		if ((hint = bml + BLKS_PER_LMAP) >= total_blks)		/* if map is full, start at 1st blk in next map */
		{	/* wrap - second one should force an extend for sure */
			hint = 1;
			if (hint_cycled)
				hint_cycled = (hint_limit < hint_cycled) ? hint_limit: 0;
		}
		if ((0 == depth) && (FALSE != cs_addrs->now_crit))	/* if it's from the cw_set, its state is murky */
			bit_clear(bml / BLKS_PER_LMAP, MM_ADDR(cs_data));	/* if crit, repair master map error */
	}
	/* If not in the final retry, it is possible that free_bit is >= map_size (e.g. if bitmap block gets recycled). */
	if (map_size <= (uint4)free_bit && CDB_STAGNATE <= t_tries)
	{	/* bad free bit */
		assert((NO_FREE_SPACE == free_bit) && (lcnt > local_maps));	/* All maps full, should have extended */
		GTMASSERT;
	}
	if (0 != depth)
	{	/* append the new bit to the update array of the existing cw_set element and bump its count */
		b_ptr = (block_id_ptr_t)(cs1->upd_addr);
		b_ptr += cs1->reference_cnt++;
		*b_ptr = free_bit;
	} else
	{	/* create a zero-filled update array holding just this bit and queue the bitmap update */
		space_needed = (BLKS_PER_LMAP + 1) * sizeof(block_id);
		if (dollar_tlevel)
		{
			ENSURE_UPDATE_ARRAY_SPACE(space_needed);	/* have brackets for "if" for macros */
		}
		BLK_ADDR(b_ptr, space_needed, block_id);
		memset(b_ptr, 0, space_needed);
		*b_ptr = free_bit;
		blkhist.blk_num = bml;
		blkhist.buffaddr = bmp;	/* cycle and cr have already been assigned from t_qread */
		t_write_map(&blkhist, (uchar_ptr_t)b_ptr, ctn, 1); /* last parameter 1 is what cs->reference_cnt gets set to */
	}
	return bml + free_bit;
}
/* dse_maps: act on the command qualifiers BUSY, FREE, MASTER, RESTORE_ALL and BLOCK.
 *
 *	BLOCK		selects the block to act on (otherwise patch_curr_blk is used) and becomes the new patch_curr_blk
 *	FREE / BUSY	call bm_setmap to mark the block free / busy in its local bitmap
 *	MASTER		sets or clears the master map bit of the block's local bitmap according to whether
 *			bml_find_free finds free space in that local bitmap
 *	RESTORE_ALL	builds a fresh set of local bitmaps in memory, writes each one to the database and
 *			refills the master map from them
 *	(none of the above) prints whether the block is marked free or busy in its local bitmap
 *
 * The four updating qualifiers raise ERR_DBRDONLY if the region is read-only.
 * The first matching qualifier, tested in the order FREE, BUSY, MASTER, RESTORE_ALL, decides the action.
 */
void dse_maps(void)
{
	block_id		blk, bml_blk;
	blk_segment		*bs1, *bs_ptr;
	int4			blk_seg_cnt, blk_size;		/* needed for BLK_INIT, BLK_SEG and BLK_FINI macros */
	sm_uc_ptr_t		bp;
	char			util_buff[MAX_UTIL_LEN];
	int4			bml_size, bml_list_size, blk_index, bml_index;
	int4			total_blks, blks_in_bitmap;
	int4			bplmap, dummy_int;
	unsigned char		*bml_list;
	cache_rec_ptr_t		cr, dummy_cr;
	bt_rec_ptr_t		btr;
	int			util_len;
	uchar_ptr_t		blk_ptr;
	boolean_t		was_crit;
	uint4			jnl_status;
	srch_blk_status		blkhist;
	jnl_private_control	*jpc;
	jnl_buffer_ptr_t	jbp;
	sgmnt_addrs		*csa;
	sgmnt_data_ptr_t	csd;

	/* Any qualifier that updates the database is refused on a read-only region */
	if (CLI_PRESENT == cli_present("BUSY") || CLI_PRESENT == cli_present("FREE") ||
		CLI_PRESENT == cli_present("MASTER") || CLI_PRESENT == cli_present("RESTORE_ALL"))
	{
		if (gv_cur_region->read_only)
			rts_error(VARLSTCNT(4) ERR_DBRDONLY, 2, DB_LEN_STR(gv_cur_region));
	}
	CHECK_AND_RESET_UPDATE_ARRAY;	/* reset update_array_ptr to update_array */
	csa = cs_addrs;
	assert(&FILE_INFO(gv_cur_region)->s_addrs == csa);
	was_crit = csa->now_crit;	/* remembered so crit is only grabbed/released here if not already held */
	if (csa->critical)
		crash_count = csa->critical->crashcnt;
	csd = csa->hdr;
	bplmap = csd->bplmap;
	/* Determine the block to operate on */
	if (CLI_PRESENT == cli_present("BLOCK"))
	{
		if (!cli_get_hex("BLOCK", (uint4 *)&blk))
			return;
		if (blk < 0 || blk >= csa->ti->total_blks)
		{
			util_out_print("Error: invalid block number.", TRUE);
			return;
		}
		patch_curr_blk = blk;
	} else
		blk = patch_curr_blk;
	if (CLI_PRESENT == cli_present("FREE"))
	{
		if (0 == bplmap)
		{
			util_out_print("Cannot perform map updates:  bplmap field of file header is zero.", TRUE);
			return;
		}
		if (blk / bplmap * bplmap == blk)
		{	/* a multiple of bplmap is itself a local bitmap block */
			util_out_print("Cannot perform action on a map block.", TRUE);
			return;
		}
		bml_blk = blk / bplmap * bplmap;
		bm_setmap(bml_blk, blk, FALSE);
		return;
	}
	if (CLI_PRESENT == cli_present("BUSY"))
	{
		if (0 == bplmap)
		{
			util_out_print("Cannot perform map updates:  bplmap field of file header is zero.", TRUE);
			return;
		}
		if (blk / bplmap * bplmap == blk)
		{	/* a multiple of bplmap is itself a local bitmap block */
			util_out_print("Cannot perform action on a map block.", TRUE);
			return;
		}
		bml_blk = blk / bplmap * bplmap;
		bm_setmap(bml_blk, blk, TRUE);
		return;
	}
	blk_size = csd->blk_size;
	if (CLI_PRESENT == cli_present("MASTER"))
	{
		if (0 == bplmap)
		{
			util_out_print("Cannot perform maps updates:  bplmap field of file header is zero.", TRUE);
			return;
		}
		if (!was_crit)
			grab_crit(gv_cur_region);
		bml_blk = blk / bplmap * bplmap;
		/* Get the address of the local bitmap: directly for MM, through t_qread for BG */
		if (dba_mm == csd->acc_meth)
			bp = MM_BASE_ADDR(csa) + (off_t)bml_blk * blk_size;
		else
		{
			assert(dba_bg == csd->acc_meth);
			if (!(bp = t_qread(bml_blk, &dummy_int, &dummy_cr)))
				rts_error(VARLSTCNT(1) ERR_DSEBLKRDFAIL);
		}
		/* The last local bitmap may cover fewer than bplmap blocks */
		if ((csa->ti->total_blks / bplmap) * bplmap == bml_blk)
			total_blks = (csa->ti->total_blks - bml_blk);
		else
			total_blks = bplmap;
		/* Master map bit is cleared when the local bitmap has no free space and set otherwise */
		if (NO_FREE_SPACE == bml_find_free(0, bp + SIZEOF(blk_hdr), total_blks))
			bit_clear(bml_blk / bplmap, csa->bmm);
		else
			bit_set(bml_blk / bplmap, csa->bmm);
		if (bml_blk > csa->nl->highest_lbm_blk_changed)
			csa->nl->highest_lbm_blk_changed = bml_blk;
		if (!was_crit)
			rel_crit(gv_cur_region);
		return;
	}
	if (CLI_PRESENT == cli_present("RESTORE_ALL"))
	{
		if (0 == bplmap)
		{
			util_out_print("Cannot perform maps updates:  bplmap field of file header is zero.", TRUE);
			return;
		}
		total_blks = csa->ti->total_blks;
		assert(ROUND_DOWN2(blk_size, 2 * SIZEOF(int4)) == blk_size);
		/* Allocate one in-memory bitmap image per local map and initialize each with bml_newmap */
		bml_size = BM_SIZE(bplmap);
		bml_list_size = (total_blks + bplmap - 1) / bplmap * bml_size;
		bml_list = (unsigned char *)malloc(bml_list_size);
		for (blk_index = 0, bml_index = 0;  blk_index < total_blks;  blk_index += bplmap, bml_index++)
			bml_newmap((blk_hdr_ptr_t)(bml_list + bml_index * bml_size), bml_size, csa->ti->curr_tn);
		if (!was_crit)
		{
			grab_crit(gv_cur_region);
			csa->hold_onto_crit = TRUE;	/* need to do this AFTER grab_crit */
		}
		blk = get_dir_root();
		assert(blk < bplmap);
		/* Start the free count at every block except one bitmap block per local map, then account for the
		 * directory tree root, which is marked busy in the first bitmap image.
		 */
		csa->ti->free_blocks = total_blks - DIVIDE_ROUND_UP(total_blks, bplmap);
		bml_busy(blk, bml_list + SIZEOF(blk_hdr));
		csa->ti->free_blocks =  csa->ti->free_blocks - 1;
		/* NOTE(review): presumably dse_m_rest walks the tree from "blk", marking blocks busy in bml_list and
		 * decrementing free_blocks -- its definition is not visible here; confirm.
		 */
		dse_m_rest(blk, bml_list, bml_size, &csa->ti->free_blocks, TRUE);
		/* Write each rebuilt bitmap image to the database, one transaction per local bitmap */
		for (blk_index = 0, bml_index = 0;  blk_index < total_blks; blk_index += bplmap, bml_index++)
		{
			t_begin_crit(ERR_DSEFAIL);
			CHECK_TN(csa, csd, csd->trans_hist.curr_tn);	/* can issue rts_error TNTOOLARGE */
			CWS_RESET;
			CHECK_AND_RESET_UPDATE_ARRAY;	/* reset update_array_ptr to update_array */
			assert(csa->ti->early_tn == csa->ti->curr_tn);
			blk_ptr = bml_list + bml_index * bml_size;
			blkhist.blk_num = blk_index;
			if (!(blkhist.buffaddr = t_qread(blkhist.blk_num, &blkhist.cycle, &blkhist.cr)))
				rts_error(VARLSTCNT(1) ERR_DSEBLKRDFAIL);
			/* The new block contents are the rebuilt image minus its header */
			BLK_INIT(bs_ptr, bs1);
			BLK_SEG(bs_ptr, blk_ptr + SIZEOF(blk_hdr), bml_size - SIZEOF(blk_hdr));
			BLK_FINI(bs_ptr, bs1);
			t_write(&blkhist, (unsigned char *)bs1, 0, 0, LCL_MAP_LEVL, TRUE, FALSE, GDS_WRITE_KILLTN);
			BUILD_AIMG_IF_JNL_ENABLED(csd, csa->ti->curr_tn);
			t_end(&dummy_hist, NULL, csa->ti->curr_tn);
		}
		/* Fill in master map */
		for (blk_index = 0, bml_index = 0;  blk_index < total_blks; blk_index += bplmap, bml_index++)
		{
			blks_in_bitmap = (blk_index + bplmap <= total_blks) ? bplmap : total_blks - blk_index;
			assert(1 < blks_in_bitmap);	/* the last valid block in the database should never be a bitmap block */
			if (NO_FREE_SPACE != bml_find_free(0, (bml_list + bml_index * bml_size) + SIZEOF(blk_hdr), blks_in_bitmap))
				bit_set(blk_index / bplmap, csa->bmm);
			else
				bit_clear(blk_index / bplmap, csa->bmm);
			if (blk_index > csa->nl->highest_lbm_blk_changed)
				csa->nl->highest_lbm_blk_changed = blk_index;
		}
		if (!was_crit)
		{
			csa->hold_onto_crit = FALSE;	/* need to do this before the rel_crit */
			rel_crit(gv_cur_region);
		}
		if (unhandled_stale_timer_pop)
			process_deferred_stale();
		free(bml_list);
		csd->kill_in_prog = csd->abandoned_kills = 0;
		return;
	}
	/* No updating qualifier given: report the local bitmap state of the block.
	 * util_buff is built as a format string whose !AD is filled with a 4-character "free" or "busy".
	 */
	MEMCPY_LIT(util_buff, "!/Block ");
	util_len = SIZEOF("!/Block ") - 1;
	util_len += i2hex_nofill(blk, (uchar_ptr_t)&util_buff[util_len], 8);
	memcpy(&util_buff[util_len], " is marked !AD in its local bit map.!/",
		SIZEOF(" is marked !AD in its local bit map.!/") - 1);
	util_len += SIZEOF(" is marked !AD in its local bit map.!/") - 1;
	util_buff[util_len] = 0;
	if (!was_crit)
		grab_crit(gv_cur_region);
	util_out_print(util_buff, TRUE, 4, dse_is_blk_free(blk, &dummy_int, &dummy_cr) ? "free" : "busy");
	if (!was_crit)
		rel_crit(gv_cur_region);
	return;
}
/****************************************************************************************** Input Parameters: level: level of working block dest_blk_id: last destination used for swap Output Parameters: kill_set_ptr: Kill set to be freed *exclude_glist_ptr: List of globals not to be moved for a swap destination Input/Output Parameters: gv_target : as working block's history reorg_gv_target->hist : as desitnitions block's history ******************************************************************************************/ enum cdb_sc mu_swap_blk(int level, block_id *pdest_blk_id, kill_set *kill_set_ptr, glist *exclude_glist_ptr) { unsigned char x_blk_lmap; unsigned short temp_ushort; int rec_size1, rec_size2; int wlevel, nslevel, dest_blk_level; int piece_len1, piece_len2, first_offset, second_offset, work_blk_size, work_parent_size, dest_blk_size, dest_parent_size; int dest_child_cycle; int blk_seg_cnt, blk_size; trans_num ctn; int key_len, key_len_dir; block_id dest_blk_id, work_blk_id, child1, child2; enum cdb_sc status; srch_hist *dest_hist_ptr, *dir_hist_ptr; cache_rec_ptr_t dest_child_cr; blk_segment *bs1, *bs_ptr; sm_uc_ptr_t saved_blk, work_blk_ptr, work_parent_ptr, dest_parent_ptr, dest_blk_ptr, bn_ptr, bmp_buff, tblk_ptr, rec_base, rPtr1; boolean_t gbl_target_was_set, blk_was_free, deleted; gv_namehead *save_targ; srch_blk_status bmlhist, destblkhist, *hist_ptr; unsigned char save_cw_set_depth; cw_set_element *tmpcse; jnl_buffer_ptr_t jbbp; /* jbbp is non-NULL only if before-image journaling */ unsigned int bsiz; DCL_THREADGBL_ACCESS; SETUP_THREADGBL_ACCESS; dest_blk_id = *pdest_blk_id; CHECK_AND_RESET_UPDATE_ARRAY; /* reset update_array_ptr to update_array */ if (NULL == TREF(gv_reorgkey)) GVKEY_INIT(TREF(gv_reorgkey), DBKEYSIZE(MAX_KEY_SZ)); dest_hist_ptr = &(reorg_gv_target->hist); dir_hist_ptr = reorg_gv_target->alt_hist; blk_size = cs_data->blk_size; work_parent_ptr = gv_target->hist.h[level+1].buffaddr; work_parent_size = 
((blk_hdr_ptr_t)work_parent_ptr)->bsiz; work_blk_ptr = gv_target->hist.h[level].buffaddr; work_blk_size = ((blk_hdr_ptr_t)work_blk_ptr)->bsiz; work_blk_id = gv_target->hist.h[level].blk_num; if (blk_size < work_blk_size) { assert(t_tries < CDB_STAGNATE); return cdb_sc_blkmod; } cws_reorg_remove_index = 0; /*===== Infinite loop to find the destination block =====*/ for ( ; ; ) { blk_was_free = FALSE; INCR_BLK_NUM(dest_blk_id); /* A Pre-order traversal should not cause a child block to go to its parent. * However, in case it happens because already the organization was like that or for any other reason, skip swap. * If we decide to swap, code below should be changed to take care of the special case. * Still a grand-child can go to its grand-parent. This is rare and following code can handle it. */ if (dest_blk_id == gv_target->hist.h[level+1].blk_num) continue; if (cs_data->trans_hist.total_blks <= dest_blk_id || dest_blk_id == work_blk_id) { *pdest_blk_id = dest_blk_id; return cdb_sc_oprnotneeded; } ctn = cs_addrs->ti->curr_tn; /* We need to save the block numbers that were NEWLY ADDED (since entering this function "mu_swap_blk") * through the CWS_INSERT macro (in db_csh_get/db_csh_getn which can be called by t_qread or gvcst_search below). * This is so that we can delete these blocks from the "cw_stagnate" hashtable in case we determine the need to * choose a different "dest_blk_id" in this for loop (i.e. come to the next iteration). If these blocks are not * deleted, then the hashtable will keep growing (a good example will be if -EXCLUDE qualifier is specified and * a lot of prospective dest_blk_ids get skipped because they contain EXCLUDEd global variables) and very soon * the hashtable will contain more entries than there are global buffers and at that point db_csh_getn will not * be able to get a free global buffer for a new block (since it checks the "cw_stagnate" hashtable before reusing * a buffer in case of MUPIP REORG). 
To delete these previous iteration blocks, we use the "cws_reorg_remove_array" * variable. This array should have enough entries to accommodate the maximum number of blocks that can be t_qread * in one iteration down below. And that number is the sum of * + MAX_BT_DEPTH : for the t_qread while loop down the tree done below * + 2 * MAX_BT_DEPTH : for the two calls to gvcst_search done below * + 2 : 1 for the t_qread of dest_blk_id and 1 more for the t_qread of a * bitmap block done inside the call to get_lmap below * = 3 * MAX_BT_DEPTH + 2 * To be safe, we give a buffer of MAX_BT_DEPTH elements i.e. (4 * MAX_BT_DEPTH) + 2. * This is defined in the macro CWS_REMOVE_ARRAYSIZE in cws_insert.h */ /* reset whatever blocks the previous iteration of this for loop had filled in the cw_stagnate hashtable */ for ( ; cws_reorg_remove_index > 0; cws_reorg_remove_index--) { deleted = delete_hashtab_int4(&cw_stagnate, (uint4 *)&cws_reorg_remove_array[cws_reorg_remove_index]); assert(deleted); } /* read corresponding bitmap block before attempting to read destination block. * if bitmap indicates block is free, we will not read the destination block */ bmp_buff = get_lmap(dest_blk_id, &x_blk_lmap, (sm_int_ptr_t)&bmlhist.cycle, &bmlhist.cr); if (!bmp_buff || BLK_MAPINVALID == x_blk_lmap || ((blk_hdr_ptr_t)bmp_buff)->bsiz != BM_SIZE(BLKS_PER_LMAP) || ((blk_hdr_ptr_t)bmp_buff)->levl != LCL_MAP_LEVL) { assert(CDB_STAGNATE > t_tries); return cdb_sc_badbitmap; } if (BLK_FREE != x_blk_lmap) { /* x_blk_lmap is either BLK_BUSY or BLK_RECYCLED. In either case, we need to read destination block * in case we later detect that the before-image needs to be written. 
*/ if (!(dest_blk_ptr = t_qread(dest_blk_id, (sm_int_ptr_t)&destblkhist.cycle, &destblkhist.cr))) { assert(t_tries < CDB_STAGNATE); return (enum cdb_sc)rdfail_detail; } destblkhist.blk_num = dest_blk_id; destblkhist.buffaddr = dest_blk_ptr; destblkhist.level = dest_blk_level = ((blk_hdr_ptr_t)dest_blk_ptr)->levl; } if (BLK_BUSY != x_blk_lmap) { /* x_blk_map is either BLK_FREE or BLK_RECYCLED both of which mean the block is not used in the bitmap */ blk_was_free = TRUE; break; } /* dest_blk_id might contain a *-record only. * So follow the pointer to go to the data/index block, which has a non-* key to search. */ nslevel = dest_blk_level; if (MAX_BT_DEPTH <= nslevel) { assert(CDB_STAGNATE > t_tries); return cdb_sc_maxlvl; } rec_base = dest_blk_ptr + SIZEOF(blk_hdr); GET_RSIZ(rec_size1, rec_base); tblk_ptr = dest_blk_ptr; while ((BSTAR_REC_SIZE == rec_size1) && (0 != nslevel)) { GET_LONG(child1, (rec_base + SIZEOF(rec_hdr))); if (0 == child1 || child1 > cs_data->trans_hist.total_blks - 1) { assert(t_tries < CDB_STAGNATE); return cdb_sc_rdfail; } if (!(tblk_ptr = t_qread(child1, (sm_int_ptr_t)&dest_child_cycle, &dest_child_cr))) { assert(t_tries < CDB_STAGNATE); return (enum cdb_sc)rdfail_detail; } /* leaf of a killed GVT can have block header only. Skip those blocks */ if (SIZEOF(blk_hdr) >= ((blk_hdr_ptr_t)tblk_ptr)->bsiz) break; nslevel--; rec_base = tblk_ptr + SIZEOF(blk_hdr); GET_RSIZ(rec_size1, rec_base); } /* leaf of a killed GVT can have block header only. Skip those blocks */ if (SIZEOF(blk_hdr) >= ((blk_hdr_ptr_t)tblk_ptr)->bsiz) continue; /* get length of global variable name (do not read subscript) for dest_blk_id */ GET_GBLNAME_LEN(key_len_dir, rec_base + SIZEOF(rec_hdr)); /* key_len = length of 1st key value (including subscript) for dest_blk_id */ GET_KEY_LEN(key_len, rec_base + SIZEOF(rec_hdr)); if ((1 >= key_len_dir || MAX_MIDENT_LEN + 1 < key_len_dir) || (2 >= key_len || MAX_KEY_SZ < key_len)) { /* Earlier used to restart here always. 
But dest_blk_id can be a block, * which is just killed and still marked busy. Skip it, if we are in last retry. */ if (CDB_STAGNATE <= t_tries) continue; else return cdb_sc_blkmod; } memcpy(&((TREF(gv_reorgkey))->base[0]), rec_base + SIZEOF(rec_hdr), key_len_dir); (TREF(gv_reorgkey))->base[key_len_dir] = 0; (TREF(gv_reorgkey))->end = key_len_dir; if (exclude_glist_ptr->next) { /* exclude blocks for globals in the list of EXCLUDE option */ if (in_exclude_list(&((TREF(gv_reorgkey))->base[0]), key_len_dir - 1, exclude_glist_ptr)) continue; } save_targ = gv_target; if (INVALID_GV_TARGET != reset_gv_target) gbl_target_was_set = TRUE; else { gbl_target_was_set = FALSE; reset_gv_target = save_targ; } gv_target = reorg_gv_target; gv_target->root = cs_addrs->dir_tree->root; gv_target->clue.end = 0; /* assign Directory tree path to find dest_blk_id in dir_hist_ptr */ status = gvcst_search(TREF(gv_reorgkey), dir_hist_ptr); if (cdb_sc_normal != status) { assert(t_tries < CDB_STAGNATE); RESET_GV_TARGET_LCL_AND_CLR_GBL(save_targ, DO_GVT_GVKEY_CHECK); return status; } if (dir_hist_ptr->h[0].curr_rec.match != (TREF(gv_reorgkey))->end + 1) { /* may be in a kill_set of another process */ RESET_GV_TARGET_LCL_AND_CLR_GBL(save_targ, DO_GVT_GVKEY_CHECK); continue; } for (wlevel = 0; wlevel <= dir_hist_ptr->depth && dir_hist_ptr->h[wlevel].blk_num != dest_blk_id; wlevel++); if (dir_hist_ptr->h[wlevel].blk_num == dest_blk_id) { /* do not swap a dir_tree block */ RESET_GV_TARGET_LCL_AND_CLR_GBL(save_targ, DO_GVT_GVKEY_CHECK); continue; } /* gv_reorgkey will now have the first key from dest_blk_id, * or, from a descendant of dest_blk_id (in case it had a *-key only). 
*/ memcpy(&((TREF(gv_reorgkey))->base[0]), rec_base + SIZEOF(rec_hdr), key_len); (TREF(gv_reorgkey))->end = key_len - 1; GET_KEY_LEN(key_len_dir, dir_hist_ptr->h[0].buffaddr + dir_hist_ptr->h[0].curr_rec.offset + SIZEOF(rec_hdr)); /* Get root of GVT for dest_blk_id */ GET_LONG(gv_target->root, dir_hist_ptr->h[0].buffaddr + dir_hist_ptr->h[0].curr_rec.offset + SIZEOF(rec_hdr) + key_len_dir); if ((0 == gv_target->root) || (gv_target->root > (cs_data->trans_hist.total_blks - 1))) { assert(t_tries < CDB_STAGNATE); RESET_GV_TARGET_LCL_AND_CLR_GBL(save_targ, DO_GVT_GVKEY_CHECK); return cdb_sc_blkmod; } /* Assign Global Variable Tree path to find dest_blk_id in dest_hist_ptr */ gv_target->clue.end = 0; status = gvcst_search(TREF(gv_reorgkey), dest_hist_ptr); RESET_GV_TARGET_LCL_AND_CLR_GBL(save_targ, DO_GVT_GVKEY_CHECK); if (dest_blk_level >= dest_hist_ptr->depth || /* do not swap in root level */ dest_hist_ptr->h[dest_blk_level].blk_num != dest_blk_id) /* must be in a kill set of another process. */ continue; if ((cdb_sc_normal != status) || (dest_hist_ptr->h[nslevel].curr_rec.match != ((TREF(gv_reorgkey))->end + 1))) { assert(t_tries < CDB_STAGNATE); return (cdb_sc_normal != status ? status : cdb_sc_blkmod); } for (wlevel = nslevel; wlevel <= dest_blk_level; wlevel++) dest_hist_ptr->h[wlevel].tn = ctn; dest_blk_ptr = dest_hist_ptr->h[dest_blk_level].buffaddr; dest_blk_size = ((blk_hdr_ptr_t)dest_blk_ptr)->bsiz; dest_parent_ptr = dest_hist_ptr->h[dest_blk_level+1].buffaddr; dest_parent_size = ((blk_hdr_ptr_t)dest_parent_ptr)->bsiz; break; } /*===== End of infinite loop to find the destination block =====*/ /*----------------------------------------------------- Now modify blocks for swapping. Maximum of 4 blocks. 
-----------------------------------------------------*/ if (!blk_was_free) { /* 1: dest_blk_id into work_blk_id */ BLK_INIT(bs_ptr, bs1); BLK_SEG(bs_ptr, dest_blk_ptr + SIZEOF(blk_hdr), dest_blk_size - SIZEOF(blk_hdr)); if (!BLK_FINI (bs_ptr,bs1)) { assert(t_tries < CDB_STAGNATE); return cdb_sc_blkmod; } assert(gv_target->hist.h[level].blk_num == work_blk_id); assert(gv_target->hist.h[level].buffaddr == work_blk_ptr); t_write(&gv_target->hist.h[level], (unsigned char *)bs1, 0, 0, dest_blk_level, TRUE, TRUE, GDS_WRITE_KILLTN); } /* 2: work_blk_id into dest_blk_id */ if (!blk_was_free && work_blk_id == dest_hist_ptr->h[dest_blk_level+1].blk_num) { /* work_blk_id will be swapped with its child. * This is the only vertical swap. Here working block goes to its child. * Working block cannot goto its parent because of traversal */ if (dest_blk_level + 1 != level || dest_parent_size != work_blk_size) { assert(t_tries < CDB_STAGNATE); return cdb_sc_blkmod; } BLK_INIT(bs_ptr, bs1); BLK_ADDR(saved_blk, dest_parent_size, unsigned char); memcpy(saved_blk, dest_parent_ptr, dest_parent_size); first_offset = dest_hist_ptr->h[dest_blk_level+1].curr_rec.offset; GET_RSIZ(rec_size1, saved_blk + first_offset); if (work_blk_size < first_offset + rec_size1) { assert(t_tries < CDB_STAGNATE); return cdb_sc_blkmod; } piece_len1 = first_offset + rec_size1; BLK_SEG(bs_ptr, saved_blk + SIZEOF(blk_hdr), piece_len1 - SIZEOF(block_id) - SIZEOF(blk_hdr)); BLK_ADDR(bn_ptr, SIZEOF(block_id), unsigned char); PUT_LONG(bn_ptr, work_blk_id); /* since work_blk_id will now be the child of dest_blk_id */ BLK_SEG(bs_ptr, bn_ptr, SIZEOF(block_id)); BLK_SEG(bs_ptr, saved_blk + piece_len1, dest_parent_size - piece_len1); if (!BLK_FINI(bs_ptr, bs1)) { assert(t_tries < CDB_STAGNATE); return cdb_sc_blkmod; } assert(dest_blk_id == dest_hist_ptr->h[dest_blk_level].blk_num); assert(dest_blk_ptr == dest_hist_ptr->h[dest_blk_level].buffaddr); t_write(&dest_hist_ptr->h[dest_blk_level], (unsigned char *)bs1, 0, 0, 
level, TRUE, TRUE, GDS_WRITE_KILLTN); } else /* free block or, when working block does not move vertically (swap with parent/child) */ { BLK_INIT(bs_ptr, bs1); BLK_ADDR(saved_blk, work_blk_size, unsigned char); memcpy(saved_blk, work_blk_ptr, work_blk_size); BLK_SEG(bs_ptr, saved_blk + SIZEOF(blk_hdr), work_blk_size - SIZEOF(blk_hdr)); if (!BLK_FINI(bs_ptr, bs1)) { assert(t_tries < CDB_STAGNATE); return cdb_sc_blkmod; } if (blk_was_free) { tmpcse = &cw_set[cw_set_depth]; t_create(dest_blk_id, (unsigned char *)bs1, 0, 0, level); /* Although we invoked t_create, we do not want t_end to allocate the block (i.e. change mode * from gds_t_create to gds_t_acquired). Instead we do that and a little more (that t_end does) all here. */ assert(dest_blk_id == tmpcse->blk); tmpcse->mode = gds_t_acquired; /* If snapshots are in progress, we might want to read the before images of the FREE blocks also. * Since mu_swap_blk mimics a small part of t_end, it sets cse->mode to gds_t_acquired and hence * will not read the before images of the FREE blocks in t_end. To workaround this, set * cse->was_free to TRUE so that in t_end, this condition can be used to read the before images of * the FREE blocks if needed. */ (BLK_FREE == x_blk_lmap) ? SET_FREE(tmpcse) : SET_NFREE(tmpcse); /* No need to write before-image in case the block is FREE. In case the database had never been fully * upgraded from V4 to V5 format (after the MUPIP UPGRADE), all RECYCLED blocks can basically be considered * FREE (i.e. no need to write before-images since backward journal recovery will never be expected * to take the database to a point BEFORE the mupip upgrade). */ if ((BLK_FREE == x_blk_lmap) || !cs_data->db_got_to_v5_once) tmpcse->old_block = NULL; else { /* Destination is a recycled block that needs a before image */ tmpcse->old_block = destblkhist.buffaddr; /* Record cr,cycle. 
This is used later in t_end to determine if checksums need to be recomputed */ tmpcse->cr = destblkhist.cr; tmpcse->cycle = destblkhist.cycle; jbbp = (JNL_ENABLED(cs_addrs) && cs_addrs->jnl_before_image) ? cs_addrs->jnl->jnl_buff : NULL; if ((NULL != jbbp) && (((blk_hdr_ptr_t)tmpcse->old_block)->tn < jbbp->epoch_tn)) { /* Compute CHECKSUM for writing PBLK record before getting crit. * It is possible that we are reading a block that is actually marked free in * the bitmap (due to concurrency issues at this point). Therefore we might be * actually reading uninitialized block headers and in turn a bad value of * "old_block->bsiz". Restart if we ever access a buffer whose size is greater * than the db block size. */ bsiz = ((blk_hdr_ptr_t)(tmpcse->old_block))->bsiz; if (bsiz > blk_size) { assert(CDB_STAGNATE > t_tries); return cdb_sc_lostbmlcr; } JNL_GET_CHECKSUM_ACQUIRED_BLK(tmpcse, cs_data, cs_addrs, tmpcse->old_block, bsiz); } } assert(GDSVCURR == tmpcse->ondsk_blkver); /* should have been set by t_create above */ } else { hist_ptr = &dest_hist_ptr->h[dest_blk_level]; assert(dest_blk_id == hist_ptr->blk_num); assert(dest_blk_ptr == hist_ptr->buffaddr); t_write(hist_ptr, (unsigned char *)bs1, 0, 0, level, TRUE, TRUE, GDS_WRITE_KILLTN); } } if (!blk_was_free) { /* 3: Parent of destination block (may be parent of working block too) */ if (gv_target->hist.h[level+1].blk_num == dest_hist_ptr->h[dest_blk_level+1].blk_num) { /* dest parent == work_blk parent */ BLK_INIT(bs_ptr, bs1); /* Interchange pointer to dest_blk_id and work_blk_id */ if (level != dest_blk_level || gv_target->hist.h[level+1].curr_rec.offset == dest_hist_ptr->h[level+1].curr_rec.offset) { assert(t_tries < CDB_STAGNATE); return cdb_sc_blkmod; } if (gv_target->hist.h[level+1].curr_rec.offset < dest_hist_ptr->h[level+1].curr_rec.offset) { first_offset = gv_target->hist.h[level+1].curr_rec.offset; second_offset = dest_hist_ptr->h[level+1].curr_rec.offset; } else { first_offset = 
dest_hist_ptr->h[level+1].curr_rec.offset; second_offset = gv_target->hist.h[level+1].curr_rec.offset; } GET_RSIZ(rec_size1, dest_parent_ptr + first_offset); GET_RSIZ(rec_size2, dest_parent_ptr + second_offset); if (dest_parent_size < first_offset + rec_size1 || dest_parent_size < second_offset + rec_size2 || BSTAR_REC_SIZE >= rec_size1 || BSTAR_REC_SIZE > rec_size2) { assert(t_tries < CDB_STAGNATE); return cdb_sc_blkmod; } piece_len1 = first_offset + rec_size1 - SIZEOF(block_id); piece_len2 = second_offset + rec_size2 - SIZEOF(block_id); GET_LONG(child1, dest_parent_ptr + piece_len1); GET_LONG(child2, dest_parent_ptr + piece_len2); BLK_SEG(bs_ptr, dest_parent_ptr + SIZEOF(blk_hdr), piece_len1 - SIZEOF(blk_hdr)); BLK_ADDR(bn_ptr, SIZEOF(block_id), unsigned char); PUT_LONG(bn_ptr, child2); BLK_SEG(bs_ptr, bn_ptr, SIZEOF(block_id)); BLK_SEG(bs_ptr, dest_parent_ptr + first_offset + rec_size1, second_offset + rec_size2 - SIZEOF(block_id) - first_offset - rec_size1); BLK_ADDR(bn_ptr, SIZEOF(block_id), unsigned char); PUT_LONG(bn_ptr, child1); BLK_SEG(bs_ptr, bn_ptr, SIZEOF(block_id)); BLK_SEG(bs_ptr, dest_parent_ptr + second_offset + rec_size2, dest_parent_size - second_offset - rec_size2); if (!BLK_FINI(bs_ptr,bs1)) { assert(t_tries < CDB_STAGNATE); return cdb_sc_blkmod; } assert(level == dest_blk_level); assert(dest_parent_ptr == dest_hist_ptr->h[level+1].buffaddr); t_write(&dest_hist_ptr->h[level+1], (unsigned char *)bs1, 0, 0, level+1, FALSE, TRUE, GDS_WRITE_KILLTN); } else if (work_blk_id != dest_hist_ptr->h[dest_blk_level+1].blk_num) { /* Destination block moved in the position of working block. 
* So destination block's parent's pointer should be changed to work_blk_id */ BLK_INIT(bs_ptr, bs1); GET_RSIZ(rec_size1, dest_parent_ptr + dest_hist_ptr->h[dest_blk_level+1].curr_rec.offset); if (dest_parent_size < rec_size1 + dest_hist_ptr->h[dest_blk_level+1].curr_rec.offset || BSTAR_REC_SIZE > rec_size1) { assert(t_tries < CDB_STAGNATE); return cdb_sc_blkmod; } BLK_SEG (bs_ptr, dest_parent_ptr + SIZEOF(blk_hdr), dest_hist_ptr->h[dest_blk_level+1].curr_rec.offset + rec_size1 - SIZEOF(blk_hdr) - SIZEOF(block_id)); BLK_ADDR(bn_ptr, SIZEOF(block_id), unsigned char); PUT_LONG(bn_ptr, work_blk_id); BLK_SEG(bs_ptr, bn_ptr, SIZEOF(block_id)); BLK_SEG(bs_ptr, dest_parent_ptr + dest_hist_ptr->h[dest_blk_level+1].curr_rec.offset + rec_size1, dest_parent_size - dest_hist_ptr->h[dest_blk_level+1].curr_rec.offset - rec_size1); if (!BLK_FINI(bs_ptr,bs1)) { assert(t_tries < CDB_STAGNATE); return cdb_sc_blkmod; } assert(dest_parent_ptr == dest_hist_ptr->h[dest_blk_level+1].buffaddr); t_write(&dest_hist_ptr->h[dest_blk_level+1], (unsigned char *)bs1, 0, 0, dest_blk_level+1, FALSE, TRUE, GDS_WRITE_KILLTN); } } /* 4: Parent of working block, if different than destination's parent or, destination was a free block */ if (blk_was_free || gv_target->hist.h[level+1].blk_num != dest_hist_ptr->h[dest_blk_level+1].blk_num) { /* Parent block of working blk should correctly point the working block. 
Working block went to dest_blk_id */ GET_RSIZ(rec_size1, (work_parent_ptr + gv_target->hist.h[level+1].curr_rec.offset)); if (work_parent_size < rec_size1 + gv_target->hist.h[level+1].curr_rec.offset || BSTAR_REC_SIZE > rec_size1) { assert(t_tries < CDB_STAGNATE); return cdb_sc_blkmod; } BLK_INIT(bs_ptr, bs1); BLK_SEG(bs_ptr, work_parent_ptr + SIZEOF(blk_hdr), gv_target->hist.h[level+1].curr_rec.offset + rec_size1 - SIZEOF(blk_hdr) - SIZEOF(block_id)); BLK_ADDR(bn_ptr, SIZEOF(block_id), unsigned char); PUT_LONG(bn_ptr, dest_blk_id); BLK_SEG(bs_ptr, bn_ptr, SIZEOF(block_id)); BLK_SEG(bs_ptr, work_parent_ptr + gv_target->hist.h[level+1].curr_rec.offset + rec_size1, work_parent_size - gv_target->hist.h[level+1].curr_rec.offset - rec_size1); if (!BLK_FINI(bs_ptr, bs1)) { assert(t_tries < CDB_STAGNATE); return cdb_sc_blkmod; } assert(gv_target->hist.h[level+1].buffaddr == work_parent_ptr); t_write(&gv_target->hist.h[level+1], (unsigned char *)bs1, 0, 0, level+1, FALSE, TRUE, GDS_WRITE_KILLTN); } /* else already taken care of, when dest_blk_id moved */ if (blk_was_free) { /* A free/recycled block will become busy block. * So the local bitmap must be updated. * Local bit map block will be added in the list of update arrray for concurrency check and * also the cw_set element will be created to mark the free/recycled block as free. * kill_set_ptr will save the block which will become free. */ child1 = ROUND_DOWN2(dest_blk_id, BLKS_PER_LMAP); /* bit map block */ bmlhist.buffaddr = bmp_buff; bmlhist.blk_num = child1; child1 = dest_blk_id - child1; assert(child1); PUT_LONG(update_array_ptr, child1); /* Need to put bit maps on the end of the cw set for concurrency checking. * We want to simulate t_write_map, except we want to update "cw_map_depth" instead of "cw_set_depth". * Hence the save and restore logic (for "cw_set_depth") below. 
*/ save_cw_set_depth = cw_set_depth; assert(!cw_map_depth); t_write_map(&bmlhist, (uchar_ptr_t)update_array_ptr, ctn, 1); /* will increment cw_set_depth */ cw_map_depth = cw_set_depth; /* set cw_map_depth to the latest cw_set_depth */ cw_set_depth = save_cw_set_depth; /* restore cw_set_depth */ /* t_write_map simulation end */ update_array_ptr += SIZEOF(block_id); child1 = 0; PUT_LONG(update_array_ptr, child1); update_array_ptr += SIZEOF(block_id); assert(1 == cw_set[cw_map_depth - 1].reference_cnt); /* 1 free block is now becoming BLK_USED in the bitmap */ /* working block will be removed */ kill_set_ptr->blk[kill_set_ptr->used].flag = 0; kill_set_ptr->blk[kill_set_ptr->used].level = 0; kill_set_ptr->blk[kill_set_ptr->used++].block = work_blk_id; } *pdest_blk_id = dest_blk_id; return cdb_sc_normal; }
void mu_reorg_upgrd_dwngrd(void) { blk_hdr new_hdr; blk_segment *bs1, *bs_ptr; block_id *blkid_ptr, curblk, curbmp, start_blk, stop_blk, start_bmp, last_bmp; block_id startblk_input, stopblk_input; boolean_t upgrade, downgrade, safejnl, nosafejnl, region, first_reorg_in_this_db_fmt, reorg_entiredb; boolean_t startblk_specified, stopblk_specified, set_fully_upgraded, db_got_to_v5_once, mark_blk_free; cache_rec_ptr_t cr; char *bml_lcl_buff = NULL, *command, *reorg_command; sm_uc_ptr_t bptr = NULL; cw_set_element *cse; enum cdb_sc cdb_status; enum db_ver new_db_format, ondsk_blkver; gd_region *reg; int cycle; int4 blk_seg_cnt, blk_size; /* needed for BLK_INIT,BLK_SEG and BLK_FINI macros */ int4 blocks_left, expected_blks2upgrd, actual_blks2upgrd, total_blks, free_blks; int4 status, status1, mapsize, lcnt, bml_status; reorg_stats_t reorg_stats; sgmnt_addrs *csa; sgmnt_data_ptr_t csd; sm_uc_ptr_t blkBase, bml_sm_buff; /* shared memory pointer to the bitmap global buffer */ srch_hist alt_hist; srch_blk_status *blkhist, bmlhist; tp_region *rptr; trans_num curr_tn; unsigned char save_cw_set_depth; uint4 lcl_update_trans; region = (CLI_PRESENT == cli_present("REGION")); upgrade = (CLI_PRESENT == cli_present("UPGRADE")); downgrade = (CLI_PRESENT == cli_present("DOWNGRADE")); assert(upgrade && !downgrade || !upgrade && downgrade); command = upgrade ? "UPGRADE" : "DOWNGRADE"; reorg_command = upgrade ? 
"MUPIP REORG UPGRADE" : "MUPIP REORG DOWNGRADE"; reorg_entiredb = TRUE; /* unless STARTBLK or STOPBLK is specified we are going to {up,down}grade the entire database */ startblk_specified = FALSE; assert(SIZEOF(block_id) == SIZEOF(uint4)); if ((CLI_PRESENT == cli_present("STARTBLK")) && (cli_get_hex("STARTBLK", (uint4 *)&startblk_input))) { reorg_entiredb = FALSE; startblk_specified = TRUE; } stopblk_specified = FALSE; assert(SIZEOF(block_id) == SIZEOF(uint4)); if ((CLI_PRESENT == cli_present("STOPBLK")) && (cli_get_hex("STOPBLK", (uint4 *)&stopblk_input))) { reorg_entiredb = FALSE; stopblk_specified = TRUE; } mu_reorg_upgrd_dwngrd_in_prog = TRUE; mu_reorg_nosafejnl = (CLI_NEGATED == cli_present("SAFEJNL")) ? TRUE : FALSE; assert(region); status = SS_NORMAL; error_mupip = FALSE; gvinit(); /* initialize gd_header (needed by the later call to mu_getlst) */ mu_getlst("REG_NAME", SIZEOF(tp_region)); /* get the parameter corresponding to REGION qualifier */ if (error_mupip) { util_out_print("!/MUPIP REORG !AD cannot proceed with above errors!/", TRUE, LEN_AND_STR(command)); mupip_exit(ERR_MUNOACTION); } assert(DBKEYSIZE(MAX_KEY_SZ) == gv_keysize); /* no need to invoke GVKEYSIZE_INIT_IF_NEEDED macro */ gv_target = targ_alloc(gv_keysize, NULL, NULL); /* t_begin needs this initialized */ gv_target_list = NULL; memset(&alt_hist, 0, SIZEOF(alt_hist)); /* null-initialize history */ blkhist = &alt_hist.h[0]; for (rptr = grlist; NULL != rptr; rptr = rptr->fPtr) { if (mu_ctrly_occurred || mu_ctrlc_occurred) break; reg = rptr->reg; util_out_print("!/Region !AD : MUPIP REORG !AD started", TRUE, REG_LEN_STR(reg), LEN_AND_STR(command)); if (reg_cmcheck(reg)) { util_out_print("Region !AD : MUPIP REORG !AD cannot run across network", TRUE, REG_LEN_STR(reg), LEN_AND_STR(command)); status = ERR_MUNOFINISH; continue; } mu_reorg_process = TRUE; /* gvcst_init will use this value to use gtm_poollimit settings. 
*/ gvcst_init(reg); mu_reorg_process = FALSE; assert(update_array != NULL); /* access method stored in global directory and database file header might be different in which case * the database setting prevails. therefore, the access method check can be done only after opening * the database (i.e. after the gvcst_init) */ if (dba_bg != REG_ACC_METH(reg)) { util_out_print("Region !AD : MUPIP REORG !AD cannot continue as access method is not BG", TRUE, REG_LEN_STR(reg), LEN_AND_STR(command)); status = ERR_MUNOFINISH; continue; } /* The mu_getlst call above uses insert_region to create the grlist, which ensures that duplicate regions mapping to * the same db file correspond to only one grlist entry. */ assert(FALSE == reg->was_open); TP_CHANGE_REG(reg); /* sets gv_cur_region, cs_addrs, cs_data */ csa = cs_addrs; csd = cs_data; blk_size = csd->blk_size; /* "blk_size" is used by the BLK_FINI macro */ if (reg->read_only) { gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(4) ERR_DBRDONLY, 2, DB_LEN_STR(reg)); status = ERR_MUNOFINISH; continue; } assert(GDSVCURR == GDSV6); /* so we trip this assert in case GDSVCURR changes without a change to this module */ new_db_format = (upgrade ? GDSV6 : GDSV4); grab_crit(reg); curr_tn = csd->trans_hist.curr_tn; /* set the desired db format in the file header to the appropriate version, increment transaction number */ status1 = desired_db_format_set(reg, new_db_format, reorg_command); assert(csa->now_crit); /* desired_db_format_set() should not have released crit */ first_reorg_in_this_db_fmt = TRUE; /* with the current desired_db_format, this is the first reorg */ if (SS_NORMAL != status1) { /* "desired_db_format_set" would have printed appropriate error messages */ if (ERR_MUNOACTION != status1) { /* real error occurred while setting the db format. 
skip to next region */ status = ERR_MUNOFINISH; rel_crit(reg); continue; } util_out_print("Region !AD : Desired DB Format remains at !AD after !AD", TRUE, REG_LEN_STR(reg), LEN_AND_STR(gtm_dbversion_table[new_db_format]), LEN_AND_STR(reorg_command)); if (csd->reorg_db_fmt_start_tn == csd->desired_db_format_tn) first_reorg_in_this_db_fmt = FALSE; } else util_out_print("Region !AD : Desired DB Format set to !AD by !AD", TRUE, REG_LEN_STR(reg), LEN_AND_STR(gtm_dbversion_table[new_db_format]), LEN_AND_STR(reorg_command)); assert(dba_bg == csd->acc_meth); /* Check blks_to_upgrd counter to see if upgrade/downgrade is complete */ total_blks = csd->trans_hist.total_blks; free_blks = csd->trans_hist.free_blocks; actual_blks2upgrd = csd->blks_to_upgrd; /* If MUPIP REORG UPGRADE and there is no block to upgrade in the database as indicated by BOTH * "csd->blks_to_upgrd" and "csd->fully_upgraded", then we can skip processing. * If MUPIP REORG UPGRADE and all non-free blocks need to be upgraded then again we can skip processing. */ if ((upgrade && (0 == actual_blks2upgrd) && csd->fully_upgraded) || (!upgrade && ((total_blks - free_blks) == actual_blks2upgrd))) { util_out_print("Region !AD : Blocks to Upgrade counter indicates no action needed for MUPIP REORG !AD", TRUE, REG_LEN_STR(reg), LEN_AND_STR(command)); util_out_print("Region !AD : Total Blocks = [0x!XL] : Free Blocks = [0x!XL] : " "Blocks to upgrade = [0x!XL]", TRUE, REG_LEN_STR(reg), total_blks, free_blks, actual_blks2upgrd); util_out_print("Region !AD : MUPIP REORG !AD finished!/", TRUE, REG_LEN_STR(reg), LEN_AND_STR(command)); rel_crit(reg); continue; } stop_blk = total_blks; if (stopblk_specified && stopblk_input <= stop_blk) stop_blk = stopblk_input; if (first_reorg_in_this_db_fmt) { /* Note down reorg start tn (in case we are interrupted, future reorg will know to resume) */ csd->reorg_db_fmt_start_tn = csd->desired_db_format_tn; csd->reorg_upgrd_dwngrd_restart_block = 0; start_blk = (startblk_specified ? 
startblk_input : 0); } else { /* Either a concurrent MUPIP REORG of the same type ({up,down}grade) is currently running * or a previously running REORG of the same type was interrupted (Ctrl-Ced). * In either case resume processing from whatever restart block number is stored in fileheader * the only exception is if "STARTBLK" was specified in the input in which use that unconditionally. */ start_blk = (startblk_specified ? startblk_input : csd->reorg_upgrd_dwngrd_restart_block); } if (start_blk > stop_blk) start_blk = stop_blk; mu_reorg_upgrd_dwngrd_start_tn = csd->reorg_db_fmt_start_tn; /* Before releasing crit, flush the file-header and dirty buffers in cache to disk. This is because we are now * going to read each GDS block directly from disk to determine if it needs to be upgraded/downgraded or not. */ if (!wcs_flu(WCSFLU_FLUSH_HDR)) /* wcs_flu assumes gv_cur_region is set (which it is in this routine) */ { rel_crit(reg); gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(6) ERR_BUFFLUFAILED, 4, LEN_AND_LIT("MUPIP REORG UPGRADE/DOWNGRADE"), DB_LEN_STR(reg)); status = ERR_MUNOFINISH; continue; } rel_crit(reg); /* Loop through entire database one GDS block at a time and upgrade/downgrade each of them */ status1 = SS_NORMAL; start_bmp = ROUND_DOWN2(start_blk, BLKS_PER_LMAP); last_bmp = ROUND_DOWN2(stop_blk - 1, BLKS_PER_LMAP); curblk = start_blk; /* curblk is the block to be upgraded/downgraded */ util_out_print("Region !AD : Started processing from block number [0x!XL]", TRUE, REG_LEN_STR(reg), curblk); if (NULL != bptr) { /* malloc/free "bptr" for each region as GDS block-size can be different */ free(bptr); bptr = NULL; } memset(&reorg_stats, 0, SIZEOF(reorg_stats)); /* initialize statistics for this region */ for (curbmp = start_bmp; curbmp <= last_bmp; curbmp += BLKS_PER_LMAP) { if (mu_ctrly_occurred || mu_ctrlc_occurred) { status1 = ERR_MUNOFINISH; break; } /* -------------------------------------------------------------- * Read in current bitmap block * 
-------------------------------------------------------------- */ assert(!csa->now_crit); bml_sm_buff = t_qread(curbmp, (sm_int_ptr_t)&cycle, &cr); /* bring block into the cache outside of crit */ reorg_stats.blks_read_from_disk_bmp++; grab_crit_encr_cycle_sync(reg); /* needed so t_qread does not return NULL below */ if (mu_reorg_upgrd_dwngrd_start_tn != csd->desired_db_format_tn) { /* csd->desired_db_format changed since reorg started. discontinue the reorg */ /* see later comment on "csd->reorg_upgrd_dwngrd_restart_block" for why the assignment * of this field should be done only if a db format change did not occur. */ rel_crit(reg); status1 = ERR_MUNOFINISH; /* This "start_tn" check is redone after the for-loop and an error message is printed there */ break; } else if (reorg_entiredb) { /* Change "csd->reorg_upgrd_dwngrd_restart_block" only if STARTBLK or STOPBLK was NOT specified */ assert(csd->reorg_upgrd_dwngrd_restart_block <= MAX(start_blk, curbmp)); csd->reorg_upgrd_dwngrd_restart_block = curbmp; /* previous blocks have been upgraded/downgraded */ } /* Check blks_to_upgrd counter to see if upgrade/downgrade is complete. * Repeat check done a few steps earlier outside of this for loop. */ total_blks = csd->trans_hist.total_blks; free_blks = csd->trans_hist.free_blocks; actual_blks2upgrd = csd->blks_to_upgrd; if ((upgrade && (0 == actual_blks2upgrd) && csd->fully_upgraded) || (!upgrade && ((total_blks - free_blks) == actual_blks2upgrd))) { rel_crit(reg); break; } bml_sm_buff = t_qread(curbmp, (sm_int_ptr_t)&cycle, &cr); /* now that in crit, note down stable buffer */ if (NULL == bml_sm_buff) rts_error_csa(CSA_ARG(csa) VARLSTCNT(1) ERR_DSEBLKRDFAIL); ondsk_blkver = cr->ondsk_blkver; /* note down db fmt on disk for bitmap block */ /* Take a copy of the shared memory bitmap buffer into process-private memory before releasing crit. * We are interested in those blocks that are currently marked as USED in the bitmap. 
* It is possible that once we release crit, concurrent updates change the bitmap state of those blocks. * In that case, those updates will take care of doing the upgrade/downgrade of those blocks in the * format currently set in csd->desired_db_format i.e. accomplishing MUPIP REORG UPGRADE/DOWNGRADE's job. * If the desired_db_format changes concurrently, we will stop doing REORG UPGRADE/DOWNGRADE processing. */ if (NULL == bml_lcl_buff) bml_lcl_buff = malloc(BM_SIZE(BLKS_PER_LMAP)); memcpy(bml_lcl_buff, (blk_hdr_ptr_t)bml_sm_buff, BM_SIZE(BLKS_PER_LMAP)); if (FALSE == cert_blk(reg, curbmp, (blk_hdr_ptr_t)bml_lcl_buff, 0, FALSE)) { /* certify the block while holding crit as cert_blk uses fields from file-header (shared memory) */ assert(FALSE); /* in pro, skip ugprading/downgarding all blks in this unreliable local bitmap */ rel_crit(reg); util_out_print("Region !AD : Bitmap Block [0x!XL] has integrity errors. Skipping this bitmap.", TRUE, REG_LEN_STR(reg), curbmp); status1 = ERR_MUNOFINISH; continue; } rel_crit(reg); /* ------------------------------------------------------------------------ * Upgrade/Downgrade all BUSY blocks in the current bitmap * ------------------------------------------------------------------------ */ curblk = (curbmp == start_bmp) ? start_blk : curbmp; mapsize = (curbmp == last_bmp) ? 
(stop_blk - curbmp) : BLKS_PER_LMAP; assert(0 != mapsize); assert(mapsize <= BLKS_PER_LMAP); db_got_to_v5_once = csd->db_got_to_v5_once; for (lcnt = curblk - curbmp; lcnt < mapsize; lcnt++, curblk++) { if (mu_ctrly_occurred || mu_ctrlc_occurred) { status1 = ERR_MUNOFINISH; goto stop_reorg_on_this_reg; /* goto needed because of nested FOR Loop */ } GET_BM_STATUS(bml_lcl_buff, lcnt, bml_status); assert(BLK_MAPINVALID != bml_status); /* cert_blk ran clean so we dont expect invalid entries */ if (BLK_FREE == bml_status) { reorg_stats.blks_skipped_free++; continue; } /* MUPIP REORG UPGRADE/DOWNGRADE will convert USED & RECYCLED blocks */ if (db_got_to_v5_once || (BLK_RECYCLED != bml_status)) { /* Do NOT read recycled V4 block from disk unless it is guaranteed NOT to be too full */ if (lcnt) { /* non-bitmap block */ /* read in block from disk into private buffer. dont pollute the cache yet */ if (NULL == bptr) bptr = (sm_uc_ptr_t)malloc(blk_size); status1 = dsk_read(curblk, bptr, &ondsk_blkver, FALSE); /* dsk_read on curblk could return an error (DYNUPGRDFAIL) if curblk needs to be * upgraded and if its block size was too big to allow the extra block-header space * requirements for a dynamic upgrade. a MUPIP REORG DOWNGRADE should not error out * in that case as the block is already in the downgraded format. 
*/ if (SS_NORMAL != status1) { if (!upgrade && (ERR_DYNUPGRDFAIL == status1)) { assert(GDSV4 == new_db_format); ondsk_blkver = new_db_format; } else { gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(5) ERR_DBFILERR, 2, DB_LEN_STR(reg), status1); util_out_print("Region !AD : Error occurred while reading block " "[0x!XL]", TRUE, REG_LEN_STR(reg), curblk); status1 = ERR_MUNOFINISH; goto stop_reorg_on_this_reg;/* goto needed due to nested FOR Loop */ } } reorg_stats.blks_read_from_disk_nonbmp++; } /* else bitmap block has been read in crit earlier and ondsk_blkver appropriately set */ if (new_db_format == ondsk_blkver) { assert((SS_NORMAL == status1) || (!upgrade && (ERR_DYNUPGRDFAIL == status1))); status1 = SS_NORMAL; /* treat DYNUPGRDFAIL as no error in case of downgrade */ reorg_stats.blks_skipped_newfmtindisk++; continue; /* current disk version is identical to what is desired */ } assert(SS_NORMAL == status1); } /* Begin non-TP transaction to upgrade/downgrade the block. * The way we do that is by updating the block using a null update array. * Any update to a block will trigger an automatic upgrade/downgrade of the block based on * the current fileheader desired_db_format setting and we use that here. */ t_begin(ERR_MUREORGFAIL, UPDTRNS_DB_UPDATED_MASK); for (; ;) { CHECK_AND_RESET_UPDATE_ARRAY; /* reset update_array_ptr to update_array */ curr_tn = csd->trans_hist.curr_tn; db_got_to_v5_once = csd->db_got_to_v5_once; if (db_got_to_v5_once || (BLK_RECYCLED != bml_status)) { blkhist->cse = NULL; /* start afresh (do not use value from previous retry) */ blkBase = t_qread(curblk, (sm_int_ptr_t)&blkhist->cycle, &blkhist->cr); if (NULL == blkBase) { t_retry((enum cdb_sc)rdfail_detail); continue; } blkhist->blk_num = curblk; blkhist->buffaddr = blkBase; ondsk_blkver = blkhist->cr->ondsk_blkver; new_hdr = *(blk_hdr_ptr_t)blkBase; mu_reorg_upgrd_dwngrd_blktn = new_hdr.tn; mark_blk_free = FALSE; inctn_opcode = upgrade ? 
inctn_blkupgrd : inctn_blkdwngrd; } else { mark_blk_free = TRUE; inctn_opcode = inctn_blkmarkfree; } inctn_detail.blknum_struct.blknum = curblk; /* t_end assumes that the history it is passed does not contain a bitmap block. * for bitmap block, the history validation information is passed through cse instead. * therefore we need to handle bitmap and non-bitmap cases separately. */ if (!lcnt) { /* Means a bitmap block. * At this point we can do a "new_db_format != ondsk_blkver" check to determine * if the block got converted since we did the dsk_read (see the non-bitmap case * for a similar check done there), but in that case we will have a transaction * which has read 1 bitmap block and is updating no block. "t_end" currently cannot * handle this case as it expects any bitmap block that needs validation to also * have a corresponding cse which will hold its history. Hence we avoid doing the * new_db_format check. The only disadvantage of this is that we will end up * modifying the bitmap block as part of this transaction (in an attempt to convert * its ondsk_blkver) even though it is already in the right format. Since this * overhead is going to be one per bitmap block and since the block is in the cache * at this point, we should not lose much. */ assert(!mark_blk_free); BLK_ADDR(blkid_ptr, SIZEOF(block_id), block_id); *blkid_ptr = 0; t_write_map(blkhist, (unsigned char *)blkid_ptr, curr_tn, 0); assert(&alt_hist.h[0] == blkhist); alt_hist.h[0].blk_num = 0; /* create empty history for bitmap block */ assert(update_trans); } else { /* non-bitmap block. 
fill in history for validation in t_end */ assert(curblk); /* we should never come here for block 0 (bitmap) */ if (!mark_blk_free) { assert(blkhist->blk_num == curblk); assert(blkhist->buffaddr == blkBase); blkhist->tn = curr_tn; alt_hist.h[1].blk_num = 0; } /* Also need to pass the bitmap as history to detect if any concurrent M-kill * is freeing up the same USED block that we are trying to convert OR if any * concurrent M-set is reusing the same RECYCLED block that we are trying to * convert. Because of t_end currently not being able to validate a bitmap * without that simultaneously having a cse, we need to create a cse for the * bitmap that is used only for bitmap history validation, but should not be * used to update the contents of the bitmap block in bg_update. */ bmlhist.buffaddr = t_qread(curbmp, (sm_int_ptr_t)&bmlhist.cycle, &bmlhist.cr); if (NULL == bmlhist.buffaddr) { t_retry((enum cdb_sc)rdfail_detail); continue; } bmlhist.blk_num = curbmp; bmlhist.tn = curr_tn; GET_BM_STATUS(bmlhist.buffaddr, lcnt, bml_status); if (BLK_MAPINVALID == bml_status) { t_retry(cdb_sc_lostbmlcr); continue; } if (!mark_blk_free) { if ((new_db_format != ondsk_blkver) && (BLK_FREE != bml_status)) { /* block still needs to be converted. create cse */ BLK_INIT(bs_ptr, bs1); BLK_SEG(bs_ptr, blkBase + SIZEOF(new_hdr), new_hdr.bsiz - SIZEOF(new_hdr)); BLK_FINI(bs_ptr, bs1); t_write(blkhist, (unsigned char *)bs1, 0, 0, ((blk_hdr_ptr_t)blkBase)->levl, FALSE, FALSE, GDS_WRITE_PLAIN); /* The directory tree status for now is only used to determine * whether writing the block to snapshot file (see t_end_sysops.c). * For reorg upgrade/downgrade process, the block is updated in a * sequential way without changing the gv_target. In this case, we * assume the block is in directory tree so as to have it written to * the snapshot file. 
*/ BIT_SET_DIR_TREE(cw_set[cw_set_depth-1].blk_prior_state); /* reset update_trans in case previous retry had set it to 0 */ update_trans = UPDTRNS_DB_UPDATED_MASK; if (BLK_RECYCLED == bml_status) { /* If block that we are upgarding is RECYCLED, indicate to * bg_update that blks_to_upgrd counter should NOT be * touched in this case by setting "mode" to a special value */ assert(cw_set[cw_set_depth-1].mode == gds_t_write); cw_set[cw_set_depth-1].mode = gds_t_write_recycled; /* we SET block as NOT RECYCLED, otherwise, the mm_update() * or bg_update_phase2 may skip writing it to snapshot file * when its level is 0 */ BIT_CLEAR_RECYCLED(cw_set[cw_set_depth-1].blk_prior_state); } } else { /* Block got converted by another process since we did the dsk_read. * or this block became marked free in the bitmap. * No need to update this block. just call t_end for validation of * both the non-bitmap block as well as the bitmap block. * Note down that this transaction is no longer updating any blocks. */ update_trans = 0; } /* Need to put bit maps on the end of the cw set for concurrency checking. * We want to simulate t_write_map, except we want to update "cw_map_depth" * instead of "cw_set_depth". Hence the save and restore logic below. * This part of the code is similar to the one in mu_swap_blk.c */ save_cw_set_depth = cw_set_depth; assert(!cw_map_depth); t_write_map(&bmlhist, NULL, curr_tn, 0); /* will increment cw_set_depth */ cw_map_depth = cw_set_depth; /* set cw_map_depth to latest cw_set_depth */ cw_set_depth = save_cw_set_depth;/* restore cw_set_depth */ /* t_write_map simulation end */ } else { if (BLK_RECYCLED != bml_status) { /* Block was RECYCLED at beginning but no longer so. 
Retry */ t_retry(cdb_sc_bmlmod); continue; } /* Mark recycled block as FREE in bitmap */ assert(lcnt == (curblk - curbmp)); assert(update_array_ptr == update_array); *((block_id *)update_array_ptr) = lcnt; update_array_ptr += SIZEOF(block_id); /* the following assumes SIZEOF(block_id) == SIZEOF(int) */ assert(SIZEOF(block_id) == SIZEOF(int)); *(int *)update_array_ptr = 0; t_write_map(&bmlhist, (unsigned char *)update_array, curr_tn, 0); update_trans = UPDTRNS_DB_UPDATED_MASK; } } assert(SIZEOF(lcl_update_trans) == SIZEOF(update_trans)); lcl_update_trans = update_trans; /* take a copy before t_end modifies it */ if ((trans_num)0 != t_end(&alt_hist, NULL, TN_NOT_SPECIFIED)) { /* In case this is MM and t_end() remapped an extended database, reset csd */ assert(csd == cs_data); if (!lcl_update_trans) { assert(lcnt); assert(!mark_blk_free); assert((new_db_format == ondsk_blkver) || (BLK_BUSY != bml_status)); if (BLK_BUSY != bml_status) reorg_stats.blks_skipped_free++; else reorg_stats.blks_skipped_newfmtincache++; } else if (!lcnt) reorg_stats.blks_converted_bmp++; else reorg_stats.blks_converted_nonbmp++; break; } assert(csd == cs_data); } } } stop_reorg_on_this_reg: /* even though ctrl-c occurred, update file-header fields to store reorg's progress before exiting */ grab_crit(reg); blocks_left = 0; assert(csd->trans_hist.total_blks >= csd->blks_to_upgrd); actual_blks2upgrd = csd->blks_to_upgrd; total_blks = csd->trans_hist.total_blks; free_blks = csd->trans_hist.free_blocks; /* Care should be taken not to set "csd->reorg_upgrd_dwngrd_restart_block" in case of a concurrent db fmt * change. This is because let us say we are doing REORG UPGRADE. A concurrent REORG DOWNGRADE would * have reset "csd->reorg_upgrd_dwngrd_restart_block" field to 0 and if that reorg is interrupted by a * Ctrl-C (before this reorg came here) it would have updated "csd->reorg_upgrd_dwngrd_restart_block" to * a non-zero value indicating how many blocks from 0 have been downgraded. 
We should not reset this * field to "curblk" as it will be mis-interpreted as the number of blocks that have been DOWNgraded. */ set_fully_upgraded = FALSE; if (mu_reorg_upgrd_dwngrd_start_tn != csd->desired_db_format_tn) { /* csd->desired_db_format changed since reorg started. discontinue the reorg */ util_out_print("Region !AD : Desired DB Format changed during REORG. Stopping REORG.", TRUE, REG_LEN_STR(reg)); status1 = ERR_MUNOFINISH; } else if (reorg_entiredb) { /* Change "csd->reorg_upgrd_dwngrd_restart_block" only if STARTBLK or STOPBLK was NOT specified */ assert(csd->reorg_upgrd_dwngrd_restart_block <= curblk); csd->reorg_upgrd_dwngrd_restart_block = curblk; /* blocks lesser than this have been upgraded/downgraded */ expected_blks2upgrd = upgrade ? 0 : (total_blks - free_blks); blocks_left = upgrade ? actual_blks2upgrd : (expected_blks2upgrd - actual_blks2upgrd); /* If this reorg command went through all blocks in the database, then it should have * correctly concluded at this point whether the reorg is complete or not. * If this reorg command started from where a previous incomplete reorg left * (i.e. first_reorg_in_this_db_fmt is FALSE), it cannot determine if the initial * GDS blocks that it skipped are completely {up,down}graded or not. */ assert((0 == blocks_left) || (SS_NORMAL != status1) || !first_reorg_in_this_db_fmt); /* If this is a MUPIP REORG UPGRADE that did go through every block in the database (indicated by * "reorg_entiredb" && "first_reorg_in_this_db_fmt") and the current count of "blks_to_upgrd" is * 0 in the file-header and the desired_db_format did not change since the start of the REORG, * we can be sure that the entire database has been upgraded. Set "csd->fully_upgraded" to TRUE. 
*/ if ((SS_NORMAL == status1) && first_reorg_in_this_db_fmt && upgrade && (0 == actual_blks2upgrd)) { csd->fully_upgraded = TRUE; csd->db_got_to_v5_once = TRUE; set_fully_upgraded = TRUE; } /* flush all changes noted down in the file-header */ if (!wcs_flu(WCSFLU_FLUSH_HDR)) /* wcs_flu assumes gv_cur_region is set (which it is in this routine) */ { gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(6) ERR_BUFFLUFAILED, 4, LEN_AND_LIT("MUPIP REORG UPGRADE/DOWNGRADE"), DB_LEN_STR(reg)); status = ERR_MUNOFINISH; rel_crit(reg); continue; } } curr_tn = csd->trans_hist.curr_tn; rel_crit(reg); util_out_print("Region !AD : Stopped processing at block number [0x!XL]", TRUE, REG_LEN_STR(reg), curblk); /* Print statistics */ util_out_print("Region !AD : Statistics : Blocks Read From Disk (Bitmap) : 0x!XL", TRUE, REG_LEN_STR(reg), reorg_stats.blks_read_from_disk_bmp); util_out_print("Region !AD : Statistics : Blocks Skipped (Free) : 0x!XL", TRUE, REG_LEN_STR(reg), reorg_stats.blks_skipped_free); util_out_print("Region !AD : Statistics : Blocks Read From Disk (Non-Bitmap) : 0x!XL", TRUE, REG_LEN_STR(reg), reorg_stats.blks_read_from_disk_nonbmp); util_out_print("Region !AD : Statistics : Blocks Skipped (new fmt in disk) : 0x!XL", TRUE, REG_LEN_STR(reg), reorg_stats.blks_skipped_newfmtindisk); util_out_print("Region !AD : Statistics : Blocks Skipped (new fmt in cache) : 0x!XL", TRUE, REG_LEN_STR(reg), reorg_stats.blks_skipped_newfmtincache); util_out_print("Region !AD : Statistics : Blocks Converted (Bitmap) : 0x!XL", TRUE, REG_LEN_STR(reg), reorg_stats.blks_converted_bmp); util_out_print("Region !AD : Statistics : Blocks Converted (Non-Bitmap) : 0x!XL", TRUE, REG_LEN_STR(reg), reorg_stats.blks_converted_nonbmp); if (reorg_entiredb && (SS_NORMAL == status1) && (0 != blocks_left)) { /* file-header counter does not match what reorg on the entire database expected to see */ gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(4) ERR_DBBTUWRNG, 2, expected_blks2upgrd, actual_blks2upgrd); util_out_print("Region 
!AD : Run MUPIP INTEG (without FAST qualifier) to fix the counter", TRUE, REG_LEN_STR(reg)); status1 = ERR_MUNOFINISH; } else util_out_print("Region !AD : Total Blocks = [0x!XL] : Free Blocks = [0x!XL] : " "Blocks to upgrade = [0x!XL]", TRUE, REG_LEN_STR(reg), total_blks, free_blks, actual_blks2upgrd); /* Issue success or failure message for this region */ if (SS_NORMAL == status1) { /* issue success only if REORG did not encounter any error in its processing */ if (set_fully_upgraded) util_out_print("Region !AD : Database is now FULLY UPGRADED", TRUE, REG_LEN_STR(reg)); util_out_print("Region !AD : MUPIP REORG !AD finished!/", TRUE, REG_LEN_STR(reg), LEN_AND_STR(command)); send_msg_csa(CSA_ARG(csa) VARLSTCNT(7) ERR_MUREUPDWNGRDEND, 5, REG_LEN_STR(reg), process_id, process_id, &curr_tn); } else { assert(ERR_MUNOFINISH == status1); assert((SS_NORMAL == status) || (ERR_MUNOFINISH == status)); util_out_print("Region !AD : MUPIP REORG !AD incomplete. See above messages.!/", TRUE, REG_LEN_STR(reg), LEN_AND_STR(command)); status = status1; } } if (NULL != bptr) free(bptr); if (NULL != bml_lcl_buff) free(bml_lcl_buff); if (mu_ctrly_occurred || mu_ctrlc_occurred) { gtm_putmsg_csa(CSA_ARG(NULL) VARLSTCNT(1) ERR_REORGCTRLY); status = ERR_MUNOFINISH; } mupip_exit(status); }