uint4 set_jnl_file_close(set_jnl_file_close_opcode_t set_jnl_file_close_opcode)
{
	uint4	jnl_status = 0;

	cs_addrs = &FILE_INFO(gv_cur_region)->s_addrs;
	jnl_status = jnl_ensure_open();
	if (0 == jnl_status)
	{
		if (0 == cs_addrs->jnl->pini_addr)
			jnl_put_jrt_pini(cs_addrs);
		wcs_flu(WCSFLU_FLUSH_HDR | WCSFLU_WRITE_EPOCH);
		jnl_put_jrt_pfin(cs_addrs);
		jnl_file_close(gv_cur_region, TRUE, TRUE);
	} else
		gtm_putmsg(VARLSTCNT(6) jnl_status, 4, JNL_LEN_STR(cs_addrs->hdr), DB_LEN_STR(gv_cur_region));
	return jnl_status;
}
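/* Illustrative sketch (not part of this module): mu_truncate() below works in units of local bitmaps,
 * each covering BLKS_PER_LMAP database blocks.  Phase 1 scans local bitmaps from the end of the file
 * toward the front, freeing recycled blocks, and stops at the first bitmap that still holds a busy
 * block; phase 2 then shrinks the file just past the highest bitmap with a busy block.  The arithmetic
 * below mirrors that sizing logic with hypothetical stand-ins: BLKS_PER_LMAP_EX, DISK_BLOCK_SIZE_EX and
 * the start_vbn-based data offset are assumptions for the example only, not the BLKS_PER_LMAP /
 * BLK_ZERO_OFF macros the real code uses.
 */
#if 0	/* example only -- never compiled into this module */
#include <sys/types.h>

#define BLKS_PER_LMAP_EX	512	/* assumed blocks covered by one local bitmap */
#define DISK_BLOCK_SIZE_EX	512	/* assumed disk block size in bytes */

/* Candidate new total-blocks value: keep everything through the full extent of the highest bitmap
 * that still contains a busy block, capped at the current total (MIN(), as in phase 2). */
static unsigned int
example_new_total_blks(unsigned int old_total, unsigned int highest_lbm_with_busy_blk)
{
	unsigned int	new_total;

	new_total = highest_lbm_with_busy_blk + BLKS_PER_LMAP_EX;
	return (new_total < old_total) ? new_total : old_total;
}

/* Byte offset at which the file can be cut: data region start plus new_total blocks plus one trailing
 * EOF block, mirroring the "(new_total + 1)" term used when computing trunc_file_size. */
static off_t
example_trunc_file_size(unsigned int start_vbn, unsigned int blk_size, unsigned int new_total)
{
	off_t	data_start;

	data_start = (off_t)(start_vbn - 1) * DISK_BLOCK_SIZE_EX;	/* assumed layout for the example */
	return data_start + ((off_t)blk_size * (new_total + 1));
}
#endif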
boolean_t mu_truncate(int4 truncate_percent) { sgmnt_addrs *csa; sgmnt_data_ptr_t csd; int num_local_maps; int lmap_num, lmap_blk_num; int bml_status, sigkill; int save_errno; int ftrunc_status; uint4 jnl_status; uint4 old_total, new_total; uint4 old_free, new_free; uint4 end_blocks; int4 blks_in_lmap, blk; gtm_uint64_t before_trunc_file_size; off_t trunc_file_size; off_t padding; uchar_ptr_t lmap_addr; boolean_t was_crit; uint4 found_busy_blk; srch_blk_status bmphist; srch_blk_status *blkhist; srch_hist alt_hist; trans_num curr_tn; blk_hdr_ptr_t lmap_blk_hdr; block_id *blkid_ptr; unix_db_info *udi; jnl_private_control *jpc; jnl_buffer_ptr_t jbp; char *err_msg; intrpt_state_t prev_intrpt_state; off_t offset; DCL_THREADGBL_ACCESS; SETUP_THREADGBL_ACCESS; csa = cs_addrs; csd = cs_data; if (dba_mm == csd->acc_meth) { gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(4) ERR_MUTRUNCNOTBG, 2, REG_LEN_STR(gv_cur_region)); return TRUE; } if ((GDSVCURR != csd->desired_db_format) || (csd->blks_to_upgrd != 0)) { gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(4) ERR_MUTRUNCNOV4, 2, REG_LEN_STR(gv_cur_region)); return TRUE; } if (csa->ti->free_blocks < (truncate_percent * csa->ti->total_blks / 100)) { gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(5) ERR_MUTRUNCNOSPACE, 3, REG_LEN_STR(gv_cur_region), truncate_percent); return TRUE; } /* already checked for parallel truncates on this region --- see mupip_reorg.c */ gv_target = NULL; assert(csa->nl->trunc_pid == process_id); assert(dba_mm != csd->acc_meth); old_total = csa->ti->total_blks; old_free = csa->ti->free_blocks; sigkill = 0; found_busy_blk = 0; memset(&alt_hist, 0, SIZEOF(alt_hist)); /* null-initialize history */ assert(csd->bplmap == BLKS_PER_LMAP); end_blocks = old_total % BLKS_PER_LMAP; /* blocks in the last lmap (first one we start scanning) */ if (0 == end_blocks) end_blocks = BLKS_PER_LMAP; num_local_maps = DIVIDE_ROUND_UP(old_total, BLKS_PER_LMAP); /* ======================================== PHASE 1 ======================================== */ for (lmap_num = num_local_maps - 1; (lmap_num > 0 && !found_busy_blk); lmap_num--) { if (mu_ctrly_occurred || mu_ctrlc_occurred) return TRUE; assert(csa->ti->total_blks >= old_total); /* otherwise, a concurrent truncate happened... */ if (csa->ti->total_blks != old_total) /* Extend (likely called by mupip extend) -- don't truncate */ { gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(5) ERR_MUTRUNCNOSPACE, 3, REG_LEN_STR(gv_cur_region), truncate_percent); return TRUE; } lmap_blk_num = lmap_num * BLKS_PER_LMAP; if (csa->nl->highest_lbm_with_busy_blk >= lmap_blk_num) { found_busy_blk = lmap_blk_num; break; } blks_in_lmap = (lmap_num == num_local_maps - 1) ? end_blocks : BLKS_PER_LMAP; /* Loop through non-bitmap blocks of this lmap, do recycled2free */ DBGEHND((stdout, "DBG:: lmap_num = [%lu], lmap_blk_num = [%lu], blks_in_lmap = [%lu]\n", lmap_num, lmap_blk_num, blks_in_lmap)); for (blk = 1; blk < blks_in_lmap && blk != -1 && !found_busy_blk;) { t_begin(ERR_MUTRUNCFAIL, UPDTRNS_DB_UPDATED_MASK); for (;;) /* retry loop for recycled to free transactions */ { curr_tn = csd->trans_hist.curr_tn; /* Read the nth local bitmap into memory */ bmphist.blk_num = lmap_blk_num; bmphist.buffaddr = t_qread(bmphist.blk_num, &bmphist.cycle, &bmphist.cr); lmap_blk_hdr = (blk_hdr_ptr_t)bmphist.buffaddr; if (!(bmphist.buffaddr) || (BM_SIZE(BLKS_PER_LMAP) != lmap_blk_hdr->bsiz)) { /* Could not read the block successfully. Retry. 
*/ t_retry((enum cdb_sc)rdfail_detail); continue; } lmap_addr = bmphist.buffaddr + SIZEOF(blk_hdr); /* starting from the hint (blk itself), find the first busy or recycled block */ blk = bml_find_busy_recycled(blk, lmap_addr, blks_in_lmap, &bml_status); assert(blk < BLKS_PER_LMAP); if (blk == -1 || blk >= blks_in_lmap) { /* done with this lmap, continue to next */ t_abort(gv_cur_region, csa); break; } else if (BLK_BUSY == bml_status || csa->nl->highest_lbm_with_busy_blk >= lmap_blk_num) { /* stop processing blocks... skip ahead to phase 2 */ found_busy_blk = lmap_blk_num; t_abort(gv_cur_region, csa); break; } else if (BLK_RECYCLED == bml_status) { /* Write PBLK records for recycled blocks only if before_image journaling is * enabled. t_end() takes care of checking if journaling is enabled and * writing PBLK record. We have to at least mark the recycled block as free. */ RESET_UPDATE_ARRAY; update_trans = UPDTRNS_DB_UPDATED_MASK; *((block_id *)update_array_ptr) = blk; update_array_ptr += SIZEOF(block_id); *(int *)update_array_ptr = 0; alt_hist.h[1].blk_num = 0; alt_hist.h[0].level = 0; alt_hist.h[0].cse = NULL; alt_hist.h[0].tn = curr_tn; alt_hist.h[0].blk_num = lmap_blk_num + blk; alt_hist.h[0].buffaddr = t_qread(alt_hist.h[0].blk_num, &alt_hist.h[0].cycle, &alt_hist.h[0].cr); if (!alt_hist.h[0].buffaddr) { t_retry((enum cdb_sc)rdfail_detail); continue; } if (!t_recycled2free(&alt_hist.h[0])) { t_retry(cdb_sc_lostbmlcr); continue; } t_write_map(&bmphist, (unsigned char *)update_array, curr_tn, 0); /* Set the opcode for INCTN record written by t_end() */ inctn_opcode = inctn_blkmarkfree; if ((trans_num)0 == t_end(&alt_hist, NULL, TN_NOT_SPECIFIED)) continue; /* block processed, scan from the next one */ blk++; break; } else { assert(t_tries < CDB_STAGNATE); t_retry(cdb_sc_badbitmap); continue; } } /* END recycled2free retry loop */ } /* END scanning blocks of this particular lmap */ /* Write PBLK for the bitmap block, in case it hasn't been written i.e. t_end() was never called above */ /* Do a transaction that just increments the bitmap block's tn so that t_end() can do its thing */ DBGEHND((stdout, "DBG:: bitmap block inctn -- lmap_blk_num = [%lu]\n", lmap_blk_num)); t_begin(ERR_MUTRUNCFAIL, UPDTRNS_DB_UPDATED_MASK); for (;;) { RESET_UPDATE_ARRAY; BLK_ADDR(blkid_ptr, SIZEOF(block_id), block_id); *blkid_ptr = 0; update_trans = UPDTRNS_DB_UPDATED_MASK; inctn_opcode = inctn_mu_reorg; /* inctn_mu_truncate */ curr_tn = csd->trans_hist.curr_tn; blkhist = &alt_hist.h[0]; blkhist->blk_num = lmap_blk_num; blkhist->tn = curr_tn; blkhist->cse = NULL; /* start afresh (do not use value from previous retry) */ /* Read the nth local bitmap into memory */ blkhist->buffaddr = t_qread(lmap_blk_num, (sm_int_ptr_t)&blkhist->cycle, &blkhist->cr); lmap_blk_hdr = (blk_hdr_ptr_t)blkhist->buffaddr; if (!(blkhist->buffaddr) || (BM_SIZE(BLKS_PER_LMAP) != lmap_blk_hdr->bsiz)) { /* Could not read the block successfully. Retry. 
*/ t_retry((enum cdb_sc)rdfail_detail); continue; } t_write_map(blkhist, (unsigned char *)blkid_ptr, curr_tn, 0); blkhist->blk_num = 0; /* create empty history for bitmap block */ if ((trans_num)0 == t_end(&alt_hist, NULL, TN_NOT_SPECIFIED)) continue; break; } } /* END scanning lmaps */ /* ======================================== PHASE 2 ======================================== */ assert(!csa->now_crit); for (;;) { /* wait for FREEZE, we don't want to truncate a frozen database */ grab_crit(gv_cur_region); if (FROZEN_CHILLED(cs_data)) DO_CHILLED_AUTORELEASE(csa, cs_data); if (!FROZEN(cs_data) && !IS_REPL_INST_FROZEN) break; rel_crit(gv_cur_region); while (FROZEN(cs_data) || IS_REPL_INST_FROZEN) { hiber_start(1000); if (FROZEN_CHILLED(cs_data) && CHILLED_AUTORELEASE(cs_data)) break; } } assert(csa->nl->trunc_pid == process_id); /* Flush pending updates to disk. If this is not done, old updates can be flushed AFTER ftruncate, extending the file. */ if (!wcs_flu(WCSFLU_FLUSH_HDR | WCSFLU_WRITE_EPOCH | WCSFLU_MSYNC_DB)) { assert(FALSE); gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(6) ERR_BUFFLUFAILED, 4, LEN_AND_LIT("MUPIP REORG TRUNCATE"), DB_LEN_STR(gv_cur_region)); rel_crit(gv_cur_region); return FALSE; } csa->nl->highest_lbm_with_busy_blk = MAX(found_busy_blk, csa->nl->highest_lbm_with_busy_blk); assert(IS_BITMAP_BLK(csa->nl->highest_lbm_with_busy_blk)); new_total = MIN(old_total, csa->nl->highest_lbm_with_busy_blk + BLKS_PER_LMAP); if (mu_ctrly_occurred || mu_ctrlc_occurred) { rel_crit(gv_cur_region); return TRUE; } else if (csa->ti->total_blks != old_total || new_total == old_total) { assert(csa->ti->total_blks >= old_total); /* Better have been an extend, not a truncate... */ gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(5) ERR_MUTRUNCNOSPACE, 3, REG_LEN_STR(gv_cur_region), truncate_percent); rel_crit(gv_cur_region); return TRUE; } else if (GDSVCURR != csd->desired_db_format || csd->blks_to_upgrd != 0 || !csd->fully_upgraded) { gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(4) ERR_MUTRUNCNOV4, 2, REG_LEN_STR(gv_cur_region)); rel_crit(gv_cur_region); return TRUE; } else if (SNAPSHOTS_IN_PROG(csa->nl)) { gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(4) ERR_MUTRUNCSSINPROG, 2, REG_LEN_STR(gv_cur_region)); rel_crit(gv_cur_region); return TRUE; } else if (BACKUP_NOT_IN_PROGRESS != cs_addrs->nl->nbb) { gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(4) ERR_MUTRUNCBACKINPROG, 2, REG_LEN_STR(gv_cur_region)); rel_crit(gv_cur_region); return TRUE; } DEFER_INTERRUPTS(INTRPT_IN_TRUNC, prev_intrpt_state); if (JNL_ENABLED(csa)) { /* Write JRT_TRUNC and INCTN records */ if (!jgbl.dont_reset_gbl_jrec_time) SET_GBL_JREC_TIME; /* needed before jnl_ensure_open as that can write jnl records */ jpc = csa->jnl; jbp = jpc->jnl_buff; /* Before writing to jnlfile, adjust jgbl.gbl_jrec_time if needed to maintain time order * of jnl records. This needs to be done BEFORE the jnl_ensure_open as that could write * journal records (if it decides to switch to a new journal file). 
*/ ADJUST_GBL_JREC_TIME(jgbl, jbp); jnl_status = jnl_ensure_open(gv_cur_region, csa); if (SS_NORMAL != jnl_status) send_msg_csa(CSA_ARG(csa) VARLSTCNT(6) jnl_status, 4, JNL_LEN_STR(csd), DB_LEN_STR(gv_cur_region)); else { if (0 == jpc->pini_addr) jnl_put_jrt_pini(csa); jnl_write_trunc_rec(csa, old_total, csa->ti->free_blocks, new_total); inctn_opcode = inctn_mu_reorg; jnl_write_inctn_rec(csa); jnl_status = jnl_flush(gv_cur_region); if (SS_NORMAL != jnl_status) { send_msg_csa(CSA_ARG(csa) VARLSTCNT(9) ERR_JNLFLUSH, 2, JNL_LEN_STR(csd), ERR_TEXT, 2, RTS_ERROR_TEXT("Error with journal flush during mu_truncate"), jnl_status); assert(NOJNL == jpc->channel); /* jnl file lost has been triggered */ } } } /* Good to go ahead and REALLY truncate (reduce total_blks, clear cache_array, FTRUNCATE) */ curr_tn = csa->ti->curr_tn; CHECK_TN(csa, csd, curr_tn); udi = FILE_INFO(gv_cur_region); /* Information used by recover_truncate to check if the file size and csa->ti->total_blks are INCONSISTENT */ trunc_file_size = BLK_ZERO_OFF(csd->start_vbn) + ((off_t)csd->blk_size * (new_total + 1)); csd->after_trunc_total_blks = new_total; csd->before_trunc_free_blocks = csa->ti->free_blocks; csd->before_trunc_total_blks = old_total; /* Flags interrupted truncate for recover_truncate */ /* file size and total blocks: INCONSISTENT */ csa->ti->total_blks = new_total; /* past the point of no return -- shared memory intact */ assert(csa->ti->free_blocks >= DELTA_FREE_BLOCKS(old_total, new_total)); csa->ti->free_blocks -= DELTA_FREE_BLOCKS(old_total, new_total); new_free = csa->ti->free_blocks; KILL_TRUNC_TEST(WBTEST_CRASH_TRUNCATE_1); /* 55 : Issue a kill -9 before 1st fsync */ fileheader_sync(gv_cur_region); DB_FSYNC(gv_cur_region, udi, csa, db_fsync_in_prog, save_errno); CHECK_DBSYNC(gv_cur_region, save_errno); /* past the point of no return -- shared memory deleted */ KILL_TRUNC_TEST(WBTEST_CRASH_TRUNCATE_2); /* 56 : Issue a kill -9 after 1st fsync */ clear_cache_array(csa, csd, gv_cur_region, new_total, old_total); offset = (off_t)BLK_ZERO_OFF(csd->start_vbn) + (off_t)new_total * csd->blk_size; save_errno = db_write_eof_block(udi, udi->fd, csd->blk_size, offset, &(TREF(dio_buff))); if (0 != save_errno) { err_msg = (char *)STRERROR(errno); rts_error_csa(CSA_ARG(csa) VARLSTCNT(6) ERR_MUTRUNCERROR, 4, REG_LEN_STR(gv_cur_region), LEN_AND_STR(err_msg)); return FALSE; } KILL_TRUNC_TEST(WBTEST_CRASH_TRUNCATE_3); /* 57 : Issue a kill -9 after reducing csa->ti->total_blks, before FTRUNCATE */ /* Execute an ftruncate() and truncate the DB file * ftruncate() is a SYSTEM CALL on almost all platforms (except SunOS) * It ignores kill -9 signal till its operation is completed. * So we can safely assume that the result of ftruncate() will be complete. */ FTRUNCATE(FILE_INFO(gv_cur_region)->fd, trunc_file_size, ftrunc_status); if (0 != ftrunc_status) { err_msg = (char *)STRERROR(errno); rts_error_csa(CSA_ARG(csa) VARLSTCNT(6) ERR_MUTRUNCERROR, 4, REG_LEN_STR(gv_cur_region), LEN_AND_STR(err_msg)); /* should go through recover_truncate now, which will again try to FTRUNCATE */ return FALSE; } /* file size and total blocks: CONSISTENT (shrunk) */ KILL_TRUNC_TEST(WBTEST_CRASH_TRUNCATE_4); /* 58 : Issue a kill -9 after FTRUNCATE, before 2nd fsync */ csa->nl->root_search_cycle++; /* Force concurrent processes to restart in t_end/tp_tend to make sure no one * tries to commit updates past the end of the file. Bitmap validations together * with highest_lbm_with_busy_blk should actually be sufficient, so this is * just to be safe. 
*/ csd->before_trunc_total_blks = 0; /* indicate CONSISTENT */ /* Increment TN */ assert(csa->ti->early_tn == csa->ti->curr_tn); csd->trans_hist.early_tn = csd->trans_hist.curr_tn + 1; INCREMENT_CURR_TN(csd); fileheader_sync(gv_cur_region); DB_FSYNC(gv_cur_region, udi, csa, db_fsync_in_prog, save_errno); KILL_TRUNC_TEST(WBTEST_CRASH_TRUNCATE_5); /* 58 : Issue a kill -9 after 2nd fsync */ CHECK_DBSYNC(gv_cur_region, save_errno); ENABLE_INTERRUPTS(INTRPT_IN_TRUNC, prev_intrpt_state); curr_tn = csa->ti->curr_tn; rel_crit(gv_cur_region); send_msg_csa(CSA_ARG(csa) VARLSTCNT(7) ERR_MUTRUNCSUCCESS, 5, DB_LEN_STR(gv_cur_region), old_total, new_total, &curr_tn); util_out_print("Truncated region: !AD. Reduced total blocks from [!UL] to [!UL]. Reduced free blocks from [!UL] to [!UL].", FLUSH, REG_LEN_STR(gv_cur_region), old_total, new_total, old_free, new_free); return TRUE; } /* END of mu_truncate() */
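/* Illustrative sketch (not part of this module): jnl_file_extend() below sizes an extension by
 * converting the transaction's journal-record requirement into disk blocks and rounding that up to a
 * multiple of the journal extension quantum (jnl_deq); if adding those blocks to the current virtual
 * file size would cross the autoswitch limit, it switches to a new journal file instead of extending
 * the current one.  The helpers here are assumptions for the example (example_* names,
 * DISK_BLOCK_SIZE_EX), not the DIVIDE_ROUND_UP/ROUND_UP macros the real code uses.
 */
#if 0	/* example only -- never compiled into this module */
#define DISK_BLOCK_SIZE_EX	512	/* assumed disk block size in bytes */

static unsigned int
example_div_round_up(unsigned int value, unsigned int quantum)
{
	return (value + quantum - 1) / quantum;
}

static unsigned int
example_round_up(unsigned int value, unsigned int quantum)
{
	return example_div_round_up(value, quantum) * quantum;
}

/* Blocks to add for a transaction needing total_jnl_rec_size bytes, given extension quantum jnl_deq. */
static unsigned int
example_extension_blocks(unsigned int total_jnl_rec_size, unsigned int jnl_deq)
{
	return example_round_up(example_div_round_up(total_jnl_rec_size, DISK_BLOCK_SIZE_EX), jnl_deq);
}

/* Non-zero if extending by new_blocks would push the file past the autoswitch limit, i.e. the code
 * should create and cut over to a new journal file rather than extend the current one. */
static int
example_needs_autoswitch(unsigned int cur_filesize, unsigned int new_blocks, unsigned int autoswitchlimit)
{
	return (cur_filesize + new_blocks) > autoswitchlimit;
}
#endif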
int jnl_file_extend(jnl_private_control *jpc, uint4 total_jnl_rec_size) { file_control *fc; boolean_t need_extend; jnl_buffer_ptr_t jb; jnl_create_info jnl_info; jnl_file_header header; uint4 new_alq; sgmnt_addrs *csa; sgmnt_data_ptr_t csd; char prev_jnl_fn[JNL_NAME_SIZE]; uint4 jnl_status = 0, status; int new_blocks, result; GTM_BAVAIL_TYPE avail_blocks; uint4 aligned_tot_jrec_size, count; switch(jpc->region->dyn.addr->acc_meth) { case dba_mm: case dba_bg: csa = &FILE_INFO(jpc->region)->s_addrs; break; default: GTMASSERT; } csd = csa->hdr; assert(csa == cs_addrs && csd == cs_data); assert(csa->now_crit || (csd->clustered && (CCST_CLOSED == csa->nl->ccp_state))); assert(jpc->region == gv_cur_region); assert(csa->jnl_state == csd->jnl_state); if (!JNL_ENABLED(csa) || (NOJNL == jpc->channel) || (JNL_FILE_SWITCHED(jpc))) GTMASSERT; /* crit and messing with the journal file - how could it have vanished? */ if (!csd->jnl_deq) { assert(DIVIDE_ROUND_UP(total_jnl_rec_size, DISK_BLOCK_SIZE) <= csd->jnl_alq); assert(csd->jnl_alq == csd->autoswitchlimit); new_blocks = csd->jnl_alq; } else /* May cause extension of csd->jnl_deq * n blocks where n > 0 */ new_blocks = ROUND_UP(DIVIDE_ROUND_UP(total_jnl_rec_size, DISK_BLOCK_SIZE), csd->jnl_deq); jpc->status = SS_NORMAL; jb = jpc->jnl_buff; assert(0 <= new_blocks); DEBUG_ONLY(count = 0); for (need_extend = (0 != new_blocks); need_extend; ) { DEBUG_ONLY(count++); /* usually we will do the loop just once where we do the file extension. * rarely we might need to do an autoswitch instead after which again rarely * we might need to do an extension on the new journal to fit in the transaction's journal requirements. * therefore we should do this loop a maximum of twice. hence the assert below. */ assert(count <= 2); need_extend = FALSE; if (SS_NORMAL == (status = disk_block_available(jpc->channel, &avail_blocks, TRUE))) { if ((new_blocks * EXTEND_WARNING_FACTOR) > avail_blocks) { if (new_blocks > avail_blocks) { /* if we cannot satisfy the request, it is an error */ send_msg(VARLSTCNT(6) ERR_NOSPACEEXT, 4, JNL_LEN_STR(csd), new_blocks, avail_blocks); new_blocks = 0; jpc->status = SS_NORMAL; break; } else send_msg(VARLSTCNT(5) ERR_DSKSPACEFLOW, 3, JNL_LEN_STR(csd), (avail_blocks - new_blocks)); } } else send_msg(VARLSTCNT(5) ERR_JNLFILEXTERR, 2, JNL_LEN_STR(csd), status); new_alq = jb->filesize + new_blocks; /* ensure current journal file size is well within autoswitchlimit --> design constraint */ assert(csd->autoswitchlimit >= jb->filesize); if (csd->autoswitchlimit < (jb->filesize + (EXTEND_WARNING_FACTOR * new_blocks))) /* close to max */ send_msg(VARLSTCNT(5) ERR_JNLSPACELOW, 3, JNL_LEN_STR(csd), csd->autoswitchlimit - jb->filesize); if (csd->autoswitchlimit < new_alq) { /* Reached max, need to autoswitch */ /* Ensure new journal file can hold the entire current transaction's journal record requirements */ assert(csd->autoswitchlimit >= MAX_REQD_JNL_FILE_SIZE(total_jnl_rec_size)); memset(&jnl_info, 0, sizeof(jnl_info)); jnl_info.prev_jnl = &prev_jnl_fn[0]; set_jnl_info(gv_cur_region, &jnl_info); assert(JNL_ENABLED(csa) && (NOJNL != jpc->channel) && !(JNL_FILE_SWITCHED(jpc))); jnl_status = jnl_ensure_open(); if (0 == jnl_status) { /* flush the cache and jnl-buffer-contents to current journal file before * switching to a new journal. 
*/ wcs_flu(WCSFLU_FLUSH_HDR | WCSFLU_WRITE_EPOCH); jnl_file_close(gv_cur_region, TRUE, TRUE); } else rts_error(VARLSTCNT(7) jnl_status, 4, JNL_LEN_STR(csd), DB_LEN_STR(gv_cur_region), jpc->status); assert(!jgbl.forw_phase_recovery || (NULL != jgbl.mur_pini_addr_reset_fnptr)); if (jgbl.forw_phase_recovery && (NULL != jgbl.mur_pini_addr_reset_fnptr)) (*jgbl.mur_pini_addr_reset_fnptr)(); assert(!jnl_info.no_rename); assert(!jnl_info.no_prev_link); if (EXIT_NRM == cre_jnl_file(&jnl_info)) { assert(0 == memcmp(csd->jnl_file_name, jnl_info.jnl, jnl_info.jnl_len)); assert(csd->jnl_file_name[jnl_info.jnl_len] == '\0'); assert(csd->jnl_file_len == jnl_info.jnl_len); assert(csd->jnl_buffer_size == jnl_info.buffer); assert(csd->jnl_alq == jnl_info.alloc); assert(csd->jnl_deq == jnl_info.extend); assert(csd->jnl_before_image == jnl_info.before_images); csd->trans_hist.header_open_tn = jnl_info.tn; /* needed for successful jnl_file_open() */ send_msg(VARLSTCNT(4) ERR_NEWJNLFILECREATE, 2, JNL_LEN_STR(csd)); fc = gv_cur_region->dyn.addr->file_cntl; fc->op = FC_WRITE; fc->op_buff = (sm_uc_ptr_t)csd; status = dbfilop(fc); if (SS_NORMAL != status) send_msg(VARLSTCNT(5) ERR_DBFILERR, 2, DB_LEN_STR(gv_cur_region), status); assert(JNL_ENABLED(csa)); /* call jnl_ensure_open instead of jnl_file_open to make sure jpc->pini_addr is set to 0 */ jnl_status = jnl_ensure_open(); /* sets jpc->status */ if (0 != jnl_status) rts_error(VARLSTCNT(6) jnl_status, 4, JNL_LEN_STR(csd), DB_LEN_STR(gv_cur_region)); assert(jb->filesize == csd->jnl_alq); aligned_tot_jrec_size = ALIGNED_ROUND_UP(MAX_REQD_JNL_FILE_SIZE(total_jnl_rec_size), csd->jnl_alq, csd->jnl_deq); if (aligned_tot_jrec_size > csd->jnl_alq) { /* need to extend more than initial allocation in the new journal file * to accommodate the current transaction. */ new_blocks = aligned_tot_jrec_size - csd->jnl_alq; assert(new_blocks); assert(0 == new_blocks % csd->jnl_deq); need_extend = TRUE; } } else { send_msg(VARLSTCNT(4) ERR_JNLCREATERR, 2, JNL_LEN_STR(csd)); jpc->status = ERR_JNLNOCREATE; new_blocks = -1; } } else { assert(!need_extend); /* ensure we won't go through the for loop again */ /* Virtually extend currently used journal file */ jb->filesize = new_alq; /* Actually this is virtual file size blocks */ DO_FILE_READ(jpc->channel, 0, &header, JNL_HDR_LEN, jpc->status, jpc->status2); if (SS_NORMAL != jpc->status) rts_error(VARLSTCNT(5) ERR_JNLRDERR, 2, JNL_LEN_STR(csd), jpc->status); assert((header.virtual_size + new_blocks) == new_alq); header.virtual_size = new_alq; DO_FILE_WRITE(jpc->channel, 0, &header, JNL_HDR_LEN, jpc->status, jpc->status2); if (SS_NORMAL != jpc->status) rts_error(VARLSTCNT(5) ERR_JNLWRERR, 2, JNL_LEN_STR(csd), jpc->status); } if (0 >= new_blocks) break; } if (0 >= new_blocks) { jpc->status = ERR_JNLREADEOF; jnl_file_lost(jpc, ERR_JNLEXTEND); new_blocks = -1; } return new_blocks; }
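/* Illustrative sketch (not part of this module): in its standalone path, mupip_set_file() below opens
 * the database file directly, reads the file header into a private buffer (LSEEKREAD at offset 0),
 * edits the requested fields, and writes the header back (LSEEKWRITE) before resetting the IPCs.  The
 * snippet shows that read-modify-write shape using plain POSIX pread()/pwrite(); example_hdr_t and the
 * example_* functions are assumptions for the example, not the real sgmnt_data layout or the GT.M
 * I/O macros.
 */
#if 0	/* example only -- never compiled into this module */
#include <fcntl.h>
#include <unistd.h>

typedef struct
{
	char		label[12];		/* placeholder fields for the example */
	unsigned int	reserved_bytes;
	unsigned int	extension_size;
} example_hdr_t;

/* Read the header from offset 0; returns 0 on success, -1 on a short read or I/O error. */
static int
example_read_hdr(int fd, example_hdr_t *hdr)
{
	ssize_t	nread;

	nread = pread(fd, hdr, sizeof(*hdr), 0);
	return (nread == (ssize_t)sizeof(*hdr)) ? 0 : -1;
}

/* Write the (possibly modified) header back to offset 0; returns 0 on success, -1 otherwise. */
static int
example_write_hdr(int fd, const example_hdr_t *hdr)
{
	ssize_t	nwrit;

	nwrit = pwrite(fd, hdr, sizeof(*hdr), 0);
	return (nwrit == (ssize_t)sizeof(*hdr)) ? 0 : -1;
}
#endif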
int4 mupip_set_file(int db_fn_len, char *db_fn) { bool got_standalone; boolean_t bypass_partial_recov, need_standalone = FALSE; char acc_spec[MAX_ACC_METH_LEN], ver_spec[MAX_DB_VER_LEN], exit_stat, *fn; unsigned short acc_spec_len = MAX_ACC_METH_LEN, ver_spec_len = MAX_DB_VER_LEN; int fd, fn_len; int4 status; int4 status1; int glbl_buff_status, defer_status, rsrvd_bytes_status, extn_count_status, lock_space_status, disk_wait_status; int4 new_disk_wait, new_cache_size, new_extn_count, new_lock_space, reserved_bytes, defer_time; sgmnt_data_ptr_t csd; tp_region *rptr, single; enum db_acc_method access, access_new; enum db_ver desired_dbver; gd_region *temp_cur_region; char *errptr, *command = "MUPIP SET VERSION"; int save_errno; error_def(ERR_DBPREMATEOF); error_def(ERR_DBRDERR); error_def(ERR_DBRDONLY); error_def(ERR_INVACCMETHOD); error_def(ERR_MUNOACTION); error_def(ERR_RBWRNNOTCHG); error_def(ERR_WCERRNOTCHG); error_def(ERR_WCWRNNOTCHG); error_def(ERR_MMNODYNDWNGRD); exit_stat = EXIT_NRM; defer_status = cli_present("DEFER_TIME"); if (defer_status) need_standalone = TRUE; bypass_partial_recov = cli_present("PARTIAL_RECOV_BYPASS") == CLI_PRESENT; if (bypass_partial_recov) need_standalone = TRUE; if (disk_wait_status = cli_present("WAIT_DISK")) { if (cli_get_int("WAIT_DISK", &new_disk_wait)) { if (new_disk_wait < 0) { util_out_print("!UL negative, minimum WAIT_DISK allowed is 0.", TRUE, new_disk_wait); return (int4)ERR_WCWRNNOTCHG; } need_standalone = TRUE; } else { util_out_print("Error getting WAIT_DISK qualifier value", TRUE); return (int4)ERR_WCWRNNOTCHG; } } if (glbl_buff_status = cli_present("GLOBAL_BUFFERS")) { if (cli_get_int("GLOBAL_BUFFERS", &new_cache_size)) { if (new_cache_size > WC_MAX_BUFFS) { util_out_print("!UL too large, maximum write cache buffers allowed is !UL", TRUE, new_cache_size, WC_MAX_BUFFS); return (int4)ERR_WCWRNNOTCHG; } if (new_cache_size < WC_MIN_BUFFS) { util_out_print("!UL too small, minimum cache buffers allowed is !UL", TRUE, new_cache_size, WC_MIN_BUFFS); return (int4)ERR_WCWRNNOTCHG; } } else { util_out_print("Error getting GLOBAL BUFFER qualifier value", TRUE); return (int4)ERR_WCWRNNOTCHG; } need_standalone = TRUE; } /* EXTENSION_COUNT does not require standalone access and hence need_standalone will not be set to TRUE for this. 
*/ if (extn_count_status = cli_present("EXTENSION_COUNT")) { if (cli_get_int("EXTENSION_COUNT", &new_extn_count)) { if (new_extn_count > MAX_EXTN_COUNT) { util_out_print("!UL too large, maximum extension count allowed is !UL", TRUE, new_extn_count, MAX_EXTN_COUNT); return (int4)ERR_WCWRNNOTCHG; } if (new_extn_count < MIN_EXTN_COUNT) { util_out_print("!UL too small, minimum extension count allowed is !UL", TRUE, new_extn_count, MIN_EXTN_COUNT); return (int4)ERR_WCWRNNOTCHG; } } else { util_out_print("Error getting EXTENSION COUNT qualifier value", TRUE); return (int4)ERR_WCWRNNOTCHG; } } if (lock_space_status = cli_present("LOCK_SPACE")) { if (cli_get_int("LOCK_SPACE", &new_lock_space)) { if (new_lock_space > MAX_LOCK_SPACE) { util_out_print("!UL too large, maximum lock space allowed is !UL", TRUE, new_lock_space, MAX_LOCK_SPACE); return (int4)ERR_WCWRNNOTCHG; } else if (new_lock_space < MIN_LOCK_SPACE) { util_out_print("!UL too small, minimum lock space allowed is !UL", TRUE, new_lock_space, MIN_LOCK_SPACE); return (int4)ERR_WCWRNNOTCHG; } } else { util_out_print("Error getting LOCK_SPACE qualifier value", TRUE); return (int4)ERR_WCWRNNOTCHG; } need_standalone = TRUE; } if (rsrvd_bytes_status = cli_present("RESERVED_BYTES")) { if (!cli_get_int("RESERVED_BYTES", &reserved_bytes)) { util_out_print("Error getting RESERVED BYTES qualifier value", TRUE); return (int4)ERR_RBWRNNOTCHG; } need_standalone = TRUE; } if (cli_present("ACCESS_METHOD")) { cli_get_str("ACCESS_METHOD", acc_spec, &acc_spec_len); cli_strupper(acc_spec); if (0 == memcmp(acc_spec, "MM", acc_spec_len)) access = dba_mm; else if (0 == memcmp(acc_spec, "BG", acc_spec_len)) access = dba_bg; else mupip_exit(ERR_INVACCMETHOD); need_standalone = TRUE; } else access = n_dba; /* really want to keep current method, which has not yet been read */ if (cli_present("VERSION")) { assert(!need_standalone); cli_get_str("VERSION", ver_spec, &ver_spec_len); cli_strupper(ver_spec); if (0 == memcmp(ver_spec, "V4", ver_spec_len)) desired_dbver = GDSV4; else if (0 == memcmp(ver_spec, "V5", ver_spec_len)) desired_dbver = GDSV5; else GTMASSERT; /* CLI should prevent us ever getting here */ } else desired_dbver = GDSVLAST; /* really want to keep version, which has not yet been read */ if (region) rptr = grlist; else { rptr = &single; memset(&single, 0, sizeof(single)); } csd = (sgmnt_data *)malloc(ROUND_UP(sizeof(sgmnt_data), DISK_BLOCK_SIZE)); in_backup = FALSE; /* Only want yes/no from mupfndfil, not an address */ for (; rptr != NULL; rptr = rptr->fPtr) { if (region) { if (dba_usr == rptr->reg->dyn.addr->acc_meth) { util_out_print("!/Region !AD is not a GDS access type", TRUE, REG_LEN_STR(rptr->reg)); exit_stat |= EXIT_WRN; continue; } if (!mupfndfil(rptr->reg, NULL)) continue; fn = (char *)rptr->reg->dyn.addr->fname; fn_len = rptr->reg->dyn.addr->fname_len; } else { fn = db_fn; fn_len = db_fn_len; } mu_gv_cur_reg_init(); strcpy((char *)gv_cur_region->dyn.addr->fname, fn); gv_cur_region->dyn.addr->fname_len = fn_len; if (!need_standalone) { gvcst_init(gv_cur_region); change_reg(); /* sets cs_addrs and cs_data */ if (gv_cur_region->read_only) { gtm_putmsg(VARLSTCNT(4) ERR_DBRDONLY, 2, DB_LEN_STR(gv_cur_region)); exit_stat |= EXIT_ERR; gds_rundown(); mu_gv_cur_reg_free(); continue; } grab_crit(gv_cur_region); status = EXIT_NRM; access_new = (n_dba == access ? cs_data->acc_meth : access); /* recalculate; n_dba is a proxy for no change */ change_fhead_timer("FLUSH_TIME", cs_data->flush_time, (dba_bg == access_new ? 
TIM_FLU_MOD_BG : TIM_FLU_MOD_MM), FALSE); if (GDSVLAST != desired_dbver) { if ((dba_mm != access_new) || (GDSV4 != desired_dbver)) status1 = desired_db_format_set(gv_cur_region, desired_dbver, command); else { status1 = ERR_MMNODYNDWNGRD; gtm_putmsg(VARLSTCNT(4) status1, 2, REG_LEN_STR(gv_cur_region)); } if (SS_NORMAL != status1) { /* "desired_db_format_set" would have printed appropriate error messages */ if (ERR_MUNOACTION != status1) { /* real error occurred while setting the db format. skip to next region */ status = EXIT_ERR; } } } if (EXIT_NRM == status) { if (extn_count_status) cs_data->extension_size = (uint4)new_extn_count; wcs_flu(WCSFLU_FLUSH_HDR); if (extn_count_status) util_out_print("Database file !AD now has extension count !UL", TRUE, fn_len, fn, cs_data->extension_size); if (GDSVLAST != desired_dbver) util_out_print("Database file !AD now has desired DB format !AD", TRUE, fn_len, fn, LEN_AND_STR(gtm_dbversion_table[cs_data->desired_db_format])); } else exit_stat |= status; rel_crit(gv_cur_region); gds_rundown(); } else { /* Following part needs standalone access */ assert(GDSVLAST == desired_dbver); got_standalone = mu_rndwn_file(gv_cur_region, TRUE); if (FALSE == got_standalone) return (int4)ERR_WCERRNOTCHG; /* we should open it (for changing) after mu_rndwn_file, since mu_rndwn_file changes the file header too */ if (-1 == (fd = OPEN(fn, O_RDWR))) { save_errno = errno; errptr = (char *)STRERROR(save_errno); util_out_print("open : !AZ", TRUE, errptr); exit_stat |= EXIT_ERR; db_ipcs_reset(gv_cur_region, FALSE); mu_gv_cur_reg_free(); continue; } LSEEKREAD(fd, 0, csd, sizeof(sgmnt_data), status); if (0 != status) { save_errno = errno; PERROR("Error reading header of file"); errptr = (char *)STRERROR(save_errno); util_out_print("read : !AZ", TRUE, errptr); util_out_print("Error reading header of file", TRUE); util_out_print("Database file !AD not changed: ", TRUE, fn_len, fn); if (-1 != status) rts_error(VARLSTCNT(4) ERR_DBRDERR, 2, fn_len, fn); else rts_error(VARLSTCNT(4) ERR_DBPREMATEOF, 2, fn_len, fn); } if (rsrvd_bytes_status) { if (reserved_bytes > MAX_RESERVE_B(csd)) { util_out_print("!UL too large, maximum reserved bytes allowed is !UL for database file !AD", TRUE, reserved_bytes, MAX_RESERVE_B(csd), fn_len, fn); close(fd); db_ipcs_reset(gv_cur_region, FALSE); return (int4)ERR_RBWRNNOTCHG; } csd->reserved_bytes = reserved_bytes; } access_new = (n_dba == access ? csd->acc_meth : access); /* recalculate; n_dba is a proxy for no change */ change_fhead_timer("FLUSH_TIME", csd->flush_time, (dba_bg == access_new ? 
TIM_FLU_MOD_BG : TIM_FLU_MOD_MM), FALSE); if ((n_dba != access) && (csd->acc_meth != access)) /* n_dba is a proxy for no change */ { if (dba_mm == access) csd->defer_time = 1; /* defer defaults to 1 */ csd->acc_meth = access; if (0 == csd->n_bts) { csd->n_bts = WC_DEF_BUFFS; csd->bt_buckets = getprime(csd->n_bts); } } if (glbl_buff_status) { csd->n_bts = BT_FACTOR(new_cache_size); csd->bt_buckets = getprime(csd->n_bts); csd->n_wrt_per_flu = 7; csd->flush_trigger = FLUSH_FACTOR(csd->n_bts); } if (disk_wait_status) csd->wait_disk_space = new_disk_wait; if (extn_count_status) csd->extension_size = (uint4)new_extn_count; if (lock_space_status) csd->lock_space_size = (uint4)new_lock_space * OS_PAGELET_SIZE; if (bypass_partial_recov) { csd->file_corrupt = FALSE; util_out_print("Database file !AD now has partial recovery flag set to !UL(FALSE) ", TRUE, fn_len, fn, csd->file_corrupt); } if (dba_mm == access_new) { if (CLI_NEGATED == defer_status) csd->defer_time = 0; else if (CLI_PRESENT == defer_status) { if (!cli_get_num("DEFER_TIME", &defer_time)) { util_out_print("Error getting DEFER_TIME qualifier value", TRUE); db_ipcs_reset(gv_cur_region, FALSE); return (int4)ERR_RBWRNNOTCHG; } if (-1 > defer_time) { util_out_print("DEFER_TIME cannot take negative values less than -1", TRUE); util_out_print("Database file !AD not changed", TRUE, fn_len, fn); exit_stat |= EXIT_WRN; db_ipcs_reset(gv_cur_region, FALSE); mu_gv_cur_reg_free(); continue; } csd->defer_time = defer_time; } if (csd->blks_to_upgrd) { util_out_print("MM access method cannot be set if there are blocks to upgrade", TRUE); util_out_print("Database file !AD not changed", TRUE, fn_len, fn); exit_stat |= EXIT_WRN; db_ipcs_reset(gv_cur_region, FALSE); mu_gv_cur_reg_free(); continue; } if (GDSVCURR != csd->desired_db_format) { util_out_print("MM access method cannot be set in DB compatibility mode", TRUE); util_out_print("Database file !AD not changed", TRUE, fn_len, fn); exit_stat |= EXIT_WRN; db_ipcs_reset(gv_cur_region, FALSE); mu_gv_cur_reg_free(); continue; } if (JNL_ENABLED(csd) && csd->jnl_before_image) { util_out_print("MM access method cannot be set with BEFORE image journaling", TRUE); util_out_print("Database file !AD not changed", TRUE, fn_len, fn); exit_stat |= EXIT_WRN; db_ipcs_reset(gv_cur_region, FALSE); mu_gv_cur_reg_free(); continue; } csd->jnl_before_image = FALSE; } else { if (defer_status) { util_out_print("DEFER cannot be specified with BG access method.", TRUE); util_out_print("Database file !AD not changed", TRUE, fn_len, fn); exit_stat |= EXIT_WRN; db_ipcs_reset(gv_cur_region, FALSE); mu_gv_cur_reg_free(); continue; } } LSEEKWRITE(fd, 0, csd, sizeof(sgmnt_data), status); if (0 != status) { save_errno = errno; errptr = (char *)STRERROR(save_errno); util_out_print("write : !AZ", TRUE, errptr); util_out_print("Error writing header of file", TRUE); util_out_print("Database file !AD not changed: ", TRUE, fn_len, fn); rts_error(VARLSTCNT(4) ERR_DBRDERR, 2, fn_len, fn); } close(fd); /* --------------------- report results ------------------------- */ if (glbl_buff_status) util_out_print("Database file !AD now has !UL global buffers", TRUE, fn_len, fn, csd->n_bts); if (defer_status && (dba_mm == csd->acc_meth)) util_out_print("Database file !AD now has defer_time set to !SL", TRUE, fn_len, fn, csd->defer_time); if (rsrvd_bytes_status) util_out_print("Database file !AD now has !UL reserved bytes", TRUE, fn_len, fn, csd->reserved_bytes); if (extn_count_status) util_out_print("Database file !AD now has extension count !UL", 
TRUE, fn_len, fn, csd->extension_size); if (lock_space_status) util_out_print("Database file !AD now has lock space !UL pages", TRUE, fn_len, fn, csd->lock_space_size/OS_PAGELET_SIZE); if (disk_wait_status) util_out_print("Database file !AD now has wait disk set to !UL seconds", TRUE, fn_len, fn, csd->wait_disk_space); db_ipcs_reset(gv_cur_region, FALSE); } /* end of else part if (!need_standalone) */ mu_gv_cur_reg_free(); } free(csd); assert(!(exit_stat & EXIT_INF)); return (exit_stat & EXIT_ERR ? (int4)ERR_WCERRNOTCHG : (exit_stat & EXIT_WRN ? (int4)ERR_WCWRNNOTCHG : SS_NORMAL)); }
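/* Illustrative sketch (not part of this module): gds_rundown() below decides whether it is the last
 * user of a region by looking at the shared memory attach count (shmctl IPC_STAT, shm_nattch == 1) and
 * whether it is the last writer by reading the access-control counter semaphore (semctl GETVAL).  The
 * snippet shows those two system-call probes in isolation; the semaphore index and increment value are
 * assumptions for the example, not the DB_COUNTER_SEM/DB_COUNTER_SEM_INCR values used by the real code.
 */
#if 0	/* example only -- never compiled into this module */
#include <sys/ipc.h>
#include <sys/shm.h>
#include <sys/sem.h>

/* Non-zero if this process is the only one attached to the region's shared memory segment. */
static int
example_last_attached(int shmid)
{
	struct shmid_ds	buf;

	if (-1 == shmctl(shmid, IPC_STAT, &buf))
		return 0;	/* treat failure as "not last" so the caller stays conservative */
	return (1 == buf.shm_nattch);
}

/* Non-zero if the writer-counter semaphore shows exactly one outstanding writer increment. */
static int
example_last_writer(int semid, int counter_sem_index, int counter_incr)
{
	int	semval;

	semval = semctl(semid, counter_sem_index, GETVAL);
	if (-1 == semval)
		return 0;
	return (counter_incr == semval);
}
#endif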
int4 gds_rundown(void) { boolean_t canceled_dbsync_timer, canceled_flush_timer, ok_to_write_pfin; boolean_t have_standalone_access, ipc_deleted, err_caught; boolean_t is_cur_process_ss_initiator, remove_shm, vermismatch, we_are_last_user, we_are_last_writer, is_mm; boolean_t unsafe_last_writer; char time_str[CTIME_BEFORE_NL + 2]; /* for GET_CUR_TIME macro */ gd_region *reg; int save_errno, status, rc; int4 semval, ftok_semval, sopcnt, ftok_sopcnt; short crash_count; sm_long_t munmap_len; sgmnt_addrs *csa; sgmnt_data_ptr_t csd; node_local_ptr_t cnl; struct shmid_ds shm_buf; struct sembuf sop[2], ftok_sop[2]; uint4 jnl_status; unix_db_info *udi; jnl_private_control *jpc; jnl_buffer_ptr_t jbp; shm_snapshot_t *ss_shm_ptr; uint4 ss_pid, onln_rlbk_pid, holder_pid; boolean_t was_crit; boolean_t safe_mode; /* Do not flush or take down shared memory. */ boolean_t bypassed_ftok = FALSE, bypassed_access = FALSE, may_bypass_ftok, inst_is_frozen, ftok_counter_halted, access_counter_halted; int secshrstat; intrpt_state_t prev_intrpt_state; DCL_THREADGBL_ACCESS; SETUP_THREADGBL_ACCESS; jnl_status = 0; reg = gv_cur_region; /* Local copy */ /* early out for cluster regions * to avoid tripping the assert below. * Note: * This early out is consistent with VMS. It has been * noted that all of the gtcm assignments * to gv_cur_region should use the TP_CHANGE_REG * macro. This would also avoid the assert problem * and should be done eventually. */ if (dba_cm == reg->dyn.addr->acc_meth) return EXIT_NRM; udi = FILE_INFO(reg); csa = &udi->s_addrs; csd = csa->hdr; assert(csa == cs_addrs && csd == cs_data); if ((reg->open) && (dba_usr == csd->acc_meth)) { change_reg(); gvusr_rundown(); return EXIT_NRM; } /* If the process has standalone access, it has udi->grabbed_access_sem set to TRUE at this point. Note that down in a local * variable as the udi->grabbed_access_sem is set to TRUE even for non-standalone access below and hence we can't rely on * that later to determine if the process had standalone access or not when it entered this function. We need to guarantee * that none else access database file header when semid/shmid fields are reset. We already have created ftok semaphore in * db_init or, mu_rndwn_file and did not remove it. So just lock it. We do it in blocking mode. */ have_standalone_access = udi->grabbed_access_sem; /* process holds standalone access */ DEFER_INTERRUPTS(INTRPT_IN_GDS_RUNDOWN, prev_intrpt_state); ESTABLISH_NORET(gds_rundown_ch, err_caught); if (err_caught) { REVERT; WITH_CH(gds_rundown_ch, gds_rundown_err_cleanup(have_standalone_access), 0); ENABLE_INTERRUPTS(INTRPT_IN_GDS_RUNDOWN, prev_intrpt_state); DEBUG_ONLY(ok_to_UNWIND_in_exit_handling = FALSE); return EXIT_ERR; } assert(reg->open); /* if we failed to open, dbinit_ch should have taken care of proper clean up */ assert(!reg->opening); /* see comment above */ assert((dba_bg == csd->acc_meth) || (dba_mm == csd->acc_meth)); is_mm = (dba_bg != csd->acc_meth); assert(!csa->hold_onto_crit || (csa->now_crit && jgbl.onlnrlbk)); /* If we are online rollback, we should already be holding crit and should release it only at the end of this module. This * is usually done by noting down csa->now_crit in a local variable (was_crit) and using it whenever we are about to * grab_crit. But, there are instances (like mupip_set_journal.c) where we grab_crit but invoke gds_rundown without any * preceeding rel_crit. Such code relies on the fact that gds_rundown does rel_crit unconditionally (to get locks to a known * state). 
So, augment csa->now_crit with jgbl.onlnrlbk to track if we can rel_crit unconditionally or not in gds_rundown. */ was_crit = (csa->now_crit && jgbl.onlnrlbk); /* Cancel any pending flush timer for this region by this task */ canceled_flush_timer = FALSE; canceled_dbsync_timer = FALSE; CANCEL_DB_TIMERS(reg, csa, canceled_flush_timer, canceled_dbsync_timer); we_are_last_user = FALSE; inst_is_frozen = IS_REPL_INST_FROZEN && REPL_ALLOWED(csa->hdr); if (!csa->persistent_freeze) region_freeze(reg, FALSE, FALSE, FALSE); if (!was_crit) { rel_crit(reg); /* get locks to known state */ mutex_cleanup(reg); } /* The only process that can invoke gds_rundown while holding access control semaphore is RECOVER/ROLLBACK. All the others * (like MUPIP SET -FILE/MUPIP EXTEND would have invoked db_ipcs_reset() before invoking gds_rundown (from * mupip_exit_handler). The only exception is when these processes encounter a terminate signal and they reach * mupip_exit_handler while holding access control semaphore. Assert accordingly. */ assert(!have_standalone_access || mupip_jnl_recover || process_exiting); /* If we have standalone access, then ensure that a concurrent online rollback cannot be running at the same time as it * needs the access control lock as well. The only expection is we are online rollback and currently running down. */ cnl = csa->nl; onln_rlbk_pid = cnl->onln_rlbk_pid; assert(!have_standalone_access || mupip_jnl_recover || !onln_rlbk_pid || !is_proc_alive(onln_rlbk_pid, 0)); if (!have_standalone_access) { if (-1 == (ftok_semval = semctl(udi->ftok_semid, DB_COUNTER_SEM, GETVAL))) /* Check # of procs counted on FTOK */ { save_errno = errno; assert(FALSE); rts_error_csa(CSA_ARG(csa) VARLSTCNT(12) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg), ERR_SYSCALL, 5, RTS_ERROR_TEXT("gds_rundown SEMCTL failed to get ftok_semval"), CALLFROM, errno); } may_bypass_ftok = CAN_BYPASS(ftok_semval, csd, inst_is_frozen); /* Do we need a blocking wait? */ /* We need to guarantee that no one else access database file header when semid/shmid fields are reset. * We already have created ftok semaphore in db_init or mu_rndwn_file and did not remove it. So just lock it. */ if (!ftok_sem_lock(reg, may_bypass_ftok)) { if (may_bypass_ftok) { /* We did a non-blocking wait. It's ok to proceed without locking */ bypassed_ftok = TRUE; holder_pid = semctl(udi->ftok_semid, DB_CONTROL_SEM, GETPID); if ((uint4)-1 == holder_pid) rts_error_csa(CSA_ARG(csa) VARLSTCNT(12) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg), ERR_SYSCALL, 5, RTS_ERROR_TEXT("gds_rundown SEMCTL failed to get holder_pid"), CALLFROM, errno); if (!IS_GTM_IMAGE) /* MUMPS processes should not flood syslog with bypass messages. */ { send_msg_csa(CSA_ARG(csa) VARLSTCNT(12) ERR_RESRCINTRLCKBYPAS, 10, LEN_AND_STR(gtmImageNames[image_type].imageName), process_id, LEN_AND_LIT("FTOK"), REG_LEN_STR(reg), DB_LEN_STR(reg), holder_pid); send_msg_csa(CSA_ARG(NULL) VARLSTCNT(4) ERR_TEXT, 2, LEN_AND_LIT("FTOK bypassed at rundown")); } } else { /* We did a blocking wait but something bad happened. 
*/ FTOK_TRACE(csa, csa->ti->curr_tn, ftok_ops_lock, process_id); rts_error_csa(CSA_ARG(csa) VARLSTCNT(4) ERR_DBFILERR, 2, DB_LEN_STR(reg)); } } sop[0].sem_num = DB_CONTROL_SEM; sop[0].sem_op = 0; /* Wait for 0 */ sop[1].sem_num = DB_CONTROL_SEM; sop[1].sem_op = 1; /* Lock */ sopcnt = 2; sop[0].sem_flg = sop[1].sem_flg = SEM_UNDO | IPC_NOWAIT; /* Don't wait the first time thru */ SEMOP(udi->semid, sop, sopcnt, status, NO_WAIT); if (0 != status) { save_errno = errno; /* Check # of processes counted on access sem. */ if (-1 == (semval = semctl(udi->semid, DB_COUNTER_SEM, GETVAL))) { assert(FALSE); rts_error_csa(CSA_ARG(csa) VARLSTCNT(12) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg), ERR_SYSCALL, 5, RTS_ERROR_TEXT("gds_rundown SEMCTL failed to get semval"), CALLFROM, errno); } bypassed_access = CAN_BYPASS(semval, csd, inst_is_frozen) || onln_rlbk_pid || csd->file_corrupt; /* Before attempting again in the blocking mode, see if the holding process is an online rollback. * If so, it is likely we won't get the access control semaphore anytime soon. In that case, we * are better off skipping rundown and continuing with sanity cleanup and exit. */ holder_pid = semctl(udi->semid, DB_CONTROL_SEM, GETPID); if ((uint4)-1 == holder_pid) rts_error_csa(CSA_ARG(csa) VARLSTCNT(12) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg), ERR_SYSCALL, 5, RTS_ERROR_TEXT("gds_rundown SEMCTL failed to get holder_pid"), CALLFROM, errno); if (!bypassed_access) { /* We couldn't get it in one shot-- see if we already have it */ if (holder_pid == process_id) { send_msg_csa(CSA_ARG(csa) VARLSTCNT(5) MAKE_MSG_INFO(ERR_CRITSEMFAIL), 2, DB_LEN_STR(reg), ERR_RNDWNSEMFAIL); REVERT; ENABLE_INTERRUPTS(INTRPT_IN_GDS_RUNDOWN, prev_intrpt_state); assert(FALSE); return EXIT_ERR; } if (EAGAIN != save_errno) { assert(FALSE); rts_error_csa(CSA_ARG(csa) VARLSTCNT(12) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg), ERR_SYSCALL, 5, RTS_ERROR_TEXT("gds_rundown SEMOP on access control semaphore"), CALLFROM, save_errno); } sop[0].sem_flg = sop[1].sem_flg = SEM_UNDO; /* Try again - blocking this time */ SEMOP(udi->semid, sop, 2, status, FORCED_WAIT); if (-1 == status) /* We couldn't get it at all.. */ rts_error_csa(CSA_ARG(csa) VARLSTCNT(12) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg), ERR_SYSCALL, 5, RTS_ERROR_TEXT("gds_rundown SEMOP on access control semaphore"), CALLFROM, errno); } else if (!IS_GTM_IMAGE) { send_msg_csa(CSA_ARG(csa) VARLSTCNT(12) ERR_RESRCINTRLCKBYPAS, 10, LEN_AND_STR(gtmImageNames[image_type].imageName), process_id, LEN_AND_LIT("access control"), REG_LEN_STR(reg), DB_LEN_STR(reg), holder_pid); send_msg_csa(CSA_ARG(NULL) VARLSTCNT(4) ERR_TEXT, 2, LEN_AND_LIT("Access control bypassed at rundown")); } udi->grabbed_access_sem = !bypassed_access; } } /* else we we hold the access control semaphore and therefore have standalone access. We do not release it now - we * release it later in mupip_exit_handler.c. Since we already hold the access control semaphore, we don't need the * ftok semaphore and trying it could cause deadlock */ /* Note that in the case of online rollback, "udi->grabbed_access_sem" (and in turn "have_standalone_access") is TRUE. * But there could be other processes still having the database open so we cannot safely reset the halted fields. */ if (have_standalone_access && !jgbl.onlnrlbk) csd->ftok_counter_halted = csd->access_counter_halted = FALSE; ftok_counter_halted = csd->ftok_counter_halted; access_counter_halted = csd->access_counter_halted; /* If we bypassed any of the semaphores, activate safe mode. 
* Also, if the replication instance is frozen and this db has replication turned on (which means * no flushes of dirty buffers to this db can happen while the instance is frozen) activate safe mode. */ ok_to_write_pfin = !(bypassed_access || bypassed_ftok || inst_is_frozen); safe_mode = !ok_to_write_pfin || ftok_counter_halted || access_counter_halted; /* At this point we are guaranteed no one else is doing a db_init/rundown as we hold the access control semaphore */ assert(csa->ref_cnt); /* decrement private ref_cnt before shared ref_cnt decrement. */ csa->ref_cnt--; /* Currently journaling logic in gds_rundown() in VMS relies on this order to detect last writer */ assert(!csa->ref_cnt); --cnl->ref_cnt; if (memcmp(cnl->now_running, gtm_release_name, gtm_release_name_len + 1)) { /* VERMISMATCH condition. Possible only if DSE */ assert(dse_running); vermismatch = TRUE; } else vermismatch = FALSE; if (-1 == shmctl(udi->shmid, IPC_STAT, &shm_buf)) { save_errno = errno; rts_error_csa(CSA_ARG(csa) VARLSTCNT(12) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg), ERR_SYSCALL, 5, RTS_ERROR_TEXT("gds_rundown shmctl"), CALLFROM, save_errno); } else we_are_last_user = (1 == shm_buf.shm_nattch) && !vermismatch && !safe_mode; /* recover => one user except ONLINE ROLLBACK, or standalone with frozen instance */ assert(!have_standalone_access || we_are_last_user || jgbl.onlnrlbk || inst_is_frozen); if (-1 == (semval = semctl(udi->semid, DB_COUNTER_SEM, GETVAL))) rts_error_csa(CSA_ARG(csa) VARLSTCNT(12) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg), ERR_SYSCALL, 5, RTS_ERROR_TEXT("gds_rundown SEMCTL failed to get semval"), CALLFROM, errno); /* There's one writer left and I am it */ assert(reg->read_only || semval >= 0); unsafe_last_writer = (DB_COUNTER_SEM_INCR == semval) && (FALSE == reg->read_only) && !vermismatch; we_are_last_writer = unsafe_last_writer && !safe_mode; assert(!we_are_last_writer || !safe_mode); assert(!we_are_last_user || !safe_mode); /* recover + R/W region => one writer except ONLINE ROLLBACK, or standalone with frozen instance, leading to safe_mode */ assert(!(have_standalone_access && !reg->read_only) || we_are_last_writer || jgbl.onlnrlbk || inst_is_frozen); GTM_WHITE_BOX_TEST(WBTEST_ANTIFREEZE_JNLCLOSE, we_are_last_writer, 1); /* Assume we are the last writer to invoke wcs_flu */ if (!have_standalone_access && (-1 == (ftok_semval = semctl(udi->ftok_semid, DB_COUNTER_SEM, GETVAL)))) rts_error_csa(CSA_ARG(csa) VARLSTCNT(12) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg), ERR_SYSCALL, 5, RTS_ERROR_TEXT("gds_rundown SEMCTL failed to get ftok_semval"), CALLFROM, errno); if (NULL != csa->ss_ctx) ss_destroy_context(csa->ss_ctx); /* SS_MULTI: If multiple snapshots are supported, then we have to run through each of the snapshots */ assert(1 == MAX_SNAPSHOTS); ss_shm_ptr = (shm_snapshot_ptr_t)SS_GETSTARTPTR(csa); ss_pid = ss_shm_ptr->ss_info.ss_pid; is_cur_process_ss_initiator = (process_id == ss_pid); if (ss_pid && (is_cur_process_ss_initiator || we_are_last_user)) { /* Try getting snapshot crit latch. If we don't get latch, we won't hang for eternity and will skip * doing the orphaned snapshot cleanup. It will be cleaned up eventually either by subsequent MUPIP * INTEG or by a MUPIP RUNDOWN. 
*/ if (ss_get_lock_nowait(reg) && (ss_pid == ss_shm_ptr->ss_info.ss_pid) && (is_cur_process_ss_initiator || !is_proc_alive(ss_pid, 0))) { ss_release(NULL); ss_release_lock(reg); } } /* If cnl->donotflush_dbjnl is set, it means mupip recover/rollback was interrupted and therefore we need not flush * shared memory contents to disk as they might be in an inconsistent state. Moreover, any more flushing will only cause * future rollback to undo more journal records (PBLKs). In this case, we will go ahead and remove shared memory (without * flushing the contents) in this routine. A reissue of the recover/rollback command will restore the database to a * consistent state. */ if (!cnl->donotflush_dbjnl && !reg->read_only && !vermismatch) { /* If we had an orphaned block and were interrupted, set wc_blocked so we can invoke wcs_recover. Do it ONLY * if there is NO concurrent online rollback running (as we need crit to set wc_blocked) */ if (csa->wbuf_dqd && !is_mm) { /* If we had an orphaned block and were interrupted, mupip_exit_handler will invoke secshr_db_clnup which * will clear this field and so we should never come to gds_rundown with a non-zero wbuf_dqd. The only * exception is if we are recover/rollback in which case gds_rundown (from mur_close_files) is invoked * BEFORE secshr_db_clnup in mur_close_files. * Note: It is NOT possible for online rollback to reach here with wbuf_dqd being non-zero. This is because * the moment we apply the first PBLK, we stop all interrupts and hence can never be interrupted in * wcs_wtstart or wcs_get_space. Assert accordingly. */ assert(mupip_jnl_recover && !jgbl.onlnrlbk && !safe_mode); if (!was_crit) grab_crit(reg); SET_TRACEABLE_VAR(cnl->wc_blocked, TRUE); BG_TRACE_PRO_ANY(csa, wcb_gds_rundown); send_msg_csa(CSA_ARG(csa) VARLSTCNT(8) ERR_WCBLOCKED, 6, LEN_AND_LIT("wcb_gds_rundown"), process_id, &csa->ti->curr_tn, DB_LEN_STR(reg)); csa->wbuf_dqd = 0; wcs_recover(reg); BG_TRACE_PRO_ANY(csa, lost_block_recovery); if (!was_crit) rel_crit(reg); } if (JNL_ENABLED(csd) && IS_GTCM_GNP_SERVER_IMAGE) originator_prc_vec = NULL; /* If we are the last writing user, then everything must be flushed */ if (we_are_last_writer) { /* Time to flush out all of our buffers */ assert(!safe_mode); if (is_mm) { MM_DBFILEXT_REMAP_IF_NEEDED(csa, reg); cnl->remove_shm = TRUE; } if (cnl->wc_blocked && jgbl.onlnrlbk) { /* if the last update done by online rollback was not committed in the normal code-path but was * completed by secshr_db_clnup, wc_blocked will be set to TRUE. But, since online rollback never * invokes grab_crit (since csa->hold_onto_crit is set to TRUE), wcs_recover is never invoked. This * could result in the last update never getting flushed to the disk and if online rollback happened * to be the last writer then the shared memory will be flushed and removed and the last update will * be lost. So, force wcs_recover if we find ourselves in such a situation. But, wc_blocked is * possible only if phase1 or phase2 errors are induced using white box test cases */ assert(WB_COMMIT_ERR_ENABLED); wcs_recover(reg); } /* Note WCSFLU_SYNC_EPOCH ensures the epoch is synced to the journal and indirectly * also ensures that the db is fsynced. We don't want to use it in the calls to * wcs_flu() from t_end() and tp_tend() since we can defer it to out-of-crit there. * In this case, since we are running down, we don't have any such option. 
*/ cnl->remove_shm = wcs_flu(WCSFLU_FLUSH_HDR | WCSFLU_WRITE_EPOCH | WCSFLU_SYNC_EPOCH); /* Since we_are_last_writer, we should be guaranteed that wcs_flu() did not change csd, (in * case of MM for potential file extension), even if it did a grab_crit(). Therefore, make * sure that's true. */ assert(csd == csa->hdr); assert(0 == memcmp(csd->label, GDS_LABEL, GDS_LABEL_SZ - 1)); } else if (((canceled_flush_timer && (0 > cnl->wcs_timers)) || canceled_dbsync_timer) && !inst_is_frozen) { /* canceled pending db or jnl flush timers - flush database and journal buffers to disk */ if (!was_crit) grab_crit(reg); /* we need to sync the epoch as the fact that there is no active pending flush timer implies * there will be noone else who will flush the dirty buffers and EPOCH to disk in a timely fashion */ wcs_flu(WCSFLU_FLUSH_HDR | WCSFLU_WRITE_EPOCH | WCSFLU_SYNC_EPOCH); if (!was_crit) rel_crit(reg); assert((dba_mm == cs_data->acc_meth) || (csd == cs_data)); csd = cs_data; /* In case this is MM and wcs_flu() remapped an extended database, reset csd */ } /* Do rundown journal processing after buffer flushes since they require jnl to be open */ if (JNL_ENABLED(csd)) { /* the following tp_change_reg() is not needed due to the assert csa == cs_addrs at the beginning * of gds_rundown(), but just to be safe. To be removed by 2002!! --- nars -- 2001/04/25. */ tp_change_reg(); /* call this because jnl_ensure_open checks cs_addrs rather than gv_cur_region */ jpc = csa->jnl; jbp = jpc->jnl_buff; if (jbp->fsync_in_prog_latch.u.parts.latch_pid == process_id) { assert(FALSE); COMPSWAP_UNLOCK(&jbp->fsync_in_prog_latch, process_id, 0, LOCK_AVAILABLE, 0); } if (jbp->io_in_prog_latch.u.parts.latch_pid == process_id) { assert(FALSE); COMPSWAP_UNLOCK(&jbp->io_in_prog_latch, process_id, 0, LOCK_AVAILABLE, 0); } if ((((NOJNL != jpc->channel) && !JNL_FILE_SWITCHED(jpc)) || we_are_last_writer && (0 != cnl->jnl_file.u.inode)) && ok_to_write_pfin) { /* We need to close the journal file cleanly if we have the latest generation journal file open * or if we are the last writer and the journal file is open in shared memory (not necessarily * by ourselves e.g. the only process that opened the journal got shot abnormally) * Note: we should not infer anything from the shared memory value of cnl->jnl_file.u.inode * if we are not the last writer as it can be concurrently updated. */ if (!was_crit) grab_crit(reg); if (JNL_ENABLED(csd)) { SET_GBL_JREC_TIME; /* jnl_ensure_open/jnl_put_jrt_pini/pfin/jnl_file_close all need it */ /* Before writing to jnlfile, adjust jgbl.gbl_jrec_time if needed to maintain time order * of jnl records. This needs to be done BEFORE the jnl_ensure_open as that could write * journal records (if it decides to switch to a new journal file). */ ADJUST_GBL_JREC_TIME(jgbl, jbp); jnl_status = jnl_ensure_open(); if (0 == jnl_status) { /* If we_are_last_writer, we would have already done a wcs_flu() which would * have written an epoch record and we are guaranteed no further updates * since we are the last writer. So, just close the journal. * If the freeaddr == post_epoch_freeaddr, wcs_flu may have skipped writing * a pini, so allow for that. */ assert(!jbp->before_images || is_mm || !we_are_last_writer || (0 != jpc->pini_addr) || jgbl.mur_extract || (jpc->jnl_buff->freeaddr == jpc->jnl_buff->post_epoch_freeaddr)); /* If we haven't written a pini, let jnl_file_close write the pini/pfin. 
*/ if (!jgbl.mur_extract && (0 != jpc->pini_addr)) jnl_put_jrt_pfin(csa); /* If not the last writer and no pending flush timer left, do jnl flush now */ if (!we_are_last_writer && (0 > cnl->wcs_timers)) { if (SS_NORMAL == (jnl_status = jnl_flush(reg))) { assert(jbp->freeaddr == jbp->dskaddr); jnl_fsync(reg, jbp->dskaddr); assert(jbp->fsync_dskaddr == jbp->dskaddr); } else { send_msg_csa(CSA_ARG(csa) VARLSTCNT(9) ERR_JNLFLUSH, 2, JNL_LEN_STR(csd), ERR_TEXT, 2, RTS_ERROR_TEXT("Error with journal flush in gds_rundown"), jnl_status); assert(NOJNL == jpc->channel);/* jnl file lost has been triggered */ /* In this routine, all code that follows from here on does not * assume anything about the journaling characteristics of this * database so it is safe to continue execution even though * journaling got closed in the middle. */ } } jnl_file_close(reg, we_are_last_writer, FALSE); } else send_msg_csa(CSA_ARG(csa) VARLSTCNT(6) jnl_status, 4, JNL_LEN_STR(csd), DB_LEN_STR(reg)); } if (!was_crit) rel_crit(reg); } } if (we_are_last_writer) /* Flush the fileheader last and harden the file to disk */ { if (!was_crit) grab_crit(reg); /* To satisfy crit requirement in fileheader_sync() */ memset(csd->machine_name, 0, MAX_MCNAMELEN); /* clear the machine_name field */ if (!have_standalone_access && we_are_last_user) { /* mupip_exit_handler will do this after mur_close_file */ csd->semid = INVALID_SEMID; csd->shmid = INVALID_SHMID; csd->gt_sem_ctime.ctime = 0; csd->gt_shm_ctime.ctime = 0; } fileheader_sync(reg); if (!was_crit) rel_crit(reg); if (!is_mm) { GTM_DB_FSYNC(csa, udi->fd, rc); /* Sync it all */ if (-1 == rc) { rts_error_csa(CSA_ARG(csa) VARLSTCNT(9) ERR_DBFILERR, 2, DB_LEN_STR(reg), ERR_TEXT, 2, RTS_ERROR_TEXT("Error during file sync at close"), errno); } } else { /* Now do final MM file sync before exit */ assert(csa->ti->total_blks == csa->total_blks); #ifdef _AIX GTM_DB_FSYNC(csa, udi->fd, rc); if (-1 == rc) #else if (-1 == MSYNC((caddr_t)csa->db_addrs[0], (caddr_t)csa->db_addrs[1])) #endif { rts_error_csa(CSA_ARG(csa) VARLSTCNT(9) ERR_DBFILERR, 2, DB_LEN_STR(reg), ERR_TEXT, 2, RTS_ERROR_TEXT("Error during file sync at close"), errno); } } } else if (unsafe_last_writer && !cnl->lastwriterbypas_msg_issued) { send_msg_csa(CSA_ARG(csa) VARLSTCNT(4) ERR_LASTWRITERBYPAS, 2, DB_LEN_STR(reg)); cnl->lastwriterbypas_msg_issued = TRUE; } } /* end if (!reg->read_only && !cnl->donotflush_dbjnl) */ /* We had canceled all db timers at start of rundown. In case as part of rundown (wcs_flu above), we had started * any timers, cancel them BEFORE setting reg->open to FALSE (assert in wcs_clean_dbsync relies on this). */ CANCEL_DB_TIMERS(reg, csa, canceled_flush_timer, canceled_dbsync_timer); if (reg->read_only && we_are_last_user && !have_standalone_access && cnl->remove_shm) { /* mupip_exit_handler will do this after mur_close_file */ db_ipcs.semid = INVALID_SEMID; db_ipcs.shmid = INVALID_SHMID; db_ipcs.gt_sem_ctime = 0; db_ipcs.gt_shm_ctime = 0; db_ipcs.fn_len = reg->dyn.addr->fname_len; memcpy(db_ipcs.fn, reg->dyn.addr->fname, reg->dyn.addr->fname_len); db_ipcs.fn[reg->dyn.addr->fname_len] = 0; /* request gtmsecshr to flush. 
read_only cannot flush itself */ WAIT_FOR_REPL_INST_UNFREEZE_SAFE(csa); if (!csa->read_only_fs) { secshrstat = send_mesg2gtmsecshr(FLUSH_DB_IPCS_INFO, 0, (char *)NULL, 0); if (0 != secshrstat) rts_error_csa(CSA_ARG(csa) VARLSTCNT(8) ERR_DBFILERR, 2, DB_LEN_STR(reg), ERR_TEXT, 2, RTS_ERROR_TEXT("gtmsecshr failed to update database file header")); } } /* Done with file now, close it */ CLOSEFILE_RESET(udi->fd, rc); /* resets "udi->fd" to FD_INVALID */ if (-1 == rc) { rts_error_csa(CSA_ARG(csa) VARLSTCNT(9) ERR_DBFILERR, 2, DB_LEN_STR(reg), ERR_TEXT, 2, LEN_AND_LIT("Error during file close"), errno); } /* Unmap storage if mm mode but only the part that is not the fileheader (so shows up in dumps) */ # if !defined(_AIX) if (is_mm && (NULL != csa->db_addrs[0])) { assert(csa->db_addrs[1] > csa->db_addrs[0]); munmap_len = (sm_long_t)(csa->db_addrs[1] - csa->db_addrs[0]); if (0 < munmap_len) munmap((caddr_t)(csa->db_addrs[0]), (size_t)(munmap_len)); } # endif /* Detach our shared memory while still under lock so reference counts will be correct for the next process to run down * this region. In the process also get the remove_shm status from node_local before detaching. * If cnl->donotflush_dbjnl is TRUE, it means we can safely remove shared memory without compromising data * integrity as a reissue of recover will restore the database to a consistent state. */ remove_shm = !vermismatch && (cnl->remove_shm || cnl->donotflush_dbjnl); /* We are done with online rollback on this region. Indicate to other processes by setting the onln_rlbk_pid to 0. * Do it before releasing crit (t_end relies on this ordering when accessing cnl->onln_rlbk_pid). */ if (jgbl.onlnrlbk) cnl->onln_rlbk_pid = 0; rel_crit(reg); /* Since we are about to detach from the shared memory, release crit and reset onln_rlbk_pid */ /* If we had skipped flushing journal and database buffers due to a concurrent online rollback, increment the counter * indicating that in the shared memory so that online rollback can report the # of such processes when it shuts down. * The same thing is done for both FTOK and access control semaphores when there are too many MUMPS processes. */ if (safe_mode) /* indicates flushing was skipped */ { if (bypassed_access) cnl->dbrndwn_access_skip++; /* Access semaphore can be bypassed during online rollback */ if (bypassed_ftok) cnl->dbrndwn_ftok_skip++; } if (jgbl.onlnrlbk) csa->hold_onto_crit = FALSE; GTM_WHITE_BOX_TEST(WBTEST_HOLD_SEM_BYPASS, cnl->wbox_test_seq_num, 0); status = shmdt((caddr_t)cnl); csa->nl = NULL; /* dereferencing nl after detach is not right, so we set it to NULL so that we can test before dereference*/ /* Note that although csa->nl is NULL, we use CSA_ARG(csa) below (not CSA_ARG(NULL)) to be consistent with similar * usages before csa->nl became NULL. The "is_anticipatory_freeze_needed" function (which is in turn called by the * CHECK_IF_FREEZE_ON_ERROR_NEEDED macro) does a check of csa->nl before dereferencing shared memory contents so * we are safe passing "csa". 
*/ if (-1 == status) send_msg_csa(CSA_ARG(csa) VARLSTCNT(9) ERR_DBFILERR, 2, DB_LEN_STR(reg), ERR_TEXT, 2, LEN_AND_LIT("Error during shmdt"), errno); REMOVE_CSA_FROM_CSADDRSLIST(csa); /* remove "csa" from list of open regions (cs_addrs_list) */ reg->open = FALSE; /* If file is still not in good shape, die here and now before we get rid of our storage */ assertpro(0 == csa->wbuf_dqd); ipc_deleted = FALSE; /* If we are the very last user, remove shared storage id and the semaphores */ if (we_are_last_user) { /* remove shared storage, only if last writer to rundown did a successful wcs_flu() */ assert(!vermismatch); if (remove_shm) { ipc_deleted = TRUE; if (0 != shm_rmid(udi->shmid)) rts_error_csa(CSA_ARG(csa) VARLSTCNT(8) ERR_DBFILERR, 2, DB_LEN_STR(reg), ERR_TEXT, 2, RTS_ERROR_TEXT("Unable to remove shared memory")); /* Note that we no longer have a new shared memory. Currently only used/usable for standalone rollback. */ udi->new_shm = FALSE; /* mupip recover/rollback don't release the semaphore here, but do it later in db_ipcs_reset (invoked from * mur_close_files()) */ if (!have_standalone_access) { if (0 != sem_rmid(udi->semid)) rts_error_csa(CSA_ARG(csa) VARLSTCNT(8) ERR_DBFILERR, 2, DB_LEN_STR(reg), ERR_TEXT, 2, RTS_ERROR_TEXT("Unable to remove semaphore")); udi->new_sem = FALSE; /* Note that we no longer have a new semaphore */ udi->grabbed_access_sem = FALSE; udi->counter_acc_incremented = FALSE; } } else if (is_src_server || is_updproc) { gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(6) ERR_DBRNDWNWRN, 4, DB_LEN_STR(reg), process_id, process_id); send_msg_csa(CSA_ARG(csa) VARLSTCNT(6) ERR_DBRNDWNWRN, 4, DB_LEN_STR(reg), process_id, process_id); } else send_msg_csa(CSA_ARG(csa) VARLSTCNT(6) ERR_DBRNDWNWRN, 4, DB_LEN_STR(reg), process_id, process_id); } else { assert(!have_standalone_access || jgbl.onlnrlbk || safe_mode); if (!jgbl.onlnrlbk && !have_standalone_access) { /* If we were writing, get rid of our writer access count semaphore */ if (!reg->read_only) { if (!access_counter_halted) { save_errno = do_semop(udi->semid, DB_COUNTER_SEM, -DB_COUNTER_SEM_INCR, SEM_UNDO); if (0 != save_errno) rts_error_csa(CSA_ARG(csa) VARLSTCNT(12) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg), ERR_SYSCALL, 5, RTS_ERROR_TEXT("gds_rundown access control semaphore decrement"), CALLFROM, save_errno); } udi->counter_acc_incremented = FALSE; } assert(safe_mode || !bypassed_access); /* Now remove the rundown lock */ if (!bypassed_access) { if (0 != (save_errno = do_semop(udi->semid, DB_CONTROL_SEM, -1, SEM_UNDO))) rts_error_csa(CSA_ARG(csa) VARLSTCNT(12) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg), ERR_SYSCALL, 5, RTS_ERROR_TEXT("gds_rundown access control semaphore release"), CALLFROM, save_errno); udi->grabbed_access_sem = FALSE; } } /* else access control semaphore will be released in db_ipcs_reset */ } if (!have_standalone_access) { if (bypassed_ftok) { if (!ftok_counter_halted) if (0 != (save_errno = do_semop(udi->ftok_semid, DB_COUNTER_SEM, -DB_COUNTER_SEM_INCR, SEM_UNDO))) rts_error_csa(CSA_ARG(csa) VARLSTCNT(4) ERR_DBFILERR, 2, DB_LEN_STR(reg)); } else if (!ftok_sem_release(reg, !ftok_counter_halted, FALSE)) { FTOK_TRACE(csa, csa->ti->curr_tn, ftok_ops_release, process_id); rts_error_csa(CSA_ARG(csa) VARLSTCNT(4) ERR_DBFILERR, 2, DB_LEN_STR(reg)); } udi->grabbed_ftok_sem = FALSE; udi->counter_ftok_incremented = FALSE; } ENABLE_INTERRUPTS(INTRPT_IN_GDS_RUNDOWN, prev_intrpt_state); if (!ipc_deleted) { GET_CUR_TIME(time_str); if (is_src_server) gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(8) ERR_IPCNOTDEL, 6, CTIME_BEFORE_NL, 
time_str, LEN_AND_LIT("Source server"), REG_LEN_STR(reg)); if (is_updproc) gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(8) ERR_IPCNOTDEL, 6, CTIME_BEFORE_NL, time_str, LEN_AND_LIT("Update process"), REG_LEN_STR(reg)); if (mupip_jnl_recover && (!jgbl.onlnrlbk || !we_are_last_user)) { gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(8) ERR_IPCNOTDEL, 6, CTIME_BEFORE_NL, time_str, LEN_AND_LIT("Mupip journal process"), REG_LEN_STR(reg)); send_msg_csa(CSA_ARG(csa) VARLSTCNT(8) ERR_IPCNOTDEL, 6, CTIME_BEFORE_NL, time_str, LEN_AND_LIT("Mupip journal process"), REG_LEN_STR(reg)); } } REVERT; return EXIT_NRM; }
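/* Illustrative sketch, not part of the original source: the "harden the file to disk" step above issues
 * GTM_DB_FSYNC for BG regions (and for MM on AIX) and MSYNC of the mapped address range otherwise. The
 * hypothetical helper below shows only the underlying POSIX calls for those two cases; the function name
 * and arguments are assumptions for illustration, and error handling is reduced to the -1/errno convention.
 */
#include <sys/mman.h>
#include <unistd.h>

static int harden_db_file(int fd, int is_mm, char *map_start, char *map_end)
{
	if (!is_mm)
		return fsync(fd);	/* BG: flush everything written through the file descriptor */
	/* MM: flush dirty pages of the mapped region; MS_SYNC waits for the writes to complete */
	return msync((void *)map_start, (size_t)(map_end - map_start), MS_SYNC);
}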
void gds_rundown(void) { bool is_mm, we_are_last_user, we_are_last_writer; boolean_t ipc_deleted, remove_shm, cancelled_timer, cancelled_dbsync_timer, vermismatch; now_t now; /* for GET_CUR_TIME macro */ char *time_ptr, time_str[CTIME_BEFORE_NL + 2]; /* for GET_CUR_TIME macro */ gd_region *reg; int save_errno, status; int4 semval, ftok_semval, sopcnt, ftok_sopcnt; short crash_count; sm_long_t munmap_len; sgmnt_addrs *csa; sgmnt_data_ptr_t csd; struct shmid_ds shm_buf; struct sembuf sop[2], ftok_sop[2]; uint4 jnl_status; unix_db_info *udi; jnl_private_control *jpc; jnl_buffer_ptr_t jbp; error_def(ERR_CRITSEMFAIL); error_def(ERR_DBCCERR); error_def(ERR_DBFILERR); error_def(ERR_DBRNDWNWRN); error_def(ERR_ERRCALL); error_def(ERR_GBLOFLOW); error_def(ERR_GTMASSERT); error_def(ERR_IPCNOTDEL); error_def(ERR_JNLFLUSH); error_def(ERR_RNDWNSEMFAIL); error_def(ERR_TEXT); error_def(ERR_WCBLOCKED); forced_exit = FALSE; /* Okay, we're dying already -- let rel_crit live in peace now. * If coming through a DAL, not necessarily dying. what to do then? -- nars -- 8/15/2001 */ grabbed_access_sem = FALSE; jnl_status = 0; reg = gv_cur_region; /* Local copy */ /* * early out for cluster regions * to avoid tripping the assert below. * Note: * This early out is consistent with VMS. It has been * noted that all of the gtcm assignments * to gv_cur_region should use the TP_CHANGE_REG * macro. This would also avoid the assert problem * and should be done eventually. */ if (dba_cm == reg->dyn.addr->acc_meth) return; udi = FILE_INFO(reg); csa = &udi->s_addrs; csd = csa->hdr; assert(csa == cs_addrs && csd == cs_data); if ((reg->open) && (dba_usr == csd->acc_meth)) { change_reg(); gvusr_rundown(); return; } ESTABLISH(gds_rundown_ch); if (!reg->open) /* Not open, no point to rundown */ { if (reg->opening) /* Died partway open, kill rest of way */ { rel_crit(reg); mutex_cleanup(reg); /* revist this to handle MM properly SMW 98/12/16 if (NULL != csa->nl) { status = shmdt((caddr_t)csa->nl); if (-1 == status) send_msg(VARLSTCNT(9) ERR_DBFILERR, 2, DB_LEN_STR(reg), ERR_TEXT, 2, LEN_AND_LIT("Error during shmdt"), errno); } */ shmdt((caddr_t)csa->nl); csa->nl = NULL; } REVERT; return; } switch(csd->acc_meth) { /* Pass mm and bg through */ case dba_bg: is_mm = FALSE; break; case dba_mm: is_mm = TRUE; break; case dba_usr: assert(FALSE); default: REVERT; return; } /* Cancel any pending flush timer for this region by this task */ CANCEL_DB_TIMERS(reg, cancelled_timer, cancelled_dbsync_timer); we_are_last_user = FALSE; if (!csa->persistent_freeze) region_freeze(reg, FALSE, FALSE, FALSE); assert(!csa->read_lock); rel_crit(reg); /* get locks to known state */ mutex_cleanup(reg); /* * We need to guarantee that none else access database file header when semid/shmid fields are reset. * We already have created ftok semaphore in db_init or, mu_rndwn_file and did not remove it. * So just lock it. We do it in blocking mode. */ if (!ftok_sem_lock(reg, FALSE, FALSE)) rts_error(VARLSTCNT(4) ERR_DBFILERR, 2, DB_LEN_STR(reg)); /* * For mupip_jnl_recover we already have database access control semaphore. * We do not release it. We release it from mur_close_files. 
*/ if (!mupip_jnl_recover) { sop[0].sem_num = 0; sop[0].sem_op = 0; /* Wait for 0 */ sop[1].sem_num = 0; sop[1].sem_op = 1; /* Lock */ sopcnt = 2; sop[0].sem_flg = sop[1].sem_flg = SEM_UNDO | IPC_NOWAIT; /* Don't wait the first time thru */ SEMOP(udi->semid, sop, sopcnt, status); if (-1 == status) /* We couldn't get it in one shot -- see if we already have it */ { save_errno = errno; /* see comment about Linux specific difference in behaviour of semctl() with GETPID in gds_rundown_ch() */ if (semctl(udi->semid, 0, GETPID) == process_id) { send_msg(VARLSTCNT(5) MAKE_MSG_INFO(ERR_CRITSEMFAIL), 2, DB_LEN_STR(reg), ERR_RNDWNSEMFAIL); REVERT; return; /* Already in rundown for this region */ } if (EAGAIN != save_errno) { assert(FALSE); rts_error(VARLSTCNT(9) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg), ERR_TEXT, 2, RTS_ERROR_TEXT("gds_rundown first semop/semctl"), save_errno); } sop[0].sem_flg = sop[1].sem_flg = SEM_UNDO; /* Try again - blocking this time */ SEMOP(udi->semid, sop, 2, status); if (-1 == status) /* We couldn't get it at all.. */ rts_error(VARLSTCNT(5) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg), errno); } } grabbed_access_sem = TRUE; /* * We now have the dbinit/rundown lock, so we are alone in this code for this region * and nobody else can attach. * See if we are all alone in accessing this database shared memory. */ assert(csa->ref_cnt); /* decrement private ref_cnt before shared ref_cnt decrement. */ csa->ref_cnt--; /* Currently journaling logic in gds_rundown() in VMS relies on this order to detect last writer */ assert(!csa->ref_cnt); --csa->nl->ref_cnt; if (memcmp(csa->nl->now_running, gtm_release_name, gtm_release_name_len + 1)) { /* VERMISMATCH condition. Possible only if DSE */ assert(dse_running); vermismatch = TRUE; } else vermismatch = FALSE; if (-1 == shmctl(udi->shmid, IPC_STAT, &shm_buf)) { save_errno = errno; rts_error(VARLSTCNT(9) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg), ERR_TEXT, 2, RTS_ERROR_TEXT("gds_rundown shmctl"), save_errno); } else we_are_last_user = (1 == shm_buf.shm_nattch) && !vermismatch; assert(!mupip_jnl_recover || we_are_last_user); /* recover => one user */ if (-1 == (semval = semctl(udi->semid, 1, GETVAL))) rts_error(VARLSTCNT(5) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg), errno); we_are_last_writer = (1 == semval) && (FALSE == reg->read_only) && !vermismatch;/* There's one writer left and I am it */ assert(!(mupip_jnl_recover && !reg->read_only) || we_are_last_writer); /* recover + R/W region => one writer */ if (-1 == (ftok_semval = semctl(udi->ftok_semid, 1, GETVAL))) rts_error(VARLSTCNT(5) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg), errno); /* If csa->nl->donotflush_dbjnl is set, it means mupip recover/rollback was interrupted and therefore we should * not flush shared memory contents to disk as they might be in an inconsistent state. * In this case, we will go ahead and remove shared memory (without flushing the contents) in this routine. * A reissue of the recover/rollback command will restore the database to a consistent state. * Otherwise, if we have write access to this region, let us perform a few writing tasks. 
*/ if (csa->nl->donotflush_dbjnl) csa->wbuf_dqd = 0; /* ignore csa->wbuf_dqd status as we do not care about the cache contents */ else if (!reg->read_only && !vermismatch) { /* If we had an orphaned block and were interrupted, set wc_blocked so we can invoke wcs_recover */ if (csa->wbuf_dqd) { grab_crit(reg); SET_TRACEABLE_VAR(csd->wc_blocked, TRUE); BG_TRACE_PRO_ANY(csa, wcb_gds_rundown); send_msg(VARLSTCNT(8) ERR_WCBLOCKED, 6, LEN_AND_LIT("wcb_gds_rundown"), process_id, &csa->ti->curr_tn, DB_LEN_STR(reg)); csa->wbuf_dqd = 0; wcs_recover(reg); if (is_mm) { assert(FALSE); csd = csa->hdr; } BG_TRACE_PRO_ANY(csa, lost_block_recovery); rel_crit(reg); } if (JNL_ENABLED(csd) && (GTCM_GNP_SERVER_IMAGE == image_type)) originator_prc_vec = NULL; /* If we are the last writing user, then everything must be flushed */ if (we_are_last_writer) { /* Time to flush out all of our buffers */ if (is_mm) { if (csa->total_blks != csa->ti->total_blks) /* do remap if file had been extended */ { grab_crit(reg); wcs_mm_recover(reg); csd = csa->hdr; rel_crit(reg); } csa->nl->remove_shm = TRUE; } /* Note WCSFLU_SYNC_EPOCH ensures the epoch is synced to the journal and indirectly * also ensures that the db is fsynced. We don't want to use it in the calls to * wcs_flu() from t_end() and tp_tend() since we can defer it to out-of-crit there. * In this case, since we are running down, we don't have any such option. */ csa->nl->remove_shm = wcs_flu(WCSFLU_FLUSH_HDR | WCSFLU_WRITE_EPOCH | WCSFLU_SYNC_EPOCH); /* Since we_are_last_writer, we should be guaranteed that wcs_flu() did not change csd, (in * case of MM for potential file extension), even if it did a grab_crit(). Therefore, make * sure that's true. */ assert(csd == csa->hdr); assert(0 == memcmp(csd->label, GDS_LABEL, GDS_LABEL_SZ - 1)); csd->trans_hist.header_open_tn = csd->trans_hist.curr_tn; } else if ((cancelled_timer && (0 > csa->nl->wcs_timers)) || cancelled_dbsync_timer) { /* cancelled pending db or jnl flush timers - flush database and journal buffers to disk */ grab_crit(reg); /* we need to sync the epoch as the fact that there is no active pending flush timer implies * there will be noone else who will flush the dirty buffers and EPOCH to disk in a timely fashion */ wcs_flu(WCSFLU_FLUSH_HDR | WCSFLU_WRITE_EPOCH | WCSFLU_SYNC_EPOCH); rel_crit(reg); assert((dba_mm == cs_data->acc_meth) || (csd == cs_data)); csd = cs_data; /* In case this is MM and wcs_flu() remapped an extended database, reset csd */ } /* Do rundown journal processing after buffer flushes since they require jnl to be open */ if (JNL_ENABLED(csd)) { /* the following tp_change_reg() is not needed due to the assert csa == cs_addrs at the beginning * of gds_rundown(), but just to be safe. To be removed by 2002!! --- nars -- 2001/04/25. 
*/ tp_change_reg(); /* call this because jnl_ensure_open checks cs_addrs rather than gv_cur_region */ jpc = csa->jnl; jbp = jpc->jnl_buff; if (jbp->fsync_in_prog_latch.u.parts.latch_pid == process_id) { assert(FALSE); COMPSWAP_UNLOCK(&jbp->fsync_in_prog_latch, process_id, 0, LOCK_AVAILABLE, 0); } if (jbp->io_in_prog_latch.u.parts.latch_pid == process_id) { assert(FALSE); COMPSWAP_UNLOCK(&jbp->io_in_prog_latch, process_id, 0, LOCK_AVAILABLE, 0); } if (((NOJNL != jpc->channel) && !JNL_FILE_SWITCHED(jpc)) || we_are_last_writer && (0 != csa->nl->jnl_file.u.inode)) { /* We need to close the journal file cleanly if we have the latest generation journal file open * or if we are the last writer and the journal file is open in shared memory (not necessarily * by ourselves e.g. the only process that opened the journal got shot abnormally) * Note: we should not infer anything from the shared memory value of csa->nl->jnl_file.u.inode * if we are not the last writer as it can be concurrently updated. */ grab_crit(reg); if (JNL_ENABLED(csd)) { SET_GBL_JREC_TIME; /* jnl_ensure_open/jnl_put_jrt_pini/pfin/jnl_file_close all need it */ /* Before writing to jnlfile, adjust jgbl.gbl_jrec_time if needed to maintain time order * of jnl records. This needs to be done BEFORE the jnl_ensure_open as that could write * journal records (if it decides to switch to a new journal file). */ ADJUST_GBL_JREC_TIME(jgbl, jbp); jnl_status = jnl_ensure_open(); if (0 == jnl_status) { /* If we_are_last_writer, we would have already done a wcs_flu() which would * have written an epoch record and we are guaranteed no further updates * since we are the last writer. So, just close the journal. * Although we assert pini_addr should be non-zero for last_writer, we * play it safe in PRO and write a PINI record if not written already. */ assert(!jbp->before_images || is_mm || !we_are_last_writer || 0 != jpc->pini_addr); if (we_are_last_writer && 0 == jpc->pini_addr) jnl_put_jrt_pini(csa); if (0 != jpc->pini_addr) jnl_put_jrt_pfin(csa); /* If not the last writer and no pending flush timer left, do jnl flush now */ if (!we_are_last_writer && (0 > csa->nl->wcs_timers)) { if (SS_NORMAL == (jnl_status = jnl_flush(reg))) { assert(jbp->freeaddr == jbp->dskaddr); jnl_fsync(reg, jbp->dskaddr); assert(jbp->fsync_dskaddr == jbp->dskaddr); } else { send_msg(VARLSTCNT(9) ERR_JNLFLUSH, 2, JNL_LEN_STR(csd), ERR_TEXT, 2, RTS_ERROR_TEXT("Error with journal flush in gds_rundown"), jnl_status); assert(NOJNL == jpc->channel);/* jnl file lost has been triggered */ /* In this routine, all code that follows from here on does not * assume anything about the journaling characteristics of this * database so it is safe to continue execution even though * journaling got closed in the middle. 
*/ } } jnl_file_close(reg, we_are_last_writer, FALSE); } else send_msg(VARLSTCNT(6) jnl_status, 4, JNL_LEN_STR(csd), DB_LEN_STR(reg)); } rel_crit(reg); } } if (we_are_last_writer) /* Flush the fileheader last and harden the file to disk */ { grab_crit(reg); /* To satisfy crit requirement in fileheader_sync() */ memset(csd->machine_name, 0, MAX_MCNAMELEN); /* clear the machine_name field */ if (!mupip_jnl_recover && we_are_last_user) { /* mupip_jnl_recover will do this after mur_close_file */ csd->semid = INVALID_SEMID; csd->shmid = INVALID_SHMID; csd->gt_sem_ctime.ctime = 0; csd->gt_shm_ctime.ctime = 0; } fileheader_sync(reg); rel_crit(reg); if (FALSE == is_mm) { if (-1 == fsync(udi->fd)) /* Sync it all */ { rts_error(VARLSTCNT(9) ERR_DBFILERR, 2, DB_LEN_STR(reg), ERR_TEXT, 2, RTS_ERROR_TEXT("Error during file sync at close"), errno); } } else { /* Now do final MM file sync before exit */ #if !defined(TARGETED_MSYNC) && !defined(NO_MSYNC) if (-1 == fsync(udi->fd)) /* Sync it all */ { rts_error(VARLSTCNT(9) ERR_DBFILERR, 2, DB_LEN_STR(reg), ERR_TEXT, 2, RTS_ERROR_TEXT("Error during file sync at close"), errno); } #else if (-1 == msync((caddr_t)csa->db_addrs[0], (size_t)(csa->db_addrs[1] - csa->db_addrs[0]), MS_SYNC)) { rts_error(VARLSTCNT(9) ERR_DBFILERR, 2, DB_LEN_STR(reg), ERR_TEXT, 2, RTS_ERROR_TEXT("Error during file msync at close"), errno); } #endif } } } /* end if (!reg->read_only && !csa->nl->donotflush_dbjnl) */ if (reg->read_only && we_are_last_user && !mupip_jnl_recover) { /* mupip_jnl_recover will do this after mur_close_file */ db_ipcs.semid = INVALID_SEMID; db_ipcs.shmid = INVALID_SHMID; db_ipcs.gt_sem_ctime = 0; db_ipcs.gt_shm_ctime = 0; db_ipcs.fn_len = reg->dyn.addr->fname_len; memcpy(db_ipcs.fn, reg->dyn.addr->fname, reg->dyn.addr->fname_len); db_ipcs.fn[reg->dyn.addr->fname_len] = 0; /* request gtmsecshr to flush. read_only cannot flush itself */ if (0 != send_mesg2gtmsecshr(FLUSH_DB_IPCS_INFO, 0, (char *)NULL, 0)) rts_error(VARLSTCNT(8) ERR_DBFILERR, 2, DB_LEN_STR(reg), ERR_TEXT, 2, RTS_ERROR_TEXT("gtmsecshr failed to update database file header")); } /* Done with file now, close it */ if (-1 == close(udi->fd)) { rts_error(VARLSTCNT(9) ERR_DBFILERR, 2, DB_LEN_STR(reg), ERR_TEXT, 2, LEN_AND_LIT("Error during file close"), errno); } /* Unmap storage if mm mode but only the part that is not the fileheader (so shows up in dumps) */ if (is_mm) { munmap_len = (sm_long_t)((csa->db_addrs[1] - csa->db_addrs[0]) - ROUND_UP(SIZEOF_FILE_HDR(csa->hdr), MSYNC_ADDR_INCS)); if (munmap_len > 0) { munmap((caddr_t)(csa->db_addrs[0] + ROUND_UP(SIZEOF_FILE_HDR(csa->hdr), MSYNC_ADDR_INCS)), (size_t)(munmap_len)); #ifdef DEBUG_DB64 rel_mmseg((caddr_t)csa->db_addrs[0]); #endif } } /* Detach our shared memory while still under lock so reference counts will be * correct for the next process to run down this region. * In the process also get the remove_shm status from node_local before detaching. * If csa->nl->donotflush_dbjnl is TRUE, it means we can safely remove shared memory without compromising data * integrity as a reissue of recover will restore the database to a consistent state. 
*/ remove_shm = !vermismatch && (csa->nl->remove_shm || csa->nl->donotflush_dbjnl); status = shmdt((caddr_t)csa->nl); csa->nl = NULL; /* dereferencing nl after detach is not right, so we set it to NULL so that we can test before dereference*/ if (-1 == status) send_msg(VARLSTCNT(9) ERR_DBFILERR, 2, DB_LEN_STR(reg), ERR_TEXT, 2, LEN_AND_LIT("Error during shmdt"), errno); reg->open = FALSE; /* If file is still not in good shape, die here and now before we get rid of our storage */ if (csa->wbuf_dqd) GTMASSERT; ipc_deleted = FALSE; /* If we are the very last user, remove shared storage id and the semaphores */ if (we_are_last_user) { /* remove shared storage, only if last writer to rundown did a successful wcs_flu() */ assert(!vermismatch); if (remove_shm) { ipc_deleted = TRUE; if (0 != shm_rmid(udi->shmid)) rts_error(VARLSTCNT(8) ERR_DBFILERR, 2, DB_LEN_STR(reg), ERR_TEXT, 2, RTS_ERROR_TEXT("Unable to remove shared memory")); } else if (is_src_server || is_updproc) { gtm_putmsg(VARLSTCNT(6) ERR_DBRNDWNWRN, 4, DB_LEN_STR(reg), process_id, process_id); send_msg(VARLSTCNT(6) ERR_DBRNDWNWRN, 4, DB_LEN_STR(reg), process_id, process_id); } else send_msg(VARLSTCNT(6) ERR_DBRNDWNWRN, 4, DB_LEN_STR(reg), process_id, process_id); /* * Don't release semaphore in case of mupip recover/rollback; since it has standalone access. * It will release the semaphore in mur_close_files. */ if (!mupip_jnl_recover) { if (0 != sem_rmid(udi->semid)) rts_error(VARLSTCNT(8) ERR_DBFILERR, 2, DB_LEN_STR(reg), ERR_TEXT, 2, RTS_ERROR_TEXT("Unable to remove semaphore")); grabbed_access_sem = FALSE; } } else { assert(!mupip_jnl_recover); /* If we were writing, get rid of our writer access count semaphore */ if (!reg->read_only) if (0 != (save_errno = do_semop(udi->semid, 1, -1, SEM_UNDO))) rts_error(VARLSTCNT(9) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg), ERR_TEXT, 2, RTS_ERROR_TEXT("gds_rundown write semaphore release"), save_errno); /* Now remove the rundown lock */ if (0 != (save_errno = do_semop(udi->semid, 0, -1, SEM_UNDO))) rts_error(VARLSTCNT(9) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg), ERR_TEXT, 2, RTS_ERROR_TEXT("gds_rundown rundown semaphore release"), save_errno); grabbed_access_sem = FALSE; } if (!ftok_sem_release(reg, !mupip_jnl_recover, FALSE)) rts_error(VARLSTCNT(4) ERR_DBFILERR, 2, DB_LEN_STR(reg)); if (!ipc_deleted) { GET_CUR_TIME; if (is_src_server) gtm_putmsg(VARLSTCNT(8) ERR_IPCNOTDEL, 6, CTIME_BEFORE_NL, time_ptr, LEN_AND_LIT("Source server"), REG_LEN_STR(reg)); if (is_updproc) gtm_putmsg(VARLSTCNT(8) ERR_IPCNOTDEL, 6, CTIME_BEFORE_NL, time_ptr, LEN_AND_LIT("Update process"), REG_LEN_STR(reg)); if (mupip_jnl_recover) { gtm_putmsg(VARLSTCNT(8) ERR_IPCNOTDEL, 6, CTIME_BEFORE_NL, time_ptr, LEN_AND_LIT("Mupip journal process"), REG_LEN_STR(reg)); send_msg(VARLSTCNT(8) ERR_IPCNOTDEL, 6, CTIME_BEFORE_NL, time_ptr, LEN_AND_LIT("Mupip journal process"), REG_LEN_STR(reg)); } } REVERT; }
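/* Illustrative sketch, not part of the original source: both versions of gds_rundown take the database
 * access-control semaphore with a two-operation semop ("wait for zero" on index 0, then increment), first
 * with IPC_NOWAIT and, if that fails with EAGAIN, again in blocking mode; SEM_UNDO lets the kernel release
 * the lock if the process dies while holding it. The helpers below show that idiom in isolation on a
 * hypothetical semid, together with the shm_nattch / counter-semaphore checks used above to decide whether
 * the caller is the last user and the last writer. They are a minimal sketch, not the actual implementation.
 */
#include <errno.h>
#include <sys/ipc.h>
#include <sys/sem.h>
#include <sys/shm.h>

static int grab_rundown_lock(int semid)
{
	struct sembuf	sop[2];
	int		status;

	sop[0].sem_num = 0;	sop[0].sem_op = 0;	/* wait for the lock slot to be free */
	sop[1].sem_num = 0;	sop[1].sem_op = 1;	/* then take it */
	sop[0].sem_flg = sop[1].sem_flg = SEM_UNDO | IPC_NOWAIT;
	status = semop(semid, sop, 2);
	if ((-1 == status) && (EAGAIN == errno))
	{	/* someone else holds the lock; retry in blocking mode */
		sop[0].sem_flg = sop[1].sem_flg = SEM_UNDO;
		status = semop(semid, sop, 2);
	}
	return status;	/* 0 on success, -1 (errno set) on failure */
}

static int am_last_user(int shmid)
{
	struct shmid_ds	shm_buf;

	if (-1 == shmctl(shmid, IPC_STAT, &shm_buf))
		return -1;				/* cannot tell; errno has details */
	return (1 == shm_buf.shm_nattch);		/* only this process is still attached */
}

static int am_last_writer(int semid, int read_only)
{
	int	semval;

	semval = semctl(semid, 1, GETVAL);		/* counter bumped by each read-write opener */
	if (-1 == semval)
		return -1;
	return ((1 == semval) && !read_only);		/* exactly one writer left and it is us */
}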
void mu_reorg_upgrd_dwngrd(void) { blk_hdr new_hdr; blk_segment *bs1, *bs_ptr; block_id *blkid_ptr, curblk, curbmp, start_blk, stop_blk, start_bmp, last_bmp; block_id startblk_input, stopblk_input; boolean_t upgrade, downgrade, safejnl, nosafejnl, region, first_reorg_in_this_db_fmt, reorg_entiredb; boolean_t startblk_specified, stopblk_specified, set_fully_upgraded, db_got_to_v5_once, mark_blk_free; cache_rec_ptr_t cr; char *bml_lcl_buff = NULL, *command, *reorg_command; sm_uc_ptr_t bptr = NULL; cw_set_element *cse; enum cdb_sc cdb_status; enum db_ver new_db_format, ondsk_blkver; gd_region *reg; int cycle; int4 blk_seg_cnt, blk_size; /* needed for BLK_INIT,BLK_SEG and BLK_FINI macros */ int4 blocks_left, expected_blks2upgrd, actual_blks2upgrd, total_blks, free_blks; int4 status, status1, mapsize, lcnt, bml_status; reorg_stats_t reorg_stats; sgmnt_addrs *csa; sgmnt_data_ptr_t csd; sm_uc_ptr_t blkBase, bml_sm_buff; /* shared memory pointer to the bitmap global buffer */ srch_hist alt_hist; srch_blk_status *blkhist, bmlhist; tp_region *rptr; trans_num curr_tn; unsigned char save_cw_set_depth; uint4 lcl_update_trans; region = (CLI_PRESENT == cli_present("REGION")); upgrade = (CLI_PRESENT == cli_present("UPGRADE")); downgrade = (CLI_PRESENT == cli_present("DOWNGRADE")); assert(upgrade && !downgrade || !upgrade && downgrade); command = upgrade ? "UPGRADE" : "DOWNGRADE"; reorg_command = upgrade ? "MUPIP REORG UPGRADE" : "MUPIP REORG DOWNGRADE"; reorg_entiredb = TRUE; /* unless STARTBLK or STOPBLK is specified we are going to {up,down}grade the entire database */ startblk_specified = FALSE; assert(SIZEOF(block_id) == SIZEOF(uint4)); if ((CLI_PRESENT == cli_present("STARTBLK")) && (cli_get_hex("STARTBLK", (uint4 *)&startblk_input))) { reorg_entiredb = FALSE; startblk_specified = TRUE; } stopblk_specified = FALSE; assert(SIZEOF(block_id) == SIZEOF(uint4)); if ((CLI_PRESENT == cli_present("STOPBLK")) && (cli_get_hex("STOPBLK", (uint4 *)&stopblk_input))) { reorg_entiredb = FALSE; stopblk_specified = TRUE; } mu_reorg_upgrd_dwngrd_in_prog = TRUE; mu_reorg_nosafejnl = (CLI_NEGATED == cli_present("SAFEJNL")) ? TRUE : FALSE; assert(region); status = SS_NORMAL; error_mupip = FALSE; gvinit(); /* initialize gd_header (needed by the later call to mu_getlst) */ mu_getlst("REG_NAME", SIZEOF(tp_region)); /* get the parameter corresponding to REGION qualifier */ if (error_mupip) { util_out_print("!/MUPIP REORG !AD cannot proceed with above errors!/", TRUE, LEN_AND_STR(command)); mupip_exit(ERR_MUNOACTION); } assert(DBKEYSIZE(MAX_KEY_SZ) == gv_keysize); /* no need to invoke GVKEYSIZE_INIT_IF_NEEDED macro */ gv_target = targ_alloc(gv_keysize, NULL, NULL); /* t_begin needs this initialized */ gv_target_list = NULL; memset(&alt_hist, 0, SIZEOF(alt_hist)); /* null-initialize history */ blkhist = &alt_hist.h[0]; for (rptr = grlist; NULL != rptr; rptr = rptr->fPtr) { if (mu_ctrly_occurred || mu_ctrlc_occurred) break; reg = rptr->reg; util_out_print("!/Region !AD : MUPIP REORG !AD started", TRUE, REG_LEN_STR(reg), LEN_AND_STR(command)); if (reg_cmcheck(reg)) { util_out_print("Region !AD : MUPIP REORG !AD cannot run across network", TRUE, REG_LEN_STR(reg), LEN_AND_STR(command)); status = ERR_MUNOFINISH; continue; } mu_reorg_process = TRUE; /* gvcst_init will use this value to use gtm_poollimit settings. */ gvcst_init(reg); mu_reorg_process = FALSE; assert(update_array != NULL); /* access method stored in global directory and database file header might be different in which case * the database setting prevails. 
therefore, the access method check can be done only after opening * the database (i.e. after the gvcst_init) */ if (dba_bg != REG_ACC_METH(reg)) { util_out_print("Region !AD : MUPIP REORG !AD cannot continue as access method is not BG", TRUE, REG_LEN_STR(reg), LEN_AND_STR(command)); status = ERR_MUNOFINISH; continue; } /* The mu_getlst call above uses insert_region to create the grlist, which ensures that duplicate regions mapping to * the same db file correspond to only one grlist entry. */ assert(FALSE == reg->was_open); TP_CHANGE_REG(reg); /* sets gv_cur_region, cs_addrs, cs_data */ csa = cs_addrs; csd = cs_data; blk_size = csd->blk_size; /* "blk_size" is used by the BLK_FINI macro */ if (reg->read_only) { gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(4) ERR_DBRDONLY, 2, DB_LEN_STR(reg)); status = ERR_MUNOFINISH; continue; } assert(GDSVCURR == GDSV6); /* so we trip this assert in case GDSVCURR changes without a change to this module */ new_db_format = (upgrade ? GDSV6 : GDSV4); grab_crit(reg); curr_tn = csd->trans_hist.curr_tn; /* set the desired db format in the file header to the appropriate version, increment transaction number */ status1 = desired_db_format_set(reg, new_db_format, reorg_command); assert(csa->now_crit); /* desired_db_format_set() should not have released crit */ first_reorg_in_this_db_fmt = TRUE; /* with the current desired_db_format, this is the first reorg */ if (SS_NORMAL != status1) { /* "desired_db_format_set" would have printed appropriate error messages */ if (ERR_MUNOACTION != status1) { /* real error occurred while setting the db format. skip to next region */ status = ERR_MUNOFINISH; rel_crit(reg); continue; } util_out_print("Region !AD : Desired DB Format remains at !AD after !AD", TRUE, REG_LEN_STR(reg), LEN_AND_STR(gtm_dbversion_table[new_db_format]), LEN_AND_STR(reorg_command)); if (csd->reorg_db_fmt_start_tn == csd->desired_db_format_tn) first_reorg_in_this_db_fmt = FALSE; } else util_out_print("Region !AD : Desired DB Format set to !AD by !AD", TRUE, REG_LEN_STR(reg), LEN_AND_STR(gtm_dbversion_table[new_db_format]), LEN_AND_STR(reorg_command)); assert(dba_bg == csd->acc_meth); /* Check blks_to_upgrd counter to see if upgrade/downgrade is complete */ total_blks = csd->trans_hist.total_blks; free_blks = csd->trans_hist.free_blocks; actual_blks2upgrd = csd->blks_to_upgrd; /* If MUPIP REORG UPGRADE and there is no block to upgrade in the database as indicated by BOTH * "csd->blks_to_upgrd" and "csd->fully_upgraded", then we can skip processing. * If MUPIP REORG UPGRADE and all non-free blocks need to be upgraded then again we can skip processing. 
*/ if ((upgrade && (0 == actual_blks2upgrd) && csd->fully_upgraded) || (!upgrade && ((total_blks - free_blks) == actual_blks2upgrd))) { util_out_print("Region !AD : Blocks to Upgrade counter indicates no action needed for MUPIP REORG !AD", TRUE, REG_LEN_STR(reg), LEN_AND_STR(command)); util_out_print("Region !AD : Total Blocks = [0x!XL] : Free Blocks = [0x!XL] : " "Blocks to upgrade = [0x!XL]", TRUE, REG_LEN_STR(reg), total_blks, free_blks, actual_blks2upgrd); util_out_print("Region !AD : MUPIP REORG !AD finished!/", TRUE, REG_LEN_STR(reg), LEN_AND_STR(command)); rel_crit(reg); continue; } stop_blk = total_blks; if (stopblk_specified && stopblk_input <= stop_blk) stop_blk = stopblk_input; if (first_reorg_in_this_db_fmt) { /* Note down reorg start tn (in case we are interrupted, future reorg will know to resume) */ csd->reorg_db_fmt_start_tn = csd->desired_db_format_tn; csd->reorg_upgrd_dwngrd_restart_block = 0; start_blk = (startblk_specified ? startblk_input : 0); } else { /* Either a concurrent MUPIP REORG of the same type ({up,down}grade) is currently running * or a previously running REORG of the same type was interrupted (Ctrl-Ced). * In either case resume processing from whatever restart block number is stored in fileheader * the only exception is if "STARTBLK" was specified in the input in which use that unconditionally. */ start_blk = (startblk_specified ? startblk_input : csd->reorg_upgrd_dwngrd_restart_block); } if (start_blk > stop_blk) start_blk = stop_blk; mu_reorg_upgrd_dwngrd_start_tn = csd->reorg_db_fmt_start_tn; /* Before releasing crit, flush the file-header and dirty buffers in cache to disk. This is because we are now * going to read each GDS block directly from disk to determine if it needs to be upgraded/downgraded or not. */ if (!wcs_flu(WCSFLU_FLUSH_HDR)) /* wcs_flu assumes gv_cur_region is set (which it is in this routine) */ { rel_crit(reg); gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(6) ERR_BUFFLUFAILED, 4, LEN_AND_LIT("MUPIP REORG UPGRADE/DOWNGRADE"), DB_LEN_STR(reg)); status = ERR_MUNOFINISH; continue; } rel_crit(reg); /* Loop through entire database one GDS block at a time and upgrade/downgrade each of them */ status1 = SS_NORMAL; start_bmp = ROUND_DOWN2(start_blk, BLKS_PER_LMAP); last_bmp = ROUND_DOWN2(stop_blk - 1, BLKS_PER_LMAP); curblk = start_blk; /* curblk is the block to be upgraded/downgraded */ util_out_print("Region !AD : Started processing from block number [0x!XL]", TRUE, REG_LEN_STR(reg), curblk); if (NULL != bptr) { /* malloc/free "bptr" for each region as GDS block-size can be different */ free(bptr); bptr = NULL; } memset(&reorg_stats, 0, SIZEOF(reorg_stats)); /* initialize statistics for this region */ for (curbmp = start_bmp; curbmp <= last_bmp; curbmp += BLKS_PER_LMAP) { if (mu_ctrly_occurred || mu_ctrlc_occurred) { status1 = ERR_MUNOFINISH; break; } /* -------------------------------------------------------------- * Read in current bitmap block * -------------------------------------------------------------- */ assert(!csa->now_crit); bml_sm_buff = t_qread(curbmp, (sm_int_ptr_t)&cycle, &cr); /* bring block into the cache outside of crit */ reorg_stats.blks_read_from_disk_bmp++; grab_crit_encr_cycle_sync(reg); /* needed so t_qread does not return NULL below */ if (mu_reorg_upgrd_dwngrd_start_tn != csd->desired_db_format_tn) { /* csd->desired_db_format changed since reorg started. 
discontinue the reorg */ /* see later comment on "csd->reorg_upgrd_dwngrd_restart_block" for why the assignment * of this field should be done only if a db format change did not occur. */ rel_crit(reg); status1 = ERR_MUNOFINISH; /* This "start_tn" check is redone after the for-loop and an error message is printed there */ break; } else if (reorg_entiredb) { /* Change "csd->reorg_upgrd_dwngrd_restart_block" only if STARTBLK or STOPBLK was NOT specified */ assert(csd->reorg_upgrd_dwngrd_restart_block <= MAX(start_blk, curbmp)); csd->reorg_upgrd_dwngrd_restart_block = curbmp; /* previous blocks have been upgraded/downgraded */ } /* Check blks_to_upgrd counter to see if upgrade/downgrade is complete. * Repeat check done a few steps earlier outside of this for loop. */ total_blks = csd->trans_hist.total_blks; free_blks = csd->trans_hist.free_blocks; actual_blks2upgrd = csd->blks_to_upgrd; if ((upgrade && (0 == actual_blks2upgrd) && csd->fully_upgraded) || (!upgrade && ((total_blks - free_blks) == actual_blks2upgrd))) { rel_crit(reg); break; } bml_sm_buff = t_qread(curbmp, (sm_int_ptr_t)&cycle, &cr); /* now that in crit, note down stable buffer */ if (NULL == bml_sm_buff) rts_error_csa(CSA_ARG(csa) VARLSTCNT(1) ERR_DSEBLKRDFAIL); ondsk_blkver = cr->ondsk_blkver; /* note down db fmt on disk for bitmap block */ /* Take a copy of the shared memory bitmap buffer into process-private memory before releasing crit. * We are interested in those blocks that are currently marked as USED in the bitmap. * It is possible that once we release crit, concurrent updates change the bitmap state of those blocks. * In that case, those updates will take care of doing the upgrade/downgrade of those blocks in the * format currently set in csd->desired_db_format i.e. accomplishing MUPIP REORG UPGRADE/DOWNGRADE's job. * If the desired_db_format changes concurrently, we will stop doing REORG UPGRADE/DOWNGRADE processing. */ if (NULL == bml_lcl_buff) bml_lcl_buff = malloc(BM_SIZE(BLKS_PER_LMAP)); memcpy(bml_lcl_buff, (blk_hdr_ptr_t)bml_sm_buff, BM_SIZE(BLKS_PER_LMAP)); if (FALSE == cert_blk(reg, curbmp, (blk_hdr_ptr_t)bml_lcl_buff, 0, FALSE)) { /* certify the block while holding crit as cert_blk uses fields from file-header (shared memory) */ assert(FALSE); /* in pro, skip upgrading/downgrading all blks in this unreliable local bitmap */ rel_crit(reg); util_out_print("Region !AD : Bitmap Block [0x!XL] has integrity errors. Skipping this bitmap.", TRUE, REG_LEN_STR(reg), curbmp); status1 = ERR_MUNOFINISH; continue; } rel_crit(reg); /* ------------------------------------------------------------------------ * Upgrade/Downgrade all BUSY blocks in the current bitmap * ------------------------------------------------------------------------ */ curblk = (curbmp == start_bmp) ? start_blk : curbmp; mapsize = (curbmp == last_bmp) ?
(stop_blk - curbmp) : BLKS_PER_LMAP; assert(0 != mapsize); assert(mapsize <= BLKS_PER_LMAP); db_got_to_v5_once = csd->db_got_to_v5_once; for (lcnt = curblk - curbmp; lcnt < mapsize; lcnt++, curblk++) { if (mu_ctrly_occurred || mu_ctrlc_occurred) { status1 = ERR_MUNOFINISH; goto stop_reorg_on_this_reg; /* goto needed because of nested FOR Loop */ } GET_BM_STATUS(bml_lcl_buff, lcnt, bml_status); assert(BLK_MAPINVALID != bml_status); /* cert_blk ran clean so we dont expect invalid entries */ if (BLK_FREE == bml_status) { reorg_stats.blks_skipped_free++; continue; } /* MUPIP REORG UPGRADE/DOWNGRADE will convert USED & RECYCLED blocks */ if (db_got_to_v5_once || (BLK_RECYCLED != bml_status)) { /* Do NOT read recycled V4 block from disk unless it is guaranteed NOT to be too full */ if (lcnt) { /* non-bitmap block */ /* read in block from disk into private buffer. dont pollute the cache yet */ if (NULL == bptr) bptr = (sm_uc_ptr_t)malloc(blk_size); status1 = dsk_read(curblk, bptr, &ondsk_blkver, FALSE); /* dsk_read on curblk could return an error (DYNUPGRDFAIL) if curblk needs to be * upgraded and if its block size was too big to allow the extra block-header space * requirements for a dynamic upgrade. a MUPIP REORG DOWNGRADE should not error out * in that case as the block is already in the downgraded format. */ if (SS_NORMAL != status1) { if (!upgrade && (ERR_DYNUPGRDFAIL == status1)) { assert(GDSV4 == new_db_format); ondsk_blkver = new_db_format; } else { gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(5) ERR_DBFILERR, 2, DB_LEN_STR(reg), status1); util_out_print("Region !AD : Error occurred while reading block " "[0x!XL]", TRUE, REG_LEN_STR(reg), curblk); status1 = ERR_MUNOFINISH; goto stop_reorg_on_this_reg;/* goto needed due to nested FOR Loop */ } } reorg_stats.blks_read_from_disk_nonbmp++; } /* else bitmap block has been read in crit earlier and ondsk_blkver appropriately set */ if (new_db_format == ondsk_blkver) { assert((SS_NORMAL == status1) || (!upgrade && (ERR_DYNUPGRDFAIL == status1))); status1 = SS_NORMAL; /* treat DYNUPGRDFAIL as no error in case of downgrade */ reorg_stats.blks_skipped_newfmtindisk++; continue; /* current disk version is identical to what is desired */ } assert(SS_NORMAL == status1); } /* Begin non-TP transaction to upgrade/downgrade the block. * The way we do that is by updating the block using a null update array. * Any update to a block will trigger an automatic upgrade/downgrade of the block based on * the current fileheader desired_db_format setting and we use that here. */ t_begin(ERR_MUREORGFAIL, UPDTRNS_DB_UPDATED_MASK); for (; ;) { CHECK_AND_RESET_UPDATE_ARRAY; /* reset update_array_ptr to update_array */ curr_tn = csd->trans_hist.curr_tn; db_got_to_v5_once = csd->db_got_to_v5_once; if (db_got_to_v5_once || (BLK_RECYCLED != bml_status)) { blkhist->cse = NULL; /* start afresh (do not use value from previous retry) */ blkBase = t_qread(curblk, (sm_int_ptr_t)&blkhist->cycle, &blkhist->cr); if (NULL == blkBase) { t_retry((enum cdb_sc)rdfail_detail); continue; } blkhist->blk_num = curblk; blkhist->buffaddr = blkBase; ondsk_blkver = blkhist->cr->ondsk_blkver; new_hdr = *(blk_hdr_ptr_t)blkBase; mu_reorg_upgrd_dwngrd_blktn = new_hdr.tn; mark_blk_free = FALSE; inctn_opcode = upgrade ? inctn_blkupgrd : inctn_blkdwngrd; } else { mark_blk_free = TRUE; inctn_opcode = inctn_blkmarkfree; } inctn_detail.blknum_struct.blknum = curblk; /* t_end assumes that the history it is passed does not contain a bitmap block. 
* for bitmap block, the history validation information is passed through cse instead. * therefore we need to handle bitmap and non-bitmap cases separately. */ if (!lcnt) { /* Means a bitmap block. * At this point we can do a "new_db_format != ondsk_blkver" check to determine * if the block got converted since we did the dsk_read (see the non-bitmap case * for a similar check done there), but in that case we will have a transaction * which has read 1 bitmap block and is updating no block. "t_end" currently cannot * handle this case as it expects any bitmap block that needs validation to also * have a corresponding cse which will hold its history. Hence we avoid doing the * new_db_format check. The only disadvantage of this is that we will end up * modifying the bitmap block as part of this transaction (in an attempt to convert * its ondsk_blkver) even though it is already in the right format. Since this * overhead is going to be one per bitmap block and since the block is in the cache * at this point, we should not lose much. */ assert(!mark_blk_free); BLK_ADDR(blkid_ptr, SIZEOF(block_id), block_id); *blkid_ptr = 0; t_write_map(blkhist, (unsigned char *)blkid_ptr, curr_tn, 0); assert(&alt_hist.h[0] == blkhist); alt_hist.h[0].blk_num = 0; /* create empty history for bitmap block */ assert(update_trans); } else { /* non-bitmap block. fill in history for validation in t_end */ assert(curblk); /* we should never come here for block 0 (bitmap) */ if (!mark_blk_free) { assert(blkhist->blk_num == curblk); assert(blkhist->buffaddr == blkBase); blkhist->tn = curr_tn; alt_hist.h[1].blk_num = 0; } /* Also need to pass the bitmap as history to detect if any concurrent M-kill * is freeing up the same USED block that we are trying to convert OR if any * concurrent M-set is reusing the same RECYCLED block that we are trying to * convert. Because of t_end currently not being able to validate a bitmap * without that simultaneously having a cse, we need to create a cse for the * bitmap that is used only for bitmap history validation, but should not be * used to update the contents of the bitmap block in bg_update. */ bmlhist.buffaddr = t_qread(curbmp, (sm_int_ptr_t)&bmlhist.cycle, &bmlhist.cr); if (NULL == bmlhist.buffaddr) { t_retry((enum cdb_sc)rdfail_detail); continue; } bmlhist.blk_num = curbmp; bmlhist.tn = curr_tn; GET_BM_STATUS(bmlhist.buffaddr, lcnt, bml_status); if (BLK_MAPINVALID == bml_status) { t_retry(cdb_sc_lostbmlcr); continue; } if (!mark_blk_free) { if ((new_db_format != ondsk_blkver) && (BLK_FREE != bml_status)) { /* block still needs to be converted. create cse */ BLK_INIT(bs_ptr, bs1); BLK_SEG(bs_ptr, blkBase + SIZEOF(new_hdr), new_hdr.bsiz - SIZEOF(new_hdr)); BLK_FINI(bs_ptr, bs1); t_write(blkhist, (unsigned char *)bs1, 0, 0, ((blk_hdr_ptr_t)blkBase)->levl, FALSE, FALSE, GDS_WRITE_PLAIN); /* The directory tree status for now is only used to determine * whether writing the block to snapshot file (see t_end_sysops.c). * For reorg upgrade/downgrade process, the block is updated in a * sequential way without changing the gv_target. In this case, we * assume the block is in directory tree so as to have it written to * the snapshot file. 
*/ BIT_SET_DIR_TREE(cw_set[cw_set_depth-1].blk_prior_state); /* reset update_trans in case previous retry had set it to 0 */ update_trans = UPDTRNS_DB_UPDATED_MASK; if (BLK_RECYCLED == bml_status) { /* If the block that we are upgrading is RECYCLED, indicate to * bg_update that blks_to_upgrd counter should NOT be * touched in this case by setting "mode" to a special value */ assert(cw_set[cw_set_depth-1].mode == gds_t_write); cw_set[cw_set_depth-1].mode = gds_t_write_recycled; /* we SET block as NOT RECYCLED, otherwise, the mm_update() * or bg_update_phase2 may skip writing it to snapshot file * when its level is 0 */ BIT_CLEAR_RECYCLED(cw_set[cw_set_depth-1].blk_prior_state); } } else { /* Block got converted by another process since we did the dsk_read, * or this block became marked free in the bitmap. * No need to update this block. Just call t_end for validation of * both the non-bitmap block as well as the bitmap block. * Note down that this transaction is no longer updating any blocks. */ update_trans = 0; } /* Need to put bit maps on the end of the cw set for concurrency checking. * We want to simulate t_write_map, except we want to update "cw_map_depth" * instead of "cw_set_depth". Hence the save and restore logic below. * This part of the code is similar to the one in mu_swap_blk.c */ save_cw_set_depth = cw_set_depth; assert(!cw_map_depth); t_write_map(&bmlhist, NULL, curr_tn, 0); /* will increment cw_set_depth */ cw_map_depth = cw_set_depth; /* set cw_map_depth to latest cw_set_depth */ cw_set_depth = save_cw_set_depth;/* restore cw_set_depth */ /* t_write_map simulation end */ } else { if (BLK_RECYCLED != bml_status) { /* Block was RECYCLED at beginning but no longer so. Retry */ t_retry(cdb_sc_bmlmod); continue; } /* Mark recycled block as FREE in bitmap */ assert(lcnt == (curblk - curbmp)); assert(update_array_ptr == update_array); *((block_id *)update_array_ptr) = lcnt; update_array_ptr += SIZEOF(block_id); /* the following assumes SIZEOF(block_id) == SIZEOF(int) */ assert(SIZEOF(block_id) == SIZEOF(int)); *(int *)update_array_ptr = 0; t_write_map(&bmlhist, (unsigned char *)update_array, curr_tn, 0); update_trans = UPDTRNS_DB_UPDATED_MASK; } } assert(SIZEOF(lcl_update_trans) == SIZEOF(update_trans)); lcl_update_trans = update_trans; /* take a copy before t_end modifies it */ if ((trans_num)0 != t_end(&alt_hist, NULL, TN_NOT_SPECIFIED)) { /* In case this is MM and t_end() remapped an extended database, reset csd */ assert(csd == cs_data); if (!lcl_update_trans) { assert(lcnt); assert(!mark_blk_free); assert((new_db_format == ondsk_blkver) || (BLK_BUSY != bml_status)); if (BLK_BUSY != bml_status) reorg_stats.blks_skipped_free++; else reorg_stats.blks_skipped_newfmtincache++; } else if (!lcnt) reorg_stats.blks_converted_bmp++; else reorg_stats.blks_converted_nonbmp++; break; } assert(csd == cs_data); } } } stop_reorg_on_this_reg: /* even though ctrl-c occurred, update file-header fields to store reorg's progress before exiting */ grab_crit(reg); blocks_left = 0; assert(csd->trans_hist.total_blks >= csd->blks_to_upgrd); actual_blks2upgrd = csd->blks_to_upgrd; total_blks = csd->trans_hist.total_blks; free_blks = csd->trans_hist.free_blocks; /* Care should be taken not to set "csd->reorg_upgrd_dwngrd_restart_block" in case of a concurrent db fmt * change. This is because let us say we are doing REORG UPGRADE.
A concurrent REORG DOWNGRADE would * have reset "csd->reorg_upgrd_dwngrd_restart_block" field to 0 and if that reorg is interrupted by a * Ctrl-C (before this reorg came here) it would have updated "csd->reorg_upgrd_dwngrd_restart_block" to * a non-zero value indicating how many blocks from 0 have been downgraded. We should not reset this * field to "curblk" as it will be mis-interpreted as the number of blocks that have been DOWNgraded. */ set_fully_upgraded = FALSE; if (mu_reorg_upgrd_dwngrd_start_tn != csd->desired_db_format_tn) { /* csd->desired_db_format changed since reorg started. discontinue the reorg */ util_out_print("Region !AD : Desired DB Format changed during REORG. Stopping REORG.", TRUE, REG_LEN_STR(reg)); status1 = ERR_MUNOFINISH; } else if (reorg_entiredb) { /* Change "csd->reorg_upgrd_dwngrd_restart_block" only if STARTBLK or STOPBLK was NOT specified */ assert(csd->reorg_upgrd_dwngrd_restart_block <= curblk); csd->reorg_upgrd_dwngrd_restart_block = curblk; /* blocks lesser than this have been upgraded/downgraded */ expected_blks2upgrd = upgrade ? 0 : (total_blks - free_blks); blocks_left = upgrade ? actual_blks2upgrd : (expected_blks2upgrd - actual_blks2upgrd); /* If this reorg command went through all blocks in the database, then it should have * correctly concluded at this point whether the reorg is complete or not. * If this reorg command started from where a previous incomplete reorg left * (i.e. first_reorg_in_this_db_fmt is FALSE), it cannot determine if the initial * GDS blocks that it skipped are completely {up,down}graded or not. */ assert((0 == blocks_left) || (SS_NORMAL != status1) || !first_reorg_in_this_db_fmt); /* If this is a MUPIP REORG UPGRADE that did go through every block in the database (indicated by * "reorg_entiredb" && "first_reorg_in_this_db_fmt") and the current count of "blks_to_upgrd" is * 0 in the file-header and the desired_db_format did not change since the start of the REORG, * we can be sure that the entire database has been upgraded. Set "csd->fully_upgraded" to TRUE. 
*/ if ((SS_NORMAL == status1) && first_reorg_in_this_db_fmt && upgrade && (0 == actual_blks2upgrd)) { csd->fully_upgraded = TRUE; csd->db_got_to_v5_once = TRUE; set_fully_upgraded = TRUE; } /* flush all changes noted down in the file-header */ if (!wcs_flu(WCSFLU_FLUSH_HDR)) /* wcs_flu assumes gv_cur_region is set (which it is in this routine) */ { gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(6) ERR_BUFFLUFAILED, 4, LEN_AND_LIT("MUPIP REORG UPGRADE/DOWNGRADE"), DB_LEN_STR(reg)); status = ERR_MUNOFINISH; rel_crit(reg); continue; } } curr_tn = csd->trans_hist.curr_tn; rel_crit(reg); util_out_print("Region !AD : Stopped processing at block number [0x!XL]", TRUE, REG_LEN_STR(reg), curblk); /* Print statistics */ util_out_print("Region !AD : Statistics : Blocks Read From Disk (Bitmap) : 0x!XL", TRUE, REG_LEN_STR(reg), reorg_stats.blks_read_from_disk_bmp); util_out_print("Region !AD : Statistics : Blocks Skipped (Free) : 0x!XL", TRUE, REG_LEN_STR(reg), reorg_stats.blks_skipped_free); util_out_print("Region !AD : Statistics : Blocks Read From Disk (Non-Bitmap) : 0x!XL", TRUE, REG_LEN_STR(reg), reorg_stats.blks_read_from_disk_nonbmp); util_out_print("Region !AD : Statistics : Blocks Skipped (new fmt in disk) : 0x!XL", TRUE, REG_LEN_STR(reg), reorg_stats.blks_skipped_newfmtindisk); util_out_print("Region !AD : Statistics : Blocks Skipped (new fmt in cache) : 0x!XL", TRUE, REG_LEN_STR(reg), reorg_stats.blks_skipped_newfmtincache); util_out_print("Region !AD : Statistics : Blocks Converted (Bitmap) : 0x!XL", TRUE, REG_LEN_STR(reg), reorg_stats.blks_converted_bmp); util_out_print("Region !AD : Statistics : Blocks Converted (Non-Bitmap) : 0x!XL", TRUE, REG_LEN_STR(reg), reorg_stats.blks_converted_nonbmp); if (reorg_entiredb && (SS_NORMAL == status1) && (0 != blocks_left)) { /* file-header counter does not match what reorg on the entire database expected to see */ gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(4) ERR_DBBTUWRNG, 2, expected_blks2upgrd, actual_blks2upgrd); util_out_print("Region !AD : Run MUPIP INTEG (without FAST qualifier) to fix the counter", TRUE, REG_LEN_STR(reg)); status1 = ERR_MUNOFINISH; } else util_out_print("Region !AD : Total Blocks = [0x!XL] : Free Blocks = [0x!XL] : " "Blocks to upgrade = [0x!XL]", TRUE, REG_LEN_STR(reg), total_blks, free_blks, actual_blks2upgrd); /* Issue success or failure message for this region */ if (SS_NORMAL == status1) { /* issue success only if REORG did not encounter any error in its processing */ if (set_fully_upgraded) util_out_print("Region !AD : Database is now FULLY UPGRADED", TRUE, REG_LEN_STR(reg)); util_out_print("Region !AD : MUPIP REORG !AD finished!/", TRUE, REG_LEN_STR(reg), LEN_AND_STR(command)); send_msg_csa(CSA_ARG(csa) VARLSTCNT(7) ERR_MUREUPDWNGRDEND, 5, REG_LEN_STR(reg), process_id, process_id, &curr_tn); } else { assert(ERR_MUNOFINISH == status1); assert((SS_NORMAL == status) || (ERR_MUNOFINISH == status)); util_out_print("Region !AD : MUPIP REORG !AD incomplete. See above messages.!/", TRUE, REG_LEN_STR(reg), LEN_AND_STR(command)); status = status1; } } if (NULL != bptr) free(bptr); if (NULL != bml_lcl_buff) free(bml_lcl_buff); if (mu_ctrly_occurred || mu_ctrlc_occurred) { gtm_putmsg_csa(CSA_ARG(NULL) VARLSTCNT(1) ERR_REORGCTRLY); status = ERR_MUNOFINISH; } mupip_exit(status); }
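/* Illustrative sketch, not part of the original source: the region loop above walks the database one local
 * bitmap at a time, rounding the start/stop block numbers down to bitmap boundaries (ROUND_DOWN2) and
 * clamping the last map to "stop_blk - curbmp" blocks. The standalone helper below reproduces just that
 * arithmetic; blknum_t and the blks_per_lmap parameter are hypothetical stand-ins for block_id and the
 * fixed BLKS_PER_LMAP constant.
 */
typedef unsigned int blknum_t;	/* hypothetical stand-in for block_id */

static void lmap_range(blknum_t start_blk, blknum_t stop_blk, blknum_t blks_per_lmap,
			blknum_t *start_bmp, blknum_t *last_bmp, blknum_t *last_mapsize)
{
	*start_bmp = (start_blk / blks_per_lmap) * blks_per_lmap;	/* bitmap covering start_blk */
	*last_bmp = ((stop_blk - 1) / blks_per_lmap) * blks_per_lmap;	/* bitmap covering the last block */
	/* every local bitmap covers blks_per_lmap blocks except (possibly) the last one */
	*last_mapsize = stop_blk - *last_bmp;
}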
uint4 jnl_file_extend(jnl_private_control *jpc, uint4 total_jnl_rec_size) { file_control *fc; boolean_t need_extend; jnl_buffer_ptr_t jb; jnl_create_info jnl_info; jnl_file_header *header; unsigned char hdr_buff[REAL_JNL_HDR_LEN + MAX_IO_BLOCK_SIZE]; uint4 new_alq; sgmnt_addrs *csa; sgmnt_data_ptr_t csd; char prev_jnl_fn[JNL_NAME_SIZE]; uint4 jnl_status = 0, status; int new_blocks, warn_blocks, result; gtm_uint64_t avail_blocks; uint4 aligned_tot_jrec_size, count; uint4 jnl_fs_block_size, read_write_size; DCL_THREADGBL_ACCESS; switch(jpc->region->dyn.addr->acc_meth) { case dba_mm: case dba_bg: csa = &FILE_INFO(jpc->region)->s_addrs; break; default: GTMASSERT; } csd = csa->hdr; assert(csa == cs_addrs && csd == cs_data); assert(csa->now_crit || (csd->clustered && (CCST_CLOSED == csa->nl->ccp_state))); assert(&FILE_INFO(jpc->region)->s_addrs == csa); assert(csa->jnl_state == csd->jnl_state); assertpro(JNL_ENABLED(csa) && (NOJNL != jpc->channel) && (!JNL_FILE_SWITCHED(jpc))); /* crit and messing with the journal file - how could it have vanished? */ if (!csd->jnl_deq || (csd->jnl_alq + csd->jnl_deq > csd->autoswitchlimit)) { assert(DIVIDE_ROUND_UP(total_jnl_rec_size, DISK_BLOCK_SIZE) <= csd->jnl_alq); assert(csd->jnl_alq == csd->autoswitchlimit); new_blocks = csd->jnl_alq; } else /* May cause extension of csd->jnl_deq * n blocks where n > 0 */ new_blocks = ROUND_UP(DIVIDE_ROUND_UP(total_jnl_rec_size, DISK_BLOCK_SIZE), csd->jnl_deq); jpc->status = SS_NORMAL; jb = jpc->jnl_buff; assert(0 <= new_blocks); DEBUG_ONLY(count = 0); for (need_extend = (0 != new_blocks); need_extend; ) { DEBUG_ONLY(count++); /* usually we will do the loop just once where we do the file extension. * rarely we might need to do an autoswitch instead after which again rarely * we might need to do an extension on the new journal to fit in the transaction's journal requirements. * therefore we should do this loop a maximum of twice. hence the assert below. */ assert(count <= 2); need_extend = FALSE; if (SS_NORMAL == (status = disk_block_available(jpc->channel, &avail_blocks, TRUE))) { warn_blocks = (csd->jnl_alq + csd->jnl_deq > csd->autoswitchlimit) ? ((csd->jnl_deq > csd->autoswitchlimit) ? csd->jnl_deq : csd->autoswitchlimit) : new_blocks; if ((warn_blocks * EXTEND_WARNING_FACTOR) > avail_blocks) { if (new_blocks > avail_blocks) { /* If we cannot satisfy the request, it is an error, unless the anticipatory freeze * scheme is in effect in which case, we will assume space is available even if * it is not and go ahead with writes to the disk. If the writes fail with ENOSPC * we will freeze the instance and wait for space to become available and keep * retrying the writes. Therefore, we make the NOSPACEEXT a warning in this case. 
*/ SETUP_THREADGBL_ACCESS; if (!ANTICIPATORY_FREEZE_ENABLED(csa)) { send_msg(VARLSTCNT(6) ERR_NOSPACEEXT, 4, JNL_LEN_STR(csd), new_blocks, avail_blocks); new_blocks = 0; jpc->status = SS_NORMAL; break; } else send_msg(VARLSTCNT(6) MAKE_MSG_WARNING(ERR_NOSPACEEXT), 4, JNL_LEN_STR(csd), new_blocks, avail_blocks); } else send_msg(VARLSTCNT(5) ERR_DSKSPACEFLOW, 3, JNL_LEN_STR(csd), (avail_blocks - warn_blocks)); } } else send_msg(VARLSTCNT(5) ERR_JNLFILEXTERR, 2, JNL_LEN_STR(csd), status); new_alq = jb->filesize + new_blocks; /* ensure current journal file size is well within autoswitchlimit --> design constraint */ assert(csd->autoswitchlimit >= jb->filesize); if (csd->autoswitchlimit < (jb->filesize + (EXTEND_WARNING_FACTOR * new_blocks))) /* close to max */ send_msg(VARLSTCNT(5) ERR_JNLSPACELOW, 3, JNL_LEN_STR(csd), csd->autoswitchlimit - jb->filesize); if (csd->autoswitchlimit < new_alq) { /* Reached max, need to autoswitch */ /* Ensure new journal file can hold the entire current transaction's journal record requirements */ assert(csd->autoswitchlimit >= MAX_REQD_JNL_FILE_SIZE(total_jnl_rec_size)); memset(&jnl_info, 0, SIZEOF(jnl_info)); jnl_info.prev_jnl = &prev_jnl_fn[0]; set_jnl_info(gv_cur_region, &jnl_info); assert(JNL_ENABLED(csa) && (NOJNL != jpc->channel) && !(JNL_FILE_SWITCHED(jpc))); jnl_status = jnl_ensure_open(); if (0 == jnl_status) { /* flush the cache and jnl-buffer-contents to current journal file before * switching to a new journal. Set a global variable in_jnl_file_autoswitch * so jnl_write can know not to do the padding check. But because this is a global * variable, we also need to make sure it is reset in case of errors during the * autoswitch (or else calls to jnl_write after we are out of the autoswitch logic * will continue to incorrectly not do the padding check. Hence a condition handler. */ assert(!in_jnl_file_autoswitch); in_jnl_file_autoswitch = TRUE; /* Also make sure time is not changed. This way if "jnl_write" as part of writing a * journal record invokes jnl_file_extend, when the autoswitch is done and writing * of the parent jnl_write resumes, we want it to continue with the same timestamp * and not have to reset its time (non-trivial task) to reflect any changes since then. */ assert(!jgbl.save_dont_reset_gbl_jrec_time); jgbl.save_dont_reset_gbl_jrec_time = jgbl.dont_reset_gbl_jrec_time; jgbl.dont_reset_gbl_jrec_time = TRUE; /* Establish a condition handler so we reset a few global variables that have * temporarily been modified in case of errors inside wcs_flu/jnl_file_close. */ ESTABLISH_RET(jnl_file_autoswitch_ch, EXIT_ERR); /* It is possible we still have not written a PINI record in this journal file * (e.g. mupip extend saw the need to do jnl_file_extend inside jnl_write while * trying to write a PINI record). Write a PINI record in that case before closing * the journal file that way the EOF record will have a non-zero pini_addr. 
*/ if (0 == jpc->pini_addr) jnl_put_jrt_pini(csa); wcs_flu(WCSFLU_FLUSH_HDR | WCSFLU_WRITE_EPOCH | WCSFLU_SPEEDUP_NOBEFORE); jnl_file_close(gv_cur_region, TRUE, TRUE); REVERT; in_jnl_file_autoswitch = FALSE; jgbl.dont_reset_gbl_jrec_time = jgbl.save_dont_reset_gbl_jrec_time; DEBUG_ONLY(jgbl.save_dont_reset_gbl_jrec_time = FALSE); assert((dba_mm == cs_data->acc_meth) || (csd == cs_data)); csd = cs_data; /* In MM, wcs_flu() can remap an extended DB, so reset csd to be sure */ } else { if (SS_NORMAL != jpc->status) rts_error(VARLSTCNT(7) jnl_status, 4, JNL_LEN_STR(csd), DB_LEN_STR(gv_cur_region), jpc->status); else rts_error(VARLSTCNT(6) jnl_status, 4, JNL_LEN_STR(csd), DB_LEN_STR(gv_cur_region)); } assert(!jgbl.forw_phase_recovery || (NULL != jgbl.mur_pini_addr_reset_fnptr)); assert(jgbl.forw_phase_recovery || (NULL == jgbl.mur_pini_addr_reset_fnptr)); if (NULL != jgbl.mur_pini_addr_reset_fnptr) (*jgbl.mur_pini_addr_reset_fnptr)(csa); assert(!jnl_info.no_rename); assert(!jnl_info.no_prev_link); if (EXIT_NRM == cre_jnl_file(&jnl_info)) { assert(0 == memcmp(csd->jnl_file_name, jnl_info.jnl, jnl_info.jnl_len)); assert(csd->jnl_file_name[jnl_info.jnl_len] == '\0'); assert(csd->jnl_file_len == jnl_info.jnl_len); assert(csd->jnl_buffer_size == jnl_info.buffer); assert(csd->jnl_alq == jnl_info.alloc); assert(csd->jnl_deq == jnl_info.extend); assert(csd->jnl_before_image == jnl_info.before_images); csd->jnl_checksum = jnl_info.checksum; csd->jnl_eovtn = csd->trans_hist.curr_tn; send_msg(VARLSTCNT(4) ERR_NEWJNLFILECREAT, 2, JNL_LEN_STR(csd)); fc = gv_cur_region->dyn.addr->file_cntl; fc->op = FC_WRITE; fc->op_buff = (sm_uc_ptr_t)csd; fc->op_len = SGMNT_HDR_LEN; fc->op_pos = 1; status = dbfilop(fc); if (SS_NORMAL != status) send_msg(VARLSTCNT(5) ERR_DBFILERR, 2, DB_LEN_STR(gv_cur_region), status); assert(JNL_ENABLED(csa)); /* call jnl_ensure_open instead of jnl_file_open to make sure jpc->pini_addr is set to 0 */ jnl_status = jnl_ensure_open(); /* sets jpc->status */ if (0 != jnl_status) { if (jpc->status) rts_error(VARLSTCNT(7) jnl_status, 4, JNL_LEN_STR(csd), DB_LEN_STR(gv_cur_region), jpc->status); else rts_error(VARLSTCNT(6) jnl_status, 4, JNL_LEN_STR(csd), DB_LEN_STR(gv_cur_region)); } assert(jb->filesize == csd->jnl_alq); if (csd->jnl_alq + csd->jnl_deq <= csd->autoswitchlimit) { aligned_tot_jrec_size = ALIGNED_ROUND_UP(MAX_REQD_JNL_FILE_SIZE(total_jnl_rec_size), csd->jnl_alq, csd->jnl_deq); if (aligned_tot_jrec_size > csd->jnl_alq) { /* need to extend more than initial allocation in the new journal file * to accommodate the current transaction. 
*/ new_blocks = aligned_tot_jrec_size - csd->jnl_alq; assert(new_blocks); assert(0 == new_blocks % csd->jnl_deq); need_extend = TRUE; } } } else { send_msg(VARLSTCNT(4) ERR_JNLNOCREATE, 2, JNL_LEN_STR(csd)); jpc->status = ERR_JNLNOCREATE; new_blocks = -1; } } else { assert(!need_extend); /* ensure we won't go through the for loop again */ /* Virtually extend currently used journal file */ jnl_fs_block_size = jb->fs_block_size; header = (jnl_file_header *)(ROUND_UP2((uintszofptr_t)hdr_buff, jnl_fs_block_size)); read_write_size = ROUND_UP2(REAL_JNL_HDR_LEN, jnl_fs_block_size); assert((unsigned char *)header + read_write_size <= ARRAYTOP(hdr_buff)); DO_FILE_READ(jpc->channel, 0, header, read_write_size, jpc->status, jpc->status2); if (SS_NORMAL != jpc->status) { assert(FALSE); rts_error(VARLSTCNT(5) ERR_JNLRDERR, 2, JNL_LEN_STR(csd), jpc->status); } assert((header->virtual_size + new_blocks) == new_alq); jb->filesize = new_alq; /* Actually this is virtual file size blocks */ header->virtual_size = new_alq; JNL_DO_FILE_WRITE(csa, csd->jnl_file_name, jpc->channel, 0, header, read_write_size, jpc->status, jpc->status2); if (SS_NORMAL != jpc->status) { assert(FALSE); rts_error(VARLSTCNT(5) ERR_JNLWRERR, 2, JNL_LEN_STR(csd), jpc->status); } } if (0 >= new_blocks) break; } if (0 < new_blocks) { INCR_GVSTATS_COUNTER(csa, csa->nl, n_jnl_extends, 1); return EXIT_NRM; } jpc->status = ERR_JNLREADEOF; jnl_file_lost(jpc, ERR_JNLEXTEND); return EXIT_ERR; }
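The autoswitch path above sizes the replacement journal file by rounding the transaction's journal-record requirement up to the initial allocation (jnl_alq) plus a whole number of extensions (jnl_deq); only the excess over the fresh allocation becomes new_blocks for a further extend. The following minimal sketch illustrates that rounding; aligned_round_up and the sample numbers are illustrative assumptions, not the ALIGNED_ROUND_UP macro or values taken from the code above.

#include <assert.h>
#include <stdint.h>

/* Illustrative stand-in for the ALIGNED_ROUND_UP idea used above: round a required size
 * up to the initial allocation (alq) plus a whole number of extensions (deq). */
static uint64_t aligned_round_up(uint64_t reqd, uint64_t alq, uint64_t deq)
{
	uint64_t excess;

	if (reqd <= alq)
		return alq;				/* the initial allocation already suffices */
	excess = reqd - alq;
	return alq + ((excess + deq - 1) / deq) * deq;	/* round the excess up to a multiple of deq */
}

int main(void)
{	/* e.g. alq = 2048 and deq = 512 blocks, with 2600 blocks of journal records required */
	uint64_t alq = 2048, deq = 512, need = 2600;
	uint64_t aligned = aligned_round_up(need, alq, deq);	/* 3072 */
	uint64_t new_blocks = aligned - alq;			/* 1024: extend beyond the fresh allocation */

	assert(0 == new_blocks % deq);	/* mirrors the assert on new_blocks in the code above */
	return 0;
}

With these numbers the required 2600 blocks round up to 3072, so the newly created journal file would need a further 1024-block extension (two jnl_deq increments) beyond its initial allocation.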
uint4 mur_process_intrpt_recov() { jnl_ctl_list *jctl, *last_jctl; reg_ctl_list *rctl, *rctl_top; int rename_fn_len, save_name_len; char prev_jnl_fn[MAX_FN_LEN + 1], rename_fn[MAX_FN_LEN + 1], save_name[MAX_FN_LEN + 1]; jnl_create_info jnl_info; uint4 status, status2; uint4 max_autoswitchlimit, max_jnl_alq, max_jnl_deq, freeblks; sgmnt_data_ptr_t csd; #if defined(VMS) io_status_block_disk iosb; #endif boolean_t jfh_changed; for (rctl = mur_ctl, rctl_top = mur_ctl + murgbl.reg_total; rctl < rctl_top; rctl++) { gv_cur_region = rctl->gd; /* wcs_flu requires this to be set */ cs_addrs = rctl->csa; csd = cs_data = rctl->csd; /* MM logic after wcs_flu call requires this to be set */ assert(csd == rctl->csa->hdr); jctl = rctl->jctl_turn_around; max_jnl_alq = max_jnl_deq = max_autoswitchlimit = 0; for (last_jctl = NULL ; (NULL != jctl); last_jctl = jctl, jctl = jctl->next_gen) { if (max_autoswitchlimit < jctl->jfh->autoswitchlimit) { /* Note that max_jnl_alq, max_jnl_deq are not the maximum journal allocation/extensions across * generations, but rather the allocation/extension corresponding to the maximum autoswitchlimit. */ max_autoswitchlimit = jctl->jfh->autoswitchlimit; max_jnl_alq = jctl->jfh->jnl_alq; max_jnl_deq = jctl->jfh->jnl_deq; } /* Until now, "rctl->blks_to_upgrd_adjust" holds the number of V4 format newly created bitmap blocks * seen in INCTN records in backward processing. It is possible that backward processing might have * missed out on seeing those INCTN records which are part of virtually-truncated or completely-rolled-back * journal files. The journal file-header has a separate field "prev_recov_blks_to_upgrd_adjust" which * maintains exactly this count. Therefore adjust the rctl counter accordingly. */ assert(!jctl->jfh->prev_recov_blks_to_upgrd_adjust || !jctl->jfh->recover_interrupted); assert(!jctl->jfh->prev_recov_blks_to_upgrd_adjust || jctl->jfh->prev_recov_end_of_data); rctl->blks_to_upgrd_adjust += jctl->jfh->prev_recov_blks_to_upgrd_adjust; } if (max_autoswitchlimit > last_jctl->jfh->autoswitchlimit) { csd->jnl_alq = max_jnl_alq; csd->jnl_deq = max_jnl_deq; csd->autoswitchlimit = max_autoswitchlimit; } else { assert(csd->jnl_alq == last_jctl->jfh->jnl_alq); assert(csd->jnl_deq == last_jctl->jfh->jnl_deq); assert(csd->autoswitchlimit == last_jctl->jfh->autoswitchlimit); } /* now that rctl->blks_to_upgrd_adjust is completely computed, use that to increment filehdr blks_to_upgrd. */ csd->blks_to_upgrd += rctl->blks_to_upgrd_adjust; if (csd->blks_to_upgrd) csd->fully_upgraded = FALSE; jctl = rctl->jctl_turn_around; csd->trans_hist.early_tn = jctl->turn_around_tn; csd->trans_hist.curr_tn = csd->trans_hist.early_tn; /* INCREMENT_CURR_TN macro not used but noted in comment * to identify all places that set curr_tn */ csd->jnl_eovtn = csd->trans_hist.curr_tn; csd->turn_around_point = TRUE; /* MUPIP REORG UPGRADE/DOWNGRADE stores its partially processed state in the database file header. * It is difficult for recovery to restore those fields to a correct partial value. * Hence reset the related fields as if the desired_db_format got set just ONE tn BEFORE the EPOCH record * and that there was no more processing that happened. * This might potentially mean some duplicate processing for MUPIP REORG UPGRADE/DOWNGRADE after the recovery. * But that will only be the case as long as the database is in compatibility (mixed) mode (hopefully not long). 
*/ if (csd->desired_db_format_tn >= jctl->turn_around_tn) csd->desired_db_format_tn = jctl->turn_around_tn - 1; if (csd->reorg_db_fmt_start_tn >= jctl->turn_around_tn) csd->reorg_db_fmt_start_tn = jctl->turn_around_tn - 1; if (csd->tn_upgrd_blks_0 > jctl->turn_around_tn) csd->tn_upgrd_blks_0 = (trans_num)-1; csd->reorg_upgrd_dwngrd_restart_block = 0; /* Compute current value of "free_blocks" based on the value of "free_blocks" at the turnaround point epoch * record and the change in "total_blks" since that epoch to the present form of the database. Any difference * in "total_blks" implies database file extensions happened since the turnaround point. A backward rollback * undoes everything (including all updates) except file extensions (it does not truncate the file size). * Therefore every block that was newly allocated as part of those file extensions should be considered FREE * for the current calculations except for the local bitmap blocks which are BUSY the moment they are created. */ assert(rctl->trnarnd_total_blks <= csd->trans_hist.total_blks); csd->trans_hist.free_blocks = rctl->trnarnd_free_blocks + (csd->trans_hist.total_blks - rctl->trnarnd_total_blks) - DIVIDE_ROUND_UP(csd->trans_hist.total_blks, BLKS_PER_LMAP) + DIVIDE_ROUND_UP(rctl->trnarnd_total_blks, BLKS_PER_LMAP); assert((freeblks = mur_blocks_free(rctl)) == csd->trans_hist.free_blocks); if (dba_bg == csd->acc_meth) /* This is taken from bt_refresh() */ ((th_rec *)((uchar_ptr_t)cs_addrs->th_base + cs_addrs->th_base->tnque.fl))->tn = jctl->turn_around_tn - 1; wcs_flu(WCSFLU_FLUSH_HDR | WCSFLU_FSYNC_DB); csd->turn_around_point = FALSE; /* In case this is MM and wcs_flu() remapped an extended database, reset rctl->csd */ assert((dba_mm == cs_data->acc_meth) || (rctl->csd == cs_data)); rctl->csd = cs_data; } for (rctl = mur_ctl, rctl_top = mur_ctl + murgbl.reg_total; rctl < rctl_top; rctl++) { if (!rctl->jfh_recov_interrupted) jctl = rctl->jctl_turn_around; else { DEBUG_ONLY( for (jctl = rctl->jctl_turn_around; NULL != jctl->next_gen; jctl = jctl->next_gen) ; /* check that latest gener file name does not match db header */ assert((rctl->csd->jnl_file_len != jctl->jnl_fn_len) || (0 != memcmp(rctl->csd->jnl_file_name, jctl->jnl_fn, jctl->jnl_fn_len))); ) jctl = rctl->jctl_alt_head; } assert(NULL != jctl); for ( ; NULL != jctl->next_gen; jctl = jctl->next_gen) ; assert(rctl->csd->jnl_file_len == jctl->jnl_fn_len); /* latest gener file name */ assert(0 == memcmp(rctl->csd->jnl_file_name, jctl->jnl_fn, jctl->jnl_fn_len)); /* should match db header */ if (SS_NORMAL != (status = prepare_unique_name((char *)jctl->jnl_fn, jctl->jnl_fn_len, "", "", rename_fn, &rename_fn_len, &status2))) return status; jctl->jnl_fn_len = rename_fn_len; /* change the name in memory to the proposed name */ memcpy(jctl->jnl_fn, rename_fn, rename_fn_len + 1); /* Rename hasn't happened yet at the filesystem level. In case current recover command is interrupted, * we need to update jfh->next_jnl_file_name before mur_forward(). Update jfh->next_jnl_file_name for * all journal files from which PBLK records were applied. Create new journal files for forward play. 
*/ assert(NULL != rctl->jctl_turn_around); jctl = rctl->jctl_turn_around; /* points to journal file which has current recover's turn around point */ assert(0 != jctl->turn_around_offset); jctl->jfh->turn_around_offset = jctl->turn_around_offset; /* save progress in file header for */ jctl->jfh->turn_around_time = jctl->turn_around_time; /* possible re-issue of recover */ jfh_changed = TRUE; for ( ; NULL != jctl; jctl = jctl->next_gen) { /* setup the next_jnl links. note that in the case of interrupted recovery, next_jnl links * would have been already set starting from the turn-around point journal file of the * interrupted recovery but the new recovery MIGHT have taken us to a still previous * generation journal file that needs its next_jnl link set. this is why we do the next_jnl * link setup even in the case of interrupted recovery although in most cases it is unnecessary. */ if (NULL != jctl->next_gen) { jctl->jfh->next_jnl_file_name_length = jctl->next_gen->jnl_fn_len; memcpy(jctl->jfh->next_jnl_file_name, jctl->next_gen->jnl_fn, jctl->next_gen->jnl_fn_len); jfh_changed = TRUE; } else assert(0 == jctl->jfh->next_jnl_file_name_length); /* null link from latest generation */ if (jctl->jfh->turn_around_offset && (jctl != rctl->jctl_turn_around)) { /* It is possible that the current recovery has a turn-around-point much before the * previously interrupted recovery. If it happens to be a previous generation journal * file then we have to reset the original turn-around-point to be zero in the journal * file header in order to ensure if this recovery gets interrupted we do interrupted * recovery processing until the new turn-around-point instead of stopping incorrectly * at the original turn-around-point itself. Note that there could be more than one * journal file with a non-zero turn_around_offset (depending on how many previous * recoveries got interrupted in this loop) that need to be reset. 
*/ assert(!jctl->turn_around_offset); assert(rctl->recov_interrupted); /* rctl->jfh_recov_interrupted can fail */ jctl->jfh->turn_around_offset = 0; jctl->jfh->turn_around_time = 0; jfh_changed = TRUE; } if (jfh_changed) { DO_FILE_WRITE(jctl->channel, 0, jctl->jfh, REAL_JNL_HDR_LEN, jctl->status, jctl->status2); if (SS_NORMAL != jctl->status) { assert(FALSE); if (SS_NORMAL == jctl->status2) gtm_putmsg(VARLSTCNT(5) ERR_JNLWRERR, 2, jctl->jnl_fn_len, jctl->jnl_fn, jctl->status); else gtm_putmsg(VARLSTCNT1(6) ERR_JNLWRERR, 2, jctl->jnl_fn_len, jctl->jnl_fn, jctl->status, PUT_SYS_ERRNO(jctl->status2)); return jctl->status; } UNIX_ONLY( GTM_FSYNC(jctl->channel, jctl->status); if (-1 == jctl->status) { jctl->status2 = errno; assert(FALSE); gtm_putmsg(VARLSTCNT(9) ERR_JNLFSYNCERR, 2, jctl->jnl_fn_len, jctl->jnl_fn, ERR_TEXT, 2, RTS_ERROR_TEXT("Error with fsync"), jctl->status2); return ERR_JNLFSYNCERR; } ) } jfh_changed = FALSE; } memset(&jnl_info, 0, SIZEOF(jnl_info)); jnl_info.status = jnl_info.status2 = SS_NORMAL; jnl_info.prev_jnl = &prev_jnl_fn[0]; set_jnl_info(rctl->gd, &jnl_info); jnl_info.prev_jnl_len = rctl->jctl_turn_around->jnl_fn_len; memcpy(jnl_info.prev_jnl, rctl->jctl_turn_around->jnl_fn, rctl->jctl_turn_around->jnl_fn_len); jnl_info.prev_jnl[jnl_info.prev_jnl_len] = 0; jnl_info.jnl_len = rctl->csd->jnl_file_len; memcpy(jnl_info.jnl, rctl->csd->jnl_file_name, jnl_info.jnl_len); jnl_info.jnl[jnl_info.jnl_len] = 0; assert(!mur_options.rollback || jgbl.mur_rollback); jnl_info.reg_seqno = rctl->jctl_turn_around->turn_around_seqno; jgbl.gbl_jrec_time = rctl->jctl_turn_around->turn_around_time; /* time needed for cre_jnl_file_common() */ if (EXIT_NRM != cre_jnl_file_common(&jnl_info, rename_fn, rename_fn_len)) { gtm_putmsg(VARLSTCNT(4) ERR_JNLNOCREATE, 2, jnl_info.jnl_len, jnl_info.jnl); return jnl_info.status; } if (NULL != rctl->jctl_alt_head) /* remove the journal files created by last interrupted recover process */ { mur_rem_jctls(rctl); rctl->jctl_alt_head = NULL; } /* From this point on, journal records are written into the newly created journal file. However, we still read * from old journal files. */ }
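Each copy of mur_process_intrpt_recov in this file recomputes free_blocks from the turnaround-point values: blocks gained by post-turnaround file extensions survive a backward rollback and are therefore free, except for the local bitmap blocks created to map them, which are busy from the moment they exist. Below is a small sketch of that arithmetic; the helper name and the 512-block bitmap coverage are assumptions made for illustration only.

#include <assert.h>
#include <stdint.h>

#define BLKS_PER_LMAP	512	/* blocks covered by one local bitmap (value assumed here for illustration) */
#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

/* Free blocks after a backward rollback: start from the free count recorded in the
 * turnaround-point epoch, add every block gained by later file extensions, and subtract
 * the local bitmap blocks those extensions created (bitmaps are BUSY from birth). */
static uint32_t rolled_back_free_blocks(uint32_t trnarnd_free, uint32_t trnarnd_total, uint32_t curr_total)
{
	uint32_t gained, new_lmaps;

	assert(trnarnd_total <= curr_total);	/* extensions only ever grow the file */
	gained = curr_total - trnarnd_total;
	new_lmaps = DIV_ROUND_UP(curr_total, BLKS_PER_LMAP) - DIV_ROUND_UP(trnarnd_total, BLKS_PER_LMAP);
	return trnarnd_free + gained - new_lmaps;
}

For example, 1000 free of 10240 total blocks at the turnaround point and 11264 total blocks now gives 1000 + 1024 - 2 = 2022 free blocks, the two subtracted blocks being the new local bitmaps.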
uint4 mur_process_intrpt_recov() { jnl_ctl_list *jctl; reg_ctl_list *rctl, *rctl_top; int rename_fn_len, save_name_len; char prev_jnl_fn[MAX_FN_LEN + 1], rename_fn[MAX_FN_LEN + 1], save_name[MAX_FN_LEN + 1]; jnl_create_info jnl_info; uint4 status, status2; uint4 max_autoswitchlimit, max_jnl_alq, max_jnl_deq; #if defined(VMS) io_status_block_disk iosb; #endif boolean_t jfh_changed, first_time; error_def (ERR_PREMATEOF); /* for DO_FILE_WRITE */ error_def (ERR_JNLCREATERR); error_def (ERR_JNLWRERR); error_def (ERR_JNLFSYNCERR); error_def (ERR_TEXT); for (mur_regno = 0, rctl = mur_ctl, rctl_top = mur_ctl + murgbl.reg_total; rctl < rctl_top; rctl++, mur_regno++) { cs_addrs = rctl->csa; cs_data = rctl->csd; assert(rctl->csd == rctl->csa->hdr); mur_jctl = jctl = rctl->jctl_turn_around; assert(rctl->csd == rctl->csa->hdr); mur_master_map(); cs_data->trans_hist.header_open_tn = cs_data->trans_hist.early_tn = cs_data->trans_hist.curr_tn = jctl->turn_around_tn; cs_data->trans_hist.free_blocks = mur_blocks_free(); if (dba_bg == cs_data->acc_meth) /* This is taken from bt_refresh() */ ((th_rec *)((uchar_ptr_t)cs_addrs->th_base + cs_addrs->th_base->tnque.fl))->tn = jctl->turn_around_tn - 1; max_jnl_alq = max_jnl_deq = max_autoswitchlimit = 0; for ( ; (NULL != jctl->next_gen); jctl = jctl->next_gen) { if (max_autoswitchlimit < jctl->jfh->autoswitchlimit) { /* Note that max_jnl_alq, max_jnl_deq are not the maximum journal allocation/extensions across * generations, but rather the allocation/extension corresponding to the maximum autoswitchlimit. */ max_autoswitchlimit = jctl->jfh->autoswitchlimit; max_jnl_alq = jctl->jfh->jnl_alq; max_jnl_deq = jctl->jfh->jnl_deq; } } if (max_autoswitchlimit > jctl->jfh->autoswitchlimit) { cs_data->jnl_alq = max_jnl_alq; cs_data->jnl_deq = max_jnl_deq; cs_data->autoswitchlimit = max_autoswitchlimit; } else { assert(cs_data->jnl_alq == jctl->jfh->jnl_alq); assert(cs_data->jnl_deq == jctl->jfh->jnl_deq); assert(cs_data->autoswitchlimit == jctl->jfh->autoswitchlimit); } gv_cur_region = rctl->gd; wcs_flu(WCSFLU_FLUSH_HDR | WCSFLU_FSYNC_DB); } for (mur_regno = 0, rctl = mur_ctl, rctl_top = mur_ctl + murgbl.reg_total; rctl < rctl_top; rctl++, mur_regno++) { if (!rctl->jfh_recov_interrupted) jctl = rctl->jctl_turn_around; else { DEBUG_ONLY( for (jctl = rctl->jctl_turn_around; NULL != jctl->next_gen; jctl = jctl->next_gen) ; /* check that latest gener file name does not match db header */ assert((rctl->csd->jnl_file_len != jctl->jnl_fn_len) || (0 != memcmp(rctl->csd->jnl_file_name, jctl->jnl_fn, jctl->jnl_fn_len))); ) jctl = rctl->jctl_alt_head; } assert(NULL != jctl); for ( ; NULL != jctl->next_gen; jctl = jctl->next_gen) ; assert(rctl->csd->jnl_file_len == jctl->jnl_fn_len); /* latest gener file name */ assert(0 == memcmp(rctl->csd->jnl_file_name, jctl->jnl_fn, jctl->jnl_fn_len)); /* should match db header */ if (SS_NORMAL != (status = prepare_unique_name((char *)jctl->jnl_fn, jctl->jnl_fn_len, "", "", rename_fn, &rename_fn_len, &status2))) return status; jctl->jnl_fn_len = rename_fn_len; /* change the name in memory to the proposed name */ memcpy(jctl->jnl_fn, rename_fn, rename_fn_len + 1); /* Rename hasn't happened yet at the filesystem level. In case current recover command is interrupted, * we need to update jfh->next_jnl_file_name before mur_forward(). Update jfh->next_jnl_file_name for * all journal files from which PBLK records were applied. Create new journal files for forward play. 
*/ assert(NULL != rctl->jctl_turn_around); jctl = rctl->jctl_turn_around; /* points to journal file which has current recover's turn around point */ assert(0 != jctl->turn_around_offset); jctl->jfh->turn_around_offset = jctl->turn_around_offset; /* save progress in file header for */ jctl->jfh->turn_around_time = jctl->turn_around_time; /* possible re-issue of recover */ jfh_changed = TRUE; DEBUG_ONLY(first_time = TRUE;) for ( ; NULL != jctl; jctl = jctl->next_gen)
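The loop begun above (shown in full in the other copies of this function) walks the chain of journal generations and records each generation's successor name in its file header so that a later recovery can follow the chain forward. The sketch below illustrates that link-up over a simplified generation list; the struct and field names are stand-ins, not the actual jnl_ctl_list or jnl_file_header layout.

#include <stddef.h>
#include <string.h>

#define MAX_FN_LEN	255

/* Simplified stand-ins for the journal generation chain; illustration only. */
typedef struct jnl_gen_struct
{
	char			fn[MAX_FN_LEN + 1];		/* this generation's journal file name */
	size_t			fn_len;
	char			next_fn[MAX_FN_LEN + 1];	/* forward link recorded in the file header */
	size_t			next_fn_len;
	struct jnl_gen_struct	*next_gen;			/* next (newer) generation, NULL for the latest */
} jnl_gen;

/* Record each generation's successor so forward processing can follow the chain. */
static void link_generations(jnl_gen *oldest)
{
	jnl_gen *g;

	for (g = oldest; NULL != g; g = g->next_gen)
	{
		if (NULL != g->next_gen)
		{
			g->next_fn_len = g->next_gen->fn_len;
			memcpy(g->next_fn, g->next_gen->fn, g->next_gen->fn_len + 1);
		} else
			g->next_fn_len = 0;	/* null link from the latest generation */
	}
}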
void gv_select(char *cli_buff, int n_len, boolean_t freeze, char opname[], glist *gl_head, int *reg_max_rec, int *reg_max_key, int *reg_max_blk) { bool stashed = FALSE; int num_quote, len; char *ptr, *ptr1, *c; mstr gmap[256], *gmap_ptr, gmap_beg, gmap_end; mval val, curr_gbl_name; glist *gl_tail, *gl_ptr; htab_desc *ext_hash; mname ht_mname; ht_entry *h; error_def(ERR_FREEZE); error_def(ERR_DBRDONLY); error_def(ERR_SELECTSYNTAX); error_def(ERR_MUNOFINISH); error_def(ERR_MUNOACTION); memset(gmap, 0, sizeof(gmap)); for (ptr = cli_buff; *ptr; ptr = ptr1) { for (ptr1 = ptr; ; ptr1++) { if (',' == *ptr1) { len = ptr1 - ptr; ptr1++; break; } else if (!*ptr1) { len = ptr1 - ptr; break; } } gmap_beg.addr = ptr; c = gmap_beg.addr + len - 1; num_quote = 0; while ('"' == *c) { len--; c--; num_quote++; } if (0 >= len) { gtm_putmsg(VARLSTCNT(4) ERR_SELECTSYNTAX, 2, LEN_AND_STR(opname)); mupip_exit(ERR_MUNOACTION); } c = gmap_beg.addr; while (0 < num_quote) { if ('"' == *c) { c++; len--; } else { gtm_putmsg(VARLSTCNT(4) ERR_SELECTSYNTAX, 2, LEN_AND_STR(opname)); mupip_exit(ERR_MUNOACTION); } num_quote--; } gmap_beg.addr = c; if ('^' == *c) { gmap_beg.addr++; len--; } gmap_beg.len = len; c = mu_extr_ident(&gmap_beg); len -= (c - gmap_beg.addr); assert(len >= 0); if (0 == len) gmap_end = gmap_beg; else if (gmap_beg.len == 1 && '*' == *c) { gmap_beg.addr = (char*)&percent_lit; gmap_beg.len = sizeof(percent_lit); gmap_end.addr = (char*)&tilde_lit; gmap_end.len = sizeof(tilde_lit); } else if (1 == len && '*' == *c) { gmap_end = gmap_beg; gmap_beg.len--; *c = '~'; } else if (':' != *c) { gtm_putmsg(VARLSTCNT(4) ERR_SELECTSYNTAX, 2, LEN_AND_STR(opname)); mupip_exit(ERR_MUNOACTION); } else { gmap_beg.len = c - gmap_beg.addr; c++; gmap_end.addr = c; gmap_end.len = len - 1; if ('^' == *c) { gmap_end.addr++; gmap_end.len--; } c = mu_extr_ident(&gmap_end); if (c - gmap_end.addr != gmap_end.len || mstrcmp(&gmap_beg, &gmap_end) > 0) { gtm_putmsg(VARLSTCNT(4) ERR_SELECTSYNTAX, 2, LEN_AND_STR(opname)); mupip_exit(ERR_MUNOACTION); } } global_map(gmap, &gmap_beg, &gmap_end); } if (freeze) { ext_hash = (htab_desc *)malloc(sizeof(htab_desc)); ht_init(ext_hash, 0); memset(&ht_mname.txt[0], 0, sizeof(mname)); } gl_head->next = NULL; gl_tail = gl_head; *reg_max_rec = 0; *reg_max_key = 0; *reg_max_blk = 0; for (gmap_ptr = gmap; gmap_ptr->addr ; gmap_ptr++) { curr_gbl_name.mvtype = MV_STR; curr_gbl_name.str = *gmap_ptr++; op_gvname(VARLSTCNT(1) &curr_gbl_name); if (dba_cm == gv_cur_region->dyn.addr->acc_meth) { util_out_print("Can not select globals from region !AD across network",TRUE,gv_cur_region->rname_len, gv_cur_region->rname); mupip_exit(ERR_MUNOFINISH); } if (dba_bg != gv_cur_region->dyn.addr->acc_meth && dba_mm != gv_cur_region->dyn.addr->acc_meth) { assert(gv_cur_region->dyn.addr->acc_meth == dba_usr); util_out_print("Can not select globals from non-GTC region !AD",TRUE,gv_cur_region->rname_len, gv_cur_region->rname); mupip_exit(ERR_MUNOFINISH); } op_gvdata(&val); if (0 == val.m[1]) { op_gvname(VARLSTCNT(1) &curr_gbl_name); op_gvorder(&curr_gbl_name); if (!curr_gbl_name.str.len) break; assert('^' == *curr_gbl_name.str.addr); curr_gbl_name.str.addr++; curr_gbl_name.str.len--; } for (;;) { if (mstrcmp(&curr_gbl_name.str, gmap_ptr) > 0) break; if (freeze) { memcpy(&ht_mname.txt[0], &gv_cur_region, sizeof(int4)); h = ht_put(ext_hash, &ht_mname, &stashed); if (stashed) { if (cs_addrs->hdr->freeze) { gtm_putmsg(VARLSTCNT(4) ERR_FREEZE, 2, gv_cur_region->rname_len, gv_cur_region->rname); mupip_exit(ERR_MUNOFINISH); } /* 
Cannot proceed for read-only data files */ if (gv_cur_region->read_only) { util_out_print("Cannot freeze the database",TRUE); gtm_putmsg(VARLSTCNT(4) ERR_DBRDONLY, 2, DB_LEN_STR(gv_cur_region)); mupip_exit(ERR_MUNOFINISH); } while (FALSE == region_freeze(gv_cur_region, TRUE, FALSE)) hiber_start(1000); wcs_flu(WCSFLU_FLUSH_HDR | WCSFLU_WRITE_EPOCH | WCSFLU_SYNC_EPOCH); } } assert(curr_gbl_name.str.len > 0); gl_ptr = (glist*)malloc(sizeof(glist) - 1 + curr_gbl_name.str.len); gl_ptr->name.mvtype = MV_STR; gl_ptr->name.str.addr = (char*)gl_ptr->nbuf; gl_ptr->name.str.len = curr_gbl_name.str.len; memcpy(gl_ptr->nbuf, curr_gbl_name.str.addr, curr_gbl_name.str.len); gl_ptr->next = 0; gl_tail->next = gl_ptr; gl_tail = gl_ptr; if (*reg_max_rec < cs_data->max_rec_size) *reg_max_rec = cs_data->max_rec_size; if (*reg_max_key < cs_data->max_key_size) *reg_max_key = cs_data->max_key_size; if (*reg_max_blk < cs_data->blk_size) *reg_max_blk = cs_data->blk_size; op_gvname(VARLSTCNT(1) &gl_tail->name); op_gvorder(&curr_gbl_name); if (0 == curr_gbl_name.str.len) { (gmap_ptr + 1)->addr = 0; break; } assert('^' == *curr_gbl_name.str.addr); curr_gbl_name.str.addr++; curr_gbl_name.str.len--; } } }
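gv_select above turns each comma-separated selection token into a begin/end global-name pair: a bare name selects one global, a lone '*' selects everything from '%' through '~', a trailing '*' selects a name prefix, and 'A:B' selects an inclusive range, after surrounding quotes and a leading '^' have been stripped. The classifier below is only an illustrative sketch of those cases; it is not the parser above and it ignores quote handling.

#include <stddef.h>

/* Hypothetical classifier for one selection token, mirroring the branches in gv_select.
 * Quote stripping and the leading '^' are assumed to have been handled already. */
typedef enum { SEL_EXACT, SEL_ALL, SEL_PREFIX, SEL_RANGE, SEL_BAD } sel_kind;

static sel_kind classify_select_token(const char *tok, size_t len)
{
	size_t i;

	if (0 == len)
		return SEL_BAD;			/* empty token is a syntax error */
	if (1 == len && '*' == tok[0])
		return SEL_ALL;			/* "*"     : every global, % through ~ */
	if ('*' == tok[len - 1])
		return SEL_PREFIX;		/* "ABC*"  : every global starting with ABC */
	for (i = 0; i < len; i++)
		if (':' == tok[i])
			return SEL_RANGE;	/* "A:B"   : inclusive name range */
	return SEL_EXACT;			/* "ABC"   : a single global */
}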
uint4 mur_process_intrpt_recov() { jnl_ctl_list *jctl, *last_jctl; reg_ctl_list *rctl, *rctl_top; int rename_fn_len, save_name_len, idx; char prev_jnl_fn[MAX_FN_LEN + 1], rename_fn[MAX_FN_LEN + 1], save_name[MAX_FN_LEN + 1]; jnl_create_info jnl_info; uint4 status, status2; uint4 max_autoswitchlimit, max_jnl_alq, max_jnl_deq, freeblks; sgmnt_data_ptr_t csd; jnl_private_control *jpc; jnl_buffer_ptr_t jbp; boolean_t jfh_changed; jnl_record *jnlrec; jnl_file_header *jfh; jnl_tm_t now; for (rctl = mur_ctl, rctl_top = mur_ctl + murgbl.reg_total; rctl < rctl_top; rctl++) { TP_CHANGE_REG(rctl->gd); csd = cs_data; /* MM logic after wcs_flu call requires this to be set */ assert(csd == rctl->csa->hdr); jctl = rctl->jctl_turn_around; max_jnl_alq = max_jnl_deq = max_autoswitchlimit = 0; for (last_jctl = NULL ; (NULL != jctl); last_jctl = jctl, jctl = jctl->next_gen) { jfh = jctl->jfh; if (max_autoswitchlimit < jfh->autoswitchlimit) { /* Note that max_jnl_alq, max_jnl_deq are not the maximum journal allocation/extensions across * generations, but rather the allocation/extension corresponding to the maximum autoswitchlimit. */ max_autoswitchlimit = jfh->autoswitchlimit; max_jnl_alq = jfh->jnl_alq; max_jnl_deq = jfh->jnl_deq; } /* Until now, "rctl->blks_to_upgrd_adjust" holds the number of V4 format newly created bitmap blocks * seen in INCTN records in backward processing. It is possible that backward processing might have * missed out on seeing those INCTN records which are part of virtually-truncated or completely-rolled-back * journal files. The journal file-header has a separate field "prev_recov_blks_to_upgrd_adjust" which * maintains exactly this count. Therefore adjust the rctl counter accordingly. */ assert(!jfh->prev_recov_blks_to_upgrd_adjust || !jfh->recover_interrupted); assert(!jfh->prev_recov_blks_to_upgrd_adjust || jfh->prev_recov_end_of_data); rctl->blks_to_upgrd_adjust += jfh->prev_recov_blks_to_upgrd_adjust; } if (max_autoswitchlimit > last_jctl->jfh->autoswitchlimit) { csd->jnl_alq = max_jnl_alq; csd->jnl_deq = max_jnl_deq; csd->autoswitchlimit = max_autoswitchlimit; } else { assert(csd->jnl_alq == last_jctl->jfh->jnl_alq); assert(csd->jnl_deq == last_jctl->jfh->jnl_deq); assert(csd->autoswitchlimit == last_jctl->jfh->autoswitchlimit); } jctl = rctl->jctl_turn_around; /* Get a pointer to the turn around point EPOCH record */ jnlrec = rctl->mur_desc->jnlrec; assert(JRT_EPOCH == jnlrec->prefix.jrec_type); assert(jctl->turn_around_time == jnlrec->prefix.time); assert(jctl->turn_around_seqno == jnlrec->jrec_epoch.jnl_seqno); assert(jctl->turn_around_tn == jnlrec->prefix.tn); assert(jctl->rec_offset == jctl->turn_around_offset); /* Reset file-header "blks_to_upgrd" counter to the turn around point epoch value. Adjust this to include * the number of new V4 format bitmaps created by post-turnaround-point db file extensions. * The adjustment value is maintained in rctl->blks_to_upgrd_adjust. */ csd->blks_to_upgrd = jnlrec->jrec_epoch.blks_to_upgrd; csd->blks_to_upgrd += rctl->blks_to_upgrd_adjust; # ifdef GTM_TRIGGER /* online rollback can potentially take the database to a point in the past where the triggers that were * previously installed are no longer a part of the current database state and so any process that restarts * AFTER online rollback completes SHOULD reload triggers and the only way to do that is by incrementing the * db_trigger_cycle in the file header. 
*/ if (jgbl.onlnrlbk && (0 < csd->db_trigger_cycle)) { /* The check for a non-zero db_trigger_cycle is to prevent other processes (continuing after online rollback) * from establishing implicit TP (on seeing the trigger cycle mismatch) when there are actually no triggers * installed in the database (because there were none at the start of online rollback). */ csd->db_trigger_cycle++; if (0 == csd->db_trigger_cycle) csd->db_trigger_cycle = 1; /* Don't allow cycle set to 0 which means uninitialized */ } # endif assert((WBTEST_ALLOW_ARBITRARY_FULLY_UPGRADED == gtm_white_box_test_case_number) || (FALSE == jctl->turn_around_fullyupgraded) || (TRUE == jctl->turn_around_fullyupgraded)); /* Set csd->fully_upgraded to FALSE if: * a) The turn around EPOCH had the fully_upgraded field set to FALSE * OR * b) If csd->blks_to_upgrd counter is non-zero. This field can be non-zero even if the turnaround EPOCH's * fully_upgraded field is TRUE. This is possible if the database was downgraded to V4 (post turnaround EPOCH) * format and database extensions happened causing new V4 format bitmap blocks to be written. The count of V4 * format bitmap blocks is maintained ONLY as part of INCTN records (with INCTN opcode SET_JNL_FILE_CLOSE_EXTEND) * noted down in rctl->blks_to_upgrd_adjust counter as part of BACKWARD processing which are finally added to * csd->blks_to_upgrd. */ if (!jctl->turn_around_fullyupgraded || csd->blks_to_upgrd) csd->fully_upgraded = FALSE; csd->trans_hist.early_tn = jctl->turn_around_tn; csd->trans_hist.curr_tn = csd->trans_hist.early_tn; /* INCREMENT_CURR_TN macro not used but noted in comment * to identify all places that set curr_tn */ csd->jnl_eovtn = csd->trans_hist.curr_tn; csd->turn_around_point = TRUE; /* MUPIP REORG UPGRADE/DOWNGRADE stores its partially processed state in the database file header. * It is difficult for recovery to restore those fields to a correct partial value. * Hence reset the related fields as if the desired_db_format got set just ONE tn BEFORE the EPOCH record * and that there was no more processing that happened. * This might potentially mean some duplicate processing for MUPIP REORG UPGRADE/DOWNGRADE after the recovery. * But that will only be the case as long as the database is in compatibility (mixed) mode (hopefully not long). */ if (csd->desired_db_format_tn >= jctl->turn_around_tn) csd->desired_db_format_tn = jctl->turn_around_tn - 1; if (csd->reorg_db_fmt_start_tn >= jctl->turn_around_tn) csd->reorg_db_fmt_start_tn = jctl->turn_around_tn - 1; if (csd->tn_upgrd_blks_0 > jctl->turn_around_tn) csd->tn_upgrd_blks_0 = (trans_num)-1; csd->reorg_upgrd_dwngrd_restart_block = 0; /* Compute current value of "free_blocks" based on the value of "free_blocks" at the turnaround point epoch * record and the change in "total_blks" since that epoch to the present form of the database. Any difference * in "total_blks" implies database file extensions happened since the turnaround point. A backward rollback * undoes everything (including all updates) except file extensions (it does not truncate the file size). * Therefore every block that was newly allocated as part of those file extensions should be considered FREE * for the current calculations except for the local bitmap blocks which are BUSY the moment they are created. 
*/ assert(jnlrec->jrec_epoch.total_blks <= csd->trans_hist.total_blks); csd->trans_hist.free_blocks = jnlrec->jrec_epoch.free_blocks + (csd->trans_hist.total_blks - jnlrec->jrec_epoch.total_blks) - DIVIDE_ROUND_UP(csd->trans_hist.total_blks, BLKS_PER_LMAP) + DIVIDE_ROUND_UP(jnlrec->jrec_epoch.total_blks, BLKS_PER_LMAP); assert(!csd->blks_to_upgrd || !csd->fully_upgraded); assert((freeblks = mur_blocks_free(rctl)) == csd->trans_hist.free_blocks); /* Update strm_reg_seqno[] in db file header to reflect the turn around point. * Before updating "strm_reg_seqno", make sure value is saved into "save_strm_reg_seqno". * This is relied upon by the function "mur_get_max_strm_reg_seqno" in case of interrupted rollback. */ for (idx = 0; idx < MAX_SUPPL_STRMS; idx++) { if (!csd->save_strm_reg_seqno[idx]) csd->save_strm_reg_seqno[idx] = csd->strm_reg_seqno[idx]; csd->strm_reg_seqno[idx] = jnlrec->jrec_epoch.strm_seqno[idx]; } wcs_flu(WCSFLU_FLUSH_HDR | WCSFLU_FSYNC_DB); assert(cs_addrs->ti->curr_tn == jctl->turn_around_tn); # ifdef UNIX if (jgbl.onlnrlbk) { if (dba_bg == cs_addrs->hdr->acc_meth) { /* dryclean the cache (basically reset the cycle fields in all the cache records) so as to make * GT.M processes that only do 'reads' require crit and hence realize that online rollback * is in progress */ bt_refresh(cs_addrs, FALSE); /* sets earliest bt TN to be the turn around TN */ } db_csh_ref(cs_addrs, FALSE); assert(NULL != cs_addrs->jnl); jpc = cs_addrs->jnl; assert(NULL != jpc->jnl_buff); jbp = jpc->jnl_buff; /* Since Rollback simulates the journal record along with the timestamp at which the update was made, it * sets jgbl.dont_reset_gbl_jrec_time to TRUE so that during forward processing t_end or tp_tend does not * reset the gbl_jrec_time to reflect the current time. But, with Online Rollback, one can have the shared * memory up and running and hence can have jbp->prev_jrec_time to be the time of the most recent journal * update made. Later in t_end/tp_tend, ADJUST_GBL_JREC_TIME is invoked which ensures that if ever * gbl_jrec_time (the time of the current update) is less than jbp->prev_jrec_time (time of the latest * journal update), dont_reset_gbl_jrec_time better be FALSE. But, this assert will trip since Rollback * sets the latter to TRUE. To fix this, set jbp->prev_jrec_time to the turn around time stamp. 
This way * we are guaranteed that all the updates done in the forward processing will have a timestamp that is * greater than the turn around timestamp */ SET_JNLBUFF_PREV_JREC_TIME(jbp, jctl->turn_around_time, DO_GBL_JREC_TIME_CHECK_FALSE); } else if (dba_bg == csd->acc_meth) { /* set earliest bt TN to be the turn-around TN (taken from bt_refresh()) */ SET_OLDEST_HIST_TN(cs_addrs, cs_addrs->ti->curr_tn - 1); } # else if (dba_bg == csd->acc_meth) { /* set earliest bt TN to be the turn-around TN (taken from bt_refresh()) */ SET_OLDEST_HIST_TN(cs_addrs, cs_addrs->ti->curr_tn - 1); } # endif csd->turn_around_point = FALSE; assert(OLDEST_HIST_TN(cs_addrs) == (cs_addrs->ti->curr_tn - 1)); /* In case this is MM and wcs_flu() remapped an extended database, reset rctl->csd */ assert((dba_mm == cs_data->acc_meth) || (rctl->csd == cs_data)); rctl->csd = cs_data; } JNL_SHORT_TIME(now); for (rctl = mur_ctl, rctl_top = mur_ctl + murgbl.reg_total; rctl < rctl_top; rctl++) { TP_CHANGE_REG_IF_NEEDED(rctl->gd); if (!rctl->jfh_recov_interrupted) jctl = rctl->jctl_turn_around; else { DEBUG_ONLY( for (jctl = rctl->jctl_turn_around; NULL != jctl->next_gen; jctl = jctl->next_gen) ; /* check that latest gener file name does not match db header */ assert((rctl->csd->jnl_file_len != jctl->jnl_fn_len) || (0 != memcmp(rctl->csd->jnl_file_name, jctl->jnl_fn, jctl->jnl_fn_len))); ) jctl = rctl->jctl_alt_head; } assert(NULL != jctl); for ( ; NULL != jctl->next_gen; jctl = jctl->next_gen) ; assert(rctl->csd->jnl_file_len == jctl->jnl_fn_len); /* latest gener file name */ assert(0 == memcmp(rctl->csd->jnl_file_name, jctl->jnl_fn, jctl->jnl_fn_len)); /* should match db header */ if (SS_NORMAL != (status = prepare_unique_name((char *)jctl->jnl_fn, jctl->jnl_fn_len, "", "", rename_fn, &rename_fn_len, now, &status2))) return status; jctl->jnl_fn_len = rename_fn_len; /* change the name in memory to the proposed name */ memcpy(jctl->jnl_fn, rename_fn, rename_fn_len + 1); /* Rename hasn't happened yet at the filesystem level. In case current recover command is interrupted, * we need to update jfh->next_jnl_file_name before mur_forward(). Update jfh->next_jnl_file_name for * all journal files from which PBLK records were applied. Create new journal files for forward play. */ assert(NULL != rctl->jctl_turn_around); jctl = rctl->jctl_turn_around; /* points to journal file which has current recover's turn around point */ assert(0 != jctl->turn_around_offset); jfh = jctl->jfh; jfh->turn_around_offset = jctl->turn_around_offset; /* save progress in file header for */ jfh->turn_around_time = jctl->turn_around_time; /* possible re-issue of recover */ for (idx = 0; idx < MAX_SUPPL_STRMS; idx++) jfh->strm_end_seqno[idx] = csd->strm_reg_seqno[idx]; jfh_changed = TRUE; /* We are about to update the journal file header of the turnaround-point journal file to store the * non-zero jfh->turn_around_offset. Ensure corresponding database is considered updated. * This is needed in case journal recovery/rollback terminates abnormally and we go to mur_close_files. * We need to ensure csd->recov_interrupted does not get reset to FALSE even if this region did not have * have any updates to the corresponding database file otherwise. (GTM-8394) */ rctl->db_updated = TRUE; for ( ; NULL != jctl; jctl = jctl->next_gen) { /* setup the next_jnl links. 
note that in the case of interrupted recovery, next_jnl links * would have been already set starting from the turn-around point journal file of the * interrupted recovery but the new recovery MIGHT have taken us to a still previous * generation journal file that needs its next_jnl link set. this is why we do the next_jnl * link setup even in the case of interrupted recovery although in most cases it is unnecessary. */ jfh = jctl->jfh; if (NULL != jctl->next_gen) { jfh->next_jnl_file_name_length = jctl->next_gen->jnl_fn_len; memcpy(jfh->next_jnl_file_name, jctl->next_gen->jnl_fn, jctl->next_gen->jnl_fn_len); jfh_changed = TRUE; } else assert(0 == jfh->next_jnl_file_name_length); /* null link from latest generation */ if (jfh->turn_around_offset && (jctl != rctl->jctl_turn_around)) { /* It is possible that the current recovery has a turn-around-point much before the * previously interrupted recovery. If it happens to be a previous generation journal * file then we have to reset the original turn-around-point to be zero in the journal * file header in order to ensure if this recovery gets interrupted we do interrupted * recovery processing until the new turn-around-point instead of stopping incorrectly * at the original turn-around-point itself. Note that there could be more than one * journal file with a non-zero turn_around_offset (depending on how many previous * recoveries got interrupted in this loop) that need to be reset. */ assert(!jctl->turn_around_offset); assert(rctl->recov_interrupted || rctl->jctl_apply_pblk); /* rctl->jfh_recov_interrupted can fail */ jfh->turn_around_offset = 0; jfh->turn_around_time = 0; jfh_changed = TRUE; } if (jfh_changed) { /* Since overwriting the journal file header (an already allocated block * in the file) should not cause ENOSPC, we dont take the trouble of * passing csa or jnl_fn (first two parameters). Instead we pass NULL. 
*/ JNL_DO_FILE_WRITE(NULL, NULL, jctl->channel, 0, jfh, REAL_JNL_HDR_LEN, jctl->status, jctl->status2); if (SS_NORMAL != jctl->status) { assert(FALSE); if (SS_NORMAL == jctl->status2) gtm_putmsg_csa(CSA_ARG(rctl->csa) VARLSTCNT(5) ERR_JNLWRERR, 2, jctl->jnl_fn_len, jctl->jnl_fn, jctl->status); else gtm_putmsg_csa(CSA_ARG(rctl->csa) VARLSTCNT1(6) ERR_JNLWRERR, 2, jctl->jnl_fn_len, jctl->jnl_fn, jctl->status, PUT_SYS_ERRNO(jctl->status2)); return jctl->status; } GTM_JNL_FSYNC(rctl->csa, jctl->channel, jctl->status); if (-1 == jctl->status) { jctl->status2 = errno; assert(FALSE); gtm_putmsg_csa(CSA_ARG(rctl->csa) VARLSTCNT(9) ERR_JNLFSYNCERR, 2, jctl->jnl_fn_len, jctl->jnl_fn, ERR_TEXT, 2, RTS_ERROR_TEXT("Error with fsync"), jctl->status2); return ERR_JNLFSYNCERR; } } jfh_changed = FALSE; } memset(&jnl_info, 0, SIZEOF(jnl_info)); jnl_info.status = jnl_info.status2 = SS_NORMAL; jnl_info.prev_jnl = &prev_jnl_fn[0]; set_jnl_info(rctl->gd, &jnl_info); jnl_info.prev_jnl_len = rctl->jctl_turn_around->jnl_fn_len; memcpy(jnl_info.prev_jnl, rctl->jctl_turn_around->jnl_fn, rctl->jctl_turn_around->jnl_fn_len); jnl_info.prev_jnl[jnl_info.prev_jnl_len] = 0; jnl_info.jnl_len = rctl->csd->jnl_file_len; memcpy(jnl_info.jnl, rctl->csd->jnl_file_name, jnl_info.jnl_len); jnl_info.jnl[jnl_info.jnl_len] = 0; assert(!mur_options.rollback || jgbl.mur_rollback); jnl_info.reg_seqno = rctl->jctl_turn_around->turn_around_seqno; jgbl.gbl_jrec_time = rctl->jctl_turn_around->turn_around_time; /* time needed for cre_jnl_file_common() */ if (EXIT_NRM != cre_jnl_file_common(&jnl_info, rename_fn, rename_fn_len)) { gtm_putmsg_csa(CSA_ARG(rctl->csa) VARLSTCNT(4) ERR_JNLNOCREATE, 2, jnl_info.jnl_len, jnl_info.jnl); return jnl_info.status; } # ifdef UNIX if (jgbl.onlnrlbk) { cs_addrs = rctl->csa; /* Mimic what jnl_file_close does in the case of a cleanly closed journal file */ jpc = cs_addrs->jnl; /* the previous loop makes sure cs_addrs->jnl->jnl_buff is valid */ NULLIFY_JNL_FILE_ID(cs_addrs); jpc->jnl_buff->cycle++; /* so that all other processes know to switch to the newer journal file */ jpc->cycle--; /* decrement cycle so jnl_ensure_open() knows to reopen the journal */ } # endif if (NULL != rctl->jctl_alt_head) /* remove the journal files created by last interrupted recover process */ { mur_rem_jctls(rctl); rctl->jctl_alt_head = NULL; } /* From this point on, journal records are written into the newly created journal file. However, we still read * from old journal files. */ }
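The GTM_TRIGGER block in the preceding function bumps csd->db_trigger_cycle after an online rollback so that surviving processes notice the cycle mismatch and reload their triggers, while keeping 0 reserved to mean "uninitialized". A minimal sketch of that wrap-safe increment follows; the function name is illustrative only.

#include <stdint.h>

/* Wrap-safe counter bump mirroring the db_trigger_cycle handling above: an increment
 * that wraps around to 0 is pushed on to 1 so the reserved value is never reported. */
static uint32_t bump_trigger_cycle(uint32_t cycle)
{
	cycle++;
	if (0 == cycle)
		cycle = 1;	/* 0 means "uninitialized"; skip it on wraparound */
	return cycle;
}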