void dse_maps(void)
{
	block_id		blk, bml_blk;
	blk_segment		*bs1, *bs_ptr;
	int4			blk_seg_cnt, blk_size;	/* needed for BLK_INIT, BLK_SEG and BLK_FINI macros */
	sm_uc_ptr_t		bp;
	char			util_buff[MAX_UTIL_LEN];
	int4			bml_size, bml_list_size, blk_index, bml_index;
	int4			total_blks, blks_in_bitmap;
	int4			bplmap, dummy_int;
	unsigned char		*bml_list;
	cache_rec_ptr_t		cr, dummy_cr;
	bt_rec_ptr_t		btr;
	int			util_len;
	uchar_ptr_t		blk_ptr;
	boolean_t		was_crit;
	uint4			jnl_status;
	srch_blk_status		blkhist;
	jnl_private_control	*jpc;
	jnl_buffer_ptr_t	jbp;
	sgmnt_addrs		*csa;
	sgmnt_data_ptr_t	csd;

	if (CLI_PRESENT == cli_present("BUSY") || CLI_PRESENT == cli_present("FREE")
		|| CLI_PRESENT == cli_present("MASTER") || CLI_PRESENT == cli_present("RESTORE_ALL"))
	{
		if (gv_cur_region->read_only)
			rts_error(VARLSTCNT(4) ERR_DBRDONLY, 2, DB_LEN_STR(gv_cur_region));
	}
	CHECK_AND_RESET_UPDATE_ARRAY;	/* reset update_array_ptr to update_array */
	csa = cs_addrs;
	assert(&FILE_INFO(gv_cur_region)->s_addrs == csa);
	was_crit = csa->now_crit;
	if (csa->critical)
		crash_count = csa->critical->crashcnt;
	csd = csa->hdr;
	bplmap = csd->bplmap;
	if (CLI_PRESENT == cli_present("BLOCK"))
	{
		if (!cli_get_hex("BLOCK", (uint4 *)&blk))
			return;
		if (blk < 0 || blk >= csa->ti->total_blks)
		{
			util_out_print("Error: invalid block number.", TRUE);
			return;
		}
		patch_curr_blk = blk;
	} else
		blk = patch_curr_blk;
	if (CLI_PRESENT == cli_present("FREE"))
	{
		if (0 == bplmap)
		{
			util_out_print("Cannot perform map updates: bplmap field of file header is zero.", TRUE);
			return;
		}
		if (blk / bplmap * bplmap == blk)
		{
			util_out_print("Cannot perform action on a map block.", TRUE);
			return;
		}
		bml_blk = blk / bplmap * bplmap;
		bm_setmap(bml_blk, blk, FALSE);
		return;
	}
	if (CLI_PRESENT == cli_present("BUSY"))
	{
		if (0 == bplmap)
		{
			util_out_print("Cannot perform map updates: bplmap field of file header is zero.", TRUE);
			return;
		}
		if (blk / bplmap * bplmap == blk)
		{
			util_out_print("Cannot perform action on a map block.", TRUE);
			return;
		}
		bml_blk = blk / bplmap * bplmap;
		bm_setmap(bml_blk, blk, TRUE);
		return;
	}
	blk_size = csd->blk_size;
	if (CLI_PRESENT == cli_present("MASTER"))
	{
		if (0 == bplmap)
		{
			util_out_print("Cannot perform map updates: bplmap field of file header is zero.", TRUE);
			return;
		}
		if (!was_crit)
			grab_crit(gv_cur_region);
		bml_blk = blk / bplmap * bplmap;
		if (dba_mm == csd->acc_meth)
			bp = MM_BASE_ADDR(csa) + (off_t)bml_blk * blk_size;
		else
		{
			assert(dba_bg == csd->acc_meth);
			if (!(bp = t_qread(bml_blk, &dummy_int, &dummy_cr)))
				rts_error(VARLSTCNT(1) ERR_DSEBLKRDFAIL);
		}
		if ((csa->ti->total_blks / bplmap) * bplmap == bml_blk)
			total_blks = (csa->ti->total_blks - bml_blk);
		else
			total_blks = bplmap;
		if (NO_FREE_SPACE == bml_find_free(0, bp + SIZEOF(blk_hdr), total_blks))
			bit_clear(bml_blk / bplmap, csa->bmm);
		else
			bit_set(bml_blk / bplmap, csa->bmm);
		if (bml_blk > csa->nl->highest_lbm_blk_changed)
			csa->nl->highest_lbm_blk_changed = bml_blk;
		if (!was_crit)
			rel_crit(gv_cur_region);
		return;
	}
	if (CLI_PRESENT == cli_present("RESTORE_ALL"))
	{
		if (0 == bplmap)
		{
			util_out_print("Cannot perform map updates: bplmap field of file header is zero.", TRUE);
			return;
		}
		total_blks = csa->ti->total_blks;
		assert(ROUND_DOWN2(blk_size, 2 * SIZEOF(int4)) == blk_size);
		bml_size = BM_SIZE(bplmap);
		bml_list_size = (total_blks + bplmap - 1) / bplmap * bml_size;
		bml_list = (unsigned char *)malloc(bml_list_size);
		for (blk_index = 0, bml_index = 0; blk_index < total_blks; blk_index += bplmap, bml_index++)
			bml_newmap((blk_hdr_ptr_t)(bml_list + bml_index * bml_size), bml_size, csa->ti->curr_tn);
		if (!was_crit)
		{
			grab_crit(gv_cur_region);
			csa->hold_onto_crit = TRUE;	/* need to do this AFTER grab_crit */
		}
		blk = get_dir_root();
		assert(blk < bplmap);
		csa->ti->free_blocks = total_blks - DIVIDE_ROUND_UP(total_blks, bplmap);
		bml_busy(blk, bml_list + SIZEOF(blk_hdr));
		csa->ti->free_blocks = csa->ti->free_blocks - 1;
		dse_m_rest(blk, bml_list, bml_size, &csa->ti->free_blocks, TRUE);
		for (blk_index = 0, bml_index = 0; blk_index < total_blks; blk_index += bplmap, bml_index++)
		{
			t_begin_crit(ERR_DSEFAIL);
			CHECK_TN(csa, csd, csd->trans_hist.curr_tn);	/* can issue rts_error TNTOOLARGE */
			CWS_RESET;
			CHECK_AND_RESET_UPDATE_ARRAY;	/* reset update_array_ptr to update_array */
			assert(csa->ti->early_tn == csa->ti->curr_tn);
			blk_ptr = bml_list + bml_index * bml_size;
			blkhist.blk_num = blk_index;
			if (!(blkhist.buffaddr = t_qread(blkhist.blk_num, &blkhist.cycle, &blkhist.cr)))
				rts_error(VARLSTCNT(1) ERR_DSEBLKRDFAIL);
			BLK_INIT(bs_ptr, bs1);
			BLK_SEG(bs_ptr, blk_ptr + SIZEOF(blk_hdr), bml_size - SIZEOF(blk_hdr));
			BLK_FINI(bs_ptr, bs1);
			t_write(&blkhist, (unsigned char *)bs1, 0, 0, LCL_MAP_LEVL, TRUE, FALSE, GDS_WRITE_KILLTN);
			BUILD_AIMG_IF_JNL_ENABLED(csd, csa->ti->curr_tn);
			t_end(&dummy_hist, NULL, csa->ti->curr_tn);
		}
		/* Fill in master map */
		for (blk_index = 0, bml_index = 0; blk_index < total_blks; blk_index += bplmap, bml_index++)
		{
			blks_in_bitmap = (blk_index + bplmap <= total_blks) ? bplmap : total_blks - blk_index;
			assert(1 < blks_in_bitmap);	/* the last valid block in the database should never be a bitmap block */
			if (NO_FREE_SPACE != bml_find_free(0, (bml_list + bml_index * bml_size) + SIZEOF(blk_hdr), blks_in_bitmap))
				bit_set(blk_index / bplmap, csa->bmm);
			else
				bit_clear(blk_index / bplmap, csa->bmm);
			if (blk_index > csa->nl->highest_lbm_blk_changed)
				csa->nl->highest_lbm_blk_changed = blk_index;
		}
		if (!was_crit)
		{
			csa->hold_onto_crit = FALSE;	/* need to do this before the rel_crit */
			rel_crit(gv_cur_region);
		}
		if (unhandled_stale_timer_pop)
			process_deferred_stale();
		free(bml_list);
		csd->kill_in_prog = csd->abandoned_kills = 0;
		return;
	}
	MEMCPY_LIT(util_buff, "!/Block ");
	util_len = SIZEOF("!/Block ") - 1;
	util_len += i2hex_nofill(blk, (uchar_ptr_t)&util_buff[util_len], 8);
	memcpy(&util_buff[util_len], " is marked !AD in its local bit map.!/",
		SIZEOF(" is marked !AD in its local bit map.!/") - 1);
	util_len += SIZEOF(" is marked !AD in its local bit map.!/") - 1;
	util_buff[util_len] = 0;
	if (!was_crit)
		grab_crit(gv_cur_region);
	util_out_print(util_buff, TRUE, 4, dse_is_blk_free(blk, &dummy_int, &dummy_cr) ? "free" : "busy");
	if (!was_crit)
		rel_crit(gv_cur_region);
	return;
}
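
/*
 * Illustrative sketch (not part of dse_maps): the local-bitmap arithmetic the routine above relies
 * on.  GT.M carves the database into groups of "bplmap" blocks; the first block of each group is
 * that group's local bit map, and bit (blk / bplmap) of the master map records whether the group
 * still has free space.  The helpers below only demonstrate that index math under those
 * assumptions; all names here are hypothetical and not part of the GT.M sources.
 */
#include <assert.h>

typedef long sketch_blk_t;	/* stand-in for block_id */

/* Local bit map covering "blk": round down to the start of its group. */
static sketch_blk_t sketch_bml_for_blk(sketch_blk_t blk, sketch_blk_t bplmap)
{
	assert(0 < bplmap);
	return blk / bplmap * bplmap;
}

/* A block is itself a local bit map iff it sits exactly on a group boundary. */
static int sketch_is_map_blk(sketch_blk_t blk, sketch_blk_t bplmap)
{
	return (blk / bplmap * bplmap == blk);
}

/* Bit position of this group in the master map (what bit_set/bit_clear receive above). */
static sketch_blk_t sketch_master_map_bit(sketch_blk_t bml_blk, sketch_blk_t bplmap)
{
	return bml_blk / bplmap;
}
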
void mubclnup(backup_reg_list *curr_ptr, clnup_stage stage) { sgmnt_addrs *csa; backup_reg_list *ptr, *next; uint4 status; boolean_t had_lock; unix_db_info *udi; int rc; assert(stage >= need_to_free_space && stage < num_of_clnup_stage); free(stringpool.base); switch(stage) { case need_to_rel_crit: for (ptr = (backup_reg_list *)grlist; ptr != NULL && ptr != curr_ptr && ptr != (backup_reg_list *)halt_ptr;) { if (keep_going == ptr->not_this_time) { csa = &FILE_INFO(ptr->reg)->s_addrs; DECR_INHIBIT_KILLS(csa->nl); rel_crit(ptr->reg); } ptr = ptr->fPtr; } curr_ptr = (backup_reg_list *)halt_ptr; /* Intentional Fall Through */ case need_to_del_tempfile: for (ptr = (backup_reg_list *)grlist; ptr != NULL && ptr != curr_ptr;) { assert(3 == num_backup_proc_status); /* Ensure there are only 3 possible values for "ptr->not_this_time". * The assert below and the following if check rely on this. */ assert((keep_going == ptr->not_this_time) || (give_up_before_create_tempfile == ptr->not_this_time) || (give_up_after_create_tempfile == ptr->not_this_time)); if (give_up_before_create_tempfile != ptr->not_this_time) { free(ptr->backup_hdr); if (online) { /* Stop temporary file from growing if we made it active */ if (keep_going == ptr->not_this_time) { csa = &FILE_INFO(ptr->reg)->s_addrs; csa->nl->nbb = BACKUP_NOT_IN_PROGRESS; /* Make sure all running processes have a chance to see this backup state change so they won't be trying to flush when we go to delete the temporary files (mostly an issue on VMS). This operation notifies other processes by: 1) Using a compswap lock with builtin memory barriers so other processors know the memory state change. 2) Processes obtaining the lock after we release it will do their own memory barrier operation and see the change. 3) By grabbing the lock, we are assured that anyone else getting the lock after us will also be checking the errno flag AFTER getting the lock (see backup_buffer_flush()) and see no flush is necessary. */ if (!(had_lock = shmpool_lock_held_by_us(ptr->reg))) shmpool_lock_hdr(ptr->reg); if (backup_interrupted && 0 == csa->shmpool_buffer->backup_errno) /* Needs a non-zero value to stop the backup */ csa->shmpool_buffer->backup_errno = ERR_FORCEDHALT; if (!had_lock) shmpool_unlock_hdr(ptr->reg); } /* get rid of the temporary file */ if (ptr->backup_fd > 2) { CLOSEFILE_RESET(ptr->backup_fd, rc); /* resets "ptr" to FD_INVALID */ UNLINK(ptr->backup_tempfile); } } else /* defreeze the databases */ region_freeze(ptr->reg, FALSE, FALSE, FALSE); } ptr = ptr->fPtr; } /* Intentional fall through */ case need_to_free_space: for (ptr = (backup_reg_list *)grlist; ptr != NULL;) { next = ptr->fPtr; if (keep_going != ptr->not_this_time) error_mupip = TRUE; if (NULL != ptr->backup_file.addr) free(ptr->backup_file.addr); free(ptr); ptr = next; } } /* Release FTOK lock on the replication instance file if holding it */ assert((NULL == jnlpool.jnlpool_dummy_reg) || (NULL != mu_repl_inst_reg_list) || jnlpool_init_needed); if ((NULL != mu_repl_inst_reg_list) && (NULL != jnlpool.jnlpool_dummy_reg) && jnlpool.jnlpool_dummy_reg->open) { udi = FILE_INFO(jnlpool.jnlpool_dummy_reg); assert(NULL != udi); if (NULL != udi) { /* See gv_rundown.c comment for why ftok_sem_release 2nd parameter is FALSE below */ if (udi->grabbed_ftok_sem) ftok_sem_release(jnlpool.jnlpool_dummy_reg, FALSE, TRUE); assert(!udi->grabbed_ftok_sem); } } return; }
void lke_show(void) { bool locks, all = TRUE, wait = TRUE, interactive = FALSE, match = FALSE, memory = TRUE, nocrit = TRUE; boolean_t exact = FALSE, was_crit; int4 pid; size_t ls_len; int n; char regbuf[MAX_RN_LEN], nodebuf[32], one_lockbuf[MAX_KEY_SZ]; mlk_ctldata_ptr_t ctl; mstr reg, node, one_lock; error_def(ERR_UNIMPLOP); error_def(ERR_TEXT); /* Get all command parameters */ reg.addr = regbuf; reg.len = SIZEOF(regbuf); node.addr = nodebuf; node.len = SIZEOF(nodebuf); one_lock.addr = one_lockbuf; one_lock.len = SIZEOF(one_lockbuf); if (lke_getcli(&all, &wait, &interactive, &pid, ®, &node, &one_lock, &memory, &nocrit, &exact) == 0) return; /* Search all regions specified on the command line */ for (gv_cur_region = gd_header->regions, n = 0; n != gd_header->n_regions; ++gv_cur_region, ++n) { /* If region matches and is open */ if ((reg.len == 0 || gv_cur_region->rname_len == reg.len && memcmp(gv_cur_region->rname, reg.addr, reg.len) == 0) && gv_cur_region->open) { match = TRUE; util_out_print("!/!AD!/", NOFLUSH, REG_LEN_STR(gv_cur_region)); /* If distributed database, the region is located on another node */ if (gv_cur_region->dyn.addr->acc_meth == dba_cm) { # if defined(LKE_WORKS_OK_WITH_CM) /* Obtain lock info from the remote node */ locks = gtcmtr_lke_showreq(gv_cur_region->dyn.addr->cm_blk, gv_cur_region->cmx_regnum, all, wait, pid, &node); # else gtm_putmsg(VARLSTCNT(10) ERR_UNIMPLOP, 0, ERR_TEXT, 2, LEN_AND_LIT("GT.CM region - locks must be displayed on the local node"), ERR_TEXT, 2, REG_LEN_STR(gv_cur_region)); continue; # endif } else if (gv_cur_region->dyn.addr->acc_meth == dba_bg || gv_cur_region->dyn.addr->acc_meth == dba_mm) { /* Local region */ cs_addrs = &FILE_INFO(gv_cur_region)->s_addrs; ls_len = (size_t)(cs_addrs->lock_addrs[1] - cs_addrs->lock_addrs[0]); ctl = (mlk_ctldata_ptr_t)malloc(ls_len); /* Prevent any modification of the lock space while we make a local copy of it */ if (cs_addrs->critical != NULL) crash_count = cs_addrs->critical->crashcnt; was_crit = cs_addrs->now_crit; if (!nocrit && !was_crit) grab_crit(gv_cur_region); longcpy((uchar_ptr_t)ctl, (uchar_ptr_t)cs_addrs->lock_addrs[0], ls_len); if (!nocrit && !was_crit) rel_crit(gv_cur_region); locks = ctl->blkroot == 0 ? FALSE: lke_showtree(NULL, (mlk_shrblk_ptr_t)R2A(ctl->blkroot), all, wait, pid, one_lock, memory); free(ctl); } else { util_out_print(NULL, RESET); util_out_print("Region is not BG, MM, or CM", FLUSH); locks = TRUE; } if (!locks) { util_out_print(NULL, RESET); util_out_print("No locks were found in !AD", FLUSH, REG_LEN_STR(gv_cur_region)); } } } if (!match && reg.len != 0) rts_error(VARLSTCNT(4) ERR_NOREGION, 2, reg.len, reg.addr); }
cache_rec_ptr_t db_csh_getn(block_id block) { cache_rec_ptr_t hdr, q0, start_cr, cr; bt_rec_ptr_t bt; unsigned int lcnt, ocnt; int rip, max_ent, pass1, pass2, pass3; int4 flsh_trigger; uint4 first_r_epid, latest_r_epid; sgmnt_addrs *csa; sgmnt_data_ptr_t csd; srch_blk_status *tp_srch_status; ht_ent_int4 *tabent; csa = cs_addrs; csd = csa->hdr; assert(csa->now_crit); assert(csa == &FILE_INFO(gv_cur_region)->s_addrs); max_ent = csd->n_bts; cr = (cache_rec_ptr_t)GDS_REL2ABS(csa->nl->cur_lru_cache_rec_off); hdr = csa->acc_meth.bg.cache_state->cache_array + (block % csd->bt_buckets); start_cr = csa->acc_meth.bg.cache_state->cache_array + csd->bt_buckets; pass1 = max_ent; /* skip referred or dirty or read-into cache records */ pass2 = 2 * max_ent; /* skip referred cache records */ pass3 = 3 * max_ent; /* skip nothing */ INCR_DB_CSH_COUNTER(csa, n_db_csh_getns, 1); DEFER_INTERRUPTS(INTRPT_IN_DB_CSH_GETN); for (lcnt = 0; ; lcnt++) { if (lcnt > pass3) { BG_TRACE_PRO(wc_blocked_db_csh_getn_loopexceed); assert(FALSE); break; } cr++; if (cr == start_cr + max_ent) cr = start_cr; VMS_ONLY( if ((lcnt == pass1) || (lcnt == pass2)) wcs_wtfini(gv_cur_region); ) if (cr->refer && (lcnt < pass2)) { /* in passes 1 & 2, set refer to FALSE and skip; in the third pass attempt reuse even if TRUE == refer */ cr->refer = FALSE; continue; } if (cr->in_cw_set || cr->in_tend) { /* some process already has this pinned for reading and/or updating. skip it. */ cr->refer = TRUE; continue; } if (CDB_STAGNATE <= t_tries || mu_reorg_process) { /* Prevent stepping on self when crit for entire transaction. * This is done by looking up in sgm_info_ptr->blk_in_use and cw_stagnate for presence of the block. * The following two hashtable lookups are not similar, since in TP, sgm_info_ptr->blks_in_use * is updated to the latest cw_stagnate list of blocks only in "tp_hist". * Also note that the lookup in sgm_info_ptr->blks_in_use reuses blocks that don't have cse's. * This is to allow big-read TP transactions which may use up more than the available global buffers. * There is one issue here in that a block that has been only read till now may be stepped upon here * but may later be needed for update. It is handled by updating the block's corresponding * entry in the set of histories (sgm_info_ptr->first_tp_hist[index] structure) to hold the * "cr" and "cycle" of the t_qread done for the block when it was intended to be changed for the * first time within the transaction since otherwise the transaction would restart due to a * cdb_sc_lostcr status. Note that "tn" (read_tn of the block) in the first_tp_hist will still * remain the "tn" when the block was first read within this transaction to ensure the block * hasn't been modified since the start of the transaction. Once we intend on changing the * block i.e. srch_blk_status->cse is non-NULL, we ensure in the code below not to step on it. * ["tp_hist" is the routine that updates the "cr", "cycle" and "tn" of the block]. * Note that usually in a transaction the first_tp_hist[] structure holds the "cr", "cycle", and "tn" * of the first t_qread of the block within that transaction. The above is the only exception. * Also note that for blocks in cw_stagnate (i.e. current TP mini-action), we don't reuse any of * them even if they don't have a cse. This is to ensure that the current action doesn't * encounter a restart due to cdb_sc_lostcr in "tp_hist" even in the fourth-retry. 
			 */
			tp_srch_status = NULL;
			if (dollar_tlevel && (NULL != (tabent = lookup_hashtab_int4(sgm_info_ptr->blks_in_use,
					(uint4 *)&cr->blk))) && (tp_srch_status = (srch_blk_status *)tabent->value)
				&& (tp_srch_status->cse))
			{	/* this process is already using the block - skip it */
				cr->refer = TRUE;
				continue;
			}
			if (NULL != lookup_hashtab_int4(&cw_stagnate, (uint4 *)&cr->blk))
			{	/* this process is already using the block for the current gvcst_search - skip it */
				cr->refer = TRUE;
				continue;
			}
			if (NULL != tp_srch_status)
			{	/* About to reuse a buffer that is part of the read-set of the current TP transaction.
				 * Reset clue as otherwise the next global reference of that global will use an out-of-date
				 * clue.  Even though tp_srch_status is available after the sgm_info_ptr->blks_in_use
				 * hashtable check, we don't want to reset the clue in case the cw_stagnate hashtable check
				 * causes the same cr to be skipped from reuse.  Hence the placement of this reset logic
				 * AFTER the cw_stagnate check.
				 */
				tp_srch_status->blk_target->clue.end = 0;
			}
		}
		if (cr->dirty)
		{	/* Note that in Unix, it is possible that we see a stale value of cr->dirty (possible if a
			 * concurrent "wcs_wtstart" has reset dirty to 0 but that update did not reach us yet). In this
			 * case the call to "wcs_get_space" below will do the necessary memory barrier instructions
			 * (through calls to "aswp") which will allow us to see the non-stale value of cr->dirty.
			 *
			 * It is also possible that cr->dirty is non-zero but < cr->flushed_dirty_tn. In this case,
			 * wcs_get_space done below will return FALSE forcing a cache-rebuild which will fix this situation.
			 *
			 * In VMS, another process cannot be concurrently resetting cr->dirty to 0 as the resetting routine
			 * is "wcs_wtfini" which is executed in crit which another process cannot be in as we are in crit now.
			 */
			if (gv_cur_region->read_only)
				continue;
			if (lcnt < pass1)
			{
				if (!csa->timer && (csa->nl->wcs_timers < 1))
					wcs_timer_start(gv_cur_region, FALSE);
				continue;
			}
			BG_TRACE_PRO(db_csh_getn_flush_dirty);
			if (FALSE == wcs_get_space(gv_cur_region, 0, cr))
			{	/* failed to flush it out - force a rebuild */
				BG_TRACE_PRO(wc_blocked_db_csh_getn_wcsstarvewrt);
				assert(csa->nl->wc_blocked);	/* only reason we currently know why wcs_get_space could fail */
				assert(gtm_white_box_test_case_enabled);
				break;
			}
			assert(0 == cr->dirty);
		}
		UNIX_ONLY(
			/* the cache-record is not free for reuse until the write-latch value becomes LATCH_CLEAR.
			 * In VMS, resetting the write-latch value occurs in "wcs_wtfini" which is in CRIT, we are fine.
			 * In Unix, this resetting is done by "wcs_wtstart" which is out-of-crit. Therefore, we need to
			 * wait for this value to be LATCH_CLEAR before reusing this cache-record.
			 * Note that we are examining the write-latch-value without holding the interlock. It is ok to do
			 * this because the only two routines that modify the latch value are "bg_update" and
			 * "wcs_wtstart". The former cannot be concurrently executing because we are in crit.
			 * The latter will not update the latch value unless this cache-record is dirty. But in this
			 * case we would have most likely gone through the if (cr->dirty) check above. Most likely
			 * because there is one rare possibility where a concurrent "wcs_wtstart" has set cr->dirty
			 * to 0 but not yet cleared the latch. In that case we wait for the latch to be cleared.
			 * In all other cases, nobody is modifying the latch since when we got crit and therefore
			 * it is safe to observe the value of the latch without holding the interlock.
			 */
			if (LATCH_CLEAR != WRITE_LATCH_VAL(cr))
			{	/* possible if a concurrent "wcs_wtstart" has set cr->dirty to 0 but not yet
				 * cleared the latch. this should be very rare though.
				 */
				if (lcnt < pass2)
					continue;	/* try to find some other cache-record to reuse until the 3rd pass */
				for (ocnt = 1; (MAXWRTLATCHWAIT >= ocnt) && (LATCH_CLEAR != WRITE_LATCH_VAL(cr)); ocnt++)
					wcs_sleep(SLEEP_WRTLATCHWAIT);	/* since it is a short lock, sleep the minimum */
				if (MAXWRTLATCHWAIT <= ocnt)
				{
					BG_TRACE_PRO(db_csh_getn_wrt_latch_stuck);
					assert(FALSE);
					continue;
				}
			}
		)
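
/*
 * Illustrative sketch (not part of db_csh_getn): the multi-pass, second-chance scan the loop above
 * performs when hunting for a reusable cache record.  Pinned records (in_cw_set/in_tend in the real
 * code) are never taken; "referred" records lose their refer bit and are skipped until the final
 * pass.  The real code has an additional earlier pass that also skips dirty records, plus the TP
 * and write-latch checks; those are omitted here.  All names below are hypothetical.
 */
#include <stddef.h>

struct sketch_cr
{
	int	refer;	/* second-chance bit */
	int	pinned;	/* stands in for in_cw_set / in_tend */
};

static struct sketch_cr *sketch_getn(struct sketch_cr *array, int n_recs, int start)
{
	int	idx, lcnt, pass2 = 2 * n_recs, pass3 = 3 * n_recs;

	for (idx = start, lcnt = 0; lcnt <= pass3; lcnt++)
	{
		idx = (idx + 1) % n_recs;	/* circular walk over the cache array */
		if (array[idx].pinned)
			continue;		/* some process holds it; never reuse */
		if (array[idx].refer && (lcnt < pass2))
		{	/* give it a second chance on the earlier passes */
			array[idx].refer = 0;
			continue;
		}
		return &array[idx];		/* acceptable victim */
	}
	return NULL;				/* nothing reusable; caller must force cache recovery */
}
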
void db_auto_upgrade(gd_region *reg)
{
	/* Detect uninitialized file header fields for this version of GT.M and do a mini auto-upgrade, initializing such
	 * fields to default values in the new GT.M version.
	 */
	sgmnt_addrs		*csa;
	sgmnt_data_ptr_t	csd;

	assert(NULL != reg);
	if (NULL == reg)
		return;
	csa = &FILE_INFO(reg)->s_addrs;
	csd = csa->hdr;
	assert(NULL != csd);
	if (NULL == csd)
		return;
	if (0 == csd->mutex_spin_parms.mutex_hard_spin_count)
		csd->mutex_spin_parms.mutex_hard_spin_count = MUTEX_HARD_SPIN_COUNT;
	if (0 == csd->mutex_spin_parms.mutex_sleep_spin_count)
		csd->mutex_spin_parms.mutex_sleep_spin_count = MUTEX_SLEEP_SPIN_COUNT;
	/* zero is a legitimate value for csd->mutex_spin_parms.mutex_spin_sleep_mask; so can't detect if need re-initialization */
	INIT_NUM_CRIT_ENTRY_IF_NEEDED(csd);
	/* Auto upgrade based on minor database version number. This code currently only does auto upgrade and does not
	 * do auto downgrade although that certainly is possible to implement if necessary. For now, if the current version
	 * is at a lower level than the minor db version, we do nothing.
	 *
	 * Note the purpose of the minor_dbver field is so that some part of gtm (either runtime, or conversion utility) some
	 * time and several versions down the road from now knows by looking at this field what fields in the fileheader are
	 * valid so it is important that the minor db version be updated each time the fileheader is updated and this routine
	 * correspondingly updated. SE 5/2006.
	 */
	if (csd->minor_dbver < GDSMVCURR)
	{	/* In general, the method for adding new versions is:
		 * 1) If there are no automatic updates for this version, it is optional to add the version to the switch
		 *    statement below. There are several examples of this at this time (through V53000).
		 * 2) Update (or add) a case for the previous version to update any necessary fields.
		 */
		if (!csd->opened_by_gtmv53 && !csd->db_got_to_v5_once)
		{
			csd->opened_by_gtmv53 = TRUE;
			/* This is a case of a database that has been used by a pre-V53 version of GT.M that did not contain
			 * the fix (C9H07-002873). At this point, the database might contain RECYCLED blocks that are a mix of
			 *   a) Those blocks that were RECYCLED at the time of the MUPIP UPGRADE from V4 to V5.
			 *   b) Those blocks that became RECYCLED due to M-kills in V5.
			 * It is only (a) that we have to mark as FREE as it might contain too-full v4 format blocks. But there
			 * is no way to distinguish the two. So we mark both (a) and (b) as FREE. This will mean no PBLKs
			 * written for (b) and hence no backward journal recovery possible to a point before the start of the
			 * REORG UPGRADE. We force a MUPIP REORG UPGRADE rerun (to mark RECYCLED blocks FREE) by setting
			 * fully_upgraded to FALSE. Note that this does not need to be done for databases created by a V5
			 * version (C9I05-002987).
			 */
			if (MASTER_MAP_SIZE_V4 == csd->master_map_len)
			{
				csd->fully_upgraded = FALSE;
				csd->reorg_upgrd_dwngrd_restart_block = 0;	/* reorg upgrade should restart from block 0 */
				/* Ensure reorg_db_fmt_start_tn and desired_db_format_tn are set to different
				 * values so fresh reorg upgrade can set fully_upgraded to TRUE once it is done.
				 */
				csd->reorg_db_fmt_start_tn = 0;
				csd->desired_db_format_tn = 1;
			} else
				csd->db_got_to_v5_once = TRUE;	/* db was created by V5 so safe to set this */
		}
		/* When adding a new minor version, the following template should be maintained
		 *   a) Remove the penultimate 'break'
		 *   b) Remove the assert(FALSE) in the last case (most recent minor version)
		 *   c) If there are any file header fields added in the new minor version, initialize the fields to default
		 *      values in the last case
		 *   d) Add a new case with the new minor version
		 *   e) Add assert(FALSE) and break (like it was before)
		 */
		switch (csd->minor_dbver)
		{	/* Note that handling for any fields introduced in a version will not go in the "switch-case" block
			 * of code introduced for the new version but will go in the PREVIOUS "switch-case" block.
			 */
			case GDSMV51000:	/* Multi-site replication available */
			case GDSMV52000:	/* Unicode */
			case GDSMV53000:	/* M-Itanium release */
				gvstats_rec_upgrade(csa);	/* Move GVSTATS information to new place in file header */
			case GDSMV54002:	/* GT.M V54002B introduced jnl_eov_tn for backward recovery */
				csd->jnl_eovtn = csd->trans_hist.curr_tn;
			case GDSMV54002B:
				/* GT.M V55000 introduced strm_reg_seqno, save_strm_reg_seqno, intrpt_recov_resync_strm_seqno
				 * AND obsoleted dualsite_resync_seqno. For new fields, we are guaranteed they are
				 * zero (in formerly unused sections of the file header) so no need for any initialization.
				 * For obsoleted fields, it would be good to clear them here so we don't run into issues later.
				 */
				UNIX_ONLY(csd->filler_seqno = 0;)	/* was "dualsite_resync_seqno" in pre-V55000 versions */
				/* In addition, V55000 introduced before_trunc_total_blks for MUPIP REORG -TRUNCATE.
				 * Since it is a new field no initialization necessary.
				 */
			case GDSMV55000:
				UNIX_ONLY(csd->freeze_on_fail = FALSE;)
				UNIX_ONLY(csd->span_node_absent = TRUE;)
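
/*
 * Illustrative sketch (not part of db_auto_upgrade): the cascading-case pattern the comments above
 * describe.  Because no "break" separates the cases, starting the switch at the on-disk minor
 * version runs the initialization for every later version exactly once, and fields introduced by a
 * version get their defaults in the PREVIOUS version's case.  Version names and fields below are
 * hypothetical.
 */
enum sketch_minor_ver { SKETCH_V1, SKETCH_V2, SKETCH_V3, SKETCH_CURR };

struct sketch_hdr
{
	enum sketch_minor_ver	minor_dbver;
	int			field_added_in_v2;
	int			field_added_in_v3;
};

static void sketch_auto_upgrade(struct sketch_hdr *hdr)
{
	switch (hdr->minor_dbver)
	{
		case SKETCH_V1:			/* fields introduced by V2 get their defaults here */
			hdr->field_added_in_v2 = 0;
			/* fall through */
		case SKETCH_V2:			/* fields introduced by V3 get their defaults here */
			hdr->field_added_in_v3 = 0;
			/* fall through */
		case SKETCH_V3:
		case SKETCH_CURR:
			break;			/* nothing newer to initialize */
	}
	hdr->minor_dbver = SKETCH_CURR;	/* header now reflects the current layout */
}
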
error_def(ERR_FREEZEID);

freeze_status region_freeze(gd_region *region, boolean_t freeze, boolean_t override, boolean_t wait_for_kip)
{
	uint4			freeze_id, sleep_counter;
	sgmnt_addrs		*csa;
	sgmnt_data_ptr_t	csd;
	now_t			now;						/* for GET_CUR_TIME macro */
	char			*time_ptr, time_str[CTIME_BEFORE_NL + 2];	/* for GET_CUR_TIME macro */
	boolean_t		was_crit;
	DCL_THREADGBL_ACCESS;

	SETUP_THREADGBL_ACCESS;
	freeze_id = FREEZE_ID;
	csa = &FILE_INFO(region)->s_addrs;
	csd = csa->hdr;
	if (freeze)
	{
		was_crit = csa->now_crit;
		if (!was_crit)
			grab_crit(region);	/* really need this to be sure in UNIX, shouldn't be frequent anyway */
		INCR_INHIBIT_KILLS(csa->nl);
		if (OWNERSHIP)
		{
			DECR_INHIBIT_KILLS(csa->nl);
			if (!was_crit)
				rel_crit(region);
			return REG_FREEZE_SUCCESS;
		}
		if (!override && csd->freeze)
/* go after a specific number of buffers or a particular buffer */
bool wcs_get_space(gd_region *reg, int needed, cache_rec *cr)
{
	unsigned int		lcnt, ocnt, status;
	sgmnt_addrs		*csa;
	sgmnt_data_ptr_t	csd;
	node_local_ptr_t	cnl;
	que_ent_ptr_t		base, q0;
	int4			dummy_errno;
	boolean_t		is_mm;

	assert((0 != needed) || (NULL != cr));
	csa = &(FILE_INFO(reg)->s_addrs);
	assert(csa == cs_addrs);
	csd = csa->hdr;
	is_mm = (dba_mm == csd->acc_meth);
	assert(is_mm || (dba_bg == csd->acc_meth));
	cnl = csa->nl;
	if (FALSE == csa->now_crit)
	{
		assert(0 != needed);	/* if needed == 0, then we should be in crit */
		for (lcnt = DIVIDE_ROUND_UP(needed, csd->n_wrt_per_flu); 0 < lcnt; lcnt--)
			JNL_ENSURE_OPEN_WCS_WTSTART(csa, reg, 0, dummy_errno);
				/* a macro that ensures jnl is open, dclasts wcs_wtstart and checks for errors etc. */
		return TRUE;
	}
	if (FALSE == wcs_wtfini(reg))
		return FALSE;
	/* while calculating flush_trigger, the decrement should be at least 1 if still not reached the minimum allowed */
	csd->flush_trigger = MAX(csd->flush_trigger - MAX(csd->flush_trigger / STEP_FACTOR, 1), MIN_FLUSH_TRIGGER(csd->n_bts));
	if (0 == needed)
	{
		if (!is_mm)
		{	/* If another process is concurrently finishing up phase2 of commit, wait for that to complete first. */
			if (cr->in_tend && !wcs_phase2_commit_wait(csa, cr))
				return FALSE;	/* assumption is that caller will set wc_blocked and trigger cache recovery */
		}
		for (lcnt = 1; (MAXGETSPACEWAIT > lcnt) && (0 != cr->dirty); lcnt++)
		{	/* We want to flush a specific cache-record. We speed up the wait by moving the dirty cache-record
			 * to the head of the active queue. But to do this, we need exclusive access to the active queue.
			 * The only other processes outside of crit that can be touching this concurrently are wcs_wtstart
			 * (which can remove entries from the queue) and bg_update_phase2 (which can add entries to the queue).
			 * In the case of writers, we can wait for those to complete (by setting cnl->wc_blocked to TRUE)
			 * and then play with the queue. But in the case of bg_update_phase2, it is not easily possible to
			 * do a similar wait so in this case we choose to do plain wcs_wtstart (which uses interlocked
			 * queue operations and hence can work well with concurrent bg_update_phase2) and wait until the
			 * cache record of interest becomes non-dirty. The consequence is we might wait a little longer than
			 * necessary but that is considered acceptable for now.
			 */
			/* Check if cache recovery is needed (could be set by another process in
			 * secshr_db_clnup finishing off a phase2 commit). If so, no point invoking
			 * wcs_wtstart as it will return right away. Instead return FALSE so
			 * cache-recovery can be triggered by the caller.
			 */
			if (cnl->wc_blocked)
			{
				assert(gtm_white_box_test_case_enabled);
				return FALSE;
			}
			if (!is_mm && cnl->wcs_phase2_commit_pidcnt)
			{
				JNL_ENSURE_OPEN_WCS_WTSTART(csa, reg, 0, dummy_errno);
					/* a macro that ensures jnl is open, dclasts wcs_wtstart and checks for errors etc. */
				wcs_sleep(lcnt);
			} else if (LATCH_CLEAR == WRITE_LATCH_VAL(cr))
			{
				SIGNAL_WRITERS_TO_STOP(cnl);	/* to stop all active writers */
				WAIT_FOR_WRITERS_TO_STOP(cnl, ocnt, MAXGETSPACEWAIT);
				if (MAXGETSPACEWAIT <= ocnt)
				{
					assert(FALSE);
					return FALSE;
				}
				if (LATCH_CLEAR == WRITE_LATCH_VAL(cr))
				{	/* Check if cache-record is part of the active queue. If so, then remove it from the
					 * tail of the active queue and move it to the head to try and speed up the flush.
					 * If not and if cr->dirty is non-zero, then the only way this is possible we know
					 * of is if a concurrent process encountered an error in the midst of commit in phase2
					 * of bg_update and finished the update but did not reinsert the cache-record in the
					 * active queue (see comment in secshr_db_clnup about why INSQ*I macros are not used
					 * in VMS). In this case, return FALSE as wcs_get_space cannot flush this cache-record.
					 * The caller will trigger appropriate error handling. We are guaranteed that cr cannot
					 * be part of the wip queue because WRITE_LATCH_VAL(cr) is LATCH_CLEAR (in wip queue it
					 * will be > LATCH_CLEAR).
					 */
					if (0 != cr->state_que.fl)
					{	/* We are about to play with the queues without using interlocks.
						 * Assert no one else could be concurrently playing with the queue.
						 */
						assert(!cnl->wcs_phase2_commit_pidcnt && !cnl->in_wtstart);
						base = &csa->acc_meth.bg.cache_state->cacheq_active;
						q0 = (que_ent_ptr_t)((sm_uc_ptr_t)&cr->state_que + cr->state_que.fl);
						shuffqth((que_ent_ptr_t)q0, (que_ent_ptr_t)base);
					} else if (cr->dirty)
					{
						assert(gtm_white_box_test_case_enabled);
						return FALSE;
					}
				}
				SIGNAL_WRITERS_TO_RESUME(cnl);
				JNL_ENSURE_OPEN_WCS_WTSTART(csa, reg, 0, dummy_errno);
					/* a macro that ensures jnl is open, dclasts wcs_wtstart and checks for errors etc. */
				wcs_sleep(lcnt);
			} else if ((0 == cr->iosb.cond) || (WRT_STRT_PNDNG == cr->iosb.cond))
			{
				JNL_ENSURE_OPEN_WCS_WTSTART(csa, reg, 0, dummy_errno);
					/* a macro that ensures jnl is open, dclasts wcs_wtstart and checks for errors etc. */
				wcs_sleep(lcnt);
			}
			if (FALSE == wcs_wtfini(reg))
				return FALSE;
		}
		if (0 == cr->dirty)
			return TRUE;
		assert(FALSE);
		return FALSE;
	}
	for (lcnt = 1; ((cnl->wc_in_free < needed) && (MAXGETSPACEWAIT > lcnt)); lcnt++)
	{
		DCLAST_WCS_WTSTART(reg, 0, dummy_errno);	/* a macro that dclasts wcs_wtstart and checks for errors etc. */
		wcs_sleep(lcnt);
		if (FALSE == wcs_wtfini(reg))
			return FALSE;
	}
	if (cnl->wc_in_free < needed)
	{
		assert(FALSE);
		return FALSE;
	}
	return TRUE;
}
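
/*
 * Illustrative sketch (not part of wcs_get_space): the multiplicative decrease applied to
 * csd->flush_trigger above.  Each time the cache runs short while in crit, the trigger drops by
 * 1/STEP_FACTOR of its value (at least by 1) but never below the configured minimum, so later
 * commits start flushing earlier.  The constants below are hypothetical stand-ins, not the real
 * STEP_FACTOR / MIN_FLUSH_TRIGGER values.
 */
#define SKETCH_STEP_FACTOR	4
#define SKETCH_MIN_TRIGGER	32

static int sketch_lower_flush_trigger(int trigger)
{
	int	decrement;

	decrement = trigger / SKETCH_STEP_FACTOR;
	if (decrement < 1)
		decrement = 1;	/* always make some progress */
	trigger -= decrement;
	return (trigger < SKETCH_MIN_TRIGGER) ? SKETCH_MIN_TRIGGER : trigger;
}
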
void preemptive_db_clnup(int preemptive_severe)
{
	sgmnt_addrs	*csa;
	sgm_info	*si;
	gd_region	*r_top, *reg;
	gd_addr		*addr_ptr;

	if (!dollar_tlevel && update_trans)
	{	/* It's possible we hit an error in the middle of an update, at which point we have
		 * a valid clue and non-NULL cse. However, this causes problems for subsequent
		 * transactions (see comment in t_begin). In particular we could end up pinning buffers
		 * unnecessarily. So clear the cse of any histories that may have been active during the update.
		 */
		CLEAR_CSE(gv_target);
		if ((NULL != gv_target) && (NULL != gv_target->gd_csa))
		{
			CLEAR_CSE(gv_target->gd_csa->dir_tree);
			GTMTRIG_ONLY(CLEAR_CSE(gv_target->gd_csa->hasht_tree));
		}
	}
	if (INVALID_GV_TARGET != reset_gv_target)
	{
		if (SUCCESS != preemptive_severe && INFO != preemptive_severe)
		{	/* We know of a few cases in Unix where gv_target and gv_currkey could be out of sync at this point.
			 *   a) If we are inside trigger code which in turn does an update that does
			 *      reads of ^#t global and ends up in a restart. This restart would
			 *      in turn do a rts_error(TPRETRY) which would invoke mdb_condition_handler
			 *      that would in turn invoke preemptive_db_clnup which invokes this macro.
			 *      In this tp restart case though, it is ok for gv_target and gv_currkey
			 *      to be out of sync because they are going to be reset by tp_clean_up anyways.
			 *      So skip the dbg-only in-sync check.
			 *   b) If we are in gvtr_init reading the ^#t global and detect an error (e.g. TRIGINVCHSET)
			 *      gv_target after the reset would be pointing to a regular global whereas gv_currkey
			 *      would be pointing to ^#t. It is ok to be out-of-sync since in this case, we expect
			 *      mdb_condition_handler to be calling us. That has code to reset gv_currkey (and
			 *      cs_addrs/cs_data etc.) to reflect gv_target (i.e. get them back in sync).
			 * Therefore in Unix we pass SKIP_GVT_GVKEY_CHECK to skip the gvtarget/gvcurrkey out-of-sync check
			 * in RESET_GV_TARGET. In VMS we pass DO_GVT_GVKEY_CHECK as we don't yet know of an out-of-sync
			 * situation.
			 */
			RESET_GV_TARGET(UNIX_ONLY(SKIP_GVT_GVKEY_CHECK) VMS_ONLY(DO_GVT_GVKEY_CHECK));
		}
	}
	need_kip_incr = FALSE;	/* in case we got an error in t_end (e.g. GBLOFLOW), don't want this global variable to get
				 * carried over to the next non-TP transaction that this process does (e.g. inside an error trap).
				 */
	if (dollar_tlevel)
	{
		for (si = first_sgm_info; si != NULL; si = si->next_sgm_info)
		{
			if (NULL != si->kip_csa)
			{
				csa = si->tp_csa;
				assert(si->tp_csa == si->kip_csa);
				CAREFUL_DECR_KIP(csa->hdr, csa, si->kip_csa);
			}
		}
	} else if (NULL != kip_csa && (NULL != kip_csa->hdr) && (NULL != kip_csa->nl))
		CAREFUL_DECR_KIP(kip_csa->hdr, kip_csa, kip_csa);
	if (IS_DSE_IMAGE)
	{	/* Release crit on any region that was obtained for the current erroring DSE operation.
		 * Take care NOT to release crits obtained by a previous CRIT -SEIZE command.
		 */
		for (addr_ptr = get_next_gdr(NULL); addr_ptr; addr_ptr = get_next_gdr(addr_ptr))
		{
			for (reg = addr_ptr->regions, r_top = reg + addr_ptr->n_regions; reg < r_top; reg++)
			{
				if (reg->open && !reg->was_open)
				{
					csa = &FILE_INFO(reg)->s_addrs;
					assert(csa->hold_onto_crit || !csa->dse_crit_seize_done);
					assert(!csa->hold_onto_crit || csa->now_crit);
					if (csa->now_crit && (!csa->hold_onto_crit || !csa->dse_crit_seize_done))
					{
						rel_crit(reg);
						csa->hold_onto_crit = FALSE;
						t_abort(reg, csa);	/* cancel mini-transaction if any in progress */
					}
				}
			}
		}
	}
}
boolean_t mu_truncate(int4 truncate_percent) { sgmnt_addrs *csa; sgmnt_data_ptr_t csd; int num_local_maps; int lmap_num, lmap_blk_num; int bml_status, sigkill; int save_errno; int ftrunc_status; uint4 jnl_status; uint4 old_total, new_total; uint4 old_free, new_free; uint4 end_blocks; int4 blks_in_lmap, blk; gtm_uint64_t before_trunc_file_size; off_t trunc_file_size; off_t padding; uchar_ptr_t lmap_addr; boolean_t was_crit; uint4 found_busy_blk; srch_blk_status bmphist; srch_blk_status *blkhist; srch_hist alt_hist; trans_num curr_tn; blk_hdr_ptr_t lmap_blk_hdr; block_id *blkid_ptr; unix_db_info *udi; jnl_private_control *jpc; jnl_buffer_ptr_t jbp; char *err_msg; intrpt_state_t prev_intrpt_state; DCL_THREADGBL_ACCESS; SETUP_THREADGBL_ACCESS; csa = cs_addrs; csd = cs_data; if (dba_mm == csd->acc_meth) { gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(4) ERR_MUTRUNCNOTBG, 2, REG_LEN_STR(gv_cur_region)); return TRUE; } if ((GDSVCURR != csd->desired_db_format) || (csd->blks_to_upgrd != 0)) { gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(4) ERR_MUTRUNCNOV4, 2, REG_LEN_STR(gv_cur_region)); return TRUE; } if (csa->ti->free_blocks < (truncate_percent * csa->ti->total_blks / 100)) { gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(5) ERR_MUTRUNCNOSPACE, 3, REG_LEN_STR(gv_cur_region), truncate_percent); return TRUE; } /* already checked for parallel truncates on this region --- see mupip_reorg.c */ gv_target = NULL; assert(csa->nl->trunc_pid == process_id); assert(dba_mm != csd->acc_meth); old_total = csa->ti->total_blks; old_free = csa->ti->free_blocks; sigkill = 0; found_busy_blk = 0; memset(&alt_hist, 0, SIZEOF(alt_hist)); /* null-initialize history */ assert(csd->bplmap == BLKS_PER_LMAP); end_blocks = old_total % BLKS_PER_LMAP; /* blocks in the last lmap (first one we start scanning) */ if (0 == end_blocks) end_blocks = BLKS_PER_LMAP; num_local_maps = DIVIDE_ROUND_UP(old_total, BLKS_PER_LMAP); /* ======================================== PHASE 1 ======================================== */ for (lmap_num = num_local_maps - 1; (lmap_num > 0 && !found_busy_blk); lmap_num--) { if (mu_ctrly_occurred || mu_ctrlc_occurred) return TRUE; assert(csa->ti->total_blks >= old_total); /* otherwise, a concurrent truncate happened... */ if (csa->ti->total_blks != old_total) /* Extend (likely called by mupip extend) -- don't truncate */ { gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(5) ERR_MUTRUNCNOSPACE, 3, REG_LEN_STR(gv_cur_region), truncate_percent); return TRUE; } lmap_blk_num = lmap_num * BLKS_PER_LMAP; if (csa->nl->highest_lbm_with_busy_blk >= lmap_blk_num) { found_busy_blk = lmap_blk_num; break; } blks_in_lmap = (lmap_num == num_local_maps - 1) ? end_blocks : BLKS_PER_LMAP; /* Loop through non-bitmap blocks of this lmap, do recycled2free */ DBGEHND((stdout, "DBG:: lmap_num = [%lu], lmap_blk_num = [%lu], blks_in_lmap = [%lu]\n", lmap_num, lmap_blk_num, blks_in_lmap)); for (blk = 1; blk < blks_in_lmap && blk != -1 && !found_busy_blk;) { t_begin(ERR_MUTRUNCFAIL, UPDTRNS_DB_UPDATED_MASK); for (;;) /* retry loop for recycled to free transactions */ { curr_tn = csd->trans_hist.curr_tn; /* Read the nth local bitmap into memory */ bmphist.blk_num = lmap_blk_num; bmphist.buffaddr = t_qread(bmphist.blk_num, &bmphist.cycle, &bmphist.cr); lmap_blk_hdr = (blk_hdr_ptr_t)bmphist.buffaddr; if (!(bmphist.buffaddr) || (BM_SIZE(BLKS_PER_LMAP) != lmap_blk_hdr->bsiz)) { /* Could not read the block successfully. Retry. 
*/ t_retry((enum cdb_sc)rdfail_detail); continue; } lmap_addr = bmphist.buffaddr + SIZEOF(blk_hdr); /* starting from the hint (blk itself), find the first busy or recycled block */ blk = bml_find_busy_recycled(blk, lmap_addr, blks_in_lmap, &bml_status); assert(blk < BLKS_PER_LMAP); if (blk == -1 || blk >= blks_in_lmap) { /* done with this lmap, continue to next */ t_abort(gv_cur_region, csa); break; } else if (BLK_BUSY == bml_status || csa->nl->highest_lbm_with_busy_blk >= lmap_blk_num) { /* stop processing blocks... skip ahead to phase 2 */ found_busy_blk = lmap_blk_num; t_abort(gv_cur_region, csa); break; } else if (BLK_RECYCLED == bml_status) { /* Write PBLK records for recycled blocks only if before_image journaling is * enabled. t_end() takes care of checking if journaling is enabled and * writing PBLK record. We have to at least mark the recycled block as free. */ RESET_UPDATE_ARRAY; update_trans = UPDTRNS_DB_UPDATED_MASK; *((block_id *)update_array_ptr) = blk; update_array_ptr += SIZEOF(block_id); *(int *)update_array_ptr = 0; alt_hist.h[1].blk_num = 0; alt_hist.h[0].level = 0; alt_hist.h[0].cse = NULL; alt_hist.h[0].tn = curr_tn; alt_hist.h[0].blk_num = lmap_blk_num + blk; alt_hist.h[0].buffaddr = t_qread(alt_hist.h[0].blk_num, &alt_hist.h[0].cycle, &alt_hist.h[0].cr); if (!alt_hist.h[0].buffaddr) { t_retry((enum cdb_sc)rdfail_detail); continue; } if (!t_recycled2free(&alt_hist.h[0])) { t_retry(cdb_sc_lostbmlcr); continue; } t_write_map(&bmphist, (unsigned char *)update_array, curr_tn, 0); /* Set the opcode for INCTN record written by t_end() */ inctn_opcode = inctn_blkmarkfree; if ((trans_num)0 == t_end(&alt_hist, NULL, TN_NOT_SPECIFIED)) continue; /* block processed, scan from the next one */ blk++; break; } else { assert(t_tries < CDB_STAGNATE); t_retry(cdb_sc_badbitmap); continue; } } /* END recycled2free retry loop */ } /* END scanning blocks of this particular lmap */ /* Write PBLK for the bitmap block, in case it hasn't been written i.e. t_end() was never called above */ /* Do a transaction that just increments the bitmap block's tn so that t_end() can do its thing */ DBGEHND((stdout, "DBG:: bitmap block inctn -- lmap_blk_num = [%lu]\n", lmap_blk_num)); t_begin(ERR_MUTRUNCFAIL, UPDTRNS_DB_UPDATED_MASK); for (;;) { RESET_UPDATE_ARRAY; BLK_ADDR(blkid_ptr, SIZEOF(block_id), block_id); *blkid_ptr = 0; update_trans = UPDTRNS_DB_UPDATED_MASK; inctn_opcode = inctn_mu_reorg; /* inctn_mu_truncate */ curr_tn = csd->trans_hist.curr_tn; blkhist = &alt_hist.h[0]; blkhist->blk_num = lmap_blk_num; blkhist->tn = curr_tn; blkhist->cse = NULL; /* start afresh (do not use value from previous retry) */ /* Read the nth local bitmap into memory */ blkhist->buffaddr = t_qread(lmap_blk_num, (sm_int_ptr_t)&blkhist->cycle, &blkhist->cr); lmap_blk_hdr = (blk_hdr_ptr_t)blkhist->buffaddr; if (!(blkhist->buffaddr) || (BM_SIZE(BLKS_PER_LMAP) != lmap_blk_hdr->bsiz)) { /* Could not read the block successfully. Retry. 
*/ t_retry((enum cdb_sc)rdfail_detail); continue; } t_write_map(blkhist, (unsigned char *)blkid_ptr, curr_tn, 0); blkhist->blk_num = 0; /* create empty history for bitmap block */ if ((trans_num)0 == t_end(&alt_hist, NULL, TN_NOT_SPECIFIED)) continue; break; } } /* END scanning lmaps */ /* ======================================== PHASE 2 ======================================== */ assert(!csa->now_crit); for (;;) { /* wait for FREEZE, we don't want to truncate a frozen database */ grab_crit(gv_cur_region); if (!cs_data->freeze && !IS_REPL_INST_FROZEN) break; rel_crit(gv_cur_region); while (cs_data->freeze || IS_REPL_INST_FROZEN) hiber_start(1000); } assert(csa->nl->trunc_pid == process_id); /* Flush pending updates to disk. If this is not done, old updates can be flushed AFTER ftruncate, extending the file. */ if (!wcs_flu(WCSFLU_FLUSH_HDR | WCSFLU_WRITE_EPOCH | WCSFLU_MSYNC_DB)) { assert(FALSE); gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(6) ERR_BUFFLUFAILED, 4, LEN_AND_LIT("MUPIP REORG TRUNCATE"), DB_LEN_STR(gv_cur_region)); rel_crit(gv_cur_region); return FALSE; } csa->nl->highest_lbm_with_busy_blk = MAX(found_busy_blk, csa->nl->highest_lbm_with_busy_blk); assert(IS_BITMAP_BLK(csa->nl->highest_lbm_with_busy_blk)); new_total = MIN(old_total, csa->nl->highest_lbm_with_busy_blk + BLKS_PER_LMAP); if (mu_ctrly_occurred || mu_ctrlc_occurred) { rel_crit(gv_cur_region); return TRUE; } else if (csa->ti->total_blks != old_total || new_total == old_total) { assert(csa->ti->total_blks >= old_total); /* Better have been an extend, not a truncate... */ gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(5) ERR_MUTRUNCNOSPACE, 3, REG_LEN_STR(gv_cur_region), truncate_percent); rel_crit(gv_cur_region); return TRUE; } else if (GDSVCURR != csd->desired_db_format || csd->blks_to_upgrd != 0 || !csd->fully_upgraded) { gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(4) ERR_MUTRUNCNOV4, 2, REG_LEN_STR(gv_cur_region)); rel_crit(gv_cur_region); return TRUE; } else if (SNAPSHOTS_IN_PROG(csa->nl)) { gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(4) ERR_MUTRUNCSSINPROG, 2, REG_LEN_STR(gv_cur_region)); rel_crit(gv_cur_region); return TRUE; } else if (BACKUP_NOT_IN_PROGRESS != cs_addrs->nl->nbb) { gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(4) ERR_MUTRUNCBACKINPROG, 2, REG_LEN_STR(gv_cur_region)); rel_crit(gv_cur_region); return TRUE; } DEFER_INTERRUPTS(INTRPT_IN_TRUNC, prev_intrpt_state); if (JNL_ENABLED(csa)) { /* Write JRT_TRUNC and INCTN records */ if (!jgbl.dont_reset_gbl_jrec_time) SET_GBL_JREC_TIME; /* needed before jnl_ensure_open as that can write jnl records */ jpc = csa->jnl; jbp = jpc->jnl_buff; /* Before writing to jnlfile, adjust jgbl.gbl_jrec_time if needed to maintain time order * of jnl records. This needs to be done BEFORE the jnl_ensure_open as that could write * journal records (if it decides to switch to a new journal file). 
*/ ADJUST_GBL_JREC_TIME(jgbl, jbp); jnl_status = jnl_ensure_open(); if (SS_NORMAL != jnl_status) send_msg_csa(CSA_ARG(csa) VARLSTCNT(6) jnl_status, 4, JNL_LEN_STR(csd), DB_LEN_STR(gv_cur_region)); else { if (0 == jpc->pini_addr) jnl_put_jrt_pini(csa); jnl_write_trunc_rec(csa, old_total, csa->ti->free_blocks, new_total); inctn_opcode = inctn_mu_reorg; jnl_write_inctn_rec(csa); jnl_status = jnl_flush(gv_cur_region); if (SS_NORMAL != jnl_status) { send_msg_csa(CSA_ARG(csa) VARLSTCNT(9) ERR_JNLFLUSH, 2, JNL_LEN_STR(csd), ERR_TEXT, 2, RTS_ERROR_TEXT("Error with journal flush during mu_truncate"), jnl_status); assert(NOJNL == jpc->channel); /* jnl file lost has been triggered */ } } } /* Good to go ahead and REALLY truncate (reduce total_blks, clear cache_array, FTRUNCATE) */ curr_tn = csa->ti->curr_tn; CHECK_TN(csa, csd, curr_tn); udi = FILE_INFO(gv_cur_region); /* Information used by recover_truncate to check if the file size and csa->ti->total_blks are INCONSISTENT */ trunc_file_size = BLK_ZERO_OFF(csd) + ((off_t)csd->blk_size * new_total) + DISK_BLOCK_SIZE; csd->after_trunc_total_blks = new_total; csd->before_trunc_free_blocks = csa->ti->free_blocks; csd->before_trunc_total_blks = old_total; /* Flags interrupted truncate for recover_truncate */ /* file size and total blocks: INCONSISTENT */ csa->ti->total_blks = new_total; /* past the point of no return -- shared memory intact */ assert(csa->ti->free_blocks >= DELTA_FREE_BLOCKS(old_total, new_total)); csa->ti->free_blocks -= DELTA_FREE_BLOCKS(old_total, new_total); new_free = csa->ti->free_blocks; KILL_TRUNC_TEST(WBTEST_CRASH_TRUNCATE_1); /* 55 : Issue a kill -9 before 1st fsync */ fileheader_sync(gv_cur_region); DB_FSYNC(gv_cur_region, udi, csa, db_fsync_in_prog, save_errno); CHECK_DBSYNC(gv_cur_region, save_errno); /* past the point of no return -- shared memory deleted */ KILL_TRUNC_TEST(WBTEST_CRASH_TRUNCATE_2); /* 56 : Issue a kill -9 after 1st fsync */ clear_cache_array(csa, csd, gv_cur_region, new_total, old_total); WRITE_EOF_BLOCK(gv_cur_region, csd, new_total, save_errno); if (0 != save_errno) { err_msg = (char *)STRERROR(errno); rts_error_csa(CSA_ARG(csa) VARLSTCNT(6) ERR_MUTRUNCERROR, 4, REG_LEN_STR(gv_cur_region), LEN_AND_STR(err_msg)); return FALSE; } KILL_TRUNC_TEST(WBTEST_CRASH_TRUNCATE_3); /* 57 : Issue a kill -9 after reducing csa->ti->total_blks, before FTRUNCATE */ /* Execute an ftruncate() and truncate the DB file * ftruncate() is a SYSTEM CALL on almost all platforms (except SunOS) * It ignores kill -9 signal till its operation is completed. * So we can safely assume that the result of ftruncate() will be complete. */ FTRUNCATE(FILE_INFO(gv_cur_region)->fd, trunc_file_size, ftrunc_status); if (0 != ftrunc_status) { err_msg = (char *)STRERROR(errno); rts_error_csa(CSA_ARG(csa) VARLSTCNT(6) ERR_MUTRUNCERROR, 4, REG_LEN_STR(gv_cur_region), LEN_AND_STR(err_msg)); /* should go through recover_truncate now, which will again try to FTRUNCATE */ return FALSE; } /* file size and total blocks: CONSISTENT (shrunk) */ KILL_TRUNC_TEST(WBTEST_CRASH_TRUNCATE_4); /* 58 : Issue a kill -9 after FTRUNCATE, before 2nd fsync */ csa->nl->root_search_cycle++; /* Force concurrent processes to restart in t_end/tp_tend to make sure no one * tries to commit updates past the end of the file. Bitmap validations together * with highest_lbm_with_busy_blk should actually be sufficient, so this is * just to be safe. 
	 */
	csd->before_trunc_total_blks = 0;	/* indicate CONSISTENT */
	/* Increment TN */
	assert(csa->ti->early_tn == csa->ti->curr_tn);
	csd->trans_hist.early_tn = csd->trans_hist.curr_tn + 1;
	INCREMENT_CURR_TN(csd);
	fileheader_sync(gv_cur_region);
	DB_FSYNC(gv_cur_region, udi, csa, db_fsync_in_prog, save_errno);
	KILL_TRUNC_TEST(WBTEST_CRASH_TRUNCATE_5);	/* 58 : Issue a kill -9 after 2nd fsync */
	CHECK_DBSYNC(gv_cur_region, save_errno);
	ENABLE_INTERRUPTS(INTRPT_IN_TRUNC, prev_intrpt_state);
	curr_tn = csa->ti->curr_tn;
	rel_crit(gv_cur_region);
	send_msg_csa(CSA_ARG(csa) VARLSTCNT(7) ERR_MUTRUNCSUCCESS, 5, DB_LEN_STR(gv_cur_region), old_total, new_total, &curr_tn);
	util_out_print("Truncated region: !AD. Reduced total blocks from [!UL] to [!UL]. Reduced free blocks from [!UL] to [!UL].",
			FLUSH, REG_LEN_STR(gv_cur_region), old_total, new_total, old_free, new_free);
	return TRUE;
}	/* END of mu_truncate() */
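
/*
 * Illustrative sketch (not part of mu_truncate): how the new database size is derived in phase 2
 * above.  The file can only be cut back on a local-bitmap boundary, and only beyond the highest
 * bitmap known to cover a busy block, so the new total is that bitmap's block number plus one full
 * bitmap's worth of blocks, capped at the old total.  The constant and names below are hypothetical.
 */
#define SKETCH_BLKS_PER_LMAP	512

static long sketch_new_total_blks(long old_total, long highest_lbm_with_busy_blk)
{
	long	new_total;

	new_total = highest_lbm_with_busy_blk + SKETCH_BLKS_PER_LMAP;
	return (new_total < old_total) ? new_total : old_total;	/* never grow the file */
}
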
void dse_f_reg(void)
{
	char		rn[MAX_RN_LEN];
	unsigned short	rnlen;
	int		i;
	bool		found;
	gd_region	*ptr;
	gd_addr		*temp_gdaddr;
	gd_binding	*map;

	temp_gdaddr = gd_header;
	gd_header = original_header;
	rnlen = SIZEOF(rn);
	if (!cli_get_str("REGION", rn, &rnlen))
	{
		gd_header = temp_gdaddr;
		return;
	}
	if (rn[0] == '*' && rnlen == 1)
	{
		util_out_print("List of global directory:!_!AD!/", TRUE, dollar_zgbldir.str.len, dollar_zgbldir.str.addr);
		for (i = 0, ptr = gd_header->regions; i < gd_header->n_regions; i++, ptr++)
		{
			util_out_print("!/File !_!AD", TRUE, ptr->dyn.addr->fname_len, &ptr->dyn.addr->fname[0]);
			util_out_print("Region!_!AD", TRUE, REG_LEN_STR(ptr));
		}
		gd_header = temp_gdaddr;
		return;
	}
	assert(rn[0]);
	found = FALSE;
	for (i = 0, ptr = gd_header->regions; i < gd_header->n_regions; i++, ptr++)
	{
		if (found = !memcmp(&ptr->rname[0], &rn[0], MAX_RN_LEN))
			break;
	}
	if (!found)
	{
		util_out_print("Error: region not found.", TRUE);
		gd_header = temp_gdaddr;
		return;
	}
	if (ptr == gv_cur_region)
	{
		util_out_print("Error: already in region: !AD", TRUE, REG_LEN_STR(gv_cur_region));
		gd_header = temp_gdaddr;
		return;
	}
	if (ptr->dyn.addr->acc_meth == dba_cm)
	{
		util_out_print("Error: Cannot edit a GT.CM database file.", TRUE);
		gd_header = temp_gdaddr;
		return;
	}
	if (ptr->dyn.addr->acc_meth == dba_usr)
	{
		util_out_print("Error: Cannot edit a non-GDS format database file.", TRUE);
		gd_header = temp_gdaddr;
		return;
	}
	if (!ptr->open)
	{
		util_out_print("Error: that region was not opened because it is not bound to any namespace.", TRUE);
		gd_header = temp_gdaddr;
		return;
	}
	if (TRUE == cs_addrs->now_crit)
	{
		util_out_print("Warning: now leaving region in critical section: !AD", TRUE,
				gv_cur_region->rname_len, gv_cur_region->rname);
	}
	gv_cur_region = ptr;
	gv_target = NULL;	/* to prevent out-of-sync situations between gv_target and cs_addrs */
	gv_currkey->base[0] = '\0';	/* prevent fast-path in op_gvname from being taken as region has been switched
					 * and gv_target has been reset to NULL.
					 */
	gv_currkey->end = 0;	/* clear end so it is in sync with base[0] */
	switch (gv_cur_region->dyn.addr->acc_meth)
	{
		case dba_mm:
		case dba_bg:
			cs_addrs = &FILE_INFO(gv_cur_region)->s_addrs;
			cs_data = cs_addrs->hdr;
			break;
		default:
			GTMASSERT;
	}
	if (cs_addrs && cs_addrs->critical)
		crash_count = cs_addrs->critical->crashcnt;
	util_out_print("!/File !_!AD", TRUE, DB_LEN_STR(gv_cur_region));
	util_out_print("Region!_!AD!/", TRUE, REG_LEN_STR(gv_cur_region));
	patch_curr_blk = get_dir_root();
	gv_init_reg(gv_cur_region);
	GET_SAVED_GDADDR(gd_header, temp_gdaddr, map, gv_cur_region);
	return;
}
int main(int argc, char *argv[]) { DCL_THREADGBL_ACCESS; GTM_THREADGBL_INIT; common_startup_init(DSE_IMAGE); licensed = TRUE; TREF(transform) = TRUE; TREF(no_spangbls) = TRUE; /* dse operates on a per-region basis irrespective of global mapping in gld */ TREF(skip_file_corrupt_check) = TRUE; /* do not let csd->file_corrupt flag cause errors in dse */ op_open_ptr = op_open; patch_curr_blk = get_dir_root(); err_init(util_base_ch); UNICODE_ONLY(gtm_strToTitle_ptr = >m_strToTitle); GTM_ICU_INIT_IF_NEEDED; /* Note: should be invoked after err_init (since it may error out) and before CLI parsing */ sig_init(generic_signal_handler, dse_ctrlc_handler, suspsigs_handler, continue_handler); atexit(util_exit_handler); SET_LATCH_GLOBAL(&defer_latch, LOCK_AVAILABLE); stp_init(STP_INITSIZE); rts_stringpool = stringpool; getjobname(); INVOKE_INIT_SECSHR_ADDRS; io_init(TRUE); getzdir(); gtm_chk_dist(argv[0]); prealloc_gt_timers(); gt_timers_add_safe_hndlrs(); initialize_pattern_table(); gvinit(); region_init(FALSE); util_out_print("!/File !_!AD", TRUE, DB_LEN_STR(gv_cur_region)); util_out_print("Region!_!AD!/", TRUE, REG_LEN_STR(gv_cur_region)); cli_lex_setup(argc, argv); /* Since DSE operates on a region-by-region basis (for the most part), do not use a global directory at all from now on */ original_header = gd_header; gd_header = NULL; OPERATOR_LOG_MSG; # ifdef DEBUG if ((gtm_white_box_test_case_enabled && (WBTEST_SEMTOOLONG_STACK_TRACE == gtm_white_box_test_case_number) )) { sgmnt_addrs * csa; node_local_ptr_t cnl; csa = &FILE_INFO(gv_cur_region)->s_addrs; cnl = csa->nl; cnl->wbox_test_seq_num = 1; /*Signal the first step and wait here*/ /* The signal to the shell. MUPIP must not start BEFORE DSE */ util_out_print("DSE is ready. MUPIP can start. Note: This message is a part of WBTEST_SEMTOOLONG_STACK_TRACE test. " "It will not appear in PRO version.", TRUE); while (2 != cnl->wbox_test_seq_num) /*Wait for another process to get hold of the semaphore and signal next step*/ LONG_SLEEP(1); } # endif if (argc < 2) display_prompt(); while (1) { if (!dse_process(argc)) break; display_prompt(); } dse_exit(); REVERT; return 0; }
void gv_rundown(void) { gd_region *r_top, *r_save, *r_local; gd_addr *addr_ptr; sgm_info *si; int4 rundown_status = EXIT_NRM; /* if gds_rundown went smoothly */ # ifdef VMS vms_gds_info *gds_info; # elif UNIX unix_db_info *udi; # endif #if defined(DEBUG) && defined(UNIX) sgmnt_addrs *csa; # endif DCL_THREADGBL_ACCESS; SETUP_THREADGBL_ACCESS; r_save = gv_cur_region; /* Save for possible core dump */ gvcmy_rundown(); ENABLE_AST if (pool_init) rel_lock(jnlpool.jnlpool_dummy_reg); for (addr_ptr = get_next_gdr(NULL); addr_ptr; addr_ptr = get_next_gdr(addr_ptr)) { for (r_local = addr_ptr->regions, r_top = r_local + addr_ptr->n_regions; r_local < r_top; r_local++) { if (r_local->open && !r_local->was_open && dba_cm != r_local->dyn.addr->acc_meth) { /* Rundown has already occurred for GT.CM client regions through gvcmy_rundown() above. * Hence the (dba_cm != ...) check in the if above. Note that for GT.CM client regions, * region->open is TRUE although cs_addrs is NULL. */ # if defined(DEBUG) && defined(UNIX) if (is_jnlpool_creator && ANTICIPATORY_FREEZE_AVAILABLE && TREF(gtm_test_fake_enospc)) { /* Clear ENOSPC faking now that we are running down */ csa = REG2CSA(r_local); if (csa->nl->fake_db_enospc || csa->nl->fake_jnl_enospc) { send_msg_csa(CSA_ARG(NULL) VARLSTCNT(8) ERR_TEXT, 2, DB_LEN_STR(r_local), ERR_TEXT, 2, LEN_AND_LIT("Resetting fake_db_enospc and fake_jnl_enospc")); csa->nl->fake_db_enospc = FALSE; csa->nl->fake_jnl_enospc = FALSE; } } # endif gv_cur_region = r_local; tp_change_reg(); UNIX_ONLY(rundown_status |=) gds_rundown(); /* Now that gds_rundown is done, free up the memory associated with the region. * Ideally the following memory freeing code should go to gds_rundown, but * GT.CM calls gds_rundown() and we want to reuse memory for GT.CM. */ if (NULL != cs_addrs) { if (NULL != cs_addrs->dir_tree) FREE_CSA_DIR_TREE(cs_addrs); if (cs_addrs->sgm_info_ptr) { si = cs_addrs->sgm_info_ptr; /* It is possible we got interrupted before initializing all fields of "si" * completely so account for NULL values while freeing/releasing those fields. 
*/ assert((si->tp_csa == cs_addrs) || (NULL == si->tp_csa)); if (si->jnl_tail) { CAREFUL_FREEUP_BUDDY_LIST(si->format_buff_list); CAREFUL_FREEUP_BUDDY_LIST(si->jnl_list); } CAREFUL_FREEUP_BUDDY_LIST(si->recompute_list); CAREFUL_FREEUP_BUDDY_LIST(si->new_buff_list); CAREFUL_FREEUP_BUDDY_LIST(si->tlvl_info_list); CAREFUL_FREEUP_BUDDY_LIST(si->tlvl_cw_set_list); CAREFUL_FREEUP_BUDDY_LIST(si->cw_set_list); if (NULL != si->blks_in_use) { free_hashtab_int4(si->blks_in_use); free(si->blks_in_use); si->blks_in_use = NULL; } if (si->cr_array_size) { assert(NULL != si->cr_array); if (NULL != si->cr_array) free(si->cr_array); } if (NULL != si->first_tp_hist) free(si->first_tp_hist); free(si); } if (cs_addrs->jnl) { assert(&FILE_INFO(cs_addrs->jnl->region)->s_addrs == cs_addrs); if (cs_addrs->jnl->jnllsb) { UNIX_ONLY(assert(FALSE)); free(cs_addrs->jnl->jnllsb); } free(cs_addrs->jnl); } GTMCRYPT_ONLY( if (cs_addrs->encrypted_blk_contents) free(cs_addrs->encrypted_blk_contents); ) } assert(gv_cur_region->dyn.addr->file_cntl->file_info); VMS_ONLY( gds_info = (vms_gds_info *)gv_cur_region->dyn.addr->file_cntl->file_info; if (gds_info->xabpro) free(gds_info->xabpro); if (gds_info->xabfhc) free(gds_info->xabfhc); if (gds_info->nam) { free(gds_info->nam->nam$l_esa); free(gds_info->nam); } if (gds_info->fab) free(gds_info->fab); ) free(gv_cur_region->dyn.addr->file_cntl->file_info); free(gv_cur_region->dyn.addr->file_cntl); } r_local->open = r_local->was_open = FALSE; } }
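/* Editor's note: illustrative sketch only, not GT.M code. It shows the defensive-teardown idiom used in
 * gv_rundown() above when freeing the sgm_info fields: a rundown routine may run against a structure whose
 * later fields were never initialized, so every pointer is checked before it is freed and cleared afterwards
 * so that a repeated rundown is harmless. The names demo_info and demo_rundown are hypothetical. */
#include <stdlib.h>

typedef struct demo_info_struct
{
	char	*scratch_buff;	/* may still be NULL if initialization was interrupted */
	int	*blk_array;	/* may still be NULL if initialization was interrupted */
	size_t	blk_array_len;
} demo_info;

static void demo_rundown(demo_info *si)
{
	if (NULL == si)
		return;
	if (NULL != si->scratch_buff)
	{	/* free and clear so a second rundown attempt is a no-op */
		free(si->scratch_buff);
		si->scratch_buff = NULL;
	}
	if (NULL != si->blk_array)
	{
		free(si->blk_array);
		si->blk_array = NULL;
		si->blk_array_len = 0;
	}
	free(si);
}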
void jnl_fsync(gd_region *reg, uint4 fsync_addr) { jnl_private_control *jpc; jnl_buffer_ptr_t jb; uint4 lcnt, saved_dsk_addr, saved_status; sgmnt_addrs *csa; sgmnt_data_ptr_t csd; int4 lck_state; int fsync_ret, save_errno; error_def(ERR_JNLFSYNCERR); error_def(ERR_FSYNCTIMOUT); error_def(ERR_TEXT); error_def(ERR_JNLFRCDTERM); error_def(ERR_JNLFSYNCLSTCK); csa = &FILE_INFO(reg)->s_addrs; jpc = csa->jnl; jb = jpc->jnl_buff; if ((NOJNL != jpc->channel) && !JNL_FILE_SWITCHED(jpc)) { csd = csa->hdr; for (lcnt = 1; fsync_addr > jb->fsync_dskaddr && !JNL_FILE_SWITCHED(jpc); lcnt++) { if (MAX_FSYNC_WAIT_CNT / 2 == lcnt) /* half way into max.patience*/ { saved_status = jpc->status; jpc->status = SS_NORMAL; jnl_send_oper(jpc, ERR_JNLFSYNCLSTCK); jpc->status = saved_status ; } if (MAX_FSYNC_WAIT_CNT == lcnt) /* tried a long */ { saved_status = jpc->status; jpc->status = SS_NORMAL; jnl_send_oper(jpc, ERR_JNLFSYNCLSTCK); jpc->status = saved_status ; send_msg(VARLSTCNT(4) ERR_FSYNCTIMOUT, 2, JNL_LEN_STR(csd)); GTMASSERT; } BG_TRACE_PRO_ANY(csa, n_jnl_fsync_tries); if (GET_SWAPLOCK(&jb->fsync_in_prog_latch)) break; wcs_sleep(lcnt); performCASLatchCheck(&jb->fsync_in_prog_latch, lcnt); } if (fsync_addr > jb->fsync_dskaddr && !JNL_FILE_SWITCHED(jpc)) { assert(process_id == jb->fsync_in_prog_latch.u.parts.latch_pid); /* assert we have the lock */ saved_dsk_addr = jb->dskaddr; if (jpc->sync_io) { /* We need to maintain the fsync control fields irrespective of the type of IO, because we might * switch between these at any time. */ jb->fsync_dskaddr = saved_dsk_addr; } else { GTM_FSYNC(jpc->channel, fsync_ret); if (-1 == fsync_ret) { save_errno = errno; assert(FALSE); send_msg(VARLSTCNT(9) ERR_JNLFSYNCERR, 2, JNL_LEN_STR(csd), ERR_TEXT, 2, RTS_ERROR_TEXT("Error with fsync"), save_errno); rts_error(VARLSTCNT(9) ERR_JNLFSYNCERR, 2, JNL_LEN_STR(csd), ERR_TEXT, 2, RTS_ERROR_TEXT("Error with fsync"), save_errno); } else { jb->fsync_dskaddr = saved_dsk_addr; BG_TRACE_PRO_ANY(csa, n_jnl_fsyncs); } } } if (process_id == jb->fsync_in_prog_latch.u.parts.latch_pid) RELEASE_SWAPLOCK(&jb->fsync_in_prog_latch); } return; }
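/* Editor's note: illustrative sketch only, not the GT.M latch API. It mirrors the waiting pattern in
 * jnl_fsync() above: poll a lock with a bounded retry count, emit a diagnostic half way through the patience
 * interval, and treat exhaustion of the full interval as fatal (the code above uses GTMASSERT at that point).
 * The names demo_latch, DEMO_MAX_WAIT and demo_grab_latch are hypothetical stand-ins. */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

#define DEMO_MAX_WAIT	64	/* analogous to MAX_FSYNC_WAIT_CNT above */

static atomic_flag demo_latch = ATOMIC_FLAG_INIT;

static bool demo_grab_latch(void)
{
	unsigned int	lcnt;

	for (lcnt = 1; ; lcnt++)
	{
		if (!atomic_flag_test_and_set(&demo_latch))
			return true;					/* got the latch */
		if (DEMO_MAX_WAIT / 2 == lcnt)
			fprintf(stderr, "demo: still waiting for latch (half of patience used)\n");
		if (DEMO_MAX_WAIT == lcnt)
		{	/* waited the full patience interval; give up loudly */
			fprintf(stderr, "demo: latch wait timed out\n");
			return false;
		}
		usleep(1000u * lcnt);					/* back off a little longer each retry */
	}
}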
void lke_clear(void) { bool locks, all = TRUE, wait = FALSE, interactive = TRUE, match = FALSE, memory = FALSE, nocrit = FALSE; boolean_t exact = TRUE, was_crit; int4 pid; int n; char regbuf[MAX_RN_LEN], nodebuf[32], one_lockbuf[MAX_KEY_SZ]; mlk_ctldata_ptr_t ctl; mstr reg, node, one_lock; /* Get all command parameters */ reg.addr = regbuf; reg.len = SIZEOF(regbuf); node.addr = nodebuf; node.len = SIZEOF(nodebuf); one_lock.addr = one_lockbuf; one_lock.len = SIZEOF(one_lockbuf); if (lke_getcli(&all, &wait, &interactive, &pid, &reg, &node, &one_lock, &memory, &nocrit, &exact) == 0) return; /* Search all regions specified on the command line */ for (gv_cur_region = gd_header->regions, n = 0; n != gd_header->n_regions; ++gv_cur_region, ++n) { /* If region matches and is open */ if ((reg.len == 0 || gv_cur_region->rname_len == reg.len && memcmp(gv_cur_region->rname, reg.addr, reg.len) == 0) && gv_cur_region->open) { match = TRUE; util_out_print("!/!AD!/", NOFLUSH, REG_LEN_STR(gv_cur_region)); /* If distributed database, the region is located on another node */ if (gv_cur_region->dyn.addr->acc_meth == dba_cm) { # if defined(LKE_WORKS_OK_WITH_CM) /* Remote lock clears are not supported, so LKE CLEAR -EXACT qualifier * will not be supported on GT.CM.*/ locks = gtcmtr_lke_clearreq(gv_cur_region->dyn.addr->cm_blk, gv_cur_region->cmx_regnum, all, interactive, pid, &node); # else gtm_putmsg(VARLSTCNT(10) ERR_UNIMPLOP, 0, ERR_TEXT, 2, LEN_AND_LIT("GT.CM region - locks must be cleared on the local node"), ERR_TEXT, 2, REG_LEN_STR(gv_cur_region)); continue; # endif } else if ((dba_bg == gv_cur_region->dyn.addr->acc_meth) || (dba_mm == gv_cur_region->dyn.addr->acc_meth)) { /* Local region */ cs_addrs = &FILE_INFO(gv_cur_region)->s_addrs; ctl = (mlk_ctldata_ptr_t)cs_addrs->lock_addrs[0]; /* Prevent any modifications of locks while we are clearing */ if (cs_addrs->critical != NULL) crash_count = cs_addrs->critical->crashcnt; was_crit = cs_addrs->now_crit; if (!was_crit) grab_crit(gv_cur_region); locks = ctl->blkroot == 0 ? FALSE : lke_cleartree(gv_cur_region, NULL, ctl, (mlk_shrblk_ptr_t)R2A(ctl->blkroot), all, interactive, pid, one_lock, exact); if (!was_crit) rel_crit(gv_cur_region); } else { gtm_putmsg(VARLSTCNT(2) ERR_BADREGION, 0); locks = TRUE; } if (!locks) { gtm_putmsg(VARLSTCNT(4) ERR_NOLOCKMATCH, 2, REG_LEN_STR(gv_cur_region)); } } } if (!match && reg.len != 0) rts_error(VARLSTCNT(4) ERR_NOREGION, 2, reg.len, reg.addr); }
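/* Editor's note: illustrative sketch only. The region-selection test in lke_clear() above relies on '&&'
 * binding tighter than '||': an empty name matches every region, otherwise both the length and the bytes
 * must match. The helper below spells the same predicate out with explicit parentheses; demo_region_matches
 * is a hypothetical name. */
#include <stdbool.h>
#include <string.h>

static bool demo_region_matches(const char *rname, unsigned int rname_len, const char *want, unsigned int want_len)
{
	if (0 == want_len)
		return true;		/* no region name given on the command line: match every region */
	return ((rname_len == want_len) && (0 == memcmp(rname, want, want_len)));
}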
int4 gds_rundown(void) { boolean_t canceled_dbsync_timer, canceled_flush_timer, ok_to_write_pfin; boolean_t have_standalone_access, ipc_deleted, err_caught; boolean_t is_cur_process_ss_initiator, remove_shm, vermismatch, we_are_last_user, we_are_last_writer, is_mm; boolean_t unsafe_last_writer; char time_str[CTIME_BEFORE_NL + 2]; /* for GET_CUR_TIME macro */ gd_region *reg; int save_errno, status, rc; int4 semval, ftok_semval, sopcnt, ftok_sopcnt; short crash_count; sm_long_t munmap_len; sgmnt_addrs *csa; sgmnt_data_ptr_t csd; node_local_ptr_t cnl; struct shmid_ds shm_buf; struct sembuf sop[2], ftok_sop[2]; uint4 jnl_status; unix_db_info *udi; jnl_private_control *jpc; jnl_buffer_ptr_t jbp; shm_snapshot_t *ss_shm_ptr; uint4 ss_pid, onln_rlbk_pid, holder_pid; boolean_t was_crit; boolean_t safe_mode; /* Do not flush or take down shared memory. */ boolean_t bypassed_ftok = FALSE, bypassed_access = FALSE, may_bypass_ftok, inst_is_frozen, ftok_counter_halted, access_counter_halted; int secshrstat; intrpt_state_t prev_intrpt_state; DCL_THREADGBL_ACCESS; SETUP_THREADGBL_ACCESS; jnl_status = 0; reg = gv_cur_region; /* Local copy */ /* early out for cluster regions * to avoid tripping the assert below. * Note: * This early out is consistent with VMS. It has been * noted that all of the gtcm assignments * to gv_cur_region should use the TP_CHANGE_REG * macro. This would also avoid the assert problem * and should be done eventually. */ if (dba_cm == reg->dyn.addr->acc_meth) return EXIT_NRM; udi = FILE_INFO(reg); csa = &udi->s_addrs; csd = csa->hdr; assert(csa == cs_addrs && csd == cs_data); if ((reg->open) && (dba_usr == csd->acc_meth)) { change_reg(); gvusr_rundown(); return EXIT_NRM; } /* If the process has standalone access, it has udi->grabbed_access_sem set to TRUE at this point. Note that down in a local * variable as the udi->grabbed_access_sem is set to TRUE even for non-standalone access below and hence we can't rely on * that later to determine if the process had standalone access or not when it entered this function. We need to guarantee * that none else access database file header when semid/shmid fields are reset. We already have created ftok semaphore in * db_init or, mu_rndwn_file and did not remove it. So just lock it. We do it in blocking mode. */ have_standalone_access = udi->grabbed_access_sem; /* process holds standalone access */ DEFER_INTERRUPTS(INTRPT_IN_GDS_RUNDOWN, prev_intrpt_state); ESTABLISH_NORET(gds_rundown_ch, err_caught); if (err_caught) { REVERT; WITH_CH(gds_rundown_ch, gds_rundown_err_cleanup(have_standalone_access), 0); ENABLE_INTERRUPTS(INTRPT_IN_GDS_RUNDOWN, prev_intrpt_state); DEBUG_ONLY(ok_to_UNWIND_in_exit_handling = FALSE); return EXIT_ERR; } assert(reg->open); /* if we failed to open, dbinit_ch should have taken care of proper clean up */ assert(!reg->opening); /* see comment above */ assert((dba_bg == csd->acc_meth) || (dba_mm == csd->acc_meth)); is_mm = (dba_bg != csd->acc_meth); assert(!csa->hold_onto_crit || (csa->now_crit && jgbl.onlnrlbk)); /* If we are online rollback, we should already be holding crit and should release it only at the end of this module. This * is usually done by noting down csa->now_crit in a local variable (was_crit) and using it whenever we are about to * grab_crit. But, there are instances (like mupip_set_journal.c) where we grab_crit but invoke gds_rundown without any * preceeding rel_crit. Such code relies on the fact that gds_rundown does rel_crit unconditionally (to get locks to a known * state). 
So, augment csa->now_crit with jgbl.onlnrlbk to track if we can rel_crit unconditionally or not in gds_rundown. */ was_crit = (csa->now_crit && jgbl.onlnrlbk); /* Cancel any pending flush timer for this region by this task */ canceled_flush_timer = FALSE; canceled_dbsync_timer = FALSE; CANCEL_DB_TIMERS(reg, csa, canceled_flush_timer, canceled_dbsync_timer); we_are_last_user = FALSE; inst_is_frozen = IS_REPL_INST_FROZEN && REPL_ALLOWED(csa->hdr); if (!csa->persistent_freeze) region_freeze(reg, FALSE, FALSE, FALSE); if (!was_crit) { rel_crit(reg); /* get locks to known state */ mutex_cleanup(reg); } /* The only process that can invoke gds_rundown while holding access control semaphore is RECOVER/ROLLBACK. All the others * (like MUPIP SET -FILE/MUPIP EXTEND would have invoked db_ipcs_reset() before invoking gds_rundown (from * mupip_exit_handler). The only exception is when these processes encounter a terminate signal and they reach * mupip_exit_handler while holding access control semaphore. Assert accordingly. */ assert(!have_standalone_access || mupip_jnl_recover || process_exiting); /* If we have standalone access, then ensure that a concurrent online rollback cannot be running at the same time as it * needs the access control lock as well. The only expection is we are online rollback and currently running down. */ cnl = csa->nl; onln_rlbk_pid = cnl->onln_rlbk_pid; assert(!have_standalone_access || mupip_jnl_recover || !onln_rlbk_pid || !is_proc_alive(onln_rlbk_pid, 0)); if (!have_standalone_access) { if (-1 == (ftok_semval = semctl(udi->ftok_semid, DB_COUNTER_SEM, GETVAL))) /* Check # of procs counted on FTOK */ { save_errno = errno; assert(FALSE); rts_error_csa(CSA_ARG(csa) VARLSTCNT(12) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg), ERR_SYSCALL, 5, RTS_ERROR_TEXT("gds_rundown SEMCTL failed to get ftok_semval"), CALLFROM, errno); } may_bypass_ftok = CAN_BYPASS(ftok_semval, csd, inst_is_frozen); /* Do we need a blocking wait? */ /* We need to guarantee that no one else access database file header when semid/shmid fields are reset. * We already have created ftok semaphore in db_init or mu_rndwn_file and did not remove it. So just lock it. */ if (!ftok_sem_lock(reg, may_bypass_ftok)) { if (may_bypass_ftok) { /* We did a non-blocking wait. It's ok to proceed without locking */ bypassed_ftok = TRUE; holder_pid = semctl(udi->ftok_semid, DB_CONTROL_SEM, GETPID); if ((uint4)-1 == holder_pid) rts_error_csa(CSA_ARG(csa) VARLSTCNT(12) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg), ERR_SYSCALL, 5, RTS_ERROR_TEXT("gds_rundown SEMCTL failed to get holder_pid"), CALLFROM, errno); if (!IS_GTM_IMAGE) /* MUMPS processes should not flood syslog with bypass messages. */ { send_msg_csa(CSA_ARG(csa) VARLSTCNT(12) ERR_RESRCINTRLCKBYPAS, 10, LEN_AND_STR(gtmImageNames[image_type].imageName), process_id, LEN_AND_LIT("FTOK"), REG_LEN_STR(reg), DB_LEN_STR(reg), holder_pid); send_msg_csa(CSA_ARG(NULL) VARLSTCNT(4) ERR_TEXT, 2, LEN_AND_LIT("FTOK bypassed at rundown")); } } else { /* We did a blocking wait but something bad happened. 
*/ FTOK_TRACE(csa, csa->ti->curr_tn, ftok_ops_lock, process_id); rts_error_csa(CSA_ARG(csa) VARLSTCNT(4) ERR_DBFILERR, 2, DB_LEN_STR(reg)); } } sop[0].sem_num = DB_CONTROL_SEM; sop[0].sem_op = 0; /* Wait for 0 */ sop[1].sem_num = DB_CONTROL_SEM; sop[1].sem_op = 1; /* Lock */ sopcnt = 2; sop[0].sem_flg = sop[1].sem_flg = SEM_UNDO | IPC_NOWAIT; /* Don't wait the first time thru */ SEMOP(udi->semid, sop, sopcnt, status, NO_WAIT); if (0 != status) { save_errno = errno; /* Check # of processes counted on access sem. */ if (-1 == (semval = semctl(udi->semid, DB_COUNTER_SEM, GETVAL))) { assert(FALSE); rts_error_csa(CSA_ARG(csa) VARLSTCNT(12) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg), ERR_SYSCALL, 5, RTS_ERROR_TEXT("gds_rundown SEMCTL failed to get semval"), CALLFROM, errno); } bypassed_access = CAN_BYPASS(semval, csd, inst_is_frozen) || onln_rlbk_pid || csd->file_corrupt; /* Before attempting again in the blocking mode, see if the holding process is an online rollback. * If so, it is likely we won't get the access control semaphore anytime soon. In that case, we * are better off skipping rundown and continuing with sanity cleanup and exit. */ holder_pid = semctl(udi->semid, DB_CONTROL_SEM, GETPID); if ((uint4)-1 == holder_pid) rts_error_csa(CSA_ARG(csa) VARLSTCNT(12) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg), ERR_SYSCALL, 5, RTS_ERROR_TEXT("gds_rundown SEMCTL failed to get holder_pid"), CALLFROM, errno); if (!bypassed_access) { /* We couldn't get it in one shot-- see if we already have it */ if (holder_pid == process_id) { send_msg_csa(CSA_ARG(csa) VARLSTCNT(5) MAKE_MSG_INFO(ERR_CRITSEMFAIL), 2, DB_LEN_STR(reg), ERR_RNDWNSEMFAIL); REVERT; ENABLE_INTERRUPTS(INTRPT_IN_GDS_RUNDOWN, prev_intrpt_state); assert(FALSE); return EXIT_ERR; } if (EAGAIN != save_errno) { assert(FALSE); rts_error_csa(CSA_ARG(csa) VARLSTCNT(12) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg), ERR_SYSCALL, 5, RTS_ERROR_TEXT("gds_rundown SEMOP on access control semaphore"), CALLFROM, save_errno); } sop[0].sem_flg = sop[1].sem_flg = SEM_UNDO; /* Try again - blocking this time */ SEMOP(udi->semid, sop, 2, status, FORCED_WAIT); if (-1 == status) /* We couldn't get it at all.. */ rts_error_csa(CSA_ARG(csa) VARLSTCNT(12) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg), ERR_SYSCALL, 5, RTS_ERROR_TEXT("gds_rundown SEMOP on access control semaphore"), CALLFROM, errno); } else if (!IS_GTM_IMAGE) { send_msg_csa(CSA_ARG(csa) VARLSTCNT(12) ERR_RESRCINTRLCKBYPAS, 10, LEN_AND_STR(gtmImageNames[image_type].imageName), process_id, LEN_AND_LIT("access control"), REG_LEN_STR(reg), DB_LEN_STR(reg), holder_pid); send_msg_csa(CSA_ARG(NULL) VARLSTCNT(4) ERR_TEXT, 2, LEN_AND_LIT("Access control bypassed at rundown")); } udi->grabbed_access_sem = !bypassed_access; } } /* else we we hold the access control semaphore and therefore have standalone access. We do not release it now - we * release it later in mupip_exit_handler.c. Since we already hold the access control semaphore, we don't need the * ftok semaphore and trying it could cause deadlock */ /* Note that in the case of online rollback, "udi->grabbed_access_sem" (and in turn "have_standalone_access") is TRUE. * But there could be other processes still having the database open so we cannot safely reset the halted fields. */ if (have_standalone_access && !jgbl.onlnrlbk) csd->ftok_counter_halted = csd->access_counter_halted = FALSE; ftok_counter_halted = csd->ftok_counter_halted; access_counter_halted = csd->access_counter_halted; /* If we bypassed any of the semaphores, activate safe mode. 
* Also, if the replication instance is frozen and this db has replication turned on (which means * no flushes of dirty buffers to this db can happen while the instance is frozen) activate safe mode. */ ok_to_write_pfin = !(bypassed_access || bypassed_ftok || inst_is_frozen); safe_mode = !ok_to_write_pfin || ftok_counter_halted || access_counter_halted; /* At this point we are guaranteed no one else is doing a db_init/rundown as we hold the access control semaphore */ assert(csa->ref_cnt); /* decrement private ref_cnt before shared ref_cnt decrement. */ csa->ref_cnt--; /* Currently journaling logic in gds_rundown() in VMS relies on this order to detect last writer */ assert(!csa->ref_cnt); --cnl->ref_cnt; if (memcmp(cnl->now_running, gtm_release_name, gtm_release_name_len + 1)) { /* VERMISMATCH condition. Possible only if DSE */ assert(dse_running); vermismatch = TRUE; } else vermismatch = FALSE; if (-1 == shmctl(udi->shmid, IPC_STAT, &shm_buf)) { save_errno = errno; rts_error_csa(CSA_ARG(csa) VARLSTCNT(12) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg), ERR_SYSCALL, 5, RTS_ERROR_TEXT("gds_rundown shmctl"), CALLFROM, save_errno); } else we_are_last_user = (1 == shm_buf.shm_nattch) && !vermismatch && !safe_mode; /* recover => one user except ONLINE ROLLBACK, or standalone with frozen instance */ assert(!have_standalone_access || we_are_last_user || jgbl.onlnrlbk || inst_is_frozen); if (-1 == (semval = semctl(udi->semid, DB_COUNTER_SEM, GETVAL))) rts_error_csa(CSA_ARG(csa) VARLSTCNT(12) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg), ERR_SYSCALL, 5, RTS_ERROR_TEXT("gds_rundown SEMCTL failed to get semval"), CALLFROM, errno); /* There's one writer left and I am it */ assert(reg->read_only || semval >= 0); unsafe_last_writer = (DB_COUNTER_SEM_INCR == semval) && (FALSE == reg->read_only) && !vermismatch; we_are_last_writer = unsafe_last_writer && !safe_mode; assert(!we_are_last_writer || !safe_mode); assert(!we_are_last_user || !safe_mode); /* recover + R/W region => one writer except ONLINE ROLLBACK, or standalone with frozen instance, leading to safe_mode */ assert(!(have_standalone_access && !reg->read_only) || we_are_last_writer || jgbl.onlnrlbk || inst_is_frozen); GTM_WHITE_BOX_TEST(WBTEST_ANTIFREEZE_JNLCLOSE, we_are_last_writer, 1); /* Assume we are the last writer to invoke wcs_flu */ if (!have_standalone_access && (-1 == (ftok_semval = semctl(udi->ftok_semid, DB_COUNTER_SEM, GETVAL)))) rts_error_csa(CSA_ARG(csa) VARLSTCNT(12) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg), ERR_SYSCALL, 5, RTS_ERROR_TEXT("gds_rundown SEMCTL failed to get ftok_semval"), CALLFROM, errno); if (NULL != csa->ss_ctx) ss_destroy_context(csa->ss_ctx); /* SS_MULTI: If multiple snapshots are supported, then we have to run through each of the snapshots */ assert(1 == MAX_SNAPSHOTS); ss_shm_ptr = (shm_snapshot_ptr_t)SS_GETSTARTPTR(csa); ss_pid = ss_shm_ptr->ss_info.ss_pid; is_cur_process_ss_initiator = (process_id == ss_pid); if (ss_pid && (is_cur_process_ss_initiator || we_are_last_user)) { /* Try getting snapshot crit latch. If we don't get latch, we won't hang for eternity and will skip * doing the orphaned snapshot cleanup. It will be cleaned up eventually either by subsequent MUPIP * INTEG or by a MUPIP RUNDOWN. 
*/ if (ss_get_lock_nowait(reg) && (ss_pid == ss_shm_ptr->ss_info.ss_pid) && (is_cur_process_ss_initiator || !is_proc_alive(ss_pid, 0))) { ss_release(NULL); ss_release_lock(reg); } } /* If cnl->donotflush_dbjnl is set, it means mupip recover/rollback was interrupted and therefore we need not flush * shared memory contents to disk as they might be in an inconsistent state. Moreover, any more flushing will only cause * future rollback to undo more journal records (PBLKs). In this case, we will go ahead and remove shared memory (without * flushing the contents) in this routine. A reissue of the recover/rollback command will restore the database to a * consistent state. */ if (!cnl->donotflush_dbjnl && !reg->read_only && !vermismatch) { /* If we had an orphaned block and were interrupted, set wc_blocked so we can invoke wcs_recover. Do it ONLY * if there is NO concurrent online rollback running (as we need crit to set wc_blocked) */ if (csa->wbuf_dqd && !is_mm) { /* If we had an orphaned block and were interrupted, mupip_exit_handler will invoke secshr_db_clnup which * will clear this field and so we should never come to gds_rundown with a non-zero wbuf_dqd. The only * exception is if we are recover/rollback in which case gds_rundown (from mur_close_files) is invoked * BEFORE secshr_db_clnup in mur_close_files. * Note: It is NOT possible for online rollback to reach here with wbuf_dqd being non-zero. This is because * the moment we apply the first PBLK, we stop all interrupts and hence can never be interrupted in * wcs_wtstart or wcs_get_space. Assert accordingly. */ assert(mupip_jnl_recover && !jgbl.onlnrlbk && !safe_mode); if (!was_crit) grab_crit(reg); SET_TRACEABLE_VAR(cnl->wc_blocked, TRUE); BG_TRACE_PRO_ANY(csa, wcb_gds_rundown); send_msg_csa(CSA_ARG(csa) VARLSTCNT(8) ERR_WCBLOCKED, 6, LEN_AND_LIT("wcb_gds_rundown"), process_id, &csa->ti->curr_tn, DB_LEN_STR(reg)); csa->wbuf_dqd = 0; wcs_recover(reg); BG_TRACE_PRO_ANY(csa, lost_block_recovery); if (!was_crit) rel_crit(reg); } if (JNL_ENABLED(csd) && IS_GTCM_GNP_SERVER_IMAGE) originator_prc_vec = NULL; /* If we are the last writing user, then everything must be flushed */ if (we_are_last_writer) { /* Time to flush out all of our buffers */ assert(!safe_mode); if (is_mm) { MM_DBFILEXT_REMAP_IF_NEEDED(csa, reg); cnl->remove_shm = TRUE; } if (cnl->wc_blocked && jgbl.onlnrlbk) { /* if the last update done by online rollback was not committed in the normal code-path but was * completed by secshr_db_clnup, wc_blocked will be set to TRUE. But, since online rollback never * invokes grab_crit (since csa->hold_onto_crit is set to TRUE), wcs_recover is never invoked. This * could result in the last update never getting flushed to the disk and if online rollback happened * to be the last writer then the shared memory will be flushed and removed and the last update will * be lost. So, force wcs_recover if we find ourselves in such a situation. But, wc_blocked is * possible only if phase1 or phase2 errors are induced using white box test cases */ assert(WB_COMMIT_ERR_ENABLED); wcs_recover(reg); } /* Note WCSFLU_SYNC_EPOCH ensures the epoch is synced to the journal and indirectly * also ensures that the db is fsynced. We don't want to use it in the calls to * wcs_flu() from t_end() and tp_tend() since we can defer it to out-of-crit there. * In this case, since we are running down, we don't have any such option. 
*/ cnl->remove_shm = wcs_flu(WCSFLU_FLUSH_HDR | WCSFLU_WRITE_EPOCH | WCSFLU_SYNC_EPOCH); /* Since we_are_last_writer, we should be guaranteed that wcs_flu() did not change csd, (in * case of MM for potential file extension), even if it did a grab_crit(). Therefore, make * sure that's true. */ assert(csd == csa->hdr); assert(0 == memcmp(csd->label, GDS_LABEL, GDS_LABEL_SZ - 1)); } else if (((canceled_flush_timer && (0 > cnl->wcs_timers)) || canceled_dbsync_timer) && !inst_is_frozen) { /* canceled pending db or jnl flush timers - flush database and journal buffers to disk */ if (!was_crit) grab_crit(reg); /* we need to sync the epoch as the fact that there is no active pending flush timer implies * there will be noone else who will flush the dirty buffers and EPOCH to disk in a timely fashion */ wcs_flu(WCSFLU_FLUSH_HDR | WCSFLU_WRITE_EPOCH | WCSFLU_SYNC_EPOCH); if (!was_crit) rel_crit(reg); assert((dba_mm == cs_data->acc_meth) || (csd == cs_data)); csd = cs_data; /* In case this is MM and wcs_flu() remapped an extended database, reset csd */ } /* Do rundown journal processing after buffer flushes since they require jnl to be open */ if (JNL_ENABLED(csd)) { /* the following tp_change_reg() is not needed due to the assert csa == cs_addrs at the beginning * of gds_rundown(), but just to be safe. To be removed by 2002!! --- nars -- 2001/04/25. */ tp_change_reg(); /* call this because jnl_ensure_open checks cs_addrs rather than gv_cur_region */ jpc = csa->jnl; jbp = jpc->jnl_buff; if (jbp->fsync_in_prog_latch.u.parts.latch_pid == process_id) { assert(FALSE); COMPSWAP_UNLOCK(&jbp->fsync_in_prog_latch, process_id, 0, LOCK_AVAILABLE, 0); } if (jbp->io_in_prog_latch.u.parts.latch_pid == process_id) { assert(FALSE); COMPSWAP_UNLOCK(&jbp->io_in_prog_latch, process_id, 0, LOCK_AVAILABLE, 0); } if ((((NOJNL != jpc->channel) && !JNL_FILE_SWITCHED(jpc)) || we_are_last_writer && (0 != cnl->jnl_file.u.inode)) && ok_to_write_pfin) { /* We need to close the journal file cleanly if we have the latest generation journal file open * or if we are the last writer and the journal file is open in shared memory (not necessarily * by ourselves e.g. the only process that opened the journal got shot abnormally) * Note: we should not infer anything from the shared memory value of cnl->jnl_file.u.inode * if we are not the last writer as it can be concurrently updated. */ if (!was_crit) grab_crit(reg); if (JNL_ENABLED(csd)) { SET_GBL_JREC_TIME; /* jnl_ensure_open/jnl_put_jrt_pini/pfin/jnl_file_close all need it */ /* Before writing to jnlfile, adjust jgbl.gbl_jrec_time if needed to maintain time order * of jnl records. This needs to be done BEFORE the jnl_ensure_open as that could write * journal records (if it decides to switch to a new journal file). */ ADJUST_GBL_JREC_TIME(jgbl, jbp); jnl_status = jnl_ensure_open(); if (0 == jnl_status) { /* If we_are_last_writer, we would have already done a wcs_flu() which would * have written an epoch record and we are guaranteed no further updates * since we are the last writer. So, just close the journal. * If the freeaddr == post_epoch_freeaddr, wcs_flu may have skipped writing * a pini, so allow for that. */ assert(!jbp->before_images || is_mm || !we_are_last_writer || (0 != jpc->pini_addr) || jgbl.mur_extract || (jpc->jnl_buff->freeaddr == jpc->jnl_buff->post_epoch_freeaddr)); /* If we haven't written a pini, let jnl_file_close write the pini/pfin. 
*/ if (!jgbl.mur_extract && (0 != jpc->pini_addr)) jnl_put_jrt_pfin(csa); /* If not the last writer and no pending flush timer left, do jnl flush now */ if (!we_are_last_writer && (0 > cnl->wcs_timers)) { if (SS_NORMAL == (jnl_status = jnl_flush(reg))) { assert(jbp->freeaddr == jbp->dskaddr); jnl_fsync(reg, jbp->dskaddr); assert(jbp->fsync_dskaddr == jbp->dskaddr); } else { send_msg_csa(CSA_ARG(csa) VARLSTCNT(9) ERR_JNLFLUSH, 2, JNL_LEN_STR(csd), ERR_TEXT, 2, RTS_ERROR_TEXT("Error with journal flush in gds_rundown"), jnl_status); assert(NOJNL == jpc->channel);/* jnl file lost has been triggered */ /* In this routine, all code that follows from here on does not * assume anything about the journaling characteristics of this * database so it is safe to continue execution even though * journaling got closed in the middle. */ } } jnl_file_close(reg, we_are_last_writer, FALSE); } else send_msg_csa(CSA_ARG(csa) VARLSTCNT(6) jnl_status, 4, JNL_LEN_STR(csd), DB_LEN_STR(reg)); } if (!was_crit) rel_crit(reg); } } if (we_are_last_writer) /* Flush the fileheader last and harden the file to disk */ { if (!was_crit) grab_crit(reg); /* To satisfy crit requirement in fileheader_sync() */ memset(csd->machine_name, 0, MAX_MCNAMELEN); /* clear the machine_name field */ if (!have_standalone_access && we_are_last_user) { /* mupip_exit_handler will do this after mur_close_file */ csd->semid = INVALID_SEMID; csd->shmid = INVALID_SHMID; csd->gt_sem_ctime.ctime = 0; csd->gt_shm_ctime.ctime = 0; } fileheader_sync(reg); if (!was_crit) rel_crit(reg); if (!is_mm) { GTM_DB_FSYNC(csa, udi->fd, rc); /* Sync it all */ if (-1 == rc) { rts_error_csa(CSA_ARG(csa) VARLSTCNT(9) ERR_DBFILERR, 2, DB_LEN_STR(reg), ERR_TEXT, 2, RTS_ERROR_TEXT("Error during file sync at close"), errno); } } else { /* Now do final MM file sync before exit */ assert(csa->ti->total_blks == csa->total_blks); #ifdef _AIX GTM_DB_FSYNC(csa, udi->fd, rc); if (-1 == rc) #else if (-1 == MSYNC((caddr_t)csa->db_addrs[0], (caddr_t)csa->db_addrs[1])) #endif { rts_error_csa(CSA_ARG(csa) VARLSTCNT(9) ERR_DBFILERR, 2, DB_LEN_STR(reg), ERR_TEXT, 2, RTS_ERROR_TEXT("Error during file sync at close"), errno); } } } else if (unsafe_last_writer && !cnl->lastwriterbypas_msg_issued) { send_msg_csa(CSA_ARG(csa) VARLSTCNT(4) ERR_LASTWRITERBYPAS, 2, DB_LEN_STR(reg)); cnl->lastwriterbypas_msg_issued = TRUE; } } /* end if (!reg->read_only && !cnl->donotflush_dbjnl) */ /* We had canceled all db timers at start of rundown. In case as part of rundown (wcs_flu above), we had started * any timers, cancel them BEFORE setting reg->open to FALSE (assert in wcs_clean_dbsync relies on this). */ CANCEL_DB_TIMERS(reg, csa, canceled_flush_timer, canceled_dbsync_timer); if (reg->read_only && we_are_last_user && !have_standalone_access && cnl->remove_shm) { /* mupip_exit_handler will do this after mur_close_file */ db_ipcs.semid = INVALID_SEMID; db_ipcs.shmid = INVALID_SHMID; db_ipcs.gt_sem_ctime = 0; db_ipcs.gt_shm_ctime = 0; db_ipcs.fn_len = reg->dyn.addr->fname_len; memcpy(db_ipcs.fn, reg->dyn.addr->fname, reg->dyn.addr->fname_len); db_ipcs.fn[reg->dyn.addr->fname_len] = 0; /* request gtmsecshr to flush. 
read_only cannot flush itself */ WAIT_FOR_REPL_INST_UNFREEZE_SAFE(csa); if (!csa->read_only_fs) { secshrstat = send_mesg2gtmsecshr(FLUSH_DB_IPCS_INFO, 0, (char *)NULL, 0); if (0 != secshrstat) rts_error_csa(CSA_ARG(csa) VARLSTCNT(8) ERR_DBFILERR, 2, DB_LEN_STR(reg), ERR_TEXT, 2, RTS_ERROR_TEXT("gtmsecshr failed to update database file header")); } } /* Done with file now, close it */ CLOSEFILE_RESET(udi->fd, rc); /* resets "udi->fd" to FD_INVALID */ if (-1 == rc) { rts_error_csa(CSA_ARG(csa) VARLSTCNT(9) ERR_DBFILERR, 2, DB_LEN_STR(reg), ERR_TEXT, 2, LEN_AND_LIT("Error during file close"), errno); } /* Unmap storage if mm mode but only the part that is not the fileheader (so shows up in dumps) */ # if !defined(_AIX) if (is_mm && (NULL != csa->db_addrs[0])) { assert(csa->db_addrs[1] > csa->db_addrs[0]); munmap_len = (sm_long_t)(csa->db_addrs[1] - csa->db_addrs[0]); if (0 < munmap_len) munmap((caddr_t)(csa->db_addrs[0]), (size_t)(munmap_len)); } # endif /* Detach our shared memory while still under lock so reference counts will be correct for the next process to run down * this region. In the process also get the remove_shm status from node_local before detaching. * If cnl->donotflush_dbjnl is TRUE, it means we can safely remove shared memory without compromising data * integrity as a reissue of recover will restore the database to a consistent state. */ remove_shm = !vermismatch && (cnl->remove_shm || cnl->donotflush_dbjnl); /* We are done with online rollback on this region. Indicate to other processes by setting the onln_rlbk_pid to 0. * Do it before releasing crit (t_end relies on this ordering when accessing cnl->onln_rlbk_pid). */ if (jgbl.onlnrlbk) cnl->onln_rlbk_pid = 0; rel_crit(reg); /* Since we are about to detach from the shared memory, release crit and reset onln_rlbk_pid */ /* If we had skipped flushing journal and database buffers due to a concurrent online rollback, increment the counter * indicating that in the shared memory so that online rollback can report the # of such processes when it shuts down. * The same thing is done for both FTOK and access control semaphores when there are too many MUMPS processes. */ if (safe_mode) /* indicates flushing was skipped */ { if (bypassed_access) cnl->dbrndwn_access_skip++; /* Access semaphore can be bypassed during online rollback */ if (bypassed_ftok) cnl->dbrndwn_ftok_skip++; } if (jgbl.onlnrlbk) csa->hold_onto_crit = FALSE; GTM_WHITE_BOX_TEST(WBTEST_HOLD_SEM_BYPASS, cnl->wbox_test_seq_num, 0); status = shmdt((caddr_t)cnl); csa->nl = NULL; /* dereferencing nl after detach is not right, so we set it to NULL so that we can test before dereference*/ /* Note that although csa->nl is NULL, we use CSA_ARG(csa) below (not CSA_ARG(NULL)) to be consistent with similar * usages before csa->nl became NULL. The "is_anticipatory_freeze_needed" function (which is in turn called by the * CHECK_IF_FREEZE_ON_ERROR_NEEDED macro) does a check of csa->nl before dereferencing shared memory contents so * we are safe passing "csa". 
*/ if (-1 == status) send_msg_csa(CSA_ARG(csa) VARLSTCNT(9) ERR_DBFILERR, 2, DB_LEN_STR(reg), ERR_TEXT, 2, LEN_AND_LIT("Error during shmdt"), errno); REMOVE_CSA_FROM_CSADDRSLIST(csa); /* remove "csa" from list of open regions (cs_addrs_list) */ reg->open = FALSE; /* If file is still not in good shape, die here and now before we get rid of our storage */ assertpro(0 == csa->wbuf_dqd); ipc_deleted = FALSE; /* If we are the very last user, remove shared storage id and the semaphores */ if (we_are_last_user) { /* remove shared storage, only if last writer to rundown did a successful wcs_flu() */ assert(!vermismatch); if (remove_shm) { ipc_deleted = TRUE; if (0 != shm_rmid(udi->shmid)) rts_error_csa(CSA_ARG(csa) VARLSTCNT(8) ERR_DBFILERR, 2, DB_LEN_STR(reg), ERR_TEXT, 2, RTS_ERROR_TEXT("Unable to remove shared memory")); /* Note that we no longer have a new shared memory. Currently only used/usable for standalone rollback. */ udi->new_shm = FALSE; /* mupip recover/rollback don't release the semaphore here, but do it later in db_ipcs_reset (invoked from * mur_close_files()) */ if (!have_standalone_access) { if (0 != sem_rmid(udi->semid)) rts_error_csa(CSA_ARG(csa) VARLSTCNT(8) ERR_DBFILERR, 2, DB_LEN_STR(reg), ERR_TEXT, 2, RTS_ERROR_TEXT("Unable to remove semaphore")); udi->new_sem = FALSE; /* Note that we no longer have a new semaphore */ udi->grabbed_access_sem = FALSE; udi->counter_acc_incremented = FALSE; } } else if (is_src_server || is_updproc) { gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(6) ERR_DBRNDWNWRN, 4, DB_LEN_STR(reg), process_id, process_id); send_msg_csa(CSA_ARG(csa) VARLSTCNT(6) ERR_DBRNDWNWRN, 4, DB_LEN_STR(reg), process_id, process_id); } else send_msg_csa(CSA_ARG(csa) VARLSTCNT(6) ERR_DBRNDWNWRN, 4, DB_LEN_STR(reg), process_id, process_id); } else { assert(!have_standalone_access || jgbl.onlnrlbk || safe_mode); if (!jgbl.onlnrlbk && !have_standalone_access) { /* If we were writing, get rid of our writer access count semaphore */ if (!reg->read_only) { if (!access_counter_halted) { save_errno = do_semop(udi->semid, DB_COUNTER_SEM, -DB_COUNTER_SEM_INCR, SEM_UNDO); if (0 != save_errno) rts_error_csa(CSA_ARG(csa) VARLSTCNT(12) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg), ERR_SYSCALL, 5, RTS_ERROR_TEXT("gds_rundown access control semaphore decrement"), CALLFROM, save_errno); } udi->counter_acc_incremented = FALSE; } assert(safe_mode || !bypassed_access); /* Now remove the rundown lock */ if (!bypassed_access) { if (0 != (save_errno = do_semop(udi->semid, DB_CONTROL_SEM, -1, SEM_UNDO))) rts_error_csa(CSA_ARG(csa) VARLSTCNT(12) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg), ERR_SYSCALL, 5, RTS_ERROR_TEXT("gds_rundown access control semaphore release"), CALLFROM, save_errno); udi->grabbed_access_sem = FALSE; } } /* else access control semaphore will be released in db_ipcs_reset */ } if (!have_standalone_access) { if (bypassed_ftok) { if (!ftok_counter_halted) if (0 != (save_errno = do_semop(udi->ftok_semid, DB_COUNTER_SEM, -DB_COUNTER_SEM_INCR, SEM_UNDO))) rts_error_csa(CSA_ARG(csa) VARLSTCNT(4) ERR_DBFILERR, 2, DB_LEN_STR(reg)); } else if (!ftok_sem_release(reg, !ftok_counter_halted, FALSE)) { FTOK_TRACE(csa, csa->ti->curr_tn, ftok_ops_release, process_id); rts_error_csa(CSA_ARG(csa) VARLSTCNT(4) ERR_DBFILERR, 2, DB_LEN_STR(reg)); } udi->grabbed_ftok_sem = FALSE; udi->counter_ftok_incremented = FALSE; } ENABLE_INTERRUPTS(INTRPT_IN_GDS_RUNDOWN, prev_intrpt_state); if (!ipc_deleted) { GET_CUR_TIME(time_str); if (is_src_server) gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(8) ERR_IPCNOTDEL, 6, CTIME_BEFORE_NL, 
time_str, LEN_AND_LIT("Source server"), REG_LEN_STR(reg)); if (is_updproc) gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(8) ERR_IPCNOTDEL, 6, CTIME_BEFORE_NL, time_str, LEN_AND_LIT("Update process"), REG_LEN_STR(reg)); if (mupip_jnl_recover && (!jgbl.onlnrlbk || !we_are_last_user)) { gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(8) ERR_IPCNOTDEL, 6, CTIME_BEFORE_NL, time_str, LEN_AND_LIT("Mupip journal process"), REG_LEN_STR(reg)); send_msg_csa(CSA_ARG(csa) VARLSTCNT(8) ERR_IPCNOTDEL, 6, CTIME_BEFORE_NL, time_str, LEN_AND_LIT("Mupip journal process"), REG_LEN_STR(reg)); } } REVERT; return EXIT_NRM; }
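/* Editor's note: illustrative sketch only, not the gds_rundown() code path itself. It shows the SysV
 * semaphore idiom used above for the access control semaphore: a two-element sembuf array that waits for the
 * semaphore to reach zero and then increments it, attempted first with IPC_NOWAIT and, if that fails with
 * EAGAIN, retried in blocking mode. SEM_UNDO makes the kernel release the increment if the process dies
 * without running down. demo_grab_control_sem is a hypothetical helper name. */
#include <errno.h>
#include <sys/ipc.h>
#include <sys/sem.h>
#include <sys/types.h>

static int demo_grab_control_sem(int semid, unsigned short semnum)
{
	struct sembuf	sop[2];

	sop[0].sem_num = semnum;
	sop[0].sem_op = 0;				/* wait for the semaphore value to be 0 */
	sop[1].sem_num = semnum;
	sop[1].sem_op = 1;				/* then take it by incrementing */
	sop[0].sem_flg = sop[1].sem_flg = SEM_UNDO | IPC_NOWAIT;
	if (0 == semop(semid, sop, 2))
		return 0;				/* got it without waiting */
	if (EAGAIN != errno)
		return -1;				/* real error; caller reports it */
	sop[0].sem_flg = sop[1].sem_flg = SEM_UNDO;	/* try again, blocking this time */
	return semop(semid, sop, 2);
}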
void mlk_wake_pending(mlk_ctldata_ptr_t ctl, mlk_shrblk_ptr_t d, gd_region *reg) { mlk_prcblk_ptr_t next, pr; sm_uint_ptr_t empty_slot, ctop; sgmnt_addrs *csa; boolean_t remote_pid; int crit_wake_res; /* also used in macro DO_CRIT_WAKE */ int lcnt; csa = &FILE_INFO(reg)->s_addrs; if (!d->pending) return; ctl->wakeups++; /* Before updating d->sequence ensure there is no process owning this lock, since otherwise when the owner process attempts * to release the lock it will fail as its private copy of "p->sequence" will not match the shared memory "d->sequence". */ assert(!d->owner); d->sequence = csa->hdr->trans_hist.lock_sequence++; /* This node is being awakened (GTCM) */ BG_TRACE_PRO_ANY(csa, mlock_wakeups); /* Record halted slumbers */ if (reg->dyn.addr->acc_meth == dba_bg && csa->hdr->clustered) { remote_pid = FALSE; for (empty_slot = ctl->clus_pids, ctop = &ctl->clus_pids[NUM_CLST_LCKS-1]; *empty_slot && empty_slot <= ctop; empty_slot++) ; for (pr = (mlk_prcblk_ptr_t)R2A(d->pending), lcnt = csa->hdr->lock_space_size / PRC_FACTOR; lcnt; lcnt--) { next = (pr->next) ? (mlk_prcblk_ptr_t)R2A(pr->next) : 0; /* in case it's deleted */ if ((pr->process_id & NODENUMBER) == (process_id & NODENUMBER)) { DO_CRIT_WAKE; } else if (empty_slot <= ctop) { remote_pid = TRUE; *empty_slot = pr->process_id; empty_slot++; } if (next) pr = next; else break; } if (remote_pid) ccp_cluster_lock_wake(reg); } else { for (pr = (mlk_prcblk_ptr_t)R2A(d->pending), lcnt = csa->hdr->lock_space_size / PRC_FACTOR; lcnt; lcnt--) { next = (pr->next) ? (mlk_prcblk_ptr_t)R2A(pr->next) : 0; /* in case it's deleted */ DO_CRIT_WAKE; /* Wake one process to keep things orderly; if it loses its way, others * will jump in after a timeout */ if (GONE == crit_wake_res && next) pr = next; else break; } } if (!lcnt) GTMASSERT; return; }
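/* Editor's note: illustrative sketch only. mlk_wake_pending() above captures pr->next *before* waking the
 * process at pr, because the wake-up can cause that prcblk to be deleted; it also bounds the walk by an
 * iteration count and treats exhausting the bound as a fatal inconsistency. The plain-pointer version below
 * shows the same shape; demo_prcblk, demo_wake, demo_wake_pending and DEMO_MAX_WALK are hypothetical. */
#include <assert.h>
#include <stddef.h>

#define DEMO_MAX_WALK	1024

typedef struct demo_prcblk_struct
{
	struct demo_prcblk_struct	*next;
	int				process_id;
} demo_prcblk;

static void demo_wake(demo_prcblk *pr)
{
	(void)pr;	/* a real implementation would signal pr->process_id */
}

static void demo_wake_pending(demo_prcblk *head)
{
	demo_prcblk	*pr, *next;
	int		lcnt;

	for (pr = head, lcnt = DEMO_MAX_WALK; (NULL != pr) && lcnt; lcnt--)
	{
		next = pr->next;	/* save first: waking pr may unlink or free it */
		demo_wake(pr);
		pr = next;
	}
	assert(NULL == pr);		/* running out of iterations means a corrupt queue */
}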
void mu_int_reg(gd_region *reg, boolean_t *return_value) { boolean_t read_only, was_crit; freeze_status status; node_local_ptr_t cnl; sgmnt_addrs *csa; sgmnt_data_ptr_t csd; # ifdef DEBUG boolean_t need_to_wait = FALSE; int trynum; uint4 curr_wbox_seq_num; # endif sgmnt_data *csd_copy_ptr; gd_segment *seg; int gtmcrypt_errno; *return_value = FALSE; UNIX_ONLY(jnlpool_init_needed = TRUE); ESTABLISH(mu_int_reg_ch); if (dba_usr == reg->dyn.addr->acc_meth) { util_out_print("!/Can't integ region !AD; not GDS format", TRUE, REG_LEN_STR(reg)); mu_int_skipreg_cnt++; return; } gv_cur_region = reg; if (reg_cmcheck(reg)) { util_out_print("!/Can't integ region across network", TRUE); mu_int_skipreg_cnt++; return; } gvcst_init(gv_cur_region); if (gv_cur_region->was_open) { /* already open under another name */ gv_cur_region->open = FALSE; return; } change_reg(); csa = &FILE_INFO(gv_cur_region)->s_addrs; cnl = csa->nl; csd = csa->hdr; read_only = gv_cur_region->read_only; assert(NULL != mu_int_master); /* Ensure that we don't see an increase in the file header and master map size compared to it's maximum values */ assert(SGMNT_HDR_LEN >= SIZEOF(sgmnt_data) && (MASTER_MAP_SIZE_MAX >= MASTER_MAP_SIZE(csd))); /* ONLINE INTEG if asked for explicitly by specifying -ONLINE is an error if the db has partial V4 blocks. * However, if -ONLINE is not explicitly specified but rather assumed implicitly (as default for -REG) * then turn off ONLINE INTEG for this region and continue as if -NOONLINE was specified */ # ifdef GTM_SNAPSHOT if (!csd->fully_upgraded) { ointeg_this_reg = FALSE; /* Turn off ONLINE INTEG for this region */ if (online_specified) { gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(4) ERR_SSV4NOALLOW, 2, DB_LEN_STR(gv_cur_region)); util_out_print(NO_ONLINE_ERR_MSG, TRUE); mu_int_skipreg_cnt++; return; } } # endif if (!ointeg_this_reg || read_only) { status = region_freeze(gv_cur_region, TRUE, FALSE, TRUE); switch (status) { case REG_ALREADY_FROZEN: UNIX_ONLY(if (csa->read_only_fs) break); util_out_print("!/Database for region !AD is already frozen, not integing", TRUE, REG_LEN_STR(gv_cur_region)); mu_int_skipreg_cnt++; return; case REG_HAS_KIP: /* We have already waited for KIP to reset. This time do not wait for KIP */ status = region_freeze(gv_cur_region, TRUE, FALSE, FALSE); if (REG_ALREADY_FROZEN == status) { UNIX_ONLY(if (csa->read_only_fs) break); util_out_print("!/Database for region !AD is already frozen, not integing", TRUE, REG_LEN_STR(gv_cur_region)); mu_int_skipreg_cnt++; return; } break; case REG_FREEZE_SUCCESS: break; default: assert(FALSE); }
uint4 jnl_file_open(gd_region *reg, bool init, void *dummy) /* third argument for compatibility with VMS version */ { sgmnt_addrs *csa; sgmnt_data_ptr_t csd; jnl_private_control *jpc; jnl_buffer_ptr_t jb; struct stat stat_buf; uint4 sts; sm_uc_ptr_t nameptr; int fstat_res; int close_res; boolean_t retry; ZOS_ONLY(int realfiletag;) csa = &FILE_INFO(reg)->s_addrs; csd = csa->hdr; jpc = csa->jnl; jb = jpc->jnl_buff; assert(NOJNL == jpc->channel); sts = 0; jpc->status = jpc->status2 = SS_NORMAL; nameptr = csd->jnl_file_name; assert('/' == csd->jnl_file_name[0]); if (init) { assert(csd->jnl_file_len < JNL_NAME_SIZE); nameptr[csd->jnl_file_len] = 0; cre_jnl_file_intrpt_rename(((int)csd->jnl_file_len), csd->jnl_file_name); /* although jnl_file_close() would have reset jnl_file.u.inode and device to 0 and incremented cycle, it * might have got shot in the middle of executing those instructions. we redo it here just to be safe.
sm_uc_ptr_t t_qread(block_id blk, sm_int_ptr_t cycle, cache_rec_ptr_ptr_t cr_out) /* cycle is used in t_end to detect if the buffer has been refreshed since the t_qread */ { int4 status; uint4 blocking_pid; cache_rec_ptr_t cr; bt_rec_ptr_t bt; boolean_t clustered, hold_onto_crit, was_crit; int dummy, lcnt, ocnt; cw_set_element *cse; off_chain chain1; register sgmnt_addrs *csa; register sgmnt_data_ptr_t csd; enum db_ver ondsk_blkver; int4 dummy_errno; boolean_t already_built, is_mm, reset_first_tp_srch_status, set_wc_blocked, sleep_invoked; ht_ent_int4 *tabent; srch_blk_status *blkhist; trans_num dirty, blkhdrtn; sm_uc_ptr_t buffaddr; uint4 stuck_cnt = 0; boolean_t lcl_blk_free; node_local_ptr_t cnl; lcl_blk_free = block_is_free; block_is_free = FALSE; /* Reset to FALSE so that if t_qread fails below, we don't have an incorrect state of this var */ first_tp_srch_status = NULL; reset_first_tp_srch_status = FALSE; csa = cs_addrs; csd = csa->hdr; INCR_DB_CSH_COUNTER(csa, n_t_qreads, 1); is_mm = (dba_mm == csd->acc_meth); /* We better hold crit in the final retry (TP & non-TP). Only exception is journal recovery */ assert((t_tries < CDB_STAGNATE) || csa->now_crit || mupip_jnl_recover); if (dollar_tlevel) { assert(sgm_info_ptr); if (0 != sgm_info_ptr->cw_set_depth) { chain1 = *(off_chain *)&blk; if (1 == chain1.flag) { assert(sgm_info_ptr->cw_set_depth); if ((int)chain1.cw_index < sgm_info_ptr->cw_set_depth) tp_get_cw(sgm_info_ptr->first_cw_set, (int)chain1.cw_index, &cse); else { assert(FALSE == csa->now_crit); rdfail_detail = cdb_sc_blknumerr; return (sm_uc_ptr_t)NULL; } } else { if (NULL != (tabent = lookup_hashtab_int4(sgm_info_ptr->blks_in_use, (uint4 *)&blk))) first_tp_srch_status = tabent->value; else first_tp_srch_status = NULL; ASSERT_IS_WITHIN_TP_HIST_ARRAY_BOUNDS(first_tp_srch_status, sgm_info_ptr); cse = first_tp_srch_status ? first_tp_srch_status->cse : NULL; } assert(!cse || !cse->high_tlevel); assert(!chain1.flag || cse); if (cse) { /* transaction has modified the sought after block */ if ((gds_t_committed != cse->mode) || (n_gds_t_op < cse->old_mode)) { /* Changes have not been committed to shared memory, i.e. still in private memory. * Build block in private buffer if not already done and return the same. */ assert(gds_t_writemap != cse->mode); if (FALSE == cse->done) { /* out of date, so make it current */ assert(gds_t_committed != cse->mode); already_built = (NULL != cse->new_buff); /* Validate the block's search history right after building a private copy. * This is not needed in case gvcst_search is going to reuse the clue's search * history and return (because tp_hist will do the validation of this block). * But if gvcst_search decides to do a fresh traversal (because the clue does not * cover the path of the current input key etc.) the block build that happened now * will not get validated in tp_hist since it will instead be given the current * key's search history path (a totally new path) for validation. Since a private * copy of the block has been built, tp_tend would also skip validating this block * so it is necessary that we validate the block right here. Since it is tricky to * accurately differentiate between the two cases, we do the validation * unconditionally here (besides it is only a few if checks done per block build * so it is considered okay performance-wise). 
*/ gvcst_blk_build(cse, (uchar_ptr_t)cse->new_buff, 0); assert(NULL != cse->blk_target); if (!already_built && !chain1.flag) { buffaddr = first_tp_srch_status->buffaddr; cr = first_tp_srch_status->cr; assert((is_mm || cr) && buffaddr); blkhdrtn = ((blk_hdr_ptr_t)buffaddr)->tn; if (TP_IS_CDB_SC_BLKMOD3(cr, first_tp_srch_status, blkhdrtn)) { assert(CDB_STAGNATE > t_tries); rdfail_detail = cdb_sc_blkmod; /* should this be something else */ TP_TRACE_HIST_MOD(blk, gv_target, tp_blkmod_t_qread, cs_data, first_tp_srch_status->tn, blkhdrtn, ((blk_hdr_ptr_t)buffaddr)->levl); return (sm_uc_ptr_t)NULL; } if (!is_mm && ((first_tp_srch_status->cycle != cr->cycle) || (first_tp_srch_status->blk_num != cr->blk))) { assert(CDB_STAGNATE > t_tries); rdfail_detail = cdb_sc_lostcr; /* should this be something else */ return (sm_uc_ptr_t)NULL; } } cse->done = TRUE; } *cycle = CYCLE_PVT_COPY; *cr_out = 0; return (sm_uc_ptr_t)cse->new_buff; } else { /* Block changes are already committed to shared memory (possible if we are in TP * in the 2nd phase of M-Kill in gvcst_expand_free_subtree.c). In this case, read * block from shared memory; do not look at private memory (i.e. cse) as that might * not be as uptodate as shared memory. */ assert(csa->now_crit); /* gvcst_expand_free_subtree does t_qread in crit */ /* If this block was newly created as part of the TP transaction, it should not be killed * as part of the 2nd phase of M-kill. This is because otherwise the block's cse would * have had an old_mode of kill_t_create in which case we would not have come into this * else block. Assert accordingly. */ assert(!chain1.flag); first_tp_srch_status = NULL; /* do not use any previous srch_hist information */ } } } else { if (NULL != (tabent = lookup_hashtab_int4(sgm_info_ptr->blks_in_use, (uint4 *)&blk))) first_tp_srch_status = tabent->value; else first_tp_srch_status = NULL; } ASSERT_IS_WITHIN_TP_HIST_ARRAY_BOUNDS(first_tp_srch_status, sgm_info_ptr); if (!is_mm && first_tp_srch_status) { cr = first_tp_srch_status->cr; assert(cr && !first_tp_srch_status->cse); if (first_tp_srch_status->cycle == cr->cycle) { *cycle = first_tp_srch_status->cycle; *cr_out = cr; cr->refer = TRUE; if (CDB_STAGNATE <= t_tries) /* mu_reorg doesn't use TP else should have an || for that */ CWS_INSERT(blk); return (sm_uc_ptr_t)first_tp_srch_status->buffaddr; } else { /* Block was already part of the read-set of this transaction, but got recycled in the cache. * Allow block recycling by resetting first_tp_srch_status for this blk to reflect the new * buffer, cycle and cache-record. tp_hist (invoked much later) has validation checks to detect * if block recycling happened within the same mini-action and restart in that case. * Updating first_tp_srch_status has to wait until the end of t_qread since only then do we know * the values to update to. Set a variable that will enable the updation before returning. * Also assert that if we are in the final retry, we are never in a situation where we have a * block that got recycled since the start of the current mini-action. This is easily detected since * as part of the final retry we maintain a hash-table "cw_stagnate" that holds the blocks that * have been read as part of the current mini-action until now. 
*/ assert(CDB_STAGNATE > t_tries || (NULL == lookup_hashtab_int4(&cw_stagnate, (uint4 *)&blk))); reset_first_tp_srch_status = TRUE; } } } if ((blk >= csa->ti->total_blks) || (blk < 0)) { /* requested block out of range; could occur because of a concurrency conflict */ if ((&FILE_INFO(gv_cur_region)->s_addrs != csa) || (csd != cs_data)) GTMASSERT; assert(FALSE == csa->now_crit); rdfail_detail = cdb_sc_blknumerr; return (sm_uc_ptr_t)NULL; } if (is_mm) { *cycle = CYCLE_SHRD_COPY; *cr_out = 0; return (sm_uc_ptr_t)(mm_read(blk)); } # ifdef GTM_CRYPT /* If database is encrypted, check if encryption initialization went fine for this database. If not, * do not let process proceed as it could now potentially get a peek at the desired data from the * decrypted shared memory global buffers (read in from disk by other processes) without having to go to disk. * If DSE, allow for a special case where it is trying to dump a local bitmap block. In this case, DSE * can continue to run fine (even if encryption initialization failed) since bitmap blocks are unencrypted. */ if (csa->encrypt_init_status && (!dse_running || !IS_BITMAP_BLK(blk))) GC_RTS_ERROR(csa->encrypt_init_status, gv_cur_region->dyn.addr->fname); # endif assert(dba_bg == csd->acc_meth); assert(!first_tp_srch_status || !first_tp_srch_status->cr || first_tp_srch_status->cycle != first_tp_srch_status->cr->cycle); if (FALSE == (clustered = csd->clustered)) bt = NULL; was_crit = csa->now_crit; ocnt = 0; cnl = csa->nl; set_wc_blocked = FALSE; /* to indicate whether cnl->wc_blocked was set to TRUE by us */ hold_onto_crit = csa->hold_onto_crit; /* note down in local to avoid csa-> dereference in multiple usages below */ do { if (NULL == (cr = db_csh_get(blk))) { /* not in memory */ if (clustered && (NULL != (bt = bt_get(blk))) && (FALSE == bt->flushing)) bt = NULL; if (!csa->now_crit) { assert(!hold_onto_crit); if (NULL != bt) { /* at this point, bt is not NULL only if clustered and flushing - wait no crit */ assert(clustered); wait_for_block_flush(bt, blk); /* try for no other node currently writing the block */ } if ((csd->flush_trigger <= cnl->wcs_active_lvl) && (FALSE == gv_cur_region->read_only)) JNL_ENSURE_OPEN_WCS_WTSTART(csa, gv_cur_region, 0, dummy_errno); /* a macro that dclast's "wcs_wtstart" and checks for errors etc. */ grab_crit(gv_cur_region); cr = db_csh_get(blk); /* in case blk arrived before crit */ } if (clustered && (NULL != (bt = bt_get(blk))) && (TRUE == bt->flushing)) { /* Once crit, need to assure that if clustered, that flushing is [still] complete * If it isn't, we missed an entire WM cycle and have to wait for another node to finish */ wait_for_block_flush(bt, blk); /* ensure no other node currently writing the block */ } if (NULL == cr) { /* really not in memory - must get a new buffer */ assert(csa->now_crit); cr = db_csh_getn(blk); if (CR_NOTVALID == (sm_long_t)cr) { assert(cnl->wc_blocked); /* only reason we currently know wcs_get_space could fail */ assert(gtm_white_box_test_case_enabled); SET_TRACEABLE_VAR(cnl->wc_blocked, TRUE); BG_TRACE_PRO_ANY(csa, wc_blocked_t_qread_db_csh_getn_invalid_blk); set_wc_blocked = TRUE; break; } assert(0 <= cr->read_in_progress); *cycle = cr->cycle; cr->tn = csd->trans_hist.curr_tn; /* Record history of most recent disk reads only in dbg builds for now. Although the macro * is just a couple dozen instructions, it is done while holding crit so we want to avoid * delaying crit unless really necessary. 
Whoever wants this information can enable it * by a build change to remove the DEBUG_ONLY part below. */ DEBUG_ONLY(DSKREAD_TRACE(csa, GDS_ANY_ABS2REL(csa,cr), cr->tn, process_id, blk, cr->cycle);) if (!was_crit && !hold_onto_crit) rel_crit(gv_cur_region); /* read outside of crit may be of a stale block but should be detected by t_end or tp_tend */ assert(0 == cr->dirty); assert(cr->read_in_progress >= 0); CR_BUFFER_CHECK(gv_cur_region, csa, csd, cr); if (SS_NORMAL != (status = dsk_read(blk, GDS_REL2ABS(cr->buffaddr), &ondsk_blkver, lcl_blk_free))) { /* buffer does not contain valid data, so reset blk to be empty */ cr->cycle++; /* increment cycle for blk number changes (for tp_hist and others) */ cr->blk = CR_BLKEMPTY; cr->r_epid = 0; RELEASE_BUFF_READ_LOCK(cr); assert(-1 <= cr->read_in_progress); assert(was_crit == csa->now_crit); if (FUTURE_READ == status) { /* in cluster, block can be in the "future" with respect to the local history */ assert(TRUE == clustered); assert(FALSE == csa->now_crit); rdfail_detail = cdb_sc_future_read; /* t_retry forces the history up to date */ return (sm_uc_ptr_t)NULL; } if (ERR_DYNUPGRDFAIL == status) { /* if we dont hold crit on the region, it is possible due to concurrency conflicts * that this block is unused (i.e. marked free/recycled in bitmap, see comments in * gds_blk_upgrade.h). in this case we should not error out but instead restart. */ if (was_crit) { assert(FALSE); rts_error(VARLSTCNT(5) status, 3, blk, DB_LEN_STR(gv_cur_region)); } else { rdfail_detail = cdb_sc_lostcr; return (sm_uc_ptr_t)NULL; } } if (-1 == status) { /* could have been concurrent truncate, and we read a blk >= csa->ti->total_blks */ /* restart */ rdfail_detail = cdb_sc_truncate; return (sm_uc_ptr_t)NULL; } else rts_error(VARLSTCNT(5) ERR_DBFILERR, 2, DB_LEN_STR(gv_cur_region), status); } disk_blk_read = TRUE; assert(0 <= cr->read_in_progress); assert(0 == cr->dirty); /* Only set in cache if read was success */ cr->ondsk_blkver = (lcl_blk_free ? GDSVCURR : ondsk_blkver); cr->r_epid = 0; RELEASE_BUFF_READ_LOCK(cr); assert(-1 <= cr->read_in_progress); *cr_out = cr; assert(was_crit == csa->now_crit); if (reset_first_tp_srch_status) { /* keep the parantheses for the if (although single line) since the following is a macro */ RESET_FIRST_TP_SRCH_STATUS(first_tp_srch_status, cr, *cycle); } return (sm_uc_ptr_t)GDS_REL2ABS(cr->buffaddr); } else if (!was_crit && (BAD_LUCK_ABOUNDS > ocnt)) { assert(!hold_onto_crit); assert(TRUE == csa->now_crit); assert(cnl->in_crit == process_id); rel_crit(gv_cur_region); } }
int gtmsource_checkhealth(void) { uint4 gtmsource_pid; int status, semval, save_errno; boolean_t srv_alive, all_files_open; gtmsource_local_ptr_t gtmsourcelocal_ptr; int4 index, num_servers; seq_num reg_seqno, jnlseqno; gd_region *reg, *region_top; sgmnt_addrs *csa; sgmnt_data_ptr_t csd; char errtxt[OUT_BUFF_SIZE]; char *modestr; assert(holds_sem[SOURCE][JNL_POOL_ACCESS_SEM]); if (NULL != jnlpool.gtmsource_local) /* Check health of a specific source server */ gtmsourcelocal_ptr = jnlpool.gtmsource_local; else gtmsourcelocal_ptr = &jnlpool.gtmsource_local_array[0]; num_servers = 0; status = SRV_ALIVE; for (index = 0; index < NUM_GTMSRC_LCL; index++, gtmsourcelocal_ptr++) { if ('\0' == gtmsourcelocal_ptr->secondary_instname[0]) { assert(NULL == jnlpool.gtmsource_local); continue; } gtmsource_pid = gtmsourcelocal_ptr->gtmsource_pid; /* If CHECKHEALTH on a specific secondary instance is requested, print the health information irrespective * of whether a source server for that instance is alive or not. For CHECKHEALTH on ALL secondary instances * print health information only for those instances that have an active or passive source server alive. */ if ((NULL == jnlpool.gtmsource_local) && (0 == gtmsource_pid)) continue; repl_log(stdout, TRUE, TRUE, "Initiating CHECKHEALTH operation on source server pid [%d] for secondary instance" " name [%s]\n", gtmsource_pid, gtmsourcelocal_ptr->secondary_instname); srv_alive = (0 == gtmsource_pid) ? FALSE : is_proc_alive(gtmsource_pid, 0); if (srv_alive) { if (GTMSOURCE_MODE_ACTIVE == gtmsourcelocal_ptr->mode) modestr = "ACTIVE"; else if (GTMSOURCE_MODE_ACTIVE_REQUESTED == gtmsourcelocal_ptr->mode) modestr = "ACTIVE REQUESTED"; else if (GTMSOURCE_MODE_PASSIVE == gtmsourcelocal_ptr->mode) modestr = "PASSIVE"; else if (GTMSOURCE_MODE_PASSIVE_REQUESTED == gtmsourcelocal_ptr->mode) modestr = "PASSIVE REQUESTED"; else { assert(gtmsourcelocal_ptr->mode != gtmsourcelocal_ptr->mode); modestr = "UNKNOWN"; } repl_log(stderr, FALSE, TRUE, FORMAT_STR1, gtmsource_pid, "Source server", "", modestr); status |= SRV_ALIVE; num_servers++; } else { repl_log(stderr, FALSE, TRUE, FORMAT_STR, gtmsource_pid, "Source server", " NOT"); gtm_putmsg_csa(CSA_ARG(NULL) VARLSTCNT(4) ERR_SRCSRVNOTEXIST, 2, LEN_AND_STR(gtmsourcelocal_ptr->secondary_instname)); status |= SRV_DEAD; } if (NULL != jnlpool.gtmsource_local) break; } if (NULL == jnlpool.gtmsource_local) { /* Compare number of servers that were found alive with the current value of the COUNT semaphore. * If they are not equal, report the discrepancy. */ semval = get_sem_info(SOURCE, SRC_SERV_COUNT_SEM, SEM_INFO_VAL); if (-1 == semval) { save_errno = errno; repl_log(stderr, FALSE, TRUE, "Error fetching source server count semaphore value : %s\n", STRERROR(save_errno)); status |= SRV_ERR; } else if (semval != num_servers) { repl_log(stderr, FALSE, FALSE, "Error : Expected %d source server(s) to be alive but found %d actually alive\n", semval, num_servers); repl_log(stderr, FALSE, TRUE, "Error : Check if any pid reported above is NOT a source server process\n"); status |= SRV_ERR; } } /* Check that there are no regions with replication state = WAS_ON (i.e. repl_was_open). If so report that. * But to determine that, we need to attach to all the database regions. */ gvinit(); /* We use the same code dse uses to open all regions but we must make sure they are all open before proceeding. 
*/ all_files_open = region_init(FALSE); if (!all_files_open) { gtm_putmsg_csa(CSA_ARG(NULL) VARLSTCNT(1) ERR_NOTALLDBOPN); status |= SRV_ERR; } else { for (reg = gd_header->regions, region_top = gd_header->regions + gd_header->n_regions; reg < region_top; reg++) { csa = &FILE_INFO(reg)->s_addrs; csd = csa->hdr; if (REPL_WAS_ENABLED(csd)) { assert(!JNL_ENABLED(csd) || REPL_ENABLED(csd)); /* || is for turning replication on concurrently */ reg_seqno = csd->reg_seqno; jnlseqno = (NULL != jnlpool.jnlpool_ctl) ? jnlpool.jnlpool_ctl->jnl_seqno : MAX_SEQNO; sgtm_putmsg(errtxt, VARLSTCNT(8) ERR_REPLJNLCLOSED, 6, DB_LEN_STR(reg), &reg_seqno, &reg_seqno, &jnlseqno, &jnlseqno); repl_log(stderr, FALSE, TRUE, errtxt); status |= SRV_ERR; } } } if (jnlpool.jnlpool_ctl->freeze) { repl_log(stderr, FALSE, FALSE, "Warning: Instance Freeze is ON\n"); repl_log(stderr, FALSE, TRUE, " Freeze Comment: %s\n", jnlpool.jnlpool_ctl->freeze_comment); status |= SRV_ERR; } return (status + NORMAL_SHUTDOWN); }
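/* Illustrative sketch, not GT.M code: the two POSIX primitives the health check above
 * leans on -- reading the source-server count from a SysV semaphore with semctl(GETVAL)
 * and probing a recorded pid with kill(pid, 0).  The semaphore id, semaphore number and
 * pid are placeholders supplied by the caller. */
#include <errno.h>
#include <signal.h>
#include <sys/ipc.h>
#include <sys/sem.h>
#include <sys/types.h>

/* Return the current value of semaphore 'semnum' in set 'semid', or -1 on error. */
static int read_sem_count(int semid, int semnum)
{
	return semctl(semid, semnum, GETVAL);
}

/* Return 1 if 'pid' exists (even if owned by another user), 0 otherwise. */
static int pid_is_alive(pid_t pid)
{
	if (0 == kill(pid, 0))
		return 1;
	return (EPERM == errno) ? 1 : 0;	/* EPERM: process exists but is not signalable */
}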
void ccp_reqwm_interrupt(ccp_db_header **pdb) { ccp_db_header *db; sgmnt_addrs *csa; uint4 status; assert(lib$ast_in_prog()); db = *pdb; csa = db->segment; if (csa == NULL || csa->nl->ccp_state == CCST_CLOSED) return; switch (db->wm_iosb.cond) { case SS$_DEADLOCK: ccp_signal_cont(SS$_DEADLOCK); /* Just try again */ ccp_request_write_mode(db); return; case SS$_CANCEL: /* Lock cancelled by close */ return; case SS$_VALNOTVALID: /* Force reads from disk */ db->wm_iosb.valblk[CCP_VALBLK_TRANS_HIST] = 0; db->last_lk_sequence = db->master_map_start_tn = 0; /* Drop through ... */ case SS$_NORMAL: if (db->wm_iosb.valblk[CCP_VALBLK_TRANS_HIST] == csa->ti->curr_tn + csa->ti->lock_sequence) { /* No change to current tn, do not need to update header */ if (csa->now_crit) { assert (csa->nl->in_crit == process_id); csa->nl->in_crit = 0; (void)mutex_unlockw(csa->critical, csa->critical->crashcnt, &csa->now_crit); /***** Check error status here? *****/ } ccp_writedb5(db); } else { if (csa->nl->in_crit == 0) { if (mutex_lockwim(csa->critical, csa->critical->crashcnt, &csa->now_crit) == cdb_sc_normal) csa->nl->in_crit = process_id; /* now_crit was set by mutex_lockwim */ else if (csa->nl->in_crit == 0) /***** Why is this re-tested? *****/ { status = sys$setimr(0, delta_100_msec, ccp_reqwm_interrupt, &db->wmcrit_timer_id, 0); if (status != SS$_NORMAL) ccp_signal_cont(status); /***** Is this reasonable? *****/ return; } } status = sys$qio(0, FILE_INFO(db->greg)->fab->fab$l_stv, IO$_READVBLK, &db->qio_iosb, ccp_writedb2, db, &db->glob_sec->trans_hist, BT_SIZE(csa->hdr) + SIZEOF(th_index), TH_BLOCK, 0, 0, 0); if (status != SS$_NORMAL) ccp_signal_cont(status); /***** Is this reasonable? *****/ } return; default: ccp_signal_cont(db->wm_iosb.cond); /***** Is this reasonable? *****/ return; } }
error_def(ERR_DBCCERR); error_def(ERR_DBFLCORRP); void grab_crit(gd_region *reg) { unix_db_info *udi; sgmnt_addrs *csa; node_local_ptr_t cnl; sgmnt_data_ptr_t csd; enum cdb_sc status; mutex_spin_parms_ptr_t mutex_spin_parms; DEBUG_ONLY(sgmnt_addrs *jnlpool_csa;) DCL_THREADGBL_ACCESS; SETUP_THREADGBL_ACCESS; udi = FILE_INFO(reg); csa = &udi->s_addrs; csd = csa->hdr; cnl = csa->nl; # ifdef DEBUG if (gtm_white_box_test_case_enabled && (WBTEST_SENDTO_EPERM == gtm_white_box_test_case_number) && (0 == cnl->wbox_test_seq_num)) { FPRINTF(stderr, "MUPIP BACKUP entered grab_crit\n"); cnl->wbox_test_seq_num = 1; while (2 != cnl->wbox_test_seq_num) LONG_SLEEP(1); FPRINTF(stderr, "MUPIP BACKUP resumed in grab_crit\n"); cnl->wbox_test_seq_num = 3; }
bt_rec_ptr_t bt_put(gd_region *reg, int4 block) { bt_rec_ptr_t bt, q0, q1, hdr; sgmnt_addrs *csa; sgmnt_data_ptr_t csd; cache_rec_ptr_t cr; th_rec_ptr_t th; trans_num lcl_tn; uint4 lcnt; csa = (sgmnt_addrs *)&FILE_INFO(reg)->s_addrs; csd = csa->hdr; assert(csa->now_crit || csd->clustered); assert(dba_mm != csa->hdr->acc_meth); lcl_tn = csa->ti->curr_tn; hdr = csa->bt_header + (block % csd->bt_buckets); assert(BT_QUEHEAD == hdr->blk); for (lcnt = 0, bt = (bt_rec_ptr_t)((sm_uc_ptr_t)hdr + hdr->blkque.fl); ; bt = (bt_rec_ptr_t)((sm_uc_ptr_t)bt + bt->blkque.fl), lcnt++) { if (BT_QUEHEAD == bt->blk) { /* there is no matching bt */ assert(bt == hdr); bt = (bt_rec_ptr_t)((sm_uc_ptr_t)(csa->th_base) + csa->th_base->tnque.fl - SIZEOF(th->tnque)); if (CR_NOTVALID != bt->cache_index) { /* the oldest bt is still valid */ assert(!in_wcs_recover); cr = (cache_rec_ptr_t)GDS_ANY_REL2ABS(csa, bt->cache_index); if (cr->dirty) { /* get it written so it can be reused */ BG_TRACE_PRO_ANY(csa, bt_put_flush_dirty); if (FALSE == wcs_get_space(reg, 0, cr)) { assert(csa->nl->wc_blocked); /* only reason we currently know * why wcs_get_space could fail */ assert(gtm_white_box_test_case_enabled); BG_TRACE_PRO_ANY(csa, wcb_bt_put); send_msg(VARLSTCNT(8) ERR_WCBLOCKED, 6, LEN_AND_LIT("wcb_bt_put"), process_id, &lcl_tn, DB_LEN_STR(reg)); return NULL; } } bt->cache_index = CR_NOTVALID; cr->bt_index = 0; } q0 = (bt_rec_ptr_t)((sm_uc_ptr_t)bt + bt->blkque.fl); q1 = (bt_rec_ptr_t)remqt((que_ent_ptr_t)q0); if (EMPTY_QUEUE == (sm_long_t)q1) rts_error(VARLSTCNT(3) ERR_BTFAIL, 1, 1); bt->blk = block; bt->killtn = lcl_tn; insqt((que_ent_ptr_t)bt, (que_ent_ptr_t)hdr); th = (th_rec_ptr_t)remqh((que_ent_ptr_t)csa->th_base); if (EMPTY_QUEUE == (sm_long_t)th) GTMASSERT; break; } if (bt->blk == block) { /* bt_put should never be called twice for the same block with the same lcl_tn. This is because * t_end/tp_tend update every block only once as part of each update transaction. Assert this. * The two exceptions are * a) Forward journal recovery which simulates a 2-phase M-kill where the same block * could get updated in both phases (example bitmap block gets updated for blocks created * within the TP transaction as well as for blocks that are freed up in the 2nd phase of * the M-kill) with the same transaction number. This is because although GT.M would have * updated the same block with different transaction numbers in the two phases, forward * recovery will update it with the same tn and instead increment the db tn on seeing the * following INCTN journal record(s). * b) Cache recovery (wcs_recover). It could call bt_put more than once for the same block * and potentially with the same tn. This is because the state of the queues is questionable * and there could be more than one cache record for a given block number. */ assert(in_wcs_recover || (bt->tn < lcl_tn) || (jgbl.forw_phase_recovery && !JNL_ENABLED(csa))); q0 = (bt_rec_ptr_t)((sm_uc_ptr_t)bt + bt->tnque.fl); th = (th_rec_ptr_t)remqt((que_ent_ptr_t)((sm_uc_ptr_t)q0 + SIZEOF(th->tnque))); if (EMPTY_QUEUE == (sm_long_t)th) GTMASSERT; break; } if (0 == bt->blkque.fl) rts_error(VARLSTCNT(3) ERR_BTFAIL, 1, 2); if (lcnt >= csd->n_bts) rts_error(VARLSTCNT(3) ERR_BTFAIL, 1, 3); } insqt((que_ent_ptr_t)th, (que_ent_ptr_t)csa->th_base); bt->tn = lcl_tn; return bt; }
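/* Illustrative sketch, not GT.M code: the recycling policy bt_put() implements, stripped
 * down to a flat table in ordinary process memory.  A lookup that misses reuses the entry
 * with the oldest transaction number, the same way bt_put() takes the head of the
 * tn-ordered queue; the real code additionally keeps hash chains in shared memory and
 * flushes any dirty cache record before reuse, all of which this sketch omits.  Names and
 * sizes are hypothetical; curr_tn is assumed to be non-zero. */
#define SKETCH_N_BTS 32

struct sketch_bt
{
	long		blk;	/* block number this entry describes */
	unsigned long	tn;	/* transaction number of the last update; 0 = never used */
};

static struct sketch_bt sketch_tbl[SKETCH_N_BTS];	/* zero-initialized: all slots free */

/* Return the entry for 'blk', recycling the least recently updated entry on a miss. */
static struct sketch_bt *sketch_bt_put(long blk, unsigned long curr_tn)
{
	struct sketch_bt	*bt, *oldest = &sketch_tbl[0];
	int			i;

	for (i = 0; i < SKETCH_N_BTS; i++)
	{
		bt = &sketch_tbl[i];
		if ((bt->blk == blk) && (0 != bt->tn))
		{	/* already present: just refresh its tn, as bt_put() does */
			bt->tn = curr_tn;
			return bt;
		}
		if (bt->tn < oldest->tn)
			oldest = bt;	/* free slots have tn 0 and so are picked first */
	}
	oldest->blk = blk;		/* miss: recycle the oldest (or a free) slot */
	oldest->tn = curr_tn;
	return oldest;
}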
/* * Description: * Grab ftok semaphore on replication instance file * Grab all replication semaphores for the instance (both jnlpool and recvpool) * Release ftok semaphore * Parameters: * Return Value: TRUE, if succsessful * FALSE, if fails. */ boolean_t mu_replpool_grab_sem(boolean_t immediate) { char instfilename[MAX_FN_LEN + 1]; gd_region *r_save; static gd_region *replreg; int status, save_errno; union semun semarg; struct semid_ds semstat; repl_inst_hdr repl_instance; unix_db_info *udi; unsigned int full_len; error_def(ERR_RECVPOOLSETUP); error_def(ERR_JNLPOOLSETUP); error_def(ERR_REPLFTOKSEM); error_def(ERR_TEXT); if (NULL == replreg) { r_save = gv_cur_region; mu_gv_cur_reg_init(); replreg = gv_cur_region; gv_cur_region = r_save; } jnlpool.jnlpool_dummy_reg = replreg; recvpool.recvpool_dummy_reg = replreg; if (!repl_inst_get_name(instfilename, &full_len, MAX_FN_LEN + 1, issue_rts_error)) GTMASSERT; /* rts_error should have been issued by repl_inst_get_name */ assert(full_len); memcpy((char *)replreg->dyn.addr->fname, instfilename, full_len); replreg->dyn.addr->fname_len = full_len; udi = FILE_INFO(replreg); udi->fn = (char *)replreg->dyn.addr->fname; if (!ftok_sem_get(replreg, TRUE, REPLPOOL_ID, immediate)) rts_error(VARLSTCNT(4) ERR_REPLFTOKSEM, 2, full_len, instfilename); repl_inst_read(instfilename, (off_t)0, (sm_uc_ptr_t)&repl_instance, SIZEOF(repl_inst_hdr)); /* * -------------------------- * First semaphores of jnlpool * -------------------------- */ if (-1 == (udi->semid = init_sem_set_source(IPC_PRIVATE, NUM_SRC_SEMS, RWDALL | IPC_CREAT))) { ftok_sem_release(replreg, TRUE, TRUE); rts_error(VARLSTCNT(7) ERR_JNLPOOLSETUP, 0, ERR_TEXT, 2, RTS_ERROR_LITERAL("Error creating journal pool"), REPL_SEM_ERRNO); } semarg.val = GTM_ID; if (-1 == semctl(udi->semid, SOURCE_ID_SEM, SETVAL, semarg)) { save_errno = errno; remove_sem_set(SOURCE); /* Remove what we created */ ftok_sem_release(replreg, TRUE, TRUE); rts_error(VARLSTCNT(7) ERR_JNLPOOLSETUP, 0, ERR_TEXT, 2, RTS_ERROR_LITERAL("Error with jnlpool semctl"), save_errno); } semarg.buf = &semstat; if (-1 == semctl(udi->semid, 0, IPC_STAT, semarg)) { save_errno = errno; remove_sem_set(SOURCE); /* Remove what we created */ ftok_sem_release(replreg, TRUE, TRUE); rts_error(VARLSTCNT(7) ERR_JNLPOOLSETUP, 0, ERR_TEXT, 2, RTS_ERROR_LITERAL("Error with jnlpool semctl"), save_errno); } udi->gt_sem_ctime = semarg.buf->sem_ctime; status = grab_sem_all_source(); if (0 != status) { remove_sem_set(SOURCE); /* Remove what we created */ ftok_sem_release(replreg, TRUE, TRUE); rts_error(VARLSTCNT(1) ERR_JNLPOOLSETUP); } repl_instance.jnlpool_semid = udi->semid; repl_instance.jnlpool_semid_ctime = udi->gt_sem_ctime; /* * -------------------------- * Now semaphores of recvpool * -------------------------- */ assert(NUM_SRC_SEMS == NUM_RECV_SEMS); if (-1 == (udi->semid = init_sem_set_recvr(IPC_PRIVATE, NUM_RECV_SEMS, RWDALL | IPC_CREAT))) { remove_sem_set(SOURCE); ftok_sem_release(replreg, TRUE, TRUE); rts_error(VARLSTCNT(7) ERR_RECVPOOLSETUP, 0, ERR_TEXT, 2, RTS_ERROR_LITERAL("Error creating recv pool"), REPL_SEM_ERRNO); } semarg.val = GTM_ID; if (-1 == semctl(udi->semid, RECV_ID_SEM, SETVAL, semarg)) { save_errno = errno; remove_sem_set(SOURCE); remove_sem_set(RECV); ftok_sem_release(replreg, TRUE, TRUE); rts_error(VARLSTCNT(7) ERR_RECVPOOLSETUP, 0, ERR_TEXT, 2, RTS_ERROR_LITERAL("Error with recvpool semctl"), save_errno); } semarg.buf = &semstat; if (-1 == semctl(udi->semid, 0, IPC_STAT, semarg)) /* For creation time */ { save_errno = errno; 
remove_sem_set(SOURCE); remove_sem_set(RECV); ftok_sem_release(replreg, TRUE, TRUE); rts_error(VARLSTCNT(7) ERR_RECVPOOLSETUP, 0, ERR_TEXT, 2, RTS_ERROR_LITERAL("Error with recvpool semctl"), save_errno); } udi->gt_sem_ctime = semarg.buf->sem_ctime; status = grab_sem_all_receive(); if (0 != status) { remove_sem_set(SOURCE); remove_sem_set(RECV); ftok_sem_release(replreg, TRUE, TRUE); rts_error(VARLSTCNT(1) ERR_RECVPOOLSETUP); } repl_instance.recvpool_semid = udi->semid; repl_instance.recvpool_semid_ctime = udi->gt_sem_ctime; /* Initialize jnlpool.repl_inst_filehdr as it is used later by gtmrecv_fetchresync() */ assert(NULL == jnlpool.repl_inst_filehdr); jnlpool.repl_inst_filehdr = (repl_inst_hdr_ptr_t)malloc(SIZEOF(repl_inst_hdr)); memcpy(jnlpool.repl_inst_filehdr, &repl_instance, SIZEOF(repl_inst_hdr)); /* Flush changes to the replication instance file header to disk */ repl_inst_write(instfilename, (off_t)0, (sm_uc_ptr_t)&repl_instance, SIZEOF(repl_inst_hdr)); /* Now release jnlpool/recvpool ftok semaphore */ if (!ftok_sem_release(replreg, FALSE, immediate)) { remove_sem_set(SOURCE); remove_sem_set(RECV); rts_error(VARLSTCNT(4) ERR_REPLFTOKSEM, 2, full_len, instfilename); } return TRUE; }
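/* Illustrative sketch, not GT.M code: the create-and-fingerprint pattern used above --
 * create a private SysV semaphore set, then record its creation time via semctl(IPC_STAT)
 * so a later run can tell whether a semid saved in the instance file still names the same
 * set.  'nsems' and the permissions are placeholders; on Linux the caller must declare
 * union semun itself, as done here. */
#include <time.h>
#include <sys/ipc.h>
#include <sys/sem.h>
#include <sys/types.h>

union semun_sketch
{
	int		val;
	struct semid_ds	*buf;
	unsigned short	*array;
};

/* Create a private semaphore set; return its id and store its sem_ctime in *ctime_out. */
static int create_stamped_semset(int nsems, time_t *ctime_out)
{
	int			semid;
	struct semid_ds		semstat;
	union semun_sketch	arg;

	semid = semget(IPC_PRIVATE, nsems, 0660 | IPC_CREAT);
	if (-1 == semid)
		return -1;
	arg.buf = &semstat;
	if (-1 == semctl(semid, 0, IPC_STAT, arg))
	{	/* cannot fingerprint the set: remove it and fail, as the code above does */
		semctl(semid, 0, IPC_RMID);
		return -1;
	}
	*ctime_out = semstat.sem_ctime;
	return semid;
}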
OS_PAGE_SIZE_DECLARE uint4 gdsfilext(uint4 blocks, uint4 filesize, boolean_t trans_in_prog) { sm_uc_ptr_t old_base[2], mmap_retaddr; boolean_t was_crit, is_mm; char buff[DISK_BLOCK_SIZE]; int result, save_errno, status; uint4 new_bit_maps, bplmap, map, new_blocks, new_total, max_tot_blks, old_total; uint4 jnl_status, to_wait, to_msg, wait_period; gtm_uint64_t avail_blocks, mmap_sz; off_t new_eof; trans_num curr_tn; unix_db_info *udi; inctn_opcode_t save_inctn_opcode; int4 prev_extend_blks_to_upgrd; jnl_private_control *jpc; jnl_buffer_ptr_t jbp; cache_rec_ptr_t cr; DCL_THREADGBL_ACCESS; assert(!IS_DSE_IMAGE); assert((cs_addrs->nl == NULL) || (process_id != cs_addrs->nl->trunc_pid)); /* mu_truncate shouldn't extend file... */ assert(!process_exiting); DEBUG_ONLY(old_base[0] = old_base[1] = NULL); assert(!gv_cur_region->read_only); udi = FILE_INFO(gv_cur_region); is_mm = (dba_mm == cs_addrs->hdr->acc_meth); # if !defined(MM_FILE_EXT_OK) if (!udi->grabbed_access_sem && is_mm) return (uint4)(NO_FREE_SPACE); /* should this be changed to show extension not allowed ? */ # endif /* Both blocks and total blocks are unsigned ints so make sure we aren't asking for huge numbers that will overflow and end up doing silly things. */ assert((blocks <= (MAXTOTALBLKS(cs_data) - cs_data->trans_hist.total_blks)) || WBTEST_ENABLED(WBTEST_FILE_EXTEND_ERROR)); if (!blocks) return (uint4)(NO_FREE_SPACE); /* should this be changed to show extension not enabled ? */ bplmap = cs_data->bplmap; /* New total of non-bitmap blocks will be number of current, non-bitmap blocks, plus new blocks desired * There are (bplmap - 1) non-bitmap blocks per bitmap, so add (bplmap - 2) to number of non-bitmap blocks * and divide by (bplmap - 1) to get total number of bitmaps for expanded database. (must round up in this * manner as every non-bitmap block must have an associated bitmap) * Current number of bitmaps is (total number of current blocks + bplmap - 1) / bplmap. * Subtract current number of bitmaps from number needed for expanded database to get number of new bitmaps needed. */ new_bit_maps = DIVIDE_ROUND_UP(cs_data->trans_hist.total_blks - DIVIDE_ROUND_UP(cs_data->trans_hist.total_blks, bplmap) + blocks, bplmap - 1) - DIVIDE_ROUND_UP(cs_data->trans_hist.total_blks, bplmap); new_blocks = blocks + new_bit_maps; assert(0 < (int)new_blocks); if (new_blocks + cs_data->trans_hist.total_blks > MAXTOTALBLKS(cs_data)) { assert(FALSE); send_msg_csa(CSA_ARG(cs_addrs) VARLSTCNT(1) ERR_TOTALBLKMAX); return (uint4)(NO_FREE_SPACE); } if (0 != (save_errno = disk_block_available(udi->fd, &avail_blocks, FALSE))) { send_msg_csa(CSA_ARG(cs_addrs) VARLSTCNT(5) ERR_DBFILERR, 2, DB_LEN_STR(gv_cur_region), save_errno); rts_error_csa(CSA_ARG(cs_addrs) VARLSTCNT(5) ERR_DBFILERR, 2, DB_LEN_STR(gv_cur_region), save_errno); } else { if (!(gtmDebugLevel & GDL_IgnoreAvailSpace)) { /* Bypass this space check if debug flag above is on. Allows us to create a large sparce DB * in space it could never fit it if wasn't sparse. Needed for some tests. 
*/ avail_blocks = avail_blocks / (cs_data->blk_size / DISK_BLOCK_SIZE); if ((blocks * EXTEND_WARNING_FACTOR) > avail_blocks) { if (blocks > (uint4)avail_blocks) { SETUP_THREADGBL_ACCESS; if (!INST_FREEZE_ON_NOSPC_ENABLED(cs_addrs)) return (uint4)(NO_FREE_SPACE); else send_msg_csa(CSA_ARG(cs_addrs) VARLSTCNT(6) MAKE_MSG_WARNING(ERR_NOSPACEEXT), 4, DB_LEN_STR(gv_cur_region), new_blocks, (uint4)avail_blocks); } else send_msg_csa(CSA_ARG(cs_addrs) VARLSTCNT(5) ERR_DSKSPACEFLOW, 3, DB_LEN_STR(gv_cur_region), (uint4)(avail_blocks - ((new_blocks <= avail_blocks) ? new_blocks : 0))); } } } /* From here on, we need to use GDSFILEXT_CLNUP before returning to the caller */ was_crit = cs_addrs->now_crit; assert(!cs_addrs->hold_onto_crit || was_crit); /* If we are coming from mupip_extend (which gets crit itself) we better have waited for any unfreezes to occur. * If we are coming from online rollback (when that feature is available), we will come in holding crit and in * the final retry. In that case too, we expect to have waited for unfreezes to occur in the caller itself. * Therefore if we are coming in holding crit from MUPIP, we expect the db to be unfrozen so no need to wait for * freeze. * If we are coming from GT.M and final retry (in which case we come in holding crit) we expect to have waited * for any unfreezes (by invoking tp_crit_all_regions) to occur (TP or non-TP) before coming into this * function. However, there is one exception. In the final retry, if tp_crit_all_regions notices that * at least one of the participating regions did ONLY READs, it will not wait for any freeze on THAT region * to complete before grabbing crit. Later, in the final retry, if THAT region did an update which caused * op_tcommit to invoke bm_getfree->gdsfilext, then we would have come here with a frozen region on which * we hold crit. */ assert(!was_crit || !cs_data->freeze || (dollar_tlevel && (CDB_STAGNATE <= t_tries))); /* * If we are in the final retry and already hold crit, it is possible that csa->nl->wc_blocked is also set to TRUE * (by a concurrent process in phase2 which encountered an error in the midst of commit and secshr_db_clnup * finished the job for it). In this case we do NOT want to invoke wcs_recover as that will update the "bt" * transaction numbers without correspondingly updating the history transaction numbers (effectively causing * a cdb_sc_blkmod type of restart). Therefore do NOT call grab_crit (which unconditionally invokes wcs_recover) * if we already hold crit. */ if (!was_crit) { for ( ; ; ) { grab_crit(gv_cur_region); if (!cs_data->freeze && !IS_REPL_INST_FROZEN) break; rel_crit(gv_cur_region); while (cs_data->freeze || IS_REPL_INST_FROZEN) hiber_start(1000); } } else if (cs_data->freeze && dollar_tlevel) { /* We don't want to continue with file extension as explained above. Hence return with an error code which * op_tcommit will recognize (as a cdb_sc_needcrit/cdb_sc_instancefreeze type of restart) and restart accordingly. */ assert(CDB_STAGNATE <= t_tries); GDSFILEXT_CLNUP; return (uint4)FINAL_RETRY_FREEZE_PROG; } if (IS_REPL_INST_FROZEN && trans_in_prog) { assert(CDB_STAGNATE <= t_tries); GDSFILEXT_CLNUP; return (uint4)FINAL_RETRY_INST_FREEZE; } assert(cs_addrs->ti->total_blks == cs_data->trans_hist.total_blks); old_total = cs_data->trans_hist.total_blks; if (old_total != filesize) { /* Somebody else has already extended it, since we are in crit, this is trust-worthy. 
However, in case of MM, * we still need to remap the database */ assert((old_total > filesize) GTM_TRUNCATE_ONLY( || !is_mm)); /* For BG, someone else could have truncated or extended - we have no idea */ GDSFILEXT_CLNUP; return (SS_NORMAL); }
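/* Illustrative sketch, not GT.M code: the free-space arithmetic behind the extension check
 * above, using POSIX statvfs().  Free bytes on the filesystem are converted into database
 * blocks and compared with what the extension needs; WARN_FACTOR stands in for
 * EXTEND_WARNING_FACTOR and the return codes are placeholders. */
#include <stdio.h>
#include <sys/statvfs.h>

#define WARN_FACTOR 8	/* placeholder: warn when free space < blocks needed * factor */

/* Return 0 if the extension fits comfortably, 1 if space is getting tight, -1 if it does not fit. */
static int check_extension_space(const char *dbpath, unsigned long blk_size, unsigned long blocks_needed)
{
	struct statvfs		vfs;
	unsigned long long	avail_blocks;

	if (0 != statvfs(dbpath, &vfs))
		return -1;					/* cannot determine free space */
	avail_blocks = ((unsigned long long)vfs.f_bavail * vfs.f_frsize) / blk_size;
	if (blocks_needed > avail_blocks)
		return -1;					/* the NO_FREE_SPACE case above */
	if ((unsigned long long)blocks_needed * WARN_FACTOR > avail_blocks)
	{
		fprintf(stderr, "warning: only %llu database blocks of free space remain\n", avail_blocks);
		return 1;					/* the DSKSPACEFLOW warning case above */
	}
	return 0;
}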
/* Upgrade ^#t global in "reg" region */ void trigger_upgrade(gd_region *reg) { boolean_t est_first_pass, do_upgrade, is_defined; boolean_t was_null = FALSE, is_null = FALSE; int seq_num, trig_seq_num; int currlabel; mval tmpmval, xecuteimval, *gvname, *tmpmv, *tmpmv2; int4 result, tmpint4; uint4 curend, gvname_prev, xecute_curend; uint4 hash_code, kill_hash_code; int count, i, xecutei, tncount; char *trigname, *trigindex, *ptr; char name_and_index[MAX_MIDENT_LEN + 1 + MAX_DIGITS_IN_INT]; char trigvn[MAX_MIDENT_LEN + 1 + MAX_DIGITS_IN_INT], nullbyte[1]; uint4 trigname_len, name_index_len; int ilen; sgmnt_addrs *csa; jnl_private_control *jpc; uint4 sts; int close_res; hash128_state_t hash_state, kill_hash_state; uint4 hash_totlen, kill_hash_totlen; int trig_protected_mval_push_count; # ifdef DEBUG int save_dollar_tlevel; # endif DCL_THREADGBL_ACCESS; SETUP_THREADGBL_ACCESS; assert(gv_cur_region == reg); assert(!dollar_tlevel); /* caller should have ensured this. this is needed as otherwise things get complicated. */ assert(!is_replicator); /* caller should have ensured this. this is needed so we dont bump jnl_seqno (if replicating) */ csa = &FILE_INFO(reg)->s_addrs; assert(csa->hdr->hasht_upgrade_needed); /* If before-image journaling is turned on in this region (does not matter if replication is turned on or not), * once this transaction is done, we need to switch to new journal file and cut the back link because * otherwise it is possible for backward journal recovery (or rollback) or source server to encounter * the journal records generated in this ^#t-upgrade-transaction in which case they dont know to handle * it properly (e.g. rollback or backward recovery does not know to restore csa->hdr->hasht_upgrade_needed * if it rolls back this transaction). To achieve this, we set hold_onto_crit to TRUE and do the jnl link * cut AFTER the transaction commits but before anyone else can sneak in to do any more updates. * Since most often we expect databases to be journaled, we do this hold_onto_crit even for the non-journaled case. */ grab_crit(reg); csa->hold_onto_crit = TRUE; DEBUG_ONLY(save_dollar_tlevel = dollar_tlevel); assert(!donot_INVOKE_MUMTSTART); DEBUG_ONLY(donot_INVOKE_MUMTSTART = TRUE); op_tstart(IMPLICIT_TSTART, TRUE, &literal_batch, 0); /* 0 ==> save no locals but RESTART OK */ ESTABLISH_NORET(trigger_upgrade_ch, est_first_pass); /* On a TP restart anywhere down below, this line is where the restart resumes execution from */ assert(donot_INVOKE_MUMTSTART); /* Make sure still set for every try/retry of TP transaction */ change_reg(); /* TP_CHANGE_REG wont work as we need to set sgm_info_ptr */ assert(NULL != cs_addrs); assert(csa == cs_addrs); SET_GVTARGET_TO_HASHT_GBL(csa); /* sets up gv_target */ assert(NULL != gv_target); INITIAL_HASHT_ROOT_SEARCH_IF_NEEDED; /* Needed to do every retry in case restart was due to an online rollback. * This also sets up gv_currkey */ /* Do actual upgrade of ^#t global. 
* * Below is a sample layout of the label 2 ^#t global * ------------------------------------------------------- * ^#t("#TNAME","x")="a"_$C(0)_"1" (present in DEFAULT only) * ^#t("#TRHASH",89771515,1)="a"_$C(0)_"1" (present in DEFAULT only) * ^#t("#TRHASH",106937755,1)="a"_$C(0)_"1" (present in DEFAULT only) * ^#t("a",1,"BHASH")="106937755" * ^#t("a",1,"CHSET")="M" * ^#t("a",1,"CMD")="S" * ^#t("a",1,"LHASH")="89771515" * ^#t("a",1,"TRIGNAME")="x#" * ^#t("a",1,"XECUTE")=" do ^twork" * ^#t("a","#COUNT")="1" * ^#t("a","#CYCLE")="1" * ^#t("a","#LABEL")="2" * * Below is a sample layout of the label 3 ^#t global * ------------------------------------------------------- * ^#t("#LABEL")="3" (present only after upgrade, not regular trigger load) * ^#t("#TNAME","x")="a"_$C(0)_"1" (present in CURRENT region) * ^#t("a",1,"BHASH")="71945627" * ^#t("a",1,"CHSET")="M" * ^#t("a",1,"CMD")="S" * ^#t("a",1,"LHASH")="71945627" * ^#t("a",1,"TRIGNAME")="x#" * ^#t("a",1,"XECUTE")=" do ^twork" * ^#t("a","#COUNT")="1" * ^#t("a","#CYCLE")="2" * ^#t("a","#LABEL")="3" * ^#t("a","#TRHASH",71945627,1)="a"_$C(0)_"1" * * Key aspects of the format change * ---------------------------------- * 1) New ^#t("#LABEL")="3" to indicate the format of the ^#t global. This is in addition to * ^#t("a","#LABEL") etc. which is already there. This way we have a #LABEL for not just the installed * triggers but also for the name information stored in the #TNAME nodes. * 2) Changes in the BHASH and LHASH fields. The hash computation is different so there are more chances of BHASH and LHASH * matching in which case we store only one #TRHASH entry (instead of two). So there are fewer ^#t records in the new * format in most cases. * 3) ^#t("a","#LABEL") bumps from 2 to 3. Similarly ^#t("a","#CYCLE") bumps by one (to make sure triggers for this * global get re-read if and when we implement an -ONLINE upgrade). * 4) DEFAULT used to have ^#t("#TNAME",...) nodes corresponding to triggers across ALL regions in the gbldir and * other regions used to have NO ^#t("#TNAME",...) nodes whereas after the upgrade every region has * ^#t("#TNAME",...) nodes corresponding to triggers installed in that region. So it is safer to kill ^#t("#TNAME") * nodes and add them as needed. * 5) #TRHASH has moved from ^#t() to ^#t(<gbl>). So it is safer to kill ^#t("#TRHASH") nodes and add them as needed. * * Below is a sample layout of the label 4 ^#t global * ------------------------------------------------------- * ^#t("#TNAME","x")="a"_$C(0)_"1" (present in CURRENT region) * ^#t("a",1,"BHASH")="71945627" * ^#t("a",1,"CHSET")="M" * ^#t("a",1,"CMD")="S" * ^#t("a",1,"LHASH")="71945627" * ^#t("a",1,"TRIGNAME")="x#" * ^#t("a",1,"XECUTE")=" do ^twork" * ^#t("a","#COUNT")="1" * ^#t("a","#CYCLE")="2" * ^#t("a","#LABEL")="4" * ^#t("a","#TRHASH",71945627,1)="a"_$C(0)_"1" * * Key aspects of the format change * ---------------------------------- * 1) Removed ^#t("#LABEL") as it is redundant information and trigger load does not include it * 2) Multiline triggers were incorrectly processed resulting in incorrect BHASH and LHASH values. Upgrade fixes this * 3) ^#t("a","#LABEL") bumps from 3 to 4. Similarly ^#t("a","#CYCLE") bumps by one (to make sure * triggers for this global get re-read if and when we implement an -ONLINE upgrade). */ tmpmv = &tmpmval; /* At all points maintain this relationship.
The two are used interchangeably below */ if (gv_target->root) do_upgrade = TRUE; /* The below logic assumes ^#t global does not have any integrity errors */ assert(do_upgrade); /* caller should have not invoked us otherwise */ if (do_upgrade) { /* kill ^#t("#TRHASH"), ^#t("#TNAME") and ^#t("#LABEL") first. Regenerate each again as we process ^#t(<gbl>,...) */ csa->incr_db_trigger_cycle = TRUE; /* so that we increment csd->db_trigger_cycle at commit time. * this forces concurrent processes to read upgraded triggers. */ if (JNL_WRITE_LOGICAL_RECS(csa)) { /* Note that the ^#t upgrade is a physical layout change. But it has no logical change (i.e. users * see the same MUPIP TRIGGER -SELECT output as before). So write only a dummy LGTRIG journal * record for this operation. Hence write a string that starts with a trigger comment character ";". */ assert(!gv_cur_region->read_only); jnl_format(JNL_LGTRIG, NULL, (mval *)&literal_trigjnlrec, 0); } /* KILL ^#t("#LABEL") unconditionally */ BUILD_HASHT_SUB_CURRKEY(LITERAL_HASHLABEL, STRLEN(LITERAL_HASHLABEL)); if (0 != gvcst_data()) gvcst_kill(TRUE); /* KILL ^#t("#TNAME") unconditionally and regenerate */ BUILD_HASHT_SUB_CURRKEY(LITERAL_HASHTNAME, STRLEN(LITERAL_HASHTNAME)); if (0 != gvcst_data()) gvcst_kill(TRUE); /* KILL ^#t("#TRHASH") unconditionally and regenerate */ BUILD_HASHT_SUB_CURRKEY(LITERAL_HASHTRHASH, STRLEN(LITERAL_HASHTRHASH)); if (0 != gvcst_data()) gvcst_kill(TRUE); /* Loop through all global names for which ^#t(<gvn>) exists. The only first-level subscripts of ^#t starting * with # are #TNAME and #TRHASH in collation order. So after #TRHASH we expect to find subscripts that are * global names. Hence the HASHTRHASH code is placed AFTER the HASHTNAME code above. */ TREF(gd_targ_gvnh_reg) = NULL; /* needed so op_gvorder below goes through gvcst_order (i.e. focuses only * on the current region) and NOT through gvcst_spr_order (which does not * apply anyways in the case of ^#t). */ nullbyte[0] = '\0'; trig_protected_mval_push_count = 0; INCR_AND_PUSH_MV_STENT(gvname); /* Protect gvname from garbage collection */ do { op_gvorder(gvname); if (0 == gvname->str.len) break; assert(ARRAYSIZE(trigvn) > gvname->str.len); memcpy(&trigvn[0], gvname->str.addr, gvname->str.len); gvname->str.addr = &trigvn[0]; /* point away from stringpool to avoid stp_gcol issues */ /* Save gv_currkey->prev so it is restored before next call to op_gvorder (which cares about this field). * gv_currkey->prev gets tampered with in the for loop below (e.g. BUILD_HASHT_SUB_CURRKEY macro). * No need to do this for gv_currkey->end since the body of the for loop takes care of restoring it. */ gvname_prev = gv_currkey->prev; BUILD_HASHT_SUB_CURRKEY(gvname->str.addr, gvname->str.len); /* At this point, gv_currkey is ^#t(<gvn>) */ /* Increment ^#t(<gvn>,"#CYCLE") */ is_defined = gvtr_get_hasht_gblsubs((mval *)&literal_hashcycle, tmpmv); assert(is_defined); tmpint4 = mval2i(tmpmv); tmpint4++; i2mval(tmpmv, tmpint4); gvtr_set_hasht_gblsubs((mval *)&literal_hashcycle, tmpmv); /* Read ^#t(<gvn>,"#COUNT") */ is_defined = gvtr_get_hasht_gblsubs((mval *)&literal_hashcount, tmpmv); if (is_defined) { tmpint4 = mval2i(tmpmv); count = tmpint4; /* Get ^#t(<gvn>,"#LABEL"), error out for invalid values. 
Upgrade disallowed for label 1 triggers */ is_defined = gvtr_get_hasht_gblsubs((mval *)&literal_hashlabel, tmpmv); assert(is_defined); currlabel = mval2i(tmpmv); if ((V19_HASHT_GBL_LABEL_INT >= currlabel) || (HASHT_GBL_CURLABEL_INT <= currlabel)) rts_error_csa(CSA_ARG(csa) VARLSTCNT(8) ERR_TRIGUPBADLABEL, 6, currlabel, HASHT_GBL_CURLABEL_INT, gvname->str.len, gvname->str.addr, REG_LEN_STR(reg)); /* Set ^#t(<gvn>,"#LABEL")=HASHT_GBL_CURLABEL */ gvtr_set_hasht_gblsubs((mval *)&literal_hashlabel, (mval *)&literal_curlabel); } else count = 0; /* Kill ^#t(<gvn>,"#TRHASH") unconditionally and regenerate */ gvtr_kill_hasht_gblsubs((mval *)&literal_hashtrhash, TRUE); /* At this point, gv_currkey is ^#t(<gvn>) */ for (i = 1; i <= count; i++) { /* At this point, gv_currkey is ^#t(<gvn>) */ curend = gv_currkey->end; /* note gv_currkey->end before changing it so we can restore it later */ assert(KEY_DELIMITER == gv_currkey->base[curend]); assert(gv_target->gd_csa == cs_addrs); i2mval(tmpmv, i); COPY_SUBS_TO_GVCURRKEY(tmpmv, gv_cur_region, gv_currkey, was_null, is_null); /* At this point, gv_currkey is ^#t(<gvn>,i) */ /* Compute new LHASH and BHASH hash values. * LHASH uses : GVSUBS, XECUTE * BHASH uses : GVSUBS, DELIM, ZDELIM, PIECES, XECUTE * So reach each of these pieces and compute hash along the way. */ STR_PHASH_INIT(hash_state, hash_totlen); STR_PHASH_PROCESS(hash_state, hash_totlen, gvname->str.addr, gvname->str.len); STR_PHASH_PROCESS(hash_state, hash_totlen, nullbyte, 1); /* Read in ^#t(<gvn>,i,"GVSUBS") */ is_defined = gvtr_get_hasht_gblsubs((mval *)&literal_gvsubs, tmpmv); if (is_defined) { STR_PHASH_PROCESS(hash_state, hash_totlen, tmpmval.str.addr, tmpmval.str.len); STR_PHASH_PROCESS(hash_state, hash_totlen, nullbyte, 1); } /* Copy over SET hash state (2-tuple <state,totlen>) to KILL hash state before adding * the PIECES, DELIM, ZDELIM portions (those are only part of the SET hash). */ kill_hash_state = hash_state; kill_hash_totlen = hash_totlen; /* Read in ^#t(<gvn>,i,"PIECES") */ is_defined = gvtr_get_hasht_gblsubs((mval *)&literal_pieces, tmpmv); if (is_defined) { STR_PHASH_PROCESS(hash_state, hash_totlen, tmpmval.str.addr, tmpmval.str.len); STR_PHASH_PROCESS(hash_state, hash_totlen, nullbyte, 1); } /* Read in ^#t(<gvn>,i,"DELIM") */ is_defined = gvtr_get_hasht_gblsubs((mval *)&literal_delim, tmpmv); if (is_defined) { STR_PHASH_PROCESS(hash_state, hash_totlen, tmpmval.str.addr, tmpmval.str.len); STR_PHASH_PROCESS(hash_state, hash_totlen, nullbyte, 1); } /* Read in ^#t(<gvn>,i,"ZDELIM") */ is_defined = gvtr_get_hasht_gblsubs((mval *)&literal_zdelim, tmpmv); if (is_defined) { STR_PHASH_PROCESS(hash_state, hash_totlen, tmpmval.str.addr, tmpmval.str.len); STR_PHASH_PROCESS(hash_state, hash_totlen, nullbyte, 1); } /* Read in ^#t(<gvn>,i,"XECUTE"). * Note: The XECUTE portion of the trigger definition is used in SET and KILL hash. * But since we have started maintaining "hash_state" and "kill_hash_state" separately * (due to PIECES, DELIM, ZDELIM) we need to update the hash for both using same input string. 
*/ is_defined = gvtr_get_hasht_gblsubs((mval *)&literal_xecute, tmpmv); if (is_defined) { STR_PHASH_PROCESS(hash_state, hash_totlen, tmpmval.str.addr, tmpmval.str.len); STR_PHASH_PROCESS(kill_hash_state, kill_hash_totlen, tmpmval.str.addr, tmpmval.str.len); } else { /* Multi-record XECUTE string */ /* At this point, gv_currkey is ^#t(<gvn>,i) */ xecute_curend = gv_currkey->end; /* note gv_currkey->end so we can restore it later */ assert(KEY_DELIMITER == gv_currkey->base[xecute_curend]); tmpmv2 = (mval *)&literal_xecute; COPY_SUBS_TO_GVCURRKEY(tmpmv2, gv_cur_region, gv_currkey, was_null, is_null); xecutei = 1; do { i2mval(&xecuteimval, xecutei); is_defined = gvtr_get_hasht_gblsubs(&xecuteimval, tmpmv); if (!is_defined) break; STR_PHASH_PROCESS(hash_state, hash_totlen, tmpmval.str.addr, tmpmval.str.len); STR_PHASH_PROCESS(kill_hash_state, kill_hash_totlen, tmpmval.str.addr, tmpmval.str.len); xecutei++; } while (TRUE); /* Restore gv_currkey to ^#t(<gvn>,i) */ gv_currkey->end = xecute_curend; gv_currkey->base[xecute_curend] = KEY_DELIMITER; } STR_PHASH_RESULT(hash_state, hash_totlen, hash_code); STR_PHASH_RESULT(kill_hash_state, kill_hash_totlen, kill_hash_code); /* Set ^#t(<gvn>,i,"LHASH") */ MV_FORCE_UMVAL(tmpmv, kill_hash_code); gvtr_set_hasht_gblsubs((mval *)&literal_lhash, tmpmv); /* Set ^#t(<gvn>,i,"BHASH") */ MV_FORCE_UMVAL(tmpmv, hash_code); gvtr_set_hasht_gblsubs((mval *)&literal_bhash, tmpmv); /* Read in ^#t(<gvn>,i,"TRIGNAME") to determine if #SEQNUM/#TNCOUNT needs to be maintained */ is_defined = gvtr_get_hasht_gblsubs((mval *)&literal_trigname, tmpmv); assert(is_defined); assert('#' == tmpmval.str.addr[tmpmval.str.len - 1]); tmpmval.str.len--; if ((tmpmval.str.len <= ARRAYSIZE(name_and_index)) && (NULL != (ptr = memchr(tmpmval.str.addr, '#', tmpmval.str.len)))) { /* Auto-generated name. Need to maintain #SEQNUM/#TNCOUNT */ /* Take copy of trigger name into non-stringpool location to avoid stp_gcol issues */ trigname_len = ptr - tmpmval.str.addr; ptr++; name_index_len = (tmpmval.str.addr + tmpmval.str.len) - ptr; assert(ARRAYSIZE(name_and_index) >= (trigname_len + 1 + name_index_len)); trigname = &name_and_index[0]; trigindex = ptr; memcpy(trigname, tmpmval.str.addr, tmpmval.str.len); A2I(ptr, ptr + name_index_len, trig_seq_num); /* At this point, gv_currkey is ^#t(<gvn>,i) */ /* $get(^#t("#TNAME",<trigger name>,"#SEQNUM")) */ BUILD_HASHT_SUB_SUB_SUB_CURRKEY(LITERAL_HASHTNAME, STR_LIT_LEN(LITERAL_HASHTNAME), trigname, trigname_len, LITERAL_HASHSEQNUM, STR_LIT_LEN(LITERAL_HASHSEQNUM)); seq_num = gvcst_get(tmpmv) ? mval2i(tmpmv) : 0; if (trig_seq_num > seq_num) { /* Set ^#t("#TNAME",<trigger name>,"#SEQNUM") = trig_seq_num */ SET_TRIGGER_GLOBAL_SUB_SUB_SUB_STR(LITERAL_HASHTNAME, STR_LIT_LEN(LITERAL_HASHTNAME), trigname, trigname_len, LITERAL_HASHSEQNUM, STR_LIT_LEN(LITERAL_HASHSEQNUM), trigindex, name_index_len, result); assert(PUT_SUCCESS == result); } /* set ^#t("#TNAME",<trigger name>,"#TNCOUNT")++ */ BUILD_HASHT_SUB_SUB_SUB_CURRKEY(LITERAL_HASHTNAME, STR_LIT_LEN(LITERAL_HASHTNAME), trigname, trigname_len, LITERAL_HASHTNCOUNT, STR_LIT_LEN(LITERAL_HASHTNCOUNT)); tncount = gvcst_get(tmpmv) ? 
mval2i(tmpmv) + 1 : 1; i2mval(tmpmv, tncount); SET_TRIGGER_GLOBAL_SUB_SUB_SUB_MVAL(LITERAL_HASHTNAME, STR_LIT_LEN(LITERAL_HASHTNAME), trigname, trigname_len, LITERAL_HASHTNCOUNT, STR_LIT_LEN(LITERAL_HASHTNCOUNT), tmpmval, result); trigname_len += 1 + name_index_len; /* in preparation for ^#t("#TNAME") set below */ assert(PUT_SUCCESS == result); BUILD_HASHT_SUB_CURRKEY(gvname->str.addr, gvname->str.len); /* At this point, gv_currkey is ^#t(<gvn>) */ } else { /* Take copy of trigger name into non-stringpool location to avoid stp_gcol issues */ trigname = &name_and_index[0]; /* in preparation for ^#t("#TNAME") set below */ trigname_len = MIN(tmpmval.str.len, ARRAYSIZE(name_and_index)); assert(ARRAYSIZE(name_and_index) >= trigname_len); memcpy(trigname, tmpmval.str.addr, trigname_len); /* Restore gv_currkey to what it was at beginning of for loop iteration */ gv_currkey->end = curend; gv_currkey->base[curend] = KEY_DELIMITER; } /* At this point, gv_currkey is ^#t(<gvn>) */ if (kill_hash_code != hash_code) gvtr_set_hashtrhash(gvname->str.addr, gvname->str.len, kill_hash_code, i); /* Set ^#t(<gvn>,"#TRHASH",hash_code,i) */ gvtr_set_hashtrhash(gvname->str.addr, gvname->str.len, hash_code, i); /* Set ^#t("#TNAME",<trigname>)=<gvn>_$c(0)_<trigindx> */ /* The upgrade assumes that the region does not contain two triggers with the same name. * V62000 and before could potentially have this out of design case. Once implemented * the trigger integrity check will warn users of this edge case */ ptr = &trigvn[gvname->str.len]; *ptr++ = '\0'; ilen = 0; I2A(ptr, ilen, i); ptr += ilen; assert(ptr <= ARRAYTOP(trigvn)); SET_TRIGGER_GLOBAL_SUB_SUB_STR(LITERAL_HASHTNAME, STR_LIT_LEN(LITERAL_HASHTNAME), trigname, trigname_len, trigvn, ptr - gvname->str.addr, result); assert(PUT_SUCCESS == result); BUILD_HASHT_SUB_CURRKEY(gvname->str.addr, gvname->str.len); /* At this point, gv_currkey is ^#t(<gvn>) */ } /* At this point, gv_currkey is ^#t(<gvn>) i.e. gv_currkey->end is correct but gv_currkey->prev * might have been tampered with. Restore it to proper value first. */ gv_currkey->prev = gvname_prev; gvname->mvtype = 0; /* can now be garbage collected in the next iteration */ } while (TRUE); } op_tcommit(); REVERT; /* remove our condition handler */ DEBUG_ONLY(donot_INVOKE_MUMTSTART = FALSE;) if (csa->hold_onto_crit)
/* * This will rundown a replication instance journal (and receiver) pool. * Input Parameter: * replpool_id of the instance. Instaname must be null terminated in replpool_id. * Returns : * TRUE, if successful. * FALSE, otherwise. */ boolean_t mu_rndwn_repl_instance(replpool_identifier *replpool_id, boolean_t immediate) { boolean_t jnlpool_stat = TRUE, recvpool_stat = TRUE; char *instname, shmid_buff[TMP_BUF_LEN]; gd_region *r_save; repl_inst_fmt repl_instance; static gd_region *reg = NULL; struct semid_ds semstat; struct shmid_ds shmstat; union semun semarg; uchar_ptr_t ret_ptr; unix_db_info *udi; int save_errno; error_def(ERR_MUJPOOLRNDWNSUC); error_def(ERR_MURPOOLRNDWNSUC); error_def(ERR_MUJPOOLRNDWNFL); error_def(ERR_MURPOOLRNDWNFL); error_def(ERR_SEMREMOVED); error_def(ERR_REPLACCSEM); error_def(ERR_SYSCALL); if (NULL == reg) { r_save = gv_cur_region; mu_gv_cur_reg_init(); reg = gv_cur_region; gv_cur_region = r_save; } jnlpool.jnlpool_dummy_reg = reg; instname = replpool_id->instname; reg->dyn.addr->fname_len = strlen(instname); assert(0 == instname[reg->dyn.addr->fname_len]); memcpy((char *)reg->dyn.addr->fname, instname, reg->dyn.addr->fname_len + 1); udi = FILE_INFO(reg); udi->fn = (char *)reg->dyn.addr->fname; /* Lock replication instance using ftok semaphore */ if (!ftok_sem_get(reg, TRUE, REPLPOOL_ID, immediate)) return FALSE; repl_inst_get((char *)instname, &repl_instance); semarg.buf = &semstat; /* * -------------------------- * First rundown Journal pool * -------------------------- */ if (INVALID_SEMID != repl_instance.jnlpool_semid) if ((-1 == semctl(repl_instance.jnlpool_semid, 0, IPC_STAT, semarg)) || (semarg.buf->sem_ctime != repl_instance.jnlpool_semid_ctime)) repl_instance.jnlpool_semid = INVALID_SEMID; if (INVALID_SHMID != repl_instance.jnlpool_shmid) if ((-1 == shmctl(repl_instance.jnlpool_shmid, IPC_STAT, &shmstat)) || (shmstat.shm_ctime != repl_instance.jnlpool_shmid_ctime)) repl_instance.jnlpool_shmid = INVALID_SHMID; if (INVALID_SHMID != repl_instance.jnlpool_shmid) { jnlpool_stat = mu_rndwn_replpool(replpool_id, repl_instance.jnlpool_semid, repl_instance.jnlpool_shmid); ret_ptr = i2asc((uchar_ptr_t)shmid_buff, repl_instance.jnlpool_shmid); *ret_ptr = '\0'; gtm_putmsg(VARLSTCNT(6) (jnlpool_stat ? ERR_MUJPOOLRNDWNSUC : ERR_MUJPOOLRNDWNFL), 4, LEN_AND_STR(shmid_buff), LEN_AND_STR(replpool_id->instname)); } else if (INVALID_SEMID != repl_instance.jnlpool_semid) { if (0 == sem_rmid(repl_instance.jnlpool_semid)) { /* note that shmid_buff used here is actually a buffer to hold semid (not shmid) */ ret_ptr = i2asc((uchar_ptr_t)shmid_buff, repl_instance.jnlpool_semid); *ret_ptr = '\0'; gtm_putmsg(VARLSTCNT(9) ERR_MUJPOOLRNDWNSUC, 4, LEN_AND_STR(shmid_buff), LEN_AND_STR(replpool_id->instname), ERR_SEMREMOVED, 1, repl_instance.jnlpool_semid); } else { save_errno = errno; gtm_putmsg(VARLSTCNT(13) ERR_REPLACCSEM, 3, repl_instance.jnlpool_semid, RTS_ERROR_STRING(instname), ERR_SYSCALL, 5, RTS_ERROR_LITERAL("jnlpool sem_rmid()"), CALLFROM, save_errno); } /* Note that jnlpool_stat is not set to FALSE in case sem_rmid() fails above. This is because the journal pool is * anyway not present and it is safer to reset the sem/shmids in the instance file. The only thing this might cause * is a stranded semaphore but that is considered better than getting errors due to not resetting instance file. 
*/ } if (jnlpool_stat) /* Reset instance file for jnlpool info */ repl_inst_jnlpool_reset(); /* * -------------------------- * Now rundown Receive pool * -------------------------- */ recvpool.recvpool_dummy_reg = reg; if (INVALID_SEMID != repl_instance.recvpool_semid) if ((-1 == semctl(repl_instance.recvpool_semid, 0, IPC_STAT, semarg)) || (semarg.buf->sem_ctime != repl_instance.recvpool_semid_ctime)) repl_instance.recvpool_semid = INVALID_SEMID; if (INVALID_SHMID != repl_instance.recvpool_shmid) if ((-1 == shmctl(repl_instance.recvpool_shmid, IPC_STAT, &shmstat)) || (shmstat.shm_ctime != repl_instance.recvpool_shmid_ctime)) repl_instance.recvpool_shmid = INVALID_SHMID; if (INVALID_SHMID != repl_instance.recvpool_shmid) { recvpool_stat = mu_rndwn_replpool(replpool_id, repl_instance.recvpool_semid, repl_instance.recvpool_shmid); ret_ptr = i2asc((uchar_ptr_t)shmid_buff, repl_instance.recvpool_shmid); *ret_ptr = '\0'; gtm_putmsg(VARLSTCNT(6) (recvpool_stat ? ERR_MURPOOLRNDWNSUC : ERR_MURPOOLRNDWNFL), 4, LEN_AND_STR(shmid_buff), LEN_AND_STR(replpool_id->instname)); } else if (INVALID_SEMID != repl_instance.recvpool_semid) { if (0 == sem_rmid(repl_instance.recvpool_semid)) { /* note that shmid_buff used here is actually a buffer to hold semid (not shmid) */ ret_ptr = i2asc((uchar_ptr_t)shmid_buff, repl_instance.recvpool_semid); *ret_ptr = '\0'; gtm_putmsg(VARLSTCNT(9) ERR_MURPOOLRNDWNSUC, 4, LEN_AND_STR(shmid_buff), LEN_AND_STR(replpool_id->instname), ERR_SEMREMOVED, 1, repl_instance.recvpool_semid); } else { save_errno = errno; gtm_putmsg(VARLSTCNT(13) ERR_REPLACCSEM, 3, repl_instance.recvpool_semid, RTS_ERROR_STRING(instname), ERR_SYSCALL, 5, RTS_ERROR_LITERAL("recvpool sem_rmid()"), CALLFROM, save_errno); } /* Note that recvpool_stat is not set to FALSE in case sem_rmid() fails above. This is because the receive pool is * anyway not present and it is safer to reset the sem/shmids in the instance file. The only thing this might cause * is a stranded semaphore but that is considered better than getting errors due to not resetting the instance file. */ } if (recvpool_stat) /* Reset instance file for recvpool info */ repl_inst_recvpool_reset(); /* Release replication instance ftok semaphore lock */ if (!ftok_sem_release(reg, TRUE, immediate)) return FALSE; return (jnlpool_stat && recvpool_stat); }
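/* Illustrative sketch, not GT.M code: the staleness test applied above before a saved
 * shmid is trusted -- IPC_STAT the segment and compare its creation time with the one
 * recorded when it was created.  If the id is gone, or the ctime differs, the saved id now
 * refers to some other segment (or nothing) and must be discarded.  Names are placeholders. */
#include <time.h>
#include <sys/ipc.h>
#include <sys/shm.h>

#define SKETCH_INVALID_SHMID (-1)

/* Return the saved shmid if it still names the original segment, else SKETCH_INVALID_SHMID. */
static int validate_saved_shmid(int saved_shmid, time_t saved_ctime)
{
	struct shmid_ds shmstat;

	if (SKETCH_INVALID_SHMID == saved_shmid)
		return SKETCH_INVALID_SHMID;
	if (-1 == shmctl(saved_shmid, IPC_STAT, &shmstat))
		return SKETCH_INVALID_SHMID;	/* id no longer exists or is inaccessible */
	if (shmstat.shm_ctime != saved_ctime)
		return SKETCH_INVALID_SHMID;	/* id has been recycled by another segment */
	return saved_shmid;
}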
int mu_replpool_grab_sem(repl_inst_hdr_ptr_t repl_inst_filehdr, char pool_type, boolean_t *sem_created_ptr) { int status, save_errno, sem_id, semval, semnum, instfilelen; time_t sem_ctime; boolean_t sem_created; char *instfilename; union semun semarg; struct semid_ds semstat; gd_region *replreg; DEBUG_ONLY(unix_db_info *udi;) *sem_created_ptr = sem_created = FALSE; /* assume semaphore not created by default */ /* First ensure that the caller has grabbed the ftok semaphore on the replication instance file */ assert((NULL != jnlpool.jnlpool_dummy_reg) && (jnlpool.jnlpool_dummy_reg == recvpool.recvpool_dummy_reg)); replreg = jnlpool.jnlpool_dummy_reg; DEBUG_ONLY(udi = FILE_INFO(jnlpool.jnlpool_dummy_reg)); assert(udi->grabbed_ftok_sem); /* the caller should have grabbed ftok semaphore */ instfilename = (char *)replreg->dyn.addr->fname; instfilelen = replreg->dyn.addr->fname_len; assert((NULL != instfilename) && (0 != instfilelen) && ('\0' == instfilename[instfilelen])); assert((JNLPOOL_SEGMENT == pool_type) || (RECVPOOL_SEGMENT == pool_type)); if (JNLPOOL_SEGMENT == pool_type) { sem_id = repl_inst_filehdr->jnlpool_semid; sem_ctime = repl_inst_filehdr->jnlpool_semid_ctime; } else { sem_id = repl_inst_filehdr->recvpool_semid; sem_ctime = repl_inst_filehdr->recvpool_semid_ctime; }
void gds_rundown(void) { bool is_mm, we_are_last_user, we_are_last_writer; boolean_t ipc_deleted, remove_shm, cancelled_timer, cancelled_dbsync_timer, vermismatch; now_t now; /* for GET_CUR_TIME macro */ char *time_ptr, time_str[CTIME_BEFORE_NL + 2]; /* for GET_CUR_TIME macro */ gd_region *reg; int save_errno, status; int4 semval, ftok_semval, sopcnt, ftok_sopcnt; short crash_count; sm_long_t munmap_len; sgmnt_addrs *csa; sgmnt_data_ptr_t csd; struct shmid_ds shm_buf; struct sembuf sop[2], ftok_sop[2]; uint4 jnl_status; unix_db_info *udi; jnl_private_control *jpc; jnl_buffer_ptr_t jbp; error_def(ERR_CRITSEMFAIL); error_def(ERR_DBCCERR); error_def(ERR_DBFILERR); error_def(ERR_DBRNDWNWRN); error_def(ERR_ERRCALL); error_def(ERR_GBLOFLOW); error_def(ERR_GTMASSERT); error_def(ERR_IPCNOTDEL); error_def(ERR_JNLFLUSH); error_def(ERR_RNDWNSEMFAIL); error_def(ERR_TEXT); error_def(ERR_WCBLOCKED); forced_exit = FALSE; /* Okay, we're dying already -- let rel_crit live in peace now. * If coming through a DAL, not necessarily dying. what to do then? -- nars -- 8/15/2001 */ grabbed_access_sem = FALSE; jnl_status = 0; reg = gv_cur_region; /* Local copy */ /* * early out for cluster regions * to avoid tripping the assert below. * Note: * This early out is consistent with VMS. It has been * noted that all of the gtcm assignments * to gv_cur_region should use the TP_CHANGE_REG * macro. This would also avoid the assert problem * and should be done eventually. */ if (dba_cm == reg->dyn.addr->acc_meth) return; udi = FILE_INFO(reg); csa = &udi->s_addrs; csd = csa->hdr; assert(csa == cs_addrs && csd == cs_data); if ((reg->open) && (dba_usr == csd->acc_meth)) { change_reg(); gvusr_rundown(); return; } ESTABLISH(gds_rundown_ch); if (!reg->open) /* Not open, no point to rundown */ { if (reg->opening) /* Died partway open, kill rest of way */ { rel_crit(reg); mutex_cleanup(reg); /* revist this to handle MM properly SMW 98/12/16 if (NULL != csa->nl) { status = shmdt((caddr_t)csa->nl); if (-1 == status) send_msg(VARLSTCNT(9) ERR_DBFILERR, 2, DB_LEN_STR(reg), ERR_TEXT, 2, LEN_AND_LIT("Error during shmdt"), errno); } */ shmdt((caddr_t)csa->nl); csa->nl = NULL; } REVERT; return; } switch(csd->acc_meth) { /* Pass mm and bg through */ case dba_bg: is_mm = FALSE; break; case dba_mm: is_mm = TRUE; break; case dba_usr: assert(FALSE); default: REVERT; return; } /* Cancel any pending flush timer for this region by this task */ CANCEL_DB_TIMERS(reg, cancelled_timer, cancelled_dbsync_timer); we_are_last_user = FALSE; if (!csa->persistent_freeze) region_freeze(reg, FALSE, FALSE, FALSE); assert(!csa->read_lock); rel_crit(reg); /* get locks to known state */ mutex_cleanup(reg); /* * We need to guarantee that none else access database file header when semid/shmid fields are reset. * We already have created ftok semaphore in db_init or, mu_rndwn_file and did not remove it. * So just lock it. We do it in blocking mode. */ if (!ftok_sem_lock(reg, FALSE, FALSE)) rts_error(VARLSTCNT(4) ERR_DBFILERR, 2, DB_LEN_STR(reg)); /* * For mupip_jnl_recover we already have database access control semaphore. * We do not release it. We release it from mur_close_files. 
*/ if (!mupip_jnl_recover) { sop[0].sem_num = 0; sop[0].sem_op = 0; /* Wait for 0 */ sop[1].sem_num = 0; sop[1].sem_op = 1; /* Lock */ sopcnt = 2; sop[0].sem_flg = sop[1].sem_flg = SEM_UNDO | IPC_NOWAIT; /* Don't wait the first time thru */ SEMOP(udi->semid, sop, sopcnt, status); if (-1 == status) /* We couldn't get it in one shot -- see if we already have it */ { save_errno = errno; /* see comment about Linux specific difference in behaviour of semctl() with GETPID in gds_rundown_ch() */ if (semctl(udi->semid, 0, GETPID) == process_id) { send_msg(VARLSTCNT(5) MAKE_MSG_INFO(ERR_CRITSEMFAIL), 2, DB_LEN_STR(reg), ERR_RNDWNSEMFAIL); REVERT; return; /* Already in rundown for this region */ } if (EAGAIN != save_errno) { assert(FALSE); rts_error(VARLSTCNT(9) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg), ERR_TEXT, 2, RTS_ERROR_TEXT("gds_rundown first semop/semctl"), save_errno); } sop[0].sem_flg = sop[1].sem_flg = SEM_UNDO; /* Try again - blocking this time */ SEMOP(udi->semid, sop, 2, status); if (-1 == status) /* We couldn't get it at all.. */ rts_error(VARLSTCNT(5) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg), errno); } } grabbed_access_sem = TRUE; /* * We now have the dbinit/rundown lock, so we are alone in this code for this region * and nobody else can attach. * See if we are all alone in accessing this database shared memory. */ assert(csa->ref_cnt); /* decrement private ref_cnt before shared ref_cnt decrement. */ csa->ref_cnt--; /* Currently journaling logic in gds_rundown() in VMS relies on this order to detect last writer */ assert(!csa->ref_cnt); --csa->nl->ref_cnt; if (memcmp(csa->nl->now_running, gtm_release_name, gtm_release_name_len + 1)) { /* VERMISMATCH condition. Possible only if DSE */ assert(dse_running); vermismatch = TRUE; } else vermismatch = FALSE; if (-1 == shmctl(udi->shmid, IPC_STAT, &shm_buf)) { save_errno = errno; rts_error(VARLSTCNT(9) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg), ERR_TEXT, 2, RTS_ERROR_TEXT("gds_rundown shmctl"), save_errno); } else we_are_last_user = (1 == shm_buf.shm_nattch) && !vermismatch; assert(!mupip_jnl_recover || we_are_last_user); /* recover => one user */ if (-1 == (semval = semctl(udi->semid, 1, GETVAL))) rts_error(VARLSTCNT(5) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg), errno); we_are_last_writer = (1 == semval) && (FALSE == reg->read_only) && !vermismatch;/* There's one writer left and I am it */ assert(!(mupip_jnl_recover && !reg->read_only) || we_are_last_writer); /* recover + R/W region => one writer */ if (-1 == (ftok_semval = semctl(udi->ftok_semid, 1, GETVAL))) rts_error(VARLSTCNT(5) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg), errno); /* If csa->nl->donotflush_dbjnl is set, it means mupip recover/rollback was interrupted and therefore we should * not flush shared memory contents to disk as they might be in an inconsistent state. * In this case, we will go ahead and remove shared memory (without flushing the contents) in this routine. * A reissue of the recover/rollback command will restore the database to a consistent state. * Otherwise, if we have write access to this region, let us perform a few writing tasks. 
*/ if (csa->nl->donotflush_dbjnl) csa->wbuf_dqd = 0; /* ignore csa->wbuf_dqd status as we do not care about the cache contents */ else if (!reg->read_only && !vermismatch) { /* If we had an orphaned block and were interrupted, set wc_blocked so we can invoke wcs_recover */ if (csa->wbuf_dqd) { grab_crit(reg); SET_TRACEABLE_VAR(csd->wc_blocked, TRUE); BG_TRACE_PRO_ANY(csa, wcb_gds_rundown); send_msg(VARLSTCNT(8) ERR_WCBLOCKED, 6, LEN_AND_LIT("wcb_gds_rundown"), process_id, &csa->ti->curr_tn, DB_LEN_STR(reg)); csa->wbuf_dqd = 0; wcs_recover(reg); if (is_mm) { assert(FALSE); csd = csa->hdr; } BG_TRACE_PRO_ANY(csa, lost_block_recovery); rel_crit(reg); } if (JNL_ENABLED(csd) && (GTCM_GNP_SERVER_IMAGE == image_type)) originator_prc_vec = NULL; /* If we are the last writing user, then everything must be flushed */ if (we_are_last_writer) { /* Time to flush out all of our buffers */ if (is_mm) { if (csa->total_blks != csa->ti->total_blks) /* do remap if file had been extended */ { grab_crit(reg); wcs_mm_recover(reg); csd = csa->hdr; rel_crit(reg); } csa->nl->remove_shm = TRUE; } /* Note WCSFLU_SYNC_EPOCH ensures the epoch is synced to the journal and indirectly * also ensures that the db is fsynced. We don't want to use it in the calls to * wcs_flu() from t_end() and tp_tend() since we can defer it to out-of-crit there. * In this case, since we are running down, we don't have any such option. */ csa->nl->remove_shm = wcs_flu(WCSFLU_FLUSH_HDR | WCSFLU_WRITE_EPOCH | WCSFLU_SYNC_EPOCH); /* Since we_are_last_writer, we should be guaranteed that wcs_flu() did not change csd, (in * case of MM for potential file extension), even if it did a grab_crit(). Therefore, make * sure that's true. */ assert(csd == csa->hdr); assert(0 == memcmp(csd->label, GDS_LABEL, GDS_LABEL_SZ - 1)); csd->trans_hist.header_open_tn = csd->trans_hist.curr_tn; } else if ((cancelled_timer && (0 > csa->nl->wcs_timers)) || cancelled_dbsync_timer) { /* cancelled pending db or jnl flush timers - flush database and journal buffers to disk */ grab_crit(reg); /* we need to sync the epoch as the fact that there is no active pending flush timer implies * there will be noone else who will flush the dirty buffers and EPOCH to disk in a timely fashion */ wcs_flu(WCSFLU_FLUSH_HDR | WCSFLU_WRITE_EPOCH | WCSFLU_SYNC_EPOCH); rel_crit(reg); assert((dba_mm == cs_data->acc_meth) || (csd == cs_data)); csd = cs_data; /* In case this is MM and wcs_flu() remapped an extended database, reset csd */ } /* Do rundown journal processing after buffer flushes since they require jnl to be open */ if (JNL_ENABLED(csd)) { /* the following tp_change_reg() is not needed due to the assert csa == cs_addrs at the beginning * of gds_rundown(), but just to be safe. To be removed by 2002!! --- nars -- 2001/04/25. 
*/ tp_change_reg(); /* call this because jnl_ensure_open checks cs_addrs rather than gv_cur_region */ jpc = csa->jnl; jbp = jpc->jnl_buff; if (jbp->fsync_in_prog_latch.u.parts.latch_pid == process_id) { assert(FALSE); COMPSWAP_UNLOCK(&jbp->fsync_in_prog_latch, process_id, 0, LOCK_AVAILABLE, 0); } if (jbp->io_in_prog_latch.u.parts.latch_pid == process_id) { assert(FALSE); COMPSWAP_UNLOCK(&jbp->io_in_prog_latch, process_id, 0, LOCK_AVAILABLE, 0); } if (((NOJNL != jpc->channel) && !JNL_FILE_SWITCHED(jpc)) || we_are_last_writer && (0 != csa->nl->jnl_file.u.inode)) { /* We need to close the journal file cleanly if we have the latest generation journal file open * or if we are the last writer and the journal file is open in shared memory (not necessarily * by ourselves e.g. the only process that opened the journal got shot abnormally) * Note: we should not infer anything from the shared memory value of csa->nl->jnl_file.u.inode * if we are not the last writer as it can be concurrently updated. */ grab_crit(reg); if (JNL_ENABLED(csd)) { SET_GBL_JREC_TIME; /* jnl_ensure_open/jnl_put_jrt_pini/pfin/jnl_file_close all need it */ /* Before writing to jnlfile, adjust jgbl.gbl_jrec_time if needed to maintain time order * of jnl records. This needs to be done BEFORE the jnl_ensure_open as that could write * journal records (if it decides to switch to a new journal file). */ ADJUST_GBL_JREC_TIME(jgbl, jbp); jnl_status = jnl_ensure_open(); if (0 == jnl_status) { /* If we_are_last_writer, we would have already done a wcs_flu() which would * have written an epoch record and we are guaranteed no further updates * since we are the last writer. So, just close the journal. * Although we assert pini_addr should be non-zero for last_writer, we * play it safe in PRO and write a PINI record if not written already. */ assert(!jbp->before_images || is_mm || !we_are_last_writer || 0 != jpc->pini_addr); if (we_are_last_writer && 0 == jpc->pini_addr) jnl_put_jrt_pini(csa); if (0 != jpc->pini_addr) jnl_put_jrt_pfin(csa); /* If not the last writer and no pending flush timer left, do jnl flush now */ if (!we_are_last_writer && (0 > csa->nl->wcs_timers)) { if (SS_NORMAL == (jnl_status = jnl_flush(reg))) { assert(jbp->freeaddr == jbp->dskaddr); jnl_fsync(reg, jbp->dskaddr); assert(jbp->fsync_dskaddr == jbp->dskaddr); } else { send_msg(VARLSTCNT(9) ERR_JNLFLUSH, 2, JNL_LEN_STR(csd), ERR_TEXT, 2, RTS_ERROR_TEXT("Error with journal flush in gds_rundown"), jnl_status); assert(NOJNL == jpc->channel);/* jnl file lost has been triggered */ /* In this routine, all code that follows from here on does not * assume anything about the journaling characteristics of this * database so it is safe to continue execution even though * journaling got closed in the middle. 
*/ } } jnl_file_close(reg, we_are_last_writer, FALSE); } else send_msg(VARLSTCNT(6) jnl_status, 4, JNL_LEN_STR(csd), DB_LEN_STR(reg)); } rel_crit(reg); } } if (we_are_last_writer) /* Flush the fileheader last and harden the file to disk */ { grab_crit(reg); /* To satisfy crit requirement in fileheader_sync() */ memset(csd->machine_name, 0, MAX_MCNAMELEN); /* clear the machine_name field */ if (!mupip_jnl_recover && we_are_last_user) { /* mupip_jnl_recover will do this after mur_close_file */ csd->semid = INVALID_SEMID; csd->shmid = INVALID_SHMID; csd->gt_sem_ctime.ctime = 0; csd->gt_shm_ctime.ctime = 0; } fileheader_sync(reg); rel_crit(reg); if (FALSE == is_mm) { if (-1 == fsync(udi->fd)) /* Sync it all */ { rts_error(VARLSTCNT(9) ERR_DBFILERR, 2, DB_LEN_STR(reg), ERR_TEXT, 2, RTS_ERROR_TEXT("Error during file sync at close"), errno); } } else { /* Now do final MM file sync before exit */ #if !defined(TARGETED_MSYNC) && !defined(NO_MSYNC) if (-1 == fsync(udi->fd)) /* Sync it all */ { rts_error(VARLSTCNT(9) ERR_DBFILERR, 2, DB_LEN_STR(reg), ERR_TEXT, 2, RTS_ERROR_TEXT("Error during file sync at close"), errno); } #else if (-1 == msync((caddr_t)csa->db_addrs[0], (size_t)(csa->db_addrs[1] - csa->db_addrs[0]), MS_SYNC)) { rts_error(VARLSTCNT(9) ERR_DBFILERR, 2, DB_LEN_STR(reg), ERR_TEXT, 2, RTS_ERROR_TEXT("Error during file msync at close"), errno); } #endif } } } /* end if (!reg->read_only && !csa->nl->donotflush_dbjnl) */ if (reg->read_only && we_are_last_user && !mupip_jnl_recover) { /* mupip_jnl_recover will do this after mur_close_file */ db_ipcs.semid = INVALID_SEMID; db_ipcs.shmid = INVALID_SHMID; db_ipcs.gt_sem_ctime = 0; db_ipcs.gt_shm_ctime = 0; db_ipcs.fn_len = reg->dyn.addr->fname_len; memcpy(db_ipcs.fn, reg->dyn.addr->fname, reg->dyn.addr->fname_len); db_ipcs.fn[reg->dyn.addr->fname_len] = 0; /* request gtmsecshr to flush. read_only cannot flush itself */ if (0 != send_mesg2gtmsecshr(FLUSH_DB_IPCS_INFO, 0, (char *)NULL, 0)) rts_error(VARLSTCNT(8) ERR_DBFILERR, 2, DB_LEN_STR(reg), ERR_TEXT, 2, RTS_ERROR_TEXT("gtmsecshr failed to update database file header")); } /* Done with file now, close it */ if (-1 == close(udi->fd)) { rts_error(VARLSTCNT(9) ERR_DBFILERR, 2, DB_LEN_STR(reg), ERR_TEXT, 2, LEN_AND_LIT("Error during file close"), errno); } /* Unmap storage if mm mode but only the part that is not the fileheader (so shows up in dumps) */ if (is_mm) { munmap_len = (sm_long_t)((csa->db_addrs[1] - csa->db_addrs[0]) - ROUND_UP(SIZEOF_FILE_HDR(csa->hdr), MSYNC_ADDR_INCS)); if (munmap_len > 0) { munmap((caddr_t)(csa->db_addrs[0] + ROUND_UP(SIZEOF_FILE_HDR(csa->hdr), MSYNC_ADDR_INCS)), (size_t)(munmap_len)); #ifdef DEBUG_DB64 rel_mmseg((caddr_t)csa->db_addrs[0]); #endif } } /* Detach our shared memory while still under lock so reference counts will be * correct for the next process to run down this region. * In the process also get the remove_shm status from node_local before detaching. * If csa->nl->donotflush_dbjnl is TRUE, it means we can safely remove shared memory without compromising data * integrity as a reissue of recover will restore the database to a consistent state. 
*/ remove_shm = !vermismatch && (csa->nl->remove_shm || csa->nl->donotflush_dbjnl); status = shmdt((caddr_t)csa->nl); csa->nl = NULL; /* dereferencing nl after detach is not right, so we set it to NULL so that we can test before dereference*/ if (-1 == status) send_msg(VARLSTCNT(9) ERR_DBFILERR, 2, DB_LEN_STR(reg), ERR_TEXT, 2, LEN_AND_LIT("Error during shmdt"), errno); reg->open = FALSE; /* If file is still not in good shape, die here and now before we get rid of our storage */ if (csa->wbuf_dqd) GTMASSERT; ipc_deleted = FALSE; /* If we are the very last user, remove shared storage id and the semaphores */ if (we_are_last_user) { /* remove shared storage, only if last writer to rundown did a successful wcs_flu() */ assert(!vermismatch); if (remove_shm) { ipc_deleted = TRUE; if (0 != shm_rmid(udi->shmid)) rts_error(VARLSTCNT(8) ERR_DBFILERR, 2, DB_LEN_STR(reg), ERR_TEXT, 2, RTS_ERROR_TEXT("Unable to remove shared memory")); } else if (is_src_server || is_updproc) { gtm_putmsg(VARLSTCNT(6) ERR_DBRNDWNWRN, 4, DB_LEN_STR(reg), process_id, process_id); send_msg(VARLSTCNT(6) ERR_DBRNDWNWRN, 4, DB_LEN_STR(reg), process_id, process_id); } else send_msg(VARLSTCNT(6) ERR_DBRNDWNWRN, 4, DB_LEN_STR(reg), process_id, process_id); /* * Don't release semaphore in case of mupip recover/rollback; since it has standalone access. * It will release the semaphore in mur_close_files. */ if (!mupip_jnl_recover) { if (0 != sem_rmid(udi->semid)) rts_error(VARLSTCNT(8) ERR_DBFILERR, 2, DB_LEN_STR(reg), ERR_TEXT, 2, RTS_ERROR_TEXT("Unable to remove semaphore")); grabbed_access_sem = FALSE; } } else { assert(!mupip_jnl_recover); /* If we were writing, get rid of our writer access count semaphore */ if (!reg->read_only) if (0 != (save_errno = do_semop(udi->semid, 1, -1, SEM_UNDO))) rts_error(VARLSTCNT(9) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg), ERR_TEXT, 2, RTS_ERROR_TEXT("gds_rundown write semaphore release"), save_errno); /* Now remove the rundown lock */ if (0 != (save_errno = do_semop(udi->semid, 0, -1, SEM_UNDO))) rts_error(VARLSTCNT(9) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg), ERR_TEXT, 2, RTS_ERROR_TEXT("gds_rundown rundown semaphore release"), save_errno); grabbed_access_sem = FALSE; } if (!ftok_sem_release(reg, !mupip_jnl_recover, FALSE)) rts_error(VARLSTCNT(4) ERR_DBFILERR, 2, DB_LEN_STR(reg)); if (!ipc_deleted) { GET_CUR_TIME; if (is_src_server) gtm_putmsg(VARLSTCNT(8) ERR_IPCNOTDEL, 6, CTIME_BEFORE_NL, time_ptr, LEN_AND_LIT("Source server"), REG_LEN_STR(reg)); if (is_updproc) gtm_putmsg(VARLSTCNT(8) ERR_IPCNOTDEL, 6, CTIME_BEFORE_NL, time_ptr, LEN_AND_LIT("Update process"), REG_LEN_STR(reg)); if (mupip_jnl_recover) { gtm_putmsg(VARLSTCNT(8) ERR_IPCNOTDEL, 6, CTIME_BEFORE_NL, time_ptr, LEN_AND_LIT("Mupip journal process"), REG_LEN_STR(reg)); send_msg(VARLSTCNT(8) ERR_IPCNOTDEL, 6, CTIME_BEFORE_NL, time_ptr, LEN_AND_LIT("Mupip journal process"), REG_LEN_STR(reg)); } } REVERT; }
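/* Illustrative sketch, not GT.M code: the "last user removes the IPCs" shape of
 * gds_rundown() above, using raw SysV calls in place of the shm_rmid()/sem_rmid() wrappers.
 * The caller is assumed to hold whatever external lock (the ftok semaphore above) makes the
 * attach-count check race-free; error reporting is reduced to return codes. */
#include <sys/ipc.h>
#include <sys/sem.h>
#include <sys/shm.h>

/* Detach 'addr' from segment 'shmid'; if we were the last attacher, remove shm and sem set. */
static int rundown_ipcs(void *addr, int shmid, int semid)
{
	struct shmid_ds	shmstat;
	int		last_user;

	if (-1 == shmctl(shmid, IPC_STAT, &shmstat))
		return -1;
	last_user = (1 == shmstat.shm_nattch);		/* only our own attach remains */
	if (-1 == shmdt(addr))
		return -1;
	if (!last_user)
		return 0;				/* others still attached: leave the IPCs alone */
	if (-1 == shmctl(shmid, IPC_RMID, NULL))	/* remove the shared memory segment */
		return -1;
	if (-1 == semctl(semid, 0, IPC_RMID))		/* remove the access-control semaphore set */
		return -1;
	return 0;
}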