bt_rec_ptr_t bt_get(int4 block)	/* block = block # to get */
{
	register sgmnt_addrs	*csa;
	bt_rec_ptr_t		bt;
	int			lcnt;

	csa = cs_addrs;
	assert(csa->read_lock || csa->now_crit);
	bt = csa->bt_header + (block % csa->hdr->bt_buckets);
	assert(bt->blk == BT_QUEHEAD);
	for (lcnt = csa->hdr->n_bts; lcnt > 0; lcnt--)
	{
		bt = (bt_rec_ptr_t)((sm_uc_ptr_t)bt + bt->blkque.fl);
		if (bt->blk == block)
			return bt;
		if (bt->blk == BT_QUEHEAD)
			return NULL;
	}
	SET_TRACEABLE_VAR(csa->hdr->wc_blocked, TRUE);
	BG_TRACE_PRO_ANY(csa, wc_blocked_bt_get);
	return NULL;	/* actually should return BT_INVALID or some such value but callers check only for NULL */
}
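/* Illustrative sketch only (not GT.M source): a simplified, self-contained model of the bounded,
 * relative-offset queue search that bt_get() performs above.  A block number hashes to a bucket
 * header; each entry's forward link holds a byte offset to the next entry; the walk is capped at
 * the table size so a damaged queue cannot loop forever.  All names below (demo_bt_t,
 * demo_bt_lookup, DEMO_QUEHEAD, DEMO_BUCKETS, DEMO_NBTS) are invented for this example.
 */
#include <stddef.h>

#define DEMO_QUEHEAD	(-1)	/* marks a bucket header, analogous to BT_QUEHEAD */
#define DEMO_BUCKETS	8
#define DEMO_NBTS	32

typedef struct demo_bt_struct
{
	int	blk;	/* block number, or DEMO_QUEHEAD for a bucket header */
	size_t	fl;	/* forward link: byte offset to the next entry in the bucket queue */
} demo_bt_t;

/* Returns the entry for "block", or NULL if it is absent or the queue looks damaged
 * (a real caller would then set the wc_blocked equivalent, as bt_get() does).
 */
demo_bt_t *demo_bt_lookup(demo_bt_t *bt_header, int block)
{
	demo_bt_t	*bt;
	int		lcnt;

	bt = bt_header + (block % DEMO_BUCKETS);
	for (lcnt = DEMO_NBTS; lcnt > 0; lcnt--)
	{
		bt = (demo_bt_t *)((unsigned char *)bt + bt->fl);
		if (bt->blk == block)
			return bt;
		if (DEMO_QUEHEAD == bt->blk)
			return NULL;	/* wrapped back to the bucket header: not present */
	}
	return NULL;	/* iteration bound exhausted: queue is suspect */
}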
void mlk_wake_pending(mlk_ctldata_ptr_t ctl, mlk_shrblk_ptr_t d, gd_region *reg)
{
	mlk_prcblk_ptr_t	next, pr;
	sm_uint_ptr_t		empty_slot, ctop;
	sgmnt_addrs		*csa;
	boolean_t		remote_pid;
	int			crit_wake_res;	/* also used in macro DO_CRIT_WAKE */
	int			lcnt;

	csa = &FILE_INFO(reg)->s_addrs;
	if (!d->pending)
		return;
	ctl->wakeups++;
	/* Before updating d->sequence ensure there is no process owning this lock, since otherwise when the owner process
	 * attempts to release the lock it will fail as its private copy of "p->sequence" will not match the shared memory
	 * "d->sequence".
	 */
	assert(!d->owner);
	d->sequence = csa->hdr->trans_hist.lock_sequence++;	/* This node is being awakened (GTCM) */
	BG_TRACE_PRO_ANY(csa, mlock_wakeups);			/* Record halted slumbers */
	if (reg->dyn.addr->acc_meth == dba_bg && csa->hdr->clustered)
	{
		remote_pid = FALSE;
		for (empty_slot = ctl->clus_pids, ctop = &ctl->clus_pids[NUM_CLST_LCKS - 1];
			*empty_slot && empty_slot <= ctop; empty_slot++)
			;
		for (pr = (mlk_prcblk_ptr_t)R2A(d->pending), lcnt = csa->hdr->lock_space_size / PRC_FACTOR; lcnt; lcnt--)
		{
			next = (pr->next) ? (mlk_prcblk_ptr_t)R2A(pr->next) : 0;	/* in case it's deleted */
			if ((pr->process_id & NODENUMBER) == (process_id & NODENUMBER))
			{
				DO_CRIT_WAKE;
			} else if (empty_slot <= ctop)
			{
				remote_pid = TRUE;
				*empty_slot = pr->process_id;
				empty_slot++;
			}
			if (next)
				pr = next;
			else
				break;
		}
		if (remote_pid)
			ccp_cluster_lock_wake(reg);
	} else
	{
		for (pr = (mlk_prcblk_ptr_t)R2A(d->pending), lcnt = csa->hdr->lock_space_size / PRC_FACTOR; lcnt; lcnt--)
		{
			next = (pr->next) ? (mlk_prcblk_ptr_t)R2A(pr->next) : 0;	/* in case it's deleted */
			DO_CRIT_WAKE;	/* Wake one process to keep things orderly; if it loses its way, others
					 * will jump in after a timeout */
			if (GONE == crit_wake_res && next)
				pr = next;
			else
				break;
		}
	}
	if (!lcnt)
		GTMASSERT;
	return;
}
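/* Illustrative sketch only (not GT.M source): the traversal pattern mlk_wake_pending() relies on
 * above, where waking a process may cause its prcblk to be removed from the pending list, so the
 * "next" link is captured before the current node is processed, and the walk is bounded so a
 * corrupted list cannot spin forever.  demo_prc_t, demo_wake() and demo_wake_all() are invented
 * for this example; GT.M itself uses relative offsets (R2A) rather than plain pointers.
 */
#include <stddef.h>

typedef struct demo_prc_struct
{
	struct demo_prc_struct	*next;	/* next pending process, or NULL */
	int			pid;
} demo_prc_t;

extern int demo_wake(demo_prc_t *pr);	/* hypothetical: may free or unlink *pr */

void demo_wake_all(demo_prc_t *head, int bound)
{
	demo_prc_t	*pr, *next;
	int		lcnt;

	for (pr = head, lcnt = bound; (NULL != pr) && lcnt; lcnt--)
	{
		next = pr->next;	/* capture before pr can be deleted out from under us */
		demo_wake(pr);
		pr = next;
	}
	/* A real caller would treat lcnt == 0 here as evidence of a damaged queue,
	 * much as mlk_wake_pending() does with its GTMASSERT.
	 */
}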
sm_uc_ptr_t t_qread(block_id blk, sm_int_ptr_t cycle, cache_rec_ptr_ptr_t cr_out) /* cycle is used in t_end to detect if the buffer has been refreshed since the t_qread */ { int4 status; uint4 blocking_pid; cache_rec_ptr_t cr; bt_rec_ptr_t bt; boolean_t clustered, hold_onto_crit, was_crit; int dummy, lcnt, ocnt; cw_set_element *cse; off_chain chain1; register sgmnt_addrs *csa; register sgmnt_data_ptr_t csd; enum db_ver ondsk_blkver; int4 dummy_errno; boolean_t already_built, is_mm, reset_first_tp_srch_status, set_wc_blocked, sleep_invoked; ht_ent_int4 *tabent; srch_blk_status *blkhist; trans_num dirty, blkhdrtn; sm_uc_ptr_t buffaddr; uint4 stuck_cnt = 0; boolean_t lcl_blk_free; node_local_ptr_t cnl; lcl_blk_free = block_is_free; block_is_free = FALSE; /* Reset to FALSE so that if t_qread fails below, we don't have an incorrect state of this var */ first_tp_srch_status = NULL; reset_first_tp_srch_status = FALSE; csa = cs_addrs; csd = csa->hdr; INCR_DB_CSH_COUNTER(csa, n_t_qreads, 1); is_mm = (dba_mm == csd->acc_meth); /* We better hold crit in the final retry (TP & non-TP). Only exception is journal recovery */ assert((t_tries < CDB_STAGNATE) || csa->now_crit || mupip_jnl_recover); if (dollar_tlevel) { assert(sgm_info_ptr); if (0 != sgm_info_ptr->cw_set_depth) { chain1 = *(off_chain *)&blk; if (1 == chain1.flag) { assert(sgm_info_ptr->cw_set_depth); if ((int)chain1.cw_index < sgm_info_ptr->cw_set_depth) tp_get_cw(sgm_info_ptr->first_cw_set, (int)chain1.cw_index, &cse); else { assert(FALSE == csa->now_crit); rdfail_detail = cdb_sc_blknumerr; return (sm_uc_ptr_t)NULL; } } else { if (NULL != (tabent = lookup_hashtab_int4(sgm_info_ptr->blks_in_use, (uint4 *)&blk))) first_tp_srch_status = tabent->value; else first_tp_srch_status = NULL; ASSERT_IS_WITHIN_TP_HIST_ARRAY_BOUNDS(first_tp_srch_status, sgm_info_ptr); cse = first_tp_srch_status ? first_tp_srch_status->cse : NULL; } assert(!cse || !cse->high_tlevel); assert(!chain1.flag || cse); if (cse) { /* transaction has modified the sought after block */ if ((gds_t_committed != cse->mode) || (n_gds_t_op < cse->old_mode)) { /* Changes have not been committed to shared memory, i.e. still in private memory. * Build block in private buffer if not already done and return the same. */ assert(gds_t_writemap != cse->mode); if (FALSE == cse->done) { /* out of date, so make it current */ assert(gds_t_committed != cse->mode); already_built = (NULL != cse->new_buff); /* Validate the block's search history right after building a private copy. * This is not needed in case gvcst_search is going to reuse the clue's search * history and return (because tp_hist will do the validation of this block). * But if gvcst_search decides to do a fresh traversal (because the clue does not * cover the path of the current input key etc.) the block build that happened now * will not get validated in tp_hist since it will instead be given the current * key's search history path (a totally new path) for validation. Since a private * copy of the block has been built, tp_tend would also skip validating this block * so it is necessary that we validate the block right here. Since it is tricky to * accurately differentiate between the two cases, we do the validation * unconditionally here (besides it is only a few if checks done per block build * so it is considered okay performance-wise). 
*/ gvcst_blk_build(cse, (uchar_ptr_t)cse->new_buff, 0); assert(NULL != cse->blk_target); if (!already_built && !chain1.flag) { buffaddr = first_tp_srch_status->buffaddr; cr = first_tp_srch_status->cr; assert((is_mm || cr) && buffaddr); blkhdrtn = ((blk_hdr_ptr_t)buffaddr)->tn; if (TP_IS_CDB_SC_BLKMOD3(cr, first_tp_srch_status, blkhdrtn)) { assert(CDB_STAGNATE > t_tries); rdfail_detail = cdb_sc_blkmod; /* should this be something else */ TP_TRACE_HIST_MOD(blk, gv_target, tp_blkmod_t_qread, cs_data, first_tp_srch_status->tn, blkhdrtn, ((blk_hdr_ptr_t)buffaddr)->levl); return (sm_uc_ptr_t)NULL; } if (!is_mm && ((first_tp_srch_status->cycle != cr->cycle) || (first_tp_srch_status->blk_num != cr->blk))) { assert(CDB_STAGNATE > t_tries); rdfail_detail = cdb_sc_lostcr; /* should this be something else */ return (sm_uc_ptr_t)NULL; } } cse->done = TRUE; } *cycle = CYCLE_PVT_COPY; *cr_out = 0; return (sm_uc_ptr_t)cse->new_buff; } else { /* Block changes are already committed to shared memory (possible if we are in TP * in the 2nd phase of M-Kill in gvcst_expand_free_subtree.c). In this case, read * block from shared memory; do not look at private memory (i.e. cse) as that might * not be as uptodate as shared memory. */ assert(csa->now_crit); /* gvcst_expand_free_subtree does t_qread in crit */ /* If this block was newly created as part of the TP transaction, it should not be killed * as part of the 2nd phase of M-kill. This is because otherwise the block's cse would * have had an old_mode of kill_t_create in which case we would not have come into this * else block. Assert accordingly. */ assert(!chain1.flag); first_tp_srch_status = NULL; /* do not use any previous srch_hist information */ } } } else { if (NULL != (tabent = lookup_hashtab_int4(sgm_info_ptr->blks_in_use, (uint4 *)&blk))) first_tp_srch_status = tabent->value; else first_tp_srch_status = NULL; } ASSERT_IS_WITHIN_TP_HIST_ARRAY_BOUNDS(first_tp_srch_status, sgm_info_ptr); if (!is_mm && first_tp_srch_status) { cr = first_tp_srch_status->cr; assert(cr && !first_tp_srch_status->cse); if (first_tp_srch_status->cycle == cr->cycle) { *cycle = first_tp_srch_status->cycle; *cr_out = cr; cr->refer = TRUE; if (CDB_STAGNATE <= t_tries) /* mu_reorg doesn't use TP else should have an || for that */ CWS_INSERT(blk); return (sm_uc_ptr_t)first_tp_srch_status->buffaddr; } else { /* Block was already part of the read-set of this transaction, but got recycled in the cache. * Allow block recycling by resetting first_tp_srch_status for this blk to reflect the new * buffer, cycle and cache-record. tp_hist (invoked much later) has validation checks to detect * if block recycling happened within the same mini-action and restart in that case. * Updating first_tp_srch_status has to wait until the end of t_qread since only then do we know * the values to update to. Set a variable that will enable the updation before returning. * Also assert that if we are in the final retry, we are never in a situation where we have a * block that got recycled since the start of the current mini-action. This is easily detected since * as part of the final retry we maintain a hash-table "cw_stagnate" that holds the blocks that * have been read as part of the current mini-action until now. 
*/ assert(CDB_STAGNATE > t_tries || (NULL == lookup_hashtab_int4(&cw_stagnate, (uint4 *)&blk))); reset_first_tp_srch_status = TRUE; } } } if ((blk >= csa->ti->total_blks) || (blk < 0)) { /* requested block out of range; could occur because of a concurrency conflict */ if ((&FILE_INFO(gv_cur_region)->s_addrs != csa) || (csd != cs_data)) GTMASSERT; assert(FALSE == csa->now_crit); rdfail_detail = cdb_sc_blknumerr; return (sm_uc_ptr_t)NULL; } if (is_mm) { *cycle = CYCLE_SHRD_COPY; *cr_out = 0; return (sm_uc_ptr_t)(mm_read(blk)); } # ifdef GTM_CRYPT /* If database is encrypted, check if encryption initialization went fine for this database. If not, * do not let process proceed as it could now potentially get a peek at the desired data from the * decrypted shared memory global buffers (read in from disk by other processes) without having to go to disk. * If DSE, allow for a special case where it is trying to dump a local bitmap block. In this case, DSE * can continue to run fine (even if encryption initialization failed) since bitmap blocks are unencrypted. */ if (csa->encrypt_init_status && (!dse_running || !IS_BITMAP_BLK(blk))) GC_RTS_ERROR(csa->encrypt_init_status, gv_cur_region->dyn.addr->fname); # endif assert(dba_bg == csd->acc_meth); assert(!first_tp_srch_status || !first_tp_srch_status->cr || first_tp_srch_status->cycle != first_tp_srch_status->cr->cycle); if (FALSE == (clustered = csd->clustered)) bt = NULL; was_crit = csa->now_crit; ocnt = 0; cnl = csa->nl; set_wc_blocked = FALSE; /* to indicate whether cnl->wc_blocked was set to TRUE by us */ hold_onto_crit = csa->hold_onto_crit; /* note down in local to avoid csa-> dereference in multiple usages below */ do { if (NULL == (cr = db_csh_get(blk))) { /* not in memory */ if (clustered && (NULL != (bt = bt_get(blk))) && (FALSE == bt->flushing)) bt = NULL; if (!csa->now_crit) { assert(!hold_onto_crit); if (NULL != bt) { /* at this point, bt is not NULL only if clustered and flushing - wait no crit */ assert(clustered); wait_for_block_flush(bt, blk); /* try for no other node currently writing the block */ } if ((csd->flush_trigger <= cnl->wcs_active_lvl) && (FALSE == gv_cur_region->read_only)) JNL_ENSURE_OPEN_WCS_WTSTART(csa, gv_cur_region, 0, dummy_errno); /* a macro that dclast's "wcs_wtstart" and checks for errors etc. */ grab_crit(gv_cur_region); cr = db_csh_get(blk); /* in case blk arrived before crit */ } if (clustered && (NULL != (bt = bt_get(blk))) && (TRUE == bt->flushing)) { /* Once crit, need to assure that if clustered, that flushing is [still] complete * If it isn't, we missed an entire WM cycle and have to wait for another node to finish */ wait_for_block_flush(bt, blk); /* ensure no other node currently writing the block */ } if (NULL == cr) { /* really not in memory - must get a new buffer */ assert(csa->now_crit); cr = db_csh_getn(blk); if (CR_NOTVALID == (sm_long_t)cr) { assert(cnl->wc_blocked); /* only reason we currently know wcs_get_space could fail */ assert(gtm_white_box_test_case_enabled); SET_TRACEABLE_VAR(cnl->wc_blocked, TRUE); BG_TRACE_PRO_ANY(csa, wc_blocked_t_qread_db_csh_getn_invalid_blk); set_wc_blocked = TRUE; break; } assert(0 <= cr->read_in_progress); *cycle = cr->cycle; cr->tn = csd->trans_hist.curr_tn; /* Record history of most recent disk reads only in dbg builds for now. Although the macro * is just a couple dozen instructions, it is done while holding crit so we want to avoid * delaying crit unless really necessary. 
Whoever wants this information can enable it * by a build change to remove the DEBUG_ONLY part below. */ DEBUG_ONLY(DSKREAD_TRACE(csa, GDS_ANY_ABS2REL(csa,cr), cr->tn, process_id, blk, cr->cycle);) if (!was_crit && !hold_onto_crit) rel_crit(gv_cur_region); /* read outside of crit may be of a stale block but should be detected by t_end or tp_tend */ assert(0 == cr->dirty); assert(cr->read_in_progress >= 0); CR_BUFFER_CHECK(gv_cur_region, csa, csd, cr); if (SS_NORMAL != (status = dsk_read(blk, GDS_REL2ABS(cr->buffaddr), &ondsk_blkver, lcl_blk_free))) { /* buffer does not contain valid data, so reset blk to be empty */ cr->cycle++; /* increment cycle for blk number changes (for tp_hist and others) */ cr->blk = CR_BLKEMPTY; cr->r_epid = 0; RELEASE_BUFF_READ_LOCK(cr); assert(-1 <= cr->read_in_progress); assert(was_crit == csa->now_crit); if (FUTURE_READ == status) { /* in cluster, block can be in the "future" with respect to the local history */ assert(TRUE == clustered); assert(FALSE == csa->now_crit); rdfail_detail = cdb_sc_future_read; /* t_retry forces the history up to date */ return (sm_uc_ptr_t)NULL; } if (ERR_DYNUPGRDFAIL == status) { /* if we don't hold crit on the region, it is possible due to concurrency conflicts * that this block is unused (i.e. marked free/recycled in bitmap, see comments in * gds_blk_upgrade.h). In this case we should not error out but instead restart. */ if (was_crit) { assert(FALSE); rts_error(VARLSTCNT(5) status, 3, blk, DB_LEN_STR(gv_cur_region)); } else { rdfail_detail = cdb_sc_lostcr; return (sm_uc_ptr_t)NULL; } } if (-1 == status) { /* could have been concurrent truncate, and we read a blk >= csa->ti->total_blks */ /* restart */ rdfail_detail = cdb_sc_truncate; return (sm_uc_ptr_t)NULL; } else rts_error(VARLSTCNT(5) ERR_DBFILERR, 2, DB_LEN_STR(gv_cur_region), status); } disk_blk_read = TRUE; assert(0 <= cr->read_in_progress); assert(0 == cr->dirty); /* Only set in cache if read was a success */ cr->ondsk_blkver = (lcl_blk_free ? GDSVCURR : ondsk_blkver); cr->r_epid = 0; RELEASE_BUFF_READ_LOCK(cr); assert(-1 <= cr->read_in_progress); *cr_out = cr; assert(was_crit == csa->now_crit); if (reset_first_tp_srch_status) { /* keep the parentheses for the if (although single line) since the following is a macro */ RESET_FIRST_TP_SRCH_STATUS(first_tp_srch_status, cr, *cycle); } return (sm_uc_ptr_t)GDS_REL2ABS(cr->buffaddr); } else if (!was_crit && (BAD_LUCK_ABOUNDS > ocnt)) { assert(!hold_onto_crit); assert(TRUE == csa->now_crit); assert(cnl->in_crit == process_id); rel_crit(gv_cur_region); } }
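/* Illustrative sketch only (not GT.M source): the overall lookup/lock/re-lookup shape that the
 * t_qread() excerpt above follows -- probe the cache without crit, and only on a miss grab crit,
 * probe again (the block may have arrived in the meantime), and fill it in.  The sketch collapses
 * the real protocol for brevity: t_qread() actually releases crit before the disk read and guards
 * the buffer with a separate read-in-progress lock.  demo_cache_get(), demo_cache_fill() and the
 * pthread mutex standing in for crit are invented for this example.
 */
#include <pthread.h>
#include <stddef.h>

extern void *demo_cache_get(int blk);	/* hypothetical: returns NULL on a cache miss */
extern void *demo_cache_fill(int blk);	/* hypothetical: reads the block in from disk */

static pthread_mutex_t demo_crit = PTHREAD_MUTEX_INITIALIZER;

void *demo_qread(int blk)
{
	void	*buff;

	if (NULL != (buff = demo_cache_get(blk)))	/* fast path: no crit needed */
		return buff;
	pthread_mutex_lock(&demo_crit);			/* stands in for grab_crit() */
	if (NULL == (buff = demo_cache_get(blk)))	/* re-check under crit: block may have arrived */
		buff = demo_cache_fill(blk);		/* genuine miss: read the block in */
	pthread_mutex_unlock(&demo_crit);		/* stands in for rel_crit() */
	return buff;
}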
bt_rec_ptr_t bt_put(gd_region *reg, int4 block)
{
	bt_rec_ptr_t		bt, q0, q1, hdr;
	sgmnt_addrs		*csa;
	sgmnt_data_ptr_t	csd;
	cache_rec_ptr_t		cr;
	th_rec_ptr_t		th;
	trans_num		lcl_tn;
	uint4			lcnt;

	csa = (sgmnt_addrs *)&FILE_INFO(reg)->s_addrs;
	csd = csa->hdr;
	assert(csa->now_crit || csd->clustered);
	assert(dba_mm != csa->hdr->acc_meth);
	lcl_tn = csa->ti->curr_tn;
	hdr = csa->bt_header + (block % csd->bt_buckets);
	assert(BT_QUEHEAD == hdr->blk);
	for (lcnt = 0, bt = (bt_rec_ptr_t)((sm_uc_ptr_t)hdr + hdr->blkque.fl); ;
		bt = (bt_rec_ptr_t)((sm_uc_ptr_t)bt + bt->blkque.fl), lcnt++)
	{
		if (BT_QUEHEAD == bt->blk)
		{	/* there is no matching bt */
			assert(bt == hdr);
			bt = (bt_rec_ptr_t)((sm_uc_ptr_t)(csa->th_base) + csa->th_base->tnque.fl - SIZEOF(th->tnque));
			if (CR_NOTVALID != bt->cache_index)
			{	/* the oldest bt is still valid */
				assert(!in_wcs_recover);
				cr = (cache_rec_ptr_t)GDS_ANY_REL2ABS(csa, bt->cache_index);
				if (cr->dirty)
				{	/* get it written so it can be reused */
					BG_TRACE_PRO_ANY(csa, bt_put_flush_dirty);
					if (FALSE == wcs_get_space(reg, 0, cr))
					{
						assert(csa->nl->wc_blocked);	/* only reason we currently know
										 * why wcs_get_space could fail */
						assert(gtm_white_box_test_case_enabled);
						BG_TRACE_PRO_ANY(csa, wcb_bt_put);
						send_msg(VARLSTCNT(8) ERR_WCBLOCKED, 6, LEN_AND_LIT("wcb_bt_put"),
							process_id, &lcl_tn, DB_LEN_STR(reg));
						return NULL;
					}
				}
				bt->cache_index = CR_NOTVALID;
				cr->bt_index = 0;
			}
			q0 = (bt_rec_ptr_t)((sm_uc_ptr_t)bt + bt->blkque.fl);
			q1 = (bt_rec_ptr_t)remqt((que_ent_ptr_t)q0);
			if (EMPTY_QUEUE == (sm_long_t)q1)
				rts_error(VARLSTCNT(3) ERR_BTFAIL, 1, 1);
			bt->blk = block;
			bt->killtn = lcl_tn;
			insqt((que_ent_ptr_t)bt, (que_ent_ptr_t)hdr);
			th = (th_rec_ptr_t)remqh((que_ent_ptr_t)csa->th_base);
			if (EMPTY_QUEUE == (sm_long_t)th)
				GTMASSERT;
			break;
		}
		if (bt->blk == block)
		{	/* bt_put should never be called twice for the same block with the same lcl_tn. This is because
			 * t_end/tp_tend update every block only once as part of each update transaction. Assert this.
			 * The two exceptions are
			 *	a) Forward journal recovery which simulates a 2-phase M-kill where the same block
			 *	   could get updated in both phases (example bitmap block gets updated for blocks created
			 *	   within the TP transaction as well as for blocks that are freed up in the 2nd phase of
			 *	   the M-kill) with the same transaction number. This is because although GT.M would have
			 *	   updated the same block with different transaction numbers in the two phases, forward
			 *	   recovery will update it with the same tn and instead increment the db tn on seeing the
			 *	   following INCTN journal record(s).
			 *	b) Cache recovery (wcs_recover). It could call bt_put more than once for the same block
			 *	   and potentially with the same tn. This is because the state of the queues is questionable
			 *	   and there could be more than one cache record for a given block number.
			 */
			assert(in_wcs_recover || (bt->tn < lcl_tn) || (jgbl.forw_phase_recovery && !JNL_ENABLED(csa)));
			q0 = (bt_rec_ptr_t)((sm_uc_ptr_t)bt + bt->tnque.fl);
			th = (th_rec_ptr_t)remqt((que_ent_ptr_t)((sm_uc_ptr_t)q0 + SIZEOF(th->tnque)));
			if (EMPTY_QUEUE == (sm_long_t)th)
				GTMASSERT;
			break;
		}
		if (0 == bt->blkque.fl)
			rts_error(VARLSTCNT(3) ERR_BTFAIL, 1, 2);
		if (lcnt >= csd->n_bts)
			rts_error(VARLSTCNT(3) ERR_BTFAIL, 1, 3);
	}
	insqt((que_ent_ptr_t)th, (que_ent_ptr_t)csa->th_base);
	bt->tn = lcl_tn;
	return bt;
}
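/* Illustrative sketch only (not GT.M source): the recycle-the-oldest idea behind the BT_QUEHEAD
 * branch of bt_put() above.  When no entry exists for a block, the entry with the smallest
 * transaction number is reclaimed (after flushing any dirty buffer that still maps to it) and
 * stamped with the current tn.  The sketch replaces GT.M's tn-ordered queue with a linear scan
 * for clarity; demo_bt_t, demo_flush() and demo_bt_put() are invented for this example.
 */
#include <stddef.h>

typedef struct
{
	int		blk;	/* block number this entry describes */
	unsigned long	tn;	/* transaction number of the last update */
	int		dirty;	/* nonzero if an unwritten buffer still maps here */
} demo_bt_t;

extern int demo_flush(demo_bt_t *bt);	/* hypothetical: write out the dirty buffer; 0 on failure */

/* Return an entry for "block", reclaiming the oldest entry if none matches. */
demo_bt_t *demo_bt_put(demo_bt_t *tab, int n, int block, unsigned long curr_tn)
{
	demo_bt_t	*bt, *oldest;
	int		i;

	oldest = &tab[0];
	for (i = 0; i < n; i++)
	{
		if (tab[i].blk == block)
		{	/* already present: just bring it to the most-recent end of the tn order */
			tab[i].tn = curr_tn;
			return &tab[i];
		}
		if (tab[i].tn < oldest->tn)
			oldest = &tab[i];
	}
	bt = oldest;			/* no match: recycle the oldest entry */
	if (bt->dirty && !demo_flush(bt))
		return NULL;		/* analogous to bt_put()'s wcs_get_space failure path */
	bt->blk = block;
	bt->dirty = 0;
	bt->tn = curr_tn;
	return bt;
}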
int4 gds_rundown(void) { boolean_t canceled_dbsync_timer, canceled_flush_timer, ok_to_write_pfin; boolean_t have_standalone_access, ipc_deleted, err_caught; boolean_t is_cur_process_ss_initiator, remove_shm, vermismatch, we_are_last_user, we_are_last_writer, is_mm; boolean_t unsafe_last_writer; char time_str[CTIME_BEFORE_NL + 2]; /* for GET_CUR_TIME macro */ gd_region *reg; int save_errno, status, rc; int4 semval, ftok_semval, sopcnt, ftok_sopcnt; short crash_count; sm_long_t munmap_len; sgmnt_addrs *csa; sgmnt_data_ptr_t csd; node_local_ptr_t cnl; struct shmid_ds shm_buf; struct sembuf sop[2], ftok_sop[2]; uint4 jnl_status; unix_db_info *udi; jnl_private_control *jpc; jnl_buffer_ptr_t jbp; shm_snapshot_t *ss_shm_ptr; uint4 ss_pid, onln_rlbk_pid, holder_pid; boolean_t was_crit; boolean_t safe_mode; /* Do not flush or take down shared memory. */ boolean_t bypassed_ftok = FALSE, bypassed_access = FALSE, may_bypass_ftok, inst_is_frozen, ftok_counter_halted, access_counter_halted; int secshrstat; intrpt_state_t prev_intrpt_state; DCL_THREADGBL_ACCESS; SETUP_THREADGBL_ACCESS; jnl_status = 0; reg = gv_cur_region; /* Local copy */ /* early out for cluster regions * to avoid tripping the assert below. * Note: * This early out is consistent with VMS. It has been * noted that all of the gtcm assignments * to gv_cur_region should use the TP_CHANGE_REG * macro. This would also avoid the assert problem * and should be done eventually. */ if (dba_cm == reg->dyn.addr->acc_meth) return EXIT_NRM; udi = FILE_INFO(reg); csa = &udi->s_addrs; csd = csa->hdr; assert(csa == cs_addrs && csd == cs_data); if ((reg->open) && (dba_usr == csd->acc_meth)) { change_reg(); gvusr_rundown(); return EXIT_NRM; } /* If the process has standalone access, it has udi->grabbed_access_sem set to TRUE at this point. Note that down in a local * variable as the udi->grabbed_access_sem is set to TRUE even for non-standalone access below and hence we can't rely on * that later to determine if the process had standalone access or not when it entered this function. We need to guarantee * that none else access database file header when semid/shmid fields are reset. We already have created ftok semaphore in * db_init or, mu_rndwn_file and did not remove it. So just lock it. We do it in blocking mode. */ have_standalone_access = udi->grabbed_access_sem; /* process holds standalone access */ DEFER_INTERRUPTS(INTRPT_IN_GDS_RUNDOWN, prev_intrpt_state); ESTABLISH_NORET(gds_rundown_ch, err_caught); if (err_caught) { REVERT; WITH_CH(gds_rundown_ch, gds_rundown_err_cleanup(have_standalone_access), 0); ENABLE_INTERRUPTS(INTRPT_IN_GDS_RUNDOWN, prev_intrpt_state); DEBUG_ONLY(ok_to_UNWIND_in_exit_handling = FALSE); return EXIT_ERR; } assert(reg->open); /* if we failed to open, dbinit_ch should have taken care of proper clean up */ assert(!reg->opening); /* see comment above */ assert((dba_bg == csd->acc_meth) || (dba_mm == csd->acc_meth)); is_mm = (dba_bg != csd->acc_meth); assert(!csa->hold_onto_crit || (csa->now_crit && jgbl.onlnrlbk)); /* If we are online rollback, we should already be holding crit and should release it only at the end of this module. This * is usually done by noting down csa->now_crit in a local variable (was_crit) and using it whenever we are about to * grab_crit. But, there are instances (like mupip_set_journal.c) where we grab_crit but invoke gds_rundown without any * preceeding rel_crit. Such code relies on the fact that gds_rundown does rel_crit unconditionally (to get locks to a known * state). 
So, augment csa->now_crit with jgbl.onlnrlbk to track if we can rel_crit unconditionally or not in gds_rundown. */ was_crit = (csa->now_crit && jgbl.onlnrlbk); /* Cancel any pending flush timer for this region by this task */ canceled_flush_timer = FALSE; canceled_dbsync_timer = FALSE; CANCEL_DB_TIMERS(reg, csa, canceled_flush_timer, canceled_dbsync_timer); we_are_last_user = FALSE; inst_is_frozen = IS_REPL_INST_FROZEN && REPL_ALLOWED(csa->hdr); if (!csa->persistent_freeze) region_freeze(reg, FALSE, FALSE, FALSE); if (!was_crit) { rel_crit(reg); /* get locks to known state */ mutex_cleanup(reg); } /* The only process that can invoke gds_rundown while holding access control semaphore is RECOVER/ROLLBACK. All the others * (like MUPIP SET -FILE/MUPIP EXTEND) would have invoked db_ipcs_reset() before invoking gds_rundown (from * mupip_exit_handler). The only exception is when these processes encounter a terminate signal and they reach * mupip_exit_handler while holding access control semaphore. Assert accordingly. */ assert(!have_standalone_access || mupip_jnl_recover || process_exiting); /* If we have standalone access, then ensure that a concurrent online rollback cannot be running at the same time as it * needs the access control lock as well. The only exception is when we are online rollback and currently running down. */ cnl = csa->nl; onln_rlbk_pid = cnl->onln_rlbk_pid; assert(!have_standalone_access || mupip_jnl_recover || !onln_rlbk_pid || !is_proc_alive(onln_rlbk_pid, 0)); if (!have_standalone_access) { if (-1 == (ftok_semval = semctl(udi->ftok_semid, DB_COUNTER_SEM, GETVAL))) /* Check # of procs counted on FTOK */ { save_errno = errno; assert(FALSE); rts_error_csa(CSA_ARG(csa) VARLSTCNT(12) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg), ERR_SYSCALL, 5, RTS_ERROR_TEXT("gds_rundown SEMCTL failed to get ftok_semval"), CALLFROM, errno); } may_bypass_ftok = CAN_BYPASS(ftok_semval, csd, inst_is_frozen); /* Do we need a blocking wait? */ /* We need to guarantee that no one else accesses the database file header when semid/shmid fields are reset. * We already have created ftok semaphore in db_init or mu_rndwn_file and did not remove it. So just lock it. */ if (!ftok_sem_lock(reg, may_bypass_ftok)) { if (may_bypass_ftok) { /* We did a non-blocking wait. It's ok to proceed without locking */ bypassed_ftok = TRUE; holder_pid = semctl(udi->ftok_semid, DB_CONTROL_SEM, GETPID); if ((uint4)-1 == holder_pid) rts_error_csa(CSA_ARG(csa) VARLSTCNT(12) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg), ERR_SYSCALL, 5, RTS_ERROR_TEXT("gds_rundown SEMCTL failed to get holder_pid"), CALLFROM, errno); if (!IS_GTM_IMAGE) /* MUMPS processes should not flood syslog with bypass messages. */ { send_msg_csa(CSA_ARG(csa) VARLSTCNT(12) ERR_RESRCINTRLCKBYPAS, 10, LEN_AND_STR(gtmImageNames[image_type].imageName), process_id, LEN_AND_LIT("FTOK"), REG_LEN_STR(reg), DB_LEN_STR(reg), holder_pid); send_msg_csa(CSA_ARG(NULL) VARLSTCNT(4) ERR_TEXT, 2, LEN_AND_LIT("FTOK bypassed at rundown")); } } else { /* We did a blocking wait but something bad happened. 
*/ FTOK_TRACE(csa, csa->ti->curr_tn, ftok_ops_lock, process_id); rts_error_csa(CSA_ARG(csa) VARLSTCNT(4) ERR_DBFILERR, 2, DB_LEN_STR(reg)); } } sop[0].sem_num = DB_CONTROL_SEM; sop[0].sem_op = 0; /* Wait for 0 */ sop[1].sem_num = DB_CONTROL_SEM; sop[1].sem_op = 1; /* Lock */ sopcnt = 2; sop[0].sem_flg = sop[1].sem_flg = SEM_UNDO | IPC_NOWAIT; /* Don't wait the first time thru */ SEMOP(udi->semid, sop, sopcnt, status, NO_WAIT); if (0 != status) { save_errno = errno; /* Check # of processes counted on access sem. */ if (-1 == (semval = semctl(udi->semid, DB_COUNTER_SEM, GETVAL))) { assert(FALSE); rts_error_csa(CSA_ARG(csa) VARLSTCNT(12) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg), ERR_SYSCALL, 5, RTS_ERROR_TEXT("gds_rundown SEMCTL failed to get semval"), CALLFROM, errno); } bypassed_access = CAN_BYPASS(semval, csd, inst_is_frozen) || onln_rlbk_pid || csd->file_corrupt; /* Before attempting again in the blocking mode, see if the holding process is an online rollback. * If so, it is likely we won't get the access control semaphore anytime soon. In that case, we * are better off skipping rundown and continuing with sanity cleanup and exit. */ holder_pid = semctl(udi->semid, DB_CONTROL_SEM, GETPID); if ((uint4)-1 == holder_pid) rts_error_csa(CSA_ARG(csa) VARLSTCNT(12) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg), ERR_SYSCALL, 5, RTS_ERROR_TEXT("gds_rundown SEMCTL failed to get holder_pid"), CALLFROM, errno); if (!bypassed_access) { /* We couldn't get it in one shot -- see if we already have it */ if (holder_pid == process_id) { send_msg_csa(CSA_ARG(csa) VARLSTCNT(5) MAKE_MSG_INFO(ERR_CRITSEMFAIL), 2, DB_LEN_STR(reg), ERR_RNDWNSEMFAIL); REVERT; ENABLE_INTERRUPTS(INTRPT_IN_GDS_RUNDOWN, prev_intrpt_state); assert(FALSE); return EXIT_ERR; } if (EAGAIN != save_errno) { assert(FALSE); rts_error_csa(CSA_ARG(csa) VARLSTCNT(12) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg), ERR_SYSCALL, 5, RTS_ERROR_TEXT("gds_rundown SEMOP on access control semaphore"), CALLFROM, save_errno); } sop[0].sem_flg = sop[1].sem_flg = SEM_UNDO; /* Try again - blocking this time */ SEMOP(udi->semid, sop, 2, status, FORCED_WAIT); if (-1 == status) /* We couldn't get it at all.. */ rts_error_csa(CSA_ARG(csa) VARLSTCNT(12) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg), ERR_SYSCALL, 5, RTS_ERROR_TEXT("gds_rundown SEMOP on access control semaphore"), CALLFROM, errno); } else if (!IS_GTM_IMAGE) { send_msg_csa(CSA_ARG(csa) VARLSTCNT(12) ERR_RESRCINTRLCKBYPAS, 10, LEN_AND_STR(gtmImageNames[image_type].imageName), process_id, LEN_AND_LIT("access control"), REG_LEN_STR(reg), DB_LEN_STR(reg), holder_pid); send_msg_csa(CSA_ARG(NULL) VARLSTCNT(4) ERR_TEXT, 2, LEN_AND_LIT("Access control bypassed at rundown")); } udi->grabbed_access_sem = !bypassed_access; } } /* else we hold the access control semaphore and therefore have standalone access. We do not release it now - we * release it later in mupip_exit_handler.c. Since we already hold the access control semaphore, we don't need the * ftok semaphore and trying it could cause deadlock */ /* Note that in the case of online rollback, "udi->grabbed_access_sem" (and in turn "have_standalone_access") is TRUE. * But there could be other processes still having the database open so we cannot safely reset the halted fields. */ if (have_standalone_access && !jgbl.onlnrlbk) csd->ftok_counter_halted = csd->access_counter_halted = FALSE; ftok_counter_halted = csd->ftok_counter_halted; access_counter_halted = csd->access_counter_halted; /* If we bypassed any of the semaphores, activate safe mode. 
* Also, if the replication instance is frozen and this db has replication turned on (which means * no flushes of dirty buffers to this db can happen while the instance is frozen) activate safe mode. */ ok_to_write_pfin = !(bypassed_access || bypassed_ftok || inst_is_frozen); safe_mode = !ok_to_write_pfin || ftok_counter_halted || access_counter_halted; /* At this point we are guaranteed no one else is doing a db_init/rundown as we hold the access control semaphore */ assert(csa->ref_cnt); /* decrement private ref_cnt before shared ref_cnt decrement. */ csa->ref_cnt--; /* Currently journaling logic in gds_rundown() in VMS relies on this order to detect last writer */ assert(!csa->ref_cnt); --cnl->ref_cnt; if (memcmp(cnl->now_running, gtm_release_name, gtm_release_name_len + 1)) { /* VERMISMATCH condition. Possible only if DSE */ assert(dse_running); vermismatch = TRUE; } else vermismatch = FALSE; if (-1 == shmctl(udi->shmid, IPC_STAT, &shm_buf)) { save_errno = errno; rts_error_csa(CSA_ARG(csa) VARLSTCNT(12) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg), ERR_SYSCALL, 5, RTS_ERROR_TEXT("gds_rundown shmctl"), CALLFROM, save_errno); } else we_are_last_user = (1 == shm_buf.shm_nattch) && !vermismatch && !safe_mode; /* recover => one user except ONLINE ROLLBACK, or standalone with frozen instance */ assert(!have_standalone_access || we_are_last_user || jgbl.onlnrlbk || inst_is_frozen); if (-1 == (semval = semctl(udi->semid, DB_COUNTER_SEM, GETVAL))) rts_error_csa(CSA_ARG(csa) VARLSTCNT(12) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg), ERR_SYSCALL, 5, RTS_ERROR_TEXT("gds_rundown SEMCTL failed to get semval"), CALLFROM, errno); /* There's one writer left and I am it */ assert(reg->read_only || semval >= 0); unsafe_last_writer = (DB_COUNTER_SEM_INCR == semval) && (FALSE == reg->read_only) && !vermismatch; we_are_last_writer = unsafe_last_writer && !safe_mode; assert(!we_are_last_writer || !safe_mode); assert(!we_are_last_user || !safe_mode); /* recover + R/W region => one writer except ONLINE ROLLBACK, or standalone with frozen instance, leading to safe_mode */ assert(!(have_standalone_access && !reg->read_only) || we_are_last_writer || jgbl.onlnrlbk || inst_is_frozen); GTM_WHITE_BOX_TEST(WBTEST_ANTIFREEZE_JNLCLOSE, we_are_last_writer, 1); /* Assume we are the last writer to invoke wcs_flu */ if (!have_standalone_access && (-1 == (ftok_semval = semctl(udi->ftok_semid, DB_COUNTER_SEM, GETVAL)))) rts_error_csa(CSA_ARG(csa) VARLSTCNT(12) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg), ERR_SYSCALL, 5, RTS_ERROR_TEXT("gds_rundown SEMCTL failed to get ftok_semval"), CALLFROM, errno); if (NULL != csa->ss_ctx) ss_destroy_context(csa->ss_ctx); /* SS_MULTI: If multiple snapshots are supported, then we have to run through each of the snapshots */ assert(1 == MAX_SNAPSHOTS); ss_shm_ptr = (shm_snapshot_ptr_t)SS_GETSTARTPTR(csa); ss_pid = ss_shm_ptr->ss_info.ss_pid; is_cur_process_ss_initiator = (process_id == ss_pid); if (ss_pid && (is_cur_process_ss_initiator || we_are_last_user)) { /* Try getting snapshot crit latch. If we don't get latch, we won't hang for eternity and will skip * doing the orphaned snapshot cleanup. It will be cleaned up eventually either by subsequent MUPIP * INTEG or by a MUPIP RUNDOWN. 
*/ if (ss_get_lock_nowait(reg) && (ss_pid == ss_shm_ptr->ss_info.ss_pid) && (is_cur_process_ss_initiator || !is_proc_alive(ss_pid, 0))) { ss_release(NULL); ss_release_lock(reg); } } /* If cnl->donotflush_dbjnl is set, it means mupip recover/rollback was interrupted and therefore we need not flush * shared memory contents to disk as they might be in an inconsistent state. Moreover, any more flushing will only cause * future rollback to undo more journal records (PBLKs). In this case, we will go ahead and remove shared memory (without * flushing the contents) in this routine. A reissue of the recover/rollback command will restore the database to a * consistent state. */ if (!cnl->donotflush_dbjnl && !reg->read_only && !vermismatch) { /* If we had an orphaned block and were interrupted, set wc_blocked so we can invoke wcs_recover. Do it ONLY * if there is NO concurrent online rollback running (as we need crit to set wc_blocked) */ if (csa->wbuf_dqd && !is_mm) { /* If we had an orphaned block and were interrupted, mupip_exit_handler will invoke secshr_db_clnup which * will clear this field and so we should never come to gds_rundown with a non-zero wbuf_dqd. The only * exception is if we are recover/rollback in which case gds_rundown (from mur_close_files) is invoked * BEFORE secshr_db_clnup in mur_close_files. * Note: It is NOT possible for online rollback to reach here with wbuf_dqd being non-zero. This is because * the moment we apply the first PBLK, we stop all interrupts and hence can never be interrupted in * wcs_wtstart or wcs_get_space. Assert accordingly. */ assert(mupip_jnl_recover && !jgbl.onlnrlbk && !safe_mode); if (!was_crit) grab_crit(reg); SET_TRACEABLE_VAR(cnl->wc_blocked, TRUE); BG_TRACE_PRO_ANY(csa, wcb_gds_rundown); send_msg_csa(CSA_ARG(csa) VARLSTCNT(8) ERR_WCBLOCKED, 6, LEN_AND_LIT("wcb_gds_rundown"), process_id, &csa->ti->curr_tn, DB_LEN_STR(reg)); csa->wbuf_dqd = 0; wcs_recover(reg); BG_TRACE_PRO_ANY(csa, lost_block_recovery); if (!was_crit) rel_crit(reg); } if (JNL_ENABLED(csd) && IS_GTCM_GNP_SERVER_IMAGE) originator_prc_vec = NULL; /* If we are the last writing user, then everything must be flushed */ if (we_are_last_writer) { /* Time to flush out all of our buffers */ assert(!safe_mode); if (is_mm) { MM_DBFILEXT_REMAP_IF_NEEDED(csa, reg); cnl->remove_shm = TRUE; } if (cnl->wc_blocked && jgbl.onlnrlbk) { /* if the last update done by online rollback was not committed in the normal code-path but was * completed by secshr_db_clnup, wc_blocked will be set to TRUE. But, since online rollback never * invokes grab_crit (since csa->hold_onto_crit is set to TRUE), wcs_recover is never invoked. This * could result in the last update never getting flushed to the disk and if online rollback happened * to be the last writer then the shared memory will be flushed and removed and the last update will * be lost. So, force wcs_recover if we find ourselves in such a situation. But, wc_blocked is * possible only if phase1 or phase2 errors are induced using white box test cases */ assert(WB_COMMIT_ERR_ENABLED); wcs_recover(reg); } /* Note WCSFLU_SYNC_EPOCH ensures the epoch is synced to the journal and indirectly * also ensures that the db is fsynced. We don't want to use it in the calls to * wcs_flu() from t_end() and tp_tend() since we can defer it to out-of-crit there. * In this case, since we are running down, we don't have any such option. 
*/ cnl->remove_shm = wcs_flu(WCSFLU_FLUSH_HDR | WCSFLU_WRITE_EPOCH | WCSFLU_SYNC_EPOCH); /* Since we_are_last_writer, we should be guaranteed that wcs_flu() did not change csd, (in * case of MM for potential file extension), even if it did a grab_crit(). Therefore, make * sure that's true. */ assert(csd == csa->hdr); assert(0 == memcmp(csd->label, GDS_LABEL, GDS_LABEL_SZ - 1)); } else if (((canceled_flush_timer && (0 > cnl->wcs_timers)) || canceled_dbsync_timer) && !inst_is_frozen) { /* canceled pending db or jnl flush timers - flush database and journal buffers to disk */ if (!was_crit) grab_crit(reg); /* we need to sync the epoch as the fact that there is no active pending flush timer implies * there will be noone else who will flush the dirty buffers and EPOCH to disk in a timely fashion */ wcs_flu(WCSFLU_FLUSH_HDR | WCSFLU_WRITE_EPOCH | WCSFLU_SYNC_EPOCH); if (!was_crit) rel_crit(reg); assert((dba_mm == cs_data->acc_meth) || (csd == cs_data)); csd = cs_data; /* In case this is MM and wcs_flu() remapped an extended database, reset csd */ } /* Do rundown journal processing after buffer flushes since they require jnl to be open */ if (JNL_ENABLED(csd)) { /* the following tp_change_reg() is not needed due to the assert csa == cs_addrs at the beginning * of gds_rundown(), but just to be safe. To be removed by 2002!! --- nars -- 2001/04/25. */ tp_change_reg(); /* call this because jnl_ensure_open checks cs_addrs rather than gv_cur_region */ jpc = csa->jnl; jbp = jpc->jnl_buff; if (jbp->fsync_in_prog_latch.u.parts.latch_pid == process_id) { assert(FALSE); COMPSWAP_UNLOCK(&jbp->fsync_in_prog_latch, process_id, 0, LOCK_AVAILABLE, 0); } if (jbp->io_in_prog_latch.u.parts.latch_pid == process_id) { assert(FALSE); COMPSWAP_UNLOCK(&jbp->io_in_prog_latch, process_id, 0, LOCK_AVAILABLE, 0); } if ((((NOJNL != jpc->channel) && !JNL_FILE_SWITCHED(jpc)) || we_are_last_writer && (0 != cnl->jnl_file.u.inode)) && ok_to_write_pfin) { /* We need to close the journal file cleanly if we have the latest generation journal file open * or if we are the last writer and the journal file is open in shared memory (not necessarily * by ourselves e.g. the only process that opened the journal got shot abnormally) * Note: we should not infer anything from the shared memory value of cnl->jnl_file.u.inode * if we are not the last writer as it can be concurrently updated. */ if (!was_crit) grab_crit(reg); if (JNL_ENABLED(csd)) { SET_GBL_JREC_TIME; /* jnl_ensure_open/jnl_put_jrt_pini/pfin/jnl_file_close all need it */ /* Before writing to jnlfile, adjust jgbl.gbl_jrec_time if needed to maintain time order * of jnl records. This needs to be done BEFORE the jnl_ensure_open as that could write * journal records (if it decides to switch to a new journal file). */ ADJUST_GBL_JREC_TIME(jgbl, jbp); jnl_status = jnl_ensure_open(); if (0 == jnl_status) { /* If we_are_last_writer, we would have already done a wcs_flu() which would * have written an epoch record and we are guaranteed no further updates * since we are the last writer. So, just close the journal. * If the freeaddr == post_epoch_freeaddr, wcs_flu may have skipped writing * a pini, so allow for that. */ assert(!jbp->before_images || is_mm || !we_are_last_writer || (0 != jpc->pini_addr) || jgbl.mur_extract || (jpc->jnl_buff->freeaddr == jpc->jnl_buff->post_epoch_freeaddr)); /* If we haven't written a pini, let jnl_file_close write the pini/pfin. 
*/ if (!jgbl.mur_extract && (0 != jpc->pini_addr)) jnl_put_jrt_pfin(csa); /* If not the last writer and no pending flush timer left, do jnl flush now */ if (!we_are_last_writer && (0 > cnl->wcs_timers)) { if (SS_NORMAL == (jnl_status = jnl_flush(reg))) { assert(jbp->freeaddr == jbp->dskaddr); jnl_fsync(reg, jbp->dskaddr); assert(jbp->fsync_dskaddr == jbp->dskaddr); } else { send_msg_csa(CSA_ARG(csa) VARLSTCNT(9) ERR_JNLFLUSH, 2, JNL_LEN_STR(csd), ERR_TEXT, 2, RTS_ERROR_TEXT("Error with journal flush in gds_rundown"), jnl_status); assert(NOJNL == jpc->channel);/* jnl file lost has been triggered */ /* In this routine, all code that follows from here on does not * assume anything about the journaling characteristics of this * database so it is safe to continue execution even though * journaling got closed in the middle. */ } } jnl_file_close(reg, we_are_last_writer, FALSE); } else send_msg_csa(CSA_ARG(csa) VARLSTCNT(6) jnl_status, 4, JNL_LEN_STR(csd), DB_LEN_STR(reg)); } if (!was_crit) rel_crit(reg); } } if (we_are_last_writer) /* Flush the fileheader last and harden the file to disk */ { if (!was_crit) grab_crit(reg); /* To satisfy crit requirement in fileheader_sync() */ memset(csd->machine_name, 0, MAX_MCNAMELEN); /* clear the machine_name field */ if (!have_standalone_access && we_are_last_user) { /* mupip_exit_handler will do this after mur_close_file */ csd->semid = INVALID_SEMID; csd->shmid = INVALID_SHMID; csd->gt_sem_ctime.ctime = 0; csd->gt_shm_ctime.ctime = 0; } fileheader_sync(reg); if (!was_crit) rel_crit(reg); if (!is_mm) { GTM_DB_FSYNC(csa, udi->fd, rc); /* Sync it all */ if (-1 == rc) { rts_error_csa(CSA_ARG(csa) VARLSTCNT(9) ERR_DBFILERR, 2, DB_LEN_STR(reg), ERR_TEXT, 2, RTS_ERROR_TEXT("Error during file sync at close"), errno); } } else { /* Now do final MM file sync before exit */ assert(csa->ti->total_blks == csa->total_blks); #ifdef _AIX GTM_DB_FSYNC(csa, udi->fd, rc); if (-1 == rc) #else if (-1 == MSYNC((caddr_t)csa->db_addrs[0], (caddr_t)csa->db_addrs[1])) #endif { rts_error_csa(CSA_ARG(csa) VARLSTCNT(9) ERR_DBFILERR, 2, DB_LEN_STR(reg), ERR_TEXT, 2, RTS_ERROR_TEXT("Error during file sync at close"), errno); } } } else if (unsafe_last_writer && !cnl->lastwriterbypas_msg_issued) { send_msg_csa(CSA_ARG(csa) VARLSTCNT(4) ERR_LASTWRITERBYPAS, 2, DB_LEN_STR(reg)); cnl->lastwriterbypas_msg_issued = TRUE; } } /* end if (!reg->read_only && !cnl->donotflush_dbjnl) */ /* We had canceled all db timers at start of rundown. In case as part of rundown (wcs_flu above), we had started * any timers, cancel them BEFORE setting reg->open to FALSE (assert in wcs_clean_dbsync relies on this). */ CANCEL_DB_TIMERS(reg, csa, canceled_flush_timer, canceled_dbsync_timer); if (reg->read_only && we_are_last_user && !have_standalone_access && cnl->remove_shm) { /* mupip_exit_handler will do this after mur_close_file */ db_ipcs.semid = INVALID_SEMID; db_ipcs.shmid = INVALID_SHMID; db_ipcs.gt_sem_ctime = 0; db_ipcs.gt_shm_ctime = 0; db_ipcs.fn_len = reg->dyn.addr->fname_len; memcpy(db_ipcs.fn, reg->dyn.addr->fname, reg->dyn.addr->fname_len); db_ipcs.fn[reg->dyn.addr->fname_len] = 0; /* request gtmsecshr to flush. 
read_only cannot flush itself */ WAIT_FOR_REPL_INST_UNFREEZE_SAFE(csa); if (!csa->read_only_fs) { secshrstat = send_mesg2gtmsecshr(FLUSH_DB_IPCS_INFO, 0, (char *)NULL, 0); if (0 != secshrstat) rts_error_csa(CSA_ARG(csa) VARLSTCNT(8) ERR_DBFILERR, 2, DB_LEN_STR(reg), ERR_TEXT, 2, RTS_ERROR_TEXT("gtmsecshr failed to update database file header")); } } /* Done with file now, close it */ CLOSEFILE_RESET(udi->fd, rc); /* resets "udi->fd" to FD_INVALID */ if (-1 == rc) { rts_error_csa(CSA_ARG(csa) VARLSTCNT(9) ERR_DBFILERR, 2, DB_LEN_STR(reg), ERR_TEXT, 2, LEN_AND_LIT("Error during file close"), errno); } /* Unmap storage if mm mode but only the part that is not the fileheader (so shows up in dumps) */ # if !defined(_AIX) if (is_mm && (NULL != csa->db_addrs[0])) { assert(csa->db_addrs[1] > csa->db_addrs[0]); munmap_len = (sm_long_t)(csa->db_addrs[1] - csa->db_addrs[0]); if (0 < munmap_len) munmap((caddr_t)(csa->db_addrs[0]), (size_t)(munmap_len)); } # endif /* Detach our shared memory while still under lock so reference counts will be correct for the next process to run down * this region. In the process also get the remove_shm status from node_local before detaching. * If cnl->donotflush_dbjnl is TRUE, it means we can safely remove shared memory without compromising data * integrity as a reissue of recover will restore the database to a consistent state. */ remove_shm = !vermismatch && (cnl->remove_shm || cnl->donotflush_dbjnl); /* We are done with online rollback on this region. Indicate to other processes by setting the onln_rlbk_pid to 0. * Do it before releasing crit (t_end relies on this ordering when accessing cnl->onln_rlbk_pid). */ if (jgbl.onlnrlbk) cnl->onln_rlbk_pid = 0; rel_crit(reg); /* Since we are about to detach from the shared memory, release crit and reset onln_rlbk_pid */ /* If we had skipped flushing journal and database buffers due to a concurrent online rollback, increment the counter * indicating that in the shared memory so that online rollback can report the # of such processes when it shuts down. * The same thing is done for both FTOK and access control semaphores when there are too many MUMPS processes. */ if (safe_mode) /* indicates flushing was skipped */ { if (bypassed_access) cnl->dbrndwn_access_skip++; /* Access semaphore can be bypassed during online rollback */ if (bypassed_ftok) cnl->dbrndwn_ftok_skip++; } if (jgbl.onlnrlbk) csa->hold_onto_crit = FALSE; GTM_WHITE_BOX_TEST(WBTEST_HOLD_SEM_BYPASS, cnl->wbox_test_seq_num, 0); status = shmdt((caddr_t)cnl); csa->nl = NULL; /* dereferencing nl after detach is not right, so we set it to NULL so that we can test before dereference*/ /* Note that although csa->nl is NULL, we use CSA_ARG(csa) below (not CSA_ARG(NULL)) to be consistent with similar * usages before csa->nl became NULL. The "is_anticipatory_freeze_needed" function (which is in turn called by the * CHECK_IF_FREEZE_ON_ERROR_NEEDED macro) does a check of csa->nl before dereferencing shared memory contents so * we are safe passing "csa". 
*/ if (-1 == status) send_msg_csa(CSA_ARG(csa) VARLSTCNT(9) ERR_DBFILERR, 2, DB_LEN_STR(reg), ERR_TEXT, 2, LEN_AND_LIT("Error during shmdt"), errno); REMOVE_CSA_FROM_CSADDRSLIST(csa); /* remove "csa" from list of open regions (cs_addrs_list) */ reg->open = FALSE; /* If file is still not in good shape, die here and now before we get rid of our storage */ assertpro(0 == csa->wbuf_dqd); ipc_deleted = FALSE; /* If we are the very last user, remove shared storage id and the semaphores */ if (we_are_last_user) { /* remove shared storage, only if last writer to rundown did a successful wcs_flu() */ assert(!vermismatch); if (remove_shm) { ipc_deleted = TRUE; if (0 != shm_rmid(udi->shmid)) rts_error_csa(CSA_ARG(csa) VARLSTCNT(8) ERR_DBFILERR, 2, DB_LEN_STR(reg), ERR_TEXT, 2, RTS_ERROR_TEXT("Unable to remove shared memory")); /* Note that we no longer have a new shared memory. Currently only used/usable for standalone rollback. */ udi->new_shm = FALSE; /* mupip recover/rollback don't release the semaphore here, but do it later in db_ipcs_reset (invoked from * mur_close_files()) */ if (!have_standalone_access) { if (0 != sem_rmid(udi->semid)) rts_error_csa(CSA_ARG(csa) VARLSTCNT(8) ERR_DBFILERR, 2, DB_LEN_STR(reg), ERR_TEXT, 2, RTS_ERROR_TEXT("Unable to remove semaphore")); udi->new_sem = FALSE; /* Note that we no longer have a new semaphore */ udi->grabbed_access_sem = FALSE; udi->counter_acc_incremented = FALSE; } } else if (is_src_server || is_updproc) { gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(6) ERR_DBRNDWNWRN, 4, DB_LEN_STR(reg), process_id, process_id); send_msg_csa(CSA_ARG(csa) VARLSTCNT(6) ERR_DBRNDWNWRN, 4, DB_LEN_STR(reg), process_id, process_id); } else send_msg_csa(CSA_ARG(csa) VARLSTCNT(6) ERR_DBRNDWNWRN, 4, DB_LEN_STR(reg), process_id, process_id); } else { assert(!have_standalone_access || jgbl.onlnrlbk || safe_mode); if (!jgbl.onlnrlbk && !have_standalone_access) { /* If we were writing, get rid of our writer access count semaphore */ if (!reg->read_only) { if (!access_counter_halted) { save_errno = do_semop(udi->semid, DB_COUNTER_SEM, -DB_COUNTER_SEM_INCR, SEM_UNDO); if (0 != save_errno) rts_error_csa(CSA_ARG(csa) VARLSTCNT(12) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg), ERR_SYSCALL, 5, RTS_ERROR_TEXT("gds_rundown access control semaphore decrement"), CALLFROM, save_errno); } udi->counter_acc_incremented = FALSE; } assert(safe_mode || !bypassed_access); /* Now remove the rundown lock */ if (!bypassed_access) { if (0 != (save_errno = do_semop(udi->semid, DB_CONTROL_SEM, -1, SEM_UNDO))) rts_error_csa(CSA_ARG(csa) VARLSTCNT(12) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg), ERR_SYSCALL, 5, RTS_ERROR_TEXT("gds_rundown access control semaphore release"), CALLFROM, save_errno); udi->grabbed_access_sem = FALSE; } } /* else access control semaphore will be released in db_ipcs_reset */ } if (!have_standalone_access) { if (bypassed_ftok) { if (!ftok_counter_halted) if (0 != (save_errno = do_semop(udi->ftok_semid, DB_COUNTER_SEM, -DB_COUNTER_SEM_INCR, SEM_UNDO))) rts_error_csa(CSA_ARG(csa) VARLSTCNT(4) ERR_DBFILERR, 2, DB_LEN_STR(reg)); } else if (!ftok_sem_release(reg, !ftok_counter_halted, FALSE)) { FTOK_TRACE(csa, csa->ti->curr_tn, ftok_ops_release, process_id); rts_error_csa(CSA_ARG(csa) VARLSTCNT(4) ERR_DBFILERR, 2, DB_LEN_STR(reg)); } udi->grabbed_ftok_sem = FALSE; udi->counter_ftok_incremented = FALSE; } ENABLE_INTERRUPTS(INTRPT_IN_GDS_RUNDOWN, prev_intrpt_state); if (!ipc_deleted) { GET_CUR_TIME(time_str); if (is_src_server) gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(8) ERR_IPCNOTDEL, 6, CTIME_BEFORE_NL, 
time_str, LEN_AND_LIT("Source server"), REG_LEN_STR(reg)); if (is_updproc) gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(8) ERR_IPCNOTDEL, 6, CTIME_BEFORE_NL, time_str, LEN_AND_LIT("Update process"), REG_LEN_STR(reg)); if (mupip_jnl_recover && (!jgbl.onlnrlbk || !we_are_last_user)) { gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(8) ERR_IPCNOTDEL, 6, CTIME_BEFORE_NL, time_str, LEN_AND_LIT("Mupip journal process"), REG_LEN_STR(reg)); send_msg_csa(CSA_ARG(csa) VARLSTCNT(8) ERR_IPCNOTDEL, 6, CTIME_BEFORE_NL, time_str, LEN_AND_LIT("Mupip journal process"), REG_LEN_STR(reg)); } } REVERT; return EXIT_NRM; }
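/* Illustrative sketch only (not GT.M source): how the "last user" / "last writer" decisions in
 * gds_rundown() above reduce to two IPC queries -- the number of processes still attached to the
 * shared memory segment, and the value of the access-control counter semaphore.  demo_last_user(),
 * demo_last_writer(), DEMO_COUNTER_SEM and DEMO_COUNTER_INCR are invented for this example; GT.M
 * additionally folds in version-mismatch and safe-mode checks as shown above.
 */
#include <sys/types.h>
#include <sys/ipc.h>
#include <sys/sem.h>
#include <sys/shm.h>

#define DEMO_COUNTER_SEM	1	/* counter semaphore index, as used in the code above */
#define DEMO_COUNTER_INCR	1	/* each writer adds one unit when it opens the file */

/* Returns 1 if ours is the only remaining attach to the shared memory, 0 if not, -1 on error
 * (a real caller would report a CRITSEMFAIL-style error).
 */
int demo_last_user(int shmid)
{
	struct shmid_ds	shm_buf;

	if (-1 == shmctl(shmid, IPC_STAT, &shm_buf))
		return -1;
	return (1 == shm_buf.shm_nattch);
}

/* Returns 1 if we are the only writer left on a writable region, 0 if not, -1 on error. */
int demo_last_writer(int semid, int read_only)
{
	int	semval;

	if (-1 == (semval = semctl(semid, DEMO_COUNTER_SEM, GETVAL)))
		return -1;
	return (DEMO_COUNTER_INCR == semval) && !read_only;
}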
void jnl_fsync(gd_region *reg, uint4 fsync_addr)
{
	jnl_private_control	*jpc;
	jnl_buffer_ptr_t	jb;
	uint4			lcnt, saved_dsk_addr, saved_status;
	sgmnt_addrs		*csa;
	sgmnt_data_ptr_t	csd;
	int4			lck_state;
	int			fsync_ret, save_errno;

	error_def(ERR_JNLFSYNCERR);
	error_def(ERR_FSYNCTIMOUT);
	error_def(ERR_TEXT);
	error_def(ERR_JNLFRCDTERM);
	error_def(ERR_JNLFSYNCLSTCK);

	csa = &FILE_INFO(reg)->s_addrs;
	jpc = csa->jnl;
	jb = jpc->jnl_buff;
	if ((NOJNL != jpc->channel) && !JNL_FILE_SWITCHED(jpc))
	{
		csd = csa->hdr;
		for (lcnt = 1; fsync_addr > jb->fsync_dskaddr && !JNL_FILE_SWITCHED(jpc); lcnt++)
		{
			if (MAX_FSYNC_WAIT_CNT / 2 == lcnt)	/* halfway into max. patience */
			{
				saved_status = jpc->status;
				jpc->status = SS_NORMAL;
				jnl_send_oper(jpc, ERR_JNLFSYNCLSTCK);
				jpc->status = saved_status;
			}
			if (MAX_FSYNC_WAIT_CNT == lcnt)		/* tried a long time */
			{
				saved_status = jpc->status;
				jpc->status = SS_NORMAL;
				jnl_send_oper(jpc, ERR_JNLFSYNCLSTCK);
				jpc->status = saved_status;
				send_msg(VARLSTCNT(4) ERR_FSYNCTIMOUT, 2, JNL_LEN_STR(csd));
				GTMASSERT;
			}
			BG_TRACE_PRO_ANY(csa, n_jnl_fsync_tries);
			if (GET_SWAPLOCK(&jb->fsync_in_prog_latch))
				break;
			wcs_sleep(lcnt);
			performCASLatchCheck(&jb->fsync_in_prog_latch, lcnt);
		}
		if (fsync_addr > jb->fsync_dskaddr && !JNL_FILE_SWITCHED(jpc))
		{
			assert(process_id == jb->fsync_in_prog_latch.u.parts.latch_pid);	/* assert we have the lock */
			saved_dsk_addr = jb->dskaddr;
			if (jpc->sync_io)
			{	/* We need to maintain the fsync control fields irrespective of the type of IO, because we might
				 * switch between these at any time.
				 */
				jb->fsync_dskaddr = saved_dsk_addr;
			} else
			{
				GTM_FSYNC(jpc->channel, fsync_ret);
				if (-1 == fsync_ret)
				{
					save_errno = errno;
					assert(FALSE);
					send_msg(VARLSTCNT(9) ERR_JNLFSYNCERR, 2, JNL_LEN_STR(csd),
						ERR_TEXT, 2, RTS_ERROR_TEXT("Error with fsync"), save_errno);
					rts_error(VARLSTCNT(9) ERR_JNLFSYNCERR, 2, JNL_LEN_STR(csd),
						ERR_TEXT, 2, RTS_ERROR_TEXT("Error with fsync"), save_errno);
				} else
				{
					jb->fsync_dskaddr = saved_dsk_addr;
					BG_TRACE_PRO_ANY(csa, n_jnl_fsyncs);
				}
			}
		}
		if (process_id == jb->fsync_in_prog_latch.u.parts.latch_pid)
			RELEASE_SWAPLOCK(&jb->fsync_in_prog_latch);
	}
	return;
}
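/* Illustrative sketch only (not GT.M source): the bounded wait-for-latch loop that jnl_fsync()
 * uses above -- retry with a growing sleep, log once at the halfway point, and give up (fatally,
 * in GT.M's case) when the bound is reached.  demo_get_latch(), DEMO_MAX_WAIT, the pthread mutex
 * and the millisecond sleep are invented for this example; GT.M uses its own GET_SWAPLOCK and
 * wcs_sleep primitives.
 */
#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

#define DEMO_MAX_WAIT	50	/* analogous to MAX_FSYNC_WAIT_CNT */

static pthread_mutex_t demo_latch = PTHREAD_MUTEX_INITIALIZER;

/* Returns 1 once the latch is held (caller later releases it with pthread_mutex_unlock),
 * 0 if the retry bound is exhausted (where jnl_fsync() would GTMASSERT).
 */
int demo_get_latch(void)
{
	unsigned int	lcnt;

	for (lcnt = 1; lcnt <= DEMO_MAX_WAIT; lcnt++)
	{
		if (0 == pthread_mutex_trylock(&demo_latch))
			return 1;					/* latch acquired */
		if (DEMO_MAX_WAIT / 2 == lcnt)
			fprintf(stderr, "still waiting for latch\n");	/* ~ the JNLFSYNCLSTCK operator log */
		usleep(1000 * lcnt);					/* back off, like wcs_sleep(lcnt) */
	}
	return 0;
}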
void gds_rundown(void) { bool is_mm, we_are_last_user, we_are_last_writer; boolean_t ipc_deleted, remove_shm, cancelled_timer, cancelled_dbsync_timer, vermismatch; now_t now; /* for GET_CUR_TIME macro */ char *time_ptr, time_str[CTIME_BEFORE_NL + 2]; /* for GET_CUR_TIME macro */ gd_region *reg; int save_errno, status; int4 semval, ftok_semval, sopcnt, ftok_sopcnt; short crash_count; sm_long_t munmap_len; sgmnt_addrs *csa; sgmnt_data_ptr_t csd; struct shmid_ds shm_buf; struct sembuf sop[2], ftok_sop[2]; uint4 jnl_status; unix_db_info *udi; jnl_private_control *jpc; jnl_buffer_ptr_t jbp; error_def(ERR_CRITSEMFAIL); error_def(ERR_DBCCERR); error_def(ERR_DBFILERR); error_def(ERR_DBRNDWNWRN); error_def(ERR_ERRCALL); error_def(ERR_GBLOFLOW); error_def(ERR_GTMASSERT); error_def(ERR_IPCNOTDEL); error_def(ERR_JNLFLUSH); error_def(ERR_RNDWNSEMFAIL); error_def(ERR_TEXT); error_def(ERR_WCBLOCKED); forced_exit = FALSE; /* Okay, we're dying already -- let rel_crit live in peace now. * If coming through a DAL, not necessarily dying. what to do then? -- nars -- 8/15/2001 */ grabbed_access_sem = FALSE; jnl_status = 0; reg = gv_cur_region; /* Local copy */ /* * early out for cluster regions * to avoid tripping the assert below. * Note: * This early out is consistent with VMS. It has been * noted that all of the gtcm assignments * to gv_cur_region should use the TP_CHANGE_REG * macro. This would also avoid the assert problem * and should be done eventually. */ if (dba_cm == reg->dyn.addr->acc_meth) return; udi = FILE_INFO(reg); csa = &udi->s_addrs; csd = csa->hdr; assert(csa == cs_addrs && csd == cs_data); if ((reg->open) && (dba_usr == csd->acc_meth)) { change_reg(); gvusr_rundown(); return; } ESTABLISH(gds_rundown_ch); if (!reg->open) /* Not open, no point to rundown */ { if (reg->opening) /* Died partway open, kill rest of way */ { rel_crit(reg); mutex_cleanup(reg); /* revist this to handle MM properly SMW 98/12/16 if (NULL != csa->nl) { status = shmdt((caddr_t)csa->nl); if (-1 == status) send_msg(VARLSTCNT(9) ERR_DBFILERR, 2, DB_LEN_STR(reg), ERR_TEXT, 2, LEN_AND_LIT("Error during shmdt"), errno); } */ shmdt((caddr_t)csa->nl); csa->nl = NULL; } REVERT; return; } switch(csd->acc_meth) { /* Pass mm and bg through */ case dba_bg: is_mm = FALSE; break; case dba_mm: is_mm = TRUE; break; case dba_usr: assert(FALSE); default: REVERT; return; } /* Cancel any pending flush timer for this region by this task */ CANCEL_DB_TIMERS(reg, cancelled_timer, cancelled_dbsync_timer); we_are_last_user = FALSE; if (!csa->persistent_freeze) region_freeze(reg, FALSE, FALSE, FALSE); assert(!csa->read_lock); rel_crit(reg); /* get locks to known state */ mutex_cleanup(reg); /* * We need to guarantee that none else access database file header when semid/shmid fields are reset. * We already have created ftok semaphore in db_init or, mu_rndwn_file and did not remove it. * So just lock it. We do it in blocking mode. */ if (!ftok_sem_lock(reg, FALSE, FALSE)) rts_error(VARLSTCNT(4) ERR_DBFILERR, 2, DB_LEN_STR(reg)); /* * For mupip_jnl_recover we already have database access control semaphore. * We do not release it. We release it from mur_close_files. 
*/ if (!mupip_jnl_recover) { sop[0].sem_num = 0; sop[0].sem_op = 0; /* Wait for 0 */ sop[1].sem_num = 0; sop[1].sem_op = 1; /* Lock */ sopcnt = 2; sop[0].sem_flg = sop[1].sem_flg = SEM_UNDO | IPC_NOWAIT; /* Don't wait the first time thru */ SEMOP(udi->semid, sop, sopcnt, status); if (-1 == status) /* We couldn't get it in one shot -- see if we already have it */ { save_errno = errno; /* see comment about Linux specific difference in behaviour of semctl() with GETPID in gds_rundown_ch() */ if (semctl(udi->semid, 0, GETPID) == process_id) { send_msg(VARLSTCNT(5) MAKE_MSG_INFO(ERR_CRITSEMFAIL), 2, DB_LEN_STR(reg), ERR_RNDWNSEMFAIL); REVERT; return; /* Already in rundown for this region */ } if (EAGAIN != save_errno) { assert(FALSE); rts_error(VARLSTCNT(9) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg), ERR_TEXT, 2, RTS_ERROR_TEXT("gds_rundown first semop/semctl"), save_errno); } sop[0].sem_flg = sop[1].sem_flg = SEM_UNDO; /* Try again - blocking this time */ SEMOP(udi->semid, sop, 2, status); if (-1 == status) /* We couldn't get it at all.. */ rts_error(VARLSTCNT(5) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg), errno); } } grabbed_access_sem = TRUE; /* * We now have the dbinit/rundown lock, so we are alone in this code for this region * and nobody else can attach. * See if we are all alone in accessing this database shared memory. */ assert(csa->ref_cnt); /* decrement private ref_cnt before shared ref_cnt decrement. */ csa->ref_cnt--; /* Currently journaling logic in gds_rundown() in VMS relies on this order to detect last writer */ assert(!csa->ref_cnt); --csa->nl->ref_cnt; if (memcmp(csa->nl->now_running, gtm_release_name, gtm_release_name_len + 1)) { /* VERMISMATCH condition. Possible only if DSE */ assert(dse_running); vermismatch = TRUE; } else vermismatch = FALSE; if (-1 == shmctl(udi->shmid, IPC_STAT, &shm_buf)) { save_errno = errno; rts_error(VARLSTCNT(9) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg), ERR_TEXT, 2, RTS_ERROR_TEXT("gds_rundown shmctl"), save_errno); } else we_are_last_user = (1 == shm_buf.shm_nattch) && !vermismatch; assert(!mupip_jnl_recover || we_are_last_user); /* recover => one user */ if (-1 == (semval = semctl(udi->semid, 1, GETVAL))) rts_error(VARLSTCNT(5) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg), errno); we_are_last_writer = (1 == semval) && (FALSE == reg->read_only) && !vermismatch;/* There's one writer left and I am it */ assert(!(mupip_jnl_recover && !reg->read_only) || we_are_last_writer); /* recover + R/W region => one writer */ if (-1 == (ftok_semval = semctl(udi->ftok_semid, 1, GETVAL))) rts_error(VARLSTCNT(5) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg), errno); /* If csa->nl->donotflush_dbjnl is set, it means mupip recover/rollback was interrupted and therefore we should * not flush shared memory contents to disk as they might be in an inconsistent state. * In this case, we will go ahead and remove shared memory (without flushing the contents) in this routine. * A reissue of the recover/rollback command will restore the database to a consistent state. * Otherwise, if we have write access to this region, let us perform a few writing tasks. 
*/ if (csa->nl->donotflush_dbjnl) csa->wbuf_dqd = 0; /* ignore csa->wbuf_dqd status as we do not care about the cache contents */ else if (!reg->read_only && !vermismatch) { /* If we had an orphaned block and were interrupted, set wc_blocked so we can invoke wcs_recover */ if (csa->wbuf_dqd) { grab_crit(reg); SET_TRACEABLE_VAR(csd->wc_blocked, TRUE); BG_TRACE_PRO_ANY(csa, wcb_gds_rundown); send_msg(VARLSTCNT(8) ERR_WCBLOCKED, 6, LEN_AND_LIT("wcb_gds_rundown"), process_id, &csa->ti->curr_tn, DB_LEN_STR(reg)); csa->wbuf_dqd = 0; wcs_recover(reg); if (is_mm) { assert(FALSE); csd = csa->hdr; } BG_TRACE_PRO_ANY(csa, lost_block_recovery); rel_crit(reg); } if (JNL_ENABLED(csd) && (GTCM_GNP_SERVER_IMAGE == image_type)) originator_prc_vec = NULL; /* If we are the last writing user, then everything must be flushed */ if (we_are_last_writer) { /* Time to flush out all of our buffers */ if (is_mm) { if (csa->total_blks != csa->ti->total_blks) /* do remap if file had been extended */ { grab_crit(reg); wcs_mm_recover(reg); csd = csa->hdr; rel_crit(reg); } csa->nl->remove_shm = TRUE; } /* Note WCSFLU_SYNC_EPOCH ensures the epoch is synced to the journal and indirectly * also ensures that the db is fsynced. We don't want to use it in the calls to * wcs_flu() from t_end() and tp_tend() since we can defer it to out-of-crit there. * In this case, since we are running down, we don't have any such option. */ csa->nl->remove_shm = wcs_flu(WCSFLU_FLUSH_HDR | WCSFLU_WRITE_EPOCH | WCSFLU_SYNC_EPOCH); /* Since we_are_last_writer, we should be guaranteed that wcs_flu() did not change csd, (in * case of MM for potential file extension), even if it did a grab_crit(). Therefore, make * sure that's true. */ assert(csd == csa->hdr); assert(0 == memcmp(csd->label, GDS_LABEL, GDS_LABEL_SZ - 1)); csd->trans_hist.header_open_tn = csd->trans_hist.curr_tn; } else if ((cancelled_timer && (0 > csa->nl->wcs_timers)) || cancelled_dbsync_timer) { /* cancelled pending db or jnl flush timers - flush database and journal buffers to disk */ grab_crit(reg); /* we need to sync the epoch as the fact that there is no active pending flush timer implies * there will be noone else who will flush the dirty buffers and EPOCH to disk in a timely fashion */ wcs_flu(WCSFLU_FLUSH_HDR | WCSFLU_WRITE_EPOCH | WCSFLU_SYNC_EPOCH); rel_crit(reg); assert((dba_mm == cs_data->acc_meth) || (csd == cs_data)); csd = cs_data; /* In case this is MM and wcs_flu() remapped an extended database, reset csd */ } /* Do rundown journal processing after buffer flushes since they require jnl to be open */ if (JNL_ENABLED(csd)) { /* the following tp_change_reg() is not needed due to the assert csa == cs_addrs at the beginning * of gds_rundown(), but just to be safe. To be removed by 2002!! --- nars -- 2001/04/25. 
*/ tp_change_reg(); /* call this because jnl_ensure_open checks cs_addrs rather than gv_cur_region */ jpc = csa->jnl; jbp = jpc->jnl_buff; if (jbp->fsync_in_prog_latch.u.parts.latch_pid == process_id) { assert(FALSE); COMPSWAP_UNLOCK(&jbp->fsync_in_prog_latch, process_id, 0, LOCK_AVAILABLE, 0); } if (jbp->io_in_prog_latch.u.parts.latch_pid == process_id) { assert(FALSE); COMPSWAP_UNLOCK(&jbp->io_in_prog_latch, process_id, 0, LOCK_AVAILABLE, 0); } if (((NOJNL != jpc->channel) && !JNL_FILE_SWITCHED(jpc)) || we_are_last_writer && (0 != csa->nl->jnl_file.u.inode)) { /* We need to close the journal file cleanly if we have the latest generation journal file open * or if we are the last writer and the journal file is open in shared memory (not necessarily * by ourselves e.g. the only process that opened the journal got shot abnormally) * Note: we should not infer anything from the shared memory value of csa->nl->jnl_file.u.inode * if we are not the last writer as it can be concurrently updated. */ grab_crit(reg); if (JNL_ENABLED(csd)) { SET_GBL_JREC_TIME; /* jnl_ensure_open/jnl_put_jrt_pini/pfin/jnl_file_close all need it */ /* Before writing to jnlfile, adjust jgbl.gbl_jrec_time if needed to maintain time order * of jnl records. This needs to be done BEFORE the jnl_ensure_open as that could write * journal records (if it decides to switch to a new journal file). */ ADJUST_GBL_JREC_TIME(jgbl, jbp); jnl_status = jnl_ensure_open(); if (0 == jnl_status) { /* If we_are_last_writer, we would have already done a wcs_flu() which would * have written an epoch record and we are guaranteed no further updates * since we are the last writer. So, just close the journal. * Although we assert pini_addr should be non-zero for last_writer, we * play it safe in PRO and write a PINI record if not written already. */ assert(!jbp->before_images || is_mm || !we_are_last_writer || 0 != jpc->pini_addr); if (we_are_last_writer && 0 == jpc->pini_addr) jnl_put_jrt_pini(csa); if (0 != jpc->pini_addr) jnl_put_jrt_pfin(csa); /* If not the last writer and no pending flush timer left, do jnl flush now */ if (!we_are_last_writer && (0 > csa->nl->wcs_timers)) { if (SS_NORMAL == (jnl_status = jnl_flush(reg))) { assert(jbp->freeaddr == jbp->dskaddr); jnl_fsync(reg, jbp->dskaddr); assert(jbp->fsync_dskaddr == jbp->dskaddr); } else { send_msg(VARLSTCNT(9) ERR_JNLFLUSH, 2, JNL_LEN_STR(csd), ERR_TEXT, 2, RTS_ERROR_TEXT("Error with journal flush in gds_rundown"), jnl_status); assert(NOJNL == jpc->channel);/* jnl file lost has been triggered */ /* In this routine, all code that follows from here on does not * assume anything about the journaling characteristics of this * database so it is safe to continue execution even though * journaling got closed in the middle. 
*/ } } jnl_file_close(reg, we_are_last_writer, FALSE); } else send_msg(VARLSTCNT(6) jnl_status, 4, JNL_LEN_STR(csd), DB_LEN_STR(reg)); } rel_crit(reg); } } if (we_are_last_writer) /* Flush the fileheader last and harden the file to disk */ { grab_crit(reg); /* To satisfy crit requirement in fileheader_sync() */ memset(csd->machine_name, 0, MAX_MCNAMELEN); /* clear the machine_name field */ if (!mupip_jnl_recover && we_are_last_user) { /* mupip_jnl_recover will do this after mur_close_file */ csd->semid = INVALID_SEMID; csd->shmid = INVALID_SHMID; csd->gt_sem_ctime.ctime = 0; csd->gt_shm_ctime.ctime = 0; } fileheader_sync(reg); rel_crit(reg); if (FALSE == is_mm) { if (-1 == fsync(udi->fd)) /* Sync it all */ { rts_error(VARLSTCNT(9) ERR_DBFILERR, 2, DB_LEN_STR(reg), ERR_TEXT, 2, RTS_ERROR_TEXT("Error during file sync at close"), errno); } } else { /* Now do final MM file sync before exit */ #if !defined(TARGETED_MSYNC) && !defined(NO_MSYNC) if (-1 == fsync(udi->fd)) /* Sync it all */ { rts_error(VARLSTCNT(9) ERR_DBFILERR, 2, DB_LEN_STR(reg), ERR_TEXT, 2, RTS_ERROR_TEXT("Error during file sync at close"), errno); } #else if (-1 == msync((caddr_t)csa->db_addrs[0], (size_t)(csa->db_addrs[1] - csa->db_addrs[0]), MS_SYNC)) { rts_error(VARLSTCNT(9) ERR_DBFILERR, 2, DB_LEN_STR(reg), ERR_TEXT, 2, RTS_ERROR_TEXT("Error during file msync at close"), errno); } #endif } } } /* end if (!reg->read_only && !csa->nl->donotflush_dbjnl) */ if (reg->read_only && we_are_last_user && !mupip_jnl_recover) { /* mupip_jnl_recover will do this after mur_close_file */ db_ipcs.semid = INVALID_SEMID; db_ipcs.shmid = INVALID_SHMID; db_ipcs.gt_sem_ctime = 0; db_ipcs.gt_shm_ctime = 0; db_ipcs.fn_len = reg->dyn.addr->fname_len; memcpy(db_ipcs.fn, reg->dyn.addr->fname, reg->dyn.addr->fname_len); db_ipcs.fn[reg->dyn.addr->fname_len] = 0; /* request gtmsecshr to flush. read_only cannot flush itself */ if (0 != send_mesg2gtmsecshr(FLUSH_DB_IPCS_INFO, 0, (char *)NULL, 0)) rts_error(VARLSTCNT(8) ERR_DBFILERR, 2, DB_LEN_STR(reg), ERR_TEXT, 2, RTS_ERROR_TEXT("gtmsecshr failed to update database file header")); } /* Done with file now, close it */ if (-1 == close(udi->fd)) { rts_error(VARLSTCNT(9) ERR_DBFILERR, 2, DB_LEN_STR(reg), ERR_TEXT, 2, LEN_AND_LIT("Error during file close"), errno); } /* Unmap storage if mm mode but only the part that is not the fileheader (so shows up in dumps) */ if (is_mm) { munmap_len = (sm_long_t)((csa->db_addrs[1] - csa->db_addrs[0]) - ROUND_UP(SIZEOF_FILE_HDR(csa->hdr), MSYNC_ADDR_INCS)); if (munmap_len > 0) { munmap((caddr_t)(csa->db_addrs[0] + ROUND_UP(SIZEOF_FILE_HDR(csa->hdr), MSYNC_ADDR_INCS)), (size_t)(munmap_len)); #ifdef DEBUG_DB64 rel_mmseg((caddr_t)csa->db_addrs[0]); #endif } } /* Detach our shared memory while still under lock so reference counts will be * correct for the next process to run down this region. * In the process also get the remove_shm status from node_local before detaching. * If csa->nl->donotflush_dbjnl is TRUE, it means we can safely remove shared memory without compromising data * integrity as a reissue of recover will restore the database to a consistent state. 
*/ remove_shm = !vermismatch && (csa->nl->remove_shm || csa->nl->donotflush_dbjnl); status = shmdt((caddr_t)csa->nl); csa->nl = NULL; /* dereferencing nl after detach is not right, so we set it to NULL so that we can test before dereference*/ if (-1 == status) send_msg(VARLSTCNT(9) ERR_DBFILERR, 2, DB_LEN_STR(reg), ERR_TEXT, 2, LEN_AND_LIT("Error during shmdt"), errno); reg->open = FALSE; /* If file is still not in good shape, die here and now before we get rid of our storage */ if (csa->wbuf_dqd) GTMASSERT; ipc_deleted = FALSE; /* If we are the very last user, remove shared storage id and the semaphores */ if (we_are_last_user) { /* remove shared storage, only if last writer to rundown did a successful wcs_flu() */ assert(!vermismatch); if (remove_shm) { ipc_deleted = TRUE; if (0 != shm_rmid(udi->shmid)) rts_error(VARLSTCNT(8) ERR_DBFILERR, 2, DB_LEN_STR(reg), ERR_TEXT, 2, RTS_ERROR_TEXT("Unable to remove shared memory")); } else if (is_src_server || is_updproc) { gtm_putmsg(VARLSTCNT(6) ERR_DBRNDWNWRN, 4, DB_LEN_STR(reg), process_id, process_id); send_msg(VARLSTCNT(6) ERR_DBRNDWNWRN, 4, DB_LEN_STR(reg), process_id, process_id); } else send_msg(VARLSTCNT(6) ERR_DBRNDWNWRN, 4, DB_LEN_STR(reg), process_id, process_id); /* * Don't release semaphore in case of mupip recover/rollback; since it has standalone access. * It will release the semaphore in mur_close_files. */ if (!mupip_jnl_recover) { if (0 != sem_rmid(udi->semid)) rts_error(VARLSTCNT(8) ERR_DBFILERR, 2, DB_LEN_STR(reg), ERR_TEXT, 2, RTS_ERROR_TEXT("Unable to remove semaphore")); grabbed_access_sem = FALSE; } } else { assert(!mupip_jnl_recover); /* If we were writing, get rid of our writer access count semaphore */ if (!reg->read_only) if (0 != (save_errno = do_semop(udi->semid, 1, -1, SEM_UNDO))) rts_error(VARLSTCNT(9) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg), ERR_TEXT, 2, RTS_ERROR_TEXT("gds_rundown write semaphore release"), save_errno); /* Now remove the rundown lock */ if (0 != (save_errno = do_semop(udi->semid, 0, -1, SEM_UNDO))) rts_error(VARLSTCNT(9) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg), ERR_TEXT, 2, RTS_ERROR_TEXT("gds_rundown rundown semaphore release"), save_errno); grabbed_access_sem = FALSE; } if (!ftok_sem_release(reg, !mupip_jnl_recover, FALSE)) rts_error(VARLSTCNT(4) ERR_DBFILERR, 2, DB_LEN_STR(reg)); if (!ipc_deleted) { GET_CUR_TIME; if (is_src_server) gtm_putmsg(VARLSTCNT(8) ERR_IPCNOTDEL, 6, CTIME_BEFORE_NL, time_ptr, LEN_AND_LIT("Source server"), REG_LEN_STR(reg)); if (is_updproc) gtm_putmsg(VARLSTCNT(8) ERR_IPCNOTDEL, 6, CTIME_BEFORE_NL, time_ptr, LEN_AND_LIT("Update process"), REG_LEN_STR(reg)); if (mupip_jnl_recover) { gtm_putmsg(VARLSTCNT(8) ERR_IPCNOTDEL, 6, CTIME_BEFORE_NL, time_ptr, LEN_AND_LIT("Mupip journal process"), REG_LEN_STR(reg)); send_msg(VARLSTCNT(8) ERR_IPCNOTDEL, 6, CTIME_BEFORE_NL, time_ptr, LEN_AND_LIT("Mupip journal process"), REG_LEN_STR(reg)); } } REVERT; }
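/* Illustrative aside (not GT.M source): the access-control semaphore acquisition in gds_rundown() above uses a
 * classic SysV pattern - a two-operation semop() that atomically waits for semaphore 0 to reach zero and then
 * increments it, first with IPC_NOWAIT and then, if that fails with EAGAIN, in blocking mode.  The minimal sketch
 * below shows that pattern in isolation; the helper name try_rundown_lock() and its return conventions are
 * hypothetical and simplified (no rts_error/send_msg reporting).
 */
#include <errno.h>
#include <sys/types.h>
#include <sys/ipc.h>
#include <sys/sem.h>
#include <unistd.h>

/* Returns 0 if the lock was acquired, 1 if this process already held it, -1 on failure. */
static int try_rundown_lock(int semid)
{
	struct sembuf	sop[2];
	int		save_errno;

	sop[0].sem_num = 0;	sop[0].sem_op = 0;	/* wait for semaphore 0 to reach 0 */
	sop[1].sem_num = 0;	sop[1].sem_op = 1;	/* ... then increment it to take the lock */
	sop[0].sem_flg = sop[1].sem_flg = SEM_UNDO | IPC_NOWAIT;	/* don't block on the first attempt */
	if (-1 != semop(semid, sop, 2))
		return 0;						/* got it in one shot */
	save_errno = errno;						/* semctl below may overwrite errno */
	if (getpid() == semctl(semid, 0, GETPID))
		return 1;						/* we performed the last semop: already held by us */
	if (EAGAIN != save_errno)
		return -1;						/* unexpected failure; caller reports it */
	sop[0].sem_flg = sop[1].sem_flg = SEM_UNDO;			/* retry, blocking this time */
	return (-1 == semop(semid, sop, 2)) ? -1 : 0;
}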
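/* Illustrative aside (not GT.M source): once the rundown lock is held, gds_rundown() above decides whether it is
 * the last attached process (shm_nattch of the database shared memory segment) and the last writer (the value of
 * semaphore 1 in the access-control set).  A minimal sketch of that check, with hypothetical parameter names and
 * without the VERMISMATCH and error-reporting details:
 */
#include <stdbool.h>
#include <sys/ipc.h>
#include <sys/sem.h>
#include <sys/shm.h>

static void check_last_user(int shmid, int semid, bool read_only, bool *last_user, bool *last_writer)
{
	struct shmid_ds	shm_buf;
	int		writers;

	*last_user = *last_writer = false;
	if (-1 == shmctl(shmid, IPC_STAT, &shm_buf))
		return;						/* caller reports the error */
	*last_user = (1 == shm_buf.shm_nattch);			/* nobody else is attached to the segment */
	if (-1 == (writers = semctl(semid, 1, GETVAL)))
		return;
	*last_writer = (1 == writers) && !read_only;		/* one writer counted and it is us */
}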
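/* Illustrative aside (not GT.M source): for MM databases, gds_rundown() above unmaps everything except the file
 * header (so the header still shows up in dumps), rounding the header size up to the mapping granularity.  A
 * minimal sketch of that arithmetic; ROUNDUP and the page_size parameter stand in for GT.M's ROUND_UP macro and
 * MSYNC_ADDR_INCS.
 */
#include <stddef.h>
#include <sys/mman.h>

#define ROUNDUP(x, align)	((((x) + (align) - 1) / (align)) * (align))

static void unmap_all_but_header(unsigned char *map_start, unsigned char *map_end,
				size_t file_hdr_size, size_t page_size)
{
	size_t		keep = ROUNDUP(file_hdr_size, page_size);	/* keep the header mapped */
	ptrdiff_t	len = (map_end - map_start) - (ptrdiff_t)keep;

	if (0 < len)
		(void)munmap((void *)(map_start + keep), (size_t)len);
}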
sm_uc_ptr_t t_qread(block_id blk, sm_int_ptr_t cycle, cache_rec_ptr_ptr_t cr_out) /* cycle is used in t_end to detect if the buffer has been refreshed since the t_qread */ { uint4 status, duint4, blocking_pid; cache_rec_ptr_t cr; bt_rec_ptr_t bt; bool clustered, was_crit; int dummy, lcnt, ocnt; cw_set_element *cse; off_chain chain1; register sgmnt_addrs *csa; register sgmnt_data_ptr_t csd; int4 dummy_errno; boolean_t already_built, is_mm, reset_first_tp_srch_status, set_wc_blocked; error_def(ERR_DBFILERR); error_def(ERR_BUFOWNERSTUCK); first_tp_srch_status = NULL; reset_first_tp_srch_status = FALSE; csa = cs_addrs; csd = csa->hdr; INCR_DB_CSH_COUNTER(csa, n_t_qreads, 1); is_mm = (dba_mm == csd->acc_meth); assert((t_tries < CDB_STAGNATE) || csa->now_crit); if (0 < dollar_tlevel) { assert(sgm_info_ptr); if (0 != sgm_info_ptr->cw_set_depth) { chain1 = *(off_chain *)&blk; if (1 == chain1.flag) { assert(sgm_info_ptr->cw_set_depth); if ((int)chain1.cw_index < sgm_info_ptr->cw_set_depth) tp_get_cw(sgm_info_ptr->first_cw_set, (int)chain1.cw_index, &cse); else { assert(FALSE == csa->now_crit); rdfail_detail = cdb_sc_blknumerr; return (sm_uc_ptr_t)NULL; } } else { first_tp_srch_status = (srch_blk_status *)lookup_hashtab_ent(sgm_info_ptr->blks_in_use, (void *)blk, &duint4); ASSERT_IS_WITHIN_TP_HIST_ARRAY_BOUNDS(first_tp_srch_status, sgm_info_ptr); cse = first_tp_srch_status ? first_tp_srch_status->ptr : NULL; } assert(!cse || !cse->high_tlevel); if (cse) { /* transaction has modified the sought after block */ assert(gds_t_writemap != cse->mode); if (FALSE == cse->done) { /* out of date, so make it current */ already_built = (NULL != cse->new_buff); gvcst_blk_build(cse, (uchar_ptr_t)cse->new_buff, 0); assert(cse->blk_target); if (!already_built && !chain1.flag) { assert(first_tp_srch_status && (is_mm || first_tp_srch_status->cr) && first_tp_srch_status->buffaddr); if (first_tp_srch_status->tn <= ((blk_hdr_ptr_t)(first_tp_srch_status->buffaddr))->tn) { assert(CDB_STAGNATE > t_tries); rdfail_detail = cdb_sc_blkmod; /* should this be something else */ TP_TRACE_HIST_MOD(blk, gv_target, tp_blkmod_t_qread, cs_data, first_tp_srch_status->tn, ((blk_hdr_ptr_t)(first_tp_srch_status->buffaddr))->tn, ((blk_hdr_ptr_t)(first_tp_srch_status->buffaddr))->levl); return (sm_uc_ptr_t)NULL; } if ((!is_mm) && (first_tp_srch_status->cycle != first_tp_srch_status->cr->cycle || first_tp_srch_status->blk_num != first_tp_srch_status->cr->blk)) { assert(CDB_STAGNATE > t_tries); rdfail_detail = cdb_sc_lostcr; /* should this be something else */ return (sm_uc_ptr_t)NULL; } if (certify_all_blocks && FALSE == cert_blk(gv_cur_region, blk, (blk_hdr_ptr_t)cse->new_buff, cse->blk_target->root)) GTMASSERT; } cse->done = TRUE; } *cycle = CYCLE_PVT_COPY; *cr_out = 0; return (sm_uc_ptr_t)cse->new_buff; } assert(!chain1.flag); } else first_tp_srch_status = (srch_blk_status *)lookup_hashtab_ent(sgm_info_ptr->blks_in_use, (void *)blk, &duint4); ASSERT_IS_WITHIN_TP_HIST_ARRAY_BOUNDS(first_tp_srch_status, sgm_info_ptr); if (!is_mm && first_tp_srch_status) { assert(first_tp_srch_status->cr && !first_tp_srch_status->ptr); if (first_tp_srch_status->cycle == first_tp_srch_status->cr->cycle) { *cycle = first_tp_srch_status->cycle; *cr_out = first_tp_srch_status->cr; first_tp_srch_status->cr->refer = TRUE; if (CDB_STAGNATE <= t_tries) /* mu_reorg doesn't use TP else should have an || for that */ CWS_INSERT(blk); return (sm_uc_ptr_t)first_tp_srch_status->buffaddr; } else { /* Block was already part of the read-set of this transaction, but got 
recycled. Allow for * recycling. But update the first_tp_srch_status (for this blk) in the si->first_tp_hist * array to reflect the new buffer, cycle and cache-record. Since we know those only at the end of * t_qread, set a variable here that will enable the updation before returning from t_qread(). */ reset_first_tp_srch_status = TRUE; } } } if ((blk >= csa->ti->total_blks) || (blk < 0)) { /* requested block out of range; could occur because of a concurrency conflict */ if ((&FILE_INFO(gv_cur_region)->s_addrs != csa) || (csd != cs_data)) GTMASSERT; assert(FALSE == csa->now_crit); rdfail_detail = cdb_sc_blknumerr; return (sm_uc_ptr_t)NULL; } if (is_mm) { *cycle = CYCLE_SHRD_COPY; *cr_out = 0; return (sm_uc_ptr_t)(mm_read(blk)); } assert(dba_bg == csd->acc_meth); assert(!first_tp_srch_status || !first_tp_srch_status->cr || first_tp_srch_status->cycle != first_tp_srch_status->cr->cycle); if (FALSE == (clustered = csd->clustered)) bt = NULL; was_crit = csa->now_crit; ocnt = 0; set_wc_blocked = FALSE; /* to indicate whether csd->wc_blocked was set to TRUE by us */ do { if (NULL == (cr = db_csh_get(blk))) { /* not in memory */ if (clustered && (NULL != (bt = bt_get(blk))) && (FALSE == bt->flushing)) bt = NULL; if (FALSE == csa->now_crit) { if (NULL != bt) { /* at this point, bt is not NULL only if clustered and flushing - wait no crit */ assert(clustered); wait_for_block_flush(bt, blk); /* try for no other node currently writing the block */ } if (csd->flush_trigger <= csa->nl->wcs_active_lvl && FALSE == gv_cur_region->read_only) JNL_ENSURE_OPEN_WCS_WTSTART(csa, gv_cur_region, 0, dummy_errno); /* a macro that dclast's wcs_wtstart() and checks for errors etc. */ grab_crit(gv_cur_region); cr = db_csh_get(blk); /* in case blk arrived before crit */ } if (clustered && (NULL != (bt = bt_get(blk))) && (TRUE == bt->flushing)) { /* Once crit, need to assure that if clustered, that flushing is [still] complete * If it isn't, we missed an entire WM cycle and have to wait for another node to finish */ wait_for_block_flush(bt, blk); /* ensure no other node currently writing the block */ } if (NULL == cr) { /* really not in memory - must get a new buffer */ assert(csa->now_crit); cr = db_csh_getn(blk); if (CR_NOTVALID == (sm_long_t)cr) { SET_TRACEABLE_VAR(cs_data->wc_blocked, TRUE); BG_TRACE_PRO_ANY(csa, wc_blocked_t_qread_db_csh_getn_invalid_blk); set_wc_blocked = TRUE; break; } assert(0 <= cr->read_in_progress); *cycle = cr->cycle; cr->tn = csa->ti->curr_tn; if (FALSE == was_crit) rel_crit(gv_cur_region); /* read outside of crit may be of a stale block but should be detected by t_end or tp_tend */ assert(0 == cr->dirty); assert(cr->read_in_progress >= 0); INCR_DB_CSH_COUNTER(csa, n_dsk_reads, 1); if (SS_NORMAL != (status = dsk_read(blk, GDS_REL2ABS(cr->buffaddr)))) { RELEASE_BUFF_READ_LOCK(cr); assert(was_crit == csa->now_crit); if (FUTURE_READ == status) { /* in cluster, block can be in the "future" with respect to the local history */ assert(TRUE == clustered); assert(FALSE == csa->now_crit); rdfail_detail = cdb_sc_future_read; /* t_retry forces the history up to date */ return (sm_uc_ptr_t)NULL; } rts_error(VARLSTCNT(5) ERR_DBFILERR, 2, DB_LEN_STR(gv_cur_region), status); } assert(0 <= cr->read_in_progress); assert(0 == cr->dirty); cr->r_epid = 0; RELEASE_BUFF_READ_LOCK(cr); assert(-1 <= cr->read_in_progress); *cr_out = cr; assert(was_crit == csa->now_crit); if (reset_first_tp_srch_status) { /* keep the parantheses for the if (although single line) since the following is a macro */ 
RESET_FIRST_TP_SRCH_STATUS(first_tp_srch_status, cr, *cycle); } return (sm_uc_ptr_t)GDS_REL2ABS(cr->buffaddr); } else if ((FALSE == was_crit) && (BAD_LUCK_ABOUNDS > ocnt)) { assert(TRUE == csa->now_crit); assert(csa->nl->in_crit == process_id); rel_crit(gv_cur_region); } } if (CR_NOTVALID == (sm_long_t)cr) { SET_TRACEABLE_VAR(cs_data->wc_blocked, TRUE); BG_TRACE_PRO_ANY(csa, wc_blocked_t_qread_db_csh_get_invalid_blk); set_wc_blocked = TRUE; break; } for (lcnt = 1; ; lcnt++) { if (0 > cr->read_in_progress) { /* it's not being read */ if (clustered && (0 == cr->bt_index) && (cr->tn < ((th_rec *)((uchar_ptr_t)csa->th_base + csa->th_base->tnque.fl))->tn)) { /* can't rely on the buffer */ cr->cycle++; /* increment cycle whenever blk number changes (tp_hist depends on this) */ cr->blk = CR_BLKEMPTY; break; } *cycle = cr->cycle; *cr_out = cr; VMS_ONLY( /* If we were doing the db_csh_get() above (in t_qread itself) and located the cache-record * which, before coming here and taking a copy of cr->cycle a few lines above, was made an * older twin by another process in bg_update (note this can happen in VMS only) which has * already incremented the cycle, we will end up having a copy of the old cache-record with * its incremented cycle number and hence will succeed in tp_hist validation if we return * this <cr,cycle> combination although we don't want to since this "cr" is not current for * the given block as of now. Note that the "indexmod" optimization in tp_tend() relies on * an accurate intermediate validation by tp_hist() which in turn relies on the <cr,cycle> * value returned by t_qread() to be accurate for a given blk at the current point in time. * We detect the older-twin case by the following check. Note that here we depend on the * fact that bg_update() sets cr->bt_index to 0 before incrementing cr->cycle. * Given that order, cr->bt_index can be guaranteed to be 0 if we read the incremented cycle */ if (cr->twin && (0 == cr->bt_index)) break; ) if (cr->blk != blk) break; if (was_crit != csa->now_crit) rel_crit(gv_cur_region); assert(was_crit == csa->now_crit); if (reset_first_tp_srch_status) { /* keep the parentheses for the if (although it is a single line) since the following is a macro */ RESET_FIRST_TP_SRCH_STATUS(first_tp_srch_status, cr, *cycle); } /* Note that at this point we expect t_qread() to return a <cr,cycle> combination that * corresponds to "blk" passed in. It is crucial to get an accurate value for both fields * since tp_hist() relies on this for its intermediate validation. 
*/ return (sm_uc_ptr_t)GDS_ANY_REL2ABS(csa, cr->buffaddr); } if (blk != cr->blk) break; if (lcnt >= BUF_OWNER_STUCK && (0 == (lcnt % BUF_OWNER_STUCK))) { if (FALSE == csa->now_crit) grab_crit(gv_cur_region); if (cr->read_in_progress < -1) { /* outside of design; clear to known state */ BG_TRACE_PRO(t_qread_out_of_design); INTERLOCK_INIT(cr); assert(0 == cr->r_epid); cr->r_epid = 0; } else if (cr->read_in_progress >= 0) { BG_TRACE_PRO(t_qread_buf_owner_stuck); if (0 != (blocking_pid = cr->r_epid)) { if (FALSE == is_proc_alive(blocking_pid, cr->image_count)) { /* process gone: release that process's lock */ assert(0 == cr->bt_index); if (cr->bt_index) { SET_TRACEABLE_VAR(csd->wc_blocked, TRUE); BG_TRACE_PRO_ANY(csa, wc_blocked_t_qread_bad_bt_index1); set_wc_blocked = TRUE; break; } cr->cycle++; /* increment cycle for blk number changes (for tp_hist) */ cr->blk = CR_BLKEMPTY; RELEASE_BUFF_READ_LOCK(cr); } else { rel_crit(gv_cur_region); send_msg(VARLSTCNT(4) ERR_DBFILERR, 2, DB_LEN_STR(gv_cur_region)); send_msg(VARLSTCNT(9) ERR_BUFOWNERSTUCK, 7, process_id, blocking_pid, cr->blk, cr->blk, (lcnt / BUF_OWNER_STUCK), cr->read_in_progress, cr->rip_latch.latch_pid); if ((4 * BUF_OWNER_STUCK) <= lcnt) GTMASSERT; /* Kickstart the process taking a long time in case it was suspended */ UNIX_ONLY(continue_proc(blocking_pid)); } } else { /* process stopped before could set r_epid */ assert(0 == cr->bt_index); if (cr->bt_index) { SET_TRACEABLE_VAR(csd->wc_blocked, TRUE); BG_TRACE_PRO_ANY(csa, wc_blocked_t_qread_bad_bt_index2); set_wc_blocked = TRUE; break; } cr->cycle++; /* increment cycle for blk number changes (for tp_hist) */ cr->blk = CR_BLKEMPTY; RELEASE_BUFF_READ_LOCK(cr); if (cr->read_in_progress < -1) /* race: process released since if r_epid */ LOCK_BUFF_FOR_READ(cr, dummy); } } if (was_crit != csa->now_crit) rel_crit(gv_cur_region); } else wcs_sleep(lcnt); } if (set_wc_blocked) /* cannot use csd->wc_blocked here as we might not necessarily have crit */ break; ocnt++; if (BAD_LUCK_ABOUNDS <= ocnt) { if (BAD_LUCK_ABOUNDS < ocnt || csa->now_crit) { rel_crit(gv_cur_region); GTMASSERT; } if (FALSE == csa->now_crit) grab_crit(gv_cur_region); } } while (TRUE);
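/* Illustrative aside (not GT.M source): the lcnt loop above waits for another process that is reading the same
 * cache record.  A negative read_in_progress means nobody is reading; otherwise the waiter sleeps and, every
 * BUF_OWNER_STUCK iterations, checks whether the owning pid is still alive.  The sketch below uses toy types and
 * a hypothetical STUCK_LIMIT; the real code also recovers the record, raises BUFOWNERSTUCK, and eventually
 * GTMASSERTs if the owner never finishes.
 */
#include <signal.h>
#include <stdbool.h>
#include <sys/types.h>
#include <unistd.h>

#define STUCK_LIMIT	64		/* stand-in for BUF_OWNER_STUCK */

typedef struct
{
	volatile int	read_in_progress;	/* negative when no read is in progress */
	volatile pid_t	r_epid;			/* pid of the process doing the read, 0 if not yet set */
} toy_cache_rec;

/* Returns true once the concurrent read has completed, false if the owner appears to be gone. */
static bool wait_for_reader(toy_cache_rec *cr)
{
	int	lcnt;
	pid_t	owner;

	for (lcnt = 1; ; lcnt++)
	{
		if (0 > cr->read_in_progress)
			return true;				/* read finished; buffer is usable */
		if (0 == (lcnt % STUCK_LIMIT))
		{	/* taking a long time: is the reader still alive? */
			owner = cr->r_epid;
			if ((0 != owner) && (-1 == kill(owner, 0)))
				return false;			/* owner gone (or inaccessible); caller must clean up */
		}
		usleep(1000 * ((100 > lcnt) ? lcnt : 100));	/* back off, loosely like wcs_sleep(lcnt) */
	}
}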
sm_uc_ptr_t t_qread(block_id blk, sm_int_ptr_t cycle, cache_rec_ptr_ptr_t cr_out) /* cycle is used in t_end to detect if the buffer has been refreshed since the t_qread */ { int4 status; uint4 blocking_pid; cache_rec_ptr_t cr; bt_rec_ptr_t bt; boolean_t clustered, hold_onto_crit, was_crit, issued_db_init_crypt_warning, sync_needed; int dummy, lcnt, ocnt; cw_set_element *cse; off_chain chain1; register sgmnt_addrs *csa; register sgmnt_data_ptr_t csd; enum db_ver ondsk_blkver; int4 dummy_errno, gtmcrypt_errno; boolean_t already_built, is_mm, reset_first_tp_srch_status, set_wc_blocked, sleep_invoked; ht_ent_int4 *tabent; srch_blk_status *blkhist; trans_num dirty, blkhdrtn; sm_uc_ptr_t buffaddr; uint4 stuck_cnt = 0; boolean_t lcl_blk_free; node_local_ptr_t cnl; gd_segment *seg; uint4 buffs_per_flush, flush_target; enc_info_t *encr_ptr; DCL_THREADGBL_ACCESS; SETUP_THREADGBL_ACCESS; lcl_blk_free = block_is_free; block_is_free = FALSE; /* Reset to FALSE so that if t_qread fails below, we don't have an incorrect state of this var */ first_tp_srch_status = NULL; reset_first_tp_srch_status = FALSE; csa = cs_addrs; csd = csa->hdr; INCR_DB_CSH_COUNTER(csa, n_t_qreads, 1); is_mm = (dba_mm == csd->acc_meth); /* We better hold crit in the final retry (TP & non-TP). Only exception is journal recovery */ assert((t_tries < CDB_STAGNATE) || csa->now_crit || mupip_jnl_recover); if (dollar_tlevel) { assert(sgm_info_ptr); if (0 != sgm_info_ptr->cw_set_depth) { chain1 = *(off_chain *)&blk; if (1 == chain1.flag) { assert(sgm_info_ptr->cw_set_depth); if ((int)chain1.cw_index < sgm_info_ptr->cw_set_depth) tp_get_cw(sgm_info_ptr->first_cw_set, (int)chain1.cw_index, &cse); else { assert(FALSE == csa->now_crit); rdfail_detail = cdb_sc_blknumerr; return (sm_uc_ptr_t)NULL; } } else { if (NULL != (tabent = lookup_hashtab_int4(sgm_info_ptr->blks_in_use, (uint4 *)&blk))) first_tp_srch_status = tabent->value; else first_tp_srch_status = NULL; ASSERT_IS_WITHIN_TP_HIST_ARRAY_BOUNDS(first_tp_srch_status, sgm_info_ptr); cse = first_tp_srch_status ? first_tp_srch_status->cse : NULL; } assert(!cse || !cse->high_tlevel); assert(!chain1.flag || cse); if (cse) { /* transaction has modified the sought after block */ if ((gds_t_committed != cse->mode) || (n_gds_t_op < cse->old_mode)) { /* Changes have not been committed to shared memory, i.e. still in private memory. * Build block in private buffer if not already done and return the same. */ assert(gds_t_writemap != cse->mode); if (FALSE == cse->done) { /* out of date, so make it current */ assert(gds_t_committed != cse->mode); already_built = (NULL != cse->new_buff); /* Validate the block's search history right after building a private copy. * This is not needed in case gvcst_search is going to reuse the clue's search * history and return (because tp_hist will do the validation of this block). * But if gvcst_search decides to do a fresh traversal (because the clue does not * cover the path of the current input key etc.) the block build that happened now * will not get validated in tp_hist since it will instead be given the current * key's search history path (a totally new path) for validation. Since a private * copy of the block has been built, tp_tend would also skip validating this block * so it is necessary that we validate the block right here. Since it is tricky to * accurately differentiate between the two cases, we do the validation * unconditionally here (besides it is only a few if checks done per block build * so it is considered okay performance-wise). 
*/ gvcst_blk_build(cse, (uchar_ptr_t)cse->new_buff, 0); assert(NULL != cse->blk_target); if (!already_built && !chain1.flag) { buffaddr = first_tp_srch_status->buffaddr; cr = first_tp_srch_status->cr; assert((is_mm || cr) && buffaddr); blkhdrtn = ((blk_hdr_ptr_t)buffaddr)->tn; if (TP_IS_CDB_SC_BLKMOD3(cr, first_tp_srch_status, blkhdrtn)) { assert(CDB_STAGNATE > t_tries); rdfail_detail = cdb_sc_blkmod; /* should this be something else */ TP_TRACE_HIST_MOD(blk, gv_target, tp_blkmod_t_qread, cs_data, first_tp_srch_status->tn, blkhdrtn, ((blk_hdr_ptr_t)buffaddr)->levl); return (sm_uc_ptr_t)NULL; } if (!is_mm && ((first_tp_srch_status->cycle != cr->cycle) || (first_tp_srch_status->blk_num != cr->blk))) { assert(CDB_STAGNATE > t_tries); rdfail_detail = cdb_sc_lostcr; /* should this be something else */ return (sm_uc_ptr_t)NULL; } } cse->done = TRUE; } *cycle = CYCLE_PVT_COPY; *cr_out = 0; return (sm_uc_ptr_t)cse->new_buff; } else { /* Block changes are already committed to shared memory (possible if we are in TP * in the 2nd phase of M-Kill in gvcst_expand_free_subtree.c). In this case, read * block from shared memory; do not look at private memory (i.e. cse) as that might * not be as uptodate as shared memory. */ assert(csa->now_crit); /* gvcst_expand_free_subtree does t_qread in crit */ /* If this block was newly created as part of the TP transaction, it should not be killed * as part of the 2nd phase of M-kill. This is because otherwise the block's cse would * have had an old_mode of kill_t_create in which case we would not have come into this * else block. Assert accordingly. */ assert(!chain1.flag); first_tp_srch_status = NULL; /* do not use any previous srch_hist information */ } } } else { if (NULL != (tabent = lookup_hashtab_int4(sgm_info_ptr->blks_in_use, (uint4 *)&blk))) first_tp_srch_status = tabent->value; else first_tp_srch_status = NULL; } ASSERT_IS_WITHIN_TP_HIST_ARRAY_BOUNDS(first_tp_srch_status, sgm_info_ptr); if (!is_mm && first_tp_srch_status) { cr = first_tp_srch_status->cr; assert(cr && !first_tp_srch_status->cse); if (first_tp_srch_status->cycle == cr->cycle) { *cycle = first_tp_srch_status->cycle; *cr_out = cr; cr->refer = TRUE; if (CDB_STAGNATE <= t_tries) /* mu_reorg doesn't use TP else should have an || for that */ CWS_INSERT(blk); return (sm_uc_ptr_t)first_tp_srch_status->buffaddr; } else { /* Block was already part of the read-set of this transaction, but got recycled in the cache. * Allow block recycling by resetting first_tp_srch_status for this blk to reflect the new * buffer, cycle and cache-record. tp_hist (invoked much later) has validation checks to detect * if block recycling happened within the same mini-action and restart in that case. * Updating first_tp_srch_status has to wait until the end of t_qread since only then do we know * the values to update to. Set a variable that will enable the updation before returning. * Also assert that if we are in the final retry, we are never in a situation where we have a * block that got recycled since the start of the current mini-action. This is easily detected since * as part of the final retry we maintain a hash-table "cw_stagnate" that holds the blocks that * have been read as part of the current mini-action until now. */ assert(CDB_STAGNATE > t_tries || (NULL == lookup_hashtab_int4(&cw_stagnate, (uint4 *)&blk))); reset_first_tp_srch_status = TRUE; } } } if ((uint4)blk >= (uint4)csa->ti->total_blks) { /* Requested block out of range; could occur because of a concurrency conflict. 
mm_read and dsk_read assume blk is * never negative or greater than the maximum possible file size. If a concurrent REORG truncates the file, t_qread * can proceed despite blk being greater than total_blks. But dsk_read handles this fine; see comments below. */ assert((&FILE_INFO(gv_cur_region)->s_addrs == csa) && (csd == cs_data)); assert(!csa->now_crit); rdfail_detail = cdb_sc_blknumerr; return (sm_uc_ptr_t)NULL; } if (is_mm) { *cycle = CYCLE_SHRD_COPY; *cr_out = 0; return (sm_uc_ptr_t)(mm_read(blk)); } was_crit = csa->now_crit; cnl = csa->nl; encr_ptr = csa->encr_ptr; if (NULL != encr_ptr) { /* If this is an encrypted database and we hold crit, make sure our private cycle matches the shared cycle. * Or else we would need to call "process_reorg_encrypt_restart" below (a heavyweight operation) holding crit. */ assert(!was_crit || (cnl->reorg_encrypt_cycle == encr_ptr->reorg_encrypt_cycle)); seg = gv_cur_region->dyn.addr; issued_db_init_crypt_warning = encr_ptr->issued_db_init_crypt_warning; if (!IS_BITMAP_BLK(blk) && issued_db_init_crypt_warning) { /* A non-GT.M process is attempting to read a non-bitmap block, yet it has previously encountered an error * during db_init (because it did not have access to the encryption keys) and reported it with a -W- * severity. Since the block it is attempting to read can be in the unencrypted shared memory (read from * disk by another process with access to the encryption keys), we cannot let it access it without a valid * handle, so issue an rts_error. * * TODO: DSE and LKE could bypass getting the ftok semaphore. LKE is not an issue, but DSE does care about * the csa->reorg_encrypt_cycle. So it means DSE could get an inconsistent copy of reorg_encrypt_cycle * and associated hashes if it had done a bypass and a concurrent REORG -ENCRYPT is holding the ftok * semaphore and changing these values at the same time. */ assert(!IS_GTM_IMAGE); /* GT.M would have error'ed out in db_init */ gtmcrypt_errno = SET_REPEAT_MSG_MASK(SET_CRYPTERR_MASK(ERR_CRYPTBADCONFIG)); GTMCRYPT_REPORT_ERROR(gtmcrypt_errno, rts_error, seg->fname_len, seg->fname); } else if (cnl->reorg_encrypt_cycle != encr_ptr->reorg_encrypt_cycle) { /* A concurrent MUPIP REORG ENCRYPT occurred. Cannot proceed with the read even if the block is * already loaded from disk into the unencrypted global buffers (security issue). Need to load the * new encryption keys and only let those processes which are able to successfully do this proceed * with the read. First, copy the key hashes from csd into csa->encr_ptr. That needs crit * to ensure a concurrent MUPIP REORG ENCRYPT does not sneak in. * * Note: Even though we asserted a few lines above that if "was_crit" is TRUE, then we expect * the encryption cycles to be in sync, we handle this out-of-design situation in "pro" by fixing * the cycles while holding crit (hopefully rare case so it is okay to hold crit for a heavyweight call). */ if (!was_crit) grab_crit(gv_cur_region); /* Now that we have crit, sync them up by copying the new keys inside crit and opening the key handles * outside crit (a potentially long running operation). */ SIGNAL_REORG_ENCRYPT_RESTART(mu_reorg_encrypt_in_prog, reorg_encrypt_restart_csa, cnl, csa, csd, rdfail_detail, process_id); assert(csa == reorg_encrypt_restart_csa); if (!was_crit) rel_crit(gv_cur_region); /* If we are inside a TP read-write transaction, it is possible we already used the old keys for * prior calls to "jnl_format" so we have to restart (cannot sync up cycles). 
Do the same for * a TP read-only transaction as well as a NON-TP read-write transaction. In all these cases we know * the caller is capable of restarting. In all other cases we don't know whether the caller is capable, so * sync up the cycles and proceed using the new keys for the read. * * But since it is possible the caller does not call t_retry right away (e.g. mupip reorg, which can * choose to abandon this tree path and move on to another block without aborting this transaction), * it is better to finish the pending call to "process_reorg_encrypt_restart" right here before returning. */ process_reorg_encrypt_restart(); assert(NULL == reorg_encrypt_restart_csa); if (IS_NOT_SAFE_TO_SYNC_NEW_KEYS(dollar_tlevel, update_trans)) { assert(cdb_sc_reorg_encrypt == rdfail_detail); /* set by SIGNAL_REORG_ENCRYPT_RESTART macro */ return (sm_uc_ptr_t)NULL; } } } assert(dba_bg == csd->acc_meth); assert(!first_tp_srch_status || !first_tp_srch_status->cr || first_tp_srch_status->cycle != first_tp_srch_status->cr->cycle); if (FALSE == (clustered = csd->clustered)) bt = NULL; ocnt = 0; set_wc_blocked = FALSE; /* to indicate whether cnl->wc_blocked was set to TRUE by us */ hold_onto_crit = csa->hold_onto_crit; /* note down in local to avoid csa-> dereference in multiple usages below */ do { if (NULL == (cr = db_csh_get(blk))) { /* not in memory */ if (clustered && (NULL != (bt = bt_get(blk))) && (FALSE == bt->flushing)) bt = NULL; if (!csa->now_crit) { assert(!hold_onto_crit); if (NULL != bt) { /* at this point, bt is not NULL only if clustered and flushing - wait no crit */ assert(clustered); wait_for_block_flush(bt, blk); /* try for no other node currently writing the block */ } /* assume defaults for flush_target and buffs_per_flush */ flush_target = csd->flush_trigger; buffs_per_flush = 0; if ((0 != csd->epoch_taper) && (FALSE == gv_cur_region->read_only) && JNL_ENABLED(csd) && (0 != cnl->wcs_active_lvl) && (NOJNL != csa->jnl->channel) && (0 != cnl->jnl_file.u.inode) && csd->jnl_before_image) { EPOCH_TAPER_IF_NEEDED(csa, csd, cnl, (gd_region *) 0, FALSE, buffs_per_flush, flush_target); } if ((flush_target <= cnl->wcs_active_lvl) && (FALSE == gv_cur_region->read_only)) JNL_ENSURE_OPEN_WCS_WTSTART(csa, gv_cur_region, buffs_per_flush, dummy_errno); /* a macro that dclast's "wcs_wtstart" and checks for errors etc. */ /* Get crit but also ensure encryption cycles are in sync ("dsk_read" relies on this). * Note: "sync_needed" should be TRUE very rarely since we synced the cycles just a few lines * above. But in case a MUPIP REORG ENCRYPT concurrently sneaked in between these lines, we * need to resync. 
*/ sync_needed = grab_crit_encr_cycle_sync(gv_cur_region); assert(NULL == reorg_encrypt_restart_csa); assert(!sync_needed || (NULL != encr_ptr)); if (sync_needed && IS_NOT_SAFE_TO_SYNC_NEW_KEYS(dollar_tlevel, update_trans)) { assert(cnl->reorg_encrypt_cycle == encr_ptr->reorg_encrypt_cycle); rel_crit(gv_cur_region); rdfail_detail = cdb_sc_reorg_encrypt; /* set by SIGNAL_REORG_ENCRYPT_RESTART macro */ return (sm_uc_ptr_t)NULL; } cr = db_csh_get(blk); /* in case blk arrived before crit */ } if (clustered && (NULL != (bt = bt_get(blk))) && (TRUE == bt->flushing)) { /* Once crit, need to assure that if clustered, that flushing is [still] complete * If it isn't, we missed an entire WM cycle and have to wait for another node to finish */ wait_for_block_flush(bt, blk); /* ensure no other node currently writing the block */ } if (NULL == cr) { /* really not in memory - must get a new buffer */ assert(csa->now_crit); cr = db_csh_getn(blk); if (CR_NOTVALID == (sm_long_t)cr) { assert(cnl->wc_blocked); /* only reason we currently know wcs_get_space could fail */ assert(gtm_white_box_test_case_enabled); SET_TRACEABLE_VAR(cnl->wc_blocked, TRUE); BG_TRACE_PRO_ANY(csa, wc_blocked_t_qread_db_csh_getn_invalid_blk); set_wc_blocked = TRUE; break; } assert(0 <= cr->read_in_progress); *cycle = cr->cycle; cr->tn = csd->trans_hist.curr_tn; /* Record history of most recent disk reads only in dbg builds for now. Although the macro * is just a couple dozen instructions, it is done while holding crit so we want to avoid * delaying crit unless really necessary. Whoever wants this information can enable it * by a build change to remove the DEBUG_ONLY part below. */ DEBUG_ONLY(DSKREAD_TRACE(csa, GDS_ANY_ABS2REL(csa,cr), cr->tn, process_id, blk, cr->cycle);) if (!was_crit && !hold_onto_crit) rel_crit(gv_cur_region); /* read outside of crit may be of a stale block but should be detected by t_end or tp_tend */ assert(0 == cr->dirty); assert(cr->read_in_progress >= 0); CR_BUFFER_CHECK(gv_cur_region, csa, csd, cr); buffaddr = (sm_uc_ptr_t)GDS_REL2ABS(cr->buffaddr); # ifdef DEBUG /* stop self to test sechshr_db_clnup clears the read state */ if (gtm_white_box_test_case_enabled && (WBTEST_SIGTSTP_IN_T_QREAD == gtm_white_box_test_case_number)) { /* this should never fail, but because of the way we developed the test we got paranoid */ dummy = kill(process_id, SIGTERM); assert(0 == dummy); for (dummy = 10; dummy; dummy--) LONG_SLEEP(10); /* time for sigterm to take hit before we clear block_now_locked */ } # endif if (SS_NORMAL != (status = dsk_read(blk, buffaddr, &ondsk_blkver, lcl_blk_free))) { /* buffer does not contain valid data, so reset blk to be empty */ cr->cycle++; /* increment cycle for blk number changes (for tp_hist and others) */ cr->blk = CR_BLKEMPTY; cr->r_epid = 0; RELEASE_BUFF_READ_LOCK(cr); TREF(block_now_locked) = NULL; assert(-1 <= cr->read_in_progress); assert(was_crit == csa->now_crit); if (ERR_DYNUPGRDFAIL == status) { /* if we dont hold crit on the region, it is possible due to concurrency conflicts * that this block is unused (i.e. marked free/recycled in bitmap, see comments in * gds_blk_upgrade.h). in this case we should not error out but instead restart. */ if (was_crit) { assert(FALSE); rts_error_csa(CSA_ARG(csa) VARLSTCNT(5) status, 3, blk, DB_LEN_STR(gv_cur_region)); } else { rdfail_detail = cdb_sc_lostcr; return (sm_uc_ptr_t)NULL; } } if ((-1 == status) && !was_crit) { /* LSEEKREAD and, consequently, dsk_read return -1 in case pread is unable to fetch * a full database block's length of data. 
This can happen if the requested read is * past the end of the file, which can happen if a concurrent truncate occurred * after the blk >= csa->ti->total_blks comparison above. Allow for this scenario * by restarting. However, if we've had crit the whole time, no truncate could have * happened. -1 indicates a problem with the file, so fall through to DBFILERR. */ rdfail_detail = cdb_sc_truncate; return (sm_uc_ptr_t)NULL; } else if (IS_CRYPTERR_MASK(status)) { seg = gv_cur_region->dyn.addr; GTMCRYPT_REPORT_ERROR(status, rts_error, seg->fname_len, seg->fname); } else { /* A DBFILERR can be thrown for two possible reasons: * (1) LSEEKREAD returned an unexpected error due to a filesystem problem; or * (2) csa/cs_addrs/csd/cs_data are out of sync, and we're trying to read a block * number for one region from another region with fewer total_blks. * We suspect the former is what happened in GTM-7623. Apparently the latter * has been an issue before, too. If either occurs again in pro, this assertpro * distinguishes the two possibilities. */ assertpro((&FILE_INFO(gv_cur_region)->s_addrs == csa) && (csd == cs_data)); rts_error_csa(CSA_ARG(csa) VARLSTCNT(5) ERR_DBFILERR, 2, DB_LEN_STR(gv_cur_region), status); } } disk_blk_read = TRUE; assert(0 <= cr->read_in_progress); assert(0 == cr->dirty); /* Only set in cache if read was success */ cr->ondsk_blkver = (lcl_blk_free ? GDSVCURR : ondsk_blkver); cr->r_epid = 0; RELEASE_BUFF_READ_LOCK(cr); TREF(block_now_locked) = NULL; assert(-1 <= cr->read_in_progress); *cr_out = cr; assert(was_crit == csa->now_crit); if (reset_first_tp_srch_status) RESET_FIRST_TP_SRCH_STATUS(first_tp_srch_status, cr, *cycle); return buffaddr; } else if (!was_crit && (BAD_LUCK_ABOUNDS > ocnt)) { assert(!hold_onto_crit); assert(TRUE == csa->now_crit); assert(cnl->in_crit == process_id); rel_crit(gv_cur_region); } }