void lke_setgdr(void)
{
	gd_region	*r_top;
	mval		reset;
	bool		def;
	short		len;
	char		buf[256];
	int4		rundown_status = EXIT_NRM;	/* if gds_rundown went smoothly */
	static readonly char	init_gdr[] = "gtmgbldir";

	gvcmy_rundown();
	for (gv_cur_region = gd_header->regions, r_top = gv_cur_region + gd_header->n_regions;
			gv_cur_region < r_top; gv_cur_region++)
	{
		tp_change_reg();
		UNIX_ONLY(rundown_status |=) gds_rundown();
	}
	if (EXIT_NRM != rundown_status)
		rts_error(VARLSTCNT(1) ERR_NOTALLDBRNDWN);
	if (cli_present("gld"))
	{
		cli_get_value("gld", buf);
		def = FALSE;
		reset.mvtype = MV_STR;
		reset.str.len = STRLEN(buf);
		reset.str.addr = buf;
	} else
	{
		reset.mvtype = MV_STR;
		reset.str.len = SIZEOF(init_gdr) - 1;
		reset.str.addr = init_gdr;
	}
	zgbldir(&reset);
	cs_addrs = 0;
	cs_data = 0;
	region_init(FALSE);
#ifndef MUTEX_MSEM_WAKE
	mutex_sock_cleanup();
#endif
	gtmsecshr_sock_cleanup(CLIENT);
}
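/* Illustrative sketch (not part of GT.M): the counted-string setup lke_setgdr performs on the mval "reset" --
 * point at either the CLI-supplied name or the "gtmgbldir" default -- reduced to plain C. The str_desc type
 * below is hypothetical and merely stands in for mval's str member. Compile with -DSETGDR_MVAL_DEMO to build
 * the demo on its own.
 */
#ifdef SETGDR_MVAL_DEMO
#include <stdio.h>
#include <string.h>

typedef struct
{
	unsigned int	len;	/* counted string: no NUL terminator required */
	char		*addr;
} str_desc;

static const char	init_gdr[] = "gtmgbldir";	/* same default name lke_setgdr uses */

int main(int argc, char **argv)
{
	str_desc	reset;

	if (argc > 1)
	{	/* caller supplied a name (lke_setgdr: cli_present("gld")/cli_get_value) */
		reset.len = strlen(argv[1]);
		reset.addr = argv[1];
	} else
	{	/* fall back to the env-var name; length excludes the NUL, like SIZEOF(init_gdr) - 1 */
		reset.len = sizeof(init_gdr) - 1;
		reset.addr = (char *)init_gdr;
	}
	printf("global directory source: %.*s\n", (int)reset.len, reset.addr);
	return 0;
}
#endif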
int	mur_forward_multi_proc(reg_ctl_list *rctl)
{
	boolean_t		multi_proc, this_reg_stuck, release_latch, ok_to_play;
	boolean_t		cancelled_dbsync_timer, cancelled_timer;
	reg_ctl_list		*rctl_top, *prev_rctl;
	jnl_ctl_list		*jctl;
	gd_region		*reg;
	sgmnt_addrs		*csa;
	seq_num			rec_token_seq;
	jnl_tm_t		rec_time;
	enum broken_type	recstat;
	jnl_record		*rec;
	enum jnl_record_type	rectype;
	char			errstr[256];
	int			i, rctl_index, save_errno, num_procs_stuck, num_reg_stuck;
	uint4			status, regcnt_stuck, num_partners, start_hrtbt_cntr;
	forw_multi_struct	*forw_multi;
	shm_forw_multi_t	*sfm;
	multi_struct		*multi;
	jnl_tm_t		adjusted_resolve_time;
	shm_reg_ctl_t		*shm_rctl_start, *shm_rctl, *first_shm_rctl;
	size_t			shm_size, reccnt, copy_size;
	int4			*size_ptr;
	char			*shmPtr;	/* not using "shm_ptr" since it is already used in an AIX include file */
	int			shmid;
	multi_proc_shm_hdr_t	*mp_hdr;	/* pointer to "multi_proc_shm_hdr_t" structure in shared memory */

	status = 0;
	/* Although we made sure the # of tasks is the same as the # of processes forked off (in the "gtm_multi_proc"
	 * invocation in "mur_forward"), it is possible one of the forked processes finishes one invocation of
	 * "mur_forward_multi_proc" before another forked process even gets assigned one task in "gtm_multi_proc_helper".
	 * In this case, we would be invoked more than once. But the first invocation would have done all the needed work,
	 * so return for later invocations.
	 */
	if (mur_forward_multi_proc_done)
		return 0;
	mur_forward_multi_proc_done = TRUE;
	/* Note: "rctl" is unused, but we cannot avoid passing it since "gtm_multi_proc" expects something. */
	prev_rctl = NULL;
	rctl_start = NULL;
	adjusted_resolve_time = murgbl.adjusted_resolve_time;
	assert(0 == murgbl.regcnt_remaining);
	multi_proc = multi_proc_in_use;	/* cache value in a local to speed up access inside the loops below */
	if (multi_proc)
	{
		mp_hdr = multi_proc_shm_hdr;
		shm_rctl_start = mur_shm_hdr->shm_rctl_start;
		if (jgbl.onlnrlbk)
		{
			for (rctl = mur_ctl, rctl_top = mur_ctl + murgbl.reg_total; rctl < rctl_top; rctl++)
			{
				assert(rctl->csa->hold_onto_crit);	/* would have been set in parent process */
				rctl->csa->hold_onto_crit = FALSE;	/* reset since we don't own this region */
				assert(rctl->csa->now_crit);		/* would have been set in parent process */
				rctl->csa->now_crit = FALSE;		/* reset since we don't own this region */
			}
		}
		START_HEARTBEAT_IF_NEEDED;	/* heartbeat timer needed later (in case not already started by "gtm_multi_proc") */
	}
	first_shm_rctl = NULL;
	/* Phase1 of forward recovery starts */
	for (rctl = mur_ctl, rctl_top = mur_ctl + murgbl.reg_total; rctl < rctl_top; rctl++)
	{	/* Check if "rctl" is available for us or if some other concurrent process has taken it */
		if (multi_proc)
		{
			rctl_index = rctl - &mur_ctl[0];
			shm_rctl = &shm_rctl_start[rctl_index];
			if (shm_rctl->owning_pid)
			{
				assert(process_id != shm_rctl->owning_pid);
				continue;
			}
			GRAB_MULTI_PROC_LATCH_IF_NEEDED(release_latch);
			assert(release_latch);
			for ( ; rctl < rctl_top; rctl++, shm_rctl++)
			{
				if (shm_rctl->owning_pid)
				{
					assert(process_id != shm_rctl->owning_pid);
					continue;
				}
				shm_rctl->owning_pid = process_id;	/* declare ownership */
				rctl->this_pid_is_owner = TRUE;
				if (jgbl.onlnrlbk)
				{	/* This is an online rollback and crit was grabbed on all regions by the parent rollback
					 * process. But this child process now owns this region and does the actual rollback on
					 * this region, so borrow crit for the duration of this child process.
					 */
					csa = rctl->csa;
					csa->hold_onto_crit = TRUE;
					csa->now_crit = TRUE;
					assert(csa->nl->in_crit == mp_hdr->parent_pid);
					csa->nl->in_crit = process_id;
					assert(csa->nl->onln_rlbk_pid == mp_hdr->parent_pid);
					csa->nl->onln_rlbk_pid = process_id;
				}
				if (NULL == first_shm_rctl)
					first_shm_rctl = shm_rctl;
				break;
			}
			REL_MULTI_PROC_LATCH_IF_NEEDED(release_latch);
			if (rctl >= rctl_top)
			{
				assert(rctl == rctl_top);
				break;
			}
			/* Set key to print this rctl's region-name as prefix in case this forked-off process prints any output */
			MUR_SET_MULTI_PROC_KEY(rctl, multi_proc_key);
#			ifdef MUR_DEBUG
			fprintf(stderr, "pid = %d : Owns region %s\n", process_id, multi_proc_key);
#			endif
		} else
			rctl->this_pid_is_owner = TRUE;
		if (mur_options.forward)
		{
			assert(NULL == rctl->jctl_turn_around);
			jctl = rctl->jctl = rctl->jctl_head;
			assert(jctl->reg_ctl == rctl);
			jctl->rec_offset = JNL_HDR_LEN;
			jnl_fence_ctl.fence_list = JNL_FENCE_LIST_END;	/* initialized to reflect journaling is not enabled */
			if (mur_options.rollback)
				jgbl.mur_jrec_seqno = jctl->jfh->start_seqno;
		} else
		{
			jctl = rctl->jctl = (NULL == rctl->jctl_turn_around) ? rctl->jctl_head : rctl->jctl_turn_around;
			assert(jctl->reg_ctl == rctl);
			jctl->rec_offset = jctl->turn_around_offset;
			jgbl.mur_jrec_seqno = jctl->turn_around_seqno;
			assert((NULL != rctl->jctl_turn_around) || (0 == jctl->rec_offset));
		}
		if (mur_options.rollback)
		{
			if (murgbl.consist_jnl_seqno < jgbl.mur_jrec_seqno)
			{	/* Assert that murgbl.losttn_seqno is never less than jgbl.mur_jrec_seqno (the turnaround
				 * point seqno) as this is what murgbl.consist_jnl_seqno is going to be set to and will
				 * eventually be the post-rollback seqno. If this condition is violated, the result of the
				 * recovery is a compromised database (the file header will indicate a Region Seqno which
				 * is not necessarily correct since seqnos prior to it might be absent in the database).
				 * Therefore, this is an out-of-design situation with respect to rollback and so stop it.
				 */
				assert(murgbl.losttn_seqno >= jgbl.mur_jrec_seqno);
				murgbl.consist_jnl_seqno = jgbl.mur_jrec_seqno;
			}
			assert(murgbl.consist_jnl_seqno <= murgbl.losttn_seqno);
		}
		if (mur_options.update || mur_options.extr[GOOD_TN])
		{
			reg = rctl->gd;
			gv_cur_region = reg;
			tp_change_reg();	/* note: sets cs_addrs to a non-NULL value even if gv_cur_region->open is FALSE
						 * (cs_data could still be NULL) */
			rctl->csa = cs_addrs;
			cs_addrs->miscptr = (void *)rctl;
			rctl->csd = cs_data;
			rctl->sgm_info_ptr = cs_addrs->sgm_info_ptr;
			assert(!reg->open || (NULL != cs_addrs->dir_tree));
			gv_target = cs_addrs->dir_tree;
		}
		jctl->after_end_of_data = FALSE;
		status = mur_next(jctl, jctl->rec_offset);
		assert(ERR_JNLREADEOF != status);	/* cannot get EOF at start of forward processing */
		if (SS_NORMAL != status)
			goto finish;
		PRINT_VERBOSE_STAT(jctl, "mur_forward:at the start");
		rctl->process_losttn = FALSE;
		/* Any multi-region TP transaction will be processed as multiple single-region TP transactions up
		 * until the tp-resolve-time is reached. From then on, they will be treated as one multi-region TP
		 * transaction. This is needed for proper lost-tn determination (any multi-region transaction that
		 * gets played in a region AFTER it has already encountered a broken tn should treat this as a lost tn).
		 */
		do
		{
			if (multi_proc && IS_FORCED_MULTI_PROC_EXIT(mp_hdr))
			{	/* We are at a logical point. So exit if signaled by parent */
				status = ERR_FORCEDHALT;
				goto finish;
			}
			assert(jctl == rctl->jctl);
			rec = rctl->mur_desc->jnlrec;
			rec_time = rec->prefix.time;
			if (rec_time > mur_options.before_time)
				break;	/* records after -BEFORE_TIME do not go to extract or losttrans or brkntrans files */
			if (rec_time < mur_options.after_time)
			{
				status = mur_next_rec(&jctl);
				continue;	/* records before -AFTER_TIME do not go to extract or losttrans or brkntrans files */
			}
			if (rec_time >= adjusted_resolve_time)
				break;	/* records at/after this adjusted resolve_time will be processed below in phase2 */
			/* Note: Since we do hashtable token processing only for records from tp_resolve_time onwards,
			 * it is possible that if we encounter any broken transactions here we won't know they are broken
			 * but will play them as is. That is unavoidable. Specify -SINCE_TIME (for -BACKWARD rollback/recover)
			 * and -VERIFY (for -FORWARD rollback/recover) to control tp_resolve_time (and in turn more
			 * effective broken tn determination).
			 */
			status = mur_forward_play_cur_jrec(rctl);
			if (SS_NORMAL != status)
				break;
			status = mur_next_rec(&jctl);
		} while (SS_NORMAL == status);
		CHECK_IF_EOF_REACHED(rctl, status);	/* sets rctl->forw_eof_seen if needed; resets "status" to SS_NORMAL */
		if (SS_NORMAL != status)
		{	/* ERR_FILENOTCREATE is possible from "mur_cre_file_extfmt" OR ERR_FORCEDHALT is possible
			 * from "mur_forward_play_cur_jrec". No other errors are known to occur here. Assert accordingly.
			 */
			assert((ERR_FILENOTCREATE == status) || (ERR_FORCEDHALT == status));
			goto finish;
		}
		if (rctl->forw_eof_seen)
		{
			PRINT_VERBOSE_STAT(jctl, "mur_forward:Reached EOF before tp_resolve_time");
			continue;	/* reached EOF before even getting to tp_resolve_time;
					 * do not even consider this region for the next processing loop */
		}
		rctl->last_tn = 0;
		murgbl.regcnt_remaining++;	/* # of regions participating in recovery at this point */
		if (NULL == rctl_start)
			rctl_start = rctl;
		if (NULL != prev_rctl)
		{
			prev_rctl->next_rctl = rctl;
			rctl->prev_rctl = prev_rctl;
		}
		prev_rctl = rctl;
		assert(murgbl.ok_to_update_db || !rctl->db_updated);
		PRINT_VERBOSE_STAT(jctl, "mur_forward:at tp_resolve_time");
	}
	if (multi_proc)
		multi_proc_key = NULL;	/* reset key until it can be set to an rctl's region-name again */
	/* Note that it is possible for rctl_start to be NULL at this point. That is, there is no journal record in any region
	 * AFTER the calculated tp-resolve-time. This is possible if, for example, -AFTER_TIME was used and has a time later
	 * than any journal record in all journal files. If rctl_start is NULL, prev_rctl should also be NULL and vice versa.
	 */
	if (NULL != rctl_start)
	{
		assert(NULL != prev_rctl);
		prev_rctl->next_rctl = rctl_start;
		rctl_start->prev_rctl = prev_rctl;
	}
	rctl = rctl_start;
	regcnt_stuck = 0;	/* # of regions we are stuck in, waiting for other regions to resolve a multi-region TP transaction */
	assert((NULL == rctl) || (NULL == rctl->forw_multi));
	gv_cur_region = NULL;	/* clear out any previous value to ensure gv_cur_region/cs_addrs/cs_data
				 * all get set in sync by the MUR_CHANGE_REG macro below */
	/* Phase2 of forward recovery starts */
	while (NULL != rctl)
	{	/* while there is at least one region remaining with unprocessed journal records */
		assert(NULL != rctl_start);
		assert(0 < murgbl.regcnt_remaining);
		if (NULL != rctl->forw_multi)
		{	/* This region's current journal record is part of a TP transaction waiting for other regions */
			regcnt_stuck++;
			assert(regcnt_stuck <= murgbl.regcnt_remaining);
			if (regcnt_stuck == murgbl.regcnt_remaining)
			{
				assertpro(multi_proc_in_use);	/* else: out-of-design situation; stuck in ALL regions */
				/* Check one last time if all regions are stuck waiting for another process to resolve the
				 * multi-region TP transaction. If so, wait in a sleep loop. If not, we can proceed.
				 */
				rctl = rctl_start;
				start_hrtbt_cntr = heartbeat_counter;
				do
				{
					if (IS_FORCED_MULTI_PROC_EXIT(mp_hdr))
					{	/* We are at a logical point. So exit if signaled by parent */
						status = ERR_FORCEDHALT;
						goto finish;
					}
					forw_multi = rctl->forw_multi;
					assert(NULL != forw_multi);
					sfm = forw_multi->shm_forw_multi;
					assert(NULL != sfm);
					assert(sfm->num_reg_seen_forward <= sfm->num_reg_seen_backward);
#					ifdef MUR_DEBUG
					fprintf(stderr, "Pid = %d : Line %d : token = %llu : forward = %d : backward = %d\n",
						process_id, __LINE__, (long long int)sfm->token,
						sfm->num_reg_seen_forward, sfm->num_reg_seen_backward);
#					endif
					if (sfm->num_reg_seen_forward == sfm->num_reg_seen_backward)
					{	/* We are no longer stuck in this region */
						assert(!forw_multi->no_longer_stuck);
						forw_multi->no_longer_stuck = TRUE;
						break;
					}
					rctl = rctl->next_rctl;	/* move on to the next available region */
					assert(NULL != rctl);
					if (rctl == rctl_start)
					{	/* We went through all regions once and are still stuck.
						 * Sleep until at least TWO heartbeats have elapsed, after which check for deadlock.
						 * Do this only in the child process that owns the FIRST region in the region list.
						 * This way we don't have contention for the GRAB_MULTI_PROC_LATCH from
						 * all children at more or less the same time.
						 */
						if ((rctl == mur_ctl) && (heartbeat_counter > (start_hrtbt_cntr + 2)))
						{	/* Check if all processes have been stuck for a while. If so, assertpro */
							GRAB_MULTI_PROC_LATCH_IF_NEEDED(release_latch);
							assert(release_latch);
							shm_rctl_start = mur_shm_hdr->shm_rctl_start;
							num_reg_stuck = 0;
							for (i = 0; i < murgbl.reg_total; i++)
							{
								shm_rctl = &shm_rctl_start[i];
								sfm = shm_rctl->shm_forw_multi;
								if (NULL != sfm)
								{
									if (sfm->num_reg_seen_forward != sfm->num_reg_seen_backward)
										num_reg_stuck++;
								}
							}
							REL_MULTI_PROC_LATCH_IF_NEEDED(release_latch);
							/* If everyone is stuck at this point, it is an out-of-design situation */
							assertpro(num_reg_stuck < murgbl.reg_total);
							start_hrtbt_cntr = heartbeat_counter;
						} else
						{	/* Sleep and recheck if any region we are stuck in got resolved.
							 * To minimize time spent sleeping, we just yield our timeslice.
							 */
							rel_quant();
							continue;
						}
					}
				} while (TRUE);
			} else
			{
				rctl = rctl->next_rctl;	/* move on to the next available region */
				assert(NULL != rctl);
				continue;
			}
		}
		regcnt_stuck = 0;	/* restart the counter now that we found at least one non-stuck region */
		MUR_CHANGE_REG(rctl);
		jctl = rctl->jctl;
		this_reg_stuck = FALSE;
		for (status = SS_NORMAL; SS_NORMAL == status; )
		{
			if (multi_proc && IS_FORCED_MULTI_PROC_EXIT(mp_hdr))
			{	/* We are at a logical point. So exit if signaled by parent */
				status = ERR_FORCEDHALT;
				goto finish;
			}
			assert(jctl == rctl->jctl);
			rec = rctl->mur_desc->jnlrec;
			rec_time = rec->prefix.time;
			if (rec_time > mur_options.before_time)
				break;	/* records after -BEFORE_TIME do not go to extract or losttrans or brkntrans files */
			assert((rec_time >= adjusted_resolve_time) || (mur_options.notncheck && !mur_options.verify));
			assert((0 == mur_options.after_time) || (mur_options.forward && !rctl->db_updated));
			if (rec_time < mur_options.after_time)
			{
				status = mur_next_rec(&jctl);
				continue;	/* records before -AFTER_TIME do not go to extract or losttrans or brkntrans files */
			}
			/* Check if the current journal record can be played right away or needs to wait for corresponding
			 * journal records from other participating TP regions to be reached. A non-TP or ZTP transaction can
			 * be played without issues (i.e. has no dependencies with any other regions). A single-region TP
			 * transaction too falls in the same category. A multi-region TP transaction needs to wait until all
			 * participating regions have played all journal records BEFORE this TP in order to ensure recover
			 * plays records in the exact same order that GT.M performed them in.
			 */
			/* If FENCE_NONE is specified, we would not have maintained any multi hashtable in mur_back_process for
			 * broken transaction processing. So we process multi-region TP transactions as multiple single-region
			 * TP transactions in the forward phase.
			 */
			if (FENCE_NONE != mur_options.fences)
			{
				rectype = (enum jnl_record_type)rec->prefix.jrec_type;
				if (IS_TP(rectype) && IS_TUPD(rectype))
				{
					assert(IS_SET_KILL_ZKILL_ZTWORM_LGTRIG_ZTRIG(rectype));
					assert(&rec->jrec_set_kill.num_participants == &rec->jrec_ztworm.num_participants);
					assert(&rec->jrec_set_kill.num_participants == &rec->jrec_lgtrig.num_participants);
					num_partners = rec->jrec_set_kill.num_participants;
					assert(0 < num_partners);
					if (1 < num_partners)
					{
						this_reg_stuck = TRUE;
						assert(&rec->jrec_set_kill.update_num == &rec->jrec_ztworm.update_num);
						assert(&rec->jrec_set_kill.update_num == &rec->jrec_lgtrig.update_num);
					}
				}
			}
			if (this_reg_stuck)
			{
				rec_token_seq = GET_JNL_SEQNO(rec);
				MUR_FORW_TOKEN_LOOKUP(forw_multi, rec_token_seq, rec_time);
				if (NULL != forw_multi)
				{	/* This token has already been seen in another region in forward processing.
					 * Add the current region as well. If all regions have been resolved, then play
					 * the entire transaction maintaining the exact same order of updates within.
					 */
					if (!forw_multi->no_longer_stuck)
						MUR_FORW_TOKEN_ONE_MORE_REG(forw_multi, rctl);
				} else
				{	/* First time we are seeing this token in forward processing. Check if this
					 * has already been determined to be a broken transaction.
					 */
					recstat = GOOD_TN;
					multi = NULL;
					if (IS_REC_POSSIBLY_BROKEN(rec_time, rec_token_seq))
					{
						multi = MUR_TOKEN_LOOKUP(rec_token_seq, rec_time, TPFENCE);
						if ((NULL != multi) && (0 < multi->partner))
							recstat = BROKEN_TN;
					}
					MUR_FORW_TOKEN_ADD(forw_multi, rec_token_seq, rec_time, rctl, num_partners,
								recstat, multi);
				}
				/* Check that the "tabent" field has been initialized above (by either the MUR_FORW_TOKEN_LOOKUP
				 * or MUR_FORW_TOKEN_ADD macros). This is relied upon by "mur_forward_play_multireg_tp" below.
				 */
				assert(NULL != forw_multi->u.tabent);
				assert(forw_multi->num_reg_seen_forward <= forw_multi->num_reg_seen_backward);
				if (multi_proc)
				{
					sfm = forw_multi->shm_forw_multi;
					ok_to_play = (NULL == sfm) || (sfm->num_reg_seen_forward == sfm->num_reg_seen_backward);
				} else
					ok_to_play = (forw_multi->num_reg_seen_forward == forw_multi->num_reg_seen_backward);
				assert(ok_to_play || !forw_multi->no_longer_stuck);
				if (ok_to_play)
				{	/* We have enough information to proceed with playing this multi-region TP in
					 * forward processing (even if we might not have seen all needed regions). Now play it.
					 * Note that the TP could be BROKEN_TN or GOOD_TN. The callee handles it.
					 */
					assert(forw_multi == rctl->forw_multi);
					status = mur_forward_play_multireg_tp(forw_multi, rctl);
					this_reg_stuck = FALSE;
					/* Note that as part of playing the TP transaction, we could have reached
					 * the EOF of rctl. In this case, we need to break out of the loop.
					 */
					if ((SS_NORMAL != status) || rctl->forw_eof_seen)
						break;
					assert(NULL == rctl->forw_multi);
					assert(!dollar_tlevel);
					jctl = rctl->jctl;	/* in case the first record after the most recently processed
								 * TP transaction is in the next generation journal file */
					continue;
				}
				break;
			} else
			{
				status = mur_forward_play_cur_jrec(rctl);
				if (SS_NORMAL != status)
					break;
			}
			assert(!this_reg_stuck);
			status = mur_next_rec(&jctl);
		}
		assert((NULL == rctl->forw_multi) || this_reg_stuck);
		assert((NULL != rctl->forw_multi) || !this_reg_stuck);
		if (!this_reg_stuck)
		{	/* We are not stuck in this region (to resolve a multi-region TP).
			 * This means we are done processing all the records of this region.
			 */
			assert(NULL == rctl->forw_multi);
			if (!rctl->forw_eof_seen)
			{
				CHECK_IF_EOF_REACHED(rctl, status);	/* sets rctl->forw_eof_seen if needed;
									 * resets "status" to SS_NORMAL */
				if (SS_NORMAL != status)
				{
					assert(ERR_FILENOTCREATE == status);
					goto finish;
				}
				assert(!dollar_tlevel);
				DELETE_RCTL_FROM_UNPROCESSED_LIST(rctl);	/* since all of its records should have
										 * been processed */
			} else
			{	/* EOF was seen in rctl inside "mur_forward_play_multireg_tp" and it was removed
				 * from the unprocessed list of rctls. At the time rctl was removed, its "next_rctl"
				 * field could have been pointing to another <rctl> that has since then also been
				 * removed inside the same function. Therefore the "next_rctl" field is not reliable
				 * in this case; instead we should rely on the global variable "rctl_start", which
				 * points to the list of unprocessed rctls. Set "next_rctl" accordingly.
				 */
				rctl->next_rctl = rctl_start;
				if (ERR_JNLREADEOF == status)
					status = SS_NORMAL;
			}
			assert(rctl->deleted_from_unprocessed_list);
		}
		assert(SS_NORMAL == status);
		assert(!this_reg_stuck || !rctl->forw_eof_seen);
		assert((NULL == rctl->next_rctl) || (NULL != rctl_start));
		assert((NULL == rctl->next_rctl) || (0 < murgbl.regcnt_remaining));
		rctl = rctl->next_rctl;	/* Note: even though "rctl" could have been deleted from the doubly linked list above,
					 * rctl->next_rctl is not touched, so we can still use it to get to the next element.
					 */
	}
	assert(0 == murgbl.regcnt_remaining);
	jgbl.mur_pini_addr_reset_fnptr = NULL;	/* no more simulation of GT.M activity for any region */
	prc_vec = murgbl.prc_vec;	/* use the process-vector of MUPIP RECOVER (not any simulating GT.M process) from now on */
	assert(0 == dollar_tlevel);
	for (rctl = mur_ctl, rctl_top = mur_ctl + murgbl.reg_total; rctl < rctl_top; rctl++)
	{
		if (!rctl->this_pid_is_owner)
		{
			assert(multi_proc_in_use);
			continue;	/* in a parallel processing environment, process only regions we own */
		}
		if (multi_proc)
		{	/* Set key to print this rctl's region-name as prefix in case this forked-off process prints any output */
			MUR_SET_MULTI_PROC_KEY(rctl, multi_proc_key);
		}
		PRINT_VERBOSE_STAT(rctl->jctl, "mur_forward:at the end");
		assert(!mur_options.rollback || (0 != murgbl.consist_jnl_seqno));
		assert(mur_options.rollback || (0 == murgbl.consist_jnl_seqno));
		assert(!dollar_tlevel);	/* in case it applied a broken TUPD */
		assert(murgbl.ok_to_update_db || !rctl->db_updated);
		rctl->mur_plst = NULL;	/* reset now that simulation of GT.M updates is done */
		/* Ensure mur_block_count_correct is called if updates are allowed */
		if (murgbl.ok_to_update_db && (SS_NORMAL != mur_block_count_correct(rctl)))
		{
			gtm_putmsg_csa(CSA_ARG(rctl->csa) VARLSTCNT(4) ERR_BLKCNTEDITFAIL, 2, DB_LEN_STR(rctl->gd));
			murgbl.wrn_count++;
		}
	}
finish:
	if (multi_proc)
		multi_proc_key = NULL;	/* reset key until it can be set to an rctl's region-name again */
	if ((SS_NORMAL == status) && mur_options.show)
		mur_output_show();
	if (NULL != first_shm_rctl)
	{	/* Transfer needed process-private information to shared memory so the parent process can later inherit it. */
		first_shm_rctl->err_cnt = murgbl.err_cnt;
		first_shm_rctl->wrn_count = murgbl.wrn_count;
		first_shm_rctl->consist_jnl_seqno = murgbl.consist_jnl_seqno;
		/* If extract files were created by this process for one or more regions, then copy that information to
		 * shared memory so the parent process can use it to do a merge sort.
		 */
		shm_rctl = mur_shm_hdr->shm_rctl_start;
		for (rctl = mur_ctl, rctl_top = mur_ctl + murgbl.reg_total; rctl < rctl_top; rctl++, shm_rctl++)
		{
			assert(multi_proc_in_use);
			if (!rctl->this_pid_is_owner)
				continue;	/* in a parallel processing environment, process only regions we own */
			/* Cancel any flush/dbsync timers by this child process for this region. This is because the
			 * child is not going to go through exit handling code (no gds_rundown etc.), and we need to
			 * clear up csa->nl->wcs_timers (normally done by gds_rundown).
			 */
			if (NULL != rctl->csa)	/* rctl->csa can be NULL in case of "mupip journal -extract" etc. */
				CANCEL_DB_TIMERS(rctl->gd, rctl->csa, cancelled_timer, cancelled_dbsync_timer);
			reccnt = 0;
			for (size_ptr = &rctl->jnlext_multi_list_size[0], recstat = 0;
					recstat < TOT_EXTR_TYPES; recstat++, size_ptr++)
			{	/* Assert "extr_file_created" information is in sync between rctl and shm_rctl.
				 * This was done at the end of "mur_cre_file_extfmt".
				 */
				assert(shm_rctl->extr_file_created[recstat] == rctl->extr_file_created[recstat]);
				/* Assert that if *size_ptr is non-zero, then we better have created an extract file.
				 * Note that the converse is not true. It is possible we created a file, for example to
				 * write an INCTN record, but decided to not write anything because it was not a -detail
				 * type of extract. So *size_ptr could be 0 even though we created the extract file.
				 */
				assert(!*size_ptr || rctl->extr_file_created[recstat]);
				shm_rctl->jnlext_list_size[recstat] = *size_ptr;
				reccnt += *size_ptr;
			}
			assert(INVALID_SHMID == shm_rctl->jnlext_shmid);
			shm_size = reccnt * SIZEOF(jnlext_multi_t);
			/* If we are quitting because of an abnormal status OR a forced signal to terminate
			 * OR if the parent is dead (kill -9), don't bother creating a shmid to communicate back with the parent.
			 */
			if (mp_hdr->parent_pid != getppid())
			{
				SET_FORCED_MULTI_PROC_EXIT;	/* also signal sibling children to stop processing */
				if (SS_NORMAL != status)
					status = ERR_FORCEDHALT;
			}
			if ((SS_NORMAL == status) && shm_size)
			{
				shmid = shmget(IPC_PRIVATE, shm_size, 0600 | IPC_CREAT);
				if (-1 == shmid)
				{
					save_errno = errno;
					SNPRINTF(errstr, SIZEOF(errstr), "shmget() : shmsize=0x%llx", shm_size);
					MUR_SET_MULTI_PROC_KEY(rctl, multi_proc_key);	/* to print region name prefix */
					rts_error_csa(CSA_ARG(NULL) VARLSTCNT(8) ERR_SYSCALL, 5,
							LEN_AND_STR(errstr), CALLFROM, save_errno);
				}
				shmPtr = (char *)do_shmat(shmid, 0, 0);
				if (-1 == (sm_long_t)shmPtr)
				{
					save_errno = errno;
					SNPRINTF(errstr, SIZEOF(errstr), "shmat() : shmid=%d shmsize=0x%llx", shmid, shm_size);
					MUR_SET_MULTI_PROC_KEY(rctl, multi_proc_key);	/* to print region name prefix */
					rts_error_csa(CSA_ARG(NULL) VARLSTCNT(8) ERR_SYSCALL, 5,
							LEN_AND_STR(errstr), CALLFROM, save_errno);
				}
				shm_rctl->jnlext_shmid = shmid;
				shm_rctl->jnlext_shm_size = shm_size;
				for (size_ptr = &rctl->jnlext_multi_list_size[0], recstat = 0;
						recstat < TOT_EXTR_TYPES; recstat++, size_ptr++)
				{
					shm_size = *size_ptr;
					if (shm_size)
					{
						copy_size = copy_list_to_buf(rctl->jnlext_multi_list[recstat],
										(int4)shm_size, shmPtr);
						assert(copy_size == (shm_size * SIZEOF(jnlext_multi_t)));
						shmPtr += copy_size;
					}
				}
			}
		}
	}
	mur_close_file_extfmt(IN_MUR_CLOSE_FILES_FALSE);	/* need to flush buffered extract/losttrans/brokentrans files */
	return (int)status;
}
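/* Illustrative sketch (not part of GT.M): the shmget(IPC_PRIVATE)/shmat handoff used above to pass per-region
 * extract lists back to the parent, reduced to a minimal parent/child example. Here the shmid travels over a
 * pipe; in mur_forward_multi_proc it is stored in shm_rctl->jnlext_shmid, which already lives in shared memory.
 * Compile with -DMUR_SHM_HANDOFF_DEMO to build the demo on its own.
 */
#ifdef MUR_SHM_HANDOFF_DEMO
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/ipc.h>
#include <sys/shm.h>
#include <sys/wait.h>

int main(void)
{
	int	fd[2], shmid, status;
	char	*p;

	if (0 != pipe(fd))
		exit(1);
	if (0 == fork())
	{	/* child: create a private segment, fill it, and send the shmid to the parent */
		shmid = shmget(IPC_PRIVATE, 4096, 0600 | IPC_CREAT);
		if (-1 == shmid)
			exit(1);
		p = (char *)shmat(shmid, NULL, 0);
		if ((void *)-1 == (void *)p)
			exit(1);
		strcpy(p, "per-region results produced by the child");
		shmdt(p);
		write(fd[1], &shmid, sizeof(shmid));	/* stand-in for shm_rctl->jnlext_shmid */
		exit(0);
	}
	read(fd[0], &shmid, sizeof(shmid));
	wait(&status);				/* the segment persists after the child exits */
	p = (char *)shmat(shmid, NULL, 0);
	if ((void *)-1 != (void *)p)
	{
		printf("parent read: %s\n", p);
		shmdt(p);
	}
	shmctl(shmid, IPC_RMID, NULL);		/* private segments must be removed explicitly */
	return 0;
}
#endif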
int4 gds_rundown(void) { boolean_t canceled_dbsync_timer, canceled_flush_timer, ok_to_write_pfin; boolean_t have_standalone_access, ipc_deleted, err_caught; boolean_t is_cur_process_ss_initiator, remove_shm, vermismatch, we_are_last_user, we_are_last_writer, is_mm; boolean_t unsafe_last_writer; char time_str[CTIME_BEFORE_NL + 2]; /* for GET_CUR_TIME macro */ gd_region *reg; int save_errno, status, rc; int4 semval, ftok_semval, sopcnt, ftok_sopcnt; short crash_count; sm_long_t munmap_len; sgmnt_addrs *csa; sgmnt_data_ptr_t csd; node_local_ptr_t cnl; struct shmid_ds shm_buf; struct sembuf sop[2], ftok_sop[2]; uint4 jnl_status; unix_db_info *udi; jnl_private_control *jpc; jnl_buffer_ptr_t jbp; shm_snapshot_t *ss_shm_ptr; uint4 ss_pid, onln_rlbk_pid, holder_pid; boolean_t was_crit; boolean_t safe_mode; /* Do not flush or take down shared memory. */ boolean_t bypassed_ftok = FALSE, bypassed_access = FALSE, may_bypass_ftok, inst_is_frozen, ftok_counter_halted, access_counter_halted; int secshrstat; intrpt_state_t prev_intrpt_state; DCL_THREADGBL_ACCESS; SETUP_THREADGBL_ACCESS; jnl_status = 0; reg = gv_cur_region; /* Local copy */ /* early out for cluster regions * to avoid tripping the assert below. * Note: * This early out is consistent with VMS. It has been * noted that all of the gtcm assignments * to gv_cur_region should use the TP_CHANGE_REG * macro. This would also avoid the assert problem * and should be done eventually. */ if (dba_cm == reg->dyn.addr->acc_meth) return EXIT_NRM; udi = FILE_INFO(reg); csa = &udi->s_addrs; csd = csa->hdr; assert(csa == cs_addrs && csd == cs_data); if ((reg->open) && (dba_usr == csd->acc_meth)) { change_reg(); gvusr_rundown(); return EXIT_NRM; } /* If the process has standalone access, it has udi->grabbed_access_sem set to TRUE at this point. Note that down in a local * variable as the udi->grabbed_access_sem is set to TRUE even for non-standalone access below and hence we can't rely on * that later to determine if the process had standalone access or not when it entered this function. We need to guarantee * that none else access database file header when semid/shmid fields are reset. We already have created ftok semaphore in * db_init or, mu_rndwn_file and did not remove it. So just lock it. We do it in blocking mode. */ have_standalone_access = udi->grabbed_access_sem; /* process holds standalone access */ DEFER_INTERRUPTS(INTRPT_IN_GDS_RUNDOWN, prev_intrpt_state); ESTABLISH_NORET(gds_rundown_ch, err_caught); if (err_caught) { REVERT; WITH_CH(gds_rundown_ch, gds_rundown_err_cleanup(have_standalone_access), 0); ENABLE_INTERRUPTS(INTRPT_IN_GDS_RUNDOWN, prev_intrpt_state); DEBUG_ONLY(ok_to_UNWIND_in_exit_handling = FALSE); return EXIT_ERR; } assert(reg->open); /* if we failed to open, dbinit_ch should have taken care of proper clean up */ assert(!reg->opening); /* see comment above */ assert((dba_bg == csd->acc_meth) || (dba_mm == csd->acc_meth)); is_mm = (dba_bg != csd->acc_meth); assert(!csa->hold_onto_crit || (csa->now_crit && jgbl.onlnrlbk)); /* If we are online rollback, we should already be holding crit and should release it only at the end of this module. This * is usually done by noting down csa->now_crit in a local variable (was_crit) and using it whenever we are about to * grab_crit. But, there are instances (like mupip_set_journal.c) where we grab_crit but invoke gds_rundown without any * preceeding rel_crit. Such code relies on the fact that gds_rundown does rel_crit unconditionally (to get locks to a known * state). 
So, augment csa->now_crit with jgbl.onlnrlbk to track if we can rel_crit unconditionally or not in gds_rundown. */ was_crit = (csa->now_crit && jgbl.onlnrlbk); /* Cancel any pending flush timer for this region by this task */ canceled_flush_timer = FALSE; canceled_dbsync_timer = FALSE; CANCEL_DB_TIMERS(reg, csa, canceled_flush_timer, canceled_dbsync_timer); we_are_last_user = FALSE; inst_is_frozen = IS_REPL_INST_FROZEN && REPL_ALLOWED(csa->hdr); if (!csa->persistent_freeze) region_freeze(reg, FALSE, FALSE, FALSE); if (!was_crit) { rel_crit(reg); /* get locks to known state */ mutex_cleanup(reg); } /* The only process that can invoke gds_rundown while holding access control semaphore is RECOVER/ROLLBACK. All the others * (like MUPIP SET -FILE/MUPIP EXTEND would have invoked db_ipcs_reset() before invoking gds_rundown (from * mupip_exit_handler). The only exception is when these processes encounter a terminate signal and they reach * mupip_exit_handler while holding access control semaphore. Assert accordingly. */ assert(!have_standalone_access || mupip_jnl_recover || process_exiting); /* If we have standalone access, then ensure that a concurrent online rollback cannot be running at the same time as it * needs the access control lock as well. The only expection is we are online rollback and currently running down. */ cnl = csa->nl; onln_rlbk_pid = cnl->onln_rlbk_pid; assert(!have_standalone_access || mupip_jnl_recover || !onln_rlbk_pid || !is_proc_alive(onln_rlbk_pid, 0)); if (!have_standalone_access) { if (-1 == (ftok_semval = semctl(udi->ftok_semid, DB_COUNTER_SEM, GETVAL))) /* Check # of procs counted on FTOK */ { save_errno = errno; assert(FALSE); rts_error_csa(CSA_ARG(csa) VARLSTCNT(12) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg), ERR_SYSCALL, 5, RTS_ERROR_TEXT("gds_rundown SEMCTL failed to get ftok_semval"), CALLFROM, errno); } may_bypass_ftok = CAN_BYPASS(ftok_semval, csd, inst_is_frozen); /* Do we need a blocking wait? */ /* We need to guarantee that no one else access database file header when semid/shmid fields are reset. * We already have created ftok semaphore in db_init or mu_rndwn_file and did not remove it. So just lock it. */ if (!ftok_sem_lock(reg, may_bypass_ftok)) { if (may_bypass_ftok) { /* We did a non-blocking wait. It's ok to proceed without locking */ bypassed_ftok = TRUE; holder_pid = semctl(udi->ftok_semid, DB_CONTROL_SEM, GETPID); if ((uint4)-1 == holder_pid) rts_error_csa(CSA_ARG(csa) VARLSTCNT(12) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg), ERR_SYSCALL, 5, RTS_ERROR_TEXT("gds_rundown SEMCTL failed to get holder_pid"), CALLFROM, errno); if (!IS_GTM_IMAGE) /* MUMPS processes should not flood syslog with bypass messages. */ { send_msg_csa(CSA_ARG(csa) VARLSTCNT(12) ERR_RESRCINTRLCKBYPAS, 10, LEN_AND_STR(gtmImageNames[image_type].imageName), process_id, LEN_AND_LIT("FTOK"), REG_LEN_STR(reg), DB_LEN_STR(reg), holder_pid); send_msg_csa(CSA_ARG(NULL) VARLSTCNT(4) ERR_TEXT, 2, LEN_AND_LIT("FTOK bypassed at rundown")); } } else { /* We did a blocking wait but something bad happened. 
*/ FTOK_TRACE(csa, csa->ti->curr_tn, ftok_ops_lock, process_id); rts_error_csa(CSA_ARG(csa) VARLSTCNT(4) ERR_DBFILERR, 2, DB_LEN_STR(reg)); } } sop[0].sem_num = DB_CONTROL_SEM; sop[0].sem_op = 0; /* Wait for 0 */ sop[1].sem_num = DB_CONTROL_SEM; sop[1].sem_op = 1; /* Lock */ sopcnt = 2; sop[0].sem_flg = sop[1].sem_flg = SEM_UNDO | IPC_NOWAIT; /* Don't wait the first time thru */ SEMOP(udi->semid, sop, sopcnt, status, NO_WAIT); if (0 != status) { save_errno = errno; /* Check # of processes counted on access sem. */ if (-1 == (semval = semctl(udi->semid, DB_COUNTER_SEM, GETVAL))) { assert(FALSE); rts_error_csa(CSA_ARG(csa) VARLSTCNT(12) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg), ERR_SYSCALL, 5, RTS_ERROR_TEXT("gds_rundown SEMCTL failed to get semval"), CALLFROM, errno); } bypassed_access = CAN_BYPASS(semval, csd, inst_is_frozen) || onln_rlbk_pid || csd->file_corrupt; /* Before attempting again in the blocking mode, see if the holding process is an online rollback. * If so, it is likely we won't get the access control semaphore anytime soon. In that case, we * are better off skipping rundown and continuing with sanity cleanup and exit. */ holder_pid = semctl(udi->semid, DB_CONTROL_SEM, GETPID); if ((uint4)-1 == holder_pid) rts_error_csa(CSA_ARG(csa) VARLSTCNT(12) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg), ERR_SYSCALL, 5, RTS_ERROR_TEXT("gds_rundown SEMCTL failed to get holder_pid"), CALLFROM, errno); if (!bypassed_access) { /* We couldn't get it in one shot-- see if we already have it */ if (holder_pid == process_id) { send_msg_csa(CSA_ARG(csa) VARLSTCNT(5) MAKE_MSG_INFO(ERR_CRITSEMFAIL), 2, DB_LEN_STR(reg), ERR_RNDWNSEMFAIL); REVERT; ENABLE_INTERRUPTS(INTRPT_IN_GDS_RUNDOWN, prev_intrpt_state); assert(FALSE); return EXIT_ERR; } if (EAGAIN != save_errno) { assert(FALSE); rts_error_csa(CSA_ARG(csa) VARLSTCNT(12) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg), ERR_SYSCALL, 5, RTS_ERROR_TEXT("gds_rundown SEMOP on access control semaphore"), CALLFROM, save_errno); } sop[0].sem_flg = sop[1].sem_flg = SEM_UNDO; /* Try again - blocking this time */ SEMOP(udi->semid, sop, 2, status, FORCED_WAIT); if (-1 == status) /* We couldn't get it at all.. */ rts_error_csa(CSA_ARG(csa) VARLSTCNT(12) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg), ERR_SYSCALL, 5, RTS_ERROR_TEXT("gds_rundown SEMOP on access control semaphore"), CALLFROM, errno); } else if (!IS_GTM_IMAGE) { send_msg_csa(CSA_ARG(csa) VARLSTCNT(12) ERR_RESRCINTRLCKBYPAS, 10, LEN_AND_STR(gtmImageNames[image_type].imageName), process_id, LEN_AND_LIT("access control"), REG_LEN_STR(reg), DB_LEN_STR(reg), holder_pid); send_msg_csa(CSA_ARG(NULL) VARLSTCNT(4) ERR_TEXT, 2, LEN_AND_LIT("Access control bypassed at rundown")); } udi->grabbed_access_sem = !bypassed_access; } } /* else we we hold the access control semaphore and therefore have standalone access. We do not release it now - we * release it later in mupip_exit_handler.c. Since we already hold the access control semaphore, we don't need the * ftok semaphore and trying it could cause deadlock */ /* Note that in the case of online rollback, "udi->grabbed_access_sem" (and in turn "have_standalone_access") is TRUE. * But there could be other processes still having the database open so we cannot safely reset the halted fields. */ if (have_standalone_access && !jgbl.onlnrlbk) csd->ftok_counter_halted = csd->access_counter_halted = FALSE; ftok_counter_halted = csd->ftok_counter_halted; access_counter_halted = csd->access_counter_halted; /* If we bypassed any of the semaphores, activate safe mode. 
* Also, if the replication instance is frozen and this db has replication turned on (which means * no flushes of dirty buffers to this db can happen while the instance is frozen) activate safe mode. */ ok_to_write_pfin = !(bypassed_access || bypassed_ftok || inst_is_frozen); safe_mode = !ok_to_write_pfin || ftok_counter_halted || access_counter_halted; /* At this point we are guaranteed no one else is doing a db_init/rundown as we hold the access control semaphore */ assert(csa->ref_cnt); /* decrement private ref_cnt before shared ref_cnt decrement. */ csa->ref_cnt--; /* Currently journaling logic in gds_rundown() in VMS relies on this order to detect last writer */ assert(!csa->ref_cnt); --cnl->ref_cnt; if (memcmp(cnl->now_running, gtm_release_name, gtm_release_name_len + 1)) { /* VERMISMATCH condition. Possible only if DSE */ assert(dse_running); vermismatch = TRUE; } else vermismatch = FALSE; if (-1 == shmctl(udi->shmid, IPC_STAT, &shm_buf)) { save_errno = errno; rts_error_csa(CSA_ARG(csa) VARLSTCNT(12) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg), ERR_SYSCALL, 5, RTS_ERROR_TEXT("gds_rundown shmctl"), CALLFROM, save_errno); } else we_are_last_user = (1 == shm_buf.shm_nattch) && !vermismatch && !safe_mode; /* recover => one user except ONLINE ROLLBACK, or standalone with frozen instance */ assert(!have_standalone_access || we_are_last_user || jgbl.onlnrlbk || inst_is_frozen); if (-1 == (semval = semctl(udi->semid, DB_COUNTER_SEM, GETVAL))) rts_error_csa(CSA_ARG(csa) VARLSTCNT(12) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg), ERR_SYSCALL, 5, RTS_ERROR_TEXT("gds_rundown SEMCTL failed to get semval"), CALLFROM, errno); /* There's one writer left and I am it */ assert(reg->read_only || semval >= 0); unsafe_last_writer = (DB_COUNTER_SEM_INCR == semval) && (FALSE == reg->read_only) && !vermismatch; we_are_last_writer = unsafe_last_writer && !safe_mode; assert(!we_are_last_writer || !safe_mode); assert(!we_are_last_user || !safe_mode); /* recover + R/W region => one writer except ONLINE ROLLBACK, or standalone with frozen instance, leading to safe_mode */ assert(!(have_standalone_access && !reg->read_only) || we_are_last_writer || jgbl.onlnrlbk || inst_is_frozen); GTM_WHITE_BOX_TEST(WBTEST_ANTIFREEZE_JNLCLOSE, we_are_last_writer, 1); /* Assume we are the last writer to invoke wcs_flu */ if (!have_standalone_access && (-1 == (ftok_semval = semctl(udi->ftok_semid, DB_COUNTER_SEM, GETVAL)))) rts_error_csa(CSA_ARG(csa) VARLSTCNT(12) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg), ERR_SYSCALL, 5, RTS_ERROR_TEXT("gds_rundown SEMCTL failed to get ftok_semval"), CALLFROM, errno); if (NULL != csa->ss_ctx) ss_destroy_context(csa->ss_ctx); /* SS_MULTI: If multiple snapshots are supported, then we have to run through each of the snapshots */ assert(1 == MAX_SNAPSHOTS); ss_shm_ptr = (shm_snapshot_ptr_t)SS_GETSTARTPTR(csa); ss_pid = ss_shm_ptr->ss_info.ss_pid; is_cur_process_ss_initiator = (process_id == ss_pid); if (ss_pid && (is_cur_process_ss_initiator || we_are_last_user)) { /* Try getting snapshot crit latch. If we don't get latch, we won't hang for eternity and will skip * doing the orphaned snapshot cleanup. It will be cleaned up eventually either by subsequent MUPIP * INTEG or by a MUPIP RUNDOWN. 
*/ if (ss_get_lock_nowait(reg) && (ss_pid == ss_shm_ptr->ss_info.ss_pid) && (is_cur_process_ss_initiator || !is_proc_alive(ss_pid, 0))) { ss_release(NULL); ss_release_lock(reg); } } /* If cnl->donotflush_dbjnl is set, it means mupip recover/rollback was interrupted and therefore we need not flush * shared memory contents to disk as they might be in an inconsistent state. Moreover, any more flushing will only cause * future rollback to undo more journal records (PBLKs). In this case, we will go ahead and remove shared memory (without * flushing the contents) in this routine. A reissue of the recover/rollback command will restore the database to a * consistent state. */ if (!cnl->donotflush_dbjnl && !reg->read_only && !vermismatch) { /* If we had an orphaned block and were interrupted, set wc_blocked so we can invoke wcs_recover. Do it ONLY * if there is NO concurrent online rollback running (as we need crit to set wc_blocked) */ if (csa->wbuf_dqd && !is_mm) { /* If we had an orphaned block and were interrupted, mupip_exit_handler will invoke secshr_db_clnup which * will clear this field and so we should never come to gds_rundown with a non-zero wbuf_dqd. The only * exception is if we are recover/rollback in which case gds_rundown (from mur_close_files) is invoked * BEFORE secshr_db_clnup in mur_close_files. * Note: It is NOT possible for online rollback to reach here with wbuf_dqd being non-zero. This is because * the moment we apply the first PBLK, we stop all interrupts and hence can never be interrupted in * wcs_wtstart or wcs_get_space. Assert accordingly. */ assert(mupip_jnl_recover && !jgbl.onlnrlbk && !safe_mode); if (!was_crit) grab_crit(reg); SET_TRACEABLE_VAR(cnl->wc_blocked, TRUE); BG_TRACE_PRO_ANY(csa, wcb_gds_rundown); send_msg_csa(CSA_ARG(csa) VARLSTCNT(8) ERR_WCBLOCKED, 6, LEN_AND_LIT("wcb_gds_rundown"), process_id, &csa->ti->curr_tn, DB_LEN_STR(reg)); csa->wbuf_dqd = 0; wcs_recover(reg); BG_TRACE_PRO_ANY(csa, lost_block_recovery); if (!was_crit) rel_crit(reg); } if (JNL_ENABLED(csd) && IS_GTCM_GNP_SERVER_IMAGE) originator_prc_vec = NULL; /* If we are the last writing user, then everything must be flushed */ if (we_are_last_writer) { /* Time to flush out all of our buffers */ assert(!safe_mode); if (is_mm) { MM_DBFILEXT_REMAP_IF_NEEDED(csa, reg); cnl->remove_shm = TRUE; } if (cnl->wc_blocked && jgbl.onlnrlbk) { /* if the last update done by online rollback was not committed in the normal code-path but was * completed by secshr_db_clnup, wc_blocked will be set to TRUE. But, since online rollback never * invokes grab_crit (since csa->hold_onto_crit is set to TRUE), wcs_recover is never invoked. This * could result in the last update never getting flushed to the disk and if online rollback happened * to be the last writer then the shared memory will be flushed and removed and the last update will * be lost. So, force wcs_recover if we find ourselves in such a situation. But, wc_blocked is * possible only if phase1 or phase2 errors are induced using white box test cases */ assert(WB_COMMIT_ERR_ENABLED); wcs_recover(reg); } /* Note WCSFLU_SYNC_EPOCH ensures the epoch is synced to the journal and indirectly * also ensures that the db is fsynced. We don't want to use it in the calls to * wcs_flu() from t_end() and tp_tend() since we can defer it to out-of-crit there. * In this case, since we are running down, we don't have any such option. 
*/ cnl->remove_shm = wcs_flu(WCSFLU_FLUSH_HDR | WCSFLU_WRITE_EPOCH | WCSFLU_SYNC_EPOCH); /* Since we_are_last_writer, we should be guaranteed that wcs_flu() did not change csd, (in * case of MM for potential file extension), even if it did a grab_crit(). Therefore, make * sure that's true. */ assert(csd == csa->hdr); assert(0 == memcmp(csd->label, GDS_LABEL, GDS_LABEL_SZ - 1)); } else if (((canceled_flush_timer && (0 > cnl->wcs_timers)) || canceled_dbsync_timer) && !inst_is_frozen) { /* canceled pending db or jnl flush timers - flush database and journal buffers to disk */ if (!was_crit) grab_crit(reg); /* we need to sync the epoch as the fact that there is no active pending flush timer implies * there will be noone else who will flush the dirty buffers and EPOCH to disk in a timely fashion */ wcs_flu(WCSFLU_FLUSH_HDR | WCSFLU_WRITE_EPOCH | WCSFLU_SYNC_EPOCH); if (!was_crit) rel_crit(reg); assert((dba_mm == cs_data->acc_meth) || (csd == cs_data)); csd = cs_data; /* In case this is MM and wcs_flu() remapped an extended database, reset csd */ } /* Do rundown journal processing after buffer flushes since they require jnl to be open */ if (JNL_ENABLED(csd)) { /* the following tp_change_reg() is not needed due to the assert csa == cs_addrs at the beginning * of gds_rundown(), but just to be safe. To be removed by 2002!! --- nars -- 2001/04/25. */ tp_change_reg(); /* call this because jnl_ensure_open checks cs_addrs rather than gv_cur_region */ jpc = csa->jnl; jbp = jpc->jnl_buff; if (jbp->fsync_in_prog_latch.u.parts.latch_pid == process_id) { assert(FALSE); COMPSWAP_UNLOCK(&jbp->fsync_in_prog_latch, process_id, 0, LOCK_AVAILABLE, 0); } if (jbp->io_in_prog_latch.u.parts.latch_pid == process_id) { assert(FALSE); COMPSWAP_UNLOCK(&jbp->io_in_prog_latch, process_id, 0, LOCK_AVAILABLE, 0); } if ((((NOJNL != jpc->channel) && !JNL_FILE_SWITCHED(jpc)) || we_are_last_writer && (0 != cnl->jnl_file.u.inode)) && ok_to_write_pfin) { /* We need to close the journal file cleanly if we have the latest generation journal file open * or if we are the last writer and the journal file is open in shared memory (not necessarily * by ourselves e.g. the only process that opened the journal got shot abnormally) * Note: we should not infer anything from the shared memory value of cnl->jnl_file.u.inode * if we are not the last writer as it can be concurrently updated. */ if (!was_crit) grab_crit(reg); if (JNL_ENABLED(csd)) { SET_GBL_JREC_TIME; /* jnl_ensure_open/jnl_put_jrt_pini/pfin/jnl_file_close all need it */ /* Before writing to jnlfile, adjust jgbl.gbl_jrec_time if needed to maintain time order * of jnl records. This needs to be done BEFORE the jnl_ensure_open as that could write * journal records (if it decides to switch to a new journal file). */ ADJUST_GBL_JREC_TIME(jgbl, jbp); jnl_status = jnl_ensure_open(); if (0 == jnl_status) { /* If we_are_last_writer, we would have already done a wcs_flu() which would * have written an epoch record and we are guaranteed no further updates * since we are the last writer. So, just close the journal. * If the freeaddr == post_epoch_freeaddr, wcs_flu may have skipped writing * a pini, so allow for that. */ assert(!jbp->before_images || is_mm || !we_are_last_writer || (0 != jpc->pini_addr) || jgbl.mur_extract || (jpc->jnl_buff->freeaddr == jpc->jnl_buff->post_epoch_freeaddr)); /* If we haven't written a pini, let jnl_file_close write the pini/pfin. 
*/ if (!jgbl.mur_extract && (0 != jpc->pini_addr)) jnl_put_jrt_pfin(csa); /* If not the last writer and no pending flush timer left, do jnl flush now */ if (!we_are_last_writer && (0 > cnl->wcs_timers)) { if (SS_NORMAL == (jnl_status = jnl_flush(reg))) { assert(jbp->freeaddr == jbp->dskaddr); jnl_fsync(reg, jbp->dskaddr); assert(jbp->fsync_dskaddr == jbp->dskaddr); } else { send_msg_csa(CSA_ARG(csa) VARLSTCNT(9) ERR_JNLFLUSH, 2, JNL_LEN_STR(csd), ERR_TEXT, 2, RTS_ERROR_TEXT("Error with journal flush in gds_rundown"), jnl_status); assert(NOJNL == jpc->channel);/* jnl file lost has been triggered */ /* In this routine, all code that follows from here on does not * assume anything about the journaling characteristics of this * database so it is safe to continue execution even though * journaling got closed in the middle. */ } } jnl_file_close(reg, we_are_last_writer, FALSE); } else send_msg_csa(CSA_ARG(csa) VARLSTCNT(6) jnl_status, 4, JNL_LEN_STR(csd), DB_LEN_STR(reg)); } if (!was_crit) rel_crit(reg); } } if (we_are_last_writer) /* Flush the fileheader last and harden the file to disk */ { if (!was_crit) grab_crit(reg); /* To satisfy crit requirement in fileheader_sync() */ memset(csd->machine_name, 0, MAX_MCNAMELEN); /* clear the machine_name field */ if (!have_standalone_access && we_are_last_user) { /* mupip_exit_handler will do this after mur_close_file */ csd->semid = INVALID_SEMID; csd->shmid = INVALID_SHMID; csd->gt_sem_ctime.ctime = 0; csd->gt_shm_ctime.ctime = 0; } fileheader_sync(reg); if (!was_crit) rel_crit(reg); if (!is_mm) { GTM_DB_FSYNC(csa, udi->fd, rc); /* Sync it all */ if (-1 == rc) { rts_error_csa(CSA_ARG(csa) VARLSTCNT(9) ERR_DBFILERR, 2, DB_LEN_STR(reg), ERR_TEXT, 2, RTS_ERROR_TEXT("Error during file sync at close"), errno); } } else { /* Now do final MM file sync before exit */ assert(csa->ti->total_blks == csa->total_blks); #ifdef _AIX GTM_DB_FSYNC(csa, udi->fd, rc); if (-1 == rc) #else if (-1 == MSYNC((caddr_t)csa->db_addrs[0], (caddr_t)csa->db_addrs[1])) #endif { rts_error_csa(CSA_ARG(csa) VARLSTCNT(9) ERR_DBFILERR, 2, DB_LEN_STR(reg), ERR_TEXT, 2, RTS_ERROR_TEXT("Error during file sync at close"), errno); } } } else if (unsafe_last_writer && !cnl->lastwriterbypas_msg_issued) { send_msg_csa(CSA_ARG(csa) VARLSTCNT(4) ERR_LASTWRITERBYPAS, 2, DB_LEN_STR(reg)); cnl->lastwriterbypas_msg_issued = TRUE; } } /* end if (!reg->read_only && !cnl->donotflush_dbjnl) */ /* We had canceled all db timers at start of rundown. In case as part of rundown (wcs_flu above), we had started * any timers, cancel them BEFORE setting reg->open to FALSE (assert in wcs_clean_dbsync relies on this). */ CANCEL_DB_TIMERS(reg, csa, canceled_flush_timer, canceled_dbsync_timer); if (reg->read_only && we_are_last_user && !have_standalone_access && cnl->remove_shm) { /* mupip_exit_handler will do this after mur_close_file */ db_ipcs.semid = INVALID_SEMID; db_ipcs.shmid = INVALID_SHMID; db_ipcs.gt_sem_ctime = 0; db_ipcs.gt_shm_ctime = 0; db_ipcs.fn_len = reg->dyn.addr->fname_len; memcpy(db_ipcs.fn, reg->dyn.addr->fname, reg->dyn.addr->fname_len); db_ipcs.fn[reg->dyn.addr->fname_len] = 0; /* request gtmsecshr to flush. 
	   read_only cannot flush itself */
		WAIT_FOR_REPL_INST_UNFREEZE_SAFE(csa);
		if (!csa->read_only_fs)
		{
			secshrstat = send_mesg2gtmsecshr(FLUSH_DB_IPCS_INFO, 0, (char *)NULL, 0);
			if (0 != secshrstat)
				rts_error_csa(CSA_ARG(csa) VARLSTCNT(8) ERR_DBFILERR, 2, DB_LEN_STR(reg), ERR_TEXT, 2,
					RTS_ERROR_TEXT("gtmsecshr failed to update database file header"));
		}
	}
	/* Done with file now, close it */
	CLOSEFILE_RESET(udi->fd, rc);	/* resets "udi->fd" to FD_INVALID */
	if (-1 == rc)
	{
		rts_error_csa(CSA_ARG(csa) VARLSTCNT(9) ERR_DBFILERR, 2, DB_LEN_STR(reg), ERR_TEXT, 2,
			LEN_AND_LIT("Error during file close"), errno);
	}
	/* Unmap storage if mm mode but only the part that is not the fileheader (so it shows up in dumps) */
#	if !defined(_AIX)
	if (is_mm && (NULL != csa->db_addrs[0]))
	{
		assert(csa->db_addrs[1] > csa->db_addrs[0]);
		munmap_len = (sm_long_t)(csa->db_addrs[1] - csa->db_addrs[0]);
		if (0 < munmap_len)
			munmap((caddr_t)(csa->db_addrs[0]), (size_t)(munmap_len));
	}
#	endif
	/* Detach our shared memory while still under lock so reference counts will be correct for the next process to run down
	 * this region. In the process also get the remove_shm status from node_local before detaching.
	 * If cnl->donotflush_dbjnl is TRUE, it means we can safely remove shared memory without compromising data
	 * integrity as a reissue of recover will restore the database to a consistent state.
	 */
	remove_shm = !vermismatch && (cnl->remove_shm || cnl->donotflush_dbjnl);
	/* We are done with online rollback on this region. Indicate to other processes by setting the onln_rlbk_pid to 0.
	 * Do it before releasing crit (t_end relies on this ordering when accessing cnl->onln_rlbk_pid).
	 */
	if (jgbl.onlnrlbk)
		cnl->onln_rlbk_pid = 0;
	rel_crit(reg);	/* Since we are about to detach from the shared memory, release crit and reset onln_rlbk_pid */
	/* If we had skipped flushing journal and database buffers due to a concurrent online rollback, increment the counter
	 * indicating that in the shared memory so that online rollback can report the # of such processes when it shuts down.
	 * The same thing is done for both FTOK and access control semaphores when there are too many MUMPS processes.
	 */
	if (safe_mode)	/* indicates flushing was skipped */
	{
		if (bypassed_access)
			cnl->dbrndwn_access_skip++;	/* Access semaphore can be bypassed during online rollback */
		if (bypassed_ftok)
			cnl->dbrndwn_ftok_skip++;
	}
	if (jgbl.onlnrlbk)
		csa->hold_onto_crit = FALSE;
	GTM_WHITE_BOX_TEST(WBTEST_HOLD_SEM_BYPASS, cnl->wbox_test_seq_num, 0);
	status = shmdt((caddr_t)cnl);
	csa->nl = NULL;	/* dereferencing nl after detach is not right, so we set it to NULL so that we can test before dereference */
	/* Note that although csa->nl is NULL, we use CSA_ARG(csa) below (not CSA_ARG(NULL)) to be consistent with similar
	 * usages before csa->nl became NULL. The "is_anticipatory_freeze_needed" function (which is in turn called by the
	 * CHECK_IF_FREEZE_ON_ERROR_NEEDED macro) does a check of csa->nl before dereferencing shared memory contents so
	 * we are safe passing "csa".
	 */
	if (-1 == status)
		send_msg_csa(CSA_ARG(csa) VARLSTCNT(9) ERR_DBFILERR, 2, DB_LEN_STR(reg), ERR_TEXT, 2,
			LEN_AND_LIT("Error during shmdt"), errno);
	REMOVE_CSA_FROM_CSADDRSLIST(csa);	/* remove "csa" from list of open regions (cs_addrs_list) */
	reg->open = FALSE;
	/* If file is still not in good shape, die here and now before we get rid of our storage */
	assertpro(0 == csa->wbuf_dqd);
	ipc_deleted = FALSE;
	/* If we are the very last user, remove shared storage id and the semaphores */
	if (we_are_last_user)
	{	/* remove shared storage, only if last writer to rundown did a successful wcs_flu() */
		assert(!vermismatch);
		if (remove_shm)
		{
			ipc_deleted = TRUE;
			if (0 != shm_rmid(udi->shmid))
				rts_error_csa(CSA_ARG(csa) VARLSTCNT(8) ERR_DBFILERR, 2, DB_LEN_STR(reg), ERR_TEXT, 2,
					RTS_ERROR_TEXT("Unable to remove shared memory"));
			/* Note that we no longer have a new shared memory. Currently only used/usable for standalone rollback. */
			udi->new_shm = FALSE;
			/* mupip recover/rollback don't release the semaphore here, but do it later in db_ipcs_reset (invoked from
			 * mur_close_files())
			 */
			if (!have_standalone_access)
			{
				if (0 != sem_rmid(udi->semid))
					rts_error_csa(CSA_ARG(csa) VARLSTCNT(8) ERR_DBFILERR, 2, DB_LEN_STR(reg), ERR_TEXT, 2,
						RTS_ERROR_TEXT("Unable to remove semaphore"));
				udi->new_sem = FALSE;	/* Note that we no longer have a new semaphore */
				udi->grabbed_access_sem = FALSE;
				udi->counter_acc_incremented = FALSE;
			}
		} else if (is_src_server || is_updproc)
		{
			gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(6) ERR_DBRNDWNWRN, 4, DB_LEN_STR(reg), process_id, process_id);
			send_msg_csa(CSA_ARG(csa) VARLSTCNT(6) ERR_DBRNDWNWRN, 4, DB_LEN_STR(reg), process_id, process_id);
		} else
			send_msg_csa(CSA_ARG(csa) VARLSTCNT(6) ERR_DBRNDWNWRN, 4, DB_LEN_STR(reg), process_id, process_id);
	} else
	{
		assert(!have_standalone_access || jgbl.onlnrlbk || safe_mode);
		if (!jgbl.onlnrlbk && !have_standalone_access)
		{	/* If we were writing, get rid of our writer access count semaphore */
			if (!reg->read_only)
			{
				if (!access_counter_halted)
				{
					save_errno = do_semop(udi->semid, DB_COUNTER_SEM, -DB_COUNTER_SEM_INCR, SEM_UNDO);
					if (0 != save_errno)
						rts_error_csa(CSA_ARG(csa) VARLSTCNT(12) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg),
							ERR_SYSCALL, 5,
							RTS_ERROR_TEXT("gds_rundown access control semaphore decrement"),
							CALLFROM, save_errno);
				}
				udi->counter_acc_incremented = FALSE;
			}
			assert(safe_mode || !bypassed_access);
			/* Now remove the rundown lock */
			if (!bypassed_access)
			{
				if (0 != (save_errno = do_semop(udi->semid, DB_CONTROL_SEM, -1, SEM_UNDO)))
					rts_error_csa(CSA_ARG(csa) VARLSTCNT(12) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg),
						ERR_SYSCALL, 5,
						RTS_ERROR_TEXT("gds_rundown access control semaphore release"),
						CALLFROM, save_errno);
				udi->grabbed_access_sem = FALSE;
			}
		} /* else access control semaphore will be released in db_ipcs_reset */
	}
	if (!have_standalone_access)
	{
		if (bypassed_ftok)
		{
			if (!ftok_counter_halted)
				if (0 != (save_errno = do_semop(udi->ftok_semid, DB_COUNTER_SEM, -DB_COUNTER_SEM_INCR,
						SEM_UNDO)))
					rts_error_csa(CSA_ARG(csa) VARLSTCNT(4) ERR_DBFILERR, 2, DB_LEN_STR(reg));
		} else if (!ftok_sem_release(reg, !ftok_counter_halted, FALSE))
		{
			FTOK_TRACE(csa, csa->ti->curr_tn, ftok_ops_release, process_id);
			rts_error_csa(CSA_ARG(csa) VARLSTCNT(4) ERR_DBFILERR, 2, DB_LEN_STR(reg));
		}
		udi->grabbed_ftok_sem = FALSE;
		udi->counter_ftok_incremented = FALSE;
	}
	ENABLE_INTERRUPTS(INTRPT_IN_GDS_RUNDOWN, prev_intrpt_state);
	if (!ipc_deleted)
	{
		GET_CUR_TIME(time_str);
		if (is_src_server)
			gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(8) ERR_IPCNOTDEL, 6, CTIME_BEFORE_NL, time_str,
				LEN_AND_LIT("Source server"), REG_LEN_STR(reg));
		if (is_updproc)
			gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(8) ERR_IPCNOTDEL, 6, CTIME_BEFORE_NL, time_str,
				LEN_AND_LIT("Update process"), REG_LEN_STR(reg));
		if (mupip_jnl_recover && (!jgbl.onlnrlbk || !we_are_last_user))
		{
			gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(8) ERR_IPCNOTDEL, 6, CTIME_BEFORE_NL, time_str,
				LEN_AND_LIT("Mupip journal process"), REG_LEN_STR(reg));
			send_msg_csa(CSA_ARG(csa) VARLSTCNT(8) ERR_IPCNOTDEL, 6, CTIME_BEFORE_NL, time_str,
				LEN_AND_LIT("Mupip journal process"), REG_LEN_STR(reg));
		}
	}
	REVERT;
	return EXIT_NRM;
}
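/* Illustrative sketch (not part of the GT.M sources): the "we_are_last_user" decision above keys off the SysV IPC
 * attach count plus a counter semaphore. A detached, self-contained approximation of that decision, using only the
 * SysV calls seen in this file (shmctl with IPC_STAT, semctl with GETVAL), might look like the helper below. The
 * helper name and its two-step check are hypothetical; the real code additionally folds in the vermismatch check.
 */
#include <sys/ipc.h>
#include <sys/sem.h>
#include <sys/shm.h>

/* Return 1 if this process is the only attacher of "shmid" and the counter semaphore (index "semnum" of "semid")
 * shows a single remaining user; 0 if other users remain; -1 on error with errno set by the failing syscall.
 */
static int rundown_is_last_user(int shmid, int semid, int semnum)
{
	struct shmid_ds	shm_stat;
	int		semval;

	if (-1 == shmctl(shmid, IPC_STAT, &shm_stat))
		return -1;			/* cannot stat the shared memory segment */
	if (1 != shm_stat.shm_nattch)
		return 0;			/* someone else is still attached */
	if (-1 == (semval = semctl(semid, semnum, GETVAL)))
		return -1;			/* cannot read the counter semaphore */
	return (1 == semval);			/* last attacher AND last counted user */
}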
void gv_rundown(void)
{
	gd_region	*r_top, *r_save, *r_local;
	gd_addr		*addr_ptr;
	sgm_info	*si;
	int4		rundown_status = EXIT_NRM;	/* if gds_rundown went smoothly */
#	ifdef VMS
	vms_gds_info	*gds_info;
#	elif defined(UNIX)
	unix_db_info	*udi;
#	endif
#	if defined(DEBUG) && defined(UNIX)
	sgmnt_addrs	*csa;
#	endif
	DCL_THREADGBL_ACCESS;

	SETUP_THREADGBL_ACCESS;
	r_save = gv_cur_region;		/* Save for possible core dump */
	gvcmy_rundown();
	ENABLE_AST
	if (pool_init)
		rel_lock(jnlpool.jnlpool_dummy_reg);
	for (addr_ptr = get_next_gdr(NULL); addr_ptr; addr_ptr = get_next_gdr(addr_ptr))
	{
		for (r_local = addr_ptr->regions, r_top = r_local + addr_ptr->n_regions; r_local < r_top; r_local++)
		{
			if (r_local->open && !r_local->was_open && dba_cm != r_local->dyn.addr->acc_meth)
			{	/* Rundown has already occurred for GT.CM client regions through gvcmy_rundown() above.
				 * Hence the (dba_cm != ...) check in the if above. Note that for GT.CM client regions,
				 * region->open is TRUE although cs_addrs is NULL.
				 */
#				if defined(DEBUG) && defined(UNIX)
				if (is_jnlpool_creator && ANTICIPATORY_FREEZE_AVAILABLE && TREF(gtm_test_fake_enospc))
				{	/* Clear ENOSPC faking now that we are running down */
					csa = REG2CSA(r_local);
					if (csa->nl->fake_db_enospc || csa->nl->fake_jnl_enospc)
					{
						send_msg_csa(CSA_ARG(NULL) VARLSTCNT(8) ERR_TEXT, 2, DB_LEN_STR(r_local),
							ERR_TEXT, 2,
							LEN_AND_LIT("Resetting fake_db_enospc and fake_jnl_enospc"));
						csa->nl->fake_db_enospc = FALSE;
						csa->nl->fake_jnl_enospc = FALSE;
					}
				}
#				endif
				gv_cur_region = r_local;
				tp_change_reg();
				UNIX_ONLY(rundown_status |=) gds_rundown();
				/* Now that gds_rundown is done, free up the memory associated with the region.
				 * Ideally the following memory freeing code should go to gds_rundown, but
				 * GT.CM calls gds_rundown() and we want to reuse memory for GT.CM.
				 */
				if (NULL != cs_addrs)
				{
					if (NULL != cs_addrs->dir_tree)
						FREE_CSA_DIR_TREE(cs_addrs);
					if (cs_addrs->sgm_info_ptr)
					{
						si = cs_addrs->sgm_info_ptr;
						/* It is possible we got interrupted before initializing all fields of "si"
						 * completely so account for NULL values while freeing/releasing those fields.
						 */
						assert((si->tp_csa == cs_addrs) || (NULL == si->tp_csa));
						if (si->jnl_tail)
						{
							CAREFUL_FREEUP_BUDDY_LIST(si->format_buff_list);
							CAREFUL_FREEUP_BUDDY_LIST(si->jnl_list);
						}
						CAREFUL_FREEUP_BUDDY_LIST(si->recompute_list);
						CAREFUL_FREEUP_BUDDY_LIST(si->new_buff_list);
						CAREFUL_FREEUP_BUDDY_LIST(si->tlvl_info_list);
						CAREFUL_FREEUP_BUDDY_LIST(si->tlvl_cw_set_list);
						CAREFUL_FREEUP_BUDDY_LIST(si->cw_set_list);
						if (NULL != si->blks_in_use)
						{
							free_hashtab_int4(si->blks_in_use);
							free(si->blks_in_use);
							si->blks_in_use = NULL;
						}
						if (si->cr_array_size)
						{
							assert(NULL != si->cr_array);
							if (NULL != si->cr_array)
								free(si->cr_array);
						}
						if (NULL != si->first_tp_hist)
							free(si->first_tp_hist);
						free(si);
					}
					if (cs_addrs->jnl)
					{
						assert(&FILE_INFO(cs_addrs->jnl->region)->s_addrs == cs_addrs);
						if (cs_addrs->jnl->jnllsb)
						{
							UNIX_ONLY(assert(FALSE));
							free(cs_addrs->jnl->jnllsb);
						}
						free(cs_addrs->jnl);
					}
					GTMCRYPT_ONLY(
						if (cs_addrs->encrypted_blk_contents)
							free(cs_addrs->encrypted_blk_contents);
					)
				}
				assert(gv_cur_region->dyn.addr->file_cntl->file_info);
				VMS_ONLY(
					gds_info = (vms_gds_info *)gv_cur_region->dyn.addr->file_cntl->file_info;
					if (gds_info->xabpro)
						free(gds_info->xabpro);
					if (gds_info->xabfhc)
						free(gds_info->xabfhc);
					if (gds_info->nam)
					{
						free(gds_info->nam->nam$l_esa);
						free(gds_info->nam);
					}
					if (gds_info->fab)
						free(gds_info->fab);
				)
				free(gv_cur_region->dyn.addr->file_cntl->file_info);
				free(gv_cur_region->dyn.addr->file_cntl);
			}
			r_local->open = r_local->was_open = FALSE;
		}
	}
}
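/* Illustrative sketch (not part of the GT.M sources): gv_rundown() above frees "si" defensively because the process
 * may have been interrupted before every field was initialized. The idiom generalizes to any teardown of a partially
 * built structure: free each member only if non-NULL, then NULL the owning pointer so a repeated teardown is harmless.
 * The struct and function below are hypothetical stand-ins demonstrating that pattern.
 */
#include <stdlib.h>

typedef struct example_info
{
	void	*buff_a;	/* may or may not have been allocated */
	void	*buff_b;	/* may or may not have been allocated */
} example_info;

/* Free whatever parts of "*infop" exist and release the structure itself; tolerates partial initialization */
static void example_info_free(example_info **infop)
{
	example_info	*info;

	if ((NULL == infop) || (NULL == (info = *infop)))
		return;			/* nothing to do; safe on repeated calls */
	if (NULL != info->buff_a)
		free(info->buff_a);
	if (NULL != info->buff_b)
		free(info->buff_b);
	free(info);
	*infop = NULL;			/* guard against double-free by the caller */
}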
void gds_rundown(void)
{
	bool			is_mm, we_are_last_user, we_are_last_writer;
	boolean_t		ipc_deleted, remove_shm, cancelled_timer, cancelled_dbsync_timer, vermismatch;
	now_t			now;						/* for GET_CUR_TIME macro */
	char			*time_ptr, time_str[CTIME_BEFORE_NL + 2];	/* for GET_CUR_TIME macro */
	gd_region		*reg;
	int			save_errno, status;
	int4			semval, ftok_semval, sopcnt, ftok_sopcnt;
	short			crash_count;
	sm_long_t		munmap_len;
	sgmnt_addrs		*csa;
	sgmnt_data_ptr_t	csd;
	struct shmid_ds		shm_buf;
	struct sembuf		sop[2], ftok_sop[2];
	uint4			jnl_status;
	unix_db_info		*udi;
	jnl_private_control	*jpc;
	jnl_buffer_ptr_t	jbp;

	error_def(ERR_CRITSEMFAIL);
	error_def(ERR_DBCCERR);
	error_def(ERR_DBFILERR);
	error_def(ERR_DBRNDWNWRN);
	error_def(ERR_ERRCALL);
	error_def(ERR_GBLOFLOW);
	error_def(ERR_GTMASSERT);
	error_def(ERR_IPCNOTDEL);
	error_def(ERR_JNLFLUSH);
	error_def(ERR_RNDWNSEMFAIL);
	error_def(ERR_TEXT);
	error_def(ERR_WCBLOCKED);

	forced_exit = FALSE;	/* Okay, we're dying already -- let rel_crit live in peace now.
				 * If coming through a DAL, not necessarily dying. What to do then? -- nars -- 8/15/2001
				 */
	grabbed_access_sem = FALSE;
	jnl_status = 0;
	reg = gv_cur_region;	/* Local copy */
	/* Early out for cluster regions to avoid tripping the assert below.
	 * Note: this early out is consistent with VMS. It has been noted that all of the gtcm assignments
	 * to gv_cur_region should use the TP_CHANGE_REG macro. This would also avoid the assert problem
	 * and should be done eventually.
	 */
	if (dba_cm == reg->dyn.addr->acc_meth)
		return;
	udi = FILE_INFO(reg);
	csa = &udi->s_addrs;
	csd = csa->hdr;
	assert(csa == cs_addrs && csd == cs_data);
	if ((reg->open) && (dba_usr == csd->acc_meth))
	{
		change_reg();
		gvusr_rundown();
		return;
	}
	ESTABLISH(gds_rundown_ch);
	if (!reg->open)			/* Not open, no point to rundown */
	{
		if (reg->opening)	/* Died partway open, kill rest of way */
		{
			rel_crit(reg);
			mutex_cleanup(reg);
			/* Revisit this to handle MM properly. SMW 98/12/16
			 * if (NULL != csa->nl)
			 * {
			 *	status = shmdt((caddr_t)csa->nl);
			 *	if (-1 == status)
			 *		send_msg(VARLSTCNT(9) ERR_DBFILERR, 2, DB_LEN_STR(reg),
			 *			ERR_TEXT, 2, LEN_AND_LIT("Error during shmdt"), errno);
			 * }
			 */
			shmdt((caddr_t)csa->nl);
			csa->nl = NULL;
		}
		REVERT;
		return;
	}
	switch(csd->acc_meth)
	{	/* Pass mm and bg through */
		case dba_bg:
			is_mm = FALSE;
			break;
		case dba_mm:
			is_mm = TRUE;
			break;
		case dba_usr:
			assert(FALSE);
		default:
			REVERT;
			return;
	}
	/* Cancel any pending flush timer for this region by this task */
	CANCEL_DB_TIMERS(reg, cancelled_timer, cancelled_dbsync_timer);
	we_are_last_user = FALSE;
	if (!csa->persistent_freeze)
		region_freeze(reg, FALSE, FALSE, FALSE);
	assert(!csa->read_lock);
	rel_crit(reg);		/* get locks to known state */
	mutex_cleanup(reg);
	/* We need to guarantee that no one else accesses the database file header while the semid/shmid fields are reset.
	 * We already have created the ftok semaphore in db_init or mu_rndwn_file and did not remove it.
	 * So just lock it. We do it in blocking mode.
	 */
	if (!ftok_sem_lock(reg, FALSE, FALSE))
		rts_error(VARLSTCNT(4) ERR_DBFILERR, 2, DB_LEN_STR(reg));
	/* For mupip_jnl_recover we already have the database access control semaphore.
	 * We do not release it here; we release it from mur_close_files.
	 */
	if (!mupip_jnl_recover)
	{
		sop[0].sem_num = 0;	sop[0].sem_op = 0;	/* Wait for 0 */
		sop[1].sem_num = 0;	sop[1].sem_op = 1;	/* Lock */
		sopcnt = 2;
		sop[0].sem_flg = sop[1].sem_flg = SEM_UNDO | IPC_NOWAIT;	/* Don't wait the first time through */
		SEMOP(udi->semid, sop, sopcnt, status);
		if (-1 == status)	/* We couldn't get it in one shot -- see if we already have it */
		{
			save_errno = errno;
			/* See comment about the Linux-specific difference in behaviour of semctl() with GETPID
			 * in gds_rundown_ch().
			 */
			if (semctl(udi->semid, 0, GETPID) == process_id)
			{
				send_msg(VARLSTCNT(5) MAKE_MSG_INFO(ERR_CRITSEMFAIL), 2, DB_LEN_STR(reg),
					ERR_RNDWNSEMFAIL);
				REVERT;
				return;		/* Already in rundown for this region */
			}
			if (EAGAIN != save_errno)
			{
				assert(FALSE);
				rts_error(VARLSTCNT(9) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg), ERR_TEXT, 2,
					RTS_ERROR_TEXT("gds_rundown first semop/semctl"), save_errno);
			}
			sop[0].sem_flg = sop[1].sem_flg = SEM_UNDO;	/* Try again - blocking this time */
			SEMOP(udi->semid, sop, 2, status);
			if (-1 == status)	/* We couldn't get it at all */
				rts_error(VARLSTCNT(5) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg), errno);
		}
	}
	grabbed_access_sem = TRUE;
	/* We now have the dbinit/rundown lock, so we are alone in this code for this region
	 * and nobody else can attach. See if we are all alone in accessing this database shared memory.
	 */
	assert(csa->ref_cnt);	/* decrement private ref_cnt before shared ref_cnt decrement */
	csa->ref_cnt--;		/* Currently journaling logic in gds_rundown() in VMS relies on this order to detect last writer */
	assert(!csa->ref_cnt);
	--csa->nl->ref_cnt;
	if (memcmp(csa->nl->now_running, gtm_release_name, gtm_release_name_len + 1))
	{	/* VERMISMATCH condition. Possible only if DSE */
		assert(dse_running);
		vermismatch = TRUE;
	} else
		vermismatch = FALSE;
	if (-1 == shmctl(udi->shmid, IPC_STAT, &shm_buf))
	{
		save_errno = errno;
		rts_error(VARLSTCNT(9) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg), ERR_TEXT, 2,
			RTS_ERROR_TEXT("gds_rundown shmctl"), save_errno);
	} else
		we_are_last_user = (1 == shm_buf.shm_nattch) && !vermismatch;
	assert(!mupip_jnl_recover || we_are_last_user);	/* recover => one user */
	if (-1 == (semval = semctl(udi->semid, 1, GETVAL)))
		rts_error(VARLSTCNT(5) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg), errno);
	we_are_last_writer = (1 == semval) && (FALSE == reg->read_only) && !vermismatch;
								/* There's one writer left and I am it */
	assert(!(mupip_jnl_recover && !reg->read_only) || we_are_last_writer);	/* recover + R/W region => one writer */
	if (-1 == (ftok_semval = semctl(udi->ftok_semid, 1, GETVAL)))
		rts_error(VARLSTCNT(5) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg), errno);
	/* If csa->nl->donotflush_dbjnl is set, it means mupip recover/rollback was interrupted and therefore we should
	 * not flush shared memory contents to disk as they might be in an inconsistent state.
	 * In this case, we will go ahead and remove shared memory (without flushing the contents) in this routine.
	 * A reissue of the recover/rollback command will restore the database to a consistent state.
	 * Otherwise, if we have write access to this region, let us perform a few writing tasks.
	 */
	if (csa->nl->donotflush_dbjnl)
		csa->wbuf_dqd = 0;	/* ignore csa->wbuf_dqd status as we do not care about the cache contents */
	else if (!reg->read_only && !vermismatch)
	{	/* If we had an orphaned block and were interrupted, set wc_blocked so we can invoke wcs_recover */
		if (csa->wbuf_dqd)
		{
			grab_crit(reg);
			SET_TRACEABLE_VAR(csd->wc_blocked, TRUE);
			BG_TRACE_PRO_ANY(csa, wcb_gds_rundown);
			send_msg(VARLSTCNT(8) ERR_WCBLOCKED, 6, LEN_AND_LIT("wcb_gds_rundown"), process_id,
				&csa->ti->curr_tn, DB_LEN_STR(reg));
			csa->wbuf_dqd = 0;
			wcs_recover(reg);
			if (is_mm)
			{
				assert(FALSE);
				csd = csa->hdr;
			}
			BG_TRACE_PRO_ANY(csa, lost_block_recovery);
			rel_crit(reg);
		}
		if (JNL_ENABLED(csd) && (GTCM_GNP_SERVER_IMAGE == image_type))
			originator_prc_vec = NULL;
		/* If we are the last writing user, then everything must be flushed */
		if (we_are_last_writer)
		{	/* Time to flush out all of our buffers */
			if (is_mm)
			{
				if (csa->total_blks != csa->ti->total_blks)	/* do remap if file had been extended */
				{
					grab_crit(reg);
					wcs_mm_recover(reg);
					csd = csa->hdr;
					rel_crit(reg);
				}
				csa->nl->remove_shm = TRUE;
			}
			/* Note WCSFLU_SYNC_EPOCH ensures the epoch is synced to the journal and indirectly
			 * also ensures that the db is fsynced. We don't want to use it in the calls to
			 * wcs_flu() from t_end() and tp_tend() since we can defer it to out-of-crit there.
			 * In this case, since we are running down, we don't have any such option.
			 */
			csa->nl->remove_shm = wcs_flu(WCSFLU_FLUSH_HDR | WCSFLU_WRITE_EPOCH | WCSFLU_SYNC_EPOCH);
			/* Since we_are_last_writer, we should be guaranteed that wcs_flu() did not change csd (in
			 * case of MM for potential file extension), even if it did a grab_crit(). Therefore, make
			 * sure that's true.
			 */
			assert(csd == csa->hdr);
			assert(0 == memcmp(csd->label, GDS_LABEL, GDS_LABEL_SZ - 1));
			csd->trans_hist.header_open_tn = csd->trans_hist.curr_tn;
		} else if ((cancelled_timer && (0 > csa->nl->wcs_timers)) || cancelled_dbsync_timer)
		{	/* cancelled pending db or jnl flush timers - flush database and journal buffers to disk */
			grab_crit(reg);
			/* We need to sync the epoch, as the fact that there is no active pending flush timer implies
			 * there will be no one else who will flush the dirty buffers and EPOCH to disk in a timely fashion.
			 */
			wcs_flu(WCSFLU_FLUSH_HDR | WCSFLU_WRITE_EPOCH | WCSFLU_SYNC_EPOCH);
			rel_crit(reg);
			assert((dba_mm == cs_data->acc_meth) || (csd == cs_data));
			csd = cs_data;	/* In case this is MM and wcs_flu() remapped an extended database, reset csd */
		}
		/* Do rundown journal processing after buffer flushes since they require jnl to be open */
		if (JNL_ENABLED(csd))
		{
			/* The following tp_change_reg() is not needed due to the assert csa == cs_addrs at the beginning
			 * of gds_rundown(), but just to be safe. To be removed by 2002!! --- nars -- 2001/04/25.
			 */
			tp_change_reg();	/* call this because jnl_ensure_open checks cs_addrs rather than gv_cur_region */
			jpc = csa->jnl;
			jbp = jpc->jnl_buff;
			if (jbp->fsync_in_prog_latch.u.parts.latch_pid == process_id)
			{
				assert(FALSE);
				COMPSWAP_UNLOCK(&jbp->fsync_in_prog_latch, process_id, 0, LOCK_AVAILABLE, 0);
			}
			if (jbp->io_in_prog_latch.u.parts.latch_pid == process_id)
			{
				assert(FALSE);
				COMPSWAP_UNLOCK(&jbp->io_in_prog_latch, process_id, 0, LOCK_AVAILABLE, 0);
			}
			if (((NOJNL != jpc->channel) && !JNL_FILE_SWITCHED(jpc))
				|| (we_are_last_writer && (0 != csa->nl->jnl_file.u.inode)))
			{	/* We need to close the journal file cleanly if we have the latest generation journal file open
				 * or if we are the last writer and the journal file is open in shared memory (not necessarily
				 * by ourselves, e.g. the only process that opened the journal got shot abnormally).
				 * Note: we should not infer anything from the shared memory value of csa->nl->jnl_file.u.inode
				 * if we are not the last writer as it can be concurrently updated.
				 */
				grab_crit(reg);
				if (JNL_ENABLED(csd))
				{
					SET_GBL_JREC_TIME; /* jnl_ensure_open/jnl_put_jrt_pini/pfin/jnl_file_close all need it */
					/* Before writing to jnlfile, adjust jgbl.gbl_jrec_time if needed to maintain time order
					 * of jnl records. This needs to be done BEFORE the jnl_ensure_open as that could write
					 * journal records (if it decides to switch to a new journal file).
					 */
					ADJUST_GBL_JREC_TIME(jgbl, jbp);
					jnl_status = jnl_ensure_open();
					if (0 == jnl_status)
					{	/* If we_are_last_writer, we would have already done a wcs_flu() which would
						 * have written an epoch record and we are guaranteed no further updates
						 * since we are the last writer. So, just close the journal.
						 * Although we assert pini_addr should be non-zero for last_writer, we
						 * play it safe in PRO and write a PINI record if not written already.
						 */
						assert(!jbp->before_images || is_mm
							|| !we_are_last_writer || 0 != jpc->pini_addr);
						if (we_are_last_writer && 0 == jpc->pini_addr)
							jnl_put_jrt_pini(csa);
						if (0 != jpc->pini_addr)
							jnl_put_jrt_pfin(csa);
						/* If not the last writer and no pending flush timer left, do jnl flush now */
						if (!we_are_last_writer && (0 > csa->nl->wcs_timers))
						{
							if (SS_NORMAL == (jnl_status = jnl_flush(reg)))
							{
								assert(jbp->freeaddr == jbp->dskaddr);
								jnl_fsync(reg, jbp->dskaddr);
								assert(jbp->fsync_dskaddr == jbp->dskaddr);
							} else
							{
								send_msg(VARLSTCNT(9) ERR_JNLFLUSH, 2, JNL_LEN_STR(csd),
									ERR_TEXT, 2,
									RTS_ERROR_TEXT("Error with journal flush in gds_rundown"),
									jnl_status);
								assert(NOJNL == jpc->channel); /* jnl file lost has been triggered */
								/* In this routine, all code that follows from here on does not
								 * assume anything about the journaling characteristics of this
								 * database so it is safe to continue execution even though
								 * journaling got closed in the middle.
								 */
							}
						}
						jnl_file_close(reg, we_are_last_writer, FALSE);
					} else
						send_msg(VARLSTCNT(6) jnl_status, 4, JNL_LEN_STR(csd), DB_LEN_STR(reg));
				}
				rel_crit(reg);
			}
		}
		if (we_are_last_writer)		/* Flush the fileheader last and harden the file to disk */
		{
			grab_crit(reg);		/* To satisfy crit requirement in fileheader_sync() */
			memset(csd->machine_name, 0, MAX_MCNAMELEN);	/* clear the machine_name field */
			if (!mupip_jnl_recover && we_are_last_user)
			{	/* mupip_jnl_recover will do this after mur_close_file */
				csd->semid = INVALID_SEMID;
				csd->shmid = INVALID_SHMID;
				csd->gt_sem_ctime.ctime = 0;
				csd->gt_shm_ctime.ctime = 0;
			}
			fileheader_sync(reg);
			rel_crit(reg);
			if (FALSE == is_mm)
			{
				if (-1 == fsync(udi->fd))	/* Sync it all */
				{
					rts_error(VARLSTCNT(9) ERR_DBFILERR, 2, DB_LEN_STR(reg), ERR_TEXT, 2,
						RTS_ERROR_TEXT("Error during file sync at close"), errno);
				}
			} else
			{	/* Now do final MM file sync before exit */
#				if !defined(TARGETED_MSYNC) && !defined(NO_MSYNC)
				if (-1 == fsync(udi->fd))	/* Sync it all */
				{
					rts_error(VARLSTCNT(9) ERR_DBFILERR, 2, DB_LEN_STR(reg), ERR_TEXT, 2,
						RTS_ERROR_TEXT("Error during file sync at close"), errno);
				}
#				else
				if (-1 == msync((caddr_t)csa->db_addrs[0],
						(size_t)(csa->db_addrs[1] - csa->db_addrs[0]), MS_SYNC))
				{
					rts_error(VARLSTCNT(9) ERR_DBFILERR, 2, DB_LEN_STR(reg), ERR_TEXT, 2,
						RTS_ERROR_TEXT("Error during file msync at close"), errno);
				}
#				endif
			}
		}
	}	/* end if (!reg->read_only && !csa->nl->donotflush_dbjnl) */
	if (reg->read_only && we_are_last_user && !mupip_jnl_recover)
	{	/* mupip_jnl_recover will do this after mur_close_file */
		db_ipcs.semid = INVALID_SEMID;
		db_ipcs.shmid = INVALID_SHMID;
		db_ipcs.gt_sem_ctime = 0;
		db_ipcs.gt_shm_ctime = 0;
		db_ipcs.fn_len = reg->dyn.addr->fname_len;
		memcpy(db_ipcs.fn, reg->dyn.addr->fname, reg->dyn.addr->fname_len);
		db_ipcs.fn[reg->dyn.addr->fname_len] = 0;
		/* Request gtmsecshr to flush; read_only cannot flush itself */
		if (0 != send_mesg2gtmsecshr(FLUSH_DB_IPCS_INFO, 0, (char *)NULL, 0))
			rts_error(VARLSTCNT(8) ERR_DBFILERR, 2, DB_LEN_STR(reg), ERR_TEXT, 2,
				RTS_ERROR_TEXT("gtmsecshr failed to update database file header"));
	}
	/* Done with file now, close it */
	if (-1 == close(udi->fd))
	{
		rts_error(VARLSTCNT(9) ERR_DBFILERR, 2, DB_LEN_STR(reg), ERR_TEXT, 2,
			LEN_AND_LIT("Error during file close"), errno);
	}
	/* Unmap storage if mm mode but only the part that is not the fileheader (so it shows up in dumps) */
	if (is_mm)
	{
		munmap_len = (sm_long_t)((csa->db_addrs[1] - csa->db_addrs[0])
			- ROUND_UP(SIZEOF_FILE_HDR(csa->hdr), MSYNC_ADDR_INCS));
		if (munmap_len > 0)
		{
			munmap((caddr_t)(csa->db_addrs[0] + ROUND_UP(SIZEOF_FILE_HDR(csa->hdr), MSYNC_ADDR_INCS)),
				(size_t)(munmap_len));
#			ifdef DEBUG_DB64
			rel_mmseg((caddr_t)csa->db_addrs[0]);
#			endif
		}
	}
	/* Detach our shared memory while still under lock so reference counts will be
	 * correct for the next process to run down this region.
	 * In the process also get the remove_shm status from node_local before detaching.
	 * If csa->nl->donotflush_dbjnl is TRUE, it means we can safely remove shared memory without compromising data
	 * integrity as a reissue of recover will restore the database to a consistent state.
	 */
	remove_shm = !vermismatch && (csa->nl->remove_shm || csa->nl->donotflush_dbjnl);
	status = shmdt((caddr_t)csa->nl);
	csa->nl = NULL;	/* dereferencing nl after detach is not right, so we set it to NULL so that we can test before dereference */
	if (-1 == status)
		send_msg(VARLSTCNT(9) ERR_DBFILERR, 2, DB_LEN_STR(reg), ERR_TEXT, 2,
			LEN_AND_LIT("Error during shmdt"), errno);
	reg->open = FALSE;
	/* If file is still not in good shape, die here and now before we get rid of our storage */
	if (csa->wbuf_dqd)
		GTMASSERT;
	ipc_deleted = FALSE;
	/* If we are the very last user, remove shared storage id and the semaphores */
	if (we_are_last_user)
	{	/* remove shared storage, only if last writer to rundown did a successful wcs_flu() */
		assert(!vermismatch);
		if (remove_shm)
		{
			ipc_deleted = TRUE;
			if (0 != shm_rmid(udi->shmid))
				rts_error(VARLSTCNT(8) ERR_DBFILERR, 2, DB_LEN_STR(reg), ERR_TEXT, 2,
					RTS_ERROR_TEXT("Unable to remove shared memory"));
		} else if (is_src_server || is_updproc)
		{
			gtm_putmsg(VARLSTCNT(6) ERR_DBRNDWNWRN, 4, DB_LEN_STR(reg), process_id, process_id);
			send_msg(VARLSTCNT(6) ERR_DBRNDWNWRN, 4, DB_LEN_STR(reg), process_id, process_id);
		} else
			send_msg(VARLSTCNT(6) ERR_DBRNDWNWRN, 4, DB_LEN_STR(reg), process_id, process_id);
		/* Don't release the semaphore in case of mupip recover/rollback, since it has standalone access.
		 * It will release the semaphore in mur_close_files.
		 */
		if (!mupip_jnl_recover)
		{
			if (0 != sem_rmid(udi->semid))
				rts_error(VARLSTCNT(8) ERR_DBFILERR, 2, DB_LEN_STR(reg), ERR_TEXT, 2,
					RTS_ERROR_TEXT("Unable to remove semaphore"));
			grabbed_access_sem = FALSE;
		}
	} else
	{
		assert(!mupip_jnl_recover);
		/* If we were writing, get rid of our writer access count semaphore */
		if (!reg->read_only)
			if (0 != (save_errno = do_semop(udi->semid, 1, -1, SEM_UNDO)))
				rts_error(VARLSTCNT(9) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg), ERR_TEXT, 2,
					RTS_ERROR_TEXT("gds_rundown write semaphore release"), save_errno);
		/* Now remove the rundown lock */
		if (0 != (save_errno = do_semop(udi->semid, 0, -1, SEM_UNDO)))
			rts_error(VARLSTCNT(9) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg), ERR_TEXT, 2,
				RTS_ERROR_TEXT("gds_rundown rundown semaphore release"), save_errno);
		grabbed_access_sem = FALSE;
	}
	if (!ftok_sem_release(reg, !mupip_jnl_recover, FALSE))
		rts_error(VARLSTCNT(4) ERR_DBFILERR, 2, DB_LEN_STR(reg));
	if (!ipc_deleted)
	{
		GET_CUR_TIME;
		if (is_src_server)
			gtm_putmsg(VARLSTCNT(8) ERR_IPCNOTDEL, 6, CTIME_BEFORE_NL, time_ptr,
				LEN_AND_LIT("Source server"), REG_LEN_STR(reg));
		if (is_updproc)
			gtm_putmsg(VARLSTCNT(8) ERR_IPCNOTDEL, 6, CTIME_BEFORE_NL, time_ptr,
				LEN_AND_LIT("Update process"), REG_LEN_STR(reg));
		if (mupip_jnl_recover)
		{
			gtm_putmsg(VARLSTCNT(8) ERR_IPCNOTDEL, 6, CTIME_BEFORE_NL, time_ptr,
				LEN_AND_LIT("Mupip journal process"), REG_LEN_STR(reg));
			send_msg(VARLSTCNT(8) ERR_IPCNOTDEL, 6, CTIME_BEFORE_NL, time_ptr,
				LEN_AND_LIT("Mupip journal process"), REG_LEN_STR(reg));
		}
	}
	REVERT;
}
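/* Illustrative sketch (not part of the GT.M sources): the SEMOP sequence in gds_rundown() above is the classic SysV
 * "wait-for-zero then increment" lock. sop[0] waits for semaphore 0 to reach 0 and sop[1] bumps it to 1, atomically,
 * with SEM_UNDO so the kernel releases the lock if the process dies. The first attempt uses IPC_NOWAIT; only on
 * EAGAIN does the caller retry in blocking mode. A standalone approximation follows (hypothetical helper name, and
 * none of GT.M's already-held-by-me or error-reporting logic):
 */
#include <errno.h>
#include <sys/ipc.h>
#include <sys/sem.h>

/* Acquire the rundown lock on semaphore 0 of "semid". Returns 0 on success, -1 on failure (errno set by semop). */
static int rundown_lock(int semid)
{
	struct sembuf	sop[2];

	sop[0].sem_num = 0;	sop[0].sem_op = 0;	/* wait for value 0 */
	sop[1].sem_num = 0;	sop[1].sem_op = 1;	/* then take the lock */
	sop[0].sem_flg = sop[1].sem_flg = SEM_UNDO | IPC_NOWAIT;
	if (0 == semop(semid, sop, 2))
		return 0;				/* got it without waiting */
	if (EAGAIN != errno)
		return -1;				/* real error, not mere contention */
	sop[0].sem_flg = sop[1].sem_flg = SEM_UNDO;	/* retry, blocking this time */
	return semop(semid, sop, 2);
}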
uint4 mur_forward(jnl_tm_t min_broken_time, seq_num min_broken_seqno, seq_num losttn_seqno)
{
	boolean_t		added, this_reg_stuck;
	boolean_t		is_set_kill_zkill_ztworm, is_set_kill_zkill;
	jnl_record		*rec;
	enum jnl_record_type	rectype;
	enum rec_fence_type	rec_fence;
	enum broken_type	recstat;
	jnl_tm_t		rec_time;
	int4			rec_image_count = 0;	/* This is a dummy variable for UNIX */
	uint4			status, regcnt_stuck, num_partners;
	mval			mv;
	reg_ctl_list		*rctl, *rctl_top, *prev_rctl;
	jnl_ctl_list		*jctl;
	gd_region		*reg;
	sgmnt_addrs		*csa;
	seq_num			rec_token_seq;
	forw_multi_struct	*forw_multi;
	multi_struct		*multi;

	error_def(ERR_JNLREADEOF);
	error_def(ERR_BLKCNTEDITFAIL);

	skip_dbtriggers = TRUE;	/* do not want to invoke any triggers for updates done by journal recovery */
	murgbl.extr_buff = (char *)malloc(murgbl.max_extr_record_length);
	for (recstat = (enum broken_type)0; recstat < TOT_EXTR_TYPES; recstat++)
		murgbl.extr_file_create[recstat] = TRUE;
	jgbl.dont_reset_gbl_jrec_time = jgbl.forw_phase_recovery = TRUE;
	assert(NULL == jgbl.mur_pini_addr_reset_fnptr);
	jgbl.mur_pini_addr_reset_fnptr = (pini_addr_reset_fnptr)mur_pini_addr_reset;
	gv_keysize = DBKEYSIZE(MAX_KEY_SZ);
	mu_gv_stack_init();
	murgbl.consist_jnl_seqno = 0;
	/* Note down passed-in values in the murgbl global so the "mur_forward_play_cur_jrec" function can see them as well */
	murgbl.min_broken_time = min_broken_time;
	murgbl.min_broken_seqno = min_broken_seqno;
	murgbl.losttn_seqno = losttn_seqno;
	assert(!mur_options.rollback || (murgbl.losttn_seqno <= murgbl.min_broken_seqno));
	prev_rctl = NULL;
	rctl_start = NULL;
	assert(0 == murgbl.regcnt_remaining);
	for (rctl = mur_ctl, rctl_top = mur_ctl + murgbl.reg_total; rctl < rctl_top; rctl++)
	{
		if (mur_options.forward)
		{
			assert(NULL == rctl->jctl_turn_around);
			jctl = rctl->jctl = rctl->jctl_head;
			assert(jctl->reg_ctl == rctl);
			jctl->rec_offset = JNL_HDR_LEN;
			jnl_fence_ctl.fence_list = JNL_FENCE_LIST_END;	/* initialized to reflect journaling is not enabled */
		} else
		{
			jctl = rctl->jctl = (NULL == rctl->jctl_turn_around) ? rctl->jctl_head : rctl->jctl_turn_around;
			assert(jctl->reg_ctl == rctl);
			jctl->rec_offset = jctl->turn_around_offset;
			jgbl.mur_jrec_seqno = jctl->turn_around_seqno;
			if (mur_options.rollback && murgbl.consist_jnl_seqno < jgbl.mur_jrec_seqno)
				murgbl.consist_jnl_seqno = jgbl.mur_jrec_seqno;
			assert(murgbl.consist_jnl_seqno <= murgbl.losttn_seqno);
			assert((NULL != rctl->jctl_turn_around) || (0 == jctl->rec_offset));
		}
		if (mur_options.update || mur_options.extr[GOOD_TN])
		{
			reg = rctl->gd;
			gv_cur_region = reg;
			tp_change_reg();	/* note: sets cs_addrs to a non-NULL value even if gv_cur_region->open is
						 * FALSE (cs_data could still be NULL).
						 */
			rctl->csa = cs_addrs;
			cs_addrs->rctl = rctl;
			rctl->csd = cs_data;
			rctl->sgm_info_ptr = cs_addrs->sgm_info_ptr;
			SET_CSA_DIR_TREE(cs_addrs, MAX_KEY_SZ, reg);
			gv_target = cs_addrs->dir_tree;
		}
		jctl->after_end_of_data = FALSE;
		status = mur_next(jctl, jctl->rec_offset);
		assert(ERR_JNLREADEOF != status);	/* cannot get EOF at start of forward processing */
		if (SS_NORMAL != status)
			return status;
		PRINT_VERBOSE_STAT(jctl, "mur_forward:at the start");
		/* Any multi-region TP transaction will be processed as multiple single-region TP transactions up
		 * until the tp-resolve-time is reached. From then on, they will be treated as one multi-region TP
		 * transaction. This is needed for proper lost-tn determination (any multi-region transaction that
		 * gets played in a region AFTER it has already encountered a broken tn should treat this as a lost tn).
		 */
		do
		{
			assert(jctl == rctl->jctl);
			rec = rctl->mur_desc->jnlrec;
			rec_time = rec->prefix.time;
			if (rec_time > mur_options.before_time)
				break;	/* Records after -BEFORE_TIME do not go to extract or losttrans or brkntrans files */
			if (rec_time < mur_options.after_time)
			{
				status = mur_next_rec(&jctl);
				continue;	/* Records before -AFTER_TIME do not go to extract or losttrans or
						 * brkntrans files */
			}
			if (rec_time >= jgbl.mur_tp_resolve_time)
				break;	/* Records after tp-resolve-time will be processed below */
			/* TODO: what do we do if we find a BROKEN tn here? */
			status = mur_forward_play_cur_jrec(rctl);
			if (SS_NORMAL != status)
				break;
			status = mur_next_rec(&jctl);
		} while (SS_NORMAL == status);
		CHECK_IF_EOF_REACHED(rctl, status); /* sets rctl->forw_eof_seen if needed; resets "status" to SS_NORMAL */
		if (SS_NORMAL != status)
			return status;
		if (rctl->forw_eof_seen)
		{
			PRINT_VERBOSE_STAT(jctl, "mur_forward:Reached EOF before tp_resolve_time");
			continue;	/* Reached EOF before even getting to tp_resolve_time.
					 * Do not even consider region for next processing loop */
		}
		rctl->last_tn = 0;
		rctl->process_losttn = FALSE;
		murgbl.regcnt_remaining++;	/* # of regions participating in recovery at this point */
		if (NULL == rctl_start)
			rctl_start = rctl;
		if (NULL != prev_rctl)
		{
			prev_rctl->next_rctl = rctl;
			rctl->prev_rctl = prev_rctl;
		}
		prev_rctl = rctl;
		assert(murgbl.ok_to_update_db || !rctl->db_updated);
		PRINT_VERBOSE_STAT(jctl, "mur_forward:at tp_resolve_time");
	}
	/* Note that it is possible for rctl_start to be NULL at this point. That is, there is no journal record in any
	 * region AFTER the calculated tp-resolve-time. This is possible if, for example, -AFTER_TIME was used and has a
	 * time later than any journal record in all journal files. If rctl_start is NULL, prev_rctl should also be NULL
	 * and vice versa.
	 */
	if (prev_rctl != rctl_start)
	{
		assert(NULL != prev_rctl);
		assert(NULL != rctl_start);
		prev_rctl->next_rctl = rctl_start;
		rctl_start->prev_rctl = prev_rctl;
	} else
	{	/* prev_rctl & rctl_start are identical. They both should be NULL or should point to a
		 * single-element linked list.
		 */
		assert((NULL == rctl_start) || ((NULL == rctl_start->next_rctl) && (NULL == rctl_start->prev_rctl)));
	}
	rctl = rctl_start;
	regcnt_stuck = 0; /* # of regions we are stuck in waiting for other regions to resolve a multi-region TP transaction */
	assert((NULL == rctl) || (NULL == rctl->forw_multi));
	gv_cur_region = NULL;	/* clear out any previous value to ensure gv_cur_region/cs_addrs/cs_data
				 * all get set in sync by the MUR_CHANGE_REG macro below.
				 */
	while (NULL != rctl)
	{	/* while there is at least one region remaining with unprocessed journal records */
		assert(NULL != rctl_start);
		assert(0 < murgbl.regcnt_remaining);
		if (NULL != rctl->forw_multi)
		{	/* This region's current journal record is part of a TP transaction waiting for other regions */
			regcnt_stuck++;
			if (regcnt_stuck >= murgbl.regcnt_remaining)
				GTMASSERT;	/* Out-of-design situation. Stuck in ALL regions. */
			rctl = rctl->next_rctl;	/* Move on to the next available region */
			assert(NULL != rctl);
			continue;
		}
		regcnt_stuck = 0;	/* restart the counter now that we found at least one non-stuck region */
		MUR_CHANGE_REG(rctl);
		jctl = rctl->jctl;
		this_reg_stuck = FALSE;
		for (status = SS_NORMAL; SS_NORMAL == status; )
		{
			assert(jctl == rctl->jctl);
			rec = rctl->mur_desc->jnlrec;
			rec_time = rec->prefix.time;
			assert(rec_time >= jgbl.mur_tp_resolve_time);
			if (rec_time > mur_options.before_time)
				break;	/* Records after -BEFORE_TIME do not go to extract or losttrans or brkntrans files */
			assert((0 == mur_options.after_time) || (mur_options.forward && !rctl->db_updated));
			if (rec_time < mur_options.after_time)
			{
				status = mur_next_rec(&jctl);
				continue;	/* Records before -AFTER_TIME do not go to extract or losttrans or
						 * brkntrans files */
			}
			/* Check if the current journal record can be played right away or needs to wait for corresponding
			 * journal records from other participating TP regions to be reached. A non-TP or ZTP transaction
			 * can be played without issues (i.e. has no dependencies with any other regions). A single-region
			 * TP transaction too falls in the same category. A multi-region TP transaction needs to wait until
			 * all participating regions have played all journal records BEFORE this TP in order to ensure
			 * recover plays records in the exact same order that GT.M performed them in.
			 */
			/* If FENCE_NONE is specified, we would not have maintained any multi hashtable in mur_back_process
			 * for broken transaction processing. So we process multi-region TP transactions as multiple
			 * single-region TP transactions in the forward phase.
			 */
			if (FENCE_NONE != mur_options.fences)
			{
				rectype = (enum jnl_record_type)rec->prefix.jrec_type;
				if (IS_TP(rectype) && IS_TUPD(rectype))
				{
					assert(IS_SET_KILL_ZKILL_ZTRIG_ZTWORM(rectype));
					assert(&rec->jrec_set_kill.num_participants == &rec->jrec_ztworm.num_participants);
					num_partners = rec->jrec_set_kill.num_participants;
					assert(0 < num_partners);
					if (1 < num_partners)
					{
						this_reg_stuck = TRUE;
						assert(&rec->jrec_set_kill.update_num == &rec->jrec_ztworm.update_num);
					}
				}
			}
			if (this_reg_stuck)
			{
				rec_token_seq = GET_JNL_SEQNO(rec);
				VMS_ONLY(
					/* In VMS, pid is not unique. We need "image_count" as well. But this is not
					 * needed in case of rollback as the token is guaranteed to be unique in that case.
					 */
					if (!mur_options.rollback)
					{
						MUR_GET_IMAGE_COUNT(jctl, rec, rec_image_count, status);
						if (SS_NORMAL != status)
						{
							this_reg_stuck = FALSE; /* so abnormal "status" gets checked below */
							break;
						}
					}
				)
				/* In Unix, "rec_image_count" is ignored by the MUR_FORW* macros */
				MUR_FORW_TOKEN_LOOKUP(forw_multi, rec_token_seq, rec_time, rec_image_count);
				if (NULL != forw_multi)
				{	/* This token has already been seen in another region in forward processing.
					 * Add current region as well. If all regions have been resolved, then play
					 * the entire transaction maintaining the exact same order of updates within.
					 */
					MUR_FORW_TOKEN_ONE_MORE_REG(forw_multi, rctl);
				} else
				{	/* First time we are seeing this token in forward processing. Check if this
					 * has already been determined to be a broken transaction.
					 */
					recstat = GOOD_TN;
					multi = NULL;
					if (IS_REC_POSSIBLY_BROKEN(rec_time, rec_token_seq))
					{
						multi = MUR_TOKEN_LOOKUP(rec_token_seq, rec_image_count, rec_time, TPFENCE);
						if ((NULL != multi) && (0 < multi->partner))
							recstat = BROKEN_TN;
					}
					MUR_FORW_TOKEN_ADD(forw_multi, rec_token_seq, rec_time, rctl, num_partners,
						recstat, multi, rec_image_count);
				}
				/* Check that the "tabent" field has been initialized above (by either the
				 * MUR_FORW_TOKEN_LOOKUP or MUR_FORW_TOKEN_ADD macros). This is relied upon by
				 * "mur_forward_play_multireg_tp" below.
				 */
				assert(NULL != forw_multi->u.tabent);
				assert(forw_multi->num_reg_seen_forward <= forw_multi->num_reg_seen_backward);
				if (forw_multi->num_reg_seen_forward == forw_multi->num_reg_seen_backward)
				{	/* All regions have been seen in forward processing. Now play it.
					 * Note that the TP could be BROKEN_TN or GOOD_TN. The callee handles it.
					 */
					assert(forw_multi == rctl->forw_multi);
					status = mur_forward_play_multireg_tp(forw_multi, rctl);
					this_reg_stuck = FALSE;
					/* Note that as part of playing the TP transaction, we could have reached
					 * the EOF of rctl. In this case, we need to break out of the loop.
					 */
					if ((SS_NORMAL != status) || rctl->forw_eof_seen)
						break;
					assert(NULL == rctl->forw_multi);
					assert(!dollar_tlevel);
					jctl = rctl->jctl;	/* In case the first record after the most recently processed
								 * TP transaction is in the next generation journal file */
					continue;
				}
				break;
			} else
			{
				status = mur_forward_play_cur_jrec(rctl);
				if (SS_NORMAL != status)
					break;
			}
			assert(!this_reg_stuck);
			status = mur_next_rec(&jctl);
		}
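		/* At this point the for loop above has ended for one of three reasons: an abnormal "status" or EOF, a
		 * record past -BEFORE_TIME, or this region becoming stuck behind a multi-region TP token
		 * (this_reg_stuck). The enclosing while loop then continues round-robin through the circular
		 * next_rctl/prev_rctl ring built earlier, skipping stuck regions until their partner regions catch up.
		 */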