void process_reorg_encrypt_restart(void)
{
	intrpt_state_t	prev_intrpt_state;
	enc_info_t	*encr_ptr;
	int		gtmcrypt_errno;
	gd_segment	*seg;
	sgmnt_addrs	*csa;

	csa = reorg_encrypt_restart_csa;
	assert(NULL != csa);	/* caller should have ensured this */
	/* Opening handles for encryption is a heavyweight operation. Caller should have ensured we are not in crit for
	 * any region when the new key handles are opened for any one region. Assert that.
	 */
	assert(0 == have_crit(CRIT_HAVE_ANY_REG));
	DEFER_INTERRUPTS(INTRPT_IN_CRYPT_RECONFIG, prev_intrpt_state);
	encr_ptr = csa->encr_ptr;
	assert(NULL != encr_ptr);
	DBG_RECORD_CRYPT_RECEIVE(csa->hdr, csa, csa->nl, process_id, encr_ptr);
	seg = csa->region->dyn.addr;
	INIT_DB_OR_JNL_ENCRYPTION(csa, encr_ptr, seg->fname_len, seg->fname, gtmcrypt_errno);
	if (0 != gtmcrypt_errno)
	{
		ENABLE_INTERRUPTS(INTRPT_IN_CRYPT_RECONFIG, prev_intrpt_state);
		GTMCRYPT_REPORT_ERROR(gtmcrypt_errno, rts_error, seg->fname_len, seg->fname);
	}
	reorg_encrypt_restart_csa = NULL;
	ENABLE_INTERRUPTS(INTRPT_IN_CRYPT_RECONFIG, prev_intrpt_state);
}
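/* The function above shows the defer/enable bracketing pattern: interrupts are deferred across the heavyweight key-handle
 * open, and the error path re-enables them before control escapes via the error report. Below is a minimal standalone
 * sketch of that pattern (not GT.M code; defer_interrupts, enable_interrupts and do_heavy_operation are hypothetical
 * stand-ins for DEFER_INTERRUPTS, ENABLE_INTERRUPTS and INIT_DB_OR_JNL_ENCRYPTION).
 */
#include <assert.h>
#include <stdio.h>

static int	deferral_depth;	/* nonzero => asynchronous interrupts are being held off */

static void defer_interrupts(int *prev_state)
{
	*prev_state = deferral_depth++;	/* save caller's state so nested deferrals restore correctly */
}

static void enable_interrupts(int prev_state)
{
	deferral_depth = prev_state;
}

static int do_heavy_operation(void)
{
	return 0;	/* 0 = success; a nonzero return models gtmcrypt_errno */
}

static void process_restart_sketch(void)
{
	int	prev_state, errno_like;

	defer_interrupts(&prev_state);
	errno_like = do_heavy_operation();
	if (0 != errno_like)
	{	/* restore interrupt state BEFORE reporting, since the report path may never return */
		enable_interrupts(prev_state);
		fprintf(stderr, "operation failed: %d\n", errno_like);
		return;
	}
	enable_interrupts(prev_state);
	assert(0 == deferral_depth);
}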
void gtmsource_onln_rlbk_clnup(void)
{
	gtmsource_local_ptr_t	gtmsource_local;
	boolean_t		was_crit;
	sgmnt_addrs		*repl_csa;

	gtmsource_local = jnlpool.gtmsource_local;
	repl_csa = &FILE_INFO(jnlpool.jnlpool_dummy_reg)->s_addrs;
	was_crit = repl_csa->now_crit;
	assert(!repl_csa->hold_onto_crit);
	assert(was_crit || (process_id == gtmsource_local->gtmsource_srv_latch.u.parts.latch_pid)
		|| (0 != have_crit(CRIT_HAVE_ANY_REG)));
	/* Reset source server context to indicate a fresh connection that is about to take place */
	assert(NULL != gtmsource_local);
	if (NULL != gtmsource_local)
	{	/* If ROLLBACK has not taken the instance past the source server's read_jnl_seqno, then the source server
		 * should just continue from where it currently is and start sending the journal records from that point
		 * onwards. But this is non-trivial. The reason is that when the source server detected the online rollback,
		 * it could be in the READ_POOL state. But, since the instance has been rolled back, the journal pool cannot
		 * be relied upon in its entirety. To illustrate this, consider that the journal pool contains the data for
		 * sequence numbers 1-100 and the source server is currently sending sequence number 30 and is reading from
		 * the pool. Assume an online rollback happens that takes the instance from sequence number 100 to sequence
		 * number 80 and leaves the journal pool write_addr and early_write_addr untouched. Now, let's say a GT.M
		 * process comes in after this and does a few more updates. All of these updates will be written in the
		 * journal pool right after the "old-rolled-back" sequence number 100. If the source server continues to
		 * read from the pool, it will send the valid data until sequence number 80. After that, it will start
		 * sending the "old-rolled-back" sequence numbers 81-100, which is not right. To avoid this, rollback should
		 * set the write_addr and early_write_addr by searching in the journal pool for sequence number 81. This is
		 * currently not done, but is something that we can think about when it comes to optimization. Until then,
		 * force rollback to reset the jnlpool's write_addr, write and early_write_addr to 0 and let the source
		 * server be forced into READ_FILE mode.
		 */
		gtmsource_local->read_state = READ_FILE;
		/* Set the state which gets bubbled up the call chain to gtmsource_process, at which point we will close and
		 * re-establish the connection with the other end.
		 */
		gtmsource_local->gtmsource_state = gtmsource_state = GTMSOURCE_HANDLE_ONLN_RLBK;
		if (!was_crit)
			grab_lock(jnlpool.jnlpool_dummy_reg, TRUE, ASSERT_NO_ONLINE_ROLLBACK);
		/* We have to let the read-files logic know that until we have sent data up to the current journal sequence
		 * number at this point, we cannot rely on the journal pool. Indicate this through the
		 * gtmsource_save_read_jnl_seqno global variable.
		 */
		gtmsource_save_read_jnl_seqno = jnlpool.jnlpool_ctl->jnl_seqno;
		gtmsource_local->read = jnlpool.jnlpool_ctl->write;
		gtmsource_local->read_addr = jnlpool.jnlpool_ctl->write_addr;
		if (!was_crit)
			rel_lock(jnlpool.jnlpool_dummy_reg);
	}
	return;
}
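/* A standalone sketch (hypothetical names, not GT.M code) of the rule the comment above encodes: the sequence number
 * saved in gtmsource_save_read_jnl_seqno marks the boundary below which journal-pool content cannot be trusted after an
 * online rollback, so the source server reads from the journal files until it has sent everything up to that point.
 */
#include <stdio.h>

typedef unsigned long long	seqno_t;

enum read_state_sketch { READ_POOL_SKETCH, READ_FILE_SKETCH };

/* save_seqno : instance jnl_seqno captured at rollback-cleanup time
 * next_seqno : next sequence number the source server wants to send
 */
static enum read_state_sketch choose_read_source(seqno_t next_seqno, seqno_t save_seqno)
{
	/* Below save_seqno the pool may still hold pre-rollback records (81-100 in the example above),
	 * so those sequence numbers must come from the journal files instead.
	 */
	return (next_seqno < save_seqno) ? READ_FILE_SKETCH : READ_POOL_SKETCH;
}

int main(void)
{
	/* Worked example from the comment: rollback took the instance from 100 back to 80 */
	printf("seqno 30 : %s\n", (READ_FILE_SKETCH == choose_read_source(30, 80)) ? "read files" : "read pool");
	printf("seqno 80 : %s\n", (READ_FILE_SKETCH == choose_read_source(80, 80)) ? "read files" : "read pool");
	return 0;
}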
void deferred_signal_handler(void)
{
	void	(*signal_routine)();

	error_def(ERR_KILLBYSIG);
	error_def(ERR_KILLBYSIGUINFO);
	error_def(ERR_KILLBYSIGSINFO1);
	error_def(ERR_KILLBYSIGSINFO2);

	/* To avoid nested calls to this routine, we set forced_exit to FALSE at the very beginning */
	forced_exit = FALSE;
	if (exit_handler_active)
	{
		assert(FALSE);	/* at this point in time (June 2003) there is no way we know of to get here, hence the assert */
		return;		/* since anyway we are exiting currently, resume exit handling instead of reissuing another one */
	}
	/* For signals that get a delayed response so we can get out of crit, we also delay the messages.
	 * This routine will output those delayed messages from the appropriate structures to both the
	 * user and the system console.
	 */
	/* Note: can't use a switch here because the ERR_xxx values are not defined as constants */
	if (ERR_KILLBYSIG == forced_exit_err)
	{
		send_msg(VARLSTCNT(6) ERR_KILLBYSIG, 4, GTMIMAGENAMETXT(image_type), process_id, signal_info.signal);
		gtm_putmsg(VARLSTCNT(6) ERR_KILLBYSIG, 4, GTMIMAGENAMETXT(image_type), process_id, signal_info.signal);
	} else if (ERR_KILLBYSIGUINFO == forced_exit_err)
	{
		send_msg(VARLSTCNT(8) ERR_KILLBYSIGUINFO, 6, GTMIMAGENAMETXT(image_type), process_id, signal_info.signal,
			signal_info.send_pid, signal_info.send_uid);
		gtm_putmsg(VARLSTCNT(8) ERR_KILLBYSIGUINFO, 6, GTMIMAGENAMETXT(image_type), process_id, signal_info.signal,
			signal_info.send_pid, signal_info.send_uid);
	} else if (ERR_KILLBYSIGSINFO1 == forced_exit_err)
	{
		send_msg(VARLSTCNT(8) ERR_KILLBYSIGSINFO1, 6, GTMIMAGENAMETXT(image_type), process_id, signal_info.signal,
			signal_info.int_iadr, signal_info.bad_vadr);
		gtm_putmsg(VARLSTCNT(8) ERR_KILLBYSIGSINFO1, 6, GTMIMAGENAMETXT(image_type), process_id, signal_info.signal,
			signal_info.int_iadr, signal_info.bad_vadr);
	} else if (ERR_KILLBYSIGSINFO2 == forced_exit_err)
	{
		send_msg(VARLSTCNT(7) ERR_KILLBYSIGSINFO2, 5, GTMIMAGENAMETXT(image_type), process_id, signal_info.signal,
			signal_info.int_iadr);
		gtm_putmsg(VARLSTCNT(7) ERR_KILLBYSIGSINFO2, 5, GTMIMAGENAMETXT(image_type), process_id, signal_info.signal,
			signal_info.int_iadr);
	} else
	{
		send_msg(VARLSTCNT(1) forced_exit_err);
		gtm_putmsg(VARLSTCNT(1) forced_exit_err);
	}
	/* As on VMS, a mupip stop does not drive the condition handlers unless we are in crit */
	if ((0 != have_crit(CRIT_HAVE_ANY_REG) || SIGTERM != exi_condition) && CHANDLER_EXISTS)
		DRIVECH(0);
	/* If a special routine was registered to be driven on a signal, drive it now */
	if (0 != exi_condition && call_on_signal)
	{
		signal_routine = call_on_signal;
		call_on_signal = NULL;	/* So we don't recursively call ourselves */
		(*signal_routine)();
	}
	/* If the condition handler didn't cause an exit, drive the defined exit handler */
	exit(-exi_condition);
}
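/* A standalone sketch (hypothetical names, not GT.M code) of the one-shot call_on_signal protocol used above: the pointer
 * is copied and cleared BEFORE the routine is driven, so a fault or re-entry from inside the routine cannot run the
 * cleanup twice.
 */
#include <stddef.h>
#include <stdio.h>

static void	(*call_on_signal_sketch)(void);	/* registered cleanup hook; NULL if none */

static void my_cleanup(void)
{
	puts("cleanup ran once");
}

static void drive_signal_hook(void)
{
	void	(*routine)(void);

	if (NULL != call_on_signal_sketch)
	{
		routine = call_on_signal_sketch;
		call_on_signal_sketch = NULL;	/* clear first so we don't recursively call ourselves */
		(*routine)();
	}
}

int main(void)
{
	call_on_signal_sketch = my_cleanup;
	drive_signal_hook();	/* runs the hook */
	drive_signal_hook();	/* no-op: the hook was consumed by the first call */
	return 0;
}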
void mutex_deadlock_check(mutex_struct_ptr_t criticalPtr)
{
	tp_region	*tr;
	sgmnt_addrs	*csa;
	int4		save_crit_count;

	if (in_mutex_deadlock_check)
		return;
	in_mutex_deadlock_check = TRUE;
	/* A zero value of "crit_count" implies asynchronous activities can occur (e.g. db flush timer, periodic epoch
	 * timers etc.). At this point, although we are here through grab_crit()/grab_lock() (which would have incremented
	 * "crit_count"), we are in a safe and consistent state as far as the mutex structures go, so it is ok to set
	 * "crit_count" to 0, implying we are now in an interruptible state (of course, we need to restore "crit_count" to
	 * what it was before returning). The other alternative of not changing "crit_count" presents us with complex
	 * situations wherein recursion of grab_crit/rel_crit might occur (through direct or indirect calls from
	 * mutex_deadlock_check()) causing crit_count to be > 1 and in turn causing the crit_count-reset-logic in
	 * grab_crit/rel_crit to do a "crit_count--" (instead of "crit_count = 0"). This suffers from the problem that in
	 * an error code path crit_count might not get decremented appropriately and hence become out-of-sync (i.e. a
	 * positive value instead of zero), and a non-zero value might cause indefinite deferrals of asynchronous events.
	 */
	assert(1 == crit_count);
	save_crit_count = crit_count;
	crit_count = 0;
	/* Need to determine who should and should not go through the deadlock checker.
	 *
	 * List of who needs to be considered
	 * ----------------------------------
	 * -> GT.M, Update process, MUPIP LOAD and GT.CM GNP/OMI server : since they go through t_end() to update the
	 *    database. Note that all of the above (and only those) have the "is_replicator" flag set to TRUE.
	 * -> MUPIP REORG, since it does non-TP transactions and goes through t_end() (has "mu_reorg_process" flag set).
	 *
	 * List of who does not need to be considered (with reasons)
	 * ---------------------------------------------------------
	 * -> MUPIP RECOVER can hold crit on several regions (through TP or non-TP transactions).
	 *    But it has standalone access and hence no possibility of a deadlock.
	 * -> MUPIP RESTORE too holds standalone access so does not need to be considered.
	 * -> Source Server, Receiver Server etc. can hold only one CRIT resource at any point of time.
	 * -> DSE, MUPIP BACKUP, MUPIP SET JOURNAL etc. can legitimately hold crit on several regions though in non-TP.
	 */
	if (is_replicator || mu_reorg_process)
	{
		if (0 == dollar_tlevel)
		{
			if ((NULL != jnlpool.jnlpool_dummy_reg) && jnlpool.jnlpool_dummy_reg->open)
			{
				++crit_deadlock_check_cycle;
				if (FILE_INFO(jnlpool.jnlpool_dummy_reg)->s_addrs.critical == criticalPtr)
				{	/* grab_lock going for crit on the jnlpool region. gv_cur_region points to the
					 * current region of interest, which better have replication enabled, and be
					 * now crit.
					 */
					assert(cs_addrs == &FILE_INFO(gv_cur_region)->s_addrs);
					csa = &FILE_INFO(gv_cur_region)->s_addrs;
					if (FALSE == csa->now_crit || !REPL_ENABLED(csa->hdr))
						GTMASSERT;	/* should have crit on gv_cur_region before asking
								 * for jnlpool */
					csa->crit_check_cycle = crit_deadlock_check_cycle;	/* allow for crit in
										 * gv_cur_region */
				}
			}
		} else
		{	/* Need to mark the regions allowed to have crit as follows:
			 * Place the current cycle into the csa's of regions allowed to have crit so have_crit() can
			 * easily test. Note that should the system be up long enough for the 2**32 cycle value to wrap
			 * and a region be unused for most of that time, such a region might not be entitled to crit but
			 * have an old csa->crit_check_cycle matching the current crit_deadlock_check_cycle - that case
			 * would not trigger have_crit() to release crit on that region; however, the next call to this
			 * routine increments crit_deadlock_check_cycle and so crit on that region gets released after
			 * two calls instead of (the usual) one.
			 */
			++crit_deadlock_check_cycle;
			for (tr = tp_reg_list; NULL != tr; tr = tr->fPtr)
			{
				if (!tr->reg->open)
					continue;
				csa = &FILE_INFO(tr->reg)->s_addrs;
				if (csa->now_crit)
					csa->crit_check_cycle = crit_deadlock_check_cycle;
				else
				{	/* Seen first non-crit region. Make sure either of the following is true.
					 *   (i) this is the region we are currently grabbing crit on
					 *  (ii) we do not hold crit on any region in the tp_reg_list.
					 * If neither of the above, we have an out of design condition that can only
					 * warrant blowing the process up.
					 */
					if ((csa->critical != criticalPtr) && (tr != tp_reg_list))
						GTMASSERT;
					break;
				}
			}
		}
		/* Release crit in regions not legitimately part of this TP/non-TP transaction */
		have_crit(CRIT_RELEASE | CRIT_NOT_TRANS_REG);
	}
	crit_count = save_crit_count;
	in_mutex_deadlock_check = FALSE;
}
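/* A standalone sketch (hypothetical types and names, not GT.M code) of the cycle-stamping technique mutex_deadlock_check
 * uses: bump a global cycle counter, stamp every region entitled to crit with the new value, then sweep and release crit
 * wherever the stamp is stale. No explicit "to release" list is ever built, and (as noted in the comment above) a wrapped
 * 2**32 counter can at worst delay a release by one extra call.
 */
#include <stdbool.h>

#define NUM_REGIONS_SKETCH	4

typedef struct
{
	bool		now_crit;	/* do we currently hold crit on this region? */
	unsigned int	check_cycle;	/* last cycle in which the hold was legitimized */
} region_sketch;

static region_sketch	regions[NUM_REGIONS_SKETCH];
static unsigned int	deadlock_check_cycle;

static void deadlock_sweep(const bool entitled[NUM_REGIONS_SKETCH])
{
	int	i;

	++deadlock_check_cycle;	/* new cycle: every previous stamp becomes stale */
	for (i = 0; i < NUM_REGIONS_SKETCH; i++)
		if (entitled[i])
			regions[i].check_cycle = deadlock_check_cycle;	/* legitimize this hold */
	for (i = 0; i < NUM_REGIONS_SKETCH; i++)
		if (regions[i].now_crit && (regions[i].check_cycle != deadlock_check_cycle))
			regions[i].now_crit = false;	/* stands in for releasing crit on the region */
}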
/* Note we don't increment fast_lock_count as part of getting the latch and decrement it when releasing it because
 * ROLLBACK can hold onto this latch for a long while and can do updates in this duration, and we should NOT have a
 * non-zero fast_lock_count as many places like t_begin/dsk_read have asserts to this effect. It is okay to NOT increment
 * fast_lock_count as ROLLBACK anyway has logic to disable interrupts the moment it starts doing database updates.
 */
boolean_t grab_gtmsource_srv_latch(sm_global_latch_ptr_t latch, uint4 max_timeout_in_secs, uint4 onln_rlbk_action)
{
	int		spins, maxspins, retries, max_retries;
	unix_db_info	*udi;
	sgmnt_addrs	*repl_csa;
	boolean_t	cycle_mismatch;

	assert(!have_crit(CRIT_HAVE_ANY_REG));
	udi = FILE_INFO(jnlpool.jnlpool_dummy_reg);
	repl_csa = &udi->s_addrs;
	maxspins = num_additional_processors ? MAX_LOCK_SPINS(LOCK_SPINS, num_additional_processors) : 1;
	max_retries = max_timeout_in_secs * 4 * 1000;	/* outer-loop : X secs, 1 loop in 4 is a sleep of 1 ms */
	for (retries = max_retries - 1; 0 < retries; retries--)
	{
		for (spins = maxspins; 0 < spins; spins--)
		{
			assert(latch->u.parts.latch_pid != process_id);	/* We better not hold it if trying to get it */
			if (GET_SWAPLOCK(latch))
			{
				DEBUG_ONLY(locknl = repl_csa->nl);	/* Use the journal pool to maintain lock history */
				LOCK_HIST("OBTN", latch, process_id, retries);
				DEBUG_ONLY(locknl = NULL);
				if (jnlpool.repl_inst_filehdr->file_corrupt && !jgbl.onlnrlbk)
				{	/* Journal pool indicates an abnormally terminated online rollback. Cannot
					 * continue until the rollback command is re-run to bring the journal pool/file
					 * and instance file to a consistent state.
					 */
					/* No need to release the latch before rts_error (mupip_exit_handler will do it
					 * for us) */
					rts_error(VARLSTCNT(8) ERR_REPLREQROLLBACK, 2, LEN_AND_STR(udi->fn), ERR_TEXT,
						2, LEN_AND_LIT("file_corrupt field in instance file header is set to"
						" TRUE"));
				}
				cycle_mismatch = (repl_csa->onln_rlbk_cycle != jnlpool.jnlpool_ctl->onln_rlbk_cycle);
				assert((ASSERT_NO_ONLINE_ROLLBACK != onln_rlbk_action) || !cycle_mismatch);
				if ((HANDLE_CONCUR_ONLINE_ROLLBACK == onln_rlbk_action) && cycle_mismatch)
				{
					assert(is_src_server);
					SYNC_ONLN_RLBK_CYCLES;
					gtmsource_onln_rlbk_clnup();	/* side-effect : sets gtmsource_state */
					rel_gtmsource_srv_latch(latch);
				}
				return TRUE;
			}
		}
		if (retries & 0x3)
			rel_quant();	/* On all but every 4th pass, do a simple rel_quant */
		else
		{	/* On every 4th pass, we bide for awhile */
			wcs_sleep(LOCK_SLEEP);
			if (RETRY_CASLATCH_CUTOFF == (retries % LOCK_TRIES))
				performCASLatchCheck(latch, TRUE);
		}
	}
	DUMP_LOCKHIST();
	assert(FALSE);
	assert(jnlpool.gtmsource_local && jnlpool.gtmsource_local->gtmsource_pid);
	rts_error(VARLSTCNT(5) ERR_SRVLCKWT2LNG, 2, max_timeout_in_secs, jnlpool.gtmsource_local->gtmsource_pid);
	return FALSE;	/* to keep the compiler happy */
}
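/* A standalone sketch (hypothetical primitives, not GT.M code) of the acquisition loop shape above: a tight
 * compare-and-swap spin bounded by maxspins, then on most outer passes just yield the processor, and on every 4th pass
 * sleep 1 ms, for a total budget of roughly max_timeout_in_secs seconds. try_lock, sched_yield and sleep_ms stand in for
 * GET_SWAPLOCK, rel_quant and wcs_sleep.
 */
#include <sched.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <time.h>

static bool try_lock(atomic_int *latch)
{
	int	expected = 0;	/* 0 = free, 1 = held */

	return atomic_compare_exchange_strong(latch, &expected, 1);
}

static void sleep_ms(long ms)
{
	struct timespec	ts = {0, ms * 1000000L};

	nanosleep(&ts, NULL);
}

static bool grab_latch_sketch(atomic_int *latch, int max_timeout_in_secs, int maxspins)
{
	int	retries, spins;

	/* 4000 outer passes per budgeted second: 1 pass in 4 sleeps 1 ms, the other 3 yield */
	for (retries = max_timeout_in_secs * 4 * 1000 - 1; 0 < retries; retries--)
	{
		for (spins = maxspins; 0 < spins; spins--)
			if (try_lock(latch))
				return true;
		if (retries & 0x3)
			sched_yield();	/* all but every 4th pass: give up the quantum */
		else
			sleep_ms(1);	/* every 4th pass: actually sleep */
	}
	return false;	/* timed out; the real code raises ERR_SRVLCKWT2LNG instead */
}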