int continue_proc(pid_t pid) { DCL_THREADGBL_ACCESS; SETUP_THREADGBL_ACCESS; DEBUG_ONLY(if (!TREF(gtm_usesecshr))) /* Cause debug builds to talk to gtmsecshr more often */ { if (WBTEST_ENABLED(WBTEST_HOLD_GTMSOURCE_SRV_LATCH)) { /* Simulate the kill below, but ignore its return status so that we end up invoking gtmsecshr */ kill(pid, SIGCONT); /* Wait until the target quits so that kill() call by gtmsecshr fails with ESRCH */ while (is_proc_alive(pid, 0)) LONG_SLEEP(1); } else if (0 == kill(pid, SIGCONT)) return 0; else if (ESRCH == errno) { send_msg_csa(CSA_ARG(NULL) VARLSTCNT(5) ERR_NOSUCHPROC, 3, pid, RTS_ERROR_LITERAL("continue")); return ESRCH; } else assert(EINVAL != errno); } return send_mesg2gtmsecshr(CONTINUE_PROCESS, pid, NULL, 0); }
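/* Illustrative sketch (not part of the GT.M sources): the bare kill(SIGCONT)/ESRCH pattern that
 * continue_proc() above wraps.  The function name resume_process_sketch is hypothetical; where the
 * real routine falls back to send_mesg2gtmsecshr() (e.g. on EPERM), this sketch merely reports the
 * errno and returns it.
 */
#include <sys/types.h>
#include <signal.h>
#include <errno.h>
#include <stdio.h>

static int resume_process_sketch(pid_t pid)
{
	if (0 == kill(pid, SIGCONT))
		return 0;		/* signal delivered; target should resume */
	if (ESRCH == errno)
		return ESRCH;		/* no such process: nothing to continue */
	/* Other errors (typically EPERM) are where GT.M asks the privileged gtmsecshr helper to retry */
	fprintf(stderr, "kill(%ld, SIGCONT) failed: errno = %d\n", (long)pid, errno);
	return errno;
}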
int main(int argc, char *argv[]) { DCL_THREADGBL_ACCESS; GTM_THREADGBL_INIT; set_blocksig(); gtm_imagetype_init(DSE_IMAGE); gtm_wcswidth_fnptr = gtm_wcswidth; gtm_env_init(); /* read in all environment variables */ licensed = TRUE; TREF(transform) = TRUE; op_open_ptr = op_open; patch_curr_blk = get_dir_root(); err_init(util_base_ch); GTM_ICU_INIT_IF_NEEDED; /* Note: should be invoked after err_init (since it may error out) and before CLI parsing */ sig_init(generic_signal_handler, dse_ctrlc_handler, suspsigs_handler); atexit(util_exit_handler); SET_LATCH_GLOBAL(&defer_latch, LOCK_AVAILABLE); get_page_size(); stp_init(STP_INITSIZE); rts_stringpool = stringpool; getjobname(); INVOKE_INIT_SECSHR_ADDRS; getzdir(); prealloc_gt_timers(); initialize_pattern_table(); gvinit(); region_init(FALSE); INIT_GBL_ROOT(); /* Needed for GVT initialization */ getjobnum(); util_out_print("!/File !_!AD", TRUE, DB_LEN_STR(gv_cur_region)); util_out_print("Region!_!AD!/", TRUE, REG_LEN_STR(gv_cur_region)); cli_lex_setup(argc, argv); CREATE_DUMMY_GBLDIR(gd_header, original_header, gv_cur_region, gd_map, gd_map_top); gtm_chk_dist(argv[0]); # ifdef DEBUG if ((gtm_white_box_test_case_enabled && (WBTEST_SEMTOOLONG_STACK_TRACE == gtm_white_box_test_case_number) )) { sgmnt_addrs * csa; node_local_ptr_t cnl; csa = &FILE_INFO(gv_cur_region)->s_addrs; cnl = csa->nl; cnl->wbox_test_seq_num = 1; /*Signal the first step and wait here*/ while (2 != cnl->wbox_test_seq_num) /*Wait for another process to get hold of the semaphore and signal next step*/ LONG_SLEEP(10); } # endif if (argc < 2) display_prompt(); io_init(TRUE); while (1) { if (!dse_process(argc)) break; display_prompt(); } dse_exit(); REVERT; }
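/* Illustrative sketch (not part of the GT.M sources): the two-step handshake used by the
 * WBTEST_SEMTOOLONG_STACK_TRACE block above, where one process publishes step 1 in shared memory
 * (cnl->wbox_test_seq_num) and polls until a second process advances it to step 2.  A MAP_SHARED
 * anonymous mapping and fork() stand in for the database node-local area; error handling is omitted
 * and MAP_ANONYMOUS is assumed to be available on the platform.
 */
#include <sys/mman.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
	volatile int *seq = mmap(NULL, sizeof(int), PROT_READ | PROT_WRITE,
				 MAP_SHARED | MAP_ANONYMOUS, -1, 0);

	*seq = 0;
	if (0 == fork())
	{	/* child: plays the "other process" that takes the semaphore and signals the next step */
		while (1 != *seq)
			usleep(1000);
		*seq = 2;
		_exit(0);
	}
	*seq = 1;		/* signal the first step and wait here */
	while (2 != *seq)
		sleep(1);	/* wait for the other process to signal the next step */
	wait(NULL);
	return 0;
}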
void deferred_signal_handler(void) { void (*signal_routine)(); DCL_THREADGBL_ACCESS; SETUP_THREADGBL_ACCESS; /* To avoid nested calls to this routine, we set forced_exit to FALSE at the very beginning */ forced_exit = FALSE; if (exit_handler_active) { assert(FALSE); /* at this point in time (June 2003) there is no way we know of to get here, hence the assert */ return; /* since anyway we are exiting currently, resume exit handling instead of reissuing another one */ } /* For signals that get a delayed response so we can get out of crit, we also delay the messages. * This routine will output those delayed messages from the appropriate structures to both the * user and the system console. */ /* note can't use switch here because ERR_xxx are not defined as constants */ if (ERR_KILLBYSIG == forced_exit_err) { send_msg(VARLSTCNT(6) ERR_KILLBYSIG, 4, GTMIMAGENAMETXT(image_type), process_id, signal_info.signal); gtm_putmsg(VARLSTCNT(6) ERR_KILLBYSIG, 4, GTMIMAGENAMETXT(image_type), process_id, signal_info.signal); } else if (ERR_KILLBYSIGUINFO == forced_exit_err) { send_msg(VARLSTCNT(8) ERR_KILLBYSIGUINFO, 6, GTMIMAGENAMETXT(image_type), process_id, signal_info.signal, signal_info.send_pid, signal_info.send_uid); gtm_putmsg(VARLSTCNT(8) ERR_KILLBYSIGUINFO, 6, GTMIMAGENAMETXT(image_type), process_id, signal_info.signal, signal_info.send_pid, signal_info.send_uid); } else if (ERR_KILLBYSIGSINFO1 == forced_exit_err) { send_msg(VARLSTCNT(8) ERR_KILLBYSIGSINFO1, 6, GTMIMAGENAMETXT(image_type), process_id, signal_info.signal, signal_info.int_iadr, signal_info.bad_vadr); gtm_putmsg(VARLSTCNT(8) ERR_KILLBYSIGSINFO1, 6, GTMIMAGENAMETXT(image_type), process_id, signal_info.signal, signal_info.int_iadr, signal_info.bad_vadr); } else if (ERR_KILLBYSIGSINFO2 == forced_exit_err) { send_msg(VARLSTCNT(7) ERR_KILLBYSIGSINFO2, 5, GTMIMAGENAMETXT(image_type), process_id, signal_info.signal, signal_info.int_iadr); gtm_putmsg(VARLSTCNT(7) ERR_KILLBYSIGSINFO2, 5, GTMIMAGENAMETXT(image_type), process_id, signal_info.signal, signal_info.int_iadr); } else if (ERR_KILLBYSIGSINFO3 == forced_exit_err) { send_msg(VARLSTCNT(7) ERR_KILLBYSIGSINFO3, 5, GTMIMAGENAMETXT(image_type), process_id, signal_info.signal, signal_info.bad_vadr); gtm_putmsg(VARLSTCNT(7) ERR_KILLBYSIGSINFO3, 5, GTMIMAGENAMETXT(image_type), process_id, signal_info.signal, signal_info.bad_vadr); } else if (ERR_FORCEDHALT != forced_exit_err || !gtm_quiet_halt) { /* No HALT messages if quiet halt is requested */ send_msg(VARLSTCNT(1) forced_exit_err); gtm_putmsg(VARLSTCNT(1) forced_exit_err); } assert(OK_TO_INTERRUPT); /* Signal intent to exit BEFORE driving condition handlers. This avoids checks that will otherwise fail (for example * if mdb_condition_handler/preemptive_ch gets called below, that could invoke the RESET_GV_TARGET macro which in turn * would assert that gv_target->gd_csa is equal to cs_addrs. This could not be true in case we were in mainline code * that was interrupted by the flush timer for a different region which in turn was interrupted by an external signal * that would drive us to exit. Setting the "process_exiting" variable causes those csa checks to pass. 
*/ SET_PROCESS_EXITING_TRUE; # ifdef DEBUG if (gtm_white_box_test_case_enabled && (WBTEST_DEFERRED_TIMERS == gtm_white_box_test_case_number) && (2 == gtm_white_box_test_case_count)) { DEFER_INTERRUPTS(INTRPT_NO_TIMER_EVENTS); DBGFPF((stderr, "DEFERRED_SIGNAL_HANDLER: will sleep for 20 seconds\n")); LONG_SLEEP(20); DBGFPF((stderr, "DEFERRED_SIGNAL_HANDLER: done sleeping\n")); ENABLE_INTERRUPTS(INTRPT_NO_TIMER_EVENTS); } # endif /* If any special routines are registered to be driven on a signal, drive them now */ if ((0 != exi_condition) && (NULL != call_on_signal)) { signal_routine = call_on_signal; call_on_signal = NULL; /* So we don't recursively call ourselves */ (*signal_routine)(); } /* Note, we do not drive create_fatal_error zshow_dmp() in this routine since any deferrable signals are * by definition not fatal. */ exit(-exi_condition); }
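/* Illustrative sketch (not part of the GT.M sources): the "consume the hook before driving it"
 * pattern deferred_signal_handler() uses for call_on_signal, which keeps the hook from being
 * re-entered if it raises another deferred signal.  The names below are hypothetical.
 */
#include <stddef.h>

typedef void (*signal_hook_t)(void);

static signal_hook_t call_on_signal_sketch = NULL;	/* registered by the interested module */

static void drive_signal_hook_sketch(void)
{
	signal_hook_t hook;

	if (NULL != call_on_signal_sketch)
	{
		hook = call_on_signal_sketch;
		call_on_signal_sketch = NULL;	/* clear first, so we don't recursively call ourselves */
		(*hook)();
	}
}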
uint4 gdsfilext(uint4 blocks, uint4 filesize, boolean_t trans_in_prog) { sm_uc_ptr_t old_base[2], mmap_retaddr; boolean_t was_crit, is_mm; int result, save_errno, status; DEBUG_ONLY(int first_save_errno); uint4 new_bit_maps, bplmap, map, new_blocks, new_total, max_tot_blks, old_total; uint4 jnl_status; gtm_uint64_t avail_blocks, mmap_sz; off_t new_eof, new_size; trans_num curr_tn; unix_db_info *udi; inctn_opcode_t save_inctn_opcode; int4 prev_extend_blks_to_upgrd; jnl_private_control *jpc; jnl_buffer_ptr_t jbp; cache_rec_ptr_t cr; DCL_THREADGBL_ACCESS; SETUP_THREADGBL_ACCESS; assert(!IS_DSE_IMAGE); assert((cs_addrs->nl == NULL) || (process_id != cs_addrs->nl->trunc_pid)); /* mu_truncate shouldn't extend file... */ assert(!process_exiting); DEBUG_ONLY(old_base[0] = old_base[1] = NULL); assert(!gv_cur_region->read_only); udi = FILE_INFO(gv_cur_region); is_mm = (dba_mm == cs_addrs->hdr->acc_meth); # if !defined(MM_FILE_EXT_OK) if (!udi->grabbed_access_sem && is_mm) return (uint4)(NO_FREE_SPACE); /* should this be changed to show extension not allowed ? */ # endif /* Both blocks and total blocks are unsigned ints so make sure we aren't asking for huge numbers that will overflow and end up doing silly things. */ assert((blocks <= (MAXTOTALBLKS(cs_data) - cs_data->trans_hist.total_blks)) || WBTEST_ENABLED(WBTEST_FILE_EXTEND_ERROR)); # if defined(__sun) || defined(__hpux) cs_data->defer_allocate = TRUE; # endif if (!blocks && (cs_data->defer_allocate || (TRANS_IN_PROG_TRUE == trans_in_prog))) return (uint4)(NO_FREE_SPACE); /* should this be changed to show extension not enabled ? */ bplmap = cs_data->bplmap; /* New total of non-bitmap blocks will be number of current, non-bitmap blocks, plus new blocks desired * There are (bplmap - 1) non-bitmap blocks per bitmap, so add (bplmap - 2) to number of non-bitmap blocks * and divide by (bplmap - 1) to get total number of bitmaps for expanded database. (must round up in this * manner as every non-bitmap block must have an associated bitmap) * Current number of bitmaps is (total number of current blocks + bplmap - 1) / bplmap. * Subtract current number of bitmaps from number needed for expanded database to get number of new bitmaps needed. */ new_bit_maps = DIVIDE_ROUND_UP(cs_data->trans_hist.total_blks - DIVIDE_ROUND_UP(cs_data->trans_hist.total_blks, bplmap) + blocks, bplmap - 1) - DIVIDE_ROUND_UP(cs_data->trans_hist.total_blks, bplmap); new_blocks = blocks + new_bit_maps; assert((0 < (int)new_blocks) || (!cs_data->defer_allocate && (0 == new_blocks))); if (new_blocks + cs_data->trans_hist.total_blks > MAXTOTALBLKS(cs_data)) { assert(WBTEST_ENABLED(WBTEST_FILE_EXTEND_ERROR)); send_msg_csa(CSA_ARG(cs_addrs) VARLSTCNT(1) ERR_TOTALBLKMAX); return (uint4)(NO_FREE_SPACE); } if (0 != (save_errno = disk_block_available(udi->fd, &avail_blocks, FALSE))) { send_msg_csa(CSA_ARG(cs_addrs) VARLSTCNT(5) ERR_DBFILERR, 2, DB_LEN_STR(gv_cur_region), save_errno); rts_error_csa(CSA_ARG(cs_addrs) VARLSTCNT(5) ERR_DBFILERR, 2, DB_LEN_STR(gv_cur_region), save_errno); } else { if (!(gtmDebugLevel & GDL_IgnoreAvailSpace)) { /* Bypass this space check if debug flag above is on. Allows us to create a large sparce DB * in space it could never fit it if wasn't sparse. Needed for some tests. 
*/ avail_blocks = avail_blocks / (cs_data->blk_size / DISK_BLOCK_SIZE); if ((blocks * EXTEND_WARNING_FACTOR) > avail_blocks) { if (blocks > (uint4)avail_blocks) { if (!INST_FREEZE_ON_NOSPC_ENABLED(cs_addrs)) return (uint4)(NO_FREE_SPACE); else send_msg_csa(CSA_ARG(cs_addrs) VARLSTCNT(6) MAKE_MSG_WARNING(ERR_NOSPACEEXT), 4, DB_LEN_STR(gv_cur_region), new_blocks, (uint4)avail_blocks); } else send_msg_csa(CSA_ARG(cs_addrs) VARLSTCNT(5) ERR_DSKSPACEFLOW, 3, DB_LEN_STR(gv_cur_region), (uint4)(avail_blocks - ((new_blocks <= avail_blocks) ? new_blocks : 0))); } } } # ifdef DEBUG if (WBTEST_ENABLED(WBTEST_MM_CONCURRENT_FILE_EXTEND) && dollar_tlevel && !MEMCMP_LIT(gv_cur_region->rname, "DEFAULT")) { SYSTEM("$gtm_dist/mumps -run $gtm_wbox_mrtn"); assert(1 == cs_addrs->nl->wbox_test_seq_num); /* should have been set by mubfilcpy */ cs_addrs->nl->wbox_test_seq_num = 2; /* signal mupip backup to stop sleeping in mubfilcpy */ } # endif /* From here on, we need to use GDSFILEXT_CLNUP before returning to the caller */ was_crit = cs_addrs->now_crit; assert(!cs_addrs->hold_onto_crit || was_crit); /* If we are coming from mupip_extend (which gets crit itself) we better have waited for any unfreezes to occur. * If we are coming from online rollback (when that feature is available), we will come in holding crit and in * the final retry. In that case too, we expect to have waited for unfreezes to occur in the caller itself. * Therefore if we are coming in holding crit from MUPIP, we expect the db to be unfrozen so no need to wait for * freeze. * If we are coming from GT.M and final retry (in which case we come in holding crit) we expect to have waited * for any unfreezes (by invoking tp_crit_all_regions) to occur (TP or non-TP) before coming into this * function. However, there is one exception. In the final retry, if tp_crit_all_regions notices that * at least one of the participating regions did ONLY READs, it will not wait for any freeze on THAT region * to complete before grabbing crit. Later, in the final retry, if THAT region did an update which caused * op_tcommit to invoke bm_getfree->gdsfilext, then we would have come here with a frozen region on which * we hold crit. */ assert(!was_crit || !FROZEN_HARD(cs_data) || (dollar_tlevel && (CDB_STAGNATE <= t_tries))); /* * If we are in the final retry and already hold crit, it is possible that csa->nl->wc_blocked is also set to TRUE * (by a concurrent process in phase2 which encountered an error in the midst of commit and secshr_db_clnup * finished the job for it). In this case we do NOT want to invoke wcs_recover as that will update the "bt" * transaction numbers without correspondingly updating the history transaction numbers (effectively causing * a cdb_sc_blkmod type of restart). Therefore do NOT call grab_crit (which unconditionally invokes wcs_recover) * if we already hold crit. */ if (!was_crit) { for ( ; ; ) { grab_crit(gv_cur_region); if (FROZEN_CHILLED(cs_data)) DO_CHILLED_AUTORELEASE(cs_addrs, cs_data); if (!FROZEN(cs_data) && !IS_REPL_INST_FROZEN) break; rel_crit(gv_cur_region); while (FROZEN(cs_data) || IS_REPL_INST_FROZEN) { hiber_start(1000); if (FROZEN_CHILLED(cs_data) && CHILLED_AUTORELEASE(cs_data)) break; } } } else if (FROZEN_HARD(cs_data) && dollar_tlevel) { /* We don't want to continue with file extension as explained above. Hence return with an error code which * op_tcommit will recognize (as a cdb_sc_needcrit/cdb_sc_instancefreeze type of restart) and restart accordingly. 
*/ assert(CDB_STAGNATE <= t_tries); GDSFILEXT_CLNUP; return (uint4)FINAL_RETRY_FREEZE_PROG; } else WAIT_FOR_REGION_TO_UNCHILL(cs_addrs, cs_data); if (IS_REPL_INST_FROZEN && trans_in_prog) { assert(CDB_STAGNATE <= t_tries); GDSFILEXT_CLNUP; return (uint4)FINAL_RETRY_INST_FREEZE; } assert(cs_addrs->ti->total_blks == cs_data->trans_hist.total_blks); old_total = cs_data->trans_hist.total_blks; if (old_total != filesize) { /* Somebody else has already extended it, since we are in crit, this is trust-worthy. However, in case of MM, * we still need to remap the database */ assert((old_total > filesize) || !is_mm); /* For BG, someone else could have truncated or extended - we have no idea */ GDSFILEXT_CLNUP; return (SS_NORMAL); } if (trans_in_prog && SUSPICIOUS_EXTEND) { if (!was_crit) { GDSFILEXT_CLNUP; return (uint4)(EXTEND_SUSPECT); } /* If free_blocks counter is not ok, then correct it. Do the check again. If still fails, then it means we held * crit through bm_getfree into gdsfilext and still didn't get it right. */ assertpro(!is_free_blks_ctr_ok() && !SUSPICIOUS_EXTEND); } if (JNL_ENABLED(cs_data)) { if (!jgbl.dont_reset_gbl_jrec_time) SET_GBL_JREC_TIME; /* needed before jnl_ensure_open as that can write jnl records */ jpc = cs_addrs->jnl; jbp = jpc->jnl_buff; /* Before writing to jnlfile, adjust jgbl.gbl_jrec_time if needed to maintain time order * of jnl records. This needs to be done BEFORE the jnl_ensure_open as that could write * journal records (if it decides to switch to a new journal file). */ ADJUST_GBL_JREC_TIME(jgbl, jbp); jnl_status = jnl_ensure_open(gv_cur_region, cs_addrs); if (jnl_status) { GDSFILEXT_CLNUP; send_msg_csa(CSA_ARG(cs_addrs) VARLSTCNT(6) jnl_status, 4, JNL_LEN_STR(cs_data), DB_LEN_STR(gv_cur_region)); return (uint4)(NO_FREE_SPACE); /* should have better return status */ } } if (is_mm) { cs_addrs->nl->mm_extender_pid = process_id; status = wcs_wtstart(gv_cur_region, 0, NULL, NULL); cs_addrs->nl->mm_extender_pid = 0; assertpro(SS_NORMAL == status); old_base[0] = cs_addrs->db_addrs[0]; old_base[1] = cs_addrs->db_addrs[1]; cs_addrs->db_addrs[0] = NULL; /* don't rely on it until the mmap below */ # ifdef _AIX status = shmdt(old_base[0] - BLK_ZERO_OFF(cs_data->start_vbn)); # else status = munmap((caddr_t)old_base[0], (size_t)(old_base[1] - old_base[0])); # endif if (0 != status) { save_errno = errno; GDSFILEXT_CLNUP; send_msg_csa(CSA_ARG(cs_addrs) VARLSTCNT(12) ERR_DBFILERR, 2, DB_LEN_STR(gv_cur_region), ERR_SYSCALL, 5, LEN_AND_STR(MEM_UNMAP_SYSCALL), CALLFROM, save_errno); return (uint4)(NO_FREE_SPACE); } } else { /* Due to concurrency issues, it is possible some process had issued a disk read of the GDS block# corresponding * to "old_total" right after a truncate wrote a GDS-block of zeros on disk (to signal end of the db file). * If so, the global buffer containing this block needs to be invalidated now as part of the extend. If not, it is * possible the EOF block on disk is now going to be overwritten by a properly initialized bitmap block (as part * of the gdsfilext below) while the global buffer continues to have an incorrect copy of that bitmap block and * this in turn would cause XXXX failures due to a bad bitmap block in shared memory. 
(GTM-7519) */ cr = db_csh_get((block_id)old_total); if ((NULL != cr) && ((cache_rec_ptr_t)CR_NOTVALID != cr)) { assert((0 == cr->dirty) && (0 == cr->bt_index) && !cr->stopped); cr->cycle++; cr->blk = CR_BLKEMPTY; } } CHECK_TN(cs_addrs, cs_data, cs_data->trans_hist.curr_tn); /* can issue rts_error TNTOOLARGE */ new_total = old_total + new_blocks; new_eof = BLK_ZERO_OFF(cs_data->start_vbn) + ((off_t)new_total * cs_data->blk_size); # if !defined(__sun) && !defined(__hpux) if (!cs_data->defer_allocate) { new_size = new_eof + cs_data->blk_size; save_errno = posix_fallocate(udi->fd, 0, new_size); DEBUG_ONLY(first_save_errno = save_errno); if ((ENOSPC == save_errno) && IS_GTM_IMAGE) save_errno = extend_wait_for_fallocate(udi, new_size); if (0 != save_errno) { GDSFILEXT_CLNUP; assert(ENOSPC == save_errno); if (ENOSPC != save_errno) send_msg_csa(CSA_ARG(cs_addrs) VARLSTCNT(5) ERR_PREALLOCATEFAIL, 2, DB_LEN_STR(gv_cur_region), save_errno); return (uint4)(NO_FREE_SPACE); } } # endif save_errno = db_write_eof_block(udi, udi->fd, cs_data->blk_size, new_eof, &(TREF(dio_buff))); if ((ENOSPC == save_errno) && IS_GTM_IMAGE) save_errno = extend_wait_for_write(udi, cs_data->blk_size, new_eof); if (0 != save_errno) { GDSFILEXT_CLNUP; if (ENOSPC != save_errno) send_msg_csa(CSA_ARG(cs_addrs) VARLSTCNT(5) ERR_DBFILERR, 2, DB_LEN_STR(gv_cur_region), save_errno); return (uint4)(NO_FREE_SPACE); } if (WBTEST_ENABLED(WBTEST_FILE_EXTEND_INTERRUPT_1)) { LONG_SLEEP(600); assert(FALSE); } /* Ensure the EOF and metadata get to disk BEFORE any bitmap writes. Otherwise, the file size could no longer reflect * a proper extent and subsequent invocations of gdsfilext could corrupt the database. */ if (!IS_STATSDB_CSA(cs_addrs)) { GTM_DB_FSYNC(cs_addrs, udi->fd, status); assert(0 == status); if (0 != status) { GDSFILEXT_CLNUP; send_msg_csa(CSA_ARG(cs_addrs) VARLSTCNT(8) ERR_DBFILERR, 5, RTS_ERROR_LITERAL("fsync1()"), CALLFROM, status); return (uint4)(NO_FREE_SPACE); } } if (WBTEST_ENABLED(WBTEST_FILE_EXTEND_INTERRUPT_2)) { LONG_SLEEP(600); assert(FALSE); /* Should be killed before that */ } DEBUG_ONLY(prev_extend_blks_to_upgrd = cs_data->blks_to_upgrd;)
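/* Illustrative sketch (not part of the GT.M sources): the local-bitmap arithmetic described in the
 * comment above new_bit_maps in gdsfilext().  Every group of bplmap blocks begins with one bitmap
 * block, so only (bplmap - 1) data blocks fit per bitmap; an extension must therefore add enough
 * new bitmaps to cover the requested data blocks.  The constants below are examples only.
 */
#include <stdio.h>

#define DIV_ROUND_UP(x, y)	(((x) + (y) - 1) / (y))

int main(void)
{
	unsigned int total_blks = 10000;	/* current total blocks, bitmaps included */
	unsigned int blocks = 1000;		/* data blocks requested by the extension */
	unsigned int bplmap = 512;		/* blocks per local bitmap */
	unsigned int cur_maps, new_maps, new_bit_maps;

	cur_maps = DIV_ROUND_UP(total_blks, bplmap);			/* bitmaps already present */
	new_maps = DIV_ROUND_UP(total_blks - cur_maps + blocks, bplmap - 1);
	new_bit_maps = new_maps - cur_maps;				/* extra bitmaps the extension needs */
	printf("extension adds %u data blocks plus %u new bitmap blocks\n", blocks, new_bit_maps);
	return 0;
}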
void mu_int_reg(gd_region *reg, boolean_t *return_value, boolean_t return_after_open) { boolean_t read_only, was_crit; freeze_status status; node_local_ptr_t cnl; sgmnt_addrs *csa; sgmnt_data_ptr_t csd; sgmnt_data *csd_copy_ptr; gd_segment *seg; int gtmcrypt_errno; # ifdef DEBUG boolean_t need_to_wait = FALSE; int trynum; uint4 curr_wbox_seq_num; # endif *return_value = FALSE; jnlpool_init_needed = TRUE; ESTABLISH(mu_int_reg_ch); if (dba_usr == reg->dyn.addr->acc_meth) { util_out_print("!/Can't integ region !AD; not GDS format", TRUE, REG_LEN_STR(reg)); mu_int_skipreg_cnt++; return; } gv_cur_region = reg; if (reg_cmcheck(reg)) { util_out_print("!/Can't integ region across network", TRUE); mu_int_skipreg_cnt++; return; } gvcst_init(gv_cur_region); if (gv_cur_region->was_open) { /* already open under another name */ gv_cur_region->open = FALSE; return; } if (return_after_open) { *return_value = TRUE; return; } change_reg(); csa = &FILE_INFO(gv_cur_region)->s_addrs; cnl = csa->nl; csd = csa->hdr; read_only = gv_cur_region->read_only; assert(NULL != mu_int_master); /* Ensure that we don't see an increase in the file header and master map size compared to it's maximum values */ assert(SGMNT_HDR_LEN >= SIZEOF(sgmnt_data) && (MASTER_MAP_SIZE_MAX >= MASTER_MAP_SIZE(csd))); /* ONLINE INTEG if asked for explicitly by specifying -ONLINE is an error if the db has partial V4 blocks. * However, if -ONLINE is not explicitly specified but rather assumed implicitly (as default for -REG) * then turn off ONLINE INTEG for this region and continue as if -NOONLINE was specified */ if (!csd->fully_upgraded) { ointeg_this_reg = FALSE; /* Turn off ONLINE INTEG for this region */ if (online_specified) { gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(4) ERR_SSV4NOALLOW, 2, DB_LEN_STR(gv_cur_region)); util_out_print(NO_ONLINE_ERR_MSG, TRUE); mu_int_skipreg_cnt++; return; } } if (!ointeg_this_reg || read_only) { status = region_freeze(gv_cur_region, TRUE, FALSE, TRUE, FALSE, !read_only); switch (status) { case REG_ALREADY_FROZEN: if (csa->read_only_fs) break; util_out_print("!/Database for region !AD is already frozen, not integing", TRUE, REG_LEN_STR(gv_cur_region)); mu_int_skipreg_cnt++; return; case REG_FLUSH_ERROR: gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(6) ERR_BUFFLUFAILED, 4, LEN_AND_LIT(MUPIP_INTEG), DB_LEN_STR(gv_cur_region)); mu_int_skipreg_cnt++; return; case REG_HAS_KIP: /* We have already waited for KIP to reset. This time do not wait for KIP */ status = region_freeze(gv_cur_region, TRUE, FALSE, FALSE, FALSE, !read_only); if (REG_ALREADY_FROZEN == status) { if (csa->read_only_fs) break; util_out_print("!/Database for region !AD is already frozen, not integing", TRUE, REG_LEN_STR(gv_cur_region)); mu_int_skipreg_cnt++; return; } else if (REG_FLUSH_ERROR == status) { gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(6) ERR_BUFFLUFAILED, 4, LEN_AND_LIT(MUPIP_INTEG), DB_LEN_STR(gv_cur_region)); mu_int_skipreg_cnt++; return; } assert(REG_FREEZE_SUCCESS == status); /* no break */ case REG_FREEZE_SUCCESS: break; default: assert(FALSE); /* no break */ } if (read_only && (dba_bg == csa->hdr->acc_meth) && !mu_int_wait_rdonly(csa, MUPIP_INTEG)) { mu_int_skipreg_cnt++; return; } } if (!ointeg_this_reg) { /* Take a copy of the file-header. To ensure it is consistent, do it while holding crit. 
*/ was_crit = csa->now_crit; if (!was_crit) grab_crit(gv_cur_region); memcpy((uchar_ptr_t)&mu_int_data, (uchar_ptr_t)csd, SIZEOF(sgmnt_data)); if (!was_crit) rel_crit(gv_cur_region); memcpy(mu_int_master, MM_ADDR(csd), MASTER_MAP_SIZE(csd)); csd_copy_ptr = &mu_int_data; } else { if (!ss_initiate(gv_cur_region, util_ss_ptr, &csa->ss_ctx, preserve_snapshot, MUPIP_INTEG)) { mu_int_skipreg_cnt++; assert(NULL != csa->ss_ctx); ss_release(&csa->ss_ctx); ointeg_this_reg = FALSE; /* Turn off ONLINE INTEG for this region */ assert(process_id != cnl->in_crit); /* Ensure ss_initiate released the crit before returning */ assert(!FROZEN_HARD(csd)); /* Ensure region is unfrozen before returning from ss_initiate */ assert(INTRPT_IN_SS_INITIATE != intrpt_ok_state); /* Ensure ss_initiate released intrpt_ok_state */ return; } assert(process_id != cnl->in_crit); /* Ensure ss_initiate released the crit before returning */ assert(INTRPT_IN_SS_INITIATE != intrpt_ok_state); /* Ensure ss_initiate released intrpt_ok_state */ csd_copy_ptr = &csa->ss_ctx->ss_shm_ptr->shadow_file_header; # if defined(DEBUG) curr_wbox_seq_num = 1; cnl->wbox_test_seq_num = curr_wbox_seq_num; /* indicate we took the next step */ GTM_WHITE_BOX_TEST(WBTEST_OINTEG_WAIT_ON_START, need_to_wait, TRUE); if (need_to_wait) /* wait for them to take next step */ { trynum = 30; /* given 30 cycles to tell you to go */ while ((curr_wbox_seq_num == cnl->wbox_test_seq_num) && trynum--) LONG_SLEEP(1); cnl->wbox_test_seq_num++; /* let them know we took the next step */ assert(trynum); } # endif } if (USES_ANY_KEY(csd_copy_ptr)) { /* Initialize mu_int_encrypt_key_handle to be used in mu_int_read */ seg = gv_cur_region->dyn.addr; INIT_DB_OR_JNL_ENCRYPTION(&mu_int_encr_handles, csd_copy_ptr, seg->fname_len, (char *)seg->fname, gtmcrypt_errno); if (0 != gtmcrypt_errno) { GTMCRYPT_REPORT_ERROR(gtmcrypt_errno, gtm_putmsg, seg->fname_len, seg->fname); mu_int_skipreg_cnt++; return; } } *return_value = mu_int_fhead(); REVERT; return; }
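/* Illustrative sketch (not part of the GT.M sources): the "copy the file header while holding crit"
 * idea from mu_int_reg() above, with a pthread mutex standing in for grab_crit()/rel_crit() and a
 * hypothetical two-field header standing in for sgmnt_data.
 */
#include <pthread.h>
#include <string.h>

struct header_sketch { unsigned long total_blks; unsigned long curr_tn; };

static struct header_sketch shared_hdr_sketch;				/* lives in shared memory in GT.M */
static pthread_mutex_t crit_sketch = PTHREAD_MUTEX_INITIALIZER;		/* stand-in for region crit */

static void snapshot_header_sketch(struct header_sketch *copy)
{
	pthread_mutex_lock(&crit_sketch);			/* grab_crit() */
	memcpy(copy, &shared_hdr_sketch, sizeof(*copy));	/* consistent point-in-time copy */
	pthread_mutex_unlock(&crit_sketch);			/* rel_crit() */
}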
int gtmrecv_poll_actions1(int *pending_data_len, int *buff_unprocessed, unsigned char *buffp) { static int report_cnt = 1; static int next_report_at = 1; static boolean_t send_xoff = FALSE; static boolean_t xoff_sent = FALSE; static seq_num send_seqno; static boolean_t log_draining_msg = FALSE; static boolean_t send_badtrans = FALSE; static boolean_t send_cmp2uncmp = FALSE; static boolean_t upd_shut_too_early_logged = FALSE; static time_t last_reap_time = 0; repl_msg_t xoff_msg; repl_badtrans_msg_t bad_trans_msg; boolean_t alert = FALSE, info = FALSE; int return_status; gd_region *region_top; unsigned char *msg_ptr; /* needed for REPL_{SEND,RECV}_LOOP */ int tosend_len, sent_len, sent_this_iter; /* needed for REPL_SEND_LOOP */ int torecv_len, recvd_len, recvd_this_iter; /* needed for REPL_RECV_LOOP */ int status, poll_dir; /* needed for REPL_{SEND,RECV}_LOOP */ int temp_len, pending_msg_size; int upd_start_status, upd_start_attempts; int buffered_data_len; int upd_exit_status; seq_num temp_send_seqno; boolean_t bad_trans_detected = FALSE, onln_rlbk_flg_set = FALSE; recvpool_ctl_ptr_t recvpool_ctl; upd_proc_local_ptr_t upd_proc_local; gtmrecv_local_ptr_t gtmrecv_local; upd_helper_ctl_ptr_t upd_helper_ctl; pid_t waitpid_res; int4 msg_type, msg_len; DCL_THREADGBL_ACCESS; SETUP_THREADGBL_ACCESS; recvpool_ctl = recvpool.recvpool_ctl; upd_proc_local = recvpool.upd_proc_local; gtmrecv_local = recvpool.gtmrecv_local; upd_helper_ctl = recvpool.upd_helper_ctl; if (SHUTDOWN == gtmrecv_local->shutdown) { repl_log(gtmrecv_log_fp, TRUE, TRUE, "Shutdown signalled\n"); gtmrecv_end(); /* Won't return */ } # ifdef GTM_TLS /* If we sent a REPL_RENEG_ACK, then we cannot afford to send anymore asynchronous messages (like XOFF_ACK_ME) until we * receive a REPL_RENEG_COMPLETE from the source server. This ensures that while the source server attempts to do a SSL/TLS * renegotiation, it doesn't have any application data (like XOFF_ACK_ME) sitting in the pipe. */ if (REPLTLS_WAITING_FOR_RENEG_COMPLETE == repl_tls.renegotiate_state) return STOP_POLL; # endif /* Reset report_cnt and next_report_at to 1 when a new upd proc is forked */ if ((1 == report_cnt) || (report_cnt == next_report_at)) { /* A comment on the usage of NO_SHUTDOWN below for the alert variable. Since upd_proc_local->upd_proc_shutdown is * a shared memory field (and could be concurrently changed by either the receiver server or the update process), * we want to make sure it is the same value BEFORE and AFTER checking whether the update process is alive or not. * If it is not NO_SHUTDOWN (i.e. is SHUTDOWN or NORMAL_SHUTDOWN or ABNORMAL_SHUTDOWN) it has shut down due to * an external request so we do want to send out a false update-process-is-not-alive alert. 
*/ if ((alert = ((NO_SHUTDOWN == upd_proc_local->upd_proc_shutdown) && (SRV_DEAD == is_updproc_alive()) && (NO_SHUTDOWN == upd_proc_local->upd_proc_shutdown))) || (info = (((NORMAL_SHUTDOWN == upd_proc_local->upd_proc_shutdown) || (ABNORMAL_SHUTDOWN == upd_proc_local->upd_proc_shutdown)) && (SRV_DEAD == is_updproc_alive())))) { if (alert) repl_log(gtmrecv_log_fp, TRUE, TRUE, "ALERT : Receiver Server detected that Update Process is not ALIVE\n"); else repl_log(gtmrecv_log_fp, TRUE, TRUE, "INFO : Update process not running due to user initiated shutdown\n"); if (1 == report_cnt) { send_xoff = TRUE; recvpool_ctl->old_jnl_seqno = recvpool_ctl->jnl_seqno; recvpool_ctl->jnl_seqno = 0; /* Even though we have identified that the update process is NOT alive, a waitpid on the update * process PID is necessary so that the system doesn't leave any zombie process lying around. * This is possible since any child process that dies without the parent doing a waitpid on it * will be defunct unless the parent dies at which point the "init" process takes the role of * the parent and invokes waitpid to remove the zombies. * NOTE: It is possible that the update process was killed before the receiver server got a * chance to record it's PID in the recvpool.upd_proc_local structure. In such a case, don't * invoke waitpid as that will block us (receiver server) if this instance of the receiver * server was started with helper processes. */ if (0 < upd_proc_local->upd_proc_pid) { WAITPID(upd_proc_local->upd_proc_pid, &upd_exit_status, 0, waitpid_res); /* Since the update process as part of its shutdown does NOT reset the upd_proc_pid, reset * it here ONLY if the update process was NOT kill -9ed. This is needed because receiver * server as part of its shutdown relies on this field (upd_proc_pid) to determine if the * update process was cleanly shutdown or was kill -9ed. */ if (!alert) upd_proc_local->upd_proc_pid = 0; } upd_proc_local->bad_trans = FALSE; /* No point in doing bad transaction processing */ upd_proc_local->onln_rlbk_flg = FALSE; /* No point handling online rollback */ } gtmrecv_wait_for_jnl_seqno = TRUE; REPL_DPRINT1( "gtmrecv_poll_actions : Setting gtmrecv_wait_for_jnl_seqno to TRUE because of upd crash/shutdown\n"); next_report_at *= GTMRECV_NEXT_REPORT_FACTOR; report_cnt++; } } else report_cnt++; /* Check if REPL_CMP2UNCMP or REPL_BADTRANS message needs to be sent */ if (upd_proc_local->onln_rlbk_flg) { /* Update process detected an online rollback and is requesting us to restart the connection. But before that, send * REPL_XOFF source side and drain the replication pipe */ onln_rlbk_flg_set = TRUE; send_xoff = TRUE; } else if (!send_cmp2uncmp && gtmrecv_send_cmp2uncmp) { send_xoff = TRUE; send_seqno = recvpool_ctl->jnl_seqno; send_cmp2uncmp = TRUE; } else if (!send_badtrans && upd_proc_local->bad_trans) { send_xoff = TRUE; send_seqno = upd_proc_local->read_jnl_seqno; send_badtrans = TRUE; bad_trans_detected = TRUE; } else if (!upd_proc_local->bad_trans && send_badtrans && 1 != report_cnt) { send_badtrans = FALSE; bad_trans_detected = FALSE; } if (send_xoff && !xoff_sent) { /* Send XOFF_ACK_ME if the receiver has a connection to the source. Do not attempt to send it if we dont even * know the endianness of the remote side. In that case, we are guaranteed no initial handshake occurred and * so no point sending the XOFF too. This saves us lots of trouble in case of cross-endian replication connections. 
*/ assert((FD_INVALID != gtmrecv_sock_fd) || repl_connection_reset); if ((FD_INVALID != gtmrecv_sock_fd) && remote_side->endianness_known) { send_seqno = upd_proc_local->read_jnl_seqno; if (!remote_side->cross_endian) { xoff_msg.type = REPL_XOFF_ACK_ME; xoff_msg.len = MIN_REPL_MSGLEN; memcpy((uchar_ptr_t)&xoff_msg.msg[0], (uchar_ptr_t)&send_seqno, SIZEOF(seq_num)); } else { xoff_msg.type = GTM_BYTESWAP_32(REPL_XOFF_ACK_ME); xoff_msg.len = GTM_BYTESWAP_32(MIN_REPL_MSGLEN); temp_send_seqno = GTM_BYTESWAP_64(send_seqno); memcpy((uchar_ptr_t)&xoff_msg.msg[0], (uchar_ptr_t)&temp_send_seqno, SIZEOF(seq_num)); } REPL_SEND_LOOP(gtmrecv_sock_fd, &xoff_msg, MIN_REPL_MSGLEN, REPL_POLL_NOWAIT) ; /* Empty Body */ if (SS_NORMAL != status) { if (REPL_CONN_RESET(status) && EREPL_SEND == repl_errno) { repl_log(gtmrecv_log_fp, TRUE, TRUE, "Connection reset while sending XOFF_ACK_ME. " "Status = %d ; %s\n", status, STRERROR(status)); repl_close(&gtmrecv_sock_fd); repl_connection_reset = TRUE; xoff_sent = FALSE; send_badtrans = FALSE; } else if (EREPL_SEND == repl_errno) rts_error_csa(CSA_ARG(NULL) VARLSTCNT(7) ERR_REPLCOMM, 0, ERR_TEXT, 2, LEN_AND_LIT("Error sending XOFF msg due to BAD_TRANS or UPD crash/shutdown. " "Error in send"), status); else { assert(EREPL_SELECT == repl_errno); rts_error_csa(CSA_ARG(NULL) VARLSTCNT(7) ERR_REPLCOMM, 0, ERR_TEXT, 2, LEN_AND_LIT("Error sending XOFF msg due to BAD_TRANS or UPD crash/shutdown. " "Error in select"), status); } } else { xoff_sent = TRUE; log_draining_msg = TRUE; } repl_log(gtmrecv_log_fp, TRUE, TRUE, "REPL_XOFF_ACK_ME sent due to upd shutdown/crash or bad trans " "or ONLINE_ROLLBACK\n"); send_xoff = FALSE; } else { /* Connection has been lost OR initial handshake needs to happen again, so no point sending XOFF/BADTRANS */ send_xoff = FALSE; send_badtrans = FALSE; } } /* Drain pipe */ if (xoff_sent) { if (log_draining_msg) { /* avoid multiple logs per instance */ repl_log(gtmrecv_log_fp, TRUE, TRUE, "REPL INFO - Draining replication pipe due to %s\n", send_cmp2uncmp ? "CMP2UNCMP" : (send_badtrans ? "BAD_TRANS" : (onln_rlbk_flg_set ? "ONLINE_ROLLBACK" : "UPD shutdown/crash"))); log_draining_msg = FALSE; } if (0 != *buff_unprocessed) { /* Throw away the current contents of the buffer */ buffered_data_len = ((*pending_data_len <= *buff_unprocessed) ? *pending_data_len : *buff_unprocessed); *buff_unprocessed -= buffered_data_len; buffp += buffered_data_len; *pending_data_len -= buffered_data_len; REPL_DPRINT2("gtmrecv_poll_actions : (1) Throwing away %d bytes from old buffer while draining\n", buffered_data_len); assert(remote_side->endianness_known); /* only then is remote_side->cross_endian reliable */ while (REPL_MSG_HDRLEN <= *buff_unprocessed) { assert(0 == (((unsigned long)buffp) % REPL_MSG_ALIGN)); msg_len = ((repl_msg_ptr_t)buffp)->len; msg_type = ((repl_msg_ptr_t)buffp)->type; if (remote_side->cross_endian) { msg_len = GTM_BYTESWAP_32(msg_len); msg_type = GTM_BYTESWAP_32(msg_type); } msg_type = (msg_type & REPL_TR_CMP_MSG_TYPE_MASK); assert((REPL_TR_CMP_JNL_RECS == msg_type) || (0 == (msg_len % REPL_MSG_ALIGN))); *pending_data_len = ROUND_UP2(msg_len, REPL_MSG_ALIGN); buffered_data_len = ((*pending_data_len <= *buff_unprocessed) ?
*pending_data_len : *buff_unprocessed); *buff_unprocessed -= buffered_data_len; buffp += buffered_data_len; *pending_data_len -= buffered_data_len; REPL_DPRINT3("gtmrecv_poll_actions : (1) Throwing away message of " "type %d and length %d from old buffer while draining\n", msg_type, buffered_data_len); } if (0 < *buff_unprocessed) { memmove((unsigned char *)gtmrecv_msgp, buffp, *buff_unprocessed); REPL_DPRINT2("gtmrecv_poll_actions : Incomplete header of length %d while draining\n", *buff_unprocessed); } } status = SS_NORMAL; if (0 != *buff_unprocessed || 0 == *pending_data_len) { /* Receive the header of a message */ assert(REPL_MSG_HDRLEN > *buff_unprocessed); /* so we dont pass negative length in REPL_RECV_LOOP */ REPL_RECV_LOOP(gtmrecv_sock_fd, ((unsigned char *)gtmrecv_msgp) + *buff_unprocessed, (REPL_MSG_HDRLEN - *buff_unprocessed), REPL_POLL_WAIT) ; /* Empty Body */ if (SS_NORMAL == status) { assert(remote_side->endianness_known); /* only then is remote_side->cross_endian reliable */ if (!remote_side->cross_endian) { msg_len = gtmrecv_msgp->len; msg_type = gtmrecv_msgp->type; } else { msg_len = GTM_BYTESWAP_32(gtmrecv_msgp->len); msg_type = GTM_BYTESWAP_32(gtmrecv_msgp->type); } msg_type = (msg_type & REPL_TR_CMP_MSG_TYPE_MASK); assert((REPL_TR_CMP_JNL_RECS == msg_type) || (0 == (msg_len % REPL_MSG_ALIGN))); msg_len = ROUND_UP2(msg_len, REPL_MSG_ALIGN); REPL_DPRINT3("gtmrecv_poll_actions : Received message of type %d and length %d while draining\n", msg_type, msg_len); } } if ((SS_NORMAL == status) && (0 != *buff_unprocessed || 0 == *pending_data_len) && (REPL_XOFF_ACK == msg_type)) { /* Receive the rest of the XOFF_ACK msg and signal the drain as complete */ REPL_RECV_LOOP(gtmrecv_sock_fd, gtmrecv_msgp, (MIN_REPL_MSGLEN - REPL_MSG_HDRLEN), REPL_POLL_WAIT) ; /* Empty Body */ if (SS_NORMAL == status) { repl_log(gtmrecv_log_fp, TRUE, TRUE, "REPL INFO - XOFF_ACK received. Drained replication pipe completely\n"); upd_shut_too_early_logged = FALSE; xoff_sent = FALSE; return_status = STOP_POLL; } } else if (SS_NORMAL == status) { /* Drain the rest of the message */ if (0 < *pending_data_len) { pending_msg_size = *pending_data_len; REPL_DPRINT2("gtmrecv_poll_actions : (2) Throwing away %d bytes from pipe\n", pending_msg_size); } else { pending_msg_size = msg_len - REPL_MSG_HDRLEN; REPL_DPRINT3("gtmrecv_poll_actions : (2) Throwing away message of " "type %d and length %d from pipe\n", msg_type, msg_len); } for ( ; SS_NORMAL == status && 0 < pending_msg_size; pending_msg_size -= gtmrecv_max_repl_msglen) { temp_len = (pending_msg_size < gtmrecv_max_repl_msglen)? pending_msg_size : gtmrecv_max_repl_msglen; REPL_RECV_LOOP(gtmrecv_sock_fd, gtmrecv_msgp, temp_len, REPL_POLL_WAIT) ; /* Empty Body */ } *buff_unprocessed = 0; *pending_data_len = 0; if (SS_NORMAL == status && info && !upd_shut_too_early_logged) { repl_log(gtmrecv_log_fp, TRUE, TRUE, "ALERT : User initiated shutdown of Update Process done " "when there was data in the replication pipe\n"); upd_shut_too_early_logged = TRUE; } return_status = CONTINUE_POLL; } if (SS_NORMAL != status) { if (EREPL_RECV == repl_errno) { if (REPL_CONN_RESET(status)) { repl_log(gtmrecv_log_fp, TRUE, TRUE, "Connection reset while receiving XOFF_ACK. " "Status = %d ; %s\n", status, STRERROR(status)); repl_close(&gtmrecv_sock_fd); repl_connection_reset = TRUE; xoff_sent = FALSE; send_badtrans = FALSE; return_status = STOP_POLL; } else rts_error_csa(CSA_ARG(NULL) VARLSTCNT(7) ERR_REPLCOMM, 0, ERR_TEXT, 2, LEN_AND_LIT("Error while draining replication pipe. Error in recv"), status);
} else { assert(EREPL_SELECT == repl_errno); rts_error_csa(CSA_ARG(NULL) VARLSTCNT(7) ERR_REPLCOMM, 0, ERR_TEXT, 2, LEN_AND_LIT("Error while draining replication pipe. Error in select"), status); } } } else return_status = STOP_POLL; /* Like was done before for the XOFF_ACK_ME message, send a BADTRANS/CMP2UNCMP message only if we know * the endianness of the other side. If not, no point in sending one anyways and saves us trouble in * case of cross-endian replication connections. */ if ((STOP_POLL == return_status) && (send_badtrans || send_cmp2uncmp) && (FD_INVALID != gtmrecv_sock_fd) && remote_side->endianness_known) { /* Send REPL_BADTRANS or REPL_CMP2UNCMP message */ if (!remote_side->cross_endian) { bad_trans_msg.type = send_cmp2uncmp ? REPL_CMP2UNCMP : REPL_BADTRANS; bad_trans_msg.len = MIN_REPL_MSGLEN; bad_trans_msg.start_seqno = send_seqno; } else { bad_trans_msg.type = send_cmp2uncmp ? GTM_BYTESWAP_32(REPL_CMP2UNCMP) : GTM_BYTESWAP_32(REPL_BADTRANS); bad_trans_msg.len = GTM_BYTESWAP_32(MIN_REPL_MSGLEN); bad_trans_msg.start_seqno = GTM_BYTESWAP_64(send_seqno); } REPL_SEND_LOOP(gtmrecv_sock_fd, &bad_trans_msg, bad_trans_msg.len, REPL_POLL_NOWAIT) ; /* Empty Body */ if (SS_NORMAL == status) { if (send_cmp2uncmp) repl_log(gtmrecv_log_fp, TRUE, TRUE, "REPL_CMP2UNCMP message sent with seqno %llu\n", send_seqno); else repl_log(gtmrecv_log_fp, TRUE, TRUE, "REPL_BADTRANS message sent with seqno %llu\n", send_seqno); } else { if (REPL_CONN_RESET(status) && EREPL_SEND == repl_errno) { if (send_cmp2uncmp) { repl_log(gtmrecv_log_fp, TRUE, TRUE, "Connection reset while sending REPL_CMP2UNCMP. " "Status = %d ; %s\n", status, STRERROR(status)); } else { repl_log(gtmrecv_log_fp, TRUE, TRUE, "Connection reset while sending REPL_BADTRANS. " "Status = %d ; %s\n", status, STRERROR(status)); } repl_close(&gtmrecv_sock_fd); repl_connection_reset = TRUE; return_status = STOP_POLL; } else if (EREPL_SEND == repl_errno) rts_error_csa(CSA_ARG(NULL) VARLSTCNT(7) ERR_REPLCOMM, 0, ERR_TEXT, 2, LEN_AND_LIT("Error sending REPL_BADTRANS/REPL_CMP2UNCMP. Error in send"), status); else { assert(EREPL_SELECT == repl_errno); rts_error_csa(CSA_ARG(NULL) VARLSTCNT(7) ERR_REPLCOMM, 0, ERR_TEXT, 2, LEN_AND_LIT("Error sending REPL_BADTRANS/REPL_CMP2UNCMP. Error in select"), status); } } send_badtrans = FALSE; if (send_cmp2uncmp) { REPL_DPRINT1("gtmrecv_poll_actions : Setting gtmrecv_wait_for_jnl_seqno to TRUE because this receiver" "server requested a fall-back from compressed to uncompressed operation\n"); gtmrecv_wait_for_jnl_seqno = TRUE;/* set this to TRUE to break out and go back to a fresh "do_main_loop" */ gtmrecv_bad_trans_sent = TRUE; gtmrecv_send_cmp2uncmp = FALSE; send_cmp2uncmp = FALSE; } } if ((upd_proc_local->bad_trans && bad_trans_detected) || onln_rlbk_flg_set || (UPDPROC_START == upd_proc_local->start_upd) && (1 != report_cnt)) { if (UPDPROC_START == upd_proc_local->start_upd) { assert(is_updproc_alive() != SRV_ALIVE); upd_proc_local->upd_proc_shutdown = NO_SHUTDOWN; } recvpool_ctl->wrapped = FALSE; recvpool_ctl->write_wrap = recvpool_ctl->recvpool_size; recvpool_ctl->write = 0; /* Reset last_rcvd_histinfo, last_valid_histinfo etc. as they reflect context from unprocessed data * in the receive pool and those are no longer valid because we have drained the receive pool. 
*/ GTMRECV_CLEAR_CACHED_HISTINFO(recvpool.recvpool_ctl, jnlpool, jnlpool_ctl, INSERT_STRM_HISTINFO_FALSE); if (UPDPROC_START == upd_proc_local->start_upd) { /* Attempt starting the update process */ for (upd_start_attempts = 0; UPDPROC_START_ERR == (upd_start_status = gtmrecv_upd_proc_init(FALSE)) && GTMRECV_MAX_UPDSTART_ATTEMPTS > upd_start_attempts; upd_start_attempts++) { if (EREPL_UPDSTART_SEMCTL == repl_errno || EREPL_UPDSTART_BADPATH == repl_errno) { gtmrecv_autoshutdown(); } else if (EREPL_UPDSTART_FORK == repl_errno) { /* Couldn't start up update now, can try later */ LONG_SLEEP(GTMRECV_WAIT_FOR_PROC_SLOTS); continue; } else if (EREPL_UPDSTART_EXEC == repl_errno) { /* In forked child, could not exec, should exit */ gtmrecv_exit(ABNORMAL_SHUTDOWN); } } if (UPDPROC_STARTED == (upd_proc_local->start_upd = upd_start_status)) { REPL_DPRINT1("gtmrecv_poll_actions : Setting gtmrecv_wait_for_jnl_seqno to TRUE because of " "upd restart\n"); gtmrecv_wait_for_jnl_seqno = TRUE; report_cnt = next_report_at = 1; if (send_xoff && (FD_INVALID == gtmrecv_sock_fd)) { /* Update start command was issued before connection was established, * no point in sending XOFF. */ send_xoff = FALSE; } } else { repl_log(gtmrecv_log_fp, TRUE, TRUE, "%d failed attempts to fork update process. Try later\n", upd_start_attempts); } } else { gtmrecv_wait_for_jnl_seqno = TRUE;/* set this to TRUE to break out and go back to a fresh "do_main_loop" */ if (onln_rlbk_flg_set) { assert(NULL != jnlpool_ctl); repl_log(gtmrecv_log_fp, TRUE, TRUE, "Closing connection due to ONLINE ROLLBACK\n"); repl_log(gtmrecv_log_fp, TRUE, TRUE, "REPL INFO - Current Jnlpool Seqno : %llu\n", jnlpool_ctl->jnl_seqno); repl_log(gtmrecv_log_fp, TRUE, TRUE, "REPL INFO - Current Receive Pool Seqno : %llu\n", recvpool_ctl->jnl_seqno); repl_close(&gtmrecv_sock_fd); repl_connection_reset = TRUE; xoff_sent = FALSE; send_badtrans = FALSE; upd_proc_local->onln_rlbk_flg = FALSE; /* Before restarting afresh, sync the online rollback cycles. This way any future grab_lock that * we do after restarting should not realize an unhandled online rollback. For receiver, it is * just syncing the journal pool cycles as the databases are not opened. But, to be safe, grab * the lock and sync the cycles. 
*/ grab_lock(jnlpool.jnlpool_dummy_reg, TRUE, GRAB_LOCK_ONLY); SYNC_ONLN_RLBK_CYCLES; rel_lock(jnlpool.jnlpool_dummy_reg); return_status = STOP_POLL; recvpool_ctl->jnl_seqno = 0; } else { REPL_DPRINT1("gtmrecv_poll_actions : Setting gtmrecv_wait_for_jnl_seqno to TRUE because bad trans" "sent\n"); gtmrecv_bad_trans_sent = TRUE; upd_proc_local->bad_trans = FALSE; recvpool_ctl->jnl_seqno = upd_proc_local->read_jnl_seqno; } } } if ((0 == *pending_data_len) && (0 != gtmrecv_local->changelog)) { if (gtmrecv_local->changelog & REPLIC_CHANGE_LOGINTERVAL) { repl_log(gtmrecv_log_fp, TRUE, TRUE, "Changing log interval from %u to %u\n", log_interval, gtmrecv_local->log_interval); log_interval = gtmrecv_local->log_interval; gtmrecv_reinit_logseqno(); /* will force a LOG on the first recv following the interval change */ } if (gtmrecv_local->changelog & REPLIC_CHANGE_LOGFILE) { repl_log(gtmrecv_log_fp, TRUE, TRUE, "Changing log file to %s\n", gtmrecv_local->log_file); repl_log_init(REPL_GENERAL_LOG, &gtmrecv_log_fd, gtmrecv_local->log_file); repl_log_fd2fp(&gtmrecv_log_fp, gtmrecv_log_fd); repl_log(gtmrecv_log_fp, TRUE, TRUE, "Change log to %s successful\n",gtmrecv_local->log_file); } /* NOTE: update process and receiver each ignore any setting specific to the other (REPLIC_CHANGE_UPD_LOGINTERVAL, * REPLIC_CHANGE_LOGINTERVAL) */ if (REPLIC_CHANGE_LOGINTERVAL == gtmrecv_local->changelog) upd_proc_local->changelog = 0; else upd_proc_local->changelog = gtmrecv_local->changelog; /* Pass changelog request to the update process */ gtmrecv_local->changelog = 0; } if (0 == *pending_data_len && !gtmrecv_logstats && gtmrecv_local->statslog) { gtmrecv_logstats = TRUE; repl_log(gtmrecv_log_fp, TRUE, TRUE, "Begin statistics logging\n"); } else if (0 == *pending_data_len && gtmrecv_logstats && !gtmrecv_local->statslog) { gtmrecv_logstats = FALSE; /* Force all data out to the file before closing the file */ repl_log(gtmrecv_log_fp, TRUE, TRUE, "End statistics logging\n"); } if (0 == *pending_data_len) { if (upd_helper_ctl->start_helpers) { gtmrecv_helpers_init(upd_helper_ctl->start_n_readers, upd_helper_ctl->start_n_writers); upd_helper_ctl->start_helpers = FALSE; } if (HELPER_REAP_NONE != (status = upd_helper_ctl->reap_helpers) || (double)GTMRECV_REAP_HELPERS_INTERVAL <= difftime(gtmrecv_now, last_reap_time)) { gtmrecv_reap_helpers(HELPER_REAP_WAIT == status); last_reap_time = gtmrecv_now; } } return (return_status); }
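/* Illustrative sketch (not part of the GT.M sources): how the drain loop above steps over buffered
 * replication messages -- read a fixed-size header, byte-swap the fields when the peer has the
 * opposite endianness, and round the length up to the message alignment before skipping the body.
 * The header layout and constants are simplified stand-ins for repl_msg_t/REPL_MSG_ALIGN.
 */
#include <stdint.h>
#include <stddef.h>
#include <string.h>

#define MSG_ALIGN_SKETCH	8				/* must be a power of two */
#define ROUND_UP_SKETCH(x, a)	(((x) + (a) - 1) & ~(uint32_t)((a) - 1))

struct msg_hdr_sketch { uint32_t type; uint32_t len; };

static uint32_t bswap32_sketch(uint32_t v)
{
	return (v >> 24) | ((v >> 8) & 0xff00u) | ((v << 8) & 0xff0000u) | (v << 24);
}

/* Returns the aligned number of bytes this message (header + body) occupies in the buffer,
 * or 0 if a complete header is not yet available. */
static size_t msg_skip_len_sketch(const unsigned char *buf, size_t avail, int cross_endian)
{
	struct msg_hdr_sketch hdr;
	uint32_t len;

	if (avail < sizeof(hdr))
		return 0;
	memcpy(&hdr, buf, sizeof(hdr));			/* avoid unaligned/aliasing access */
	len = cross_endian ? bswap32_sketch(hdr.len) : hdr.len;
	return ROUND_UP_SKETCH(len, MSG_ALIGN_SKETCH);
}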
/* * ------------------------------------------ * Hang the process for a specified time. * * Goes to sleep for a positive value. * Any caught signal will terminate the sleep * following the execution of that signal's catching routine. * * Arguments: * num - time to sleep * * Return: * none * ------------------------------------------ */ void op_hang(mval* num) { int ms; mv_stent *mv_zintcmd; ABS_TIME cur_time, end_time; # ifdef VMS uint4 time[2]; int4 efn_mask, status; # endif DCL_THREADGBL_ACCESS; SETUP_THREADGBL_ACCESS; ms = 0; MV_FORCE_NUM(num); if (num->mvtype & MV_INT) { if (0 < num->m[1]) { assert(MV_BIAS >= 1000); /* if formats change overflow may need attention */ ms = num->m[1] * (1000 / MV_BIAS); } } else if (0 == num->sgn) /* if sign is not 0 it means num is negative */ ms = mval2i(num) * 1000; /* too big to care about fractional amounts */ if (ms) { if (TREF(tpnotacidtime) * 1000 < ms) TPNOTACID_CHECK(HANGSTR); # if defined(DEBUG) && defined(UNIX) if (gtm_white_box_test_case_enabled && (WBTEST_DEFERRED_TIMERS == gtm_white_box_test_case_number) && (3 > gtm_white_box_test_case_count) && (123000 == ms)) { DEFER_INTERRUPTS(INTRPT_NO_TIMER_EVENTS); DBGFPF((stderr, "OP_HANG: will sleep for 20 seconds\n")); LONG_SLEEP(20); DBGFPF((stderr, "OP_HANG: done sleeping\n")); ENABLE_INTERRUPTS(INTRPT_NO_TIMER_EVENTS); return; } if (gtm_white_box_test_case_enabled && (WBTEST_BREAKMPC == gtm_white_box_test_case_number) && (0 == gtm_white_box_test_case_count) && (999 == ms)) { frame_pointer->old_frame_pointer->mpc = (unsigned char *)GTM64_ONLY(0xdeadbeef12345678) NON_GTM64_ONLY(0xdead1234); return; } /* Upon seeing a .999s hang this white-box test launches a timer that pops with a period of UTIL_OUT_SYSLOG_INTERVAL * and prints a long message via util_out_ptr. 
*/ if (gtm_white_box_test_case_enabled && (WBTEST_UTIL_OUT_BUFFER_PROTECTION == gtm_white_box_test_case_number) && (0 == gtm_white_box_test_case_count) && (999 == ms)) { start_timer((TID)&util_out_syslog_dump, UTIL_OUT_SYSLOG_INTERVAL, util_out_syslog_dump, 0, NULL); return; } # endif sys_get_curr_time(&cur_time); mv_zintcmd = find_mvstent_cmd(ZINTCMD_HANG, restart_pc, restart_ctxt, FALSE); if (!mv_zintcmd) add_int_to_abs_time(&cur_time, ms, &end_time); else { end_time = mv_zintcmd->mv_st_cont.mvs_zintcmd.end_or_remain; cur_time = sub_abs_time(&end_time, &cur_time); /* get remaing time to sleep */ if (0 <= cur_time.at_sec) ms = (int4)(cur_time.at_sec * 1000 + cur_time.at_usec / 1000); else ms = 0; /* all done */ /* restore/pop previous zintcmd_active[ZINTCMD_HANG] hints */ TAREF1(zintcmd_active, ZINTCMD_HANG).restart_pc_last = mv_zintcmd->mv_st_cont.mvs_zintcmd.restart_pc_prior; TAREF1(zintcmd_active, ZINTCMD_HANG).restart_ctxt_last = mv_zintcmd->mv_st_cont.mvs_zintcmd.restart_ctxt_prior; TAREF1(zintcmd_active, ZINTCMD_HANG).count--; assert(0 <= TAREF1(zintcmd_active, ZINTCMD_HANG).count); if (mv_chain == mv_zintcmd) POP_MV_STENT(); /* just pop if top of stack */ else { /* flag as not active */ mv_zintcmd->mv_st_cont.mvs_zintcmd.command = ZINTCMD_NOOP; mv_zintcmd->mv_st_cont.mvs_zintcmd.restart_pc_check = NULL; } if (0 == ms) return; /* done HANGing */ } UNIX_ONLY(hiber_start(ms);) VMS_ONLY( time[0] = -time_low_ms(ms); time[1] = -time_high_ms(ms) - 1; efn_mask = (1 << efn_outofband | 1 << efn_timer); if (SS$_NORMAL != (status = sys$setimr(efn_timer, &time, NULL, &time, 0))) rts_error(VARLSTCNT(8) ERR_SYSCALL, 5, RTS_ERROR_LITERAL("$setimr"), CALLFROM, status); if (SS$_NORMAL != (status = sys$wflor(efn_outofband, efn_mask))) rts_error(VARLSTCNT(8) ERR_SYSCALL, 5, RTS_ERROR_LITERAL("$wflor"), CALLFROM, status); ) if (outofband)
/* * ------------------------------------------ * Hang the process for a specified time. * * Goes to sleep for a positive value. * Any caught signal will terminate the sleep * following the execution of that signal's catching routine. * * The actual hang duration should be NO LESS than the specified * duration for specified durations greater than .001 seconds. * Certain applications depend on this assumption. * * Arguments: * num - time to sleep * * Return: * none * ------------------------------------------ */ void op_hang(mval* num) { int ms; double tmp; mv_stent *mv_zintcmd; ABS_TIME cur_time, end_time; # ifdef VMS uint4 time[2]; int4 efn_mask, status; # endif DCL_THREADGBL_ACCESS; SETUP_THREADGBL_ACCESS; ms = 0; MV_FORCE_NUM(num); if (num->mvtype & MV_INT) { if (0 < num->m[1]) { assert(MV_BIAS >= 1000); /* if formats change overflow may need attention */ ms = num->m[1] * (1000 / MV_BIAS); } } else if (0 == num->sgn) /* if sign is not 0 it means num is negative */ { tmp = mval2double(num) * (double)1000; ms = ((double)MAXPOSINT4 >= tmp) ? (int)tmp : (int)MAXPOSINT4; } if (ms) { if (TREF(tpnotacidtime) * 1000 < ms) TPNOTACID_CHECK(HANGSTR); # if defined(DEBUG) && defined(UNIX) if (WBTEST_ENABLED(WBTEST_DEFERRED_TIMERS) && (3 > gtm_white_box_test_case_count) && (123000 == ms)) { DEFER_INTERRUPTS(INTRPT_NO_TIMER_EVENTS); DBGFPF((stderr, "OP_HANG: will sleep for 20 seconds\n")); LONG_SLEEP(20); DBGFPF((stderr, "OP_HANG: done sleeping\n")); ENABLE_INTERRUPTS(INTRPT_NO_TIMER_EVENTS); return; } if (WBTEST_ENABLED(WBTEST_BREAKMPC)&& (0 == gtm_white_box_test_case_count) && (999 == ms)) { frame_pointer->old_frame_pointer->mpc = (unsigned char *)GTM64_ONLY(0xdeadbeef12345678) NON_GTM64_ONLY(0xdead1234); return; } if (WBTEST_ENABLED(WBTEST_UTIL_OUT_BUFFER_PROTECTION) && (0 == gtm_white_box_test_case_count) && (999 == ms)) { /* Upon seeing a .999s hang this white-box test launches a timer that pops with a period of * UTIL_OUT_SYSLOG_INTERVAL and prints a long message via util_out_ptr. */ start_timer((TID)&util_out_syslog_dump, UTIL_OUT_SYSLOG_INTERVAL, util_out_syslog_dump, 0, NULL); return; } # endif sys_get_curr_time(&cur_time); mv_zintcmd = find_mvstent_cmd(ZINTCMD_HANG, restart_pc, restart_ctxt, FALSE); if (!mv_zintcmd) add_int_to_abs_time(&cur_time, ms, &end_time); else { end_time = mv_zintcmd->mv_st_cont.mvs_zintcmd.end_or_remain; cur_time = sub_abs_time(&end_time, &cur_time); /* get remaing time to sleep */ if (0 <= cur_time.at_sec) ms = (int4)(cur_time.at_sec * 1000 + cur_time.at_usec / 1000); else ms = 0; /* all done */ /* restore/pop previous zintcmd_active[ZINTCMD_HANG] hints */ TAREF1(zintcmd_active, ZINTCMD_HANG).restart_pc_last = mv_zintcmd->mv_st_cont.mvs_zintcmd.restart_pc_prior; TAREF1(zintcmd_active, ZINTCMD_HANG).restart_ctxt_last = mv_zintcmd->mv_st_cont.mvs_zintcmd.restart_ctxt_prior; TAREF1(zintcmd_active, ZINTCMD_HANG).count--; assert(0 <= TAREF1(zintcmd_active, ZINTCMD_HANG).count); if (mv_chain == mv_zintcmd) POP_MV_STENT(); /* just pop if top of stack */ else { /* flag as not active */ mv_zintcmd->mv_st_cont.mvs_zintcmd.command = ZINTCMD_NOOP; mv_zintcmd->mv_st_cont.mvs_zintcmd.restart_pc_check = NULL; } if (0 == ms) return; /* done HANGing */ } # ifdef UNIX if (ms < 10) SLEEP_USEC(ms * 1000, TRUE); /* Finish the sleep if it is less than 10ms. 
*/ else hiber_start(ms); # elif defined(VMS) time[0] = -time_low_ms(ms); time[1] = -time_high_ms(ms) - 1; efn_mask = (1 << efn_outofband | 1 << efn_timer); if (SS$_NORMAL != (status = sys$setimr(efn_timer, &time, NULL, &time, 0))) rts_error_csa(CSA_ARG(NULL) VARLSTCNT(8) ERR_SYSCALL, 5, RTS_ERROR_LITERAL("$setimr"), CALLFROM, status); if (SS$_NORMAL != (status = sys$wflor(efn_outofband, efn_mask))) rts_error_csa(CSA_ARG(NULL) VARLSTCNT(8) ERR_SYSCALL, 5, RTS_ERROR_LITERAL("$wflor"), CALLFROM, status); if (outofband) { if (SS$_WASCLR == (status = sys$readef(efn_timer, &efn_mask))) { if (SS$_NORMAL != (status = sys$cantim(&time, 0))) rts_error_csa(CSA_ARG(NULL) VARLSTCNT(8) ERR_SYSCALL, 5, RTS_ERROR_LITERAL("$cantim"), CALLFROM, status); } else assertpro(SS$_WASSET == status); } # endif } else rel_quant(); if (outofband) { PUSH_MV_STENT(MVST_ZINTCMD); mv_chain->mv_st_cont.mvs_zintcmd.end_or_remain = end_time; mv_chain->mv_st_cont.mvs_zintcmd.restart_ctxt_check = restart_ctxt; mv_chain->mv_st_cont.mvs_zintcmd.restart_pc_check = restart_pc; /* save current information from zintcmd_active */ mv_chain->mv_st_cont.mvs_zintcmd.restart_ctxt_prior = TAREF1(zintcmd_active, ZINTCMD_HANG).restart_ctxt_last; mv_chain->mv_st_cont.mvs_zintcmd.restart_pc_prior = TAREF1(zintcmd_active, ZINTCMD_HANG).restart_pc_last; TAREF1(zintcmd_active, ZINTCMD_HANG).restart_pc_last = restart_pc; TAREF1(zintcmd_active, ZINTCMD_HANG).restart_ctxt_last = restart_ctxt; TAREF1(zintcmd_active, ZINTCMD_HANG).count++; mv_chain->mv_st_cont.mvs_zintcmd.command = ZINTCMD_HANG; outofband_action(FALSE); } return; }
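/* Illustrative sketch (not part of the GT.M sources): the millisecond conversion op_hang() performs
 * above.  Small HANG arguments arrive as MV_BIAS fixed-point integers (1.5s -> 1500 when MV_BIAS is
 * 1000); larger ones arrive as doubles and must be capped so the int does not overflow.  MV_BIAS_EX
 * and MAXPOSINT4_EX are local stand-ins for the real macros.
 */
#include <stdio.h>

#define MV_BIAS_EX	1000
#define MAXPOSINT4_EX	0x7fffffff

static int hang_ms_from_scaled_int(int m1)	/* m1 = seconds scaled by MV_BIAS_EX */
{
	return (0 < m1) ? m1 * (1000 / MV_BIAS_EX) : 0;
}

static int hang_ms_from_double(double seconds)
{
	double tmp = seconds * 1000.0;

	return ((double)MAXPOSINT4_EX >= tmp) ? (int)tmp : (int)MAXPOSINT4_EX;
}

int main(void)
{
	printf("HANG 1.5     -> %d ms\n", hang_ms_from_scaled_int(1500));
	printf("HANG 86400   -> %d ms\n", hang_ms_from_double(86400.0));
	printf("HANG 9999999 -> %d ms (capped)\n", hang_ms_from_double(9999999.0));
	return 0;
}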
int gtmsource_changelog(void) { uint4 changelog_accepted = 0; int log_fd = 0; /*used to indicate whether the new specified log file is writable*/ int close_status = 0; /*used to indicate if log file is successfully closed*/ char* err_code; int save_errno = 0; int retry_count = 5; assert(holds_sem[SOURCE][JNL_POOL_ACCESS_SEM]); repl_log(stderr, TRUE, TRUE, "Initiating CHANGELOG operation on source server pid [%d] for secondary instance [%s]\n", jnlpool.gtmsource_local->gtmsource_pid, jnlpool.gtmsource_local->secondary_instname); if (0 != jnlpool.gtmsource_local->changelog) { retry_count = 5; while (0 != retry_count--) { LONG_SLEEP(5); if (!jnlpool.gtmsource_local->changelog) break; } } if (0 != jnlpool.gtmsource_local->changelog) { util_out_print("Change log is already in progress. Not initiating change in log file or log interval", TRUE); return (ABNORMAL_SHUTDOWN); } if ('\0' != gtmsource_options.log_file[0]) /* trigger change in log file */ { if (0 != STRCMP(jnlpool.gtmsource_local->log_file, gtmsource_options.log_file)) { /*check if the new log file is writable*/ OPENFILE3_CLOEXEC(gtmsource_options.log_file, O_RDWR | O_CREAT | O_APPEND, S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH, log_fd); if (log_fd < 0) { save_errno = ERRNO; err_code = STRERROR(save_errno); if ('\0' != jnlpool.gtmsource_local->log_file[0]) gtm_putmsg_csa(CSA_ARG(NULL) VARLSTCNT(8) ERR_REPLLOGOPN, 6, LEN_AND_STR(gtmsource_options.log_file), LEN_AND_STR(err_code), LEN_AND_STR(jnlpool.gtmsource_local->log_file)); else gtm_putmsg_csa(CSA_ARG(NULL) VARLSTCNT(8) ERR_REPLLOGOPN, 6, LEN_AND_STR(gtmsource_options.log_file), LEN_AND_STR(err_code), LEN_AND_STR(NULL_DEVICE)); } else { CLOSEFILE_IF_OPEN(log_fd, close_status); assert(close_status==0); changelog_accepted |= REPLIC_CHANGE_LOGFILE; STRCPY(jnlpool.gtmsource_local->log_file, gtmsource_options.log_file); util_out_print("Change log initiated with file !AD", TRUE, LEN_AND_STR(gtmsource_options.log_file)); } } else util_out_print("Log file is already !AD. Not initiating change in log file", TRUE, LEN_AND_STR(gtmsource_options.log_file)); } if (0 != gtmsource_options.src_log_interval) /* trigger change in log interval */ { if (gtmsource_options.src_log_interval != jnlpool.gtmsource_local->log_interval) { changelog_accepted |= REPLIC_CHANGE_LOGINTERVAL; jnlpool.gtmsource_local->log_interval = gtmsource_options.src_log_interval; gtm_putmsg_csa(CSA_ARG(NULL) VARLSTCNT(7) ERR_CHANGELOGINTERVAL, 5, LEN_AND_LIT("Source"), LEN_AND_STR(jnlpool.gtmsource_local->log_file), gtmsource_options.src_log_interval); } else util_out_print("Log interval is already !UL. Not initiating change in log interval", TRUE, gtmsource_options.src_log_interval); } if (0 != changelog_accepted) jnlpool.gtmsource_local->changelog = changelog_accepted; else util_out_print("No change to log file or log interval", TRUE); return (0 != save_errno) ? ABNORMAL_SHUTDOWN : NORMAL_SHUTDOWN; }
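/* Illustrative sketch (not part of the GT.M sources): the writability probe gtmsource_changelog()
 * runs before accepting a new log file name -- open it append/create and, if that succeeds, close
 * it and accept the name.  Plain open()/close() replace the OPENFILE3_CLOEXEC/CLOSEFILE_IF_OPEN
 * macros used above.
 */
#include <fcntl.h>
#include <unistd.h>
#include <errno.h>

/* Returns 0 if path can be opened for appending (creating it if needed), otherwise errno. */
static int probe_log_file_sketch(const char *path)
{
	int fd = open(path, O_RDWR | O_CREAT | O_APPEND, 0666);

	if (0 > fd)
		return errno;
	(void)close(fd);
	return 0;
}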
int main(int argc, char *argv[]) { DCL_THREADGBL_ACCESS; GTM_THREADGBL_INIT; common_startup_init(DSE_IMAGE); licensed = TRUE; TREF(transform) = TRUE; TREF(no_spangbls) = TRUE; /* dse operates on a per-region basis irrespective of global mapping in gld */ TREF(skip_file_corrupt_check) = TRUE; /* do not let csd->file_corrupt flag cause errors in dse */ op_open_ptr = op_open; patch_curr_blk = get_dir_root(); err_init(util_base_ch); UNICODE_ONLY(gtm_strToTitle_ptr = &gtm_strToTitle); GTM_ICU_INIT_IF_NEEDED; /* Note: should be invoked after err_init (since it may error out) and before CLI parsing */ sig_init(generic_signal_handler, dse_ctrlc_handler, suspsigs_handler, continue_handler); atexit(util_exit_handler); SET_LATCH_GLOBAL(&defer_latch, LOCK_AVAILABLE); stp_init(STP_INITSIZE); rts_stringpool = stringpool; getjobname(); INVOKE_INIT_SECSHR_ADDRS; io_init(TRUE); getzdir(); gtm_chk_dist(argv[0]); prealloc_gt_timers(); gt_timers_add_safe_hndlrs(); initialize_pattern_table(); gvinit(); region_init(FALSE); util_out_print("!/File !_!AD", TRUE, DB_LEN_STR(gv_cur_region)); util_out_print("Region!_!AD!/", TRUE, REG_LEN_STR(gv_cur_region)); cli_lex_setup(argc, argv); /* Since DSE operates on a region-by-region basis (for the most part), do not use a global directory at all from now on */ original_header = gd_header; gd_header = NULL; OPERATOR_LOG_MSG; # ifdef DEBUG if ((gtm_white_box_test_case_enabled && (WBTEST_SEMTOOLONG_STACK_TRACE == gtm_white_box_test_case_number) )) { sgmnt_addrs * csa; node_local_ptr_t cnl; csa = &FILE_INFO(gv_cur_region)->s_addrs; cnl = csa->nl; cnl->wbox_test_seq_num = 1; /*Signal the first step and wait here*/ /* The signal to the shell. MUPIP must not start BEFORE DSE */ util_out_print("DSE is ready. MUPIP can start. Note: This message is a part of WBTEST_SEMTOOLONG_STACK_TRACE test. " "It will not appear in PRO version.", TRUE); while (2 != cnl->wbox_test_seq_num) /*Wait for another process to get hold of the semaphore and signal next step*/ LONG_SLEEP(1); } # endif if (argc < 2) display_prompt(); while (1) { if (!dse_process(argc)) break; display_prompt(); } dse_exit(); REVERT; return 0; }
int gtmsource() { int status, log_init_status, waitpid_res, save_errno; char print_msg[1024], tmpmsg[1024]; gd_region *reg, *region_top; sgmnt_addrs *csa, *repl_csa; boolean_t all_files_open, isalive; pid_t pid, ppid, procgp; seq_num read_jnl_seqno, jnl_seqno; unix_db_info *udi; gtmsource_local_ptr_t gtmsource_local; boolean_t this_side_std_null_coll; int null_fd, rc; memset((uchar_ptr_t)&jnlpool, 0, SIZEOF(jnlpool_addrs)); call_on_signal = gtmsource_sigstop; ESTABLISH_RET(gtmsource_ch, SS_NORMAL); if (-1 == gtmsource_get_opt()) rts_error_csa(CSA_ARG(NULL) VARLSTCNT(1) ERR_MUPCLIERR); if (gtmsource_options.shut_down) { /* Wait till shutdown time nears even before going to "jnlpool_init". This is because the latter will return * with the ftok semaphore and access semaphore held and we do not want to be holding those locks (while * waiting for the user specified timeout to expire) as that will affect new GTM processes and/or other * MUPIP REPLIC commands that need these locks for their function. */ if (0 < gtmsource_options.shutdown_time) { repl_log(stdout, TRUE, TRUE, "Waiting for %d seconds before signalling shutdown\n", gtmsource_options.shutdown_time); LONG_SLEEP(gtmsource_options.shutdown_time); } else repl_log(stdout, TRUE, TRUE, "Signalling shutdown immediate\n"); } else if (gtmsource_options.start) { repl_log(stdout, TRUE, TRUE, "Initiating START of source server for secondary instance [%s]\n", gtmsource_options.secondary_instname); } if (gtmsource_options.activate && (ROOTPRIMARY_SPECIFIED == gtmsource_options.rootprimary)) { /* MUPIP REPLIC -SOURCE -ACTIVATE -UPDOK has been specified. We need to open the gld and db regions now * in case this is a secondary -> primary transition. This is so we can later switch journal files in all * journaled regions when the transition actually happens inside "gtmsource_rootprimary_init". But since * we have not yet done a "jnlpool_init", we dont know if updates are disabled in it or not. Although we * need to do the gld/db open only if updates are currently disabled in the jnlpool, we do this always * because once we do a jnlpool_init, we will come back with the ftok on the jnlpool held and that has * issues with later db open since we will try to hold the db ftok as part of db open and the ftok logic * currently has assumptions that a process holds only one ftok at any point in time. */ assert(NULL == gd_header); gvinit(); all_files_open = region_init(FALSE); if (!all_files_open) { gtm_putmsg_csa(CSA_ARG(NULL) VARLSTCNT(1) ERR_NOTALLDBOPN); gtmsource_exit(ABNORMAL_SHUTDOWN); } } jnlpool_init(GTMSOURCE, gtmsource_options.start, &is_jnlpool_creator); /* is_jnlpool_creator == TRUE ==> this process created the journal pool * is_jnlpool_creator == FALSE ==> journal pool already existed and this process simply attached to it. 
*/ if (gtmsource_options.shut_down) gtmsource_exit(gtmsource_shutdown(FALSE, NORMAL_SHUTDOWN) - NORMAL_SHUTDOWN); else if (gtmsource_options.activate) gtmsource_exit(gtmsource_mode_change(GTMSOURCE_MODE_ACTIVE_REQUESTED) - NORMAL_SHUTDOWN); else if (gtmsource_options.deactivate) gtmsource_exit(gtmsource_mode_change(GTMSOURCE_MODE_PASSIVE_REQUESTED) - NORMAL_SHUTDOWN); else if (gtmsource_options.checkhealth) gtmsource_exit(gtmsource_checkhealth() - NORMAL_SHUTDOWN); else if (gtmsource_options.changelog) gtmsource_exit(gtmsource_changelog() - NORMAL_SHUTDOWN); else if (gtmsource_options.showbacklog) gtmsource_exit(gtmsource_showbacklog() - NORMAL_SHUTDOWN); else if (gtmsource_options.stopsourcefilter) gtmsource_exit(gtmsource_stopfilter() - NORMAL_SHUTDOWN); else if (gtmsource_options.jnlpool) gtmsource_exit(gtmsource_jnlpool() - NORMAL_SHUTDOWN); else if (gtmsource_options.losttncomplete) gtmsource_exit(gtmsource_losttncomplete() - NORMAL_SHUTDOWN); else if (gtmsource_options.needrestart) gtmsource_exit(gtmsource_needrestart() - NORMAL_SHUTDOWN); else if (gtmsource_options.showfreeze) gtmsource_exit(gtmsource_showfreeze() - NORMAL_SHUTDOWN); else if (gtmsource_options.setfreeze) gtmsource_exit(gtmsource_setfreeze() - NORMAL_SHUTDOWN); else if (!gtmsource_options.start) { assert(CLI_PRESENT == cli_present("STATSLOG")); gtmsource_exit(gtmsource_statslog() - NORMAL_SHUTDOWN); } assert(gtmsource_options.start); # ifndef REPL_DEBUG_NOBACKGROUND /* Set "child_server_running" to FALSE before forking off child. Wait for it to be set to TRUE by the child. */ gtmsource_local = jnlpool.gtmsource_local; gtmsource_local->child_server_running = FALSE; FORK(pid); if (0 > pid) { save_errno = errno; rts_error_csa(CSA_ARG(NULL) VARLSTCNT(7) ERR_JNLPOOLSETUP, 0, ERR_TEXT, 2, RTS_ERROR_LITERAL("Could not fork source server"), save_errno); } else if (0 < pid) { /* Parent. Wait until child sets "child_server_running" to TRUE. That is an indication that the child * source server has completed its initialization phase and is all set so the parent command can return. */ while (isalive = is_proc_alive(pid, 0)) /* note : intended assignment */ { if (gtmsource_local->child_server_running) break; /* To take care of reassignment of PIDs, the while condition should be && with the condition * (PPID of pid == process_id) */ SHORT_SLEEP(GTMSOURCE_WAIT_FOR_SRV_START); WAITPID(pid, &status, WNOHANG, waitpid_res); /* Release defunct child if dead */ } if (isalive) { /* Child process is alive and started with no issues */ if (0 != (save_errno = rel_sem(SOURCE, JNL_POOL_ACCESS_SEM))) rts_error_csa(CSA_ARG(NULL) VARLSTCNT(7) ERR_JNLPOOLSETUP, 0, ERR_TEXT, 2, RTS_ERROR_LITERAL("Error in rel_sem"), save_errno); ftok_sem_release(jnlpool.jnlpool_dummy_reg, TRUE, TRUE); } else { /* Child source server process errored out at startup and is no longer alive. * If we were the one who created the journal pool, let us clean it up. */ repl_log(stdout, TRUE, TRUE, "Source server startup failed. See source server log file\n"); if (is_jnlpool_creator) status = gtmsource_shutdown(TRUE, NORMAL_SHUTDOWN); } /* If the parent is killed (or crashes) between the fork and exit, checkhealth may not detect that startup * is in progress - parent forks and dies, the system will release sem 0 and 1, checkhealth might test the * value of sem 1 before the child grabs sem 1. */ gtmsource_exit(isalive ?
SRV_ALIVE : SRV_ERR); } /* Point stdin to /dev/null */ OPENFILE("/dev/null", O_RDONLY, null_fd); if (0 > null_fd) rts_error_csa(CSA_ARG(NULL) ERR_REPLERR, RTS_ERROR_LITERAL("Failed to open /dev/null for read"), errno, 0); FCNTL3(null_fd, F_DUPFD, 0, rc); if (0 > rc) rts_error_csa(CSA_ARG(NULL) ERR_REPLERR, RTS_ERROR_LITERAL("Failed to set stdin to /dev/null"), errno, 0); CLOSEFILE(null_fd, rc); if (0 > rc) rts_error_csa(CSA_ARG(NULL) ERR_REPLERR, RTS_ERROR_LITERAL("Failed to close /dev/null"), errno, 0); /* The parent process (source server startup command) will be holding the ftok semaphore and jnlpool access semaphore * at this point. The variables that indicate this would have been copied over to the child during the fork. This will * make the child think it is actually holding them as well when actually it is not. Reset those variables in the child * to ensure they do not misrepresent the holder of those semaphores. */ ftok_sem_reg = NULL; udi = FILE_INFO(jnlpool.jnlpool_dummy_reg); assert(udi->grabbed_ftok_sem); udi->grabbed_ftok_sem = FALSE; assert(holds_sem[SOURCE][JNL_POOL_ACCESS_SEM]); holds_sem[SOURCE][JNL_POOL_ACCESS_SEM] = FALSE; assert(!holds_sem[SOURCE][SRC_SERV_COUNT_SEM]); /* Start child source server initialization */ is_src_server = TRUE; OPERATOR_LOG_MSG; process_id = getpid(); /* Reinvoke secshr related initialization with the child's pid */ INVOKE_INIT_SECSHR_ADDRS; /* Initialize mutex socket, memory semaphore etc. before any "grab_lock" is done by this process on the journal pool. * Note that the initialization would already have been done by the parent source server startup command but we need to * redo the initialization with the child process id. */ assert(mutex_per_process_init_pid && (mutex_per_process_init_pid != process_id)); mutex_per_process_init(); START_HEARTBEAT_IF_NEEDED; ppid = getppid(); log_init_status = repl_log_init(REPL_GENERAL_LOG, &gtmsource_log_fd, gtmsource_options.log_file); assert(SS_NORMAL == log_init_status); repl_log_fd2fp(&gtmsource_log_fp, gtmsource_log_fd); if (-1 == (procgp = setsid())) send_msg_csa(CSA_ARG(NULL) VARLSTCNT(7) ERR_JNLPOOLSETUP, 0, ERR_TEXT, 2, RTS_ERROR_LITERAL("Source server error in setsid"), errno); # endif /* REPL_DEBUG_NOBACKGROUND */ if (ZLIB_CMPLVL_NONE != gtm_zlib_cmp_level) gtm_zlib_init(); /* Open zlib shared library for compression/decompression */ REPL_DPRINT1("Setting up regions\n"); gvinit(); /* We use the same code dse uses to open all regions but we must make sure they are all open before proceeding.
*/ all_files_open = region_init(FALSE); if (!all_files_open) { gtm_putmsg_csa(CSA_ARG(NULL) VARLSTCNT(1) ERR_NOTALLDBOPN); gtmsource_exit(ABNORMAL_SHUTDOWN); } /* Determine primary side null subscripts collation order */ /* Also check whether all regions have same null collation order */ this_side_std_null_coll = -1; for (reg = gd_header->regions, region_top = gd_header->regions + gd_header->n_regions; reg < region_top; reg++) { csa = &FILE_INFO(reg)->s_addrs; if (this_side_std_null_coll != csa->hdr->std_null_coll) { if (-1 == this_side_std_null_coll) this_side_std_null_coll = csa->hdr->std_null_coll; else { gtm_putmsg_csa(CSA_ARG(NULL) VARLSTCNT(1) ERR_NULLCOLLDIFF); gtmsource_exit(ABNORMAL_SHUTDOWN); } } if (!REPL_ALLOWED(csa) && JNL_ALLOWED(csa)) { gtm_putmsg_csa(CSA_ARG(NULL) VARLSTCNT(4) ERR_REPLOFFJNLON, 2, DB_LEN_STR(reg)); gtmsource_exit(ABNORMAL_SHUTDOWN); } if (reg->read_only && REPL_ALLOWED(csa)) { gtm_putmsg_csa(CSA_ARG(NULL) VARLSTCNT(6) ERR_JNLPOOLSETUP, 0, ERR_TEXT, 2, RTS_ERROR_LITERAL("Source Server does not have write permissions to one or " "more database files that are replicated")); gtmsource_exit(ABNORMAL_SHUTDOWN); } } /* Initialize source server alive/dead state related fields in "gtmsource_local" before the ftok semaphore is released */ gtmsource_local->gtmsource_pid = process_id; gtmsource_local->gtmsource_state = GTMSOURCE_START; if (is_jnlpool_creator) { DEBUG_ONLY(jnlpool.jnlpool_ctl->jnlpool_creator_pid = process_id); gtmsource_seqno_init(this_side_std_null_coll); if (ROOTPRIMARY_SPECIFIED == gtmsource_options.rootprimary) { /* Created the journal pool as a root primary. Append a history record to the replication instance file. * Invoke the function "gtmsource_rootprimary_init" to do that. */ gtmsource_rootprimary_init(jnlpool.jnlpool_ctl->jnl_seqno); } } /* after this point we can no longer have the case where all the regions are unreplicated/non-journaled. */ # ifndef REPL_DEBUG_NOBACKGROUND /* It is necessary for every process that is using the ftok semaphore to increment the counter by 1. This is used * by the last process that shuts down to delete the ftok semaphore when it notices the counter to be 0. * Note that the parent source server startup command would have done an increment of the ftok counter semaphore * for the replication instance file. But the source server process (the child) that comes here would not have done * that. Do that while the parent is still holding on to the ftok semaphore waiting for our okay. 
*/ if (!ftok_sem_incrcnt(jnlpool.jnlpool_dummy_reg)) rts_error_csa(CSA_ARG(NULL) VARLSTCNT(1) ERR_JNLPOOLSETUP); /* Increment the source server count semaphore */ status = incr_sem(SOURCE, SRC_SERV_COUNT_SEM); if (0 != status) { save_errno = errno; rts_error_csa(CSA_ARG(NULL) VARLSTCNT(7) ERR_JNLPOOLSETUP, 0, ERR_TEXT, 2, RTS_ERROR_LITERAL("Counter semaphore increment failure in child source server"), save_errno); } # else if (0 != (save_errno = rel_sem_immediate(SOURCE, JNL_POOL_ACCESS_SEM))) { rts_error_csa(CSA_ARG(NULL) VARLSTCNT(7) ERR_JNLPOOLSETUP, 0, ERR_TEXT, 2, RTS_ERROR_LITERAL("Error in rel_sem_immediate"), save_errno); } # endif /* REPL_DEBUG_NOBACKGROUND */ gtmsource_srv_count++; gtmsource_local->child_server_running = TRUE; /* At this point, the parent startup command will stop waiting for child */ gtm_event_log_init(); /* Log source server startup command line first */ SPRINTF(tmpmsg, "%s %s\n", cli_lex_in_ptr->argv[0], cli_lex_in_ptr->in_str); repl_log(gtmsource_log_fp, TRUE, TRUE, tmpmsg); SPRINTF(tmpmsg, "GTM Replication Source Server with Pid [%d] started for Secondary Instance [%s]", process_id, gtmsource_local->secondary_instname); sgtm_putmsg(print_msg, VARLSTCNT(4) ERR_REPLINFO, 2, LEN_AND_STR(tmpmsg)); repl_log(gtmsource_log_fp, TRUE, TRUE, print_msg); if (is_jnlpool_creator) { repl_log(gtmsource_log_fp, TRUE, TRUE, "Created jnlpool with shmid = [%d] and semid = [%d]\n", jnlpool.repl_inst_filehdr->jnlpool_shmid, jnlpool.repl_inst_filehdr->jnlpool_semid); } else repl_log(gtmsource_log_fp, TRUE, TRUE, "Attached to existing jnlpool with shmid = [%d] and semid = [%d]\n", jnlpool.repl_inst_filehdr->jnlpool_shmid, jnlpool.repl_inst_filehdr->jnlpool_semid); gtm_event_log(GTM_EVENT_LOG_ARGC, "MUPIP", "REPLINFO", print_msg); # ifdef GTM_TLS if (REPL_TLS_REQUESTED) { repl_do_tls_init(gtmsource_log_fp); assert(REPL_TLS_REQUESTED || PLAINTEXT_FALLBACK); } # endif if (jnlpool.jnlpool_ctl->freeze) { last_seen_freeze_flag = jnlpool.jnlpool_ctl->freeze; sgtm_putmsg(print_msg, VARLSTCNT(3) ERR_REPLINSTFROZEN, 1, jnlpool.repl_inst_filehdr->inst_info.this_instname); repl_log(gtmsource_log_fp, TRUE, FALSE, print_msg); sgtm_putmsg(print_msg, VARLSTCNT(3) ERR_REPLINSTFREEZECOMMENT, 1, jnlpool.jnlpool_ctl->freeze_comment); repl_log(gtmsource_log_fp, TRUE, TRUE, print_msg); } gtmsource_local->jnlfileonly = gtmsource_options.jnlfileonly; do { /* If mode is passive, go to sleep. Wakeup every now and then and check to see if I have to become active. 
*/ gtmsource_state = gtmsource_local->gtmsource_state = GTMSOURCE_START; if ((gtmsource_local->mode == GTMSOURCE_MODE_PASSIVE) && (gtmsource_local->shutdown == NO_SHUTDOWN)) { gtmsource_poll_actions(FALSE); SHORT_SLEEP(GTMSOURCE_WAIT_FOR_MODE_CHANGE); continue; } if (GTMSOURCE_MODE_PASSIVE == gtmsource_local->mode) { /* Shutdown initiated */ assert(gtmsource_local->shutdown == SHUTDOWN); sgtm_putmsg(print_msg, VARLSTCNT(4) ERR_REPLINFO, 2, RTS_ERROR_LITERAL("GTM Replication Source Server Shutdown signalled")); repl_log(gtmsource_log_fp, TRUE, TRUE, print_msg); gtm_event_log(GTM_EVENT_LOG_ARGC, "MUPIP", "REPLINFO", print_msg); break; } gtmsource_poll_actions(FALSE); if (GTMSOURCE_CHANGING_MODE == gtmsource_state) continue; if (GTMSOURCE_MODE_ACTIVE_REQUESTED == gtmsource_local->mode) gtmsource_local->mode = GTMSOURCE_MODE_ACTIVE; SPRINTF(tmpmsg, "GTM Replication Source Server now in ACTIVE mode using port %d", gtmsource_local->secondary_port); sgtm_putmsg(print_msg, VARLSTCNT(4) ERR_REPLINFO, 2, LEN_AND_STR(tmpmsg)); repl_log(gtmsource_log_fp, TRUE, TRUE, print_msg); gtm_event_log(GTM_EVENT_LOG_ARGC, "MUPIP", "REPLINFO", print_msg); DEBUG_ONLY(repl_csa = &FILE_INFO(jnlpool.jnlpool_dummy_reg)->s_addrs;) assert(!repl_csa->hold_onto_crit); /* so it is ok to invoke "grab_lock" and "rel_lock" unconditionally */ grab_lock(jnlpool.jnlpool_dummy_reg, TRUE, HANDLE_CONCUR_ONLINE_ROLLBACK); if (GTMSOURCE_HANDLE_ONLN_RLBK == gtmsource_state) { repl_log(gtmsource_log_fp, TRUE, TRUE, "Starting afresh due to ONLINE ROLLBACK\n"); repl_log(gtmsource_log_fp, TRUE, TRUE, "REPL INFO - Current Jnlpool Seqno : %llu\n", jnlpool.jnlpool_ctl->jnl_seqno); continue; } QWASSIGN(gtmsource_local->read_addr, jnlpool.jnlpool_ctl->write_addr); gtmsource_local->read = jnlpool.jnlpool_ctl->write; gtmsource_local->read_state = gtmsource_local->jnlfileonly ? READ_FILE : READ_POOL; read_jnl_seqno = gtmsource_local->read_jnl_seqno; assert(read_jnl_seqno <= jnlpool.jnlpool_ctl->jnl_seqno); if (read_jnl_seqno < jnlpool.jnlpool_ctl->jnl_seqno) { gtmsource_local->read_state = READ_FILE; QWASSIGN(gtmsource_save_read_jnl_seqno, jnlpool.jnlpool_ctl->jnl_seqno); gtmsource_pool2file_transition = TRUE; /* so that we read the latest gener jnl files */ } rel_lock(jnlpool.jnlpool_dummy_reg); if (SS_NORMAL != (status = gtmsource_alloc_tcombuff())) rts_error_csa(CSA_ARG(NULL) VARLSTCNT(7) ERR_REPLCOMM, 0, ERR_TEXT, 2, RTS_ERROR_LITERAL("Error allocating initial tcom buffer space. Malloc error"), status); gtmsource_filter = NO_FILTER; if ('\0' != gtmsource_local->filter_cmd[0]) { if (SS_NORMAL == (status = repl_filter_init(gtmsource_local->filter_cmd))) gtmsource_filter |= EXTERNAL_FILTER; else gtmsource_exit(ABNORMAL_SHUTDOWN); } gtmsource_process(); /* gtmsource_process returns only when mode needs to be changed to PASSIVE */ assert(gtmsource_state == GTMSOURCE_CHANGING_MODE); gtmsource_ctl_close(); gtmsource_free_msgbuff(); gtmsource_free_tcombuff(); gtmsource_free_filter_buff(); gtmsource_stop_heartbeat(); if (FD_INVALID != gtmsource_sock_fd) repl_close(&gtmsource_sock_fd); if (gtmsource_filter & EXTERNAL_FILTER) repl_stop_filter(); } while (TRUE);
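/* Illustrative sketch (not part of GT.M): the passive-mode wait loop the source server runs above.  While the
 * mode is PASSIVE and no shutdown has been signalled it just polls and sleeps; a shutdown request while passive
 * breaks out; a mode-change request falls through to the active work.  All names here (srv_mode_t,
 * demo_server_loop, poll_external_requests, ...) are hypothetical.
 */
#include <stdbool.h>
#include <unistd.h>

typedef enum { MODE_PASSIVE, MODE_ACTIVE_REQUESTED, MODE_ACTIVE } srv_mode_t;

struct srv_state
{
	volatile srv_mode_t	mode;
	volatile bool		shutdown_requested;
};

static void poll_external_requests(struct srv_state *s)
{
	/* In the real server this role is played by gtmsource_poll_actions(): it notices changelog, statslog,
	 * shutdown and mode-change requests posted into the journal pool by command processes. */
	(void)s;
}

static void demo_server_loop(struct srv_state *s)
{
	for (;;)
	{
		if ((MODE_PASSIVE == s->mode) && !s->shutdown_requested)
		{	/* nothing to do yet; keep polling for a mode change or shutdown */
			poll_external_requests(s);
			usleep(100 * 1000);
			continue;
		}
		if (MODE_PASSIVE == s->mode)
			break;				/* shutdown signalled while still passive */
		if (MODE_ACTIVE_REQUESTED == s->mode)
			s->mode = MODE_ACTIVE;
		/* ... connect to the receiver and stream journal records here ... */
		break;					/* demo: do a single pass of the active work */
	}
}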
sm_uc_ptr_t t_qread(block_id blk, sm_int_ptr_t cycle, cache_rec_ptr_ptr_t cr_out) /* cycle is used in t_end to detect if the buffer has been refreshed since the t_qread */ { int4 status; uint4 blocking_pid; cache_rec_ptr_t cr; bt_rec_ptr_t bt; boolean_t clustered, hold_onto_crit, was_crit, issued_db_init_crypt_warning, sync_needed; int dummy, lcnt, ocnt; cw_set_element *cse; off_chain chain1; register sgmnt_addrs *csa; register sgmnt_data_ptr_t csd; enum db_ver ondsk_blkver; int4 dummy_errno, gtmcrypt_errno; boolean_t already_built, is_mm, reset_first_tp_srch_status, set_wc_blocked, sleep_invoked; ht_ent_int4 *tabent; srch_blk_status *blkhist; trans_num dirty, blkhdrtn; sm_uc_ptr_t buffaddr; uint4 stuck_cnt = 0; boolean_t lcl_blk_free; node_local_ptr_t cnl; gd_segment *seg; uint4 buffs_per_flush, flush_target; enc_info_t *encr_ptr; DCL_THREADGBL_ACCESS; SETUP_THREADGBL_ACCESS; lcl_blk_free = block_is_free; block_is_free = FALSE; /* Reset to FALSE so that if t_qread fails below, we don't have an incorrect state of this var */ first_tp_srch_status = NULL; reset_first_tp_srch_status = FALSE; csa = cs_addrs; csd = csa->hdr; INCR_DB_CSH_COUNTER(csa, n_t_qreads, 1); is_mm = (dba_mm == csd->acc_meth); /* We better hold crit in the final retry (TP & non-TP). Only exception is journal recovery */ assert((t_tries < CDB_STAGNATE) || csa->now_crit || mupip_jnl_recover); if (dollar_tlevel) { assert(sgm_info_ptr); if (0 != sgm_info_ptr->cw_set_depth) { chain1 = *(off_chain *)&blk; if (1 == chain1.flag) { assert(sgm_info_ptr->cw_set_depth); if ((int)chain1.cw_index < sgm_info_ptr->cw_set_depth) tp_get_cw(sgm_info_ptr->first_cw_set, (int)chain1.cw_index, &cse); else { assert(FALSE == csa->now_crit); rdfail_detail = cdb_sc_blknumerr; return (sm_uc_ptr_t)NULL; } } else { if (NULL != (tabent = lookup_hashtab_int4(sgm_info_ptr->blks_in_use, (uint4 *)&blk))) first_tp_srch_status = tabent->value; else first_tp_srch_status = NULL; ASSERT_IS_WITHIN_TP_HIST_ARRAY_BOUNDS(first_tp_srch_status, sgm_info_ptr); cse = first_tp_srch_status ? first_tp_srch_status->cse : NULL; } assert(!cse || !cse->high_tlevel); assert(!chain1.flag || cse); if (cse) { /* transaction has modified the sought after block */ if ((gds_t_committed != cse->mode) || (n_gds_t_op < cse->old_mode)) { /* Changes have not been committed to shared memory, i.e. still in private memory. * Build block in private buffer if not already done and return the same. */ assert(gds_t_writemap != cse->mode); if (FALSE == cse->done) { /* out of date, so make it current */ assert(gds_t_committed != cse->mode); already_built = (NULL != cse->new_buff); /* Validate the block's search history right after building a private copy. * This is not needed in case gvcst_search is going to reuse the clue's search * history and return (because tp_hist will do the validation of this block). * But if gvcst_search decides to do a fresh traversal (because the clue does not * cover the path of the current input key etc.) the block build that happened now * will not get validated in tp_hist since it will instead be given the current * key's search history path (a totally new path) for validation. Since a private * copy of the block has been built, tp_tend would also skip validating this block * so it is necessary that we validate the block right here. Since it is tricky to * accurately differentiate between the two cases, we do the validation * unconditionally here (besides it is only a few if checks done per block build * so it is considered okay performance-wise). 
*/ gvcst_blk_build(cse, (uchar_ptr_t)cse->new_buff, 0); assert(NULL != cse->blk_target); if (!already_built && !chain1.flag) { buffaddr = first_tp_srch_status->buffaddr; cr = first_tp_srch_status->cr; assert((is_mm || cr) && buffaddr); blkhdrtn = ((blk_hdr_ptr_t)buffaddr)->tn; if (TP_IS_CDB_SC_BLKMOD3(cr, first_tp_srch_status, blkhdrtn)) { assert(CDB_STAGNATE > t_tries); rdfail_detail = cdb_sc_blkmod; /* should this be something else */ TP_TRACE_HIST_MOD(blk, gv_target, tp_blkmod_t_qread, cs_data, first_tp_srch_status->tn, blkhdrtn, ((blk_hdr_ptr_t)buffaddr)->levl); return (sm_uc_ptr_t)NULL; } if (!is_mm && ((first_tp_srch_status->cycle != cr->cycle) || (first_tp_srch_status->blk_num != cr->blk))) { assert(CDB_STAGNATE > t_tries); rdfail_detail = cdb_sc_lostcr; /* should this be something else */ return (sm_uc_ptr_t)NULL; } } cse->done = TRUE; } *cycle = CYCLE_PVT_COPY; *cr_out = 0; return (sm_uc_ptr_t)cse->new_buff; } else { /* Block changes are already committed to shared memory (possible if we are in TP * in the 2nd phase of M-Kill in gvcst_expand_free_subtree.c). In this case, read * block from shared memory; do not look at private memory (i.e. cse) as that might * not be as uptodate as shared memory. */ assert(csa->now_crit); /* gvcst_expand_free_subtree does t_qread in crit */ /* If this block was newly created as part of the TP transaction, it should not be killed * as part of the 2nd phase of M-kill. This is because otherwise the block's cse would * have had an old_mode of kill_t_create in which case we would not have come into this * else block. Assert accordingly. */ assert(!chain1.flag); first_tp_srch_status = NULL; /* do not use any previous srch_hist information */ } } } else { if (NULL != (tabent = lookup_hashtab_int4(sgm_info_ptr->blks_in_use, (uint4 *)&blk))) first_tp_srch_status = tabent->value; else first_tp_srch_status = NULL; } ASSERT_IS_WITHIN_TP_HIST_ARRAY_BOUNDS(first_tp_srch_status, sgm_info_ptr); if (!is_mm && first_tp_srch_status) { cr = first_tp_srch_status->cr; assert(cr && !first_tp_srch_status->cse); if (first_tp_srch_status->cycle == cr->cycle) { *cycle = first_tp_srch_status->cycle; *cr_out = cr; cr->refer = TRUE; if (CDB_STAGNATE <= t_tries) /* mu_reorg doesn't use TP else should have an || for that */ CWS_INSERT(blk); return (sm_uc_ptr_t)first_tp_srch_status->buffaddr; } else { /* Block was already part of the read-set of this transaction, but got recycled in the cache. * Allow block recycling by resetting first_tp_srch_status for this blk to reflect the new * buffer, cycle and cache-record. tp_hist (invoked much later) has validation checks to detect * if block recycling happened within the same mini-action and restart in that case. * Updating first_tp_srch_status has to wait until the end of t_qread since only then do we know * the values to update to. Set a variable that will enable the updation before returning. * Also assert that if we are in the final retry, we are never in a situation where we have a * block that got recycled since the start of the current mini-action. This is easily detected since * as part of the final retry we maintain a hash-table "cw_stagnate" that holds the blocks that * have been read as part of the current mini-action until now. */ assert(CDB_STAGNATE > t_tries || (NULL == lookup_hashtab_int4(&cw_stagnate, (uint4 *)&blk))); reset_first_tp_srch_status = TRUE; } } } if ((uint4)blk >= (uint4)csa->ti->total_blks) { /* Requested block out of range; could occur because of a concurrency conflict. 
mm_read and dsk_read assume blk is * never negative or greater than the maximum possible file size. If a concurrent REORG truncates the file, t_qread * can proceed despite blk being greater than total_blks. But dsk_read handles this fine; see comments below. */ assert((&FILE_INFO(gv_cur_region)->s_addrs == csa) && (csd == cs_data)); assert(!csa->now_crit); rdfail_detail = cdb_sc_blknumerr; return (sm_uc_ptr_t)NULL; } if (is_mm) { *cycle = CYCLE_SHRD_COPY; *cr_out = 0; return (sm_uc_ptr_t)(mm_read(blk)); } was_crit = csa->now_crit; cnl = csa->nl; encr_ptr = csa->encr_ptr; if (NULL != encr_ptr) { /* If this is an encrypted database and we hold crit, make sure our private cycle matches the shared cycle. * Or else we would need to call "process_reorg_encrypt_restart" below (a heavyweight operation) holding crit. */ assert(!was_crit || (cnl->reorg_encrypt_cycle == encr_ptr->reorg_encrypt_cycle)); seg = gv_cur_region->dyn.addr; issued_db_init_crypt_warning = encr_ptr->issued_db_init_crypt_warning; if (!IS_BITMAP_BLK(blk) && issued_db_init_crypt_warning) { /* A non-GT.M process is attempting to read a non-bitmap block, yet it has previously encountered an error * during db_init (because it did not have access to the encryption keys) and reported it with a -W- * severity. Since the block it is attempting to read can be in the unencrypted shared memory (read from * disk by another process with access to the encryption keys), we cannot let it access it without a valid * handle, so issue an rts_error. * * TODO: DSE and LKE could bypass getting the ftok semaphore. LKE is not an issue, but DSE does care about * the csa->reorg_encrypt_cycle. So it means DSE could get an inconsistent copy of reorg_encrypt_cycle * and associated hashes if it had done a bypass and a concurrent REORG -ENCRYPT is holding the ftok * semaphore and changing these values at the same time. */ assert(!IS_GTM_IMAGE); /* GT.M would have error'ed out in db_init */ gtmcrypt_errno = SET_REPEAT_MSG_MASK(SET_CRYPTERR_MASK(ERR_CRYPTBADCONFIG)); GTMCRYPT_REPORT_ERROR(gtmcrypt_errno, rts_error, seg->fname_len, seg->fname); } else if (cnl->reorg_encrypt_cycle != encr_ptr->reorg_encrypt_cycle) { /* A concurrent MUPIP REORG ENCRYPT occurred. Cannot proceed with the read even if the block is * already loaded from disk into the unencrypted global buffers (security issue). Need to load the * new encryption keys and only let those processes which are able to successfully do this proceed * with the read. First, copy the key hashes from csd into csa->encr_ptr. That needs crit * to ensure a concurrent MUPIP REORG ENCRYPT does not sneak in. * * Note: Even though we asserted a few lines above that if "was_crit" is TRUE, then we expect * the encryption cycles to be in sync, we handle this out-of-design situation in "pro" by fixing * the cycles while holding crit (hopefully rare case so it is okay to hold crit for a heavyweight call). */ if (!was_crit) grab_crit(gv_cur_region); /* Now that we have crit, sync them up by copying the new keys inside crit and opening the key handles * outside crit (a potentially long running operation). */ SIGNAL_REORG_ENCRYPT_RESTART(mu_reorg_encrypt_in_prog, reorg_encrypt_restart_csa, cnl, csa, csd, rdfail_detail, process_id); assert(csa == reorg_encrypt_restart_csa); if (!was_crit) rel_crit(gv_cur_region); /* If we are inside a TP read-write transaction, it is possible we already used the old keys for * prior calls to "jnl_format" so we have to restart (cannot sync up cycles). 
Do the same for * TP read-only transaction as well as NON-TP read-write transaction. In all these cases we know * the caller is capable of restarting. All other cases we dont know if the caller is capable so * sync up the cycles and proceed using the new keys for the read. * * But since it is possible the caller does not call t_retry right away (e.g. mupip reorg which can * choose to abandone this tree path and move on to another block without aborting this transaction) * it is better we finish the pending call to "process_reorg_encrypt_restart" right here before returning. */ process_reorg_encrypt_restart(); assert(NULL == reorg_encrypt_restart_csa); if (IS_NOT_SAFE_TO_SYNC_NEW_KEYS(dollar_tlevel, update_trans)) { assert(cdb_sc_reorg_encrypt == rdfail_detail); /* set by SIGNAL_REORG_ENCRYPT_RESTART macro */ return (sm_uc_ptr_t)NULL; } } } assert(dba_bg == csd->acc_meth); assert(!first_tp_srch_status || !first_tp_srch_status->cr || first_tp_srch_status->cycle != first_tp_srch_status->cr->cycle); if (FALSE == (clustered = csd->clustered)) bt = NULL; ocnt = 0; set_wc_blocked = FALSE; /* to indicate whether cnl->wc_blocked was set to TRUE by us */ hold_onto_crit = csa->hold_onto_crit; /* note down in local to avoid csa-> dereference in multiple usages below */ do { if (NULL == (cr = db_csh_get(blk))) { /* not in memory */ if (clustered && (NULL != (bt = bt_get(blk))) && (FALSE == bt->flushing)) bt = NULL; if (!csa->now_crit) { assert(!hold_onto_crit); if (NULL != bt) { /* at this point, bt is not NULL only if clustered and flushing - wait no crit */ assert(clustered); wait_for_block_flush(bt, blk); /* try for no other node currently writing the block */ } /* assume defaults for flush_target and buffs_per_flush */ flush_target = csd->flush_trigger; buffs_per_flush = 0; if ((0 != csd->epoch_taper) && (FALSE == gv_cur_region->read_only) && JNL_ENABLED(csd) && (0 != cnl->wcs_active_lvl) && (NOJNL != csa->jnl->channel) && (0 != cnl->jnl_file.u.inode) && csd->jnl_before_image) { EPOCH_TAPER_IF_NEEDED(csa, csd, cnl, (gd_region *) 0, FALSE, buffs_per_flush, flush_target); } if ((flush_target <= cnl->wcs_active_lvl) && (FALSE == gv_cur_region->read_only)) JNL_ENSURE_OPEN_WCS_WTSTART(csa, gv_cur_region, buffs_per_flush, dummy_errno); /* a macro that dclast's "wcs_wtstart" and checks for errors etc. */ /* Get crit but also ensure encryption cycles are in sync ("dsk_read" relies on this). * Note: "sync_needed" should be TRUE very rarely since we synced the cycles just a few lines * above. But in case a MUPIP REORG ENCRYPT concurrently sneaked in between these lines we * need to resync. 
*/ sync_needed = grab_crit_encr_cycle_sync(gv_cur_region); assert(NULL == reorg_encrypt_restart_csa); assert(!sync_needed || (NULL != encr_ptr)); if (sync_needed && IS_NOT_SAFE_TO_SYNC_NEW_KEYS(dollar_tlevel, update_trans)) { assert(cnl->reorg_encrypt_cycle == encr_ptr->reorg_encrypt_cycle); rel_crit(gv_cur_region); rdfail_detail = cdb_sc_reorg_encrypt; /* set by SIGNAL_REORG_ENCRYPT_RESTART macro */ return (sm_uc_ptr_t)NULL; } cr = db_csh_get(blk); /* in case blk arrived before crit */ } if (clustered && (NULL != (bt = bt_get(blk))) && (TRUE == bt->flushing)) { /* Once crit, need to assure that if clustered, that flushing is [still] complete * If it isn't, we missed an entire WM cycle and have to wait for another node to finish */ wait_for_block_flush(bt, blk); /* ensure no other node currently writing the block */ } if (NULL == cr) { /* really not in memory - must get a new buffer */ assert(csa->now_crit); cr = db_csh_getn(blk); if (CR_NOTVALID == (sm_long_t)cr) { assert(cnl->wc_blocked); /* only reason we currently know wcs_get_space could fail */ assert(gtm_white_box_test_case_enabled); SET_TRACEABLE_VAR(cnl->wc_blocked, TRUE); BG_TRACE_PRO_ANY(csa, wc_blocked_t_qread_db_csh_getn_invalid_blk); set_wc_blocked = TRUE; break; } assert(0 <= cr->read_in_progress); *cycle = cr->cycle; cr->tn = csd->trans_hist.curr_tn; /* Record history of most recent disk reads only in dbg builds for now. Although the macro * is just a couple dozen instructions, it is done while holding crit so we want to avoid * delaying crit unless really necessary. Whoever wants this information can enable it * by a build change to remove the DEBUG_ONLY part below. */ DEBUG_ONLY(DSKREAD_TRACE(csa, GDS_ANY_ABS2REL(csa,cr), cr->tn, process_id, blk, cr->cycle);) if (!was_crit && !hold_onto_crit) rel_crit(gv_cur_region); /* read outside of crit may be of a stale block but should be detected by t_end or tp_tend */ assert(0 == cr->dirty); assert(cr->read_in_progress >= 0); CR_BUFFER_CHECK(gv_cur_region, csa, csd, cr); buffaddr = (sm_uc_ptr_t)GDS_REL2ABS(cr->buffaddr); # ifdef DEBUG /* stop self to test sechshr_db_clnup clears the read state */ if (gtm_white_box_test_case_enabled && (WBTEST_SIGTSTP_IN_T_QREAD == gtm_white_box_test_case_number)) { /* this should never fail, but because of the way we developed the test we got paranoid */ dummy = kill(process_id, SIGTERM); assert(0 == dummy); for (dummy = 10; dummy; dummy--) LONG_SLEEP(10); /* time for sigterm to take hit before we clear block_now_locked */ } # endif if (SS_NORMAL != (status = dsk_read(blk, buffaddr, &ondsk_blkver, lcl_blk_free))) { /* buffer does not contain valid data, so reset blk to be empty */ cr->cycle++; /* increment cycle for blk number changes (for tp_hist and others) */ cr->blk = CR_BLKEMPTY; cr->r_epid = 0; RELEASE_BUFF_READ_LOCK(cr); TREF(block_now_locked) = NULL; assert(-1 <= cr->read_in_progress); assert(was_crit == csa->now_crit); if (ERR_DYNUPGRDFAIL == status) { /* if we dont hold crit on the region, it is possible due to concurrency conflicts * that this block is unused (i.e. marked free/recycled in bitmap, see comments in * gds_blk_upgrade.h). in this case we should not error out but instead restart. */ if (was_crit) { assert(FALSE); rts_error_csa(CSA_ARG(csa) VARLSTCNT(5) status, 3, blk, DB_LEN_STR(gv_cur_region)); } else { rdfail_detail = cdb_sc_lostcr; return (sm_uc_ptr_t)NULL; } } if ((-1 == status) && !was_crit) { /* LSEEKREAD and, consequently, dsk_read return -1 in case pread is unable to fetch * a full database block's length of data. 
This can happen if the requested read is * past the end of the file, which can happen if a concurrent truncate occurred * after the blk >= csa->ti->total_blks comparison above. Allow for this scenario * by restarting. However, if we've had crit the whole time, no truncate could have * happened. -1 indicates a problem with the file, so fall through to DBFILERR. */ rdfail_detail = cdb_sc_truncate; return (sm_uc_ptr_t)NULL; } else if (IS_CRYPTERR_MASK(status)) { seg = gv_cur_region->dyn.addr; GTMCRYPT_REPORT_ERROR(status, rts_error, seg->fname_len, seg->fname); } else { /* A DBFILERR can be thrown for two possible reasons: * (1) LSEEKREAD returned an unexpected error due to a filesystem problem; or * (2) csa/cs_addrs/csd/cs_data are out of sync, and we're trying to read a block * number for one region from another region with fewer total_blks. * We suspect the former is what happened in GTM-7623. Apparently the latter * has been an issue before, too. If either occurs again in pro, this assertpro * distinguishes the two possibilities. */ assertpro((&FILE_INFO(gv_cur_region)->s_addrs == csa) && (csd == cs_data)); rts_error_csa(CSA_ARG(csa) VARLSTCNT(5) ERR_DBFILERR, 2, DB_LEN_STR(gv_cur_region), status); } } disk_blk_read = TRUE; assert(0 <= cr->read_in_progress); assert(0 == cr->dirty); /* Only set in cache if read was success */ cr->ondsk_blkver = (lcl_blk_free ? GDSVCURR : ondsk_blkver); cr->r_epid = 0; RELEASE_BUFF_READ_LOCK(cr); TREF(block_now_locked) = NULL; assert(-1 <= cr->read_in_progress); *cr_out = cr; assert(was_crit == csa->now_crit); if (reset_first_tp_srch_status) RESET_FIRST_TP_SRCH_STATUS(first_tp_srch_status, cr, *cycle); return buffaddr; } else if (!was_crit && (BAD_LUCK_ABOUNDS > ocnt)) { assert(!hold_onto_crit); assert(TRUE == csa->now_crit); assert(cnl->in_crit == process_id); rel_crit(gv_cur_region); } }
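/* Illustrative sketch (not part of GT.M): the overall shape of the buffered read that t_qread() performs above --
 * look the block up in the cache and, on a miss, claim a buffer, read the block from disk, and hand back the
 * buffer plus its cycle so later validation (t_end/tp_tend) can detect a recycled buffer.  Crit acquisition,
 * read-in-progress locking, encryption-cycle syncing and restart handling are deliberately omitted; every name
 * here (demo_cache_rec, demo_qread, ...) is hypothetical.
 */
#include <string.h>

#define DEMO_NBUFS	4
#define DEMO_BLKSZ	512
#define DEMO_BLKEMPTY	(-1)

typedef struct { int blk; unsigned cycle; char data[DEMO_BLKSZ]; } demo_cache_rec;

static demo_cache_rec	demo_cache[DEMO_NBUFS] = {{DEMO_BLKEMPTY}, {DEMO_BLKEMPTY}, {DEMO_BLKEMPTY}, {DEMO_BLKEMPTY}};
static unsigned		demo_clock;

static demo_cache_rec *demo_cache_get(int blk)		/* NULL on cache miss */
{
	int	i;

	for (i = 0; i < DEMO_NBUFS; i++)
		if (demo_cache[i].blk == blk)
			return &demo_cache[i];
	return NULL;
}

static demo_cache_rec *demo_cache_claim(int blk)	/* reuse a buffer (simple round robin) */
{
	demo_cache_rec	*cr = &demo_cache[demo_clock++ % DEMO_NBUFS];

	cr->blk = blk;
	cr->cycle++;		/* a cycle bump lets later validation spot that the buffer was recycled */
	return cr;
}

static int demo_disk_read(int blk, char *buff)		/* stand-in for dsk_read(); 0 on success */
{
	memset(buff, 0, DEMO_BLKSZ);
	return (blk < 0) ? -1 : 0;
}

char *demo_qread(int blk, unsigned *cycle_out)
{
	demo_cache_rec	*cr;

	if (NULL != (cr = demo_cache_get(blk)))
	{	/* cache hit: return the shared buffer plus its cycle for later validation */
		*cycle_out = cr->cycle;
		return cr->data;
	}
	/* cache miss: t_qread claims the buffer under crit and performs the disk read after releasing crit;
	 * a stale read is tolerated because t_end/tp_tend revalidate using the cycle. */
	cr = demo_cache_claim(blk);
	if (0 != demo_disk_read(blk, cr->data))
	{
		cr->blk = DEMO_BLKEMPTY;	/* mark the buffer empty again, as t_qread does with CR_BLKEMPTY */
		return NULL;
	}
	*cycle_out = cr->cycle;
	return cr->data;
}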