int mu_rndwn_sem_all(void) { int save_errno, exit_status = SS_NORMAL, semid; char entry[MAX_ENTRY_LEN]; FILE *pf; char fname[MAX_FN_LEN + 1], *fgets_res; boolean_t rem_sem; shm_parms *parm_buff; if (NULL == (pf = POPEN(IPCS_SEM_CMD_STR ,"r"))) { save_errno = errno; gtm_putmsg_csa(CSA_ARG(NULL) VARLSTCNT(8) ERR_SYSCALL, 5, RTS_ERROR_LITERAL("POPEN()"), CALLFROM, save_errno); return ERR_MUNOTALLSEC; } while (NULL != (FGETS(entry, SIZEOF(entry), pf, fgets_res)) && entry[0] != '\n') { if (-1 != (semid = parse_sem_id(entry))) { if (is_orphaned_gtm_semaphore(semid)) { /* semval == 0 and corresponding shared memory has been removed */ if (-1 != semctl(semid, 0, IPC_RMID)) { gtm_putmsg_csa(CSA_ARG(NULL) VARLSTCNT(3) ERR_SEMREMOVED, 1, semid); send_msg_csa(CSA_ARG(NULL) VARLSTCNT(3) ERR_SEMREMOVED, 1, semid); } } } } pclose(pf); return exit_status; }
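/* Illustrative sketch (an addition, not part of the routine above): the same popen/fgets scan over
 * `ipcs -s` output in plain libc, without the GT.M POPEN/FGETS wrappers. The command string, the column
 * parsed and the function name are assumptions; the real code uses IPCS_SEM_CMD_STR and parse_sem_id(),
 * whose formats are platform specific, and removes orphaned ids with semctl(semid, 0, IPC_RMID). */
#include <stdio.h>

static int demo_scan_sem_ids(void)
{
	FILE	*pf;
	char	entry[512];
	int	semid;

	if (NULL == (pf = popen("ipcs -s", "r")))	/* assumed command; real code uses IPCS_SEM_CMD_STR */
		return -1;
	while ((NULL != fgets(entry, sizeof(entry), pf)) && ('\n' != entry[0]))
	{	/* assume the semaphore id is the second whitespace-delimited column (true of Linux ipcs) */
		if (1 == sscanf(entry, "%*s %d", &semid))
			printf("candidate semid %d\n", semid);
	}
	pclose(pf);
	return 0;
}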
int continue_proc(pid_t pid) { DCL_THREADGBL_ACCESS; SETUP_THREADGBL_ACCESS; DEBUG_ONLY(if (!TREF(gtm_usesecshr))) /* Cause debug builds to talk to gtmsecshr more often */ { if (WBTEST_ENABLED(WBTEST_HOLD_GTMSOURCE_SRV_LATCH)) { /* Simulate the kill below, but ignore its return status so that we end up invoking gtmsecshr */ kill(pid, SIGCONT); /* Wait until the target quits so that kill() call by gtmsecshr fails with ESRCH */ while (is_proc_alive(pid, 0)) LONG_SLEEP(1); } else if (0 == kill(pid, SIGCONT)) return 0; else if (ESRCH == errno) { send_msg_csa(CSA_ARG(NULL) VARLSTCNT(5) ERR_NOSUCHPROC, 3, pid, RTS_ERROR_LITERAL("continue")); return ESRCH; } else assert(EINVAL != errno); } return send_mesg2gtmsecshr(CONTINUE_PROCESS, pid, NULL, 0); }
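/* Illustrative sketch: the direct-signal fast path of continue_proc() in isolation. kill(pid, SIGCONT)
 * either succeeds, fails with ESRCH (no such process), or fails for another reason (e.g. EPERM), in which
 * case the routine above falls back to asking gtmsecshr to deliver the signal on its behalf. The function
 * name here is illustrative. */
#include <errno.h>
#include <signal.h>
#include <sys/types.h>

static int demo_continue_proc(pid_t pid)
{
	if (0 == kill(pid, SIGCONT))
		return 0;		/* signalled directly */
	if (ESRCH == errno)
		return ESRCH;		/* target process no longer exists */
	return errno;			/* e.g. EPERM: a privileged helper would be needed here */
}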
int create_server(void) { int child_pid, done_pid, status = 0; # ifdef _BSD union wait chld_status; # define CSTAT chld_status # else # define CSTAT status # endif int save_errno; FORK(child_pid); if (0 == child_pid) { process_id = getpid(); /* Do exec using gtmsecshr_path, which was initialized in file check code - send_mesg2gtmsecshr */ if (WBTEST_ENABLED(WBTEST_BADEXEC_SECSHR_PROCESS)) STRCPY(gtmsecshr_path, ""); status = EXECL(gtmsecshr_path, gtmsecshr_path, 0); if (-1 == status) { send_msg_csa(CSA_ARG(NULL) VARLSTCNT(9) ERR_GTMSECSHRSTART, 3, RTS_ERROR_TEXT("Client"), process_id, ERR_TEXT, 2, RTS_ERROR_STRING(secshrstart_error_code[UNABLETOEXECGTMSECSHR])); UNDERSCORE_EXIT(UNABLETOEXECGTMSECSHR); } } else { if (-1 == child_pid) { status = GNDCHLDFORKFLD; gtm_putmsg_csa(CSA_ARG(NULL) VARLSTCNT(10) ERR_GTMSECSHRSTART, 3, RTS_ERROR_TEXT("Client"), process_id, ERR_TEXT, 2, RTS_ERROR_TEXT("Failed to fork off gtmsecshr"), errno); /* Sleep for a while and hope a subsequent fork will succeed */ hiber_start(1000); } for (; !status ;) { /* To prevent a warning message that the compiler issues */ done_pid = wait(&CSTAT); if (done_pid == child_pid) { status = WEXITSTATUS(CSTAT); break; } else if (-1 == done_pid) { if (ECHILD == errno) /* Assume normal exit status */ break; else if (EINTR != errno) { status = GNDCHLDFORKFLD; gtm_putmsg_csa(CSA_ARG(NULL) VARLSTCNT(10) ERR_GTMSECSHRSTART, 3, RTS_ERROR_TEXT("Client"), process_id, ERR_TEXT, 2, RTS_ERROR_TEXT("Error spawning gtmsecshr"), errno); } } } } return status; }
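/* Illustrative sketch: the fork/exec/wait shape used by create_server() above, reduced to plain POSIX
 * calls with placeholder error handling. The real code execs gtmsecshr_path and maps failures onto
 * GTMSECSHRSTART / UNABLETOEXECGTMSECSHR; the helper name and return values here are assumptions. */
#include <errno.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

static int demo_spawn_helper(const char *helper_path)
{
	pid_t	child_pid, done_pid;
	int	stat_loc;

	child_pid = fork();
	if (0 == child_pid)
	{	/* child: replace ourselves with the helper; _exit() (not exit()) if the exec fails */
		execl(helper_path, helper_path, (char *)NULL);
		_exit(EXIT_FAILURE);
	}
	if (-1 == child_pid)
		return errno;	/* fork failed; the real code reports GTMSECSHRSTART and sleeps before retrying */
	for (; ;)
	{	/* parent: wait for the immediate child; its exit status says whether startup succeeded */
		done_pid = wait(&stat_loc);
		if (done_pid == child_pid)
			return WIFEXITED(stat_loc) ? WEXITSTATUS(stat_loc) : EXIT_FAILURE;
		if ((-1 == done_pid) && (EINTR != errno))
			return (ECHILD == errno) ? 0 : errno;	/* ECHILD: already reaped; treat as normal exit */
	}
}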
/* -------------------------------------------------------------------------------- This function renames a file if it exists; otherwise it does nothing. --------------------------------------------------------------------------------- */ int rename_file_if_exists(char *org_fn, int org_fn_len, char *rename_fn, int *rename_fn_len, uint4 *ustatus) { mstr orgfile; int status; jnl_tm_t now; memcpy(rename_fn, org_fn, org_fn_len + 1); /* Ensure it is NULL terminated */ *rename_fn_len = org_fn_len; orgfile.addr = org_fn; orgfile.len = org_fn_len; if (FILE_NOT_FOUND == (status = gtm_file_stat(&orgfile, NULL, NULL, FALSE, ustatus))) return RENAME_NOT_REQD; else if (FILE_STAT_ERROR == status) { assert(SS_NORMAL != *ustatus); return RENAME_FAILED; } /* File is present in the system */ assert(0 < MAX_FN_LEN - org_fn_len - 1); JNL_SHORT_TIME(now); if (SS_NORMAL != (status = prepare_unique_name(org_fn, org_fn_len, "", "", rename_fn, rename_fn_len, now, ustatus))) { /* "prepare_unique_name" would not have set "ustatus" to the error code. So set it here and return */ assert(SS_NORMAL == *ustatus); *ustatus = status; assert(SS_NORMAL != *ustatus); return RENAME_FAILED; } assert(0 == rename_fn[*rename_fn_len]); if (SS_NORMAL != (status = gtm_rename(org_fn, org_fn_len, rename_fn, *rename_fn_len, ustatus))) { *ustatus = status; assert(SS_NORMAL != *ustatus); if (IS_GTM_IMAGE) send_msg_csa(CSA_ARG(NULL) VARLSTCNT(9) ERR_RENAMEFAIL, 4, org_fn_len, org_fn, *rename_fn_len, rename_fn, status, 0, *ustatus); else gtm_putmsg_csa(CSA_ARG(NULL) VARLSTCNT1(8) ERR_RENAMEFAIL, 4, org_fn_len, org_fn, *rename_fn_len, rename_fn, status, PUT_SYS_ERRNO(*ustatus)); return RENAME_FAILED; } if (IS_GTM_IMAGE) send_msg_csa(CSA_ARG(NULL) VARLSTCNT (6) ERR_FILERENAME, 4, org_fn_len, org_fn, *rename_fn_len, rename_fn); else gtm_putmsg_csa(CSA_ARG(NULL) VARLSTCNT (6) ERR_FILERENAME, 4, org_fn_len, org_fn, *rename_fn_len, rename_fn); return RENAME_SUCCESS; }
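/* Illustrative sketch: a self-contained analogue of rename_file_if_exists(), using stat() and rename()
 * directly. The timestamp-suffix scheme is an assumption standing in for prepare_unique_name(); the real
 * routine also reports RENAMEFAIL/FILERENAME through the messaging layer. */
#include <errno.h>
#include <stdio.h>
#include <sys/stat.h>
#include <time.h>

#define DEMO_RENAME_NOT_REQD	0
#define DEMO_RENAME_SUCCESS	1
#define DEMO_RENAME_FAILED	-1

static int demo_rename_if_exists(const char *org_fn, char *rename_fn, size_t rename_fn_size)
{
	struct stat	sb;

	if (0 != stat(org_fn, &sb))
		return (ENOENT == errno) ? DEMO_RENAME_NOT_REQD : DEMO_RENAME_FAILED;
	/* file exists: derive a new name by appending the current time (assumed suffix format) */
	snprintf(rename_fn, rename_fn_size, "%s_%ld", org_fn, (long)time(NULL));
	if (0 != rename(org_fn, rename_fn))
		return DEMO_RENAME_FAILED;
	return DEMO_RENAME_SUCCESS;
}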
void set_enospc_flags(gd_addr *addr_ptr, char enospc_enable_list[], boolean_t ok_to_interrupt) { gd_region *r_local, *r_top; int i; sgmnt_addrs *csa; const char *syslog_msg; DCL_THREADGBL_ACCESS; SETUP_THREADGBL_ACCESS; for (r_local = addr_ptr->regions, r_top = r_local + addr_ptr->n_regions, i = 0; r_local < r_top; r_local++, i++) { if (!r_local->open || r_local->was_open) continue; if ((dba_bg != r_local->dyn.addr->acc_meth) && (dba_mm != r_local->dyn.addr->acc_meth)) continue; csa = &FILE_INFO(r_local)->s_addrs; if (ANTICIPATORY_FREEZE_ENABLED(csa)) { switch(enospc_enable_list[i]) { case NONE: syslog_msg = "Turning off fake ENOSPC for both database and journal file."; csa->nl->fake_db_enospc = FALSE; csa->nl->fake_jnl_enospc = FALSE; break; case DB_ON: syslog_msg = "Turning on fake ENOSPC only for database file."; csa->nl->fake_db_enospc = TRUE; csa->nl->fake_jnl_enospc = FALSE; break; case JNL_ON: syslog_msg = "Turning on fake ENOSPC only for journal file."; csa->nl->fake_db_enospc = FALSE; csa->nl->fake_jnl_enospc = TRUE; break; case DB_AND_JNL_ON: syslog_msg = "Turning on fake ENOSPC for both database and journal file."; csa->nl->fake_db_enospc = TRUE; csa->nl->fake_jnl_enospc = TRUE; break; default: assert(FALSE); } if (ok_to_interrupt) send_msg_csa(CSA_ARG(NULL) VARLSTCNT(8) ERR_TEXT, 2, DB_LEN_STR(r_local), ERR_TEXT, 2, LEN_AND_STR(syslog_msg)); } } }
void set_enospc_if_needed() { gd_addr *addr_ptr; char enospc_enable_list[MAX_REGIONS]; boolean_t ok_to_interrupt, is_time_to_act; DCL_THREADGBL_ACCESS; SETUP_THREADGBL_ACCESS; if (TREF(gtm_test_fake_enospc) && is_jnlpool_creator && ANTICIPATORY_FREEZE_AVAILABLE) { ok_to_interrupt = (INTRPT_OK_TO_INTERRUPT == intrpt_ok_state) && (0 == gtmMallocDepth); is_time_to_act = (next_heartbeat_counter == heartbeat_counter); if (syslog_deferred && ok_to_interrupt) { send_msg_csa(CSA_ARG(NULL) VARLSTCNT(3) ERR_FAKENOSPCLEARED, 1, (heartbeat_counter - syslog_deferred)); syslog_deferred = 0; } if (!is_time_to_act || syslog_deferred || (!ok_to_interrupt && !IS_REPL_INST_FROZEN)) { /* We have to skip this because we have just fallen into deferred zone or we are currently in it */ if (is_time_to_act) next_heartbeat_counter++; /* Try again in the next heartbeat */ return; } assert(0 == syslog_deferred); srand(time(NULL)); addr_ptr = get_next_gdr(NULL); if (NULL == addr_ptr) /* Ensure that there is a global directory to operate on. */ return; assert(NULL == get_next_gdr(addr_ptr)); /* Randomly simulate ENOSPC or free space. NO more than 50 regions are allowed to avoid unnecessary * malloc/frees in debug-only code */ assert(MAX_REGIONS >= addr_ptr->n_regions); if (!IS_REPL_INST_FROZEN) { /* We are in an UNFROZEN state, and about to be FROZEN due to ENOSPC */ choose_random_reg_list(enospc_enable_list, addr_ptr->n_regions); next_heartbeat_counter = heartbeat_counter + ENOSPC_FROZEN_DURATION; } else { /* We are in a FROZEN state, and about to be UNFROZEN due to free space */ memset(enospc_enable_list, 0, MAX_REGIONS); next_heartbeat_counter = heartbeat_counter + ENOSPC_UNFROZEN_DURATION; if (!ok_to_interrupt) syslog_deferred = heartbeat_counter; } set_enospc_flags(addr_ptr, enospc_enable_list, ok_to_interrupt); } }
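/* Illustrative sketch: one plausible shape for choose_random_reg_list(), which the routine above calls
 * but which is not shown here. This is an assumption made purely to make the flow concrete: one of the
 * four fake-ENOSPC states (matching the NONE/DB_ON/JNL_ON/DB_AND_JNL_ON switch in set_enospc_flags())
 * is picked at random for each region. */
#include <stdlib.h>

enum { DEMO_NONE = 0, DEMO_DB_ON, DEMO_JNL_ON, DEMO_DB_AND_JNL_ON };	/* hypothetical encodings */

static void demo_choose_random_reg_list(char enable_list[], int n_regions)
{
	int	i;

	for (i = 0; i < n_regions; i++)		/* rand() was seeded by the srand(time(NULL)) call above */
		enable_list[i] = (char)(rand() % 4);
}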
void ch_overrun(void) { PRN_ERROR; gtm_putmsg_csa(CSA_ARG(NULL) VARLSTCNT(1) ERR_NOCHLEFT); send_msg_csa(CSA_ARG(NULL) VARLSTCNT(1) ERR_NOCHLEFT); /* If exit handler is already active, we will just core and die */ if (exit_handler_active) gtm_dump_core(); else { /* Otherwise, we generate a core and exit to drive the condition handler */ gtm_fork_n_core(); MUMPS_EXIT; } }
/* This function sets the "ftok_counter_halted" field to TRUE in the instance file header and flushes it to disk. * Caller could be attached to the journal pool or not. If not, update file header directly. If yes, go through locks. */ void repl_inst_ftok_counter_halted(unix_db_info *udi, char *file_type, repl_inst_hdr *repl_instance) { assert(udi->grabbed_ftok_sem); /* this ensures we have a lock before we modify the instance file header */ if (NULL != jnlpool.repl_inst_filehdr) { assert(!jnlpool.repl_inst_filehdr->ftok_counter_halted); jnlpool.repl_inst_filehdr->ftok_counter_halted = TRUE; grab_lock(jnlpool.jnlpool_dummy_reg, TRUE, ASSERT_NO_ONLINE_ROLLBACK); repl_inst_flush_filehdr(); rel_lock(jnlpool.jnlpool_dummy_reg); } else { assert(!repl_instance->ftok_counter_halted); repl_instance->ftok_counter_halted = TRUE; repl_inst_write(udi->fn, (off_t)0, (sm_uc_ptr_t)repl_instance, SIZEOF(repl_inst_hdr)); } /* Ignore any errors while flushing the "halted" value to the file header. The only consequence is other processes * will incur a performance overhead trying to unnecessarily bump the semaphore counter when it is already ERANGE. */ send_msg_csa(CSA_ARG(NULL) VARLSTCNT(7) ERR_NOMORESEMCNT, 5, LEN_AND_LIT("ftok"), file_type, LEN_AND_STR(udi->fn)); }
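/* Illustrative sketch: how a process would discover that an ftok counter semaphore can no longer be
 * bumped. Incrementing a System V semaphore beyond its maximum fails with ERANGE, which is the condition
 * the "ftok_counter_halted" flag above records so that later processes stop trying. Semaphore number,
 * SEM_UNDO usage and the function name are assumptions for illustration. */
#include <errno.h>
#include <sys/ipc.h>
#include <sys/sem.h>

static int demo_bump_counter_sem(int semid)
{
	struct sembuf	sop;

	sop.sem_num = 0;	/* counter semaphore (assumed position) */
	sop.sem_op = 1;		/* increment by one */
	sop.sem_flg = SEM_UNDO;
	if (-1 == semop(semid, &sop, 1))
		return (ERANGE == errno) ? ERANGE : errno;	/* ERANGE: counter overflowed; record "halted" */
	return 0;
}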
void deferred_signal_handler(void) { void (*signal_routine)(); DCL_THREADGBL_ACCESS; SETUP_THREADGBL_ACCESS; /* To avoid nested calls to this routine, progress the forced_exit state. */ SET_FORCED_EXIT_STATE_ALREADY_EXITING; if (exit_handler_active) { assert(FALSE); /* at this point in time (June 2003) there is no way we know of to get here, hence the assert */ return; /* since anyway we are exiting currently, resume exit handling instead of reissuing another one */ } /* For signals that get a delayed response so we can get out of crit, we also delay the messages. * This routine will output those delayed messages from the appropriate structures to both the * user and the system console. */ /* note can't use switch here because ERR_xxx are not defined as constants */ if (ERR_KILLBYSIG == forced_exit_err) { send_msg_csa(CSA_ARG(NULL) VARLSTCNT(6) ERR_KILLBYSIG, 4, GTMIMAGENAMETXT(image_type), process_id, signal_info.signal); gtm_putmsg_csa(CSA_ARG(NULL) VARLSTCNT(6) ERR_KILLBYSIG, 4, GTMIMAGENAMETXT(image_type), process_id, signal_info.signal); } else if (ERR_KILLBYSIGUINFO == forced_exit_err) { send_msg_csa(CSA_ARG(NULL) VARLSTCNT(8) ERR_KILLBYSIGUINFO, 6, GTMIMAGENAMETXT(image_type), process_id, signal_info.signal, signal_info.send_pid, signal_info.send_uid); gtm_putmsg_csa(CSA_ARG(NULL) VARLSTCNT(8) ERR_KILLBYSIGUINFO, 6, GTMIMAGENAMETXT(image_type), process_id, signal_info.signal, signal_info.send_pid, signal_info.send_uid); } else if (ERR_KILLBYSIGSINFO1 == forced_exit_err) { send_msg_csa(CSA_ARG(NULL) VARLSTCNT(8) ERR_KILLBYSIGSINFO1, 6, GTMIMAGENAMETXT(image_type), process_id, signal_info.signal, signal_info.int_iadr, signal_info.bad_vadr); gtm_putmsg_csa(CSA_ARG(NULL) VARLSTCNT(8) ERR_KILLBYSIGSINFO1, 6, GTMIMAGENAMETXT(image_type), process_id, signal_info.signal, signal_info.int_iadr, signal_info.bad_vadr); } else if (ERR_KILLBYSIGSINFO2 == forced_exit_err) { send_msg_csa(CSA_ARG(NULL) VARLSTCNT(7) ERR_KILLBYSIGSINFO2, 5, GTMIMAGENAMETXT(image_type), process_id, signal_info.signal, signal_info.int_iadr); gtm_putmsg_csa(CSA_ARG(NULL) VARLSTCNT(7) ERR_KILLBYSIGSINFO2, 5, GTMIMAGENAMETXT(image_type), process_id, signal_info.signal, signal_info.int_iadr); } else if (ERR_KILLBYSIGSINFO3 == forced_exit_err) { send_msg_csa(CSA_ARG(NULL) VARLSTCNT(7) ERR_KILLBYSIGSINFO3, 5, GTMIMAGENAMETXT(image_type), process_id, signal_info.signal, signal_info.bad_vadr); gtm_putmsg_csa(CSA_ARG(NULL) VARLSTCNT(7) ERR_KILLBYSIGSINFO3, 5, GTMIMAGENAMETXT(image_type), process_id, signal_info.signal, signal_info.bad_vadr); } else if (ERR_FORCEDHALT != forced_exit_err || !gtm_quiet_halt) { /* No HALT messages if quiet halt is requested */ send_msg_csa(CSA_ARG(NULL) VARLSTCNT(1) forced_exit_err); gtm_putmsg_csa(CSA_ARG(NULL) VARLSTCNT(1) forced_exit_err); } assert(OK_TO_INTERRUPT); /* Signal intent to exit BEFORE driving condition handlers. This avoids checks that will otherwise fail (for example * if mdb_condition_handler/preemptive_db_clnup gets called below, that could invoke the RESET_GV_TARGET macro which in turn * would assert that gv_target->gd_csa is equal to cs_addrs. This could not be true in case we were in mainline code * that was interrupted by the flush timer for a different region which in turn was interrupted by an external signal * that would drive us to exit. Setting the "process_exiting" variable causes those csa checks to pass. 
*/ SET_PROCESS_EXITING_TRUE; # ifdef DEBUG if (gtm_white_box_test_case_enabled && (WBTEST_DEFERRED_TIMERS == gtm_white_box_test_case_number) && (2 == gtm_white_box_test_case_count)) { DEFER_INTERRUPTS(INTRPT_NO_TIMER_EVENTS); DBGFPF((stderr, "DEFERRED_SIGNAL_HANDLER: will sleep for 20 seconds\n")); LONG_SLEEP(20); DBGFPF((stderr, "DEFERRED_SIGNAL_HANDLER: done sleeping\n")); ENABLE_INTERRUPTS(INTRPT_NO_TIMER_EVENTS); } # endif /* If any special routines are registered to be driven on a signal, drive them now */ if ((0 != exi_condition) && (NULL != call_on_signal)) { signal_routine = call_on_signal; call_on_signal = NULL; /* So we don't recursively call ourselves */ (*signal_routine)(); } /* Note, we do not drive create_fatal_error zshow_dmp() in this routine since any deferrable signals are * by definition not fatal. */ exit(-exi_condition); }
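/* Illustrative sketch of the deferral pattern that deferred_signal_handler() above completes: the signal
 * handler only records the request while the process is inside a critical region, and the mainline honors
 * the deferred request once it is safe. Variable and function names are illustrative, not GT.M's. */
#include <signal.h>
#include <unistd.h>

static volatile sig_atomic_t	demo_in_crit = 0;
static volatile sig_atomic_t	demo_forced_exit = 0;

static void demo_sig_handler(int sig)
{
	(void)sig;
	if (demo_in_crit)
		demo_forced_exit = 1;	/* defer: just note that an exit was requested */
	else
		_exit(1);		/* safe point: exit now (the real code runs full exit handling) */
}

static void demo_leave_crit(void)
{	/* mainline counterpart: on leaving the critical region, honor any deferred request */
	demo_in_crit = 0;
	if (demo_forced_exit)
		_exit(1);		/* stands in for calling the deferred handler above */
}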
void mu_reorg_upgrd_dwngrd(void) { blk_hdr new_hdr; blk_segment *bs1, *bs_ptr; block_id *blkid_ptr, curblk, curbmp, start_blk, stop_blk, start_bmp, last_bmp; block_id startblk_input, stopblk_input; boolean_t upgrade, downgrade, safejnl, nosafejnl, region, first_reorg_in_this_db_fmt, reorg_entiredb; boolean_t startblk_specified, stopblk_specified, set_fully_upgraded, db_got_to_v5_once, mark_blk_free; cache_rec_ptr_t cr; char *bml_lcl_buff = NULL, *command, *reorg_command; sm_uc_ptr_t bptr = NULL; cw_set_element *cse; enum cdb_sc cdb_status; enum db_ver new_db_format, ondsk_blkver; gd_region *reg; int cycle; int4 blk_seg_cnt, blk_size; /* needed for BLK_INIT,BLK_SEG and BLK_FINI macros */ int4 blocks_left, expected_blks2upgrd, actual_blks2upgrd, total_blks, free_blks; int4 status, status1, mapsize, lcnt, bml_status; reorg_stats_t reorg_stats; sgmnt_addrs *csa; sgmnt_data_ptr_t csd; sm_uc_ptr_t blkBase, bml_sm_buff; /* shared memory pointer to the bitmap global buffer */ srch_hist alt_hist; srch_blk_status *blkhist, bmlhist; tp_region *rptr; trans_num curr_tn; unsigned char save_cw_set_depth; uint4 lcl_update_trans; region = (CLI_PRESENT == cli_present("REGION")); upgrade = (CLI_PRESENT == cli_present("UPGRADE")); downgrade = (CLI_PRESENT == cli_present("DOWNGRADE")); assert(upgrade && !downgrade || !upgrade && downgrade); command = upgrade ? "UPGRADE" : "DOWNGRADE"; reorg_command = upgrade ? "MUPIP REORG UPGRADE" : "MUPIP REORG DOWNGRADE"; reorg_entiredb = TRUE; /* unless STARTBLK or STOPBLK is specified we are going to {up,down}grade the entire database */ startblk_specified = FALSE; assert(SIZEOF(block_id) == SIZEOF(uint4)); if ((CLI_PRESENT == cli_present("STARTBLK")) && (cli_get_hex("STARTBLK", (uint4 *)&startblk_input))) { reorg_entiredb = FALSE; startblk_specified = TRUE; } stopblk_specified = FALSE; assert(SIZEOF(block_id) == SIZEOF(uint4)); if ((CLI_PRESENT == cli_present("STOPBLK")) && (cli_get_hex("STOPBLK", (uint4 *)&stopblk_input))) { reorg_entiredb = FALSE; stopblk_specified = TRUE; } mu_reorg_upgrd_dwngrd_in_prog = TRUE; mu_reorg_nosafejnl = (CLI_NEGATED == cli_present("SAFEJNL")) ? TRUE : FALSE; assert(region); status = SS_NORMAL; error_mupip = FALSE; gvinit(); /* initialize gd_header (needed by the later call to mu_getlst) */ mu_getlst("REG_NAME", SIZEOF(tp_region)); /* get the parameter corresponding to REGION qualifier */ if (error_mupip) { util_out_print("!/MUPIP REORG !AD cannot proceed with above errors!/", TRUE, LEN_AND_STR(command)); mupip_exit(ERR_MUNOACTION); } assert(DBKEYSIZE(MAX_KEY_SZ) == gv_keysize); /* no need to invoke GVKEYSIZE_INIT_IF_NEEDED macro */ gv_target = targ_alloc(gv_keysize, NULL, NULL); /* t_begin needs this initialized */ gv_target_list = NULL; memset(&alt_hist, 0, SIZEOF(alt_hist)); /* null-initialize history */ blkhist = &alt_hist.h[0]; for (rptr = grlist; NULL != rptr; rptr = rptr->fPtr) { if (mu_ctrly_occurred || mu_ctrlc_occurred) break; reg = rptr->reg; util_out_print("!/Region !AD : MUPIP REORG !AD started", TRUE, REG_LEN_STR(reg), LEN_AND_STR(command)); if (reg_cmcheck(reg)) { util_out_print("Region !AD : MUPIP REORG !AD cannot run across network", TRUE, REG_LEN_STR(reg), LEN_AND_STR(command)); status = ERR_MUNOFINISH; continue; } mu_reorg_process = TRUE; /* gvcst_init will use this value to use gtm_poollimit settings. */ gvcst_init(reg); mu_reorg_process = FALSE; assert(update_array != NULL); /* access method stored in global directory and database file header might be different in which case * the database setting prevails. 
therefore, the access method check can be done only after opening * the database (i.e. after the gvcst_init) */ if (dba_bg != REG_ACC_METH(reg)) { util_out_print("Region !AD : MUPIP REORG !AD cannot continue as access method is not BG", TRUE, REG_LEN_STR(reg), LEN_AND_STR(command)); status = ERR_MUNOFINISH; continue; } /* The mu_getlst call above uses insert_region to create the grlist, which ensures that duplicate regions mapping to * the same db file correspond to only one grlist entry. */ assert(FALSE == reg->was_open); TP_CHANGE_REG(reg); /* sets gv_cur_region, cs_addrs, cs_data */ csa = cs_addrs; csd = cs_data; blk_size = csd->blk_size; /* "blk_size" is used by the BLK_FINI macro */ if (reg->read_only) { gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(4) ERR_DBRDONLY, 2, DB_LEN_STR(reg)); status = ERR_MUNOFINISH; continue; } assert(GDSVCURR == GDSV6); /* so we trip this assert in case GDSVCURR changes without a change to this module */ new_db_format = (upgrade ? GDSV6 : GDSV4); grab_crit(reg); curr_tn = csd->trans_hist.curr_tn; /* set the desired db format in the file header to the appropriate version, increment transaction number */ status1 = desired_db_format_set(reg, new_db_format, reorg_command); assert(csa->now_crit); /* desired_db_format_set() should not have released crit */ first_reorg_in_this_db_fmt = TRUE; /* with the current desired_db_format, this is the first reorg */ if (SS_NORMAL != status1) { /* "desired_db_format_set" would have printed appropriate error messages */ if (ERR_MUNOACTION != status1) { /* real error occurred while setting the db format. skip to next region */ status = ERR_MUNOFINISH; rel_crit(reg); continue; } util_out_print("Region !AD : Desired DB Format remains at !AD after !AD", TRUE, REG_LEN_STR(reg), LEN_AND_STR(gtm_dbversion_table[new_db_format]), LEN_AND_STR(reorg_command)); if (csd->reorg_db_fmt_start_tn == csd->desired_db_format_tn) first_reorg_in_this_db_fmt = FALSE; } else util_out_print("Region !AD : Desired DB Format set to !AD by !AD", TRUE, REG_LEN_STR(reg), LEN_AND_STR(gtm_dbversion_table[new_db_format]), LEN_AND_STR(reorg_command)); assert(dba_bg == csd->acc_meth); /* Check blks_to_upgrd counter to see if upgrade/downgrade is complete */ total_blks = csd->trans_hist.total_blks; free_blks = csd->trans_hist.free_blocks; actual_blks2upgrd = csd->blks_to_upgrd; /* If MUPIP REORG UPGRADE and there is no block to upgrade in the database as indicated by BOTH * "csd->blks_to_upgrd" and "csd->fully_upgraded", then we can skip processing. * If MUPIP REORG UPGRADE and all non-free blocks need to be upgraded then again we can skip processing. 
*/ if ((upgrade && (0 == actual_blks2upgrd) && csd->fully_upgraded) || (!upgrade && ((total_blks - free_blks) == actual_blks2upgrd))) { util_out_print("Region !AD : Blocks to Upgrade counter indicates no action needed for MUPIP REORG !AD", TRUE, REG_LEN_STR(reg), LEN_AND_STR(command)); util_out_print("Region !AD : Total Blocks = [0x!XL] : Free Blocks = [0x!XL] : " "Blocks to upgrade = [0x!XL]", TRUE, REG_LEN_STR(reg), total_blks, free_blks, actual_blks2upgrd); util_out_print("Region !AD : MUPIP REORG !AD finished!/", TRUE, REG_LEN_STR(reg), LEN_AND_STR(command)); rel_crit(reg); continue; } stop_blk = total_blks; if (stopblk_specified && stopblk_input <= stop_blk) stop_blk = stopblk_input; if (first_reorg_in_this_db_fmt) { /* Note down reorg start tn (in case we are interrupted, future reorg will know to resume) */ csd->reorg_db_fmt_start_tn = csd->desired_db_format_tn; csd->reorg_upgrd_dwngrd_restart_block = 0; start_blk = (startblk_specified ? startblk_input : 0); } else { /* Either a concurrent MUPIP REORG of the same type ({up,down}grade) is currently running * or a previously running REORG of the same type was interrupted (Ctrl-Ced). * In either case resume processing from whatever restart block number is stored in fileheader * the only exception is if "STARTBLK" was specified in the input in which case use that unconditionally. */ start_blk = (startblk_specified ? startblk_input : csd->reorg_upgrd_dwngrd_restart_block); } if (start_blk > stop_blk) start_blk = stop_blk; mu_reorg_upgrd_dwngrd_start_tn = csd->reorg_db_fmt_start_tn; /* Before releasing crit, flush the file-header and dirty buffers in cache to disk. This is because we are now * going to read each GDS block directly from disk to determine if it needs to be upgraded/downgraded or not. */ if (!wcs_flu(WCSFLU_FLUSH_HDR)) /* wcs_flu assumes gv_cur_region is set (which it is in this routine) */ { rel_crit(reg); gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(6) ERR_BUFFLUFAILED, 4, LEN_AND_LIT("MUPIP REORG UPGRADE/DOWNGRADE"), DB_LEN_STR(reg)); status = ERR_MUNOFINISH; continue; } rel_crit(reg); /* Loop through entire database one GDS block at a time and upgrade/downgrade each of them */ status1 = SS_NORMAL; start_bmp = ROUND_DOWN2(start_blk, BLKS_PER_LMAP); last_bmp = ROUND_DOWN2(stop_blk - 1, BLKS_PER_LMAP); curblk = start_blk; /* curblk is the block to be upgraded/downgraded */ util_out_print("Region !AD : Started processing from block number [0x!XL]", TRUE, REG_LEN_STR(reg), curblk); if (NULL != bptr) { /* malloc/free "bptr" for each region as GDS block-size can be different */ free(bptr); bptr = NULL; } memset(&reorg_stats, 0, SIZEOF(reorg_stats)); /* initialize statistics for this region */ for (curbmp = start_bmp; curbmp <= last_bmp; curbmp += BLKS_PER_LMAP) { if (mu_ctrly_occurred || mu_ctrlc_occurred) { status1 = ERR_MUNOFINISH; break; } /* -------------------------------------------------------------- * Read in current bitmap block * -------------------------------------------------------------- */ assert(!csa->now_crit); bml_sm_buff = t_qread(curbmp, (sm_int_ptr_t)&cycle, &cr); /* bring block into the cache outside of crit */ reorg_stats.blks_read_from_disk_bmp++; grab_crit_encr_cycle_sync(reg); /* needed so t_qread does not return NULL below */ if (mu_reorg_upgrd_dwngrd_start_tn != csd->desired_db_format_tn) { /* csd->desired_db_format changed since reorg started.
discontinue the reorg */ /* see later comment on "csd->reorg_upgrd_dwngrd_restart_block" for why the assignment * of this field should be done only if a db format change did not occur. */ rel_crit(reg); status1 = ERR_MUNOFINISH; /* This "start_tn" check is redone after the for-loop and an error message is printed there */ break; } else if (reorg_entiredb) { /* Change "csd->reorg_upgrd_dwngrd_restart_block" only if STARTBLK or STOPBLK was NOT specified */ assert(csd->reorg_upgrd_dwngrd_restart_block <= MAX(start_blk, curbmp)); csd->reorg_upgrd_dwngrd_restart_block = curbmp; /* previous blocks have been upgraded/downgraded */ } /* Check blks_to_upgrd counter to see if upgrade/downgrade is complete. * Repeat check done a few steps earlier outside of this for loop. */ total_blks = csd->trans_hist.total_blks; free_blks = csd->trans_hist.free_blocks; actual_blks2upgrd = csd->blks_to_upgrd; if ((upgrade && (0 == actual_blks2upgrd) && csd->fully_upgraded) || (!upgrade && ((total_blks - free_blks) == actual_blks2upgrd))) { rel_crit(reg); break; } bml_sm_buff = t_qread(curbmp, (sm_int_ptr_t)&cycle, &cr); /* now that in crit, note down stable buffer */ if (NULL == bml_sm_buff) rts_error_csa(CSA_ARG(csa) VARLSTCNT(1) ERR_DSEBLKRDFAIL); ondsk_blkver = cr->ondsk_blkver; /* note down db fmt on disk for bitmap block */ /* Take a copy of the shared memory bitmap buffer into process-private memory before releasing crit. * We are interested in those blocks that are currently marked as USED in the bitmap. * It is possible that once we release crit, concurrent updates change the bitmap state of those blocks. * In that case, those updates will take care of doing the upgrade/downgrade of those blocks in the * format currently set in csd->desired_db_format i.e. accomplishing MUPIP REORG UPGRADE/DOWNGRADE's job. * If the desired_db_format changes concurrently, we will stop doing REORG UPGRADE/DOWNGRADE processing. */ if (NULL == bml_lcl_buff) bml_lcl_buff = malloc(BM_SIZE(BLKS_PER_LMAP)); memcpy(bml_lcl_buff, (blk_hdr_ptr_t)bml_sm_buff, BM_SIZE(BLKS_PER_LMAP)); if (FALSE == cert_blk(reg, curbmp, (blk_hdr_ptr_t)bml_lcl_buff, 0, FALSE)) { /* certify the block while holding crit as cert_blk uses fields from file-header (shared memory) */ assert(FALSE); /* in pro, skip upgrading/downgrading all blks in this unreliable local bitmap */ rel_crit(reg); util_out_print("Region !AD : Bitmap Block [0x!XL] has integrity errors. Skipping this bitmap.", TRUE, REG_LEN_STR(reg), curbmp); status1 = ERR_MUNOFINISH; continue; } rel_crit(reg); /* ------------------------------------------------------------------------ * Upgrade/Downgrade all BUSY blocks in the current bitmap * ------------------------------------------------------------------------ */ curblk = (curbmp == start_bmp) ? start_blk : curbmp; mapsize = (curbmp == last_bmp) ?
(stop_blk - curbmp) : BLKS_PER_LMAP; assert(0 != mapsize); assert(mapsize <= BLKS_PER_LMAP); db_got_to_v5_once = csd->db_got_to_v5_once; for (lcnt = curblk - curbmp; lcnt < mapsize; lcnt++, curblk++) { if (mu_ctrly_occurred || mu_ctrlc_occurred) { status1 = ERR_MUNOFINISH; goto stop_reorg_on_this_reg; /* goto needed because of nested FOR Loop */ } GET_BM_STATUS(bml_lcl_buff, lcnt, bml_status); assert(BLK_MAPINVALID != bml_status); /* cert_blk ran clean so we dont expect invalid entries */ if (BLK_FREE == bml_status) { reorg_stats.blks_skipped_free++; continue; } /* MUPIP REORG UPGRADE/DOWNGRADE will convert USED & RECYCLED blocks */ if (db_got_to_v5_once || (BLK_RECYCLED != bml_status)) { /* Do NOT read recycled V4 block from disk unless it is guaranteed NOT to be too full */ if (lcnt) { /* non-bitmap block */ /* read in block from disk into private buffer. dont pollute the cache yet */ if (NULL == bptr) bptr = (sm_uc_ptr_t)malloc(blk_size); status1 = dsk_read(curblk, bptr, &ondsk_blkver, FALSE); /* dsk_read on curblk could return an error (DYNUPGRDFAIL) if curblk needs to be * upgraded and if its block size was too big to allow the extra block-header space * requirements for a dynamic upgrade. a MUPIP REORG DOWNGRADE should not error out * in that case as the block is already in the downgraded format. */ if (SS_NORMAL != status1) { if (!upgrade && (ERR_DYNUPGRDFAIL == status1)) { assert(GDSV4 == new_db_format); ondsk_blkver = new_db_format; } else { gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(5) ERR_DBFILERR, 2, DB_LEN_STR(reg), status1); util_out_print("Region !AD : Error occurred while reading block " "[0x!XL]", TRUE, REG_LEN_STR(reg), curblk); status1 = ERR_MUNOFINISH; goto stop_reorg_on_this_reg;/* goto needed due to nested FOR Loop */ } } reorg_stats.blks_read_from_disk_nonbmp++; } /* else bitmap block has been read in crit earlier and ondsk_blkver appropriately set */ if (new_db_format == ondsk_blkver) { assert((SS_NORMAL == status1) || (!upgrade && (ERR_DYNUPGRDFAIL == status1))); status1 = SS_NORMAL; /* treat DYNUPGRDFAIL as no error in case of downgrade */ reorg_stats.blks_skipped_newfmtindisk++; continue; /* current disk version is identical to what is desired */ } assert(SS_NORMAL == status1); } /* Begin non-TP transaction to upgrade/downgrade the block. * The way we do that is by updating the block using a null update array. * Any update to a block will trigger an automatic upgrade/downgrade of the block based on * the current fileheader desired_db_format setting and we use that here. */ t_begin(ERR_MUREORGFAIL, UPDTRNS_DB_UPDATED_MASK); for (; ;) { CHECK_AND_RESET_UPDATE_ARRAY; /* reset update_array_ptr to update_array */ curr_tn = csd->trans_hist.curr_tn; db_got_to_v5_once = csd->db_got_to_v5_once; if (db_got_to_v5_once || (BLK_RECYCLED != bml_status)) { blkhist->cse = NULL; /* start afresh (do not use value from previous retry) */ blkBase = t_qread(curblk, (sm_int_ptr_t)&blkhist->cycle, &blkhist->cr); if (NULL == blkBase) { t_retry((enum cdb_sc)rdfail_detail); continue; } blkhist->blk_num = curblk; blkhist->buffaddr = blkBase; ondsk_blkver = blkhist->cr->ondsk_blkver; new_hdr = *(blk_hdr_ptr_t)blkBase; mu_reorg_upgrd_dwngrd_blktn = new_hdr.tn; mark_blk_free = FALSE; inctn_opcode = upgrade ? inctn_blkupgrd : inctn_blkdwngrd; } else { mark_blk_free = TRUE; inctn_opcode = inctn_blkmarkfree; } inctn_detail.blknum_struct.blknum = curblk; /* t_end assumes that the history it is passed does not contain a bitmap block. 
* for bitmap block, the history validation information is passed through cse instead. * therefore we need to handle bitmap and non-bitmap cases separately. */ if (!lcnt) { /* Means a bitmap block. * At this point we can do a "new_db_format != ondsk_blkver" check to determine * if the block got converted since we did the dsk_read (see the non-bitmap case * for a similar check done there), but in that case we will have a transaction * which has read 1 bitmap block and is updating no block. "t_end" currently cannot * handle this case as it expects any bitmap block that needs validation to also * have a corresponding cse which will hold its history. Hence we avoid doing the * new_db_format check. The only disadvantage of this is that we will end up * modifying the bitmap block as part of this transaction (in an attempt to convert * its ondsk_blkver) even though it is already in the right format. Since this * overhead is going to be one per bitmap block and since the block is in the cache * at this point, we should not lose much. */ assert(!mark_blk_free); BLK_ADDR(blkid_ptr, SIZEOF(block_id), block_id); *blkid_ptr = 0; t_write_map(blkhist, (unsigned char *)blkid_ptr, curr_tn, 0); assert(&alt_hist.h[0] == blkhist); alt_hist.h[0].blk_num = 0; /* create empty history for bitmap block */ assert(update_trans); } else { /* non-bitmap block. fill in history for validation in t_end */ assert(curblk); /* we should never come here for block 0 (bitmap) */ if (!mark_blk_free) { assert(blkhist->blk_num == curblk); assert(blkhist->buffaddr == blkBase); blkhist->tn = curr_tn; alt_hist.h[1].blk_num = 0; } /* Also need to pass the bitmap as history to detect if any concurrent M-kill * is freeing up the same USED block that we are trying to convert OR if any * concurrent M-set is reusing the same RECYCLED block that we are trying to * convert. Because of t_end currently not being able to validate a bitmap * without that simultaneously having a cse, we need to create a cse for the * bitmap that is used only for bitmap history validation, but should not be * used to update the contents of the bitmap block in bg_update. */ bmlhist.buffaddr = t_qread(curbmp, (sm_int_ptr_t)&bmlhist.cycle, &bmlhist.cr); if (NULL == bmlhist.buffaddr) { t_retry((enum cdb_sc)rdfail_detail); continue; } bmlhist.blk_num = curbmp; bmlhist.tn = curr_tn; GET_BM_STATUS(bmlhist.buffaddr, lcnt, bml_status); if (BLK_MAPINVALID == bml_status) { t_retry(cdb_sc_lostbmlcr); continue; } if (!mark_blk_free) { if ((new_db_format != ondsk_blkver) && (BLK_FREE != bml_status)) { /* block still needs to be converted. create cse */ BLK_INIT(bs_ptr, bs1); BLK_SEG(bs_ptr, blkBase + SIZEOF(new_hdr), new_hdr.bsiz - SIZEOF(new_hdr)); BLK_FINI(bs_ptr, bs1); t_write(blkhist, (unsigned char *)bs1, 0, 0, ((blk_hdr_ptr_t)blkBase)->levl, FALSE, FALSE, GDS_WRITE_PLAIN); /* The directory tree status for now is only used to determine * whether writing the block to snapshot file (see t_end_sysops.c). * For reorg upgrade/downgrade process, the block is updated in a * sequential way without changing the gv_target. In this case, we * assume the block is in directory tree so as to have it written to * the snapshot file. 
*/ BIT_SET_DIR_TREE(cw_set[cw_set_depth-1].blk_prior_state); /* reset update_trans in case previous retry had set it to 0 */ update_trans = UPDTRNS_DB_UPDATED_MASK; if (BLK_RECYCLED == bml_status) { /* If the block that we are upgrading is RECYCLED, indicate to * bg_update that blks_to_upgrd counter should NOT be * touched in this case by setting "mode" to a special value */ assert(cw_set[cw_set_depth-1].mode == gds_t_write); cw_set[cw_set_depth-1].mode = gds_t_write_recycled; /* we SET block as NOT RECYCLED, otherwise, the mm_update() * or bg_update_phase2 may skip writing it to snapshot file * when its level is 0 */ BIT_CLEAR_RECYCLED(cw_set[cw_set_depth-1].blk_prior_state); } } else { /* Block got converted by another process since we did the dsk_read, * or this block became marked free in the bitmap. * No need to update this block. just call t_end for validation of * both the non-bitmap block as well as the bitmap block. * Note down that this transaction is no longer updating any blocks. */ update_trans = 0; } /* Need to put bit maps on the end of the cw set for concurrency checking. * We want to simulate t_write_map, except we want to update "cw_map_depth" * instead of "cw_set_depth". Hence the save and restore logic below. * This part of the code is similar to the one in mu_swap_blk.c */ save_cw_set_depth = cw_set_depth; assert(!cw_map_depth); t_write_map(&bmlhist, NULL, curr_tn, 0); /* will increment cw_set_depth */ cw_map_depth = cw_set_depth; /* set cw_map_depth to latest cw_set_depth */ cw_set_depth = save_cw_set_depth;/* restore cw_set_depth */ /* t_write_map simulation end */ } else { if (BLK_RECYCLED != bml_status) { /* Block was RECYCLED at beginning but no longer so. Retry */ t_retry(cdb_sc_bmlmod); continue; } /* Mark recycled block as FREE in bitmap */ assert(lcnt == (curblk - curbmp)); assert(update_array_ptr == update_array); *((block_id *)update_array_ptr) = lcnt; update_array_ptr += SIZEOF(block_id); /* the following assumes SIZEOF(block_id) == SIZEOF(int) */ assert(SIZEOF(block_id) == SIZEOF(int)); *(int *)update_array_ptr = 0; t_write_map(&bmlhist, (unsigned char *)update_array, curr_tn, 0); update_trans = UPDTRNS_DB_UPDATED_MASK; } } assert(SIZEOF(lcl_update_trans) == SIZEOF(update_trans)); lcl_update_trans = update_trans; /* take a copy before t_end modifies it */ if ((trans_num)0 != t_end(&alt_hist, NULL, TN_NOT_SPECIFIED)) { /* In case this is MM and t_end() remapped an extended database, reset csd */ assert(csd == cs_data); if (!lcl_update_trans) { assert(lcnt); assert(!mark_blk_free); assert((new_db_format == ondsk_blkver) || (BLK_BUSY != bml_status)); if (BLK_BUSY != bml_status) reorg_stats.blks_skipped_free++; else reorg_stats.blks_skipped_newfmtincache++; } else if (!lcnt) reorg_stats.blks_converted_bmp++; else reorg_stats.blks_converted_nonbmp++; break; } assert(csd == cs_data); } } } stop_reorg_on_this_reg: /* even though ctrl-c occurred, update file-header fields to store reorg's progress before exiting */ grab_crit(reg); blocks_left = 0; assert(csd->trans_hist.total_blks >= csd->blks_to_upgrd); actual_blks2upgrd = csd->blks_to_upgrd; total_blks = csd->trans_hist.total_blks; free_blks = csd->trans_hist.free_blocks; /* Care should be taken not to set "csd->reorg_upgrd_dwngrd_restart_block" in case of a concurrent db fmt * change. This is because let us say we are doing REORG UPGRADE.
A concurrent REORG DOWNGRADE would * have reset "csd->reorg_upgrd_dwngrd_restart_block" field to 0 and if that reorg is interrupted by a * Ctrl-C (before this reorg came here) it would have updated "csd->reorg_upgrd_dwngrd_restart_block" to * a non-zero value indicating how many blocks from 0 have been downgraded. We should not reset this * field to "curblk" as it will be mis-interpreted as the number of blocks that have been DOWNgraded. */ set_fully_upgraded = FALSE; if (mu_reorg_upgrd_dwngrd_start_tn != csd->desired_db_format_tn) { /* csd->desired_db_format changed since reorg started. discontinue the reorg */ util_out_print("Region !AD : Desired DB Format changed during REORG. Stopping REORG.", TRUE, REG_LEN_STR(reg)); status1 = ERR_MUNOFINISH; } else if (reorg_entiredb) { /* Change "csd->reorg_upgrd_dwngrd_restart_block" only if STARTBLK or STOPBLK was NOT specified */ assert(csd->reorg_upgrd_dwngrd_restart_block <= curblk); csd->reorg_upgrd_dwngrd_restart_block = curblk; /* blocks lesser than this have been upgraded/downgraded */ expected_blks2upgrd = upgrade ? 0 : (total_blks - free_blks); blocks_left = upgrade ? actual_blks2upgrd : (expected_blks2upgrd - actual_blks2upgrd); /* If this reorg command went through all blocks in the database, then it should have * correctly concluded at this point whether the reorg is complete or not. * If this reorg command started from where a previous incomplete reorg left * (i.e. first_reorg_in_this_db_fmt is FALSE), it cannot determine if the initial * GDS blocks that it skipped are completely {up,down}graded or not. */ assert((0 == blocks_left) || (SS_NORMAL != status1) || !first_reorg_in_this_db_fmt); /* If this is a MUPIP REORG UPGRADE that did go through every block in the database (indicated by * "reorg_entiredb" && "first_reorg_in_this_db_fmt") and the current count of "blks_to_upgrd" is * 0 in the file-header and the desired_db_format did not change since the start of the REORG, * we can be sure that the entire database has been upgraded. Set "csd->fully_upgraded" to TRUE. 
*/ if ((SS_NORMAL == status1) && first_reorg_in_this_db_fmt && upgrade && (0 == actual_blks2upgrd)) { csd->fully_upgraded = TRUE; csd->db_got_to_v5_once = TRUE; set_fully_upgraded = TRUE; } /* flush all changes noted down in the file-header */ if (!wcs_flu(WCSFLU_FLUSH_HDR)) /* wcs_flu assumes gv_cur_region is set (which it is in this routine) */ { gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(6) ERR_BUFFLUFAILED, 4, LEN_AND_LIT("MUPIP REORG UPGRADE/DOWNGRADE"), DB_LEN_STR(reg)); status = ERR_MUNOFINISH; rel_crit(reg); continue; } } curr_tn = csd->trans_hist.curr_tn; rel_crit(reg); util_out_print("Region !AD : Stopped processing at block number [0x!XL]", TRUE, REG_LEN_STR(reg), curblk); /* Print statistics */ util_out_print("Region !AD : Statistics : Blocks Read From Disk (Bitmap) : 0x!XL", TRUE, REG_LEN_STR(reg), reorg_stats.blks_read_from_disk_bmp); util_out_print("Region !AD : Statistics : Blocks Skipped (Free) : 0x!XL", TRUE, REG_LEN_STR(reg), reorg_stats.blks_skipped_free); util_out_print("Region !AD : Statistics : Blocks Read From Disk (Non-Bitmap) : 0x!XL", TRUE, REG_LEN_STR(reg), reorg_stats.blks_read_from_disk_nonbmp); util_out_print("Region !AD : Statistics : Blocks Skipped (new fmt in disk) : 0x!XL", TRUE, REG_LEN_STR(reg), reorg_stats.blks_skipped_newfmtindisk); util_out_print("Region !AD : Statistics : Blocks Skipped (new fmt in cache) : 0x!XL", TRUE, REG_LEN_STR(reg), reorg_stats.blks_skipped_newfmtincache); util_out_print("Region !AD : Statistics : Blocks Converted (Bitmap) : 0x!XL", TRUE, REG_LEN_STR(reg), reorg_stats.blks_converted_bmp); util_out_print("Region !AD : Statistics : Blocks Converted (Non-Bitmap) : 0x!XL", TRUE, REG_LEN_STR(reg), reorg_stats.blks_converted_nonbmp); if (reorg_entiredb && (SS_NORMAL == status1) && (0 != blocks_left)) { /* file-header counter does not match what reorg on the entire database expected to see */ gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(4) ERR_DBBTUWRNG, 2, expected_blks2upgrd, actual_blks2upgrd); util_out_print("Region !AD : Run MUPIP INTEG (without FAST qualifier) to fix the counter", TRUE, REG_LEN_STR(reg)); status1 = ERR_MUNOFINISH; } else util_out_print("Region !AD : Total Blocks = [0x!XL] : Free Blocks = [0x!XL] : " "Blocks to upgrade = [0x!XL]", TRUE, REG_LEN_STR(reg), total_blks, free_blks, actual_blks2upgrd); /* Issue success or failure message for this region */ if (SS_NORMAL == status1) { /* issue success only if REORG did not encounter any error in its processing */ if (set_fully_upgraded) util_out_print("Region !AD : Database is now FULLY UPGRADED", TRUE, REG_LEN_STR(reg)); util_out_print("Region !AD : MUPIP REORG !AD finished!/", TRUE, REG_LEN_STR(reg), LEN_AND_STR(command)); send_msg_csa(CSA_ARG(csa) VARLSTCNT(7) ERR_MUREUPDWNGRDEND, 5, REG_LEN_STR(reg), process_id, process_id, &curr_tn); } else { assert(ERR_MUNOFINISH == status1); assert((SS_NORMAL == status) || (ERR_MUNOFINISH == status)); util_out_print("Region !AD : MUPIP REORG !AD incomplete. See above messages.!/", TRUE, REG_LEN_STR(reg), LEN_AND_STR(command)); status = status1; } } if (NULL != bptr) free(bptr); if (NULL != bml_lcl_buff) free(bml_lcl_buff); if (mu_ctrly_occurred || mu_ctrlc_occurred) { gtm_putmsg_csa(CSA_ARG(NULL) VARLSTCNT(1) ERR_REORGCTRLY); status = ERR_MUNOFINISH; } mupip_exit(status); }
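/* Illustrative sketch: the per-bitmap iteration shape used above, with a hypothetical two-bit-per-block
 * status decoder standing in for GET_BM_STATUS. The 512-blocks-per-local-map figure and the status
 * encodings are assumptions for illustration only; the real macros and constants live in the GDS headers. */
#define DEMO_BLKS_PER_LMAP	512
#define DEMO_BM_STATUS(map, i)	(((map)[(i) / 4] >> (((i) % 4) * 2)) & 3)	/* 2 bits per block (assumed) */

static int demo_count_busy(const unsigned char *local_map, int blks_this_lmap)
{
	int	lcnt, busy = 0;

	for (lcnt = 0; lcnt < blks_this_lmap; lcnt++)
		if (0 == DEMO_BM_STATUS(local_map, lcnt))	/* assume 0 encodes BUSY */
			busy++;
	return busy;
}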
void trigger_delete_all(char *trigger_rec, uint4 len, uint4 *trig_stats) { int count; sgmnt_addrs *csa; mval curr_gbl_name; mval after_hash_cycle; int cycle; mval *mv_count_ptr; mval *mv_cycle_ptr; gd_region *reg, *reg_top; int4 result; gd_region *lgtrig_reg; mval trigger_cycle; mval trigger_count; boolean_t this_db_updated; uint4 triggers_deleted; mval trigjrec; boolean_t jnl_format_done; boolean_t delete_required; DCL_THREADGBL_ACCESS; SETUP_THREADGBL_ACCESS; assert(0 < dollar_tlevel); jnl_format_done = FALSE; lgtrig_reg = NULL; trigjrec.mvtype = MV_STR; trigjrec.str.len = len; trigjrec.str.addr = trigger_rec; triggers_deleted = 0; for (reg = gd_header->regions, reg_top = reg + gd_header->n_regions; reg < reg_top; reg++) { GVTR_SWITCH_REG_AND_HASHT_BIND_NAME(reg); csa = cs_addrs; if (NULL == csa) /* not BG or MM access method */ continue; /* gv_target now points to ^#t in region "reg" */ /* To write the LGTRIG logical jnl record, choose some region that has journaling enabled */ if (!reg->read_only && !jnl_format_done && JNL_WRITE_LOGICAL_RECS(csa)) lgtrig_reg = reg; if (!gv_target->root) continue; /* kill ^#t("#TNAME") */ BUILD_HASHT_SUB_CURRKEY(LITERAL_HASHTNAME, STRLEN(LITERAL_HASHTNAME)); if (0 != gvcst_data()) { /* Issue error if we dont have permissions to touch ^#t global */ if (reg->read_only) rts_error_csa(CSA_ARG(csa) VARLSTCNT(4) ERR_TRIGMODREGNOTRW, 2, REG_LEN_STR(reg)); gvcst_kill(TRUE); } /* Kill all descendants of ^#t(trigvn, ...) where trigvn is any global with a trigger, * but skip the ^#t("#...",...) entries. Setup ^#t("$") as the key for op_gvorder */ BUILD_HASHT_SUB_CURRKEY(LITERAL_MAXHASHVAL, STRLEN(LITERAL_MAXHASHVAL)); TREF(gv_last_subsc_null) = FALSE; /* We know its not null, but prior state is unreliable */ this_db_updated = FALSE; while (TRUE) { op_gvorder(&curr_gbl_name); /* quit:$length(curr_gbl_name)=0 */ if (0 == curr_gbl_name.str.len) break; count = cycle = 0; /* $get(^#t(curr_gbl_name,#COUNT)) */ BUILD_HASHT_SUB_SUB_CURRKEY(curr_gbl_name.str.addr, curr_gbl_name.str.len, LITERAL_HASHCOUNT, STRLEN(LITERAL_HASHCOUNT)); if (TRUE == (delete_required = gvcst_get(&trigger_count))) /* inline assignment */ { mv_count_ptr = &trigger_count; count = MV_FORCE_UINT(mv_count_ptr); } /* $get(^#t(curr_gbl_name,#CYCLE)) */ BUILD_HASHT_SUB_SUB_CURRKEY(curr_gbl_name.str.addr, curr_gbl_name.str.len, LITERAL_HASHCYCLE, STRLEN(LITERAL_HASHCYCLE)); if (!gvcst_get(&trigger_cycle)) { /* A hasht record exists for this global, so #CYCLE should also be present; it is missing, so issue a TRIGDEFBAD warning */ if (CDB_STAGNATE > t_tries) t_retry(cdb_sc_triggermod); gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(12) MAKE_MSG_WARNING(ERR_TRIGDEFBAD), 6, curr_gbl_name.str.len, curr_gbl_name.str.addr, curr_gbl_name.str.len, curr_gbl_name.str.addr, LEN_AND_LIT("\"#CYCLE\""), ERR_TEXT, 2, RTS_ERROR_TEXT("#CYCLE field is missing")); send_msg_csa(CSA_ARG(csa) VARLSTCNT(12) MAKE_MSG_WARNING(ERR_TRIGDEFBAD), 6, curr_gbl_name.str.len, curr_gbl_name.str.addr, curr_gbl_name.str.len, curr_gbl_name.str.addr, LEN_AND_LIT("\"#CYCLE\""), ERR_TEXT, 2, RTS_ERROR_TEXT("#CYCLE field is missing")); assert(WBTEST_HELPOUT_TRIGDEFBAD == gtm_white_box_test_case_number); } else { mv_cycle_ptr = &trigger_cycle; cycle = MV_FORCE_UINT(mv_cycle_ptr); cycle++; MV_FORCE_MVAL(&trigger_cycle, cycle); } if (!delete_required) { /* $order(^#t(curr_gbl_name,#LABEL)); should be the NULL string if #COUNT not found * Use #LABEL vs #CYCLE because MUPIP TRIGGER -UPGRADE unconditionally inserts it */ BUILD_HASHT_SUB_SUB_CURRKEY(curr_gbl_name.str.addr, curr_gbl_name.str.len, LITERAL_HASHLABEL,
STRLEN(LITERAL_HASHLABEL)); op_gvorder(&after_hash_cycle); /* quit:$length(after_hash_cycle)=0 */ if (0 != after_hash_cycle.str.len) { /* Found hasht record after #LABEL, but #COUNT is not defined; Force removal */ if (CDB_STAGNATE > t_tries) t_retry(cdb_sc_triggermod); gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(12) MAKE_MSG_WARNING(ERR_TRIGDEFBAD), 6, curr_gbl_name.str.len, curr_gbl_name.str.addr, curr_gbl_name.str.len, curr_gbl_name.str.addr, LEN_AND_LIT("\"#COUNT\""), ERR_TEXT, 2, RTS_ERROR_TEXT("#COUNT field is missing. Skipped in results")); send_msg_csa(CSA_ARG(csa) VARLSTCNT(12) MAKE_MSG_WARNING(ERR_TRIGDEFBAD), 6, curr_gbl_name.str.len, curr_gbl_name.str.addr, curr_gbl_name.str.len, curr_gbl_name.str.addr, LEN_AND_LIT("\"#COUNT\""), ERR_TEXT, 2, RTS_ERROR_TEXT("#COUNT field is missing. Skipped in results")); assert(WBTEST_HELPOUT_TRIGDEFBAD == gtm_white_box_test_case_number); delete_required = TRUE; } } if (delete_required) { /* Now that we know there is something to kill, check if we have permissions to touch ^#t global */ if (reg->read_only) rts_error_csa(CSA_ARG(csa) VARLSTCNT(4) ERR_TRIGMODREGNOTRW, 2, REG_LEN_STR(reg)); if (!jnl_format_done && JNL_WRITE_LOGICAL_RECS(csa)) { jnl_format(JNL_LGTRIG, NULL, &trigjrec, 0); jnl_format_done = TRUE; } /* kill ^#t(curr_gbl_name); kills ^#t(curr_gbl_name,"#TRHASH") as well */ BUILD_HASHT_SUB_CURRKEY(curr_gbl_name.str.addr, curr_gbl_name.str.len); gvcst_kill(TRUE); if (0 < cycle) { /* set ^#t(curr_gbl_name,#CYCLE)=trigger_cycle */ SET_TRIGGER_GLOBAL_SUB_SUB_MVAL(curr_gbl_name.str.addr, curr_gbl_name.str.len, LITERAL_HASHCYCLE, STRLEN(LITERAL_HASHCYCLE), trigger_cycle, result); assert(PUT_SUCCESS == result); } this_db_updated = TRUE; triggers_deleted += count; } /* else there is nothing under the hasht record, leave #CYCLE alone */ /* get ready for op_gvorder() call for next trigger under ^#t */ BUILD_HASHT_SUB_CURRKEY(curr_gbl_name.str.addr, curr_gbl_name.str.len); } if (this_db_updated) { csa->incr_db_trigger_cycle = TRUE; if (dollar_ztrigger_invoked) { /* increment db_dztrigger_cycle so that next gvcst_put/gvcst_kill in this transaction, * on this region, will re-read. See trigger_update.c for a comment on why it is okay * for db_dztrigger_cycle to be incremented more than once in the same transaction */ csa->db_dztrigger_cycle++; } } } if (!jnl_format_done && (NULL != lgtrig_reg)) { /* There was no journaled region that had a ^#t update, but found at least one journaled region * so write a LGTRIG logical jnl record there. */ GVTR_SWITCH_REG_AND_HASHT_BIND_NAME(lgtrig_reg); csa = cs_addrs; JNLPOOL_INIT_IF_NEEDED(csa, csa->hdr, csa->nl); /* see previous usage for comment on why it is needed */ assert(dollar_tlevel); T_BEGIN_SETORKILL_NONTP_OR_TP(ERR_TRIGLOADFAIL); /* needed to set update_trans TRUE on this region * even if NO db updates happen to ^#t nodes. */ jnl_format(JNL_LGTRIG, NULL, &trigjrec, 0); jnl_format_done = TRUE; } if (triggers_deleted) { util_out_print_gtmio("All existing triggers (count = !UL) deleted", FLUSH, triggers_deleted); trig_stats[STATS_DELETED] += triggers_deleted; trig_stats[STATS_NOERROR_TRIGFILE]++; } else { util_out_print_gtmio("No matching triggers found for deletion", FLUSH); trig_stats[STATS_UNCHANGED_TRIGFILE]++; } }
int gtmsource() { int status, log_init_status, waitpid_res, save_errno; char print_msg[1024], tmpmsg[1024]; gd_region *reg, *region_top; sgmnt_addrs *csa, *repl_csa; boolean_t all_files_open, isalive; pid_t pid, ppid, procgp; seq_num read_jnl_seqno, jnl_seqno; unix_db_info *udi; gtmsource_local_ptr_t gtmsource_local; boolean_t this_side_std_null_coll; int null_fd, rc; memset((uchar_ptr_t)&jnlpool, 0, SIZEOF(jnlpool_addrs)); call_on_signal = gtmsource_sigstop; ESTABLISH_RET(gtmsource_ch, SS_NORMAL); if (-1 == gtmsource_get_opt()) rts_error_csa(CSA_ARG(NULL) VARLSTCNT(1) ERR_MUPCLIERR); if (gtmsource_options.shut_down) { /* Wait till shutdown time nears even before going to "jnlpool_init". This is because the latter will return * with the ftok semaphore and access semaphore held and we do not want to be holding those locks (while * waiting for the user specified timeout to expire) as that will affect new GTM processes and/or other * MUPIP REPLIC commands that need these locks for their function. */ if (0 < gtmsource_options.shutdown_time) { repl_log(stdout, TRUE, TRUE, "Waiting for %d seconds before signalling shutdown\n", gtmsource_options.shutdown_time); LONG_SLEEP(gtmsource_options.shutdown_time); } else repl_log(stdout, TRUE, TRUE, "Signalling shutdown immediate\n"); } else if (gtmsource_options.start) { repl_log(stdout, TRUE, TRUE, "Initiating START of source server for secondary instance [%s]\n", gtmsource_options.secondary_instname); } if (gtmsource_options.activate && (ROOTPRIMARY_SPECIFIED == gtmsource_options.rootprimary)) { /* MUPIP REPLIC -SOURCE -ACTIVATE -UPDOK has been specified. We need to open the gld and db regions now * in case this is a secondary -> primary transition. This is so we can later switch journal files in all * journaled regions when the transition actually happens inside "gtmsource_rootprimary_init". But since * we have not yet done a "jnlpool_init", we dont know if updates are disabled in it or not. Although we * need to do the gld/db open only if updates are currently disabled in the jnlpool, we do this always * because once we do a jnlpool_init, we will come back with the ftok on the jnlpool held and that has * issues with later db open since we will try to hold the db ftok as part of db open and the ftok logic * currently has assumptions that a process holds only one ftok at any point in time. */ assert(NULL == gd_header); gvinit(); all_files_open = region_init(FALSE); if (!all_files_open) { gtm_putmsg_csa(CSA_ARG(NULL) VARLSTCNT(1) ERR_NOTALLDBOPN); gtmsource_exit(ABNORMAL_SHUTDOWN); } } jnlpool_init(GTMSOURCE, gtmsource_options.start, &is_jnlpool_creator); /* is_jnlpool_creator == TRUE ==> this process created the journal pool * is_jnlpool_creator == FALSE ==> journal pool already existed and this process simply attached to it. 
*/ if (gtmsource_options.shut_down) gtmsource_exit(gtmsource_shutdown(FALSE, NORMAL_SHUTDOWN) - NORMAL_SHUTDOWN); else if (gtmsource_options.activate) gtmsource_exit(gtmsource_mode_change(GTMSOURCE_MODE_ACTIVE_REQUESTED) - NORMAL_SHUTDOWN); else if (gtmsource_options.deactivate) gtmsource_exit(gtmsource_mode_change(GTMSOURCE_MODE_PASSIVE_REQUESTED) - NORMAL_SHUTDOWN); else if (gtmsource_options.checkhealth) gtmsource_exit(gtmsource_checkhealth() - NORMAL_SHUTDOWN); else if (gtmsource_options.changelog) gtmsource_exit(gtmsource_changelog() - NORMAL_SHUTDOWN); else if (gtmsource_options.showbacklog) gtmsource_exit(gtmsource_showbacklog() - NORMAL_SHUTDOWN); else if (gtmsource_options.stopsourcefilter) gtmsource_exit(gtmsource_stopfilter() - NORMAL_SHUTDOWN); else if (gtmsource_options.jnlpool) gtmsource_exit(gtmsource_jnlpool() - NORMAL_SHUTDOWN); else if (gtmsource_options.losttncomplete) gtmsource_exit(gtmsource_losttncomplete() - NORMAL_SHUTDOWN); else if (gtmsource_options.needrestart) gtmsource_exit(gtmsource_needrestart() - NORMAL_SHUTDOWN); else if (gtmsource_options.showfreeze) gtmsource_exit(gtmsource_showfreeze() - NORMAL_SHUTDOWN); else if (gtmsource_options.setfreeze) gtmsource_exit(gtmsource_setfreeze() - NORMAL_SHUTDOWN); else if (!gtmsource_options.start) { assert(CLI_PRESENT == cli_present("STATSLOG")); gtmsource_exit(gtmsource_statslog() - NORMAL_SHUTDOWN); } assert(gtmsource_options.start); # ifndef REPL_DEBUG_NOBACKGROUND /* Set "child_server_running" to FALSE before forking off child. Wait for it to be set to TRUE by the child. */ gtmsource_local = jnlpool.gtmsource_local; gtmsource_local->child_server_running = FALSE; FORK(pid); if (0 > pid) { save_errno = errno; rts_error_csa(CSA_ARG(NULL) VARLSTCNT(7) ERR_JNLPOOLSETUP, 0, ERR_TEXT, 2, RTS_ERROR_LITERAL("Could not fork source server"), save_errno); } else if (0 < pid) { /* Parent. Wait until child sets "child_server_running" to TRUE. That is an indication that the child * source server has completed its initialization phase and is all set so the parent command can return. */ while (isalive = is_proc_alive(pid, 0)) /* note : intended assignment */ { if (gtmsource_local->child_server_running) break; /* To take care of reassignment of PIDs, the while condition should be && with the condition * (PPID of pid == process_id) */ SHORT_SLEEP(GTMSOURCE_WAIT_FOR_SRV_START); WAITPID(pid, &status, WNOHANG, waitpid_res); /* Release defunct child if dead */ } if (isalive) { /* Child process is alive and started with no issues */ if (0 != (save_errno = rel_sem(SOURCE, JNL_POOL_ACCESS_SEM))) rts_error_csa(CSA_ARG(NULL) VARLSTCNT(7) ERR_JNLPOOLSETUP, 0, ERR_TEXT, 2, RTS_ERROR_LITERAL("Error in rel_sem"), save_errno); ftok_sem_release(jnlpool.jnlpool_dummy_reg, TRUE, TRUE); } else { /* Child source server process errored out at startup and is no longer alive. * If we were the one who created the journal pool, let us clean it up. */ repl_log(stdout, TRUE, TRUE, "Source server startup failed. See source server log file\n"); if (is_jnlpool_creator) status = gtmsource_shutdown(TRUE, NORMAL_SHUTDOWN); } /* If the parent is killed (or crashes) between the fork and exit, checkhealth may not detect that startup * is in progress - parent forks and dies, the system will release sem 0 and 1, checkhealth might test the * value of sem 1 before the child grabs sem 1. */ gtmsource_exit(isalive ?
SRV_ALIVE : SRV_ERR); } /* Point stdin to /dev/null */ OPENFILE("/dev/null", O_RDONLY, null_fd); if (0 > null_fd) rts_error_csa(CSA_ARG(NULL) ERR_REPLERR, RTS_ERROR_LITERAL("Failed to open /dev/null for read"), errno, 0); FCNTL3(null_fd, F_DUPFD, 0, rc); if (0 > rc) rts_error_csa(CSA_ARG(NULL) ERR_REPLERR, RTS_ERROR_LITERAL("Failed to set stdin to /dev/null"), errno, 0); CLOSEFILE(null_fd, rc); if (0 > rc) rts_error_csa(CSA_ARG(NULL) ERR_REPLERR, RTS_ERROR_LITERAL("Failed to close /dev/null"), errno, 0); /* The parent process (source server startup command) will be holding the ftok semaphore and jnlpool access semaphore * at this point. The variables that indicate this would have been copied over to the child during the fork. This will * make the child think it is actually holding them as well when actually it is not. Reset those variables in the child * to ensure they do not misrepresent the holder of those semaphores. */ ftok_sem_reg = NULL; udi = FILE_INFO(jnlpool.jnlpool_dummy_reg); assert(udi->grabbed_ftok_sem); udi->grabbed_ftok_sem = FALSE; assert(holds_sem[SOURCE][JNL_POOL_ACCESS_SEM]); holds_sem[SOURCE][JNL_POOL_ACCESS_SEM] = FALSE; assert(!holds_sem[SOURCE][SRC_SERV_COUNT_SEM]); /* Start child source server initialization */ is_src_server = TRUE; OPERATOR_LOG_MSG; process_id = getpid(); /* Reinvoke secshr related initialization with the child's pid */ INVOKE_INIT_SECSHR_ADDRS; /* Initialize mutex socket, memory semaphore etc. before any "grab_lock" is done by this process on the journal pool. * Note that the initialization would already have been done by the parent source server startup command but we need to * redo the initialization with the child process id. */ assert(mutex_per_process_init_pid && (mutex_per_process_init_pid != process_id)); mutex_per_process_init(); START_HEARTBEAT_IF_NEEDED; ppid = getppid(); log_init_status = repl_log_init(REPL_GENERAL_LOG, &gtmsource_log_fd, gtmsource_options.log_file); assert(SS_NORMAL == log_init_status); repl_log_fd2fp(&gtmsource_log_fp, gtmsource_log_fd); if (-1 == (procgp = setsid())) send_msg_csa(CSA_ARG(NULL) VARLSTCNT(7) ERR_JNLPOOLSETUP, 0, ERR_TEXT, 2, RTS_ERROR_LITERAL("Source server error in setsid"), errno); # endif /* REPL_DEBUG_NOBACKGROUND */ if (ZLIB_CMPLVL_NONE != gtm_zlib_cmp_level) gtm_zlib_init(); /* Open zlib shared library for compression/decompression */ REPL_DPRINT1("Setting up regions\n"); gvinit(); /* We use the same code dse uses to open all regions but we must make sure they are all open before proceeding.
*/ all_files_open = region_init(FALSE); if (!all_files_open) { gtm_putmsg_csa(CSA_ARG(NULL) VARLSTCNT(1) ERR_NOTALLDBOPN); gtmsource_exit(ABNORMAL_SHUTDOWN); } /* Determine primary side null subscripts collation order */ /* Also check whether all regions have same null collation order */ this_side_std_null_coll = -1; for (reg = gd_header->regions, region_top = gd_header->regions + gd_header->n_regions; reg < region_top; reg++) { csa = &FILE_INFO(reg)->s_addrs; if (this_side_std_null_coll != csa->hdr->std_null_coll) { if (-1 == this_side_std_null_coll) this_side_std_null_coll = csa->hdr->std_null_coll; else { gtm_putmsg_csa(CSA_ARG(NULL) VARLSTCNT(1) ERR_NULLCOLLDIFF); gtmsource_exit(ABNORMAL_SHUTDOWN); } } if (!REPL_ALLOWED(csa) && JNL_ALLOWED(csa)) { gtm_putmsg_csa(CSA_ARG(NULL) VARLSTCNT(4) ERR_REPLOFFJNLON, 2, DB_LEN_STR(reg)); gtmsource_exit(ABNORMAL_SHUTDOWN); } if (reg->read_only && REPL_ALLOWED(csa)) { gtm_putmsg_csa(CSA_ARG(NULL) VARLSTCNT(6) ERR_JNLPOOLSETUP, 0, ERR_TEXT, 2, RTS_ERROR_LITERAL("Source Server does not have write permissions to one or " "more database files that are replicated")); gtmsource_exit(ABNORMAL_SHUTDOWN); } } /* Initialize source server alive/dead state related fields in "gtmsource_local" before the ftok semaphore is released */ gtmsource_local->gtmsource_pid = process_id; gtmsource_local->gtmsource_state = GTMSOURCE_START; if (is_jnlpool_creator) { DEBUG_ONLY(jnlpool.jnlpool_ctl->jnlpool_creator_pid = process_id); gtmsource_seqno_init(this_side_std_null_coll); if (ROOTPRIMARY_SPECIFIED == gtmsource_options.rootprimary) { /* Created the journal pool as a root primary. Append a history record to the replication instance file. * Invoke the function "gtmsource_rootprimary_init" to do that. */ gtmsource_rootprimary_init(jnlpool.jnlpool_ctl->jnl_seqno); } } /* after this point we can no longer have the case where all the regions are unreplicated/non-journaled. */ # ifndef REPL_DEBUG_NOBACKGROUND /* It is necessary for every process that is using the ftok semaphore to increment the counter by 1. This is used * by the last process that shuts down to delete the ftok semaphore when it notices the counter to be 0. * Note that the parent source server startup command would have done an increment of the ftok counter semaphore * for the replication instance file. But the source server process (the child) that comes here would not have done * that. Do that while the parent is still holding on to the ftok semaphore waiting for our okay. 
*/ if (!ftok_sem_incrcnt(jnlpool.jnlpool_dummy_reg)) rts_error_csa(CSA_ARG(NULL) VARLSTCNT(1) ERR_JNLPOOLSETUP); /* Increment the source server count semaphore */ status = incr_sem(SOURCE, SRC_SERV_COUNT_SEM); if (0 != status) { save_errno = errno; rts_error_csa(CSA_ARG(NULL) VARLSTCNT(7) ERR_JNLPOOLSETUP, 0, ERR_TEXT, 2, RTS_ERROR_LITERAL("Counter semaphore increment failure in child source server"), save_errno); } # else if (0 != (save_errno = rel_sem_immediate(SOURCE, JNL_POOL_ACCESS_SEM))) { rts_error_csa(CSA_ARG(NULL) VARLSTCNT(7) ERR_JNLPOOLSETUP, 0, ERR_TEXT, 2, RTS_ERROR_LITERAL("Error in rel_sem_immediate"), save_errno); } # endif /* REPL_DEBUG_NOBACKGROUND */ gtmsource_srv_count++; gtmsource_local->child_server_running = TRUE; /* At this point, the parent startup command will stop waiting for child */ gtm_event_log_init(); /* Log source server startup command line first */ SPRINTF(tmpmsg, "%s %s\n", cli_lex_in_ptr->argv[0], cli_lex_in_ptr->in_str); repl_log(gtmsource_log_fp, TRUE, TRUE, tmpmsg); SPRINTF(tmpmsg, "GTM Replication Source Server with Pid [%d] started for Secondary Instance [%s]", process_id, gtmsource_local->secondary_instname); sgtm_putmsg(print_msg, VARLSTCNT(4) ERR_REPLINFO, 2, LEN_AND_STR(tmpmsg)); repl_log(gtmsource_log_fp, TRUE, TRUE, print_msg); if (is_jnlpool_creator) { repl_log(gtmsource_log_fp, TRUE, TRUE, "Created jnlpool with shmid = [%d] and semid = [%d]\n", jnlpool.repl_inst_filehdr->jnlpool_shmid, jnlpool.repl_inst_filehdr->jnlpool_semid); } else repl_log(gtmsource_log_fp, TRUE, TRUE, "Attached to existing jnlpool with shmid = [%d] and semid = [%d]\n", jnlpool.repl_inst_filehdr->jnlpool_shmid, jnlpool.repl_inst_filehdr->jnlpool_semid); gtm_event_log(GTM_EVENT_LOG_ARGC, "MUPIP", "REPLINFO", print_msg); # ifdef GTM_TLS if (REPL_TLS_REQUESTED) { repl_do_tls_init(gtmsource_log_fp); assert(REPL_TLS_REQUESTED || PLAINTEXT_FALLBACK); } # endif if (jnlpool.jnlpool_ctl->freeze) { last_seen_freeze_flag = jnlpool.jnlpool_ctl->freeze; sgtm_putmsg(print_msg, VARLSTCNT(3) ERR_REPLINSTFROZEN, 1, jnlpool.repl_inst_filehdr->inst_info.this_instname); repl_log(gtmsource_log_fp, TRUE, FALSE, print_msg); sgtm_putmsg(print_msg, VARLSTCNT(3) ERR_REPLINSTFREEZECOMMENT, 1, jnlpool.jnlpool_ctl->freeze_comment); repl_log(gtmsource_log_fp, TRUE, TRUE, print_msg); } gtmsource_local->jnlfileonly = gtmsource_options.jnlfileonly; do { /* If mode is passive, go to sleep. Wakeup every now and then and check to see if I have to become active. 
*/ gtmsource_state = gtmsource_local->gtmsource_state = GTMSOURCE_START; if ((gtmsource_local->mode == GTMSOURCE_MODE_PASSIVE) && (gtmsource_local->shutdown == NO_SHUTDOWN)) { gtmsource_poll_actions(FALSE); SHORT_SLEEP(GTMSOURCE_WAIT_FOR_MODE_CHANGE); continue; } if (GTMSOURCE_MODE_PASSIVE == gtmsource_local->mode) { /* Shutdown initiated */ assert(gtmsource_local->shutdown == SHUTDOWN); sgtm_putmsg(print_msg, VARLSTCNT(4) ERR_REPLINFO, 2, RTS_ERROR_LITERAL("GTM Replication Source Server Shutdown signalled")); repl_log(gtmsource_log_fp, TRUE, TRUE, print_msg); gtm_event_log(GTM_EVENT_LOG_ARGC, "MUPIP", "REPLINFO", print_msg); break; } gtmsource_poll_actions(FALSE); if (GTMSOURCE_CHANGING_MODE == gtmsource_state) continue; if (GTMSOURCE_MODE_ACTIVE_REQUESTED == gtmsource_local->mode) gtmsource_local->mode = GTMSOURCE_MODE_ACTIVE; SPRINTF(tmpmsg, "GTM Replication Source Server now in ACTIVE mode using port %d", gtmsource_local->secondary_port); sgtm_putmsg(print_msg, VARLSTCNT(4) ERR_REPLINFO, 2, LEN_AND_STR(tmpmsg)); repl_log(gtmsource_log_fp, TRUE, TRUE, print_msg); gtm_event_log(GTM_EVENT_LOG_ARGC, "MUPIP", "REPLINFO", print_msg); DEBUG_ONLY(repl_csa = &FILE_INFO(jnlpool.jnlpool_dummy_reg)->s_addrs;) assert(!repl_csa->hold_onto_crit); /* so it is ok to invoke "grab_lock" and "rel_lock" unconditionally */ grab_lock(jnlpool.jnlpool_dummy_reg, TRUE, HANDLE_CONCUR_ONLINE_ROLLBACK); if (GTMSOURCE_HANDLE_ONLN_RLBK == gtmsource_state) { repl_log(gtmsource_log_fp, TRUE, TRUE, "Starting afresh due to ONLINE ROLLBACK\n"); repl_log(gtmsource_log_fp, TRUE, TRUE, "REPL INFO - Current Jnlpool Seqno : %llu\n", jnlpool.jnlpool_ctl->jnl_seqno); continue; } QWASSIGN(gtmsource_local->read_addr, jnlpool.jnlpool_ctl->write_addr); gtmsource_local->read = jnlpool.jnlpool_ctl->write; gtmsource_local->read_state = gtmsource_local->jnlfileonly ? READ_FILE : READ_POOL; read_jnl_seqno = gtmsource_local->read_jnl_seqno; assert(read_jnl_seqno <= jnlpool.jnlpool_ctl->jnl_seqno); if (read_jnl_seqno < jnlpool.jnlpool_ctl->jnl_seqno) { gtmsource_local->read_state = READ_FILE; QWASSIGN(gtmsource_save_read_jnl_seqno, jnlpool.jnlpool_ctl->jnl_seqno); gtmsource_pool2file_transition = TRUE; /* so that we read the latest gener jnl files */ } rel_lock(jnlpool.jnlpool_dummy_reg); if (SS_NORMAL != (status = gtmsource_alloc_tcombuff())) rts_error_csa(CSA_ARG(NULL) VARLSTCNT(7) ERR_REPLCOMM, 0, ERR_TEXT, 2, RTS_ERROR_LITERAL("Error allocating initial tcom buffer space. Malloc error"), status); gtmsource_filter = NO_FILTER; if ('\0' != gtmsource_local->filter_cmd[0]) { if (SS_NORMAL == (status = repl_filter_init(gtmsource_local->filter_cmd))) gtmsource_filter |= EXTERNAL_FILTER; else gtmsource_exit(ABNORMAL_SHUTDOWN); } gtmsource_process(); /* gtmsource_process returns only when mode needs to be changed to PASSIVE */ assert(gtmsource_state == GTMSOURCE_CHANGING_MODE); gtmsource_ctl_close(); gtmsource_free_msgbuff(); gtmsource_free_tcombuff(); gtmsource_free_filter_buff(); gtmsource_stop_heartbeat(); if (FD_INVALID != gtmsource_sock_fd) repl_close(&gtmsource_sock_fd); if (gtmsource_filter & EXTERNAL_FILTER) repl_stop_filter(); } while (TRUE);
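/* Illustrative sketch (not part of the GT.M sources): the startup sequence above forks a child server, has the
 * parent poll a shared "child_server_running" flag until the child finishes initializing, and has the child
 * detach with setsid() and point stdin at /dev/null. The fragment below shows that handshake in miniature using
 * a flag in an anonymous MAP_SHARED mapping; the helper name start_server_sketch() and the polling limits are
 * invented for illustration only.
 */
#include <fcntl.h>
#include <sys/mman.h>
#include <sys/wait.h>
#include <unistd.h>

static int start_server_sketch(void)
{
	void		*map;
	volatile int	*running;
	pid_t		pid;
	int		null_fd, tries;

	/* Flag shared between parent and child; the child sets it once its initialization is complete */
	map = mmap(NULL, sizeof(int), PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0);
	if (MAP_FAILED == map)
		return -1;
	running = (volatile int *)map;
	*running = 0;
	pid = fork();
	if (0 > pid)
		return -1;
	if (0 == pid)
	{	/* Child: become a session leader, detach stdin, then tell the parent we are up */
		setsid();
		null_fd = open("/dev/null", O_RDONLY);
		if (0 <= null_fd)
		{
			dup2(null_fd, STDIN_FILENO);
			close(null_fd);
		}
		*running = 1;
		pause();			/* stand-in for the real server loop */
		_exit(0);
	}
	/* Parent: poll until the child reports readiness, reaping it if it dies during startup */
	for (tries = 0; (100 > tries) && !*running; tries++)
	{
		if (waitpid(pid, NULL, WNOHANG) == pid)
			return -1;		/* child exited before completing initialization */
		usleep(100000);			/* short sleep between polls, like SHORT_SLEEP above */
	}
	return *running ? 0 : -1;
}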
/* 1. Allocate a basic initial condition handler stack that can be expanded later if necessary. * 2. On Linux, make sure bits 0,1, 4, 5, and 6 are set in /proc/PID/coredump_filter so dumps include the sections that GT.M * cores need to have in them. */ void err_init(void (*x)()) { chnd = (condition_handler *)malloc((CONDSTK_INITIAL_INCR + CONDSTK_RESERVE) * SIZEOF(condition_handler)); chnd[0].ch_active = FALSE; chnd[0].save_active_ch = NULL; active_ch = ctxt = &chnd[0]; ctxt->ch = x; chnd_end = &chnd[CONDSTK_INITIAL_INCR]; /* chnd_end is the end of the condition handler stack */ chnd_incr = CONDSTK_INITIAL_INCR * 2; # if defined(__linux__) || defined(__NetBSD__) /* Read the coredump_filter value from /proc for this process, update the value if necessary so we have the proper * flags set to get the info we (and gtmpcat) need to properly process a core file. Note any errors we encounter just * send a message to the operator log and return as nothing here should prevent GT.M from running. * * Note "man 5 core" on x86-64 Linux (Ubuntu 12.04) notes that the /proc/PID/coredump_filter file is only provided when * the Linux kernel is built with the CONFIG_ELF_CORE configuration option. This *seems* to control whether or not the * kernel supports the ELF loader. To date, all Linux flavors GT.M supports use ELF so we regard this as largely * mandatory though in the future it may happen that GT.M works yet runs with something other than ELF. In that case, * we'd need to change the below to avoid the operator log messages every time GT.M initializes. */ { int rc; unsigned int filterbits; char procfn[SIZEOF(COREDUMPFILTERFN) + MAX_DIGITS_IN_INT]; /* File name of file to update */ char filter[FILTERPARMSIZE], *filterend; /* Value read in & written out */ char *rcc; FILE *filterstrm; /* filter stream file block */ /* Note use simple basic methods since this early in initialization not everything is necessarily setup to * be able to properly use the *print*() wrapper functions. */ rc = snprintf(procfn, SIZEOF(procfn), COREDUMPFILTERFN, getpid()); if (0 > rc) { send_msg_csa(CSA_ARG(NULL) VARLSTCNT(8) ERR_SYSCALL, 5, RTS_ERROR_LITERAL("snprintf()"), CALLFROM, rc); return; } filterstrm = fopen(procfn, "r"); if (NULL == filterstrm) { rc = errno; send_msg_csa(CSA_ARG(NULL) VARLSTCNT(8) ERR_SYSCALL, 5, RTS_ERROR_LITERAL("fopen()"), CALLFROM, rc); return; } rcc = fgets(filter, SIZEOF(filter), filterstrm); if (NULL == rcc) { rc = errno; send_msg_csa(CSA_ARG(NULL) VARLSTCNT(8) ERR_SYSCALL, 5, RTS_ERROR_LITERAL("fgets()"), CALLFROM, rc); return; } rc = fclose(filterstrm); if (0 > rc) { send_msg_csa(CSA_ARG(NULL) VARLSTCNT(8) ERR_SYSCALL, 5, RTS_ERROR_LITERAL("fclose()"), CALLFROM, rc); return; } filterend = filter + SIZEOF(filter); filterbits = (unsigned int)strtol(filter, &filterend, 16); if (FILTERENABLEBITS != (filterbits & FILTERENABLEBITS)) { /* At least one flag was missing - reset them */ filterbits = filterbits | FILTERENABLEBITS; filterstrm = fopen(procfn, "w"); if (NULL == filterstrm) { rc = errno; send_msg_csa(CSA_ARG(NULL) VARLSTCNT(8) ERR_SYSCALL, 5, RTS_ERROR_LITERAL("fopen()"), CALLFROM, rc); return; } rc = fprintf(filterstrm, "0x%08x", filterbits); if (0 > rc) { send_msg_csa(CSA_ARG(NULL) VARLSTCNT(8) ERR_SYSCALL, 5, RTS_ERROR_LITERAL("fprintf()"), CALLFROM, rc); return; } rc = fclose(filterstrm); if (0 > rc) { send_msg_csa(CSA_ARG(NULL) VARLSTCNT(8) ERR_SYSCALL, 5, RTS_ERROR_LITERAL("fclose()"), CALLFROM, rc); return; } } } # endif }
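/* Illustrative sketch (not part of the GT.M sources): err_init() above ensures bits 0, 1, 4, 5 and 6 of
 * /proc/<pid>/coredump_filter are set so cores capture anonymous private/shared mappings, ELF headers and
 * huge-page mappings (see "man 5 core"). If FILTERENABLEBITS names exactly those bits, its value works out to
 * 0x01 + 0x02 + 0x10 + 0x20 + 0x40 = 0x73. The standalone fragment below shows the same read-OR-write sequence;
 * WANTED_FILTER_BITS is an assumed stand-in for FILTERENABLEBITS, whose real definition is not shown here.
 */
#include <stdio.h>
#include <unistd.h>

#define WANTED_FILTER_BITS	0x73u	/* bits 0, 1, 4, 5, 6 -- assumed equivalent of FILTERENABLEBITS */

static int ensure_coredump_filter(void)
{
	char		fn[64];
	unsigned int	bits = 0;
	FILE		*f;

	snprintf(fn, sizeof(fn), "/proc/%ld/coredump_filter", (long)getpid());
	if (NULL == (f = fopen(fn, "r")))
		return -1;
	if (1 != fscanf(f, "%x", &bits))	/* the file holds a single hex value */
		bits = 0;
	fclose(f);
	if (WANTED_FILTER_BITS == (bits & WANTED_FILTER_BITS))
		return 0;			/* everything we need is already set */
	if (NULL == (f = fopen(fn, "w")))
		return -1;
	fprintf(f, "0x%08x", bits | WANTED_FILTER_BITS);
	return fclose(f);
}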
OS_PAGE_SIZE_DECLARE uint4 gdsfilext(uint4 blocks, uint4 filesize, boolean_t trans_in_prog) { sm_uc_ptr_t old_base[2], mmap_retaddr; boolean_t was_crit, is_mm; char buff[DISK_BLOCK_SIZE]; int result, save_errno, status; uint4 new_bit_maps, bplmap, map, new_blocks, new_total, max_tot_blks, old_total; uint4 jnl_status, to_wait, to_msg, wait_period; gtm_uint64_t avail_blocks, mmap_sz; off_t new_eof; trans_num curr_tn; unix_db_info *udi; inctn_opcode_t save_inctn_opcode; int4 prev_extend_blks_to_upgrd; jnl_private_control *jpc; jnl_buffer_ptr_t jbp; cache_rec_ptr_t cr; DCL_THREADGBL_ACCESS; assert(!IS_DSE_IMAGE); assert((cs_addrs->nl == NULL) || (process_id != cs_addrs->nl->trunc_pid)); /* mu_truncate shouldn't extend file... */ assert(!process_exiting); DEBUG_ONLY(old_base[0] = old_base[1] = NULL); assert(!gv_cur_region->read_only); udi = FILE_INFO(gv_cur_region); is_mm = (dba_mm == cs_addrs->hdr->acc_meth); # if !defined(MM_FILE_EXT_OK) if (!udi->grabbed_access_sem && is_mm) return (uint4)(NO_FREE_SPACE); /* should this be changed to show extension not allowed ? */ # endif /* Both blocks and total blocks are unsigned ints so make sure we aren't asking for huge numbers that will overflow and end up doing silly things. */ assert((blocks <= (MAXTOTALBLKS(cs_data) - cs_data->trans_hist.total_blks)) || WBTEST_ENABLED(WBTEST_FILE_EXTEND_ERROR)); if (!blocks) return (uint4)(NO_FREE_SPACE); /* should this be changed to show extension not enabled ? */ bplmap = cs_data->bplmap; /* New total of non-bitmap blocks will be number of current, non-bitmap blocks, plus new blocks desired * There are (bplmap - 1) non-bitmap blocks per bitmap, so add (bplmap - 2) to number of non-bitmap blocks * and divide by (bplmap - 1) to get total number of bitmaps for expanded database. (must round up in this * manner as every non-bitmap block must have an associated bitmap) * Current number of bitmaps is (total number of current blocks + bplmap - 1) / bplmap. * Subtract current number of bitmaps from number needed for expanded database to get number of new bitmaps needed. */ new_bit_maps = DIVIDE_ROUND_UP(cs_data->trans_hist.total_blks - DIVIDE_ROUND_UP(cs_data->trans_hist.total_blks, bplmap) + blocks, bplmap - 1) - DIVIDE_ROUND_UP(cs_data->trans_hist.total_blks, bplmap); new_blocks = blocks + new_bit_maps; assert(0 < (int)new_blocks); if (new_blocks + cs_data->trans_hist.total_blks > MAXTOTALBLKS(cs_data)) { assert(FALSE); send_msg_csa(CSA_ARG(cs_addrs) VARLSTCNT(1) ERR_TOTALBLKMAX); return (uint4)(NO_FREE_SPACE); } if (0 != (save_errno = disk_block_available(udi->fd, &avail_blocks, FALSE))) { send_msg_csa(CSA_ARG(cs_addrs) VARLSTCNT(5) ERR_DBFILERR, 2, DB_LEN_STR(gv_cur_region), save_errno); rts_error_csa(CSA_ARG(cs_addrs) VARLSTCNT(5) ERR_DBFILERR, 2, DB_LEN_STR(gv_cur_region), save_errno); } else { if (!(gtmDebugLevel & GDL_IgnoreAvailSpace)) { /* Bypass this space check if debug flag above is on. Allows us to create a large sparce DB * in space it could never fit it if wasn't sparse. Needed for some tests. 
*/ avail_blocks = avail_blocks / (cs_data->blk_size / DISK_BLOCK_SIZE); if ((blocks * EXTEND_WARNING_FACTOR) > avail_blocks) { if (blocks > (uint4)avail_blocks) { SETUP_THREADGBL_ACCESS; if (!INST_FREEZE_ON_NOSPC_ENABLED(cs_addrs)) return (uint4)(NO_FREE_SPACE); else send_msg_csa(CSA_ARG(cs_addrs) VARLSTCNT(6) MAKE_MSG_WARNING(ERR_NOSPACEEXT), 4, DB_LEN_STR(gv_cur_region), new_blocks, (uint4)avail_blocks); } else send_msg_csa(CSA_ARG(cs_addrs) VARLSTCNT(5) ERR_DSKSPACEFLOW, 3, DB_LEN_STR(gv_cur_region), (uint4)(avail_blocks - ((new_blocks <= avail_blocks) ? new_blocks : 0))); } } } /* From here on, we need to use GDSFILEXT_CLNUP before returning to the caller */ was_crit = cs_addrs->now_crit; assert(!cs_addrs->hold_onto_crit || was_crit); /* If we are coming from mupip_extend (which gets crit itself) we better have waited for any unfreezes to occur. * If we are coming from online rollback (when that feature is available), we will come in holding crit and in * the final retry. In that case too, we expect to have waited for unfreezes to occur in the caller itself. * Therefore if we are coming in holding crit from MUPIP, we expect the db to be unfrozen so no need to wait for * freeze. * If we are coming from GT.M and final retry (in which case we come in holding crit) we expect to have waited * for any unfreezes (by invoking tp_crit_all_regions) to occur (TP or non-TP) before coming into this * function. However, there is one exception. In the final retry, if tp_crit_all_regions notices that * at least one of the participating regions did ONLY READs, it will not wait for any freeze on THAT region * to complete before grabbing crit. Later, in the final retry, if THAT region did an update which caused * op_tcommit to invoke bm_getfree->gdsfilext, then we would have come here with a frozen region on which * we hold crit. */ assert(!was_crit || !cs_data->freeze || (dollar_tlevel && (CDB_STAGNATE <= t_tries))); /* * If we are in the final retry and already hold crit, it is possible that csa->nl->wc_blocked is also set to TRUE * (by a concurrent process in phase2 which encountered an error in the midst of commit and secshr_db_clnup * finished the job for it). In this case we do NOT want to invoke wcs_recover as that will update the "bt" * transaction numbers without correspondingly updating the history transaction numbers (effectively causing * a cdb_sc_blkmod type of restart). Therefore do NOT call grab_crit (which unconditionally invokes wcs_recover) * if we already hold crit. */ if (!was_crit) { for ( ; ; ) { grab_crit(gv_cur_region); if (!cs_data->freeze && !IS_REPL_INST_FROZEN) break; rel_crit(gv_cur_region); while (cs_data->freeze || IS_REPL_INST_FROZEN) hiber_start(1000); } } else if (cs_data->freeze && dollar_tlevel) { /* We don't want to continue with file extension as explained above. Hence return with an error code which * op_tcommit will recognize (as a cdb_sc_needcrit/cdb_sc_instancefreeze type of restart) and restart accordingly. */ assert(CDB_STAGNATE <= t_tries); GDSFILEXT_CLNUP; return (uint4)FINAL_RETRY_FREEZE_PROG; } if (IS_REPL_INST_FROZEN && trans_in_prog) { assert(CDB_STAGNATE <= t_tries); GDSFILEXT_CLNUP; return (uint4)FINAL_RETRY_INST_FREEZE; } assert(cs_addrs->ti->total_blks == cs_data->trans_hist.total_blks); old_total = cs_data->trans_hist.total_blks; if (old_total != filesize) { /* Somebody else has already extended it, since we are in crit, this is trust-worthy. 
However, in case of MM, * we still need to remap the database */ assert((old_total > filesize) GTM_TRUNCATE_ONLY( || !is_mm)); /* For BG, someone else could have truncated or extended - we have no idea */ GDSFILEXT_CLNUP; return (SS_NORMAL); }
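/* Worked example (not part of the GT.M sources) of the bitmap arithmetic described in gdsfilext() above.
 * DIVIDE_ROUND_UP is re-defined locally as an ordinary ceiling division; the input numbers (512 blocks per
 * local bitmap, 1000 existing blocks, 300 requested blocks) are made up purely to show how new_bit_maps and
 * new_blocks fall out of the formula.
 */
#include <stdio.h>

#define DIVIDE_ROUND_UP(x, y)	(((x) + (y) - 1) / (y))

int main(void)
{
	unsigned int	bplmap = 512, total_blks = 1000, blocks = 300;
	unsigned int	cur_maps, new_bit_maps, new_blocks;

	cur_maps = DIVIDE_ROUND_UP(total_blks, bplmap);		/* ceil(1000 / 512) == 2 bitmaps today */
	/* Non-bitmap blocks today are total_blks - cur_maps == 998; after adding 300 more, the expanded
	 * database needs ceil(1298 / 511) == 3 bitmaps, i.e. 1 new bitmap block. */
	new_bit_maps = DIVIDE_ROUND_UP(total_blks - cur_maps + blocks, bplmap - 1) - cur_maps;
	new_blocks = blocks + new_bit_maps;			/* 300 data blocks + 1 bitmap == 301 */
	printf("new_bit_maps = %u, new_blocks = %u\n", new_bit_maps, new_blocks);
	return 0;
}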
int send_mesg2gtmsecshr(unsigned int code, unsigned int id, char *path, int path_len) { int client_sockfd, create_server_status, fcntl_res; int req_code, wait_count = 0; int recv_len, send_len; ssize_t num_chars_recvd, num_chars_sent; int save_errno, ret_code = 0, init_ret_code = 0; int loop_count = 0; int recv_complete, send_complete; boolean_t retry = FALSE; size_t server_proc_len; int semop_res; int selstat, status; char *recv_ptr, *send_ptr; struct sockaddr_un server_proc; struct sembuf sop[4]; fd_set wait_on_fd; gtmsecshr_mesg mesg; TID timer_id; int4 msec_timeout; char *gtm_tmp_ptr; struct stat stat_buf; struct shmid_ds shm_info; int len; DCL_THREADGBL_ACCESS; SETUP_THREADGBL_ACCESS; DBGGSSHR((LOGFLAGS, "secshr_client: New send request\n")); if (!gtm_dist_ok_to_use) rts_error_csa(CSA_ARG(NULL) VARLSTCNT(6) ERR_GTMDISTUNVERIF, 4, STRLEN(gtm_dist), gtm_dist, gtmImageNames[image_type].imageNameLen, gtmImageNames[image_type].imageName); /* Create communication key (hash of release name) if it has not already been done */ if (0 == TREF(gtmsecshr_comkey)) { STR_HASH((char *)gtm_release_name, gtm_release_name_len, TREF(gtmsecshr_comkey), 0); } timer_id = (TID)send_mesg2gtmsecshr; if (!gtmsecshr_file_check_done) { len = STRLEN(gtm_dist); memcpy(gtmsecshr_path, gtm_dist, len); gtmsecshr_path[len] = '/'; memcpy(gtmsecshr_path + len + 1, GTMSECSHR_EXECUTABLE, STRLEN(GTMSECSHR_EXECUTABLE)); gtmsecshr_pathname.addr = gtmsecshr_path; gtmsecshr_pathname.len = len + 1 + STRLEN(GTMSECSHR_EXECUTABLE); assertpro(GTM_PATH_MAX > gtmsecshr_pathname.len); gtmsecshr_pathname.addr[gtmsecshr_pathname.len] = '\0'; if (-1 == Stat(gtmsecshr_pathname.addr, &stat_buf)) rts_error_csa(CSA_ARG(NULL) VARLSTCNT(8) ERR_SYSCALL, 5, LEN_AND_LIT("stat"), CALLFROM, errno); if ((ROOTUID != stat_buf.st_uid) || !(stat_buf.st_mode & S_ISUID) || (0 != ACCESS(gtmsecshr_pathname.addr, (X_OK)))) rts_error_csa(CSA_ARG(NULL) VARLSTCNT(1) ERR_GTMSECSHRPERM); gtmsecshr_file_check_done = TRUE; } if (!gtmsecshr_sock_init_done && (0 < (init_ret_code = gtmsecshr_sock_init(CLIENT)))) /* Note assignment */ return init_ret_code; DEBUG_ONLY(mesg.usesecshr = TREF(gtm_usesecshr)); /* Flag ignored in PRO build */ while (MAX_COMM_ATTEMPTS >= loop_count) { /* first, try the sendto */ req_code = mesg.code = code; send_len = (int4)(GTM_MESG_HDR_SIZE); if (REMOVE_FILE == code) { assert(GTM_PATH_MAX > path_len); /* Name is not user supplied so simple check */ memcpy(mesg.mesg.path, path, path_len); send_len += path_len; } else if (FLUSH_DB_IPCS_INFO == code) { assert(GTM_PATH_MAX > db_ipcs.fn_len); memcpy(&mesg.mesg.db_ipcs, &db_ipcs, (offsetof(ipcs_mesg, fn[0]) + db_ipcs.fn_len + 1)); /* Most of the time file length is much smaller than GTM_PATH_MAX */ send_len += offsetof(ipcs_mesg, fn[0]); send_len += mesg.mesg.db_ipcs.fn_len + 1; } else { mesg.mesg.id = id; send_len += SIZEOF(mesg.mesg.id); } DBGGSSHR((LOGFLAGS, "secshr_client: loop %d frm-pid: %d to-pid: %d send_len: %d code: %d\n", loop_count, process_id, id, send_len, code)); mesg.comkey = TREF(gtmsecshr_comkey); /* Version communication key */ mesg.pid = process_id; /* Process id of client */ mesg.seqno = ++cur_seqno; send_ptr = (char *)&mesg; send_complete = FALSE; SENDTO_SOCK(gtmsecshr_sockfd, send_ptr, send_len, 0, (struct sockaddr *)&gtmsecshr_sock_name, (GTM_SOCKLEN_TYPE)gtmsecshr_sockpath_len, num_chars_sent); /* This form handles EINTR internally */ save_errno = errno; DBGGSSHR((LOGFLAGS, "secshr_client: sendto rc: %d errno: %d (only important if rc=-1)\n", (int)num_chars_sent,
save_errno)); if (0 >= num_chars_sent) { /* SENDTO_SOCK failed - start server and attempt to resend */ if ((EISCONN == save_errno) || (EBADF == save_errno)) { gtmsecshr_sock_cleanup(CLIENT); gtmsecshr_sock_init(CLIENT); wcs_backoff(loop_count + 1); DBGGSSHR((LOGFLAGS, "secshr_client: Connection error, reset socket\n")); } else { if (0 < loop_count) /* No message unless attempted server start at least once */ send_msg_csa(CSA_ARG(NULL) VARLSTCNT(11) ERR_GTMSECSHRSRVF, 4, RTS_ERROR_TEXT("Client"), process_id, loop_count - 1, ERR_TEXT, 2, RTS_ERROR_TEXT("sendto to gtmsecshr failed"), save_errno); START_SERVER; DBGGSSHR((LOGFLAGS, "secshr_client: sendto() failed - restarting server\n")); } loop_count++; continue; } SETUP_FOR_RECV; /* Sets timer, recvcomplete = FALSE */ do { /* Note RECVFROM does not loop on EINTR return codes so must be handled. Note also we only expect * to receive the message header back as an acknowledgement. */ num_chars_recvd = RECVFROM(gtmsecshr_sockfd, recv_ptr, GTM_MESG_HDR_SIZE, 0, (struct sockaddr *)0, (GTM_SOCKLEN_TYPE *)0); save_errno = errno; DBGGSSHR((LOGFLAGS, "secshr_client: recvfrom rc: %d errno: %d (only important if rc=-1)\n", (int)num_chars_recvd, save_errno)); if (0 <= num_chars_recvd) { /* Message received - make sure it is large enough to have set seqno before we do anything * to rely on it. */ if ((GTM_MESG_HDR_SIZE <= num_chars_recvd) && (mesg.seqno == cur_seqno) && (TREF(gtmsecshr_comkey) == mesg.comkey)) recv_complete = TRUE; else { /* Message too short or not correct sequence */ cancel_timer(timer_id); /* Print True/False for the possibilities we failed */ DBGGSSHR((LOGFLAGS, "secshr_client: Message incorrect - chars: %d, seq: %d\n", (GTM_MESG_HDR_SIZE <= num_chars_recvd), (mesg.seqno == cur_seqno))); SETUP_FOR_RECV; continue; } } else { /* Something untoward happened */ if (client_timer_popped) break; if (EINTR == save_errno) /* Had an irrelevant interrupt - ignore */ continue; if (EBADF == save_errno) break; send_msg_csa(CSA_ARG(NULL) VARLSTCNT(11) ERR_GTMSECSHRSRVF, 4, RTS_ERROR_TEXT("Client"), process_id, loop_count - 1, ERR_TEXT, 2, RTS_ERROR_TEXT("recvfrom from gtmsecshr failed"), save_errno); if ((ECONNRESET == save_errno) || (ENOTCONN == save_errno)) { num_chars_recvd = 0; break; } gtmsecshr_sock_cleanup(CLIENT); return save_errno; } } while (!recv_complete); cancel_timer(timer_id); if (client_timer_popped || (EBADF == save_errno) || (0 == num_chars_recvd)) { /* Timeout, connection issues, bad descriptor block - retry */ gtmsecshr_sock_cleanup(CLIENT); gtmsecshr_sock_init(CLIENT); retry = TRUE; if (client_timer_popped) { START_SERVER; DBGGSSHR((LOGFLAGS, "secshr_client: Read timer popped - restarting server\n")); } else DBGGSSHR((LOGFLAGS, "secshr_client: Read error - socket reset, retrying\n")); loop_count++; continue; } /* Response to *our* latest message available */ assert(recv_complete); if (ret_code = mesg.code) /* Warning - assignment */ { DBGGSSHR((LOGFLAGS, "secshr_client: non-zero response from gtmsecshr - request: %d retcode: %d\n", req_code, ret_code)); if (INVALID_COMKEY == ret_code) { /* Comkey mismatch means for a different version of GT.M - we will not handle it */ send_msg_csa(CSA_ARG(NULL) VARLSTCNT(13) ERR_GTMSECSHRSRVFIL, 7, RTS_ERROR_TEXT("Client"), process_id, mesg.pid, req_code, RTS_ERROR_TEXT(mesg.mesg.path), ERR_TEXT, 2, RTS_ERROR_STRING("Communicating with wrong GT.M version")); rts_error_csa(CSA_ARG(NULL) VARLSTCNT(13) ERR_GTMSECSHRSRVFIL, 7, RTS_ERROR_TEXT("Client"), process_id, mesg.pid, req_code, 
RTS_ERROR_TEXT(mesg.mesg.path), ERR_TEXT, 2, RTS_ERROR_STRING("Communicating with wrong GT.M version")); break; /* rts_error should not return */ } switch(req_code) { case REMOVE_FILE: /* Called from mutex_sock_init(). Path (and length) contain null terminator byte. * See if file still exists (may have been deleted by earlier attempt). Caller * handles actual error. */ if ((-1 != Stat(path, &stat_buf)) || (ENOENT != ret_code)) send_msg_csa(CSA_ARG(NULL) VARLSTCNT(14) ERR_GTMSECSHRSRVFIL, 7, RTS_ERROR_TEXT("Client"), process_id, mesg.pid, req_code, RTS_ERROR_TEXT(mesg.mesg.path), ERR_TEXT, 2, RTS_ERROR_STRING(secshr_fail_mesg_code[req_code]), mesg.code); else ret_code = 0; /* File is gone so this or a previous try actually worked */ break; case REMOVE_SEM: /* See if semaphore still exists (may have been removed by earlier attempt that * got a reply confused or lost). If not there, no error. Else error to op-log. */ if ((-1 != semctl(id, 0, GETVAL)) && !SEM_REMOVED(errno)) send_msg_csa(CSA_ARG(NULL) VARLSTCNT(13) ERR_GTMSECSHRSRVFID, 6, RTS_ERROR_TEXT("Client"), process_id, mesg.pid, req_code, mesg.mesg.id, ERR_TEXT, 2, RTS_ERROR_STRING(secshr_fail_mesg_code[req_code]), mesg.code); else ret_code = 0; /* Semaphore is gone so this or a previous try actually worked */ break; case REMOVE_SHM: /* See if shmem still exists (may have been removed by earlier attempt that * got a reply confused or lost). If not there, no error. Else error to op-log. * Note - */ if ((-1 != shmctl(id, IPC_STAT, &shm_info)) && !SEM_REMOVED(errno)) send_msg_csa(CSA_ARG(NULL) VARLSTCNT(13) ERR_GTMSECSHRSRVFID, 6, RTS_ERROR_TEXT("Client"), process_id, mesg.pid, req_code, mesg.mesg.id, ERR_TEXT, 2, RTS_ERROR_STRING(secshr_fail_mesg_code[req_code]), mesg.code); else ret_code = 0; /* Shared memory is gone so this or a previous try actually worked */ break; case FLUSH_DB_IPCS_INFO: /* Errors handled by caller */ break; default: if (EPERM != mesg.code && EACCES != mesg.code) send_msg_csa(CSA_ARG(NULL) VARLSTCNT(13) ERR_GTMSECSHRSRVFID, 6, RTS_ERROR_TEXT("Client"), process_id, mesg.pid, req_code, mesg.mesg.id, ERR_TEXT, 2, RTS_ERROR_STRING(secshr_fail_mesg_code[req_code]), mesg.code); break; } } break; } if (MAX_COMM_ATTEMPTS < loop_count) { ret_code = -1; gtm_putmsg_csa(CSA_ARG(NULL) VARLSTCNT(10) ERR_GTMSECSHRSRVF, 4, RTS_ERROR_TEXT("Client"), process_id, loop_count - 1, ERR_TEXT, 2, RTS_ERROR_TEXT("Unable to communicate with gtmsecshr")); /* If gtm_tmp is not defined, show default path */ if (gtm_tmp_ptr = GETENV("gtm_tmp")) send_msg_csa(CSA_ARG(NULL) VARLSTCNT(8) ERR_GTMSECSHRTMPPATH, 2, RTS_ERROR_TEXT(gtm_tmp_ptr), ERR_TEXT, 2, RTS_ERROR_TEXT("(from $gtm_tmp)")); else send_msg_csa(CSA_ARG(NULL) VARLSTCNT(4) ERR_GTMSECSHRTMPPATH, 2, RTS_ERROR_TEXT("/tmp")); } if (ONETIMESOCKET == init_ret_code) gtmsecshr_sock_cleanup(CLIENT); return ret_code; }
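/* Illustrative sketch (not part of the GT.M sources): send_mesg2gtmsecshr() above implements a
 * request/acknowledge exchange over a UNIX-domain datagram socket -- send the message header, wait (under a
 * timer) for the header to come back with a matching sequence number, and retry or restart the server on
 * failure. The fragment below shows just the socket-level request/ack pattern; the msg_hdr layout, the paths
 * and the 5 second timeout are invented for illustration and do not reflect the real gtmsecshr message format.
 */
#include <string.h>
#include <sys/socket.h>
#include <sys/time.h>
#include <sys/un.h>
#include <unistd.h>

struct msg_hdr { unsigned int comkey, code, pid, seqno; };	/* hypothetical header */

static int request_ack(const char *server_path, const char *client_path, struct msg_hdr *req)
{
	struct sockaddr_un	srv, cli;
	struct msg_hdr		ack = {0};
	struct timeval		tmo = { 5, 0 };		/* give up on a reply after 5 seconds */
	ssize_t			n;
	int			fd;

	if (0 > (fd = socket(AF_UNIX, SOCK_DGRAM, 0)))
		return -1;
	memset(&cli, 0, sizeof(cli));
	cli.sun_family = AF_UNIX;
	strncpy(cli.sun_path, client_path, sizeof(cli.sun_path) - 1);
	unlink(client_path);
	if (0 > bind(fd, (struct sockaddr *)&cli, sizeof(cli)))	/* a bound address is needed to receive the reply */
	{
		close(fd);
		return -1;
	}
	setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tmo, sizeof(tmo));
	memset(&srv, 0, sizeof(srv));
	srv.sun_family = AF_UNIX;
	strncpy(srv.sun_path, server_path, sizeof(srv.sun_path) - 1);
	if (0 > sendto(fd, req, sizeof(*req), 0, (struct sockaddr *)&srv, sizeof(srv)))
	{
		close(fd);
		return -1;
	}
	do
	{	/* Keep reading until a full-size reply with the matching sequence number arrives, or recvfrom
		 * fails (timeout or error). */
		n = recvfrom(fd, &ack, sizeof(ack), 0, NULL, NULL);
	} while (((ssize_t)sizeof(ack) <= n) && (ack.seqno != req->seqno));
	close(fd);
	return (((ssize_t)sizeof(ack) <= n) && (ack.seqno == req->seqno)) ? 0 : -1;
}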
void generic_signal_handler(int sig, siginfo_t *info, void *context) { gtm_sigcontext_t *context_ptr; void (*signal_routine)(); DCL_THREADGBL_ACCESS; SETUP_THREADGBL_ACCESS; FORWARD_SIG_TO_MAIN_THREAD_IF_NEEDED(sig); /* Save parameter value in global variables for easy access in core */ dont_want_core = FALSE; /* (re)set in case we recurse */ created_core = FALSE; /* we can deal with a second core if needbe */ exi_condition = sig; if (NULL != info) exi_siginfo = *info; else memset(&exi_siginfo, 0, SIZEOF(*info)); # if defined(__ia64) && defined(__hpux) context_ptr = (gtm_sigcontext_t *)context; /* no way to make a copy of the context */ memset(&exi_context, 0, SIZEOF(exi_context)); # else if (NULL != context) exi_context = *(gtm_sigcontext_t *)context; else memset(&exi_context, 0, SIZEOF(exi_context)); context_ptr = &exi_context; # endif /* Check if we are fielding nested immediate shutdown signals */ if (EXIT_IMMED <= exit_state) { switch(sig) { /* If we are dealing with one of these three dangerous signals which we have * already hit while attempting to shutdown once, die with core now. */ case SIGSEGV: case SIGBUS: case SIGILL: if (core_in_progress) { if (exit_handler_active) _exit(sig); else exit(sig); } ++core_in_progress; DUMP_CORE; GTMASSERT; default: ; } } switch(sig) { case SIGTERM: if (!IS_GTMSECSHR_IMAGE) { forced_exit_err = ERR_FORCEDHALT; /* If nothing pending AND we have crit or in wcs_wtstart() or already in exit processing, wait to * invoke shutdown. wcs_wtstart() manipulates the active queue that a concurrent process in crit * in bt_put() might be waiting for. interrupting it can cause deadlocks (see C9C11-002178). */ if (DEFER_EXIT_PROCESSING) { SET_FORCED_EXIT_STATE; exit_state++; /* Make exit pending, may still be tolerant though */ assert(!IS_GTMSECSHR_IMAGE); if (exit_handler_active && !gtm_quiet_halt) SEND_AND_PUT_MSG(VARLSTCNT(1) forced_exit_err); return; } exit_state = EXIT_IMMED; SET_PROCESS_EXITING_TRUE; /* Set this BEFORE cancelling timers as wcs_phase2_commit_wait * relies on this. 
*/ if (ERR_FORCEDHALT != forced_exit_err || !gtm_quiet_halt) SEND_AND_PUT_MSG(VARLSTCNT(1) forced_exit_err); } else { /* Special case for gtmsecshr - no deferral just exit */ forced_exit_err = ERR_GTMSECSHRSHUTDN; if (OK_TO_SEND_MSG) send_msg_csa(CSA_ARG(NULL) VARLSTCNT(1) forced_exit_err); } dont_want_core = TRUE; break; case SIGQUIT: /* Handle SIGQUIT specially which we ALWAYS want to defer if possible as it is always sent */ dont_want_core = TRUE; extract_signal_info(sig, &exi_siginfo, context_ptr, &signal_info); switch(signal_info.infotype) { case GTMSIGINFO_NONE: forced_exit_err = ERR_KILLBYSIG; break; case GTMSIGINFO_USER: forced_exit_err = ERR_KILLBYSIGUINFO; break; case GTMSIGINFO_ILOC + GTMSIGINFO_BADR: forced_exit_err = ERR_KILLBYSIGSINFO1; break; case GTMSIGINFO_ILOC: forced_exit_err = ERR_KILLBYSIGSINFO2; break; case GTMSIGINFO_BADR: forced_exit_err = ERR_KILLBYSIGSINFO3; break; default: exit_state = EXIT_IMMED; GTMASSERT; } /* If nothing pending AND we have crit or already in exit processing, wait to invoke shutdown */ if (DEFER_EXIT_PROCESSING) { SET_FORCED_EXIT_STATE; exit_state++; /* Make exit pending, may still be tolerant though */ assert(!IS_GTMSECSHR_IMAGE); return; } exit_state = EXIT_IMMED; SET_PROCESS_EXITING_TRUE; switch(signal_info.infotype) { case GTMSIGINFO_NONE: SEND_AND_PUT_MSG(VARLSTCNT(6) ERR_KILLBYSIG, 4, GTMIMAGENAMETXT(image_type), process_id, sig); break; case GTMSIGINFO_USER: SEND_AND_PUT_MSG(VARLSTCNT(8) ERR_KILLBYSIGUINFO, 6, GTMIMAGENAMETXT(image_type), process_id, sig, signal_info.send_pid, signal_info.send_uid); break; case GTMSIGINFO_ILOC + GTMSIGINFO_BADR: SEND_AND_PUT_MSG(VARLSTCNT(8) ERR_KILLBYSIGSINFO1, 6, GTMIMAGENAMETXT(image_type), process_id, sig, signal_info.int_iadr, signal_info.bad_vadr); break; case GTMSIGINFO_ILOC: SEND_AND_PUT_MSG(VARLSTCNT(7) ERR_KILLBYSIGSINFO2, 5, GTMIMAGENAMETXT(image_type), process_id, sig, signal_info.int_iadr); break; case GTMSIGINFO_BADR: SEND_AND_PUT_MSG(VARLSTCNT(7) ERR_KILLBYSIGSINFO3, 5, GTMIMAGENAMETXT(image_type), process_id, sig, signal_info.bad_vadr); break; } break; # ifdef _AIX case SIGDANGER: forced_exit_err = ERR_KRNLKILL; /* If nothing pending AND we have crit or already in exit processing, wait to invoke shutdown */ if (DEFER_EXIT_PROCESSING) { SET_FORCED_EXIT_STATE; exit_state++; /* Make exit pending, may still be tolerant though */ assert(!IS_GTMSECSHR_IMAGE); return; } exit_state = EXIT_IMMED; SET_PROCESS_EXITING_TRUE; SEND_AND_PUT_MSG(VARLSTCNT(1) forced_exit_err); dont_want_core = TRUE; break; # endif default: extract_signal_info(sig, &exi_siginfo, context_ptr, &signal_info); switch(signal_info.infotype) { case GTMSIGINFO_NONE: exit_state = EXIT_IMMED; SET_PROCESS_EXITING_TRUE; SEND_AND_PUT_MSG(VARLSTCNT(6) ERR_KILLBYSIG, 4, GTMIMAGENAMETXT(image_type), process_id, sig); break; case GTMSIGINFO_USER: /* This signal was SENT to us so it can wait until we are out of crit to cause an exit */ forced_exit_err = ERR_KILLBYSIGUINFO; /* If nothing pending AND we have crit or already exiting, wait to invoke shutdown */ if (DEFER_EXIT_PROCESSING) { assert(!IS_GTMSECSHR_IMAGE); SET_FORCED_EXIT_STATE; exit_state++; /* Make exit pending, may still be tolerant though */ need_core = TRUE; gtm_fork_n_core(); /* Generate "virgin" core while we can */ return; } exit_state = EXIT_IMMED; SET_PROCESS_EXITING_TRUE; SEND_AND_PUT_MSG(VARLSTCNT(8) ERR_KILLBYSIGUINFO, 6, GTMIMAGENAMETXT(image_type), process_id, sig, signal_info.send_pid, signal_info.send_uid); break; case GTMSIGINFO_ILOC + GTMSIGINFO_BADR: exit_state 
= EXIT_IMMED; SET_PROCESS_EXITING_TRUE; SEND_AND_PUT_MSG(VARLSTCNT(8) ERR_KILLBYSIGSINFO1, 6, GTMIMAGENAMETXT(image_type), process_id, sig, signal_info.int_iadr, signal_info.bad_vadr); break; case GTMSIGINFO_ILOC: exit_state = EXIT_IMMED; SET_PROCESS_EXITING_TRUE; SEND_AND_PUT_MSG(VARLSTCNT(7) ERR_KILLBYSIGSINFO2, 5, GTMIMAGENAMETXT(image_type), process_id, sig, signal_info.int_iadr); break; case GTMSIGINFO_BADR: exit_state = EXIT_IMMED; SET_PROCESS_EXITING_TRUE; SEND_AND_PUT_MSG(VARLSTCNT(7) ERR_KILLBYSIGSINFO3, 5, GTMIMAGENAMETXT(image_type), process_id, sig, signal_info.bad_vadr); break; default: exit_state = EXIT_IMMED; SET_PROCESS_EXITING_TRUE; GTMASSERT; } if (0 != signal_info.sig_err) { SEND_AND_PUT_MSG(VARLSTCNT(1) signal_info.sig_err); } break; } /* switch (sig) */ /* Stop the timers but do not cancel them. This allows the timer structures to appear in the core where gtmpcat can * extract them allowing us to see what was going on. */ sys_canc_timer(); FFLUSH(stdout); if (!dont_want_core) { need_core = TRUE; gtm_fork_n_core(); } /* If any special routines are registered to be driven on a signal, drive them now */ if (0 != exi_condition) { /* If this is a GTM-ish process in runtime mode, call the routine to generate the zshow dump. This separate * routine necessary because (1) generic_signal_handler() no longer calls condition handlers and (2) we couldn't * use call_on_signal because it would already be in use in updproc() where it is also possible this routine * needs to be called. Bypass this code if we didn't create a core since that means it is not a GTM * issue that forced its demise (and since this is an uncaring interrupt, we could be in any number of * situations that would cause a ZSHOW dump to explode). Better for user to use jobexam to cause a dump prior * to terminating the process in a deferrable fashion. */ if (!dont_want_core && IS_MCODE_RUNNING && (NULL != (signal_routine = RFPTR(create_fatal_error_zshow_dmp_fptr)))) { /* note assignment of signal_routine above */ SFPTR(create_fatal_error_zshow_dmp_fptr, NULL); (*signal_routine)(exi_condition, FALSE); } /* Some mupip functions define an entry point to drive on signals. Make sure to do this AFTER we create the * dump file above as it may detach things (like the recvpool) we need to create the above dump. */ if (NULL != (signal_routine = call_on_signal)) /* Note assignment */ { call_on_signal = NULL; /* So we don't recursively call ourselves */ (*signal_routine)(); } } if (!IS_GTMSECSHR_IMAGE) { assert((EXIT_IMMED <= exit_state) || !exit_handler_active); exit(-exi_condition); } else return; }
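/* Illustrative sketch (not part of the GT.M sources): generic_signal_handler() above is a three-argument
 * (siginfo-style) handler. Installing such a handler generally looks like the fragment below; the exact set of
 * signals GT.M hooks, and any SA_* flags beyond SA_SIGINFO, are assumptions made only for illustration.
 */
#include <signal.h>
#include <string.h>

extern void generic_signal_handler(int sig, siginfo_t *info, void *context);

static void install_signal_handlers(void)
{
	struct sigaction	act;
	size_t			i;
	static const int	sigs[] = { SIGTERM, SIGQUIT, SIGSEGV, SIGBUS, SIGILL };

	memset(&act, 0, sizeof(act));
	sigemptyset(&act.sa_mask);
	act.sa_flags = SA_SIGINFO;			/* deliver siginfo_t and context to the handler */
	act.sa_sigaction = generic_signal_handler;
	for (i = 0; i < (sizeof(sigs) / sizeof(sigs[0])); i++)
		sigaction(sigs[i], &act, NULL);
}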
boolean_t mu_truncate(int4 truncate_percent) { sgmnt_addrs *csa; sgmnt_data_ptr_t csd; int num_local_maps; int lmap_num, lmap_blk_num; int bml_status, sigkill; int save_errno; int ftrunc_status; uint4 jnl_status; uint4 old_total, new_total; uint4 old_free, new_free; uint4 end_blocks; int4 blks_in_lmap, blk; gtm_uint64_t before_trunc_file_size; off_t trunc_file_size; off_t padding; uchar_ptr_t lmap_addr; boolean_t was_crit; uint4 found_busy_blk; srch_blk_status bmphist; srch_blk_status *blkhist; srch_hist alt_hist; trans_num curr_tn; blk_hdr_ptr_t lmap_blk_hdr; block_id *blkid_ptr; unix_db_info *udi; jnl_private_control *jpc; jnl_buffer_ptr_t jbp; char *err_msg; intrpt_state_t prev_intrpt_state; off_t offset; DCL_THREADGBL_ACCESS; SETUP_THREADGBL_ACCESS; csa = cs_addrs; csd = cs_data; if (dba_mm == csd->acc_meth) { gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(4) ERR_MUTRUNCNOTBG, 2, REG_LEN_STR(gv_cur_region)); return TRUE; } if ((GDSVCURR != csd->desired_db_format) || (csd->blks_to_upgrd != 0)) { gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(4) ERR_MUTRUNCNOV4, 2, REG_LEN_STR(gv_cur_region)); return TRUE; } if (csa->ti->free_blocks < (truncate_percent * csa->ti->total_blks / 100)) { gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(5) ERR_MUTRUNCNOSPACE, 3, REG_LEN_STR(gv_cur_region), truncate_percent); return TRUE; } /* already checked for parallel truncates on this region --- see mupip_reorg.c */ gv_target = NULL; assert(csa->nl->trunc_pid == process_id); assert(dba_mm != csd->acc_meth); old_total = csa->ti->total_blks; old_free = csa->ti->free_blocks; sigkill = 0; found_busy_blk = 0; memset(&alt_hist, 0, SIZEOF(alt_hist)); /* null-initialize history */ assert(csd->bplmap == BLKS_PER_LMAP); end_blocks = old_total % BLKS_PER_LMAP; /* blocks in the last lmap (first one we start scanning) */ if (0 == end_blocks) end_blocks = BLKS_PER_LMAP; num_local_maps = DIVIDE_ROUND_UP(old_total, BLKS_PER_LMAP); /* ======================================== PHASE 1 ======================================== */ for (lmap_num = num_local_maps - 1; (lmap_num > 0 && !found_busy_blk); lmap_num--) { if (mu_ctrly_occurred || mu_ctrlc_occurred) return TRUE; assert(csa->ti->total_blks >= old_total); /* otherwise, a concurrent truncate happened... */ if (csa->ti->total_blks != old_total) /* Extend (likely called by mupip extend) -- don't truncate */ { gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(5) ERR_MUTRUNCNOSPACE, 3, REG_LEN_STR(gv_cur_region), truncate_percent); return TRUE; } lmap_blk_num = lmap_num * BLKS_PER_LMAP; if (csa->nl->highest_lbm_with_busy_blk >= lmap_blk_num) { found_busy_blk = lmap_blk_num; break; } blks_in_lmap = (lmap_num == num_local_maps - 1) ? end_blocks : BLKS_PER_LMAP; /* Loop through non-bitmap blocks of this lmap, do recycled2free */ DBGEHND((stdout, "DBG:: lmap_num = [%lu], lmap_blk_num = [%lu], blks_in_lmap = [%lu]\n", lmap_num, lmap_blk_num, blks_in_lmap)); for (blk = 1; blk < blks_in_lmap && blk != -1 && !found_busy_blk;) { t_begin(ERR_MUTRUNCFAIL, UPDTRNS_DB_UPDATED_MASK); for (;;) /* retry loop for recycled to free transactions */ { curr_tn = csd->trans_hist.curr_tn; /* Read the nth local bitmap into memory */ bmphist.blk_num = lmap_blk_num; bmphist.buffaddr = t_qread(bmphist.blk_num, &bmphist.cycle, &bmphist.cr); lmap_blk_hdr = (blk_hdr_ptr_t)bmphist.buffaddr; if (!(bmphist.buffaddr) || (BM_SIZE(BLKS_PER_LMAP) != lmap_blk_hdr->bsiz)) { /* Could not read the block successfully. Retry. 
*/ t_retry((enum cdb_sc)rdfail_detail); continue; } lmap_addr = bmphist.buffaddr + SIZEOF(blk_hdr); /* starting from the hint (blk itself), find the first busy or recycled block */ blk = bml_find_busy_recycled(blk, lmap_addr, blks_in_lmap, &bml_status); assert(blk < BLKS_PER_LMAP); if (blk == -1 || blk >= blks_in_lmap) { /* done with this lmap, continue to next */ t_abort(gv_cur_region, csa); break; } else if (BLK_BUSY == bml_status || csa->nl->highest_lbm_with_busy_blk >= lmap_blk_num) { /* stop processing blocks... skip ahead to phase 2 */ found_busy_blk = lmap_blk_num; t_abort(gv_cur_region, csa); break; } else if (BLK_RECYCLED == bml_status) { /* Write PBLK records for recycled blocks only if before_image journaling is * enabled. t_end() takes care of checking if journaling is enabled and * writing PBLK record. We have to at least mark the recycled block as free. */ RESET_UPDATE_ARRAY; update_trans = UPDTRNS_DB_UPDATED_MASK; *((block_id *)update_array_ptr) = blk; update_array_ptr += SIZEOF(block_id); *(int *)update_array_ptr = 0; alt_hist.h[1].blk_num = 0; alt_hist.h[0].level = 0; alt_hist.h[0].cse = NULL; alt_hist.h[0].tn = curr_tn; alt_hist.h[0].blk_num = lmap_blk_num + blk; alt_hist.h[0].buffaddr = t_qread(alt_hist.h[0].blk_num, &alt_hist.h[0].cycle, &alt_hist.h[0].cr); if (!alt_hist.h[0].buffaddr) { t_retry((enum cdb_sc)rdfail_detail); continue; } if (!t_recycled2free(&alt_hist.h[0])) { t_retry(cdb_sc_lostbmlcr); continue; } t_write_map(&bmphist, (unsigned char *)update_array, curr_tn, 0); /* Set the opcode for INCTN record written by t_end() */ inctn_opcode = inctn_blkmarkfree; if ((trans_num)0 == t_end(&alt_hist, NULL, TN_NOT_SPECIFIED)) continue; /* block processed, scan from the next one */ blk++; break; } else { assert(t_tries < CDB_STAGNATE); t_retry(cdb_sc_badbitmap); continue; } } /* END recycled2free retry loop */ } /* END scanning blocks of this particular lmap */ /* Write PBLK for the bitmap block, in case it hasn't been written i.e. t_end() was never called above */ /* Do a transaction that just increments the bitmap block's tn so that t_end() can do its thing */ DBGEHND((stdout, "DBG:: bitmap block inctn -- lmap_blk_num = [%lu]\n", lmap_blk_num)); t_begin(ERR_MUTRUNCFAIL, UPDTRNS_DB_UPDATED_MASK); for (;;) { RESET_UPDATE_ARRAY; BLK_ADDR(blkid_ptr, SIZEOF(block_id), block_id); *blkid_ptr = 0; update_trans = UPDTRNS_DB_UPDATED_MASK; inctn_opcode = inctn_mu_reorg; /* inctn_mu_truncate */ curr_tn = csd->trans_hist.curr_tn; blkhist = &alt_hist.h[0]; blkhist->blk_num = lmap_blk_num; blkhist->tn = curr_tn; blkhist->cse = NULL; /* start afresh (do not use value from previous retry) */ /* Read the nth local bitmap into memory */ blkhist->buffaddr = t_qread(lmap_blk_num, (sm_int_ptr_t)&blkhist->cycle, &blkhist->cr); lmap_blk_hdr = (blk_hdr_ptr_t)blkhist->buffaddr; if (!(blkhist->buffaddr) || (BM_SIZE(BLKS_PER_LMAP) != lmap_blk_hdr->bsiz)) { /* Could not read the block successfully. Retry. 
*/ t_retry((enum cdb_sc)rdfail_detail); continue; } t_write_map(blkhist, (unsigned char *)blkid_ptr, curr_tn, 0); blkhist->blk_num = 0; /* create empty history for bitmap block */ if ((trans_num)0 == t_end(&alt_hist, NULL, TN_NOT_SPECIFIED)) continue; break; } } /* END scanning lmaps */ /* ======================================== PHASE 2 ======================================== */ assert(!csa->now_crit); for (;;) { /* wait for FREEZE, we don't want to truncate a frozen database */ grab_crit(gv_cur_region); if (FROZEN_CHILLED(cs_data)) DO_CHILLED_AUTORELEASE(csa, cs_data); if (!FROZEN(cs_data) && !IS_REPL_INST_FROZEN) break; rel_crit(gv_cur_region); while (FROZEN(cs_data) || IS_REPL_INST_FROZEN) { hiber_start(1000); if (FROZEN_CHILLED(cs_data) && CHILLED_AUTORELEASE(cs_data)) break; } } assert(csa->nl->trunc_pid == process_id); /* Flush pending updates to disk. If this is not done, old updates can be flushed AFTER ftruncate, extending the file. */ if (!wcs_flu(WCSFLU_FLUSH_HDR | WCSFLU_WRITE_EPOCH | WCSFLU_MSYNC_DB)) { assert(FALSE); gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(6) ERR_BUFFLUFAILED, 4, LEN_AND_LIT("MUPIP REORG TRUNCATE"), DB_LEN_STR(gv_cur_region)); rel_crit(gv_cur_region); return FALSE; } csa->nl->highest_lbm_with_busy_blk = MAX(found_busy_blk, csa->nl->highest_lbm_with_busy_blk); assert(IS_BITMAP_BLK(csa->nl->highest_lbm_with_busy_blk)); new_total = MIN(old_total, csa->nl->highest_lbm_with_busy_blk + BLKS_PER_LMAP); if (mu_ctrly_occurred || mu_ctrlc_occurred) { rel_crit(gv_cur_region); return TRUE; } else if (csa->ti->total_blks != old_total || new_total == old_total) { assert(csa->ti->total_blks >= old_total); /* Better have been an extend, not a truncate... */ gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(5) ERR_MUTRUNCNOSPACE, 3, REG_LEN_STR(gv_cur_region), truncate_percent); rel_crit(gv_cur_region); return TRUE; } else if (GDSVCURR != csd->desired_db_format || csd->blks_to_upgrd != 0 || !csd->fully_upgraded) { gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(4) ERR_MUTRUNCNOV4, 2, REG_LEN_STR(gv_cur_region)); rel_crit(gv_cur_region); return TRUE; } else if (SNAPSHOTS_IN_PROG(csa->nl)) { gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(4) ERR_MUTRUNCSSINPROG, 2, REG_LEN_STR(gv_cur_region)); rel_crit(gv_cur_region); return TRUE; } else if (BACKUP_NOT_IN_PROGRESS != cs_addrs->nl->nbb) { gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(4) ERR_MUTRUNCBACKINPROG, 2, REG_LEN_STR(gv_cur_region)); rel_crit(gv_cur_region); return TRUE; } DEFER_INTERRUPTS(INTRPT_IN_TRUNC, prev_intrpt_state); if (JNL_ENABLED(csa)) { /* Write JRT_TRUNC and INCTN records */ if (!jgbl.dont_reset_gbl_jrec_time) SET_GBL_JREC_TIME; /* needed before jnl_ensure_open as that can write jnl records */ jpc = csa->jnl; jbp = jpc->jnl_buff; /* Before writing to jnlfile, adjust jgbl.gbl_jrec_time if needed to maintain time order * of jnl records. This needs to be done BEFORE the jnl_ensure_open as that could write * journal records (if it decides to switch to a new journal file). 
*/ ADJUST_GBL_JREC_TIME(jgbl, jbp); jnl_status = jnl_ensure_open(gv_cur_region, csa); if (SS_NORMAL != jnl_status) send_msg_csa(CSA_ARG(csa) VARLSTCNT(6) jnl_status, 4, JNL_LEN_STR(csd), DB_LEN_STR(gv_cur_region)); else { if (0 == jpc->pini_addr) jnl_put_jrt_pini(csa); jnl_write_trunc_rec(csa, old_total, csa->ti->free_blocks, new_total); inctn_opcode = inctn_mu_reorg; jnl_write_inctn_rec(csa); jnl_status = jnl_flush(gv_cur_region); if (SS_NORMAL != jnl_status) { send_msg_csa(CSA_ARG(csa) VARLSTCNT(9) ERR_JNLFLUSH, 2, JNL_LEN_STR(csd), ERR_TEXT, 2, RTS_ERROR_TEXT("Error with journal flush during mu_truncate"), jnl_status); assert(NOJNL == jpc->channel); /* jnl file lost has been triggered */ } } } /* Good to go ahead and REALLY truncate (reduce total_blks, clear cache_array, FTRUNCATE) */ curr_tn = csa->ti->curr_tn; CHECK_TN(csa, csd, curr_tn); udi = FILE_INFO(gv_cur_region); /* Information used by recover_truncate to check if the file size and csa->ti->total_blks are INCONSISTENT */ trunc_file_size = BLK_ZERO_OFF(csd->start_vbn) + ((off_t)csd->blk_size * (new_total + 1)); csd->after_trunc_total_blks = new_total; csd->before_trunc_free_blocks = csa->ti->free_blocks; csd->before_trunc_total_blks = old_total; /* Flags interrupted truncate for recover_truncate */ /* file size and total blocks: INCONSISTENT */ csa->ti->total_blks = new_total; /* past the point of no return -- shared memory intact */ assert(csa->ti->free_blocks >= DELTA_FREE_BLOCKS(old_total, new_total)); csa->ti->free_blocks -= DELTA_FREE_BLOCKS(old_total, new_total); new_free = csa->ti->free_blocks; KILL_TRUNC_TEST(WBTEST_CRASH_TRUNCATE_1); /* 55 : Issue a kill -9 before 1st fsync */ fileheader_sync(gv_cur_region); DB_FSYNC(gv_cur_region, udi, csa, db_fsync_in_prog, save_errno); CHECK_DBSYNC(gv_cur_region, save_errno); /* past the point of no return -- shared memory deleted */ KILL_TRUNC_TEST(WBTEST_CRASH_TRUNCATE_2); /* 56 : Issue a kill -9 after 1st fsync */ clear_cache_array(csa, csd, gv_cur_region, new_total, old_total); offset = (off_t)BLK_ZERO_OFF(csd->start_vbn) + (off_t)new_total * csd->blk_size; save_errno = db_write_eof_block(udi, udi->fd, csd->blk_size, offset, &(TREF(dio_buff))); if (0 != save_errno) { err_msg = (char *)STRERROR(errno); rts_error_csa(CSA_ARG(csa) VARLSTCNT(6) ERR_MUTRUNCERROR, 4, REG_LEN_STR(gv_cur_region), LEN_AND_STR(err_msg)); return FALSE; } KILL_TRUNC_TEST(WBTEST_CRASH_TRUNCATE_3); /* 57 : Issue a kill -9 after reducing csa->ti->total_blks, before FTRUNCATE */ /* Execute an ftruncate() and truncate the DB file * ftruncate() is a SYSTEM CALL on almost all platforms (except SunOS) * It ignores kill -9 signal till its operation is completed. * So we can safely assume that the result of ftruncate() will be complete. */ FTRUNCATE(FILE_INFO(gv_cur_region)->fd, trunc_file_size, ftrunc_status); if (0 != ftrunc_status) { err_msg = (char *)STRERROR(errno); rts_error_csa(CSA_ARG(csa) VARLSTCNT(6) ERR_MUTRUNCERROR, 4, REG_LEN_STR(gv_cur_region), LEN_AND_STR(err_msg)); /* should go through recover_truncate now, which will again try to FTRUNCATE */ return FALSE; } /* file size and total blocks: CONSISTENT (shrunk) */ KILL_TRUNC_TEST(WBTEST_CRASH_TRUNCATE_4); /* 58 : Issue a kill -9 after FTRUNCATE, before 2nd fsync */ csa->nl->root_search_cycle++; /* Force concurrent processes to restart in t_end/tp_tend to make sure no one * tries to commit updates past the end of the file. Bitmap validations together * with highest_lbm_with_busy_blk should actually be sufficient, so this is * just to be safe. 
*/ csd->before_trunc_total_blks = 0; /* indicate CONSISTENT */ /* Increment TN */ assert(csa->ti->early_tn == csa->ti->curr_tn); csd->trans_hist.early_tn = csd->trans_hist.curr_tn + 1; INCREMENT_CURR_TN(csd); fileheader_sync(gv_cur_region); DB_FSYNC(gv_cur_region, udi, csa, db_fsync_in_prog, save_errno); KILL_TRUNC_TEST(WBTEST_CRASH_TRUNCATE_5); /* 59 : Issue a kill -9 after 2nd fsync */ CHECK_DBSYNC(gv_cur_region, save_errno); ENABLE_INTERRUPTS(INTRPT_IN_TRUNC, prev_intrpt_state); curr_tn = csa->ti->curr_tn; rel_crit(gv_cur_region); send_msg_csa(CSA_ARG(csa) VARLSTCNT(7) ERR_MUTRUNCSUCCESS, 5, DB_LEN_STR(gv_cur_region), old_total, new_total, &curr_tn); util_out_print("Truncated region: !AD. Reduced total blocks from [!UL] to [!UL]. Reduced free blocks from [!UL] to [!UL].", FLUSH, REG_LEN_STR(gv_cur_region), old_total, new_total, old_free, new_free); return TRUE; } /* END of mu_truncate() */
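/* Illustrative sketch (not part of the GT.M sources): after phase 2 above has flushed the cache, written an EOF
 * block at the new end and synced the file header, the actual shrink is a plain ftruncate() to an offset derived
 * from the new total block count. The helper below shows only that offset arithmetic and system call; hdr_bytes
 * stands in for the real BLK_ZERO_OFF(start_vbn) computation, which is defined by the database layout rather
 * than by this sketch.
 */
#include <unistd.h>

/* Shrink an open database file descriptor to hold new_total blocks (plus one trailing EOF block)
 * after a header of hdr_bytes bytes. Returns 0 on success, -1 on error. */
static int shrink_db_file(int fd, off_t hdr_bytes, off_t blk_size, off_t new_total)
{
	off_t	new_size = hdr_bytes + (blk_size * (new_total + 1));	/* +1 keeps the trailing EOF block */

	if (0 != fsync(fd))		/* make sure nothing dirty can be flushed past the new end later */
		return -1;
	return ftruncate(fd, new_size);
}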
uint4 jnl_file_lost(jnl_private_control *jpc, uint4 jnl_stat) { /* Notify operator and terminate journaling */ unsigned int status; sgmnt_addrs *csa; seq_num reg_seqno, jnlseqno; boolean_t was_lockid = FALSE, instfreeze_environ; DCL_THREADGBL_ACCESS; SETUP_THREADGBL_ACCESS; switch(jpc->region->dyn.addr->acc_meth) { case dba_mm: case dba_bg: csa = &FILE_INFO(jpc->region)->s_addrs; break; default: assertpro(FALSE && jpc->region->dyn.addr->acc_meth); } # ifdef VMS /* The following assert has been removed as it could be FALSE if the caller is "jnl_file_extend" * assert(0 != memcmp(csa->nl->jnl_file.jnl_file_id.fid, zero_fid, SIZEOF(zero_fid))); */ # endif assert(csa->now_crit); /* We issue an rts_error (instead of shutting off journaling) in the following cases : {BYPASSOK} * 1) $gtm_error_on_jnl_file_lost is set to issue runtime error (if not already issued) in case of journaling issues. * 2) The process has the given message set in $gtm_custom_errors (indicative of instance freeze on error setup) * in which case the goal is to never shut-off journaling */ UNIX_ONLY(assert(jnlpool.jnlpool_ctl == jnlpool_ctl)); UNIX_ONLY(instfreeze_environ = INST_FREEZE_ON_MSG_ENABLED(csa, jnl_stat)); VMS_ONLY(instfreeze_environ = FALSE); if ((JNL_FILE_LOST_ERRORS == TREF(error_on_jnl_file_lost)) || instfreeze_environ) { VMS_ONLY(assert(FALSE)); /* Not fully implemented / supported on VMS. */ if (!process_exiting || instfreeze_environ || !csa->jnl->error_reported) { csa->jnl->error_reported = TRUE; in_wcs_recover = FALSE; /* in case we're called in wcs_recover() */ if (SS_NORMAL != jpc->status) rts_error_csa(CSA_ARG(csa) VARLSTCNT(7) jnl_stat, 4, JNL_LEN_STR(csa->hdr), DB_LEN_STR(gv_cur_region), jpc->status); else rts_error_csa(CSA_ARG(csa) VARLSTCNT(6) jnl_stat, 4, JNL_LEN_STR(csa->hdr), DB_LEN_STR(gv_cur_region)); } return jnl_stat; } if (0 != jnl_stat) jnl_send_oper(jpc, jnl_stat); csa->hdr->jnl_state = jnl_closed; jpc->jnl_buff->cycle++; /* increment shared cycle so all future callers of jnl_ensure_open recognize journal switch */ assert(jpc->cycle < jpc->jnl_buff->cycle); if (REPL_ENABLED(csa->hdr)) { csa->hdr->repl_state = repl_was_open; reg_seqno = csa->hdr->reg_seqno; jnlseqno = (NULL != jnlpool.jnlpool_ctl) ? jnlpool.jnlpool_ctl->jnl_seqno : MAX_SEQNO; send_msg_csa(CSA_ARG(csa) VARLSTCNT(8) ERR_REPLJNLCLOSED, 6, DB_LEN_STR(jpc->region), &reg_seqno, &reg_seqno, &jnlseqno, &jnlseqno); } else send_msg_csa(CSA_ARG(csa) VARLSTCNT(5) ERR_JNLCLOSED, 3, DB_LEN_STR(jpc->region), &csa->ti->curr_tn); #ifdef VMS /* We can get a jnl_file_lost before the file is even created, so locking is done only if the lock exists */ if (0 != csa->jnl->jnllsb->lockid) { was_lockid = TRUE; status = gtm_enqw(EFN$C_ENF, LCK$K_EXMODE, csa->jnl->jnllsb, LCK$M_CONVERT | LCK$M_NODLCKBLK, NULL, 0, NULL, 0, NULL, PSL$C_USER, 0); if (SS$_NORMAL == status) status = csa->jnl->jnllsb->cond; } jnl_file_close(jpc->region, FALSE, FALSE); if (was_lockid) { if (SS$_NORMAL == status) status = gtm_deq(csa->jnl->jnllsb->lockid, NULL, PSL$C_USER, 0); assertpro(SS$_NORMAL == status); } # else jnl_file_close(jpc->region, FALSE, FALSE); #endif return EXIT_NRM; }
void gv_rundown(void) { gd_region *r_top, *r_save, *r_local; gd_addr *addr_ptr; sgm_info *si; int4 rundown_status = EXIT_NRM; /* if gds_rundown went smoothly */ # ifdef VMS vms_gds_info *gds_info; # elif UNIX unix_db_info *udi; # endif #if defined(DEBUG) && defined(UNIX) sgmnt_addrs *csa; # endif DCL_THREADGBL_ACCESS; SETUP_THREADGBL_ACCESS; r_save = gv_cur_region; /* Save for possible core dump */ gvcmy_rundown(); ENABLE_AST if (pool_init) rel_lock(jnlpool.jnlpool_dummy_reg); for (addr_ptr = get_next_gdr(NULL); addr_ptr; addr_ptr = get_next_gdr(addr_ptr)) { for (r_local = addr_ptr->regions, r_top = r_local + addr_ptr->n_regions; r_local < r_top; r_local++) { if (r_local->open && !r_local->was_open && dba_cm != r_local->dyn.addr->acc_meth) { /* Rundown has already occurred for GT.CM client regions through gvcmy_rundown() above. * Hence the (dba_cm != ...) check in the if above. Note that for GT.CM client regions, * region->open is TRUE although cs_addrs is NULL. */ # if defined(DEBUG) && defined(UNIX) if (is_jnlpool_creator && ANTICIPATORY_FREEZE_AVAILABLE && TREF(gtm_test_fake_enospc)) { /* Clear ENOSPC faking now that we are running down */ csa = REG2CSA(r_local); if (csa->nl->fake_db_enospc || csa->nl->fake_jnl_enospc) { send_msg_csa(CSA_ARG(NULL) VARLSTCNT(8) ERR_TEXT, 2, DB_LEN_STR(r_local), ERR_TEXT, 2, LEN_AND_LIT("Resetting fake_db_enospc and fake_jnl_enospc")); csa->nl->fake_db_enospc = FALSE; csa->nl->fake_jnl_enospc = FALSE; } } # endif gv_cur_region = r_local; tp_change_reg(); UNIX_ONLY(rundown_status |=) gds_rundown(); /* Now that gds_rundown is done, free up the memory associated with the region. * Ideally the following memory freeing code should go to gds_rundown, but * GT.CM calls gds_rundown() and we want to reuse memory for GT.CM. */ if (NULL != cs_addrs) { if (NULL != cs_addrs->dir_tree) FREE_CSA_DIR_TREE(cs_addrs); if (cs_addrs->sgm_info_ptr) { si = cs_addrs->sgm_info_ptr; /* It is possible we got interrupted before initializing all fields of "si" * completely so account for NULL values while freeing/releasing those fields. 
*/ assert((si->tp_csa == cs_addrs) || (NULL == si->tp_csa)); if (si->jnl_tail) { CAREFUL_FREEUP_BUDDY_LIST(si->format_buff_list); CAREFUL_FREEUP_BUDDY_LIST(si->jnl_list); } CAREFUL_FREEUP_BUDDY_LIST(si->recompute_list); CAREFUL_FREEUP_BUDDY_LIST(si->new_buff_list); CAREFUL_FREEUP_BUDDY_LIST(si->tlvl_info_list); CAREFUL_FREEUP_BUDDY_LIST(si->tlvl_cw_set_list); CAREFUL_FREEUP_BUDDY_LIST(si->cw_set_list); if (NULL != si->blks_in_use) { free_hashtab_int4(si->blks_in_use); free(si->blks_in_use); si->blks_in_use = NULL; } if (si->cr_array_size) { assert(NULL != si->cr_array); if (NULL != si->cr_array) free(si->cr_array); } if (NULL != si->first_tp_hist) free(si->first_tp_hist); free(si); } if (cs_addrs->jnl) { assert(&FILE_INFO(cs_addrs->jnl->region)->s_addrs == cs_addrs); if (cs_addrs->jnl->jnllsb) { UNIX_ONLY(assert(FALSE)); free(cs_addrs->jnl->jnllsb); } free(cs_addrs->jnl); } GTMCRYPT_ONLY( if (cs_addrs->encrypted_blk_contents) free(cs_addrs->encrypted_blk_contents); ) } assert(gv_cur_region->dyn.addr->file_cntl->file_info); VMS_ONLY( gds_info = (vms_gds_info *)gv_cur_region->dyn.addr->file_cntl->file_info; if (gds_info->xabpro) free(gds_info->xabpro); if (gds_info->xabfhc) free(gds_info->xabfhc); if (gds_info->nam) { free(gds_info->nam->nam$l_esa); free(gds_info->nam); } if (gds_info->fab) free(gds_info->fab); ) free(gv_cur_region->dyn.addr->file_cntl->file_info); free(gv_cur_region->dyn.addr->file_cntl); } r_local->open = r_local->was_open = FALSE; } }
uint4 gdsfilext(uint4 blocks, uint4 filesize, boolean_t trans_in_prog) { sm_uc_ptr_t old_base[2], mmap_retaddr; boolean_t was_crit, is_mm; int result, save_errno, status; DEBUG_ONLY(int first_save_errno); uint4 new_bit_maps, bplmap, map, new_blocks, new_total, max_tot_blks, old_total; uint4 jnl_status; gtm_uint64_t avail_blocks, mmap_sz; off_t new_eof, new_size; trans_num curr_tn; unix_db_info *udi; inctn_opcode_t save_inctn_opcode; int4 prev_extend_blks_to_upgrd; jnl_private_control *jpc; jnl_buffer_ptr_t jbp; cache_rec_ptr_t cr; DCL_THREADGBL_ACCESS; SETUP_THREADGBL_ACCESS; assert(!IS_DSE_IMAGE); assert((cs_addrs->nl == NULL) || (process_id != cs_addrs->nl->trunc_pid)); /* mu_truncate shouldn't extend file... */ assert(!process_exiting); DEBUG_ONLY(old_base[0] = old_base[1] = NULL); assert(!gv_cur_region->read_only); udi = FILE_INFO(gv_cur_region); is_mm = (dba_mm == cs_addrs->hdr->acc_meth); # if !defined(MM_FILE_EXT_OK) if (!udi->grabbed_access_sem && is_mm) return (uint4)(NO_FREE_SPACE); /* should this be changed to show extension not allowed ? */ # endif /* Both blocks and total blocks are unsigned ints so make sure we aren't asking for huge numbers that will overflow and end up doing silly things. */ assert((blocks <= (MAXTOTALBLKS(cs_data) - cs_data->trans_hist.total_blks)) || WBTEST_ENABLED(WBTEST_FILE_EXTEND_ERROR)); # if defined(__sun) || defined(__hpux) cs_data->defer_allocate = TRUE; # endif if (!blocks && (cs_data->defer_allocate || (TRANS_IN_PROG_TRUE == trans_in_prog))) return (uint4)(NO_FREE_SPACE); /* should this be changed to show extension not enabled ? */ bplmap = cs_data->bplmap; /* New total of non-bitmap blocks will be number of current, non-bitmap blocks, plus new blocks desired * There are (bplmap - 1) non-bitmap blocks per bitmap, so add (bplmap - 2) to number of non-bitmap blocks * and divide by (bplmap - 1) to get total number of bitmaps for expanded database. (must round up in this * manner as every non-bitmap block must have an associated bitmap) * Current number of bitmaps is (total number of current blocks + bplmap - 1) / bplmap. * Subtract current number of bitmaps from number needed for expanded database to get number of new bitmaps needed. */ new_bit_maps = DIVIDE_ROUND_UP(cs_data->trans_hist.total_blks - DIVIDE_ROUND_UP(cs_data->trans_hist.total_blks, bplmap) + blocks, bplmap - 1) - DIVIDE_ROUND_UP(cs_data->trans_hist.total_blks, bplmap); new_blocks = blocks + new_bit_maps; assert((0 < (int)new_blocks) || (!cs_data->defer_allocate && (0 == new_blocks))); if (new_blocks + cs_data->trans_hist.total_blks > MAXTOTALBLKS(cs_data)) { assert(WBTEST_ENABLED(WBTEST_FILE_EXTEND_ERROR)); send_msg_csa(CSA_ARG(cs_addrs) VARLSTCNT(1) ERR_TOTALBLKMAX); return (uint4)(NO_FREE_SPACE); } if (0 != (save_errno = disk_block_available(udi->fd, &avail_blocks, FALSE))) { send_msg_csa(CSA_ARG(cs_addrs) VARLSTCNT(5) ERR_DBFILERR, 2, DB_LEN_STR(gv_cur_region), save_errno); rts_error_csa(CSA_ARG(cs_addrs) VARLSTCNT(5) ERR_DBFILERR, 2, DB_LEN_STR(gv_cur_region), save_errno); } else { if (!(gtmDebugLevel & GDL_IgnoreAvailSpace)) { /* Bypass this space check if debug flag above is on. Allows us to create a large sparse DB * in space where it could never fit if it were not sparse. Needed for some tests. 
*/ avail_blocks = avail_blocks / (cs_data->blk_size / DISK_BLOCK_SIZE); if ((blocks * EXTEND_WARNING_FACTOR) > avail_blocks) { if (blocks > (uint4)avail_blocks) { if (!INST_FREEZE_ON_NOSPC_ENABLED(cs_addrs)) return (uint4)(NO_FREE_SPACE); else send_msg_csa(CSA_ARG(cs_addrs) VARLSTCNT(6) MAKE_MSG_WARNING(ERR_NOSPACEEXT), 4, DB_LEN_STR(gv_cur_region), new_blocks, (uint4)avail_blocks); } else send_msg_csa(CSA_ARG(cs_addrs) VARLSTCNT(5) ERR_DSKSPACEFLOW, 3, DB_LEN_STR(gv_cur_region), (uint4)(avail_blocks - ((new_blocks <= avail_blocks) ? new_blocks : 0))); } } } # ifdef DEBUG if (WBTEST_ENABLED(WBTEST_MM_CONCURRENT_FILE_EXTEND) && dollar_tlevel && !MEMCMP_LIT(gv_cur_region->rname, "DEFAULT")) { SYSTEM("$gtm_dist/mumps -run $gtm_wbox_mrtn"); assert(1 == cs_addrs->nl->wbox_test_seq_num); /* should have been set by mubfilcpy */ cs_addrs->nl->wbox_test_seq_num = 2; /* signal mupip backup to stop sleeping in mubfilcpy */ } # endif /* From here on, we need to use GDSFILEXT_CLNUP before returning to the caller */ was_crit = cs_addrs->now_crit; assert(!cs_addrs->hold_onto_crit || was_crit); /* If we are coming from mupip_extend (which gets crit itself) we better have waited for any unfreezes to occur. * If we are coming from online rollback (when that feature is available), we will come in holding crit and in * the final retry. In that case too, we expect to have waited for unfreezes to occur in the caller itself. * Therefore if we are coming in holding crit from MUPIP, we expect the db to be unfrozen so no need to wait for * freeze. * If we are coming from GT.M and final retry (in which case we come in holding crit) we expect to have waited * for any unfreezes (by invoking tp_crit_all_regions) to occur (TP or non-TP) before coming into this * function. However, there is one exception. In the final retry, if tp_crit_all_regions notices that * at least one of the participating regions did ONLY READs, it will not wait for any freeze on THAT region * to complete before grabbing crit. Later, in the final retry, if THAT region did an update which caused * op_tcommit to invoke bm_getfree->gdsfilext, then we would have come here with a frozen region on which * we hold crit. */ assert(!was_crit || !FROZEN_HARD(cs_data) || (dollar_tlevel && (CDB_STAGNATE <= t_tries))); /* * If we are in the final retry and already hold crit, it is possible that csa->nl->wc_blocked is also set to TRUE * (by a concurrent process in phase2 which encountered an error in the midst of commit and secshr_db_clnup * finished the job for it). In this case we do NOT want to invoke wcs_recover as that will update the "bt" * transaction numbers without correspondingly updating the history transaction numbers (effectively causing * a cdb_sc_blkmod type of restart). Therefore do NOT call grab_crit (which unconditionally invokes wcs_recover) * if we already hold crit. */ if (!was_crit) { for ( ; ; ) { grab_crit(gv_cur_region); if (FROZEN_CHILLED(cs_data)) DO_CHILLED_AUTORELEASE(cs_addrs, cs_data); if (!FROZEN(cs_data) && !IS_REPL_INST_FROZEN) break; rel_crit(gv_cur_region); while (FROZEN(cs_data) || IS_REPL_INST_FROZEN) { hiber_start(1000); if (FROZEN_CHILLED(cs_data) && CHILLED_AUTORELEASE(cs_data)) break; } } } else if (FROZEN_HARD(cs_data) && dollar_tlevel) { /* We don't want to continue with file extension as explained above. Hence return with an error code which * op_tcommit will recognize (as a cdb_sc_needcrit/cdb_sc_instancefreeze type of restart) and restart accordingly. 
*/ assert(CDB_STAGNATE <= t_tries); GDSFILEXT_CLNUP; return (uint4)FINAL_RETRY_FREEZE_PROG; } else WAIT_FOR_REGION_TO_UNCHILL(cs_addrs, cs_data); if (IS_REPL_INST_FROZEN && trans_in_prog) { assert(CDB_STAGNATE <= t_tries); GDSFILEXT_CLNUP; return (uint4)FINAL_RETRY_INST_FREEZE; } assert(cs_addrs->ti->total_blks == cs_data->trans_hist.total_blks); old_total = cs_data->trans_hist.total_blks; if (old_total != filesize) { /* Somebody else has already extended it, since we are in crit, this is trust-worthy. However, in case of MM, * we still need to remap the database */ assert((old_total > filesize) || !is_mm); /* For BG, someone else could have truncated or extended - we have no idea */ GDSFILEXT_CLNUP; return (SS_NORMAL); } if (trans_in_prog && SUSPICIOUS_EXTEND) { if (!was_crit) { GDSFILEXT_CLNUP; return (uint4)(EXTEND_SUSPECT); } /* If free_blocks counter is not ok, then correct it. Do the check again. If still fails, then it means we held * crit through bm_getfree into gdsfilext and still didn't get it right. */ assertpro(!is_free_blks_ctr_ok() && !SUSPICIOUS_EXTEND); } if (JNL_ENABLED(cs_data)) { if (!jgbl.dont_reset_gbl_jrec_time) SET_GBL_JREC_TIME; /* needed before jnl_ensure_open as that can write jnl records */ jpc = cs_addrs->jnl; jbp = jpc->jnl_buff; /* Before writing to jnlfile, adjust jgbl.gbl_jrec_time if needed to maintain time order * of jnl records. This needs to be done BEFORE the jnl_ensure_open as that could write * journal records (if it decides to switch to a new journal file). */ ADJUST_GBL_JREC_TIME(jgbl, jbp); jnl_status = jnl_ensure_open(gv_cur_region, cs_addrs); if (jnl_status) { GDSFILEXT_CLNUP; send_msg_csa(CSA_ARG(cs_addrs) VARLSTCNT(6) jnl_status, 4, JNL_LEN_STR(cs_data), DB_LEN_STR(gv_cur_region)); return (uint4)(NO_FREE_SPACE); /* should have better return status */ } } if (is_mm) { cs_addrs->nl->mm_extender_pid = process_id; status = wcs_wtstart(gv_cur_region, 0, NULL, NULL); cs_addrs->nl->mm_extender_pid = 0; assertpro(SS_NORMAL == status); old_base[0] = cs_addrs->db_addrs[0]; old_base[1] = cs_addrs->db_addrs[1]; cs_addrs->db_addrs[0] = NULL; /* don't rely on it until the mmap below */ # ifdef _AIX status = shmdt(old_base[0] - BLK_ZERO_OFF(cs_data->start_vbn)); # else status = munmap((caddr_t)old_base[0], (size_t)(old_base[1] - old_base[0])); # endif if (0 != status) { save_errno = errno; GDSFILEXT_CLNUP; send_msg_csa(CSA_ARG(cs_addrs) VARLSTCNT(12) ERR_DBFILERR, 2, DB_LEN_STR(gv_cur_region), ERR_SYSCALL, 5, LEN_AND_STR(MEM_UNMAP_SYSCALL), CALLFROM, save_errno); return (uint4)(NO_FREE_SPACE); } } else { /* Due to concurrency issues, it is possible some process had issued a disk read of the GDS block# corresponding * to "old_total" right after a truncate wrote a GDS-block of zeros on disk (to signal end of the db file). * If so, the global buffer containing this block needs to be invalidated now as part of the extend. If not, it is * possible the EOF block on disk is now going to be overwritten by a properly initialized bitmap block (as part * of the gdsfilext below) while the global buffer continues to have an incorrect copy of that bitmap block and * this in turn would cause XXXX failures due to a bad bitmap block in shared memory. 
(GTM-7519) */ cr = db_csh_get((block_id)old_total); if ((NULL != cr) && ((cache_rec_ptr_t)CR_NOTVALID != cr)) { assert((0 == cr->dirty) && (0 == cr->bt_index) && !cr->stopped); cr->cycle++; cr->blk = CR_BLKEMPTY; } } CHECK_TN(cs_addrs, cs_data, cs_data->trans_hist.curr_tn); /* can issue rts_error TNTOOLARGE */ new_total = old_total + new_blocks; new_eof = BLK_ZERO_OFF(cs_data->start_vbn) + ((off_t)new_total * cs_data->blk_size); # if !defined(__sun) && !defined(__hpux) if (!cs_data->defer_allocate) { new_size = new_eof + cs_data->blk_size; save_errno = posix_fallocate(udi->fd, 0, new_size); DEBUG_ONLY(first_save_errno = save_errno); if ((ENOSPC == save_errno) && IS_GTM_IMAGE) save_errno = extend_wait_for_fallocate(udi, new_size); if (0 != save_errno) { GDSFILEXT_CLNUP; assert(ENOSPC == save_errno); if (ENOSPC != save_errno) send_msg_csa(CSA_ARG(cs_addrs) VARLSTCNT(5) ERR_PREALLOCATEFAIL, 2, DB_LEN_STR(gv_cur_region), save_errno); return (uint4)(NO_FREE_SPACE); } } # endif save_errno = db_write_eof_block(udi, udi->fd, cs_data->blk_size, new_eof, &(TREF(dio_buff))); if ((ENOSPC == save_errno) && IS_GTM_IMAGE) save_errno = extend_wait_for_write(udi, cs_data->blk_size, new_eof); if (0 != save_errno) { GDSFILEXT_CLNUP; if (ENOSPC != save_errno) send_msg_csa(CSA_ARG(cs_addrs) VARLSTCNT(5) ERR_DBFILERR, 2, DB_LEN_STR(gv_cur_region), save_errno); return (uint4)(NO_FREE_SPACE); } if (WBTEST_ENABLED(WBTEST_FILE_EXTEND_INTERRUPT_1)) { LONG_SLEEP(600); assert(FALSE); } /* Ensure the EOF and metadata get to disk BEFORE any bitmap writes. Otherwise, the file size could no longer reflect * a proper extent and subsequent invocations of gdsfilext could corrupt the database. */ if (!IS_STATSDB_CSA(cs_addrs)) { GTM_DB_FSYNC(cs_addrs, udi->fd, status); assert(0 == status); if (0 != status) { GDSFILEXT_CLNUP; send_msg_csa(CSA_ARG(cs_addrs) VARLSTCNT(8) ERR_DBFILERR, 5, RTS_ERROR_LITERAL("fsync1()"), CALLFROM, status); return (uint4)(NO_FREE_SPACE); } } if (WBTEST_ENABLED(WBTEST_FILE_EXTEND_INTERRUPT_2)) { LONG_SLEEP(600); assert(FALSE); /* Should be killed before that */ } DEBUG_ONLY(prev_extend_blks_to_upgrd = cs_data->blks_to_upgrd;)
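/* Editor's illustrative sketch (not part of the GT.M sources): a standalone check of the bitmap arithmetic used by
 * gdsfilext() above.  DIVIDE_ROUND_UP is re-declared locally with the usual round-up definition this code appears to
 * assume, and the sample numbers (512 blocks per local bitmap, 1024 current blocks, 100 requested blocks) are hypothetical.
 */
#include <stdio.h>

#define DIVIDE_ROUND_UP(VALUE, MODULUS)	(((VALUE) + (MODULUS) - 1) / (MODULUS))

int main(void)
{
	unsigned int	bplmap = 512;		/* blocks per local bitmap */
	unsigned int	total_blks = 1024;	/* current total blocks (includes 2 bitmaps) */
	unsigned int	blocks = 100;		/* non-bitmap blocks requested by the caller */
	unsigned int	new_bit_maps, new_blocks;

	/* Same arithmetic as gdsfilext: convert the current total to a non-bitmap count, add the requested blocks,
	 * round up to whole bitmaps ((bplmap - 1) data blocks per bitmap), then subtract the bitmaps the database
	 * already has.
	 */
	new_bit_maps = DIVIDE_ROUND_UP(total_blks - DIVIDE_ROUND_UP(total_blks, bplmap) + blocks, bplmap - 1)
			- DIVIDE_ROUND_UP(total_blks, bplmap);
	new_blocks = blocks + new_bit_maps;
	printf("new_bit_maps = %u, new_blocks = %u\n", new_bit_maps, new_blocks);	/* prints 1 and 101 */
	return 0;
}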
int4 gds_rundown(void) { boolean_t canceled_dbsync_timer, canceled_flush_timer, ok_to_write_pfin; boolean_t have_standalone_access, ipc_deleted, err_caught; boolean_t is_cur_process_ss_initiator, remove_shm, vermismatch, we_are_last_user, we_are_last_writer, is_mm; boolean_t unsafe_last_writer; char time_str[CTIME_BEFORE_NL + 2]; /* for GET_CUR_TIME macro */ gd_region *reg; int save_errno, status, rc; int4 semval, ftok_semval, sopcnt, ftok_sopcnt; short crash_count; sm_long_t munmap_len; sgmnt_addrs *csa; sgmnt_data_ptr_t csd; node_local_ptr_t cnl; struct shmid_ds shm_buf; struct sembuf sop[2], ftok_sop[2]; uint4 jnl_status; unix_db_info *udi; jnl_private_control *jpc; jnl_buffer_ptr_t jbp; shm_snapshot_t *ss_shm_ptr; uint4 ss_pid, onln_rlbk_pid, holder_pid; boolean_t was_crit; boolean_t safe_mode; /* Do not flush or take down shared memory. */ boolean_t bypassed_ftok = FALSE, bypassed_access = FALSE, may_bypass_ftok, inst_is_frozen, ftok_counter_halted, access_counter_halted; int secshrstat; intrpt_state_t prev_intrpt_state; DCL_THREADGBL_ACCESS; SETUP_THREADGBL_ACCESS; jnl_status = 0; reg = gv_cur_region; /* Local copy */ /* early out for cluster regions * to avoid tripping the assert below. * Note: * This early out is consistent with VMS. It has been * noted that all of the gtcm assignments * to gv_cur_region should use the TP_CHANGE_REG * macro. This would also avoid the assert problem * and should be done eventually. */ if (dba_cm == reg->dyn.addr->acc_meth) return EXIT_NRM; udi = FILE_INFO(reg); csa = &udi->s_addrs; csd = csa->hdr; assert(csa == cs_addrs && csd == cs_data); if ((reg->open) && (dba_usr == csd->acc_meth)) { change_reg(); gvusr_rundown(); return EXIT_NRM; } /* If the process has standalone access, it has udi->grabbed_access_sem set to TRUE at this point. Note that down in a local * variable as the udi->grabbed_access_sem is set to TRUE even for non-standalone access below and hence we can't rely on * that later to determine if the process had standalone access or not when it entered this function. We need to guarantee * that no one else accesses the database file header when semid/shmid fields are reset. We already have created the ftok semaphore in * db_init or mu_rndwn_file and did not remove it. So just lock it. We do it in blocking mode. */ have_standalone_access = udi->grabbed_access_sem; /* process holds standalone access */ DEFER_INTERRUPTS(INTRPT_IN_GDS_RUNDOWN, prev_intrpt_state); ESTABLISH_NORET(gds_rundown_ch, err_caught); if (err_caught) { REVERT; WITH_CH(gds_rundown_ch, gds_rundown_err_cleanup(have_standalone_access), 0); ENABLE_INTERRUPTS(INTRPT_IN_GDS_RUNDOWN, prev_intrpt_state); DEBUG_ONLY(ok_to_UNWIND_in_exit_handling = FALSE); return EXIT_ERR; } assert(reg->open); /* if we failed to open, dbinit_ch should have taken care of proper clean up */ assert(!reg->opening); /* see comment above */ assert((dba_bg == csd->acc_meth) || (dba_mm == csd->acc_meth)); is_mm = (dba_bg != csd->acc_meth); assert(!csa->hold_onto_crit || (csa->now_crit && jgbl.onlnrlbk)); /* If we are online rollback, we should already be holding crit and should release it only at the end of this module. This * is usually done by noting down csa->now_crit in a local variable (was_crit) and using it whenever we are about to * grab_crit. But, there are instances (like mupip_set_journal.c) where we grab_crit but invoke gds_rundown without any * preceding rel_crit. Such code relies on the fact that gds_rundown does rel_crit unconditionally (to get locks to a known * state). 
So, augment csa->now_crit with jgbl.onlnrlbk to track if we can rel_crit unconditionally or not in gds_rundown. */ was_crit = (csa->now_crit && jgbl.onlnrlbk); /* Cancel any pending flush timer for this region by this task */ canceled_flush_timer = FALSE; canceled_dbsync_timer = FALSE; CANCEL_DB_TIMERS(reg, csa, canceled_flush_timer, canceled_dbsync_timer); we_are_last_user = FALSE; inst_is_frozen = IS_REPL_INST_FROZEN && REPL_ALLOWED(csa->hdr); if (!csa->persistent_freeze) region_freeze(reg, FALSE, FALSE, FALSE); if (!was_crit) { rel_crit(reg); /* get locks to known state */ mutex_cleanup(reg); } /* The only process that can invoke gds_rundown while holding access control semaphore is RECOVER/ROLLBACK. All the others * (like MUPIP SET -FILE/MUPIP EXTEND) would have invoked db_ipcs_reset() before invoking gds_rundown (from * mupip_exit_handler). The only exception is when these processes encounter a terminate signal and they reach * mupip_exit_handler while holding access control semaphore. Assert accordingly. */ assert(!have_standalone_access || mupip_jnl_recover || process_exiting); /* If we have standalone access, then ensure that a concurrent online rollback cannot be running at the same time as it * needs the access control lock as well. The only exception is when we are online rollback and currently running down. */ cnl = csa->nl; onln_rlbk_pid = cnl->onln_rlbk_pid; assert(!have_standalone_access || mupip_jnl_recover || !onln_rlbk_pid || !is_proc_alive(onln_rlbk_pid, 0)); if (!have_standalone_access) { if (-1 == (ftok_semval = semctl(udi->ftok_semid, DB_COUNTER_SEM, GETVAL))) /* Check # of procs counted on FTOK */ { save_errno = errno; assert(FALSE); rts_error_csa(CSA_ARG(csa) VARLSTCNT(12) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg), ERR_SYSCALL, 5, RTS_ERROR_TEXT("gds_rundown SEMCTL failed to get ftok_semval"), CALLFROM, errno); } may_bypass_ftok = CAN_BYPASS(ftok_semval, csd, inst_is_frozen); /* Do we need a blocking wait? */ /* We need to guarantee that no one else accesses the database file header when semid/shmid fields are reset. * We already have created ftok semaphore in db_init or mu_rndwn_file and did not remove it. So just lock it. */ if (!ftok_sem_lock(reg, may_bypass_ftok)) { if (may_bypass_ftok) { /* We did a non-blocking wait. It's ok to proceed without locking */ bypassed_ftok = TRUE; holder_pid = semctl(udi->ftok_semid, DB_CONTROL_SEM, GETPID); if ((uint4)-1 == holder_pid) rts_error_csa(CSA_ARG(csa) VARLSTCNT(12) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg), ERR_SYSCALL, 5, RTS_ERROR_TEXT("gds_rundown SEMCTL failed to get holder_pid"), CALLFROM, errno); if (!IS_GTM_IMAGE) /* MUMPS processes should not flood syslog with bypass messages. */ { send_msg_csa(CSA_ARG(csa) VARLSTCNT(12) ERR_RESRCINTRLCKBYPAS, 10, LEN_AND_STR(gtmImageNames[image_type].imageName), process_id, LEN_AND_LIT("FTOK"), REG_LEN_STR(reg), DB_LEN_STR(reg), holder_pid); send_msg_csa(CSA_ARG(NULL) VARLSTCNT(4) ERR_TEXT, 2, LEN_AND_LIT("FTOK bypassed at rundown")); } } else { /* We did a blocking wait but something bad happened. 
*/ FTOK_TRACE(csa, csa->ti->curr_tn, ftok_ops_lock, process_id); rts_error_csa(CSA_ARG(csa) VARLSTCNT(4) ERR_DBFILERR, 2, DB_LEN_STR(reg)); } } sop[0].sem_num = DB_CONTROL_SEM; sop[0].sem_op = 0; /* Wait for 0 */ sop[1].sem_num = DB_CONTROL_SEM; sop[1].sem_op = 1; /* Lock */ sopcnt = 2; sop[0].sem_flg = sop[1].sem_flg = SEM_UNDO | IPC_NOWAIT; /* Don't wait the first time thru */ SEMOP(udi->semid, sop, sopcnt, status, NO_WAIT); if (0 != status) { save_errno = errno; /* Check # of processes counted on access sem. */ if (-1 == (semval = semctl(udi->semid, DB_COUNTER_SEM, GETVAL))) { assert(FALSE); rts_error_csa(CSA_ARG(csa) VARLSTCNT(12) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg), ERR_SYSCALL, 5, RTS_ERROR_TEXT("gds_rundown SEMCTL failed to get semval"), CALLFROM, errno); } bypassed_access = CAN_BYPASS(semval, csd, inst_is_frozen) || onln_rlbk_pid || csd->file_corrupt; /* Before attempting again in the blocking mode, see if the holding process is an online rollback. * If so, it is likely we won't get the access control semaphore anytime soon. In that case, we * are better off skipping rundown and continuing with sanity cleanup and exit. */ holder_pid = semctl(udi->semid, DB_CONTROL_SEM, GETPID); if ((uint4)-1 == holder_pid) rts_error_csa(CSA_ARG(csa) VARLSTCNT(12) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg), ERR_SYSCALL, 5, RTS_ERROR_TEXT("gds_rundown SEMCTL failed to get holder_pid"), CALLFROM, errno); if (!bypassed_access) { /* We couldn't get it in one shot -- see if we already have it */ if (holder_pid == process_id) { send_msg_csa(CSA_ARG(csa) VARLSTCNT(5) MAKE_MSG_INFO(ERR_CRITSEMFAIL), 2, DB_LEN_STR(reg), ERR_RNDWNSEMFAIL); REVERT; ENABLE_INTERRUPTS(INTRPT_IN_GDS_RUNDOWN, prev_intrpt_state); assert(FALSE); return EXIT_ERR; } if (EAGAIN != save_errno) { assert(FALSE); rts_error_csa(CSA_ARG(csa) VARLSTCNT(12) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg), ERR_SYSCALL, 5, RTS_ERROR_TEXT("gds_rundown SEMOP on access control semaphore"), CALLFROM, save_errno); } sop[0].sem_flg = sop[1].sem_flg = SEM_UNDO; /* Try again - blocking this time */ SEMOP(udi->semid, sop, 2, status, FORCED_WAIT); if (-1 == status) /* We couldn't get it at all.. */ rts_error_csa(CSA_ARG(csa) VARLSTCNT(12) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg), ERR_SYSCALL, 5, RTS_ERROR_TEXT("gds_rundown SEMOP on access control semaphore"), CALLFROM, errno); } else if (!IS_GTM_IMAGE) { send_msg_csa(CSA_ARG(csa) VARLSTCNT(12) ERR_RESRCINTRLCKBYPAS, 10, LEN_AND_STR(gtmImageNames[image_type].imageName), process_id, LEN_AND_LIT("access control"), REG_LEN_STR(reg), DB_LEN_STR(reg), holder_pid); send_msg_csa(CSA_ARG(NULL) VARLSTCNT(4) ERR_TEXT, 2, LEN_AND_LIT("Access control bypassed at rundown")); } udi->grabbed_access_sem = !bypassed_access; } } /* else we hold the access control semaphore and therefore have standalone access. We do not release it now - we * release it later in mupip_exit_handler.c. Since we already hold the access control semaphore, we don't need the * ftok semaphore and trying it could cause deadlock */ /* Note that in the case of online rollback, "udi->grabbed_access_sem" (and in turn "have_standalone_access") is TRUE. * But there could be other processes still having the database open so we cannot safely reset the halted fields. */ if (have_standalone_access && !jgbl.onlnrlbk) csd->ftok_counter_halted = csd->access_counter_halted = FALSE; ftok_counter_halted = csd->ftok_counter_halted; access_counter_halted = csd->access_counter_halted; /* If we bypassed any of the semaphores, activate safe mode. 
* Also, if the replication instance is frozen and this db has replication turned on (which means * no flushes of dirty buffers to this db can happen while the instance is frozen) activate safe mode. */ ok_to_write_pfin = !(bypassed_access || bypassed_ftok || inst_is_frozen); safe_mode = !ok_to_write_pfin || ftok_counter_halted || access_counter_halted; /* At this point we are guaranteed no one else is doing a db_init/rundown as we hold the access control semaphore */ assert(csa->ref_cnt); /* decrement private ref_cnt before shared ref_cnt decrement. */ csa->ref_cnt--; /* Currently journaling logic in gds_rundown() in VMS relies on this order to detect last writer */ assert(!csa->ref_cnt); --cnl->ref_cnt; if (memcmp(cnl->now_running, gtm_release_name, gtm_release_name_len + 1)) { /* VERMISMATCH condition. Possible only if DSE */ assert(dse_running); vermismatch = TRUE; } else vermismatch = FALSE; if (-1 == shmctl(udi->shmid, IPC_STAT, &shm_buf)) { save_errno = errno; rts_error_csa(CSA_ARG(csa) VARLSTCNT(12) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg), ERR_SYSCALL, 5, RTS_ERROR_TEXT("gds_rundown shmctl"), CALLFROM, save_errno); } else we_are_last_user = (1 == shm_buf.shm_nattch) && !vermismatch && !safe_mode; /* recover => one user except ONLINE ROLLBACK, or standalone with frozen instance */ assert(!have_standalone_access || we_are_last_user || jgbl.onlnrlbk || inst_is_frozen); if (-1 == (semval = semctl(udi->semid, DB_COUNTER_SEM, GETVAL))) rts_error_csa(CSA_ARG(csa) VARLSTCNT(12) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg), ERR_SYSCALL, 5, RTS_ERROR_TEXT("gds_rundown SEMCTL failed to get semval"), CALLFROM, errno); /* There's one writer left and I am it */ assert(reg->read_only || semval >= 0); unsafe_last_writer = (DB_COUNTER_SEM_INCR == semval) && (FALSE == reg->read_only) && !vermismatch; we_are_last_writer = unsafe_last_writer && !safe_mode; assert(!we_are_last_writer || !safe_mode); assert(!we_are_last_user || !safe_mode); /* recover + R/W region => one writer except ONLINE ROLLBACK, or standalone with frozen instance, leading to safe_mode */ assert(!(have_standalone_access && !reg->read_only) || we_are_last_writer || jgbl.onlnrlbk || inst_is_frozen); GTM_WHITE_BOX_TEST(WBTEST_ANTIFREEZE_JNLCLOSE, we_are_last_writer, 1); /* Assume we are the last writer to invoke wcs_flu */ if (!have_standalone_access && (-1 == (ftok_semval = semctl(udi->ftok_semid, DB_COUNTER_SEM, GETVAL)))) rts_error_csa(CSA_ARG(csa) VARLSTCNT(12) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg), ERR_SYSCALL, 5, RTS_ERROR_TEXT("gds_rundown SEMCTL failed to get ftok_semval"), CALLFROM, errno); if (NULL != csa->ss_ctx) ss_destroy_context(csa->ss_ctx); /* SS_MULTI: If multiple snapshots are supported, then we have to run through each of the snapshots */ assert(1 == MAX_SNAPSHOTS); ss_shm_ptr = (shm_snapshot_ptr_t)SS_GETSTARTPTR(csa); ss_pid = ss_shm_ptr->ss_info.ss_pid; is_cur_process_ss_initiator = (process_id == ss_pid); if (ss_pid && (is_cur_process_ss_initiator || we_are_last_user)) { /* Try getting snapshot crit latch. If we don't get latch, we won't hang for eternity and will skip * doing the orphaned snapshot cleanup. It will be cleaned up eventually either by subsequent MUPIP * INTEG or by a MUPIP RUNDOWN. 
*/ if (ss_get_lock_nowait(reg) && (ss_pid == ss_shm_ptr->ss_info.ss_pid) && (is_cur_process_ss_initiator || !is_proc_alive(ss_pid, 0))) { ss_release(NULL); ss_release_lock(reg); } } /* If cnl->donotflush_dbjnl is set, it means mupip recover/rollback was interrupted and therefore we need not flush * shared memory contents to disk as they might be in an inconsistent state. Moreover, any more flushing will only cause * future rollback to undo more journal records (PBLKs). In this case, we will go ahead and remove shared memory (without * flushing the contents) in this routine. A reissue of the recover/rollback command will restore the database to a * consistent state. */ if (!cnl->donotflush_dbjnl && !reg->read_only && !vermismatch) { /* If we had an orphaned block and were interrupted, set wc_blocked so we can invoke wcs_recover. Do it ONLY * if there is NO concurrent online rollback running (as we need crit to set wc_blocked) */ if (csa->wbuf_dqd && !is_mm) { /* If we had an orphaned block and were interrupted, mupip_exit_handler will invoke secshr_db_clnup which * will clear this field and so we should never come to gds_rundown with a non-zero wbuf_dqd. The only * exception is if we are recover/rollback in which case gds_rundown (from mur_close_files) is invoked * BEFORE secshr_db_clnup in mur_close_files. * Note: It is NOT possible for online rollback to reach here with wbuf_dqd being non-zero. This is because * the moment we apply the first PBLK, we stop all interrupts and hence can never be interrupted in * wcs_wtstart or wcs_get_space. Assert accordingly. */ assert(mupip_jnl_recover && !jgbl.onlnrlbk && !safe_mode); if (!was_crit) grab_crit(reg); SET_TRACEABLE_VAR(cnl->wc_blocked, TRUE); BG_TRACE_PRO_ANY(csa, wcb_gds_rundown); send_msg_csa(CSA_ARG(csa) VARLSTCNT(8) ERR_WCBLOCKED, 6, LEN_AND_LIT("wcb_gds_rundown"), process_id, &csa->ti->curr_tn, DB_LEN_STR(reg)); csa->wbuf_dqd = 0; wcs_recover(reg); BG_TRACE_PRO_ANY(csa, lost_block_recovery); if (!was_crit) rel_crit(reg); } if (JNL_ENABLED(csd) && IS_GTCM_GNP_SERVER_IMAGE) originator_prc_vec = NULL; /* If we are the last writing user, then everything must be flushed */ if (we_are_last_writer) { /* Time to flush out all of our buffers */ assert(!safe_mode); if (is_mm) { MM_DBFILEXT_REMAP_IF_NEEDED(csa, reg); cnl->remove_shm = TRUE; } if (cnl->wc_blocked && jgbl.onlnrlbk) { /* if the last update done by online rollback was not committed in the normal code-path but was * completed by secshr_db_clnup, wc_blocked will be set to TRUE. But, since online rollback never * invokes grab_crit (since csa->hold_onto_crit is set to TRUE), wcs_recover is never invoked. This * could result in the last update never getting flushed to the disk and if online rollback happened * to be the last writer then the shared memory will be flushed and removed and the last update will * be lost. So, force wcs_recover if we find ourselves in such a situation. But, wc_blocked is * possible only if phase1 or phase2 errors are induced using white box test cases */ assert(WB_COMMIT_ERR_ENABLED); wcs_recover(reg); } /* Note WCSFLU_SYNC_EPOCH ensures the epoch is synced to the journal and indirectly * also ensures that the db is fsynced. We don't want to use it in the calls to * wcs_flu() from t_end() and tp_tend() since we can defer it to out-of-crit there. * In this case, since we are running down, we don't have any such option. 
*/ cnl->remove_shm = wcs_flu(WCSFLU_FLUSH_HDR | WCSFLU_WRITE_EPOCH | WCSFLU_SYNC_EPOCH); /* Since we_are_last_writer, we should be guaranteed that wcs_flu() did not change csd (in * case of MM for potential file extension), even if it did a grab_crit(). Therefore, make * sure that's true. */ assert(csd == csa->hdr); assert(0 == memcmp(csd->label, GDS_LABEL, GDS_LABEL_SZ - 1)); } else if (((canceled_flush_timer && (0 > cnl->wcs_timers)) || canceled_dbsync_timer) && !inst_is_frozen) { /* canceled pending db or jnl flush timers - flush database and journal buffers to disk */ if (!was_crit) grab_crit(reg); /* we need to sync the epoch as the fact that there is no active pending flush timer implies * there will be no one else who will flush the dirty buffers and EPOCH to disk in a timely fashion */ wcs_flu(WCSFLU_FLUSH_HDR | WCSFLU_WRITE_EPOCH | WCSFLU_SYNC_EPOCH); if (!was_crit) rel_crit(reg); assert((dba_mm == cs_data->acc_meth) || (csd == cs_data)); csd = cs_data; /* In case this is MM and wcs_flu() remapped an extended database, reset csd */ } /* Do rundown journal processing after buffer flushes since they require jnl to be open */ if (JNL_ENABLED(csd)) { /* the following tp_change_reg() is not needed due to the assert csa == cs_addrs at the beginning * of gds_rundown(), but just to be safe. To be removed by 2002!! --- nars -- 2001/04/25. */ tp_change_reg(); /* call this because jnl_ensure_open checks cs_addrs rather than gv_cur_region */ jpc = csa->jnl; jbp = jpc->jnl_buff; if (jbp->fsync_in_prog_latch.u.parts.latch_pid == process_id) { assert(FALSE); COMPSWAP_UNLOCK(&jbp->fsync_in_prog_latch, process_id, 0, LOCK_AVAILABLE, 0); } if (jbp->io_in_prog_latch.u.parts.latch_pid == process_id) { assert(FALSE); COMPSWAP_UNLOCK(&jbp->io_in_prog_latch, process_id, 0, LOCK_AVAILABLE, 0); } if ((((NOJNL != jpc->channel) && !JNL_FILE_SWITCHED(jpc)) || we_are_last_writer && (0 != cnl->jnl_file.u.inode)) && ok_to_write_pfin) { /* We need to close the journal file cleanly if we have the latest generation journal file open * or if we are the last writer and the journal file is open in shared memory (not necessarily * by ourselves e.g. the only process that opened the journal got shot abnormally) * Note: we should not infer anything from the shared memory value of cnl->jnl_file.u.inode * if we are not the last writer as it can be concurrently updated. */ if (!was_crit) grab_crit(reg); if (JNL_ENABLED(csd)) { SET_GBL_JREC_TIME; /* jnl_ensure_open/jnl_put_jrt_pini/pfin/jnl_file_close all need it */ /* Before writing to jnlfile, adjust jgbl.gbl_jrec_time if needed to maintain time order * of jnl records. This needs to be done BEFORE the jnl_ensure_open as that could write * journal records (if it decides to switch to a new journal file). */ ADJUST_GBL_JREC_TIME(jgbl, jbp); jnl_status = jnl_ensure_open(); if (0 == jnl_status) { /* If we_are_last_writer, we would have already done a wcs_flu() which would * have written an epoch record and we are guaranteed no further updates * since we are the last writer. So, just close the journal. * If the freeaddr == post_epoch_freeaddr, wcs_flu may have skipped writing * a pini, so allow for that. */ assert(!jbp->before_images || is_mm || !we_are_last_writer || (0 != jpc->pini_addr) || jgbl.mur_extract || (jpc->jnl_buff->freeaddr == jpc->jnl_buff->post_epoch_freeaddr)); /* If we haven't written a pini, let jnl_file_close write the pini/pfin. 
*/ if (!jgbl.mur_extract && (0 != jpc->pini_addr)) jnl_put_jrt_pfin(csa); /* If not the last writer and no pending flush timer left, do jnl flush now */ if (!we_are_last_writer && (0 > cnl->wcs_timers)) { if (SS_NORMAL == (jnl_status = jnl_flush(reg))) { assert(jbp->freeaddr == jbp->dskaddr); jnl_fsync(reg, jbp->dskaddr); assert(jbp->fsync_dskaddr == jbp->dskaddr); } else { send_msg_csa(CSA_ARG(csa) VARLSTCNT(9) ERR_JNLFLUSH, 2, JNL_LEN_STR(csd), ERR_TEXT, 2, RTS_ERROR_TEXT("Error with journal flush in gds_rundown"), jnl_status); assert(NOJNL == jpc->channel);/* jnl file lost has been triggered */ /* In this routine, all code that follows from here on does not * assume anything about the journaling characteristics of this * database so it is safe to continue execution even though * journaling got closed in the middle. */ } } jnl_file_close(reg, we_are_last_writer, FALSE); } else send_msg_csa(CSA_ARG(csa) VARLSTCNT(6) jnl_status, 4, JNL_LEN_STR(csd), DB_LEN_STR(reg)); } if (!was_crit) rel_crit(reg); } } if (we_are_last_writer) /* Flush the fileheader last and harden the file to disk */ { if (!was_crit) grab_crit(reg); /* To satisfy crit requirement in fileheader_sync() */ memset(csd->machine_name, 0, MAX_MCNAMELEN); /* clear the machine_name field */ if (!have_standalone_access && we_are_last_user) { /* mupip_exit_handler will do this after mur_close_file */ csd->semid = INVALID_SEMID; csd->shmid = INVALID_SHMID; csd->gt_sem_ctime.ctime = 0; csd->gt_shm_ctime.ctime = 0; } fileheader_sync(reg); if (!was_crit) rel_crit(reg); if (!is_mm) { GTM_DB_FSYNC(csa, udi->fd, rc); /* Sync it all */ if (-1 == rc) { rts_error_csa(CSA_ARG(csa) VARLSTCNT(9) ERR_DBFILERR, 2, DB_LEN_STR(reg), ERR_TEXT, 2, RTS_ERROR_TEXT("Error during file sync at close"), errno); } } else { /* Now do final MM file sync before exit */ assert(csa->ti->total_blks == csa->total_blks); #ifdef _AIX GTM_DB_FSYNC(csa, udi->fd, rc); if (-1 == rc) #else if (-1 == MSYNC((caddr_t)csa->db_addrs[0], (caddr_t)csa->db_addrs[1])) #endif { rts_error_csa(CSA_ARG(csa) VARLSTCNT(9) ERR_DBFILERR, 2, DB_LEN_STR(reg), ERR_TEXT, 2, RTS_ERROR_TEXT("Error during file sync at close"), errno); } } } else if (unsafe_last_writer && !cnl->lastwriterbypas_msg_issued) { send_msg_csa(CSA_ARG(csa) VARLSTCNT(4) ERR_LASTWRITERBYPAS, 2, DB_LEN_STR(reg)); cnl->lastwriterbypas_msg_issued = TRUE; } } /* end if (!reg->read_only && !cnl->donotflush_dbjnl) */ /* We had canceled all db timers at start of rundown. In case as part of rundown (wcs_flu above), we had started * any timers, cancel them BEFORE setting reg->open to FALSE (assert in wcs_clean_dbsync relies on this). */ CANCEL_DB_TIMERS(reg, csa, canceled_flush_timer, canceled_dbsync_timer); if (reg->read_only && we_are_last_user && !have_standalone_access && cnl->remove_shm) { /* mupip_exit_handler will do this after mur_close_file */ db_ipcs.semid = INVALID_SEMID; db_ipcs.shmid = INVALID_SHMID; db_ipcs.gt_sem_ctime = 0; db_ipcs.gt_shm_ctime = 0; db_ipcs.fn_len = reg->dyn.addr->fname_len; memcpy(db_ipcs.fn, reg->dyn.addr->fname, reg->dyn.addr->fname_len); db_ipcs.fn[reg->dyn.addr->fname_len] = 0; /* request gtmsecshr to flush. 
read_only cannot flush itself */ WAIT_FOR_REPL_INST_UNFREEZE_SAFE(csa); if (!csa->read_only_fs) { secshrstat = send_mesg2gtmsecshr(FLUSH_DB_IPCS_INFO, 0, (char *)NULL, 0); if (0 != secshrstat) rts_error_csa(CSA_ARG(csa) VARLSTCNT(8) ERR_DBFILERR, 2, DB_LEN_STR(reg), ERR_TEXT, 2, RTS_ERROR_TEXT("gtmsecshr failed to update database file header")); } } /* Done with file now, close it */ CLOSEFILE_RESET(udi->fd, rc); /* resets "udi->fd" to FD_INVALID */ if (-1 == rc) { rts_error_csa(CSA_ARG(csa) VARLSTCNT(9) ERR_DBFILERR, 2, DB_LEN_STR(reg), ERR_TEXT, 2, LEN_AND_LIT("Error during file close"), errno); } /* Unmap storage if mm mode but only the part that is not the fileheader (so shows up in dumps) */ # if !defined(_AIX) if (is_mm && (NULL != csa->db_addrs[0])) { assert(csa->db_addrs[1] > csa->db_addrs[0]); munmap_len = (sm_long_t)(csa->db_addrs[1] - csa->db_addrs[0]); if (0 < munmap_len) munmap((caddr_t)(csa->db_addrs[0]), (size_t)(munmap_len)); } # endif /* Detach our shared memory while still under lock so reference counts will be correct for the next process to run down * this region. In the process also get the remove_shm status from node_local before detaching. * If cnl->donotflush_dbjnl is TRUE, it means we can safely remove shared memory without compromising data * integrity as a reissue of recover will restore the database to a consistent state. */ remove_shm = !vermismatch && (cnl->remove_shm || cnl->donotflush_dbjnl); /* We are done with online rollback on this region. Indicate to other processes by setting the onln_rlbk_pid to 0. * Do it before releasing crit (t_end relies on this ordering when accessing cnl->onln_rlbk_pid). */ if (jgbl.onlnrlbk) cnl->onln_rlbk_pid = 0; rel_crit(reg); /* Since we are about to detach from the shared memory, release crit and reset onln_rlbk_pid */ /* If we had skipped flushing journal and database buffers due to a concurrent online rollback, increment the counter * indicating that in the shared memory so that online rollback can report the # of such processes when it shuts down. * The same thing is done for both FTOK and access control semaphores when there are too many MUMPS processes. */ if (safe_mode) /* indicates flushing was skipped */ { if (bypassed_access) cnl->dbrndwn_access_skip++; /* Access semaphore can be bypassed during online rollback */ if (bypassed_ftok) cnl->dbrndwn_ftok_skip++; } if (jgbl.onlnrlbk) csa->hold_onto_crit = FALSE; GTM_WHITE_BOX_TEST(WBTEST_HOLD_SEM_BYPASS, cnl->wbox_test_seq_num, 0); status = shmdt((caddr_t)cnl); csa->nl = NULL; /* dereferencing nl after detach is not right, so we set it to NULL so that we can test before dereference*/ /* Note that although csa->nl is NULL, we use CSA_ARG(csa) below (not CSA_ARG(NULL)) to be consistent with similar * usages before csa->nl became NULL. The "is_anticipatory_freeze_needed" function (which is in turn called by the * CHECK_IF_FREEZE_ON_ERROR_NEEDED macro) does a check of csa->nl before dereferencing shared memory contents so * we are safe passing "csa". 
*/ if (-1 == status) send_msg_csa(CSA_ARG(csa) VARLSTCNT(9) ERR_DBFILERR, 2, DB_LEN_STR(reg), ERR_TEXT, 2, LEN_AND_LIT("Error during shmdt"), errno); REMOVE_CSA_FROM_CSADDRSLIST(csa); /* remove "csa" from list of open regions (cs_addrs_list) */ reg->open = FALSE; /* If file is still not in good shape, die here and now before we get rid of our storage */ assertpro(0 == csa->wbuf_dqd); ipc_deleted = FALSE; /* If we are the very last user, remove shared storage id and the semaphores */ if (we_are_last_user) { /* remove shared storage, only if last writer to rundown did a successful wcs_flu() */ assert(!vermismatch); if (remove_shm) { ipc_deleted = TRUE; if (0 != shm_rmid(udi->shmid)) rts_error_csa(CSA_ARG(csa) VARLSTCNT(8) ERR_DBFILERR, 2, DB_LEN_STR(reg), ERR_TEXT, 2, RTS_ERROR_TEXT("Unable to remove shared memory")); /* Note that we no longer have a new shared memory. Currently only used/usable for standalone rollback. */ udi->new_shm = FALSE; /* mupip recover/rollback don't release the semaphore here, but do it later in db_ipcs_reset (invoked from * mur_close_files()) */ if (!have_standalone_access) { if (0 != sem_rmid(udi->semid)) rts_error_csa(CSA_ARG(csa) VARLSTCNT(8) ERR_DBFILERR, 2, DB_LEN_STR(reg), ERR_TEXT, 2, RTS_ERROR_TEXT("Unable to remove semaphore")); udi->new_sem = FALSE; /* Note that we no longer have a new semaphore */ udi->grabbed_access_sem = FALSE; udi->counter_acc_incremented = FALSE; } } else if (is_src_server || is_updproc) { gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(6) ERR_DBRNDWNWRN, 4, DB_LEN_STR(reg), process_id, process_id); send_msg_csa(CSA_ARG(csa) VARLSTCNT(6) ERR_DBRNDWNWRN, 4, DB_LEN_STR(reg), process_id, process_id); } else send_msg_csa(CSA_ARG(csa) VARLSTCNT(6) ERR_DBRNDWNWRN, 4, DB_LEN_STR(reg), process_id, process_id); } else { assert(!have_standalone_access || jgbl.onlnrlbk || safe_mode); if (!jgbl.onlnrlbk && !have_standalone_access) { /* If we were writing, get rid of our writer access count semaphore */ if (!reg->read_only) { if (!access_counter_halted) { save_errno = do_semop(udi->semid, DB_COUNTER_SEM, -DB_COUNTER_SEM_INCR, SEM_UNDO); if (0 != save_errno) rts_error_csa(CSA_ARG(csa) VARLSTCNT(12) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg), ERR_SYSCALL, 5, RTS_ERROR_TEXT("gds_rundown access control semaphore decrement"), CALLFROM, save_errno); } udi->counter_acc_incremented = FALSE; } assert(safe_mode || !bypassed_access); /* Now remove the rundown lock */ if (!bypassed_access) { if (0 != (save_errno = do_semop(udi->semid, DB_CONTROL_SEM, -1, SEM_UNDO))) rts_error_csa(CSA_ARG(csa) VARLSTCNT(12) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg), ERR_SYSCALL, 5, RTS_ERROR_TEXT("gds_rundown access control semaphore release"), CALLFROM, save_errno); udi->grabbed_access_sem = FALSE; } } /* else access control semaphore will be released in db_ipcs_reset */ } if (!have_standalone_access) { if (bypassed_ftok) { if (!ftok_counter_halted) if (0 != (save_errno = do_semop(udi->ftok_semid, DB_COUNTER_SEM, -DB_COUNTER_SEM_INCR, SEM_UNDO))) rts_error_csa(CSA_ARG(csa) VARLSTCNT(4) ERR_DBFILERR, 2, DB_LEN_STR(reg)); } else if (!ftok_sem_release(reg, !ftok_counter_halted, FALSE)) { FTOK_TRACE(csa, csa->ti->curr_tn, ftok_ops_release, process_id); rts_error_csa(CSA_ARG(csa) VARLSTCNT(4) ERR_DBFILERR, 2, DB_LEN_STR(reg)); } udi->grabbed_ftok_sem = FALSE; udi->counter_ftok_incremented = FALSE; } ENABLE_INTERRUPTS(INTRPT_IN_GDS_RUNDOWN, prev_intrpt_state); if (!ipc_deleted) { GET_CUR_TIME(time_str); if (is_src_server) gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(8) ERR_IPCNOTDEL, 6, CTIME_BEFORE_NL, 
time_str, LEN_AND_LIT("Source server"), REG_LEN_STR(reg)); if (is_updproc) gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(8) ERR_IPCNOTDEL, 6, CTIME_BEFORE_NL, time_str, LEN_AND_LIT("Update process"), REG_LEN_STR(reg)); if (mupip_jnl_recover && (!jgbl.onlnrlbk || !we_are_last_user)) { gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(8) ERR_IPCNOTDEL, 6, CTIME_BEFORE_NL, time_str, LEN_AND_LIT("Mupip journal process"), REG_LEN_STR(reg)); send_msg_csa(CSA_ARG(csa) VARLSTCNT(8) ERR_IPCNOTDEL, 6, CTIME_BEFORE_NL, time_str, LEN_AND_LIT("Mupip journal process"), REG_LEN_STR(reg)); } } REVERT; return EXIT_NRM; }
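/* Editor's illustrative sketch (not part of the GT.M sources): the two-operation semop() pattern gds_rundown() uses on the
 * access control semaphore -- wait for the semaphore value to reach zero, then increment it, first with IPC_NOWAIT and,
 * if that fails with EAGAIN, again in blocking mode.  The throwaway private semaphore set and its initialization exist
 * only for the demonstration; only the sembuf setup mirrors the code above.
 */
#include <errno.h>
#include <stdio.h>
#include <sys/ipc.h>
#include <sys/sem.h>

int main(void)
{
	union semun { int val; } arg;	/* minimal declaration; glibc expects the caller to define it */
	struct sembuf	sop[2];
	int		semid;

	if (-1 == (semid = semget(IPC_PRIVATE, 1, 0600 | IPC_CREAT)))
		return 1;
	arg.val = 0;
	if (-1 == semctl(semid, 0, SETVAL, arg))	/* start with the "unlocked" value */
		return 1;
	sop[0].sem_num = 0;	sop[0].sem_op = 0;	/* wait for value 0 */
	sop[1].sem_num = 0;	sop[1].sem_op = 1;	/* then take the lock */
	sop[0].sem_flg = sop[1].sem_flg = SEM_UNDO | IPC_NOWAIT;	/* don't block on the first try */
	if (-1 == semop(semid, sop, 2))
	{
		if (EAGAIN != errno)
			return 1;	/* unexpected failure */
		sop[0].sem_flg = sop[1].sem_flg = SEM_UNDO;	/* retry, blocking this time */
		if (-1 == semop(semid, sop, 2))
			return 1;
	}
	printf("acquired control semaphore %d\n", semid);
	semctl(semid, 0, IPC_RMID);	/* remove the throwaway semaphore set */
	return 0;
}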