/* Waits for a concurrently running write (of a global buffer to disk) to complete.
 *
 * Returns TRUE if the write completes within a timeout of approximately 1 minute.
 * Returns FALSE otherwise.
 */
boolean_t wcs_write_in_progress_wait(node_local_ptr_t cnl, cache_rec_ptr_t cr, wbtest_code_t wbox_test_code)
{
	uint4	lcnt;
	int4	n;

	for (lcnt = 1; ; lcnt++)
	{	/* The design here is that either this process owns the block, or the writer does.
		 * If the writer does, it must be allowed to finish its write; then it will release the block
		 * and the next LOCK will establish ownership.
		 */
		LOCK_BUFF_FOR_UPDATE(cr, n, &cnl->db_latch);
		/* This destroys evidence of writer ownership, but this is really a test that
		 * there was no prior owner. It will only be true if the writer has cleared it.
		 */
		if (OWN_BUFF(n))
			break;
		else
		{
			GTM_WHITE_BOX_TEST(wbox_test_code, lcnt, (2 * BUF_OWNER_STUCK));
			/* We have noticed the below assert fail occasionally on some platforms.
			 * We suspect it is because of waiting for another writer that is in jnl_fsync
			 * (as part of flushing a global buffer) which takes more than a minute to finish.
			 * To avoid false failures (where the other writer finishes its job in a little over
			 * a minute) we wait for twice the time in the debug version.
			 */
			DEBUG_ONLY(
				if ((BUF_OWNER_STUCK == lcnt) && cr->epid)
					GET_C_STACK_FROM_SCRIPT("WRITEWAITPID", process_id, cr->epid, ONCE);
			)
			if (BUF_OWNER_STUCK DEBUG_ONLY( * 2) < lcnt)
			{	/* sick of waiting */
				if (0 == cr->dirty)
				{	/* someone dropped something; assume it was the writer and go on */
					LOCK_NEW_BUFF_FOR_UPDATE(cr);
					break;
				} else
				{
					if (cr->epid)
					{
#						ifdef DEBUG
						GET_C_STACK_FROM_SCRIPT("WRITEWAITPID", process_id, cr->epid, TWICE);
						send_msg(VARLSTCNT(8) ERR_WRITEWAITPID, 6, process_id, TWICE,
							cr->epid, cr->blk, DB_LEN_STR(gv_cur_region));
#						else
						GET_C_STACK_FROM_SCRIPT("WRITEWAITPID", process_id, cr->epid, ONCE);
						send_msg(VARLSTCNT(8) ERR_WRITEWAITPID, 6, process_id, ONCE,
							cr->epid, cr->blk, DB_LEN_STR(gv_cur_region));
#						endif
					}
					return FALSE;
				}
			}
			if (WRITER_STILL_OWNS_BUFF(cr, n))
				wcs_sleep(lcnt);
		}
	}	/* end of for loop to control buffer */
	return TRUE;
}
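/* [Illustrative sketch, not part of the sources above] A caller that needs to reuse a cache
 * record typically treats a FALSE return as "writer stuck" and escalates to cache recovery.
 * This wrapper is hypothetical: cnl/cr come from the caller, and the white-box code named
 * here is only an assumed example of a test point.
 */
static boolean_t grab_buff_for_update(node_local_ptr_t cnl, cache_rec_ptr_t cr)
{
	if (!wcs_write_in_progress_wait(cnl, cr, WBTEST_BUF_OWNER_STUCK))
	{	/* writer did not release the buffer within the ~1 minute (~2 in debug) window */
		SET_TRACEABLE_VAR(cnl->wc_blocked, TRUE);	/* hypothetical escalation: flag cache recovery */
		return FALSE;
	}
	return TRUE;	/* this process now owns cr and may safely reuse the buffer */
}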
uint4 jnl_file_lost(jnl_private_control *jpc, uint4 jnl_stat)
{	/* Notify operator and terminate journaling */
	unsigned int	status;
	sgmnt_addrs	*csa;
	seq_num		reg_seqno, jnlseqno;

	error_def(ERR_REPLJNLCLOSED);
	error_def(ERR_JNLCLOSED);

	switch(jpc->region->dyn.addr->acc_meth)
	{
	case dba_mm:
	case dba_bg:
		csa = &FILE_INFO(jpc->region)->s_addrs;
		break;
	default:
		GTMASSERT;
	}
#ifdef VMS
	/* The following assert has been removed as it could be FALSE if the caller is "jnl_file_extend"
	 *	assert(0 != memcmp(csa->nl->jnl_file.jnl_file_id.fid, zero_fid, sizeof(zero_fid)));
	 */
#endif
	assert(csa->now_crit);
	if (0 != jnl_stat)
		jnl_send_oper(jpc, jnl_stat);
	csa->hdr->jnl_state = jnl_closed;
	jpc->jnl_buff->cycle++;	/* increment shared cycle so all future callers of jnl_ensure_open recognize journal switch */
	assert(jpc->cycle < jpc->jnl_buff->cycle);
	if (REPL_ENABLED(csa->hdr))
	{
		csa->hdr->repl_state = repl_was_open;
		reg_seqno = csa->hdr->reg_seqno;
		jnlseqno = (NULL != jnlpool.jnlpool_ctl) ? jnlpool.jnlpool_ctl->jnl_seqno : MAX_SEQNO;
		send_msg(VARLSTCNT(8) ERR_REPLJNLCLOSED, 6, DB_LEN_STR(jpc->region), &reg_seqno, &reg_seqno,
			&jnlseqno, &jnlseqno);
	} else
		send_msg(VARLSTCNT(5) ERR_JNLCLOSED, 3, DB_LEN_STR(jpc->region), &csa->ti->curr_tn);
#ifdef VMS
	assert(0 != csa->jnl->jnllsb->lockid);
	status = gtm_enqw(EFN$C_ENF, LCK$K_EXMODE, csa->jnl->jnllsb, LCK$M_CONVERT | LCK$M_NODLCKBLK,
		NULL, 0, NULL, 0, NULL, PSL$C_USER, 0);
	if (SS$_NORMAL == status)
		status = csa->jnl->jnllsb->cond;
	jnl_file_close(jpc->region, FALSE, FALSE);
	if (SS$_NORMAL == status)
		status = gtm_deq(csa->jnl->jnllsb->lockid, NULL, PSL$C_USER, 0);
	if (SS$_NORMAL != status)
		GTMASSERT;
#else
	jnl_file_close(jpc->region, FALSE, FALSE);
#endif
}
/* Input parameter "command_name" is a string that is either "MUPIP REORG UPGRADE/DOWNGRADE" or "MUPIP SET VERSION" */
int4 desired_db_format_set(gd_region *reg, enum db_ver new_db_format, char *command_name)
{
	boolean_t		was_crit;
	char			*db_fmt_str;
	char			*wcblocked_ptr;
	int4			status;
	uint4			jnl_status;
	inctn_opcode_t		save_inctn_opcode;
	sgmnt_addrs		*csa;
	sgmnt_data_ptr_t	csd;
	trans_num		curr_tn;
	jnl_private_control	*jpc;
	jnl_buffer_ptr_t	jbp;

	assert(reg->open);
	csa = &FILE_INFO(reg)->s_addrs;
	csd = csa->hdr;
	GTMCRYPT_ONLY(
		/* We don't allow databases to be encrypted if the version is V4 */
		if (csd->is_encrypted && (GDSV4 == new_db_format))
		{
			gtm_putmsg(VARLSTCNT(4) ERR_CRYPTNOV4, 2, DB_LEN_STR(reg));
			return ERR_CRYPTNOV4;
		}
	)
boolean_t mur_report_error(jnl_ctl_list *jctl, enum mur_error code)
{
	error_def(ERR_BOVTMGTEOVTM);
	error_def(ERR_DUPTOKEN);
	error_def(ERR_JNLBADRECFMT);
	error_def(ERR_PREVJNLNOEOF);
	error_def(ERR_UNKNOWNRECTYPE);

	switch (code)
	{
	default:
		assert(FALSE);
		break;
	case MUR_DUPTOKEN:
		assert(FALSE);
		gtm_putmsg(VARLSTCNT(7) ERR_DUPTOKEN, 5,
			&((struct_jrec_tcom *)jctl->reg_ctl->mur_desc->jnlrec)->token_seq.token,
			jctl->jnl_fn_len, jctl->jnl_fn, DB_LEN_STR(jctl->reg_ctl->gd));
		break;
	case MUR_PREVJNLNOEOF:
		gtm_putmsg(VARLSTCNT(4) ERR_PREVJNLNOEOF, 2, jctl->jnl_fn_len, jctl->jnl_fn);
		break;
	case MUR_JNLBADRECFMT:
		gtm_putmsg(VARLSTCNT(5) ERR_JNLBADRECFMT, 3, jctl->jnl_fn_len, jctl->jnl_fn, jctl->rec_offset);
		break;
	case MUR_BOVTMGTEOVTM:
		gtm_putmsg(VARLSTCNT(6) ERR_BOVTMGTEOVTM, 4, jctl->jnl_fn_len, jctl->jnl_fn,
			&jctl->jfh->bov_timestamp, &jctl->jfh->eov_timestamp);
		break;
	}
	return MUR_WITHIN_ERROR_LIMIT(murgbl.err_cnt, mur_options.error_limit);	/* side effect: increments murgbl.err_cnt */
}
boolean_t mur_report_error(enum mur_error code)
{
	error_def(ERR_UNKNOWNRECTYPE);
	error_def(ERR_DUPTOKEN);
	error_def(ERR_PREVJNLNOEOF);
	error_def(ERR_JNLBADRECFMT);

	switch (code)
	{
	default:
		assert(FALSE);
		break;
	case MUR_DUPTOKEN:
		assert(FALSE);
		gtm_putmsg(VARLSTCNT(7) ERR_DUPTOKEN, 5, &((struct_jrec_tcom *)mur_rab.jnlrec)->token_seq.token,
			mur_jctl->jnl_fn_len, mur_jctl->jnl_fn, DB_LEN_STR(mur_ctl[mur_regno].gd));
		break;
	case MUR_PREVJNLNOEOF:
		gtm_putmsg(VARLSTCNT(4) ERR_PREVJNLNOEOF, 2, mur_jctl->jnl_fn_len, mur_jctl->jnl_fn);
		break;
	case MUR_JNLBADRECFMT:
		gtm_putmsg(VARLSTCNT(5) ERR_JNLBADRECFMT, 3, mur_jctl->jnl_fn_len, mur_jctl->jnl_fn, mur_jctl->rec_offset);
		break;
	case MUR_BOVTMGTEOVTM:
		break;
	}
	return MUR_WITHIN_ERROR_LIMIT(murgbl.err_cnt, mur_options.error_limit);	/* side effect: increments murgbl.err_cnt */
}
int main(int argc, char *argv[])
{
	DCL_THREADGBL_ACCESS;

	GTM_THREADGBL_INIT;
	set_blocksig();
	gtm_imagetype_init(DSE_IMAGE);
	gtm_wcswidth_fnptr = gtm_wcswidth;
	gtm_env_init();	/* read in all environment variables */
	licensed = TRUE;
	TREF(transform) = TRUE;
	op_open_ptr = op_open;
	patch_curr_blk = get_dir_root();
	err_init(util_base_ch);
	GTM_ICU_INIT_IF_NEEDED;	/* Note: should be invoked after err_init (since it may error out) and before CLI parsing */
	sig_init(generic_signal_handler, dse_ctrlc_handler, suspsigs_handler);
	atexit(util_exit_handler);
	SET_LATCH_GLOBAL(&defer_latch, LOCK_AVAILABLE);
	get_page_size();
	stp_init(STP_INITSIZE);
	rts_stringpool = stringpool;
	getjobname();
	INVOKE_INIT_SECSHR_ADDRS;
	getzdir();
	prealloc_gt_timers();
	initialize_pattern_table();
	gvinit();
	region_init(FALSE);
	INIT_GBL_ROOT();	/* needed for GVT initialization */
	getjobnum();
	util_out_print("!/File !_!AD", TRUE, DB_LEN_STR(gv_cur_region));
	util_out_print("Region!_!AD!/", TRUE, REG_LEN_STR(gv_cur_region));
	cli_lex_setup(argc, argv);
	CREATE_DUMMY_GBLDIR(gd_header, original_header, gv_cur_region, gd_map, gd_map_top);
	gtm_chk_dist(argv[0]);
#	ifdef DEBUG
	if ((gtm_white_box_test_case_enabled && (WBTEST_SEMTOOLONG_STACK_TRACE == gtm_white_box_test_case_number)))
	{
		sgmnt_addrs		*csa;
		node_local_ptr_t	cnl;

		csa = &FILE_INFO(gv_cur_region)->s_addrs;
		cnl = csa->nl;
		cnl->wbox_test_seq_num = 1;	/* signal the first step and wait here */
		while (2 != cnl->wbox_test_seq_num)	/* wait for another process to grab the semaphore and signal the next step */
			LONG_SLEEP(10);
	}
#	endif
	if (argc < 2)
		display_prompt();
	io_init(TRUE);
	while (1)
	{
		if (!dse_process(argc))
			break;
		display_prompt();
	}
	dse_exit();
	REVERT;
}
uint4 mupip_set_journal_newstate(set_jnl_options *jnl_options, jnl_create_info *jnl_info, mu_set_rlist *rptr)
{
	enum jnl_state_codes	jnl_curr_state;
	enum repl_state_codes	repl_curr_state;
	boolean_t		current_image;
	enum db_acc_method	acc_meth;

	error_def(ERR_REPLNOBEFORE);
	error_def(ERR_REPLJNLCNFLCT);
	error_def(ERR_JNLDISABLE);
	error_def(ERR_MMBEFOREJNL);
	error_def(ERR_MMNOBFORRPL);

	jnl_curr_state = (enum jnl_state_codes)rptr->sd->jnl_state;
	repl_curr_state = (enum repl_state_codes)rptr->sd->repl_state;
	acc_meth = rptr->sd->acc_meth;
	current_image = rptr->sd->jnl_before_image;
	if (CLI_ABSENT == jnl_options->cli_journal)
		rptr->jnl_new_state = jnl_curr_state;
	else if ((CLI_NEGATED == jnl_options->cli_journal) || (CLI_NEGATED == jnl_options->cli_enable))
		rptr->jnl_new_state = jnl_notallowed;	/* DISABLE specified */
	else if ((jnl_notallowed != jnl_curr_state) || (CLI_PRESENT == jnl_options->cli_enable))
	{	/* journaling is already ENABLED or ENABLE is explicitly specified */
		if (CLI_NEGATED == jnl_options->cli_on)	/* OFF specified */
			rptr->jnl_new_state = jnl_closed;
		else if ((repl_curr_state == repl_was_open) && (CLI_PRESENT != jnl_options->cli_replic_on))
		{	/* Journaling was turned OFF by jnl_file_lost(). Do not allow turning journaling ON
			 * without also turning replication ON.
			 */
			gtm_putmsg(VARLSTCNT(10) ERR_REPLJNLCNFLCT, 8, LEN_AND_STR(jnl_state_lit[jnl_open]),
				DB_LEN_STR(gv_cur_region), LEN_AND_STR(repl_state_lit[repl_closed]),
				LEN_AND_STR(jnl_state_lit[jnl_open]));
			return EXIT_WRN;
		} else	/* ON explicitly specified or present by default */
			rptr->jnl_new_state = jnl_open;
	} else	/* jnl_notallowed == jnl_curr_state && CLI_ABSENT == jnl_options->cli_enable */
	{
		if (CLI_PRESENT != jnl_options->cli_replic_on)
		{
			gtm_putmsg(VARLSTCNT(4) ERR_JNLDISABLE, 2, DB_LEN_STR(gv_cur_region));
			return EXIT_WRN;
		} else
			rptr->jnl_new_state = jnl_open;	/* turn journaling on for REPLICATION=ON */
	}
	VMS_ONLY(rptr->before_images = (jnl_options->image_type_specified ? jnl_info->before_images : current_image);)
void set_enospc_flags(gd_addr *addr_ptr, char enospc_enable_list[], boolean_t ok_to_interrupt)
{
	gd_region	*r_local, *r_top;
	int		i;
	sgmnt_addrs	*csa;
	const char	*syslog_msg;
	DCL_THREADGBL_ACCESS;

	SETUP_THREADGBL_ACCESS;
	for (r_local = addr_ptr->regions, r_top = r_local + addr_ptr->n_regions, i = 0; r_local < r_top; r_local++, i++)
	{
		if (!r_local->open || r_local->was_open)
			continue;
		if ((dba_bg != r_local->dyn.addr->acc_meth) && (dba_mm != r_local->dyn.addr->acc_meth))
			continue;
		csa = &FILE_INFO(r_local)->s_addrs;
		if (ANTICIPATORY_FREEZE_ENABLED(csa))
		{
			switch(enospc_enable_list[i])
			{
			case NONE:
				syslog_msg = "Turning off fake ENOSPC for both database and journal file.";
				csa->nl->fake_db_enospc = FALSE;
				csa->nl->fake_jnl_enospc = FALSE;
				break;
			case DB_ON:
				syslog_msg = "Turning on fake ENOSPC only for database file.";
				csa->nl->fake_db_enospc = TRUE;
				csa->nl->fake_jnl_enospc = FALSE;
				break;
			case JNL_ON:
				syslog_msg = "Turning on fake ENOSPC only for journal file.";
				csa->nl->fake_db_enospc = FALSE;
				csa->nl->fake_jnl_enospc = TRUE;
				break;
			case DB_AND_JNL_ON:
				syslog_msg = "Turning on fake ENOSPC for both database and journal file.";
				csa->nl->fake_db_enospc = TRUE;
				csa->nl->fake_jnl_enospc = TRUE;
				break;
			default:
				assert(FALSE);
			}
			if (ok_to_interrupt)
				send_msg_csa(CSA_ARG(NULL) VARLSTCNT(8) ERR_TEXT, 2, DB_LEN_STR(r_local),
					ERR_TEXT, 2, LEN_AND_STR(syslog_msg));
		}
	}
}
STATICFNDEF void mu_rndwn_all_helper(shm_parms *parm_buff, char *fname, int *exit_status, int *tmp_exit_status)
{
	replpool_identifier	replpool_id;
	boolean_t		ret_status, jnlpool_sem_created;
	unsigned char		ipcs_buff[MAX_IPCS_ID_BUF], *ipcs_ptr;

	ESTABLISH(mu_rndwn_all_helper_ch);
	if (validate_db_shm_entry(parm_buff, fname, tmp_exit_status))
	{
		if (SS_NORMAL == *tmp_exit_status)
		{	/* shm still exists */
			mu_gv_cur_reg_init();
			gv_cur_region->dyn.addr->fname_len = strlen(fname);
			STRNCPY_STR(gv_cur_region->dyn.addr->fname, fname, gv_cur_region->dyn.addr->fname_len);
			if (mu_rndwn_file(gv_cur_region, FALSE))
				gtm_putmsg_csa(CSA_ARG(NULL) VARLSTCNT(4) ERR_MUFILRNDWNSUC, 2, DB_LEN_STR(gv_cur_region));
			else
			{	/* Save semid so that it will not be removed by mu_rndwn_sem_all() */
				add_to_semids_list(FILE_INFO(gv_cur_region)->semid);
				*exit_status = ERR_MUNOTALLSEC;
			}
			mu_gv_cur_reg_free();
		} else
		{	/* shm has been cleaned up by "validate_db_shm_entry" so no need of any more cleanup here */
			assert(ERR_SHMREMOVED == *tmp_exit_status);
			*tmp_exit_status = SS_NORMAL;	/* reset tmp_exit_status for below logic to treat this as normal */
		}
	} else if ((SS_NORMAL == *tmp_exit_status)
			&& validate_replpool_shm_entry(parm_buff, (replpool_id_ptr_t)&replpool_id, tmp_exit_status))
	{
		if (SS_NORMAL == *tmp_exit_status)
		{
			assert(JNLPOOL_SEGMENT == replpool_id.pool_type || RECVPOOL_SEGMENT == replpool_id.pool_type);
			ret_status = mu_rndwn_repl_instance(&replpool_id, TRUE, FALSE, &jnlpool_sem_created);
			ipcs_ptr = i2asc((uchar_ptr_t)ipcs_buff, parm_buff->shmid);
			*ipcs_ptr = '\0';
			gtm_putmsg_csa(CSA_ARG(NULL) VARLSTCNT(6) (JNLPOOL_SEGMENT == replpool_id.pool_type)
					? (ret_status ? ERR_MUJPOOLRNDWNSUC : ERR_MUJPOOLRNDWNFL)
					: (ret_status ? ERR_MURPOOLRNDWNSUC : ERR_MURPOOLRNDWNFL),
				4, LEN_AND_STR(ipcs_buff), LEN_AND_STR(replpool_id.instfilename));
			if (!ret_status)
				*exit_status = ERR_MUNOTALLSEC;
		} else
		{	/* shm has been cleaned up by "validate_replpool_shm_entry" so no need of any more cleanup here */
			assert(ERR_SHMREMOVED == *tmp_exit_status);
			*tmp_exit_status = SS_NORMAL;	/* reset tmp_exit_status for below logic to treat this as normal */
		}
	}
	REVERT;
}
/* This routine returns whether the free_blocks counter in the file header is ok (TRUE) or not (FALSE).
 * If not, it corrects it. This assumes cs_addrs, cs_data and gv_cur_region point to the region of interest.
 * It also assumes that the master map is correct: it finds the non-full local bitmaps, counts the free blocks
 * in each of them, and sums them up to determine the perceived correct free_blocks count.
 * The reason why this is ok is that even if the master map incorrectly reports a local bitmap as full, our new
 * free_blocks count will effectively make the free space in that local bitmap invisible, making a gdsfilext
 * necessary and valid. A later mupip integ will scavenge that invisible space for us. The worst that can
 * therefore happen is that we will transiently not use up existing space. But we will always ensure that the
 * free_blocks counter stays in sync with the master map.
 */
boolean_t is_free_blks_ctr_ok(void)
{
	boolean_t	blk_used;
	block_id	bml, free_bit, free_bml, maxbitsthismap;
	cache_rec_ptr_t	cr;
	int		cycle;
	sm_uc_ptr_t	bmp;
	unsigned int	local_maps, total_blks, free_blocks;

	error_def(ERR_DBBADFREEBLKCTR);

	assert(&FILE_INFO(gv_cur_region)->s_addrs == cs_addrs && cs_addrs->hdr == cs_data && cs_addrs->now_crit);
	total_blks = (dba_mm == cs_data->acc_meth) ? cs_addrs->total_blks : cs_addrs->ti->total_blks;
	local_maps = DIVIDE_ROUND_UP(total_blks, BLKS_PER_LMAP);
	for (free_blocks = 0, free_bml = 0; free_bml < local_maps; free_bml++)
	{
		bml = bmm_find_free((uint4)free_bml, (sm_uc_ptr_t)MM_ADDR(cs_data), local_maps);
		if (bml < free_bml)
			break;
		free_bml = bml;
		bml *= BLKS_PER_LMAP;
		if (!(bmp = t_qread(bml, (sm_int_ptr_t)&cycle, &cr))
			|| (BM_SIZE(BLKS_PER_LMAP) != ((blk_hdr_ptr_t)bmp)->bsiz)
			|| (LCL_MAP_LEVL != ((blk_hdr_ptr_t)bmp)->levl))
		{
			assert(FALSE);	/* in pro, we will simply skip counting this local bitmap */
			continue;
		}
		assert(free_bml <= (local_maps - 1));
		maxbitsthismap = (free_bml != (local_maps - 1)) ? BLKS_PER_LMAP : total_blks - bml;
		for (free_bit = 0; free_bit < maxbitsthismap; free_bit++)
		{
			free_bit = bm_find_blk(free_bit, (sm_uc_ptr_t)bmp + sizeof(blk_hdr), maxbitsthismap, &blk_used);
			assert(NO_FREE_SPACE <= free_bit);
			if (0 > free_bit)
				break;
			free_blocks++;
		}
	}
	assert(cs_addrs->ti->free_blocks == free_blocks);
	if (cs_addrs->ti->free_blocks != free_blocks)
	{
		send_msg(VARLSTCNT(6) ERR_DBBADFREEBLKCTR, 4, DB_LEN_STR(gv_cur_region),
			cs_addrs->ti->free_blocks, free_blocks);
		cs_addrs->ti->free_blocks = free_blocks;
		return FALSE;
	}
	return TRUE;
}
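/* [Illustrative sketch, not part of the sources above] A self-contained model of the per-bitmap
 * counting loop: GDS local maps track each block with two status bits. The encoding below
 * (01 == free) is assumed for illustration only; the real layout is what bm_find_blk() reads.
 */
static unsigned int count_free_blks_in_lmap(const unsigned char *map, unsigned int nblks)
{
	unsigned int	blk, bits, free_cnt = 0;

	for (blk = 0; blk < nblks; blk++)
	{
		bits = (map[blk / 4] >> ((blk % 4) * 2)) & 3;	/* two bits per block, four blocks per byte */
		if (1 == bits)	/* assumed "free" encoding */
			free_cnt++;
	}
	return free_cnt;	/* summing this over all non-full lmaps rebuilds free_blocks */
}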
bool gtcmtr_kill(void)
{
	cm_region_list	*reg_ref;
	unsigned char	*ptr, regnum;
	unsigned short	len;
	static readonly gds_file_id file;

	error_def(ERR_DBPRIVERR);

	ptr = curr_entry->clb_ptr->mbf;
	assert(*ptr == CMMS_Q_KILL);
	ptr++;
	GET_SHORT(len, ptr);
	ptr += sizeof(unsigned short);
	regnum = *ptr++;
	reg_ref = gtcm_find_region(curr_entry, regnum);
	len--;	/* subtract size of regnum */
	CM_GET_GVCURRKEY(ptr, len);
	gtcm_bind_name(reg_ref->reghead, TRUE);
	if (gv_cur_region->read_only)
		rts_error(VARLSTCNT(4) ERR_DBPRIVERR, 2, DB_LEN_STR(gv_cur_region));
	if (JNL_ALLOWED(cs_addrs))
	{	/* We need to copy the client's specific prc_vec into the global variable so that the gvcst* routines
		 * do the right job. Strictly we need to do this only if JNL_ENABLED(cs_addrs), but since it is not
		 * easy to re-execute the following two assignments in case gvcst_kill()'s call to t_end() encounters a
		 * cdb_sc_jnlstatemod retry code, we choose the easier approach of executing the following segment
		 * if JNL_ALLOWED(cs_addrs) is TRUE instead of checking for JNL_ENABLED(cs_addrs) to be TRUE.
		 * This approach has the overhead that we will be doing the following assignments even though JNL_ENABLED
		 * might not be TRUE, but since the following two are just pointer copies, it is not considered a big
		 * overhead. This approach ensures that jnl_put_jrt_pini() gets the appropriate prc_vec for writing into
		 * the journal record in case JNL_ENABLED turns out to be TRUE at t_end() time.
		 * Note that the value of JNL_ALLOWED(cs_addrs) cannot be changed on the fly without obtaining standalone
		 * access, and hence the correctness of prc_vec, whenever it turns out necessary, is guaranteed.
		 */
		originator_prc_vec = curr_entry->pvec;
		cs_addrs->jnl->pini_addr = reg_ref->pini_addr;
	}
	if (gv_target->root)
		gvcst_kill(TRUE);
	if (JNL_ALLOWED(cs_addrs))
		reg_ref->pini_addr = cs_addrs->jnl->pini_addr;	/* in case a journal switch occurred */
	ptr = curr_entry->clb_ptr->mbf;
	*ptr++ = CMMS_R_KILL;
	curr_entry->clb_ptr->cbl = S_HDRSIZE;
	return TRUE;
}
uint4 set_jnl_file_close(set_jnl_file_close_opcode_t set_jnl_file_close_opcode)
{
	uint4	jnl_status = 0;

	cs_addrs = &FILE_INFO(gv_cur_region)->s_addrs;
	jnl_status = jnl_ensure_open();
	if (0 == jnl_status)
	{
		if (0 == cs_addrs->jnl->pini_addr)
			jnl_put_jrt_pini(cs_addrs);
		wcs_flu(WCSFLU_FLUSH_HDR | WCSFLU_WRITE_EPOCH);
		jnl_put_jrt_pfin(cs_addrs);
		jnl_file_close(gv_cur_region, TRUE, TRUE);
	} else
		gtm_putmsg(VARLSTCNT(6) jnl_status, 4, JNL_LEN_STR(cs_addrs->hdr), DB_LEN_STR(gv_cur_region));
	return jnl_status;
}
void op_gvkill(void)
{
	gd_region	*reg;

	error_def(ERR_DBPRIVERR);

	if (gv_cur_region->read_only)
		rts_error(VARLSTCNT(4) ERR_DBPRIVERR, 2, DB_LEN_STR(gv_cur_region));
	if (gv_curr_subsc_null && gv_cur_region->null_subs == FALSE)
		sgnl_gvnulsubsc();
	if (gv_cur_region->dyn.addr->acc_meth == dba_bg || gv_cur_region->dyn.addr->acc_meth == dba_mm)
	{
		if (gv_target->root)
			gvcst_kill(TRUE);
	} else if (gv_cur_region->dyn.addr->acc_meth == dba_cm)
		gvcmx_kill(TRUE);
	else
		gvusr_kill(TRUE);
	if (gv_cur_region->dyn.addr->repl_list)
	{
		gv_replication_error = gv_replopen_error;
		gv_replopen_error = FALSE;
		reg = gv_cur_region;
		while (gv_cur_region = gv_cur_region->dyn.addr->repl_list)	/* set replicated segments */
		{
			if (gv_cur_region->open)
			{
				change_reg();
				kill_var();
			} else
				gv_replication_error = TRUE;
		}
		gv_cur_region = reg;
		change_reg();
		if (gv_replication_error)
			sgnl_gvreplerr();
	}
}
uint4 jnl_file_open_switch(gd_region *reg, uint4 sts)
{
	sgmnt_addrs		*csa;
	jnl_private_control	*jpc;
	jnl_create_info		create;
	char			prev_jnl_fn[JNL_NAME_SIZE];

	csa = &FILE_INFO(reg)->s_addrs;
	jpc = csa->jnl;
	assert((ERR_JNLFILOPN != sts) && (NOJNL != jpc->channel) || (ERR_JNLFILOPN == sts) && (NOJNL == jpc->channel));
	if ((ERR_JNLFILOPN != sts) && (NOJNL != jpc->channel))
		F_CLOSE(jpc->channel);
	jpc->channel = NOJNL;
	jnl_send_oper(jpc, sts);
	/* attempt to create a new journal file */
	memset(&create, 0, sizeof(create));
	create.status = create.status2 = SS_NORMAL;
	create.prev_jnl = &prev_jnl_fn[0];
	set_jnl_info(reg, &create);
	create.no_prev_link = TRUE;
	create.no_rename = FALSE;
	if (!jgbl.forw_phase_recovery)
		JNL_SHORT_TIME(jgbl.gbl_jrec_time);	/* needed for cre_jnl_file() */
	/* else mur_output_record() would have already set jgbl.gbl_jrec_time */
	assert(jgbl.gbl_jrec_time);
	if (EXIT_NRM != cre_jnl_file(&create))
	{
		jpc->status = create.status;
		jpc->status2 = create.status2;
		return ERR_JNLINVALID;
	} else
	{
		jpc->status = SS_NORMAL;
		sts = 0;
	}
	send_msg(VARLSTCNT(6) ERR_PREVJNLLINKCUT, 4, JNL_LEN_STR(csa->hdr), DB_LEN_STR(reg));
	assert(csa->hdr->jnl_file_len == create.jnl_len);
	assert(0 == memcmp(csa->hdr->jnl_file_name, create.jnl, create.jnl_len));
	return sts;
}
void op_gvincr(mval *increment, mval *result)
{
	unsigned char	buff[MAX_ZWR_KEY_SZ], *end;
	DCL_THREADGBL_ACCESS;

	SETUP_THREADGBL_ACCESS;
	/* If the specified var name is global ^%Y*, the name is illegal to use in a SET or KILL command;
	 * only GETs are allowed.
	 */
	if ((RESERVED_NAMESPACE_LEN <= gv_currkey->end) && (0 == MEMCMP_LIT(gv_currkey->base, RESERVED_NAMESPACE)))
		rts_error_csa(CSA_ARG(NULL) VARLSTCNT(1) ERR_PCTYRESERVED);
	if (gv_cur_region->read_only)
		rts_error_csa(CSA_ARG(NULL) VARLSTCNT(4) ERR_DBPRIVERR, 2, DB_LEN_STR(gv_cur_region));
	if ((TREF(gv_last_subsc_null) || TREF(gv_some_subsc_null)) && (ALWAYS != gv_cur_region->null_subs))
		sgnl_gvnulsubsc();
	assert(gv_currkey->end + 1 <= gv_cur_region->max_key_size);
	MV_FORCE_NUM(increment);
	switch (gv_cur_region->dyn.addr->acc_meth)
	{
	case dba_bg:
	case dba_mm:
		gvcst_incr(increment, result);
		break;
	case dba_cm:
		gvcmx_increment(increment, result);
		break;
	case dba_usr:
		/* $INCR not supported for DDP/USR access method */
		if (0 == (end = format_targ_key(buff, MAX_ZWR_KEY_SZ, gv_currkey, TRUE)))
			end = &buff[MAX_ZWR_KEY_SZ - 1];
		rts_error_csa(CSA_ARG(NULL) VARLSTCNT(10) ERR_UNIMPLOP, 0,
			ERR_TEXT, 2, LEN_AND_LIT("GTCM DDP server does not support $INCREMENT"),
			ERR_GVIS, 2, end - buff, buff,
			ERR_TEXT, 2, REG_LEN_STR(gv_cur_region));
		break;
	default:
		assertpro(FALSE);
	}
	assert(MV_DEFINED(result));
}
void op_gvkill(void)
{
	gd_region	*reg;
	DCL_THREADGBL_ACCESS;

	SETUP_THREADGBL_ACCESS;
	if (gv_cur_region->read_only)
		rts_error(VARLSTCNT(4) ERR_DBPRIVERR, 2, DB_LEN_STR(gv_cur_region));
	if (TREF(gv_last_subsc_null) && NEVER == gv_cur_region->null_subs)
		sgnl_gvnulsubsc();
	if (gv_cur_region->dyn.addr->acc_meth == dba_bg || gv_cur_region->dyn.addr->acc_meth == dba_mm)
	{
		if (IS_OK_TO_INVOKE_GVCST_KILL(gv_target))
			gvcst_kill(TRUE);
	} else if (gv_cur_region->dyn.addr->acc_meth == dba_cm)
		gvcmx_kill(TRUE);
	else
		gvusr_kill(TRUE);
	if (gv_cur_region->dyn.addr->repl_list)
	{
		gv_replication_error = gv_replopen_error;
		gv_replopen_error = FALSE;
		reg = gv_cur_region;
		while (gv_cur_region = gv_cur_region->dyn.addr->repl_list)	/* set replicated segments */
		{
			if (gv_cur_region->open)
			{
				change_reg();
				kill_var();
			} else
				gv_replication_error = TRUE;
		}
		gv_cur_region = reg;
		change_reg();
		if (gv_replication_error)
			sgnl_gvreplerr();
	}
}
block_id dse_getblk(char *element, boolean_t nobml, boolean_t carry_curr)
{
	block_id	blk;

	if (!cli_get_hex(element, (uint4 *)&blk))
		blk = patch_curr_blk;
	else
		CLEAR_DSE_COMPRESS_KEY;
	if ((blk < 0) || (blk >= cs_addrs->ti->total_blks))
	{
		gtm_putmsg_csa(CSA_ARG(NULL) VARLSTCNT(6) ERR_BLKINVALID, 4, blk, DB_LEN_STR(gv_cur_region),
			cs_addrs->ti->total_blks);
		return BADDSEBLK;
	}
	if (nobml && IS_BITMAP_BLK(blk))
	{
		gtm_putmsg_csa(CSA_ARG(NULL) VARLSTCNT(1) ERR_CANTBITMAP);
		return BADDSEBLK;
	}
	if (carry_curr)
		patch_curr_blk = blk;
	return blk;
}
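/* [Illustrative sketch, not part of the sources above] A typical DSE command handler parses its
 * block qualifier through dse_getblk(); "BLOCK" is an assumed qualifier name for illustration.
 * nobml = TRUE rejects bitmap blocks; carry_curr = TRUE makes the result the new current block.
 */
static void dse_cmd_sketch(void)
{
	block_id	blk;

	if (BADDSEBLK == (blk = dse_getblk("BLOCK", TRUE, TRUE)))
		return;	/* dse_getblk() already reported BLKINVALID or CANTBITMAP */
	/* ... operate on blk ... */
}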
uint4 gdsfilext(uint4 blocks, uint4 filesize, boolean_t trans_in_prog)
{
	sm_uc_ptr_t		old_base[2], mmap_retaddr;
	boolean_t		was_crit, is_mm;
	int			result, save_errno, status;
	DEBUG_ONLY(int		first_save_errno);
	uint4			new_bit_maps, bplmap, map, new_blocks, new_total, max_tot_blks, old_total;
	uint4			jnl_status;
	gtm_uint64_t		avail_blocks, mmap_sz;
	off_t			new_eof, new_size;
	trans_num		curr_tn;
	unix_db_info		*udi;
	inctn_opcode_t		save_inctn_opcode;
	int4			prev_extend_blks_to_upgrd;
	jnl_private_control	*jpc;
	jnl_buffer_ptr_t	jbp;
	cache_rec_ptr_t		cr;
	DCL_THREADGBL_ACCESS;

	SETUP_THREADGBL_ACCESS;
	assert(!IS_DSE_IMAGE);
	assert((cs_addrs->nl == NULL) || (process_id != cs_addrs->nl->trunc_pid));	/* mu_truncate shouldn't extend file... */
	assert(!process_exiting);
	DEBUG_ONLY(old_base[0] = old_base[1] = NULL);
	assert(!gv_cur_region->read_only);
	udi = FILE_INFO(gv_cur_region);
	is_mm = (dba_mm == cs_addrs->hdr->acc_meth);
#	if !defined(MM_FILE_EXT_OK)
	if (!udi->grabbed_access_sem && is_mm)
		return (uint4)(NO_FREE_SPACE);	/* should this be changed to show extension not allowed? */
#	endif
	/* Both blocks and total blocks are unsigned ints, so make sure we aren't asking for huge numbers that will
	 * overflow and end up doing silly things.
	 */
	assert((blocks <= (MAXTOTALBLKS(cs_data) - cs_data->trans_hist.total_blks)) || WBTEST_ENABLED(WBTEST_FILE_EXTEND_ERROR));
#	if defined(__sun) || defined(__hpux)
	cs_data->defer_allocate = TRUE;
#	endif
	if (!blocks && (cs_data->defer_allocate || (TRANS_IN_PROG_TRUE == trans_in_prog)))
		return (uint4)(NO_FREE_SPACE);	/* should this be changed to show extension not enabled? */
	bplmap = cs_data->bplmap;
	/* The new total of non-bitmap blocks will be the number of current non-bitmap blocks plus the new blocks
	 * desired. There are (bplmap - 1) non-bitmap blocks per bitmap, so add (bplmap - 2) to the number of non-bitmap
	 * blocks and divide by (bplmap - 1) to get the total number of bitmaps for the expanded database. (We must round
	 * up in this manner as every non-bitmap block must have an associated bitmap.)
	 * The current number of bitmaps is (total number of current blocks + bplmap - 1) / bplmap.
	 * Subtract the current number of bitmaps from the number needed for the expanded database to get the number
	 * of new bitmaps needed.
	 */
	new_bit_maps = DIVIDE_ROUND_UP(cs_data->trans_hist.total_blks
			- DIVIDE_ROUND_UP(cs_data->trans_hist.total_blks, bplmap) + blocks, bplmap - 1)
		- DIVIDE_ROUND_UP(cs_data->trans_hist.total_blks, bplmap);
	new_blocks = blocks + new_bit_maps;
	assert((0 < (int)new_blocks) || (!cs_data->defer_allocate && (0 == new_blocks)));
	if (new_blocks + cs_data->trans_hist.total_blks > MAXTOTALBLKS(cs_data))
	{
		assert(WBTEST_ENABLED(WBTEST_FILE_EXTEND_ERROR));
		send_msg_csa(CSA_ARG(cs_addrs) VARLSTCNT(1) ERR_TOTALBLKMAX);
		return (uint4)(NO_FREE_SPACE);
	}
	if (0 != (save_errno = disk_block_available(udi->fd, &avail_blocks, FALSE)))
	{
		send_msg_csa(CSA_ARG(cs_addrs) VARLSTCNT(5) ERR_DBFILERR, 2, DB_LEN_STR(gv_cur_region), save_errno);
		rts_error_csa(CSA_ARG(cs_addrs) VARLSTCNT(5) ERR_DBFILERR, 2, DB_LEN_STR(gv_cur_region), save_errno);
	} else
	{
		if (!(gtmDebugLevel & GDL_IgnoreAvailSpace))
		{	/* Bypass this space check if the debug flag above is on. Allows us to create a large sparse DB
			 * in space it could never fit in if it weren't sparse. Needed for some tests.
			 */
			avail_blocks = avail_blocks / (cs_data->blk_size / DISK_BLOCK_SIZE);
			if ((blocks * EXTEND_WARNING_FACTOR) > avail_blocks)
			{
				if (blocks > (uint4)avail_blocks)
				{
					if (!INST_FREEZE_ON_NOSPC_ENABLED(cs_addrs))
						return (uint4)(NO_FREE_SPACE);
					else
						send_msg_csa(CSA_ARG(cs_addrs) VARLSTCNT(6)
							MAKE_MSG_WARNING(ERR_NOSPACEEXT), 4,
							DB_LEN_STR(gv_cur_region), new_blocks, (uint4)avail_blocks);
				} else
					send_msg_csa(CSA_ARG(cs_addrs) VARLSTCNT(5) ERR_DSKSPACEFLOW, 3,
						DB_LEN_STR(gv_cur_region),
						(uint4)(avail_blocks - ((new_blocks <= avail_blocks) ? new_blocks : 0)));
			}
		}
	}
#	ifdef DEBUG
	if (WBTEST_ENABLED(WBTEST_MM_CONCURRENT_FILE_EXTEND) && dollar_tlevel && !MEMCMP_LIT(gv_cur_region->rname, "DEFAULT"))
	{
		SYSTEM("$gtm_dist/mumps -run $gtm_wbox_mrtn");
		assert(1 == cs_addrs->nl->wbox_test_seq_num);	/* should have been set by mubfilcpy */
		cs_addrs->nl->wbox_test_seq_num = 2;	/* signal mupip backup to stop sleeping in mubfilcpy */
	}
#	endif
	/* From here on, we need to use GDSFILEXT_CLNUP before returning to the caller */
	was_crit = cs_addrs->now_crit;
	assert(!cs_addrs->hold_onto_crit || was_crit);
	/* If we are coming from mupip_extend (which gets crit itself) we better have waited for any unfreezes to occur.
	 * If we are coming from online rollback (when that feature is available), we will come in holding crit and in
	 * the final retry. In that case too, we expect to have waited for unfreezes to occur in the caller itself.
	 * Therefore if we are coming in holding crit from MUPIP, we expect the db to be unfrozen, so there is no need
	 * to wait for a freeze.
	 * If we are coming from GT.M and the final retry (in which case we come in holding crit) we expect to have
	 * waited for any unfreezes (by invoking tp_crit_all_regions) to occur (TP or non-TP) before coming into this
	 * function. However, there is one exception. In the final retry, if tp_crit_all_regions notices that
	 * at least one of the participating regions did ONLY READs, it will not wait for any freeze on THAT region
	 * to complete before grabbing crit. Later, in the final retry, if THAT region did an update which caused
	 * op_tcommit to invoke bm_getfree->gdsfilext, then we would have come here with a frozen region on which
	 * we hold crit.
	 */
	assert(!was_crit || !FROZEN_HARD(cs_data) || (dollar_tlevel && (CDB_STAGNATE <= t_tries)));
	/* If we are in the final retry and already hold crit, it is possible that csa->nl->wc_blocked is also set to TRUE
	 * (by a concurrent process in phase2 which encountered an error in the midst of commit and secshr_db_clnup
	 * finished the job for it). In this case we do NOT want to invoke wcs_recover as that will update the "bt"
	 * transaction numbers without correspondingly updating the history transaction numbers (effectively causing
	 * a cdb_sc_blkmod type of restart). Therefore do NOT call grab_crit (which unconditionally invokes wcs_recover)
	 * if we already hold crit.
	 */
	if (!was_crit)
	{
		for ( ; ; )
		{
			grab_crit(gv_cur_region);
			if (FROZEN_CHILLED(cs_data))
				DO_CHILLED_AUTORELEASE(cs_addrs, cs_data);
			if (!FROZEN(cs_data) && !IS_REPL_INST_FROZEN)
				break;
			rel_crit(gv_cur_region);
			while (FROZEN(cs_data) || IS_REPL_INST_FROZEN)
			{
				hiber_start(1000);
				if (FROZEN_CHILLED(cs_data) && CHILLED_AUTORELEASE(cs_data))
					break;
			}
		}
	} else if (FROZEN_HARD(cs_data) && dollar_tlevel)
	{	/* We don't want to continue with the file extension as explained above. Hence return with an error code
		 * which op_tcommit will recognize (as a cdb_sc_needcrit/cdb_sc_instancefreeze type of restart) and restart
		 * accordingly.
		 */
		assert(CDB_STAGNATE <= t_tries);
		GDSFILEXT_CLNUP;
		return (uint4)FINAL_RETRY_FREEZE_PROG;
	} else
		WAIT_FOR_REGION_TO_UNCHILL(cs_addrs, cs_data);
	if (IS_REPL_INST_FROZEN && trans_in_prog)
	{
		assert(CDB_STAGNATE <= t_tries);
		GDSFILEXT_CLNUP;
		return (uint4)FINAL_RETRY_INST_FREEZE;
	}
	assert(cs_addrs->ti->total_blks == cs_data->trans_hist.total_blks);
	old_total = cs_data->trans_hist.total_blks;
	if (old_total != filesize)
	{	/* Somebody else has already extended it; since we are in crit, this is trustworthy. However, in case of
		 * MM, we still need to remap the database.
		 */
		assert((old_total > filesize) || !is_mm);
		/* For BG, someone else could have truncated or extended - we have no idea */
		GDSFILEXT_CLNUP;
		return (SS_NORMAL);
	}
	if (trans_in_prog && SUSPICIOUS_EXTEND)
	{
		if (!was_crit)
		{
			GDSFILEXT_CLNUP;
			return (uint4)(EXTEND_SUSPECT);
		}
		/* If the free_blocks counter is not ok, then correct it. Do the check again. If it still fails, then it
		 * means we held crit through bm_getfree into gdsfilext and still didn't get it right.
		 */
		assertpro(!is_free_blks_ctr_ok() && !SUSPICIOUS_EXTEND);
	}
	if (JNL_ENABLED(cs_data))
	{
		if (!jgbl.dont_reset_gbl_jrec_time)
			SET_GBL_JREC_TIME;	/* needed before jnl_ensure_open as that can write jnl records */
		jpc = cs_addrs->jnl;
		jbp = jpc->jnl_buff;
		/* Before writing to the jnlfile, adjust jgbl.gbl_jrec_time if needed to maintain the time order
		 * of jnl records. This needs to be done BEFORE the jnl_ensure_open as that could write
		 * journal records (if it decides to switch to a new journal file).
		 */
		ADJUST_GBL_JREC_TIME(jgbl, jbp);
		jnl_status = jnl_ensure_open(gv_cur_region, cs_addrs);
		if (jnl_status)
		{
			GDSFILEXT_CLNUP;
			send_msg_csa(CSA_ARG(cs_addrs) VARLSTCNT(6) jnl_status, 4, JNL_LEN_STR(cs_data),
				DB_LEN_STR(gv_cur_region));
			return (uint4)(NO_FREE_SPACE);	/* should have a better return status */
		}
	}
	if (is_mm)
	{
		cs_addrs->nl->mm_extender_pid = process_id;
		status = wcs_wtstart(gv_cur_region, 0, NULL, NULL);
		cs_addrs->nl->mm_extender_pid = 0;
		assertpro(SS_NORMAL == status);
		old_base[0] = cs_addrs->db_addrs[0];
		old_base[1] = cs_addrs->db_addrs[1];
		cs_addrs->db_addrs[0] = NULL;	/* don't rely on it until the mmap below */
#		ifdef _AIX
		status = shmdt(old_base[0] - BLK_ZERO_OFF(cs_data->start_vbn));
#		else
		status = munmap((caddr_t)old_base[0], (size_t)(old_base[1] - old_base[0]));
#		endif
		if (0 != status)
		{
			save_errno = errno;
			GDSFILEXT_CLNUP;
			send_msg_csa(CSA_ARG(cs_addrs) VARLSTCNT(12) ERR_DBFILERR, 2, DB_LEN_STR(gv_cur_region),
				ERR_SYSCALL, 5, LEN_AND_STR(MEM_UNMAP_SYSCALL), CALLFROM, save_errno);
			return (uint4)(NO_FREE_SPACE);
		}
	} else
	{	/* Due to concurrency issues, it is possible some process had issued a disk read of the GDS block#
		 * corresponding to "old_total" right after a truncate wrote a GDS-block of zeros on disk (to signal the
		 * end of the db file). If so, the global buffer containing this block needs to be invalidated now as part
		 * of the extend. If not, it is possible the EOF block on disk is now going to be overwritten by a properly
		 * initialized bitmap block (as part of the gdsfilext below) while the global buffer continues to have an
		 * incorrect copy of that bitmap block, and this in turn would cause XXXX failures due to a bad bitmap
		 * block in shared memory. (GTM-7519)
		 */
		cr = db_csh_get((block_id)old_total);
		if ((NULL != cr) && ((cache_rec_ptr_t)CR_NOTVALID != cr))
		{
			assert((0 == cr->dirty) && (0 == cr->bt_index) && !cr->stopped);
			cr->cycle++;
			cr->blk = CR_BLKEMPTY;
		}
	}
	CHECK_TN(cs_addrs, cs_data, cs_data->trans_hist.curr_tn);	/* can issue rts_error TNTOOLARGE */
	new_total = old_total + new_blocks;
	new_eof = BLK_ZERO_OFF(cs_data->start_vbn) + ((off_t)new_total * cs_data->blk_size);
#	if !defined(__sun) && !defined(__hpux)
	if (!cs_data->defer_allocate)
	{
		new_size = new_eof + cs_data->blk_size;
		save_errno = posix_fallocate(udi->fd, 0, new_size);
		DEBUG_ONLY(first_save_errno = save_errno);
		if ((ENOSPC == save_errno) && IS_GTM_IMAGE)
			save_errno = extend_wait_for_fallocate(udi, new_size);
		if (0 != save_errno)
		{
			GDSFILEXT_CLNUP;
			assert(ENOSPC == save_errno);
			if (ENOSPC != save_errno)
				send_msg_csa(CSA_ARG(cs_addrs) VARLSTCNT(5) ERR_PREALLOCATEFAIL, 2,
					DB_LEN_STR(gv_cur_region), save_errno);
			return (uint4)(NO_FREE_SPACE);
		}
	}
#	endif
	save_errno = db_write_eof_block(udi, udi->fd, cs_data->blk_size, new_eof, &(TREF(dio_buff)));
	if ((ENOSPC == save_errno) && IS_GTM_IMAGE)
		save_errno = extend_wait_for_write(udi, cs_data->blk_size, new_eof);
	if (0 != save_errno)
	{
		GDSFILEXT_CLNUP;
		if (ENOSPC != save_errno)
			send_msg_csa(CSA_ARG(cs_addrs) VARLSTCNT(5) ERR_DBFILERR, 2, DB_LEN_STR(gv_cur_region), save_errno);
		return (uint4)(NO_FREE_SPACE);
	}
	if (WBTEST_ENABLED(WBTEST_FILE_EXTEND_INTERRUPT_1))
	{
		LONG_SLEEP(600);
		assert(FALSE);
	}
	/* Ensure the EOF and metadata get to disk BEFORE any bitmap writes. Otherwise, the file size could no longer
	 * reflect a proper extent and subsequent invocations of gdsfilext could corrupt the database.
	 */
	if (!IS_STATSDB_CSA(cs_addrs))
	{
		GTM_DB_FSYNC(cs_addrs, udi->fd, status);
		assert(0 == status);
		if (0 != status)
		{
			GDSFILEXT_CLNUP;
			send_msg_csa(CSA_ARG(cs_addrs) VARLSTCNT(8) ERR_DBFILERR, 5,
				RTS_ERROR_LITERAL("fsync1()"), CALLFROM, status);
			return (uint4)(NO_FREE_SPACE);
		}
	}
	if (WBTEST_ENABLED(WBTEST_FILE_EXTEND_INTERRUPT_2))
	{
		LONG_SLEEP(600);
		assert(FALSE);	/* should be killed before that */
	}
	DEBUG_ONLY(prev_extend_blks_to_upgrd = cs_data->blks_to_upgrd;)
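/* [Illustrative sketch, not part of the sources above] The new_bit_maps arithmetic in gdsfilext()
 * is easy to sanity-check standalone. With the usual 512-block bitmap spacing (bplmap = 512),
 * extending a 1000-block file by 2000 non-bitmap blocks needs 4 additional bitmaps:
 * ceil(2998/511) = 6 total minus the 2 that already exist.
 */
#include <stdio.h>

#define DIVIDE_ROUND_UP(x, y)	(((x) + (y) - 1) / (y))

int main(void)
{
	unsigned int	bplmap = 512;		/* blocks per local map (typical GDS value) */
	unsigned int	total_blks = 1000;	/* current total, bitmaps included */
	unsigned int	blocks = 2000;		/* non-bitmap blocks requested */
	unsigned int	cur_maps, new_bit_maps;

	cur_maps = DIVIDE_ROUND_UP(total_blks, bplmap);
	/* every (bplmap - 1) non-bitmap blocks need one bitmap; round up */
	new_bit_maps = DIVIDE_ROUND_UP(total_blks - cur_maps + blocks, bplmap - 1) - cur_maps;
	printf("new bitmaps needed: %u\n", new_bit_maps);	/* prints 4 */
	return 0;
}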
void lke_show(void)
{
	bool			locks, all = TRUE, wait = TRUE, interactive = FALSE, match = FALSE, memory = TRUE,
				nocrit = TRUE;
	boolean_t		exact = FALSE, was_crit;
	int4			pid;
	size_t			ls_len;
	int			n;
	char			regbuf[MAX_RN_LEN], nodebuf[32], one_lockbuf[MAX_KEY_SZ];
	mlk_ctldata_ptr_t	ctl;
	mstr			reg, node, one_lock;
	int			shr_sub_len = 0;
	float			ls_free = 0;	/* free space in the bottleneck subspace */

	/* Get all command parameters */
	reg.addr = regbuf;
	reg.len = SIZEOF(regbuf);
	node.addr = nodebuf;
	node.len = SIZEOF(nodebuf);
	one_lock.addr = one_lockbuf;
	one_lock.len = SIZEOF(one_lockbuf);
	if (lke_getcli(&all, &wait, &interactive, &pid, &reg, &node, &one_lock, &memory, &nocrit, &exact) == 0)
		return;
	/* Search all regions specified on the command line */
	for (gv_cur_region = gd_header->regions, n = 0; n != gd_header->n_regions; ++gv_cur_region, ++n)
	{	/* If the region matches and is open */
		if ((reg.len == 0
				|| gv_cur_region->rname_len == reg.len && memcmp(gv_cur_region->rname, reg.addr, reg.len) == 0)
			&& gv_cur_region->open)
		{
			match = TRUE;
			util_out_print("!/!AD!/", NOFLUSH, REG_LEN_STR(gv_cur_region));
			/* If distributed database, the region is located on another node */
			if (gv_cur_region->dyn.addr->acc_meth == dba_cm)
			{
#				if defined(LKE_WORKS_OK_WITH_CM)
				/* Obtain lock info from the remote node */
				locks = gtcmtr_lke_showreq(gv_cur_region->dyn.addr->cm_blk, gv_cur_region->cmx_regnum,
					all, wait, pid, &node);
#				else
				gtm_putmsg(VARLSTCNT(10) ERR_UNIMPLOP, 0, ERR_TEXT, 2,
					LEN_AND_LIT("GT.CM region - locks must be displayed on the local node"),
					ERR_TEXT, 2, REG_LEN_STR(gv_cur_region));
				continue;
#				endif
			} else if (gv_cur_region->dyn.addr->acc_meth == dba_bg
				|| gv_cur_region->dyn.addr->acc_meth == dba_mm)
			{	/* Local region */
				cs_addrs = &FILE_INFO(gv_cur_region)->s_addrs;
				ls_len = (size_t)(cs_addrs->lock_addrs[1] - cs_addrs->lock_addrs[0]);
				ctl = (mlk_ctldata_ptr_t)malloc(ls_len);
				/* Prevent any modification of the lock space while we make a local copy of it */
				if (cs_addrs->critical != NULL)
					crash_count = cs_addrs->critical->crashcnt;
				was_crit = cs_addrs->now_crit;
				if (!nocrit && !was_crit)
					grab_crit(gv_cur_region);
				longcpy((uchar_ptr_t)ctl, (uchar_ptr_t)cs_addrs->lock_addrs[0], ls_len);
				assert((ctl->max_blkcnt > 0) && (ctl->max_prccnt > 0) && ((ctl->subtop - ctl->subbase) > 0));
				if (!nocrit && !was_crit)
					rel_crit(gv_cur_region);
				shr_sub_len = 0;
				locks = ctl->blkroot == 0
					? FALSE
					: lke_showtree(NULL, (mlk_shrblk_ptr_t)R2A(ctl->blkroot), all, wait, pid,
						one_lock, memory, &shr_sub_len);
				/* Lock space usage consists of: control block + nodes (locks) + processes + substrings.
				 * Any of those subspaces can be the bottleneck, so we report the subspace that is
				 * closest to running out.
				 */
				ls_free = MIN(((float)ctl->blkcnt) / ctl->max_blkcnt, ((float)ctl->prccnt) / ctl->max_prccnt);
				ls_free = MIN(1 - (((float)shr_sub_len) / (ctl->subtop - ctl->subbase)), ls_free);
				ls_free *= 100;	/* scale to [0-100] range (couldn't do this inside util_out_print) */
				if (ls_free < 1)	/* No memory? Notify the user. */
					gtm_putmsg(VARLSTCNT(4) ERR_LOCKSPACEFULL, 2, DB_LEN_STR(gv_cur_region));
				if (ls_free < 1 || memory)
				{
					if (ctl->subtop > ctl->subfree)
						gtm_putmsg(VARLSTCNT(10) ERR_LOCKSPACEINFO, 8, REG_LEN_STR(gv_cur_region),
							(ctl->max_prccnt - ctl->prccnt), ctl->max_prccnt,
							(ctl->max_blkcnt - ctl->blkcnt), ctl->max_blkcnt,
							LEN_AND_LIT(" not "));
					else
						gtm_putmsg(VARLSTCNT(10) ERR_LOCKSPACEINFO, 8, REG_LEN_STR(gv_cur_region),
							(ctl->max_prccnt - ctl->prccnt), ctl->max_prccnt,
							(ctl->max_blkcnt - ctl->blkcnt), ctl->max_blkcnt,
							LEN_AND_LIT(" "));
				}
				free(ctl);
			} else
			{
				gtm_putmsg(VARLSTCNT(2) ERR_BADREGION, 0);
				locks = TRUE;
			}
			if (!locks)
				gtm_putmsg(VARLSTCNT(4) ERR_NOLOCKMATCH, 2, REG_LEN_STR(gv_cur_region));
			assert((ls_free <= 100) && (ls_free >= 0));
			gtm_putmsg(VARLSTCNT(4) ERR_LOCKSPACEUSE, 2, ((int)ls_free),
				cs_addrs->hdr->lock_space_size / OS_PAGELET_SIZE);
		}
	}
	if (!match && reg.len != 0)
		rts_error(VARLSTCNT(4) ERR_NOREGION, 2, reg.len, reg.addr);
}
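/* [Illustrative sketch, not part of the sources above] Standalone model of the "report the
 * scarcest subspace" computation in lke_show(), with made-up counters standing in for the
 * mlk_ctldata fields.
 */
#include <stdio.h>

#define MIN(a, b)	((a) < (b) ? (a) : (b))

int main(void)
{
	float	blkcnt = 20, max_blkcnt = 100;		/* free vs. total lock-node slots */
	float	prccnt = 5, max_prccnt = 50;		/* free vs. total process slots */
	float	sub_used = 900, sub_total = 1000;	/* substring space consumed vs. total */
	float	ls_free;

	ls_free = MIN(blkcnt / max_blkcnt, prccnt / max_prccnt);
	ls_free = MIN(1 - sub_used / sub_total, ls_free);
	printf("lock space free: %d%%\n", (int)(ls_free * 100));	/* 10%: the scarcest subspace wins */
	return 0;
}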
uint4 jnl_file_lost(jnl_private_control *jpc, uint4 jnl_stat)
{	/* Notify operator and terminate journaling */
	unsigned int	status;
	sgmnt_addrs	*csa;
	seq_num		reg_seqno, jnlseqno;
	boolean_t	was_lockid = FALSE, instfreeze_environ;
	DCL_THREADGBL_ACCESS;

	SETUP_THREADGBL_ACCESS;
	switch(jpc->region->dyn.addr->acc_meth)
	{
	case dba_mm:
	case dba_bg:
		csa = &FILE_INFO(jpc->region)->s_addrs;
		break;
	default:
		assertpro(FALSE && jpc->region->dyn.addr->acc_meth);
	}
#	ifdef VMS
	/* The following assert has been removed as it could be FALSE if the caller is "jnl_file_extend"
	 *	assert(0 != memcmp(csa->nl->jnl_file.jnl_file_id.fid, zero_fid, SIZEOF(zero_fid)));
	 */
#	endif
	assert(csa->now_crit);
	/* We issue an rts_error (instead of shutting off journaling) in the following cases: {BYPASSOK}
	 * 1) $gtm_error_on_jnl_file_lost is set to issue a runtime error (if not already issued) in case of
	 *    journaling issues.
	 * 2) The process has the given message set in $gtm_custom_errors (indicative of an instance-freeze-on-error
	 *    setup), in which case the goal is to never shut off journaling.
	 */
	UNIX_ONLY(assert(jnlpool.jnlpool_ctl == jnlpool_ctl));
	UNIX_ONLY(instfreeze_environ = INST_FREEZE_ON_MSG_ENABLED(csa, jnl_stat));
	VMS_ONLY(instfreeze_environ = FALSE);
	if ((JNL_FILE_LOST_ERRORS == TREF(error_on_jnl_file_lost)) || instfreeze_environ)
	{
		VMS_ONLY(assert(FALSE));	/* not fully implemented / supported on VMS */
		if (!process_exiting || instfreeze_environ || !csa->jnl->error_reported)
		{
			csa->jnl->error_reported = TRUE;
			in_wcs_recover = FALSE;	/* in case we're called in wcs_recover() */
			if (SS_NORMAL != jpc->status)
				rts_error_csa(CSA_ARG(csa) VARLSTCNT(7) jnl_stat, 4, JNL_LEN_STR(csa->hdr),
					DB_LEN_STR(gv_cur_region), jpc->status);
			else
				rts_error_csa(CSA_ARG(csa) VARLSTCNT(6) jnl_stat, 4, JNL_LEN_STR(csa->hdr),
					DB_LEN_STR(gv_cur_region));
		}
		return jnl_stat;
	}
	if (0 != jnl_stat)
		jnl_send_oper(jpc, jnl_stat);
	csa->hdr->jnl_state = jnl_closed;
	jpc->jnl_buff->cycle++;	/* increment shared cycle so all future callers of jnl_ensure_open recognize journal switch */
	assert(jpc->cycle < jpc->jnl_buff->cycle);
	if (REPL_ENABLED(csa->hdr))
	{
		csa->hdr->repl_state = repl_was_open;
		reg_seqno = csa->hdr->reg_seqno;
		jnlseqno = (NULL != jnlpool.jnlpool_ctl) ? jnlpool.jnlpool_ctl->jnl_seqno : MAX_SEQNO;
		send_msg_csa(CSA_ARG(csa) VARLSTCNT(8) ERR_REPLJNLCLOSED, 6, DB_LEN_STR(jpc->region), &reg_seqno,
			&reg_seqno, &jnlseqno, &jnlseqno);
	} else
		send_msg_csa(CSA_ARG(csa) VARLSTCNT(5) ERR_JNLCLOSED, 3, DB_LEN_STR(jpc->region), &csa->ti->curr_tn);
#	ifdef VMS
	/* We can get a jnl_file_lost before the file is even created, so locking is done only if the lock exists */
	if (0 != csa->jnl->jnllsb->lockid)
	{
		was_lockid = TRUE;
		status = gtm_enqw(EFN$C_ENF, LCK$K_EXMODE, csa->jnl->jnllsb, LCK$M_CONVERT | LCK$M_NODLCKBLK,
			NULL, 0, NULL, 0, NULL, PSL$C_USER, 0);
		if (SS$_NORMAL == status)
			status = csa->jnl->jnllsb->cond;
	}
	jnl_file_close(jpc->region, FALSE, FALSE);
	if (was_lockid)
	{
		if (SS$_NORMAL == status)
			status = gtm_deq(csa->jnl->jnllsb->lockid, NULL, PSL$C_USER, 0);
		assertpro(SS$_NORMAL == status);
	}
#	else
	jnl_file_close(jpc->region, FALSE, FALSE);
#	endif
	return EXIT_NRM;
}
void db_init(gd_region *reg, sgmnt_data_ptr_t tsd)
{
	static boolean_t	mutex_init_done = FALSE;
	boolean_t		is_bg, read_only;
	char			machine_name[MAX_MCNAMELEN];
	file_control		*fc;
	int			gethostname_res, stat_res, mm_prot;
	int4			status, semval, dblksize, fbwsize;
	sm_long_t		status_l;
	sgmnt_addrs		*csa;
	sgmnt_data_ptr_t	csd;
	struct sembuf		sop[3];
	struct stat		stat_buf;
	union semun		semarg;
	struct semid_ds		semstat;
	struct shmid_ds		shmstat;
	struct statvfs		dbvfs;
	uint4			sopcnt;
	unix_db_info		*udi;
#ifdef periodic_timer_removed
	void			periodic_flush_check();
#endif

	error_def(ERR_CLSTCONFLICT);
	error_def(ERR_CRITSEMFAIL);
	error_def(ERR_DBNAMEMISMATCH);
	error_def(ERR_DBIDMISMATCH);
	error_def(ERR_NLMISMATCHCALC);
	error_def(ERR_REQRUNDOWN);
	error_def(ERR_SYSCALL);

	assert(tsd->acc_meth == dba_bg || tsd->acc_meth == dba_mm);
	is_bg = (dba_bg == tsd->acc_meth);
	read_only = reg->read_only;
	new_dbinit_ipc = FALSE;	/* we did not create a new ipc resource */
	udi = FILE_INFO(reg);
	memset(machine_name, 0, sizeof(machine_name));
	if (GETHOSTNAME(machine_name, MAX_MCNAMELEN, gethostname_res))
		rts_error(VARLSTCNT(5) ERR_TEXT, 2, LEN_AND_LIT("Unable to get the hostname"), errno);
	assert(strlen(machine_name) < MAX_MCNAMELEN);
	csa = &udi->s_addrs;
	csa->db_addrs[0] = csa->db_addrs[1] = csa->lock_addrs[0] = NULL;	/* to help in dbinit_ch and gds_rundown */
	reg->opening = TRUE;
	/* Create the ftok semaphore for this region.
	 * We do not want the ftok counter semaphore to go to 2 for a mupip journal recover process.
	 */
	if (!ftok_sem_get(reg, !mupip_jnl_recover, GTM_ID, FALSE))
		rts_error(VARLSTCNT(4) ERR_DBFILERR, 2, DB_LEN_STR(reg));
	/* At this point we have the ftok_semid semaphore based on the ftok key.
	 * Any ftok-conflicted region will block at this point.
	 * Say a.dat and b.dat both have the same ftok, and we have process A accessing a.dat and
	 * process B accessing b.dat. In this case only one can continue to do db_init().
	 */
	fc = reg->dyn.addr->file_cntl;
	fc->file_type = reg->dyn.addr->acc_meth;
	fc->op = FC_READ;
	fc->op_buff = (sm_uc_ptr_t)tsd;
	fc->op_len = sizeof(*tsd);
	fc->op_pos = 1;
	dbfilop(fc);	/* read file header */
	udi->shmid = tsd->shmid;
	udi->semid = tsd->semid;
	udi->sem_ctime = tsd->sem_ctime.ctime;
	udi->shm_ctime = tsd->shm_ctime.ctime;
	dbsecspc(reg, tsd);	/* find db segment size */
	if (!mupip_jnl_recover)
	{
		if (INVALID_SEMID == udi->semid)
		{
			if (0 != udi->sem_ctime || INVALID_SHMID != udi->shmid || 0 != udi->shm_ctime)
				/* we must have something wrong in the protocol or the code, if this happens */
				GTMASSERT;
			/* Create a new semaphore using IPC_PRIVATE. The system guarantees a unique id. */
			if (-1 == (udi->semid = semget(IPC_PRIVATE, FTOK_SEM_PER_ID, RWDALL | IPC_CREAT)))
			{
				udi->semid = INVALID_SEMID;
				rts_error(VARLSTCNT(9) ERR_DBFILERR, 2, DB_LEN_STR(reg),
					ERR_TEXT, 2, LEN_AND_LIT("Error with database control semget"), errno);
			}
			udi->shmid = INVALID_SHMID;	/* reset shmid so dbinit_ch does not get confused in case we go there */
			new_dbinit_ipc = TRUE;
			tsd->semid = udi->semid;
			semarg.val = GTM_ID;
			/* The following will set semaphore number 2 (= FTOK_SEM_PER_ID - 1) to the value GTM_ID.
			 * In case we have an orphaned semaphore for some reason, mupip rundown will be
			 * able to identify GTM semaphores from the value and can remove them.
			 */
			if (-1 == semctl(udi->semid, FTOK_SEM_PER_ID - 1, SETVAL, semarg))
				rts_error(VARLSTCNT(9) ERR_DBFILERR, 2, DB_LEN_STR(reg),
					ERR_TEXT, 2, LEN_AND_LIT("Error with database control semctl SETVAL"), errno);
			/* Warning: We must read the sem_ctime using IPC_STAT after SETVAL, which changes it.
			 * We must NOT do any more SETVALs after this. Our design is to use
			 * sem_ctime as the creation time of the semaphore.
			 */
			semarg.buf = &semstat;
			if (-1 == semctl(udi->semid, FTOK_SEM_PER_ID - 1, IPC_STAT, semarg))
				rts_error(VARLSTCNT(9) ERR_DBFILERR, 2, DB_LEN_STR(reg),
					ERR_TEXT, 2, LEN_AND_LIT("Error with database control semctl IPC_STAT"), errno);
			tsd->sem_ctime.ctime = udi->sem_ctime = semarg.buf->sem_ctime;
		} else
		{
			if (INVALID_SHMID == udi->shmid)
				/* if mu_rndwn_file gets standalone access of this region and somehow the mupip
				 * process crashes, we can have semid != -1 but shmid == -1
				 */
				rts_error(VARLSTCNT(10) ERR_REQRUNDOWN, 4, DB_LEN_STR(reg),
					LEN_AND_STR(tsd->machine_name),
					ERR_TEXT, 2, LEN_AND_LIT("semid is valid but shmid is invalid"));
			semarg.buf = &semstat;
			if (-1 == semctl(udi->semid, 0, IPC_STAT, semarg))
				/* file header has a valid semid but the semaphore does not exist */
				rts_error(VARLSTCNT(6) ERR_REQRUNDOWN, 4, DB_LEN_STR(reg),
					LEN_AND_STR(tsd->machine_name));
			else if (semarg.buf->sem_ctime != tsd->sem_ctime.ctime)
				rts_error(VARLSTCNT(10) ERR_REQRUNDOWN, 4, DB_LEN_STR(reg),
					LEN_AND_STR(tsd->machine_name),
					ERR_TEXT, 2, LEN_AND_LIT("sem_ctime does not match"));
			if (-1 == shmctl(udi->shmid, IPC_STAT, &shmstat))
				rts_error(VARLSTCNT(9) ERR_DBFILERR, 2, DB_LEN_STR(reg),
					ERR_TEXT, 2, LEN_AND_LIT("Error with database control shmctl"), errno);
			else if (shmstat.shm_ctime != tsd->shm_ctime.ctime)
				rts_error(VARLSTCNT(10) ERR_REQRUNDOWN, 4, DB_LEN_STR(reg),
					LEN_AND_STR(tsd->machine_name),
					ERR_TEXT, 2, LEN_AND_LIT("shm_ctime does not match"));
		}
		/* We already have the ftok semaphore of this region, so just plainly do the semaphore operation. */
		/* This is the database access control semaphore for any region. */
		sop[0].sem_num = 0;	sop[0].sem_op = 0;	/* wait for 0 */
		sop[1].sem_num = 0;	sop[1].sem_op = 1;	/* lock */
		sopcnt = 2;
		if (!read_only)
		{
			sop[2].sem_num = 1;	sop[2].sem_op = 1;	/* increment r/w access counter */
			sopcnt = 3;
		}
		sop[0].sem_flg = sop[1].sem_flg = sop[2].sem_flg = SEM_UNDO | IPC_NOWAIT;
		SEMOP(udi->semid, sop, sopcnt, status);
		if (-1 == status)
		{
			errno_save = errno;
			gtm_putmsg(VARLSTCNT(4) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg));
			rts_error(VARLSTCNT(8) ERR_SYSCALL, 5, RTS_ERROR_LITERAL("semop()"), CALLFROM, errno_save);
		}
	} else	/* for mupip_jnl_recover we were already in mu_rndwn_file and got the "semid" semaphore */
	{
		if (INVALID_SEMID == udi->semid || 0 == udi->sem_ctime)
			/* make sure mu_rndwn_file() has reset the created semaphore for standalone access */
			GTMASSERT;
		if (INVALID_SHMID != udi->shmid || 0 != udi->shm_ctime)
			/* make sure mu_rndwn_file() has reset the shared memory */
			GTMASSERT;
		udi->shmid = INVALID_SHMID;	/* reset shmid so dbinit_ch does not get confused in case we go there */
		new_dbinit_ipc = TRUE;
	}
	sem_incremented = TRUE;
	if (new_dbinit_ipc)
	{	/* Create new shared memory using IPC_PRIVATE. The system guarantees a unique id. */
#ifdef __MVS__
		if (-1 == (status_l = udi->shmid = shmget(IPC_PRIVATE, ROUND_UP(reg->sec_size, MEGA_BOUND),
			__IPC_MEGA | IPC_CREAT | RWDALL)))
#else
		if (-1 == (status_l = udi->shmid = shmget(IPC_PRIVATE, reg->sec_size, RWDALL | IPC_CREAT)))
#endif
		{
			udi->shmid = status_l = INVALID_SHMID;
			rts_error(VARLSTCNT(9) ERR_DBFILERR, 2, DB_LEN_STR(reg),
				ERR_TEXT, 2, LEN_AND_LIT("Error with database shmget"), errno);
		}
		tsd->shmid = udi->shmid;
		if (-1 == shmctl(udi->shmid, IPC_STAT, &shmstat))
			rts_error(VARLSTCNT(9) ERR_DBFILERR, 2, DB_LEN_STR(reg),
				ERR_TEXT, 2, LEN_AND_LIT("Error with database control shmctl"), errno);
		tsd->shm_ctime.ctime = udi->shm_ctime = shmstat.shm_ctime;
	}
#ifdef DEBUG_DB64
	status_l = (sm_long_t)(csa->db_addrs[0] = (sm_uc_ptr_t)do_shmat(udi->shmid, next_smseg, SHM_RND));
	next_smseg = (sm_uc_ptr_t)ROUND_UP((sm_long_t)(next_smseg + reg->sec_size), SHMAT_ADDR_INCS);
#else
	status_l = (sm_long_t)(csa->db_addrs[0] = (sm_uc_ptr_t)do_shmat(udi->shmid, 0, SHM_RND));
#endif
	if (-1 == status_l)
		rts_error(VARLSTCNT(9) ERR_DBFILERR, 2, DB_LEN_STR(reg),
			ERR_TEXT, 2, LEN_AND_LIT("Error attaching to database shared memory"), errno);
	csa->nl = (node_local_ptr_t)csa->db_addrs[0];
	csa->critical = (mutex_struct_ptr_t)(csa->db_addrs[0] + NODE_LOCAL_SIZE);
	assert(((int)csa->critical & 0xf) == 0);	/* critical should be 16-byte aligned */
#ifdef CACHELINE_SIZE
	assert(0 == ((int)csa->critical & (CACHELINE_SIZE - 1)));
#endif
	/* Note: Here we check jnl_state from the database file, and its value cannot change without standalone access.
	 * The jnl_buff buffer should be initialized irrespective of read/write process.
	 */
	JNL_INIT(csa, reg, tsd);
	csa->backup_buffer = (backup_buff_ptr_t)(csa->db_addrs[0] + NODE_LOCAL_SPACE + JNL_SHARE_SIZE(tsd));
	csa->lock_addrs[0] = (sm_uc_ptr_t)csa->backup_buffer + BACKUP_BUFFER_SIZE + 1;
	csa->lock_addrs[1] = csa->lock_addrs[0] + LOCK_SPACE_SIZE(tsd) - 1;
	csa->total_blks = tsd->trans_hist.total_blks;	/* for test to see if the file has been extended */
	if (new_dbinit_ipc)
	{
		memset(csa->nl, 0, sizeof(*csa->nl));	/* we allocated shared storage -- we have to init it */
		if (JNL_ALLOWED(csa))
		{	/* Initialize jb->cycle to a value different from the initial value of jpc->cycle (0). Although
			 * this is not necessary right now, in the future the plan is to change jnl_ensure_open() to only
			 * do a cycle mismatch check in order to determine whether to call jnl_file_open() or not. This is
			 * in preparation for that.
			 */
			csa->jnl->jnl_buff->cycle = 1;
		}
	}
	if (is_bg)
		csd = csa->hdr = (sgmnt_data_ptr_t)(csa->lock_addrs[1] + 1 + CACHE_CONTROL_SIZE(tsd));
	else
	{
		csa->acc_meth.mm.mmblk_state = (mmblk_que_heads_ptr_t)(csa->lock_addrs[1] + 1);
		FSTAT_FILE(udi->fd, &stat_buf, stat_res);
		if (-1 == stat_res)
			rts_error(VARLSTCNT(5) ERR_DBFILERR, 2, DB_LEN_STR(reg), errno);
		mm_prot = read_only ? PROT_READ : (PROT_READ | PROT_WRITE);
#ifdef DEBUG_DB64
		if (-1 == (sm_long_t)(csa->db_addrs[0] = (sm_uc_ptr_t)mmap((caddr_t)get_mmseg((size_t)stat_buf.st_size),
			(size_t)stat_buf.st_size, mm_prot, GTM_MM_FLAGS, udi->fd, (off_t)0)))
			rts_error(VARLSTCNT(5) ERR_DBFILERR, 2, DB_LEN_STR(reg), errno);
		put_mmseg((caddr_t)(csa->db_addrs[0]), (size_t)stat_buf.st_size);
#else
		if (-1 == (sm_long_t)(csa->db_addrs[0] = (sm_uc_ptr_t)mmap((caddr_t)NULL, (size_t)stat_buf.st_size,
			mm_prot, GTM_MM_FLAGS, udi->fd, (off_t)0)))
			rts_error(VARLSTCNT(5) ERR_DBFILERR, 2, DB_LEN_STR(reg), errno);
#endif
		csa->db_addrs[1] = csa->db_addrs[0] + stat_buf.st_size - 1;
		csd = csa->hdr = (sgmnt_data_ptr_t)csa->db_addrs[0];
	}
	if (!csa->nl->glob_sec_init)
	{
		assert(new_dbinit_ipc);
		if (is_bg)
			*csd = *tsd;
		if (csd->machine_name[0])	/* a crash occurred */
		{
			if (0 != memcmp(csd->machine_name, machine_name, MAX_MCNAMELEN))	/* crashed on some other node */
				rts_error(VARLSTCNT(6) ERR_CLSTCONFLICT, 4, DB_LEN_STR(reg),
					LEN_AND_STR(csd->machine_name));
			else
				rts_error(VARLSTCNT(6) ERR_REQRUNDOWN, 4, DB_LEN_STR(reg),
					LEN_AND_STR(csd->machine_name));
		}
		if (is_bg)
		{
			bt_malloc(csa);
			csa->nl->cache_off = -CACHE_CONTROL_SIZE(tsd);
			db_csh_ini(csa);
		}
		db_csh_ref(csa);
		strcpy(csa->nl->machine_name, machine_name);	/* machine name */
		assert(MAX_REL_NAME > gtm_release_name_len);
		memcpy(csa->nl->now_running, gtm_release_name, gtm_release_name_len + 1);	/* GT.M release name */
		memcpy(csa->nl->label, GDS_LABEL, GDS_LABEL_SZ - 1);	/* GDS label */
		memcpy(csa->nl->fname, reg->dyn.addr->fname, reg->dyn.addr->fname_len);	/* database filename */
		csa->nl->creation_date_time = csd->creation.date_time;
		csa->nl->highest_lbm_blk_changed = -1;
		csa->nl->wcs_timers = -1;
		csa->nl->nbb = BACKUP_NOT_IN_PROGRESS;
		csa->nl->unique_id.uid = FILE_INFO(reg)->fileid;	/* save what file we initialized this storage for */
		/* save pointers in csa to access shared memory */
		csa->nl->critical = (sm_off_t)((sm_uc_ptr_t)csa->critical - (sm_uc_ptr_t)csa->nl);
		if (JNL_ALLOWED(csa))
			csa->nl->jnl_buff = (sm_off_t)((sm_uc_ptr_t)csa->jnl->jnl_buff - (sm_uc_ptr_t)csa->nl);
		csa->nl->backup_buffer = (sm_off_t)((sm_uc_ptr_t)csa->backup_buffer - (sm_uc_ptr_t)csa->nl);
		csa->nl->hdr = (sm_off_t)((sm_uc_ptr_t)csd - (sm_uc_ptr_t)csa->nl);
		csa->nl->lock_addrs = (sm_off_t)((sm_uc_ptr_t)csa->lock_addrs[0] - (sm_uc_ptr_t)csa->nl);
		if (!read_only || is_bg)
		{
			csd->trans_hist.early_tn = csd->trans_hist.curr_tn;
			csd->max_update_array_size = csd->max_non_bm_update_array_size
				= ROUND_UP2(MAX_NON_BITMAP_UPDATE_ARRAY_SIZE(csd), UPDATE_ARRAY_ALIGN_SIZE);
			csd->max_update_array_size += ROUND_UP2(MAX_BITMAP_UPDATE_ARRAY_SIZE, UPDATE_ARRAY_ALIGN_SIZE);
			/* add current db_csh counters into the cumulative counters and reset the current counters */
#define TAB_DB_CSH_ACCT_REC(COUNTER, DUMMY1, DUMMY2)				\
			csd->COUNTER.cumul_count += csd->COUNTER.curr_count;	\
			csd->COUNTER.curr_count = 0;
#include "tab_db_csh_acct_rec.h"
#undef TAB_DB_CSH_ACCT_REC
		}
		if (!read_only)
		{
			if (is_bg)
			{
				assert(memcmp(csd, GDS_LABEL, GDS_LABEL_SZ - 1) == 0);
				LSEEKWRITE(udi->fd, (off_t)0, (sm_uc_ptr_t)csd, sizeof(sgmnt_data), errno_save);
				if (0 != errno_save)
					rts_error(VARLSTCNT(9) ERR_DBFILERR, 2, DB_LEN_STR(reg),
						ERR_TEXT, 2, LEN_AND_LIT("Error with database write"), errno_save);
			}
		}
		reg->dyn.addr->ext_blk_count = csd->extension_size;
		mlk_shr_init(csa->lock_addrs[0], csd->lock_space_size, csa, (FALSE == read_only));
		DEBUG_ONLY(locknl = csa->nl;)	/* for DEBUG_ONLY LOCK_HIST macro */
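/* [Illustrative sketch, not part of the sources above] The access-control semop in db_init() packs
 * "wait for zero" and "increment" into one atomic call, with SEM_UNDO so the kernel backs the lock
 * out if the process dies. A minimal standalone version (hypothetical semid, no GT.M macros):
 */
#include <sys/ipc.h>
#include <sys/sem.h>

static int grab_db_access_sem(int semid, int read_write)
{
	struct sembuf	sop[3];
	int		sopcnt = 2;

	sop[0].sem_num = 0;	sop[0].sem_op = 0;	/* proceed only when no one holds the lock */
	sop[1].sem_num = 0;	sop[1].sem_op = 1;	/* then take it, atomically with the wait */
	if (read_write)
	{
		sop[2].sem_num = 1;	sop[2].sem_op = 1;	/* count ourselves as a r/w accessor */
		sopcnt = 3;
	}
	sop[0].sem_flg = sop[1].sem_flg = sop[2].sem_flg = SEM_UNDO | IPC_NOWAIT;
	return semop(semid, sop, sopcnt);	/* -1 with errno == EAGAIN means someone else holds it */
}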
gd_region *dbfilopn(gd_region *reg)
{
	unix_db_info	*udi;
	parse_blk	pblk;
	mstr		file;
	char		*fnptr, fbuff[MAX_FBUFF + 1];
	struct stat	buf;
	gd_region	*prev_reg;
	gd_segment	*seg;
	int		status;
	bool		raw;
	int		stat_res;

	seg = reg->dyn.addr;
	assert(seg->acc_meth == dba_bg || seg->acc_meth == dba_mm);
	if (NULL == seg->file_cntl)
	{
		seg->file_cntl = (file_control *)malloc(sizeof(*seg->file_cntl));
		memset(seg->file_cntl, 0, sizeof(*seg->file_cntl));
	}
	if (NULL == seg->file_cntl->file_info)
	{
		seg->file_cntl->file_info = (void *)malloc(sizeof(unix_db_info));
		memset(seg->file_cntl->file_info, 0, sizeof(unix_db_info));
	}
	file.addr = (char *)seg->fname;
	file.len = seg->fname_len;
	memset(&pblk, 0, sizeof(pblk));
	pblk.buffer = fbuff;
	pblk.buff_size = MAX_FBUFF;
	pblk.fop = (F_SYNTAXO | F_PARNODE);
	memcpy(fbuff, file.addr, file.len);
	*(fbuff + file.len) = '\0';
	if (is_raw_dev(fbuff))
	{
		raw = TRUE;
		pblk.def1_buf = DEF_NODBEXT;
		pblk.def1_size = sizeof(DEF_NODBEXT) - 1;
	} else
	{
		raw = FALSE;
		pblk.def1_buf = DEF_DBEXT;
		pblk.def1_size = sizeof(DEF_DBEXT) - 1;
	}
	status = parse_file(&file, &pblk);
	if (!(status & 1))
	{
		if (GTCM_GNP_SERVER_IMAGE != image_type)
		{
			free(seg->file_cntl->file_info);
			free(seg->file_cntl);
			seg->file_cntl = 0;
		}
		rts_error(VARLSTCNT(5) ERR_DBFILERR, 2, DB_LEN_STR(reg), status);
	}
	assert(pblk.b_esl < sizeof(seg->fname));
	memcpy(seg->fname, pblk.buffer, pblk.b_esl);
	pblk.buffer[pblk.b_esl] = 0;
	seg->fname[pblk.b_esl] = 0;
	seg->fname_len = pblk.b_esl;
	if (pblk.fnb & F_HAS_NODE)
	{	/* Remote node specification given */
		assert(pblk.b_node && pblk.l_node[pblk.b_node - 1] == ':');
		gvcmy_open(reg, &pblk);
		return (gd_region *)-1;
	}
	fnptr = (char *)seg->fname + pblk.b_node;
	udi = FILE_INFO(reg);
	udi->raw = raw;
	udi->fn = (char *)fnptr;
	OPENFILE(fnptr, O_RDWR, udi->fd);
	udi->ftok_semid = INVALID_SEMID;
	udi->semid = INVALID_SEMID;
	udi->shmid = INVALID_SHMID;
	udi->sem_ctime = 0;
	udi->shm_ctime = 0;
	reg->read_only = FALSE;			/* maintain csa->read_write simultaneously */
	udi->s_addrs.read_write = TRUE;		/* maintain reg->read_only simultaneously */
	if (udi->fd == -1)
	{
		OPENFILE(fnptr, O_RDONLY, udi->fd);
		if (udi->fd == -1)
		{
			errno_save = errno;
			if (GTCM_GNP_SERVER_IMAGE != image_type)
			{
				free(seg->file_cntl->file_info);
				free(seg->file_cntl);
				seg->file_cntl = 0;
			}
			rts_error(VARLSTCNT(5) ERR_DBFILERR, 2, DB_LEN_STR(reg), errno_save);
		}
		reg->read_only = TRUE;			/* maintain csa->read_write simultaneously */
		udi->s_addrs.read_write = FALSE;	/* maintain reg->read_only simultaneously */
	}
	STAT_FILE(fnptr, &buf, stat_res);
	set_gdid_from_stat(&udi->fileid, &buf);
	if (prev_reg = gv_match(reg))
	{
		close(udi->fd);
		free(seg->file_cntl->file_info);
		free(seg->file_cntl);
		seg->file_cntl = 0;
		return prev_reg;
	}
	return reg;
}
boolean_t mu_truncate(int4 truncate_percent) { sgmnt_addrs *csa; sgmnt_data_ptr_t csd; int num_local_maps; int lmap_num, lmap_blk_num; int bml_status, sigkill; int save_errno; int ftrunc_status; uint4 jnl_status; uint4 old_total, new_total; uint4 old_free, new_free; uint4 end_blocks; int4 blks_in_lmap, blk; gtm_uint64_t before_trunc_file_size; off_t trunc_file_size; off_t padding; uchar_ptr_t lmap_addr; boolean_t was_crit; uint4 found_busy_blk; srch_blk_status bmphist; srch_blk_status *blkhist; srch_hist alt_hist; trans_num curr_tn; blk_hdr_ptr_t lmap_blk_hdr; block_id *blkid_ptr; unix_db_info *udi; jnl_private_control *jpc; jnl_buffer_ptr_t jbp; char *err_msg; intrpt_state_t prev_intrpt_state; off_t offset; DCL_THREADGBL_ACCESS; SETUP_THREADGBL_ACCESS; csa = cs_addrs; csd = cs_data; if (dba_mm == csd->acc_meth) { gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(4) ERR_MUTRUNCNOTBG, 2, REG_LEN_STR(gv_cur_region)); return TRUE; } if ((GDSVCURR != csd->desired_db_format) || (csd->blks_to_upgrd != 0)) { gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(4) ERR_MUTRUNCNOV4, 2, REG_LEN_STR(gv_cur_region)); return TRUE; } if (csa->ti->free_blocks < (truncate_percent * csa->ti->total_blks / 100)) { gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(5) ERR_MUTRUNCNOSPACE, 3, REG_LEN_STR(gv_cur_region), truncate_percent); return TRUE; } /* already checked for parallel truncates on this region --- see mupip_reorg.c */ gv_target = NULL; assert(csa->nl->trunc_pid == process_id); assert(dba_mm != csd->acc_meth); old_total = csa->ti->total_blks; old_free = csa->ti->free_blocks; sigkill = 0; found_busy_blk = 0; memset(&alt_hist, 0, SIZEOF(alt_hist)); /* null-initialize history */ assert(csd->bplmap == BLKS_PER_LMAP); end_blocks = old_total % BLKS_PER_LMAP; /* blocks in the last lmap (first one we start scanning) */ if (0 == end_blocks) end_blocks = BLKS_PER_LMAP; num_local_maps = DIVIDE_ROUND_UP(old_total, BLKS_PER_LMAP); /* ======================================== PHASE 1 ======================================== */ for (lmap_num = num_local_maps - 1; (lmap_num > 0 && !found_busy_blk); lmap_num--) { if (mu_ctrly_occurred || mu_ctrlc_occurred) return TRUE; assert(csa->ti->total_blks >= old_total); /* otherwise, a concurrent truncate happened... */ if (csa->ti->total_blks != old_total) /* Extend (likely called by mupip extend) -- don't truncate */ { gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(5) ERR_MUTRUNCNOSPACE, 3, REG_LEN_STR(gv_cur_region), truncate_percent); return TRUE; } lmap_blk_num = lmap_num * BLKS_PER_LMAP; if (csa->nl->highest_lbm_with_busy_blk >= lmap_blk_num) { found_busy_blk = lmap_blk_num; break; } blks_in_lmap = (lmap_num == num_local_maps - 1) ? end_blocks : BLKS_PER_LMAP; /* Loop through non-bitmap blocks of this lmap, do recycled2free */ DBGEHND((stdout, "DBG:: lmap_num = [%lu], lmap_blk_num = [%lu], blks_in_lmap = [%lu]\n", lmap_num, lmap_blk_num, blks_in_lmap)); for (blk = 1; blk < blks_in_lmap && blk != -1 && !found_busy_blk;) { t_begin(ERR_MUTRUNCFAIL, UPDTRNS_DB_UPDATED_MASK); for (;;) /* retry loop for recycled to free transactions */ { curr_tn = csd->trans_hist.curr_tn; /* Read the nth local bitmap into memory */ bmphist.blk_num = lmap_blk_num; bmphist.buffaddr = t_qread(bmphist.blk_num, &bmphist.cycle, &bmphist.cr); lmap_blk_hdr = (blk_hdr_ptr_t)bmphist.buffaddr; if (!(bmphist.buffaddr) || (BM_SIZE(BLKS_PER_LMAP) != lmap_blk_hdr->bsiz)) { /* Could not read the block successfully. Retry. 
*/ t_retry((enum cdb_sc)rdfail_detail); continue; } lmap_addr = bmphist.buffaddr + SIZEOF(blk_hdr); /* starting from the hint (blk itself), find the first busy or recycled block */ blk = bml_find_busy_recycled(blk, lmap_addr, blks_in_lmap, &bml_status); assert(blk < BLKS_PER_LMAP); if (blk == -1 || blk >= blks_in_lmap) { /* done with this lmap, continue to next */ t_abort(gv_cur_region, csa); break; } else if (BLK_BUSY == bml_status || csa->nl->highest_lbm_with_busy_blk >= lmap_blk_num) { /* stop processing blocks... skip ahead to phase 2 */ found_busy_blk = lmap_blk_num; t_abort(gv_cur_region, csa); break; } else if (BLK_RECYCLED == bml_status) { /* Write PBLK records for recycled blocks only if before_image journaling is * enabled. t_end() takes care of checking if journaling is enabled and * writing PBLK record. We have to at least mark the recycled block as free. */ RESET_UPDATE_ARRAY; update_trans = UPDTRNS_DB_UPDATED_MASK; *((block_id *)update_array_ptr) = blk; update_array_ptr += SIZEOF(block_id); *(int *)update_array_ptr = 0; alt_hist.h[1].blk_num = 0; alt_hist.h[0].level = 0; alt_hist.h[0].cse = NULL; alt_hist.h[0].tn = curr_tn; alt_hist.h[0].blk_num = lmap_blk_num + blk; alt_hist.h[0].buffaddr = t_qread(alt_hist.h[0].blk_num, &alt_hist.h[0].cycle, &alt_hist.h[0].cr); if (!alt_hist.h[0].buffaddr) { t_retry((enum cdb_sc)rdfail_detail); continue; } if (!t_recycled2free(&alt_hist.h[0])) { t_retry(cdb_sc_lostbmlcr); continue; } t_write_map(&bmphist, (unsigned char *)update_array, curr_tn, 0); /* Set the opcode for INCTN record written by t_end() */ inctn_opcode = inctn_blkmarkfree; if ((trans_num)0 == t_end(&alt_hist, NULL, TN_NOT_SPECIFIED)) continue; /* block processed, scan from the next one */ blk++; break; } else { assert(t_tries < CDB_STAGNATE); t_retry(cdb_sc_badbitmap); continue; } } /* END recycled2free retry loop */ } /* END scanning blocks of this particular lmap */ /* Write PBLK for the bitmap block, in case it hasn't been written i.e. t_end() was never called above */ /* Do a transaction that just increments the bitmap block's tn so that t_end() can do its thing */ DBGEHND((stdout, "DBG:: bitmap block inctn -- lmap_blk_num = [%lu]\n", lmap_blk_num)); t_begin(ERR_MUTRUNCFAIL, UPDTRNS_DB_UPDATED_MASK); for (;;) { RESET_UPDATE_ARRAY; BLK_ADDR(blkid_ptr, SIZEOF(block_id), block_id); *blkid_ptr = 0; update_trans = UPDTRNS_DB_UPDATED_MASK; inctn_opcode = inctn_mu_reorg; /* inctn_mu_truncate */ curr_tn = csd->trans_hist.curr_tn; blkhist = &alt_hist.h[0]; blkhist->blk_num = lmap_blk_num; blkhist->tn = curr_tn; blkhist->cse = NULL; /* start afresh (do not use value from previous retry) */ /* Read the nth local bitmap into memory */ blkhist->buffaddr = t_qread(lmap_blk_num, (sm_int_ptr_t)&blkhist->cycle, &blkhist->cr); lmap_blk_hdr = (blk_hdr_ptr_t)blkhist->buffaddr; if (!(blkhist->buffaddr) || (BM_SIZE(BLKS_PER_LMAP) != lmap_blk_hdr->bsiz)) { /* Could not read the block successfully. Retry. 
*/ t_retry((enum cdb_sc)rdfail_detail); continue; } t_write_map(blkhist, (unsigned char *)blkid_ptr, curr_tn, 0); blkhist->blk_num = 0; /* create empty history for bitmap block */ if ((trans_num)0 == t_end(&alt_hist, NULL, TN_NOT_SPECIFIED)) continue; break; } } /* END scanning lmaps */ /* ======================================== PHASE 2 ======================================== */ assert(!csa->now_crit); for (;;) { /* wait for FREEZE, we don't want to truncate a frozen database */ grab_crit(gv_cur_region); if (FROZEN_CHILLED(cs_data)) DO_CHILLED_AUTORELEASE(csa, cs_data); if (!FROZEN(cs_data) && !IS_REPL_INST_FROZEN) break; rel_crit(gv_cur_region); while (FROZEN(cs_data) || IS_REPL_INST_FROZEN) { hiber_start(1000); if (FROZEN_CHILLED(cs_data) && CHILLED_AUTORELEASE(cs_data)) break; } } assert(csa->nl->trunc_pid == process_id); /* Flush pending updates to disk. If this is not done, old updates can be flushed AFTER ftruncate, extending the file. */ if (!wcs_flu(WCSFLU_FLUSH_HDR | WCSFLU_WRITE_EPOCH | WCSFLU_MSYNC_DB)) { assert(FALSE); gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(6) ERR_BUFFLUFAILED, 4, LEN_AND_LIT("MUPIP REORG TRUNCATE"), DB_LEN_STR(gv_cur_region)); rel_crit(gv_cur_region); return FALSE; } csa->nl->highest_lbm_with_busy_blk = MAX(found_busy_blk, csa->nl->highest_lbm_with_busy_blk); assert(IS_BITMAP_BLK(csa->nl->highest_lbm_with_busy_blk)); new_total = MIN(old_total, csa->nl->highest_lbm_with_busy_blk + BLKS_PER_LMAP); if (mu_ctrly_occurred || mu_ctrlc_occurred) { rel_crit(gv_cur_region); return TRUE; } else if (csa->ti->total_blks != old_total || new_total == old_total) { assert(csa->ti->total_blks >= old_total); /* Better have been an extend, not a truncate... */ gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(5) ERR_MUTRUNCNOSPACE, 3, REG_LEN_STR(gv_cur_region), truncate_percent); rel_crit(gv_cur_region); return TRUE; } else if (GDSVCURR != csd->desired_db_format || csd->blks_to_upgrd != 0 || !csd->fully_upgraded) { gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(4) ERR_MUTRUNCNOV4, 2, REG_LEN_STR(gv_cur_region)); rel_crit(gv_cur_region); return TRUE; } else if (SNAPSHOTS_IN_PROG(csa->nl)) { gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(4) ERR_MUTRUNCSSINPROG, 2, REG_LEN_STR(gv_cur_region)); rel_crit(gv_cur_region); return TRUE; } else if (BACKUP_NOT_IN_PROGRESS != cs_addrs->nl->nbb) { gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(4) ERR_MUTRUNCBACKINPROG, 2, REG_LEN_STR(gv_cur_region)); rel_crit(gv_cur_region); return TRUE; } DEFER_INTERRUPTS(INTRPT_IN_TRUNC, prev_intrpt_state); if (JNL_ENABLED(csa)) { /* Write JRT_TRUNC and INCTN records */ if (!jgbl.dont_reset_gbl_jrec_time) SET_GBL_JREC_TIME; /* needed before jnl_ensure_open as that can write jnl records */ jpc = csa->jnl; jbp = jpc->jnl_buff; /* Before writing to jnlfile, adjust jgbl.gbl_jrec_time if needed to maintain time order * of jnl records. This needs to be done BEFORE the jnl_ensure_open as that could write * journal records (if it decides to switch to a new journal file). 
*/ ADJUST_GBL_JREC_TIME(jgbl, jbp); jnl_status = jnl_ensure_open(gv_cur_region, csa); if (SS_NORMAL != jnl_status) send_msg_csa(CSA_ARG(csa) VARLSTCNT(6) jnl_status, 4, JNL_LEN_STR(csd), DB_LEN_STR(gv_cur_region)); else { if (0 == jpc->pini_addr) jnl_put_jrt_pini(csa); jnl_write_trunc_rec(csa, old_total, csa->ti->free_blocks, new_total); inctn_opcode = inctn_mu_reorg; jnl_write_inctn_rec(csa); jnl_status = jnl_flush(gv_cur_region); if (SS_NORMAL != jnl_status) { send_msg_csa(CSA_ARG(csa) VARLSTCNT(9) ERR_JNLFLUSH, 2, JNL_LEN_STR(csd), ERR_TEXT, 2, RTS_ERROR_TEXT("Error with journal flush during mu_truncate"), jnl_status); assert(NOJNL == jpc->channel); /* jnl file lost has been triggered */ } } } /* Good to go ahead and REALLY truncate (reduce total_blks, clear cache_array, FTRUNCATE) */ curr_tn = csa->ti->curr_tn; CHECK_TN(csa, csd, curr_tn); udi = FILE_INFO(gv_cur_region); /* Information used by recover_truncate to check if the file size and csa->ti->total_blks are INCONSISTENT */ trunc_file_size = BLK_ZERO_OFF(csd->start_vbn) + ((off_t)csd->blk_size * (new_total + 1)); csd->after_trunc_total_blks = new_total; csd->before_trunc_free_blocks = csa->ti->free_blocks; csd->before_trunc_total_blks = old_total; /* Flags interrupted truncate for recover_truncate */ /* file size and total blocks: INCONSISTENT */ csa->ti->total_blks = new_total; /* past the point of no return -- shared memory intact */ assert(csa->ti->free_blocks >= DELTA_FREE_BLOCKS(old_total, new_total)); csa->ti->free_blocks -= DELTA_FREE_BLOCKS(old_total, new_total); new_free = csa->ti->free_blocks; KILL_TRUNC_TEST(WBTEST_CRASH_TRUNCATE_1); /* 55 : Issue a kill -9 before 1st fsync */ fileheader_sync(gv_cur_region); DB_FSYNC(gv_cur_region, udi, csa, db_fsync_in_prog, save_errno); CHECK_DBSYNC(gv_cur_region, save_errno); /* past the point of no return -- shared memory deleted */ KILL_TRUNC_TEST(WBTEST_CRASH_TRUNCATE_2); /* 56 : Issue a kill -9 after 1st fsync */ clear_cache_array(csa, csd, gv_cur_region, new_total, old_total); offset = (off_t)BLK_ZERO_OFF(csd->start_vbn) + (off_t)new_total * csd->blk_size; save_errno = db_write_eof_block(udi, udi->fd, csd->blk_size, offset, &(TREF(dio_buff))); if (0 != save_errno) { err_msg = (char *)STRERROR(errno); rts_error_csa(CSA_ARG(csa) VARLSTCNT(6) ERR_MUTRUNCERROR, 4, REG_LEN_STR(gv_cur_region), LEN_AND_STR(err_msg)); return FALSE; } KILL_TRUNC_TEST(WBTEST_CRASH_TRUNCATE_3); /* 57 : Issue a kill -9 after reducing csa->ti->total_blks, before FTRUNCATE */ /* Execute an ftruncate() and truncate the DB file * ftruncate() is a SYSTEM CALL on almost all platforms (except SunOS) * It ignores kill -9 signal till its operation is completed. * So we can safely assume that the result of ftruncate() will be complete. */ FTRUNCATE(FILE_INFO(gv_cur_region)->fd, trunc_file_size, ftrunc_status); if (0 != ftrunc_status) { err_msg = (char *)STRERROR(errno); rts_error_csa(CSA_ARG(csa) VARLSTCNT(6) ERR_MUTRUNCERROR, 4, REG_LEN_STR(gv_cur_region), LEN_AND_STR(err_msg)); /* should go through recover_truncate now, which will again try to FTRUNCATE */ return FALSE; } /* file size and total blocks: CONSISTENT (shrunk) */ KILL_TRUNC_TEST(WBTEST_CRASH_TRUNCATE_4); /* 58 : Issue a kill -9 after FTRUNCATE, before 2nd fsync */ csa->nl->root_search_cycle++; /* Force concurrent processes to restart in t_end/tp_tend to make sure no one * tries to commit updates past the end of the file. Bitmap validations together * with highest_lbm_with_busy_blk should actually be sufficient, so this is * just to be safe. 
	 */
	csd->before_trunc_total_blks = 0; /* indicate CONSISTENT */
	/* Increment TN */
	assert(csa->ti->early_tn == csa->ti->curr_tn);
	csd->trans_hist.early_tn = csd->trans_hist.curr_tn + 1;
	INCREMENT_CURR_TN(csd);
	fileheader_sync(gv_cur_region);
	DB_FSYNC(gv_cur_region, udi, csa, db_fsync_in_prog, save_errno);
	KILL_TRUNC_TEST(WBTEST_CRASH_TRUNCATE_5); /* 59 : Issue a kill -9 after 2nd fsync */
	CHECK_DBSYNC(gv_cur_region, save_errno);
	ENABLE_INTERRUPTS(INTRPT_IN_TRUNC, prev_intrpt_state);
	curr_tn = csa->ti->curr_tn;
	rel_crit(gv_cur_region);
	send_msg_csa(CSA_ARG(csa) VARLSTCNT(7) ERR_MUTRUNCSUCCESS, 5, DB_LEN_STR(gv_cur_region), old_total, new_total, &curr_tn);
	util_out_print("Truncated region: !AD. Reduced total blocks from [!UL] to [!UL]. Reduced free blocks from [!UL] to [!UL].",
			FLUSH, REG_LEN_STR(gv_cur_region), old_total, new_total, old_free, new_free);
	return TRUE;
} /* END of mu_truncate() */
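/* Illustrative sketch (not part of the sources above): mu_truncate() brackets the
 * irreversible steps with header writes and fsyncs so that recover_truncate can always
 * tell which side of the ftruncate a crash happened on. A condensed standalone outline of
 * that write-intent pattern; struct and function names are hypothetical and error paths
 * are reduced to a bare -1:
 */
#include <unistd.h>

struct trunc_hdr { long before_trunc_total_blks; long total_blks; };	/* stand-ins for the fileheader fields */

static int shrink_db_file(int fd, struct trunc_hdr *h, long new_total, long blk_size, long data_off)
{
	h->before_trunc_total_blks = h->total_blks;	/* flag an interrupted truncate for recovery */
	h->total_blks = new_total;
	if ((ssize_t)sizeof(*h) != pwrite(fd, h, sizeof(*h), 0) || 0 != fsync(fd))
		return -1;				/* persist the intent BEFORE shrinking the file */
	if (0 != ftruncate(fd, (off_t)data_off + (off_t)blk_size * (new_total + 1)))
		return -1;				/* +1 keeps a trailing EOF block, as mu_truncate does */
	h->before_trunc_total_blks = 0;			/* file size and header are now consistent */
	if ((ssize_t)sizeof(*h) != pwrite(fd, h, sizeof(*h), 0) || 0 != fsync(fd))
		return -1;
	return 0;
}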
void dse_crit(void) { int util_len, dse_crit_count; char util_buff[MAX_UTIL_LEN]; boolean_t crash = FALSE, cycle = FALSE, owner = FALSE; gd_region *save_region, *r_local, *r_top; crash = ((cli_present("CRASH") == CLI_PRESENT) || (cli_present("RESET") == CLI_PRESENT)); cycle = (CLI_PRESENT == cli_present("CYCLE")); if (cli_present("SEIZE") == CLI_PRESENT || cycle) { if (gv_cur_region->read_only && !cycle) rts_error_csa(CSA_ARG(cs_addrs) VARLSTCNT(4) ERR_DBRDONLY, 2, DB_LEN_STR(gv_cur_region)); if (cs_addrs->now_crit) { util_out_print("!/Write critical section already seized.!/", TRUE); return; } crash_count = cs_addrs->critical->crashcnt; grab_crit_encr_cycle_sync(gv_cur_region); cs_addrs->hold_onto_crit = TRUE; /* need to do this AFTER grab_crit */ cs_addrs->dse_crit_seize_done = TRUE; util_out_print("!/Seized write critical section.!/", TRUE); if (!cycle) return; } if (cli_present("RELEASE") == CLI_PRESENT || cycle) { if (gv_cur_region->read_only && !cycle) rts_error_csa(CSA_ARG(cs_addrs) VARLSTCNT(4) ERR_DBRDONLY, 2, DB_LEN_STR(gv_cur_region)); if (!cs_addrs->now_crit) { util_out_print("!/Critical section already released.!/", TRUE); return; } crash_count = cs_addrs->critical->crashcnt; if (cs_addrs->now_crit) { /* user wants crit to be released unconditionally so "was_crit" not checked like everywhere else */ assert(cs_addrs->hold_onto_crit && cs_addrs->dse_crit_seize_done); cs_addrs->dse_crit_seize_done = FALSE; cs_addrs->hold_onto_crit = FALSE; /* need to do this before the rel_crit */ rel_crit(gv_cur_region); util_out_print("!/Released write critical section.!/", TRUE); } # ifdef DEBUG else assert(!cs_addrs->hold_onto_crit && !cs_addrs->dse_crit_seize_done); # endif return; } if (cli_present("INIT") == CLI_PRESENT) { if (gv_cur_region->read_only) rts_error_csa(CSA_ARG(cs_addrs) VARLSTCNT(4) ERR_DBRDONLY, 2, DB_LEN_STR(gv_cur_region)); cs_addrs->hdr->image_count = 0; UNIX_ONLY(gtm_mutex_init(gv_cur_region, NUM_CRIT_ENTRY(cs_addrs->hdr), crash)); VMS_ONLY(mutex_init(cs_addrs->critical, NUM_CRIT_ENTRY(cs_addrs->hdr), crash)); cs_addrs->nl->in_crit = 0; cs_addrs->now_crit = FALSE; util_out_print("!/Reinitialized critical section.!/", TRUE); return; } if (cli_present("REMOVE") == CLI_PRESENT) { if (gv_cur_region->read_only) rts_error_csa(CSA_ARG(cs_addrs) VARLSTCNT(4) ERR_DBRDONLY, 2, DB_LEN_STR(gv_cur_region)); if (cs_addrs->nl->in_crit == 0) { util_out_print("!/The write critical section is unowned!/", TRUE); return; } UNIX_ONLY(assert(LOCK_AVAILABLE != cs_addrs->critical->semaphore.u.parts.latch_pid);) VMS_ONLY(assert(cs_addrs->critical->semaphore >= 0);)
void dse_chng_bhead(void) { blk_hdr new_hdr; blk_segment *bs1, *bs_ptr; block_id blk; boolean_t chng_blk, ismap, was_hold_onto_crit; int4 blk_seg_cnt, blk_size; /* needed for BLK_INIT,BLK_SEG and BLK_FINI macros */ int4 x; sgmnt_addrs *csa; sgmnt_data_ptr_t csd; srch_blk_status blkhist; trans_num tn; uint4 mapsize; csa = cs_addrs; if (gv_cur_region->read_only) rts_error_csa(CSA_ARG(csa) VARLSTCNT(4) ERR_DBRDONLY, 2, DB_LEN_STR(gv_cur_region)); CHECK_AND_RESET_UPDATE_ARRAY; /* reset update_array_ptr to update_array */ chng_blk = FALSE; if (BADDSEBLK == (blk = dse_getblk("BLOCK", DSEBMLOK, DSEBLKCUR))) /* WARNING: assignment */ return; csd = csa->hdr; assert(csd == cs_data); blk_size = csd->blk_size; ismap = IS_BITMAP_BLK(blk); mapsize = BM_SIZE(csd->bplmap); t_begin_crit(ERR_DSEFAIL); blkhist.blk_num = blk; if (!(blkhist.buffaddr = t_qread(blkhist.blk_num, &blkhist.cycle, &blkhist.cr))) rts_error_csa(CSA_ARG(csa) VARLSTCNT(1) ERR_DSEBLKRDFAIL); new_hdr = *(blk_hdr_ptr_t)blkhist.buffaddr; if (CLI_PRESENT == cli_present("LEVEL")) { if (!cli_get_hex("LEVEL", (uint4 *)&x)) { t_abort(gv_cur_region, csa); return; } if (ismap && (unsigned char)x != LCL_MAP_LEVL) { util_out_print("Error: invalid level for a bit map block.", TRUE); t_abort(gv_cur_region, csa); return; } if (!ismap && (x < 0 || x > MAX_BT_DEPTH + 1)) { util_out_print("Error: invalid level.", TRUE); t_abort(gv_cur_region, csa); return; } new_hdr.levl = (unsigned char)x; chng_blk = TRUE; if (new_hdr.bsiz < SIZEOF(blk_hdr)) new_hdr.bsiz = SIZEOF(blk_hdr); if (new_hdr.bsiz > blk_size) new_hdr.bsiz = blk_size; } if (CLI_PRESENT == cli_present("BSIZ")) { if (!cli_get_hex("BSIZ", (uint4 *)&x)) { t_abort(gv_cur_region, csa); return; } if (ismap && x != mapsize) { util_out_print("Error: invalid bsiz.", TRUE); t_abort(gv_cur_region, csa); return; } else if (x < SIZEOF(blk_hdr) || x > blk_size) { util_out_print("Error: invalid bsiz.", TRUE); t_abort(gv_cur_region, csa); return; } chng_blk = TRUE; new_hdr.bsiz = x; } if (!chng_blk) t_abort(gv_cur_region, csa); else { BLK_INIT(bs_ptr, bs1); BLK_SEG(bs_ptr, blkhist.buffaddr + SIZEOF(new_hdr), new_hdr.bsiz - SIZEOF(new_hdr)); if (!BLK_FINI(bs_ptr, bs1)) { gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(5) ERR_AIMGBLKFAIL, 3, blk, DB_LEN_STR(gv_cur_region)); t_abort(gv_cur_region, csa); return; } t_write(&blkhist, (unsigned char *)bs1, 0, 0, new_hdr.levl, TRUE, FALSE, GDS_WRITE_KILLTN); BUILD_AIMG_IF_JNL_ENABLED(csd, csa->ti->curr_tn); t_end(&dummy_hist, NULL, TN_NOT_SPECIFIED); } if (CLI_PRESENT == cli_present("TN")) { if (!cli_get_hex64("TN", &tn)) return; t_begin_crit(ERR_DSEFAIL); CHECK_TN(csa, csd, csd->trans_hist.curr_tn); /* can issue rts_error TNTOOLARGE */ assert(csa->ti->early_tn == csa->ti->curr_tn); if (NULL == (blkhist.buffaddr = t_qread(blkhist.blk_num, &blkhist.cycle, &blkhist.cr))) { gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(1) ERR_DSEBLKRDFAIL); t_abort(gv_cur_region, csa); return; } if (new_hdr.bsiz < SIZEOF(blk_hdr)) new_hdr.bsiz = SIZEOF(blk_hdr); if (new_hdr.bsiz > blk_size) new_hdr.bsiz = blk_size; BLK_INIT(bs_ptr, bs1); BLK_SEG(bs_ptr, blkhist.buffaddr + SIZEOF(new_hdr), new_hdr.bsiz - SIZEOF(new_hdr)); BLK_FINI(bs_ptr, bs1); t_write(&blkhist, (unsigned char *)bs1, 0, 0, ((blk_hdr_ptr_t)blkhist.buffaddr)->levl, TRUE, FALSE, GDS_WRITE_KILLTN); /* Pass the desired tn as argument to bg_update/mm_update below */ BUILD_AIMG_IF_JNL_ENABLED_AND_T_END_WITH_EFFECTIVE_TN(csa, csd, tn, &dummy_hist); } return; }
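/* Illustrative sketch (not part of the sources above): dse_chng_bhead() repeatedly clamps
 * the block's recorded size (new_hdr.bsiz) into the only structurally valid range: at
 * least the block header itself, at most the region's block size. The clamp as a
 * standalone helper, with hypothetical names:
 */
static unsigned int clamp_bsiz(unsigned int bsiz, unsigned int hdr_size, unsigned int blk_size)
{
	if (bsiz < hdr_size)
		bsiz = hdr_size;	/* a block cannot be smaller than the header it contains */
	if (bsiz > blk_size)
		bsiz = blk_size;	/* nor larger than the physical block */
	return bsiz;
}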
int jnl_file_extend(jnl_private_control *jpc, uint4 total_jnl_rec_size) { file_control *fc; boolean_t need_extend; jnl_buffer_ptr_t jb; jnl_create_info jnl_info; jnl_file_header header; uint4 new_alq; sgmnt_addrs *csa; sgmnt_data_ptr_t csd; char prev_jnl_fn[JNL_NAME_SIZE]; uint4 jnl_status = 0, status; int new_blocks, result; GTM_BAVAIL_TYPE avail_blocks; uint4 aligned_tot_jrec_size, count; switch(jpc->region->dyn.addr->acc_meth) { case dba_mm: case dba_bg: csa = &FILE_INFO(jpc->region)->s_addrs; break; default: GTMASSERT; } csd = csa->hdr; assert(csa == cs_addrs && csd == cs_data); assert(csa->now_crit || (csd->clustered && (CCST_CLOSED == csa->nl->ccp_state))); assert(jpc->region == gv_cur_region); assert(csa->jnl_state == csd->jnl_state); if (!JNL_ENABLED(csa) || (NOJNL == jpc->channel) || (JNL_FILE_SWITCHED(jpc))) GTMASSERT; /* crit and messing with the journal file - how could it have vanished? */ if (!csd->jnl_deq) { assert(DIVIDE_ROUND_UP(total_jnl_rec_size, DISK_BLOCK_SIZE) <= csd->jnl_alq); assert(csd->jnl_alq == csd->autoswitchlimit); new_blocks = csd->jnl_alq; } else /* May cause extension of csd->jnl_deq * n blocks where n > 0 */ new_blocks = ROUND_UP(DIVIDE_ROUND_UP(total_jnl_rec_size, DISK_BLOCK_SIZE), csd->jnl_deq); jpc->status = SS_NORMAL; jb = jpc->jnl_buff; assert(0 <= new_blocks); DEBUG_ONLY(count = 0); for (need_extend = (0 != new_blocks); need_extend; ) { DEBUG_ONLY(count++); /* usually we will do the loop just once where we do the file extension. * rarely we might need to do an autoswitch instead after which again rarely * we might need to do an extension on the new journal to fit in the transaction's journal requirements. * therefore we should do this loop a maximum of twice. hence the assert below. */ assert(count <= 2); need_extend = FALSE; if (SS_NORMAL == (status = disk_block_available(jpc->channel, &avail_blocks, TRUE))) { if ((new_blocks * EXTEND_WARNING_FACTOR) > avail_blocks) { if (new_blocks > avail_blocks) { /* if we cannot satisfy the request, it is an error */ send_msg(VARLSTCNT(6) ERR_NOSPACEEXT, 4, JNL_LEN_STR(csd), new_blocks, avail_blocks); new_blocks = 0; jpc->status = SS_NORMAL; break; } else send_msg(VARLSTCNT(5) ERR_DSKSPACEFLOW, 3, JNL_LEN_STR(csd), (avail_blocks - new_blocks)); } } else send_msg(VARLSTCNT(5) ERR_JNLFILEXTERR, 2, JNL_LEN_STR(csd), status); new_alq = jb->filesize + new_blocks; /* ensure current journal file size is well within autoswitchlimit --> design constraint */ assert(csd->autoswitchlimit >= jb->filesize); if (csd->autoswitchlimit < (jb->filesize + (EXTEND_WARNING_FACTOR * new_blocks))) /* close to max */ send_msg(VARLSTCNT(5) ERR_JNLSPACELOW, 3, JNL_LEN_STR(csd), csd->autoswitchlimit - jb->filesize); if (csd->autoswitchlimit < new_alq) { /* Reached max, need to autoswitch */ /* Ensure new journal file can hold the entire current transaction's journal record requirements */ assert(csd->autoswitchlimit >= MAX_REQD_JNL_FILE_SIZE(total_jnl_rec_size)); memset(&jnl_info, 0, sizeof(jnl_info)); jnl_info.prev_jnl = &prev_jnl_fn[0]; set_jnl_info(gv_cur_region, &jnl_info); assert(JNL_ENABLED(csa) && (NOJNL != jpc->channel) && !(JNL_FILE_SWITCHED(jpc))); jnl_status = jnl_ensure_open(); if (0 == jnl_status) { /* flush the cache and jnl-buffer-contents to current journal file before * switching to a new journal. 
*/ wcs_flu(WCSFLU_FLUSH_HDR | WCSFLU_WRITE_EPOCH); jnl_file_close(gv_cur_region, TRUE, TRUE); } else rts_error(VARLSTCNT(7) jnl_status, 4, JNL_LEN_STR(csd), DB_LEN_STR(gv_cur_region), jpc->status); assert(!jgbl.forw_phase_recovery || (NULL != jgbl.mur_pini_addr_reset_fnptr)); if (jgbl.forw_phase_recovery && (NULL != jgbl.mur_pini_addr_reset_fnptr)) (*jgbl.mur_pini_addr_reset_fnptr)(); assert(!jnl_info.no_rename); assert(!jnl_info.no_prev_link); if (EXIT_NRM == cre_jnl_file(&jnl_info)) { assert(0 == memcmp(csd->jnl_file_name, jnl_info.jnl, jnl_info.jnl_len)); assert(csd->jnl_file_name[jnl_info.jnl_len] == '\0'); assert(csd->jnl_file_len == jnl_info.jnl_len); assert(csd->jnl_buffer_size == jnl_info.buffer); assert(csd->jnl_alq == jnl_info.alloc); assert(csd->jnl_deq == jnl_info.extend); assert(csd->jnl_before_image == jnl_info.before_images); csd->trans_hist.header_open_tn = jnl_info.tn; /* needed for successful jnl_file_open() */ send_msg(VARLSTCNT(4) ERR_NEWJNLFILECREATE, 2, JNL_LEN_STR(csd)); fc = gv_cur_region->dyn.addr->file_cntl; fc->op = FC_WRITE; fc->op_buff = (sm_uc_ptr_t)csd; status = dbfilop(fc); if (SS_NORMAL != status) send_msg(VARLSTCNT(5) ERR_DBFILERR, 2, DB_LEN_STR(gv_cur_region), status); assert(JNL_ENABLED(csa)); /* call jnl_ensure_open instead of jnl_file_open to make sure jpc->pini_addr is set to 0 */ jnl_status = jnl_ensure_open(); /* sets jpc->status */ if (0 != jnl_status) rts_error(VARLSTCNT(6) jnl_status, 4, JNL_LEN_STR(csd), DB_LEN_STR(gv_cur_region)); assert(jb->filesize == csd->jnl_alq); aligned_tot_jrec_size = ALIGNED_ROUND_UP(MAX_REQD_JNL_FILE_SIZE(total_jnl_rec_size), csd->jnl_alq, csd->jnl_deq); if (aligned_tot_jrec_size > csd->jnl_alq) { /* need to extend more than initial allocation in the new journal file * to accommodate the current transaction. */ new_blocks = aligned_tot_jrec_size - csd->jnl_alq; assert(new_blocks); assert(0 == new_blocks % csd->jnl_deq); need_extend = TRUE; } } else { send_msg(VARLSTCNT(4) ERR_JNLCREATERR, 2, JNL_LEN_STR(csd)); jpc->status = ERR_JNLNOCREATE; new_blocks = -1; } } else { assert(!need_extend); /* ensure we won't go through the for loop again */ /* Virtually extend currently used journal file */ jb->filesize = new_alq; /* Actually this is virtual file size blocks */ DO_FILE_READ(jpc->channel, 0, &header, JNL_HDR_LEN, jpc->status, jpc->status2); if (SS_NORMAL != jpc->status) rts_error(VARLSTCNT(5) ERR_JNLRDERR, 2, JNL_LEN_STR(csd), jpc->status); assert((header.virtual_size + new_blocks) == new_alq); header.virtual_size = new_alq; DO_FILE_WRITE(jpc->channel, 0, &header, JNL_HDR_LEN, jpc->status, jpc->status2); if (SS_NORMAL != jpc->status) rts_error(VARLSTCNT(5) ERR_JNLWRERR, 2, JNL_LEN_STR(csd), jpc->status); } if (0 >= new_blocks) break; } if (0 >= new_blocks) { jpc->status = ERR_JNLREADEOF; jnl_file_lost(jpc, ERR_JNLEXTEND); new_blocks = -1; } return new_blocks; }
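/* Illustrative sketch (not part of the sources above): the extension size jnl_file_extend()
 * computes rounds the transaction's journal-record requirement up to whole disk blocks
 * (DIVIDE_ROUND_UP) and then up to a whole number of extension quanta (ROUND_UP by
 * csd->jnl_deq). The two roundings in isolation; the 512-byte disk block is an assumption
 * of this sketch:
 */
#define SKETCH_DISK_BLOCK_SIZE 512	/* stand-in for DISK_BLOCK_SIZE */

static unsigned int jnl_ext_blocks(unsigned int total_jnl_rec_size, unsigned int jnl_deq, unsigned int jnl_alq)
{
	unsigned int	blocks;

	blocks = (total_jnl_rec_size + SKETCH_DISK_BLOCK_SIZE - 1) / SKETCH_DISK_BLOCK_SIZE;	/* DIVIDE_ROUND_UP */
	if (0 == jnl_deq)
		return jnl_alq;		/* no extension quantum: the code above falls back to the initial allocation */
	return ((blocks + jnl_deq - 1) / jnl_deq) * jnl_deq;	/* ROUND_UP to whole extension quanta */
}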
/* This function is called primarily to append a new histinfo record to the replication instance file by one of the following * 1) MUPIP REPLIC -SOURCE -START -ROOTPRIMARY command (after forking the child source server) if it created the journal pool. * 2) MUPIP REPLIC -SOURCE -ACTIVATE -ROOTPRIMARY command if this is a propagating primary to root primary transition. * In addition, this function also initializes the "lms_group_info" field in the instance file (from the "inst_info" field) * if the current value is NULL. */ void gtmsource_rootprimary_init(seq_num start_seqno) { unix_db_info *udi; repl_histinfo histinfo; boolean_t was_crit, switch_jnl; gd_region *reg, *region_top; jnl_private_control *jpc; jnl_buffer_ptr_t jbp; uint4 jnl_status; udi = FILE_INFO(jnlpool.jnlpool_dummy_reg); assert(NULL != jnlpool.repl_inst_filehdr); /* Update journal pool fields to reflect this is a root primary startup and updates are enabled */ assert(!udi->s_addrs.hold_onto_crit || jgbl.onlnrlbk); was_crit = udi->s_addrs.now_crit; if (!was_crit) grab_lock(jnlpool.jnlpool_dummy_reg, TRUE, ASSERT_NO_ONLINE_ROLLBACK); jnlpool.repl_inst_filehdr->root_primary_cycle++; /* If this instance is transitioning from a non-rootprimary to rootprimary, switch journal files. * This helps with maintaining accurate value of csd->zqgblmod_tn when the former primary connects * to the current primary through a fetchresync-rollback or receiver-server-autorollback.. */ switch_jnl = (!jnlpool.repl_inst_filehdr->was_rootprimary && (0 < jnlpool.repl_inst_filehdr->num_histinfo)); jnlpool.repl_inst_filehdr->was_rootprimary = TRUE; assert(start_seqno >= jnlpool.jnlpool_ctl->start_jnl_seqno); assert(start_seqno == jnlpool.jnlpool_ctl->jnl_seqno); jnlpool.repl_inst_filehdr->jnl_seqno = start_seqno; assert(jgbl.onlnrlbk || jnlpool.jnlpool_ctl->upd_disabled); if (!jgbl.onlnrlbk) jnlpool.jnlpool_ctl->upd_disabled = FALSE; if (IS_REPL_INST_UUID_NULL(jnlpool.repl_inst_filehdr->lms_group_info)) { /* This is the first time this instance is being brought up either as a root primary or as a propagating * primary. Initialize the "lms_group_info" fields in the instance file header in journal pool shared memory. * They will be flushed to the instance file as part of the "repl_inst_histinfo_add -> repl_inst_flush_filehdr" * function invocation below. 
*/ assert('\0' == jnlpool.repl_inst_filehdr->lms_group_info.created_nodename[0]); assert('\0' == jnlpool.repl_inst_filehdr->lms_group_info.this_instname[0]); assert(!jnlpool.repl_inst_filehdr->lms_group_info.creator_pid); jnlpool.repl_inst_filehdr->lms_group_info = jnlpool.repl_inst_filehdr->inst_info; assert('\0' != jnlpool.repl_inst_filehdr->lms_group_info.created_nodename[0]); DBG_CHECK_CREATED_NODENAME(jnlpool.repl_inst_filehdr->lms_group_info.created_nodename); assert('\0' != jnlpool.repl_inst_filehdr->lms_group_info.this_instname[0]); assert(jnlpool.repl_inst_filehdr->lms_group_info.created_time); assert(jnlpool.repl_inst_filehdr->lms_group_info.creator_pid); } /* Initialize histinfo fields */ memcpy(histinfo.root_primary_instname, jnlpool.repl_inst_filehdr->inst_info.this_instname, MAX_INSTNAME_LEN - 1); histinfo.root_primary_instname[MAX_INSTNAME_LEN - 1] = '\0'; assert('\0' != histinfo.root_primary_instname[0]); histinfo.start_seqno = start_seqno; assert(jnlpool.jnlpool_ctl->strm_seqno[0] == jnlpool.repl_inst_filehdr->strm_seqno[0]); assert(jnlpool.repl_inst_filehdr->is_supplementary || (0 == jnlpool.jnlpool_ctl->strm_seqno[0])); histinfo.strm_seqno = (!jnlpool.repl_inst_filehdr->is_supplementary) ? 0 : jnlpool.jnlpool_ctl->strm_seqno[0]; histinfo.root_primary_cycle = jnlpool.repl_inst_filehdr->root_primary_cycle; assert(process_id == getpid()); histinfo.creator_pid = process_id; JNL_SHORT_TIME(histinfo.created_time); histinfo.strm_index = 0; histinfo.history_type = HISTINFO_TYPE_NORMAL; NULL_INITIALIZE_REPL_INST_UUID(histinfo.lms_group); /* The following fields will be initialized in the "repl_inst_histinfo_add" function call below. * histinfo.histinfo_num * histinfo.prev_histinfo_num * histinfo.last_histinfo_num[] */ /* Add the histinfo record to the instance file and flush the changes in the journal pool to the file header */ repl_inst_histinfo_add(&histinfo); if (!was_crit) rel_lock(jnlpool.jnlpool_dummy_reg); if (switch_jnl) { SET_GBL_JREC_TIME; /* jnl_ensure_open/jnl_file_extend and its callees assume jgbl.gbl_jrec_time is set */ for (reg = gd_header->regions, region_top = gd_header->regions + gd_header->n_regions; reg < region_top; reg++) { gv_cur_region = reg; change_reg(); /* sets cs_addrs/cs_data (needed by jnl_ensure_open) */ if (!JNL_ENABLED(cs_addrs)) continue; grab_crit(gv_cur_region); jpc = cs_addrs->jnl; /* Before writing to jnlfile, adjust jgbl.gbl_jrec_time if needed to maintain time order of jnl * records. This needs to be done BEFORE the jnl_ensure_open as that could write journal records * (if it decides to switch to a new journal file) */ jbp = jpc->jnl_buff; ADJUST_GBL_JREC_TIME(jgbl, jbp); jnl_status = jnl_ensure_open(); if (0 == jnl_status) { if (EXIT_ERR == SWITCH_JNL_FILE(jpc)) rts_error_csa(CSA_ARG(cs_addrs) VARLSTCNT(4) ERR_JNLEXTEND, 2, JNL_LEN_STR(cs_data)); } else { if (SS_NORMAL != jpc->status) rts_error_csa(CSA_ARG(cs_addrs) VARLSTCNT(7) jnl_status, 4, JNL_LEN_STR(cs_data), DB_LEN_STR(gv_cur_region), jpc->status); else rts_error_csa(CSA_ARG(cs_addrs) VARLSTCNT(6) jnl_status, 4, JNL_LEN_STR(cs_data), DB_LEN_STR(gv_cur_region)); } rel_crit(gv_cur_region); } } }
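/* Illustrative sketch (not part of the sources above): the histinfo setup copies
 * root_primary_instname with a memcpy of MAX_INSTNAME_LEN - 1 bytes plus an explicit
 * terminator, so the field is NUL-terminated even if the source name fills its fixed-width
 * field. The same bounded-copy idiom in isolation; NAME_LEN is a hypothetical stand-in:
 */
#include <string.h>

#define NAME_LEN 16	/* stand-in for MAX_INSTNAME_LEN */

static void copy_inst_name(char dst[NAME_LEN], const char src[NAME_LEN])
{
	memcpy(dst, src, NAME_LEN - 1);	/* both fields are fixed-width, so the read is bounded */
	dst[NAME_LEN - 1] = '\0';	/* terminator guaranteed regardless of src contents */
}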
int mur_forward_multi_proc(reg_ctl_list *rctl) { boolean_t multi_proc, this_reg_stuck, release_latch, ok_to_play; boolean_t cancelled_dbsync_timer, cancelled_timer; reg_ctl_list *rctl_top, *prev_rctl; jnl_ctl_list *jctl; gd_region *reg; sgmnt_addrs *csa; seq_num rec_token_seq; jnl_tm_t rec_time; enum broken_type recstat; jnl_record *rec; enum jnl_record_type rectype; char errstr[256]; int i, rctl_index, save_errno, num_procs_stuck, num_reg_stuck; uint4 status, regcnt_stuck, num_partners, start_hrtbt_cntr; forw_multi_struct *forw_multi; shm_forw_multi_t *sfm; multi_struct *multi; jnl_tm_t adjusted_resolve_time; shm_reg_ctl_t *shm_rctl_start, *shm_rctl, *first_shm_rctl; size_t shm_size, reccnt, copy_size; int4 *size_ptr; char *shmPtr; /* not using "shm_ptr" since it is already used in an AIX include file */ int shmid; multi_proc_shm_hdr_t *mp_hdr; /* Pointer to "multi_proc_shm_hdr_t" structure in shared memory */ status = 0; /* Although we made sure the # of tasks is the same as the # of processes forked off (in the "gtm_multi_proc" * invocation in "mur_forward"), it is possible one of the forked process finishes one invocation of * "mur_forward_multi_proc" before even another forked process gets assigned one task in "gtm_multi_proc_helper". * In this case, we would be invoked more than once. But the first invocation would have done all the needed stuff * so return for later invocations. */ if (mur_forward_multi_proc_done) return 0; mur_forward_multi_proc_done = TRUE; /* Note: "rctl" is unused. But cannot avoid passing it since "gtm_multi_proc" expects something */ prev_rctl = NULL; rctl_start = NULL; adjusted_resolve_time = murgbl.adjusted_resolve_time; assert(0 == murgbl.regcnt_remaining); multi_proc = multi_proc_in_use; /* cache value in "local" to speed up access inside loops below */ if (multi_proc) { mp_hdr = multi_proc_shm_hdr; shm_rctl_start = mur_shm_hdr->shm_rctl_start; if (jgbl.onlnrlbk) { for (rctl = mur_ctl, rctl_top = mur_ctl + murgbl.reg_total; rctl < rctl_top; rctl++) { assert(rctl->csa->hold_onto_crit); /* would have been set in parent process */ rctl->csa->hold_onto_crit = FALSE; /* reset since we dont own this region */ assert(rctl->csa->now_crit); /* would have been set in parent process */ rctl->csa->now_crit = FALSE; /* reset since we dont own this region */ } } START_HEARTBEAT_IF_NEEDED; /* heartbeat timer needed later (in case not already started by "gtm_multi_proc") */ } first_shm_rctl = NULL; /* Phase1 of forward recovery starts */ for (rctl = mur_ctl, rctl_top = mur_ctl + murgbl.reg_total; rctl < rctl_top; rctl++) { /* Check if "rctl" is available for us or if some other concurrent process has taken it */ if (multi_proc) { rctl_index = rctl - &mur_ctl[0]; shm_rctl = &shm_rctl_start[rctl_index]; if (shm_rctl->owning_pid) { assert(process_id != shm_rctl->owning_pid); continue; } GRAB_MULTI_PROC_LATCH_IF_NEEDED(release_latch); assert(release_latch); for ( ; rctl < rctl_top; rctl++, shm_rctl++) { if (shm_rctl->owning_pid) { assert(process_id != shm_rctl->owning_pid); continue; } shm_rctl->owning_pid = process_id; /* Declare ownership */ rctl->this_pid_is_owner = TRUE; if (jgbl.onlnrlbk) { /* This is an online rollback and crit was grabbed on all regions by the parent rollback * process. But this child process now owns this region and does the actual rollback on * this region so borrow crit for the duration of this child process. 
*/ csa = rctl->csa; csa->hold_onto_crit = TRUE; csa->now_crit = TRUE; assert(csa->nl->in_crit == mp_hdr->parent_pid); csa->nl->in_crit = process_id; assert(csa->nl->onln_rlbk_pid == mp_hdr->parent_pid); csa->nl->onln_rlbk_pid = process_id; } if (NULL == first_shm_rctl) first_shm_rctl = shm_rctl; break; } REL_MULTI_PROC_LATCH_IF_NEEDED(release_latch); if (rctl >= rctl_top) { assert(rctl == rctl_top); break; } /* Set key to print this rctl'ss region-name as prefix in case this forked off process prints any output */ MUR_SET_MULTI_PROC_KEY(rctl, multi_proc_key); # ifdef MUR_DEBUG fprintf(stderr, "pid = %d : Owns region %s\n", process_id, multi_proc_key); # endif } else rctl->this_pid_is_owner = TRUE; if (mur_options.forward) { assert(NULL == rctl->jctl_turn_around); jctl = rctl->jctl = rctl->jctl_head; assert(jctl->reg_ctl == rctl); jctl->rec_offset = JNL_HDR_LEN; jnl_fence_ctl.fence_list = JNL_FENCE_LIST_END; /* initialized to reflect journaling is not enabled */ if (mur_options.rollback) jgbl.mur_jrec_seqno = jctl->jfh->start_seqno; } else { jctl = rctl->jctl = (NULL == rctl->jctl_turn_around) ? rctl->jctl_head : rctl->jctl_turn_around; assert(jctl->reg_ctl == rctl); jctl->rec_offset = jctl->turn_around_offset; jgbl.mur_jrec_seqno = jctl->turn_around_seqno; assert((NULL != rctl->jctl_turn_around) || (0 == jctl->rec_offset)); } if (mur_options.rollback) { if (murgbl.consist_jnl_seqno < jgbl.mur_jrec_seqno) { /* Assert that murgbl.losttn_seqno is never lesser than jgbl.mur_jrec_seqno (the turnaround * point seqno) as this is what murgbl.consist_jnl_seqno is going to be set to and will * eventually be the post-rollback seqno. If this condition is violated, the result of the * recovery is a compromised database (the file header will indicate a Region Seqno which * is not necessarily correct since seqnos prior to it might be absent in the database). * Therefore, this is an out-of-design situation with respect to rollback and so stop it. */ assert(murgbl.losttn_seqno >= jgbl.mur_jrec_seqno); murgbl.consist_jnl_seqno = jgbl.mur_jrec_seqno; } assert(murgbl.consist_jnl_seqno <= murgbl.losttn_seqno); } if (mur_options.update || mur_options.extr[GOOD_TN]) { reg = rctl->gd; gv_cur_region = reg; tp_change_reg(); /* note : sets cs_addrs to non-NULL value even if gv_cur_region->open is FALSE * (cs_data could still be NULL). */ rctl->csa = cs_addrs; cs_addrs->miscptr = (void *)rctl; rctl->csd = cs_data; rctl->sgm_info_ptr = cs_addrs->sgm_info_ptr; assert(!reg->open || (NULL != cs_addrs->dir_tree)); gv_target = cs_addrs->dir_tree; } jctl->after_end_of_data = FALSE; status = mur_next(jctl, jctl->rec_offset); assert(ERR_JNLREADEOF != status); /* cannot get EOF at start of forward processing */ if (SS_NORMAL != status) goto finish; PRINT_VERBOSE_STAT(jctl, "mur_forward:at the start"); rctl->process_losttn = FALSE; /* Any multi-region TP transaction will be processed as multiple single-region TP transactions up * until the tp-resolve-time is reached. From then on, they will be treated as one multi-region TP * transaction. This is needed for proper lost-tn determination (any multi-region transaction that * gets played in a region AFTER it has already encountered a broken tn should treat this as a lost tn). */ do { if (multi_proc && IS_FORCED_MULTI_PROC_EXIT(mp_hdr)) { /* We are at a logical point. 
So exit if signaled by parent */ status = ERR_FORCEDHALT; goto finish; } assert(jctl == rctl->jctl); rec = rctl->mur_desc->jnlrec; rec_time = rec->prefix.time; if (rec_time > mur_options.before_time) break; /* Records after -BEFORE_TIME do not go to extract or losttrans or brkntrans files */ if (rec_time < mur_options.after_time) { status = mur_next_rec(&jctl); continue; /* Records before -AFTER_TIME do not go to extract or losttrans or brkntrans files */ } if (rec_time >= adjusted_resolve_time) break; /* Records after this adjusted resolve_time will be processed below in phase2 */ /* Note: Since we do hashtable token processing only for records from tp_resolve_time onwards, * it is possible that if we encounter any broken transactions here we wont know they are broken * but will play them as is. That is unavoidable. Specify -SINCE_TIME (for -BACKWARD rollback/recover) * and -VERIFY (for -FORWARD rollback/recover) to control tp_resolve_time (and in turn more * effective broken tn determination). */ status = mur_forward_play_cur_jrec(rctl); if (SS_NORMAL != status) break; status = mur_next_rec(&jctl); } while (SS_NORMAL == status); CHECK_IF_EOF_REACHED(rctl, status); /* sets rctl->forw_eof_seen if needed; resets "status" to SS_NORMAL */ if (SS_NORMAL != status) { /* ERR_FILENOTCREATE is possible from "mur_cre_file_extfmt" OR ERR_FORCEDHALT is possible * from "mur_forward_play_cur_jrec". No other errors are known to occur here. Assert accordingly. */ assert((ERR_FILENOTCREATE == status) || (ERR_FORCEDHALT == status)); goto finish; } if (rctl->forw_eof_seen) { PRINT_VERBOSE_STAT(jctl, "mur_forward:Reached EOF before tp_resolve_time"); continue; /* Reached EOF before even getting to tp_resolve_time. * Do not even consider region for next processing loop */ } rctl->last_tn = 0; murgbl.regcnt_remaining++; /* # of regions participating in recovery at this point */ if (NULL == rctl_start) rctl_start = rctl; if (NULL != prev_rctl) { prev_rctl->next_rctl = rctl; rctl->prev_rctl = prev_rctl; } prev_rctl = rctl; assert(murgbl.ok_to_update_db || !rctl->db_updated); PRINT_VERBOSE_STAT(jctl, "mur_forward:at tp_resolve_time"); } if (multi_proc) multi_proc_key = NULL; /* reset key until it can be set to rctl's region-name again */ /* Note that it is possible for rctl_start to be NULL at this point. That is there is no journal record in any region * AFTER the calculated tp-resolve-time. This is possible if for example -AFTER_TIME was used and has a time later * than any journal record in all journal files. If rctl_start is NULL, prev_rctl should also be NULL and vice versa. */ if (NULL != rctl_start) { assert(NULL != prev_rctl); prev_rctl->next_rctl = rctl_start; rctl_start->prev_rctl = prev_rctl; } rctl = rctl_start; regcnt_stuck = 0; /* # of regions we are stuck in waiting for other regions to resolve a multi-region TP transaction */ assert((NULL == rctl) || (NULL == rctl->forw_multi)); gv_cur_region = NULL; /* clear out any previous value to ensure gv_cur_region/cs_addrs/cs_data * all get set in sync by the MUR_CHANGE_REG macro below. 
*/ /* Phase2 of forward recovery starts */ while (NULL != rctl) { /* while there is at least one region remaining with unprocessed journal records */ assert(NULL != rctl_start); assert(0 < murgbl.regcnt_remaining); if (NULL != rctl->forw_multi) { /* This region's current journal record is part of a TP transaction waiting for other regions */ regcnt_stuck++; assert(regcnt_stuck <= murgbl.regcnt_remaining); if (regcnt_stuck == murgbl.regcnt_remaining) { assertpro(multi_proc_in_use); /* Else : Out-of-design situation. Stuck in ALL regions. */ /* Check one last time if all regions are stuck waiting for another process to resolve the * multi-region TP transaction. If so, wait in a sleep loop. If not, we can proceed. */ rctl = rctl_start; start_hrtbt_cntr = heartbeat_counter; do { if (IS_FORCED_MULTI_PROC_EXIT(mp_hdr)) { /* We are at a logical point. So exit if signaled by parent */ status = ERR_FORCEDHALT; goto finish; } forw_multi = rctl->forw_multi; assert(NULL != forw_multi); sfm = forw_multi->shm_forw_multi; assert(NULL != sfm); assert(sfm->num_reg_seen_forward <= sfm->num_reg_seen_backward); # ifdef MUR_DEBUG fprintf(stderr, "Pid = %d : Line %d : token = %llu : forward = %d : backward = %d\n", process_id, __LINE__, (long long int)sfm->token, sfm->num_reg_seen_forward, sfm->num_reg_seen_backward); # endif if (sfm->num_reg_seen_forward == sfm->num_reg_seen_backward) { /* We are no longer stuck in this region */ assert(!forw_multi->no_longer_stuck); forw_multi->no_longer_stuck = TRUE; break; } rctl = rctl->next_rctl; /* Move on to the next available region */ assert(NULL != rctl); if (rctl == rctl_start) { /* We went through all regions once and are still stuck. * Sleep until at leat TWO heartbeats have elapsed after which check for deadlock. * Do this only in the child process that owns the FIRST region in the region list. * This way we dont have contention for the GRAB_MULTI_PROC_LATCH from * all children at more or less the same time. */ if ((rctl == mur_ctl) && (heartbeat_counter > (start_hrtbt_cntr + 2))) { /* Check if all processes are stuck for a while. If so assertpro */ GRAB_MULTI_PROC_LATCH_IF_NEEDED(release_latch); assert(release_latch); shm_rctl_start = mur_shm_hdr->shm_rctl_start; num_reg_stuck = 0; for (i = 0; i < murgbl.reg_total; i++) { shm_rctl = &shm_rctl_start[i]; sfm = shm_rctl->shm_forw_multi; if (NULL != sfm) { if (sfm->num_reg_seen_forward != sfm->num_reg_seen_backward) num_reg_stuck++; } } REL_MULTI_PROC_LATCH_IF_NEEDED(release_latch); /* If everyone is stuck at this point, it is an out-of-design situation */ assertpro(num_reg_stuck < murgbl.reg_total); start_hrtbt_cntr = heartbeat_counter; } else { /* Sleep and recheck if any region we are stuck in got resolved. * To minimize time spent sleeping, we just yield our timeslice. */ rel_quant(); continue; } } } while (TRUE); } else { rctl = rctl->next_rctl; /* Move on to the next available region */ assert(NULL != rctl); continue; } } regcnt_stuck = 0; /* restart the counter now that we found at least one non-stuck region */ MUR_CHANGE_REG(rctl); jctl = rctl->jctl; this_reg_stuck = FALSE; for ( status = SS_NORMAL; SS_NORMAL == status; ) { if (multi_proc && IS_FORCED_MULTI_PROC_EXIT(mp_hdr)) { /* We are at a logical point. 
So exit if signaled by parent */ status = ERR_FORCEDHALT; goto finish; } assert(jctl == rctl->jctl); rec = rctl->mur_desc->jnlrec; rec_time = rec->prefix.time; if (rec_time > mur_options.before_time) break; /* Records after -BEFORE_TIME do not go to extract or losttrans or brkntrans files */ assert((rec_time >= adjusted_resolve_time) || (mur_options.notncheck && !mur_options.verify)); assert((0 == mur_options.after_time) || (mur_options.forward && !rctl->db_updated)); if (rec_time < mur_options.after_time) { status = mur_next_rec(&jctl); continue; /* Records before -AFTER_TIME do not go to extract or losttrans or brkntrans files */ } /* Check if current journal record can be played right away or need to wait for corresponding journal * records from other participating TP regions to be reached. A non-TP or ZTP transaction can be played * without issues (i.e. has no dependencies with any other regions). A single-region TP transaction too * falls in the same category. A multi-region TP transaction needs to wait until all participating regions * have played all journal records BEFORE this TP in order to ensure recover plays records in the exact * same order that GT.M performed them in. */ /* If FENCE_NONE is specified, we would not have maintained any multi hashtable in mur_back_process for * broken transaction processing. So we process multi-region TP transactions as multiple single-region * TP transactions in forward phase. */ if (FENCE_NONE != mur_options.fences) { rectype = (enum jnl_record_type)rec->prefix.jrec_type; if (IS_TP(rectype) && IS_TUPD(rectype)) { assert(IS_SET_KILL_ZKILL_ZTWORM_LGTRIG_ZTRIG(rectype)); assert(&rec->jrec_set_kill.num_participants == &rec->jrec_ztworm.num_participants); assert(&rec->jrec_set_kill.num_participants == &rec->jrec_lgtrig.num_participants); num_partners = rec->jrec_set_kill.num_participants; assert(0 < num_partners); if (1 < num_partners) { this_reg_stuck = TRUE; assert(&rec->jrec_set_kill.update_num == &rec->jrec_ztworm.update_num); assert(&rec->jrec_set_kill.update_num == &rec->jrec_lgtrig.update_num); } } } if (this_reg_stuck) { rec_token_seq = GET_JNL_SEQNO(rec); MUR_FORW_TOKEN_LOOKUP(forw_multi, rec_token_seq, rec_time); if (NULL != forw_multi) { /* This token has already been seen in another region in forward processing. * Add current region as well. If all regions have been resolved, then play * the entire transaction maintaining the exact same order of updates within. */ if (!forw_multi->no_longer_stuck) MUR_FORW_TOKEN_ONE_MORE_REG(forw_multi, rctl); } else { /* First time we are seeing this token in forward processing. Check if this * has already been determined to be a broken transaction. */ recstat = GOOD_TN; multi = NULL; if (IS_REC_POSSIBLY_BROKEN(rec_time, rec_token_seq)) { multi = MUR_TOKEN_LOOKUP(rec_token_seq, rec_time, TPFENCE); if ((NULL != multi) && (0 < multi->partner)) recstat = BROKEN_TN; } MUR_FORW_TOKEN_ADD(forw_multi, rec_token_seq, rec_time, rctl, num_partners, recstat, multi); } /* Check that "tabent" field has been initialized above (by either the MUR_FORW_TOKEN_LOOKUP * or MUR_FORW_TOKEN_ADD macros). This is relied upon by "mur_forward_play_multireg_tp" below. 
*/ assert(NULL != forw_multi->u.tabent); assert(forw_multi->num_reg_seen_forward <= forw_multi->num_reg_seen_backward); if (multi_proc) { sfm = forw_multi->shm_forw_multi; ok_to_play = (NULL == sfm) || (sfm->num_reg_seen_forward == sfm->num_reg_seen_backward); } else ok_to_play = (forw_multi->num_reg_seen_forward == forw_multi->num_reg_seen_backward); assert(ok_to_play || !forw_multi->no_longer_stuck); if (ok_to_play ) { /* We have enough information to proceed with playing this multi-region TP in * forward processing (even if we might not have seen all needed regions). Now play it. * Note that the TP could be BROKEN_TN or GOOD_TN. The callee handles it. */ assert(forw_multi == rctl->forw_multi); status = mur_forward_play_multireg_tp(forw_multi, rctl); this_reg_stuck = FALSE; /* Note that as part of playing the TP transaction, we could have reached * the EOF of rctl. In this case, we need to break out of the loop. */ if ((SS_NORMAL != status) || rctl->forw_eof_seen) break; assert(NULL == rctl->forw_multi); assert(!dollar_tlevel); jctl = rctl->jctl; /* In case the first record after the most recently processed * TP transaction is in the next generation journal file */ continue; } break; } else { status = mur_forward_play_cur_jrec(rctl); if (SS_NORMAL != status) break; } assert(!this_reg_stuck); status = mur_next_rec(&jctl); } assert((NULL == rctl->forw_multi) || this_reg_stuck); assert((NULL != rctl->forw_multi) || !this_reg_stuck); if (!this_reg_stuck) { /* We are not stuck in this region (to resolve a multi-region TP). * This means we are done processing all the records of this region. */ assert(NULL == rctl->forw_multi); if (!rctl->forw_eof_seen) { CHECK_IF_EOF_REACHED(rctl, status); /* sets rctl->forw_eof_seen if needed; resets "status" to SS_NORMAL */ if (SS_NORMAL != status) { assert(ERR_FILENOTCREATE == status); goto finish; } assert(!dollar_tlevel); DELETE_RCTL_FROM_UNPROCESSED_LIST(rctl); /* since all of its records should have been processed */ } else { /* EOF was seen in rctl inside "mur_forward_play_multireg_tp" and it was removed * from the unprocessed list of rctls. At the time rctl was removed, its "next_rctl" * field could have been pointing to another <rctl> that has since then also been * removed inside the same function. Therefore the "next_rctl" field is not reliable * in this case but instead we should rely on the global variable "rctl_start" which * points to the list of unprocessed rctls. Set "next_rctl" accordingly. */ rctl->next_rctl = rctl_start; if (ERR_JNLREADEOF == status) status = SS_NORMAL; } assert(rctl->deleted_from_unprocessed_list); } assert(SS_NORMAL == status); assert(!this_reg_stuck || !rctl->forw_eof_seen); assert((NULL == rctl->next_rctl) || (NULL != rctl_start)); assert((NULL == rctl->next_rctl) || (0 < murgbl.regcnt_remaining)); rctl = rctl->next_rctl; /* Note : even though "rctl" could have been deleted from the doubly linked list above, * rctl->next_rctl is not touched so we can still use it to get to the next element. 
*/ } assert(0 == murgbl.regcnt_remaining); jgbl.mur_pini_addr_reset_fnptr = NULL; /* No more simulation of GT.M activity for any region */ prc_vec = murgbl.prc_vec; /* Use process-vector of MUPIP RECOVER (not any simulating GT.M process) now onwards */ assert(0 == dollar_tlevel); for (rctl = mur_ctl, rctl_top = mur_ctl + murgbl.reg_total; rctl < rctl_top; rctl++) { if (!rctl->this_pid_is_owner) { assert(multi_proc_in_use); continue; /* in a parallel processing environment, process only regions we own */ } if (multi_proc) { /* Set key to print this rctl's region-name as prefix in case this forked off process prints any output */ MUR_SET_MULTI_PROC_KEY(rctl, multi_proc_key); } PRINT_VERBOSE_STAT(rctl->jctl, "mur_forward:at the end"); assert(!mur_options.rollback || (0 != murgbl.consist_jnl_seqno)); assert(mur_options.rollback || (0 == murgbl.consist_jnl_seqno)); assert(!dollar_tlevel); /* In case it applied a broken TUPD */ assert(murgbl.ok_to_update_db || !rctl->db_updated); rctl->mur_plst = NULL; /* reset now that simulation of GT.M updates is done */ /* Ensure mur_block_count_correct is called if updates allowed */ if (murgbl.ok_to_update_db && (SS_NORMAL != mur_block_count_correct(rctl))) { gtm_putmsg_csa(CSA_ARG(rctl->csa) VARLSTCNT(4) ERR_BLKCNTEDITFAIL, 2, DB_LEN_STR(rctl->gd)); murgbl.wrn_count++; } } finish: if (multi_proc) multi_proc_key = NULL; /* reset key until it can be set to rctl's region-name again */ if ((SS_NORMAL == status) && mur_options.show) mur_output_show(); if (NULL != first_shm_rctl) { /* Transfer needed process-private information to shared memory so parent process can later inherit this. */ first_shm_rctl->err_cnt = murgbl.err_cnt; first_shm_rctl->wrn_count = murgbl.wrn_count; first_shm_rctl->consist_jnl_seqno = murgbl.consist_jnl_seqno; /* If extract files were created by this process for one or more regions, then copy that information to * shared memory so parent process can use this information to do a merge sort. */ shm_rctl = mur_shm_hdr->shm_rctl_start; for (rctl = mur_ctl, rctl_top = mur_ctl + murgbl.reg_total; rctl < rctl_top; rctl++, shm_rctl++) { assert(multi_proc_in_use); if (!rctl->this_pid_is_owner) continue; /* in a parallel processing environment, process only regions we own */ /* Cancel any flush/dbsync timers by this child process for this region. This is because the * child is not going to go through exit handling code (no gds_rundown etc.). And we need to * clear up csa->nl->wcs_timers. (normally done by gds_rundown). */ if (NULL != rctl->csa) /* rctl->csa can be NULL in case of "mupip journal -extract" etc. */ CANCEL_DB_TIMERS(rctl->gd, rctl->csa, cancelled_timer, cancelled_dbsync_timer); reccnt = 0; for (size_ptr = &rctl->jnlext_multi_list_size[0], recstat = 0; recstat < TOT_EXTR_TYPES; recstat++, size_ptr++) { /* Assert "extr_file_created" information is in sync between rctl and shm_rctl. * This was done at the end of "mur_cre_file_extfmt". */ assert(shm_rctl->extr_file_created[recstat] == rctl->extr_file_created[recstat]); /* Assert that if *size_ptr is non-zero, then we better have created an extract file. * Note that the converse is not true. It is possible we created a file for example to * write an INCTN record but decided to not write anything because it was not a -detail * type of extract. So *sizeptr could be 0 even though we created the extract file. 
*/ assert(!*size_ptr || rctl->extr_file_created[recstat]); shm_rctl->jnlext_list_size[recstat] = *size_ptr; reccnt += *size_ptr; } assert(INVALID_SHMID == shm_rctl->jnlext_shmid); shm_size = reccnt * SIZEOF(jnlext_multi_t); /* If we are quitting because of an abnormal status OR a forced signal to terminate * OR if the parent is dead (kill -9) dont bother creating shmid to communicate back with parent. */ if (mp_hdr->parent_pid != getppid()) { SET_FORCED_MULTI_PROC_EXIT; /* Also signal sibling children to stop processing */ if (SS_NORMAL != status) status = ERR_FORCEDHALT; } if ((SS_NORMAL == status) && shm_size) { shmid = shmget(IPC_PRIVATE, shm_size, 0600 | IPC_CREAT); if (-1 == shmid) { save_errno = errno; SNPRINTF(errstr, SIZEOF(errstr), "shmget() : shmsize=0x%llx", shm_size); MUR_SET_MULTI_PROC_KEY(rctl, multi_proc_key); /* to print region name prefix */ rts_error_csa(CSA_ARG(NULL) VARLSTCNT(8) ERR_SYSCALL, 5, LEN_AND_STR(errstr), CALLFROM, save_errno); } shmPtr = (char *)do_shmat(shmid, 0, 0); if (-1 == (sm_long_t)shmPtr) { save_errno = errno; SNPRINTF(errstr, SIZEOF(errstr), "shmat() : shmid=%d shmsize=0x%llx", shmid, shm_size); MUR_SET_MULTI_PROC_KEY(rctl, multi_proc_key); /* to print region name prefix */ rts_error_csa(CSA_ARG(NULL) VARLSTCNT(8) ERR_SYSCALL, 5, LEN_AND_STR(errstr), CALLFROM, save_errno); } shm_rctl->jnlext_shmid = shmid; shm_rctl->jnlext_shm_size = shm_size; for (size_ptr = &rctl->jnlext_multi_list_size[0], recstat = 0; recstat < TOT_EXTR_TYPES; recstat++, size_ptr++) { shm_size = *size_ptr; if (shm_size) { copy_size = copy_list_to_buf(rctl->jnlext_multi_list[recstat], (int4)shm_size, shmPtr); assert(copy_size == (shm_size * SIZEOF(jnlext_multi_t))); shmPtr += copy_size; } } } } } mur_close_file_extfmt(IN_MUR_CLOSE_FILES_FALSE); /* Need to flush buffered extract/losttrans/brokentrans files */ return (int)status; }
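/* Illustrative sketch (not part of the sources above): mur_forward_multi_proc() hands its
 * variable-sized extract lists back to the parent by creating a private SysV shared memory
 * segment, copying the list into it, and publishing the shmid. The bare
 * create-attach-copy sequence with hypothetical names; the parent is expected to shmat()
 * the same id and remove the segment with shmctl(IPC_RMID) when done:
 */
#include <string.h>
#include <sys/ipc.h>
#include <sys/shm.h>

/* Returns a shmid holding a copy of buf[0..len-1], or -1 (errno set) on error. */
static int publish_buffer(const void *buf, size_t len)
{
	int	shmid;
	void	*ptr;

	shmid = shmget(IPC_PRIVATE, len, 0600 | IPC_CREAT);
	if (-1 == shmid)
		return -1;
	ptr = shmat(shmid, NULL, 0);
	if ((void *)-1 == ptr)
		return -1;
	memcpy(ptr, buf, len);	/* child's results now visible to any process that knows shmid */
	shmdt(ptr);		/* detach; the segment persists until IPC_RMID */
	return shmid;
}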
trans_num gvcst_bmp_mark_free(kill_set *ks)
{
	block_id		bit_map, next_bm, *updptr;
	blk_ident		*blk, *blk_top, *nextblk;
	trans_num		ctn, start_db_fmt_tn;
	unsigned int		len;
#	if defined(UNIX) && defined(DEBUG)
	unsigned int		lcl_t_tries;
#	endif
	int4			blk_prev_version;
	srch_hist		alt_hist;
	trans_num		ret_tn = 0;
	boolean_t		visit_blks;
	srch_blk_status		bmphist;
	cache_rec_ptr_t		cr;
	enum db_ver		ondsk_blkver;
	enum cdb_sc		status;
	DCL_THREADGBL_ACCESS;

	SETUP_THREADGBL_ACCESS;
	TREF(in_gvcst_bmp_mark_free) = TRUE;
	assert(inctn_bmp_mark_free_gtm == inctn_opcode || inctn_bmp_mark_free_mu_reorg == inctn_opcode);
	/* Note down the desired_db_format_tn before you start relying on cs_data->fully_upgraded.
	 * If the db is fully_upgraded, take the optimal path that does not need to read each block being freed.
	 * But in order to detect concurrent desired_db_format changes, note down the tn (when the last format change occurred)
	 * before the fully_upgraded check and after having noted down the database current_tn.
	 * If they are the same, then we are guaranteed no concurrent desired_db_format change occurred.
	 * If they are not, then fall through to the non-optimal path where each to-be-killed block has to be visited.
	 * The reason we need to visit every block in case desired_db_format changes is to take care of the case where
	 * MUPIP REORG DOWNGRADE concurrently changes a block that we are about to free.
	 */
	start_db_fmt_tn = cs_data->desired_db_format_tn;
	visit_blks = (!cs_data->fully_upgraded);	/* Local evaluation */
	assert(!visit_blks || (visit_blks && dba_bg == cs_addrs->hdr->acc_meth));	/* must have blks_to_upgrd == 0 for non-BG */
	assert(!dollar_tlevel);				/* Should NOT be in TP now */
	blk = &ks->blk[0];
	blk_top = &ks->blk[ks->used];
	if (!visit_blks)
	{	/* Database has been completely upgraded. Free all blocks in one bitmap as part of one transaction. */
		assert(cs_data->db_got_to_v5_once);	/* assert all V4 fmt blocks (including RECYCLED) have space for V5 upgrade */
		inctn_detail.blknum_struct.blknum = 0;	/* to indicate no adjustment to "blks_to_upgrd" necessary */
		/* If any of the mini transactions below restarts because of an online rollback, we don't want the application
		 * refresh to happen (like $ZONLNRLBK++ or rts_error(DBROLLEDBACK)). This is because, although we are currently
		 * in non-tp (dollar_tlevel = 0), we could actually be in a TP transaction and have actually faked dollar_tlevel.
		 * In such a case, we should NOT be issuing a DBROLLEDBACK error as TP transactions are supposed to just restart
		 * in case of an online rollback. So, set the global variable that gtm_onln_rlbk_clnup can check and skip doing
		 * the application refresh, but will reset the clues. The next update will see the cycle mismatch and will
		 * accordingly take the right action.
		 */
		for ( ; blk < blk_top; blk = nextblk)
		{
			if (0 != blk->flag)
			{
				nextblk = blk + 1;
				continue;
			}
			assert(0 < blk->block);
			assert((int4)blk->block < cs_addrs->ti->total_blks);
			bit_map = ROUND_DOWN2((int)blk->block, BLKS_PER_LMAP);
			next_bm = bit_map + BLKS_PER_LMAP;
			CHECK_AND_RESET_UPDATE_ARRAY;	/* reset update_array_ptr to update_array */
			/* Scan for the next local bitmap */
			updptr = (block_id *)update_array_ptr;
			for (nextblk = blk;
				(0 == nextblk->flag) && (nextblk < blk_top) && ((block_id)nextblk->block < next_bm);
				++nextblk)
			{
				assert((block_id)nextblk->block - bit_map);
				*updptr++ = (block_id)nextblk->block - bit_map;
			}
			len = (unsigned int)((char *)nextblk - (char *)blk);
			update_array_ptr = (char *)updptr;
			alt_hist.h[0].blk_num = 0;		/* needed for calls to T_END for bitmaps */
			alt_hist.h[0].blk_target = NULL;	/* need to initialize for calls to T_END */
			/* the following assumes SIZEOF(blk_ident) == SIZEOF(int) */
			assert(SIZEOF(blk_ident) == SIZEOF(int));
			*(int *)update_array_ptr = 0;
			t_begin(ERR_GVKILLFAIL, UPDTRNS_DB_UPDATED_MASK);
			for (;;)
			{
				ctn = cs_addrs->ti->curr_tn;
				/* Need a read fence before reading fields from cs_data as we are reading outside
				 * of crit and relying on this value to detect a desired db format state change.
				 */
				SHM_READ_MEMORY_BARRIER;
				if (start_db_fmt_tn != cs_data->desired_db_format_tn)
				{	/* Concurrent db format change has occurred. Need to visit every block to be killed
					 * to determine its block format. Fall through to the non-optimal path below.
					 */
					ret_tn = 0;
					break;
				}
				bmphist.blk_num = bit_map;
				if (NULL == (bmphist.buffaddr = t_qread(bmphist.blk_num, (sm_int_ptr_t)&bmphist.cycle,
									&bmphist.cr)))
				{
					t_retry((enum cdb_sc)rdfail_detail);
					continue;
				}
				t_write_map(&bmphist, (uchar_ptr_t)update_array, ctn, -(int4)(nextblk - blk));
				UNIX_ONLY(DEBUG_ONLY(lcl_t_tries = t_tries));
				if ((trans_num)0 == (ret_tn = t_end(&alt_hist, NULL, TN_NOT_SPECIFIED)))
				{
#					ifdef UNIX
					assert((CDB_STAGNATE == t_tries) || (lcl_t_tries == t_tries - 1));
					status = LAST_RESTART_CODE;
					if ((cdb_sc_onln_rlbk1 == status) || (cdb_sc_onln_rlbk2 == status)
						|| TREF(rlbk_during_redo_root))
					{	/* t_end restarted due to online rollback. Discard the bitmap free-up and
						 * return control to the application. But, before that, reset
						 * in_gvcst_bmp_mark_free to FALSE.
						 */
						TREF(in_gvcst_bmp_mark_free) = FALSE;
						send_msg(VARLSTCNT(6) ERR_IGNBMPMRKFREE, 4, REG_LEN_STR(gv_cur_region),
							DB_LEN_STR(gv_cur_region));
						t_abort(gv_cur_region, cs_addrs);
						return ret_tn;	/* actually 0 */
					}
#					endif
					continue;
				}
				break;
			}
			if (0 == ret_tn)	/* db format change occurred; fall through to the for loop below to visit each block */
			{	/* Abort any active transaction to get rid of lingering non-TP artifacts */
				t_abort(gv_cur_region, cs_addrs);
				break;
			}
		}	/* for all blocks in the kill_set */
	}
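/* Editor's note: a minimal standalone sketch (not GT.M source) of the fast path's concurrency pattern
 * above -- sample a shared format-change stamp, stage the batched bitmap update, then re-read the stamp
 * behind a read fence before committing; a mismatch means a concurrent format change (e.g. MUPIP REORG
 * DOWNGRADE), so the caller must fall back to visiting each block. An ordinary commit restart retries,
 * while a restart caused by an online rollback abandons the free-up, mirroring the t_begin/t_end loop.
 * All names (fmt_change_tn, commit_txn, ...) are hypothetical; the stubs exist only to make it runnable.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

typedef enum { COMMIT_OK, COMMIT_RESTART, COMMIT_ONLN_RLBK } commit_t;

static _Atomic unsigned long	fmt_change_tn;		/* bumped by any concurrent format changer */
static int			commit_attempts;

static void	stage_bitmap_update(void) { /* build the update array for one local bitmap */ }

static commit_t	commit_txn(void)
{	/* stub: restart once, then succeed, to exercise the retry arm */
	return (0 == commit_attempts++) ? COMMIT_RESTART : COMMIT_OK;
}

/* Returns true if the batched free committed; false if the caller must fall back
 * (concurrent format change) or give up entirely (*onln_rlbk set on online rollback).
 */
static bool fast_path_free(bool *onln_rlbk)
{
	unsigned long	start_tn;

	*onln_rlbk = false;
	start_tn = atomic_load_explicit(&fmt_change_tn, memory_order_acquire);
	for (;;)
	{
		atomic_thread_fence(memory_order_acquire);	/* analogue of SHM_READ_MEMORY_BARRIER */
		if (start_tn != atomic_load_explicit(&fmt_change_tn, memory_order_relaxed))
			return false;	/* concurrent change detected: visit every block instead */
		stage_bitmap_update();
		switch (commit_txn())
		{
			case COMMIT_OK:
				return true;
			case COMMIT_ONLN_RLBK:
				*onln_rlbk = true;	/* discard the free-up; let the application restart */
				return false;
			case COMMIT_RESTART:
				continue;		/* ordinary restart: retry the mini transaction */
		}
	}
}

int main(void)
{
	bool	onln_rlbk;

	printf("fast path %s\n", fast_path_free(&onln_rlbk) ? "committed" : "fell back");
	return 0;
}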
bool	gtcmtr_increment(void)
{
	cm_region_list	*reg_ref;
	mval		incr_delta, post_incr;
	unsigned char	buff[MAX_ZWR_KEY_SZ], *end;
	unsigned char	*ptr, regnum;
	short		n;
	unsigned short	top, len, temp_short;
	static readonly gds_file_id	file;
	error_def(ERR_KEY2BIG);
	error_def(ERR_GVIS);
	error_def(ERR_DBPRIVERR);

	ptr = curr_entry->clb_ptr->mbf;
	assert(*ptr == CMMS_Q_INCREMENT);
	ptr++;
	GET_USHORT(len, ptr);
	ptr += SIZEOF(unsigned short);
	regnum = *ptr++;
	reg_ref = gtcm_find_region(curr_entry, regnum);
	len--;	/* subtract size of regnum */
	CM_GET_GVCURRKEY(ptr, len);
	gtcm_bind_name(reg_ref->reghead, TRUE);
	if (gv_cur_region->read_only)
		rts_error(VARLSTCNT(4) ERR_DBPRIVERR, 2, DB_LEN_STR(gv_cur_region));
	if (JNL_ALLOWED(cs_addrs))
	{	/* We need to copy the client's specific prc_vec into the global variable so that the gvcst* routines
		 * do the right job. Strictly, we need to do this only if JNL_ENABLED(cs_addrs), but since it is not
		 * easy to re-execute the following two assignments in case gvcst_incr's call to t_end encounters a
		 * cdb_sc_jnlstatemod retry code, we choose the easier approach of executing the following segment
		 * whenever JNL_ALLOWED(cs_addrs) is TRUE instead of checking for JNL_ENABLED(cs_addrs) to be TRUE.
		 * This approach has the overhead that we will be doing the following assignments even though JNL_ENABLED
		 * might not be TRUE, but since the two are just pointer copies, it is not considered a big overhead.
		 * This approach ensures that jnl_put_jrt_pini gets the appropriate prc_vec for writing into the
		 * journal record in case JNL_ENABLED turns out to be TRUE at t_end time.
		 * Note that the value of JNL_ALLOWED(cs_addrs) cannot be changed on the fly without obtaining standalone
		 * access, and hence the correctness of prc_vec (whenever it turns out necessary) is guaranteed.
		 */
		originator_prc_vec = curr_entry->pvec;
		cs_addrs->jnl->pini_addr = reg_ref->pini_addr;
	}
	GET_USHORT(len, ptr);
	ptr += SIZEOF(unsigned short);
	incr_delta.mvtype = MV_STR;
	incr_delta.str.len = len;
	incr_delta.str.addr = (char *)ptr;
	if ((n = gv_currkey->end + 1) > gv_cur_region->max_key_size)
	{
		if ((end = format_targ_key(&buff[0], MAX_ZWR_KEY_SZ, gv_currkey, TRUE)) == 0)
			end = &buff[MAX_ZWR_KEY_SZ - 1];
		rts_error(VARLSTCNT(11) ERR_KEY2BIG, 4, n, (int4)gv_cur_region->max_key_size,
			REG_LEN_STR(gv_cur_region), 0, ERR_GVIS, 2, end - buff, buff);
	}
	MV_FORCE_NUMD(&incr_delta);
	gvcst_incr(&incr_delta, &post_incr);
	if (JNL_ALLOWED(cs_addrs))
		reg_ref->pini_addr = cs_addrs->jnl->pini_addr;	/* in case a journal switch occurred */
	ptr = curr_entry->clb_ptr->mbf;
	if (MV_DEFINED(&post_incr))
	{
		temp_short = (unsigned short)post_incr.str.len;
		assert((int4)temp_short == post_incr.str.len);	/* ushort <- int4 assignment lossy? */
		if (curr_entry->clb_ptr->mbl < 1		/* msg header */
				+ SIZEOF(temp_short)		/* size of length of $INCR return value */
				+ temp_short)			/* length of $INCR return value */
		{	/* resize buffer */
			cmi_realloc_mbf(curr_entry->clb_ptr, 1 + SIZEOF(temp_short) + temp_short);
			ptr = curr_entry->clb_ptr->mbf;
		}
		*ptr++ = CMMS_R_INCREMENT;
		PUT_USHORT(ptr, temp_short);
		ptr += SIZEOF(unsigned short);
		memcpy(ptr, post_incr.str.addr, temp_short);
		ptr += temp_short;
	}
	else
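/* Editor's note: the (truncated) fragment above builds its reply as a 1-byte message code, a 2-byte
 * count, and the $INCREMENT result bytes, growing the connection buffer first if the reply would not
 * fit. Below is a minimal standalone sketch of that encode-with-resize step; it is not GT.M source.
 * MSG_R_INCREMENT is a hypothetical stand-in for CMMS_R_INCREMENT, realloc stands in for
 * cmi_realloc_mbf, and the length is stored in host byte order where PUT_USHORT would apply the
 * wire representation.
 */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

enum { MSG_R_INCREMENT = 0x42 };	/* hypothetical stand-in for CMMS_R_INCREMENT */

/* Encode a reply into *buf (capacity *cap, grown if needed); returns bytes written, 0 on error */
static size_t encode_incr_reply(unsigned char **buf, size_t *cap, const char *val, uint16_t len)
{
	size_t		need = 1 + sizeof(uint16_t) + len;	/* msg header + length field + payload */
	unsigned char	*p;

	if (*cap < need)
	{	/* grow the buffer, as the fragment does via cmi_realloc_mbf */
		if (NULL == (p = realloc(*buf, need)))
			return 0;
		*buf = p;
		*cap = need;
	}
	p = *buf;
	*p++ = MSG_R_INCREMENT;
	memcpy(p, &len, sizeof(uint16_t));
	p += sizeof(uint16_t);
	memcpy(p, val, len);
	return need;
}

int main(void)
{
	unsigned char	*buf = NULL;
	size_t		cap = 0, n;

	n = encode_incr_reply(&buf, &cap, "42", 2);
	printf("encoded %zu bytes, code=0x%02x\n", n, n ? buf[0] : 0);
	free(buf);
	return 0;
}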