void t_busy2free(srch_blk_status *blkhist)
{
	cw_set_element		*cse;
	sgmnt_addrs		*csa;
	blk_hdr_ptr_t		old_block;
	unsigned int		bsiz;
	jnl_buffer_ptr_t	jbbp;	/* jbbp is non-NULL only if before-image journaling */

	assert(!dollar_tlevel);
	assert(cw_set_depth < CDB_CW_SET_SIZE);
	cse = &cw_set[cw_set_depth];
	cse->mode = gds_t_busy2free;
	cse->blk = blkhist->blk_num;
	cse->old_block = blkhist->buffaddr;
	old_block = (blk_hdr_ptr_t)cse->old_block;
	cse->was_free = FALSE;	/* t_busy2free operates on BUSY blocks and hence cse->was_free is set to FALSE unconditionally */
	csa = cs_addrs;
	assert(NULL != old_block);
	jbbp = (JNL_ENABLED(csa) && csa->jnl_before_image) ? csa->jnl->jnl_buff : NULL;
	if ((NULL != jbbp) && (old_block->tn < jbbp->epoch_tn))
	{	/* Pre-compute CHECKSUM. Since we don't necessarily hold crit at this point, ensure we never try to
		 * access the buffer beyond the db blk_size.
		 */
		bsiz = MIN(old_block->bsiz, csa->hdr->blk_size);
		cse->blk_checksum = jnl_get_checksum((uint4 *)old_block, csa, bsiz);
	}
	cse->upd_addr = NULL;
	cse->jnl_freeaddr = 0;	/* reset jnl_freeaddr that previous transaction might have filled in */
	cse->done = FALSE;
	blkhist->cse = cse;	/* indicate to t_end/tp_tend that this block is part of the write-set */
	cw_set_depth++;
}
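/* Illustrative sketch (not part of the GT.M source): why the bsiz clamp above matters. Without crit, a
 * concurrent update can leave a transient garbage value in old_block->bsiz, so the checksum pass is
 * clamped to the configured block size to avoid reading past the buffer. The names below (toy_checksum,
 * DEMO_BLK_SIZE, the demo value) are hypothetical stand-ins, not GT.M APIs.
 */
#include <stdio.h>

#define DEMO_BLK_SIZE	4096u	/* assumed db block size for the demo */

static unsigned int toy_checksum(const unsigned char *buf, unsigned int len)
{
	unsigned int	sum = 0;

	while (len--)
		sum = (sum << 1) ^ *buf++;
	return sum;
}

int main(void)
{
	static unsigned char	block[DEMO_BLK_SIZE];
	unsigned int		bsiz_in_header = 70000;	/* torn/garbage size read without holding crit */
	unsigned int		bsiz;

	bsiz = (bsiz_in_header < DEMO_BLK_SIZE) ? bsiz_in_header : DEMO_BLK_SIZE;	/* the MIN() clamp */
	printf("checksum over %u bytes: %u\n", bsiz, toy_checksum(block, bsiz));
	return 0;
}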
/* make sure that the journal file is available if appropriate */
uint4 jnl_ensure_open(gd_region *reg, sgmnt_addrs *csa)
{
	uint4			jnl_status;
	jnl_private_control	*jpc;
	sgmnt_data_ptr_t	csd;
	boolean_t		first_open_of_jnl, need_to_open_jnl;
	int			close_res;

	csd = csa->hdr;
	assert(csa->now_crit);
	jpc = csa->jnl;
	assert(&FILE_INFO(jpc->region)->s_addrs == csa);
	assert(&FILE_INFO(reg)->s_addrs == csa);
	assert(NULL != jpc);
	assert(JNL_ENABLED(csa->hdr));
	/* The goal is to change the code below to do only one JNL_FILE_SWITCHED(jpc) check instead of the additional
	 * (NOJNL == jpc->channel) check done below. The assert below ensures that the NOJNL check can indeed
	 * be subsumed by the JNL_FILE_SWITCHED check (with the exception of the source-server which has a special case that
	 * needs to be fixed in C9D02-002241). Over time, this has to be changed to one check.
	 */
	assert((NOJNL != jpc->channel) || JNL_FILE_SWITCHED(jpc) || is_src_server);
	need_to_open_jnl = FALSE;
	jnl_status = 0;
	if (NOJNL == jpc->channel)
		need_to_open_jnl = TRUE;
	else if (JNL_FILE_SWITCHED(jpc))
	{	/* The journal file has been changed "on the fly"; close the old one and open the new one */
		JNL_FD_CLOSE(jpc->channel, close_res);	/* sets jpc->channel to NOJNL */
		need_to_open_jnl = TRUE;
	}
	if (need_to_open_jnl)
	{	/* Whenever the journal file gets switched, reset pini_addr and new_freeaddr. */
		jpc->pini_addr = 0;
		jpc->new_freeaddr = 0;
		if (IS_GTCM_GNP_SERVER_IMAGE)
			gtcm_jnl_switched(reg);	/* Reset pini_addr of all clients that had any older journal file open */
		first_open_of_jnl = (0 == csa->nl->jnl_file.u.inode);
		jnl_status = jnl_file_open(reg, first_open_of_jnl);
	}
#	ifdef DEBUG
	else
		GTM_WHITE_BOX_TEST(WBTEST_JNL_FILE_OPEN_FAIL, jnl_status, ERR_JNLFILOPN);
#	endif
	assert((0 != jnl_status) || !JNL_FILE_SWITCHED(jpc)
		|| (is_src_server && !JNL_ENABLED(csa) && REPL_WAS_ENABLED(csa)));
	return jnl_status;
}
void ccp_exitwm3(ccp_db_header *db)
{
	sgmnt_addrs		*csa;
	bt_rec			*que_base, *que_top, *p;
	ccp_action_record	request;
	uint4			status;

	assert(lib$ast_in_prog());
	csa = db->segment;
	assert(csa->nl->ccp_state == CCST_WMXREQ);
	assert(csa->ti->curr_tn == csa->ti->early_tn);
	db->wm_iosb.valblk[CCP_VALBLK_TRANS_HIST] = csa->ti->curr_tn + csa->ti->lock_sequence;
	if (JNL_ENABLED(csa->hdr) && csa->jnl != NULL)
	{
		assert(csa->jnl->channel != 0);
		db->wm_iosb.valblk[CCP_VALBLK_JNL_ADDR] = csa->jnl->jnl_buff->freeaddr;
		db->wm_iosb.valblk[CCP_VALBLK_EPOCH_TN] = csa->jnl->jnl_buff->epoch_tn;
		/* lastaddr is no longer a field in jnl_buff
		 * db->wm_iosb.valblk[CCP_VALBLK_LST_ADDR] = csa->jnl->jnl_buff->lastaddr;
		 */
	}
	/* Convert Write-mode lock from Protected Write to Concurrent Read, writing the lock value block */
	status = ccp_enqw(EFN$C_ENF, LCK$K_CRMODE, &db->wm_iosb, LCK$M_CONVERT | LCK$M_VALBLK, NULL, 0,
			NULL, 0, NULL, PSL$C_USER, 0);
	/***** Check error status here? *****/
	for (que_base = csa->bt_header, que_top = que_base + csa->hdr->bt_buckets; que_base < que_top; ++que_base)
	{
		assert(que_base->blk == BT_QUEHEAD);
		for (p = (bt_rec *)((char *)que_base + que_base->blkque.fl); p != que_base;
			p = (bt_rec *)((char *)p + p->blkque.fl))
		{
			if (((int4)p & 3) != 0)
				ccp_signal_cont(ERR_GTMCHECK);	/***** Is this reasonable? *****/
			p->flushing = FALSE;
		}
	}
	db->blocking_ast_received = FALSE;
	db->wmexit_requested = FALSE;
	csa->nl->ccp_state = CCST_WMXGNT;
	db->wc_rover = 0;
	request.action = CCTR_EWMWTBF;
	request.pid = 0;
	request.v.h = db;
	ccp_act_request(&request);
	return;
}
/* jpc     : Journal private control
 * rectype : Record type
 * jnl_rec : This contains the fixed part of a variable-size record, or the complete fixed-size record.
 * blk_ptr : For JRT_PBLK and JRT_AIMG this has the block image
 * jfb     : For SET/KILL/ZKILL/ZTWORM records the entire record is formatted in this.
 *           For JRT_PBLK and JRT_AIMG it contains partial records
 */
void jnl_write(jnl_private_control *jpc, enum jnl_record_type rectype, jnl_record *jnl_rec,
	blk_hdr_ptr_t blk_ptr, jnl_format_buffer *jfb)
{
	int4			align_rec_len, rlen, rlen_with_align, dstlen, lcl_size, lcl_free, lcl_orig_free;
	jnl_buffer_ptr_t	jb;
	sgmnt_addrs		*csa;
	sgmnt_data_ptr_t	csd;
	node_local_ptr_t	cnl;
	struct_jrec_align	align_rec;
	uint4			status;
	jrec_suffix		suffix;
	boolean_t		nowrap, is_replicated;
	struct_jrec_blk		*jrec_blk;
	uint4			checksum, jnlpool_size, lcl_freeaddr;
	sm_uc_ptr_t		lcl_buff;
	gd_region		*reg;
	char			*ptr;
	int			jnl_wrt_start_modulus, jnl_wrt_start_mask;
	uint4			jnl_fs_block_size, aligned_lcl_free, padding_size;
	uint4			tmp_csum1, tmp_csum2;
#	ifdef DEBUG
	uint4			lcl_dskaddr, mumps_node_sz;
	char			*mumps_node_ptr;
#	endif

	assert(jnl_write_recursion_depth++ < MAX_JNL_WRITE_RECURSION_DEPTH);
	reg = jpc->region;
	csa = &FILE_INFO(reg)->s_addrs;
	csd = csa->hdr;
	is_replicated = jrt_is_replicated[rectype];
	/* Ensure that no replicated journal record is written by this routine if REPL_WAS_ENABLED(csa) is TRUE */
	assert((JNL_ENABLED(csa) && !REPL_WAS_ENABLED(csa)) || !is_replicated);
	/* Assert that the only journal records that the source server ever writes are PINI/PFIN/EPOCH/EOF
	 * which it does at the very end when the database is about to be shut down
	 */
	assert(!is_src_server || (JRT_EOF == rectype) || (JRT_PINI == rectype)
		|| (JRT_EPOCH == rectype) || (JRT_PFIN == rectype));
	assert(csa->now_crit || (csd->clustered && csa->nl->ccp_state == CCST_CLOSED));
	assert(rectype > JRT_BAD && rectype < JRT_RECTYPES && JRT_ALIGN != rectype);
	jb = jpc->jnl_buff;
	/* Before taking a copy of jb->freeaddr, determine if both free and freeaddr are in sync. If not, fix that first. */
	if (jb->free_update_pid)
	{
		FIX_NONZERO_FREE_UPDATE_PID(csa, jb);
	}
	lcl_freeaddr = jb->freeaddr;
	lcl_free = jb->free;
	lcl_size = jb->size;
	lcl_buff = &jb->buff[jb->buff_off];
	DBG_CHECK_JNL_BUFF_FREEADDR(jb);
	++jb->reccnt[rectype];
	assert(NULL != jnl_rec);
	rlen = jnl_rec->prefix.forwptr;
	/* Do high-level check on rlen */
	assert(rlen <= jb->max_jrec_len);
	/* Do fine-grained checks on rlen */
	GTMTRIG_ONLY(assert(!IS_ZTWORM(rectype) || (MAX_ZTWORM_JREC_LEN >= rlen));)	/* ZTWORMHOLE */
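/* Illustrative sketch (not part of the GT.M source): the relationship assumed between the buffer fields
 * snapshotted above. lcl_freeaddr is taken as the virtual journal-file offset of the next byte to fill
 * and lcl_free as the same position expressed inside the wrapping buffer; the invariant modeled below
 * (free == freeaddr % size) is an assumption suggested by the free/freeaddr sync comment and the
 * DBG_CHECK_JNL_BUFF_FREEADDR check. The struct and names here are demo stand-ins, not GT.M types.
 */
#include <assert.h>
#include <stdio.h>

struct demo_jbuf
{
	unsigned int	size;		/* buffer size in bytes */
	unsigned int	freeaddr;	/* virtual journal-file offset of the next free byte */
	unsigned int	free;		/* same position, as an in-buffer offset */
};

static void demo_advance(struct demo_jbuf *jb, unsigned int rlen)
{
	jb->freeaddr += rlen;			/* a record consumed rlen bytes of journal space */
	jb->free = jb->freeaddr % jb->size;	/* keep the in-buffer cursor in sync (may wrap) */
}

int main(void)
{
	struct demo_jbuf	jb = {1024, 1000, 1000 % 1024};

	demo_advance(&jb, 100);			/* wraps past the end of the 1024-byte buffer */
	assert(jb.free == jb.freeaddr % jb.size);
	printf("freeaddr=%u free=%u (wrapped)\n", jb.freeaddr, jb.free);
	return 0;
}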
/* make sure that the journal file is available if appropriate */
uint4 jnl_ensure_open(void)
{
	uint4			jnl_status;
	jnl_private_control	*jpc;
	sgmnt_addrs		*csa;
	sgmnt_data_ptr_t	csd;
	boolean_t		first_open_of_jnl, need_to_open_jnl;
	int			close_res;
#	if defined(VMS)
	static const gds_file_id	file;
	uint4			status;
#	endif

	error_def(ERR_JNLFILOPN);

	csa = cs_addrs;
	csd = csa->hdr;
	assert(csa->now_crit);
	jpc = csa->jnl;
	assert(NULL != jpc);
	assert(JNL_ENABLED(csa->hdr));
	/* The goal is to change the code below to do only one JNL_FILE_SWITCHED(jpc) check instead of the additional
	 * (NOJNL == jpc->channel) check done below. The assert below ensures that the NOJNL check can indeed
	 * be subsumed by the JNL_FILE_SWITCHED check (with the exception of the source-server which has a special case that
	 * needs to be fixed in C9D02-002241). Over time, this has to be changed to one check.
	 */
	assert((NOJNL != jpc->channel) || JNL_FILE_SWITCHED(jpc) || is_src_server);
	need_to_open_jnl = FALSE;
	jnl_status = 0;
	if (NOJNL == jpc->channel)
	{
#		ifdef VMS
		if (NOJNL != jpc->old_channel)
		{
			if (lib$ast_in_prog())	/* called from wcs_wipchk_ast */
				jnl_oper_user_ast(gv_cur_region);
			else
			{
				status = sys$setast(DISABLE);
				jnl_oper_user_ast(gv_cur_region);
				if (SS$_WASSET == status)
					ENABLE_AST;
			}
		}
#		endif
		need_to_open_jnl = TRUE;
	} else if (JNL_FILE_SWITCHED(jpc))
	{	/* The journal file has been changed "on the fly"; close the old one and open the new one */
		VMS_ONLY(assert(FALSE);)	/* everyone having older jnl open should have closed it at time of switch in VMS */
		JNL_FD_CLOSE(jpc->channel, close_res);	/* sets jpc->channel to NOJNL */
		need_to_open_jnl = TRUE;
	}
void jnl_wait(gd_region *reg)
{
	jnl_private_control	*jpc;
	sgmnt_addrs		*csa;
	uint4			status;

	if ((NULL != reg) && (TRUE == reg->open))
	{
		csa = &FILE_INFO(reg)->s_addrs;
		jpc = csa->jnl;
		if ((TRUE == JNL_ENABLED(csa->hdr)) && (NULL != jpc))
		{	/* wait on jnl writes for region */
			status = jnl_write_attempt(jpc, jpc->new_freeaddr);
			if (SS_NORMAL == status && !JNL_FILE_SWITCHED(jpc))
				jnl_fsync(reg, jpc->new_freeaddr);
		}
	}
	return;
}
void jnl_flush(gd_region *reg)
{
	sgmnt_addrs		*csa;
	jnl_private_control	*jpc;
	jnl_buffer_ptr_t	jb;

	if (!reg || !reg->open)
		return;
	csa = &FILE_INFO(reg)->s_addrs;
	assert(csa->now_crit);
	jpc = csa->jnl;
	if (!JNL_ENABLED(csa->hdr) || (NULL == jpc) || (NOJNL == jpc->channel))
		return;
	jb = jpc->jnl_buff;
	jb->blocked = process_id;
	if (jb->freeaddr != jb->dskaddr && SS_NORMAL != jnl_write_attempt(jpc, jb->freeaddr))
		return;
	assert(jb->dskaddr == jb->freeaddr);
	jb->blocked = 0;
	return;
}
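/* Illustrative sketch (not part of the GT.M source): jnl_flush above calls jnl_write_attempt until
 * dskaddr (bytes durably written) catches up with freeaddr (bytes generated), then asserts equality.
 * A toy model of that catch-up; demo_write_attempt is a hypothetical stand-in that pretends the OS
 * accepts at most 512 bytes per write.
 */
#include <assert.h>
#include <stdio.h>

static unsigned int	freeaddr = 4000;	/* journal bytes generated so far */
static unsigned int	dskaddr = 2200;		/* journal bytes already on disk */

static void demo_write_attempt(unsigned int target)
{
	while (dskaddr < target)
	{
		unsigned int	chunk = target - dskaddr;

		dskaddr += (chunk > 512) ? 512 : chunk;	/* partial writes, as a real writer might see */
	}
}

int main(void)
{
	if (freeaddr != dskaddr)
		demo_write_attempt(freeaddr);	/* jnl_flush's loop condition and call, modeled */
	assert(dskaddr == freeaddr);		/* mirrors the assert in jnl_flush */
	printf("flushed to %u\n", dskaddr);
	return 0;
}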
int jnl_file_extend(jnl_private_control *jpc, uint4 total_jnl_rec_size)
{
	file_control		*fc;
	boolean_t		need_extend;
	jnl_buffer_ptr_t	jb;
	jnl_create_info		jnl_info;
	jnl_file_header		header;
	uint4			new_alq;
	sgmnt_addrs		*csa;
	sgmnt_data_ptr_t	csd;
	char			prev_jnl_fn[JNL_NAME_SIZE];
	uint4			jnl_status = 0, status;
	int			new_blocks, result;
	GTM_BAVAIL_TYPE		avail_blocks;
	uint4			aligned_tot_jrec_size, count;

	switch(jpc->region->dyn.addr->acc_meth)
	{
	case dba_mm:
	case dba_bg:
		csa = &FILE_INFO(jpc->region)->s_addrs;
		break;
	default:
		GTMASSERT;
	}
	csd = csa->hdr;
	assert(csa == cs_addrs && csd == cs_data);
	assert(csa->now_crit || (csd->clustered && (CCST_CLOSED == csa->nl->ccp_state)));
	assert(jpc->region == gv_cur_region);
	assert(csa->jnl_state == csd->jnl_state);
	if (!JNL_ENABLED(csa) || (NOJNL == jpc->channel) || (JNL_FILE_SWITCHED(jpc)))
		GTMASSERT;	/* crit and messing with the journal file - how could it have vanished? */
	if (!csd->jnl_deq)
	{
		assert(DIVIDE_ROUND_UP(total_jnl_rec_size, DISK_BLOCK_SIZE) <= csd->jnl_alq);
		assert(csd->jnl_alq == csd->autoswitchlimit);
		new_blocks = csd->jnl_alq;
	} else
		/* May cause extension of csd->jnl_deq * n blocks where n > 0 */
		new_blocks = ROUND_UP(DIVIDE_ROUND_UP(total_jnl_rec_size, DISK_BLOCK_SIZE), csd->jnl_deq);
	jpc->status = SS_NORMAL;
	jb = jpc->jnl_buff;
	assert(0 <= new_blocks);
	DEBUG_ONLY(count = 0);
	for (need_extend = (0 != new_blocks); need_extend; )
	{
		DEBUG_ONLY(count++);
		/* Usually we will do the loop just once, where we do the file extension.
		 * Rarely we might need to do an autoswitch instead, after which again rarely
		 * we might need to do an extension on the new journal to fit in the transaction's journal requirements.
		 * Therefore we should do this loop a maximum of twice. Hence the assert below.
		 */
		assert(count <= 2);
		need_extend = FALSE;
		if (SS_NORMAL == (status = disk_block_available(jpc->channel, &avail_blocks, TRUE)))
		{
			if ((new_blocks * EXTEND_WARNING_FACTOR) > avail_blocks)
			{
				if (new_blocks > avail_blocks)
				{	/* if we cannot satisfy the request, it is an error */
					send_msg(VARLSTCNT(6) ERR_NOSPACEEXT, 4, JNL_LEN_STR(csd),
						new_blocks, avail_blocks);
					new_blocks = 0;
					jpc->status = SS_NORMAL;
					break;
				} else
					send_msg(VARLSTCNT(5) ERR_DSKSPACEFLOW, 3, JNL_LEN_STR(csd),
						(avail_blocks - new_blocks));
			}
		} else
			send_msg(VARLSTCNT(5) ERR_JNLFILEXTERR, 2, JNL_LEN_STR(csd), status);
		new_alq = jb->filesize + new_blocks;
		/* ensure current journal file size is well within autoswitchlimit --> design constraint */
		assert(csd->autoswitchlimit >= jb->filesize);
		if (csd->autoswitchlimit < (jb->filesize + (EXTEND_WARNING_FACTOR * new_blocks)))	/* close to max */
			send_msg(VARLSTCNT(5) ERR_JNLSPACELOW, 3, JNL_LEN_STR(csd), csd->autoswitchlimit - jb->filesize);
		if (csd->autoswitchlimit < new_alq)
		{	/* Reached max, need to autoswitch */
			/* Ensure new journal file can hold the entire current transaction's journal record requirements */
			assert(csd->autoswitchlimit >= MAX_REQD_JNL_FILE_SIZE(total_jnl_rec_size));
			memset(&jnl_info, 0, sizeof(jnl_info));
			jnl_info.prev_jnl = &prev_jnl_fn[0];
			set_jnl_info(gv_cur_region, &jnl_info);
			assert(JNL_ENABLED(csa) && (NOJNL != jpc->channel) && !(JNL_FILE_SWITCHED(jpc)));
			jnl_status = jnl_ensure_open();
			if (0 == jnl_status)
			{	/* flush the cache and jnl-buffer-contents to the current journal file before
				 * switching to a new journal.
				 */
				wcs_flu(WCSFLU_FLUSH_HDR | WCSFLU_WRITE_EPOCH);
				jnl_file_close(gv_cur_region, TRUE, TRUE);
			} else
				rts_error(VARLSTCNT(7) jnl_status, 4, JNL_LEN_STR(csd), DB_LEN_STR(gv_cur_region),
					jpc->status);
			assert(!jgbl.forw_phase_recovery || (NULL != jgbl.mur_pini_addr_reset_fnptr));
			if (jgbl.forw_phase_recovery && (NULL != jgbl.mur_pini_addr_reset_fnptr))
				(*jgbl.mur_pini_addr_reset_fnptr)();
			assert(!jnl_info.no_rename);
			assert(!jnl_info.no_prev_link);
			if (EXIT_NRM == cre_jnl_file(&jnl_info))
			{
				assert(0 == memcmp(csd->jnl_file_name, jnl_info.jnl, jnl_info.jnl_len));
				assert(csd->jnl_file_name[jnl_info.jnl_len] == '\0');
				assert(csd->jnl_file_len == jnl_info.jnl_len);
				assert(csd->jnl_buffer_size == jnl_info.buffer);
				assert(csd->jnl_alq == jnl_info.alloc);
				assert(csd->jnl_deq == jnl_info.extend);
				assert(csd->jnl_before_image == jnl_info.before_images);
				csd->trans_hist.header_open_tn = jnl_info.tn;	/* needed for successful jnl_file_open() */
				send_msg(VARLSTCNT(4) ERR_NEWJNLFILECREATE, 2, JNL_LEN_STR(csd));
				fc = gv_cur_region->dyn.addr->file_cntl;
				fc->op = FC_WRITE;
				fc->op_buff = (sm_uc_ptr_t)csd;
				status = dbfilop(fc);
				if (SS_NORMAL != status)
					send_msg(VARLSTCNT(5) ERR_DBFILERR, 2, DB_LEN_STR(gv_cur_region), status);
				assert(JNL_ENABLED(csa));
				/* call jnl_ensure_open instead of jnl_file_open to make sure jpc->pini_addr is set to 0 */
				jnl_status = jnl_ensure_open();	/* sets jpc->status */
				if (0 != jnl_status)
					rts_error(VARLSTCNT(6) jnl_status, 4, JNL_LEN_STR(csd), DB_LEN_STR(gv_cur_region));
				assert(jb->filesize == csd->jnl_alq);
				aligned_tot_jrec_size = ALIGNED_ROUND_UP(MAX_REQD_JNL_FILE_SIZE(total_jnl_rec_size),
										csd->jnl_alq, csd->jnl_deq);
				if (aligned_tot_jrec_size > csd->jnl_alq)
				{	/* need to extend more than the initial allocation in the new journal file
					 * to accommodate the current transaction.
					 */
					new_blocks = aligned_tot_jrec_size - csd->jnl_alq;
					assert(new_blocks);
					assert(0 == new_blocks % csd->jnl_deq);
					need_extend = TRUE;
				}
			} else
			{
				send_msg(VARLSTCNT(4) ERR_JNLCREATERR, 2, JNL_LEN_STR(csd));
				jpc->status = ERR_JNLNOCREATE;
				new_blocks = -1;
			}
		} else
		{
			assert(!need_extend);	/* ensure we won't go through the for loop again */
			/* Virtually extend the currently used journal file */
			jb->filesize = new_alq;	/* Actually this is the virtual file size in blocks */
			DO_FILE_READ(jpc->channel, 0, &header, JNL_HDR_LEN, jpc->status, jpc->status2);
			if (SS_NORMAL != jpc->status)
				rts_error(VARLSTCNT(5) ERR_JNLRDERR, 2, JNL_LEN_STR(csd), jpc->status);
			assert((header.virtual_size + new_blocks) == new_alq);
			header.virtual_size = new_alq;
			DO_FILE_WRITE(jpc->channel, 0, &header, JNL_HDR_LEN, jpc->status, jpc->status2);
			if (SS_NORMAL != jpc->status)
				rts_error(VARLSTCNT(5) ERR_JNLWRERR, 2, JNL_LEN_STR(csd), jpc->status);
		}
		if (0 >= new_blocks)
			break;
	}
	if (0 >= new_blocks)
	{
		jpc->status = ERR_JNLREADEOF;
		jnl_file_lost(jpc, ERR_JNLEXTEND);
		new_blocks = -1;
	}
	return new_blocks;
}
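/* Illustrative worked example (not part of the GT.M source): how new_blocks is sized above. Assuming
 * the macro definitions shown here (ceiling division, then rounding up to a multiple of the extension
 * quantum jnl_deq) match the source's, a 700000-byte transaction on 512-byte disk blocks with
 * jnl_deq = 100 needs ceil(700000/512) = 1368 blocks, rounded up to 1400. Values are hypothetical.
 */
#include <stdio.h>

#define DISK_BLOCK_SIZE		512
#define DIVIDE_ROUND_UP(x, y)	(((x) + (y) - 1) / (y))		/* assumed definition */
#define ROUND_UP(x, y)		(DIVIDE_ROUND_UP(x, y) * (y))	/* assumed definition */

int main(void)
{
	unsigned int	total_jnl_rec_size = 700000;	/* bytes the transaction will journal */
	unsigned int	jnl_deq = 100;			/* extension quantum, in disk blocks */
	unsigned int	new_blocks;

	new_blocks = ROUND_UP(DIVIDE_ROUND_UP(total_jnl_rec_size, DISK_BLOCK_SIZE), jnl_deq);
	printf("new_blocks = %u\n", new_blocks);	/* prints 1400 */
	return 0;
}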
uint4 jnl_file_extend(jnl_private_control *jpc, uint4 total_jnl_rec_size)
{
	file_control		*fc;
	boolean_t		need_extend;
	jnl_buffer_ptr_t	jb;
	jnl_create_info		jnl_info;
	jnl_file_header		*header;
	unsigned char		hdr_buff[REAL_JNL_HDR_LEN + MAX_IO_BLOCK_SIZE];
	uint4			new_alq;
	sgmnt_addrs		*csa;
	sgmnt_data_ptr_t	csd;
	char			prev_jnl_fn[JNL_NAME_SIZE];
	uint4			jnl_status = 0, status;
	int			new_blocks, warn_blocks, result;
	gtm_uint64_t		avail_blocks;
	uint4			aligned_tot_jrec_size, count;
	uint4			jnl_fs_block_size, read_write_size;
	DCL_THREADGBL_ACCESS;

	switch(jpc->region->dyn.addr->acc_meth)
	{
	case dba_mm:
	case dba_bg:
		csa = &FILE_INFO(jpc->region)->s_addrs;
		break;
	default:
		GTMASSERT;
	}
	csd = csa->hdr;
	assert(csa == cs_addrs && csd == cs_data);
	assert(csa->now_crit || (csd->clustered && (CCST_CLOSED == csa->nl->ccp_state)));
	assert(&FILE_INFO(jpc->region)->s_addrs == csa);
	assert(csa->jnl_state == csd->jnl_state);
	assertpro(JNL_ENABLED(csa) && (NOJNL != jpc->channel) && (!JNL_FILE_SWITCHED(jpc)));
		/* crit and messing with the journal file - how could it have vanished? */
	if (!csd->jnl_deq || (csd->jnl_alq + csd->jnl_deq > csd->autoswitchlimit))
	{
		assert(DIVIDE_ROUND_UP(total_jnl_rec_size, DISK_BLOCK_SIZE) <= csd->jnl_alq);
		assert(csd->jnl_alq == csd->autoswitchlimit);
		new_blocks = csd->jnl_alq;
	} else
		/* May cause extension of csd->jnl_deq * n blocks where n > 0 */
		new_blocks = ROUND_UP(DIVIDE_ROUND_UP(total_jnl_rec_size, DISK_BLOCK_SIZE), csd->jnl_deq);
	jpc->status = SS_NORMAL;
	jb = jpc->jnl_buff;
	assert(0 <= new_blocks);
	DEBUG_ONLY(count = 0);
	for (need_extend = (0 != new_blocks); need_extend; )
	{
		DEBUG_ONLY(count++);
		/* Usually we will do the loop just once, where we do the file extension.
		 * Rarely we might need to do an autoswitch instead, after which again rarely
		 * we might need to do an extension on the new journal to fit in the transaction's journal requirements.
		 * Therefore we should do this loop a maximum of twice. Hence the assert below.
		 */
		assert(count <= 2);
		need_extend = FALSE;
		if (SS_NORMAL == (status = disk_block_available(jpc->channel, &avail_blocks, TRUE)))
		{
			warn_blocks = (csd->jnl_alq + csd->jnl_deq > csd->autoswitchlimit)
					? ((csd->jnl_deq > csd->autoswitchlimit) ? csd->jnl_deq : csd->autoswitchlimit)
					: new_blocks;
			if ((warn_blocks * EXTEND_WARNING_FACTOR) > avail_blocks)
			{
				if (new_blocks > avail_blocks)
				{	/* If we cannot satisfy the request, it is an error, unless the anticipatory freeze
					 * scheme is in effect, in which case we will assume space is available even if
					 * it is not and go ahead with writes to the disk. If the writes fail with ENOSPC
					 * we will freeze the instance and wait for space to become available and keep
					 * retrying the writes. Therefore, we make the NOSPACEEXT a warning in this case.
					 */
					SETUP_THREADGBL_ACCESS;
					if (!ANTICIPATORY_FREEZE_ENABLED(csa))
					{
						send_msg(VARLSTCNT(6) ERR_NOSPACEEXT, 4, JNL_LEN_STR(csd),
							new_blocks, avail_blocks);
						new_blocks = 0;
						jpc->status = SS_NORMAL;
						break;
					} else
						send_msg(VARLSTCNT(6) MAKE_MSG_WARNING(ERR_NOSPACEEXT), 4,
							JNL_LEN_STR(csd), new_blocks, avail_blocks);
				} else
					send_msg(VARLSTCNT(5) ERR_DSKSPACEFLOW, 3, JNL_LEN_STR(csd),
						(avail_blocks - warn_blocks));
			}
		} else
			send_msg(VARLSTCNT(5) ERR_JNLFILEXTERR, 2, JNL_LEN_STR(csd), status);
		new_alq = jb->filesize + new_blocks;
		/* ensure current journal file size is well within autoswitchlimit --> design constraint */
		assert(csd->autoswitchlimit >= jb->filesize);
		if (csd->autoswitchlimit < (jb->filesize + (EXTEND_WARNING_FACTOR * new_blocks)))	/* close to max */
			send_msg(VARLSTCNT(5) ERR_JNLSPACELOW, 3, JNL_LEN_STR(csd), csd->autoswitchlimit - jb->filesize);
		if (csd->autoswitchlimit < new_alq)
		{	/* Reached max, need to autoswitch */
			/* Ensure new journal file can hold the entire current transaction's journal record requirements */
			assert(csd->autoswitchlimit >= MAX_REQD_JNL_FILE_SIZE(total_jnl_rec_size));
			memset(&jnl_info, 0, SIZEOF(jnl_info));
			jnl_info.prev_jnl = &prev_jnl_fn[0];
			set_jnl_info(gv_cur_region, &jnl_info);
			assert(JNL_ENABLED(csa) && (NOJNL != jpc->channel) && !(JNL_FILE_SWITCHED(jpc)));
			jnl_status = jnl_ensure_open();
			if (0 == jnl_status)
			{	/* Flush the cache and jnl-buffer-contents to the current journal file before
				 * switching to a new journal. Set a global variable in_jnl_file_autoswitch
				 * so jnl_write can know not to do the padding check. But because this is a global
				 * variable, we also need to make sure it is reset in case of errors during the
				 * autoswitch (or else calls to jnl_write after we are out of the autoswitch logic
				 * will continue to incorrectly not do the padding check). Hence a condition handler.
				 */
				assert(!in_jnl_file_autoswitch);
				in_jnl_file_autoswitch = TRUE;
				/* Also make sure time is not changed. This way if "jnl_write" as part of writing a
				 * journal record invokes jnl_file_extend, when the autoswitch is done and writing
				 * of the parent jnl_write resumes, we want it to continue with the same timestamp
				 * and not have to reset its time (non-trivial task) to reflect any changes since then.
				 */
				assert(!jgbl.save_dont_reset_gbl_jrec_time);
				jgbl.save_dont_reset_gbl_jrec_time = jgbl.dont_reset_gbl_jrec_time;
				jgbl.dont_reset_gbl_jrec_time = TRUE;
				/* Establish a condition handler so we reset a few global variables that have
				 * temporarily been modified in case of errors inside wcs_flu/jnl_file_close.
				 */
				ESTABLISH_RET(jnl_file_autoswitch_ch, EXIT_ERR);
				/* It is possible we still have not written a PINI record in this journal file
				 * (e.g. mupip extend saw the need to do jnl_file_extend inside jnl_write while
				 * trying to write a PINI record). Write a PINI record in that case before closing
				 * the journal file, so the EOF record will have a non-zero pini_addr.
				 */
				if (0 == jpc->pini_addr)
					jnl_put_jrt_pini(csa);
				wcs_flu(WCSFLU_FLUSH_HDR | WCSFLU_WRITE_EPOCH | WCSFLU_SPEEDUP_NOBEFORE);
				jnl_file_close(gv_cur_region, TRUE, TRUE);
				REVERT;
				in_jnl_file_autoswitch = FALSE;
				jgbl.dont_reset_gbl_jrec_time = jgbl.save_dont_reset_gbl_jrec_time;
				DEBUG_ONLY(jgbl.save_dont_reset_gbl_jrec_time = FALSE);
				assert((dba_mm == cs_data->acc_meth) || (csd == cs_data));
				csd = cs_data;	/* In MM, wcs_flu() can remap an extended DB, so reset csd to be sure */
			} else
			{
				if (SS_NORMAL != jpc->status)
					rts_error(VARLSTCNT(7) jnl_status, 4, JNL_LEN_STR(csd),
						DB_LEN_STR(gv_cur_region), jpc->status);
				else
					rts_error(VARLSTCNT(6) jnl_status, 4, JNL_LEN_STR(csd),
						DB_LEN_STR(gv_cur_region));
			}
			assert(!jgbl.forw_phase_recovery || (NULL != jgbl.mur_pini_addr_reset_fnptr));
			assert(jgbl.forw_phase_recovery || (NULL == jgbl.mur_pini_addr_reset_fnptr));
			if (NULL != jgbl.mur_pini_addr_reset_fnptr)
				(*jgbl.mur_pini_addr_reset_fnptr)(csa);
			assert(!jnl_info.no_rename);
			assert(!jnl_info.no_prev_link);
			if (EXIT_NRM == cre_jnl_file(&jnl_info))
			{
				assert(0 == memcmp(csd->jnl_file_name, jnl_info.jnl, jnl_info.jnl_len));
				assert(csd->jnl_file_name[jnl_info.jnl_len] == '\0');
				assert(csd->jnl_file_len == jnl_info.jnl_len);
				assert(csd->jnl_buffer_size == jnl_info.buffer);
				assert(csd->jnl_alq == jnl_info.alloc);
				assert(csd->jnl_deq == jnl_info.extend);
				assert(csd->jnl_before_image == jnl_info.before_images);
				csd->jnl_checksum = jnl_info.checksum;
				csd->jnl_eovtn = csd->trans_hist.curr_tn;
				send_msg(VARLSTCNT(4) ERR_NEWJNLFILECREAT, 2, JNL_LEN_STR(csd));
				fc = gv_cur_region->dyn.addr->file_cntl;
				fc->op = FC_WRITE;
				fc->op_buff = (sm_uc_ptr_t)csd;
				fc->op_len = SGMNT_HDR_LEN;
				fc->op_pos = 1;
				status = dbfilop(fc);
				if (SS_NORMAL != status)
					send_msg(VARLSTCNT(5) ERR_DBFILERR, 2, DB_LEN_STR(gv_cur_region), status);
				assert(JNL_ENABLED(csa));
				/* call jnl_ensure_open instead of jnl_file_open to make sure jpc->pini_addr is set to 0 */
				jnl_status = jnl_ensure_open();	/* sets jpc->status */
				if (0 != jnl_status)
				{
					if (jpc->status)
						rts_error(VARLSTCNT(7) jnl_status, 4, JNL_LEN_STR(csd),
							DB_LEN_STR(gv_cur_region), jpc->status);
					else
						rts_error(VARLSTCNT(6) jnl_status, 4, JNL_LEN_STR(csd),
							DB_LEN_STR(gv_cur_region));
				}
				assert(jb->filesize == csd->jnl_alq);
				if (csd->jnl_alq + csd->jnl_deq <= csd->autoswitchlimit)
				{
					aligned_tot_jrec_size = ALIGNED_ROUND_UP(MAX_REQD_JNL_FILE_SIZE(total_jnl_rec_size),
											csd->jnl_alq, csd->jnl_deq);
					if (aligned_tot_jrec_size > csd->jnl_alq)
					{	/* need to extend more than the initial allocation in the new journal file
						 * to accommodate the current transaction.
						 */
						new_blocks = aligned_tot_jrec_size - csd->jnl_alq;
						assert(new_blocks);
						assert(0 == new_blocks % csd->jnl_deq);
						need_extend = TRUE;
					}
				}
			} else
			{
				send_msg(VARLSTCNT(4) ERR_JNLNOCREATE, 2, JNL_LEN_STR(csd));
				jpc->status = ERR_JNLNOCREATE;
				new_blocks = -1;
			}
		} else
		{
			assert(!need_extend);	/* ensure we won't go through the for loop again */
			/* Virtually extend the currently used journal file */
			jnl_fs_block_size = jb->fs_block_size;
			header = (jnl_file_header *)(ROUND_UP2((uintszofptr_t)hdr_buff, jnl_fs_block_size));
			read_write_size = ROUND_UP2(REAL_JNL_HDR_LEN, jnl_fs_block_size);
			assert((unsigned char *)header + read_write_size <= ARRAYTOP(hdr_buff));
			DO_FILE_READ(jpc->channel, 0, header, read_write_size, jpc->status, jpc->status2);
			if (SS_NORMAL != jpc->status)
			{
				assert(FALSE);
				rts_error(VARLSTCNT(5) ERR_JNLRDERR, 2, JNL_LEN_STR(csd), jpc->status);
			}
			assert((header->virtual_size + new_blocks) == new_alq);
			jb->filesize = new_alq;	/* Actually this is the virtual file size in blocks */
			header->virtual_size = new_alq;
			JNL_DO_FILE_WRITE(csa, csd->jnl_file_name, jpc->channel, 0, header, read_write_size,
				jpc->status, jpc->status2);
			if (SS_NORMAL != jpc->status)
			{
				assert(FALSE);
				rts_error(VARLSTCNT(5) ERR_JNLWRERR, 2, JNL_LEN_STR(csd), jpc->status);
			}
		}
		if (0 >= new_blocks)
			break;
	}
	if (0 < new_blocks)
	{
		INCR_GVSTATS_COUNTER(csa, csa->nl, n_jnl_extends, 1);
		return EXIT_NRM;
	}
	jpc->status = ERR_JNLREADEOF;
	jnl_file_lost(jpc, ERR_JNLEXTEND);
	return EXIT_ERR;
}
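/* Illustrative worked example (not part of the GT.M source): the extend-vs-autoswitch decision above.
 * The prospective size new_alq = filesize + new_blocks is compared against autoswitchlimit; getting
 * within EXTEND_WARNING_FACTOR extensions of the limit only warns (JNLSPACELOW), while crossing the
 * limit forces a switch to a fresh journal file. The factor value and all numbers are assumptions.
 */
#include <stdio.h>

#define EXTEND_WARNING_FACTOR	3	/* assumed to match the source's warning multiplier */

int main(void)
{
	unsigned int	filesize = 7800, new_blocks = 1400, autoswitchlimit = 10000;
	unsigned int	new_alq = filesize + new_blocks;

	if (autoswitchlimit < (filesize + (EXTEND_WARNING_FACTOR * new_blocks)))
		printf("JNLSPACELOW: only %u blocks left before the limit\n", autoswitchlimit - filesize);
	if (autoswitchlimit < new_alq)
		printf("autoswitch to a new journal file\n");
	else
		printf("extend in place to %u blocks\n", new_alq);	/* this branch for these numbers */
	return 0;
}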
void gtcmd_rundown(connection_struct *cnx, bool clean_exit)
{
	int4			link;
	cm_region_list		*ptr, *last, *que_next, *que_last;
	cm_region_head		*region;
	uint4			jnl_status;
	jnl_private_control	*jpc;
	jnl_buffer_ptr_t	jbp;
	int			refcnt;
	boolean_t		was_crit;
	int4			rundown_status = EXIT_NRM;	/* if gds_rundown went smoothly */

	for (ptr = cnx->region_root; ptr;)
	{
		region = ptr->reghead;
		TP_CHANGE_REG(region->reg);
		jpc = cs_addrs->jnl;
		if (ptr->pini_addr && clean_exit && JNL_ENABLED(cs_data) && (NOJNL != jpc->channel))
		{
			was_crit = cs_addrs->now_crit;
			if (!was_crit)
				grab_crit(gv_cur_region);
			if (JNL_ENABLED(cs_data))
			{
				jpc->pini_addr = ptr->pini_addr;
				SET_GBL_JREC_TIME;	/* jnl_ensure_open/jnl_put_jrt_pfin needs this to be set */
				jbp = jpc->jnl_buff;
				/* Before writing to the jnlfile, adjust jgbl.gbl_jrec_time if needed to maintain time order
				 * of jnl records. This needs to be done BEFORE the jnl_ensure_open as that could write
				 * journal records (if it decides to switch to a new journal file).
				 */
				ADJUST_GBL_JREC_TIME(jgbl, jbp);
				jnl_status = jnl_ensure_open();
				if (0 == jnl_status)
				{
					if (0 != jpc->pini_addr)
						jnl_put_jrt_pfin(cs_addrs);
				} else
					send_msg(VARLSTCNT(6) jnl_status, 4, JNL_LEN_STR(cs_data),
						DB_LEN_STR(gv_cur_region));
			}
			if (!was_crit)
				rel_crit(gv_cur_region);
		}
		refcnt = --region->refcnt;
		/* Don't know how refcnt can become negative, but in pro handle it by bypassing this region. The reason is the
		 * following. refcnt should have originally been a positive value. Every time this function is invoked, it would
		 * be decremented by one. There should have been one invocation that saw refcnt to be zero. That would have
		 * done the rundown of the region, or if it is still in the stack the rundown is still in progress. Therefore
		 * it is not a good idea to try running down this region when we see refcnt to be negative (as otherwise we
		 * will get confused and could potentially end up with SIG-11 or ACCVIO errors). The worst case is that we
		 * would not have rundown the region, in which case an externally issued MUPIP RUNDOWN would be enough.
		 */
		assert(0 <= refcnt);
		if (0 == refcnt)
		{	/* free up only as little as needed to facilitate structure reuse when the region is opened again */
			assert(region->head.fl == region->head.bl);
			VMS_ONLY(gtcm_ast_avail++);
			if (JNL_ALLOWED(cs_data))
				jpc->pini_addr = 0;
			UNIX_ONLY(rundown_status |=) gds_rundown();
			gd_ht_kill(region->reg_hash, TRUE);	/* TRUE to free up the table and the gv_targets it holds too */
			FREE_CSA_DIR_TREE(cs_addrs);
			cm_del_gdr_ptr(gv_cur_region);
		}
		que_next = (cm_region_list *)((unsigned char *)ptr + ptr->regque.fl);
		que_last = (cm_region_list *)((unsigned char *)ptr + ptr->regque.bl);
		link = (int4)((unsigned char *)que_next - (unsigned char *)que_last);
		que_last->regque.fl = link;
		que_next->regque.bl = -link;
		last = ptr;
		ptr = ptr->next;
		free(last);
	}
uint4 gdsfilext(uint4 blocks, uint4 filesize, boolean_t trans_in_prog)
{
	sm_uc_ptr_t		old_base[2], mmap_retaddr;
	boolean_t		was_crit, is_mm;
	int			result, save_errno, status;
	DEBUG_ONLY(int		first_save_errno);
	uint4			new_bit_maps, bplmap, map, new_blocks, new_total, max_tot_blks, old_total;
	uint4			jnl_status;
	gtm_uint64_t		avail_blocks, mmap_sz;
	off_t			new_eof, new_size;
	trans_num		curr_tn;
	unix_db_info		*udi;
	inctn_opcode_t		save_inctn_opcode;
	int4			prev_extend_blks_to_upgrd;
	jnl_private_control	*jpc;
	jnl_buffer_ptr_t	jbp;
	cache_rec_ptr_t		cr;
	DCL_THREADGBL_ACCESS;

	SETUP_THREADGBL_ACCESS;
	assert(!IS_DSE_IMAGE);
	assert((cs_addrs->nl == NULL) || (process_id != cs_addrs->nl->trunc_pid));	/* mu_truncate shouldn't extend file... */
	assert(!process_exiting);
	DEBUG_ONLY(old_base[0] = old_base[1] = NULL);
	assert(!gv_cur_region->read_only);
	udi = FILE_INFO(gv_cur_region);
	is_mm = (dba_mm == cs_addrs->hdr->acc_meth);
#	if !defined(MM_FILE_EXT_OK)
	if (!udi->grabbed_access_sem && is_mm)
		return (uint4)(NO_FREE_SPACE);	/* should this be changed to show extension not allowed ? */
#	endif
	/* Both blocks and total blocks are unsigned ints, so make sure we aren't asking for huge numbers that will
	 * overflow and end up doing silly things.
	 */
	assert((blocks <= (MAXTOTALBLKS(cs_data) - cs_data->trans_hist.total_blks)) || WBTEST_ENABLED(WBTEST_FILE_EXTEND_ERROR));
#	if defined(__sun) || defined(__hpux)
	cs_data->defer_allocate = TRUE;
#	endif
	if (!blocks && (cs_data->defer_allocate || (TRANS_IN_PROG_TRUE == trans_in_prog)))
		return (uint4)(NO_FREE_SPACE);	/* should this be changed to show extension not enabled ? */
	bplmap = cs_data->bplmap;
	/* New total of non-bitmap blocks will be the number of current non-bitmap blocks plus the new blocks desired.
	 * There are (bplmap - 1) non-bitmap blocks per bitmap, so add (bplmap - 2) to the number of non-bitmap blocks
	 * and divide by (bplmap - 1) to get the total number of bitmaps for the expanded database. (must round up in this
	 * manner as every non-bitmap block must have an associated bitmap)
	 * Current number of bitmaps is (total number of current blocks + bplmap - 1) / bplmap.
	 * Subtract the current number of bitmaps from the number needed for the expanded database to get the number of
	 * new bitmaps needed.
	 */
	new_bit_maps = DIVIDE_ROUND_UP(cs_data->trans_hist.total_blks
			- DIVIDE_ROUND_UP(cs_data->trans_hist.total_blks, bplmap) + blocks, bplmap - 1)
			- DIVIDE_ROUND_UP(cs_data->trans_hist.total_blks, bplmap);
	new_blocks = blocks + new_bit_maps;
	assert((0 < (int)new_blocks) || (!cs_data->defer_allocate && (0 == new_blocks)));
	if (new_blocks + cs_data->trans_hist.total_blks > MAXTOTALBLKS(cs_data))
	{
		assert(WBTEST_ENABLED(WBTEST_FILE_EXTEND_ERROR));
		send_msg_csa(CSA_ARG(cs_addrs) VARLSTCNT(1) ERR_TOTALBLKMAX);
		return (uint4)(NO_FREE_SPACE);
	}
	if (0 != (save_errno = disk_block_available(udi->fd, &avail_blocks, FALSE)))
	{
		send_msg_csa(CSA_ARG(cs_addrs) VARLSTCNT(5) ERR_DBFILERR, 2, DB_LEN_STR(gv_cur_region), save_errno);
		rts_error_csa(CSA_ARG(cs_addrs) VARLSTCNT(5) ERR_DBFILERR, 2, DB_LEN_STR(gv_cur_region), save_errno);
	} else
	{
		if (!(gtmDebugLevel & GDL_IgnoreAvailSpace))
		{	/* Bypass this space check if the debug flag above is on. Allows us to create a large sparse DB
			 * in space it could never fit in if it weren't sparse. Needed for some tests.
			 */
			avail_blocks = avail_blocks / (cs_data->blk_size / DISK_BLOCK_SIZE);
			if ((blocks * EXTEND_WARNING_FACTOR) > avail_blocks)
			{
				if (blocks > (uint4)avail_blocks)
				{
					if (!INST_FREEZE_ON_NOSPC_ENABLED(cs_addrs))
						return (uint4)(NO_FREE_SPACE);
					else
						send_msg_csa(CSA_ARG(cs_addrs) VARLSTCNT(6)
							MAKE_MSG_WARNING(ERR_NOSPACEEXT), 4,
							DB_LEN_STR(gv_cur_region), new_blocks, (uint4)avail_blocks);
				} else
					send_msg_csa(CSA_ARG(cs_addrs) VARLSTCNT(5) ERR_DSKSPACEFLOW, 3,
						DB_LEN_STR(gv_cur_region),
						(uint4)(avail_blocks - ((new_blocks <= avail_blocks) ? new_blocks : 0)));
			}
		}
	}
#	ifdef DEBUG
	if (WBTEST_ENABLED(WBTEST_MM_CONCURRENT_FILE_EXTEND) && dollar_tlevel
		&& !MEMCMP_LIT(gv_cur_region->rname, "DEFAULT"))
	{
		SYSTEM("$gtm_dist/mumps -run $gtm_wbox_mrtn");
		assert(1 == cs_addrs->nl->wbox_test_seq_num);	/* should have been set by mubfilcpy */
		cs_addrs->nl->wbox_test_seq_num = 2;	/* signal mupip backup to stop sleeping in mubfilcpy */
	}
#	endif
	/* From here on, we need to use GDSFILEXT_CLNUP before returning to the caller */
	was_crit = cs_addrs->now_crit;
	assert(!cs_addrs->hold_onto_crit || was_crit);
	/* If we are coming from mupip_extend (which gets crit itself), we better have waited for any unfreezes to occur.
	 * If we are coming from online rollback (when that feature is available), we will come in holding crit and in
	 * the final retry. In that case too, we expect to have waited for unfreezes to occur in the caller itself.
	 * Therefore, if we are coming in holding crit from MUPIP, we expect the db to be unfrozen, so there is no need
	 * to wait for the freeze.
	 * If we are coming from GT.M and the final retry (in which case we come in holding crit), we expect to have waited
	 * for any unfreezes (by invoking tp_crit_all_regions) to occur (TP or non-TP) before coming into this
	 * function. However, there is one exception. In the final retry, if tp_crit_all_regions notices that
	 * at least one of the participating regions did ONLY READs, it will not wait for any freeze on THAT region
	 * to complete before grabbing crit. Later, in the final retry, if THAT region did an update which caused
	 * op_tcommit to invoke bm_getfree->gdsfilext, then we would have come here with a frozen region on which
	 * we hold crit.
	 */
	assert(!was_crit || !FROZEN_HARD(cs_data) || (dollar_tlevel && (CDB_STAGNATE <= t_tries)));
	/* If we are in the final retry and already hold crit, it is possible that csa->nl->wc_blocked is also set to TRUE
	 * (by a concurrent process in phase2 which encountered an error in the midst of commit and secshr_db_clnup
	 * finished the job for it). In this case we do NOT want to invoke wcs_recover as that will update the "bt"
	 * transaction numbers without correspondingly updating the history transaction numbers (effectively causing
	 * a cdb_sc_blkmod type of restart). Therefore do NOT call grab_crit (which unconditionally invokes wcs_recover)
	 * if we already hold crit.
	 */
	if (!was_crit)
	{
		for ( ; ; )
		{
			grab_crit(gv_cur_region);
			if (FROZEN_CHILLED(cs_data))
				DO_CHILLED_AUTORELEASE(cs_addrs, cs_data);
			if (!FROZEN(cs_data) && !IS_REPL_INST_FROZEN)
				break;
			rel_crit(gv_cur_region);
			while (FROZEN(cs_data) || IS_REPL_INST_FROZEN)
			{
				hiber_start(1000);
				if (FROZEN_CHILLED(cs_data) && CHILLED_AUTORELEASE(cs_data))
					break;
			}
		}
	} else if (FROZEN_HARD(cs_data) && dollar_tlevel)
	{	/* We don't want to continue with the file extension as explained above. Hence return with an error code which
		 * op_tcommit will recognize (as a cdb_sc_needcrit/cdb_sc_instancefreeze type of restart) and restart accordingly.
		 */
		assert(CDB_STAGNATE <= t_tries);
		GDSFILEXT_CLNUP;
		return (uint4)FINAL_RETRY_FREEZE_PROG;
	} else
		WAIT_FOR_REGION_TO_UNCHILL(cs_addrs, cs_data);
	if (IS_REPL_INST_FROZEN && trans_in_prog)
	{
		assert(CDB_STAGNATE <= t_tries);
		GDSFILEXT_CLNUP;
		return (uint4)FINAL_RETRY_INST_FREEZE;
	}
	assert(cs_addrs->ti->total_blks == cs_data->trans_hist.total_blks);
	old_total = cs_data->trans_hist.total_blks;
	if (old_total != filesize)
	{	/* Somebody else has already extended it; since we are in crit, this is trustworthy. However, in case of MM,
		 * we still need to remap the database.
		 */
		assert((old_total > filesize) || !is_mm);
		/* For BG, someone else could have truncated or extended - we have no idea */
		GDSFILEXT_CLNUP;
		return (SS_NORMAL);
	}
	if (trans_in_prog && SUSPICIOUS_EXTEND)
	{
		if (!was_crit)
		{
			GDSFILEXT_CLNUP;
			return (uint4)(EXTEND_SUSPECT);
		}
		/* If the free_blocks counter is not ok, then correct it. Do the check again. If it still fails, then it means
		 * we held crit through bm_getfree into gdsfilext and still didn't get it right.
		 */
		assertpro(!is_free_blks_ctr_ok() && !SUSPICIOUS_EXTEND);
	}
	if (JNL_ENABLED(cs_data))
	{
		if (!jgbl.dont_reset_gbl_jrec_time)
			SET_GBL_JREC_TIME;	/* needed before jnl_ensure_open as that can write jnl records */
		jpc = cs_addrs->jnl;
		jbp = jpc->jnl_buff;
		/* Before writing to the jnlfile, adjust jgbl.gbl_jrec_time if needed to maintain time order
		 * of jnl records. This needs to be done BEFORE the jnl_ensure_open as that could write
		 * journal records (if it decides to switch to a new journal file).
		 */
		ADJUST_GBL_JREC_TIME(jgbl, jbp);
		jnl_status = jnl_ensure_open(gv_cur_region, cs_addrs);
		if (jnl_status)
		{
			GDSFILEXT_CLNUP;
			send_msg_csa(CSA_ARG(cs_addrs) VARLSTCNT(6) jnl_status, 4, JNL_LEN_STR(cs_data),
				DB_LEN_STR(gv_cur_region));
			return (uint4)(NO_FREE_SPACE);	/* should have better return status */
		}
	}
	if (is_mm)
	{
		cs_addrs->nl->mm_extender_pid = process_id;
		status = wcs_wtstart(gv_cur_region, 0, NULL, NULL);
		cs_addrs->nl->mm_extender_pid = 0;
		assertpro(SS_NORMAL == status);
		old_base[0] = cs_addrs->db_addrs[0];
		old_base[1] = cs_addrs->db_addrs[1];
		cs_addrs->db_addrs[0] = NULL;	/* don't rely on it until the mmap below */
#		ifdef _AIX
		status = shmdt(old_base[0] - BLK_ZERO_OFF(cs_data->start_vbn));
#		else
		status = munmap((caddr_t)old_base[0], (size_t)(old_base[1] - old_base[0]));
#		endif
		if (0 != status)
		{
			save_errno = errno;
			GDSFILEXT_CLNUP;
			send_msg_csa(CSA_ARG(cs_addrs) VARLSTCNT(12) ERR_DBFILERR, 2, DB_LEN_STR(gv_cur_region),
				ERR_SYSCALL, 5, LEN_AND_STR(MEM_UNMAP_SYSCALL), CALLFROM, save_errno);
			return (uint4)(NO_FREE_SPACE);
		}
	} else
	{	/* Due to concurrency issues, it is possible some process had issued a disk read of the GDS block# corresponding
		 * to "old_total" right after a truncate wrote a GDS-block of zeros on disk (to signal the end of the db file).
		 * If so, the global buffer containing this block needs to be invalidated now as part of the extend. If not, it is
		 * possible the EOF block on disk is now going to be overwritten by a properly initialized bitmap block (as part
		 * of the gdsfilext below) while the global buffer continues to have an incorrect copy of that bitmap block, and
		 * this in turn would cause XXXX failures due to a bad bitmap block in shared memory. (GTM-7519)
		 */
		cr = db_csh_get((block_id)old_total);
		if ((NULL != cr) && ((cache_rec_ptr_t)CR_NOTVALID != cr))
		{
			assert((0 == cr->dirty) && (0 == cr->bt_index) && !cr->stopped);
			cr->cycle++;
			cr->blk = CR_BLKEMPTY;
		}
	}
	CHECK_TN(cs_addrs, cs_data, cs_data->trans_hist.curr_tn);	/* can issue rts_error TNTOOLARGE */
	new_total = old_total + new_blocks;
	new_eof = BLK_ZERO_OFF(cs_data->start_vbn) + ((off_t)new_total * cs_data->blk_size);
#	if !defined(__sun) && !defined(__hpux)
	if (!cs_data->defer_allocate)
	{
		new_size = new_eof + cs_data->blk_size;
		save_errno = posix_fallocate(udi->fd, 0, new_size);
		DEBUG_ONLY(first_save_errno = save_errno);
		if ((ENOSPC == save_errno) && IS_GTM_IMAGE)
			save_errno = extend_wait_for_fallocate(udi, new_size);
		if (0 != save_errno)
		{
			GDSFILEXT_CLNUP;
			assert(ENOSPC == save_errno);
			if (ENOSPC != save_errno)
				send_msg_csa(CSA_ARG(cs_addrs) VARLSTCNT(5) ERR_PREALLOCATEFAIL, 2,
					DB_LEN_STR(gv_cur_region), save_errno);
			return (uint4)(NO_FREE_SPACE);
		}
	}
#	endif
	save_errno = db_write_eof_block(udi, udi->fd, cs_data->blk_size, new_eof, &(TREF(dio_buff)));
	if ((ENOSPC == save_errno) && IS_GTM_IMAGE)
		save_errno = extend_wait_for_write(udi, cs_data->blk_size, new_eof);
	if (0 != save_errno)
	{
		GDSFILEXT_CLNUP;
		if (ENOSPC != save_errno)
			send_msg_csa(CSA_ARG(cs_addrs) VARLSTCNT(5) ERR_DBFILERR, 2, DB_LEN_STR(gv_cur_region), save_errno);
		return (uint4)(NO_FREE_SPACE);
	}
	if (WBTEST_ENABLED(WBTEST_FILE_EXTEND_INTERRUPT_1))
	{
		LONG_SLEEP(600);
		assert(FALSE);
	}
	/* Ensure the EOF and metadata get to disk BEFORE any bitmap writes. Otherwise, the file size could no longer reflect
	 * a proper extent and subsequent invocations of gdsfilext could corrupt the database.
	 */
	if (!IS_STATSDB_CSA(cs_addrs))
	{
		GTM_DB_FSYNC(cs_addrs, udi->fd, status);
		assert(0 == status);
		if (0 != status)
		{
			GDSFILEXT_CLNUP;
			send_msg_csa(CSA_ARG(cs_addrs) VARLSTCNT(8) ERR_DBFILERR, 5,
				RTS_ERROR_LITERAL("fsync1()"), CALLFROM, status);
			return (uint4)(NO_FREE_SPACE);
		}
	}
	if (WBTEST_ENABLED(WBTEST_FILE_EXTEND_INTERRUPT_2))
	{
		LONG_SLEEP(600);
		assert(FALSE);	/* Should be killed before that */
	}
	DEBUG_ONLY(prev_extend_blks_to_upgrd = cs_data->blks_to_upgrd;)
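/* Illustrative worked example (not part of the GT.M source): the bitmap arithmetic described in the
 * long comment in gdsfilext above. With bplmap blocks per local map, one block in each map is the
 * bitmap itself, so (bplmap - 1) data blocks fit per map. The numbers (1000 current blocks, 100
 * requested, bplmap = 512) and the macro definition are assumptions for the demo.
 */
#include <stdio.h>

#define DIVIDE_ROUND_UP(x, y)	(((x) + (y) - 1) / (y))	/* assumed definition */

int main(void)
{
	unsigned int	total_blks = 1000, blocks = 100, bplmap = 512;
	unsigned int	cur_maps, new_bit_maps, new_blocks;

	cur_maps = DIVIDE_ROUND_UP(total_blks, bplmap);	/* 2 bitmaps cover the current 1000 blocks */
	new_bit_maps = DIVIDE_ROUND_UP(total_blks - cur_maps + blocks, bplmap - 1) - cur_maps;
	new_blocks = blocks + new_bit_maps;
	printf("new_bit_maps=%u new_blocks=%u\n", new_bit_maps, new_blocks);	/* prints 1 and 101 */
	return 0;
}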
void t_write_map(
	srch_blk_status	*blkhist,	/* Search History of the block to be written. Currently the
					 * following members in this structure are used by "t_write_map":
					 *	"blk_num"		--> Block number being modified
					 *	"buffaddr"		--> Address of before image of the block
					 *	"cr"			--> cache-record that holds the block (BG only)
					 *	"cycle"			--> cycle when block was read by t_qread (BG only)
					 *	"cr->ondsk_blkver"	--> Actual block version on disk
					 */
	unsigned char	*upd_addr,	/* Address of the update array containing the list of blocks to be cleared in the bitmap */
	trans_num	tn,		/* Transaction Number when this block was read. Used for cdb_sc_blkmod validation */
	int4		reference_cnt)	/* Same meaning as cse->reference_cnt (see gdscc.h for comments) */
{
	cw_set_element		*cs;
	cache_rec_ptr_t		cr;
	jnl_buffer_ptr_t	jbbp;	/* jbbp is non-NULL only if before-image journaling */
	sgmnt_addrs		*csa;
	blk_hdr_ptr_t		old_block;
	unsigned int		bsiz;
	block_id		blkid;
	uint4			*updptr;

	csa = cs_addrs;
	if (!dollar_tlevel)
	{
		assert(cw_set_depth < CDB_CW_SET_SIZE);
		cs = &cw_set[cw_set_depth];
		cs->mode = gds_t_noop;	/* initialize it to a value that is not "gds_t_committed" before incrementing
					 * cw_set_depth as secshr_db_clnup relies on it */
		cw_set_depth++;
	} else
	{
		tp_cw_list(&cs);
		sgm_info_ptr->cw_set_depth++;
	}
	cs->mode = gds_t_writemap;
	cs->blk_checksum = 0;
	cs->blk = blkhist->blk_num;
	assert((cs->blk < csa->ti->total_blks) GTM_TRUNCATE_ONLY(|| (CDB_STAGNATE > t_tries)));
	cs->old_block = blkhist->buffaddr;
	/* t_write_map operates on BUSY blocks and hence cs->blk_prior_state's free_status is set to FALSE unconditionally */
	BIT_CLEAR_FREE(cs->blk_prior_state);
	BIT_CLEAR_RECYCLED(cs->blk_prior_state);
	old_block = (blk_hdr_ptr_t)cs->old_block;
	assert(NULL != old_block);
	jbbp = (JNL_ENABLED(csa) && csa->jnl_before_image) ? csa->jnl->jnl_buff : NULL;
	if ((NULL != jbbp) && (old_block->tn < jbbp->epoch_tn))
	{	/* Pre-compute CHECKSUM. Since we don't necessarily hold crit at this point, ensure we never try to
		 * access the buffer beyond the db blk_size.
		 */
		bsiz = MIN(old_block->bsiz, csa->hdr->blk_size);
		cs->blk_checksum = jnl_get_checksum((uint4 *)old_block, csa, bsiz);
	}
	cs->cycle = blkhist->cycle;
	cr = blkhist->cr;
	cs->cr = cr;
	/* The buffer in shared memory holding the GDS block contents currently does not have in its block header the
	 * on-disk format of that block. If it had, we could have easily copied that over to the cw-set-element.
	 * Until then, we have to use the cache-record's field "ondsk_blkver". But the cache-record is available only in BG.
	 * Thankfully, in MM, we do not allow GDSV4 type blocks, so we can safely assign GDSV6 (or GDSVCURR) to this field.
	 */
	assert((NULL != cr) || (dba_mm == csa->hdr->acc_meth));
	cs->ondsk_blkver = (NULL == cr) ? GDSVCURR : cr->ondsk_blkver;
	cs->ins_off = 0;
	cs->index = 0;
	assert(reference_cnt < csa->hdr->bplmap);	/* Cannot allocate more blocks than a bitmap holds */
	assert(reference_cnt > -csa->hdr->bplmap);	/* Cannot free more blocks than a bitmap holds */
	cs->reference_cnt = reference_cnt;
	cs->jnl_freeaddr = 0;	/* reset jnl_freeaddr that previous transaction might have filled in */
	cs->upd_addr = upd_addr;
	DEBUG_ONLY(
		if (reference_cnt < 0)
			reference_cnt = -reference_cnt;
		/* Check that all block numbers are relative to the bitmap block number (i.e. bit number) */
		updptr = (uint4 *)upd_addr;
		while (reference_cnt--)
		{
			blkid = *updptr;
			assert((int4)blkid < csa->hdr->bplmap);
			updptr++;
		}
	)
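/* Illustrative sketch (not part of the GT.M source): the DEBUG_ONLY loop above checks that every entry
 * in t_write_map's update array is a bit number local to the bitmap (less than bplmap), not an absolute
 * block number. A demo of deriving such entries; build_update_array and BPLMAP are hypothetical
 * stand-ins, not the real update-array API.
 */
#include <assert.h>
#include <stdio.h>

#define BPLMAP	512	/* assumed blocks-per-local-map */

static unsigned int build_update_array(unsigned int bitmap_blk, const unsigned int *abs_blks,
	unsigned int n, unsigned int *upd)
{
	unsigned int	i;

	for (i = 0; i < n; i++)
	{
		upd[i] = abs_blks[i] - bitmap_blk;	/* store the bit number relative to the bitmap */
		assert(upd[i] < BPLMAP);		/* the invariant t_write_map asserts in DEBUG */
	}
	return n;
}

int main(void)
{
	unsigned int	abs_blks[] = {1537, 1600, 2000};	/* blocks governed by bitmap block 1536 */
	unsigned int	upd[3];
	unsigned int	n = build_update_array(1536, abs_blks, 3, upd);

	printf("%u entries: %u %u %u\n", n, upd[0], upd[1], upd[2]);	/* bit numbers 1 64 464 */
	return 0;
}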
boolean_t mu_truncate(int4 truncate_percent) { sgmnt_addrs *csa; sgmnt_data_ptr_t csd; int num_local_maps; int lmap_num, lmap_blk_num; int bml_status, sigkill; int save_errno; int ftrunc_status; uint4 jnl_status; uint4 old_total, new_total; uint4 old_free, new_free; uint4 end_blocks; int4 blks_in_lmap, blk; gtm_uint64_t before_trunc_file_size; off_t trunc_file_size; off_t padding; uchar_ptr_t lmap_addr; boolean_t was_crit; uint4 found_busy_blk; srch_blk_status bmphist; srch_blk_status *blkhist; srch_hist alt_hist; trans_num curr_tn; blk_hdr_ptr_t lmap_blk_hdr; block_id *blkid_ptr; unix_db_info *udi; jnl_private_control *jpc; jnl_buffer_ptr_t jbp; char *err_msg; intrpt_state_t prev_intrpt_state; off_t offset; DCL_THREADGBL_ACCESS; SETUP_THREADGBL_ACCESS; csa = cs_addrs; csd = cs_data; if (dba_mm == csd->acc_meth) { gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(4) ERR_MUTRUNCNOTBG, 2, REG_LEN_STR(gv_cur_region)); return TRUE; } if ((GDSVCURR != csd->desired_db_format) || (csd->blks_to_upgrd != 0)) { gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(4) ERR_MUTRUNCNOV4, 2, REG_LEN_STR(gv_cur_region)); return TRUE; } if (csa->ti->free_blocks < (truncate_percent * csa->ti->total_blks / 100)) { gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(5) ERR_MUTRUNCNOSPACE, 3, REG_LEN_STR(gv_cur_region), truncate_percent); return TRUE; } /* already checked for parallel truncates on this region --- see mupip_reorg.c */ gv_target = NULL; assert(csa->nl->trunc_pid == process_id); assert(dba_mm != csd->acc_meth); old_total = csa->ti->total_blks; old_free = csa->ti->free_blocks; sigkill = 0; found_busy_blk = 0; memset(&alt_hist, 0, SIZEOF(alt_hist)); /* null-initialize history */ assert(csd->bplmap == BLKS_PER_LMAP); end_blocks = old_total % BLKS_PER_LMAP; /* blocks in the last lmap (first one we start scanning) */ if (0 == end_blocks) end_blocks = BLKS_PER_LMAP; num_local_maps = DIVIDE_ROUND_UP(old_total, BLKS_PER_LMAP); /* ======================================== PHASE 1 ======================================== */ for (lmap_num = num_local_maps - 1; (lmap_num > 0 && !found_busy_blk); lmap_num--) { if (mu_ctrly_occurred || mu_ctrlc_occurred) return TRUE; assert(csa->ti->total_blks >= old_total); /* otherwise, a concurrent truncate happened... */ if (csa->ti->total_blks != old_total) /* Extend (likely called by mupip extend) -- don't truncate */ { gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(5) ERR_MUTRUNCNOSPACE, 3, REG_LEN_STR(gv_cur_region), truncate_percent); return TRUE; } lmap_blk_num = lmap_num * BLKS_PER_LMAP; if (csa->nl->highest_lbm_with_busy_blk >= lmap_blk_num) { found_busy_blk = lmap_blk_num; break; } blks_in_lmap = (lmap_num == num_local_maps - 1) ? end_blocks : BLKS_PER_LMAP; /* Loop through non-bitmap blocks of this lmap, do recycled2free */ DBGEHND((stdout, "DBG:: lmap_num = [%lu], lmap_blk_num = [%lu], blks_in_lmap = [%lu]\n", lmap_num, lmap_blk_num, blks_in_lmap)); for (blk = 1; blk < blks_in_lmap && blk != -1 && !found_busy_blk;) { t_begin(ERR_MUTRUNCFAIL, UPDTRNS_DB_UPDATED_MASK); for (;;) /* retry loop for recycled to free transactions */ { curr_tn = csd->trans_hist.curr_tn; /* Read the nth local bitmap into memory */ bmphist.blk_num = lmap_blk_num; bmphist.buffaddr = t_qread(bmphist.blk_num, &bmphist.cycle, &bmphist.cr); lmap_blk_hdr = (blk_hdr_ptr_t)bmphist.buffaddr; if (!(bmphist.buffaddr) || (BM_SIZE(BLKS_PER_LMAP) != lmap_blk_hdr->bsiz)) { /* Could not read the block successfully. Retry. 
*/ t_retry((enum cdb_sc)rdfail_detail); continue; } lmap_addr = bmphist.buffaddr + SIZEOF(blk_hdr); /* starting from the hint (blk itself), find the first busy or recycled block */ blk = bml_find_busy_recycled(blk, lmap_addr, blks_in_lmap, &bml_status); assert(blk < BLKS_PER_LMAP); if (blk == -1 || blk >= blks_in_lmap) { /* done with this lmap, continue to next */ t_abort(gv_cur_region, csa); break; } else if (BLK_BUSY == bml_status || csa->nl->highest_lbm_with_busy_blk >= lmap_blk_num) { /* stop processing blocks... skip ahead to phase 2 */ found_busy_blk = lmap_blk_num; t_abort(gv_cur_region, csa); break; } else if (BLK_RECYCLED == bml_status) { /* Write PBLK records for recycled blocks only if before_image journaling is * enabled. t_end() takes care of checking if journaling is enabled and * writing PBLK record. We have to at least mark the recycled block as free. */ RESET_UPDATE_ARRAY; update_trans = UPDTRNS_DB_UPDATED_MASK; *((block_id *)update_array_ptr) = blk; update_array_ptr += SIZEOF(block_id); *(int *)update_array_ptr = 0; alt_hist.h[1].blk_num = 0; alt_hist.h[0].level = 0; alt_hist.h[0].cse = NULL; alt_hist.h[0].tn = curr_tn; alt_hist.h[0].blk_num = lmap_blk_num + blk; alt_hist.h[0].buffaddr = t_qread(alt_hist.h[0].blk_num, &alt_hist.h[0].cycle, &alt_hist.h[0].cr); if (!alt_hist.h[0].buffaddr) { t_retry((enum cdb_sc)rdfail_detail); continue; } if (!t_recycled2free(&alt_hist.h[0])) { t_retry(cdb_sc_lostbmlcr); continue; } t_write_map(&bmphist, (unsigned char *)update_array, curr_tn, 0); /* Set the opcode for INCTN record written by t_end() */ inctn_opcode = inctn_blkmarkfree; if ((trans_num)0 == t_end(&alt_hist, NULL, TN_NOT_SPECIFIED)) continue; /* block processed, scan from the next one */ blk++; break; } else { assert(t_tries < CDB_STAGNATE); t_retry(cdb_sc_badbitmap); continue; } } /* END recycled2free retry loop */ } /* END scanning blocks of this particular lmap */ /* Write PBLK for the bitmap block, in case it hasn't been written i.e. t_end() was never called above */ /* Do a transaction that just increments the bitmap block's tn so that t_end() can do its thing */ DBGEHND((stdout, "DBG:: bitmap block inctn -- lmap_blk_num = [%lu]\n", lmap_blk_num)); t_begin(ERR_MUTRUNCFAIL, UPDTRNS_DB_UPDATED_MASK); for (;;) { RESET_UPDATE_ARRAY; BLK_ADDR(blkid_ptr, SIZEOF(block_id), block_id); *blkid_ptr = 0; update_trans = UPDTRNS_DB_UPDATED_MASK; inctn_opcode = inctn_mu_reorg; /* inctn_mu_truncate */ curr_tn = csd->trans_hist.curr_tn; blkhist = &alt_hist.h[0]; blkhist->blk_num = lmap_blk_num; blkhist->tn = curr_tn; blkhist->cse = NULL; /* start afresh (do not use value from previous retry) */ /* Read the nth local bitmap into memory */ blkhist->buffaddr = t_qread(lmap_blk_num, (sm_int_ptr_t)&blkhist->cycle, &blkhist->cr); lmap_blk_hdr = (blk_hdr_ptr_t)blkhist->buffaddr; if (!(blkhist->buffaddr) || (BM_SIZE(BLKS_PER_LMAP) != lmap_blk_hdr->bsiz)) { /* Could not read the block successfully. Retry. 
*/ t_retry((enum cdb_sc)rdfail_detail); continue; } t_write_map(blkhist, (unsigned char *)blkid_ptr, curr_tn, 0); blkhist->blk_num = 0; /* create empty history for bitmap block */ if ((trans_num)0 == t_end(&alt_hist, NULL, TN_NOT_SPECIFIED)) continue; break; } } /* END scanning lmaps */ /* ======================================== PHASE 2 ======================================== */ assert(!csa->now_crit); for (;;) { /* wait for FREEZE, we don't want to truncate a frozen database */ grab_crit(gv_cur_region); if (FROZEN_CHILLED(cs_data)) DO_CHILLED_AUTORELEASE(csa, cs_data); if (!FROZEN(cs_data) && !IS_REPL_INST_FROZEN) break; rel_crit(gv_cur_region); while (FROZEN(cs_data) || IS_REPL_INST_FROZEN) { hiber_start(1000); if (FROZEN_CHILLED(cs_data) && CHILLED_AUTORELEASE(cs_data)) break; } } assert(csa->nl->trunc_pid == process_id); /* Flush pending updates to disk. If this is not done, old updates can be flushed AFTER ftruncate, extending the file. */ if (!wcs_flu(WCSFLU_FLUSH_HDR | WCSFLU_WRITE_EPOCH | WCSFLU_MSYNC_DB)) { assert(FALSE); gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(6) ERR_BUFFLUFAILED, 4, LEN_AND_LIT("MUPIP REORG TRUNCATE"), DB_LEN_STR(gv_cur_region)); rel_crit(gv_cur_region); return FALSE; } csa->nl->highest_lbm_with_busy_blk = MAX(found_busy_blk, csa->nl->highest_lbm_with_busy_blk); assert(IS_BITMAP_BLK(csa->nl->highest_lbm_with_busy_blk)); new_total = MIN(old_total, csa->nl->highest_lbm_with_busy_blk + BLKS_PER_LMAP); if (mu_ctrly_occurred || mu_ctrlc_occurred) { rel_crit(gv_cur_region); return TRUE; } else if (csa->ti->total_blks != old_total || new_total == old_total) { assert(csa->ti->total_blks >= old_total); /* Better have been an extend, not a truncate... */ gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(5) ERR_MUTRUNCNOSPACE, 3, REG_LEN_STR(gv_cur_region), truncate_percent); rel_crit(gv_cur_region); return TRUE; } else if (GDSVCURR != csd->desired_db_format || csd->blks_to_upgrd != 0 || !csd->fully_upgraded) { gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(4) ERR_MUTRUNCNOV4, 2, REG_LEN_STR(gv_cur_region)); rel_crit(gv_cur_region); return TRUE; } else if (SNAPSHOTS_IN_PROG(csa->nl)) { gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(4) ERR_MUTRUNCSSINPROG, 2, REG_LEN_STR(gv_cur_region)); rel_crit(gv_cur_region); return TRUE; } else if (BACKUP_NOT_IN_PROGRESS != cs_addrs->nl->nbb) { gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(4) ERR_MUTRUNCBACKINPROG, 2, REG_LEN_STR(gv_cur_region)); rel_crit(gv_cur_region); return TRUE; } DEFER_INTERRUPTS(INTRPT_IN_TRUNC, prev_intrpt_state); if (JNL_ENABLED(csa)) { /* Write JRT_TRUNC and INCTN records */ if (!jgbl.dont_reset_gbl_jrec_time) SET_GBL_JREC_TIME; /* needed before jnl_ensure_open as that can write jnl records */ jpc = csa->jnl; jbp = jpc->jnl_buff; /* Before writing to jnlfile, adjust jgbl.gbl_jrec_time if needed to maintain time order * of jnl records. This needs to be done BEFORE the jnl_ensure_open as that could write * journal records (if it decides to switch to a new journal file). 
*/ ADJUST_GBL_JREC_TIME(jgbl, jbp); jnl_status = jnl_ensure_open(gv_cur_region, csa); if (SS_NORMAL != jnl_status) send_msg_csa(CSA_ARG(csa) VARLSTCNT(6) jnl_status, 4, JNL_LEN_STR(csd), DB_LEN_STR(gv_cur_region)); else { if (0 == jpc->pini_addr) jnl_put_jrt_pini(csa); jnl_write_trunc_rec(csa, old_total, csa->ti->free_blocks, new_total); inctn_opcode = inctn_mu_reorg; jnl_write_inctn_rec(csa); jnl_status = jnl_flush(gv_cur_region); if (SS_NORMAL != jnl_status) { send_msg_csa(CSA_ARG(csa) VARLSTCNT(9) ERR_JNLFLUSH, 2, JNL_LEN_STR(csd), ERR_TEXT, 2, RTS_ERROR_TEXT("Error with journal flush during mu_truncate"), jnl_status); assert(NOJNL == jpc->channel); /* jnl file lost has been triggered */ } } } /* Good to go ahead and REALLY truncate (reduce total_blks, clear cache_array, FTRUNCATE) */ curr_tn = csa->ti->curr_tn; CHECK_TN(csa, csd, curr_tn); udi = FILE_INFO(gv_cur_region); /* Information used by recover_truncate to check if the file size and csa->ti->total_blks are INCONSISTENT */ trunc_file_size = BLK_ZERO_OFF(csd->start_vbn) + ((off_t)csd->blk_size * (new_total + 1)); csd->after_trunc_total_blks = new_total; csd->before_trunc_free_blocks = csa->ti->free_blocks; csd->before_trunc_total_blks = old_total; /* Flags interrupted truncate for recover_truncate */ /* file size and total blocks: INCONSISTENT */ csa->ti->total_blks = new_total; /* past the point of no return -- shared memory intact */ assert(csa->ti->free_blocks >= DELTA_FREE_BLOCKS(old_total, new_total)); csa->ti->free_blocks -= DELTA_FREE_BLOCKS(old_total, new_total); new_free = csa->ti->free_blocks; KILL_TRUNC_TEST(WBTEST_CRASH_TRUNCATE_1); /* 55 : Issue a kill -9 before 1st fsync */ fileheader_sync(gv_cur_region); DB_FSYNC(gv_cur_region, udi, csa, db_fsync_in_prog, save_errno); CHECK_DBSYNC(gv_cur_region, save_errno); /* past the point of no return -- shared memory deleted */ KILL_TRUNC_TEST(WBTEST_CRASH_TRUNCATE_2); /* 56 : Issue a kill -9 after 1st fsync */ clear_cache_array(csa, csd, gv_cur_region, new_total, old_total); offset = (off_t)BLK_ZERO_OFF(csd->start_vbn) + (off_t)new_total * csd->blk_size; save_errno = db_write_eof_block(udi, udi->fd, csd->blk_size, offset, &(TREF(dio_buff))); if (0 != save_errno) { err_msg = (char *)STRERROR(errno); rts_error_csa(CSA_ARG(csa) VARLSTCNT(6) ERR_MUTRUNCERROR, 4, REG_LEN_STR(gv_cur_region), LEN_AND_STR(err_msg)); return FALSE; } KILL_TRUNC_TEST(WBTEST_CRASH_TRUNCATE_3); /* 57 : Issue a kill -9 after reducing csa->ti->total_blks, before FTRUNCATE */ /* Execute an ftruncate() and truncate the DB file * ftruncate() is a SYSTEM CALL on almost all platforms (except SunOS) * It ignores kill -9 signal till its operation is completed. * So we can safely assume that the result of ftruncate() will be complete. */ FTRUNCATE(FILE_INFO(gv_cur_region)->fd, trunc_file_size, ftrunc_status); if (0 != ftrunc_status) { err_msg = (char *)STRERROR(errno); rts_error_csa(CSA_ARG(csa) VARLSTCNT(6) ERR_MUTRUNCERROR, 4, REG_LEN_STR(gv_cur_region), LEN_AND_STR(err_msg)); /* should go through recover_truncate now, which will again try to FTRUNCATE */ return FALSE; } /* file size and total blocks: CONSISTENT (shrunk) */ KILL_TRUNC_TEST(WBTEST_CRASH_TRUNCATE_4); /* 58 : Issue a kill -9 after FTRUNCATE, before 2nd fsync */ csa->nl->root_search_cycle++; /* Force concurrent processes to restart in t_end/tp_tend to make sure no one * tries to commit updates past the end of the file. Bitmap validations together * with highest_lbm_with_busy_blk should actually be sufficient, so this is * just to be safe. 
*/ csd->before_trunc_total_blks = 0; /* indicate CONSISTENT */ /* Increment TN */ assert(csa->ti->early_tn == csa->ti->curr_tn); csd->trans_hist.early_tn = csd->trans_hist.curr_tn + 1; INCREMENT_CURR_TN(csd); fileheader_sync(gv_cur_region); DB_FSYNC(gv_cur_region, udi, csa, db_fsync_in_prog, save_errno); KILL_TRUNC_TEST(WBTEST_CRASH_TRUNCATE_5); /* 59 : Issue a kill -9 after 2nd fsync */ CHECK_DBSYNC(gv_cur_region, save_errno); ENABLE_INTERRUPTS(INTRPT_IN_TRUNC, prev_intrpt_state); curr_tn = csa->ti->curr_tn; rel_crit(gv_cur_region); send_msg_csa(CSA_ARG(csa) VARLSTCNT(7) ERR_MUTRUNCSUCCESS, 5, DB_LEN_STR(gv_cur_region), old_total, new_total, &curr_tn); util_out_print("Truncated region: !AD. Reduced total blocks from [!UL] to [!UL]. Reduced free blocks from [!UL] to [!UL].", FLUSH, REG_LEN_STR(gv_cur_region), old_total, new_total, old_free, new_free); return TRUE; } /* END of mu_truncate() */
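/* Illustrative sketch, not part of the GT.M sources: the EOF-offset arithmetic behind trunc_file_size in mu_truncate()
 * above, redone with plain stand-in types so it can be compiled and inspected on its own. hyp_blk_zero_off() assumes, as
 * the real BLK_ZERO_OFF macro does on most ports, that the file header occupies the first (start_vbn - 1) disk blocks;
 * the 512-byte disk block size and all names prefixed "hyp_" are hypothetical. */
#include <stdint.h>
#include <stdio.h>
#include <sys/types.h>

#define HYP_DISK_BLOCK_SIZE 512	/* hypothetical stand-in for DISK_BLOCK_SIZE */

/* Byte offset of database block 0 within the file */
static off_t hyp_blk_zero_off(int start_vbn)
{
	return (off_t)(start_vbn - 1) * HYP_DISK_BLOCK_SIZE;
}

/* File size after truncating to new_total blocks: the header, new_total data blocks,
 * and one trailing EOF block, mirroring trunc_file_size above */
static off_t hyp_trunc_file_size(int start_vbn, int blk_size, uint32_t new_total)
{
	return hyp_blk_zero_off(start_vbn) + (off_t)blk_size * (new_total + 1);
}

int main(void)
{
	/* e.g. start_vbn 513, 4KiB database blocks, keep 1024 blocks */
	printf("%lld\n", (long long)hyp_trunc_file_size(513, 4096, 1024));
	return 0;
}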
/****************************************************************************************** Input Parameters: level: level of working block dest_blk_id: last destination used for swap Output Parameters: kill_set_ptr: Kill set to be freed *exclude_glist_ptr: List of globals not to be moved for a swap destination Input/Output Parameters: gv_target : as working block's history reorg_gv_target->hist : as destination block's history ******************************************************************************************/ enum cdb_sc mu_swap_blk(int level, block_id *pdest_blk_id, kill_set *kill_set_ptr, glist *exclude_glist_ptr) { unsigned char x_blk_lmap; unsigned short temp_ushort; int rec_size1, rec_size2; int wlevel, nslevel, dest_blk_level; int piece_len1, piece_len2, first_offset, second_offset, work_blk_size, work_parent_size, dest_blk_size, dest_parent_size; int dest_child_cycle; int blk_seg_cnt, blk_size; trans_num ctn; int key_len, key_len_dir; block_id dest_blk_id, work_blk_id, child1, child2; enum cdb_sc status; srch_hist *dest_hist_ptr, *dir_hist_ptr; cache_rec_ptr_t dest_child_cr; blk_segment *bs1, *bs_ptr; sm_uc_ptr_t saved_blk, work_blk_ptr, work_parent_ptr, dest_parent_ptr, dest_blk_ptr, bn_ptr, bmp_buff, tblk_ptr, rec_base, rPtr1; boolean_t gbl_target_was_set, blk_was_free, deleted; gv_namehead *save_targ; srch_blk_status bmlhist, destblkhist, *hist_ptr; unsigned char save_cw_set_depth; cw_set_element *tmpcse; jnl_buffer_ptr_t jbbp; /* jbbp is non-NULL only if before-image journaling */ unsigned int bsiz; DCL_THREADGBL_ACCESS; SETUP_THREADGBL_ACCESS; dest_blk_id = *pdest_blk_id; CHECK_AND_RESET_UPDATE_ARRAY; /* reset update_array_ptr to update_array */ if (NULL == TREF(gv_reorgkey)) GVKEY_INIT(TREF(gv_reorgkey), DBKEYSIZE(MAX_KEY_SZ)); dest_hist_ptr = &(reorg_gv_target->hist); dir_hist_ptr = reorg_gv_target->alt_hist; blk_size = cs_data->blk_size; work_parent_ptr = gv_target->hist.h[level+1].buffaddr; work_parent_size = ((blk_hdr_ptr_t)work_parent_ptr)->bsiz; work_blk_ptr = gv_target->hist.h[level].buffaddr; work_blk_size = ((blk_hdr_ptr_t)work_blk_ptr)->bsiz; work_blk_id = gv_target->hist.h[level].blk_num; if (blk_size < work_blk_size) { assert(t_tries < CDB_STAGNATE); return cdb_sc_blkmod; } cws_reorg_remove_index = 0; /*===== Infinite loop to find the destination block =====*/ for ( ; ; ) { blk_was_free = FALSE; INCR_BLK_NUM(dest_blk_id); /* A pre-order traversal should not cause a child block to go to its parent. * However, in case it happens (because the organization was already like that, or for any other reason), skip the swap. * If we decide to swap, the code below should be changed to take care of the special case. * Still, a grand-child can go to its grand-parent. This is rare and the following code can handle it. */ if (dest_blk_id == gv_target->hist.h[level+1].blk_num) continue; if (cs_data->trans_hist.total_blks <= dest_blk_id || dest_blk_id == work_blk_id) { *pdest_blk_id = dest_blk_id; return cdb_sc_oprnotneeded; } ctn = cs_addrs->ti->curr_tn; /* We need to save the block numbers that were NEWLY ADDED (since entering this function "mu_swap_blk") * through the CWS_INSERT macro (in db_csh_get/db_csh_getn which can be called by t_qread or gvcst_search below). * This is so that we can delete these blocks from the "cw_stagnate" hashtable in case we determine the need to * choose a different "dest_blk_id" in this for loop (i.e. come to the next iteration). 
If these blocks are not * deleted, then the hashtable will keep growing (a good example would be if the -EXCLUDE qualifier is specified and * a lot of prospective dest_blk_ids get skipped because they contain EXCLUDEd global variables) and very soon * the hashtable will contain more entries than there are global buffers and at that point db_csh_getn will not * be able to get a free global buffer for a new block (since it checks the "cw_stagnate" hashtable before reusing * a buffer in case of MUPIP REORG). To delete these previous iteration blocks, we use the "cws_reorg_remove_array" * variable. This array should have enough entries to accommodate the maximum number of blocks that can be t_qread * in one iteration down below. And that number is the sum of * + MAX_BT_DEPTH : for the t_qread while loop down the tree done below * + 2 * MAX_BT_DEPTH : for the two calls to gvcst_search done below * + 2 : 1 for the t_qread of dest_blk_id and 1 more for the t_qread of a * bitmap block done inside the call to get_lmap below * = 3 * MAX_BT_DEPTH + 2 * To be safe, we give a buffer of MAX_BT_DEPTH elements i.e. (4 * MAX_BT_DEPTH) + 2. * This is defined in the macro CWS_REMOVE_ARRAYSIZE in cws_insert.h */ /* reset whatever blocks the previous iteration of this for loop had filled in the cw_stagnate hashtable */ for ( ; cws_reorg_remove_index > 0; cws_reorg_remove_index--) { deleted = delete_hashtab_int4(&cw_stagnate, (uint4 *)&cws_reorg_remove_array[cws_reorg_remove_index]); assert(deleted); } /* read the corresponding bitmap block before attempting to read the destination block. * if the bitmap indicates the block is free, we will not read the destination block */ bmp_buff = get_lmap(dest_blk_id, &x_blk_lmap, (sm_int_ptr_t)&bmlhist.cycle, &bmlhist.cr); if (!bmp_buff || BLK_MAPINVALID == x_blk_lmap || ((blk_hdr_ptr_t)bmp_buff)->bsiz != BM_SIZE(BLKS_PER_LMAP) || ((blk_hdr_ptr_t)bmp_buff)->levl != LCL_MAP_LEVL) { assert(CDB_STAGNATE > t_tries); return cdb_sc_badbitmap; } if (BLK_FREE != x_blk_lmap) { /* x_blk_lmap is either BLK_BUSY or BLK_RECYCLED. In either case, we need to read the destination block * in case we later detect that the before-image needs to be written. */ if (!(dest_blk_ptr = t_qread(dest_blk_id, (sm_int_ptr_t)&destblkhist.cycle, &destblkhist.cr))) { assert(t_tries < CDB_STAGNATE); return (enum cdb_sc)rdfail_detail; } destblkhist.blk_num = dest_blk_id; destblkhist.buffaddr = dest_blk_ptr; destblkhist.level = dest_blk_level = ((blk_hdr_ptr_t)dest_blk_ptr)->levl; } if (BLK_BUSY != x_blk_lmap) { /* x_blk_lmap is either BLK_FREE or BLK_RECYCLED, both of which mean the block is marked not in use in the bitmap */ blk_was_free = TRUE; break; } /* dest_blk_id might contain a *-record only. * So follow the pointer to go to the data/index block, which has a non-* key to search. */ nslevel = dest_blk_level; if (MAX_BT_DEPTH <= nslevel) { assert(CDB_STAGNATE > t_tries); return cdb_sc_maxlvl; } rec_base = dest_blk_ptr + SIZEOF(blk_hdr); GET_RSIZ(rec_size1, rec_base); tblk_ptr = dest_blk_ptr; while ((BSTAR_REC_SIZE == rec_size1) && (0 != nslevel)) { GET_LONG(child1, (rec_base + SIZEOF(rec_hdr))); if (0 == child1 || child1 > cs_data->trans_hist.total_blks - 1) { assert(t_tries < CDB_STAGNATE); return cdb_sc_rdfail; } if (!(tblk_ptr = t_qread(child1, (sm_int_ptr_t)&dest_child_cycle, &dest_child_cr))) { assert(t_tries < CDB_STAGNATE); return (enum cdb_sc)rdfail_detail; } /* leaf of a killed GVT can have block header only. 
Skip those blocks */ if (SIZEOF(blk_hdr) >= ((blk_hdr_ptr_t)tblk_ptr)->bsiz) break; nslevel--; rec_base = tblk_ptr + SIZEOF(blk_hdr); GET_RSIZ(rec_size1, rec_base); } /* leaf of a killed GVT can have block header only. Skip those blocks */ if (SIZEOF(blk_hdr) >= ((blk_hdr_ptr_t)tblk_ptr)->bsiz) continue; /* get length of global variable name (do not read subscript) for dest_blk_id */ GET_GBLNAME_LEN(key_len_dir, rec_base + SIZEOF(rec_hdr)); /* key_len = length of 1st key value (including subscript) for dest_blk_id */ GET_KEY_LEN(key_len, rec_base + SIZEOF(rec_hdr)); if ((1 >= key_len_dir || MAX_MIDENT_LEN + 1 < key_len_dir) || (2 >= key_len || MAX_KEY_SZ < key_len)) { /* Earlier, we used to restart here unconditionally. But dest_blk_id can be a block * that was just killed and is still marked busy. Skip it if we are in the last retry. */ if (CDB_STAGNATE <= t_tries) continue; else return cdb_sc_blkmod; } memcpy(&((TREF(gv_reorgkey))->base[0]), rec_base + SIZEOF(rec_hdr), key_len_dir); (TREF(gv_reorgkey))->base[key_len_dir] = 0; (TREF(gv_reorgkey))->end = key_len_dir; if (exclude_glist_ptr->next) { /* exclude blocks for globals in the list of the EXCLUDE option */ if (in_exclude_list(&((TREF(gv_reorgkey))->base[0]), key_len_dir - 1, exclude_glist_ptr)) continue; } save_targ = gv_target; if (INVALID_GV_TARGET != reset_gv_target) gbl_target_was_set = TRUE; else { gbl_target_was_set = FALSE; reset_gv_target = save_targ; } gv_target = reorg_gv_target; gv_target->root = cs_addrs->dir_tree->root; gv_target->clue.end = 0; /* assign Directory tree path to find dest_blk_id in dir_hist_ptr */ status = gvcst_search(TREF(gv_reorgkey), dir_hist_ptr); if (cdb_sc_normal != status) { assert(t_tries < CDB_STAGNATE); RESET_GV_TARGET_LCL_AND_CLR_GBL(save_targ, DO_GVT_GVKEY_CHECK); return status; } if (dir_hist_ptr->h[0].curr_rec.match != (TREF(gv_reorgkey))->end + 1) { /* may be in a kill_set of another process */ RESET_GV_TARGET_LCL_AND_CLR_GBL(save_targ, DO_GVT_GVKEY_CHECK); continue; } for (wlevel = 0; wlevel <= dir_hist_ptr->depth && dir_hist_ptr->h[wlevel].blk_num != dest_blk_id; wlevel++); if (dir_hist_ptr->h[wlevel].blk_num == dest_blk_id) { /* do not swap a dir_tree block */ RESET_GV_TARGET_LCL_AND_CLR_GBL(save_targ, DO_GVT_GVKEY_CHECK); continue; } /* gv_reorgkey will now have the first key from dest_blk_id, * or from a descendant of dest_blk_id (in case it had a *-key only). */ memcpy(&((TREF(gv_reorgkey))->base[0]), rec_base + SIZEOF(rec_hdr), key_len); (TREF(gv_reorgkey))->end = key_len - 1; GET_KEY_LEN(key_len_dir, dir_hist_ptr->h[0].buffaddr + dir_hist_ptr->h[0].curr_rec.offset + SIZEOF(rec_hdr)); /* Get root of GVT for dest_blk_id */ GET_LONG(gv_target->root, dir_hist_ptr->h[0].buffaddr + dir_hist_ptr->h[0].curr_rec.offset + SIZEOF(rec_hdr) + key_len_dir); if ((0 == gv_target->root) || (gv_target->root > (cs_data->trans_hist.total_blks - 1))) { assert(t_tries < CDB_STAGNATE); RESET_GV_TARGET_LCL_AND_CLR_GBL(save_targ, DO_GVT_GVKEY_CHECK); return cdb_sc_blkmod; } /* Assign Global Variable Tree path to find dest_blk_id in dest_hist_ptr */ gv_target->clue.end = 0; status = gvcst_search(TREF(gv_reorgkey), dest_hist_ptr); RESET_GV_TARGET_LCL_AND_CLR_GBL(save_targ, DO_GVT_GVKEY_CHECK); if (dest_blk_level >= dest_hist_ptr->depth || /* do not swap in root level */ dest_hist_ptr->h[dest_blk_level].blk_num != dest_blk_id) /* must be in a kill set of another process. 
*/ continue; if ((cdb_sc_normal != status) || (dest_hist_ptr->h[nslevel].curr_rec.match != ((TREF(gv_reorgkey))->end + 1))) { assert(t_tries < CDB_STAGNATE); return (cdb_sc_normal != status ? status : cdb_sc_blkmod); } for (wlevel = nslevel; wlevel <= dest_blk_level; wlevel++) dest_hist_ptr->h[wlevel].tn = ctn; dest_blk_ptr = dest_hist_ptr->h[dest_blk_level].buffaddr; dest_blk_size = ((blk_hdr_ptr_t)dest_blk_ptr)->bsiz; dest_parent_ptr = dest_hist_ptr->h[dest_blk_level+1].buffaddr; dest_parent_size = ((blk_hdr_ptr_t)dest_parent_ptr)->bsiz; break; } /*===== End of infinite loop to find the destination block =====*/ /*----------------------------------------------------- Now modify blocks for swapping. Maximum of 4 blocks. -----------------------------------------------------*/ if (!blk_was_free) { /* 1: dest_blk_id into work_blk_id */ BLK_INIT(bs_ptr, bs1); BLK_SEG(bs_ptr, dest_blk_ptr + SIZEOF(blk_hdr), dest_blk_size - SIZEOF(blk_hdr)); if (!BLK_FINI(bs_ptr, bs1)) { assert(t_tries < CDB_STAGNATE); return cdb_sc_blkmod; } assert(gv_target->hist.h[level].blk_num == work_blk_id); assert(gv_target->hist.h[level].buffaddr == work_blk_ptr); t_write(&gv_target->hist.h[level], (unsigned char *)bs1, 0, 0, dest_blk_level, TRUE, TRUE, GDS_WRITE_KILLTN); } /* 2: work_blk_id into dest_blk_id */ if (!blk_was_free && work_blk_id == dest_hist_ptr->h[dest_blk_level+1].blk_num) { /* work_blk_id will be swapped with its child. * This is the only vertical swap. Here the working block goes to its child. * The working block cannot go to its parent because of the traversal order. */ if (dest_blk_level + 1 != level || dest_parent_size != work_blk_size) { assert(t_tries < CDB_STAGNATE); return cdb_sc_blkmod; } BLK_INIT(bs_ptr, bs1); BLK_ADDR(saved_blk, dest_parent_size, unsigned char); memcpy(saved_blk, dest_parent_ptr, dest_parent_size); first_offset = dest_hist_ptr->h[dest_blk_level+1].curr_rec.offset; GET_RSIZ(rec_size1, saved_blk + first_offset); if (work_blk_size < first_offset + rec_size1) { assert(t_tries < CDB_STAGNATE); return cdb_sc_blkmod; } piece_len1 = first_offset + rec_size1; BLK_SEG(bs_ptr, saved_blk + SIZEOF(blk_hdr), piece_len1 - SIZEOF(block_id) - SIZEOF(blk_hdr)); BLK_ADDR(bn_ptr, SIZEOF(block_id), unsigned char); PUT_LONG(bn_ptr, work_blk_id); /* since work_blk_id will now be the child of dest_blk_id */ BLK_SEG(bs_ptr, bn_ptr, SIZEOF(block_id)); BLK_SEG(bs_ptr, saved_blk + piece_len1, dest_parent_size - piece_len1); if (!BLK_FINI(bs_ptr, bs1)) { assert(t_tries < CDB_STAGNATE); return cdb_sc_blkmod; } assert(dest_blk_id == dest_hist_ptr->h[dest_blk_level].blk_num); assert(dest_blk_ptr == dest_hist_ptr->h[dest_blk_level].buffaddr); t_write(&dest_hist_ptr->h[dest_blk_level], (unsigned char *)bs1, 0, 0, level, TRUE, TRUE, GDS_WRITE_KILLTN); } else /* free block, or the case where the working block does not move vertically (swap with parent/child) */ { BLK_INIT(bs_ptr, bs1); BLK_ADDR(saved_blk, work_blk_size, unsigned char); memcpy(saved_blk, work_blk_ptr, work_blk_size); BLK_SEG(bs_ptr, saved_blk + SIZEOF(blk_hdr), work_blk_size - SIZEOF(blk_hdr)); if (!BLK_FINI(bs_ptr, bs1)) { assert(t_tries < CDB_STAGNATE); return cdb_sc_blkmod; } if (blk_was_free) { tmpcse = &cw_set[cw_set_depth]; t_create(dest_blk_id, (unsigned char *)bs1, 0, 0, level); /* Although we invoked t_create, we do not want t_end to allocate the block (i.e. change mode * from gds_t_create to gds_t_acquired). Instead we do that and a little more (that t_end does) all here. 
*/ assert(dest_blk_id == tmpcse->blk); tmpcse->mode = gds_t_acquired; /* If snapshots are in progress, we might want to read the before images of the FREE blocks also. * Since mu_swap_blk mimics a small part of t_end, it sets cse->mode to gds_t_acquired and hence * will not read the before images of the FREE blocks in t_end. To work around this, set * cse->was_free to TRUE so that in t_end, this condition can be used to read the before images of * the FREE blocks if needed. */ (BLK_FREE == x_blk_lmap) ? SET_FREE(tmpcse) : SET_NFREE(tmpcse); /* No need to write before-image in case the block is FREE. In case the database had never been fully * upgraded from V4 to V5 format (after the MUPIP UPGRADE), all RECYCLED blocks can basically be considered * FREE (i.e. no need to write before-images since backward journal recovery will never be expected * to take the database to a point BEFORE the mupip upgrade). */ if ((BLK_FREE == x_blk_lmap) || !cs_data->db_got_to_v5_once) tmpcse->old_block = NULL; else { /* Destination is a recycled block that needs a before image */ tmpcse->old_block = destblkhist.buffaddr; /* Record cr, cycle. This is used later in t_end to determine if checksums need to be recomputed */ tmpcse->cr = destblkhist.cr; tmpcse->cycle = destblkhist.cycle; jbbp = (JNL_ENABLED(cs_addrs) && cs_addrs->jnl_before_image) ? cs_addrs->jnl->jnl_buff : NULL; if ((NULL != jbbp) && (((blk_hdr_ptr_t)tmpcse->old_block)->tn < jbbp->epoch_tn)) { /* Compute CHECKSUM for writing PBLK record before getting crit. * It is possible that we are reading a block that is actually marked free in * the bitmap (due to concurrency issues at this point). Therefore we might be * actually reading uninitialized block headers and in turn a bad value of * "old_block->bsiz". Restart if we ever access a buffer whose size is greater * than the db block size. 
*/ bsiz = ((blk_hdr_ptr_t)(tmpcse->old_block))->bsiz; if (bsiz > blk_size) { assert(CDB_STAGNATE > t_tries); return cdb_sc_lostbmlcr; } JNL_GET_CHECKSUM_ACQUIRED_BLK(tmpcse, cs_data, cs_addrs, tmpcse->old_block, bsiz); } } assert(GDSVCURR == tmpcse->ondsk_blkver); /* should have been set by t_create above */ } else { hist_ptr = &dest_hist_ptr->h[dest_blk_level]; assert(dest_blk_id == hist_ptr->blk_num); assert(dest_blk_ptr == hist_ptr->buffaddr); t_write(hist_ptr, (unsigned char *)bs1, 0, 0, level, TRUE, TRUE, GDS_WRITE_KILLTN); } } if (!blk_was_free) { /* 3: Parent of destination block (may be parent of working block too) */ if (gv_target->hist.h[level+1].blk_num == dest_hist_ptr->h[dest_blk_level+1].blk_num) { /* dest parent == work_blk parent */ BLK_INIT(bs_ptr, bs1); /* Interchange pointer to dest_blk_id and work_blk_id */ if (level != dest_blk_level || gv_target->hist.h[level+1].curr_rec.offset == dest_hist_ptr->h[level+1].curr_rec.offset) { assert(t_tries < CDB_STAGNATE); return cdb_sc_blkmod; } if (gv_target->hist.h[level+1].curr_rec.offset < dest_hist_ptr->h[level+1].curr_rec.offset) { first_offset = gv_target->hist.h[level+1].curr_rec.offset; second_offset = dest_hist_ptr->h[level+1].curr_rec.offset; } else { first_offset = dest_hist_ptr->h[level+1].curr_rec.offset; second_offset = gv_target->hist.h[level+1].curr_rec.offset; } GET_RSIZ(rec_size1, dest_parent_ptr + first_offset); GET_RSIZ(rec_size2, dest_parent_ptr + second_offset); if (dest_parent_size < first_offset + rec_size1 || dest_parent_size < second_offset + rec_size2 || BSTAR_REC_SIZE >= rec_size1 || BSTAR_REC_SIZE > rec_size2) { assert(t_tries < CDB_STAGNATE); return cdb_sc_blkmod; } piece_len1 = first_offset + rec_size1 - SIZEOF(block_id); piece_len2 = second_offset + rec_size2 - SIZEOF(block_id); GET_LONG(child1, dest_parent_ptr + piece_len1); GET_LONG(child2, dest_parent_ptr + piece_len2); BLK_SEG(bs_ptr, dest_parent_ptr + SIZEOF(blk_hdr), piece_len1 - SIZEOF(blk_hdr)); BLK_ADDR(bn_ptr, SIZEOF(block_id), unsigned char); PUT_LONG(bn_ptr, child2); BLK_SEG(bs_ptr, bn_ptr, SIZEOF(block_id)); BLK_SEG(bs_ptr, dest_parent_ptr + first_offset + rec_size1, second_offset + rec_size2 - SIZEOF(block_id) - first_offset - rec_size1); BLK_ADDR(bn_ptr, SIZEOF(block_id), unsigned char); PUT_LONG(bn_ptr, child1); BLK_SEG(bs_ptr, bn_ptr, SIZEOF(block_id)); BLK_SEG(bs_ptr, dest_parent_ptr + second_offset + rec_size2, dest_parent_size - second_offset - rec_size2); if (!BLK_FINI(bs_ptr, bs1)) { assert(t_tries < CDB_STAGNATE); return cdb_sc_blkmod; } assert(level == dest_blk_level); assert(dest_parent_ptr == dest_hist_ptr->h[level+1].buffaddr); t_write(&dest_hist_ptr->h[level+1], (unsigned char *)bs1, 0, 0, level+1, FALSE, TRUE, GDS_WRITE_KILLTN); } else if (work_blk_id != dest_hist_ptr->h[dest_blk_level+1].blk_num) { /* Destination block moved into the position of the working block. 
* So the destination block's parent's pointer should be changed to work_blk_id. */ BLK_INIT(bs_ptr, bs1); GET_RSIZ(rec_size1, dest_parent_ptr + dest_hist_ptr->h[dest_blk_level+1].curr_rec.offset); if (dest_parent_size < rec_size1 + dest_hist_ptr->h[dest_blk_level+1].curr_rec.offset || BSTAR_REC_SIZE > rec_size1) { assert(t_tries < CDB_STAGNATE); return cdb_sc_blkmod; } BLK_SEG(bs_ptr, dest_parent_ptr + SIZEOF(blk_hdr), dest_hist_ptr->h[dest_blk_level+1].curr_rec.offset + rec_size1 - SIZEOF(blk_hdr) - SIZEOF(block_id)); BLK_ADDR(bn_ptr, SIZEOF(block_id), unsigned char); PUT_LONG(bn_ptr, work_blk_id); BLK_SEG(bs_ptr, bn_ptr, SIZEOF(block_id)); BLK_SEG(bs_ptr, dest_parent_ptr + dest_hist_ptr->h[dest_blk_level+1].curr_rec.offset + rec_size1, dest_parent_size - dest_hist_ptr->h[dest_blk_level+1].curr_rec.offset - rec_size1); if (!BLK_FINI(bs_ptr, bs1)) { assert(t_tries < CDB_STAGNATE); return cdb_sc_blkmod; } assert(dest_parent_ptr == dest_hist_ptr->h[dest_blk_level+1].buffaddr); t_write(&dest_hist_ptr->h[dest_blk_level+1], (unsigned char *)bs1, 0, 0, dest_blk_level+1, FALSE, TRUE, GDS_WRITE_KILLTN); } } /* 4: Parent of working block, if different from the destination's parent, or if the destination was a free block */ if (blk_was_free || gv_target->hist.h[level+1].blk_num != dest_hist_ptr->h[dest_blk_level+1].blk_num) { /* The parent block of the working block should correctly point to the working block. The working block went to dest_blk_id. */ GET_RSIZ(rec_size1, (work_parent_ptr + gv_target->hist.h[level+1].curr_rec.offset)); if (work_parent_size < rec_size1 + gv_target->hist.h[level+1].curr_rec.offset || BSTAR_REC_SIZE > rec_size1) { assert(t_tries < CDB_STAGNATE); return cdb_sc_blkmod; } BLK_INIT(bs_ptr, bs1); BLK_SEG(bs_ptr, work_parent_ptr + SIZEOF(blk_hdr), gv_target->hist.h[level+1].curr_rec.offset + rec_size1 - SIZEOF(blk_hdr) - SIZEOF(block_id)); BLK_ADDR(bn_ptr, SIZEOF(block_id), unsigned char); PUT_LONG(bn_ptr, dest_blk_id); BLK_SEG(bs_ptr, bn_ptr, SIZEOF(block_id)); BLK_SEG(bs_ptr, work_parent_ptr + gv_target->hist.h[level+1].curr_rec.offset + rec_size1, work_parent_size - gv_target->hist.h[level+1].curr_rec.offset - rec_size1); if (!BLK_FINI(bs_ptr, bs1)) { assert(t_tries < CDB_STAGNATE); return cdb_sc_blkmod; } assert(gv_target->hist.h[level+1].buffaddr == work_parent_ptr); t_write(&gv_target->hist.h[level+1], (unsigned char *)bs1, 0, 0, level+1, FALSE, TRUE, GDS_WRITE_KILLTN); } /* else already taken care of, when dest_blk_id moved */ if (blk_was_free) { /* A free/recycled block will become a busy block. * So the local bitmap must be updated. * The local bitmap block will be added to the update array list for the concurrency check, and * a cw_set element will be created to mark the free/recycled block busy in the bitmap. * kill_set_ptr will save the working block, which will become free. */ child1 = ROUND_DOWN2(dest_blk_id, BLKS_PER_LMAP); /* bit map block */ bmlhist.buffaddr = bmp_buff; bmlhist.blk_num = child1; child1 = dest_blk_id - child1; assert(child1); PUT_LONG(update_array_ptr, child1); /* Need to put bit maps on the end of the cw set for concurrency checking. * We want to simulate t_write_map, except we want to update "cw_map_depth" instead of "cw_set_depth". * Hence the save and restore logic (for "cw_set_depth") below. 
*/ save_cw_set_depth = cw_set_depth; assert(!cw_map_depth); t_write_map(&bmlhist, (uchar_ptr_t)update_array_ptr, ctn, 1); /* will increment cw_set_depth */ cw_map_depth = cw_set_depth; /* set cw_map_depth to the latest cw_set_depth */ cw_set_depth = save_cw_set_depth; /* restore cw_set_depth */ /* t_write_map simulation end */ update_array_ptr += SIZEOF(block_id); child1 = 0; PUT_LONG(update_array_ptr, child1); update_array_ptr += SIZEOF(block_id); assert(1 == cw_set[cw_map_depth - 1].reference_cnt); /* 1 free block is now becoming BLK_USED in the bitmap */ /* working block will be removed */ kill_set_ptr->blk[kill_set_ptr->used].flag = 0; kill_set_ptr->blk[kill_set_ptr->used].level = 0; kill_set_ptr->blk[kill_set_ptr->used++].block = work_blk_id; } *pdest_blk_id = dest_blk_id; return cdb_sc_normal; }
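/* Illustrative sketch, not part of the GT.M sources: the local-bitmap arithmetic mu_swap_blk() performs above when a
 * formerly free destination block must be marked busy. It assumes BLKS_PER_LMAP is a power of two (512 here, a typical
 * value), which is what makes the ROUND_DOWN2-style mask trick valid; the "hyp_"/"HYP_" names are hypothetical. */
#include <assert.h>
#include <stdio.h>

#define HYP_BLKS_PER_LMAP 512	/* hypothetical stand-in; must be a power of two */

/* ROUND_DOWN2 analogue: clear the low bits to find the covering local bitmap block */
static int hyp_lmap_block(int blk)
{
	return blk & ~(HYP_BLKS_PER_LMAP - 1);
}

int main(void)
{
	int dest_blk_id = 1234;
	int bml_blk = hyp_lmap_block(dest_blk_id);	/* bitmap block covering dest_blk_id */
	int offset = dest_blk_id - bml_blk;		/* index of dest_blk_id within that map */

	assert(0 != offset);	/* block 0 of every local map is the bitmap itself */
	printf("bitmap block %d, offset within map %d\n", bml_blk, offset);
	return 0;
}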
bt_rec_ptr_t bt_put(gd_region *reg, int4 block) { bt_rec_ptr_t bt, q0, q1, hdr; sgmnt_addrs *csa; sgmnt_data_ptr_t csd; cache_rec_ptr_t cr; th_rec_ptr_t th; trans_num lcl_tn; uint4 lcnt; csa = (sgmnt_addrs *)&FILE_INFO(reg)->s_addrs; csd = csa->hdr; assert(csa->now_crit || csd->clustered); assert(dba_mm != csa->hdr->acc_meth); lcl_tn = csa->ti->curr_tn; hdr = csa->bt_header + (block % csd->bt_buckets); assert(BT_QUEHEAD == hdr->blk); for (lcnt = 0, bt = (bt_rec_ptr_t)((sm_uc_ptr_t)hdr + hdr->blkque.fl); ; bt = (bt_rec_ptr_t)((sm_uc_ptr_t)bt + bt->blkque.fl), lcnt++) { if (BT_QUEHEAD == bt->blk) { /* there is no matching bt */ assert(bt == hdr); bt = (bt_rec_ptr_t)((sm_uc_ptr_t)(csa->th_base) + csa->th_base->tnque.fl - SIZEOF(th->tnque)); if (CR_NOTVALID != bt->cache_index) { /* the oldest bt is still valid */ assert(!in_wcs_recover); cr = (cache_rec_ptr_t)GDS_ANY_REL2ABS(csa, bt->cache_index); if (cr->dirty) { /* get it written so it can be reused */ BG_TRACE_PRO_ANY(csa, bt_put_flush_dirty); if (FALSE == wcs_get_space(reg, 0, cr)) { assert(csa->nl->wc_blocked); /* only reason we currently know * why wcs_get_space could fail */ assert(gtm_white_box_test_case_enabled); BG_TRACE_PRO_ANY(csa, wcb_bt_put); send_msg(VARLSTCNT(8) ERR_WCBLOCKED, 6, LEN_AND_LIT("wcb_bt_put"), process_id, &lcl_tn, DB_LEN_STR(reg)); return NULL; } } bt->cache_index = CR_NOTVALID; cr->bt_index = 0; } q0 = (bt_rec_ptr_t)((sm_uc_ptr_t)bt + bt->blkque.fl); q1 = (bt_rec_ptr_t)remqt((que_ent_ptr_t)q0); if (EMPTY_QUEUE == (sm_long_t)q1) rts_error(VARLSTCNT(3) ERR_BTFAIL, 1, 1); bt->blk = block; bt->killtn = lcl_tn; insqt((que_ent_ptr_t)bt, (que_ent_ptr_t)hdr); th = (th_rec_ptr_t)remqh((que_ent_ptr_t)csa->th_base); if (EMPTY_QUEUE == (sm_long_t)th) GTMASSERT; break; } if (bt->blk == block) { /* bt_put should never be called twice for the same block with the same lcl_tn. This is because * t_end/tp_tend update every block only once as part of each update transaction. Assert this. * The two exceptions are * a) Forward journal recovery which simulates a 2-phase M-kill where the same block * could get updated in both phases (example bitmap block gets updated for blocks created * within the TP transaction as well as for blocks that are freed up in the 2nd phase of * the M-kill) with the same transaction number. This is because although GT.M would have * updated the same block with different transaction numbers in the two phases, forward * recovery will update it with the same tn and instead increment the db tn on seeing the * following INCTN journal record(s). * b) Cache recovery (wcs_recover). It could call bt_put more than once for the same block * and potentially with the same tn. This is because the state of the queues is questionable * and there could be more than one cache record for a given block number. */ assert(in_wcs_recover || (bt->tn < lcl_tn) || (jgbl.forw_phase_recovery && !JNL_ENABLED(csa))); q0 = (bt_rec_ptr_t)((sm_uc_ptr_t)bt + bt->tnque.fl); th = (th_rec_ptr_t)remqt((que_ent_ptr_t)((sm_uc_ptr_t)q0 + SIZEOF(th->tnque))); if (EMPTY_QUEUE == (sm_long_t)th) GTMASSERT; break; } if (0 == bt->blkque.fl) rts_error(VARLSTCNT(3) ERR_BTFAIL, 1, 2); if (lcnt >= csd->n_bts) rts_error(VARLSTCNT(3) ERR_BTFAIL, 1, 3); } insqt((que_ent_ptr_t)th, (que_ent_ptr_t)csa->th_base); bt->tn = lcl_tn; return bt; }
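/* Illustrative sketch, not part of the GT.M sources: the lookup pattern bt_put() uses above -- hash the block number to
 * a bucket whose queue head doubles as a sentinel (blk == BT_QUEHEAD), then follow the forward links until a match or
 * the sentinel comes back around. Plain pointers stand in for the shared-memory relative offsets of the real blkque
 * fields, and the bucket count, sentinel value, and all "hyp_" names are hypothetical. */
#include <stdio.h>

#define HYP_BT_BUCKETS	7	/* hypothetical bucket count */
#define HYP_QUEHEAD	(-2)	/* hypothetical BT_QUEHEAD-style sentinel value */

typedef struct hyp_bt
{
	struct hyp_bt	*fl;	/* forward link of the circular queue */
	int		blk;	/* block number, or HYP_QUEHEAD for the bucket head */
} hyp_bt;

static hyp_bt	hyp_hdr[HYP_BT_BUCKETS];	/* bucket heads, each its own circular queue */

static void hyp_init(void)
{
	int	i;

	for (i = 0; i < HYP_BT_BUCKETS; i++)
	{	/* empty circular queue: the head points at itself */
		hyp_hdr[i].fl = &hyp_hdr[i];
		hyp_hdr[i].blk = HYP_QUEHEAD;
	}
}

/* Insert at the tail (just before the head), like insqt() on the blkque */
static void hyp_insqt(hyp_bt *elem, int block)
{
	hyp_bt	*hdr = &hyp_hdr[block % HYP_BT_BUCKETS], *p;

	for (p = hdr; p->fl != hdr; p = p->fl)
		;
	elem->blk = block;
	elem->fl = hdr;
	p->fl = elem;
}

/* Walk the bucket's queue; NULL means no entry for this block */
static hyp_bt *hyp_find(int block)
{
	hyp_bt	*hdr = &hyp_hdr[block % HYP_BT_BUCKETS], *p;

	for (p = hdr->fl; HYP_QUEHEAD != p->blk; p = p->fl)
		if (p->blk == block)
			return p;
	return NULL;
}

int main(void)
{
	hyp_bt	e1, e2;

	hyp_init();
	hyp_insqt(&e1, 10);
	hyp_insqt(&e2, 17);	/* 17 % 7 == 10 % 7: collides into the same bucket as e1 */
	printf("found 17: %d, found 24: %d\n", NULL != hyp_find(17), NULL != hyp_find(24));
	return 0;
}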
bool gtcmtr_bufflush(void) { cm_region_list *reg_ref; mval v; short n; unsigned short num_trans, data_len; unsigned char buff[MAX_ZWR_KEY_SZ], *end; unsigned char *ptr, regnum, len, cc, prv; static readonly gds_file_id file; error_def(ERR_KEY2BIG); error_def(ERR_REC2BIG); error_def(ERR_GVIS); ptr = curr_entry->clb_ptr->mbf; assert(*ptr == CMMS_B_BUFFLUSH); ptr++; v.mvtype = MV_STR; GET_USHORT(num_trans, ptr); ptr += sizeof (short); for (; num_trans-- > 0;) { regnum = *ptr++; reg_ref = gtcm_find_region(curr_entry, regnum); len = *ptr++; cc = *ptr++; prv = *ptr++; assert (len + cc - 1 < gv_currkey->top); memcpy(&gv_currkey->base[cc], ptr, len); ptr += len; gv_currkey->end = len + cc - 1; gv_currkey->prev = prv; assert(prv < gv_currkey->end); if ((n = gv_currkey->end + 1) > gv_cur_region->max_key_size) { if ((end = format_targ_key(&buff[0], MAX_ZWR_KEY_SZ, gv_currkey, TRUE)) == 0) end = &buff[MAX_ZWR_KEY_SZ - 1]; rts_error(VARLSTCNT(11) ERR_KEY2BIG, 4, n, (int4)gv_cur_region->max_key_size, REG_LEN_STR(gv_cur_region), 0, ERR_GVIS, 2, end - buff, buff); } gtcm_bind_name(reg_ref->reghead, TRUE); if (JNL_ENABLED(cs_addrs->hdr)) { cs_addrs->jnl->pini_addr = reg_ref->pini_addr; originator_prc_vec = curr_entry->pvec; } GET_USHORT(data_len, ptr); ptr += sizeof(short); v.str.len = data_len; v.str.addr = (char *)ptr; if (n + v.str.len + sizeof(rec_hdr) > gv_cur_region->max_rec_size) { if ((end = format_targ_key(&buff[0], MAX_ZWR_KEY_SZ, gv_currkey, TRUE)) == 0) end = &buff[MAX_ZWR_KEY_SZ - 1]; rts_error(VARLSTCNT(11) ERR_REC2BIG, 4, n + v.str.len + sizeof(rec_hdr), (int4)gv_cur_region->max_rec_size, REG_LEN_STR(gv_cur_region), 0, ERR_GVIS, 2, end - buff, buff); } gvcst_put(&v); if (JNL_ENABLED(cs_addrs->hdr)) reg_ref->pini_addr = cs_addrs->jnl->pini_addr; /* In case journal switch occurred */ ptr += data_len; } ptr = curr_entry->clb_ptr->mbf; *ptr++ = CMMS_C_BUFFLUSH; curr_entry->clb_ptr->cbl = S_HDRSIZE; return TRUE; }
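/* Illustrative sketch, not part of the GT.M sources: the per-transaction wire layout that gtcmtr_bufflush() decodes
 * above -- a one-byte region number, one-byte key length, compression count and previous-key offset, the key bytes
 * themselves, then an unaligned unsigned short data length followed by that many data bytes. The struct and all "hyp_"
 * names are hypothetical, and the sample message assumes a little-endian host for the length field. */
#include <stdio.h>
#include <string.h>

typedef struct
{
	unsigned char		regnum, len, cc, prv;
	const unsigned char	*key;
	unsigned short		data_len;
	const unsigned char	*data;
} hyp_trans;

/* Decode one transaction; returns a pointer just past it, mirroring how ptr advances above */
static const unsigned char *hyp_parse_trans(const unsigned char *ptr, hyp_trans *t)
{
	t->regnum = *ptr++;
	t->len = *ptr++;
	t->cc = *ptr++;
	t->prv = *ptr++;
	t->key = ptr;
	ptr += t->len;
	memcpy(&t->data_len, ptr, sizeof(unsigned short));	/* GET_USHORT analogue: may be unaligned */
	ptr += sizeof(unsigned short);
	t->data = ptr;
	return ptr + t->data_len;
}

int main(void)
{
	/* region 1; 2 key bytes "ab" appended at compression count 0, prev offset 0; 3 data bytes */
	static const unsigned char	msg[] = {1, 2, 0, 0, 'a', 'b', 3, 0, 'x', 'y', 'z'};
	hyp_trans			t;
	const unsigned char		*end = hyp_parse_trans(msg, &t);

	printf("key len %d, data len %d, consumed %d bytes\n", t.len, t.data_len, (int)(end - msg));
	return 0;
}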
int4 gds_rundown(void) { boolean_t canceled_dbsync_timer, canceled_flush_timer, ok_to_write_pfin; boolean_t have_standalone_access, ipc_deleted, err_caught; boolean_t is_cur_process_ss_initiator, remove_shm, vermismatch, we_are_last_user, we_are_last_writer, is_mm; boolean_t unsafe_last_writer; char time_str[CTIME_BEFORE_NL + 2]; /* for GET_CUR_TIME macro */ gd_region *reg; int save_errno, status, rc; int4 semval, ftok_semval, sopcnt, ftok_sopcnt; short crash_count; sm_long_t munmap_len; sgmnt_addrs *csa; sgmnt_data_ptr_t csd; node_local_ptr_t cnl; struct shmid_ds shm_buf; struct sembuf sop[2], ftok_sop[2]; uint4 jnl_status; unix_db_info *udi; jnl_private_control *jpc; jnl_buffer_ptr_t jbp; shm_snapshot_t *ss_shm_ptr; uint4 ss_pid, onln_rlbk_pid, holder_pid; boolean_t was_crit; boolean_t safe_mode; /* Do not flush or take down shared memory. */ boolean_t bypassed_ftok = FALSE, bypassed_access = FALSE, may_bypass_ftok, inst_is_frozen, ftok_counter_halted, access_counter_halted; int secshrstat; intrpt_state_t prev_intrpt_state; DCL_THREADGBL_ACCESS; SETUP_THREADGBL_ACCESS; jnl_status = 0; reg = gv_cur_region; /* Local copy */ /* early out for cluster regions * to avoid tripping the assert below. * Note: * This early out is consistent with VMS. It has been * noted that all of the gtcm assignments * to gv_cur_region should use the TP_CHANGE_REG * macro. This would also avoid the assert problem * and should be done eventually. */ if (dba_cm == reg->dyn.addr->acc_meth) return EXIT_NRM; udi = FILE_INFO(reg); csa = &udi->s_addrs; csd = csa->hdr; assert(csa == cs_addrs && csd == cs_data); if ((reg->open) && (dba_usr == csd->acc_meth)) { change_reg(); gvusr_rundown(); return EXIT_NRM; } /* If the process has standalone access, it has udi->grabbed_access_sem set to TRUE at this point. Note that down in a local * variable as the udi->grabbed_access_sem is set to TRUE even for non-standalone access below and hence we can't rely on * that later to determine if the process had standalone access or not when it entered this function. We need to guarantee * that no one else accesses the database file header when the semid/shmid fields are reset. We already have created the ftok * semaphore in db_init or mu_rndwn_file and did not remove it. So just lock it. We do it in blocking mode. */ have_standalone_access = udi->grabbed_access_sem; /* process holds standalone access */ DEFER_INTERRUPTS(INTRPT_IN_GDS_RUNDOWN, prev_intrpt_state); ESTABLISH_NORET(gds_rundown_ch, err_caught); if (err_caught) { REVERT; WITH_CH(gds_rundown_ch, gds_rundown_err_cleanup(have_standalone_access), 0); ENABLE_INTERRUPTS(INTRPT_IN_GDS_RUNDOWN, prev_intrpt_state); DEBUG_ONLY(ok_to_UNWIND_in_exit_handling = FALSE); return EXIT_ERR; } assert(reg->open); /* if we failed to open, dbinit_ch should have taken care of proper clean up */ assert(!reg->opening); /* see comment above */ assert((dba_bg == csd->acc_meth) || (dba_mm == csd->acc_meth)); is_mm = (dba_bg != csd->acc_meth); assert(!csa->hold_onto_crit || (csa->now_crit && jgbl.onlnrlbk)); /* If we are online rollback, we should already be holding crit and should release it only at the end of this module. This * is usually done by noting down csa->now_crit in a local variable (was_crit) and using it whenever we are about to * grab_crit. But, there are instances (like mupip_set_journal.c) where we grab_crit but invoke gds_rundown without any * preceding rel_crit. Such code relies on the fact that gds_rundown does rel_crit unconditionally (to get locks to a known * state). 
So, augment csa->now_crit with jgbl.onlnrlbk to track if we can rel_crit unconditionally or not in gds_rundown. */ was_crit = (csa->now_crit && jgbl.onlnrlbk); /* Cancel any pending flush timer for this region by this task */ canceled_flush_timer = FALSE; canceled_dbsync_timer = FALSE; CANCEL_DB_TIMERS(reg, csa, canceled_flush_timer, canceled_dbsync_timer); we_are_last_user = FALSE; inst_is_frozen = IS_REPL_INST_FROZEN && REPL_ALLOWED(csa->hdr); if (!csa->persistent_freeze) region_freeze(reg, FALSE, FALSE, FALSE); if (!was_crit) { rel_crit(reg); /* get locks to known state */ mutex_cleanup(reg); } /* The only process that can invoke gds_rundown while holding the access control semaphore is RECOVER/ROLLBACK. All the others * (like MUPIP SET -FILE/MUPIP EXTEND) would have invoked db_ipcs_reset() before invoking gds_rundown (from * mupip_exit_handler). The only exception is when these processes encounter a terminate signal and they reach * mupip_exit_handler while holding the access control semaphore. Assert accordingly. */ assert(!have_standalone_access || mupip_jnl_recover || process_exiting); /* If we have standalone access, then ensure that a concurrent online rollback cannot be running at the same time as it * needs the access control lock as well. The only exception is when we are online rollback and currently running down. */ cnl = csa->nl; onln_rlbk_pid = cnl->onln_rlbk_pid; assert(!have_standalone_access || mupip_jnl_recover || !onln_rlbk_pid || !is_proc_alive(onln_rlbk_pid, 0)); if (!have_standalone_access) { if (-1 == (ftok_semval = semctl(udi->ftok_semid, DB_COUNTER_SEM, GETVAL))) /* Check # of procs counted on FTOK */ { save_errno = errno; assert(FALSE); rts_error_csa(CSA_ARG(csa) VARLSTCNT(12) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg), ERR_SYSCALL, 5, RTS_ERROR_TEXT("gds_rundown SEMCTL failed to get ftok_semval"), CALLFROM, errno); } may_bypass_ftok = CAN_BYPASS(ftok_semval, csd, inst_is_frozen); /* Do we need a blocking wait? */ /* We need to guarantee that no one else accesses the database file header when the semid/shmid fields are reset. * We already have created the ftok semaphore in db_init or mu_rndwn_file and did not remove it. So just lock it. */ if (!ftok_sem_lock(reg, may_bypass_ftok)) { if (may_bypass_ftok) { /* We did a non-blocking wait. It's ok to proceed without locking */ bypassed_ftok = TRUE; holder_pid = semctl(udi->ftok_semid, DB_CONTROL_SEM, GETPID); if ((uint4)-1 == holder_pid) rts_error_csa(CSA_ARG(csa) VARLSTCNT(12) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg), ERR_SYSCALL, 5, RTS_ERROR_TEXT("gds_rundown SEMCTL failed to get holder_pid"), CALLFROM, errno); if (!IS_GTM_IMAGE) /* MUMPS processes should not flood syslog with bypass messages. */ { send_msg_csa(CSA_ARG(csa) VARLSTCNT(12) ERR_RESRCINTRLCKBYPAS, 10, LEN_AND_STR(gtmImageNames[image_type].imageName), process_id, LEN_AND_LIT("FTOK"), REG_LEN_STR(reg), DB_LEN_STR(reg), holder_pid); send_msg_csa(CSA_ARG(NULL) VARLSTCNT(4) ERR_TEXT, 2, LEN_AND_LIT("FTOK bypassed at rundown")); } } else { /* We did a blocking wait but something bad happened. 
*/ FTOK_TRACE(csa, csa->ti->curr_tn, ftok_ops_lock, process_id); rts_error_csa(CSA_ARG(csa) VARLSTCNT(4) ERR_DBFILERR, 2, DB_LEN_STR(reg)); } } sop[0].sem_num = DB_CONTROL_SEM; sop[0].sem_op = 0; /* Wait for 0 */ sop[1].sem_num = DB_CONTROL_SEM; sop[1].sem_op = 1; /* Lock */ sopcnt = 2; sop[0].sem_flg = sop[1].sem_flg = SEM_UNDO | IPC_NOWAIT; /* Don't wait the first time thru */ SEMOP(udi->semid, sop, sopcnt, status, NO_WAIT); if (0 != status) { save_errno = errno; /* Check # of processes counted on access sem. */ if (-1 == (semval = semctl(udi->semid, DB_COUNTER_SEM, GETVAL))) { assert(FALSE); rts_error_csa(CSA_ARG(csa) VARLSTCNT(12) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg), ERR_SYSCALL, 5, RTS_ERROR_TEXT("gds_rundown SEMCTL failed to get semval"), CALLFROM, errno); } bypassed_access = CAN_BYPASS(semval, csd, inst_is_frozen) || onln_rlbk_pid || csd->file_corrupt; /* Before attempting again in the blocking mode, see if the holding process is an online rollback. * If so, it is likely we won't get the access control semaphore anytime soon. In that case, we * are better off skipping rundown and continuing with sanity cleanup and exit. */ holder_pid = semctl(udi->semid, DB_CONTROL_SEM, GETPID); if ((uint4)-1 == holder_pid) rts_error_csa(CSA_ARG(csa) VARLSTCNT(12) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg), ERR_SYSCALL, 5, RTS_ERROR_TEXT("gds_rundown SEMCTL failed to get holder_pid"), CALLFROM, errno); if (!bypassed_access) { /* We couldn't get it in one shot-- see if we already have it */ if (holder_pid == process_id) { send_msg_csa(CSA_ARG(csa) VARLSTCNT(5) MAKE_MSG_INFO(ERR_CRITSEMFAIL), 2, DB_LEN_STR(reg), ERR_RNDWNSEMFAIL); REVERT; ENABLE_INTERRUPTS(INTRPT_IN_GDS_RUNDOWN, prev_intrpt_state); assert(FALSE); return EXIT_ERR; } if (EAGAIN != save_errno) { assert(FALSE); rts_error_csa(CSA_ARG(csa) VARLSTCNT(12) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg), ERR_SYSCALL, 5, RTS_ERROR_TEXT("gds_rundown SEMOP on access control semaphore"), CALLFROM, save_errno); } sop[0].sem_flg = sop[1].sem_flg = SEM_UNDO; /* Try again - blocking this time */ SEMOP(udi->semid, sop, 2, status, FORCED_WAIT); if (-1 == status) /* We couldn't get it at all.. */ rts_error_csa(CSA_ARG(csa) VARLSTCNT(12) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg), ERR_SYSCALL, 5, RTS_ERROR_TEXT("gds_rundown SEMOP on access control semaphore"), CALLFROM, errno); } else if (!IS_GTM_IMAGE) { send_msg_csa(CSA_ARG(csa) VARLSTCNT(12) ERR_RESRCINTRLCKBYPAS, 10, LEN_AND_STR(gtmImageNames[image_type].imageName), process_id, LEN_AND_LIT("access control"), REG_LEN_STR(reg), DB_LEN_STR(reg), holder_pid); send_msg_csa(CSA_ARG(NULL) VARLSTCNT(4) ERR_TEXT, 2, LEN_AND_LIT("Access control bypassed at rundown")); } udi->grabbed_access_sem = !bypassed_access; } } /* else we hold the access control semaphore and therefore have standalone access. We do not release it now - we * release it later in mupip_exit_handler.c. Since we already hold the access control semaphore, we don't need the * ftok semaphore and trying it could cause a deadlock */ /* Note that in the case of online rollback, "udi->grabbed_access_sem" (and in turn "have_standalone_access") is TRUE. * But there could be other processes still having the database open so we cannot safely reset the halted fields. */ if (have_standalone_access && !jgbl.onlnrlbk) csd->ftok_counter_halted = csd->access_counter_halted = FALSE; ftok_counter_halted = csd->ftok_counter_halted; access_counter_halted = csd->access_counter_halted; /* If we bypassed any of the semaphores, activate safe mode. 
* Also, if the replication instance is frozen and this db has replication turned on (which means * no flushes of dirty buffers to this db can happen while the instance is frozen) activate safe mode. */ ok_to_write_pfin = !(bypassed_access || bypassed_ftok || inst_is_frozen); safe_mode = !ok_to_write_pfin || ftok_counter_halted || access_counter_halted; /* At this point we are guaranteed no one else is doing a db_init/rundown as we hold the access control semaphore */ assert(csa->ref_cnt); /* decrement private ref_cnt before shared ref_cnt decrement. */ csa->ref_cnt--; /* Currently journaling logic in gds_rundown() in VMS relies on this order to detect last writer */ assert(!csa->ref_cnt); --cnl->ref_cnt; if (memcmp(cnl->now_running, gtm_release_name, gtm_release_name_len + 1)) { /* VERMISMATCH condition. Possible only if DSE */ assert(dse_running); vermismatch = TRUE; } else vermismatch = FALSE; if (-1 == shmctl(udi->shmid, IPC_STAT, &shm_buf)) { save_errno = errno; rts_error_csa(CSA_ARG(csa) VARLSTCNT(12) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg), ERR_SYSCALL, 5, RTS_ERROR_TEXT("gds_rundown shmctl"), CALLFROM, save_errno); } else we_are_last_user = (1 == shm_buf.shm_nattch) && !vermismatch && !safe_mode; /* recover => one user except ONLINE ROLLBACK, or standalone with frozen instance */ assert(!have_standalone_access || we_are_last_user || jgbl.onlnrlbk || inst_is_frozen); if (-1 == (semval = semctl(udi->semid, DB_COUNTER_SEM, GETVAL))) rts_error_csa(CSA_ARG(csa) VARLSTCNT(12) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg), ERR_SYSCALL, 5, RTS_ERROR_TEXT("gds_rundown SEMCTL failed to get semval"), CALLFROM, errno); /* There's one writer left and I am it */ assert(reg->read_only || semval >= 0); unsafe_last_writer = (DB_COUNTER_SEM_INCR == semval) && (FALSE == reg->read_only) && !vermismatch; we_are_last_writer = unsafe_last_writer && !safe_mode; assert(!we_are_last_writer || !safe_mode); assert(!we_are_last_user || !safe_mode); /* recover + R/W region => one writer except ONLINE ROLLBACK, or standalone with frozen instance, leading to safe_mode */ assert(!(have_standalone_access && !reg->read_only) || we_are_last_writer || jgbl.onlnrlbk || inst_is_frozen); GTM_WHITE_BOX_TEST(WBTEST_ANTIFREEZE_JNLCLOSE, we_are_last_writer, 1); /* Assume we are the last writer to invoke wcs_flu */ if (!have_standalone_access && (-1 == (ftok_semval = semctl(udi->ftok_semid, DB_COUNTER_SEM, GETVAL)))) rts_error_csa(CSA_ARG(csa) VARLSTCNT(12) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg), ERR_SYSCALL, 5, RTS_ERROR_TEXT("gds_rundown SEMCTL failed to get ftok_semval"), CALLFROM, errno); if (NULL != csa->ss_ctx) ss_destroy_context(csa->ss_ctx); /* SS_MULTI: If multiple snapshots are supported, then we have to run through each of the snapshots */ assert(1 == MAX_SNAPSHOTS); ss_shm_ptr = (shm_snapshot_ptr_t)SS_GETSTARTPTR(csa); ss_pid = ss_shm_ptr->ss_info.ss_pid; is_cur_process_ss_initiator = (process_id == ss_pid); if (ss_pid && (is_cur_process_ss_initiator || we_are_last_user)) { /* Try getting snapshot crit latch. If we don't get latch, we won't hang for eternity and will skip * doing the orphaned snapshot cleanup. It will be cleaned up eventually either by subsequent MUPIP * INTEG or by a MUPIP RUNDOWN. 
*/ if (ss_get_lock_nowait(reg) && (ss_pid == ss_shm_ptr->ss_info.ss_pid) && (is_cur_process_ss_initiator || !is_proc_alive(ss_pid, 0))) { ss_release(NULL); ss_release_lock(reg); } } /* If cnl->donotflush_dbjnl is set, it means mupip recover/rollback was interrupted and therefore we need not flush * shared memory contents to disk as they might be in an inconsistent state. Moreover, any more flushing will only cause * future rollback to undo more journal records (PBLKs). In this case, we will go ahead and remove shared memory (without * flushing the contents) in this routine. A reissue of the recover/rollback command will restore the database to a * consistent state. */ if (!cnl->donotflush_dbjnl && !reg->read_only && !vermismatch) { /* If we had an orphaned block and were interrupted, set wc_blocked so we can invoke wcs_recover. Do it ONLY * if there is NO concurrent online rollback running (as we need crit to set wc_blocked) */ if (csa->wbuf_dqd && !is_mm) { /* If we had an orphaned block and were interrupted, mupip_exit_handler will invoke secshr_db_clnup which * will clear this field and so we should never come to gds_rundown with a non-zero wbuf_dqd. The only * exception is if we are recover/rollback in which case gds_rundown (from mur_close_files) is invoked * BEFORE secshr_db_clnup in mur_close_files. * Note: It is NOT possible for online rollback to reach here with wbuf_dqd being non-zero. This is because * the moment we apply the first PBLK, we stop all interrupts and hence can never be interrupted in * wcs_wtstart or wcs_get_space. Assert accordingly. */ assert(mupip_jnl_recover && !jgbl.onlnrlbk && !safe_mode); if (!was_crit) grab_crit(reg); SET_TRACEABLE_VAR(cnl->wc_blocked, TRUE); BG_TRACE_PRO_ANY(csa, wcb_gds_rundown); send_msg_csa(CSA_ARG(csa) VARLSTCNT(8) ERR_WCBLOCKED, 6, LEN_AND_LIT("wcb_gds_rundown"), process_id, &csa->ti->curr_tn, DB_LEN_STR(reg)); csa->wbuf_dqd = 0; wcs_recover(reg); BG_TRACE_PRO_ANY(csa, lost_block_recovery); if (!was_crit) rel_crit(reg); } if (JNL_ENABLED(csd) && IS_GTCM_GNP_SERVER_IMAGE) originator_prc_vec = NULL; /* If we are the last writing user, then everything must be flushed */ if (we_are_last_writer) { /* Time to flush out all of our buffers */ assert(!safe_mode); if (is_mm) { MM_DBFILEXT_REMAP_IF_NEEDED(csa, reg); cnl->remove_shm = TRUE; } if (cnl->wc_blocked && jgbl.onlnrlbk) { /* if the last update done by online rollback was not committed in the normal code-path but was * completed by secshr_db_clnup, wc_blocked will be set to TRUE. But, since online rollback never * invokes grab_crit (since csa->hold_onto_crit is set to TRUE), wcs_recover is never invoked. This * could result in the last update never getting flushed to the disk and if online rollback happened * to be the last writer then the shared memory will be flushed and removed and the last update will * be lost. So, force wcs_recover if we find ourselves in such a situation. But, wc_blocked is * possible only if phase1 or phase2 errors are induced using white box test cases */ assert(WB_COMMIT_ERR_ENABLED); wcs_recover(reg); } /* Note WCSFLU_SYNC_EPOCH ensures the epoch is synced to the journal and indirectly * also ensures that the db is fsynced. We don't want to use it in the calls to * wcs_flu() from t_end() and tp_tend() since we can defer it to out-of-crit there. * In this case, since we are running down, we don't have any such option. 
*/ cnl->remove_shm = wcs_flu(WCSFLU_FLUSH_HDR | WCSFLU_WRITE_EPOCH | WCSFLU_SYNC_EPOCH); /* Since we_are_last_writer, we should be guaranteed that wcs_flu() did not change csd (in * case of MM, for potential file extension), even if it did a grab_crit(). Therefore, make * sure that's true. */ assert(csd == csa->hdr); assert(0 == memcmp(csd->label, GDS_LABEL, GDS_LABEL_SZ - 1)); } else if (((canceled_flush_timer && (0 > cnl->wcs_timers)) || canceled_dbsync_timer) && !inst_is_frozen) { /* canceled pending db or jnl flush timers - flush database and journal buffers to disk */ if (!was_crit) grab_crit(reg); /* we need to sync the epoch as the fact that there is no active pending flush timer implies * there will be no one else who will flush the dirty buffers and EPOCH to disk in a timely fashion */ wcs_flu(WCSFLU_FLUSH_HDR | WCSFLU_WRITE_EPOCH | WCSFLU_SYNC_EPOCH); if (!was_crit) rel_crit(reg); assert((dba_mm == cs_data->acc_meth) || (csd == cs_data)); csd = cs_data; /* In case this is MM and wcs_flu() remapped an extended database, reset csd */ } /* Do rundown journal processing after buffer flushes since they require jnl to be open */ if (JNL_ENABLED(csd)) { /* the following tp_change_reg() is not needed due to the assert csa == cs_addrs at the beginning * of gds_rundown(), but just to be safe. To be removed by 2002!! --- nars -- 2001/04/25. */ tp_change_reg(); /* call this because jnl_ensure_open checks cs_addrs rather than gv_cur_region */ jpc = csa->jnl; jbp = jpc->jnl_buff; if (jbp->fsync_in_prog_latch.u.parts.latch_pid == process_id) { assert(FALSE); COMPSWAP_UNLOCK(&jbp->fsync_in_prog_latch, process_id, 0, LOCK_AVAILABLE, 0); } if (jbp->io_in_prog_latch.u.parts.latch_pid == process_id) { assert(FALSE); COMPSWAP_UNLOCK(&jbp->io_in_prog_latch, process_id, 0, LOCK_AVAILABLE, 0); } if ((((NOJNL != jpc->channel) && !JNL_FILE_SWITCHED(jpc)) || (we_are_last_writer && (0 != cnl->jnl_file.u.inode))) && ok_to_write_pfin) { /* We need to close the journal file cleanly if we have the latest generation journal file open * or if we are the last writer and the journal file is open in shared memory (not necessarily * by ourselves e.g. the only process that opened the journal got shot abnormally) * Note: we should not infer anything from the shared memory value of cnl->jnl_file.u.inode * if we are not the last writer as it can be concurrently updated. */ if (!was_crit) grab_crit(reg); if (JNL_ENABLED(csd)) { SET_GBL_JREC_TIME; /* jnl_ensure_open/jnl_put_jrt_pini/pfin/jnl_file_close all need it */ /* Before writing to jnlfile, adjust jgbl.gbl_jrec_time if needed to maintain time order * of jnl records. This needs to be done BEFORE the jnl_ensure_open as that could write * journal records (if it decides to switch to a new journal file). */ ADJUST_GBL_JREC_TIME(jgbl, jbp); jnl_status = jnl_ensure_open(reg, csa); if (0 == jnl_status) { /* If we_are_last_writer, we would have already done a wcs_flu() which would * have written an epoch record and we are guaranteed no further updates * since we are the last writer. So, just close the journal. * If the freeaddr == post_epoch_freeaddr, wcs_flu may have skipped writing * a pini, so allow for that. */ assert(!jbp->before_images || is_mm || !we_are_last_writer || (0 != jpc->pini_addr) || jgbl.mur_extract || (jpc->jnl_buff->freeaddr == jpc->jnl_buff->post_epoch_freeaddr)); /* If we haven't written a pini, let jnl_file_close write the pini/pfin. 
*/ if (!jgbl.mur_extract && (0 != jpc->pini_addr)) jnl_put_jrt_pfin(csa); /* If not the last writer and no pending flush timer left, do jnl flush now */ if (!we_are_last_writer && (0 > cnl->wcs_timers)) { if (SS_NORMAL == (jnl_status = jnl_flush(reg))) { assert(jbp->freeaddr == jbp->dskaddr); jnl_fsync(reg, jbp->dskaddr); assert(jbp->fsync_dskaddr == jbp->dskaddr); } else { send_msg_csa(CSA_ARG(csa) VARLSTCNT(9) ERR_JNLFLUSH, 2, JNL_LEN_STR(csd), ERR_TEXT, 2, RTS_ERROR_TEXT("Error with journal flush in gds_rundown"), jnl_status); assert(NOJNL == jpc->channel);/* jnl file lost has been triggered */ /* In this routine, all code that follows from here on does not * assume anything about the journaling characteristics of this * database so it is safe to continue execution even though * journaling got closed in the middle. */ } } jnl_file_close(reg, we_are_last_writer, FALSE); } else send_msg_csa(CSA_ARG(csa) VARLSTCNT(6) jnl_status, 4, JNL_LEN_STR(csd), DB_LEN_STR(reg)); } if (!was_crit) rel_crit(reg); } } if (we_are_last_writer) /* Flush the fileheader last and harden the file to disk */ { if (!was_crit) grab_crit(reg); /* To satisfy crit requirement in fileheader_sync() */ memset(csd->machine_name, 0, MAX_MCNAMELEN); /* clear the machine_name field */ if (!have_standalone_access && we_are_last_user) { /* mupip_exit_handler will do this after mur_close_file */ csd->semid = INVALID_SEMID; csd->shmid = INVALID_SHMID; csd->gt_sem_ctime.ctime = 0; csd->gt_shm_ctime.ctime = 0; } fileheader_sync(reg); if (!was_crit) rel_crit(reg); if (!is_mm) { GTM_DB_FSYNC(csa, udi->fd, rc); /* Sync it all */ if (-1 == rc) { rts_error_csa(CSA_ARG(csa) VARLSTCNT(9) ERR_DBFILERR, 2, DB_LEN_STR(reg), ERR_TEXT, 2, RTS_ERROR_TEXT("Error during file sync at close"), errno); } } else { /* Now do final MM file sync before exit */ assert(csa->ti->total_blks == csa->total_blks); #ifdef _AIX GTM_DB_FSYNC(csa, udi->fd, rc); if (-1 == rc) #else if (-1 == MSYNC((caddr_t)csa->db_addrs[0], (caddr_t)csa->db_addrs[1])) #endif { rts_error_csa(CSA_ARG(csa) VARLSTCNT(9) ERR_DBFILERR, 2, DB_LEN_STR(reg), ERR_TEXT, 2, RTS_ERROR_TEXT("Error during file sync at close"), errno); } } } else if (unsafe_last_writer && !cnl->lastwriterbypas_msg_issued) { send_msg_csa(CSA_ARG(csa) VARLSTCNT(4) ERR_LASTWRITERBYPAS, 2, DB_LEN_STR(reg)); cnl->lastwriterbypas_msg_issued = TRUE; } } /* end if (!reg->read_only && !cnl->donotflush_dbjnl) */ /* We had canceled all db timers at start of rundown. In case as part of rundown (wcs_flu above), we had started * any timers, cancel them BEFORE setting reg->open to FALSE (assert in wcs_clean_dbsync relies on this). */ CANCEL_DB_TIMERS(reg, csa, canceled_flush_timer, canceled_dbsync_timer); if (reg->read_only && we_are_last_user && !have_standalone_access && cnl->remove_shm) { /* mupip_exit_handler will do this after mur_close_file */ db_ipcs.semid = INVALID_SEMID; db_ipcs.shmid = INVALID_SHMID; db_ipcs.gt_sem_ctime = 0; db_ipcs.gt_shm_ctime = 0; db_ipcs.fn_len = reg->dyn.addr->fname_len; memcpy(db_ipcs.fn, reg->dyn.addr->fname, reg->dyn.addr->fname_len); db_ipcs.fn[reg->dyn.addr->fname_len] = 0; /* request gtmsecshr to flush. 
read_only cannot flush itself */ WAIT_FOR_REPL_INST_UNFREEZE_SAFE(csa); if (!csa->read_only_fs) { secshrstat = send_mesg2gtmsecshr(FLUSH_DB_IPCS_INFO, 0, (char *)NULL, 0); if (0 != secshrstat) rts_error_csa(CSA_ARG(csa) VARLSTCNT(8) ERR_DBFILERR, 2, DB_LEN_STR(reg), ERR_TEXT, 2, RTS_ERROR_TEXT("gtmsecshr failed to update database file header")); } } /* Done with file now, close it */ CLOSEFILE_RESET(udi->fd, rc); /* resets "udi->fd" to FD_INVALID */ if (-1 == rc) { rts_error_csa(CSA_ARG(csa) VARLSTCNT(9) ERR_DBFILERR, 2, DB_LEN_STR(reg), ERR_TEXT, 2, LEN_AND_LIT("Error during file close"), errno); } /* Unmap storage if mm mode but only the part that is not the fileheader (so shows up in dumps) */ # if !defined(_AIX) if (is_mm && (NULL != csa->db_addrs[0])) { assert(csa->db_addrs[1] > csa->db_addrs[0]); munmap_len = (sm_long_t)(csa->db_addrs[1] - csa->db_addrs[0]); if (0 < munmap_len) munmap((caddr_t)(csa->db_addrs[0]), (size_t)(munmap_len)); } # endif /* Detach our shared memory while still under lock so reference counts will be correct for the next process to run down * this region. In the process also get the remove_shm status from node_local before detaching. * If cnl->donotflush_dbjnl is TRUE, it means we can safely remove shared memory without compromising data * integrity as a reissue of recover will restore the database to a consistent state. */ remove_shm = !vermismatch && (cnl->remove_shm || cnl->donotflush_dbjnl); /* We are done with online rollback on this region. Indicate to other processes by setting the onln_rlbk_pid to 0. * Do it before releasing crit (t_end relies on this ordering when accessing cnl->onln_rlbk_pid). */ if (jgbl.onlnrlbk) cnl->onln_rlbk_pid = 0; rel_crit(reg); /* Since we are about to detach from the shared memory, release crit and reset onln_rlbk_pid */ /* If we had skipped flushing journal and database buffers due to a concurrent online rollback, increment the counter * indicating that in the shared memory so that online rollback can report the # of such processes when it shuts down. * The same thing is done for both FTOK and access control semaphores when there are too many MUMPS processes. */ if (safe_mode) /* indicates flushing was skipped */ { if (bypassed_access) cnl->dbrndwn_access_skip++; /* Access semaphore can be bypassed during online rollback */ if (bypassed_ftok) cnl->dbrndwn_ftok_skip++; } if (jgbl.onlnrlbk) csa->hold_onto_crit = FALSE; GTM_WHITE_BOX_TEST(WBTEST_HOLD_SEM_BYPASS, cnl->wbox_test_seq_num, 0); status = shmdt((caddr_t)cnl); csa->nl = NULL; /* dereferencing nl after detach is not right, so we set it to NULL so that we can test before dereference*/ /* Note that although csa->nl is NULL, we use CSA_ARG(csa) below (not CSA_ARG(NULL)) to be consistent with similar * usages before csa->nl became NULL. The "is_anticipatory_freeze_needed" function (which is in turn called by the * CHECK_IF_FREEZE_ON_ERROR_NEEDED macro) does a check of csa->nl before dereferencing shared memory contents so * we are safe passing "csa". 
*/ if (-1 == status) send_msg_csa(CSA_ARG(csa) VARLSTCNT(9) ERR_DBFILERR, 2, DB_LEN_STR(reg), ERR_TEXT, 2, LEN_AND_LIT("Error during shmdt"), errno); REMOVE_CSA_FROM_CSADDRSLIST(csa); /* remove "csa" from list of open regions (cs_addrs_list) */ reg->open = FALSE; /* If file is still not in good shape, die here and now before we get rid of our storage */ assertpro(0 == csa->wbuf_dqd); ipc_deleted = FALSE; /* If we are the very last user, remove shared storage id and the semaphores */ if (we_are_last_user) { /* remove shared storage, only if last writer to rundown did a successful wcs_flu() */ assert(!vermismatch); if (remove_shm) { ipc_deleted = TRUE; if (0 != shm_rmid(udi->shmid)) rts_error_csa(CSA_ARG(csa) VARLSTCNT(8) ERR_DBFILERR, 2, DB_LEN_STR(reg), ERR_TEXT, 2, RTS_ERROR_TEXT("Unable to remove shared memory")); /* Note that we no longer have a new shared memory. Currently only used/usable for standalone rollback. */ udi->new_shm = FALSE; /* mupip recover/rollback don't release the semaphore here, but do it later in db_ipcs_reset (invoked from * mur_close_files()) */ if (!have_standalone_access) { if (0 != sem_rmid(udi->semid)) rts_error_csa(CSA_ARG(csa) VARLSTCNT(8) ERR_DBFILERR, 2, DB_LEN_STR(reg), ERR_TEXT, 2, RTS_ERROR_TEXT("Unable to remove semaphore")); udi->new_sem = FALSE; /* Note that we no longer have a new semaphore */ udi->grabbed_access_sem = FALSE; udi->counter_acc_incremented = FALSE; } } else if (is_src_server || is_updproc) { gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(6) ERR_DBRNDWNWRN, 4, DB_LEN_STR(reg), process_id, process_id); send_msg_csa(CSA_ARG(csa) VARLSTCNT(6) ERR_DBRNDWNWRN, 4, DB_LEN_STR(reg), process_id, process_id); } else send_msg_csa(CSA_ARG(csa) VARLSTCNT(6) ERR_DBRNDWNWRN, 4, DB_LEN_STR(reg), process_id, process_id); } else { assert(!have_standalone_access || jgbl.onlnrlbk || safe_mode); if (!jgbl.onlnrlbk && !have_standalone_access) { /* If we were writing, get rid of our writer access count semaphore */ if (!reg->read_only) { if (!access_counter_halted) { save_errno = do_semop(udi->semid, DB_COUNTER_SEM, -DB_COUNTER_SEM_INCR, SEM_UNDO); if (0 != save_errno) rts_error_csa(CSA_ARG(csa) VARLSTCNT(12) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg), ERR_SYSCALL, 5, RTS_ERROR_TEXT("gds_rundown access control semaphore decrement"), CALLFROM, save_errno); } udi->counter_acc_incremented = FALSE; } assert(safe_mode || !bypassed_access); /* Now remove the rundown lock */ if (!bypassed_access) { if (0 != (save_errno = do_semop(udi->semid, DB_CONTROL_SEM, -1, SEM_UNDO))) rts_error_csa(CSA_ARG(csa) VARLSTCNT(12) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg), ERR_SYSCALL, 5, RTS_ERROR_TEXT("gds_rundown access control semaphore release"), CALLFROM, save_errno); udi->grabbed_access_sem = FALSE; } } /* else access control semaphore will be released in db_ipcs_reset */ } if (!have_standalone_access) { if (bypassed_ftok) { if (!ftok_counter_halted) if (0 != (save_errno = do_semop(udi->ftok_semid, DB_COUNTER_SEM, -DB_COUNTER_SEM_INCR, SEM_UNDO))) rts_error_csa(CSA_ARG(csa) VARLSTCNT(4) ERR_DBFILERR, 2, DB_LEN_STR(reg)); } else if (!ftok_sem_release(reg, !ftok_counter_halted, FALSE)) { FTOK_TRACE(csa, csa->ti->curr_tn, ftok_ops_release, process_id); rts_error_csa(CSA_ARG(csa) VARLSTCNT(4) ERR_DBFILERR, 2, DB_LEN_STR(reg)); } udi->grabbed_ftok_sem = FALSE; udi->counter_ftok_incremented = FALSE; } ENABLE_INTERRUPTS(INTRPT_IN_GDS_RUNDOWN, prev_intrpt_state); if (!ipc_deleted) { GET_CUR_TIME(time_str); if (is_src_server) gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(8) ERR_IPCNOTDEL, 6, CTIME_BEFORE_NL, 
time_str, LEN_AND_LIT("Source server"), REG_LEN_STR(reg)); if (is_updproc) gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(8) ERR_IPCNOTDEL, 6, CTIME_BEFORE_NL, time_str, LEN_AND_LIT("Update process"), REG_LEN_STR(reg)); if (mupip_jnl_recover && (!jgbl.onlnrlbk || !we_are_last_user)) { gtm_putmsg_csa(CSA_ARG(csa) VARLSTCNT(8) ERR_IPCNOTDEL, 6, CTIME_BEFORE_NL, time_str, LEN_AND_LIT("Mupip journal process"), REG_LEN_STR(reg)); send_msg_csa(CSA_ARG(csa) VARLSTCNT(8) ERR_IPCNOTDEL, 6, CTIME_BEFORE_NL, time_str, LEN_AND_LIT("Mupip journal process"), REG_LEN_STR(reg)); } } REVERT; return EXIT_NRM; }
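/* Illustrative sketch only (not GT.M source): gds_rundown above leans on a SysV semaphore used as a process counter.
 * Each attaching process increments it with SEM_UNDO so the kernel backs the count out even on abnormal exit, rundown
 * decrements it, and the last user removes the set. The key, index and increment below are hypothetical stand-ins for
 * the real DB_COUNTER_SEM arrangement. */
#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/ipc.h>
#include <sys/sem.h>

int main(void)
{
	int		semid, semval;
	struct sembuf	sop;

	/* Private one-semaphore set; GT.M derives its set from the database file instead */
	semid = semget(IPC_PRIVATE, 1, IPC_CREAT | 0600);
	if (-1 == semid)
	{
		perror("semget");
		return EXIT_FAILURE;
	}
	sop.sem_num = 0;
	sop.sem_op = 1;			/* "attach": bump the counter */
	sop.sem_flg = SEM_UNDO;		/* kernel undoes this if we die without rundown */
	if (-1 == semop(semid, &sop, 1))
		perror("semop incr");
	semval = semctl(semid, 0, GETVAL);	/* a rundown or health check reads the live count */
	printf("processes attached: %d\n", semval);
	sop.sem_op = -1;		/* "rundown": decrement our contribution */
	if (-1 == semop(semid, &sop, 1))
		perror("semop decr");
	if (-1 == semctl(semid, 0, IPC_RMID))	/* last user removes the set, as with sem_rmid above */
		perror("semctl rmid");
	return EXIT_SUCCESS;
}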
uint4 mur_block_count_correct(reg_ctl_list *rctl) { unsigned int native_size, size; sgmnt_data_ptr_t mu_data; int4 mu_int_ovrhd; uint4 total_blks; uint4 status; uint4 new_bit_maps, bplmap, new_blocks; MUR_CHANGE_REG(rctl); mu_data = cs_data; switch (mu_data->acc_meth) { default: GTMASSERT; break; #if defined(VMS) && defined(GT_CX_DEF) case dba_bg: /* necessary to do calculation in this manner to prevent double rounding causing an error */ if (mu_data->unbacked_cache) mu_int_ovrhd = DIVIDE_ROUND_UP(SIZEOF_FILE_HDR(mu_data) + mu_data->free_space + mu_data->lock_space_size, DISK_BLOCK_SIZE); else mu_int_ovrhd = DIVIDE_ROUND_UP(SIZEOF_FILE_HDR(mu_data) + BT_SIZE(mu_data) + mu_data->free_space + mu_data->lock_space_size, DISK_BLOCK_SIZE); break; #else case dba_bg: #endif case dba_mm: mu_int_ovrhd = (int4)DIVIDE_ROUND_UP(SIZEOF_FILE_HDR(mu_data) + mu_data->free_space, DISK_BLOCK_SIZE); break; } mu_int_ovrhd += 1; assert(mu_int_ovrhd == mu_data->start_vbn); size = mu_int_ovrhd + (mu_data->blk_size / DISK_BLOCK_SIZE) * mu_data->trans_hist.total_blks; native_size = gds_file_size(gv_cur_region->dyn.addr->file_cntl); /* In the following tests, the EOF block should always be 1 greater than the actual size of the file. * This is due to the GDS being allocated in even DISK_BLOCK_SIZE-byte blocks. */ if (native_size && (size < native_size)) { total_blks = (dba_mm == mu_data->acc_meth) ? cs_addrs->total_blks : cs_addrs->ti->total_blks; if (JNL_ENABLED(cs_addrs)) cs_addrs->jnl->pini_addr = 0; /* Stop simulation of GTM process journal record writing (if any active)*/ /* If journaling, gdsfilext will need to write an inctn record. The timestamp of that journal record will * need to be adjusted to the current system time to reflect that it is recovery itself writing that record * instead of simulating GT.M activity. Since the variable jgbl.dont_reset_gbl_jrec_time is still set, gdsfilext * will NOT modify jgbl.gbl_jrec_time. Temporarily reset it to allow for adjustments to gbl_jrec_time. */ assert(jgbl.dont_reset_gbl_jrec_time); jgbl.dont_reset_gbl_jrec_time = FALSE; /* Calculate the number of blocks to add based on the difference between the real file size and the file size * computed from the header->total_blks. Takes into account that gdsfilext() will automatically add new_bit_maps * to the amount of blocks we request. */ bplmap = cs_data->bplmap; new_blocks = (native_size - size)/(mu_data->blk_size / DISK_BLOCK_SIZE); new_bit_maps = DIVIDE_ROUND_UP(total_blks + new_blocks, bplmap) - DIVIDE_ROUND_UP(total_blks, bplmap); if (SS_NORMAL != (status = gdsfilext(new_blocks - new_bit_maps, total_blks))) { jgbl.dont_reset_gbl_jrec_time = TRUE; return (status); } jgbl.dont_reset_gbl_jrec_time = TRUE; DEBUG_ONLY( /* Check that the filesize and blockcount in the fileheader match now after the extend */ size = mu_int_ovrhd + (mu_data->blk_size / DISK_BLOCK_SIZE) * mu_data->trans_hist.total_blks; native_size = gds_file_size(gv_cur_region->dyn.addr->file_cntl); assert(size == native_size); ) }
return SS_NORMAL; }
boolean_t tp_tend(boolean_t crit_only) { block_id tp_blk; boolean_t history_validated, is_mm, was_crit, x_lock, do_validation; boolean_t do_deferred_writes = FALSE, replication = FALSE; bt_rec_ptr_t bt; cache_rec_ptr_t cr; cw_set_element *cse; file_control *fc; jnl_private_control *jpc; jnl_buffer_ptr_t jbp; jnl_format_buffer *jfb; sgm_info *si, *tmpsi; tp_region *tr, *tr_last; sgmnt_addrs *csa, *tmpcsa; sgmnt_data_ptr_t csd; srch_blk_status *t1; trans_num ctn, tnque_earliest_tn; trans_num valid_thru; /* buffers touched by this transaction will be valid thru this tn */ enum cdb_sc status; gd_region *save_gv_cur_region; int lcnt, participants; jnldata_hdr_ptr_t jnl_header; int repl_tp_region_count = 0; boolean_t first_time = TRUE, release_crit, yes_jnl_no_repl, retvalue; uint4 jnl_status, leafmods, indexmods; uint4 total_jnl_rec_size; jnlpool_ctl_ptr_t jpl, tjpl; error_def(ERR_DLCKAVOIDANCE); error_def(ERR_JNLTRANS2BIG); assert(dollar_tlevel > 0); assert(0 == jnl_fence_ctl.level); participants = 0; status = cdb_sc_normal; /* if the transaction does no updates and the transaction history has not changed, we do not need any more validation */ do_validation = FALSE; /* initially set to FALSE, but set to TRUE below */ jnl_status = 0; if (FALSE == crit_only) { for (si = first_sgm_info; (NULL != si); si = si->next_sgm_info) { sgm_info_ptr = si; TP_CHANGE_REG_IF_NEEDED(si->gv_cur_region); csa = cs_addrs; csd = cs_data; if ((csd->wc_blocked) || /* If blocked, or.. */ ((dba_mm == csa->hdr->acc_meth) && /* we have MM and.. */ (csa->total_blks != csa->ti->total_blks))) /* and file has been extended */ { /* Force repair */ t_fail_hist[t_tries] = cdb_sc_helpedout; /* special status to prevent punishing altruism */ TP_TRACE_HIST(CR_BLKEMPTY, NULL); return FALSE; } /* whenever si->first_cw_set is non-NULL, ensure that si->update_trans is TRUE */ assert((NULL == si->first_cw_set) || si->update_trans); /* whenever si->first_cw_set is NULL, ensure that si->update_trans is FALSE * except when the set noop optimization is enabled */ assert((NULL != si->first_cw_set) || !si->update_trans || gvdupsetnoop); if (!si->update_trans) { if (si->start_tn == csa->ti->early_tn) { /* read with no change to the transaction history. ensure we haven't overrun * our history buffer and we have reasonable values for first and last */ assert(si->last_tp_hist - si->first_tp_hist <= si->tp_hist_size); continue; } else do_validation = TRUE; } else { do_validation = TRUE; is_mm = (dba_mm == cs_data->acc_meth); /* We are still out of crit if this is not our last attempt. If so, run the region list and check * that we have sufficient free blocks for our update. If not, get them now while we can. * We will repeat this check later in crit but it will hopefully have little or nothing to do. * bypass 1st check if already in crit -- check later */ if (!csa->now_crit && !is_mm && (csa->nl->wc_in_free < si->cw_set_depth + 1) && !wcs_get_space(si->gv_cur_region, si->cw_set_depth + 1, NULL)) assert(FALSE); /* wcs_get_space() should have returned TRUE unconditionally in this case */ } if (si->update_trans && JNL_ENABLED(csa)) { /* compute the total journal record size requirements before grab_crit(). 
* there is code later that will check for state changes from now to then */ TOTAL_TPJNL_REC_SIZE(total_jnl_rec_size, si, csa); /* compute current transaction's maximum journal space needs in number of disk blocks */ si->tot_jrec_size = MAX_REQD_JNL_FILE_SIZE(total_jnl_rec_size); /* check if current TP transaction's journal size needs are greater than max jnl file size */ if (si->tot_jrec_size > csd->autoswitchlimit) /* can't fit the current transaction's journal records into one journal file */ rts_error(VARLSTCNT(6) ERR_JNLTRANS2BIG, 4, si->tot_jrec_size, JNL_LEN_STR(csd), csd->autoswitchlimit); } } /* for (si ... ) */ if (!do_validation) { if (CDB_STAGNATE <= t_tries) { for (tr = tp_reg_list; NULL != tr; tr = tr->fPtr) rel_crit(tr->reg); } UNIX_ONLY( /* Must be done after REVERT since we are no longer in crit */ if (unhandled_stale_timer_pop) process_deferred_stale(); ) return TRUE; } }
void ccp_tr_writedb1(ccp_action_record *rec) { ccp_action_record request; ccp_db_header *db; jnl_buffer *jb; sgmnt_addrs *csa; int4 curr_time[2]; uint4 status, *p1, *p2, *top; if ((db = rec->v.h) == NULL) return; csa = db->segment; assert(csa->nl->ccp_state == CCST_RDMODE || csa->nl->ccp_state == CCST_CLOSED || csa->nl->ccp_state == CCST_OPNREQ); if (JNL_ENABLED(csa->hdr)) { assert(csa->jnl->channel != 0); jb = csa->jnl->jnl_buff; if (jb->blocked != 0) { /* jnl writes from a previous write mode are not done yet; try again later */ if ((status = sys$setimr(0, delta_100_msec, ccp_writedb5, db, 0)) != SS$_NORMAL) ccp_signal_cont(status); /***** Is this reasonable? *****/ return; } jb->epoch_tn = db->wm_iosb.valblk[CCP_VALBLK_EPOCH_TN]; jb->freeaddr = jb->dskaddr = ROUND_UP(db->wm_iosb.valblk[CCP_VALBLK_JNL_ADDR], DISK_BLOCK_SIZE); /* lastaddr is no longer a field in jnl_buff * jb->lastaddr = db->wm_iosb.valblk[CCP_VALBLK_LST_ADDR]; */ jb->free = jb->dsk = 0; } /* Note: We must clear these flags prior to changing state or a set from ccp_exitwm_blkast may be missed */ db->quantum_expired = FALSE; db->tick_in_progress = FALSE; if (db->remote_wakeup) { for (p2 = NULL, p1 = csa->lock_addrs[0], top = p1 + SIZEOF(int4) * NUM_CLST_LCKS; *p1 != 0 && p1 <= top; ++p1) { if ((*p1 & NODENUMBER) == (process_id & NODENUMBER)) { crit_wake(p1); if (p2 == NULL) p2 = p1; *p1 = 0; } else if (p2 != NULL) { *p2++ = *p1; *p1 = 0; } } db->remote_wakeup = FALSE; status = ccp_enq(0, LCK$K_CRMODE, &db->lock_iosb, LCK$M_CONVERT | LCK$M_SYNCSTS, NULL, 0, ccp_lkrqwake1, db, ccp_lkdowake_blkast, PSL$C_USER, 0); /***** Check error status here? *****/ } db->glob_sec->freeze = 0; /* db->glob_sec->wcs_active_lvl = db->glob_sec->n_bts / 2; */ db->drop_lvl = 0; sys$gettim(curr_time); lib$add_times(curr_time, db->glob_sec->ccp_quantum_interval, &db->start_wm_time); csa->nl->ccp_state = CCST_WRTGNT; if (db->blocking_ast_received) { status = sys$dclast(ccp_exitwm_blkast, &db->exitwm_timer_id, PSL$C_USER); if (status != SS$_NORMAL) ccp_signal_cont(status); /***** Is this reasonable? *****/ } csa->nl->ccp_crit_blocked = FALSE; ++csa->nl->ccp_cycle; status = sys$setimr(0, &db->glob_sec->ccp_quantum_interval, ccp_quantum_interrupt, &db->quantum_timer_id, 0); if (status != SS$_NORMAL) ccp_signal_cont(status); /***** Is this reasonable? *****/ db->dirty_buffers_acquired = FALSE; ccp_pndg_proc_wake(&db->write_wait); status = ccp_enq(0, LCK$K_EXMODE, &db->flush_iosb, LCK$M_CONVERT | LCK$M_SYNCSTS, NULL, 0, ccp_reqdrtbuf_interrupt, db, NULL, PSL$C_USER, 0); if (status == SS$_SYNCH) { request.action = CCTR_GOTDRT; request.pid = 0; request.v.h = db; ccp_act_request(&request); } /***** Check error status here? *****/ }
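/* Illustrative sketch only (not GT.M source): the p1/p2 loop in ccp_tr_writedb1 wakes lock waiters belonging to this
 * node and compacts the survivors toward the front of the array in a single pass. Distilled to plain C, that is the
 * classic two-pointer in-place compaction shown here; names and values are hypothetical. */
#include <stdio.h>

static void wake_and_compact(unsigned int *arr, int n, unsigned int mine)
{
	unsigned int	*p1, *p2, *top;

	for (p2 = NULL, p1 = arr, top = arr + n; (0 != *p1) && (p1 < top); ++p1)
	{
		if (mine == *p1)
		{	/* the real code would call crit_wake(p1) here */
			if (NULL == p2)
				p2 = p1;	/* remember the first hole; compaction starts here */
			*p1 = 0;
		} else if (NULL != p2)
		{	/* shift a surviving entry left into the hole */
			*p2++ = *p1;
			*p1 = 0;
		}
	}
}

int main(void)
{
	unsigned int	q[8] = {7, 3, 7, 5, 7, 9, 0, 0};
	int		i;

	wake_and_compact(q, 8, 7);
	for (i = 0; 8 > i; i++)
		printf("%u ", q[i]);	/* prints: 3 5 9 0 0 0 0 0 */
	printf("\n");
	return 0;
}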
uint4 mur_block_count_correct(reg_ctl_list *rctl) { gtm_uint64_t native_size, size; sgmnt_data_ptr_t mu_data; int4 mu_int_ovrhd; uint4 total_blks; uint4 status; uint4 new_bit_maps, bplmap, new_blocks, tmpcnt; enum db_acc_method acc_meth; MUR_CHANGE_REG(rctl); mu_data = cs_data; acc_meth = mu_data->acc_meth; switch (acc_meth) { case dba_bg: case dba_mm: mu_int_ovrhd = (int4)DIVIDE_ROUND_UP(SIZEOF_FILE_HDR(mu_data) + mu_data->free_space, DISK_BLOCK_SIZE); break; default: assertpro(FALSE && acc_meth); } mu_int_ovrhd += 1; assert(mu_int_ovrhd == mu_data->start_vbn); size = mu_int_ovrhd + (off_t)(mu_data->blk_size / DISK_BLOCK_SIZE) * mu_data->trans_hist.total_blks; native_size = gds_file_size(gv_cur_region->dyn.addr->file_cntl); /* In the following tests, the EOF block should always be 1 greater than the actual size of the file. * This is due to the GDS being allocated in even DISK_BLOCK_SIZE-byte blocks. */ if (native_size && (size < native_size)) { total_blks = (dba_mm == acc_meth) ? cs_addrs->total_blks : cs_addrs->ti->total_blks; if (JNL_ENABLED(cs_addrs)) cs_addrs->jnl->pini_addr = 0; /* Stop simulation of GTM process journal record writing (if any active)*/ /* If journaling, gdsfilext will need to write an inctn record. The timestamp of that journal record will * need to be adjusted to the current system time to reflect that it is recovery itself writing that record * instead of simulating GT.M activity. Since the variable jgbl.dont_reset_gbl_jrec_time is still set, gdsfilext * will NOT modify jgbl.gbl_jrec_time. Temporarily reset it to allow for adjustments to gbl_jrec_time. */ assert(jgbl.dont_reset_gbl_jrec_time); jgbl.dont_reset_gbl_jrec_time = FALSE; /* Calculate the number of blocks to add based on the difference between the real file size and the file size * computed from the header->total_blks. Takes into account that gdsfilext() will automatically add new_bit_maps * to the amount of blocks we request. */ bplmap = cs_data->bplmap; new_blocks = (native_size - size)/(mu_data->blk_size / DISK_BLOCK_SIZE); new_bit_maps = DIVIDE_ROUND_UP(total_blks + new_blocks, bplmap) - DIVIDE_ROUND_UP(total_blks, bplmap); tmpcnt = new_blocks - new_bit_maps; /* Call GDSFILEXT only if the no of blocks by which DB needs to be extended is not '0' since GDSFILEXT() treats * extension by count 0 as unavailability of space(NO_FREE_SPACE error). And in the following case, tmpcnt could * be '0' on AIX because in MM mode AIX increases the native_size to the nearest multiple of OS_PAGE_SIZE. * And this increase could be less than GT.M block size.*/ if (tmpcnt && SS_NORMAL != (status = GDSFILEXT(new_blocks - new_bit_maps, total_blks, TRANS_IN_PROG_FALSE))) { jgbl.dont_reset_gbl_jrec_time = TRUE; return (status); } jgbl.dont_reset_gbl_jrec_time = TRUE; # ifdef DEBUG /* Check that the filesize and blockcount in the fileheader match now after the extend */ size = mu_int_ovrhd + (off_t)(mu_data->blk_size / DISK_BLOCK_SIZE) * mu_data->trans_hist.total_blks; native_size = gds_file_size(gv_cur_region->dyn.addr->file_cntl); ALIGN_DBFILE_SIZE_IF_NEEDED(size, native_size); assert(size == native_size); # endif } return SS_NORMAL; }
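/* Illustrative sketch only (not GT.M source): the extension arithmetic in mur_block_count_correct, worked with
 * hypothetical numbers. DIVIDE_ROUND_UP is re-created locally; DISK_BLOCK_SIZE (512) and bplmap (512 blocks per
 * local bitmap) are customary values but are assumptions here. The point is that gdsfilext adds bitmap blocks
 * implicitly, so the request is reduced by the number of new bitmaps the extension will create. */
#include <stdio.h>

#define DIVIDE_ROUND_UP(x, y)	(((x) + (y) - 1) / (y))
#define DISK_BLOCK_SIZE		512

int main(void)
{
	unsigned long long	native_size, size;	/* both counted in DISK_BLOCK_SIZE units */
	unsigned int		blk_size = 4096, total_blks = 10000, bplmap = 512;
	unsigned int		new_blocks, new_bit_maps;

	size = 100 + (unsigned long long)(blk_size / DISK_BLOCK_SIZE) * total_blks;	/* overhead + data */
	native_size = size + 4000;	/* pretend the file on disk is 4000 disk blocks larger */
	/* blocks to add: the size difference expressed in GDS blocks */
	new_blocks = (unsigned int)((native_size - size) / (blk_size / DISK_BLOCK_SIZE));
	/* bitmaps the extension will create implicitly, so subtract them from the request */
	new_bit_maps = DIVIDE_ROUND_UP(total_blks + new_blocks, bplmap) - DIVIDE_ROUND_UP(total_blks, bplmap);
	printf("request gdsfilext(%u, %u)\n", new_blocks - new_bit_maps, total_blks);	/* gdsfilext(499, 10000) */
	return 0;
}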
/* This function is called primarily to append a new histinfo record to the replication instance file by one of the following * 1) MUPIP REPLIC -SOURCE -START -ROOTPRIMARY command (after forking the child source server) if it created the journal pool. * 2) MUPIP REPLIC -SOURCE -ACTIVATE -ROOTPRIMARY command if this is a propagating primary to root primary transition. * In addition, this function also initializes the "lms_group_info" field in the instance file (from the "inst_info" field) * if the current value is NULL. */ void gtmsource_rootprimary_init(seq_num start_seqno) { unix_db_info *udi; repl_histinfo histinfo; boolean_t was_crit, switch_jnl; gd_region *reg, *region_top; jnl_private_control *jpc; jnl_buffer_ptr_t jbp; uint4 jnl_status; udi = FILE_INFO(jnlpool.jnlpool_dummy_reg); assert(NULL != jnlpool.repl_inst_filehdr); /* Update journal pool fields to reflect this is a root primary startup and updates are enabled */ assert(!udi->s_addrs.hold_onto_crit || jgbl.onlnrlbk); was_crit = udi->s_addrs.now_crit; if (!was_crit) grab_lock(jnlpool.jnlpool_dummy_reg, TRUE, ASSERT_NO_ONLINE_ROLLBACK); jnlpool.repl_inst_filehdr->root_primary_cycle++; /* If this instance is transitioning from a non-rootprimary to rootprimary, switch journal files. * This helps with maintaining accurate value of csd->zqgblmod_tn when the former primary connects * to the current primary through a fetchresync-rollback or receiver-server-autorollback.. */ switch_jnl = (!jnlpool.repl_inst_filehdr->was_rootprimary && (0 < jnlpool.repl_inst_filehdr->num_histinfo)); jnlpool.repl_inst_filehdr->was_rootprimary = TRUE; assert(start_seqno >= jnlpool.jnlpool_ctl->start_jnl_seqno); assert(start_seqno == jnlpool.jnlpool_ctl->jnl_seqno); jnlpool.repl_inst_filehdr->jnl_seqno = start_seqno; assert(jgbl.onlnrlbk || jnlpool.jnlpool_ctl->upd_disabled); if (!jgbl.onlnrlbk) jnlpool.jnlpool_ctl->upd_disabled = FALSE; if (IS_REPL_INST_UUID_NULL(jnlpool.repl_inst_filehdr->lms_group_info)) { /* This is the first time this instance is being brought up either as a root primary or as a propagating * primary. Initialize the "lms_group_info" fields in the instance file header in journal pool shared memory. * They will be flushed to the instance file as part of the "repl_inst_histinfo_add -> repl_inst_flush_filehdr" * function invocation below. 
*/ assert('\0' == jnlpool.repl_inst_filehdr->lms_group_info.created_nodename[0]); assert('\0' == jnlpool.repl_inst_filehdr->lms_group_info.this_instname[0]); assert(!jnlpool.repl_inst_filehdr->lms_group_info.creator_pid); jnlpool.repl_inst_filehdr->lms_group_info = jnlpool.repl_inst_filehdr->inst_info; assert('\0' != jnlpool.repl_inst_filehdr->lms_group_info.created_nodename[0]); DBG_CHECK_CREATED_NODENAME(jnlpool.repl_inst_filehdr->lms_group_info.created_nodename); assert('\0' != jnlpool.repl_inst_filehdr->lms_group_info.this_instname[0]); assert(jnlpool.repl_inst_filehdr->lms_group_info.created_time); assert(jnlpool.repl_inst_filehdr->lms_group_info.creator_pid); } /* Initialize histinfo fields */ memcpy(histinfo.root_primary_instname, jnlpool.repl_inst_filehdr->inst_info.this_instname, MAX_INSTNAME_LEN - 1); histinfo.root_primary_instname[MAX_INSTNAME_LEN - 1] = '\0'; assert('\0' != histinfo.root_primary_instname[0]); histinfo.start_seqno = start_seqno; assert(jnlpool.jnlpool_ctl->strm_seqno[0] == jnlpool.repl_inst_filehdr->strm_seqno[0]); assert(jnlpool.repl_inst_filehdr->is_supplementary || (0 == jnlpool.jnlpool_ctl->strm_seqno[0])); histinfo.strm_seqno = (!jnlpool.repl_inst_filehdr->is_supplementary) ? 0 : jnlpool.jnlpool_ctl->strm_seqno[0]; histinfo.root_primary_cycle = jnlpool.repl_inst_filehdr->root_primary_cycle; assert(process_id == getpid()); histinfo.creator_pid = process_id; JNL_SHORT_TIME(histinfo.created_time); histinfo.strm_index = 0; histinfo.history_type = HISTINFO_TYPE_NORMAL; NULL_INITIALIZE_REPL_INST_UUID(histinfo.lms_group); /* The following fields will be initialized in the "repl_inst_histinfo_add" function call below. * histinfo.histinfo_num * histinfo.prev_histinfo_num * histinfo.last_histinfo_num[] */ /* Add the histinfo record to the instance file and flush the changes in the journal pool to the file header */ repl_inst_histinfo_add(&histinfo); if (!was_crit) rel_lock(jnlpool.jnlpool_dummy_reg); if (switch_jnl) { SET_GBL_JREC_TIME; /* jnl_ensure_open/jnl_file_extend and its callees assume jgbl.gbl_jrec_time is set */ for (reg = gd_header->regions, region_top = gd_header->regions + gd_header->n_regions; reg < region_top; reg++) { gv_cur_region = reg; change_reg(); /* sets cs_addrs/cs_data (needed by jnl_ensure_open) */ if (!JNL_ENABLED(cs_addrs)) continue; grab_crit(gv_cur_region); jpc = cs_addrs->jnl; /* Before writing to jnlfile, adjust jgbl.gbl_jrec_time if needed to maintain time order of jnl * records. This needs to be done BEFORE the jnl_ensure_open as that could write journal records * (if it decides to switch to a new journal file) */ jbp = jpc->jnl_buff; ADJUST_GBL_JREC_TIME(jgbl, jbp); jnl_status = jnl_ensure_open(); if (0 == jnl_status) { if (EXIT_ERR == SWITCH_JNL_FILE(jpc)) rts_error_csa(CSA_ARG(cs_addrs) VARLSTCNT(4) ERR_JNLEXTEND, 2, JNL_LEN_STR(cs_data)); } else { if (SS_NORMAL != jpc->status) rts_error_csa(CSA_ARG(cs_addrs) VARLSTCNT(7) jnl_status, 4, JNL_LEN_STR(cs_data), DB_LEN_STR(gv_cur_region), jpc->status); else rts_error_csa(CSA_ARG(cs_addrs) VARLSTCNT(6) jnl_status, 4, JNL_LEN_STR(cs_data), DB_LEN_STR(gv_cur_region)); } rel_crit(gv_cur_region); } } }
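/* Illustrative sketch only (not GT.M source): the histinfo initialization above copies at most MAX_INSTNAME_LEN - 1
 * bytes of the instance name and then forces a terminating NUL, so the field is a valid C string even when the source
 * name fills its buffer. MAX_INSTNAME_LEN = 16 is assumed here for the demo. */
#include <stdio.h>
#include <string.h>

#define MAX_INSTNAME_LEN	16

int main(void)
{
	char	src[MAX_INSTNAME_LEN] = "PRIMARYINSTANCE";	/* 15 chars + NUL: exactly fills the field */
	char	dst[MAX_INSTNAME_LEN];

	memcpy(dst, src, MAX_INSTNAME_LEN - 1);
	dst[MAX_INSTNAME_LEN - 1] = '\0';	/* unconditional terminator, as in the code above */
	printf("instname = \"%s\"\n", dst);
	return 0;
}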
void ccp_tr_opendb3(ccp_action_record *rec) { ccp_db_header *db; sgmnt_addrs *csa; sgmnt_data *csd; jnl_private_control *jpc; uint4 status; db = rec->v.h; db->master_map_start_tn = 0; csa = db->segment; csd = db->glob_sec = csa->hdr = csa->db_addrs[0]; adawi(1, &csa->nl->ref_cnt); /* GT.M process that sent open request */ csa->lock_addrs[0] = (unsigned char *)csd + (LOCK_BLOCK(csd) * DISK_BLOCK_SIZE); csa->lock_addrs[1] = csa->lock_addrs[0] + csd->lock_space_size; if (JNL_ENABLED(csd)) { jpc = csa->jnl = malloc(SIZEOF(jnl_private_control)); memset(jpc, 0, SIZEOF(jnl_private_control)); jpc->region = db->greg; jpc->jnl_buff = csa->lock_addrs[1] + CACHE_CONTROL_SIZE(csd) + JNL_NAME_EXP_SIZE; FILE_INFO(db->greg)->s_addrs.jnl->jnllsb->lockid = 0; if (db->wm_iosb.valblk[CCP_VALBLK_JNL_ADDR] == 0) { /* Open jnl file based on data in file header, first machine to open */ jnl_file_open(db->greg, FALSE, ccp_closejnl_ast); if (jpc->channel == 0) { /* Open failed */ ccp_close1(db); ccp_signal_cont(ERR_CCPJNLOPNERR); /***** Is this reasonable? *****/ return; } /* Write out file id for other machines */ csd->trans_hist.ccp_jnl_filesize = jpc->jnl_buff->filesize; csd->ccp_jnl_before = jpc->jnl_buff->before_images; status = sys$qio(0, FILE_INFO(db->greg)->fab->fab$l_stv, IO$_WRITEVBLK, &db->qio_iosb, ccp_opendb3a, db, csd, (MM_BLOCK - 1) * OS_PAGELET_SIZE, 1, 0, 0, 0); } else { /* ??? --> */ /* Open jnl file based on id in header. Read in header to make sure get file id from first machine */ status = sys$qio(0, FILE_INFO(db->greg)->fab->fab$l_stv, IO$_READVBLK, &db->qio_iosb, ccp_opendb3a, db, csd, (MM_BLOCK - 1) * OS_PAGELET_SIZE, 1, 0, 0, 0); } if (status != SS$_NORMAL) { ccp_signal_cont(status); /***** Is this reasonable? *****/ ccp_close1(db); } } else ccp_opendb3b(db); }
uint4 gdsfilext (uint4 blocks, uint4 filesize) { sm_uc_ptr_t old_base[2]; boolean_t was_crit, need_to_restore_mask = FALSE; char *buff; int mm_prot, result, save_errno, status; uint4 new_bit_maps, bplmap, map, new_blocks, new_total, max_tot_blks; uint4 jnl_status, to_wait, to_msg, wait_period; GTM_BAVAIL_TYPE avail_blocks; sgmnt_data_ptr_t tmp_csd; off_t new_eof; trans_num curr_tn; unix_db_info *udi; sigset_t savemask; inctn_opcode_t save_inctn_opcode; int4 prev_extend_blks_to_upgrd; jnl_private_control *jpc; jnl_buffer_ptr_t jbp; error_def(ERR_DBFILERR); error_def(ERR_DBFILEXT); error_def(ERR_DSKSPACEFLOW); error_def(ERR_JNLFLUSH); error_def(ERR_TEXT); error_def(ERR_TOTALBLKMAX); error_def(ERR_WAITDSKSPACE); #ifdef __hppa if (dba_mm == cs_addrs->hdr->acc_meth) return (uint4)(NO_FREE_SPACE); /* should this be changed to show extension not allowed ? */ #endif /* Both blocks and total blocks are unsigned ints so make sure we aren't asking for huge numbers that will overflow and end up doing silly things. */ assert(blocks <= (MAXTOTALBLKS(cs_data) - cs_data->trans_hist.total_blks)); if (!blocks) return (uint4)(NO_FREE_SPACE); /* should this be changed to show extension not enabled ? */ bplmap = cs_data->bplmap; /* new total of non-bitmap blocks will be number of current, non-bitmap blocks, plus new blocks desired There are (bplmap - 1) non-bitmap blocks per bitmap, so add (bplmap - 2) to number of non-bitmap blocks and divide by (bplmap - 1) to get total number of bitmaps for expanded database. (must round up in this manner as every non-bitmap block must have an associated bitmap) Current number of bitmaps is (total number of current blocks + bplmap - 1) / bplmap. Subtract current number of bitmaps from number needed for expanded database to get number of new bitmaps needed. */ new_bit_maps = DIVIDE_ROUND_UP(cs_data->trans_hist.total_blks - DIVIDE_ROUND_UP(cs_data->trans_hist.total_blks, bplmap) + blocks, bplmap - 1) - DIVIDE_ROUND_UP(cs_data->trans_hist.total_blks, bplmap); new_blocks = blocks + new_bit_maps; assert(0 < (int)new_blocks); udi = FILE_INFO(gv_cur_region); if (0 != (save_errno = disk_block_available(udi->fd, &avail_blocks, FALSE))) { send_msg(VARLSTCNT(5) ERR_DBFILERR, 2, DB_LEN_STR(gv_cur_region), save_errno); rts_error(VARLSTCNT(5) ERR_DBFILERR, 2, DB_LEN_STR(gv_cur_region), save_errno); } else { avail_blocks = avail_blocks / (cs_data->blk_size / DISK_BLOCK_SIZE); if ((blocks * EXTEND_WARNING_FACTOR) > avail_blocks) { send_msg(VARLSTCNT(5) ERR_DSKSPACEFLOW, 3, DB_LEN_STR(gv_cur_region), (uint4)(avail_blocks - ((new_blocks <= avail_blocks) ? new_blocks : 0))); #ifndef __MVS__ if (blocks > (uint4)avail_blocks) return (uint4)(NO_FREE_SPACE); #endif } } cs_addrs->extending = TRUE; was_crit = cs_addrs->now_crit; /* If we are coming from mupip_extend (which gets crit itself) we better have waited for any unfreezes to occur */ assert(!was_crit || CDB_STAGNATE == t_tries || FALSE == cs_data->freeze); for ( ; ; ) { /* If we are in the final retry and already hold crit, it is possible that csd->wc_blocked is also set to TRUE * (by a concurrent process in phase2 which encountered an error in the midst of commit and secshr_db_clnup * finished the job for it). In this case we do NOT want to invoke wcs_recover as that will update the "bt" * transaction numbers without correspondingly updating the history transaction numbers (effectively causing * a cdb_sc_blkmod type of restart). Therefore do NOT call grab_crit (which unconditionally invokes wcs_recover) * if we already hold crit. 
*/ if (!was_crit) grab_crit(gv_cur_region); if (FALSE == cs_data->freeze) break; rel_crit(gv_cur_region); if (was_crit) { /* Two cases. * (i) Final retry and in TP. We might be holding crit in other regions too. * We can't do a grab_crit() on this region again unless it is deadlock-safe. * To be on the safer side, we do a restart. The tp_restart() logic will wait * for this region's freeze to be removed before grabbing crit. * (ii) Final retry and not in TP. In that case too, it is better to restart in case there is * some validation code that shortcuts the checking for the final retry assuming we were * in crit from t_begin() to t_end(). t_retry() has logic that will wait for unfreeze. * In either case, we need to restart. Returning EXTEND_UNFREEZECRIT will cause one in t_end/tp_tend. */ return (uint4)(EXTEND_UNFREEZECRIT); } while (cs_data->freeze) hiber_start(1000); } assert(cs_addrs->ti->total_blks == cs_data->trans_hist.total_blks); if (cs_data->trans_hist.total_blks != filesize) { /* somebody else has already extended it, since we are in crit, this is trust-worthy * however, in case of MM, we still need to remap the database */ assert(cs_data->trans_hist.total_blks > filesize); GDSFILEXT_CLNUP; return (SS_NORMAL); } if (run_time && (2 * ((0 < dollar_tlevel) ? sgm_info_ptr->cw_set_depth : cw_set_depth) < cs_addrs->ti->free_blocks)) { if (FALSE == was_crit) { rel_crit(gv_cur_region); return (uint4)(EXTEND_SUSPECT); } /* If free_blocks counter is not ok, then correct it. Do the check again. If still fails, then GTMASSERT. */ if (is_free_blks_ctr_ok() || (2 * ((0 < dollar_tlevel) ? sgm_info_ptr->cw_set_depth : cw_set_depth) < cs_addrs->ti->free_blocks)) GTMASSERT; /* held crit through bm_getfree into gdsfilext and still didn't get it right */ } if (JNL_ENABLED(cs_data)) { if (!jgbl.dont_reset_gbl_jrec_time) SET_GBL_JREC_TIME; /* needed before jnl_ensure_open as that can write jnl records */ jpc = cs_addrs->jnl; jbp = jpc->jnl_buff; /* Before writing to jnlfile, adjust jgbl.gbl_jrec_time if needed to maintain time order * of jnl records. This needs to be done BEFORE the jnl_ensure_open as that could write * journal records (if it decides to switch to a new journal file). 
*/ ADJUST_GBL_JREC_TIME(jgbl, jbp); jnl_status = jnl_ensure_open(); if (jnl_status) { GDSFILEXT_CLNUP; send_msg(VARLSTCNT(6) jnl_status, 4, JNL_LEN_STR(cs_data), DB_LEN_STR(gv_cur_region)); return (uint4)(NO_FREE_SPACE); /* should have better return status */ } } if (dba_mm == cs_addrs->hdr->acc_meth) { #if defined(UNTARGETED_MSYNC) status = msync((caddr_t)cs_addrs->db_addrs[0], (size_t)(cs_addrs->db_addrs[1] - cs_addrs->db_addrs[0]), MS_SYNC); #else cs_addrs->nl->mm_extender_pid = process_id; status = wcs_wtstart(gv_cur_region, 0); cs_addrs->nl->mm_extender_pid = 0; if (0 != cs_addrs->acc_meth.mm.mmblk_state->mmblkq_active.fl) GTMASSERT; status = 0; #endif if (0 == status) { /* Block SIGALRM for the duration when cs_data and cs_addrs are out of sync */ sigprocmask(SIG_BLOCK, &blockalrm, &savemask); need_to_restore_mask = TRUE; tmp_csd = cs_data; cs_data = (sgmnt_data_ptr_t)malloc(sizeof(*cs_data)); memcpy((sm_uc_ptr_t)cs_data, (uchar_ptr_t)tmp_csd, sizeof(*cs_data)); status = munmap((caddr_t)cs_addrs->db_addrs[0], (size_t)(cs_addrs->db_addrs[1] - cs_addrs->db_addrs[0])); #ifdef DEBUG_DB64 if (-1 != status) rel_mmseg((caddr_t)cs_addrs->db_addrs[0]); #endif } else tmp_csd = NULL; if (0 != status) { if (tmp_csd) { free(cs_data); cs_data = tmp_csd; } GDSFILEXT_CLNUP; send_msg(VARLSTCNT(5) ERR_DBFILERR, 2, DB_LEN_STR(gv_cur_region), status); return (uint4)(NO_FREE_SPACE); } cs_addrs->hdr = cs_data; cs_addrs->ti = &cs_data->trans_hist; } if (new_blocks + cs_data->trans_hist.total_blks > MAXTOTALBLKS(cs_data)) { GDSFILEXT_CLNUP; send_msg(VARLSTCNT(1) ERR_TOTALBLKMAX); return (uint4)(NO_FREE_SPACE); } CHECK_TN(cs_addrs, cs_data, cs_data->trans_hist.curr_tn); /* can issue rts_error TNTOOLARGE */ assert(0 < (int)new_blocks); new_total = cs_data->trans_hist.total_blks + new_blocks; new_eof = ((off_t)(cs_data->start_vbn - 1) * DISK_BLOCK_SIZE) + ((off_t)new_total * cs_data->blk_size); buff = (char *)malloc(DISK_BLOCK_SIZE); memset(buff, 0, DISK_BLOCK_SIZE); LSEEKWRITE(udi->fd, new_eof, buff, DISK_BLOCK_SIZE, save_errno); if ((ENOSPC == save_errno) && run_time) { /* try to write it every second, and send message to operator * log every 1/20 of cs_data->wait_disk_space */ wait_period = to_wait = DIVIDE_ROUND_UP(cs_data->wait_disk_space, CDB_STAGNATE + 1); to_msg = (to_wait / 8) ? (to_wait / 8) : 1; /* send around 8 messages during 1 wait_period */ while ((to_wait > 0) && (ENOSPC == save_errno)) { if ((to_wait == cs_data->wait_disk_space) || (to_wait % to_msg == 0)) { send_msg(VARLSTCNT(11) ERR_WAITDSKSPACE, 4, process_id, to_wait + (CDB_STAGNATE - t_tries) * wait_period, DB_LEN_STR(gv_cur_region), ERR_TEXT, 2, RTS_ERROR_TEXT("Please make more disk space available or shutdown GT.M to avoid data loss"), save_errno); gtm_putmsg(VARLSTCNT(11) ERR_WAITDSKSPACE, 4, process_id, to_wait + (CDB_STAGNATE - t_tries) * wait_period, DB_LEN_STR(gv_cur_region), ERR_TEXT, 2, RTS_ERROR_TEXT("Please make more disk space available or shutdown GT.M to avoid data loss"), save_errno); } if (!was_crit) rel_crit(gv_cur_region); hiber_start(1000); to_wait--; if (!was_crit) grab_crit(gv_cur_region); LSEEKWRITE(udi->fd, new_eof, buff, DISK_BLOCK_SIZE, save_errno); } } free(buff); if (0 != save_errno) { GDSFILEXT_CLNUP; if (ENOSPC == save_errno) return (uint4)(NO_FREE_SPACE); send_msg(VARLSTCNT(5) ERR_DBFILERR, 2, DB_LEN_STR(gv_cur_region), save_errno); return (uint4)(NO_FREE_SPACE); } DEBUG_ONLY(prev_extend_blks_to_upgrd = cs_data->blks_to_upgrd;)
/* Finds a free block and adds information to update array and cw_set */ block_id swap_root_or_directory_block(int parent_blk_lvl, int child_blk_lvl, srch_hist *dir_hist_ptr, block_id child_blk_id, sm_uc_ptr_t child_blk_ptr, kill_set *kill_set_list, trans_num curr_tn) { sgmnt_data_ptr_t csd; sgmnt_addrs *csa; node_local_ptr_t cnl; srch_blk_status bmlhist, freeblkhist; block_id hint_blk_num, free_blk_id, parent_blk_id; boolean_t free_blk_recycled; int4 master_bit, num_local_maps, free_bit, hint_bit, maxbitsthismap; uint4 total_blks; int blk_seg_cnt, blk_size; sm_uc_ptr_t parent_blk_ptr, bn_ptr, saved_blk; blk_segment *bs1, *bs_ptr; int parent_blk_size, child_blk_size, bsiz; int rec_size1, curr_offset, bpntr_end, hdr_len; int tmp_cmpc; cw_set_element *tmpcse; jnl_buffer_ptr_t jbbp; /* jbbp is non-NULL only if before-image journaling */ unsigned short temp_ushort; unsigned long temp_long; unsigned char save_cw_set_depth; DCL_THREADGBL_ACCESS; SETUP_THREADGBL_ACCESS; csd = cs_data; csa = cs_addrs; cnl = csa->nl; blk_size = csd->blk_size; /* Find a free/recycled block for new block location. */ hint_blk_num = 0; total_blks = csa->ti->total_blks; num_local_maps = DIVIDE_ROUND_UP(total_blks, BLKS_PER_LMAP); master_bit = bmm_find_free((hint_blk_num / BLKS_PER_LMAP), csa->bmm, num_local_maps); if ((NO_FREE_SPACE == master_bit)) { t_abort(gv_cur_region, csa); return ABORT_SWAP; } bmlhist.blk_num = (block_id)master_bit * BLKS_PER_LMAP; if (NULL == (bmlhist.buffaddr = t_qread(bmlhist.blk_num, (sm_int_ptr_t)&bmlhist.cycle, &bmlhist.cr))) { assert(t_tries < CDB_STAGNATE); t_retry((enum cdb_sc)rdfail_detail); return RETRY_SWAP; } hint_bit = 0; maxbitsthismap = (master_bit != (num_local_maps - 1)) ? BLKS_PER_LMAP : total_blks - bmlhist.blk_num; free_bit = bm_find_blk(hint_bit, bmlhist.buffaddr + SIZEOF(blk_hdr), maxbitsthismap, &free_blk_recycled); free_blk_id = bmlhist.blk_num + free_bit; if (DIR_ROOT >= free_blk_id) { /* Bitmap block 0 and directory tree root block 1 should always be marked busy. */ assert(t_tries < CDB_STAGNATE); t_retry(cdb_sc_badbitmap); return RETRY_SWAP; } if (child_blk_id <= free_blk_id) { /* stop swapping root or DT blocks once the database is truncated well enough. A good heuristic for this is to check * if the block is to be swapped into a higher block number and if so do not swap */ t_abort(gv_cur_region, csa); return ABORT_SWAP; } /* ====== begin update array ====== * Four blocks get changed. * 1. Free block becomes busy and gains the contents of child (root block/directory tree block) * 2. Parent block in directory tree remains busy, but points to new root block location. * 3. Free block's corresponding bitmap reflects above change. * 4. Child block gets marked recycled in bitmap. (GVCST_BMP_MARK_FREE) */ parent_blk_ptr = dir_hist_ptr->h[parent_blk_lvl].buffaddr; /* parent_blk_lvl is 0 iff we're moving a gvt root block */ parent_blk_id = dir_hist_ptr->h[parent_blk_lvl].blk_num; CHECK_AND_RESET_UPDATE_ARRAY; if (free_blk_recycled) { /* Otherwise, it's a completely free block, in which case no need to read. 
*/ freeblkhist.blk_num = (block_id)free_blk_id; if (NULL == (freeblkhist.buffaddr = t_qread(free_blk_id, (sm_int_ptr_t)&freeblkhist.cycle, &freeblkhist.cr))) { assert(t_tries < CDB_STAGNATE); t_retry((enum cdb_sc)rdfail_detail); return RETRY_SWAP; } } child_blk_size = ((blk_hdr_ptr_t)child_blk_ptr)->bsiz; BLK_INIT(bs_ptr, bs1); BLK_ADDR(saved_blk, child_blk_size, unsigned char); memcpy(saved_blk, child_blk_ptr, child_blk_size); BLK_SEG(bs_ptr, saved_blk + SIZEOF(blk_hdr), child_blk_size - SIZEOF(blk_hdr)); assert(blk_seg_cnt == child_blk_size); if (!BLK_FINI(bs_ptr, bs1)) { assert(t_tries < CDB_STAGNATE); t_retry(cdb_sc_blkmod); return RETRY_SWAP; } tmpcse = &cw_set[cw_set_depth]; (free_blk_recycled) ? BIT_SET_RECYCLED_AND_CLEAR_FREE(tmpcse->blk_prior_state) : BIT_CLEAR_RECYCLED_AND_SET_FREE(tmpcse->blk_prior_state); t_create(free_blk_id, (unsigned char *)bs1, 0, 0, child_blk_lvl); tmpcse->mode = gds_t_acquired; if (!free_blk_recycled || !cs_data->db_got_to_v5_once) tmpcse->old_block = NULL; else { tmpcse->old_block = freeblkhist.buffaddr; tmpcse->cr = freeblkhist.cr; tmpcse->cycle = freeblkhist.cycle; jbbp = (JNL_ENABLED(csa) && csa->jnl_before_image) ? csa->jnl->jnl_buff : NULL; if ((NULL != jbbp) && (((blk_hdr_ptr_t)tmpcse->old_block)->tn < jbbp->epoch_tn)) { bsiz = ((blk_hdr_ptr_t)(tmpcse->old_block))->bsiz; if (bsiz > blk_size) { assert(CDB_STAGNATE > t_tries); t_retry(cdb_sc_lostbmlcr); return RETRY_SWAP; } JNL_GET_CHECKSUM_ACQUIRED_BLK(tmpcse, csd, csa, tmpcse->old_block, bsiz); } } /* 2. Parent block in directory tree remains busy, but points to new child block location. */ curr_offset = dir_hist_ptr->h[parent_blk_lvl].curr_rec.offset; parent_blk_size = ((blk_hdr_ptr_t)parent_blk_ptr)->bsiz; GET_RSIZ(rec_size1, (parent_blk_ptr + curr_offset)); if ((parent_blk_size < rec_size1 + curr_offset) || (BSTAR_REC_SIZE > rec_size1)) { assert(t_tries < CDB_STAGNATE); t_retry(cdb_sc_blkmod); return RETRY_SWAP; } BLK_INIT(bs_ptr, bs1); if (0 == parent_blk_lvl) /* There can be collation stuff in the record value after the block pointer. See gvcst_root_search. */ hdr_len = SIZEOF(rec_hdr) + gv_altkey->end + 1 - EVAL_CMPC((rec_hdr_ptr_t)(parent_blk_ptr + curr_offset)); else hdr_len = rec_size1 - SIZEOF(block_id); bpntr_end = curr_offset + hdr_len + SIZEOF(block_id); BLK_SEG(bs_ptr, parent_blk_ptr + SIZEOF(blk_hdr), curr_offset + hdr_len - SIZEOF(blk_hdr)); BLK_ADDR(bn_ptr, SIZEOF(block_id), unsigned char); PUT_LONG(bn_ptr, free_blk_id); BLK_SEG(bs_ptr, bn_ptr, SIZEOF(block_id)); BLK_SEG(bs_ptr, parent_blk_ptr + bpntr_end, parent_blk_size - bpntr_end); assert(blk_seg_cnt == parent_blk_size); if (!BLK_FINI(bs_ptr, bs1)) { assert(t_tries < CDB_STAGNATE); t_retry(cdb_sc_blkmod); return RETRY_SWAP; } t_write(&dir_hist_ptr->h[parent_blk_lvl], (unsigned char *)bs1, 0, 0, parent_blk_lvl, FALSE, TRUE, GDS_WRITE_KILLTN); /* To indicate later snapshot file writing process during fast_integ not to skip writing the block to snapshot file */ BIT_SET_DIR_TREE(cw_set[cw_set_depth-1].blk_prior_state); /* 3. Free block's corresponding bitmap reflects above change. 
*/ PUT_LONG(update_array_ptr, free_bit); save_cw_set_depth = cw_set_depth; /* Bit maps go on end of cw_set (more fake acquired) */ assert(!cw_map_depth); t_write_map(&bmlhist, (uchar_ptr_t)update_array_ptr, curr_tn, 1); cw_map_depth = cw_set_depth; cw_set_depth = save_cw_set_depth; update_array_ptr += SIZEOF(block_id); temp_long = 0; PUT_LONG(update_array_ptr, temp_long); update_array_ptr += SIZEOF(block_id); assert(1 == cw_set[cw_map_depth - 1].reference_cnt); /* 4. Child block gets marked recycled in bitmap. (GVCST_BMP_MARK_FREE) */ kill_set_list->blk[kill_set_list->used].flag = 0; kill_set_list->blk[kill_set_list->used].level = 0; kill_set_list->blk[kill_set_list->used++].block = child_blk_id; return free_blk_id; }
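/* Illustrative sketch only (not GT.M source): bm_find_blk, used by swap_root_or_directory_block above, scans a local
 * bitmap that spends two bits per block and reports whether the free block it found was recycled. The encoding below
 * (bit 0 = free, bit 1 = recycled-if-free) is a hypothetical stand-in for the real GDS encoding; the sketch only
 * shows the 2-bits-per-entry scan and the out-parameter pattern. */
#include <stdio.h>

typedef int	boolean_t;
#define TRUE	1
#define FALSE	0

/* Return the first bit index at or after "hint" whose block is free; -1 if none */
static int find_free_2bit(const unsigned char *map, int nblks, int hint, boolean_t *recycled)
{
	int		i;
	unsigned int	two;

	for (i = hint; i < nblks; i++)
	{
		two = (map[i / 4] >> ((i % 4) * 2)) & 3;	/* four 2-bit entries per byte */
		if (two & 1)					/* free bit set */
		{
			*recycled = (two & 2) ? TRUE : FALSE;
			return i;
		}
	}
	return -1;
}

int main(void)
{
	unsigned char	map[2] = {0x00, 0x30};	/* entries 0-4 busy; entry 5 = 11b: free and recycled */
	boolean_t	recycled;
	int		bit = find_free_2bit(map, 8, 0, &recycled);

	printf("free bit = %d, recycled = %d\n", bit, recycled);	/* prints: 5, 1 */
	return 0;
}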
void bm_setmap(block_id bml, block_id blk, int4 busy) { sm_uc_ptr_t bmp; trans_num ctn; srch_hist alt_hist; srch_blk_status blkhist; /* block-history to fill in for t_write_map which uses "blk_num", "buffaddr", "cr", "cycle" */ cw_set_element *cse; int lbm_status; /* local bitmap status of input "blk" i.e. BUSY or FREE or RECYCLED */ int4 reference_cnt; uint4 bitnum; error_def(ERR_DSEFAIL); t_begin_crit(ERR_DSEFAIL); ctn = cs_addrs->ti->curr_tn; if (!(bmp = t_qread(bml, &blkhist.cycle, &blkhist.cr))) t_retry((enum cdb_sc)rdfail_detail); blkhist.blk_num = bml; blkhist.buffaddr = bmp; alt_hist.h[0].blk_num = 0; /* Need for calls to T_END for bitmaps */ CHECK_AND_RESET_UPDATE_ARRAY; /* reset update_array_ptr to update_array */ bitnum = blk - bml; /* Find out current status in order to determine if there is going to be a state transition */ assert(ROUND_DOWN2(blk, cs_data->bplmap) == bml); GET_BM_STATUS(bmp, bitnum, lbm_status); switch(lbm_status) { case BLK_BUSY: reference_cnt = busy ? 0 : -1; break; case BLK_FREE: case BLK_MAPINVALID: case BLK_RECYCLED: assert(BLK_MAPINVALID != lbm_status); reference_cnt = busy ? 1 : 0; break; default: assert(FALSE); break; } if (reference_cnt) { /* Initialize update array with non-zero bitnum only if reference_cnt is non-zero. */ assert(bitnum); *((block_id_ptr_t)update_array_ptr) = bitnum; update_array_ptr += sizeof(block_id); } /* Terminate update array unconditionally with zero bitnum. */ *((block_id_ptr_t)update_array_ptr) = 0; update_array_ptr += sizeof(block_id); t_write_map(&blkhist, (uchar_ptr_t)update_array, ctn, reference_cnt); if (JNL_ENABLED(cs_data)) { cse = (cw_set_element *)(&cw_set[0]); cse->new_buff = non_tp_jfb_buff_ptr; memcpy(non_tp_jfb_buff_ptr, bmp, ((blk_hdr_ptr_t)bmp)->bsiz); gvcst_map_build((uint4 *)cse->upd_addr, (uchar_ptr_t)cse->new_buff, cse, cs_addrs->ti->curr_tn); cse->done = TRUE; } /* Call t_end till it succeeds or aborts (error will be reported) */ while ((trans_num)0 == t_end(&alt_hist, 0)) ; return; }
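/* Illustrative sketch only (not GT.M source): bm_setmap hands t_write_map a small "update array" of block offsets
 * within the local map, terminated by a 0 entry; since 0 is the terminator, the code above asserts bitnum is non-zero
 * before storing it. A minimal builder/reader for that convention, with block_id assumed to be a 4-byte int: */
#include <stdio.h>

typedef int	int4;
typedef int4	block_id;

/* Append each non-zero bitnum, then the 0 terminator; returns entries written */
static int build_map_update(block_id *upd, const block_id *bitnums, int n)
{
	int	i, j;

	for (i = 0, j = 0; i < n; i++)
		if (0 != bitnums[i])
			upd[j++] = bitnums[i];
	upd[j++] = 0;	/* unconditional terminator, as in bm_setmap */
	return j;
}

int main(void)
{
	block_id	upd[8], bits[2] = {37, 0};
	int		n = build_map_update(upd, bits, 2), i;

	for (i = 0; i < n; i++)	/* a consumer like t_write_map walks entries until the 0 terminator */
		printf("upd[%d] = %d\n", i, upd[i]);
	return 0;
}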
int gtmsource_checkhealth(void) { uint4 gtmsource_pid; int status, semval, save_errno; boolean_t srv_alive, all_files_open; gtmsource_local_ptr_t gtmsourcelocal_ptr; int4 index, num_servers; seq_num reg_seqno, jnlseqno; gd_region *reg, *region_top; sgmnt_addrs *csa; sgmnt_data_ptr_t csd; char errtxt[OUT_BUFF_SIZE]; char *modestr; assert(holds_sem[SOURCE][JNL_POOL_ACCESS_SEM]); if (NULL != jnlpool.gtmsource_local) /* Check health of a specific source server */ gtmsourcelocal_ptr = jnlpool.gtmsource_local; else gtmsourcelocal_ptr = &jnlpool.gtmsource_local_array[0]; num_servers = 0; status = SRV_ALIVE; for (index = 0; index < NUM_GTMSRC_LCL; index++, gtmsourcelocal_ptr++) { if ('\0' == gtmsourcelocal_ptr->secondary_instname[0]) { assert(NULL == jnlpool.gtmsource_local); continue; } gtmsource_pid = gtmsourcelocal_ptr->gtmsource_pid; /* If CHECKHEALTH on a specific secondary instance is requested, print the health information irrespective * of whether a source server for that instance is alive or not. For CHECKHEALTH on ALL secondary instances * print health information only for those instances that have an active or passive source server alive. */ if ((NULL == jnlpool.gtmsource_local) && (0 == gtmsource_pid)) continue; repl_log(stdout, TRUE, TRUE, "Initiating CHECKHEALTH operation on source server pid [%d] for secondary instance" " name [%s]\n", gtmsource_pid, gtmsourcelocal_ptr->secondary_instname); srv_alive = (0 == gtmsource_pid) ? FALSE : is_proc_alive(gtmsource_pid, 0); if (srv_alive) { if (GTMSOURCE_MODE_ACTIVE == gtmsourcelocal_ptr->mode) modestr = "ACTIVE"; else if (GTMSOURCE_MODE_ACTIVE_REQUESTED == gtmsourcelocal_ptr->mode) modestr = "ACTIVE REQUESTED"; else if (GTMSOURCE_MODE_PASSIVE == gtmsourcelocal_ptr->mode) modestr = "PASSIVE"; else if (GTMSOURCE_MODE_PASSIVE_REQUESTED == gtmsourcelocal_ptr->mode) modestr = "PASSIVE REQUESTED"; else { assert(gtmsourcelocal_ptr->mode != gtmsourcelocal_ptr->mode); modestr = "UNKNOWN"; } repl_log(stderr, FALSE, TRUE, FORMAT_STR1, gtmsource_pid, "Source server", "", modestr); status |= SRV_ALIVE; num_servers++; } else { repl_log(stderr, FALSE, TRUE, FORMAT_STR, gtmsource_pid, "Source server", " NOT"); gtm_putmsg_csa(CSA_ARG(NULL) VARLSTCNT(4) ERR_SRCSRVNOTEXIST, 2, LEN_AND_STR(gtmsourcelocal_ptr->secondary_instname)); status |= SRV_DEAD; } if (NULL != jnlpool.gtmsource_local) break; } if (NULL == jnlpool.gtmsource_local) { /* Compare number of servers that were found alive with the current value of the COUNT semaphore. * If they are not equal, report the discrepancy. */ semval = get_sem_info(SOURCE, SRC_SERV_COUNT_SEM, SEM_INFO_VAL); if (-1 == semval) { save_errno = errno; repl_log(stderr, FALSE, TRUE, "Error fetching source server count semaphore value : %s\n", STRERROR(save_errno)); status |= SRV_ERR; } else if (semval != num_servers) { repl_log(stderr, FALSE, FALSE, "Error : Expected %d source server(s) to be alive but found %d actually alive\n", semval, num_servers); repl_log(stderr, FALSE, TRUE, "Error : Check if any pid reported above is NOT a source server process\n"); status |= SRV_ERR; } } /* Check that there are no regions with replication state = WAS_ON (i.e. repl_was_open). If so report that. * But to determine that, we need to attach to all the database regions. */ gvinit(); /* We use the same code dse uses to open all regions but we must make sure they are all open before proceeding. 
*/ all_files_open = region_init(FALSE); if (!all_files_open) { gtm_putmsg_csa(CSA_ARG(NULL) VARLSTCNT(1) ERR_NOTALLDBOPN); status |= SRV_ERR; } else { for (reg = gd_header->regions, region_top = gd_header->regions + gd_header->n_regions; reg < region_top; reg++) { csa = &FILE_INFO(reg)->s_addrs; csd = csa->hdr; if (REPL_WAS_ENABLED(csd)) { assert(!JNL_ENABLED(csd) || REPL_ENABLED(csd)); /* || is for turning replication on concurrently */ reg_seqno = csd->reg_seqno; jnlseqno = (NULL != jnlpool.jnlpool_ctl) ? jnlpool.jnlpool_ctl->jnl_seqno : MAX_SEQNO; sgtm_putmsg(errtxt, VARLSTCNT(8) ERR_REPLJNLCLOSED, 6, DB_LEN_STR(reg), &reg_seqno, &reg_seqno, &jnlseqno, &jnlseqno); repl_log(stderr, FALSE, TRUE, errtxt); status |= SRV_ERR; } } } if (jnlpool.jnlpool_ctl->freeze) { repl_log(stderr, FALSE, FALSE, "Warning: Instance Freeze is ON\n"); repl_log(stderr, FALSE, TRUE, " Freeze Comment: %s\n", jnlpool.jnlpool_ctl->freeze_comment); status |= SRV_ERR; } return (status + NORMAL_SHUTDOWN); }
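/* Illustrative sketch only (not GT.M source): gtmsource_checkhealth folds per-server findings into a status bitmask
 * and returns it biased by NORMAL_SHUTDOWN, so ALIVE, DEAD and ERR conditions can all be reported at once and decoded
 * independently by the caller. The flag values below are hypothetical stand-ins. */
#include <stdio.h>

#define SRV_ALIVE		0x1	/* hypothetical values */
#define SRV_DEAD		0x2
#define SRV_ERR			0x4
#define NORMAL_SHUTDOWN		0

int main(void)
{
	int	status = 0;

	status |= SRV_ALIVE;	/* one server responded */
	status |= SRV_ERR;	/* but the semaphore count disagreed with the live count */
	status += NORMAL_SHUTDOWN;
	printf("alive=%d dead=%d err=%d\n", !!(status & SRV_ALIVE), !!(status & SRV_DEAD), !!(status & SRV_ERR));
	return 0;
}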
void gds_rundown(void) { bool is_mm, we_are_last_user, we_are_last_writer; boolean_t ipc_deleted, remove_shm, cancelled_timer, cancelled_dbsync_timer, vermismatch; now_t now; /* for GET_CUR_TIME macro */ char *time_ptr, time_str[CTIME_BEFORE_NL + 2]; /* for GET_CUR_TIME macro */ gd_region *reg; int save_errno, status; int4 semval, ftok_semval, sopcnt, ftok_sopcnt; short crash_count; sm_long_t munmap_len; sgmnt_addrs *csa; sgmnt_data_ptr_t csd; struct shmid_ds shm_buf; struct sembuf sop[2], ftok_sop[2]; uint4 jnl_status; unix_db_info *udi; jnl_private_control *jpc; jnl_buffer_ptr_t jbp; error_def(ERR_CRITSEMFAIL); error_def(ERR_DBCCERR); error_def(ERR_DBFILERR); error_def(ERR_DBRNDWNWRN); error_def(ERR_ERRCALL); error_def(ERR_GBLOFLOW); error_def(ERR_GTMASSERT); error_def(ERR_IPCNOTDEL); error_def(ERR_JNLFLUSH); error_def(ERR_RNDWNSEMFAIL); error_def(ERR_TEXT); error_def(ERR_WCBLOCKED); forced_exit = FALSE; /* Okay, we're dying already -- let rel_crit live in peace now. * If coming through a DAL, not necessarily dying. what to do then? -- nars -- 8/15/2001 */ grabbed_access_sem = FALSE; jnl_status = 0; reg = gv_cur_region; /* Local copy */ /* * early out for cluster regions * to avoid tripping the assert below. * Note: * This early out is consistent with VMS. It has been * noted that all of the gtcm assignments * to gv_cur_region should use the TP_CHANGE_REG * macro. This would also avoid the assert problem * and should be done eventually. */ if (dba_cm == reg->dyn.addr->acc_meth) return; udi = FILE_INFO(reg); csa = &udi->s_addrs; csd = csa->hdr; assert(csa == cs_addrs && csd == cs_data); if ((reg->open) && (dba_usr == csd->acc_meth)) { change_reg(); gvusr_rundown(); return; } ESTABLISH(gds_rundown_ch); if (!reg->open) /* Not open, no point to rundown */ { if (reg->opening) /* Died partway open, kill rest of way */ { rel_crit(reg); mutex_cleanup(reg); /* revist this to handle MM properly SMW 98/12/16 if (NULL != csa->nl) { status = shmdt((caddr_t)csa->nl); if (-1 == status) send_msg(VARLSTCNT(9) ERR_DBFILERR, 2, DB_LEN_STR(reg), ERR_TEXT, 2, LEN_AND_LIT("Error during shmdt"), errno); } */ shmdt((caddr_t)csa->nl); csa->nl = NULL; } REVERT; return; } switch(csd->acc_meth) { /* Pass mm and bg through */ case dba_bg: is_mm = FALSE; break; case dba_mm: is_mm = TRUE; break; case dba_usr: assert(FALSE); default: REVERT; return; } /* Cancel any pending flush timer for this region by this task */ CANCEL_DB_TIMERS(reg, cancelled_timer, cancelled_dbsync_timer); we_are_last_user = FALSE; if (!csa->persistent_freeze) region_freeze(reg, FALSE, FALSE, FALSE); assert(!csa->read_lock); rel_crit(reg); /* get locks to known state */ mutex_cleanup(reg); /* * We need to guarantee that none else access database file header when semid/shmid fields are reset. * We already have created ftok semaphore in db_init or, mu_rndwn_file and did not remove it. * So just lock it. We do it in blocking mode. */ if (!ftok_sem_lock(reg, FALSE, FALSE)) rts_error(VARLSTCNT(4) ERR_DBFILERR, 2, DB_LEN_STR(reg)); /* * For mupip_jnl_recover we already have database access control semaphore. * We do not release it. We release it from mur_close_files. 
*/ if (!mupip_jnl_recover) { sop[0].sem_num = 0; sop[0].sem_op = 0; /* Wait for 0 */ sop[1].sem_num = 0; sop[1].sem_op = 1; /* Lock */ sopcnt = 2; sop[0].sem_flg = sop[1].sem_flg = SEM_UNDO | IPC_NOWAIT; /* Don't wait the first time thru */ SEMOP(udi->semid, sop, sopcnt, status); if (-1 == status) /* We couldn't get it in one shot -- see if we already have it */ { save_errno = errno; /* see comment about Linux specific difference in behaviour of semctl() with GETPID in gds_rundown_ch() */ if (semctl(udi->semid, 0, GETPID) == process_id) { send_msg(VARLSTCNT(5) MAKE_MSG_INFO(ERR_CRITSEMFAIL), 2, DB_LEN_STR(reg), ERR_RNDWNSEMFAIL); REVERT; return; /* Already in rundown for this region */ } if (EAGAIN != save_errno) { assert(FALSE); rts_error(VARLSTCNT(9) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg), ERR_TEXT, 2, RTS_ERROR_TEXT("gds_rundown first semop/semctl"), save_errno); } sop[0].sem_flg = sop[1].sem_flg = SEM_UNDO; /* Try again - blocking this time */ SEMOP(udi->semid, sop, 2, status); if (-1 == status) /* We couldn't get it at all.. */ rts_error(VARLSTCNT(5) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg), errno); } } grabbed_access_sem = TRUE; /* * We now have the dbinit/rundown lock, so we are alone in this code for this region * and nobody else can attach. * See if we are all alone in accessing this database shared memory. */ assert(csa->ref_cnt); /* decrement private ref_cnt before shared ref_cnt decrement. */ csa->ref_cnt--; /* Currently journaling logic in gds_rundown() in VMS relies on this order to detect last writer */ assert(!csa->ref_cnt); --csa->nl->ref_cnt; if (memcmp(csa->nl->now_running, gtm_release_name, gtm_release_name_len + 1)) { /* VERMISMATCH condition. Possible only if DSE */ assert(dse_running); vermismatch = TRUE; } else vermismatch = FALSE; if (-1 == shmctl(udi->shmid, IPC_STAT, &shm_buf)) { save_errno = errno; rts_error(VARLSTCNT(9) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg), ERR_TEXT, 2, RTS_ERROR_TEXT("gds_rundown shmctl"), save_errno); } else we_are_last_user = (1 == shm_buf.shm_nattch) && !vermismatch; assert(!mupip_jnl_recover || we_are_last_user); /* recover => one user */ if (-1 == (semval = semctl(udi->semid, 1, GETVAL))) rts_error(VARLSTCNT(5) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg), errno); we_are_last_writer = (1 == semval) && (FALSE == reg->read_only) && !vermismatch;/* There's one writer left and I am it */ assert(!(mupip_jnl_recover && !reg->read_only) || we_are_last_writer); /* recover + R/W region => one writer */ if (-1 == (ftok_semval = semctl(udi->ftok_semid, 1, GETVAL))) rts_error(VARLSTCNT(5) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg), errno); /* If csa->nl->donotflush_dbjnl is set, it means mupip recover/rollback was interrupted and therefore we should * not flush shared memory contents to disk as they might be in an inconsistent state. * In this case, we will go ahead and remove shared memory (without flushing the contents) in this routine. * A reissue of the recover/rollback command will restore the database to a consistent state. * Otherwise, if we have write access to this region, let us perform a few writing tasks. 
*/ if (csa->nl->donotflush_dbjnl) csa->wbuf_dqd = 0; /* ignore csa->wbuf_dqd status as we do not care about the cache contents */ else if (!reg->read_only && !vermismatch) { /* If we had an orphaned block and were interrupted, set wc_blocked so we can invoke wcs_recover */ if (csa->wbuf_dqd) { grab_crit(reg); SET_TRACEABLE_VAR(csd->wc_blocked, TRUE); BG_TRACE_PRO_ANY(csa, wcb_gds_rundown); send_msg(VARLSTCNT(8) ERR_WCBLOCKED, 6, LEN_AND_LIT("wcb_gds_rundown"), process_id, &csa->ti->curr_tn, DB_LEN_STR(reg)); csa->wbuf_dqd = 0; wcs_recover(reg); if (is_mm) { assert(FALSE); csd = csa->hdr; } BG_TRACE_PRO_ANY(csa, lost_block_recovery); rel_crit(reg); } if (JNL_ENABLED(csd) && (GTCM_GNP_SERVER_IMAGE == image_type)) originator_prc_vec = NULL; /* If we are the last writing user, then everything must be flushed */ if (we_are_last_writer) { /* Time to flush out all of our buffers */ if (is_mm) { if (csa->total_blks != csa->ti->total_blks) /* do remap if file had been extended */ { grab_crit(reg); wcs_mm_recover(reg); csd = csa->hdr; rel_crit(reg); } csa->nl->remove_shm = TRUE; } /* Note WCSFLU_SYNC_EPOCH ensures the epoch is synced to the journal and indirectly * also ensures that the db is fsynced. We don't want to use it in the calls to * wcs_flu() from t_end() and tp_tend() since we can defer it to out-of-crit there. * In this case, since we are running down, we don't have any such option. */ csa->nl->remove_shm = wcs_flu(WCSFLU_FLUSH_HDR | WCSFLU_WRITE_EPOCH | WCSFLU_SYNC_EPOCH); /* Since we_are_last_writer, we should be guaranteed that wcs_flu() did not change csd, (in * case of MM for potential file extension), even if it did a grab_crit(). Therefore, make * sure that's true. */ assert(csd == csa->hdr); assert(0 == memcmp(csd->label, GDS_LABEL, GDS_LABEL_SZ - 1)); csd->trans_hist.header_open_tn = csd->trans_hist.curr_tn; } else if ((cancelled_timer && (0 > csa->nl->wcs_timers)) || cancelled_dbsync_timer) { /* cancelled pending db or jnl flush timers - flush database and journal buffers to disk */ grab_crit(reg); /* we need to sync the epoch as the fact that there is no active pending flush timer implies * there will be noone else who will flush the dirty buffers and EPOCH to disk in a timely fashion */ wcs_flu(WCSFLU_FLUSH_HDR | WCSFLU_WRITE_EPOCH | WCSFLU_SYNC_EPOCH); rel_crit(reg); assert((dba_mm == cs_data->acc_meth) || (csd == cs_data)); csd = cs_data; /* In case this is MM and wcs_flu() remapped an extended database, reset csd */ } /* Do rundown journal processing after buffer flushes since they require jnl to be open */ if (JNL_ENABLED(csd)) { /* the following tp_change_reg() is not needed due to the assert csa == cs_addrs at the beginning * of gds_rundown(), but just to be safe. To be removed by 2002!! --- nars -- 2001/04/25. 
*/ tp_change_reg(); /* call this because jnl_ensure_open checks cs_addrs rather than gv_cur_region */ jpc = csa->jnl; jbp = jpc->jnl_buff; if (jbp->fsync_in_prog_latch.u.parts.latch_pid == process_id) { assert(FALSE); COMPSWAP_UNLOCK(&jbp->fsync_in_prog_latch, process_id, 0, LOCK_AVAILABLE, 0); } if (jbp->io_in_prog_latch.u.parts.latch_pid == process_id) { assert(FALSE); COMPSWAP_UNLOCK(&jbp->io_in_prog_latch, process_id, 0, LOCK_AVAILABLE, 0); } if (((NOJNL != jpc->channel) && !JNL_FILE_SWITCHED(jpc)) || we_are_last_writer && (0 != csa->nl->jnl_file.u.inode)) { /* We need to close the journal file cleanly if we have the latest generation journal file open * or if we are the last writer and the journal file is open in shared memory (not necessarily * by ourselves e.g. the only process that opened the journal got shot abnormally) * Note: we should not infer anything from the shared memory value of csa->nl->jnl_file.u.inode * if we are not the last writer as it can be concurrently updated. */ grab_crit(reg); if (JNL_ENABLED(csd)) { SET_GBL_JREC_TIME; /* jnl_ensure_open/jnl_put_jrt_pini/pfin/jnl_file_close all need it */ /* Before writing to jnlfile, adjust jgbl.gbl_jrec_time if needed to maintain time order * of jnl records. This needs to be done BEFORE the jnl_ensure_open as that could write * journal records (if it decides to switch to a new journal file). */ ADJUST_GBL_JREC_TIME(jgbl, jbp); jnl_status = jnl_ensure_open(); if (0 == jnl_status) { /* If we_are_last_writer, we would have already done a wcs_flu() which would * have written an epoch record and we are guaranteed no further updates * since we are the last writer. So, just close the journal. * Although we assert pini_addr should be non-zero for last_writer, we * play it safe in PRO and write a PINI record if not written already. */ assert(!jbp->before_images || is_mm || !we_are_last_writer || 0 != jpc->pini_addr); if (we_are_last_writer && 0 == jpc->pini_addr) jnl_put_jrt_pini(csa); if (0 != jpc->pini_addr) jnl_put_jrt_pfin(csa); /* If not the last writer and no pending flush timer left, do jnl flush now */ if (!we_are_last_writer && (0 > csa->nl->wcs_timers)) { if (SS_NORMAL == (jnl_status = jnl_flush(reg))) { assert(jbp->freeaddr == jbp->dskaddr); jnl_fsync(reg, jbp->dskaddr); assert(jbp->fsync_dskaddr == jbp->dskaddr); } else { send_msg(VARLSTCNT(9) ERR_JNLFLUSH, 2, JNL_LEN_STR(csd), ERR_TEXT, 2, RTS_ERROR_TEXT("Error with journal flush in gds_rundown"), jnl_status); assert(NOJNL == jpc->channel);/* jnl file lost has been triggered */ /* In this routine, all code that follows from here on does not * assume anything about the journaling characteristics of this * database so it is safe to continue execution even though * journaling got closed in the middle. 
*/ } } jnl_file_close(reg, we_are_last_writer, FALSE); } else send_msg(VARLSTCNT(6) jnl_status, 4, JNL_LEN_STR(csd), DB_LEN_STR(reg)); } rel_crit(reg); } } if (we_are_last_writer) /* Flush the fileheader last and harden the file to disk */ { grab_crit(reg); /* To satisfy crit requirement in fileheader_sync() */ memset(csd->machine_name, 0, MAX_MCNAMELEN); /* clear the machine_name field */ if (!mupip_jnl_recover && we_are_last_user) { /* mupip_jnl_recover will do this after mur_close_file */ csd->semid = INVALID_SEMID; csd->shmid = INVALID_SHMID; csd->gt_sem_ctime.ctime = 0; csd->gt_shm_ctime.ctime = 0; } fileheader_sync(reg); rel_crit(reg); if (FALSE == is_mm) { if (-1 == fsync(udi->fd)) /* Sync it all */ { rts_error(VARLSTCNT(9) ERR_DBFILERR, 2, DB_LEN_STR(reg), ERR_TEXT, 2, RTS_ERROR_TEXT("Error during file sync at close"), errno); } } else { /* Now do final MM file sync before exit */ #if !defined(TARGETED_MSYNC) && !defined(NO_MSYNC) if (-1 == fsync(udi->fd)) /* Sync it all */ { rts_error(VARLSTCNT(9) ERR_DBFILERR, 2, DB_LEN_STR(reg), ERR_TEXT, 2, RTS_ERROR_TEXT("Error during file sync at close"), errno); } #else if (-1 == msync((caddr_t)csa->db_addrs[0], (size_t)(csa->db_addrs[1] - csa->db_addrs[0]), MS_SYNC)) { rts_error(VARLSTCNT(9) ERR_DBFILERR, 2, DB_LEN_STR(reg), ERR_TEXT, 2, RTS_ERROR_TEXT("Error during file msync at close"), errno); } #endif } } } /* end if (!reg->read_only && !csa->nl->donotflush_dbjnl) */ if (reg->read_only && we_are_last_user && !mupip_jnl_recover) { /* mupip_jnl_recover will do this after mur_close_file */ db_ipcs.semid = INVALID_SEMID; db_ipcs.shmid = INVALID_SHMID; db_ipcs.gt_sem_ctime = 0; db_ipcs.gt_shm_ctime = 0; db_ipcs.fn_len = reg->dyn.addr->fname_len; memcpy(db_ipcs.fn, reg->dyn.addr->fname, reg->dyn.addr->fname_len); db_ipcs.fn[reg->dyn.addr->fname_len] = 0; /* request gtmsecshr to flush. read_only cannot flush itself */ if (0 != send_mesg2gtmsecshr(FLUSH_DB_IPCS_INFO, 0, (char *)NULL, 0)) rts_error(VARLSTCNT(8) ERR_DBFILERR, 2, DB_LEN_STR(reg), ERR_TEXT, 2, RTS_ERROR_TEXT("gtmsecshr failed to update database file header")); } /* Done with file now, close it */ if (-1 == close(udi->fd)) { rts_error(VARLSTCNT(9) ERR_DBFILERR, 2, DB_LEN_STR(reg), ERR_TEXT, 2, LEN_AND_LIT("Error during file close"), errno); } /* Unmap storage if mm mode but only the part that is not the fileheader (so shows up in dumps) */ if (is_mm) { munmap_len = (sm_long_t)((csa->db_addrs[1] - csa->db_addrs[0]) - ROUND_UP(SIZEOF_FILE_HDR(csa->hdr), MSYNC_ADDR_INCS)); if (munmap_len > 0) { munmap((caddr_t)(csa->db_addrs[0] + ROUND_UP(SIZEOF_FILE_HDR(csa->hdr), MSYNC_ADDR_INCS)), (size_t)(munmap_len)); #ifdef DEBUG_DB64 rel_mmseg((caddr_t)csa->db_addrs[0]); #endif } } /* Detach our shared memory while still under lock so reference counts will be * correct for the next process to run down this region. * In the process also get the remove_shm status from node_local before detaching. * If csa->nl->donotflush_dbjnl is TRUE, it means we can safely remove shared memory without compromising data * integrity as a reissue of recover will restore the database to a consistent state. 
*/ remove_shm = !vermismatch && (csa->nl->remove_shm || csa->nl->donotflush_dbjnl); status = shmdt((caddr_t)csa->nl); csa->nl = NULL; /* dereferencing nl after the detach is not right, so we set it to NULL so that we can test it before any dereference */ if (-1 == status) send_msg(VARLSTCNT(9) ERR_DBFILERR, 2, DB_LEN_STR(reg), ERR_TEXT, 2, LEN_AND_LIT("Error during shmdt"), errno); reg->open = FALSE; /* If the file is still not in good shape, die here and now before we get rid of our storage */ if (csa->wbuf_dqd) GTMASSERT; ipc_deleted = FALSE; /* If we are the very last user, remove the shared storage id and the semaphores */ if (we_are_last_user) { /* remove shared storage only if the last writer to run down did a successful wcs_flu() */ assert(!vermismatch); if (remove_shm) { ipc_deleted = TRUE; if (0 != shm_rmid(udi->shmid)) rts_error(VARLSTCNT(8) ERR_DBFILERR, 2, DB_LEN_STR(reg), ERR_TEXT, 2, RTS_ERROR_TEXT("Unable to remove shared memory")); } else if (is_src_server || is_updproc) { gtm_putmsg(VARLSTCNT(6) ERR_DBRNDWNWRN, 4, DB_LEN_STR(reg), process_id, process_id); send_msg(VARLSTCNT(6) ERR_DBRNDWNWRN, 4, DB_LEN_STR(reg), process_id, process_id); } else send_msg(VARLSTCNT(6) ERR_DBRNDWNWRN, 4, DB_LEN_STR(reg), process_id, process_id); /* * Don't release the semaphore in case of mupip recover/rollback, since it has standalone access. * It will release the semaphore in mur_close_files. */ if (!mupip_jnl_recover) { if (0 != sem_rmid(udi->semid)) rts_error(VARLSTCNT(8) ERR_DBFILERR, 2, DB_LEN_STR(reg), ERR_TEXT, 2, RTS_ERROR_TEXT("Unable to remove semaphore")); grabbed_access_sem = FALSE; } } else { assert(!mupip_jnl_recover); /* If we were writing, get rid of our writer access count semaphore */ if (!reg->read_only) if (0 != (save_errno = do_semop(udi->semid, 1, -1, SEM_UNDO))) rts_error(VARLSTCNT(9) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg), ERR_TEXT, 2, RTS_ERROR_TEXT("gds_rundown write semaphore release"), save_errno); /* Now remove the rundown lock */ if (0 != (save_errno = do_semop(udi->semid, 0, -1, SEM_UNDO))) rts_error(VARLSTCNT(9) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg), ERR_TEXT, 2, RTS_ERROR_TEXT("gds_rundown rundown semaphore release"), save_errno); grabbed_access_sem = FALSE; } if (!ftok_sem_release(reg, !mupip_jnl_recover, FALSE)) rts_error(VARLSTCNT(4) ERR_DBFILERR, 2, DB_LEN_STR(reg)); if (!ipc_deleted) { GET_CUR_TIME; if (is_src_server) gtm_putmsg(VARLSTCNT(8) ERR_IPCNOTDEL, 6, CTIME_BEFORE_NL, time_ptr, LEN_AND_LIT("Source server"), REG_LEN_STR(reg)); if (is_updproc) gtm_putmsg(VARLSTCNT(8) ERR_IPCNOTDEL, 6, CTIME_BEFORE_NL, time_ptr, LEN_AND_LIT("Update process"), REG_LEN_STR(reg)); if (mupip_jnl_recover) { gtm_putmsg(VARLSTCNT(8) ERR_IPCNOTDEL, 6, CTIME_BEFORE_NL, time_ptr, LEN_AND_LIT("Mupip journal process"), REG_LEN_STR(reg)); send_msg(VARLSTCNT(8) ERR_IPCNOTDEL, 6, CTIME_BEFORE_NL, time_ptr, LEN_AND_LIT("Mupip journal process"), REG_LEN_STR(reg)); } } REVERT; }
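/* Illustrative sketch (not from the original sources): the latch salvage at the top of this rundown path
 * releases jbp->fsync_in_prog_latch and jbp->io_in_prog_latch only when the recorded holder pid is our own,
 * which amounts to a compare-and-swap keyed on process_id. The stand-alone rendering below shows that pattern
 * with C11 atomics; simple_latch and latch_release_if_mine are hypothetical simplifications of the real
 * global-latch/COMPSWAP_UNLOCK machinery.
 */
#include <stdatomic.h>
#include <sys/types.h>

typedef struct
{
	_Atomic pid_t	holder;		/* 0 plays the role of LOCK_AVAILABLE in the real latch */
} simple_latch;

/* Release the latch only if this process is the recorded holder; returns non-zero on success */
static int latch_release_if_mine(simple_latch *latch, pid_t my_pid)
{
	pid_t	expected = my_pid;

	return atomic_compare_exchange_strong(&latch->holder, &expected, (pid_t)0);
}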
int4 mupip_set_file(int db_fn_len, char *db_fn) { bool got_standalone; boolean_t bypass_partial_recov, need_standalone = FALSE; char acc_spec[MAX_ACC_METH_LEN], ver_spec[MAX_DB_VER_LEN], exit_stat, *fn; unsigned short acc_spec_len = MAX_ACC_METH_LEN, ver_spec_len = MAX_DB_VER_LEN; int fd, fn_len; int4 status; int4 status1; int glbl_buff_status, defer_status, rsrvd_bytes_status, extn_count_status, lock_space_status, disk_wait_status; int4 new_disk_wait, new_cache_size, new_extn_count, new_lock_space, reserved_bytes, defer_time; sgmnt_data_ptr_t csd; tp_region *rptr, single; enum db_acc_method access, access_new; enum db_ver desired_dbver; gd_region *temp_cur_region; char *errptr, *command = "MUPIP SET VERSION"; int save_errno; error_def(ERR_DBPREMATEOF); error_def(ERR_DBRDERR); error_def(ERR_DBRDONLY); error_def(ERR_INVACCMETHOD); error_def(ERR_MUNOACTION); error_def(ERR_RBWRNNOTCHG); error_def(ERR_WCERRNOTCHG); error_def(ERR_WCWRNNOTCHG); error_def(ERR_MMNODYNDWNGRD); exit_stat = EXIT_NRM; defer_status = cli_present("DEFER_TIME"); if (defer_status) need_standalone = TRUE; bypass_partial_recov = cli_present("PARTIAL_RECOV_BYPASS") == CLI_PRESENT; if (bypass_partial_recov) need_standalone = TRUE; if (disk_wait_status = cli_present("WAIT_DISK")) { if (cli_get_int("WAIT_DISK", &new_disk_wait)) { if (new_disk_wait < 0) { util_out_print("!UL negative, minimum WAIT_DISK allowed is 0.", TRUE, new_disk_wait); return (int4)ERR_WCWRNNOTCHG; } need_standalone = TRUE; } else { util_out_print("Error getting WAIT_DISK qualifier value", TRUE); return (int4)ERR_WCWRNNOTCHG; } } if (glbl_buff_status = cli_present("GLOBAL_BUFFERS")) { if (cli_get_int("GLOBAL_BUFFERS", &new_cache_size)) { if (new_cache_size > WC_MAX_BUFFS) { util_out_print("!UL too large, maximum write cache buffers allowed is !UL", TRUE, new_cache_size, WC_MAX_BUFFS); return (int4)ERR_WCWRNNOTCHG; } if (new_cache_size < WC_MIN_BUFFS) { util_out_print("!UL too small, minimum cache buffers allowed is !UL", TRUE, new_cache_size, WC_MIN_BUFFS); return (int4)ERR_WCWRNNOTCHG; } } else { util_out_print("Error getting GLOBAL BUFFER qualifier value", TRUE); return (int4)ERR_WCWRNNOTCHG; } need_standalone = TRUE; } /* EXTENSION_COUNT does not require standalone access and hence need_standalone will not be set to TRUE for this. 
*/ if (extn_count_status = cli_present("EXTENSION_COUNT")) { if (cli_get_int("EXTENSION_COUNT", &new_extn_count)) { if (new_extn_count > MAX_EXTN_COUNT) { util_out_print("!UL too large, maximum extension count allowed is !UL", TRUE, new_extn_count, MAX_EXTN_COUNT); return (int4)ERR_WCWRNNOTCHG; } if (new_extn_count < MIN_EXTN_COUNT) { util_out_print("!UL too small, minimum extension count allowed is !UL", TRUE, new_extn_count, MIN_EXTN_COUNT); return (int4)ERR_WCWRNNOTCHG; } } else { util_out_print("Error getting EXTENSION COUNT qualifier value", TRUE); return (int4)ERR_WCWRNNOTCHG; } } if (lock_space_status = cli_present("LOCK_SPACE")) { if (cli_get_int("LOCK_SPACE", &new_lock_space)) { if (new_lock_space > MAX_LOCK_SPACE) { util_out_print("!UL too large, maximum lock space allowed is !UL", TRUE, new_lock_space, MAX_LOCK_SPACE); return (int4)ERR_WCWRNNOTCHG; } else if (new_lock_space < MIN_LOCK_SPACE) { util_out_print("!UL too small, minimum lock space allowed is !UL", TRUE, new_lock_space, MIN_LOCK_SPACE); return (int4)ERR_WCWRNNOTCHG; } } else { util_out_print("Error getting LOCK_SPACE qualifier value", TRUE); return (int4)ERR_WCWRNNOTCHG; } need_standalone = TRUE; } if (rsrvd_bytes_status = cli_present("RESERVED_BYTES")) { if (!cli_get_int("RESERVED_BYTES", &reserved_bytes)) { util_out_print("Error getting RESERVED BYTES qualifier value", TRUE); return (int4)ERR_RBWRNNOTCHG; } need_standalone = TRUE; } if (cli_present("ACCESS_METHOD")) { cli_get_str("ACCESS_METHOD", acc_spec, &acc_spec_len); cli_strupper(acc_spec); if (0 == memcmp(acc_spec, "MM", acc_spec_len)) access = dba_mm; else if (0 == memcmp(acc_spec, "BG", acc_spec_len)) access = dba_bg; else mupip_exit(ERR_INVACCMETHOD); need_standalone = TRUE; } else access = n_dba; /* really want to keep current method, which has not yet been read */ if (cli_present("VERSION")) { assert(!need_standalone); cli_get_str("VERSION", ver_spec, &ver_spec_len); cli_strupper(ver_spec); if (0 == memcmp(ver_spec, "V4", ver_spec_len)) desired_dbver = GDSV4; else if (0 == memcmp(ver_spec, "V5", ver_spec_len)) desired_dbver = GDSV5; else GTMASSERT; /* CLI should prevent us ever getting here */ } else desired_dbver = GDSVLAST; /* really want to keep version, which has not yet been read */ if (region) rptr = grlist; else { rptr = &single; memset(&single, 0, sizeof(single)); } csd = (sgmnt_data *)malloc(ROUND_UP(sizeof(sgmnt_data), DISK_BLOCK_SIZE)); in_backup = FALSE; /* Only want yes/no from mupfndfil, not an address */ for (; rptr != NULL; rptr = rptr->fPtr) { if (region) { if (dba_usr == rptr->reg->dyn.addr->acc_meth) { util_out_print("!/Region !AD is not a GDS access type", TRUE, REG_LEN_STR(rptr->reg)); exit_stat |= EXIT_WRN; continue; } if (!mupfndfil(rptr->reg, NULL)) continue; fn = (char *)rptr->reg->dyn.addr->fname; fn_len = rptr->reg->dyn.addr->fname_len; } else { fn = db_fn; fn_len = db_fn_len; } mu_gv_cur_reg_init(); strcpy((char *)gv_cur_region->dyn.addr->fname, fn); gv_cur_region->dyn.addr->fname_len = fn_len; if (!need_standalone) { gvcst_init(gv_cur_region); change_reg(); /* sets cs_addrs and cs_data */ if (gv_cur_region->read_only) { gtm_putmsg(VARLSTCNT(4) ERR_DBRDONLY, 2, DB_LEN_STR(gv_cur_region)); exit_stat |= EXIT_ERR; gds_rundown(); mu_gv_cur_reg_free(); continue; } grab_crit(gv_cur_region); status = EXIT_NRM; access_new = (n_dba == access ? cs_data->acc_meth : access); /* recalculate; n_dba is a proxy for no change */ change_fhead_timer("FLUSH_TIME", cs_data->flush_time, (dba_bg == access_new ? 
TIM_FLU_MOD_BG : TIM_FLU_MOD_MM), FALSE); if (GDSVLAST != desired_dbver) { if ((dba_mm != access_new) || (GDSV4 != desired_dbver)) status1 = desired_db_format_set(gv_cur_region, desired_dbver, command); else { status1 = ERR_MMNODYNDWNGRD; gtm_putmsg(VARLSTCNT(4) status1, 2, REG_LEN_STR(gv_cur_region)); } if (SS_NORMAL != status1) { /* "desired_db_format_set" would have printed appropriate error messages */ if (ERR_MUNOACTION != status1) { /* real error occurred while setting the db format. skip to next region */ status = EXIT_ERR; } } } if (EXIT_NRM == status) { if (extn_count_status) cs_data->extension_size = (uint4)new_extn_count; wcs_flu(WCSFLU_FLUSH_HDR); if (extn_count_status) util_out_print("Database file !AD now has extension count !UL", TRUE, fn_len, fn, cs_data->extension_size); if (GDSVLAST != desired_dbver) util_out_print("Database file !AD now has desired DB format !AD", TRUE, fn_len, fn, LEN_AND_STR(gtm_dbversion_table[cs_data->desired_db_format])); } else exit_stat |= status; rel_crit(gv_cur_region); gds_rundown(); } else { /* Following part needs standalone access */ assert(GDSVLAST == desired_dbver); got_standalone = mu_rndwn_file(gv_cur_region, TRUE); if (FALSE == got_standalone) return (int4)ERR_WCERRNOTCHG; /* we should open it (for changing) after mu_rndwn_file, since mu_rndwn_file changes the file header too */ if (-1 == (fd = OPEN(fn, O_RDWR))) { save_errno = errno; errptr = (char *)STRERROR(save_errno); util_out_print("open : !AZ", TRUE, errptr); exit_stat |= EXIT_ERR; db_ipcs_reset(gv_cur_region, FALSE); mu_gv_cur_reg_free(); continue; } LSEEKREAD(fd, 0, csd, sizeof(sgmnt_data), status); if (0 != status) { save_errno = errno; PERROR("Error reading header of file"); errptr = (char *)STRERROR(save_errno); util_out_print("read : !AZ", TRUE, errptr); util_out_print("Error reading header of file", TRUE); util_out_print("Database file !AD not changed: ", TRUE, fn_len, fn); if (-1 != status) rts_error(VARLSTCNT(4) ERR_DBRDERR, 2, fn_len, fn); else rts_error(VARLSTCNT(4) ERR_DBPREMATEOF, 2, fn_len, fn); } if (rsrvd_bytes_status) { if (reserved_bytes > MAX_RESERVE_B(csd)) { util_out_print("!UL too large, maximum reserved bytes allowed is !UL for database file !AD", TRUE, reserved_bytes, MAX_RESERVE_B(csd), fn_len, fn); close(fd); db_ipcs_reset(gv_cur_region, FALSE); return (int4)ERR_RBWRNNOTCHG; } csd->reserved_bytes = reserved_bytes; } access_new = (n_dba == access ? csd->acc_meth : access); /* recalculate; n_dba is a proxy for no change */ change_fhead_timer("FLUSH_TIME", csd->flush_time, (dba_bg == access_new ? 
TIM_FLU_MOD_BG : TIM_FLU_MOD_MM), FALSE); if ((n_dba != access) && (csd->acc_meth != access)) /* n_dba is a proxy for no change */ { if (dba_mm == access) csd->defer_time = 1; /* defer defaults to 1 */ csd->acc_meth = access; if (0 == csd->n_bts) { csd->n_bts = WC_DEF_BUFFS; csd->bt_buckets = getprime(csd->n_bts); } } if (glbl_buff_status) { csd->n_bts = BT_FACTOR(new_cache_size); csd->bt_buckets = getprime(csd->n_bts); csd->n_wrt_per_flu = 7; csd->flush_trigger = FLUSH_FACTOR(csd->n_bts); } if (disk_wait_status) csd->wait_disk_space = new_disk_wait; if (extn_count_status) csd->extension_size = (uint4)new_extn_count; if (lock_space_status) csd->lock_space_size = (uint4)new_lock_space * OS_PAGELET_SIZE; if (bypass_partial_recov) { csd->file_corrupt = FALSE; util_out_print("Database file !AD now has partial recovery flag set to !UL (FALSE)", TRUE, fn_len, fn, csd->file_corrupt); } if (dba_mm == access_new) { if (CLI_NEGATED == defer_status) csd->defer_time = 0; else if (CLI_PRESENT == defer_status) { if (!cli_get_num("DEFER_TIME", &defer_time)) { util_out_print("Error getting DEFER_TIME qualifier value", TRUE); close(fd); /* don't leak the file descriptor on this early exit */ db_ipcs_reset(gv_cur_region, FALSE); return (int4)ERR_RBWRNNOTCHG; } if (-1 > defer_time) { util_out_print("DEFER_TIME cannot take values less than -1", TRUE); util_out_print("Database file !AD not changed", TRUE, fn_len, fn); exit_stat |= EXIT_WRN; close(fd); db_ipcs_reset(gv_cur_region, FALSE); mu_gv_cur_reg_free(); continue; } csd->defer_time = defer_time; } if (csd->blks_to_upgrd) { util_out_print("MM access method cannot be set if there are blocks to upgrade", TRUE); util_out_print("Database file !AD not changed", TRUE, fn_len, fn); exit_stat |= EXIT_WRN; close(fd); db_ipcs_reset(gv_cur_region, FALSE); mu_gv_cur_reg_free(); continue; } if (GDSVCURR != csd->desired_db_format) { util_out_print("MM access method cannot be set in DB compatibility mode", TRUE); util_out_print("Database file !AD not changed", TRUE, fn_len, fn); exit_stat |= EXIT_WRN; close(fd); db_ipcs_reset(gv_cur_region, FALSE); mu_gv_cur_reg_free(); continue; } if (JNL_ENABLED(csd) && csd->jnl_before_image) { util_out_print("MM access method cannot be set with BEFORE image journaling", TRUE); util_out_print("Database file !AD not changed", TRUE, fn_len, fn); exit_stat |= EXIT_WRN; close(fd); db_ipcs_reset(gv_cur_region, FALSE); mu_gv_cur_reg_free(); continue; } csd->jnl_before_image = FALSE; } else { if (defer_status) { util_out_print("DEFER cannot be specified with BG access method.", TRUE); util_out_print("Database file !AD not changed", TRUE, fn_len, fn); exit_stat |= EXIT_WRN; close(fd); db_ipcs_reset(gv_cur_region, FALSE); mu_gv_cur_reg_free(); continue; } } LSEEKWRITE(fd, 0, csd, sizeof(sgmnt_data), status); if (0 != status) { save_errno = errno; errptr = (char *)STRERROR(save_errno); util_out_print("write : !AZ", TRUE, errptr); util_out_print("Error writing header of file", TRUE); util_out_print("Database file !AD not changed: ", TRUE, fn_len, fn); rts_error(VARLSTCNT(4) ERR_DBRDERR, 2, fn_len, fn); } close(fd); /* --------------------- report results ------------------------- */ if (glbl_buff_status) util_out_print("Database file !AD now has !UL global buffers", TRUE, fn_len, fn, csd->n_bts); if (defer_status && (dba_mm == csd->acc_meth)) util_out_print("Database file !AD now has defer_time set to !SL", TRUE, fn_len, fn, csd->defer_time); if (rsrvd_bytes_status) util_out_print("Database file !AD now has !UL reserved bytes", TRUE, fn_len, fn, csd->reserved_bytes); if (extn_count_status) util_out_print("Database file !AD now has extension count !UL", 
TRUE, fn_len, fn, csd->extension_size); if (lock_space_status) util_out_print("Database file !AD now has lock space !UL pages", TRUE, fn_len, fn, csd->lock_space_size/OS_PAGELET_SIZE); if (disk_wait_status) util_out_print("Database file !AD now has wait disk set to !UL seconds", TRUE, fn_len, fn, csd->wait_disk_space); db_ipcs_reset(gv_cur_region, FALSE); } /* end of else part if (!need_standalone) */ mu_gv_cur_reg_free(); } free(csd); assert(!(exit_stat & EXIT_INF)); return (exit_stat & EXIT_ERR ? (int4)ERR_WCERRNOTCHG : (exit_stat & EXIT_WRN ? (int4)ERR_WCWRNNOTCHG : SS_NORMAL)); }
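/* Illustrative sketch (not from the original sources): the standalone path above edits the file header in
 * place with LSEEKREAD/LSEEKWRITE at offset 0. A minimal portable rendering of that read-modify-write pattern
 * using pread/pwrite follows; demo_hdr and demo_patch_header are hypothetical stand-ins (the real sgmnt_data
 * layout is far larger, and the real code validates the new values before writing).
 */
#include <fcntl.h>
#include <unistd.h>

typedef struct
{
	unsigned int	reserved_bytes;		/* hypothetical; models one tunable header field */
} demo_hdr;

/* Read the header at offset 0, patch one field in memory and write it back; returns 0 on success */
static int demo_patch_header(const char *fname, unsigned int new_reserved)
{
	demo_hdr	hdr;
	int		fd;

	if (-1 == (fd = open(fname, O_RDWR)))
		return -1;
	if ((ssize_t)sizeof(hdr) != pread(fd, &hdr, sizeof(hdr), 0))	/* analogous to LSEEKREAD */
	{
		close(fd);
		return -1;
	}
	hdr.reserved_bytes = new_reserved;
	if ((ssize_t)sizeof(hdr) != pwrite(fd, &hdr, sizeof(hdr), 0))	/* analogous to LSEEKWRITE */
	{
		close(fd);
		return -1;
	}
	return close(fd);
}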
void dse_chng_bhead(void) { block_id blk; block_id *blkid_ptr; sgm_info *dummysi = NULL; int4 x; cache_rec_ptr_t cr; uchar_ptr_t bp; sm_uc_ptr_t blkBase; blk_hdr new_hdr; blk_segment *bs1, *bs_ptr; cw_set_element *cse; int4 blk_seg_cnt, blk_size; /* needed for BLK_INIT, BLK_SEG and BLK_FINI macros */ bool ismap; bool chng_blk; uint4 mapsize; uint4 jnl_status; error_def(ERR_DSEBLKRDFAIL); error_def(ERR_DSEFAIL); error_def(ERR_DBRDONLY); if (gv_cur_region->read_only) rts_error(VARLSTCNT(4) ERR_DBRDONLY, 2, DB_LEN_STR(gv_cur_region)); assert(update_array); /* reset new block mechanism */ update_array_ptr = update_array; chng_blk = FALSE; if (cli_present("BLOCK") == CLI_PRESENT) { if (!cli_get_hex("BLOCK", &blk)) return; if (blk < 0 || blk >= cs_addrs->ti->total_blks) /* total_blks is a count; valid block numbers are 0 through total_blks - 1 */ { util_out_print("Error: invalid block number.", TRUE); return; } patch_curr_blk = blk; } blk_size = cs_addrs->hdr->blk_size; ismap = (patch_curr_blk / cs_addrs->hdr->bplmap * cs_addrs->hdr->bplmap == patch_curr_blk); mapsize = BM_SIZE(cs_addrs->hdr->bplmap); t_begin_crit(ERR_DSEFAIL); if (!(bp = t_qread(patch_curr_blk, &dummy_hist.h[0].cycle, &dummy_hist.h[0].cr))) rts_error(VARLSTCNT(1) ERR_DSEBLKRDFAIL); new_hdr = *(blk_hdr_ptr_t)bp; if (cli_present("LEVEL") == CLI_PRESENT) { if (!cli_get_num("LEVEL", &x)) { t_abort(gv_cur_region, cs_addrs); return; } if (ismap && (unsigned char)x != LCL_MAP_LEVL) { util_out_print("Error: invalid level for a bit map block.", TRUE); t_abort(gv_cur_region, cs_addrs); return; } if (!ismap && (x < 0 || x > MAX_BT_DEPTH + 1)) { util_out_print("Error: invalid level.", TRUE); t_abort(gv_cur_region, cs_addrs); return; } new_hdr.levl = (unsigned char)x; chng_blk = TRUE; if (new_hdr.bsiz < sizeof(blk_hdr)) new_hdr.bsiz = sizeof(blk_hdr); if (new_hdr.bsiz > blk_size) new_hdr.bsiz = blk_size; } if (cli_present("BSIZ") == CLI_PRESENT) { if (!cli_get_hex("BSIZ", &x)) { t_abort(gv_cur_region, cs_addrs); return; } if (ismap && x != mapsize) { util_out_print("Error: invalid bsiz.", TRUE); t_abort(gv_cur_region, cs_addrs); return; } else if (x < sizeof(blk_hdr) || x > blk_size) { util_out_print("Error: invalid bsiz.", TRUE); t_abort(gv_cur_region, cs_addrs); return; } chng_blk = TRUE; new_hdr.bsiz = x; } if (!chng_blk) t_abort(gv_cur_region, cs_addrs); else { BLK_INIT(bs_ptr, bs1); BLK_SEG(bs_ptr, bp + sizeof(new_hdr), new_hdr.bsiz - sizeof(new_hdr)); if (!BLK_FINI(bs_ptr, bs1)) { util_out_print("Error: bad block build.", TRUE); t_abort(gv_cur_region, cs_addrs); return; } t_write(patch_curr_blk, (unsigned char *)bs1, 0, 0, bp, new_hdr.levl, TRUE, FALSE); BUILD_AIMG_IF_JNL_ENABLED(cs_addrs, cs_data, non_tp_jfb_buff_ptr, cse); t_end(&dummy_hist, 0); } if (cli_present("TN") == CLI_PRESENT) { if (!cli_get_hex("TN", &x)) return; t_begin_crit(ERR_DSEFAIL); assert(cs_addrs->ti->early_tn == cs_addrs->ti->curr_tn); cs_addrs->ti->early_tn++; blkBase = t_qread(patch_curr_blk, &dummy_hist.h[0].cycle, &dummy_hist.h[0].cr); if (NULL == blkBase) { rel_crit(gv_cur_region); util_out_print("Error: Unable to read buffer.", TRUE); t_abort(gv_cur_region, cs_addrs); return; } /* Create a null update array for a block */ if (ismap) { BLK_ADDR(blkid_ptr, sizeof(block_id), block_id); *blkid_ptr = 0; t_write_map(patch_curr_blk, blkBase, (unsigned char *)blkid_ptr, cs_addrs->ti->curr_tn); cr_array_index = 0; block_saved = FALSE; } else { BLK_INIT(bs_ptr, bs1); BLK_SEG(bs_ptr, bp + sizeof(new_hdr), new_hdr.bsiz - sizeof(new_hdr)); BLK_FINI(bs_ptr, bs1); t_write(patch_curr_blk, (unsigned char *)bs1, 0, 0, blkBase, ((blk_hdr_ptr_t)blkBase)->levl, 
TRUE, FALSE); cr_array_index = 0; block_saved = FALSE; if (JNL_ENABLED(cs_data)) { JNL_SHORT_TIME(jgbl.gbl_jrec_time); /* needed for jnl_put_jrt_pini() and jnl_write_aimg_rec() */ jnl_status = jnl_ensure_open(gv_cur_region, cs_addrs); if (0 == jnl_status) { cse = (cw_set_element *)(&cw_set[0]); cse->new_buff = non_tp_jfb_buff_ptr; gvcst_blk_build(cse, (uchar_ptr_t)cse->new_buff, x); cse->done = TRUE; if (0 == cs_addrs->jnl->pini_addr) jnl_put_jrt_pini(cs_addrs); jnl_write_aimg_rec(cs_addrs, cse->blk, (blk_hdr_ptr_t)cse->new_buff); } else rts_error(VARLSTCNT(6) jnl_status, 4, JNL_LEN_STR(cs_data), DB_LEN_STR(gv_cur_region)); } } /* Pass the desired tn "x" as argument to bg_update or mm_update */ if (dba_bg == cs_addrs->hdr->acc_meth) bg_update(cw_set, cw_set + cw_set_depth, cs_addrs->ti->curr_tn, x, dummysi); else mm_update(cw_set, cw_set + cw_set_depth, cs_addrs->ti->curr_tn, x, dummysi); cs_addrs->ti->curr_tn++; assert(cs_addrs->ti->early_tn == cs_addrs->ti->curr_tn); /* the following code is analogous to that in t_end and should be maintained in a similar fashion */ while (cr_array_index) cr_array[--cr_array_index]->in_cw_set = FALSE; rel_crit(gv_cur_region); if (block_saved) backup_buffer_flush(gv_cur_region); UNIX_ONLY( if (unhandled_stale_timer_pop) process_deferred_stale(); ) wcs_timer_start(gv_cur_region, TRUE); } return; }
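/* Illustrative sketch (not from the original sources): the "ismap" computation above relies on truncating
 * integer division to test whether patch_curr_blk is an exact multiple of bplmap (every bplmap'th block is a
 * local bit map). is_bitmap_block below is a hypothetical helper showing the equivalent modulo form.
 */
static int is_bitmap_block(long blk, long bplmap)
{
	/* (blk / bplmap) * bplmap == blk holds exactly when blk % bplmap == 0, for blk >= 0 and bplmap > 0 */
	return (0 == blk % bplmap);
}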