/* #GTM_THREAD_SAFE : The below function (mur_apply_pblk) is thread-safe */ uint4 mur_apply_pblk(reg_ctl_list *rctl) { boolean_t was_holder, apply_intrpt_pblk; gd_region *reg; uint4 status; jnl_ctl_list *tmpjctl; file_control *fc; inctn_opcode_t opcode; struct_jrec_inctn *inctn_rec; jnl_ctl_list *jctl; enum jnl_record_type rectype; int save_errno; jnl_record *jnlrec; unix_db_info *udi; status = gtm_pthread_init_key(rctl->gd); if (0 != status) return status; apply_intrpt_pblk = (MUR_STATE_INTRPT_RECOVERY == murgbl.mur_state); if (!apply_intrpt_pblk) { assert(NULL != rctl->jctl_turn_around); if (!rctl->jfh_recov_interrupted) { if (mur_options.verify) { jctl = rctl->jctl; assert(jctl->reg_ctl == rctl); while (NULL != jctl->next_gen) { jctl = jctl->next_gen; assert(jctl->reg_ctl == rctl); } rctl->jctl = jctl; jctl->rec_offset = jctl->lvrec_off; /* Start from last record */ } else { jctl = rctl->jctl = rctl->jctl_apply_pblk; assert(NULL != jctl); assert(jctl->reg_ctl == rctl); jctl->rec_offset = jctl->apply_pblk_stop_offset; } } else /* recover interrupted earlier */ { /* We already called mur_apply_pblk() to undo recover generated PBLKs. * Later we followed the next_jnl_file_name links to setup jctl list for this region. * We later called mur_back_process() to resolve transactions using the new turn-around point, * but mur_back_process() did not apply PBLKs for interrupted recovery (even for NOVERIFY). * Last time we called this routine, we set rctl->jctl_apply_pblk. * Now we are in the phase to apply original GT.M generated PBLKs. * We skip application of PBLKs till the last recover's turn-around point. 
*/ assert(!mur_options.rollback_losttnonly); jctl = rctl->jctl = rctl->jctl_apply_pblk; assert(jctl->reg_ctl == rctl); assert(jctl->apply_pblk_stop_offset); jctl->rec_offset = jctl->apply_pblk_stop_offset; DEBUG_ONLY( /* assert that first pass turn-around-point is later than the final turn-around-point */ for (tmpjctl = jctl; NULL != tmpjctl && tmpjctl != rctl->jctl_turn_around; tmpjctl = tmpjctl->prev_gen) ; assert(NULL != tmpjctl && ((tmpjctl != jctl) || (jctl->rec_offset >= jctl->turn_around_offset))); ) } if (mur_options.verify || rctl->jfh_recov_interrupted) { /* if going to apply pblks then store prospective turnaround point now itself * so we remember to undo PBLKs at least upto here in case this recovery is interrupted. * in case of normal recovery with -noverify, we would have written this information * out in mur_back_process() itself so we do not need to write it again here. */ PTHREAD_MUTEX_LOCK_IF_NEEDED(was_holder); /* get thread lock in case threads are in use */ rctl->csd->intrpt_recov_tp_resolve_time = jgbl.mur_tp_resolve_time; rctl->csd->intrpt_recov_resync_seqno = murgbl.resync_seqno; MUR_SAVE_RESYNC_STRM_SEQNO(rctl, rctl->csd); PTHREAD_MUTEX_UNLOCK_IF_NEEDED(was_holder); /* release exclusive thread lock if needed */ /* flush the changed csd to disk */ fc = rctl->gd->dyn.addr->file_cntl; fc->op = FC_WRITE; fc->op_buff = (sm_uc_ptr_t)rctl->csd; fc->op_len = ROUND_UP(SGMNT_HDR_LEN, DISK_BLOCK_SIZE); fc->op_pos = 1; dbfilop(fc); } } else
void db_init(gd_region *reg, sgmnt_data_ptr_t tsd) { static boolean_t mutex_init_done = FALSE; boolean_t is_bg, read_only; char machine_name[MAX_MCNAMELEN]; file_control *fc; int gethostname_res, stat_res, mm_prot; int4 status, semval, dblksize, fbwsize; sm_long_t status_l; sgmnt_addrs *csa; sgmnt_data_ptr_t csd; struct sembuf sop[3]; struct stat stat_buf; union semun semarg; struct semid_ds semstat; struct shmid_ds shmstat; struct statvfs dbvfs; uint4 sopcnt; unix_db_info *udi; #ifdef periodic_timer_removed void periodic_flush_check(); #endif error_def(ERR_CLSTCONFLICT); error_def(ERR_CRITSEMFAIL); error_def(ERR_DBNAMEMISMATCH); error_def(ERR_DBIDMISMATCH); error_def(ERR_NLMISMATCHCALC); error_def(ERR_REQRUNDOWN); error_def(ERR_SYSCALL); assert(tsd->acc_meth == dba_bg || tsd->acc_meth == dba_mm); is_bg = (dba_bg == tsd->acc_meth); read_only = reg->read_only; new_dbinit_ipc = FALSE; /* we did not create a new ipc resource */ udi = FILE_INFO(reg); memset(machine_name, 0, sizeof(machine_name)); if (GETHOSTNAME(machine_name, MAX_MCNAMELEN, gethostname_res)) rts_error(VARLSTCNT(5) ERR_TEXT, 2, LEN_AND_LIT("Unable to get the hostname"), errno); assert(strlen(machine_name) < MAX_MCNAMELEN); csa = &udi->s_addrs; csa->db_addrs[0] = csa->db_addrs[1] = csa->lock_addrs[0] = NULL; /* to help in dbinit_ch and gds_rundown */ reg->opening = TRUE; /* * Create ftok semaphore for this region. * We do not want to make ftok counter semaphore to be 2 for on mupip journal recover process. */ if (!ftok_sem_get(reg, !mupip_jnl_recover, GTM_ID, FALSE)) rts_error(VARLSTCNT(4) ERR_DBFILERR, 2, DB_LEN_STR(reg)); /* * At this point we have ftok_semid sempahore based on ftok key. * Any ftok conflicted region will block at this point. * Say, a.dat and b.dat both has same ftok and we have process A to access a.dat and * process B to access b.dat. 
In this case only one can continue to do db_init() */ fc = reg->dyn.addr->file_cntl; fc->file_type = reg->dyn.addr->acc_meth; fc->op = FC_READ; fc->op_buff = (sm_uc_ptr_t)tsd; fc->op_len = sizeof(*tsd); fc->op_pos = 1; dbfilop(fc); /* Read file header */ udi->shmid = tsd->shmid; udi->semid = tsd->semid; udi->sem_ctime = tsd->sem_ctime.ctime; udi->shm_ctime = tsd->shm_ctime.ctime; dbsecspc(reg, tsd); /* Find db segment size */ if (!mupip_jnl_recover) { if (INVALID_SEMID == udi->semid) { if (0 != udi->sem_ctime || INVALID_SHMID != udi->shmid || 0 != udi->shm_ctime) /* We must have somthing wrong in protocol or, code, if this happens */ GTMASSERT; /* * Create new semaphore using IPC_PRIVATE. System guarantees a unique id. */ if (-1 == (udi->semid = semget(IPC_PRIVATE, FTOK_SEM_PER_ID, RWDALL | IPC_CREAT))) { udi->semid = INVALID_SEMID; rts_error(VARLSTCNT(9) ERR_DBFILERR, 2, DB_LEN_STR(reg), ERR_TEXT, 2, LEN_AND_LIT("Error with database control semget"), errno); } udi->shmid = INVALID_SHMID; /* reset shmid so dbinit_ch does not get confused in case we go there */ new_dbinit_ipc = TRUE; tsd->semid = udi->semid; semarg.val = GTM_ID; /* * Following will set semaphore number 2 (=FTOK_SEM_PER_ID - 1) value as GTM_ID. * In case we have orphaned semaphore for some reason, mupip rundown will be * able to identify GTM semaphores from the value and can remove. */ if (-1 == semctl(udi->semid, FTOK_SEM_PER_ID - 1, SETVAL, semarg)) rts_error(VARLSTCNT(9) ERR_DBFILERR, 2, DB_LEN_STR(reg), ERR_TEXT, 2, LEN_AND_LIT("Error with database control semctl SETVAL"), errno); /* * Warning: We must read the sem_ctime using IPC_STAT after SETVAL, which changes it. * We must NOT do any more SETVAL after this. Our design is to use * sem_ctime as creation time of semaphore. 
*/ semarg.buf = &semstat; if (-1 == semctl(udi->semid, FTOK_SEM_PER_ID - 1, IPC_STAT, semarg)) rts_error(VARLSTCNT(9) ERR_DBFILERR, 2, DB_LEN_STR(reg), ERR_TEXT, 2, LEN_AND_LIT("Error with database control semctl IPC_STAT"), errno); tsd->sem_ctime.ctime = udi->sem_ctime = semarg.buf->sem_ctime; } else { if (INVALID_SHMID == udi->shmid) /* if mu_rndwn_file gets standalone access of this region and * somehow mupip process crashes, we can have semid != -1 but shmid == -1 */ rts_error(VARLSTCNT(10) ERR_REQRUNDOWN, 4, DB_LEN_STR(reg), LEN_AND_STR(tsd->machine_name), ERR_TEXT, 2, LEN_AND_LIT("semid is valid but shmid is invalid")); semarg.buf = &semstat; if (-1 == semctl(udi->semid, 0, IPC_STAT, semarg)) /* file header has valid semid but semaphore does not exists */ rts_error(VARLSTCNT(6) ERR_REQRUNDOWN, 4, DB_LEN_STR(reg), LEN_AND_STR(tsd->machine_name)); else if (semarg.buf->sem_ctime != tsd->sem_ctime.ctime) rts_error(VARLSTCNT(10) ERR_REQRUNDOWN, 4, DB_LEN_STR(reg), LEN_AND_STR(tsd->machine_name), ERR_TEXT, 2, LEN_AND_LIT("sem_ctime does not match")); if (-1 == shmctl(udi->shmid, IPC_STAT, &shmstat)) rts_error(VARLSTCNT(9) ERR_DBFILERR, 2, DB_LEN_STR(reg), ERR_TEXT, 2, LEN_AND_LIT("Error with database control shmctl"), errno); else if (shmstat.shm_ctime != tsd->shm_ctime.ctime) rts_error(VARLSTCNT(10) ERR_REQRUNDOWN, 4, DB_LEN_STR(reg), LEN_AND_STR(tsd->machine_name), ERR_TEXT, 2, LEN_AND_LIT("shm_ctime does not match")); } /* We already have ftok semaphore of this region, so just plainly do semaphore operation */ /* This is the database access control semaphore for any region */ sop[0].sem_num = 0; sop[0].sem_op = 0; /* Wait for 0 */ sop[1].sem_num = 0; sop[1].sem_op = 1; /* Lock */ sopcnt = 2; if (!read_only) { sop[2].sem_num = 1; sop[2].sem_op = 1; /* increment r/w access counter */ sopcnt = 3; } sop[0].sem_flg = sop[1].sem_flg = sop[2].sem_flg = SEM_UNDO | IPC_NOWAIT; SEMOP(udi->semid, sop, sopcnt, status); if (-1 == status) { errno_save = errno; 
gtm_putmsg(VARLSTCNT(4) ERR_CRITSEMFAIL, 2, DB_LEN_STR(reg)); rts_error(VARLSTCNT(8) ERR_SYSCALL, 5, RTS_ERROR_LITERAL("semop()"), CALLFROM, errno_save); } } else /* for mupip_jnl_recover we were already in mu_rndwn_file and got "semid" semaphore */ { if (INVALID_SEMID == udi->semid || 0 == udi->sem_ctime) /* make sure mu_rndwn_file() has reset created semaphore for standalone access */ GTMASSERT; if (INVALID_SHMID != udi->shmid || 0 != udi->shm_ctime) /* make sure mu_rndwn_file() has reset shared memory */ GTMASSERT; udi->shmid = INVALID_SHMID; /* reset shmid so dbinit_ch does not get confused in case we go there */ new_dbinit_ipc = TRUE; } sem_incremented = TRUE; if (new_dbinit_ipc) { /* Create new shared memory using IPC_PRIVATE. System guarantees a unique id */ #ifdef __MVS__ if (-1 == (status_l = udi->shmid = shmget(IPC_PRIVATE, ROUND_UP(reg->sec_size, MEGA_BOUND), __IPC_MEGA | IPC_CREAT | RWDALL))) #else if (-1 == (status_l = udi->shmid = shmget(IPC_PRIVATE, reg->sec_size, RWDALL | IPC_CREAT))) #endif { udi->shmid = status_l = INVALID_SHMID; rts_error(VARLSTCNT(9) ERR_DBFILERR, 2, DB_LEN_STR(reg), ERR_TEXT, 2, LEN_AND_LIT("Error with database shmget"), errno); } tsd->shmid = udi->shmid; if (-1 == shmctl(udi->shmid, IPC_STAT, &shmstat)) rts_error(VARLSTCNT(9) ERR_DBFILERR, 2, DB_LEN_STR(reg), ERR_TEXT, 2, LEN_AND_LIT("Error with database control shmctl"), errno); tsd->shm_ctime.ctime = udi->shm_ctime = shmstat.shm_ctime; } #ifdef DEBUG_DB64 status_l = (sm_long_t)(csa->db_addrs[0] = (sm_uc_ptr_t)do_shmat(udi->shmid, next_smseg, SHM_RND)); next_smseg = (sm_uc_ptr_t)ROUND_UP((sm_long_t)(next_smseg + reg->sec_size), SHMAT_ADDR_INCS); #else status_l = (sm_long_t)(csa->db_addrs[0] = (sm_uc_ptr_t)do_shmat(udi->shmid, 0, SHM_RND)); #endif if (-1 == status_l) { rts_error(VARLSTCNT(9) ERR_DBFILERR, 2, DB_LEN_STR(reg), ERR_TEXT, 2, LEN_AND_LIT("Error attaching to database shared memory"), errno); } csa->nl = (node_local_ptr_t)csa->db_addrs[0]; csa->critical = 
(mutex_struct_ptr_t)(csa->db_addrs[0] + NODE_LOCAL_SIZE); assert(((int)csa->critical & 0xf) == 0); /* critical should be 16-byte aligned */ #ifdef CACHELINE_SIZE assert(0 == ((int)csa->critical & (CACHELINE_SIZE - 1))); #endif /* Note: Here we check jnl_sate from database file and its value cannot change without standalone access. * The jnl_buff buffer should be initialized irrespective of read/write process */ JNL_INIT(csa, reg, tsd); csa->backup_buffer = (backup_buff_ptr_t)(csa->db_addrs[0] + NODE_LOCAL_SPACE + JNL_SHARE_SIZE(tsd)); csa->lock_addrs[0] = (sm_uc_ptr_t)csa->backup_buffer + BACKUP_BUFFER_SIZE + 1; csa->lock_addrs[1] = csa->lock_addrs[0] + LOCK_SPACE_SIZE(tsd) - 1; csa->total_blks = tsd->trans_hist.total_blks; /* For test to see if file has extended */ if (new_dbinit_ipc) { memset(csa->nl, 0, sizeof(*csa->nl)); /* We allocated shared storage -- we have to init it */ if (JNL_ALLOWED(csa)) { /* initialize jb->cycle to a value different from initial value of jpc->cycle (0). although this is not * necessary right now, in the future, the plan is to change jnl_ensure_open() to only do a cycle mismatch * check in order to determine whether to call jnl_file_open() or not. this is in preparation for that. */ csa->jnl->jnl_buff->cycle = 1; } } if (is_bg) csd = csa->hdr = (sgmnt_data_ptr_t)(csa->lock_addrs[1] + 1 + CACHE_CONTROL_SIZE(tsd)); else { csa->acc_meth.mm.mmblk_state = (mmblk_que_heads_ptr_t)(csa->lock_addrs[1] + 1); FSTAT_FILE(udi->fd, &stat_buf, stat_res); if (-1 == stat_res) rts_error(VARLSTCNT(5) ERR_DBFILERR, 2, DB_LEN_STR(reg), errno); mm_prot = read_only ? 
PROT_READ : (PROT_READ | PROT_WRITE); #ifdef DEBUG_DB64 if (-1 == (sm_long_t)(csa->db_addrs[0] = (sm_uc_ptr_t)mmap((caddr_t)get_mmseg((size_t)stat_buf.st_size), (size_t)stat_buf.st_size, mm_prot, GTM_MM_FLAGS, udi->fd, (off_t)0))) rts_error(VARLSTCNT(5) ERR_DBFILERR, 2, DB_LEN_STR(reg), errno); put_mmseg((caddr_t)(csa->db_addrs[0]), (size_t)stat_buf.st_size); #else if (-1 == (sm_long_t)(csa->db_addrs[0] = (sm_uc_ptr_t)mmap((caddr_t)NULL, (size_t)stat_buf.st_size, mm_prot, GTM_MM_FLAGS, udi->fd, (off_t)0))) rts_error(VARLSTCNT(5) ERR_DBFILERR, 2, DB_LEN_STR(reg), errno); #endif csa->db_addrs[1] = csa->db_addrs[0] + stat_buf.st_size - 1; csd = csa->hdr = (sgmnt_data_ptr_t)csa->db_addrs[0]; } if (!csa->nl->glob_sec_init) { assert(new_dbinit_ipc); if (is_bg) *csd = *tsd; if (csd->machine_name[0]) /* crash occured */ { if (0 != memcmp(csd->machine_name, machine_name, MAX_MCNAMELEN)) /* crashed on some other node */ rts_error(VARLSTCNT(6) ERR_CLSTCONFLICT, 4, DB_LEN_STR(reg), LEN_AND_STR(csd->machine_name)); else rts_error(VARLSTCNT(6) ERR_REQRUNDOWN, 4, DB_LEN_STR(reg), LEN_AND_STR(csd->machine_name)); } if (is_bg) { bt_malloc(csa); csa->nl->cache_off = -CACHE_CONTROL_SIZE(tsd); db_csh_ini(csa); } db_csh_ref(csa); strcpy(csa->nl->machine_name, machine_name); /* machine name */ assert(MAX_REL_NAME > gtm_release_name_len); memcpy(csa->nl->now_running, gtm_release_name, gtm_release_name_len + 1); /* GT.M release name */ memcpy(csa->nl->label, GDS_LABEL, GDS_LABEL_SZ - 1); /* GDS label */ memcpy(csa->nl->fname, reg->dyn.addr->fname, reg->dyn.addr->fname_len); /* database filename */ csa->nl->creation_date_time = csd->creation.date_time; csa->nl->highest_lbm_blk_changed = -1; csa->nl->wcs_timers = -1; csa->nl->nbb = BACKUP_NOT_IN_PROGRESS; csa->nl->unique_id.uid = FILE_INFO(reg)->fileid; /* save what file we initialized this storage for */ /* save pointers in csa to access shared memory */ csa->nl->critical = (sm_off_t)((sm_uc_ptr_t)csa->critical - (sm_uc_ptr_t)csa->nl); if 
(JNL_ALLOWED(csa)) csa->nl->jnl_buff = (sm_off_t)((sm_uc_ptr_t)csa->jnl->jnl_buff - (sm_uc_ptr_t)csa->nl); csa->nl->backup_buffer = (sm_off_t)((sm_uc_ptr_t)csa->backup_buffer - (sm_uc_ptr_t)csa->nl); csa->nl->hdr = (sm_off_t)((sm_uc_ptr_t)csd - (sm_uc_ptr_t)csa->nl); csa->nl->lock_addrs = (sm_off_t)((sm_uc_ptr_t)csa->lock_addrs[0] - (sm_uc_ptr_t)csa->nl); if (!read_only || is_bg) { csd->trans_hist.early_tn = csd->trans_hist.curr_tn; csd->max_update_array_size = csd->max_non_bm_update_array_size = ROUND_UP2(MAX_NON_BITMAP_UPDATE_ARRAY_SIZE(csd), UPDATE_ARRAY_ALIGN_SIZE); csd->max_update_array_size += ROUND_UP2(MAX_BITMAP_UPDATE_ARRAY_SIZE, UPDATE_ARRAY_ALIGN_SIZE); /* add current db_csh counters into the cumulative counters and reset the current counters */ #define TAB_DB_CSH_ACCT_REC(COUNTER, DUMMY1, DUMMY2) \ csd->COUNTER.cumul_count += csd->COUNTER.curr_count; \ csd->COUNTER.curr_count = 0; #include "tab_db_csh_acct_rec.h" #undef TAB_DB_CSH_ACCT_REC } if (!read_only) { if (is_bg) { assert(memcmp(csd, GDS_LABEL, GDS_LABEL_SZ - 1) == 0); LSEEKWRITE(udi->fd, (off_t)0, (sm_uc_ptr_t)csd, sizeof(sgmnt_data), errno_save); if (0 != errno_save) { rts_error(VARLSTCNT(9) ERR_DBFILERR, 2, DB_LEN_STR(reg), ERR_TEXT, 2, LEN_AND_LIT("Error with database write"), errno_save); } } } reg->dyn.addr->ext_blk_count = csd->extension_size; mlk_shr_init(csa->lock_addrs[0], csd->lock_space_size, csa, (FALSE == read_only)); DEBUG_ONLY(locknl = csa->nl;) /* for DEBUG_ONLY LOCK_HIST macro */
/* Takes an entry from 'ipcs -m' and checks for its validity to be a GT.M db segment.
 * Returns TRUE if the shared memory segment is a valid GT.M db segment
 * (based on a check on some fields in the shared memory) else FALSE.
 * If the segment belongs to GT.M it returns the database file name by the second argument.
 *
 * Parameters:
 *	parm_buff - description of the shared memory segment (key, shmid, size); NULL yields FALSE.
 *	fname     - output buffer; MAX_FN_LEN + 1 bytes are copied into it from the segment and it is
 *		    always null terminated at index MAX_FN_LEN, so it must be at least that large.
 *	exit_stat - output status. Set to ERR_MUNOTALLSEC on every failure that is reported to the user,
 *		    to ERR_SHMREMOVED when this routine removed a stale segment itself, and to SS_NORMAL
 *		    when the segment was validated. It is left untouched on the silent early returns
 *		    (NULL parm_buff, segment too small, key not IPC_PRIVATE, attach failure, or a label
 *		    that does not even share the GDS_LABEL prefix).
 *
 * Returns TRUE both when the segment validated cleanly and when a stale segment was removed here;
 * the two cases are distinguished through *exit_stat.
 */
boolean_t validate_db_shm_entry(shm_parms *parm_buff, char *fname, int *exit_stat)
{
	boolean_t		remove_shmid;
	file_control		*fc;
	int			fname_len, save_errno, status, shmid;
	node_local_ptr_t	nl_addr;
	sm_uc_ptr_t		start_addr;
	struct stat		st_buff;
	struct shmid_ds		shmstat;
	sgmnt_data		tsd;
	unix_db_info		*udi;
	char			msgbuff[OUT_BUFF_SIZE];

	if (NULL == parm_buff)
		return FALSE;
	/* check for the bare minimum size of the shared memory segment that we expect
	 * (with no fileheader related information at hand)
	 */
	if (MIN_NODE_LOCAL_SPACE + SHMPOOL_SECTION_SIZE > parm_buff->sgmnt_siz)
		return FALSE;
	if (IPC_PRIVATE != parm_buff->key)
		return FALSE;
	shmid = parm_buff->shmid;
	/* we do not need to lock the shm for reading the rundown information as
	 * the other rundowns (if any) can also be allowed to share reading the
	 * same info concurrently.
	 */
	if (-1 == (sm_long_t)(start_addr = (sm_uc_ptr_t) do_shmat(shmid, 0, SHM_RND)))
		return FALSE;
	/* From here on every return path must detach from the segment with shmdt() */
	nl_addr = (node_local_ptr_t)start_addr;
	memcpy(fname, nl_addr->fname, MAX_FN_LEN + 1);
	fname[MAX_FN_LEN] = '\0';			/* make sure the fname is null terminated */
	fname_len = STRLEN(fname);
	msgbuff[0] = '\0';
	if (memcmp(nl_addr->label, GDS_LABEL, GDS_LABEL_SZ - 1))
	{	/* Label differs. Only report it if all but the last two compared characters match,
		 * i.e. it looks like a db segment of a different format.
		 */
		if (!memcmp(nl_addr->label, GDS_LABEL, GDS_LABEL_SZ - 3))
		{
			util_out_print("Cannot rundown shmid = !UL for database !AD as it has format !AD "
				"but this mupip uses format !AD", TRUE, shmid,
				fname_len, fname, GDS_LABEL_SZ - 1, nl_addr->label, GDS_LABEL_SZ - 1, GDS_LABEL);
			*exit_stat = ERR_MUNOTALLSEC;
		}
		shmdt((void *)start_addr);
		return FALSE;
	}
	if (-1 == shmctl(shmid, IPC_STAT, &shmstat))
	{
		save_errno = errno;
		assert(FALSE);/* we were able to attach to this shmid before so should be able to get stats on it */
		util_out_print("!AD -> Error with shmctl for shmid = !UL", TRUE, fname_len, fname, shmid);
		gtm_putmsg_csa(CSA_ARG(NULL) VARLSTCNT(1) save_errno);
		*exit_stat = ERR_MUNOTALLSEC;
		shmdt((void *)start_addr);
		return FALSE;
	}
	remove_shmid = FALSE;
	/* Check if db filename reported in shared memory still exists. If not, clean this shared memory section
	 * without even invoking "mu_rndwn_file" as that expects the db file to exist. Same case if shared memory
	 * points back to a database whose file header does not have this shmid.
	 */
	if (-1 == Stat(fname, &st_buff))
	{
		if (ENOENT == errno)
		{
			SNPRINTF(msgbuff, OUT_BUFF_SIZE, "File %s does not exist", fname);
			if (1 < shmstat.shm_nattch)
			{	/* Somebody besides us is still attached; do not remove the segment */
				PRINT_AND_SEND_DBRNDWN_FAILURE_MSG(msgbuff, fname, shmid);
				*exit_stat = ERR_MUNOTALLSEC;
				shmdt((void *)start_addr);
				return FALSE;
			}
			remove_shmid = TRUE;
		} else
		{	/* Stat errored out e.g. due to file permissions. Log that */
			save_errno = errno;
			util_out_print("Cannot rundown shmid !UL for database file !AD as stat() on the file"
				" returned the following error", TRUE, shmid, fname_len, fname);
			gtm_putmsg_csa(CSA_ARG(NULL) VARLSTCNT(1) save_errno);
			*exit_stat = ERR_MUNOTALLSEC;
			shmdt((void *)start_addr);
			return FALSE;
		}
	} else
	{	/* The file exists: read its header and cross-check it against the shared memory segment */
		mu_gv_cur_reg_init();
		gv_cur_region->dyn.addr->fname_len = strlen(fname);
		STRNCPY_STR(gv_cur_region->dyn.addr->fname, fname, gv_cur_region->dyn.addr->fname_len);
		fc = gv_cur_region->dyn.addr->file_cntl;
		fc->op = FC_OPEN;
		status = dbfilop(fc);
		if (SS_NORMAL != status)
		{
			save_errno = errno;	/* capture before the print below has a chance to disturb errno */
			util_out_print("!AD -> Error with dbfilop for shmid = !UL", TRUE, fname_len, fname, shmid);
			gtm_putmsg_csa(CSA_ARG(NULL) VARLSTCNT(5) status, 2, DB_LEN_STR(gv_cur_region), save_errno);
			*exit_stat = ERR_MUNOTALLSEC;
			/* Undo mu_gv_cur_reg_init() on this error path too, as the success path does below.
			 * Must come after the message above since that still references gv_cur_region.
			 */
			mu_gv_cur_reg_free();
			shmdt((void *)start_addr);
			return FALSE;
		}
		udi = FILE_INFO(gv_cur_region);
		LSEEKREAD(udi->fd, 0, &tsd, SIZEOF(sgmnt_data), status);
		if (0 != status)
		{
			save_errno = errno;
			util_out_print("!AD -> Error with LSEEKREAD for shmid = !UL", TRUE, fname_len, fname, shmid);
			gtm_putmsg_csa(CSA_ARG(NULL) VARLSTCNT(1) save_errno);
			*exit_stat = ERR_MUNOTALLSEC;
			mu_gv_cur_reg_free();	/* undo mu_gv_cur_reg_init() on this error path too */
			shmdt((void *)start_addr);
			return FALSE;
		}
		/* NOTE(review): udi->fd (presumably opened by the FC_OPEN above) is not explicitly closed in this
		 * function; confirm that mu_gv_cur_reg_free() or its callers take care of it.
		 */
		mu_gv_cur_reg_free();
		if (tsd.shmid != shmid)
		{
			SNPRINTF(msgbuff, OUT_BUFF_SIZE, "Shared memory ID (%d) in the DB file header does not match with the one"
				" reported by \"ipcs\" command (%d)", tsd.shmid, shmid);
			if (1 < shmstat.shm_nattch)
			{
				PRINT_AND_SEND_DBRNDWN_FAILURE_MSG(msgbuff, fname, shmid);
				*exit_stat = ERR_MUNOTALLSEC;
				shmdt((void *)start_addr);
				return FALSE;
			}
			remove_shmid = TRUE;
		} else if (tsd.gt_shm_ctime.ctime != shmstat.shm_ctime)
		{
			SNPRINTF(msgbuff, OUT_BUFF_SIZE, "Shared memory creation time in the DB file header does not match with"
				" the one reported by shmctl");
			if (1 < shmstat.shm_nattch)
			{
				PRINT_AND_SEND_DBRNDWN_FAILURE_MSG(msgbuff, fname, shmid);
				*exit_stat = ERR_MUNOTALLSEC;
				shmdt((void *)start_addr);
				return FALSE;
			}
			remove_shmid = TRUE;
		}
	}
	shmdt((void *)start_addr);
	if (remove_shmid)
	{	/* Stale segment with no other attached process: remove it here and report the reason in msgbuff */
		assert('\0' != msgbuff[0]);
		if (0 != shm_rmid(shmid))
		{
			save_errno = errno;
			gtm_putmsg_csa(CSA_ARG(NULL) VARLSTCNT(8) ERR_DBFILERR, 2, fname_len, fname,
				ERR_TEXT, 2, RTS_ERROR_TEXT("Error removing shared memory"));
			util_out_print("!AD -> Error removing shared memory for shmid = !UL", TRUE, fname_len, fname, shmid);
			gtm_putmsg_csa(CSA_ARG(NULL) VARLSTCNT(1) save_errno);
			*exit_stat = ERR_MUNOTALLSEC;
			return FALSE;
		}
		PRINT_AND_SEND_SHMREMOVED_MSG(msgbuff, fname_len, fname, shmid);
		*exit_stat = ERR_SHMREMOVED;
	} else
		*exit_stat = SS_NORMAL;
	return TRUE;
}
/* Makes room in the journal file of jpc->region for "total_jnl_rec_size" bytes of journal records.
 *
 * The extension size is csd->jnl_alq blocks when csd->jnl_deq is zero, otherwise the record size in
 * disk blocks rounded up to a multiple of csd->jnl_deq. If the resulting size stays within
 * csd->autoswitchlimit, the virtual size recorded in the journal file header is raised. Otherwise the
 * current journal file is flushed and closed and a new journal file is created (autoswitch), which may
 * itself need one more extension to hold the records.
 *
 * Parameters:
 *	jpc                - journal private control of the region; jpc->status (and jpc->status2 through
 *			     the file read/write macros) is updated by this routine.
 *	total_jnl_rec_size - size in bytes of the journal records that need to fit.
 *
 * Returns the (positive) number of blocks of the last extension computed on success. On failure
 * (not enough disk space, or the new journal file could not be created) it sets jpc->status to
 * ERR_JNLREADEOF, calls jnl_file_lost() and returns -1. Several other failures are raised through
 * rts_error().
 */
int jnl_file_extend(jnl_private_control *jpc, uint4 total_jnl_rec_size)
{
	file_control		*fc;
	boolean_t		need_extend;
	jnl_buffer_ptr_t	jb;
	jnl_create_info		jnl_info;
	jnl_file_header		header;
	uint4			new_alq;
	sgmnt_addrs		*csa;
	sgmnt_data_ptr_t	csd;
	char			prev_jnl_fn[JNL_NAME_SIZE];
	uint4			jnl_status = 0, status;
	int			new_blocks, result;
	GTM_BAVAIL_TYPE		avail_blocks;
	uint4			aligned_tot_jrec_size, count;

	switch(jpc->region->dyn.addr->acc_meth)
	{
	case dba_mm:
	case dba_bg:
		csa = &FILE_INFO(jpc->region)->s_addrs;
		break;
	default:
		GTMASSERT;
	}
	csd = csa->hdr;
	assert(csa == cs_addrs && csd == cs_data);
	assert(csa->now_crit || (csd->clustered && (CCST_CLOSED == csa->nl->ccp_state)));
	assert(jpc->region == gv_cur_region);
	assert(csa->jnl_state == csd->jnl_state);
	if (!JNL_ENABLED(csa) || (NOJNL == jpc->channel) || (JNL_FILE_SWITCHED(jpc)))
		GTMASSERT;	/* crit and messing with the journal file - how could it have vanished? */
	if (!csd->jnl_deq)
	{	/* No extension quantity configured: ask for a whole allocation, which forces an autoswitch below */
		assert(DIVIDE_ROUND_UP(total_jnl_rec_size, DISK_BLOCK_SIZE) <= csd->jnl_alq);
		assert(csd->jnl_alq == csd->autoswitchlimit);
		new_blocks = csd->jnl_alq;
	} else
		/* May cause extension of csd->jnl_deq * n blocks where n > 0 */
		new_blocks = ROUND_UP(DIVIDE_ROUND_UP(total_jnl_rec_size, DISK_BLOCK_SIZE), csd->jnl_deq);
	jpc->status = SS_NORMAL;
	jb = jpc->jnl_buff;
	assert(0 <= new_blocks);
	DEBUG_ONLY(count = 0);
	for (need_extend = (0 != new_blocks); need_extend; )
	{
		DEBUG_ONLY(count++);
		/* usually we will do the loop just once where we do the file extension.
		 * rarely we might need to do an autoswitch instead after which again rarely
		 * 	we might need to do an extension on the new journal to fit in the transaction's journal requirements.
		 * therefore we should do this loop a maximum of twice. hence the assert below.
		 */
		assert(count <= 2);
		need_extend = FALSE;
		if (SS_NORMAL == (status = disk_block_available(jpc->channel, &avail_blocks, TRUE)))
		{
			if ((new_blocks * EXTEND_WARNING_FACTOR) > avail_blocks)
			{
				if (new_blocks > avail_blocks)
				{	/* if we cannot satisfy the request, it is an error */
					send_msg(VARLSTCNT(6) ERR_NOSPACEEXT, 4, JNL_LEN_STR(csd), new_blocks, avail_blocks);
					new_blocks = 0;		/* makes the post-loop code take the failure path */
					jpc->status = SS_NORMAL;
					break;
				} else
					send_msg(VARLSTCNT(5) ERR_DSKSPACEFLOW, 3, JNL_LEN_STR(csd), (avail_blocks - new_blocks));
			}
		} else	/* could not determine free space: report it and attempt the extension anyway */
			send_msg(VARLSTCNT(5) ERR_JNLFILEXTERR, 2, JNL_LEN_STR(csd), status);
		new_alq = jb->filesize + new_blocks;
		/* ensure current journal file size is well within autoswitchlimit --> design constraint */
		assert(csd->autoswitchlimit >= jb->filesize);
		if (csd->autoswitchlimit < (jb->filesize + (EXTEND_WARNING_FACTOR * new_blocks)))	/* close to max */
			send_msg(VARLSTCNT(5) ERR_JNLSPACELOW, 3, JNL_LEN_STR(csd), csd->autoswitchlimit - jb->filesize);
		if (csd->autoswitchlimit < new_alq)
		{	/* Reached max, need to autoswitch */
			/* Ensure new journal file can hold the entire current transaction's journal record requirements */
			assert(csd->autoswitchlimit >= MAX_REQD_JNL_FILE_SIZE(total_jnl_rec_size));
			memset(&jnl_info, 0, sizeof(jnl_info));
			jnl_info.prev_jnl = &prev_jnl_fn[0];
			set_jnl_info(gv_cur_region, &jnl_info);
			assert(JNL_ENABLED(csa) && (NOJNL != jpc->channel) && !(JNL_FILE_SWITCHED(jpc)));
			jnl_status = jnl_ensure_open();
			if (0 == jnl_status)
			{	/* flush the cache and jnl-buffer-contents to current journal file before
				 * switching to a new journal.
				 */
				wcs_flu(WCSFLU_FLUSH_HDR | WCSFLU_WRITE_EPOCH);
				jnl_file_close(gv_cur_region, TRUE, TRUE);
			} else
			{	/* Append the secondary status to the error only when there is one to report */
				if (SS_NORMAL != jpc->status)
					rts_error(VARLSTCNT(7) jnl_status, 4, JNL_LEN_STR(csd), DB_LEN_STR(gv_cur_region),
						jpc->status);
				else
					rts_error(VARLSTCNT(6) jnl_status, 4, JNL_LEN_STR(csd), DB_LEN_STR(gv_cur_region));
			}
			assert(!jgbl.forw_phase_recovery || (NULL != jgbl.mur_pini_addr_reset_fnptr));
			if (jgbl.forw_phase_recovery && (NULL != jgbl.mur_pini_addr_reset_fnptr))
				(*jgbl.mur_pini_addr_reset_fnptr)();
			assert(!jnl_info.no_rename);
			assert(!jnl_info.no_prev_link);
			if (EXIT_NRM == cre_jnl_file(&jnl_info))
			{
				assert(0 == memcmp(csd->jnl_file_name, jnl_info.jnl, jnl_info.jnl_len));
				assert(csd->jnl_file_name[jnl_info.jnl_len] == '\0');
				assert(csd->jnl_file_len == jnl_info.jnl_len);
				assert(csd->jnl_buffer_size == jnl_info.buffer);
				assert(csd->jnl_alq == jnl_info.alloc);
				assert(csd->jnl_deq == jnl_info.extend);
				assert(csd->jnl_before_image == jnl_info.before_images);
				csd->trans_hist.header_open_tn = jnl_info.tn;	/* needed for successful jnl_file_open() */
				send_msg(VARLSTCNT(4) ERR_NEWJNLFILECREATE, 2, JNL_LEN_STR(csd));
				/* Write the updated file header back to the database file. The file_control is shared
				 * with other dbfilop() callers, so the length and position must be set explicitly
				 * here instead of relying on whatever a previous operation left behind.
				 */
				fc = gv_cur_region->dyn.addr->file_cntl;
				fc->op = FC_WRITE;
				fc->op_buff = (sm_uc_ptr_t)csd;
				fc->op_len = sizeof(*csd);
				fc->op_pos = 1;
				status = dbfilop(fc);
				if (SS_NORMAL != status)
					send_msg(VARLSTCNT(5) ERR_DBFILERR, 2, DB_LEN_STR(gv_cur_region), status);
				assert(JNL_ENABLED(csa));
				/* call jnl_ensure_open instead of jnl_file_open to make sure jpc->pini_addr is set to 0 */
				jnl_status = jnl_ensure_open();	/* sets jpc->status */
				if (0 != jnl_status)
				{
					if (SS_NORMAL != jpc->status)
						rts_error(VARLSTCNT(7) jnl_status, 4, JNL_LEN_STR(csd),
							DB_LEN_STR(gv_cur_region), jpc->status);
					else
						rts_error(VARLSTCNT(6) jnl_status, 4, JNL_LEN_STR(csd),
							DB_LEN_STR(gv_cur_region));
				}
				assert(jb->filesize == csd->jnl_alq);
				aligned_tot_jrec_size = ALIGNED_ROUND_UP(MAX_REQD_JNL_FILE_SIZE(total_jnl_rec_size),
										csd->jnl_alq, csd->jnl_deq);
				if (aligned_tot_jrec_size > csd->jnl_alq)
				{	/* need to extend more than initial allocation in the new journal file
					 * to accommodate the current transaction.
					 */
					new_blocks = aligned_tot_jrec_size - csd->jnl_alq;
					assert(new_blocks);
					assert(0 == new_blocks % csd->jnl_deq);
					need_extend = TRUE;
				}
			} else
			{
				send_msg(VARLSTCNT(4) ERR_JNLCREATERR, 2, JNL_LEN_STR(csd));
				jpc->status = ERR_JNLNOCREATE;
				new_blocks = -1;
			}
		} else
		{
			assert(!need_extend);	/* ensure we won't go through the for loop again */
			/* Virtually extend currently used journal file */
			DO_FILE_READ(jpc->channel, 0, &header, JNL_HDR_LEN, jpc->status, jpc->status2);
			if (SS_NORMAL != jpc->status)
				rts_error(VARLSTCNT(5) ERR_JNLRDERR, 2, JNL_LEN_STR(csd), jpc->status);
			assert((header.virtual_size + new_blocks) == new_alq);
			/* Update the in-memory size only once the header has been read successfully so that a
			 * failed read does not leave jb->filesize advanced.
			 */
			jb->filesize = new_alq;	/* Actually this is virtual file size blocks */
			header.virtual_size = new_alq;
			DO_FILE_WRITE(jpc->channel, 0, &header, JNL_HDR_LEN, jpc->status, jpc->status2);
			if (SS_NORMAL != jpc->status)
				rts_error(VARLSTCNT(5) ERR_JNLWRERR, 2, JNL_LEN_STR(csd), jpc->status);
		}
		if (0 >= new_blocks)
			break;
	}
	if (0 >= new_blocks)
	{	/* Either disk space ran out or the new journal file could not be created */
		jpc->status = ERR_JNLREADEOF;
		jnl_file_lost(jpc, ERR_JNLEXTEND);
		new_blocks = -1;
	}
	return new_blocks;
}
uint4 jnl_file_extend(jnl_private_control *jpc, uint4 total_jnl_rec_size) { file_control *fc; boolean_t need_extend; jnl_buffer_ptr_t jb; jnl_create_info jnl_info; jnl_file_header *header; unsigned char hdr_buff[REAL_JNL_HDR_LEN + MAX_IO_BLOCK_SIZE]; uint4 new_alq; sgmnt_addrs *csa; sgmnt_data_ptr_t csd; char prev_jnl_fn[JNL_NAME_SIZE]; uint4 jnl_status = 0, status; int new_blocks, warn_blocks, result; gtm_uint64_t avail_blocks; uint4 aligned_tot_jrec_size, count; uint4 jnl_fs_block_size, read_write_size; DCL_THREADGBL_ACCESS; switch(jpc->region->dyn.addr->acc_meth) { case dba_mm: case dba_bg: csa = &FILE_INFO(jpc->region)->s_addrs; break; default: GTMASSERT; } csd = csa->hdr; assert(csa == cs_addrs && csd == cs_data); assert(csa->now_crit || (csd->clustered && (CCST_CLOSED == csa->nl->ccp_state))); assert(&FILE_INFO(jpc->region)->s_addrs == csa); assert(csa->jnl_state == csd->jnl_state); assertpro(JNL_ENABLED(csa) && (NOJNL != jpc->channel) && (!JNL_FILE_SWITCHED(jpc))); /* crit and messing with the journal file - how could it have vanished? */ if (!csd->jnl_deq || (csd->jnl_alq + csd->jnl_deq > csd->autoswitchlimit)) { assert(DIVIDE_ROUND_UP(total_jnl_rec_size, DISK_BLOCK_SIZE) <= csd->jnl_alq); assert(csd->jnl_alq == csd->autoswitchlimit); new_blocks = csd->jnl_alq; } else /* May cause extension of csd->jnl_deq * n blocks where n > 0 */ new_blocks = ROUND_UP(DIVIDE_ROUND_UP(total_jnl_rec_size, DISK_BLOCK_SIZE), csd->jnl_deq); jpc->status = SS_NORMAL; jb = jpc->jnl_buff; assert(0 <= new_blocks); DEBUG_ONLY(count = 0); for (need_extend = (0 != new_blocks); need_extend; ) { DEBUG_ONLY(count++); /* usually we will do the loop just once where we do the file extension. * rarely we might need to do an autoswitch instead after which again rarely * we might need to do an extension on the new journal to fit in the transaction's journal requirements. * therefore we should do this loop a maximum of twice. hence the assert below. 
*/ assert(count <= 2); need_extend = FALSE; if (SS_NORMAL == (status = disk_block_available(jpc->channel, &avail_blocks, TRUE))) { warn_blocks = (csd->jnl_alq + csd->jnl_deq > csd->autoswitchlimit) ? ((csd->jnl_deq > csd->autoswitchlimit) ? csd->jnl_deq : csd->autoswitchlimit) : new_blocks; if ((warn_blocks * EXTEND_WARNING_FACTOR) > avail_blocks) { if (new_blocks > avail_blocks) { /* If we cannot satisfy the request, it is an error, unless the anticipatory freeze * scheme is in effect in which case, we will assume space is available even if * it is not and go ahead with writes to the disk. If the writes fail with ENOSPC * we will freeze the instance and wait for space to become available and keep * retrying the writes. Therefore, we make the NOSPACEEXT a warning in this case. */ SETUP_THREADGBL_ACCESS; if (!ANTICIPATORY_FREEZE_ENABLED(csa)) { send_msg(VARLSTCNT(6) ERR_NOSPACEEXT, 4, JNL_LEN_STR(csd), new_blocks, avail_blocks); new_blocks = 0; jpc->status = SS_NORMAL; break; } else send_msg(VARLSTCNT(6) MAKE_MSG_WARNING(ERR_NOSPACEEXT), 4, JNL_LEN_STR(csd), new_blocks, avail_blocks); } else send_msg(VARLSTCNT(5) ERR_DSKSPACEFLOW, 3, JNL_LEN_STR(csd), (avail_blocks - warn_blocks)); } } else send_msg(VARLSTCNT(5) ERR_JNLFILEXTERR, 2, JNL_LEN_STR(csd), status); new_alq = jb->filesize + new_blocks; /* ensure current journal file size is well within autoswitchlimit --> design constraint */ assert(csd->autoswitchlimit >= jb->filesize); if (csd->autoswitchlimit < (jb->filesize + (EXTEND_WARNING_FACTOR * new_blocks))) /* close to max */ send_msg(VARLSTCNT(5) ERR_JNLSPACELOW, 3, JNL_LEN_STR(csd), csd->autoswitchlimit - jb->filesize); if (csd->autoswitchlimit < new_alq) { /* Reached max, need to autoswitch */ /* Ensure new journal file can hold the entire current transaction's journal record requirements */ assert(csd->autoswitchlimit >= MAX_REQD_JNL_FILE_SIZE(total_jnl_rec_size)); memset(&jnl_info, 0, SIZEOF(jnl_info)); jnl_info.prev_jnl = &prev_jnl_fn[0]; 
set_jnl_info(gv_cur_region, &jnl_info); assert(JNL_ENABLED(csa) && (NOJNL != jpc->channel) && !(JNL_FILE_SWITCHED(jpc))); jnl_status = jnl_ensure_open(); if (0 == jnl_status) { /* flush the cache and jnl-buffer-contents to current journal file before * switching to a new journal. Set a global variable in_jnl_file_autoswitch * so jnl_write can know not to do the padding check. But because this is a global * variable, we also need to make sure it is reset in case of errors during the * autoswitch (or else calls to jnl_write after we are out of the autoswitch logic * will continue to incorrectly not do the padding check. Hence a condition handler. */ assert(!in_jnl_file_autoswitch); in_jnl_file_autoswitch = TRUE; /* Also make sure time is not changed. This way if "jnl_write" as part of writing a * journal record invokes jnl_file_extend, when the autoswitch is done and writing * of the parent jnl_write resumes, we want it to continue with the same timestamp * and not have to reset its time (non-trivial task) to reflect any changes since then. */ assert(!jgbl.save_dont_reset_gbl_jrec_time); jgbl.save_dont_reset_gbl_jrec_time = jgbl.dont_reset_gbl_jrec_time; jgbl.dont_reset_gbl_jrec_time = TRUE; /* Establish a condition handler so we reset a few global variables that have * temporarily been modified in case of errors inside wcs_flu/jnl_file_close. */ ESTABLISH_RET(jnl_file_autoswitch_ch, EXIT_ERR); /* It is possible we still have not written a PINI record in this journal file * (e.g. mupip extend saw the need to do jnl_file_extend inside jnl_write while * trying to write a PINI record). Write a PINI record in that case before closing * the journal file that way the EOF record will have a non-zero pini_addr. 
*/ if (0 == jpc->pini_addr) jnl_put_jrt_pini(csa); wcs_flu(WCSFLU_FLUSH_HDR | WCSFLU_WRITE_EPOCH | WCSFLU_SPEEDUP_NOBEFORE); jnl_file_close(gv_cur_region, TRUE, TRUE); REVERT; in_jnl_file_autoswitch = FALSE; jgbl.dont_reset_gbl_jrec_time = jgbl.save_dont_reset_gbl_jrec_time; DEBUG_ONLY(jgbl.save_dont_reset_gbl_jrec_time = FALSE); assert((dba_mm == cs_data->acc_meth) || (csd == cs_data)); csd = cs_data; /* In MM, wcs_flu() can remap an extended DB, so reset csd to be sure */ } else { if (SS_NORMAL != jpc->status) rts_error(VARLSTCNT(7) jnl_status, 4, JNL_LEN_STR(csd), DB_LEN_STR(gv_cur_region), jpc->status); else rts_error(VARLSTCNT(6) jnl_status, 4, JNL_LEN_STR(csd), DB_LEN_STR(gv_cur_region)); } assert(!jgbl.forw_phase_recovery || (NULL != jgbl.mur_pini_addr_reset_fnptr)); assert(jgbl.forw_phase_recovery || (NULL == jgbl.mur_pini_addr_reset_fnptr)); if (NULL != jgbl.mur_pini_addr_reset_fnptr) (*jgbl.mur_pini_addr_reset_fnptr)(csa); assert(!jnl_info.no_rename); assert(!jnl_info.no_prev_link); if (EXIT_NRM == cre_jnl_file(&jnl_info)) { assert(0 == memcmp(csd->jnl_file_name, jnl_info.jnl, jnl_info.jnl_len)); assert(csd->jnl_file_name[jnl_info.jnl_len] == '\0'); assert(csd->jnl_file_len == jnl_info.jnl_len); assert(csd->jnl_buffer_size == jnl_info.buffer); assert(csd->jnl_alq == jnl_info.alloc); assert(csd->jnl_deq == jnl_info.extend); assert(csd->jnl_before_image == jnl_info.before_images); csd->jnl_checksum = jnl_info.checksum; csd->jnl_eovtn = csd->trans_hist.curr_tn; send_msg(VARLSTCNT(4) ERR_NEWJNLFILECREAT, 2, JNL_LEN_STR(csd)); fc = gv_cur_region->dyn.addr->file_cntl; fc->op = FC_WRITE; fc->op_buff = (sm_uc_ptr_t)csd; fc->op_len = SGMNT_HDR_LEN; fc->op_pos = 1; status = dbfilop(fc); if (SS_NORMAL != status) send_msg(VARLSTCNT(5) ERR_DBFILERR, 2, DB_LEN_STR(gv_cur_region), status); assert(JNL_ENABLED(csa)); /* call jnl_ensure_open instead of jnl_file_open to make sure jpc->pini_addr is set to 0 */ jnl_status = jnl_ensure_open(); /* sets jpc->status */ if (0 != 
jnl_status) { if (jpc->status) rts_error(VARLSTCNT(7) jnl_status, 4, JNL_LEN_STR(csd), DB_LEN_STR(gv_cur_region), jpc->status); else rts_error(VARLSTCNT(6) jnl_status, 4, JNL_LEN_STR(csd), DB_LEN_STR(gv_cur_region)); } assert(jb->filesize == csd->jnl_alq); if (csd->jnl_alq + csd->jnl_deq <= csd->autoswitchlimit) { aligned_tot_jrec_size = ALIGNED_ROUND_UP(MAX_REQD_JNL_FILE_SIZE(total_jnl_rec_size), csd->jnl_alq, csd->jnl_deq); if (aligned_tot_jrec_size > csd->jnl_alq) { /* need to extend more than initial allocation in the new journal file * to accommodate the current transaction. */ new_blocks = aligned_tot_jrec_size - csd->jnl_alq; assert(new_blocks); assert(0 == new_blocks % csd->jnl_deq); need_extend = TRUE; } } } else { send_msg(VARLSTCNT(4) ERR_JNLNOCREATE, 2, JNL_LEN_STR(csd)); jpc->status = ERR_JNLNOCREATE; new_blocks = -1; } } else { assert(!need_extend); /* ensure we won't go through the for loop again */ /* Virtually extend currently used journal file */ jnl_fs_block_size = jb->fs_block_size; header = (jnl_file_header *)(ROUND_UP2((uintszofptr_t)hdr_buff, jnl_fs_block_size)); read_write_size = ROUND_UP2(REAL_JNL_HDR_LEN, jnl_fs_block_size); assert((unsigned char *)header + read_write_size <= ARRAYTOP(hdr_buff)); DO_FILE_READ(jpc->channel, 0, header, read_write_size, jpc->status, jpc->status2); if (SS_NORMAL != jpc->status) { assert(FALSE); rts_error(VARLSTCNT(5) ERR_JNLRDERR, 2, JNL_LEN_STR(csd), jpc->status); } assert((header->virtual_size + new_blocks) == new_alq); jb->filesize = new_alq; /* Actually this is virtual file size blocks */ header->virtual_size = new_alq; JNL_DO_FILE_WRITE(csa, csd->jnl_file_name, jpc->channel, 0, header, read_write_size, jpc->status, jpc->status2); if (SS_NORMAL != jpc->status) { assert(FALSE); rts_error(VARLSTCNT(5) ERR_JNLWRERR, 2, JNL_LEN_STR(csd), jpc->status); } } if (0 >= new_blocks) break; } if (0 < new_blocks) { INCR_GVSTATS_COUNTER(csa, csa->nl, n_jnl_extends, 1); return EXIT_NRM; } jpc->status = ERR_JNLREADEOF; 
jnl_file_lost(jpc, ERR_JNLEXTEND); return EXIT_ERR; }