// - Compute weighted running average for pageFaultRate // (# of disk writes per second). // - Start a new cycle when the main thread sees the pressure. // [Eric] void MemoryMonitor::updatePageFaultRate(Int64 pageFault) { float delta = (float)(pageFault - prevPageFault_); if (delta < 0) return; Int64 currTime = JULIANTIMESTAMP(OMIT,OMIT,OMIT,OMIT); // Just in case if (currTime <= prevTime_) return; if (resetEntryCount_) { // Start a new cycle after the main thread sees pressure peek. resetEntryCount_ = FALSE; entryCount_ = 1; prevPageFault_ = pageFault; prevTime_ = currTime; return; } float ratio = (float)(1.0 / entryCount_); #pragma warning (disable : 4244) //warning elimination #pragma nowarn(1506) // warning elimination pageFaultRate_ = (1 - ratio) * pageFaultRate_ + ratio * delta / ((float)(currTime - prevTime_) / 1E6); #pragma warn(1506) // warning elimination #pragma warning (default : 4244) //warning elimination prevPageFault_ = pageFault; prevTime_ = currTime; if (entryCount_ < 3) entryCount_++; }
// Populates the HDFS statistics of a Hive table from its storage
// descriptors (SDs).
//
// htd: Hive table descriptor; its SD list is walked via next_.
//
// Returns TRUE if the table has no SDs or if every directory was processed
// successfully, FALSE if processDirectory() failed for at least one of them.
NABoolean HHDFSTableStats::populate(struct hive_tbl_desc *htd)
{
  // here is the basic outline how this works:
  //
  // 1. Walk SD descriptors of the table, one for the table
  //    itself and one for each partition. Each one represents
  //    one HDFS directory with files for the table.
  // 2. For each list partition directory (or the directory for
  //    an unpartitioned table):
  //     3. Walk through every file. For every file:
  //         4. Determine bucket number (0 if file is not bucketed)
  //         5. Add file to its bucket
  //         6. Walk through blocks of file. For every block:
  //             7. Get host list for this block and add it
  //         9. Get file stats
  //     10. Aggregate file stats for all files and buckets
  // 11. Aggregate bucket stats for all buckets of the partition
  // 12. Aggregate partition stats for all partitions of the table

  NABoolean result = TRUE;
  struct hive_sd_desc *hsd = htd->getSDs();

  // Without any storage descriptor there is nothing to read; the
  // table-level fields below would otherwise dereference a NULL pointer.
  if (hsd == NULL)
    return TRUE;

  // Table-level attributes are taken from the first SD.
  tableDir_ = hsd->location_;
  numOfPartCols_ = htd->getNumOfPartCols();
  recordTerminator_ = hsd->getRecordTerminator();
  fieldTerminator_ = hsd->getFieldTerminator() ;

  NAString hdfsHost;
  Int32 hdfsPort = -1;
  NAString tableDir;

  while (hsd)
    {
      // split table URL into host, port and filename
      splitLocation(hsd->location_, hdfsHost, hdfsPort, tableDir);

      if (! connectHDFS(hdfsHost, hdfsPort))
        CMPASSERT(fs_);

      // put back fully qualified URI
      tableDir = hsd->location_;

      // visit the directory; remember a failure instead of letting a
      // later, successful directory overwrite it
      if (! processDirectory(tableDir,
                             hsd->buckets_,
                             hsd->isTrulyText(),
                             hsd->getRecordTerminator(),
                             hsd->isSequenceFile()))
        result = FALSE;

      hsd = hsd->next_;
    }

  disconnectHDFS();
  validationJTimestamp_ = JULIANTIMESTAMP();

  return result;
}
// This function now for non-NSKLite platforms only (UNIX) Int64 ComSmallDef_local_GetTimeStamp(void) { //#if defined(NA_HSC_LINUX) || defined(NA_LINUX) #if defined(NA_HSC_LINUX) struct timeval tv; gettimeofday(&tv, NULL); return(Int64(tv.tv_usec) + (Int64(tv.tv_sec)*Int64(1000000L))); #else return(JULIANTIMESTAMP()); #endif }
// Validates the cached HDFS statistics of every list partition and
// refreshes them through HHDFSListPartitionStats::validateAndRefresh().
//
// expirationJTimestamp: -1 means the stats are never re-validated; a
//                       positive value means stats validated before that
//                       timestamp (validationJTimestamp_ < expiration) are
//                       still considered valid.
// refresh:              when set, the expiration shortcut is bypassed and
//                       the flag is handed on to each partition.
//
// Returns TRUE when the stats are considered valid / were processed
// successfully, FALSE on the first failure (location split, HDFS connect
// or partition validation). diags_ is reset on entry.
NABoolean HHDFSTableStats::validateAndRefresh(Int64 expirationJTimestamp, NABoolean refresh)
{
  NABoolean result = TRUE;
  // Set when connectHDFS() fails; in that case validationJTimestamp_ is
  // left untouched, exactly as before this cleanup was added.
  NABoolean connectFailed = FALSE;

  // initial heap allocation size
  Int32 initialSize = heap_->getAllocSize();

  diags_.reset();

  // check if the stats needs to be fetched within a specified time interval
  // when not requested to refresh
  if (! refresh &&
      (expirationJTimestamp == -1 ||
       (expirationJTimestamp > 0 &&
        validationJTimestamp_ < expirationJTimestamp)))
    return result; // consider the stats still valid

  // if partitions get added or deleted, that gets
  // caught in the Hive metadata, so no need to check for
  // that here
  for (int p=0; p<totalNumPartitions_ && result && diags_.isSuccess(); p++)
    {
      HHDFSListPartitionStats *partStats = listPartitionStatsList_[p];
      NAString hdfsHost;
      Int32 hdfsPort = -1;
      NAString partDir;

      result = splitLocation(partStats->getDirName(),
                             hdfsHost,
                             hdfsPort,
                             partDir,
                             diags_,
                             hdfsPortOverride_);
      if (! result)
        break;

      if (! connectHDFS(hdfsHost, hdfsPort))
        {
          // Leave through the common exit below instead of returning
          // here, so that disconnectHDFS() and the heap accounting still
          // run for the partitions already processed.
          connectFailed = TRUE;
          result = FALSE;
          break;
        }

      // Take the partition's old contribution out of the table totals,
      // refresh it, and add it back only if the refresh succeeded.
      subtract(partStats);
      result = partStats->validateAndRefresh(fs_, diags_, refresh);
      if (result)
        add(partStats);
    }

  disconnectHDFS();

  if (! connectFailed)
    validationJTimestamp_ = JULIANTIMESTAMP();

  // account for the heap used by stats. Heap released during
  // stats refresh will also be included
  hiveStatsSize_ += (heap_->getAllocSize() - initialSize);

  return result;
}
// Re-validates the cached HDFS statistics of each list partition.
//
// expirationJTimestamp: -1 disables re-validation altogether; a positive
//                       value keeps stats that were validated before it
//                       (validationJTimestamp_ < expirationJTimestamp).
// refresh:              passed through to each partition's
//                       validateAndRefresh().
//
// Returns TRUE if the stats are still considered valid or every visited
// partition reported success; otherwise the result of the first failing
// partition, after which no further partitions are visited.
NABoolean HHDFSTableStats::validateAndRefresh(Int64 expirationJTimestamp, NABoolean refresh)
{
  // heap allocation size on entry, used for the accounting at the end
  Int32 heapSizeAtEntry = heap_->getAllocSize();

  // check only once within a specified time interval
  const bool neverExpires = (expirationJTimestamp == -1);
  const bool notYetExpired = (expirationJTimestamp > 0 &&
                              validationJTimestamp_ < expirationJTimestamp);
  if (neverExpires || notYetExpired)
    return TRUE; // consider the stats still valid

  // Partitions that were added or dropped are detected through the Hive
  // metadata, so only the known partitions are visited here.
  NABoolean stillOk = TRUE;
  int partIdx = 0;
  while (stillOk && partIdx < totalNumPartitions_)
    {
      HHDFSListPartitionStats *current = listPartitionStatsList_[partIdx];
      NAString hdfsHost;
      Int32 hdfsPort;
      NAString partDir;

      splitLocation(current->getDirName(), hdfsHost, hdfsPort, partDir);

      if (! connectHDFS(hdfsHost, hdfsPort))
        CMPASSERT(fs_);

      // Remove the partition's old contribution from the table totals,
      // refresh it, then add the (possibly updated) numbers back.
      subtract(current);
      stillOk = current->validateAndRefresh(fs_, refresh);
      add(current);

      partIdx++;
    }

  disconnectHDFS();
  validationJTimestamp_ = JULIANTIMESTAMP();

  // account for the heap used by stats. Heap released during
  // stats refresh will also be included
  hiveStatsSize_ += (heap_->getAllocSize() - heapSizeAtEntry);

  return stillOk;
}
int main(int argc, char *argv[], char *envp[]) { INITSRVRTRC CEE_status sts = CEE_SUCCESS; SRVR_INIT_PARAM_Def initParam; DWORD processId; char tmpString[128]; char tmpString2[32]; char tmpString3[512]; CEECFG_Transport transport; CEECFG_TcpPortNumber portNumber; BOOL retcode; IDL_OBJECT_def srvrObjRef; CEECFG_TcpProcessName TcpProcessName; int TransportTrace = 0; CALL_COMP_DOVERS(ndcs,argc,argv); try { regZnodeName[0] = '\x0'; zkHost[0] = '\x0'; zkRootNode[0] = '\x0'; // Initialize seabed int sbResult; char buffer[FILENAME_MAX] = {0}; bzero(buffer, sizeof(buffer)); sbResult = file_init_attach(&argc, &argv, true, buffer); if(sbResult != XZFIL_ERR_OK){ exit(3); } sbResult = file_mon_process_startup(true); if(sbResult != XZFIL_ERR_OK){ exit(3); } msg_mon_enable_mon_messages(true); } catch(SB_Fatal_Excep sbfe) { exit(3); } sigset_t newset, oldset; sigemptyset(&newset); sigaddset(&newset,SIGQUIT); sigaddset(&newset,SIGTERM); sigprocmask(SIG_BLOCK,&newset,&oldset); processId = GetCurrentProcessId(); retcode = getInitParamSrvr(argc, argv, initParam, tmpString, tmpString3); retcode = TRUE; mxosrvr_init_seabed_trace_dll(); atexit(mxosrvr_atexit_function); // +++ Todo: Duplicating calls here. 
Should try to persist in srvrGlobal MS_Mon_Process_Info_Type proc_info; msg_mon_get_process_info_detail(NULL, &proc_info); myNid = proc_info.nid; myPid = proc_info.pid; myProcName = proc_info.process_name; char logNameSuffix[32]; sprintf( logNameSuffix, "_%d_%d.log", myNid, myPid ); CommonLogger::instance().initLog4cxx("log4cxx.trafodion.masterexe.config", logNameSuffix); if(retcode == FALSE ) { //LCOV_EXCL_START SendEventMsg( MSG_SET_SRVR_CONTEXT_FAILED, EVENTLOG_ERROR_TYPE, processId, ODBCMX_SERVER, srvrObjRef, 2, tmpString, tmpString3); exit(0); //LCOV_EXCL_STOP } GTransport.initialize(); if(GTransport.error != 0 ) { //LCOV_EXCL_START SendEventMsg( MSG_SET_SRVR_CONTEXT_FAILED, EVENTLOG_ERROR_TYPE, processId, ODBCMX_SERVER, srvrObjRef, 1, GTransport.error_message); exit(0); //LCOV_EXCL_STOP } chdir(GTransport.myPathname); initParam.srvrType = CORE_SRVR; //LCOV_EXCL_START if (initParam.debugFlag & SRVR_DEBUG_BREAK) { volatile int done = 0; while (!done) { sleep(10); } } //LCOV_EXCL_STOP char zkErrStr[2048]; stringstream zk_ip_port; // zoo_set_debug_level(ZOO_LOG_LEVEL_DEBUG); if( zkHost[0] == '\x0' && regZnodeName[0] == '\x0' ) { sprintf(zkErrStr, "***** Cannot get Zookeeper properties or registered znode info from startup params"); SendEventMsg( MSG_SET_SRVR_CONTEXT_FAILED, EVENTLOG_ERROR_TYPE, processId, ODBCMX_SERVER, srvrObjRef, 1, zkErrStr); // exit(1); } else { zk_ip_port << zkHost; sprintf(zkErrStr, "zk_ip_port is: %s", zk_ip_port.str().c_str()); SendEventMsg(MSG_SERVER_TRACE_INFO, EVENTLOG_INFORMATION_TYPE, processId, ODBCMX_SERVER, srvrObjRef, 1, zkErrStr); } if (initParam.debugFlag & SRVR_DEBUG_BREAK) zkSessionTimeout = 600; zoo_deterministic_conn_order(1); // enable deterministic order zh = zookeeper_init(zk_ip_port.str().c_str(), watcher, zkSessionTimeout * 1000, &myid, 0, 0); if (zh == 0){ sprintf(zkErrStr, "***** zookeeper_init() failed for host:port %s",zk_ip_port.str().c_str()); SendEventMsg( MSG_SET_SRVR_CONTEXT_FAILED, EVENTLOG_ERROR_TYPE, 
processId, ODBCMX_SERVER, srvrObjRef, 1, zkErrStr); // exit(1); } bool found = false; int rc; stringstream ss; ss.str(""); ss << zkRootNode << "/dcs/master"; string dcsMaster(ss.str()); Stat stat; int startPortNum = 0, portRangeNum; char masterHostName[MAX_HOST_NAME_LEN]; char startPort[12], portRange[12], masterTS[24]; struct String_vector children; children.count = 0; children.data = NULL; // Get the instance ID from registered node char *tkn; char tmpStr[256]; strcpy( tmpStr, regZnodeName ); tkn = strtok(tmpStr, ":" ); if(tkn!=NULL) strcpy(hostname,tkn); tkn = strtok(NULL, ":" ); if( tkn != NULL ) strcpy( instanceId, tkn ); tkn = strtok(NULL, ":" ); if( tkn != NULL ) strcpy( childId, tkn ); else ; // +++ Todo handle error while(!found) { rc = zoo_exists(zh, dcsMaster.c_str(), 0, &stat); if( rc == ZNONODE ) continue; else if( rc == ZOK ) { rc = zoo_get_children(zh, dcsMaster.c_str(), 0, &children); if( children.count > 0 ) { char zknodeName[2048]; strcpy(zknodeName, children.data[0]); tkn = strtok(zknodeName, ":" ); if( tkn != NULL ) strcpy( masterHostName, tkn ); tkn = strtok(NULL, ":" ); if( tkn != NULL ) { strcpy( startPort, tkn ); startPortNum = atoi(tkn); } tkn = strtok(NULL, ":" ); if( tkn != NULL ) { strcpy( portRange, tkn ); portRangeNum = atoi(tkn); } tkn = strtok(NULL, ":" ); if( tkn != NULL ) strcpy( masterTS, tkn ); free_String_vector(&children); found = true; } else continue; } else // error { sprintf(zkErrStr, "***** zoo_exists() for %s failed with error %d",dcsMaster.c_str(), rc); SendEventMsg( MSG_SET_SRVR_CONTEXT_FAILED, EVENTLOG_ERROR_TYPE, processId, ODBCMX_SERVER, srvrObjRef, 1, zkErrStr); break; } } // Initialize initparam to defaults initParam.transport = CEE_TRANSPORT_TCP; // -T 3 initParam.majorVersion = 3; // -V 3 // Will need to remove $ZTC0 and NonStopODBC from below sprintf( initParam.asSrvrObjRef, "TCP:$ZTC0/%s:NonStopODBC", startPort); // -A TCP:$ZTC0/52500:NonStopODBC // Will need to remove this after we get rid off all existing AS 
related processing sprintf( initParam.ASProcessName, "$MXOAS" ); // -AS $MXOAS // Will need to remove this after we get rid off all existing WMS related processing sprintf( initParam.QSProcessName, "$ZWMGR" ); // -QS $ZWMGR // moved this here from begining of the function BUILD_OBJECTREF(initParam.asSrvrObjRef, srvrObjRef, "NonStopODBC", initParam.portNumber); ss.str(""); ss << zkRootNode << "/dcs/servers/registered"; string dcsRegistered(ss.str()); char realpath[1024]; bool zk_error = false; if( found ) { sprintf(zkErrStr, "Found master node in Zookeeper"); SendEventMsg(MSG_SERVER_TRACE_INFO, EVENTLOG_INFORMATION_TYPE, processId, ODBCMX_SERVER, srvrObjRef, 1, zkErrStr); found = false; while(!found) { rc = zoo_exists(zh, dcsRegistered.c_str(), 0, &stat); if( rc == ZNONODE ) continue; else if( rc == ZOK ) { int i; //This section is the original port finding mechanism. //All servers (the herd) start looking for any available port //between starting port number+2 through port range max. //This is mainly for backward compatability for DcsServers //that don't pass PORTMAPTOSECS and PORTBINDTOSECS param if(portMapToSecs == -1 && portBindToSecs == -1) { for(i = startPortNum+2; i < startPortNum+portRangeNum; i++) { if (GTransport.m_listener->verifyPortAvailable("SRVR", i)) break; } if( i == startPortNum+portRangeNum ) { zk_error = true; sprintf(zkErrStr, "***** No ports free"); break; } } else { //This section is for new port map params, PORTMAPTOSECS and PORTBINDTOSECS, //passed in by DcsServer. DcsMaster writes the port map to data portion of //<username>/dcs/servers/registered znode. Wait PORTMAPTOSECS for port map //to appear in registered znode. When it appears read it and scan looking for //match of instance and child Id. long retryTimeout = 500;//.5 second long long timeout = JULIANTIMESTAMP(); bool isPortsMapped = false; char *zkData = new char[1000000]; int zkDataLen = 1000000; while(! 
isPortsMapped) { memset(zkData,0,1000000); rc = zoo_get(zh, dcsRegistered.c_str(), false, zkData, &zkDataLen, &stat); if( rc == ZOK && zkDataLen > 0 ) { sprintf(zkErrStr, "DCS port map = %s", zkData); SendEventMsg(MSG_SERVER_TRACE_INFO, EVENTLOG_INFORMATION_TYPE, processId, ODBCMX_SERVER, srvrObjRef, 1, zkErrStr); int myInstanceId = atoi(instanceId); int myChildId = atoi(childId); sprintf(zkErrStr, "Searching for my id (%d:%d) in port map",myInstanceId,myChildId); SendEventMsg(MSG_SERVER_TRACE_INFO, EVENTLOG_INFORMATION_TYPE, processId, ODBCMX_SERVER, srvrObjRef, 1, zkErrStr); char portMapInstanceId[8]; char portMapChildId[8]; char portMapPortNum[8]; char* saveptr; char* token = strtok_r (zkData,":",&saveptr); while (token != NULL) { if( token != NULL )//instance Id strcpy( portMapInstanceId, token ); token = strtok_r(NULL, ":",&saveptr); if( token != NULL )//child id strcpy( portMapChildId, token ); token = strtok_r(NULL, ":",&saveptr); if( token != NULL )//port number strcpy( portMapPortNum, token ); int currPortMapInstanceId = atoi(portMapInstanceId); int currPortMapChildId = atoi(portMapChildId); int currPortMapPortNum = atoi(portMapPortNum); if(myInstanceId == currPortMapInstanceId && myChildId == currPortMapChildId) { i = currPortMapPortNum; sprintf(zkErrStr, "Found my port number = %d in port map", i); SendEventMsg(MSG_SERVER_TRACE_INFO, EVENTLOG_INFORMATION_TYPE, processId, ODBCMX_SERVER, srvrObjRef, 1, zkErrStr); break; } else { token = strtok_r (NULL, ":",&saveptr); } } timeout = JULIANTIMESTAMP(); bool isAvailable = false; while ( isAvailable == false ) { if (GTransport.m_listener->verifyPortAvailable("SRVR", i)) { isAvailable = true; } else { if((JULIANTIMESTAMP() - timeout) > (portBindToSecs * 1000000)) { sprintf(zkErrStr, "Port bind timeout...exiting"); zk_error = true; break; } else { sprintf(zkErrStr, "Port = %d is already in use...retrying", i); SendEventMsg(MSG_SERVER_TRACE_INFO, EVENTLOG_INFORMATION_TYPE, processId, ODBCMX_SERVER, srvrObjRef, 1, 
zkErrStr); DELAY(retryTimeout); } } } isPortsMapped = true; } else { if((JULIANTIMESTAMP() - timeout) > (portMapToSecs * 1000000)) { sprintf(zkErrStr, "Port map read timeout...exiting"); zk_error = true; break; } else { sprintf(zkErrStr, "Waiting for port map"); SendEventMsg(MSG_SERVER_TRACE_INFO, EVENTLOG_INFORMATION_TYPE, processId, ODBCMX_SERVER, srvrObjRef, 1, zkErrStr); DELAY(retryTimeout); rc = zoo_exists(zh, dcsRegistered.c_str(), 0, &stat); } } } delete[] zkData; } initParam.portNumber = i; stringstream newpath; newpath.str(""); newpath << dcsRegistered.c_str() << "/" << regZnodeName; // dcsRegisteredNode.str(""); // dcsRegisteredNode << dcsRegistered.c_str() << "/" << regZnodeName; dcsRegisteredNode = newpath.str(); ss.str(""); ss << myPid; string pid(ss.str()); ss.str(""); ss << "STARTING" << ":" << JULIANTIMESTAMP() << ":" << ":" // Dialogue ID << myNid << ":" << myPid << ":" << myProcName.c_str() << ":" // Server IP address << ":" // Server Port << ":" // Client computer name << ":" // Client address << ":" // Client port << ":" // Client Appl name << ":"; regSrvrData = ss.str(); rc = zoo_create(zh, dcsRegisteredNode.c_str(), regSrvrData.c_str(), regSrvrData.length(), &ZOO_OPEN_ACL_UNSAFE, ZOO_EPHEMERAL, realpath, sizeof(realpath)-1); if( rc != ZOK ) { zk_error = true; sprintf(zkErrStr, "***** zoo_create() failed with error %d", rc); break; } found = true; } else // error { zk_error = true; sprintf(zkErrStr, "***** zoo_exists() for %s failed with error %d",dcsRegistered.c_str(), rc); break; } } } if( zk_error ) { SendEventMsg( MSG_SET_SRVR_CONTEXT_FAILED, EVENTLOG_ERROR_TYPE, processId, ODBCMX_SERVER, srvrObjRef, 1, zkErrStr); exit(1); } //LCOV_EXCL_START // when a server dies, the MXOAS sends message to CFG. CFG creates the MXOSRVR process // and passess only one command line atribute: -SQL CLEANUP OBSOLETE VOLATILE TABLES // It is for cleanup resources (volatile tables). 
// Newly created MXOSRVR process executes CLEANUP OBSOLETE VOLATILE TABLES and exits. // (This process is not managed by AS!. It is only a helper. if (initParam.sql != NULL) { if (strncmp(initParam.sql, "SELECT COUNT", 12) == 0) { //You can specify a completion code with any positive value in a PROCESS_STOP_. //Negative completion codes are reserved for HP use. //Therefore negative codes will return as 1000 + abs(completionCode) short completionCode = -1; completionCode = SQL_EXECDIRECT_FETCH(&initParam); if (completionCode < 0) completionCode = 1000 + abs(completionCode); #ifdef NSK_PLATFORM PROCESS_STOP_(,,,completionCode,,,,); #else /* * TODO: * need to revisit this logic to return a value via exit code * */ #endif } else {
// Populates the HDFS statistics of a Hive table from its storage
// descriptors (SDs), reporting problems through diags_.
//
// Outline:
//   1. Walk the SD descriptors of the table: one for the table itself and
//      one for each partition. Each SD represents one HDFS directory.
//   2. For each list partition directory (or the single directory of an
//      unpartitioned table):
//        3. Walk through every file. For every file:
//             4. Determine bucket number (0 if file is not bucketed)
//             5. Add file to its bucket
//             6. Walk through blocks of file. For every block:
//                  7. Get host list for this block and add it
//             9. Get file stats
//       10. Aggregate file stats for all files and buckets
//  11. Aggregate bucket stats for all buckets of the partition
//  12. Aggregate partition stats for all partitions of the table
//
// Returns TRUE when there are no SDs at all, FALSE when the table location
// cannot be split or HDFS cannot be connected, and diags_.isSuccess()
// otherwise.
NABoolean HHDFSTableStats::populate(struct hive_tbl_desc *htd)
{
  struct hive_sd_desc *firstSd = htd->getSDs();
  if (firstSd == NULL)
    return TRUE; // nothing need to be done

  diags_.reset();

  // Table-level attributes come from the first SD.
  tableDir_ = firstSd->location_;
  numOfPartCols_ = htd->getNumOfPartCols();
  recordTerminator_ = firstSd->getRecordTerminator();
  fieldTerminator_ = firstSd->getFieldTerminator() ;
  nullFormat_ = firstSd->getNullFormat();

  NAString hdfsHost;
  Int32 hdfsPort = -1;
  NAString tableDir;

  // firstSd is known to be non-NULL here, so the file type can be
  // classified directly.
  if (firstSd->isTextFile())
    type_ = TEXT_;
  else if (firstSd->isSequenceFile())
    type_ = SEQUENCE_;
  else if (firstSd->isOrcFile())
    type_ = ORC_;
  else
    type_ = UNKNOWN_;

  // split table URL into host, port and filename, then connect;
  // diags_ is set if the connection fails
  if (! splitLocation(firstSd->location_,
                      hdfsHost,
                      hdfsPort,
                      tableDir,
                      diags_,
                      hdfsPortOverride_) ||
      ! connectHDFS(hdfsHost, hdfsPort))
    return FALSE;

  // put back fully qualified URI
  tableDir = firstSd->location_;

  computeModificationTSmsec();

  if (diags_.isSuccess())
    {
      modificationTSInMillisec_ = htd->setRedeftime(modificationTSInMillisec_);

      // visit each directory, stopping as soon as diags_ reports an error
      for (struct hive_sd_desc *sd = firstSd;
           sd && diags_.isSuccess();
           sd = sd->next_)
        {
          processDirectory(sd->location_,
                           sd->buckets_,
                           sd->isTrulyText(),
                           sd->getRecordTerminator());
        }
    }

  disconnectHDFS();
  validationJTimestamp_ = JULIANTIMESTAMP();

  return diags_.isSuccess();
}
ExWorkProcRetcode ExCancelTcb::work() { ExMasterStmtGlobals *masterGlobals = getGlobals()->castToExExeStmtGlobals()->castToExMasterStmtGlobals(); CliGlobals *cliGlobals = masterGlobals->getCliGlobals(); while ((qparent_.down->isEmpty() == FALSE) && (qparent_.up->isFull() == FALSE)) { ex_queue_entry *pentry_down = qparent_.down->getHeadEntry(); switch (step_) { case NOT_STARTED: { if (pentry_down->downState.request == ex_queue::GET_NOMORE) step_ = DONE; else { retryCount_ = 0; // Priv checking is done during compilation. To support // REVOKE, prevent a prepared CANCEL/SUSPEND/ACTIVATE // that was compiled more than 1 second ago from executing // by raising the 8734 error to force an AQR. Int64 microSecondsSinceCompile = NA_JulianTimestamp() - masterGlobals->getStatement()->getCompileEndTime(); if (microSecondsSinceCompile > 1000*1000) { ComDiagsArea *diagsArea = ComDiagsArea::allocate(getGlobals()->getDefaultHeap()); *diagsArea << DgSqlCode(-CLI_INVALID_QUERY_PRIVS); reportError(diagsArea); step_ = DONE; break; } // Figure out which MXSSMP broker to use. if (cancelTdb().getAction() == ComTdbCancel::CancelByPname) { int nid = -1; int rc = msg_mon_get_process_info(cancelTdb().getCancelPname(), &nid, &pid_); switch (rc) { case XZFIL_ERR_OK: cpu_ = (short) nid; break; case XZFIL_ERR_NOTFOUND: case XZFIL_ERR_BADNAME: case XZFIL_ERR_NOSUCHDEV: { ComDiagsArea *diagsArea = ComDiagsArea::allocate(getGlobals()->getDefaultHeap()); *diagsArea << DgSqlCode(-EXE_CANCEL_PROCESS_NOT_FOUND); *diagsArea << DgString0(cancelTdb().getCancelPname()); reportError(diagsArea); step_ = DONE; break; } default: { char buf[200]; str_sprintf(buf, "Unexpected error %d returned from " "msg_mon_get_process_info", rc); ex_assert(0, buf); } } if (step_ != NOT_STARTED) break; } else if (cancelTdb().getAction() == ComTdbCancel::CancelByNidPid) { cpu_ = (short) cancelTdb().getCancelNid(); pid_ = cancelTdb().getCancelPid(); // check that process exists, if not report error. 
char processName[MS_MON_MAX_PROCESS_NAME]; int rc = msg_mon_get_process_name(cpu_, pid_, processName); if (XZFIL_ERR_OK == rc) ; // good. nid & pid are valid. else { if ((XZFIL_ERR_NOTFOUND != rc) && (XZFIL_ERR_BADNAME != rc) && (XZFIL_ERR_NOSUCHDEV != rc)) { // Log rc in case it needs investigation later. char buf[200]; str_sprintf(buf, "Unexpected error %d returned from " "msg_mon_get_process_name", rc); SQLMXLoggingArea::logExecRtInfo(__FILE__, __LINE__, buf, 0); } char nid_pid_str[32]; str_sprintf(nid_pid_str, "%d, %d", cpu_, pid_); ComDiagsArea *diagsArea = ComDiagsArea::allocate(getGlobals()->getDefaultHeap()); *diagsArea << DgSqlCode(-EXE_CANCEL_PROCESS_NOT_FOUND); *diagsArea << DgString0(nid_pid_str); reportError(diagsArea); step_ = DONE; break; } } else { char * qid = cancelTdb().qid_; Lng32 qid_len = str_len(qid); // This static method is defined in SqlStats.cpp. It side-effects // the nodeName and cpu_ according to the input qid. if (getMasterCpu( qid, qid_len, nodeName_, sizeof(nodeName_) - 1, cpu_) == -1) { ComDiagsArea *diagsArea = ComDiagsArea::allocate(getGlobals()->getDefaultHeap()); *diagsArea << DgSqlCode(-EXE_RTS_INVALID_QID); reportError(diagsArea); step_ = DONE; break; } } // Testpoints for hard to reproduce problems: bool fakeError8028 = false; fakeError8028 = (getenv("HP_FAKE_ERROR_8028") != NULL); if ((cliGlobals->getCbServerClass() == NULL) || fakeError8028) { ComDiagsArea *diagsArea = ComDiagsArea::allocate(getGlobals()->getDefaultHeap()); *diagsArea << DgSqlCode(-EXE_CANCEL_PROCESS_NOT_FOUND); *diagsArea << DgString0("$ZSM000"); reportError(diagsArea); step_ = DONE; break; } ComDiagsArea *diagsArea = NULL; bool fakeError2024 = false; fakeError2024 = (getenv("HP_FAKE_ERROR_2024") != NULL); if (fakeError2024) { cbServer_ = NULL; diagsArea = ComDiagsArea::allocate(getGlobals()->getDefaultHeap()); if (getenv("HP_FAKE_ERROR_8142")) { *diagsArea << DgSqlCode(-8142); *diagsArea << DgString0(__FILE__); *diagsArea << DgString1("cbServer_ is 
NULL"); } else *diagsArea << DgSqlCode(-2024); } else cbServer_ = cliGlobals->getCbServerClass()->allocateServerProcess( &diagsArea, cliGlobals->getEnvironment()->getHeap(), nodeName_, cpu_, IPC_PRIORITY_DONT_CARE, FALSE, // usesTransactions TRUE, // waitedCreation 2 // maxNowaitRequests -- cancel+(1 extra). ); if (cbServer_ == NULL || cbServer_->getControlConnection() == NULL) { ex_assert(diagsArea != NULL, "allocateServerProcess failed, but no diags"); // look for SQLCode 2024 // "*** ERROR[2024] Server Process $0~string0 // is not running or could not be created. Operating System // Error $1~int0 was returned." // Remap to cancel-specfic error 8028. if (diagsArea->contains(-2024) && cancelTdb().actionIsCancel()) { diagsArea->deleteError(diagsArea->returnIndex(-2024)); reportError(diagsArea, true, EXE_CANCEL_PROCESS_NOT_FOUND, nodeName_, cpu_); } else reportError(diagsArea); step_ = DONE; break; } // the reportError method was not called -- see break above. if (diagsArea != NULL) diagsArea->decrRefCount(); //Create the stream on the IpcHeap, since we don't dispose // of it immediately. We just add it to the list of completed // messages in the IpcEnv, and it is disposed of later. 
cancelStream_ = new (cliGlobals->getIpcHeap()) CancelMsgStream(cliGlobals->getEnvironment(), this); cancelStream_->addRecipient(cbServer_->getControlConnection()); } step_ = SEND_MESSAGE; break; } // end case NOT_STARTED #pragma warning (disable : 4291) case SEND_MESSAGE: { RtsHandle rtsHandle = (RtsHandle) this; if (cancelTdb().actionIsCancel()) { Int64 cancelStartTime = JULIANTIMESTAMP(); Lng32 firstEscalationInterval = cliGlobals->currContext()-> getSessionDefaults()->getCancelEscalationInterval(); Lng32 secondEscalationInterval = cliGlobals->currContext()-> getSessionDefaults()->getCancelEscalationMxosrvrInterval(); NABoolean cancelEscalationSaveabend = cliGlobals->currContext()-> getSessionDefaults()->getCancelEscalationSaveabend(); bool cancelLogging = (TRUE == cliGlobals->currContext()-> getSessionDefaults()->getCancelLogging()); CancelQueryRequest *cancelMsg = new (cliGlobals->getIpcHeap()) CancelQueryRequest(rtsHandle, cliGlobals->getIpcHeap(), cancelStartTime, firstEscalationInterval, secondEscalationInterval, cancelEscalationSaveabend, cancelTdb().getCommentText(), str_len(cancelTdb().getCommentText()), cancelLogging, cancelTdb().action_ != ComTdbCancel::CancelByQid, pid_, cancelTdb().getCancelPidBlockThreshold()); #pragma warning (default : 4291) *cancelStream_ << *cancelMsg; cancelMsg->decrRefCount(); } else if (ComTdbCancel::Suspend == cancelTdb().action_) { bool suspendLogging = (TRUE == cliGlobals->currContext()-> getSessionDefaults()->getSuspendLogging()); #pragma warning (disable : 4291) SuspendQueryRequest * suspendMsg = new (cliGlobals->getIpcHeap()) SuspendQueryRequest(rtsHandle, cliGlobals->getIpcHeap(), ComTdbCancel::Force == cancelTdb().forced_, suspendLogging); #pragma warning (default : 4291) *cancelStream_ << *suspendMsg; suspendMsg->decrRefCount(); } else { ex_assert( ComTdbCancel::Activate == cancelTdb().action_, "invalid action for ExCancelTcb"); bool suspendLogging = (TRUE == cliGlobals->currContext()-> 
getSessionDefaults()->getSuspendLogging()); #pragma warning (disable : 4291) ActivateQueryRequest * activateMsg = new (cliGlobals->getIpcHeap()) ActivateQueryRequest(rtsHandle, cliGlobals->getIpcHeap(), suspendLogging); #pragma warning (default : 4291) *cancelStream_ << *activateMsg; activateMsg->decrRefCount(); } if ((cancelTdb().getAction() != ComTdbCancel::CancelByPname) && (cancelTdb().getAction() != ComTdbCancel::CancelByNidPid)) { char * qid = cancelTdb().qid_; Lng32 qid_len = str_len(qid); #pragma warning (disable : 4291) RtsQueryId *rtsQueryId = new (cliGlobals->getIpcHeap()) RtsQueryId( cliGlobals->getIpcHeap(), qid, qid_len); #pragma warning (default : 4291) *cancelStream_ << *rtsQueryId; rtsQueryId->decrRefCount(); } // send a no-wait request to the cancel broker. cancelStream_->send(FALSE); step_ = GET_REPLY; // Come back when I/O completes. return WORK_OK; break; } // end case SEND_MESSAGE case GET_REPLY: { // Handle general IPC error. bool fakeError201 = false; fakeError201 = (getenv("HP_FAKE_ERROR_201") != NULL); if ((cbServer_->getControlConnection()->getErrorInfo() != 0) || fakeError201) { ComDiagsArea *diagsArea = ComDiagsArea::allocate(getGlobals()->getDefaultHeap()); cbServer_->getControlConnection()-> populateDiagsArea( diagsArea, getGlobals()->getDefaultHeap()); if (fakeError201) { *diagsArea << DgSqlCode(-2034) << DgInt0(201) << DgString0("I say") << DgString1("control broker"); } if (diagsArea->contains(-8921)) { // Should not get timeout error 8921. Get a core-file // of the SSMP and this process too so that this can be // debugged. cbServer_->getControlConnection()-> dumpAndStopOtherEnd(true, false); genLinuxCorefile("Unexpected timeout error"); } reportError(diagsArea); step_ = DONE; break; } // See if stream has the reply yet. 
if (!cancelStream_->moreObjects()) return WORK_OK; #pragma warning (disable : 4291) ControlQueryReply *reply = new (cliGlobals->getIpcHeap()) ControlQueryReply(INVALID_RTS_HANDLE, cliGlobals->getIpcHeap()); #pragma warning (default : 4291) *cancelStream_ >> *reply; if (reply->didAttemptControl()) { // yeaah! cancelStream_->clearAllObjects(); } else { if (cancelStream_->moreObjects() && cancelStream_->getNextObjType() == IPC_SQL_DIAG_AREA) { ComDiagsArea *diagsArea = ComDiagsArea::allocate(getGlobals()->getDefaultHeap()); *cancelStream_ >> *diagsArea; cancelStream_->clearAllObjects(); if ( retryQidNotActive_ && (diagsArea->mainSQLCODE() == -EXE_SUSPEND_QID_NOT_ACTIVE) && (++retryCount_ <= 60)) { SQLMXLoggingArea::logExecRtInfo(__FILE__, __LINE__, "Retrying error 8672.", 0); DELAY(500); diagsArea->decrRefCount(); step_ = SEND_MESSAGE; break; } reportError(diagsArea); } else ex_assert(0, "Control failed, but no diagnostics."); } step_ = DONE; break; } case DONE: { if (cancelStream_) { cancelStream_->addToCompletedList(); cancelStream_ = NULL; } if (cbServer_) { cbServer_->release(); cbServer_ = NULL; } ex_queue_entry * up_entry = qparent_.up->getTailEntry(); up_entry->copyAtp(pentry_down); up_entry->upState.parentIndex = pentry_down->downState.parentIndex; up_entry->upState.downIndex = qparent_.down->getHeadIndex(); up_entry->upState.setMatchNo(1); up_entry->upState.status = ex_queue::Q_NO_DATA; qparent_.up->insert(); qparent_.down->removeHead(); step_ = NOT_STARTED; break; } default: ex_assert( 0, "Unknown step_."); }