// ---------------------------------------------------------------------
// runESP: set up the per-process CLI state for an ESP, perform ESP
// startup, and then run the work/wait loop until no master is attached
// to this process any longer.
//
// Parameters:
//   argc, argv           - passed through unchanged to DoEspStartup().
//   guaReceiveFastStart  - passed through unchanged to DoEspStartup().
//
// Returns 0 once the fragment instance directory reports that no
// masters remain.
// ---------------------------------------------------------------------
Int32 runESP(Int32 argc, char** argv, GuaReceiveFastStart *guaReceiveFastStart)
{
  XCONTROLMESSAGESYSTEM(XCTLMSGSYS_SETRECVLIMIT, XMAX_SETTABLE_RECVLIMIT_H);

  // Create the ESP global data. TRUE indicates a non-master process
  // (WAIT on LREC).
  CliGlobals *cliGlobals = CliGlobals::createCliGlobals(TRUE);
  if (!cliGlobals)
    {
      // Sanity check failed: abend.
      NAExit(1);
    }

  // NOTE(review): shmid is handed over uninitialized, so it is presumably
  // an output argument of shareStatsSegment() -- confirm against its
  // declaration.
  Int32 shmid;
  StatsGlobals *statsGlobals = shareStatsSegment(shmid);
  cliGlobals->setSharedMemId(shmid);
  cliGlobals->setIsESPProcess(TRUE);

  NAHeap *espExecutorHeap = cliGlobals->getExecutorMemory();

  // The default context must be created only after the IpcEnvironment
  // has been set in CliGlobals, because the context's ExSqlComp object
  // needs the IpcEnvironment.
  cliGlobals->initiateDefaultContext();

  NAHeap *espIpcHeap = cliGlobals->getIpcHeap();
  IpcEnvironment *ipcEnv = cliGlobals->getEnvironment();

  if (statsGlobals == NULL)
    {
      // No stats globals available: start our own memory monitor for
      // dynamic memory management, allocated on the executor heap.
      Lng32 windowSize = 10;
      Lng32 sampleInterval = 10;
      MemoryMonitor *ownMonitor =
        new (espExecutorHeap) MemoryMonitor(windowSize,
                                            sampleInterval,
                                            espExecutorHeap);
      cliGlobals->setMemoryMonitor(ownMonitor);
    }
  else
    {
      // Use the memory monitor provided by the stats globals.
      cliGlobals->setMemoryMonitor(statsGlobals->getMemoryMonitor());
    }

  // CLI globals are initialized now, and ESP message processing has not
  // begun yet: let the CLI context derive its user identity from the OS
  // user identity.
  ContextCli *currentContext = cliGlobals->currContext();
  ex_assert(currentContext, "Invalid context pointer");
  currentContext->initializeUserInfoFromOS();

  ExEspFragInstanceDir espFragInstanceDir(cliGlobals,
                                          espExecutorHeap,
                                          statsGlobals);

  ExEspControlMessage espIpcControlMessage(&espFragInstanceDir,
                                           ipcEnv,
                                           espIpcHeap);

  // Handle startup (command line args, control connection).
  DoEspStartup(argc, argv, *ipcEnv, espFragInstanceDir, guaReceiveFastStart);

  // The control message stream talks through the control connection.
  espIpcControlMessage.addRecipient(
    ipcEnv->getControlConnection()->getConnection());

  // Start the first receive operation.
  espIpcControlMessage.receive(FALSE);

  NABoolean timedOut;
  Int64 lastWaitTime = 0;

  // Keep going for as long as there are requesters.
  while (espFragInstanceDir.getNumMasters() > 0)
    {
      // ---------------------------------------------------------------
      // The ESP's most important line of code: DO THE WORK.
      // ---------------------------------------------------------------
      espFragInstanceDir.work(lastWaitTime);

      // ---------------------------------------------------------------
      // The frag instance dir work procedure works until it is blocked,
      // so after it returns we have to wait for some I/O.
      // The TRUE argument means: called by ESP main.
      // ---------------------------------------------------------------
      ipcEnv->getAllConnections()->waitOnAll(IpcInfiniteTimeout,
                                             TRUE,
                                             &timedOut,
                                             &lastWaitTime);
    }

  // Nobody wants us anymore; right now that means that we stop.
  return 0;
}
ExWorkProcRetcode ExCancelTcb::work() { ExMasterStmtGlobals *masterGlobals = getGlobals()->castToExExeStmtGlobals()->castToExMasterStmtGlobals(); CliGlobals *cliGlobals = masterGlobals->getCliGlobals(); while ((qparent_.down->isEmpty() == FALSE) && (qparent_.up->isFull() == FALSE)) { ex_queue_entry *pentry_down = qparent_.down->getHeadEntry(); switch (step_) { case NOT_STARTED: { if (pentry_down->downState.request == ex_queue::GET_NOMORE) step_ = DONE; else { retryCount_ = 0; // Priv checking is done during compilation. To support // REVOKE, prevent a prepared CANCEL/SUSPEND/ACTIVATE // that was compiled more than 1 second ago from executing // by raising the 8734 error to force an AQR. Int64 microSecondsSinceCompile = NA_JulianTimestamp() - masterGlobals->getStatement()->getCompileEndTime(); if (microSecondsSinceCompile > 1000*1000) { ComDiagsArea *diagsArea = ComDiagsArea::allocate(getGlobals()->getDefaultHeap()); *diagsArea << DgSqlCode(-CLI_INVALID_QUERY_PRIVS); reportError(diagsArea); step_ = DONE; break; } // Figure out which MXSSMP broker to use. if (cancelTdb().getAction() == ComTdbCancel::CancelByPname) { int nid = -1; int rc = msg_mon_get_process_info(cancelTdb().getCancelPname(), &nid, &pid_); switch (rc) { case XZFIL_ERR_OK: cpu_ = (short) nid; break; case XZFIL_ERR_NOTFOUND: case XZFIL_ERR_BADNAME: case XZFIL_ERR_NOSUCHDEV: { ComDiagsArea *diagsArea = ComDiagsArea::allocate(getGlobals()->getDefaultHeap()); *diagsArea << DgSqlCode(-EXE_CANCEL_PROCESS_NOT_FOUND); *diagsArea << DgString0(cancelTdb().getCancelPname()); reportError(diagsArea); step_ = DONE; break; } default: { char buf[200]; str_sprintf(buf, "Unexpected error %d returned from " "msg_mon_get_process_info", rc); ex_assert(0, buf); } } if (step_ != NOT_STARTED) break; } else if (cancelTdb().getAction() == ComTdbCancel::CancelByNidPid) { cpu_ = (short) cancelTdb().getCancelNid(); pid_ = cancelTdb().getCancelPid(); // check that process exists, if not report error. 
char processName[MS_MON_MAX_PROCESS_NAME]; int rc = msg_mon_get_process_name(cpu_, pid_, processName); if (XZFIL_ERR_OK == rc) ; // good. nid & pid are valid. else { if ((XZFIL_ERR_NOTFOUND != rc) && (XZFIL_ERR_BADNAME != rc) && (XZFIL_ERR_NOSUCHDEV != rc)) { // Log rc in case it needs investigation later. char buf[200]; str_sprintf(buf, "Unexpected error %d returned from " "msg_mon_get_process_name", rc); SQLMXLoggingArea::logExecRtInfo(__FILE__, __LINE__, buf, 0); } char nid_pid_str[32]; str_sprintf(nid_pid_str, "%d, %d", cpu_, pid_); ComDiagsArea *diagsArea = ComDiagsArea::allocate(getGlobals()->getDefaultHeap()); *diagsArea << DgSqlCode(-EXE_CANCEL_PROCESS_NOT_FOUND); *diagsArea << DgString0(nid_pid_str); reportError(diagsArea); step_ = DONE; break; } } else { char * qid = cancelTdb().qid_; Lng32 qid_len = str_len(qid); // This static method is defined in SqlStats.cpp. It side-effects // the nodeName and cpu_ according to the input qid. if (getMasterCpu( qid, qid_len, nodeName_, sizeof(nodeName_) - 1, cpu_) == -1) { ComDiagsArea *diagsArea = ComDiagsArea::allocate(getGlobals()->getDefaultHeap()); *diagsArea << DgSqlCode(-EXE_RTS_INVALID_QID); reportError(diagsArea); step_ = DONE; break; } } // Testpoints for hard to reproduce problems: bool fakeError8028 = false; fakeError8028 = (getenv("HP_FAKE_ERROR_8028") != NULL); if ((cliGlobals->getCbServerClass() == NULL) || fakeError8028) { ComDiagsArea *diagsArea = ComDiagsArea::allocate(getGlobals()->getDefaultHeap()); *diagsArea << DgSqlCode(-EXE_CANCEL_PROCESS_NOT_FOUND); *diagsArea << DgString0("$ZSM000"); reportError(diagsArea); step_ = DONE; break; } ComDiagsArea *diagsArea = NULL; bool fakeError2024 = false; fakeError2024 = (getenv("HP_FAKE_ERROR_2024") != NULL); if (fakeError2024) { cbServer_ = NULL; diagsArea = ComDiagsArea::allocate(getGlobals()->getDefaultHeap()); if (getenv("HP_FAKE_ERROR_8142")) { *diagsArea << DgSqlCode(-8142); *diagsArea << DgString0(__FILE__); *diagsArea << DgString1("cbServer_ is 
NULL"); } else *diagsArea << DgSqlCode(-2024); } else cbServer_ = cliGlobals->getCbServerClass()->allocateServerProcess( &diagsArea, cliGlobals->getEnvironment()->getHeap(), nodeName_, cpu_, IPC_PRIORITY_DONT_CARE, FALSE, // usesTransactions TRUE, // waitedCreation 2 // maxNowaitRequests -- cancel+(1 extra). ); if (cbServer_ == NULL || cbServer_->getControlConnection() == NULL) { ex_assert(diagsArea != NULL, "allocateServerProcess failed, but no diags"); // look for SQLCode 2024 // "*** ERROR[2024] Server Process $0~string0 // is not running or could not be created. Operating System // Error $1~int0 was returned." // Remap to cancel-specfic error 8028. if (diagsArea->contains(-2024) && cancelTdb().actionIsCancel()) { diagsArea->deleteError(diagsArea->returnIndex(-2024)); reportError(diagsArea, true, EXE_CANCEL_PROCESS_NOT_FOUND, nodeName_, cpu_); } else reportError(diagsArea); step_ = DONE; break; } // the reportError method was not called -- see break above. if (diagsArea != NULL) diagsArea->decrRefCount(); //Create the stream on the IpcHeap, since we don't dispose // of it immediately. We just add it to the list of completed // messages in the IpcEnv, and it is disposed of later. 
cancelStream_ = new (cliGlobals->getIpcHeap()) CancelMsgStream(cliGlobals->getEnvironment(), this); cancelStream_->addRecipient(cbServer_->getControlConnection()); } step_ = SEND_MESSAGE; break; } // end case NOT_STARTED #pragma warning (disable : 4291) case SEND_MESSAGE: { RtsHandle rtsHandle = (RtsHandle) this; if (cancelTdb().actionIsCancel()) { Int64 cancelStartTime = JULIANTIMESTAMP(); Lng32 firstEscalationInterval = cliGlobals->currContext()-> getSessionDefaults()->getCancelEscalationInterval(); Lng32 secondEscalationInterval = cliGlobals->currContext()-> getSessionDefaults()->getCancelEscalationMxosrvrInterval(); NABoolean cancelEscalationSaveabend = cliGlobals->currContext()-> getSessionDefaults()->getCancelEscalationSaveabend(); bool cancelLogging = (TRUE == cliGlobals->currContext()-> getSessionDefaults()->getCancelLogging()); CancelQueryRequest *cancelMsg = new (cliGlobals->getIpcHeap()) CancelQueryRequest(rtsHandle, cliGlobals->getIpcHeap(), cancelStartTime, firstEscalationInterval, secondEscalationInterval, cancelEscalationSaveabend, cancelTdb().getCommentText(), str_len(cancelTdb().getCommentText()), cancelLogging, cancelTdb().action_ != ComTdbCancel::CancelByQid, pid_, cancelTdb().getCancelPidBlockThreshold()); #pragma warning (default : 4291) *cancelStream_ << *cancelMsg; cancelMsg->decrRefCount(); } else if (ComTdbCancel::Suspend == cancelTdb().action_) { bool suspendLogging = (TRUE == cliGlobals->currContext()-> getSessionDefaults()->getSuspendLogging()); #pragma warning (disable : 4291) SuspendQueryRequest * suspendMsg = new (cliGlobals->getIpcHeap()) SuspendQueryRequest(rtsHandle, cliGlobals->getIpcHeap(), ComTdbCancel::Force == cancelTdb().forced_, suspendLogging); #pragma warning (default : 4291) *cancelStream_ << *suspendMsg; suspendMsg->decrRefCount(); } else { ex_assert( ComTdbCancel::Activate == cancelTdb().action_, "invalid action for ExCancelTcb"); bool suspendLogging = (TRUE == cliGlobals->currContext()-> 
getSessionDefaults()->getSuspendLogging()); #pragma warning (disable : 4291) ActivateQueryRequest * activateMsg = new (cliGlobals->getIpcHeap()) ActivateQueryRequest(rtsHandle, cliGlobals->getIpcHeap(), suspendLogging); #pragma warning (default : 4291) *cancelStream_ << *activateMsg; activateMsg->decrRefCount(); } if ((cancelTdb().getAction() != ComTdbCancel::CancelByPname) && (cancelTdb().getAction() != ComTdbCancel::CancelByNidPid)) { char * qid = cancelTdb().qid_; Lng32 qid_len = str_len(qid); #pragma warning (disable : 4291) RtsQueryId *rtsQueryId = new (cliGlobals->getIpcHeap()) RtsQueryId( cliGlobals->getIpcHeap(), qid, qid_len); #pragma warning (default : 4291) *cancelStream_ << *rtsQueryId; rtsQueryId->decrRefCount(); } // send a no-wait request to the cancel broker. cancelStream_->send(FALSE); step_ = GET_REPLY; // Come back when I/O completes. return WORK_OK; break; } // end case SEND_MESSAGE case GET_REPLY: { // Handle general IPC error. bool fakeError201 = false; fakeError201 = (getenv("HP_FAKE_ERROR_201") != NULL); if ((cbServer_->getControlConnection()->getErrorInfo() != 0) || fakeError201) { ComDiagsArea *diagsArea = ComDiagsArea::allocate(getGlobals()->getDefaultHeap()); cbServer_->getControlConnection()-> populateDiagsArea( diagsArea, getGlobals()->getDefaultHeap()); if (fakeError201) { *diagsArea << DgSqlCode(-2034) << DgInt0(201) << DgString0("I say") << DgString1("control broker"); } if (diagsArea->contains(-8921)) { // Should not get timeout error 8921. Get a core-file // of the SSMP and this process too so that this can be // debugged. cbServer_->getControlConnection()-> dumpAndStopOtherEnd(true, false); genLinuxCorefile("Unexpected timeout error"); } reportError(diagsArea); step_ = DONE; break; } // See if stream has the reply yet. 
if (!cancelStream_->moreObjects()) return WORK_OK; #pragma warning (disable : 4291) ControlQueryReply *reply = new (cliGlobals->getIpcHeap()) ControlQueryReply(INVALID_RTS_HANDLE, cliGlobals->getIpcHeap()); #pragma warning (default : 4291) *cancelStream_ >> *reply; if (reply->didAttemptControl()) { // yeaah! cancelStream_->clearAllObjects(); } else { if (cancelStream_->moreObjects() && cancelStream_->getNextObjType() == IPC_SQL_DIAG_AREA) { ComDiagsArea *diagsArea = ComDiagsArea::allocate(getGlobals()->getDefaultHeap()); *cancelStream_ >> *diagsArea; cancelStream_->clearAllObjects(); if ( retryQidNotActive_ && (diagsArea->mainSQLCODE() == -EXE_SUSPEND_QID_NOT_ACTIVE) && (++retryCount_ <= 60)) { SQLMXLoggingArea::logExecRtInfo(__FILE__, __LINE__, "Retrying error 8672.", 0); DELAY(500); diagsArea->decrRefCount(); step_ = SEND_MESSAGE; break; } reportError(diagsArea); } else ex_assert(0, "Control failed, but no diagnostics."); } step_ = DONE; break; } case DONE: { if (cancelStream_) { cancelStream_->addToCompletedList(); cancelStream_ = NULL; } if (cbServer_) { cbServer_->release(); cbServer_ = NULL; } ex_queue_entry * up_entry = qparent_.up->getTailEntry(); up_entry->copyAtp(pentry_down); up_entry->upState.parentIndex = pentry_down->downState.parentIndex; up_entry->upState.downIndex = qparent_.down->getHeadIndex(); up_entry->upState.setMatchNo(1); up_entry->upState.status = ex_queue::Q_NO_DATA; qparent_.up->insert(); qparent_.down->removeHead(); step_ = NOT_STARTED; break; } default: ex_assert( 0, "Unknown step_."); }