// Main routine of an ESP process.
//
// Sets up the process-wide CLI globals, obtains the stats globals through
// shareStatsSegment(), installs a memory monitor, performs ESP startup
// (command line arguments and control connection) and then alternates
// between doing fragment-instance work and waiting for I/O for as long as
// the fragment instance directory reports at least one master.
//
// argc, argv           - command line, handed through to DoEspStartup()
// guaReceiveFastStart  - handed through to DoEspStartup()
//
// Returns 0 once no master is left. Calls NAExit(1) if the CLI globals
// could not be created.
Int32 runESP(Int32 argc, char** argv, GuaReceiveFastStart *guaReceiveFastStart)
{
  XCONTROLMESSAGESYSTEM(XCTLMSGSYS_SETRECVLIMIT, XMAX_SETTABLE_RECVLIMIT_H);

  // ---------------------------------------------------------------------
  // ESP global data
  // ---------------------------------------------------------------------

  // TRUE indicates a non-master process (WAIT on LREC)
  CliGlobals *globals = CliGlobals::createCliGlobals(TRUE);
  if (!globals)
    NAExit(1); // sanity check failed: abend

  // shareStatsSegment() fills in the shared memory id as a side effect
  Int32 segmentId;
  StatsGlobals *sharedStats = shareStatsSegment(segmentId);
  globals->setSharedMemId(segmentId);
  globals->setIsESPProcess(TRUE);

  NAHeap *executorHeap = globals->getExecutorMemory();

  // The default context may only be created once the IpcEnvironment has
  // been set in CliGlobals, because the context's ExSqlComp object needs
  // the IpcEnvironment.
  globals->initiateDefaultContext();
  NAHeap *ipcHeap = globals->getIpcHeap();
  IpcEnvironment *ipcEnv = globals->getEnvironment();

  if (sharedStats == NULL)
  {
     // No stats globals: start our own memory monitor for dynamic
     // memory management, allocated on the executor heap.
     Lng32 windowSize = 10;
     Lng32 sampleInterval = 10;
     globals->setMemoryMonitor(
          new (executorHeap) MemoryMonitor(windowSize,
                                           sampleInterval,
                                           executorHeap));
  }
  else
     globals->setMemoryMonitor(sharedStats->getMemoryMonitor());

  // With the CLI globals initialized, and before any ESP message is
  // processed, let the CLI context derive its user identity from the
  // OS user identity.
  ContextCli *currentCtx = globals->currContext();
  ex_assert(currentCtx, "Invalid context pointer");
  currentCtx->initializeUserInfoFromOS();

  // ---------------------------------------------------------------------
  // Fragment instance directory and control message stream
  // ---------------------------------------------------------------------
  ExEspFragInstanceDir fragDir(globals, executorHeap, sharedStats);
  ExEspControlMessage controlStream(&fragDir, ipcEnv, ipcHeap);

  // startup handling: command line args, control connection
  DoEspStartup(argc, argv, *ipcEnv, fragDir, guaReceiveFastStart);

  // the control message stream talks through the control connection
  controlStream.addRecipient(
       ipcEnv->getControlConnection()->getConnection());

  // post the first receive operation
  controlStream.receive(FALSE);

  NABoolean timedOut;
  Int64 lastWaitTime = 0;

  // ---------------------------------------------------------------------
  // Main loop: keep going while there are requesters
  // ---------------------------------------------------------------------
  while (fragDir.getNumMasters() > 0)
    {
      // DO THE WORK: the frag instance dir work procedure works until
      // it is blocked ...
      fragDir.work(lastWaitTime);

      // ... after which we have to wait for some I/O.
      // TRUE means: called by ESP main
      ipcEnv->getAllConnections()->waitOnAll(IpcInfiniteTimeout,
                                             TRUE,
                                             &timedOut,
                                             &lastWaitTime);
    }

  // nobody wants us anymore, right now that means that we stop
  return 0;
}
Example #2
0
ExWorkProcRetcode ExCancelTcb::work()
{

  ExMasterStmtGlobals *masterGlobals = 
     getGlobals()->castToExExeStmtGlobals()->castToExMasterStmtGlobals();

  CliGlobals *cliGlobals = masterGlobals->getCliGlobals();

  while ((qparent_.down->isEmpty() == FALSE) && 
         (qparent_.up->isFull() == FALSE))
  {
    ex_queue_entry *pentry_down = qparent_.down->getHeadEntry();
  
    switch (step_)
    {
      case NOT_STARTED:
      {
        if (pentry_down->downState.request == ex_queue::GET_NOMORE)
          step_ = DONE;
        else
        {
          retryCount_ = 0;
          // Priv checking is done during compilation. To support 
          // REVOKE, prevent a prepared CANCEL/SUSPEND/ACTIVATE
          // that was compiled more than 1 second ago from executing 
          // by raising the 8734 error to force an AQR. 
          Int64 microSecondsSinceCompile = NA_JulianTimestamp() - 
              masterGlobals->getStatement()->getCompileEndTime();

          if (microSecondsSinceCompile > 1000*1000)
          {

            ComDiagsArea *diagsArea =
              ComDiagsArea::allocate(getGlobals()->getDefaultHeap());
            *diagsArea << DgSqlCode(-CLI_INVALID_QUERY_PRIVS);
            reportError(diagsArea);
            step_ = DONE;
            break;
          }
          
          // Figure out which MXSSMP broker to use.
          if (cancelTdb().getAction() == ComTdbCancel::CancelByPname)
          {
            int nid = -1;
            int rc = msg_mon_get_process_info(cancelTdb().getCancelPname(),
                                &nid, &pid_);
            switch (rc)
            {
              case XZFIL_ERR_OK:
                cpu_ = (short) nid;
                break;
              case XZFIL_ERR_NOTFOUND:
              case XZFIL_ERR_BADNAME:
              case XZFIL_ERR_NOSUCHDEV:
                {
                  ComDiagsArea *diagsArea =
                    ComDiagsArea::allocate(getGlobals()->getDefaultHeap());

                  *diagsArea << DgSqlCode(-EXE_CANCEL_PROCESS_NOT_FOUND);
                  *diagsArea << DgString0(cancelTdb().getCancelPname());
                  reportError(diagsArea);

                  step_ = DONE;
                  break;
                }
              default:
                {
                  char buf[200];
                  str_sprintf(buf, "Unexpected error %d returned from "
                                   "msg_mon_get_process_info", rc);
                  ex_assert(0, buf);
                }
            }
            if (step_ != NOT_STARTED)
              break;
          }
          else if  (cancelTdb().getAction() == ComTdbCancel::CancelByNidPid)
          {
            cpu_ = (short) cancelTdb().getCancelNid();
            pid_ = cancelTdb().getCancelPid();

            // check that process exists, if not report error.
            char processName[MS_MON_MAX_PROCESS_NAME];
            int rc = msg_mon_get_process_name(cpu_, pid_, processName);
            if (XZFIL_ERR_OK == rc)
              ; // good. nid & pid are valid.
            else
            {
              if ((XZFIL_ERR_NOTFOUND  != rc) &&
                  (XZFIL_ERR_BADNAME   != rc) &&
                  (XZFIL_ERR_NOSUCHDEV != rc))
              {
                // Log rc in case it needs investigation later.
               char buf[200];
               str_sprintf(buf, "Unexpected error %d returned from "
                                "msg_mon_get_process_name", rc);
               SQLMXLoggingArea::logExecRtInfo(__FILE__, __LINE__,
                                               buf, 0);
              }
              char nid_pid_str[32];
              str_sprintf(nid_pid_str, "%d, %d", cpu_, pid_);
              ComDiagsArea *diagsArea =
                    ComDiagsArea::allocate(getGlobals()->getDefaultHeap());

              *diagsArea << DgSqlCode(-EXE_CANCEL_PROCESS_NOT_FOUND);
              *diagsArea << DgString0(nid_pid_str);
              reportError(diagsArea);

              step_ = DONE;
              break;
            }
          }
          else
          {
            char * qid = cancelTdb().qid_;
            Lng32 qid_len = str_len(qid);

            // This static method is defined in SqlStats.cpp.  It side-effects
            // the nodeName and cpu_ according to the input qid.
            if (getMasterCpu(
                  qid, qid_len, nodeName_, sizeof(nodeName_) - 1, cpu_) == -1)
            {
              ComDiagsArea *diagsArea =
                ComDiagsArea::allocate(getGlobals()->getDefaultHeap());

              *diagsArea << DgSqlCode(-EXE_RTS_INVALID_QID);

              reportError(diagsArea);

              step_ = DONE;
              break;
            }
          }

          // Testpoints for hard to reproduce problems:
          bool fakeError8028 = false;
          fakeError8028 = (getenv("HP_FAKE_ERROR_8028") != NULL);
          if ((cliGlobals->getCbServerClass() == NULL) || fakeError8028)
          {
            ComDiagsArea *diagsArea = 
              ComDiagsArea::allocate(getGlobals()->getDefaultHeap());

            *diagsArea << DgSqlCode(-EXE_CANCEL_PROCESS_NOT_FOUND);
            *diagsArea << DgString0("$ZSM000");

            reportError(diagsArea);

            step_ = DONE;
            break;
          }

          ComDiagsArea *diagsArea = NULL;
          bool fakeError2024 = false;
          fakeError2024 = (getenv("HP_FAKE_ERROR_2024") != NULL);
        
          if (fakeError2024)
          {
            cbServer_ = NULL;
            diagsArea =
                  ComDiagsArea::allocate(getGlobals()->getDefaultHeap());
            if (getenv("HP_FAKE_ERROR_8142"))
            {
               *diagsArea << DgSqlCode(-8142);
               *diagsArea << DgString0(__FILE__);
               *diagsArea << DgString1("cbServer_ is NULL");
            }
            else
               *diagsArea << DgSqlCode(-2024);
          }
          else
            cbServer_ = cliGlobals->getCbServerClass()->allocateServerProcess(
                      &diagsArea, 
                      cliGlobals->getEnvironment()->getHeap(),
                      nodeName_,
                      cpu_,
                      IPC_PRIORITY_DONT_CARE,
                      FALSE,  // usesTransactions 
                      TRUE,   // waitedCreation
                      2       // maxNowaitRequests -- cancel+(1 extra).
                      );


          if (cbServer_ == NULL || cbServer_->getControlConnection() == NULL)
          {
            ex_assert(diagsArea != NULL, 
                      "allocateServerProcess failed, but no diags");

            // look for SQLCode 2024 
            // "*** ERROR[2024] Server Process $0~string0 
            // is not running or could not be created. Operating System 
            // Error $1~int0 was returned."
            // Remap to cancel-specfic error 8028.
            if (diagsArea->contains(-2024)  &&
                cancelTdb().actionIsCancel())
            {
              diagsArea->deleteError(diagsArea->returnIndex(-2024));
              reportError(diagsArea, true, EXE_CANCEL_PROCESS_NOT_FOUND, 
                          nodeName_, cpu_);
            }
            else
              reportError(diagsArea);

            step_ = DONE;
            break;
          }

          // the reportError method was not called -- see break above.
          if (diagsArea != NULL)
            diagsArea->decrRefCount();

          //Create the stream on the IpcHeap, since we don't dispose 
          // of it immediately.  We just add it to the list of completed 
          // messages in the IpcEnv, and it is disposed of later.

          cancelStream_  = new (cliGlobals->getIpcHeap())
                CancelMsgStream(cliGlobals->getEnvironment(), this);

          cancelStream_->addRecipient(cbServer_->getControlConnection());

        }

        step_ = SEND_MESSAGE;

        break;
      }  // end case NOT_STARTED

#pragma warning (disable : 4291)

      case SEND_MESSAGE:
      {
        RtsHandle rtsHandle = (RtsHandle) this;

        if (cancelTdb().actionIsCancel())
        {
          Int64 cancelStartTime = JULIANTIMESTAMP();

          Lng32 firstEscalationInterval = cliGlobals->currContext()->
                    getSessionDefaults()->getCancelEscalationInterval();

          Lng32 secondEscalationInterval = cliGlobals->currContext()->
                    getSessionDefaults()->getCancelEscalationMxosrvrInterval();

          NABoolean cancelEscalationSaveabend = cliGlobals->currContext()->
                    getSessionDefaults()->getCancelEscalationSaveabend();

          bool cancelLogging = (TRUE == cliGlobals->currContext()->
                    getSessionDefaults()->getCancelLogging());

          CancelQueryRequest *cancelMsg = new (cliGlobals->getIpcHeap()) 
            CancelQueryRequest(rtsHandle, cliGlobals->getIpcHeap(), 
                      cancelStartTime,
                      firstEscalationInterval,
                      secondEscalationInterval,
                      cancelEscalationSaveabend,
                      cancelTdb().getCommentText(),
                      str_len(cancelTdb().getCommentText()),
                      cancelLogging,
                      cancelTdb().action_ != ComTdbCancel::CancelByQid,
                      pid_,
                      cancelTdb().getCancelPidBlockThreshold());

#pragma warning (default : 4291)

          *cancelStream_ << *cancelMsg;

          cancelMsg->decrRefCount();
        }
        else if (ComTdbCancel::Suspend == cancelTdb().action_)
        {

          bool suspendLogging = (TRUE == cliGlobals->currContext()->
                    getSessionDefaults()->getSuspendLogging());

#pragma warning (disable : 4291)
          SuspendQueryRequest * suspendMsg = new (cliGlobals->getIpcHeap()) 
            SuspendQueryRequest(rtsHandle, cliGlobals->getIpcHeap(),
                                ComTdbCancel::Force ==
                                cancelTdb().forced_,
                                suspendLogging);
#pragma warning (default : 4291)

          *cancelStream_ << *suspendMsg;

          suspendMsg->decrRefCount();
        }
        else
        {
          ex_assert(
            ComTdbCancel::Activate == cancelTdb().action_,
            "invalid action for ExCancelTcb");

          bool suspendLogging = (TRUE == cliGlobals->currContext()->
                    getSessionDefaults()->getSuspendLogging());

#pragma warning (disable : 4291)
          ActivateQueryRequest * activateMsg = new (cliGlobals->getIpcHeap()) 
            ActivateQueryRequest(rtsHandle, cliGlobals->getIpcHeap(),
                                 suspendLogging);
#pragma warning (default : 4291)

          *cancelStream_ << *activateMsg;

          activateMsg->decrRefCount();
        }

        if ((cancelTdb().getAction() != ComTdbCancel::CancelByPname) &&
            (cancelTdb().getAction() != ComTdbCancel::CancelByNidPid))
        {
          char * qid = cancelTdb().qid_;
          Lng32 qid_len = str_len(qid);

#pragma warning (disable : 4291)
          RtsQueryId *rtsQueryId = new (cliGlobals->getIpcHeap())
                           RtsQueryId( cliGlobals->getIpcHeap(), qid, qid_len);
#pragma warning (default : 4291)

          *cancelStream_ << *rtsQueryId;
          rtsQueryId->decrRefCount();
        }

        // send a no-wait request to the cancel broker.
        cancelStream_->send(FALSE);

        step_ = GET_REPLY;    
        // Come back when I/O completes.
        return WORK_OK; 

        break;
      }  // end case SEND_MESSAGE

      case GET_REPLY:
      {

        // Handle general IPC error.
        bool fakeError201 = false;
        fakeError201 = (getenv("HP_FAKE_ERROR_201") != NULL);
        if ((cbServer_->getControlConnection()->getErrorInfo() != 0) ||
            fakeError201)
        {
          ComDiagsArea *diagsArea = 
            ComDiagsArea::allocate(getGlobals()->getDefaultHeap());

          cbServer_->getControlConnection()->
              populateDiagsArea( diagsArea, getGlobals()->getDefaultHeap());

          if (fakeError201)
          {
            *diagsArea << DgSqlCode(-2034) << DgInt0(201)
                       << DgString0("I say") << DgString1("control broker");
          }

          if (diagsArea->contains(-8921))
          {
            // Should not get timeout error 8921. Get a core-file
            // of the SSMP and this process too so that this can be
            // debugged.
            cbServer_->getControlConnection()->
              dumpAndStopOtherEnd(true, false);
            genLinuxCorefile("Unexpected timeout error");
          }

          reportError(diagsArea);

          step_ = DONE;
          break;
        }

        // See if stream has the reply yet.
        if (!cancelStream_->moreObjects())
          return WORK_OK; 

#pragma warning (disable : 4291)

        ControlQueryReply *reply = new (cliGlobals->getIpcHeap()) 
              ControlQueryReply(INVALID_RTS_HANDLE, cliGlobals->getIpcHeap());

#pragma warning (default : 4291)

        *cancelStream_ >> *reply;

        if (reply->didAttemptControl())
        {
          // yeaah!
          cancelStream_->clearAllObjects();
        }
        else
        {
          if (cancelStream_->moreObjects() &&
              cancelStream_->getNextObjType() == IPC_SQL_DIAG_AREA)
          {
            ComDiagsArea *diagsArea = 
              ComDiagsArea::allocate(getGlobals()->getDefaultHeap());

            *cancelStream_ >> *diagsArea;
            cancelStream_->clearAllObjects();

            if ( retryQidNotActive_ &&
                (diagsArea->mainSQLCODE() == -EXE_SUSPEND_QID_NOT_ACTIVE) &&
                (++retryCount_ <= 60))
            {
              SQLMXLoggingArea::logExecRtInfo(__FILE__, __LINE__, 
                                             "Retrying error 8672.", 0);
              DELAY(500);
              diagsArea->decrRefCount();
              step_ = SEND_MESSAGE;
              break;
            }

            reportError(diagsArea);
          }
          else 
            ex_assert(0, "Control failed, but no diagnostics.");
        }

        step_ = DONE;
        break;
      }
      case DONE: 
      {
        if (cancelStream_)
        {
          cancelStream_->addToCompletedList();
          cancelStream_ = NULL;
        }
        if (cbServer_)
        {
          cbServer_->release();
          cbServer_ = NULL;
        }

        ex_queue_entry * up_entry = qparent_.up->getTailEntry();
        up_entry->copyAtp(pentry_down);
        up_entry->upState.parentIndex = pentry_down->downState.parentIndex;
        up_entry->upState.downIndex = qparent_.down->getHeadIndex();     
        up_entry->upState.setMatchNo(1);
        up_entry->upState.status = ex_queue::Q_NO_DATA;
        qparent_.up->insert();

        qparent_.down->removeHead();

        step_ = NOT_STARTED;
        break;
      }
      default:
        ex_assert( 0, "Unknown step_.");
    }