Пример #1
0
        /**
         * Called at the start of InitBootstrapConfiguration, this method is a last chance for a subclass to mess with the bootstrap configuration before it is used.
         *
         * On entry, bootstrapConfigPathname is the fully qualified name of the bootstrap file.
         *                bootstrapConfiguration is the raw (no macro expansion or override collapsing) configuration, or NULL if the configuration was not found.
         *
         * On exit, bootstrapConfiguration is the final raw (no macro expansion or override collapsing) bootstrap configuration. If NULL, initialization will fail.
         *
         * The default implementation does nothing.
         *
         * Returns false if initialization should fail.
         */
        virtual bool PreprocessBootstrapConfiguration(const char *bootstrapConfigPathname, Ptr<const IConfiguration>& bootstrapConfiguration)
        {
            if (bootstrapConfiguration == NULL) {
                // The bootstrap config file is missing -- build a default one
                DrStr64 strDataDirLocation;
                DrStr32 strRelDataDirLocation;
                strRelDataDirLocation.SetF(".\\DataDir.%u",  GetCurrentProcessId());
                DrError err = DrCanonicalizeFilePath(strDataDirLocation, strRelDataDirLocation);
                if (err != DrError_OK) {
                    DrLogE( "DryadConfigurationManager",
                        "Failed to canonicalize data directory name %s error=%s",
                        strRelDataDirLocation.GetString(), DRERRORSTRING(err));
                    return false;
                }
                
                Ptr<IMutableConfiguration> cfg = Configuration::GenerateDefaultBootstrapConfig(
                    strDataDirLocation.GetString(),
                    "...",
                    "default",
                    NULL);
                if (cfg == NULL) {
                    DrLogE( "DryadConfigurationManager",
                        "Failed to create default bootstrap file");
                    return false;
                }

                bootstrapConfiguration = cfg;
            }
            
            return true;
        }
Пример #2
0
//
// Sends a create-job request for the default job ticket to the "xcps" service
// of the default cluster and blocks until the response arrives.
//
// Asserts (LogAssert) if the response carries anything other than DrError_OK.
//
// Returns the job ticket detached from its DrRef holder.
// NOTE(review): presumably the caller takes over the reference -- confirm against callers.
//
DrJobTicket* CreateGlobalJob()
{
    DrError hr = S_OK;
    DrServiceDescriptor sd;
    DrRef<DrJobTicket> jobTicket = g_pDryadConfig->GetDefaultJobTicket();

    sd.Set("xcps", g_pDryadConfig->GetDefaultClusterName(), NULL, "rd.RDRBasic.XComputeProcessScheduler_0");

    //
    // Build the request: attach the ticket and set the job constraints
    //
    DrRef<XcPsCreateJobRequest> msg;
    msg.Attach(new XcPsCreateJobRequest());
    msg->SetCreateJobTicket(jobTicket);
    XcJobConstraint& constraint = msg->CreateJobConstraint();
    constraint.SetMaxConcurrentProcesses(999);
    constraint.SetMaxExecutionTime(DrTimeInterval_Hour);

    //
    // Send the request and wait for the response status
    //
    g_pDrClient->SendTo(msg, sd);
    msg->WaitForResponse( &hr );
    if (hr != DrError_OK)
    {
        DrLogE( "DryadConfigurationManager",
            "CreateJob failed, error=%s",
            DRERRORSTRING(hr));
        LogAssert(false);
    }
    return jobTicket.Detach();
}
Пример #3
0
        /**
         * Called immediately after attempted loading of the default configuration, this method is a last chance for a subclass to
         * mess with the default configuration before it is used.
         *
         * On entry, defaultConfigPathname is the fully qualified name of the default configuration.
         *                "configuration" is the default filtered view of the configuration, or NULL if the configuration was not found
         *                 rawConfiguration is the raw (no macro expansion or override collapsing) configuration, or NULL if the configuration was not found.
         *
         * On exit, rawConfiguration is the final raw (no macro expansion or override collapsing) default configuration. If NULL, initialization will fail.
         *
         * The default implementation does nothing.
         *
         * Returns false if initialization should fail.
         */
        virtual bool PreprocessDefaultConfiguration(
            const char *defaultConfigPathname,
            const IConfiguration *configuration,
            Ptr<const IConfiguration>& rawConfiguration)
        {
            // Create an editable version of the configuration
            Ptr<IMutableConfiguration> newConfig;
            if (configuration == NULL) {
                // The config file is missing -- build a default one
                newConfig = Configuration::GenerateDefaultConfig();
                if (newConfig == NULL) {
                    DrLogE( "DryadConfigurationManager",
                        "Failed to create default config file");
                    return false;
                }
                for (const char **ppDefaults = s_defaultParameters; *ppDefaults != NULL; ppDefaults += 3) {
                    const char *section = ppDefaults[0];
                    const char *param = ppDefaults[1];
                    LogAssert(param != NULL);
                    const char *value = ppDefaults[2];
                    LogAssert(value != NULL);
                    newConfig->SetParameter(section, param, value);
                }
            } else {
                newConfig = new ConfigurationMap(configuration);
                if (newConfig == NULL) {
                    DrLogE( "DryadConfigurationManager",
                        "Failed to create copy of config file");
                    return false;
                }
            }

            // process the command line to override values
            DrError err = ApplyDryadConfigOverrides(m_argc, m_argv, &m_nOpts, newConfig);
            if (err != DrError_OK) {
                DrLogE( "DryadConfigurationManager",
                    "Failed to apply command line overrides to config file: %s", err);
                return false;
            }

            // replace the configuration with the edited one. Note that overrides, etc. have already been applied and removed.
            rawConfiguration = newConfig;
            return true;
        }
Пример #4
0
//
// if $HPCQUERY_DEBUGVERTEXHOST is defined, break into the debugger
//
void BreakForDebugger()
{
    WCHAR strDebugBreak [MAX_PATH];
    HRESULT hr = DrGetEnvironmentVariable(L"HPCQUERY_DEBUGVERTEXHOST", strDebugBreak);
    if(hr == DrError_OK)
    {
            DrLogE("Waiting for debugger ");
            DrLogging::FlushLog(); 
            
            while (!IsDebuggerPresent()) 
            {
                Sleep(2000);
            }

            DebugBreak();
    }
}
Пример #5
0
//
// Interpret command received from GM
//
DrError DVertexPnController::ActOnCommand(DVertexCommandBlock* commandBlock)
{
    //
    // Honor a debug-break request carried by the command before doing anything else
    //
    if (commandBlock->GetDebugBreak())
    {
        ::DebugBreak();
    }

    DVertexCommand requested = commandBlock->GetVertexCommand();
    DrError result = DrError_OK;

    //
    // Dispatch the command while holding the controller's critical section
    //
    {
        AutoCriticalSection acs(&m_baseCS);

        switch (requested)
        {
        case DVertexCommand_Start:
        {
            //
            // A start request is only accepted while the vertex state is DrError_OK
            //
            if (m_currentStatus->GetVertexState() == DrError_OK)
            {
                DrLogI("Start command received.");
                Start(commandBlock);
            }
            else
            {
                result = DryadError_InvalidCommand;
            }
            break;
        }

        case DVertexCommand_ReOpenChannels:
        {
            //
            // todo: find out if this is ever used
            //
            DrLogI("Reopen Channels command received.");
            ReOpenChannels(commandBlock);
            break;
        }

        case DVertexCommand_Terminate:
        {
            //
            // todo: find out from Victor if this is ever used
            //
            DrError stateAtRequest = m_currentStatus->GetVertexState();
            DrLogI("Terminate command received.");
            if (!m_waitingForTermination)
            {
                //
                // No termination is pending yet, so start one now
                //
                DrLogD( "About to terminate");
                if (stateAtRequest != DryadError_VertexCompleted)
                {
                    //
                    // Vertex has not completed: this request is the cause of termination
                    //
                    Terminate(DryadError_VertexReceivedTermination,
                              DrExitCode_Killed);
                }
                else
                {
                    //
                    // Vertex already completed: report its own state with an OK exit code
                    //
                    Terminate(m_currentStatus->GetVertexState(),
                              DrExitCode_OK);
                }
            }

            //
            // Either way, the termination status is returned to the caller
            //
            result = DryadError_VertexReceivedTermination;
            break;
        }

        default:
        {
            //
            // Anything else is rejected as an invalid command
            //
            DrLogE("Unknown command received.");
            result = DryadError_InvalidCommand;
            break;
        }
        }
    }

    return result;
}
Пример #6
0
//
// Create files which contain information used to restart the upcoming vertex command
// Used for post-mortem debugging.
//
void DVertexPnController::DumpRestartCommand(DVertexCommandBlock* commandBlock)
{
    //
    // Writes three files named after the vertex id and instance version:
    //   vertex-<id>-<version>-rerun-data.dat         serialized command block
    //   vertex-<id>-<version>-rerun-originalInfo.txt input URIs, output URIs and arguments
    //   vertex-<id>-<version>-rerun.cmd              command line that replays the dump
    // Failures are logged and the function returns without creating the remaining files.
    //
    DrError err;

    //
    // Create temporary buffer
    //
    DrRef<DrSimpleHeapBuffer> buf;
    buf.Attach(new DrSimpleHeapBuffer());

    //
    // Write command block into buffer
    //
    {
        DrMemoryBufferWriter writer(buf);
        err = commandBlock->Serialize(&writer);
    }

    //
    // If write fails, log failure and return
    //
    if (err != DrError_OK)
    {
        DrLogE("Can't serialize command block for restart --- %s",
            DRERRORSTRING(err));
        return;
    }

    //
    // Get data reference and byte count
    //
    const void* serializedData;
    Size_t availableToRead;
    serializedData = buf->GetReadAddress(0, &availableToRead);
    LogAssert(availableToRead >= buf->GetAvailableSize());

    //
    // Get the process information 
    //
    DVertexProcessStatus* ps = commandBlock->GetProcessStatus();

    //
    // Build file for data required for rerun, open it
    //
    DrStr64 restartBlockName;
    restartBlockName.SetF("vertex-%u-%u-rerun-data.dat",
                          ps->GetVertexId(), ps->GetVertexInstanceVersion());
    FILE* fData = fopen(restartBlockName, "wb");
    if (fData == NULL)
    {
        //
        // If failed to open file, log and return
        //
        err = DrGetLastError();
        DrLogE(
            "Can't open re-run command block file '%s' --- %s",
            restartBlockName.GetString(), DRERRORSTRING(err));
        return;
    }

    //
    // Build file for original information required for rerun, open it
    //
    DrStr64 originalInfoName;
    originalInfoName.SetF("vertex-%u-%u-rerun-originalInfo.txt",
                          ps->GetVertexId(), ps->GetVertexInstanceVersion());
    FILE* fOriginalText = fopen(originalInfoName, "w");
    if (fOriginalText == NULL)
    {
        //
        // If failed to open file, log and return
        //
        err = DrGetLastError();
        DrLogE(
            "Can't open re-run original info file '%s' --- %s",
            originalInfoName.GetString(), DRERRORSTRING(err));

        //
        // Close data file
        //
        fclose(fData);
        return;
    }

    //
    // Build file for rerun command line, open it
    //
    DrStr64 originalRestartCommand;
    originalRestartCommand.SetF("vertex-%u-%u-rerun.cmd",
                                ps->GetVertexId(),
                                ps->GetVertexInstanceVersion());
    FILE* fOriginalRestart = fopen(originalRestartCommand, "w");
    if (fOriginalRestart == NULL)
    {
        //
        // If failed to open file, log and return
        //
        err = DrGetLastError();
        DrLogE(
            "Can't open re-run original command file '%s' --- %s",
            originalRestartCommand.GetString(), DRERRORSTRING(err));

        //
        // Close data and original text files
        //
        fclose(fData);
        fclose(fOriginalText);
        return;
    }

    //
    // Open file for local info
    //

    /* BUG 16322: Do not create this for SP3, since it is currently broken.
       Consider fixing for v4.
    DrStr64 localInfoName;
    localInfoName.SetF("vertex-%u-%u-rerun-localInfo.txt",
                       ps->GetVertexId(), ps->GetVertexInstanceVersion());
    FILE* fLocalText = fopen(localInfoName, "w");
    if (fLocalText == NULL)
    {
        //
        // If failed to open file, log and return
        //
        err = DrGetLastError();
        DrLogE(
            "Can't open re-run local info file '%s' --- %s",
            localInfoName.GetString(), DRERRORSTRING(err));

        //
        // Close data, cmd, and original text files
        //
        fclose(fData);
        fclose(fOriginalText);
        fclose(fOriginalRestart);
        return;
    }
    */


    //
    // Open file for rerun with local inputs
    //

    /* BUG 16322: Do not create this for SP3, since it is currently broken.
       Consider fixing for v4.
    DrStr64 localRestartCommand;
    localRestartCommand.SetF("vertex-%u-%u-rerun-local-inputs.cmd",
                             ps->GetVertexId(),
                             ps->GetVertexInstanceVersion());
    FILE* fLocalRestart = fopen(localRestartCommand, "w");
    if (fLocalRestart == NULL)
    {
        //
        // If failed to open file, log and return
        //
        err = DrGetLastError();
        DrLogE(
            "Can't open re-run local command file '%s' --- %s",
            localRestartCommand.GetString(), DRERRORSTRING(err));

        //
        // Close data, cmd, original, and local text files
        //
        fclose(fData);
        fclose(fOriginalText);
        fclose(fOriginalRestart);
        fclose(fLocalText);
        return;
    }
    */

    //
    // Open file for fetching inputs
    //
    /* BUG 16322: Do not create this for SP3, since it is currently broken.
       Consider fixing for v4.
    DrStr64 copyCommand;
    copyCommand.SetF("vertex-%u-%u-rerun-fetch-inputs.cmd",
                     ps->GetVertexId(), ps->GetVertexInstanceVersion());
    FILE* fCopyCommand = fopen(copyCommand, "w");
    if (fCopyCommand == NULL)
    {
        //
        // If failed to open file, log and return
        //
        err = DrGetLastError();
        DrLogE(
            "Can't open re-run copy command file '%s' --- %s",
            localRestartCommand.GetString(), DRERRORSTRING(err));

        //
        // Close data, original and localcmd, and original and local text files
        //
        fclose(fData);
        fclose(fOriginalText);
        fclose(fOriginalRestart);
        fclose(fLocalText);
        fclose(fLocalRestart);
        return;
    }
    */

    //
    // Write out data to data file, then close it.
    // This is the only place fData is closed on the success path; the handle is
    // cleared afterwards so it cannot be closed a second time further down.
    //
    size_t written = fwrite(serializedData, 1, buf->GetAvailableSize(), fData);
    fclose(fData);
    fData = NULL;
    if (written != buf->GetAvailableSize())
    {
        //
        // If failed to write all the data, log failure
        //
        err = DrGetLastError();
        DrLogE(
            "Failed to write re-run command block file '%s': only %Iu of %Iu bytes written --- %s",
            restartBlockName.GetString(),
            written, (size_t) (buf->GetAvailableSize()),
            DRERRORSTRING(err));
    }

    //
    // Write original restart command
    //
    fprintf(fOriginalRestart,
            "%s --cmd -dump %s -overridetext %s\n",
            m_parent->GetRunningExePathName(),
            restartBlockName.GetString(),
            originalInfoName.GetString());

    //
    // Write local restart command
    //
    /* BUG 16322: Do not create this for SP3, since it is currently broken.
       Consider fixing for v4.
    fprintf(fLocalRestart,
            "%s --vertex --cmd -dump %s -overridetext %s\n",
            m_parent->GetRunningExePathName(),
            restartBlockName.GetString(),
            localInfoName.GetString());
    */

    //
    // Record number of input files
    //
    fprintf(fOriginalText, "%u # input files\n", ps->GetInputChannelCount());

    /* BUG 16322: Do not create this for SP3, since it is currently broken.
       Consider fixing for v4.
    fprintf(fLocalText, "%u # input files\n", ps->GetInputChannelCount());
    */

    //
    // Get the input channels and record the URI of each one
    //
    DryadInputChannelDescription* inputs = ps->GetInputChannels();
    for (UInt32 i=0; i<ps->GetInputChannelCount(); ++i)
    {
        const char* uri = inputs[i].GetChannelURI();

        /* BUG 16322: Do not create this for SP3, since it is currently broken.
           Consider fixing for v4.
        if (::_strnicmp(uri, "file://", 7) == 0)
        {
            //
            // If reading from file, copy command doesn't want "file://" prefix
            // todo: remove reference to cosmos
            //
            fprintf(fCopyCommand, "cosmos.exe copy %s v%u.%u-i%u\n",
                    uri+7,
                    ps->GetVertexId(), ps->GetVertexInstanceVersion(), i);
        }
        else if (::_strnicmp(uri, "cosmos://", 9) == 0)
        {
            //
            // If reading from cosmos path, copy directly
            // todo: remove cosmos code
            //
            fprintf(fCopyCommand, "cosmos.exe copy %s v%u.%u-i%u\n",
                    uri,
                    ps->GetVertexId(), ps->GetVertexInstanceVersion(), i);
        }
        else
        {
            //
            // Otherwise, unable to copy
            //
            fprintf(fCopyCommand, "echo can't copy URI %s to v%u.%u-i%u\n",
                    uri,
                    ps->GetVertexId(), ps->GetVertexInstanceVersion(), i);
        }
        */

        // 
        // Add reference to this URI to original and relative reference to local
        //
        fprintf(fOriginalText, "%s\n", uri);

        /* BUG 16322: Do not create this for SP3, since it is currently broken.
           Consider fixing for v4.
        fprintf(fLocalText, "file://v%u.%u-i%u\n",
                ps->GetVertexId(), ps->GetVertexInstanceVersion(), i);
        */
    }

    //
    // Record number of output files
    //
    fprintf(fOriginalText, "%u # output files\n", ps->GetOutputChannelCount());

    /* BUG 16322: Do not create this for SP3, since it is currently broken.
       Consider fixing for v4.
    fprintf(fLocalText, "%u # output files\n", ps->GetOutputChannelCount());
    */

    //
    // Get the output channels and record each one
    //
    DryadOutputChannelDescription* outputs = ps->GetOutputChannels();
    for (UInt32 i=0; i<ps->GetOutputChannelCount(); ++i)
    {
        const char* uri = outputs[i].GetChannelURI();

        //
        // Check if uri is writing to DSC partition. 
        // If it is, redirect to local temp file to avoid writing to sealed stream
        // 
        DrStr uriMod("");
        if(ConcreteRChannel::IsDscPartition(uri))
        {
            uriMod.AppendF("file://hpcdscpt_redirect_%d.dtf", i);
            uri = uriMod.GetString();
        }

        fprintf(fOriginalText, "%s\n", uri);

        /* BUG 16322: Do not create this for SP3, since it is currently broken.
           Consider fixing for v4.
        fprintf(fLocalText, "%s\n", uri);
        */
    }

    //
    // Record number of arguments
    //
    fprintf(fOriginalText, "%u # arguments\n",
            commandBlock->GetArgumentCount());

    /* BUG 16322: Do not create this for SP3, since it is currently broken.
       Consider fixing for v4.
    fprintf(fLocalText, "%u # arguments\n", commandBlock->GetArgumentCount());
    */

    //
    // Foreach argument, record its value
    //
    for (UInt32 i=0; i<commandBlock->GetArgumentCount(); ++i)
    {
        DrStr64 arg = commandBlock->GetArgumentVector()[i];
        fprintf(fOriginalText, "%s\n", arg.GetString());

        /* BUG 16322: Do not create this for SP3, since it is currently broken.
           Consider fixing for v4.
        fprintf(fLocalText, "%s\n", arg.GetString());
        */
    }

    //
    // Close the remaining files.
    // fData was already closed right after its contents were written; closing it
    // again here would be a double fclose on the same stream.
    //
    fclose(fOriginalText);
    fclose(fOriginalRestart);

    /* BUG 16322: Do not create this for SP3, since it is currently broken.
       Consider fixing for v4.
    fclose(fLocalText);
    fclose(fLocalRestart);
    fclose(fCopyCommand);
    */
}
Пример #7
0
//
// Runs in new thread created and started by DVertexPnController::Start
// Executes vertex and terminates to report final status
//
unsigned DVertexPnController::ThreadFunc(void* arg)
{
    //
    // Copy what is needed out of the thread block, then free it immediately.
    // startCommand holds its own DrRef to the command block, so nothing below
    // depends on the thread block, and freeing it here guarantees it is released
    // on every return path (including the NULL-command early return).
    //
    DVertexPnControllerThreadBlock* threadBlock =
        (DVertexPnControllerThreadBlock *) arg;
    DVertexPnController* self = threadBlock->m_parent;
    DrRef<DVertexCommandBlock> startCommand = threadBlock->m_commandBlock;

    delete threadBlock;
    threadBlock = NULL;

    LogAssert(self != NULL,"Received NULL DVertexPnController pointer.");

    if(startCommand == NULL)
    {
        //
        // Nothing to run: report failure. The second argument of Terminate is an
        // exit code, so pass DrExitCode_Fail, matching every other Terminate call.
        //
        DrLogE("Received NULL DVertexCommandBlock pointer");
        self->Terminate(DrError_Fail, DrExitCode_Fail);
        return DrExitCode_Fail;
    }

    //
    // Execute dryad vertex - blocking
    //
    DrError vertexState =
        self->m_vertex->RunDryadVertex(startCommand->GetProcessStatus(),
                                       startCommand->GetArgumentCount(),
                                       startCommand->GetArgumentVector());

    //
    // If vertex completed, then success, otherwise failure
    //
    UInt32 exitCode;
    if (vertexState == DryadError_VertexCompleted)
    {
        exitCode = DrExitCode_OK;
    }
    else
    {
        exitCode = DrExitCode_Fail;
    }

    //
    // Enter critical section to turn off active vertex
    //
    {
        AutoCriticalSection acs(&(self->m_baseCS));
        LogAssert(self->m_activeVertex == true);
        self->m_activeVertex = false;
    }

    //
    // Report the final vertex state and exit code through Terminate
    //
    DrLogD( "About to terminate");
    self->Terminate(vertexState, exitCode);

    return DrExitCode_OK;
}
Пример #8
0
//
// called from dvertexmain.cpp
//
UInt32 DVertexPnControllerOuter::Run(int argc, char* argv[])
{
    //
    // Expected command line: <exe> <numVertices> then one <vertexId> <vertexVersion>
    // pair per vertex. Returns 1 if the arguments are malformed; otherwise creates
    // one controller per vertex, launches their command loops and sleeps forever.
    //

    //
    // Make sure there are at least two arguments
    //
    if (argc < 2)
    {
        DrLogE("No vertex arguments specified to the PN controller");
        return 1;
    }

    //
    // Get path and num vertices. The count is parsed into a signed value first so
    // that zero, unparsable and negative inputs are rejected here instead of being
    // reinterpreted as a huge unsigned count.
    //
    m_exePathName = argv[0];
    int requestedVertices = atoi(argv[1]);
    if (requestedVertices <= 0)
    {
        DrLogE("No vertices specified to the PN controller");
        return 1;
    }
    m_numberOfVertices = (UInt32) requestedVertices;

    //
    // If number of arguments isn't 2*numVertices + 2, then it doesn't make sense.
    // The expected count is computed in 64 bits so it cannot wrap around and
    // accidentally match a small argc.
    //
    UInt64 expectedArgCount = 2 + 2 * (UInt64) m_numberOfVertices;
    if ((UInt64) argc != expectedArgCount)
    {
        DrLogE( "%u vertices specified to the PN controller need "
                "%I64u not %d arguments to describe them",
                m_numberOfVertices, expectedArgCount, argc);
        return 1;
    }

    //
    // Set up array for controllers for each vertex
    //
    LogAssert(m_controllerArray == NULL);
    m_controllerArray = new DVertexPnController* [m_numberOfVertices];
    LogAssert(m_controllerArray != NULL);

    //
    // Critical section to update the number of active vertices
    //
    {
        AutoCriticalSection acs(&m_baseCS);
        m_assertCounter = 0;
        m_activeVertexCount = m_numberOfVertices;
    }

    //
    // Foreach vertex, get command line arguments and make a controller
    // 
    UInt32 i;
    for (i=0; i<m_numberOfVertices; ++i)
    {
        //
        // cmdline has each vertex info in for <vertexID, vertexVersion>
        //
        UInt32 vertexId = atoi(argv[2 + i*2]);
        UInt32 vertexVersion = atoi(argv[2 + i*2 + 1]);

        //
        // Make a new controller for each vertex
        //
        m_controllerArray[i] = MakePnController(vertexId, vertexVersion);
    }

    DrLogging::SetAssertCallback(AssertCallback, this);

    //
    // foreach vertex, launch the command loop
    // todo: code cleanup: any reason not to launch command loop as soon as it's created?
    //
    for (i=0; i<m_numberOfVertices; ++i)
    {
        m_controllerArray[i]->LaunchCommandLoop();
    }

    //
    // Sleep forever - commandloop will take down process when instructed to do so
    //
    ::Sleep(INFINITE);

    return 0;
}
Пример #9
0
//
// Initialize the XCompute layer
//
DrError DryadInitializeXCompute(const char* netLibName, const char* iniFileName,
                              int argc, char* argv[], int* pNOpts)
{
    //
    // Parameters:
    //   netLibName  - component name handed to XcInitialize; if NULL a name of the
    //                 form "DryadApplication.<pid>" is generated
    //   iniFileName - config file name passed through to XcInitialize
    //   argc, argv  - not read by this function
    //   pNOpts      - set to 0 on entry
    //
    // Returns DrError_OK on success, DryadError_AlreadyInitialized on a repeated
    // call, or the error of the first step that failed.
    //
    DrError err = DrError_OK; 
    *pNOpts = 0;

    //
    // Only initialize XCompute Once
    //
    if (g_initialized)
    {
        return DryadError_AlreadyInitialized;
    }

    g_initialized = true;

    // todo: why is this code commented / should it be removed?
/* JC
    if (iniFileName == NULL) {
        iniFileName = s_configFileName;
    }
*/    
    
    //
    // If library name not set, set it to DryadApplication.X 
    // where X = process id
    // otherwise, copy name into "component" string
    //
    char component[32];
    if (netLibName == NULL)
    {
        err = ::StringCbPrintfA(component,
                               sizeof(component),
                               "DryadApplication.%u",
                               ::GetCurrentProcessId());
        LogAssert(SUCCEEDED(err));
        netLibName = component;
    }
    else
    {
        //
        // Bounded copy: a caller-supplied name that does not fit in the local
        // buffer is rejected instead of being written past the end of it.
        //
        err = ::StringCbCopyA(component, sizeof(component), netLibName);
        if (!SUCCEEDED(err))
        {
            DrLogE( "DryadConfigurationManager - component name '%s' does not fit in %u bytes",
                netLibName, (unsigned) sizeof(component));
            err = DrError_Fail;
            goto exit;
        }
    }

    //
    // Initialize Xcompute, providing this semi-unique 
    // component name and any config file specified
    //
    err = XcInitialize(iniFileName, component);
    if (err != DrError_OK)
    {
        //
        // If initialization fails, report and exit
        //
        DrLogE( "DryadConfigurationManager - XcInitialize failed, error=%s",
            DRERRORSTRING(err));
        goto exit;
    }
    
    //
    // Open an Xcompute session, using default session settings
    //
    err = XcOpenSession(NULL, &s_session, NULL);
    if (err != DrError_OK)
    {
        //
        // If opening session failed, report and exit
        //
        DrLogE( "DryadConfigurationManager - XcOpenSession failed, error=%s",
            DRERRORSTRING(err));
        goto exit;
    }
    
    //
    // Get handle to session process
    //
    err = XcOpenCurrentProcessHandle(s_session, &s_processHandle);
    if (err == DrError_UnknownProcess && s_processHandle == INVALID_XCPROCESSHANDLE)
    {
        // todo: remove commented code
//JC        //Job manager not running under a CN will have to do the following initialization
//JC        CreateGlobalJob();

        //
        // If process cannot be found, report and exit
        //
        DrLogE( "DryadConfigurationManager - XcOpenCurrentProcessHandle failed (because not running on a compute node?), error=%s",
            DRERRORSTRING(err));
        goto exit;
    }
    else if(err != DrError_OK)
    {
        //
        // If failure other than unknown process, report and exit
        //
        DrLogE( "DryadConfigurationManager - XcOpenCurrentProcessHandle failed, error=%s",
            DRERRORSTRING(err));
        goto exit;
    }

    //
    // Initialize the dryad metadata and start listening for IO completion events
    //
    DryadInitialize();
    err = DrError_OK;

exit:

    return err;
}
//
// Run in thread for each vertex
//
unsigned DVertexXComputePnController::CommandLoop()
{
    DrError err;
    UInt32 retries = 0;

    //
    // Get the vertex label
    //
    DrStr64 label;
    DVertexCommandBlock::GetPnPropertyLabel(&label,
                                            m_vertexId,
                                            m_vertexVersion);

    //
    // Wait for communication until error
    //
    do
    {
        //
        // Create request to get vertex version property
        //
        XC_SETANDGETPROCESSINFO_REQINPUT request;
        memset(&request, 0, sizeof(request));
        request.Size = sizeof(request);
        request.pBlockOnPropertyLabel = label.GetString();
        request.BlockOnPropertyversionLastSeen = m_currentCommandVersion;
        request.MaxBlockTime = XCTIMEINTERVAL_MINUTE;
        // XXXX
        request.pPropertyFetchTemplate = (char *) label.GetString();

        //
        // Send the request and check for errors
        //
        PXC_SETANDGETPROCESSINFO_REQRESULTS pResults = NULL;
        err = XcSetAndGetProcessInfo(NULL,//GetProcessHandle(),
                                     &request,
                                     &pResults,
                                     NULL);
        if (err == DrError_OK)
        {
            //
            // If request successfully sent, store process status and exit code
            //
            DrLogI( "Got command property");
            retries = 0;
            DrError processStatus = pResults->pProcessInfo->ProcessStatus;
            DrExitCode exitCode = pResults->pProcessInfo->ExitCode;

            if (processStatus == DrError_OK || exitCode != DrExitCode_StillActive)
            {
                //
                // If the PN thinks we have exited, so better make it so
                //
                err = DrError_Fail;
            }
        }

        //
        // If request was successful and other process doesn't think we're done
        //
        if (err == DrError_OK)
        {
            if (pResults->pProcessInfo->NumberofProcessProperties != 0)
            {
                //
                // Make sure there's only one property and it's the version
                //
                LogAssert(pResults->pProcessInfo->
                          NumberofProcessProperties == 1);
                PXC_PROCESSPROPERTY_INFO property =
                    pResults->pProcessInfo->ppProperties[0];
                LogAssert(::strcmp(property->pPropertyLabel, label) == 0);

                //
                // Update vertex version
                //
                UInt64 newVersion = property->PropertyVersion;
                if (newVersion < m_currentCommandVersion)
                {
                    //
                    // If vertex version is less than the current version, fail (logic error)
                    //
                    DrLogE(
                        "Property version went back in time. Property %s old version %I64u new version %I64u",
                        label.GetString(),
                        m_currentCommandVersion, newVersion);
                    err = DrError_ProcessPropertyVersionMismatch;
                }
                else if (newVersion == m_currentCommandVersion)
                {
                    //
                    // If version the same, report version the same
                    //
                    DrLogI(
                        "Command timeout with same version. Property %s version %I64u",
                        label.GetString(), m_currentCommandVersion);
                }
                else if (newVersion > m_currentCommandVersion)
                {
                    //
                    // If new vertex version, let GM know what process is handling it
                    //
                    DrLogI(
                        "Property got new version. Property %s old version %I64u new version %I64u",
                        label.GetString(),
                        m_currentCommandVersion, newVersion);

                    m_currentCommandVersion = newVersion;

                    DrRef<DVertexCommandBlock> newCommand;
                    newCommand.Attach(new DVertexCommandBlock());

                    DrRef<DryadXComputePnProcessPropertyResponse> response;
                    response.Attach(new DryadXComputePnProcessPropertyResponse(pResults->pProcessInfo));

                    //
                    // Get new vertex command
                    //
                    err = newCommand->ReadFromResponseMessage(response, m_vertexId, m_vertexVersion);

                    //
                    // If no errors in getting command, act on it. Log any failures below
                    //
                    if (err == DrError_OK)
                    {
                        err = ActOnCommand(newCommand);
                    }
                }
            }
        }
        else
        {
            //
            // Log error and continue
            //
            DrLogE( "XcSetAndGetProcessInfo got error: %s", DRERRORSTRING(err));
        }

        //
        // If the error is related to disconnection, retry up to 4 times
        //
        if (err == DrError_RemoteDisconnected ||
                err == DrError_LocalDisconnected ||
                err == DrError_ConnectionFailed ||
                err == DrError_ResponseDisconnect)
        {
            ++retries;
            // todo: move 4 to global
            if (retries < 4)
            {
                DrLogW( "Retrying get");
                err = DrError_OK;
            }
        }

        //
        // If result was allocated, free it before next iteration
        //
        if (pResults != NULL)
        {
            XCERROR freeError = XcFreeMemory(pResults);
            LogAssert(freeError == DrError_OK);
        }
    } while (err == DrError_OK);

    //
    // Close this controller and take no more requests
    //
    DrLogD( "About to terminate");
    Terminate(err, DrExitCode_Fail);

    //
    // Sleep forever, waiting for verticies to complete and take down the process
    //
    Sleep(INFINITE);

    return 0;
}