Exemplo n.º 1
0
//
// Wire this edge up with a fifo channel: build a unique fifo URI, open a
// reader and a writer on it, and start both ends.
//
// fifoLength is embedded in the URI; workQueue is handed to the reader.
// Asserts if the edge already has a reader or writer, or if either open
// reports an error.
//
void DryadSubGraphVertex::EdgeInfo::
    MakeFifo(UInt32 fifoLength, WorkQueue* workQueue)
{
    //
    // An edge may only be connected once
    //
    LogAssert(m_reader == NULL);
    LogAssert(m_writer == NULL);

    //
    // The URI carries the fifo length, a factory-issued unique id, and
    // both endpoints of the edge as vertex.port
    //
    const UInt32 fifoId = RChannelFactory::GetUniqueFifoId();

    DrStr64 fifoUri;
    fifoUri.SetF("fifo://%u/internal-%u-%u.%u--%u.%u",
                 fifoLength, fifoId,
                 m_sourceVertex, m_sourcePort,
                 m_destinationVertex, m_destinationPort);

    //
    // Open the read side first, then the write side, checking the shared
    // error reporter after each open
    //
    DVErrorReporter reporter;

    RChannelFactory::OpenReader(fifoUri, NULL, NULL, 1, NULL, 0, 0,
                                workQueue, &reporter, &m_reader, NULL);
    LogAssert(reporter.NoError());

    RChannelFactory::OpenWriter(fifoUri, NULL, NULL, 1, NULL, 0, NULL,
                                &reporter, &m_writer);
    LogAssert(reporter.NoError());

    //
    // Start both ends of the channel
    //
    m_reader->GetReader()->Start(NULL);
    m_writer->GetWriter()->Start();
}
Exemplo n.º 2
0
        /**
         * Called at the start of InitBootstrapConfiguration, this method is a last chance for a subclass to mess with the bootstrap configuration before it is used.
         *
         * On entry, bootstrapConfigPathname is the fully qualified name of the bootstrap file.
         *                bootstrapConfiguration is the raw (no macro expansion or override collapsing) configuration, or NULL if the configuration was not found.
         *
         * On exit, bootstrapConfiguration is the final raw (no macro expansion or override collapsing) bootstrap configuration. If NULL, initialization will fail.
         *
         * The default implementation does nothing.
         *
         * Returns false if initialization should fail.
         */
        virtual bool PreprocessBootstrapConfiguration(const char *bootstrapConfigPathname, Ptr<const IConfiguration>& bootstrapConfiguration)
        {
            if (bootstrapConfiguration == NULL) {
                // The bootstrap config file is missing -- build a default one
                DrStr64 strDataDirLocation;
                DrStr32 strRelDataDirLocation;
                strRelDataDirLocation.SetF(".\\DataDir.%u",  GetCurrentProcessId());
                DrError err = DrCanonicalizeFilePath(strDataDirLocation, strRelDataDirLocation);
                if (err != DrError_OK) {
                    DrLogE( "DryadConfigurationManager",
                        "Failed to canonicalize data directory name %s error=%s",
                        strRelDataDirLocation.GetString(), DRERRORSTRING(err));
                    return false;
                }
                
                Ptr<IMutableConfiguration> cfg = Configuration::GenerateDefaultBootstrapConfig(
                    strDataDirLocation.GetString(),
                    "...",
                    "default",
                    NULL);
                if (cfg == NULL) {
                    DrLogE( "DryadConfigurationManager",
                        "Failed to create default bootstrap file");
                    return false;
                }

                bootstrapConfiguration = cfg;
            }
            
            return true;
        }
Exemplo n.º 3
0
//
// Create files which contain information used to restart the upcoming vertex command.
// Used for post-mortem debugging.
//
// Files written (<id> and <ver> are the vertex id and vertex instance version taken
// from the command block's process status):
//   vertex-<id>-<ver>-rerun-data.dat          serialized command block
//   vertex-<id>-<ver>-rerun-originalInfo.txt  input URIs, output URIs and arguments
//   vertex-<id>-<ver>-rerun.cmd               command line that replays the vertex
//
// Output URIs that ConcreteRChannel::IsDscPartition() recognizes are replaced in the
// info file by "file://hpcdscpt_redirect_<index>.dtf".
//
// BUG 16322: the "local" rerun artifacts (vertex-<id>-<ver>-rerun-localInfo.txt,
// vertex-<id>-<ver>-rerun-local-inputs.cmd and vertex-<id>-<ver>-rerun-fetch-inputs.cmd)
// are not created for SP3, since they are currently broken. Consider fixing for v4.
//
// This is best effort: every failure is logged and nothing is reported to the caller.
// A serialization failure or a file that cannot be opened abandons the dump; a short
// write of the data file is logged and the remaining files are still written.
//
void DVertexPnController::DumpRestartCommand(DVertexCommandBlock* commandBlock)
{
    DrError err;

    //
    // Create temporary buffer
    //
    DrRef<DrSimpleHeapBuffer> buf;
    buf.Attach(new DrSimpleHeapBuffer());

    //
    // Write command block into buffer. The writer lives in its own scope so that it
    // is destroyed before the buffer is read back.
    //
    {
        DrMemoryBufferWriter writer(buf);
        err = commandBlock->Serialize(&writer);
    }

    //
    // If write fails, log failure and return
    //
    if (err != DrError_OK)
    {
        DrLogE("Can't serialize command block for restart --- %s",
            DRERRORSTRING(err));
        return;
    }

    //
    // Get data reference and byte count
    //
    const void* serializedData;
    Size_t availableToRead;
    serializedData = buf->GetReadAddress(0, &availableToRead);
    LogAssert(availableToRead >= buf->GetAvailableSize());
    const size_t bytesToWrite = static_cast<size_t>(buf->GetAvailableSize());

    //
    // Get the process information
    //
    DVertexProcessStatus* ps = commandBlock->GetProcessStatus();

    //
    // Build file for data required for rerun, open it
    //
    DrStr64 restartBlockName;
    restartBlockName.SetF("vertex-%u-%u-rerun-data.dat",
                          ps->GetVertexId(), ps->GetVertexInstanceVersion());
    FILE* fData = fopen(restartBlockName.GetString(), "wb");
    if (fData == NULL)
    {
        //
        // If failed to open file, log and return
        //
        err = DrGetLastError();
        DrLogE(
            "Can't open re-run command block file '%s' --- %s",
            restartBlockName.GetString(), DRERRORSTRING(err));
        return;
    }

    //
    // Build file for original information required for rerun, open it
    //
    DrStr64 originalInfoName;
    originalInfoName.SetF("vertex-%u-%u-rerun-originalInfo.txt",
                          ps->GetVertexId(), ps->GetVertexInstanceVersion());
    FILE* fOriginalText = fopen(originalInfoName.GetString(), "w");
    if (fOriginalText == NULL)
    {
        //
        // If failed to open file, log and return
        //
        err = DrGetLastError();
        DrLogE(
            "Can't open re-run original info file '%s' --- %s",
            originalInfoName.GetString(), DRERRORSTRING(err));

        //
        // Close data file
        //
        fclose(fData);
        return;
    }

    //
    // Build file for rerun command line, open it
    //
    DrStr64 originalRestartCommand;
    originalRestartCommand.SetF("vertex-%u-%u-rerun.cmd",
                                ps->GetVertexId(),
                                ps->GetVertexInstanceVersion());
    FILE* fOriginalRestart = fopen(originalRestartCommand.GetString(), "w");
    if (fOriginalRestart == NULL)
    {
        //
        // If failed to open file, log and return
        //
        err = DrGetLastError();
        DrLogE(
            "Can't open re-run original command file '%s' --- %s",
            originalRestartCommand.GetString(), DRERRORSTRING(err));

        //
        // Close data and original text files
        //
        fclose(fData);
        fclose(fOriginalText);
        return;
    }

    //
    // Write out data to data file, then close it.
    // The error code is fetched before fclose so that closing the stream cannot
    // overwrite it, and the stream is closed exactly once: closing an already
    // closed FILE* is undefined behavior.
    //
    size_t written = fwrite(serializedData, 1, bytesToWrite, fData);
    if (written != bytesToWrite)
    {
        //
        // If failed to write all the data, log failure and carry on with the
        // remaining files
        //
        err = DrGetLastError();
        DrLogE(
            "Failed to write re-run command block file '%s': only %Iu of %Iu bytes written --- %s",
            restartBlockName.GetString(),
            written, bytesToWrite,
            DRERRORSTRING(err));
    }
    fclose(fData);
    fData = NULL;

    //
    // Write original restart command
    //
    fprintf(fOriginalRestart,
            "%s --cmd -dump %s -overridetext %s\n",
            m_parent->GetRunningExePathName(),
            restartBlockName.GetString(),
            originalInfoName.GetString());

    //
    // Record number of input files
    //
    fprintf(fOriginalText, "%u # input files\n", ps->GetInputChannelCount());

    //
    // Get the input channels and record the URI of each one
    //
    DryadInputChannelDescription* inputs = ps->GetInputChannels();
    for (UInt32 i=0; i<ps->GetInputChannelCount(); ++i)
    {
        const char* uri = inputs[i].GetChannelURI();
        fprintf(fOriginalText, "%s\n", uri);
    }

    //
    // Record number of output files
    //
    fprintf(fOriginalText, "%u # output files\n", ps->GetOutputChannelCount());

    //
    // Get the output channels and record each one
    //
    DryadOutputChannelDescription* outputs = ps->GetOutputChannels();
    for (UInt32 i=0; i<ps->GetOutputChannelCount(); ++i)
    {
        const char* uri = outputs[i].GetChannelURI();

        //
        // Check if uri is writing to DSC partition.
        // If it is, redirect to local temp file to avoid writing to sealed stream
        //
        DrStr uriMod("");
        if(ConcreteRChannel::IsDscPartition(uri))
        {
            // i is unsigned, so format it with %u
            uriMod.AppendF("file://hpcdscpt_redirect_%u.dtf", i);
            uri = uriMod.GetString();
        }

        fprintf(fOriginalText, "%s\n", uri);
    }

    //
    // Record number of arguments
    //
    fprintf(fOriginalText, "%u # arguments\n",
            commandBlock->GetArgumentCount());

    //
    // Foreach argument, record its value
    //
    for (UInt32 i=0; i<commandBlock->GetArgumentCount(); ++i)
    {
        DrStr64 arg = commandBlock->GetArgumentVector()[i];
        fprintf(fOriginalText, "%s\n", arg.GetString());
    }

    //
    // Close the remaining files (fData was already closed above)
    //
    fclose(fOriginalText);
    fclose(fOriginalRestart);
}
//
// Run in thread for each vertex
//
unsigned DVertexXComputePnController::CommandLoop()
{
    DrError err;
    UInt32 retries = 0;

    //
    // Get the vertex label
    //
    DrStr64 label;
    DVertexCommandBlock::GetPnPropertyLabel(&label,
                                            m_vertexId,
                                            m_vertexVersion);

    //
    // Wait for communication until error
    //
    do
    {
        //
        // Create request to get vertex version property
        //
        XC_SETANDGETPROCESSINFO_REQINPUT request;
        memset(&request, 0, sizeof(request));
        request.Size = sizeof(request);
        request.pBlockOnPropertyLabel = label.GetString();
        request.BlockOnPropertyversionLastSeen = m_currentCommandVersion;
        request.MaxBlockTime = XCTIMEINTERVAL_MINUTE;
        // XXXX
        request.pPropertyFetchTemplate = (char *) label.GetString();

        //
        // Send the request and check for errors
        //
        PXC_SETANDGETPROCESSINFO_REQRESULTS pResults = NULL;
        err = XcSetAndGetProcessInfo(NULL,//GetProcessHandle(),
                                     &request,
                                     &pResults,
                                     NULL);
        if (err == DrError_OK)
        {
            //
            // If request successfully sent, store process status and exit code
            //
            DrLogI( "Got command property");
            retries = 0;
            DrError processStatus = pResults->pProcessInfo->ProcessStatus;
            DrExitCode exitCode = pResults->pProcessInfo->ExitCode;

            if (processStatus == DrError_OK || exitCode != DrExitCode_StillActive)
            {
                //
                // If the PN thinks we have exited, so better make it so
                //
                err = DrError_Fail;
            }
        }

        //
        // If request was successful and other process doesn't think we're done
        //
        if (err == DrError_OK)
        {
            if (pResults->pProcessInfo->NumberofProcessProperties != 0)
            {
                //
                // Make sure there's only one property and it's the version
                //
                LogAssert(pResults->pProcessInfo->
                          NumberofProcessProperties == 1);
                PXC_PROCESSPROPERTY_INFO property =
                    pResults->pProcessInfo->ppProperties[0];
                LogAssert(::strcmp(property->pPropertyLabel, label) == 0);

                //
                // Update vertex version
                //
                UInt64 newVersion = property->PropertyVersion;
                if (newVersion < m_currentCommandVersion)
                {
                    //
                    // If vertex version is less than the current version, fail (logic error)
                    //
                    DrLogE(
                        "Property version went back in time. Property %s old version %I64u new version %I64u",
                        label.GetString(),
                        m_currentCommandVersion, newVersion);
                    err = DrError_ProcessPropertyVersionMismatch;
                }
                else if (newVersion == m_currentCommandVersion)
                {
                    //
                    // If version the same, report version the same
                    //
                    DrLogI(
                        "Command timeout with same version. Property %s version %I64u",
                        label.GetString(), m_currentCommandVersion);
                }
                else if (newVersion > m_currentCommandVersion)
                {
                    //
                    // If new vertex version, let GM know what process is handling it
                    //
                    DrLogI(
                        "Property got new version. Property %s old version %I64u new version %I64u",
                        label.GetString(),
                        m_currentCommandVersion, newVersion);

                    m_currentCommandVersion = newVersion;

                    DrRef<DVertexCommandBlock> newCommand;
                    newCommand.Attach(new DVertexCommandBlock());

                    DrRef<DryadXComputePnProcessPropertyResponse> response;
                    response.Attach(new DryadXComputePnProcessPropertyResponse(pResults->pProcessInfo));

                    //
                    // Get new vertex command
                    //
                    err = newCommand->ReadFromResponseMessage(response, m_vertexId, m_vertexVersion);

                    //
                    // If no errors in getting command, act on it. Log any failures below
                    //
                    if (err == DrError_OK)
                    {
                        err = ActOnCommand(newCommand);
                    }
                }
            }
        }
        else
        {
            //
            // Log error and continue
            //
            DrLogE( "XcSetAndGetProcessInfo got error: %s", DRERRORSTRING(err));
        }

        //
        // If the error is related to disconnection, retry up to 4 times
        //
        if (err == DrError_RemoteDisconnected ||
                err == DrError_LocalDisconnected ||
                err == DrError_ConnectionFailed ||
                err == DrError_ResponseDisconnect)
        {
            ++retries;
            // todo: move 4 to global
            if (retries < 4)
            {
                DrLogW( "Retrying get");
                err = DrError_OK;
            }
        }

        //
        // If result was allocated, free it before next iteration
        //
        if (pResults != NULL)
        {
            XCERROR freeError = XcFreeMemory(pResults);
            LogAssert(freeError == DrError_OK);
        }
    } while (err == DrError_OK);

    //
    // Close this controller and take no more requests
    //
    DrLogD( "About to terminate");
    Terminate(err, DrExitCode_Fail);

    //
    // Sleep forever, waiting for verticies to complete and take down the process
    //
    Sleep(INFINITE);

    return 0;
}