//
// Open an internal fifo:// channel for this edge and start both of its ends.
//
// fifoLength is embedded in the channel URI; workQueue is passed to the
// reader when it is opened. The edge must not already own a reader or a
// writer (asserted).
//
void DryadSubGraphVertex::EdgeInfo::MakeFifo(UInt32 fifoLength,
                                             WorkQueue* workQueue)
{
    //
    // Both ends must still be unset
    //
    LogAssert(m_reader == NULL);
    LogAssert(m_writer == NULL);

    //
    // Compose the channel URI from the id handed out by the channel factory
    // plus the source and destination (vertex, port) pairs of this edge
    //
    const UInt32 fifoId = RChannelFactory::GetUniqueFifoId();

    DrStr64 channelUri;
    channelUri.SetF("fifo://%u/internal-%u-%u.%u--%u.%u",
                    fifoLength,
                    fifoId,
                    m_sourceVertex,
                    m_sourcePort,
                    m_destinationVertex,
                    m_destinationPort);

    //
    // Open the reader first, then the writer, on the same URI. The same
    // reporter is reused and must be error free after each open.
    //
    DVErrorReporter reporter;

    RChannelFactory::OpenReader(channelUri, NULL, NULL, 1, NULL, 0, 0,
                                workQueue, &reporter, &m_reader, NULL);
    LogAssert(reporter.NoError());

    RChannelFactory::OpenWriter(channelUri, NULL, NULL, 1, NULL, 0, NULL,
                                &reporter, &m_writer);
    LogAssert(reporter.NoError());

    //
    // Start the reader before the writer
    //
    m_reader->GetReader()->Start(NULL);
    m_writer->GetWriter()->Start();
}
/** * Called at the start of InitBootstrapConfiguration, this method is a last chance for a subclass to mess with the bootstrap configuration before it is used. * * On entry, bootstrapConfigPathname is the fully qualified name of the bootstrap file. * bootstrapConfiguration is the raw (no macro expansion or override collapsing) configuration, or NULL if the configuration was not found. * * On exit, bootstrapConfiguration is the final raw (no macro expansion or override collapsing) bootstrap configuration. If NULL, initialization will fail. * * The default implementation does nothing. * * Returns false if initialization should fail. */ virtual bool PreprocessBootstrapConfiguration(const char *bootstrapConfigPathname, Ptr<const IConfiguration>& bootstrapConfiguration) { if (bootstrapConfiguration == NULL) { // The bootstrap config file is missing -- build a default one DrStr64 strDataDirLocation; DrStr32 strRelDataDirLocation; strRelDataDirLocation.SetF(".\\DataDir.%u", GetCurrentProcessId()); DrError err = DrCanonicalizeFilePath(strDataDirLocation, strRelDataDirLocation); if (err != DrError_OK) { DrLogE( "DryadConfigurationManager", "Failed to canonicalize data directory name %s error=%s", strRelDataDirLocation.GetString(), DRERRORSTRING(err)); return false; } Ptr<IMutableConfiguration> cfg = Configuration::GenerateDefaultBootstrapConfig( strDataDirLocation.GetString(), "...", "default", NULL); if (cfg == NULL) { DrLogE( "DryadConfigurationManager", "Failed to create default bootstrap file"); return false; } bootstrapConfiguration = cfg; } return true; }
// // Create files which contain information used to restart the upcoming vertex command // Used for post-mortem debugging. // void DVertexPnController::DumpRestartCommand(DVertexCommandBlock* commandBlock) { DrError err; // // Create temporary buffer // DrRef<DrSimpleHeapBuffer> buf; buf.Attach(new DrSimpleHeapBuffer()); // // Write command block into buffer // { DrMemoryBufferWriter writer(buf); err = commandBlock->Serialize(&writer); } // // If write fails, log failure and return // if (err != DrError_OK) { DrLogE("Can't serialize command block for restart --- %s", DRERRORSTRING(err)); return; } // // Get data reference and byte count // const void* serializedData; Size_t availableToRead; serializedData = buf->GetReadAddress(0, &availableToRead); LogAssert(availableToRead >= buf->GetAvailableSize()); // // Get the process information // DVertexProcessStatus* ps = commandBlock->GetProcessStatus(); // // Build file for data required for rerun, open it // DrStr64 restartBlockName; restartBlockName.SetF("vertex-%u-%u-rerun-data.dat", ps->GetVertexId(), ps->GetVertexInstanceVersion()); FILE* fData = fopen(restartBlockName, "wb"); if (fData == NULL) { // // If failed to open file, log and return // err = DrGetLastError(); DrLogE( "Can't open re-run command block file '%s' --- %s", restartBlockName.GetString(), DRERRORSTRING(err)); return; } // // Build file for original information required for rerun, open it // DrStr64 originalInfoName; originalInfoName.SetF("vertex-%u-%u-rerun-originalInfo.txt", ps->GetVertexId(), ps->GetVertexInstanceVersion()); FILE* fOriginalText = fopen(originalInfoName, "w"); if (fOriginalText == NULL) { // // If failed to open file, log and return // err = DrGetLastError(); DrLogE( "Can't open re-run original info file '%s' --- %s", originalInfoName.GetString(), DRERRORSTRING(err)); // // Close data file // fclose(fData); return; } // // Build file for rerun command line, open it // DrStr64 originalRestartCommand; 
originalRestartCommand.SetF("vertex-%u-%u-rerun.cmd", ps->GetVertexId(), ps->GetVertexInstanceVersion()); FILE* fOriginalRestart = fopen(originalRestartCommand, "w"); if (fOriginalRestart == NULL) { // // If failed to open file, log and return // err = DrGetLastError(); DrLogE( "Can't open re-run original command file '%s' --- %s", originalRestartCommand.GetString(), DRERRORSTRING(err)); // // Close data and original text files // fclose(fData); fclose(fOriginalText); return; } // // Open file for local info // /* BUG 16322: Do not create this for SP3, since it is currently broken. Consider fixing for v4. DrStr64 localInfoName; localInfoName.SetF("vertex-%u-%u-rerun-localInfo.txt", ps->GetVertexId(), ps->GetVertexInstanceVersion()); FILE* fLocalText = fopen(localInfoName, "w"); if (fLocalText == NULL) { // // If failed to open file, log and return // err = DrGetLastError(); DrLogE( "Can't open re-run local info file '%s' --- %s", localInfoName.GetString(), DRERRORSTRING(err)); // // Close data, cmd, and original text files // fclose(fData); fclose(fOriginalText); fclose(fOriginalRestart); return; } */ // // Open file for rerun with local inputs // /* BUG 16322: Do not create this for SP3, since it is currently broken. Consider fixing for v4. DrStr64 localRestartCommand; localRestartCommand.SetF("vertex-%u-%u-rerun-local-inputs.cmd", ps->GetVertexId(), ps->GetVertexInstanceVersion()); FILE* fLocalRestart = fopen(localRestartCommand, "w"); if (fLocalRestart == NULL) { // // If failed to open file, log and return // err = DrGetLastError(); DrLogE( "Can't open re-run local command file '%s' --- %s", localRestartCommand.GetString(), DRERRORSTRING(err)); // // Close data, cmd, original, and local text files // fclose(fData); fclose(fOriginalText); fclose(fOriginalRestart); fclose(fLocalText); return; } */ // // Open file for fetching inputs // /* BUG 16322: Do not create this for SP3, since it is currently broken. Consider fixing for v4. 
DrStr64 copyCommand; copyCommand.SetF("vertex-%u-%u-rerun-fetch-inputs.cmd", ps->GetVertexId(), ps->GetVertexInstanceVersion()); FILE* fCopyCommand = fopen(copyCommand, "w"); if (fCopyCommand == NULL) { // // If failed to open file, log and return // err = DrGetLastError(); DrLogE( "Can't open re-run copy command file '%s' --- %s", localRestartCommand.GetString(), DRERRORSTRING(err)); // // Close data, original and localcmd, and original and local text files // fclose(fData); fclose(fOriginalText); fclose(fOriginalRestart); fclose(fLocalText); fclose(fLocalRestart); return; } */ // // Write out data to data file, then close it. // size_t written = fwrite(serializedData, 1, buf->GetAvailableSize(), fData); fclose(fData); if (written != buf->GetAvailableSize()) { // // If failed to write all the data, log failure // err = DrGetLastError(); DrLogE( "Failed to write re-run command block file '%s': only %Iu of %Iu bytes written --- %s", restartBlockName.GetString(), written, (size_t) (buf->GetAvailableSize()), DRERRORSTRING(err)); } // // Write original restart command // fprintf(fOriginalRestart, "%s --cmd -dump %s -overridetext %s\n", m_parent->GetRunningExePathName(), restartBlockName.GetString(), originalInfoName.GetString()); // // Write local restart command // /* BUG 16322: Do not create this for SP3, since it is currently broken. Consider fixing for v4. fprintf(fLocalRestart, "%s --vertex --cmd -dump %s -overridetext %s\n", m_parent->GetRunningExePathName(), restartBlockName.GetString(), localInfoName.GetString()); */ // // Record number of input files // fprintf(fOriginalText, "%u # input files\n", ps->GetInputChannelCount()); /* BUG 16322: Do not create this for SP3, since it is currently broken. Consider fixing for v4. 
fprintf(fLocalText, "%u # input files\n", ps->GetInputChannelCount()); */ // // Get the input channels and foreach channel, add copy command to copy script // DryadInputChannelDescription* inputs = ps->GetInputChannels(); for (UInt32 i=0; i<ps->GetInputChannelCount(); ++i) { const char* uri = inputs[i].GetChannelURI(); /* BUG 16322: Do not create this for SP3, since it is currently broken. Consider fixing for v4. if (::_strnicmp(uri, "file://", 7) == 0) { // // If reading from file, copy command doesn't want "file://" prefix // todo: remove reference to cosmos // fprintf(fCopyCommand, "cosmos.exe copy %s v%u.%u-i%u\n", uri+7, ps->GetVertexId(), ps->GetVertexInstanceVersion(), i); } else if (::_strnicmp(uri, "cosmos://", 9) == 0) { // // If reading from cosmos path, copy directly // todo: remove cosmos code // fprintf(fCopyCommand, "cosmos.exe copy %s v%u.%u-i%u\n", uri, ps->GetVertexId(), ps->GetVertexInstanceVersion(), i); } else { // // Otherwise, unable to copy // fprintf(fCopyCommand, "echo can't copy URI %s to v%u.%u-i%u\n", uri, ps->GetVertexId(), ps->GetVertexInstanceVersion(), i); } */ // // At reference to this URI to original and relative reference to local // fprintf(fOriginalText, "%s\n", uri); /* BUG 16322: Do not create this for SP3, since it is currently broken. Consider fixing for v4. fprintf(fLocalText, "file://v%u.%u-i%u\n", ps->GetVertexId(), ps->GetVertexInstanceVersion(), i); */ } // // Record number of output files // fprintf(fOriginalText, "%u # output files\n", ps->GetOutputChannelCount()); /* BUG 16322: Do not create this for SP3, since it is currently broken. Consider fixing for v4. fprintf(fLocalText, "%u # output files\n", ps->GetOutputChannelCount()); */ // // Get the output channels and record each one // DryadOutputChannelDescription* outputs = ps->GetOutputChannels(); for (UInt32 i=0; i<ps->GetOutputChannelCount(); ++i) { const char* uri = outputs[i].GetChannelURI(); // // Check if uri is writting to DSC partition. 
// If it is, redirect to local temp file to avoid writing to sealed stream // DrStr uriMod(""); if(ConcreteRChannel::IsDscPartition(uri)) { uriMod.AppendF("file://hpcdscpt_redirect_%d.dtf", i); uri = uriMod.GetString(); } fprintf(fOriginalText, "%s\n", uri); /* BUG 16322: Do not create this for SP3, since it is currently broken. Consider fixing for v4. fprintf(fLocalText, "%s\n", uri); */ } // // Record number of arguments // fprintf(fOriginalText, "%u # arguments\n", commandBlock->GetArgumentCount()); /* BUG 16322: Do not create this for SP3, since it is currently broken. Consider fixing for v4. fprintf(fLocalText, "%u # arguments\n", commandBlock->GetArgumentCount()); */ // // Foreach argument, record its value // for (UInt32 i=0; i<commandBlock->GetArgumentCount(); ++i) { DrStr64 arg = commandBlock->GetArgumentVector()[i]; fprintf(fOriginalText, "%s\n", arg.GetString()); /* BUG 16322: Do not create this for SP3, since it is currently broken. Consider fixing for v4. fprintf(fLocalText, "%s\n", arg.GetString()); */ } // // Close all files // todo: fData closed above, remove duplicate // fclose(fData); fclose(fOriginalText); fclose(fOriginalRestart); /* BUG 16322: Do not create this for SP3, since it is currently broken. Consider fixing for v4. fclose(fLocalText); fclose(fLocalRestart); fclose(fCopyCommand); */ }
// // Run in thread for each vertex // unsigned DVertexXComputePnController::CommandLoop() { DrError err; UInt32 retries = 0; // // Get the vertex label // DrStr64 label; DVertexCommandBlock::GetPnPropertyLabel(&label, m_vertexId, m_vertexVersion); // // Wait for communication until error // do { // // Create request to get vertex version property // XC_SETANDGETPROCESSINFO_REQINPUT request; memset(&request, 0, sizeof(request)); request.Size = sizeof(request); request.pBlockOnPropertyLabel = label.GetString(); request.BlockOnPropertyversionLastSeen = m_currentCommandVersion; request.MaxBlockTime = XCTIMEINTERVAL_MINUTE; // XXXX request.pPropertyFetchTemplate = (char *) label.GetString(); // // Send the request and check for errors // PXC_SETANDGETPROCESSINFO_REQRESULTS pResults = NULL; err = XcSetAndGetProcessInfo(NULL,//GetProcessHandle(), &request, &pResults, NULL); if (err == DrError_OK) { // // If request successfully sent, store process status and exit code // DrLogI( "Got command property"); retries = 0; DrError processStatus = pResults->pProcessInfo->ProcessStatus; DrExitCode exitCode = pResults->pProcessInfo->ExitCode; if (processStatus == DrError_OK || exitCode != DrExitCode_StillActive) { // // If the PN thinks we have exited, so better make it so // err = DrError_Fail; } } // // If request was successful and other process doesn't think we're done // if (err == DrError_OK) { if (pResults->pProcessInfo->NumberofProcessProperties != 0) { // // Make sure there's only one property and it's the version // LogAssert(pResults->pProcessInfo-> NumberofProcessProperties == 1); PXC_PROCESSPROPERTY_INFO property = pResults->pProcessInfo->ppProperties[0]; LogAssert(::strcmp(property->pPropertyLabel, label) == 0); // // Update vertex version // UInt64 newVersion = property->PropertyVersion; if (newVersion < m_currentCommandVersion) { // // If vertex version is less than the current version, fail (logic error) // DrLogE( "Property version went back in time. 
Property %s old version %I64u new version %I64u", label.GetString(), m_currentCommandVersion, newVersion); err = DrError_ProcessPropertyVersionMismatch; } else if (newVersion == m_currentCommandVersion) { // // If version the same, report version the same // DrLogI( "Command timeout with same version. Property %s version %I64u", label.GetString(), m_currentCommandVersion); } else if (newVersion > m_currentCommandVersion) { // // If new vertex version, let GM know what process is handling it // DrLogI( "Property got new version. Property %s old version %I64u new version %I64u", label.GetString(), m_currentCommandVersion, newVersion); m_currentCommandVersion = newVersion; DrRef<DVertexCommandBlock> newCommand; newCommand.Attach(new DVertexCommandBlock()); DrRef<DryadXComputePnProcessPropertyResponse> response; response.Attach(new DryadXComputePnProcessPropertyResponse(pResults->pProcessInfo)); // // Get new vertex command // err = newCommand->ReadFromResponseMessage(response, m_vertexId, m_vertexVersion); // // If no errors in getting command, act on it. 
Log any failures below // if (err == DrError_OK) { err = ActOnCommand(newCommand); } } } } else { // // Log error and continue // DrLogE( "XcSetAndGetProcessInfo got error: %s", DRERRORSTRING(err)); } // // If the error is related to disconnection, retry up to 4 times // if (err == DrError_RemoteDisconnected || err == DrError_LocalDisconnected || err == DrError_ConnectionFailed || err == DrError_ResponseDisconnect) { ++retries; // todo: move 4 to global if (retries < 4) { DrLogW( "Retrying get"); err = DrError_OK; } } // // If result was allocated, free it before next iteration // if (pResults != NULL) { XCERROR freeError = XcFreeMemory(pResults); LogAssert(freeError == DrError_OK); } } while (err == DrError_OK); // // Close this controller and take no more requests // DrLogD( "About to terminate"); Terminate(err, DrExitCode_Fail); // // Sleep forever, waiting for verticies to complete and take down the process // Sleep(INFINITE); return 0; }