static bool RegisterSelf(SocketEndpoint &masterEp) { StringBuffer slfStr; StringBuffer masterStr; LOG(MCdebugProgress, thorJob, "registering %s - master %s",slfEp.getUrlStr(slfStr).str(),masterEp.getUrlStr(masterStr).str()); try { SocketEndpoint ep = masterEp; ep.port = getFixedPort(getMasterPortBase(), TPORT_mp); Owned<INode> masterNode = createINode(ep); CMessageBuffer msg; if (!queryWorldCommunicator().recv(msg, masterNode, MPTAG_THORREGISTRATION)) return false; PROGLOG("Initialization received"); unsigned vmajor, vminor; msg.read(vmajor); msg.read(vminor); if (vmajor != THOR_VERSION_MAJOR || vminor != THOR_VERSION_MINOR) { replyError(TE_FailedToRegisterSlave, "Thor master/slave version mismatch"); return false; } Owned<IGroup> rawGroup = deserializeIGroup(msg); globals->Release(); globals = createPTree(msg); mergeCmdParams(globals); // cmd line unsigned slavesPerNode = globals->getPropInt("@slavesPerNode", 1); unsigned channelsPerSlave = globals->getPropInt("@channelsPerSlave", 1); unsigned localThorPortInc = globals->getPropInt("@localThorPortInc", DEFAULT_SLAVEPORTINC); unsigned slaveBasePort = globals->getPropInt("@slaveport", DEFAULT_THORSLAVEPORT); setClusterGroup(masterNode, rawGroup, slavesPerNode, channelsPerSlave, slaveBasePort, localThorPortInc); unsigned numStrands, blockSize; if (globals->hasProp("Debug/@forceNumStrands")) numStrands = globals->getPropInt("Debug/@forceNumStrands"); else { numStrands = defaultForceNumStrands; globals->setPropInt("Debug/@forceNumStrands", defaultForceNumStrands); } if (globals->hasProp("Debug/@strandBlockSize")) blockSize = globals->getPropInt("Debug/@strandBlockSize"); else { blockSize = defaultStrandBlockSize; globals->setPropInt("Debug/@strandBlockSize", defaultStrandBlockSize); } PROGLOG("Strand defaults: numStrands=%u, blockSize=%u", numStrands, blockSize); const char *_masterBuildTag = globals->queryProp("@masterBuildTag"); const char *masterBuildTag = _masterBuildTag?_masterBuildTag:"no build tag"; PROGLOG("Master build: %s", masterBuildTag); if (!_masterBuildTag || 0 != strcmp(BUILD_TAG, _masterBuildTag)) { StringBuffer errStr("Thor master/slave build mismatch, master = "); errStr.append(masterBuildTag).append(", slave = ").append(BUILD_TAG); ERRLOG("%s", errStr.str()); #ifndef _DEBUG replyError(TE_FailedToRegisterSlave, errStr.str()); return false; #endif } msg.read((unsigned &)masterSlaveMpTag); msg.clear(); msg.setReplyTag(MPTAG_THORREGISTRATION); if (!queryNodeComm().reply(msg)) return false; PROGLOG("Registration confirmation sent"); if (!queryNodeComm().recv(msg, 0, MPTAG_THORREGISTRATION)) // when all registered return false; ::masterNode = LINK(masterNode); PROGLOG("verifying mp connection to rest of cluster"); if (!queryNodeComm().verifyAll()) ERRLOG("Failed to connect to all nodes"); else PROGLOG("verified mp connection to rest of cluster"); LOG(MCdebugProgress, thorJob, "registered %s",slfStr.str()); } catch (IException *e) { FLLOG(MCexception(e), thorJob, e,"slave registration error"); e->Release(); return false; } return true; }
static bool RegisterSelf(SocketEndpoint &masterEp) { StringBuffer slfStr; StringBuffer masterStr; LOG(MCdebugProgress, thorJob, "registering %s - master %s",slfEp.getUrlStr(slfStr).toCharArray(),masterEp.getUrlStr(masterStr).toCharArray()); try { SocketEndpoint ep = masterEp; ep.port = getFixedPort(getMasterPortBase(), TPORT_mp); Owned<INode> masterNode = createINode(ep); CMessageBuffer msg; if (!queryWorldCommunicator().recv(msg, masterNode, MPTAG_THORREGISTRATION)) return false; PROGLOG("Initialization received"); unsigned vmajor, vminor; msg.read(vmajor); msg.read(vminor); if (vmajor != THOR_VERSION_MAJOR || vminor != THOR_VERSION_MINOR) { replyError("Thor master/slave version mismatch"); return false; } Owned<IGroup> group = deserializeIGroup(msg); setClusterGroup(group); SocketEndpoint myEp = queryMyNode()->endpoint(); rank_t groupPos = group->rank(queryMyNode()); if (RANK_NULL == groupPos) { replyError("Node not part of thorgroup"); return false; } if (globals->hasProp("@SLAVENUM") && (mySlaveNum != (unsigned)groupPos)) { VStringBuffer errStr("Slave group rank[%d] does not match provided cmd line slaveNum[%d]", mySlaveNum, (unsigned)groupPos); replyError(errStr.str()); return false; } globals->Release(); globals = createPTree(msg); mergeCmdParams(globals); // cmd line const char *_masterBuildTag = globals->queryProp("@masterBuildTag"); const char *masterBuildTag = _masterBuildTag?_masterBuildTag:"no build tag"; PROGLOG("Master build: %s", masterBuildTag); #ifndef _DEBUG if (!_masterBuildTag || 0 != strcmp(BUILD_TAG, _masterBuildTag)) { StringBuffer errStr("Thor master/slave build mismatch, master = "); replyError(errStr.append(masterBuildTag).append(", slave = ").append(BUILD_TAG).str()); return false; } #endif msg.read((unsigned &)masterSlaveMpTag); msg.clear(); msg.setReplyTag(MPTAG_THORREGISTRATION); if (!queryClusterComm().reply(msg)) return false; PROGLOG("Registration confirmation sent"); if (!queryClusterComm().recv(msg, 0, MPTAG_THORREGISTRATION)) // when all registered return false; PROGLOG("verifying mp connection to rest of cluster"); if (!queryClusterComm().verifyAll()) ERRLOG("Failed to connect to all nodes"); else PROGLOG("verified mp connection to rest of cluster"); ::masterNode = LINK(masterNode); LOG(MCdebugProgress, thorJob, "registered %s",slfStr.toCharArray()); } catch (IException *e) { FLLOG(MCexception(e), thorJob, e,"slave registration error"); e->Release(); return false; } return true; }
int main( int argc, char *argv[] ) { #if defined(WIN32) && defined(_DEBUG) int tmpFlag = _CrtSetDbgFlag( _CRTDBG_REPORT_FLAG ); tmpFlag |= _CRTDBG_LEAK_CHECK_DF; _CrtSetDbgFlag( tmpFlag ); #endif InitModuleObjects(); addAbortHandler(ControlHandler); EnableSEHtoExceptionMapping(); dummyProc(); #ifndef __64BIT__ // Restrict stack sizes on 32-bit systems Thread::setDefaultStackSize(0x10000); // NB under windows requires linker setting (/stack:) #endif #ifdef _WIN32 Owned<CReleaseMutex> globalNamedMutex; #endif if (globals) globals->Release(); { Owned<IFile> iFile = createIFile("thor.xml"); globals = iFile->exists() ? createPTree(*iFile, ipt_caseInsensitive) : createPTree("Thor", ipt_caseInsensitive); } unsigned multiThorMemoryThreshold = 0; Owned<IException> unregisterException; try { if (argc==1) { usage(); return 1; } cmdArgs = argv+1; mergeCmdParams(globals); cmdArgs = argv+1; const char *master = globals->queryProp("@MASTER"); if (!master) usage(); const char *slave = globals->queryProp("@SLAVE"); if (slave) { slfEp.set(slave); localHostToNIC(slfEp); } else slfEp.setLocalHost(0); mySlaveNum = globals->getPropInt("@SLAVENUM"); setMachinePortBase(slfEp.port); slfEp.port = getMachinePortBase(); startSlaveLog(); setSlaveAffinity(globals->getPropInt("@SLAVEPROCESSNUM")); startMPServer(getFixedPort(TPORT_mp)); #ifdef USE_MP_LOG startLogMsgParentReceiver(); LOG(MCdebugProgress, thorJob, "MPServer started on port %d", getFixedPort(TPORT_mp)); #endif SocketEndpoint masterEp(master); localHostToNIC(masterEp); setMasterPortBase(masterEp.port); markNodeCentral(masterEp); if (RegisterSelf(masterEp)) { if (globals->getPropBool("Debug/@slaveDaliClient")) enableThorSlaveAsDaliClient(); IDaFileSrvHook *daFileSrvHook = queryDaFileSrvHook(); if (daFileSrvHook) // probably always installed daFileSrvHook->addFilters(globals->queryPropTree("NAS"), &slfEp); StringBuffer thorPath; globals->getProp("@thorPath", thorPath); recursiveCreateDirectory(thorPath.str()); int err = _chdir(thorPath.str()); if (err) { IException *e = makeErrnoExceptionV(-1, "Failed to change dir to '%s'", thorPath.str()); FLLOG(MCexception(e), thorJob, e); throw e; } // Initialization from globals setIORetryCount(globals->getPropInt("Debug/@ioRetries")); // default == 0 == off StringBuffer str; if (globals->getProp("@externalProgDir", str.clear())) _mkdir(str.str()); else globals->setProp("@externalProgDir", thorPath); const char * overrideBaseDirectory = globals->queryProp("@thorDataDirectory"); const char * overrideReplicateDirectory = globals->queryProp("@thorReplicateDirectory"); StringBuffer datadir; StringBuffer repdir; if (getConfigurationDirectory(globals->queryPropTree("Directories"),"data","thor",globals->queryProp("@name"),datadir)) overrideBaseDirectory = datadir.str(); if (getConfigurationDirectory(globals->queryPropTree("Directories"),"mirror","thor",globals->queryProp("@name"),repdir)) overrideReplicateDirectory = repdir.str(); if (overrideBaseDirectory&&*overrideBaseDirectory) setBaseDirectory(overrideBaseDirectory, false); if (overrideReplicateDirectory&&*overrideBaseDirectory) setBaseDirectory(overrideReplicateDirectory, true); StringBuffer tempDirStr; if (getConfigurationDirectory(globals->queryPropTree("Directories"),"temp","thor",globals->queryProp("@name"), tempDirStr)) globals->setProp("@thorTempDirectory", tempDirStr.str()); else tempDirStr.append(globals->queryProp("@thorTempDirectory")); addPathSepChar(tempDirStr).append(getMachinePortBase()); logDiskSpace(); // Log before temp space is cleared SetTempDir(tempDirStr.str(), "thtmp", true); useMemoryMappedRead(globals->getPropBool("@useMemoryMappedRead")); LOG(MCdebugProgress, thorJob, "ThorSlave Version LCR - %d.%d started",THOR_VERSION_MAJOR,THOR_VERSION_MINOR); StringBuffer url; LOG(MCdebugProgress, thorJob, "Slave %s - temporary dir set to : %s", slfEp.getUrlStr(url).str(), queryTempDir()); #ifdef _WIN32 ULARGE_INTEGER userfree; ULARGE_INTEGER total; ULARGE_INTEGER free; if (GetDiskFreeSpaceEx("c:\\",&userfree,&total,&free)&&total.QuadPart) { unsigned pc = (unsigned)(free.QuadPart*100/total.QuadPart); LOG(MCdebugProgress, thorJob, "Total disk space = %" I64F "d k", total.QuadPart/1000); LOG(MCdebugProgress, thorJob, "Free disk space = %" I64F "d k", free.QuadPart/1000); LOG(MCdebugProgress, thorJob, "%d%% disk free\n",pc); } #endif if (getConfigurationDirectory(globals->queryPropTree("Directories"),"query","thor",globals->queryProp("@name"),str.clear())) globals->setProp("@query_so_dir", str.str()); else globals->getProp("@query_so_dir", str.clear()); if (str.length()) { if (globals->getPropBool("Debug/@dllsToSlaves", true)) { StringBuffer uniqSoPath; if (PATHSEPCHAR == str.charAt(str.length()-1)) uniqSoPath.append(str.length()-1, str.str()); else uniqSoPath.append(str); uniqSoPath.append("_").append(getMachinePortBase()); str.swapWith(uniqSoPath); globals->setProp("@query_so_dir", str.str()); } PROGLOG("Using querySo directory: %s", str.str()); recursiveCreateDirectory(str.str()); } multiThorMemoryThreshold = globals->getPropInt("@multiThorMemoryThreshold")*0x100000; if (multiThorMemoryThreshold) { StringBuffer lgname; if (!globals->getProp("@multiThorResourceGroup",lgname)) globals->getProp("@nodeGroup",lgname); if (lgname.length()) { Owned<ILargeMemLimitNotify> notify = createMultiThorResourceMutex(lgname.str()); setMultiThorMemoryNotify(multiThorMemoryThreshold,notify); PROGLOG("Multi-Thor resource limit for %s set to %" I64F "d",lgname.str(),(__int64)multiThorMemoryThreshold); } else multiThorMemoryThreshold = 0; } slaveMain(jobListenerStopped); } LOG(MCdebugProgress, thorJob, "ThorSlave terminated OK"); } catch (IException *e) { if (!jobListenerStopped) FLLOG(MCexception(e), thorJob, e,"ThorSlave"); unregisterException.setown(e); } ClearTempDirs(); if (multiThorMemoryThreshold) setMultiThorMemoryNotify(0,NULL); roxiemem::releaseRoxieHeap(); if (unregisterException.get()) UnregisterSelf(unregisterException); if (globals->getPropBool("Debug/@slaveDaliClient")) disableThorSlaveAsDaliClient(); #ifdef USE_MP_LOG stopLogMsgReceivers(); #endif stopMPServer(); ::Release(globals); releaseAtoms(); // don't know why we can't use a module_exit to destruct these... ExitModuleObjects(); // not necessary, atexit will call, but good for leak checking return 0; }
int main( int argc, char *argv[] ) { #if defined(WIN32) && defined(_DEBUG) int tmpFlag = _CrtSetDbgFlag( _CRTDBG_REPORT_FLAG ); tmpFlag |= _CRTDBG_LEAK_CHECK_DF; _CrtSetDbgFlag( tmpFlag ); #endif InitModuleObjects(); addAbortHandler(ControlHandler); EnableSEHtoExceptionMapping(); dummyProc(); #ifndef __64BIT__ Thread::setDefaultStackSize(0x10000); // NB under windows requires linker setting (/stack:) #endif #ifdef _WIN32 Owned<CReleaseMutex> globalNamedMutex; #endif if (globals) globals->Release(); { Owned<IFile> iFile = createIFile("thor.xml"); globals = iFile->exists() ? createPTree(*iFile, ipt_caseInsensitive) : createPTree("Thor", ipt_caseInsensitive); } unsigned multiThorMemoryThreshold = 0; try { if (argc==1) { usage(); return 1; } cmdArgs = argv+1; mergeCmdParams(globals); cmdArgs = argv+1; const char *master = globals->queryProp("@MASTER"); if (!master) usage(); const char *slave = globals->queryProp("@SLAVE"); if (slave) { slfEp.set(slave); localHostToNIC(slfEp); } else slfEp.setLocalHost(0); if (globals->hasProp("@SLAVENUM")) mySlaveNum = atoi(globals->queryProp("@SLAVENUM")); else mySlaveNum = slfEp.port; // shouldn't happen, provided by script setMachinePortBase(slfEp.port); slfEp.port = getMachinePortBase(); startSlaveLog(); startMPServer(getFixedPort(TPORT_mp)); #ifdef USE_MP_LOG startLogMsgParentReceiver(); LOG(MCdebugProgress, thorJob, "MPServer started on port %d", getFixedPort(TPORT_mp)); #endif SocketEndpoint masterEp(master); localHostToNIC(masterEp); setMasterPortBase(masterEp.port); markNodeCentral(masterEp); if (RegisterSelf(masterEp)) { #define ISDALICLIENT // JCSMORE plugins *can* access dali - though I think we should probably prohibit somehow. #ifdef ISDALICLIENT const char *daliServers = globals->queryProp("@DALISERVERS"); if (!daliServers) { LOG(MCerror, thorJob, "No Dali server list specified\n"); return 1; } Owned<IGroup> serverGroup = createIGroup(daliServers, DALI_SERVER_PORT); unsigned retry = 0; loop { try { LOG(MCdebugProgress, thorJob, "calling initClientProcess"); initClientProcess(serverGroup,DCR_ThorSlave, getFixedPort(TPORT_mp)); break; } catch (IJSOCK_Exception *e) { if ((e->errorCode()!=JSOCKERR_port_in_use)) throw; FLLOG(MCexception(e), thorJob, e,"InitClientProcess"); if (retry++>10) throw; e->Release(); LOG(MCdebugProgress, thorJob, "Retrying"); Sleep(retry*2000); } } setPasswordsFromSDS(); #endif IDaFileSrvHook *daFileSrvHook = queryDaFileSrvHook(); if (daFileSrvHook) // probably always installed daFileSrvHook->addSubnetFilters(globals->queryPropTree("NAS"), NULL); StringBuffer thorPath; globals->getProp("@thorPath", thorPath); recursiveCreateDirectory(thorPath.str()); int err = _chdir(thorPath.str()); if (err) { IException *e = MakeErrnoException(-1, "Failed to change dir to '%s'",thorPath.str()); FLLOG(MCexception(e), thorJob, e); throw e; } // Initialization from globals setIORetryCount(globals->getPropInt("Debug/@ioRetries")); // default == 0 == off StringBuffer str; if (globals->getProp("@externalProgDir", str.clear())) _mkdir(str.str()); else globals->setProp("@externalProgDir", thorPath); const char * overrideBaseDirectory = globals->queryProp("@thorDataDirectory"); const char * overrideReplicateDirectory = globals->queryProp("@thorReplicateDirectory"); StringBuffer datadir; StringBuffer repdir; if (getConfigurationDirectory(globals->queryPropTree("Directories"),"data","thor",globals->queryProp("@name"),datadir)) overrideBaseDirectory = datadir.str(); if (getConfigurationDirectory(globals->queryPropTree("Directories"),"mirror","thor",globals->queryProp("@name"),repdir)) overrideReplicateDirectory = repdir.str(); if (overrideBaseDirectory&&*overrideBaseDirectory) setBaseDirectory(overrideBaseDirectory, false); if (overrideReplicateDirectory&&*overrideBaseDirectory) setBaseDirectory(overrideReplicateDirectory, true); StringBuffer tempdirstr; const char *tempdir = globals->queryProp("@thorTempDirectory"); if (getConfigurationDirectory(globals->queryPropTree("Directories"),"temp","thor",globals->queryProp("@name"),tempdirstr)) tempdir = tempdirstr.str(); SetTempDir(tempdir,true); useMemoryMappedRead(globals->getPropBool("@useMemoryMappedRead")); LOG(MCdebugProgress, thorJob, "ThorSlave Version LCR - %d.%d started",THOR_VERSION_MAJOR,THOR_VERSION_MINOR); StringBuffer url; LOG(MCdebugProgress, thorJob, "Slave %s - temporary dir set to : %s", slfEp.getUrlStr(url).toCharArray(), queryTempDir()); #ifdef _WIN32 ULARGE_INTEGER userfree; ULARGE_INTEGER total; ULARGE_INTEGER free; if (GetDiskFreeSpaceEx("c:\\",&userfree,&total,&free)&&total.QuadPart) { unsigned pc = (unsigned)(free.QuadPart*100/total.QuadPart); LOG(MCdebugProgress, thorJob, "Total disk space = %"I64F"d k", total.QuadPart/1000); LOG(MCdebugProgress, thorJob, "Free disk space = %"I64F"d k", free.QuadPart/1000); LOG(MCdebugProgress, thorJob, "%d%% disk free\n",pc); } #endif if (getConfigurationDirectory(globals->queryPropTree("Directories"),"query","thor",globals->queryProp("@name"),str.clear())) globals->setProp("@query_so_dir", str.str()); else globals->getProp("@query_so_dir", str.clear()); if (str.length()) { if (globals->getPropBool("Debug/@dllsToSlaves", true)) { StringBuffer uniqSoPath; if (PATHSEPCHAR == str.charAt(str.length()-1)) uniqSoPath.append(str.length()-1, str.str()); else uniqSoPath.append(str); uniqSoPath.append("_").append(getMachinePortBase()); str.swapWith(uniqSoPath); globals->setProp("@query_so_dir", str.str()); } PROGLOG("Using querySo directory: %s", str.str()); recursiveCreateDirectory(str.str()); } multiThorMemoryThreshold = globals->getPropInt("@multiThorMemoryThreshold")*0x100000; if (multiThorMemoryThreshold) { StringBuffer lgname; if (!globals->getProp("@multiThorResourceGroup",lgname)) globals->getProp("@nodeGroup",lgname); if (lgname.length()) { Owned<ILargeMemLimitNotify> notify = createMultiThorResourceMutex(lgname.str()); setMultiThorMemoryNotify(multiThorMemoryThreshold,notify); PROGLOG("Multi-Thor resource limit for %s set to %"I64F"d",lgname.str(),(__int64)multiThorMemoryThreshold); } else multiThorMemoryThreshold = 0; } slaveMain(); } LOG(MCdebugProgress, thorJob, "ThorSlave terminated OK"); } catch (IException *e) { FLLOG(MCexception(e), thorJob, e,"ThorSlave"); e->Release(); } catch (CATCHALL) { FLLOG(MCerror, thorJob, "ThorSlave exiting because of uncaught exception"); } ClearTempDirs(); if (multiThorMemoryThreshold) setMultiThorMemoryNotify(0,NULL); roxiemem::releaseRoxieHeap(); #ifdef ISDALICLIENT closeEnvironment(); closedownClientProcess(); // dali client closedown #endif #ifdef USE_MP_LOG stopLogMsgReceivers(); #endif stopMPServer(); ::Release(globals); releaseAtoms(); // don't know why we can't use a module_exit to destruct these... return 0; }