void MpiLauncher::completeLaunch(pid_t pid, const std::string& pidFile, int status)
{
    // rm args file
    boost::scoped_ptr<SharedMemoryIpc> shmIpc(mpi::newSharedMemoryIpc(_ipcName));
    shmIpc->remove();
    shmIpc.reset();

    // rm pid file
    scidb::File::remove(pidFile.c_str(), false);

    // rm log file
    if (!logger->isTraceEnabled() && !_inError) {
        string logFileName = mpi::getLauncherLogFile(_installPath, _queryId, _launchId);
        scidb::File::remove(logFileName.c_str(), false);
    }

    if (WIFSIGNALED(status)) {
        LOG4CXX_ERROR(logger, "SciDB MPI launcher (pid="<<pid<<") terminated by signal = "
                      << WTERMSIG(status) << (WCOREDUMP(status)? ", core dumped" : ""));
        throw SYSTEM_EXCEPTION(SCIDB_SE_INTERNAL, SCIDB_LE_OPERATION_FAILED) << "MPI launcher process";

    } else if (WIFEXITED(status)) {
        int rc = WEXITSTATUS(status);
        if (rc != 0) {
            LOG4CXX_ERROR(logger, "SciDB MPI launcher (pid="<<_pid<<") exited with status = " << rc);
            throw SYSTEM_EXCEPTION(SCIDB_SE_INTERNAL, SCIDB_LE_OPERATION_FAILED) << "MPI launcher process";

        } else {
            LOG4CXX_DEBUG(logger, "SciDB MPI launcher (pid="<<_pid<<") exited with status = " << rc);
            return;
        }
    }
    throw SYSTEM_EXCEPTION(SCIDB_SE_INTERNAL, SCIDB_LE_UNREACHABLE_CODE);
}
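The status word decoded here is collected by the launcher's waitForExit() call (shown later in this listing, Example #15) before completeLaunch() is invoked. As a point of reference, here is a minimal standalone sketch of the same WIFEXITED/WIFSIGNALED decoding pattern, plain POSIX rather than SciDB code:

// Minimal, self-contained sketch (plain POSIX, not SciDB code) of the
// waitpid()/WIFEXITED/WIFSIGNALED decoding pattern used above.
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>
#include <cstdio>

int main()
{
    pid_t pid = fork();
    if (pid < 0) {
        perror("fork");
        return 1;
    }
    if (pid == 0) {
        _exit(3);                        // child: exit with a nonzero status
    }
    int status = 0;
    if (waitpid(pid, &status, 0) != pid) {
        perror("waitpid");
        return 1;
    }
    if (WIFSIGNALED(status)) {           // terminated by a signal
        std::printf("child killed by signal %d\n", WTERMSIG(status));
    } else if (WIFEXITED(status)) {      // exited normally
        std::printf("child exited with status %d\n", WEXITSTATUS(status));
    }
    return 0;
}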
Example #2
void MpiSlaveProxy::destroy(bool error)
{
    QueryID queryIdForKill(INVALID_QUERY_ID);
    if (error) {
        _inError=true;
        queryIdForKill = _queryId;
    }
    const string clusterUuid = Cluster::getInstance()->getUuid();
    // kill the slave proc and its parent orted
    for ( std::vector<pid_t>::const_iterator iter=_pids.begin();
          iter!=_pids.end(); ++iter) {
        pid_t pid = *iter;

        //XXX TODO tigor: kill process group (-pid) ?
        LOG4CXX_DEBUG(logger, "MpiSlaveProxy::destroy: killing slave pid = "<<pid);
        MpiErrorHandler::killProc(_installPath, clusterUuid, pid, queryIdForKill);
    }

    std::string pidFile = mpi::getSlavePidFile(_installPath, _queryId, _launchId);
    MpiErrorHandler::cleanupSlavePidFile(_installPath,
                                         clusterUuid,
                                         pidFile,
                                         queryIdForKill);

    // rm log file
    if (!logger->isTraceEnabled() && !_inError) {
        string logFileName = mpi::getSlaveLogFile(_installPath, _queryId, _launchId);
        scidb::File::remove(logFileName.c_str(), false);
    }
}
Example #3
  void init()
  {
    //calculate log directory
#ifdef _MSC_VER
    std::string log_file_name_str = ros::file_log::getLogDirectory() + "/rosout.log";
    LOG4CXX_DECODE_CHAR(log_file_name, log_file_name_str); // this instantiates log_file_name as type LogString as well
    std::string empty_str = "";
    LOG4CXX_DECODE_CHAR(log_empty, empty_str);
#else
    std::string log_file_name = ros::file_log::getLogDirectory() + "/rosout.log";
    std::string log_empty = "";
#endif

    logger_ = log4cxx::Logger::getRootLogger();
    log4cxx::LayoutPtr layout = new log4cxx::PatternLayout(log_empty);
    log4cxx::RollingFileAppenderPtr appender = new log4cxx::RollingFileAppender(layout, log_file_name, true);
    logger_->addAppender( appender );
    appender->setMaximumFileSize(100*1024*1024);
    appender->setMaxBackupIndex(10);
    log4cxx::helpers::Pool pool;
    appender->activateOptions(pool);

    std::cout << "logging to " << log_file_name.c_str() << std::endl;

    LOG4CXX_INFO(logger_, "\n\n" << ros::Time::now() << "  Node Startup\n");

    agg_pub_ = node_.advertise<rosgraph_msgs::Log>("/rosout_agg", 0);
    std::cout << "re-publishing aggregated messages to /rosout_agg" << std::endl;

    rosout_sub_ = node_.subscribe("/rosout", 0, &Rosout::rosoutCallback, this);
    std::cout << "subscribed to /rosout" << std::endl;
  }
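The core of this setup is plain log4cxx: build a layout, attach a RollingFileAppender, and activate its options. A stripped-down sketch outside ROS, assuming a char (non-Unicode) log4cxx 0.10-style build so std::string converts to LogString directly (as in the non-MSVC branch above); the pattern string and file path are placeholders:

// Sketch: programmatic RollingFileAppender setup, mirroring the calls above.
// Assumes a char (non-Unicode) log4cxx 0.10-style build; path/pattern are examples.
#include <log4cxx/logger.h>
#include <log4cxx/patternlayout.h>
#include <log4cxx/rollingfileappender.h>
#include <log4cxx/helpers/pool.h>

void setupRollingLog()
{
    log4cxx::LoggerPtr root = log4cxx::Logger::getRootLogger();

    log4cxx::LayoutPtr layout = new log4cxx::PatternLayout("%d %-5p %c - %m%n");
    log4cxx::RollingFileAppenderPtr appender =
        new log4cxx::RollingFileAppender(layout, "/tmp/example.log", /*append=*/true);

    appender->setMaximumFileSize(100*1024*1024);   // roll at ~100 MB
    appender->setMaxBackupIndex(10);               // keep 10 rolled files

    log4cxx::helpers::Pool pool;
    appender->activateOptions(pool);               // apply the options
    root->addAppender(appender);

    LOG4CXX_INFO(root, "rolling file logging configured");
}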
Example #4
bool LogImpl::validatePath(log4cxx::LoggerPtr logger)
{
	if(activatePathValidation)
	{
		vector<AppenderPtr> appenders = logger->getAllAppenders();

		for(size_t i = 0; i < appenders.size(); i++)
		{
			std::string currentAppenderName = typeid(*appenders[i]).name();

			// Only rolling-file appenders carry a file path worth validating.
			if(currentAppenderName.find("RollingFile") != string::npos)
			{
				// Downcast the raw pointer held by the smart pointer; casting
				// the address of the AppenderPtr itself would be incorrect.
				RollingFileAppender* appender =
					dynamic_cast<RollingFileAppender*>(appenders[i].operator->());
				if(appender == NULL)
					continue;

				std::string currentFile;
				log4cxx::helpers::Transcoder::encode(appender->getFile(), currentFile);

				if(DirectoryExists(currentFile))
				{
					return true;
				}
			}
		}
	}
	else
		return true;

	return false;
}
Example #5
// XXX TODO: consider returning std::vector<scidb::SharedMemoryPtr>
// XXX TODO: which would require supporting different types of memory (double, char etc.)
std::vector<MPIPhysical::SMIptr_t> MPIPhysical::allocateMPISharedMemory(size_t numBufs,
                                                                        size_t elemSizes[],
                                                                        size_t numElems[],
                                                                        string dbgNames[])
{
    LOG4CXX_DEBUG(logger, "MPIPhysical::allocateMPISharedMemory(numBufs "<<numBufs<<",,,)");

    if(logger->isTraceEnabled()) {
        LOG4CXX_TRACE(logger, "MPIPhysical::allocateMPISharedMemory(): allocations are: ");
        for(size_t ii=0; ii< numBufs; ii++) {
            LOG4CXX_TRACE(logger, "MPIPhysical::allocateMPISharedMemory():"
                                   << " buf["<<ii<<"] " << dbgNames[ii]
                                   << " elemSize " << elemSizes[ii]
                                   << " numElems " << numElems[ii]);
        }
    }

    std::vector<SMIptr_t> shmIpc(numBufs);
    bool preallocate = Config::getInstance()->getOption<bool>(CONFIG_PREALLOCATE_SHM);
    for(size_t ii=0; ii<numBufs; ii++) {
        std::stringstream suffix;
        suffix << "." << ii ;
        std::string ipcNameFull= _ipcName + suffix.str();
        LOG4CXX_TRACE(logger, "IPC name = " << ipcNameFull);
        shmIpc[ii] = SMIptr_t(mpi::newSharedMemoryIpc(ipcNameFull, preallocate)); // can I get 'em off ctx instead?
        _ctx->addSharedMemoryIpc(_launchId, shmIpc[ii]);

        char* ptr = MpiLauncher::initIpcForWrite(shmIpc[ii].get(), (elemSizes[ii] * numElems[ii]));
        assert(ptr); ptr=ptr;
    }
    return shmIpc;
}
Example #6
void log4cxx_debug_dimensions(const std::string& prefix, const Dimensions& dims)
{
    if(logger->isDebugEnabled()) {
        for (size_t i=0; i<dims.size(); i++) {
            LOG4CXX_DEBUG(logger, prefix << " dims["<<i<<"] from " << dims[i].getStartMin() << " to " << dims[i].getEndMax());
        }
    }
}
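Although LOG4CXX_DEBUG already checks the level internally, the isDebugEnabled() guard above also skips the loop itself when DEBUG is off. A variant sketch that batches all dimensions into a single record, assuming the same file-scope logger and Dimensions type as above (needs <sstream>); the helper name is illustrative:

// Sketch: same guard pattern, but batching all dimensions into one message
// so only a single log record is emitted.
void log4cxx_debug_dimensions_oneline(const std::string& prefix, const Dimensions& dims)
{
    if (logger->isDebugEnabled()) {      // skip all formatting when DEBUG is off
        std::ostringstream oss;
        for (size_t i = 0; i < dims.size(); i++) {
            oss << " [" << dims[i].getStartMin() << "," << dims[i].getEndMax() << "]";
        }
        LOG4CXX_DEBUG(logger, prefix << " dims:" << oss.str());
    }
}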
Example #7
void __cdecl AppLog(int level, const char *format, ...)
{
	static log4cxx::LoggerPtr logger(Logger::getLogger("App"));
	va_list args;
	va_start(args, format);

	int nBuf;
	char szBuffer[4096] = "";
#if defined(_WIN32)
	nBuf = _vsnprintf(szBuffer, _countof(szBuffer), format, args);
#else
	// _countof is MSVC-only; on other platforms size the buffer with sizeof
	nBuf = vsnprintf(szBuffer, sizeof(szBuffer), format, args);
#endif
	va_end(args);
	(void)nBuf; // formatted length is not needed; truncation is acceptable
	switch(level) {
	case APP_LOG_DEBUG:
		logger->debug(szBuffer);
		break;
	case APP_LOG_INFO:
		logger->info(szBuffer);
		break;
	case APP_LOG_WARN:
		logger->warn(szBuffer);
		break;
	case APP_LOG_ERR:
		logger->error(szBuffer);
		break;
	default:
		logger->debug(szBuffer);

	}

}
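Callers then use ordinary printf-style formatting and let the wrapper pick the log4cxx level; a hypothetical usage:

// Hypothetical call sites for AppLog() (the APP_LOG_* levels come from the
// surrounding project, as used in the switch above).
void exampleAppLogUsage()
{
	AppLog(APP_LOG_INFO, "connected to %s:%d", "localhost", 2080);
	AppLog(APP_LOG_ERR,  "request failed, rc=%d", -1);
}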
Example #8
void configure_default_logger(log4cxx::LoggerPtr logger,
		log4cxx::LevelPtr level, std::string fname, bool dual)
{
	if (fname.empty() && dual)
		throw std::logic_error("dual log mode requires a filename");

	logger->setLevel(level);

	if (fname.empty() || dual)
	{
		log4cxx::AppenderPtr app = logger_write_to_cout(logger);
		app->setName("COUT");
	}

	if (!fname.empty())
	{
		log4cxx::AppenderPtr app = logger_write_to_file(fname, logger);
		app->setName("FILE");
	}
}
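A typical call, assuming the logger_write_to_cout()/logger_write_to_file() helpers used above are linked in:

// Hypothetical usage: INFO and above to both stdout and a file
// ("dual" mode requires a filename, as enforced above).
void exampleLoggingSetup()
{
	log4cxx::LoggerPtr root = log4cxx::Logger::getRootLogger();
	configure_default_logger(root, log4cxx::Level::getInfo(), "/tmp/app.log", /*dual=*/true);
}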
Example #9
void MpiSlaveProxy::destroy(bool error)
{
    if (error) {
        _inError=true;
    }
    // kill the slave proc and its parent orted
    for ( std::vector<pid_t>::const_iterator iter=_pids.begin();
          iter!=_pids.end(); ++iter) {
        pid_t pid = *iter;

        //XXX TODO tigor: kill process group (-pid) ?
        MpiErrorHandler::killProc(_installPath, pid);
    }

    // rm pid file
    std::string pidFile = mpi::getSlavePidFile(_installPath, _queryId, _launchId);
    scidb::File::remove(pidFile.c_str(), false);

    // rm log file
    if (!logger->isTraceEnabled() && !_inError) {
        string logFileName = mpi::getSlaveLogFile(_installPath, _queryId, _launchId);
        scidb::File::remove(logFileName.c_str(), false);
    }
}
Example #10
void log_interface(int severity, const char *msg) {
	static log4cxx::LoggerPtr logger(Logger::getLogger("Libevent"));
	printf("%s", msg);
	switch(severity) {
	case EVENT_LOG_DEBUG:
		logger->debug(msg);
		break;
	case EVENT_LOG_MSG:
		logger->info(msg);
		break;
	case EVENT_LOG_WARN:
		logger->warn(msg);
		break;
	case EVENT_LOG_ERR:
		logger->error(msg);
		break;
	default:
		logger->debug(msg);
	}
}
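To take effect, the adapter has to be registered with libevent once at startup:

// Register the adapter above so libevent's internal diagnostics flow
// through log4cxx as well.
#include <event2/event.h>

void install_libevent_logging()
{
	event_set_log_callback(log_interface);
}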
Example #11
void TracingDemoModel::populateDetectedBlobs_CentroidPerNeighbourBugsGroup(const std::vector<BugCreature>& bugs, std::vector<DetectedBlob>& resultBlobs)
{
	const float CloseBlobsDist = 7;
	std::vector<uchar> processedBugs(bugs.size(), false);
	for (size_t i = 0; i < bugs.size(); ++i)
	{
		if (processedBugs[i])
			continue;
		processedBugs[i] = true;

		auto& bug = bugs[i];

		float centroidX = bug.Pos.x();
		float centroidY = bug.Pos.y();
		int neighboursCount = 1;

		for (size_t j = i + 1; j < bugs.size(); ++j)
		{
			if (processedBugs[j])
				continue;

			auto& nghBug = bugs[j];

			float len = PoolWatch::sqr(nghBug.Pos.x() - bug.Pos.x()) + PoolWatch::sqr(nghBug.Pos.y() - bug.Pos.y());
			len = std::sqrt(len); // std::sqrtf is non-portable; std::sqrt has a float overload
			if (len < CloseBlobsDist)
			{
				centroidX += nghBug.Pos.x();
				centroidY += nghBug.Pos.y();
				neighboursCount++;
				processedBugs[j] = true;
			}
		}

		centroidX /= neighboursCount;
		centroidY /= neighboursCount;

		const int blobW = 10;
		const int blobH = 10;
		DetectedBlob blob;
		blob.Id = i + 1;
		blob.Centroid = cv::Point2f(centroidX,centroidY);
		blob.CentroidWorld = cv::Point3f(centroidX, centroidY, 0);
		blob.BoundingBox = cv::Rect2f(centroidX - 5, centroidY-5, blobW,blobH);
		blob.FilledImage = cv::Mat(blobW, blobH, CV_8UC1);
		blob.FilledImage.setTo(255);
		fixBlobFilledImageRgb(bug, blobW, blobH, blob);

		blob.AreaPix = blobW * blobH;
		resultBlobs.push_back(blob);
	}

	if (log_->isDebugEnabled())
	{
		std::stringstream bld;
		bld << "Found " << resultBlobs.size() << " blobs" << std::endl;
		for (const auto& blob : resultBlobs)
			bld << "  Id=" << blob.Id << " Centroid=" << blob.Centroid << std::endl;
		LOG4CXX_DEBUG(log_, bld.str());
	}
}
Example #12
int main(int argc, char** argv)
{
    log4cxx::PropertyConfigurator::configure("logger.conf");
    Util::SyncHandler spipe(SIGPIPE, [](int, siginfo_t *, void *){;});

    po::options_description desc("Program options");
    desc.add_options()
    ("help,h", "show usage information")
    ("port", po::value<int>(&opt_port), "bind port (2080)")
    ("perf", po::bool_switch(&opt_perf), "turn off logging")
    ;
    //("coro", po::bool_switch(&opt_coro), "enable coro server")
    //("time", po::value<int>(&opt_time), "time to perform call, us (250000)")
    //("threads", po::value<int>(&opt_threads), "number of worker threads (no threads)")

    po::variables_map vm;
    try {
        po::store(po::parse_command_line(argc, argv, desc), vm);
    } catch (const std::exception& e) {
        ERROR(e.what());
        return -1;
    }
    po::notify(vm);
    if (vm.count("help")) {
        std::cout << desc << std::endl;
        return 0;
    }

    Server s;
    RPC::Library impl(s);

    boost::asio::io_service io;
    Util::ThreadGroup tg;

    boost::asio::signal_set signals(io, SIGINT, SIGTERM);
    signals.async_wait([&io](auto error, auto number){
        if (!error)
        {
            INFO("terminating server");
            io.stop();
        }
    });

    Util::Server server(io);
    server.run(opt_port, [&impl](boost::asio::streambuf& data, Util::FramedSocket ptr) {
        cbor::binary result;
        impl.process(data.data(), result);
        Util::FramedMessage reply(std::move(result));
        ptr->write(std::move(reply));
    });

    // start event loop
    tg.run([&io](){
        log4cxx::NDC ndc("server");
        TRACE("starting ... ");
        io.run();
        TRACE("terminated");
    });

    if (opt_perf)
        logger->setLevel(log4cxx::Level::getFatal());

    tg.wait();

    return 0;
}
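PropertyConfigurator::configure("logger.conf") reads log4j-style properties. A minimal sketch of what logger.conf might contain (the appender name and pattern are illustrative):

# logger.conf (sketch): root logger at INFO, writing to the console
log4j.rootLogger=INFO, CONSOLE
log4j.appender.CONSOLE=org.apache.log4j.ConsoleAppender
log4j.appender.CONSOLE.layout=org.apache.log4j.PatternLayout
log4j.appender.CONSOLE.layout.ConversionPattern=%d %-5p [%c] %m%n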
Example #13
void ProxyServer::run() {
    TRACE(std::cout, "");
    zmq::context_t context(1);
    zmq::socket_t socket(context, ZMQ_REP);
    std::string addr = m_protocol + "://*:" + m_port;
    socket.bind(addr.c_str());
    while (true) {
        TRACE(std::cout, "");
        zmq::message_t z_req;
        socket.recv(&z_req);
        std::string s_req((char *)z_req.data(),
                z_req.size());
        Request req;
        req.ParseFromString(s_req);
        // if trace is in Request_Header, then change log level of appender
        // to trace
        if (m_msgDriver->requestHasTrace(req)) {
            TRACE(std::cout, "");
            logger->setLevel(log4cxx::Level::getTrace());
        }
        if (m_reqPrinter) {
            TRACE(std::cout, "");
            LOG4CXX_TRACE(logger, "Request: "<<(m_reqPrinter(req)));
        }
        Response *resp = NULL;
        // All the anlytics will be logged in this function
        switch (req.header().type()) {
            case Request_Header_Type_LOOKUP:
                {
                    TRACE(std::cout, "");
                    try {
                        m_worker->lookup(req);
                    } catch (KeyNotFoundException &e) {
                        TRACE(std::cout, "");
                        LOG4CXX_DEBUG(logger, "Key not found in store: "
                                <<e.what());
                    }
                    resp = m_worker->response();
                    break;
                }
            case Request_Header_Type_INSERT:
                {
                    TRACE(std::cout, "");
                    try {
                        m_worker->insert(req);
                    } catch (KeyPresentException &e) {
                        TRACE(std::cout, "");
                        LOG4CXX_DEBUG(logger, "Key already present in"
                                " store: "<<e.what());
                    } catch (InsertionException &e) {
                        TRACE(std::cout, "");
                        LOG4CXX_DEBUG(logger, "Insertion error: "
                                <<e.what());
                    }
                    resp = m_worker->response();
                    break;
                }
            case Request_Header_Type_REMOVE:
                {
                    TRACE(std::cout, "");
                    try {
                        m_worker->remove(req);
                    } catch (KeyNotFoundException &e) {
                        TRACE(std::cout, "");
                        LOG4CXX_DEBUG(logger, "Key not found in store: "
                                <<e.what());
                    } catch (DeletionException &e) {
                        TRACE(std::cout, "");
                        LOG4CXX_DEBUG(logger, "Deletion error: "
                                <<e.what());
                    }
                    resp = m_worker->response();
                    break;
                }
            default:
                {
                    TRACE(std::cout, "");
                    break;
                }
        };
        std::string s_resp;
        if (resp != NULL) {
            if (m_respPrinter) {
                TRACE(std::cout, "");
                LOG4CXX_TRACE(logger, "Response : "<<(m_respPrinter(*resp)));
            }
            resp->SerializeToString(&s_resp);
            delete resp;
        } else {
            // unknown request type: a REP socket must still reply, so send an
            // empty message rather than dereferencing a NULL response
            LOG4CXX_WARN(logger, "No response generated for request type "
                    <<req.header().type());
        }
        zmq::message_t reply(s_resp.size());
        memcpy((void *)reply.data(), (void *)s_resp.c_str(),
                s_resp.size());
        socket.send(reply);
    }
}
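For reference, the matching client side is a REQ socket that sends one serialized Request and reads back one serialized Response; a hypothetical sketch against the same protobuf types and the same (old-style) cppzmq calls the server uses:

// Hypothetical client for the REP server above: connect a REQ socket,
// send a serialized Request, and parse the serialized Response.
#include <zmq.hpp>
#include <cstring>
#include <string>

Response sendRequest(const std::string& endpoint, const Request& req)
{
    zmq::context_t context(1);
    zmq::socket_t socket(context, ZMQ_REQ);
    socket.connect(endpoint.c_str());            // e.g. "tcp://localhost:<port>"

    std::string s_req;
    req.SerializeToString(&s_req);
    zmq::message_t z_req(s_req.size());
    memcpy(z_req.data(), s_req.c_str(), s_req.size());
    socket.send(z_req);

    zmq::message_t z_resp;
    socket.recv(&z_resp);

    Response resp;
    resp.ParseFromString(std::string(static_cast<char*>(z_resp.data()), z_resp.size()));
    return resp;
}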
Example #14
namespace scidb
{
using namespace scidb;
using namespace boost;

static log4cxx::LoggerPtr logger(log4cxx::Logger::getLogger("scidb.libdense_linear_algebra.ops.gemm"));

static const bool DBG_CERR = false;
static const bool DBG_REFORMAT = false;

/**
 *  A Physical multiply operator implemented using ScaLAPACK.
 *  The interesting work is done in invokeMPI(), defined below.
 */
class GEMMPhysical : public ScaLAPACKPhysical
{
public:

    GEMMPhysical(const std::string& logicalName, const std::string& physicalName, const Parameters& parameters, const ArrayDesc& schema)
    :
        ScaLAPACKPhysical(logicalName, physicalName, parameters, schema)
    {
    }
    std::shared_ptr<Array> invokeMPI(std::vector< std::shared_ptr<Array> >& inputArrays,
                                const GEMMOptions options, std::shared_ptr<Query>& query,
                                ArrayDesc& outSchema);

    virtual std::shared_ptr<Array> execute(std::vector< std::shared_ptr<Array> >& inputArrays, std::shared_ptr<Query> query);
private:
};


char getTransposeCode(bool transpose) {
    return transpose ? 'T' : 'N' ;
}

std::shared_ptr<Array> GEMMPhysical::invokeMPI(std::vector< std::shared_ptr<Array> >& inputArrays,
                                          const GEMMOptions options, std::shared_ptr<Query>& query,
                                          ArrayDesc& outSchema)
{
    //
    // Everything about the execute() method concerning the MPI execution of the arrays
    // is factored into this method.  This does not include the re-distribution of data
    // chunks into the ScaLAPACK distribution scheme, as the supplied inputArrays
    // must already be in that scheme.
    //
    // + intersects the array chunkGrids with the maximum process grid
    // + sets up the ScaLAPACK grid accordingly and if not participating, return early
    // + start and connect to an MPI slave process
    // + create ScaLAPACK descriptors for the input arrays
    // + convert the inputArrays into in-memory ScaLAPACK layout in shared memory
    // + call a "master" routine that passes the ScaLAPACK operator name, parameters,
    //   and shared memory descriptors to the ScaLAPACK MPI process that will do the
    //   actual computation.
    // + wait for successful completion
    // + construct an "OpArray" that makes an Array API view of the output memory.
    // + return that output array.
    //
    enum dummy  {R=0, C=1};              // row column
    enum dummy2 {AA=0, BB, CC, NUM_MATRICES};  // which matrix: alpha AA * BB + beta CC -> result

    LOG4CXX_DEBUG(logger, "GEMMPhysical::invokeMPI(): begin");

    size_t numArray = inputArrays.size();
    if (numArray != NUM_MATRICES) {  // for now ... may make CC optional when beta is 0, later
        LOG4CXX_ERROR(logger, "GEMMPhysical::invokeMPI(): " << numArray << " != NUM_MATRICES " << size_t(NUM_MATRICES));
        throw (SYSTEM_EXCEPTION(SCIDB_SE_INTERNAL, SCIDB_LE_OPERATION_FAILED)
                   << "GEMMPhysical::invokeMPI(): requires 3 input Arrays/matrices.");
    }

    //
    // Initialize the (emulated) BLACS and get the process grid info
    //
    blacs::context_t blacsContext = doBlacsInit(inputArrays, query, "GEMMPhysical");
    bool isParticipatingInScaLAPACK = blacsContext.isParticipating();
    if (isParticipatingInScaLAPACK) {
        checkBlacsInfo(query, blacsContext, "GEMMPhysical");
    }

    blacs::int_t NPROW=-1, NPCOL=-1, MYPROW=-1 , MYPCOL=-1 ;
    scidb_blacs_gridinfo_(blacsContext, NPROW, NPCOL, MYPROW, MYPCOL);
    LOG4CXX_TRACE(logger, "GEMMPhysical::invokeMPI() NPROW="<<NPROW<<", NPCOL="<<NPCOL);

    //
    // launch MPISlave if we participate
    // TODO: move this down into the ScaLAPACK code ... something that does
    //       the doBlacsInit, launchMPISlaves, and the check that they agree
    //
    bool isParticipatingInMPI = launchMPISlaves(query, NPROW*NPCOL);
    if (isParticipatingInScaLAPACK != isParticipatingInMPI) {
        LOG4CXX_DEBUG(logger, "GEMMPhysical::invokeMPI():"
                              << " isParticipatingInScaLAPACK " << isParticipatingInScaLAPACK
                              << " isParticipatingInMPI " << isParticipatingInMPI);
        throw (SYSTEM_EXCEPTION(SCIDB_SE_INTERNAL, SCIDB_LE_OPERATION_FAILED)
                   << "GEMMPhysical::invokeMPI(): internal inconsistency in MPI slave launch.");
    }

    if (isParticipatingInMPI) {
        LOG4CXX_DEBUG(logger, "GEMMPhysical::invokeMPI(): participating in MPI");
    } else {
        LOG4CXX_DEBUG(logger, "GEMMPhysical::invokeMPI(): not participating in MPI");
        LOG4CXX_DEBUG(logger, "GEMMPhysical::invokeMPI(): only participating in redistribute of the input");

        // redistribute to psScaLAPACK
        // NOTE: this must be kept in sync with the participatingInMPI version of the redistribute, below
        // NOTE: this redistribution must be kept in sync with the participatingInMPI redistributeInputArrays, above
        procRowCol_t firstChunkSize = { chunkRow(inputArrays[0]), chunkCol(inputArrays[0]) };
        std::shared_ptr<PartitioningSchemaDataForScaLAPACK> schemeData =
           make_shared<PartitioningSchemaDataForScaLAPACK>(getBlacsGridSize(inputArrays, query, "GEMMPhysical"), firstChunkSize);

        for(size_t mat=0; mat < numArray; mat++ ) {
            std::stringstream labelStream;
            labelStream << "GEMMPhysical input[" << mat << "]";
            std::shared_ptr<Array> tmpRedistedInput = redistributeInputArray(inputArrays[mat], schemeData, query, labelStream.str());
            bool wasConverted = (tmpRedistedInput != inputArrays[mat]) ;  // only when redistribute was actually done (sometimes optimized away)
            if (wasConverted) {
                SynchableArray* syncArray = safe_dynamic_cast<SynchableArray*>(tmpRedistedInput.get());
                syncArray->sync();
            }
            // free potentially large amount of memory, e.g. when inputArrays[mat] was significantly memory-materialized
            inputArrays[mat].reset();

            // TODO: validate that the redistribute brought no chunks to the instance by
            //       getting an array iterator and make sure it returns no chunks
            //       (factor to ScaLAPACKPhysical.cpp)

            // after validating, we don't need tmpRedistedInput anymore, either
            tmpRedistedInput.reset();
        }
        unlaunchMPISlavesNonParticipating();
        return std::shared_ptr<Array>(new MemArray(_schema,query)); // NOTE: must not happen before redistribute is done.
    }

    //
    // get dimension information about the input arrays
    // TODO: REFACTOR, this is a pattern in DLAs
    //

    // matrix sizes from arrays A,B,C
    matSize_t size[NUM_MATRICES];       // does not change even after redistributeInputArray
    for(size_t i=0; i < numArray; i++ ) {
        size[i] = getMatSize(inputArrays[i]);
        LOG4CXX_DEBUG(logger, "GEMMPhysical::invokeMPI():"
                               << " size["<<i<<"] " << size[i][R] << "," << size[i][C]);

    }

    // TODO JHM : convert 1d arrays to nrows x 1 so we can use vectors as input to GEMM without requiring
    //            the user to add a dimension of size 1.
    for(size_t i=0; i < numArray; i++ ) {
        // TODO JHM : check inputArrays[i] to make sure we are only using 2D arrays,
        //            that may or may not be done by checkInputArrays
        checkInputArray(inputArrays[i]);  // check block size constraints, etc
    }

    //
    //.... Set up ScaLAPACK array descriptors ........................................
    //

    // we would like to do the automatic repart() [not yet implemented] inside the same loop as the
    // redistribute() and extractToScaLAPACK() in order to release each array after it is consumed.
    // unfortunately, we have made some of the routines below dependent on the MB,NB we are going to use,
    // which has recently become determined by the chunkSize of the inputArrays[] since it is no longer
    // a fixed value, but may vary over a legal range.
    // but when automatic repart() is done, we will want to use the chunksize of the output of the repart().
    // so we will need to decide by this point what the MB,NB is going to be, even if we haven't reparted
    // to it yet.
    // to make it clear we mean ScaLAPACK MB,NB
    // (which may become different from the inputArray[i] chunkSize in the future)
    // we will call the array of ScaLAPACK MB,NB pairs,  MB_NB[].
    matSize_t MB_NB[NUM_MATRICES];  // this one should be moved after redistributeInputArrays() for when it really reparts
    for(size_t i=0; i < numArray; i++ ) {
        MB_NB[i] = getMatChunkSize(inputArrays[i]);
        LOG4CXX_DEBUG(logger, "GEMMPhysical::invokeMPI():"
                              << " using MB_NB["<<i<<"] " << MB_NB[i][R] << "," << MB_NB[i][C]);
    }

    // these formulas for LLD (local leading dimension) and LTD (local trailing dimension)
    // are found in the headers of the ScaLAPACK functions such as pdgemm_()
    const slpp::int_t one = 1 ;

    // TODO: turn these pairs into matSize_t matrixLocalSize[NUM_MATRICES];
    slpp::int_t LLD[NUM_MATRICES]; // local leading dimension
    slpp::int_t LTD[NUM_MATRICES]; // local trailing dimension
    for(size_t i=0; i < numArray; i++ ) {
        slpp::int_t RSRC = 0 ;
        LOG4CXX_DEBUG(logger, "GEMMPhysical::invokeMPI():"
                              << " M["<<i<<"][R]"<<size[i][R] <<" MB["<<i<<"][R]:"<<MB_NB[i][R]
                              << " N["<<i<<"][R]"<<size[i][C] <<" NB["<<i<<"][R]:"<<MB_NB[i][C]
                              << " MYPROW:"<<MYPROW << " NPROW:"<< NPROW);
        LLD[i] = std::max(one, scidb_numroc_( size[i][R], MB_NB[i][R], MYPROW, RSRC, NPROW ));
        LTD[i] = std::max(one, scidb_numroc_( size[i][C], MB_NB[i][C], MYPCOL, RSRC, NPCOL ));
        LOG4CXX_DEBUG(logger, "GEMMPhysical::invokeMPI():"
                              << " LLD["<<i<<"] = " << LLD[i]
                              << " LTD["<<i<<"] = " << LTD[i]);
    }

    // create ScaLAPACK array descriptors
    // TODO: lets factor this to a method on ScaLAPACKPhysical
    slpp::desc_t DESC[NUM_MATRICES];
    for(size_t i=0; i < numArray; i++ ) {
        LOG4CXX_DEBUG(logger, "GEMMPhysical::invokeMPI():"
                              << " scidb_descinit_(DESC["<<i<<"], M=" << size[i][R] << ", N=" << size[i][C]
                              << ", MB=" << MB_NB[i][R] << ", NB=" << MB_NB[i][R]
                              << ", IRSRC=" << 0 << ", ICSRC=" << 0
                              << ", LLD=" << LLD[i]);

        slpp::int_t descinitINFO = 0; // an output implemented as non-const ref (due to Fortran calling conventions)
        scidb_descinit_(DESC[i], size[i][R], size[i][C], MB_NB[i][R], MB_NB[i][C], 0, 0, blacsContext, LLD[i], descinitINFO);
        if (descinitINFO != 0) {
            LOG4CXX_ERROR(logger, "GEMMPhysical::invokeMPI(): scidb_descinit(DESC) failed, INFO " << descinitINFO
                                                                                    << " DESC " << DESC);
            throw (SYSTEM_EXCEPTION(SCIDB_SE_INTERNAL, SCIDB_LE_OPERATION_FAILED)
                       << "GEMMPhysical::invokeMPI(): scidb_descinit(DESC) failed");
        }

        LOG4CXX_DEBUG(logger, "GEMMPhysical::invokeMPI():"
                              << " scidb_descinit_() returned DESC["<<i<<"] " << DESC[i]);

        // debugging for #1986 ... when #instances is prime, process grid is a row.  When small chunk sizes are used,
        // desc.LLD is being set to a number larger than the chunk size ... I don't understand or expect this.
        bool doDebugTicket1986=true;  // remains on until fixed, can't ship with this not understood.
        if(doDebugTicket1986) {
            if (DESC[i].LLD > DESC[i].MB) {
                LOG4CXX_DEBUG(logger, "GEMMPhysical::invokeMPI(): ticket 1986 issue"
                                      <<  ", DESC["<<i<<"].LLD " << DESC[i].LLD
                                      << " > DESC["<<i<<"].MB: " << DESC[i].MB);
            }
        }
    }

    // matrix allocations are of local size, not global size
    size_t matrixLocalSize[NUM_MATRICES];
    for(size_t i=0; i < numArray; i++ ) {
        matrixLocalSize[i]  = size_t(LLD[i]) * LTD[i] ;
        LOG4CXX_DEBUG(logger, "GEMMPhysical::invokeMPI(): "
                << " LLD[" << i << "] ( " << LLD[i] << " ) x "
                << " LTD[" << i << "] ( " << LTD[i] << " ) = "
                << " matrixLocalSize[" << i << "] " << matrixLocalSize[i]);
    }

    //
    // Create IPC buffers
    //
    enum dummy3 {BUF_ARGS=0, BUF_MAT_AA, BUF_MAT_BB, BUF_MAT_CC, NUM_BUFS };
    assert(numArray < NUM_BUFS);

    size_t bufElemBytes[NUM_BUFS];
    size_t bufNumElem[NUM_BUFS];
    std::string bufDbgNames[NUM_BUFS];

    bufElemBytes[BUF_ARGS]= 1 ;           bufNumElem[BUF_ARGS]= sizeof(scidb::PdgemmArgs) ; bufDbgNames[BUF_ARGS] = "PdgemmArgs";
    bufElemBytes[BUF_MAT_AA]= sizeof(double) ; bufNumElem[BUF_MAT_AA]= matrixLocalSize[AA]; bufDbgNames[BUF_MAT_AA] = "A" ;
    bufElemBytes[BUF_MAT_BB]= sizeof(double) ; bufNumElem[BUF_MAT_BB]= matrixLocalSize[BB]; bufDbgNames[BUF_MAT_BB] = "B" ;
    bufElemBytes[BUF_MAT_CC]= sizeof(double) ; bufNumElem[BUF_MAT_CC]= matrixLocalSize[CC]; bufDbgNames[BUF_MAT_CC] = "C" ;
    typedef scidb::SharedMemoryPtr<double> shmSharedPtr_t ;

    for(size_t i=0; i < numArray; i++ ) {
        LOG4CXX_DEBUG(logger, "GEMMPhysical::invokeMPI(): "
                << " bufElemBytes[" << i << "] = " << bufElemBytes[i]);
    }
    std::vector<MPIPhysical::SMIptr_t> shmIpc = allocateMPISharedMemory(NUM_BUFS, bufElemBytes, bufNumElem, bufDbgNames);

    // the following used to determine the PDGEMM() "K" argument just prior to pdgemm,
    // but now it has to be done before inputArrays[AA] is .reset() in the following loop.
    //
    // Comments on PDGEMM input "K", taken from the netlib PDGEMM argument header:
    // If transa = 'T' or 'C'(true), it is the number of rows in submatrix A."
    // If transa = 'N'(false), it is the number of columns in submatrix A."
    slpp::int_t K = nCol(inputArrays[AA], options.transposeA);

    // the following also used to be done just prior to pdgemm,
    // but now must be done before inputArrays[CC] is .reset() in the following loop.
    // it must also now be a copy, and not a reference, for the same reason.
    Dimensions const dimsCC = inputArrays[CC]->getArrayDesc().getDimensions();

    // now for each input matrix, do the following:
    // 1. redistribute to psScaLAPACK (when not already correct).
    // 2. if actual conversion happened, release the inputArray, which might be a lot of memory, e.g. when inputArray[i] is materialized.
    // 3. zero the ScaLAPACK local block-cyclic storage in shared mem. (so that empty cells will become zeros).
    // 4. extract the (redistributed array) where not-empty, into the ScaLAPACK local matrix memory.
    // 5. release the redistributed array, which might be a lot of memory since SG is currently materializing.
    //
    // The only caller of this routine is the execute() method, and neither the execute() method, nor the executor that calls it,
    // access the inputArrays after calling execute, which is why we can reset() the shared_ptrs to the arrays after consuming the
    // arrays into the ScaLAPACK memory.
    //
    // redistribute to psScaLAPACK, and convert to ScaLAPACK format.
    // NOTE: this redistribution must be kept in sync with the participatingInMPI redistributeInputArrays, above

    procRowCol_t firstChunkSize = { chunkRow(inputArrays[0]), chunkCol(inputArrays[0]) };
    std::shared_ptr<PartitioningSchemaDataForScaLAPACK> schemeData =
       make_shared<PartitioningSchemaDataForScaLAPACK>(getBlacsGridSize(inputArrays, query, "GEMMPhysical"), firstChunkSize);

    double* asDoubles[NUM_MATRICES];
    for(size_t mat=0; mat < numArray; mat++ ) {
        std::stringstream labelStream;
        labelStream << "GEMMPhysical input[" << mat << "]";
        std::shared_ptr<Array> tmpRedistedInput = redistributeInputArray(inputArrays[mat], schemeData, query, labelStream.str());

        bool wasConverted = (tmpRedistedInput != inputArrays[mat]) ;  // only when redistribute was actually done (sometimes optimized away)

        // TODO would be nice if we could allocate the ScaLAPACK memory after dropping the input array
        //      in case the physical memory for the shmem can be reclaimed from the reset inputArrays[mat]

        size_t buf= mat+1;          // buffer 0 is command buffer, buffers[1..n] correspond to inputs[0..n-1]
        assert(buf < NUM_BUFS);

        asDoubles[mat] = reinterpret_cast<double*>(shmIpc[buf]->get());

        setInputMatrixToAlgebraDefault(asDoubles[mat], bufNumElem[buf]);  // note asDoubles[CC] is input and output to/from ScaLAPACK
        extractArrayToScaLAPACK(tmpRedistedInput, asDoubles[mat], DESC[mat],NPROW, NPCOL, MYPROW, MYPCOL, query);

        if(wasConverted) {
            SynchableArray* syncArray = safe_dynamic_cast<SynchableArray*>(tmpRedistedInput.get());
            syncArray->sync();
        }
        // free potentially large amount of memory, e.g. when inputArrays[mat] was significantly memory-materialized
        inputArrays[mat].reset();
        tmpRedistedInput.reset(); // and drop this array before iterating on the loop to the next repart/redist

        if(DBG_REFORMAT) { // check that the reformat worked correctly
            for(size_t ii=0; ii < matrixLocalSize[mat]; ii++) {
                LOG4CXX_DEBUG(logger, "GEMMPhysical::invokeMPI():"
                                      << " @myPPos("<< MYPROW << "," << MYPCOL << ")"
                                      << " array["<<mat<<"]["<<ii<<"] = " << asDoubles[mat][ii]);
            }
        }
    }
    size_t resultShmIpcIndx = BUF_MAT_CC;           // by default, GEMM assumes it will return something for C
                                                    // but this will change if we find we don't participate in the output
    shmSharedPtr_t Cx(shmIpc[resultShmIpcIndx]);

    //
    //.... Call pdgemm to compute the product of A and B .............................
    //
    LOG4CXX_DEBUG(logger, "GEMMPhysical::invokeMPI(): calling pdgemm_ M,N,K:" << size[AA][R]  << ","
                                                                  << size[BB][R] << ","
                                                                  << size[CC][C]
                           << " MB,NB:" << MB_NB[AA][R] << "," << MB_NB[AA][C]);

    if(DBG_CERR) std::cerr << "GEMMPhysical::invokeMPI(): calling pdgemm to compute" << std::endl;
    std::shared_ptr<MpiSlaveProxy> slave = _ctx->getSlave(_launchId);

    slpp::int_t MYPE = query->getInstanceID() ;  // we map 1-to-1 between instanceID and MPI rank
    slpp::int_t INFO = DEFAULT_BAD_INFO ;
    pdgemmMaster(query.get(), _ctx, slave, _ipcName, shmIpc[BUF_ARGS]->get(),
                  NPROW, NPCOL, MYPROW, MYPCOL, MYPE,
                  getTransposeCode(options.transposeA), getTransposeCode(options.transposeB),
                  size[CC][R], size[CC][C], K,
                  &options.alpha,
                  asDoubles[AA], one, one, DESC[AA],
                  asDoubles[BB], one, one, DESC[BB],
                  &options.beta,
                  asDoubles[CC], one, one, DESC[CC],
                  INFO);
    raiseIfBadResultInfo(INFO, "pdgemm");

    boost::shared_array<char> resPtrDummy(reinterpret_cast<char*>(NULL));
    typedef scidb::ReformatFromScalapack<shmSharedPtr_t> reformatOp_t ;

    if(logger->isTraceEnabled()) {
        LOG4CXX_TRACE(logger, "GEMMPhysical::invokeMPI():--------------------------------------");
        LOG4CXX_TRACE(logger, "GEMMPhysical::invokeMPI(): sequential values from 'C' memory");
        for(size_t ii=0; ii < matrixLocalSize[CC]; ii++) {
            LOG4CXX_TRACE(logger, "GEMMPhysical::invokeMPI(): ("<< MYPROW << "," << MYPCOL << ") C["<<ii<<"] = " << asDoubles[CC][ii]);
        }
        LOG4CXX_TRACE(logger, "GEMMPhysical::invokeMPI(): --------------------------------------");
        LOG4CXX_TRACE(logger, "GEMMPhysical::invokeMPI(): using pdelgetOp to reformat Gemm left from memory to scidb array , start");
    }


    //
    // an OpArray is a SplitArray that is filled on-the-fly by calling the operator
    // so all we have to do is create one with an upper-left corner equal to the
    // global position of the first local block we have.  so we need to map
    // our "processor" coordinate into that position, which we do by multiplying
    // by the chunkSize
    //
    Coordinates first(2);
    first[R] = dimsCC[R].getStartMin() + MYPROW * MB_NB[CC][R];
    first[C] = dimsCC[C].getStartMin() + MYPCOL * MB_NB[CC][C];

    Coordinates last(2);
    last[R] = dimsCC[R].getStartMin() + size[CC][R] - 1;
    last[C] = dimsCC[C].getStartMin() + size[CC][C] - 1;

    std::shared_ptr<Array> result;
    // the process grid may be larger than the size of output in chunks... e.g. multiplying A(1x100) * B(100x1) -> C(1x1)
    bool isParticipatingInOutput = first[R] <= last[R] && first[C] <= last[C] ;
    if (isParticipatingInOutput) {
        // there is in fact some output in our shared memory... hook it up to an OpArray
        Coordinates iterDelta(2);
        iterDelta[0] = NPROW * MB_NB[CC][R];
        iterDelta[1] = NPCOL * MB_NB[CC][C];

        LOG4CXX_DEBUG(logger, "GEMMPhysical::invokeMPI():Creating OpArray from ("<<first[R]<<","<<first[C]<<") to (" << last[R] <<"," <<last[C]<<") delta:"<<iterDelta[R]<<","<<iterDelta[C]);
        reformatOp_t      pdelgetOp(Cx, DESC[CC], dimsCC[R].getStartMin(), dimsCC[C].getStartMin(),
                                    NPROW, NPCOL, MYPROW, MYPCOL);
        result = std::shared_ptr<Array>(new OpArray<reformatOp_t>(outSchema, resPtrDummy, pdelgetOp,
                                                             first, last, iterDelta, query));
        assert(resultShmIpcIndx == BUF_MAT_CC);
    } else {
        LOG4CXX_DEBUG(logger, "GEMMPhysical::invokeMPI(): instance participated, but does not output: creating empty MemArray: first ("<<first[R]<<","<<first[C]<<"), last(" << last[R] <<"," <<last[C]<<")");
        result = std::shared_ptr<Array>(new MemArray(_schema,query));  // same as when we don't participate at all
        resultShmIpcIndx = shmIpc.size();                   // indicate we don't want to hold on to buffer BUF_MAT_CC after all
    }

    // TODO: common pattern in ScaLAPACK operators: factor to base class
    releaseMPISharedMemoryInputs(shmIpc, resultShmIpcIndx);
    unlaunchMPISlaves();

    LOG4CXX_DEBUG(logger, "GEMMPhysical::invokeMPI() end");

    return result;
}


std::shared_ptr<Array> GEMMPhysical::execute(std::vector< std::shared_ptr<Array> >& inputArrays, std::shared_ptr<Query> query)
{
    //
    // + converts inputArrays to psScaLAPACK distribution
    // + calls invokeMPI()
    // + returns the output OpArray.
    //
    LOG4CXX_DEBUG(logger, "GEMMPhysical::execute(): begin.");

    // TODO: make a GEMMLogical checkArgs(inputArrays, query); which asserts two or three arrays

    // get string of parameters from the optional 4th argument:
    // (TRANSA, TRANSB, ALPHA, BETA)
    std::string namedOptionStr;
    if (_parameters.size() >= 1) {
        assert(_parameters[0]->getParamType() == PARAM_PHYSICAL_EXPRESSION);
        typedef std::shared_ptr<OperatorParamPhysicalExpression> ParamType_t ;
        ParamType_t& paramExpr = reinterpret_cast<ParamType_t&>(_parameters[0]);
        assert(paramExpr->isConstant());
        namedOptionStr = paramExpr->getExpression()->evaluate().getString();
    }

    GEMMOptions options(namedOptionStr);

    //
    // invokeMPI()
    //

    // invokeMPI does not manage an empty bitmap yet, but it is specified in _schema.
    // so to make it compatible, we first create a copy of _schema without the empty tag attribute
    Attributes attrsNoEmptyTag = _schema.getAttributes(true /*exclude empty bitmap*/);
    ArrayDesc schemaNoEmptyTag(_schema.getName(), attrsNoEmptyTag, _schema.getDimensions(), defaultPartitioning());

    // and now invokeMPI produces an array without empty bitmap except when it is not participating
    std::shared_ptr<Array> arrayNoEmptyTag = invokeMPI(inputArrays, options, query, schemaNoEmptyTag);


    // now we place a wrapper array around arrayNoEmptyTag, that adds a fake emptyTag (true everywhere)
    // but otherwise passes through requests for iterators on the other attributes.
    // And yes, the class name is the complete opposite of what it should be.
    std::shared_ptr<Array> result;
    if (arrayNoEmptyTag->getArrayDesc().getEmptyBitmapAttribute() == NULL) {
        result = make_shared<NonEmptyableArray>(arrayNoEmptyTag);
    } else {
        result = arrayNoEmptyTag;
    }

    // return the scidb array
    LOG4CXX_DEBUG(logger, "GEMMPhysical::execute(): (successful) end");
    return result;
}

REGISTER_PHYSICAL_OPERATOR_FACTORY(GEMMPhysical, "gemm", "GEMMPhysical");

} // namespace
Example #15
namespace scidb
{

static log4cxx::LoggerPtr logger(log4cxx::Logger::getLogger("scidb.mpi"));

static bool checkLauncher(uint32_t testDelay,
                          uint64_t launchId,
                          MpiOperatorContext* ctx)
{
    std::shared_ptr<MpiLauncher> launcher(ctx->getLauncher(launchId));
    if (isDebug()) {
        if (launcher) {
            // when running tests, slow down to give launcher a chance to exit
            ::sleep(testDelay);
        }
    }
    if (launcher && !launcher->isRunning()) {
            throw SYSTEM_EXCEPTION(SCIDB_SE_INTERNAL, SCIDB_LE_OPERATION_FAILED)
                << "MPI launcher process already terminated";
    }
    return true;
}

static bool checkTimeout(double startTime,
                         double timeout,
                         uint64_t launchId,
                         MpiOperatorContext* ctx)
{
    if (mpi::hasExpired(startTime, timeout)) {
        throw SYSTEM_EXCEPTION(SCIDB_SE_INTERNAL, SCIDB_LE_OPERATION_FAILED)
        << "MPI slave process failed to communicate in time";
    }
    return true;
}

static bool checkLauncherWithTimeout(uint32_t testDelay,
                                     double startTime,
                                     double timeout,
                                     uint64_t launchId,
                                     MpiOperatorContext* ctx)
{
    bool rc = checkLauncher(testDelay, launchId, ctx);
    assert(rc);
    return (checkTimeout(startTime, timeout, launchId, ctx) && rc);
}

void MpiSlaveProxy::waitForHandshake(std::shared_ptr<MpiOperatorContext>& ctx)
{
    if (_connection) {
        throw (InvalidStateException(REL_FILE, __FUNCTION__, __LINE__)
               << "Connection to MPI slave already established");
    }

    LOG4CXX_DEBUG(logger, "MpiSlaveProxy::waitForHandshake: launchId="<<_launchId);

    MpiOperatorContext::LaunchErrorChecker errChecker =
    boost::bind(&checkLauncherWithTimeout,
                _delayForTestingInSec,
                mpi::getTimeInSecs(),
                static_cast<double>(_MPI_SLAVE_RESPONSE_TIMEOUT),
                _1, _2);

    std::shared_ptr<scidb::ClientMessageDescription> msg = ctx->popMsg(_launchId, errChecker);
    assert(msg);

    _connection = msg->getClientContext();

    if (msg->getMessageType() != scidb::mtMpiSlaveHandshake) {
        throw (SYSTEM_EXCEPTION(SCIDB_SE_INTERNAL, SCIDB_LE_UNKNOWN_ERROR)
            << "MPI slave handshake is invalid");
    }

    std::shared_ptr<scidb_msg::MpiSlaveHandshake> handshake =
        std::dynamic_pointer_cast<scidb_msg::MpiSlaveHandshake>(msg->getRecord());
    assert(handshake);

    // parse the handshake
    if (!handshake->has_pid()) {
        throw (SYSTEM_EXCEPTION(SCIDB_SE_INTERNAL, SCIDB_LE_UNKNOWN_ERROR)
            << "MPI slave handshake has no PID");
    }
    const pid_t slavePid = handshake->pid();
    if (slavePid == ::getpid() ||
        slavePid == ::getppid() ||
        slavePid < 2) {
        throw (SYSTEM_EXCEPTION(SCIDB_SE_INTERNAL, SCIDB_LE_UNKNOWN_ERROR)
            << "MPI slave handshake has invalid PID");
    }

    if (!handshake->has_ppid()) {
        throw (SYSTEM_EXCEPTION(SCIDB_SE_INTERNAL, SCIDB_LE_UNKNOWN_ERROR)
            << "MPI slave handshake has no PPID");
    }
    const pid_t slavePPid = handshake->ppid();
    if (slavePPid == ::getpid() ||
        slavePPid == ::getppid() ||
        slavePPid < 2) {
        throw (SYSTEM_EXCEPTION(SCIDB_SE_INTERNAL, SCIDB_LE_UNKNOWN_ERROR)
               << "MPI slave handshake has invalid PPID");
    }

    const string clusterUuid = Cluster::getInstance()->getUuid();

    if (handshake->cluster_uuid() != clusterUuid) {
        throw (SYSTEM_EXCEPTION(SCIDB_SE_INTERNAL, SCIDB_LE_UNKNOWN_ERROR)
               << "MPI slave handshake has invalid clusterUuid");
    }
    InstanceID instanceId = Cluster::getInstance()->getLocalInstanceId();

    if (handshake->instance_id() != instanceId) {
        throw (SYSTEM_EXCEPTION(SCIDB_SE_INTERNAL, SCIDB_LE_UNKNOWN_ERROR)
               << "MPI slave handshake has invalid instanceId");
    }

    if (handshake->launch_id() != _launchId) {
        throw (SYSTEM_EXCEPTION(SCIDB_SE_INTERNAL, SCIDB_LE_UNKNOWN_ERROR)
               << "MPI slave handshake has invalid launchId");
    }

    std::shared_ptr<scidb::Query> query(Query::getValidQueryPtr(_query));

    if (handshake->rank() != query->getInstanceID()) { // logical instance ID
        throw (SYSTEM_EXCEPTION(SCIDB_SE_INTERNAL, SCIDB_LE_UNKNOWN_ERROR)
               << "MPI slave handshake has invalid rank");
    }

    _pids.push_back(slavePid);
    _pids.push_back(slavePPid);

    ClientContext::DisconnectHandler dh =
        boost::bind(&MpiMessageHandler::handleMpiSlaveDisconnect, _launchId, _1);
    _connection->attachQuery(query->getQueryID(), dh);

    LOG4CXX_DEBUG(logger, "MpiSlaveProxy::waitForHandshake: handshake: "
                  <<" pid="<<handshake->pid()
                  <<", ppid="<<handshake->ppid()
                  <<", cluster_uuid="<<handshake->cluster_uuid()
                  <<", instance_id="<<handshake->instance_id()
                  <<", launch_id="<<handshake->launch_id()
                  <<", rank="<<handshake->rank());
}

void MpiSlaveProxy::sendCommand(mpi::Command& cmd, std::shared_ptr<MpiOperatorContext>& ctx)
{
    if (!_connection) {
        throw (InvalidStateException(REL_FILE, __FUNCTION__, __LINE__)
               << "No connection to MPI slave");
    }

    // set command
    std::shared_ptr<scidb_msg::MpiSlaveCommand> cmdPtr(new scidb_msg::MpiSlaveCommand());
    cmdPtr->set_command(cmd.getCmd());

    // set args
    const std::vector<std::string>& args = cmd.getArgs();
    google::protobuf::RepeatedPtrField<std::string>* msgArgs = cmdPtr->mutable_args();
    assert(msgArgs);
    msgArgs->Reserve(args.size());
    for (std::vector<std::string>::const_iterator iter = args.begin(); iter != args.end(); ++iter) {
        cmdPtr->add_args(*iter);
    }
    // send to slave
    scidb::MessagePtr msgPtr(cmdPtr);
    boost::asio::const_buffer binary(NULL,0);
    try {
        scidb::sendAsyncClient(_connection, scidb::mtMpiSlaveCommand, msgPtr, binary);

    } catch (const scidb::SystemException& e) {
        LOG4CXX_ERROR(logger, "MpiSlaveProxy::sendCommand: "
                      << "FAILED to send MpiSlaveCommand to slave because: "
                      << e.what());
        throw;
    }
    LOG4CXX_DEBUG(logger, "MpiSlaveProxy::sendCommand: MpiSlaveCommand "
                  << cmd.toString() << " sent to slave");
}

/// @todo XXX TODO tigor: make it timeout ? TBD
int64_t MpiSlaveProxy::waitForStatus(std::shared_ptr<MpiOperatorContext>& ctx, bool raise)
{
    if (!_connection) {
        throw (InvalidStateException(REL_FILE, __FUNCTION__, __LINE__)
               << "No connection to MPI slave");
    }

    LOG4CXX_DEBUG(logger, "MpiSlaveProxy::waitForStatus: launchId="<<_launchId);

    MpiOperatorContext::LaunchErrorChecker errChecker =
       boost::bind(&checkLauncher, _delayForTestingInSec, _1, _2);

    std::shared_ptr<scidb::ClientMessageDescription> msg = ctx->popMsg(_launchId, errChecker);
    assert(msg);

    LOG4CXX_DEBUG(logger, "MpiSlaveProxy::waitForStatus: message from client: "
                  <<" ctx = " << msg->getClientContext().get()
                  <<", msg type = "<< msg->getMessageType()
                  <<", queryID = "<<msg->getQueryId());

    if (_connection != msg->getClientContext()) {
        if (!msg->getClientContext() &&
            msg->getMessageType() == scidb::SYSTEM_NONE_MSG_ID) {
            throw (SYSTEM_EXCEPTION(SCIDB_SE_INTERNAL, SCIDB_LE_UNKNOWN_ERROR)
                   << "MPI slave disconnected prematurely");
        }
        throw (SYSTEM_EXCEPTION(SCIDB_SE_INTERNAL, SCIDB_LE_UNKNOWN_ERROR)
               << "MPI slave connection context mismatch");
    }

    if (msg->getMessageType() != scidb::mtMpiSlaveResult) {
        throw (SYSTEM_EXCEPTION(SCIDB_SE_INTERNAL, SCIDB_LE_UNKNOWN_ERROR)
               << "MPI slave returned invalid status");
    }

    std::shared_ptr<scidb_msg::MpiSlaveResult> result =
        std::dynamic_pointer_cast<scidb_msg::MpiSlaveResult>(msg->getRecord());
    assert(result);

    if (!result->has_status()) {
        throw (SYSTEM_EXCEPTION(SCIDB_SE_INTERNAL, SCIDB_LE_UNKNOWN_ERROR)
               << "MPI slave returned no status");
    }

    if (raise && result->status() != 0) {
        std::stringstream ss;
        ss << "MPI Slave Execution returned status " << result->status();
        throw (SYSTEM_EXCEPTION(SCIDB_SE_OPERATOR, SCIDB_LE_OPERATION_FAILED) << ss.str());
    }
    return result->status();
}

void MpiSlaveProxy::waitForExit(std::shared_ptr<MpiOperatorContext>& ctx)
{
    if (!_connection) {
        throw (InvalidStateException(REL_FILE, __FUNCTION__, __LINE__)
               << "No connection to MPI slave");
    }

    LOG4CXX_DEBUG(logger, "MpiSlaveProxy::waitForExit: launchId="<<_launchId);

    MpiOperatorContext::LaunchErrorChecker errChecker =
    boost::bind(&checkTimeout,
                mpi::getTimeInSecs(),
                static_cast<double>(_MPI_SLAVE_RESPONSE_TIMEOUT),
                _1, _2);

    std::shared_ptr<scidb::ClientMessageDescription> msg = ctx->popMsg(_launchId, errChecker);
    assert(msg);

    LOG4CXX_DEBUG(logger, "MpiSlaveProxy::waitForExit: "
                  <<" ctx = " << msg->getClientContext().get()
                  <<", msg type = "<< msg->getMessageType()
                  <<", queryID = "<<msg->getQueryId());

    if (msg->getMessageType() != scidb::SYSTEM_NONE_MSG_ID) {
        throw (SYSTEM_EXCEPTION(SCIDB_SE_INTERNAL, SCIDB_LE_UNKNOWN_ERROR)
               << "MPI slave returned invalid status");
    }
    assert(!msg->getClientContext());
    _connection.reset();
}

void MpiSlaveProxy::destroy(bool error)
{
    QueryID queryIdForKill(INVALID_QUERY_ID);
    if (error) {
        _inError=true;
        queryIdForKill = _queryId;
    }
    const string clusterUuid = Cluster::getInstance()->getUuid();
    // kill the slave proc and its parent orted
    for ( std::vector<pid_t>::const_iterator iter=_pids.begin();
          iter!=_pids.end(); ++iter) {
        pid_t pid = *iter;

        //XXX TODO tigor: kill process group (-pid) ?
        LOG4CXX_DEBUG(logger, "MpiSlaveProxy::destroy: killing slave pid = "<<pid);
        MpiErrorHandler::killProc(_installPath, clusterUuid, pid, queryIdForKill);
    }

    std::string pidFile = mpi::getSlavePidFile(_installPath, _queryId, _launchId);
    MpiErrorHandler::cleanupSlavePidFile(_installPath,
                                         clusterUuid,
                                         pidFile,
                                         queryIdForKill);

    // rm log file
    if (!logger->isTraceEnabled() && !_inError) {
        string logFileName = mpi::getSlaveLogFile(_installPath, _queryId, _launchId);
        scidb::File::remove(logFileName.c_str(), false);
    }
}

} //namespace
namespace scidb
{
static log4cxx::LoggerPtr logger(log4cxx::Logger::getLogger("scidb.mpi"));

#if defined(NDEBUG)
static const bool DBG = false;
#else
static const bool DBG = true;
#endif

MpiLauncher::MpiLauncher(uint64_t launchId, const boost::shared_ptr<Query>& q)
  : _pid(0),
    _status(0),
    _queryId(q->getQueryID()),
    _launchId(launchId),
    _query(q),
    _waiting(false),
    _inError(false),
    _MPI_LAUNCHER_KILL_TIMEOUT(scidb::getLivenessTimeout())
{
}

MpiLauncher::MpiLauncher(uint64_t launchId, const boost::shared_ptr<Query>& q, uint32_t timeout)
  : _pid(0),
    _status(0),
    _queryId(q->getQueryID()),
    _launchId(launchId),
    _query(q),
    _waiting(false),
    _inError(false),
    _MPI_LAUNCHER_KILL_TIMEOUT(timeout)
{
}

void MpiLauncher::getPids(vector<pid_t>& pids)
{
    ScopedMutexLock lock(_mutex);
    if (_pid <= 1) {
        throw InvalidStateException(REL_FILE, __FUNCTION__, __LINE__)
            << " MPI launcher is not running";
    }
    pids.push_back(_pid);
}

void MpiLauncher::launch(const vector<string>& slaveArgs,
                         const boost::shared_ptr<const InstanceMembership>& membership,
                         const size_t maxSlaves)
{
    vector<string> args;
    {
        ScopedMutexLock lock(_mutex);
        if (_pid != 0 || _waiting) {
            throw InvalidStateException(REL_FILE, __FUNCTION__, __LINE__)
                << " MPI launcher is already running";
        }
        boost::shared_ptr<Query> query = _query.lock();
        Query::validateQueryPtr(query);

        buildArgs(args, slaveArgs, membership, query, maxSlaves);
    }
    pid_t pid = fork();

    if (pid < 0) {
        // error
        int err = errno;
        throw (SYSTEM_EXCEPTION(SCIDB_SE_INTERNAL, SCIDB_LE_SYSCALL_ERROR)
               << "fork" << pid << err <<"");

    } else if (pid > 0) {
        // parent
        ScopedMutexLock lock(_mutex);
        if (_pid != 0 || _waiting) {
            throw InvalidStateException(REL_FILE, __FUNCTION__, __LINE__)
                << " MPI launcher is corrupted after launch";
        }
        _pid = pid;
        LOG4CXX_DEBUG(logger, "MPI launcher process spawned, pid="<<_pid);
        return;

    }  else {
        // child
        becomeProcGroupLeader();
        recordPids();
        setupLogging();

        if (DBG) {
            std::cerr << "LAUNCHER pid="<<getpid()
             << ", pgid="<< ::getpgid(0)
             << ", ppid="<< ::getppid()<<std::endl;
        }

        closeFds();
        boost::scoped_array<const char*> argv(new const char*[args.size()+1]);
        initExecArgs(args, argv);
        const char *path = argv[0];

        if (DBG) {
            std::cerr << "LAUNCHER pid="<<::getpid()<<" args for "<<path<<" are ready" << std::endl;
            for (size_t i=0; i<args.size(); ++i) {
                const char * arg = argv[i];
                if (!arg) break;
                cerr << "LAUNCHER arg["<<i<<"] = "<< argv[i] << std::endl;
            }
        }

        int rc = ::execv(path, const_cast<char* const*>(argv.get()));

        assert(rc == -1);
        rc=rc; // avoid compiler warning

        perror("LAUNCHER execv");
        _exit(1);
    }
    throw SYSTEM_EXCEPTION(SCIDB_SE_INTERNAL, SCIDB_LE_UNREACHABLE_CODE);
}

bool MpiLauncher::isRunning()
{
    pid_t pid=0;
    int status=0;
    {
        ScopedMutexLock lock(_mutex);
        if (_pid<=0) { return false; }
        pid = _pid;
    }

    const bool doNotWait=true;
    bool rc = waitForExit(pid, &status, doNotWait);

    if (!rc) {
        return true;
    }

    ScopedMutexLock lock(_mutex);
    _pid = -pid;
    _status = status;

    return false;
}

void MpiLauncher::destroy(bool force)
{
    pid_t pid=0;
    int status=0;
    string pidFile;
    {
        ScopedMutexLock lock(_mutex);
        if (_pid == 0 || _waiting) {
            throw (InvalidStateException(REL_FILE, __FUNCTION__, __LINE__)
                   << " MPI launcher already destroyed");
        }
        _waiting = true;
        pid = _pid;
        status = _status;
        pidFile = mpi::getLauncherPidFile(_installPath, _queryId, _launchId);

        if (pid > 0) {
            if (!force) {
                scheduleKillTimer();
            } else { // kill right away
                boost::shared_ptr<boost::asio::deadline_timer> dummyTimer;
                boost::system::error_code dummyErr;
                handleKillTimeout(dummyTimer, dummyErr);
            }
        }
        if (force) {
            _inError=true;
        }
    }
    if (pid < 0) {
        completeLaunch(-pid, pidFile, status);
        return;
    }
    bool rc = waitForExit(pid,&status);
    assert(rc); rc=rc;
    {
        ScopedMutexLock lock(_mutex);
        if (!_waiting || pid != _pid) {
             throw InvalidStateException(REL_FILE, __FUNCTION__, __LINE__)
                 << " MPI launcher is corrupted after collecting process exit code";
         }

        _pid = -pid;
        _status = status;

        if (_killTimer) {
            size_t n = _killTimer->cancel();
            assert(n<2); n=n;
        }
    }
    completeLaunch(pid, pidFile, status);
}

void MpiLauncher::completeLaunch(pid_t pid, const std::string& pidFile, int status)
{
    // rm args file
    boost::scoped_ptr<SharedMemoryIpc> shmIpc(mpi::newSharedMemoryIpc(_ipcName));
    shmIpc->remove();
    shmIpc.reset();

    // rm pid file
    scidb::File::remove(pidFile.c_str(), false);

    // rm log file
    if (!logger->isTraceEnabled() && !_inError) {
        string logFileName = mpi::getLauncherLogFile(_installPath, _queryId, _launchId);
        scidb::File::remove(logFileName.c_str(), false);
    }

    if (WIFSIGNALED(status)) {
        LOG4CXX_ERROR(logger, "SciDB MPI launcher (pid="<<pid<<") terminated by signal = "
                      << WTERMSIG(status) << (WCOREDUMP(status)? ", core dumped" : ""));
        throw SYSTEM_EXCEPTION(SCIDB_SE_INTERNAL, SCIDB_LE_OPERATION_FAILED) << "MPI launcher process";

    } else if (WIFEXITED(status)) {
        int rc = WEXITSTATUS(status);
        if (rc != 0) {
            LOG4CXX_ERROR(logger, "SciDB MPI launcher (pid="<<_pid<<") exited with status = " << rc);
            throw SYSTEM_EXCEPTION(SCIDB_SE_INTERNAL, SCIDB_LE_OPERATION_FAILED) << "MPI launcher process";

        } else {
            LOG4CXX_DEBUG(logger, "SciDB MPI launcher (pid="<<_pid<<") exited with status = " << rc);
            return;
        }
    }
    throw SYSTEM_EXCEPTION(SCIDB_SE_INTERNAL, SCIDB_LE_UNREACHABLE_CODE);
}

void MpiLauncher::handleKillTimeout(boost::shared_ptr<boost::asio::deadline_timer>& killTimer,
                                    const boost::system::error_code& error)
{
    ScopedMutexLock lock(_mutex);

    if (error == boost::asio::error::operation_aborted) {
        assert(_pid < 0);
        LOG4CXX_TRACE(logger, " MPI launcher kill timer cancelled");
        return;
    }
    if (error) {
        assert(false);
        LOG4CXX_WARN(logger, "MPI launcher kill timer encountered error"<<error);
    }
    if (_pid <= 0) {
        LOG4CXX_WARN(logger, "MPI launcher kill timer cannot kill pid="<<_pid);
        return;
    }
    if (!_waiting) {
        assert(false);
        LOG4CXX_ERROR(logger, "MPI launcher kill timer cannot kill pid="<<_pid);
        throw InvalidStateException(REL_FILE, __FUNCTION__, __LINE__)
            << " MPI launcher process cannot be killed";
    }
    LOG4CXX_WARN(logger, "MPI launcher is about to kill group pid="<<_pid);

    // kill launcher's proc group
    MpiErrorHandler::killProc(_installPath, -_pid);
}

static void validateLauncherArg(const std::string& arg)
{
    const char *notAllowed = " \n";
    if (arg.find_first_of(notAllowed) != string::npos) {
        throw (SYSTEM_EXCEPTION(SCIDB_SE_INTERNAL, SCIDB_LE_INVALID_FUNCTION_ARGUMENT)
               << (string("MPI launcher argument with whitespace: ")+arg));
    }
}

/// @todo XXX tigor: move command args into a file
void MpiLauncher::buildArgs(vector<string>& args,
                            const vector<string>& slaveArgs,
                            const boost::shared_ptr<const InstanceMembership>& membership,
                            const boost::shared_ptr<Query>& query,
                            const size_t maxSlaves)
{
    for (vector<string>::const_iterator iter=slaveArgs.begin();
         iter!=slaveArgs.end(); ++iter) {
        validateLauncherArg(*iter);
    }

    const Instances& instances = membership->getInstanceConfigs();

    map<InstanceID,const InstanceDesc*> sortedInstances;
    getSortedInstances(sortedInstances, instances, query);

    ostringstream buf;
    const string clusterUuid = Cluster::getInstance()->getUuid();
    buf << _queryId;
    const string queryId  = buf.str();
    buf.str("");
    buf << _launchId;
    const string launchId = buf.str();

    // preallocate memory
    const size_t ARGS_PER_INSTANCE = 16;
    const size_t ARGS_PER_LAUNCH = 4;
    const size_t MPI_PREFIX_CORRECTION = 2;
    size_t totalArgsNum = ARGS_PER_LAUNCH +
       (ARGS_PER_INSTANCE+slaveArgs.size()) * std::min(maxSlaves, sortedInstances.size()) -
       MPI_PREFIX_CORRECTION;
    args.clear();
    args.reserve(totalArgsNum);
    InstanceID myId = Cluster::getInstance()->getLocalInstanceId();

    args.push_back(string("")); //place holder for the binary
    args.push_back(string("--verbose"));
    args.push_back(string("--tag-output"));
    args.push_back(string("--timestamp-output"));

    // first, find my own install path, and add coordinator arguments
    for (map<InstanceID,const InstanceDesc*>::const_iterator i = sortedInstances.begin();
         i !=  sortedInstances.end(); ++i) {

        assert(i->first<sortedInstances.size());
        const InstanceDesc* desc = i->second;
        assert(desc);
        InstanceID currId = desc->getInstanceId();
        assert(currId < instances.size());

        if (currId != myId) {
            continue;
        }
        assert(args[0].empty());
        const string& installPath = desc->getPath();
        _installPath = installPath;
        args[0] = MpiManager::getLauncherBinFile(installPath);

        addPerInstanceArgs(myId, desc, clusterUuid, queryId,
                           launchId, slaveArgs, args);
    }

    assert(!args[0].empty());

    // second, loop again to add the launch arguments for the remaining instances
    size_t count = 1;
    for (map<InstanceID,const InstanceDesc*>::const_iterator i = sortedInstances.begin();
         i !=  sortedInstances.end() && count<maxSlaves; ++i,++count) {

        const InstanceDesc* desc = i->second;
        InstanceID currId = desc->getInstanceId();

        if (currId == myId) {
            --count;
            continue;
        }
        addPerInstanceArgs(myId, desc, clusterUuid, queryId,
                           launchId, slaveArgs, args);
    }
    int64_t shmSize(0);
    vector<string>::iterator iter=args.begin();
    iter += ARGS_PER_LAUNCH;

    // compute arguments size
    const size_t DELIM_SIZE=sizeof('\n');
    for (; iter!=args.end(); ++iter) {
        string& arg = (*iter);
        shmSize += (arg.size()+DELIM_SIZE);
    }

    LOG4CXX_TRACE(logger, "MPI launcher arguments size = " << shmSize);

    // Create shared memory to pass the arguments to the launcher
    _ipcName = mpi::getIpcName(_installPath, clusterUuid, queryId, myId, launchId) + ".launch_args";

    LOG4CXX_TRACE(logger, "MPI launcher arguments ipc = " << _ipcName);

    boost::scoped_ptr<SharedMemoryIpc> shmIpc(mpi::newSharedMemoryIpc(_ipcName));
    char* ptr(NULL);
    try {
        shmIpc->create(SharedMemoryIpc::RDWR);
        shmIpc->truncate(shmSize);
        ptr = reinterpret_cast<char*>(shmIpc->get());
    } catch(scidb::SharedMemoryIpc::SystemErrorException& e) {
        LOG4CXX_ERROR(logger, "Cannot map shared memory: " << e.what());
        throw (SYSTEM_EXCEPTION(SCIDB_SE_INTERNAL, SCIDB_LE_OPERATION_FAILED) << "shared_memory_mmap");
    } catch(scidb::SharedMemoryIpc::InvalidStateException& e) {
        LOG4CXX_ERROR(logger, "Unexpected error while mapping shared memory: " << e.what());
        throw (SYSTEM_EXCEPTION(SCIDB_SE_INTERNAL, SCIDB_LE_UNKNOWN_ERROR) << e.what());
    }
    assert(ptr);

    size_t off = 0;
    iter=args.begin();
    iter += ARGS_PER_LAUNCH;
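    // Serialize the launcher arguments into the shared segment: each "-H <host>"
    // group starts on a new line, the remaining arguments are space-separated,
    // and the buffer ends with a final newline (matching the size computed above).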
    for (; iter!=args.end(); ++iter) {
        string& arg = (*iter);

        if (off == 0) {
        } else if (arg == "-H") {
            *(ptr+off) = '\n';
            ++off;
        } else {
            *(ptr+off) = ' ';
            ++off;
        }
         memcpy((ptr+off), arg.data(), arg.size());
         off += arg.size();
         arg.clear();
    }
    *(ptr+off) = '\n';
    ++off;
    assert(static_cast<int64_t>(off) <= shmSize);
    shmIpc->close();
    shmIpc->flush();

    assert(args.size() >= ARGS_PER_LAUNCH+2);
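    // Collapse the per-instance portion of args into a single "--app <file>" pair;
    // the full per-instance argument list now lives in the shared-memory file written
    // above, leaving only the short launcher command line in args.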
    args[ARGS_PER_LAUNCH+0] = "--app";
    args[ARGS_PER_LAUNCH+1] = mpi::getIpcFile(_installPath,_ipcName);
    args.resize(ARGS_PER_LAUNCH+2);
}

void MpiLauncher::addPerInstanceArgs(const InstanceID myId, const InstanceDesc* desc,
                                     const string& clusterUuid,
                                     const string& queryId,
                                     const string& launchId,
                                     const vector<string>& slaveArgs, vector<string>& args)
{
    InstanceID currId = desc->getInstanceId();

    ostringstream instanceIdStr;
    instanceIdStr << currId;

    const string& host = desc->getHost();
    const string& installPath = desc->getPath();

    ostringstream portStr;
    portStr << desc->getPort();

    // mpirun command line:
    // [":", "-H", <IP>, "-np", <#>, "-wd", <path>, "--prefix", <path>, "-x", "LD_LIBRARY_PATH"]*
    validateLauncherArg(host);
    args.push_back("-H");
    args.push_back(host);

    args.push_back("-np");
    args.push_back("1");

    validateLauncherArg(installPath);
    args.push_back("-wd");
    args.push_back(installPath);

    if (currId != myId) {
        // XXX NOTE: --prefix is not appended for this instance (the coordinator),
        // and this instance's arguments go first in the argument list because of
        // an apparent bug in mpirun's handling of --prefix
        const string mpiDir = MpiManager::getMpiDir(installPath);
        validateLauncherArg(mpiDir);
        args.push_back("--prefix");
        args.push_back(mpiDir);
    }
    args.push_back("-x");
    args.push_back("LD_LIBRARY_PATH");

    const string slaveBinFile = mpi::getSlaveBinFile(installPath);
    validateLauncherArg(slaveBinFile);
    args.push_back(slaveBinFile);

    // slave args
    args.push_back(clusterUuid);
    args.push_back(queryId);
    args.push_back(instanceIdStr.str());
    args.push_back(launchId);
    args.push_back(portStr.str());

    args.insert(args.end(), slaveArgs.begin(), slaveArgs.end());
}

void MpiLauncher::getSortedInstances(map<InstanceID,const InstanceDesc*>& sortedInstances,
                                     const Instances& instances,
                                     const boost::shared_ptr<Query>& query)
{
    for (Instances::const_iterator i = instances.begin(); i != instances.end(); ++i) {
        InstanceID id = i->getInstanceId();
        try {
            // lid should be equal to the MPI rank
            InstanceID lid = query->mapPhysicalToLogical(id);
            sortedInstances[lid] = &(*i);
        } catch(SystemException& e) {
            if (e.getLongErrorCode() != SCIDB_LE_INSTANCE_OFFLINE) {
                throw;
            }
        }
    }
    assert(sortedInstances.size() == query->getInstancesCount());
}

void MpiLauncher::closeFds()
{
    //XXX TODO: move to Sysinfo
    long maxfd = ::sysconf(_SC_OPEN_MAX);
    if (maxfd<2) {
        maxfd = 1024;
    }

    cerr << "LAUNCHER: maxfd = " << maxfd << endl;

    // close all fds except for stderr,stdout
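    // (stdout/stderr are left open on purpose: setupLogging() has already redirected
    // them to the launcher log file, so later diagnostics still land there)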
    int rc = scidb::File::closeFd(0); //stdin
    rc=rc; // avoid compiler warning
    for (long fd=3; fd <= maxfd ; ++fd) {
        rc = scidb::File::closeFd(fd);
        rc=rc; // avoid compiler warning
    }
}

void MpiLauncher::becomeProcGroupLeader()
{
    if (setpgid(0,0) != 0) {
        perror("setpgid");
        _exit(1);
    }
}

void MpiLauncher::setupLogging()
{
    std::string path =
        mpi::getLauncherLogFile(_installPath, _queryId, _launchId);
    mpi::connectStdIoToLog(path);
}

void MpiLauncher::recordPids()
{
    assert(!_installPath.empty());
    string path =
        mpi::getLauncherPidFile(_installPath, _queryId, _launchId);
    mpi::recordPids(path);
}

void MpiLauncher::initExecArgs(const vector<string>& args,
                               boost::scoped_array<const char*>& argv)
{
    size_t argsSize = args.size();
    if (argsSize<1) {
        cerr << "LAUNCHER: initExecArgs failed to get args:" << argsSize << endl;
        _exit(1);
    }
    for (size_t i=0; i < argsSize; ++i) {
        argv[i] = args[i].c_str();
    }
    argv[argsSize] = NULL;
}

void MpiLauncher::scheduleKillTimer()
{
    // this->_mutex must be locked
    assert (_pid > 1);
    assert(!_killTimer);
    _killTimer = shared_ptr<boost::asio::deadline_timer>(new boost::asio::deadline_timer(getIOService()));
    int rc = _killTimer->expires_from_now(posix_time::seconds(_MPI_LAUNCHER_KILL_TIMEOUT));
    if (rc != 0) {
        throw (SYSTEM_EXCEPTION(SCIDB_SE_INTERNAL, SCIDB_LE_SYSCALL_ERROR)
               << "boost::asio::expires_from_now" << rc << rc << _MPI_LAUNCHER_KILL_TIMEOUT);
    }
    _killTimer->async_wait(boost::bind(&MpiLauncher::handleKillTimeout,
                                      shared_from_this(), _killTimer,
                                      boost::asio::placeholders::error));
}

bool MpiLauncher::waitForExit(pid_t pid, int *status, bool noWait)
{
    int opts = 0;
    if (noWait) {
        opts = WNOHANG;
    }
    while(true) {

        pid_t rc = ::waitpid(pid,status,opts);

        if ((rc == -1) && (errno==EINTR)) {
            continue;
        }
        if (rc == 0 && noWait) {
            return false;
        }
        if ((rc <= 0) || (rc != pid)) {
            int err = errno;
            throw (SYSTEM_EXCEPTION(SCIDB_SE_INTERNAL, SCIDB_LE_SYSCALL_ERROR)
                   << "wait" << rc << err << pid);
        }
        return true;
    }
    throw SYSTEM_EXCEPTION(SCIDB_SE_INTERNAL, SCIDB_LE_UNREACHABLE_CODE);
    return false;
}

} //namespace
Example #17
0
namespace scidb {
static const bool DBG = false;
static log4cxx::LoggerPtr logger(log4cxx::Logger::getLogger("scidb.query.ops.mpi"));

///
/// Some operators may not be able to work in degraded mode while they are being implemented;
/// this call makes them fail if that is the case.
/// TODO: add a more explicit message about what is happening
void throwIfDegradedMode(shared_ptr<Query>& query) {
    const boost::shared_ptr<const InstanceMembership> membership =
    Cluster::getInstance()->getInstanceMembership();
    if ((membership->getViewId() != query->getCoordinatorLiveness()->getViewId()) ||
        (membership->getInstances().size() != query->getInstancesCount())) {
        // because we can't yet handle the extra data from
        // replicas that we would be fed in "degraded mode"
        throw USER_EXCEPTION(SCIDB_SE_EXECUTION, SCIDB_LE_NO_QUORUM2);
    }
}

void MPIPhysical::setQuery(const boost::shared_ptr<Query>& query)
{
    boost::shared_ptr<Query> myQuery = _query.lock();
    if (myQuery) {
        assert(query==myQuery);
        assert(_ctx);
        return;
    }
    PhysicalOperator::setQuery(query);
    _ctx = boost::shared_ptr<MpiOperatorContext>(new MpiOperatorContext(query));
    _ctx = MpiManager::getInstance()->checkAndSetCtx(query,_ctx);
}

void MPIPhysical::postSingleExecute(shared_ptr<Query> query)
{
    // On a non-participating launcher instance it is difficult
    // to determine when the launch is complete without a sync point.
    // postSingleExecute() is run after all instances report success of their execute() phase,
    // which is effectively such a sync point.
    assert(query->getCoordinatorID() == COORDINATOR_INSTANCE);
    assert(_mustLaunch);
    assert(_ctx);
    const uint64_t lastIdInUse = _ctx->getLastLaunchIdInUse();

    boost::shared_ptr<MpiLauncher> launcher(_ctx->getLauncher(lastIdInUse));
    assert(launcher);
    if (launcher && launcher == _launcher) {
        LOG4CXX_DEBUG(logger, "MPIPhysical::postSingleExecute: destroying last launcher for launch = " << lastIdInUse);
        assert(lastIdInUse == _launchId);

        launcher->destroy();
        _launcher.reset();
    }
    _ctx.reset();
}

bool MPIPhysical::launchMPISlaves(shared_ptr<Query>& query, const size_t maxSlaves)
{
    LOG4CXX_DEBUG(logger, "MPIPhysical::launchMPISlaves(query, maxSlaves: " << maxSlaves << ") called.");

    assert(maxSlaves <= query->getInstancesCount());

    // This barrier guarantees MPIPhysical::setQuery is called on all instances
    // before any slaves are launched.
    // It also makes sure a non-participating launcher waits for the current launch to finish before starting a new one.
    syncBarrier(0, query);
    syncBarrier(1, query);

    _launchId = _ctx->getNextLaunchId(); // bump the launch ID by 1

    Cluster* cluster = Cluster::getInstance();
    const boost::shared_ptr<const InstanceMembership> membership = cluster->getInstanceMembership();
    const string& installPath = MpiManager::getInstallPath(membership);

    uint64_t lastIdInUse = _ctx->getLastLaunchIdInUse();
    assert(lastIdInUse < _launchId);

    boost::shared_ptr<MpiSlaveProxy> slave;

    // check if our logical ID is within the set of instances that will have a corresponding slave
    InstanceID iID = query->getInstanceID();
    if ( iID < maxSlaves) {
        slave = boost::make_shared<MpiSlaveProxy>(_launchId, query, installPath);
        _ctx->setSlave(slave);
    }

    _mustLaunch = (query->getCoordinatorID() == COORDINATOR_INSTANCE);
    if (_mustLaunch) {

        boost::shared_ptr<MpiLauncher> oldLauncher = _ctx->getLauncher(lastIdInUse);
        if (oldLauncher) {
            assert(lastIdInUse == oldLauncher->getLaunchId());
            LOG4CXX_DEBUG(logger, "MPIPhysical::launchMPISlaves(): destroying last launcher for launch = " << lastIdInUse);
            oldLauncher->destroy();
            oldLauncher.reset();
        }
        _launcher = boost::shared_ptr<MpiLauncher>(MpiManager::getInstance()->newMPILauncher(_launchId, query));
        _ctx->setLauncher(_launcher);
        std::vector<std::string> args;
        _launcher->launch(args, membership, maxSlaves);
    }

    if ( iID < maxSlaves) {
        assert(slave);

        //-------------------- Get the handshake
        LOG4CXX_DEBUG(logger, "MPIPhysical::launchMPISlaves(): slave->waitForHandshake() 1 called.");
        slave->waitForHandshake(_ctx);
        LOG4CXX_DEBUG(logger, "MPIPhysical::launchMPISlaves(): slave->waitForHandshake() 1 returned.");
    }

    if ( iID < maxSlaves || _mustLaunch) {
        // After the handshake the old slave must be gone
        LOG4CXX_DEBUG(logger, "MPIPhysical::launchMPISlaves():"
                      << " lastLaunchIdInUse=" << lastIdInUse
                      << " launchId=" << _launchId);


        boost::shared_ptr<MpiSlaveProxy> oldSlave = _ctx->getSlave(lastIdInUse);
        if (oldSlave) {
            assert(lastIdInUse == oldSlave->getLaunchId());
            LOG4CXX_DEBUG(logger, "MPIPhysical::launchMPISlaves(): oldSlave->destroy() & .reset()");
            oldSlave->destroy();
            oldSlave.reset();
        }
        _ctx->complete(lastIdInUse);
    }

    if ( iID < maxSlaves) {

        _ipcName = mpi::getIpcName(installPath, cluster->getUuid(), query->getQueryID(),
                                   cluster->getLocalInstanceId(), _launchId);

        LOG4CXX_DEBUG(logger, "MPIPhysical::launchMPISlaves(): instance " << iID << " slave started.");
        return true;
    } else {
        LOG4CXX_DEBUG(logger, "MPIPhysical::launchMPISlaves(): instance " << iID << " slave bypass.");
        return false;
    }
}

// XXX TODO: consider returning std::vector<scidb::SharedMemoryPtr>
// XXX TODO: which would require supporting different types of memory (double, char etc.)
std::vector<MPIPhysical::SMIptr_t> MPIPhysical::allocateMPISharedMemory(size_t numBufs,
                                                                        size_t elemSizes[],
                                                                        size_t numElems[],
                                                                        string dbgNames[])
{
    LOG4CXX_DEBUG(logger, "MPIPhysical::allocateMPISharedMemory(numBufs "<<numBufs<<",,,)");

    if(logger->isTraceEnabled()) {
        LOG4CXX_TRACE(logger, "MPIPhysical::allocateMPISharedMemory(): allocations are: ");
        for(size_t ii=0; ii< numBufs; ii++) {
            LOG4CXX_TRACE(logger, "MPIPhysical::allocateMPISharedMemory():"
                                   << " elemSizes["<<ii<<"] "<< dbgNames[ii] << " len " << numElems[ii]);
        }
    }

    std::vector<SMIptr_t> shmIpc(numBufs);
    bool preallocate = Config::getInstance()->getOption<bool>(CONFIG_PREALLOCATE_SHM);
    for(size_t ii=0; ii<numBufs; ii++) {
        std::stringstream suffix;
        suffix << "." << ii ;
        std::string ipcNameFull= _ipcName + suffix.str();
        LOG4CXX_TRACE(logger, "IPC name = " << ipcNameFull);
        shmIpc[ii] = SMIptr_t(mpi::newSharedMemoryIpc(ipcNameFull, preallocate)); // can I get 'em off ctx instead?
        _ctx->addSharedMemoryIpc(_launchId, shmIpc[ii]);

        char* ptr = MpiLauncher::initIpcForWrite(shmIpc[ii].get(), (elemSizes[ii] * numElems[ii]));
        assert(ptr); ptr=ptr;
    }
    return shmIpc;
}
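// A minimal usage sketch (buffer count, element sizes, and names are illustrative only):
//   size_t elemSizes[] = { sizeof(double), sizeof(double) };
//   size_t numElems[]  = { nRows * nCols, nRows };
//   std::string names[] = { "matrix", "vector" };
//   std::vector<SMIptr_t> bufs = allocateMPISharedMemory(2, elemSizes, numElems, names);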

void MPIPhysical::releaseMPISharedMemoryInputs(std::vector<MPIPhysical::SMIptr_t>& shmIpc, size_t resultIpcIndx)
{
    for(size_t i=0; i<shmIpc.size(); i++) {
        if (!shmIpc[i]) {
            continue;
        }
        SharedMemoryIpc *ipc = shmIpc[i].get();
        ipc->close();

        if (i!=resultIpcIndx) {
            ipc->unmap();
        }
        if (!ipc->remove()) {
            throw (SYSTEM_EXCEPTION(SCIDB_SE_INTERNAL, SCIDB_LE_OPERATION_FAILED) << "shared_memory_remove");
        }
    }
}

} // namespace
Example #18
0
namespace scidb
{

static log4cxx::LoggerPtr logger(log4cxx::Logger::getLogger("scidb.mpi"));

static double getTimeInSecs()
{
    struct timespec ts;
    if (clock_gettime(CLOCK_REALTIME, &ts) == -1) {
        assert(false);
        throw SYSTEM_EXCEPTION(SCIDB_SE_INTERNAL, SCIDB_LE_CANT_GET_SYSTEM_TIME);
    }
    return (ts.tv_sec + ts.tv_nsec*1e-9);
}

static bool checkForTimeout(double startTime, double timeout,
                            uint64_t launchId, MpiOperatorContext* ctx)
{
    if ((getTimeInSecs() - startTime) > timeout) {
        throw SYSTEM_EXCEPTION(SCIDB_SE_INTERNAL, SCIDB_LE_OPERATION_FAILED)
            << "MPI slave process failed to communicate in time";
    }
    return true;
}

static bool checkLauncher(double startTime, double timeout,
                          uint64_t launchId, MpiOperatorContext* ctx)
{
    boost::shared_ptr<MpiLauncher> launcher(ctx->getLauncher(launchId));
    if (launcher && !launcher->isRunning()) {
        throw SYSTEM_EXCEPTION(SCIDB_SE_INTERNAL, SCIDB_LE_OPERATION_FAILED)
            << "MPI launcher process";
    }
    checkForTimeout(startTime, timeout, launchId, ctx);

    return true;
}

void MpiSlaveProxy::waitForHandshake(boost::shared_ptr<MpiOperatorContext>& ctx)
{
    if (_connection) {
        throw (InvalidStateException(REL_FILE, __FUNCTION__, __LINE__)
               << "Connection to MPI slave already established");
    }

    MpiOperatorContext::LaunchErrorChecker errChecker =
        boost::bind(&checkLauncher, getTimeInSecs(),
                    static_cast<double>(_MPI_SLAVE_RESPONSE_TIMEOUT), _1, _2);
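    // checkLauncher() doubles as the error checker while popMsg() waits below, so a
    // launcher that has already died (or a response timeout) aborts the wait for the
    // handshake instead of hanging.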
    boost::shared_ptr<scidb::ClientMessageDescription> msg = ctx->popMsg(_launchId, errChecker);
    assert(msg);

    _connection = msg->getClientContext();

    if (msg->getMessageType() != scidb::mtMpiSlaveHandshake) {
        throw (SYSTEM_EXCEPTION(SCIDB_SE_INTERNAL, SCIDB_LE_UNKNOWN_ERROR)
            << "MPI slave handshake is invalid");
    }

    boost::shared_ptr<scidb_msg::MpiSlaveHandshake> handshake =
        boost::dynamic_pointer_cast<scidb_msg::MpiSlaveHandshake>(msg->getRecord());
    assert(handshake);

    // parse the handshake
    if (!handshake->has_pid()) {
        throw (SYSTEM_EXCEPTION(SCIDB_SE_INTERNAL, SCIDB_LE_UNKNOWN_ERROR)
            << "MPI slave handshake has no PID");
    }
    if (!handshake->has_ppid()) {
        throw (SYSTEM_EXCEPTION(SCIDB_SE_INTERNAL, SCIDB_LE_UNKNOWN_ERROR)
               << "MPI slave handshake has no PPID");
    }

    _pids.push_back(handshake->pid());
    _pids.push_back(handshake->ppid());

    string clusterUuid = Cluster::getInstance()->getUuid();

    if (handshake->cluster_uuid() != clusterUuid) {
        throw (SYSTEM_EXCEPTION(SCIDB_SE_INTERNAL, SCIDB_LE_UNKNOWN_ERROR)
               << "MPI slave handshake has invalid clusterUuid");
    }
    InstanceID instanceId = Cluster::getInstance()->getLocalInstanceId();

    if (handshake->instance_id() != instanceId) {
        throw (SYSTEM_EXCEPTION(SCIDB_SE_INTERNAL, SCIDB_LE_UNKNOWN_ERROR)
               << "MPI slave handshake has invalid instanceId");
    }

    if (handshake->launch_id() != _launchId) {
        throw (SYSTEM_EXCEPTION(SCIDB_SE_INTERNAL, SCIDB_LE_UNKNOWN_ERROR)
               << "MPI slave handshake has invalid launchId");
    }

    boost::shared_ptr<scidb::Query> query( _query.lock());
    Query::validateQueryPtr(query);

    if (handshake->rank() != query->getInstanceID()) { // logical instance ID
        throw (SYSTEM_EXCEPTION(SCIDB_SE_INTERNAL, SCIDB_LE_UNKNOWN_ERROR)
               << "MPI slave handshake has invalid rank");
    }

    ClientContext::DisconnectHandler dh =
        boost::bind(&MpiMessageHandler::handleMpiSlaveDisconnect, _launchId, _1);
    _connection->attachQuery(query->getQueryID(), dh);

    LOG4CXX_DEBUG(logger, "MpiSlaveProxy::waitForHandshake: handshake: "
                  <<" pid="<<handshake->pid()
                  <<", ppid="<<handshake->ppid()
                  <<", cluster_uuid="<<handshake->cluster_uuid()
                  <<", instance_id="<<handshake->instance_id()
                  <<", launch_id="<<handshake->launch_id()
                  <<", rank="<<handshake->rank());
}

void MpiSlaveProxy::sendCommand(mpi::Command& cmd, boost::shared_ptr<MpiOperatorContext>& ctx)
{
    if (!_connection) {
        throw (InvalidStateException(REL_FILE, __FUNCTION__, __LINE__)
               << "No connection to MPI slave");
    }

    // set command
    boost::shared_ptr<scidb_msg::MpiSlaveCommand> cmdPtr(new scidb_msg::MpiSlaveCommand());
    cmdPtr->set_command(cmd.getCmd());

    // set args
    const std::vector<std::string>& args = cmd.getArgs();
    google::protobuf::RepeatedPtrField<std::string>* msgArgs = cmdPtr->mutable_args();
    assert(msgArgs);
    msgArgs->Reserve(args.size());
    for (std::vector<std::string>::const_iterator iter = args.begin(); iter != args.end(); ++iter) {
        cmdPtr->add_args(*iter);
    }
    // send to slave
    scidb::MessagePtr msgPtr(cmdPtr);
    boost::asio::const_buffer binary(NULL,0);
    try {
        scidb::sendAsync(_connection, scidb::mtMpiSlaveCommand, msgPtr, binary);

    } catch (const scidb::SystemException& e) {
        LOG4CXX_ERROR(logger, "MpiSlaveProxy::sendCommand: "
                      << "FAILED to send MpiSlaveCommand to slave because: "
                      << e.what());
        throw;
    }
    LOG4CXX_DEBUG(logger, "MpiSlaveProxy::sendCommand: MpiSlaveCommand "
                  << cmd.toString() << " sent to slave");
}

/// @todo XXX TODO tigor: make it timeout ? TBD
int64_t MpiSlaveProxy::waitForStatus(boost::shared_ptr<MpiOperatorContext>& ctx, bool raise)
{
    if (!_connection) {
        throw (InvalidStateException(REL_FILE, __FUNCTION__, __LINE__)
               << "No connection to MPI slave");
    }

    MpiOperatorContext::LaunchErrorChecker noopChecker;
    boost::shared_ptr<scidb::ClientMessageDescription> msg = ctx->popMsg(_launchId, noopChecker);
    assert(msg);

    LOG4CXX_DEBUG(logger, "MpiSlaveProxy::waitForStatus: message from client: "
                  <<" ctx = " << msg->getClientContext().get()
                  <<", msg type = "<< msg->getMessageType()
                  <<", queryID = "<<msg->getQueryId());

    if (_connection != msg->getClientContext()) {
        if (!msg->getClientContext() &&
            msg->getMessageType() == scidb::SYSTEM_NONE_MSG_ID) {
            throw (SYSTEM_EXCEPTION(SCIDB_SE_INTERNAL, SCIDB_LE_UNKNOWN_ERROR)
                   << "MPI slave disconnected prematurely");
        }
        throw (SYSTEM_EXCEPTION(SCIDB_SE_INTERNAL, SCIDB_LE_UNKNOWN_ERROR)
               << "MPI slave connection context mismatch");
    }

    if (msg->getMessageType() != scidb::mtMpiSlaveResult) {
        throw (SYSTEM_EXCEPTION(SCIDB_SE_INTERNAL, SCIDB_LE_UNKNOWN_ERROR)
               << "MPI slave returned invalid status");
    }

    boost::shared_ptr<scidb_msg::MpiSlaveResult> result =
        boost::dynamic_pointer_cast<scidb_msg::MpiSlaveResult>(msg->getRecord());
    assert(result);

    if (!result->has_status()) {
        throw (SYSTEM_EXCEPTION(SCIDB_SE_INTERNAL, SCIDB_LE_UNKNOWN_ERROR)
               << "MPI slave returned no status");
    }

    if (raise && result->status() != 0) {
        throw (SYSTEM_EXCEPTION(SCIDB_SE_OPERATOR, SCIDB_LE_OPERATION_FAILED)
               << result->status());
    }
    return result->status();
}

void MpiSlaveProxy::waitForExit(boost::shared_ptr<MpiOperatorContext>& ctx)
{
    if (!_connection) {
        throw (InvalidStateException(REL_FILE, __FUNCTION__, __LINE__)
               << "No connection to MPI slave");
    }

    MpiOperatorContext::LaunchErrorChecker timeChecker =
       boost::bind(&checkForTimeout, getTimeInSecs(),
                   static_cast<double>(_MPI_SLAVE_RESPONSE_TIMEOUT), _1, _2);
    boost::shared_ptr<scidb::ClientMessageDescription> msg = ctx->popMsg(_launchId, timeChecker);
    assert(msg);

    LOG4CXX_DEBUG(logger, "MpiSlaveProxy::waitForExit: "
                  <<" ctx = " << msg->getClientContext().get()
                  <<", msg type = "<< msg->getMessageType()
                  <<", queryID = "<<msg->getQueryId());

    if (msg->getMessageType() != scidb::SYSTEM_NONE_MSG_ID) {
        throw (SYSTEM_EXCEPTION(SCIDB_SE_INTERNAL, SCIDB_LE_UNKNOWN_ERROR)
               << "MPI slave returned invalid status");
    }
    assert(!msg->getClientContext());
    _connection.reset();
}

void MpiSlaveProxy::destroy(bool error)
{
    if (error) {
        _inError=true;
    }
    // kill the slave proc and its parent orted
    for ( std::vector<pid_t>::const_iterator iter=_pids.begin();
          iter!=_pids.end(); ++iter) {
        pid_t pid = *iter;

        //XXX TODO tigor: kill process group (-pid) ?
        MpiErrorHandler::killProc(_installPath, pid);
    }

    // rm pid file
    std::string pidFile = mpi::getSlavePidFile(_installPath, _queryId, _launchId);
    scidb::File::remove(pidFile.c_str(), false);

    // rm log file
    if (!logger->isTraceEnabled() && !_inError) {
        string logFileName = mpi::getSlaveLogFile(_installPath, _queryId, _launchId);
        scidb::File::remove(logFileName.c_str(), false);
    }
}

} //namespace
Example #19
0
namespace scidb {

static log4cxx::LoggerPtr logger(log4cxx::Logger::getLogger("scidb.linear_algebra.ops.scalapack"));

inline bool hasSingleAttribute(ArrayDesc const& desc)
{
    return desc.getAttributes().size() == 1 || (desc.getAttributes().size() == 2 && desc.getAttributes()[1].isEmptyIndicator());
}

void checkScaLAPACKInputs(std::vector<ArrayDesc> schemas, boost::shared_ptr<Query> query,
                          size_t nMatsMin, size_t nMatsMax)
{
    enum dummy  {ROW=0, COL=1};
    enum dummy2 { ATTR0=0 };

    const size_t NUM_MATRICES = schemas.size();


    if(schemas.size() < nMatsMin ||
       schemas.size() > nMatsMax) {
        throw PLUGIN_USER_EXCEPTION(DLANameSpace, SCIDB_SE_INFER_SCHEMA, DLA_ERROR2);
    }

    // Check the properties first by argument, then in the order the property is determined in the AFL statement:
    // size, chunkSize, overlap.
    // Check individual properties in the loop, and any inter-matrix properties after the loop.
    // TODO: in all of these, name the argument # at fault
    for(size_t iArray=0; iArray < NUM_MATRICES; iArray++) {

        // check: attribute count == 1
        if (!hasSingleAttribute(schemas[iArray])) {
            throw PLUGIN_USER_EXCEPTION(DLANameSpace, SCIDB_SE_INFER_SCHEMA, DLA_ERROR2);
            // TODO: offending matrix is iArray
        }

        // check: attribute type is double
        if (schemas[iArray].getAttributes()[ATTR0].getType() != TID_DOUBLE) {
            throw PLUGIN_USER_EXCEPTION(DLANameSpace, SCIDB_SE_INFER_SCHEMA, DLA_ERROR5);
            // TODO: offending matrix is iArray
        }

        // check: nDim == 2 (a matrix)
        // TODO: relax nDim to be 1 and have it imply NCOL=1 (a column vector).
        //       If a row vector is wanted, we could make transpose accept the column vector and output a 1 x N matrix
        //       and call that a "row vector".  The other way around could never be acceptable.
        //
        const size_t SCALAPACK_IS_2D = 2 ;
        if (schemas[iArray].getDimensions().size() != SCALAPACK_IS_2D) {
            throw PLUGIN_USER_EXCEPTION(DLANameSpace, SCIDB_SE_INFER_SCHEMA, DLA_ERROR3);
            // TODO: offending matrix is iArray
        }

        // check: size is bounded
        const Dimensions& dims = schemas[iArray].getDimensions();
        if (dims[ROW].getLength() == INFINITE_LENGTH ||
            dims[COL].getLength() == INFINITE_LENGTH) {
            throw PLUGIN_USER_EXCEPTION(DLANameSpace, SCIDB_SE_INFER_SCHEMA, DLA_ERROR9);
        }
        // TODO: check: sizes are not larger than largest ScaLAPACK fortran INTEGER

        // TEMPORARY until #2202 defines how to interpret arrays not starting at 0
        // "dimensions must start at 0"
        for(unsigned dim =ROW; dim <= COL; dim++) {
            if(dims[dim].getStart() != 0) {
                throw PLUGIN_USER_EXCEPTION(DLANameSpace, SCIDB_SE_INFER_SCHEMA, DLA_ERROR44);
            }
        }

        // check: chunk interval not too small
        if (dims[ROW].getChunkInterval() < slpp::SCALAPACK_MIN_BLOCK_SIZE ||
            dims[COL].getChunkInterval() < slpp::SCALAPACK_MIN_BLOCK_SIZE ) {
            // the cache will thrash and performance will be inexplicably horrible to the user
            throw PLUGIN_USER_EXCEPTION(DLANameSpace, SCIDB_SE_INFER_SCHEMA, DLA_ERROR41); // too small
        }


        // check: chunk interval not too large
        if (dims[ROW].getChunkInterval() > slpp::SCALAPACK_MAX_BLOCK_SIZE ||
            dims[COL].getChunkInterval() > slpp::SCALAPACK_MAX_BLOCK_SIZE ) {
            // the cache will thrash and performance will be inexplicably horrible to the user
            throw PLUGIN_USER_EXCEPTION(DLANameSpace, SCIDB_SE_INFER_SCHEMA, DLA_ERROR42); // too large
        }


        // TODO: the following does not work correctly.  postWarning() itself uses SCIDB_WARNING,
        //       which does not work correctly from a plugin, so we are seeking an example of how
        //       to call postWarning() from a plugin.
        if (false) {
            // broken code inside postWarning(SCIDB_WARNING()) faults and needs a different argument.
            for(size_t d = ROW; d <= COL; d++) {
                if(dims[d].getChunkInterval() != slpp::SCALAPACK_EFFICIENT_BLOCK_SIZE) {
                    query->postWarning(SCIDB_WARNING(DLA_WARNING4) << slpp::SCALAPACK_EFFICIENT_BLOCK_SIZE
                                                                   << slpp::SCALAPACK_EFFICIENT_BLOCK_SIZE);
                }
            }
        }

        // check: no overlap allowed
        //        TODO: improvement? if there's overlap, we may be able to ignore it,
        //              else invoke a common piece of code to remove it
        //              and in both cases emit a warning about non-optimality
        if (dims[ROW].getChunkOverlap()!=0 ||
            dims[COL].getChunkOverlap()!=0) {
            stringstream ss;
            ss<<"in matrix "<<iArray;
            throw (PLUGIN_USER_EXCEPTION(DLANameSpace, SCIDB_SE_INFER_SCHEMA, DLA_ERROR40)
                   << ss.str());
        }
    }

    // check: the chunkSizes from the user must be identical (until auto-repart is working)
    const bool AUTO_REPART_WORKING = false ;  // #2032
    if( ! AUTO_REPART_WORKING ) {
        int64_t commonChunkSize = schemas[0].getDimensions()[ROW].getChunkInterval();
        // TODO: remove these checks if #2023 is fixed and requiresRepart() is functioning correctly
        for(size_t iArray=0; iArray < NUM_MATRICES; iArray++) {
            const Dimensions& dims = schemas[iArray].getDimensions();
            // arbitrarily take first mentioned chunksize as the one for all to share
            if (dims[ROW].getChunkInterval() != commonChunkSize ||
                dims[COL].getChunkInterval() != commonChunkSize ) {
                throw PLUGIN_USER_EXCEPTION(DLANameSpace, SCIDB_SE_INFER_SCHEMA, DLA_ERROR10);
                // TODO: name the matrix
            }
        }
    }

    // Chunksize matching critique
    //    This is not what we want it to be, but has to be until #2023 is fixed, which
    //    will allow the query planner and optimizer to repartition automatically, instead
    //    of putting the burden on the user.
    //
    //    (1) The required restriction to make ScaLAPACK work is that they are equal
    //    in both dimensions (square chunks) and equal for all matrices.
    //    (2) Legal values are in a range, expressed by SCALAPACK_{MIN,MAX}_BLOCK_SIZE
    //    (3) So what do we do if the chunksize is not optimal?  Can we go ahead and compute
    //    the answer if the matrix is below a size where it will really matter?
    //    Can we fix query->postWarning to warn in that case?
    //    (4) If the user gives inputs that match, and don't need a repart, we can proceed.
    //    (5) Else we will have to add reparts for the user [not implemented]
    //    Should we repart some of them to another size?  Or should we repart all of them
    //    to the optimal size?  Unfortunately, we don't have the information we would need
    //    to make an intelligent choice ...
    //    Due to the api of LogicalOperator::requiresRepart() we can't tell which situation
    //    it is, because it still functions on the first input only.
    //
    // TODO: after #2032 is fixed, have James fix note(4) above.
    //
}
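// Illustration of the common-chunk-size rule above (the chunk intervals are hypothetical):
// two input matrices chunked 64x64 and 64x64 pass, while 64x64 together with 128x128
// is rejected with DLA_ERROR10 until automatic repartitioning (see #2023/#2032 above) lands.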

// PGB: the requirement on names is that until such a time as we have syntax to disambiguate them by dimension index
//      or other means, they must be distinct, else if stored, we will lose access to any but the first.
// JHM: in math, it's annoying to have the names keep getting longer for the same thing.  So we only want to do
//      the appending of _? when required.

std::pair<string, string> ScaLAPACKDistinctDimensionNames(const string& a, const string& b)
{
    typedef std::pair<string, string> result_t ;
    result_t result;

    if (a != b) {
        // for algebra, avoid the renames when possible
        return result_t(a,b);
    } else {
        // fall back to appending _1 or _2 to both... we would rather do it to just one,
        // but this is the only convention we have for conflicting names in general.
        return result_t(a + "_1", b + "_2");
    }
}
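// A minimal usage sketch (dimension names chosen for illustration):
//   ScaLAPACKDistinctDimensionNames("i", "j");  // -> ("i", "j"): already distinct, no rename
//   ScaLAPACKDistinctDimensionNames("i", "i");  // -> ("i_1", "i_2"): conflict resolved by suffixes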

void log4cxx_debug_dimensions(const std::string& prefix, const Dimensions& dims)
{
    if(logger->isDebugEnabled()) {
        for (size_t i=0; i<dims.size(); i++) {
            LOG4CXX_DEBUG(logger, prefix << " dims["<<i<<"] from " << dims[i].getStartMin() << " to " << dims[i].getEndMax());
        }
    }
}

} // namespace
Example #20
0
void
PhysicalQueryPlanNode::supplantChild(const PhysNodePtr& targetChild,
                                     const PhysNodePtr& newChild)
{
    assert(newChild);
    assert(targetChild);
    assert(newChild.get() != this);
    int removed = 0;
    std::vector<PhysNodePtr> newChildren;

    if (logger->isTraceEnabled()) {
        std::ostringstream os;
        os << "Supplanting targetChild Node:\n";
        targetChild->toString(os, 0 /*indent*/,false /*children*/);
        os << "\nwith\n";
        newChild->toString(os, 0 /*indent*/,false /*children*/);
        LOG4CXX_TRACE(logger, os.str());
    }

    for(auto &child : _childNodes) {
        if (child != targetChild) {
            newChildren.push_back(child);
        }
        else {
            // Set the parent of the newChild to this node.
            newChild->_parent = shared_from_this();

            // NOTE: Any existing children of the newChild are removed from the
            // Query Plan.
            if ((newChild->_childNodes).size() > 0) {
                LOG4CXX_INFO(logger,
                             "Child nodes of supplanting node are being removed from the tree.");
            }

            // Re-parent the children of the targetChild to the newChild
            newChild->_childNodes.swap(targetChild->_childNodes);
            for (auto grandchild : newChild -> _childNodes) {
                assert(grandchild != newChild);
                grandchild->_parent = newChild;
            }

            // Remove any references to the children from the targetChild
            targetChild->_childNodes.clear();
            targetChild->resetParent();

            // Add the newChild to this node
            newChildren.push_back(newChild);
            ++removed;
        }
    }
    _childNodes.swap(newChildren);

    if (logger->isTraceEnabled()) {
        std::ostringstream os;
        newChild->toString(os);
        LOG4CXX_TRACE(logger, "New Node subplan:\n"
                      << os.str());
    }

    SCIDB_ASSERT(removed==1);
}
Example #21
0
namespace scidb
{

// Logger for the query processor. Static to prevent visibility of the variable outside of this file.
static log4cxx::LoggerPtr logger(log4cxx::Logger::getLogger("scidb.qproc.processor"));

// LogicalQueryPlanNode
LogicalQueryPlanNode::LogicalQueryPlanNode(
	const std::shared_ptr<ParsingContext>& parsingContext,
	const std::shared_ptr<LogicalOperator>& logicalOperator):
	_logicalOperator(logicalOperator),
	_parsingContext(parsingContext)
{

}

LogicalQueryPlanNode::LogicalQueryPlanNode(
	const std::shared_ptr<ParsingContext>& parsingContext,
	const std::shared_ptr<LogicalOperator>& logicalOperator,
	const std::vector<std::shared_ptr<LogicalQueryPlanNode> > &childNodes):
	_logicalOperator(logicalOperator),
	_childNodes(childNodes),
	_parsingContext(parsingContext)
{
}

const ArrayDesc& LogicalQueryPlanNode::inferTypes(std::shared_ptr< Query> query)
{
    std::vector<ArrayDesc> inputSchemas;
    ArrayDesc outputSchema;
    for (size_t i=0, end=_childNodes.size(); i<end; i++)
    {
        inputSchemas.push_back(_childNodes[i]->inferTypes(query));
    }
    outputSchema = _logicalOperator->inferSchema(inputSchemas, query);
    //FIXME: Maybe wrap the inferSchema method with another one and assign the alias there?
    if (!_logicalOperator->getAliasName().empty())
    {
        outputSchema.addAlias(_logicalOperator->getAliasName());
    }
    _logicalOperator->setSchema(outputSchema);
    LOG4CXX_DEBUG(logger, "Inferred schema for operator " <<
                  _logicalOperator->getLogicalName() << ": " << outputSchema);
    return _logicalOperator->getSchema();
}

void LogicalQueryPlanNode::inferArrayAccess(std::shared_ptr<Query>& query)
{
    //XXX TODO: consider non-recursive implementation
    for (size_t i=0, end=_childNodes.size(); i<end; i++)
    {
        _childNodes[i]->inferArrayAccess(query);
    }
    assert(_logicalOperator);
    _logicalOperator->inferArrayAccess(query);
}

std::string LogicalQueryPlanNode::inferPermissions(std::shared_ptr<Query>& query)
{
    std::stringstream ss;

    // Consider non-recursive implementation
    for (size_t i=0, end=_childNodes.size(); i<end; i++)
    {
        ss << _childNodes[i]->inferPermissions(query);
    }
    assert(_logicalOperator);
    ss << _logicalOperator->inferPermissions(query);
    std::string permissions = ss.str();

    // Remove duplicates
    std::map<char, int> hashMap;

    std::string::const_iterator itStr;
    for (   itStr = permissions.begin();
            itStr != permissions.end();
            ++itStr)
    {
        // Add/overwrite the node in the hashMap
        hashMap[*itStr] = 1;
    }

    std::map<char, int>::const_iterator itHashMap;
    std::string response;
    for (   itHashMap = hashMap.begin();
            itHashMap != hashMap.end();
            ++itHashMap)
    {
        // Append each character from the hashMap
        response += itHashMap->first;
    }

    return response;
}
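// Note: the map<char,int> above acts purely as an ordered set of permission characters;
// an equivalent dedup-and-sort step could be sketched (not the shipped code) as:
//   std::set<char> uniq(permissions.begin(), permissions.end());
//   return std::string(uniq.begin(), uniq.end());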

void LogicalQueryPlanNode::toString(std::ostream &out, int indent, bool children) const
{
    Indent prefix(indent);
    out << prefix('>', false);
    out << "[lInstance] children "<<_childNodes.size()<<"\n";
    _logicalOperator->toString(out,indent+1);

    if (children) {
        for (size_t i = 0; i< _childNodes.size(); i++)
        {
            _childNodes[i]->toString(out, indent+1);
        }
    }
}

PhysicalQueryPlanNode::PhysicalQueryPlanNode(const std::shared_ptr<PhysicalOperator>& physicalOperator,
                                             bool ddl, bool tile)
: _physicalOperator(physicalOperator),
  _parent(), _ddl(ddl), _tile(tile), _isSgMovable(true), _isSgOffsetable(true), _distribution()
{
}

PhysicalQueryPlanNode::PhysicalQueryPlanNode(const std::shared_ptr<PhysicalOperator>& physicalOperator,
		const std::vector<std::shared_ptr<PhysicalQueryPlanNode> > &childNodes,
                                             bool ddl, bool tile):
	_physicalOperator(physicalOperator),
	_childNodes(childNodes),
    _parent(), _ddl(ddl), _tile(tile), _isSgMovable(true), _isSgOffsetable(true), _distribution()
{
}

void PhysicalQueryPlanNode::toString(std::ostream &out, int indent, bool children) const
{
    Indent prefix(indent);
    out << prefix('>', false);

    out<<"[pNode] "<<_physicalOperator->getPhysicalName()<<" ddl "<<isDdl()<<" tile "<<supportsTileMode()<<" children "<<_childNodes.size()<<"\n";
    _physicalOperator->toString(out,indent+1);

    if (children) {
        out << prefix(' ');
        out << "output full chunks: ";
        out << (outputFullChunks() ? "yes" : "no");
        out << "\n";
        out << prefix(' ');
        out << "changes dstribution: ";
        out << (changesDistribution() ? "yes" : "no");
        out << "\n";
    }

    out << prefix(' ');
    out<<"props sgm "<<_isSgMovable<<" sgo "<<_isSgOffsetable<<"\n";
    out << prefix(' ');
    out<<"diout "<<_distribution<<"\n";
    const ArrayDesc& schema = _physicalOperator->getSchema();
    out << prefix(' ');
    out<<"bound "<<_boundaries
      <<" cells "<<_boundaries.getNumCells();

    if (_boundaries.getStartCoords().size() == schema.getDimensions().size()) {
        out  << " chunks ";
        try {
            uint64_t n = _boundaries.getNumChunks(schema.getDimensions());
            out << n;
        } catch (PhysicalBoundaries::UnknownChunkIntervalException&) {
            out << '?';
        }
        out << " est_bytes " << _boundaries.getSizeEstimateBytes(schema)
            << '\n';
    }
    else {
        out <<" [improperly initialized]\n";
    }

    if (children) {
        for (size_t i = 0; i< _childNodes.size(); i++) {
            _childNodes[i]->toString(out, indent+1);
        }
    }
}

bool PhysicalQueryPlanNode::isStoringSg() const
{
    if ( isSgNode() ) {
        return (!getSgArrayName(_physicalOperator->getParameters()).empty());
    }
    return false;
}

string PhysicalQueryPlanNode::getSgArrayName(const PhysicalOperator::Parameters& sgParameters)
{
    std::string arrayName;
    if (sgParameters.size() >= 3) {
        arrayName = static_cast<OperatorParamReference*>(sgParameters[2].get())->getObjectName();
    }
    return arrayName;
}

bool PhysicalQueryPlanNode::getRedimensionIsStrict(const PhysicalOperator::Parameters& redimParameters)
{
    bool isStrict = true;
    if (redimParameters.size() == 2 &&
        redimParameters[1]->getParamType() == scidb::PARAM_PHYSICAL_EXPRESSION) {
        OperatorParamPhysicalExpression* paramExpr = static_cast<OperatorParamPhysicalExpression*>(redimParameters[1].get());
        SCIDB_ASSERT(paramExpr->isConstant());
        isStrict = paramExpr->getExpression()->evaluate().getBool();
    }
    return isStrict;
}

bool PhysicalQueryPlanNode::getInputIsStrict(const PhysicalOperator::Parameters& inputParameters)
{
    bool isStrict = true;
    if (inputParameters.size() == 6 &&
        inputParameters[5]->getParamType() == scidb::PARAM_PHYSICAL_EXPRESSION)
    {
        OperatorParamPhysicalExpression* paramExpr =
        static_cast<OperatorParamPhysicalExpression*>(inputParameters[5].get());
        SCIDB_ASSERT(paramExpr->isConstant());
        isStrict = paramExpr->getExpression()->evaluate().getBool();
    }
    else if (inputParameters.size() == 7)
    {
        ASSERT_EXCEPTION((inputParameters[6]->getParamType() == scidb::PARAM_PHYSICAL_EXPRESSION),
                         "Invalid input() parameters 6");

        OperatorParamPhysicalExpression* paramExpr =
        static_cast<OperatorParamPhysicalExpression*>(inputParameters[6].get());
        SCIDB_ASSERT(paramExpr->isConstant());
        isStrict = paramExpr->getExpression()->evaluate().getBool();
    }
    return isStrict;
}

void
PhysicalQueryPlanNode::supplantChild(const PhysNodePtr& targetChild,
                                     const PhysNodePtr& newChild)
{
    assert(newChild);
    assert(targetChild);
    assert(newChild.get() != this);
    int removed = 0;
    std::vector<PhysNodePtr> newChildren;

    if (logger->isTraceEnabled()) {
        std::ostringstream os;
        os << "Supplanting targetChild Node:\n";
        targetChild->toString(os, 0 /*indent*/,false /*children*/);
        os << "\nwith\n";
        newChild->toString(os, 0 /*indent*/,false /*children*/);
        LOG4CXX_TRACE(logger, os.str());
    }

    for(auto &child : _childNodes) {
        if (child != targetChild) {
            newChildren.push_back(child);
        }
        else {
            // Set the parent of the newChild to this node.
            newChild->_parent = shared_from_this();

            // NOTE: Any existing children of the newChild are removed from the
            // Query Plan.
            if ((newChild->_childNodes).size() > 0) {
                LOG4CXX_INFO(logger,
                             "Child nodes of supplanting node are being removed from the tree.");
            }

            // Re-parent the children of the targetChild to the newChild
            newChild->_childNodes.swap(targetChild->_childNodes);
            for (auto grandchild : newChild -> _childNodes) {
                assert(grandchild != newChild);
                grandchild->_parent = newChild;
            }

            // Remove any references to the children from the targetChild
            targetChild->_childNodes.clear();
            targetChild->resetParent();

            // Add the newChild to this node
            newChildren.push_back(newChild);
            ++removed;
        }
    }
    _childNodes.swap(newChildren);

    if (logger->isTraceEnabled()) {
        std::ostringstream os;
        newChild->toString(os);
        LOG4CXX_TRACE(logger, "New Node subplan:\n"
                      << os.str());
    }

    SCIDB_ASSERT(removed==1);
}

// LogicalPlan
LogicalPlan::LogicalPlan(const std::shared_ptr<LogicalQueryPlanNode>& root):
        _root(root)
{

}

void LogicalPlan::toString(std::ostream &out, int indent, bool children) const
{
    Indent prefix(indent);
    out << prefix('>', false);
    out << "[lPlan]:\n";
    _root->toString(out, indent+1, children);
}

// PhysicalPlan
PhysicalPlan::PhysicalPlan(const std::shared_ptr<PhysicalQueryPlanNode>& root):
        _root(root)
{

}

void PhysicalPlan::toString(std::ostream &out, int const indent, bool children) const
{
    Indent prefix(indent);
    out << prefix('>', false);
    out << "[pPlan]:";
    if (_root.get() != NULL)
    {
        out << "\n";
        _root->toString(out, indent+1, children);
    }
    else
    {
        out << "[NULL]\n";
    }
}

} // namespace