Example #1
std::shared_ptr<Array> GEMMPhysical::invokeMPI(std::vector< std::shared_ptr<Array> >& inputArrays,
                                          const GEMMOptions options, std::shared_ptr<Query>& query,
                                          ArrayDesc& outSchema)
{
    //
    // Everything about the execute() method concerning the MPI execution of the arrays
    // is factored into this method.  This does not include the re-distribution of data
    // chunks into the ScaLAPACK distribution scheme, as the supplied inputArrays
    // must already be in that scheme.
    //
    // + intersects the array chunkGrids with the maximum process grid
    // + sets up the ScaLAPACK grid accordingly and if not participating, return early
    // + start and connect to an MPI slave process
    // + create ScaLAPACK descriptors for the input arrays
    // + convert the inputArrays into in-memory ScaLAPACK layout in shared memory
    // + call a "master" routine that passes the ScaLAPACK operator name, parameters,
    //   and shared memory descriptors to the ScaLAPACK MPI process that will do the
    //   actual computation.
    // + wait for successful completion
    // + construct an "OpArray" that makes an Array API view of the output memory.
    // + return that output array.
    //
    enum dummy  {R=0, C=1};              // row column
    enum dummy2 {AA=0, BB, CC, NUM_MATRICES};  // which matrix: alpha AA * BB + beta CC -> result
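    // For orientation (standard GEMM semantics, not specific to this code): the
    // operator computes CC = alpha * op(AA) * op(BB) + beta * CC, where op() is
    // an optional transpose.  If op(AA) is M x K and op(BB) is K x N, then CC
    // must be M x N; e.g. a 2x3 AA times a 3x4 BB yields a 2x4 CC.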

    LOG4CXX_DEBUG(logger, "GEMMPhysical::invokeMPI(): begin");

    size_t numArray = inputArrays.size();
    if (numArray != NUM_MATRICES) {  // for now ... may make CC optional when beta is 0, later
        LOG4CXX_ERROR(logger, "GEMMPhysical::invokeMPI(): " << numArray << " != NUM_MATRICES " << size_t(NUM_MATRICES));
        throw (SYSTEM_EXCEPTION(SCIDB_SE_INTERNAL, SCIDB_LE_OPERATION_FAILED)
                   << "GEMMPhysical::invokeMPI(): requires 3 input Arrays/matrices.");
    }

    //
    // Initialize the (emulated) BLACS and get the process grid info
    //
    blacs::context_t blacsContext = doBlacsInit(inputArrays, query, "GEMMPhysical");
    bool isParticipatingInScaLAPACK = blacsContext.isParticipating();
    if (isParticipatingInScaLAPACK) {
        checkBlacsInfo(query, blacsContext, "GEMMPhysical");
    }

    blacs::int_t NPROW=-1, NPCOL=-1, MYPROW=-1, MYPCOL=-1;
    scidb_blacs_gridinfo_(blacsContext, NPROW, NPCOL, MYPROW, MYPCOL);
    LOG4CXX_TRACE(logger, "GEMMPhysical::invokeMPI() NPROW="<<NPROW<<", NPCOL="<<NPCOL);
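
    // Illustrative note (hypothetical numbers): ScaLAPACK arranges the participating
    // instances in an NPROW x NPCOL process grid and deals MB x NB blocks of each
    // matrix out round-robin in both dimensions.  On a 2x3 grid, for example, the
    // process at (MYPROW=1, MYPCOL=2) holds exactly the blocks (i,j) with
    // i % 2 == 1 and j % 3 == 2.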

    //
    // launch MPISlave if we participate
    // TODO: move this down into the ScaLAPACK code ... something that does
    //       the doBlacsInit, launchMPISlaves, and the check that they agree
    //
    bool isParticipatingInMPI = launchMPISlaves(query, NPROW*NPCOL);
    if (isParticipatingInScaLAPACK != isParticipatingInMPI) {
        LOG4CXX_DEBUG(logger, "GEMMPhysical::invokeMPI():"
                              << " isParticipatingInScaLAPACK " << isParticipatingInScaLAPACK
                              << " isParticipatingInMPI " << isParticipatingInMPI);
        throw (SYSTEM_EXCEPTION(SCIDB_SE_INTERNAL, SCIDB_LE_OPERATION_FAILED)
                   << "GEMMPhysical::invokeMPI(): internal inconsistency in MPI slave launch.");
    }

    if (isParticipatingInMPI) {
        LOG4CXX_DEBUG(logger, "GEMMPhysical::invokeMPI(): participating in MPI");
    } else {
        LOG4CXX_DEBUG(logger, "GEMMPhysical::invokeMPI(): not participating in MPI");
        LOG4CXX_DEBUG(logger, "GEMMPhysical::invokeMPI(): only participating in redistribute of the input");

        // redistribute to psScaLAPACK
        // NOTE: this redistribution must be kept in sync with the participating-in-MPI
        //       redistribute loop, below
        for(size_t mat=0; mat < numArray; mat++ ) {
            std::stringstream labelStream;
            labelStream << "GEMMPhysical input[" << mat << "]";
            std::shared_ptr<Array> tmpRedistedInput = redistributeInputArray(inputArrays[mat],
                                                                             outSchema.getDistribution(),
                                                                             query, labelStream.str());
            bool wasConverted = (tmpRedistedInput != inputArrays[mat]) ;  // true only when a redistribute was actually done (sometimes optimized away)
            if (wasConverted) {
                SynchableArray* syncArray = safe_dynamic_cast<SynchableArray*>(tmpRedistedInput.get());
                syncArray->sync();
            }
            // free potentially large amount of memory, e.g. when inputArrays[mat] was significantly memory-materialized
            inputArrays[mat].reset();

            // TODO: validate that the redistribute brought no chunks to the instance by
            //       getting an array iterator and make sure it returns no chunks
            //       (factor to ScaLAPACKPhysical.cpp)

            // after validating, we don't need tmpRedistedInput anymore, either
            tmpRedistedInput.reset();
        }
        unlaunchMPISlavesNonParticipating();
        return std::shared_ptr<Array>(new MemArray(outSchema,query)); // NOTE: must not happen before redistribute is done.
    }

    //
    // get dimension information about the input arrays
    // TODO: REFACTOR, this is a pattern in DLAs
    //

    // matrix sizes from arrays A,B,C
    matSize_t size[NUM_MATRICES];       // does not change even after redistributeInputArray
    for(size_t i=0; i < numArray; i++ ) {
        size[i] = getMatSize(inputArrays[i]);
        LOG4CXX_DEBUG(logger, "GEMMPhysical::invokeMPI():"
                               << " size["<<i<<"] " << size[i][R] << "," << size[i][C]);

    }

    // TODO JHM : convert 1d arrays to nrows x 1 so we can use vectors as input to GEMM without requiring
    //            the user to add a dimension of size 1.
    for(size_t i=0; i < numArray; i++ ) {
        // TODO JHM : check inputArrays[i] to make sure we are only using 2D arrays,
        //            that may or may not be done by checkInputArrays
        checkInputArray(inputArrays[i]);  // check block size constraints, etc
    }

    //
    //.... Set up ScaLAPACK array descriptors ........................................
    //

    // we would like to do the automatic repart() [not yet implemented] inside the same loop as the
    // redistribute() and extractToScaLAPACK() in order to release each array after it is consumed.
    // unfortunately, we have made some of the routines below dependent on the MB,NB we are going to use,
    // which has recently become determined by the chunkSize of the inputArrays[] since it is no longer
    // a fixed value, but may vary over a legal range.
    // but when automatic repart() is done, we will want to use the chunksize of the output of the repart().
    // so we will need to decide by this point what the MB,NB is going to be, even if we haven't reparted
    // to it yet.
    // to make it clear we mean ScaLAPACK MB,NB
    // (which may become different from the inputArray[i] chunkSize in the future)
    // we will call the array of ScaLAPACK MB,NB pairs,  MB_NB[].
    matSize_t MB_NB[NUM_MATRICES];  // this one should be moved after redistributeInputArrays() for when it really reparts
    for(size_t i=0; i < numArray; i++ ) {
        MB_NB[i] = getMatChunkSize(inputArrays[i]);
        LOG4CXX_DEBUG(logger, "GEMMPhysical::invokeMPI():"
                              << " using MB_NB["<<i<<"] " << MB_NB[i][R] << "," << MB_NB[i][C]);
    }

    // these formulas for LLD (local leading dimension) and LTD (local trailing dimension)
    // are found in the headers of the ScaLAPACK functions such as pdgemm_()
    const slpp::int_t one = 1 ;
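
    // Worked example of the scidb_numroc_() semantics (illustrative values): for a
    // dimension of M=10 rows with MB=3 on NPROW=2 process rows, the row-blocks are
    // [0..2][3..5][6..8][9]; process row 0 receives blocks 0 and 2 (6 rows), and
    // process row 1 receives blocks 1 and 3 (3+1 = 4 rows), so numroc returns 6
    // and 4 respectively.  The std::max(one, ...) below keeps a degenerate
    // (zero-element) local at the legal minimum leading dimension of 1.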

    // TODO: turn these pairs into matSize_t matrixLocalSize[NUM_MATRICES];
    slpp::int_t LLD[NUM_MATRICES]; // local leading dimension
    slpp::int_t LTD[NUM_MATRICES]; // local trailing dimension
    for(size_t i=0; i < numArray; i++ ) {
        slpp::int_t RSRC = 0 ;
        LOG4CXX_DEBUG(logger, "GEMMPhysical::invokeMPI():"
                              << " M["<<i<<"][R]"<<size[i][R] <<" MB["<<i<<"][R]:"<<MB_NB[i][R]
                              << " N["<<i<<"][R]"<<size[i][C] <<" NB["<<i<<"][R]:"<<MB_NB[i][C]
                              << " MYPROW:"<<MYPROW << " NPROW:"<< NPROW);
        LLD[i] = std::max(one, scidb_numroc_(slpp::int_cast(size[i][R]),
                                              slpp::int_cast(MB_NB[i][R]),
                                              MYPROW,
                                              RSRC,
                                              NPROW));
        LTD[i] = std::max(one, scidb_numroc_(slpp::int_cast(size[i][C]),
                                              slpp::int_cast(MB_NB[i][C]),
                                              MYPCOL,
                                              RSRC,
                                              NPCOL));
        LOG4CXX_DEBUG(logger, "GEMMPhysical::invokeMPI():"
                              << " LLD["<<i<<"] = " << LLD[i]
                              << " LTD["<<i<<"] = " << LTD[i]);
    }

    // create ScaLAPACK array descriptors
    // TODO: lets factor this to a method on ScaLAPACKPhysical
    slpp::desc_t DESC[NUM_MATRICES];
    for(size_t i=0; i < numArray; i++ ) {
        LOG4CXX_DEBUG(logger, "GEMMPhysical::invokeMPI():"
                              << " scidb_descinit_(DESC["<<i<<"], M=" << size[i][R] << ", N=" << size[i][C]
                              << ", MB=" << MB_NB[i][R] << ", NB=" << MB_NB[i][R]
                              << ", IRSRC=" << 0 << ", ICSRC=" << 0
                              << ", LLD=" << LLD[i]);

        slpp::int_t descinitINFO = 0; // an output implemented as non-const ref (due to Fortran calling conventions)
        scidb_descinit_(DESC[i],
                        slpp::int_cast(size[i][R]),
                        slpp::int_cast(size[i][C]),
                        slpp::int_cast(MB_NB[i][R]),
                        slpp::int_cast(MB_NB[i][C]),
                        0, 0, blacsContext, LLD[i], descinitINFO);
        if (descinitINFO != 0) {
            LOG4CXX_ERROR(logger, "GEMMPhysical::invokeMPI(): scidb_descinit(DESC) failed, INFO " << descinitINFO
                                                                                    << " DESC["<<i<<"] " << DESC[i]);
            throw (SYSTEM_EXCEPTION(SCIDB_SE_INTERNAL, SCIDB_LE_OPERATION_FAILED)
                       << "GEMMPhysical::invokeMPI(): scidb_descinit(DESC) failed");
        }

        LOG4CXX_DEBUG(logger, "GEMMPhysical::invokeMPI():"
                              << " scidb_descinit_() returned DESC["<<i<<"] " << DESC[i]);

        // debugging for #1986 ... when #instances is prime, process grid is a row.  When small chunk sizes are used,
        // desc.LLD is being set to a number larger than the chunk size ... I don't understand or expect this.
        bool doDebugTicket1986=true;  // remains on until fixed, can't ship with this not understood.
        if(doDebugTicket1986) {
            if (DESC[i].LLD > DESC[i].MB) {
                LOG4CXX_DEBUG(logger, "GEMMPhysical::invokeMPI(): ticket 1986 issue"
                                      <<  ", DESC["<<i<<"].LLD " << DESC[i].LLD
                                      << " > DESC["<<i<<"].MB: " << DESC[i].MB);
            }
        }
    }
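
    // For reference: a ScaLAPACK array descriptor is nine integers
    // (DTYPE=1 for a dense matrix, CTXT, M, N, MB, NB, RSRC, CSRC, LLD);
    // scidb_descinit_() above fills them in from the global size, the block size,
    // the BLACS context, and the numroc()-derived local leading dimension.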

    // matrix allocations are of local size, not global size
    size_t matrixLocalSize[NUM_MATRICES];
    for(size_t i=0; i < numArray; i++ ) {
        matrixLocalSize[i]  = size_t(LLD[i]) * LTD[i] ;
        LOG4CXX_DEBUG(logger, "GEMMPhysical::invokeMPI(): "
                << " LLD[" << i << "] ( " << LLD[i] << " ) x "
                << " LTD[" << i << "] ( " << LTD[i] << " ) = "
                << " matrixLocalSize[" << i << "] " << matrixLocalSize[i]);
    }

    //
    // Create IPC buffers
    //
    enum dummy3 {BUF_ARGS=0, BUF_MAT_AA, BUF_MAT_BB, BUF_MAT_CC, NUM_BUFS };
    assert(numArray < NUM_BUFS);

    size_t bufElemBytes[NUM_BUFS];
    size_t bufNumElem[NUM_BUFS];
    std::string bufDbgNames[NUM_BUFS];

    bufElemBytes[BUF_ARGS]   = 1;              bufNumElem[BUF_ARGS]   = sizeof(scidb::PdgemmArgs); bufDbgNames[BUF_ARGS]   = "PdgemmArgs";
    bufElemBytes[BUF_MAT_AA] = sizeof(double); bufNumElem[BUF_MAT_AA] = matrixLocalSize[AA];       bufDbgNames[BUF_MAT_AA] = "A";
    bufElemBytes[BUF_MAT_BB] = sizeof(double); bufNumElem[BUF_MAT_BB] = matrixLocalSize[BB];       bufDbgNames[BUF_MAT_BB] = "B";
    bufElemBytes[BUF_MAT_CC] = sizeof(double); bufNumElem[BUF_MAT_CC] = matrixLocalSize[CC];       bufDbgNames[BUF_MAT_CC] = "C";
    typedef scidb::SharedMemoryPtr<double> shmSharedPtr_t ;

    for(size_t i=0; i < numArray; i++ ) {
        LOG4CXX_DEBUG(logger, "GEMMPhysical::invokeMPI(): "
                << " bufElemBytes[" << i << "] = " << bufElemBytes[i]);
    }
    std::vector<MPIPhysical::SMIptr_t> shmIpc = allocateMPISharedMemory(NUM_BUFS, bufElemBytes, bufNumElem, bufDbgNames);

    // the following used to determine the PDGEMM() "K" argument just prior to pdgemm,
    // but now it has to be done before inputArrays[AA] is .reset() in the following loop.
    //
    // Comments on PDGEMM input "K", taken from the netlib PDGEMM argument header:
    // If transa = 'T' or 'C' (true),  it is the number of rows in submatrix A.
    // If transa = 'N' (false), it is the number of columns in submatrix A.
    slpp::int_t K = slpp::int_cast(nCol(inputArrays[AA], options.transposeA));
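    // Illustrative example: if AA is 3x5 and options.transposeA is false, K = 5
    // (the column count of AA); with transposeA true, op(AA) is 5x3, so K = 3.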

    // the following also used to be done just prior to pdgemm,
    // but now must be done before inputArrays[CC] is .reset() in the following loop.
    // it must also now be a copy, and not a reference, for the same reason.
    Dimensions const dimsCC = inputArrays[CC]->getArrayDesc().getDimensions();

    // now for each input matrix, do the following:
    // 1. redistribute to psScaLAPACK (when not already correct).
    // 2. if actual conversion happened, release the inputArray, which might be a lot of memory, e.g. when inputArray[i] is materialized.
    // 3. zero the ScaLAPACK local block-cyclic storage in shared mem. (so that empty cells will become zeros).
    // 4. extract the (redistributed) array, where not-empty, into the ScaLAPACK local matrix memory.
    // 5. release the redistributed array, which might be a lot of memory since SG is currently materializing.
    //
    // The only caller of this routine is the execute() method, and neither the execute() method, nor the executor that calls it,
    // accesses the inputArrays after calling execute(), which is why we can reset() the shared_ptrs to the arrays after consuming the
    // arrays into the ScaLAPACK memory.
    //
    // redistribute to psScaLAPACK, and convert to ScaLAPACK format.
    // NOTE: this redistribution must be kept in sync with the non-participating redistribute loop, above

    double* asDoubles[NUM_MATRICES];
    for(size_t mat=0; mat < numArray; mat++ ) {
        std::stringstream labelStream;
        labelStream << "GEMMPhysical input[" << mat << "]";
        std::shared_ptr<Array> tmpRedistedInput = redistributeInputArray(inputArrays[mat],
                                                                         outSchema.getDistribution(),
                                                                         query, labelStream.str());

        bool wasConverted = (tmpRedistedInput != inputArrays[mat]) ;  // true only when a redistribute was actually done (sometimes optimized away)

        // TODO would be nice if we could allocate the ScaLAPACK memory after dropping the input array
        //      in case the physical memory for the shmem can be reclaimed from the reset inputArrays[mat]

        size_t buf= mat+1;          // buffer 0 is command buffer, buffers[1..n] correspond to inputs[0..n-1]
        assert(buf < NUM_BUFS);

        asDoubles[mat] = reinterpret_cast<double*>(shmIpc[buf]->get());

        setInputMatrixToAlgebraDefault(asDoubles[mat], bufNumElem[buf]);  // note asDoubles[CC] is input and output to/from ScaLAPACK
        extractArrayToScaLAPACK(tmpRedistedInput, asDoubles[mat], DESC[mat],NPROW, NPCOL, MYPROW, MYPCOL, query);

        if(wasConverted) {
            SynchableArray* syncArray = safe_dynamic_cast<SynchableArray*>(tmpRedistedInput.get());
            syncArray->sync();
        }
        // free potentially large amount of memory, e.g. when inputArrays[mat] was significantly memory-materialized
        inputArrays[mat].reset();
        tmpRedistedInput.reset(); // and drop this array before iterating on the loop to the next repart/redist

        if(DBG_REFORMAT) { // check that the reformat worked correctly
            for(size_t ii=0; ii < matrixLocalSize[mat]; ii++) {
                LOG4CXX_DEBUG(logger, "GEMMPhysical::invokeMPI():"
                                      << " @myPPos("<< MYPROW << "," << MYPCOL << ")"
                                      << " array["<<mat<<"]["<<ii<<"] = " << asDoubles[mat][ii]);
            }
        }
    }
    size_t resultShmIpcIndx = BUF_MAT_CC;           // by default, GEMM assumes it will return something for C
                                                    // but this will change if we find we don't participate in the output
    shmSharedPtr_t Cx(shmIpc[resultShmIpcIndx]);

    //
    //.... Call pdgemm to compute the product of A and B .............................
    //
    LOG4CXX_DEBUG(logger, "GEMMPhysical::invokeMPI(): calling pdgemm_ M,N,K:" << size[CC][R] << ","
                                                                  << size[CC][C] << ","
                                                                  << K
                           << " MB,NB:" << MB_NB[AA][R] << "," << MB_NB[AA][C]);

    if(DBG_CERR) std::cerr << "GEMMPhysical::invokeMPI(): calling pdgemm to compute" << std::endl;
    std::shared_ptr<MpiSlaveProxy> slave = _ctx->getSlave(_launchId);

    slpp::int_t MYPE = slpp::int_cast(query->getInstanceID()) ;  // we map 1-to-1 between instanceID and MPI rank
    slpp::int_t INFO = DEFAULT_BAD_INFO ;
    pdgemmMaster(query.get(), _ctx, slave, _ipcName, shmIpc[BUF_ARGS]->get(),
                 NPROW, NPCOL, MYPROW, MYPCOL, MYPE,
                 getTransposeCode(options.transposeA), getTransposeCode(options.transposeB),
                 slpp::int_cast(size[CC][R]),
                 slpp::int_cast(size[CC][C]), K,
                 &options.alpha,
                 asDoubles[AA], one, one, DESC[AA],
                 asDoubles[BB], one, one, DESC[BB],
                 &options.beta,
                 asDoubles[CC], one, one, DESC[CC],
                 INFO);
    raiseIfBadResultInfo(INFO, "pdgemm");

    boost::shared_array<char> resPtrDummy(reinterpret_cast<char*>(NULL));
    typedef scidb::ReformatFromScalapack<shmSharedPtr_t> reformatOp_t ;

    if(logger->isTraceEnabled()) {
        LOG4CXX_TRACE(logger, "GEMMPhysical::invokeMPI():--------------------------------------");
        LOG4CXX_TRACE(logger, "GEMMPhysical::invokeMPI(): sequential values from 'C' memory");
        for(size_t ii=0; ii < matrixLocalSize[CC]; ii++) {
            LOG4CXX_TRACE(logger, "GEMMPhysical::invokeMPI(): ("<< MYPROW << "," << MYPCOL << ") C["<<ii<<"] = " << asDoubles[CC][ii]);
        }
        LOG4CXX_TRACE(logger, "GEMMPhysical::invokeMPI(): --------------------------------------");
        LOG4CXX_TRACE(logger, "GEMMPhysical::invokeMPI(): using pdelgetOp to reformat the GEMM output from memory to a scidb array, start");
    }


    //
    // an OpArray is a SplitArray that is filled on-the-fly by calling the operator,
    // so all we have to do is create one with an upper-left corner equal to the
    // global position of the first local block we have.  So we need to map
    // our "processor" coordinate into that position, which we do by multiplying
    // by the chunkSize
    //
    Coordinates first(2);
    first[R] = dimsCC[R].getStartMin() + MYPROW * MB_NB[CC][R];
    first[C] = dimsCC[C].getStartMin() + MYPCOL * MB_NB[CC][C];

    Coordinates last(2);
    last[R] = dimsCC[R].getStartMin() + size[CC][R] - 1;
    last[C] = dimsCC[C].getStartMin() + size[CC][C] - 1;
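
    // Illustrative example (hypothetical values): with a dimension start of 0,
    // MB_NB[CC] = (32,32), and a 2x2 process grid, the instance at
    // (MYPROW,MYPCOL) = (1,0) gets first = (32,0); with iterDelta = (64,64) below,
    // it owns the row-block stripes beginning at global rows 32, 96, 160, ...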

    std::shared_ptr<Array> result;
    // the process grid may be larger than the size of the output in chunks... e.g. multiplying A(1x100) * B(100x1) -> C(1x1)
    bool isParticipatingInOutput = first[R] <= last[R] && first[C] <= last[C] ;
    if (isParticipatingInOutput) {
        // there is in fact some output in our shared memory... hook it up to an OpArray
        Coordinates iterDelta(2);
        iterDelta[0] = NPROW * MB_NB[CC][R];
        iterDelta[1] = NPCOL * MB_NB[CC][C];

        LOG4CXX_DEBUG(logger, "GEMMPhysical::invokeMPI(): Creating OpArray from ("<<first[R]<<","<<first[C]<<") to (" << last[R] <<"," <<last[C]<<") delta:"<<iterDelta[R]<<","<<iterDelta[C]);
        reformatOp_t      pdelgetOp(Cx, DESC[CC], dimsCC[R].getStartMin(), dimsCC[C].getStartMin(),
                                    NPROW, NPCOL, MYPROW, MYPCOL);
        result = std::shared_ptr<Array>(new OpArray<reformatOp_t>(outSchema, resPtrDummy, pdelgetOp,
                                                             first, last, iterDelta, query));
        assert(resultShmIpcIndx == BUF_MAT_CC);
    } else {
        LOG4CXX_DEBUG(logger, "GEMMPhysical::invokeMPI(): instance participated, but produces no output: creating empty MemArray: first ("<<first[R]<<","<<first[C]<<"), last(" << last[R] <<"," <<last[C]<<")");
        result = std::shared_ptr<Array>(new MemArray(outSchema,query));  // same as when we don't participate at all
        resultShmIpcIndx = shmIpc.size();                   // indicate we don't want to hold on to buffer BUF_MAT_CC after all
    }

    // TODO: common pattern in ScaLAPACK operators: factor to base class
    releaseMPISharedMemoryInputs(shmIpc, resultShmIpcIndx);
    unlaunchMPISlaves();

    LOG4CXX_DEBUG(logger, "GEMMPhysical::invokeMPI() end");

    return result;
}
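
The local-size arithmetic above (LLD, LTD, and the OpArray corner/stride) all follows from ScaLAPACK's NUMROC block-cyclic bookkeeping. Below is a minimal, self-contained sketch of that arithmetic, assuming only standard C++; numrocModel is an illustrative stand-in for scidb_numroc_(), not SciDB's implementation.

#include <cstdio>

// Number of rows (or columns) of an M-element dimension, dealt out in MB-sized
// blocks round-robin over nprow process rows starting at rsrc, that land on
// process row myrow.  Mirrors the semantics documented for ScaLAPACK's NUMROC.
long numrocModel(long M, long MB, long myrow, long rsrc, long nprow) {
    long myDist = (nprow + myrow - rsrc) % nprow;  // my distance from the source row
    long nBlocks = M / MB;                         // number of complete blocks
    long n = (nBlocks / nprow) * MB;               // full rounds of blocks I receive
    long extraBlocks = nBlocks % nprow;            // complete blocks left over
    if (myDist < extraBlocks) {
        n += MB;                                   // I receive one of the leftover blocks
    } else if (myDist == extraBlocks) {
        n += M % MB;                               // I receive the trailing partial block
    }
    return n;
}

int main() {
    // e.g. 10 rows, MB=3, two process rows: blocks are [0..2][3..5][6..8][9];
    // row 0 gets blocks 0 and 2 (6 rows), row 1 gets blocks 1 and 3 (4 rows).
    for (long myrow = 0; myrow < 2; ++myrow) {
        std::printf("process row %ld owns %ld rows\n",
                    myrow, numrocModel(10, 3, myrow, 0, 2));
    }
    return 0;
}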