Example #1
std::shared_ptr<Array> GEMMPhysical::invokeMPI(std::vector< std::shared_ptr<Array> >& inputArrays,
                                          const GEMMOptions options, std::shared_ptr<Query>& query,
                                          ArrayDesc& outSchema)
{
    //
    // Everything about the execute() method concerning the MPI execution of the arrays
    // is factored into this method.  This does not include the re-distribution of data
    // chunks into the ScaLAPACK distribution scheme, as the supplied inputArrays
    // must already be in that scheme.
    //
    // + intersects the array chunkGrids with the maximum process grid
    // + sets up the ScaLAPACK grid accordingly and if not participating, return early
    // + start and connect to an MPI slave process
    // + create ScaLAPACK descriptors for the input arrays
    // + convert the inputArrays into in-memory ScaLAPACK layout in shared memory
    // + call a "master" routine that passes the ScaLAPACK operator name, parameters,
    //   and shared memory descriptors to the ScaLAPACK MPI process that will do the
    //   actual computation.
    // + wait for successful completion
    // + construct an "OpArray" that makes an Array API view of the output memory.
    // + return that output array.
    //
    enum dummy  {R=0, C=1};              // row column
    enum dummy2 {AA=0, BB, CC, NUM_MATRICES};  // which matrix: alpha AA * BB + beta CC -> result
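    // For orientation (standard GEMM semantics, not specific to this code): the
    // operator computes CC = alpha * op(AA) * op(BB) + beta * CC, where op() is
    // an optional transpose.  If op(AA) is M x K and op(BB) is K x N, then CC
    // must be M x N; e.g. a 2x3 AA times a 3x4 BB yields a 2x4 CC.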

    LOG4CXX_DEBUG(logger, "GEMMPhysical::invokeMPI(): begin");

    size_t numArray = inputArrays.size();
    if (numArray != NUM_MATRICES) {  // for now ... may make CC optional when beta is 0, later
        LOG4CXX_ERROR(logger, "GEMMPhysical::invokeMPI(): " << numArray << " != NUM_MATRICES " << size_t(NUM_MATRICES));
        throw (SYSTEM_EXCEPTION(SCIDB_SE_INTERNAL, SCIDB_LE_OPERATION_FAILED)
                   << "GEMMPhysical::invokeMPI(): requires 3 input Arrays/matrices.");
    }

    //
    // Initialize the (emulated) BLACS and get the process grid info
    //
    blacs::context_t blacsContext = doBlacsInit(inputArrays, query, "GEMMPhysical");
    bool isParticipatingInScaLAPACK = blacsContext.isParticipating();
    if (isParticipatingInScaLAPACK) {
        checkBlacsInfo(query, blacsContext, "GEMMPhysical");
    }

    blacs::int_t NPROW=-1, NPCOL=-1, MYPROW=-1, MYPCOL=-1;
    scidb_blacs_gridinfo_(blacsContext, NPROW, NPCOL, MYPROW, MYPCOL);
    LOG4CXX_TRACE(logger, "GEMMPhysical::invokeMPI() NPROW="<<NPROW<<", NPCOL="<<NPCOL);
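
    // Illustrative note (hypothetical numbers): ScaLAPACK arranges the participating
    // instances in an NPROW x NPCOL process grid and deals MB x NB blocks of each
    // matrix out round-robin in both dimensions.  On a 2x3 grid, for example, the
    // process at (MYPROW=1, MYPCOL=2) holds exactly the blocks (i,j) with
    // i % 2 == 1 and j % 3 == 2.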

    //
    // launch MPISlave if we participate
    // TODO: move this down into the ScaLAPACK code ... something that does
    //       the doBlacsInit, launchMPISlaves, and the check that they agree
    //
    bool isParticipatingInMPI = launchMPISlaves(query, NPROW*NPCOL);
    if (isParticipatingInScaLAPACK != isParticipatingInMPI) {
        LOG4CXX_DEBUG(logger, "GEMMPhysical::invokeMPI():"
                              << " isParticipatingInScaLAPACK " << isParticipatingInScaLAPACK
                              << " isParticipatingInMPI " << isParticipatingInMPI);
        throw (SYSTEM_EXCEPTION(SCIDB_SE_INTERNAL, SCIDB_LE_OPERATION_FAILED)
                   << "GEMMPhysical::invokeMPI(): internal inconsistency in MPI slave launch.");
    }

    if (isParticipatingInMPI) {
        LOG4CXX_DEBUG(logger, "GEMMPhysical::invokeMPI(): participating in MPI");
    } else {
        LOG4CXX_DEBUG(logger, "GEMMPhysical::invokeMPI(): not participating in MPI");
        LOG4CXX_DEBUG(logger, "GEMMPhysical::invokeMPI(): only participating in redistribute of the input");

        // redistribute to psScaLAPACK
        // NOTE: this redistribution must be kept in sync with the participating-in-MPI
        //       redistribute loop, below
        for(size_t mat=0; mat < numArray; mat++ ) {
            std::stringstream labelStream;
            labelStream << "GEMMPhysical input[" << mat << "]";
            std::shared_ptr<Array> tmpRedistedInput = redistributeInputArray(inputArrays[mat],
                                                                             outSchema.getDistribution(),
                                                                             query, labelStream.str());
            bool wasConverted = (tmpRedistedInput != inputArrays[mat]) ;  // true only when a redistribute was actually done (sometimes optimized away)
            if (wasConverted) {
                SynchableArray* syncArray = safe_dynamic_cast<SynchableArray*>(tmpRedistedInput.get());
                syncArray->sync();
            }
            // free potentially large amount of memory, e.g. when inputArrays[mat] was significantly memory-materialized
            inputArrays[mat].reset();

            // TODO: validate that the redistribute brought no chunks to the instance by
            //       getting an array iterator and make sure it returns no chunks
            //       (factor to ScaLAPACKPhysical.cpp)

            // after validating, we don't need tmpRedistedInput anymore, either
            tmpRedistedInput.reset();
        }
        unlaunchMPISlavesNonParticipating();
        return std::shared_ptr<Array>(new MemArray(outSchema,query)); // NOTE: must not happen before redistribute is done.
    }

    //
    // get dimension information about the input arrays
    // TODO: REFACTOR, this is a pattern in DLAs
    //

    // matrix sizes from arrays A,B,C
    matSize_t size[NUM_MATRICES];       // does not change even after redistributeInputArray
    for(size_t i=0; i < numArray; i++ ) {
        size[i] = getMatSize(inputArrays[i]);
        LOG4CXX_DEBUG(logger, "GEMMPhysical::invokeMPI():"
                               << " size["<<i<<"] " << size[i][R] << "," << size[i][C]);

    }

    // TODO JHM : convert 1d arrays to nrows x 1 so we can use vectors as input to GEMM without requiring
    //            the user to add a dimension of size 1.
    for(size_t i=0; i < numArray; i++ ) {
        // TODO JHM : check inputArrays[i] to make sure we are only using 2D arrays,
        //            that may or may not be done by checkInputArrays
        checkInputArray(inputArrays[i]);  // check block size constraints, etc
    }

    //
    //.... Set up ScaLAPACK array descriptors ........................................
    //

    // we would like to do the automatic repart() [not yet implemented] inside the same loop as the
    // redistribute() and extractToScaLAPACK() in order to release each array after it is consumed.
    // unfortunately, we have made some of the routines below dependent on the MB,NB we are going to use,
    // which has recently become determined by the chunkSize of the inputArrays[] since it is no longer
    // a fixed value, but may vary over a legal range.
    // but when automatic repart() is done, we will want to use the chunksize of the output of the repart().
    // so we will need to decide by this point what the MB,NB is going to be, even if we haven't reparted
    // to it yet.
    // to make it clear we mean ScaLAPACK MB,NB
    // (which may become different from the inputArray[i] chunkSize in the future)
    // we will call the array of ScaLAPACK MB,NB pairs,  MB_NB[].
    matSize_t MB_NB[NUM_MATRICES];  // this one should be moved after redistributeInputArrays() for when it really reparts
    for(size_t i=0; i < numArray; i++ ) {
        MB_NB[i] = getMatChunkSize(inputArrays[i]);
        LOG4CXX_DEBUG(logger, "GEMMPhysical::invokeMPI():"
                              << " using MB_NB["<<i<<"] " << MB_NB[i][R] << "," << MB_NB[i][C]);
    }

    // these formulas for LLD (local leading dimension) and LTD (local trailing dimension)
    // are found in the headers of the ScaLAPACK functions such as pdgemm_()
    const slpp::int_t one = 1 ;
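
    // Worked example of the scidb_numroc_() semantics (illustrative values): for a
    // dimension of M=10 rows with MB=3 on NPROW=2 process rows, the row-blocks are
    // [0..2][3..5][6..8][9]; process row 0 receives blocks 0 and 2 (6 rows), and
    // process row 1 receives blocks 1 and 3 (3+1 = 4 rows), so numroc returns 6
    // and 4 respectively.  The std::max(one, ...) below keeps a degenerate
    // (zero-element) local at the legal minimum leading dimension of 1.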

    // TODO: turn these pairs into matSize_t matrixLocalSize[NUM_MATRICES];
    slpp::int_t LLD[NUM_MATRICES]; // local leading dimension
    slpp::int_t LTD[NUM_MATRICES]; // local trailing dimension
    for(size_t i=0; i < numArray; i++ ) {
        slpp::int_t RSRC = 0 ;
        LOG4CXX_DEBUG(logger, "GEMMPhysical::invokeMPI():"
                              << " M["<<i<<"][R]"<<size[i][R] <<" MB["<<i<<"][R]:"<<MB_NB[i][R]
                              << " N["<<i<<"][R]"<<size[i][C] <<" NB["<<i<<"][R]:"<<MB_NB[i][C]
                              << " MYPROW:"<<MYPROW << " NPROW:"<< NPROW);
        LLD[i] = std::max(one, scidb_numroc_(slpp::int_cast(size[i][R]),
                                              slpp::int_cast(MB_NB[i][R]),
                                              MYPROW,
                                              RSRC,
                                              NPROW));
        LTD[i] = std::max(one, scidb_numroc_(slpp::int_cast(size[i][C]),
                                              slpp::int_cast(MB_NB[i][C]),
                                              MYPCOL,
                                              RSRC,
                                              NPCOL));
        LOG4CXX_DEBUG(logger, "GEMMPhysical::invokeMPI():"
                              << " LLD["<<i<<"] = " << LLD[i]
                              << " LTD["<<i<<"] = " << LTD[i]);
    }

    // create ScaLAPACK array descriptors
    // TODO: lets factor this to a method on ScaLAPACKPhysical
    slpp::desc_t DESC[NUM_MATRICES];
    for(size_t i=0; i < numArray; i++ ) {
        LOG4CXX_DEBUG(logger, "GEMMPhysical::invokeMPI():"
                              << " scidb_descinit_(DESC["<<i<<"], M=" << size[i][R] << ", N=" << size[i][C]
                              << ", MB=" << MB_NB[i][R] << ", NB=" << MB_NB[i][R]
                              << ", IRSRC=" << 0 << ", ICSRC=" << 0
                              << ", LLD=" << LLD[i]);

        slpp::int_t descinitINFO = 0; // an output implemented as non-const ref (due to Fortran calling conventions)
        scidb_descinit_(DESC[i],
                        slpp::int_cast(size[i][R]),
                        slpp::int_cast(size[i][C]),
                        slpp::int_cast(MB_NB[i][R]),
                        slpp::int_cast(MB_NB[i][C]),
                        0, 0, blacsContext, LLD[i], descinitINFO);
        if (descinitINFO != 0) {
            LOG4CXX_ERROR(logger, "GEMMPhysical::invokeMPI(): scidb_descinit(DESC) failed, INFO " << descinitINFO
                                                                                    << " DESC["<<i<<"] " << DESC[i]);
            throw (SYSTEM_EXCEPTION(SCIDB_SE_INTERNAL, SCIDB_LE_OPERATION_FAILED)
                       << "GEMMPhysical::invokeMPI(): scidb_descinit(DESC) failed");
        }

        LOG4CXX_DEBUG(logger, "GEMMPhysical::invokeMPI():"
                              << " scidb_descinit_() returned DESC["<<i<<"] " << DESC[i]);

        // debugging for #1986 ... when #instances is prime, process grid is a row.  When small chunk sizes are used,
        // desc.LLD is being set to a number larger than the chunk size ... I don't understand or expect this.
        bool doDebugTicket1986=true;  // remains on until fixed, can't ship with this not understood.
        if(doDebugTicket1986) {
            if (DESC[i].LLD > DESC[i].MB) {
                LOG4CXX_DEBUG(logger, "GEMMPhysical::invokeMPI(): ticket 1986 issue"
                                      <<  ", DESC["<<i<<"].LLD " << DESC[i].LLD
                                      << " > DESC["<<i<<"].MB: " << DESC[i].MB);
            }
        }
    }
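
    // For reference: a ScaLAPACK array descriptor is nine integers
    // (DTYPE=1 for a dense matrix, CTXT, M, N, MB, NB, RSRC, CSRC, LLD);
    // scidb_descinit_() above fills them in from the global size, the block size,
    // the BLACS context, and the numroc()-derived local leading dimension.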

    // matrix allocations are of local size, not global size
    size_t matrixLocalSize[NUM_MATRICES];
    for(size_t i=0; i < numArray; i++ ) {
        matrixLocalSize[i]  = size_t(LLD[i]) * LTD[i] ;
        LOG4CXX_DEBUG(logger, "GEMMPhysical::invokeMPI(): "
                << " LLD[" << i << "] ( " << LLD[i] << " ) x "
                << " LTD[" << i << "] ( " << LTD[i] << " ) = "
                << " matrixLocalSize[" << i << "] " << matrixLocalSize[i]);
    }

    //
    // Create IPC buffers
    //
    enum dummy3 {BUF_ARGS=0, BUF_MAT_AA, BUF_MAT_BB, BUF_MAT_CC, NUM_BUFS };
    assert(numArray < NUM_BUFS);

    size_t bufElemBytes[NUM_BUFS];
    size_t bufNumElem[NUM_BUFS];
    std::string bufDbgNames[NUM_BUFS];

    bufElemBytes[BUF_ARGS]   = 1;              bufNumElem[BUF_ARGS]   = sizeof(scidb::PdgemmArgs); bufDbgNames[BUF_ARGS]   = "PdgemmArgs";
    bufElemBytes[BUF_MAT_AA] = sizeof(double); bufNumElem[BUF_MAT_AA] = matrixLocalSize[AA];       bufDbgNames[BUF_MAT_AA] = "A";
    bufElemBytes[BUF_MAT_BB] = sizeof(double); bufNumElem[BUF_MAT_BB] = matrixLocalSize[BB];       bufDbgNames[BUF_MAT_BB] = "B";
    bufElemBytes[BUF_MAT_CC] = sizeof(double); bufNumElem[BUF_MAT_CC] = matrixLocalSize[CC];       bufDbgNames[BUF_MAT_CC] = "C";
    typedef scidb::SharedMemoryPtr<double> shmSharedPtr_t ;

    for(size_t i=0; i < numArray; i++ ) {
        LOG4CXX_DEBUG(logger, "GEMMPhysical::invokeMPI(): "
                << " bufElemBytes[" << i << "] = " << bufElemBytes[i]);
    }
    std::vector<MPIPhysical::SMIptr_t> shmIpc = allocateMPISharedMemory(NUM_BUFS, bufElemBytes, bufNumElem, bufDbgNames);

    // the following used to determine the PDGEMM() "K" argument just prior to pdgemm,
    // but now it has to be done before inputArrays[AA] is .reset() in the following loop.
    //
    // Comments on PDGEMM input "K", taken from the netlib PDGEMM argument header:
    // If transa = 'T' or 'C' (true),  it is the number of rows in submatrix A.
    // If transa = 'N' (false), it is the number of columns in submatrix A.
    slpp::int_t K = slpp::int_cast(nCol(inputArrays[AA], options.transposeA));
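    // Illustrative example: if AA is 3x5 and options.transposeA is false, K = 5
    // (the column count of AA); with transposeA true, op(AA) is 5x3, so K = 3.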

    // the following also used to be done just prior to pdgemm,
    // but now must be done before inputArrays[CC] is .reset() in the following loop.
    // it must also now be a copy, and not a reference, for the same reason.
    Dimensions const dimsCC = inputArrays[CC]->getArrayDesc().getDimensions();

    // now for each input matrix, do the following:
    // 1. redistribute to psScaLAPACK (when not already correct).
    // 2. if actual conversion happened, release the inputArray, which might be a lot of memory, e.g. when inputArray[i] is materialized.
    // 3. zero the ScaLAPACK local block-cyclic storage in shared mem. (so that empty cells will become zeros).
    // 4. extract the (redistributed) array, where not-empty, into the ScaLAPACK local matrix memory.
    // 5. release the redistributed array, which might be a lot of memory since SG is currently materializing.
    //
    // The only caller of this routine is the execute() method, and neither the execute() method, nor the executor that calls it,
    // accesses the inputArrays after calling execute(), which is why we can reset() the shared_ptrs to the arrays after consuming the
    // arrays into the ScaLAPACK memory.
    //
    // redistribute to psScaLAPACK, and convert to ScaLAPACK format.
    // NOTE: this redistribution must be kept in sync with the non-participating redistribute loop, above

    double* asDoubles[NUM_MATRICES];
    for(size_t mat=0; mat < numArray; mat++ ) {
        std::stringstream labelStream;
        labelStream << "GEMMPhysical input[" << mat << "]";
        std::shared_ptr<Array> tmpRedistedInput = redistributeInputArray(inputArrays[mat],
                                                                         outSchema.getDistribution(),
                                                                         query, labelStream.str());

        bool wasConverted = (tmpRedistedInput != inputArrays[mat]) ;  // true only when a redistribute was actually done (sometimes optimized away)

        // TODO would be nice if we could allocate the ScaLAPACK memory after dropping the input array
        //      in case the physical memory for the shmem can be reclaimed from the reset inputArrays[mat]

        size_t buf= mat+1;          // buffer 0 is command buffer, buffers[1..n] correspond to inputs[0..n-1]
        assert(buf < NUM_BUFS);

        asDoubles[mat] = reinterpret_cast<double*>(shmIpc[buf]->get());

        setInputMatrixToAlgebraDefault(asDoubles[mat], bufNumElem[buf]);  // note asDoubles[CC] is input and output to/from ScaLAPACK
        extractArrayToScaLAPACK(tmpRedistedInput, asDoubles[mat], DESC[mat],NPROW, NPCOL, MYPROW, MYPCOL, query);

        if(wasConverted) {
            SynchableArray* syncArray = safe_dynamic_cast<SynchableArray*>(tmpRedistedInput.get());
            syncArray->sync();
        }
        // free potentially large amount of memory, e.g. when inputArrays[mat] was significantly memory-materialized
        inputArrays[mat].reset();
        tmpRedistedInput.reset(); // and drop this array before iterating on the loop to the next repart/redist

        if(DBG_REFORMAT) { // check that the reformat worked correctly
            for(size_t ii=0; ii < matrixLocalSize[mat]; ii++) {
                LOG4CXX_DEBUG(logger, "GEMMPhysical::invokeMPI():"
                                      << " @myPPos("<< MYPROW << "," << MYPCOL << ")"
                                      << " array["<<mat<<"]["<<ii<<"] = " << asDoubles[mat][ii]);
            }
        }
    }
    size_t resultShmIpcIndx = BUF_MAT_CC;           // by default, GEMM assumes it will return something for C
                                                    // but this will change if we find we don't participate in the output
    shmSharedPtr_t Cx(shmIpc[resultShmIpcIndx]);

    //
    //.... Call pdgemm to compute the product of A and B .............................
    //
    LOG4CXX_DEBUG(logger, "GEMMPhysical::invokeMPI(): calling pdgemm_ M,N,K:" << size[CC][R] << ","
                                                                  << size[CC][C] << ","
                                                                  << K
                           << " MB,NB:" << MB_NB[AA][R] << "," << MB_NB[AA][C]);

    if(DBG_CERR) std::cerr << "GEMMPhysical::invokeMPI(): calling pdgemm to compute" << std::endl;
    std::shared_ptr<MpiSlaveProxy> slave = _ctx->getSlave(_launchId);

    slpp::int_t MYPE = slpp::int_cast(query->getInstanceID()) ;  // we map 1-to-1 between instanceID and MPI rank
    slpp::int_t INFO = DEFAULT_BAD_INFO ;
    pdgemmMaster(query.get(), _ctx, slave, _ipcName, shmIpc[BUF_ARGS]->get(),
                 NPROW, NPCOL, MYPROW, MYPCOL, MYPE,
                 getTransposeCode(options.transposeA), getTransposeCode(options.transposeB),
                 slpp::int_cast(size[CC][R]),
                 slpp::int_cast(size[CC][C]), K,
                 &options.alpha,
                 asDoubles[AA], one, one, DESC[AA],
                 asDoubles[BB], one, one, DESC[BB],
                 &options.beta,
                 asDoubles[CC], one, one, DESC[CC],
                 INFO);
    raiseIfBadResultInfo(INFO, "pdgemm");

    boost::shared_array<char> resPtrDummy(reinterpret_cast<char*>(NULL));
    typedef scidb::ReformatFromScalapack<shmSharedPtr_t> reformatOp_t ;

    if(logger->isTraceEnabled()) {
        LOG4CXX_TRACE(logger, "GEMMPhysical::invokeMPI():--------------------------------------");
        LOG4CXX_TRACE(logger, "GEMMPhysical::invokeMPI(): sequential values from 'C' memory");
        for(size_t ii=0; ii < matrixLocalSize[CC]; ii++) {
            LOG4CXX_TRACE(logger, "GEMMPhysical::invokeMPI(): ("<< MYPROW << "," << MYPCOL << ") C["<<ii<<"] = " << asDoubles[CC][ii]);
        }
        LOG4CXX_TRACE(logger, "GEMMPhysical::invokeMPI(): --------------------------------------");
        LOG4CXX_TRACE(logger, "GEMMPhysical::invokeMPI(): using pdelgetOp to reformat the GEMM output from memory to a scidb array, start");
    }


    //
    // an OpArray is a SplitArray that is filled on-the-fly by calling the operator,
    // so all we have to do is create one with an upper-left corner equal to the
    // global position of the first local block we have.  So we need to map
    // our "processor" coordinate into that position, which we do by multiplying
    // by the chunkSize
    //
    Coordinates first(2);
    first[R] = dimsCC[R].getStartMin() + MYPROW * MB_NB[CC][R];
    first[C] = dimsCC[C].getStartMin() + MYPCOL * MB_NB[CC][C];

    Coordinates last(2);
    last[R] = dimsCC[R].getStartMin() + size[CC][R] - 1;
    last[C] = dimsCC[C].getStartMin() + size[CC][C] - 1;
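
    // Illustrative example (hypothetical values): with a dimension start of 0,
    // MB_NB[CC] = (32,32), and a 2x2 process grid, the instance at
    // (MYPROW,MYPCOL) = (1,0) gets first = (32,0); with iterDelta = (64,64) below,
    // it owns the row-block stripes beginning at global rows 32, 96, 160, ...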

    std::shared_ptr<Array> result;
    // the process grid may be larger than the size of the output in chunks... e.g. multiplying A(1x100) * B(100x1) -> C(1x1)
    bool isParticipatingInOutput = first[R] <= last[R] && first[C] <= last[C] ;
    if (isParticipatingInOutput) {
        // there is in fact some output in our shared memory... hook it up to an OpArray
        Coordinates iterDelta(2);
        iterDelta[0] = NPROW * MB_NB[CC][R];
        iterDelta[1] = NPCOL * MB_NB[CC][C];

        LOG4CXX_DEBUG(logger, "GEMMPhysical::invokeMPI(): Creating OpArray from ("<<first[R]<<","<<first[C]<<") to (" << last[R] <<"," <<last[C]<<") delta:"<<iterDelta[R]<<","<<iterDelta[C]);
        reformatOp_t      pdelgetOp(Cx, DESC[CC], dimsCC[R].getStartMin(), dimsCC[C].getStartMin(),
                                    NPROW, NPCOL, MYPROW, MYPCOL);
        result = std::shared_ptr<Array>(new OpArray<reformatOp_t>(outSchema, resPtrDummy, pdelgetOp,
                                                             first, last, iterDelta, query));
        assert(resultShmIpcIndx == BUF_MAT_CC);
    } else {
        LOG4CXX_DEBUG(logger, "GEMMPhysical::invokeMPI(): instance participated, but produces no output: creating empty MemArray: first ("<<first[R]<<","<<first[C]<<"), last(" << last[R] <<"," <<last[C]<<")");
        result = std::shared_ptr<Array>(new MemArray(outSchema,query));  // same as when we don't participate at all
        resultShmIpcIndx = shmIpc.size();                   // indicate we don't want to hold on to buffer BUF_MAT_CC after all
    }

    // TODO: common pattern in ScaLAPACK operators: factor to base class
    releaseMPISharedMemoryInputs(shmIpc, resultShmIpcIndx);
    unlaunchMPISlaves();

    LOG4CXX_DEBUG(logger, "GEMMPhysical::invokeMPI() end");

    return result;
}
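
The local-size arithmetic above (LLD, LTD, and the OpArray corner/stride) all follows from ScaLAPACK's NUMROC block-cyclic bookkeeping. Below is a minimal, self-contained sketch of that arithmetic, assuming only standard C++; numrocModel is an illustrative stand-in for scidb_numroc_(), not SciDB's implementation.

#include <cstdio>

// Number of rows (or columns) of an M-element dimension, dealt out in MB-sized
// blocks round-robin over nprow process rows starting at rsrc, that land on
// process row myrow.  Mirrors the semantics documented for ScaLAPACK's NUMROC.
long numrocModel(long M, long MB, long myrow, long rsrc, long nprow) {
    long myDist = (nprow + myrow - rsrc) % nprow;  // my distance from the source row
    long nBlocks = M / MB;                         // number of complete blocks
    long n = (nBlocks / nprow) * MB;               // full rounds of blocks I receive
    long extraBlocks = nBlocks % nprow;            // complete blocks left over
    if (myDist < extraBlocks) {
        n += MB;                                   // I receive one of the leftover blocks
    } else if (myDist == extraBlocks) {
        n += M % MB;                               // I receive the trailing partial block
    }
    return n;
}

int main() {
    // e.g. 10 rows, MB=3, two process rows: blocks are [0..2][3..5][6..8][9];
    // row 0 gets blocks 0 and 2 (6 rows), row 1 gets blocks 1 and 3 (4 rows).
    for (long myrow = 0; myrow < 2; ++myrow) {
        std::printf("process row %ld owns %ld rows\n",
                    myrow, numrocModel(10, 3, myrow, 0, 2));
    }
    return 0;
}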