Block GPUTransientMemoryAllocator::allocate(GPUProgressTracking & progress, GPUTransientChunk & chunk, size_t size, size_t alignment, SeqNum cmdList)
{
    // Pad the requested size so the returned block is itself a whole
    // multiple of the alignment.
    size = roundUpToMultiple(size, alignment);

    auto &freeSpace = chunk.m_free;

#if 0
    XOR_GPU_TRANSIENT_VERBOSE("Trying to allocate %zu for list %lld in existing chunk (%lld, %lld).\n", size, cmdList, freeSpace.begin, freeSpace.end);
#endif

    // Fast path: the chunk already bound to this command list still has
    // room, so just carve the allocation off its front.
    if (auto fit = freeSpace.fitAtBegin(size, alignment))
    {
        freeSpace.begin = fit.end;
#if 0
        XOR_GPU_TRANSIENT_VERBOSE(" Allocation successful. Chunk is now (%lld, %lld).\n", freeSpace.begin, freeSpace.end);
#endif
        return fit;
    }

    // Slow path: the current chunk is exhausted — claim a fresh chunk for
    // this command list and allocate from its start.
    XOR_GPU_TRANSIENT_VERBOSE(" Existing chunk cannot hold allocation, getting new chunk for list %lld.\n", cmdList);
    XOR_CHECK(size <= static_cast<size_t>(m_chunkSize), "Allocation does not fit in one chunk");

    ChunkNumber fresh = findFreeChunk(progress);
    XOR_CHECK(fresh >= 0, "There are no free or waitable chunks.");
    // Record that this chunk is now owned by cmdList so it can be
    // reclaimed once the list has retired.
    m_usedChunks.emplace_back(cmdList, fresh);

    int64_t chunkBegin = fresh * m_chunkSize;
    freeSpace = Block(chunkBegin, chunkBegin + m_chunkSize);

    auto fit = freeSpace.fitAtBegin(size, alignment);
    XOR_ASSERT(fit.valid(), "Allocation failed with an empty chunk");
    freeSpace.begin = fit.end;
    return fit;
}
static size_t collDefBufferSizes() { int *streamIndexList, vlistID, nvars, varID, iorank; int modelID; size_t sumGetBufferSizes = 0; int rankGlob = commInqRankGlob (); int nProcsModel = commInqNProcsModel (); int root = commInqRootGlob (); xassert(rxWin != NULL); unsigned nstreams = reshCountType ( &streamOps ); streamIndexList = xmalloc((size_t)nstreams * sizeof (streamIndexList[0])); reshGetResHListOfType ( nstreams, streamIndexList, &streamOps ); for (unsigned streamNo = 0; streamNo < nstreams; streamNo++) { // space required for data vlistID = streamInqVlist ( streamIndexList[streamNo] ); nvars = vlistNvars ( vlistID ); for ( varID = 0; varID < nvars; varID++ ) { iorank = vlistInqVarIOrank ( vlistID, varID ); xassert ( iorank != CDI_UNDEFID ); if ( iorank == rankGlob ) { for ( modelID = 0; modelID < nProcsModel; modelID++ ) { int decoChunk; { int varSize = vlistInqVarSize(vlistID, varID); int nProcsModel = commInqNProcsModel(); decoChunk = (int)ceilf(cdiPIOpartInflate_ * (float)(varSize + nProcsModel - 1) / (float)nProcsModel); } xassert ( decoChunk > 0 ); rxWin[modelID].size += (size_t)decoChunk * sizeof (double) /* re-align chunks to multiple of double size */ + sizeof (double) - 1 /* one header for data record, one for * corresponding part descriptor*/ + 2 * sizeof (struct winHeaderEntry) /* FIXME: heuristic for size of packed Xt_idxlist */ + sizeof (Xt_int) * (size_t)decoChunk * 3; rxWin[modelID].dictSize += 2; } } } // space required for the 3 function calls streamOpen, streamDefVlist, streamClose // once per stream and timestep for all collprocs only on the modelproc root rxWin[root].size += numRPCFuncs * sizeof (struct winHeaderEntry) /* serialized filename */ + MAXDATAFILENAME /* data part of streamDefTimestep */ + (2 * CDI_MAX_NAME + sizeof (taxis_t)); rxWin[root].dictSize += numRPCFuncs; } free ( streamIndexList ); for ( modelID = 0; modelID < nProcsModel; modelID++ ) { /* account for size header */ rxWin[modelID].dictSize += 1; rxWin[modelID].size += 
sizeof (struct winHeaderEntry); rxWin[modelID].size = roundUpToMultiple(rxWin[modelID].size, PIO_WIN_ALIGN); sumGetBufferSizes += (size_t)rxWin[modelID].size; } xassert ( sumGetBufferSizes <= MAXWINBUFFERSIZE ); return sumGetBufferSizes; }