void Slave::renderFrame(Renderer *tiledRenderer,
                        FrameBuffer *fb,
                        const uint32 channelFlags)
{
  Ref<RenderTask> renderTask = new RenderTask;//(fb,tiledRenderer->createRenderJob(fb));
  renderTask->fb = fb;
  renderTask->renderer = tiledRenderer;
  renderTask->numTiles_x = divRoundUp(fb->size.x,TILE_SIZE);
  renderTask->numTiles_y = divRoundUp(fb->size.y,TILE_SIZE);
  renderTask->channelFlags = channelFlags;
  tiledRenderer->beginFrame(fb);

  /*! iw: using a local sync event for now; "in theory" we should be
      able to attach something like a sync event to the frame buffer,
      just trigger the task here, and let somebody else sync on the
      framebuffer once it is needed; alas, I'm currently running into
      some issues with the embree task system when trying to do so,
      and thus am reverting to this fully-synchronous version for now */

  // renderTask->fb->frameIsReadyEvent = TaskScheduler::EventSync();
  TaskScheduler::EventSync sync;
  renderTask->task = embree::TaskScheduler::Task
    (&sync,
     // (&renderTask->fb->frameIsReadyEvent,
     renderTask->_run, renderTask.ptr,
     renderTask->numTiles_x*renderTask->numTiles_y,
     renderTask->_finish, renderTask.ptr,
     "LocalTiledLoadBalancer::RenderTask");
  TaskScheduler::addTask(-1, TaskScheduler::GLOBAL_BACK, &renderTask->task);
  sync.sync();
}
static void calcNrThreads(
    size_t threads[2],
    const SubproblemDim *subdims,
    const PGranularity *pgran,
    const void *args,
    const void *extra)
{
    size_t yLen;    /* length of the "Y" vector */
    const CLBlasKargs *kargs = args;
    unsigned int subgr = pgran->wgSize[0] / (subdims[0].bwidth / subdims[1].bwidth);

    (void)extra;

    yLen = kargs->transA == clblasNoTrans ? kargs->M : kargs->N;
    if (yLen == 0) {
        yLen = 1;   // launch one group to avoid a CL_INVALID_WORK_GROUP_SIZE error
    }

    // each work item handles y1 lines
    threads[0] = divRoundUp(yLen, subdims[1].y) * subgr;
    threads[0] = roundUp(threads[0], pgran->wgSize[0]);
    threads[1] = 0;
}
unsigned int tileVectorsNum(const Tile *tile)
{
    size_t pitch, height;

    pitch = tilePitch(tile);
    height = (tile->trans) ? tile->nrCols : tile->nrRows;

    return (unsigned int)divRoundUp(height * pitch, tile->vecLen);
}
void Master::renderFrame(Renderer *tiledRenderer,
                         FrameBuffer *fb,
                         const uint32 channelFlags)
{
  int rc;
  MPI_Status status;

  // mpidevice already sent the 'cmd_render_frame' event; we
  // only have to wait for tiles...
  const size_t numTiles
    = divRoundUp(fb->size.x,TILE_SIZE)
    * divRoundUp(fb->size.y,TILE_SIZE);

  assert(fb->colorBufferFormat == OSP_RGBA_I8);
  uint32 rgba_i8[TILE_SIZE][TILE_SIZE];

  for (int i=0;i<numTiles;i++) {
    box2ui region;
    // printf("#m: receiving tile %i\n",i);
    rc = MPI_Recv(&region,4,MPI_INT,MPI_ANY_SOURCE,MPI_ANY_TAG,
                  mpi::worker.comm,&status);
    Assert(rc == MPI_SUCCESS);
    // printf("#m: received tile %i (%i,%i) from %i\n",i,
    //        tile.region.lower.x,tile.region.lower.y,status.MPI_SOURCE);
    rc = MPI_Recv(&rgba_i8[0],TILE_SIZE*TILE_SIZE,MPI_INT,
                  status.MPI_SOURCE,status.MPI_TAG,mpi::worker.comm,&status);
    Assert(rc == MPI_SUCCESS);

    ospray::LocalFrameBuffer *lfb = (ospray::LocalFrameBuffer *)fb;
    for (int iy=region.lower.y;iy<region.upper.y;iy++)
      for (int ix=region.lower.x;ix<region.upper.x;ix++) {
        ((uint32*)lfb->colorBuffer)[ix+iy*lfb->size.x]
          = rgba_i8[iy-region.lower.y][ix-region.lower.x];
      }
  }
  // printf("#m: master done fb %lx\n",fb);
}
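The master-side loop above implies a matching per-tile protocol on the worker side: one message carrying the tile's screen-space region, followed by one carrying its packed pixels, paired via status.MPI_SOURCE / status.MPI_TAG. A minimal sketch of such a sender follows; the function name sendTileToMaster, the destination rank, and the tag argument are assumptions for illustration, not the original source.

/* Hypothetical worker-side counterpart to the two MPI_Recv calls above.
 * Assumes box2ui is four ints and the tile is RGBA_I8, one uint32 per pixel;
 * the destination rank and communicator are placeholders. */
void sendTileToMaster(const box2ui &region,
                      const uint32 rgba_i8[TILE_SIZE][TILE_SIZE],
                      int tag)
{
  int rc;
  // message 1: the tile's region (lower.x, lower.y, upper.x, upper.y)
  rc = MPI_Send((void*)&region, 4, MPI_INT,
                /*master rank, assumed*/0, tag, mpi::worker.comm);
  Assert(rc == MPI_SUCCESS);
  // message 2: the pixel payload, TILE_SIZE*TILE_SIZE packed RGBA8 values,
  // sent with the same tag so the master can pair it with the region message
  rc = MPI_Send((void*)&rgba_i8[0], TILE_SIZE*TILE_SIZE, MPI_INT,
                /*master rank, assumed*/0, tag, mpi::worker.comm);
  Assert(rc == MPI_SUCCESS);
}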
bool FileHeader::Allocate(PersistentBitmap *freeMap, int fileSize)
{
    numBytes = fileSize;
    numSectors = divRoundUp(fileSize, kernel->diskSectorSize);
    if (freeMap->NumClear() < numSectors)
        return FALSE;   // not enough space

    for (int i = 0; i < numSectors; i++) {
        dataSectors[i] = freeMap->FindAndSet();
        // since we checked that there was enough free space,
        // we expect this to succeed
        ASSERT(dataSectors[i] >= 0);
    }
    return TRUE;
}
static void calcNrThreads(
    size_t threads[2],
    const SubproblemDim *subdims,
    const PGranularity *pgran,
    const void *args,
    const void *extra)
{
    const CLBlasKargs *kargs = args;
    unsigned int subgr = subdims[0].bwidth / subdims[1].bwidth;

    (void)extra;

    // each work item handles y1 lines
    threads[0] = divRoundUp(kargs->N, subdims[1].y) * subgr;
    threads[0] = roundUp(threads[0], pgran->wgSize[0]);
    threads[1] = 0;
}
void LocalFrameBuffer::clear(const uint32 fbChannelFlags)
{
  if (fbChannelFlags & OSP_FB_ACCUM) {
    // it is only necessary to reset the accumID;
    // LocalFrameBuffer_accumulateTile takes care of clearing the
    // accumulation buffers
    memset(tileAccumID, 0, tiles*sizeof(int32));

    // always also clear the error buffer (if present)
    if (hasVarianceBuffer) {
      for (int i = 0; i < tiles; i++)
        tileErrorBuffer[i] = inf;

      errorRegion.clear();
      // initially create one region covering the complete image
      errorRegion.push_back(box2i(vec2i(0),
                                  vec2i(tilesx, divRoundUp(size.y, TILE_SIZE))));
    }
  }
}
template <typename T>
T roundUpToMultiple(T value, T multiplier)
{
  return divRoundUp(value, multiplier) * multiplier;
}
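Every snippet in this section leans on the same two ceiling helpers, whose definitions are not shown here. A minimal sketch, assuming non-negative integer arguments and the usual (a + b - 1) / b ceiling-division idiom, could look like this; the template form is an assumption, not the original code:

// Sketch of the helpers assumed above (not the original definitions).
// divRoundUp(a, b): smallest integer q such that q * b >= a.
// roundUp(a, b):    smallest multiple of b that is >= a.
template <typename T>
inline T divRoundUp(T a, T b)
{
  return (a + b - 1) / b;   // assumes b > 0 and that a + b - 1 does not overflow
}

template <typename T>
inline T roundUp(T a, T b)
{
  return divRoundUp(a, b) * b;
}

// e.g. divRoundUp(1920, 64) == 30, divRoundUp(1921, 64) == 31,
//      roundUp(100, 64) == 128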
uint32_t crearSegmento(uint32_t pid, size_t size, t_msg_id* id)
{
    uint16_t numSegmento;
    uint16_t cantPaginas = divRoundUp(size, PAG_SIZE);

    if (size > SEG_MAX_SIZE) {
        *id = INVALID_SEG_SIZE;
        return 0;
    }

    if (CantPaginasDisponibles >= cantPaginas) {
        int pag;
        t_segmento *tablaLocal;
        char *stringPID = string_uitoa(pid);

        if ((tablaLocal = tablaDelProceso(stringPID)) == NULL) {
            /* Create the local table for process PID */
            dictionary_put(TablaSegmentosGlobal, stringPID,
                           tablaLocal = malloc(NUM_SEG_MAX * sizeof(t_segmento)));

            /* Initialize the local segment table */
            for (numSegmento = 0; numSegmento < NUM_SEG_MAX; ++numSegmento)
                tablaLocal[numSegmento].limite = 0;
        }

        if ((numSegmento = primerEntradaLibre(tablaLocal)) == NUM_SEG_MAX) {
            *id = MAX_SEG_NUM_REACHED;
            return 0;
        }

        CantPaginasDisponibles -= cantPaginas;
        tablaLocal[numSegmento].limite = size;
        tablaLocal[numSegmento].bytesOcupados = 0;
        tablaLocal[numSegmento].tablaPaginas = malloc(cantPaginas * sizeof(t_pagina));

        /* Mark the pages as NOT resident in main memory */
        for (pag = 0; pag < cantPaginas && CantPaginasEnSwapDisponibles;
             ++pag, --CantPaginasEnSwapDisponibles)
            paginaEnMemoria(tablaLocal, numSegmento, pag) = false;

        if (CantPaginasEnSwapDisponibles == 0) {
            pthread_mutex_lock(&LogMutex);
            log_trace(Logger, "Swap space full.");
            pthread_mutex_unlock(&LogMutex);
        }

        /* If swap fills up, reserve main-memory frames for the remaining pages */
        for (; pag < cantPaginas; ++pag, --CantPaginasEnMemoriaDisponibles) {
            MemoriaPrincipal[marcoDePagina(tablaLocal, numSegmento, pag) = marcoVacio()].ocupado = true;
            MemoriaPrincipal[marcoDePagina(tablaLocal, numSegmento, pag)].pid = pid;
            paginaEnMemoria(tablaLocal, numSegmento, pag) = true;
            AgregarPaginaAEstructuraSustitucion(pid, numSegmento, pag);

            pthread_mutex_lock(&LogMutex);
            log_trace(Logger, "Frame %u assigned to process %s.",
                      marcoDePagina(tablaLocal, numSegmento, pag), stringPID);
            pthread_mutex_unlock(&LogMutex);
        }

        free(stringPID);

        if (CantPaginasEnMemoriaDisponibles == 0) {
            pthread_mutex_lock(&LogMutex);
            log_trace(Logger, "Main memory full.");
            pthread_mutex_unlock(&LogMutex);
        }
    } else {
        *id = FULL_MEMORY;
        return 0;
    }

    *id = OK_CREATE;
    return generarDireccionLogica(numSegmento, 0, 0);
}
LocalFrameBuffer::LocalFrameBuffer(const vec2i &size,
                                   ColorBufferFormat colorBufferFormat,
                                   bool hasDepthBuffer,
                                   bool hasAccumBuffer,
                                   bool hasVarianceBuffer,
                                   void *colorBufferToUse)
  : FrameBuffer(size, colorBufferFormat, hasDepthBuffer,
                hasAccumBuffer, hasVarianceBuffer)
{
  Assert(size.x > 0);
  Assert(size.y > 0);
  if (colorBufferToUse)
    colorBuffer = colorBufferToUse;
  else {
    switch (colorBufferFormat) {
    case OSP_FB_NONE:
      colorBuffer = NULL;
      break;
    case OSP_FB_RGBA8:
    case OSP_FB_SRGBA:
      // 8-bit formats pack one pixel into a single uint32
      colorBuffer = (uint32*)alignedMalloc(sizeof(uint32)*size.x*size.y);
      break;
    case OSP_FB_RGBA32F:
      // float format needs a full vec4f per pixel
      colorBuffer = (vec4f*)alignedMalloc(sizeof(vec4f)*size.x*size.y);
      break;
    default:
      throw std::runtime_error("color buffer format not supported");
    }
  }

  if (hasDepthBuffer)
    depthBuffer = (float*)alignedMalloc(sizeof(float)*size.x*size.y);
  else
    depthBuffer = NULL;

  if (hasAccumBuffer)
    accumBuffer = (vec4f*)alignedMalloc(sizeof(vec4f)*size.x*size.y);
  else
    accumBuffer = NULL;

  tilesx = divRoundUp(size.x, TILE_SIZE);
  tiles = tilesx * divRoundUp(size.y, TILE_SIZE);
  tileAccumID = new int32[tiles];
  memset(tileAccumID, 0, tiles*sizeof(int32));

  if (hasVarianceBuffer) {
    varianceBuffer = (vec4f*)alignedMalloc(sizeof(vec4f)*size.x*size.y);
    tileErrorBuffer = new float[tiles];
    // maximum number of regions, reached when all regions of size 3 are split in half
    errorRegion.reserve(divRoundUp(tiles*2, 3));
  } else {
    varianceBuffer = NULL;
    tileErrorBuffer = NULL;
  }

  ispcEquivalent = ispc::LocalFrameBuffer_create(this, size.x, size.y,
                                                 colorBufferFormat,
                                                 colorBuffer,
                                                 depthBuffer,
                                                 accumBuffer,
                                                 varianceBuffer,
                                                 tileAccumID,
                                                 tileErrorBuffer);
}
static int genLoopsK(
    struct KgenContext *ctx,
    BlasGenSettings *gset,
    TileMulOpts *mulOpts,
    char *tmp)
{
    KernelExtraFlags kflags = gset->kextra->flags;
    const size_t y0 = gset->subdims[0].y;
    const size_t bwidth = gset->subdims[1].bwidth;
    int ret;
    bool isRel = false;
    const char *inTypeNameA, *inPtrNameA, *inTypeNameB, *inPtrNameB;

    getVectorTypeName(gset->kextra->dtype, gset->kextra->vecLenA,
                      &inTypeNameA, &inPtrNameA);
    getVectorTypeName(gset->kextra->dtype, gset->kextra->vecLenB,
                      &inTypeNameB, &inPtrNameB);

    sprintf(tmp, "uint k0;\n");
    kgenAddStmt(ctx, tmp);

    if (!(kflags & (KEXTRA_TAILS_M_LOWER |
                    KEXTRA_TAILS_N_LOWER |
                    KEXTRA_TAILS_K_LOWER))) {
        FetchAddrMode addrMode = FETCH_ADDR_A_RELATIVE |
                                 FETCH_ADDR_B_RELATIVE |
                                 FETCH_ADDR_K_RELATIVE;

        isRel = true;
        mulOpts->fctx = createFetchContext();
        if (mulOpts->fctx == NULL) {
            return -ENOMEM;
        }
        setFetchAddrMode(mulOpts->fctx, addrMode);
        gset->varNames.A = "pA";
        gset->varNames.B = "pB";
    }
    else {
        gset->flags |= BGF_UPTRS;
        kgenPrintf(ctx,
                   "GPtr Ag, Bg;\n"
                   "\n"
                   "Ag.%s = A;\n"
                   "Bg.%s = B;\n\n",
                   inPtrNameA, inPtrNameB);
    }

    if (isMatrixUpper(kflags)) {
        if (isRel) {
            switch ((((gset->kextra->flags & KEXTRA_TRANS_A) != 0) << 1) |
                    (((gset->kextra->flags & KEXTRA_UPPER_TRIANG) != 0) ^
                     ((gset->kextra->flags & KEXTRA_COLUMN_MAJOR) != 0))) {
            case 0:
                kgenPrintf(ctx,
                    "__global %s *pA = (__global %s *)&A[mad24(coord.z, lda, coord.y)];\n"
                    "__global %s *pB = (__global %s *)&B[mad24(coord.x, ldb, coord.z)];\n",
                    inTypeNameA, inTypeNameA, inTypeNameB, inTypeNameB);
                break;
            case 1:
                kgenPrintf(ctx,
                    "__global %s *pA = (__global %s *)&A[mad24(coord.y, lda, coord.z)];\n"
                    "__global %s *pB = (__global %s *)&B[mad24(coord.z, ldb, coord.x)];\n",
                    inTypeNameA, inTypeNameA, inTypeNameB, inTypeNameB);
                break;
            case 2:
                kgenPrintf(ctx,
                    "__global %s *pA = (__global %s *)&A[mad24(coord.z, lda, coord.y)];\n"
                    "__global %s *pB = (__global %s *)&B[mad24(coord.z, ldb, coord.x)];\n",
                    inTypeNameA, inTypeNameA, inTypeNameB, inTypeNameB);
                break;
            case 3:
                kgenPrintf(ctx,
                    "__global %s *pA = (__global %s *)&A[mad24(coord.y, lda, coord.z)];\n"
                    "__global %s *pB = (__global %s *)&B[mad24(coord.x, ldb, coord.z)];\n",
                    inTypeNameA, inTypeNameA, inTypeNameB, inTypeNameB);
                break;
            }
        }

        sprintf(tmp, "for (k0 = kBegin; "
                     "(k0 <= (kBegin + %luu))&&(k0 < M); "
                     "k0 += %lu)",
                y0, bwidth);
        kgenBeginBranch(ctx, tmp);
        kgenPrintf(ctx, "coord.z = k0;\n");
        mulOpts->postFetch = genTrxmPostFetchZero;
        ret = tileMulGen(ctx, gset, mulOpts);
        if (ret != 0) {
            return ret;
        }
        kgenEndBranch(ctx, NULL);

        // main triangle part
        sprintf(tmp, "for (; k0 <= max(0, (int)M - %lu); k0 += %lu)",
                y0, gset->subdims[1].bwidth);
        kgenBeginBranch(ctx, tmp);
        mulOpts->postFetch = NULL;
        ret = tileMulGen(ctx, gset, mulOpts);
        if (ret != 0) {
            return ret;
        }
        kgenEndBranch(ctx, NULL);

        // matrix side part
        // should be calculated by item0 of each subgroup
        sprintf(tmp, "for (; k0 < M; k0 += %lu)", bwidth);
        kgenBeginBranch(ctx, tmp);
        kgenPrintf(ctx, "coord.z = k0;\n");
        resetFetchNumA(mulOpts);
        mulOpts->postFetch = genTrxmPostFetchZero;
        ret = tileMulGen(ctx, gset, mulOpts);
        if (ret != 0) {
            return ret;
        }
        kgenEndBranch(ctx, NULL);
    }
    else { // lower
        size_t diagBlocks;  // number of bw * y blocks that fit in a y * y square

        if (isRel) {
            switch ((((gset->kextra->flags & KEXTRA_TRANS_A) != 0) << 1) |
                    (((gset->kextra->flags & KEXTRA_UPPER_TRIANG) != 0) ^
                     ((gset->kextra->flags & KEXTRA_COLUMN_MAJOR) != 0))) {
            case 0:
                kgenPrintf(ctx,
                    "__global %s *pA = (__global %s *)&A[mad24(coord.y, lda, coord.z)];\n"
                    "__global %s *pB = (__global %s *)&B[mad24(coord.z, ldb, coord.x)];\n",
                    inTypeNameA, inTypeNameA, inTypeNameB, inTypeNameB);
                break;
            case 1:
                kgenPrintf(ctx,
                    "__global %s *pA = (__global %s *)&A[mad24(coord.z, lda, coord.y)];\n"
                    "__global %s *pB = (__global %s *)&B[mad24(coord.x, ldb, coord.z)];\n",
                    inTypeNameA, inTypeNameA, inTypeNameB, inTypeNameB);
                break;
            case 2:
                kgenPrintf(ctx,
                    "__global %s *pA = (__global %s *)&A[mad24(coord.y, lda, coord.z)];\n"
                    "__global %s *pB = (__global %s *)&B[mad24(coord.x, ldb, coord.z)];\n",
                    inTypeNameA, inTypeNameA, inTypeNameB, inTypeNameB);
                break;
            case 3:
                kgenPrintf(ctx,
                    "__global %s *pA = (__global %s *)&A[mad24(coord.z, lda, coord.y)];\n"
                    "__global %s *pB = (__global %s *)&B[mad24(coord.z, ldb, coord.x)];\n",
                    inTypeNameA, inTypeNameA, inTypeNameB, inTypeNameB);
                break;
            }
        }

        diagBlocks = divRoundUp(y0, bwidth);

        sprintf(tmp, "uint iterK = min(currM + %luu, M);\n", y0);
        kgenAddStmt(ctx, tmp);
        sprintf(tmp, "iterK = (iterK + %lu) / %lu;\n", bwidth - 1, bwidth);
        kgenAddStmt(ctx, tmp);

        // main triangle part
        sprintf(tmp, "for (k0 = 0; k0 < max(0, (int)iterK - %lu); k0++)",
                diagBlocks);
        kgenBeginBranch(ctx, tmp);
        mulOpts->postFetch = NULL;  // part without diagonal elements: no post-fetch zeroing
        ret = tileMulGen(ctx, gset, mulOpts);
        if (ret != 0) {
            return ret;
        }
        kgenEndBranch(ctx, NULL);

        // diagonal part
        sprintf(tmp, "for (; k0 < iterK; k0++)");
        kgenBeginBranch(ctx, tmp);
        kgenPrintf(ctx, "coord.z = k0 * %lu;\n", bwidth);
        // diagonal blocks part
        mulOpts->postFetch = genTrxmPostFetchZero;
        resetFetchNumA(mulOpts);
        ret = tileMulGen(ctx, gset, mulOpts);
        if (ret != 0) {
            return ret;
        }
        kgenEndBranch(ctx, NULL);
    }

    if (isRel) {
        destroyFetchContext(mulOpts->fctx);
        mulOpts->fctx = NULL;
    }

    return 0;
}