// gamerect is the base resolution for the game which is scaled with the filter // depending on what resolution has been chosen, it is mostly 320x200 or 320x240 COpenGL::COpenGL(const CVidConfig &VidConfig) : CVideoEngine(VidConfig), m_texparam(GL_TEXTURE_2D), m_GameScaleDim(m_VidConfig.m_GameRect.w*m_VidConfig.m_ScaleXFilter, m_VidConfig.m_GameRect.h*m_VidConfig.m_ScaleXFilter), m_GamePOTScaleDim(getPowerOfTwo(m_GameScaleDim.w), getPowerOfTwo(m_GameScaleDim.h)) {}
bool StGLFontEntry::createTexture(StGLContext& theCtx) { const GLint aMaxSize = theCtx.getMaxTextureSize(); GLint aGlyphsNb = 0; if(myFont->hasCJK() || myFont->hasKorean()) { // italic does not make sense for Chinese // limit overall number of glyphs in the single texture to 4k // (single font file might contain about 20k-50k glyphs) aGlyphsNb = stMin(4000, 2 * myFont->getGlyphsNumber() - GLint(myLastTileId) + 1); } else { // western might contain reg/bold/italic/bolditalic styles // limit overall number of glyphs in the single texture to 1k // (single font file might contain about 6k glyphs for different languages) aGlyphsNb = stMin(1000, 4 * myFont->getGlyphsNumber() - GLint(myLastTileId) + 1); } const GLsizei aTextureSizeX = getPowerOfTwo(aGlyphsNb * myTileSizeX, aMaxSize); const size_t aTilesPerRow = aTextureSizeX / myTileSizeX; GLsizei aTextureSizeY = stMin(getEvenNumber(GLint((aGlyphsNb / aTilesPerRow) + 1) * myTileSizeY), aMaxSize); if(!theCtx.arbNPTW) { aTextureSizeY = getPowerOfTwo(aTextureSizeY, aMaxSize); } stMemZero(&myLastTilePx, sizeof(myLastTilePx)); myLastTilePx.bottom() = myTileSizeY; myTextures.add(new StGLTexture(theCtx.arbTexRG ? GL_R8 : GL_ALPHA)); myFbos.add(new StGLFrameBuffer()); StHandle<StGLTexture>& aTexture = myTextures[myTextures.size() - 1]; StHandle<StGLFrameBuffer>& aFbo = myFbos [myTextures.size() - 1]; if(!aTexture->initTrash(theCtx, aTextureSizeX, aTextureSizeY)) { return false; } aTexture->bind(theCtx); theCtx.core11fwd->glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE); theCtx.core11fwd->glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE); aTexture->unbind(theCtx); // destruction of temporary FBO produces broken texture on Catalyst drivers for unknown reason //StGLFrameBuffer::clearTexture(theCtx, aTexture); #if !defined(GL_ES_VERSION_2_0) if(theCtx.arbTexClear) { theCtx.core11fwd->glPixelStorei(GL_UNPACK_LSB_FIRST, GL_FALSE); theCtx.core11fwd->glPixelStorei(GL_UNPACK_ROW_LENGTH, 0); theCtx.core11fwd->glPixelStorei(GL_UNPACK_ALIGNMENT, 1); const stUByte_t THE_BLACK = 0; theCtx.extAll->glClearTexImage(aTexture->getTextureId(), 0, theCtx.arbTexRG ? GL_RED : GL_ALPHA, GL_UNSIGNED_BYTE, &THE_BLACK); } else if(aFbo->init(theCtx, aTexture, false)) { aFbo->clearTexture(theCtx); } else { ST_ERROR_LOG("Fail to bind " + (theCtx.arbTexRG ? "GL_R8" : "GL_ALPHA8") + " texture to FBO!"); } #else (void )aFbo; #endif return true; }
// gamerect is the base resolution for the game which is scaled with the filter // depending on what resolution has been chosen, it is mostly 320x200 or 320x240 COpenGL::COpenGL(const CVidConfig &VidConfig) : CVideoEngine(VidConfig), m_texparam(GL_TEXTURE_2D), m_aspectratio(m_VidConfig.m_DisplayRect.aspectRatio()), m_GamePOTBaseDim(getPowerOfTwo(m_VidConfig.m_GameRect.w), getPowerOfTwo(m_VidConfig.m_GameRect.h)), m_GamePOTVideoDim(getPowerOfTwo(m_VidConfig.m_DisplayRect.w), getPowerOfTwo(m_VidConfig.m_DisplayRect.h)) {}
bool COpenGL::createSurfaces() { // This function creates the surfaces which are needed for the game. const CRect<Uint16> gamerect = m_VidConfig.m_GameRect; ScrollSurface = createSurface( "ScrollSurface", true, 512, 512, RES_BPP, m_Mode, screen->format ); g_pLogFile->textOut("Blitsurface = creatergbsurface<br>"); BlitSurface = createSurface( "BlitSurface", true, getPowerOfTwo(gamerect.w), getPowerOfTwo(gamerect.h), RES_BPP, m_Mode, screen->format ); g_pLogFile->textOut("FilteredSurface = creatergbsurface<br>"); FilteredSurface = createSurface( "FilteredSurface", true, BlitSurface->w*m_VidConfig.m_ScaleXFilter, BlitSurface->h*m_VidConfig.m_ScaleXFilter, RES_BPP, m_Mode, screen->format ); m_dst_slice = FilteredSurface->w*screen->format->BytesPerPixel; if(m_VidConfig.m_ScaleXFilter == 1) { FXSurface = createSurface( "FXSurface", true, getPowerOfTwo(gamerect.w), getPowerOfTwo(gamerect.h), RES_BPP, m_Mode, screen->format ); } else { FXSurface = createSurface( "FXSurface", false, gamerect.w, gamerect.h, RES_BPP, m_Mode, screen->format ); //Set surface alpha } g_pGfxEngine->Palette.setFXSurface( FXSurface ); Scaler.setFilterFactor(m_VidConfig.m_ScaleXFilter); Scaler.setFilterType(m_VidConfig.m_normal_scale); Scaler.setDynamicFactor( float(FilteredSurface->w)/float(screen->w), float(FilteredSurface->h)/float(screen->h)); return true; }
// gamerect is the base resolution for the game which is scaled with the filter // depending on what resolution has been chosen, it is mostly 320x200 or 320x240 COpenGL::COpenGL(const CVidConfig &VidConfig, Sint16 *&p_sbufferx, Sint16 *&p_sbuffery) : CVideoEngine(VidConfig, p_sbufferx, p_sbuffery), m_opengl_buffer(NULL), m_texparam(GL_TEXTURE_2D), m_aspectratio(m_VidConfig.m_Resolution.computeAspectRatio()), m_GamePOTBaseDim(getPowerOfTwo(m_VidConfig.m_Gamescreen.w), getPowerOfTwo(m_VidConfig.m_Gamescreen.h)), m_GamePOTVideoDim(getPowerOfTwo(m_VidConfig.m_Resolution.width), getPowerOfTwo(m_VidConfig.m_Resolution.height)) {}
bool COpenGL::createSurfaces() { // This function creates the surfaces which are needed for the game. const SDL_Rect &gamerect = m_VidConfig.m_Gamescreen; ScrollSurface = createSurface( "ScrollSurface", true, 512, 512, m_VidConfig.m_Resolution.depth, m_Mode, screen->format ); g_pLogFile->textOut("Blitsurface = creatergbsurface<br>"); BlitSurface = createSurface( "BlitSurface", true, getPowerOfTwo(gamerect.w), getPowerOfTwo(gamerect.h), m_VidConfig.m_Resolution.depth, m_Mode, screen->format ); m_blitsurface_alloc = true; if(m_VidConfig.m_ScaleXFilter == 1) { FGLayerSurface = createSurface( "FGLayerSurface", true, getPowerOfTwo(gamerect.w), getPowerOfTwo(gamerect.h), m_VidConfig.m_Resolution.depth, m_Mode, screen->format ); FXSurface = createSurface( "FXSurface", true, getPowerOfTwo(gamerect.w), getPowerOfTwo(gamerect.h), m_VidConfig.m_Resolution.depth, m_Mode, screen->format ); } else { FGLayerSurface = createSurface( "FGLayerSurface", false, gamerect.w, gamerect.h, m_VidConfig.m_Resolution.depth, m_Mode, screen->format ); SDL_SetColorKey( FGLayerSurface, SDL_SRCCOLORKEY, SDL_MapRGB(FGLayerSurface->format, 0, 0xFF, 0xFE) ); FXSurface = createSurface( "FXSurface", false, gamerect.w, gamerect.h, m_VidConfig.m_Resolution.depth, m_Mode, screen->format ); //Set surface alpha } SDL_SetAlpha( FGLayerSurface, SDL_SRCALPHA, 225 ); g_pGfxEngine->Palette.setFXSurface( FXSurface ); return true; }
void StGLFrameTextures::increaseSize(StGLContext& theCtx, StGLFrameTexture& theTexture, const GLsizei theTextureSizeX, const GLsizei theTextureSizeY) { // test existing size / new size /// TODO (Kirill Gavrilov#8) we can automatically reduce texture size here if((theTexture.getSizeX() < theTextureSizeX) || (theTexture.getSizeY() < theTextureSizeY) || !theTexture.isValid()) { ST_DEBUG_LOG("Requested texture size (" + theTextureSizeX + 'x' + theTextureSizeY + ") larger than current texture size(" + theTexture.getSizeX() + 'x' + theTexture.getSizeY() + ')'); const GLsizei anOriginalSizeX = theTexture.getSizeX(); const GLsizei anOriginalSizeY = theTexture.getSizeY(); const GLint aMaxTexDim = theCtx.getMaxTextureSize(); GLsizei aNewSizeX = stMin(theTextureSizeX, GLsizei(aMaxTexDim)); GLsizei aNewSizeY = stMin(theTextureSizeY, GLsizei(aMaxTexDim)); if(!theCtx.arbNPTW) { aNewSizeX = getPowerOfTwo(theTextureSizeX, GLsizei(aMaxTexDim)); aNewSizeY = getPowerOfTwo(theTextureSizeY, GLsizei(aMaxTexDim)); } if((aNewSizeY != anOriginalSizeY) || (aNewSizeX != anOriginalSizeX)) { if(!theTexture.initTrash(theCtx, aNewSizeX, aNewSizeY)) { theTexture.initTrash(theCtx, (anOriginalSizeX > 0) ? anOriginalSizeX : 512, (anOriginalSizeY > 0) ? anOriginalSizeY : 512); ST_DEBUG_LOG("FAILED to Increase the texture size to (" + aNewSizeX + 'x' + aNewSizeY + ")!"); } else { ST_DEBUG_LOG("Increase the texture size to (" + aNewSizeX + 'x' + aNewSizeY + ") success!"); } } else { ST_DEBUG_LOG("Not possible to Increase the texture size!"); } } }
bool CVideoEngine::createSurfaces(const GsRect<Uint16> &gamerect) { gLogging.ftextOut("Blitsurface creation of %dx%d!\n<br>", gamerect.w, gamerect.h ); mGameSfc.create(m_Mode, gamerect.w, gamerect.h, RES_BPP, 0x00FF0000, 0x0000FF00, 0x000000FF, 0xFF000000); #if SDL_VERSION_ATLEAST(2, 0, 0) SDL_SetSurfaceBlendMode(mGameSfc.getSDLSurface(), SDL_BLENDMODE_NONE); #endif const int squareSize = getPowerOfTwo( gamerect.h > gamerect.w ? gamerect.h : gamerect.w ); gLogging.ftextOut("ScrollSurface creation of %dx%d!\n<br>", squareSize, squareSize ); // This function creates the surfaces which are needed for the game. ScrollSurface = SDL_CreateRGBSurface( m_Mode, squareSize, squareSize, 32, 0x00FF0000, 0x0000FF00, 0x000000FF, 0x00000000); auto blit = mGameSfc.getSDLSurface(); gLogging.ftextOut("ScreenSurface creation of %dx%d!\n<br>", blit->w, blit->h ); if(m_VidConfig.m_ScaleXFilter > 1) { mFilteredSfc.create(m_Mode, blit->w*m_VidConfig.m_ScaleXFilter, blit->h*m_VidConfig.m_ScaleXFilter, RES_BPP, 0, 0, 0, 0); mpScreenSfc = &mFilteredSfc; } else { mpScreenSfc = &mGameSfc; } initOverlaySurface(blit->w, blit->h); #if SDL_VERSION_ATLEAST(2, 0, 0) mpSdlTexture.reset( SDL_CreateTexture(renderer, SDL_PIXELFORMAT_ARGB8888, SDL_TEXTUREACCESS_STREAMING, gamerect.w*m_VidConfig.m_ScaleXFilter, gamerect.h*m_VidConfig.m_ScaleXFilter) ); #endif return true; }
int main(int argc, char** argv){ srand(time(NULL)); if(argc != 2) { printf("Usage: search [elements]\nExample: scan 10000\n"); return -1; } unsigned long long start_time = time_ms(); int event_amount=2; int elems = atoi(argv[1]); cl_int err; cl_event* events=allocateMemoryForEvent(event_amount); cl_ulong total_downsweep=0,total_hillissteele=0; size_t localWorkGroupSize_downSweep[1]={LOCALSIZE}; //must be power of two size_t globalWorkGroupSize_downSweep[1]={getPowerOfTwo(roundUp(LOCALSIZE,roundUp(LOCALSIZE, elems)/2))}; //calculating size_t localWorkGroupSize_hillissteele[1]={LOCALSIZE}; //must be power of two size_t globalWorkGroupSize_hillissteele[1]={roundUp(LOCALSIZE,elems)}; //calculating worksize int howManyWorkGroups=globalWorkGroupSize_downSweep[0]/LOCALSIZE; //quotient is power of two, since dividend and divisor are power of two int sumBuffer_length_downSweep=howManyWorkGroups; int sumBuffer_length_hillis=getPowerOfTwo(roundUp(LOCALSIZE,elems)/LOCALSIZE); VALUE *data = (VALUE*)malloc(elems*sizeof(VALUE)); VALUE *result_seq=(VALUE*)malloc(elems*sizeof(VALUE)); VALUE *result=(VALUE*)malloc(elems*sizeof(VALUE)); VALUE *result_hillissteele=(VALUE*)malloc(elems*sizeof(VALUE)); VALUE *sum=(VALUE*)malloc(sumBuffer_length_downSweep*sizeof(VALUE)); VALUE *sum_hillis=(VALUE*)malloc(sumBuffer_length_hillis*sizeof(VALUE)); memset(sum_hillis,0,sumBuffer_length_hillis*sizeof(VALUE)); memset(result_seq,0,elems*sizeof(VALUE)); // initialize data set (fill randomly) for(int j=0; j<elems; ++j) { data[j] =rand()%121; } // printResult(data, elems, 4, "INPUT"); /*Sequential Scan*/ for(int i=1; i<elems; i++){ result_seq[i]=result_seq[i-1]+data[i-1]; } // printResult(result_seq, elems, 4, "Sequential Algorithm OUTPUT"); //ocl initialization size_t deviceInfo; cl_context context; cl_command_queue command_queue; cl_device_id device_id = cluInitDevice(CL_DEVICE, &context, &command_queue); clGetDeviceInfo(device_id, CL_DEVICE_MAX_WORK_GROUP_SIZE,sizeof(size_t), &deviceInfo,NULL ); // create memory buffer cl_mem mem_data=clCreateBuffer(context, CL_MEM_READ_ONLY| CL_MEM_USE_HOST_PTR,elems*sizeof(VALUE), data, &err); cl_mem mem_data_hillis=clCreateBuffer(context, CL_MEM_READ_ONLY| CL_MEM_USE_HOST_PTR,elems*sizeof(VALUE), data, &err); cl_mem mem_result=clCreateBuffer(context, CL_MEM_READ_WRITE, elems*sizeof(VALUE), NULL,&err); cl_mem mem_result_tmp=clCreateBuffer(context, CL_MEM_READ_WRITE, elems*sizeof(VALUE), NULL,&err); cl_mem mem_sum=clCreateBuffer(context, CL_MEM_READ_WRITE, sumBuffer_length_downSweep*sizeof(VALUE), NULL, &err); cl_mem mem_sum_hillis=clCreateBuffer(context, CL_MEM_READ_WRITE, sumBuffer_length_hillis*sizeof(VALUE), NULL, &err); CLU_ERRCHECK(err, "Failed to create Buffer"); err=clEnqueueWriteBuffer(command_queue, mem_sum_hillis, CL_TRUE, 0, sumBuffer_length_hillis*sizeof(VALUE), sum_hillis, 0, NULL, NULL); CLU_ERRCHECK(err, "Failed to write values into mem_sum"); // create kernel from source char tmp[1024]; sprintf(tmp,"-DVALUE=%s", EXPAND_AND_QUOTE(VALUE)); cl_program program = cluBuildProgramFromFile(context, device_id, KERNEL_FILE_NAME, tmp); cl_kernel kernel_downSweep = clCreateKernel(program, "prefix_scan_downSweep", &err); cl_kernel kernel_hillissteele=clCreateKernel(program, "prefix_scan_hillissteele", &err); cl_kernel kernel_last_stage= clCreateKernel(program, "prefix_scan_last_stage", &err); CLU_ERRCHECK(err,"Could not load source program"); /*-------------------------------------DOWNSWEEP-----------------------------------------------*/ // set arguments int border=elems/2; int flag=1; cluSetKernelArguments(kernel_downSweep, 6, sizeof(cl_mem), (void *)&mem_data, sizeof(cl_mem), (void*)&mem_result, sizeof(cl_mem), (void*)&mem_sum,sizeof(VALUE)*LOCALSIZE*2, NULL, sizeof(int), (void*)&border, sizeof(int), (void*)&flag); //execute kernel CLU_ERRCHECK(clEnqueueNDRangeKernel(command_queue, kernel_downSweep, 1, NULL, globalWorkGroupSize_downSweep, localWorkGroupSize_downSweep, 0, NULL, &(events[1])), "DownSweep_Failed to enqueue 2D kernel"); clFinish(command_queue); total_downsweep+=getProfileTotalTime(events,1); //read values back from device CLU_ERRCHECK(clEnqueueReadBuffer(command_queue, mem_result, CL_TRUE, 0, elems*sizeof(VALUE), result, 0, NULL, NULL),"DownSweep_Failed to read Result Values"); /* CLU_ERRCHECK(clEnqueueReadBuffer(command_queue, mem_sum, CL_TRUE, 0, sumBuffer_length_downSweep*sizeof(VALUE), sum, 0, NULL, NULL),"Failed to read Sum Values"); clFinish(command_queue); printSumBuffer(sum, sumBuffer_length_downSweep,"DOWNSWEEP SUM"); */ err=clEnqueueCopyBuffer(command_queue, mem_result, mem_result_tmp, 0, 0, elems*sizeof(VALUE),0,NULL,NULL); CLU_ERRCHECK(err,"DownSweep_Failed during copying buffer"); /*+++++++++++++++++++++++++++++++++DOWNSWEEP-ON-SUM-BUFFER+++++++++++++++++++++++++++++++++++++++*/ flag=0; border=sumBuffer_length_downSweep/2; //since sumbuffer_length is power of two no further adaption is needed cluSetKernelArguments(kernel_downSweep, 6, sizeof(cl_mem), (void *)&mem_sum, sizeof(cl_mem), (void*)&mem_sum, sizeof(cl_mem), (void*)&mem_sum,sizeof(VALUE)*sumBuffer_length_downSweep, NULL, sizeof(int), (void*)&border, sizeof(int), (void*)&flag); howManyWorkGroups>1 ? globalWorkGroupSize_downSweep[0]=howManyWorkGroups/2:howManyWorkGroups; //if 1 workgroup make adaption howManyWorkGroups>1 ? localWorkGroupSize_downSweep[0]=howManyWorkGroups/2:howManyWorkGroups; //if 1 workgroup make adaption //execute kernel CLU_ERRCHECK(clEnqueueNDRangeKernel(command_queue, kernel_downSweep, 1, NULL, globalWorkGroupSize_downSweep, localWorkGroupSize_downSweep, 0, NULL,&(events[1])), "DownSweep_Failed to enqueue 2D kernel"); clFinish(command_queue); total_downsweep+=getProfileTotalTime(events,1); /* CLU_ERRCHECK(clEnqueueReadBuffer(command_queue, mem_sum, CL_TRUE, 0, sumBuffer_length_downSweep*sizeof(VALUE), sum, 0, NULL, NULL),"Failed to read Sum Values"); printSumBuffer(sum, sumBuffer_length_downSweep,"DOWNSWEEP SUM PREFIX"); */ /*+++++++++++++++++++++++++++++++++DOWNSWEEP-LAST-STAGE(Add Sums)++++++++++++++++++++++++++++++++++++++++*/ border=sumBuffer_length_downSweep; flag=1; cluSetKernelArguments(kernel_last_stage, 4, sizeof(cl_mem), (void *)&mem_result_tmp, sizeof(cl_mem), (void*)&mem_sum, sizeof(int), (void*)&border, sizeof(int), (void*)&flag); globalWorkGroupSize_downSweep[0]=getPowerOfTwo(roundUp(LOCALSIZE,roundUp(LOCALSIZE, elems)/2)); localWorkGroupSize_downSweep[0]=LOCALSIZE; //printf("GLOBALSIZE: %d\tLOCALSIZE %d\n",globalWorkGroupSize[0],localWorkGroupSize[0]); //execute kernel CLU_ERRCHECK(clEnqueueNDRangeKernel(command_queue, kernel_last_stage, 1, NULL, globalWorkGroupSize_downSweep, localWorkGroupSize_downSweep, 0, NULL, &(events[1])), "DownSweep_Failed to enqueue 2D kernel"); clFinish(command_queue); total_downsweep+=getProfileTotalTime(events,1); //read values back from device CLU_ERRCHECK(clEnqueueReadBuffer(command_queue, mem_result_tmp, CL_TRUE, 0, elems*sizeof(VALUE), result, 0, NULL, NULL),"DownSweep_Failed to read Result Values"); /*---------------------------------------HILLISSTEELE----------------------------------------------------------*/ flag=1; border=elems; cluSetKernelArguments(kernel_hillissteele, 6, sizeof(cl_mem), (void *)&mem_data_hillis, sizeof(cl_mem), (void*)&mem_result, sizeof(cl_mem), (void*)&mem_sum_hillis,sizeof(VALUE)*LOCALSIZE*2, NULL, sizeof(int), (void*)&border, sizeof(int), (void*)&flag); //execute kernel //printf("GlobalSize: %d\tLocalWorkGroupSize: %d\n",globalWorkGroupSize[0], localWorkGroupSize[0]); //printf("Amount of WorkGroups: %d\n", globalWorkGroupSize[0]/localWorkGroupSize[0]); CLU_ERRCHECK(clEnqueueNDRangeKernel(command_queue, kernel_hillissteele, 1, NULL, globalWorkGroupSize_hillissteele, localWorkGroupSize_hillissteele, 0, NULL, &(events[0])), "Hillissteele_Failed to enqueue 2D kernel_Inputbuffer"); clFinish(command_queue); total_hillissteele+=getProfileTotalTime(events,0); //read values back from device /* CLU_ERRCHECK(clEnqueueReadBuffer(command_queue, mem_result, CL_TRUE, 0, elems*sizeof(VALUE), result_hillissteele, 0, NULL, NULL),"Failed to read Result Values"); CLU_ERRCHECK(clEnqueueReadBuffer(command_queue, mem_sum_hillis, CL_TRUE, 0, sumBuffer_length_hillis*sizeof(VALUE), sum_hillis, 0, NULL, NULL),"Failed to read Sum_1 Values"); printSumBuffer(sum_hillis, sumBuffer_length_hillis, "HILLISSTEELE SUM"); printResult(result_hillissteele,elems, 4, "HILLISSTEELE Temporary OUTPUT"); */ /*++++++++++++++++++++++++++++++++++++++HILLISSTEELE-ON-SUM-BUFFER+++++++++++++++++++++++++++++++++++++*/ flag=0; border=sumBuffer_length_hillis; cluSetKernelArguments(kernel_hillissteele, 6, sizeof(cl_mem), (void *)&mem_sum_hillis, sizeof(cl_mem), (void*)&mem_sum_hillis, sizeof(cl_mem), (void*)&mem_sum_hillis,sizeof(VALUE)*howManyWorkGroups*2, NULL, sizeof(int), (void*)&border, sizeof(int), (void*)&flag); //execute kernel globalWorkGroupSize_hillissteele[0]=sumBuffer_length_hillis; localWorkGroupSize_hillissteele[0]=sumBuffer_length_hillis; CLU_ERRCHECK(clEnqueueNDRangeKernel(command_queue, kernel_hillissteele, 1, NULL, globalWorkGroupSize_hillissteele, localWorkGroupSize_hillissteele, 0, NULL, &(events[0])), "Hillissteele_Failed to enqueue 2D kernel_Sumbuffer"); clFinish(command_queue); total_hillissteele+=getProfileTotalTime(events,0); //read values back from device /* CLU_ERRCHECK(clEnqueueReadBuffer(command_queue, mem_sum_hillis, CL_TRUE, 0, sumBuffer_length_hillis*sizeof(VALUE), sum_hillis, 0, NULL, NULL),"Failed to read Sum2 Values"); printSumBuffer(sum_hillis, sumBuffer_length_hillis, "HILLISSTEELE SUM PREFIX"); */ /*+++++++++++++++++++++++++++++++++++++HILLISSTEELE-LAST-STAGE(Add Sums)++++++++++++++++++++++++++++++++++++++++*/ flag=0; border=sumBuffer_length_hillis; cluSetKernelArguments(kernel_last_stage, 4, sizeof(cl_mem), (void *)&mem_result, sizeof(cl_mem), (void*)&mem_sum_hillis, sizeof(int), (void*)&border, sizeof(int), (void*)&flag); globalWorkGroupSize_hillissteele[0]=roundUp(LOCALSIZE,elems); localWorkGroupSize_hillissteele[0]=LOCALSIZE; //printf("GLOBALSIZE: %d\tLOCALSIZE %d\n",globalWorkGroupSize[0],localWorkGroupSize[0]); //execute kernel CLU_ERRCHECK(clEnqueueNDRangeKernel(command_queue, kernel_last_stage, 1, NULL, globalWorkGroupSize_hillissteele, localWorkGroupSize_hillissteele, 0, NULL, &(events[0])), "Hillissteele_Failed to enqueue kernel_Last_stage"); clFinish(command_queue); total_hillissteele+=getProfileTotalTime(events,0); //read values back from device CLU_ERRCHECK(clEnqueueReadBuffer(command_queue, mem_result, CL_TRUE, 0, elems*sizeof(VALUE), result_hillissteele, 0, NULL, NULL),"Hillissteele_Failed to read Result Values"); /*-------------------------FINISHED---------------------------------------------*/ //printResult(result_hillissteele, elems, 4, "HILLISSTEELE OUTPUT"); //printResult(result, elems, 4, "IMPROVED IMPLEMENTATION OUTPUT"); //verify results verifyResult(result_seq,result,elems, "Verifying result of DownSweep for bigger array size"); verifyResult(result_seq,result_hillissteele,elems, "Verifying result of HILLISSTEELE for bigger array size"); printProfileInfo(total_downsweep,"Improved Algorithm Time:"); printProfileInfo(total_hillissteele,"Hillis & Steele Time:"); printf("\nDEVICE INFO MAX_WORK_GROUP_SIZE: %d\n", (int) deviceInfo); printf("OCL Device: %s\n", cluGetDeviceDescription(device_id, CL_DEVICE)); printf("Done, took %16llu ms\n", time_ms()-start_time); // finalization for(int i=0; i<event_amount; i++){ clReleaseEvent(events[i]); } err = clFinish(command_queue); err |= clReleaseKernel(kernel_downSweep); err |= clReleaseKernel(kernel_last_stage); err |= clReleaseKernel(kernel_hillissteele); err |= clReleaseProgram(program); err |= clReleaseMemObject(mem_data); err |= clReleaseMemObject(mem_data_hillis); err |= clReleaseMemObject(mem_result); err |= clReleaseMemObject(mem_result_tmp); err |= clReleaseMemObject(mem_sum); err |= clReleaseMemObject(mem_sum_hillis); err |= clReleaseCommandQueue(command_queue); err |= clReleaseContext(context); CLU_ERRCHECK(err, "Failed during ocl cleanup"); free(events); free(result); free(result_hillissteele); free(result_seq); free(sum); free(sum_hillis); return EXIT_SUCCESS; }