Esempio n. 1
0
// gamerect is the base resolution for the game which is scaled with the filter
// depending on what resolution has been chosen, it is mostly 320x200 or 320x240
COpenGL::COpenGL(const CVidConfig &VidConfig) :
CVideoEngine(VidConfig),
m_texparam(GL_TEXTURE_2D),
m_GameScaleDim(m_VidConfig.m_GameRect.w*m_VidConfig.m_ScaleXFilter,
				m_VidConfig.m_GameRect.h*m_VidConfig.m_ScaleXFilter),
m_GamePOTScaleDim(getPowerOfTwo(m_GameScaleDim.w), getPowerOfTwo(m_GameScaleDim.h))
{}
Esempio n. 2
0
bool StGLFontEntry::createTexture(StGLContext& theCtx) {
    const GLint aMaxSize = theCtx.getMaxTextureSize();

    GLint aGlyphsNb = 0;
    if(myFont->hasCJK()
    || myFont->hasKorean()) {
        // italic does not make sense for Chinese
        // limit overall number of glyphs in the single texture to 4k
        // (single font file might contain about 20k-50k glyphs)
        aGlyphsNb = stMin(4000, 2 * myFont->getGlyphsNumber() - GLint(myLastTileId) + 1);
    } else {
        // western might contain reg/bold/italic/bolditalic styles
        // limit overall number of glyphs in the single texture to 1k
        // (single font file might contain about 6k glyphs for different languages)
        aGlyphsNb = stMin(1000, 4 * myFont->getGlyphsNumber() - GLint(myLastTileId) + 1);
    }

    const GLsizei aTextureSizeX = getPowerOfTwo(aGlyphsNb * myTileSizeX, aMaxSize);
    const size_t  aTilesPerRow  = aTextureSizeX / myTileSizeX;
    GLsizei aTextureSizeY = stMin(getEvenNumber(GLint((aGlyphsNb / aTilesPerRow) + 1) * myTileSizeY), aMaxSize);
    if(!theCtx.arbNPTW) {
        aTextureSizeY = getPowerOfTwo(aTextureSizeY, aMaxSize);
    }

    stMemZero(&myLastTilePx, sizeof(myLastTilePx));
    myLastTilePx.bottom() = myTileSizeY;

    myTextures.add(new StGLTexture(theCtx.arbTexRG ? GL_R8 : GL_ALPHA));
    myFbos.add(new StGLFrameBuffer());
    StHandle<StGLTexture>&     aTexture = myTextures[myTextures.size() - 1];
    StHandle<StGLFrameBuffer>& aFbo     = myFbos    [myTextures.size() - 1];
    if(!aTexture->initTrash(theCtx, aTextureSizeX, aTextureSizeY)) {
        return false;
    }
    aTexture->bind(theCtx);
    theCtx.core11fwd->glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE);
    theCtx.core11fwd->glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE);
    aTexture->unbind(theCtx);

    // destruction of temporary FBO produces broken texture on Catalyst drivers for unknown reason
    //StGLFrameBuffer::clearTexture(theCtx, aTexture);
#if !defined(GL_ES_VERSION_2_0)
    if(theCtx.arbTexClear) {
        theCtx.core11fwd->glPixelStorei(GL_UNPACK_LSB_FIRST,  GL_FALSE);
        theCtx.core11fwd->glPixelStorei(GL_UNPACK_ROW_LENGTH, 0);
        theCtx.core11fwd->glPixelStorei(GL_UNPACK_ALIGNMENT,  1);
        const stUByte_t THE_BLACK = 0;
        theCtx.extAll->glClearTexImage(aTexture->getTextureId(), 0, theCtx.arbTexRG ? GL_RED : GL_ALPHA, GL_UNSIGNED_BYTE, &THE_BLACK);
    } else if(aFbo->init(theCtx, aTexture, false)) {
        aFbo->clearTexture(theCtx);
    } else {
        ST_ERROR_LOG("Fail to bind " + (theCtx.arbTexRG ? "GL_R8" : "GL_ALPHA8") + " texture to FBO!");
    }
#else
    (void )aFbo;
#endif

    return true;
}
Esempio n. 3
0
// gamerect is the base resolution for the game which is scaled with the filter
// depending on what resolution has been chosen, it is mostly 320x200 or 320x240
COpenGL::COpenGL(const CVidConfig &VidConfig) :
CVideoEngine(VidConfig),
m_texparam(GL_TEXTURE_2D),
m_aspectratio(m_VidConfig.m_DisplayRect.aspectRatio()),
m_GamePOTBaseDim(getPowerOfTwo(m_VidConfig.m_GameRect.w),
				getPowerOfTwo(m_VidConfig.m_GameRect.h)),
m_GamePOTVideoDim(getPowerOfTwo(m_VidConfig.m_DisplayRect.w),
				getPowerOfTwo(m_VidConfig.m_DisplayRect.h))
{}
Esempio n. 4
0
bool COpenGL::createSurfaces()
{
	// This function creates the surfaces which are needed for the game.
	const CRect<Uint16> gamerect = m_VidConfig.m_GameRect;
    ScrollSurface = createSurface( "ScrollSurface", true,
								  512, 512,
								  RES_BPP,
								  m_Mode, screen->format );

    g_pLogFile->textOut("Blitsurface = creatergbsurface<br>");

    BlitSurface = createSurface( "BlitSurface", true,
    		getPowerOfTwo(gamerect.w),
    		getPowerOfTwo(gamerect.h),
    		RES_BPP,
    		m_Mode, screen->format );

    g_pLogFile->textOut("FilteredSurface = creatergbsurface<br>");

	FilteredSurface = createSurface( "FilteredSurface", true,
				BlitSurface->w*m_VidConfig.m_ScaleXFilter,
				BlitSurface->h*m_VidConfig.m_ScaleXFilter,
				RES_BPP,
				m_Mode, screen->format );

	m_dst_slice = FilteredSurface->w*screen->format->BytesPerPixel;

	if(m_VidConfig.m_ScaleXFilter == 1)
	{
		FXSurface = createSurface( "FXSurface", true,
						getPowerOfTwo(gamerect.w),
						getPowerOfTwo(gamerect.h),
						RES_BPP,
						m_Mode, screen->format );
	}
	else
	{
		FXSurface = createSurface( "FXSurface", false,
				gamerect.w,
				gamerect.h,
				RES_BPP,
				m_Mode, screen->format );

		//Set surface alpha
	}

	g_pGfxEngine->Palette.setFXSurface( FXSurface );

	Scaler.setFilterFactor(m_VidConfig.m_ScaleXFilter);
	Scaler.setFilterType(m_VidConfig.m_normal_scale);
	Scaler.setDynamicFactor( float(FilteredSurface->w)/float(screen->w),
							 float(FilteredSurface->h)/float(screen->h));


	return true;
}
Esempio n. 5
0
// gamerect is the base resolution for the game which is scaled with the filter
// depending on what resolution has been chosen, it is mostly 320x200 or 320x240
COpenGL::COpenGL(const CVidConfig &VidConfig, Sint16 *&p_sbufferx, Sint16 *&p_sbuffery) :
CVideoEngine(VidConfig, p_sbufferx, p_sbuffery),
m_opengl_buffer(NULL),
m_texparam(GL_TEXTURE_2D),
m_aspectratio(m_VidConfig.m_Resolution.computeAspectRatio()),
m_GamePOTBaseDim(getPowerOfTwo(m_VidConfig.m_Gamescreen.w),
				getPowerOfTwo(m_VidConfig.m_Gamescreen.h)),
m_GamePOTVideoDim(getPowerOfTwo(m_VidConfig.m_Resolution.width),
				getPowerOfTwo(m_VidConfig.m_Resolution.height))
{}
Esempio n. 6
0
bool COpenGL::createSurfaces()
{
	// This function creates the surfaces which are needed for the game.
	const SDL_Rect &gamerect = m_VidConfig.m_Gamescreen;
    ScrollSurface = createSurface( "ScrollSurface", true,
								  512, 512,
								  m_VidConfig.m_Resolution.depth,
								  m_Mode, screen->format );

    g_pLogFile->textOut("Blitsurface = creatergbsurface<br>");

    BlitSurface = createSurface( "BlitSurface", true,
    		getPowerOfTwo(gamerect.w),
    		getPowerOfTwo(gamerect.h),
    		m_VidConfig.m_Resolution.depth,
    		m_Mode, screen->format );
    m_blitsurface_alloc = true;

	if(m_VidConfig.m_ScaleXFilter == 1)
	{
		FGLayerSurface = createSurface( "FGLayerSurface", true,
						getPowerOfTwo(gamerect.w),
						getPowerOfTwo(gamerect.h),
									m_VidConfig.m_Resolution.depth,
									m_Mode, screen->format );
		FXSurface = createSurface( "FXSurface", true,
				getPowerOfTwo(gamerect.w),
				getPowerOfTwo(gamerect.h),
						m_VidConfig.m_Resolution.depth,
						m_Mode, screen->format );
	}
	else
	{
		FGLayerSurface = createSurface( "FGLayerSurface", false,
				gamerect.w,
				gamerect.h,
				m_VidConfig.m_Resolution.depth,
				m_Mode, screen->format );

		SDL_SetColorKey( FGLayerSurface, SDL_SRCCOLORKEY,
		SDL_MapRGB(FGLayerSurface->format, 0, 0xFF, 0xFE) );

		FXSurface = createSurface( "FXSurface", false,
				gamerect.w,
				gamerect.h,
				m_VidConfig.m_Resolution.depth,
				m_Mode, screen->format );

		//Set surface alpha
	}

	SDL_SetAlpha( FGLayerSurface, SDL_SRCALPHA, 225 );
	g_pGfxEngine->Palette.setFXSurface( FXSurface );

	return true;
}
Esempio n. 7
0
void StGLFrameTextures::increaseSize(StGLContext&      theCtx,
                                     StGLFrameTexture& theTexture,
                                     const GLsizei     theTextureSizeX,
                                     const GLsizei     theTextureSizeY) {
    // test existing size / new size
    /// TODO (Kirill Gavrilov#8) we can automatically reduce texture size here
    if((theTexture.getSizeX() < theTextureSizeX) ||
       (theTexture.getSizeY() < theTextureSizeY) ||
       !theTexture.isValid()) {
        ST_DEBUG_LOG("Requested texture size (" + theTextureSizeX + 'x' + theTextureSizeY
                   + ") larger than current texture size(" + theTexture.getSizeX() + 'x' + theTexture.getSizeY() + ')');
        const GLsizei anOriginalSizeX = theTexture.getSizeX();
        const GLsizei anOriginalSizeY = theTexture.getSizeY();

        const GLint aMaxTexDim = theCtx.getMaxTextureSize();
        GLsizei aNewSizeX = stMin(theTextureSizeX, GLsizei(aMaxTexDim));
        GLsizei aNewSizeY = stMin(theTextureSizeY, GLsizei(aMaxTexDim));
        if(!theCtx.arbNPTW) {
            aNewSizeX = getPowerOfTwo(theTextureSizeX, GLsizei(aMaxTexDim));
            aNewSizeY = getPowerOfTwo(theTextureSizeY, GLsizei(aMaxTexDim));
        }
        if((aNewSizeY != anOriginalSizeY)
        || (aNewSizeX != anOriginalSizeX)) {
            if(!theTexture.initTrash(theCtx, aNewSizeX, aNewSizeY)) {
                theTexture.initTrash(theCtx,
                                     (anOriginalSizeX > 0) ? anOriginalSizeX : 512,
                                     (anOriginalSizeY > 0) ? anOriginalSizeY : 512);
                ST_DEBUG_LOG("FAILED to Increase the texture size to (" + aNewSizeX + 'x' +  aNewSizeY + ")!");
            } else {
                ST_DEBUG_LOG("Increase the texture size to (" + aNewSizeX + 'x' +  aNewSizeY + ") success!");
            }
        } else {
            ST_DEBUG_LOG("Not possible to Increase the texture size!");
        }
    }
}
Esempio n. 8
0
bool CVideoEngine::createSurfaces(const GsRect<Uint16> &gamerect)
{
    gLogging.ftextOut("Blitsurface creation of %dx%d!\n<br>",
                     gamerect.w, gamerect.h );


    mGameSfc.create(m_Mode, gamerect.w, gamerect.h, RES_BPP,
                    0x00FF0000,
                    0x0000FF00,
                    0x000000FF,
                    0xFF000000);


#if SDL_VERSION_ATLEAST(2, 0, 0)
    SDL_SetSurfaceBlendMode(mGameSfc.getSDLSurface(), SDL_BLENDMODE_NONE);
#endif


    const int squareSize = getPowerOfTwo( gamerect.h > gamerect.w ? gamerect.h : gamerect.w );

    gLogging.ftextOut("ScrollSurface creation of %dx%d!\n<br>",
                     squareSize, squareSize );

    // This function creates the surfaces which are needed for the game.
    ScrollSurface = SDL_CreateRGBSurface( m_Mode,
                                          squareSize,
                                          squareSize, 32,
                                          0x00FF0000,
                                          0x0000FF00,
                                          0x000000FF,
                                          0x00000000);

    auto blit = mGameSfc.getSDLSurface();

    gLogging.ftextOut("ScreenSurface creation of %dx%d!\n<br>",
                     blit->w, blit->h );


    if(m_VidConfig.m_ScaleXFilter > 1)
    {
        mFilteredSfc.create(m_Mode, blit->w*m_VidConfig.m_ScaleXFilter, blit->h*m_VidConfig.m_ScaleXFilter,
                            RES_BPP, 0, 0, 0, 0);

        mpScreenSfc = &mFilteredSfc;
    }
    else
    {
        mpScreenSfc = &mGameSfc;
    }

    initOverlaySurface(blit->w, blit->h);


#if SDL_VERSION_ATLEAST(2, 0, 0)
    mpSdlTexture.reset( SDL_CreateTexture(renderer,
                                   SDL_PIXELFORMAT_ARGB8888,
                                   SDL_TEXTUREACCESS_STREAMING,
                                   gamerect.w*m_VidConfig.m_ScaleXFilter,
                                   gamerect.h*m_VidConfig.m_ScaleXFilter) );
#endif


    return true;
}
int main(int argc, char** argv){
	
	srand(time(NULL));
	
	if(argc != 2) {
		printf("Usage: search [elements]\nExample: scan 10000\n");
		return -1;
	}
	
	unsigned long long start_time = time_ms();
	int event_amount=2;
	int elems = atoi(argv[1]);
	
	cl_int err;
	cl_event* events=allocateMemoryForEvent(event_amount);
	cl_ulong total_downsweep=0,total_hillissteele=0;
	size_t localWorkGroupSize_downSweep[1]={LOCALSIZE};	//must be power of two
	size_t globalWorkGroupSize_downSweep[1]={getPowerOfTwo(roundUp(LOCALSIZE,roundUp(LOCALSIZE, elems)/2))};	//calculating
	
	size_t localWorkGroupSize_hillissteele[1]={LOCALSIZE};	//must be power of two
	size_t globalWorkGroupSize_hillissteele[1]={roundUp(LOCALSIZE,elems)};	//calculating worksize
	
	
	int howManyWorkGroups=globalWorkGroupSize_downSweep[0]/LOCALSIZE;	//quotient is power of two, since dividend and divisor are power of two
	int sumBuffer_length_downSweep=howManyWorkGroups;
	int sumBuffer_length_hillis=getPowerOfTwo(roundUp(LOCALSIZE,elems)/LOCALSIZE);	

	VALUE *data = (VALUE*)malloc(elems*sizeof(VALUE));
	VALUE *result_seq=(VALUE*)malloc(elems*sizeof(VALUE));
	VALUE *result=(VALUE*)malloc(elems*sizeof(VALUE));
	VALUE *result_hillissteele=(VALUE*)malloc(elems*sizeof(VALUE));
	VALUE *sum=(VALUE*)malloc(sumBuffer_length_downSweep*sizeof(VALUE));
	VALUE *sum_hillis=(VALUE*)malloc(sumBuffer_length_hillis*sizeof(VALUE));
	

	memset(sum_hillis,0,sumBuffer_length_hillis*sizeof(VALUE));
	memset(result_seq,0,elems*sizeof(VALUE));
	
	// initialize data set (fill randomly)
	for(int j=0; j<elems; ++j) {
		data[j] =rand()%121;
	}
	
//	printResult(data, elems, 4, "INPUT");
	
	/*Sequential Scan*/
	for(int i=1; i<elems; i++){
	    result_seq[i]=result_seq[i-1]+data[i-1];
	}
	
//	printResult(result_seq, elems, 4, "Sequential Algorithm OUTPUT");
		
	//ocl initialization
	size_t deviceInfo;
	cl_context context;
	cl_command_queue command_queue;
	cl_device_id device_id = cluInitDevice(CL_DEVICE, &context, &command_queue);
	clGetDeviceInfo(device_id, CL_DEVICE_MAX_WORK_GROUP_SIZE,sizeof(size_t), &deviceInfo,NULL );
  
	
	// create memory buffer
	cl_mem mem_data=clCreateBuffer(context, CL_MEM_READ_ONLY| CL_MEM_USE_HOST_PTR,elems*sizeof(VALUE), data, &err);
 	cl_mem mem_data_hillis=clCreateBuffer(context, CL_MEM_READ_ONLY| CL_MEM_USE_HOST_PTR,elems*sizeof(VALUE), data, &err);
	cl_mem mem_result=clCreateBuffer(context, CL_MEM_READ_WRITE, elems*sizeof(VALUE), NULL,&err);
	cl_mem mem_result_tmp=clCreateBuffer(context, CL_MEM_READ_WRITE, elems*sizeof(VALUE), NULL,&err);
	cl_mem mem_sum=clCreateBuffer(context, CL_MEM_READ_WRITE, sumBuffer_length_downSweep*sizeof(VALUE), NULL, &err);
	cl_mem mem_sum_hillis=clCreateBuffer(context, CL_MEM_READ_WRITE, sumBuffer_length_hillis*sizeof(VALUE), NULL, &err);
	CLU_ERRCHECK(err, "Failed to create Buffer");
    
	err=clEnqueueWriteBuffer(command_queue, mem_sum_hillis, CL_TRUE, 0, sumBuffer_length_hillis*sizeof(VALUE), sum_hillis, 0, NULL, NULL);
	CLU_ERRCHECK(err, "Failed to write values into mem_sum");
	

	// create kernel from source
	char tmp[1024];
 	sprintf(tmp,"-DVALUE=%s", EXPAND_AND_QUOTE(VALUE));
	cl_program program = cluBuildProgramFromFile(context, device_id, KERNEL_FILE_NAME, tmp);
	cl_kernel kernel_downSweep = clCreateKernel(program, "prefix_scan_downSweep", &err);
	cl_kernel kernel_hillissteele=clCreateKernel(program, "prefix_scan_hillissteele", &err);
	cl_kernel kernel_last_stage= clCreateKernel(program, "prefix_scan_last_stage", &err);
	CLU_ERRCHECK(err,"Could not load source program");
    

	
	/*-------------------------------------DOWNSWEEP-----------------------------------------------*/
	// set arguments
	int border=elems/2;
	int flag=1;
	
	cluSetKernelArguments(kernel_downSweep, 6, sizeof(cl_mem), (void *)&mem_data, sizeof(cl_mem), (void*)&mem_result,
			      sizeof(cl_mem), (void*)&mem_sum,sizeof(VALUE)*LOCALSIZE*2, NULL, sizeof(int), (void*)&border,
			      sizeof(int), (void*)&flag);

	//execute kernel  	     
	CLU_ERRCHECK(clEnqueueNDRangeKernel(command_queue, kernel_downSweep, 1, NULL, globalWorkGroupSize_downSweep, localWorkGroupSize_downSweep, 0, NULL, &(events[1])), "DownSweep_Failed to enqueue 2D kernel");		      
	clFinish(command_queue);
	total_downsweep+=getProfileTotalTime(events,1);
	
	//read values back from device
	CLU_ERRCHECK(clEnqueueReadBuffer(command_queue, mem_result, CL_TRUE, 0, elems*sizeof(VALUE), result, 0, NULL, NULL),"DownSweep_Failed to read Result Values");
	
	/*
	CLU_ERRCHECK(clEnqueueReadBuffer(command_queue, mem_sum, CL_TRUE, 0, sumBuffer_length_downSweep*sizeof(VALUE), sum, 0, NULL, NULL),"Failed to read Sum Values");
	clFinish(command_queue);
	printSumBuffer(sum, sumBuffer_length_downSweep,"DOWNSWEEP SUM");
	*/
	err=clEnqueueCopyBuffer(command_queue, mem_result, mem_result_tmp, 0, 0, elems*sizeof(VALUE),0,NULL,NULL);
	CLU_ERRCHECK(err,"DownSweep_Failed during copying buffer");
	
	
	/*+++++++++++++++++++++++++++++++++DOWNSWEEP-ON-SUM-BUFFER+++++++++++++++++++++++++++++++++++++++*/
	flag=0;
	border=sumBuffer_length_downSweep/2;	//since sumbuffer_length is power of two no further adaption is needed
	cluSetKernelArguments(kernel_downSweep, 6, sizeof(cl_mem), (void *)&mem_sum, sizeof(cl_mem), (void*)&mem_sum,
			      sizeof(cl_mem), (void*)&mem_sum,sizeof(VALUE)*sumBuffer_length_downSweep, NULL, sizeof(int), (void*)&border,
			      sizeof(int), (void*)&flag);
	
	howManyWorkGroups>1 ? globalWorkGroupSize_downSweep[0]=howManyWorkGroups/2:howManyWorkGroups;	//if 1 workgroup make adaption
	howManyWorkGroups>1 ? localWorkGroupSize_downSweep[0]=howManyWorkGroups/2:howManyWorkGroups;	//if 1 workgroup make adaption
	
	
	//execute kernel
	CLU_ERRCHECK(clEnqueueNDRangeKernel(command_queue, kernel_downSweep, 1, NULL, globalWorkGroupSize_downSweep, localWorkGroupSize_downSweep, 0, NULL,&(events[1])), "DownSweep_Failed to enqueue 2D kernel");		      
	clFinish(command_queue);
	total_downsweep+=getProfileTotalTime(events,1);
	/*
	CLU_ERRCHECK(clEnqueueReadBuffer(command_queue, mem_sum, CL_TRUE, 0, sumBuffer_length_downSweep*sizeof(VALUE), sum, 0, NULL, NULL),"Failed to read Sum Values");	
	printSumBuffer(sum, sumBuffer_length_downSweep,"DOWNSWEEP SUM PREFIX");
	*/
	
	/*+++++++++++++++++++++++++++++++++DOWNSWEEP-LAST-STAGE(Add Sums)++++++++++++++++++++++++++++++++++++++++*/
	border=sumBuffer_length_downSweep;
	flag=1;
	cluSetKernelArguments(kernel_last_stage, 4, sizeof(cl_mem), (void *)&mem_result_tmp, sizeof(cl_mem), (void*)&mem_sum, sizeof(int), (void*)&border,
			      sizeof(int), (void*)&flag);
	globalWorkGroupSize_downSweep[0]=getPowerOfTwo(roundUp(LOCALSIZE,roundUp(LOCALSIZE, elems)/2));
	localWorkGroupSize_downSweep[0]=LOCALSIZE;
	
	//printf("GLOBALSIZE: %d\tLOCALSIZE %d\n",globalWorkGroupSize[0],localWorkGroupSize[0]);
	
	//execute kernel  	     
	CLU_ERRCHECK(clEnqueueNDRangeKernel(command_queue, kernel_last_stage, 1, NULL, globalWorkGroupSize_downSweep, localWorkGroupSize_downSweep, 0, NULL, &(events[1])), "DownSweep_Failed to enqueue 2D kernel");		      
	clFinish(command_queue);
	total_downsweep+=getProfileTotalTime(events,1);
	//read values back from device
	CLU_ERRCHECK(clEnqueueReadBuffer(command_queue, mem_result_tmp, CL_TRUE, 0, elems*sizeof(VALUE), result, 0, NULL, NULL),"DownSweep_Failed to read Result Values");
	
	
	/*---------------------------------------HILLISSTEELE----------------------------------------------------------*/
	
	
	flag=1;
	border=elems;
	
	cluSetKernelArguments(kernel_hillissteele, 6, sizeof(cl_mem), (void *)&mem_data_hillis, sizeof(cl_mem), (void*)&mem_result,
			      sizeof(cl_mem), (void*)&mem_sum_hillis,sizeof(VALUE)*LOCALSIZE*2, NULL, sizeof(int), (void*)&border,
			      sizeof(int), (void*)&flag);

	//execute kernel	
	//printf("GlobalSize: %d\tLocalWorkGroupSize: %d\n",globalWorkGroupSize[0], localWorkGroupSize[0]);
	//printf("Amount of WorkGroups: %d\n", globalWorkGroupSize[0]/localWorkGroupSize[0]);
	
	CLU_ERRCHECK(clEnqueueNDRangeKernel(command_queue, kernel_hillissteele, 1, NULL, globalWorkGroupSize_hillissteele, localWorkGroupSize_hillissteele, 0, NULL, &(events[0])), "Hillissteele_Failed to enqueue 2D kernel_Inputbuffer");		      
	
	clFinish(command_queue);
	total_hillissteele+=getProfileTotalTime(events,0);
	//read values back from device
	/*
	CLU_ERRCHECK(clEnqueueReadBuffer(command_queue, mem_result, CL_TRUE, 0, elems*sizeof(VALUE), result_hillissteele, 0, NULL, NULL),"Failed to read Result Values");
	CLU_ERRCHECK(clEnqueueReadBuffer(command_queue, mem_sum_hillis, CL_TRUE, 0, sumBuffer_length_hillis*sizeof(VALUE), sum_hillis, 0, NULL, NULL),"Failed to read Sum_1 Values");
	printSumBuffer(sum_hillis, sumBuffer_length_hillis, "HILLISSTEELE SUM");
	printResult(result_hillissteele,elems, 4, "HILLISSTEELE Temporary OUTPUT");
	*/
	
	
	/*++++++++++++++++++++++++++++++++++++++HILLISSTEELE-ON-SUM-BUFFER+++++++++++++++++++++++++++++++++++++*/
	
	flag=0;
	border=sumBuffer_length_hillis;
	cluSetKernelArguments(kernel_hillissteele, 6, sizeof(cl_mem), (void *)&mem_sum_hillis, sizeof(cl_mem), (void*)&mem_sum_hillis,
			      sizeof(cl_mem), (void*)&mem_sum_hillis,sizeof(VALUE)*howManyWorkGroups*2, NULL, sizeof(int), (void*)&border,
			      sizeof(int), (void*)&flag);

	//execute kernel
	globalWorkGroupSize_hillissteele[0]=sumBuffer_length_hillis;
	localWorkGroupSize_hillissteele[0]=sumBuffer_length_hillis;
	CLU_ERRCHECK(clEnqueueNDRangeKernel(command_queue, kernel_hillissteele, 1, NULL, globalWorkGroupSize_hillissteele, localWorkGroupSize_hillissteele, 0, NULL, &(events[0])), "Hillissteele_Failed to enqueue 2D kernel_Sumbuffer");		      
	
	clFinish(command_queue);
	total_hillissteele+=getProfileTotalTime(events,0);
	
	//read values back from device
	/*
	CLU_ERRCHECK(clEnqueueReadBuffer(command_queue, mem_sum_hillis, CL_TRUE, 0, sumBuffer_length_hillis*sizeof(VALUE), sum_hillis, 0, NULL, NULL),"Failed to read Sum2 Values");
	printSumBuffer(sum_hillis, sumBuffer_length_hillis, "HILLISSTEELE SUM PREFIX");
	*/
	
	/*+++++++++++++++++++++++++++++++++++++HILLISSTEELE-LAST-STAGE(Add Sums)++++++++++++++++++++++++++++++++++++++++*/
	
	flag=0;
	border=sumBuffer_length_hillis;
	cluSetKernelArguments(kernel_last_stage, 4, sizeof(cl_mem), (void *)&mem_result, sizeof(cl_mem), (void*)&mem_sum_hillis, sizeof(int), (void*)&border,
			      sizeof(int), (void*)&flag);
	
	globalWorkGroupSize_hillissteele[0]=roundUp(LOCALSIZE,elems);
	localWorkGroupSize_hillissteele[0]=LOCALSIZE;
	
	//printf("GLOBALSIZE: %d\tLOCALSIZE %d\n",globalWorkGroupSize[0],localWorkGroupSize[0]);
	
	//execute kernel  	     
	CLU_ERRCHECK(clEnqueueNDRangeKernel(command_queue, kernel_last_stage, 1, NULL, globalWorkGroupSize_hillissteele, localWorkGroupSize_hillissteele, 0, NULL, &(events[0])), "Hillissteele_Failed to enqueue kernel_Last_stage");		      
	clFinish(command_queue);
	total_hillissteele+=getProfileTotalTime(events,0);
	
	//read values back from device
	CLU_ERRCHECK(clEnqueueReadBuffer(command_queue, mem_result, CL_TRUE, 0, elems*sizeof(VALUE), result_hillissteele, 0, NULL, NULL),"Hillissteele_Failed to read Result Values");
	
	
	/*-------------------------FINISHED---------------------------------------------*/
	
	//printResult(result_hillissteele, elems, 4, "HILLISSTEELE OUTPUT");
	//printResult(result, elems, 4, "IMPROVED IMPLEMENTATION OUTPUT");
	
	//verify results
	verifyResult(result_seq,result,elems, "Verifying result of DownSweep for bigger array size");
	verifyResult(result_seq,result_hillissteele,elems, "Verifying result of HILLISSTEELE for bigger array size");
	
	
	printProfileInfo(total_downsweep,"Improved Algorithm Time:");
	printProfileInfo(total_hillissteele,"Hillis & Steele Time:");
	printf("\nDEVICE INFO MAX_WORK_GROUP_SIZE: %d\n", (int) deviceInfo);
	printf("OCL Device: %s\n", cluGetDeviceDescription(device_id, CL_DEVICE));
	printf("Done, took %16llu ms\n", time_ms()-start_time);
    
	
	
	// finalization
	
	for(int i=0; i<event_amount; i++){
	    clReleaseEvent(events[i]);
	}
	
	err =  clFinish(command_queue);
	err |= clReleaseKernel(kernel_downSweep);
	err |= clReleaseKernel(kernel_last_stage);
	err |= clReleaseKernel(kernel_hillissteele);
	err |= clReleaseProgram(program);
	err |= clReleaseMemObject(mem_data);
	err |= clReleaseMemObject(mem_data_hillis);
	err |= clReleaseMemObject(mem_result);
	err |= clReleaseMemObject(mem_result_tmp);
	err |= clReleaseMemObject(mem_sum);
	err |= clReleaseMemObject(mem_sum_hillis);
	err |= clReleaseCommandQueue(command_queue);
	err |= clReleaseContext(context);
	CLU_ERRCHECK(err, "Failed during ocl cleanup");
    
	free(events);
	free(result);
	free(result_hillissteele);
	free(result_seq);
	free(sum);
	free(sum_hillis);
	
	return EXIT_SUCCESS; 
}