Beispiel #1
0
/*
 * N-body example driver.
 *
 * Allocates device-sharable position/velocity arrays on the default GPU
 * context (stdgpu), loads the kernel "nbody_kern" from nbody_kern.cl and
 * launches it nstep times in bursts of nburst launches.  Two position
 * buffers (pos1, pos2) alternate as kernel arguments 2 and 3 on successive
 * launches (presumably "old positions" / "new positions" -- confirm against
 * nbody_kern.cl).  After every burst pos1 is copied back to the host.
 *
 * Returns 0 on completion.
 */
int main(int argc, char** argv) 
{

   int step,burst;

   int nparticle = 8192; /* MUST be a nice power of two for simplicity */
   int nstep = 500;
   /* MUST divide the value of nstep without remainder, and MUST be even
    * because the inner loop below issues two kernel launches per pass */
   int nburst = 20;
   int nthread = 64; /* chosen for ATI Radeon HD 5870 */

   float dt = 0.0001;
   float eps = 0.0001;

   /* device-sharable allocations: two position buffers and one velocity buffer */
   cl_float4* pos1 = (cl_float4*)clmalloc(stdgpu,nparticle*sizeof(cl_float4),0);
   cl_float4* pos2 = (cl_float4*)clmalloc(stdgpu,nparticle*sizeof(cl_float4),0);
   cl_float4* vel = (cl_float4*)clmalloc(stdgpu,nparticle*sizeof(cl_float4),0);

   nbody_init(nparticle,pos1,vel);

   void* h = clopen(stdgpu,"nbody_kern.cl",CLLD_NOW);
   cl_kernel krn = clsym(stdgpu,h,"nbody_kern",CLLD_NOW);

   /* 1-D index space: nparticle work-items in work-groups of nthread */
   clndrange_t ndr = clndrange_init1d(0,nparticle,nthread);

   /* arguments that stay fixed for every launch */
   clarg_set(stdgpu,krn,0,dt);
   clarg_set(stdgpu,krn,1,eps);
   clarg_set_global(stdgpu,krn,4,vel);
   clarg_set_local(stdgpu,krn,5,nthread*sizeof(cl_float4));

   /* push the initial state to the device */
   clmsync(stdgpu,0,pos1,CL_MEM_DEVICE|CL_EVENT_NOWAIT);
   clmsync(stdgpu,0,vel,CL_MEM_DEVICE|CL_EVENT_NOWAIT);

   for(step=0; step<nstep; step+=nburst) {

      for(burst=0; burst<nburst; burst+=2) {

         /* first launch of the pair: pos1 as arg 2, pos2 as arg 3 */
         clarg_set_global(stdgpu,krn,2,pos1);
         clarg_set_global(stdgpu,krn,3,pos2);
         clfork(stdgpu,0,krn,&ndr,CL_EVENT_NOWAIT);

         /* second launch of the pair: the two buffers swap roles */
         clarg_set_global(stdgpu,krn,2,pos2);
         clarg_set_global(stdgpu,krn,3,pos1);
         clfork(stdgpu,0,krn,&ndr,CL_EVENT_NOWAIT);
      
      }

      /* bring the current positions back to the host once per burst */
      clmsync(stdgpu,0,pos1,CL_MEM_HOST|CL_EVENT_NOWAIT);

      clwait(stdgpu,0,CL_KERNEL_EVENT|CL_MEM_EVENT);

   }

   /* vel is handed to nbody_output below but was never copied back from the
    * device; sync it so the host copy is not the stale initial state.
    * NOTE(review): assumes the kernel updates vel (argument 4) -- confirm
    * against nbody_kern.cl. */
   clmsync(stdgpu,0,vel,CL_MEM_HOST|CL_EVENT_NOWAIT);
   clwait(stdgpu,0,CL_KERNEL_EVENT|CL_MEM_EVENT);

   nbody_output(nparticle,pos1,vel);

   clclose(stdgpu,h);

   clfree(pos1);
   clfree(pos2);
   clfree(vel);  

   return 0;
}
/*
 * Enqueues `kernel` on this option's context/device over a 1-D index range
 * of a single work-item, passing the option's parameters as kernel
 * arguments.  The launch uses CL_EVENT_NOWAIT, so this function does not
 * wait for the kernel itself.
 *
 * The three cl_* buffers are asserted non-null before the launch.
 */
void BasketGeometricOpenCLOption::fork_kernel(cl_kernel kernel)
{
    // Buffers handed to the kernel must have been set up beforehand.
    assert(cl_start_prices != NULL);
    assert(cl_asset_volatilities != NULL);
    assert(cl_correlations != NULL);

    // offset 0, global size 1, local size 1
    clndrange_t single_item_range = clndrange_init1d(0, 1, 1);

    // Kernel arguments follow the flags, in the order given here.
    clforka(context,
            device_number,
            kernel,
            &single_item_range,
            CL_EVENT_NOWAIT,
            number_of_assets,
            cl_start_prices,
            strike_price,
            maturity,
            cl_asset_volatilities,
            risk_free_rate,
            cl_correlations,
            results);
}
Beispiel #3
0
int main (int argc, char * argv[])
{

   char kernFile[] = "../src/nD.cl";	// the default location of the cl file - can be over ridden by the first command line arguement
   char debugFile[] = "./debug";		// the debug file location
   char msg[255];						// space for the debug message where a program variable is to be written to the debug file
   char * clFile;						// the cl file string actually used (either argv or kernFile)
   void * openHandle;					// the return value to clOpen
   int	bytesPerCore = 16;				// how many bytes we want each core to process
   int workItems = 32; 					// the total number workItems (threads sent to the Epiphany)
   int	i;								// loop counter
   cl_uchar * wrkArea1D;					// the pointer to the malloc'd space (bytesPerCore * workItems)
   cl_uchar * wrkArea2D;

//	These variables will be useful when I get getDeviceInfo and getBuildInfo working
//   unsigned int space;					// the space required for clGetInfo style calls
//   cl_build_status bOk;					// the return value from clGetProgramBuildInfo
//   size_t computeUnits;					// return from GetDeviceInfo
//   char strInfo[20];

   FILE * pFile;

   if(argc == 2)
	   clFile = argv[1];
   else
	   clFile = kernFile;

   pFile = fopen(clFile, "r");
   if (pFile == NULL)
   {
	   printf("Opening the Kernel file: %s produced an error(%d). Make sure that the source code variable kern has a valid path to the cl code and that the code is readable.\n", clFile, errno);
	   exit(0);
   }
   else
	   fclose(pFile);	// only open the file to check that it is there and readable


   debugReset(debugFile);
   debugdebug(debugFile, (char*)"How many devices do we have?\n");


   sprintf(msg, "About to malloc wrkArea1D: %d\n", workItems * bytesPerCore);
   debugdebug(debugFile, msg);
   wrkArea1D = (cl_uchar*) clmalloc(stdacc, workItems * bytesPerCore, 0);
   wrkArea2D = (cl_uchar*) clmalloc(stdacc, workItems * bytesPerCore, 0);

   for (i=0; i < workItems * bytesPerCore; i++)
	   wrkArea2D[i] = wrkArea1D[i] = 0;

   sprintf(msg, "Well malloc worked! Opening kernel file:%s\n", clFile);
   debugdebug(debugFile, msg);
   openHandle = clopen(stdacc, clFile, CLLD_NOW);
   // open the standard accellerator context (i.e. the Epiphany chip, reading in the .cl file and compiling it immediately

   clndrange_t ndr1D = clndrange_init1d(NULL,							// global offset (always zero)
		   	   	   	   	   	   	   ((size_t)workItems),					// total number of threads (get_global_id will return 0 to workItems
		   	   	   	   	   	   	   ((size_t)bytesPerCore));				// How many bytes do we tell the kernel to process via get_local_size(0)
   exKernel(openHandle, &ndr1D, (char*)"k_init1D", workItems, bytesPerCore, wrkArea1D, debugFile);

   clndrange_t ndr2D = clndrange_init2d(NULL,							// global offset (always zero)
		   	   	   	   	   	   	   ((size_t)workItems),					// total number of threads (get_global_id will return 0 to workItems
		   	   	   	   	   	   	   ((size_t)(bytesPerCore/4)),			// How many bytes do we tell the kernel to process via get_local_size(0)
		   	   	   	   	   	   	   NULL,								// another useless global offset
		   	   	   	   	   	   	   ((size_t)workItems),					// a value that does not seem to do anything useful
		   	   	   	   	   	   	   ((size_t)(bytesPerCore/4)));			// How many rows to process per call returned by get_local_size(1)

   exKernel(openHandle, &ndr2D, (char*)"k_init2D", workItems, bytesPerCore, wrkArea2D, debugFile);


   // ============================================================================================================
   // show the results
   // ============================================================================================================

   printf("The 1D data:\n");
   for(i=0; i < workItems * bytesPerCore; i++)
   {
	   printf("%u\t", wrkArea1D[i]);
	   if(((i+1) % bytesPerCore) == 0)
		   printf("\n");
   }

   printf("The 2D data:\n");
   for(i=0; i < workItems * bytesPerCore; i++)
   {
	   printf("%u\t", wrkArea2D[i]);
	   if(((i+1) % bytesPerCore) == 0)
		   printf("\n");
   }

   clfree(wrkArea1D);
   clfree(wrkArea2D);

   return 0;
}
Beispiel #4
0
/*
 * Matrix-vector multiply example (n = 64).
 *
 * Loads "matvecmult_kern" from matvecmult.cl on the default GPU context
 * (falling back to the CPU context), fills an n x n matrix aa and a vector
 * b on the host, copies them to device `devnum`, runs the kernel over n
 * work-items and prints b[i] and the result c[i] for every i.
 *
 * Returns 0 on success, 1 if the kernel symbol cannot be resolved.
 */
int main()
{
   cl_uint n = 64;

#if(1)

	/* use default contexts, if no GPU use CPU */
   CLCONTEXT* cp = (stdgpu)? stdgpu : stdcpu;

   unsigned int devnum = 0;

   void* clh = clopen(cp,"matvecmult.cl",CLLD_NOW);
   cl_kernel krn = clsym(cp,clh,"matvecmult_kern",0);

   /* stop here rather than setting arguments on a missing kernel */
   if (!krn) {
      fprintf(stderr,"error: no OpenCL kernel\n");
      clclose(cp,clh);
      return 1;
   }

   /* allocate OpenCL device-sharable memory */
   cl_float* aa = (float*)clmalloc(cp,n*n*sizeof(cl_float),0);
   cl_float* b = (float*)clmalloc(cp,n*sizeof(cl_float),0);
   cl_float* c = (float*)clmalloc(cp,n*sizeof(cl_float),0);

   /* define the computational domain and workgroup size */
   clndrange_t ndr = clndrange_init1d( 0, n, 64);

   /* initialize matrix aa[] and vector b[], zero c[] */
   int i,j; 
   for(i=0;i<n;i++) for(j=0;j<n;j++) aa[i*n+j] = 1.1f*i*j;
   for(i=0;i<n;i++) b[i] = 2.2f*i;
   for(i=0;i<n;i++) c[i] = 0.0f;

   /* non-blocking sync of aa and b to device memory */
   clmsync(cp,devnum,aa,CL_MEM_DEVICE|CL_EVENT_NOWAIT);
   clmsync(cp,devnum,b,CL_MEM_DEVICE|CL_EVENT_NOWAIT);

   /* set the kernel arguments */
   clarg_set(cp,krn,0,n);
   clarg_set_global(cp,krn,1,aa);
   clarg_set_global(cp,krn,2,b);
   clarg_set_global(cp,krn,3,c);

   /* non-blocking fork of the OpenCL kernel on the selected device */
   clfork(cp,devnum,krn,&ndr,CL_EVENT_NOWAIT);

   /* non-blocking sync vector c to host memory (copy back to host);
    * uses devnum like every other call instead of a hard-coded 0 */
   clmsync(cp,devnum,c,CL_MEM_HOST|CL_EVENT_NOWAIT);

   /* force execution of operations in command queue (non-blocking call) */
   clflush(cp,devnum,0);

   /* block on completion of operations in command queue */
   clwait(cp,devnum,CL_ALL_EVENT);

   for(i=0;i<n;i++) printf("%d %f %f\n",i,b[i],c[i]);

   clfree(aa);
   clfree(b);
   clfree(c);

   clclose(cp,clh);

#endif

   return 0;
}
/*
 * Matrix-vector multiply example with an additional image2d lookup table
 * (n = 1024).
 *
 * Resolves "matvecmult_special_kern" (from matvecmult_special.cl on FreeBSD,
 * otherwise through clsym with a null handle), fills matrix aa, vector b and
 * a 24x24 cl_float4 table on the host, copies them to device `devnum`, runs
 * the kernel over n work-items and prints b[i] and the result c[i].
 *
 * Returns 0 on success, 1 if the kernel symbol cannot be resolved.
 */
int main()
{
   cl_uint n = 1024;

	/* use default contexts, if no GPU use CPU */
   CLCONTEXT* cp = (stdgpu)? stdgpu : stdcpu;

   unsigned int devnum = 0;

#ifdef __FreeBSD__
   void* clh = clopen(cp,"matvecmult_special.cl",CLLD_NOW);
   cl_kernel krn = clsym(cp,clh,"matvecmult_special_kern",0);
#else
   cl_kernel krn = clsym(cp,0,"matvecmult_special_kern",0);
#endif

   /* stop here rather than setting arguments on a missing kernel */
   if (!krn) {
      fprintf(stderr,"error: no OpenCL kernel\n");
#ifdef __FreeBSD__
      clclose(cp,clh);
#endif
      return 1;
   }

   /* allocate OpenCL device-sharable memory */
   cl_float* aa = (float*)clmalloc(cp,n*n*sizeof(cl_float),0);
   cl_float* b = (float*)clmalloc(cp,n*sizeof(cl_float),0);
   cl_float* c = (float*)clmalloc(cp,n*sizeof(cl_float),0);

   /* initialize matrix aa[] and vector b[], zero c[] */
   int i,j; 
   for(i=0;i<n;i++) for(j=0;j<n;j++) aa[i*n+j] = 1.1f*i*j;
   for(i=0;i<n;i++) b[i] = 2.2f*i;
   for(i=0;i<n;i++) c[i] = 0.0f;


	/***
	 *** Create a image2d allocation to be used as a read-only table.
	 *** The table will consist of a 24x24 array of float coefficients.
	 *** The clmctl() call is used to set the type and shape of the table.
	 *** Note that we will only use the first component of the float4 elements.
	 ***/	
	cl_float4* table 
		= (cl_float4*)clmalloc(cp,24*24*sizeof(cl_float4),CL_MEM_DETACHED);
	clmctl(table,CL_MCTL_SET_IMAGE2D,24,24,0);
	clmattach(cp,table);

	/* initialize the table to some contrived values */
	for(i=0;i<24;i++) for(j=0;j<24;j++) table[i*24+j].x = 0.125f*(i-j);


   /* define the computational domain and workgroup size */
   clndrange_t ndr = clndrange_init1d( 0, n, 64);

   /* non-blocking sync of aa, b and the table to device memory */
   clmsync(cp,devnum,aa,CL_MEM_DEVICE|CL_EVENT_NOWAIT);
   clmsync(cp,devnum,b,CL_MEM_DEVICE|CL_EVENT_NOWAIT);
   clmsync(cp,devnum,table,CL_MEM_DEVICE|CL_EVENT_NOWAIT);

   /* set the kernel arguments */
   clarg_set(cp,krn,0,n);
   clarg_set_global(cp,krn,1,aa);
   clarg_set_global(cp,krn,2,b);
   clarg_set_global(cp,krn,3,c);
   clarg_set_global(cp,krn,4,table);

   /* non-blocking fork of the OpenCL kernel on the selected device */
   clfork(cp,devnum,krn,&ndr,CL_EVENT_NOWAIT);

   /* non-blocking sync vector c to host memory (copy back to host);
    * uses devnum like every other call instead of a hard-coded 0 */
   clmsync(cp,devnum,c,CL_MEM_HOST|CL_EVENT_NOWAIT);

   /* block on completion of operations in command queue */
   clwait(cp,devnum,CL_ALL_EVENT);

   for(i=0;i<n;i++) printf("%d %f %f\n",i,b[i],c[i]);

   clfree(aa);
   clfree(b);
   clfree(c);
   clfree(table);   /* the table came from clmalloc too and was never released */

#ifdef __FreeBSD__
	clclose(cp,clh);
#endif

   return 0;
}
/*
 * Outer-product example (n = 1024) showing how to pass compiler options to
 * the OpenCL compiler.
 *
 * Opens outerprod.cl without building it, builds it with "-D COEF=2",
 * resolves "outerprod_kern", copies vectors a and b to device `devnum`,
 * runs the kernel over n work-items and prints a[i], b[i] and the result
 * c[i] for every i.
 *
 * Returns 0 on success; exits with -1 if the program cannot be opened or the
 * kernel symbol cannot be resolved.
 */
int main()
{
   cl_uint n = 1024;

	/* use default contexts, if no ACCELERATOR use CPU */
   CLCONTEXT* cp = (stdacc)? stdacc : stdcpu;

   unsigned int devnum = 0;

	/******************************************************************
	 *** this example requires the .cl file to be available at run-time
	 *** and shows how to pass compiler options to the OCL compiler
	 ******************************************************************/

   void* clh = clopen(cp,"outerprod.cl",CLLD_NOBUILD);

	/* do not build or look up symbols through a handle that was never opened */
	if (!clh) { fprintf(stderr,"error: cannot open outerprod.cl\n"); exit(-1); }

   clbuild(cp,clh,"-D COEF=2", 0);
   cl_kernel krn = clsym(cp,clh,"outerprod_kern",0);

	if (!krn) { fprintf(stderr,"error: no OpenCL kernel\n"); exit(-1); }

   /* allocate OpenCL device-sharable memory */
   cl_float* a = (float*)clmalloc(cp,n*sizeof(cl_float),0);
   cl_float* b = (float*)clmalloc(cp,n*sizeof(cl_float),0);
   cl_float* c = (float*)clmalloc(cp,n*sizeof(cl_float),0);

   /* initialize vectors a[] and b[], zero c[] */
   int i; 
   for(i=0;i<n;i++) a[i] = 1.1f*i;
   for(i=0;i<n;i++) b[i] = 2.2f*i;
   for(i=0;i<n;i++) c[i] = 0.0f;

   /* sync vectors a and b to device memory; CL_EVENT_WAIT makes these
    * calls wait for the copy, unlike the CL_EVENT_NOWAIT calls below */
   clmsync(cp,devnum,a,CL_MEM_DEVICE|CL_EVENT_WAIT);
   clmsync(cp,devnum,b,CL_MEM_DEVICE|CL_EVENT_WAIT);

   /* define the computational domain and workgroup size */
   clndrange_t ndr = clndrange_init1d( 0, n, 16);

   /* set the kernel arguments */
   clarg_set_global(cp,krn,0,a);
   clarg_set_global(cp,krn,1,b);
   clarg_set_global(cp,krn,2,c);

   /* non-blocking fork of the OpenCL kernel on the selected device */
   clfork(cp,devnum,krn,&ndr,CL_EVENT_NOWAIT);

   /* block on completion of operations in command queue */
   clwait(cp,devnum,CL_ALL_EVENT);

   /* non-blocking sync vector c to host memory (copy back to host);
    * uses devnum like every other call instead of a hard-coded 0 */
   clmsync(cp,devnum,c,CL_MEM_HOST|CL_EVENT_NOWAIT);

   /* block on completion of operations in command queue */
   clwait(cp,devnum,CL_ALL_EVENT);

   for(i=0;i<n;i++) printf("%d %f %f %f\n",i,a[i],b[i],c[i]);

   clfree(a);
   clfree(b);
   clfree(c);

   clclose(cp,clh);

   return 0;
}
// Matrix-vector multiply example using clmulti_array containers.
//
// Resolves "matvecmult_kern", builds an n x n matrix aa and vectors b, c
// (n = 1024) as clmulti_array objects, attaches them to the context, runs
// the kernel on device `devnum` and prints b[i] and c[i].  The containers
// are then detached, resized to three times the size, re-attached and the
// same sequence is run a second time.
//
// Returns 0 on success, 1 if the kernel symbol cannot be resolved.
int main()
{
   cl_uint n = 1024;

	// use default contexts, if no ACCELERATOR use CPU 
   CLCONTEXT* cp = (stdacc)? stdacc : stdcpu;

   unsigned int devnum = 0;

   cl_kernel krn = clsym(cp,0,"matvecmult_kern",0);

   // stop here rather than setting arguments on a missing kernel
   if (!krn) {
      fprintf(stderr,"error: no OpenCL kernel\n");
      return 1;
   }

   // allocate matrix and vectors using clmulti_array 
	typedef clmulti_array<cl_float,1> array1_t;
	typedef clmulti_array<cl_float,2> array2_t;
	array2_t aa(boost::extents[n][n]);
	array1_t b(boost::extents[n]);
	array1_t c(boost::extents[n]);

   // initialize matrix aa[] and vector b[], zero c[] 
   for(int i=0;i<n;i++) for(int j=0;j<n;j++) aa[i][j] = 1.1f*i*j;
   for(int i=0;i<n;i++) b[i] = 2.2f*i;
   for(int i=0;i<n;i++) c[i] = 0.0f;

   // attach the containers to the STDCL context 
   aa.clmattach(cp);
   b.clmattach(cp);
   c.clmattach(cp);

   // define the computational domain and workgroup size 
   clndrange_t ndr = clndrange_init1d( 0, n, 16);

   // non-blocking sync of aa and b to device memory
   aa.clmsync(cp,devnum,CL_MEM_DEVICE|CL_EVENT_NOWAIT);
   b.clmsync(cp,devnum,CL_MEM_DEVICE|CL_EVENT_NOWAIT);

   // set the kernel arguments
   clarg_set(cp,krn,0,n);
   aa.clarg_set_global(cp,krn,1);
   b.clarg_set_global(cp,krn,2);
   c.clarg_set_global(cp,krn,3);

   // non-blocking fork of the OpenCL kernel on the selected device
   clfork(cp,devnum,krn,&ndr,CL_EVENT_NOWAIT);

   // non-blocking sync vector c to host memory (copy back to host);
   // uses devnum like every other call instead of a hard-coded 0
   c.clmsync(cp,devnum,CL_MEM_HOST|CL_EVENT_NOWAIT);

   // force execution of operations in command queue, non-blocking call
   clflush(cp,devnum,0);

   // block on completion of all operations in the command queue
   clwait(cp,devnum,CL_ALL_EVENT);

   for(int i=0;i<n;i++) printf("%f %f\n",b[i],c[i]);


   ///////////////////////////////////////////////////////////
   ///// now resize the vectors by adding some more values ...
   ///////////////////////////////////////////////////////////
  
	n *= 3;
 
   // OPTIONAL: for better performance, detach containers from STDCL context
   aa.clmdetach();
   b.clmdetach();
   c.clmdetach();


   // increase size of vectors three-fold 
   // ... note that *all* boost multi_array operations are valid
	aa.resize(boost::extents[n][n]);
	b.resize(boost::extents[n]);
	c.resize(boost::extents[n]);
   for(int i=0;i<n;i++) for(int j=0;j<n;j++) aa[i][j] = 1.1f*i*j;
   for(int i=0;i<n;i++) b[i] = 2.2f*i;
   for(int i=0;i<n;i++) c[i] = 0.0f;


   // OPTIONAL: ... if you detached the containers, you must re-attach them
   aa.clmattach(cp);
   b.clmattach(cp);
   c.clmattach(cp);


   // now follow same steps used above to sync memory, execute kernel, etc.

   aa.clmsync(cp,devnum,CL_MEM_DEVICE|CL_EVENT_NOWAIT);
   b.clmsync(cp,devnum,CL_MEM_DEVICE|CL_EVENT_NOWAIT);

   // new index space for the enlarged problem (workgroup size 64 this time)
   clndrange_t ndr_threefold = clndrange_init1d( 0, n, 64);
   
   clarg_set(cp,krn,0,n);
   aa.clarg_set_global(cp,krn,1);
   b.clarg_set_global(cp,krn,2);
   c.clarg_set_global(cp,krn,3);

   clfork(cp,devnum,krn,&ndr_threefold,CL_EVENT_NOWAIT);

   // read back from the same device the kernel ran on
   c.clmsync(cp,devnum,CL_MEM_HOST|CL_EVENT_NOWAIT);

	clflush(cp,devnum,0);

   clwait(cp,devnum,CL_ALL_EVENT);

   for(int i=0;i<n;i++) printf("%f %f\n",b[i],c[i]);

   return 0;
}