Example #1
0
/*
 * N-body demo driver.
 *
 * Allocates position/velocity buffers in the default GPU context, loads
 * "nbody_kern" from nbody_kern.cl and advances the system nstep steps.
 * Positions ping-pong between pos1 and pos2: every inner iteration
 * launches the kernel twice (pos1 -> pos2, then pos2 -> pos1), so pos1
 * holds the newest positions after each burst.
 *
 * Returns 0 on success, 1 if the GPU context, a buffer, the program or
 * the kernel could not be obtained.
 */
int main(int argc, char** argv) 
{

   int step,burst;
   int rc = 1; /* stays 1 until the whole run has completed */

   int nparticle = 8192; /* MUST be a nice power of two for simplicity */
   int nstep = 500;
   int nburst = 20; /* MUST divide nstep without remainder, and MUST be
                       even because each inner iteration does two steps */
   int nthread = 64; /* chosen for ATI Radeon HD 5870 */

   float dt = 0.0001;
   float eps = 0.0001;

   /* everything the cleanup code touches is declared before any goto */
   cl_float4* pos1 = 0;
   cl_float4* pos2 = 0;
   cl_float4* vel = 0;
   void* h = 0;
   cl_kernel krn = 0;
   clndrange_t ndr = clndrange_init1d(0,nparticle,nthread);

   (void)argc;
   (void)argv;

   /* no GPU context available: nothing can be allocated or launched */
   if (!stdgpu) goto done;

   pos1 = (cl_float4*)clmalloc(stdgpu,nparticle*sizeof(cl_float4),0);
   pos2 = (cl_float4*)clmalloc(stdgpu,nparticle*sizeof(cl_float4),0);
   vel = (cl_float4*)clmalloc(stdgpu,nparticle*sizeof(cl_float4),0);
   if (!pos1 || !pos2 || !vel) goto done;

   nbody_init(nparticle,pos1,vel);

   h = clopen(stdgpu,"nbody_kern.cl",CLLD_NOW);
   if (!h) goto done;

   krn = clsym(stdgpu,h,"nbody_kern",CLLD_NOW);
   if (!krn) goto done;

   /* arguments that never change between launches */
   clarg_set(stdgpu,krn,0,dt);
   clarg_set(stdgpu,krn,1,eps);
   clarg_set_global(stdgpu,krn,4,vel);
   clarg_set_local(stdgpu,krn,5,nthread*sizeof(cl_float4));

   /* queue the initial state for transfer to the device */
   clmsync(stdgpu,0,pos1,CL_MEM_DEVICE|CL_EVENT_NOWAIT);
   clmsync(stdgpu,0,vel,CL_MEM_DEVICE|CL_EVENT_NOWAIT);

   for(step=0; step<nstep; step+=nburst) {

      for(burst=0; burst<nburst; burst+=2) {

         /* first half-step: kernel args 2/3 are pos1 then pos2 */
         clarg_set_global(stdgpu,krn,2,pos1);
         clarg_set_global(stdgpu,krn,3,pos2);
         clfork(stdgpu,0,krn,&ndr,CL_EVENT_NOWAIT);

         /* second half-step: same kernel with the two buffers swapped */
         clarg_set_global(stdgpu,krn,2,pos2);
         clarg_set_global(stdgpu,krn,3,pos1);
         clfork(stdgpu,0,krn,&ndr,CL_EVENT_NOWAIT);
      
      }

      /* queue the read-back of pos1, then drain kernel and memory events */
      clmsync(stdgpu,0,pos1,CL_MEM_HOST|CL_EVENT_NOWAIT);

      clwait(stdgpu,0,CL_KERNEL_EVENT|CL_MEM_EVENT);

   }

   /* NOTE(review): only pos1 is synced back to the host; vel is passed
      here as it was last written on the host side -- confirm that is
      what nbody_output expects. */
   nbody_output(nparticle,pos1,vel);

   rc = 0;

done:
   if (h) clclose(stdgpu,h);

   if (pos1) clfree(pos1);
   if (pos2) clfree(pos2);
   if (vel) clfree(vel);

   return rc;
}
/*
 * Allocates one random_seed per worker with clmalloc() in the given
 * context and fills the x and y members of each from rand().
 *
 * context ............ context handed to clmalloc().
 * number_of_workers .. number of seeds to create.
 *
 * Returns the seed array, or a null pointer if the allocation failed.
 * The array comes from clmalloc(), so the caller presumably releases it
 * with clfree().
 */
random_seed* generate_seeds(CLCONTEXT* context, unsigned int number_of_workers)
{
	random_seed *seeds = (random_seed*)clmalloc(context, number_of_workers*sizeof(random_seed), 0);

	/* do not write through a failed allocation; let the caller decide */
	if (!seeds)
	{
		return seeds;
	}

	/* unsigned index matches the unsigned worker count */
	for(unsigned int i = 0; i < number_of_workers; i++)
	{
		seeds[i].x = rand();
		seeds[i].y = rand();
	}
	
	return seeds;
}
Example #3
0
/*
 * Allocates or resets raster data in host memory, returning a 
 * pointer to it. The operation is performed within the given
 * OpenCL context. Raster data is initialized to the given value.
 *
 * CONTEXT *ctx ........... an active OpenCL context.
 * int nrows .............. the number of rows in the DEM.
 * int ncols .............. the number of columns in the DEM.
 * RASTER_MAP_TYPE rtype .. DEM element data type; must be FCELL_TYPE.
 * FCELL init_value ....... initial value used for resetting or allocating
 *                          raster data. If NAN is given, the raster is
 *                          nullified.
 * void *alloc_ptr ........ a pointer to previously allocated memory. 
 *                          If it is not NULL, the memory is just set to
 *                          'init_value' without new memory allocation.
 *
 * Returns a pointer to the initialized raster: 'alloc_ptr' itself when
 * one was supplied, otherwise memory newly obtained from clmalloc().
 */
FCELL* alloc_raster (CONTEXT *ctx,
                     const int nrows, const int ncols, 
                     RASTER_MAP_TYPE rtype,
                     FCELL init_value,
                     void *alloc_ptr) 
{
    int i,j;
    FCELL *dem_device;

    if (rtype != FCELL_TYPE)
    {
        G_fatal_error ("Wrong element data type in 'alloc_raster'");
        return NULL;
    }

    if (alloc_ptr == NULL)
    {
        G_message ("Allocating NULL raster data into memory ...");

        /* allocate raster DEM data; the byte count is computed in size_t
           so that nrows * ncols cannot overflow int first */
        dem_device = (FCELL *) clmalloc (ctx, 
                               (size_t) nrows * (size_t) ncols * G_raster_size (rtype), 0);

        /* a non-empty raster that could not be allocated is unusable */
        if (dem_device == NULL && nrows > 0 && ncols > 0)
        {
            G_fatal_error ("Unable to allocate memory for raster data");
            return NULL;
        }
    }
    else
    {
        G_message ("Resetting memory ...");
        dem_device = alloc_ptr;
    }

    /* Initialize the whole piece of memory. NaN never compares equal to
       anything, itself included, so the request for a null raster has
       to be detected with isnan() rather than '== NAN'. */
    if (isnan (init_value))
    {
        G_set_f_null_value (dem_device, nrows * ncols);
    }
    else
    {
        for (i=0; i<nrows; i++)
        {
            /* first cell of row i, offset computed in size_t */
            FCELL *row_start = dem_device + (size_t) i * (size_t) ncols;

            for (j=0; j<ncols; j++)
            {
                row_start[j] = init_value;
            }
        }
    }

    /* Return a pointer to the allocated resources */
    return dem_device;
}
Example #4
0
/*
 * Loads raster data to host memory and returns a pointer to it.
 * The operation is performed within the current context.
 *
 * CONTEXT *ctx ........... an active OpenCL context.
 * int nrows .............. the number of rows in the DEM.
 * int ncols .............. the number of columns in the DEM.
 * int infd ............... file descriptor to read from the DEM.
 * RASTER_MAP_TYPE rtype .. DEM element data type; must be FCELL_TYPE.
 *
 * Returns memory obtained from clmalloc() holding nrows * ncols cells
 * in row-major order. Allocation or read failures are reported through
 * G_fatal_error().
 */
FCELL* load_raster (CONTEXT *ctx,
                    const int nrows, const int ncols, 
                    const int infd,
                    RASTER_MAP_TYPE rtype)
{
    int row;

    if (rtype != FCELL_TYPE)
    {
        G_fatal_error ("Wrong element data type in 'load_raster'");
        return NULL;
    }

    /* size of one raster row in bytes, computed in size_t so the
       products below cannot overflow int */
    const size_t row_bytes = (size_t) ncols * G_raster_size (rtype);

    /* staging buffer that receives one row at a time */
    FCELL *rastbuf = malloc (row_bytes);

    if (rastbuf == NULL && row_bytes > 0)
    {
        G_fatal_error ("Unable to allocate memory for the row buffer");
        return NULL;
    }

    G_message ("Loading raster data into memory ...");

    /* allocate raster DEM data */
    FCELL *dem_device = (FCELL *) clmalloc (ctx, 
                                  (size_t) nrows * row_bytes, 0);

    if (dem_device == NULL && nrows > 0 && row_bytes > 0)
    {
        free (rastbuf);
        G_fatal_error ("Unable to allocate memory for raster data");
        return NULL;
    }

    /* Load input map in device memory row by row */
    for (row = 0; row < nrows; row++)
    {
        /* display completion percentage */
        G_percent (row, nrows, 2);
        
        /* read input map */
        if (G_get_raster_row (infd, rastbuf, row, rtype) < 0)
            G_fatal_error ("Unable to read from raster map");

        /* copy this row to the device buffer; memcpy always returns its
           destination argument, so there is no failure value to test */
        memcpy (dem_device + (size_t) row * (size_t) ncols, rastbuf, row_bytes);
    }

    /* free buffers */
    free (rastbuf);

    /* Return a pointer to the allocated resources */
    return dem_device;
}
Example #5
0
File: nD.cpp Project: nickoppen/nD
int main (int argc, char * argv[])
{

   char kernFile[] = "../src/nD.cl";	// the default location of the cl file - can be over ridden by the first command line arguement
   char debugFile[] = "./debug";		// the debug file location
   char msg[255];						// space for the debug message where a program variable is to be written to the debug file
   char * clFile;						// the cl file string actually used (either argv or kernFile)
   void * openHandle;					// the return value to clOpen
   int	bytesPerCore = 16;				// how many bytes we want each core to process
   int workItems = 32; 					// the total number workItems (threads sent to the Epiphany)
   int	i;								// loop counter
   cl_uchar * wrkArea1D;					// the pointer to the malloc'd space (bytesPerCore * workItems)
   cl_uchar * wrkArea2D;

//	These variables will be useful when I get getDeviceInfo and getBuildInfo working
//   unsigned int space;					// the space required for clGetInfo style calls
//   cl_build_status bOk;					// the return value from clGetProgramBuildInfo
//   size_t computeUnits;					// return from GetDeviceInfo
//   char strInfo[20];

   FILE * pFile;

   if(argc == 2)
	   clFile = argv[1];
   else
	   clFile = kernFile;

   pFile = fopen(clFile, "r");
   if (pFile == NULL)
   {
	   printf("Opening the Kernel file: %s produced an error(%d). Make sure that the source code variable kern has a valid path to the cl code and that the code is readable.\n", clFile, errno);
	   exit(0);
   }
   else
	   fclose(pFile);	// only open the file to check that it is there and readable


   debugReset(debugFile);
   debugdebug(debugFile, (char*)"How many devices do we have?\n");


   sprintf(msg, "About to malloc wrkArea1D: %d\n", workItems * bytesPerCore);
   debugdebug(debugFile, msg);
   wrkArea1D = (cl_uchar*) clmalloc(stdacc, workItems * bytesPerCore, 0);
   wrkArea2D = (cl_uchar*) clmalloc(stdacc, workItems * bytesPerCore, 0);

   for (i=0; i < workItems * bytesPerCore; i++)
	   wrkArea2D[i] = wrkArea1D[i] = 0;

   sprintf(msg, "Well malloc worked! Opening kernel file:%s\n", clFile);
   debugdebug(debugFile, msg);
   openHandle = clopen(stdacc, clFile, CLLD_NOW);
   // open the standard accellerator context (i.e. the Epiphany chip, reading in the .cl file and compiling it immediately

   clndrange_t ndr1D = clndrange_init1d(NULL,							// global offset (always zero)
		   	   	   	   	   	   	   ((size_t)workItems),					// total number of threads (get_global_id will return 0 to workItems
		   	   	   	   	   	   	   ((size_t)bytesPerCore));				// How many bytes do we tell the kernel to process via get_local_size(0)
   exKernel(openHandle, &ndr1D, (char*)"k_init1D", workItems, bytesPerCore, wrkArea1D, debugFile);

   clndrange_t ndr2D = clndrange_init2d(NULL,							// global offset (always zero)
		   	   	   	   	   	   	   ((size_t)workItems),					// total number of threads (get_global_id will return 0 to workItems
		   	   	   	   	   	   	   ((size_t)(bytesPerCore/4)),			// How many bytes do we tell the kernel to process via get_local_size(0)
		   	   	   	   	   	   	   NULL,								// another useless global offset
		   	   	   	   	   	   	   ((size_t)workItems),					// a value that does not seem to do anything useful
		   	   	   	   	   	   	   ((size_t)(bytesPerCore/4)));			// How many rows to process per call returned by get_local_size(1)

   exKernel(openHandle, &ndr2D, (char*)"k_init2D", workItems, bytesPerCore, wrkArea2D, debugFile);


   // ============================================================================================================
   // show the results
   // ============================================================================================================

   printf("The 1D data:\n");
   for(i=0; i < workItems * bytesPerCore; i++)
   {
	   printf("%u\t", wrkArea1D[i]);
	   if(((i+1) % bytesPerCore) == 0)
		   printf("\n");
   }

   printf("The 2D data:\n");
   for(i=0; i < workItems * bytesPerCore; i++)
   {
	   printf("%u\t", wrkArea2D[i]);
	   if(((i+1) % bytesPerCore) == 0)
		   printf("\n");
   }

   clfree(wrkArea1D);
   clfree(wrkArea2D);

   return 0;
}
Example #6
0
/*
 * Matrix-vector multiply demo.
 *
 * Builds an n x n matrix aa and vectors b and c in device-sharable
 * memory, runs "matvecmult_kern" from matvecmult.cl on device 'devnum'
 * of the default GPU context (CPU context if there is no GPU), and
 * prints b[i] and c[i] for every row.
 *
 * Returns 0 on success, 1 if the context, program, kernel or a buffer
 * is unavailable.
 */
int main()
{
   cl_uint n = 64;

#if(1)

   /* use default contexts, if no GPU use CPU */
   CLCONTEXT* cp = (stdgpu)? stdgpu : stdcpu;
   if (!cp) { fprintf(stderr,"error: no OpenCL context\n"); return 1; }

   unsigned int devnum = 0;

   void* clh = clopen(cp,"matvecmult.cl",CLLD_NOW);
   if (!clh) { fprintf(stderr,"error: cannot open matvecmult.cl\n"); return 1; }

   cl_kernel krn = clsym(cp,clh,"matvecmult_kern",0);
   if (!krn) { fprintf(stderr,"error: no OpenCL kernel\n"); return 1; }

   /* allocate OpenCL device-sharable memory */
   cl_float* aa = (float*)clmalloc(cp,n*n*sizeof(cl_float),0);
   cl_float* b = (float*)clmalloc(cp,n*sizeof(cl_float),0);
   cl_float* c = (float*)clmalloc(cp,n*sizeof(cl_float),0);
   if (!aa || !b || !c) { fprintf(stderr,"error: clmalloc failed\n"); return 1; }

   /* define the computational domain and workgroup size */
   clndrange_t ndr = clndrange_init1d( 0, n, 64);

   /* initialize matrix aa[] and vector b[], zero c[] */
   int i,j; 
   for(i=0;i<n;i++) for(j=0;j<n;j++) aa[i*n+j] = 1.1f*i*j;
   for(i=0;i<n;i++) b[i] = 2.2f*i;
   for(i=0;i<n;i++) c[i] = 0.0f;

   /* non-blocking sync of aa and b to device memory (copy to GPU)*/
   clmsync(cp,devnum,aa,CL_MEM_DEVICE|CL_EVENT_NOWAIT);
   clmsync(cp,devnum,b,CL_MEM_DEVICE|CL_EVENT_NOWAIT);

   /* set the kernel arguments */
   clarg_set(cp,krn,0,n);
   clarg_set_global(cp,krn,1,aa);
   clarg_set_global(cp,krn,2,b);
   clarg_set_global(cp,krn,3,c);

   /* non-blocking fork of the OpenCL kernel to execute on the GPU */
   clfork(cp,devnum,krn,&ndr,CL_EVENT_NOWAIT);

   /* non-blocking sync vector c to host memory (copy back to host);
      issued on devnum like every other operation in this sequence */
   clmsync(cp,devnum,c,CL_MEM_HOST|CL_EVENT_NOWAIT);

   /* force execution of operations in command queue (non-blocking call) */
   clflush(cp,devnum,0);

   /* block on completion of operations in command queue */
   clwait(cp,devnum,CL_ALL_EVENT);

   for(i=0;i<n;i++) printf("%d %f %f\n",i,b[i],c[i]);

   clfree(aa);
   clfree(b);
   clfree(c);

   clclose(cp,clh);

#endif

   return 0;
}
/*
 * Matrix-vector multiply demo with an image2d lookup table.
 *
 * Same data flow as a plain matrix-vector multiply (aa and b go to the
 * device, c comes back), plus a 24x24 cl_float4 table that is allocated
 * detached, shaped as an image2d with clmctl() and then attached to the
 * context before being passed as kernel argument 4.
 *
 * On FreeBSD the kernel is loaded from matvecmult_special.cl at run
 * time; elsewhere clsym() is called with a null program handle.
 *
 * Returns 0 on success, 1 if the context, program, kernel or a buffer
 * is unavailable.
 */
int main()
{
   cl_uint n = 1024;

   /* use default contexts, if no GPU use CPU */
   CLCONTEXT* cp = (stdgpu)? stdgpu : stdcpu;
   if (!cp) { fprintf(stderr,"error: no OpenCL context\n"); return 1; }

   unsigned int devnum = 0;

#ifdef __FreeBSD__
   void* clh = clopen(cp,"matvecmult_special.cl",CLLD_NOW);
   if (!clh) { fprintf(stderr,"error: cannot open matvecmult_special.cl\n"); return 1; }
   cl_kernel krn = clsym(cp,clh,"matvecmult_special_kern",0);
#else
   cl_kernel krn = clsym(cp,0,"matvecmult_special_kern",0);
#endif

   if (!krn) { fprintf(stderr,"error: no OpenCL kernel\n"); return 1; }

   /* allocate OpenCL device-sharable memory */
   cl_float* aa = (float*)clmalloc(cp,n*n*sizeof(cl_float),0);
   cl_float* b = (float*)clmalloc(cp,n*sizeof(cl_float),0);
   cl_float* c = (float*)clmalloc(cp,n*sizeof(cl_float),0);
   if (!aa || !b || !c) { fprintf(stderr,"error: clmalloc failed\n"); return 1; }

   /* initialize matrix aa[] and vector b[], zero c[] */
   int i,j; 
   for(i=0;i<n;i++) for(j=0;j<n;j++) aa[i*n+j] = 1.1f*i*j;
   for(i=0;i<n;i++) b[i] = 2.2f*i;
   for(i=0;i<n;i++) c[i] = 0.0f;


   /***
    *** Create a image2d allocation to be used as a read-only table.
    *** The table will consist of a 24x24 array of float coefficients.
    *** The clmctl() call is used to set the type and shape of the table.
    *** Note that we will only use the first component of the float4 elements.
    ***/	
   cl_float4* table 
      = (cl_float4*)clmalloc(cp,24*24*sizeof(cl_float4),CL_MEM_DETACHED);
   if (!table) { fprintf(stderr,"error: clmalloc failed\n"); return 1; }
   clmctl(table,CL_MCTL_SET_IMAGE2D,24,24,0);
   clmattach(cp,table);

   /* initialize the table to some contrived values */
   for(i=0;i<24;i++) for(j=0;j<24;j++) table[i*24+j].x = 0.125f*(i-j);


   /* define the computational domain and workgroup size */
   clndrange_t ndr = clndrange_init1d( 0, n, 64);

   /* non-blocking sync of aa, b and the table to device memory (copy to GPU)*/
   clmsync(cp,devnum,aa,CL_MEM_DEVICE|CL_EVENT_NOWAIT);
   clmsync(cp,devnum,b,CL_MEM_DEVICE|CL_EVENT_NOWAIT);
   clmsync(cp,devnum,table,CL_MEM_DEVICE|CL_EVENT_NOWAIT);

   /* set the kernel arguments */
   clarg_set(cp,krn,0,n);
   clarg_set_global(cp,krn,1,aa);
   clarg_set_global(cp,krn,2,b);
   clarg_set_global(cp,krn,3,c);
   clarg_set_global(cp,krn,4,table);

   /* non-blocking fork of the OpenCL kernel to execute on the GPU */
   clfork(cp,devnum,krn,&ndr,CL_EVENT_NOWAIT);

   /* non-blocking sync vector c to host memory (copy back to host);
      issued on devnum like every other operation in this sequence */
   clmsync(cp,devnum,c,CL_MEM_HOST|CL_EVENT_NOWAIT);

   /* block on completion of operations in command queue */
   clwait(cp,devnum,CL_ALL_EVENT);

   for(i=0;i<n;i++) printf("%d %f %f\n",i,b[i],c[i]);

   clfree(aa);
   clfree(b);
   clfree(c);
   clfree(table); /* the table is a clmalloc allocation too */

#ifdef __FreeBSD__
   clclose(cp,clh);
#endif

   return 0;
}
/*
 * Outer-product demo showing run-time compilation with compiler options.
 *
 * Opens outerprod.cl without building it, builds it with "-D COEF=2",
 * runs "outerprod_kern" over vectors a and b on device 'devnum' of the
 * default accelerator context (CPU context if there is no accelerator)
 * and prints a[i], b[i] and c[i] for every element.
 *
 * Returns 0 on success; exits with -1 if the kernel is missing and
 * returns 1 if the context, program or a buffer is unavailable.
 */
int main()
{
   cl_uint n = 1024;

   /* use default contexts, if no ACCELERATOR use CPU */
   CLCONTEXT* cp = (stdacc)? stdacc : stdcpu;
   if (!cp) { fprintf(stderr,"error: no OpenCL context\n"); return 1; }

   unsigned int devnum = 0;

   /******************************************************************
    *** this example requires the .cl file to be available at run-time
    *** and shows how to pass compiler options to the OCL compiler
    ******************************************************************/

   void* clh = clopen(cp,"outerprod.cl",CLLD_NOBUILD);
   if (!clh) { fprintf(stderr,"error: cannot open outerprod.cl\n"); return 1; }
   clbuild(cp,clh,"-D COEF=2", 0);
   cl_kernel krn = clsym(cp,clh,"outerprod_kern",0);

   if (!krn) { fprintf(stderr,"error: no OpenCL kernel\n"); exit(-1); }

   /* allocate OpenCL device-sharable memory */
   cl_float* a = (float*)clmalloc(cp,n*sizeof(cl_float),0);
   cl_float* b = (float*)clmalloc(cp,n*sizeof(cl_float),0);
   cl_float* c = (float*)clmalloc(cp,n*sizeof(cl_float),0);
   if (!a || !b || !c) { fprintf(stderr,"error: clmalloc failed\n"); return 1; }

   /* initialize vectors a[] and b[], zero c[] */
   int i; 
   for(i=0;i<n;i++) a[i] = 1.1f*i;
   for(i=0;i<n;i++) b[i] = 2.2f*i;
   for(i=0;i<n;i++) c[i] = 0.0f;

   /* blocking sync (CL_EVENT_WAIT) of vectors a and b to device memory */
   clmsync(cp,devnum,a,CL_MEM_DEVICE|CL_EVENT_WAIT);
   clmsync(cp,devnum,b,CL_MEM_DEVICE|CL_EVENT_WAIT);

   /* define the computational domain and workgroup size */
   clndrange_t ndr = clndrange_init1d( 0, n, 16);

   /* set the kernel arguments */
   clarg_set_global(cp,krn,0,a);
   clarg_set_global(cp,krn,1,b);
   clarg_set_global(cp,krn,2,c);

   /* non-blocking fork of the OpenCL kernel to execute on the device */
   clfork(cp,devnum,krn,&ndr,CL_EVENT_NOWAIT);

   /* block on completion of operations in command queue */
   clwait(cp,devnum,CL_ALL_EVENT);

   /* non-blocking sync vector c to host memory (copy back to host);
      issued on devnum like every other operation in this sequence */
   clmsync(cp,devnum,c,CL_MEM_HOST|CL_EVENT_NOWAIT);

   /* block on completion of operations in command queue */
   clwait(cp,devnum,CL_ALL_EVENT);

   for(i=0;i<n;i++) printf("%d %f %f %f\n",i,a[i],b[i],c[i]);

   clfree(a);
   clfree(b);
   clfree(c);

   clclose(cp,clh);

   return 0;
}