int main(int argc, char** argv) { int step,burst; int nparticle = 8192; /* MUST be a nice power of two for simplicity */ int nstep = 500; int nburst = 20; /* MUST divide the value of nstep without remainder */ int nthread = 64; /* chosen for ATI Radeon HD 5870 */ float dt = 0.0001; float eps = 0.0001; cl_float4* pos1 = (cl_float4*)clmalloc(stdgpu,nparticle*sizeof(cl_float4),0); cl_float4* pos2 = (cl_float4*)clmalloc(stdgpu,nparticle*sizeof(cl_float4),0); cl_float4* vel = (cl_float4*)clmalloc(stdgpu,nparticle*sizeof(cl_float4),0); nbody_init(nparticle,pos1,vel); void* h = clopen(stdgpu,"nbody_kern.cl",CLLD_NOW); cl_kernel krn = clsym(stdgpu,h,"nbody_kern",CLLD_NOW); clndrange_t ndr = clndrange_init1d(0,nparticle,nthread); clarg_set(stdgpu,krn,0,dt); clarg_set(stdgpu,krn,1,eps); clarg_set_global(stdgpu,krn,4,vel); clarg_set_local(stdgpu,krn,5,nthread*sizeof(cl_float4)); clmsync(stdgpu,0,pos1,CL_MEM_DEVICE|CL_EVENT_NOWAIT); clmsync(stdgpu,0,vel,CL_MEM_DEVICE|CL_EVENT_NOWAIT); for(step=0; step<nstep; step+=nburst) { for(burst=0; burst<nburst; burst+=2) { clarg_set_global(stdgpu,krn,2,pos1); clarg_set_global(stdgpu,krn,3,pos2); clfork(stdgpu,0,krn,&ndr,CL_EVENT_NOWAIT); clarg_set_global(stdgpu,krn,2,pos2); clarg_set_global(stdgpu,krn,3,pos1); clfork(stdgpu,0,krn,&ndr,CL_EVENT_NOWAIT); } clmsync(stdgpu,0,pos1,CL_MEM_HOST|CL_EVENT_NOWAIT); clwait(stdgpu,0,CL_KERNEL_EVENT|CL_MEM_EVENT); } nbody_output(nparticle,pos1,vel); clclose(stdgpu,h); clfree(pos1); clfree(pos2); clfree(vel); }
/*
 * Allocates one random_seed per worker with clmalloc() in the given
 * context and fills the x and y fields of each entry from rand().
 *
 * context ............ context handed to clmalloc().
 * number_of_workers .. number of seed entries to create.
 *
 * Returns the seed array, or a null pointer when clmalloc() returns none.
 * NOTE(review): the result is presumably released by the caller with
 * clfree() -- confirm against the call sites.
 */
random_seed* generate_seeds(CLCONTEXT* context, unsigned int number_of_workers)
{
    random_seed *seeds = (random_seed*)clmalloc(context, number_of_workers*sizeof(random_seed), 0);

    /* nothing to fill in when the allocation failed; hand the null back */
    if (!seeds)
        return seeds;

    /* the index is unsigned to match the count and avoid a signed/unsigned compare */
    for (unsigned int i = 0; i < number_of_workers; i++) {
        seeds[i].x = rand();
        seeds[i].y = rand();
    }

    return seeds;
}
/*
 * Allocates or resets raster data in host memory, returning a
 * pointer to it. The operation is performed within the given
 * OpenCL context. Raster data is initialized to the given value.
 *
 * CONTEXT *ctx ........... an active OpenCL context.
 * int nrows .............. the number of rows in the DEM.
 * int ncols .............. the number of columns in the DEM.
 * RASTER_MAP_TYPE rtype .. DEM element data type; must be FCELL_TYPE.
 * FCELL init_value ....... initial value used for resetting or allocating
 *                          raster data. If NAN is given, the raster is
 *                          nullified.
 * void *alloc_ptr ........ a pointer to previously allocated memory.
 *                          If it is valid (non-NULL), the memory is just
 *                          set to 'init_value' without new memory
 *                          allocation.
 *
 * Returns the initialized buffer: 'alloc_ptr' itself when one was
 * given, otherwise the freshly allocated block.
 */
FCELL* alloc_raster (CONTEXT *ctx, const int nrows, const int ncols,
                     RASTER_MAP_TYPE rtype, FCELL init_value, void *alloc_ptr)
{
    FCELL *dem_device;
    int i, j;

    if (rtype != FCELL_TYPE)
    {
        G_fatal_error ("Wrong element data type in 'alloc_raster'");
        return NULL;
    }

    if (alloc_ptr == NULL)
    {
        G_message ("Allocating NULL raster data into memory ...");
        /* allocate raster DEM data; the byte count is computed in size_t
         * so a large nrows * ncols does not overflow int */
        dem_device = (FCELL *) clmalloc (ctx,
                                         (size_t) nrows * (size_t) ncols
                                         * G_raster_size (rtype),
                                         0);
        if (dem_device == NULL)
        {
            G_fatal_error ("Unable to allocate memory for raster data");
            return NULL;
        }
    }
    else
    {
        G_message ("Resetting memory ...");
        dem_device = alloc_ptr;
    }

    /*
     * Initialize the whole piece of memory.
     * NaN never compares equal to anything (itself included), so the
     * request to nullify has to be detected with isnan(), not '=='.
     */
    if (isnan (init_value))
    {
        G_set_f_null_value (dem_device, nrows * ncols);
    }
    else
    {
        for (i = 0; i < nrows; i++)
        {
            for (j = 0; j < ncols; j++)
            {
                dem_device[i * ncols + j] = init_value;
            }
        }
    }

    /* Return a pointer to the allocated resources */
    return dem_device;
}
/*
 * Loads raster data to host memory and returns a pointer to it.
 * The operation is performed within the current context.
 *
 * CONTEXT *ctx ........... an active OpenCL context.
 * int nrows .............. the number of rows in the DEM.
 * int ncols .............. the number of columns in the DEM.
 * int infd ............... file descriptor to read from the DEM.
 * RASTER_MAP_TYPE rtype .. DEM element data type; must be FCELL_TYPE.
 *
 * Returns the buffer obtained from clmalloc(), filled row by row with
 * the data read through G_get_raster_row().
 */
FCELL* load_raster (CONTEXT *ctx, const int nrows, const int ncols,
                    const int infd, RASTER_MAP_TYPE rtype)
{
    int row;

    if (rtype != FCELL_TYPE)
    {
        G_fatal_error ("Wrong element data type in 'load_raster'");
        return NULL;
    }

    /* sizes are computed in size_t so large rasters do not overflow int */
    size_t row_bytes = (size_t) ncols * G_raster_size (rtype);
    size_t total_bytes = (size_t) nrows * row_bytes;

    /* temporary buffer holding one row as read from the map */
    FCELL *rastbuf = malloc (row_bytes);
    if (rastbuf == NULL && row_bytes != 0)
    {
        G_fatal_error ("Unable to allocate memory for the row buffer");
        return NULL;
    }

    G_message ("Loading raster data into memory ...");

    /* allocate raster DEM data */
    FCELL *dem_device = (FCELL *) clmalloc (ctx, total_bytes, 0);
    if (dem_device == NULL && total_bytes != 0)
    {
        free (rastbuf);
        G_fatal_error ("Unable to allocate memory for raster data");
        return NULL;
    }

    /* Load input map in device memory row by row */
    for (row = 0; row < nrows; row++)
    {
        /* display completion percentage */
        G_percent (row, nrows, 2);

        /* read input map */
        if (G_get_raster_row (infd, rastbuf, row, rtype) < 0)
            G_fatal_error ("Unable to read from raster map");

        /* copy this row to the device buffer; memcpy always returns its
         * destination, so there is no failure value to test */
        memcpy (dem_device + (size_t) row * (size_t) ncols, rastbuf, row_bytes);
    }

    /* free buffers */
    free (rastbuf);

    /* Return a pointer to the allocated resources */
    return dem_device;
}
int main (int argc, char * argv[]) { char kernFile[] = "../src/nD.cl"; // the default location of the cl file - can be over ridden by the first command line arguement char debugFile[] = "./debug"; // the debug file location char msg[255]; // space for the debug message where a program variable is to be written to the debug file char * clFile; // the cl file string actually used (either argv or kernFile) void * openHandle; // the return value to clOpen int bytesPerCore = 16; // how many bytes we want each core to process int workItems = 32; // the total number workItems (threads sent to the Epiphany) int i; // loop counter cl_uchar * wrkArea1D; // the pointer to the malloc'd space (bytesPerCore * workItems) cl_uchar * wrkArea2D; // These variables will be useful when I get getDeviceInfo and getBuildInfo working // unsigned int space; // the space required for clGetInfo style calls // cl_build_status bOk; // the return value from clGetProgramBuildInfo // size_t computeUnits; // return from GetDeviceInfo // char strInfo[20]; FILE * pFile; if(argc == 2) clFile = argv[1]; else clFile = kernFile; pFile = fopen(clFile, "r"); if (pFile == NULL) { printf("Opening the Kernel file: %s produced an error(%d). Make sure that the source code variable kern has a valid path to the cl code and that the code is readable.\n", clFile, errno); exit(0); } else fclose(pFile); // only open the file to check that it is there and readable debugReset(debugFile); debugdebug(debugFile, (char*)"How many devices do we have?\n"); sprintf(msg, "About to malloc wrkArea1D: %d\n", workItems * bytesPerCore); debugdebug(debugFile, msg); wrkArea1D = (cl_uchar*) clmalloc(stdacc, workItems * bytesPerCore, 0); wrkArea2D = (cl_uchar*) clmalloc(stdacc, workItems * bytesPerCore, 0); for (i=0; i < workItems * bytesPerCore; i++) wrkArea2D[i] = wrkArea1D[i] = 0; sprintf(msg, "Well malloc worked! 
Opening kernel file:%s\n", clFile); debugdebug(debugFile, msg); openHandle = clopen(stdacc, clFile, CLLD_NOW); // open the standard accellerator context (i.e. the Epiphany chip, reading in the .cl file and compiling it immediately clndrange_t ndr1D = clndrange_init1d(NULL, // global offset (always zero) ((size_t)workItems), // total number of threads (get_global_id will return 0 to workItems ((size_t)bytesPerCore)); // How many bytes do we tell the kernel to process via get_local_size(0) exKernel(openHandle, &ndr1D, (char*)"k_init1D", workItems, bytesPerCore, wrkArea1D, debugFile); clndrange_t ndr2D = clndrange_init2d(NULL, // global offset (always zero) ((size_t)workItems), // total number of threads (get_global_id will return 0 to workItems ((size_t)(bytesPerCore/4)), // How many bytes do we tell the kernel to process via get_local_size(0) NULL, // another useless global offset ((size_t)workItems), // a value that does not seem to do anything useful ((size_t)(bytesPerCore/4))); // How many rows to process per call returned by get_local_size(1) exKernel(openHandle, &ndr2D, (char*)"k_init2D", workItems, bytesPerCore, wrkArea2D, debugFile); // ============================================================================================================ // show the results // ============================================================================================================ printf("The 1D data:\n"); for(i=0; i < workItems * bytesPerCore; i++) { printf("%u\t", wrkArea1D[i]); if(((i+1) % bytesPerCore) == 0) printf("\n"); } printf("The 2D data:\n"); for(i=0; i < workItems * bytesPerCore; i++) { printf("%u\t", wrkArea2D[i]); if(((i+1) % bytesPerCore) == 0) printf("\n"); } clfree(wrkArea1D); clfree(wrkArea2D); return 0; }
int main() { cl_uint n = 64; #if(1) /* use default contexts, if no GPU use CPU */ CLCONTEXT* cp = (stdgpu)? stdgpu : stdcpu; unsigned int devnum = 0; void* clh = clopen(cp,"matvecmult.cl",CLLD_NOW); cl_kernel krn = clsym(cp,clh,"matvecmult_kern",0); /* allocate OpenCL device-sharable memory */ cl_float* aa = (float*)clmalloc(cp,n*n*sizeof(cl_float),0); cl_float* b = (float*)clmalloc(cp,n*sizeof(cl_float),0); cl_float* c = (float*)clmalloc(cp,n*sizeof(cl_float),0); clndrange_t ndr = clndrange_init1d( 0, n, 64); /* initialize vectors a[] and b[], zero c[] */ int i,j; for(i=0;i<n;i++) for(j=0;j<n;j++) aa[i*n+j] = 1.1f*i*j; for(i=0;i<n;i++) b[i] = 2.2f*i; for(i=0;i<n;i++) c[i] = 0.0f; /* define the computational domain and workgroup size */ //clndrange_t ndr = clndrange_init1d( 0, n, 64); /* non-blocking sync vectors a and b to device memory (copy to GPU)*/ clmsync(cp,devnum,aa,CL_MEM_DEVICE|CL_EVENT_NOWAIT); clmsync(cp,devnum,b,CL_MEM_DEVICE|CL_EVENT_NOWAIT); /* set the kernel arguments */ clarg_set(cp,krn,0,n); clarg_set_global(cp,krn,1,aa); clarg_set_global(cp,krn,2,b); clarg_set_global(cp,krn,3,c); /* non-blocking fork of the OpenCL kernel to execute on the GPU */ clfork(cp,devnum,krn,&ndr,CL_EVENT_NOWAIT); /* non-blocking sync vector c to host memory (copy back to host) */ clmsync(cp,0,c,CL_MEM_HOST|CL_EVENT_NOWAIT); /* force execution of operations in command queue (non-blocking call) */ clflush(cp,devnum,0); /* block on completion of operations in command queue */ clwait(cp,devnum,CL_ALL_EVENT); for(i=0;i<n;i++) printf("%d %f %f\n",i,b[i],c[i]); clfree(aa); clfree(b); clfree(c); clclose(cp,clh); #endif }
int main() { cl_uint n = 1024; /* use default contexts, if no GPU use CPU */ CLCONTEXT* cp = (stdgpu)? stdgpu : stdcpu; unsigned int devnum = 0; #ifdef __FreeBSD__ void* clh = clopen(cp,"matvecmult_special.cl",CLLD_NOW); cl_kernel krn = clsym(cp,clh,"matvecmult_special_kern",0); #else cl_kernel krn = clsym(cp,0,"matvecmult_special_kern",0); #endif /* allocate OpenCL device-sharable memory */ cl_float* aa = (float*)clmalloc(cp,n*n*sizeof(cl_float),0); cl_float* b = (float*)clmalloc(cp,n*sizeof(cl_float),0); cl_float* c = (float*)clmalloc(cp,n*sizeof(cl_float),0); /* initialize vectors a[] and b[], zero c[] */ int i,j; for(i=0;i<n;i++) for(j=0;j<n;j++) aa[i*n+j] = 1.1f*i*j; for(i=0;i<n;i++) b[i] = 2.2f*i; for(i=0;i<n;i++) c[i] = 0.0f; /*** *** Create a image2d allocation to be used as a read-only table. *** The table will consist of a 24x24 array of float coefficients. *** The clmctl() call is used to set the type and shape of the table. *** Note that we will only use the first component of the float4 elements. 
***/ cl_float4* table = (cl_float4*)clmalloc(cp,24*24*sizeof(cl_float4),CL_MEM_DETACHED); clmctl(table,CL_MCTL_SET_IMAGE2D,24,24,0); clmattach(cp,table); /* initialize the table to some contrived values */ for(i=0;i<24;i++) for(j=0;j<24;j++) table[i*24+j].x = 0.125f*(i-j); /* define the computational domain and workgroup size */ clndrange_t ndr = clndrange_init1d( 0, n, 64); /* non-blocking sync vectors a and b to device memory (copy to GPU)*/ clmsync(cp,devnum,aa,CL_MEM_DEVICE|CL_EVENT_NOWAIT); clmsync(cp,devnum,b,CL_MEM_DEVICE|CL_EVENT_NOWAIT); clmsync(cp,devnum,table,CL_MEM_DEVICE|CL_EVENT_NOWAIT); /* set the kernel arguments */ clarg_set(cp,krn,0,n); clarg_set_global(cp,krn,1,aa); clarg_set_global(cp,krn,2,b); clarg_set_global(cp,krn,3,c); clarg_set_global(cp,krn,4,table); /* non-blocking fork of the OpenCL kernel to execute on the GPU */ clfork(cp,devnum,krn,&ndr,CL_EVENT_NOWAIT); /* non-blocking sync vector c to host memory (copy back to host) */ clmsync(cp,0,c,CL_MEM_HOST|CL_EVENT_NOWAIT); /* block on completion of operations in command queue */ clwait(cp,devnum,CL_ALL_EVENT); for(i=0;i<n;i++) printf("%d %f %f\n",i,b[i],c[i]); clfree(aa); clfree(b); clfree(c); #ifdef __FreeBSD__ clclose(cp,clh); #endif }
int main() { cl_uint n = 1024; /* use default contexts, if no ACCELERATOR use CPU */ CLCONTEXT* cp = (stdacc)? stdacc : stdcpu; unsigned int devnum = 0; /****************************************************************** *** this example requires the .cl file to be available at run-time *** and shows how to pass compiler options to the OCL compiler ******************************************************************/ void* clh = clopen(cp,"outerprod.cl",CLLD_NOBUILD); clbuild(cp,clh,"-D COEF=2", 0); cl_kernel krn = clsym(cp,clh,"outerprod_kern",0); if (!krn) { fprintf(stderr,"error: no OpenCL kernel\n"); exit(-1); } /* allocate OpenCL device-sharable memory */ cl_float* a = (float*)clmalloc(cp,n*sizeof(cl_float),0); cl_float* b = (float*)clmalloc(cp,n*sizeof(cl_float),0); cl_float* c = (float*)clmalloc(cp,n*sizeof(cl_float),0); /* initialize vectors a[] and b[], zero c[] */ int i; for(i=0;i<n;i++) a[i] = 1.1f*i; for(i=0;i<n;i++) b[i] = 2.2f*i; for(i=0;i<n;i++) c[i] = 0.0f; /* non-blocking sync vectors a and b to device memory (copy to GPU)*/ clmsync(cp,devnum,a,CL_MEM_DEVICE|CL_EVENT_WAIT); clmsync(cp,devnum,b,CL_MEM_DEVICE|CL_EVENT_WAIT); /* define the computational domain and workgroup size */ clndrange_t ndr = clndrange_init1d( 0, n, 16); /* set the kernel arguments */ clarg_set_global(cp,krn,0,a); clarg_set_global(cp,krn,1,b); clarg_set_global(cp,krn,2,c); /* non-blocking fork of the OpenCL kernel to execute on the GPU */ clfork(cp,devnum,krn,&ndr,CL_EVENT_NOWAIT); /* block on completion of operations in command queue */ clwait(cp,devnum,CL_ALL_EVENT); /* non-blocking sync vector c to host memory (copy back to host) */ clmsync(cp,0,c,CL_MEM_HOST|CL_EVENT_NOWAIT); /* block on completion of operations in command queue */ clwait(cp,devnum,CL_ALL_EVENT); for(i=0;i<n;i++) printf("%d %f %f %f\n",i,a[i],b[i],c[i]); clfree(a); clfree(b); clfree(c); clclose(cp,clh); }