int main(int argc, const char * argv[]) { static mach_timebase_info_data_t sTimebaseInfo; int seq = 0; int cpu = 0; // insert code here... int* data = (int*) malloc(sizeof(cl_int)*RANGE); for (int i = 0; i < RANGE; i ++) { data[i] = START + i; //printf("%d",data[i]); } int* out = (int*) malloc(sizeof(cl_int)*RANGE); printf("Hello, World!\n"); if (argc > 1){ if (strncmp(argv[1], "seq", 3) == 0) { seq = TRUE; } else if (strncmp(argv[1], "cpu", 3) == 0) { cpu = 1; } } uint64_t start = mach_absolute_time(); if (seq) euler_totient(data, out); else { dispatch_queue_t queue; if (cpu) queue = gcl_create_dispatch_queue(CL_DEVICE_TYPE_CPU, NULL); else queue = gcl_create_dispatch_queue(CL_DEVICE_TYPE_GPU, NULL); void *mem_in = gcl_malloc(sizeof(cl_int)*RANGE, data, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR); void *mem_out = gcl_malloc(sizeof(cl_int)*RANGE, NULL, CL_MEM_WRITE_ONLY); dispatch_sync(queue, ^{ size_t wgs; gcl_get_kernel_block_workgroup_info(euler_totient_kernel, CL_KERNEL_WORK_GROUP_SIZE, sizeof(wgs), &wgs, NULL); cl_ndrange range = { 1, {0, 0, 0}, {RANGE, 0, 0}, {wgs, 0, 0} }; euler_totient_kernel(&range,(cl_int*)mem_in, (cl_int*)mem_out); gcl_memcpy(out, mem_out, sizeof(cl_float) * RANGE); }); }
int main (int argc, const char * argv[]) { int i, j; char name[128]; float *a, *b, *c; void *gpu_a, *gpu_b, *gpu_c; int ARRAY_SIZE = 128; dispatch_queue_t queue = gcl_create_dispatch_queue(CL_DEVICE_TYPE_GPU, NULL); if (queue == NULL) { queue = gcl_create_dispatch_queue(CL_DEVICE_TYPE_CPU, NULL); fprintf(stdout, "Created a dispatch queue failed\n", name); } cl_device_id gpu = gcl_get_device_id_with_dispatch_queue(queue); clGetDeviceInfo(gpu, CL_DEVICE_NAME, 128, name, NULL); fprintf(stdout, "Created a dispatch queue using the %s\n", name); if(argc >= 2) ARRAY_SIZE = atoi(argv[1]); a = (float *)malloc(ARRAY_SIZE*ARRAY_SIZE*sizeof(cl_float)); if(a == NULL) fprintf(stderr,"allocating array a failed\n"); for(i = 0; i < ARRAY_SIZE*ARRAY_SIZE; i++) a[i] = (cl_float)rand(); b = (float *)malloc(ARRAY_SIZE*ARRAY_SIZE*sizeof(cl_float)); if(b == NULL) fprintf(stderr,"allocating array b failed\n"); for(i = 0; i < ARRAY_SIZE*ARRAY_SIZE; i++) b[i] = (cl_float)rand(); c = (float *)malloc(ARRAY_SIZE*ARRAY_SIZE*sizeof(cl_float)); if(c == NULL) fprintf(stderr,"allocating array c failed\n"); gpu_a = gcl_malloc(sizeof(cl_float) * ARRAY_SIZE*ARRAY_SIZE, a, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR); gpu_b = gcl_malloc(sizeof(cl_float) * ARRAY_SIZE*ARRAY_SIZE, b, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR); gpu_c = gcl_malloc(sizeof(cl_float) * ARRAY_SIZE*ARRAY_SIZE, NULL, CL_MEM_WRITE_ONLY); dispatch_sync(queue, ^{ cl_ndrange range = { 2, {0, 0, 0},{ARRAY_SIZE, ARRAY_SIZE, 0},{16, 16, 0}}; matrix_mul_kernel(&range,(cl_float*)gpu_a,(cl_float*)gpu_b,(cl_float*)gpu_c,(cl_int)ARRAY_SIZE); gcl_memcpy(c, gpu_c, sizeof(cl_float) * ARRAY_SIZE*ARRAY_SIZE); });
int main(int argc, const char * argv[]) { int i; char deviceName[128]; if (init() != true) { return 1; } SDL_Delay(2000); // try to get the dispatch queue for the GPU dispatch_queue_t queue = gcl_create_dispatch_queue(CL_DEVICE_TYPE_GPU, NULL); // in the event that the system does not have an OpenCL GPU, we can use the CPU instead if (queue == NULL) { queue = gcl_create_dispatch_queue(CL_DEVICE_TYPE_CPU, NULL); } // let's print some data on the device we're usng! かわいいです! cl_device_id gpu = gcl_get_device_id_with_dispatch_queue(queue); clGetDeviceInfo(gpu, CL_DEVICE_NAME, 128, deviceName, NULL); fprintf(stdout, "Created a dispatch queue using the %s\n", deviceName); // let's hardcode some handy test data that's easy to understand float* test_in = (float*)malloc(sizeof(cl_float) * NUM_VALUES); for (i = 0; i < NUM_VALUES; i++) { test_in[i] = (cl_float)i; } // Once the computation using the CL is done, we'll need space in RAM for the output float* test_out = (float*)malloc(sizeof(cl_float) * NUM_VALUES); // Now we're going to allocate the buffers again in the OpenCL device's memory space // CL_MEM_COPY_HOST_PTR will copy the values of test_in to mem_in void* mem_in = gcl_malloc(sizeof(cl_float) * NUM_VALUES, test_in, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR); void* mem_out = gcl_malloc(sizeof(cl_float) * NUM_VALUES, NULL, CL_MEM_WRITE_ONLY); // DISPATCH THE KERNEL PROGRAM dispatch_sync(queue, ^{ //workgroup size, I think size_t wgs; //information on sizing of dimensions gcl_get_kernel_block_workgroup_info(square_kernel, CL_KERNEL_WORK_GROUP_SIZE, sizeof(wgs), &wgs, NULL); cl_ndrange range = { 1, {0, 0, 0}, {NUM_VALUES, 0, 0}, {wgs, 0, 0} }; // call the kernel square_kernel(&range, (cl_float*)mem_in, (cl_float*)mem_out); //copy the output into memory gcl_memcpy(test_out, mem_out, sizeof(cl_float) * NUM_VALUES); });