int main(int argc, const char * argv[]) {
    static mach_timebase_info_data_t sTimebaseInfo;
    int seq = 0;
    int cpu = 0;
    // insert code here...
    int* data = (int*) malloc(sizeof(cl_int)*RANGE);
    for (int i = 0; i < RANGE; i ++) {
        data[i] = START + i;
        //printf("%d",data[i]);
    }
    
    int* out = (int*) malloc(sizeof(cl_int)*RANGE);
    printf("Hello, World!\n");
    if (argc > 1){
        if (strncmp(argv[1], "seq", 3) == 0) {
            seq = TRUE;
        } else if (strncmp(argv[1], "cpu", 3) == 0) {
            cpu = 1;
        }
    }
    uint64_t start = mach_absolute_time();
    if (seq)
        euler_totient(data, out);
    else {
        dispatch_queue_t queue;
        if (cpu)
            queue = gcl_create_dispatch_queue(CL_DEVICE_TYPE_CPU, NULL);
        else
            queue = gcl_create_dispatch_queue(CL_DEVICE_TYPE_GPU, NULL);
       
        void *mem_in = gcl_malloc(sizeof(cl_int)*RANGE, data, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR);
        void *mem_out = gcl_malloc(sizeof(cl_int)*RANGE, NULL, CL_MEM_WRITE_ONLY);
        dispatch_sync(queue, ^{
            size_t wgs;
            gcl_get_kernel_block_workgroup_info(euler_totient_kernel,
                                               CL_KERNEL_WORK_GROUP_SIZE,
                                               sizeof(wgs), &wgs, NULL);
            
            cl_ndrange range = {
                1,
                {0, 0, 0},
                {RANGE, 0, 0},
                {wgs, 0, 0}
            };
            
            euler_totient_kernel(&range,(cl_int*)mem_in, (cl_int*)mem_out);
            
            gcl_memcpy(out, mem_out, sizeof(cl_float) * RANGE);
            
        });
    }
Beispiel #2
0
int main (int argc, const char * argv[]) {
  int i, j;
  char name[128];
  float *a, *b, *c;
  void *gpu_a, *gpu_b, *gpu_c;
  int ARRAY_SIZE = 128;
  dispatch_queue_t queue = gcl_create_dispatch_queue(CL_DEVICE_TYPE_GPU, NULL);
  if (queue == NULL) {
      queue = gcl_create_dispatch_queue(CL_DEVICE_TYPE_CPU, NULL);
   fprintf(stdout, "Created a dispatch queue failed\n", name);
  }
  
  cl_device_id gpu = gcl_get_device_id_with_dispatch_queue(queue);
  clGetDeviceInfo(gpu, CL_DEVICE_NAME, 128, name, NULL);
  fprintf(stdout, "Created a dispatch queue using the %s\n", name);
  
  if(argc >= 2)
    ARRAY_SIZE = atoi(argv[1]);
  a = (float *)malloc(ARRAY_SIZE*ARRAY_SIZE*sizeof(cl_float));
  if(a == NULL)
   fprintf(stderr,"allocating array a failed\n");
  for(i = 0; i < ARRAY_SIZE*ARRAY_SIZE; i++)
      a[i] = (cl_float)rand();
  b = (float *)malloc(ARRAY_SIZE*ARRAY_SIZE*sizeof(cl_float));
  if(b == NULL)
   fprintf(stderr,"allocating array b failed\n");
  for(i = 0; i < ARRAY_SIZE*ARRAY_SIZE; i++)
      b[i] = (cl_float)rand();
  c = (float *)malloc(ARRAY_SIZE*ARRAY_SIZE*sizeof(cl_float));
  if(c == NULL)
   fprintf(stderr,"allocating array c failed\n");

  gpu_a  = gcl_malloc(sizeof(cl_float) * ARRAY_SIZE*ARRAY_SIZE, a,
                                 CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR);
  gpu_b  = gcl_malloc(sizeof(cl_float) * ARRAY_SIZE*ARRAY_SIZE, b,
                                 CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR);
  gpu_c  = gcl_malloc(sizeof(cl_float) * ARRAY_SIZE*ARRAY_SIZE, NULL,
                                 CL_MEM_WRITE_ONLY);
  dispatch_sync(queue, ^{
   cl_ndrange range = { 2, {0, 0, 0},{ARRAY_SIZE, ARRAY_SIZE, 0},{16, 16, 0}};
   matrix_mul_kernel(&range,(cl_float*)gpu_a,(cl_float*)gpu_b,(cl_float*)gpu_c,(cl_int)ARRAY_SIZE);
   gcl_memcpy(c, gpu_c, sizeof(cl_float) * ARRAY_SIZE*ARRAY_SIZE);
  });
Beispiel #3
0
int main(int argc, const char * argv[])
{
    int i;
    char deviceName[128];
    
    if (init() != true)
    {
        return 1;
    }
    
    SDL_Delay(2000);
    
    // try to get the dispatch queue for the GPU
    dispatch_queue_t queue = gcl_create_dispatch_queue(CL_DEVICE_TYPE_GPU, NULL);
    
    // in the event that the system does not have an OpenCL GPU, we can use the CPU instead
    if (queue == NULL)
    {
        queue = gcl_create_dispatch_queue(CL_DEVICE_TYPE_CPU, NULL);
    }
    
    // let's print some data on the device we're usng! かわいいです!
    cl_device_id gpu = gcl_get_device_id_with_dispatch_queue(queue);
    clGetDeviceInfo(gpu, CL_DEVICE_NAME, 128, deviceName, NULL);
    fprintf(stdout, "Created a dispatch queue using the %s\n", deviceName);
    
    // let's hardcode some handy test data that's easy to understand
    float* test_in = (float*)malloc(sizeof(cl_float) * NUM_VALUES);
    for (i = 0; i < NUM_VALUES; i++)
    {
        test_in[i] = (cl_float)i;
    }
    
    // Once the computation using the CL is done, we'll need space in RAM for the output
    float* test_out = (float*)malloc(sizeof(cl_float) * NUM_VALUES);
    
    // Now we're going to allocate the buffers again in the OpenCL device's memory space
    // CL_MEM_COPY_HOST_PTR will copy the values of test_in to mem_in
    void* mem_in = gcl_malloc(sizeof(cl_float) * NUM_VALUES, test_in, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR);
    void* mem_out = gcl_malloc(sizeof(cl_float) * NUM_VALUES, NULL, CL_MEM_WRITE_ONLY);
    
    // DISPATCH THE KERNEL PROGRAM
    dispatch_sync(queue, ^{
        
        //workgroup size, I think
        size_t wgs;
        
        //information on sizing of dimensions
        gcl_get_kernel_block_workgroup_info(square_kernel, CL_KERNEL_WORK_GROUP_SIZE, sizeof(wgs), &wgs, NULL);
        cl_ndrange range = {
            1,
            {0, 0, 0},
            {NUM_VALUES, 0, 0},
            {wgs, 0, 0}
        };
        
        // call the kernel
        square_kernel(&range, (cl_float*)mem_in, (cl_float*)mem_out);
        
        //copy the output into memory
        gcl_memcpy(test_out, mem_out, sizeof(cl_float) * NUM_VALUES);
    });