// Runs the kernel named `kernel_name` from "real_ar_ar_sca.cl" on two real
// arrays and copies a single real result back to the host.
//
//   kernel_name   name of the kernel to create from the built program
//   src_1, src_2  host input arrays, each handed to ocl_alloc_init_real_arr
//                 together with size_
//   dest          host destination; one element is copied into it
//   size_         element count; passed to the kernel as argument 0 and
//                 used as the global work size
//
// Every OpenCL status code obtained here is handed to ocl_error_check().
void ocl_real_arr_arr_sca(const char* kernel_name, modelica_real* src_1, modelica_real* src_2, modelica_real* dest, int size_){

    cl_program OpenCLProgram;
    cl_kernel OpenCLfunction;
    clock_t c0, c1;
    cl_int err;
    // Increments passed to the kernel as arguments 2 and 4; both fixed at 1.
    cl_int inc1, inc2;
    inc1=1;
    inc2=1;

    // NOTE(review): OpenCL versions before 2.0 reject a global size that is
    // not a multiple of the local size. size_ is used as-is, so confirm that
    // callers only pass multiples of 32 (or that the kernel tolerates padding
    // before rounding up here).
    size_t WorkSize[1] = {static_cast<size_t>(size_)};
    size_t localWorkSize[1] = {32};    // one-dimensional range

    // Lazily bring up the OpenCL runtime on first use.
    if (!device_comm_queue)
        ocl_initialize();


    // This can be moved out. Left here hoping that similar ops will be called
    // sequentially. If we kept them in one .cl file we don't have to build again.
    OpenCLProgram = ocl_build_p_from_src("real_ar_ar_sca.cl", true);


    OpenCLfunction = clCreateKernel(OpenCLProgram, kernel_name, &err);
    ocl_error_check(OCL_CREATE_KERNEL, err);

    cl_mem device_array_1 = ocl_alloc_init_real_arr(src_1, size_);
    cl_mem device_array_2 = ocl_alloc_init_real_arr(src_2, size_);
    cl_mem result = ocl_alloc_init_real_arr(NULL, 1);

    // Status codes are OR-ed together so that a single check covers all
    // seven argument bindings.
    err = clSetKernelArg(OpenCLfunction, 0, sizeof(cl_int),(void*)&size_);
    err |= clSetKernelArg(OpenCLfunction, 1, sizeof(cl_mem), (void*)&device_array_1);
    err |= clSetKernelArg(OpenCLfunction, 2, sizeof(cl_int), (void*)&inc1);
    err |= clSetKernelArg(OpenCLfunction, 3, sizeof(cl_mem), (void*)&device_array_2);
    err |= clSetKernelArg(OpenCLfunction, 4, sizeof(cl_int), (void*)&inc2);
    err |= clSetKernelArg(OpenCLfunction, 5, sizeof(cl_mem), (void*)&result);
    // NULL value with a non-zero size: one modelica_real of scratch space per
    // work-item in a work-group (a local-memory argument per the OpenCL API).
    err |= clSetKernelArg(OpenCLfunction, 6, sizeof(modelica_real)*localWorkSize[0], NULL);
    ocl_error_check(OCL_SET_KER_ARGS, err);

    c0 = clock();
    err = clEnqueueNDRangeKernel(device_comm_queue, OpenCLfunction, 1, NULL,
        WorkSize, localWorkSize, 0, NULL, NULL);
    ocl_error_check(OCL_ENQUE_ND_RANGE_KERNEL, err);
    // Block until the kernel has finished so the interval below covers it.
    clFinish(device_comm_queue);

    c1 = clock();
    // clock() counts ticks of processor time; CLOCKS_PER_SEC converts them
    // to the seconds that the message promises.
    printf ("\telapsed CPU CLOCKS:        %f sec\n", (double)(c1 - c0) / CLOCKS_PER_SEC);

    ocl_copy_back_to_host_real(result, dest, 1);


    clReleaseMemObject(device_array_1);
    clReleaseMemObject(device_array_2);
    clReleaseMemObject(result);

    clReleaseKernel(OpenCLfunction);
    clReleaseProgram(OpenCLProgram);
}
// Runs the kernel named `kernel_name` from "matrix.cl" on two real matrices
// and copies the M*N-element result back to the host.
//
//   kernel_name  name of the kernel to create from the built program
//   src_1        host input handed to ocl_alloc_init_real_arr with M*K
//   M            first dimension of the global work size
//   src_2        host input handed to ocl_alloc_init_real_arr with K*N
//   N            second dimension of the global work size; kernel argument 4
//   dest         host destination; M*N elements are copied into it
//   K            shared dimension of the two inputs; kernel argument 3
//
// Every OpenCL status code obtained here is handed to ocl_error_check().
void ocl_real_matrix_matrix_matrix(const char* kernel_name, modelica_real* src_1, int M, modelica_real* src_2, int N, modelica_real* dest, int K){

    cl_program OpenCLProgram;
    cl_kernel OpenCLfunction;
    clock_t c0, c1;
    cl_int err;


    // NOTE(review): OpenCL versions before 2.0 reject global sizes that are
    // not multiples of the local sizes. M and N are used as-is, so confirm
    // that callers only pass multiples of 16.
    size_t WorkSize[2] = {static_cast<size_t>(M), static_cast<size_t>(N)};
    size_t localWorkSize[2] = {16,16};

    // Lazily bring up the OpenCL runtime on first use.
    if (!device_comm_queue){
        printf("------------------------------Initizlizing---------------------\n");
        ocl_initialize();
    }
    // This can be moved out. Left here hoping that similar ops will be called
    // sequentially. If we kept them in one .cl file we don't have to build again.
    OpenCLProgram = ocl_build_p_from_src("matrix.cl", true);


    OpenCLfunction = clCreateKernel(OpenCLProgram, kernel_name, &err);
    ocl_error_check(OCL_CREATE_KERNEL, err);

    cl_mem device_array_1 = ocl_alloc_init_real_arr(src_1, M*K);
    cl_mem device_array_2 = ocl_alloc_init_real_arr(src_2, K*N);
    cl_mem result = ocl_alloc_init_real_arr(NULL, M*N);

    // Status codes are OR-ed together so that a single check covers all
    // five argument bindings. M itself is not passed to the kernel.
    err = clSetKernelArg(OpenCLfunction, 0, sizeof(cl_mem), (void*)&result);
    err |= clSetKernelArg(OpenCLfunction, 1, sizeof(cl_mem), (void*)&device_array_1);
    err |= clSetKernelArg(OpenCLfunction, 2, sizeof(cl_mem), (void*)&device_array_2);
    err |= clSetKernelArg(OpenCLfunction, 3, sizeof(cl_int), (void*)&K);
    err |= clSetKernelArg(OpenCLfunction, 4, sizeof(cl_int), (void*)&N);
    ocl_error_check(OCL_SET_KER_ARGS, err);

    c0 = clock();
    err = clEnqueueNDRangeKernel(device_comm_queue, OpenCLfunction, 2, NULL,
        WorkSize, localWorkSize, 0, NULL, NULL);
    ocl_error_check(OCL_ENQUE_ND_RANGE_KERNEL, err);
    // Block until the kernel has finished so the interval below covers it.
    clFinish(device_comm_queue);

    c1 = clock();
    // clock() counts ticks of processor time; CLOCKS_PER_SEC converts them
    // to the seconds that the message promises.
    printf ("\telapsed CPU CLOCKS:        %f sec\n", (double)(c1 - c0) / CLOCKS_PER_SEC);

    ocl_copy_back_to_host_real(result, dest, M*N);


    clReleaseMemObject(device_array_1);
    clReleaseMemObject(device_array_2);
    clReleaseMemObject(result);

    clReleaseKernel(OpenCLfunction);
    clReleaseProgram(OpenCLProgram);
}
// Example #3
// 0
// Sets up the OpenCL runtime if no command queue exists yet, reports how
// long the call took, and sets CUDA_CACHE_DISABLE=1 in the environment.
//
// The timing line is printed on every call, including calls that find the
// queue already present and do no setup work.
void ocl_initialize(){
    timeval started, finished;
    gettimeofday(&started, NULL);

    if (!device_comm_queue){
        // Pick a device only when none has been selected so far.
        if (!ocl_device)
            ocl_get_device();

        ocl_create_context_and_comm_queue();
        ocl_build_p_from_src();
    }

    gettimeofday(&finished, NULL);

    // Convert both timeval fields to milliseconds before summing them.
    const double whole_sec_ms = (finished.tv_sec - started.tv_sec) * 1000.0;
    const double frac_usec_ms = (finished.tv_usec - started.tv_usec) / 1000.0;
    printf ("\tOpenCL initialization :        %lf ms\n", whole_sec_ms + frac_usec_ms);

    // Overwrites any existing value (third argument is 1).
    // NOTE(review): presumably meant to turn off the NVIDIA driver's compile
    // cache; it is set only after the build above has run. Confirm intent.
    setenv("CUDA_CACHE_DISABLE", "1", 1);
}