Example #1
0
void initCL()
{
    FILE *kernelFile;
    char *kernelSource;
    size_t kernelLength;

    cl_int errcode;
	
    ocd_options opts = ocd_get_options();
	platform_id = opts.platform_id;
	device_id = opts.device_id;

clDevice = GetDevice(platform_id, device_id);
	size_t max_worksize[3];
errcode = clGetDeviceInfo(clDevice, CL_DEVICE_MAX_WORK_ITEM_SIZES,sizeof(size_t)*3, &max_worksize, NULL);
 CHECKERR(errcode);
        while(num_threads_perdim*num_threads_perdim>max_worksize[0])
                num_threads_perdim = num_threads_perdim/2;

	num_threads = num_threads_perdim*num_threads_perdim;
	
    clContext = clCreateContext(NULL, 1, &clDevice, NULL, NULL, &errcode);
    CHECKERR(errcode);

    clCommands = clCreateCommandQueue(clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &errcode);
    CHECKERR(errcode);

    kernelFile = fopen("kmeans_opencl_kernel.cl", "r");
    fseek(kernelFile, 0, SEEK_END);
    kernelLength = (size_t) ftell(kernelFile);
    kernelSource = (char *) malloc(sizeof(char)*kernelLength);
    rewind(kernelFile);
    fread((void *) kernelSource, kernelLength, 1, kernelFile);
    fclose(kernelFile);

    clProgram = clCreateProgramWithSource(clContext, 1, (const char **) &kernelSource, &kernelLength, &errcode);
    CHECKERR(errcode);

    free(kernelSource);

    errcode = clBuildProgram(clProgram, 1, &clDevice, NULL, NULL, NULL);
    if (errcode == CL_BUILD_PROGRAM_FAILURE)
    {
        char *log;
        size_t logLength;
        errcode = clGetProgramBuildInfo(clProgram, clDevice, CL_PROGRAM_BUILD_LOG, 0, NULL, &logLength);
        log = (char *) malloc(sizeof(char)*logLength);
        errcode = clGetProgramBuildInfo(clProgram, clDevice, CL_PROGRAM_BUILD_LOG, logLength, (void *) log, NULL);
        fprintf(stderr, "Kernel build error! Log:\n%s", log);
        free(log);
        return;
    }
    CHECKERR(errcode);

    clKernel_invert_mapping = clCreateKernel(clProgram, "invert_mapping", &errcode);
    CHECKERR(errcode);
    clKernel_kmeansPoint = clCreateKernel(clProgram, "kmeansPoint", &errcode);
    CHECKERR(errcode);
}
Example #2
0
int ocd_check_requirements(ocd_requirements* reqs)
{
	cl_int dev_type;

	if(reqs == NULL)
		return 1;

	int pass = 1;

	#ifdef USE_AFPGA
	dev_type = CL_DEVICE_TYPE_ACCELERATOR;
	#elif defined(USEGPU)
	dev_type = CL_DEVICE_TYPE_GPU;
	#else
	dev_type = CL_DEVICE_TYPE_CPU;
	#endif

	ocd_options opts = ocd_get_options();
//	cl_device_id d_id = _ocd_get_device(opts.platform_id, opts.device_id);
	cl_device_id d_id = GetDevice(opts.platform_id, opts.device_id,dev_type);

	cl_ulong local_mem;
	clGetDeviceInfo(d_id, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(cl_ulong), &local_mem, NULL);
	if(local_mem < reqs->local_mem_size)
		pass = 0;
	reqs->local_mem_size = local_mem;
	
	cl_ulong global_mem;
	clGetDeviceInfo(d_id, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(cl_ulong), &global_mem, NULL);
	if(global_mem < reqs->global_mem_size)
		pass = 0;
	reqs->global_mem_size = global_mem;
	
	size_t workgroup_size;
	clGetDeviceInfo(d_id, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), &workgroup_size, NULL);
	if(workgroup_size < reqs->workgroup_size)
		pass = 0;
	reqs->workgroup_size = workgroup_size;

	return pass;
}
Example #3
0
int main(int argc, char** argv)
{
	cl_int err;
	int usegpu = USEGPU;
    int do_verify = 0;
    int opt, option_index=0;

    unsigned int correct;

    size_t global_size;
    size_t local_size;

    cl_device_id device_id;
    cl_context context;
    cl_command_queue commands;
    cl_program program;
    cl_kernel kernel;

    stopwatch sw;

    cl_mem csr_ap;
    cl_mem csr_aj;
    cl_mem csr_ax;
    cl_mem x_loc;
    cl_mem y_loc;

    FILE *kernelFile;
    char *kernelSource;
    size_t kernelLength;
    size_t lengthRead;


    ocd_init(&argc, &argv, NULL);
    ocd_options opts = ocd_get_options();
    platform_id = opts.platform_id;
    n_device = opts.device_id;

    while ((opt = getopt_long(argc, argv, "::vc::", 
                            long_options, &option_index)) != -1 ) {
      switch(opt){
        //case 'i':
          //input_file = optarg;
          //break;
        case 'v':
          fprintf(stderr, "verify\n");
          do_verify = 1;
          break;
        case 'c':
          fprintf(stderr, "using cpu\n");
          usegpu = 0;
	  break;
        default:
          fprintf(stderr, "Usage: %s [-v Warning: lots of output] [-c use CPU]\n",
                  argv[0]);
          exit(EXIT_FAILURE);
      }
  }

    /* Fill input set with random float values */
    int i;

    csr_matrix csr;
    csr = laplacian_5pt(512);
    int k = 0;
      for(k = 0; k < csr.num_nonzeros; k++){
         csr.Ax[k] = 1.0 - 2.0 * (rand() / (RAND_MAX + 1.0));
      }

    //The other arrays
    float * x_host = float_new_array(csr.num_cols);
    float * y_host = float_new_array(csr.num_rows);

    unsigned int ii;
    for(ii = 0; ii < csr.num_cols; ii++){
        x_host[ii] = rand() / (RAND_MAX + 1.0);
    }
    for(ii = 0; ii < csr.num_rows; ii++){
        y_host[ii] = rand() / (RAND_MAX + 2.0);
    }

    /* Retrieve an OpenCL platform */
    device_id = GetDevice(platform_id, n_device);

    /* Create a compute context */
    context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
    CHKERR(err, "Failed to create a compute context!");

    /* Create a command queue */
    commands = clCreateCommandQueue(context, device_id, CL_QUEUE_PROFILING_ENABLE, &err);
    CHKERR(err, "Failed to create a command queue!");

    /* Load kernel source */
    kernelFile = fopen("spmv_csr_kernel.cl", "r");
    fseek(kernelFile, 0, SEEK_END);
    kernelLength = (size_t) ftell(kernelFile);
    kernelSource = (char *) malloc(sizeof(char)*kernelLength);
    rewind(kernelFile);
    lengthRead = fread((void *) kernelSource, kernelLength, 1, kernelFile);
    fclose(kernelFile);

    /* Create the compute program from the source buffer */
    program = clCreateProgramWithSource(context, 1, (const char **) &kernelSource, &kernelLength, &err);
    CHKERR(err, "Failed to create a compute program!");

    /* Free kernel source */
    free(kernelSource);

    /* Build the program executable */
    err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
    if (err == CL_BUILD_PROGRAM_FAILURE)                                                                                                                                       
    {
        char *buildLog;
        size_t logLen;
        err = clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &logLen);
        buildLog = (char *) malloc(sizeof(char)*logLen);
        err = clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, logLen, (void *) buildLog, NULL);
        fprintf(stderr, "CL Error %d: Failed to build program! Log:\n%s", err, buildLog);
        free(buildLog);
        exit(1);
    }
    CHKERR(err, "Failed to build program!");

    /* Create the compute kernel in the program we wish to run */
    kernel = clCreateKernel(program, "csr", &err);
    CHKERR(err, "Failed to create a compute kernel!");

    /* Create the input and output arrays in device memory for our calculation */
    csr_ap = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(unsigned int)*csr.num_rows+4, NULL, &err);
    CHKERR(err, "Failed to allocate device memory!");
    csr_aj = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(unsigned int)*csr.num_nonzeros, NULL, &err);
    CHKERR(err, "Failed to allocate device memory!");
    csr_ax = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float)*csr.num_nonzeros, NULL, &err);
    CHKERR(err, "Failed to allocate device memory!");
    x_loc = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float)*csr.num_cols, NULL, &err);
    CHKERR(err, "Failed to allocate device memory!");
    y_loc = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float)*csr.num_rows, NULL, &err);
    CHKERR(err, "Failed to allocate device memory!");

    /* beginning of timing point */
    stopwatch_start(&sw); 
   
    /* Write our data set into the input array in device memory */
	err = clEnqueueWriteBuffer(commands, csr_ap, CL_TRUE, 0, sizeof(unsigned int)*csr.num_rows+4, csr.Ap, 0, NULL, &ocdTempEvent);
        clFinish(commands);
	START_TIMER(ocdTempEvent, OCD_TIMER_H2D, "CSR Data Copy", ocdTempTimer)
	END_TIMER(ocdTempTimer)
    CHKERR(err, "Failed to write to source array!");
    err = clEnqueueWriteBuffer(commands, csr_aj, CL_TRUE, 0, sizeof(unsigned int)*csr.num_nonzeros, csr.Aj, 0, NULL, &ocdTempEvent);
        clFinish(commands);
	START_TIMER(ocdTempEvent, OCD_TIMER_H2D, "CSR Data Copy", ocdTempTimer)
	END_TIMER(ocdTempTimer)
    CHKERR(err, "Failed to write to source array!");
    err = clEnqueueWriteBuffer(commands, csr_ax, CL_TRUE, 0, sizeof(float)*csr.num_nonzeros, csr.Ax, 0, NULL, &ocdTempEvent);
        clFinish(commands);
	START_TIMER(ocdTempEvent, OCD_TIMER_H2D, "CSR Data Copy", ocdTempTimer)
	END_TIMER(ocdTempTimer)
    CHKERR(err, "Failed to write to source array!");
    err = clEnqueueWriteBuffer(commands, x_loc, CL_TRUE, 0, sizeof(float)*csr.num_cols, x_host, 0, NULL, &ocdTempEvent);
        clFinish(commands);
	START_TIMER(ocdTempEvent, OCD_TIMER_H2D, "CSR Data Copy", ocdTempTimer)
	END_TIMER(ocdTempTimer)
    CHKERR(err, "Failed to write to source array!");
    err = clEnqueueWriteBuffer(commands, y_loc, CL_TRUE, 0, sizeof(float)*csr.num_rows, y_host, 0, NULL, &ocdTempEvent);
        clFinish(commands);
	START_TIMER(ocdTempEvent, OCD_TIMER_H2D, "CSR Data Copy", ocdTempTimer)
    CHKERR(err, "Failed to write to source array!");
	END_TIMER(ocdTempTimer)
    /* Set the arguments to our compute kernel */
    err = 0;
    err = clSetKernelArg(kernel, 0, sizeof(unsigned int), &csr.num_rows);
    err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &csr_ap);
    err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &csr_aj);
    err |= clSetKernelArg(kernel, 3, sizeof(cl_mem), &csr_ax);
    err |= clSetKernelArg(kernel, 4, sizeof(cl_mem), &x_loc);
    err |= clSetKernelArg(kernel, 5, sizeof(cl_mem), &y_loc);
    CHKERR(err, "Failed to set kernel arguments!");

    /* Get the maximum work group size for executing the kernel on the device */
    err = clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), (void *) &local_size, NULL);
    CHKERR(err, "Failed to retrieve kernel work group info!");

    /* Execute the kernel over the entire range of our 1d input data set */
    /* using the maximum number of work group items for this device */
    global_size = csr.num_rows;
    err = clEnqueueNDRangeKernel(commands, kernel, 1, NULL, &global_size, &local_size, 0, NULL, &ocdTempEvent);
        clFinish(commands);
	START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "CSR Kernel", ocdTempTimer)
    END_TIMER(ocdTempTimer)
    CHKERR(err, "Failed to execute kernel!");

    /* Wait for the command commands to get serviced before reading back results */
    float output[csr.num_rows];
    
    /* Read back the results from the device to verify the output */
	err = clEnqueueReadBuffer(commands, y_loc, CL_TRUE, 0, sizeof(float)*csr.num_rows, output, 0, NULL, &ocdTempEvent);
        clFinish(commands);
    	START_TIMER(ocdTempEvent, OCD_TIMER_D2H, "CSR Data Copy", ocdTempTimer)
	END_TIMER(ocdTempTimer)
	CHKERR(err, "Failed to read output array!");

    /* end of timing point */
    stopwatch_stop(&sw);
    printf("Time consumed(ms): %lf Gflops: %f \n", 1000*get_interval_by_sec(&sw), (2.0 * (double) csr.num_nonzeros / get_interval_by_sec(&sw)) / 1e9);

   /* Validate our results */
   if(do_verify){
       for (i = 0; i < csr.num_rows; i++){
           printf("row: %d	output: %f \n", i, output[i]);  
       }
   }

   int row = 0;
   float sum = 0;
   int row_start = 0;
   int row_end = 0;
   for(row =0; row < csr.num_rows; row++){     
        sum = y_host[row];
        
        row_start = csr.Ap[row];
        row_end   = csr.Ap[row+1];
        
        unsigned int jj = 0;
        for (jj = row_start; jj < row_end; jj++){             
            sum += csr.Ax[jj] * x_host[csr.Aj[jj]];      
        }
        y_host[row] = sum;
    }
    for (i = 0; i < csr.num_rows; i++){
        if((fabsf(y_host[i]) - fabsf(output[i])) > .001)
             printf("Possible error, difference greater then .001 at row %d \n", i);
    }

    /* Print a brief summary detailing the results */
    ocd_finalize();

    /* Shutdown and cleanup */
    clReleaseMemObject(csr_ap);
    clReleaseMemObject(csr_aj);
    clReleaseMemObject(csr_ax);
    clReleaseMemObject(x_loc);
    clReleaseMemObject(y_loc);
    clReleaseProgram(program);
    clReleaseKernel(kernel);
    clReleaseCommandQueue(commands);
    clReleaseContext(context);
    return 0;
}
Example #4
0
int
main ( int argc, char *argv[] )
{
    int matrix_dim = 32; /* default matrix_dim */
    int opt, option_index=0;
    func_ret_t ret;
    const char *input_file = NULL;
    float *m, *mm;
    stopwatch sw;

    cl_device_id clDevice;
    cl_context clContext;
    cl_command_queue clCommands;
    cl_program clProgram;
    cl_kernel clKernel_diagonal;
    cl_kernel clKernel_perimeter;
    cl_kernel clKernel_internal;
    cl_int dev_type;

    cl_int errcode;

    FILE *kernelFile;
    char *kernelSource;
    size_t kernelLength;

    cl_mem d_m;

    ocd_init(&argc, &argv, NULL);
    ocd_options opts = ocd_get_options();
    platform_id = opts.platform_id;
    device_id = opts.device_id;


    while ((opt = getopt_long(argc, argv, "::vs:i:",
                              long_options, &option_index)) != -1 ) {
        switch(opt) {
        case 'i':
            input_file = optarg;
            break;
        case 'v':
            do_verify = 1;
            break;
        case 's':
            matrix_dim = atoi(optarg);
            fprintf(stderr, "Currently not supported, use -i instead\n");
            fprintf(stderr, "Usage: %s [-v] [-s matrix_size|-i input_file|-p platform|-d device]\n", argv[0]);
            exit(EXIT_FAILURE);
        case '?':
            fprintf(stderr, "invalid option\n");
            break;
        case ':':
            fprintf(stderr, "missing argument\n");
            break;
        default:
            fprintf(stderr, "Usage: %s [-v] [-s matrix_size|-i input_file||-p platform|-d device]\n",
                    argv[0]);
            exit(EXIT_FAILURE);
        }
    }

    if ( (optind < argc) || (optind == 1)) {
        fprintf(stderr, "Usage: %s [-v] [-s matrix_size|-i input_file|-p platform|-d device]\n", argv[0]);
        exit(EXIT_FAILURE);
    }

    if (input_file) {
        printf("Reading matrix from file %s\n", input_file);
        ret = create_matrix_from_file(&m, input_file, &matrix_dim);
        if (ret != RET_SUCCESS) {
            m = NULL;
            fprintf(stderr, "error create matrix from file %s\n", input_file);
            exit(EXIT_FAILURE);
        }
    } else {
        printf("No input file specified!\n");
        exit(EXIT_FAILURE);
    }

    if (do_verify) {
        printf("Before LUD\n");
        print_matrix(m, matrix_dim);
        matrix_duplicate(m, &mm, matrix_dim);
    }

//  errcode = clGetPlatformIDs(NUM_PLATFORM, clPlatform, NULL);
//  CHECKERR(errcode);
//
//  errcode = clGetDeviceIDs(clPlatform[PLATFORM_ID], USEGPU ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, 1, &clDevice, NULL);
//  CHECKERR(errcode);
#ifdef USEGPU
    dev_type = CL_DEVICE_TYPE_GPU;
#elif defined(USE_AFPGA)
    dev_type = CL_DEVICE_TYPE_ACCELERATOR;
#else
    dev_type = CL_DEVICE_TYPE_CPU;
#endif


    clDevice = GetDevice(platform_id, device_id,dev_type);
    size_t max_worksize[3];
    errcode = clGetDeviceInfo(clDevice, CL_DEVICE_MAX_WORK_ITEM_SIZES,sizeof(size_t)*3, &max_worksize, NULL);
    CHECKERR(errcode);
    while(BLOCK_SIZE*BLOCK_SIZE>max_worksize[0])
        BLOCK_SIZE = BLOCK_SIZE/2;

    clContext = clCreateContext(NULL, 1, &clDevice, NULL, NULL, &errcode);
    CHECKERR(errcode);

    clCommands = clCreateCommandQueue(clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &errcode);
    CHECKERR(errcode);

    kernelFile = fopen("lud_kernel.cl", "r");
    fseek(kernelFile, 0, SEEK_END);
    kernelLength = (size_t) ftell(kernelFile);
    kernelSource = (char *) malloc(sizeof(char)*kernelLength);
    rewind(kernelFile);
    fread((void *) kernelSource, kernelLength, 1, kernelFile);
    fclose(kernelFile);

    clProgram = clCreateProgramWithSource(clContext, 1, (const char **) &kernelSource, &kernelLength, &errcode);
    CHECKERR(errcode);

    free(kernelSource);
    char arg[100];
    sprintf(arg,"-D BLOCK_SIZE=%d", (int)BLOCK_SIZE);
    errcode = clBuildProgram(clProgram, 1, &clDevice, arg, NULL, NULL);
    if (errcode == CL_BUILD_PROGRAM_FAILURE)
    {
        char *log;
        size_t logLength;
        errcode = clGetProgramBuildInfo(clProgram, clDevice, CL_PROGRAM_BUILD_LOG, 0, NULL, &logLength);
        log = (char *) malloc(sizeof(char)*logLength);
        errcode = clGetProgramBuildInfo(clProgram, clDevice, CL_PROGRAM_BUILD_LOG, logLength, (void *) log, NULL);
        fprintf(stderr, "Kernel build error! Log:\n%s", log);
        free(log);
        return 0;
    }
    CHECKERR(errcode);

    clKernel_diagonal = clCreateKernel(clProgram, "lud_diagonal", &errcode);
    CHECKERR(errcode);
    clKernel_perimeter = clCreateKernel(clProgram, "lud_perimeter", &errcode);
    CHECKERR(errcode);
    clKernel_internal = clCreateKernel(clProgram, "lud_internal", &errcode);
    CHECKERR(errcode);

    d_m = clCreateBuffer(clContext, CL_MEM_READ_WRITE, matrix_dim*matrix_dim*sizeof(float), NULL, &errcode);
    CHECKERR(errcode);

    /* beginning of timing point */
    stopwatch_start(&sw);

    errcode = clEnqueueWriteBuffer(clCommands, d_m, CL_TRUE, 0, matrix_dim*matrix_dim*sizeof(float), (void *) m, 0, NULL, &ocdTempEvent);

    clFinish(clCommands);
    START_TIMER(ocdTempEvent, OCD_TIMER_H2D, "Matrix Copy", ocdTempTimer)
    END_TIMER(ocdTempTimer)
    CHECKERR(errcode);

    int i=0;
    size_t localWorkSize[2];
    size_t globalWorkSize[2];
    //printf("BLOCK_SIZE: %d\n",BLOCK_SIZE);
//	printf("max Work-item Size: %d\n",(int)max_worksize[0]);
#ifdef START_POWER
    for( int iter = 0; iter < 1000; iter++)
#endif
        for (i=0; i < matrix_dim-BLOCK_SIZE; i += BLOCK_SIZE) {
            errcode = clSetKernelArg(clKernel_diagonal, 0, sizeof(cl_mem), (void *) &d_m);
            errcode |= clSetKernelArg(clKernel_diagonal, 1, sizeof(int), (void *) &matrix_dim);
            errcode |= clSetKernelArg(clKernel_diagonal, 2, sizeof(int), (void *) &i);
            CHECKERR(errcode);

            localWorkSize[0] = BLOCK_SIZE;
            globalWorkSize[0] = BLOCK_SIZE;

            errcode = clEnqueueNDRangeKernel(clCommands, clKernel_diagonal, 1, NULL, globalWorkSize, localWorkSize, 0, NULL, &ocdTempEvent);
            clFinish(clCommands);
            START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "Diagonal Kernels", ocdTempTimer)
            END_TIMER(ocdTempTimer)
            CHECKERR(errcode);
            errcode = clSetKernelArg(clKernel_perimeter, 0, sizeof(cl_mem), (void *) &d_m);
            errcode |= clSetKernelArg(clKernel_perimeter, 1, sizeof(int), (void *) &matrix_dim);
            errcode |= clSetKernelArg(clKernel_perimeter, 2, sizeof(int), (void *) &i);
            CHECKERR(errcode);
            localWorkSize[0] = BLOCK_SIZE*2;
            globalWorkSize[0] = ((matrix_dim-i)/BLOCK_SIZE-1)*localWorkSize[0];

            errcode = clEnqueueNDRangeKernel(clCommands, clKernel_perimeter, 1, NULL, globalWorkSize, localWorkSize, 0, NULL, &ocdTempEvent);
            clFinish(clCommands);
            START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "Perimeter Kernel", ocdTempTimer)
            CHECKERR(errcode);
            END_TIMER(ocdTempTimer)
            errcode = clSetKernelArg(clKernel_internal, 0, sizeof(cl_mem), (void *) &d_m);
            errcode |= clSetKernelArg(clKernel_internal, 1, sizeof(int), (void *) &matrix_dim);
            errcode |= clSetKernelArg(clKernel_internal, 2, sizeof(int), (void *) &i);
            CHECKERR(errcode);
            localWorkSize[0] = BLOCK_SIZE;
            localWorkSize[1] = BLOCK_SIZE;
            globalWorkSize[0] = ((matrix_dim-i)/BLOCK_SIZE-1)*localWorkSize[0];
            globalWorkSize[1] = ((matrix_dim-i)/BLOCK_SIZE-1)*localWorkSize[1];

            errcode = clEnqueueNDRangeKernel(clCommands, clKernel_internal, 2, NULL, globalWorkSize, localWorkSize, 0, NULL, &ocdTempEvent);
            clFinish(clCommands);
            START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "Internal Kernel", ocdTempTimer)
            END_TIMER(ocdTempTimer)
            CHECKERR(errcode);
        }
    errcode = clSetKernelArg(clKernel_diagonal, 0, sizeof(cl_mem), (void *) &d_m);
    errcode |= clSetKernelArg(clKernel_diagonal, 1, sizeof(int), (void *) &matrix_dim);
    errcode |= clSetKernelArg(clKernel_diagonal, 2, sizeof(int), (void *) &i);
    CHECKERR(errcode);
    localWorkSize[0] = BLOCK_SIZE;
    globalWorkSize[0] = BLOCK_SIZE;

    errcode = clEnqueueNDRangeKernel(clCommands, clKernel_diagonal, 1, NULL, globalWorkSize, localWorkSize, 0, NULL, &ocdTempEvent);
    clFinish(clCommands);
    START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "Diagonal Kernels", ocdTempTimer)
    CHECKERR(errcode);

    END_TIMER(ocdTempTimer)

    errcode = clEnqueueReadBuffer(clCommands, d_m, CL_TRUE, 0, matrix_dim*matrix_dim*sizeof(float), (void *) m, 0, NULL, &ocdTempEvent);
    clFinish(clCommands);
    START_TIMER(ocdTempEvent, OCD_TIMER_D2H, "Matrix copy", ocdTempTimer)
    END_TIMER(ocdTempTimer)
    /* end of timing point */
    stopwatch_stop(&sw);
    printf("Time consumed(ms): %lf\n", 1000*get_interval_by_sec(&sw));

    clReleaseMemObject(d_m);

    if (do_verify) {
        printf("After LUD\n");
        print_matrix(m, matrix_dim);
        printf(">>>Verify<<<<\n");
        printf("matrix_dim: %d\n",matrix_dim);
        lud_verify(mm, m, matrix_dim);
        free(mm);
    }

    clReleaseKernel(clKernel_diagonal);
    clReleaseKernel(clKernel_perimeter);
    clReleaseKernel(clKernel_internal);
    clReleaseProgram(clProgram);
    clReleaseCommandQueue(clCommands);
    clReleaseContext(clContext);

    free(m);
    ocd_finalize();
    return EXIT_SUCCESS;
}				/* ----------  end of function main  ---------- */
////////////////////////////////////////////////////////////////////////////////
//! Run a simple test for CUDA
////////////////////////////////////////////////////////////////////////////////
void
runTest( int argc, char** argv)
{
	ocd_options opts = ocd_get_options();
	platform_id = opts.platform_id;
	n_device = opts.device_id;

	if ( argc != 8)
	{
		printf("Usage: GpuTemporalDataMining [<platform> <device> --] <file path> <temporal constraint path> <threads> <support> <(a)bsolute or (r)atio> <(s)tatic | (d)ynamic> <(m)ap and merge | (n)aive | (o)hybrid> \n");
		return;
	}

//    CUT_DEVICE_INIT();
    initGpu();

    getDeviceVariables(device_id);


	printf("Dataset, Support Threshold, PTPE or MapMerge, A1 or A1+A2, Level, Episodes (N), Episodes Culled (X), A1 Counting Time, A2 Counting Time, Generation Time, Total Counting Time\n");

    //CUT_SAFE_CALL( cutCreateTimer( &timer));
    //CUT_SAFE_CALL( cutCreateTimer( &generating_timer));
    //CUT_SAFE_CALL( cutCreateTimer( &a1_counting_timer));
    //CUT_SAFE_CALL( cutCreateTimer( &a2_counting_timer));
    //CUT_SAFE_CALL( cutCreateTimer( &total_timer));

    //CUT_SAFE_CALL( cutStartTimer( total_timer));
    //CUT_SAFE_CALL( cutStartTimer( timer));
    //CUT_SAFE_CALL( cutStartTimer( generating_timer));
    //CUT_SAFE_CALL( cutStartTimer( a1_counting_timer));
    //CUT_SAFE_CALL( cutStartTimer( a2_counting_timer));

    unsigned int num_threads = atoi(argv[3]);
    // allocate host memory
    //initEpisodeCandidates();
	if ( loadData( argv[1] ) != 0 )
		return;
	if ( loadTemporalConstraints(argv[2]) != 0 )
		return;

	// Check whether value supplied is absolute or ratio support
	supportType = *(argv[5]) == 'a' ? ABSOLUTE : RATIO;
	memoryModel = *(argv[6]) == 's' ? STATIC : DYNAMIC;

	switch (*(argv[7]))
	{
	case 'm':
		algorithmType = MAP_AND_MERGE;
		break;
	case 'n':
		algorithmType = NAIVE;
		break;
	case 'o':
		algorithmType = OPTIMAL;
		break;
	}

	support = atof(argv[4]);

	dumpFile = fopen( "episode.txt", "w" );

	//printf("Initializing GPU Data...\n");

	setupGpu();

	// setup execution parameters
    size_t grid[3];
    size_t threads[3];

	//printf("Event stream size: %i\n", eventSize);

	// BEGIN LOOP
	for ( int level = 1; level <= eventSize; level++ )
	{
		printf("Generating episode candidates for level %i...\n", level);
//		CUT_SAFE_CALL( cutResetTimer( total_timer));
//		CUT_SAFE_CALL( cutStartTimer( total_timer));

		//CUDA_SAFE_CALL( cudaUnbindTexture( candidateTex ) );
		if(level != 1){
			unbindTexture(&candidateTex, d_episodeCandidates, numCandidates * (level-1) * sizeof(UBYTE) );
		//CUDA_SAFE_CALL( cudaUnbindTexture( intervalTex ) );
		unbindTexture(&intervalTex, d_episodeIntervals, numCandidates * (level-2) * 2 * sizeof(float));
        }

//		CUT_SAFE_CALL( cutResetTimer( generating_timer));
//		CUT_SAFE_CALL( cutStartTimer( generating_timer));

//		int test1, test = numCandidates;
//		generateEpisodeCandidatesCPU( level );
//		test1 = numCandidates;
//		numCandidates = test;
        printf("Generating Episodes\n");
#ifdef CPU_EPISODE_GENERATION
		generateEpisodeCandidatesCPU( level );
#else
		generateEpisodeCandidatesGPU( level, num_threads );
#endif

//		CUT_SAFE_CALL( cutStopTimer( generating_timer));
		//printf( "\tGenerating time: %f (ms)\n", cutGetTimerValue( generating_timer));


		if ( numCandidates == 0 )
			break;
        printf("Writing to buffer\n");
		// Copy candidates to GPU
#ifdef CPU_EPISODE_GENERATION
		clEnqueueWriteBuffer(commands, d_episodeCandidates, CL_TRUE, 0, numCandidates * level * sizeof(UBYTE), h_episodeCandidates, 0, NULL, &ocdTempEvent);
                START_TIMER(ocdTempEvent, OCD_TIMER_H2D, "TDM Episode Copy", ocdTempTimer)
                END_TIMER(ocdTempTimer)
		clEnqueueWriteBuffer(commands, d_episodeIntervals, CL_TRUE, 0, numCandidates * (level-1) * 2 * sizeof(float), h_episodeIntervals, 0, NULL, &ocdTempEvent);
                clFinish(commands);
		START_TIMER(ocdTempEvent, OCD_TIMER_H2D, "TDM Episode Copy", ocdTempTimer)
                END_TIMER(ocdTempTimer)
#endif

        bindTexture( 0, &candidateTex, d_episodeCandidates, numCandidates * level * sizeof(UBYTE), CL_UNSIGNED_INT8);

		bindTexture( 0, &intervalTex, d_episodeIntervals, numCandidates * (level-1) * 2 * sizeof(float), CL_FLOAT );

		//printf("Executing kernel on %i candidates...\n", numCandidates, level);

		// execute the kernel
		calculateGrid(grid, num_threads, numCandidates);
		calculateBlock(threads, num_threads, numCandidates);

		int sections;
		unsigned int shared_mem_needed;
		//CUT_SAFE_CALL( cutStartTimer( counting_timer));

		int aType = algorithmType;
		if ( algorithmType == OPTIMAL )
			aType = chooseAlgorithmType( level, numCandidates, num_threads );

		if ( memoryModel == DYNAMIC )
		{
			if ( aType == NAIVE )
			{
				shared_mem_needed = MaxListSize*level*threads[0]*sizeof(float);
                printf("Shared memory needed %d\n", shared_mem_needed);
				//CUT_SAFE_CALL( cutResetTimer( a1_counting_timer));
				//CUT_SAFE_CALL( cutStartTimer( a1_counting_timer));
				countCandidates(grid, threads, d_episodeSupport, eventSize, level, supportType, numCandidates, candidateTex, intervalTex, eventTex, timeTex, shared_mem_needed );

			}
			else
			{
                printf("DYNAMIC MAP MERGE\n");
				calculateLevelParameters(level, threads, grid, sections);
				shared_mem_needed = 16000;
                printf("numCandidates=%d\n", numCandidates);
				//CUT_SAFE_CALL( cutResetTimer( a1_counting_timer));
				//CUT_SAFE_CALL( cutStartTimer( a1_counting_timer));
				countCandidatesMapMerge(grid, threads, d_episodeSupport, padEventSize, level, supportType, sections, padEventSize / sections, numCandidates,
                    candidateTex, intervalTex, eventTex, timeTex, shared_mem_needed );
				//countCandidatesMapMergeStatic<<< grid, threads, shared_mem_needed >>>( d_episodeSupport, padEventSize, level, supportType, sections, padEventSize / sections, numCandidates );
			}
		}
		else
		{
			if ( aType == NAIVE )
			{
				shared_mem_needed = level*threads[0]*sizeof(float);
			}
			else
			{
				calculateLevelParameters(level, threads, grid, sections);
				shared_mem_needed = 16000;
			}

				//CUT_SAFE_CALL( cutResetTimer( a2_counting_timer));
				//CUT_SAFE_CALL( cutStartTimer( a2_counting_timer));
			if ( aType == NAIVE )
                countCandidatesStatic(grid, threads, d_episodeSupport, eventSize, level, supportType, numCandidates, candidateTex, intervalTex, eventTex, timeTex, shared_mem_needed  );
			else
                countCandidatesMapMergeStatic(grid, threads, d_episodeSupport, padEventSize, level, supportType, sections, padEventSize / sections, numCandidates,
                    candidateTex, intervalTex, eventTex, timeTex, shared_mem_needed );
			clFinish(commands);

			//CUT_SAFE_CALL( cutStopTimer( a2_counting_timer));

            int err;
            err = clEnqueueReadBuffer(commands,d_episodeSupport, CL_TRUE, 0, numCandidates * sizeof(float), h_episodeSupport, 0, NULL, &ocdTempEvent);
            clFinish(commands);
            	START_TIMER(ocdTempEvent, OCD_TIMER_D2H, "TDM Episode Copy", ocdTempTimer)
            END_TIMER(ocdTempTimer)
		CHKERR(err, "Unable to read buffer from device.");

			unbindTexture(&candidateTex, d_episodeCandidates, numCandidates * level * sizeof(UBYTE) );
			unbindTexture(&intervalTex, d_episodeIntervals, numCandidates * (level-1) * 2 * sizeof(float));

			// Remove undersupported episodes
			cullCandidates( level );

			if ( numCandidates == 0 )
				break;
			unsigned int mmthreads = num_threads;
			if ( MaxListSize*level*num_threads*sizeof(float) > 16384 )
			{
				if ( MaxListSize*level*96*sizeof(float) < 16384 )
					mmthreads = 96;
				else if ( MaxListSize*level*64*sizeof(float) < 16384)
					mmthreads = 64;
				else if ( MaxListSize*level*32*sizeof(float) < 16384)
					mmthreads = 32;
				printf("More shared memory needed for %d threads. Changed to %d threads.\n", num_threads, mmthreads );
			}

#ifdef CPU_EPISODE_GENERATION
            err = clEnqueueWriteBuffer(commands, d_episodeCandidates, CL_TRUE, 0, numCandidates * level * sizeof(UBYTE), h_episodeCandidates, 0, NULL, &ocdTempEvent);
	START_TIMER(ocdTempEvent, OCD_TIMER_H2D, "TDM Episode Copy", ocdTempTimer)
            END_TIMER(ocdTempTimer)
            CHKERR(err, "Unable to write buffer 1.");
            if(numCandidates * (level - 1) * 2 * sizeof(float) != 0)
            err = clEnqueueWriteBuffer(commands, d_episodeIntervals, CL_TRUE, 0, numCandidates * (level-1) * 2 * sizeof(float), h_episodeIntervals, 0, NULL, &ocdTempEvent);
        clFinish(commands);
            START_TIMER(ocdTempEvent, OCD_TIMER_H2D, "TDM Episode Copy", ocdTempTimer)
        CHKERR(err, "Unable to write buffer 2.");
		END_TIMER(ocdTempTimer)
#endif
            bindTexture( 0, &candidateTex, d_episodeCandidates, numCandidates * level * sizeof(UBYTE), CL_UNSIGNED_INT8);
            bindTexture( 0, &intervalTex, d_episodeIntervals, numCandidates * (level-1) * 2 * sizeof(float), CL_FLOAT );

			if ( algorithmType == OPTIMAL )
				aType = chooseAlgorithmType( level, numCandidates, mmthreads );

			// Run (T1,T2] algorithm
			if ( aType == NAIVE )
			{
				shared_mem_needed = MaxListSize*level* mmthreads*sizeof(float);
				calculateGrid(grid, mmthreads, numCandidates );
				calculateBlock(threads, mmthreads, numCandidates );
			}
			else
			{
				calculateLevelParameters(level, threads, grid, sections);
				shared_mem_needed = 16000;
			}
			//CUT_SAFE_CALL( cutResetTimer( a1_counting_timer));
			//CUT_SAFE_CALL( cutStartTimer( a1_counting_timer));

			if ( aType == NAIVE )
                countCandidates(grid, threads, d_episodeSupport, eventSize, level, supportType, numCandidates,
                    candidateTex, intervalTex, eventTex, timeTex, shared_mem_needed );
			else
                countCandidatesMapMerge(grid, threads, d_episodeSupport, padEventSize, level, supportType, sections, padEventSize / sections, numCandidates,
                    candidateTex, intervalTex, eventTex, timeTex, shared_mem_needed );

		}
        printf("Finishing\n");
		clFinish(commands);
		//CUT_SAFE_CALL( cutStopTimer( a1_counting_timer));
		//printf( "\tCounting time: %f (ms)\n", cutGetTimerValue( counting_timer));

		// check if kernel execution generated an error
		//CUT_CHECK_ERROR("Kernel execution failed");

		//printf("Copying result back to host...\n\n");

        int err = clEnqueueReadBuffer(commands, d_episodeSupport, CL_TRUE, 0, numCandidates * sizeof(float), h_episodeSupport, 0, NULL, &ocdTempEvent);
        clFinish(commands);
		START_TIMER(ocdTempEvent, OCD_TIMER_D2H, "TDM Episode Copy", ocdTempTimer)
        END_TIMER(ocdTempTimer)
		CHKERR(err, "Unable to read memory 1.");
		err = clEnqueueReadBuffer(commands, d_episodeCandidates, CL_TRUE, 0, numCandidates * level * sizeof(UBYTE), h_episodeCandidates, 0, NULL, &ocdTempEvent);
                clFinish(commands);
                START_TIMER(ocdTempEvent, OCD_TIMER_D2H, "TDM Episode Copy", ocdTempTimer)
                END_TIMER(ocdTempTimer)
		CHKERR(err, "Unable to read memory 2.");
		//CUDA_SAFE_CALL( cudaMemcpy( h_mapRecords, d_mapRecords, 3 * numSections * maxLevel * maxCandidates * sizeof(float), cudaMemcpyDeviceToHost ));
		saveResult(level);
		fflush(dumpFile);
	// END LOOP

		//CUT_SAFE_CALL( cutStopTimer( total_timer));

		// Print Statistics for this run
		printf("%s, %f, %s, %s, %d, %d, %d\n",
			argv[1],											// Dataset
			support,											// Support Threshold
			algorithmType == NAIVE ? "PTPE" : algorithmType == MAP_AND_MERGE ? "MapMerge" : "Episode-Based",		// PTPE or MapMerge or Episode-Based
			memoryModel == STATIC ? "A1+A2" : "A1",				// A1 or A1+A2
			level,												// Level
			numCandidates+episodesCulled,						// Episodes counted
			episodesCulled  									// Episodes removed by A2
	//		cutGetTimerValue( a1_counting_timer),				// Time for A1
//			memoryModel == STATIC ? cutGetTimerValue( a2_counting_timer) : 0.0f,				// Time for A2
		//	cutGetTimerValue( generating_timer),				// Episode generation time
		//	cutGetTimerValue( total_timer) );					// Time for total loop
            );
	}
	printf("Done!\n");

    cleanup();

    //CUT_SAFE_CALL( cutStopTimer( timer));
    //printf( "Processing time: %f (ms)\n", cutGetTimerValue( timer));
    //CUT_SAFE_CALL( cutDeleteTimer( timer));
    //CUT_SAFE_CALL( cutDeleteTimer( generating_timer));
    //CUT_SAFE_CALL( cutDeleteTimer( a1_counting_timer));
    //CUT_SAFE_CALL( cutDeleteTimer( a2_counting_timer));
    //CUT_SAFE_CALL( cutDeleteTimer( total_timer));

}