void initCL() { FILE *kernelFile; char *kernelSource; size_t kernelLength; cl_int errcode; ocd_options opts = ocd_get_options(); platform_id = opts.platform_id; device_id = opts.device_id; clDevice = GetDevice(platform_id, device_id); size_t max_worksize[3]; errcode = clGetDeviceInfo(clDevice, CL_DEVICE_MAX_WORK_ITEM_SIZES,sizeof(size_t)*3, &max_worksize, NULL); CHECKERR(errcode); while(num_threads_perdim*num_threads_perdim>max_worksize[0]) num_threads_perdim = num_threads_perdim/2; num_threads = num_threads_perdim*num_threads_perdim; clContext = clCreateContext(NULL, 1, &clDevice, NULL, NULL, &errcode); CHECKERR(errcode); clCommands = clCreateCommandQueue(clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &errcode); CHECKERR(errcode); kernelFile = fopen("kmeans_opencl_kernel.cl", "r"); fseek(kernelFile, 0, SEEK_END); kernelLength = (size_t) ftell(kernelFile); kernelSource = (char *) malloc(sizeof(char)*kernelLength); rewind(kernelFile); fread((void *) kernelSource, kernelLength, 1, kernelFile); fclose(kernelFile); clProgram = clCreateProgramWithSource(clContext, 1, (const char **) &kernelSource, &kernelLength, &errcode); CHECKERR(errcode); free(kernelSource); errcode = clBuildProgram(clProgram, 1, &clDevice, NULL, NULL, NULL); if (errcode == CL_BUILD_PROGRAM_FAILURE) { char *log; size_t logLength; errcode = clGetProgramBuildInfo(clProgram, clDevice, CL_PROGRAM_BUILD_LOG, 0, NULL, &logLength); log = (char *) malloc(sizeof(char)*logLength); errcode = clGetProgramBuildInfo(clProgram, clDevice, CL_PROGRAM_BUILD_LOG, logLength, (void *) log, NULL); fprintf(stderr, "Kernel build error! Log:\n%s", log); free(log); return; } CHECKERR(errcode); clKernel_invert_mapping = clCreateKernel(clProgram, "invert_mapping", &errcode); CHECKERR(errcode); clKernel_kmeansPoint = clCreateKernel(clProgram, "kmeansPoint", &errcode); CHECKERR(errcode); }
/*
 * Check whether the configured OpenCL device meets the minimum resource
 * requirements in *reqs (local memory, global memory, work-group size).
 *
 * Returns 1 when every requirement is satisfied (or reqs is NULL),
 * 0 otherwise. Side effect: each field of *reqs is overwritten with the
 * device's actual capability, so callers can inspect what was available.
 */
int ocd_check_requirements(ocd_requirements* reqs)
{
    if (reqs == NULL)
        return 1;

    /* Device class is fixed at compile time by the build flags. */
    cl_int dev_type;
#ifdef USE_AFPGA
    dev_type = CL_DEVICE_TYPE_ACCELERATOR;
#elif defined(USEGPU)
    dev_type = CL_DEVICE_TYPE_GPU;
#else
    dev_type = CL_DEVICE_TYPE_CPU;
#endif

    ocd_options opts = ocd_get_options();
    // cl_device_id d_id = _ocd_get_device(opts.platform_id, opts.device_id);
    cl_device_id dev = GetDevice(opts.platform_id, opts.device_id, dev_type);

    int ok = 1;

    cl_ulong mem_local;
    clGetDeviceInfo(dev, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(cl_ulong), &mem_local, NULL);
    if (mem_local < reqs->local_mem_size)
        ok = 0;
    reqs->local_mem_size = mem_local;

    cl_ulong mem_global;
    clGetDeviceInfo(dev, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(cl_ulong), &mem_global, NULL);
    if (mem_global < reqs->global_mem_size)
        ok = 0;
    reqs->global_mem_size = mem_global;

    size_t wg_limit;
    clGetDeviceInfo(dev, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), &wg_limit, NULL);
    if (wg_limit < reqs->workgroup_size)
        ok = 0;
    reqs->workgroup_size = wg_limit;

    return ok;
}
int main(int argc, char** argv) { cl_int err; int usegpu = USEGPU; int do_verify = 0; int opt, option_index=0; unsigned int correct; size_t global_size; size_t local_size; cl_device_id device_id; cl_context context; cl_command_queue commands; cl_program program; cl_kernel kernel; stopwatch sw; cl_mem csr_ap; cl_mem csr_aj; cl_mem csr_ax; cl_mem x_loc; cl_mem y_loc; FILE *kernelFile; char *kernelSource; size_t kernelLength; size_t lengthRead; ocd_init(&argc, &argv, NULL); ocd_options opts = ocd_get_options(); platform_id = opts.platform_id; n_device = opts.device_id; while ((opt = getopt_long(argc, argv, "::vc::", long_options, &option_index)) != -1 ) { switch(opt){ //case 'i': //input_file = optarg; //break; case 'v': fprintf(stderr, "verify\n"); do_verify = 1; break; case 'c': fprintf(stderr, "using cpu\n"); usegpu = 0; break; default: fprintf(stderr, "Usage: %s [-v Warning: lots of output] [-c use CPU]\n", argv[0]); exit(EXIT_FAILURE); } } /* Fill input set with random float values */ int i; csr_matrix csr; csr = laplacian_5pt(512); int k = 0; for(k = 0; k < csr.num_nonzeros; k++){ csr.Ax[k] = 1.0 - 2.0 * (rand() / (RAND_MAX + 1.0)); } //The other arrays float * x_host = float_new_array(csr.num_cols); float * y_host = float_new_array(csr.num_rows); unsigned int ii; for(ii = 0; ii < csr.num_cols; ii++){ x_host[ii] = rand() / (RAND_MAX + 1.0); } for(ii = 0; ii < csr.num_rows; ii++){ y_host[ii] = rand() / (RAND_MAX + 2.0); } /* Retrieve an OpenCL platform */ device_id = GetDevice(platform_id, n_device); /* Create a compute context */ context = clCreateContext(0, 1, &device_id, NULL, NULL, &err); CHKERR(err, "Failed to create a compute context!"); /* Create a command queue */ commands = clCreateCommandQueue(context, device_id, CL_QUEUE_PROFILING_ENABLE, &err); CHKERR(err, "Failed to create a command queue!"); /* Load kernel source */ kernelFile = fopen("spmv_csr_kernel.cl", "r"); fseek(kernelFile, 0, SEEK_END); kernelLength = (size_t) ftell(kernelFile); kernelSource 
= (char *) malloc(sizeof(char)*kernelLength); rewind(kernelFile); lengthRead = fread((void *) kernelSource, kernelLength, 1, kernelFile); fclose(kernelFile); /* Create the compute program from the source buffer */ program = clCreateProgramWithSource(context, 1, (const char **) &kernelSource, &kernelLength, &err); CHKERR(err, "Failed to create a compute program!"); /* Free kernel source */ free(kernelSource); /* Build the program executable */ err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); if (err == CL_BUILD_PROGRAM_FAILURE) { char *buildLog; size_t logLen; err = clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &logLen); buildLog = (char *) malloc(sizeof(char)*logLen); err = clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, logLen, (void *) buildLog, NULL); fprintf(stderr, "CL Error %d: Failed to build program! Log:\n%s", err, buildLog); free(buildLog); exit(1); } CHKERR(err, "Failed to build program!"); /* Create the compute kernel in the program we wish to run */ kernel = clCreateKernel(program, "csr", &err); CHKERR(err, "Failed to create a compute kernel!"); /* Create the input and output arrays in device memory for our calculation */ csr_ap = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(unsigned int)*csr.num_rows+4, NULL, &err); CHKERR(err, "Failed to allocate device memory!"); csr_aj = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(unsigned int)*csr.num_nonzeros, NULL, &err); CHKERR(err, "Failed to allocate device memory!"); csr_ax = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float)*csr.num_nonzeros, NULL, &err); CHKERR(err, "Failed to allocate device memory!"); x_loc = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float)*csr.num_cols, NULL, &err); CHKERR(err, "Failed to allocate device memory!"); y_loc = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float)*csr.num_rows, NULL, &err); CHKERR(err, "Failed to allocate device memory!"); /* beginning of timing point */ stopwatch_start(&sw); /* Write 
our data set into the input array in device memory */ err = clEnqueueWriteBuffer(commands, csr_ap, CL_TRUE, 0, sizeof(unsigned int)*csr.num_rows+4, csr.Ap, 0, NULL, &ocdTempEvent); clFinish(commands); START_TIMER(ocdTempEvent, OCD_TIMER_H2D, "CSR Data Copy", ocdTempTimer) END_TIMER(ocdTempTimer) CHKERR(err, "Failed to write to source array!"); err = clEnqueueWriteBuffer(commands, csr_aj, CL_TRUE, 0, sizeof(unsigned int)*csr.num_nonzeros, csr.Aj, 0, NULL, &ocdTempEvent); clFinish(commands); START_TIMER(ocdTempEvent, OCD_TIMER_H2D, "CSR Data Copy", ocdTempTimer) END_TIMER(ocdTempTimer) CHKERR(err, "Failed to write to source array!"); err = clEnqueueWriteBuffer(commands, csr_ax, CL_TRUE, 0, sizeof(float)*csr.num_nonzeros, csr.Ax, 0, NULL, &ocdTempEvent); clFinish(commands); START_TIMER(ocdTempEvent, OCD_TIMER_H2D, "CSR Data Copy", ocdTempTimer) END_TIMER(ocdTempTimer) CHKERR(err, "Failed to write to source array!"); err = clEnqueueWriteBuffer(commands, x_loc, CL_TRUE, 0, sizeof(float)*csr.num_cols, x_host, 0, NULL, &ocdTempEvent); clFinish(commands); START_TIMER(ocdTempEvent, OCD_TIMER_H2D, "CSR Data Copy", ocdTempTimer) END_TIMER(ocdTempTimer) CHKERR(err, "Failed to write to source array!"); err = clEnqueueWriteBuffer(commands, y_loc, CL_TRUE, 0, sizeof(float)*csr.num_rows, y_host, 0, NULL, &ocdTempEvent); clFinish(commands); START_TIMER(ocdTempEvent, OCD_TIMER_H2D, "CSR Data Copy", ocdTempTimer) CHKERR(err, "Failed to write to source array!"); END_TIMER(ocdTempTimer) /* Set the arguments to our compute kernel */ err = 0; err = clSetKernelArg(kernel, 0, sizeof(unsigned int), &csr.num_rows); err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &csr_ap); err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &csr_aj); err |= clSetKernelArg(kernel, 3, sizeof(cl_mem), &csr_ax); err |= clSetKernelArg(kernel, 4, sizeof(cl_mem), &x_loc); err |= clSetKernelArg(kernel, 5, sizeof(cl_mem), &y_loc); CHKERR(err, "Failed to set kernel arguments!"); /* Get the maximum work group size for 
executing the kernel on the device */ err = clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), (void *) &local_size, NULL); CHKERR(err, "Failed to retrieve kernel work group info!"); /* Execute the kernel over the entire range of our 1d input data set */ /* using the maximum number of work group items for this device */ global_size = csr.num_rows; err = clEnqueueNDRangeKernel(commands, kernel, 1, NULL, &global_size, &local_size, 0, NULL, &ocdTempEvent); clFinish(commands); START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "CSR Kernel", ocdTempTimer) END_TIMER(ocdTempTimer) CHKERR(err, "Failed to execute kernel!"); /* Wait for the command commands to get serviced before reading back results */ float output[csr.num_rows]; /* Read back the results from the device to verify the output */ err = clEnqueueReadBuffer(commands, y_loc, CL_TRUE, 0, sizeof(float)*csr.num_rows, output, 0, NULL, &ocdTempEvent); clFinish(commands); START_TIMER(ocdTempEvent, OCD_TIMER_D2H, "CSR Data Copy", ocdTempTimer) END_TIMER(ocdTempTimer) CHKERR(err, "Failed to read output array!"); /* end of timing point */ stopwatch_stop(&sw); printf("Time consumed(ms): %lf Gflops: %f \n", 1000*get_interval_by_sec(&sw), (2.0 * (double) csr.num_nonzeros / get_interval_by_sec(&sw)) / 1e9); /* Validate our results */ if(do_verify){ for (i = 0; i < csr.num_rows; i++){ printf("row: %d output: %f \n", i, output[i]); } } int row = 0; float sum = 0; int row_start = 0; int row_end = 0; for(row =0; row < csr.num_rows; row++){ sum = y_host[row]; row_start = csr.Ap[row]; row_end = csr.Ap[row+1]; unsigned int jj = 0; for (jj = row_start; jj < row_end; jj++){ sum += csr.Ax[jj] * x_host[csr.Aj[jj]]; } y_host[row] = sum; } for (i = 0; i < csr.num_rows; i++){ if((fabsf(y_host[i]) - fabsf(output[i])) > .001) printf("Possible error, difference greater then .001 at row %d \n", i); } /* Print a brief summary detailing the results */ ocd_finalize(); /* Shutdown and cleanup */ 
clReleaseMemObject(csr_ap); clReleaseMemObject(csr_aj); clReleaseMemObject(csr_ax); clReleaseMemObject(x_loc); clReleaseMemObject(y_loc); clReleaseProgram(program); clReleaseKernel(kernel); clReleaseCommandQueue(commands); clReleaseContext(context); return 0; }
/*
 * LUD benchmark driver: blocked LU decomposition on an OpenCL device.
 *
 * Reads a square matrix from the file given with -i, factors it on the
 * device using three kernels (lud_diagonal, lud_perimeter, lud_internal)
 * compiled with a BLOCK_SIZE clamped to the device's work-item limit, and
 * optionally (-v) verifies the result against a duplicate of the input.
 *
 * NOTE(review): launch geometry of the three kernels is order-sensitive;
 * the code below is intentionally left byte-identical.
 */
int main ( int argc, char *argv[] )
{
    int matrix_dim = 32; /* default matrix_dim */
    int opt, option_index=0;
    func_ret_t ret;
    const char *input_file = NULL;
    float *m, *mm;          /* m: working matrix; mm: pristine copy for -v verification */
    stopwatch sw;
    cl_device_id clDevice;
    cl_context clContext;
    cl_command_queue clCommands;
    cl_program clProgram;
    cl_kernel clKernel_diagonal;
    cl_kernel clKernel_perimeter;
    cl_kernel clKernel_internal;
    cl_int dev_type;
    cl_int errcode;
    FILE *kernelFile;
    char *kernelSource;
    size_t kernelLength;
    cl_mem d_m;             /* device-side matrix buffer */

    ocd_init(&argc, &argv, NULL);
    ocd_options opts = ocd_get_options();
    platform_id = opts.platform_id;
    device_id = opts.device_id;

    /* Parse options: -i input file, -v verify; -s is advertised but unsupported. */
    while ((opt = getopt_long(argc, argv, "::vs:i:", long_options, &option_index)) != -1 ) {
        switch(opt) {
        case 'i':
            input_file = optarg;
            break;
        case 'v':
            do_verify = 1;
            break;
        case 's':
            matrix_dim = atoi(optarg);
            fprintf(stderr, "Currently not supported, use -i instead\n");
            fprintf(stderr, "Usage: %s [-v] [-s matrix_size|-i input_file|-p platform|-d device]\n", argv[0]);
            exit(EXIT_FAILURE);
        case '?':
            fprintf(stderr, "invalid option\n");
            break;
        case ':':
            fprintf(stderr, "missing argument\n");
            break;
        default:
            fprintf(stderr, "Usage: %s [-v] [-s matrix_size|-i input_file||-p platform|-d device]\n", argv[0]);
            exit(EXIT_FAILURE);
        }
    }

    /* Reject stray positional args and a bare invocation with no options. */
    if ( (optind < argc) || (optind == 1)) {
        fprintf(stderr, "Usage: %s [-v] [-s matrix_size|-i input_file|-p platform|-d device]\n", argv[0]);
        exit(EXIT_FAILURE);
    }

    if (input_file) {
        printf("Reading matrix from file %s\n", input_file);
        ret = create_matrix_from_file(&m, input_file, &matrix_dim);
        if (ret != RET_SUCCESS) {
            m = NULL;
            fprintf(stderr, "error create matrix from file %s\n", input_file);
            exit(EXIT_FAILURE);
        }
    }
    else {
        printf("No input file specified!\n");
        exit(EXIT_FAILURE);
    }

    if (do_verify) {
        printf("Before LUD\n");
        print_matrix(m, matrix_dim);
        /* Keep an untouched copy to verify the factorization against. */
        matrix_duplicate(m, &mm, matrix_dim);
    }

    // errcode = clGetPlatformIDs(NUM_PLATFORM, clPlatform, NULL);
    // CHECKERR(errcode);
    //
    // errcode = clGetDeviceIDs(clPlatform[PLATFORM_ID], USEGPU ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, 1, &clDevice, NULL);
    // CHECKERR(errcode);

#ifdef USEGPU
    dev_type = CL_DEVICE_TYPE_GPU;
#elif defined(USE_AFPGA)
    dev_type = CL_DEVICE_TYPE_ACCELERATOR;
#else
    dev_type = CL_DEVICE_TYPE_CPU;
#endif

    clDevice = GetDevice(platform_id, device_id,dev_type);

    /* Halve BLOCK_SIZE until a BLOCK_SIZE^2 work-group fits the device. */
    size_t max_worksize[3];
    errcode = clGetDeviceInfo(clDevice, CL_DEVICE_MAX_WORK_ITEM_SIZES,sizeof(size_t)*3, &max_worksize, NULL);
    CHECKERR(errcode);
    while(BLOCK_SIZE*BLOCK_SIZE>max_worksize[0])
        BLOCK_SIZE = BLOCK_SIZE/2;

    clContext = clCreateContext(NULL, 1, &clDevice, NULL, NULL, &errcode);
    CHECKERR(errcode);

    clCommands = clCreateCommandQueue(clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &errcode);
    CHECKERR(errcode);

    /* Load and build the kernel source.
     * NOTE(review): fopen/fread results are unchecked here — a missing
     * lud_kernel.cl would crash; confirm against project error policy. */
    kernelFile = fopen("lud_kernel.cl", "r");
    fseek(kernelFile, 0, SEEK_END);
    kernelLength = (size_t) ftell(kernelFile);
    kernelSource = (char *) malloc(sizeof(char)*kernelLength);
    rewind(kernelFile);
    fread((void *) kernelSource, kernelLength, 1, kernelFile);
    fclose(kernelFile);

    clProgram = clCreateProgramWithSource(clContext, 1, (const char **) &kernelSource, &kernelLength, &errcode);
    CHECKERR(errcode);
    free(kernelSource);

    /* Pass the (possibly clamped) BLOCK_SIZE to the kernels at build time. */
    char arg[100];
    sprintf(arg,"-D BLOCK_SIZE=%d", (int)BLOCK_SIZE);
    errcode = clBuildProgram(clProgram, 1, &clDevice, arg, NULL, NULL);
    if (errcode == CL_BUILD_PROGRAM_FAILURE) {
        char *log;
        size_t logLength;
        errcode = clGetProgramBuildInfo(clProgram, clDevice, CL_PROGRAM_BUILD_LOG, 0, NULL, &logLength);
        log = (char *) malloc(sizeof(char)*logLength);
        errcode = clGetProgramBuildInfo(clProgram, clDevice, CL_PROGRAM_BUILD_LOG, logLength, (void *) log, NULL);
        fprintf(stderr, "Kernel build error! Log:\n%s", log);
        free(log);
        return 0;
    }
    CHECKERR(errcode);

    clKernel_diagonal = clCreateKernel(clProgram, "lud_diagonal", &errcode);
    CHECKERR(errcode);
    clKernel_perimeter = clCreateKernel(clProgram, "lud_perimeter", &errcode);
    CHECKERR(errcode);
    clKernel_internal = clCreateKernel(clProgram, "lud_internal", &errcode);
    CHECKERR(errcode);

    d_m = clCreateBuffer(clContext, CL_MEM_READ_WRITE, matrix_dim*matrix_dim*sizeof(float), NULL, &errcode);
    CHECKERR(errcode);

    /* beginning of timing point */
    stopwatch_start(&sw);

    /* Copy the matrix host -> device. */
    errcode = clEnqueueWriteBuffer(clCommands, d_m, CL_TRUE, 0, matrix_dim*matrix_dim*sizeof(float), (void *) m, 0, NULL, &ocdTempEvent);
    clFinish(clCommands);
    START_TIMER(ocdTempEvent, OCD_TIMER_H2D, "Matrix Copy", ocdTempTimer)
    END_TIMER(ocdTempTimer)
    CHECKERR(errcode);

    int i=0;
    size_t localWorkSize[2];
    size_t globalWorkSize[2];
    //printf("BLOCK_SIZE: %d\n",BLOCK_SIZE);
    // printf("max Work-item Size: %d\n",(int)max_worksize[0]);
#ifdef START_POWER
    for( int iter = 0; iter < 1000; iter++)
#endif
    /* Blocked LU sweep: for each diagonal block, run diagonal, then
     * perimeter (row/column panels), then internal (trailing submatrix). */
    for (i=0; i < matrix_dim-BLOCK_SIZE; i += BLOCK_SIZE) {
        errcode = clSetKernelArg(clKernel_diagonal, 0, sizeof(cl_mem), (void *) &d_m);
        errcode |= clSetKernelArg(clKernel_diagonal, 1, sizeof(int), (void *) &matrix_dim);
        errcode |= clSetKernelArg(clKernel_diagonal, 2, sizeof(int), (void *) &i);
        CHECKERR(errcode);
        localWorkSize[0] = BLOCK_SIZE;
        globalWorkSize[0] = BLOCK_SIZE;
        errcode = clEnqueueNDRangeKernel(clCommands, clKernel_diagonal, 1, NULL, globalWorkSize, localWorkSize, 0, NULL, &ocdTempEvent);
        clFinish(clCommands);
        START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "Diagonal Kernels", ocdTempTimer)
        END_TIMER(ocdTempTimer)
        CHECKERR(errcode);
        errcode = clSetKernelArg(clKernel_perimeter, 0, sizeof(cl_mem), (void *) &d_m);
        errcode |= clSetKernelArg(clKernel_perimeter, 1, sizeof(int), (void *) &matrix_dim);
        errcode |= clSetKernelArg(clKernel_perimeter, 2, sizeof(int), (void *) &i);
        CHECKERR(errcode);
        /* Perimeter handles both a row and a column panel per group,
         * hence the doubled work-group size. */
        localWorkSize[0] = BLOCK_SIZE*2;
        globalWorkSize[0] = ((matrix_dim-i)/BLOCK_SIZE-1)*localWorkSize[0];
        errcode = clEnqueueNDRangeKernel(clCommands, clKernel_perimeter, 1, NULL, globalWorkSize, localWorkSize, 0, NULL, &ocdTempEvent);
        clFinish(clCommands);
        START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "Perimeter Kernel", ocdTempTimer)
        CHECKERR(errcode);
        END_TIMER(ocdTempTimer)
        errcode = clSetKernelArg(clKernel_internal, 0, sizeof(cl_mem), (void *) &d_m);
        errcode |= clSetKernelArg(clKernel_internal, 1, sizeof(int), (void *) &matrix_dim);
        errcode |= clSetKernelArg(clKernel_internal, 2, sizeof(int), (void *) &i);
        CHECKERR(errcode);
        /* Internal update covers the remaining (trailing) blocks in 2D. */
        localWorkSize[0] = BLOCK_SIZE;
        localWorkSize[1] = BLOCK_SIZE;
        globalWorkSize[0] = ((matrix_dim-i)/BLOCK_SIZE-1)*localWorkSize[0];
        globalWorkSize[1] = ((matrix_dim-i)/BLOCK_SIZE-1)*localWorkSize[1];
        errcode = clEnqueueNDRangeKernel(clCommands, clKernel_internal, 2, NULL, globalWorkSize, localWorkSize, 0, NULL, &ocdTempEvent);
        clFinish(clCommands);
        START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "Internal Kernel", ocdTempTimer)
        END_TIMER(ocdTempTimer)
        CHECKERR(errcode);
    }
    /* Final diagonal block (the loop stops one block early). */
    errcode = clSetKernelArg(clKernel_diagonal, 0, sizeof(cl_mem), (void *) &d_m);
    errcode |= clSetKernelArg(clKernel_diagonal, 1, sizeof(int), (void *) &matrix_dim);
    errcode |= clSetKernelArg(clKernel_diagonal, 2, sizeof(int), (void *) &i);
    CHECKERR(errcode);
    localWorkSize[0] = BLOCK_SIZE;
    globalWorkSize[0] = BLOCK_SIZE;
    errcode = clEnqueueNDRangeKernel(clCommands, clKernel_diagonal, 1, NULL, globalWorkSize, localWorkSize, 0, NULL, &ocdTempEvent);
    clFinish(clCommands);
    START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "Diagonal Kernels", ocdTempTimer)
    CHECKERR(errcode);
    END_TIMER(ocdTempTimer)

    /* Copy the factored matrix device -> host. */
    errcode = clEnqueueReadBuffer(clCommands, d_m, CL_TRUE, 0, matrix_dim*matrix_dim*sizeof(float), (void *) m, 0, NULL, &ocdTempEvent);
    clFinish(clCommands);
    START_TIMER(ocdTempEvent, OCD_TIMER_D2H, "Matrix copy", ocdTempTimer)
    END_TIMER(ocdTempTimer)

    /* end of timing point */
    stopwatch_stop(&sw);
    printf("Time consumed(ms): %lf\n", 1000*get_interval_by_sec(&sw));

    clReleaseMemObject(d_m);

    if (do_verify) {
        printf("After LUD\n");
        print_matrix(m, matrix_dim);
        printf(">>>Verify<<<<\n");
        printf("matrix_dim: %d\n",matrix_dim);
        lud_verify(mm, m, matrix_dim);
        free(mm);
    }

    clReleaseKernel(clKernel_diagonal);
    clReleaseKernel(clKernel_perimeter);
    clReleaseKernel(clKernel_internal);
    clReleaseProgram(clProgram);
    clReleaseCommandQueue(clCommands);
    clReleaseContext(clContext);
    free(m);
    ocd_finalize();
    return EXIT_SUCCESS;
} /* ---------- end of function main ---------- */
////////////////////////////////////////////////////////////////////////////////
//! Run a simple test for CUDA
////////////////////////////////////////////////////////////////////////////////
/*
 * runTest: temporal data-mining driver (OpenCL port of a CUDA original).
 *
 * Per level, generates episode candidates, counts their support on the
 * device (A1 / A1+A2 depending on memoryModel), culls under-supported
 * episodes, and dumps results, until no candidates remain or eventSize
 * levels have been processed.
 *
 * Reads/writes many file-scope globals (commands, d_* buffers, h_* host
 * arrays, numCandidates, supportType, memoryModel, algorithmType, support,
 * dumpFile, etc.). NOTE(review): intentionally left byte-identical —
 * the A1/A2 control flow is too order-sensitive to restyle safely.
 */
void runTest( int argc, char** argv)
{
    ocd_options opts = ocd_get_options();
    platform_id = opts.platform_id;
    n_device = opts.device_id;

    if ( argc != 8) {
        printf("Usage: GpuTemporalDataMining [<platform> <device> --] <file path> <temporal constraint path> <threads> <support> <(a)bsolute or (r)atio> <(s)tatic | (d)ynamic> <(m)ap and merge | (n)aive | (o)hybrid> \n");
        return;
    }

    // CUT_DEVICE_INIT();
    initGpu();
    getDeviceVariables(device_id);

    printf("Dataset, Support Threshold, PTPE or MapMerge, A1 or A1+A2, Level, Episodes (N), Episodes Culled (X), A1 Counting Time, A2 Counting Time, Generation Time, Total Counting Time\n");
    //CUT_SAFE_CALL( cutCreateTimer( &timer));
    //CUT_SAFE_CALL( cutCreateTimer( &generating_timer));
    //CUT_SAFE_CALL( cutCreateTimer( &a1_counting_timer));
    //CUT_SAFE_CALL( cutCreateTimer( &a2_counting_timer));
    //CUT_SAFE_CALL( cutCreateTimer( &total_timer));
    //CUT_SAFE_CALL( cutStartTimer( total_timer));
    //CUT_SAFE_CALL( cutStartTimer( timer));
    //CUT_SAFE_CALL( cutStartTimer( generating_timer));
    //CUT_SAFE_CALL( cutStartTimer( a1_counting_timer));
    //CUT_SAFE_CALL( cutStartTimer( a2_counting_timer));

    unsigned int num_threads = atoi(argv[3]);

    // allocate host memory
    //initEpisodeCandidates();
    if ( loadData( argv[1] ) != 0 )
        return;
    if ( loadTemporalConstraints(argv[2]) != 0 )
        return;

    // Check whether value supplied is absolute or ratio support
    supportType = *(argv[5]) == 'a' ? ABSOLUTE : RATIO;
    memoryModel = *(argv[6]) == 's' ? STATIC : DYNAMIC;

    switch (*(argv[7])) {
    case 'm':
        algorithmType = MAP_AND_MERGE;
        break;
    case 'n':
        algorithmType = NAIVE;
        break;
    case 'o':
        algorithmType = OPTIMAL;
        break;
    }

    support = atof(argv[4]);

    dumpFile = fopen( "episode.txt", "w" );

    //printf("Initializing GPU Data...\n");
    setupGpu();

    // setup execution parameters
    size_t grid[3];
    size_t threads[3];

    //printf("Event stream size: %i\n", eventSize);

    // BEGIN LOOP
    for ( int level = 1; level <= eventSize; level++ )
    {
        printf("Generating episode candidates for level %i...\n", level);
        // CUT_SAFE_CALL( cutResetTimer( total_timer));
        // CUT_SAFE_CALL( cutStartTimer( total_timer));

        /* Release the previous level's texture bindings before regenerating. */
        //CUDA_SAFE_CALL( cudaUnbindTexture( candidateTex ) );
        if(level != 1){
            unbindTexture(&candidateTex, d_episodeCandidates, numCandidates * (level-1) * sizeof(UBYTE) );
            //CUDA_SAFE_CALL( cudaUnbindTexture( intervalTex ) );
            unbindTexture(&intervalTex, d_episodeIntervals, numCandidates * (level-2) * 2 * sizeof(float));
        }

        // CUT_SAFE_CALL( cutResetTimer( generating_timer));
        // CUT_SAFE_CALL( cutStartTimer( generating_timer));

        // int test1, test = numCandidates;
        // generateEpisodeCandidatesCPU( level );
        // test1 = numCandidates;
        // numCandidates = test;
        printf("Generating Episodes\n");
#ifdef CPU_EPISODE_GENERATION
        generateEpisodeCandidatesCPU( level );
#else
        generateEpisodeCandidatesGPU( level, num_threads );
#endif

        // CUT_SAFE_CALL( cutStopTimer( generating_timer));
        //printf( "\tGenerating time: %f (ms)\n", cutGetTimerValue( generating_timer));

        if ( numCandidates == 0 )
            break;
        printf("Writing to buffer\n");
        // Copy candidates to GPU
#ifdef CPU_EPISODE_GENERATION
        clEnqueueWriteBuffer(commands, d_episodeCandidates, CL_TRUE, 0, numCandidates * level * sizeof(UBYTE), h_episodeCandidates, 0, NULL, &ocdTempEvent);
        START_TIMER(ocdTempEvent, OCD_TIMER_H2D, "TDM Episode Copy", ocdTempTimer)
        END_TIMER(ocdTempTimer)
        clEnqueueWriteBuffer(commands, d_episodeIntervals, CL_TRUE, 0, numCandidates * (level-1) * 2 * sizeof(float), h_episodeIntervals, 0, NULL, &ocdTempEvent);
        clFinish(commands);
        START_TIMER(ocdTempEvent, OCD_TIMER_H2D, "TDM Episode Copy", ocdTempTimer)
        END_TIMER(ocdTempTimer)
#endif

        bindTexture( 0, &candidateTex, d_episodeCandidates, numCandidates * level * sizeof(UBYTE), CL_UNSIGNED_INT8);
        bindTexture( 0, &intervalTex, d_episodeIntervals, numCandidates * (level-1) * 2 * sizeof(float), CL_FLOAT );

        //printf("Executing kernel on %i candidates...\n", numCandidates, level);

        // execute the kernel
        calculateGrid(grid, num_threads, numCandidates);
        calculateBlock(threads, num_threads, numCandidates);

        int sections;
        unsigned int shared_mem_needed;
        //CUT_SAFE_CALL( cutStartTimer( counting_timer));

        /* OPTIMAL lets a heuristic pick NAIVE vs MAP_AND_MERGE per level. */
        int aType = algorithmType;
        if ( algorithmType == OPTIMAL )
            aType = chooseAlgorithmType( level, numCandidates, num_threads );

        if ( memoryModel == DYNAMIC )
        {
            /* A1-only counting pass. */
            if ( aType == NAIVE )
            {
                shared_mem_needed = MaxListSize*level*threads[0]*sizeof(float);
                printf("Shared memory needed %d\n", shared_mem_needed);
                //CUT_SAFE_CALL( cutResetTimer( a1_counting_timer));
                //CUT_SAFE_CALL( cutStartTimer( a1_counting_timer));
                countCandidates(grid, threads, d_episodeSupport, eventSize, level, supportType, numCandidates, candidateTex, intervalTex, eventTex, timeTex, shared_mem_needed );
            }
            else
            {
                printf("DYNAMIC MAP MERGE\n");
                calculateLevelParameters(level, threads, grid, sections);
                shared_mem_needed = 16000;
                printf("numCandidates=%d\n", numCandidates);
                //CUT_SAFE_CALL( cutResetTimer( a1_counting_timer));
                //CUT_SAFE_CALL( cutStartTimer( a1_counting_timer));
                countCandidatesMapMerge(grid, threads, d_episodeSupport, padEventSize, level, supportType, sections, padEventSize / sections, numCandidates, candidateTex, intervalTex, eventTex, timeTex, shared_mem_needed );
                //countCandidatesMapMergeStatic<<< grid, threads, shared_mem_needed >>>( d_episodeSupport, padEventSize, level, supportType, sections, padEventSize / sections, numCandidates );
            }
        }
        else
        {
            /* STATIC model: A2 pre-pass, cull, then the A1 pass. */
            if ( aType == NAIVE )
            {
                shared_mem_needed = level*threads[0]*sizeof(float);
            }
            else
            {
                calculateLevelParameters(level, threads, grid, sections);
                shared_mem_needed = 16000;
            }

            //CUT_SAFE_CALL( cutResetTimer( a2_counting_timer));
            //CUT_SAFE_CALL( cutStartTimer( a2_counting_timer));
            if ( aType == NAIVE )
                countCandidatesStatic(grid, threads, d_episodeSupport, eventSize, level, supportType, numCandidates, candidateTex, intervalTex, eventTex, timeTex, shared_mem_needed );
            else
                countCandidatesMapMergeStatic(grid, threads, d_episodeSupport, padEventSize, level, supportType, sections, padEventSize / sections, numCandidates, candidateTex, intervalTex, eventTex, timeTex, shared_mem_needed );
            clFinish(commands);
            //CUT_SAFE_CALL( cutStopTimer( a2_counting_timer));

            int err;
            err = clEnqueueReadBuffer(commands,d_episodeSupport, CL_TRUE, 0, numCandidates * sizeof(float), h_episodeSupport, 0, NULL, &ocdTempEvent);
            clFinish(commands);
            START_TIMER(ocdTempEvent, OCD_TIMER_D2H, "TDM Episode Copy", ocdTempTimer)
            END_TIMER(ocdTempTimer)
            CHKERR(err, "Unable to read buffer from device.");

            unbindTexture(&candidateTex, d_episodeCandidates, numCandidates * level * sizeof(UBYTE) );
            unbindTexture(&intervalTex, d_episodeIntervals, numCandidates * (level-1) * 2 * sizeof(float));

            // Remove undersupported episodes
            cullCandidates( level );

            if ( numCandidates == 0 )
                break;

            /* Reduce the thread count if shared memory would overflow 16 KB. */
            unsigned int mmthreads = num_threads;
            if ( MaxListSize*level*num_threads*sizeof(float) > 16384 )
            {
                if ( MaxListSize*level*96*sizeof(float) < 16384 )
                    mmthreads = 96;
                else if ( MaxListSize*level*64*sizeof(float) < 16384)
                    mmthreads = 64;
                else if ( MaxListSize*level*32*sizeof(float) < 16384)
                    mmthreads = 32;
                printf("More shared memory needed for %d threads. Changed to %d threads.\n", num_threads, mmthreads );
            }

#ifdef CPU_EPISODE_GENERATION
            err = clEnqueueWriteBuffer(commands, d_episodeCandidates, CL_TRUE, 0, numCandidates * level * sizeof(UBYTE), h_episodeCandidates, 0, NULL, &ocdTempEvent);
            START_TIMER(ocdTempEvent, OCD_TIMER_H2D, "TDM Episode Copy", ocdTempTimer)
            END_TIMER(ocdTempTimer)
            CHKERR(err, "Unable to write buffer 1.");
            if(numCandidates * (level - 1) * 2 * sizeof(float) != 0)
                err = clEnqueueWriteBuffer(commands, d_episodeIntervals, CL_TRUE, 0, numCandidates * (level-1) * 2 * sizeof(float), h_episodeIntervals, 0, NULL, &ocdTempEvent);
            clFinish(commands);
            START_TIMER(ocdTempEvent, OCD_TIMER_H2D, "TDM Episode Copy", ocdTempTimer)
            CHKERR(err, "Unable to write buffer 2.");
            END_TIMER(ocdTempTimer)
#endif
            bindTexture( 0, &candidateTex, d_episodeCandidates, numCandidates * level * sizeof(UBYTE), CL_UNSIGNED_INT8);
            bindTexture( 0, &intervalTex, d_episodeIntervals, numCandidates * (level-1) * 2 * sizeof(float), CL_FLOAT );

            if ( algorithmType == OPTIMAL )
                aType = chooseAlgorithmType( level, numCandidates, mmthreads );

            // Run (T1,T2] algorithm
            if ( aType == NAIVE )
            {
                shared_mem_needed = MaxListSize*level* mmthreads*sizeof(float);
                calculateGrid(grid, mmthreads, numCandidates );
                calculateBlock(threads, mmthreads, numCandidates );
            }
            else
            {
                calculateLevelParameters(level, threads, grid, sections);
                shared_mem_needed = 16000;
            }
            //CUT_SAFE_CALL( cutResetTimer( a1_counting_timer));
            //CUT_SAFE_CALL( cutStartTimer( a1_counting_timer));
            if ( aType == NAIVE )
                countCandidates(grid, threads, d_episodeSupport, eventSize, level, supportType, numCandidates, candidateTex, intervalTex, eventTex, timeTex, shared_mem_needed );
            else
                countCandidatesMapMerge(grid, threads, d_episodeSupport, padEventSize, level, supportType, sections, padEventSize / sections, numCandidates, candidateTex, intervalTex, eventTex, timeTex, shared_mem_needed );
        }
        printf("Finishing\n");
        clFinish(commands);
        //CUT_SAFE_CALL( cutStopTimer( a1_counting_timer));
        //printf( "\tCounting time: %f (ms)\n", cutGetTimerValue( counting_timer));

        // check if kernel execution generated an error
        //CUT_CHECK_ERROR("Kernel execution failed");

        //printf("Copying result back to host...\n\n");
        /* Pull final supports and candidates back to the host for dumping. */
        int err = clEnqueueReadBuffer(commands, d_episodeSupport, CL_TRUE, 0, numCandidates * sizeof(float), h_episodeSupport, 0, NULL, &ocdTempEvent);
        clFinish(commands);
        START_TIMER(ocdTempEvent, OCD_TIMER_D2H, "TDM Episode Copy", ocdTempTimer)
        END_TIMER(ocdTempTimer)
        CHKERR(err, "Unable to read memory 1.");
        err = clEnqueueReadBuffer(commands, d_episodeCandidates, CL_TRUE, 0, numCandidates * level * sizeof(UBYTE), h_episodeCandidates, 0, NULL, &ocdTempEvent);
        clFinish(commands);
        START_TIMER(ocdTempEvent, OCD_TIMER_D2H, "TDM Episode Copy", ocdTempTimer)
        END_TIMER(ocdTempTimer)
        CHKERR(err, "Unable to read memory 2.");
        //CUDA_SAFE_CALL( cudaMemcpy( h_mapRecords, d_mapRecords, 3 * numSections * maxLevel * maxCandidates * sizeof(float), cudaMemcpyDeviceToHost ));

        saveResult(level);
        fflush(dumpFile);
        // END LOOP
        //CUT_SAFE_CALL( cutStopTimer( total_timer));

        // Print Statistics for this run
        printf("%s, %f, %s, %s, %d, %d, %d\n",
               argv[1],                     // Dataset
               support,                     // Support Threshold
               algorithmType == NAIVE ? "PTPE" : algorithmType == MAP_AND_MERGE ? "MapMerge" : "Episode-Based",    // PTPE or MapMerge or Episode-Based
               memoryModel == STATIC ? "A1+A2" : "A1",    // A1 or A1+A2
               level,                       // Level
               numCandidates+episodesCulled, // Episodes counted
               episodesCulled               // Episodes removed by A2
               // cutGetTimerValue( a1_counting_timer), // Time for A1
               // memoryModel == STATIC ? cutGetTimerValue( a2_counting_timer) : 0.0f, // Time for A2
               // cutGetTimerValue( generating_timer), // Episode generation time
               // cutGetTimerValue( total_timer) ); // Time for total loop
               );
    }
    printf("Done!\n");

    cleanup();

    //CUT_SAFE_CALL( cutStopTimer( timer));
    //printf( "Processing time: %f (ms)\n", cutGetTimerValue( timer));
    //CUT_SAFE_CALL( cutDeleteTimer( timer));
    //CUT_SAFE_CALL( cutDeleteTimer( generating_timer));
    //CUT_SAFE_CALL( cutDeleteTimer( a1_counting_timer));
    //CUT_SAFE_CALL( cutDeleteTimer( a2_counting_timer));
    //CUT_SAFE_CALL( cutDeleteTimer( total_timer));
}