int main ( int argc, char *argv[] ) { int matrix_dim = 32; /* default matrix_dim */ int opt, option_index=0; func_ret_t ret; const char *input_file = NULL; float *m, *mm; stopwatch sw; cl_device_id clDevice; cl_context clContext; cl_command_queue clCommands; cl_program clProgram; cl_kernel clKernel_diagonal; cl_kernel clKernel_perimeter; cl_kernel clKernel_internal; cl_int dev_type; cl_int errcode; FILE *kernelFile; char *kernelSource; size_t kernelLength; cl_mem d_m; ocd_init(&argc, &argv, NULL); ocd_options opts = ocd_get_options(); platform_id = opts.platform_id; device_id = opts.device_id; while ((opt = getopt_long(argc, argv, "::vs:i:", long_options, &option_index)) != -1 ) { switch(opt) { case 'i': input_file = optarg; break; case 'v': do_verify = 1; break; case 's': matrix_dim = atoi(optarg); fprintf(stderr, "Currently not supported, use -i instead\n"); fprintf(stderr, "Usage: %s [-v] [-s matrix_size|-i input_file|-p platform|-d device]\n", argv[0]); exit(EXIT_FAILURE); case '?': fprintf(stderr, "invalid option\n"); break; case ':': fprintf(stderr, "missing argument\n"); break; default: fprintf(stderr, "Usage: %s [-v] [-s matrix_size|-i input_file||-p platform|-d device]\n", argv[0]); exit(EXIT_FAILURE); } } if ( (optind < argc) || (optind == 1)) { fprintf(stderr, "Usage: %s [-v] [-s matrix_size|-i input_file|-p platform|-d device]\n", argv[0]); exit(EXIT_FAILURE); } if (input_file) { printf("Reading matrix from file %s\n", input_file); ret = create_matrix_from_file(&m, input_file, &matrix_dim); if (ret != RET_SUCCESS) { m = NULL; fprintf(stderr, "error create matrix from file %s\n", input_file); exit(EXIT_FAILURE); } } else { printf("No input file specified!\n"); exit(EXIT_FAILURE); } if (do_verify) { printf("Before LUD\n"); print_matrix(m, matrix_dim); matrix_duplicate(m, &mm, matrix_dim); } // errcode = clGetPlatformIDs(NUM_PLATFORM, clPlatform, NULL); // CHECKERR(errcode); // // errcode = clGetDeviceIDs(clPlatform[PLATFORM_ID], USEGPU ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, 1, &clDevice, NULL); // CHECKERR(errcode); #ifdef USEGPU dev_type = CL_DEVICE_TYPE_GPU; #elif defined(USE_AFPGA) dev_type = CL_DEVICE_TYPE_ACCELERATOR; #else dev_type = CL_DEVICE_TYPE_CPU; #endif clDevice = GetDevice(platform_id, device_id,dev_type); size_t max_worksize[3]; errcode = clGetDeviceInfo(clDevice, CL_DEVICE_MAX_WORK_ITEM_SIZES,sizeof(size_t)*3, &max_worksize, NULL); CHECKERR(errcode); while(BLOCK_SIZE*BLOCK_SIZE>max_worksize[0]) BLOCK_SIZE = BLOCK_SIZE/2; clContext = clCreateContext(NULL, 1, &clDevice, NULL, NULL, &errcode); CHECKERR(errcode); clCommands = clCreateCommandQueue(clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &errcode); CHECKERR(errcode); kernelFile = fopen("lud_kernel.cl", "r"); fseek(kernelFile, 0, SEEK_END); kernelLength = (size_t) ftell(kernelFile); kernelSource = (char *) malloc(sizeof(char)*kernelLength); rewind(kernelFile); fread((void *) kernelSource, kernelLength, 1, kernelFile); fclose(kernelFile); clProgram = clCreateProgramWithSource(clContext, 1, (const char **) &kernelSource, &kernelLength, &errcode); CHECKERR(errcode); free(kernelSource); char arg[100]; sprintf(arg,"-D BLOCK_SIZE=%d", (int)BLOCK_SIZE); errcode = clBuildProgram(clProgram, 1, &clDevice, arg, NULL, NULL); if (errcode == CL_BUILD_PROGRAM_FAILURE) { char *log; size_t logLength; errcode = clGetProgramBuildInfo(clProgram, clDevice, CL_PROGRAM_BUILD_LOG, 0, NULL, &logLength); log = (char *) malloc(sizeof(char)*logLength); errcode = clGetProgramBuildInfo(clProgram, clDevice, CL_PROGRAM_BUILD_LOG, logLength, (void *) log, NULL); fprintf(stderr, "Kernel build error! Log:\n%s", log); free(log); return 0; } CHECKERR(errcode); clKernel_diagonal = clCreateKernel(clProgram, "lud_diagonal", &errcode); CHECKERR(errcode); clKernel_perimeter = clCreateKernel(clProgram, "lud_perimeter", &errcode); CHECKERR(errcode); clKernel_internal = clCreateKernel(clProgram, "lud_internal", &errcode); CHECKERR(errcode); d_m = clCreateBuffer(clContext, CL_MEM_READ_WRITE, matrix_dim*matrix_dim*sizeof(float), NULL, &errcode); CHECKERR(errcode); /* beginning of timing point */ stopwatch_start(&sw); errcode = clEnqueueWriteBuffer(clCommands, d_m, CL_TRUE, 0, matrix_dim*matrix_dim*sizeof(float), (void *) m, 0, NULL, &ocdTempEvent); clFinish(clCommands); START_TIMER(ocdTempEvent, OCD_TIMER_H2D, "Matrix Copy", ocdTempTimer) END_TIMER(ocdTempTimer) CHECKERR(errcode); int i=0; size_t localWorkSize[2]; size_t globalWorkSize[2]; //printf("BLOCK_SIZE: %d\n",BLOCK_SIZE); // printf("max Work-item Size: %d\n",(int)max_worksize[0]); #ifdef START_POWER for( int iter = 0; iter < 1000; iter++) #endif for (i=0; i < matrix_dim-BLOCK_SIZE; i += BLOCK_SIZE) { errcode = clSetKernelArg(clKernel_diagonal, 0, sizeof(cl_mem), (void *) &d_m); errcode |= clSetKernelArg(clKernel_diagonal, 1, sizeof(int), (void *) &matrix_dim); errcode |= clSetKernelArg(clKernel_diagonal, 2, sizeof(int), (void *) &i); CHECKERR(errcode); localWorkSize[0] = BLOCK_SIZE; globalWorkSize[0] = BLOCK_SIZE; errcode = clEnqueueNDRangeKernel(clCommands, clKernel_diagonal, 1, NULL, globalWorkSize, localWorkSize, 0, NULL, &ocdTempEvent); clFinish(clCommands); START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "Diagonal Kernels", ocdTempTimer) END_TIMER(ocdTempTimer) CHECKERR(errcode); errcode = clSetKernelArg(clKernel_perimeter, 0, sizeof(cl_mem), (void *) &d_m); errcode |= clSetKernelArg(clKernel_perimeter, 1, sizeof(int), (void *) &matrix_dim); errcode |= clSetKernelArg(clKernel_perimeter, 2, sizeof(int), (void *) &i); CHECKERR(errcode); localWorkSize[0] = BLOCK_SIZE*2; globalWorkSize[0] = ((matrix_dim-i)/BLOCK_SIZE-1)*localWorkSize[0]; errcode = clEnqueueNDRangeKernel(clCommands, clKernel_perimeter, 1, NULL, globalWorkSize, localWorkSize, 0, NULL, &ocdTempEvent); clFinish(clCommands); START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "Perimeter Kernel", ocdTempTimer) CHECKERR(errcode); END_TIMER(ocdTempTimer) errcode = clSetKernelArg(clKernel_internal, 0, sizeof(cl_mem), (void *) &d_m); errcode |= clSetKernelArg(clKernel_internal, 1, sizeof(int), (void *) &matrix_dim); errcode |= clSetKernelArg(clKernel_internal, 2, sizeof(int), (void *) &i); CHECKERR(errcode); localWorkSize[0] = BLOCK_SIZE; localWorkSize[1] = BLOCK_SIZE; globalWorkSize[0] = ((matrix_dim-i)/BLOCK_SIZE-1)*localWorkSize[0]; globalWorkSize[1] = ((matrix_dim-i)/BLOCK_SIZE-1)*localWorkSize[1]; errcode = clEnqueueNDRangeKernel(clCommands, clKernel_internal, 2, NULL, globalWorkSize, localWorkSize, 0, NULL, &ocdTempEvent); clFinish(clCommands); START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "Internal Kernel", ocdTempTimer) END_TIMER(ocdTempTimer) CHECKERR(errcode); } errcode = clSetKernelArg(clKernel_diagonal, 0, sizeof(cl_mem), (void *) &d_m); errcode |= clSetKernelArg(clKernel_diagonal, 1, sizeof(int), (void *) &matrix_dim); errcode |= clSetKernelArg(clKernel_diagonal, 2, sizeof(int), (void *) &i); CHECKERR(errcode); localWorkSize[0] = BLOCK_SIZE; globalWorkSize[0] = BLOCK_SIZE; errcode = clEnqueueNDRangeKernel(clCommands, clKernel_diagonal, 1, NULL, globalWorkSize, localWorkSize, 0, NULL, &ocdTempEvent); clFinish(clCommands); START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "Diagonal Kernels", ocdTempTimer) CHECKERR(errcode); END_TIMER(ocdTempTimer) errcode = clEnqueueReadBuffer(clCommands, d_m, CL_TRUE, 0, matrix_dim*matrix_dim*sizeof(float), (void *) m, 0, NULL, &ocdTempEvent); clFinish(clCommands); START_TIMER(ocdTempEvent, OCD_TIMER_D2H, "Matrix copy", ocdTempTimer) END_TIMER(ocdTempTimer) /* end of timing point */ stopwatch_stop(&sw); printf("Time consumed(ms): %lf\n", 1000*get_interval_by_sec(&sw)); clReleaseMemObject(d_m); if (do_verify) { printf("After LUD\n"); print_matrix(m, matrix_dim); printf(">>>Verify<<<<\n"); printf("matrix_dim: %d\n",matrix_dim); lud_verify(mm, m, matrix_dim); free(mm); } clReleaseKernel(clKernel_diagonal); clReleaseKernel(clKernel_perimeter); clReleaseKernel(clKernel_internal); clReleaseProgram(clProgram); clReleaseCommandQueue(clCommands); clReleaseContext(clContext); free(m); ocd_finalize(); return EXIT_SUCCESS; } /* ---------- end of function main ---------- */
int main ( int argc, char *argv[] ) { int matrix_dim = 32; /* default matrix_dim */ int opt, option_index=0, error=0; func_ret_t ret; const char *input_file = NULL; double *m, *mm; stopwatch sw; int i; while ((opt = getopt_long(argc, argv, ":vs:i:", long_options, &option_index)) != -1 ) { switch(opt){ case 'v': do_verify = 1; break; case 's': matrix_dim = atoi(optarg); break; case '?': fprintf(stderr, "invalid option\n"); error=1; break; case ':': fprintf(stderr, "missing argument\n"); error=1; break; default: error=1; } } if ((optind < argc) || (optind == 1) || error) { fprintf(stderr, "Usage: %s [-v] [-s matrix_size]\n", argv[0]); exit(EXIT_FAILURE); } if(matrix_dim>1) { fprintf(stderr, "Generating matrix of size %d x %d\n", matrix_dim, matrix_dim); ret = create_matrix_from_random(&m, matrix_dim); if(ret != RET_SUCCESS){ m = NULL; fprintf(stderr, "error could not generate random matrix of size %d x %d!\n", matrix_dim, matrix_dim); exit(EXIT_FAILURE); } } else { fprintf(stderr, "No input file or valid matrix size specified!\n"); exit(EXIT_FAILURE); } if (do_verify){ //printf("Before LUD\n"); //print_matrix(m, matrix_dim); matrix_duplicate(m, &mm, matrix_dim); } stopwatch_start(&sw); lud_base(m, matrix_dim); stopwatch_stop(&sw); if (matrix_dim == 1024) { for (i=0; i<100; ++i) { if (m[expected_row_indices[i]*matrix_dim + expected_col_indices[i]] != expected_values[i]) { fprintf(stderr, "ERROR: value at index (%d,%d) = '%.*f' is different from the expected value '%.*f'\n", expected_row_indices[i], expected_col_indices[i], // the 21 parameter prints enough significant decimal digits to obtain the same floating-point number // when read back 21, m[expected_row_indices[i]*matrix_dim + expected_col_indices[i]], 21, expected_values[i] ); fprintf(stderr, "Received values:\n"); for (i=0; i<100; ++i) { fprintf(stderr, "%.*f, ", 21, m[expected_row_indices[i]*matrix_dim + expected_col_indices[i]]); } fprintf(stderr, "\n"); exit(1); } } } else { fprintf(stderr, "WARNING: No self-checking step for dimension '%d'\n", matrix_dim); } if (do_verify){ //fprintf(stderr, "After LUD\n"); //print_matrix(m, matrix_dim); fprintf(stderr, ">>>Verify<<<<\n"); lud_verify(mm, m, matrix_dim); free(mm); } free(m); printf("{ \"status\": %d, \"options\": \"-s %d\", \"time\": %f }\n", 1, matrix_dim, get_interval_by_sec(&sw)); return EXIT_SUCCESS; } /* ---------- end of function main ---------- */
int main ( int argc, char *argv[] ) { //printf("Starting..\n"); int matrix_dim = 32; /* default size */ int opt, option_index=0; func_ret_t ret; const char *input_file = NULL; float *m, *mm; stopwatch sw; int grid_x=0; int grid_y=0; while ((opt = getopt_long(argc, argv, "::vs:i:x:y:", long_options, &option_index)) != -1 ) { switch(opt){ case 'i': input_file = optarg; break; case 'v': do_verify = 1; break; case 's': matrix_dim = atoi(optarg); //printf("Generate input matrix internally, size =%d\n", matrix_dim); // fprintf(stderr, "Currently not supported, use -i instead\n"); // fprintf(stderr, "Usage: %s [-v] [-s matrix_size|-i input_file]\n", argv[0]); // exit(EXIT_FAILURE); break; case 'x': grid_x = atoi(optarg); break; case 'y': grid_y = atoi(optarg); break; case '?': fprintf(stderr, "invalid option\n"); break; case ':': fprintf(stderr, "missing argument\n"); break; default: fprintf(stderr, "1Usage: %s [-v] [-s matrix_size|-i input_file]\n", argv[0]); exit(EXIT_FAILURE); } } /* if ( (optind < argc) || (optind == 1)) { fprintf(stderr, "2Usage: %s [-v] [-n no. of threads] [-s matrix_size|-i input_file]\n", argv[0]); exit(EXIT_FAILURE); } */ if (input_file) { //printf("Reading matrix from file %s\n", input_file); ret = create_matrix_from_file(&m, input_file, &matrix_dim); if (ret != RET_SUCCESS) { m = NULL; fprintf(stderr, "error create matrix from file %s\n", input_file); exit(EXIT_FAILURE); } } else if (matrix_dim) { //printf("Creating matrix internally size=%d\n", matrix_dim); ret = create_matrix(&m, matrix_dim); if (ret != RET_SUCCESS) { m = NULL; fprintf(stderr, "error create matrix internally size=%d\n", matrix_dim); exit(EXIT_FAILURE); } } else { printf("No input file specified!\n"); exit(EXIT_FAILURE); } if (do_verify){ /* print_matrix(m, matrix_dim); */ matrix_duplicate(m, &mm, matrix_dim); } wul(); //printf("Starting. . . \n"); //lud_oacc(m, matrix_dim,grid_x,grid_y); stopwatch_start(&sw); // lud_omp(m, matrix_dim); lud_oacc(m, matrix_dim,grid_x,grid_y); stopwatch_stop(&sw); printf("Time consumed(ms): %lf\n", 1000*get_interval_by_sec(&sw)); if (do_verify){ printf("After LUD\n"); /* print_matrix(m, matrix_dim); */ printf(">>>Verify<<<<\n"); lud_verify(mm, m, matrix_dim); free(mm); } free(m); return EXIT_SUCCESS; } /* ---------- end of function main ---------- */
int main ( int argc, char *argv[] ) { printf("WG size of kernel = %d X %d\n", BLOCK_SIZE, BLOCK_SIZE); int matrix_dim = 32; /* default matrix_dim */ int opt, option_index=0; func_ret_t ret; const char *input_file = NULL; float *m, *mm; stopwatch sw; while ((opt = getopt_long(argc, argv, "::vs:i:", long_options, &option_index)) != -1 ) { switch(opt){ case 'i': input_file = optarg; break; case 'v': do_verify = 1; break; case 's': matrix_dim = atoi(optarg); printf("Generate input matrix internally, size =%d\n", matrix_dim); // fprintf(stderr, "Currently not supported, use -i instead\n"); // fprintf(stderr, "Usage: %s [-v] [-s matrix_size|-i input_file]\n", argv[0]); // exit(EXIT_FAILURE); break; case '?': fprintf(stderr, "invalid option\n"); break; case ':': fprintf(stderr, "missing argument\n"); break; default: fprintf(stderr, "Usage: %s [-v] [-s matrix_size|-i input_file]\n", argv[0]); exit(EXIT_FAILURE); } } if ( (optind < argc) || (optind == 1)) { fprintf(stderr, "Usage: %s [-v] [-s matrix_size|-i input_file]\n", argv[0]); exit(EXIT_FAILURE); } if (input_file) { printf("Reading matrix from file %s\n", input_file); ret = create_matrix_from_file(&m, input_file, &matrix_dim); if (ret != RET_SUCCESS) { m = NULL; fprintf(stderr, "error create matrix from file %s\n", input_file); exit(EXIT_FAILURE); } } else if (matrix_dim) { printf("Creating matrix internally size=%d\n", matrix_dim); ret = create_matrix(&m, matrix_dim); if (ret != RET_SUCCESS) { m = NULL; fprintf(stderr, "error create matrix internally size=%d\n", matrix_dim); exit(EXIT_FAILURE); } } else { printf("No input file specified!\n"); exit(EXIT_FAILURE); } if (do_verify){ printf("Before LUD\n"); // print_matrix(m, matrix_dim); matrix_duplicate(m, &mm, matrix_dim); } int sourcesize = 1024*1024; char * source = (char *)calloc(sourcesize, sizeof(char)); if(!source) { printf("ERROR: calloc(%d) failed\n", sourcesize); return -1; } char * kernel_lud_diag = "lud_diagonal"; char * kernel_lud_peri = "lud_perimeter"; char * kernel_lud_inter = "lud_internal"; FILE * fp = fopen("./lud_kernel.cl", "rb"); if(!fp) { printf("ERROR: unable to open '%s'\n"); return -1; } fread(source + strlen(source), sourcesize, 1, fp); fclose(fp); // Use 1: GPU 0: CPU int use_gpu = 1; // OpenCL initialization if(initialize(use_gpu)) return -1; // compile kernel cl_int err = 0; const char * slist[2] = { source, 0 }; cl_program prog = clCreateProgramWithSource(context, 1, slist, NULL, &err); if(err != CL_SUCCESS) { printf("ERROR: clCreateProgramWithSource() => %d\n", err); return -1; } char clOptions[110]; // sprintf(clOptions,"-I../../src"); sprintf(clOptions," "); #ifdef BLOCK_SIZE sprintf(clOptions + strlen(clOptions), " -DBLOCK_SIZE=%d", BLOCK_SIZE); #endif err = clBuildProgram(prog, 0, NULL, clOptions, NULL, NULL); { // show warnings/errors //static char log[65536]; memset(log, 0, sizeof(log)); //cl_device_id device_id = 0; //err = clGetContextInfo(context, CL_CONTEXT_DEVICES, sizeof(device_id), &device_id, NULL); //clGetProgramBuildInfo(prog, device_id, CL_PROGRAM_BUILD_LOG, sizeof(log)-1, log, NULL); //if(err || strstr(log,"warning:") || strstr(log, "error:")) printf("<<<<\n%s\n>>>>\n", log); } if(err != CL_SUCCESS) { printf("ERROR: clBuildProgram() => %d\n", err); return -1; } cl_kernel diagnal; cl_kernel perimeter; cl_kernel internal; diagnal = clCreateKernel(prog, kernel_lud_diag, &err); perimeter = clCreateKernel(prog, kernel_lud_peri, &err); internal = clCreateKernel(prog, kernel_lud_inter, &err); if(err != CL_SUCCESS) { printf("ERROR: clCreateKernel() 0 => %d\n", err); return -1; } clReleaseProgram(prog); //size_t local_work[3] = { 1, 1, 1 }; //size_t global_work[3] = {1, 1, 1 }; cl_mem d_m; d_m = clCreateBuffer(context, CL_MEM_READ_WRITE, matrix_dim*matrix_dim * sizeof(float), NULL, &err ); if(err != CL_SUCCESS) { printf("ERROR: clCreateBuffer d_m (size:%d) => %d\n", matrix_dim*matrix_dim, err); return -1;} /* beginning of timing point */ stopwatch_start(&sw); err = clEnqueueWriteBuffer(cmd_queue, d_m, 1, 0, matrix_dim*matrix_dim*sizeof(float), m, 0, 0, 0); if(err != CL_SUCCESS) { printf("ERROR: clEnqueueWriteBuffer d_m (size:%d) => %d\n", matrix_dim*matrix_dim, err); return -1; } int i=0; for (i=0; i < matrix_dim-BLOCK_SIZE; i += BLOCK_SIZE) { clSetKernelArg(diagnal, 0, sizeof(void *), (void*) &d_m); clSetKernelArg(diagnal, 1, sizeof(float) * BLOCK_SIZE * BLOCK_SIZE, (void*)NULL ); clSetKernelArg(diagnal, 2, sizeof(cl_int), (void*) &matrix_dim); clSetKernelArg(diagnal, 3, sizeof(cl_int), (void*) &i); size_t global_work1[3] = {BLOCK_SIZE, 1, 1}; size_t local_work1[3] = {BLOCK_SIZE, 1, 1}; err = clEnqueueNDRangeKernel(cmd_queue, diagnal, 2, NULL, global_work1, local_work1, 0, 0, 0); if(err != CL_SUCCESS) { printf("ERROR: diagnal clEnqueueNDRangeKernel()=>%d failed\n", err); return -1; } clSetKernelArg(perimeter, 0, sizeof(void *), (void*) &d_m); clSetKernelArg(perimeter, 1, sizeof(float) * BLOCK_SIZE * BLOCK_SIZE, (void*)NULL ); clSetKernelArg(perimeter, 2, sizeof(float) * BLOCK_SIZE * BLOCK_SIZE, (void*)NULL ); clSetKernelArg(perimeter, 3, sizeof(float) * BLOCK_SIZE * BLOCK_SIZE, (void*)NULL ); clSetKernelArg(perimeter, 4, sizeof(cl_int), (void*) &matrix_dim); clSetKernelArg(perimeter, 5, sizeof(cl_int), (void*) &i); size_t global_work2[3] = {BLOCK_SIZE * 2 * ((matrix_dim-i)/BLOCK_SIZE-1), 1, 1}; size_t local_work2[3] = {BLOCK_SIZE * 2, 1, 1}; err = clEnqueueNDRangeKernel(cmd_queue, perimeter, 2, NULL, global_work2, local_work2, 0, 0, 0); if(err != CL_SUCCESS) { printf("ERROR: perimeter clEnqueueNDRangeKernel()=>%d failed\n", err); return -1; } clSetKernelArg(internal, 0, sizeof(void *), (void*) &d_m); clSetKernelArg(internal, 1, sizeof(float) * BLOCK_SIZE * BLOCK_SIZE, (void*)NULL ); clSetKernelArg(internal, 2, sizeof(float) * BLOCK_SIZE * BLOCK_SIZE, (void*)NULL ); clSetKernelArg(internal, 3, sizeof(cl_int), (void*) &matrix_dim); clSetKernelArg(internal, 4, sizeof(cl_int), (void*) &i); size_t global_work3[3] = {BLOCK_SIZE * ((matrix_dim-i)/BLOCK_SIZE-1), BLOCK_SIZE * ((matrix_dim-i)/BLOCK_SIZE-1), 1}; size_t local_work3[3] = {BLOCK_SIZE, BLOCK_SIZE, 1}; err = clEnqueueNDRangeKernel(cmd_queue, internal, 2, NULL, global_work3, local_work3, 0, 0, 0); if(err != CL_SUCCESS) { printf("ERROR: internal clEnqueueNDRangeKernel()=>%d failed\n", err); return -1; } } clSetKernelArg(diagnal, 0, sizeof(void *), (void*) &d_m); clSetKernelArg(diagnal, 1, sizeof(float) * BLOCK_SIZE * BLOCK_SIZE, (void*)NULL ); clSetKernelArg(diagnal, 2, sizeof(cl_int), (void*) &matrix_dim); clSetKernelArg(diagnal, 3, sizeof(cl_int), (void*) &i); size_t global_work1[3] = {BLOCK_SIZE, 1, 1}; size_t local_work1[3] = {BLOCK_SIZE, 1, 1}; err = clEnqueueNDRangeKernel(cmd_queue, diagnal, 2, NULL, global_work1, local_work1, 0, 0, 0); if(err != CL_SUCCESS) { printf("ERROR: diagnal clEnqueueNDRangeKernel()=>%d failed\n", err); return -1; } err = clEnqueueReadBuffer(cmd_queue, d_m, 1, 0, matrix_dim*matrix_dim*sizeof(float), m, 0, 0, 0); if(err != CL_SUCCESS) { printf("ERROR: clEnqueueReadBuffer d_m (size:%d) => %d\n", matrix_dim*matrix_dim, err); return -1; } clFinish(cmd_queue); /* end of timing point */ stopwatch_stop(&sw); printf("Time consumed(ms): %lf\n", 1000*get_interval_by_sec(&sw)); clReleaseMemObject(d_m); if (do_verify){ printf("After LUD\n"); // print_matrix(m, matrix_dim); printf(">>>Verify<<<<\n"); lud_verify(mm, m, matrix_dim); free(mm); } free(m); if(shutdown()) return -1; }
int main (int argc, char *argv[]) { int matrix_dim = 32; /* default matrix_dim */ int opt, option_index = 0; func_ret_t ret; const char *input_file = NULL; const char *cubin_file = NULL; float *m, *mm; struct timeval tv; CUdeviceptr d_m; CUcontext ctx; CUmodule mod; CUresult res; while ((opt = getopt_long(argc, argv, "::vs:i:c:", long_options, &option_index)) != -1 ) { switch(opt) { case 'c': cubin_file = optarg; break; case 'i': input_file = optarg; break; case 'v': do_verify = 1; break; case 's': matrix_dim = atoi(optarg); fprintf(stderr, "Currently not supported, use -i instead\n"); fprintf(stderr, "Usage: %s [-v] [-s matrix_size|-i input_file|-c cubin]\n", argv[0]); exit(EXIT_FAILURE); case '?': fprintf(stderr, "invalid option\n"); break; case ':': fprintf(stderr, "missing argument\n"); break; default: fprintf(stderr, "Usage: %s [-v] [-s matrix_size|-i input_file|-c cubin]\n", argv[0]); exit(EXIT_FAILURE); } } if ( (optind < argc) || (optind == 1)) { fprintf(stderr, "Usage: %s [-v] [-s matrix_size|-i input_file|-c cubin]\n", argv[0]); exit(EXIT_FAILURE); } if (!cubin_file) { printf("No cubin file specified!\n"); exit(EXIT_FAILURE); } if (input_file) { printf("Reading matrix from file %s\n", input_file); ret = create_matrix_from_file(&m, input_file, &matrix_dim); if (ret != RET_SUCCESS) { m = NULL; fprintf(stderr, "error create matrix from file %s\n", input_file); exit(EXIT_FAILURE); } } else { printf("No input file specified!\n"); exit(EXIT_FAILURE); } if (do_verify){ print_matrix(m, matrix_dim); matrix_duplicate(m, &mm, matrix_dim); } /* * call our common CUDA initialization utility function. */ res = cuda_driver_api_init(&ctx, &mod, cubin_file); if (res != CUDA_SUCCESS) { printf("cuda_driver_api_init failed: res = %u\n", res); return -1; } res = cuMemAlloc(&d_m, matrix_dim * matrix_dim * sizeof(float)); if (res != CUDA_SUCCESS) { printf("cuMemAlloc failed\n"); return -1; } /* * measurement start! */ time_measure_start(&tv); res = cuMemcpyHtoD(d_m, m, matrix_dim * matrix_dim * sizeof(float)); if (res != CUDA_SUCCESS) { printf("cuMemcpyHtoD (a) failed: res = %u\n", res); return -1; } lud_launch(mod, d_m, matrix_dim); res = cuMemcpyDtoH(m, d_m, matrix_dim * matrix_dim * sizeof(float)); if (res != CUDA_SUCCESS) { printf("cuMemcpyDtoH failed: res = %u\n", res); return -1; } /* * measurement end! will print out the time. */ time_measure_end(&tv); res = cuMemFree(d_m); if (res != CUDA_SUCCESS) { printf("cuMemFree failed: res = %u\n", res); return -1; } res = cuda_driver_api_exit(ctx, mod); if (res != CUDA_SUCCESS) { printf("cuda_driver_api_exit faild: res = %u\n", res); return -1; } if (do_verify){ print_matrix(m, matrix_dim); printf(">>>Verify<<<<\n"); lud_verify(mm, m, matrix_dim); free(mm); } free(m); return EXIT_SUCCESS; } /* ---------- end of function main ---------- */