int main(int argc, char** argv) { cl_int err; int usegpu = USEGPU; int do_verify = 0; int opt, option_index=0; unsigned int correct; size_t global_size; size_t local_size; cl_device_id device_id; cl_context context; cl_command_queue commands; cl_program program; cl_kernel kernel; stopwatch sw; cl_mem csr_ap; cl_mem csr_aj; cl_mem csr_ax; cl_mem x_loc; cl_mem y_loc; FILE *kernelFile; char *kernelSource; size_t kernelLength; size_t lengthRead; ocd_init(&argc, &argv, NULL); ocd_options opts = ocd_get_options(); platform_id = opts.platform_id; n_device = opts.device_id; while ((opt = getopt_long(argc, argv, "::vc::", long_options, &option_index)) != -1 ) { switch(opt){ //case 'i': //input_file = optarg; //break; case 'v': fprintf(stderr, "verify\n"); do_verify = 1; break; case 'c': fprintf(stderr, "using cpu\n"); usegpu = 0; break; default: fprintf(stderr, "Usage: %s [-v Warning: lots of output] [-c use CPU]\n", argv[0]); exit(EXIT_FAILURE); } } /* Fill input set with random float values */ int i; csr_matrix csr; csr = laplacian_5pt(512); int k = 0; for(k = 0; k < csr.num_nonzeros; k++){ csr.Ax[k] = 1.0 - 2.0 * (rand() / (RAND_MAX + 1.0)); } //The other arrays float * x_host = float_new_array(csr.num_cols); float * y_host = float_new_array(csr.num_rows); unsigned int ii; for(ii = 0; ii < csr.num_cols; ii++){ x_host[ii] = rand() / (RAND_MAX + 1.0); } for(ii = 0; ii < csr.num_rows; ii++){ y_host[ii] = rand() / (RAND_MAX + 2.0); } /* Retrieve an OpenCL platform */ device_id = GetDevice(platform_id, n_device); /* Create a compute context */ context = clCreateContext(0, 1, &device_id, NULL, NULL, &err); CHKERR(err, "Failed to create a compute context!"); /* Create a command queue */ commands = clCreateCommandQueue(context, device_id, CL_QUEUE_PROFILING_ENABLE, &err); CHKERR(err, "Failed to create a command queue!"); /* Load kernel source */ kernelFile = fopen("spmv_csr_kernel.cl", "r"); fseek(kernelFile, 0, SEEK_END); kernelLength = (size_t) ftell(kernelFile); kernelSource = (char *) malloc(sizeof(char)*kernelLength); rewind(kernelFile); lengthRead = fread((void *) kernelSource, kernelLength, 1, kernelFile); fclose(kernelFile); /* Create the compute program from the source buffer */ program = clCreateProgramWithSource(context, 1, (const char **) &kernelSource, &kernelLength, &err); CHKERR(err, "Failed to create a compute program!"); /* Free kernel source */ free(kernelSource); /* Build the program executable */ err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); if (err == CL_BUILD_PROGRAM_FAILURE) { char *buildLog; size_t logLen; err = clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &logLen); buildLog = (char *) malloc(sizeof(char)*logLen); err = clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, logLen, (void *) buildLog, NULL); fprintf(stderr, "CL Error %d: Failed to build program! Log:\n%s", err, buildLog); free(buildLog); exit(1); } CHKERR(err, "Failed to build program!"); /* Create the compute kernel in the program we wish to run */ kernel = clCreateKernel(program, "csr", &err); CHKERR(err, "Failed to create a compute kernel!"); /* Create the input and output arrays in device memory for our calculation */ csr_ap = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(unsigned int)*csr.num_rows+4, NULL, &err); CHKERR(err, "Failed to allocate device memory!"); csr_aj = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(unsigned int)*csr.num_nonzeros, NULL, &err); CHKERR(err, "Failed to allocate device memory!"); csr_ax = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float)*csr.num_nonzeros, NULL, &err); CHKERR(err, "Failed to allocate device memory!"); x_loc = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float)*csr.num_cols, NULL, &err); CHKERR(err, "Failed to allocate device memory!"); y_loc = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float)*csr.num_rows, NULL, &err); CHKERR(err, "Failed to allocate device memory!"); /* beginning of timing point */ stopwatch_start(&sw); /* Write our data set into the input array in device memory */ err = clEnqueueWriteBuffer(commands, csr_ap, CL_TRUE, 0, sizeof(unsigned int)*csr.num_rows+4, csr.Ap, 0, NULL, &ocdTempEvent); clFinish(commands); START_TIMER(ocdTempEvent, OCD_TIMER_H2D, "CSR Data Copy", ocdTempTimer) END_TIMER(ocdTempTimer) CHKERR(err, "Failed to write to source array!"); err = clEnqueueWriteBuffer(commands, csr_aj, CL_TRUE, 0, sizeof(unsigned int)*csr.num_nonzeros, csr.Aj, 0, NULL, &ocdTempEvent); clFinish(commands); START_TIMER(ocdTempEvent, OCD_TIMER_H2D, "CSR Data Copy", ocdTempTimer) END_TIMER(ocdTempTimer) CHKERR(err, "Failed to write to source array!"); err = clEnqueueWriteBuffer(commands, csr_ax, CL_TRUE, 0, sizeof(float)*csr.num_nonzeros, csr.Ax, 0, NULL, &ocdTempEvent); clFinish(commands); START_TIMER(ocdTempEvent, OCD_TIMER_H2D, "CSR Data Copy", ocdTempTimer) END_TIMER(ocdTempTimer) CHKERR(err, "Failed to write to source array!"); err = clEnqueueWriteBuffer(commands, x_loc, CL_TRUE, 0, sizeof(float)*csr.num_cols, x_host, 0, NULL, &ocdTempEvent); clFinish(commands); START_TIMER(ocdTempEvent, OCD_TIMER_H2D, "CSR Data Copy", ocdTempTimer) END_TIMER(ocdTempTimer) CHKERR(err, "Failed to write to source array!"); err = clEnqueueWriteBuffer(commands, y_loc, CL_TRUE, 0, sizeof(float)*csr.num_rows, y_host, 0, NULL, &ocdTempEvent); clFinish(commands); START_TIMER(ocdTempEvent, OCD_TIMER_H2D, "CSR Data Copy", ocdTempTimer) CHKERR(err, "Failed to write to source array!"); END_TIMER(ocdTempTimer) /* Set the arguments to our compute kernel */ err = 0; err = clSetKernelArg(kernel, 0, sizeof(unsigned int), &csr.num_rows); err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &csr_ap); err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &csr_aj); err |= clSetKernelArg(kernel, 3, sizeof(cl_mem), &csr_ax); err |= clSetKernelArg(kernel, 4, sizeof(cl_mem), &x_loc); err |= clSetKernelArg(kernel, 5, sizeof(cl_mem), &y_loc); CHKERR(err, "Failed to set kernel arguments!"); /* Get the maximum work group size for executing the kernel on the device */ err = clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), (void *) &local_size, NULL); CHKERR(err, "Failed to retrieve kernel work group info!"); /* Execute the kernel over the entire range of our 1d input data set */ /* using the maximum number of work group items for this device */ global_size = csr.num_rows; err = clEnqueueNDRangeKernel(commands, kernel, 1, NULL, &global_size, &local_size, 0, NULL, &ocdTempEvent); clFinish(commands); START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "CSR Kernel", ocdTempTimer) END_TIMER(ocdTempTimer) CHKERR(err, "Failed to execute kernel!"); /* Wait for the command commands to get serviced before reading back results */ float output[csr.num_rows]; /* Read back the results from the device to verify the output */ err = clEnqueueReadBuffer(commands, y_loc, CL_TRUE, 0, sizeof(float)*csr.num_rows, output, 0, NULL, &ocdTempEvent); clFinish(commands); START_TIMER(ocdTempEvent, OCD_TIMER_D2H, "CSR Data Copy", ocdTempTimer) END_TIMER(ocdTempTimer) CHKERR(err, "Failed to read output array!"); /* end of timing point */ stopwatch_stop(&sw); printf("Time consumed(ms): %lf Gflops: %f \n", 1000*get_interval_by_sec(&sw), (2.0 * (double) csr.num_nonzeros / get_interval_by_sec(&sw)) / 1e9); /* Validate our results */ if(do_verify){ for (i = 0; i < csr.num_rows; i++){ printf("row: %d output: %f \n", i, output[i]); } } int row = 0; float sum = 0; int row_start = 0; int row_end = 0; for(row =0; row < csr.num_rows; row++){ sum = y_host[row]; row_start = csr.Ap[row]; row_end = csr.Ap[row+1]; unsigned int jj = 0; for (jj = row_start; jj < row_end; jj++){ sum += csr.Ax[jj] * x_host[csr.Aj[jj]]; } y_host[row] = sum; } for (i = 0; i < csr.num_rows; i++){ if((fabsf(y_host[i]) - fabsf(output[i])) > .001) printf("Possible error, difference greater then .001 at row %d \n", i); } /* Print a brief summary detailing the results */ ocd_finalize(); /* Shutdown and cleanup */ clReleaseMemObject(csr_ap); clReleaseMemObject(csr_aj); clReleaseMemObject(csr_ax); clReleaseMemObject(x_loc); clReleaseMemObject(y_loc); clReleaseProgram(program); clReleaseKernel(kernel); clReleaseCommandQueue(commands); clReleaseContext(context); return 0; }
csr_matrix rand_csr(const unsigned int N,const unsigned int density, const double normal_stddev,unsigned long* seed,FILE* log) { unsigned int i,j,nnz_ith_row,nnz,update_interval,rand_col; double nnz_ith_row_double,nz_error,nz_per_row_doubled,high_bound; int kn[128]; float fn[128],wn[128]; char* used_cols; csr_matrix csr; csr.num_rows = N; csr.num_cols = N; csr.density_perc = (((double)(density))/10000.0); csr.nz_per_row = (((double)N)*((double)density))/1000000.0; csr.num_nonzeros = round(csr.nz_per_row*N); csr.stddev = normal_stddev * csr.nz_per_row; //scale normalized standard deviation by average NZ/row fprintf(log,"Average NZ/Row: %-8.3f\n",csr.nz_per_row); fprintf(log,"Standard Deviation: %-8.3f\n",csr.stddev); fprintf(log,"Target Density: %u ppm = %g%%\n",density,csr.density_perc); fprintf(log,"Approximate NUM_nonzeros: %d\n",csr.num_nonzeros); csr.Ap = (unsigned int *) int_new_array(csr.num_rows+1,"rand_csr() - Heap Overflow! Cannot Allocate Space for csr.Ap"); csr.Aj = (unsigned int *) int_new_array(csr.num_nonzeros,"rand_csr() - Heap Overflow! Cannot Allocate Space for csr.Aj"); csr.Ap[0] = 0; nnz = 0; nz_per_row_doubled = 2*csr.nz_per_row; //limit nnz_ith_row to double the average because negative values are rounded up to 0. This high_bound = MINIMUM(csr.num_cols,nz_per_row_doubled); //limitation ensures the distribution will be symmetric about the mean, albeit not truly normal. used_cols = (char *) malloc(csr.num_cols*sizeof(char)); check(used_cols != NULL,"rand_csr() - Heap Overflow! Cannot allocate space for used_cols"); r4_nor_setup(kn,fn,wn); srand(*seed); update_interval = round(csr.num_rows / 10.0); if(!update_interval) update_interval = csr.num_rows; for(i=0; i<csr.num_rows; i++) { if(i % update_interval == 0) fprintf(log,"\t%d of %d (%5.1f%%) Rows Generated. Continuing...\n",i,csr.num_rows,((double)(i))/csr.num_rows*100); nnz_ith_row_double = r4_nor(seed,kn,fn,wn); //random, normally-distributed value for # of nz elements in ith row, NORMALIZED nnz_ith_row_double *= csr.stddev; //scale by standard deviation nnz_ith_row_double += csr.nz_per_row; //add average nz/row if(nnz_ith_row_double < 0) nnz_ith_row = 0; else if(nnz_ith_row_double > high_bound) nnz_ith_row = high_bound; else nnz_ith_row = (unsigned int) round(nnz_ith_row_double); csr.Ap[i+1] = csr.Ap[i] + nnz_ith_row; if(csr.Ap[i+1] > csr.num_nonzeros) csr.Aj = (unsigned int *) realloc(csr.Aj,sizeof(unsigned int)*csr.Ap[i+1]); for(j=0; j<csr.num_cols; j++) used_cols[j] = 0; for(j=0; j<nnz_ith_row; j++) { rand_col = abs(gen_rand(0,csr.num_cols - 1)); //unsigned long is always non-negative if(used_cols[rand_col]) { j--; } else { csr.Aj[csr.Ap[i]+j] = rand_col; used_cols[rand_col] = 1; } } qsort((&(csr.Aj[csr.Ap[i]])),nnz_ith_row,sizeof(unsigned int),unsigned_int_comparator); } nz_error = ((double)abs((signed int)(csr.num_nonzeros - csr.Ap[csr.num_rows]))) / ((double)csr.num_nonzeros); if(nz_error >= .05) fprintf(stderr,"WARNING: Actual NNZ differs from Theoretical NNZ by %5.2f%%!\n",nz_error*100); csr.num_nonzeros = csr.Ap[csr.num_rows]; fprintf(log,"Actual NUM_nonzeros: %d\n",csr.num_nonzeros); csr.density_perc = (((double)csr.num_nonzeros)*100.0)/((double)csr.num_cols)/((double)csr.num_rows); csr.density_ppm = (unsigned int)round(csr.density_perc * 10000.0); fprintf(log,"Actual Density: %u ppm = %g%%\n",csr.density_ppm,csr.density_perc); free(used_cols); csr.Ax = (float *) float_new_array(csr.num_nonzeros,"rand_csr() - Heap Overflow! Cannot Allocate Space for csr.Ax"); for(i=0; i<csr.num_nonzeros; i++) { csr.Ax[i] = 1.0 - 2.0 * common_randJS(); while(csr.Ax[i] == 0.0) csr.Ax[i] = 1.0 - 2.0 * common_randJS(); } return csr; }