void spmv_csr_ocl(csr_matrix<int, float>* mat, float* vec, float* result, int dim2Size, double& opttime, double& optflop, int& optmethod, char* oclfilename, cl_device_type deviceType, int ntimes, double* floptable, int groupnum) { cl_device_id* devices = NULL; cl_context context = NULL; cl_command_queue cmdQueue = NULL; cl_program program = NULL; assert(initialization(deviceType, devices, &context, &cmdQueue, &program, oclfilename) == 1); cl_int errorCode = CL_SUCCESS; //Create device memory objects cl_mem devRowPtr; cl_mem devColId; cl_mem devData; cl_mem devVec; cl_mem devTexVec; cl_mem devRes; //Initialize values int nnz = mat->matinfo.nnz; int vecsize = mat->matinfo.width; int rownum = mat->matinfo.height; int rowptrsize = rownum + 1; ALLOCATE_GPU_READ(devRowPtr, mat->csr_row_ptr, sizeof(int)*rowptrsize); ALLOCATE_GPU_READ(devColId, mat->csr_col_id, sizeof(int)*nnz); ALLOCATE_GPU_READ(devData, mat->csr_data, sizeof(float)*nnz); ALLOCATE_GPU_READ(devVec, vec, sizeof(float)*vecsize); int paddedres = findPaddedSize(rownum, 16); devRes = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float)*paddedres, NULL, &errorCode); CHECKERROR; //errorCode = clEnqueueWriteBuffer(cmdQueue, devRes, CL_TRUE, 0, sizeof(float)*rownum, result, 0, NULL, NULL); CHECKERROR; const cl_image_format floatFormat = { CL_R, CL_FLOAT, }; int width = VEC2DWIDTH; int height = (vecsize + VEC2DWIDTH - 1)/VEC2DWIDTH; float* image2dVec = (float*)malloc(sizeof(float)*width*height); memset(image2dVec, 0, sizeof(float)*width*height); for (int i = 0; i < vecsize; i++) { image2dVec[i] = vec[i]; } size_t origin[] = {0, 0, 0}; size_t vectorSize[] = {width, height, 1}; devTexVec = clCreateImage2D(context, CL_MEM_READ_ONLY, &floatFormat, width, height, 0, NULL, &errorCode); CHECKERROR; errorCode = clEnqueueWriteImage(cmdQueue, devTexVec, CL_TRUE, origin, vectorSize, 0, 0, image2dVec, 0, NULL, NULL); CHECKERROR; clFinish(cmdQueue); opttime = 10000.0f; optmethod = 0; int dim2 = dim2Size; { int methodid = 0; cl_mem devRowPtrPad; int padrowsize = findPaddedSize(rownum, CSR_VEC_GROUP_SIZE/WARPSIZE); int* rowptrpad = (int*)malloc(sizeof(int)*(padrowsize+1)); memset(rowptrpad, 0, sizeof(int)*(padrowsize+1)); for (int i = 0; i <= mat->matinfo.height; i++) rowptrpad[i] = mat->csr_row_ptr[i]; ALLOCATE_GPU_READ(devRowPtrPad, rowptrpad, sizeof(int)*(padrowsize+1)); clFinish(cmdQueue); printf("\nRow Num %d padded size %d\n", rownum, padrowsize); cl_uint work_dim = 2; //int dim2 = 16; size_t blocksize[] = {CSR_VEC_GROUP_SIZE, 1}; cl_kernel csrKernel = NULL; csrKernel = clCreateKernel(program, "gpu_csr_ve_slm_pm_fs", &errorCode); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 0, sizeof(cl_mem), &devRowPtrPad); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 1, sizeof(cl_mem), &devColId); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 2, sizeof(cl_mem), &devData); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 3, sizeof(cl_mem), &devVec); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 4, sizeof(cl_mem), &devRes); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 5, sizeof(int), &rownum); CHECKERROR; { size_t globalsize[] = {groupnum * CSR_VEC_GROUP_SIZE, dim2}; for (int k = 0; k < 3; k++) { errorCode = clEnqueueNDRangeKernel(cmdQueue, csrKernel, work_dim, NULL, globalsize, blocksize, 0, NULL, NULL); CHECKERROR; } clFinish(cmdQueue); double teststart = timestamp(); for (int i = 0; i < ntimes; i++) { errorCode = clEnqueueNDRangeKernel(cmdQueue, csrKernel, work_dim, NULL, globalsize, blocksize, 0, NULL, NULL); CHECKERROR; } clFinish(cmdQueue); double testend = timestamp(); double time_in_sec = (testend - teststart)/(double)dim2; double gflops = (double)nnz*2/(time_in_sec/(double)ntimes)/(double)1e9; printf("\nCSR vector SLM row ptr padded mat strided rows fixed size:%d cpu time %lf ms GFLOPS %lf code %d \n\n", groupnum * CSR_VEC_GROUP_SIZE, time_in_sec / (double) ntimes * 1000, gflops, methodid); double onetime = time_in_sec / (double) ntimes; floptable[methodid] = gflops; if (onetime < opttime) { opttime = onetime; optmethod = methodid; optflop = gflops; } } if (devRowPtrPad) clReleaseMemObject(devRowPtrPad); if (csrKernel) clReleaseKernel(csrKernel); free(rowptrpad); } { int methodid = 1; cl_mem devRowPtrPad; int padrowsize = findPaddedSize(rownum, CSR_VEC_GROUP_SIZE/WARPSIZE); int* rowptrpad = (int*)malloc(sizeof(int)*(padrowsize+1)); memset(rowptrpad, 0, sizeof(int)*(padrowsize+1)); for (int i = 0; i <= mat->matinfo.height; i++) rowptrpad[i] = mat->csr_row_ptr[i]; ALLOCATE_GPU_READ(devRowPtrPad, rowptrpad, sizeof(int)*(padrowsize+1)); clFinish(cmdQueue); printf("\nRow Num %d padded size %d\n", rownum, padrowsize); cl_uint work_dim = 2; //int dim2 = 16; size_t blocksize[] = {CSR_VEC_GROUP_SIZE, 1}; cl_kernel csrKernel = NULL; csrKernel = clCreateKernel(program, "gpu_csr_ve_reduction_fs", &errorCode); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 0, sizeof(cl_mem), &devRowPtrPad); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 1, sizeof(cl_mem), &devColId); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 2, sizeof(cl_mem), &devData); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 3, sizeof(cl_mem), &devVec); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 4, sizeof(cl_mem), &devRes); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 5, sizeof(int), &rownum); CHECKERROR; { size_t globalsize[] = {groupnum * CSR_VEC_GROUP_SIZE, dim2}; for (int k = 0; k < 3; k++) { errorCode = clEnqueueNDRangeKernel(cmdQueue, csrKernel, work_dim, NULL, globalsize, blocksize, 0, NULL, NULL); CHECKERROR; } clFinish(cmdQueue); double teststart = timestamp(); for (int i = 0; i < ntimes; i++) { errorCode = clEnqueueNDRangeKernel(cmdQueue, csrKernel, work_dim, NULL, globalsize, blocksize, 0, NULL, NULL); CHECKERROR; } clFinish(cmdQueue); double testend = timestamp(); double time_in_sec = (testend - teststart)/(double)dim2; double gflops = (double)nnz*2/(time_in_sec/(double)ntimes)/(double)1e9; printf("\nCSR vector SLM row ptr padded mat strided rows fixed size:%d cpu time %lf ms GFLOPS %lf code %d \n\n", groupnum * CSR_VEC_GROUP_SIZE, time_in_sec / (double) ntimes * 1000, gflops, methodid); double onetime = time_in_sec / (double) ntimes; floptable[methodid] = gflops; if (onetime < opttime) { opttime = onetime; optmethod = methodid; optflop = gflops; } } if (devRowPtrPad) clReleaseMemObject(devRowPtrPad); if (csrKernel) clReleaseKernel(csrKernel); free(rowptrpad); } //Clean up if (image2dVec) free(image2dVec); if (devRowPtr) clReleaseMemObject(devRowPtr); if (devColId) clReleaseMemObject(devColId); if (devData) clReleaseMemObject(devData); if (devVec) clReleaseMemObject(devVec); if (devTexVec) clReleaseMemObject(devTexVec); if (devRes) clReleaseMemObject(devRes); freeObjects(devices, &context, &cmdQueue, &program); }
void spmv_coo_ocl(coo_matrix<int, float>* mat, float* vec, float* result, int dim2Size, double& opttime, double& optflop, int& optmethod, char* oclfilename, cl_device_type deviceType, int ntimes, double* floptable, int maxgroupnum) { for (int i = 0; i < mat->matinfo.height; i++) result[i] = 0.0f; cl_device_id* devices = NULL; cl_context context = NULL; cl_command_queue cmdQueue = NULL; cl_program program = NULL; assert(initialization(deviceType, devices, &context, &cmdQueue, &program, oclfilename) == 1); cl_int errorCode = CL_SUCCESS; //Create device memory objects cl_mem devRowid; cl_mem devColid; cl_mem devData; cl_mem devVec; cl_mem devRes; cl_mem devTexVec; cl_mem devTmpRow; cl_mem devTmpData; //Initialize values int nnz = mat->matinfo.nnz; int rownum = mat->matinfo.height; int vecsize = mat->matinfo.width; int num_units = nnz / COO_GROUP_SIZE; if (nnz % COO_GROUP_SIZE != 0) num_units++; int group_num = (num_units < maxgroupnum) ? num_units : maxgroupnum; int work_size = group_num * COO_GROUP_SIZE; int num_iters = nnz / work_size; if (nnz % work_size != 0) num_iters++; int process_size = num_iters * COO_GROUP_SIZE; int active_warp = num_units / num_iters; if (num_units % num_iters != 0) active_warp++; int paddedNNZ = findPaddedSize(nnz, COO_ALIGNMENT); int* paddedRow = (int*)malloc(sizeof(int)*paddedNNZ); int* paddedCol = (int*)malloc(sizeof(int)*paddedNNZ); float* paddedData = (float*)malloc(sizeof(float)*paddedNNZ); memcpy(paddedRow, mat->coo_row_id, sizeof(int)*nnz); memcpy(paddedCol, mat->coo_col_id, sizeof(int)*nnz); memcpy(paddedData, mat->coo_data, sizeof(float)*nnz); for (int i = nnz; i < paddedNNZ; i++) { paddedRow[i] = mat->coo_row_id[nnz - 1]; paddedCol[i] = mat->coo_col_id[nnz - 1]; paddedData[i] = 0.0f; } ALLOCATE_GPU_READ(devRowid, paddedRow, sizeof(int)*paddedNNZ); ALLOCATE_GPU_READ(devColid, paddedCol, sizeof(int)*paddedNNZ); ALLOCATE_GPU_READ(devData, paddedData, sizeof(float)*paddedNNZ); ALLOCATE_GPU_READ(devVec, vec, sizeof(float)*vecsize); int paddedres = findPaddedSize(rownum, 512); devRes = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float)*paddedres, NULL, &errorCode); CHECKERROR; errorCode = clEnqueueWriteBuffer(cmdQueue, devRes, CL_TRUE, 0, sizeof(float)*rownum, result, 0, NULL, NULL); CHECKERROR; devTmpRow = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(int)*maxgroupnum, NULL, &errorCode); CHECKERROR; devTmpData = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float)*maxgroupnum, NULL, &errorCode); CHECKERROR; const cl_image_format floatFormat = { CL_R, CL_FLOAT, }; int width = VEC2DWIDTH; int height = (vecsize + VEC2DWIDTH - 1)/VEC2DWIDTH; float* image2dVec = (float*)malloc(sizeof(float)*width*height); memset(image2dVec, 0, sizeof(float)*width*height); for (int i = 0; i < vecsize; i++) { image2dVec[i] = vec[i]; } size_t origin[] = {0, 0, 0}; size_t vectorSize[] = {width, height, 1}; devTexVec = clCreateImage2D(context, CL_MEM_READ_ONLY, &floatFormat, width, height, 0, NULL, &errorCode); CHECKERROR; errorCode = clEnqueueWriteImage(cmdQueue, devTexVec, CL_TRUE, origin, vectorSize, 0, 0, image2dVec, 0, NULL, NULL); CHECKERROR; clFinish(cmdQueue); opttime = 10000.0f; optmethod = 0; int dim2 = dim2Size; { int methodid = 0; cl_uint work_dim = 2; size_t blocksize[] = {COO_GROUP_SIZE, 1}; int gsize = group_num * COO_GROUP_SIZE; size_t globalsize[] = {gsize, dim2}; cl_kernel csrKernel = NULL; csrKernel = clCreateKernel(program, "gpu_coo_s1", &errorCode); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 0, sizeof(cl_mem), &devRowid); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 1, sizeof(cl_mem), &devColid); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 2, sizeof(cl_mem), &devData); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 3, sizeof(int), &process_size); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 4, sizeof(int), &paddedNNZ); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 5, sizeof(cl_mem), &devVec); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 6, sizeof(cl_mem), &devRes); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 7, sizeof(cl_mem), &devTmpRow); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 8, sizeof(cl_mem), &devTmpData); CHECKERROR; printf("process size %d nnz %d gsize %d active_warp %d\n", process_size, paddedNNZ, gsize, active_warp); size_t blocksize2[] = {COO_GROUP_SIZE * 2, 1}; size_t globalsize2[] = {COO_GROUP_SIZE * 2, dim2}; cl_kernel csrKernel2 = NULL; csrKernel2 = clCreateKernel(program, "gpu_coo_s2", &errorCode); CHECKERROR; errorCode = clSetKernelArg(csrKernel2, 0, sizeof(cl_mem), &devTmpRow); CHECKERROR; errorCode = clSetKernelArg(csrKernel2, 1, sizeof(cl_mem), &devTmpData); CHECKERROR; errorCode = clSetKernelArg(csrKernel2, 2, sizeof(int), &active_warp); CHECKERROR; errorCode = clSetKernelArg(csrKernel2, 3, sizeof(cl_mem), &devRes); CHECKERROR; for (int k = 0; k < 3; k++) { errorCode = clEnqueueNDRangeKernel(cmdQueue, csrKernel, work_dim, NULL, globalsize, blocksize, 0, NULL, NULL); CHECKERROR; } clFinish(cmdQueue); //int* tmpRow = (int*)malloc(sizeof(int)*maxgroupnum); //float* tmpData = (float*)malloc(sizeof(float)*maxgroupnum); double teststart = timestamp(); for (int i = 0; i < ntimes; i++) { errorCode = clEnqueueNDRangeKernel(cmdQueue, csrKernel, work_dim, NULL, globalsize, blocksize, 0, NULL, NULL); CHECKERROR; errorCode = clEnqueueNDRangeKernel(cmdQueue, csrKernel2, work_dim, NULL, globalsize2, blocksize2, 0, NULL, NULL); CHECKERROR; } clFinish(cmdQueue); double testend = timestamp(); double time_in_sec = (testend - teststart)/(double)dim2; double gflops = (double)nnz*2/(time_in_sec/(double)ntimes)/(double)1e9; printf("\nCOO cpu time %lf ms GFLOPS %lf code %d \n\n", time_in_sec / (double) ntimes * 1000, gflops, methodid); if (csrKernel) clReleaseKernel(csrKernel); if (csrKernel2) clReleaseKernel(csrKernel2); double onetime = time_in_sec / (double) ntimes; floptable[methodid] = gflops; if (onetime < opttime) { opttime = onetime; optmethod = methodid; optflop = gflops; } //for (int i = 0; i < active_warp; i++) //printf("Row %d Data %f\n", tmpRow[i], tmpData[i]); } //Clean up if (paddedRow) free(paddedRow); if (paddedCol) free(paddedCol); if (paddedData) free(paddedData); if (image2dVec) free(image2dVec); if (devRowid) clReleaseMemObject(devRowid); if (devColid) clReleaseMemObject(devColid); if (devData) clReleaseMemObject(devData); if (devVec) clReleaseMemObject(devVec); if (devTexVec) clReleaseMemObject(devTexVec); if (devRes) clReleaseMemObject(devRes); if (devTmpRow) clReleaseMemObject(devTmpRow); if (devTmpData) clReleaseMemObject(devTmpData); freeObjects(devices, &context, &cmdQueue, &program); }
void spmv_sell_ocl(sell_matrix<int, float>* mat, float* vec, float* result, int dim2Size, double& opttime, double& optflop, int& optmethod, char* oclfilename, cl_device_type deviceType, int ntimes, double* floptable) { cl_device_id* devices = NULL; cl_context context = NULL; cl_command_queue cmdQueue = NULL; cl_program program = NULL; assert(initialization(deviceType, devices, &context, &cmdQueue, &program, oclfilename) == 1); cl_int errorCode = CL_SUCCESS; //Create device memory objects cl_mem devSlicePtr; cl_mem devColid; cl_mem devData; cl_mem devVec; cl_mem devRes; cl_mem devTexVec; //Initialize values int nnz = mat->matinfo.nnz; int rownum = mat->matinfo.height; int vecsize = mat->matinfo.width; int sliceheight = mat->sell_slice_height; int slicenum = mat->sell_slice_num; int datasize = mat->sell_slice_ptr[slicenum]; ALLOCATE_GPU_READ(devSlicePtr, mat->sell_slice_ptr, sizeof(int)*(slicenum + 1)); ALLOCATE_GPU_READ(devColid, mat->sell_col_id, sizeof(int)*datasize); ALLOCATE_GPU_READ(devData, mat->sell_data, sizeof(float)*datasize); ALLOCATE_GPU_READ(devVec, vec, sizeof(float)*vecsize); int paddedres = findPaddedSize(rownum, 512); devRes = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float)*paddedres, NULL, &errorCode); CHECKERROR; errorCode = clEnqueueWriteBuffer(cmdQueue, devRes, CL_TRUE, 0, sizeof(float)*rownum, result, 0, NULL, NULL); CHECKERROR; const cl_image_format floatFormat = { CL_R, CL_FLOAT, }; int width = VEC2DWIDTH; int height = (vecsize + VEC2DWIDTH - 1)/VEC2DWIDTH; float* image2dVec = (float*)malloc(sizeof(float)*width*height); memset(image2dVec, 0, sizeof(float)*width*height); for (int i = 0; i < vecsize; i++) { image2dVec[i] = vec[i]; } size_t origin[] = {0, 0, 0}; size_t vectorSize[] = {width, height, 1}; devTexVec = clCreateImage2D(context, CL_MEM_READ_ONLY, &floatFormat, width, height, 0, NULL, &errorCode); CHECKERROR; errorCode = clEnqueueWriteImage(cmdQueue, devTexVec, CL_TRUE, origin, vectorSize, 0, 0, image2dVec, 0, NULL, NULL); CHECKERROR; clFinish(cmdQueue); //printf("\nvec length %d padded length %d", mat->matinfo.width, padveclength); int dim2 = dim2Size; if (sliceheight == WARPSIZE) { int methodid = 0; cl_uint work_dim = 2; size_t blocksize[] = {SELL_GROUP_SIZE, 1}; int gsize = ((rownum + SELL_GROUP_SIZE - 1)/SELL_GROUP_SIZE)*SELL_GROUP_SIZE; size_t globalsize[] = {gsize, dim2}; //printf("gsize %d rownum %d slicenum %d sliceheight %d datasize %d nnz %d vecsize %d \n", gsize, rownum, slicenum, sliceheight, datasize, nnz, vecsize); //int warpnum = SELL_GROUP_SIZE / WARPSIZE; cl_kernel csrKernel = NULL; csrKernel = clCreateKernel(program, "gpu_sell_warp", &errorCode); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 0, sizeof(cl_mem), &devSlicePtr); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 1, sizeof(cl_mem), &devColid); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 2, sizeof(cl_mem), &devData); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 3, sizeof(cl_mem), &devVec); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 4, sizeof(cl_mem), &devRes); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 5, sizeof(int), &slicenum); CHECKERROR; for (int k = 0; k < 3; k++) { errorCode = clEnqueueNDRangeKernel(cmdQueue, csrKernel, work_dim, NULL, globalsize, blocksize, 0, NULL, NULL); CHECKERROR; } clFinish(cmdQueue); double teststart = timestamp(); for (int i = 0; i < ntimes; i++) { errorCode = clEnqueueNDRangeKernel(cmdQueue, csrKernel, work_dim, NULL, globalsize, blocksize, 0, NULL, NULL); CHECKERROR; } clFinish(cmdQueue); double testend = timestamp(); double time_in_sec = (testend - teststart)/(double)dim2; double gflops = (double)nnz*2/(time_in_sec/(double)ntimes)/(double)1e9; printf("\nSELL cpu warp time %lf ms GFLOPS %lf code %d \n\n", time_in_sec / (double) ntimes * 1000, gflops, methodid); if (csrKernel) clReleaseKernel(csrKernel); double onetime = time_in_sec / (double) ntimes; floptable[methodid] = gflops; if (onetime < opttime) { opttime = onetime; optmethod = methodid; optflop = gflops; } } if (sliceheight == SELL_GROUP_SIZE) { int methodid = 1; cl_uint work_dim = 2; size_t blocksize[] = {SELL_GROUP_SIZE, 1}; int gsize = slicenum * SELL_GROUP_SIZE; size_t globalsize[] = {gsize, dim2}; cl_kernel csrKernel = NULL; csrKernel = clCreateKernel(program, "gpu_sell_group", &errorCode); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 0, sizeof(cl_mem), &devSlicePtr); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 1, sizeof(cl_mem), &devColid); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 2, sizeof(cl_mem), &devData); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 3, sizeof(cl_mem), &devVec); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 4, sizeof(cl_mem), &devRes); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 5, sizeof(int), &slicenum); CHECKERROR; for (int k = 0; k < 3; k++) { errorCode = clEnqueueNDRangeKernel(cmdQueue, csrKernel, work_dim, NULL, globalsize, blocksize, 0, NULL, NULL); CHECKERROR; } clFinish(cmdQueue); double teststart = timestamp(); for (int i = 0; i < ntimes; i++) { errorCode = clEnqueueNDRangeKernel(cmdQueue, csrKernel, work_dim, NULL, globalsize, blocksize, 0, NULL, NULL); CHECKERROR; } clFinish(cmdQueue); double testend = timestamp(); double time_in_sec = (testend - teststart)/(double)dim2; double gflops = (double)nnz*2/(time_in_sec/(double)ntimes)/(double)1e9; printf("\nSELL cpu group time %lf ms GFLOPS %lf code %d \n\n", time_in_sec / (double) ntimes * 1000, gflops, methodid); if (csrKernel) clReleaseKernel(csrKernel); double onetime = time_in_sec / (double) ntimes; floptable[methodid] = gflops; if (onetime < opttime) { opttime = onetime; optmethod = methodid; optflop = gflops; } } //Clean up if (image2dVec) free(image2dVec); if (devSlicePtr) clReleaseMemObject(devSlicePtr); if (devColid) clReleaseMemObject(devColid); if (devData) clReleaseMemObject(devData); if (devVec) clReleaseMemObject(devVec); if (devTexVec) clReleaseMemObject(devTexVec); if (devRes) clReleaseMemObject(devRes); freeObjects(devices, &context, &cmdQueue, &program); }
void spmv_b4ell_ocl(b4ell_matrix<int, float>* mat, float* vec, float* result, int dim2Size, double& opttime, int& optmethod, char* oclfilename, cl_device_type deviceType, float* coores, int ntimes, int bw, int bh) { cl_device_id* devices = NULL; cl_context context = NULL; cl_command_queue cmdQueue = NULL; cl_program program = NULL; assert(initialization(deviceType, devices, &context, &cmdQueue, &program, oclfilename) == 1); cl_int errorCode = CL_SUCCESS; //Create device memory objects cl_mem devColid; cl_mem devData; cl_mem devVec; cl_mem devRes; cl_mem devTexVec; //Initialize values int col_align = mat->b4ell_height_aligned; int data_align = mat->b4ell_float4_aligned; int nnz = mat->matinfo.nnz; int rownum = mat->matinfo.height; int blockrownum = mat->b4ell_row_num; int vecsize = mat->matinfo.width; int b4ellnum = mat->b4ell_block_num; int bwidth = mat->b4ell_bwidth; int bheight = mat->b4ell_bheight; int width4num = bwidth / 4; int padveclen = findPaddedSize(vecsize, 8); float* paddedvec = (float*)malloc(sizeof(float)*padveclen); memset(paddedvec, 0, sizeof(float)*padveclen); memcpy(paddedvec, vec, sizeof(float)*vecsize); ALLOCATE_GPU_READ(devColid, mat->b4ell_col_id, sizeof(int)*col_align*b4ellnum); ALLOCATE_GPU_READ(devData, mat->b4ell_data, sizeof(float)*data_align*bheight*width4num*b4ellnum); ALLOCATE_GPU_READ(devVec, paddedvec, sizeof(float)*padveclen); int paddedres = findPaddedSize(rownum, 512); devRes = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float)*paddedres, NULL, &errorCode); CHECKERROR; errorCode = clEnqueueWriteBuffer(cmdQueue, devRes, CL_TRUE, 0, sizeof(float)*rownum, result, 0, NULL, NULL); CHECKERROR; const cl_image_format floatFormat = { CL_RGBA, CL_FLOAT, }; int width = VEC2DWIDTH; int height = (vecsize + VEC2DWIDTH - 1)/VEC2DWIDTH; if (height % 4 != 0) height += (4 - (height % 4)); float* image2dVec = (float*)malloc(sizeof(float)*width*height); memset(image2dVec, 0, sizeof(float)*width*height); for (int i = 0; i < vecsize; i++) { image2dVec[i] = vec[i]; } size_t origin[] = {0, 0, 0}; size_t vectorSize[] = {width, height/4, 1}; devTexVec = clCreateImage2D(context, CL_MEM_READ_ONLY, &floatFormat, width, height/4, 0, NULL, &errorCode); CHECKERROR; errorCode = clEnqueueWriteImage(cmdQueue, devTexVec, CL_TRUE, origin, vectorSize, 0, 0, image2dVec, 0, NULL, NULL); CHECKERROR; clFinish(cmdQueue); //printf("\nvec length %d padded length %d", mat->matinfo.width, padveclength); opttime = 10000.0f; optmethod = 0; int dim2 = dim2Size; { int methodid = 0; cl_uint work_dim = 2; size_t blocksize[] = {BELL_GROUP_SIZE, 1}; int gsize = ((blockrownum + BELL_GROUP_SIZE - 1)/BELL_GROUP_SIZE)*BELL_GROUP_SIZE; size_t globalsize[] = {gsize, dim2}; int data_align4 = data_align / 4; char kernelname[100] = "gpu_bell00"; kernelname[8] += bh; kernelname[9] += bw; cl_kernel csrKernel = NULL; csrKernel = clCreateKernel(program, kernelname, &errorCode); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 0, sizeof(cl_mem), &devColid); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 1, sizeof(cl_mem), &devData); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 2, sizeof(int), &data_align4); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 3, sizeof(int), &col_align); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 4, sizeof(int), &b4ellnum); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 5, sizeof(cl_mem), &devVec); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 6, sizeof(cl_mem), &devRes); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 7, sizeof(int), &blockrownum); CHECKERROR; errorCode = clEnqueueWriteBuffer(cmdQueue, devRes, CL_TRUE, 0, sizeof(float)*rownum, result, 0, NULL, NULL); CHECKERROR; clFinish(cmdQueue); errorCode = clEnqueueNDRangeKernel(cmdQueue, csrKernel, work_dim, NULL, globalsize, blocksize, 0, NULL, NULL); CHECKERROR; clFinish(cmdQueue); float* tmpresult = (float*)malloc(sizeof(float)*rownum); errorCode = clEnqueueReadBuffer(cmdQueue, devRes, CL_TRUE, 0, sizeof(float)*rownum, tmpresult, 0, NULL, NULL); CHECKERROR; clFinish(cmdQueue); two_vec_compare(coores, tmpresult, rownum); free(tmpresult); for (int k = 0; k < 3; k++) { errorCode = clEnqueueNDRangeKernel(cmdQueue, csrKernel, work_dim, NULL, globalsize, blocksize, 0, NULL, NULL); CHECKERROR; } clFinish(cmdQueue); double teststart = timestamp(); for (int i = 0; i < ntimes; i++) { errorCode = clEnqueueNDRangeKernel(cmdQueue, csrKernel, work_dim, NULL, globalsize, blocksize, 0, NULL, NULL); CHECKERROR; } clFinish(cmdQueue); double testend = timestamp(); double time_in_sec = (testend - teststart)/(double)dim2; double gflops = (double)nnz*2/(time_in_sec/(double)ntimes)/(double)1e9; printf("\nBELL %dx%d block cpu time %lf ms GFLOPS %lf code %d \n\n", bh, bw, time_in_sec / (double) ntimes * 1000, gflops, methodid); if (csrKernel) clReleaseKernel(csrKernel); double onetime = time_in_sec / (double) ntimes; if (onetime < opttime) { opttime = onetime; optmethod = methodid; } } { int methodid = 1; cl_uint work_dim = 2; size_t blocksize[] = {BELL_GROUP_SIZE, 1}; int gsize = ((blockrownum + BELL_GROUP_SIZE - 1)/BELL_GROUP_SIZE)*BELL_GROUP_SIZE; size_t globalsize[] = {gsize, dim2}; int data_align4 = data_align / 4; char kernelname[100] = "gpu_bell00_mad"; kernelname[8] += bh; kernelname[9] += bw; cl_kernel csrKernel = NULL; csrKernel = clCreateKernel(program, kernelname, &errorCode); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 0, sizeof(cl_mem), &devColid); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 1, sizeof(cl_mem), &devData); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 2, sizeof(int), &data_align4); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 3, sizeof(int), &col_align); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 4, sizeof(int), &b4ellnum); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 5, sizeof(cl_mem), &devVec); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 6, sizeof(cl_mem), &devRes); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 7, sizeof(int), &blockrownum); CHECKERROR; errorCode = clEnqueueWriteBuffer(cmdQueue, devRes, CL_TRUE, 0, sizeof(float)*rownum, result, 0, NULL, NULL); CHECKERROR; clFinish(cmdQueue); errorCode = clEnqueueNDRangeKernel(cmdQueue, csrKernel, work_dim, NULL, globalsize, blocksize, 0, NULL, NULL); CHECKERROR; clFinish(cmdQueue); float* tmpresult = (float*)malloc(sizeof(float)*rownum); errorCode = clEnqueueReadBuffer(cmdQueue, devRes, CL_TRUE, 0, sizeof(float)*rownum, tmpresult, 0, NULL, NULL); CHECKERROR; clFinish(cmdQueue); two_vec_compare(coores, tmpresult, rownum); free(tmpresult); for (int k = 0; k < 3; k++) { errorCode = clEnqueueNDRangeKernel(cmdQueue, csrKernel, work_dim, NULL, globalsize, blocksize, 0, NULL, NULL); CHECKERROR; } clFinish(cmdQueue); double teststart = timestamp(); for (int i = 0; i < ntimes; i++) { errorCode = clEnqueueNDRangeKernel(cmdQueue, csrKernel, work_dim, NULL, globalsize, blocksize, 0, NULL, NULL); CHECKERROR; } clFinish(cmdQueue); double testend = timestamp(); double time_in_sec = (testend - teststart)/(double)dim2; double gflops = (double)nnz*2/(time_in_sec/(double)ntimes)/(double)1e9; printf("\nBELL %dx%d block mad cpu time %lf ms GFLOPS %lf code %d \n\n", bh, bw, time_in_sec / (double) ntimes * 1000, gflops, methodid); if (csrKernel) clReleaseKernel(csrKernel); double onetime = time_in_sec / (double) ntimes; if (onetime < opttime) { opttime = onetime; optmethod = methodid; } } { int methodid = 100; cl_uint work_dim = 2; size_t blocksize[] = {BELL_GROUP_SIZE, 1}; int gsize = ((blockrownum + BELL_GROUP_SIZE - 1)/BELL_GROUP_SIZE)*BELL_GROUP_SIZE; size_t globalsize[] = {gsize, dim2}; int data_align4 = data_align / 4; char kernelname[100] = "gpu_bell00_tx"; kernelname[8] += bh; kernelname[9] += bw; cl_kernel csrKernel = NULL; csrKernel = clCreateKernel(program, kernelname, &errorCode); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 0, sizeof(cl_mem), &devColid); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 1, sizeof(cl_mem), &devData); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 2, sizeof(int), &data_align4); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 3, sizeof(int), &col_align); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 4, sizeof(int), &b4ellnum); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 5, sizeof(cl_mem), &devTexVec); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 6, sizeof(cl_mem), &devRes); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 7, sizeof(int), &blockrownum); CHECKERROR; errorCode = clEnqueueWriteBuffer(cmdQueue, devRes, CL_TRUE, 0, sizeof(float)*rownum, result, 0, NULL, NULL); CHECKERROR; clFinish(cmdQueue); errorCode = clEnqueueNDRangeKernel(cmdQueue, csrKernel, work_dim, NULL, globalsize, blocksize, 0, NULL, NULL); CHECKERROR; clFinish(cmdQueue); float* tmpresult = (float*)malloc(sizeof(float)*rownum); errorCode = clEnqueueReadBuffer(cmdQueue, devRes, CL_TRUE, 0, sizeof(float)*rownum, tmpresult, 0, NULL, NULL); CHECKERROR; clFinish(cmdQueue); two_vec_compare(coores, tmpresult, rownum); free(tmpresult); for (int k = 0; k < 3; k++) { errorCode = clEnqueueNDRangeKernel(cmdQueue, csrKernel, work_dim, NULL, globalsize, blocksize, 0, NULL, NULL); CHECKERROR; } clFinish(cmdQueue); double teststart = timestamp(); for (int i = 0; i < ntimes; i++) { errorCode = clEnqueueNDRangeKernel(cmdQueue, csrKernel, work_dim, NULL, globalsize, blocksize, 0, NULL, NULL); CHECKERROR; } clFinish(cmdQueue); double testend = timestamp(); double time_in_sec = (testend - teststart)/(double)dim2; double gflops = (double)nnz*2/(time_in_sec/(double)ntimes)/(double)1e9; printf("\nBELL %dx%d block tx cpu time %lf ms GFLOPS %lf code %d \n\n", bh, bw, time_in_sec / (double) ntimes * 1000, gflops, methodid); if (csrKernel) clReleaseKernel(csrKernel); double onetime = time_in_sec / (double) ntimes; if (onetime < opttime) { opttime = onetime; optmethod = methodid; } } { int methodid = 101; cl_uint work_dim = 2; size_t blocksize[] = {BELL_GROUP_SIZE, 1}; int gsize = ((blockrownum + BELL_GROUP_SIZE - 1)/BELL_GROUP_SIZE)*BELL_GROUP_SIZE; size_t globalsize[] = {gsize, dim2}; int data_align4 = data_align / 4; char kernelname[100] = "gpu_bell00_mad_tx"; kernelname[8] += bh; kernelname[9] += bw; cl_kernel csrKernel = NULL; csrKernel = clCreateKernel(program, kernelname, &errorCode); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 0, sizeof(cl_mem), &devColid); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 1, sizeof(cl_mem), &devData); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 2, sizeof(int), &data_align4); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 3, sizeof(int), &col_align); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 4, sizeof(int), &b4ellnum); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 5, sizeof(cl_mem), &devTexVec); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 6, sizeof(cl_mem), &devRes); CHECKERROR; errorCode = clSetKernelArg(csrKernel, 7, sizeof(int), &blockrownum); CHECKERROR; errorCode = clEnqueueWriteBuffer(cmdQueue, devRes, CL_TRUE, 0, sizeof(float)*rownum, result, 0, NULL, NULL); CHECKERROR; clFinish(cmdQueue); errorCode = clEnqueueNDRangeKernel(cmdQueue, csrKernel, work_dim, NULL, globalsize, blocksize, 0, NULL, NULL); CHECKERROR; clFinish(cmdQueue); float* tmpresult = (float*)malloc(sizeof(float)*rownum); errorCode = clEnqueueReadBuffer(cmdQueue, devRes, CL_TRUE, 0, sizeof(float)*rownum, tmpresult, 0, NULL, NULL); CHECKERROR; clFinish(cmdQueue); two_vec_compare(coores, tmpresult, rownum); free(tmpresult); for (int k = 0; k < 3; k++) { errorCode = clEnqueueNDRangeKernel(cmdQueue, csrKernel, work_dim, NULL, globalsize, blocksize, 0, NULL, NULL); CHECKERROR; } clFinish(cmdQueue); double teststart = timestamp(); for (int i = 0; i < ntimes; i++) { errorCode = clEnqueueNDRangeKernel(cmdQueue, csrKernel, work_dim, NULL, globalsize, blocksize, 0, NULL, NULL); CHECKERROR; } clFinish(cmdQueue); double testend = timestamp(); double time_in_sec = (testend - teststart)/(double)dim2; double gflops = (double)nnz*2/(time_in_sec/(double)ntimes)/(double)1e9; printf("\nBELL %dx%d block mad tx cpu time %lf ms GFLOPS %lf code %d \n\n", bh, bw, time_in_sec / (double) ntimes * 1000, gflops, methodid); if (csrKernel) clReleaseKernel(csrKernel); double onetime = time_in_sec / (double) ntimes; if (onetime < opttime) { opttime = onetime; optmethod = methodid; } } //Clean up if (image2dVec) free(image2dVec); if (devColid) clReleaseMemObject(devColid); if (devData) clReleaseMemObject(devData); if (devVec) clReleaseMemObject(devVec); if (devTexVec) clReleaseMemObject(devTexVec); if (devRes) clReleaseMemObject(devRes); freeObjects(devices, &context, &cmdQueue, &program); }
int main(int argc, char *argv[]){ // check commandline parameters if (argc < 3) { fprintf(stderr, "Usage: %s [kernel] [length of vector] [dim]\n", argv[0]); exit(1); } cl_int errorCode; cl_device_type deviceType = CL_DEVICE_TYPE_CPU; cl_device_id * devices = NULL; cl_context context = NULL; cl_command_queue cmdQueue = NULL; cl_program program = NULL; char *kernelfile = argv[1]; int length = atoi(argv[2]); int dim = atoi(argv[3]); assert(initialization( deviceType, devices, &context, &cmdQueue, &program, kernelfile)); float *X = (float*) malloc(sizeof(float)*length); float *Y = (float*) malloc(sizeof(float)*length); float *Z = (float*) malloc(sizeof(float)*length); for (int i = 0; i < length; i++) { X[i] = (float)i + 0.1; Y[i] = (float)i + 0.2; Z[i] = 0.0; } cl_mem X_mem, Y_mem, Z_mem; ALLOCATE_GPU_READ(X_mem, X, sizeof(float)*length); ALLOCATE_GPU_READ(Y_mem, Y, sizeof(float)*length); ALLOCATE_GPU_READ_WRITE_INIT(Z_mem, Z, sizeof(float)*length); size_t globalSize[1] = {length / dim}; size_t localSize[1] = {1}; float alpha = 0.2; cl_kernel kernel = clCreateKernel(program, "saxpy_naive", &errorCode); CHECKERROR; errorCode = clSetKernelArg(kernel, 0, sizeof(cl_mem), &X_mem); CHECKERROR; errorCode = clSetKernelArg(kernel, 1, sizeof(cl_mem), &Y_mem); CHECKERROR; errorCode = clSetKernelArg(kernel, 2, sizeof(cl_mem), &Z_mem); CHECKERROR; errorCode = clSetKernelArg(kernel, 3, sizeof(cl_float), &alpha); CHECKERROR; errorCode = clSetKernelArg(kernel, 4, sizeof(cl_int), &dim); CHECKERROR; errorCode = clEnqueueNDRangeKernel(cmdQueue, kernel, 1, NULL, globalSize, localSize, 0, NULL, NULL); CHECKERROR; printf("Start to Run ...\n"); cl_event runEvent; errorCode = clEnqueueNDRangeKernel(cmdQueue, kernel, 1, NULL, globalSize, localSize, 0, NULL, &runEvent); CHECKERROR; errorCode = clFinish(cmdQueue); printf("Execution Time: %.2fns\n", executionTime(runEvent) / length * 1e9); printf("Start to Readback ...\n"); errorCode = clEnqueueReadBuffer(cmdQueue, Z_mem, CL_TRUE, 0, sizeof(float)*length, Z, 0, NULL, NULL); CHECKERROR; printf("Checking Correctness ...\n"); for (int i = 0; i < length; i++) { float res = X[i] * alpha + Y[i]; float ans = Z[i]; if (res - ans > 1E-4 || res - ans < -1E-4) { printf("%.10f %.10f %.10f\n", res, ans, res-ans); fprintf(stderr, "ERROR!"); exit(1); } } printf("OK\n"); return 0; }