void setup_buffer( double pAlpha, double pBeta, const std::string& path ) { sparseFile = path; // Read sparse data from file and construct a COO matrix from it int nnz, row, col; clsparseStatus fileError = clsparseHeaderfromFile( &nnz, &row, &col, sparseFile.c_str( ) ); if( fileError != clsparseSuccess ) throw clsparse::io_exception( "Could not read matrix market header from disk" ); // Now initialise a CSR matrix from the COO matrix clsparseInitCsrMatrix( &csrMtx ); csrMtx.num_nonzeros = nnz; csrMtx.num_rows = row; csrMtx.num_cols = col; clsparseCsrMetaSize( &csrMtx, control ); cl_int status; csrMtx.values = ::clCreateBuffer( ctx, CL_MEM_READ_ONLY, csrMtx.num_nonzeros * sizeof( T ), NULL, &status ); CLSPARSE_V( status, "::clCreateBuffer csrMtx.values" ); csrMtx.colIndices = ::clCreateBuffer( ctx, CL_MEM_READ_ONLY, csrMtx.num_nonzeros * sizeof( cl_int ), NULL, &status ); CLSPARSE_V( status, "::clCreateBuffer csrMtx.colIndices" ); csrMtx.rowOffsets = ::clCreateBuffer( ctx, CL_MEM_READ_ONLY, ( csrMtx.num_rows + 1 ) * sizeof( cl_int ), NULL, &status ); CLSPARSE_V( status, "::clCreateBuffer csrMtx.rowOffsets" ); csrMtx.rowBlocks = ::clCreateBuffer( ctx, CL_MEM_READ_ONLY, csrMtx.rowBlockSize * sizeof( cl_ulong ), NULL, &status ); CLSPARSE_V( status, "::clCreateBuffer csrMtx.rowBlocks" ); if(typeid(T) == typeid(float)) fileError = clsparseSCsrMatrixfromFile( &csrMtx, sparseFile.c_str( ), control ); else if (typeid(T) == typeid(double)) fileError = clsparseDCsrMatrixfromFile( &csrMtx, sparseFile.c_str( ), control ); else fileError = clsparseInvalidType; if( fileError != clsparseSuccess ) throw std::runtime_error( "Could not read matrix market data from disk" ); // Initialize the dense X & Y vectors that we multiply against the sparse matrix clsparseInitVector( &x ); x.num_values = csrMtx.num_rows; x.values = ::clCreateBuffer( ctx, CL_MEM_READ_WRITE, x.num_values * sizeof( T ), NULL, &status ); CLSPARSE_V( status, "::clCreateBuffer x.values" ); clsparseInitVector( &y ); 
y.num_values = csrMtx.num_cols; y.values = ::clCreateBuffer( ctx, CL_MEM_READ_WRITE, y.num_values * sizeof( T ), NULL, &status ); CLSPARSE_V( status, "::clCreateBuffer y.values" ); }
// Destroy the solver control and release the device buffers backing the
// unknowns (gX) and right-hand side (gB), then reset each vector header so
// a subsequent SetUp starts from a clean state.
void TearDown()
{
    clsparseReleaseSolverControl( solverControl );

    ::clReleaseMemObject( gX.values );
    clsparseInitVector( &gX );

    ::clReleaseMemObject( gB.values );
    clsparseInitVector( &gB );
}
// Release the device-side scalars (gAlpha, gBeta) and vectors (gX, gY) used
// by the test, resetting each header right after its buffer is freed.
void TearDown()
{
    ::clReleaseMemObject( gAlpha.value );
    clsparseInitScalar( &gAlpha );

    ::clReleaseMemObject( gBeta.value );
    clsparseInitScalar( &gBeta );

    ::clReleaseMemObject( gX.values );
    clsparseInitVector( &gX );

    ::clReleaseMemObject( gY.values );
    clsparseInitVector( &gY );
}
// Computes the dot product of pX and pY into the device scalar pR.
// Two-stage reduction: a per-workgroup inner product into a partial vector,
// followed by an atomic reduction of the partials into pR.
// Returns clsparseSuccess, or clsparseInvalidKernelExecution if either
// kernel launch fails.
clsparseStatus dot(clsparseScalarPrivate* pR,
                   const cldenseVectorPrivate* pX,
                   const cldenseVectorPrivate* pY,
                   const clsparseControl control)
{
    init_scalar(pR, (T)0, control);

    // with REDUCE_BLOCKS_NUMBER = 256 final reduction can be performed
    // within one block;
    const cl_ulong REDUCE_BLOCKS_NUMBER = 256;

    /* For future optimisation
    //workgroups per compute units;
    const cl_uint WG_PER_CU = 64;
    const cl_ulong REDUCE_BLOCKS_NUMBER = control->max_compute_units * WG_PER_CU;
    */
    const cl_ulong REDUCE_BLOCK_SIZE = 256;

    cl_ulong xSize = pX->num_values - pX->offset();
    cl_ulong ySize = pY->num_values - pY->offset();

    assert (xSize == ySize);

    cl_ulong size = xSize;

    if (size > 0)
    {
        cl::Context context = control->getContext();

        //partial result
        cldenseVectorPrivate partial;
        clsparseInitVector(&partial);
        partial.num_values = REDUCE_BLOCKS_NUMBER;

        clMemRAII<T> rPartial (control->queue(), &partial.values, partial.num_values);

        // FIX: 'status' was declared cl_int yet held clsparseStatus values,
        // and the second check compared against CL_SUCCESS (an OpenCL error
        // code) instead of clsparseSuccess.  Both constants happen to be 0,
        // so behaviour is unchanged, but the types and comparisons now agree.
        clsparseStatus status =
            inner_product<T>(&partial, pX, pY, size, REDUCE_BLOCKS_NUMBER,
                             REDUCE_BLOCK_SIZE, control);
        if (status != clsparseSuccess)
        {
            return clsparseInvalidKernelExecution;
        }

        status = atomic_reduce<T>(pR, &partial, REDUCE_BLOCK_SIZE, control);
        if (status != clsparseSuccess)
        {
            return clsparseInvalidKernelExecution;
        }
    }

    return clsparseSuccess;
}
void SetUp() { clsparseInitScalar(&gAlpha); clsparseInitScalar(&gBeta); clsparseInitVector(&gX); clsparseInitVector(&gY); hAlpha = T(CSRE::alpha); hBeta = T(CSRE::beta); hX = uBLAS::vector<T>(CSRE::n_cols, 1); hY = uBLAS::vector<T>(CSRE::n_rows, 2); cl_int status; gX.values = clCreateBuffer(CLSE::context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, hX.size() * sizeof(T), hX.data().begin(), &status); gX.num_values = hX.size(); ASSERT_EQ(CL_SUCCESS, status); gY.values = clCreateBuffer(CLSE::context, CL_MEM_WRITE_ONLY | CL_MEM_COPY_HOST_PTR, hY.size() * sizeof(T), hY.data().begin(), &status); gY.num_values = hY.size(); ASSERT_EQ(CL_SUCCESS, status); gAlpha.value = clCreateBuffer(CLSE::context, CL_MEM_READ_ONLY| CL_MEM_COPY_HOST_PTR, sizeof(T), &hAlpha, &status); ASSERT_EQ(CL_SUCCESS, status); gBeta.value = clCreateBuffer(CLSE::context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(T), &hBeta, &status); ASSERT_EQ(CL_SUCCESS, status); }
void SetUp() { // Setup solver control clsparseStatus status; solverControl = clsparseCreateSolverControl(precond, maxIterations, relativeTolerance, absoluteTolerance); ASSERT_NE(nullptr, solverControl); status = clsparseSolverPrintMode(solverControl, printMode); ASSERT_EQ(clsparseSuccess, status); // Setup rhs and vector of unknowns hX = uBLAS::vector<T>(CSRE::n_cols, (T) initialUnknownsValue); hB = uBLAS::vector<T>(CSRE::n_rows, (T) initialRhsValue); clsparseInitVector(&gX); clsparseInitVector(&gB); cl_int cl_status; gX.values = clCreateBuffer(CLSE::context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, hX.size() * sizeof(T), hX.data().begin(), &cl_status); gX.num_values = hX.size(); ASSERT_EQ(CL_SUCCESS, cl_status); gB.values = clCreateBuffer(CLSE::context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, hB.size() * sizeof(T), hB.data().begin(), &cl_status); gB.num_values = hB.size(); ASSERT_EQ(CL_SUCCESS, cl_status); }
void test_csrmv() { clsparseStatus status; cl_int cl_status; clsparseEnableExtendedPrecision(CLSE::control, extended_precision); if (typeid(T) == typeid(cl_float) ) { status = clsparseScsrmv(&gAlpha, &CSRE::csrSMatrix, &gX, &gBeta, &gY, CLSE::control); ASSERT_EQ(clsparseSuccess, status); float* vals = (float*)&CSRE::ublasSCsr.value_data()[0]; int* rows = &CSRE::ublasSCsr.index1_data()[0]; int* cols = &CSRE::ublasSCsr.index2_data()[0]; for (int row = 0; row < CSRE::n_rows; row++) { // Summation done at a higher precision to decrease // summation errors from rounding. hY[row] *= hBeta; int row_end = rows[row+1]; double temp_sum; temp_sum = hY[row]; for (int i = rows[row]; i < rows[row+1]; i++) { // Perform: hY[row] += hAlpha * vals[i] * hX[cols[i]]; temp_sum += hAlpha * vals[i] * hX[cols[i]]; } hY[row] = temp_sum; } T* host_result = (T*) ::clEnqueueMapBuffer(CLSE::queue, gY.values, CL_TRUE, CL_MAP_READ, 0, gY.num_values * sizeof(T), 0, nullptr, nullptr, &cl_status); ASSERT_EQ(CL_SUCCESS, cl_status); uint64_t max_ulps = 0; uint64_t min_ulps = UINT64_MAX; uint64_t total_ulps = 0; for (int i = 0; i < hY.size(); i++) { long long int intDiff = (long long int)boost::math::float_distance(hY[i], host_result[i]); intDiff = llabs(intDiff); total_ulps += intDiff; if (max_ulps < intDiff) max_ulps = intDiff; if (min_ulps > intDiff) min_ulps = intDiff; // Debug printouts. 
//std::cout << "Row " << i << " Float Ulps: " << intDiff << std::endl; //std::cout.precision(9); //std::cout << "\tFloat hY[" << i << "] = " << std::scientific << hY[i] << " (0x" << std::hex << *(uint32_t *)&hY[i] << "), " << std::dec; //std::cout << "host_result[" << i << "] = " << std::scientific << host_result[i] << " (0x" << std::hex << *(uint32_t *)&host_result[i] << ")" << std::dec << std::endl; } #ifndef NDEBUG if (extended_precision) { std::cout << "Float Min ulps: " << min_ulps << std::endl; std::cout << "Float Max ulps: " << max_ulps << std::endl; std::cout << "Float Total ulps: " << total_ulps << std::endl; std::cout << "Float Average ulps: " << (double)total_ulps/(double)hY.size() << " (Size: " << hY.size() << ")" << std::endl; } #endif for (int i = 0; i < hY.size(); i++) { double compare_val = 0.; if (extended_precision) { // The limit here is somewhat weak because some GPUs don't // support correctly rounded denorms in SPFP mode. if (boost::math::isnormal(hY[i])) compare_val = fabs(hY[i]*1e-3); } else { if (boost::math::isnormal(hY[i])) compare_val = fabs(hY[i]*0.1); } if (compare_val < 10*FLT_EPSILON) compare_val = 10*FLT_EPSILON; ASSERT_NEAR(hY[i], host_result[i], compare_val); } cl_status = ::clEnqueueUnmapMemObject(CLSE::queue, gY.values, host_result, 0, nullptr, nullptr); ASSERT_EQ(CL_SUCCESS, cl_status); } if (typeid(T) == typeid(cl_double) ) { status = clsparseDcsrmv(&gAlpha, &CSRE::csrDMatrix, &gX, &gBeta, &gY, CLSE::control); ASSERT_EQ(clsparseSuccess, status); double* vals = (double*)&CSRE::ublasDCsr.value_data()[0]; int* rows = &CSRE::ublasDCsr.index1_data()[0]; int* cols = &CSRE::ublasDCsr.index2_data()[0]; for (int row = 0; row < CSRE::n_rows; row++) { // Summation done using a compensated summation to decrease // summation errors from rounding. This allows us to get // smaller errors without requiring quad precision support. // This method is like performing summation at quad precision and // casting down to double in the end. 
hY[row] *= hBeta; int row_end = rows[row+1]; double temp_sum; temp_sum = hY[row]; T sumk_err = 0.; for (int i = rows[row]; i < rows[row+1]; i++) { // Perform: hY[row] += hAlpha * vals[i] * hX[cols[i]]; temp_sum = two_sum(temp_sum, hAlpha*vals[i]*hX[cols[i]], &sumk_err); } hY[row] = temp_sum + sumk_err; } T* host_result = (T*) ::clEnqueueMapBuffer(CLSE::queue, gY.values, CL_TRUE, CL_MAP_READ, 0, gY.num_values * sizeof(T), 0, nullptr, nullptr, &cl_status); ASSERT_EQ(CL_SUCCESS, cl_status); uint64_t max_ulps = 0; uint64_t min_ulps = ULLONG_MAX; uint64_t total_ulps = 0; for (int i = 0; i < hY.size(); i++) { long long int intDiff = (long long int)boost::math::float_distance(hY[i], host_result[i]); intDiff = llabs(intDiff); total_ulps += intDiff; if (max_ulps < intDiff) max_ulps = intDiff; if (min_ulps > intDiff) min_ulps = intDiff; // Debug printouts. //std::cout << "Row " << i << " Double Ulps: " << intDiff << std::endl; //std::cout.precision(17); //std::cout << "\tDouble hY[" << i << "] = " << std::scientific << hY[i] << " (0x" << std::hex << *(uint64_t *)&hY[i] << "), " << std::dec; //std::cout << "host_result[" << i << "] = " << std::scientific << host_result[i] << " (0x" << std::hex << *(uint64_t *)&host_result[i] << ")" << std::dec << std::endl; } if (extended_precision) { #ifndef NDEBUG std::cout << "Double Min ulps: " << min_ulps << std::endl; std::cout << "Double Max ulps: " << max_ulps << std::endl; std::cout << "Double Total ulps: " << total_ulps << std::endl; std::cout << "Double Average ulps: " << (double)total_ulps/(double)hY.size() << " (Size: " << hY.size() << ")" << std::endl; #endif for (int i = 0; i < hY.size(); i++) { double compare_val = fabs(hY[i]*1e-14); if (compare_val < 10*DBL_EPSILON) compare_val = 10*DBL_EPSILON; ASSERT_NEAR(hY[i], host_result[i], compare_val); } } else { for (int i = 0; i < hY.size(); i++) { double compare_val = 0.; if (boost::math::isnormal(hY[i])) compare_val = fabs(hY[i]*0.1); if (compare_val < 10*DBL_EPSILON) compare_val 
= 10*DBL_EPSILON; ASSERT_NEAR(hY[i], host_result[i], compare_val); } } cl_status = ::clEnqueueUnmapMemObject(CLSE::queue, gY.values, host_result, 0, nullptr, nullptr); ASSERT_EQ(CL_SUCCESS, cl_status); } // Reset output buffer for next test. ::clReleaseMemObject(gY.values); clsparseInitVector(&gY); gY.values = clCreateBuffer(CLSE::context, CL_MEM_WRITE_ONLY | CL_MEM_COPY_HOST_PTR, hY.size() * sizeof(T), hY.data().begin(), &cl_status); gY.num_values = hY.size(); ASSERT_EQ(CL_SUCCESS, cl_status); }