// Double-precision specialization: issues one clsparseDcsrmv call using the
// members a, csrMtx, x, b, y and control. The returned status is stored but
// not inspected. When `flush` is true, clFinish is called on `queue`
// afterwards.
template<> void
xSpMdV<double>::xSpMdV_Function( bool flush )
{
    clsparseStatus multiplyStatus = clsparseDcsrmv( &a, &csrMtx, &x, &b, &y, control );

    if( !flush )
        return;

    clFinish( queue );
}
void test_csrmv() { clsparseStatus status; cl_int cl_status; clsparseEnableExtendedPrecision(CLSE::control, extended_precision); if (typeid(T) == typeid(cl_float) ) { status = clsparseScsrmv(&gAlpha, &CSRE::csrSMatrix, &gX, &gBeta, &gY, CLSE::control); ASSERT_EQ(clsparseSuccess, status); float* vals = (float*)&CSRE::ublasSCsr.value_data()[0]; int* rows = &CSRE::ublasSCsr.index1_data()[0]; int* cols = &CSRE::ublasSCsr.index2_data()[0]; for (int row = 0; row < CSRE::n_rows; row++) { // Summation done at a higher precision to decrease // summation errors from rounding. hY[row] *= hBeta; int row_end = rows[row+1]; double temp_sum; temp_sum = hY[row]; for (int i = rows[row]; i < rows[row+1]; i++) { // Perform: hY[row] += hAlpha * vals[i] * hX[cols[i]]; temp_sum += hAlpha * vals[i] * hX[cols[i]]; } hY[row] = temp_sum; } T* host_result = (T*) ::clEnqueueMapBuffer(CLSE::queue, gY.values, CL_TRUE, CL_MAP_READ, 0, gY.num_values * sizeof(T), 0, nullptr, nullptr, &cl_status); ASSERT_EQ(CL_SUCCESS, cl_status); uint64_t max_ulps = 0; uint64_t min_ulps = UINT64_MAX; uint64_t total_ulps = 0; for (int i = 0; i < hY.size(); i++) { long long int intDiff = (long long int)boost::math::float_distance(hY[i], host_result[i]); intDiff = llabs(intDiff); total_ulps += intDiff; if (max_ulps < intDiff) max_ulps = intDiff; if (min_ulps > intDiff) min_ulps = intDiff; // Debug printouts. 
//std::cout << "Row " << i << " Float Ulps: " << intDiff << std::endl; //std::cout.precision(9); //std::cout << "\tFloat hY[" << i << "] = " << std::scientific << hY[i] << " (0x" << std::hex << *(uint32_t *)&hY[i] << "), " << std::dec; //std::cout << "host_result[" << i << "] = " << std::scientific << host_result[i] << " (0x" << std::hex << *(uint32_t *)&host_result[i] << ")" << std::dec << std::endl; } #ifndef NDEBUG if (extended_precision) { std::cout << "Float Min ulps: " << min_ulps << std::endl; std::cout << "Float Max ulps: " << max_ulps << std::endl; std::cout << "Float Total ulps: " << total_ulps << std::endl; std::cout << "Float Average ulps: " << (double)total_ulps/(double)hY.size() << " (Size: " << hY.size() << ")" << std::endl; } #endif for (int i = 0; i < hY.size(); i++) { double compare_val = 0.; if (extended_precision) { // The limit here is somewhat weak because some GPUs don't // support correctly rounded denorms in SPFP mode. if (boost::math::isnormal(hY[i])) compare_val = fabs(hY[i]*1e-3); } else { if (boost::math::isnormal(hY[i])) compare_val = fabs(hY[i]*0.1); } if (compare_val < 10*FLT_EPSILON) compare_val = 10*FLT_EPSILON; ASSERT_NEAR(hY[i], host_result[i], compare_val); } cl_status = ::clEnqueueUnmapMemObject(CLSE::queue, gY.values, host_result, 0, nullptr, nullptr); ASSERT_EQ(CL_SUCCESS, cl_status); } if (typeid(T) == typeid(cl_double) ) { status = clsparseDcsrmv(&gAlpha, &CSRE::csrDMatrix, &gX, &gBeta, &gY, CLSE::control); ASSERT_EQ(clsparseSuccess, status); double* vals = (double*)&CSRE::ublasDCsr.value_data()[0]; int* rows = &CSRE::ublasDCsr.index1_data()[0]; int* cols = &CSRE::ublasDCsr.index2_data()[0]; for (int row = 0; row < CSRE::n_rows; row++) { // Summation done using a compensated summation to decrease // summation errors from rounding. This allows us to get // smaller errors without requiring quad precision support. // This method is like performing summation at quad precision and // casting down to double in the end. 
hY[row] *= hBeta; int row_end = rows[row+1]; double temp_sum; temp_sum = hY[row]; T sumk_err = 0.; for (int i = rows[row]; i < rows[row+1]; i++) { // Perform: hY[row] += hAlpha * vals[i] * hX[cols[i]]; temp_sum = two_sum(temp_sum, hAlpha*vals[i]*hX[cols[i]], &sumk_err); } hY[row] = temp_sum + sumk_err; } T* host_result = (T*) ::clEnqueueMapBuffer(CLSE::queue, gY.values, CL_TRUE, CL_MAP_READ, 0, gY.num_values * sizeof(T), 0, nullptr, nullptr, &cl_status); ASSERT_EQ(CL_SUCCESS, cl_status); uint64_t max_ulps = 0; uint64_t min_ulps = ULLONG_MAX; uint64_t total_ulps = 0; for (int i = 0; i < hY.size(); i++) { long long int intDiff = (long long int)boost::math::float_distance(hY[i], host_result[i]); intDiff = llabs(intDiff); total_ulps += intDiff; if (max_ulps < intDiff) max_ulps = intDiff; if (min_ulps > intDiff) min_ulps = intDiff; // Debug printouts. //std::cout << "Row " << i << " Double Ulps: " << intDiff << std::endl; //std::cout.precision(17); //std::cout << "\tDouble hY[" << i << "] = " << std::scientific << hY[i] << " (0x" << std::hex << *(uint64_t *)&hY[i] << "), " << std::dec; //std::cout << "host_result[" << i << "] = " << std::scientific << host_result[i] << " (0x" << std::hex << *(uint64_t *)&host_result[i] << ")" << std::dec << std::endl; } if (extended_precision) { #ifndef NDEBUG std::cout << "Double Min ulps: " << min_ulps << std::endl; std::cout << "Double Max ulps: " << max_ulps << std::endl; std::cout << "Double Total ulps: " << total_ulps << std::endl; std::cout << "Double Average ulps: " << (double)total_ulps/(double)hY.size() << " (Size: " << hY.size() << ")" << std::endl; #endif for (int i = 0; i < hY.size(); i++) { double compare_val = fabs(hY[i]*1e-14); if (compare_val < 10*DBL_EPSILON) compare_val = 10*DBL_EPSILON; ASSERT_NEAR(hY[i], host_result[i], compare_val); } } else { for (int i = 0; i < hY.size(); i++) { double compare_val = 0.; if (boost::math::isnormal(hY[i])) compare_val = fabs(hY[i]*0.1); if (compare_val < 10*DBL_EPSILON) compare_val 
= 10*DBL_EPSILON; ASSERT_NEAR(hY[i], host_result[i], compare_val); } } cl_status = ::clEnqueueUnmapMemObject(CLSE::queue, gY.values, host_result, 0, nullptr, nullptr); ASSERT_EQ(CL_SUCCESS, cl_status); } // Reset output buffer for next test. ::clReleaseMemObject(gY.values); clsparseInitVector(&gY); gY.values = clCreateBuffer(CLSE::context, CL_MEM_WRITE_ONLY | CL_MEM_COPY_HOST_PTR, hY.size() * sizeof(T), hY.data().begin(), &cl_status); gY.num_values = hY.size(); ASSERT_EQ(CL_SUCCESS, cl_status); }