Beispiel #1
// Single-precision (float) specialization of xSpMdV::xSpMdV_Function.
// Issues one clsparseScsrmv call using a, csrMtx, x, b, y and control, and
// optionally calls clFinish on queue afterwards.
//
// flush: when true, clFinish( queue ) is invoked after the csrmv call.
//
// NOTE(review): the block braces are absent in this excerpt, so the extent of
// the function body and of the `if` is inferred from indentation only.
template<> void
xSpMdV<float>::xSpMdV_Function( bool flush )
    // The returned status is stored but not inspected in the visible lines.
    clsparseStatus status = clsparseScsrmv( &a, &csrMtx, &x, &b, &y, control );

    if( flush )
        // clFinish is the OpenCL call that waits for the commands already
        // queued on `queue` to complete before returning.
        clFinish( queue );
Beispiel #2
    // Test body for the CSR matrix-vector product.
    //
    // Depending on the element type T it calls clsparseScsrmv (cl_float) or
    // clsparseDcsrmv (cl_double), recomputes the same product on the host
    // into hY, maps the device result gY.values for reading, gathers ULP
    // statistics and checks every element with ASSERT_NEAR. Afterwards the
    // output buffer gY is re-created for the next test.
    //
    // NOTE(review): this excerpt has lost its block braces, and apparently
    // some `else` / `#endif` lines as well; the nesting described in the
    // comments below is inferred from indentation and should be confirmed
    // against the original file.
    void test_csrmv()
        clsparseStatus status;
        cl_int cl_status;

        // Forward the extended_precision setting to the clSPARSE control object.
        clsparseEnableExtendedPrecision(CLSE::control, extended_precision);

        // ---- Single-precision path (selected by a runtime typeid comparison) ----
        if (typeid(T) == typeid(cl_float) )
            status = clsparseScsrmv(&gAlpha, &CSRE::csrSMatrix, &gX,
                                    &gBeta, &gY, CLSE::control);

            ASSERT_EQ(clsparseSuccess, status);

            // Raw views into the uBLAS CSR matrix. From their use below:
            // rows[r]..rows[r+1] delimits the entries of row r, cols[i] is the
            // column of entry i and vals[i] its value.
            float* vals = (float*)&CSRE::ublasSCsr.value_data()[0];
            int* rows = &CSRE::ublasSCsr.index1_data()[0];
            int* cols = &CSRE::ublasSCsr.index2_data()[0];
            for (int row = 0; row < CSRE::n_rows; row++)
                // Summation done at a higher precision to decrease
                // summation errors from rounding.
                hY[row] *= hBeta;
                // row_end is assigned but not referenced in the visible loop,
                // which re-reads rows[row+1] instead.
                int row_end = rows[row+1];
                // Accumulator is a double even though the data is float.
                double temp_sum;
                temp_sum = hY[row];
                for (int i = rows[row]; i < rows[row+1]; i++)
                    // Perform: hY[row] += hAlpha * vals[i] * hX[cols[i]];
                    temp_sum += hAlpha * vals[i] * hX[cols[i]];
                // Store the accumulated value back into the host reference vector.
                hY[row] = temp_sum;

            // Blocking (CL_TRUE) read-only map of the whole device result,
            // gY.num_values * sizeof(T) bytes starting at offset 0.
            T* host_result = (T*) ::clEnqueueMapBuffer(CLSE::queue, gY.values,
                                                       CL_TRUE, CL_MAP_READ,
                                                       0, gY.num_values * sizeof(T),
                                                       0, nullptr, nullptr, &cl_status);
            ASSERT_EQ(CL_SUCCESS, cl_status);

            // ULP statistics: per-element distance between host reference and
            // device result as reported by boost::math::float_distance.
            uint64_t max_ulps = 0;
            uint64_t min_ulps = UINT64_MAX;
            uint64_t total_ulps = 0;
            for (int i = 0; i < hY.size(); i++)
                long long int intDiff = (long long int)boost::math::float_distance(hY[i], host_result[i]);
                intDiff = llabs(intDiff);
                total_ulps += intDiff;
                if (max_ulps < intDiff)
                    max_ulps = intDiff;
                if (min_ulps > intDiff)
                    min_ulps = intDiff;
                // Debug printouts.
                //std::cout << "Row " << i << " Float Ulps: " << intDiff << std::endl;
                //std::cout << "\tFloat hY[" << i << "] = " << std::scientific << hY[i] << " (0x" << std::hex << *(uint32_t *)&hY[i] << "), " << std::dec;
                //std::cout << "host_result[" << i << "] = " << std::scientific << host_result[i] << " (0x" << std::hex << *(uint32_t *)&host_result[i] << ")" << std::dec << std::endl;
// NOTE(review): no matching #endif for this guard is visible in the excerpt.
#ifndef NDEBUG
            // Statistics are printed only when extended precision is requested.
            if (extended_precision)
                std::cout << "Float Min ulps: " << min_ulps << std::endl;
                std::cout << "Float Max ulps: " << max_ulps << std::endl;
                std::cout << "Float Total ulps: " << total_ulps << std::endl;
                std::cout << "Float Average ulps: " << (double)total_ulps/(double)hY.size() <<  " (Size: " << hY.size() << ")" << std::endl;

            // Element-wise check with a relative tolerance that is never
            // allowed to drop below 10*FLT_EPSILON.
            for (int i = 0; i < hY.size(); i++)
                double compare_val = 0.;
                if (extended_precision)
                    // The limit here is somewhat weak because some GPUs don't
                    // support correctly rounded denorms in SPFP mode.
                    if (boost::math::isnormal(hY[i]))
                        compare_val = fabs(hY[i]*1e-3);
                    // NOTE(review): as shown, this second test has the same
                    // condition as the one above and would overwrite the 1e-3
                    // tolerance with 0.1. Presumably it belongs to an `else`
                    // (non-extended-precision) branch that was lost - confirm.
                    if (boost::math::isnormal(hY[i]))
                        compare_val = fabs(hY[i]*0.1);
                if (compare_val < 10*FLT_EPSILON)
                    compare_val = 10*FLT_EPSILON;
                ASSERT_NEAR(hY[i], host_result[i], compare_val);

            // Release the mapping obtained above.
            cl_status = ::clEnqueueUnmapMemObject(CLSE::queue, gY.values,
                                                  host_result, 0, nullptr, nullptr);
            ASSERT_EQ(CL_SUCCESS, cl_status);

        // ---- Double-precision path ----
        if (typeid(T) == typeid(cl_double) )
            status = clsparseDcsrmv(&gAlpha, &CSRE::csrDMatrix, &gX,
                                    &gBeta, &gY, CLSE::control);

            ASSERT_EQ(clsparseSuccess, status);

            // Same CSR views as in the float path, taken from the double matrix.
            double* vals = (double*)&CSRE::ublasDCsr.value_data()[0];
            int* rows = &CSRE::ublasDCsr.index1_data()[0];
            int* cols = &CSRE::ublasDCsr.index2_data()[0];
            for (int row = 0; row < CSRE::n_rows; row++)
                // Summation done using a compensated summation to decrease
                // summation errors from rounding. This allows us to get
                // smaller errors without requiring quad precision support.
                // This method is like performing summation at quad precision and
                // casting down to double in the end.
                hY[row] *= hBeta;
                // row_end is assigned but not referenced in the visible loop.
                int row_end = rows[row+1];
                double temp_sum;
                temp_sum = hY[row];
                // Passed by address to two_sum on every iteration; presumably
                // it collects the rounding error of each addition (two_sum is
                // defined outside this excerpt - confirm).
                T sumk_err = 0.;
                for (int i = rows[row]; i < rows[row+1]; i++)
                    // Perform: hY[row] += hAlpha * vals[i] * hX[cols[i]];
                    temp_sum = two_sum(temp_sum, hAlpha*vals[i]*hX[cols[i]], &sumk_err);
                // Add the error term to the running sum for the final result.
                hY[row] = temp_sum + sumk_err;

            // Blocking (CL_TRUE) read-only map of the whole device result.
            T* host_result = (T*) ::clEnqueueMapBuffer(CLSE::queue, gY.values,
                                                       CL_TRUE, CL_MAP_READ,
                                                       0, gY.num_values * sizeof(T),
                                                       0, nullptr, nullptr, &cl_status);
            ASSERT_EQ(CL_SUCCESS, cl_status);

            // ULP statistics, gathered exactly as in the float path.
            uint64_t max_ulps = 0;
            uint64_t min_ulps = ULLONG_MAX;
            uint64_t total_ulps = 0;
            for (int i = 0; i < hY.size(); i++)
                long long int intDiff = (long long int)boost::math::float_distance(hY[i], host_result[i]);
                intDiff = llabs(intDiff);
                total_ulps += intDiff;
                if (max_ulps < intDiff)
                    max_ulps = intDiff;
                if (min_ulps > intDiff)
                    min_ulps = intDiff;
                // Debug printouts.
                //std::cout << "Row " << i << " Double Ulps: " << intDiff << std::endl;
                //std::cout << "\tDouble hY[" << i << "] = " << std::scientific << hY[i] << " (0x" << std::hex << *(uint64_t *)&hY[i] << "), " << std::dec;
                //std::cout << "host_result[" << i << "] = " << std::scientific << host_result[i] << " (0x" << std::hex << *(uint64_t *)&host_result[i] << ")" << std::dec << std::endl;
            // NOTE(review): here the `if` precedes the NDEBUG guard, the
            // reverse of the float path, and no #endif is visible.
            if (extended_precision)
#ifndef NDEBUG
                std::cout << "Double Min ulps: " << min_ulps << std::endl;
                std::cout << "Double Max ulps: " << max_ulps << std::endl;
                std::cout << "Double Total ulps: " << total_ulps << std::endl;
                std::cout << "Double Average ulps: " << (double)total_ulps/(double)hY.size() <<  " (Size: " << hY.size() << ")" << std::endl;

                // Tight check: relative tolerance 1e-14, floored at
                // 10*DBL_EPSILON.
                for (int i = 0; i < hY.size(); i++)
                    double compare_val = fabs(hY[i]*1e-14);
                    if (compare_val < 10*DBL_EPSILON)
                        compare_val = 10*DBL_EPSILON;
                    ASSERT_NEAR(hY[i], host_result[i], compare_val);
                // Loose check: relative tolerance 0.1 for normal values,
                // floored at 10*DBL_EPSILON.
                // NOTE(review): presumably this loop is the `else`
                // (non-extended-precision) alternative to the loop above and
                // the `else` was lost in this excerpt - confirm.
                for (int i = 0; i < hY.size(); i++)
                    double compare_val = 0.;
                    if (boost::math::isnormal(hY[i]))
                        compare_val = fabs(hY[i]*0.1);
                    if (compare_val < 10*DBL_EPSILON)
                        compare_val = 10*DBL_EPSILON;
                    ASSERT_NEAR(hY[i], host_result[i], compare_val);

            // Release the mapping obtained above.
            cl_status = ::clEnqueueUnmapMemObject(CLSE::queue, gY.values,
                                                  host_result, 0, nullptr, nullptr);
            ASSERT_EQ(CL_SUCCESS, cl_status);
        // Reset output buffer for next test.
        // NOTE(review): the clCreateBuffer call below is incomplete in this
        // excerpt (empty argument after the size, no closing parenthesis);
        // the remaining arguments, including whatever sets cl_status, are not
        // visible here.
        gY.values = clCreateBuffer(CLSE::context,
                hY.size() * sizeof(T),,
        gY.num_values = hY.size();
        ASSERT_EQ(CL_SUCCESS, cl_status);