static double factorization( const multivector_type Q , const multivector_type R ) { const size_type count = Q.dimension_1(); value_view tmp("tmp"); value_view one("one"); KokkosArray::deep_copy( one , (Scalar) 1 ); KokkosArray::Impl::Timer timer ; for ( size_type j = 0 ; j < count ; ++j ) { // Reduction : tmp = dot( Q(:,j) , Q(:,j) ); // PostProcess : tmp = sqrt( tmp ); R(j,j) = tmp ; tmp = 1 / tmp ; const vector_type Qj( Q , j ); const value_view Rjj( R , j , j ); KokkosArray::dot( Qj , InvNorm2( Rjj , tmp ) ); // Q(:,j) *= ( 1 / R(j,j) ); => Q(:,j) *= tmp ; KokkosArray::scale( tmp , Qj ); for ( size_t k = j + 1 ; k < count ; ++k ) { const vector_type Qk( Q , k ); const value_view Rjk( R , j , k ); // Reduction : R(j,k) = dot( Q(:,j) , Q(:,k) ); // PostProcess : tmp = - R(j,k); KokkosArray::dot( Qj , Qk , DotM( Rjk , tmp ) ); // Q(:,k) -= R(j,k) * Q(:,j); => Q(:,k) += tmp * Q(:,j) KokkosArray::axpby( tmp , Qj , one , Qk ); } } device_type::fence(); return timer.seconds(); }
std::vector<double> test_product_tensor_legendre( const std::vector<int> & arg_var_degree , const int nGrid , const int iterCount , const bool check ) { typedef TensorType tensor_type ; typedef typename tensor_type::device_type device_type ; typedef KokkosArray::View< VectorScalar** , KokkosArray::LayoutLeft , device_type > vector_type ; typedef KokkosArray::BlockCrsMatrix< tensor_type , MatrixScalar , device_type > matrix_type ; typedef typename matrix_type::graph_type graph_type ; //------------------------------ // Generate graph for "FEM" box structure: std::vector< std::vector<size_t> > fem_graph ; const size_t fem_length = nGrid * nGrid * nGrid ; const size_t fem_graph_length = unit_test::generate_fem_graph( nGrid , fem_graph ); //------------------------------ // Generate CRS block-tensor matrix: const std::vector<unsigned> var_degree( arg_var_degree.begin() , arg_var_degree.end() ); const KokkosArray::TripleProductTensorLegendreCombinatorialEvaluation tensor( var_degree ); const size_t stoch_length = tensor.bases_count(); std::vector< std::vector< size_t > > stoch_graph( stoch_length ); for ( size_t i = 0 ; i < stoch_length ; ++i ) { for ( size_t j = 0 ; j < stoch_length ; ++j ) { if ( KokkosArray::matrix_nonzero(tensor,i,j) ) { stoch_graph[i].push_back(j); } } } //------------------------------ // Generate input multivector: vector_type x = vector_type( "x" , stoch_length , fem_length ); vector_type y = vector_type( "y" , stoch_length , fem_length ); typename vector_type::HostMirror hx = KokkosArray::create_mirror( x ); typename vector_type::HostMirror hy_result = KokkosArray::create_mirror( y ); for ( size_t iColFEM = 0 ; iColFEM < fem_length ; ++iColFEM ) { for ( size_t iColStoch = 0 ; iColStoch < stoch_length ; ++iColStoch ) { hx(iColStoch,iColFEM) = generate_vector_coefficient( fem_length , stoch_length , iColFEM , iColStoch ); }} KokkosArray::deep_copy( x , hx ); //------------------------------ matrix_type matrix ; matrix.block = tensor_type( var_degree ); matrix.graph = KokkosArray::create_crsarray<graph_type>( std::string("test crs graph") , fem_graph ); if ( stoch_length != matrix.block.dimension() ) { throw std::runtime_error("test_crs_product_tensor_legendre matrix sizing error"); } matrix.values = vector_type( "matrix" , stoch_length , fem_graph_length ); typename vector_type::HostMirror hM = KokkosArray::create_mirror( matrix.values ); for ( size_t iRowFEM = 0 , iEntryFEM = 0 ; iRowFEM < fem_length ; ++iRowFEM ) { for ( size_t iRowEntryFEM = 0 ; iRowEntryFEM < fem_graph[iRowFEM].size() ; ++iRowEntryFEM , ++iEntryFEM ) { const size_t iColFEM = fem_graph[iRowFEM][iRowEntryFEM] ; for ( size_t k = 0 ; k < stoch_length ; ++k ) { hM(k,iEntryFEM) = generate_matrix_coefficient( fem_length , stoch_length , iRowFEM , iColFEM , k ); } } } KokkosArray::deep_copy( matrix.values , hM ); //------------------------------ if (check) { for ( size_t iRowStoch = 0 ; iRowStoch < stoch_length ; ++iRowStoch ) { for ( size_t iRowFEM = 0 , iEntryFEM = 0 ; iRowFEM < fem_length ; ++iRowFEM ) { double y = 0 ; for ( size_t iRowEntryFEM = 0 ; iRowEntryFEM < fem_graph[ iRowFEM ].size() ; ++iRowEntryFEM , ++iEntryFEM ) { const size_t iColFEM = fem_graph[iRowFEM][iRowEntryFEM] ; for ( size_t iRowEntryStoch = 0 ; iRowEntryStoch < stoch_graph[iRowStoch].size() ; ++iRowEntryStoch ) { const size_t iColStoch = stoch_graph[iRowStoch][iRowEntryStoch]; double value = 0 ; for ( unsigned k = 0 ; k < stoch_length ; ++k ) { const double A_fem_k = generate_matrix_coefficient( fem_length , stoch_length , iRowFEM , iColFEM , k ); if ( 1.0e-6 < std::abs( hM(k,iEntryFEM) - A_fem_k ) ) { std::cout << "test_crs_product_tensor_legendre error: Matrix entry" << " A(" << k << ",(" << iRowFEM << "," << iColFEM << ")) = " << hM(k,iEntryFEM) << " , error = " << hM(k,iEntryFEM) - A_fem_k << std::endl ; } value += tensor(iRowStoch,iColStoch,k) * A_fem_k ; } y += value * hx( iColStoch , iColFEM ); } } hy_result( iRowStoch , iRowFEM ) = y ; } } } //------------------------------ const KokkosArray::Impl::Multiply< matrix_type , vector_type , vector_type > op( matrix , x , y ); KokkosArray::Impl::Timer clock ; for ( int iter = 0 ; iter < iterCount ; ++iter ) { op.run(); } device_type::fence(); const double seconds_per_iter = clock.seconds() / ((double) iterCount ); const double flops_per_block = matrix.block.multiply_add_flops(); const double flops = 1.0e-9*fem_graph_length*flops_per_block / seconds_per_iter; //------------------------------ // Verify result if (check) { const double tol = KokkosArray::Impl::is_same<double,VectorScalar>::value ? 1.0e-13 : 1.0e-5 ; const size_t error_max = 10 ; KokkosArray::deep_copy( hx , y ); size_t error_count = 0 ; for ( size_t iRowFEM = 0 ; iRowFEM < fem_length ; ++iRowFEM ) { for ( size_t iRowStoch = 0 ; iRowStoch < stoch_length ; ++iRowStoch ) { const double mag = std::abs( hy_result(iRowStoch,iRowFEM) ); const double error = std::abs( hx(iRowStoch,iRowFEM) - hy_result(iRowStoch,iRowFEM) ); if ( tol < error && tol < error / mag ) { if ( error_count < error_max ) { std::cout << "test_product_tensor_legendre error:" << " y(" << iRowStoch << "," << iRowFEM << ") = " << hx(iRowStoch,iRowFEM) << " , error = " << ( hx(iRowStoch,iRowFEM) - hy_result(iRowStoch,iRowFEM) ) << std::endl ; } ++error_count ; } } } if ( error_count ) { std::cout << "test_crs_product_tensor_legendre error_count = " << error_count << std::endl ; } } //------------------------------ std::vector<double> perf(3) ; perf[0] = fem_length * stoch_length ; perf[1] = seconds_per_iter ; perf[2] = flops ; return perf ; }
void test( const std::string & label , const size_t elem_count , const size_t iter_count ) { KokkosArray::Impl::Timer timer ; double seconds_scalar ; double seconds_multi ; double seconds_array1 ; double seconds_array4 ; double seconds_array16 ; { // Loop 16 times: Explicit::TestHexGrad<double,float,Device> test_scalar( elem_count ); timer.reset(); for ( size_t i = 0 ; i < iter_count * 16 ; ++i ) { test_scalar.apply(); } Device::fence(); seconds_scalar = timer.seconds() / ( 16 * iter_count * elem_count ); } { // 16 x elements Explicit::TestHexGrad<double,float,Device> test_multiple( elem_count * 16 ); timer.reset(); for ( size_t i = 0 ; i < iter_count ; ++i ) { test_multiple.apply(); } Device::fence(); seconds_multi = timer.seconds() / ( 16 * iter_count * elem_count ); } { // 16 x elements with Array<1> typedef KokkosArray::Array<double,1> coord_scalar_type ; typedef KokkosArray::Array<float,1> grad_scalar_type ; Explicit::TestHexGrad<coord_scalar_type,grad_scalar_type,Device> test_array( elem_count * 16 ); timer.reset(); for ( size_t i = 0 ; i < iter_count ; ++i ) { test_array.apply(); } Device::fence(); seconds_array1 = timer.seconds() / ( 16 * iter_count * elem_count ); } { // 4 x elements with Array<4> typedef KokkosArray::Array<double,4> coord_scalar_type ; typedef KokkosArray::Array<float,4> grad_scalar_type ; Explicit::TestHexGrad<coord_scalar_type,grad_scalar_type,Device> test_array( elem_count * 4 ); timer.reset(); for ( size_t i = 0 ; i < iter_count ; ++i ) { test_array.apply(); } Device::fence(); seconds_array4 = timer.seconds() / ( 16 * iter_count * elem_count ); } { // 1 x elements with Array<16> typedef KokkosArray::Array<double,16> coord_scalar_type ; typedef KokkosArray::Array<float,16> grad_scalar_type ; Explicit::TestHexGrad<coord_scalar_type,grad_scalar_type,Device> test_array( elem_count ); timer.reset(); for ( size_t i = 0 ; i < iter_count ; ++i ) { test_array.apply(); } Device::fence(); seconds_array16 = timer.seconds() / ( 16 * iter_count * elem_count ); } std::cout << label << " scalar( " << seconds_scalar << " ) multi( " << seconds_multi << " )" << " ) array1( " << seconds_array1 << " )" << " ) array4( " << seconds_array4 << " )" << " ) array16( " << seconds_array16 << " )" << std::endl ; }