int main( int argc, char* argv[] ) { int N = -1; // number of rows 2^12 int M = -1; // number of columns 2^10 int S = -1; // total size 2^22 int nrepeat = 100; // number of repeats of the test // Read command line arguments. for ( int i = 0; i < argc; i++ ) { if ( ( strcmp( argv[ i ], "-N" ) == 0 ) || ( strcmp( argv[ i ], "-Rows" ) == 0 ) ) { N = pow( 2, atoi( argv[ ++i ] ) ); printf( " User N is %d\n", N ); } else if ( ( strcmp( argv[ i ], "-M" ) == 0 ) || ( strcmp( argv[ i ], "-Columns" ) == 0 ) ) { M = pow( 2, atof( argv[ ++i ] ) ); printf( " User M is %d\n", M ); } else if ( ( strcmp( argv[ i ], "-S" ) == 0 ) || ( strcmp( argv[ i ], "-Size" ) == 0 ) ) { S = pow( 2, atof( argv[ ++i ] ) ); printf( " User S is %d\n", S ); } else if ( strcmp( argv[ i ], "-nrepeat" ) == 0 ) { nrepeat = atoi( argv[ ++i ] ); } else if ( ( strcmp( argv[ i ], "-h" ) == 0 ) || ( strcmp( argv[ i ], "-help" ) == 0 ) ) { printf( " y^T*A*x Options:\n" ); printf( " -Rows (-N) <int>: exponent num, determines number of rows 2^num (default: 2^12 = 4096)\n" ); printf( " -Columns (-M) <int>: exponent num, determines number of columns 2^num (default: 2^10 = 1024)\n" ); printf( " -Size (-S) <int>: exponent num, determines total matrix size 2^num (default: 2^22 = 4096*1024 )\n" ); printf( " -nrepeat <int>: number of repetitions (default: 100)\n" ); printf( " -help (-h): print this message\n\n" ); exit( 1 ); } } // Check sizes. checkSizes( N, M, S, nrepeat ); Kokkos::initialize( argc, argv ); // EXERCISE give-away: Choose an Execution Space. // typedef Kokkos::Serial ExecSpace; // typedef Kokkos::Threads ExecSpace; // typedef Kokkos::OpenMP ExecSpace; // typedef Kokkos::Cuda ExecSpace; // EXERCISE: Choose device memory space. // typedef Kokkos::HostSpace MemSpace; // typedef Kokkos::OpenMP MemSpace; // typedef Kokkos::CudaSpace MemSpace; // typedef Kokkos::CudaUVMSpace MemSpace; // EXERCISE give-away: Choose a Layout. // EXERCISE: When exercise is correctly implemented, then // either layout will generate the correct answer. // However, performance will be different! // typedef Kokkos::LayoutLeft Layout; // typedef Kokkos::LayoutRight Layout; // EXERCISE give-away: Use a RangePolicy. // typedef Kokkos::RangePolicy<ExecSpace> range_policy; // Allocate y, x vectors and Matrix A on device. // EXERCISE: Use MemSpace and Layout. typedef Kokkos::View<double*> ViewVectorType; typedef Kokkos::View<double**> ViewMatrixType; ViewVectorType y( "y", N ); ViewVectorType x( "x", M ); ViewMatrixType A( "A", N, M ); // Create host mirrors of device views. ViewVectorType::HostMirror h_y = Kokkos::create_mirror_view( y ); ViewVectorType::HostMirror h_x = Kokkos::create_mirror_view( x ); ViewMatrixType::HostMirror h_A = Kokkos::create_mirror_view( A ); // Initialize y vector on host. for ( int i = 0; i < N; ++i ) { h_y( i ) = 1; } // Initialize x vector on host. for ( int i = 0; i < M; ++i ) { h_x( i ) = 1; } // Initialize A matrix on host. for ( int j = 0; j < N; ++j ) { for ( int i = 0; i < M; ++i ) { h_A( j, i ) = 1; } } // Deep copy host views to device views. Kokkos::deep_copy( y, h_y ); Kokkos::deep_copy( x, h_x ); Kokkos::deep_copy( A, h_A ); // Timer products. struct timeval begin, end; gettimeofday( &begin, NULL ); for ( int repeat = 0; repeat < nrepeat; repeat++ ) { // Application: <y,Ax> = y^T*A*x double result = 0; // EXERCISE: Use Kokkos::RangePolicy<ExecSpace> to execute parallel_reduce // in the correct space. Kokkos::parallel_reduce( N, KOKKOS_LAMBDA ( int j, double &update ) { double temp2 = 0; for ( int i = 0; i < M; ++i ) { temp2 += A( j, i ) * x( i ); } update += y( j ) * temp2; }, result );
int main(int argc, char* argv[]) { int N = -1 ; // number of rows 2^12 int M = -1 ; // number of columns 2^10 int S = -1 ; // total size 2^22 int nrepeat = 100 ; // number of repeats of the test // Read command line arguments for(int i=0; i<argc; i++) { if( (strcmp(argv[i], "-N") == 0) || (strcmp(argv[i], "-Rows") == 0) ) { N = pow( 2, atoi(argv[++i]) ); printf(" User N is %d\n",N); } else if( (strcmp(argv[i], "-M") == 0) || (strcmp(argv[i], "-Columns") == 0)) { M = pow( 2, atof(argv[++i]) ); printf(" User M is %d\n",M); } else if( (strcmp(argv[i], "-S") == 0) || (strcmp(argv[i], "-Size") == 0)) { S = pow( 2, atof(argv[++i]) ); printf(" User S is %d\n",S); } else if( strcmp(argv[i], "-nrepeat") == 0) { nrepeat = atoi(argv[++i]); } else if( (strcmp(argv[i], "-h") == 0) || (strcmp(argv[i], "-help") == 0) ) { printf(" y^T*A*x Options:\n"); printf(" -Rows (-N) <int>: exponent num, determines number of rows 2^num (default: 2^12 = 4096)\n"); printf(" -Columns (-M) <int>: exponent num, determines number of columns 2^num (default: 2^10 = 1024)\n"); printf(" -Size (-S) <int>: exponent num, determines total matrix size 2^num (default: 2^22 = 4096*1024 )\n"); printf(" -nrepeat <int>: number of repetitions (default: 100)\n"); printf(" -help (-h): print this message\n\n"); exit(1); } } //Check Sizes checkSizes( N, M, S, nrepeat ); Kokkos::initialize(argc,argv); // Allocate y, x vectors and Matrix A: // Device typedef Kokkos::View<double*> ViewVectorType; typedef Kokkos::View<double**> ViewMatrixType; ViewVectorType y("y", N); ViewVectorType x("x", M); ViewMatrixType A("A", N, M); ViewVectorType::HostMirror h_y = Kokkos::create_mirror_view(y); ViewVectorType::HostMirror h_x = Kokkos::create_mirror_view(x); ViewMatrixType::HostMirror h_A = Kokkos::create_mirror_view(A); // Initialize y vector on host for (int i = 0; i < N; ++i) { h_y( i ) = 1; } // Initialize x vector on host for (int i = 0; i < M; ++i) { h_x( i ) = 1; } // Initialize A matrix, note 2D indexing computation on host for (int j = 0; j < N; ++j) { for ( int i = 0 ; i < M ; ++i ) { h_A( j , i ) = 1; } } Kokkos::deep_copy(y,h_y); Kokkos::deep_copy(x,h_x); Kokkos::deep_copy(A,h_A); // Timer products struct timeval begin,end; gettimeofday(&begin,NULL); for ( int repeat = 0; repeat < nrepeat; repeat++) { //Application: <y,Ax> = y^T*A*x double result = 0; Kokkos::parallel_reduce( N, KOKKOS_LAMBDA ( int j, double &update ) { double temp2 = 0; for ( int i = 0 ; i < M ; ++i ) { temp2 += A( j , i ) * x( i ); } update += y( j ) * temp2; }, result );
int main( int argc, char* argv[] ) { int N = -1; // number of rows 2^12 int M = -1; // number of columns 2^10 int S = -1; // total size 2^22 int E = -1; // number of Elements int nrepeat = 100; // number of repeats of the test // Read command line arguments. for ( int i = 0; i < argc; i++ ) { if ( ( strcmp( argv[ i ], "-N" ) == 0 ) || ( strcmp( argv[ i ], "-Rows" ) == 0 ) ) { N = pow( 2, atoi( argv[ ++i ] ) ); printf( " User N is %d\n", N ); } else if ( ( strcmp( argv[ i ], "-M" ) == 0 ) || ( strcmp( argv[ i ], "-Columns" ) == 0 ) ) { M = pow( 2, atof( argv[ ++i ] ) ); printf( " User M is %d\n", M ); } else if ( ( strcmp( argv[ i ], "-S" ) == 0 ) || ( strcmp( argv[ i ], "-Size" ) == 0 ) ) { S = pow( 2, atof( argv[ ++i ] ) ); printf( " User S is %d\n", S ); } else if ( ( strcmp( argv[ i ], "-E" ) == 0 ) || ( strcmp( argv[ i ], "-Elements" ) == 0 ) ) { E = pow( 2, atof( argv[ ++i ] ) ); printf( " User E is %d\n", E ); } else if ( strcmp( argv[ i ], "-nrepeat" ) == 0 ) { nrepeat = atoi( argv[ ++i ] ); } else if ( ( strcmp( argv[ i ], "-h" ) == 0 ) || ( strcmp( argv[ i ], "-help" ) == 0 ) ) { printf( " y^T*A*x Options:\n" ); printf( " -Rows (-N) <int>: exponent num, determines number of rows 2^num (default: 2^8 = 256)\n" ); printf( " -Columns (-M) <int>: exponent num, determines number of columns 2^num (default: 2^10 = 1024)\n" ); printf( " -Size (-S) <int>: exponent num, determines total matrix size 2^num (default: 2^18 = 256*1024 )\n" ); printf( " -Elements (-E) <int>: exponent num, determines number of elements 2^num (default: 2^10 = 1024 )\n" ); printf( " -nrepeat <int>: number of repetitions (default: 100)\n" ); printf( " -help (-h): print this message\n\n" ); exit( 1 ); } } // Check sizes. checkSizes( N, M, S, E, nrepeat ); Kokkos::initialize( argc, argv ); typedef Kokkos::LayoutRight Layout; typedef Kokkos::RangePolicy<> range_policy; // Allocate y, x vectors and Matrix A on device. typedef Kokkos::View<double**, Layout> ViewVectorType; typedef Kokkos::View<double***, Layout> ViewMatrixType; ViewVectorType y( "y", E, N ); ViewVectorType x( "x", E, M ); ViewMatrixType A( "A", E, N, M ); // Create host mirrors of device views. ViewVectorType::HostMirror h_y = Kokkos::create_mirror_view( y ); ViewVectorType::HostMirror h_x = Kokkos::create_mirror_view( x ); ViewMatrixType::HostMirror h_A = Kokkos::create_mirror_view( A ); for ( int e = 0; e < E; e++ ) { // Initialize y vector on host. for ( int i = 0; i < N; ++i ) { h_y( e, i ) = 1; } // Initialize x vector on host. for ( int i = 0; i < M; ++i ) { h_x( e, i ) = 1; } // Initialize A matrix on host. for ( int j = 0; j < N; ++j ) { for ( int i = 0; i < M; ++i ) { h_A( e, j, i ) = 1; } } } // Deep copy host views to device views. Kokkos::deep_copy( y, h_y ); Kokkos::deep_copy( x, h_x ); Kokkos::deep_copy( A, h_A ); typedef Kokkos::TeamPolicy<> team_policy; typedef Kokkos::TeamPolicy<>::member_type member_type; // Timer products. struct timeval begin, end; gettimeofday( &begin, NULL ); for ( int repeat = 0; repeat < nrepeat; repeat++ ) { // Application: <y,Ax> = y^T*A*x double result = 0; Kokkos::parallel_reduce( team_policy( E, Kokkos::AUTO, 32 ), KOKKOS_LAMBDA ( const member_type &teamMember, double &update ) { const int e = teamMember.league_rank(); double tempN = 0; Kokkos::parallel_reduce( Kokkos::TeamThreadRange( teamMember, N ), [&] ( const int j, double &innerUpdateN ) { double tempM = 0; Kokkos::parallel_reduce( Kokkos::ThreadVectorRange( teamMember, M ), [&] ( const int i, double &innerUpdateM ) { innerUpdateM += A( e, j, i ) * x( e, i ); }, tempM ); innerUpdateN += y( e, j ) * tempM; }, tempN ); Kokkos::single( Kokkos::PerTeam( teamMember ), [&] () { update += tempN; }); }, result ); // Output result. if ( repeat == ( nrepeat - 1 ) ) { printf( " Computed result for %d x %d x %d is %lf\n", N, M, E, result ); } const double solution = (double) N *(double) M *(double) E; if ( result != solution ) { printf( " Error: result( %lf ) != solution( %lf )\n", result, solution ); } }
int main( int argc, char* argv[] ) { int N = -1; // number of rows 2^12 int M = -1; // number of columns 2^10 int S = -1; // total size 2^22 int nrepeat = 100; // number of repeats of the test // Read command line arguments. for ( int i = 0; i < argc; i++ ) { if ( ( strcmp( argv[ i ], "-N" ) == 0 ) || ( strcmp( argv[ i ], "-Rows" ) == 0 ) ) { N = pow( 2, atoi( argv[ ++i ] ) ); printf( " User N is %d\n", N ); } else if ( ( strcmp( argv[ i ], "-M" ) == 0 ) || ( strcmp( argv[ i ], "-Columns" ) == 0 ) ) { M = pow( 2, atof( argv[ ++i ] ) ); printf( " User M is %d\n", M ); } else if ( ( strcmp( argv[ i ], "-S" ) == 0 ) || ( strcmp( argv[ i ], "-Size" ) == 0 ) ) { S = pow( 2, atof( argv[ ++i ] ) ); printf( " User S is %d\n", S ); } else if ( strcmp( argv[ i ], "-nrepeat" ) == 0 ) { nrepeat = atoi( argv[ ++i ] ); } else if ( ( strcmp( argv[ i ], "-h" ) == 0 ) || ( strcmp( argv[ i ], "-help" ) == 0 ) ) { printf( " y^T*A*x Options:\n" ); printf( " -Rows (-N) <int>: exponent num, determines number of rows 2^num (default: 2^12 = 4096)\n" ); printf( " -Columns (-M) <int>: exponent num, determines number of columns 2^num (default: 2^10 = 1024)\n" ); printf( " -Size (-S) <int>: exponent num, determines total matrix size 2^num (default: 2^22 = 4096*1024 )\n" ); printf( " -nrepeat <int>: number of repetitions (default: 100)\n" ); printf( " -help (-h): print this message\n\n" ); exit( 1 ); } } // Check sizes. checkSizes( N, M, S, nrepeat ); Kokkos::initialize( argc, argv ); typedef Kokkos::DefaultExecutionSpace::array_layout Layout; // typedef Kokkos::LayoutLeft Layout; // typedef Kokkos::LayoutRight Layout; typedef Kokkos::RangePolicy<> range_policy; // Allocate y, x vectors and Matrix A on device. typedef Kokkos::View<double*, Layout> ViewVectorType; typedef Kokkos::View<double**, Layout> ViewMatrixType; ViewVectorType y( "y", N ); ViewVectorType x( "x", M ); ViewMatrixType A( "A", N, M ); // Create host mirrors of device views. ViewVectorType::HostMirror h_y = Kokkos::create_mirror_view( y ); ViewVectorType::HostMirror h_x = Kokkos::create_mirror_view( x ); ViewMatrixType::HostMirror h_A = Kokkos::create_mirror_view( A ); // Initialize y vector on host. for ( int i = 0; i < N; ++i ) { h_y( i ) = 1; } // Initialize x vector on host. for ( int i = 0; i < M; ++i ) { h_x( i ) = 1; } // Initialize A matrix on host. for ( int j = 0; j < N; ++j ) { for ( int i = 0; i < M; ++i ) { h_A( j, i ) = 1; } } // Deep copy host views to device views. Kokkos::deep_copy( y, h_y ); Kokkos::deep_copy( x, h_x ); Kokkos::deep_copy( A, h_A ); // EXERCISE: Use hierarchical parallel execution policy for calculation. // EXERCISE hints: // typedef Kokkos::TeamPolicy<> team_policy; // typedef Kokkos::TeamPolicy<>::member_type member_type; // Timer products. struct timeval begin, end; gettimeofday( &begin, NULL ); for ( int repeat = 0; repeat < nrepeat; repeat++ ) { // Application: <y,Ax> = y^T*A*x double result = 0; // EXERCISE: Convert from range_policy to team_policy. Kokkos::parallel_reduce( range_policy( 0, N ), KOKKOS_LAMBDA ( int j, double &update ) { // EXERCISE: Convert to nested Kokkos::parallel_reduce. // EXERCISE hint: Kokkos::TeamThreadRange( ??? ) and [&]. double temp2 = 0; for ( int i = 0; i < M; ++i ) { temp2 += A( j, i ) * x( i ); } // EXERCISE: Only one team member update the result. update += y( j ) * temp2; }, result );
int test_crs_matrix_test_singlevec(int numRows, int numCols, int nnz, int test, const char* filename, const bool binaryfile) { typedef Kokkos::CrsMatrix<Scalar,int,execution_space,void,int> matrix_type ; typedef typename Kokkos::View<Scalar*,Kokkos::LayoutLeft,execution_space> mv_type; typedef typename Kokkos::View<Scalar*,Kokkos::LayoutLeft,execution_space,Kokkos::MemoryRandomAccess > mv_random_read_type; typedef typename mv_type::HostMirror h_mv_type; Scalar* val = NULL; int* row = NULL; int* col = NULL; srand(17312837); if(filename==NULL) nnz = SparseMatrix_generate<Scalar,int>(numRows,numCols,nnz,nnz/numRows*0.2,numRows*0.01,val,row,col); else if(!binaryfile) nnz = SparseMatrix_MatrixMarket_read<Scalar,int>(filename,numRows,numCols,nnz,val,row,col); else nnz = SparseMatrix_ReadBinaryGraph<Scalar,int>(filename,numRows,numCols,nnz,val,row,col); matrix_type A("CRS::A",numRows,numCols,nnz,val,row,col,false); mv_type x("X",numCols); mv_random_read_type t_x(x); mv_type y("Y",numRows); h_mv_type h_x = Kokkos::create_mirror_view(x); h_mv_type h_y = Kokkos::create_mirror_view(y); h_mv_type h_y_compare = Kokkos::create_mirror(y); typename matrix_type::StaticCrsGraphType::HostMirror h_graph = Kokkos::create_mirror(A.graph); typename matrix_type::values_type::HostMirror h_values = Kokkos::create_mirror_view(A.values); //Kokkos::deep_copy(h_graph.row_map,A.graph.row_map); //h_a(k) = (Scalar) (1.0*(rand()%40)-20.); for(int i=0; i<numCols;i++) { h_x(i) = (Scalar) (1.0*(rand()%40)-20.); h_y(i) = (Scalar) (1.0*(rand()%40)-20.); } for(int i=0;i<numRows;i++) { int start = h_graph.row_map(i); int end = h_graph.row_map(i+1); for(int j=start;j<end;j++) { h_values(j) = h_graph.entries(j) + i; } h_y_compare(i) = 0; for(int j=start;j<end;j++) { Scalar val = h_graph.entries(j) + i; int idx = h_graph.entries(j); h_y_compare(i)+=val*h_x(idx); } } Kokkos::deep_copy(x,h_x); Kokkos::deep_copy(y,h_y); Kokkos::deep_copy(A.graph.entries,h_graph.entries); Kokkos::deep_copy(A.values,h_values); /*for(int i=0;i<numRows;i++) for(int k = 0; k<numVecs; k++) { //error[k]+=(h_y_compare(i,k)-h_y(i,k))*(h_y_compare(i,k)-h_y(i,k)); printf("%i %i %lf %lf %lf\n",i,k,h_y_compare(i,k),h_y(i,k),h_x(i,k)); }*/ typename Kokkos::CrsMatrix<Scalar,int,execution_space,void,int>::values_type x1("X1",numCols); typename Kokkos::CrsMatrix<Scalar,int,execution_space,void,int>::values_type y1("Y1",numRows); #ifdef NEWKERNEL KokkosSparse::spmv("N",1.0,A,x1,0.0,y1); #else Kokkos::MV_Multiply(y1,A,x1); #endif #ifdef NEWKERNEL KokkosSparse::spmv("N",1.0,A,x,0.0,y); #else Kokkos::MV_Multiply(y,A,x); #endif execution_space::fence(); Kokkos::deep_copy(h_y,y); Scalar error = 0; Scalar sum = 0; for(int i=0;i<numRows;i++) { error+=(h_y_compare(i)-h_y(i))*(h_y_compare(i)-h_y(i)); sum += h_y_compare(i)*h_y_compare(i); // printf("%i %i %lf %lf %lf\n",i,k,h_y_compare(i,k),h_y(i,k),h_x(i,k)); } //for(int i=0;i<A.nnz;i++) printf("%i %lf\n",h_graph.entries(i),h_values(i)); int num_errors = 0; double total_error = 0; double total_sum = 0; num_errors += (error/(sum==0?1:sum))>1e-5?1:0; total_error += error; total_sum += sum; int loop = 100; Kokkos::Impl::Timer timer; for(int i=0;i<loop;i++) #ifdef NEWKERNEL KokkosSparse::spmv("N",1.0,A,x,0.0,y); #else Kokkos::MV_Multiply(y,A,x); #endif execution_space::fence(); double time = timer.seconds(); double matrix_size = 1.0*((nnz*(sizeof(Scalar)+sizeof(int)) + numRows*sizeof(int)))/1024/1024; double vector_size = 2.0*numRows*sizeof(Scalar)/1024/1024; double vector_readwrite = (nnz+numCols)*sizeof(Scalar)/1024/1024; double problem_size = matrix_size+vector_size; printf("%i %i %i %i %6.2lf MB %6.2lf GB/s %6.2lf GFlop/s %6.3lf ms %i\n",nnz, numRows,numCols,1,problem_size,(matrix_size+vector_readwrite)/time*loop/1024, 2.0*nnz*loop/time/1e9, time/loop*1000, num_errors); return (int)total_error; }
int test_crs_matrix_test_singlevec(int numRows, int numCols, int nnz, int test, const char* filename, const bool binaryfile) { typedef KokkosArray::CrsMatrix<Scalar,int,device_type> matrix_type ; typedef typename KokkosArray::View<Scalar*,KokkosArray::LayoutLeft,device_type> mv_type; typedef typename KokkosArray::View<Scalar*,KokkosArray::LayoutLeft,device_type,KokkosArray::MemoryRandomRead> mv_random_read_type; typedef typename mv_type::HostMirror h_mv_type; Scalar* val = NULL; int* row = NULL; int* col = NULL; srand(17312837); if(filename==NULL) nnz = SparseMatrix_generate<Scalar,int>(numRows,numCols,nnz,nnz/numRows*0.2,numRows*0.01,val,row,col); else if(!binaryfile) nnz = SparseMatrix_MatrixMarket_read<Scalar,int>(filename,numRows,numCols,nnz,val,row,col); else nnz = SparseMatrix_ReadBinaryGraph<Scalar,int>(filename,numRows,numCols,nnz,val,row,col); matrix_type A("CRS::A",numRows,numCols,nnz,val,row,col,false); mv_type x("X",numCols); mv_random_read_type t_x(x); mv_type y("Y",numRows); h_mv_type h_x = KokkosArray::create_mirror_view(x); h_mv_type h_y = KokkosArray::create_mirror_view(y); h_mv_type h_y_compare = KokkosArray::create_mirror(y); typename matrix_type::CrsArrayType::HostMirror h_graph = KokkosArray::create_mirror(A.graph); typename matrix_type::values_type::HostMirror h_values = KokkosArray::create_mirror_view(A.values); //KokkosArray::deep_copy(h_graph.row_map,A.graph.row_map); //h_a(k) = (Scalar) (1.0*(rand()%40)-20.); for(int i=0; i<numCols;i++) { h_x(i) = (Scalar) (1.0*(rand()%40)-20.); h_y(i) = (Scalar) (1.0*(rand()%40)-20.); } for(int i=0;i<numRows;i++) { int start = h_graph.row_map(i); int end = h_graph.row_map(i+1); for(int j=start;j<end;j++) { h_values(j) = h_graph.entries(j) + i; } h_y_compare(i) = 0; for(int j=start;j<end;j++) { Scalar val = h_graph.entries(j) + i; int idx = h_graph.entries(j); h_y_compare(i)+=val*h_x(idx); } } KokkosArray::deep_copy(x,h_x); KokkosArray::deep_copy(y,h_y); KokkosArray::deep_copy(A.graph.entries,h_graph.entries); KokkosArray::deep_copy(A.values,h_values); /*for(int i=0;i<numRows;i++) for(int k = 0; k<numVecs; k++) { //error[k]+=(h_y_compare(i,k)-h_y(i,k))*(h_y_compare(i,k)-h_y(i,k)); printf("%i %i %lf %lf %lf\n",i,k,h_y_compare(i,k),h_y(i,k),h_x(i,k)); }*/ typename KokkosArray::CrsMatrix<Scalar,int,device_type>::values_type x1("X1",numCols); typename KokkosArray::CrsMatrix<Scalar,int,device_type>::values_type y1("Y1",numRows); KokkosArray::MV_Multiply(0.0,y1,1.0,A,x1); KokkosArray::MV_Multiply(0.0,y,1.0,A,x); device_type::fence(); KokkosArray::deep_copy(h_y,y); Scalar error = 0; Scalar sum = 0; for(int i=0;i<numRows;i++) { error+=(h_y_compare(i)-h_y(i))*(h_y_compare(i)-h_y(i)); sum += h_y_compare(i)*h_y_compare(i); // printf("%i %i %lf %lf %lf\n",i,k,h_y_compare(i,k),h_y(i,k),h_x(i,k)); } //for(int i=0;i<A.nnz;i++) printf("%i %lf\n",h_graph.entries(i),h_values(i)); int num_errors = 0; double total_error = 0; double total_sum = 0; num_errors += (error/(sum==0?1:sum))>1e-5?1:0; total_error += error; total_sum += sum; int loop = 10; timespec starttime,endtime; clock_gettime(CLOCK_REALTIME,&starttime); for(int i=0;i<loop;i++) KokkosArray::MV_Multiply(0.0,y,1.0,A,t_x); device_type::fence(); clock_gettime(CLOCK_REALTIME,&endtime); double time = endtime.tv_sec - starttime.tv_sec + 1.0 * (endtime.tv_nsec - starttime.tv_nsec) / 1000000000; double matrix_size = 1.0*((nnz*(sizeof(Scalar)+sizeof(int)) + numRows*sizeof(int)))/1024/1024; double vector_size = 2.0*numRows*sizeof(Scalar)/1024/1024; double vector_readwrite = 2.0*nnz*sizeof(Scalar)/1024/1024; double problem_size = matrix_size+vector_size; printf("%i %i %i %i %6.2lf MB %6.2lf GB/s %6.2lf ms %i\n",nnz, numRows,numCols,1,problem_size,(matrix_size+vector_readwrite)/time*loop/1024, time/loop*1000, num_errors); return (int)total_error; }
int main(int argc, char* argv[]) { int N = -1 ; // number of rows 2^12 int M = -1 ; // number of columns 2^10 int S = -1 ; // total size 2^22 int nrepeat = 100 ; // number of repeats of the test // Read command line arguments for(int i=0; i<argc; i++) { if( (strcmp(argv[i], "-N") == 0) || (strcmp(argv[i], "-Rows") == 0) ) { N = pow( 2, atoi(argv[++i]) ); printf(" User N is %d\n",N); } else if( (strcmp(argv[i], "-M") == 0) || (strcmp(argv[i], "-Columns") == 0)) { M = pow( 2, atof(argv[++i]) ); printf(" User M is %d\n",M); } else if( (strcmp(argv[i], "-S") == 0) || (strcmp(argv[i], "-Size") == 0)) { S = pow( 2, atof(argv[++i]) ); printf(" User S is %d\n",S); } else if( strcmp(argv[i], "-nrepeat") == 0) { nrepeat = atoi(argv[++i]); } else if( (strcmp(argv[i], "-h") == 0) || (strcmp(argv[i], "-help") == 0) ) { printf(" y^T*A*x Options:\n"); printf(" -Rows (-N) <int>: exponent num, determines number of rows 2^num (default: 2^12 = 4096)\n"); printf(" -Columns (-M) <int>: exponent num, determines number of columns 2^num (default: 2^10 = 1024)\n"); printf(" -Size (-S) <int>: exponent num, determines total matrix size 2^num (default: 2^22 = 4096*1024 )\n"); printf(" -nrepeat <int>: number of repetitions (default: 100)\n"); printf(" -help (-h): print this message\n\n"); exit(1); } } //Check Sizes checkSizes( N, M, S, nrepeat ); Kokkos::initialize(argc,argv); // typedef Kokkos::DefaultExecutionSpace::array_layout Layout; // typedef Kokkos::LayoutLeft Layout ; typedef Kokkos::LayoutRight Layout ; // Allocate y, x vectors and Matrix A: // Device typedef Kokkos::View<double*, Layout> ViewVectorType; typedef Kokkos::View<double**, Layout> ViewMatrixType; ViewVectorType y("y", N); ViewVectorType x("x", M); ViewMatrixType A("A", N, M); //Host mirror ViewVectorType::HostMirror h_y = Kokkos::create_mirror_view(y); ViewVectorType::HostMirror h_x = Kokkos::create_mirror_view(x); ViewMatrixType::HostMirror h_A = Kokkos::create_mirror_view(A); // Initialize y vector on host for (int i = 0; i < N; ++i) { h_y( i ) = 1; } // Initialize x vector on host for (int i = 0; i < M; ++i) { h_x( i ) = 1; } // Initialize A matrix, note 2D indexing computation on host for (int j = 0; j < N; ++j) { for ( int i = 0 ; i < M ; ++i ) { h_A( j , i ) = 1; } } //Deep copy host view to device views Kokkos::deep_copy(y, h_y); Kokkos::deep_copy(x, h_x); Kokkos::deep_copy(A, h_A); typedef Kokkos::TeamPolicy<> team_policy ; typedef Kokkos::TeamPolicy<>::member_type member_type ; // Timer products struct timeval begin,end; gettimeofday(&begin,NULL); for ( int repeat = 0; repeat < nrepeat; repeat++) { //Application: <y,Ax> = y^T*A*x double result = 0; Kokkos::parallel_reduce( team_policy( N , Kokkos::AUTO ), KOKKOS_LAMBDA ( const member_type& teamMember, double &update ) { const int j = teamMember.league_rank(); double temp2 = 0; Kokkos::parallel_reduce( Kokkos::TeamThreadRange( teamMember, M ), [&] (const int i, double &innerUpdate ) { innerUpdate += A( j , i ) * x( i ); }, temp2); if ( teamMember.team_rank() == 0 ) update += y( j ) * temp2; }, result ); //Output result if ( repeat == (nrepeat - 1) ) printf(" Computed result for %d x %d is %lf\n", N, M, result); const double solution = (double)N *(double)M; if ( result != solution ) { printf(" Error: result( %lf ) != solution( %lf )\n",result,solution); } }