// Driver for the <y,Ax> = y^T*A*x exercise (flat RangePolicy-style version).
// It parses the problem sizes from the command line, fills y, x and A with
// ones through host mirrors, copies them to the device views and then runs
// the reduction `nrepeat` times between two gettimeofday() samples.
//
// NOTE(review): this definition is cut off in this file. The repeat loop and
// the function body are still open when the next `int main` begins, so the
// result check, timing report and Kokkos finalization are not visible here.
int main( int argc, char* argv[] )
{
  // -1 marks "not supplied on the command line".
  int N = -1;         // number of rows 2^12
  int M = -1;         // number of columns 2^10
  int S = -1;         // total size 2^22
  int nrepeat = 100;  // number of repeats of the test

  // Read command line arguments.
  // Every size option takes an exponent: the stored value is 2^exponent.
  // NOTE(review): the scan starts at i = 0, so argv[0] is compared too, and
  // argv[ ++i ] is read without checking that a value follows the flag.
  // NOTE(review): -N parses its exponent with atoi while -M and -S use atof;
  // the double returned by pow() is narrowed to int on assignment.
  for ( int i = 0; i < argc; i++ ) {
    if ( ( strcmp( argv[ i ], "-N" ) == 0 ) || ( strcmp( argv[ i ], "-Rows" ) == 0 ) ) {
      N = pow( 2, atoi( argv[ ++i ] ) );
      printf( "  User N is %d\n", N );
    }
    else if ( ( strcmp( argv[ i ], "-M" ) == 0 ) || ( strcmp( argv[ i ], "-Columns" ) == 0 ) ) {
      M = pow( 2, atof( argv[ ++i ] ) );
      printf( "  User M is %d\n", M );
    }
    else if ( ( strcmp( argv[ i ], "-S" ) == 0 ) || ( strcmp( argv[ i ], "-Size" ) == 0 ) ) {
      S = pow( 2, atof( argv[ ++i ] ) );
      printf( "  User S is %d\n", S );
    }
    else if ( strcmp( argv[ i ], "-nrepeat" ) == 0 ) {
      // Unlike the sizes, the repeat count is taken literally (no 2^x).
      nrepeat = atoi( argv[ ++i ] );
    }
    else if ( ( strcmp( argv[ i ], "-h" ) == 0 ) || ( strcmp( argv[ i ], "-help" ) == 0 ) ) {
      printf( "  y^T*A*x Options:\n" );
      printf( "  -Rows (-N) <int>:      exponent num, determines number of rows 2^num (default: 2^12 = 4096)\n" );
      printf( "  -Columns (-M) <int>:   exponent num, determines number of columns 2^num (default: 2^10 = 1024)\n" );
      printf( "  -Size (-S) <int>:      exponent num, determines total matrix size 2^num (default: 2^22 = 4096*1024 )\n" );
      printf( "  -nrepeat <int>:        number of repetitions (default: 100)\n" );
      printf( "  -help (-h):            print this message\n\n" );
      // NOTE(review): asking for help terminates with a non-zero exit status.
      exit( 1 );
    }
  }

  // Check sizes.
  // checkSizes is defined outside this view; presumably it fills in the
  // values still at -1 and validates N*M against S -- TODO confirm.
  checkSizes( N, M, S, nrepeat );

  Kokkos::initialize( argc, argv );

  // EXERCISE give-away: Choose an Execution Space.
  // typedef Kokkos::Serial   ExecSpace;
  // typedef Kokkos::Threads  ExecSpace;
  // typedef Kokkos::OpenMP   ExecSpace;
  // typedef Kokkos::Cuda     ExecSpace;

  // EXERCISE: Choose device memory space.
  // typedef Kokkos::HostSpace     MemSpace;
  // typedef Kokkos::OpenMP        MemSpace;
  // typedef Kokkos::CudaSpace     MemSpace;
  // typedef Kokkos::CudaUVMSpace  MemSpace;

  // EXERCISE give-away: Choose a Layout.
  // EXERCISE: When exercise is correctly implemented, then
  //           either layout will generate the correct answer.
  //           However, performance will be different!

  // typedef Kokkos::LayoutLeft   Layout;
  // typedef Kokkos::LayoutRight  Layout;

  // EXERCISE give-away: Use a RangePolicy.
  // typedef Kokkos::RangePolicy<ExecSpace>  range_policy;

  // Allocate y, x vectors and Matrix A on device.
  // EXERCISE: Use MemSpace and Layout.
  // As written, the views take Kokkos' default memory space and layout:
  // y has N entries, x has M entries and A is N x M.
  typedef Kokkos::View<double*>   ViewVectorType;
  typedef Kokkos::View<double**>  ViewMatrixType;
  ViewVectorType y( "y", N );
  ViewVectorType x( "x", M );
  ViewMatrixType A( "A", N, M );

  // Create host mirrors of device views.
  ViewVectorType::HostMirror h_y = Kokkos::create_mirror_view( y );
  ViewVectorType::HostMirror h_x = Kokkos::create_mirror_view( x );
  ViewMatrixType::HostMirror h_A = Kokkos::create_mirror_view( A );

  // Initialize y vector on host.
  for ( int i = 0; i < N; ++i ) {
    h_y( i ) = 1;
  }

  // Initialize x vector on host.
  for ( int i = 0; i < M; ++i ) {
    h_x( i ) = 1;
  }

  // Initialize A matrix on host.
  for ( int j = 0; j < N; ++j ) {
    for ( int i = 0; i < M; ++i ) {
      h_A( j, i ) = 1;
    }
  }

  // Deep copy host views to device views.
  Kokkos::deep_copy( y, h_y );
  Kokkos::deep_copy( x, h_x );
  Kokkos::deep_copy( A, h_A );

  // Timer products.
  // `begin` is sampled here; `end` is not used in the visible part.
  struct timeval begin, end;

  gettimeofday( &begin, NULL );

  for ( int repeat = 0; repeat < nrepeat; repeat++ ) {
    // Application: <y,Ax> = y^T*A*x
    double result = 0;

    // EXERCISE: Use Kokkos::RangePolicy<ExecSpace> to execute parallel_reduce
    //           in the correct space.
    // One work item per row j: form the dot product of row j of A with x,
    // scale it by y(j) and add it to the reduction variable.
    Kokkos::parallel_reduce( N, KOKKOS_LAMBDA ( int j, double &update ) {
      double temp2 = 0;

      for ( int i = 0; i < M; ++i ) {
        temp2 += A( j, i ) * x( i );
      }

      update += y( j ) * temp2;
    }, result );
// Driver for the <y,Ax> = y^T*A*x example using default-space Kokkos views.
// It parses the problem sizes, fills y, x and A with ones on host mirrors,
// deep-copies them to the device views and repeats the reduction `nrepeat`
// times after taking a gettimeofday() start sample.
//
// NOTE(review): this definition is cut off in this file. The repeat loop and
// the function body are still open when the next `int main` begins.
int main(int argc, char* argv[])
{

  // -1 marks "not supplied on the command line".
  int N = -1 ;       // number of rows 2^12
  int M = -1 ;       // number of columns 2^10
  int S = -1 ;      // total size 2^22
  int nrepeat = 100 ;    // number of repeats of the test

  // Read command line arguments
  // Size options carry an exponent; the stored value is 2^exponent.
  // NOTE(review): the scan includes argv[0], and argv[++i] is read without
  // checking that a value follows the flag. -N uses atoi, -M/-S use atof,
  // and the double from pow() is narrowed to int.
  for(int i=0; i<argc; i++) {
    if( (strcmp(argv[i], "-N") == 0) || (strcmp(argv[i], "-Rows") == 0) ) {
      N = pow( 2, atoi(argv[++i]) );
      printf("  User N is %d\n",N);
    } else if( (strcmp(argv[i], "-M") == 0) || (strcmp(argv[i], "-Columns") == 0)) {
      M = pow( 2, atof(argv[++i]) );
      printf("  User M is %d\n",M);
    } else if( (strcmp(argv[i], "-S") == 0) || (strcmp(argv[i], "-Size") == 0)) {
      S = pow( 2, atof(argv[++i]) );
      printf("  User S is %d\n",S);
    } else if( strcmp(argv[i], "-nrepeat") == 0) {
      // The repeat count is taken literally (no 2^x).
      nrepeat = atoi(argv[++i]);
    } else if( (strcmp(argv[i], "-h") == 0) || (strcmp(argv[i], "-help") == 0) ) {

      printf("  y^T*A*x Options:\n");
      printf("  -Rows (-N) <int>:      exponent num, determines number of rows 2^num (default: 2^12 = 4096)\n");
      printf("  -Columns (-M) <int>:   exponent num, determines number of columns 2^num (default: 2^10 = 1024)\n");
      printf("  -Size (-S) <int>:      exponent num, determines total matrix size 2^num (default: 2^22 = 4096*1024 )\n");
      printf("  -nrepeat <int>:        number of repetitions (default: 100)\n");
      printf("  -help (-h):            print this message\n\n");

      // NOTE(review): asking for help terminates with a non-zero exit status.
      exit(1); }
  }

  //Check Sizes
  // checkSizes is defined outside this view; presumably it fills in the
  // values still at -1 and validates them -- TODO confirm.
  checkSizes( N, M, S, nrepeat );

  Kokkos::initialize(argc,argv);

  // Allocate y, x vectors and Matrix A:
  // Device
  // y has N entries, x has M entries and A is N x M.
  typedef Kokkos::View<double*>   ViewVectorType;
  typedef Kokkos::View<double**>   ViewMatrixType;
  ViewVectorType y("y", N);
  ViewVectorType x("x", M);
  ViewMatrixType A("A", N, M);

  // Host mirrors used to fill the data before copying it to the device views.
  ViewVectorType::HostMirror h_y =  Kokkos::create_mirror_view(y);
  ViewVectorType::HostMirror h_x =  Kokkos::create_mirror_view(x);
  ViewMatrixType::HostMirror h_A =  Kokkos::create_mirror_view(A);

  // Initialize y vector on host
  for (int i = 0; i < N; ++i) {
    h_y( i ) = 1;
  }

  // Initialize x vector on host
  for (int i = 0; i < M; ++i) {
    h_x( i ) = 1;
  }

  // Initialize A matrix, note 2D indexing computation on host
  for (int j = 0; j < N; ++j) {
    for ( int i = 0 ; i < M ; ++i ) {
      h_A( j , i ) = 1;
    }
  }

  // Copy the initialized host data into the device views.
  Kokkos::deep_copy(y,h_y);
  Kokkos::deep_copy(x,h_x);
  Kokkos::deep_copy(A,h_A);

  // Timer products
  // `begin` is sampled here; `end` is not used in the visible part.
  struct timeval begin,end;

  gettimeofday(&begin,NULL);

  for ( int repeat = 0; repeat < nrepeat; repeat++) {

    //Application: <y,Ax> = y^T*A*x
    double result = 0;
    // One work item per row j: dot(row j of A, x) scaled by y(j) is added to
    // the reduction variable.
    Kokkos::parallel_reduce( N, KOKKOS_LAMBDA ( int j, double &update ) {
      double temp2 = 0;
      for ( int i = 0 ; i < M ; ++i ) {
        temp2 += A( j , i ) * x( i );
      }
      update += y( j ) * temp2;
    }, result );
// Driver for a batched <y,Ax> = y^T*A*x computation over E independent
// elements, evaluated with three levels of nested Kokkos parallelism
// (team / thread / vector). All inputs are filled with ones, so the expected
// sum is N*M*E.
//
// NOTE(review): the function body is not closed in this file. The text ends
// after the repeat loop and the next `int main` follows, so the second timer
// sample, the timing report and Kokkos finalization are not visible here.
int main( int argc, char* argv[] )
{
  // -1 marks "not supplied on the command line".
  // NOTE(review): the 2^12 / 2^22 remarks below disagree with the help text
  // further down (2^8 rows, 2^18 total); the real defaults come from
  // checkSizes, which is outside this view -- confirm.
  int N = -1;         // number of rows 2^12
  int M = -1;         // number of columns 2^10
  int S = -1;         // total size 2^22
  int E = -1;         // number of Elements
  int nrepeat = 100;  // number of repeats of the test

  // Read command line arguments.
  // Size options carry an exponent; the stored value is 2^exponent.
  // NOTE(review): the scan includes argv[0], and argv[ ++i ] is read without
  // checking that a value follows the flag. -N uses atoi, the other size
  // options use atof, and the double from pow() is narrowed to int.
  for ( int i = 0; i < argc; i++ ) {
    if ( ( strcmp( argv[ i ], "-N" ) == 0 ) || ( strcmp( argv[ i ], "-Rows" ) == 0 ) ) {
      N = pow( 2, atoi( argv[ ++i ] ) );
      printf( "  User N is %d\n", N );
    }
    else if ( ( strcmp( argv[ i ], "-M" ) == 0 ) || ( strcmp( argv[ i ], "-Columns" ) == 0 ) ) {
      M = pow( 2, atof( argv[ ++i ] ) );
      printf( "  User M is %d\n", M );
    }
    else if ( ( strcmp( argv[ i ], "-S" ) == 0 ) || ( strcmp( argv[ i ], "-Size" ) == 0 ) ) {
      S = pow( 2, atof( argv[ ++i ] ) );
      printf( "  User S is %d\n", S );
    }
    else if ( ( strcmp( argv[ i ], "-E" ) == 0 ) || ( strcmp( argv[ i ], "-Elements" ) == 0 ) ) {
      E = pow( 2, atof( argv[ ++i ] ) );
      printf( "  User E is %d\n", E );
    }
    else if ( strcmp( argv[ i ], "-nrepeat" ) == 0 ) {
      // The repeat count is taken literally (no 2^x).
      nrepeat = atoi( argv[ ++i ] );
    }
    else if ( ( strcmp( argv[ i ], "-h" ) == 0 ) || ( strcmp( argv[ i ], "-help" ) == 0 ) ) {
      printf( "  y^T*A*x Options:\n" );
      printf( "  -Rows (-N) <int>:      exponent num, determines number of rows 2^num (default: 2^8 = 256)\n" );
      printf( "  -Columns (-M) <int>:   exponent num, determines number of columns 2^num (default: 2^10 = 1024)\n" );
      printf( "  -Size (-S) <int>:      exponent num, determines total matrix size 2^num (default: 2^18 = 256*1024 )\n" );
      printf( "  -Elements (-E) <int>:  exponent num, determines number of elements 2^num (default: 2^10 = 1024 )\n" );
      printf( "  -nrepeat <int>:        number of repetitions (default: 100)\n" );
      printf( "  -help (-h):            print this message\n\n" );
      // NOTE(review): asking for help terminates with a non-zero exit status.
      exit( 1 );
    }
  }

  // Check sizes.
  // This variant passes E as well; checkSizes is defined outside this view.
  checkSizes( N, M, S, E, nrepeat );

  Kokkos::initialize( argc, argv );

  typedef Kokkos::LayoutRight  Layout;

  // Declared but not used in the visible part of this function.
  typedef Kokkos::RangePolicy<> range_policy;

  // Allocate y, x vectors and Matrix A on device.
  // Despite the names, the "vector" views are rank 2 (one vector per element
  // e) and the "matrix" view is rank 3 (one N x M matrix per element e).
  typedef Kokkos::View<double**, Layout>   ViewVectorType;
  typedef Kokkos::View<double***, Layout>  ViewMatrixType;
  ViewVectorType y( "y", E, N );
  ViewVectorType x( "x", E, M );
  ViewMatrixType A( "A", E, N, M );

  // Create host mirrors of device views.
  ViewVectorType::HostMirror h_y = Kokkos::create_mirror_view( y );
  ViewVectorType::HostMirror h_x = Kokkos::create_mirror_view( x );
  ViewMatrixType::HostMirror h_A = Kokkos::create_mirror_view( A );

  for ( int e = 0; e < E; e++ ) {
    // Initialize y vector on host.
    for ( int i = 0; i < N; ++i ) {
      h_y( e, i ) = 1;
    }

    // Initialize x vector on host.
    for ( int i = 0; i < M; ++i ) {
      h_x( e, i ) = 1;
    }

    // Initialize A matrix on host.
    for ( int j = 0; j < N; ++j ) {
      for ( int i = 0; i < M; ++i ) {
        h_A( e, j, i ) = 1;
      }
    }
  }

  // Deep copy host views to device views.
  Kokkos::deep_copy( y, h_y );
  Kokkos::deep_copy( x, h_x );
  Kokkos::deep_copy( A, h_A );

  typedef Kokkos::TeamPolicy<>               team_policy;
  typedef Kokkos::TeamPolicy<>::member_type  member_type;

  // Timer products.
  // `begin` is sampled here; `end` is not used in the visible part.
  struct timeval begin, end;

  gettimeofday( &begin, NULL );

  for ( int repeat = 0; repeat < nrepeat; repeat++ ) {
    // Application: <y,Ax> = y^T*A*x
    double result = 0;

    // League of E teams (one per element), team size chosen by Kokkos
    // (Kokkos::AUTO), vector length 32.
    Kokkos::parallel_reduce( team_policy( E, Kokkos::AUTO, 32 ), KOKKOS_LAMBDA ( const member_type &teamMember, double &update ) {
      // The league rank selects which element this team works on.
      const int e = teamMember.league_rank();
      double tempN = 0;

      // Team level: the N rows are distributed over the team's threads and
      // their contributions are reduced into tempN.
      Kokkos::parallel_reduce( Kokkos::TeamThreadRange( teamMember, N ), [&] ( const int j, double &innerUpdateN ) {
        double tempM = 0;

        // Vector level: the M columns of row j are reduced into tempM,
        // giving dot(A(e, j, :), x(e, :)).
        Kokkos::parallel_reduce( Kokkos::ThreadVectorRange( teamMember, M ), [&] ( const int i, double &innerUpdateM ) {
          innerUpdateM += A( e, j, i ) * x( e, i );
        }, tempM );

        innerUpdateN += y( e, j ) * tempM;
      }, tempN );

      // Add the team's total exactly once per team to the outer reduction.
      Kokkos::single( Kokkos::PerTeam( teamMember ), [&] () {
        update += tempN;
      });
    }, result );

    // Output result.
    // Only the final repetition prints the computed value.
    if ( repeat == ( nrepeat - 1 ) ) {
      printf( "  Computed result for %d x %d x %d is %lf\n", N, M, E, result );
    }

    // Every entry of y, x and A is 1, so the expected value is N*M*E.
    const double solution = (double) N *(double) M *(double) E;

    // The check runs on every repetition and compares the doubles exactly.
    if ( result != solution ) {
      printf( "  Error: result( %lf ) != solution( %lf )\n", result, solution );
    }
  }
// Driver for the <y,Ax> = y^T*A*x exercise whose task is to convert the flat
// range_policy reduction into a hierarchical (team) one. It parses the sizes,
// fills y, x and A with ones on host mirrors, deep-copies them to the device
// views and repeats the reduction `nrepeat` times.
//
// NOTE(review): this definition is cut off in this file. The repeat loop and
// the function body are still open when the following text begins.
int main( int argc, char* argv[] )
{
  // -1 marks "not supplied on the command line".
  int N = -1;         // number of rows 2^12
  int M = -1;         // number of columns 2^10
  int S = -1;         // total size 2^22
  int nrepeat = 100;  // number of repeats of the test

  // Read command line arguments.
  // Size options carry an exponent; the stored value is 2^exponent.
  // NOTE(review): the scan includes argv[0], and argv[ ++i ] is read without
  // checking that a value follows the flag. -N uses atoi, -M/-S use atof,
  // and the double from pow() is narrowed to int.
  for ( int i = 0; i < argc; i++ ) {
    if ( ( strcmp( argv[ i ], "-N" ) == 0 ) || ( strcmp( argv[ i ], "-Rows" ) == 0 ) ) {
      N = pow( 2, atoi( argv[ ++i ] ) );
      printf( "  User N is %d\n", N );
    }
    else if ( ( strcmp( argv[ i ], "-M" ) == 0 ) || ( strcmp( argv[ i ], "-Columns" ) == 0 ) ) {
      M = pow( 2, atof( argv[ ++i ] ) );
      printf( "  User M is %d\n", M );
    }
    else if ( ( strcmp( argv[ i ], "-S" ) == 0 ) || ( strcmp( argv[ i ], "-Size" ) == 0 ) ) {
      S = pow( 2, atof( argv[ ++i ] ) );
      printf( "  User S is %d\n", S );
    }
    else if ( strcmp( argv[ i ], "-nrepeat" ) == 0 ) {
      // The repeat count is taken literally (no 2^x).
      nrepeat = atoi( argv[ ++i ] );
    }
    else if ( ( strcmp( argv[ i ], "-h" ) == 0 ) || ( strcmp( argv[ i ], "-help" ) == 0 ) ) {
      printf( "  y^T*A*x Options:\n" );
      printf( "  -Rows (-N) <int>:      exponent num, determines number of rows 2^num (default: 2^12 = 4096)\n" );
      printf( "  -Columns (-M) <int>:   exponent num, determines number of columns 2^num (default: 2^10 = 1024)\n" );
      printf( "  -Size (-S) <int>:      exponent num, determines total matrix size 2^num (default: 2^22 = 4096*1024 )\n" );
      printf( "  -nrepeat <int>:        number of repetitions (default: 100)\n" );
      printf( "  -help (-h):            print this message\n\n" );
      // NOTE(review): asking for help terminates with a non-zero exit status.
      exit( 1 );
    }
  }

  // Check sizes.
  // checkSizes is defined outside this view; presumably it fills in the
  // values still at -1 and validates them -- TODO confirm.
  checkSizes( N, M, S, nrepeat );

  Kokkos::initialize( argc, argv );

  // Use whatever layout the default execution space prefers; the two
  // commented alternatives force a specific layout instead.
  typedef Kokkos::DefaultExecutionSpace::array_layout  Layout;
  // typedef Kokkos::LayoutLeft   Layout;
  // typedef Kokkos::LayoutRight  Layout;

  typedef Kokkos::RangePolicy<> range_policy;

  // Allocate y, x vectors and Matrix A on device.
  // y has N entries, x has M entries and A is N x M.
  typedef Kokkos::View<double*, Layout>   ViewVectorType;
  typedef Kokkos::View<double**, Layout>  ViewMatrixType;
  ViewVectorType y( "y", N );
  ViewVectorType x( "x", M );
  ViewMatrixType A( "A", N, M );

  // Create host mirrors of device views.
  ViewVectorType::HostMirror h_y = Kokkos::create_mirror_view( y );
  ViewVectorType::HostMirror h_x = Kokkos::create_mirror_view( x );
  ViewMatrixType::HostMirror h_A = Kokkos::create_mirror_view( A );

  // Initialize y vector on host.
  for ( int i = 0; i < N; ++i ) {
    h_y( i ) = 1;
  }

  // Initialize x vector on host.
  for ( int i = 0; i < M; ++i ) {
    h_x( i ) = 1;
  }

  // Initialize A matrix on host.
  for ( int j = 0; j < N; ++j ) {
    for ( int i = 0; i < M; ++i ) {
      h_A( j, i ) = 1;
    }
  }

  // Deep copy host views to device views.
  Kokkos::deep_copy( y, h_y );
  Kokkos::deep_copy( x, h_x );
  Kokkos::deep_copy( A, h_A );

  // EXERCISE: Use hierarchical parallel execution policy for calculation.
  // EXERCISE hints:
  // typedef Kokkos::TeamPolicy<>               team_policy;
  // typedef Kokkos::TeamPolicy<>::member_type  member_type;

  // Timer products.
  // `begin` is sampled here; `end` is not used in the visible part.
  struct timeval begin, end;

  gettimeofday( &begin, NULL );

  for ( int repeat = 0; repeat < nrepeat; repeat++ ) {
    // Application: <y,Ax> = y^T*A*x
    double result = 0;

    // EXERCISE: Convert from range_policy to team_policy.
    // Starting point: one work item per row j over the range [0, N).
    Kokkos::parallel_reduce( range_policy( 0, N ), KOKKOS_LAMBDA ( int j, double &update ) {
      // EXERCISE: Convert to nested Kokkos::parallel_reduce.
      // EXERCISE hint: Kokkos::TeamThreadRange( ??? ) and [&].
      double temp2 = 0;

      // Serial dot product of row j of A with x.
      for ( int i = 0; i < M; ++i ) {
        temp2 += A( j, i ) * x( i );
      }

      // EXERCISE: Only one team member update the result.
      update += y( j ) * temp2;
    }, result );
Example no. 5
0
int test_crs_matrix_test_singlevec(int numRows, int numCols, int nnz, int test, const char* filename, const bool binaryfile) {
        typedef Kokkos::CrsMatrix<Scalar,int,execution_space,void,int> matrix_type ;
        typedef typename Kokkos::View<Scalar*,Kokkos::LayoutLeft,execution_space> mv_type;
        typedef typename Kokkos::View<Scalar*,Kokkos::LayoutLeft,execution_space,Kokkos::MemoryRandomAccess > mv_random_read_type;
        typedef typename mv_type::HostMirror h_mv_type;

        Scalar* val = NULL;
        int* row = NULL;
        int* col = NULL;

        srand(17312837);
        if(filename==NULL)
          nnz = SparseMatrix_generate<Scalar,int>(numRows,numCols,nnz,nnz/numRows*0.2,numRows*0.01,val,row,col);
        else
          if(!binaryfile)
            nnz = SparseMatrix_MatrixMarket_read<Scalar,int>(filename,numRows,numCols,nnz,val,row,col);
          else
            nnz = SparseMatrix_ReadBinaryGraph<Scalar,int>(filename,numRows,numCols,nnz,val,row,col);

        matrix_type A("CRS::A",numRows,numCols,nnz,val,row,col,false);

        mv_type x("X",numCols);
        mv_random_read_type t_x(x);
        mv_type y("Y",numRows);
        h_mv_type h_x = Kokkos::create_mirror_view(x);
        h_mv_type h_y = Kokkos::create_mirror_view(y);
        h_mv_type h_y_compare = Kokkos::create_mirror(y);
    typename matrix_type::StaticCrsGraphType::HostMirror h_graph = Kokkos::create_mirror(A.graph);
    typename matrix_type::values_type::HostMirror h_values = Kokkos::create_mirror_view(A.values);

    //Kokkos::deep_copy(h_graph.row_map,A.graph.row_map);
          //h_a(k) = (Scalar) (1.0*(rand()%40)-20.);
          for(int i=0; i<numCols;i++) {
                  h_x(i) = (Scalar) (1.0*(rand()%40)-20.);
                  h_y(i) = (Scalar) (1.0*(rand()%40)-20.);
          }
        for(int i=0;i<numRows;i++) {
                int start = h_graph.row_map(i);
                int end = h_graph.row_map(i+1);
                for(int j=start;j<end;j++) {
                   h_values(j) = h_graph.entries(j) + i;
                }
            h_y_compare(i) = 0;
                for(int j=start;j<end;j++) {
                   Scalar val = h_graph.entries(j) + i;
                   int idx = h_graph.entries(j);
                     h_y_compare(i)+=val*h_x(idx);
                }
        }

        Kokkos::deep_copy(x,h_x);
        Kokkos::deep_copy(y,h_y);
        Kokkos::deep_copy(A.graph.entries,h_graph.entries);
        Kokkos::deep_copy(A.values,h_values);
        /*for(int i=0;i<numRows;i++)
                for(int k = 0; k<numVecs; k++) {
          //error[k]+=(h_y_compare(i,k)-h_y(i,k))*(h_y_compare(i,k)-h_y(i,k));
          printf("%i %i %lf %lf %lf\n",i,k,h_y_compare(i,k),h_y(i,k),h_x(i,k));
                }*/
    typename Kokkos::CrsMatrix<Scalar,int,execution_space,void,int>::values_type x1("X1",numCols);
    typename Kokkos::CrsMatrix<Scalar,int,execution_space,void,int>::values_type y1("Y1",numRows);
#ifdef NEWKERNEL
          KokkosSparse::spmv("N",1.0,A,x1,0.0,y1);
#else
          Kokkos::MV_Multiply(y1,A,x1);
#endif

#ifdef NEWKERNEL
          KokkosSparse::spmv("N",1.0,A,x,0.0,y);
#else
          Kokkos::MV_Multiply(y,A,x);
#endif
        execution_space::fence();
        Kokkos::deep_copy(h_y,y);
        Scalar error = 0;
        Scalar sum = 0;
        for(int i=0;i<numRows;i++) {
          error+=(h_y_compare(i)-h_y(i))*(h_y_compare(i)-h_y(i));
          sum += h_y_compare(i)*h_y_compare(i);
         // printf("%i %i %lf %lf %lf\n",i,k,h_y_compare(i,k),h_y(i,k),h_x(i,k));
                }

        //for(int i=0;i<A.nnz;i++) printf("%i %lf\n",h_graph.entries(i),h_values(i));
    int num_errors = 0;
    double total_error = 0;
    double total_sum = 0;
                num_errors += (error/(sum==0?1:sum))>1e-5?1:0;
                total_error += error;
                total_sum += sum;

    int loop = 100;
    Kokkos::Impl::Timer timer;

        for(int i=0;i<loop;i++)
#ifdef NEWKERNEL
          KokkosSparse::spmv("N",1.0,A,x,0.0,y);
#else
        Kokkos::MV_Multiply(y,A,x);
#endif
        execution_space::fence();
        double time = timer.seconds();
        double matrix_size = 1.0*((nnz*(sizeof(Scalar)+sizeof(int)) + numRows*sizeof(int)))/1024/1024;
        double vector_size = 2.0*numRows*sizeof(Scalar)/1024/1024;
        double vector_readwrite = (nnz+numCols)*sizeof(Scalar)/1024/1024;

        double problem_size = matrix_size+vector_size;
    printf("%i %i %i %i %6.2lf MB %6.2lf GB/s %6.2lf GFlop/s %6.3lf ms %i\n",nnz, numRows,numCols,1,problem_size,(matrix_size+vector_readwrite)/time*loop/1024, 2.0*nnz*loop/time/1e9, time/loop*1000, num_errors);
        return (int)total_error;
}
Example no. 6
0
int test_crs_matrix_test_singlevec(int numRows, int numCols, int nnz, int test, const char* filename, const bool binaryfile) {
	typedef KokkosArray::CrsMatrix<Scalar,int,device_type> matrix_type ;
	typedef typename KokkosArray::View<Scalar*,KokkosArray::LayoutLeft,device_type> mv_type;
	typedef typename KokkosArray::View<Scalar*,KokkosArray::LayoutLeft,device_type,KokkosArray::MemoryRandomRead> mv_random_read_type;
	typedef typename mv_type::HostMirror h_mv_type;

	Scalar* val = NULL;
	int* row = NULL;
	int* col = NULL;

	srand(17312837);
	if(filename==NULL)
	  nnz = SparseMatrix_generate<Scalar,int>(numRows,numCols,nnz,nnz/numRows*0.2,numRows*0.01,val,row,col);
	else
	  if(!binaryfile)
	    nnz = SparseMatrix_MatrixMarket_read<Scalar,int>(filename,numRows,numCols,nnz,val,row,col);
	  else
	    nnz = SparseMatrix_ReadBinaryGraph<Scalar,int>(filename,numRows,numCols,nnz,val,row,col);

	matrix_type A("CRS::A",numRows,numCols,nnz,val,row,col,false);

	mv_type x("X",numCols);
	mv_random_read_type t_x(x);
	mv_type y("Y",numRows);
	h_mv_type h_x = KokkosArray::create_mirror_view(x);
	h_mv_type h_y = KokkosArray::create_mirror_view(y);
	h_mv_type h_y_compare = KokkosArray::create_mirror(y);
    typename matrix_type::CrsArrayType::HostMirror h_graph = KokkosArray::create_mirror(A.graph);
    typename matrix_type::values_type::HostMirror h_values = KokkosArray::create_mirror_view(A.values);

    //KokkosArray::deep_copy(h_graph.row_map,A.graph.row_map);
	  //h_a(k) = (Scalar) (1.0*(rand()%40)-20.);
	  for(int i=0; i<numCols;i++) {
		  h_x(i) = (Scalar) (1.0*(rand()%40)-20.);
		  h_y(i) = (Scalar) (1.0*(rand()%40)-20.);
	  }
	for(int i=0;i<numRows;i++) {
		int start = h_graph.row_map(i);
		int end = h_graph.row_map(i+1);
		for(int j=start;j<end;j++) {
		   h_values(j) = h_graph.entries(j) + i;
		}
  	    h_y_compare(i) = 0;
		for(int j=start;j<end;j++) {
		   Scalar val = h_graph.entries(j) + i;
		   int idx = h_graph.entries(j);
  		     h_y_compare(i)+=val*h_x(idx);
		}
	}

	KokkosArray::deep_copy(x,h_x);
	KokkosArray::deep_copy(y,h_y);
	KokkosArray::deep_copy(A.graph.entries,h_graph.entries);
	KokkosArray::deep_copy(A.values,h_values);
	/*for(int i=0;i<numRows;i++)
		for(int k = 0; k<numVecs; k++) {
          //error[k]+=(h_y_compare(i,k)-h_y(i,k))*(h_y_compare(i,k)-h_y(i,k));
          printf("%i %i %lf %lf %lf\n",i,k,h_y_compare(i,k),h_y(i,k),h_x(i,k));
		}*/
    typename KokkosArray::CrsMatrix<Scalar,int,device_type>::values_type x1("X1",numCols);
    typename KokkosArray::CrsMatrix<Scalar,int,device_type>::values_type y1("Y1",numRows);
    KokkosArray::MV_Multiply(0.0,y1,1.0,A,x1);

	KokkosArray::MV_Multiply(0.0,y,1.0,A,x);
	device_type::fence();
	KokkosArray::deep_copy(h_y,y);
	Scalar error = 0;
	Scalar sum = 0;
	for(int i=0;i<numRows;i++) {
          error+=(h_y_compare(i)-h_y(i))*(h_y_compare(i)-h_y(i));
          sum += h_y_compare(i)*h_y_compare(i);
         // printf("%i %i %lf %lf %lf\n",i,k,h_y_compare(i,k),h_y(i,k),h_x(i,k));
		}

	//for(int i=0;i<A.nnz;i++) printf("%i %lf\n",h_graph.entries(i),h_values(i));
    int num_errors = 0;
    double total_error = 0;
    double total_sum = 0;
		num_errors += (error/(sum==0?1:sum))>1e-5?1:0;
		total_error += error;
		total_sum += sum;

    int loop = 10;
	timespec starttime,endtime;
    clock_gettime(CLOCK_REALTIME,&starttime);

	for(int i=0;i<loop;i++)
		KokkosArray::MV_Multiply(0.0,y,1.0,A,t_x);
	device_type::fence();
	clock_gettime(CLOCK_REALTIME,&endtime);
	double time = endtime.tv_sec - starttime.tv_sec + 1.0 * (endtime.tv_nsec - starttime.tv_nsec) / 1000000000;
	double matrix_size = 1.0*((nnz*(sizeof(Scalar)+sizeof(int)) + numRows*sizeof(int)))/1024/1024;
	double vector_size = 2.0*numRows*sizeof(Scalar)/1024/1024;
	double vector_readwrite = 2.0*nnz*sizeof(Scalar)/1024/1024;

	double problem_size = matrix_size+vector_size;
    printf("%i %i %i %i %6.2lf MB %6.2lf GB/s %6.2lf ms %i\n",nnz, numRows,numCols,1,problem_size,(matrix_size+vector_readwrite)/time*loop/1024, time/loop*1000, num_errors);
	return (int)total_error;
}
int main(int argc, char* argv[])
{

  int N = -1 ;       // number of rows 2^12
  int M = -1 ;       // number of columns 2^10
  int S = -1 ;      // total size 2^22
  int nrepeat = 100 ;    // number of repeats of the test

  // Read command line arguments
  for(int i=0; i<argc; i++) {
    if( (strcmp(argv[i], "-N") == 0) || (strcmp(argv[i], "-Rows") == 0) ) {
      N = pow( 2, atoi(argv[++i]) );
      printf("  User N is %d\n",N);
    } else if( (strcmp(argv[i], "-M") == 0) || (strcmp(argv[i], "-Columns") == 0)) {
      M = pow( 2, atof(argv[++i]) );
      printf("  User M is %d\n",M);
    } else if( (strcmp(argv[i], "-S") == 0) || (strcmp(argv[i], "-Size") == 0)) {
      S = pow( 2, atof(argv[++i]) );
      printf("  User S is %d\n",S);
    } else if( strcmp(argv[i], "-nrepeat") == 0) {
      nrepeat = atoi(argv[++i]);
    } else if( (strcmp(argv[i], "-h") == 0) || (strcmp(argv[i], "-help") == 0) ) {

      printf("  y^T*A*x Options:\n");
      printf("  -Rows (-N) <int>:      exponent num, determines number of rows 2^num (default: 2^12 = 4096)\n");
      printf("  -Columns (-M) <int>:   exponent num, determines number of columns 2^num (default: 2^10 = 1024)\n");
      printf("  -Size (-S) <int>:      exponent num, determines total matrix size 2^num (default: 2^22 = 4096*1024 )\n");
      printf("  -nrepeat <int>:        number of repetitions (default: 100)\n");
      printf("  -help (-h):            print this message\n\n");
      exit(1); }
  }

  //Check Sizes
  checkSizes( N, M, S, nrepeat );

  Kokkos::initialize(argc,argv);

  // typedef Kokkos::DefaultExecutionSpace::array_layout Layout;
  // typedef Kokkos::LayoutLeft   Layout ;
  typedef Kokkos::LayoutRight  Layout ;

  // Allocate y, x vectors and Matrix A:
  // Device
  typedef Kokkos::View<double*, Layout>   ViewVectorType;
  typedef Kokkos::View<double**, Layout>   ViewMatrixType;
  ViewVectorType y("y", N);
  ViewVectorType x("x", M);
  ViewMatrixType A("A", N, M);

  //Host mirror
  ViewVectorType::HostMirror h_y =  Kokkos::create_mirror_view(y);
  ViewVectorType::HostMirror h_x =  Kokkos::create_mirror_view(x);
  ViewMatrixType::HostMirror h_A =  Kokkos::create_mirror_view(A);

  // Initialize y vector on host
  for (int i = 0; i < N; ++i) {
    h_y( i ) = 1;
  }

  // Initialize x vector on host
  for (int i = 0; i < M; ++i) {
    h_x( i ) = 1;
  }

  // Initialize A matrix, note 2D indexing computation on host
  for (int j = 0; j < N; ++j) {
    for ( int i = 0 ; i < M ; ++i ) {
      h_A( j , i ) = 1;
    }
  }

  //Deep copy host view to device views
  Kokkos::deep_copy(y, h_y);
  Kokkos::deep_copy(x, h_x);
  Kokkos::deep_copy(A, h_A);

  typedef Kokkos::TeamPolicy<>               team_policy ;
  typedef Kokkos::TeamPolicy<>::member_type  member_type ;

  // Timer products
  struct timeval begin,end;

  gettimeofday(&begin,NULL);

  for ( int repeat = 0; repeat < nrepeat; repeat++) {

    //Application: <y,Ax> = y^T*A*x
    double result = 0;
    Kokkos::parallel_reduce( team_policy( N , Kokkos::AUTO ), KOKKOS_LAMBDA ( const member_type& teamMember, double &update ) {
      const int j = teamMember.league_rank();
      double temp2 = 0;
      Kokkos::parallel_reduce( Kokkos::TeamThreadRange( teamMember, M ), [&] (const int i, double &innerUpdate ) {
        innerUpdate += A( j , i ) * x( i );
      }, temp2);
      if ( teamMember.team_rank() == 0 )
        update += y( j ) * temp2;
    }, result );

    //Output result
    if ( repeat == (nrepeat - 1) )
      printf("  Computed result for %d x %d is %lf\n", N, M, result);
    const double solution = (double)N *(double)M;

    if ( result != solution ) {
      printf("  Error: result( %lf ) != solution( %lf )\n",result,solution);
    }
  }