// Set-up portion of a batched GEMM benchmark over BlkSize x BlkSize blocks.
// NN is the requested number of blocks; it is rounded down to a multiple of
// VectorLength (see N below).
// NOTE(review): this excerpt ends before the function body is closed, so the
// timed kernels are not visible here. value_type, HostSpaceType, BlkSize,
// FlopCount, DefaultVectorLength, Vector and SIMD are declared elsewhere.
void Gemm(const int NN) {
        typedef Kokkos::Schedule<Kokkos::Static> ScheduleType;

        // Number of lanes per packed vector, and the number of packed groups.
        // Integer division: any remainder NN % VectorLength is dropped.
        constexpr int VectorLength = DefaultVectorLength<value_type,typename HostSpaceType::memory_space>::value;
        const int N = NN/VectorLength;

        // Report which instruction-set macros were defined at compile time,
        // together with the value type and the vector length in use.
        {
          // Stays empty when value_type is neither of the two types tested below.
          std::string value_type_name;
          if (std::is_same<value_type,double>::value)                   value_type_name = "double";
          if (std::is_same<value_type,Kokkos::complex<double> >::value) value_type_name = "Kokkos::complex<double>";
#if   defined(__AVX512F__)
          std::cout << "AVX512 is defined: datatype " << value_type_name <<  " a vector length " << VectorLength << "\n";
#elif defined(__AVX__) || defined(__AVX2__)
          std::cout << "AVX or AVX2 is defined: datatype " << value_type_name <<  " a vector length " << VectorLength << "\n";
#else
          std::cout << "SIMD (compiler vectorization) is defined: datatype " << value_type_name <<  " a vector length " << VectorLength << "\n";
#endif
        }

        // FlopCount's result for one BlkSize^3 product, scaled by the number of
        // blocks actually processed (N*VectorLength, which may be less than NN).
        const double flop = (N*VectorLength)*FlopCount(BlkSize,BlkSize,BlkSize);
        // tmax, the iteration bounds and the timer are not used in the visible part.
        // NOTE(review): presumably the negative iterations are warm-up runs -- confirm
        // against the rest of the function.
        const double tmax = 1.0e15;

        const int iter_begin = -10, iter_end = 100;
        Kokkos::Impl::Timer timer;

        // cref is only declared here (no label or extents given); amat and bmat each
        // hold N*VectorLength blocks of BlkSize x BlkSize values.
        Kokkos::View<value_type***,Kokkos::LayoutRight,HostSpaceType> cref;
        Kokkos::View<value_type***,Kokkos::LayoutRight,HostSpaceType> 
          amat("amat", N*VectorLength, BlkSize, BlkSize),
          bmat("bmat", N*VectorLength, BlkSize, BlkSize);

        // Fill both inputs from a generator pool constructed with the fixed seed 13718;
        // value_type(1.0) is passed as the range argument of fill_random.
        Kokkos::Random_XorShift64_Pool<HostSpaceType> random(13718);
        Kokkos::fill_random(amat, random, value_type(1.0));
        Kokkos::fill_random(bmat, random, value_type(1.0));

        // Packed copies of the inputs: N groups, each entry a VectorType that is
        // indexed by lane in the loop below.
        typedef Vector<SIMD<value_type>,VectorLength> VectorType;
        Kokkos::View<VectorType***,Kokkos::LayoutRight,HostSpaceType> 
          amat_simd("amat_simd", N, BlkSize, BlkSize),
          bmat_simd("bmat_simd", N, BlkSize, BlkSize);

        // Pack: block k goes to group k/VectorLength, lane k%VectorLength, so each
        // group of VectorLength consecutive blocks shares one packed entry per (i, j).
        Kokkos::parallel_for
          (Kokkos::RangePolicy<HostSpaceType>(0, N*VectorLength),
           KOKKOS_LAMBDA(const int k) {
            const int k0 = k/VectorLength, k1 = k%VectorLength;
            for (int i=0;i<BlkSize;++i)
              for (int j=0;j<BlkSize;++j) {
                amat_simd(k0, i, j)[k1] = amat(k, i, j);
                bmat_simd(k0, i, j)[k1] = bmat(k, i, j);                  
              }
          });
Ejemplo n.º 2
0
/*
 * Benchmark driver: times a naive n x n (n = 1000) dense matrix product
 * c = c + a * b whose outermost loop is divided among OpenMP threads.
 *
 * Usage: <program> <nthreads>
 *   argv[1]  number of OpenMP threads to request; must parse to an integer >= 1.
 *
 * Prints "nthreads, time: <nthreads> <seconds>" and returns 0 on success.
 * Returns 1, with a message on stderr, when the thread count is missing or
 * not positive, or when a matrix cannot be allocated.
 */
int main (int argc, char *argv[])
{
/* Row-major element accessors for the n x n matrices.  The arguments are
   parenthesised so that an expression argument such as bmat(z+off,y)
   multiplies the whole row index by n.  */
#define amat(I,J) a[(I)*n + (J)]
#define bmat(I,J) b[(I)*n + (J)]
#define cmat(I,J) c[(I)*n + (J)]

  int n, nthreads, i, j, k;
  double *a, *b, *c;
  double t0, t1;
  size_t count, idx;

  if (argc < 2)
    {
      fprintf (stderr, "usage: %s <nthreads>\n", argc > 0 ? argv[0] : "matmul");
      return 1;
    }

  n = 1000;
  nthreads = atoi(argv[1]);
  /* atoi yields 0 for non-numeric text, and omp_set_num_threads requires a
     positive value, so reject anything below 1 up front.  */
  if (nthreads < 1)
    {
      fprintf (stderr, "nthreads must be a positive integer, got '%s'\n", argv[1]);
      return 1;
    }
  omp_set_num_threads (nthreads);

  /* Compute the element count in size_t so the product cannot overflow int.  */
  count = (size_t) n * (size_t) n;
  a = (double *) malloc (count * sizeof (double));
  b = (double *) malloc (count * sizeof (double));
  c = (double *) malloc (count * sizeof (double));
  if (a == NULL || b == NULL || c == NULL)
    {
      fprintf (stderr, "out of memory allocating three %d x %d matrices\n", n, n);
      free (a);   /* free(NULL) is a no-op */
      free (b);
      free (c);
      return 1;
    }

  /* Give every element a defined value before timing: c is read as an
     accumulator, and indeterminate doubles (possibly NaN or denormal) would
     make both the result and the measured time meaningless.  */
  for (idx = 0; idx < count; idx++)
    {
      a[idx] = 1.0;
      b[idx] = 1.0;
      c[idx] = 0.0;
    }

  t0 = omp_get_wtime();

  /* Each thread owns whole columns j of c, so no two threads write the
     same element.  */
#pragma omp parallel for private (i, j, k) 
  for (j=0; j<n; j++) 
    for (i=0; i<n; i++) 
      for (k=0; k<n; k++) 
         cmat(i,j) = cmat(i,j) + amat(i,k) * bmat(k,j);

  t1 = omp_get_wtime();
  printf("nthreads, time: %d %6.2f\n", nthreads, t1-t0);

  free (a);
  free (b);
  free (c);
  return 0;
}
/* Used to avoid having really messy code.
 *
 * Accumulates one block product into C.  For every x in [0, n) and
 * y in [0, nc):
 *     cmat(x,y) += sum over z in [0, nc) of atempmat(x,z) * bmat(z + j2*nc, y)
 *
 * j2  selects which group of nc consecutive bmat rows is used
 *     (rows j2*nc .. j2*nc + nc - 1).
 * k2  not read by this routine; kept so existing callers still compile.
 *
 * n, nc and the cmat / atempmat / bmat accessors are defined outside this
 * function.
 */
void calc_c(int j2, int k2)
{
   //Ck = Ck + Ak*Bjk
   int x,y,z;
   /* First bmat row of the selected group; invariant across all loops.  */
   const int row0 = j2*nc;

   (void) k2;
   for (x=0; x<n; x++)
   {
       for (y=0; y<nc; y++)
       {
           /* Accumulate in a local so cmat(x,y) is read and written once.  */
           double sum = cmat(x,y);
           for (z=0; z<nc; z++)
           {
               /* Put the row index in a variable before calling the accessor:
                  if bmat is a macro that does not parenthesise its arguments,
                  passing z+j2*nc directly would scale only part of the index
                  by the row length.  */
               const int brow = row0 + z;
               sum = sum + atempmat(x,z) * bmat(brow,y);
           }
           cmat(x,y)=sum;
       }
   }
}