void Gemm(const int NN) { typedef Kokkos::Schedule<Kokkos::Static> ScheduleType; constexpr int VectorLength = DefaultVectorLength<value_type,typename HostSpaceType::memory_space>::value; const int N = NN/VectorLength; { std::string value_type_name; if (std::is_same<value_type,double>::value) value_type_name = "double"; if (std::is_same<value_type,Kokkos::complex<double> >::value) value_type_name = "Kokkos::complex<double>"; #if defined(__AVX512F__) std::cout << "AVX512 is defined: datatype " << value_type_name << " a vector length " << VectorLength << "\n"; #elif defined(__AVX__) || defined(__AVX2__) std::cout << "AVX or AVX2 is defined: datatype " << value_type_name << " a vector length " << VectorLength << "\n"; #else std::cout << "SIMD (compiler vectorization) is defined: datatype " << value_type_name << " a vector length " << VectorLength << "\n"; #endif } const double flop = (N*VectorLength)*FlopCount(BlkSize,BlkSize,BlkSize); const double tmax = 1.0e15; const int iter_begin = -10, iter_end = 100; Kokkos::Impl::Timer timer; Kokkos::View<value_type***,Kokkos::LayoutRight,HostSpaceType> cref; Kokkos::View<value_type***,Kokkos::LayoutRight,HostSpaceType> amat("amat", N*VectorLength, BlkSize, BlkSize), bmat("bmat", N*VectorLength, BlkSize, BlkSize); Kokkos::Random_XorShift64_Pool<HostSpaceType> random(13718); Kokkos::fill_random(amat, random, value_type(1.0)); Kokkos::fill_random(bmat, random, value_type(1.0)); typedef Vector<SIMD<value_type>,VectorLength> VectorType; Kokkos::View<VectorType***,Kokkos::LayoutRight,HostSpaceType> amat_simd("amat_simd", N, BlkSize, BlkSize), bmat_simd("bmat_simd", N, BlkSize, BlkSize); Kokkos::parallel_for (Kokkos::RangePolicy<HostSpaceType>(0, N*VectorLength), KOKKOS_LAMBDA(const int k) { const int k0 = k/VectorLength, k1 = k%VectorLength; for (int i=0;i<BlkSize;++i) for (int j=0;j<BlkSize;++j) { amat_simd(k0, i, j)[k1] = amat(k, i, j); bmat_simd(k0, i, j)[k1] = bmat(k, i, j); } });
/*
 * Times a naive n x n (n = 1000) double-precision matrix update
 *     C = C + A * B
 * parallelised with OpenMP over the columns of C.
 *
 * Usage: <program> <nthreads>
 *
 * On success prints "nthreads, time: <nthreads> <seconds>" and returns 0.
 * Returns EXIT_FAILURE, with a message on stderr, when the thread count is
 * missing or is not a positive integer, or when an allocation fails.
 */
int main (int argc, char *argv[])
{
/* Row-major element accessors for the n x n matrices a, b and c.
   The arguments are parenthesised so that expression indices such as
   amat(i + 1, j) expand with the intended precedence. */
#define amat(I,J) a[(I)*n + (J)]
#define bmat(I,J) b[(I)*n + (J)]
#define cmat(I,J) c[(I)*n + (J)]

    int n, nthreads, i, j, k;
    double *a, *b, *c;
    double t0, t1;
    size_t count, idx;
    long requested;
    char *end;

    n = 1000;

    /* argv[1] is only valid when an argument was actually supplied. */
    if (argc < 2) {
        fprintf (stderr, "usage: %s <nthreads>\n",
                 (argc > 0 && argv[0] != NULL) ? argv[0] : "program");
        return EXIT_FAILURE;
    }

    /* omp_set_num_threads() requires a positive value; reject anything that
       has no leading digits, is < 1, or does not fit in an int. */
    end = NULL;
    requested = strtol (argv[1], &end, 10);
    if (end == argv[1] || requested < 1 || requested != (long) (int) requested) {
        fprintf (stderr, "invalid thread count: %s\n", argv[1]);
        return EXIT_FAILURE;
    }
    nthreads = (int) requested;
    omp_set_num_threads (nthreads);

    /* Do the size arithmetic in size_t so n * n cannot overflow an int. */
    count = (size_t) n * (size_t) n;
    a = (double *) malloc (count * sizeof (double));
    b = (double *) malloc (count * sizeof (double));
    c = (double *) malloc (count * sizeof (double));
    if (a == NULL || b == NULL || c == NULL) {
        fprintf (stderr, "out of memory allocating three %d x %d matrices\n", n, n);
        free (a);
        free (b);
        free (c);
        return EXIT_FAILURE;
    }

    /* Give every element a defined value before the timed region: the
       kernel reads a and b and accumulates into c, so none of them may be
       left uninitialised. */
    for (idx = 0; idx < count; idx++) {
        a[idx] = 1.0;
        b[idx] = 1.0;
        c[idx] = 0.0;
    }

    t0 = omp_get_wtime();

    /* Each iteration of the outer (j) loop owns one column of c, so the
       threads never write to the same element. */
#pragma omp parallel for private (i, j, k)
    for (j=0; j<n; j++)
        for (i=0; i<n; i++)
            for (k=0; k<n; k++)
                cmat(i,j) = cmat(i,j) + amat(i,k) * bmat(k,j);

    t1 = omp_get_wtime();

    printf("nthreads, time: %d %6.2f\n", nthreads, t1-t0);

    free (a);
    free (b);
    free (c);
    return 0;
}
/* Used to avoid having really messy code. */ void calc_c(int j2, int k2) { //Ck = Ck + Ak*Bjk int sum; int x,y,z; for (x=0; x<n; x++) { for (y=0; y<nc; y++) { double sum = cmat(x,y); for (z=0; z<nc; z++) { sum = sum + atempmat(x,z) * bmat(z+j2*nc,y); } cmat(x,y)=sum; } } }