void Gemm(const int NN) { typedef Kokkos::Schedule<Kokkos::Static> ScheduleType; constexpr int VectorLength = DefaultVectorLength<value_type,typename HostSpaceType::memory_space>::value; const int N = NN/VectorLength; { std::string value_type_name; if (std::is_same<value_type,double>::value) value_type_name = "double"; if (std::is_same<value_type,Kokkos::complex<double> >::value) value_type_name = "Kokkos::complex<double>"; #if defined(__AVX512F__) std::cout << "AVX512 is defined: datatype " << value_type_name << " a vector length " << VectorLength << "\n"; #elif defined(__AVX__) || defined(__AVX2__) std::cout << "AVX or AVX2 is defined: datatype " << value_type_name << " a vector length " << VectorLength << "\n"; #else std::cout << "SIMD (compiler vectorization) is defined: datatype " << value_type_name << " a vector length " << VectorLength << "\n"; #endif } const double flop = (N*VectorLength)*FlopCount(BlkSize,BlkSize,BlkSize); const double tmax = 1.0e15; const int iter_begin = -10, iter_end = 100; Kokkos::Impl::Timer timer; Kokkos::View<value_type***,Kokkos::LayoutRight,HostSpaceType> cref; Kokkos::View<value_type***,Kokkos::LayoutRight,HostSpaceType> amat("amat", N*VectorLength, BlkSize, BlkSize), bmat("bmat", N*VectorLength, BlkSize, BlkSize); Kokkos::Random_XorShift64_Pool<HostSpaceType> random(13718); Kokkos::fill_random(amat, random, value_type(1.0)); Kokkos::fill_random(bmat, random, value_type(1.0)); typedef Vector<SIMD<value_type>,VectorLength> VectorType; Kokkos::View<VectorType***,Kokkos::LayoutRight,HostSpaceType> amat_simd("amat_simd", N, BlkSize, BlkSize), bmat_simd("bmat_simd", N, BlkSize, BlkSize); Kokkos::parallel_for (Kokkos::RangePolicy<HostSpaceType>(0, N*VectorLength), KOKKOS_LAMBDA(const int k) { const int k0 = k/VectorLength, k1 = k%VectorLength; for (int i=0;i<BlkSize;++i) for (int j=0;j<BlkSize;++j) { amat_simd(k0, i, j)[k1] = amat(k, i, j); bmat_simd(k0, i, j)[k1] = bmat(k, i, j); } });
void LU(const int NN) { typedef Kokkos::Schedule<Kokkos::Static> ScheduleType; //typedef Kokkos::Schedule<Kokkos::Dynamic> ScheduleType; constexpr int VectorLength = DefaultVectorLength<value_type,typename HostSpaceType::memory_space>::value; const int N = NN/VectorLength; { std::string value_type_name; if (std::is_same<value_type,double>::value) value_type_name = "double"; if (std::is_same<value_type,Kokkos::complex<double> >::value) value_type_name = "Kokkos::complex<double>"; #if defined(__AVX512F__) std::cout << "AVX512 is defined: datatype " << value_type_name << " a vector length " << VectorLength << "\n"; #elif defined(__AVX__) || defined(__AVX2__) std::cout << "AVX or AVX2 is defined: datatype " << value_type_name << " a vector length " << VectorLength << "\n"; #else std::cout << "SIMD (compiler vectorization) is defined: datatype " << value_type_name << " a vector length " << VectorLength << "\n"; #endif } const double flop = (N*VectorLength)*FlopCount(BlkSize,BlkSize); const double tmax = 1.0e15; const int iter_begin = -10, iter_end = 100; Kokkos::Impl::Timer timer; /// /// Reference version using MKL DGETRF /// Kokkos::View<value_type***,Kokkos::LayoutRight,HostSpaceType> aref; Kokkos::View<value_type***,Kokkos::LayoutRight,HostSpaceType> amat("amat", N*VectorLength, BlkSize, BlkSize); Random<value_type> random; for (int k=0;k<N*VectorLength;++k) { // use tridiagonal matrices; for now we just check elementwise l/u factors // do not allow pivots for (int i=0;i<BlkSize;++i) { amat(k, i, i) = random.value() + 10.0; if ((i+1) < BlkSize) { amat(k, i, i+1) = random.value() + 1.0; amat(k, i+1, i) = random.value() + 1.0; } } } typedef Vector<SIMD<value_type>,VectorLength> VectorType; Kokkos::View<VectorType***,Kokkos::LayoutRight,HostSpaceType> amat_simd("amat_simd", N, BlkSize, BlkSize); //, a("a", N, BlkSize, BlkSize); Kokkos::parallel_for("KokkosBatched::PerfTest::LUHost::Pack", Kokkos::RangePolicy<HostSpaceType>(0, N*VectorLength), KOKKOS_LAMBDA(const int k) { const int k0 = k/VectorLength, k1 = k%VectorLength; for (int i=0;i<BlkSize;++i) for (int j=0;j<BlkSize;++j) { amat_simd(k0, i, j)[k1] = amat(k0*VectorLength+k1, i, j); } });