// Gemm: set-up portion of a host-side routine that works on a batch of
// BlkSize x BlkSize matrices. Only the beginning of the function is visible
// in this excerpt (the body is not closed here), so the comments below
// describe the visible set-up steps only.
//
// NN: requested number of matrices in the batch. It is divided by
//     VectorLength with integer division, so the views below hold
//     (NN/VectorLength)*VectorLength matrices; any remainder is dropped.
void Gemm(const int NN) {
        // Not referenced in the visible portion of this function.
        typedef Kokkos::Schedule<Kokkos::Static> ScheduleType;

        // Compile-time vector length chosen by the DefaultVectorLength trait for
        // this value_type and the host memory space.
        constexpr int VectorLength = DefaultVectorLength<value_type,typename HostSpaceType::memory_space>::value;
        // Number of packed (vector) matrices; integer division truncates.
        const int N = NN/VectorLength;

        {
          // Print a one-line banner naming the instruction set selected by the
          // compiler macros, the scalar type, and the vector length.
          // value_type_name stays empty when value_type is neither double nor
          // Kokkos::complex<double>.
          std::string value_type_name;
          if (std::is_same<value_type,double>::value)                   value_type_name = "double";
          if (std::is_same<value_type,Kokkos::complex<double> >::value) value_type_name = "Kokkos::complex<double>";
#if   defined(__AVX512F__)
          std::cout << "AVX512 is defined: datatype " << value_type_name <<  " a vector length " << VectorLength << "\n";
#elif defined(__AVX__) || defined(__AVX2__)
          std::cout << "AVX or AVX2 is defined: datatype " << value_type_name <<  " a vector length " << VectorLength << "\n";
#else
          std::cout << "SIMD (compiler vectorization) is defined: datatype " << value_type_name <<  " a vector length " << VectorLength << "\n";
#endif
        }

        // Total flop count: per-matrix FlopCount times the number of scalar
        // matrices. N*VectorLength is evaluated in int before the conversion
        // to double.
        const double flop = (N*VectorLength)*FlopCount(BlkSize,BlkSize,BlkSize);
        // NOTE(review): presumably the initial value for a running minimum of
        // measured times; its use is outside this excerpt, confirm there.
        const double tmax = 1.0e15;

        // NOTE(review): iteration bounds and timer are declared here but used
        // only beyond this excerpt; the negative start looks like warm-up
        // iterations, confirm against the timing loop.
        const int iter_begin = -10, iter_end = 100;
        Kokkos::Impl::Timer timer;

        // cref is default-constructed here (no label, no extents); amat and bmat
        // are allocated as (N*VectorLength) x BlkSize x BlkSize scalar views.
        Kokkos::View<value_type***,Kokkos::LayoutRight,HostSpaceType> cref;
        Kokkos::View<value_type***,Kokkos::LayoutRight,HostSpaceType> 
          amat("amat", N*VectorLength, BlkSize, BlkSize),
          bmat("bmat", N*VectorLength, BlkSize, BlkSize);

        // Fill both inputs from a random pool constructed with the fixed seed
        // 13718; value_type(1.0) is passed as the range argument of fill_random.
        Kokkos::Random_XorShift64_Pool<HostSpaceType> random(13718);
        Kokkos::fill_random(amat, random, value_type(1.0));
        Kokkos::fill_random(bmat, random, value_type(1.0));

        // Packed counterparts: N matrices whose entries are VectorType objects
        // with VectorLength lanes each.
        typedef Vector<SIMD<value_type>,VectorLength> VectorType;
        Kokkos::View<VectorType***,Kokkos::LayoutRight,HostSpaceType> 
          amat_simd("amat_simd", N, BlkSize, BlkSize),
          bmat_simd("bmat_simd", N, BlkSize, BlkSize);

        // Pack the scalar batch into the vector views: scalar matrix k goes to
        // lane k1 = k % VectorLength of packed matrix k0 = k / VectorLength,
        // entry by entry.
        Kokkos::parallel_for
          (Kokkos::RangePolicy<HostSpaceType>(0, N*VectorLength),
           KOKKOS_LAMBDA(const int k) {
            const int k0 = k/VectorLength, k1 = k%VectorLength;
            for (int i=0;i<BlkSize;++i)
              for (int j=0;j<BlkSize;++j) {
                amat_simd(k0, i, j)[k1] = amat(k, i, j);
                bmat_simd(k0, i, j)[k1] = bmat(k, i, j);                  
              }
          });
      // LU: set-up portion of a host-side routine that works on a batch of
      // BlkSize x BlkSize matrices. The function continues past the end of
      // this excerpt, so the comments below describe the visible set-up steps
      // only.
      //
      // NN: requested number of matrices in the batch. It is divided by
      //     VectorLength with integer division, so the views below hold
      //     (NN/VectorLength)*VectorLength matrices; any remainder is dropped.
      void LU(const int NN) {
        // Not referenced in the visible portion of this function.
        typedef Kokkos::Schedule<Kokkos::Static> ScheduleType;
        //typedef Kokkos::Schedule<Kokkos::Dynamic> ScheduleType;

        // Compile-time vector length chosen by the DefaultVectorLength trait for
        // this value_type and the host memory space.
        constexpr int VectorLength = DefaultVectorLength<value_type,typename HostSpaceType::memory_space>::value;
        // Number of packed (vector) matrices; integer division truncates.
        const int N = NN/VectorLength;

        {
          // Print a one-line banner naming the instruction set selected by the
          // compiler macros, the scalar type, and the vector length.
          // value_type_name stays empty when value_type is neither double nor
          // Kokkos::complex<double>.
          std::string value_type_name;
          if (std::is_same<value_type,double>::value)                   value_type_name = "double";
          if (std::is_same<value_type,Kokkos::complex<double> >::value) value_type_name = "Kokkos::complex<double>";

#if   defined(__AVX512F__)
          std::cout << "AVX512 is defined: datatype " << value_type_name <<  " a vector length " << VectorLength << "\n";
#elif defined(__AVX__) || defined(__AVX2__)
          std::cout << "AVX or AVX2 is defined: datatype " << value_type_name <<  " a vector length " << VectorLength << "\n";
#else
          std::cout << "SIMD (compiler vectorization) is defined: datatype " << value_type_name <<  " a vector length " << VectorLength << "\n";
#endif
        }

        // Total flop count: per-matrix FlopCount times the number of scalar
        // matrices. N*VectorLength is evaluated in int before the conversion
        // to double.
        const double flop = (N*VectorLength)*FlopCount(BlkSize,BlkSize);
        // NOTE(review): presumably the initial value for a running minimum of
        // measured times; its use is outside this excerpt, confirm there.
        const double tmax = 1.0e15;

        // NOTE(review): iteration bounds and timer are declared here but used
        // only beyond this excerpt; the negative start looks like warm-up
        // iterations, confirm against the timing loop.
        const int iter_begin = -10, iter_end = 100;
        Kokkos::Impl::Timer timer;

        ///
        /// Reference version using MKL DGETRF
        ///
        // aref is default-constructed here (no label, no extents); amat is
        // allocated as an (N*VectorLength) x BlkSize x BlkSize scalar view.
        Kokkos::View<value_type***,Kokkos::LayoutRight,HostSpaceType> aref;
        Kokkos::View<value_type***,Kokkos::LayoutRight,HostSpaceType>
          amat("amat", N*VectorLength, BlkSize, BlkSize);

        Random<value_type> random;

        // Serial fill of every matrix in the batch. Only the main diagonal and
        // the first super-/sub-diagonals are written; the diagonal gets an
        // offset of 10.0 and the off-diagonals an offset of 1.0.
        // NOTE(review): the larger diagonal offset presumably keeps the
        // matrices diagonally dominant so no pivoting is needed; that depends
        // on the range of random.value(), which is not visible here.
        for (int k=0;k<N*VectorLength;++k) {
          // use tridiagonal matrices; for now we just check elementwise l/u factors
          // do not allow pivots
          for (int i=0;i<BlkSize;++i) {
            amat(k, i, i) = random.value() + 10.0;
            if ((i+1) < BlkSize) {
              amat(k, i, i+1) = random.value() + 1.0;
              amat(k, i+1, i) = random.value() + 1.0;
            }
          }
        }

        // Packed counterpart: N matrices whose entries are VectorType objects
        // with VectorLength lanes each.
        typedef Vector<SIMD<value_type>,VectorLength> VectorType;
        Kokkos::View<VectorType***,Kokkos::LayoutRight,HostSpaceType>
          amat_simd("amat_simd", N, BlkSize, BlkSize); //, a("a", N, BlkSize, BlkSize);
      
        // Pack the scalar batch into the vector view: scalar matrix k goes to
        // lane k1 = k % VectorLength of packed matrix k0 = k / VectorLength.
        // The source index k0*VectorLength+k1 is equal to k.
        Kokkos::parallel_for("KokkosBatched::PerfTest::LUHost::Pack", 
           Kokkos::RangePolicy<HostSpaceType>(0, N*VectorLength),
           KOKKOS_LAMBDA(const int k) {
            const int k0 = k/VectorLength, k1 = k%VectorLength;
            for (int i=0;i<BlkSize;++i)
              for (int j=0;j<BlkSize;++j) {
                amat_simd(k0, i, j)[k1] = amat(k0*VectorLength+k1, i, j);
              }
          });