Ejemplo n.º 1
0
void run_jit_float( const float*               i_a,
                    const float*               i_b,
                    float*                     o_c,
                    const int                   i_M,
                    const int                   i_N,
                    const int                   i_K,
                    const libxsmm_prefetch_type i_prefetch,
                    const char*                 i_arch ) {

  /* define function pointer */
  libxsmm_smmfunction l_test_jit;
  double l_jittime = 0.0, l_runtime = 0.0;
  float l_alpha = 1.0f;
  float l_beta = 1.0f;
  unsigned long long l_start;
  unsigned int l_t;

  if ( l_beta != 0.0f && l_beta != 1.0f ) {
    fprintf(stderr, "JIT float: beta needs to be 0.0 or 1.0!\n");
    exit(-1);
  }
  if ( l_alpha != 1.0f ) {
    fprintf(stderr, "JIT float: alpha needs to be 1.0!\n");
    exit(-1);
  }

  l_start = libxsmm_timer_tick();
  l_test_jit = libxsmm_smmdispatch(i_M, i_N, i_K, &i_M, &i_K, &i_M, &l_alpha, &l_beta, NULL, &i_prefetch );
  l_jittime = libxsmm_timer_duration(l_start, libxsmm_timer_tick());  
  printf("function pointer address: %llx\n", (size_t)l_test_jit);

  l_start = libxsmm_timer_tick();

  if ( i_prefetch == LIBXSMM_PREFETCH_NONE ) {
    for ( l_t = 0; l_t < g_jit_code_reps; l_t++ ) {
      l_test_jit(i_a, i_b, o_c);
    }
  } else {
    for ( l_t = 0; l_t < g_jit_code_reps; l_t++ ) {
      l_test_jit(i_a, i_b, o_c, i_a, i_b, o_c);
    }
  }

  l_runtime = libxsmm_timer_duration(l_start, libxsmm_timer_tick());

  printf("%fs for creating jit\n", l_jittime);
  printf("%fs for executing jit\n", l_runtime);
  printf("%f GFLOPS for jit\n", ((double)((double)g_jit_code_reps * (double)i_M * (double)i_N * (double)i_K) * 2.0) / (l_runtime * 1.0e9));
}
Ejemplo n.º 2
0
void run_gold_float( const float*                   i_a,
                     const float*                   i_b,
                     float*                         o_c,
                     const libxsmm_gemm_descriptor* i_xgemm_desc ) {
  unsigned int l_m, l_n, l_k, l_t;
  double l_runtime = 0.0;

  const unsigned long long l_start = libxsmm_timer_tick();

  for ( l_t = 0; l_t < g_jit_code_reps; l_t++  ) {
    for ( l_n = 0; l_n < i_xgemm_desc->n; l_n++  ) {
      for ( l_k = 0; l_k < i_xgemm_desc->k; l_k++  ) {
        for ( l_m = 0; l_m < i_xgemm_desc->m; l_m++ ) {
          o_c[(l_n * i_xgemm_desc->ldc) + l_m] += i_a[(l_k * i_xgemm_desc->lda) + l_m] * i_b[(l_n * i_xgemm_desc->ldb) + l_k];
        }
      }
    }
  }

  l_runtime = libxsmm_timer_duration(l_start, libxsmm_timer_tick());

  printf("%fs for C\n", l_runtime);
  printf("%f GFLOPS for C\n", ((double)((double)g_jit_code_reps * (double)i_xgemm_desc->m * (double)i_xgemm_desc->n * (double)i_xgemm_desc->k) * 2.0) / (l_runtime * 1.0e9));
}
Ejemplo n.º 3
0
int main(int argc, char* argv[])
{
  const int m = (1 < argc ? atoi(argv[1]) : 16);
  const int n = (2 < argc ? atoi(argv[2]) : m);
  const int unsigned ldi = LIBXSMM_MAX(3 < argc ? atoi(argv[3]) : 0, m);
  const int unsigned ldo = LIBXSMM_MAX(4 < argc ? atoi(argv[4]) : 0, m);
  const int unroll = (5 < argc ? atoi(argv[5]) : 1);
  const int prefetch = (6 < argc ? atoi(argv[6]) : 0);
  const int flags = ((7 < argc && 0 != atoi(argv[7])) ? LIBXSMM_MATCOPY_FLAG_ZERO_SOURCE : 0);
  const int iters = (8 < argc ? atoi(argv[8]) : 1);

  /* we should modify to test all data-types */
  const libxsmm_mcopy_descriptor* desc;
  libxsmm_xmcopyfunction kernel;
  libxsmm_descriptor_blob blob;
  libxsmm_timer_tickint l_start;
  libxsmm_timer_tickint l_end;
  int error = 0, i, j;
  ELEM_TYPE *a, *b;
  double copy_time;

  printf("This is a tester for JIT matcopy kernels!\n");
  desc = libxsmm_mcopy_descriptor_init(&blob, sizeof(ELEM_TYPE),
    m, n, ldo, ldi, flags, prefetch, &unroll);

  a = (ELEM_TYPE*)malloc(n * ldi * sizeof(ELEM_TYPE));
  b = (ELEM_TYPE*)malloc(n * ldo * sizeof(ELEM_TYPE));

  for (i = 0; i < n; i++) {
    for (j = 0; j < m; j++) {
      a[j+ldi*i] = (ELEM_TYPE)rand();
      if (0 != (LIBXSMM_MATCOPY_FLAG_ZERO_SOURCE & flags)) {
        b[j+ldo*i] = (ELEM_TYPE)rand();
      }
    }
  }

  /* test dispatch call */
  kernel = libxsmm_dispatch_mcopy(desc);
  if (kernel == 0) {
    printf("JIT error -> exit!!!!\n");
    exit(EXIT_FAILURE);
  }

  /* let's call */
  kernel(a, &ldi, b, &ldo, &a[128]);

  l_start = libxsmm_timer_tick();
  for (i = 0; i < iters; ++i) {
    kernel(a, &ldi, b, &ldo, &a[128]);
  }
  l_end = libxsmm_timer_tick();
  copy_time = libxsmm_timer_duration(l_start, l_end);

  for (i = 0; i < n; ++i) {
    for (j = 0; j < m; ++j) {
      if (0 != (LIBXSMM_MATCOPY_FLAG_ZERO_SOURCE & flags)) {
        if (LIBXSMM_NEQ(b[j+ldo*i], 0)) {
          printf("ERROR!!!\n");
          i = n;
          error = 1;
          break;
        }
      }
      else if (LIBXSMM_NEQ(a[j+ldi*i], b[j+ldo*i])) {
        printf("ERROR!!!\n");
        i = n;
        error = 1;
        break;
      }
    }
  }

  if (error == 0) {
    printf("CORRECT copy!!!!\n");
    printf("Time taken is\t%.5f seconds\n", copy_time);
  }

  return EXIT_SUCCESS;
}
Ejemplo n.º 4
0
int main(int argc, char* argv[]) {
  int M = ( argc == 7 ) ? atoi(argv[1]) : 9;
  int N = ( argc == 7 ) ? atoi(argv[2]) : 10;
  int K = ( argc == 7 ) ? atoi(argv[3]) : 9;
  unsigned int N_CRUNS = ( argc == 7 ) ? atoi(argv[4]) : 8;
  unsigned int REPS =    ( argc == 7 ) ? atoi(argv[5]) : 1;
  char* l_csr_file =     ( argc == 7 ) ?      argv[6]  : "file.csr";

  const libxsmm_gemm_prefetch_type prefetch = LIBXSMM_GEMM_PREFETCH_NONE;
  const int flags = LIBXSMM_GEMM_FLAGS('N', 'N');
  const REALTYPE alpha = 1, beta = 1;

  REALTYPE* l_a_de = (REALTYPE*)libxsmm_aligned_malloc(K * K * sizeof(REALTYPE), 64);
  REALTYPE* l_a_sp = NULL;
  REALTYPE* l_b = (REALTYPE*)libxsmm_aligned_malloc(K * N * N_CRUNS* sizeof(REALTYPE), 64);
  unsigned int* l_rowptr = NULL;
  unsigned int* l_colidx = NULL;
  unsigned int l_rowcount, l_colcount, l_elements;
  REALTYPE* l_c = (REALTYPE*)libxsmm_aligned_malloc(K * N * N_CRUNS * sizeof(REALTYPE), 64);
  REALTYPE* l_c_gold = (REALTYPE*)libxsmm_aligned_malloc(K * N * N_CRUNS * sizeof(REALTYPE), 64);
  REALTYPE* l_c_asm = (REALTYPE*)libxsmm_aligned_malloc(K * N * N_CRUNS * sizeof(REALTYPE), 64);
  REALTYPE l_max_error = 0.0;
  unsigned int l_k, l_n;
  int l_i, l_j, l_jj;

  LIBXSMM_VLA_DECL(3, REALTYPE, l_p_b, l_b, N, N_CRUNS);
  LIBXSMM_VLA_DECL(3, REALTYPE, l_p_c_asm, l_c_asm, N, N_CRUNS);
  LIBXSMM_VLA_DECL(3, REALTYPE, l_p_c_gold, l_c_gold, N, N_CRUNS);

  libxsmm_descriptor_blob l_xgemm_blob;
  const libxsmm_gemm_descriptor* l_xgemm_desc = 0;
  LIBXSMM_MMFUNCTION_TYPE(REALTYPE) mykernel = NULL;

  unsigned long long l_start, l_end;
  double l_total;

  if (argc != 7) {
    fprintf( stderr, "arguments: M #iters CSR-file!\n" );
    return -1;
  }

  /* touch B */
  for ( l_i = 0; l_i < K; l_i++) {
    for ( l_j = 0; l_j < N; l_j++) {
      for ( l_k = 0; l_k < N_CRUNS; l_k++ ) {
        LIBXSMM_VLA_ACCESS(3, l_p_b, l_i, l_j, l_k, N, N_CRUNS) = (REALTYPE)libxsmm_rand_f64();
      }
    }
  }

  /* touch C */
  for ( l_i = 0; l_i < K; l_i++) {
    for ( l_j = 0; l_j < N; l_j++) {
      for ( l_k = 0; l_k < N_CRUNS; l_k++ ) {
        LIBXSMM_VLA_ACCESS(3, l_p_c_gold, l_i, l_j, l_k, N, N_CRUNS) = (REALTYPE)0.0;
        LIBXSMM_VLA_ACCESS(3, l_p_c_asm,  l_i, l_j, l_k, N, N_CRUNS) = (REALTYPE)0.0;
      }
    }
  }

  /* read A, CSR */
  libxsmm_sparse_csr_reader(  l_csr_file,
                             &l_rowptr,
                             &l_colidx,
                             &l_a_sp,
                             &l_rowcount, &l_colcount, &l_elements );

  /* copy b to dense */
  printf("CSR matrix data structure we just read:\n");
  printf("rows: %u, columns: %u, elements: %u\n", l_rowcount, l_colcount, l_elements);

  for ( l_n = 0; l_n < (((unsigned int)K) * K); l_n++) {
    l_a_de[l_n] = 0.0;
  }

  for ( l_n = 0; l_n < (unsigned int)K; l_n++) {
    const unsigned int l_rowelems = l_rowptr[l_n+1] - l_rowptr[l_n];
    assert(l_rowptr[l_n+1] >= l_rowptr[l_n]);

    for ( l_k = 0; l_k < l_rowelems; l_k++) {
      l_a_de[(l_n * K) + l_colidx[l_rowptr[l_n] + l_k]] = l_a_sp[l_rowptr[l_n] + l_k];
    }
  }

  /* dense routine */
  l_start = libxsmm_timer_tick();
#if 1
  for ( l_n = 0; l_n < REPS; l_n++) {
    for ( l_i = 0; l_i < K; l_i++) {
      for ( l_j = 0; l_j < N; l_j++) {
        for ( l_jj = 0; l_jj < K; l_jj++) {
          LIBXSMM_PRAGMA_SIMD
          for (l_k = 0; l_k < N_CRUNS; l_k++) {
            LIBXSMM_VLA_ACCESS(3, l_p_c_gold, l_i, l_j, l_k, N, N_CRUNS)
              +=   l_a_de[(l_i*K)+l_jj]
                 * LIBXSMM_VLA_ACCESS(3, l_p_b, l_jj, l_j, l_k, N, N_CRUNS);
          }
        }
      }
    }
  }
#endif
  l_end = libxsmm_timer_tick();
  l_total = libxsmm_timer_duration(l_start, l_end);
  printf("%fs for dense\n", l_total);
  printf("%f GFLOPS for dense\n", ((double)((double)REPS * (double)K * (double)K * (double)N * (double)N_CRUNS) * 2.0) / (l_total * 1.0e9));

  l_xgemm_desc = libxsmm_gemm_descriptor_dinit(&l_xgemm_blob, LIBXSMM_GEMM_PRECISION(REALTYPE),
    K, N, K, 0, N, N, alpha, beta, flags, prefetch);

  /* sparse routine */
#if defined(__EDGE_EXECUTE_F32__)
  mykernel = libxsmm_create_xcsr_soa( l_xgemm_desc, l_rowptr, l_colidx, (const void*)l_a_sp ).smm;
#else
  mykernel = libxsmm_create_xcsr_soa( l_xgemm_desc, l_rowptr, l_colidx, (const void*)l_a_sp ).dmm;
#endif

  l_start = libxsmm_timer_tick();
  for ( l_n = 0; l_n < REPS; l_n++) {
    mykernel( l_a_sp, l_b, l_c_asm );
  }
  l_end = libxsmm_timer_tick();
  l_total = libxsmm_timer_duration(l_start, l_end);
  printf("%fs for sparse (asm)\n", l_total);
  printf("%f GFLOPS for sparse (asm)\n", ((double)((double)REPS * (double)K * (double)l_elements * (double)N_CRUNS) * 2.0) / (l_total * 1.0e9));

  /* check for errors */
  l_max_error = (REALTYPE)0.0;
  for ( l_i = 0; l_i < K; l_i++) {
    for ( l_j = 0; l_j < N; l_j++) {
      for ( l_k = 0; l_k < N_CRUNS; l_k++ ) {
        if (fabs( LIBXSMM_VLA_ACCESS(3, l_p_c_gold, l_i, l_j, l_k, N, N_CRUNS)
                    - LIBXSMM_VLA_ACCESS(3, l_p_c_asm, l_i, l_j, l_k, N, N_CRUNS) ) > l_max_error ) {
          l_max_error = (REALTYPE)fabs( LIBXSMM_VLA_ACCESS(3, l_p_c_gold, l_i, l_j, l_k, N, N_CRUNS)
                                       -LIBXSMM_VLA_ACCESS(3, l_p_c_asm, l_i, l_j, l_k, N, N_CRUNS) );
        }
      }
    }
  }
  printf("max error: %f\n", l_max_error);

  printf("PERFDUMP,%s,%u,%i,%i,%i,%u,%u,%f,%f,%f\n", l_csr_file, REPS, M, N, K, l_elements, K * l_elements * N_CRUNS * 2, l_max_error, l_total, ((double)((double)REPS * (double)K * (double)l_elements * (double)N_CRUNS) * 2.0) / (l_total * 1.0e9) );

  /* free */
  libxsmm_free( l_a_de );
  libxsmm_free( l_b );
  libxsmm_free( l_c );
  libxsmm_free( l_c_gold );
  libxsmm_free( l_c_asm );

  free( l_a_sp );
  free( l_rowptr );
  free( l_colidx );

  return 0;
}
Ejemplo n.º 5
0
int main(int argc, char* argv[])
{
  int result = EXIT_SUCCESS;
  try {
    const libxsmm_blasint benchmark = 1 < argc ? std::atoi(argv[1]) : 0;
    LIBXSMM_GEMM_CONST libxsmm_blasint m = (2 < argc ? std::atoi(argv[2]) : 23);
    LIBXSMM_GEMM_CONST libxsmm_blasint k = (4 < argc ? std::atoi(argv[4]) : m);
    LIBXSMM_GEMM_CONST libxsmm_blasint n = (3 < argc ? std::atoi(argv[3]) : k);
    const libxsmm_blasint q = (5 < argc ? std::atoi(argv[5]) : 0/*auto*/);
    const libxsmm_blasint nrepeat = (6 < argc ? std::atoi(argv[6]) : (0 >= q ? 13 : 1));

    LIBXSMM_GEMM_CONST libxsmm_blasint lda = m, ldb = k, ldc = m;
    LIBXSMM_GEMM_CONST char transa = 'N', transb = 'N';
    LIBXSMM_GEMM_CONST OTYPE alpha = 1, beta = 1;

    const libxsmm_blasint asize = PAD(ITYPE, lda * k), bsize = PAD(ITYPE, ldb * n), csize = PAD(OTYPE, ldc * n);
    const libxsmm_blasint max_size = ((2ULL << 30/*2 GB*/) / ((asize + bsize) * sizeof(ITYPE) + csize * sizeof(OTYPE)));
    const libxsmm_blasint s = LIBXSMM_MIN(0 < q ? q : max_size, max_size);
    const libxsmm_blasint aspace = LIBXSMM_ALIGNMENT / sizeof(ITYPE);
    const size_t bwsize = static_cast<size_t>((asize/*load*/ + bsize/*load*/) * sizeof(ITYPE) + 2/*RFO*/ * csize * sizeof(OTYPE));
    const double gflops = 2E-9 * s * m * n * k;
#if LIBXSMM_TYPEINFO(ITYPE, FP)
    const char *const ops = "FLOPS";
    const double scale = 1.0 / s;
#else
    const char *const ops = "OPS";
    const double scale = 1;
#endif
#if !defined(_DEBUG)
    const char *const env_check = getenv("CHECK");
    const int check = (0 == env_check ? 0 : atoi(env_check));
#else
    /*const*/ int check = 1;
#endif

#if defined(LIBXSMM_OFFLOAD_TARGET)
#   pragma offload target(LIBXSMM_OFFLOAD_TARGET)
#endif
    {
#if defined(_OPENMP)
      const libxsmm_blasint chunksize = s / omp_get_max_threads();
#endif
      struct raii { // avoid std::vector (first-touch init. causes NUMA issue)
        ITYPE *a, *b;
        OTYPE *c, *d;
        libxsmm_blasint *m_shuffle;
        raii(libxsmm_blasint asize_, libxsmm_blasint bsize_, libxsmm_blasint csize_, libxsmm_blasint size_)
          : a(new ITYPE[static_cast<size_t>(asize_)]), b(new ITYPE[static_cast<size_t>(bsize_)])
          , c(new OTYPE[static_cast<size_t>(csize_)]), d(new OTYPE[static_cast<size_t>(csize_)])
          , m_shuffle(new libxsmm_blasint[size_])
        {
# if defined(_OPENMP)
#         pragma omp parallel for schedule(static)
# endif
          for (libxsmm_blasint i = 0; i < size_; ++i) m_shuffle[i] = libxsmm_rand_u32(size_);
        }
        ~raii() { delete[] a; delete[] b; delete[] c; delete[] d; delete[] m_shuffle; }
#if defined(RANDOMIZED)
        libxsmm_blasint shuffle(libxsmm_blasint i) const { return m_shuffle[i]; }
#else
        libxsmm_blasint shuffle(libxsmm_blasint i) const { return i; }
#endif
      } helper(s * asize + aspace - 1, s * bsize + aspace - 1, s * csize + aspace - 1, s);

      ITYPE *const a = LIBXSMM_ALIGN(helper.a, LIBXSMM_ALIGNMENT);
      ITYPE *const b = LIBXSMM_ALIGN(helper.b, LIBXSMM_ALIGNMENT);
      OTYPE *const c = LIBXSMM_ALIGN(helper.c, LIBXSMM_ALIGNMENT);
      OTYPE *const d = LIBXSMM_ALIGN(helper.d, LIBXSMM_ALIGNMENT);
#if defined(_OPENMP)
#     pragma omp parallel for schedule(static)
#endif
      for (libxsmm_blasint i = 0; i < s; ++i) {
        LIBXSMM_MATINIT(ITYPE, 42 + helper.shuffle(i), a + helper.shuffle(i) * asize, m, k, lda, scale);
        LIBXSMM_MATINIT(ITYPE, 24 + helper.shuffle(i), b + helper.shuffle(i) * bsize, k, n, ldb, scale);
        LIBXSMM_MATINIT(OTYPE, 22 + i, c + i * csize, m, n, ldc, scale);
        LIBXSMM_MATINIT(OTYPE, 22 + i, d + i * csize, m, n, ldc, scale);
      }

#if defined(MKL_ENABLE_AVX512)
      mkl_enable_instructions(MKL_ENABLE_AVX512);
#endif
      // initialize LIBXSMM
      libxsmm_init();

      fprintf(stdout, "m=%lli n=%lli k=%lli size=%lli memory=%.1f MB (input=%s output=%s)\n\n",
        static_cast<long long>(m), static_cast<long long>(n), static_cast<long long>(k), static_cast<long long>(s),
        1.0 * (s * ((asize + bsize) * sizeof(ITYPE) + csize * sizeof(OTYPE))) / (1 << 20),
        LIBXSMM_TYPENAME(ITYPE), LIBXSMM_TYPENAME(OTYPE));

      // LAPACK/BLAS3 (warm-up BLAS Library)
#if defined(_OPENMP)
#     pragma omp parallel for schedule(static)
#endif
      for (libxsmm_blasint i = 0; i < s; ++i) {
        LIBXSMM_GEMM_SYMBOL(ITYPE)(&transa, &transb, &m, &n, &k,
          &alpha, a + helper.shuffle(i) * asize, &lda, b + helper.shuffle(i) * bsize, &ldb,
           &beta, c + i * csize, &ldc);
      }

#if (defined(__MKL) || defined(MKL_DIRECT_CALL_SEQ) || defined(MKL_DIRECT_CALL)) && (LIBXSMM_VERSION3(11, 3, 0) <= INTEL_MKL_VERSION)
      std::vector<const ITYPE*> va_array(static_cast<size_t>(s)), vb_array(static_cast<size_t>(s));
      std::vector<OTYPE*> vc_array(static_cast<size_t>(s));
      const ITYPE* *const a_array = &va_array[0];
      const ITYPE* *const b_array = &vb_array[0];
      OTYPE* *const c_array = &vc_array[0];
      const libxsmm_blasint group_count = 1;
      for (libxsmm_blasint i = 0; i < s; ++i) { // setup batched (A,B,C)
        a_array[i] = a + helper.shuffle(i) * asize; b_array[i] = b + helper.shuffle(i) * bsize; c_array[i] = d + i * csize;
      }
      // additional warm-up (also to eventually match the Gold result)
      LIBXSMM_TPREFIX(ITYPE,gemm_batch)(&transa, &transb, &m, &n, &k,
        &alpha, &a_array[0], &lda, &b_array[0], &ldb,
          &beta, &c_array[0], &ldc, &group_count, &s);
#endif

      switch (benchmark) {
      case 0: { // batched
        fprintf(stdout, "Batched (A,B,C)...\n");
        const unsigned long long start = libxsmm_timer_tick();
        for (libxsmm_blasint r = 0; r < nrepeat; ++r) {
#if defined(_OPENMP)
#         pragma omp parallel for schedule(static)
#endif
          for (libxsmm_blasint i = 0; i < s; ++i) {
            LIBXSMM_GEMM_SYMBOL(ITYPE)(&transa, &transb, &m, &n, &k,
              &alpha, a + helper.shuffle(i) * asize, &lda, b + helper.shuffle(i) * bsize, &ldb,
               &beta, c + i * csize, &ldc);
          }
        }
        const unsigned long long ncycles = libxsmm_timer_diff(start, libxsmm_timer_tick());
        const double duration = libxsmm_timer_duration(0, ncycles) / nrepeat;
        if (0 < duration && 0 != ncycles) {
          fprintf(stdout, "\tpseudo-perf.: %.1f %s/cycle\n", (2 * k - 1) * (double)(s * m * n) / ncycles, ops);
          fprintf(stdout, "\tperformance: %.1f G%s/s\n", gflops / duration, ops);
          fprintf(stdout, "\tbandwidth: %.1f GB/s\n", s * bwsize / (duration * (1 << 30)));
        }
        fprintf(stdout, "\tduration: %.0f ms\n", 1000.0 * duration);
      } /* fallthrough */

#if (defined(__MKL) || defined(MKL_DIRECT_CALL_SEQ) || defined(MKL_DIRECT_CALL)) && (LIBXSMM_VERSION3(11, 3, 0) <= INTEL_MKL_VERSION)
      case 1: { // batched indirect
        fprintf(stdout, "Indirect (A,B,C)...\n");
        const unsigned long long start = libxsmm_timer_tick();
        for (libxsmm_blasint r = 0; r < nrepeat; ++r) {
          LIBXSMM_TPREFIX(ITYPE,gemm_batch)(&transa, &transb, &m, &n, &k,
            &alpha, &a_array[0], &lda, &b_array[0], &ldb,
             &beta, &c_array[0], &ldc, &group_count, &s);
        }
        const unsigned long long ncycles = libxsmm_timer_diff(start, libxsmm_timer_tick());
        const double duration = libxsmm_timer_duration(0, ncycles) / nrepeat;
        if (0 < duration && 0 != ncycles) {
          fprintf(stdout, "\tpseudo-perf.: %.1f %s/cycle\n", (2 * k - 1) * (double)(s * m * n) / ncycles, ops);
          fprintf(stdout, "\tperformance: %.1f G%s/s\n", gflops / duration, ops);
          fprintf(stdout, "\tbandwidth: %.1f GB/s\n", s * bwsize / (duration * (1 << 30)));
        }
        fprintf(stdout, "\tduration: %.0f ms\n", 1000.0 * duration);
        if (0 == benchmark) { /* Gold result is available */
          libxsmm_matdiff_info diff;
          memset(&diff, 0, sizeof(diff));
          for (libxsmm_blasint h = 0; h < s; ++h) {
            const OTYPE *const u = c + h * csize, *const v = c_array[h];
            libxsmm_matdiff_info dv;
            if (EXIT_SUCCESS == libxsmm_matdiff(LIBXSMM_DATATYPE(OTYPE), m, n, u, v, &ldc, &ldc, &dv)) {
              libxsmm_matdiff_reduce(&diff, &dv);
            }
          }
          if (0 < diff.normf_rel) fprintf(stdout, "\tdiff: %.0f%%\n", 100.0 * diff.normf_rel);
        }
      }
#endif
      break;
      case 2: { // streaming A and C
        fprintf(stdout, "Streamed (A,C)...\n");
        const unsigned long long start = libxsmm_timer_tick();
        for (libxsmm_blasint r = 0; r < nrepeat; ++r) {
#if defined(_OPENMP)
#         pragma omp parallel for schedule(static)
#endif
          for (libxsmm_blasint i = 0; i < s; ++i) {
            LIBXSMM_GEMM_SYMBOL(ITYPE)(&transa, &transb, &m, &n, &k,
              &alpha, a + helper.shuffle(i) * asize, &lda, b, &ldb,
               &beta, c + i * csize, &ldc);
          }
        }
        const unsigned long long ncycles = libxsmm_timer_diff(start, libxsmm_timer_tick());
        const double duration = libxsmm_timer_duration(0, ncycles) / nrepeat;
        if (0 < duration && 0 != ncycles) {
          fprintf(stdout, "\tpseudo-perf.: %.1f %s/cycle\n", (2 * k - 1) * (double)(s * m * n) / ncycles, ops);
          fprintf(stdout, "\tperformance: %.1f G%s/s\n", gflops / duration, ops);
          fprintf(stdout, "\tbandwidth: %.1f GB/s\n", s * (bwsize - bsize * sizeof(ITYPE)) / (duration * (1 << 30)));
        }
        fprintf(stdout, "\tduration: %.0f ms\n", 1000.0 * duration);
      } /* fallthrough */

#if (defined(__MKL) || defined(MKL_DIRECT_CALL_SEQ) || defined(MKL_DIRECT_CALL)) && (LIBXSMM_VERSION3(11, 3, 0) <= INTEL_MKL_VERSION)
      case 3: { // indirect A and C
        fprintf(stdout, "Indirect (A,C)...\n");
        for (libxsmm_blasint i = 0; i < s; ++i) { a_array[i] = a + helper.shuffle(i) * asize; b_array[i] = b; c_array[i] = d + i * csize; }
        const unsigned long long start = libxsmm_timer_tick();
        for (libxsmm_blasint r = 0; r < nrepeat; ++r) {
          LIBXSMM_TPREFIX(ITYPE, gemm_batch)(&transa, &transb, &m, &n, &k,
            &alpha, &a_array[0], &lda, &b_array[0], &ldb,
             &beta, &c_array[0], &ldc, &group_count, &s);
        }
        const unsigned long long ncycles = libxsmm_timer_diff(start, libxsmm_timer_tick());
        const double duration = libxsmm_timer_duration(0, ncycles) / nrepeat;
        if (0 < duration && 0 != ncycles) {
          fprintf(stdout, "\tpseudo-perf.: %.1f %s/cycle\n", (2 * k - 1) * (double)(s * m * n) / ncycles, ops);
          fprintf(stdout, "\tperformance: %.1f G%s/s\n", gflops / duration, ops);
          fprintf(stdout, "\tbandwidth: %.1f GB/s\n", s * (bwsize - bsize * sizeof(ITYPE)) / (duration * (1 << 30)));
        }
        fprintf(stdout, "\tduration: %.0f ms\n", 1000.0 * duration);
      }
#endif
      break;
      case 4: { // streaming B and C
        fprintf(stdout, "Streamed (B,C)...\n");
        const unsigned long long start = libxsmm_timer_tick();
        for (libxsmm_blasint r = 0; r < nrepeat; ++r) {
#if defined(_OPENMP)
#         pragma omp parallel for schedule(static)
#endif
          for (libxsmm_blasint i = 0; i < s; ++i) {
            LIBXSMM_GEMM_SYMBOL(ITYPE)(&transa, &transb, &m, &n, &k,
              &alpha, a, &lda, b + helper.shuffle(i) * bsize, &ldb,
               &beta, c + i * csize, &ldc);
          }
        }
        const unsigned long long ncycles = libxsmm_timer_diff(start, libxsmm_timer_tick());
        const double duration = libxsmm_timer_duration(0, ncycles) / nrepeat;
        if (0 < duration && 0 != ncycles) {
          fprintf(stdout, "\tpseudo-perf.: %.1f %s/cycle\n", (2 * k - 1) * (double)(s * m * n) / ncycles, ops);
          fprintf(stdout, "\tperformance: %.1f G%s/s\n", gflops / duration, ops);
          fprintf(stdout, "\tbandwidth: %.1f GB/s\n", s * (bwsize - asize * sizeof(ITYPE)) / (duration * (1 << 30)));
        }
        fprintf(stdout, "\tduration: %.0f ms\n", 1000.0 * duration);
      } /* fallthrough */

#if (defined(__MKL) || defined(MKL_DIRECT_CALL_SEQ) || defined(MKL_DIRECT_CALL)) && (LIBXSMM_VERSION3(11, 3, 0) <= INTEL_MKL_VERSION)
      case 5: { // indirect B and C
        fprintf(stdout, "Indirect (B,C)...\n");
        for (libxsmm_blasint i = 0; i < s; ++i) { a_array[i] = a; b_array[i] = b + helper.shuffle(i) * bsize; c_array[i] = d + i * csize; }
        const unsigned long long start = libxsmm_timer_tick();
        for (libxsmm_blasint r = 0; r < nrepeat; ++r) {
          LIBXSMM_TPREFIX(ITYPE, gemm_batch)(&transa, &transb, &m, &n, &k,
            &alpha, &a_array[0], &lda, &b_array[0], &ldb,
             &beta, &c_array[0], &ldc, &group_count, &s);
        }
        const unsigned long long ncycles = libxsmm_timer_diff(start, libxsmm_timer_tick());
        const double duration = libxsmm_timer_duration(0, ncycles) / nrepeat;
        if (0 < duration && 0 != ncycles) {
          fprintf(stdout, "\tpseudo-perf.: %.1f %s/cycle\n", (2 * k - 1) * (double)(s * m * n) / ncycles, ops);
          fprintf(stdout, "\tperformance: %.1f G%s/s\n", gflops / duration, ops);
          fprintf(stdout, "\tbandwidth: %.1f GB/s\n", s * (bwsize - asize * sizeof(ITYPE)) / (duration * (1 << 30)));
        }
        fprintf(stdout, "\tduration: %.0f ms\n", 1000.0 * duration);
      }
#endif
      break;
      case 6: { // streaming A and B
        fprintf(stdout, "Streamed (A,B)...\n");
        const unsigned long long start = libxsmm_timer_tick();
        for (libxsmm_blasint r = 0; r < nrepeat; ++r) {
#if defined(_OPENMP)
#         pragma omp parallel for schedule(static)
#endif
          for (libxsmm_blasint i = 0; i < s; ++i) {
#if defined(_OPENMP) /* attempt to write to disjunct cachelines */
            const libxsmm_blasint j = omp_get_thread_num() * chunksize * csize;
#else
            const libxsmm_blasint j = 0;
#endif
            LIBXSMM_GEMM_SYMBOL(ITYPE)(&transa, &transb, &m, &n, &k,
              &alpha, a + helper.shuffle(i) * asize, &lda, b + helper.shuffle(i) * bsize, &ldb,
               &beta, c + j, &ldc);
          }
        }
        const unsigned long long ncycles = libxsmm_timer_diff(start, libxsmm_timer_tick());
        const double duration = libxsmm_timer_duration(0, ncycles) / nrepeat;
        if (0 < duration && 0 != ncycles) {
          fprintf(stdout, "\tpseudo-perf.: %.1f %s/cycle\n", (2 * k - 1) * (double)(s * m * n) / ncycles, ops);
          fprintf(stdout, "\tperformance: %.1f G%s/s\n", gflops / duration, ops);
          fprintf(stdout, "\tbandwidth: %.1f GB/s\n", s * (bwsize - 2 * csize * sizeof(OTYPE)) / (duration * (1 << 30)));
        }
        fprintf(stdout, "\tduration: %.0f ms\n", 1000.0 * duration);
      } /* fallthrough */

#if (defined(__MKL) || defined(MKL_DIRECT_CALL_SEQ) || defined(MKL_DIRECT_CALL)) && (LIBXSMM_VERSION3(11, 3, 0) <= INTEL_MKL_VERSION)
      case 7: { // indirect A and B
        fprintf(stdout, "Indirect (A,B)...\n");
#if defined(_OPENMP)
#       pragma omp parallel for schedule(static)
#endif
        for (libxsmm_blasint i = 0; i < s; ++i) {
          a_array[i] = a + helper.shuffle(i) * asize; b_array[i] = b + helper.shuffle(i) * bsize;
#if defined(_OPENMP) /* attempt to write to disjunct cachelines */
          c_array[i] = d + omp_get_thread_num() * chunksize * csize;
#else
          c_array[i] = d;
#endif
        }
        const unsigned long long start = libxsmm_timer_tick();
        for (libxsmm_blasint r = 0; r < nrepeat; ++r) {
          LIBXSMM_TPREFIX(ITYPE, gemm_batch)(&transa, &transb, &m, &n, &k,
            &alpha, &a_array[0], &lda, &b_array[0], &ldb,
             &beta, &c_array[0], &ldc, &group_count, &s);
        }
        const unsigned long long ncycles = libxsmm_timer_diff(start, libxsmm_timer_tick());
        const double duration = libxsmm_timer_duration(0, ncycles) / nrepeat;
        if (0 < duration && 0 != ncycles) {
          fprintf(stdout, "\tpseudo-perf.: %.1f %s/cycle\n", (2 * k - 1) * (double)(s * m * n) / ncycles, ops);
          fprintf(stdout, "\tperformance: %.1f G%s/s\n", gflops / duration, ops);
          fprintf(stdout, "\tbandwidth: %.1f GB/s\n", s * (bwsize - 2 * csize * sizeof(OTYPE)) / (duration * (1 << 30)));
        }
        fprintf(stdout, "\tduration: %.0f ms\n", 1000.0 * duration);
      }
#endif
      break;
      case 8: { // cached
        fprintf(stdout, "Cached...\n");
        const unsigned long long start = libxsmm_timer_tick();
        for (libxsmm_blasint r = 0; r < nrepeat; ++r) {
#if defined(_OPENMP)
#         pragma omp parallel for schedule(static)
#endif
          for (libxsmm_blasint i = 0; i < s; ++i) {
#if defined(_OPENMP) /* attempt to write to disjunct cachelines */
            const libxsmm_blasint j = omp_get_thread_num() * chunksize * csize;
#else
            const libxsmm_blasint j = 0;
#endif
            LIBXSMM_GEMM_SYMBOL(ITYPE)(&transa, &transb, &m, &n, &k,
              &alpha, a, &lda, b, &ldb, &beta, c + j, &ldc);
          }
        }
        const unsigned long long ncycles = libxsmm_timer_diff(start, libxsmm_timer_tick());
        const double duration = libxsmm_timer_duration(0, ncycles) / nrepeat;
        if (0 < duration && 0 != ncycles) {
          fprintf(stdout, "\tpseudo-perf.: %.1f %s/cycle\n", (2 * k - 1) * (double)(s * m * n) / ncycles, ops);
          fprintf(stdout, "\tperformance: %.1f G%s/s\n", gflops / duration, ops);
        }
        fprintf(stdout, "\tduration: %.0f ms\n", 1000.0 * duration);
      } /* fallthrough */

#if (defined(__MKL) || defined(MKL_DIRECT_CALL_SEQ) || defined(MKL_DIRECT_CALL)) && (LIBXSMM_VERSION3(11, 3, 0) <= INTEL_MKL_VERSION)
      case 9: { // indirect cached
        fprintf(stdout, "Indirect cached...\n");
#if defined(_OPENMP)
#       pragma omp parallel for schedule(static)
#endif
        for (libxsmm_blasint i = 0; i < s; ++i) {
          a_array[i] = a; b_array[i] = b;
#if defined(_OPENMP) /* attempt to write to disjunct cachelines */
          c_array[i] = d + omp_get_thread_num() * chunksize * csize;
#else
          c_array[i] = d;
#endif
        }
        const unsigned long long start = libxsmm_timer_tick();
        for (libxsmm_blasint r = 0; r < nrepeat; ++r) {
          LIBXSMM_TPREFIX(ITYPE, gemm_batch)(&transa, &transb, &m, &n, &k,
            &alpha, &a_array[0], &lda, &b_array[0], &ldb,
             &beta, &c_array[0], &ldc, &group_count, &s);
        }
        const unsigned long long ncycles = libxsmm_timer_diff(start, libxsmm_timer_tick());
        const double duration = libxsmm_timer_duration(0, ncycles) / nrepeat;
        if (0 < duration && 0 != ncycles) {
          fprintf(stdout, "\tpseudo-perf.: %.1f %s/cycle\n", (2 * k - 1) * (double)(s * m * n) / ncycles, ops);
          fprintf(stdout, "\tperformance: %.1f G%s/s\n", gflops / duration, ops);
        }
        fprintf(stdout, "\tduration: %.0f ms\n", 1000.0 * duration);
      }
#endif
      break;
      default: throw "invalid case selected!";
      } /*switch*/

      if (0 != check) {
        libxsmm_matdiff_info diff;
        if (EXIT_SUCCESS == libxsmm_matdiff(LIBXSMM_DATATYPE(OTYPE), m, n, 0 == (benchmark & 1) ? c : d, NULL, &ldc, &ldc, &diff)) {
          fprintf(stdout, "\tcheck: %f\n", diff.l1_ref);
        }
      }
      // finalize LIBXSMM
      libxsmm_finalize();
      fprintf(stdout, "Finished\n");
    }
  }
  catch(const std::exception& e) {
    fprintf(stderr, "Error: %s\n", e.what());
    result = EXIT_FAILURE;
  }
  catch(const char* message) {
    fprintf(stderr, "Error: %s\n", message);
    result = EXIT_FAILURE;
  }
  catch(...) {
    fprintf(stderr, "Error: unknown exception caught!\n");
    result = EXIT_FAILURE;
  }

  return result;
}
Ejemplo n.º 6
0
int main(int argc, char* argv[])
{
  LIBXSMM_GEMM_CONST libxsmm_blasint m = (1 < argc ? atoi(argv[1]) : 1024);
  LIBXSMM_GEMM_CONST libxsmm_blasint k = (3 < argc ? atoi(argv[3]) : m);
  LIBXSMM_GEMM_CONST libxsmm_blasint n = (2 < argc ? atoi(argv[2]) : k);
  const libxsmm_blasint bm = (4 < argc ? atoi(argv[4]) : 32);
  const libxsmm_blasint bk = (6 < argc ? atoi(argv[6]) : bm);
  const libxsmm_blasint bn = (5 < argc ? atoi(argv[5]) : bk);
  const libxsmm_bgemm_order order = (libxsmm_bgemm_order)(7 < argc ? atoi(argv[7]) : 0);
  const int nrepeat = (8 < argc ? atoi(argv[8]) : 100);
  const libxsmm_blasint b_m1 = (9 < argc ? atoi(argv[9]) : 1);
  const libxsmm_blasint b_n1  = (10 < argc ? atoi(argv[10]) : 1);
  const libxsmm_blasint b_k1 = (11 < argc ? atoi(argv[11]) : 1);
  const libxsmm_blasint b_k2 = (12 < argc ? atoi(argv[12]) : 1);
  const int ab = (13 < argc ? atoi(argv[13]) : 0);
  LIBXSMM_GEMM_CONST libxsmm_blasint lda = (14 < argc ? atoi(argv[13]) : m);
  LIBXSMM_GEMM_CONST libxsmm_blasint ldb = (15 < argc ? atoi(argv[14]) : k);
  LIBXSMM_GEMM_CONST libxsmm_blasint ldc = (16 < argc ? atoi(argv[15]) : m);
  LIBXSMM_GEMM_CONST char transa = 'N', transb = 'N'; /* no transposes */
  LIBXSMM_GEMM_CONST ITYPE alpha = 1, beta = 1;
  const int gemm_flags = LIBXSMM_GEMM_FLAGS(transa, transb);
  const double gflops = 2.0 * m * n * k * 1E-9;
  int result = EXIT_SUCCESS;
#if defined(CHECK)
  const char *const env_check = getenv("CHECK");
  const double check = LIBXSMM_ABS(0 == env_check ? 0 : atof(env_check));
#endif
  if (argc > 1 && !strncmp(argv[1], "-h", 3)) { /* check command line */
    printf("\nUsage: ./bgemm [M] [N] [K] [bm] [bn] [bk] [order] [reps] [b_m1] [b_n1] [b_k1] [b_k2] [verbose]\n\n");
    return result;
  }

  MYASSERT(m % b_m1 == 0);
  MYASSERT(n % b_n1 == 0);
  MYASSERT(k % b_k1 == 0);
  MYASSERT(m/b_m1 % bm == 0);
  MYASSERT(n/b_n1 % bn == 0);
  MYASSERT(k/b_k1/b_k2 % bk == 0);

#if defined(LIBXSMM_OFFLOAD_TARGET)
# pragma offload target(LIBXSMM_OFFLOAD_TARGET)
#endif
  {
    ITYPE* agold = (ITYPE*)libxsmm_malloc((size_t)(lda * k * sizeof(ITYPE)));
    ITYPE* bgold = (ITYPE*)libxsmm_malloc((size_t)(ldb * n * sizeof(ITYPE)));
    ITYPE* cgold = (ITYPE*)libxsmm_malloc((size_t)(ldc * n * sizeof(ITYPE)));
    ITYPE* a = (ITYPE*)libxsmm_malloc((size_t)(m * k * sizeof(ITYPE)));
    ITYPE* b = (ITYPE*)libxsmm_malloc((size_t)(k * n * sizeof(ITYPE)));
    ITYPE* c = (ITYPE*)libxsmm_malloc((size_t)(m * n * sizeof(ITYPE)));
    libxsmm_bgemm_handle* handle = 0;
    unsigned long long start;
    double duration;
    handle = libxsmm_bgemm_handle_create(
      LIBXSMM_GEMM_PRECISION(ITYPE), LIBXSMM_GEMM_PRECISION(ITYPE),
      m, n, k, &bm, &bn, &bk, &b_m1, &b_n1, &b_k1, &b_k2,
      &alpha, &beta, &gemm_flags, NULL/*auto-prefetch*/, &order);

    if (0 != handle) {
      LIBXSMM_MATINIT(ITYPE, 42, agold, m, k, lda, 1.0);
      LIBXSMM_MATINIT(ITYPE, 24, bgold, k, n, ldb, 1.0);
      LIBXSMM_MATINIT(ITYPE,  0, cgold, m, n, ldc, 1.0);
      libxsmm_bgemm_copyin_a(handle, agold, &lda, a);
      libxsmm_bgemm_copyin_b(handle, bgold, &ldb, b);
      libxsmm_bgemm_copyin_c(handle, cgold, &ldc, c);
#if defined(MKL_ENABLE_AVX512)
      mkl_enable_instructions(MKL_ENABLE_AVX512);
#endif
      /* warm-up OpenMP (populate thread pool) */
      libxsmm_bgemm_omp(handle, a, b, c, 1);
#if defined(CHECK)
      if (!LIBXSMM_FEQ(0, check)) {
        LIBXSMM_GEMM_SYMBOL(ITYPE)(&transa, &transb, &m, &n, &k, &alpha, agold, &lda, bgold, &ldb, &beta, cgold, &ldc);
      }
#endif
      if (!ab) {
      libxsmm_gemm_print(stdout, LIBXSMM_GEMM_PRECISION(ITYPE),
        &transa, &transb, &m, &n, &k, &alpha, a, &lda, b, &ldb, &beta, c, &ldc);
      fprintf(stdout, "\n\n");
      }
      start = libxsmm_timer_tick();
      libxsmm_bgemm_omp(handle, a, b, c, nrepeat);
      duration = libxsmm_timer_duration(start, libxsmm_timer_tick());
      if (0 < duration) {
        if (ab) {
          fprintf(stdout, "\tLIBXSMM: %.1f GFLOPS/s | %lli,%lli,%lli,%lli,%lli,%lli,%i,%lli,%lli,%lli,%lli\n",
            gflops * nrepeat / duration, (long long)m, (long long)n, (long long)k, (long long)bm, (long long)bn, (long long)bk,
            (int)order, (long long)b_m1, (long long)b_n1, (long long)b_k1, (long long)b_k2);
        } else {
          fprintf(stdout, "\tLIBXSMM: %.1f GFLOPS/s\n", gflops * nrepeat / duration);
        }
      }
#if defined(CHECK)
      if (!LIBXSMM_FEQ(0, check)) { /* validate result against LAPACK/BLAS xGEMM */
        ITYPE* ctest = 0;
        int i;
        start = libxsmm_timer_tick();
        for (i = 0; i < nrepeat; ++i) {
          LIBXSMM_GEMM_SYMBOL(ITYPE)(&transa, &transb, &m, &n, &k, &alpha, agold, &lda, bgold, &ldb, &beta, cgold, &ldc);
        }
        duration = libxsmm_timer_duration(start, libxsmm_timer_tick());
        if (0 < duration) {
          fprintf(stdout, "\tBLAS: %.1f GFLOPS/s\n", gflops * nrepeat / duration);
        }
        /* free memory not needed further; avoid double-free later on */
        libxsmm_free(agold); agold = 0;
        libxsmm_free(bgold); bgold = 0;
        libxsmm_free(a); a = 0;
        libxsmm_free(b); b = 0;
        /* allocate C-matrix in regular format, and perform copy-out */
        ctest = (ITYPE*)libxsmm_malloc((size_t)(ldc * n * sizeof(ITYPE)));
        if (0 != ctest) {
          libxsmm_matdiff_info diff;
          libxsmm_bgemm_copyout_c(handle, c, &ldc, ctest);
          if (EXIT_SUCCESS == libxsmm_matdiff(LIBXSMM_DATATYPE(ITYPE), m, n, cgold, ctest, &ldc, &ldc, &diff)) {
            fprintf(stdout, "\tdiff: L2abs=%f Linf=%f\n", diff.l2_abs, diff.linf_abs);
            if (check < 100.0 * diff.normf_rel) {
              fprintf(stderr, "FAILED with an error of %f%%!\n", 100.0 * diff.normf_rel);
              result = EXIT_FAILURE;
            }
          }
          libxsmm_free(ctest);
        }
      }
#endif
      libxsmm_bgemm_handle_destroy(handle);
    }
    else {
      fprintf(stderr, "FAILED to create BGEMM-handle! For details retry with LIBXSMM_VERBOSE=1.\n");
      result = EXIT_FAILURE;
    }
    libxsmm_free(agold);
    libxsmm_free(bgold);
    libxsmm_free(cgold);
    libxsmm_free(a);
    libxsmm_free(b);
    libxsmm_free(c);
  }
  if(!ab) {
    fprintf(stdout, "Finished\n");
  }
  return result;
}
Ejemplo n.º 7
0
int main(int argc, char* argv[])
{
  const libxsmm_blasint m = 1 < argc ? atoi(argv[1]) : 4096;
  const libxsmm_blasint n = 2 < argc ? atoi(argv[2]) : m;
  const libxsmm_blasint lda = LIBXSMM_MAX(3 < argc ? atoi(argv[3]) : 0, m);
  const libxsmm_blasint ldb = LIBXSMM_MAX(4 < argc ? atoi(argv[4]) : 0, n);

  REAL_TYPE *const a = (REAL_TYPE*)malloc(lda * n * sizeof(REAL_TYPE));
  REAL_TYPE *const b = (REAL_TYPE*)malloc(ldb * m * sizeof(REAL_TYPE));
  const unsigned int size = m * n * sizeof(REAL_TYPE);
  unsigned long long start;
  libxsmm_blasint i, j;
  double duration;

  fprintf(stdout, "m=%i n=%i lda=%i ldb=%i size=%.fMB (%s)\n", m, n, lda, ldb,
    1.0 * size / (1 << 20), 8 == sizeof(REAL_TYPE) ? "DP" : "SP");

  for (i = 0; i < n; ++i) {
    for (j = 0; j < m; ++j) {
      a[i*lda+j] = initial_value(i, j, lda);
    }
  }

  start = libxsmm_timer_tick();
  libxsmm_transpose_oop(b, a, sizeof(REAL_TYPE), m, n, lda, ldb);
  libxsmm_transpose_oop(a, b, sizeof(REAL_TYPE), n, m, ldb, lda);
  duration = libxsmm_timer_duration(start, libxsmm_timer_tick());

  for (i = 0; i < n; ++i) {
    for (j = 0; j < m; ++j) {
      if (0 < fabs(a[i*lda+j] - initial_value(i, j, lda))) {
        i = n + 1;
        break;
      }
    }
  }

  if (i <= n) {
    if (0 < duration) {
      fprintf(stdout, "\tbandwidth: %.1f GB/s\n", size / (duration * (1 << 30)));
    }
    fprintf(stdout, "\tduration: %.0f ms\n", 1000.0 * duration);
  }
  else {
    fprintf(stderr, "Validation failed!\n");
  }

#if defined(__MKL) || defined(MKL_DIRECT_CALL_SEQ) || defined(MKL_DIRECT_CALL)
  {
    double mkl_duration;
    start = libxsmm_timer_tick();
    LIBXSMM_CONCATENATE(mkl_, LIBXSMM_TPREFIX(REAL_TYPE, omatcopy))('C', 'T', m, n, 1, a, lda, b, ldb);
    LIBXSMM_CONCATENATE(mkl_, LIBXSMM_TPREFIX(REAL_TYPE, omatcopy))('C', 'T', n, m, 1, b, ldb, a, lda);
    mkl_duration = libxsmm_timer_duration(start, libxsmm_timer_tick());
    if (0 < mkl_duration) {
      fprintf(stdout, "\tMKL: %.1fx\n", duration / mkl_duration);
    }
  }
#endif

  free(a);
  free(b);

  return EXIT_SUCCESS;
}
Ejemplo n.º 8
0
Archivo: spmdm.c Proyecto: hfp/libxsmm
int main(int argc, char *argv[])
{
  REAL_TYPE *A_gold, *B_gold, *A_gold2, *B_gold2;
  float *C_gold, *C0_gold, *C, *C2;

  int M, N, K;
  REAL_TYPE alpha, beta;
  int reps;

  libxsmm_spmdm_handle handle, handle2;
  libxsmm_CSR_sparseslice *A_sparse, *A_sparse2;
  int max_threads;

  /* Step 1: Read in args */
  libxsmm_timer_tickint start, end;
  double flops, duration;
  char transA, transB, transC;
  int i, j, k;
  size_t l;

  /* Step 1: Initialize handle */
  M = 0; N = 0; K = 0; alpha = (REAL_TYPE)1.0; beta = (REAL_TYPE)0.0; reps = 0; transA = 'N'; transB = 'N';

  if (argc > 1 && !strncmp(argv[1], "-h", 3)) {
    printf("\nUsage: %s [M] [N] [K] [transA] [transB] [reps]\n\n", argv[0]);
    return EXIT_SUCCESS;
  }

  /* defaults */
  M = 2048;
  N = 2048;
  K = 2048;
  transA = 'N';
  transB = 'N';
  transC = 'N';
  reps = 100;

  /* reading new values from cli */
  i = 1;
  if (argc > i) M = atoi(argv[i++]);
  if (argc > i) N = atoi(argv[i++]);
  if (argc > i) K = atoi(argv[i++]);
  if (argc > i) { transA = argv[i][0]; i++; }
  if (argc > i) { transB = argv[i][0]; i++; }
  if (argc > i) { transC = argv[i][0]; i++; }
  if (argc > i) reps = atoi(argv[i++]);

  /* Step 2: allocate data */
  A_gold  = (REAL_TYPE*)libxsmm_aligned_malloc( M*K*sizeof(REAL_TYPE), 64 );
  B_gold  = (REAL_TYPE*)libxsmm_aligned_malloc( K*N*sizeof(REAL_TYPE), 64 );
  C_gold  = (float*)libxsmm_aligned_malloc( M*N*sizeof(float), 64 );
  C0_gold = (float*)libxsmm_aligned_malloc( M*N*sizeof(float), 64 );
  C       = (float*)libxsmm_aligned_malloc( M*N*sizeof(float), 64 );

  /* Step 3: init data */
  libxsmm_rng_set_seed(1);
  for (l = 0; l < (size_t)M * (size_t)K; ++l) {
    const double r64 = libxsmm_rng_f64();
    const float r32 = (float)r64;
#ifdef USE_BFLOAT
    const int r = *(const int*)(&r32);
    const libxsmm_bfloat16 val = (r >> 16);
#else
    const float val = r32;
#endif
    if (r64 > 0.85) A_gold[l] = val;
    else              A_gold[l] = (REAL_TYPE)0.0;
  }

  for (l = 0; l < (size_t)K * (size_t)N; ++l) {
    const double r64 = libxsmm_rng_f64();
    const float r32 = (float)r64;
#ifdef USE_BFLOAT
    const int r = *(const int*)(&r32);
    const libxsmm_bfloat16 val = (r >> 16);
#else
    const float val = r32;
#endif
    B_gold[l] = val;
  }
  for (l = 0; l < (size_t)M * (size_t)N; ++l) {
    C0_gold[l] = (float)libxsmm_rng_f64();
    C_gold[l] = C0_gold[l];
  }
  for (l = 0; l < (size_t)M * (size_t)N; ++l) {
    C[l] = (float)C0_gold[l];
  }
  flops = (double)M * (double)N * (double)K * 2.0;

  /*----------------------------------------------------------------------------------------------------------------------*/
  /* Step 4: Initialize LIBXSMM for these sizes - allocates handle and temporary space for the sparse data structure for A */
# if defined(_OPENMP)
  max_threads = omp_get_max_threads();
# else
  max_threads = 1;
# endif

  start = libxsmm_timer_tick();
  libxsmm_spmdm_init(M, N, K, max_threads, &handle, &A_sparse);
  end = libxsmm_timer_tick();
  printf("Time for handle init = %f\n", libxsmm_timer_duration(start, end));

  printf(" running with: M=%i, N=%i, K=%i, bm=%i, bn=%i, bk=%i, mb=%i, nb=%i, kb=%i, reps=%i -- forward pass\n", M, N, K, handle.bm, handle.bn, handle.bk, handle.mb, handle.nb, handle.kb, reps );
  /* The overall function that takes in matrix inputs in dense format, does the conversion of A to sparse format and does the matrix multiply */
  /* Currently ignores alpha */
  /* TODO: fix alpha input */
# ifdef USE_BFLOAT
  spmdm_exec_bfloat16(&handle, transA, transB, &alpha, A_gold, B_gold, transC, &beta, C, A_sparse);
# else
  spmdm_exec_fp32(&handle, transA, transB, &alpha, A_gold, B_gold, transC, &beta, C, A_sparse);
# endif

  /* Checks */

  /* Compute a "gold" answer sequentially */
#if defined(_OPENMP)
  LIBXSMM_OMP_VAR(k);
# pragma omp parallel for private(i, j, k) LIBXSMM_OPENMP_COLLAPSE(2)
#endif
  for (i = 0; i < M; ++i) {
    for (j = 0; j < N; ++j) {
      float sum = 0.0;
      float Cval;
      for (k = 0; k < K; ++k) {
#       ifdef USE_BFLOAT
        libxsmm_bfloat16 Atmp = A_gold[i*K+k];
        int Atmp_int  = Atmp; Atmp_int <<= 16;
        float Aval = *(float *)&Atmp_int;
        libxsmm_bfloat16 Btmp = B_gold[k*N+j];
        int Btmp_int  = Btmp; Btmp_int <<= 16;
        float Bval = *(float *)&Btmp_int;
#       else
        float Aval = A_gold[i*K + k];
        float Bval = B_gold[k*N + j];
#       endif
        sum += Aval * Bval;
      }
      Cval = sum;
      C_gold[i*N + j] = Cval + beta*C_gold[i*N + j];
    }
  }
  /* LIBXSMM_FSYMBOL(sgemm)(&trans, &trans, &N, &M, &K, &alpha, B_gold, &N, A_gold, &K, &beta, C_gold, &N); */

  /* Compute the max difference between gold and computed results. */
  spmdm_check_c( &handle, C, C_gold );

  /* Timing loop starts */
  start = libxsmm_timer_tick();
  for (i = 0; i < reps; ++i) {
#   ifdef USE_BFLOAT
    spmdm_exec_bfloat16( &handle, transA, transB, &alpha, A_gold, B_gold, transC, &beta, C, A_sparse);
#   else
    spmdm_exec_fp32( &handle, transA, transB, &alpha, A_gold, B_gold, transC, &beta, C, A_sparse);
#   endif
  }
  end = libxsmm_timer_tick();
  duration = libxsmm_timer_duration(start, end);
  printf("Time = %f Time/rep = %f, TFlops/s = %f\n", duration, duration*1.0/reps, flops/1000./1000./1000./1000./duration*reps);
  libxsmm_spmdm_destroy(&handle);

  /*----------------------------------------------------------------------------------------------------------------------*/
  /* Step 5: Initialize libxsmm for transpose A - allocates handle and temporary space for the sparse data structure for A */
  transA = 'T'; transB = 'N'; transC = 'T';
  libxsmm_spmdm_init(M, N, K, max_threads, &handle2, &A_sparse2);
  printf(" running with: M=%i, N=%i, K=%i, bm=%i, bn=%i, bk=%i, mb=%i, nb=%i, kb=%i, reps=%i, transA = Y, transC = Y -- weight update\n", handle2.m, handle2.n, handle2.k, handle2.bm, handle2.bn, handle2.bk, handle2.mb, handle2.nb, handle2.kb, reps );
  A_gold2 = (REAL_TYPE*)libxsmm_aligned_malloc( M*K*sizeof(REAL_TYPE), 64 );
  C2 = (float*)libxsmm_aligned_malloc( M*N*sizeof(float), 64 );

  for (i = 0; i < M; ++i) {
    for (j = 0; j < K; ++j) {
      A_gold2[j*M + i] = A_gold[i*K + j];
    }
  }
  for (i = 0; i < M; ++i) {
    for (j = 0; j < N; ++j) {
      C[j*M + i] = (float)C0_gold[i*N + j];
    }
  }
  /* The overall function that takes in matrix inputs in dense format, does the conversion of A to sparse format and does the matrix multiply */
  /* Currently ignores alpha */
  /* TODO: fix alpha inputs */
# ifdef USE_BFLOAT
  spmdm_exec_bfloat16( &handle2, transA, transB, &alpha, A_gold2, B_gold, transC, &beta, C, A_sparse2);
# else
  spmdm_exec_fp32( &handle2, transA, transB, &alpha, A_gold2, B_gold, transC, &beta, C, A_sparse2);
# endif

  for (i = 0; i < M; ++i) {
    for (j = 0; j < N; ++j) {
      C2[i*N + j] = C[j*M + i];
    }
  }
  /* Checks */
  spmdm_check_c( &handle2, C2, C_gold);

  /* Timing loop starts */
  start = libxsmm_timer_tick();
  for (i = 0; i < reps; ++i) {
#   ifdef USE_BFLOAT
    spmdm_exec_bfloat16( &handle2, transA, transB, &alpha, A_gold2, B_gold, transC, &beta, C, A_sparse2);
#   else
    spmdm_exec_fp32( &handle2, transA, transB, &alpha, A_gold2, B_gold, transC, &beta, C, A_sparse2);
#   endif
  }
  end = libxsmm_timer_tick();
  duration = libxsmm_timer_duration(start, end);
  printf("Time = %f Time/rep = %f, TFlops/s = %f\n", duration, duration*1.0/reps, flops/1000./1000./1000./1000./duration*reps);

  /*----------------------------------------------------------------------------------------------------------------------*/
  /* Step 6: Test transpose B */
  transA = 'N'; transB = 'T'; transC = 'N';
  printf(" running with: M=%i, N=%i, K=%i, bm=%i, bn=%i, bk=%i, mb=%i, nb=%i, kb=%i, reps=%i, transB = Y -- backprop\n", handle2.m, handle2.n, handle2.k, handle2.bm, handle2.bn, handle2.bk, handle2.mb, handle2.nb, handle2.kb, reps );
  B_gold2 = (REAL_TYPE*)libxsmm_aligned_malloc( K*N*sizeof(REAL_TYPE), 64 );

  for (i = 0; i < K; ++i) {
    for (j = 0; j < N; ++j) {
      B_gold2[j*K + i] = B_gold[i*N + j];
    }
  }
  for (l = 0; l < (size_t)M * (size_t)N; ++l) {
    C[l] = (float)C0_gold[l];
  }
  /* The overall function that takes in matrix inputs in dense format, does the conversion of A to sparse format and does the matrix multiply */
  /* Currently ignores alpha */
  /* TODO: fix alpha inputs */
# ifdef USE_BFLOAT
  spmdm_exec_bfloat16( &handle2, transA, transB, &alpha, A_gold, B_gold2, transC, &beta, C, A_sparse2);
# else
  spmdm_exec_fp32( &handle2, transA, transB, &alpha, A_gold, B_gold2, transC, &beta, C, A_sparse2);
# endif

  /* Checks */
  spmdm_check_c( &handle2, C, C_gold);

  /* Timing loop starts */
  start = libxsmm_timer_tick();
  for (i = 0; i < reps; ++i) {
#   ifdef USE_BFLOAT
    spmdm_exec_bfloat16( &handle2, transA, transB, &alpha, A_gold, B_gold2, transC, &beta, C, A_sparse2);
#   else
    spmdm_exec_fp32( &handle2, transA, transB, &alpha, A_gold, B_gold2, transC, &beta, C, A_sparse2);
#   endif
  }
  end = libxsmm_timer_tick();
  duration = libxsmm_timer_duration(start, end);
  printf("Time = %f Time/rep = %f, TFlops/s = %f\n", duration, duration*1.0/reps, flops/1000./1000./1000./1000./duration*reps);
  libxsmm_spmdm_destroy(&handle2);

  libxsmm_free(A_gold);
  libxsmm_free(B_gold);
  libxsmm_free(C_gold);
  libxsmm_free(C);
  libxsmm_free(C2);
  libxsmm_free(C0_gold);
  libxsmm_free(B_gold2);
  libxsmm_free(A_gold2);

  return EXIT_SUCCESS;
}
Ejemplo n.º 9
0
Archivo: diff.c Proyecto: hfp/libxsmm
int main(int argc, char* argv[])
{
  const int insize = (1 < argc ? atoi(argv[1]) : 0);
  const int incrmt = (2 < argc ? atoi(argv[2]) : 0);
  const int nelems = (3 < argc ? atoi(argv[3]) : 0);
  const int niters = (4 < argc ? atoi(argv[4]) : 1);
  const int elsize = (0 >= insize ? LIBXSMM_DESCRIPTOR_SIGSIZE : insize);
  const int stride = (0 >= incrmt ? LIBXSMM_MAX(LIBXSMM_DESCRIPTOR_MAXSIZE, elsize) : LIBXSMM_MAX(incrmt, elsize));
  const size_t n = (0 >= nelems ? (((size_t)2 << 30/*2 GB*/) / stride) : ((size_t)nelems));
  unsigned char *input, *icopy = NULL, *ilast = NULL;
  int result = EXIT_SUCCESS;
  size_t nbytes, size, nrpt;

  if (0 < niters) {
    size = n;
    nrpt = niters;
  }
  else {
    size = LIBXSMM_MAX(LIBXSMM_ABS(niters), 1);
    nrpt = n;
  }
  nbytes = size * stride;
  input = (unsigned char*)(0 != nbytes ? malloc(nbytes) : NULL);

  if (NULL != input) {
    unsigned char *const ref = input + (size - 1) * stride; /* last item */
    libxsmm_timer_tickint start;
    size_t i, j = 0;

    /* initialize the input data */
    for (i = 0; i < nbytes; ++i) input[i] = LIBXSMM_MOD2(i, 128);
    for (i = 0; i < (size_t)elsize; ++i) ref[i] = 255;

    { /* benchmark libxsmm_diff_n */
#if defined(USE_HASH)
      const unsigned int hashref = libxsmm_hash(ref, elsize, 0/*seed*/);
#endif
      start = libxsmm_timer_tick();
      for (i = 0; i < nrpt; ++i) {
#if !defined(USE_HASH)
        j = libxsmm_diff_n(ref, input, (unsigned char)elsize, (unsigned char)stride,
          (unsigned int)LIBXSMM_MIN(i, size)/*hint*/, (unsigned int)size);
#else
        const unsigned char* tst = input;
        for (j = 0; j < size; ++j) {
          const unsigned int hashtst = libxsmm_hash(tst, elsize, 0/*seed*/);
          if (hashref == hashtst && 0 == libxsmm_diff(ref, tst, (unsigned char)elsize)) {
            break;
          }
          tst += stride;
        }
#endif
      }
      printf("libxsmm_diff_n:\t\t%.8f s\n", libxsmm_timer_duration(start, libxsmm_timer_tick()));
    }

    if (size == (j + 1) && 0 == memcmp(ref, input + j * stride, elsize)) { /* benchmark libxsmm_memcmp */
      icopy = (unsigned char*)(elsize == stride ? malloc(nbytes) : NULL);
      if (NULL != icopy) {
        ilast = icopy + (size - 1) * stride; /* last item */
        memcpy(icopy, input, nbytes);
        start = libxsmm_timer_tick();
        for (i = 0; i < nrpt; ++i) {
          j += libxsmm_memcmp(input, icopy, nbytes); /* take result of every execution */
          /* memcmp may be pure and without touching input it is not repeated (nrpt) */
          ilast[i%elsize] = 255;
        }
        printf("libxsmm_memcmp:\t\t%.8f s\n", libxsmm_timer_duration(start, libxsmm_timer_tick()));
        result += (int)j * ((int)stride / ((int)stride + 1)); /* ignore result */
      }
    }
    else {
      result = EXIT_FAILURE;
    }

    if (NULL != icopy) { /* benchmark stdlib's memcmp */
      LIBXSMM_ASSERT(NULL != ilast);
      start = libxsmm_timer_tick();
      for (i = 0; i < nrpt; ++i) {
        j += memcmp(input, icopy, nbytes); /* take result of every execution */
        /* memcmp is likely pure and without touching input it is not repeated (nrpt) */
        ilast[i%elsize] = 255;
      }
      printf("stdlib memcmp:\t\t%.8f s\n", libxsmm_timer_duration(start, libxsmm_timer_tick()));
      result += (int)j * ((int)stride / ((int)stride + 1)); /* ignore result */
      free(icopy);
    }

    free(input);
  }
  else {
    result = EXIT_FAILURE;
  }

  return result;
}
Ejemplo n.º 10
0
int edge::reproducers::local( unsigned int const i_nSteps, unsigned int const i_nElements ) {

  double                                  l_dT = 0.000001;
  unsigned int                            l_nSteps = i_nSteps;
  unsigned int                            l_nElements = i_nElements;
  t_elementChars                        * l_elChars; /* zero initialization */
  t_dg                                    l_dg;
  t_matStar                            (* l_starM)[N_DIM];
  t_fluxSolver                         (* l_fluxSolvers)[ C_ENT[T_SDISC.ELEMENT].N_FACES ];
  real_base                            (* l_dofs)[N_QUANTITIES][N_ELEMENT_MODES][N_CRUNS];
  real_base                            (* l_tInt)[N_QUANTITIES][N_ELEMENT_MODES][N_CRUNS];
  edge::io::Receivers                     l_recvs;
  edge::data::MmXsmmFused< real_base >    l_mm;
  unsigned int                            l_dummyUInt;
  double                                  l_dummyDouble;


  // 1. Set up structures
  setupDg( l_dg );
  setupKernel( l_mm );

  setupStarM( l_nElements, &l_starM );
  setupFluxSolv( l_nElements, &l_fluxSolvers );

  setupTensor( l_nElements, &l_dofs, &l_tInt );
#ifdef PP_USE_OMP
  #pragma omp parallel
  #pragma omp critical
#endif
  setupScratchMem( &(edge::parallel::g_scratchMem) );

  setupPseudoMesh( edge::reproducers::C_MODE_LOCAL, l_nElements,
                   &l_elChars,
                   nullptr, nullptr, nullptr, nullptr, nullptr );


  // 2. Run solvers
  std::cout << "Runing solvers" << std::endl;
  unsigned long long l_start = libxsmm_timer_tick();
#ifdef PP_USE_OMP
  #pragma omp parallel firstprivate( l_nSteps, l_nElements, l_dT )  \
                       firstprivate( l_elChars )                    \
                       firstprivate( l_dg, l_starM, l_fluxSolvers ) \
                       firstprivate( l_dofs, l_tInt  )              \
                       firstprivate( l_mm )                         \
                       private( l_recvs, l_dummyUInt, l_dummyDouble )
#endif
  {
    const unsigned int l_nThreads = omp_get_num_threads();
    const unsigned int l_tid = omp_get_thread_num();
    unsigned int l_firstEl = (unsigned int)((l_nElements + l_nThreads - 1) / l_nThreads) * l_tid;
    unsigned int l_lastEl = (unsigned int)((l_nElements + l_nThreads - 1) / l_nThreads) * (l_tid + 1);
    l_lastEl = std::min(l_lastEl, l_nElements);
    unsigned int l_numEl = l_lastEl - l_firstEl;

    for ( unsigned int l_step = 0; l_step < l_nSteps; l_step++ ) {
      edge::elastic::solvers::AderDg::local< unsigned int,
                                             real_base,
                                             edge::data::MmXsmmFused< real_base > >
                                           ( l_firstEl,
                                             l_numEl,
                                             l_dummyDouble,
                                             l_dT,
                                             l_dummyUInt,
                                             l_dummyUInt,
                                             nullptr,
                                             nullptr,
                                             l_elChars,
                                             l_dg,
                                             l_starM,
                                             l_fluxSolvers,
                                             l_dofs,
                                             l_tInt,
                                             nullptr,
                                             l_recvs,
                                             l_mm           );
#ifdef PP_USE_OMP
      #pragma omp barrier
#endif
    }
  }
  unsigned long long l_end = libxsmm_timer_tick();

  // 3. Print statistics
  double l_time = libxsmm_timer_duration(l_start, l_end);
  unsigned int l_local_flops[] =
  {
    792, 3564, 11412, 31500, 77184, 173538, 360522
  };
  unsigned long long l_flops = (unsigned long long)l_local_flops[ORDER-1] * PP_N_CRUNS * \
                               l_nElements * l_nSteps;
  double l_gflops = (double)l_flops / (l_time * 1000000000);
  std::cout << "Elapsed time: " << l_time << " s" << std::endl;
  std::cout << "Performance:  " << l_gflops << " GFLOPS" << std::endl;
  std::cout << std::endl;

#ifdef PP_REPRODUCER_DUMP
  std::string l_dumpFileName1 = "./local_o"+std::to_string(ORDER)+"_"
                                "f"+std::to_string(PP_PRECISION)+"_"
                                "el"+std::to_string(l_nElements)+"_"
                                "stp"+std::to_string(l_nSteps)+"_dofs.log";
  std::string l_dumpFileName2 = "./local_o"+std::to_string(ORDER)+"_"
                                "f"+std::to_string(PP_PRECISION)+"_"
                                "el"+std::to_string(l_nElements)+"_"
                                "stp"+std::to_string(l_nSteps)+"_tInt.log";
  std::ofstream l_fp1( l_dumpFileName1 );
  std::ofstream l_fp2( l_dumpFileName2 );
  for ( unsigned int l_el = 0; l_el < l_nElements; l_el++ ) {
    for ( unsigned int l_qt = 0; l_qt < N_QUANTITIES; l_qt++ ) {
      for ( unsigned int l_md = 0; l_md < N_ELEMENT_MODES; l_md++ ) {
        for ( unsigned int l_cfr = 0; l_cfr < N_CRUNS; l_cfr++ ) {
          l_fp1 << l_dofs[l_el][l_qt][l_md][l_cfr] << "\n";
          l_fp2 << l_tInt[l_el][l_qt][l_md][l_cfr] << "\n";
        }
      }
    }
  }
#endif


  // 4. Clean up
  cleanupDg( l_dg );
  cleanupStarM( l_starM );
  cleanupFluxSolv( l_fluxSolvers );
  cleanupTensor( l_dofs, l_tInt );

#ifdef PP_USE_OMP
  #pragma omp parallel
  #pragma omp critical
#endif
  cleanupScratchMem( edge::parallel::g_scratchMem );

  cleanupPseudoMesh( edge::reproducers::C_MODE_LOCAL,
                     l_elChars,
                     nullptr, nullptr, nullptr, nullptr, nullptr );

  return 0;
}
Ejemplo n.º 11
0
int main(int argc, char* argv[])
{
  const int ncalls = 1000000;
#if defined(_OPENMP)
  const int max_nthreads = omp_get_max_threads();
#else
  const int max_nthreads = 1;
#endif
  const int ncycles = LIBXSMM_MAX(1 < argc ? atoi(argv[1]) : 100, 1);
  const int max_nallocs = LIBXSMM_CLMP(2 < argc ? atoi(argv[2]) : 4, 1, MAX_MALLOC_N);
  const int nthreads = LIBXSMM_CLMP(3 < argc ? atoi(argv[3]) : 1, 1, max_nthreads);
  unsigned int nallocs = 0, nerrors = 0;
  int r[MAX_MALLOC_N], i;

  /* generate set of random number for parallel region */
  for (i = 0; i < (MAX_MALLOC_N); ++i) r[i] = rand();

  /* count number of calls according to randomized scheme */
  for (i = 0; i < ncycles; ++i) {
    nallocs += r[i%(MAX_MALLOC_N)] % max_nallocs + 1;
  }
  assert(0 != nallocs);

  fprintf(stdout, "Running %i cycles with max. %i malloc+free (%u calls) using %i thread%s...\n",
    ncycles, max_nallocs, nallocs, 1 >= nthreads ? 1 : nthreads, 1 >= nthreads ? "" : "s");

#if defined(LIBXSMM_OFFLOAD_TARGET)
# pragma offload target(LIBXSMM_OFFLOAD_TARGET)
#endif
  {
    const char *const longlife_env = getenv("LONGLIFE");
    const int enable_longlife = ((0 == longlife_env || 0 == *longlife_env) ? 0 : atoi(longlife_env));
    void *const longlife = (0 == enable_longlife ? 0 : malloc_offsite((MAX_MALLOC_MB) << 20));
    unsigned long long d0, d1 = 0;
    libxsmm_scratch_info info;

    /* run non-inline function to measure call overhead of an "empty" function */
    const unsigned long long t0 = libxsmm_timer_tick();
    for (i = 0; i < ncalls; ++i) {
      libxsmm_init(); /* subsequent calls are not doing any work */
    }
    d0 = libxsmm_timer_diff(t0, libxsmm_timer_tick());

#if defined(_OPENMP)
#   pragma omp parallel for num_threads(nthreads) private(i) default(none) shared(r) reduction(+:d1,nerrors)
#endif
    for (i = 0; i < ncycles; ++i) {
      const int count = r[i%(MAX_MALLOC_N)] % max_nallocs + 1;
      void* p[MAX_MALLOC_N];
      int j;
      assert(count <= MAX_MALLOC_N);
      for (j = 0; j < count; ++j) {
        const int k = (i * count + j) % (MAX_MALLOC_N);
        const size_t nbytes = (r[k] % (MAX_MALLOC_MB) + 1) << 20;
        const unsigned long long t1 = libxsmm_timer_tick();
        p[j] = libxsmm_aligned_scratch(nbytes, 0/*auto*/);
        d1 += libxsmm_timer_diff(t1, libxsmm_timer_tick());
        if (0 != p[j]) {
          memset(p[j], j, nbytes);
        }
        else {
          ++nerrors;
        }
      }
      for (j = 0; j < count; ++j) {
        libxsmm_free(p[j]);
      }
    }
    libxsmm_free(longlife);

    if (0 != d0 && 0 != d1 && 0 < nallocs) {
      const double dcalls = libxsmm_timer_duration(0, d0);
      const double dalloc = libxsmm_timer_duration(0, d1);
      const double alloc_freq = 1E-3 * nallocs / dalloc;
      const double empty_freq = 1E-3 * ncalls / dcalls;
      fprintf(stdout, "\tallocation+free calls/s: %.1f kHz\n", alloc_freq);
      fprintf(stdout, "\tempty calls/s: %.1f MHz\n", 1E-3 * empty_freq);
      fprintf(stdout, "\toverhead: %.1fx\n", empty_freq / alloc_freq);
    }

    if (EXIT_SUCCESS == libxsmm_get_scratch_info(&info) && 0 < info.size) {
      fprintf(stdout, "\nScratch: %.f MB (mallocs=%lu, pools=%u",
        1.0 * info.size / (1 << 20), (unsigned long int)info.nmallocs, info.npools);
      if (1 < nthreads) fprintf(stdout, ", threads=%i)\n", nthreads); else fprintf(stdout, ")\n");
      libxsmm_release_scratch(); /* suppress LIBXSMM's termination message about scratch */
    }
  }

  if (0 == nerrors) {
    fprintf(stdout, "Finished\n");
    return EXIT_SUCCESS;
  }
  else {
    fprintf(stdout, "FAILED (%u errors)\n", nerrors);
    return EXIT_FAILURE;
  }
}
Ejemplo n.º 12
0
int main(int argc, char* argv[])
{
  unsigned int m=8, n=8, lda=8, ldb=8, nerrs, num, nmat, nmats, nmatd, ntest;
  unsigned int layout, asize, VLEND=4, VLENS=8, bsize;
  unsigned int ncorr;
  int i, j;
  char side, uplo, trans, diag;
  unsigned int typesize8 = 8;
  unsigned int typesize4 = 4;
  float  *sa, *sb, *sc, *sd;
  double *da, *db, *dc, *dd, *tmpbuf;
  double dalpha = 1.0;
  float  salpha;
  double dtmp;
  const unsigned char *cptr;
  unsigned long op_count;
  const libxsmm_trsm_descriptor* desc8 = NULL;
  const libxsmm_trsm_descriptor* desc4 = NULL;
  libxsmm_descriptor_blob blob;
  union {
    libxsmm_xtrsmfunction dp;
    libxsmm_xtrsmfunction sp;
    const void* pv;
  } mykernel = { 0 };
#ifdef USE_KERNEL_GENERATION_DIRECTLY
  void (*opcode_routine)();
#endif
#ifdef USE_KERNEL_GENERATION_DIRECTLY
  #include <unistd.h>
  #include <signal.h>
  #include <malloc.h>
  #include <sys/mman.h>
  #include "../../src/generator_packed_trsm_avx_avx512.h"
  unsigned char *routine_output;
  libxsmm_generated_code io_generated_code;
  int pagesize = sysconf(_SC_PAGE_SIZE);
  if (pagesize == -1) fprintf(stderr,"sysconf pagesize\n");
  routine_output = (unsigned char *) mmap(NULL,
                      BUFSIZE2, PROT_READ|PROT_WRITE,
                      MAP_PRIVATE|MAP_ANONYMOUS, 0,0);
  if (mprotect(routine_output, BUFSIZE2,
                PROT_EXEC | PROT_READ | PROT_WRITE ) == -1)
      fprintf(stderr,"mprotect\n");
  printf("Routine ready\n");
  io_generated_code.generated_code = &routine_output[0];
  io_generated_code.buffer_size = BUFSIZE2;
  io_generated_code.code_size = 0;
  io_generated_code.code_type = 2;
  io_generated_code.last_error = 0;
#endif

  if ( argc <= 3 )
  {
     printf("\nUSAGE: %s m n lda ldb nmat side uplo trans diag layout ntest alpha\n",argv[0]);
     printf("Compact TRSM a mxn matrix of leading dimension ldb\n");
     printf("This will test the jit of 1 VLEN work of nmat at a time\n");
     printf("Defaults: m=n=lda=ldb=nmat=8, alpha=1.0, side=uplo='L',trans=diag='N',layout=102,ntest=1\n");
  }
  if ( argc > 1 ) m = atoi(argv[1]); else m = 8;
  if ( argc > 2 ) n = atoi(argv[2]); else n = 8;
  if ( argc > 3 ) lda= atoi(argv[3]); else lda = 8;
  if ( argc > 4 ) ldb = atoi(argv[4]); else ldb = 8;
  if ( argc > 5 ) nmat = atoi(argv[5]); else nmat = 8;
  if ( argc > 6 ) side = argv[6][0]; else side = 'L';
  if ( argc > 7 ) uplo = argv[7][0]; else uplo = 'L';
  if ( argc > 8 ) trans = argv[8][0]; else trans = 'N';
  if ( argc > 9 ) diag = argv[9][0]; else diag = 'N';
  if ( argc > 10 ) layout = atoi(argv[10]); else layout=102;
  if ( argc > 11 ) ntest = atoi(argv[11]); else ntest = 1;
  if ( argc > 12 ) dalpha = atof(argv[12]); else dalpha = 1.0;
  salpha = (float)dalpha;

  m = LIBXSMM_MAX(m,1);
  n = LIBXSMM_MAX(n,1);
  /* A is either mxm or nxn depending on side */
  if ( (side == 'L') || (side=='l') ) asize = m; else asize = n;

  lda = LIBXSMM_MAX(lda,asize);
  if ( layout == 102 )
  {
      /* Column major: B is mxn, and stored in B format */
      ldb = LIBXSMM_MAX(ldb,m);
      bsize = ldb*n;
  } else {
      /* Row major: B is mxn, and stored in B^T format */
      ldb = LIBXSMM_MAX(ldb,n);
      bsize = ldb*m;
  }
  nmats = LIBXSMM_MAX(VLENS,nmat - (nmat%VLENS));
  nmatd = LIBXSMM_MAX(VLEND,nmat - (nmat%VLEND));
  nmat = LIBXSMM_MAX(nmats,nmatd);

  op_count = n * m * asize;

  printf("This is a real*%u tester for JIT compact TRSM kernels! (%c%c%c%c m=%u n=%u lda=%u ldb=%u layout=%u nmat=%u)\n",typesize8,side,uplo,trans,diag,m,n,lda,ldb,layout,nmat);
#ifdef USE_XSMM_GENERATED
  printf("This code tests the LIBXSMM generated kernels\n");
#endif
#ifdef USE_PREDEFINED_ASSEMBLY
  printf("This code tests some predefined assembly kenrel\n");
#endif
#ifdef USE_KERNEL_GENERATION_DIRECTLY
  printf("This code tests kernel generation directly\n");
#endif
#ifdef TIME_MKL
  printf("This code tests MKL compact batch directly\n");
#endif

  desc8 = libxsmm_trsm_descriptor_init(&blob, typesize8, m, n, lda, ldb, &dalpha, trans, diag, side, uplo, layout);
  desc4 = libxsmm_trsm_descriptor_init(&blob, typesize4, m, n, lda, ldb, &salpha, trans, diag, side, uplo, layout);
#ifdef USE_XSMM_GENERATED
  printf("calling libxsmm_dispatch_trsm: typesize8=%u\n",typesize8);
  mykernel.dp = libxsmm_dispatch_trsm(desc8);
  printf("done calling libxsmm_dispatch_trsm: typesize8=%u\n",typesize8);
  if ( mykernel.dp == NULL ) printf("R8 Kernel after the create call is null\n");
  mykernel.sp = libxsmm_dispatch_trsm(desc4);
  if ( mykernel.sp == NULL ) printf("R4 kernel after the create call is null\n");
#endif

#ifdef USE_KERNEL_GENERATION_DIRECTLY
  libxsmm_generator_trsm_kernel ( &io_generated_code, &desc8, "hsw" );
#endif

#ifndef NO_ACCURACY_CHECK
  printf("mallocing matrices\n");
#endif
  sa  = (float  *) malloc ( lda*asize*nmats*sizeof(float) );
  da  = (double *) malloc ( lda*asize*nmatd*sizeof(double) );
  sb  = (float  *) malloc ( bsize*nmats*sizeof(float) );
  db  = (double *) malloc ( bsize*nmatd*sizeof(double) );
  sc  = (float  *) malloc ( bsize*nmats*sizeof(float) );
  dc  = (double *) malloc ( bsize*nmatd*sizeof(double) );
  sd  = (float  *) malloc ( bsize*nmats*sizeof(float) );
  dd  = (double *) malloc ( bsize*nmatd*sizeof(double) );
  tmpbuf = (double *) malloc ( asize*VLEND*sizeof(double) );

#ifndef NO_ACCURACY_CHECK
  printf("filling matrices\n");
#endif
  sfill_matrix ( sa, lda, asize, asize*nmats );
#ifdef TRIANGLE_IS_IDENTITY
  printf("Warning: setting triangular matrix to identity. Not good for accuracy testing\n");
  dfill_identity ( da, lda, asize, asize, VLEND, nmatd/VLEND );
#else
  dfill_matrix ( da, lda, asize, asize*nmatd );
#endif
  sfill_matrix ( sb, bsize, bsize, nmats );
  dfill_matrix ( db, bsize, bsize, nmatd );

#ifndef NO_ACCURACY_CHECK
  for ( i = 0 ; i < (int)(bsize*nmats) ; i++ ) sc[i]=sb[i];
  for ( i = 0 ; i < (int)(bsize*nmatd) ; i++ ) dc[i]=db[i];
  for ( i = 0 ; i < (int)(bsize*nmats) ; i++ ) sd[i]=sb[i];
  for ( i = 0 ; i < (int)(bsize*nmatd) ; i++ ) dd[i]=db[i];
  printf("Pointing at the kernel now\n");
#endif

#ifdef USE_XSMM_GENERATED
  cptr = (const unsigned char*) mykernel.pv;
#endif
#ifdef USE_PREDEFINED_ASSEMBLY
  cptr = (const unsigned char*) trsm_xct_;
#endif
#ifdef USE_KERNEL_GENERATION_DIRECTLY
  cptr = (const unsigned char*) &routine_output[0];
  opcode_routine = (void *) &cptr[0];
#endif

#ifndef TIME_MKL
  #define DUMP_ASSEMBLY_FILE
#endif

#ifdef DUMP_ASSEMBLY_FILE
  printf("Dumping assembly file\n");
  FILE *fp = fopen("foo.s","w");
  char buffer[80];
  fputs("\t.text\n",fp);
  fputs("\t.align 256\n",fp);
  fputs("\t.globl trsm_xct_\n",fp);
  fputs("trsm_xct_:\n",fp);
  for (i = 0 ; i < 4000; i+=4 )
  {
     sprintf(buffer,".byte 0x%02x, 0x%02x, 0x%02x, 0x%02x\n",cptr[i],cptr[i+1],cptr[i+2],cptr[i+3]);
     fputs(buffer,fp);
  }
  fputs("\tretq\n",fp);
  fputs("\t.type trsm_xct_,@function\n",fp);
  fputs("\t.size trsm_xct_,.-trsm_xct_\n",fp);
  fclose(fp);
#endif

#if defined(USE_MKL_FOR_REFERENCE) || defined(TIME_MKL)
  #include "mkl.h"
  MKL_LAYOUT CLAYOUT = (layout == 101) ? MKL_ROW_MAJOR : MKL_COL_MAJOR;
  MKL_SIDE SIDE = (side == 'R' || side == 'r') ? MKL_RIGHT : MKL_LEFT;
  MKL_UPLO UPLO = (uplo == 'U' || uplo == 'u') ? MKL_UPPER : MKL_LOWER;
  MKL_TRANSPOSE TRANSA = (trans == 'N' || trans == 'n') ? MKL_NOTRANS : MKL_TRANS;
  MKL_DIAG DIAG = (diag == 'N' || diag == 'n') ? MKL_NONUNIT : MKL_UNIT;
  MKL_COMPACT_PACK CMP_FORMAT = mkl_get_format_compact();
#if 0
  MKL_COMPACT_PACK CMP_FORMAT = MKL_COMPACT_AVX;
#endif
#endif

#ifndef NO_ACCURACY_CHECK
  printf("Before routine, initial B(1,1)=%g B[256]=%g\n",db[0],db[256]);
#endif
#ifdef USE_PREDEFINED_ASSEMBLY
  double one = 1.0;
#endif
  double timer;
#ifdef MKL_TIMER
  double tmptimer;
  tmptimer = dsecnd_();
#else
  unsigned long long l_start, l_end;
#endif

  timer = 0.0;
  for ( j = 0 ; j < (int)ntest ; j++ )
  {
#ifndef TRIANGLE_IS_IDENTITY
  for ( i = 0 ; i < (int)(bsize*nmatd) ; i++ ) db[i]=dd[i];
#endif
  for ( i = 0 , num = 0; i < (int)nmatd ; i+= (int)VLEND, num++ )
  {
     double *Ap = &da[num*lda*asize*VLEND];
     double *Bp = &db[num*bsize*VLEND];
#ifdef MKL_TIMER
     tmptimer = dsecnd_();
#else
     l_start = libxsmm_timer_tick();
#endif

#ifdef USE_XSMM_GENERATED
     mykernel.dp ( Ap, Bp, tmpbuf );
#endif
#ifdef USE_PREDEFINED_ASSEMBLY
     trsm_xct_ ( Ap, Bp, &one );
#endif
#ifdef USE_KERNEL_GENERATION_DIRECTLY
     (*opcode_routine)( Ap, Bp );
#endif
#ifdef TIME_MKL
     mkl_dtrsm_compact ( CLAYOUT, SIDE, UPLO, TRANSA, DIAG, m, n, dalpha, da, lda, db, ldb, CMP_FORMAT, nmatd );
     i+=nmatd; /* Because MKL will do everything */
#endif
#ifdef MKL_TIMER
     timer += dsecnd_() - tmptimer;
#else
     l_end = libxsmm_timer_tick();
     timer += libxsmm_timer_duration(l_start,l_end);
#endif
  }
  }
  timer /= ((double)ntest);

#ifndef NO_ACCURACY_CHECK
  printf("Average time to get through %u matrices: %g\n",nmatd,timer);
  printf("Gflops: %g\n",(double)(op_count*nmatd)/(timer*1.0e9));
  printf("after routine, new      B(1,1)=%g B[256]=%g\n",db[0],db[256]);
#endif

#ifdef TEST_SINGLE
  printf("Before r4 routine, initial B(1,1)=%g B[256]=%g\n",sb[0],sb[256]);
  for ( i = 0 , num = 0; i < nmats ; i+= VLENS, num++ )
  {
     float *Ap = &sa[num*lda*asize*VLENS];
     float *Bp = &sb[num*bsize*VLENS];
#ifdef USE_XSMM_GENERATED
     mykernel.sp ( Ap, Bp, NULL );
#endif
  }
  printf("after r4 routine, new      B(1,1)=%g B]256]=%g\n",db[0],db[256]);
#endif

#ifndef NO_ACCURACY_CHECK
  /* Call some reference code now on a copy of the B matrix (C) */
  double timer2 = 0.0;
  for ( j = 0 ; j < (int)ntest ; j++ )
  {
#ifndef TRIANGLE_IS_IDENTITY
  for ( i = 0 ; i < (int)(bsize*nmatd) ; i++ ) dc[i]=dd[i];
#endif

#ifdef MKL_TIMER
  tmptimer = dsecnd_();
#else
  l_start = libxsmm_timer_tick();
#endif

#ifdef USE_MKL_FOR_REFERENCE
  mkl_dtrsm_compact ( CLAYOUT, SIDE, UPLO, TRANSA, DIAG, m, n, dalpha, da, lda, dc, ldb, CMP_FORMAT, nmatd );
#elif !defined(LIBXSMM_NOFORTRAN)
  if ( (layout == 101) && (nmatd!=VLEND) )
  {
     unsigned int lay = 102, m1 = n, n1 = m;
     char side1='L', uplo1='L';
     if ( side == 'L' || side == 'l' ) side1 = 'R';
     if ( uplo == 'L' || uplo == 'l' ) uplo1 = 'U';
     compact_dtrsm_ ( &lay, &side1, &uplo1, &trans, &diag, &m1, &n1, &dalpha, da, &lda, dc, &ldb, &nmatd, &VLEND );
  } else {
     compact_dtrsm_ ( &layout, &side, &uplo, &trans, &diag, &m, &n, &dalpha, da, &lda, dc, &ldb, &nmatd, &VLEND );
  }
#endif

#ifdef MKL_TIMER
  timer2 += dsecnd_() - tmptimer;
#else
  l_end = libxsmm_timer_tick();
  timer2 += libxsmm_timer_duration(l_start,l_end);
#endif

  }
  timer2 /= ((double)ntest);
  printf("Reference time=%g Reference Gflops=%g\n",timer2,(op_count*nmatd)/(timer2*1.0e9));

  /* Compute the residual between B and C */
  dtmp = residual_d ( dc, bsize, bsize, nmatd, db, bsize, &nerrs, &ncorr );
  printf("R8 %c%c%c%c m=%u n=%u lda=%u ldb=%u error: %g number of errors: %u corrects: %u",side,uplo,trans,diag,m,n,lda,ldb,dtmp,nerrs,ncorr);
  if ( nerrs > 0 ) printf(" ->FAILED at %ux%u real*8 %u case",m,n,layout);
  printf("\n");

#ifdef TEST_SINGLE
  /* Call some reference code now on a copy of the B matrix (C) */
  compact_strsm_ ( &layout, &side, &uplo, &trans, &diag, &m, &n, &salpha, sa, &lda, sc, &ldb, &nmats, &VLENS );
  /* Compute the residual between B and C */
  dtmp = residual_s ( sc, bsize, bsize, nmats, sb, bsize, &nerrs, &ncorr );
  printf("R4 %c%c%c%c m=%u n=%u lda=%u ldb=%u error: %g number of errors: %u corrects: %u\n",side,uplo,trans,diag,m,n,lda,ldb,dtmp,nerrs,ncorr);
  if ( nerrs > 0 ) printf(" ->FAILED at %ux%u real*4 case",m,n);
  printf("\n");
#endif

#else
  for ( j = 0, nerrs = 0 ; j < bsize*nmatd ; j++ )
  {
     if ( isnan(db[j]) || isinf(db[j]) )
     {
        if ( ++nerrs < 10 )
        {
           printf("WARNING: db[%d]=%g\n",j,db[j]);
        }
     }
  }
  printf("%g,real*8 %c%c%c%c m=%u n=%u lda=%u ldb=%u Denormals=%u Time=%g Gflops=%g",(op_count*nmatd)/(timer*1.0e9),side,uplo,trans,diag,m,n,lda,ldb,nerrs,timer,(op_count*nmatd)/(timer*1.0e9));
  if ( nerrs > 0 ) printf(" -> FAILED at %ux%u real*8 case",m,n);
  printf("\n");
#endif

  free(dd);
  free(sd);
  free(dc);
  free(sc);
  free(db);
  free(sb);
  free(da);
  free(sa);

  return 0;
}
Ejemplo n.º 13
0
Archivo: rng.c Proyecto: hfp/libxsmm
int main(int argc, char* argv[])
{
  double rng_stddev = 0;
  float* rngs;
  float  vrng[16];
  libxsmm_matdiff_info info;
  libxsmm_blasint num_rngs;
  libxsmm_blasint i;
  unsigned long long start;

  if (2 < argc) {
    fprintf(stderr, "Usage:\n  %s number_rngs\n", argv[0]);
    return EXIT_SUCCESS;
  }

  /* parse the command line and set up the test parameters */
  num_rngs = (1 < argc ? atoi(argv[1]) : 1000);
  assert(num_rngs >= 1);

  rngs = (float*)malloc((size_t)(sizeof(float) * num_rngs));
  if (NULL == rngs) num_rngs = 0;

  libxsmm_rng_set_seed( (uint32_t)(time(0)));

  /* fill array with random floats */
  libxsmm_rng_f32_seq( rngs, num_rngs );

  /* some quality measure; variance is based on discovered average rather than expected value */
  if (EXIT_SUCCESS == libxsmm_matdiff(&info, LIBXSMM_DATATYPE_F32, 1/*m*/, num_rngs,
    NULL/*ref*/, rngs/*tst*/, NULL/*ldref*/, NULL/*ldtst*/))
  {
    rng_stddev = sqrt(info.var_tst);
  }

  start = libxsmm_timer_tick();
  for (i = 0; i < num_rngs; ++i) {
    libxsmm_rng_f32_seq( rngs, 1 );
  }
  printf("\nlibxsmm_rng_float:  %llu cycles per random number (scalar)\n",
    libxsmm_timer_cycles(start, libxsmm_timer_tick()) / num_rngs);

  start = libxsmm_timer_tick();
  for (i = 0; i < num_rngs; ++i) {
    libxsmm_rng_f32_seq( vrng, 16 );
  }
  printf("\nlibxsmm_rng_float:  %llu cycles per random number (vlen=16)\n",
    libxsmm_timer_cycles(start, libxsmm_timer_tick()) / ((size_t)num_rngs*16));

  /* let's compute some values of the random numbers */
  printf("\nWe have generated %lli random numbers uniformly distributed in [0,1(\n", (long long)num_rngs);
  printf("We expect the following values E=0.5, Var=0.083333, Stddev=0.288675\n\n");
  printf("minimum random number is:            %f\n", info.min_tst);
  printf("maximum random number is:            %f\n", info.max_tst);
  printf("sum of random numbers is:            %f\n", info.l1_tst);
  printf("Expected Value of random numbers is: %f\n", info.avg_tst);
  printf("Variance of random numbers is:       %f\n", info.var_tst);
  printf("StdDev of random numbers is:         %f\n\n", rng_stddev);

  free( rngs );

  return EXIT_SUCCESS;
}
Ejemplo n.º 14
0
int main(int argc, char* argv[])
{
  const char t = (char)(1 < argc ? *argv[1] : 'o');
  const libxsmm_blasint m = (2 < argc ? atoi(argv[2]) : 4096);
#if 0 /* TODO: enable when in-place transpose is fully supported */
  const libxsmm_blasint n = (3 < argc ? atoi(argv[3]) : m);
#else
  const libxsmm_blasint n = (3 < argc ? (('o' == t || 'O' == t) ? atoi(argv[3]) : m) : m);
#endif
  const libxsmm_blasint ldi = LIBXSMM_MAX/*sanitize ld*/(4 < argc ? atoi(argv[4]) : 0, m);
  const libxsmm_blasint ldo = LIBXSMM_MAX/*sanitize ld*/(5 < argc ? atoi(argv[5]) : 0, n);
  const int r = (6 < argc ? atoi(argv[6]) : 0), s = LIBXSMM_ABS(r);
  const libxsmm_blasint lower = (7 < argc ? atoi(argv[7]) : 0);
  libxsmm_blasint km = m, kn = n, kldi = ldi, kldo = (('o' == t || 'O' == t) ? ldo : ldi);
  int result = EXIT_SUCCESS, k;

  if (0 == strchr("oOiI", t)) {
    fprintf(stderr, "%s [<transpose-kind:o|i>] [<m>] [<n>] [<ld-in>] [<ld-out>] [random:0|nruns] [lbound]\n", argv[0]);
    exit(EXIT_FAILURE);
  }

#if defined(LIBXSMM_OFFLOAD_TARGET)
# pragma offload target(LIBXSMM_OFFLOAD_TARGET)
#endif
  {
    const char *const env_tasks = getenv("TASKS"), *const env_check = getenv("CHECK");
    const int tasks = (0 == env_tasks || 0 == *env_tasks) ? 0/*default*/ : atoi(env_tasks);
    const int check = (0 == env_check || 0 == *env_check) ? 1/*default*/ : atoi(env_check);
    ELEM_TYPE *const a = (ELEM_TYPE*)libxsmm_malloc((size_t)(ldi * (('o' == t || 'O' == t) ? n : ldo) * sizeof(ELEM_TYPE)));
    ELEM_TYPE *const b = (ELEM_TYPE*)libxsmm_malloc((size_t)(ldo * (('o' == t || 'O' == t) ? m : ldi) * sizeof(ELEM_TYPE)));
    libxsmm_timer_tickint start, duration = 0;
#if defined(USE_REFERENCE) /* benchmark against a reference */
    libxsmm_timer_tickint duration2 = 0;
#endif
    libxsmm_blasint i;
    size_t size = 0;
#if defined(MKL_ENABLE_AVX512)
    mkl_enable_instructions(MKL_ENABLE_AVX512);
#endif
    fprintf(stdout, "m=%lli n=%lli ldi=%lli ldo=%lli size=%.fMB (%s, %s)\n",
      (long long)m, (long long)n, (long long)ldi, (long long)ldo,
      1.0 * (m * n * sizeof(ELEM_TYPE)) / (1 << 20), LIBXSMM_STRINGIFY(ELEM_TYPE),
      ('o' == t || 'O' == t) ? "out-of-place" : "in-place");

#if defined(_OPENMP)
#   pragma omp parallel for private(i)
#endif
    for (i = 0; i < n; ++i) {
      libxsmm_blasint j;
      for (j = 0; j < m; ++j) {
        a[i*ldi+j] = initial_value(i, j, m);
      }
    }

    if (0 != check) { /* repeatable (reference) */
      srand(RAND_SEED);
    }
    else { /* randomized selection */
      srand(libxsmm_timer_tick() % ((unsigned int)-1));
    }
    for (k = (0 == r ? -1 : 0); k < s && EXIT_SUCCESS == result; ++k) {
      if (0 < r) {
        const libxsmm_blasint rldi = 0 <= lower ? randstart(lower, ldi) : 0;
        km = randstart(LIBXSMM_ABS(lower), m);
        kldi = LIBXSMM_MAX(rldi, km);
        if (('o' == t || 'O' == t)) {
          const libxsmm_blasint rldo = 0 <= lower ? randstart(lower, ldo) : 0;
          kn = randstart(LIBXSMM_ABS(lower), n);
          kldo = LIBXSMM_MAX(rldo, kn);
          /* trigger JIT-generated code */
          OTRANS(b, a, sizeof(ELEM_TYPE), km, kn, kldi, kldo);
        }
        else {
#if 0 /* TODO: enable when in-place transpose is fully supported */
          kn = randstart(LIBXSMM_ABS(lower), n);
#else
          kn = km;
#endif
          kldo = kldi;
          /* trigger JIT-generated code */
          ITRANS(b, sizeof(ELEM_TYPE), km, kn, kldi);
        }
      }
      size += (size_t)(km * kn * sizeof(ELEM_TYPE));

      if (('o' == t || 'O' == t)) {
        if (0 == tasks) { /* library-internal parallelization */
          start = libxsmm_timer_tick();
#if defined(OTRANS_THREAD)
#         pragma omp parallel
          OTRANS_THREAD(b, a, sizeof(ELEM_TYPE), km, kn, kldi, kldo, omp_get_thread_num(), omp_get_num_threads());
#else
          result = OTRANS(b, a, sizeof(ELEM_TYPE), km, kn, kldi, kldo);
#endif
          duration += libxsmm_timer_diff(start, libxsmm_timer_tick());
        }
        else { /* external parallelization */
          start = libxsmm_timer_tick();
#if defined(_OPENMP)
#         pragma omp parallel
#         pragma omp single nowait
#endif
          result = OTRANS(b, a, sizeof(ELEM_TYPE), km, kn, kldi, kldo);
          duration += libxsmm_timer_diff(start, libxsmm_timer_tick());
        }
      }
      else {
        assert(('i' == t || 'I' == t) && kldo == kldi);
        memcpy(b, a, (size_t)(kldi * kn * sizeof(ELEM_TYPE)));

        if (2 > tasks) { /* library-internal parallelization */
          start = libxsmm_timer_tick();
          result = ITRANS(b, sizeof(ELEM_TYPE), km, kn, kldi);
          duration += libxsmm_timer_diff(start, libxsmm_timer_tick());
        }
        else { /* external parallelization */
          start = libxsmm_timer_tick();
#if defined(_OPENMP)
#         pragma omp parallel
#         pragma omp single
#endif
          result = ITRANS(b, sizeof(ELEM_TYPE), km, kn, kldi);
          duration += libxsmm_timer_diff(start, libxsmm_timer_tick());
        }
      }
      if (0 != check) { /* check */
        for (i = 0; i < km; ++i) {
          libxsmm_blasint j;
          for (j = 0; j < kn; ++j) {
            const ELEM_TYPE u = b[i*kldo+j];
            const ELEM_TYPE v = a[j*kldi+i];
            if (LIBXSMM_NEQ(u, v)) {
              i += km; /* leave outer loop as well */
              result = EXIT_FAILURE;
              break;
            }
          }
        }
      }
    }

#if defined(USE_REFERENCE)
    if (0 < check) { /* check shall imply reference (performance-)test */
      srand(RAND_SEED); /* reproduce the same sequence as above */
      for (k = (0 == r ? -1 : 0); k < s && EXIT_SUCCESS == result; ++k) {
        if (0 < r) {
          const libxsmm_blasint rldi = 0 <= lower ? randstart(lower, ldi) : 0;
          km = randstart(LIBXSMM_ABS(lower), m);
          kldi = LIBXSMM_MAX(rldi, km);
          if (('o' == t || 'O' == t)) {
            const libxsmm_blasint rldo = 0 <= lower ? randstart(lower, ldo) : 0;
            kn = randstart(LIBXSMM_ABS(lower), n);
            kldo = LIBXSMM_MAX(rldo, kn);
          }
          else {
#if 0 /* TODO: enable when in-place transpose is fully supported */
            kn = randstart(LIBXSMM_ABS(lower), n);
#else
            kn = km;
#endif
            kldo = kldi;
          }
        }

        if (('o' == t || 'O' == t)) {
          start = libxsmm_timer_tick();
          OTRANS_GOLD(&km, &kn, a, &kldi, b, &kldo);
          duration2 += libxsmm_timer_diff(start, libxsmm_timer_tick());
        }
        else {
          assert(('i' == t || 'I' == t) && kldo == kldi);
          memcpy(b, a, (size_t)(kldi * kn * sizeof(ELEM_TYPE)));
          start = libxsmm_timer_tick();
          ITRANS_GOLD(&km, &kn, b, &kldi, &kldo);
          duration2 += libxsmm_timer_diff(start, libxsmm_timer_tick());
        }
        if (1 < check || 0 > check) { /* check */
          for (i = 0; i < km; ++i) {
            libxsmm_blasint j;
            for (j = 0; j < kn; ++j) {
              const ELEM_TYPE u = b[i*kldo+j];
              const ELEM_TYPE v = a[j*kldi+i];
              if (LIBXSMM_NEQ(u, v)) {
                i += km; /* leave outer loop as well */
                result = EXIT_FAILURE;
                break;
              }
            }
          }
        }
      }
    }
#endif

    if (EXIT_SUCCESS == result) {
      const double d = libxsmm_timer_duration(0, duration);
      if (0 < duration) {
        /* out-of-place transpose bandwidth assumes RFO */
        fprintf(stdout, "\tbandwidth: %.1f GB/s\n", size
          * ((('o' == t || 'O' == t)) ? 3 : 2) / (d * (1 << 30)));
      }
      if (0 == lower) {
        fprintf(stdout, "\tduration: %.0f ms\n", 1000.0 * (d / (0 == r ? (s + 1) : s)));
      }
      else {
        fprintf(stdout, "\tduration: %f ms\n", 1000.0 * d);
      }
#if defined(USE_REFERENCE)
      if (0 < duration2) {
        fprintf(stdout, "\treference: %.1fx\n", (1.0 * duration) / duration2);
      }
#endif
    }
    else if (0 != check) { /* check */
      fprintf(stderr, "Error: validation failed for m=%lli, n=%lli, ldi=%lli, and ldo=%lli!\n",
        (long long)km, (long long)kn, (long long)kldi, (long long)kldo);
    }

    libxsmm_free(a);
    libxsmm_free(b);
  }
  return result;
}
Ejemplo n.º 15
0
Archivo: gemm.c Proyecto: hfp/libxsmm
int main(int argc, char* argv[])
{
  unsigned int m=8, n=8, k=8, lda=8, ldb=8, ldc=8, nerrs, num, nmat;
  unsigned int layout, asize, bsize, ntest, ncorr;
#ifdef AVX512_TESTING
  unsigned int VLEND=8, VLENS=16;
  int arch=LIBXSMM_X86_AVX512_CORE;
#else
  unsigned int VLEND=4, VLENS=8;
  int arch=LIBXSMM_X86_AVX2;
#endif
  unsigned int nmats, nmatd;
  unsigned int i, j, l, iunroll, junroll, loopi, loopj;
  char side='L', uplo='U', transa='N', transb='N', diag='N';
  unsigned int typesize8 = 8;
  unsigned int typesize4 = 4;
  float  *sa, *sb, *sc, *sd, *sc1;
  double *da, *db, *dc, *dd, *dc1;
  double dalpha = 1.0;
  float  salpha = (float)dalpha;
  double dbeta = 1.0;
  float  sbeta = (float)dbeta;
  double dtmp;
  const unsigned char *cptr = NULL;
  unsigned long op_count;
  const libxsmm_pgemm_descriptor* desc8 = NULL;
  const libxsmm_pgemm_descriptor* desc4 = NULL;
#ifdef USE_XSMM_GENERATED
  libxsmm_descriptor_blob blob;
  libxsmm_pgemm_xfunction mykernel = NULL;
#endif
#if defined(USE_KERNEL_GENERATION_DIRECTLY) && defined(__linux__)
  void (*opcode_routine)();
  unsigned char *routine_output;
  libxsmm_generated_code io_generated_code;
  int pagesize = sysconf(_SC_PAGE_SIZE);
  if (pagesize == -1) fprintf(stderr,"sysconf pagesize\n");
  routine_output = (unsigned char *) mmap(NULL,
                      BUFSIZE2, PROT_READ|PROT_WRITE,
                      MAP_PRIVATE|MAP_ANONYMOUS, 0,0);
  if (mprotect(routine_output, BUFSIZE2,
                PROT_EXEC | PROT_READ | PROT_WRITE ) == -1)
      fprintf(stderr,"mprotect\n");
  printf("Routine ready\n");
  io_generated_code.generated_code = &routine_output[0];
  io_generated_code.buffer_size = BUFSIZE2;
  io_generated_code.code_size = 0;
  io_generated_code.code_type = 2;
  io_generated_code.last_error = 0;
#endif

  printf("\nUSAGE: %s m n k lda ldb ldc nmat layout ntest transa transb iunroll junroll loopj loopi\n",argv[0]);
  if ( argc <= 3 )
  {
#ifdef TEST_SINGLE
     printf("Compact SGEMM a C_mxn<-C_mxn+A_mxk*B_kxn matrix of leading dims lda/b/c\n");
     printf("This will test the jit of 1 VLEN=%d ",VLENS);
     if ( VLENS==8 ) printf("(AVX2)");
     else            printf("(AVX512)");
#else
     printf("Compact DGEMM a C_mxn<-C_mxn+A_mxk*B_kxn matrix of leading dims lda/b/c\n");
     printf("This will test the jit of 1 VLEN=%d ",VLEND);
     if ( VLEND==4 ) printf("(AVX2)");
     else            printf("(AVX512)");
#endif
     printf(" work of nmat at a time\n");
     printf("Configurable: M-loop controlled by iunroll & loopi. N-loop by junroll & loopj\n");
     printf("Defaults: m=n=k=lda=ldb=ldc=nmat=8, layout=102 (col major), transa=/b='N', ntest=1\n");
  }
  if ( argc > 1 ) m = atoi(argv[1]); else m = 8;
  if ( argc > 2 ) n = atoi(argv[2]); else n = 8;
  if ( argc > 3 ) k = atoi(argv[3]); else k = 8;
  if ( argc > 4 ) lda= atoi(argv[4]); else lda = 8;
  if ( argc > 5 ) ldb= atoi(argv[5]); else ldb = 8;
  if ( argc > 6 ) ldc= atoi(argv[6]); else ldc = 8;
  if ( argc > 7 ) nmat = atoi(argv[7]); else nmat = 8;
  if ( argc > 8 ) layout = atoi(argv[8]); else layout=102;
  if ( argc > 9 ) ntest = atoi(argv[9]); else ntest = 1;
  if ( argc > 10 ) transa = argv[10][0]; else transa = 'N';
  if ( argc > 11 ) transb = argv[11][0]; else transb = 'N';
  if ( argc > 12 ) iunroll=atoi(argv[12]); else iunroll=0;
  if ( argc > 13 ) junroll=atoi(argv[13]); else junroll=0;
  if ( argc > 14 ) loopj=atoi(argv[14]); else loopj=0;
  if ( argc > 15 ) loopi=atoi(argv[15]); else loopi=0;

  salpha = (float)dalpha;
  m = LIBXSMM_MAX(m,1);
  n = LIBXSMM_MAX(n,1);
  k = LIBXSMM_MAX(k,1);
  ntest = LIBXSMM_MAX(ntest,1);
  nmat = LIBXSMM_MAX(nmat,VLEND);
  layout = LIBXSMM_MAX(LIBXSMM_MIN(layout,102),101);

  if ( transa!='N' && transa!='n' && transa!='T' && transa!='t' ) transa='N';
  if ( transb!='N' && transb!='n' && transb!='T' && transb!='t' ) transb='N';

  lda = LIBXSMM_MAX(lda,m);
  ldb = LIBXSMM_MAX(ldb,k);
  ldc = LIBXSMM_MAX(ldc,m);

  nmats = LIBXSMM_MAX(VLENS,nmat - (nmat%VLENS));
  nmatd = LIBXSMM_MAX(VLEND,nmat - (nmat%VLEND));
#ifdef TEST_SINGLE
  nmat = nmats;
#else
  nmat = nmatd;
#endif

  op_count = (unsigned long)(nmat * 2.0 * (double)m * (double)n * (double)k);

#ifdef TEST_SINGLE
printf("This is a real*%d tester for JIT compact SGEMM %c%c kernels! (m=%u n=%u k=%u lda=%u ldb=%u ldc=%u layout=%d nmat=%d alpha=%g beta=%g iun=%d jun=%d loopi=%d loopj=%d VLEN=%d)\n",typesize4,transa,transb,m,n,k,lda,ldb,ldc,layout,nmat,dalpha,dbeta,iunroll,junroll,loopi,loopj,VLENS);
#else
printf("This is a real*%d tester for JIT compact DGEMM %c%c kernels! (m=%u n=%u k=%u lda=%u ldb=%u ldc=%u layout=%d nmat=%d alpha=%g beta=%g iun=%d jun=%d loopi=%d loopj=%d VLEN=%d)\n",typesize8,transa,transb,m,n,k,lda,ldb,ldc,layout,nmat,dalpha,dbeta,iunroll,junroll,loopi,loopj,VLEND);
#endif

#ifdef USE_XSMM_GENERATED
  printf("This code tests the LIBXSMM generated kernels\n");
#endif
#ifdef USE_PREDEFINED_ASSEMBLY
  printf("This code tests some predefined assembly kernel\n");
#endif
#if defined(USE_KERNEL_GENERATION_DIRECTLY) && defined(__linux__)
  printf("This code tests kernel generation directly\n");
#endif
#ifdef TIME_MKL
  printf("This code tests MKL compact batch directly\n");
#endif
#ifdef AVX512_TESTING
  printf("This tests AVX512 binaries\n");
#endif
#ifdef AVX2_TESTING
  printf("This tests AVX2 binaries\n");
#endif

  desc8 = libxsmm_pgemm_descriptor_init(&blob, typesize8, m, n, k, lda, ldb, ldc, &dalpha, transa, transb, layout );
#ifdef TEST_SINGLE
  desc4 = libxsmm_pgemm_descriptor_init(&blob, typesize4, m, n, k, lda, ldb, ldc, &dalpha, transa, transb, layout );
#endif

  printf("Descriptor set\n");


#ifdef USE_XSMM_GENERATED
  printf("calling libxsmm_dispatch_pgemm: typesize8=%u\n",typesize8);
  mykernel = libxsmm_dispatch_pgemm(desc8);
  printf("done calling libxsmm_dispatch_pgemm: typesize8=%u\n",typesize8);
  if ( mykernel == NULL ) printf("R8 Kernel after the create call is null\n");
#ifdef TEST_SINGLE
  mykernel = libxsmm_dispatch_pgemm(desc4);
  if ( mykernel == NULL ) printf("R4 kernel after the create call is null\n");
#endif
#endif

#if defined(USE_KERNEL_GENERATION_DIRECTLY) && defined(__linux__)
  libxsmm_generator_pgemm_kernel( &io_generated_code, desc8, arch, iunroll, junroll, loopi, loopj );
#endif

#ifndef NO_ACCURACY_CHECK
  printf("mallocing matrices\n");
#endif
  sa  = (float  *) malloc ( lda*k*nmat*sizeof(float) );
  da  = (double *) malloc ( lda*k*nmat*sizeof(double) );
  sb  = (float  *) malloc ( ldb*n*nmat*sizeof(float) );
  db  = (double *) malloc ( ldb*n*nmat*sizeof(double) );
  sc1 = (float  *) malloc ( ldc*n*nmat*sizeof(float) );
  dc1 = (double *) malloc ( ldc*n*nmat*sizeof(double) );
  sc  = (float  *) malloc ( ldc*n*nmat*sizeof(float) );
  dc  = (double *) malloc ( ldc*n*nmat*sizeof(double) );
  sd  = (float  *) malloc ( ldc*n*nmat*sizeof(float) );
  dd  = (double *) malloc ( ldc*n*nmat*sizeof(double) );

#ifndef NO_ACCURACY_CHECK
  printf("filling matrices\n");
#endif
  sfill_matrix ( sa, lda, m, k*nmat );
  sfill_matrix ( sb, ldb, k, n*nmat );
  sfill_matrix ( sc, ldc, m, n*nmat );
  dfill_matrix ( da, lda, m, k*nmat );
  dfill_matrix ( db, ldb, k, n*nmat );
  dfill_matrix ( dc, ldc, m, n*nmat );

#ifndef NO_ACCURACY_CHECK
  for ( i = 0 ; i < ldc*n*nmat ; i++ ) sd[i]=sc[i];
  for ( i = 0 ; i < ldc*n*nmat ; i++ ) dd[i]=dc[i];
  for ( i = 0 ; i < ldc*n*nmat ; i++ ) sc1[i]=sc[i];
  for ( i = 0 ; i < ldc*n*nmat ; i++ ) dc1[i]=dc[i];
  printf("Pointing at the kernel now\n");
#endif

#ifdef USE_XSMM_GENERATED
  cptr = (const unsigned char*) mykernel;
#endif
#ifdef USE_PREDEFINED_ASSEMBLY
  cptr = (const unsigned char*) gemm_;
#endif
#if defined(USE_KERNEL_GENERATION_DIRECTLY) && defined(__linux__)
  cptr = (const unsigned char*) &routine_output[0];
  opcode_routine = (void *) &cptr[0];
#endif

#ifndef TIME_MKL
# define DUMP_ASSEMBLY_FILE
#endif

#ifdef DUMP_ASSEMBLY_FILE
  printf("Dumping assembly file\n");
  FILE *fp = fopen("foo.s","w");
  char buffer[80];
  fputs("\t.text\n",fp);
  fputs("\t.align 256\n",fp);
  fputs("\t.globl gemm_\n",fp);
  fputs("gemm_:\n",fp);
  for (i = 0 ; i < 7000; i+=4 )
  {
     sprintf(buffer,".byte 0x%02x, 0x%02x, 0x%02x, 0x%02x\n",cptr[i],cptr[i+1],cptr[i+2],cptr[i+3]);
     fputs(buffer,fp);
  }
  fputs("\tretq\n",fp);
  fputs("\t.type gemm_,@function\n",fp);
  fputs("\t.size gemm_,.-gemm_\n",fp);
  fclose(fp);
#endif

#if defined(USE_MKL_FOR_REFERENCE) || defined(TIME_MKL)
# include <mkl.h>
  MKL_LAYOUT CLAYOUT = (layout == 101) ? MKL_ROW_MAJOR : MKL_COL_MAJOR;
  MKL_SIDE SIDE = (side == 'R' || side == 'r') ? MKL_RIGHT : MKL_LEFT;
  MKL_UPLO UPLO = (uplo == 'U' || uplo == 'u') ? MKL_UPPER : MKL_LOWER;
  MKL_TRANSPOSE TRANSA = (transa == 'N' || transa == 'n') ? MKL_NOTRANS : MKL_TRANS;
  MKL_TRANSPOSE TRANSB = (transb == 'N' || transb == 'n') ? MKL_NOTRANS : MKL_TRANS;
  MKL_DIAG DIAG = (diag == 'N' || diag == 'n') ? MKL_NONUNIT : MKL_UNIT;
  MKL_COMPACT_PACK CMP_FORMAT = mkl_get_format_compact();
#if 0
  MKL_COMPACT_PACK CMP_FORMAT = MKL_COMPACT_AVX;
#endif
#endif

#ifndef NO_ACCURACY_CHECK
  printf("Before routine, initial A(1,1)=%g A[256]=%g\n",da[0],da[256]);
#endif
#ifdef USE_PREDEFINED_ASSEMBLY
  double one = 1.0;
#endif
  double timer, firsttime = 0;
#ifdef MKL_TIMER
  double tmptimer;
  tmptimer = dsecnd_();
#else
  unsigned long long l_start, l_end;
#endif

  timer = 0.0;
  for ( j = 0 ; j < (int)ntest ; j++ )
  {
  for ( i = 0 ; i < ldc*n*nmat ; i++ ) dc[i]=dc1[i];
  for ( i = 0 , num = 0; i < (int)nmat ; i+= (int)VLEND, num++ )
  {
     double *Ap = &da[num*lda*k*VLEND];
     double *Bp = &db[num*ldb*n*VLEND];
     double *Cp = &dc[num*ldc*n*VLEND];
#ifdef MKL_TIMER
     tmptimer = dsecnd_();
#else
     l_start = libxsmm_timer_tick();
#endif

#if !defined(USE_XSMM_GENERATED) && !defined(USE_PREDEFINED_ASSEMBLY) && !defined(USE_KERNEL_GENERATION_DIRECTLY) && !defined(TIME_MKL) && !defined(USE_PREDEFINED_ASSEMBLY_XCT)
     gen_compact_dgemm_ ( &layout, &m, &n, &k, &dalpha, Ap, &lda, Bp, &ldb, &dbeta, Cp, &ldc, &VLEND );
#endif
#ifdef USE_XSMM_GENERATED
     mykernel ( Ap, Bp, Cp );
#endif
#ifdef USE_PREDEFINED_ASSEMBLY
     gemm_ ( Ap, Bp, Cp );
#endif
#ifdef USE_KERNEL_GENERATION_DIRECTLY
     (*opcode_routine)( Ap, Bp, Cp );
#endif
#ifdef TIME_MKL
     mkl_dgemm_compact ( CLAYOUT, TRANSA, TRANSB, m, n, k, dalpha, da, lda, db, ldb, dbeta, dc, ldc, CMP_FORMAT, nmat );
     i+=nmatd; /* Because MKL will do everything */
#endif
#ifdef MKL_TIMER
     dtmp = dsecnd_() - tmptimer;
#else
     l_end = libxsmm_timer_tick();
     dtmp = libxsmm_timer_duration(l_start,l_end);
#endif
     if ( j == 0 ) firsttime=dtmp;
     timer += dtmp;
  }
  }
  if ( ntest >= 100 ) {
      /* Skip the first timing: super necessary if using MKL */
      timer = (timer-firsttime)/((double)(ntest-1));
  } else {
      timer /= ((double)ntest);
  }

#ifndef NO_ACCURACY_CHECK
  printf("Average time to get through %u matrices: %g\n",nmat,timer);
  printf("Gflops: %g\n",(double)op_count/(timer*1.0e9));
  printf("after routine, new      C(1,1)=%g C[256]=%g\n",dc[0],dc[256]);
#endif

#ifdef TEST_SINGLE
  printf("Before r4 routine, initial C(1,1)=%g C[256]=%g\n",sc[0],sc[256]);

  for ( i = 0 , num = 0; i < nmats ; i+= VLENS, num++ )
  {
     float *Ap = &sa[num*lda*k*VLENS];
     float *Bp = &sb[num*ldb*n*VLENS];
     float *Cp = &sc[num*ldc*n*VLENS];
#ifdef USE_XSMM_GENERATED
     mykernel ( Ap, Bp, Cp );
#endif
  }
  printf("after r4 routine, new      C(1,1)=%g C]256]=%g\n",dc[0],dc[256]);
#endif

#ifndef NO_ACCURACY_CHECK
  /* Call some reference code now on a copy of the B matrix (C) */
  double timer2 = 0.0;
  for ( j = 0 ; j < (int)ntest ; j++ )
  {
  for ( i = 0 ; i < ldc*n*nmat ; i++ ) dd[i]=dc1[i];
#ifdef MKL_TIMER
  tmptimer = dsecnd_();
#else
  l_start = libxsmm_timer_tick();
#endif

#ifndef USE_MKL_FOR_REFERENCE
  compact_dgemm_ ( &layout, &transa, &transb, &m, &n, &k, &dalpha, da, &lda, db, &ldb, &dbeta, dd, &ldc, &nmat, &VLEND );
#else
  mkl_dgemm_compact ( CLAYOUT, TRANSA, TRANSB, m, n, k, dalpha, da, lda, db, ldb, dbeta, dd, ldc, CMP_FORMAT, nmat );
#endif

#ifdef MKL_TIMER
  timer2 += dsecnd_() - tmptimer;
#else
  l_end = libxsmm_timer_tick();
  timer2 += libxsmm_timer_duration(l_start,l_end);
#endif

  }
  timer2 /= ((double)ntest);
  printf("Reference time=%g Reference Gflops=%g\n",timer2,op_count/(timer2*1.0e9));

  /* Compute the residual between B and C */
  dtmp = residual_d ( dc, ldc, m, n*nmat, dd, ldc, &nerrs, &ncorr );
  printf("R8 mnk=%u %u %u ldabc=%u %u %u error: %g number of errors: %u corrects: %u",m,n,k,lda,ldb,ldc,dtmp,nerrs,ncorr);
  if ( nerrs > 0 ) printf(" ->FAILED at %ux%u real*8 %u case",m,n,layout);
  printf("\n");

#ifdef TEST_SINGLE
  /* Call some reference code now on a copy of the B matrix (C) */
  compact_dgemm_ ( &layout, &transa, &transb, &m, &n, &k, &salpha, sa, &lda, sb, &ldb, &sbeta, sd, &ldc, &nmat, &VLENS );
  /* Compute the residual between C and D */
  dtmp = residual_s ( sc, ldc, m, n*nmat, sd, ldc, &nerrs, &ncorr );
  printf("R4 mnk=%u %u %u ldabc=%u %u %u error: %g number of errors: %u corrects: %u",m,n,k,lda,ldb,ldc,dtmp,nerrs,ncorr);
  if ( nerrs > 0 ) printf(" ->FAILED at %ux%u real*4 case",m,n);
  printf("\n");
#endif

#else
  for ( j = 0, nerrs = 0 ; j < lda*n*nmat; j++ )
  {
     if ( isnan(dc[j]) || isinf(dc[j]) )
     {
        if ( ++nerrs < 10 )
        {
           printf("WARNING: dc[%d]=%g\n",j,dc[j]);
        }
     }
  }
  printf("%g,real*8 m/n/k=%u %u %u lda-c=%u %u %u Denormals=%u Time=%g Gflops=%g",op_count/(timer*1.0e9),m,n,k,lda,ldb,ldc,nerrs,timer,op_count/(timer*1.0e9));
  if ( nerrs > 0 ) printf(" -> FAILED at %ux%u real*8 case",m,n);
  printf("\n");
#endif

  free(dd);
  free(sd);
  free(dc);
  free(sc);
  free(dc1);
  free(sc1);
  free(db);
  free(sb);
  free(da);
  free(sa);

  return 0;
}