Exemple #1
0
int main(int argc, char* argv[]) {
  int M = ( argc == 7 ) ? atoi(argv[1]) : 9;
  int N = ( argc == 7 ) ? atoi(argv[2]) : 10;
  int K = ( argc == 7 ) ? atoi(argv[3]) : 9;
  unsigned int N_CRUNS = ( argc == 7 ) ? atoi(argv[4]) : 8;
  unsigned int REPS =    ( argc == 7 ) ? atoi(argv[5]) : 1;
  char* l_csr_file =     ( argc == 7 ) ?      argv[6]  : "file.csr";

  const libxsmm_gemm_prefetch_type prefetch = LIBXSMM_GEMM_PREFETCH_NONE;
  const int flags = LIBXSMM_GEMM_FLAGS('N', 'N');
  const REALTYPE alpha = 1, beta = 1;

  REALTYPE* l_a_de = (REALTYPE*)libxsmm_aligned_malloc(K * K * sizeof(REALTYPE), 64);
  REALTYPE* l_a_sp = NULL;
  REALTYPE* l_b = (REALTYPE*)libxsmm_aligned_malloc(K * N * N_CRUNS* sizeof(REALTYPE), 64);
  unsigned int* l_rowptr = NULL;
  unsigned int* l_colidx = NULL;
  unsigned int l_rowcount, l_colcount, l_elements;
  REALTYPE* l_c = (REALTYPE*)libxsmm_aligned_malloc(K * N * N_CRUNS * sizeof(REALTYPE), 64);
  REALTYPE* l_c_gold = (REALTYPE*)libxsmm_aligned_malloc(K * N * N_CRUNS * sizeof(REALTYPE), 64);
  REALTYPE* l_c_asm = (REALTYPE*)libxsmm_aligned_malloc(K * N * N_CRUNS * sizeof(REALTYPE), 64);
  REALTYPE l_max_error = 0.0;
  unsigned int l_k, l_n;
  int l_i, l_j, l_jj;

  LIBXSMM_VLA_DECL(3, REALTYPE, l_p_b, l_b, N, N_CRUNS);
  LIBXSMM_VLA_DECL(3, REALTYPE, l_p_c_asm, l_c_asm, N, N_CRUNS);
  LIBXSMM_VLA_DECL(3, REALTYPE, l_p_c_gold, l_c_gold, N, N_CRUNS);

  libxsmm_descriptor_blob l_xgemm_blob;
  const libxsmm_gemm_descriptor* l_xgemm_desc = 0;
  LIBXSMM_MMFUNCTION_TYPE(REALTYPE) mykernel = NULL;

  unsigned long long l_start, l_end;
  double l_total;

  if (argc != 7) {
    fprintf( stderr, "arguments: M #iters CSR-file!\n" );
    return -1;
  }

  /* touch B */
  for ( l_i = 0; l_i < K; l_i++) {
    for ( l_j = 0; l_j < N; l_j++) {
      for ( l_k = 0; l_k < N_CRUNS; l_k++ ) {
        LIBXSMM_VLA_ACCESS(3, l_p_b, l_i, l_j, l_k, N, N_CRUNS) = (REALTYPE)libxsmm_rand_f64();
      }
    }
  }

  /* touch C */
  for ( l_i = 0; l_i < K; l_i++) {
    for ( l_j = 0; l_j < N; l_j++) {
      for ( l_k = 0; l_k < N_CRUNS; l_k++ ) {
        LIBXSMM_VLA_ACCESS(3, l_p_c_gold, l_i, l_j, l_k, N, N_CRUNS) = (REALTYPE)0.0;
        LIBXSMM_VLA_ACCESS(3, l_p_c_asm,  l_i, l_j, l_k, N, N_CRUNS) = (REALTYPE)0.0;
      }
    }
  }

  /* read A, CSR */
  libxsmm_sparse_csr_reader(  l_csr_file,
                             &l_rowptr,
                             &l_colidx,
                             &l_a_sp,
                             &l_rowcount, &l_colcount, &l_elements );

  /* copy b to dense */
  printf("CSR matrix data structure we just read:\n");
  printf("rows: %u, columns: %u, elements: %u\n", l_rowcount, l_colcount, l_elements);

  for ( l_n = 0; l_n < (((unsigned int)K) * K); l_n++) {
    l_a_de[l_n] = 0.0;
  }

  for ( l_n = 0; l_n < (unsigned int)K; l_n++) {
    const unsigned int l_rowelems = l_rowptr[l_n+1] - l_rowptr[l_n];
    assert(l_rowptr[l_n+1] >= l_rowptr[l_n]);

    for ( l_k = 0; l_k < l_rowelems; l_k++) {
      l_a_de[(l_n * K) + l_colidx[l_rowptr[l_n] + l_k]] = l_a_sp[l_rowptr[l_n] + l_k];
    }
  }

  /* dense routine */
  l_start = libxsmm_timer_tick();
#if 1
  for ( l_n = 0; l_n < REPS; l_n++) {
    for ( l_i = 0; l_i < K; l_i++) {
      for ( l_j = 0; l_j < N; l_j++) {
        for ( l_jj = 0; l_jj < K; l_jj++) {
          LIBXSMM_PRAGMA_SIMD
          for (l_k = 0; l_k < N_CRUNS; l_k++) {
            LIBXSMM_VLA_ACCESS(3, l_p_c_gold, l_i, l_j, l_k, N, N_CRUNS)
              +=   l_a_de[(l_i*K)+l_jj]
                 * LIBXSMM_VLA_ACCESS(3, l_p_b, l_jj, l_j, l_k, N, N_CRUNS);
          }
        }
      }
    }
  }
#endif
  l_end = libxsmm_timer_tick();
  l_total = libxsmm_timer_duration(l_start, l_end);
  printf("%fs for dense\n", l_total);
  printf("%f GFLOPS for dense\n", ((double)((double)REPS * (double)K * (double)K * (double)N * (double)N_CRUNS) * 2.0) / (l_total * 1.0e9));

  l_xgemm_desc = libxsmm_gemm_descriptor_dinit(&l_xgemm_blob, LIBXSMM_GEMM_PRECISION(REALTYPE),
    K, N, K, 0, N, N, alpha, beta, flags, prefetch);

  /* sparse routine */
#if defined(__EDGE_EXECUTE_F32__)
  mykernel = libxsmm_create_xcsr_soa( l_xgemm_desc, l_rowptr, l_colidx, (const void*)l_a_sp ).smm;
#else
  mykernel = libxsmm_create_xcsr_soa( l_xgemm_desc, l_rowptr, l_colidx, (const void*)l_a_sp ).dmm;
#endif

  l_start = libxsmm_timer_tick();
  for ( l_n = 0; l_n < REPS; l_n++) {
    mykernel( l_a_sp, l_b, l_c_asm );
  }
  l_end = libxsmm_timer_tick();
  l_total = libxsmm_timer_duration(l_start, l_end);
  printf("%fs for sparse (asm)\n", l_total);
  printf("%f GFLOPS for sparse (asm)\n", ((double)((double)REPS * (double)K * (double)l_elements * (double)N_CRUNS) * 2.0) / (l_total * 1.0e9));

  /* check for errors */
  l_max_error = (REALTYPE)0.0;
  for ( l_i = 0; l_i < K; l_i++) {
    for ( l_j = 0; l_j < N; l_j++) {
      for ( l_k = 0; l_k < N_CRUNS; l_k++ ) {
        if (fabs( LIBXSMM_VLA_ACCESS(3, l_p_c_gold, l_i, l_j, l_k, N, N_CRUNS)
                    - LIBXSMM_VLA_ACCESS(3, l_p_c_asm, l_i, l_j, l_k, N, N_CRUNS) ) > l_max_error ) {
          l_max_error = (REALTYPE)fabs( LIBXSMM_VLA_ACCESS(3, l_p_c_gold, l_i, l_j, l_k, N, N_CRUNS)
                                       -LIBXSMM_VLA_ACCESS(3, l_p_c_asm, l_i, l_j, l_k, N, N_CRUNS) );
        }
      }
    }
  }
  printf("max error: %f\n", l_max_error);

  printf("PERFDUMP,%s,%u,%i,%i,%i,%u,%u,%f,%f,%f\n", l_csr_file, REPS, M, N, K, l_elements, K * l_elements * N_CRUNS * 2, l_max_error, l_total, ((double)((double)REPS * (double)K * (double)l_elements * (double)N_CRUNS) * 2.0) / (l_total * 1.0e9) );

  /* free */
  libxsmm_free( l_a_de );
  libxsmm_free( l_b );
  libxsmm_free( l_c );
  libxsmm_free( l_c_gold );
  libxsmm_free( l_c_asm );

  free( l_a_sp );
  free( l_rowptr );
  free( l_colidx );

  return 0;
}
Exemple #2
0
int main(int argc, char *argv[])
{
  REAL_TYPE *A_gold, *B_gold, *A_gold2, *B_gold2;
  float *C_gold, *C0_gold, *C, *C2;

  int M, N, K;
  REAL_TYPE alpha, beta;
  int reps;

  libxsmm_spmdm_handle handle, handle2;
  libxsmm_CSR_sparseslice *A_sparse, *A_sparse2;
  int max_threads;

  /* Step 1: Read in args */
  libxsmm_timer_tickint start, end;
  double flops, duration;
  char transA, transB, transC;
  int i, j, k;
  size_t l;

  /* Step 1: Initialize handle */
  M = 0; N = 0; K = 0; alpha = (REAL_TYPE)1.0; beta = (REAL_TYPE)0.0; reps = 0; transA = 'N'; transB = 'N';

  if (argc > 1 && !strncmp(argv[1], "-h", 3)) {
    printf("\nUsage: %s [M] [N] [K] [transA] [transB] [reps]\n\n", argv[0]);
    return EXIT_SUCCESS;
  }

  /* defaults */
  M = 2048;
  N = 2048;
  K = 2048;
  transA = 'N';
  transB = 'N';
  transC = 'N';
  reps = 100;

  /* reading new values from cli */
  i = 1;
  if (argc > i) M = atoi(argv[i++]);
  if (argc > i) N = atoi(argv[i++]);
  if (argc > i) K = atoi(argv[i++]);
  if (argc > i) { transA = argv[i][0]; i++; }
  if (argc > i) { transB = argv[i][0]; i++; }
  if (argc > i) { transC = argv[i][0]; i++; }
  if (argc > i) reps = atoi(argv[i++]);

  /* Step 2: allocate data */
  A_gold  = (REAL_TYPE*)libxsmm_aligned_malloc( M*K*sizeof(REAL_TYPE), 64 );
  B_gold  = (REAL_TYPE*)libxsmm_aligned_malloc( K*N*sizeof(REAL_TYPE), 64 );
  C_gold  = (float*)libxsmm_aligned_malloc( M*N*sizeof(float), 64 );
  C0_gold = (float*)libxsmm_aligned_malloc( M*N*sizeof(float), 64 );
  C       = (float*)libxsmm_aligned_malloc( M*N*sizeof(float), 64 );

  /* Step 3: init data */
  libxsmm_rng_set_seed(1);
  for (l = 0; l < (size_t)M * (size_t)K; ++l) {
    const double r64 = libxsmm_rng_f64();
    const float r32 = (float)r64;
#ifdef USE_BFLOAT
    const int r = *(const int*)(&r32);
    const libxsmm_bfloat16 val = (r >> 16);
#else
    const float val = r32;
#endif
    if (r64 > 0.85) A_gold[l] = val;
    else              A_gold[l] = (REAL_TYPE)0.0;
  }

  for (l = 0; l < (size_t)K * (size_t)N; ++l) {
    const double r64 = libxsmm_rng_f64();
    const float r32 = (float)r64;
#ifdef USE_BFLOAT
    const int r = *(const int*)(&r32);
    const libxsmm_bfloat16 val = (r >> 16);
#else
    const float val = r32;
#endif
    B_gold[l] = val;
  }
  for (l = 0; l < (size_t)M * (size_t)N; ++l) {
    C0_gold[l] = (float)libxsmm_rng_f64();
    C_gold[l] = C0_gold[l];
  }
  for (l = 0; l < (size_t)M * (size_t)N; ++l) {
    C[l] = (float)C0_gold[l];
  }
  flops = (double)M * (double)N * (double)K * 2.0;

  /*----------------------------------------------------------------------------------------------------------------------*/
  /* Step 4: Initialize LIBXSMM for these sizes - allocates handle and temporary space for the sparse data structure for A */
# if defined(_OPENMP)
  max_threads = omp_get_max_threads();
# else
  max_threads = 1;
# endif

  start = libxsmm_timer_tick();
  libxsmm_spmdm_init(M, N, K, max_threads, &handle, &A_sparse);
  end = libxsmm_timer_tick();
  printf("Time for handle init = %f\n", libxsmm_timer_duration(start, end));

  printf(" running with: M=%i, N=%i, K=%i, bm=%i, bn=%i, bk=%i, mb=%i, nb=%i, kb=%i, reps=%i -- forward pass\n", M, N, K, handle.bm, handle.bn, handle.bk, handle.mb, handle.nb, handle.kb, reps );
  /* The overall function that takes in matrix inputs in dense format, does the conversion of A to sparse format and does the matrix multiply */
  /* Currently ignores alpha */
  /* TODO: fix alpha input */
# ifdef USE_BFLOAT
  spmdm_exec_bfloat16(&handle, transA, transB, &alpha, A_gold, B_gold, transC, &beta, C, A_sparse);
# else
  spmdm_exec_fp32(&handle, transA, transB, &alpha, A_gold, B_gold, transC, &beta, C, A_sparse);
# endif

  /* Checks */

  /* Compute a "gold" answer sequentially */
#if defined(_OPENMP)
  LIBXSMM_OMP_VAR(k);
# pragma omp parallel for private(i, j, k) LIBXSMM_OPENMP_COLLAPSE(2)
#endif
  for (i = 0; i < M; ++i) {
    for (j = 0; j < N; ++j) {
      float sum = 0.0;
      float Cval;
      for (k = 0; k < K; ++k) {
#       ifdef USE_BFLOAT
        libxsmm_bfloat16 Atmp = A_gold[i*K+k];
        int Atmp_int  = Atmp; Atmp_int <<= 16;
        float Aval = *(float *)&Atmp_int;
        libxsmm_bfloat16 Btmp = B_gold[k*N+j];
        int Btmp_int  = Btmp; Btmp_int <<= 16;
        float Bval = *(float *)&Btmp_int;
#       else
        float Aval = A_gold[i*K + k];
        float Bval = B_gold[k*N + j];
#       endif
        sum += Aval * Bval;
      }
      Cval = sum;
      C_gold[i*N + j] = Cval + beta*C_gold[i*N + j];
    }
  }
  /* LIBXSMM_FSYMBOL(sgemm)(&trans, &trans, &N, &M, &K, &alpha, B_gold, &N, A_gold, &K, &beta, C_gold, &N); */

  /* Compute the max difference between gold and computed results. */
  spmdm_check_c( &handle, C, C_gold );

  /* Timing loop starts */
  start = libxsmm_timer_tick();
  for (i = 0; i < reps; ++i) {
#   ifdef USE_BFLOAT
    spmdm_exec_bfloat16( &handle, transA, transB, &alpha, A_gold, B_gold, transC, &beta, C, A_sparse);
#   else
    spmdm_exec_fp32( &handle, transA, transB, &alpha, A_gold, B_gold, transC, &beta, C, A_sparse);
#   endif
  }
  end = libxsmm_timer_tick();
  duration = libxsmm_timer_duration(start, end);
  printf("Time = %f Time/rep = %f, TFlops/s = %f\n", duration, duration*1.0/reps, flops/1000./1000./1000./1000./duration*reps);
  libxsmm_spmdm_destroy(&handle);

  /*----------------------------------------------------------------------------------------------------------------------*/
  /* Step 5: Initialize libxsmm for transpose A - allocates handle and temporary space for the sparse data structure for A */
  transA = 'T'; transB = 'N'; transC = 'T';
  libxsmm_spmdm_init(M, N, K, max_threads, &handle2, &A_sparse2);
  printf(" running with: M=%i, N=%i, K=%i, bm=%i, bn=%i, bk=%i, mb=%i, nb=%i, kb=%i, reps=%i, transA = Y, transC = Y -- weight update\n", handle2.m, handle2.n, handle2.k, handle2.bm, handle2.bn, handle2.bk, handle2.mb, handle2.nb, handle2.kb, reps );
  A_gold2 = (REAL_TYPE*)libxsmm_aligned_malloc( M*K*sizeof(REAL_TYPE), 64 );
  C2 = (float*)libxsmm_aligned_malloc( M*N*sizeof(float), 64 );

  for (i = 0; i < M; ++i) {
    for (j = 0; j < K; ++j) {
      A_gold2[j*M + i] = A_gold[i*K + j];
    }
  }
  for (i = 0; i < M; ++i) {
    for (j = 0; j < N; ++j) {
      C[j*M + i] = (float)C0_gold[i*N + j];
    }
  }
  /* The overall function that takes in matrix inputs in dense format, does the conversion of A to sparse format and does the matrix multiply */
  /* Currently ignores alpha */
  /* TODO: fix alpha inputs */
# ifdef USE_BFLOAT
  spmdm_exec_bfloat16( &handle2, transA, transB, &alpha, A_gold2, B_gold, transC, &beta, C, A_sparse2);
# else
  spmdm_exec_fp32( &handle2, transA, transB, &alpha, A_gold2, B_gold, transC, &beta, C, A_sparse2);
# endif

  for (i = 0; i < M; ++i) {
    for (j = 0; j < N; ++j) {
      C2[i*N + j] = C[j*M + i];
    }
  }
  /* Checks */
  spmdm_check_c( &handle2, C2, C_gold);

  /* Timing loop starts */
  start = libxsmm_timer_tick();
  for (i = 0; i < reps; ++i) {
#   ifdef USE_BFLOAT
    spmdm_exec_bfloat16( &handle2, transA, transB, &alpha, A_gold2, B_gold, transC, &beta, C, A_sparse2);
#   else
    spmdm_exec_fp32( &handle2, transA, transB, &alpha, A_gold2, B_gold, transC, &beta, C, A_sparse2);
#   endif
  }
  end = libxsmm_timer_tick();
  duration = libxsmm_timer_duration(start, end);
  printf("Time = %f Time/rep = %f, TFlops/s = %f\n", duration, duration*1.0/reps, flops/1000./1000./1000./1000./duration*reps);

  /*----------------------------------------------------------------------------------------------------------------------*/
  /* Step 6: Test transpose B */
  transA = 'N'; transB = 'T'; transC = 'N';
  printf(" running with: M=%i, N=%i, K=%i, bm=%i, bn=%i, bk=%i, mb=%i, nb=%i, kb=%i, reps=%i, transB = Y -- backprop\n", handle2.m, handle2.n, handle2.k, handle2.bm, handle2.bn, handle2.bk, handle2.mb, handle2.nb, handle2.kb, reps );
  B_gold2 = (REAL_TYPE*)libxsmm_aligned_malloc( K*N*sizeof(REAL_TYPE), 64 );

  for (i = 0; i < K; ++i) {
    for (j = 0; j < N; ++j) {
      B_gold2[j*K + i] = B_gold[i*N + j];
    }
  }
  for (l = 0; l < (size_t)M * (size_t)N; ++l) {
    C[l] = (float)C0_gold[l];
  }
  /* The overall function that takes in matrix inputs in dense format, does the conversion of A to sparse format and does the matrix multiply */
  /* Currently ignores alpha */
  /* TODO: fix alpha inputs */
# ifdef USE_BFLOAT
  spmdm_exec_bfloat16( &handle2, transA, transB, &alpha, A_gold, B_gold2, transC, &beta, C, A_sparse2);
# else
  spmdm_exec_fp32( &handle2, transA, transB, &alpha, A_gold, B_gold2, transC, &beta, C, A_sparse2);
# endif

  /* Checks */
  spmdm_check_c( &handle2, C, C_gold);

  /* Timing loop starts */
  start = libxsmm_timer_tick();
  for (i = 0; i < reps; ++i) {
#   ifdef USE_BFLOAT
    spmdm_exec_bfloat16( &handle2, transA, transB, &alpha, A_gold, B_gold2, transC, &beta, C, A_sparse2);
#   else
    spmdm_exec_fp32( &handle2, transA, transB, &alpha, A_gold, B_gold2, transC, &beta, C, A_sparse2);
#   endif
  }
  end = libxsmm_timer_tick();
  duration = libxsmm_timer_duration(start, end);
  printf("Time = %f Time/rep = %f, TFlops/s = %f\n", duration, duration*1.0/reps, flops/1000./1000./1000./1000./duration*reps);
  libxsmm_spmdm_destroy(&handle2);

  libxsmm_free(A_gold);
  libxsmm_free(B_gold);
  libxsmm_free(C_gold);
  libxsmm_free(C);
  libxsmm_free(C2);
  libxsmm_free(C0_gold);
  libxsmm_free(B_gold2);
  libxsmm_free(A_gold2);

  return EXIT_SUCCESS;
}