示例#1
0
void libxsmm_finalize(void)
{
  internal_code_type* registry = LIBXSMM_ATOMIC_LOAD(&internal_registry, LIBXSMM_ATOMIC_SEQ_CST);

  if (0 != registry) {
    int i;
#if !defined(LIBXSMM_OPENMP)
# if !defined(LIBXSMM_NOSYNC)
    /* acquire locks and thereby shortcut lazy initialization later on */
    for (i = 0; i < INTERNAL_REGLOCK_COUNT; ++i) LIBXSMM_LOCK_ACQUIRE(internal_reglock + i);
# endif
#else
#   pragma omp critical(internal_reglock)
#endif
    {
      registry = internal_registry;

      if (0 != registry) {
        internal_regkey_type *const registry_keys = internal_registry_keys;
        const char *const target_arch = internal_get_target_arch(internal_target_archid);
        unsigned int heapmem = (LIBXSMM_REGSIZE) * (sizeof(internal_code_type) + sizeof(internal_regkey_type));

        /* serves as an id to invalidate the thread-local cache; never decremented */
        ++internal_teardown;
#if defined(__TRACE)
        i = libxsmm_trace_finalize();
# if !defined(NDEBUG) /* library code is expected to be mute */
        if (EXIT_SUCCESS != i) {
          fprintf(stderr, "LIBXSMM: failed to finalize trace (error #%i)!\n", i);
        }
# endif
#endif
        libxsmm_gemm_finalize();
        libxsmm_gemm_diff_finalize();
        libxsmm_hash_finalize();

        /* make internal registry globally unavailable */
        LIBXSMM_ATOMIC_STORE_ZERO(&internal_registry, LIBXSMM_ATOMIC_SEQ_CST);
        internal_registry_keys = 0;

        for (i = 0; i < LIBXSMM_REGSIZE; ++i) {
          internal_code_type code = registry[i];
          if (0 != code.pmm) {
            const libxsmm_gemm_descriptor *const desc = &registry_keys[i].descriptor;
            const unsigned long long kernel_size = LIBXSMM_MNK_SIZE(desc->m, desc->n, desc->k);
            const int precision = (0 == (LIBXSMM_GEMM_FLAG_F32PREC & desc->flags) ? 0 : 1);
            const unsigned int statistic_sml = internal_statistic_sml;
            int bucket = 2;
            assert((LIBXSMM_HASH_COLLISION | LIBXSMM_CODE_STATIC) != code.imm);
            if (LIBXSMM_MNK_SIZE(statistic_sml, statistic_sml, statistic_sml) >= kernel_size) {
              bucket = 0;
            }
            else {
              const unsigned int statistic_med = internal_statistic_med;
              if (LIBXSMM_MNK_SIZE(statistic_med, statistic_med, statistic_med) >= kernel_size) {
                bucket = 1;
              }
            }
            if (0 == (LIBXSMM_CODE_STATIC & code.imm)) { /* check for allocated/generated JIT-code */
              void* buffer = 0;
              size_t size = 0;
              code.imm &= ~LIBXSMM_HASH_COLLISION; /* clear collision flag */
              if (EXIT_SUCCESS == libxsmm_alloc_info(code.pmm, &size, 0/*flags*/, &buffer)) {
                libxsmm_deallocate(code.pmm);
                ++internal_statistic[precision][bucket].njit;
                heapmem += (unsigned int)(size + (((char*)code.pmm) - (char*)buffer));
              }
            }
            else {
              ++internal_statistic[precision][bucket].nsta;
            }
          }
        }
        if (0 != internal_verbose_mode) { /* print statistic on termination */
          LIBXSMM_FLOCK(stderr);
          LIBXSMM_FLOCK(stdout);
          fflush(stdout); /* synchronize with standard output */
          {
            const unsigned int linebreak = 0 == internal_print_statistic(stderr, target_arch, 1/*SP*/, 1, 0) ? 1 : 0;
            if (0 == internal_print_statistic(stderr, target_arch, 0/*DP*/, linebreak, 0) && 0 != linebreak) {
              fprintf(stderr, "LIBXSMM_TARGET=%s ", target_arch);
            }
            fprintf(stderr, "HEAP: %.f MB\n", 1.0 * heapmem / (1 << 20));
          }
          LIBXSMM_FUNLOCK(stdout);
          LIBXSMM_FUNLOCK(stderr);
        }
        libxsmm_free(registry_keys);
        libxsmm_free(registry);
      }
    }
#if !defined(LIBXSMM_OPENMP) && !defined(LIBXSMM_NOSYNC) /* release locks */
    for (i = 0; i < INTERNAL_REGLOCK_COUNT; ++i) LIBXSMM_LOCK_RELEASE(internal_reglock + i);
#endif
  }
}
示例#2
0
int main(int argc, char* argv[]) {
  int M = ( argc == 7 ) ? atoi(argv[1]) : 9;
  int N = ( argc == 7 ) ? atoi(argv[2]) : 10;
  int K = ( argc == 7 ) ? atoi(argv[3]) : 9;
  unsigned int N_CRUNS = ( argc == 7 ) ? atoi(argv[4]) : 8;
  unsigned int REPS =    ( argc == 7 ) ? atoi(argv[5]) : 1;
  char* l_csr_file =     ( argc == 7 ) ?      argv[6]  : "file.csr";

  const libxsmm_gemm_prefetch_type prefetch = LIBXSMM_GEMM_PREFETCH_NONE;
  const int flags = LIBXSMM_GEMM_FLAGS('N', 'N');
  const REALTYPE alpha = 1, beta = 1;

  REALTYPE* l_a_de = (REALTYPE*)libxsmm_aligned_malloc(K * K * sizeof(REALTYPE), 64);
  REALTYPE* l_a_sp = NULL;
  REALTYPE* l_b = (REALTYPE*)libxsmm_aligned_malloc(K * N * N_CRUNS* sizeof(REALTYPE), 64);
  unsigned int* l_rowptr = NULL;
  unsigned int* l_colidx = NULL;
  unsigned int l_rowcount, l_colcount, l_elements;
  REALTYPE* l_c = (REALTYPE*)libxsmm_aligned_malloc(K * N * N_CRUNS * sizeof(REALTYPE), 64);
  REALTYPE* l_c_gold = (REALTYPE*)libxsmm_aligned_malloc(K * N * N_CRUNS * sizeof(REALTYPE), 64);
  REALTYPE* l_c_asm = (REALTYPE*)libxsmm_aligned_malloc(K * N * N_CRUNS * sizeof(REALTYPE), 64);
  REALTYPE l_max_error = 0.0;
  unsigned int l_k, l_n;
  int l_i, l_j, l_jj;

  LIBXSMM_VLA_DECL(3, REALTYPE, l_p_b, l_b, N, N_CRUNS);
  LIBXSMM_VLA_DECL(3, REALTYPE, l_p_c_asm, l_c_asm, N, N_CRUNS);
  LIBXSMM_VLA_DECL(3, REALTYPE, l_p_c_gold, l_c_gold, N, N_CRUNS);

  libxsmm_descriptor_blob l_xgemm_blob;
  const libxsmm_gemm_descriptor* l_xgemm_desc = 0;
  LIBXSMM_MMFUNCTION_TYPE(REALTYPE) mykernel = NULL;

  unsigned long long l_start, l_end;
  double l_total;

  if (argc != 7) {
    fprintf( stderr, "arguments: M #iters CSR-file!\n" );
    return -1;
  }

  /* touch B */
  for ( l_i = 0; l_i < K; l_i++) {
    for ( l_j = 0; l_j < N; l_j++) {
      for ( l_k = 0; l_k < N_CRUNS; l_k++ ) {
        LIBXSMM_VLA_ACCESS(3, l_p_b, l_i, l_j, l_k, N, N_CRUNS) = (REALTYPE)libxsmm_rand_f64();
      }
    }
  }

  /* touch C */
  for ( l_i = 0; l_i < K; l_i++) {
    for ( l_j = 0; l_j < N; l_j++) {
      for ( l_k = 0; l_k < N_CRUNS; l_k++ ) {
        LIBXSMM_VLA_ACCESS(3, l_p_c_gold, l_i, l_j, l_k, N, N_CRUNS) = (REALTYPE)0.0;
        LIBXSMM_VLA_ACCESS(3, l_p_c_asm,  l_i, l_j, l_k, N, N_CRUNS) = (REALTYPE)0.0;
      }
    }
  }

  /* read A, CSR */
  libxsmm_sparse_csr_reader(  l_csr_file,
                             &l_rowptr,
                             &l_colidx,
                             &l_a_sp,
                             &l_rowcount, &l_colcount, &l_elements );

  /* copy b to dense */
  printf("CSR matrix data structure we just read:\n");
  printf("rows: %u, columns: %u, elements: %u\n", l_rowcount, l_colcount, l_elements);

  for ( l_n = 0; l_n < (((unsigned int)K) * K); l_n++) {
    l_a_de[l_n] = 0.0;
  }

  for ( l_n = 0; l_n < (unsigned int)K; l_n++) {
    const unsigned int l_rowelems = l_rowptr[l_n+1] - l_rowptr[l_n];
    assert(l_rowptr[l_n+1] >= l_rowptr[l_n]);

    for ( l_k = 0; l_k < l_rowelems; l_k++) {
      l_a_de[(l_n * K) + l_colidx[l_rowptr[l_n] + l_k]] = l_a_sp[l_rowptr[l_n] + l_k];
    }
  }

  /* dense routine */
  l_start = libxsmm_timer_tick();
#if 1
  for ( l_n = 0; l_n < REPS; l_n++) {
    for ( l_i = 0; l_i < K; l_i++) {
      for ( l_j = 0; l_j < N; l_j++) {
        for ( l_jj = 0; l_jj < K; l_jj++) {
          LIBXSMM_PRAGMA_SIMD
          for (l_k = 0; l_k < N_CRUNS; l_k++) {
            LIBXSMM_VLA_ACCESS(3, l_p_c_gold, l_i, l_j, l_k, N, N_CRUNS)
              +=   l_a_de[(l_i*K)+l_jj]
                 * LIBXSMM_VLA_ACCESS(3, l_p_b, l_jj, l_j, l_k, N, N_CRUNS);
          }
        }
      }
    }
  }
#endif
  l_end = libxsmm_timer_tick();
  l_total = libxsmm_timer_duration(l_start, l_end);
  printf("%fs for dense\n", l_total);
  printf("%f GFLOPS for dense\n", ((double)((double)REPS * (double)K * (double)K * (double)N * (double)N_CRUNS) * 2.0) / (l_total * 1.0e9));

  l_xgemm_desc = libxsmm_gemm_descriptor_dinit(&l_xgemm_blob, LIBXSMM_GEMM_PRECISION(REALTYPE),
    K, N, K, 0, N, N, alpha, beta, flags, prefetch);

  /* sparse routine */
#if defined(__EDGE_EXECUTE_F32__)
  mykernel = libxsmm_create_xcsr_soa( l_xgemm_desc, l_rowptr, l_colidx, (const void*)l_a_sp ).smm;
#else
  mykernel = libxsmm_create_xcsr_soa( l_xgemm_desc, l_rowptr, l_colidx, (const void*)l_a_sp ).dmm;
#endif

  l_start = libxsmm_timer_tick();
  for ( l_n = 0; l_n < REPS; l_n++) {
    mykernel( l_a_sp, l_b, l_c_asm );
  }
  l_end = libxsmm_timer_tick();
  l_total = libxsmm_timer_duration(l_start, l_end);
  printf("%fs for sparse (asm)\n", l_total);
  printf("%f GFLOPS for sparse (asm)\n", ((double)((double)REPS * (double)K * (double)l_elements * (double)N_CRUNS) * 2.0) / (l_total * 1.0e9));

  /* check for errors */
  l_max_error = (REALTYPE)0.0;
  for ( l_i = 0; l_i < K; l_i++) {
    for ( l_j = 0; l_j < N; l_j++) {
      for ( l_k = 0; l_k < N_CRUNS; l_k++ ) {
        if (fabs( LIBXSMM_VLA_ACCESS(3, l_p_c_gold, l_i, l_j, l_k, N, N_CRUNS)
                    - LIBXSMM_VLA_ACCESS(3, l_p_c_asm, l_i, l_j, l_k, N, N_CRUNS) ) > l_max_error ) {
          l_max_error = (REALTYPE)fabs( LIBXSMM_VLA_ACCESS(3, l_p_c_gold, l_i, l_j, l_k, N, N_CRUNS)
                                       -LIBXSMM_VLA_ACCESS(3, l_p_c_asm, l_i, l_j, l_k, N, N_CRUNS) );
        }
      }
    }
  }
  printf("max error: %f\n", l_max_error);

  printf("PERFDUMP,%s,%u,%i,%i,%i,%u,%u,%f,%f,%f\n", l_csr_file, REPS, M, N, K, l_elements, K * l_elements * N_CRUNS * 2, l_max_error, l_total, ((double)((double)REPS * (double)K * (double)l_elements * (double)N_CRUNS) * 2.0) / (l_total * 1.0e9) );

  /* free */
  libxsmm_free( l_a_de );
  libxsmm_free( l_b );
  libxsmm_free( l_c );
  libxsmm_free( l_c_gold );
  libxsmm_free( l_c_asm );

  free( l_a_sp );
  free( l_rowptr );
  free( l_colidx );

  return 0;
}
示例#3
0
LIBXSMM_INLINE LIBXSMM_RETARGETABLE internal_code_type* internal_init(void)
{
  /*const*/internal_code_type* result;
  int i;
#if !defined(LIBXSMM_OPENMP)
# if !defined(LIBXSMM_NOSYNC)
  static int internal_reglock_check = 1; /* setup the locks in a thread-safe fashion */
  assert(sizeof(internal_reglock) == (INTERNAL_REGLOCK_COUNT * sizeof(*internal_reglock)));
  if (1 == LIBXSMM_ATOMIC_LOAD(&internal_reglock_check, LIBXSMM_ATOMIC_SEQ_CST)) {
    LIBXSMM_ATOMIC_ADD_FETCH(&internal_reglock_check, 1, LIBXSMM_ATOMIC_SEQ_CST);
    if (2 == internal_reglock_check) {
      for (i = 0; i < INTERNAL_REGLOCK_COUNT; ++i) LIBXSMM_LOCK_INIT(internal_reglock + i);
      LIBXSMM_ATOMIC_STORE_ZERO(&internal_reglock_check, LIBXSMM_ATOMIC_SEQ_CST);
    }
  }
  while (0 != internal_reglock_check); /* wait until locks are initialized */
  for (i = 0; i < INTERNAL_REGLOCK_COUNT; ++i) LIBXSMM_LOCK_ACQUIRE(internal_reglock + i);
# endif
#else
# pragma omp critical(internal_reglock)
#endif
  {
    result = LIBXSMM_ATOMIC_LOAD(&internal_registry, LIBXSMM_ATOMIC_SEQ_CST);
    if (0 == result) {
      int init_code;
      /* set internal_target_archid */
      libxsmm_set_target_arch(getenv("LIBXSMM_TARGET"));
      { /* select prefetch strategy for JIT */
        const char *const env_prefetch = getenv("LIBXSMM_PREFETCH");
        if (0 == env_prefetch || 0 == *env_prefetch) {
#if (0 > LIBXSMM_PREFETCH) /* permitted by LIBXSMM_PREFETCH_AUTO */
          internal_prefetch = (LIBXSMM_X86_AVX512_MIC != internal_target_archid
            ? LIBXSMM_PREFETCH_NONE : LIBXSMM_PREFETCH_AL2BL2_VIA_C);
#else
          internal_prefetch = LIBXSMM_MAX(INTERNAL_PREFETCH, 0);
#endif
        }
        else { /* user input considered even if LIBXSMM_PREFETCH_AUTO is disabled */
          switch (atoi(env_prefetch)) {
            case 2:  internal_prefetch = LIBXSMM_PREFETCH_SIGONLY; break;
            case 3:  internal_prefetch = LIBXSMM_PREFETCH_BL2_VIA_C; break;
            case 4:  internal_prefetch = LIBXSMM_PREFETCH_AL2; break;
            case 5:  internal_prefetch = LIBXSMM_PREFETCH_AL2_AHEAD; break;
            case 6:  internal_prefetch = LIBXSMM_PREFETCH_AL2BL2_VIA_C; break;
            case 7:  internal_prefetch = LIBXSMM_PREFETCH_AL2BL2_VIA_C_AHEAD; break;
            case 8:  internal_prefetch = LIBXSMM_PREFETCH_AL2_JPST; break;
            case 9:  internal_prefetch = LIBXSMM_PREFETCH_AL2BL2_VIA_C_JPST; break;
            default: internal_prefetch = LIBXSMM_PREFETCH_NONE;
          }
        }
      }
      libxsmm_hash_init(internal_target_archid);
      libxsmm_gemm_diff_init(internal_target_archid);
      init_code = libxsmm_gemm_init(internal_target_archid, internal_prefetch);
#if defined(__TRACE)
      {
        int filter_threadid = 0, filter_mindepth = 1, filter_maxnsyms = 0;
        const char *const env_trace_init = getenv("LIBXSMM_TRACE");
        if (EXIT_SUCCESS == init_code && 0 != env_trace_init && 0 != *env_trace_init) {
          char buffer[32];
          if (1 == sscanf(env_trace_init, "%32[^,],", buffer)) {
            sscanf(buffer, "%i", &filter_threadid);
          }
          if (1 == sscanf(env_trace_init, "%*[^,],%32[^,],", buffer)) {
            sscanf(buffer, "%i", &filter_mindepth);
          }
          if (1 == sscanf(env_trace_init, "%*[^,],%*[^,],%32s", buffer)) {
            sscanf(buffer, "%i", &filter_maxnsyms);
          }
          else {
            filter_maxnsyms = -1; /* all */
          }
        }
        init_code = libxsmm_trace_init(filter_threadid - 1, filter_mindepth, filter_maxnsyms);
      }
#endif
      if (EXIT_SUCCESS == init_code) {
        assert(0 == internal_registry_keys && 0 == internal_registry); /* should never happen */
        result = (internal_code_type*)libxsmm_malloc(LIBXSMM_REGSIZE * sizeof(internal_code_type));
        internal_registry_keys = (internal_regkey_type*)libxsmm_malloc(LIBXSMM_REGSIZE * sizeof(internal_regkey_type));
        if (0 != result && 0 != internal_registry_keys) {
          const char *const env_verbose = getenv("LIBXSMM_VERBOSE");
          internal_statistic_mnk = (unsigned int)(pow((double)(LIBXSMM_MAX_MNK), 0.3333333333333333) + 0.5);
          internal_statistic_sml = 13; internal_statistic_med = 23;
          if (0 != env_verbose && 0 != *env_verbose) {
            internal_verbose_mode = atoi(env_verbose);
          }
#if !defined(NDEBUG)
          else {
            internal_verbose_mode = 1; /* quiet -> verbose */
          }
#endif
          for (i = 0; i < LIBXSMM_REGSIZE; ++i) result[i].pmm = 0;
          /* omit registering code if JIT is enabled and if an ISA extension is found
           * which is beyond the static code path used to compile the library
           */
#if defined(LIBXSMM_BUILD)
# if (0 != LIBXSMM_JIT) && !defined(__MIC__)
          /* check if target arch. permits execution (arch. may be overridden) */
          if (LIBXSMM_STATIC_TARGET_ARCH <= internal_target_archid &&
             (LIBXSMM_X86_AVX > internal_target_archid /* jit is not available */
              /* condition allows to avoid JIT (if static code is good enough) */
           || LIBXSMM_STATIC_TARGET_ARCH == internal_target_archid))
# endif
          { /* opening a scope for eventually declaring variables */
            /* setup the dispatch table for the statically generated code */
#           include <libxsmm_dispatch.h>
          }
#endif
          atexit(libxsmm_finalize);
          LIBXSMM_ATOMIC_STORE(&internal_registry, result, LIBXSMM_ATOMIC_SEQ_CST);
        }
        else {
#if !defined(NDEBUG) && defined(__TRACE) /* library code is expected to be mute */
          fprintf(stderr, "LIBXSMM: failed to allocate code registry!\n");
#endif
          libxsmm_free(internal_registry_keys);
          libxsmm_free(result);
        }
      }
#if !defined(NDEBUG) && defined(__TRACE) /* library code is expected to be mute */
      else {
        fprintf(stderr, "LIBXSMM: failed to initialize sub-component (error #%i)!\n", init_code);
      }
#endif
    }
  }
#if !defined(LIBXSMM_OPENMP) && !defined(LIBXSMM_NOSYNC) /* release locks */
  for (i = 0; i < INTERNAL_REGLOCK_COUNT; ++i) LIBXSMM_LOCK_RELEASE(internal_reglock + i);
#endif
  assert(result);
  return result;
}
示例#4
0
int main(int argc, char* argv[])
{
  LIBXSMM_GEMM_CONST libxsmm_blasint m = (1 < argc ? atoi(argv[1]) : 1024);
  LIBXSMM_GEMM_CONST libxsmm_blasint k = (3 < argc ? atoi(argv[3]) : m);
  LIBXSMM_GEMM_CONST libxsmm_blasint n = (2 < argc ? atoi(argv[2]) : k);
  const libxsmm_blasint bm = (4 < argc ? atoi(argv[4]) : 32);
  const libxsmm_blasint bk = (6 < argc ? atoi(argv[6]) : bm);
  const libxsmm_blasint bn = (5 < argc ? atoi(argv[5]) : bk);
  const libxsmm_bgemm_order order = (libxsmm_bgemm_order)(7 < argc ? atoi(argv[7]) : 0);
  const int nrepeat = (8 < argc ? atoi(argv[8]) : 100);
  const libxsmm_blasint b_m1 = (9 < argc ? atoi(argv[9]) : 1);
  const libxsmm_blasint b_n1  = (10 < argc ? atoi(argv[10]) : 1);
  const libxsmm_blasint b_k1 = (11 < argc ? atoi(argv[11]) : 1);
  const libxsmm_blasint b_k2 = (12 < argc ? atoi(argv[12]) : 1);
  const int ab = (13 < argc ? atoi(argv[13]) : 0);
  LIBXSMM_GEMM_CONST libxsmm_blasint lda = (14 < argc ? atoi(argv[13]) : m);
  LIBXSMM_GEMM_CONST libxsmm_blasint ldb = (15 < argc ? atoi(argv[14]) : k);
  LIBXSMM_GEMM_CONST libxsmm_blasint ldc = (16 < argc ? atoi(argv[15]) : m);
  LIBXSMM_GEMM_CONST char transa = 'N', transb = 'N'; /* no transposes */
  LIBXSMM_GEMM_CONST ITYPE alpha = 1, beta = 1;
  const int gemm_flags = LIBXSMM_GEMM_FLAGS(transa, transb);
  const double gflops = 2.0 * m * n * k * 1E-9;
  int result = EXIT_SUCCESS;
#if defined(CHECK)
  const char *const env_check = getenv("CHECK");
  const double check = LIBXSMM_ABS(0 == env_check ? 0 : atof(env_check));
#endif
  if (argc > 1 && !strncmp(argv[1], "-h", 3)) { /* check command line */
    printf("\nUsage: ./bgemm [M] [N] [K] [bm] [bn] [bk] [order] [reps] [b_m1] [b_n1] [b_k1] [b_k2] [verbose]\n\n");
    return result;
  }

  MYASSERT(m % b_m1 == 0);
  MYASSERT(n % b_n1 == 0);
  MYASSERT(k % b_k1 == 0);
  MYASSERT(m/b_m1 % bm == 0);
  MYASSERT(n/b_n1 % bn == 0);
  MYASSERT(k/b_k1/b_k2 % bk == 0);

#if defined(LIBXSMM_OFFLOAD_TARGET)
# pragma offload target(LIBXSMM_OFFLOAD_TARGET)
#endif
  {
    ITYPE* agold = (ITYPE*)libxsmm_malloc((size_t)(lda * k * sizeof(ITYPE)));
    ITYPE* bgold = (ITYPE*)libxsmm_malloc((size_t)(ldb * n * sizeof(ITYPE)));
    ITYPE* cgold = (ITYPE*)libxsmm_malloc((size_t)(ldc * n * sizeof(ITYPE)));
    ITYPE* a = (ITYPE*)libxsmm_malloc((size_t)(m * k * sizeof(ITYPE)));
    ITYPE* b = (ITYPE*)libxsmm_malloc((size_t)(k * n * sizeof(ITYPE)));
    ITYPE* c = (ITYPE*)libxsmm_malloc((size_t)(m * n * sizeof(ITYPE)));
    libxsmm_bgemm_handle* handle = 0;
    unsigned long long start;
    double duration;
    handle = libxsmm_bgemm_handle_create(
      LIBXSMM_GEMM_PRECISION(ITYPE), LIBXSMM_GEMM_PRECISION(ITYPE),
      m, n, k, &bm, &bn, &bk, &b_m1, &b_n1, &b_k1, &b_k2,
      &alpha, &beta, &gemm_flags, NULL/*auto-prefetch*/, &order);

    if (0 != handle) {
      LIBXSMM_MATINIT(ITYPE, 42, agold, m, k, lda, 1.0);
      LIBXSMM_MATINIT(ITYPE, 24, bgold, k, n, ldb, 1.0);
      LIBXSMM_MATINIT(ITYPE,  0, cgold, m, n, ldc, 1.0);
      libxsmm_bgemm_copyin_a(handle, agold, &lda, a);
      libxsmm_bgemm_copyin_b(handle, bgold, &ldb, b);
      libxsmm_bgemm_copyin_c(handle, cgold, &ldc, c);
#if defined(MKL_ENABLE_AVX512)
      mkl_enable_instructions(MKL_ENABLE_AVX512);
#endif
      /* warm-up OpenMP (populate thread pool) */
      libxsmm_bgemm_omp(handle, a, b, c, 1);
#if defined(CHECK)
      if (!LIBXSMM_FEQ(0, check)) {
        LIBXSMM_GEMM_SYMBOL(ITYPE)(&transa, &transb, &m, &n, &k, &alpha, agold, &lda, bgold, &ldb, &beta, cgold, &ldc);
      }
#endif
      if (!ab) {
      libxsmm_gemm_print(stdout, LIBXSMM_GEMM_PRECISION(ITYPE),
        &transa, &transb, &m, &n, &k, &alpha, a, &lda, b, &ldb, &beta, c, &ldc);
      fprintf(stdout, "\n\n");
      }
      start = libxsmm_timer_tick();
      libxsmm_bgemm_omp(handle, a, b, c, nrepeat);
      duration = libxsmm_timer_duration(start, libxsmm_timer_tick());
      if (0 < duration) {
        if (ab) {
          fprintf(stdout, "\tLIBXSMM: %.1f GFLOPS/s | %lli,%lli,%lli,%lli,%lli,%lli,%i,%lli,%lli,%lli,%lli\n",
            gflops * nrepeat / duration, (long long)m, (long long)n, (long long)k, (long long)bm, (long long)bn, (long long)bk,
            (int)order, (long long)b_m1, (long long)b_n1, (long long)b_k1, (long long)b_k2);
        } else {
          fprintf(stdout, "\tLIBXSMM: %.1f GFLOPS/s\n", gflops * nrepeat / duration);
        }
      }
#if defined(CHECK)
      if (!LIBXSMM_FEQ(0, check)) { /* validate result against LAPACK/BLAS xGEMM */
        ITYPE* ctest = 0;
        int i;
        start = libxsmm_timer_tick();
        for (i = 0; i < nrepeat; ++i) {
          LIBXSMM_GEMM_SYMBOL(ITYPE)(&transa, &transb, &m, &n, &k, &alpha, agold, &lda, bgold, &ldb, &beta, cgold, &ldc);
        }
        duration = libxsmm_timer_duration(start, libxsmm_timer_tick());
        if (0 < duration) {
          fprintf(stdout, "\tBLAS: %.1f GFLOPS/s\n", gflops * nrepeat / duration);
        }
        /* free memory not needed further; avoid double-free later on */
        libxsmm_free(agold); agold = 0;
        libxsmm_free(bgold); bgold = 0;
        libxsmm_free(a); a = 0;
        libxsmm_free(b); b = 0;
        /* allocate C-matrix in regular format, and perform copy-out */
        ctest = (ITYPE*)libxsmm_malloc((size_t)(ldc * n * sizeof(ITYPE)));
        if (0 != ctest) {
          libxsmm_matdiff_info diff;
          libxsmm_bgemm_copyout_c(handle, c, &ldc, ctest);
          if (EXIT_SUCCESS == libxsmm_matdiff(LIBXSMM_DATATYPE(ITYPE), m, n, cgold, ctest, &ldc, &ldc, &diff)) {
            fprintf(stdout, "\tdiff: L2abs=%f Linf=%f\n", diff.l2_abs, diff.linf_abs);
            if (check < 100.0 * diff.normf_rel) {
              fprintf(stderr, "FAILED with an error of %f%%!\n", 100.0 * diff.normf_rel);
              result = EXIT_FAILURE;
            }
          }
          libxsmm_free(ctest);
        }
      }
#endif
      libxsmm_bgemm_handle_destroy(handle);
    }
    else {
      fprintf(stderr, "FAILED to create BGEMM-handle! For details retry with LIBXSMM_VERBOSE=1.\n");
      result = EXIT_FAILURE;
    }
    libxsmm_free(agold);
    libxsmm_free(bgold);
    libxsmm_free(cgold);
    libxsmm_free(a);
    libxsmm_free(b);
    libxsmm_free(c);
  }
  if(!ab) {
    fprintf(stdout, "Finished\n");
  }
  return result;
}
示例#5
0
文件: spmdm.c 项目: hfp/libxsmm
int main(int argc, char *argv[])
{
  REAL_TYPE *A_gold, *B_gold, *A_gold2, *B_gold2;
  float *C_gold, *C0_gold, *C, *C2;

  int M, N, K;
  REAL_TYPE alpha, beta;
  int reps;

  libxsmm_spmdm_handle handle, handle2;
  libxsmm_CSR_sparseslice *A_sparse, *A_sparse2;
  int max_threads;

  /* Step 1: Read in args */
  libxsmm_timer_tickint start, end;
  double flops, duration;
  char transA, transB, transC;
  int i, j, k;
  size_t l;

  /* Step 1: Initialize handle */
  M = 0; N = 0; K = 0; alpha = (REAL_TYPE)1.0; beta = (REAL_TYPE)0.0; reps = 0; transA = 'N'; transB = 'N';

  if (argc > 1 && !strncmp(argv[1], "-h", 3)) {
    printf("\nUsage: %s [M] [N] [K] [transA] [transB] [reps]\n\n", argv[0]);
    return EXIT_SUCCESS;
  }

  /* defaults */
  M = 2048;
  N = 2048;
  K = 2048;
  transA = 'N';
  transB = 'N';
  transC = 'N';
  reps = 100;

  /* reading new values from cli */
  i = 1;
  if (argc > i) M = atoi(argv[i++]);
  if (argc > i) N = atoi(argv[i++]);
  if (argc > i) K = atoi(argv[i++]);
  if (argc > i) { transA = argv[i][0]; i++; }
  if (argc > i) { transB = argv[i][0]; i++; }
  if (argc > i) { transC = argv[i][0]; i++; }
  if (argc > i) reps = atoi(argv[i++]);

  /* Step 2: allocate data */
  A_gold  = (REAL_TYPE*)libxsmm_aligned_malloc( M*K*sizeof(REAL_TYPE), 64 );
  B_gold  = (REAL_TYPE*)libxsmm_aligned_malloc( K*N*sizeof(REAL_TYPE), 64 );
  C_gold  = (float*)libxsmm_aligned_malloc( M*N*sizeof(float), 64 );
  C0_gold = (float*)libxsmm_aligned_malloc( M*N*sizeof(float), 64 );
  C       = (float*)libxsmm_aligned_malloc( M*N*sizeof(float), 64 );

  /* Step 3: init data */
  libxsmm_rng_set_seed(1);
  for (l = 0; l < (size_t)M * (size_t)K; ++l) {
    const double r64 = libxsmm_rng_f64();
    const float r32 = (float)r64;
#ifdef USE_BFLOAT
    const int r = *(const int*)(&r32);
    const libxsmm_bfloat16 val = (r >> 16);
#else
    const float val = r32;
#endif
    if (r64 > 0.85) A_gold[l] = val;
    else              A_gold[l] = (REAL_TYPE)0.0;
  }

  for (l = 0; l < (size_t)K * (size_t)N; ++l) {
    const double r64 = libxsmm_rng_f64();
    const float r32 = (float)r64;
#ifdef USE_BFLOAT
    const int r = *(const int*)(&r32);
    const libxsmm_bfloat16 val = (r >> 16);
#else
    const float val = r32;
#endif
    B_gold[l] = val;
  }
  for (l = 0; l < (size_t)M * (size_t)N; ++l) {
    C0_gold[l] = (float)libxsmm_rng_f64();
    C_gold[l] = C0_gold[l];
  }
  for (l = 0; l < (size_t)M * (size_t)N; ++l) {
    C[l] = (float)C0_gold[l];
  }
  flops = (double)M * (double)N * (double)K * 2.0;

  /*----------------------------------------------------------------------------------------------------------------------*/
  /* Step 4: Initialize LIBXSMM for these sizes - allocates handle and temporary space for the sparse data structure for A */
# if defined(_OPENMP)
  max_threads = omp_get_max_threads();
# else
  max_threads = 1;
# endif

  start = libxsmm_timer_tick();
  libxsmm_spmdm_init(M, N, K, max_threads, &handle, &A_sparse);
  end = libxsmm_timer_tick();
  printf("Time for handle init = %f\n", libxsmm_timer_duration(start, end));

  printf(" running with: M=%i, N=%i, K=%i, bm=%i, bn=%i, bk=%i, mb=%i, nb=%i, kb=%i, reps=%i -- forward pass\n", M, N, K, handle.bm, handle.bn, handle.bk, handle.mb, handle.nb, handle.kb, reps );
  /* The overall function that takes in matrix inputs in dense format, does the conversion of A to sparse format and does the matrix multiply */
  /* Currently ignores alpha */
  /* TODO: fix alpha input */
# ifdef USE_BFLOAT
  spmdm_exec_bfloat16(&handle, transA, transB, &alpha, A_gold, B_gold, transC, &beta, C, A_sparse);
# else
  spmdm_exec_fp32(&handle, transA, transB, &alpha, A_gold, B_gold, transC, &beta, C, A_sparse);
# endif

  /* Checks */

  /* Compute a "gold" answer sequentially */
#if defined(_OPENMP)
  LIBXSMM_OMP_VAR(k);
# pragma omp parallel for private(i, j, k) LIBXSMM_OPENMP_COLLAPSE(2)
#endif
  for (i = 0; i < M; ++i) {
    for (j = 0; j < N; ++j) {
      float sum = 0.0;
      float Cval;
      for (k = 0; k < K; ++k) {
#       ifdef USE_BFLOAT
        libxsmm_bfloat16 Atmp = A_gold[i*K+k];
        int Atmp_int  = Atmp; Atmp_int <<= 16;
        float Aval = *(float *)&Atmp_int;
        libxsmm_bfloat16 Btmp = B_gold[k*N+j];
        int Btmp_int  = Btmp; Btmp_int <<= 16;
        float Bval = *(float *)&Btmp_int;
#       else
        float Aval = A_gold[i*K + k];
        float Bval = B_gold[k*N + j];
#       endif
        sum += Aval * Bval;
      }
      Cval = sum;
      C_gold[i*N + j] = Cval + beta*C_gold[i*N + j];
    }
  }
  /* LIBXSMM_FSYMBOL(sgemm)(&trans, &trans, &N, &M, &K, &alpha, B_gold, &N, A_gold, &K, &beta, C_gold, &N); */

  /* Compute the max difference between gold and computed results. */
  spmdm_check_c( &handle, C, C_gold );

  /* Timing loop starts */
  start = libxsmm_timer_tick();
  for (i = 0; i < reps; ++i) {
#   ifdef USE_BFLOAT
    spmdm_exec_bfloat16( &handle, transA, transB, &alpha, A_gold, B_gold, transC, &beta, C, A_sparse);
#   else
    spmdm_exec_fp32( &handle, transA, transB, &alpha, A_gold, B_gold, transC, &beta, C, A_sparse);
#   endif
  }
  end = libxsmm_timer_tick();
  duration = libxsmm_timer_duration(start, end);
  printf("Time = %f Time/rep = %f, TFlops/s = %f\n", duration, duration*1.0/reps, flops/1000./1000./1000./1000./duration*reps);
  libxsmm_spmdm_destroy(&handle);

  /*----------------------------------------------------------------------------------------------------------------------*/
  /* Step 5: Initialize libxsmm for transpose A - allocates handle and temporary space for the sparse data structure for A */
  transA = 'T'; transB = 'N'; transC = 'T';
  libxsmm_spmdm_init(M, N, K, max_threads, &handle2, &A_sparse2);
  printf(" running with: M=%i, N=%i, K=%i, bm=%i, bn=%i, bk=%i, mb=%i, nb=%i, kb=%i, reps=%i, transA = Y, transC = Y -- weight update\n", handle2.m, handle2.n, handle2.k, handle2.bm, handle2.bn, handle2.bk, handle2.mb, handle2.nb, handle2.kb, reps );
  A_gold2 = (REAL_TYPE*)libxsmm_aligned_malloc( M*K*sizeof(REAL_TYPE), 64 );
  C2 = (float*)libxsmm_aligned_malloc( M*N*sizeof(float), 64 );

  for (i = 0; i < M; ++i) {
    for (j = 0; j < K; ++j) {
      A_gold2[j*M + i] = A_gold[i*K + j];
    }
  }
  for (i = 0; i < M; ++i) {
    for (j = 0; j < N; ++j) {
      C[j*M + i] = (float)C0_gold[i*N + j];
    }
  }
  /* The overall function that takes in matrix inputs in dense format, does the conversion of A to sparse format and does the matrix multiply */
  /* Currently ignores alpha */
  /* TODO: fix alpha inputs */
# ifdef USE_BFLOAT
  spmdm_exec_bfloat16( &handle2, transA, transB, &alpha, A_gold2, B_gold, transC, &beta, C, A_sparse2);
# else
  spmdm_exec_fp32( &handle2, transA, transB, &alpha, A_gold2, B_gold, transC, &beta, C, A_sparse2);
# endif

  for (i = 0; i < M; ++i) {
    for (j = 0; j < N; ++j) {
      C2[i*N + j] = C[j*M + i];
    }
  }
  /* Checks */
  spmdm_check_c( &handle2, C2, C_gold);

  /* Timing loop starts */
  start = libxsmm_timer_tick();
  for (i = 0; i < reps; ++i) {
#   ifdef USE_BFLOAT
    spmdm_exec_bfloat16( &handle2, transA, transB, &alpha, A_gold2, B_gold, transC, &beta, C, A_sparse2);
#   else
    spmdm_exec_fp32( &handle2, transA, transB, &alpha, A_gold2, B_gold, transC, &beta, C, A_sparse2);
#   endif
  }
  end = libxsmm_timer_tick();
  duration = libxsmm_timer_duration(start, end);
  printf("Time = %f Time/rep = %f, TFlops/s = %f\n", duration, duration*1.0/reps, flops/1000./1000./1000./1000./duration*reps);

  /*----------------------------------------------------------------------------------------------------------------------*/
  /* Step 6: Test transpose B */
  transA = 'N'; transB = 'T'; transC = 'N';
  printf(" running with: M=%i, N=%i, K=%i, bm=%i, bn=%i, bk=%i, mb=%i, nb=%i, kb=%i, reps=%i, transB = Y -- backprop\n", handle2.m, handle2.n, handle2.k, handle2.bm, handle2.bn, handle2.bk, handle2.mb, handle2.nb, handle2.kb, reps );
  B_gold2 = (REAL_TYPE*)libxsmm_aligned_malloc( K*N*sizeof(REAL_TYPE), 64 );

  for (i = 0; i < K; ++i) {
    for (j = 0; j < N; ++j) {
      B_gold2[j*K + i] = B_gold[i*N + j];
    }
  }
  for (l = 0; l < (size_t)M * (size_t)N; ++l) {
    C[l] = (float)C0_gold[l];
  }
  /* The overall function that takes in matrix inputs in dense format, does the conversion of A to sparse format and does the matrix multiply */
  /* Currently ignores alpha */
  /* TODO: fix alpha inputs */
# ifdef USE_BFLOAT
  spmdm_exec_bfloat16( &handle2, transA, transB, &alpha, A_gold, B_gold2, transC, &beta, C, A_sparse2);
# else
  spmdm_exec_fp32( &handle2, transA, transB, &alpha, A_gold, B_gold2, transC, &beta, C, A_sparse2);
# endif

  /* Checks */
  spmdm_check_c( &handle2, C, C_gold);

  /* Timing loop starts */
  start = libxsmm_timer_tick();
  for (i = 0; i < reps; ++i) {
#   ifdef USE_BFLOAT
    spmdm_exec_bfloat16( &handle2, transA, transB, &alpha, A_gold, B_gold2, transC, &beta, C, A_sparse2);
#   else
    spmdm_exec_fp32( &handle2, transA, transB, &alpha, A_gold, B_gold2, transC, &beta, C, A_sparse2);
#   endif
  }
  end = libxsmm_timer_tick();
  duration = libxsmm_timer_duration(start, end);
  printf("Time = %f Time/rep = %f, TFlops/s = %f\n", duration, duration*1.0/reps, flops/1000./1000./1000./1000./duration*reps);
  libxsmm_spmdm_destroy(&handle2);

  libxsmm_free(A_gold);
  libxsmm_free(B_gold);
  libxsmm_free(C_gold);
  libxsmm_free(C);
  libxsmm_free(C2);
  libxsmm_free(C0_gold);
  libxsmm_free(B_gold2);
  libxsmm_free(A_gold2);

  return EXIT_SUCCESS;
}
示例#6
0
int main(int argc, char* argv[])
{
  const int ncalls = 1000000;
#if defined(_OPENMP)
  const int max_nthreads = omp_get_max_threads();
#else
  const int max_nthreads = 1;
#endif
  const int ncycles = LIBXSMM_MAX(1 < argc ? atoi(argv[1]) : 100, 1);
  const int max_nallocs = LIBXSMM_CLMP(2 < argc ? atoi(argv[2]) : 4, 1, MAX_MALLOC_N);
  const int nthreads = LIBXSMM_CLMP(3 < argc ? atoi(argv[3]) : 1, 1, max_nthreads);
  unsigned int nallocs = 0, nerrors = 0;
  int r[MAX_MALLOC_N], i;

  /* generate set of random number for parallel region */
  for (i = 0; i < (MAX_MALLOC_N); ++i) r[i] = rand();

  /* count number of calls according to randomized scheme */
  for (i = 0; i < ncycles; ++i) {
    nallocs += r[i%(MAX_MALLOC_N)] % max_nallocs + 1;
  }
  assert(0 != nallocs);

  fprintf(stdout, "Running %i cycles with max. %i malloc+free (%u calls) using %i thread%s...\n",
    ncycles, max_nallocs, nallocs, 1 >= nthreads ? 1 : nthreads, 1 >= nthreads ? "" : "s");

#if defined(LIBXSMM_OFFLOAD_TARGET)
# pragma offload target(LIBXSMM_OFFLOAD_TARGET)
#endif
  {
    const char *const longlife_env = getenv("LONGLIFE");
    const int enable_longlife = ((0 == longlife_env || 0 == *longlife_env) ? 0 : atoi(longlife_env));
    void *const longlife = (0 == enable_longlife ? 0 : malloc_offsite((MAX_MALLOC_MB) << 20));
    unsigned long long d0, d1 = 0;
    libxsmm_scratch_info info;

    /* run non-inline function to measure call overhead of an "empty" function */
    const unsigned long long t0 = libxsmm_timer_tick();
    for (i = 0; i < ncalls; ++i) {
      libxsmm_init(); /* subsequent calls are not doing any work */
    }
    d0 = libxsmm_timer_diff(t0, libxsmm_timer_tick());

#if defined(_OPENMP)
#   pragma omp parallel for num_threads(nthreads) private(i) default(none) shared(r) reduction(+:d1,nerrors)
#endif
    for (i = 0; i < ncycles; ++i) {
      const int count = r[i%(MAX_MALLOC_N)] % max_nallocs + 1;
      void* p[MAX_MALLOC_N];
      int j;
      assert(count <= MAX_MALLOC_N);
      for (j = 0; j < count; ++j) {
        const int k = (i * count + j) % (MAX_MALLOC_N);
        const size_t nbytes = (r[k] % (MAX_MALLOC_MB) + 1) << 20;
        const unsigned long long t1 = libxsmm_timer_tick();
        p[j] = libxsmm_aligned_scratch(nbytes, 0/*auto*/);
        d1 += libxsmm_timer_diff(t1, libxsmm_timer_tick());
        if (0 != p[j]) {
          memset(p[j], j, nbytes);
        }
        else {
          ++nerrors;
        }
      }
      for (j = 0; j < count; ++j) {
        libxsmm_free(p[j]);
      }
    }
    libxsmm_free(longlife);

    if (0 != d0 && 0 != d1 && 0 < nallocs) {
      const double dcalls = libxsmm_timer_duration(0, d0);
      const double dalloc = libxsmm_timer_duration(0, d1);
      const double alloc_freq = 1E-3 * nallocs / dalloc;
      const double empty_freq = 1E-3 * ncalls / dcalls;
      fprintf(stdout, "\tallocation+free calls/s: %.1f kHz\n", alloc_freq);
      fprintf(stdout, "\tempty calls/s: %.1f MHz\n", 1E-3 * empty_freq);
      fprintf(stdout, "\toverhead: %.1fx\n", empty_freq / alloc_freq);
    }

    if (EXIT_SUCCESS == libxsmm_get_scratch_info(&info) && 0 < info.size) {
      fprintf(stdout, "\nScratch: %.f MB (mallocs=%lu, pools=%u",
        1.0 * info.size / (1 << 20), (unsigned long int)info.nmallocs, info.npools);
      if (1 < nthreads) fprintf(stdout, ", threads=%i)\n", nthreads); else fprintf(stdout, ")\n");
      libxsmm_release_scratch(); /* suppress LIBXSMM's termination message about scratch */
    }
  }

  if (0 == nerrors) {
    fprintf(stdout, "Finished\n");
    return EXIT_SUCCESS;
  }
  else {
    fprintf(stdout, "FAILED (%u errors)\n", nerrors);
    return EXIT_FAILURE;
  }
}
示例#7
0
int main(void)
{
  const libxsmm_blasint m[]   = { 1, 1, 1, 1, 2, 3, 5, 5, 5, 16, 63,  16,  75, 2507 };
  const libxsmm_blasint n[]   = { 1, 7, 7, 7, 2, 3, 1, 1, 1, 16, 31, 500, 130, 1975 };
  const libxsmm_blasint ldi[] = { 1, 1, 1, 9, 2, 3, 5, 8, 8, 16, 64,  16,  87, 3000 };
  const libxsmm_blasint ldo[] = { 1, 7, 8, 8, 2, 3, 1, 1, 4, 16, 32, 512, 136, 3072 };
  const int start = 0, ntests = sizeof(m) / sizeof(*m);
  libxsmm_blasint max_size_a = 0, max_size_b = 0;
  unsigned int nerrors = 0;
  ELEM_TYPE *a = 0, *b = 0;
  int test;

  for (test = start; test < ntests; ++test) {
    const libxsmm_blasint size_a = ldi[test] * n[test], size_b = ldo[test] * m[test];
    assert(m[test] <= ldi[test] && n[test] <= ldo[test]);
    max_size_a = LIBXSMM_MAX(max_size_a, size_a);
    max_size_b = LIBXSMM_MAX(max_size_b, size_b);
  }
  a = (ELEM_TYPE*)libxsmm_malloc((size_t)(max_size_a * sizeof(ELEM_TYPE)));
  b = (ELEM_TYPE*)libxsmm_malloc((size_t)(max_size_b * sizeof(ELEM_TYPE)));
  assert(0 != a && 0 != b);

  LIBXSMM_MATINIT(ELEM_TYPE, 42, a, max_size_a, 1, max_size_a, 1.0);
  LIBXSMM_MATINIT(ELEM_TYPE,  0, b, max_size_b, 1, max_size_b, 1.0);

  for (test = start; test < ntests; ++test) {
    unsigned int testerrors = (EXIT_SUCCESS == libxsmm_otrans(
      b, a, sizeof(ELEM_TYPE), m[test], n[test],
      ldi[test], ldo[test]) ? 0 : 1);

    if (0 == testerrors) {
      libxsmm_blasint i, j;
      for (i = 0; i < n[test]; ++i) {
        for (j = 0; j < m[test]; ++j) {
          const libxsmm_blasint u = i * ldi[test] + j;
          const libxsmm_blasint v = j * ldo[test] + i;
          testerrors += (LIBXSMM_FEQ(a[u], b[v]) ? 0u : 1u);
        }
      }
    }
    if (nerrors < testerrors) {
      nerrors = testerrors;
    }
  }

  if (0 == nerrors) { /* previous results are correct and may be used to validate other tests */
    for (test = start; test < ntests; ++test) {
      /* prepare expected results in b (correct according to the previous test block) */
      libxsmm_otrans(b, a, sizeof(ELEM_TYPE), m[test], n[test], ldi[test], ldo[test]);

      if (m[test] == n[test] && ldi[test] == ldo[test]) {
        unsigned int testerrors = (EXIT_SUCCESS == libxsmm_otrans(
          a, a, sizeof(ELEM_TYPE), m[test], n[test],
          ldi[test], ldo[test]) ? 0 : 1);

        if (0 == testerrors) {
          libxsmm_blasint i, j;
          for (i = 0; i < n[test]; ++i) {
            for (j = 0; j < m[test]; ++j) {
              /* address serves both a and b since ldi and ldo are equal */
              const libxsmm_blasint uv = i * ldi[test] + j;
              testerrors += (LIBXSMM_FEQ(a[uv], b[uv]) ? 0u : 1u);
            }
          }
        }
        if (nerrors < testerrors) {
          nerrors = testerrors;
        }
      }
      else { /* negative tests */
        nerrors = LIBXSMM_MAX(EXIT_SUCCESS != libxsmm_otrans(
          a, a, sizeof(ELEM_TYPE), m[test], n[test],
          ldi[test], ldo[test]) ? 0u : 1u, nerrors);
      }
    }
  }

  libxsmm_free(a);
  libxsmm_free(b);

  if (0 == nerrors) {
    return EXIT_SUCCESS;
  }
  else {
# if defined(_DEBUG)
    fprintf(stderr, "errors=%u\n", nerrors);
# endif
    return EXIT_FAILURE;
  }
}
示例#8
0
int main(int argc, char* argv[])
{
  const char t = (char)(1 < argc ? *argv[1] : 'o');
  const libxsmm_blasint m = (2 < argc ? atoi(argv[2]) : 4096);
#if 0 /* TODO: enable when in-place transpose is fully supported */
  const libxsmm_blasint n = (3 < argc ? atoi(argv[3]) : m);
#else
  const libxsmm_blasint n = (3 < argc ? (('o' == t || 'O' == t) ? atoi(argv[3]) : m) : m);
#endif
  const libxsmm_blasint ldi = LIBXSMM_MAX/*sanitize ld*/(4 < argc ? atoi(argv[4]) : 0, m);
  const libxsmm_blasint ldo = LIBXSMM_MAX/*sanitize ld*/(5 < argc ? atoi(argv[5]) : 0, n);
  const int r = (6 < argc ? atoi(argv[6]) : 0), s = LIBXSMM_ABS(r);
  const libxsmm_blasint lower = (7 < argc ? atoi(argv[7]) : 0);
  libxsmm_blasint km = m, kn = n, kldi = ldi, kldo = (('o' == t || 'O' == t) ? ldo : ldi);
  int result = EXIT_SUCCESS, k;

  if (0 == strchr("oOiI", t)) {
    fprintf(stderr, "%s [<transpose-kind:o|i>] [<m>] [<n>] [<ld-in>] [<ld-out>] [random:0|nruns] [lbound]\n", argv[0]);
    exit(EXIT_FAILURE);
  }

#if defined(LIBXSMM_OFFLOAD_TARGET)
# pragma offload target(LIBXSMM_OFFLOAD_TARGET)
#endif
  {
    const char *const env_tasks = getenv("TASKS"), *const env_check = getenv("CHECK");
    const int tasks = (0 == env_tasks || 0 == *env_tasks) ? 0/*default*/ : atoi(env_tasks);
    const int check = (0 == env_check || 0 == *env_check) ? 1/*default*/ : atoi(env_check);
    ELEM_TYPE *const a = (ELEM_TYPE*)libxsmm_malloc((size_t)(ldi * (('o' == t || 'O' == t) ? n : ldo) * sizeof(ELEM_TYPE)));
    ELEM_TYPE *const b = (ELEM_TYPE*)libxsmm_malloc((size_t)(ldo * (('o' == t || 'O' == t) ? m : ldi) * sizeof(ELEM_TYPE)));
    libxsmm_timer_tickint start, duration = 0;
#if defined(USE_REFERENCE) /* benchmark against a reference */
    libxsmm_timer_tickint duration2 = 0;
#endif
    libxsmm_blasint i;
    size_t size = 0;
#if defined(MKL_ENABLE_AVX512)
    mkl_enable_instructions(MKL_ENABLE_AVX512);
#endif
    fprintf(stdout, "m=%lli n=%lli ldi=%lli ldo=%lli size=%.fMB (%s, %s)\n",
      (long long)m, (long long)n, (long long)ldi, (long long)ldo,
      1.0 * (m * n * sizeof(ELEM_TYPE)) / (1 << 20), LIBXSMM_STRINGIFY(ELEM_TYPE),
      ('o' == t || 'O' == t) ? "out-of-place" : "in-place");

#if defined(_OPENMP)
#   pragma omp parallel for private(i)
#endif
    for (i = 0; i < n; ++i) {
      libxsmm_blasint j;
      for (j = 0; j < m; ++j) {
        a[i*ldi+j] = initial_value(i, j, m);
      }
    }

    if (0 != check) { /* repeatable (reference) */
      srand(RAND_SEED);
    }
    else { /* randomized selection */
      srand(libxsmm_timer_tick() % ((unsigned int)-1));
    }
    for (k = (0 == r ? -1 : 0); k < s && EXIT_SUCCESS == result; ++k) {
      if (0 < r) {
        const libxsmm_blasint rldi = 0 <= lower ? randstart(lower, ldi) : 0;
        km = randstart(LIBXSMM_ABS(lower), m);
        kldi = LIBXSMM_MAX(rldi, km);
        if (('o' == t || 'O' == t)) {
          const libxsmm_blasint rldo = 0 <= lower ? randstart(lower, ldo) : 0;
          kn = randstart(LIBXSMM_ABS(lower), n);
          kldo = LIBXSMM_MAX(rldo, kn);
          /* trigger JIT-generated code */
          OTRANS(b, a, sizeof(ELEM_TYPE), km, kn, kldi, kldo);
        }
        else {
#if 0 /* TODO: enable when in-place transpose is fully supported */
          kn = randstart(LIBXSMM_ABS(lower), n);
#else
          kn = km;
#endif
          kldo = kldi;
          /* trigger JIT-generated code */
          ITRANS(b, sizeof(ELEM_TYPE), km, kn, kldi);
        }
      }
      size += (size_t)(km * kn * sizeof(ELEM_TYPE));

      if (('o' == t || 'O' == t)) {
        if (0 == tasks) { /* library-internal parallelization */
          start = libxsmm_timer_tick();
#if defined(OTRANS_THREAD)
#         pragma omp parallel
          OTRANS_THREAD(b, a, sizeof(ELEM_TYPE), km, kn, kldi, kldo, omp_get_thread_num(), omp_get_num_threads());
#else
          result = OTRANS(b, a, sizeof(ELEM_TYPE), km, kn, kldi, kldo);
#endif
          duration += libxsmm_timer_diff(start, libxsmm_timer_tick());
        }
        else { /* external parallelization */
          start = libxsmm_timer_tick();
#if defined(_OPENMP)
#         pragma omp parallel
#         pragma omp single nowait
#endif
          result = OTRANS(b, a, sizeof(ELEM_TYPE), km, kn, kldi, kldo);
          duration += libxsmm_timer_diff(start, libxsmm_timer_tick());
        }
      }
      else {
        assert(('i' == t || 'I' == t) && kldo == kldi);
        memcpy(b, a, (size_t)(kldi * kn * sizeof(ELEM_TYPE)));

        if (2 > tasks) { /* library-internal parallelization */
          start = libxsmm_timer_tick();
          result = ITRANS(b, sizeof(ELEM_TYPE), km, kn, kldi);
          duration += libxsmm_timer_diff(start, libxsmm_timer_tick());
        }
        else { /* external parallelization */
          start = libxsmm_timer_tick();
#if defined(_OPENMP)
#         pragma omp parallel
#         pragma omp single
#endif
          result = ITRANS(b, sizeof(ELEM_TYPE), km, kn, kldi);
          duration += libxsmm_timer_diff(start, libxsmm_timer_tick());
        }
      }
      if (0 != check) { /* check */
        for (i = 0; i < km; ++i) {
          libxsmm_blasint j;
          for (j = 0; j < kn; ++j) {
            const ELEM_TYPE u = b[i*kldo+j];
            const ELEM_TYPE v = a[j*kldi+i];
            if (LIBXSMM_NEQ(u, v)) {
              i += km; /* leave outer loop as well */
              result = EXIT_FAILURE;
              break;
            }
          }
        }
      }
    }

#if defined(USE_REFERENCE)
    if (0 < check) { /* check shall imply reference (performance-)test */
      srand(RAND_SEED); /* reproduce the same sequence as above */
      for (k = (0 == r ? -1 : 0); k < s && EXIT_SUCCESS == result; ++k) {
        if (0 < r) {
          const libxsmm_blasint rldi = 0 <= lower ? randstart(lower, ldi) : 0;
          km = randstart(LIBXSMM_ABS(lower), m);
          kldi = LIBXSMM_MAX(rldi, km);
          if (('o' == t || 'O' == t)) {
            const libxsmm_blasint rldo = 0 <= lower ? randstart(lower, ldo) : 0;
            kn = randstart(LIBXSMM_ABS(lower), n);
            kldo = LIBXSMM_MAX(rldo, kn);
          }
          else {
#if 0 /* TODO: enable when in-place transpose is fully supported */
            kn = randstart(LIBXSMM_ABS(lower), n);
#else
            kn = km;
#endif
            kldo = kldi;
          }
        }

        if (('o' == t || 'O' == t)) {
          start = libxsmm_timer_tick();
          OTRANS_GOLD(&km, &kn, a, &kldi, b, &kldo);
          duration2 += libxsmm_timer_diff(start, libxsmm_timer_tick());
        }
        else {
          assert(('i' == t || 'I' == t) && kldo == kldi);
          memcpy(b, a, (size_t)(kldi * kn * sizeof(ELEM_TYPE)));
          start = libxsmm_timer_tick();
          ITRANS_GOLD(&km, &kn, b, &kldi, &kldo);
          duration2 += libxsmm_timer_diff(start, libxsmm_timer_tick());
        }
        if (1 < check || 0 > check) { /* check */
          for (i = 0; i < km; ++i) {
            libxsmm_blasint j;
            for (j = 0; j < kn; ++j) {
              const ELEM_TYPE u = b[i*kldo+j];
              const ELEM_TYPE v = a[j*kldi+i];
              if (LIBXSMM_NEQ(u, v)) {
                i += km; /* leave outer loop as well */
                result = EXIT_FAILURE;
                break;
              }
            }
          }
        }
      }
    }
#endif

    if (EXIT_SUCCESS == result) {
      const double d = libxsmm_timer_duration(0, duration);
      if (0 < duration) {
        /* out-of-place transpose bandwidth assumes RFO */
        fprintf(stdout, "\tbandwidth: %.1f GB/s\n", size
          * ((('o' == t || 'O' == t)) ? 3 : 2) / (d * (1 << 30)));
      }
      if (0 == lower) {
        fprintf(stdout, "\tduration: %.0f ms\n", 1000.0 * (d / (0 == r ? (s + 1) : s)));
      }
      else {
        fprintf(stdout, "\tduration: %f ms\n", 1000.0 * d);
      }
#if defined(USE_REFERENCE)
      if (0 < duration2) {
        fprintf(stdout, "\treference: %.1fx\n", (1.0 * duration) / duration2);
      }
#endif
    }
    else if (0 != check) { /* check */
      fprintf(stderr, "Error: validation failed for m=%lli, n=%lli, ldi=%lli, and ldo=%lli!\n",
        (long long)km, (long long)kn, (long long)kldi, (long long)kldo);
    }

    libxsmm_free(a);
    libxsmm_free(b);
  }
  return result;
}