예제 #1
0
/**
 * Adds ntry and ncol to the statistic counters selected by the given descriptor.
 * The counter row is chosen by the LIBXSMM_GEMM_FLAG_F32PREC flag (1 if set, 0 otherwise),
 * the counter column by comparing the problem size LIBXSMM_MNK_SIZE(m, n, k) against the
 * cubes of internal_statistic_sml and internal_statistic_med (0: small, 1: medium, 2: big).
 * The descriptor must not be NULL (asserted).
 */
LIBXSMM_INLINE LIBXSMM_RETARGETABLE void internal_update_statistic(const libxsmm_gemm_descriptor* desc,
  unsigned ntry, unsigned ncol)
{
  assert(0 != desc);
  {
    const unsigned long long size = LIBXSMM_MNK_SIZE(desc->m, desc->n, desc->k);
    const unsigned int sml_extent = internal_statistic_sml;
    const int prec = (0 != (LIBXSMM_GEMM_FLAG_F32PREC & desc->flags) ? 1 : 0);
    int size_class = 0/*small*/;

    /* the medium threshold is only consulted if the problem exceeds the small threshold */
    if (LIBXSMM_MNK_SIZE(sml_extent, sml_extent, sml_extent) < size) {
      const unsigned int med_extent = internal_statistic_med;
      size_class = (LIBXSMM_MNK_SIZE(med_extent, med_extent, med_extent) < size ? 2/*big*/ : 1/*medium*/);
    }

    /* relaxed ordering: the counters are independent of each other */
    LIBXSMM_ATOMIC_ADD_FETCH(&internal_statistic[prec][size_class].ntry, ntry, LIBXSMM_ATOMIC_RELAXED);
    LIBXSMM_ATOMIC_ADD_FETCH(&internal_statistic[prec][size_class].ncol, ncol, LIBXSMM_ATOMIC_RELAXED);
  }
}
예제 #2
0
/**
 * Stand-in definition of the double-precision GEMM symbol which does not compute anything.
 * Unless NDEBUG is defined, a message is written to stderr when the call counter reaches one;
 * all arguments are ignored.
 */
void LIBXSMM_FSYMBOL(dgemm)(LIBXSMM_GEMM_CONST char* transa, LIBXSMM_GEMM_CONST char* transb,
  LIBXSMM_GEMM_CONST libxsmm_blasint* m, LIBXSMM_GEMM_CONST libxsmm_blasint* n, LIBXSMM_GEMM_CONST libxsmm_blasint* k,
  LIBXSMM_GEMM_CONST double* alpha, LIBXSMM_GEMM_CONST double* a, LIBXSMM_GEMM_CONST libxsmm_blasint* lda,
  LIBXSMM_GEMM_CONST double* b, LIBXSMM_GEMM_CONST libxsmm_blasint* ldb,
  LIBXSMM_GEMM_CONST double* beta, double* c, LIBXSMM_GEMM_CONST libxsmm_blasint* ldc)
{
#if !defined(NDEBUG) /* library code is expected to be mute */
  { /* count the calls atomically and complain only when the counter becomes one */
    static int error_once = 0;
    const int ncalls = LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED);
    if (1 == ncalls) {
      fprintf(stderr, "LIBXSMM ERROR: application must be linked against LAPACK/BLAS!\n");
    }
  }
#endif
  /* none of the arguments is consumed by this stub */
  LIBXSMM_UNUSED(transa);
  LIBXSMM_UNUSED(transb);
  LIBXSMM_UNUSED(m);
  LIBXSMM_UNUSED(n);
  LIBXSMM_UNUSED(k);
  LIBXSMM_UNUSED(alpha);
  LIBXSMM_UNUSED(a);
  LIBXSMM_UNUSED(lda);
  LIBXSMM_UNUSED(b);
  LIBXSMM_UNUSED(ldb);
  LIBXSMM_UNUSED(beta);
  LIBXSMM_UNUSED(c);
  LIBXSMM_UNUSED(ldc);
}
예제 #3
0
/**
 * Stand-in definition of the single-precision GEMM symbol which does not compute anything.
 * Unless NDEBUG is defined, a message is written to stderr when the call counter reaches one;
 * all arguments are ignored.
 */
void LIBXSMM_FSYMBOL(sgemm)(
  const char* transa, const char* transb,
  const libxsmm_blasint* m, const libxsmm_blasint* n, const libxsmm_blasint* k,
  const float* alpha, const float* a, const libxsmm_blasint* lda,
  const float* b, const libxsmm_blasint* ldb,
  const float* beta, float* c, const libxsmm_blasint* ldc)
{
#if !defined(NDEBUG) /* library code is expected to be mute */
  { /* count the calls atomically and complain only when the counter becomes one */
    static int error_once = 0;
    const int ncalls = LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED);
    if (1 == ncalls) {
      fprintf(stderr, "LIBXSMM: application must be linked against a LAPACK/BLAS implementation!\n");
    }
  }
#endif
  /* none of the arguments is consumed by this stub */
  LIBXSMM_UNUSED(transa);
  LIBXSMM_UNUSED(transb);
  LIBXSMM_UNUSED(m);
  LIBXSMM_UNUSED(n);
  LIBXSMM_UNUSED(k);
  LIBXSMM_UNUSED(alpha);
  LIBXSMM_UNUSED(a);
  LIBXSMM_UNUSED(lda);
  LIBXSMM_UNUSED(b);
  LIBXSMM_UNUSED(ldb);
  LIBXSMM_UNUSED(beta);
  LIBXSMM_UNUSED(c);
  LIBXSMM_UNUSED(ldc);
}
예제 #4
0
/**
 * Calls libxsmm_bgemm(handle, a, b, c, tid, nthreads) count-times from every thread of an
 * OpenMP parallel region (or from the calling thread if built without OpenMP), with a
 * barrier after each repetition.
 *
 * handle: passed through to libxsmm_bgemm (not inspected here).
 * a, b, c: matrix operands; if any of them is NULL, nothing is computed.
 * count: number of repetitions; zero is a silent no-op, a negative value is an error.
 *
 * Errors are reported on stderr only if libxsmm_get_verbosity() is non-zero, and only when
 * the shared error counter reaches one (both messages share the same counter).
 */
LIBXSMM_APIEXT void libxsmm_bgemm_omp(const libxsmm_bgemm_handle* handle,
  const void* a, const void* b, void* c, /*unsigned*/int count)
{
  /* shared by both error messages below */
  static int error_once = 0;
  if (0 < count) {
    if (0 != a && 0 != b && 0 != c) {
#if !defined(_OPENMP)
      const int nthreads = 1;
#else
      /* NOTE(review): the team size is taken before the parallel region is entered; this
       * presumes the region actually runs with omp_get_max_threads() threads - confirm
       * (omp_get_num_threads() inside the region would report the actual team size).
       */
      const int nthreads = omp_get_max_threads();
# if defined(LIBXSMM_BGEMM_BARRIER)
      /* NOTE(review): "barrier" is only declared in the OpenMP build, but it is used below
       * whenever LIBXSMM_BGEMM_BARRIER is defined - presumably both are always defined
       * together; verify against the build configuration.
       */
      libxsmm_barrier* barrier = 0;
      /* make an informed guess about the number of threads per core */
      if (224 <= nthreads
#   if !defined(__MIC__)
        && LIBXSMM_X86_AVX512_MIC <= libxsmm_target_archid
        && LIBXSMM_X86_AVX512_CORE > libxsmm_target_archid
#   endif
        )
      {
        /* assume four threads per core */
        barrier = libxsmm_barrier_create(nthreads / 4, 4);
      }
      else {
        /* assume two threads per core */
        barrier = libxsmm_barrier_create(nthreads / 2, 2);
      }
# endif /*defined(LIBXSMM_BGEMM_BARRIER)*/
#     pragma omp parallel
#endif /*defined(_OPENMP)*/
      {
        /* tid remains zero if built without OpenMP */
        int tid = 0, i;
#if defined(_OPENMP)
        tid = omp_get_thread_num();
#endif
        assert(tid < nthreads);
#if defined(LIBXSMM_BGEMM_BARRIER)
        libxsmm_barrier_init(barrier, tid);
#endif
        for (i = 0; i < count; ++i) {
          libxsmm_bgemm(handle, a, b, c, tid, nthreads);
          /* all threads complete a repetition before the next one starts */
#if defined(LIBXSMM_BGEMM_BARRIER)
          libxsmm_barrier_wait(barrier, tid);
#elif defined(_OPENMP)
#         pragma omp barrier
#endif
        }
      }
#if defined(LIBXSMM_BGEMM_BARRIER)
      libxsmm_barrier_release(barrier);
#endif
    }
    else if (0 != libxsmm_get_verbosity() /* library code is expected to be mute */
          && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED))
    {
      fprintf(stderr, "LIBXSMM ERROR: BGEMM matrix-operands cannot be NULL!\n");
    }
  }
  else if (0 > count && 0 != libxsmm_get_verbosity() /* library code is expected to be mute */
        && 1 == LIBXSMM_ATOMIC_ADD_FETCH(&error_once, 1, LIBXSMM_ATOMIC_RELAXED))
  {
    fprintf(stderr, "LIBXSMM ERROR: BGEMM count-argument cannot be negative!\n");
  }
}
예제 #5
0
/**
 * Initializes the library once and returns the code registry.
 *
 * The work is serialized either by acquiring all INTERNAL_REGLOCK_COUNT registry locks
 * (which are set up here on first use) or, if LIBXSMM_OPENMP is defined, by an OpenMP
 * critical section. If internal_registry is already set, it is returned as-is. Otherwise:
 *  - the target architecture is set from the environment variable LIBXSMM_TARGET,
 *  - the prefetch strategy is selected (LIBXSMM_PREFETCH: 2..9 select a strategy, any
 *    other value selects LIBXSMM_PREFETCH_NONE; unset/empty selects the build default),
 *  - the hash, gemm-diff, and gemm sub-components are initialized,
 *  - if __TRACE is defined, tracing is initialized from LIBXSMM_TRACE, which is parsed as
 *    "threadid,mindepth,maxnsyms",
 *  - the registry and its keys are allocated, the verbosity is taken from LIBXSMM_VERBOSE,
 *    statically generated code is registered (LIBXSMM_BUILD), libxsmm_finalize is
 *    registered with atexit, and the registry is published.
 *
 * Returns the registry, or NULL if a sub-component failed to initialize or if the
 * registry could not be allocated (debug builds assert on a NULL result).
 */
LIBXSMM_INLINE LIBXSMM_RETARGETABLE internal_code_type* internal_init(void)
{
  /*const*/internal_code_type* result;
  int i;
#if !defined(LIBXSMM_OPENMP)
# if !defined(LIBXSMM_NOSYNC)
  static int internal_reglock_check = 1; /* setup the locks in a thread-safe fashion */
  assert(sizeof(internal_reglock) == (INTERNAL_REGLOCK_COUNT * sizeof(*internal_reglock)));
  if (1 == LIBXSMM_ATOMIC_LOAD(&internal_reglock_check, LIBXSMM_ATOMIC_SEQ_CST)) {
    /* Decide on the value returned by the increment itself: re-reading the counter could
     * observe the increment of a racing thread, in which case no thread would see "2",
     * the locks would never be initialized, and every thread would spin forever.
     */
    if (2 == LIBXSMM_ATOMIC_ADD_FETCH(&internal_reglock_check, 1, LIBXSMM_ATOMIC_SEQ_CST)) {
      for (i = 0; i < INTERNAL_REGLOCK_COUNT; ++i) LIBXSMM_LOCK_INIT(internal_reglock + i);
      LIBXSMM_ATOMIC_STORE_ZERO(&internal_reglock_check, LIBXSMM_ATOMIC_SEQ_CST);
    }
  }
  /* wait until locks are initialized (atomic load: a plain read may be hoisted out of the loop) */
  while (0 != LIBXSMM_ATOMIC_LOAD(&internal_reglock_check, LIBXSMM_ATOMIC_SEQ_CST));
  for (i = 0; i < INTERNAL_REGLOCK_COUNT; ++i) LIBXSMM_LOCK_ACQUIRE(internal_reglock + i);
# endif
#else
# pragma omp critical(internal_reglock)
#endif
  {
    result = LIBXSMM_ATOMIC_LOAD(&internal_registry, LIBXSMM_ATOMIC_SEQ_CST);
    if (0 == result) {
      int init_code;
      /* set internal_target_archid */
      libxsmm_set_target_arch(getenv("LIBXSMM_TARGET"));
      { /* select prefetch strategy for JIT */
        const char *const env_prefetch = getenv("LIBXSMM_PREFETCH");
        if (0 == env_prefetch || 0 == *env_prefetch) {
#if (0 > LIBXSMM_PREFETCH) /* permitted by LIBXSMM_PREFETCH_AUTO */
          internal_prefetch = (LIBXSMM_X86_AVX512_MIC != internal_target_archid
            ? LIBXSMM_PREFETCH_NONE : LIBXSMM_PREFETCH_AL2BL2_VIA_C);
#else
          internal_prefetch = LIBXSMM_MAX(INTERNAL_PREFETCH, 0);
#endif
        }
        else { /* user input considered even if LIBXSMM_PREFETCH_AUTO is disabled */
          switch (atoi(env_prefetch)) {
            case 2:  internal_prefetch = LIBXSMM_PREFETCH_SIGONLY; break;
            case 3:  internal_prefetch = LIBXSMM_PREFETCH_BL2_VIA_C; break;
            case 4:  internal_prefetch = LIBXSMM_PREFETCH_AL2; break;
            case 5:  internal_prefetch = LIBXSMM_PREFETCH_AL2_AHEAD; break;
            case 6:  internal_prefetch = LIBXSMM_PREFETCH_AL2BL2_VIA_C; break;
            case 7:  internal_prefetch = LIBXSMM_PREFETCH_AL2BL2_VIA_C_AHEAD; break;
            case 8:  internal_prefetch = LIBXSMM_PREFETCH_AL2_JPST; break;
            case 9:  internal_prefetch = LIBXSMM_PREFETCH_AL2BL2_VIA_C_JPST; break;
            default: internal_prefetch = LIBXSMM_PREFETCH_NONE;
          }
        }
      }
      libxsmm_hash_init(internal_target_archid);
      libxsmm_gemm_diff_init(internal_target_archid);
      init_code = libxsmm_gemm_init(internal_target_archid, internal_prefetch);
#if defined(__TRACE)
      {
        int filter_threadid = 0, filter_mindepth = 1, filter_maxnsyms = 0;
        const char *const env_trace_init = getenv("LIBXSMM_TRACE");
        if (EXIT_SUCCESS == init_code && 0 != env_trace_init && 0 != *env_trace_init) {
          /* the field width given to scanf excludes the terminating null character,
           * hence at most 31 characters fit into a buffer of 32 bytes
           */
          char buffer[32];
          if (1 == sscanf(env_trace_init, "%31[^,],", buffer)) {
            sscanf(buffer, "%i", &filter_threadid);
          }
          if (1 == sscanf(env_trace_init, "%*[^,],%31[^,],", buffer)) {
            sscanf(buffer, "%i", &filter_mindepth);
          }
          if (1 == sscanf(env_trace_init, "%*[^,],%*[^,],%31s", buffer)) {
            sscanf(buffer, "%i", &filter_maxnsyms);
          }
          else {
            filter_maxnsyms = -1; /* all */
          }
        }
        init_code = libxsmm_trace_init(filter_threadid - 1, filter_mindepth, filter_maxnsyms);
      }
#endif
      if (EXIT_SUCCESS == init_code) {
        assert(0 == internal_registry_keys && 0 == internal_registry); /* should never happen */
        result = (internal_code_type*)libxsmm_malloc(LIBXSMM_REGSIZE * sizeof(internal_code_type));
        internal_registry_keys = (internal_regkey_type*)libxsmm_malloc(LIBXSMM_REGSIZE * sizeof(internal_regkey_type));
        if (0 != result && 0 != internal_registry_keys) {
          const char *const env_verbose = getenv("LIBXSMM_VERBOSE");
          /* cube root of the maximum problem size, rounded to the nearest integer */
          internal_statistic_mnk = (unsigned int)(pow((double)(LIBXSMM_MAX_MNK), 0.3333333333333333) + 0.5);
          internal_statistic_sml = 13; internal_statistic_med = 23;
          if (0 != env_verbose && 0 != *env_verbose) {
            internal_verbose_mode = atoi(env_verbose);
          }
#if !defined(NDEBUG)
          else {
            internal_verbose_mode = 1; /* quiet -> verbose */
          }
#endif
          for (i = 0; i < LIBXSMM_REGSIZE; ++i) result[i].pmm = 0;
          /* omit registering code if JIT is enabled and if an ISA extension is found
           * which is beyond the static code path used to compile the library
           */
#if defined(LIBXSMM_BUILD)
# if (0 != LIBXSMM_JIT) && !defined(__MIC__)
          /* check if target arch. permits execution (arch. may be overridden) */
          if (LIBXSMM_STATIC_TARGET_ARCH <= internal_target_archid &&
             (LIBXSMM_X86_AVX > internal_target_archid /* jit is not available */
              /* condition allows to avoid JIT (if static code is good enough) */
           || LIBXSMM_STATIC_TARGET_ARCH == internal_target_archid))
# endif
          { /* opening a scope for eventually declaring variables */
            /* setup the dispatch table for the statically generated code */
#           include <libxsmm_dispatch.h>
          }
#endif
          atexit(libxsmm_finalize);
          /* publish the registry only after it is completely set up */
          LIBXSMM_ATOMIC_STORE(&internal_registry, result, LIBXSMM_ATOMIC_SEQ_CST);
        }
        else {
#if !defined(NDEBUG) && defined(__TRACE) /* library code is expected to be mute */
          fprintf(stderr, "LIBXSMM: failed to allocate code registry!\n");
#endif
          libxsmm_free(internal_registry_keys);
          libxsmm_free(result);
          /* do not keep or return pointers to released memory */
          internal_registry_keys = 0;
          result = 0;
        }
      }
#if !defined(NDEBUG) && defined(__TRACE) /* library code is expected to be mute */
      else {
        fprintf(stderr, "LIBXSMM: failed to initialize sub-component (error #%i)!\n", init_code);
      }
#endif
    }
  }
#if !defined(LIBXSMM_OPENMP) && !defined(LIBXSMM_NOSYNC) /* release locks */
  for (i = 0; i < INTERNAL_REGLOCK_COUNT; ++i) LIBXSMM_LOCK_RELEASE(internal_reglock + i);
#endif
  assert(result);
  return result;
}
예제 #6
0
/**
 * Determines the name of a function on the current call stack.
 *
 * depth: optional (may be NULL); on input, *depth selects a frame further up the stack
 *   and raises the minimum stack depth required; on output, it receives the number of
 *   frames found beyond that minimum.
 * threadid: optional (may be NULL); receives the zero-based id assigned to the calling
 *   thread by this facility.
 * filter_threadid, filter_mindepth, filter_maxnsyms: optional (may be NULL); override the
 *   settings internal_trace_threadid, internal_trace_mindepth, and internal_trace_maxnsyms.
 *   A negative thread filter accepts all threads.
 *
 * Returns the symbol name (or the address of the symbol printed as hexadecimal number as
 * long as no name was ever resolved for the calling thread), or NULL if LIBXSMM_TRACE is
 * not defined, if tracing is not initialized, if the call is filtered out or re-entered,
 * or if the per-thread buffer cannot be set up. The result points into a per-thread
 * buffer, which is overwritten by the next call made by the same thread.
 */
LIBXSMM_API
#if defined(_WIN32)
/*TODO: no inline*/
#elif defined(__GNUC__)
/*LIBXSMM_ATTRIBUTE(noinline)*/
#endif
const char* libxsmm_trace_info(unsigned int* depth, unsigned int* threadid, const int* filter_threadid, const int* filter_mindepth, const int* filter_maxnsyms)
{
  const char *fname = NULL;
#if defined(LIBXSMM_TRACE)
  const int max_n = (0 != depth ? (LIBXSMM_TRACE_MAXDEPTH) : 2);
  const int min_n = (0 != depth ? (LIBXSMM_TRACE_MINDEPTH + *depth) : 2);
  void *stacktrace[LIBXSMM_TRACE_MAXDEPTH], **symbol = stacktrace + LIBXSMM_MIN(0 != depth ? ((int)(*depth + 1)) : 1, max_n - 1);
  static LIBXSMM_TLS int cerberus = 0;
  int i;

  /* check against entering a recursion (recursion should not happen due to
   * attribute "no_instrument_function" but better prevent this in any case)
   */
  if (0 == cerberus) {
    ++cerberus;
# if defined(__GNUC__)
    __asm__("");
# endif
    i = LIBXSMM_ATOMIC_LOAD(&internal_trace_initialized, LIBXSMM_ATOMIC_RELAXED);
    if (0 <= i) { /* do nothing if not yet initialized */
      const int mindepth = (0 != filter_mindepth ? *filter_mindepth : internal_trace_mindepth);
      const int maxnsyms = (0 != filter_maxnsyms ? *filter_maxnsyms : internal_trace_maxnsyms);
      i = libxsmm_backtrace(stacktrace, max_n);
      /* filter depth against filter_mindepth and filter_maxnsyms */
      if ((0 >= mindepth ||      (min_n + mindepth) <= i) &&
          (0 >  maxnsyms || i <= (min_n + mindepth + maxnsyms - 1)))
      {
        if (min_n <= i) { /* check against min. depth */
          const int filter = (0 != filter_threadid ? *filter_threadid : internal_trace_threadid);
          int abs_tid = 0;
# if defined(_WIN32) || defined(__CYGWIN__)
          static LIBXSMM_TLS char buffer[sizeof(SYMBOL_INFO)+LIBXSMM_TRACE_SYMBOLSIZE];
          static LIBXSMM_TLS int tid = 0;

          PSYMBOL_INFO value = (PSYMBOL_INFO)buffer;
          value->SizeOfStruct = sizeof(SYMBOL_INFO);
          value->MaxNameLen = LIBXSMM_TRACE_SYMBOLSIZE - 1;

          if (0 != tid) {
            abs_tid = (0 <= tid ? tid : -tid);
          }
          else { /* first call made by this thread: assign an id */
            abs_tid = LIBXSMM_ATOMIC_ADD_FETCH(&internal_trace_initialized, 1, LIBXSMM_ATOMIC_RELAXED);
            /* use sign bit to flag enabled fall-back for symbol resolution */
            tid = -abs_tid;
          }

          assert(0 < abs_tid);
          if (0 > filter || filter == abs_tid - 1) {
            if (FALSE != SymFromAddr(GetCurrentProcess(), (DWORD64)*symbol, NULL, value)
              && 0 < value->NameLen)
            {
              /* disable fall-back allowing unresolved symbol names */
              tid = abs_tid; /* make unsigned */
              fname = value->Name;
            }
            else if (0 > tid) { /* fall-back allowing unresolved symbol names */
#   if defined(__MINGW32__)
              sprintf(buffer, "%p", *symbol);
#   else
              sprintf(buffer, "0x%" PRIxPTR, (uintptr_t)*symbol);
#   endif
              fname = buffer;
            }
            if (depth) *depth = i - min_n;
            if (threadid) *threadid = abs_tid - 1;
          }
# else
          /* Layout of the per-thread buffer: [0] file descriptor, [1] thread id (negative
           * as long as the fall-back is enabled), followed by the text of the symbol.
           */
#   if defined(LIBXSMM_NO_SYNC)
          /* NOTE(review): raw_c is a single byte but is accessed below as an array of
           * integers followed by text - this configuration looks unusable; confirm.
           */
          static char raw_c;
          char */*const*/ raw_value = &raw_c; /* const: avoid warning (below / constant control-flow) */
#   else
          char *const raw_value = (char*)pthread_getspecific(internal_trace_key);
#   endif
          int* ivalue = 0, fd = -1;
          char* value = 0;

          if (raw_value) { /* buffer of this thread is already set up */
            ivalue = (int*)raw_value;
            abs_tid = (0 <= ivalue[1] ? ivalue[1] : -ivalue[1]);

            if (0 > filter || filter == abs_tid - 1) {
              fd = ivalue[0];
              /* rewind the file such that the symbol is written right behind the header */
              if (0 <= fd && (sizeof(int) * 2) == lseek(fd, sizeof(int) * 2, SEEK_SET)) {
                value = raw_value + sizeof(int) * 2;
              }
#   if !defined(NDEBUG) /* library code is expected to be mute */
              else {
                fprintf(stderr, "LIBXSMM ERROR: failed to get buffer\n");
              }
#   endif
            }
          }
          else { /* first call made by this thread: create a file-backed buffer */
            char filename[] = "/tmp/.libxsmm_XXXXXX.map";
#if defined(__GLIBC__) && defined(__GLIBC_MINOR__) && LIBXSMM_VERSION2(2, 19) <= LIBXSMM_VERSION2(__GLIBC__, __GLIBC_MINOR__)
            fd = mkstemps(filename, 4/*.map*/);
#else
            /* mkstemp requires the template to end with the placeholder: cut-off the suffix */
            char *const xpos = strrchr(filename, 'X');
            const char c = (char)(NULL != xpos ? *(xpos + 1) : 0);
            if (0 != c) {
              xpos[1] = 0;
              fd = mkstemp(filename);
              xpos[1] = c;
            }
            else {
              fd = -1;
            }
#endif
            if (0 <= fd && 0 == posix_fallocate(fd, 0, LIBXSMM_TRACE_SYMBOLSIZE)) {
              char *const buffer = (char*)mmap(NULL, LIBXSMM_TRACE_SYMBOLSIZE,
                PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);

              if (MAP_FAILED != buffer) {
                int check = -1;
                ivalue = (int*)buffer;
                ivalue[0] = fd; /* valid file descriptor for internal_delete */

                /* register the buffer, and check that the mapping is backed by the file
                 * (reads back the descriptor stored above), which also positions the
                 * file right behind the header
                 */
                if (
#   if !defined(LIBXSMM_NO_SYNC)
                  0 == pthread_setspecific(internal_trace_key, buffer) &&
#   endif
                     (sizeof(int) * 1) == read(fd, &check, sizeof(int))
                  && (sizeof(int) * 2) == lseek(fd, sizeof(int), SEEK_CUR)
                  && check == fd)
                {
                  abs_tid = LIBXSMM_ATOMIC_ADD_FETCH(&internal_trace_initialized, 1, LIBXSMM_ATOMIC_RELAXED);
                  assert(0 < abs_tid);
                  /* use sign bit to flag enabled fall-back for symbol resolution */
                  ivalue[1] = -abs_tid;

                  if (0 > filter || filter == abs_tid - 1) {
                    value = buffer + sizeof(int) * 2;
                  }
                }
                else {
#   if !defined(NDEBUG) /* library code is expected to be mute */
                  fprintf(stderr, "LIBXSMM ERROR: failed to setup buffer\n");
#   endif
                  /* NOTE(review): the buffer may still be registered with
                   * internal_trace_key at this point - confirm that internal_delete
                   * accounts for this.
                   */
                  internal_delete(buffer);
                }
              }
              else {
#   if !defined(NDEBUG)
                const int error = errno;
                fprintf(stderr, "LIBXSMM ERROR: %s (mmap allocation error #%i)\n",
                  strerror(error), error);
#   endif
                /* no buffer refers to the descriptor: do not leak it */
                close(fd);
              }
            }
            else {
#   if !defined(NDEBUG) /* library code is expected to be mute */
              fprintf(stderr, "LIBXSMM ERROR: failed to setup file descriptor (%i)\n", fd);
#   endif
              /* the file was created but could not be sized: do not leak the descriptor */
              if (0 <= fd) close(fd);
            }
          }

          if (value) {
            char* lparen;
            /* writes the text into the file, which is mapped at "value" */
            backtrace_symbols_fd(symbol, 1, fd);

            /* Attempt to parse symbol name, i.e., the token which follows the first
             * opening parenthesis (at least one character must precede the parenthesis).
             * The token is moved to the begin of the buffer using memmove since source
             * and destination overlap (which is undefined behavior when using sscanf).
             */
            lparen = strchr(value, '(');
            if (0 != lparen && value < lparen) {
              static const char whitespace[] = " \t\n\v\f\r";
              const char* token = lparen + 1;
              size_t length;
              token += strspn(token, whitespace);
              length = strcspn(token, whitespace);
              if (0 < length) {
                char* c;
                memmove(value, token, length);
                value[length] = 0;
                /* the name ends where the offset starts */
                for (c = value; '+' != *c && *c; ++c);
                if ('+' == *c) {
                  /* disable fall-back allowing unresolved symbol names */
                  ivalue[1] = abs_tid; /* make unsigned */
                  fname = value;
                  *c = 0;
                }
              }
            }

            /* fall-back to symbol address */
            if (0 > ivalue[1] && 0 == fname) {
              sprintf(value, "0x%llx", (unsigned long long)*symbol);
              fname = value;
            }

            if (depth) *depth = i - min_n;
            if (threadid) *threadid = abs_tid - 1;
          }
# endif
        }
      }
    }

    --cerberus;
  }
#else
  LIBXSMM_UNUSED(depth); LIBXSMM_UNUSED(threadid);
  LIBXSMM_UNUSED(filter_threadid);
  LIBXSMM_UNUSED(filter_mindepth);
  LIBXSMM_UNUSED(filter_maxnsyms);
#endif

  return fname;
}