Пример #1
0
void libxsmm_init(void)
{
  const void *const registry = LIBXSMM_ATOMIC_LOAD(&internal_registry, LIBXSMM_ATOMIC_RELAXED);
  if (0 == registry) {
    internal_init();
  }
}
Пример #2
0
void libxsmm_finalize(void)
{
  internal_code_type* registry = LIBXSMM_ATOMIC_LOAD(&internal_registry, LIBXSMM_ATOMIC_SEQ_CST);

  if (0 != registry) {
    int i;
#if !defined(LIBXSMM_OPENMP)
# if !defined(LIBXSMM_NOSYNC)
    /* acquire locks and thereby shortcut lazy initialization later on */
    for (i = 0; i < INTERNAL_REGLOCK_COUNT; ++i) LIBXSMM_LOCK_ACQUIRE(internal_reglock + i);
# endif
#else
#   pragma omp critical(internal_reglock)
#endif
    {
      registry = internal_registry;

      if (0 != registry) {
        internal_regkey_type *const registry_keys = internal_registry_keys;
        const char *const target_arch = internal_get_target_arch(internal_target_archid);
        unsigned int heapmem = (LIBXSMM_REGSIZE) * (sizeof(internal_code_type) + sizeof(internal_regkey_type));

        /* serves as an id to invalidate the thread-local cache; never decremented */
        ++internal_teardown;
#if defined(__TRACE)
        i = libxsmm_trace_finalize();
# if !defined(NDEBUG) /* library code is expected to be mute */
        if (EXIT_SUCCESS != i) {
          fprintf(stderr, "LIBXSMM: failed to finalize trace (error #%i)!\n", i);
        }
# endif
#endif
        libxsmm_gemm_finalize();
        libxsmm_gemm_diff_finalize();
        libxsmm_hash_finalize();

        /* make internal registry globally unavailable */
        LIBXSMM_ATOMIC_STORE_ZERO(&internal_registry, LIBXSMM_ATOMIC_SEQ_CST);
        internal_registry_keys = 0;

        for (i = 0; i < LIBXSMM_REGSIZE; ++i) {
          internal_code_type code = registry[i];
          if (0 != code.pmm) {
            const libxsmm_gemm_descriptor *const desc = &registry_keys[i].descriptor;
            const unsigned long long kernel_size = LIBXSMM_MNK_SIZE(desc->m, desc->n, desc->k);
            const int precision = (0 == (LIBXSMM_GEMM_FLAG_F32PREC & desc->flags) ? 0 : 1);
            const unsigned int statistic_sml = internal_statistic_sml;
            int bucket = 2;
            assert((LIBXSMM_HASH_COLLISION | LIBXSMM_CODE_STATIC) != code.imm);
            if (LIBXSMM_MNK_SIZE(statistic_sml, statistic_sml, statistic_sml) >= kernel_size) {
              bucket = 0;
            }
            else {
              const unsigned int statistic_med = internal_statistic_med;
              if (LIBXSMM_MNK_SIZE(statistic_med, statistic_med, statistic_med) >= kernel_size) {
                bucket = 1;
              }
            }
            if (0 == (LIBXSMM_CODE_STATIC & code.imm)) { /* check for allocated/generated JIT-code */
              void* buffer = 0;
              size_t size = 0;
              code.imm &= ~LIBXSMM_HASH_COLLISION; /* clear collision flag */
              if (EXIT_SUCCESS == libxsmm_alloc_info(code.pmm, &size, 0/*flags*/, &buffer)) {
                libxsmm_deallocate(code.pmm);
                ++internal_statistic[precision][bucket].njit;
                heapmem += (unsigned int)(size + (((char*)code.pmm) - (char*)buffer));
              }
            }
            else {
              ++internal_statistic[precision][bucket].nsta;
            }
          }
        }
        if (0 != internal_verbose_mode) { /* print statistic on termination */
          LIBXSMM_FLOCK(stderr);
          LIBXSMM_FLOCK(stdout);
          fflush(stdout); /* synchronize with standard output */
          {
            const unsigned int linebreak = 0 == internal_print_statistic(stderr, target_arch, 1/*SP*/, 1, 0) ? 1 : 0;
            if (0 == internal_print_statistic(stderr, target_arch, 0/*DP*/, linebreak, 0) && 0 != linebreak) {
              fprintf(stderr, "LIBXSMM_TARGET=%s ", target_arch);
            }
            fprintf(stderr, "HEAP: %.f MB\n", 1.0 * heapmem / (1 << 20));
          }
          LIBXSMM_FUNLOCK(stdout);
          LIBXSMM_FUNLOCK(stderr);
        }
        libxsmm_free(registry_keys);
        libxsmm_free(registry);
      }
    }
#if !defined(LIBXSMM_OPENMP) && !defined(LIBXSMM_NOSYNC) /* release locks */
    for (i = 0; i < INTERNAL_REGLOCK_COUNT; ++i) LIBXSMM_LOCK_RELEASE(internal_reglock + i);
#endif
  }
}
Пример #3
0
LIBXSMM_INLINE LIBXSMM_RETARGETABLE internal_code_type* internal_init(void)
{
  /*const*/internal_code_type* result;
  int i;
#if !defined(LIBXSMM_OPENMP)
# if !defined(LIBXSMM_NOSYNC)
  static int internal_reglock_check = 1; /* setup the locks in a thread-safe fashion */
  assert(sizeof(internal_reglock) == (INTERNAL_REGLOCK_COUNT * sizeof(*internal_reglock)));
  if (1 == LIBXSMM_ATOMIC_LOAD(&internal_reglock_check, LIBXSMM_ATOMIC_SEQ_CST)) {
    LIBXSMM_ATOMIC_ADD_FETCH(&internal_reglock_check, 1, LIBXSMM_ATOMIC_SEQ_CST);
    if (2 == internal_reglock_check) {
      for (i = 0; i < INTERNAL_REGLOCK_COUNT; ++i) LIBXSMM_LOCK_INIT(internal_reglock + i);
      LIBXSMM_ATOMIC_STORE_ZERO(&internal_reglock_check, LIBXSMM_ATOMIC_SEQ_CST);
    }
  }
  while (0 != internal_reglock_check); /* wait until locks are initialized */
  for (i = 0; i < INTERNAL_REGLOCK_COUNT; ++i) LIBXSMM_LOCK_ACQUIRE(internal_reglock + i);
# endif
#else
# pragma omp critical(internal_reglock)
#endif
  {
    result = LIBXSMM_ATOMIC_LOAD(&internal_registry, LIBXSMM_ATOMIC_SEQ_CST);
    if (0 == result) {
      int init_code;
      /* set internal_target_archid */
      libxsmm_set_target_arch(getenv("LIBXSMM_TARGET"));
      { /* select prefetch strategy for JIT */
        const char *const env_prefetch = getenv("LIBXSMM_PREFETCH");
        if (0 == env_prefetch || 0 == *env_prefetch) {
#if (0 > LIBXSMM_PREFETCH) /* permitted by LIBXSMM_PREFETCH_AUTO */
          internal_prefetch = (LIBXSMM_X86_AVX512_MIC != internal_target_archid
            ? LIBXSMM_PREFETCH_NONE : LIBXSMM_PREFETCH_AL2BL2_VIA_C);
#else
          internal_prefetch = LIBXSMM_MAX(INTERNAL_PREFETCH, 0);
#endif
        }
        else { /* user input considered even if LIBXSMM_PREFETCH_AUTO is disabled */
          switch (atoi(env_prefetch)) {
            case 2:  internal_prefetch = LIBXSMM_PREFETCH_SIGONLY; break;
            case 3:  internal_prefetch = LIBXSMM_PREFETCH_BL2_VIA_C; break;
            case 4:  internal_prefetch = LIBXSMM_PREFETCH_AL2; break;
            case 5:  internal_prefetch = LIBXSMM_PREFETCH_AL2_AHEAD; break;
            case 6:  internal_prefetch = LIBXSMM_PREFETCH_AL2BL2_VIA_C; break;
            case 7:  internal_prefetch = LIBXSMM_PREFETCH_AL2BL2_VIA_C_AHEAD; break;
            case 8:  internal_prefetch = LIBXSMM_PREFETCH_AL2_JPST; break;
            case 9:  internal_prefetch = LIBXSMM_PREFETCH_AL2BL2_VIA_C_JPST; break;
            default: internal_prefetch = LIBXSMM_PREFETCH_NONE;
          }
        }
      }
      libxsmm_hash_init(internal_target_archid);
      libxsmm_gemm_diff_init(internal_target_archid);
      init_code = libxsmm_gemm_init(internal_target_archid, internal_prefetch);
#if defined(__TRACE)
      {
        int filter_threadid = 0, filter_mindepth = 1, filter_maxnsyms = 0;
        const char *const env_trace_init = getenv("LIBXSMM_TRACE");
        if (EXIT_SUCCESS == init_code && 0 != env_trace_init && 0 != *env_trace_init) {
          char buffer[32];
          if (1 == sscanf(env_trace_init, "%32[^,],", buffer)) {
            sscanf(buffer, "%i", &filter_threadid);
          }
          if (1 == sscanf(env_trace_init, "%*[^,],%32[^,],", buffer)) {
            sscanf(buffer, "%i", &filter_mindepth);
          }
          if (1 == sscanf(env_trace_init, "%*[^,],%*[^,],%32s", buffer)) {
            sscanf(buffer, "%i", &filter_maxnsyms);
          }
          else {
            filter_maxnsyms = -1; /* all */
          }
        }
        init_code = libxsmm_trace_init(filter_threadid - 1, filter_mindepth, filter_maxnsyms);
      }
#endif
      if (EXIT_SUCCESS == init_code) {
        assert(0 == internal_registry_keys && 0 == internal_registry); /* should never happen */
        result = (internal_code_type*)libxsmm_malloc(LIBXSMM_REGSIZE * sizeof(internal_code_type));
        internal_registry_keys = (internal_regkey_type*)libxsmm_malloc(LIBXSMM_REGSIZE * sizeof(internal_regkey_type));
        if (0 != result && 0 != internal_registry_keys) {
          const char *const env_verbose = getenv("LIBXSMM_VERBOSE");
          internal_statistic_mnk = (unsigned int)(pow((double)(LIBXSMM_MAX_MNK), 0.3333333333333333) + 0.5);
          internal_statistic_sml = 13; internal_statistic_med = 23;
          if (0 != env_verbose && 0 != *env_verbose) {
            internal_verbose_mode = atoi(env_verbose);
          }
#if !defined(NDEBUG)
          else {
            internal_verbose_mode = 1; /* quiet -> verbose */
          }
#endif
          for (i = 0; i < LIBXSMM_REGSIZE; ++i) result[i].pmm = 0;
          /* omit registering code if JIT is enabled and if an ISA extension is found
           * which is beyond the static code path used to compile the library
           */
#if defined(LIBXSMM_BUILD)
# if (0 != LIBXSMM_JIT) && !defined(__MIC__)
          /* check if target arch. permits execution (arch. may be overridden) */
          if (LIBXSMM_STATIC_TARGET_ARCH <= internal_target_archid &&
             (LIBXSMM_X86_AVX > internal_target_archid /* jit is not available */
              /* condition allows to avoid JIT (if static code is good enough) */
           || LIBXSMM_STATIC_TARGET_ARCH == internal_target_archid))
# endif
          { /* opening a scope for eventually declaring variables */
            /* setup the dispatch table for the statically generated code */
#           include <libxsmm_dispatch.h>
          }
#endif
          atexit(libxsmm_finalize);
          LIBXSMM_ATOMIC_STORE(&internal_registry, result, LIBXSMM_ATOMIC_SEQ_CST);
        }
        else {
#if !defined(NDEBUG) && defined(__TRACE) /* library code is expected to be mute */
          fprintf(stderr, "LIBXSMM: failed to allocate code registry!\n");
#endif
          libxsmm_free(internal_registry_keys);
          libxsmm_free(result);
        }
      }
#if !defined(NDEBUG) && defined(__TRACE) /* library code is expected to be mute */
      else {
        fprintf(stderr, "LIBXSMM: failed to initialize sub-component (error #%i)!\n", init_code);
      }
#endif
    }
  }
#if !defined(LIBXSMM_OPENMP) && !defined(LIBXSMM_NOSYNC) /* release locks */
  for (i = 0; i < INTERNAL_REGLOCK_COUNT; ++i) LIBXSMM_LOCK_RELEASE(internal_reglock + i);
#endif
  assert(result);
  return result;
}
Пример #4
0
LIBXSMM_API
#if defined(_WIN32)
/*TODO: no inline*/
#elif defined(__GNUC__)
/*LIBXSMM_ATTRIBUTE(noinline)*/
#endif
const char* libxsmm_trace_info(unsigned int* depth, unsigned int* threadid, const int* filter_threadid, const int* filter_mindepth, const int* filter_maxnsyms)
{
  const char *fname = NULL;
#if defined(LIBXSMM_TRACE)
  const int max_n = (0 != depth ? (LIBXSMM_TRACE_MAXDEPTH) : 2);
  const int min_n = (0 != depth ? (LIBXSMM_TRACE_MINDEPTH + *depth) : 2);
  void *stacktrace[LIBXSMM_TRACE_MAXDEPTH], **symbol = stacktrace + LIBXSMM_MIN(0 != depth ? ((int)(*depth + 1)) : 1, max_n - 1);
  static LIBXSMM_TLS int cerberus = 0;
  int i;

  /* check against entering a recursion (recursion should not happen due to
   * attribute "no_instrument_function" but better prevent this in any case)
   */
  if (0 == cerberus) {
    ++cerberus;
# if defined(__GNUC__)
    __asm__("");
# endif
    i = LIBXSMM_ATOMIC_LOAD(&internal_trace_initialized, LIBXSMM_ATOMIC_RELAXED);
    if (0 <= i) { /* do nothing if not yet initialized */
      const int mindepth = (0 != filter_mindepth ? *filter_mindepth : internal_trace_mindepth);
      const int maxnsyms = (0 != filter_maxnsyms ? *filter_maxnsyms : internal_trace_maxnsyms);
      i = libxsmm_backtrace(stacktrace, max_n);
      /* filter depth against filter_mindepth and filter_maxnsyms */
      if ((0 >= mindepth ||      (min_n + mindepth) <= i) &&
          (0 >  maxnsyms || i <= (min_n + mindepth + maxnsyms - 1)))
      {
        if (min_n <= i) { /* check against min. depth */
          const int filter = (0 != filter_threadid ? *filter_threadid : internal_trace_threadid);
          int abs_tid = 0;
# if defined(_WIN32) || defined(__CYGWIN__)
          static LIBXSMM_TLS char buffer[sizeof(SYMBOL_INFO)+LIBXSMM_TRACE_SYMBOLSIZE];
          static LIBXSMM_TLS int tid = 0;

          PSYMBOL_INFO value = (PSYMBOL_INFO)buffer;
          value->SizeOfStruct = sizeof(SYMBOL_INFO);
          value->MaxNameLen = LIBXSMM_TRACE_SYMBOLSIZE - 1;

          if (0 != tid) {
            abs_tid = (0 <= tid ? tid : -tid);
          }
          else {
            abs_tid = LIBXSMM_ATOMIC_ADD_FETCH(&internal_trace_initialized, 1, LIBXSMM_ATOMIC_RELAXED);
            /* use sign bit to flag enabled fall-back for symbol resolution */
            tid = -abs_tid;
          }

          assert(0 < abs_tid);
          if (0 > filter || filter == abs_tid - 1) {
            if (FALSE != SymFromAddr(GetCurrentProcess(), (DWORD64)*symbol, NULL, value)
              && 0 < value->NameLen)
            {
              /* disable fall-back allowing unresolved symbol names */
              tid = abs_tid; /* make unsigned */
              fname = value->Name;
            }
            else if (0 > tid) { /* fall-back allowing unresolved symbol names */
#   if defined(__MINGW32__)
              sprintf(buffer, "%p", *symbol);
#   else
              sprintf(buffer, "0x%" PRIxPTR, (uintptr_t)*symbol);
#   endif
              fname = buffer;
            }
            if (depth) *depth = i - min_n;
            if (threadid) *threadid = abs_tid - 1;
          }
# else
#   if defined(LIBXSMM_NO_SYNC)
          static char raw_c;
          char */*const*/ raw_value = &raw_c; /* const: avoid warning (below / constant control-flow) */
#   else
          char *const raw_value = (char*)pthread_getspecific(internal_trace_key);
#   endif
          int* ivalue = 0, fd = -1;
          char* value = 0;

          if (raw_value) {
            ivalue = (int*)raw_value;
            abs_tid = (0 <= ivalue[1] ? ivalue[1] : -ivalue[1]);

            if (0 > filter || filter == abs_tid - 1) {
              fd = ivalue[0];
              if (0 <= fd && (sizeof(int) * 2) == lseek(fd, sizeof(int) * 2, SEEK_SET)) {
                value = raw_value + sizeof(int) * 2;
              }
#   if !defined(NDEBUG) /* library code is expected to be mute */
              else {
                fprintf(stderr, "LIBXSMM ERROR: failed to get buffer\n");
              }
#   endif
            }
          }
          else {
            char filename[] = "/tmp/.libxsmm_XXXXXX.map";
#if defined(__GLIBC__) && defined(__GLIBC_MINOR__) && LIBXSMM_VERSION2(2, 19) <= LIBXSMM_VERSION2(__GLIBC__, __GLIBC_MINOR__)
            fd = mkstemps(filename, 4/*.map*/);
#else
            char *const xpos = strrchr(filename, 'X');
            const char c = (char)(NULL != xpos ? *(xpos + 1) : 0);
            if (0 != c) {
              xpos[1] = 0;
              fd = mkstemp(filename);
              xpos[1] = c;
            }
            else {
              fd = -1;
            }
#endif
            if (0 <= fd && 0 == posix_fallocate(fd, 0, LIBXSMM_TRACE_SYMBOLSIZE)) {
              char *const buffer = (char*)mmap(NULL, LIBXSMM_TRACE_SYMBOLSIZE,
                PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);

              if (MAP_FAILED != buffer) {
                int check = -1;
                ivalue = (int*)buffer;
                ivalue[0] = fd; /* valid file descriptor for internal_delete */

                if (
#   if !defined(LIBXSMM_NO_SYNC)
                  0 == pthread_setspecific(internal_trace_key, buffer) &&
#   endif
                     (sizeof(int) * 1) == read(fd, &check, sizeof(int))
                  && (sizeof(int) * 2) == lseek(fd, sizeof(int), SEEK_CUR)
                  && check == fd)
                {
                  abs_tid = LIBXSMM_ATOMIC_ADD_FETCH(&internal_trace_initialized, 1, LIBXSMM_ATOMIC_RELAXED);
                  assert(0 < abs_tid);
                  /* use sign bit to flag enabled fall-back for symbol resolution */
                  ivalue[1] = -abs_tid;

                  if (0 > filter || filter == abs_tid - 1) {
                    value = buffer + sizeof(int) * 2;
                  }
                }
                else {
#   if !defined(NDEBUG) /* library code is expected to be mute */
                  fprintf(stderr, "LIBXSMM ERROR: failed to setup buffer\n");
#   endif
                  internal_delete(buffer);
                }
              }
#   if !defined(NDEBUG)
              else {
                const int error = errno;
                fprintf(stderr, "LIBXSMM ERROR: %s (mmap allocation error #%i)\n",
                  strerror(error), error);
              }
#   endif
            }
#   if !defined(NDEBUG) /* library code is expected to be mute */
            else {
              fprintf(stderr, "LIBXSMM ERROR: failed to setup file descriptor (%i)\n", fd);
            }
#   endif
          }

          if (value) {
            backtrace_symbols_fd(symbol, 1, fd);

            /* attempt to parse symbol name */
            if (1 == sscanf(value, "%*[^(](%s0x", value)) {
              char* c;
              for (c = value; '+' != *c && *c; ++c);
              if ('+' == *c) {
                /* disable fall-back allowing unresolved symbol names */
                ivalue[1] = abs_tid; /* make unsigned */
                fname = value;
                *c = 0;
              }
            }

            /* fall-back to symbol address */
            if (0 > ivalue[1] && 0 == fname) {
              sprintf(value, "0x%llx", (unsigned long long)*symbol);
              fname = value;
            }

            if (depth) *depth = i - min_n;
            if (threadid) *threadid = abs_tid - 1;
          }
# endif
        }
      }
    }

    --cerberus;
  }
#else
  LIBXSMM_UNUSED(depth); LIBXSMM_UNUSED(threadid);
  LIBXSMM_UNUSED(filter_threadid);
  LIBXSMM_UNUSED(filter_mindepth);
  LIBXSMM_UNUSED(filter_maxnsyms);
#endif

  return fname;
}