/**
 * Public entry point for (lazy) library initialization.
 * Calls internal_init() only while no code registry is published yet;
 * otherwise this function does nothing.
 */
void libxsmm_init(void)
{
  /* relaxed peek at the published registry pointer */
  if (0 == LIBXSMM_ATOMIC_LOAD(&internal_registry, LIBXSMM_ATOMIC_RELAXED)) {
    internal_init();
  }
}
void libxsmm_finalize(void) { internal_code_type* registry = LIBXSMM_ATOMIC_LOAD(&internal_registry, LIBXSMM_ATOMIC_SEQ_CST); if (0 != registry) { int i; #if !defined(LIBXSMM_OPENMP) # if !defined(LIBXSMM_NOSYNC) /* acquire locks and thereby shortcut lazy initialization later on */ for (i = 0; i < INTERNAL_REGLOCK_COUNT; ++i) LIBXSMM_LOCK_ACQUIRE(internal_reglock + i); # endif #else # pragma omp critical(internal_reglock) #endif { registry = internal_registry; if (0 != registry) { internal_regkey_type *const registry_keys = internal_registry_keys; const char *const target_arch = internal_get_target_arch(internal_target_archid); unsigned int heapmem = (LIBXSMM_REGSIZE) * (sizeof(internal_code_type) + sizeof(internal_regkey_type)); /* serves as an id to invalidate the thread-local cache; never decremented */ ++internal_teardown; #if defined(__TRACE) i = libxsmm_trace_finalize(); # if !defined(NDEBUG) /* library code is expected to be mute */ if (EXIT_SUCCESS != i) { fprintf(stderr, "LIBXSMM: failed to finalize trace (error #%i)!\n", i); } # endif #endif libxsmm_gemm_finalize(); libxsmm_gemm_diff_finalize(); libxsmm_hash_finalize(); /* make internal registry globally unavailable */ LIBXSMM_ATOMIC_STORE_ZERO(&internal_registry, LIBXSMM_ATOMIC_SEQ_CST); internal_registry_keys = 0; for (i = 0; i < LIBXSMM_REGSIZE; ++i) { internal_code_type code = registry[i]; if (0 != code.pmm) { const libxsmm_gemm_descriptor *const desc = ®istry_keys[i].descriptor; const unsigned long long kernel_size = LIBXSMM_MNK_SIZE(desc->m, desc->n, desc->k); const int precision = (0 == (LIBXSMM_GEMM_FLAG_F32PREC & desc->flags) ? 
0 : 1); const unsigned int statistic_sml = internal_statistic_sml; int bucket = 2; assert((LIBXSMM_HASH_COLLISION | LIBXSMM_CODE_STATIC) != code.imm); if (LIBXSMM_MNK_SIZE(statistic_sml, statistic_sml, statistic_sml) >= kernel_size) { bucket = 0; } else { const unsigned int statistic_med = internal_statistic_med; if (LIBXSMM_MNK_SIZE(statistic_med, statistic_med, statistic_med) >= kernel_size) { bucket = 1; } } if (0 == (LIBXSMM_CODE_STATIC & code.imm)) { /* check for allocated/generated JIT-code */ void* buffer = 0; size_t size = 0; code.imm &= ~LIBXSMM_HASH_COLLISION; /* clear collision flag */ if (EXIT_SUCCESS == libxsmm_alloc_info(code.pmm, &size, 0/*flags*/, &buffer)) { libxsmm_deallocate(code.pmm); ++internal_statistic[precision][bucket].njit; heapmem += (unsigned int)(size + (((char*)code.pmm) - (char*)buffer)); } } else { ++internal_statistic[precision][bucket].nsta; } } } if (0 != internal_verbose_mode) { /* print statistic on termination */ LIBXSMM_FLOCK(stderr); LIBXSMM_FLOCK(stdout); fflush(stdout); /* synchronize with standard output */ { const unsigned int linebreak = 0 == internal_print_statistic(stderr, target_arch, 1/*SP*/, 1, 0) ? 1 : 0; if (0 == internal_print_statistic(stderr, target_arch, 0/*DP*/, linebreak, 0) && 0 != linebreak) { fprintf(stderr, "LIBXSMM_TARGET=%s ", target_arch); } fprintf(stderr, "HEAP: %.f MB\n", 1.0 * heapmem / (1 << 20)); } LIBXSMM_FUNLOCK(stdout); LIBXSMM_FUNLOCK(stderr); } libxsmm_free(registry_keys); libxsmm_free(registry); } } #if !defined(LIBXSMM_OPENMP) && !defined(LIBXSMM_NOSYNC) /* release locks */ for (i = 0; i < INTERNAL_REGLOCK_COUNT; ++i) LIBXSMM_LOCK_RELEASE(internal_reglock + i); #endif } }
/**
 * Performs the one-time library initialization and publishes the code registry.
 *
 * Under the registry locks (or the OpenMP critical section when LIBXSMM_OPENMP
 * is defined) this function re-checks whether a registry is already published.
 * If not, it:
 *  - selects the target architecture (LIBXSMM_TARGET) and prefetch strategy (LIBXSMM_PREFETCH),
 *  - initializes the hash, gemm_diff and gemm components (and tracing if __TRACE is defined),
 *  - allocates and clears the registry and the key array,
 *  - reads LIBXSMM_VERBOSE, optionally registers statically generated code,
 *  - registers libxsmm_finalize via atexit and publishes the registry.
 *
 * @return the published registry, or 0 if a sub-component failed to initialize
 *         or the registry could not be allocated (debug builds assert on 0).
 */
LIBXSMM_INLINE LIBXSMM_RETARGETABLE internal_code_type* internal_init(void)
{
  /*const*/internal_code_type* result;
  int i;

#if !defined(LIBXSMM_OPENMP)
# if !defined(LIBXSMM_NOSYNC)
  /* state of the lock setup: 1 (pending), >=2 (in progress), 0 (locks are ready) */
  static int internal_reglock_check = 1; /* setup the locks in a thread-safe fashion */
  assert(sizeof(internal_reglock) == (INTERNAL_REGLOCK_COUNT * sizeof(*internal_reglock)));
  if (1 == LIBXSMM_ATOMIC_LOAD(&internal_reglock_check, LIBXSMM_ATOMIC_SEQ_CST)) {
    /* Rely on the value returned by the atomic increment: re-reading the counter
     * afterwards races with other threads incrementing it as well, in which case
     * no thread would observe the value 2 and every thread would spin forever. */
    if (2 == LIBXSMM_ATOMIC_ADD_FETCH(&internal_reglock_check, 1, LIBXSMM_ATOMIC_SEQ_CST)) {
      for (i = 0; i < INTERNAL_REGLOCK_COUNT; ++i) LIBXSMM_LOCK_INIT(internal_reglock + i);
      LIBXSMM_ATOMIC_STORE_ZERO(&internal_reglock_check, LIBXSMM_ATOMIC_SEQ_CST);
    }
  }
  /* wait until locks are initialized (atomic load prevents hoisting the read out of the loop) */
  while (0 != LIBXSMM_ATOMIC_LOAD(&internal_reglock_check, LIBXSMM_ATOMIC_SEQ_CST));
  for (i = 0; i < INTERNAL_REGLOCK_COUNT; ++i) LIBXSMM_LOCK_ACQUIRE(internal_reglock + i);
# endif
#else
#   pragma omp critical(internal_reglock)
#endif
  {
    result = LIBXSMM_ATOMIC_LOAD(&internal_registry, LIBXSMM_ATOMIC_SEQ_CST);

    if (0 == result) {
      int init_code;

      /* set internal_target_archid */
      libxsmm_set_target_arch(getenv("LIBXSMM_TARGET"));

      { /* select prefetch strategy for JIT */
        const char *const env_prefetch = getenv("LIBXSMM_PREFETCH");
        if (0 == env_prefetch || 0 == *env_prefetch) {
#if (0 > LIBXSMM_PREFETCH) /* permitted by LIBXSMM_PREFETCH_AUTO */
          internal_prefetch = (LIBXSMM_X86_AVX512_MIC != internal_target_archid
            ? LIBXSMM_PREFETCH_NONE : LIBXSMM_PREFETCH_AL2BL2_VIA_C);
#else
          internal_prefetch = LIBXSMM_MAX(INTERNAL_PREFETCH, 0);
#endif
        }
        else { /* user input considered even if LIBXSMM_PREFETCH_AUTO is disabled */
          switch (atoi(env_prefetch)) {
            case 2: internal_prefetch = LIBXSMM_PREFETCH_SIGONLY; break;
            case 3: internal_prefetch = LIBXSMM_PREFETCH_BL2_VIA_C; break;
            case 4: internal_prefetch = LIBXSMM_PREFETCH_AL2; break;
            case 5: internal_prefetch = LIBXSMM_PREFETCH_AL2_AHEAD; break;
            case 6: internal_prefetch = LIBXSMM_PREFETCH_AL2BL2_VIA_C; break;
            case 7: internal_prefetch = LIBXSMM_PREFETCH_AL2BL2_VIA_C_AHEAD; break;
            case 8: internal_prefetch = LIBXSMM_PREFETCH_AL2_JPST; break;
            case 9: internal_prefetch = LIBXSMM_PREFETCH_AL2BL2_VIA_C_JPST; break;
            default: internal_prefetch = LIBXSMM_PREFETCH_NONE;
          }
        }
      }

      libxsmm_hash_init(internal_target_archid);
      libxsmm_gemm_diff_init(internal_target_archid);
      init_code = libxsmm_gemm_init(internal_target_archid, internal_prefetch);
#if defined(__TRACE)
      { /* LIBXSMM_TRACE is parsed as up to three comma-separated integers */
        int filter_threadid = 0, filter_mindepth = 1, filter_maxnsyms = 0;
        const char *const env_trace_init = getenv("LIBXSMM_TRACE");
        if (EXIT_SUCCESS == init_code && 0 != env_trace_init && 0 != *env_trace_init) {
          /* field width is 31: the conversion appends a terminating null character */
          char buffer[32];
          if (1 == sscanf(env_trace_init, "%31[^,],", buffer)) {
            sscanf(buffer, "%i", &filter_threadid);
          }
          if (1 == sscanf(env_trace_init, "%*[^,],%31[^,],", buffer)) {
            sscanf(buffer, "%i", &filter_mindepth);
          }
          if (1 == sscanf(env_trace_init, "%*[^,],%*[^,],%31s", buffer)) {
            sscanf(buffer, "%i", &filter_maxnsyms);
          }
          else {
            filter_maxnsyms = -1; /* all */
          }
        }
        /* NOTE(review): init_code is overwritten here even if libxsmm_gemm_init failed - confirm intended */
        init_code = libxsmm_trace_init(filter_threadid - 1, filter_mindepth, filter_maxnsyms);
      }
#endif
      if (EXIT_SUCCESS == init_code) {
        assert(0 == internal_registry_keys && 0 == internal_registry); /* should never happen */
        result = (internal_code_type*)libxsmm_malloc(LIBXSMM_REGSIZE * sizeof(internal_code_type));
        internal_registry_keys = (internal_regkey_type*)libxsmm_malloc(LIBXSMM_REGSIZE * sizeof(internal_regkey_type));

        if (0 != result && 0 != internal_registry_keys) {
          const char *const env_verbose = getenv("LIBXSMM_VERBOSE");
          /* cube root of LIBXSMM_MAX_MNK, rounded to nearest */
          internal_statistic_mnk = (unsigned int)(pow((double)(LIBXSMM_MAX_MNK), 0.3333333333333333) + 0.5);
          internal_statistic_sml = 13;
          internal_statistic_med = 23;
          if (0 != env_verbose && 0 != *env_verbose) {
            internal_verbose_mode = atoi(env_verbose);
          }
#if !defined(NDEBUG)
          else {
            internal_verbose_mode = 1; /* quiet -> verbose */
          }
#endif
          /* mark every registry slot as empty */
          for (i = 0; i < LIBXSMM_REGSIZE; ++i) result[i].pmm = 0;
          /* omit registering code if JIT is enabled and if an ISA extension is found
           * which is beyond the static code path used to compile the library */
#if defined(LIBXSMM_BUILD)
# if (0 != LIBXSMM_JIT) && !defined(__MIC__)
          /* check if target arch. permits execution (arch. may be overridden) */
          if (LIBXSMM_STATIC_TARGET_ARCH <= internal_target_archid &&
             (LIBXSMM_X86_AVX > internal_target_archid /* jit is not available */
              /* condition allows to avoid JIT (if static code is good enough) */
           || LIBXSMM_STATIC_TARGET_ARCH == internal_target_archid))
# endif
          { /* opening a scope for eventually declaring variables */
            /* setup the dispatch table for the statically generated code */
#           include <libxsmm_dispatch.h>
          }
#endif
          atexit(libxsmm_finalize);
          /* publish the registry only after it is fully set up */
          LIBXSMM_ATOMIC_STORE(&internal_registry, result, LIBXSMM_ATOMIC_SEQ_CST);
        }
        else {
#if !defined(NDEBUG) && defined(__TRACE) /* library code is expected to be mute */
          fprintf(stderr, "LIBXSMM: failed to allocate code registry!\n");
#endif
          libxsmm_free(internal_registry_keys);
          libxsmm_free(result);
          /* do not leave (or return) pointers to released memory */
          internal_registry_keys = 0;
          result = 0;
        }
      }
#if !defined(NDEBUG) && defined(__TRACE) /* library code is expected to be mute */
      else {
        fprintf(stderr, "LIBXSMM: failed to initialize sub-component (error #%i)!\n", init_code);
      }
#endif
    }
  }
#if !defined(LIBXSMM_OPENMP) && !defined(LIBXSMM_NOSYNC) /* release locks */
  for (i = 0; i < INTERNAL_REGLOCK_COUNT; ++i) LIBXSMM_LOCK_RELEASE(internal_reglock + i);
#endif
  assert(result);
  return result;
}
/**
 * Resolves a name (or an address string) for a frame of the calling thread's stack.
 *
 * When LIBXSMM_TRACE is not defined, all arguments are ignored and NULL is returned.
 *
 * @param depth           optional in/out: on input, selects the stack frame (*depth + 1, limited to
 *                        max_n - 1) and raises the minimum required stack depth; on output, set to
 *                        the backtrace depth minus the minimum depth (only when a result is produced).
 * @param threadid        optional out: set to the zero-based id assigned to the calling thread
 *                        (only when a result is produced).
 * @param filter_threadid optional: overrides internal_trace_threadid; a negative value accepts any thread.
 * @param filter_mindepth optional: overrides internal_trace_mindepth.
 * @param filter_maxnsyms optional: overrides internal_trace_maxnsyms; a negative value disables the upper bound.
 * @return the symbol name, a textual address as fall-back, or NULL if nothing passed the filters
 *         or tracing is not initialized. The string lives in a buffer owned by this function
 *         (presumably per-thread; it is overwritten by subsequent calls).
 */
LIBXSMM_API
#if defined(_WIN32) /*TODO: no inline*/
#elif defined(__GNUC__)
/*LIBXSMM_ATTRIBUTE(noinline)*/
#endif
const char* libxsmm_trace_info(unsigned int* depth, unsigned int* threadid, const int* filter_threadid, const int* filter_mindepth, const int* filter_maxnsyms)
{
  const char *fname = NULL;
#if defined(LIBXSMM_TRACE)
  /* number of frames to capture, and the minimum number of frames required */
  const int max_n = (0 != depth ? (LIBXSMM_TRACE_MAXDEPTH) : 2);
  const int min_n = (0 != depth ? (LIBXSMM_TRACE_MINDEPTH + *depth) : 2);
  /* symbol points to the frame of interest inside the captured backtrace */
  void *stacktrace[LIBXSMM_TRACE_MAXDEPTH], **symbol = stacktrace + LIBXSMM_MIN(0 != depth ? ((int)(*depth + 1)) : 1, max_n - 1);
  static LIBXSMM_TLS int cerberus = 0;
  int i;

  /* check against entering a recursion (recursion should not happen due to
   * attribute "no_instrument_function" but better prevent this in any case) */
  if (0 == cerberus) {
    ++cerberus;
# if defined(__GNUC__)
    __asm__("");
# endif
    i = LIBXSMM_ATOMIC_LOAD(&internal_trace_initialized, LIBXSMM_ATOMIC_RELAXED);
    if (0 <= i) { /* do nothing if not yet initialized */
      const int mindepth = (0 != filter_mindepth ? *filter_mindepth : internal_trace_mindepth);
      const int maxnsyms = (0 != filter_maxnsyms ? *filter_maxnsyms : internal_trace_maxnsyms);
      /* from here on, i holds the number of captured stack frames */
      i = libxsmm_backtrace(stacktrace, max_n);
      /* filter depth against filter_mindepth and filter_maxnsyms */
      if ((0 >= mindepth || (min_n + mindepth) <= i) && (0 > maxnsyms || i <= (min_n + mindepth + maxnsyms - 1))) {
        if (min_n <= i) { /* check against min. depth */
          const int filter = (0 != filter_threadid ? *filter_threadid : internal_trace_threadid);
          /* one-based id of the calling thread; the sign of the stored id flags the fall-back mode */
          int abs_tid = 0;
# if defined(_WIN32) || defined(__CYGWIN__)
          static LIBXSMM_TLS char buffer[sizeof(SYMBOL_INFO)+LIBXSMM_TRACE_SYMBOLSIZE];
          static LIBXSMM_TLS int tid = 0;
          PSYMBOL_INFO value = (PSYMBOL_INFO)buffer;
          value->SizeOfStruct = sizeof(SYMBOL_INFO);
          value->MaxNameLen = LIBXSMM_TRACE_SYMBOLSIZE - 1;
          if (0 != tid) { /* id was already assigned by an earlier call */
            abs_tid = (0 <= tid ? tid : -tid);
          }
          else { /* first call: draw a new id from the shared counter */
            abs_tid = LIBXSMM_ATOMIC_ADD_FETCH(&internal_trace_initialized, 1, LIBXSMM_ATOMIC_RELAXED);
            /* use sign bit to flag enabled fall-back for symbol resolution */
            tid = -abs_tid;
          }
          assert(0 < abs_tid);
          if (0 > filter || filter == abs_tid - 1) {
            if (FALSE != SymFromAddr(GetCurrentProcess(), (DWORD64)*symbol, NULL, value) && 0 < value->NameLen) {
              /* disable fall-back allowing unresolved symbol names */
              tid = abs_tid; /* make unsigned */
              fname = value->Name;
            }
            else if (0 > tid) { /* fall-back allowing unresolved symbol names */
# if defined(__MINGW32__)
              sprintf(buffer, "%p", *symbol);
# else
              sprintf(buffer, "0x%" PRIxPTR, (uintptr_t)*symbol);
# endif
              fname = buffer;
            }
            if (depth) *depth = i - min_n;
            if (threadid) *threadid = abs_tid - 1;
          }
# else
# if defined(LIBXSMM_NO_SYNC)
          /* NOTE(review): raw_c is a single char, but raw_value is accessed below as int[2] plus text - confirm */
          static char raw_c;
          char */*const*/ raw_value = &raw_c; /* const: avoid warning (below / constant control-flow) */
# else
          char *const raw_value = (char*)pthread_getspecific(internal_trace_key);
# endif
          /* buffer layout as used below: ivalue[0] = file descriptor, ivalue[1] = signed thread id,
           * followed by the text area (value) which receives the symbol text */
          int* ivalue = 0, fd = -1;
          char* value = 0;
          if (raw_value) { /* buffer was already set up by an earlier call */
            ivalue = (int*)raw_value;
            abs_tid = (0 <= ivalue[1] ? ivalue[1] : -ivalue[1]);
            if (0 > filter || filter == abs_tid - 1) {
              fd = ivalue[0];
              /* rewind the file position to the start of the text area */
              if (0 <= fd && (sizeof(int) * 2) == lseek(fd, sizeof(int) * 2, SEEK_SET)) {
                value = raw_value + sizeof(int) * 2;
              }
# if !defined(NDEBUG) /* library code is expected to be mute */
              else {
                fprintf(stderr, "LIBXSMM ERROR: failed to get buffer\n");
              }
# endif
            }
          }
          else { /* first call: create a temporary file and map it as the buffer */
            char filename[] = "/tmp/.libxsmm_XXXXXX.map";
#if defined(__GLIBC__) && defined(__GLIBC_MINOR__) && LIBXSMM_VERSION2(2, 19) <= LIBXSMM_VERSION2(__GLIBC__, __GLIBC_MINOR__)
            fd = mkstemps(filename, 4/*.map*/);
#else
            /* without mkstemps: temporarily cut off the suffix so that the template ends with the X's */
            char *const xpos = strrchr(filename, 'X');
            const char c = (char)(NULL != xpos ? *(xpos + 1) : 0);
            if (0 != c) {
              xpos[1] = 0;
              fd = mkstemp(filename);
              xpos[1] = c;
            }
            else {
              fd = -1;
            }
#endif
            /* NOTE(review): fd does not appear to be closed when posix_fallocate or mmap fails - confirm */
            if (0 <= fd && 0 == posix_fallocate(fd, 0, LIBXSMM_TRACE_SYMBOLSIZE)) {
              char *const buffer = (char*)mmap(NULL, LIBXSMM_TRACE_SYMBOLSIZE, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
              if (MAP_FAILED != buffer) {
                int check = -1;
                ivalue = (int*)buffer;
                ivalue[0] = fd; /* valid file descriptor for internal_delete */
                /* register the buffer, then read the descriptor back through the file to verify
                 * the mapping, and advance the file position past the two leading integers */
                if (
# if !defined(LIBXSMM_NO_SYNC)
                  0 == pthread_setspecific(internal_trace_key, buffer) &&
# endif
                  (sizeof(int) * 1) == read(fd, &check, sizeof(int)) &&
                  (sizeof(int) * 2) == lseek(fd, sizeof(int), SEEK_CUR) &&
                  check == fd)
                {
                  abs_tid = LIBXSMM_ATOMIC_ADD_FETCH(&internal_trace_initialized, 1, LIBXSMM_ATOMIC_RELAXED);
                  assert(0 < abs_tid);
                  /* use sign bit to flag enabled fall-back for symbol resolution */
                  ivalue[1] = -abs_tid;
                  if (0 > filter || filter == abs_tid - 1) {
                    value = buffer + sizeof(int) * 2;
                  }
                }
                else {
# if !defined(NDEBUG) /* library code is expected to be mute */
                  fprintf(stderr, "LIBXSMM ERROR: failed to setup buffer\n");
# endif
                  internal_delete(buffer);
                }
              }
# if !defined(NDEBUG)
              else {
                const int error = errno;
                fprintf(stderr, "LIBXSMM ERROR: %s (mmap allocation error #%i)\n", strerror(error), error);
              }
# endif
            }
# if !defined(NDEBUG) /* library code is expected to be mute */
            else {
              fprintf(stderr, "LIBXSMM ERROR: failed to setup file descriptor (%i)\n", fd);
            }
# endif
          }
          if (value) {
            /* write the symbol description into the file; it is read back through the mapping (value) */
            backtrace_symbols_fd(symbol, 1, fd);
            /* attempt to parse symbol name */
            /* NOTE(review): input and output of this sscanf are the same buffer (overlapping) - confirm this is safe */
            if (1 == sscanf(value, "%*[^(](%s0x", value)) {
              char* c;
              /* the name ends at the '+' which precedes the offset */
              for (c = value; '+' != *c && *c; ++c);
              if ('+' == *c) { /* disable fall-back allowing unresolved symbol names */
                ivalue[1] = abs_tid; /* make unsigned */
                fname = value;
                *c = 0;
              }
            }
            /* fall-back to symbol address */
            if (0 > ivalue[1] && 0 == fname) {
              sprintf(value, "0x%llx", (unsigned long long)*symbol);
              fname = value;
            }
            if (depth) *depth = i - min_n;
            if (threadid) *threadid = abs_tid - 1;
          }
# endif
        }
      }
    }
    --cerberus;
  }
#else
  LIBXSMM_UNUSED(depth); LIBXSMM_UNUSED(threadid);
  LIBXSMM_UNUSED(filter_threadid);
  LIBXSMM_UNUSED(filter_mindepth);
  LIBXSMM_UNUSED(filter_maxnsyms);
#endif
  return fname;
}