/*
 * Element-wise comparison of two single-precision m-by-n matrices.
 * Element (i,j) (1-based) of A is read from A[(j-1)*lda + (i-1)] and the
 * matching element of B from B[(j-1)*ldb + (i-1)]; A is the expected value.
 *
 * An element is counted in *nerrs when its absolute difference exceeds 1.0e-7
 * and its difference relative to |A(i,j)| exceeds 1.0e-4, or when the
 * difference is not finite (NaN/Inf). All other elements are counted in
 * *ncorr. Both counters are reset on entry.
 *
 * Only a limited number of diagnostics is printed; the budget is tracked by a
 * function-static counter and is therefore shared by all calls.
 *
 * Returns the sum of the absolute differences over all elements.
 */
LIBXSMM_INLINE
double residual_s ( float *A, unsigned int lda, unsigned int m, unsigned int n,
                    float *B, unsigned int ldb, unsigned int *nerrs,
                    unsigned int *ncorr )
{
   unsigned int i, j, address, i4, j4, k4, i8, j8, k8;
   double atmp, btmp, dtmp, ref, derror;
   int nonfinite;
   static int ntimes = 0;

   *nerrs = 0;
   *ncorr = 0;
   derror = 0.0;
   for ( j = 1 ; j <= n; j++ ) {
      for ( i = 1; i <= m; i++ ) {
         atmp = (double) A[ (j-1)*lda + (i-1)];
         btmp = (double) B[ (j-1)*ldb + (i-1)];
         ref  = LIBXSMM_MAX(atmp,-atmp);
         if ( atmp >= btmp ) {
            dtmp = atmp - btmp;
         } else {
            dtmp = btmp - atmp;
         }
         nonfinite = ( isnan(dtmp) || isinf(dtmp) );
         if ( nonfinite ) {
            if ( ++ntimes < 15 ) {
               printf("Denormal bug: A(%u,%u) is %g B(%u,%u) is %g\n",i,j,atmp,i,j,btmp);
            }
         }
         /* Every ordered comparison with NaN is false, hence a NaN difference
          * must be flagged explicitly or it would be counted as correct. */
         if ( nonfinite || ((dtmp / ref > 1.0e-4) && (dtmp > 1.0e-7)) ) {
            *nerrs = *nerrs + 1;
            if ( ++ntimes < 15 ) {
               /* Flat offset of A(i,j), additionally split into 1-based
                * (i,j,k) triples using a group size of 4 resp. 8. */
               address = (j-1)*lda + (i-1);
               j4 = (int)(address/(lda*4)) + 1;
               i4 = (int)((address-(j4-1)*lda*4) / 4) + 1;
               k4 = (address-(j4-1)*lda*4 - (i4-1)*4) + 1;
               j8 = (int)(address/(lda*8)) + 1;
               i8 = (int)((address-(j8-1)*lda*8) / 8) + 1;
               k8 = (address-(j8-1)*lda*8 - (i8-1)*8) + 1;
               printf("Bug #%i: A[%u]=A(%u,%u)=A4(%u,%u,%u)=A8(%u,%u,%u) expected=%g instead=%g err=%g\n",ntimes,address,i,j,i4,j4,k4,i8,j8,k8,atmp,btmp,dtmp);
            }
         } else {
            /* Show a few matching elements once at least one error was seen. */
            if ( (*nerrs > 0) && (ntimes < 10) && (*ncorr < 40) ) {
               printf("Cor #%u: A(%u,%u) expected=%g\n",*ncorr+1,i,j,atmp);
            }
            *ncorr = *ncorr + 1;
         }
         derror += dtmp;
      }
   }
   return ( derror );
}
/*
 * Converts the distance between two timer ticks into seconds.
 * An interval that runs backwards (tick1 < tick0) yields zero.
 * The scale depends on the platform: the performance-counter frequency on
 * Windows, 1E-9 per tick when CLOCK_MONOTONIC is defined, 1E-6 otherwise.
 */
LIBXSMM_EXTERN_C LIBXSMM_RETARGETABLE double libxsmm_timer_duration(unsigned long long tick0, unsigned long long tick1)
{
  const unsigned long long nticks = (tick0 < tick1 ? (tick1 - tick0) : 0);
  const double elapsed = (double)nticks;
#if defined(_WIN32)
  LARGE_INTEGER ticks_per_second;
  QueryPerformanceFrequency(&ticks_per_second);
  return elapsed / (double)ticks_per_second.QuadPart;
#elif defined(CLOCK_MONOTONIC)
  return elapsed * 1E-9;
#else
  return elapsed * 1E-6;
#endif
}
/*
 * Element-wise comparison of two single-precision m-by-n matrices.
 * Element (i,j) (1-based) of A is read from A[(j-1)*lda + (i-1)] and the
 * matching element of B from B[(j-1)*ldb + (i-1)]; A is the expected value.
 *
 * An element is counted in *nerrs when its difference relative to |A(i,j)|
 * exceeds 1.0e-4, or when the difference is not finite (NaN/Inf). All other
 * elements are counted in *ncorr. Both counters are reset on entry.
 *
 * Only a limited number of diagnostics is printed; the budget is tracked by a
 * function-static counter and is therefore shared by all calls.
 *
 * Returns the sum of the absolute differences over all elements.
 */
LIBXSMM_INLINE
double residual_s ( float *A, unsigned int lda, unsigned int m, unsigned int n,
                    float *B, unsigned int ldb, unsigned int *nerrs,
                    unsigned int *ncorr )
{
   unsigned int i, j;
   double atmp, btmp, dtmp, ref, derror;
   int nonfinite;
   static int ntimes = 0;

   *nerrs = 0;
   *ncorr = 0;
   derror = 0.0;
   for ( j = 1 ; j <= n; j++ ) {
      for ( i = 1; i <= m; i++ ) {
         atmp = (double) A[ (j-1)*lda + (i-1)];
         btmp = (double) B[ (j-1)*ldb + (i-1)];
         ref  = LIBXSMM_MAX(atmp,-atmp);
         if ( atmp >= btmp ) {
            dtmp = atmp - btmp;
         } else {
            dtmp = btmp - atmp;
         }
         nonfinite = ( isnan(dtmp) || isinf(dtmp) );
         if ( nonfinite ) {
            if ( ++ntimes < 15 ) {
               printf("Denormal bug: A(%u,%u) is %g B(%u,%u) is %g\n",i,j,atmp,i,j,btmp);
            }
         }
         /* Every ordered comparison with NaN is false, hence a NaN difference
          * must be flagged explicitly or it would be counted as correct. */
         if ( nonfinite || (dtmp / ref > 1.0e-4) ) {
            *nerrs = *nerrs + 1;
            if ( ++ntimes < 15 ) {
               printf("Bug #%d: A(%u,%u) expected=%g instead=%g err=%g\n",ntimes,i,j,atmp,btmp,dtmp);
            }
         } else {
            /* Show a few matching elements once at least one error was seen. */
            if ( (*nerrs > 0) && (ntimes < 10) && (*ncorr < 40) ) {
               printf("Cor #%u: A(%u,%u) expected=%g\n",*ncorr+1,i,j,atmp);
            }
            *ncorr = *ncorr + 1;
         }
         derror += dtmp;
      }
   }
   return ( derror );
}
/*
 * Prints one trace line for the current call site to the given stream.
 * The symbol name and thread id come from libxsmm_trace_info, which receives
 * depth + 1 and the three optional filters. The line is indented by the
 * reported depth minus the (non-negative) minimum depth. The thread id is
 * appended as "@id" when the effective thread filter is negative. A NULL
 * filter pointer selects the corresponding internal default.
 * Without LIBXSMM_TRACE the function does nothing.
 */
LIBXSMM_API void libxsmm_trace(FILE* stream, unsigned int depth, const int* filter_threadid, const int* filter_mindepth, const int* filter_maxnsyms)
{
#if defined(LIBXSMM_TRACE)
  unsigned int level = depth + 1, tid;
  const char *const symbol = libxsmm_trace_info(&level, &tid, filter_threadid, filter_mindepth, filter_maxnsyms);

  if (symbol && *symbol) { /* implies actual other results to be valid */
    const int mindepth = (0 != filter_mindepth ? *filter_mindepth : internal_trace_mindepth);
    const int base = (0 < mindepth ? mindepth : 0);
    const int tidfilter = (0 != filter_threadid ? *filter_threadid : internal_trace_threadid);
    const int indent = (int)(level - base);

    assert(0 != stream/*otherwise fprintf handle the error*/);
    if (0 > tidfilter) {
      fprintf(stream, "%*s%s@%u\n", indent, "", symbol, tid);
    }
    else {
      fprintf(stream, "%*s%s\n", indent, "", symbol);
    }
  }
#else /* suppress warning */
  LIBXSMM_UNUSED(stream);
  LIBXSMM_UNUSED(depth);
  LIBXSMM_UNUSED(filter_threadid);
  LIBXSMM_UNUSED(filter_mindepth);
  LIBXSMM_UNUSED(filter_maxnsyms);
#endif
}
/*
 * Determines the x86 instruction-set level at runtime and returns it as one
 * of the LIBXSMM_X86_* ids; the result is never below
 * LIBXSMM_STATIC_TARGET_ARCH (asserted, and enforced by the final MAX).
 *
 * Statement order matters: eax/ebx/ecx/edx are handed to each
 * LIBXSMM_CPUID_X86 query, and eax/edx to LIBXSMM_XGETBV, so every bit test
 * refers to the most recent query that received that variable. In particular
 * the AVX/FMA branch tests ecx as filled by the leaf-1 query, since the
 * leaf-7 query is only issued inside the 512-bit branch.
 *
 * Decision sequence (bit meanings as given by the inline comments):
 *   leaf 1: XSAVE+OSXSAVE required for anything beyond the static target;
 *           the CRC32 bit selects SSE4.
 *   XGETBV: 256-bit OS state is required for AVX and above; if 512-bit OS
 *           state is enabled as well, leaf 7 picks among the AVX-512 ids
 *           (ICL, CORE, KNM, MIC, or the common subset).
 *   otherwise: the AVX bit selects AVX, and AVX2 when the FMA bit is set too.
 * NOTE(review): AVX2 is selected from the FMA bit rather than a dedicated
 * AVX2 feature bit -- confirm this approximation is intended.
 */
LIBXSMM_API int libxsmm_cpuid_x86(void) { int target_arch = LIBXSMM_STATIC_TARGET_ARCH; unsigned int eax = 0, ebx = 0, ecx = 0, edx = 0; LIBXSMM_CPUID_X86(0, eax, ebx, ecx, edx); if (1 <= eax) { /* CPUID */ LIBXSMM_CPUID_X86(1, eax, ebx, ecx, edx); /* XSAVE/XGETBV(0x04000000), OSXSAVE(0x08000000) */ if (0x0C000000 == (0x0C000000 & ecx)) { /* Check for CRC32 (this is not a proper test for SSE 4.2 as a whole!) */ if (0x00100000 == (0x00100000 & ecx)) { target_arch = LIBXSMM_X86_SSE4; } LIBXSMM_XGETBV(0, eax, edx); if (0x00000006 == (0x00000006 & eax)) { /* OS XSAVE 256-bit */ if (0x000000E0 == (0x000000E0 & eax)) { /* OS XSAVE 512-bit */ LIBXSMM_CPUID_X86(7, eax, ebx, ecx, edx); /* AVX512F(0x00010000), AVX512CD(0x10000000) */ if (0x10010000 == (0x10010000 & ebx)) { /* Common */ /* AVX512DQ(0x00020000), AVX512BW(0x40000000), AVX512VL(0x80000000) */ if (0xC0020000 == (0xC0020000 & ebx)) { /* SKX (Core) */ if (0x00000800 == (0x00000800 & ecx)) { /* ICL (CORE) */ target_arch = LIBXSMM_X86_AVX512_ICL; } else { /* SKX (CORE) */ target_arch = LIBXSMM_X86_AVX512_CORE; } } /* AVX512PF(0x04000000), AVX512ER(0x08000000) */ else if (0x0C000000 == (0x0C000000 & ebx)) { if (0x0000000C == (0x0000000C & edx)) { /* KNM (MIC) */ target_arch = LIBXSMM_X86_AVX512_KNM; } else { /* KNL (MIC) */ target_arch = LIBXSMM_X86_AVX512_MIC; } } else { /* Common */ target_arch = LIBXSMM_X86_AVX512; } } } else if (0x10000000 == (0x10000000 & ecx)) { /* AVX(0x10000000) */ if (0x00001000 == (0x00001000 & ecx)) { /* FMA(0x00001000) */ target_arch = LIBXSMM_X86_AVX2; } else { target_arch = LIBXSMM_X86_AVX; } } } } } /* check if procedure obviously failed to detect the highest available instruction set extension */ assert(LIBXSMM_STATIC_TARGET_ARCH <= target_arch); return LIBXSMM_MAX(target_arch, LIBXSMM_STATIC_TARGET_ARCH); }
/*
 * One-time initialization of the code registry.
 *
 * Serializes callers (all registry locks, or an OpenMP critical section) and,
 * if internal_registry is still unset, performs the setup:
 *   - target architecture from the LIBXSMM_TARGET environment variable,
 *   - prefetch strategy from LIBXSMM_PREFETCH (or the compile-time default),
 *   - hash, gemm-diff and gemm sub-components (plus tracing under __TRACE,
 *     configured by LIBXSMM_TRACE="threadid,mindepth,maxnsyms"),
 *   - allocation of the registry and its key table, verbosity from
 *     LIBXSMM_VERBOSE, optional registration of statically generated code,
 *   - registration of libxsmm_finalize via atexit and publication of the
 *     registry with an atomic store.
 *
 * Returns the registry, or 0 if a sub-component failed to initialize or the
 * registry could not be allocated (debug builds assert in that case).
 */
LIBXSMM_INLINE LIBXSMM_RETARGETABLE internal_code_type* internal_init(void)
{
  /*const*/internal_code_type* result;
  int i;
#if !defined(LIBXSMM_OPENMP)
# if !defined(LIBXSMM_NOSYNC)
  static int internal_reglock_check = 1; /* setup the locks in a thread-safe fashion */
  assert(sizeof(internal_reglock) == (INTERNAL_REGLOCK_COUNT * sizeof(*internal_reglock)));
  if (1 == LIBXSMM_ATOMIC_LOAD(&internal_reglock_check, LIBXSMM_ATOMIC_SEQ_CST)) {
    LIBXSMM_ATOMIC_ADD_FETCH(&internal_reglock_check, 1, LIBXSMM_ATOMIC_SEQ_CST);
    if (2 == internal_reglock_check) {
      for (i = 0; i < INTERNAL_REGLOCK_COUNT; ++i) LIBXSMM_LOCK_INIT(internal_reglock + i);
      LIBXSMM_ATOMIC_STORE_ZERO(&internal_reglock_check, LIBXSMM_ATOMIC_SEQ_CST);
    }
  }
  while (0 != internal_reglock_check); /* wait until locks are initialized */
  for (i = 0; i < INTERNAL_REGLOCK_COUNT; ++i) LIBXSMM_LOCK_ACQUIRE(internal_reglock + i);
# endif
#else
# pragma omp critical(internal_reglock)
#endif
  {
    result = LIBXSMM_ATOMIC_LOAD(&internal_registry, LIBXSMM_ATOMIC_SEQ_CST);
    if (0 == result) {
      int init_code;
      /* set internal_target_archid */
      libxsmm_set_target_arch(getenv("LIBXSMM_TARGET"));
      { /* select prefetch strategy for JIT */
        const char *const env_prefetch = getenv("LIBXSMM_PREFETCH");
        if (0 == env_prefetch || 0 == *env_prefetch) {
#if (0 > LIBXSMM_PREFETCH) /* permitted by LIBXSMM_PREFETCH_AUTO */
          internal_prefetch = (LIBXSMM_X86_AVX512_MIC != internal_target_archid
            ? LIBXSMM_PREFETCH_NONE : LIBXSMM_PREFETCH_AL2BL2_VIA_C);
#else
          internal_prefetch = LIBXSMM_MAX(INTERNAL_PREFETCH, 0);
#endif
        }
        else { /* user input considered even if LIBXSMM_PREFETCH_AUTO is disabled */
          switch (atoi(env_prefetch)) {
            case 2: internal_prefetch = LIBXSMM_PREFETCH_SIGONLY; break;
            case 3: internal_prefetch = LIBXSMM_PREFETCH_BL2_VIA_C; break;
            case 4: internal_prefetch = LIBXSMM_PREFETCH_AL2; break;
            case 5: internal_prefetch = LIBXSMM_PREFETCH_AL2_AHEAD; break;
            case 6: internal_prefetch = LIBXSMM_PREFETCH_AL2BL2_VIA_C; break;
            case 7: internal_prefetch = LIBXSMM_PREFETCH_AL2BL2_VIA_C_AHEAD; break;
            case 8: internal_prefetch = LIBXSMM_PREFETCH_AL2_JPST; break;
            case 9: internal_prefetch = LIBXSMM_PREFETCH_AL2BL2_VIA_C_JPST; break;
            default: internal_prefetch = LIBXSMM_PREFETCH_NONE;
          }
        }
      }
      libxsmm_hash_init(internal_target_archid);
      libxsmm_gemm_diff_init(internal_target_archid);
      init_code = libxsmm_gemm_init(internal_target_archid, internal_prefetch);
#if defined(__TRACE)
      /* only continue with tracing if the previous step succeeded; otherwise
       * the earlier error code would be overwritten (and hence lost) */
      if (EXIT_SUCCESS == init_code) {
        int filter_threadid = 0, filter_mindepth = 1, filter_maxnsyms = 0;
        const char *const env_trace_init = getenv("LIBXSMM_TRACE");
        if (0 != env_trace_init && 0 != *env_trace_init) {
          /* the field width excludes the terminating null character,
           * i.e., at most sizeof(buffer) - 1 characters may be converted */
          char buffer[32];
          if (1 == sscanf(env_trace_init, "%31[^,],", buffer)) {
            sscanf(buffer, "%i", &filter_threadid);
          }
          if (1 == sscanf(env_trace_init, "%*[^,],%31[^,],", buffer)) {
            sscanf(buffer, "%i", &filter_mindepth);
          }
          if (1 == sscanf(env_trace_init, "%*[^,],%*[^,],%31s", buffer)) {
            sscanf(buffer, "%i", &filter_maxnsyms);
          }
          else {
            filter_maxnsyms = -1; /* all */
          }
        }
        init_code = libxsmm_trace_init(filter_threadid - 1, filter_mindepth, filter_maxnsyms);
      }
#endif
      if (EXIT_SUCCESS == init_code) {
        assert(0 == internal_registry_keys && 0 == internal_registry); /* should never happen */
        result = (internal_code_type*)libxsmm_malloc(LIBXSMM_REGSIZE * sizeof(internal_code_type));
        internal_registry_keys = (internal_regkey_type*)libxsmm_malloc(LIBXSMM_REGSIZE * sizeof(internal_regkey_type));
        if (0 != result && 0 != internal_registry_keys) {
          const char *const env_verbose = getenv("LIBXSMM_VERBOSE");
          internal_statistic_mnk = (unsigned int)(pow((double)(LIBXSMM_MAX_MNK), 0.3333333333333333) + 0.5);
          internal_statistic_sml = 13;
          internal_statistic_med = 23;
          if (0 != env_verbose && 0 != *env_verbose) {
            internal_verbose_mode = atoi(env_verbose);
          }
#if !defined(NDEBUG)
          else {
            internal_verbose_mode = 1; /* quiet -> verbose */
          }
#endif
          for (i = 0; i < LIBXSMM_REGSIZE; ++i) result[i].pmm = 0;
          /* omit registering code if JIT is enabled and if an ISA extension is found
           * which is beyond the static code path used to compile the library */
#if defined(LIBXSMM_BUILD)
# if (0 != LIBXSMM_JIT) && !defined(__MIC__)
          /* check if target arch. permits execution (arch. may be overridden) */
          if (LIBXSMM_STATIC_TARGET_ARCH <= internal_target_archid &&
             (LIBXSMM_X86_AVX > internal_target_archid /* jit is not available */
              /* condition allows to avoid JIT (if static code is good enough) */
              || LIBXSMM_STATIC_TARGET_ARCH == internal_target_archid))
# endif
          { /* opening a scope for eventually declaring variables */
            /* setup the dispatch table for the statically generated code */
#           include <libxsmm_dispatch.h>
          }
#endif
          atexit(libxsmm_finalize);
          LIBXSMM_ATOMIC_STORE(&internal_registry, result, LIBXSMM_ATOMIC_SEQ_CST);
        }
        else {
#if !defined(NDEBUG) && defined(__TRACE) /* library code is expected to be mute */
          fprintf(stderr, "LIBXSMM: failed to allocate code registry!\n");
#endif
          libxsmm_free(internal_registry_keys);
          libxsmm_free(result);
          /* do not hand out or keep pointers to released memory */
          internal_registry_keys = 0;
          result = 0;
        }
      }
#if !defined(NDEBUG) && defined(__TRACE) /* library code is expected to be mute */
      else {
        fprintf(stderr, "LIBXSMM: failed to initialize sub-component (error #%i)!\n", init_code);
      }
#endif
    }
  }
#if !defined(LIBXSMM_OPENMP) && !defined(LIBXSMM_NOSYNC) /* release locks */
  for (i = 0; i < INTERNAL_REGLOCK_COUNT; ++i) LIBXSMM_LOCK_RELEASE(internal_reglock + i);
#endif
  assert(result);
  return result;
}
int main(int argc, char* argv[]) { const int m = (1 < argc ? atoi(argv[1]) : 16); const int n = (2 < argc ? atoi(argv[2]) : m); const int unsigned ldi = LIBXSMM_MAX(3 < argc ? atoi(argv[3]) : 0, m); const int unsigned ldo = LIBXSMM_MAX(4 < argc ? atoi(argv[4]) : 0, m); const int unroll = (5 < argc ? atoi(argv[5]) : 1); const int prefetch = (6 < argc ? atoi(argv[6]) : 0); const int flags = ((7 < argc && 0 != atoi(argv[7])) ? LIBXSMM_MATCOPY_FLAG_ZERO_SOURCE : 0); const int iters = (8 < argc ? atoi(argv[8]) : 1); /* we should modify to test all data-types */ const libxsmm_mcopy_descriptor* desc; libxsmm_xmcopyfunction kernel; libxsmm_descriptor_blob blob; libxsmm_timer_tickint l_start; libxsmm_timer_tickint l_end; int error = 0, i, j; ELEM_TYPE *a, *b; double copy_time; printf("This is a tester for JIT matcopy kernels!\n"); desc = libxsmm_mcopy_descriptor_init(&blob, sizeof(ELEM_TYPE), m, n, ldo, ldi, flags, prefetch, &unroll); a = (ELEM_TYPE*)malloc(n * ldi * sizeof(ELEM_TYPE)); b = (ELEM_TYPE*)malloc(n * ldo * sizeof(ELEM_TYPE)); for (i = 0; i < n; i++) { for (j = 0; j < m; j++) { a[j+ldi*i] = (ELEM_TYPE)rand(); if (0 != (LIBXSMM_MATCOPY_FLAG_ZERO_SOURCE & flags)) { b[j+ldo*i] = (ELEM_TYPE)rand(); } } } /* test dispatch call */ kernel = libxsmm_dispatch_mcopy(desc); if (kernel == 0) { printf("JIT error -> exit!!!!\n"); exit(EXIT_FAILURE); } /* let's call */ kernel(a, &ldi, b, &ldo, &a[128]); l_start = libxsmm_timer_tick(); for (i = 0; i < iters; ++i) { kernel(a, &ldi, b, &ldo, &a[128]); } l_end = libxsmm_timer_tick(); copy_time = libxsmm_timer_duration(l_start, l_end); for (i = 0; i < n; ++i) { for (j = 0; j < m; ++j) { if (0 != (LIBXSMM_MATCOPY_FLAG_ZERO_SOURCE & flags)) { if (LIBXSMM_NEQ(b[j+ldo*i], 0)) { printf("ERROR!!!\n"); i = n; error = 1; break; } } else if (LIBXSMM_NEQ(a[j+ldi*i], b[j+ldo*i])) { printf("ERROR!!!\n"); i = n; error = 1; break; } } } if (error == 0) { printf("CORRECT copy!!!!\n"); printf("Time taken is\t%.5f seconds\n", copy_time); } return 
EXIT_SUCCESS; }
#include <stdlib.h> #include <stdio.h> #if defined(LIBXSMM_OFFLOAD_TARGET) # pragma offload_attribute(pop) #endif #if !defined(REAL_TYPE) # define REAL_TYPE double #endif LIBXSMM_RETARGETABLE void init(int seed, REAL_TYPE *LIBXSMM_RESTRICT dst, double scale, libxsmm_blasint nrows, libxsmm_blasint ncols, libxsmm_blasint ld); LIBXSMM_RETARGETABLE void init(int seed, REAL_TYPE *LIBXSMM_RESTRICT dst, double scale, libxsmm_blasint nrows, libxsmm_blasint ncols, libxsmm_blasint ld) { const libxsmm_blasint minval = seed, addval = (nrows - 1) * ld + (ncols - 1); const libxsmm_blasint maxval = LIBXSMM_MAX(LIBXSMM_ABS(minval), addval); const double norm = 0 != maxval ? (scale / maxval) : scale; libxsmm_blasint i, j; #if defined(_OPENMP) # pragma omp parallel for #endif for (i = 0; i < ncols; ++i) { for (j = 0; j < nrows; ++j) { const libxsmm_blasint k = i * ld + j; const double value = (double)(k + minval); dst[k] = (REAL_TYPE)(norm * (value - 0.5 * addval)); } } }
int main(int argc, char* argv[]) { const char *const filename = (1 < argc ? argv[1] : "mhd_image.mhd"); /* take some block-sizes, which are used to test leading dimensions */ const int bw = LIBXSMM_MAX(2 < argc ? atoi(argv[2]) : 64, 1); const int bh = LIBXSMM_MAX(3 < argc ? atoi(argv[3]) : 64, 1); size_t ndims = 3, size[3], pitch[3], offset[3], ncomponents, header_size, extension_size; libxsmm_mhd_elemtype type; char data_filename[1024]; void* data = 0; int result; /* Read header information; function includes various sanity checks. */ result = libxsmm_mhd_read_header(filename, sizeof(data_filename), data_filename, &ndims, size, &ncomponents, &type, &header_size, &extension_size); /* Allocate data according to the header information. */ if (EXIT_SUCCESS == result) { size_t typesize; pitch[0] = (size[0] + bw - 1) / bw * bw; pitch[1] = (size[1] + bh - 1) / bh * bh; pitch[2] = size[2]; /* center the image inside of the (pitched) buffer */ offset[0] = (pitch[0] - size[0]) / 2; offset[1] = (pitch[1] - size[1]) / 2; offset[2] = 0; if (0 != libxsmm_mhd_typename(type, &typesize, 0/*ctypename*/)) { const size_t nelements = pitch[0] * (1 < ndims ? (pitch[1] * (2 < ndims ? pitch[2] : 1)) : 1); data = malloc(ncomponents * nelements * typesize); } else { result = EXIT_FAILURE; } } /* Read the data according to the header into the allocated buffer. */ if (EXIT_SUCCESS == result) { result = libxsmm_mhd_read(data_filename, offset, size, pitch, ndims, ncomponents, header_size, type, 0/*type_data*/, data, 0/*handle_element*/, 0/*extension*/, 0/*extension_size*/); } /* Write the data into a new file; update header_size. */ if (EXIT_SUCCESS == result) { result = libxsmm_mhd_write("mhd_test.mhd", 0/*offset*/, pitch, pitch, ndims, ncomponents, type, data, &header_size, 0/*extension_header*/, 0/*extension*/, 0/*extension_size*/); } /* Check the written data against the buffer. 
*/ if (EXIT_SUCCESS == result) { result = libxsmm_mhd_read(data_filename, offset, size, pitch, ndims, ncomponents, 0/*header_size*/, type, 0/*type_data*/, data, libxsmm_mhd_element_comparison, 0/*extension*/, 0/*extension_size*/); } /* Deallocate the buffer. */ free(data); return result; }
int main(int argc, char* argv[]) { const libxsmm_blasint m = 1 < argc ? atoi(argv[1]) : 4096; const libxsmm_blasint n = 2 < argc ? atoi(argv[2]) : m; const libxsmm_blasint lda = LIBXSMM_MAX(3 < argc ? atoi(argv[3]) : 0, m); const libxsmm_blasint ldb = LIBXSMM_MAX(4 < argc ? atoi(argv[4]) : 0, n); REAL_TYPE *const a = (REAL_TYPE*)malloc(lda * n * sizeof(REAL_TYPE)); REAL_TYPE *const b = (REAL_TYPE*)malloc(ldb * m * sizeof(REAL_TYPE)); const unsigned int size = m * n * sizeof(REAL_TYPE); unsigned long long start; libxsmm_blasint i, j; double duration; fprintf(stdout, "m=%i n=%i lda=%i ldb=%i size=%.fMB (%s)\n", m, n, lda, ldb, 1.0 * size / (1 << 20), 8 == sizeof(REAL_TYPE) ? "DP" : "SP"); for (i = 0; i < n; ++i) { for (j = 0; j < m; ++j) { a[i*lda+j] = initial_value(i, j, lda); } } start = libxsmm_timer_tick(); libxsmm_transpose_oop(b, a, sizeof(REAL_TYPE), m, n, lda, ldb); libxsmm_transpose_oop(a, b, sizeof(REAL_TYPE), n, m, ldb, lda); duration = libxsmm_timer_duration(start, libxsmm_timer_tick()); for (i = 0; i < n; ++i) { for (j = 0; j < m; ++j) { if (0 < fabs(a[i*lda+j] - initial_value(i, j, lda))) { i = n + 1; break; } } } if (i <= n) { if (0 < duration) { fprintf(stdout, "\tbandwidth: %.1f GB/s\n", size / (duration * (1 << 30))); } fprintf(stdout, "\tduration: %.0f ms\n", 1000.0 * duration); } else { fprintf(stderr, "Validation failed!\n"); } #if defined(__MKL) || defined(MKL_DIRECT_CALL_SEQ) || defined(MKL_DIRECT_CALL) { double mkl_duration; start = libxsmm_timer_tick(); LIBXSMM_CONCATENATE(mkl_, LIBXSMM_TPREFIX(REAL_TYPE, omatcopy))('C', 'T', m, n, 1, a, lda, b, ldb); LIBXSMM_CONCATENATE(mkl_, LIBXSMM_TPREFIX(REAL_TYPE, omatcopy))('C', 'T', n, m, 1, b, ldb, a, lda); mkl_duration = libxsmm_timer_duration(start, libxsmm_timer_tick()); if (0 < mkl_duration) { fprintf(stdout, "\tMKL: %.1fx\n", duration / mkl_duration); } } #endif free(a); free(b); return EXIT_SUCCESS; }
/*
 * Micro-benchmark for libxsmm_diff_n, libxsmm_memcmp and stdlib's memcmp.
 *
 * Arguments (optional, positional):
 *   argv[1] element size in bytes (non-positive: LIBXSMM_DESCRIPTOR_SIGSIZE)
 *   argv[2] stride in bytes; never smaller than the element size
 *           (non-positive: max(LIBXSMM_DESCRIPTOR_MAXSIZE, element size))
 *   argv[3] number of elements (non-positive: 2 GB divided by the stride)
 *   argv[4] number of repetitions (default 1); a non-positive value swaps the
 *           roles: max(|niters|, 1) elements are searched n times.
 *
 * The buffer is filled with a repeating byte pattern and the last element is
 * overwritten with 255; that element is the reference to be found. With
 * USE_HASH the search is a linear scan comparing libxsmm_hash values and
 * confirming matches with libxsmm_diff; otherwise libxsmm_diff_n is called.
 * The two memcmp benchmarks only run if the element size equals the stride
 * (only then the copy "icopy" is allocated); inside the timed loops one byte
 * of the copy's last element is written per iteration, as per the inline
 * comments to keep the comparison from being elided.
 *
 * Returns EXIT_FAILURE if the input buffer cannot be allocated or if the
 * search did not end at the last element; the "ignore result" terms add zero.
 */
int main(int argc, char* argv[]) { const int insize = (1 < argc ? atoi(argv[1]) : 0); const int incrmt = (2 < argc ? atoi(argv[2]) : 0); const int nelems = (3 < argc ? atoi(argv[3]) : 0); const int niters = (4 < argc ? atoi(argv[4]) : 1); const int elsize = (0 >= insize ? LIBXSMM_DESCRIPTOR_SIGSIZE : insize); const int stride = (0 >= incrmt ? LIBXSMM_MAX(LIBXSMM_DESCRIPTOR_MAXSIZE, elsize) : LIBXSMM_MAX(incrmt, elsize)); const size_t n = (0 >= nelems ? (((size_t)2 << 30/*2 GB*/) / stride) : ((size_t)nelems)); unsigned char *input, *icopy = NULL, *ilast = NULL; int result = EXIT_SUCCESS; size_t nbytes, size, nrpt; if (0 < niters) { size = n; nrpt = niters; } else { size = LIBXSMM_MAX(LIBXSMM_ABS(niters), 1); nrpt = n; } nbytes = size * stride; input = (unsigned char*)(0 != nbytes ? malloc(nbytes) : NULL); if (NULL != input) { unsigned char *const ref = input + (size - 1) * stride; /* last item */ libxsmm_timer_tickint start; size_t i, j = 0; /* initialize the input data */ for (i = 0; i < nbytes; ++i) input[i] = LIBXSMM_MOD2(i, 128); for (i = 0; i < (size_t)elsize; ++i) ref[i] = 255; { /* benchmark libxsmm_diff_n */ #if defined(USE_HASH) const unsigned int hashref = libxsmm_hash(ref, elsize, 0/*seed*/); #endif start = libxsmm_timer_tick(); for (i = 0; i < nrpt; ++i) { #if !defined(USE_HASH) j = libxsmm_diff_n(ref, input, (unsigned char)elsize, (unsigned char)stride, (unsigned int)LIBXSMM_MIN(i, size)/*hint*/, (unsigned int)size); #else const unsigned char* tst = input; for (j = 0; j < size; ++j) { const unsigned int hashtst = libxsmm_hash(tst, elsize, 0/*seed*/); if (hashref == hashtst && 0 == libxsmm_diff(ref, tst, (unsigned char)elsize)) { break; } tst += stride; } #endif } printf("libxsmm_diff_n:\t\t%.8f s\n", libxsmm_timer_duration(start, libxsmm_timer_tick())); } if (size == (j + 1) && 0 == memcmp(ref, input + j * stride, elsize)) { /* benchmark libxsmm_memcmp */ icopy = (unsigned char*)(elsize == stride ? 
malloc(nbytes) : NULL); if (NULL != icopy) { ilast = icopy + (size - 1) * stride; /* last item */ memcpy(icopy, input, nbytes); start = libxsmm_timer_tick(); for (i = 0; i < nrpt; ++i) { j += libxsmm_memcmp(input, icopy, nbytes); /* take result of every execution */ /* memcmp may be pure and without touching input it is not repeated (nrpt) */ ilast[i%elsize] = 255; } printf("libxsmm_memcmp:\t\t%.8f s\n", libxsmm_timer_duration(start, libxsmm_timer_tick())); result += (int)j * ((int)stride / ((int)stride + 1)); /* ignore result */ } } else { result = EXIT_FAILURE; } if (NULL != icopy) { /* benchmark stdlib's memcmp */ LIBXSMM_ASSERT(NULL != ilast); start = libxsmm_timer_tick(); for (i = 0; i < nrpt; ++i) { j += memcmp(input, icopy, nbytes); /* take result of every execution */ /* memcmp is likely pure and without touching input it is not repeated (nrpt) */ ilast[i%elsize] = 255; } printf("stdlib memcmp:\t\t%.8f s\n", libxsmm_timer_duration(start, libxsmm_timer_tick())); result += (int)j * ((int)stride / ((int)stride + 1)); /* ignore result */ free(icopy); } free(input); } else { result = EXIT_FAILURE; } return result; }
/*
 * Stress test and benchmark for the scratch memory allocator.
 *
 * Arguments (optional, positional):
 *   argv[1] number of cycles (default 100, at least 1)
 *   argv[2] max. number of allocations per cycle (default 4, clamped to
 *           [1, MAX_MALLOC_N])
 *   argv[3] number of threads (default 1, clamped to [1, max. threads])
 * Environment: a non-zero LONGLIFE requests an additional buffer from
 * malloc_offsite, which is only released after all cycles have finished.
 *
 * A table of random numbers is drawn up front; it determines the number of
 * allocations per cycle and the size of each allocation (in MB, at most
 * MAX_MALLOC_MB). The cost of 1000000 calls of libxsmm_init serves as the
 * "empty call" baseline. Each cycle (optionally an OpenMP parallel loop)
 * requests its buffers with libxsmm_aligned_scratch, times only that call,
 * fills every buffer with memset and releases all buffers of the cycle.
 * Finally the call rates, the overhead factor and the scratch statistics
 * are printed.
 *
 * Returns EXIT_FAILURE if any allocation returned NULL, else EXIT_SUCCESS.
 */
int main(int argc, char* argv[]) { const int ncalls = 1000000; #if defined(_OPENMP) const int max_nthreads = omp_get_max_threads(); #else const int max_nthreads = 1; #endif const int ncycles = LIBXSMM_MAX(1 < argc ? atoi(argv[1]) : 100, 1); const int max_nallocs = LIBXSMM_CLMP(2 < argc ? atoi(argv[2]) : 4, 1, MAX_MALLOC_N); const int nthreads = LIBXSMM_CLMP(3 < argc ? atoi(argv[3]) : 1, 1, max_nthreads); unsigned int nallocs = 0, nerrors = 0; int r[MAX_MALLOC_N], i; /* generate set of random number for parallel region */ for (i = 0; i < (MAX_MALLOC_N); ++i) r[i] = rand(); /* count number of calls according to randomized scheme */ for (i = 0; i < ncycles; ++i) { nallocs += r[i%(MAX_MALLOC_N)] % max_nallocs + 1; } assert(0 != nallocs); fprintf(stdout, "Running %i cycles with max. %i malloc+free (%u calls) using %i thread%s...\n", ncycles, max_nallocs, nallocs, 1 >= nthreads ? 1 : nthreads, 1 >= nthreads ? "" : "s"); #if defined(LIBXSMM_OFFLOAD_TARGET) # pragma offload target(LIBXSMM_OFFLOAD_TARGET) #endif { const char *const longlife_env = getenv("LONGLIFE"); const int enable_longlife = ((0 == longlife_env || 0 == *longlife_env) ? 0 : atoi(longlife_env)); void *const longlife = (0 == enable_longlife ? 
0 : malloc_offsite((MAX_MALLOC_MB) << 20)); unsigned long long d0, d1 = 0; libxsmm_scratch_info info; /* run non-inline function to measure call overhead of an "empty" function */ const unsigned long long t0 = libxsmm_timer_tick(); for (i = 0; i < ncalls; ++i) { libxsmm_init(); /* subsequent calls are not doing any work */ } d0 = libxsmm_timer_diff(t0, libxsmm_timer_tick()); #if defined(_OPENMP) # pragma omp parallel for num_threads(nthreads) private(i) default(none) shared(r) reduction(+:d1,nerrors) #endif for (i = 0; i < ncycles; ++i) { const int count = r[i%(MAX_MALLOC_N)] % max_nallocs + 1; void* p[MAX_MALLOC_N]; int j; assert(count <= MAX_MALLOC_N); for (j = 0; j < count; ++j) { const int k = (i * count + j) % (MAX_MALLOC_N); const size_t nbytes = (r[k] % (MAX_MALLOC_MB) + 1) << 20; const unsigned long long t1 = libxsmm_timer_tick(); p[j] = libxsmm_aligned_scratch(nbytes, 0/*auto*/); d1 += libxsmm_timer_diff(t1, libxsmm_timer_tick()); if (0 != p[j]) { memset(p[j], j, nbytes); } else { ++nerrors; } } for (j = 0; j < count; ++j) { libxsmm_free(p[j]); } } libxsmm_free(longlife); if (0 != d0 && 0 != d1 && 0 < nallocs) { const double dcalls = libxsmm_timer_duration(0, d0); const double dalloc = libxsmm_timer_duration(0, d1); const double alloc_freq = 1E-3 * nallocs / dalloc; const double empty_freq = 1E-3 * ncalls / dcalls; fprintf(stdout, "\tallocation+free calls/s: %.1f kHz\n", alloc_freq); fprintf(stdout, "\tempty calls/s: %.1f MHz\n", 1E-3 * empty_freq); fprintf(stdout, "\toverhead: %.1fx\n", empty_freq / alloc_freq); } if (EXIT_SUCCESS == libxsmm_get_scratch_info(&info) && 0 < info.size) { fprintf(stdout, "\nScratch: %.f MB (mallocs=%lu, pools=%u", 1.0 * info.size / (1 << 20), (unsigned long int)info.nmallocs, info.npools); if (1 < nthreads) fprintf(stdout, ", threads=%i)\n", nthreads); else fprintf(stdout, ")\n"); libxsmm_release_scratch(); /* suppress LIBXSMM's termination message about scratch */ } } if (0 == nerrors) { fprintf(stdout, "Finished\n"); return 
EXIT_SUCCESS; } else { fprintf(stdout, "FAILED (%u errors)\n", nerrors); return EXIT_FAILURE; } }
int main(void) { const libxsmm_blasint m[] = { 1, 1, 1, 1, 2, 3, 5, 5, 5, 16, 63, 16, 75, 2507 }; const libxsmm_blasint n[] = { 1, 7, 7, 7, 2, 3, 1, 1, 1, 16, 31, 500, 130, 1975 }; const libxsmm_blasint ldi[] = { 1, 1, 1, 9, 2, 3, 5, 8, 8, 16, 64, 16, 87, 3000 }; const libxsmm_blasint ldo[] = { 1, 7, 8, 8, 2, 3, 1, 1, 4, 16, 32, 512, 136, 3072 }; const int start = 0, ntests = sizeof(m) / sizeof(*m); libxsmm_blasint max_size_a = 0, max_size_b = 0; unsigned int nerrors = 0; ELEM_TYPE *a = 0, *b = 0; int test; for (test = start; test < ntests; ++test) { const libxsmm_blasint size_a = ldi[test] * n[test], size_b = ldo[test] * m[test]; assert(m[test] <= ldi[test] && n[test] <= ldo[test]); max_size_a = LIBXSMM_MAX(max_size_a, size_a); max_size_b = LIBXSMM_MAX(max_size_b, size_b); } a = (ELEM_TYPE*)libxsmm_malloc((size_t)(max_size_a * sizeof(ELEM_TYPE))); b = (ELEM_TYPE*)libxsmm_malloc((size_t)(max_size_b * sizeof(ELEM_TYPE))); assert(0 != a && 0 != b); LIBXSMM_MATINIT(ELEM_TYPE, 42, a, max_size_a, 1, max_size_a, 1.0); LIBXSMM_MATINIT(ELEM_TYPE, 0, b, max_size_b, 1, max_size_b, 1.0); for (test = start; test < ntests; ++test) { unsigned int testerrors = (EXIT_SUCCESS == libxsmm_otrans( b, a, sizeof(ELEM_TYPE), m[test], n[test], ldi[test], ldo[test]) ? 0 : 1); if (0 == testerrors) { libxsmm_blasint i, j; for (i = 0; i < n[test]; ++i) { for (j = 0; j < m[test]; ++j) { const libxsmm_blasint u = i * ldi[test] + j; const libxsmm_blasint v = j * ldo[test] + i; testerrors += (LIBXSMM_FEQ(a[u], b[v]) ? 
0u : 1u); } } } if (nerrors < testerrors) { nerrors = testerrors; } } if (0 == nerrors) { /* previous results are correct and may be used to validate other tests */ for (test = start; test < ntests; ++test) { /* prepare expected results in b (correct according to the previous test block) */ libxsmm_otrans(b, a, sizeof(ELEM_TYPE), m[test], n[test], ldi[test], ldo[test]); if (m[test] == n[test] && ldi[test] == ldo[test]) { unsigned int testerrors = (EXIT_SUCCESS == libxsmm_otrans( a, a, sizeof(ELEM_TYPE), m[test], n[test], ldi[test], ldo[test]) ? 0 : 1); if (0 == testerrors) { libxsmm_blasint i, j; for (i = 0; i < n[test]; ++i) { for (j = 0; j < m[test]; ++j) { /* address serves both a and b since ldi and ldo are equal */ const libxsmm_blasint uv = i * ldi[test] + j; testerrors += (LIBXSMM_FEQ(a[uv], b[uv]) ? 0u : 1u); } } } if (nerrors < testerrors) { nerrors = testerrors; } } else { /* negative tests */ nerrors = LIBXSMM_MAX(EXIT_SUCCESS != libxsmm_otrans( a, a, sizeof(ELEM_TYPE), m[test], n[test], ldi[test], ldo[test]) ? 0u : 1u, nerrors); } } } libxsmm_free(a); libxsmm_free(b); if (0 == nerrors) { return EXIT_SUCCESS; } else { # if defined(_DEBUG) fprintf(stderr, "errors=%u\n", nerrors); # endif return EXIT_FAILURE; } }
/**
 * Tester and timing driver for compact TRSM kernels.
 *
 * Command line (all optional, positional):
 *   m n lda ldb nmat side uplo trans diag layout ntest alpha
 * Defaults: m=n=lda=ldb=nmat=8, side=uplo='L', trans=diag='N', layout=102,
 * ntest=1, alpha=1.0. The usage text is printed when three or fewer
 * arguments are given; execution continues with defaults either way.
 *
 * Compile-time switches select the kernel under test (USE_XSMM_GENERATED,
 * USE_PREDEFINED_ASSEMBLY, USE_KERNEL_GENERATION_DIRECTLY, TIME_MKL), the
 * timer (MKL_TIMER), the reference (USE_MKL_FOR_REFERENCE or compact_dtrsm_),
 * and whether results are compared against the reference (the default) or
 * only scanned for NaN/Inf (NO_ACCURACY_CHECK).
 *
 * @return 0 after the run (independent of the number of detected errors),
 *         EXIT_FAILURE if the work buffers cannot be allocated.
 */
int main(int argc, char* argv[])
{
  unsigned int m=8, n=8, lda=8, ldb=8, nerrs, num, nmat, nmats, nmatd, ntest;
  unsigned int layout, asize, VLEND=4, VLENS=8, bsize;
  unsigned int ncorr;
  int i, j;
  char side, uplo, trans, diag;
  unsigned int typesize8 = 8;
  unsigned int typesize4 = 4;
  float  *sa, *sb, *sc, *sd;
  double *da, *db, *dc, *dd, *tmpbuf;
  double dalpha = 1.0;
  float  salpha;
  double dtmp;
  /* start of the kernel's machine code; stays NULL if no kernel source is selected */
  const unsigned char *cptr = NULL;
  unsigned long op_count;
  const libxsmm_trsm_descriptor* desc8 = NULL;
  const libxsmm_trsm_descriptor* desc4 = NULL;
  libxsmm_descriptor_blob blob;
  /* NOTE(review): dp, sp and pv share storage, so the R4 dispatch further
   * below overwrites the R8 kernel pointer that the timing loop calls through
   * mykernel.dp -- confirm this is intended. */
  union {
    libxsmm_xtrsmfunction dp;
    libxsmm_xtrsmfunction sp;
    const void* pv;
  } mykernel = { 0 };
#ifdef USE_KERNEL_GENERATION_DIRECTLY
  void (*opcode_routine)();
#endif
#ifdef USE_KERNEL_GENERATION_DIRECTLY
  #include <unistd.h>
  #include <signal.h>
  #include <malloc.h>
  #include <sys/mman.h>
  #include "../../src/generator_packed_trsm_avx_avx512.h"
  unsigned char *routine_output;
  libxsmm_generated_code io_generated_code;
  int pagesize = sysconf(_SC_PAGE_SIZE);
  if (pagesize == -1) fprintf(stderr,"sysconf pagesize\n");
  /* buffer receiving the generated code; made executable right below */
  routine_output = (unsigned char *) mmap(NULL,
                      BUFSIZE2, PROT_READ|PROT_WRITE,
                      MAP_PRIVATE|MAP_ANONYMOUS, 0,0);
  if (mprotect(routine_output, BUFSIZE2,
                PROT_EXEC | PROT_READ | PROT_WRITE ) == -1)
      fprintf(stderr,"mprotect\n");
  printf("Routine ready\n");
  io_generated_code.generated_code = &routine_output[0];
  io_generated_code.buffer_size = BUFSIZE2;
  io_generated_code.code_size = 0;
  io_generated_code.code_type = 2;
  io_generated_code.last_error = 0;
#endif

  if ( argc <= 3 )
  {
     printf("\nUSAGE: %s m n lda ldb nmat side uplo trans diag layout ntest alpha\n",argv[0]);
     printf("Compact TRSM a mxn matrix of leading dimension ldb\n");
     printf("This will test the jit of 1 VLEN work of nmat at a time\n");
     printf("Defaults: m=n=lda=ldb=nmat=8, alpha=1.0, side=uplo='L',trans=diag='N',layout=102,ntest=1\n");
  }
  if ( argc > 1 ) m = atoi(argv[1]); else m = 8;
  if ( argc > 2 ) n = atoi(argv[2]); else n = 8;
  if ( argc > 3 ) lda= atoi(argv[3]); else lda = 8;
  if ( argc > 4 ) ldb = atoi(argv[4]); else ldb = 8;
  if ( argc > 5 ) nmat = atoi(argv[5]); else nmat = 8;
  if ( argc > 6 ) side = argv[6][0]; else side = 'L';
  if ( argc > 7 ) uplo = argv[7][0]; else uplo = 'L';
  if ( argc > 8 ) trans = argv[8][0]; else trans = 'N';
  if ( argc > 9 ) diag = argv[9][0]; else diag = 'N';
  if ( argc > 10 ) layout = atoi(argv[10]); else layout=102;
  if ( argc > 11 ) ntest = atoi(argv[11]); else ntest = 1;
  if ( argc > 12 ) dalpha = atof(argv[12]); else dalpha = 1.0;
  salpha = (float)dalpha;

  m = LIBXSMM_MAX(m,1);
  n = LIBXSMM_MAX(n,1);
  /* at least one repetition: the accumulated time is divided by ntest */
  ntest = LIBXSMM_MAX(ntest,1);
  /* A is either mxm or nxn depending on side */
  if ( (side == 'L') || (side=='l') ) asize = m; else asize = n;

  lda = LIBXSMM_MAX(lda,asize);
  if ( layout == 102 )
  {
      /* Column major: B is mxn, and stored in B format */
      ldb = LIBXSMM_MAX(ldb,m);
      bsize = ldb*n;
  } else {
      /* Row major: B is mxn, and stored in B^T format */
      ldb = LIBXSMM_MAX(ldb,n);
      bsize = ldb*m;
  }
  /* matrix counts rounded down to a multiple of the vector length (at least one vector) */
  nmats = LIBXSMM_MAX(VLENS,nmat - (nmat%VLENS));
  nmatd = LIBXSMM_MAX(VLEND,nmat - (nmat%VLEND));
  nmat = LIBXSMM_MAX(nmats,nmatd);

  /* widen before multiplying so the product is not truncated to unsigned int */
  op_count = (unsigned long)n * m * asize;

  printf("This is a real*%u tester for JIT compact TRSM kernels! (%c%c%c%c m=%u n=%u lda=%u ldb=%u layout=%u nmat=%u)\n",typesize8,side,uplo,trans,diag,m,n,lda,ldb,layout,nmat);
#ifdef USE_XSMM_GENERATED
  printf("This code tests the LIBXSMM generated kernels\n");
#endif
#ifdef USE_PREDEFINED_ASSEMBLY
  printf("This code tests some predefined assembly kenrel\n");
#endif
#ifdef USE_KERNEL_GENERATION_DIRECTLY
  printf("This code tests kernel generation directly\n");
#endif
#ifdef TIME_MKL
  printf("This code tests MKL compact batch directly\n");
#endif

  /* NOTE(review): both descriptors are initialized from the same blob; if the
   * returned descriptor lives inside the blob, desc8 is clobbered by the
   * second call -- confirm against the descriptor API. */
  desc8 = libxsmm_trsm_descriptor_init(&blob, typesize8, m, n, lda, ldb, &dalpha, trans, diag, side, uplo, layout);
  desc4 = libxsmm_trsm_descriptor_init(&blob, typesize4, m, n, lda, ldb, &salpha, trans, diag, side, uplo, layout);
#ifdef USE_XSMM_GENERATED
  printf("calling libxsmm_dispatch_trsm: typesize8=%u\n",typesize8);
  mykernel.dp = libxsmm_dispatch_trsm(desc8);
  printf("done calling libxsmm_dispatch_trsm: typesize8=%u\n",typesize8);
  if ( mykernel.dp == NULL ) printf("R8 Kernel after the create call is null\n");
  mykernel.sp = libxsmm_dispatch_trsm(desc4);
  if ( mykernel.sp == NULL ) printf("R4 kernel after the create call is null\n");
#endif

#ifdef USE_KERNEL_GENERATION_DIRECTLY
  libxsmm_generator_trsm_kernel ( &io_generated_code, &desc8, "hsw" );
#endif

#ifndef NO_ACCURACY_CHECK
  printf("mallocing matrices\n");
#endif
  /* element counts are computed in size_t to avoid unsigned int wrap-around */
  sa  = (float  *) malloc ( (size_t)lda*asize*nmats*sizeof(float) );
  da  = (double *) malloc ( (size_t)lda*asize*nmatd*sizeof(double) );
  sb  = (float  *) malloc ( (size_t)bsize*nmats*sizeof(float) );
  db  = (double *) malloc ( (size_t)bsize*nmatd*sizeof(double) );
  sc  = (float  *) malloc ( (size_t)bsize*nmats*sizeof(float) );
  dc  = (double *) malloc ( (size_t)bsize*nmatd*sizeof(double) );
  sd  = (float  *) malloc ( (size_t)bsize*nmats*sizeof(float) );
  dd  = (double *) malloc ( (size_t)bsize*nmatd*sizeof(double) );
  tmpbuf = (double *) malloc ( (size_t)asize*VLEND*sizeof(double) );
  if ( sa == NULL || da == NULL || sb == NULL || db == NULL || sc == NULL ||
       dc == NULL || sd == NULL || dd == NULL || tmpbuf == NULL )
  {
     fprintf(stderr,"ERROR: failed to allocate the matrices\n");
     free(tmpbuf); free(dd); free(sd); free(dc); free(sc);
     free(db); free(sb); free(da); free(sa);
     return EXIT_FAILURE;
  }

#ifndef NO_ACCURACY_CHECK
  printf("filling matrices\n");
#endif
  sfill_matrix ( sa, lda, asize, asize*nmats );
#ifdef TRIANGLE_IS_IDENTITY
  printf("Warning: setting triangular matrix to identity. Not good for accuracy testing\n");
  dfill_identity ( da, lda, asize, asize, VLEND, nmatd/VLEND );
#else
  dfill_matrix ( da, lda, asize, asize*nmatd );
#endif
  sfill_matrix ( sb, bsize, bsize, nmats );
  dfill_matrix ( db, bsize, bsize, nmatd );

  /* c: input of the reference run; d: pristine backup of B. The backups are
   * taken unconditionally because the timing loop restores db from dd even
   * when NO_ACCURACY_CHECK is defined. */
  for ( i = 0 ; i < (int)(bsize*nmats) ; i++ ) sc[i]=sb[i];
  for ( i = 0 ; i < (int)(bsize*nmatd) ; i++ ) dc[i]=db[i];
  for ( i = 0 ; i < (int)(bsize*nmats) ; i++ ) sd[i]=sb[i];
  for ( i = 0 ; i < (int)(bsize*nmatd) ; i++ ) dd[i]=db[i];
#ifndef NO_ACCURACY_CHECK
  printf("Pointing at the kernel now\n");
#endif

#ifdef USE_XSMM_GENERATED
  cptr = (const unsigned char*) mykernel.pv;
#endif
#ifdef USE_PREDEFINED_ASSEMBLY
  cptr = (const unsigned char*) trsm_xct_;
#endif
#ifdef USE_KERNEL_GENERATION_DIRECTLY
  cptr = (const unsigned char*) &routine_output[0];
  opcode_routine = (void *) &cptr[0];
#endif

#ifndef TIME_MKL
  #define DUMP_ASSEMBLY_FILE
#endif

#ifdef DUMP_ASSEMBLY_FILE
  printf("Dumping assembly file\n");
  if ( cptr == NULL )
  {
     fprintf(stderr,"WARNING: no kernel code available, foo.s not written\n");
  } else {
     FILE *fp = fopen("foo.s","w");
     if ( fp == NULL )
     {
        fprintf(stderr,"WARNING: cannot open foo.s for writing\n");
     } else {
        fputs("\t.text\n",fp);
        fputs("\t.align 256\n",fp);
        fputs("\t.globl trsm_xct_\n",fp);
        fputs("trsm_xct_:\n",fp);
        /* NOTE(review): a fixed 4000 bytes are read from the kernel address,
         * regardless of the actual code size. */
        for (i = 0 ; i < 4000; i+=4 )
        {
           fprintf(fp,".byte 0x%02x, 0x%02x, 0x%02x, 0x%02x\n",cptr[i],cptr[i+1],cptr[i+2],cptr[i+3]);
        }
        fputs("\tretq\n",fp);
        fputs("\t.type trsm_xct_,@function\n",fp);
        fputs("\t.size trsm_xct_,.-trsm_xct_\n",fp);
        fclose(fp);
     }
  }
#endif

#if defined(USE_MKL_FOR_REFERENCE) || defined(TIME_MKL)
  #include "mkl.h"
  MKL_LAYOUT CLAYOUT = (layout == 101) ? MKL_ROW_MAJOR : MKL_COL_MAJOR;
  MKL_SIDE SIDE = (side == 'R' || side == 'r') ? MKL_RIGHT : MKL_LEFT;
  MKL_UPLO UPLO = (uplo == 'U' || uplo == 'u') ? MKL_UPPER : MKL_LOWER;
  MKL_TRANSPOSE TRANSA = (trans == 'N' || trans == 'n') ? MKL_NOTRANS : MKL_TRANS;
  MKL_DIAG DIAG = (diag == 'N' || diag == 'n') ? MKL_NONUNIT : MKL_UNIT;
  MKL_COMPACT_PACK CMP_FORMAT = mkl_get_format_compact();
#if 0
  MKL_COMPACT_PACK CMP_FORMAT = MKL_COMPACT_AVX;
#endif
#endif

#ifndef NO_ACCURACY_CHECK
  /* NOTE(review): index 256 is read without checking that bsize*nmatd > 256 */
  printf("Before routine, initial B(1,1)=%g B[256]=%g\n",db[0],db[256]);
#endif
#ifdef USE_PREDEFINED_ASSEMBLY
  double one = 1.0;
#endif
  double timer;
#ifdef MKL_TIMER
  double tmptimer;
  tmptimer = dsecnd_();
#else
  unsigned long long l_start, l_end;
#endif

  /* timed runs of the kernel under test: one call per group of VLEND matrices */
  timer = 0.0;
  for ( j = 0 ; j < (int)ntest ; j++ )
  {
#ifndef TRIANGLE_IS_IDENTITY
     /* restore B from the backup so every repetition starts from the same input */
     for ( i = 0 ; i < (int)(bsize*nmatd) ; i++ ) db[i]=dd[i];
#endif
     for ( i = 0 , num = 0; i < (int)nmatd ; i+= (int)VLEND, num++ )
     {
        double *Ap = &da[num*lda*asize*VLEND];
        double *Bp = &db[num*bsize*VLEND];
#ifdef MKL_TIMER
        tmptimer = dsecnd_();
#else
        l_start = libxsmm_timer_tick();
#endif

#ifdef USE_XSMM_GENERATED
        mykernel.dp ( Ap, Bp, tmpbuf );
#endif
#ifdef USE_PREDEFINED_ASSEMBLY
        trsm_xct_ ( Ap, Bp, &one );
#endif
#ifdef USE_KERNEL_GENERATION_DIRECTLY
        (*opcode_routine)( Ap, Bp );
#endif
#ifdef TIME_MKL
        mkl_dtrsm_compact ( CLAYOUT, SIDE, UPLO, TRANSA, DIAG, m, n, dalpha, da, lda, db, ldb, CMP_FORMAT, nmatd );
        i+=nmatd; /* Because MKL will do everything */
#endif
#ifdef MKL_TIMER
        timer += dsecnd_() - tmptimer;
#else
        l_end = libxsmm_timer_tick();
        timer += libxsmm_timer_duration(l_start,l_end);
#endif
     }
  }
  timer /= ((double)ntest);

#ifndef NO_ACCURACY_CHECK
  printf("Average time to get through %u matrices: %g\n",nmatd,timer);
  printf("Gflops: %g\n",(double)(op_count*nmatd)/(timer*1.0e9));
  printf("after routine, new B(1,1)=%g B[256]=%g\n",db[0],db[256]);
#endif

#ifdef TEST_SINGLE
  printf("Before r4 routine, initial B(1,1)=%g B[256]=%g\n",sb[0],sb[256]);
  for ( i = 0 , num = 0; i < (int)nmats ; i+= (int)VLENS, num++ )
  {
     float *Ap = &sa[num*lda*asize*VLENS];
     float *Bp = &sb[num*bsize*VLENS];
#ifdef USE_XSMM_GENERATED
     mykernel.sp ( Ap, Bp, NULL );
#endif
  }
  /* report the single-precision result (sb), matching the "Before" line above */
  printf("after r4 routine, new B(1,1)=%g B[256]=%g\n",sb[0],sb[256]);
#endif

#ifndef NO_ACCURACY_CHECK
  /* Call some reference code now on a copy of the B matrix (C) */
  double timer2 = 0.0;
  for ( j = 0 ; j < (int)ntest ; j++ )
  {
#ifndef TRIANGLE_IS_IDENTITY
     for ( i = 0 ; i < (int)(bsize*nmatd) ; i++ ) dc[i]=dd[i];
#endif

#ifdef MKL_TIMER
     tmptimer = dsecnd_();
#else
     l_start = libxsmm_timer_tick();
#endif

#ifdef USE_MKL_FOR_REFERENCE
     mkl_dtrsm_compact ( CLAYOUT, SIDE, UPLO, TRANSA, DIAG, m, n, dalpha, da, lda, dc, ldb, CMP_FORMAT, nmatd );
#elif !defined(LIBXSMM_NOFORTRAN)
     if ( (layout == 101) && (nmatd!=VLEND) )
     {
        /* row-major request is passed on as column-major with m/n exchanged
         * and side/uplo flipped */
        unsigned int lay = 102, m1 = n, n1 = m;
        char side1='L', uplo1='L';
        if ( side == 'L' || side == 'l' ) side1 = 'R';
        if ( uplo == 'L' || uplo == 'l' ) uplo1 = 'U';
        compact_dtrsm_ ( &lay, &side1, &uplo1, &trans, &diag, &m1, &n1, &dalpha, da, &lda, dc, &ldb, &nmatd, &VLEND );
     } else {
        compact_dtrsm_ ( &layout, &side, &uplo, &trans, &diag, &m, &n, &dalpha, da, &lda, dc, &ldb, &nmatd, &VLEND );
     }
#endif

#ifdef MKL_TIMER
     timer2 += dsecnd_() - tmptimer;
#else
     l_end = libxsmm_timer_tick();
     timer2 += libxsmm_timer_duration(l_start,l_end);
#endif
  }
  timer2 /= ((double)ntest);
  printf("Reference time=%g Reference Gflops=%g\n",timer2,(op_count*nmatd)/(timer2*1.0e9));

  /* Compute the residual between B and C */
  dtmp = residual_d ( dc, bsize, bsize, nmatd, db, bsize, &nerrs, &ncorr );
  printf("R8 %c%c%c%c m=%u n=%u lda=%u ldb=%u error: %g number of errors: %u corrects: %u",side,uplo,trans,diag,m,n,lda,ldb,dtmp,nerrs,ncorr);
  if ( nerrs > 0 ) printf(" ->FAILED at %ux%u real*8 %u case",m,n,layout);
  printf("\n");

#ifdef TEST_SINGLE
  /* Call some reference code now on a copy of the B matrix (C) */
  compact_strsm_ ( &layout, &side, &uplo, &trans, &diag, &m, &n, &salpha, sa, &lda, sc, &ldb, &nmats, &VLENS );
  /* Compute the residual between B and C */
  dtmp = residual_s ( sc, bsize, bsize, nmats, sb, bsize, &nerrs, &ncorr );
  printf("R4 %c%c%c%c m=%u n=%u lda=%u ldb=%u error: %g number of errors: %u corrects: %u\n",side,uplo,trans,diag,m,n,lda,ldb,dtmp,nerrs,ncorr);
  if ( nerrs > 0 ) printf(" ->FAILED at %ux%u real*4 case",m,n);
  printf("\n");
#endif

#else
  /* no reference run: only count NaN/Inf entries of the result */
  for ( j = 0, nerrs = 0 ; j < bsize*nmatd ; j++ )
  {
     if ( isnan(db[j]) || isinf(db[j]) )
     {
        if ( ++nerrs < 10 )
        {
           printf("WARNING: db[%d]=%g\n",j,db[j]);
        }
     }
  }
  printf("%g,real*8 %c%c%c%c m=%u n=%u lda=%u ldb=%u Denormals=%u Time=%g Gflops=%g",(op_count*nmatd)/(timer*1.0e9),side,uplo,trans,diag,m,n,lda,ldb,nerrs,timer,(op_count*nmatd)/(timer*1.0e9));
  if ( nerrs > 0 ) printf(" -> FAILED at %ux%u real*8 case",m,n);
  printf("\n");
#endif

  free(tmpbuf); /* scratch buffer handed to the kernel; was previously leaked */
  free(dd);
  free(sd);
  free(dc);
  free(sc);
  free(db);
  free(sb);
  free(da);
  free(sa);

  return 0;
}
int main(int argc, char* argv[]) { const char t = (char)(1 < argc ? *argv[1] : 'o'); const libxsmm_blasint m = (2 < argc ? atoi(argv[2]) : 4096); #if 0 /* TODO: enable when in-place transpose is fully supported */ const libxsmm_blasint n = (3 < argc ? atoi(argv[3]) : m); #else const libxsmm_blasint n = (3 < argc ? (('o' == t || 'O' == t) ? atoi(argv[3]) : m) : m); #endif const libxsmm_blasint ldi = LIBXSMM_MAX/*sanitize ld*/(4 < argc ? atoi(argv[4]) : 0, m); const libxsmm_blasint ldo = LIBXSMM_MAX/*sanitize ld*/(5 < argc ? atoi(argv[5]) : 0, n); const int r = (6 < argc ? atoi(argv[6]) : 0), s = LIBXSMM_ABS(r); const libxsmm_blasint lower = (7 < argc ? atoi(argv[7]) : 0); libxsmm_blasint km = m, kn = n, kldi = ldi, kldo = (('o' == t || 'O' == t) ? ldo : ldi); int result = EXIT_SUCCESS, k; if (0 == strchr("oOiI", t)) { fprintf(stderr, "%s [<transpose-kind:o|i>] [<m>] [<n>] [<ld-in>] [<ld-out>] [random:0|nruns] [lbound]\n", argv[0]); exit(EXIT_FAILURE); } #if defined(LIBXSMM_OFFLOAD_TARGET) # pragma offload target(LIBXSMM_OFFLOAD_TARGET) #endif { const char *const env_tasks = getenv("TASKS"), *const env_check = getenv("CHECK"); const int tasks = (0 == env_tasks || 0 == *env_tasks) ? 0/*default*/ : atoi(env_tasks); const int check = (0 == env_check || 0 == *env_check) ? 1/*default*/ : atoi(env_check); ELEM_TYPE *const a = (ELEM_TYPE*)libxsmm_malloc((size_t)(ldi * (('o' == t || 'O' == t) ? n : ldo) * sizeof(ELEM_TYPE))); ELEM_TYPE *const b = (ELEM_TYPE*)libxsmm_malloc((size_t)(ldo * (('o' == t || 'O' == t) ? 
m : ldi) * sizeof(ELEM_TYPE))); libxsmm_timer_tickint start, duration = 0; #if defined(USE_REFERENCE) /* benchmark against a reference */ libxsmm_timer_tickint duration2 = 0; #endif libxsmm_blasint i; size_t size = 0; #if defined(MKL_ENABLE_AVX512) mkl_enable_instructions(MKL_ENABLE_AVX512); #endif fprintf(stdout, "m=%lli n=%lli ldi=%lli ldo=%lli size=%.fMB (%s, %s)\n", (long long)m, (long long)n, (long long)ldi, (long long)ldo, 1.0 * (m * n * sizeof(ELEM_TYPE)) / (1 << 20), LIBXSMM_STRINGIFY(ELEM_TYPE), ('o' == t || 'O' == t) ? "out-of-place" : "in-place"); #if defined(_OPENMP) # pragma omp parallel for private(i) #endif for (i = 0; i < n; ++i) { libxsmm_blasint j; for (j = 0; j < m; ++j) { a[i*ldi+j] = initial_value(i, j, m); } } if (0 != check) { /* repeatable (reference) */ srand(RAND_SEED); } else { /* randomized selection */ srand(libxsmm_timer_tick() % ((unsigned int)-1)); } for (k = (0 == r ? -1 : 0); k < s && EXIT_SUCCESS == result; ++k) { if (0 < r) { const libxsmm_blasint rldi = 0 <= lower ? randstart(lower, ldi) : 0; km = randstart(LIBXSMM_ABS(lower), m); kldi = LIBXSMM_MAX(rldi, km); if (('o' == t || 'O' == t)) { const libxsmm_blasint rldo = 0 <= lower ? 
randstart(lower, ldo) : 0; kn = randstart(LIBXSMM_ABS(lower), n); kldo = LIBXSMM_MAX(rldo, kn); /* trigger JIT-generated code */ OTRANS(b, a, sizeof(ELEM_TYPE), km, kn, kldi, kldo); } else { #if 0 /* TODO: enable when in-place transpose is fully supported */ kn = randstart(LIBXSMM_ABS(lower), n); #else kn = km; #endif kldo = kldi; /* trigger JIT-generated code */ ITRANS(b, sizeof(ELEM_TYPE), km, kn, kldi); } } size += (size_t)(km * kn * sizeof(ELEM_TYPE)); if (('o' == t || 'O' == t)) { if (0 == tasks) { /* library-internal parallelization */ start = libxsmm_timer_tick(); #if defined(OTRANS_THREAD) # pragma omp parallel OTRANS_THREAD(b, a, sizeof(ELEM_TYPE), km, kn, kldi, kldo, omp_get_thread_num(), omp_get_num_threads()); #else result = OTRANS(b, a, sizeof(ELEM_TYPE), km, kn, kldi, kldo); #endif duration += libxsmm_timer_diff(start, libxsmm_timer_tick()); } else { /* external parallelization */ start = libxsmm_timer_tick(); #if defined(_OPENMP) # pragma omp parallel # pragma omp single nowait #endif result = OTRANS(b, a, sizeof(ELEM_TYPE), km, kn, kldi, kldo); duration += libxsmm_timer_diff(start, libxsmm_timer_tick()); } } else { assert(('i' == t || 'I' == t) && kldo == kldi); memcpy(b, a, (size_t)(kldi * kn * sizeof(ELEM_TYPE))); if (2 > tasks) { /* library-internal parallelization */ start = libxsmm_timer_tick(); result = ITRANS(b, sizeof(ELEM_TYPE), km, kn, kldi); duration += libxsmm_timer_diff(start, libxsmm_timer_tick()); } else { /* external parallelization */ start = libxsmm_timer_tick(); #if defined(_OPENMP) # pragma omp parallel # pragma omp single #endif result = ITRANS(b, sizeof(ELEM_TYPE), km, kn, kldi); duration += libxsmm_timer_diff(start, libxsmm_timer_tick()); } } if (0 != check) { /* check */ for (i = 0; i < km; ++i) { libxsmm_blasint j; for (j = 0; j < kn; ++j) { const ELEM_TYPE u = b[i*kldo+j]; const ELEM_TYPE v = a[j*kldi+i]; if (LIBXSMM_NEQ(u, v)) { i += km; /* leave outer loop as well */ result = EXIT_FAILURE; break; } } } } } #if 
defined(USE_REFERENCE) if (0 < check) { /* check shall imply reference (performance-)test */ srand(RAND_SEED); /* reproduce the same sequence as above */ for (k = (0 == r ? -1 : 0); k < s && EXIT_SUCCESS == result; ++k) { if (0 < r) { const libxsmm_blasint rldi = 0 <= lower ? randstart(lower, ldi) : 0; km = randstart(LIBXSMM_ABS(lower), m); kldi = LIBXSMM_MAX(rldi, km); if (('o' == t || 'O' == t)) { const libxsmm_blasint rldo = 0 <= lower ? randstart(lower, ldo) : 0; kn = randstart(LIBXSMM_ABS(lower), n); kldo = LIBXSMM_MAX(rldo, kn); } else { #if 0 /* TODO: enable when in-place transpose is fully supported */ kn = randstart(LIBXSMM_ABS(lower), n); #else kn = km; #endif kldo = kldi; } } if (('o' == t || 'O' == t)) { start = libxsmm_timer_tick(); OTRANS_GOLD(&km, &kn, a, &kldi, b, &kldo); duration2 += libxsmm_timer_diff(start, libxsmm_timer_tick()); } else { assert(('i' == t || 'I' == t) && kldo == kldi); memcpy(b, a, (size_t)(kldi * kn * sizeof(ELEM_TYPE))); start = libxsmm_timer_tick(); ITRANS_GOLD(&km, &kn, b, &kldi, &kldo); duration2 += libxsmm_timer_diff(start, libxsmm_timer_tick()); } if (1 < check || 0 > check) { /* check */ for (i = 0; i < km; ++i) { libxsmm_blasint j; for (j = 0; j < kn; ++j) { const ELEM_TYPE u = b[i*kldo+j]; const ELEM_TYPE v = a[j*kldi+i]; if (LIBXSMM_NEQ(u, v)) { i += km; /* leave outer loop as well */ result = EXIT_FAILURE; break; } } } } } } #endif if (EXIT_SUCCESS == result) { const double d = libxsmm_timer_duration(0, duration); if (0 < duration) { /* out-of-place transpose bandwidth assumes RFO */ fprintf(stdout, "\tbandwidth: %.1f GB/s\n", size * ((('o' == t || 'O' == t)) ? 3 : 2) / (d * (1 << 30))); } if (0 == lower) { fprintf(stdout, "\tduration: %.0f ms\n", 1000.0 * (d / (0 == r ? 
(s + 1) : s))); } else { fprintf(stdout, "\tduration: %f ms\n", 1000.0 * d); } #if defined(USE_REFERENCE) if (0 < duration2) { fprintf(stdout, "\treference: %.1fx\n", (1.0 * duration) / duration2); } #endif } else if (0 != check) { /* check */ fprintf(stderr, "Error: validation failed for m=%lli, n=%lli, ldi=%lli, and ldo=%lli!\n", (long long)km, (long long)kn, (long long)kldi, (long long)kldo); } libxsmm_free(a); libxsmm_free(b); } return result; }
int main(int argc, char* argv[]) { unsigned int m=8, n=8, k=8, lda=8, ldb=8, ldc=8, nerrs, num, nmat; unsigned int layout, asize, bsize, ntest, ncorr; #ifdef AVX512_TESTING unsigned int VLEND=8, VLENS=16; int arch=LIBXSMM_X86_AVX512_CORE; #else unsigned int VLEND=4, VLENS=8; int arch=LIBXSMM_X86_AVX2; #endif unsigned int nmats, nmatd; unsigned int i, j, l, iunroll, junroll, loopi, loopj; char side='L', uplo='U', transa='N', transb='N', diag='N'; unsigned int typesize8 = 8; unsigned int typesize4 = 4; float *sa, *sb, *sc, *sd, *sc1; double *da, *db, *dc, *dd, *dc1; double dalpha = 1.0; float salpha = (float)dalpha; double dbeta = 1.0; float sbeta = (float)dbeta; double dtmp; const unsigned char *cptr = NULL; unsigned long op_count; const libxsmm_pgemm_descriptor* desc8 = NULL; const libxsmm_pgemm_descriptor* desc4 = NULL; #ifdef USE_XSMM_GENERATED libxsmm_descriptor_blob blob; libxsmm_pgemm_xfunction mykernel = NULL; #endif #if defined(USE_KERNEL_GENERATION_DIRECTLY) && defined(__linux__) void (*opcode_routine)(); unsigned char *routine_output; libxsmm_generated_code io_generated_code; int pagesize = sysconf(_SC_PAGE_SIZE); if (pagesize == -1) fprintf(stderr,"sysconf pagesize\n"); routine_output = (unsigned char *) mmap(NULL, BUFSIZE2, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, 0,0); if (mprotect(routine_output, BUFSIZE2, PROT_EXEC | PROT_READ | PROT_WRITE ) == -1) fprintf(stderr,"mprotect\n"); printf("Routine ready\n"); io_generated_code.generated_code = &routine_output[0]; io_generated_code.buffer_size = BUFSIZE2; io_generated_code.code_size = 0; io_generated_code.code_type = 2; io_generated_code.last_error = 0; #endif printf("\nUSAGE: %s m n k lda ldb ldc nmat layout ntest transa transb iunroll junroll loopj loopi\n",argv[0]); if ( argc <= 3 ) { #ifdef TEST_SINGLE printf("Compact SGEMM a C_mxn<-C_mxn+A_mxk*B_kxn matrix of leading dims lda/b/c\n"); printf("This will test the jit of 1 VLEN=%d ",VLENS); if ( VLENS==8 ) printf("(AVX2)"); else 
printf("(AVX512)"); #else printf("Compact DGEMM a C_mxn<-C_mxn+A_mxk*B_kxn matrix of leading dims lda/b/c\n"); printf("This will test the jit of 1 VLEN=%d ",VLEND); if ( VLEND==4 ) printf("(AVX2)"); else printf("(AVX512)"); #endif printf(" work of nmat at a time\n"); printf("Configurable: M-loop controlled by iunroll & loopi. N-loop by junroll & loopj\n"); printf("Defaults: m=n=k=lda=ldb=ldc=nmat=8, layout=102 (col major), transa=/b='N', ntest=1\n"); } if ( argc > 1 ) m = atoi(argv[1]); else m = 8; if ( argc > 2 ) n = atoi(argv[2]); else n = 8; if ( argc > 3 ) k = atoi(argv[3]); else k = 8; if ( argc > 4 ) lda= atoi(argv[4]); else lda = 8; if ( argc > 5 ) ldb= atoi(argv[5]); else ldb = 8; if ( argc > 6 ) ldc= atoi(argv[6]); else ldc = 8; if ( argc > 7 ) nmat = atoi(argv[7]); else nmat = 8; if ( argc > 8 ) layout = atoi(argv[8]); else layout=102; if ( argc > 9 ) ntest = atoi(argv[9]); else ntest = 1; if ( argc > 10 ) transa = argv[10][0]; else transa = 'N'; if ( argc > 11 ) transb = argv[11][0]; else transb = 'N'; if ( argc > 12 ) iunroll=atoi(argv[12]); else iunroll=0; if ( argc > 13 ) junroll=atoi(argv[13]); else junroll=0; if ( argc > 14 ) loopj=atoi(argv[14]); else loopj=0; if ( argc > 15 ) loopi=atoi(argv[15]); else loopi=0; salpha = (float)dalpha; m = LIBXSMM_MAX(m,1); n = LIBXSMM_MAX(n,1); k = LIBXSMM_MAX(k,1); ntest = LIBXSMM_MAX(ntest,1); nmat = LIBXSMM_MAX(nmat,VLEND); layout = LIBXSMM_MAX(LIBXSMM_MIN(layout,102),101); if ( transa!='N' && transa!='n' && transa!='T' && transa!='t' ) transa='N'; if ( transb!='N' && transb!='n' && transb!='T' && transb!='t' ) transb='N'; lda = LIBXSMM_MAX(lda,m); ldb = LIBXSMM_MAX(ldb,k); ldc = LIBXSMM_MAX(ldc,m); nmats = LIBXSMM_MAX(VLENS,nmat - (nmat%VLENS)); nmatd = LIBXSMM_MAX(VLEND,nmat - (nmat%VLEND)); #ifdef TEST_SINGLE nmat = nmats; #else nmat = nmatd; #endif op_count = (unsigned long)(nmat * 2.0 * (double)m * (double)n * (double)k); #ifdef TEST_SINGLE printf("This is a real*%d tester for JIT compact SGEMM %c%c 
kernels! (m=%u n=%u k=%u lda=%u ldb=%u ldc=%u layout=%d nmat=%d alpha=%g beta=%g iun=%d jun=%d loopi=%d loopj=%d VLEN=%d)\n",typesize4,transa,transb,m,n,k,lda,ldb,ldc,layout,nmat,dalpha,dbeta,iunroll,junroll,loopi,loopj,VLENS); #else printf("This is a real*%d tester for JIT compact DGEMM %c%c kernels! (m=%u n=%u k=%u lda=%u ldb=%u ldc=%u layout=%d nmat=%d alpha=%g beta=%g iun=%d jun=%d loopi=%d loopj=%d VLEN=%d)\n",typesize8,transa,transb,m,n,k,lda,ldb,ldc,layout,nmat,dalpha,dbeta,iunroll,junroll,loopi,loopj,VLEND); #endif #ifdef USE_XSMM_GENERATED printf("This code tests the LIBXSMM generated kernels\n"); #endif #ifdef USE_PREDEFINED_ASSEMBLY printf("This code tests some predefined assembly kernel\n"); #endif #if defined(USE_KERNEL_GENERATION_DIRECTLY) && defined(__linux__) printf("This code tests kernel generation directly\n"); #endif #ifdef TIME_MKL printf("This code tests MKL compact batch directly\n"); #endif #ifdef AVX512_TESTING printf("This tests AVX512 binaries\n"); #endif #ifdef AVX2_TESTING printf("This tests AVX2 binaries\n"); #endif desc8 = libxsmm_pgemm_descriptor_init(&blob, typesize8, m, n, k, lda, ldb, ldc, &dalpha, transa, transb, layout ); #ifdef TEST_SINGLE desc4 = libxsmm_pgemm_descriptor_init(&blob, typesize4, m, n, k, lda, ldb, ldc, &dalpha, transa, transb, layout ); #endif printf("Descriptor set\n"); #ifdef USE_XSMM_GENERATED printf("calling libxsmm_dispatch_pgemm: typesize8=%u\n",typesize8); mykernel = libxsmm_dispatch_pgemm(desc8); printf("done calling libxsmm_dispatch_pgemm: typesize8=%u\n",typesize8); if ( mykernel == NULL ) printf("R8 Kernel after the create call is null\n"); #ifdef TEST_SINGLE mykernel = libxsmm_dispatch_pgemm(desc4); if ( mykernel == NULL ) printf("R4 kernel after the create call is null\n"); #endif #endif #if defined(USE_KERNEL_GENERATION_DIRECTLY) && defined(__linux__) libxsmm_generator_pgemm_kernel( &io_generated_code, desc8, arch, iunroll, junroll, loopi, loopj ); #endif #ifndef NO_ACCURACY_CHECK printf("mallocing 
matrices\n"); #endif sa = (float *) malloc ( lda*k*nmat*sizeof(float) ); da = (double *) malloc ( lda*k*nmat*sizeof(double) ); sb = (float *) malloc ( ldb*n*nmat*sizeof(float) ); db = (double *) malloc ( ldb*n*nmat*sizeof(double) ); sc1 = (float *) malloc ( ldc*n*nmat*sizeof(float) ); dc1 = (double *) malloc ( ldc*n*nmat*sizeof(double) ); sc = (float *) malloc ( ldc*n*nmat*sizeof(float) ); dc = (double *) malloc ( ldc*n*nmat*sizeof(double) ); sd = (float *) malloc ( ldc*n*nmat*sizeof(float) ); dd = (double *) malloc ( ldc*n*nmat*sizeof(double) ); #ifndef NO_ACCURACY_CHECK printf("filling matrices\n"); #endif sfill_matrix ( sa, lda, m, k*nmat ); sfill_matrix ( sb, ldb, k, n*nmat ); sfill_matrix ( sc, ldc, m, n*nmat ); dfill_matrix ( da, lda, m, k*nmat ); dfill_matrix ( db, ldb, k, n*nmat ); dfill_matrix ( dc, ldc, m, n*nmat ); #ifndef NO_ACCURACY_CHECK for ( i = 0 ; i < ldc*n*nmat ; i++ ) sd[i]=sc[i]; for ( i = 0 ; i < ldc*n*nmat ; i++ ) dd[i]=dc[i]; for ( i = 0 ; i < ldc*n*nmat ; i++ ) sc1[i]=sc[i]; for ( i = 0 ; i < ldc*n*nmat ; i++ ) dc1[i]=dc[i]; printf("Pointing at the kernel now\n"); #endif #ifdef USE_XSMM_GENERATED cptr = (const unsigned char*) mykernel; #endif #ifdef USE_PREDEFINED_ASSEMBLY cptr = (const unsigned char*) gemm_; #endif #if defined(USE_KERNEL_GENERATION_DIRECTLY) && defined(__linux__) cptr = (const unsigned char*) &routine_output[0]; opcode_routine = (void *) &cptr[0]; #endif #ifndef TIME_MKL # define DUMP_ASSEMBLY_FILE #endif #ifdef DUMP_ASSEMBLY_FILE printf("Dumping assembly file\n"); FILE *fp = fopen("foo.s","w"); char buffer[80]; fputs("\t.text\n",fp); fputs("\t.align 256\n",fp); fputs("\t.globl gemm_\n",fp); fputs("gemm_:\n",fp); for (i = 0 ; i < 7000; i+=4 ) { sprintf(buffer,".byte 0x%02x, 0x%02x, 0x%02x, 0x%02x\n",cptr[i],cptr[i+1],cptr[i+2],cptr[i+3]); fputs(buffer,fp); } fputs("\tretq\n",fp); fputs("\t.type gemm_,@function\n",fp); fputs("\t.size gemm_,.-gemm_\n",fp); fclose(fp); #endif #if defined(USE_MKL_FOR_REFERENCE) || 
defined(TIME_MKL) # include <mkl.h> MKL_LAYOUT CLAYOUT = (layout == 101) ? MKL_ROW_MAJOR : MKL_COL_MAJOR; MKL_SIDE SIDE = (side == 'R' || side == 'r') ? MKL_RIGHT : MKL_LEFT; MKL_UPLO UPLO = (uplo == 'U' || uplo == 'u') ? MKL_UPPER : MKL_LOWER; MKL_TRANSPOSE TRANSA = (transa == 'N' || transa == 'n') ? MKL_NOTRANS : MKL_TRANS; MKL_TRANSPOSE TRANSB = (transb == 'N' || transb == 'n') ? MKL_NOTRANS : MKL_TRANS; MKL_DIAG DIAG = (diag == 'N' || diag == 'n') ? MKL_NONUNIT : MKL_UNIT; MKL_COMPACT_PACK CMP_FORMAT = mkl_get_format_compact(); #if 0 MKL_COMPACT_PACK CMP_FORMAT = MKL_COMPACT_AVX; #endif #endif #ifndef NO_ACCURACY_CHECK printf("Before routine, initial A(1,1)=%g A[256]=%g\n",da[0],da[256]); #endif #ifdef USE_PREDEFINED_ASSEMBLY double one = 1.0; #endif double timer, firsttime = 0; #ifdef MKL_TIMER double tmptimer; tmptimer = dsecnd_(); #else unsigned long long l_start, l_end; #endif timer = 0.0; for ( j = 0 ; j < (int)ntest ; j++ ) { for ( i = 0 ; i < ldc*n*nmat ; i++ ) dc[i]=dc1[i]; for ( i = 0 , num = 0; i < (int)nmat ; i+= (int)VLEND, num++ ) { double *Ap = &da[num*lda*k*VLEND]; double *Bp = &db[num*ldb*n*VLEND]; double *Cp = &dc[num*ldc*n*VLEND]; #ifdef MKL_TIMER tmptimer = dsecnd_(); #else l_start = libxsmm_timer_tick(); #endif #if !defined(USE_XSMM_GENERATED) && !defined(USE_PREDEFINED_ASSEMBLY) && !defined(USE_KERNEL_GENERATION_DIRECTLY) && !defined(TIME_MKL) && !defined(USE_PREDEFINED_ASSEMBLY_XCT) gen_compact_dgemm_ ( &layout, &m, &n, &k, &dalpha, Ap, &lda, Bp, &ldb, &dbeta, Cp, &ldc, &VLEND ); #endif #ifdef USE_XSMM_GENERATED mykernel ( Ap, Bp, Cp ); #endif #ifdef USE_PREDEFINED_ASSEMBLY gemm_ ( Ap, Bp, Cp ); #endif #ifdef USE_KERNEL_GENERATION_DIRECTLY (*opcode_routine)( Ap, Bp, Cp ); #endif #ifdef TIME_MKL mkl_dgemm_compact ( CLAYOUT, TRANSA, TRANSB, m, n, k, dalpha, da, lda, db, ldb, dbeta, dc, ldc, CMP_FORMAT, nmat ); i+=nmatd; /* Because MKL will do everything */ #endif #ifdef MKL_TIMER dtmp = dsecnd_() - tmptimer; #else l_end = 
libxsmm_timer_tick(); dtmp = libxsmm_timer_duration(l_start,l_end); #endif if ( j == 0 ) firsttime=dtmp; timer += dtmp; } } if ( ntest >= 100 ) { /* Skip the first timing: super necessary if using MKL */ timer = (timer-firsttime)/((double)(ntest-1)); } else { timer /= ((double)ntest); } #ifndef NO_ACCURACY_CHECK printf("Average time to get through %u matrices: %g\n",nmat,timer); printf("Gflops: %g\n",(double)op_count/(timer*1.0e9)); printf("after routine, new C(1,1)=%g C[256]=%g\n",dc[0],dc[256]); #endif #ifdef TEST_SINGLE printf("Before r4 routine, initial C(1,1)=%g C[256]=%g\n",sc[0],sc[256]); for ( i = 0 , num = 0; i < nmats ; i+= VLENS, num++ ) { float *Ap = &sa[num*lda*k*VLENS]; float *Bp = &sb[num*ldb*n*VLENS]; float *Cp = &sc[num*ldc*n*VLENS]; #ifdef USE_XSMM_GENERATED mykernel ( Ap, Bp, Cp ); #endif } printf("after r4 routine, new C(1,1)=%g C]256]=%g\n",dc[0],dc[256]); #endif #ifndef NO_ACCURACY_CHECK /* Call some reference code now on a copy of the B matrix (C) */ double timer2 = 0.0; for ( j = 0 ; j < (int)ntest ; j++ ) { for ( i = 0 ; i < ldc*n*nmat ; i++ ) dd[i]=dc1[i]; #ifdef MKL_TIMER tmptimer = dsecnd_(); #else l_start = libxsmm_timer_tick(); #endif #ifndef USE_MKL_FOR_REFERENCE compact_dgemm_ ( &layout, &transa, &transb, &m, &n, &k, &dalpha, da, &lda, db, &ldb, &dbeta, dd, &ldc, &nmat, &VLEND ); #else mkl_dgemm_compact ( CLAYOUT, TRANSA, TRANSB, m, n, k, dalpha, da, lda, db, ldb, dbeta, dd, ldc, CMP_FORMAT, nmat ); #endif #ifdef MKL_TIMER timer2 += dsecnd_() - tmptimer; #else l_end = libxsmm_timer_tick(); timer2 += libxsmm_timer_duration(l_start,l_end); #endif } timer2 /= ((double)ntest); printf("Reference time=%g Reference Gflops=%g\n",timer2,op_count/(timer2*1.0e9)); /* Compute the residual between B and C */ dtmp = residual_d ( dc, ldc, m, n*nmat, dd, ldc, &nerrs, &ncorr ); printf("R8 mnk=%u %u %u ldabc=%u %u %u error: %g number of errors: %u corrects: %u",m,n,k,lda,ldb,ldc,dtmp,nerrs,ncorr); if ( nerrs > 0 ) printf(" ->FAILED at %ux%u real*8 %u 
case",m,n,layout); printf("\n"); #ifdef TEST_SINGLE /* Call some reference code now on a copy of the B matrix (C) */ compact_dgemm_ ( &layout, &transa, &transb, &m, &n, &k, &salpha, sa, &lda, sb, &ldb, &sbeta, sd, &ldc, &nmat, &VLENS ); /* Compute the residual between C and D */ dtmp = residual_s ( sc, ldc, m, n*nmat, sd, ldc, &nerrs, &ncorr ); printf("R4 mnk=%u %u %u ldabc=%u %u %u error: %g number of errors: %u corrects: %u",m,n,k,lda,ldb,ldc,dtmp,nerrs,ncorr); if ( nerrs > 0 ) printf(" ->FAILED at %ux%u real*4 case",m,n); printf("\n"); #endif #else for ( j = 0, nerrs = 0 ; j < lda*n*nmat; j++ ) { if ( isnan(dc[j]) || isinf(dc[j]) ) { if ( ++nerrs < 10 ) { printf("WARNING: dc[%d]=%g\n",j,dc[j]); } } } printf("%g,real*8 m/n/k=%u %u %u lda-c=%u %u %u Denormals=%u Time=%g Gflops=%g",op_count/(timer*1.0e9),m,n,k,lda,ldb,ldc,nerrs,timer,op_count/(timer*1.0e9)); if ( nerrs > 0 ) printf(" -> FAILED at %ux%u real*8 case",m,n); printf("\n"); #endif free(dd); free(sd); free(dc); free(sc); free(dc1); free(sc1); free(db); free(sb); free(da); free(sa); return 0; }