/**
 * Offloadable entry point that forwards a data buffer to the histogram
 * routine selected at compile time.
 *
 * The extent of the buffer is not passed by the caller; it is queried from
 * the runtime via libxstream_get_shape for argument 0 (data) of the current
 * context.
 *
 * @param data      input buffer; forwarded unchanged to the histogram routine.
 * @param histogram output array; forwarded unchanged to the histogram routine.
 */
LIBXSTREAM_TARGET(mic) void makehist(const char* data, size_t* histogram)
{
  // Start from a defined value: if the shape query fails and the check macro
  // does not abort (presumably the case in non-debug builds -- TODO confirm),
  // the routine below receives a length of zero instead of an indeterminate one.
  size_t size = 0;
  LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_get_shape(0/*current context*/, 0/*data*/, &size));

  // The callee's name is assembled from "histogram" and the HISTOGRAM macro
  // (presumably by token pasting -- see LIBXSTREAM_CONCATENATE).
  LIBXSTREAM_CONCATENATE(histogram,HISTOGRAM)(data, size, histogram);
}
}

/**
 * Double-precision transpose helper.
 *
 * When both LIBMICSMM_USE_MKLTRANS and __MKL are defined, this forwards to
 * MKL_Dimatcopy with ordering 'R', operation 'T', a scaling factor of 1.0 and
 * the leading dimensions n and m. Otherwise the body is empty and the call
 * does nothing.
 *
 * @param m      forwarded as the row count (and as the second leading dimension).
 * @param n      forwarded as the column count (and as the first leading dimension).
 * @param matrix buffer handed to MKL_Dimatcopy.
 */
LIBXSTREAM_TARGET(mic) void mkl_imatcopy(size_t m, size_t n, double* matrix)
{
#if defined(LIBMICSMM_USE_MKLTRANS) && defined(__MKL)
  MKL_Dimatcopy('R', 'T', m, n, 1.0, matrix, n, m);
#endif
}

/**
 * Visits every entry of a stack of offsets into matrix.
 *
 * The number of stack entries is queried from the runtime
 * (libxstream_get_shape, argument 0) rather than passed explicitly. Each
 * entry stack[s] is added to matrix to obtain the pointer that is processed
 * in that iteration.
 *
 * @param stack  array of offsets; stack[s] is added to matrix.
 * @param m      read via LIBXSTREAM_GETVAL; forwarded as first extent to mkl_imatcopy.
 * @param n      read via LIBXSTREAM_GETVAL; forwarded as second extent to mkl_imatcopy.
 * @param matrix base pointer the stack offsets are applied to.
 */
template<typename T, typename U>
LIBXSTREAM_TARGET(mic) void kernel(const U *LIBXSTREAM_RESTRICT stack, LIBXSTREAM_INVAL(U) m, LIBXSTREAM_INVAL(U) n, T *LIBXSTREAM_RESTRICT matrix)
{
  size_t stacksize = 0;
  LIBXSTREAM_CHECK_CALL_ASSERT(libxstream_get_shape(0/*current context*/, 0/*stack*/, &stacksize));
  // NOTE(review): under plain printf rules "%%lu" prints a literal "%lu" and
  // consumes no argument, which would shift stacksize/m/n by one conversion.
  // Confirm whether LIBXSTREAM_PRINT_INFO formats the string twice.
  LIBXSTREAM_PRINT_INFO("libsmm_acc_transpose (mic): stacksize=%%lu m=%i n=%i", static_cast<unsigned long>(stacksize), LIBXSTREAM_GETVAL(m), LIBXSTREAM_GETVAL(n));

  // The start timestamp is only taken in debug builds that have OpenMP available.
#if defined(LIBXSTREAM_DEBUG) && defined(_OPENMP)
  const double start = omp_get_wtime();
#endif

  // Iterations are distributed across OpenMP threads when OpenMP is enabled.
#if defined(_OPENMP)
# pragma omp parallel for
#endif
  for (U s = 0; s < stacksize; ++s) {
    // Pointer for this stack entry.
    T *const mat = matrix + stack[s];

#if defined(LIBMICSMM_USE_MKLTRANS) && defined(__MKL)
    // MKL path: hand the m-by-n extents and the entry's pointer to the helper.
    mkl_imatcopy(static_cast<size_t>(LIBXSTREAM_GETVAL(m)), static_cast<size_t>(LIBXSTREAM_GETVAL(n)), mat);
#else
    // Fallback path: aligned scratch buffer of LIBMICSMM_MAX_MATRIX_SIZE elements.
    LIBXSTREAM_ALIGNED(T tmp[LIBMICSMM_MAX_MATRIX_SIZE], LIBXSTREAM_MAX_SIMD);