LIBXSMM_API int libxsmm_matcopy_thread(void* out, const void* in, unsigned int typesize, libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ldi, libxsmm_blasint ldo, const int* prefetch, int tid, int nthreads) { int result = EXIT_SUCCESS; static int error_once = 0; assert(typesize <= 255); if (0 != out && out != in && 0 < typesize && 0 < m && 0 < n && m <= ldi && m <= ldo && /* use (signed) integer types, but check sanity of input */ 0 <= tid && tid < nthreads) { const unsigned int uldi = (unsigned int)ldi, uldo = (unsigned int)ldo; unsigned int tm = (unsigned int)m, tn = (unsigned int)n; const int iprefetch = (0 == prefetch ? 0 : *prefetch); libxsmm_xmcopyfunction xmatcopy = 0; LIBXSMM_INIT /* before leading tile sizes */ if (1 < nthreads) { libxsmm_blasint m0 = 0, n0 = 0, m1 = m, n1 = n; const unsigned int size = tm * tn, size2 = LIBXSMM_SQRT2(size); const unsigned int indx = LIBXSMM_MIN(size2 >> 10, 7); const unsigned int tidx = (4 < typesize ? 0 : 1); int mtasks; tm = LIBXSMM_MIN(tm, libxsmm_trans_tile[tidx][0/*M*/][indx]); tn = LIBXSMM_MIN(tn, libxsmm_trans_tile[tidx][1/*N*/][indx]); mtasks = ((1 < nthreads) ? ((int)((m + tm - 1) / tm)) : 1); if (1 < mtasks && nthreads <= mtasks) { /* only parallelized over M */ const int mc = (mtasks + nthreads - 1) / nthreads * tm; m0 = tid * mc; m1 = LIBXSMM_MIN(m0 + mc, m); } else if (1 < nthreads) { const int ntasks = nthreads / mtasks, mtid = tid / ntasks, ntid = tid - mtid * ntasks; const libxsmm_blasint nc = (((n + ntasks - 1) / ntasks + tn - 1) / tn) * tn; const libxsmm_blasint mc = tm; m0 = mtid * mc; m1 = LIBXSMM_MIN(m0 + mc, m); n0 = ntid * nc; n1 = LIBXSMM_MIN(n0 + nc, n); } if (0 != (1 & libxsmm_trans_jit) /* libxsmm_trans_jit: JIT'ted matrix-copy permitted? */ && (1 == typesize || 2 == typesize || 4 == typesize) /* TODO: support multiples */ /* avoid code-dispatch if task does not need the kernel for inner tiles */ && tm + m0 <= (unsigned int)(m1 - m0) && tn <= (unsigned int)(n1 - n0) /* TODO: investigate issue with Byte-element copy/MT on pre-AVX512 */ && (1 < typesize || LIBXSMM_X86_AVX2 < libxsmm_target_archid)) { libxsmm_descriptor_blob blob; const libxsmm_mcopy_descriptor *const desc = libxsmm_mcopy_descriptor_init(&blob, typesize, tm, tn, uldo, uldi, 0 != in ? 0 : LIBXSMM_MATCOPY_FLAG_ZERO_SOURCE, iprefetch, NULL/*default unroll*/); xmatcopy = libxsmm_dispatch_mcopy(desc); } if (0 != prefetch && 0 != *prefetch) { /* prefetch */ LIBXSMM_XCOPY( LIBXSMM_NOOP, LIBXSMM_NOOP_ARGS, LIBXSMM_NOOP_ARGS, LIBXSMM_NOOP, LIBXSMM_MCOPY_KERNEL, LIBXSMM_MCOPY_CALL, xmatcopy, out, in, typesize, uldi, uldo, tm, tn, m0, m1, n0, n1); } else { /* no prefetch */ LIBXSMM_XCOPY( LIBXSMM_NOOP, LIBXSMM_NOOP_ARGS, LIBXSMM_NOOP_ARGS, LIBXSMM_NOOP, LIBXSMM_MCOPY_KERNEL, LIBXSMM_MCOPY_CALL_NOPF, xmatcopy, out, in, typesize, uldi, uldo, tm, tn, m0, m1, n0, n1); } } else {
int main(int argc, char* argv[]) { const int m = (1 < argc ? atoi(argv[1]) : 16); const int n = (2 < argc ? atoi(argv[2]) : m); const int unsigned ldi = LIBXSMM_MAX(3 < argc ? atoi(argv[3]) : 0, m); const int unsigned ldo = LIBXSMM_MAX(4 < argc ? atoi(argv[4]) : 0, m); const int unroll = (5 < argc ? atoi(argv[5]) : 1); const int prefetch = (6 < argc ? atoi(argv[6]) : 0); const int flags = ((7 < argc && 0 != atoi(argv[7])) ? LIBXSMM_MATCOPY_FLAG_ZERO_SOURCE : 0); const int iters = (8 < argc ? atoi(argv[8]) : 1); /* we should modify to test all data-types */ const libxsmm_mcopy_descriptor* desc; libxsmm_xmcopyfunction kernel; libxsmm_descriptor_blob blob; libxsmm_timer_tickint l_start; libxsmm_timer_tickint l_end; int error = 0, i, j; ELEM_TYPE *a, *b; double copy_time; printf("This is a tester for JIT matcopy kernels!\n"); desc = libxsmm_mcopy_descriptor_init(&blob, sizeof(ELEM_TYPE), m, n, ldo, ldi, flags, prefetch, &unroll); a = (ELEM_TYPE*)malloc(n * ldi * sizeof(ELEM_TYPE)); b = (ELEM_TYPE*)malloc(n * ldo * sizeof(ELEM_TYPE)); for (i = 0; i < n; i++) { for (j = 0; j < m; j++) { a[j+ldi*i] = (ELEM_TYPE)rand(); if (0 != (LIBXSMM_MATCOPY_FLAG_ZERO_SOURCE & flags)) { b[j+ldo*i] = (ELEM_TYPE)rand(); } } } /* test dispatch call */ kernel = libxsmm_dispatch_mcopy(desc); if (kernel == 0) { printf("JIT error -> exit!!!!\n"); exit(EXIT_FAILURE); } /* let's call */ kernel(a, &ldi, b, &ldo, &a[128]); l_start = libxsmm_timer_tick(); for (i = 0; i < iters; ++i) { kernel(a, &ldi, b, &ldo, &a[128]); } l_end = libxsmm_timer_tick(); copy_time = libxsmm_timer_duration(l_start, l_end); for (i = 0; i < n; ++i) { for (j = 0; j < m; ++j) { if (0 != (LIBXSMM_MATCOPY_FLAG_ZERO_SOURCE & flags)) { if (LIBXSMM_NEQ(b[j+ldo*i], 0)) { printf("ERROR!!!\n"); i = n; error = 1; break; } } else if (LIBXSMM_NEQ(a[j+ldi*i], b[j+ldo*i])) { printf("ERROR!!!\n"); i = n; error = 1; break; } } } if (error == 0) { printf("CORRECT copy!!!!\n"); printf("Time taken is\t%.5f seconds\n", copy_time); } return EXIT_SUCCESS; }
LIBXSMM_XCOPY( LIBXSMM_NOOP, LIBXSMM_NOOP_ARGS, LIBXSMM_NOOP_ARGS, LIBXSMM_NOOP, LIBXSMM_MCOPY_KERNEL, LIBXSMM_MCOPY_CALL, xmatcopy, out, in, typesize, uldi, uldo, tm, tn, m0, m1, n0, n1); } else { /* no prefetch */ LIBXSMM_XCOPY( LIBXSMM_NOOP, LIBXSMM_NOOP_ARGS, LIBXSMM_NOOP_ARGS, LIBXSMM_NOOP, LIBXSMM_MCOPY_KERNEL, LIBXSMM_MCOPY_CALL_NOPF, xmatcopy, out, in, typesize, uldi, uldo, tm, tn, m0, m1, n0, n1); } } else { libxsmm_descriptor_blob blob; /* libxsmm_trans_jit: JIT'ted matrix-copy permitted? */ const libxsmm_mcopy_descriptor *const desc = (0 != (1 & libxsmm_trans_jit) ? libxsmm_mcopy_descriptor_init(&blob, typesize, tm, tn, uldo, uldi, 0 != in ? 0 : LIBXSMM_MATCOPY_FLAG_ZERO_SOURCE, iprefetch, NULL/*default unroll*/) : 0); xmatcopy = libxsmm_dispatch_mcopy(desc); assert(0 == tid && 1 == nthreads); if (0 != xmatcopy) { /* JIT-kernel available */ if (0 != prefetch && 0 != *prefetch) { /* prefetch */ LIBXSMM_MCOPY_CALL(xmatcopy, typesize, in, &uldi, out, &uldo); } else { /* no prefetch */ LIBXSMM_MCOPY_CALL_NOPF(xmatcopy, typesize, in, &uldi, out, &uldo); } } else { /* no JIT */ const unsigned int size = tm * tn, size2 = LIBXSMM_SQRT2(size); const unsigned int indx = LIBXSMM_MIN(size2 >> 10, 7); const unsigned int tidx = (4 < typesize ? 0 : 1); tm = LIBXSMM_MIN(tm, libxsmm_trans_tile[tidx][0/*M*/][indx]);