Example #1
0
LIBXSMM_API int libxsmm_matcopy_thread(void* out, const void* in, unsigned int typesize,
  libxsmm_blasint m, libxsmm_blasint n, libxsmm_blasint ldi, libxsmm_blasint ldo,
  const int* prefetch, int tid, int nthreads)
{
  int result = EXIT_SUCCESS;
  static int error_once = 0;
  assert(typesize <= 255);
  if (0 != out && out != in && 0 < typesize && 0 < m && 0 < n && m <= ldi && m <= ldo &&
    /* use (signed) integer types, but check sanity of input */
    0 <= tid && tid < nthreads)
  {
    const unsigned int uldi = (unsigned int)ldi, uldo = (unsigned int)ldo;
    unsigned int tm = (unsigned int)m, tn = (unsigned int)n;
    const int iprefetch = (0 == prefetch ? 0 : *prefetch);
    libxsmm_xmcopyfunction xmatcopy = 0;
    LIBXSMM_INIT /* before leading tile sizes */
    if (1 < nthreads) {
      libxsmm_blasint m0 = 0, n0 = 0, m1 = m, n1 = n;
      const unsigned int size = tm * tn, size2 = LIBXSMM_SQRT2(size);
      const unsigned int indx = LIBXSMM_MIN(size2 >> 10, 7);
      const unsigned int tidx = (4 < typesize ? 0 : 1);
      int mtasks;
      tm = LIBXSMM_MIN(tm, libxsmm_trans_tile[tidx][0/*M*/][indx]);
      tn = LIBXSMM_MIN(tn, libxsmm_trans_tile[tidx][1/*N*/][indx]);
      mtasks = ((1 < nthreads) ? ((int)((m + tm - 1) / tm)) : 1);
      if (1 < mtasks && nthreads <= mtasks) { /* only parallelized over M */
        const int mc = (mtasks + nthreads - 1) / nthreads * tm;
        m0 = tid * mc; m1 = LIBXSMM_MIN(m0 + mc, m);
      }
      else if (1 < nthreads) {
        const int ntasks = nthreads / mtasks, mtid = tid / ntasks, ntid = tid - mtid * ntasks;
        const libxsmm_blasint nc = (((n + ntasks - 1) / ntasks + tn - 1) / tn) * tn;
        const libxsmm_blasint mc = tm;
        m0 = mtid * mc; m1 = LIBXSMM_MIN(m0 + mc, m);
        n0 = ntid * nc; n1 = LIBXSMM_MIN(n0 + nc, n);
      }
      if (0 != (1 & libxsmm_trans_jit) /* libxsmm_trans_jit: JIT'ted matrix-copy permitted? */
        && (1 == typesize || 2 == typesize || 4 == typesize) /* TODO: support multiples */
        /* avoid code-dispatch if task does not need the kernel for inner tiles */
        && tm + m0 <= (unsigned int)(m1 - m0) && tn <= (unsigned int)(n1 - n0)
        /* TODO: investigate issue with Byte-element copy/MT on pre-AVX512 */
        && (1 < typesize || LIBXSMM_X86_AVX2 < libxsmm_target_archid))
      {
        libxsmm_descriptor_blob blob;
        const libxsmm_mcopy_descriptor *const desc = libxsmm_mcopy_descriptor_init(&blob,
          typesize, tm, tn, uldo, uldi, 0 != in ? 0 : LIBXSMM_MATCOPY_FLAG_ZERO_SOURCE,
          iprefetch, NULL/*default unroll*/);
        xmatcopy = libxsmm_dispatch_mcopy(desc);
      }
      if (0 != prefetch && 0 != *prefetch) { /* prefetch */
        LIBXSMM_XCOPY(
          LIBXSMM_NOOP, LIBXSMM_NOOP_ARGS, LIBXSMM_NOOP_ARGS, LIBXSMM_NOOP,
          LIBXSMM_MCOPY_KERNEL, LIBXSMM_MCOPY_CALL, xmatcopy, out, in,
          typesize, uldi, uldo, tm, tn, m0, m1, n0, n1);
      }
      else { /* no prefetch */
        LIBXSMM_XCOPY(
          LIBXSMM_NOOP, LIBXSMM_NOOP_ARGS, LIBXSMM_NOOP_ARGS, LIBXSMM_NOOP,
          LIBXSMM_MCOPY_KERNEL, LIBXSMM_MCOPY_CALL_NOPF, xmatcopy, out, in,
          typesize, uldi, uldo, tm, tn, m0, m1, n0, n1);
      }
    }
    else {
Example #2
0
/**
 * Tester for JIT'ted matcopy kernels: builds a matcopy descriptor from the
 * command line, dispatches the kernel, runs it (once untimed, then `iters`
 * timed repetitions), and validates the destination matrix.
 *
 * Positional arguments (all optional):
 *   argv[1] m         number of elements per column-run to copy (default: 16)
 *   argv[2] n         number of runs (default: m)
 *   argv[3] ldi       leading dimension of the source (at least m)
 *   argv[4] ldo       leading dimension of the destination (at least m)
 *   argv[5] unroll    unroll value handed to the descriptor (default: 1)
 *   argv[6] prefetch  prefetch value handed to the descriptor (default: 0)
 *   argv[7] zero      non-zero requests LIBXSMM_MATCOPY_FLAG_ZERO_SOURCE
 *   argv[8] iters     number of timed kernel invocations (default: 1)
 *
 * Returns EXIT_SUCCESS if the copy validated, EXIT_FAILURE on invalid shape,
 * allocation failure, JIT/dispatch failure, or a validation mismatch.
 */
int main(int argc, char* argv[])
{
  const int m = (1 < argc ? atoi(argv[1]) : 16);
  const int n = (2 < argc ? atoi(argv[2]) : m);
  const unsigned int ldi = (unsigned int)(LIBXSMM_MAX(3 < argc ? atoi(argv[3]) : 0, m));
  const unsigned int ldo = (unsigned int)(LIBXSMM_MAX(4 < argc ? atoi(argv[4]) : 0, m));
  const int unroll = (5 < argc ? atoi(argv[5]) : 1);
  const int prefetch = (6 < argc ? atoi(argv[6]) : 0);
  const int flags = ((7 < argc && 0 != atoi(argv[7])) ? LIBXSMM_MATCOPY_FLAG_ZERO_SOURCE : 0);
  const int iters = (8 < argc ? atoi(argv[8]) : 1);

  /* we should modify to test all data-types */
  const libxsmm_mcopy_descriptor* desc;
  libxsmm_xmcopyfunction kernel;
  libxsmm_descriptor_blob blob;
  libxsmm_timer_tickint l_start;
  libxsmm_timer_tickint l_end;
  size_t size_a, size_b, pf_offset;
  int error = 0, i, j;
  ELEM_TYPE *a = NULL, *b = NULL;
  double copy_time;

  printf("This is a tester for JIT matcopy kernels!\n");

  /* atoi yields 0 for garbage and may yield negatives; a non-positive shape
   * would otherwise turn into a huge unsigned leading dimension/allocation. */
  if (m <= 0 || n <= 0) {
    printf("Invalid shape: m and n must be positive!\n");
    return EXIT_FAILURE;
  }

  desc = libxsmm_mcopy_descriptor_init(&blob, sizeof(ELEM_TYPE),
    m, n, ldo, ldi, flags, prefetch, &unroll);

  /* widen before multiplying so that n * ld cannot wrap in 32-bit arithmetic */
  size_a = (size_t)n * ldi;
  size_b = (size_t)n * ldo;
  a = (ELEM_TYPE*)malloc(size_a * sizeof(ELEM_TYPE));
  b = (ELEM_TYPE*)malloc(size_b * sizeof(ELEM_TYPE));
  if (NULL == a || NULL == b) {
    printf("Allocation failed -> exit!!!!\n");
    free(a);
    free(b);
    return EXIT_FAILURE;
  }

  /* fill the source; in the zero-source case the destination is pre-filled
   * with non-zero data as well, such that the zeroing becomes observable */
  for (i = 0; i < n; i++) {
    for (j = 0; j < m; j++) {
      a[(size_t)i * ldi + j] = (ELEM_TYPE)rand();
      if (0 != (LIBXSMM_MATCOPY_FLAG_ZERO_SOURCE & flags)) {
        b[(size_t)i * ldo + j] = (ELEM_TYPE)rand();
      }
    }
  }

  /* test dispatch call */
  kernel = libxsmm_dispatch_mcopy(desc);
  if (kernel == 0) {
    printf("JIT error -> exit!!!!\n");
    free(a);
    free(b);
    return EXIT_FAILURE;
  }

  /* The last kernel argument is presumably a prefetch hint (TODO confirm);
   * the offset is clamped to the buffer size since forming a pointer beyond
   * one-past-the-end of the allocation is undefined behavior. */
  pf_offset = ((size_t)128 < size_a ? (size_t)128 : size_a);

  /* let's call */
  kernel(a, &ldi, b, &ldo, a + pf_offset);

  l_start = libxsmm_timer_tick();
  for (i = 0; i < iters; ++i) {
    kernel(a, &ldi, b, &ldo, a + pf_offset);
  }
  l_end = libxsmm_timer_tick();
  copy_time = libxsmm_timer_duration(l_start, l_end);

  /* validate: destination must be all-zero (zero-source) or equal the source;
   * stop at the first mismatch so that the error is reported only once */
  for (i = 0; i < n && 0 == error; ++i) {
    for (j = 0; j < m; ++j) {
      const size_t ia = (size_t)i * ldi + j, ib = (size_t)i * ldo + j;
      if (0 != (LIBXSMM_MATCOPY_FLAG_ZERO_SOURCE & flags)) {
        if (LIBXSMM_NEQ(b[ib], 0)) {
          printf("ERROR!!!\n");
          error = 1;
          break;
        }
      }
      else if (LIBXSMM_NEQ(a[ia], b[ib])) {
        printf("ERROR!!!\n");
        error = 1;
        break;
      }
    }
  }

  if (error == 0) {
    printf("CORRECT copy!!!!\n");
    printf("Time taken is\t%.5f seconds\n", copy_time);
  }

  free(a);
  free(b);

  /* propagate a validation failure to the caller (scripts, CI) */
  return (0 == error ? EXIT_SUCCESS : EXIT_FAILURE);
}
Example #3
0
       LIBXSMM_MCOPY_KERNEL, LIBXSMM_MCOPY_CALL, xmatcopy, out, in,
       typesize, uldi, uldo, tm, tn, m0, m1, n0, n1);
   }
   else { /* no prefetch */
     LIBXSMM_XCOPY(
       LIBXSMM_NOOP, LIBXSMM_NOOP_ARGS, LIBXSMM_NOOP_ARGS, LIBXSMM_NOOP,
       LIBXSMM_MCOPY_KERNEL, LIBXSMM_MCOPY_CALL_NOPF, xmatcopy, out, in,
       typesize, uldi, uldo, tm, tn, m0, m1, n0, n1);
   }
 }
 else {
   libxsmm_descriptor_blob blob;
   /* libxsmm_trans_jit: JIT'ted matrix-copy permitted? */
   const libxsmm_mcopy_descriptor *const desc = (0 != (1 & libxsmm_trans_jit) ? libxsmm_mcopy_descriptor_init(&blob,
     typesize, tm, tn, uldo, uldi, 0 != in ? 0 : LIBXSMM_MATCOPY_FLAG_ZERO_SOURCE, iprefetch, NULL/*default unroll*/) : 0);
   xmatcopy = libxsmm_dispatch_mcopy(desc);
   assert(0 == tid && 1 == nthreads);
   if (0 != xmatcopy) { /* JIT-kernel available */
     if (0 != prefetch && 0 != *prefetch) { /* prefetch */
       LIBXSMM_MCOPY_CALL(xmatcopy, typesize, in, &uldi, out, &uldo);
     }
     else { /* no prefetch */
       LIBXSMM_MCOPY_CALL_NOPF(xmatcopy, typesize, in, &uldi, out, &uldo);
     }
   }
   else { /* no JIT */
     const unsigned int size = tm * tn, size2 = LIBXSMM_SQRT2(size);
     const unsigned int indx = LIBXSMM_MIN(size2 >> 10, 7);
     const unsigned int tidx = (4 < typesize ? 0 : 1);
     tm = LIBXSMM_MIN(tm, libxsmm_trans_tile[tidx][0/*M*/][indx]);
     tn = LIBXSMM_MIN(tn, libxsmm_trans_tile[tidx][1/*N*/][indx]);