Esempio n. 1
0
/*
 * Cut the N extent of a job into contiguous slices -- one per worker, never
 * more than `nthreads` -- and run `function` on every slice via exec_blas().
 *
 *   mode, arg, range_m, function  stored unchanged in every queue entry
 *   range_n   optional {start, end} pair limiting the N extent; when NULL
 *             the extent is [0, arg->n)
 *   sa, sb    attached to the first queue entry only; every other entry
 *             carries NULL for both
 *   nthreads  upper bound on the number of slices
 *
 * Always returns 0. Nothing is dispatched when the extent is empty.
 */
int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int (*function)(), void *sa, void *sb, BLASLONG nthreads) {

  blas_queue_t queue[MAX_CPU_NUMBER];
  BLASLONG     bounds[MAX_CPU_NUMBER + 1];   /* bounds[s] .. bounds[s + 1] delimit slice s */
  BLASLONG     remaining, chunk, slot;

  if (range_n) {
    bounds[0] = range_n[0];
    remaining = range_n[1] - range_n[0];
  } else {
    bounds[0] = 0;
    remaining = arg -> n;
  }

  for (slot = 0; remaining > 0; slot ++) {

    /* Share of what is left for this worker (presumably a rounded-up
       division -- blas_quickdivide is defined elsewhere). */
    chunk = blas_quickdivide(remaining + nthreads - slot - 1, nthreads - slot);

    remaining -= chunk;
    if (remaining < 0) chunk += remaining;   /* trim the final slice to the extent */

    bounds[slot + 1] = bounds[slot] + chunk;

    queue[slot].mode    = mode;
    queue[slot].routine = function;
    queue[slot].args    = arg;
    queue[slot].range_m = range_m;
    queue[slot].range_n = &bounds[slot];
    queue[slot].sa      = NULL;
    queue[slot].sb      = NULL;
    queue[slot].next    = &queue[slot + 1];
  }

  if (slot == 0) return 0;

  /* Only the head of the chain receives the caller's work buffers. */
  queue[0].sa = sa;
  queue[0].sb = sb;

  queue[slot - 1].next = NULL;

  exec_blas(slot, queue);

  return 0;
}
Esempio n. 2
0
/*
 * Queue _touch_memory once per BLAS thread and run the queue through
 * exec_blas(). `buffer` is attached to the first entry as its sa pointer.
 *
 * The number of entries is blas_num_threads, capped at MAX_CPU_NUMBER so the
 * on-stack queue cannot be overrun. When no entry was built the function
 * returns without dispatching instead of writing to queue[-1].
 */
static void _init_thread_memory(void *buffer) {

  blas_queue_t queue[MAX_CPU_NUMBER];
  int num_cpu;

  for (num_cpu = 0; num_cpu < blas_num_threads && num_cpu < MAX_CPU_NUMBER; num_cpu++) {

    blas_queue_init(&queue[num_cpu]);
    queue[num_cpu].mode    = BLAS_DOUBLE | BLAS_REAL;
    queue[num_cpu].routine = &_touch_memory;
    queue[num_cpu].args    = NULL;
    queue[num_cpu].next    = &queue[num_cpu + 1];
  }

  /* No thread configured: there is no last entry to terminate and nothing to run. */
  if (num_cpu == 0) return;

  queue[num_cpu - 1].next = NULL;
  queue[0].sa = buffer;

  exec_blas(num_cpu, queue);

}
Esempio n. 3
0
/*
 * Threaded driver: cuts the index range [0, m) into at most `nthreads`
 * chunks, queues syr_kernel once per chunk and runs the queue through
 * exec_blas().
 *
 * x, y and a reach the kernel as args.a, args.b and args.c; incx and incy
 * travel in args.lda and args.ldb. `buffer` is given to the first queue
 * entry as its sb pointer. Always returns 0.
 *
 * NOTE(review): the conditional that chooses between the two signatures
 * below opens outside this excerpt; alpha by value vs. by pointer lines up
 * with the COMPLEX switch used for args.alpha further down -- confirm.
 */
int CNAME(BLASLONG m, FLOAT  alpha, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *a, FLOAT *buffer, int nthreads){
#else
int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *a, FLOAT *buffer, int nthreads){
#endif

  blas_arg_t args;
  blas_queue_t queue[MAX_CPU_NUMBER];
  BLASLONG range_m[MAX_CPU_NUMBER + 1];

  BLASLONG width, i, num_cpu;

  double dnum;
  /* Chunk widths are rounded up to a multiple of mask + 1 = 8. */
  int mask = 7;

  /* `mode` (precision flag | real/complex flag) is only declared in SMP builds. */
#ifdef SMP
#ifndef COMPLEX
#ifdef XDOUBLE
  int mode  =  BLAS_XDOUBLE | BLAS_REAL;
#elif defined(DOUBLE)
  int mode  =  BLAS_DOUBLE  | BLAS_REAL;
#else
  int mode  =  BLAS_SINGLE  | BLAS_REAL;
#endif
#else
#ifdef XDOUBLE
  int mode  =  BLAS_XDOUBLE | BLAS_COMPLEX;
#elif defined(DOUBLE)
  int mode  =  BLAS_DOUBLE  | BLAS_COMPLEX;
#else
  int mode  =  BLAS_SINGLE  | BLAS_COMPLEX;
#endif
#endif
#endif

  args.m = m;

  args.a = (void *)x;
  args.b = (void *)y;
  args.c = (void *)a;

  args.lda = incx;
  args.ldb = incy;
  /* Real builds take alpha by value, so its address is passed on;
     complex builds already receive a pointer. */
#ifndef COMPLEX
  args.alpha = (void *)&alpha;
#else
  args.alpha = (void *)alpha;
#endif

  /* m^2 / nthreads: the per-thread share used by the width formula below. */
  dnum = (double)m * (double)m / (double)nthreads;
  num_cpu  = 0;

#ifndef LOWER

  /* Boundaries are stored from the top of range_m downwards: the first
     chunk ends at m and each later chunk ends where the previous starts. */
  range_m[MAX_CPU_NUMBER] = m;
  i          = 0;

  while (i < m){

    if (nthreads - num_cpu > 1) {

      /* width = di - sqrt(di^2 - dnum), truncated and rounded up to a
         multiple of 8; when the root would be imaginary take the rest. */
      double di = (double)(m - i);
      if (di * di - dnum > 0) {
	width = ((BLASLONG)(-sqrt(di * di - dnum) + di) + mask) & ~mask;
      } else {
	width = m - i;
      }

      /* At least 16 wide, but never past the end of the range. */
      if (width < 16) width = 16;
      if (width > m - i) width = m - i;

    } else {
      /* Last available thread takes everything that is left. */
      width = m - i;
    }

    range_m[MAX_CPU_NUMBER - num_cpu - 1] = range_m[MAX_CPU_NUMBER - num_cpu] - width;

    queue[num_cpu].mode    = mode;
    queue[num_cpu].routine = syr_kernel;
    queue[num_cpu].args    = &args;
    /* Points at this chunk's {lower, upper} boundary pair. */
    queue[num_cpu].range_m = &range_m[MAX_CPU_NUMBER - num_cpu - 1];
    queue[num_cpu].range_n = NULL;
    queue[num_cpu].sa      = NULL;
    queue[num_cpu].sb      = NULL;
    queue[num_cpu].next    = &queue[num_cpu + 1];

    num_cpu ++;
    i += width;
  }

#else

  /* Boundaries are stored in ascending order starting at 0. */
  range_m[0] = 0;
  i          = 0;

  while (i < m){

    if (nthreads - num_cpu > 1) {

      /* Same width formula as the non-LOWER build. */
      double di = (double)(m - i);
      if (di * di - dnum > 0) {
	width = ((BLASLONG)(-sqrt(di * di - dnum) + di) + mask) & ~mask;
      } else {
	width = m - i;
      }

      if (width < 16) width = 16;
      if (width > m - i) width = m - i;

    } else {
      width = m - i;
    }

    range_m[num_cpu + 1] = range_m[num_cpu] + width;

    queue[num_cpu].mode    = mode;
    queue[num_cpu].routine = syr_kernel;
    queue[num_cpu].args    = &args;
    queue[num_cpu].range_m = &range_m[num_cpu];
    queue[num_cpu].range_n = NULL;
    queue[num_cpu].sa      = NULL;
    queue[num_cpu].sb      = NULL;
    queue[num_cpu].next    = &queue[num_cpu + 1];

    num_cpu ++;
    i += width;
  }

#endif

  /* Skipped entirely when m <= 0 produced no chunk. */
  if (num_cpu) {
    queue[0].sa = NULL;
    queue[0].sb = buffer;

    /* Terminate the chain at the last entry actually filled in. */
    queue[num_cpu - 1].next = NULL;

    exec_blas(num_cpu, queue);
  }

  return 0;
}
Esempio n. 4
0
/*
 * Threaded GEMM driver: splits the M extent into at most args->nthreads
 * slices (one queue entry running inner_thread per slice) and then walks
 * the N extent in panels of GEMM_R * nthreads columns, running the whole
 * queue through exec_blas() once per panel.
 *
 *   args      problem description; copied field by field into a private
 *             blas_arg_t whose `common` member points at the local job array
 *   range_m   optional {start, end} pair limiting M (NULL: [0, args->m))
 *   range_n   optional {start, end} pair limiting N (NULL: [0, args->n))
 *   sa, sb    work buffers attached to the first queue entry only
 *   mypos     not used by this function
 *
 * Returns 0. When the M extent is empty no work is queued and the function
 * returns immediately; without that guard the `queue[num_cpu_m - 1]` store
 * would address queue[-1].
 */
static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
		       *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos){

  blas_arg_t newarg;

  job_t          job[MAX_CPU_NUMBER];
  blas_queue_t queue[MAX_CPU_NUMBER];

  BLASLONG range_M[MAX_CPU_NUMBER + 1];
  BLASLONG range_N[MAX_CPU_NUMBER + 1];

  BLASLONG num_cpu_m, num_cpu_n;

  BLASLONG nthreads = args -> nthreads;

  BLASLONG width, i, j, k, js;
  BLASLONG m, n, n_from, n_to;
  int  mode;

#ifndef COMPLEX
#ifdef XDOUBLE
  mode  =  BLAS_XDOUBLE | BLAS_REAL | BLAS_NODE;
#elif defined(DOUBLE)
  mode  =  BLAS_DOUBLE  | BLAS_REAL | BLAS_NODE;
#else
  mode  =  BLAS_SINGLE  | BLAS_REAL | BLAS_NODE;
#endif  
#else
#ifdef XDOUBLE
  mode  =  BLAS_XDOUBLE | BLAS_COMPLEX | BLAS_NODE;
#elif defined(DOUBLE)
  mode  =  BLAS_DOUBLE  | BLAS_COMPLEX | BLAS_NODE;
#else
  mode  =  BLAS_SINGLE  | BLAS_COMPLEX | BLAS_NODE;
#endif  
#endif

  newarg.m        = args -> m;
  newarg.n        = args -> n;
  newarg.k        = args -> k;
  newarg.a        = args -> a;
  newarg.b        = args -> b;
  newarg.c        = args -> c;
  newarg.lda      = args -> lda;
  newarg.ldb      = args -> ldb;
  newarg.ldc      = args -> ldc;
  newarg.alpha    = args -> alpha;
  newarg.beta     = args -> beta;
  newarg.nthreads = args -> nthreads;
  /* The job array is shared with the workers through `common`. */
  newarg.common   = (void *)job;
   
#ifdef PARAMTEST
  newarg.gemm_p  = args -> gemm_p;
  newarg.gemm_q  = args -> gemm_q;
  newarg.gemm_r  = args -> gemm_r;
#endif

  if (!range_m) {
    range_M[0] = 0;
    m          = args -> m;
  } else {
    range_M[0] = range_m[0];
    m          = range_m[1] - range_m[0];
  }

  num_cpu_m  = 0;

  /* Build the M boundary table: range_M[t] .. range_M[t + 1] is slice t. */
  while (m > 0){
    
    width  = blas_quickdivide(m + nthreads - num_cpu_m - 1, nthreads - num_cpu_m);

    m -= width;
    if (m < 0) width = width + m;

    range_M[num_cpu_m + 1] = range_M[num_cpu_m] + width;

    num_cpu_m ++;
  }

  /* Empty M extent: nothing to schedule, and no last queue entry exists. */
  if (num_cpu_m == 0) return 0;

  for (i = 0; i < num_cpu_m; i++) {
    queue[i].mode    = mode;
    queue[i].routine = inner_thread;
    queue[i].args    = &newarg;
    queue[i].range_m = &range_M[i];
    /* Every worker sees the full N boundary table, rebuilt per panel below. */
    queue[i].range_n = &range_N[0];
    queue[i].sa      = NULL;
    queue[i].sb      = NULL;
    queue[i].next    = &queue[i + 1];
  }
  
  queue[0].sa = sa;
  queue[0].sb = sb;
  
  if (!range_n) {
    n_from = 0;
    n_to   = args -> n;
  } else {
    n_from = range_n[0];
    n_to   = range_n[1];
  }

  for(js = n_from; js < n_to; js += GEMM_R * nthreads){
    n = n_to - js;
    if (n > GEMM_R * nthreads) n = GEMM_R * nthreads;
    
    range_N[0] = js;

    num_cpu_n  = 0;

    /* Split this panel's columns into the N boundary table. */
    while (n > 0){
      
      width  = blas_quickdivide(n + nthreads - num_cpu_n - 1, nthreads - num_cpu_n);
      
      n -= width;
      if (n < 0) width = width + n;
      
      range_N[num_cpu_n + 1] = range_N[num_cpu_n] + width;
      
      num_cpu_n ++;
    }
    
    /* Reset the per-pair flags in the job array before each dispatch. */
    for (j = 0; j < num_cpu_m; j++) {
      for (i = 0; i < num_cpu_m; i++) {
	for (k = 0; k < DIVIDE_RATE; k++) {
	  job[j].working[i][CACHE_LINE_SIZE * k] = 0;
	}
      }
    }
    
    queue[num_cpu_m - 1].next = NULL;

    exec_blas(num_cpu_m, queue);
  }

  return 0;
}
Esempio n. 5
0
/*
 * Threaded driver: cuts [0, m) into at most `nthreads` chunks, queues
 * tpmv_kernel once per chunk, runs the queue through exec_blas(), and
 * finally copies m elements from `buffer` back into x (stride incx).
 *
 * a, x and buffer reach the kernel as args.a, args.b and args.c; incx is
 * stored in both args.ldb and args.ldc. Always returns 0.
 *
 * NOTE(review): the conditional that wraps the two (identical) signatures
 * below opens outside this excerpt.
 */
int CNAME(BLASLONG m, FLOAT *a, FLOAT *x, BLASLONG incx, FLOAT *buffer, int nthreads){
#else
int CNAME(BLASLONG m, FLOAT *a, FLOAT *x, BLASLONG incx, FLOAT *buffer, int nthreads){
#endif

  blas_arg_t args;
  blas_queue_t queue[MAX_CPU_NUMBER];
  BLASLONG range_m[MAX_CPU_NUMBER + 1];
  BLASLONG range_n[MAX_CPU_NUMBER];

  BLASLONG width, i, num_cpu;

  double dnum;
  /* Chunk widths are rounded up to a multiple of mask + 1 = 8. */
  int mask = 7;

  /* `mode` (precision flag | real/complex flag) is only declared in SMP builds. */
#ifdef SMP
#ifndef COMPLEX
#ifdef XDOUBLE
  int mode  =  BLAS_XDOUBLE | BLAS_REAL;
#elif defined(DOUBLE)
  int mode  =  BLAS_DOUBLE  | BLAS_REAL;
#else
  int mode  =  BLAS_SINGLE  | BLAS_REAL;
#endif
#else
#ifdef XDOUBLE
  int mode  =  BLAS_XDOUBLE | BLAS_COMPLEX;
#elif defined(DOUBLE)
  int mode  =  BLAS_DOUBLE  | BLAS_COMPLEX;
#else
  int mode  =  BLAS_SINGLE  | BLAS_COMPLEX;
#endif
#endif
#endif

  args.m = m;

  args.a = (void *)a;
  args.b = (void *)x;
  args.c = (void *)(buffer);

  args.ldb = incx;
  args.ldc = incx;

  /* m^2 / nthreads: the per-thread share used by the width formula below. */
  dnum = (double)m * (double)m / (double)nthreads;
  num_cpu  = 0;

#ifndef LOWER

  /* Boundaries are stored from the top of range_m downwards: the first
     chunk ends at m and each later chunk ends where the previous starts. */
  range_m[MAX_CPU_NUMBER] = m;
  i          = 0;

  while (i < m){

    if (nthreads - num_cpu > 1) {

      /* width = di - sqrt(di^2 - dnum), truncated and rounded up to a
         multiple of 8; when the root would be imaginary take the rest. */
      double di = (double)(m - i);
      if (di * di - dnum > 0) {
	width = ((BLASLONG)(-sqrt(di * di - dnum) + di) + mask) & ~mask;
      } else {
	width = m - i;
      }

      /* At least 16 wide, but never past the end of the range. */
      if (width < 16) width = 16;
      if (width > m - i) width = m - i;

    } else {
      /* Last available thread takes everything that is left. */
      width = m - i;
    }

    range_m[MAX_CPU_NUMBER - num_cpu - 1] = range_m[MAX_CPU_NUMBER - num_cpu] - width;
    /* Per-chunk element offset: (m rounded up to a multiple of 16) + 16,
       times the chunk index. The reduction below uses it to index buffer. */
    range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16);

    queue[num_cpu].mode    = mode;
    queue[num_cpu].routine = tpmv_kernel;
    queue[num_cpu].args    = &args;
    /* Points at this chunk's {lower, upper} boundary pair. */
    queue[num_cpu].range_m = &range_m[MAX_CPU_NUMBER - num_cpu - 1];
    queue[num_cpu].range_n = &range_n[num_cpu];
    queue[num_cpu].sa      = NULL;
    queue[num_cpu].sb      = NULL;
    queue[num_cpu].next    = &queue[num_cpu + 1];

    num_cpu ++;
    i += width;
  }

#else

  /* Boundaries are stored in ascending order starting at 0. */
  range_m[0] = 0;
  i          = 0;

  while (i < m){

    if (nthreads - num_cpu > 1) {

      /* Same width formula as the non-LOWER build. */
      double di = (double)(m - i);
      if (di * di - dnum > 0) {
	width = ((BLASLONG)(-sqrt(di * di - dnum) + di) + mask) & ~mask;
      } else {
	width = m - i;
      }

      if (width < 16) width = 16;
      if (width > m - i) width = m - i;

    } else {
      width = m - i;
    }

    range_m[num_cpu + 1] = range_m[num_cpu] + width;
    range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16);

    queue[num_cpu].mode    = mode;
    queue[num_cpu].routine = tpmv_kernel;
    queue[num_cpu].args    = &args;
    queue[num_cpu].range_m = &range_m[num_cpu];
    queue[num_cpu].range_n = &range_n[num_cpu];
    queue[num_cpu].sa      = NULL;
    queue[num_cpu].sb      = NULL;
    queue[num_cpu].next    = &queue[num_cpu + 1];

    num_cpu ++;
    i += width;
  }

#endif

  /* Skipped entirely when m <= 0 produced no chunk. */
  if (num_cpu) {
    queue[0].sa = NULL;
    /* sb for the first entry starts num_cpu strides of
       ((m rounded up to a multiple of 256) + 16) * COMPSIZE into buffer. */
    queue[0].sb = buffer + num_cpu * (((m + 255) & ~255) + 16) * COMPSIZE;

    queue[num_cpu - 1].next = NULL;

    exec_blas(num_cpu, queue);
  }

  /* Non-TRANS builds only: fold the regions at the per-chunk offsets
     (chunks 1 .. num_cpu - 1) into the front of buffer with AXPYU_K,
     called with ONE (and ZERO in complex builds) as the scalar. */
#ifndef TRANS
  for (i = 1; i < num_cpu; i ++) {

#ifndef LOWER

    /* Length is the upper boundary of chunk i, i.e. elements [0, upper). */
    AXPYU_K(range_m[MAX_CPU_NUMBER - i], 0, 0, ONE,
#ifdef COMPLEX
	    ZERO,
#endif
	    buffer + range_n[i] * COMPSIZE, 1, buffer, 1, NULL, 0);

#else

    /* Length runs from the lower boundary of chunk i to m. */
    AXPYU_K(m - range_m[i], 0, 0, ONE,
#ifdef COMPLEX
	    ZERO,
#endif
	    buffer + (range_n[i] + range_m[i]) * COMPSIZE, 1, buffer + range_m[i] * COMPSIZE, 1, NULL, 0);

#endif

  }
#endif

  /* Write the result held at the front of buffer back to x. */
  COPY_K(m, buffer, 1, x, incx);

  return 0;
}
Esempio n. 6
0
/*
 * Split the m extent of an operation into at most `nthreads` contiguous
 * pieces and run `function` on each piece via exec_blas().
 *
 * Every piece gets its own blas_arg_t: m is the piece length, n, k, the
 * leading dimensions, alpha and c are passed through unchanged, while a and
 * b are advanced past the preceding pieces. The a pointer moves by
 * length * lda elements per piece; b moves by length * ldb elements, or by
 * length elements when BLAS_TRANSB_T is set in mode. Element counts are
 * turned into byte offsets by a left shift derived from the precision and
 * complex bits of mode.
 *
 * BLAS_LEGACY is OR-ed into the mode stored in each queue entry.
 * Always returns 0; nothing is dispatched when m <= 0.
 */
int blas_level1_thread(int mode, BLASLONG m, BLASLONG n, BLASLONG k, void *alpha,
		       void *a, BLASLONG lda,
		       void *b, BLASLONG ldb,
		       void *c, BLASLONG ldc, int (*function)(), int nthreads){

  blas_queue_t queue[MAX_CPU_NUMBER];
  blas_arg_t   args [MAX_CPU_NUMBER];

  BLASLONG remaining, chunk, a_bytes, b_bytes, t;
  int slot;

  /* Shift that converts an element count into bytes (presumably log2 of
     the element size -- depends on how BLAS_PREC is encoded). */
  const int shift = (mode & BLAS_PREC) + ((mode & BLAS_COMPLEX) != 0) + 2;

  mode |= BLAS_LEGACY;

  for (t = 0; t < nthreads; t++) {
    blas_queue_init(&queue[t]);
  }

  remaining = m;

  for (slot = 0; remaining > 0; slot ++) {

    blas_arg_t   *part = &args[slot];
    blas_queue_t *task = &queue[slot];

    /* Share of the remaining extent for this worker. */
    chunk = blas_quickdivide(remaining + nthreads - slot - 1,
			     nthreads - slot);

    remaining -= chunk;
    if (remaining < 0) chunk += remaining;   /* trim the final piece */

    a_bytes = (chunk * lda) << shift;
    b_bytes = ((mode & BLAS_TRANSB_T) ? chunk : chunk * ldb) << shift;

    part -> m     = chunk;
    part -> n     = n;
    part -> k     = k;
    part -> a     = (void *)a;
    part -> b     = (void *)b;
    part -> c     = (void *)c;
    part -> lda   = lda;
    part -> ldb   = ldb;
    part -> ldc   = ldc;
    part -> alpha = alpha;

    task -> mode    = mode;
    task -> routine = function;
    task -> args    = part;
    task -> next    = &queue[slot + 1];

    /* Step the operand pointers past the piece just queued. */
    a = (void *)((BLASULONG)a + a_bytes);
    b = (void *)((BLASULONG)b + b_bytes);
  }

  if (slot == 0) return 0;

  queue[slot - 1].next = NULL;

  exec_blas(slot, queue);

  return 0;
}
Esempio n. 7
0
/*
 * Threaded driver: cuts [0, n) into at most `nthreads` chunks, queues
 * sbmv_kernel once per chunk, runs the queue through exec_blas(), folds the
 * per-thread results into `buffer` and finally applies alpha while adding
 * buffer into y (stride incy) with AXPYU_K.
 *
 * a, x and buffer reach the kernel as args.a, args.b and args.c; lda, incx
 * and incy travel in args.lda, args.ldb and args.ldc. Always returns 0.
 *
 * Two partitioning schemes are used: the square-root based widths when
 * n < 2 * k, an even split (minimum width 4) otherwise.
 *
 * NOTE(review): the conditional that chooses between the two signatures
 * below opens outside this excerpt; alpha by value vs. by pointer lines up
 * with the COMPLEX switch used in the final AXPYU_K call -- confirm.
 */
int CNAME(BLASLONG n, BLASLONG k, FLOAT  alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *buffer, int nthreads){
#else
int CNAME(BLASLONG n, BLASLONG k, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *buffer, int nthreads){
#endif

  blas_arg_t args;
  blas_queue_t queue[MAX_CPU_NUMBER];
  BLASLONG range_m[MAX_CPU_NUMBER + 1];
  BLASLONG range_n[MAX_CPU_NUMBER];

  BLASLONG width, i, num_cpu;
  double dnum;
  /* Chunk widths are rounded up to a multiple of mask + 1 = 8. */
  int mask = 7;

  /* `mode` (precision flag | real/complex flag) is only declared in SMP builds. */
#ifdef SMP
#ifndef COMPLEX
#ifdef XDOUBLE
  int mode  =  BLAS_XDOUBLE | BLAS_REAL;
#elif defined(DOUBLE)
  int mode  =  BLAS_DOUBLE  | BLAS_REAL;
#else
  int mode  =  BLAS_SINGLE  | BLAS_REAL;
#endif  
#else
#ifdef XDOUBLE
  int mode  =  BLAS_XDOUBLE | BLAS_COMPLEX;
#elif defined(DOUBLE)
  int mode  =  BLAS_DOUBLE  | BLAS_COMPLEX;
#else
  int mode  =  BLAS_SINGLE  | BLAS_COMPLEX;
#endif  
#endif
#endif

  args.n = n;
  args.k = k;
  
  args.a = (void *)a;
  args.b = (void *)x;
  args.c = (void *)buffer;
    
  args.lda = lda;
  args.ldb = incx;
  args.ldc = incy;

  /* n^2 / nthreads: the per-thread share used by the width formula below. */
  dnum = (double)n * (double)n / (double)nthreads;
  num_cpu  = 0;
  
  if (n < 2 * k) {

#ifndef LOWER

    /* Boundaries are stored from the top of range_m downwards: the first
       chunk ends at n and each later chunk ends where the previous starts. */
    range_m[MAX_CPU_NUMBER] = n;
    i          = 0;
    
    while (i < n){
      
      if (nthreads - num_cpu > 1) {
	
	/* width = di - sqrt(di^2 - dnum), truncated and rounded up to a
	   multiple of 8; when the root would be imaginary take the rest. */
	double di = (double)(n - i);
	if (di * di - dnum > 0) {
	  width = ((BLASLONG)(-sqrt(di * di - dnum) + di) + mask) & ~mask;
	} else {
	  width = n - i;
	}
	
	/* At least 16 wide, but never past the end of the range. */
	if (width < 16) width = 16;
	if (width > n - i) width = n - i;
	
      } else {
	/* Last available thread takes everything that is left. */
	width = n - i;
      }
      
      range_m[MAX_CPU_NUMBER - num_cpu - 1] = range_m[MAX_CPU_NUMBER - num_cpu] - width;
      /* Per-chunk element offset: (n rounded up to a multiple of 16) + 16,
         times the chunk index. */
      range_n[num_cpu] = num_cpu * (((n + 15) & ~15) + 16);
      
      queue[num_cpu].mode    = mode;
      queue[num_cpu].routine = sbmv_kernel;
      queue[num_cpu].args    = &args;
      /* Points at this chunk's {lower, upper} boundary pair. */
      queue[num_cpu].range_m = &range_m[MAX_CPU_NUMBER - num_cpu - 1];
      queue[num_cpu].range_n = &range_n[num_cpu];
      queue[num_cpu].sa      = NULL;
      queue[num_cpu].sb      = NULL;
      queue[num_cpu].next    = &queue[num_cpu + 1];
      
      num_cpu ++;
      i += width;
    }
    
#else
    
    /* Boundaries are stored in ascending order starting at 0. */
    range_m[0] = 0;
    i          = 0;
    
    while (i < n){
      
      if (nthreads - num_cpu > 1) {
	
	/* Same width formula as the non-LOWER build. */
	double di = (double)(n - i);
	if (di * di - dnum > 0) {
	  width = ((BLASLONG)(-sqrt(di * di - dnum) + di) + mask) & ~mask;
	} else {
	  width = n - i;
	}
	
	if (width < 16) width = 16;
	if (width > n - i) width = n - i;
	
      } else {
	width = n - i;
      }
      
      range_m[num_cpu + 1] = range_m[num_cpu] + width;
      range_n[num_cpu] = num_cpu * (((n + 15) & ~15) + 16);
      
      queue[num_cpu].mode    = mode;
      queue[num_cpu].routine = sbmv_kernel;
      queue[num_cpu].args    = &args;
      queue[num_cpu].range_m = &range_m[num_cpu];
      queue[num_cpu].range_n = &range_n[num_cpu];
      queue[num_cpu].sa      = NULL;
      queue[num_cpu].sb      = NULL;
      queue[num_cpu].next    = &queue[num_cpu + 1];
      
      num_cpu ++;
      i += width;
    }
    
#endif
    
  } else {
    
    /* n >= 2 * k: even split in ascending order, regardless of LOWER. */
    range_m[0] = 0;
    i          = n;
    
    while (i > 0){
      
      width  = blas_quickdivide(i + nthreads - num_cpu - 1, nthreads - num_cpu);
      
      /* At least 4 wide, but never more than what is left. */
      if (width < 4) width = 4;
      if (i < width) width = i;
      
      range_m[num_cpu + 1] = range_m[num_cpu] + width;
      
      /* Here the per-chunk offset has no extra 16-element padding. */
      range_n[num_cpu] = num_cpu * ((n + 15) & ~15);
      
      queue[num_cpu].mode    = mode;
      queue[num_cpu].routine = sbmv_kernel;
      queue[num_cpu].args    = &args;
      queue[num_cpu].range_m = &range_m[num_cpu];
      queue[num_cpu].range_n = &range_n[num_cpu];
      queue[num_cpu].sa      = NULL;
      queue[num_cpu].sb      = NULL;
      queue[num_cpu].next    = &queue[num_cpu + 1];
      
      num_cpu ++;
      i -= width;
    }
  }

  /* Skipped entirely when n <= 0 produced no chunk. */
  if (num_cpu) {
    queue[0].sa = NULL;
    queue[0].sb = buffer;
    queue[num_cpu - 1].next = NULL;
  
    exec_blas(num_cpu, queue);
  }
  
  /* Fold the results of chunks 1 .. num_cpu - 1 into buffer.
     NOTE(review): every entry's sb is set to NULL when it is queued and
     only queue[0].sb is assigned afterwards, so this read relies on
     exec_blas() or the kernel storing a pointer into queue[i].sb; the
     per-chunk offsets kept in range_n are not used here -- confirm. */
  for (i = 1; i < num_cpu; i ++) {
    AXPYU_K(n, 0, 0,
#ifndef COMPLEX
	    ONE,
#else
	    ONE, ZERO,
#endif
	    (FLOAT*)(queue[i].sb), 1, buffer, 1, NULL, 0);
  }

  /* Final step: AXPYU_K with alpha as the scalar, buffer as the source and
     y (stride incy) as the destination. */
  AXPYU_K(n, 0, 0,
#ifndef COMPLEX
	  alpha,
#else
	  alpha[0], alpha[1],
#endif
	  buffer, 1, y, incy, NULL, 0);
  
  return 0;
}
Esempio n. 8
0
/*
 * Split a job along both M and N and run `function` on every (M slice,
 * N slice) pair through exec_blas().
 *
 * The number of slices per dimension comes from divide_rule[nthreads]:
 * element [0] bounds the M slices, element [1] the N slices.
 *
 *   mode, arg, function  stored unchanged in every queue entry
 *   range_m   optional {start, end} pair limiting M (NULL: [0, arg->m))
 *   range_n   optional {start, end} pair limiting N (NULL: [0, arg->n))
 *   sa, sb    attached to the first queue entry only; every other entry
 *             carries NULL for both
 *   nthreads  index into divide_rule
 *
 * Always returns 0. Nothing is dispatched when either extent is empty.
 */
int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int (*function)(), void *sa, void *sb, BLASLONG nthreads) {

  blas_queue_t queue[MAX_CPU_NUMBER];

  BLASLONG range_M[MAX_CPU_NUMBER + 1], range_N[MAX_CPU_NUMBER + 1];
  BLASLONG procs, num_cpu_m, num_cpu_n;

  BLASLONG width, i, j;
  BLASLONG divM, divN;

  divM = divide_rule[nthreads][0];
  divN = divide_rule[nthreads][1];

  if (!range_m) {
    range_M[0] = 0;
    i          = arg -> m;
  } else {
    /* Take the bounds from the caller's range_m; the local range_M table
       is still uninitialised at this point. */
    range_M[0] = range_m[0];
    i          = range_m[1] - range_m[0];
  }

  num_cpu_m  = 0;

  /* Build the M boundary table: range_M[t] .. range_M[t + 1] is slice t. */
  while (i > 0){
    
    width  = blas_quickdivide(i + divM - num_cpu_m - 1, divM - num_cpu_m);

    i -= width;
    if (i < 0) width = width + i;

    range_M[num_cpu_m + 1] = range_M[num_cpu_m] + width;

    num_cpu_m ++;
  }

  if (!range_n) {
    range_N[0] = 0;
    i          = arg -> n;
  } else {
    range_N[0] = range_n[0];
    i          = range_n[1] - range_n[0];
  }

  num_cpu_n  = 0;

  /* Build the N boundary table the same way. */
  while (i > 0){
    
    width  = blas_quickdivide(i + divN - num_cpu_n - 1, divN - num_cpu_n);

    i -= width;
    if (i < 0) width = width + i;

    range_N[num_cpu_n + 1] = range_N[num_cpu_n] + width;

    num_cpu_n ++;
  }

  procs = 0;

  /* One queue entry per (M slice, N slice) pair, M varying fastest. */
  for (j = 0; j < num_cpu_n; j++) {
    for (i = 0; i < num_cpu_m; i++) {

      queue[procs].mode    = mode;
      queue[procs].routine = function;
      queue[procs].args    = arg;
      queue[procs].range_m = &range_M[i];
      queue[procs].range_n = &range_N[j];
      queue[procs].sa      = NULL;
      queue[procs].sb      = NULL;
      queue[procs].next    = &queue[procs + 1];

      procs ++;
    }
  }
  
  if (procs) {
    /* Only the head of the chain receives the caller's work buffers. */
    queue[0].sa = sa;
    queue[0].sb = sb;

    queue[procs - 1].next = NULL;
    
    exec_blas(procs, queue);
  }
  
  return 0;
}
Esempio n. 9
0
/*
 * Threaded driver: cuts the column range [0, n) into at most `nthreads`
 * chunks, queues gbmv_kernel once per chunk, runs the queue through
 * exec_blas(), folds the per-chunk regions of `buffer` into its front and
 * finally applies alpha while adding buffer into y (stride incy).
 *
 * a, x and buffer reach the kernel as args.a, args.b and args.c; lda and
 * incx travel in args.lda and args.ldb, while ku and kl are carried in
 * args.ldc and args.ldd. The result length is m without TRANSA and n with
 * it. Always returns 0.
 *
 * NOTE(review): the conditional that chooses between the two signatures
 * below opens outside this excerpt; alpha by value vs. by pointer lines up
 * with the COMPLEX switch used in the final AXPYU_K call -- confirm.
 */
int CNAME(BLASLONG m, BLASLONG n, BLASLONG ku, BLASLONG kl, FLOAT  alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *buffer, int nthreads){
#else
int CNAME(BLASLONG m, BLASLONG n, BLASLONG ku, BLASLONG kl, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *buffer, int nthreads){
#endif

  blas_arg_t args;
  blas_queue_t queue[MAX_CPU_NUMBER];
  BLASLONG range_m[MAX_CPU_NUMBER];
  BLASLONG range_n[MAX_CPU_NUMBER + 1];

  BLASLONG width, i, num_cpu;

  /* `mode` (precision flag | real/complex flag) is only declared in SMP builds. */
#ifdef SMP
#ifndef COMPLEX
#ifdef XDOUBLE
  int mode  =  BLAS_XDOUBLE | BLAS_REAL;
#elif defined(DOUBLE)
  int mode  =  BLAS_DOUBLE  | BLAS_REAL;
#else
  int mode  =  BLAS_SINGLE  | BLAS_REAL;
#endif
#else
#ifdef XDOUBLE
  int mode  =  BLAS_XDOUBLE | BLAS_COMPLEX;
#elif defined(DOUBLE)
  int mode  =  BLAS_DOUBLE  | BLAS_COMPLEX;
#else
  int mode  =  BLAS_SINGLE  | BLAS_COMPLEX;
#endif
#endif
#endif

  args.m = m;
  args.n = n;

  args.a = (void *)a;
  args.b = (void *)x;
  args.c = (void *)buffer;

  args.lda = lda;
  args.ldb = incx;
  /* The band widths ride along in the two spare leading-dimension fields. */
  args.ldc = ku;
  args.ldd = kl;

  num_cpu  = 0;

  /* Column boundaries are stored in ascending order starting at 0. */
  range_n[0] = 0;
  i          = n;

  while (i > 0){

    width  = blas_quickdivide(i + nthreads - num_cpu - 1, nthreads - num_cpu);

    /* At least 4 wide, but never more than what is left. */
    if (width < 4) width = 4;
    if (i < width) width = i;

    range_n[num_cpu + 1] = range_n[num_cpu] + width;

    /* Here range_m is not a boundary table: it holds the per-chunk element
       offset (result length rounded up to a multiple of 16, times the
       chunk index) that the reduction below uses to index buffer. */
#ifndef TRANSA
    range_m[num_cpu] = num_cpu * ((m + 15) & ~15);
#else
    range_m[num_cpu] = num_cpu * ((n + 15) & ~15);
#endif

    queue[num_cpu].mode    = mode;
    queue[num_cpu].routine = gbmv_kernel;
    queue[num_cpu].args    = &args;
    queue[num_cpu].range_m = &range_m[num_cpu];
    queue[num_cpu].range_n = &range_n[num_cpu];
    queue[num_cpu].sa      = NULL;
    queue[num_cpu].sb      = NULL;
    queue[num_cpu].next    = &queue[num_cpu + 1];

    num_cpu ++;
    i -= width;
  }

  /* Skipped entirely when n <= 0 produced no chunk. */
  if (num_cpu) {
    queue[0].sa = NULL;
    /* sb for the first entry starts num_cpu strides of
       ((result length rounded up to a multiple of 256) + 16) * COMPSIZE
       into buffer. */
#ifndef TRANSA
    queue[0].sb = buffer + num_cpu * (((m + 255) & ~255) + 16) * COMPSIZE;
#else
    queue[0].sb = buffer + num_cpu * (((n + 255) & ~255) + 16) * COMPSIZE;
#endif

    queue[num_cpu - 1].next = NULL;

    exec_blas(num_cpu, queue);
  }

  /* Fold the regions at the per-chunk offsets (chunks 1 .. num_cpu - 1)
     into the front of buffer; the scalar is ONE (plus ZERO when complex). */
  for (i = 1; i < num_cpu; i ++) {
    AXPYU_K(
#ifndef TRANSA
	    m,
#else
	    n,
#endif
	    0, 0,
#ifndef COMPLEX
	    ONE,
#else
	    ONE, ZERO,
#endif
	    buffer + range_m[i] * COMPSIZE, 1, buffer, 1, NULL, 0);
  }

  /* Final step: AXPYU_K with alpha as the scalar, buffer as the source and
     y (stride incy) as the destination. */
  AXPYU_K(
#ifndef TRANSA
	    m,
#else
	    n,
#endif
	    0, 0,
#ifndef COMPLEX
	    alpha,
#else
	    alpha[0], alpha[1],
#endif
	    buffer, 1, y, incy, NULL, 0);

  return 0;
}
Esempio n. 10
0
/*
 * Threaded driver: cuts the range [0, args->m) into at most args->nthreads
 * chunks, queues inner_thread once per chunk, clears the shared job flags
 * and runs the queue through exec_blas().
 *
 *   args   source of m, k, a, b, c, lda, alpha and nthreads; these are
 *          copied into a private blas_arg_t whose `common` member points at
 *          the job array and whose nthreads is the number of chunks built
 *   sa, sb work buffers attached to the first queue entry only
 *
 * The job array lives on the stack, or on the heap when USE_ALLOC_HEAP is
 * defined (the process exits if that allocation fails). Always returns 0.
 */
static int thread_driver(blas_arg_t *args, FLOAT *sa, FLOAT *sb){

  blas_arg_t newarg;

#ifndef USE_ALLOC_HEAP
  job_t          job[MAX_CPU_NUMBER];
#else
  job_t *        job = NULL;
#endif

  blas_queue_t queue[MAX_CPU_NUMBER];

  BLASLONG range[MAX_CPU_NUMBER + 100];

  BLASLONG num_cpu;

  BLASLONG nthreads = args -> nthreads;

  BLASLONG width, i, j, k;
  BLASLONG n, n_from, n_to;
  int  mode, mask;
  double dnum;

  /* mode = precision flag | real/complex flag; mask = larger GEMM unroll
     factor of that precision minus one, used to round chunk widths. */
#ifndef COMPLEX
#ifdef XDOUBLE
  mode  =  BLAS_XDOUBLE | BLAS_REAL;
  mask  = MAX(QGEMM_UNROLL_M, QGEMM_UNROLL_N) - 1;
#elif defined(DOUBLE)
  mode  =  BLAS_DOUBLE  | BLAS_REAL;
  mask  = MAX(DGEMM_UNROLL_M, DGEMM_UNROLL_N) - 1;
#else
  mode  =  BLAS_SINGLE  | BLAS_REAL;
  mask  = MAX(SGEMM_UNROLL_M, SGEMM_UNROLL_N) - 1;
#endif  
#else
#ifdef XDOUBLE
  mode  =  BLAS_XDOUBLE | BLAS_COMPLEX;
  mask  = MAX(XGEMM_UNROLL_M, XGEMM_UNROLL_N) - 1;
#elif defined(DOUBLE)
  mode  =  BLAS_DOUBLE  | BLAS_COMPLEX;
  mask  = MAX(ZGEMM_UNROLL_M, ZGEMM_UNROLL_N) - 1;
#else
  mode  =  BLAS_SINGLE  | BLAS_COMPLEX;
  mask  = MAX(CGEMM_UNROLL_M, CGEMM_UNROLL_N) - 1;
#endif  
#endif

  /* Only these fields are copied; the others (n, ldb, ldc, beta, ...) are
     not set by this function. */
  newarg.m        = args -> m;
  newarg.k        = args -> k;
  newarg.a        = args -> a;
  newarg.b        = args -> b;
  newarg.c        = args -> c;
  newarg.lda      = args -> lda;
  newarg.alpha    = args -> alpha;

#ifdef USE_ALLOC_HEAP
  job = (job_t*)malloc(MAX_CPU_NUMBER * sizeof(job_t));
  if(job==NULL){
    fprintf(stderr, "OpenBLAS: malloc failed in %s\n", __func__);
    exit(1);
  }
#endif

  /* The job array is shared with the workers through `common`. */
  newarg.common   = (void *)job;
   
  n_from = 0;
  n_to   = args -> m;

#ifndef LOWER

  /* Boundaries are stored from range[MAX_CPU_NUMBER] downwards: the first
     chunk ends at n and each later chunk ends where the previous starts. */
  range[MAX_CPU_NUMBER] = n_to - n_from;
  range[0] = 0;
  num_cpu  = 0;
  i        = 0;
  n        = n_to - n_from;

  /* n^2 / nthreads: the per-thread share used by the width formula below. */
  dnum = (double)n * (double)n /(double)nthreads;

  while (i < n){
    
    if (nthreads - num_cpu > 1) {
      
      double di   = (double)i;
      
      /* width = sqrt(i^2 + dnum) - i, truncated and rounded up with mask. */
      width = (((BLASLONG)(sqrt(di * di + dnum) - di) + mask) & ~mask);
      
      /* First chunk: adjust so that n - width has its mask bits cleared. */
      if (num_cpu == 0) width = n - ((n - width) & ~mask);
      
      /* Too wide or too narrow: take everything that is left. */
      if ((width > n - i) || (width < mask)) width = n - i;
      
    } else {
      /* Last available thread takes everything that is left. */
      width = n - i;
    }

    range[MAX_CPU_NUMBER - num_cpu - 1] = range[MAX_CPU_NUMBER - num_cpu] - width;

    queue[num_cpu].mode    = mode;
    queue[num_cpu].routine = inner_thread;
    queue[num_cpu].args    = &newarg;
    queue[num_cpu].range_m = NULL;

    queue[num_cpu].sa      = NULL;
    queue[num_cpu].sb      = NULL;
    queue[num_cpu].next    = &queue[num_cpu + 1];
    
    num_cpu ++;
    i += width;
  }

   /* range_n can only be set once num_cpu is final: every entry gets the
      same pointer to the lowest boundary, so the table reads in ascending
      order from there (num_cpu + 1 values). */
   for (i = 0; i < num_cpu; i ++) queue[i].range_n = &range[MAX_CPU_NUMBER - num_cpu];

#else

  /* Boundaries are stored in ascending order starting at 0. */
  range[0] = 0;
  num_cpu  = 0;
  i        = 0;
  n        = n_to - n_from;

  dnum = (double)n * (double)n /(double)nthreads;

  while (i < n){
    
    if (nthreads - num_cpu > 1) {
      
	double di   = (double)i;
	
	/* width = sqrt(i^2 + dnum) - i, truncated and rounded up with mask. */
	width = (((BLASLONG)(sqrt(di * di + dnum) - di) + mask) & ~mask);
	
      /* Too wide or too narrow: take everything that is left. */
      if ((width > n - i) || (width < mask)) width = n - i;
	
    } else {
      width = n - i;
    }

    range[num_cpu + 1] = range[num_cpu] + width;
    
    queue[num_cpu].mode    = mode;
    queue[num_cpu].routine = inner_thread;
    queue[num_cpu].args    = &newarg;
    queue[num_cpu].range_m = NULL;
    /* Every entry shares the whole boundary table. */
    queue[num_cpu].range_n = range;
    queue[num_cpu].sa      = NULL;
    queue[num_cpu].sb      = NULL;
    queue[num_cpu].next    = &queue[num_cpu + 1];
    
    num_cpu ++;
    i += width;
  }

#endif

  /* Workers see the number of chunks actually built, not the request. */
  newarg.nthreads = num_cpu;

  if (num_cpu) {

    /* Reset the per-pair flags in the job array before dispatching. */
    for (j = 0; j < num_cpu; j++) {
      for (i = 0; i < num_cpu; i++) {
	for (k = 0; k < DIVIDE_RATE; k++) {
	  job[j].working[i][CACHE_LINE_SIZE * k] = 0;
	}
      }
    }
    
    queue[0].sa = sa;
    queue[0].sb = sb;
    queue[num_cpu - 1].next = NULL;
    
    exec_blas(num_cpu, queue);
  }
 
#ifdef USE_ALLOC_HEAP
  free(job);
#endif

  return 0;
}
Esempio n. 11
0
/*
 * Per-thread worker for the threaded rank-k style update.
 *
 *   args     operands and sizes (read through the K, A, C, LDA, LDC and N
 *            macros, defined elsewhere), alpha, beta, nthreads, and the
 *            shared job_t array in args->common
 *   range_m  not used by this function
 *   range_n  boundary table shared by all threads: this thread owns
 *            [range_n[mypos], range_n[mypos + 1]) and the global range is
 *            [range_n[0], range_n[args->nthreads]); when NULL both default
 *            to [0, N)
 *   sa       buffer that receives the panel packed by ICOPY_OPERATION
 *   sb       buffer split into DIVIDE_RATE pieces that receive the panels
 *            packed by OCOPY_OPERATION and are published to other threads
 *   mypos    index of this thread in the job array and in range_n
 *
 * Threads exchange packed panels through job[owner].working[user][slot]:
 * the owner stores the buffer address there once a piece is packed, and
 * each user resets it to 0 when it no longer needs that piece.
 *
 * For every k-block the thread (1) packs and computes on its own slice,
 * publishing each piece, (2) computes against the pieces published by the
 * threads after it (non-LOWER) or before it (LOWER), then (3) repeats the
 * computation for the remaining row blocks of its own slice. Before
 * returning it waits until all of its own pieces have been released.
 *
 * Returns 0. With TIMING defined, timing percentages are printed to stderr.
 */
static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos){

  FLOAT *buffer[DIVIDE_RATE];

  BLASLONG k, lda, ldc;
  BLASLONG m_from, m_to, n_from, n_to;

  FLOAT *alpha, *beta;
  FLOAT *a, *c;
  job_t *job = (job_t *)args -> common;
  BLASLONG xxx, bufferside;

  BLASLONG ls, min_l, jjs, min_jj;
  BLASLONG is, min_i, div_n;

  BLASLONG i, current;
#ifdef LOWER
  BLASLONG start_i;
#endif

#ifdef TIMING
  BLASLONG rpcc_counter;
  BLASLONG copy_A = 0;
  BLASLONG copy_B = 0;
  BLASLONG kernel = 0;
  BLASLONG waiting1 = 0;
  BLASLONG waiting2 = 0;
  BLASLONG waiting3 = 0;
  BLASLONG waiting6[MAX_CPU_NUMBER];
  BLASLONG ops    = 0;

  for (i = 0; i < args -> nthreads; i++) waiting6[i] = 0;
#endif

  k = K;

  a = (FLOAT *)A;
  c = (FLOAT *)C;

  lda = LDA;
  ldc = LDC;

  alpha = (FLOAT *)args -> alpha;
  beta  = (FLOAT *)args -> beta;

  m_from = 0;
  m_to   = N;

  /* Global Range */
  n_from = 0;
  n_to   = N;

  /* Own slice comes from the entries at mypos; the global range spans the
     whole boundary table. */
  if (range_n) {
    m_from = range_n[mypos + 0];
    m_to   = range_n[mypos + 1];

    n_from = range_n[0];
    n_to   = range_n[args -> nthreads];
  }

  /* Apply beta through syrk_beta unless it equals one. */
  if (beta) {
#if !defined(COMPLEX) || defined(HERK)
    if (beta[0] != ONE)
#else
    if ((beta[0] != ONE) || (beta[1] != ZERO))
#endif
      syrk_beta(m_from, m_to, n_from, n_to, beta, c, ldc);
  }

  /* Nothing left to do without a k extent or with a missing or zero alpha. */
  if ((k == 0) || (alpha == NULL)) return 0;

  if ((alpha[0] == ZERO)
#if defined(COMPLEX) && !defined(HERK)
      && (alpha[1] == ZERO)
#endif
      ) return 0;

#if 0
  fprintf(stderr, "Thread[%ld]  m_from : %ld m_to : %ld n_from : %ld n_to : %ld\n",  mypos, m_from, m_to, n_from, n_to);
#endif

  /* Piece width: own slice divided into DIVIDE_RATE parts (rounded up),
     then rounded up to a multiple of GEMM_UNROLL_MN. */
  div_n = ((m_to - m_from + DIVIDE_RATE - 1) / DIVIDE_RATE
	                            + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);

  /* Carve sb into DIVIDE_RATE consecutive pieces. */
  buffer[0] = sb;
  for (i = 1; i < DIVIDE_RATE; i++) {
    buffer[i] = buffer[i - 1] + GEMM_Q * div_n * COMPSIZE;
  }
  
  /* Walk the k extent in blocks of min_l. */
  for(ls = 0; ls < k; ls += min_l){

    /* Full GEMM_Q blocks while at least two remain; a remainder between
       one and two blocks is halved. */
    min_l = k - ls;
    if (min_l >= GEMM_Q * 2) {
      min_l  = GEMM_Q;
    } else {
      if (min_l > GEMM_Q) min_l = (min_l + 1) / 2;
    }

    /* Same idea for the first row block of the own slice, with GEMM_P. */
    min_i = m_to - m_from;
    
    if (min_i >= GEMM_P * 2) {
      min_i = GEMM_P;
    } else {
      if (min_i > GEMM_P) {
	min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
      }
    }

#ifdef LOWER
    /* LOWER handles the last row block first; shrink it so that the rows
       before it form whole multiples of GEMM_P. */
    xxx = (m_to - m_from - min_i) % GEMM_P;

    if (xxx) min_i -= GEMM_P - xxx;
#endif

    START_RPCC();
    
    /* Pack the first row block (non-LOWER: at m_from, LOWER: ending at m_to). */
#ifndef LOWER
    ICOPY_OPERATION(min_l, min_i, a, lda, ls, m_from, sa);
#else
    ICOPY_OPERATION(min_l, min_i, a, lda, ls, m_to - min_i, sa);
#endif
    
    STOP_RPCC(copy_A);
    
    div_n = ((m_to - m_from + DIVIDE_RATE - 1) / DIVIDE_RATE
	                              + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
    
    /* Step 1: pack each piece of the own slice, compute on it, publish it. */
    for (xxx = m_from, bufferside = 0; xxx < m_to; xxx += div_n, bufferside ++) {
      
      START_RPCC();
      
      /* Make sure if no one is using buffer */
#ifndef LOWER
      for (i = 0; i < mypos; i++)
#else
      for (i = mypos + 1; i < args -> nthreads; i++)
#endif
	while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;};
      
      STOP_RPCC(waiting1);
      
#ifndef LOWER

      for(jjs = xxx; jjs < MIN(m_to, xxx + div_n); jjs += min_jj){

	min_jj = MIN(m_to, xxx + div_n) - jjs;

	/* The first piece is capped at min_i columns per step, later
	   pieces at GEMM_UNROLL_MN. */
	if (xxx == m_from) {
	  if (min_jj > min_i) min_jj = min_i;
	} else {
	  if (min_jj > GEMM_UNROLL_MN) min_jj = GEMM_UNROLL_MN;
	}
	
	START_RPCC();
	
	OCOPY_OPERATION(min_l, min_jj, a, lda, ls, jjs, 
			buffer[bufferside] + min_l * (jjs - xxx) * COMPSIZE);
	
	STOP_RPCC(copy_B);
	
	START_RPCC();
	
	KERNEL_OPERATION(min_i, min_jj, min_l, alpha,
			 sa, buffer[bufferside] + min_l * (jjs - xxx) * COMPSIZE,
			 c, ldc, m_from, jjs);

	STOP_RPCC(kernel);

#ifdef TIMING
	  ops += 2 * min_i * min_jj * min_l;
#endif

      }

#else

      for(jjs = xxx; jjs < MIN(m_to, xxx + div_n); jjs += min_jj){

	min_jj = MIN(m_to, xxx + div_n) - jjs;

	if (min_jj > GEMM_UNROLL_MN) min_jj = GEMM_UNROLL_MN;
	
	START_RPCC();
	
	OCOPY_OPERATION(min_l, min_jj, a, lda, ls, jjs, 
			buffer[bufferside] + min_l * (jjs - xxx) * COMPSIZE);
	
	STOP_RPCC(copy_B);
	
	START_RPCC();
	
	KERNEL_OPERATION(min_i, min_jj, min_l, alpha,
			 sa, buffer[bufferside] + min_l * (jjs - xxx) * COMPSIZE,
			 c, ldc, m_to - min_i, jjs);
	  
	STOP_RPCC(kernel);

#ifdef TIMING
	  ops += 2 * min_i * min_jj * min_l;
#endif

      }

#endif
	
      /* Publish the packed piece: store its address in the flag of every
         thread that will read it (this thread included). */
#ifndef LOWER
      for (i = 0; i <= mypos; i++)
#else
      for (i = mypos; i < args -> nthreads; i++)
#endif
	job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside];

      WMB;
    }

    
    /* Step 2: use the pieces published by the other threads
       (non-LOWER: those after mypos, LOWER: those before it). */
#ifndef LOWER
    current = mypos + 1;
    while (current < args -> nthreads) {
#else
    current = mypos - 1;
    while (current >= 0) {
#endif

	/* Piece width of the thread being read, computed as for our own. */
	div_n = ((range_n[current + 1]  - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE
		 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
	
	for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) {
	  
	  START_RPCC();
	  
	  /* thread has to wait */
	  while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;};
	  
	  STOP_RPCC(waiting2);
	  
	  START_RPCC();
	  
#ifndef LOWER
	  KERNEL_OPERATION(min_i, MIN(range_n[current + 1]  - xxx,  div_n), min_l, alpha,
			   sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside],
			   c, ldc, 
			   m_from,
			   xxx);
#else
	  KERNEL_OPERATION(min_i, MIN(range_n[current + 1]  - xxx,  div_n), min_l, alpha,
			   sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside],
			   c, ldc, 
			   m_to - min_i,
			   xxx);
#endif
	  
	  STOP_RPCC(kernel);
#ifdef TIMING
	  ops += 2 * min_i * MIN(range_n[current + 1]  - xxx,  div_n) * min_l;
#endif
	  
	  /* The first row block covered the whole slice, so step 3 will not
	     read this piece again: release it now. */
	  if (m_to - m_from == min_i) {
	    job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
	  }
	}
	
#ifndef LOWER
	current ++;
#else
	current --;
#endif
    }

    /* Step 3: remaining row blocks of the own slice (non-LOWER: after the
       first block, LOWER: everything before the block handled above). */
#ifndef LOWER
    for(is = m_from + min_i; is < m_to; is += min_i){
      min_i = m_to - is;
#else
    start_i = min_i;

    for(is = m_from; is < m_to - start_i; is += min_i){
      min_i = m_to - start_i - is;
#endif

      if (min_i >= GEMM_P * 2) {
	min_i = GEMM_P;
      } else 
	if (min_i > GEMM_P) {
	  min_i = ((min_i + 1) / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
	}

      START_RPCC();
      
      ICOPY_OPERATION(min_l, min_i, a, lda, ls, is, sa);
      
      STOP_RPCC(copy_A);
      
      /* Start with our own pieces, then move on through the other threads. */
      current = mypos;

      do {
	
	div_n = ((range_n[current + 1]  - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE
		                                                     + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
	
	for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) {
	  
	  START_RPCC();

	  /* No wait here: the flags read below were already seen non-zero
	     (step 2) or written by this thread (step 1) and are only
	     cleared by this thread. */
	  KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), min_l, alpha,
			   sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside],
			   c, ldc, is, xxx);
	  
	  STOP_RPCC(kernel);

#ifdef TIMING
	  ops += 2 * min_i * MIN(range_n[current + 1]  - xxx, div_n) * min_l;
#endif
	  
	  /* Last row block of this k-block: release the piece. */
#ifndef LOWER
	  if (is + min_i >= m_to) {
#else
	  if (is + min_i >= m_to - start_i) {
#endif
	    /* Thread doesn't need this buffer any more */
	    job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
	    WMB;
	  }
	}
	
#ifndef LOWER
	current ++;
      } while (current != args -> nthreads);
#else
	current --;
      } while (current >= 0);
#endif
	
     
    }
  }
  
  START_RPCC();

  /* Do not return while another thread may still read our sb pieces:
     wait until every flag we published for the others is back to 0. */
  for (i = 0; i < args -> nthreads; i++) {
    if (i != mypos) {
      for (xxx = 0; xxx < DIVIDE_RATE; xxx++) {
	while (job[mypos].working[i][CACHE_LINE_SIZE * xxx] ) {YIELDING;};
      }
    }
  }

  STOP_RPCC(waiting3);

#ifdef TIMING
  BLASLONG waiting = waiting1 + waiting2 + waiting3;
  BLASLONG total = copy_A + copy_B + kernel + waiting;

  fprintf(stderr, "GEMM   [%2ld] Copy_A : %6.2f  Copy_B : %6.2f  Wait1 : %6.2f Wait2 : %6.2f Wait3 : %6.2f Kernel : %6.2f",
	  mypos, (double)copy_A /(double)total * 100., (double)copy_B /(double)total * 100.,
	  (double)waiting1 /(double)total * 100.,
	  (double)waiting2 /(double)total * 100.,
	  (double)waiting3 /(double)total * 100.,
	  (double)ops/(double)kernel / 4. * 100.);

#if 0
  fprintf(stderr, "GEMM   [%2ld] Copy_A : %6.2ld  Copy_B : %6.2ld  Wait : %6.2ld\n",
	  mypos, copy_A, copy_B, waiting);

  fprintf(stderr, "Waiting[%2ld] %6.2f %6.2f %6.2f\n",
	  mypos,
	  (double)waiting1/(double)waiting * 100.,
	  (double)waiting2/(double)waiting * 100.,
	  (double)waiting3/(double)waiting * 100.);
#endif
  fprintf(stderr, "\n");
#endif

  return 0;
}

/*
 * Threaded SYRK driver: cuts the N range into per-thread column blocks and
 * runs inner_thread on every block through exec_blas().
 *
 * args     - problem description; args->nthreads is the requested thread count.
 * range_m  - forwarded unchanged to every queue entry (and to SYRK_LOCAL).
 * range_n  - optional [from, to) sub-range of N; NULL selects [0, args->n).
 * sa, sb   - work buffers; attached to the first queue entry only, every
 *            other entry gets NULL.
 * mypos    - not referenced by this driver.
 *
 * Calls SYRK_LOCAL directly when only one thread is requested or when
 * args->n is smaller than nthreads * SWITCH_RATIO.
 *
 * Always returns 0.  With USE_ALLOC_HEAP the job table is malloc'ed and the
 * process exits with status 1 if that allocation fails.
 */
int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos){

  blas_arg_t newarg;

#ifndef USE_ALLOC_HEAP
  job_t          job[MAX_CPU_NUMBER];
#else
  job_t *        job = NULL;
#endif

  blas_queue_t queue[MAX_CPU_NUMBER];

  /* Partition boundaries handed to the workers as their range_n. */
  BLASLONG range[MAX_CPU_NUMBER + 100];

  BLASLONG num_cpu;

  BLASLONG nthreads = args -> nthreads;

  BLASLONG width, i, j, k;
  BLASLONG n, n_from, n_to;
  int  mode, mask;
  double dnum;

  /* Not worth threading: run the single-threaded kernel and leave. */
  if ((nthreads  == 1) || (args -> n < nthreads * SWITCH_RATIO)) {
    SYRK_LOCAL(args, range_m, range_n, sa, sb, 0); 
    return 0;
  }

  /* mode tags the queue entries with precision/realness; mask is the larger
     of the two GEMM unroll factors minus one and is used to round widths. */
#ifndef COMPLEX
#ifdef XDOUBLE
  mode  =  BLAS_XDOUBLE | BLAS_REAL;
  mask  = MAX(QGEMM_UNROLL_M, QGEMM_UNROLL_N) - 1;
#elif defined(DOUBLE)
  mode  =  BLAS_DOUBLE  | BLAS_REAL;
  mask  = MAX(DGEMM_UNROLL_M, DGEMM_UNROLL_N) - 1;
#else
  mode  =  BLAS_SINGLE  | BLAS_REAL;
  mask  = MAX(SGEMM_UNROLL_M, SGEMM_UNROLL_N) - 1;
#endif  
#else
#ifdef XDOUBLE
  mode  =  BLAS_XDOUBLE | BLAS_COMPLEX;
  mask  = MAX(XGEMM_UNROLL_M, XGEMM_UNROLL_N) - 1;
#elif defined(DOUBLE)
  mode  =  BLAS_DOUBLE  | BLAS_COMPLEX;
  mask  = MAX(ZGEMM_UNROLL_M, ZGEMM_UNROLL_N) - 1;
#else
  mode  =  BLAS_SINGLE  | BLAS_COMPLEX;
  mask  = MAX(CGEMM_UNROLL_M, CGEMM_UNROLL_N) - 1;
#endif  
#endif

  /* Private copy of the arguments so that common/nthreads can be set for the
     workers without modifying the caller's structure. */
  newarg.m        = args -> m;
  newarg.n        = args -> n;
  newarg.k        = args -> k;
  newarg.a        = args -> a;
  newarg.b        = args -> b;
  newarg.c        = args -> c;
  newarg.lda      = args -> lda;
  newarg.ldb      = args -> ldb;
  newarg.ldc      = args -> ldc;
  newarg.alpha    = args -> alpha;
  newarg.beta     = args -> beta;

#ifdef USE_ALLOC_HEAP
  job = (job_t*)malloc(MAX_CPU_NUMBER * sizeof(job_t));
  if(job==NULL){
    fprintf(stderr, "OpenBLAS: malloc failed in %s\n", __func__);
    exit(1);
  }
#endif

  /* The workers reach the shared job table through newarg.common. */
  newarg.common   = (void *)job;
   
  /* n_from / n_to are absolute bounds of the N range.  n_to used to be
     stored as a length (range_n[1] - range_n[0]) and was then reduced by
     n_from a second time when computing n. */
  if (!range_n) {
    n_from = 0;
    n_to   = args -> n;
  } else {
    n_from = range_n[0];
    n_to   = range_n[1];
  }

#ifndef LOWER

  /* The table is filled backwards from range[MAX_CPU_NUMBER]; it starts at
     the absolute upper bound so that, after all widths are subtracted, the
     first used entry equals n_from. */
  range[MAX_CPU_NUMBER] = n_to;
  range[0] = 0;
  num_cpu  = 0;
  i        = 0;
  n        = n_to - n_from;

  dnum = (double)n * (double)n /(double)nthreads;

  while (i < n){
    
    if (nthreads - num_cpu > 1) {
      
      double di   = (double)i;
      
      /* Width such that (i + width)^2 - i^2 is about dnum, rounded up
	 with mask. */
      width = (((BLASLONG)(sqrt(di * di + dnum) - di) + mask) & ~mask);
      
      /* First block only: clear the mask bits of the remaining length. */
      if (num_cpu == 0) width = n - ((n - width) & ~mask);
      
      /* Too large or too small: give this block everything that is left. */
      if ((width > n - i) || (width < mask)) width = n - i;
      
    } else {
      /* Last available thread takes the remainder. */
      width = n - i;
    }

    range[MAX_CPU_NUMBER - num_cpu - 1] = range[MAX_CPU_NUMBER - num_cpu] - width;

    queue[num_cpu].mode    = mode;
    queue[num_cpu].routine = inner_thread;
    queue[num_cpu].args    = &newarg;
    queue[num_cpu].range_m = range_m;

    queue[num_cpu].sa      = NULL;
    queue[num_cpu].sb      = NULL;
    queue[num_cpu].next    = &queue[num_cpu + 1];
    
    num_cpu ++;
    i += width;
  }

   /* Every entry gets the same table, starting at the first slot that was
      actually filled. */
   for (i = 0; i < num_cpu; i ++) queue[i].range_n = &range[MAX_CPU_NUMBER - num_cpu];

#else

  /* The table is filled forwards, starting at the absolute lower bound. */
  range[0] = n_from;
  num_cpu  = 0;
  i        = 0;
  n        = n_to - n_from;

  dnum = (double)n * (double)n /(double)nthreads;

  while (i < n){
    
    if (nthreads - num_cpu > 1) {
      
	double di   = (double)i;
	
	/* Width such that (i + width)^2 - i^2 is about dnum, rounded up
	   with mask. */
	width = (((BLASLONG)(sqrt(di * di + dnum) - di) + mask) & ~mask);
	
      /* Too large or too small: give this block everything that is left. */
      if ((width > n - i) || (width < mask)) width = n - i;
	
    } else {
      /* Last available thread takes the remainder. */
      width = n - i;
    }

    range[num_cpu + 1] = range[num_cpu] + width;
    
    queue[num_cpu].mode    = mode;
    queue[num_cpu].routine = inner_thread;
    queue[num_cpu].args    = &newarg;
    queue[num_cpu].range_m = range_m;
    queue[num_cpu].range_n = range;
    queue[num_cpu].sa      = NULL;
    queue[num_cpu].sb      = NULL;
    queue[num_cpu].next    = &queue[num_cpu + 1];
    
    num_cpu ++;
    i += width;
  }

#endif

  /* Number of blocks actually created; may be lower than the request. */
  newarg.nthreads = num_cpu;

  if (num_cpu) {

    /* Clear every working flag before the workers start. */
    for (j = 0; j < num_cpu; j++) {
      for (i = 0; i < num_cpu; i++) {
	for (k = 0; k < DIVIDE_RATE; k++) {
	  job[j].working[i][CACHE_LINE_SIZE * k] = 0;
	}
      }
    }
    
    /* Only the first entry receives the caller's buffers. */
    queue[0].sa = sa;
    queue[0].sb = sb;
    queue[num_cpu - 1].next = NULL;
    
    exec_blas(num_cpu, queue);
  }
 
#ifdef USE_ALLOC_HEAP
  free(job);
#endif

  return 0;
}
Esempio n. 12
0
/*
 * Splits the N range of a triangular update into at most nthreads column
 * blocks and runs `function` on each block through exec_blas().
 *
 * mode      - BLAS_* flags; BLAS_COMPLEX and BLAS_PREC select the unroll
 *             mask, BLAS_UPLO selects from which end distances are measured.
 * arg       - argument block passed unchanged to every queue entry.
 * range_m   - forwarded unchanged to every queue entry.
 * range_n   - optional [from, to) range of N; NULL selects [0, arg->n).
 * function  - routine stored in every queue entry.
 * sa, sb    - work buffers; attached to the first queue entry only.
 * nthreads  - maximum number of blocks to create.
 *
 * Block widths are chosen so that the difference of squared distances
 * covered by each block is about dnum, then rounded up with mask.
 *
 * Always returns 0.
 */
int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int (*function)(), void *sa, void *sb, BLASLONG nthreads) {

  blas_queue_t queue[MAX_CPU_NUMBER];
  BLASLONG range[MAX_CPU_NUMBER + 1];

  BLASLONG width, i;
  BLASLONG n_from, n_to;
  double dnum, nf, nt, di, dinum;

  int num_cpu;
  int mask = 0;
  int uplo;

  /* Pick the rounding mask from the precision; unknown precisions keep 0. */
  if (!(mode & BLAS_COMPLEX)) {

    switch (mode & BLAS_PREC) {
    case BLAS_SINGLE:
      mask = SGEMM_UNROLL_MN - 1;
      break;
    case BLAS_DOUBLE:
      mask = DGEMM_UNROLL_MN - 1;
      break;
#ifdef EXPRECISION
    case BLAS_XDOUBLE:
      mask = MAX(QGEMM_UNROLL_M, QGEMM_UNROLL_N) - 1;
      break;
#endif
    default:
      break;
    }
  } else {
    switch (mode & BLAS_PREC) {
    case BLAS_SINGLE:
      mask = CGEMM_UNROLL_MN - 1;
      break;
    case BLAS_DOUBLE:
      mask = ZGEMM_UNROLL_MN - 1;
      break;
#ifdef EXPRECISION
    case BLAS_XDOUBLE:
      mask = MAX(XGEMM_UNROLL_M, XGEMM_UNROLL_N) - 1;
      break;
#endif
    default:
      break;
    }
  }

  n_from = 0;
  n_to   = arg -> n;

  if (range_n) {
    n_from = *(range_n + 0);
    n_to   = *(range_n + 1);
  }

  uplo = (mode & BLAS_UPLO) != 0;

  /* Without BLAS_UPLO distances are the indices themselves; with it they are
     measured back from arg->n, which makes dnum negative. */
  if (!uplo) {
    nf = (double)(n_from);
    nt = (double)(n_to);
  } else {
    nf = (double)(arg -> n - n_from);
    nt = (double)(arg -> n - n_to);
  }

  dnum = (nt * nt - nf * nf) / (double)nthreads;

  num_cpu  = 0;

  range[0] = n_from;
  i        = n_from;

  while (i < n_to){

    if (nthreads - num_cpu > 1) {

      di    = uplo ? (double)(arg -> n - i) : (double)i;
      dinum = di * di + dnum;

      if (!(dinum >= 0.)) {
	/* Rounding earlier widths up can push the radicand below zero when
	   dnum is negative.  sqrt() would return NaN and converting NaN to
	   an integer is undefined, so take the remaining range explicitly. */
	width = n_to - i;
      } else if (uplo) {
	width = ((BLASLONG)(-sqrt(dinum) + di) + mask) & ~mask;
      } else {
	width = ((BLASLONG)( sqrt(dinum) - di) + mask) & ~mask;
      }

      /* Degenerate or oversized width: this block takes everything left. */
      if ((width <= 0) || (width > n_to - i)) width = n_to - i;

    } else {
      /* Last available thread takes the remainder. */
      width = n_to - i;
    }

    range[num_cpu + 1] = range[num_cpu] + width;

    queue[num_cpu].mode    = mode;
    queue[num_cpu].routine = function;
    queue[num_cpu].args    = arg;
    queue[num_cpu].range_m = range_m;
    /* Each entry sees its own [range[k], range[k + 1]) pair. */
    queue[num_cpu].range_n = &range[num_cpu];
    queue[num_cpu].sa      = NULL;
    queue[num_cpu].sb      = NULL;
    queue[num_cpu].next    = &queue[num_cpu + 1];

    num_cpu ++;
    i += width;
  }

  if (num_cpu) {
    /* Only the first entry receives the caller's buffers. */
    queue[0].sa = sa;
    queue[0].sb = sb;
    queue[num_cpu - 1].next = NULL;

    exec_blas(num_cpu, queue);
  }

  return 0;
}