Esempio n. 1
0
/* Split the problem's n-range into near-equal chunks, one per worker
 * (up to nthreads), and run "function" on each chunk via exec_blas.
 * Worker i processes columns [range[i], range[i+1]).  Returns 0. */
int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int (*function)(), void *sa, void *sb, BLASLONG nthreads) {

  blas_queue_t queue[MAX_CPU_NUMBER];
  BLASLONG range[MAX_CPU_NUMBER + 1];

  BLASLONG chunk, remaining, num_cpu;

  /* Work out the starting offset and total extent of the n-range. */
  if (range_n != NULL) {
    range[0]  = range_n[0];
    remaining = range_n[1] - range_n[0];
  } else {
    range[0]  = 0;
    remaining = arg -> n;
  }

  num_cpu = 0;

  while (remaining > 0) {

    /* Ceiling division of the remaining work over the workers still
       unassigned, so chunks stay balanced. */
    chunk = blas_quickdivide(remaining + nthreads - num_cpu - 1, nthreads - num_cpu);

    remaining -= chunk;
    if (remaining < 0) chunk += remaining;  /* clamp the final chunk */

    range[num_cpu + 1] = range[num_cpu] + chunk;

    queue[num_cpu].mode    = mode;
    queue[num_cpu].routine = function;
    queue[num_cpu].args    = arg;
    queue[num_cpu].range_m = range_m;
    queue[num_cpu].range_n = &range[num_cpu];
#if 0   //defined(LOONGSON3A)
    queue[num_cpu].sa      = sa + GEMM_OFFSET_A1 * num_cpu;
    queue[num_cpu].sb      = queue[num_cpu].sa + GEMM_OFFSET_A1 * 5;
#else
    /* NULL buffers: each server thread substitutes its own local
       sa/sb work areas. */
    queue[num_cpu].sa      = NULL;
    queue[num_cpu].sb      = NULL;
#endif
    queue[num_cpu].next    = &queue[num_cpu + 1];

    num_cpu++;
  }

  if (num_cpu > 0) {
#if 0 //defined(LOONGSON3A)
    queue[0].sa = sa;
    queue[0].sb = sa + GEMM_OFFSET_A1 * 5;
#else
    /* The first entry runs on the calling thread and reuses the
       caller-provided buffers. */
    queue[0].sa = sa;
    queue[0].sb = sb;
#endif
    queue[num_cpu - 1].next = NULL;  /* terminate the intrusive list */

    exec_blas(num_cpu, queue);
  }

  return 0;
}
Esempio n. 2
0
/* Threaded GEMM driver: splits the M range once across up to nthreads
 * workers, then walks the N range in panels of GEMM_R * nthreads
 * columns, re-splitting each panel over the same workers and running
 * inner_thread through exec_blas.  job[] (passed via newarg.common) is
 * the workers' shared communication area.  Returns 0. */
static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
		       *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos){

  blas_arg_t newarg;

  job_t          job[MAX_CPU_NUMBER];
  blas_queue_t queue[MAX_CPU_NUMBER];

  BLASLONG range_M[MAX_CPU_NUMBER + 1];
  BLASLONG range_N[MAX_CPU_NUMBER + 1];

  BLASLONG num_cpu_m, num_cpu_n;

  BLASLONG nthreads = args -> nthreads;

  BLASLONG width, i, j, k, js;
  BLASLONG m, n, n_from, n_to;
  int  mode;

  /* Select the precision/domain bits of the queue mode word. */
#ifndef COMPLEX
#ifdef XDOUBLE
  mode  =  BLAS_XDOUBLE | BLAS_REAL | BLAS_NODE;
#elif defined(DOUBLE)
  mode  =  BLAS_DOUBLE  | BLAS_REAL | BLAS_NODE;
#else
  mode  =  BLAS_SINGLE  | BLAS_REAL | BLAS_NODE;
#endif  
#else
#ifdef XDOUBLE
  mode  =  BLAS_XDOUBLE | BLAS_COMPLEX | BLAS_NODE;
#elif defined(DOUBLE)
  mode  =  BLAS_DOUBLE  | BLAS_COMPLEX | BLAS_NODE;
#else
  mode  =  BLAS_SINGLE  | BLAS_COMPLEX | BLAS_NODE;
#endif  
#endif

  /* Local copy of the argument block shared by every queue entry;
     only newarg.common differs from the caller's args. */
  newarg.m        = args -> m;
  newarg.n        = args -> n;
  newarg.k        = args -> k;
  newarg.a        = args -> a;
  newarg.b        = args -> b;
  newarg.c        = args -> c;
  newarg.lda      = args -> lda;
  newarg.ldb      = args -> ldb;
  newarg.ldc      = args -> ldc;
  newarg.alpha    = args -> alpha;
  newarg.beta     = args -> beta;
  newarg.nthreads = args -> nthreads;
  newarg.common   = (void *)job;
   
#ifdef PARAMTEST
  newarg.gemm_p  = args -> gemm_p;
  newarg.gemm_q  = args -> gemm_q;
  newarg.gemm_r  = args -> gemm_r;
#endif

  /* Establish the M range: either the full matrix or the caller's
     sub-range. */
  if (!range_m) {
    range_M[0] = 0;
    m          = args -> m;
  } else {
    range_M[0] = range_m[0];
    m          = range_m[1] - range_m[0];
  }

  num_cpu_m  = 0;

  /* Partition M into near-equal chunks (ceiling division over the
     workers not yet assigned). */
  while (m > 0){
    
    width  = blas_quickdivide(m + nthreads - num_cpu_m - 1, nthreads - num_cpu_m);

    m -= width;
    if (m < 0) width = width + m;

    range_M[num_cpu_m + 1] = range_M[num_cpu_m] + width;

    num_cpu_m ++;
  }

  /* One queue entry per M chunk; built once and reused for every
     N panel below. */
  for (i = 0; i < num_cpu_m; i++) {
    queue[i].mode    = mode;
    queue[i].routine = inner_thread;
    queue[i].args    = &newarg;
    queue[i].range_m = &range_M[i];
    /* NOTE(review): every worker receives the whole range_N table;
       presumably inner_thread selects its own slice by position —
       confirm against inner_thread's implementation. */
    queue[i].range_n = &range_N[0];
    queue[i].sa      = NULL;
    queue[i].sb      = NULL;
    queue[i].next    = &queue[i + 1];
  }
  
  /* The first worker (the calling thread) reuses the caller's
     buffers. */
  queue[0].sa = sa;
  queue[0].sb = sb;
  
  if (!range_n) {
    n_from = 0;
    n_to   = args -> n;
  } else {
    n_from = range_n[0];
    n_to   = range_n[1];
  }

  /* Process N in panels of at most GEMM_R * nthreads columns. */
  for(js = n_from; js < n_to; js += GEMM_R * nthreads){
    n = n_to - js;
    if (n > GEMM_R * nthreads) n = GEMM_R * nthreads;
    
    range_N[0] = js;

    num_cpu_n  = 0;

    /* Split this panel's columns the same way as M above. */
    while (n > 0){
      
      width  = blas_quickdivide(n + nthreads - num_cpu_n - 1, nthreads - num_cpu_n);
      
      n -= width;
      if (n < 0) width = width + n;
      
      range_N[num_cpu_n + 1] = range_N[num_cpu_n] + width;
      
      num_cpu_n ++;
    }
    
    /* Reset the inter-thread progress flags before launching.
       NOTE(review): both loops bound by num_cpu_m; verify the inner
       index shouldn't instead range over num_cpu_n. */
    for (j = 0; j < num_cpu_m; j++) {
      for (i = 0; i < num_cpu_m; i++) {
	for (k = 0; k < DIVIDE_RATE; k++) {
	  job[j].working[i][CACHE_LINE_SIZE * k] = 0;
	}
      }
    }
    
    queue[num_cpu_m - 1].next = NULL;

    exec_blas(num_cpu_m, queue);
  }

  return 0;
}
Esempio n. 3
0
/* Parallel blocked LU factorization (GETRF-style) with lookahead:
 * factors a leading panel, then overlaps the trailing-matrix update
 * (async worker queue + spin flags) with factorization of the next
 * panel.  Row interchanges are applied afterwards in a second pass.
 * Returns 0 on success or the 1-based index of the first zero pivot. */
blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) {

  BLASLONG m, n, mn, lda, offset;
  BLASLONG i, is, bk, init_bk, next_bk, range_n_new[2];
  blasint *ipiv, iinfo, info;
  int mode;
  blas_arg_t newarg;
  FLOAT *a, *sbb;
  FLOAT dummyalpha[2] = {ZERO, ZERO};

  blas_queue_t queue[MAX_CPU_NUMBER];
  BLASLONG range[MAX_CPU_NUMBER + 1];

  BLASLONG width, nn, num_cpu;

  /* Per-worker completion flags, cache-line padded to avoid false
     sharing; workers clear their slot when done. */
  volatile BLASLONG flag[MAX_CPU_NUMBER * CACHE_LINE_SIZE] __attribute__((aligned(128)));

#ifndef COMPLEX
#ifdef XDOUBLE
  mode  =  BLAS_XDOUBLE | BLAS_REAL;
#elif defined(DOUBLE)
  mode  =  BLAS_DOUBLE  | BLAS_REAL;
#else
  mode  =  BLAS_SINGLE  | BLAS_REAL;
#endif  
#else
#ifdef XDOUBLE
  mode  =  BLAS_XDOUBLE | BLAS_COMPLEX;
#elif defined(DOUBLE)
  mode  =  BLAS_DOUBLE  | BLAS_COMPLEX;
#else
  mode  =  BLAS_SINGLE  | BLAS_COMPLEX;
#endif  
#endif

  m    = args -> m;
  n    = args -> n;
  a    = (FLOAT *)args -> a;
  lda  = args -> lda;
  ipiv = (blasint *)args -> c;
  offset = 0;

  /* Restrict to the caller's diagonal sub-block when a range is
     given; a is advanced along the diagonal. */
  if (range_n) {
    m     -= range_n[0];
    n      = range_n[1] - range_n[0];
    offset = range_n[0];
    a     += range_n[0] * (lda + 1) * COMPSIZE;
  }

  if (m <= 0 || n <= 0) return 0;
  
  newarg.c   = ipiv;
  newarg.lda = lda;
  newarg.common = NULL;
  newarg.nthreads = args -> nthreads;

  mn = MIN(m, n);

  /* Initial panel width: half of mn, rounded up to GEMM_UNROLL_N and
     capped at GEMM_Q. */
  init_bk = (mn / 2 + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1);
  if (init_bk > GEMM_Q) init_bk = GEMM_Q;

  /* Problem too small to block: fall back to the unblocked kernel. */
  if (init_bk <= GEMM_UNROLL_N) {
    info = GETF2(args, NULL, range_n, sa, sb, 0);
    return info;
  }

  width = FORMULA1(m, n, 0, init_bk, args -> nthreads);
  width = (width + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1);
  if (width > n - init_bk) width = n - init_bk;

  if (width < init_bk) {
    long temp;

    temp = FORMULA2(m, n, 0, init_bk, args -> nthreads);
    temp = (temp + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1);

    if (temp < GEMM_UNROLL_N) temp = GEMM_UNROLL_N;
    if (temp < init_bk) init_bk = temp;

  }

  next_bk = init_bk;
  bk      = init_bk;

  range_n_new[0] = offset;
  range_n_new[1] = offset + bk;
  
  /* Factor the first panel recursively (single extra level). */
  info   = CNAME(args, NULL, range_n_new, sa, sb, 0);
  
  /* Pack the panel's lower-triangular factor for the TRSM updates. */
  TRSM_ILTCOPY(bk, bk, a, lda, 0, sb);

  is = 0;
  num_cpu = 0;

  /* Aligned spill buffer placed after the packed panel in sb. */
  sbb = (FLOAT *)((((long)(sb + GEMM_PQ * GEMM_PQ * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B);

  while (is < mn) {

    /* Decide how many trailing columns this thread keeps for itself
       (the lookahead slice). */
    width  = FORMULA1(m, n, is, bk, args -> nthreads);
    width = (width + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1);
    
    if (width < bk) {

      next_bk = FORMULA2(m, n, is, bk, args -> nthreads);
      next_bk = (next_bk + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1);

      if (next_bk > bk) next_bk = bk;
#if 0
      if (next_bk < GEMM_UNROLL_N) next_bk = MIN(GEMM_UNROLL_N, mn - bk - is);
#else
      if (next_bk < GEMM_UNROLL_N) next_bk = MAX(GEMM_UNROLL_N, mn - bk - is);
#endif

      width = next_bk;
    }
    
    if (width > mn - is - bk) {
      next_bk = mn - is - bk;
      width   = next_bk;
    }

    nn = n - bk - is;
    if (width > nn) width = nn;

    /* Wait for the workers launched in the previous iteration before
       reusing their queue slots. */
    if (num_cpu > 1)  exec_blas_async_wait(num_cpu - 1, &queue[1]);

    range[0] = 0;
    range[1] = width;
    
    num_cpu = 1;
    nn -= width;
    
    /* Arguments for the trailing-matrix update: A22 -= L21 * U12,
       with the packed panel in sb. */
    newarg.a   = sb;
    newarg.b   = a + (is + is * lda) * COMPSIZE;
    newarg.d   = (void *)flag;
    newarg.m   = m - bk - is;
    newarg.n   = n - bk - is;
    newarg.k   = bk;
    newarg.ldb = is + offset;
    
    /* Distribute the remaining trailing columns over extra workers. */
    while (nn > 0){
      
      width  = blas_quickdivide(nn + args -> nthreads - num_cpu, args -> nthreads - num_cpu);
      
      nn -= width;
      if (nn < 0) width = width + nn;
      
      range[num_cpu + 1] = range[num_cpu] + width;
      
      queue[num_cpu].mode    = mode;
      //queue[num_cpu].routine = inner_advanced_thread;
      queue[num_cpu].routine = (void *)inner_basic_thread;
      queue[num_cpu].args    = &newarg;
      queue[num_cpu].range_m = NULL;
      queue[num_cpu].range_n = &range[num_cpu];
      queue[num_cpu].sa      = NULL;
      queue[num_cpu].sb      = NULL;
      queue[num_cpu].next    = &queue[num_cpu + 1];
      flag[num_cpu * CACHE_LINE_SIZE] = 1;

      num_cpu ++;
    }
    
    queue[num_cpu - 1].next = NULL;

    is += bk;
    
    bk = n - is;
    if (bk > next_bk) bk = next_bk;
    
    range_n_new[0] = offset + is;
    range_n_new[1] = offset + is + bk;
    
    if (num_cpu > 1) {

      exec_blas_async(1, &queue[1]);
    
#if 0
      inner_basic_thread(&newarg, NULL, &range[0], sa, sbb, 0);

      iinfo = GETRF_SINGLE(args, NULL, range_n_new, sa, sbb, 0);
#else

      /* If this thread's slice is large, update just the next panel
         first so its factorization can start sooner, then finish the
         rest of the slice. */
      if (range[1] >= bk * 4) {

	BLASLONG myrange[2];

	myrange[0] = 0;
	myrange[1] = bk;

	inner_basic_thread(&newarg, NULL, &myrange[0], sa, sbb, -1);

	iinfo = GETRF_SINGLE(args, NULL, range_n_new, sa, sbb, 0);

	myrange[0] = bk;
	myrange[1] = range[1];

	inner_basic_thread(&newarg, NULL, &myrange[0], sa, sbb, -1);

      } else {

	inner_basic_thread(&newarg, NULL, &range[0], sa, sbb, -1);

	iinfo = GETRF_SINGLE(args, NULL, range_n_new, sa, sbb, 0);
      }

#endif

      /* Spin until every async worker has cleared its flag. */
      for (i = 1; i < num_cpu; i ++) while (flag[i * CACHE_LINE_SIZE]) {};
      
      /* Pack the freshly factored panel for the next round. */
      TRSM_ILTCOPY(bk, bk, a + (is +  is * lda) * COMPSIZE, lda, 0, sb);
      
    } else {

      inner_basic_thread(&newarg, NULL, &range[0], sa, sbb, -1);
      
      iinfo = GETRF_SINGLE(args, NULL, range_n_new, sa, sbb, 0);
    }

      /* NOTE(review): `is` was already advanced by bk above, so the
         reported pivot index is relative to the updated offset —
         confirm against the sequential GETRF's info convention. */
      if (iinfo && !info) info = iinfo + is;
      
  }
  
  next_bk = init_bk;
  bk      = init_bk;
  
  is = 0;
  
  /* Second pass: replay the same blocking to apply the deferred row
     interchanges (LASWP) in parallel. */
  while (is < mn) {
    
    bk = mn - is;
    if (bk > next_bk) bk = next_bk;
    
    width  = FORMULA1(m, n, is, bk, args -> nthreads);
    width = (width + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1);

    if (width < bk) {
      next_bk = FORMULA2(m, n, is, bk, args -> nthreads);
      next_bk = (next_bk + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1);

      if (next_bk > bk) next_bk = bk;
#if 0
      if (next_bk < GEMM_UNROLL_N) next_bk = MIN(GEMM_UNROLL_N, mn - bk - is);
#else
      if (next_bk < GEMM_UNROLL_N) next_bk = MAX(GEMM_UNROLL_N, mn - bk - is);
#endif
    }

    if (width > mn - is - bk) {
      next_bk = mn - is - bk;
      width   = next_bk;
    }

    blas_level1_thread(mode, bk, is + bk + offset + 1, mn + offset, (void *)dummyalpha, 
		       a + (- offset + is * lda) * COMPSIZE, lda, NULL, 0,
		       ipiv, 1, (void *)LASWP_PLUS, args -> nthreads);
    
    is += bk;
  }
  
  return info;
}
Esempio n. 4
0
/* Parallel blocked LU factorization (GETRF-style), 2-D variant: the
 * trailing-matrix update is split across workers in both M and N
 * (range_M/range_N), coordinated through job[] and per-worker spin
 * flags, while this thread factors the next panel (lookahead).  Row
 * interchanges are applied afterwards in a second pass.  Returns 0 on
 * success or the 1-based index of the first zero pivot. */
blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) {

  BLASLONG m, n, mn, lda, offset;
  BLASLONG init_bk, next_bk, range_n_mine[2], range_n_new[2];
  blasint *ipiv, iinfo, info;
  int mode;
  blas_arg_t newarg;

  FLOAT *a, *sbb;
  FLOAT dummyalpha[2] = {ZERO, ZERO};

  blas_queue_t queue[MAX_CPU_NUMBER];

  BLASLONG range_M[MAX_CPU_NUMBER + 1];
  BLASLONG range_N[MAX_CPU_NUMBER + 1];

  job_t        job[MAX_CPU_NUMBER];

  BLASLONG width, nn, mm;
  BLASLONG i, j, k, is, bk;

  BLASLONG num_cpu;

  /* Per-worker completion flags, cache-line padded to avoid false
     sharing; workers clear their slot when done. */
  volatile BLASLONG flag[MAX_CPU_NUMBER * CACHE_LINE_SIZE] __attribute__((aligned(128)));

#ifndef COMPLEX
#ifdef XDOUBLE
  mode  =  BLAS_XDOUBLE | BLAS_REAL;
#elif defined(DOUBLE)
  mode  =  BLAS_DOUBLE  | BLAS_REAL;
#else
  mode  =  BLAS_SINGLE  | BLAS_REAL;
#endif  
#else
#ifdef XDOUBLE
  mode  =  BLAS_XDOUBLE | BLAS_COMPLEX;
#elif defined(DOUBLE)
  mode  =  BLAS_DOUBLE  | BLAS_COMPLEX;
#else
  mode  =  BLAS_SINGLE  | BLAS_COMPLEX;
#endif  
#endif

  m    = args -> m;
  n    = args -> n;
  a    = (FLOAT *)args -> a;
  lda  = args -> lda;
  ipiv = (blasint *)args -> c;
  offset = 0;

  /* Restrict to the caller's diagonal sub-block when a range is
     given; a is advanced along the diagonal. */
  if (range_n) {
    m     -= range_n[0];
    n      = range_n[1] - range_n[0];
    offset = range_n[0];
    a     += range_n[0] * (lda + 1) * COMPSIZE;
  }

  if (m <= 0 || n <= 0) return 0;
  
  newarg.c   = ipiv;
  newarg.lda = lda;
  newarg.common   = (void *)job;

  info = 0;

  mn = MIN(m, n);

  /* Initial panel width: half of mn, rounded up to GEMM_UNROLL_N and
     capped at GEMM_Q. */
  init_bk = (mn / 2 + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1);
  if (init_bk > GEMM_Q) init_bk = GEMM_Q;

  /* Problem too small to block: fall back to the unblocked kernel. */
  if (init_bk <= GEMM_UNROLL_N) {
    info = GETF2(args, NULL, range_n, sa, sb, 0);
    return info;
  }

  next_bk = init_bk;

  bk = mn;
  if (bk > next_bk) bk = next_bk;
  
  range_n_new[0] = offset;
  range_n_new[1] = offset + bk;
  
  /* Factor the first panel recursively (single extra level). */
  iinfo   = CNAME(args, NULL, range_n_new, sa, sb, 0);
  
  if (iinfo && !info) info = iinfo;
  
  /* Pack the panel's lower-triangular factor for the TRSM updates. */
  TRSM_ILTCOPY(bk, bk, a, lda, 0, sb);

  /* Aligned spill buffer placed right after the packed panel. */
  sbb = (FLOAT *)((((long)(sb + bk * bk * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B);
  
  is = 0;
  num_cpu = 0;

  while (is < mn) {
    
    /* Columns this thread keeps for itself (lookahead slice). */
    width  = (FORMULA1(m, n, is, bk, args -> nthreads) + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1);
    if (width > mn - is - bk) width = mn - is - bk;

    if (width < bk) {
      next_bk = (FORMULA2(m, n, is, bk, args -> nthreads) + GEMM_UNROLL_N) & ~(GEMM_UNROLL_N - 1);
      
      if (next_bk > bk) next_bk = bk;

      width = next_bk;
      if (width > mn - is - bk) width = mn - is - bk;
    }
    
    /* Wait for the workers launched in the previous iteration before
       reusing their queue slots. */
    if (num_cpu > 0) exec_blas_async_wait(num_cpu, &queue[0]);

    mm = m - bk - is;
    nn = n - bk - is;

    /* Arguments for the trailing-matrix update: A22 -= L21 * U12. */
    newarg.a   = sb;
    newarg.b   = a + (is + is * lda) * COMPSIZE;
    newarg.d   = (void *)flag;
    newarg.m   = mm;
    newarg.n   = nn;
    newarg.k   = bk;
    newarg.ldb = is + offset;
    
    nn -= width;

    range_n_mine[0] = 0;
    range_n_mine[1] = width;

    range_N[0] = width;
    range_M[0] = 0;

    num_cpu  = 0;
    
    /* Split the remaining trailing matrix over workers, alternating
       between N-first and M-first depending on which dimension is
       larger, to keep tiles roughly square. */
    while (nn > 0){
      
      if (mm >= nn) {

	width  = blas_quickdivide(nn + args -> nthreads - num_cpu, args -> nthreads - num_cpu - 1);
	if (nn < width) width = nn;
	nn -= width;
	range_N[num_cpu + 1] = range_N[num_cpu] + width;
	
	width  = blas_quickdivide(mm + args -> nthreads - num_cpu, args -> nthreads - num_cpu - 1);
	if (mm < width) width = mm;
	if (nn <=    0) width = mm;
	mm -= width;
	range_M[num_cpu + 1] = range_M[num_cpu] + width;

      } else {

	width  = blas_quickdivide(mm + args -> nthreads - num_cpu, args -> nthreads - num_cpu - 1);
	if (mm < width) width = mm;
	mm -= width;
	range_M[num_cpu + 1] = range_M[num_cpu] + width;

	width  = blas_quickdivide(nn + args -> nthreads - num_cpu, args -> nthreads - num_cpu - 1);
	if (nn < width) width = nn;
	if (mm <=    0) width = nn;
	nn -= width;
	range_N[num_cpu + 1] = range_N[num_cpu] + width;
	
      }

      queue[num_cpu].mode    = mode;
      queue[num_cpu].routine = inner_advanced_thread;
      queue[num_cpu].args    = &newarg;
      queue[num_cpu].range_m = &range_M[num_cpu];
      /* Workers share the whole range_N table and pick their own
         slice by position. */
      queue[num_cpu].range_n = &range_N[0];
      queue[num_cpu].sa      = NULL;
      queue[num_cpu].sb      = NULL;
      queue[num_cpu].next    = &queue[num_cpu + 1];
      flag[num_cpu * CACHE_LINE_SIZE] = 1;
      
      num_cpu ++;

    }
    
    newarg.nthreads = num_cpu;
    
    /* Reset the inter-worker progress flags before launching. */
    if (num_cpu > 0) {
      for (j = 0; j < num_cpu; j++) {
	for (i = 0; i < num_cpu; i++) {
	  for (k = 0; k < DIVIDE_RATE; k++) {
	    job[j].working[i][CACHE_LINE_SIZE * k] = 0;
	  }
	}
      }
    }

    is += bk;

    bk = mn - is;
    if (bk > next_bk) bk = next_bk;
    
    range_n_new[0] = offset + is;
    range_n_new[1] = offset + is + bk;

    if (num_cpu > 0) {

      queue[num_cpu - 1].next = NULL;
      
      exec_blas_async(0, &queue[0]);
      
      /* Update this thread's own slice, then factor the next panel
         while the workers run. */
      inner_basic_thread(&newarg, NULL, range_n_mine, sa, sbb, -1);
      
      iinfo   = GETRF_SINGLE(args, NULL, range_n_new, sa, sbb, 0);
      
      if (iinfo && !info) info = iinfo + is;

      /* Spin until every worker has cleared its flag. */
      for (i = 0; i < num_cpu; i ++) while (flag[i * CACHE_LINE_SIZE]) {};

      /* Pack the freshly factored panel for the next round. */
      TRSM_ILTCOPY(bk, bk, a + (is +  is * lda) * COMPSIZE, lda, 0, sb);

    } else {

      inner_basic_thread(&newarg, NULL, range_n_mine, sa, sbb, -1);

      iinfo   = GETRF_SINGLE(args, NULL, range_n_new, sa, sbb, 0);

      if (iinfo && !info) info = iinfo + is;
    
    }
    
  }
  
  next_bk = init_bk;
  is = 0;
  
  /* Second pass: replay the same blocking to apply the deferred row
     interchanges (LASWP) in parallel. */
  while (is < mn) {
    
    bk = mn - is;
    if (bk > next_bk) bk = next_bk;
    
    width  = (FORMULA1(m, n, is, bk, args -> nthreads) + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1);
    if (width > mn - is - bk) width = mn - is - bk;

    if (width < bk) {
      next_bk = (FORMULA2(m, n, is, bk, args -> nthreads) + GEMM_UNROLL_N) & ~(GEMM_UNROLL_N - 1);
      if (next_bk > bk) next_bk = bk;
    }

    blas_level1_thread(mode, bk, is + bk + offset + 1, mn + offset, (void *)dummyalpha, 
		       a + (- offset + is * lda) * COMPSIZE, lda, NULL, 0,
		       ipiv, 1, (void *)LASWP_PLUS, args -> nthreads);
    
    is += bk;
  }
  
  return info;
}
Esempio n. 5
0
/* Threaded SBMV driver: partitions the band matrix-vector product
 * over up to nthreads workers, each accumulating a partial y into a
 * slice of `buffer`, then reduces the partials and applies alpha.
 * (The enclosing #if for the real/complex signatures starts above
 * this view.)  Returns 0. */
int CNAME(BLASLONG n, BLASLONG k, FLOAT  alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *buffer, int nthreads){
#else
int CNAME(BLASLONG n, BLASLONG k, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *buffer, int nthreads){
#endif

  blas_arg_t args;
  blas_queue_t queue[MAX_CPU_NUMBER];
  BLASLONG range_m[MAX_CPU_NUMBER + 1];
  BLASLONG range_n[MAX_CPU_NUMBER];

  BLASLONG width, i, num_cpu;
  double dnum;
  int mask = 7;  /* chunk widths rounded to multiples of 8 */

#ifdef SMP
#ifndef COMPLEX
#ifdef XDOUBLE
  int mode  =  BLAS_XDOUBLE | BLAS_REAL;
#elif defined(DOUBLE)
  int mode  =  BLAS_DOUBLE  | BLAS_REAL;
#else
  int mode  =  BLAS_SINGLE  | BLAS_REAL;
#endif  
#else
#ifdef XDOUBLE
  int mode  =  BLAS_XDOUBLE | BLAS_COMPLEX;
#elif defined(DOUBLE)
  int mode  =  BLAS_DOUBLE  | BLAS_COMPLEX;
#else
  int mode  =  BLAS_SINGLE  | BLAS_COMPLEX;
#endif  
#endif
#endif

  args.n = n;
  args.k = k;
  
  args.a = (void *)a;
  args.b = (void *)x;
  args.c = (void *)buffer;
    
  args.lda = lda;
  args.ldb = incx;
  args.ldc = incy;

  /* Target work area per thread, used to balance the triangular cost
     profile of the banded product. */
  dnum = (double)n * (double)n / (double)nthreads;
  num_cpu  = 0;
  
  if (n < 2 * k) {

#ifndef LOWER

    /* Upper storage: build the ranges from the bottom up so the
       expensive rows are balanced. */
    range_m[MAX_CPU_NUMBER] = n;
    i          = 0;
    
    while (i < n){
      
      if (nthreads - num_cpu > 1) {
	
	/* Solve for the width that equalizes each worker's share of
	   the ~n^2 total work. */
	double di = (double)(n - i);
	if (di * di - dnum > 0) {
	  width = ((BLASLONG)(-sqrt(di * di - dnum) + di) + mask) & ~mask;
	} else {
	  width = n - i;
	}
	
	if (width < 16) width = 16;
	if (width > n - i) width = n - i;
	
      } else {
	width = n - i;
      }
      
      range_m[MAX_CPU_NUMBER - num_cpu - 1] = range_m[MAX_CPU_NUMBER - num_cpu] - width;
      /* Offset of this worker's private partial-y slice in buffer. */
      range_n[num_cpu] = num_cpu * (((n + 15) & ~15) + 16);
      
      queue[num_cpu].mode    = mode;
      queue[num_cpu].routine = sbmv_kernel;
      queue[num_cpu].args    = &args;
      queue[num_cpu].range_m = &range_m[MAX_CPU_NUMBER - num_cpu - 1];
      queue[num_cpu].range_n = &range_n[num_cpu];
      queue[num_cpu].sa      = NULL;
      queue[num_cpu].sb      = NULL;
      queue[num_cpu].next    = &queue[num_cpu + 1];
      
      num_cpu ++;
      i += width;
    }
    
#else
    
    /* Lower storage: same balancing, ranges built from the top. */
    range_m[0] = 0;
    i          = 0;
    
    while (i < n){
      
      if (nthreads - num_cpu > 1) {
	
	double di = (double)(n - i);
	if (di * di - dnum > 0) {
	  width = ((BLASLONG)(-sqrt(di * di - dnum) + di) + mask) & ~mask;
	} else {
	  width = n - i;
	}
	
	if (width < 16) width = 16;
	if (width > n - i) width = n - i;
	
      } else {
	width = n - i;
      }
      
      range_m[num_cpu + 1] = range_m[num_cpu] + width;
      range_n[num_cpu] = num_cpu * (((n + 15) & ~15) + 16);
      
      queue[num_cpu].mode    = mode;
      queue[num_cpu].routine = sbmv_kernel;
      queue[num_cpu].args    = &args;
      queue[num_cpu].range_m = &range_m[num_cpu];
      queue[num_cpu].range_n = &range_n[num_cpu];
      queue[num_cpu].sa      = NULL;
      queue[num_cpu].sb      = NULL;
      queue[num_cpu].next    = &queue[num_cpu + 1];
      
      num_cpu ++;
      i += width;
    }
    
#endif
    
  } else {
    
    /* Wide band relative to n: a plain even split is good enough. */
    range_m[0] = 0;
    i          = n;
    
    while (i > 0){
      
      width  = blas_quickdivide(i + nthreads - num_cpu - 1, nthreads - num_cpu);
      
      if (width < 4) width = 4;
      if (i < width) width = i;
      
      range_m[num_cpu + 1] = range_m[num_cpu] + width;
      
      range_n[num_cpu] = num_cpu * ((n + 15) & ~15);
      
      queue[num_cpu].mode    = mode;
      queue[num_cpu].routine = sbmv_kernel;
      queue[num_cpu].args    = &args;
      queue[num_cpu].range_m = &range_m[num_cpu];
      queue[num_cpu].range_n = &range_n[num_cpu];
      queue[num_cpu].sa      = NULL;
      queue[num_cpu].sb      = NULL;
      queue[num_cpu].next    = &queue[num_cpu + 1];
      
      num_cpu ++;
      i -= width;
    }
  }

  if (num_cpu) {
    queue[0].sa = NULL;
    queue[0].sb = buffer;
    queue[num_cpu - 1].next = NULL;
  
    exec_blas(num_cpu, queue);
  }
  
  /* Reduce the workers' partial y vectors into buffer.
     NOTE(review): queue[i].sb is NULL for i >= 1 when enqueued above;
     this relies on exec_blas/the server writing each worker's buffer
     pointer back into queue[i].sb — confirm against blas_server. */
  for (i = 1; i < num_cpu; i ++) {
    AXPYU_K(n, 0, 0,
#ifndef COMPLEX
	    ONE,
#else
	    ONE, ZERO,
#endif
	    (FLOAT*)(queue[i].sb), 1, buffer, 1, NULL, 0);
  }

  /* y += alpha * (reduced partial result). */
  AXPYU_K(n, 0, 0,
#ifndef COMPLEX
	  alpha,
#else
	  alpha[0], alpha[1],
#endif
	  buffer, 1, y, incy, NULL, 0);
  
  return 0;
}
Esempio n. 6
0
/* Split an m-sized level-1 operation across up to nthreads workers:
 * each worker gets a contiguous slice of a (and of b, unless B is
 * transposed), with its own argument block, and all slices run via
 * exec_blas.  Returns 0. */
int blas_level1_thread(int mode, BLASLONG m, BLASLONG n, BLASLONG k, void *alpha,
		       void *a, BLASLONG lda,
		       void *b, BLASLONG ldb, 
		       void *c, BLASLONG ldc, int (*function)(), int nthreads){
  
  blas_queue_t queue[MAX_CPU_NUMBER];
  blas_arg_t   args [MAX_CPU_NUMBER];

  BLASLONG remain, piece, a_step, b_step;
  int worker, shift;

  /* log2 of the element size in bytes: precision bits, +1 when
     complex, +2 base. */
  shift = (mode & BLAS_PREC) + ((mode & BLAS_COMPLEX) != 0) + 2;
  
  mode |= BLAS_LEGACY;

  for (worker = 0; worker < nthreads; worker++) blas_queue_init(&queue[worker]);

  worker = 0;
  remain = m;
  
  while (remain > 0) {
    
    /* Ceiling division of the remaining rows over the workers still
       unassigned, clamping the last piece. */
    piece = blas_quickdivide(remain + nthreads - worker - 1,
			     nthreads - worker);

    remain -= piece;
    if (remain < 0) piece += remain;
    
    /* Byte advance per slice; b advances by rows only when B is
       transposed. */
    a_step = piece * lda;
    b_step = (mode & BLAS_TRANSB_T) ? piece : piece * ldb;

    a_step <<= shift;
    b_step <<= shift;

    args[worker].m = piece;
    args[worker].n = n;
    args[worker].k = k;
    args[worker].a = (void *)a;
    args[worker].b = (void *)b;
    args[worker].c = (void *)c;
    args[worker].lda = lda;
    args[worker].ldb = ldb;
    args[worker].ldc = ldc;
    args[worker].alpha = alpha;

    queue[worker].mode    = mode;
    queue[worker].routine = function;
    queue[worker].args    = &args[worker];
    queue[worker].next    = &queue[worker + 1];
    
    a = (void *)((BLASULONG)a + a_step);
    b = (void *)((BLASULONG)b + b_step);
  
    worker++;
  }

  if (worker > 0) {
    queue[worker - 1].next = NULL;  /* terminate the intrusive list */

    exec_blas(worker, queue);
  }

  return 0;
}
Esempio n. 7
0
/* 2-D threaded driver: splits the m-range into divide_rule[nthreads][0]
 * chunks and the n-range into divide_rule[nthreads][1] chunks, then
 * enqueues one task per (m, n) tile and runs them all via exec_blas.
 * The first task reuses the caller's sa/sb buffers; the rest get NULL
 * so each server thread uses its own.  Returns 0. */
int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int (*function)(), void *sa, void *sb, BLASLONG nthreads) {

  blas_queue_t queue[MAX_CPU_NUMBER];

  BLASLONG range_M[MAX_CPU_NUMBER + 1], range_N[MAX_CPU_NUMBER + 1];
  BLASLONG procs, num_cpu_m, num_cpu_n;

  BLASLONG width, i, j;
  BLASLONG divM, divN;

  divM = divide_rule[nthreads][0];
  divN = divide_rule[nthreads][1];

  /* Establish the m-range: either the full matrix or the caller's
     sub-range.  BUG FIX: the sub-range branch previously read the
     uninitialized local range_M[] instead of the caller's range_m[]. */
  if (!range_m) {
    range_M[0] = 0;
    i          = arg -> m;
  } else {
    range_M[0] = range_m[0];
    i          = range_m[1] - range_m[0];
  }

  num_cpu_m  = 0;

  /* Partition m into divM near-equal chunks (ceiling division over
     the chunks still unassigned). */
  while (i > 0){
    
    width  = blas_quickdivide(i + divM - num_cpu_m - 1, divM - num_cpu_m);

    i -= width;
    if (i < 0) width = width + i;

    range_M[num_cpu_m + 1] = range_M[num_cpu_m] + width;

    num_cpu_m ++;
  }

  /* Same for the n-range with divN chunks. */
  if (!range_n) {
    range_N[0] = 0;
    i          = arg -> n;
  } else {
    range_N[0] = range_n[0];
    i          = range_n[1] - range_n[0];
  }

  num_cpu_n  = 0;

  while (i > 0){
    
    width  = blas_quickdivide(i + divN - num_cpu_n - 1, divN - num_cpu_n);

    i -= width;
    if (i < 0) width = width + i;

    range_N[num_cpu_n + 1] = range_N[num_cpu_n] + width;

    num_cpu_n ++;
  }

  procs = 0;

  /* One queue entry per (m, n) tile. */
  for (j = 0; j < num_cpu_n; j++) {
    for (i = 0; i < num_cpu_m; i++) {

      queue[procs].mode    = mode;
      queue[procs].routine = function;
      queue[procs].args    = arg;
      queue[procs].range_m = &range_M[i];
      queue[procs].range_n = &range_N[j];
      queue[procs].sa      = NULL;
      queue[procs].sb      = NULL;
      queue[procs].next    = &queue[procs + 1];

      procs ++;
    }
  }
  
  if (procs) {
    queue[0].sa = sa;
    queue[0].sb = sb;

    queue[procs - 1].next = NULL;  /* terminate the intrusive list */
    
    exec_blas(procs, queue);
  }
  
  return 0;
}
Esempio n. 8
0
/* Threaded GBMV driver: splits the band matrix-vector product over
 * the n-range, with each worker accumulating its partial result into
 * a private slice of `buffer` (offset held in range_m[]), then
 * reduces the partials and applies alpha into y.  (The enclosing #if
 * for the real/complex signatures starts above this view.)
 * Returns 0. */
int CNAME(BLASLONG m, BLASLONG n, BLASLONG ku, BLASLONG kl, FLOAT  alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *buffer, int nthreads){
#else
int CNAME(BLASLONG m, BLASLONG n, BLASLONG ku, BLASLONG kl, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *buffer, int nthreads){
#endif

  blas_arg_t args;
  blas_queue_t queue[MAX_CPU_NUMBER];
  BLASLONG range_m[MAX_CPU_NUMBER];
  BLASLONG range_n[MAX_CPU_NUMBER + 1];

  BLASLONG width, i, num_cpu;

#ifdef SMP
#ifndef COMPLEX
#ifdef XDOUBLE
  int mode  =  BLAS_XDOUBLE | BLAS_REAL;
#elif defined(DOUBLE)
  int mode  =  BLAS_DOUBLE  | BLAS_REAL;
#else
  int mode  =  BLAS_SINGLE  | BLAS_REAL;
#endif
#else
#ifdef XDOUBLE
  int mode  =  BLAS_XDOUBLE | BLAS_COMPLEX;
#elif defined(DOUBLE)
  int mode  =  BLAS_DOUBLE  | BLAS_COMPLEX;
#else
  int mode  =  BLAS_SINGLE  | BLAS_COMPLEX;
#endif
#endif
#endif

  /* Bandwidths travel through the ld fields of the argument block. */
  args.m = m;
  args.n = n;

  args.a = (void *)a;
  args.b = (void *)x;
  args.c = (void *)buffer;

  args.lda = lda;
  args.ldb = incx;
  args.ldc = ku;
  args.ldd = kl;

  num_cpu  = 0;

  range_n[0] = 0;
  i          = n;

  /* Partition the n-range into near-equal chunks of at least 4. */
  while (i > 0){

    width  = blas_quickdivide(i + nthreads - num_cpu - 1, nthreads - num_cpu);

    if (width < 4) width = 4;
    if (i < width) width = i;

    range_n[num_cpu + 1] = range_n[num_cpu] + width;

    /* Offset of this worker's private partial-result slice in buffer
       (length depends on the output dimension: m, or n if
       transposed), 16-element aligned. */
#ifndef TRANSA
    range_m[num_cpu] = num_cpu * ((m + 15) & ~15);
#else
    range_m[num_cpu] = num_cpu * ((n + 15) & ~15);
#endif

    queue[num_cpu].mode    = mode;
    queue[num_cpu].routine = gbmv_kernel;
    queue[num_cpu].args    = &args;
    queue[num_cpu].range_m = &range_m[num_cpu];
    queue[num_cpu].range_n = &range_n[num_cpu];
    queue[num_cpu].sa      = NULL;
    queue[num_cpu].sb      = NULL;
    queue[num_cpu].next    = &queue[num_cpu + 1];

    num_cpu ++;
    i -= width;
  }

  if (num_cpu) {
    queue[0].sa = NULL;
    /* Scratch area for the first worker, placed past all the
       partial-result slices. */
#ifndef TRANSA
    queue[0].sb = buffer + num_cpu * (((m + 255) & ~255) + 16) * COMPSIZE;
#else
    queue[0].sb = buffer + num_cpu * (((n + 255) & ~255) + 16) * COMPSIZE;
#endif

    queue[num_cpu - 1].next = NULL;

    exec_blas(num_cpu, queue);
  }

  /* Reduce the workers' partial results into the first slice. */
  for (i = 1; i < num_cpu; i ++) {
    AXPYU_K(
#ifndef TRANSA
	    m,
#else
	    n,
#endif
	    0, 0,
#ifndef COMPLEX
	    ONE,
#else
	    ONE, ZERO,
#endif
	    buffer + range_m[i] * COMPSIZE, 1, buffer, 1, NULL, 0);
  }

  /* y += alpha * (reduced partial result). */
  AXPYU_K(
#ifndef TRANSA
	    m,
#else
	    n,
#endif
	    0, 0,
#ifndef COMPLEX
	    alpha,
#else
	    alpha[0], alpha[1],
#endif
	    buffer, 1, y, incy, NULL, 0);

  return 0;
}