Esempio n. 1
0
/*
 * Threaded blocked driver over the matrix stored in args->a.
 *
 * Problems with order <= DTB_ENTRIES are handed straight to TRTI2.
 * Otherwise the matrix is walked in panels of `step` columns; for the
 * panel that starts at index `pos` the following calls are issued, in
 * this order:
 *   1. TRSM    via gemm_thread_m  (m = pos,   n = width),
 *   2. CNAME   recursively on the width x width block at (pos, pos),
 *   3. GEMM_NN via gemm_thread_n  (m = pos,   n = rest, k = width),
 *   4. TRMM    via gemm_thread_n  (m = width, n = rest).
 *
 * args     supplies n, a, lda and nthreads.
 * range_n  optional; when non-NULL the order is range_n[1] - range_n[0].
 * sa, sb   work buffers forwarded unchanged to every callee.
 * range_m and mypos are not used by this routine.
 *
 * Returns the TRTI2 status on the small path and 0 otherwise.
 */
blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos) {

  blas_arg_t sub;
  BLASLONG range_N[2];
  BLASLONG order, ld, step, pos, width, rest, status;
  FLOAT *base, *diag, *panel, *right_diag, *right_top;
  FLOAT plus_one [2] = { ONE, ZERO};
  FLOAT minus_one[2] = {-ONE, ZERO};
  int mode;

  /* Precision part of the dispatch mode. */
#if defined(XDOUBLE)
  mode = BLAS_XDOUBLE;
#elif defined(DOUBLE)
  mode = BLAS_DOUBLE;
#else
  mode = BLAS_SINGLE;
#endif

  /* Real/complex part of the dispatch mode. */
#ifdef COMPLEX
  mode |= BLAS_COMPLEX;
#else
  mode |= BLAS_REAL;
#endif

  base  = (FLOAT *)args -> a;
  ld    = args -> lda;
  order = range_n ? range_n[1] - range_n[0] : args -> n;

  /* Small case: no blocking, delegate to TRTI2. */
  if (order <= DTB_ENTRIES) {
    status = TRTI2(args, NULL, range_n, sa, sb, 0);
    return status;
  }

  /* Panel width: GEMM_Q, or a quarter of the order (rounded up) when small. */
  step = (order < 4 * GEMM_Q) ? (order + 3) / 4 : GEMM_Q;

  pos = 0;
  while (pos < order) {

    width = order - pos;
    if (width > step) width = step;
    rest = order - pos - width;

    range_N[0] = pos;
    range_N[1] = pos + width;

    /* Element addresses used by the four calls of this panel. */
    diag       = base + (pos + pos * ld) * COMPSIZE;
    panel      = base + (      pos * ld) * COMPSIZE;
    right_diag = base + (pos + (pos + width) * ld) * COMPSIZE;
    right_top  = base + (      (pos + width) * ld) * COMPSIZE;

    /* 1. TRSM on the pos x width block, using the diagonal block. */
    sub.lda      = ld;
    sub.ldb      = ld;
    sub.ldc      = ld;
    sub.alpha    = plus_one;
    sub.beta     = minus_one;
    sub.nthreads = args -> nthreads;
    sub.m        = pos;
    sub.n        = width;
    sub.a        = diag;
    sub.b        = panel;

    gemm_thread_m(mode, &sub, NULL, NULL, TRSM, sa, sb, args -> nthreads);

    /* 2. Recurse on the diagonal block. */
    sub.m = width;
    sub.n = width;
    sub.a = diag;

    CNAME(&sub, NULL, NULL, sa, sb, 0);

    /* 3. GEMM_NN with the panel and the block row right of the diagonal. */
    sub.m    = pos;
    sub.n    = rest;
    sub.k    = width;
    sub.a    = panel;
    sub.b    = right_diag;
    sub.c    = right_top;
    sub.beta = NULL;

    gemm_thread_n(mode, &sub, NULL, NULL, GEMM_NN, sa, sb, args -> nthreads);

    /* 4. TRMM of the diagonal block with the block row right of it. */
    sub.a = diag;
    sub.b = right_diag;
    sub.m = width;
    sub.n = rest;

    gemm_thread_n(mode, &sub, NULL, NULL, TRMM, sa, sb, args -> nthreads);

    pos += step;
  }

  return 0;
}
Esempio n. 2
0
/*
 * Single-threaded, recursive blocked factorization with row interchanges.
 * The GETF2 / LASWP_* / ipiv names suggest a GETRF-style LU factorization
 * -- TODO confirm against the file this routine belongs to.
 *
 * args    : m, n, a, lda are read; args->c is used as the blasint pivot
 *           array (ipiv).
 * range_n : optional column window. When given, the routine works on the
 *           trailing part of the matrix starting at element
 *           (range_n[0], range_n[0]) and remembers range_n[0] in `offset`.
 * sa, sb  : work buffers. sb receives the TRSM_ILTCOPY output; sbb, placed
 *           behind blocking*blocking*COMPSIZE elements of sb, receives the
 *           LASWP_NCOPY output.
 * range_m, myid : not used here.
 *
 * Returns 0 for an empty problem, the GETF2 result on the small path, and
 * otherwise the first non-zero result of a recursive panel call plus the
 * panel's start index j (0 if every panel returned 0).
 */
blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) {

  BLASLONG m, n, lda, offset;
  BLASLONG j, js, jmin, is, imin, jc, jcmin;
  BLASLONG jjs, min_jj;
  blasint *ipiv, iinfo, info;
  BLASLONG jb, mn, blocking;
  FLOAT *a, *offsetA, *offsetB;
  BLASLONG range_N[2];

  FLOAT *sbb;

  m    = args -> m;
  n    = args -> n;
  a    = (FLOAT *)args -> a;
  lda  = args -> lda;
  ipiv = (blasint *)args -> c;
  offset = 0;

  /* Restrict the problem to the requested column window, if any. */
  if (range_n) {
    m     -= range_n[0];
    n      = range_n[1] - range_n[0];
    offset = range_n[0];
    a     += range_n[0] * (lda + 1) * COMPSIZE;
  }

  if (m <= 0 || n <= 0) return 0;
  
  mn = MIN(m, n);

  /* Panel width: mn/2 rounded up to a multiple of GEMM_UNROLL_N (assumes
     GEMM_UNROLL_N is a power of two -- TODO confirm), capped at GEMM_Q. */
  blocking = (mn / 2 + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1);
  if (blocking > GEMM_Q) blocking = GEMM_Q;

  /* Panels this narrow are handed to GETF2 instead of recursing further. */
  if (blocking <= GEMM_UNROLL_N * 2) {
    info = GETF2(args, NULL, range_n, sa, sb, 0);
    return info;
  }
  
  /* Second work area behind the first blocking x blocking tile of sb;
     GEMM_ALIGN is presumably an alignment mask -- TODO confirm. */
  sbb = (FLOAT *)((((long)(sb + blocking * blocking * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B);

  info = 0;

  for (j = 0; j < mn; j += blocking) {
    
    jb = mn - j;
    if (jb > blocking) jb = blocking;

    offsetA = a +  j       * lda * COMPSIZE;
    /* NOTE: offsetB is assigned here but never read in this function. */
    offsetB = a + (j + jb) * lda * COMPSIZE;

    range_N[0] = offset + j;
    range_N[1] = offset + j + jb;

    /* Recurse on the current panel of jb columns. */
    iinfo   = CNAME(args, NULL, range_N, sa, sb, 0);

    /* Keep only the first non-zero status, shifted by the panel start. */
    if (iinfo && !info) info = iinfo + j;

    if (j + jb < n) {
      
      /* Copy the jb x jb diagonal block of the panel into sb. */
      TRSM_ILTCOPY(jb, jb, offsetA + j * COMPSIZE, lda, 0, sb);
      
      /* Walk the columns right of the panel in chunks of REAL_GEMM_R. */
      for (js = j + jb; js < n; js += REAL_GEMM_R){
	jmin = n - js;
	if (jmin > REAL_GEMM_R) jmin = REAL_GEMM_R;
	
	  /* Within a chunk, handle GEMM_UNROLL_N columns at a time:
	     LASWP_NCOPY into sbb, then TRSM_KERNEL_LT against sb. */
	  for (jjs = js; jjs < js + jmin; jjs += GEMM_UNROLL_N){
	    min_jj = js + jmin - jjs;
	    if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
	    
#if 0
	    LASWP_PLUS(min_jj, j + offset + 1, j + jb + offset, ZERO, 
#ifdef COMPLEX
		       ZERO,
#endif
		       a + (- offset + jjs * lda) * COMPSIZE, lda, NULL, 0 , ipiv, 1);
	    
	    GEMM_ONCOPY (jb, min_jj, a + (j + jjs * lda) * COMPSIZE, lda, sbb + jb * (jjs - js) * COMPSIZE);
#else
	    LASWP_NCOPY(min_jj, j + offset + 1, j + jb + offset, 
			a + (- offset + jjs * lda) * COMPSIZE, lda, ipiv, sbb + jb * (jjs - js) * COMPSIZE);
#endif
	    
	    
	    for (jc = 0; jc < jb; jc += GEMM_P) {
	      jcmin = jb - jc;
	      if (jcmin > GEMM_P) jcmin = GEMM_P;
	      
	      TRSM_KERNEL_LT(jcmin, min_jj, jb, dm1,
#ifdef COMPLEX
			     ZERO,
#endif
			     sb  + jb * jc * COMPSIZE,
			     sbb + jb * (jjs - js) * COMPSIZE, 
			     a + (j + jc + jjs * lda) * COMPSIZE, lda, jc);
	    }
	  }


	/* Update the rows below the panel for this column chunk: copy
	   GEMM_P rows of the panel into sa, then GEMM_KERNEL_N with sbb. */
	for (is = j + jb; is < m; is += GEMM_P){
	  
	  imin = m - is;
	  if (imin > GEMM_P) imin = GEMM_P;

	  GEMM_ITCOPY (jb, imin, offsetA + is * COMPSIZE, lda, sa);
	  
	  GEMM_KERNEL_N(imin, jmin, jb, dm1,
#ifdef COMPLEX
			ZERO,
#endif
			sa, sbb, 	a + (is + js * lda) * COMPSIZE, lda);
	}
      }
    }
  }
  
  /* Final pass: for every panel of jb columns call LASWP_PLUS with the
     index range (j + jb + offset + 1) .. (mn + offset) and ipiv. */
  for (j = 0; j < mn; j += jb) {
    jb = MIN(mn - j, blocking);
    LASWP_PLUS(jb, j + jb + offset + 1, mn + offset, ZERO,
#ifdef COMPLEX
	       ZERO,
#endif
	       a - (offset - j * lda) * COMPSIZE, lda, NULL, 0 , ipiv, 1);
    
  }

  return info;
}
Esempio n. 3
0
/*
 * Threaded blocked driver working on the block column below each
 * diagonal block of args->a.
 *
 * With a single thread, or for order <= GEMM_UNROLL_N * 4, the whole job
 * is delegated to POTRF_L_SINGLE. Otherwise, for every diagonal block of
 * `width` columns starting at `pos`:
 *   1. CNAME is called recursively on the diagonal block; a non-zero
 *      status aborts and is returned increased by pos;
 *   2. if rows remain below the block, TRSM_RCLN is run through
 *      gemm_thread_m on the rest x width block below it;
 *   3. the trailing rest x rest block is updated with HERK_THREAD_LN, or
 *      with syrk_thread/HERK_LN when USE_SIMPLE_THREADED_LEVEL3 is defined.
 *
 * args     supplies n, a, lda and nthreads.
 * range_n  optional; when non-NULL the order is range_n[1] - range_n[0].
 * sa, sb   work buffers forwarded to the callees.
 * range_m and myid are not used.
 *
 * Returns 0 on completion, or the first non-zero status as described.
 */
blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) {

  blas_arg_t sub;
  BLASLONG order, ld, step, pos, width, rest;
  BLASLONG status;
  FLOAT *base, *diag, *below;
  FLOAT minus_one[2] = { -ONE, ZERO};
  int mode;

  /* Precision part of the dispatch mode. */
#if defined(XDOUBLE)
  mode = BLAS_XDOUBLE;
#elif defined(DOUBLE)
  mode = BLAS_DOUBLE;
#else
  mode = BLAS_SINGLE;
#endif

  /* Real/complex part of the dispatch mode. */
#ifdef COMPLEX
  mode |= BLAS_COMPLEX;
#else
  mode |= BLAS_REAL;
#endif

  /* One thread only: the single-threaded routine does everything. */
  if (args -> nthreads == 1) {
    status = POTRF_L_SINGLE(args, NULL, NULL, sa, sb, 0);
    return status;
  }

  base  = (FLOAT *)args -> a;
  ld    = args -> lda;
  order = range_n ? range_n[1] - range_n[0] : args -> n;

  /* Too small to be worth splitting. */
  if (order <= GEMM_UNROLL_N * 4) {
    status = POTRF_L_SINGLE(args, NULL, range_n, sa, sb, 0);
    return status;
  }

  sub.nthreads = args -> nthreads;
  sub.alpha    = minus_one;
  sub.beta     = NULL;
  sub.lda      = ld;
  sub.ldb      = ld;
  sub.ldc      = ld;

  /* Half the order, rounded up to a GEMM_UNROLL_N multiple, capped at GEMM_Q. */
  step = (order / 2 + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1);
  if (step > GEMM_Q) step = GEMM_Q;

  for (pos = 0; pos < order; pos += step) {

    width = order - pos;
    if (width > step) width = step;
    rest = order - pos - width;

    diag = base + (pos + pos * ld) * COMPSIZE;

    /* 1. Diagonal block, recursively. */
    sub.m = width;
    sub.n = width;
    sub.a = diag;

    status = CNAME(&sub, NULL, NULL, sa, sb, 0);
    if (status) return status + pos;

    /* Nothing below the last diagonal block. */
    if (rest <= 0) continue;

    below = base + (pos + width + pos * ld) * COMPSIZE;

    /* 2. Block below the diagonal block. */
    sub.m = rest;
    sub.n = width;
    sub.a = diag;
    sub.b = below;

    gemm_thread_m(mode | BLAS_RSIDE | BLAS_TRANSA_T | BLAS_UPLO,
		  &sub, NULL, NULL, (void *)TRSM_RCLN, sa, sb, args -> nthreads);

    /* 3. Trailing block update. */
    sub.n = rest;
    sub.k = width;
    sub.a = below;
    sub.c = base + (pos + width + (pos + width) * ld) * COMPSIZE;

#ifndef USE_SIMPLE_THREADED_LEVEL3
    HERK_THREAD_LN(&sub, NULL, NULL, sa, sb, 0);
#else
    syrk_thread(mode | BLAS_TRANSA_N | BLAS_TRANSB_T | BLAS_UPLO,
		&sub, NULL, NULL, (void *)HERK_LN, sa, sb, args -> nthreads);
#endif
  }

  return 0;
}
Esempio n. 4
0
/*
 * Threaded blocked driver over the diagonal blocks of args->a.
 *
 * With a single thread, or for order <= GEMM_UNROLL_N * 2, the work is
 * delegated to LAUUM_U_SINGLE. Otherwise, for every diagonal block of
 * `width` columns starting at `pos`, the following are issued in order:
 *   1. HERK_UN   via syrk_thread   (n = pos, k = width), C at the matrix origin;
 *   2. TRMM_RCUN via gemm_thread_m (m = pos, n = width) on the block at column pos;
 *   3. CNAME     recursively on the width x width diagonal block.
 *
 * args     supplies n, a, lda and nthreads.
 * range_n  optional; when non-NULL the order is range_n[1] - range_n[0].
 * sa, sb   work buffers forwarded to the callees.
 * range_m and myid are not used.
 *
 * Always returns 0.
 */
blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) {

  blas_arg_t sub;
  BLASLONG order, ld, step, pos, width;
  FLOAT *base, *diag, *panel;
  FLOAT plus_one[2] = { ONE, ZERO};
  int mode;

  /* Precision part of the dispatch mode. */
#if defined(XDOUBLE)
  mode = BLAS_XDOUBLE;
#elif defined(DOUBLE)
  mode = BLAS_DOUBLE;
#else
  mode = BLAS_SINGLE;
#endif

  /* Real/complex part of the dispatch mode. */
#ifdef COMPLEX
  mode |= BLAS_COMPLEX;
#else
  mode |= BLAS_REAL;
#endif

  /* One thread only: the single-threaded routine does everything. */
  if (args -> nthreads == 1) {
    LAUUM_U_SINGLE(args, NULL, NULL, sa, sb, 0);
    return 0;
  }

  base  = (FLOAT *)args -> a;
  ld    = args -> lda;
  order = range_n ? range_n[1] - range_n[0] : args -> n;

  /* Too small to be worth splitting. */
  if (order <= GEMM_UNROLL_N * 2) {
    LAUUM_U_SINGLE(args, NULL, range_n, sa, sb, 0);
    return 0;
  }

  sub.nthreads = args -> nthreads;
  sub.alpha    = plus_one;
  sub.beta     = NULL;
  sub.lda      = ld;
  sub.ldb      = ld;
  sub.ldc      = ld;

  /* Half the order, rounded up to a GEMM_UNROLL_N multiple, capped at GEMM_Q. */
  step = (order / 2 + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1);
  if (step > GEMM_Q) step = GEMM_Q;

  pos = 0;
  while (pos < order) {

    width = order - pos;
    if (width > step) width = step;

    panel = base + (      pos * ld) * COMPSIZE;
    diag  = base + (pos + pos * ld) * COMPSIZE;

    /* 1. Rank update of the leading pos x pos block with the panel. */
    sub.n = pos;
    sub.k = width;
    sub.a = panel;
    sub.c = base;

    syrk_thread(mode | BLAS_TRANSA_N | BLAS_TRANSB_T,
		&sub, NULL, NULL, (void *)HERK_UN, sa, sb, args -> nthreads);

    /* 2. Triangular multiply of the panel by the diagonal block. */
    sub.m = pos;
    sub.n = width;
    sub.a = diag;
    sub.b = panel;

    gemm_thread_m(mode | BLAS_TRANSA_T | BLAS_RSIDE,
		  &sub, NULL, NULL, (void *)TRMM_RCUN, sa, sb, args -> nthreads);

    /* 3. Diagonal block, recursively. */
    sub.m = width;
    sub.n = width;
    sub.a = diag;

    CNAME(&sub, NULL, NULL, sa, sb, 0);

    pos += step;
  }

  return 0;
}
Esempio n. 5
0
/* Reentrant free hook: the context r is ignored and ptr is passed on to CNAME( free ). */
void _free_r( struct _reent* r, void* ptr )
{
    (void)r;

    CNAME( free )( ptr );
}
Esempio n. 6
0
/*
 * Reentrant realloc hook: the context r is ignored; ptr and size are
 * passed on to CNAME( realloc ) and its result is returned unchanged.
 */
void* _realloc_r( struct _reent* r, void* ptr, size_t size )
{
    void* resized;

    (void)r;

    resized = CNAME( realloc )( ptr, size );
    return resized;
}
Esempio n. 7
0
/*
 * Reentrant malloc hook: the context r is ignored; size is passed on to
 * CNAME( malloc ) and its result is returned unchanged.
 */
void* _malloc_r( struct _reent* r, size_t size )
{
    void* block;

    (void)r;

    block = CNAME( malloc )( size );
    return block;
}
Esempio n. 8
0
/*
 * Reentrant calloc hook: the context r is ignored; nelem and elem_size are
 * passed on to CNAME( calloc ) and its result is returned unchanged.
 */
void* _calloc_r( struct _reent* r, size_t nelem, size_t elem_size )
{
    void* block;

    (void)r;

    block = CNAME( calloc )( nelem, elem_size );
    return block;
}
Esempio n. 9
0
int main(int argc, char *argv[])
{
    srand(time(0));

    int m = atoi(argv[1]);
    int n = atoi(argv[2]);
    int i, j;
    DOUBLE *x;
    DOUBLE *y;
    DOUBLE *A;
    DOUBLE *t;
	DOUBLE *buffer;
    int incx = 1;
    int incy = 1;
    DOUBLE alpha, beta;
    int lda = m;
    alpha = rand()/1.0/RAND_MAX - 0.5;
    beta = 1;//rand()/1.0/RAND_MAX - 0.5;
    x = (DOUBLE*)malloc(sizeof(DOUBLE)*n);
    y = (DOUBLE*)malloc(sizeof(DOUBLE)*m);
    t = (DOUBLE*)malloc(sizeof(DOUBLE)*m);
    A = (DOUBLE*)malloc(sizeof(DOUBLE)*m*n);
	buffer = (DOUBLE*)malloc(sizeof(DOUBLE)*m*n);

    for (i = 0; i < n; i++)
        x[i] = rand()/1.0/RAND_MAX - 0.5;
    for (i = 0; i < m; i++)
        t[i] = rand()/1.0/RAND_MAX - 0.5;
    for (i = 0; i < m*n; i++)
        A[i] = rand()/1.0/RAND_MAX - 0.5;

    //y = alpha*A*x + beta*y    //m row //n col

    unsigned long long int t1,t2,t3,t4,t5;
    //printf("acm\n");//ACML version
    memcpy(y,t,sizeof(DOUBLE)*m);
    clock_gettime(CLOCK_MONOTONIC, &begin);
    sgemv('N', m, n, alpha, A, lda, x, incx, beta, y, incy);
    clock_gettime(CLOCK_MONOTONIC, &end);
    t1 =  1000000000L*(end.tv_sec - begin.tv_sec) + end.tv_nsec - begin.tv_nsec;

    //printf("one\n");//Native version with checksum
    memcpy(y,t,sizeof(DOUBLE)*m);
    clock_gettime(CLOCK_MONOTONIC, &begin);
    one('N', m, n, alpha, A, lda, x, incx, beta, y, incy);
    clock_gettime(CLOCK_MONOTONIC, &end);
    t2 =  1000000000L*(end.tv_sec - begin.tv_sec) + end.tv_nsec - begin.tv_nsec;

    //printf("opc\n");//OpenBLAS version with checksum
    memcpy(y,t,sizeof(DOUBLE)*m);
    clock_gettime(CLOCK_MONOTONIC, &begin);
    //CNAME('N', m, n, alpha, A, lda, x, incx, beta, y, incy);
	openCkm(m,n,0,alpha,A,lda,x,incx,y,incy,buffer);
    clock_gettime(CLOCK_MONOTONIC, &end);
    t3 =  1000000000L*(end.tv_sec - begin.tv_sec) + end.tv_nsec - begin.tv_nsec;

    //printf("opn\n");//OpenBLAS version
    memcpy(y,t,sizeof(DOUBLE)*m);
    clock_gettime(CLOCK_MONOTONIC, &begin);
	CNAME(m,n,0,alpha,A,lda,x,incx,y,incy,buffer);
    clock_gettime(CLOCK_MONOTONIC, &end);
    t4 =  1000000000L*(end.tv_sec - begin.tv_sec) + end.tv_nsec - begin.tv_nsec;

    /*printf("opo\n");//OpenBLAS version (with new kernel)
    memcpy(y,t,sizeof(DOUBLE)*m);
    clock_gettime(CLOCK_MONOTONIC, &begin);
	CNAME_3(m,n,0,alpha,A,lda,x,incx,y,incy,buffer);
    clock_gettime(CLOCK_MONOTONIC, &end);
    t5 =  1000000000L*(end.tv_sec - begin.tv_sec) + end.tv_nsec - begin.tv_nsec;*/


    printf("acm%16lld\n",t1);
    printf("one%16lld\n",t2);
    printf("opc%16lld\n",t3);
    printf("opn%16lld\n",t4);
    //printf("opo%16lld\n",t5);
    return 0;
}
Esempio n. 10
0
/*
 * Threaded blocked factorization with row interchanges (the GETF2 /
 * GETRF_SINGLE / LASWP_PLUS names suggest GETRF-style LU -- TODO confirm).
 *
 * Outline, as visible in this function:
 *   - the first panel is factored by a recursive CNAME call;
 *   - in the main loop the update for the current panel is split by
 *     columns: the first `width` columns are processed on the calling
 *     thread with inner_basic_thread, the rest are queued as
 *     inner_basic_thread jobs and started with exec_blas_async; the next
 *     panel is factored with GETRF_SINGLE on the calling thread;
 *   - the calling thread then spins on flag[] until every queued job has
 *     cleared its entry (presumably done inside inner_basic_thread, which
 *     is not visible here);
 *   - a final loop replays the panel-width sequence and calls LASWP_PLUS
 *     through blas_level1_thread for each panel.
 *
 * args    : m, n, a, lda, nthreads are read; args->c is used as the
 *           blasint pivot array (ipiv).
 * range_n : optional column window; when given the routine works on the
 *           trailing part starting at (range_n[0], range_n[0]) and keeps
 *           range_n[0] in `offset`.
 * sa, sb  : work buffers; sbb is a second area carved out behind
 *           GEMM_PQ*GEMM_PQ*COMPSIZE elements of sb.
 * range_m, myid : not used here.
 *
 * Returns 0 for an empty problem, the GETF2 result on the small path, and
 * otherwise the first non-zero panel status (shifted by the panel start
 * for panels after the first).
 */
blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) {

  BLASLONG m, n, mn, lda, offset;
  BLASLONG i, is, bk, init_bk, next_bk, range_n_new[2];
  blasint *ipiv, iinfo, info;
  int mode;
  blas_arg_t newarg;
  FLOAT *a, *sbb;
  FLOAT dummyalpha[2] = {ZERO, ZERO};

  blas_queue_t queue[MAX_CPU_NUMBER];
  BLASLONG range[MAX_CPU_NUMBER + 1];

  BLASLONG width, nn, num_cpu;

  /* One completion flag per job, spaced CACHE_LINE_SIZE entries apart. */
  volatile BLASLONG flag[MAX_CPU_NUMBER * CACHE_LINE_SIZE] __attribute__((aligned(128)));

  /* Dispatch mode: precision and real/complex, chosen at compile time. */
#ifndef COMPLEX
#ifdef XDOUBLE
  mode  =  BLAS_XDOUBLE | BLAS_REAL;
#elif defined(DOUBLE)
  mode  =  BLAS_DOUBLE  | BLAS_REAL;
#else
  mode  =  BLAS_SINGLE  | BLAS_REAL;
#endif  
#else
#ifdef XDOUBLE
  mode  =  BLAS_XDOUBLE | BLAS_COMPLEX;
#elif defined(DOUBLE)
  mode  =  BLAS_DOUBLE  | BLAS_COMPLEX;
#else
  mode  =  BLAS_SINGLE  | BLAS_COMPLEX;
#endif  
#endif

  m    = args -> m;
  n    = args -> n;
  a    = (FLOAT *)args -> a;
  lda  = args -> lda;
  ipiv = (blasint *)args -> c;
  offset = 0;

  /* Restrict the problem to the requested column window, if any. */
  if (range_n) {
    m     -= range_n[0];
    n      = range_n[1] - range_n[0];
    offset = range_n[0];
    a     += range_n[0] * (lda + 1) * COMPSIZE;
  }

  if (m <= 0 || n <= 0) return 0;
  
  newarg.c   = ipiv;
  newarg.lda = lda;
  newarg.common = NULL;
  newarg.nthreads = args -> nthreads;

  mn = MIN(m, n);

  /* Initial panel width: mn/2 rounded up to a GEMM_UNROLL_N multiple
     (assumes a power of two -- TODO confirm), capped at GEMM_Q. */
  init_bk = (mn / 2 + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1);
  if (init_bk > GEMM_Q) init_bk = GEMM_Q;

  /* Too small for blocking: hand over to GETF2. */
  if (init_bk <= GEMM_UNROLL_N) {
    info = GETF2(args, NULL, range_n, sa, sb, 0);
    return info;
  }

  /* FORMULA1/FORMULA2 are defined outside this block; here they yield a
     column width and an alternative panel width respectively. */
  width = FORMULA1(m, n, 0, init_bk, args -> nthreads);
  width = (width + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1);
  if (width > n - init_bk) width = n - init_bk;

  /* Shrink the first panel when FORMULA1's width is below it. */
  if (width < init_bk) {
    long temp;

    temp = FORMULA2(m, n, 0, init_bk, args -> nthreads);
    temp = (temp + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1);

    if (temp < GEMM_UNROLL_N) temp = GEMM_UNROLL_N;
    if (temp < init_bk) init_bk = temp;

  }

  next_bk = init_bk;
  bk      = init_bk;

  range_n_new[0] = offset;
  range_n_new[1] = offset + bk;
  
  /* Factor the first panel recursively, then copy its bk x bk block to sb. */
  info   = CNAME(args, NULL, range_n_new, sa, sb, 0);
  
  TRSM_ILTCOPY(bk, bk, a, lda, 0, sb);

  is = 0;
  num_cpu = 0;

  sbb = (FLOAT *)((((long)(sb + GEMM_PQ * GEMM_PQ * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B);

  while (is < mn) {

    /* Choose how many columns the calling thread handles and how wide the
       next panel will be. */
    width  = FORMULA1(m, n, is, bk, args -> nthreads);
    width = (width + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1);
    
    if (width < bk) {

      next_bk = FORMULA2(m, n, is, bk, args -> nthreads);
      next_bk = (next_bk + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1);

      if (next_bk > bk) next_bk = bk;
#if 0
      if (next_bk < GEMM_UNROLL_N) next_bk = MIN(GEMM_UNROLL_N, mn - bk - is);
#else
      if (next_bk < GEMM_UNROLL_N) next_bk = MAX(GEMM_UNROLL_N, mn - bk - is);
#endif

      width = next_bk;
    }
    
    /* Never plan past the last pivot column. */
    if (width > mn - is - bk) {
      next_bk = mn - is - bk;
      width   = next_bk;
    }

    nn = n - bk - is;
    if (width > nn) width = nn;

    /* Wait for the jobs queued in the previous iteration (queue[1..]). */
    if (num_cpu > 1)  exec_blas_async_wait(num_cpu - 1, &queue[1]);

    /* range[0..1] is the calling thread's own column slice. */
    range[0] = 0;
    range[1] = width;
    
    num_cpu = 1;
    nn -= width;
    
    /* Arguments shared by every inner_basic_thread call of this step;
       note that ldb carries is + offset rather than a leading dimension. */
    newarg.a   = sb;
    newarg.b   = a + (is + is * lda) * COMPSIZE;
    newarg.d   = (void *)flag;
    newarg.m   = m - bk - is;
    newarg.n   = n - bk - is;
    newarg.k   = bk;
    newarg.ldb = is + offset;
    
    /* Split the remaining nn columns over queue entries 1..num_cpu-1. */
    while (nn > 0){
      
      width  = blas_quickdivide(nn + args -> nthreads - num_cpu, args -> nthreads - num_cpu);
      
      nn -= width;
      if (nn < 0) width = width + nn;
      
      range[num_cpu + 1] = range[num_cpu] + width;
      
      queue[num_cpu].mode    = mode;
      //queue[num_cpu].routine = inner_advanced_thread;
      queue[num_cpu].routine = (void *)inner_basic_thread;
      queue[num_cpu].args    = &newarg;
      queue[num_cpu].range_m = NULL;
      queue[num_cpu].range_n = &range[num_cpu];
      queue[num_cpu].sa      = NULL;
      queue[num_cpu].sb      = NULL;
      queue[num_cpu].next    = &queue[num_cpu + 1];
      /* Mark the job as pending; the spin loop below waits for 0. */
      flag[num_cpu * CACHE_LINE_SIZE] = 1;

      num_cpu ++;
    }
    
    queue[num_cpu - 1].next = NULL;

    is += bk;
    
    /* NOTE(review): the next panel width is bounded by n - is here, while
       the final pass below uses mn - is -- confirm this is intended when
       m < n. */
    bk = n - is;
    if (bk > next_bk) bk = next_bk;
    
    range_n_new[0] = offset + is;
    range_n_new[1] = offset + is + bk;
    
    if (num_cpu > 1) {

      /* Start the queued jobs, then do the calling thread's share. */
      exec_blas_async(1, &queue[1]);
    
#if 0
      inner_basic_thread(&newarg, NULL, &range[0], sa, sbb, 0);

      iinfo = GETRF_SINGLE(args, NULL, range_n_new, sa, sbb, 0);
#else

      /* When the own slice is at least 4*bk wide, update only the first bk
         columns, factor the next panel, and update the remainder after. */
      if (range[1] >= bk * 4) {

	BLASLONG myrange[2];

	myrange[0] = 0;
	myrange[1] = bk;

	inner_basic_thread(&newarg, NULL, &myrange[0], sa, sbb, -1);

	iinfo = GETRF_SINGLE(args, NULL, range_n_new, sa, sbb, 0);

	myrange[0] = bk;
	myrange[1] = range[1];

	inner_basic_thread(&newarg, NULL, &myrange[0], sa, sbb, -1);

      } else {

	inner_basic_thread(&newarg, NULL, &range[0], sa, sbb, -1);

	iinfo = GETRF_SINGLE(args, NULL, range_n_new, sa, sbb, 0);
      }

#endif

      /* Busy-wait until every queued job has cleared its flag. */
      for (i = 1; i < num_cpu; i ++) while (flag[i * CACHE_LINE_SIZE]) {};
      
      /* Copy the freshly factored bk x bk block to sb for the next step. */
      TRSM_ILTCOPY(bk, bk, a + (is +  is * lda) * COMPSIZE, lda, 0, sb);
      
    } else {

      /* Nothing was queued: do everything on the calling thread. */
      inner_basic_thread(&newarg, NULL, &range[0], sa, sbb, -1);
      
      iinfo = GETRF_SINGLE(args, NULL, range_n_new, sa, sbb, 0);
    }

      /* Keep only the first non-zero status, shifted by the panel start. */
      if (iinfo && !info) info = iinfo + is;
      
  }
  
  /* Final pass: recompute the same sequence of panel widths and call
     LASWP_PLUS (through blas_level1_thread) for each panel. */
  next_bk = init_bk;
  bk      = init_bk;
  
  is = 0;
  
  while (is < mn) {
    
    bk = mn - is;
    if (bk > next_bk) bk = next_bk;
    
    width  = FORMULA1(m, n, is, bk, args -> nthreads);
    width = (width + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1);

    if (width < bk) {
      next_bk = FORMULA2(m, n, is, bk, args -> nthreads);
      next_bk = (next_bk + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1);

      if (next_bk > bk) next_bk = bk;
#if 0
      if (next_bk < GEMM_UNROLL_N) next_bk = MIN(GEMM_UNROLL_N, mn - bk - is);
#else
      if (next_bk < GEMM_UNROLL_N) next_bk = MAX(GEMM_UNROLL_N, mn - bk - is);
#endif
    }

    if (width > mn - is - bk) {
      next_bk = mn - is - bk;
      width   = next_bk;
    }

    blas_level1_thread(mode, bk, is + bk + offset + 1, mn + offset, (void *)dummyalpha, 
		       a + (- offset + is * lda) * COMPSIZE, lda, NULL, 0,
		       ipiv, 1, (void *)LASWP_PLUS, args -> nthreads);
    
    is += bk;
  }
  
  return info;
}
Esempio n. 11
0
/*
 * Threaded blocked factorization with row interchanges (the GETF2 /
 * GETRF_SINGLE / LASWP_PLUS names suggest GETRF-style LU -- TODO confirm).
 *
 * Outline, as visible in this function:
 *   - the first panel is factored by a recursive CNAME call;
 *   - in the main loop the first `width` columns of the update are handled
 *     on the calling thread with inner_basic_thread, while the remaining
 *     rows and columns are cut into range_M / range_N slices and queued as
 *     inner_advanced_thread jobs sharing the job[] array; the next panel
 *     is factored with GETRF_SINGLE on the calling thread;
 *   - the calling thread spins on flag[] until every queued job has
 *     cleared its entry (presumably done by the worker routine, which is
 *     not visible here);
 *   - a final loop replays the panel-width sequence and calls LASWP_PLUS
 *     through blas_level1_thread for each panel.
 *
 * args    : m, n, a, lda, nthreads are read; args->c is used as the
 *           blasint pivot array (ipiv).
 * range_n : optional column window; when given the routine works on the
 *           trailing part starting at (range_n[0], range_n[0]) and keeps
 *           range_n[0] in `offset`.
 * sa, sb  : work buffers; sbb is a second area carved out behind
 *           bk*bk*COMPSIZE elements of sb (bk being the first panel width).
 * range_m, myid : not used here.
 *
 * Returns 0 for an empty problem, the GETF2 result on the small path, and
 * otherwise the first non-zero panel status (shifted by the panel start
 * for panels after the first).
 */
blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) {

  BLASLONG m, n, mn, lda, offset;
  BLASLONG init_bk, next_bk, range_n_mine[2], range_n_new[2];
  blasint *ipiv, iinfo, info;
  int mode;
  blas_arg_t newarg;

  FLOAT *a, *sbb;
  FLOAT dummyalpha[2] = {ZERO, ZERO};

  blas_queue_t queue[MAX_CPU_NUMBER];

  BLASLONG range_M[MAX_CPU_NUMBER + 1];
  BLASLONG range_N[MAX_CPU_NUMBER + 1];

  job_t        job[MAX_CPU_NUMBER];

  BLASLONG width, nn, mm;
  BLASLONG i, j, k, is, bk;

  BLASLONG num_cpu;

  /* One completion flag per job, spaced CACHE_LINE_SIZE entries apart. */
  volatile BLASLONG flag[MAX_CPU_NUMBER * CACHE_LINE_SIZE] __attribute__((aligned(128)));

  /* Dispatch mode: precision and real/complex, chosen at compile time. */
#ifndef COMPLEX
#ifdef XDOUBLE
  mode  =  BLAS_XDOUBLE | BLAS_REAL;
#elif defined(DOUBLE)
  mode  =  BLAS_DOUBLE  | BLAS_REAL;
#else
  mode  =  BLAS_SINGLE  | BLAS_REAL;
#endif  
#else
#ifdef XDOUBLE
  mode  =  BLAS_XDOUBLE | BLAS_COMPLEX;
#elif defined(DOUBLE)
  mode  =  BLAS_DOUBLE  | BLAS_COMPLEX;
#else
  mode  =  BLAS_SINGLE  | BLAS_COMPLEX;
#endif  
#endif

  m    = args -> m;
  n    = args -> n;
  a    = (FLOAT *)args -> a;
  lda  = args -> lda;
  ipiv = (blasint *)args -> c;
  offset = 0;

  /* Restrict the problem to the requested column window, if any. */
  if (range_n) {
    m     -= range_n[0];
    n      = range_n[1] - range_n[0];
    offset = range_n[0];
    a     += range_n[0] * (lda + 1) * COMPSIZE;
  }

  if (m <= 0 || n <= 0) return 0;
  
  newarg.c   = ipiv;
  newarg.lda = lda;
  /* The job array is handed to the workers through newarg.common. */
  newarg.common   = (void *)job;

  info = 0;

  mn = MIN(m, n);

  /* Initial panel width: mn/2 rounded up to a GEMM_UNROLL_N multiple
     (assumes a power of two -- TODO confirm), capped at GEMM_Q. */
  init_bk = (mn / 2 + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1);
  if (init_bk > GEMM_Q) init_bk = GEMM_Q;

  /* Too small for blocking: hand over to GETF2. */
  if (init_bk <= GEMM_UNROLL_N) {
    info = GETF2(args, NULL, range_n, sa, sb, 0);
    return info;
  }

  next_bk = init_bk;

  bk = mn;
  if (bk > next_bk) bk = next_bk;
  
  range_n_new[0] = offset;
  range_n_new[1] = offset + bk;
  
  /* Factor the first panel recursively, then copy its bk x bk block to sb. */
  iinfo   = CNAME(args, NULL, range_n_new, sa, sb, 0);
  
  if (iinfo && !info) info = iinfo;
  
  TRSM_ILTCOPY(bk, bk, a, lda, 0, sb);

  sbb = (FLOAT *)((((long)(sb + bk * bk * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B);
  
  is = 0;
  num_cpu = 0;

  while (is < mn) {
    
    /* FORMULA1/FORMULA2 are defined outside this block; here they yield
       the calling thread's column width and the next panel width. */
    width  = (FORMULA1(m, n, is, bk, args -> nthreads) + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1);
    if (width > mn - is - bk) width = mn - is - bk;

    if (width < bk) {
      /* NOTE(review): this rounding adds GEMM_UNROLL_N, not
         GEMM_UNROLL_N - 1 as on the width line above -- confirm intent. */
      next_bk = (FORMULA2(m, n, is, bk, args -> nthreads) + GEMM_UNROLL_N) & ~(GEMM_UNROLL_N - 1);
      
      if (next_bk > bk) next_bk = bk;

      width = next_bk;
      if (width > mn - is - bk) width = mn - is - bk;
    }
    
    /* Wait for the jobs queued in the previous iteration. */
    if (num_cpu > 0) exec_blas_async_wait(num_cpu, &queue[0]);

    mm = m - bk - is;
    nn = n - bk - is;

    /* Arguments shared by all workers of this step; note that ldb carries
       is + offset rather than a leading dimension. */
    newarg.a   = sb;
    newarg.b   = a + (is + is * lda) * COMPSIZE;
    newarg.d   = (void *)flag;
    newarg.m   = mm;
    newarg.n   = nn;
    newarg.k   = bk;
    newarg.ldb = is + offset;
    
    nn -= width;

    /* Column slice kept by the calling thread. */
    range_n_mine[0] = 0;
    range_n_mine[1] = width;

    /* Queued jobs start after the calling thread's columns. */
    range_N[0] = width;
    range_M[0] = 0;

    num_cpu  = 0;
    
    /* Cut the remaining nn columns and mm rows into one slice pair per
       job. The larger of the two dimensions is split first; when one
       dimension runs out, the last job takes all that is left of the other.
       NOTE(review): the divisor nthreads - num_cpu - 1 is not checked
       against zero here -- confirm blas_quickdivide / callers cover it. */
    while (nn > 0){
      
      if (mm >= nn) {

	width  = blas_quickdivide(nn + args -> nthreads - num_cpu, args -> nthreads - num_cpu - 1);
	if (nn < width) width = nn;
	nn -= width;
	range_N[num_cpu + 1] = range_N[num_cpu] + width;
	
	width  = blas_quickdivide(mm + args -> nthreads - num_cpu, args -> nthreads - num_cpu - 1);
	if (mm < width) width = mm;
	if (nn <=    0) width = mm;
	mm -= width;
	range_M[num_cpu + 1] = range_M[num_cpu] + width;

      } else {

	width  = blas_quickdivide(mm + args -> nthreads - num_cpu, args -> nthreads - num_cpu - 1);
	if (mm < width) width = mm;
	mm -= width;
	range_M[num_cpu + 1] = range_M[num_cpu] + width;

	width  = blas_quickdivide(nn + args -> nthreads - num_cpu, args -> nthreads - num_cpu - 1);
	if (nn < width) width = nn;
	if (mm <=    0) width = nn;
	nn -= width;
	range_N[num_cpu + 1] = range_N[num_cpu] + width;
	
      }

      /* Every job gets its own row slice and the whole range_N table. */
      queue[num_cpu].mode    = mode;
      queue[num_cpu].routine = inner_advanced_thread;
      queue[num_cpu].args    = &newarg;
      queue[num_cpu].range_m = &range_M[num_cpu];
      queue[num_cpu].range_n = &range_N[0];
      queue[num_cpu].sa      = NULL;
      queue[num_cpu].sb      = NULL;
      queue[num_cpu].next    = &queue[num_cpu + 1];
      /* Mark the job as pending; the spin loop below waits for 0. */
      flag[num_cpu * CACHE_LINE_SIZE] = 1;
      
      num_cpu ++;

    }
    
    newarg.nthreads = num_cpu;
    
    /* Reset the per-job "working" markers before the jobs are started. */
    if (num_cpu > 0) {
      for (j = 0; j < num_cpu; j++) {
	for (i = 0; i < num_cpu; i++) {
	  for (k = 0; k < DIVIDE_RATE; k++) {
	    job[j].working[i][CACHE_LINE_SIZE * k] = 0;
	  }
	}
      }
    }

    is += bk;

    bk = mn - is;
    if (bk > next_bk) bk = next_bk;
    
    range_n_new[0] = offset + is;
    range_n_new[1] = offset + is + bk;

    if (num_cpu > 0) {

      queue[num_cpu - 1].next = NULL;
      
      /* Start the queued jobs, then do the calling thread's share and
         factor the next panel. */
      exec_blas_async(0, &queue[0]);
      
      inner_basic_thread(&newarg, NULL, range_n_mine, sa, sbb, -1);
      
      iinfo   = GETRF_SINGLE(args, NULL, range_n_new, sa, sbb, 0);
      
      /* Keep only the first non-zero status, shifted by the panel start. */
      if (iinfo && !info) info = iinfo + is;

      /* Busy-wait until every queued job has cleared its flag. */
      for (i = 0; i < num_cpu; i ++) while (flag[i * CACHE_LINE_SIZE]) {};

      /* Copy the freshly factored bk x bk block to sb for the next step. */
      TRSM_ILTCOPY(bk, bk, a + (is +  is * lda) * COMPSIZE, lda, 0, sb);

    } else {

      /* Nothing was queued: do everything on the calling thread. */
      inner_basic_thread(&newarg, NULL, range_n_mine, sa, sbb, -1);

      iinfo   = GETRF_SINGLE(args, NULL, range_n_new, sa, sbb, 0);

      if (iinfo && !info) info = iinfo + is;
    
    }
    
  }
  
  /* Final pass: recompute the same sequence of panel widths and call
     LASWP_PLUS (through blas_level1_thread) for each panel. */
  next_bk = init_bk;
  is = 0;
  
  while (is < mn) {
    
    bk = mn - is;
    if (bk > next_bk) bk = next_bk;
    
    width  = (FORMULA1(m, n, is, bk, args -> nthreads) + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1);
    if (width > mn - is - bk) width = mn - is - bk;

    if (width < bk) {
      next_bk = (FORMULA2(m, n, is, bk, args -> nthreads) + GEMM_UNROLL_N) & ~(GEMM_UNROLL_N - 1);
      if (next_bk > bk) next_bk = bk;
    }

    blas_level1_thread(mode, bk, is + bk + offset + 1, mn + offset, (void *)dummyalpha, 
		       a + (- offset + is * lda) * COMPSIZE, lda, NULL, 0,
		       ipiv, 1, (void *)LASWP_PLUS, args -> nthreads);
    
    is += bk;
  }
  
  return info;
}
Esempio n. 12
0
/*
 * Single-threaded, recursive blocked routine that works on the part of
 * args->a on and below the diagonal (the POTF2_L / SYRK_KERNEL_L names
 * suggest a lower-triangular Cholesky factorization -- TODO confirm).
 *
 * For every diagonal block of bk columns starting at j:
 *   - CNAME recurses on the diagonal block (bottoming out in POTF2_L);
 *   - the rows below are solved against it with TRSM_KERNEL, in strips of
 *     GEMM_P rows;
 *   - the trailing block is updated with SYRK_KERNEL_L, first for a tile
 *     of min_j columns that is fused with the solve, then for any further
 *     column tiles of REAL_GEMM_R columns.
 *
 * args    : n, a, lda are read.
 * range_n : optional window; when given the routine works on the
 *           (range_n[1] - range_n[0]) square block starting at
 *           (range_n[0], range_n[0]).
 * sa, sb  : work buffers; sb2 is a second area carved out behind
 *           GEMM_PQ*GEMM_Q*COMPSIZE*SIZE bytes of sb.
 * range_m, myid : not used here.
 *
 * Returns the POTF2_L result on the small path; otherwise the first
 * non-zero status of a recursive call plus the block start j, or 0.
 */
blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) {

  BLASLONG  n, lda;
  FLOAT *a;

  BLASLONG info;
  BLASLONG bk, j, blocking;
  BLASLONG is, min_i;
  BLASLONG js, min_j;
  BLASLONG range_N[2];

  /* Second packed buffer, placed behind the first region of sb;
     GEMM_ALIGN is presumably an alignment mask -- TODO confirm. */
  FLOAT *sb2 = (FLOAT *)((((BLASLONG)sb
		    + GEMM_PQ  * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN)
		  + GEMM_OFFSET_B);

#ifdef SHARED_ARRAY
  FLOAT *aa;
#endif

  n      = args -> n;
  a      = (FLOAT *)args -> a;
  lda    = args -> lda;
  
  /* Restrict the problem to the requested diagonal window, if any. */
  if (range_n) {
    n      = range_n[1] - range_n[0];
    a     += range_n[0] * (lda + 1) * COMPSIZE;
  }

  /* Small case: no blocking, delegate to POTF2_L. */
  if (n <= DTB_ENTRIES / 2) {
    info = POTF2_L(args, NULL, range_n, sa, sb, 0);
    return info;
  }

  /* Block width: GEMM_Q, or a quarter of n (rounded down) when n is small. */
  blocking = GEMM_Q;
  if (n <= 4 * GEMM_Q) blocking = n / 4;

  for (j = 0; j < n; j += blocking) {
    bk = n - j;
    if (bk > blocking) bk = blocking;

    /* The recursive call takes its window in the coordinates of args->a,
       so the caller's own window start has to be added back. */
    if (!range_n) {
      range_N[0] = j;
      range_N[1] = j + bk;
    } else {
      range_N[0] = range_n[0] + j;
      range_N[1] = range_n[0] + j + bk;
    }
    info = CNAME(args, NULL, range_N, sa, sb, 0);
    if (info) return info + j;

    if (n - j - bk > 0) {
     
      /* Copy the bk x bk diagonal block into sb for the solves below. */
      TRSM_OLTCOPY(bk, bk, a + (j + j * lda) * COMPSIZE, lda, 0, sb);

      /* First tile */
      min_j = n - j - bk;
      if (min_j > REAL_GEMM_R) min_j = REAL_GEMM_R;

      /* Strips of GEMM_P rows below the diagonal block: solve each strip,
         then apply it to the first column tile of the trailing block. */
      for (is = j + bk; is < n; is += GEMM_P) {
	min_i = n - is;
	if (min_i > GEMM_P) min_i = GEMM_P;

#ifdef SHARED_ARRAY

	/* Strips that fall inside the first tile are copied straight into
	   sb2; the others go through sa. */
	if (is < j + bk + min_j) {
	  aa = sb2 + bk * (is - j - bk) * COMPSIZE;
	} else {
	  aa = sa;
	}
	
	GEMM_ITCOPY(bk, min_i, a + (is + j * lda) * COMPSIZE, lda, aa);
	
	TRSM_KERNEL(min_i, bk, bk, dm1,
#ifdef COMPLEX
		    ZERO,
#endif
		    aa,
		    sb,
		    a + (is + j * lda) * COMPSIZE, lda, 0);
	
	SYRK_KERNEL_L(min_i, min_j, bk, dm1,
		      aa,
		      sb2,
		      a + (is + (j + bk) * lda) * COMPSIZE, lda,
		      is - j - bk);

#else

	GEMM_ITCOPY(bk, min_i, a + (is + j * lda) * COMPSIZE, lda, sa);

	TRSM_KERNEL(min_i, bk, bk, dm1,
#ifdef COMPLEX
		    ZERO,
#endif
		    
		    sa,
		    sb,
		    a + (is + j * lda) * COMPSIZE, lda, 0);

	/* Strips inside the first tile are additionally copied into sb2
	   after the solve, for use as the second operand. */
	if (is < j + bk + min_j) {
	  GEMM_OTCOPY(bk, min_i, a + (is + j * lda) * COMPSIZE, lda, sb2 + bk * (is - j - bk) * COMPSIZE);
	}

	SYRK_KERNEL_L(min_i, min_j, bk, dm1,
		      sa,
		      sb2,
		      a + (is + (j + bk) * lda) * COMPSIZE, lda,
		      is - j - bk);
#endif
      }
      
      /* Remaining column tiles of the trailing block, REAL_GEMM_R columns
         at a time; min_j still holds the width of the first tile here. */
      for(js = j + bk + min_j; js < n; js += REAL_GEMM_R){
	min_j = n - js;
	if (min_j > REAL_GEMM_R) min_j = REAL_GEMM_R;

	GEMM_OTCOPY(bk, min_j, a + (js + j * lda) * COMPSIZE, lda, sb2);
	
	for (is = js; is < n; is += GEMM_P) {
	  min_i = n - is;
	  if (min_i > GEMM_P) min_i = GEMM_P;
	  
#ifdef SHARED_ARRAY

	  /* Reuse the copy already in sb2 when the strip lies strictly
	     inside the tile; otherwise copy it to sa. */
	  if (is + min_i < js + min_j) {
	    aa = sb2 + bk * (is - js) * COMPSIZE;
	  } else {
	    GEMM_ITCOPY(bk, min_i, a + (is + j * lda) * COMPSIZE, lda, sa);
	    aa = sa;
	  }
	  
	  SYRK_KERNEL_L(min_i, min_j, bk, dm1,
			aa,
			sb2,
			a + (is + js * lda) * COMPSIZE, lda,
			is - js);

#else

	  GEMM_ITCOPY(bk, min_i, a + (is + j * lda) * COMPSIZE, lda, sa);
	  
	  /* NOTE(review): the last argument is -is + js here, whereas the
	     SHARED_ARRAY branch above passes is - js and the first tile
	     passes is - j - bk -- confirm which sign SYRK_KERNEL_L expects. */
	  SYRK_KERNEL_L(min_i, min_j, bk, dm1,
			sa,
			sb2,
			a + (is + js * lda) * COMPSIZE, lda,
			- is + js);
#endif

	}
      }

    }
    
  }

  return 0;
}
Esempio n. 13
0
/*
 * Single-threaded, recursive blocked routine that works on the part of
 * args->a on and right of the diagonal (the POTF2_U / SYRK_KERNEL_U names
 * suggest an upper-triangular Cholesky factorization -- TODO confirm).
 *
 * For every diagonal block of bk columns starting at j:
 *   - CNAME recurses on the diagonal block (bottoming out in POTF2_U);
 *   - the columns to its right are processed in tiles of REAL_GEMM_R
 *     columns: each tile is copied to sb2 and solved against the diagonal
 *     block with TRSM_KERNEL, GEMM_UNROLL_N columns at a time;
 *   - the trailing block is then updated with SYRK_KERNEL_U, strip by
 *     strip.
 *
 * args    : n, a, lda are read.
 * range_n : optional window; when given the routine works on the
 *           (range_n[1] - range_n[0]) square block starting at
 *           (range_n[0], range_n[0]).
 * sa, sb  : work buffers; sb2 is a second area carved out behind
 *           GEMM_PQ*GEMM_Q*COMPSIZE*SIZE bytes of sb.
 * range_m, myid : not used here.
 *
 * Returns the POTF2_U result on the small path; otherwise the first
 * non-zero status of a recursive call plus the block start j, or 0.
 */
blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) {

  BLASLONG  n, lda;
  FLOAT *a;

  BLASLONG info;
  BLASLONG bk, blocking;
  BLASLONG is, min_i;
  BLASLONG jjs, min_jj;
  BLASLONG range_N[2];
  BLASLONG j, js, min_j;

#ifdef SHARED_ARRAY
  FLOAT *aa;
#endif
  
  /* Second packed buffer, placed behind the first region of sb;
     GEMM_ALIGN is presumably an alignment mask -- TODO confirm. */
  FLOAT *sb2 = (FLOAT *)((((BLASLONG)sb
		    + GEMM_PQ  * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN)
		  + GEMM_OFFSET_B);

  n      = args -> n;
  a      = (FLOAT *)args -> a;
  lda    = args -> lda;

  /* Restrict the problem to the requested diagonal window, if any. */
  if (range_n) {
    n      = range_n[1] - range_n[0];
    a     += range_n[0] * (lda + 1) * COMPSIZE;
  }

  /* Small case: no blocking, delegate to POTF2_U. */
  if (n <= DTB_ENTRIES / 2) {
    info = POTF2_U(args, NULL, range_n, sa, sb, 0);
    return info;
  }
  
  /* Block width: GEMM_Q, or a quarter of n (rounded up) when n is small. */
  blocking = GEMM_Q;
  if (n <= 4 * GEMM_Q) blocking = (n + 3) / 4;
  
  for (j = 0; j < n; j += blocking) {
    bk = n - j;
    if (bk > blocking) bk = blocking;
    
    /* The recursive call takes its window in the coordinates of args->a,
       so the caller's own window start has to be added back. */
    if (!range_n) {
      range_N[0] = j;
      range_N[1] = j + bk;
    } else {
      range_N[0] = range_n[0] + j;
      range_N[1] = range_n[0] + j + bk;
    }
    
    info = CNAME(args, NULL, range_N, sa, sb, 0);
    if (info) return info + j;
    
    if (n - j - bk > 0) {
      
      /* Copy the bk x bk diagonal block into sb for the solves below. */
      TRSM_IUNCOPY(bk, bk, a + (j + j * lda) * COMPSIZE, lda, 0, sb);
      
      /* Column tiles right of the diagonal block. */
      for(js = j + bk; js < n; js += REAL_GEMM_R) {
	min_j = n - js;
	if (min_j > REAL_GEMM_R) min_j = REAL_GEMM_R;
	
	/* Copy GEMM_UNROLL_N columns of the tile into sb2 and solve them
	   against the diagonal block, GEMM_P rows of it at a time. */
	for(jjs = js; jjs < js + min_j; jjs += GEMM_UNROLL_N){
	  min_jj = min_j + js - jjs;
	  if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
	  
	  GEMM_ONCOPY(bk, min_jj, a + (j + jjs * lda) * COMPSIZE, lda, sb2 + bk * (jjs - js) * COMPSIZE);
	  
	  for (is = 0; is < bk; is += GEMM_P) {
	    min_i = bk - is;
	    if (min_i > GEMM_P) min_i = GEMM_P;
	    
	    TRSM_KERNEL (min_i, min_jj, bk, dm1, 
#ifdef COMPLEX
			 ZERO,
#endif
			 sb + bk * is * COMPSIZE,
			 sb2 + bk * (jjs - js) * COMPSIZE,
			 a + (j + is + jjs * lda) * COMPSIZE, lda, is);
	  }
	}

	/* Trailing update for this tile, over strips from j + bk up to the
	   end of the tile. */
	for (is = j + bk; is < js + min_j; is += min_i) {
	  min_i = js + min_j - is;
      
	  /* Strip size: GEMM_P while at least 2*GEMM_P remain; about half of
	     the remainder (rounded up to GEMM_UNROLL_MN) when between GEMM_P
	     and 2*GEMM_P remain; otherwise everything that is left. */
	  if (min_i >= GEMM_P * 2) {
	    min_i = GEMM_P;
	  } else 
	    if (min_i > GEMM_P) {
	      min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
	    }
      
#ifdef SHARED_ARRAY
	  /* Reuse the copy already in sb2 when the strip lies entirely
	     inside the current tile; otherwise copy it to sa. */
	  if ((is >= js) && (is + min_i <= js + min_j)) {
	    aa = sb2 + bk * (is - js) * COMPSIZE;
	  } else {
	    GEMM_INCOPY(bk, min_i, a + (j + is * lda) * COMPSIZE, lda, sa);
	    aa = sa;
	  }
#else
	  GEMM_INCOPY(bk, min_i, a + (j + is * lda) * COMPSIZE, lda, sa);
#endif
	  
	  /* SA is not declared in this block; presumably a macro defined
	     elsewhere in the file that resolves to aa or sa depending on
	     SHARED_ARRAY -- TODO confirm. */
	  SYRK_KERNEL_U(min_i, min_j, bk,
			dm1, 
			SA, sb2,
			a + (is + js * lda) * COMPSIZE, lda,
			is - js);
	  
	}
      }
    }
   
  }
  
  return 0;
}