Beispiel #1
0
/*
 * Per-thread worker of the threaded level-3 GEMM driver.
 *
 * The thread identified by mypos handles rows [m_from, m_to) of C (taken
 * from range_m, or 0..M when range_m is NULL) and owns the column slice
 * [range_n[mypos], range_n[mypos + 1]).  For every chunk of the K
 * dimension it
 *   1. runs ICOPY_OPERATION on a row block of A into sa,
 *   2. runs OCOPY_OPERATION on its own column slice of B into sb (split
 *      into DIVIDE_RATE sub-buffers) and publishes each sub-buffer address
 *      in job[mypos].working[i][...] for every thread i,
 *   3. consumes the sub-buffers published by the other threads through
 *      job[current].working[mypos][...], and
 *   4. clears those flags once it no longer needs the buffer.
 *
 * args     problem description; args->common holds the shared job_t array
 * range_m  optional [from, to) row range
 * range_n  column partition, one entry per thread plus an end marker
 * sa, sb   scratch buffers for the copied A and B data
 * mypos    index of this thread inside the partition
 *
 * Always returns 0.
 *
 * NOTE(review): range_n is tested for NULL when the ranges are set up but
 * is dereferenced unconditionally in the consumer loops further down --
 * callers presumably always pass a partition; confirm.
 */
static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos){

  /* Start address of each of the DIVIDE_RATE sub-buffers carved out of sb. */
  FLOAT *buffer[DIVIDE_RATE];

  BLASLONG k, lda, ldb, ldc;
  /* n_from/n_to: this thread's column slice; N_from/N_to: span of all slices. */
  BLASLONG m_from, m_to, n_from, n_to, N_from, N_to;

  FLOAT *alpha, *beta;
  FLOAT *a, *b, *c;
  /* Shared per-thread flag table used for the buffer hand-off. */
  job_t *job = (job_t *)args -> common;
  BLASLONG xxx, bufferside;

  BLASLONG ls, min_l, jjs, min_jj;
  BLASLONG is, min_i, div_n;

  BLASLONG i, current;
  BLASLONG l1stride, l2size;

#ifdef TIMING
  /* Cycle counters accumulated by the START_RPCC/STOP_RPCC pairs below. */
  BLASULONG rpcc_counter;
  BLASULONG copy_A = 0;
  BLASULONG copy_B = 0;
  BLASULONG kernel = 0;
  BLASULONG waiting1 = 0;
  BLASULONG waiting2 = 0;
  BLASULONG waiting3 = 0;
  /* NOTE(review): waiting6 is zeroed here but not read anywhere in this function. */
  BLASULONG waiting6[MAX_CPU_NUMBER];
  BLASULONG ops    = 0;

  for (i = 0; i < args -> nthreads; i++) waiting6[i] = 0;
#endif

  k = K;

  a = (FLOAT *)A;
  b = (FLOAT *)B;
  c = (FLOAT *)C;

  lda = LDA;
  ldb = LDB;
  ldc = LDC;

  alpha = (FLOAT *)args -> alpha;
  beta  = (FLOAT *)args -> beta;

  m_from = 0;
  m_to   = M;

  if (range_m) {
    m_from = range_m[0];
    m_to   = range_m[1];
  }

  n_from = 0;
  n_to   = N;

  N_from = 0;
  N_to   = N;

  if (range_n) {
    n_from = range_n[mypos + 0];
    n_to   = range_n[mypos + 1];

    N_from = range_n[0];
    N_to   = range_n[args -> nthreads];
  }

  /* Scale this thread's rows of C across the full column span, unless beta is one. */
  if (beta) {
#ifndef COMPLEX
    if (beta[0] != ONE)
#else
    if ((beta[0] != ONE) || (beta[1] != ZERO))
#endif
      BETA_OPERATION(m_from, m_to, N_from, N_to, beta, c, ldc);
  }

  /* Nothing left to accumulate when K is empty or alpha is missing/zero. */
  if ((k == 0) || (alpha == NULL)) return 0;

  if ((alpha[0] == ZERO)
#ifdef COMPLEX
      && (alpha[1] == ZERO)
#endif
      ) return 0;

  /* NOTE(review): l2size is assigned but never read in this function. */
  l2size = GEMM_P * GEMM_Q;

#if 0
  fprintf(stderr, "Thread[%ld]  m_from : %ld m_to : %ld n_from : %ld n_to : %ld N_from : %ld N_to : %ld\n",
	  mypos, m_from, m_to, n_from, n_to, N_from, N_to);

  fprintf(stderr, "GEMM: P = %4ld  Q = %4ld  R = %4ld\n", (BLASLONG)GEMM_P, (BLASLONG)GEMM_Q, (BLASLONG)GEMM_R);

#endif

  /* Width of one sub-slice: own column count divided by DIVIDE_RATE, rounded up. */
  div_n = (n_to - n_from + DIVIDE_RATE - 1) / DIVIDE_RATE;
  
  /* Sub-buffers are laid out back to back; each holds GEMM_Q rows times
     div_n columns rounded up to a multiple of GEMM_UNROLL_N. */
  buffer[0] = sb;
  for (i = 1; i < DIVIDE_RATE; i++) {
    buffer[i] = buffer[i - 1] + GEMM_Q * ((div_n + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1)) * COMPSIZE;
  }
  

  /* Walk the K dimension in chunks of min_l. */
  for(ls = 0; ls < k; ls += min_l){

    min_l = k - ls;

    /* Take GEMM_Q when at least two such chunks remain, otherwise split the
       remainder in half if it is larger than GEMM_Q. */
    if (min_l >= GEMM_Q * 2) {
      min_l  = GEMM_Q;
    } else {
      if (min_l > GEMM_Q) min_l = (min_l + 1) / 2;
    }

    l1stride = 1;
    min_i = m_to - m_from;
    
    /* Same halving scheme for the first row block, rounded to GEMM_UNROLL_M. */
    if (min_i >= GEMM_P * 2) {
      min_i = GEMM_P;
    } else {
      if (min_i > GEMM_P) {
	min_i = (min_i / 2 + GEMM_UNROLL_M - 1) & ~(GEMM_UNROLL_M - 1);
      } else {
	/* With a single thread and one row block, every OCOPY block below is
	   written to the start of the sub-buffer (stride factor 0). */
	if (args -> nthreads == 1) l1stride = 0;
      }
    }

    START_RPCC();
    
    ICOPY_OPERATION(min_l, min_i, a, lda, ls, m_from, sa);
    
    STOP_RPCC(copy_A);
    
    /* div_n is overwritten by the consumer loops below, so restore the
       value that belongs to this thread's own slice. */
    div_n = (n_to - n_from + DIVIDE_RATE - 1) / DIVIDE_RATE;
    
    /* Producer phase: fill and publish each of this thread's sub-buffers. */
    for (xxx = n_from, bufferside = 0; xxx < n_to; xxx += div_n, bufferside ++) {
      
      START_RPCC();
      
      /* Spin until every thread has cleared its flag for this sub-buffer,
         i.e. nobody still reads the data from the previous K chunk. */
      for (i = 0; i < args -> nthreads; i++)
	while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;};
      
      STOP_RPCC(waiting1);
      
#if defined(FUSED_GEMM) && !defined(TIMING)

      FUSED_KERNEL_OPERATION(min_i, MIN(n_to, xxx + div_n) - xxx, min_l, alpha,
			     sa, buffer[bufferside], b, ldb, c, ldc, m_from, xxx, ls);

#else

      /* Copy B in strips of at most GEMM_UNROLL_N columns and apply the
         kernel to each strip right after it has been copied. */
      for(jjs = xxx; jjs < MIN(n_to, xxx + div_n); jjs += min_jj){
	min_jj = MIN(n_to, xxx + div_n) - jjs;
	if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
	
	START_RPCC();
	
	OCOPY_OPERATION(min_l, min_jj, b, ldb, ls, jjs, 
			buffer[bufferside] + min_l * (jjs - xxx) * COMPSIZE * l1stride);
	
	STOP_RPCC(copy_B);
	
	START_RPCC();
	
	KERNEL_OPERATION(min_i, min_jj, min_l, alpha,
			 sa, buffer[bufferside] + min_l * (jjs - xxx) * COMPSIZE * l1stride,
			 c, ldc, m_from, jjs);
	  
	STOP_RPCC(kernel);

#ifdef TIMING
	  ops += 2 * min_i * min_jj * min_l;
#endif

      }
#endif
	
      /* Publish the sub-buffer: the non-zero pointer value doubles as the
         "ready" flag for every thread (including this one). */
      for (i = 0; i < args -> nthreads; i++) job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside];
      WMB;
    }

    /* Consumer phase for the first row block: visit the threads in ring
       order starting after mypos and finishing with mypos itself. */
    current = mypos;
    
    do {
      current ++;
      if (current >= args -> nthreads) current = 0;
      
      div_n = (range_n[current + 1]  - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE;
      
      for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) {
	
	/* The own slice was already multiplied in the producer phase. */
	if (current != mypos) {
	  
	  START_RPCC();
	  
	  /* thread has to wait */
	  while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;};
	  
	  STOP_RPCC(waiting2);
	    
	  START_RPCC();

	  KERNEL_OPERATION(min_i, MIN(range_n[current + 1]  - xxx,  div_n), min_l, alpha,
			   sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside],
			   c, ldc, m_from, xxx);

	STOP_RPCC(kernel);
#ifdef TIMING
	  ops += 2 * min_i * MIN(range_n[current + 1]  - xxx,  div_n) * min_l;
#endif
	}
	
	/* If the first row block already covers all rows there is no second
	   pass, so the buffer can be released right away.
	   NOTE(review): unlike the release in the row-block loop below, this
	   clear is not followed by WMB -- confirm that is intended. */
	if (m_to - m_from == min_i) {
	  job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
	}
      }
    } while (current != mypos);
    

    /* Remaining row blocks: recopy A and reuse every published sub-buffer. */
    for(is = m_from + min_i; is < m_to; is += min_i){
      min_i = m_to - is;

      if (min_i >= GEMM_P * 2) {
	min_i = GEMM_P;
      } else 
	if (min_i > GEMM_P) {
	  min_i = ((min_i + 1) / 2 + GEMM_UNROLL_M - 1) & ~(GEMM_UNROLL_M - 1);
	}
      
      START_RPCC();
      
      ICOPY_OPERATION(min_l, min_i, a, lda, ls, is, sa);
      
      STOP_RPCC(copy_A);
      
      /* Start with the own slice, then continue around the ring. */
      current = mypos;
      do {
	
	div_n = (range_n[current + 1]  - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE;
	
	for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) {
	  
	  START_RPCC();
	  
	  KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), min_l, alpha,
			   sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside],
			   c, ldc, is, xxx);
	  
	STOP_RPCC(kernel);

#ifdef TIMING
	ops += 2 * min_i * MIN(range_n[current + 1]  - xxx, div_n) * min_l;
#endif

	/* Release the buffer after the last row block has used it. */
	if (is + min_i >= m_to) {
	  /* Thread doesn't need this buffer any more */
	  job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
	  WMB;
	}
	}
	
	current ++;
	if (current >= args -> nthreads) current = 0;
	
      } while (current != mypos);
      
    }
    
  }
  
  START_RPCC();

  /* Do not return before every thread has released all of this thread's
     sub-buffers. */
  for (i = 0; i < args -> nthreads; i++) {
    for (xxx = 0; xxx < DIVIDE_RATE; xxx++) {
      while (job[mypos].working[i][CACHE_LINE_SIZE * xxx] ) {YIELDING;};
    }
  }

  STOP_RPCC(waiting3);

#ifdef TIMING
  /* Report each phase as a percentage of the summed cycle counters. */
  BLASLONG waiting = waiting1 + waiting2 + waiting3;
  BLASLONG total = copy_A + copy_B + kernel + waiting;

  fprintf(stderr, "GEMM   [%2ld] Copy_A : %6.2f  Copy_B : %6.2f  Wait1 : %6.2f Wait2 : %6.2f Wait3 : %6.2f Kernel : %6.2f",
	  mypos, (double)copy_A /(double)total * 100., (double)copy_B /(double)total * 100.,
	  (double)waiting1 /(double)total * 100.,
	  (double)waiting2 /(double)total * 100.,
	  (double)waiting3 /(double)total * 100.,
	  (double)ops/(double)kernel / 4. * 100.);

#if 0
  fprintf(stderr, "GEMM   [%2ld] Copy_A : %6.2ld  Copy_B : %6.2ld  Wait : %6.2ld\n",
	  mypos, copy_A, copy_B, waiting);

  fprintf(stderr, "Waiting[%2ld] %6.2f %6.2f %6.2f\n",
	  mypos,
	  (double)waiting1/(double)waiting * 100.,
	  (double)waiting2/(double)waiting * 100.,
	  (double)waiting3/(double)waiting * 100.);
#endif
  fprintf(stderr, "\n");
#endif

  return 0;
}
Beispiel #2
0
/*
 * Single-threaded GEMM3M driver.
 *
 * Scales the [m_from, m_to) x [n_from, n_to) part of C by beta and then,
 * for every GEMM3M_R-wide column panel and every chunk of the K dimension,
 * performs three copy + kernel passes:
 *   pass 1: ICOPYB / OCOPYB            with ALPHA5,  ALPHA6
 *   pass 2: ICOPYR / OCOPYR or OCOPYI  with ALPHA11, ALPHA12
 *   pass 3: ICOPYI / OCOPYI or OCOPYR  with ALPHA17, ALPHA18
 * Which OCOPY variant and which sign of alpha[1] is used depends on the
 * transpose/conjugate configuration macros (NN, NT, ..., CC).
 * NOTE(review): the B/R/I suffixes presumably select combined, real and
 * imaginary parts as in the 3M complex-multiply scheme -- the copy macros
 * are defined outside this function; confirm there.
 *
 * args     problem description (read through the K/A/B/C/LDx/M/N macros)
 * range_m  optional [from, to) row range, NULL for all rows
 * range_n  optional [from, to) column range, NULL for all columns
 * sa, sb   scratch buffers for the copied A and B data
 * dummy    unused
 *
 * Always returns 0.
 */
int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
		  FLOAT *sa, FLOAT *sb, BLASLONG dummy){
  BLASLONG k, lda, ldb, ldc;
  FLOAT *alpha, *beta;
  FLOAT *a, *b, *c;
  BLASLONG m_from, m_to, n_from, n_to;

  BLASLONG ls, is, js, jjs;
  BLASLONG min_l, min_i, min_j, min_jj;

#ifdef TIMING
  /* Cycle counters accumulated by the START_RPCC/STOP_RPCC pairs below.
     They were previously declared as "BLASULONG BLASLONG x", which is not
     a valid declaration and broke every TIMING build. */
  BLASULONG rpcc_counter;
  BLASULONG innercost  = 0;
  BLASULONG outercost  = 0;
  BLASULONG kernelcost = 0;
  double total;
#endif

  k = K;

  a = (FLOAT *)A;
  b = (FLOAT *)B;
  c = (FLOAT *)C;

  lda = LDA;
  ldb = LDB;
  ldc = LDC;

  alpha = (FLOAT *)args -> alpha;
  beta  = (FLOAT *)args -> beta;

  m_from = 0;
  m_to   = M;

  if (range_m) {
    m_from = *(((BLASLONG *)range_m) + 0);
    m_to   = *(((BLASLONG *)range_m) + 1);
  }

  n_from = 0;
  n_to   = N;

  if (range_n) {
    n_from = *(((BLASLONG *)range_n) + 0);
    n_to   = *(((BLASLONG *)range_n) + 1);
  }

  /* Scale C by beta unless beta is exactly one. */
  if (beta) {
#ifndef COMPLEX
    if (beta[0] != ONE)
#else
    if ((beta[0] != ONE) || (beta[1] != ZERO))
#endif
      BETA_OPERATION(m_from, m_to, n_from, n_to, beta, c, ldc);
  }

  /* Nothing left to accumulate when K is empty or alpha is missing/zero. */
  if ((k == 0) || (alpha == NULL)) return 0;

  if ((alpha[0] == ZERO)
#ifdef COMPLEX
      && (alpha[1] == ZERO)
#endif
      ) return 0;

#if 0
  printf("GEMM: M_from : %ld  M_to : %ld  N_from : %ld  N_to : %ld  k : %ld\n", m_from, m_to, n_from, n_to, k);
  printf("GEMM: P = %4ld  Q = %4ld  R = %4ld\n", (BLASLONG)GEMM3M_P, (BLASLONG)GEMM3M_Q, (BLASLONG)GEMM3M_R);
  printf("GEMM: SA .. %p  SB .. %p\n", sa, sb);
#endif

  /* The timing counters are zero-initialised at their declaration.  The
     former "#ifdef DEBUG" reset was removed: it referenced variables that
     only exist under TIMING, so DEBUG without TIMING did not compile. */

  /* Outer loop: column panels of at most GEMM3M_R columns. */
  for(js = n_from; js < n_to; js += GEMM3M_R){
    min_j = n_to - js;
    if (min_j > GEMM3M_R) min_j = GEMM3M_R;
    
    /* Middle loop: chunks of the K dimension. */
    for(ls = 0; ls < k; ls += min_l){
      min_l = k - ls;
      
      /* Take GEMM3M_Q when at least two such chunks remain, otherwise split
         the remainder in half if it is larger than GEMM3M_Q. */
      if (min_l >= GEMM3M_Q * 2) {
	min_l = GEMM3M_Q;
      } else {
	if (min_l > GEMM3M_Q) {
	  min_l = (min_l + 1) / 2;
#ifdef UNROLL_X
	  min_l = (min_l + UNROLL_X - 1) & ~(UNROLL_X - 1);
#endif
	}
      }
      
      /* ---- Pass 1: ICOPYB / OCOPYB, ALPHA5 / ALPHA6 ---- */

      /* First row block, sized with the same halving scheme. */
      min_i = m_to - m_from;
      if (min_i >= GEMM3M_P * 2) {
	min_i = GEMM3M_P;
      } else {
	if (min_i > GEMM3M_P) {
	  min_i = (min_i / 2 + GEMM3M_UNROLL_M - 1) & ~(GEMM3M_UNROLL_M - 1);
	}
      }

      START_RPCC();
      
      ICOPYB_OPERATION(min_l, min_i, a, lda, ls, m_from, sa);
      
      STOP_RPCC(innercost);
      
      /* Copy B strip by strip (at most GEMM3M_UNROLL_N columns each) and
         apply the kernel to the first row block as soon as a strip is ready. */
      for(jjs = js; jjs < js + min_j; jjs += min_jj){
	min_jj = min_j + js - jjs;
	if (min_jj > GEMM3M_UNROLL_N) min_jj = GEMM3M_UNROLL_N;
	
	START_RPCC();
	
#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || defined(RN) || defined(RT) || defined(CN) || defined(CT)
	OCOPYB_OPERATION(min_l, min_jj, b, ldb, alpha[0],  alpha[1], ls, jjs, sb + min_l * (jjs - js));
#else
	OCOPYB_OPERATION(min_l, min_jj, b, ldb, alpha[0], -alpha[1], ls, jjs, sb + min_l * (jjs - js));
#endif
	
	STOP_RPCC(outercost);
	
	START_RPCC();
	
	KERNEL_OPERATION(min_i, min_jj, min_l, ALPHA5, ALPHA6,
			 sa, sb + min_l * (jjs - js), c, ldc, m_from, jjs);
	
	STOP_RPCC(kernelcost);
	
      }      
      
      /* Remaining row blocks reuse the complete copied panel in sb. */
      for(is = m_from + min_i; is < m_to; is += min_i){
	min_i = m_to - is;
	if (min_i >= GEMM3M_P * 2) {
	  min_i = GEMM3M_P;
	} else 
	  if (min_i > GEMM3M_P) {
	    min_i = (min_i / 2 + GEMM3M_UNROLL_M - 1) & ~(GEMM3M_UNROLL_M - 1);
	  }
	
	START_RPCC();
	
	ICOPYB_OPERATION(min_l, min_i, a, lda, ls, is, sa);
	
	STOP_RPCC(innercost);
	
	START_RPCC();
	
	KERNEL_OPERATION(min_i, min_j, min_l, ALPHA5, ALPHA6, sa, sb, c, ldc, is, js);
	
	STOP_RPCC(kernelcost);
      }

      /* ---- Pass 2: ICOPYR / OCOPYR or OCOPYI, ALPHA11 / ALPHA12 ---- */

      min_i = m_to - m_from;
      if (min_i >= GEMM3M_P * 2) {
	min_i = GEMM3M_P;
      } else {
	if (min_i > GEMM3M_P) {
	  min_i = (min_i / 2 + GEMM3M_UNROLL_M - 1) & ~(GEMM3M_UNROLL_M - 1);
	}
      }
      
      START_RPCC();
      
      ICOPYR_OPERATION(min_l, min_i, a, lda, ls, m_from, sa);
      
      STOP_RPCC(innercost);
      
      for(jjs = js; jjs < js + min_j; jjs += min_jj){
	min_jj = min_j + js - jjs;
	if (min_jj > GEMM3M_UNROLL_N) min_jj = GEMM3M_UNROLL_N;
	
	START_RPCC();
	
#if   defined(NN) || defined(NT) || defined(TN) || defined(TT)
	OCOPYR_OPERATION(min_l, min_jj, b, ldb, alpha[0],  alpha[1], ls, jjs, sb + min_l * (jjs - js));
#elif defined(RR) || defined(RC) || defined(CR) || defined(CC)
	OCOPYR_OPERATION(min_l, min_jj, b, ldb, alpha[0], -alpha[1], ls, jjs, sb + min_l * (jjs - js));
#elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
	OCOPYI_OPERATION(min_l, min_jj, b, ldb, alpha[0],  alpha[1], ls, jjs, sb + min_l * (jjs - js));
#else
	OCOPYI_OPERATION(min_l, min_jj, b, ldb, alpha[0], -alpha[1], ls, jjs, sb + min_l * (jjs - js));
#endif

	STOP_RPCC(outercost);
	
	START_RPCC();
	
	KERNEL_OPERATION(min_i, min_jj, min_l, ALPHA11, ALPHA12,
			 sa, sb + min_l * (jjs - js), c, ldc, m_from, jjs);
	
	STOP_RPCC(kernelcost);
	
      }      
      
      for(is = m_from + min_i; is < m_to; is += min_i){
	min_i = m_to - is;
	if (min_i >= GEMM3M_P * 2) {
	  min_i = GEMM3M_P;
	} else 
	  if (min_i > GEMM3M_P) {
	    min_i = (min_i / 2 + GEMM3M_UNROLL_M - 1) & ~(GEMM3M_UNROLL_M - 1);
	  }
	
	START_RPCC();
	
	ICOPYR_OPERATION(min_l, min_i, a, lda, ls, is, sa);
	
	STOP_RPCC(innercost);
	
	START_RPCC();
	
	KERNEL_OPERATION(min_i, min_j, min_l, ALPHA11, ALPHA12, sa, sb, c, ldc, is, js);
	
	STOP_RPCC(kernelcost);
	
      }

      /* ---- Pass 3: ICOPYI / OCOPYI or OCOPYR, ALPHA17 / ALPHA18 ---- */

      min_i = m_to - m_from;
      if (min_i >= GEMM3M_P * 2) {
	min_i = GEMM3M_P;
      } else {
	if (min_i > GEMM3M_P) {
	  min_i = (min_i / 2 + GEMM3M_UNROLL_M - 1) & ~(GEMM3M_UNROLL_M - 1);
	}
      }
      
      START_RPCC();
      
      ICOPYI_OPERATION(min_l, min_i, a, lda, ls, m_from, sa);
      
      STOP_RPCC(innercost);
      
      for(jjs = js; jjs < js + min_j; jjs += min_jj){
	min_jj = min_j + js - jjs;
	if (min_jj > GEMM3M_UNROLL_N) min_jj = GEMM3M_UNROLL_N;
	
	START_RPCC();
	
#if   defined(NN) || defined(NT) || defined(TN) || defined(TT) 
	OCOPYI_OPERATION(min_l, min_jj, b, ldb, alpha[0],  alpha[1], ls, jjs, sb + min_l * (jjs - js));
#elif defined(RR) || defined(RC) || defined(CR) || defined(CC)
	OCOPYI_OPERATION(min_l, min_jj, b, ldb, alpha[0], -alpha[1], ls, jjs, sb + min_l * (jjs - js));
#elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
	OCOPYR_OPERATION(min_l, min_jj, b, ldb, alpha[0],  alpha[1], ls, jjs, sb + min_l * (jjs - js));
#else
	OCOPYR_OPERATION(min_l, min_jj, b, ldb, alpha[0], -alpha[1], ls, jjs, sb + min_l * (jjs - js));
#endif

	STOP_RPCC(outercost);
	
	START_RPCC();
	
	KERNEL_OPERATION(min_i, min_jj, min_l, ALPHA17, ALPHA18,
			 sa, sb + min_l * (jjs - js), c, ldc, m_from, jjs);
	
	STOP_RPCC(kernelcost);
	
      }      
      
      for(is = m_from + min_i; is < m_to; is += min_i){
	min_i = m_to - is;
	if (min_i >= GEMM3M_P * 2) {
	  min_i = GEMM3M_P;
	} else 
	  if (min_i > GEMM3M_P) {
	    min_i = (min_i / 2 + GEMM3M_UNROLL_M - 1) & ~(GEMM3M_UNROLL_M - 1);
	  }
	
	START_RPCC();
	
	ICOPYI_OPERATION(min_l, min_i, a, lda, ls, is, sa);
	
	STOP_RPCC(innercost);
	
	START_RPCC();
	
	KERNEL_OPERATION(min_i, min_j, min_l, ALPHA17, ALPHA18, sa, sb, c, ldc, is, js);
	
	STOP_RPCC(kernelcost);
	
      }

    } /* end of ls */
  } /* end of js */
  

#ifdef TIMING
  /* Report the share of each phase in the summed cycle counters. */
  total = (double)outercost + (double)innercost + (double)kernelcost;

  printf( "Copy A : %5.2f Copy  B: %5.2f  Kernel : %5.2f\n",
	   innercost / total * 100., outercost / total * 100.,
	  kernelcost / total * 100.);

  printf( " Total %10.3f%%  %10.3f MFlops\n",
	  ((double)(m_to - m_from) * (double)(n_to - n_from) * (double)k) / (double)kernelcost / 2 * 100,
	  2400. * (2. * (double)(m_to - m_from) * (double)(n_to - n_from) * (double)k) / (double)kernelcost);
#endif
  
  return 0;
}
Beispiel #3
0
/*
 * Single-threaded blocked GEMM driver for one sgemmargs request.
 *
 * C is first scaled by beta (skipped when beta == 1).  Unless k == 0 or
 * alpha == 0, the routine then walks C in column panels of GEMM_R, the K
 * dimension in chunks derived from GEMM_Q and the rows in blocks derived
 * from GEMM_P.  Each row block of A is copied into saa[mypos] and each
 * column strip of B into sba[mypos] before KERNEL_OPERATION is applied.
 *
 * mypos  selects the saa/sba scratch buffers used by this call
 * args   dimensions, scalars, matrix pointers and leading dimensions
 *
 * Always returns 0.
 */
static int gemm_single(int mypos, struct sgemmargs *args)
{
	long m_from, m_to, n_from, n_to;
	long js, ls, is, jjs;
	long min_j, min_l, min_i, min_jj;
	long l1stride, gemm_p, l2size;

	float *sa = saa[mypos];
	float *sb = sba[mypos];

	char  transa = args->transa;
	long  m      = args->m;
	long  n      = args->n;
	long  k      = args->k;
	float alpha  = args->alpha;
	float beta   = args->beta;
	float *a     = args->a;
	float *b     = args->b;
	float *c     = args->c;
	long  lda    = args->lda;
	long  ldb    = args->ldb;
	long  ldc    = args->ldc;

#ifdef TIMING
	/* Cycle counters fed by the START_RPCC/STOP_RPCC pairs. */
	unsigned long long rpcc_counter;
	unsigned long long innercost  = 0;
	unsigned long long outercost  = 0;
	unsigned long long kernelcost = 0;
	double total;
#endif

	/* The whole matrix is processed: no sub-ranges in the single-thread path. */
	m_from = 0;
	n_from = 0;
	m_to   = m;
	n_to   = n;

	if (beta != 1)
		BETA_OPERATION(m_from, m_to, n_from, n_to, beta, c, ldc);

	/* An empty K dimension or a zero alpha leaves nothing to accumulate. */
	if (k == 0 || alpha == 0)
		return 0;

	l2size = GEMM_P * GEMM_Q;

#if 0
	fprintf(stderr, "GEMM(Single): M_from : %ld  M_to : %ld  N_from : %ld  N_to : %ld  k : %ld\n", m_from, m_to, n_from, n_to, k);
	fprintf(stderr, "GEMM(Single):: P = %4ld  Q = %4ld  R = %4ld\n", (long)GEMM_P, (long)GEMM_Q, (long)GEMM_R);
	//  fprintf(stderr, "GEMM: SA .. %p  SB .. %p\n", sa, sb);

	//  fprintf(stderr, "A = %p  B = %p  C = %p\n\tlda = %ld  ldb = %ld ldc = %ld\n", a, b, c, lda, ldb, ldc);
#endif

	/* Column panels of C. */
	for (js = n_from; js < n_to; js += GEMM_R) {
		min_j = n_to - js;
		if (min_j > GEMM_R)
			min_j = GEMM_R;

		/* Chunks of the K dimension. */
		for (ls = 0; ls < k; ls += min_l) {
			min_l = k - ls;

			if (min_l < GEMM_Q * 2) {
				/* Fewer than two full chunks left: halve an oversized
				 * remainder and derive the matching row-block hint. */
				if (min_l > GEMM_Q)
					min_l = (min_l / 2 + GEMM_UNROLL_M - 1) & ~(GEMM_UNROLL_M - 1);

				gemm_p = ((l2size / min_l + GEMM_UNROLL_M - 1) & ~(GEMM_UNROLL_M - 1));
				while (gemm_p * min_l > l2size)
					gemm_p -= GEMM_UNROLL_M;
			} else {
				gemm_p = GEMM_P;
				min_l  = GEMM_Q;
			}

			/* Size of the first row block.  When all rows fit into one
			 * block, l1stride becomes 0 and every B strip is copied to
			 * the start of sb. */
			l1stride = 1;
			min_i    = m_to - m_from;
			if (min_i >= GEMM_P * 2) {
				min_i = GEMM_P;
			} else if (min_i > GEMM_P) {
				min_i = (min_i / 2 + GEMM_UNROLL_M - 1) & ~(GEMM_UNROLL_M - 1);
			} else {
				l1stride = 0;
			}

			/* Stage the first row block of A. */
			START_RPCC();
			if (transa)
				ICOPYT_OPERATION(min_l, min_i, a, lda, ls, m_from, sa);
			else if (args->ks0)
				icopy_operation(min_l, min_i, args, ls, m_from, sa);
			else
				ICOPY_OPERATION(min_l, min_i, a, lda, ls, m_from, sa);
			STOP_RPCC(innercost);

			/* Stage B strip by strip and multiply it with the first row
			 * block right away. */
			for (jjs = js; jjs < js + min_j; jjs += min_jj) {
				float *sb_strip = sb + min_l * (jjs - js) * l1stride;

				min_jj = min_j + js - jjs;
				if (min_jj >= 3*GEMM_UNROLL_N) {
					min_jj = 3*GEMM_UNROLL_N;
				} else if (min_jj > GEMM_UNROLL_N) {
					min_jj = GEMM_UNROLL_N;
				}

				START_RPCC();
				OCOPY_OPERATION(min_l, min_jj, b, ldb, ls, jjs, sb_strip);
				STOP_RPCC(outercost);

				START_RPCC();
				KERNEL_OPERATION(min_i, min_jj, min_l, alpha, sa,
					sb_strip, c, ldc, m_from, jjs);
				STOP_RPCC(kernelcost);
			}

			/* Remaining row blocks reuse the whole staged panel of B. */
			for (is = m_from + min_i; is < m_to; is += min_i) {
				min_i = m_to - is;
				if (min_i >= GEMM_P * 2) {
					min_i = GEMM_P;
				} else if (min_i > GEMM_P) {
					min_i = (min_i / 2 + GEMM_UNROLL_M - 1) & ~(GEMM_UNROLL_M - 1);
				}

				START_RPCC();
				if (transa)
					ICOPYT_OPERATION(min_l, min_i, a, lda, ls, is, sa);
				else if (args->ks0)
					icopy_operation(min_l, min_i, args, ls, is, sa);
				else
					ICOPY_OPERATION(min_l, min_i, a, lda, ls, is, sa);
				STOP_RPCC(innercost);

				START_RPCC();
				KERNEL_OPERATION(min_i, min_j, min_l, alpha, sa, sb, c, ldc, is, js);
				STOP_RPCC(kernelcost);
			} /* end of is */
		} /* end of ls */
	} /* end of js */

#ifdef TIMING
	total = (double)outercost + (double)innercost + (double)kernelcost;
	printf( "Copy A : %5.2f Copy  B: %5.2f  Kernel : %5.2f  kernel Effi. : %5.2f Total Effi. : %5.2f\n",
		innercost / total * 100., outercost / total * 100.,
		kernelcost / total * 100.,
		(double)(m_to - m_from) * (double)(n_to - n_from) * (double)k / (double)kernelcost * 100.  / 2.,
		(double)(m_to - m_from) * (double)(n_to - n_from) * (double)k / total * 100. / 2.);
#endif

	return 0;
}
/*
 * Per-thread worker of the threaded SYRK/HERK driver.
 *
 * Both operands of the kernel come from the same matrix a: ICOPY_OPERATION
 * copies a row block into sa and OCOPY_OPERATION copies this thread's own
 * slice into sb.  The thread owns the index range
 * [range_n[mypos], range_n[mypos + 1]) (held in m_from/m_to), while
 * n_from/n_to span all threads' ranges and are only used for the beta
 * scaling.  Buffers are handed between threads through the flag table
 * job[...].working[...][...]:
 *   - without LOWER a thread publishes to threads 0..mypos and consumes
 *     from threads mypos+1..nthreads-1,
 *   - with LOWER it publishes to threads mypos..nthreads-1 and consumes
 *     from threads mypos-1..0.
 *
 * args     problem description; args->common holds the shared job_t array
 * range_m  not used by this function
 * range_n  partition, one entry per thread plus an end marker
 * sa, sb   scratch buffers for the two copied operands
 * mypos    index of this thread inside the partition
 *
 * Always returns 0.
 *
 * NOTE(review): range_n is tested for NULL when the ranges are set up but
 * is dereferenced unconditionally in the consumer loops further down --
 * callers presumably always pass a partition; confirm.
 */
static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos){

  /* Start address of each of the DIVIDE_RATE sub-buffers carved out of sb. */
  FLOAT *buffer[DIVIDE_RATE];

  BLASLONG k, lda, ldc;
  BLASLONG m_from, m_to, n_from, n_to;

  FLOAT *alpha, *beta;
  FLOAT *a, *c;
  /* Shared per-thread flag table used for the buffer hand-off. */
  job_t *job = (job_t *)args -> common;
  BLASLONG xxx, bufferside;

  BLASLONG ls, min_l, jjs, min_jj;
  BLASLONG is, min_i, div_n;

  BLASLONG i, current;
#ifdef LOWER
  /* Height of the block handled in the first pass (the last rows of the range). */
  BLASLONG start_i;
#endif

#ifdef TIMING
  /* Cycle counters accumulated by the START_RPCC/STOP_RPCC pairs below. */
  BLASLONG rpcc_counter;
  BLASLONG copy_A = 0;
  BLASLONG copy_B = 0;
  BLASLONG kernel = 0;
  BLASLONG waiting1 = 0;
  BLASLONG waiting2 = 0;
  BLASLONG waiting3 = 0;
  /* NOTE(review): waiting6 is zeroed here but not read anywhere in this function. */
  BLASLONG waiting6[MAX_CPU_NUMBER];
  BLASLONG ops    = 0;

  for (i = 0; i < args -> nthreads; i++) waiting6[i] = 0;
#endif

  k = K;

  a = (FLOAT *)A;
  c = (FLOAT *)C;

  lda = LDA;
  ldc = LDC;

  alpha = (FLOAT *)args -> alpha;
  beta  = (FLOAT *)args -> beta;

  m_from = 0;
  m_to   = N;

  /* Global Range */
  n_from = 0;
  n_to   = N;

  if (range_n) {
    m_from = range_n[mypos + 0];
    m_to   = range_n[mypos + 1];

    n_from = range_n[0];
    n_to   = range_n[args -> nthreads];
  }

  /* Scale C unless beta is one; for HERK only beta[0] is examined. */
  if (beta) {
#if !defined(COMPLEX) || defined(HERK)
    if (beta[0] != ONE)
#else
    if ((beta[0] != ONE) || (beta[1] != ZERO))
#endif
      syrk_beta(m_from, m_to, n_from, n_to, beta, c, ldc);
  }

  /* Nothing left to accumulate when K is empty or alpha is missing/zero. */
  if ((k == 0) || (alpha == NULL)) return 0;

  if ((alpha[0] == ZERO)
#if defined(COMPLEX) && !defined(HERK)
      && (alpha[1] == ZERO)
#endif
      ) return 0;

#if 0
  fprintf(stderr, "Thread[%ld]  m_from : %ld m_to : %ld n_from : %ld n_to : %ld\n",  mypos, m_from, m_to, n_from, n_to);
#endif

  /* Width of one sub-slice: own range divided by DIVIDE_RATE (rounded up),
     then rounded up to a multiple of GEMM_UNROLL_MN. */
  div_n = ((m_to - m_from + DIVIDE_RATE - 1) / DIVIDE_RATE
	                            + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);

  /* Sub-buffers are laid out back to back inside sb. */
  buffer[0] = sb;
  for (i = 1; i < DIVIDE_RATE; i++) {
    buffer[i] = buffer[i - 1] + GEMM_Q * div_n * COMPSIZE;
  }
  
  /* Walk the K dimension in chunks of min_l. */
  for(ls = 0; ls < k; ls += min_l){

    min_l = k - ls;
    if (min_l >= GEMM_Q * 2) {
      min_l  = GEMM_Q;
    } else {
      if (min_l > GEMM_Q) min_l = (min_l + 1) / 2;
    }

    /* Size of the block handled in the first pass. */
    min_i = m_to - m_from;
    
    if (min_i >= GEMM_P * 2) {
      min_i = GEMM_P;
    } else {
      if (min_i > GEMM_P) {
	min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
      }
    }

#ifdef LOWER
    /* Shrink the first block so that the rows left for the later passes
       are a multiple of GEMM_P (xxx is only a temporary here). */
    xxx = (m_to - m_from - min_i) % GEMM_P;

    if (xxx) min_i -= GEMM_P - xxx;
#endif

    START_RPCC();
    
    /* Without LOWER the first block starts at m_from, with LOWER it is the
       last min_i rows of the range. */
#ifndef LOWER
    ICOPY_OPERATION(min_l, min_i, a, lda, ls, m_from, sa);
#else
    ICOPY_OPERATION(min_l, min_i, a, lda, ls, m_to - min_i, sa);
#endif
    
    STOP_RPCC(copy_A);
    
    /* div_n is overwritten by the consumer loops below, so restore the
       value that belongs to this thread's own range. */
    div_n = ((m_to - m_from + DIVIDE_RATE - 1) / DIVIDE_RATE
	                              + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
    
    /* Producer phase: fill and publish each of this thread's sub-buffers. */
    for (xxx = m_from, bufferside = 0; xxx < m_to; xxx += div_n, bufferside ++) {
      
      START_RPCC();
      
      /* Make sure if no one is using buffer */
#ifndef LOWER
      for (i = 0; i < mypos; i++)
#else
      for (i = mypos + 1; i < args -> nthreads; i++)
#endif
	while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;};
      
      STOP_RPCC(waiting1);
      
#ifndef LOWER

      for(jjs = xxx; jjs < MIN(m_to, xxx + div_n); jjs += min_jj){

	min_jj = MIN(m_to, xxx + div_n) - jjs;

	/* In the first sub-slice a strip may be as wide as the row block;
	   later sub-slices use strips of at most GEMM_UNROLL_MN. */
	if (xxx == m_from) {
	  if (min_jj > min_i) min_jj = min_i;
	} else {
	  if (min_jj > GEMM_UNROLL_MN) min_jj = GEMM_UNROLL_MN;
	}
	
	START_RPCC();
	
	OCOPY_OPERATION(min_l, min_jj, a, lda, ls, jjs, 
			buffer[bufferside] + min_l * (jjs - xxx) * COMPSIZE);
	
	STOP_RPCC(copy_B);
	
	START_RPCC();
	
	KERNEL_OPERATION(min_i, min_jj, min_l, alpha,
			 sa, buffer[bufferside] + min_l * (jjs - xxx) * COMPSIZE,
			 c, ldc, m_from, jjs);

	STOP_RPCC(kernel);

#ifdef TIMING
	  ops += 2 * min_i * min_jj * min_l;
#endif

      }

#else

      for(jjs = xxx; jjs < MIN(m_to, xxx + div_n); jjs += min_jj){

	min_jj = MIN(m_to, xxx + div_n) - jjs;

	if (min_jj > GEMM_UNROLL_MN) min_jj = GEMM_UNROLL_MN;
	
	START_RPCC();
	
	OCOPY_OPERATION(min_l, min_jj, a, lda, ls, jjs, 
			buffer[bufferside] + min_l * (jjs - xxx) * COMPSIZE);
	
	STOP_RPCC(copy_B);
	
	START_RPCC();
	
	KERNEL_OPERATION(min_i, min_jj, min_l, alpha,
			 sa, buffer[bufferside] + min_l * (jjs - xxx) * COMPSIZE,
			 c, ldc, m_to - min_i, jjs);
	  
	STOP_RPCC(kernel);

#ifdef TIMING
	  ops += 2 * min_i * min_jj * min_l;
#endif

      }

#endif
	
      /* Publish the sub-buffer: the non-zero pointer value doubles as the
         "ready" flag for this thread and the threads that will consume it. */
#ifndef LOWER
      for (i = 0; i <= mypos; i++)
#else
      for (i = mypos; i < args -> nthreads; i++)
#endif
	job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside];

      WMB;
    }

    
    /* Consumer phase for the first block: use the buffers of the threads
       above (no LOWER) or below (LOWER) this one.  The loop braces are
       shared between both preprocessor branches. */
#ifndef LOWER
    current = mypos + 1;
    while (current < args -> nthreads) {
#else
    current = mypos - 1;
    while (current >= 0) {
#endif

	div_n = ((range_n[current + 1]  - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE
		 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
	
	for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) {
	  
	  START_RPCC();
	  
	  /* thread has to wait */
	  while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;};
	  
	  STOP_RPCC(waiting2);
	  
	  START_RPCC();
	  
#ifndef LOWER
	  KERNEL_OPERATION(min_i, MIN(range_n[current + 1]  - xxx,  div_n), min_l, alpha,
			   sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside],
			   c, ldc, 
			   m_from,
			   xxx);
#else
	  KERNEL_OPERATION(min_i, MIN(range_n[current + 1]  - xxx,  div_n), min_l, alpha,
			   sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside],
			   c, ldc, 
			   m_to - min_i,
			   xxx);
#endif
	  
	  STOP_RPCC(kernel);
#ifdef TIMING
	  ops += 2 * min_i * MIN(range_n[current + 1]  - xxx,  div_n) * min_l;
#endif
	  
	  /* If the first block already covers the whole range there is no
	     further pass, so the buffer can be released right away.
	     NOTE(review): unlike the release in the block loop below, this
	     clear is not followed by WMB -- confirm that is intended. */
	  if (m_to - m_from == min_i) {
	    job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
	  }
	}
	
#ifndef LOWER
	current ++;
#else
	current --;
#endif
    }

    /* Remaining blocks: everything after the first block (no LOWER) or
       everything before the last start_i rows (LOWER). */
#ifndef LOWER
    for(is = m_from + min_i; is < m_to; is += min_i){
      min_i = m_to - is;
#else
    start_i = min_i;

    for(is = m_from; is < m_to - start_i; is += min_i){
      min_i = m_to - start_i - is;
#endif

      if (min_i >= GEMM_P * 2) {
	min_i = GEMM_P;
      } else 
	if (min_i > GEMM_P) {
	  min_i = ((min_i + 1) / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
	}

      START_RPCC();
      
      ICOPY_OPERATION(min_l, min_i, a, lda, ls, is, sa);
      
      STOP_RPCC(copy_A);
      
      /* Start with the own buffers, then move up (no LOWER) or down
         (LOWER) through the other threads. */
      current = mypos;

      do {
	
	div_n = ((range_n[current + 1]  - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE
		                                                     + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
	
	for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) {
	  
	  START_RPCC();

	  KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), min_l, alpha,
			   sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside],
			   c, ldc, is, xxx);
	  
	  STOP_RPCC(kernel);

#ifdef TIMING
	  ops += 2 * min_i * MIN(range_n[current + 1]  - xxx, div_n) * min_l;
#endif
	  
	  /* Release the buffer after the last block has used it. */
#ifndef LOWER
	  if (is + min_i >= m_to) {
#else
	  if (is + min_i >= m_to - start_i) {
#endif
	    /* Thread doesn't need this buffer any more */
	    job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
	    WMB;
	  }
	}
	
#ifndef LOWER
	current ++;
      } while (current != args -> nthreads);
#else
	current --;
      } while (current >= 0);
#endif
	
     
    }
  }
  
  START_RPCC();

  /* Do not return before every other thread has released all of this
     thread's sub-buffers. */
  for (i = 0; i < args -> nthreads; i++) {
    if (i != mypos) {
      for (xxx = 0; xxx < DIVIDE_RATE; xxx++) {
	while (job[mypos].working[i][CACHE_LINE_SIZE * xxx] ) {YIELDING;};
      }
    }
  }

  STOP_RPCC(waiting3);

#ifdef TIMING
  /* Report each phase as a percentage of the summed cycle counters. */
  BLASLONG waiting = waiting1 + waiting2 + waiting3;
  BLASLONG total = copy_A + copy_B + kernel + waiting;

  fprintf(stderr, "GEMM   [%2ld] Copy_A : %6.2f  Copy_B : %6.2f  Wait1 : %6.2f Wait2 : %6.2f Wait3 : %6.2f Kernel : %6.2f",
	  mypos, (double)copy_A /(double)total * 100., (double)copy_B /(double)total * 100.,
	  (double)waiting1 /(double)total * 100.,
	  (double)waiting2 /(double)total * 100.,
	  (double)waiting3 /(double)total * 100.,
	  (double)ops/(double)kernel / 4. * 100.);

#if 0
  fprintf(stderr, "GEMM   [%2ld] Copy_A : %6.2ld  Copy_B : %6.2ld  Wait : %6.2ld\n",
	  mypos, copy_A, copy_B, waiting);

  fprintf(stderr, "Waiting[%2ld] %6.2f %6.2f %6.2f\n",
	  mypos,
	  (double)waiting1/(double)waiting * 100.,
	  (double)waiting2/(double)waiting * 100.,
	  (double)waiting3/(double)waiting * 100.);
#endif
  fprintf(stderr, "\n");
#endif

  return 0;
}

/*
 * Threaded SYRK/HERK driver.
 *
 * Falls back to SYRK_LOCAL when only one thread is requested or when
 * args->n is smaller than nthreads * SWITCH_RATIO.  Otherwise the index
 * range [n_from, n_to) is cut into up to nthreads slices whose widths are
 * chosen with a square-root rule (width = sqrt(i*i + n*n/nthreads) - i,
 * rounded up to the kernel unroll size), one inner_thread job is queued per
 * slice and the queue is run with exec_blas.
 *
 * args     problem description
 * range_m  forwarded unchanged to every queued job
 * range_n  optional [from, to) index range, NULL for 0..args->n
 * sa, sb   scratch buffers handed to the first queued job
 * mypos    unused
 *
 * Always returns 0 (exits the process if the job table cannot be
 * allocated under USE_ALLOC_HEAP).
 */
int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos){

  blas_arg_t newarg;

#ifndef USE_ALLOC_HEAP
  job_t          job[MAX_CPU_NUMBER];
#else
  job_t *        job = NULL;
#endif

  blas_queue_t queue[MAX_CPU_NUMBER];

  /* Slice boundaries handed to the workers as their range_n. */
  BLASLONG range[MAX_CPU_NUMBER + 100];

  BLASLONG num_cpu;

  BLASLONG nthreads = args -> nthreads;

  BLASLONG width, i, j, k;
  BLASLONG n, n_from, n_to;
  int  mode, mask;
  double dnum;

  /* Too little work to split: run the single-threaded routine instead. */
  if ((nthreads  == 1) || (args -> n < nthreads * SWITCH_RATIO)) {
    SYRK_LOCAL(args, range_m, range_n, sa, sb, 0); 
    return 0;
  }

  /* mode describes the data type for the scheduler; mask is the larger
     kernel unroll factor minus one and is used to round slice widths. */
#ifndef COMPLEX
#ifdef XDOUBLE
  mode  =  BLAS_XDOUBLE | BLAS_REAL;
  mask  = MAX(QGEMM_UNROLL_M, QGEMM_UNROLL_N) - 1;
#elif defined(DOUBLE)
  mode  =  BLAS_DOUBLE  | BLAS_REAL;
  mask  = MAX(DGEMM_UNROLL_M, DGEMM_UNROLL_N) - 1;
#else
  mode  =  BLAS_SINGLE  | BLAS_REAL;
  mask  = MAX(SGEMM_UNROLL_M, SGEMM_UNROLL_N) - 1;
#endif  
#else
#ifdef XDOUBLE
  mode  =  BLAS_XDOUBLE | BLAS_COMPLEX;
  mask  = MAX(XGEMM_UNROLL_M, XGEMM_UNROLL_N) - 1;
#elif defined(DOUBLE)
  mode  =  BLAS_DOUBLE  | BLAS_COMPLEX;
  mask  = MAX(ZGEMM_UNROLL_M, ZGEMM_UNROLL_N) - 1;
#else
  mode  =  BLAS_SINGLE  | BLAS_COMPLEX;
  mask  = MAX(CGEMM_UNROLL_M, CGEMM_UNROLL_N) - 1;
#endif  
#endif

  /* Private copy of the arguments; nthreads and common are filled in below. */
  newarg.m        = args -> m;
  newarg.n        = args -> n;
  newarg.k        = args -> k;
  newarg.a        = args -> a;
  newarg.b        = args -> b;
  newarg.c        = args -> c;
  newarg.lda      = args -> lda;
  newarg.ldb      = args -> ldb;
  newarg.ldc      = args -> ldc;
  newarg.alpha    = args -> alpha;
  newarg.beta     = args -> beta;

#ifdef USE_ALLOC_HEAP
  job = (job_t*)malloc(MAX_CPU_NUMBER * sizeof(job_t));
  if(job==NULL){
    fprintf(stderr, "OpenBLAS: malloc failed in %s\n", __func__);
    exit(1);
  }
#endif

  newarg.common   = (void *)job;
   
  /* range_n is a [from, to) pair.  The upper bound used to be stored as
     range_n[1] - range_n[0], so that n below subtracted the lower bound
     twice and the slices always started at 0. */
  if (!range_n) {
    n_from = 0;
    n_to   = args -> n;
  } else {
    n_from = range_n[0];
    n_to   = range_n[1];
  }

#ifndef LOWER

  /* Without LOWER the slices are built from the top end downwards: the
     boundaries are stored at the end of range[] so that, once num_cpu is
     known, &range[MAX_CPU_NUMBER - num_cpu] is an ascending partition that
     starts at n_from and ends at n_to. */
  range[MAX_CPU_NUMBER] = n_to;
  num_cpu  = 0;
  i        = 0;
  n        = n_to - n_from;

  dnum = (double)n * (double)n /(double)nthreads;

  while (i < n){
    
    if (nthreads - num_cpu > 1) {
      
      double di   = (double)i;
      
      width = (((BLASLONG)(sqrt(di * di + dnum) - di) + mask) & ~mask);
      
      /* The first slice absorbs the remainder so that the rest of the
         range is a multiple of the unroll size. */
      if (num_cpu == 0) width = n - ((n - width) & ~mask);
      
      /* Oversized or very small widths: take everything that is left. */
      if ((width > n - i) || (width < mask)) width = n - i;
      
    } else {
      /* Last available thread takes the rest. */
      width = n - i;
    }

    range[MAX_CPU_NUMBER - num_cpu - 1] = range[MAX_CPU_NUMBER - num_cpu] - width;

    queue[num_cpu].mode    = mode;
    queue[num_cpu].routine = inner_thread;
    queue[num_cpu].args    = &newarg;
    queue[num_cpu].range_m = range_m;

    queue[num_cpu].sa      = NULL;
    queue[num_cpu].sb      = NULL;
    queue[num_cpu].next    = &queue[num_cpu + 1];
    
    num_cpu ++;
    i += width;
  }

   for (i = 0; i < num_cpu; i ++) queue[i].range_n = &range[MAX_CPU_NUMBER - num_cpu];

#else

  /* With LOWER the slices are built upwards starting at n_from. */
  range[0] = n_from;
  num_cpu  = 0;
  i        = 0;
  n        = n_to - n_from;

  dnum = (double)n * (double)n /(double)nthreads;

  while (i < n){
    
    if (nthreads - num_cpu > 1) {
      
	double di   = (double)i;
	
	width = (((BLASLONG)(sqrt(di * di + dnum) - di) + mask) & ~mask);
	
      /* Oversized or very small widths: take everything that is left. */
      if ((width > n - i) || (width < mask)) width = n - i;
	
    } else {
      /* Last available thread takes the rest. */
      width = n - i;
    }

    range[num_cpu + 1] = range[num_cpu] + width;
    
    queue[num_cpu].mode    = mode;
    queue[num_cpu].routine = inner_thread;
    queue[num_cpu].args    = &newarg;
    queue[num_cpu].range_m = range_m;
    queue[num_cpu].range_n = range;
    queue[num_cpu].sa      = NULL;
    queue[num_cpu].sb      = NULL;
    queue[num_cpu].next    = &queue[num_cpu + 1];
    
    num_cpu ++;
    i += width;
  }

#endif

  /* The workers index the partition and the flag table with this count. */
  newarg.nthreads = num_cpu;

  if (num_cpu) {

    /* Clear every hand-off flag before the workers start. */
    for (j = 0; j < num_cpu; j++) {
      for (i = 0; i < num_cpu; i++) {
	for (k = 0; k < DIVIDE_RATE; k++) {
	  job[j].working[i][CACHE_LINE_SIZE * k] = 0;
	}
      }
    }
    
    /* The caller's buffers go to the first job; terminate the job list. */
    queue[0].sa = sa;
    queue[0].sb = sb;
    queue[num_cpu - 1].next = NULL;
    
    exec_blas(num_cpu, queue);
  }
 
#ifdef USE_ALLOC_HEAP
  free(job);
#endif

  return 0;
}
Beispiel #5
0
/*
 * Per-thread worker of a multithreaded single-precision GEMM.
 *
 * Thread `mypos` (one of `nthreads`) owns the rows [m_from, m_to) of C and the
 * column slice [n_from, n_to) of B.  For every k-panel it
 *   1. copies a block of A into its private buffer `sa`,
 *   2. copies its own column slice of B into `sb` (split into DIVIDE_RATE
 *      sub-buffers) and publishes each sub-buffer's address in
 *      job[mypos].working[i][...] for every thread i,
 *   3. runs the kernel on its rows against the B sub-buffers published by
 *      every other thread, clearing job[current].working[mypos][...] once it
 *      no longer needs a given sub-buffer.
 *
 * mypos    index of the calling thread.
 * nthreads number of cooperating threads.
 * args     problem description (transa, m, n, k, alpha, beta, a, b, c,
 *          lda, ldb, ldc, ks0).
 * range_m  if non-NULL, row partition; this thread uses
 *          range_m[mypos] .. range_m[mypos + 1].
 * range_n  column partition; this thread uses
 *          range_n[mypos] .. range_n[mypos + 1], and reads the other
 *          threads' entries when consuming their buffers.
 *
 * Returns 0 in all cases.
 *
 * `saa`, `sba` and `job` are not declared in this function; they are taken
 * from the enclosing file scope.
 *
 * NOTE(review): range_n is NULL-checked when n_from/n_to are computed but is
 * dereferenced unconditionally in the consumer loops below -- callers
 * presumably always pass a non-NULL range_n; confirm.
 * NOTE(review): the TIMING block iterates up to `num_threads`, which is
 * neither a local nor a parameter here (the parameter is `nthreads`) --
 * presumably a file-scope variable; confirm.
 */
static int gemm_thread(long mypos, long nthreads, struct sgemmargs *args, long *range_m, long *range_n)
{
	float *buffer[DIVIDE_RATE];
	float *sa, *sb;
	long m_from, m_to, n_from, n_to;
	long xxx, bufferside;
	long ls, min_l, jjs, min_jj;
	long is, min_i, div_n;
	long i, current;
	long l1stride;
	char transa = args->transa;
	long m = args->m;
	long n = args->n;
	long k = args->k;
	float alpha = args->alpha;
	float beta = args->beta;
	float *a = args->a;
	float *b = args->b;
	float *c = args->c;
	long lda = args->lda;
	long ldb = args->ldb;
	long ldc = args->ldc;

#ifdef TIMING
	unsigned long rpcc_counter;
	unsigned long copy_A = 0;
	unsigned long copy_B = 0;
	unsigned long kernel = 0;
	unsigned long waiting1 = 0;
	unsigned long waiting2 = 0;
	unsigned long waiting3 = 0;
	unsigned long waiting6[MAX_CPU_NUMBER];
	unsigned long ops    = 0;

	for (i = 0; i < num_threads; i++)
		waiting6[i] = 0;
#endif
	/* Private work buffers of this thread. */
	sa = saa[mypos];
	sb = sba[mypos];
	/* Default to the whole matrix; narrow to this thread's slice when a
	 * partition is supplied. */
	m_from = 0;
	m_to = m;

	if(range_m)
	{
		m_from = range_m[mypos + 0];
		m_to   = range_m[mypos + 1];
	}
	n_from = 0;
	n_to   = n;
	if (range_n)
	{
		n_from = range_n[mypos + 0];
		n_to   = range_n[mypos + 1];
	}
	/* beta is applied to this thread's rows across all n columns (not just
	 * [n_from, n_to)), and skipped when beta is exactly 1. */
	if(beta != 1)
		BETA_OPERATION(m_from, m_to, 0, n, beta, c, ldc);
	/* Nothing left to accumulate into C. */
	if(k == 0 || alpha == 0)
		return 0;
#if 0
	fprintf(stderr, "Thread[%ld]  m_from : %ld m_to : %ld n_from : %ld n_to : %ld\n",
		mypos, m_from, m_to, n_from, n_to);
	fprintf(stderr, "GEMM: P = %4ld  Q = %4ld  R = %4ld\n", (long)GEMM_P, (long)GEMM_Q, (long)GEMM_R);
#endif
	/* Split this thread's column slice into DIVIDE_RATE chunks (rounded up)
	 * and carve sb into one sub-buffer per chunk.  Each sub-buffer holds
	 * GEMM_Q * (div_n rounded up by the GEMM_UNROLL_N mask) floats; the mask
	 * rounding assumes GEMM_UNROLL_N is a power of two. */
	div_n = (n_to - n_from + DIVIDE_RATE - 1) / DIVIDE_RATE;
	buffer[0] = sb;
	for (i = 1; i < DIVIDE_RATE; i++)
		buffer[i] = buffer[i - 1] + GEMM_Q * ((div_n + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1));
	/* Walk the k dimension in panels of min_l. */
	for(ls = 0; ls < k; ls += min_l)
	{
		/* Panel depth: GEMM_Q while at least 2 * GEMM_Q remains, half of the
		 * remainder (rounded up) when between GEMM_Q and 2 * GEMM_Q, otherwise
		 * everything that is left. */
		min_l = k - ls;
		if (min_l >= GEMM_Q * 2)
			min_l  = GEMM_Q;
		else if (min_l > GEMM_Q)
			min_l = (min_l + 1) / 2;
		/* First row block.  l1stride scales the offset into the B sub-buffer
		 * below; it is zeroed only when all rows fit in one block
		 * (<= GEMM_P) and there is a single thread. */
		l1stride = 1;
		min_i = m_to - m_from;
		if (min_i >= GEMM_P * 2)
			min_i = GEMM_P;
		else if(min_i > GEMM_P)
			min_i = (min_i / 2 + GEMM_UNROLL_M - 1) & ~(GEMM_UNROLL_M - 1);
		else if (nthreads == 1)
			l1stride = 0;
		START_RPCC();
		//printf("icopy%ld (%ld,%ld)%ld (%ld,%ld)\n", mypos, min_l, min_i, lda, ls, m_from);
		/* Copy the (ls, m_from) block of A into sa; the copy routine depends
		 * on transa and on args->ks0. */
		if(transa)
			ICOPYT_OPERATION(min_l, min_i, a, lda, ls, m_from, sa);
		else if(args->ks0)
			icopy_operation(min_l, min_i, args, ls, m_from, sa);
		else ICOPY_OPERATION(min_l, min_i, a, lda, ls, m_from, sa);
		STOP_RPCC(copy_A);
		/* div_n is overwritten by the consumer loops further down, so restore
		 * the value for this thread's own column slice on every panel. */
		div_n = (n_to - n_from + DIVIDE_RATE - 1) / DIVIDE_RATE;
		/* Producer phase: fill each sub-buffer with this thread's columns of
		 * B, compute the matching part of C, then publish the sub-buffer. */
		for (xxx = n_from, bufferside = 0; xxx < n_to; xxx += div_n, bufferside ++)
		{
			START_RPCC();
			/* Spin until every thread's slot for this sub-buffer is zero, i.e.
			 * nobody is still reading its previous contents. */
			for (i = 0; i < nthreads; i++)
				while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside])
					{YIELDING;}
			STOP_RPCC(waiting1);
			/* Process the chunk in strips of at most 3 * GEMM_UNROLL_N columns;
			 * a remainder between GEMM_UNROLL_N and 3 * GEMM_UNROLL_N is taken
			 * GEMM_UNROLL_N columns at a time. */
			for(jjs = xxx; jjs < MIN(n_to, xxx + div_n); jjs += min_jj)
			{
				min_jj = MIN(n_to, xxx + div_n) - jjs;
				if(min_jj >= 3*GEMM_UNROLL_N)
					min_jj = 3*GEMM_UNROLL_N;
				else if (min_jj > GEMM_UNROLL_N)
					min_jj = GEMM_UNROLL_N;
				START_RPCC();
				OCOPY_OPERATION(min_l, min_jj, b, ldb, ls, jjs, buffer[bufferside] + min_l * (jjs - xxx) * l1stride);
				STOP_RPCC(copy_B);
				START_RPCC();
				KERNEL_OPERATION(min_i, min_jj, min_l, alpha, sa, buffer[bufferside] + min_l * (jjs - xxx) * l1stride, c, ldc, m_from, jjs);
				STOP_RPCC(kernel);
#ifdef TIMING
				ops += 2 * min_i * min_jj * min_l;
#endif
			}
			/* Publish the sub-buffer: a non-zero slot (the buffer address) is
			 * what the consumer loops wait for.  WMB follows the stores --
			 * presumably a write memory barrier; defined elsewhere. */
			for (i = 0; i < nthreads; i++)
				job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (long)buffer[bufferside];
			WMB;
		}
		/* Consumer phase for the first row block (still held in sa): visit
		 * every thread's column slice, starting with the thread after this
		 * one and finishing with this thread itself. */
		current = mypos;
		do {
			current ++;
			if(current >= nthreads)
				current = 0;
			div_n = (range_n[current + 1]  - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE;
			for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++)
			{
				/* This thread's own slice was already multiplied in the
				 * producer phase above, so only foreign slices run the kernel. */
				if (current != mypos)
				{
					START_RPCC();
					/* Spin until thread `current` has published this sub-buffer. */
					while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0)
						{YIELDING;}
					STOP_RPCC(waiting2);
					START_RPCC();
					KERNEL_OPERATION(min_i, MIN(range_n[current + 1]  - xxx,  div_n), min_l, alpha, sa,
						(float *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside], c, ldc, m_from, xxx);
					STOP_RPCC(kernel);
	#ifdef TIMING
					ops += 2 * min_i * MIN(range_n[current + 1]  - xxx,  div_n) * min_l;
	#endif
				}
				/* If the first row block already covers all of this thread's
				 * rows, the sub-buffer is not needed again: release the slot
				 * (this also covers the current == mypos case). */
				if (m_to - m_from == min_i)
					job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
			}
		} while (current != mypos);
		/* Remaining row blocks of this thread for the same k-panel. */
		for(is = m_from + min_i; is < m_to; is += min_i)
		{
			min_i = m_to - is;
			if (min_i >= GEMM_P * 2)
				min_i = GEMM_P;
			else if (min_i > GEMM_P)
				min_i = ((min_i + 1) / 2 + GEMM_UNROLL_M - 1) & ~(GEMM_UNROLL_M - 1);
			START_RPCC();
			//printf("icopya%ld (%ld,%ld)%ld (%ld,%ld)\n", mypos, min_l, min_i, lda, ls, is);
			/* Refill sa with the next row block of A. */
			if(transa)
				ICOPYT_OPERATION(min_l, min_i, a, lda, ls, is, sa);
			else if(args->ks0)
				icopy_operation(min_l, min_i, args, ls, is, sa);
			else ICOPY_OPERATION(min_l, min_i, a, lda, ls, is, sa);
			STOP_RPCC(copy_A);
			/* Multiply against every thread's sub-buffers, starting with this
			 * thread's own.  No waiting here: the slots are read directly, all
			 * of them having been waited on (or written by this thread) in the
			 * first-row-block phase above. */
			current = mypos;
			do {
				div_n = (range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE;
				for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++)
				{
					START_RPCC();
					KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), min_l, alpha, sa,
						(float *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside], c, ldc, is, xxx);
					STOP_RPCC(kernel);
#ifdef TIMING
					ops += 2 * min_i * MIN(range_n[current + 1]  - xxx, div_n) * min_l;
#endif
					if(is + min_i >= m_to)
					{
						/* Last row block: this thread no longer needs the
						 * sub-buffer, so release its slot. */
						job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
						WMB;
					}
				}
				current ++;
				if(current >= nthreads)
					current = 0;
			} while (current != mypos);
		}
	}
	/* Before returning, spin until every slot of this thread's own
	 * sub-buffers has been cleared by its consumer. */
	START_RPCC();
	for (i = 0; i < nthreads; i++)
		for (xxx = 0; xxx < DIVIDE_RATE; xxx++)
			while (job[mypos].working[i][CACHE_LINE_SIZE * xxx] )
				{YIELDING;}
	STOP_RPCC(waiting3);
#ifdef TIMING
	/* Report each phase as a percentage of the total measured time. */
	long waiting = waiting1 + waiting2 + waiting3;
	long total = copy_A + copy_B + kernel + waiting;

	fprintf(stderr, "GEMM   [%2ld] Copy_A : %6.2f  Copy_B : %6.2f  Wait1 : %6.2f Wait2 : %6.2f Wait3 : %6.2f Kernel : %6.2f",
	mypos, (double)copy_A /(double)total * 100., (double)copy_B /(double)total * 100.,
	(double)waiting1 /(double)total * 100.,
	(double)waiting2 /(double)total * 100.,
	(double)waiting3 /(double)total * 100.,
	(double)ops/(double)kernel / 4. * 100.);

#if 0
	fprintf(stderr, "GEMM   [%2ld] Copy_A : %6.2ld  Copy_B : %6.2ld  Wait : %6.2ld\n",
	mypos, copy_A, copy_B, waiting);

	fprintf(stderr, "Waiting[%2ld] %6.2f %6.2f %6.2f\n", mypos,
		(double)waiting1/(double)waiting * 100.,
		(double)waiting2/(double)waiting * 100.,
		(double)waiting3/(double)waiting * 100.);
#endif
	fprintf(stderr, "\n");
#endif
	return 0;
}