Пример #1
0
/*
 * Per-thread worker for the threaded GEMM-style update of C.
 *
 * Rows handled by this thread are [m_from, m_to), taken from range_m[0..1]
 * when range_m is given (otherwise 0..M).  The columns this thread packs are
 * [range_n[mypos], range_n[mypos + 1]); the thread then updates its rows for
 * every thread's column slice by consuming the panels the other threads
 * packed.
 *
 * Hand-off protocol (producer thread t, consumer thread u, sub-buffer s):
 *   job[t].working[u][CACHE_LINE_SIZE * s] is 0 while the panel is not
 *   available / already released, and otherwise holds the address of the
 *   packed panel.  A producer spins until all of its slots for a sub-buffer
 *   are 0 before refilling it; a consumer resets its slot to 0 after its
 *   last row block.
 *
 * args    : problem description.  K, A, B, C, LDA, LDB, LDC, M and N are
 *           macros defined outside this block (presumably fields of args).
 * range_m : optional row range {from, to} for this thread.
 * range_n : column split points, indexed by thread position.
 * sa, sb  : scratch buffers; sa receives the ICOPY_OPERATION output, sb is
 *           carved into DIVIDE_RATE sub-buffers for OCOPY_OPERATION output.
 * mypos   : index of the calling thread.
 *
 * Always returns 0.
 *
 * NOTE(review): range_n is NULL-checked when n_from/n_to are computed but is
 * dereferenced unconditionally in the consumer loops further down.
 */
static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos){

  /* Start address of each of the DIVIDE_RATE sub-buffers inside sb. */
  FLOAT *buffer[DIVIDE_RATE];

  BLASLONG k, lda, ldb, ldc;
  BLASLONG m_from, m_to, n_from, n_to, N_from, N_to;

  FLOAT *alpha, *beta;
  FLOAT *a, *b, *c;
  job_t *job = (job_t *)args -> common;
  BLASLONG xxx, bufferside;

  BLASLONG ls, min_l, jjs, min_jj;
  BLASLONG is, min_i, div_n;

  BLASLONG i, current;
  /* NOTE(review): l2size is assigned below but never read in this function. */
  BLASLONG l1stride, l2size;

#ifdef TIMING
  /* Cycle accumulators fed by the START_RPCC/STOP_RPCC macros. */
  BLASULONG rpcc_counter;
  BLASULONG copy_A = 0;
  BLASULONG copy_B = 0;
  BLASULONG kernel = 0;
  BLASULONG waiting1 = 0;
  BLASULONG waiting2 = 0;
  BLASULONG waiting3 = 0;
  BLASULONG waiting6[MAX_CPU_NUMBER];
  BLASULONG ops    = 0;

  for (i = 0; i < args -> nthreads; i++) waiting6[i] = 0;
#endif

  k = K;

  a = (FLOAT *)A;
  b = (FLOAT *)B;
  c = (FLOAT *)C;

  lda = LDA;
  ldb = LDB;
  ldc = LDC;

  alpha = (FLOAT *)args -> alpha;
  beta  = (FLOAT *)args -> beta;

  m_from = 0;
  m_to   = M;

  if (range_m) {
    m_from = range_m[0];
    m_to   = range_m[1];
  }

  /* n_from/n_to: the column slice this thread packs. */
  n_from = 0;
  n_to   = N;

  /* N_from/N_to: the column span covered by all threads together. */
  N_from = 0;
  N_to   = N;

  if (range_n) {
    n_from = range_n[mypos + 0];
    n_to   = range_n[mypos + 1];

    N_from = range_n[0];
    N_to   = range_n[args -> nthreads];
  }

  /* Scale this thread's rows of C by beta across the whole column span,
     skipping the call when beta equals one. */
  if (beta) {
#ifndef COMPLEX
    if (beta[0] != ONE)
#else
    if ((beta[0] != ONE) || (beta[1] != ZERO))
#endif
      BETA_OPERATION(m_from, m_to, N_from, N_to, beta, c, ldc);
  }

  /* Nothing to accumulate when K is empty or alpha is absent/zero. */
  if ((k == 0) || (alpha == NULL)) return 0;

  if ((alpha[0] == ZERO)
#ifdef COMPLEX
      && (alpha[1] == ZERO)
#endif
      ) return 0;

  l2size = GEMM_P * GEMM_Q;

#if 0
  fprintf(stderr, "Thread[%ld]  m_from : %ld m_to : %ld n_from : %ld n_to : %ld N_from : %ld N_to : %ld\n",
	  mypos, m_from, m_to, n_from, n_to, N_from, N_to);

  fprintf(stderr, "GEMM: P = %4ld  Q = %4ld  R = %4ld\n", (BLASLONG)GEMM_P, (BLASLONG)GEMM_Q, (BLASLONG)GEMM_R);

#endif

  /* Width of one sub-slice: own column count divided by DIVIDE_RATE, rounded up. */
  div_n = (n_to - n_from + DIVIDE_RATE - 1) / DIVIDE_RATE;
  
  /* Lay the sub-buffers out back to back; each is sized for GEMM_Q rows of
     div_n columns rounded up to a multiple of GEMM_UNROLL_N. */
  buffer[0] = sb;
  for (i = 1; i < DIVIDE_RATE; i++) {
    buffer[i] = buffer[i - 1] + GEMM_Q * ((div_n + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1)) * COMPSIZE;
  }
  

  /* Walk the K dimension in chunks of min_l. */
  for(ls = 0; ls < k; ls += min_l){

    min_l = k - ls;

    /* At most GEMM_Q per chunk; a remainder between GEMM_Q and 2*GEMM_Q is
       split into two halves. */
    if (min_l >= GEMM_Q * 2) {
      min_l  = GEMM_Q;
    } else {
      if (min_l > GEMM_Q) min_l = (min_l + 1) / 2;
    }

    /* Size of the first row block.  l1stride becomes 0 only when all rows
       fit into one block and there is a single thread; the packed-panel
       offset below is multiplied by it, so every strip then reuses the start
       of the sub-buffer. */
    l1stride = 1;
    min_i = m_to - m_from;
    
    if (min_i >= GEMM_P * 2) {
      min_i = GEMM_P;
    } else {
      if (min_i > GEMM_P) {
	min_i = (min_i / 2 + GEMM_UNROLL_M - 1) & ~(GEMM_UNROLL_M - 1);
      } else {
	if (args -> nthreads == 1) l1stride = 0;
      }
    }

    START_RPCC();
    
    ICOPY_OPERATION(min_l, min_i, a, lda, ls, m_from, sa);
    
    STOP_RPCC(copy_A);
    
    div_n = (n_to - n_from + DIVIDE_RATE - 1) / DIVIDE_RATE;
    
    /* Producer phase: pack this thread's own column slice, one sub-buffer at
       a time, applying the first row block as each strip is packed. */
    for (xxx = n_from, bufferside = 0; xxx < n_to; xxx += div_n, bufferside ++) {
      
      START_RPCC();
      
      /* Wait until every consumer has released this sub-buffer. */
      for (i = 0; i < args -> nthreads; i++)
	while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;};
      
      STOP_RPCC(waiting1);
      
#if defined(FUSED_GEMM) && !defined(TIMING)

      FUSED_KERNEL_OPERATION(min_i, MIN(n_to, xxx + div_n) - xxx, min_l, alpha,
			     sa, buffer[bufferside], b, ldb, c, ldc, m_from, xxx, ls);

#else

      /* Process the sub-slice in strips of at most GEMM_UNROLL_N columns. */
      for(jjs = xxx; jjs < MIN(n_to, xxx + div_n); jjs += min_jj){
	min_jj = MIN(n_to, xxx + div_n) - jjs;
	if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
	
	START_RPCC();
	
	OCOPY_OPERATION(min_l, min_jj, b, ldb, ls, jjs, 
			buffer[bufferside] + min_l * (jjs - xxx) * COMPSIZE * l1stride);
	
	STOP_RPCC(copy_B);
	
	START_RPCC();
	
	KERNEL_OPERATION(min_i, min_jj, min_l, alpha,
			 sa, buffer[bufferside] + min_l * (jjs - xxx) * COMPSIZE * l1stride,
			 c, ldc, m_from, jjs);
	  
	STOP_RPCC(kernel);

#ifdef TIMING
	  ops += 2 * min_i * min_jj * min_l;
#endif

      }
#endif
	
      /* Publish the sub-buffer address to every thread's slot (own slot included). */
      for (i = 0; i < args -> nthreads; i++) job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside];
      WMB;
    }

    /* Consumer phase for the first row block: visit the other threads in
       round-robin order starting after mypos, and finish on mypos itself. */
    current = mypos;
    
    do {
      current ++;
      if (current >= args -> nthreads) current = 0;
      
      div_n = (range_n[current + 1]  - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE;
      
      for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) {
	
	/* The own slice was already applied during the producer phase. */
	if (current != mypos) {
	  
	  START_RPCC();
	  
	  /* Spin until thread `current` has published this sub-buffer. */
	  while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;};
	  
	  STOP_RPCC(waiting2);
	    
	  START_RPCC();

	  KERNEL_OPERATION(min_i, MIN(range_n[current + 1]  - xxx,  div_n), min_l, alpha,
			   sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside],
			   c, ldc, m_from, xxx);

	STOP_RPCC(kernel);
#ifdef TIMING
	  ops += 2 * min_i * MIN(range_n[current + 1]  - xxx,  div_n) * min_l;
#endif
	}
	
	/* If the first row block covers all rows, no later pass will use this
	   panel, so release the slot now.
	   NOTE(review): no WMB follows this store, unlike the release in the
	   remaining-row-blocks loop below. */
	if (m_to - m_from == min_i) {
	  job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
	}
      }
    } while (current != mypos);
    

    /* Remaining row blocks: all panels are already published, so no waiting
       is needed; only A has to be repacked for each block. */
    for(is = m_from + min_i; is < m_to; is += min_i){
      min_i = m_to - is;

      if (min_i >= GEMM_P * 2) {
	min_i = GEMM_P;
      } else 
	if (min_i > GEMM_P) {
	  min_i = ((min_i + 1) / 2 + GEMM_UNROLL_M - 1) & ~(GEMM_UNROLL_M - 1);
	}
      
      START_RPCC();
      
      ICOPY_OPERATION(min_l, min_i, a, lda, ls, is, sa);
      
      STOP_RPCC(copy_A);
      
      current = mypos;
      do {
	
	div_n = (range_n[current + 1]  - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE;
	
	for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) {
	  
	  START_RPCC();
	  
	  KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), min_l, alpha,
			   sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside],
			   c, ldc, is, xxx);
	  
	STOP_RPCC(kernel);

#ifdef TIMING
	ops += 2 * min_i * MIN(range_n[current + 1]  - xxx, div_n) * min_l;
#endif

	/* Last row block of this K chunk: release the panel. */
	if (is + min_i >= m_to) {
	  /* Thread doesn't need this buffer any more */
	  job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
	  WMB;
	}
	}
	
	current ++;
	if (current >= args -> nthreads) current = 0;
	
      } while (current != mypos);
      
    }
    
  }
  
  START_RPCC();

  /* Do not return while any consumer still holds one of this thread's
     sub-buffers (all of this thread's slots must read 0). */
  for (i = 0; i < args -> nthreads; i++) {
    for (xxx = 0; xxx < DIVIDE_RATE; xxx++) {
      while (job[mypos].working[i][CACHE_LINE_SIZE * xxx] ) {YIELDING;};
    }
  }

  STOP_RPCC(waiting3);

#ifdef TIMING
  /* Report each phase as a percentage of the total measured cycles. */
  BLASLONG waiting = waiting1 + waiting2 + waiting3;
  BLASLONG total = copy_A + copy_B + kernel + waiting;

  fprintf(stderr, "GEMM   [%2ld] Copy_A : %6.2f  Copy_B : %6.2f  Wait1 : %6.2f Wait2 : %6.2f Wait3 : %6.2f Kernel : %6.2f",
	  mypos, (double)copy_A /(double)total * 100., (double)copy_B /(double)total * 100.,
	  (double)waiting1 /(double)total * 100.,
	  (double)waiting2 /(double)total * 100.,
	  (double)waiting3 /(double)total * 100.,
	  (double)ops/(double)kernel / 4. * 100.);

#if 0
  fprintf(stderr, "GEMM   [%2ld] Copy_A : %6.2ld  Copy_B : %6.2ld  Wait : %6.2ld\n",
	  mypos, copy_A, copy_B, waiting);

  fprintf(stderr, "Waiting[%2ld] %6.2f %6.2f %6.2f\n",
	  mypos,
	  (double)waiting1/(double)waiting * 100.,
	  (double)waiting2/(double)waiting * 100.,
	  (double)waiting3/(double)waiting * 100.);
#endif
  fprintf(stderr, "\n");
#endif

  return 0;
}
Пример #2
0
/*
 * Per-thread worker that applies row interchanges and a triangular solve to
 * this thread's column slice, then updates the trailing block using the
 * panels produced by all threads.
 *
 * All three matrix pointers are offsets into args->b:
 *   a = args->b + k              (rows below the leading k x k block)
 *   b = args->b + k * lda        (columns to the right of it)
 *   c = args->b + k + k * lda    (trailing block)
 * args->c is read as the pivot array, args->d as an array of per-thread
 * flags, and args->ldb as the pivot offset `off`.
 *
 * Panels are handed between threads through job[t].working[u][...] slots:
 * 0 means "free / not ready", any other value is the packed panel address.
 *
 * range_m : {from, to} row range of the trailing block for this thread.
 * range_n : column split points, indexed by thread position.
 * sa, sb  : scratch buffers.
 * mypos   : index of the calling thread.
 *
 * Always returns 0.
 *
 * NOTE(review): the spin loops here have empty bodies and no WMB follows the
 * slot stores; whether that is safe depends on how job_t::working is
 * declared, which is not visible in this block.
 */
static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos){

  job_t *job = (job_t *)args -> common;

  BLASLONG xxx, bufferside;

  FLOAT *buffer[DIVIDE_RATE];

  BLASLONG jjs, min_jj, div_n;

  BLASLONG i, current;
  BLASLONG is, min_i;

  BLASLONG m, n_from, n_to;
  BLASLONG k = args -> k;

  BLASLONG lda = args -> lda;
  BLASLONG off = args -> ldb;

  FLOAT *a = (FLOAT *)args -> b + (k          ) * COMPSIZE;
  FLOAT *b = (FLOAT *)args -> b + (    k * lda) * COMPSIZE;
  FLOAT *c = (FLOAT *)args -> b + (k + k * lda) * COMPSIZE;
  FLOAT *sbb= sb;

  blasint *ipiv = (blasint *)args -> c;

  volatile BLASLONG *flag = (volatile BLASLONG *)args -> d;

  /* Without a triangle supplied via args->a, copy the leading k x k block
     into sb with TRSM_ILTCOPY and start the panel buffers after it (aligned
     with GEMM_ALIGN, shifted by GEMM_OFFSET_B).  Otherwise args->a is used in
     place of sb as the triangular operand. */
  if (args -> a == NULL) {
    TRSM_ILTCOPY(k, k, (FLOAT *)args -> b, lda, 0, sb);
    sbb = (FLOAT *)((((long)(sb + k * k * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B);
  } else {
    sb  = (FLOAT *)args -> a;
  }

  m      = range_m[1] - range_m[0];
  n_from = range_n[mypos + 0];
  n_to   = range_n[mypos + 1];

  /* Shift a and c to the first row owned by this thread. */
  a     += range_m[0] * COMPSIZE;
  c     += range_m[0] * COMPSIZE;

  div_n = (n_to - n_from + DIVIDE_RATE - 1) / DIVIDE_RATE;
  
  buffer[0] = sbb;


  /* Sub-buffers are laid out back to back, each sized for GEMM_Q rows of
     div_n columns rounded up to a multiple of GEMM_UNROLL_N. */
  for (i = 1; i < DIVIDE_RATE; i++) {
    buffer[i] = buffer[i - 1] + GEMM_Q * ((div_n + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1)) * COMPSIZE;
  }

  /* Producer phase over this thread's own column slice. */
  for (xxx = n_from, bufferside = 0; xxx < n_to; xxx += div_n, bufferside ++) {
    
    /* Wait until every consumer has released this sub-buffer. */
    for (i = 0; i < args -> nthreads; i++)
      while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {};
    
    for(jjs = xxx; jjs < MIN(n_to, xxx + div_n); jjs += min_jj){
      min_jj = MIN(n_to, xxx + div_n) - jjs;
      if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;

      /* Two ways to get the pivoted strip into the panel buffer: a combined
         LASWP_NCOPY call, or LASWP_PLUS followed by GEMM_ONCOPY. */
      if (GEMM_UNROLL_N <= 8) {

	LASWP_NCOPY(min_jj, off + 1, off + k, 
		    b + (- off + jjs * lda) * COMPSIZE, lda,
		    ipiv, buffer[bufferside] + (jjs - xxx) * k * COMPSIZE);

      } else {

	LASWP_PLUS(min_jj, off + 1, off + k, ZERO, 
#ifdef COMPLEX
		   ZERO,
#endif
		   b + (- off + jjs * lda) * COMPSIZE, lda, NULL, 0, ipiv, 1);
	
	GEMM_ONCOPY (k, min_jj, b + jjs * lda * COMPSIZE, lda, 
		     buffer[bufferside] + (jjs - xxx) * k * COMPSIZE);
      }

      /* Triangular solve on the strip, in row blocks of at most GEMM_P. */
      for (is = 0; is < k; is += GEMM_P) {
	min_i = k - is;
	if (min_i > GEMM_P) min_i = GEMM_P;
	
	TRSM_KERNEL_LT(min_i, min_jj, k, dm1,
#ifdef COMPLEX
		       ZERO,
#endif
		       sb + k * is * COMPSIZE,
		       buffer[bufferside] + (jjs - xxx) * k * COMPSIZE, 
		       b   + (is + jjs * lda) * COMPSIZE, lda, is);
      }
    }
	
    /* Publish the finished panel to every thread's slot. */
    for (i = 0; i < args -> nthreads; i++)
      job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside];

  }
  
  /* Clear this thread's entry in the flag array (its meaning is defined by
     the caller, which is not visible here). */
  flag[mypos * CACHE_LINE_SIZE] = 0;
  
  /* With no rows to update, the loop below never runs, so the slots this
     thread published to itself are released here instead. */
  if (m == 0) {
    for (xxx = 0; xxx < DIVIDE_RATE; xxx++) {
      job[mypos].working[mypos][CACHE_LINE_SIZE * xxx] = 0;
    }
  }

  /* Consumer phase: update this thread's rows of the trailing block, one row
     block at a time, against every thread's panels. */
  for(is = 0; is < m; is += min_i){
    min_i = m - is;
    if (min_i >= GEMM_P * 2) {
      min_i = GEMM_P;
    } else 
      if (min_i > GEMM_P) {
	min_i = ((min_i + 1) / 2 + GEMM_UNROLL_M - 1) & ~(GEMM_UNROLL_M - 1);
      }
      
      ICOPY_OPERATION(k, min_i, a, lda, 0, is, sa);
      
      current = mypos;

      do {
	
	div_n = (range_n[current + 1]  - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE;
	
	for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) {
	  
	  /* Only the first row block has to wait for other threads' panels. */
	  if ((current != mypos) && (!is)) {
	    	    while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {};
	  }

	  KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), k,
			   sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside],
			   c, lda, is, xxx);
	  
	  /* Last row block: release the panel. */
	  if (is + min_i >= m) {
	    job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0;
	  }
	}
	
	current ++;
	if (current >= args -> nthreads) current = 0;
	
      } while (current != mypos);
  }
  
  /* Stay alive until every consumer has released this thread's panels. */
  for (i = 0; i < args -> nthreads; i++) {
    for (xxx = 0; xxx < DIVIDE_RATE; xxx++) {
      while (job[mypos].working[i][CACHE_LINE_SIZE * xxx] ) {};
    }
  }

  return 0;
}
Пример #3
0
/*
 * Per-thread worker: solves this thread's slice against a k x k triangular
 * block (read through the S macro) and then applies KERNEL_OPERATION to C
 * using the solved panels of the threads it depends on.
 *
 * The slice handled by this thread is [range_n[mypos], range_n[mypos + 1]).
 * Without LOWER a thread publishes its panels to threads 0..mypos and
 * consumes panels from threads mypos..nthreads-1; with LOWER the directions
 * are reversed (publish to mypos..nthreads-1, consume from mypos..0).
 *
 * sb holds the packed triangular block in its first k * k * COMPSIZE
 * elements; the panel sub-buffers start after it.
 *
 * K, A, C, LDA and S are macros defined outside this block.  range_m is not
 * used.  Always returns 0.
 */
static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos){

  FLOAT *buffer[DIVIDE_RATE];

  BLASLONG k, lda;
  BLASLONG m_from, m_to;

  FLOAT *alpha;
  FLOAT *a, *c;
  job_t *job = (job_t *)args -> common;
  BLASLONG xxx, bufferside;

  BLASLONG jjs, min_jj;
  BLASLONG is, min_i, div_n;

  BLASLONG i, current;

  k = K;

  a = (FLOAT *)A;
  c = (FLOAT *)C;

  lda = LDA;

  alpha = (FLOAT *)args -> alpha;

  m_from = range_n[mypos + 0];
  m_to   = range_n[mypos + 1];

#if 0
  fprintf(stderr, "Thread[%ld]  m_from : %ld m_to : %ld\n",  mypos, m_from, m_to);
#endif

  /* Sub-slice width: slice length / DIVIDE_RATE (rounded up), then rounded up
     to a multiple of GEMM_UNROLL_MN. */
  div_n = ((m_to - m_from + DIVIDE_RATE - 1) / DIVIDE_RATE + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);

  /* Panel buffers start after the k x k block in sb, aligned with GEMM_ALIGN
     and shifted by GEMM_OFFSET_B. */
  buffer[0] = (FLOAT *)((((BLASULONG)(sb + k * k * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B);
  for (i = 1; i < DIVIDE_RATE; i++) {
    buffer[i] = buffer[i - 1] + GEMM_Q * div_n * COMPSIZE;
  }
  
  /* Copy the triangular block into the start of sb. */
#ifndef LOWER
  TRSM_IUNCOPY(k, k, (FLOAT *)S, lda, 0, sb);
#else
  TRSM_OLTCOPY(k, k, (FLOAT *)S, lda, 0, sb);
#endif
  
  /* Producer phase: copy and solve this thread's slice, strip by strip. */
  for (xxx = m_from, bufferside = 0; xxx < m_to; xxx += div_n, bufferside ++) {
    
    for(jjs = xxx; jjs < MIN(m_to, xxx + div_n); jjs += min_jj){
      
      min_jj = MIN(m_to, xxx + div_n) - jjs;
      
#ifndef LOWER
      if (min_jj > GEMM_UNROLL_MN) min_jj = GEMM_UNROLL_MN;
#else
      if (min_jj > GEMM_P)         min_jj = GEMM_P;
#endif

#ifndef LOWER
      OCOPY_OPERATION (k, min_jj, a, lda, 0, jjs, buffer[bufferside] + k * (jjs - xxx) * COMPSIZE);

      TRSM_KERNEL     (k, min_jj, k, dm1, 
#ifdef COMPLEX
		       ZERO,
#endif
		       sb,
		       buffer[bufferside] + k * (jjs - xxx) * COMPSIZE,
		       a + jjs * lda * COMPSIZE, lda, 0);
#else
      ICOPY_OPERATION (k, min_jj, a, lda, 0, jjs, buffer[bufferside] + k * (jjs - xxx) * COMPSIZE);

      TRSM_KERNEL     (min_jj, k, k, dm1,
#ifdef COMPLEX
		       ZERO,
#endif
		       buffer[bufferside] + k * (jjs - xxx) * COMPSIZE,
		       sb,
		       a + jjs       * COMPSIZE, lda, 0);
#endif
    }
    
    /* Publish the panel only to the threads that will consume it. */
#ifndef LOWER
    for (i = 0; i <= mypos; i++)
      job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside];
#else
    for (i = mypos; i < args -> nthreads; i++)
      job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside];
#endif
    
    WMB;
  }
  
  /* Size of the first row block of this thread's slice. */
  min_i = m_to - m_from;
  
  if (min_i >= GEMM_P * 2) {
    min_i = GEMM_P;
  } else 
    if (min_i > GEMM_P) {
      min_i = ((min_i + 1) / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
    }
  
#ifndef LOWER
  ICOPY_OPERATION(k, min_i, a, lda, 0, m_from, sa);
#else
  OCOPY_OPERATION(k, min_i, a, lda, 0, m_from, sa);
#endif
  
  /* Consumer phase for the first row block, starting with the own panels. */
  current = mypos;

#ifndef LOWER
  while (current < args -> nthreads)
#else
  while (current >= 0)
#endif
    {
      div_n = ((range_n[current + 1]  - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
      
      for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) {
	
	/* Spin until thread `current` has published this sub-buffer. */
	if (current != mypos) while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;};
	
	KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), k, alpha,
			 sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside],
			 c, lda, m_from, xxx);
	
	/* If the first row block is also the last one, release the panel now. */
	if (m_from + min_i >= m_to) {
	  job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
	  WMB;
	}
      }
      
#ifndef LOWER
      current ++;
#else
      current --;
#endif
    }
  
  /* Remaining row blocks: panels are already available, so no waiting. */
  for(is = m_from + min_i; is < m_to; is += min_i){
    min_i = m_to - is;
    
    if (min_i >= GEMM_P * 2) {
      min_i = GEMM_P;
    } else 
      if (min_i > GEMM_P) {
	min_i = ((min_i + 1) / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
      }
    
#ifndef LOWER
    ICOPY_OPERATION(k, min_i, a, lda, 0, is, sa);
#else
    OCOPY_OPERATION(k, min_i, a, lda, 0, is, sa);
#endif
    
    current = mypos;
    
#ifndef LOWER
    while (current < args -> nthreads)
#else
      while (current >= 0)
#endif
	{
	  div_n = ((range_n[current + 1]  - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
	  
	  for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) {
	    
	    KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), k, alpha,
			     sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside],
			     c, lda, is, xxx);
	    
	    /* Last row block: release the panel. */
	    if (is + min_i >= m_to) {
	      job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
	      WMB;
	    }
	  }	
#ifndef LOWER
	  current ++;
#else
	  current --;
#endif
	}
  }
  
  /* Wait until all other threads have released this thread's panels; the own
     slots are cleared by this thread itself in the loops above. */
  for (i = 0; i < args -> nthreads; i++) {
    if (i != mypos) {
      for (xxx = 0; xxx < DIVIDE_RATE; xxx++) {
	while (job[mypos].working[i][CACHE_LINE_SIZE * xxx] ) {YIELDING;};
      }
    }
  }
  
  return 0;
  }
Пример #4
0
/*
 * Per-thread worker for the threaded symmetric rank-k style update
 * (the HERK macro selects a variant in which only the real parts of
 * alpha and beta are tested).
 *
 * The same matrix `a` is passed to both ICOPY_OPERATION and OCOPY_OPERATION.
 * This thread owns the slice [range_n[mypos], range_n[mypos + 1]), stored in
 * m_from/m_to; n_from/n_to hold the span covered by all threads.
 *
 * Without LOWER a thread publishes its packed panels to threads 0..mypos and
 * consumes panels from threads mypos..nthreads-1; with LOWER the directions
 * are reversed.  Slots in job[t].working[u][...] are 0 when free and hold the
 * panel address when published.
 *
 * K, A, C, LDA, LDC and N are macros defined outside this block.  range_m is
 * not used.  Always returns 0.
 *
 * NOTE(review): range_n is NULL-checked when the ranges are computed but is
 * dereferenced unconditionally in the consumer loops.
 */
static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos){

  FLOAT *buffer[DIVIDE_RATE];

  BLASLONG k, lda, ldc;
  BLASLONG m_from, m_to, n_from, n_to;

  FLOAT *alpha, *beta;
  FLOAT *a, *c;
  job_t *job = (job_t *)args -> common;
  BLASLONG xxx, bufferside;

  BLASLONG ls, min_l, jjs, min_jj;
  BLASLONG is, min_i, div_n;

  BLASLONG i, current;
#ifdef LOWER
  /* Size of the first (bottom-most) row block, remembered for the later loop. */
  BLASLONG start_i;
#endif

#ifdef TIMING
  BLASLONG rpcc_counter;
  BLASLONG copy_A = 0;
  BLASLONG copy_B = 0;
  BLASLONG kernel = 0;
  BLASLONG waiting1 = 0;
  BLASLONG waiting2 = 0;
  BLASLONG waiting3 = 0;
  BLASLONG waiting6[MAX_CPU_NUMBER];
  BLASLONG ops    = 0;

  for (i = 0; i < args -> nthreads; i++) waiting6[i] = 0;
#endif

  k = K;

  a = (FLOAT *)A;
  c = (FLOAT *)C;

  lda = LDA;
  ldc = LDC;

  alpha = (FLOAT *)args -> alpha;
  beta  = (FLOAT *)args -> beta;

  m_from = 0;
  m_to   = N;

  /* Global Range */
  n_from = 0;
  n_to   = N;

  if (range_n) {
    m_from = range_n[mypos + 0];
    m_to   = range_n[mypos + 1];

    n_from = range_n[0];
    n_to   = range_n[args -> nthreads];
  }

  /* Scale C through syrk_beta unless beta equals one. */
  if (beta) {
#if !defined(COMPLEX) || defined(HERK)
    if (beta[0] != ONE)
#else
    if ((beta[0] != ONE) || (beta[1] != ZERO))
#endif
      syrk_beta(m_from, m_to, n_from, n_to, beta, c, ldc);
  }

  /* Nothing to accumulate when K is empty or alpha is absent/zero. */
  if ((k == 0) || (alpha == NULL)) return 0;

  if ((alpha[0] == ZERO)
#if defined(COMPLEX) && !defined(HERK)
      && (alpha[1] == ZERO)
#endif
      ) return 0;

#if 0
  fprintf(stderr, "Thread[%ld]  m_from : %ld m_to : %ld n_from : %ld n_to : %ld\n",  mypos, m_from, m_to, n_from, n_to);
#endif

  /* Sub-slice width: slice length / DIVIDE_RATE (rounded up), then rounded up
     to a multiple of GEMM_UNROLL_MN. */
  div_n = ((m_to - m_from + DIVIDE_RATE - 1) / DIVIDE_RATE
	                            + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);

  buffer[0] = sb;
  for (i = 1; i < DIVIDE_RATE; i++) {
    buffer[i] = buffer[i - 1] + GEMM_Q * div_n * COMPSIZE;
  }
  
  /* Walk the K dimension in chunks of min_l (at most GEMM_Q; a remainder
     between GEMM_Q and 2*GEMM_Q is split in two). */
  for(ls = 0; ls < k; ls += min_l){

    min_l = k - ls;
    if (min_l >= GEMM_Q * 2) {
      min_l  = GEMM_Q;
    } else {
      if (min_l > GEMM_Q) min_l = (min_l + 1) / 2;
    }

    min_i = m_to - m_from;
    
    if (min_i >= GEMM_P * 2) {
      min_i = GEMM_P;
    } else {
      if (min_i > GEMM_P) {
	min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
      }
    }

#ifdef LOWER
    /* Shrink the first block so that the rows left above it are a whole
       multiple of GEMM_P. */
    xxx = (m_to - m_from - min_i) % GEMM_P;

    if (xxx) min_i -= GEMM_P - xxx;
#endif

    START_RPCC();
    
    /* Without LOWER the first block is the top of the slice; with LOWER it
       is the bottom min_i rows. */
#ifndef LOWER
    ICOPY_OPERATION(min_l, min_i, a, lda, ls, m_from, sa);
#else
    ICOPY_OPERATION(min_l, min_i, a, lda, ls, m_to - min_i, sa);
#endif
    
    STOP_RPCC(copy_A);
    
    div_n = ((m_to - m_from + DIVIDE_RATE - 1) / DIVIDE_RATE
	                              + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
    
    /* Producer phase: pack this thread's own slice and apply the first row
       block to it. */
    for (xxx = m_from, bufferside = 0; xxx < m_to; xxx += div_n, bufferside ++) {
      
      START_RPCC();
      
      /* Wait until the other consumers of this sub-buffer have released it. */
#ifndef LOWER
      for (i = 0; i < mypos; i++)
#else
      for (i = mypos + 1; i < args -> nthreads; i++)
#endif
	while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;};
      
      STOP_RPCC(waiting1);
      
#ifndef LOWER

      for(jjs = xxx; jjs < MIN(m_to, xxx + div_n); jjs += min_jj){

	min_jj = MIN(m_to, xxx + div_n) - jjs;

	/* In the first sub-slice the strip width is capped at min_i; in the
	   others it is capped at GEMM_UNROLL_MN. */
	if (xxx == m_from) {
	  if (min_jj > min_i) min_jj = min_i;
	} else {
	  if (min_jj > GEMM_UNROLL_MN) min_jj = GEMM_UNROLL_MN;
	}
	
	START_RPCC();
	
	OCOPY_OPERATION(min_l, min_jj, a, lda, ls, jjs, 
			buffer[bufferside] + min_l * (jjs - xxx) * COMPSIZE);
	
	STOP_RPCC(copy_B);
	
	START_RPCC();
	
	KERNEL_OPERATION(min_i, min_jj, min_l, alpha,
			 sa, buffer[bufferside] + min_l * (jjs - xxx) * COMPSIZE,
			 c, ldc, m_from, jjs);

	STOP_RPCC(kernel);

#ifdef TIMING
	  ops += 2 * min_i * min_jj * min_l;
#endif

      }

#else

      for(jjs = xxx; jjs < MIN(m_to, xxx + div_n); jjs += min_jj){

	min_jj = MIN(m_to, xxx + div_n) - jjs;

	if (min_jj > GEMM_UNROLL_MN) min_jj = GEMM_UNROLL_MN;
	
	START_RPCC();
	
	OCOPY_OPERATION(min_l, min_jj, a, lda, ls, jjs, 
			buffer[bufferside] + min_l * (jjs - xxx) * COMPSIZE);
	
	STOP_RPCC(copy_B);
	
	START_RPCC();
	
	KERNEL_OPERATION(min_i, min_jj, min_l, alpha,
			 sa, buffer[bufferside] + min_l * (jjs - xxx) * COMPSIZE,
			 c, ldc, m_to - min_i, jjs);
	  
	STOP_RPCC(kernel);

#ifdef TIMING
	  ops += 2 * min_i * min_jj * min_l;
#endif

      }

#endif
	
      /* Publish the panel to the threads that will consume it (own slot included). */
#ifndef LOWER
      for (i = 0; i <= mypos; i++)
#else
      for (i = mypos; i < args -> nthreads; i++)
#endif
	job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside];

      WMB;
    }

    
    /* Consumer phase for the first row block: apply it to the panels of the
       other threads this one depends on. */
#ifndef LOWER
    current = mypos + 1;
    while (current < args -> nthreads) {
#else
    current = mypos - 1;
    while (current >= 0) {
#endif

	div_n = ((range_n[current + 1]  - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE
		 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
	
	for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) {
	  
	  START_RPCC();
	  
	  /* Spin until thread `current` has published this sub-buffer. */
	  while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;};
	  
	  STOP_RPCC(waiting2);
	  
	  START_RPCC();
	  
#ifndef LOWER
	  KERNEL_OPERATION(min_i, MIN(range_n[current + 1]  - xxx,  div_n), min_l, alpha,
			   sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside],
			   c, ldc, 
			   m_from,
			   xxx);
#else
	  KERNEL_OPERATION(min_i, MIN(range_n[current + 1]  - xxx,  div_n), min_l, alpha,
			   sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside],
			   c, ldc, 
			   m_to - min_i,
			   xxx);
#endif
	  
	  STOP_RPCC(kernel);
#ifdef TIMING
	  ops += 2 * min_i * MIN(range_n[current + 1]  - xxx,  div_n) * min_l;
#endif
	  
	  /* If the first row block covers the whole slice, release the panel
	     now.  NOTE(review): no WMB follows this store, unlike the release
	     in the remaining-row-blocks loop below. */
	  if (m_to - m_from == min_i) {
	    job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
	  }
	}
	
#ifndef LOWER
	current ++;
#else
	current --;
#endif
    }

    /* Remaining row blocks.  Without LOWER they follow the first block; with
       LOWER they are the rows above the bottom block of size start_i. */
#ifndef LOWER
    for(is = m_from + min_i; is < m_to; is += min_i){
      min_i = m_to - is;
#else
    start_i = min_i;

    for(is = m_from; is < m_to - start_i; is += min_i){
      min_i = m_to - start_i - is;
#endif

      if (min_i >= GEMM_P * 2) {
	min_i = GEMM_P;
      } else 
	if (min_i > GEMM_P) {
	  min_i = ((min_i + 1) / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
	}

      START_RPCC();
      
      ICOPY_OPERATION(min_l, min_i, a, lda, ls, is, sa);
      
      STOP_RPCC(copy_A);
      
      /* Start with the own panels, then move through the dependent threads. */
      current = mypos;

      do {
	
	div_n = ((range_n[current + 1]  - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE
		                                                     + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
	
	for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) {
	  
	  START_RPCC();

	  KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), min_l, alpha,
			   sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside],
			   c, ldc, is, xxx);
	  
	  STOP_RPCC(kernel);

#ifdef TIMING
	  ops += 2 * min_i * MIN(range_n[current + 1]  - xxx, div_n) * min_l;
#endif
	  
	  /* Last row block of this K chunk: release the panel. */
#ifndef LOWER
	  if (is + min_i >= m_to) {
#else
	  if (is + min_i >= m_to - start_i) {
#endif
	    /* Thread doesn't need this buffer any more */
	    job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
	    WMB;
	  }
	}
	
#ifndef LOWER
	current ++;
      } while (current != args -> nthreads);
#else
	current --;
      } while (current >= 0);
#endif
	
     
    }
  }
  
  START_RPCC();

  /* Wait until all other threads have released this thread's panels. */
  for (i = 0; i < args -> nthreads; i++) {
    if (i != mypos) {
      for (xxx = 0; xxx < DIVIDE_RATE; xxx++) {
	while (job[mypos].working[i][CACHE_LINE_SIZE * xxx] ) {YIELDING;};
      }
    }
  }

  STOP_RPCC(waiting3);

#ifdef TIMING
  /* Report each phase as a percentage of the total measured cycles. */
  BLASLONG waiting = waiting1 + waiting2 + waiting3;
  BLASLONG total = copy_A + copy_B + kernel + waiting;

  fprintf(stderr, "GEMM   [%2ld] Copy_A : %6.2f  Copy_B : %6.2f  Wait1 : %6.2f Wait2 : %6.2f Wait3 : %6.2f Kernel : %6.2f",
	  mypos, (double)copy_A /(double)total * 100., (double)copy_B /(double)total * 100.,
	  (double)waiting1 /(double)total * 100.,
	  (double)waiting2 /(double)total * 100.,
	  (double)waiting3 /(double)total * 100.,
	  (double)ops/(double)kernel / 4. * 100.);

#if 0
  fprintf(stderr, "GEMM   [%2ld] Copy_A : %6.2ld  Copy_B : %6.2ld  Wait : %6.2ld\n",
	  mypos, copy_A, copy_B, waiting);

  fprintf(stderr, "Waiting[%2ld] %6.2f %6.2f %6.2f\n",
	  mypos,
	  (double)waiting1/(double)waiting * 100.,
	  (double)waiting2/(double)waiting * 100.,
	  (double)waiting3/(double)waiting * 100.);
#endif
  fprintf(stderr, "\n");
#endif

  return 0;
}

/*
 * Threaded driver: splits the N dimension into per-thread slices, builds one
 * queue entry per slice that runs inner_thread, and dispatches them with
 * exec_blas.
 *
 * Falls back to SYRK_LOCAL when only one thread is requested or when
 * args->n is smaller than nthreads * SWITCH_RATIO.
 *
 * Slice widths are derived from sqrt(i*i + n*n/nthreads) - i, rounded up to a
 * multiple of (mask + 1), where mask is MAX(UNROLL_M, UNROLL_N) - 1 for the
 * selected precision.  Presumably this balances the triangular workload
 * between threads - TODO confirm.
 *
 * range_m is forwarded unchanged to every queue entry.  sa/sb are attached
 * to the first queue entry only; mypos is not used.  Always returns 0 (or
 * calls exit(1) if the USE_ALLOC_HEAP allocation fails).
 */
int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos){

  blas_arg_t newarg;

  /* Synchronisation slots shared by all workers through newarg.common. */
#ifndef USE_ALLOC_HEAP
  job_t          job[MAX_CPU_NUMBER];
#else
  job_t *        job = NULL;
#endif

  blas_queue_t queue[MAX_CPU_NUMBER];

  /* Split points handed to the workers as range_n. */
  BLASLONG range[MAX_CPU_NUMBER + 100];

  BLASLONG num_cpu;

  BLASLONG nthreads = args -> nthreads;

  BLASLONG width, i, j, k;
  BLASLONG n, n_from, n_to;
  int  mode, mask;
  double dnum;

  /* Too little work to be worth threading: run the local version. */
  if ((nthreads  == 1) || (args -> n < nthreads * SWITCH_RATIO)) {
    SYRK_LOCAL(args, range_m, range_n, sa, sb, 0); 
    return 0;
  }

  /* Select the queue mode and the alignment mask for the build's precision. */
#ifndef COMPLEX
#ifdef XDOUBLE
  mode  =  BLAS_XDOUBLE | BLAS_REAL;
  mask  = MAX(QGEMM_UNROLL_M, QGEMM_UNROLL_N) - 1;
#elif defined(DOUBLE)
  mode  =  BLAS_DOUBLE  | BLAS_REAL;
  mask  = MAX(DGEMM_UNROLL_M, DGEMM_UNROLL_N) - 1;
#else
  mode  =  BLAS_SINGLE  | BLAS_REAL;
  mask  = MAX(SGEMM_UNROLL_M, SGEMM_UNROLL_N) - 1;
#endif  
#else
#ifdef XDOUBLE
  mode  =  BLAS_XDOUBLE | BLAS_COMPLEX;
  mask  = MAX(XGEMM_UNROLL_M, XGEMM_UNROLL_N) - 1;
#elif defined(DOUBLE)
  mode  =  BLAS_DOUBLE  | BLAS_COMPLEX;
  mask  = MAX(ZGEMM_UNROLL_M, ZGEMM_UNROLL_N) - 1;
#else
  mode  =  BLAS_SINGLE  | BLAS_COMPLEX;
  mask  = MAX(CGEMM_UNROLL_M, CGEMM_UNROLL_N) - 1;
#endif  
#endif

  /* Workers get a private copy of the arguments so that common and nthreads
     can be overridden without touching the caller's structure. */
  newarg.m        = args -> m;
  newarg.n        = args -> n;
  newarg.k        = args -> k;
  newarg.a        = args -> a;
  newarg.b        = args -> b;
  newarg.c        = args -> c;
  newarg.lda      = args -> lda;
  newarg.ldb      = args -> ldb;
  newarg.ldc      = args -> ldc;
  newarg.alpha    = args -> alpha;
  newarg.beta     = args -> beta;

#ifdef USE_ALLOC_HEAP
  job = (job_t*)malloc(MAX_CPU_NUMBER * sizeof(job_t));
  if(job==NULL){
    fprintf(stderr, "OpenBLAS: malloc failed in %s\n", __func__);
    exit(1);
  }
#endif

  newarg.common   = (void *)job;
   
  /* NOTE(review): when range_n is given, n_to is already a length
     (range_n[1] - range_n[0]), yet n is computed below as n_to - n_from,
     which subtracts range_n[0] a second time.  Harmless when range_n[0] is 0;
     confirm the intended behaviour for a non-zero start. */
  if (!range_n) {
    n_from = 0;
    n_to   = args -> n;
  } else {
    n_from = range_n[0];
    n_to   = range_n[1] - range_n[0];
  }

#ifndef LOWER

  /* Non-LOWER case: split points are written backwards from
     range[MAX_CPU_NUMBER], so the slice computed first ends up last. */
  range[MAX_CPU_NUMBER] = n_to - n_from;
  range[0] = 0;
  num_cpu  = 0;
  i        = 0;
  n        = n_to - n_from;

  dnum = (double)n * (double)n /(double)nthreads;

  while (i < n){
    
    if (nthreads - num_cpu > 1) {
      
      double di   = (double)i;
      
      width = (((BLASLONG)(sqrt(di * di + dnum) - di) + mask) & ~mask);
      
      /* For the first slice, adjust the width so that the remainder
         (n - width) is a multiple of (mask + 1). */
      if (num_cpu == 0) width = n - ((n - width) & ~mask);
      
      /* Out-of-range or too-small widths collapse to "everything left". */
      if ((width > n - i) || (width < mask)) width = n - i;
      
    } else {
      /* Last available thread takes whatever remains. */
      width = n - i;
    }

    range[MAX_CPU_NUMBER - num_cpu - 1] = range[MAX_CPU_NUMBER - num_cpu] - width;

    queue[num_cpu].mode    = mode;
    queue[num_cpu].routine = inner_thread;
    queue[num_cpu].args    = &newarg;
    queue[num_cpu].range_m = range_m;

    queue[num_cpu].sa      = NULL;
    queue[num_cpu].sb      = NULL;
    queue[num_cpu].next    = &queue[num_cpu + 1];
    
    num_cpu ++;
    i += width;
  }

   /* The final number of slices is only known now, so range_n is set afterwards. */
   for (i = 0; i < num_cpu; i ++) queue[i].range_n = &range[MAX_CPU_NUMBER - num_cpu];

#else

  /* LOWER case: split points are written forwards from range[0]. */
  range[0] = 0;
  num_cpu  = 0;
  i        = 0;
  n        = n_to - n_from;

  dnum = (double)n * (double)n /(double)nthreads;

  while (i < n){
    
    if (nthreads - num_cpu > 1) {
      
	double di   = (double)i;
	
	width = (((BLASLONG)(sqrt(di * di + dnum) - di) + mask) & ~mask);
	
      if ((width > n - i) || (width < mask)) width = n - i;
	
    } else {
      width = n - i;
    }

    range[num_cpu + 1] = range[num_cpu] + width;
    
    queue[num_cpu].mode    = mode;
    queue[num_cpu].routine = inner_thread;
    queue[num_cpu].args    = &newarg;
    queue[num_cpu].range_m = range_m;
    queue[num_cpu].range_n = range;
    queue[num_cpu].sa      = NULL;
    queue[num_cpu].sb      = NULL;
    queue[num_cpu].next    = &queue[num_cpu + 1];
    
    num_cpu ++;
    i += width;
  }

#endif

  /* Workers see the number of slices actually created, not the request. */
  newarg.nthreads = num_cpu;

  if (num_cpu) {

    /* All hand-off slots start out as "free". */
    for (j = 0; j < num_cpu; j++) {
      for (i = 0; i < num_cpu; i++) {
	for (k = 0; k < DIVIDE_RATE; k++) {
	  job[j].working[i][CACHE_LINE_SIZE * k] = 0;
	}
      }
    }
    
    queue[0].sa = sa;
    queue[0].sb = sb;
    queue[num_cpu - 1].next = NULL;
    
    exec_blas(num_cpu, queue);
  }
 
#ifdef USE_ALLOC_HEAP
  free(job);
#endif

  return 0;
}
Пример #5
0
/*
 * Single-threaded blocked SGEMM-style update of C.
 *
 * C is first scaled through BETA_OPERATION when beta != 1.  The update is
 * then applied in three nested block loops:
 *   js : column blocks of at most GEMM_R,
 *   ls : K chunks of at most GEMM_Q,
 *   is : row blocks of at most GEMM_P.
 * For each (js, ls) the first row block of A is copied into sa, the columns
 * of B are copied strip by strip into sb and applied immediately, and the
 * remaining row blocks then reuse the complete contents of sb.
 *
 * The A copy routine is chosen per call: ICOPYT_OPERATION when args->transa
 * is non-zero, icopy_operation() when args->ks0 is non-zero, and
 * ICOPY_OPERATION otherwise.
 *
 * mypos : index used to select this caller's scratch buffers from the
 *         file-level saa[] / sba[] arrays.
 * args  : problem description (dimensions, scalars, matrices, strides).
 *
 * Always returns 0.
 */
static int gemm_single(int mypos, struct sgemmargs *args)
{
	long m_from, m_to, n_from, n_to;

	long ls, is, js;
	long min_l, min_i, min_j;
	long jjs, min_jj;
	float *sa = saa[mypos];
	float *sb = sba[mypos];
	long l1stride;
	char transa = args->transa;
	long m = args->m;
	long n = args->n;
	long k = args->k;
	float alpha = args->alpha;
	float beta = args->beta;
	float *a = args->a;
	float *b = args->b;
	float *c = args->c;
	long lda = args->lda;
	long ldb = args->ldb;
	long ldc = args->ldc;

#ifdef TIMING
	/* Cycle accumulators fed by the START_RPCC/STOP_RPCC macros. */
	unsigned long long rpcc_counter;
	unsigned long long innercost  = 0;
	unsigned long long outercost  = 0;
	unsigned long long kernelcost = 0;
	double total;
#endif

	m_from = 0;
	m_to   = m;
	n_from = 0;
	n_to   = n;
	if (beta != 1)
		BETA_OPERATION(m_from, m_to, n_from, n_to, beta, c, ldc);

	/* Nothing to accumulate when K is empty or alpha is zero. */
	if((k == 0) || (alpha == 0))
		return 0;
#if 0
	fprintf(stderr, "GEMM(Single): M_from : %ld  M_to : %ld  N_from : %ld  N_to : %ld  k : %ld\n", m_from, m_to, n_from, n_to, k);
	fprintf(stderr, "GEMM(Single):: P = %4ld  Q = %4ld  R = %4ld\n", (long)GEMM_P, (long)GEMM_Q, (long)GEMM_R);
	//  fprintf(stderr, "GEMM: SA .. %p  SB .. %p\n", sa, sb);

	//  fprintf(stderr, "A = %p  B = %p  C = %p\n\tlda = %ld  ldb = %ld ldc = %ld\n", a, b, c, lda, ldb, ldc);
#endif

	for(js = n_from; js < n_to; js += GEMM_R)
	{
		min_j = n_to - js;
		if (min_j > GEMM_R)
			min_j = GEMM_R;

		for(ls = 0; ls < k; ls += min_l)
		{
			/* At most GEMM_Q per chunk; a remainder between GEMM_Q and
			   2*GEMM_Q is halved and rounded up to GEMM_UNROLL_M. */
			min_l = k - ls;
			if(min_l >= GEMM_Q * 2)
				min_l = GEMM_Q;
			else if(min_l > GEMM_Q)
				min_l = (min_l / 2 + GEMM_UNROLL_M - 1) & ~(GEMM_UNROLL_M - 1);

			/* First, we have to move data A to L2 cache */
			min_i = m_to - m_from;
			/* l1stride becomes 0 when all rows fit in one block: no later
			   row block will reread sb, so every B strip can be written to
			   the start of sb. */
			l1stride = 1;
			if(min_i >= GEMM_P * 2)
				min_i = GEMM_P;
			else if(min_i > GEMM_P)
				min_i = (min_i / 2 + GEMM_UNROLL_M - 1) & ~(GEMM_UNROLL_M - 1);
			else l1stride = 0;
			START_RPCC();
			if(transa)
				ICOPYT_OPERATION(min_l, min_i, a, lda, ls, m_from, sa);
			else if(args->ks0)
				icopy_operation(min_l, min_i, args, ls, m_from, sa);
			else ICOPY_OPERATION(min_l, min_i, a, lda, ls, m_from, sa);
			STOP_RPCC(innercost);

			/* Copy B in strips and apply the first row block to each strip
			   right after it has been copied. */
			for(jjs = js; jjs < js + min_j; jjs += min_jj)
			{
				min_jj = min_j + js - jjs;
				if(min_jj >= 3*GEMM_UNROLL_N)
					min_jj = 3*GEMM_UNROLL_N;
				else if(min_jj > GEMM_UNROLL_N)
					min_jj = GEMM_UNROLL_N;
				START_RPCC();
				OCOPY_OPERATION(min_l, min_jj, b, ldb, ls, jjs, sb + min_l * (jjs - js) * l1stride);
				STOP_RPCC(outercost);
				START_RPCC();
				KERNEL_OPERATION(min_i, min_jj, min_l, alpha, sa,
					sb + min_l * (jjs - js) * l1stride, c, ldc, m_from, jjs);
				STOP_RPCC(kernelcost);
			}

			/* Remaining row blocks reuse the complete contents of sb. */
			for(is = m_from + min_i; is < m_to; is += min_i)
			{
				min_i = m_to - is;
				if(min_i >= GEMM_P * 2)
					min_i = GEMM_P;
				else if(min_i > GEMM_P)
					min_i = (min_i / 2 + GEMM_UNROLL_M - 1) & ~(GEMM_UNROLL_M - 1);
				START_RPCC();
				if(transa)
					ICOPYT_OPERATION(min_l, min_i, a, lda, ls, is, sa);
				else if(args->ks0)
					icopy_operation(min_l, min_i, args, ls, is, sa);
				else ICOPY_OPERATION(min_l, min_i, a, lda, ls, is, sa);
				STOP_RPCC(innercost);
				START_RPCC();
				KERNEL_OPERATION(min_i, min_j, min_l, alpha, sa, sb, c, ldc, is, js);
				STOP_RPCC(kernelcost);
			} /* end of is */
		} /* end of ls */
	} /* end of js */

#ifdef TIMING
	/* Report each phase as a percentage of the total measured cycles. */
	total = (double)outercost + (double)innercost + (double)kernelcost;
	printf( "Copy A : %5.2f Copy  B: %5.2f  Kernel : %5.2f  kernel Effi. : %5.2f Total Effi. : %5.2f\n",
		innercost / total * 100., outercost / total * 100.,
		kernelcost / total * 100.,
		(double)(m_to - m_from) * (double)(n_to - n_from) * (double)k / (double)kernelcost * 100.  / 2.,
		(double)(m_to - m_from) * (double)(n_to - n_from) * (double)k / total * 100. / 2.);
#endif
	return 0;
}
Пример #6
0
/*
 * gemm_thread - per-thread worker of a multi-threaded single-precision GEMM.
 *
 * Work split visible in this function:
 *   - rows    [m_from, m_to) of C belong to this thread (from range_m, or
 *     0..m when range_m is NULL);
 *   - columns [n_from, n_to) of B are the slice this thread copies into its
 *     own `sb` buffer (from range_n, or 0..n when range_n is NULL).
 *
 * For every K-block the thread
 *   1. copies a block of A into its private buffer `sa`,
 *   2. copies its own column slice of B into up to DIVIDE_RATE sub-buffers of
 *      `sb` and publishes each sub-buffer address in
 *      job[mypos].working[i][CACHE_LINE_SIZE * bufferside] for every thread i,
 *   3. runs KERNEL_OPERATION on the sub-buffers published by the other
 *      threads, spinning until each one becomes non-zero,
 *   4. repeats the kernel over all remaining row blocks of A, and zeroes the
 *      flag of a sub-buffer once it has finished its last row block.
 * Before returning it spins until every flag in job[mypos].working is zero,
 * i.e. until all threads have released this thread's `sb` sub-buffers.
 *
 * Parameters:
 *   mypos    index of this thread; indexes saa, sba, job, range_m, range_n.
 *   nthreads number of threads taking part in the flag handshake.
 *   args     problem description; the fields read here are transa, m, n, k,
 *            alpha, beta, a, b, c, lda, ldb, ldc and ks0.
 *   range_m  optional row partition; entries mypos and mypos + 1 are read.
 *   range_n  column partition; entries 0..nthreads are read.
 *
 * Returns 0 in all cases.
 *
 * `job`, `saa` and `sba` are not declared in this function; they are
 * presumably file-level globals set up by the caller - TODO confirm.
 *
 * NOTE(review): range_n is NULL-checked when n_from/n_to are derived, but it
 * is dereferenced unconditionally in the exchange loops further down, so a
 * NULL range_n would fault there. Confirm that callers never pass NULL.
 */
static int gemm_thread(long mypos, long nthreads, struct sgemmargs *args, long *range_m, long *range_n)
{
	float *buffer[DIVIDE_RATE];
	float *sa, *sb;
	long m_from, m_to, n_from, n_to;
	long xxx, bufferside;
	long ls, min_l, jjs, min_jj;
	long is, min_i, div_n;
	long i, current;
	long l1stride;
	char transa = args->transa;
	long m = args->m;
	long n = args->n;
	long k = args->k;
	float alpha = args->alpha;
	float beta = args->beta;
	float *a = args->a;
	float *b = args->b;
	float *c = args->c;
	long lda = args->lda;
	long ldb = args->ldb;
	long ldc = args->ldc;

#ifdef TIMING
	/* Cycle accumulators fed by the START_RPCC()/STOP_RPCC() pairs below. */
	unsigned long rpcc_counter;
	unsigned long copy_A = 0;
	unsigned long copy_B = 0;
	unsigned long kernel = 0;
	unsigned long waiting1 = 0;
	unsigned long waiting2 = 0;
	unsigned long waiting3 = 0;
	unsigned long waiting6[MAX_CPU_NUMBER];
	unsigned long ops    = 0;

	/*
	 * NOTE(review): `num_threads` is not declared in this function (the
	 * parameter is `nthreads`); this only builds if a global of that name
	 * exists. waiting6 is zeroed here but not read anywhere else in this
	 * function.
	 */
	for (i = 0; i < num_threads; i++)
		waiting6[i] = 0;
#endif
	/* Per-thread copy buffers for A (sa) and B (sb). */
	sa = saa[mypos];
	sb = sba[mypos];

	/* Row range handled by this thread; defaults to all of 0..m. */
	m_from = 0;
	m_to = m;

	if(range_m)
	{
		m_from = range_m[mypos + 0];
		m_to   = range_m[mypos + 1];
	}

	/* Column slice of B this thread is responsible for copying. */
	n_from = 0;
	n_to   = n;
	if (range_n)
	{
		n_from = range_n[mypos + 0];
		n_to   = range_n[mypos + 1];
	}

	/* Apply beta to this thread's rows over the full column range 0..n. */
	if(beta != 1)
		BETA_OPERATION(m_from, m_to, 0, n, beta, c, ldc);

	/* Nothing further to do when k is zero or alpha is zero. */
	if(k == 0 || alpha == 0)
		return 0;
#if 0
	fprintf(stderr, "Thread[%ld]  m_from : %ld m_to : %ld n_from : %ld n_to : %ld\n",
		mypos, m_from, m_to, n_from, n_to);
	fprintf(stderr, "GEMM: P = %4ld  Q = %4ld  R = %4ld\n", (long)GEMM_P, (long)GEMM_Q, (long)GEMM_R);
#endif
	/*
	 * Split the local column slice into DIVIDE_RATE chunks of div_n columns
	 * (rounded up) and carve `sb` into one sub-buffer per chunk. Each
	 * sub-buffer is GEMM_Q times div_n rounded up to a multiple of
	 * GEMM_UNROLL_N (the mask form assumes GEMM_UNROLL_N is a power of two).
	 */
	div_n = (n_to - n_from + DIVIDE_RATE - 1) / DIVIDE_RATE;
	buffer[0] = sb;
	for (i = 1; i < DIVIDE_RATE; i++)
		buffer[i] = buffer[i - 1] + GEMM_Q * ((div_n + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1));

	/* Walk the K dimension in blocks of min_l. */
	for(ls = 0; ls < k; ls += min_l)
	{
		/*
		 * Block size along K: GEMM_Q while at least 2 * GEMM_Q remain,
		 * otherwise split a remainder larger than GEMM_Q into two halves.
		 */
		min_l = k - ls;
		if (min_l >= GEMM_Q * 2)
			min_l  = GEMM_Q;
		else if (min_l > GEMM_Q)
			min_l = (min_l + 1) / 2;

		/*
		 * Size of the first row block, chosen the same way with GEMM_P and
		 * rounded up to a multiple of GEMM_UNROLL_M. l1stride becomes 0 only
		 * when all rows fit in one block and there is a single thread; it
		 * multiplies the offset into the B sub-buffer below, so in that case
		 * every jjs piece is copied to the start of the sub-buffer.
		 */
		l1stride = 1;
		min_i = m_to - m_from;
		if (min_i >= GEMM_P * 2)
			min_i = GEMM_P;
		else if(min_i > GEMM_P)
			min_i = (min_i / 2 + GEMM_UNROLL_M - 1) & ~(GEMM_UNROLL_M - 1);
		else if (nthreads == 1)
			l1stride = 0;

		/* Copy the first row block of A into sa (variant depends on transa / ks0). */
		START_RPCC();
		//printf("icopy%ld (%ld,%ld)%ld (%ld,%ld)\n", mypos, min_l, min_i, lda, ls, m_from);
		if(transa)
			ICOPYT_OPERATION(min_l, min_i, a, lda, ls, m_from, sa);
		else if(args->ks0)
			icopy_operation(min_l, min_i, args, ls, m_from, sa);
		else ICOPY_OPERATION(min_l, min_i, a, lda, ls, m_from, sa);
		STOP_RPCC(copy_A);

		/*
		 * div_n is overwritten by the exchange loops below, so it is
		 * recomputed for this thread's own slice on every K-block.
		 */
		div_n = (n_to - n_from + DIVIDE_RATE - 1) / DIVIDE_RATE;

		/* Phase 1: copy the local B slice chunk by chunk and publish each chunk. */
		for (xxx = n_from, bufferside = 0; xxx < n_to; xxx += div_n, bufferside ++)
		{
			START_RPCC();
			/* Make sure if no one is using buffer */
			/* (spin until every thread's flag for this sub-buffer is zero) */
			for (i = 0; i < nthreads; i++)
				while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside])
					{YIELDING;}
			STOP_RPCC(waiting1);

			/*
			 * Copy B in pieces of at most 3 * GEMM_UNROLL_N columns (pieces
			 * between one and three unroll widths are cut to GEMM_UNROLL_N)
			 * and run the kernel on each piece straight away for the first
			 * row block.
			 */
			for(jjs = xxx; jjs < MIN(n_to, xxx + div_n); jjs += min_jj)
			{
				min_jj = MIN(n_to, xxx + div_n) - jjs;
				if(min_jj >= 3*GEMM_UNROLL_N)
					min_jj = 3*GEMM_UNROLL_N;
				else if (min_jj > GEMM_UNROLL_N)
					min_jj = GEMM_UNROLL_N;
				START_RPCC();
				OCOPY_OPERATION(min_l, min_jj, b, ldb, ls, jjs, buffer[bufferside] + min_l * (jjs - xxx) * l1stride);
				STOP_RPCC(copy_B);
				START_RPCC();
				KERNEL_OPERATION(min_i, min_jj, min_l, alpha, sa, buffer[bufferside] + min_l * (jjs - xxx) * l1stride, c, ldc, m_from, jjs);
				STOP_RPCC(kernel);
#ifdef TIMING
				ops += 2 * min_i * min_jj * min_l;
#endif
			}

			/*
			 * Publish the sub-buffer: a non-zero flag holds its address and
			 * signals to thread i that the chunk is ready.
			 * NOTE(review): the pointer is stored through a `long`, which
			 * assumes sizeof(long) >= sizeof(float *) - confirm for the
			 * target ABI.
			 */
			for (i = 0; i < nthreads; i++)
				job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (long)buffer[bufferside];
			WMB;
		}

		/*
		 * Phase 2: for the first row block, visit every other thread's
		 * published chunks in round-robin order starting after mypos. The
		 * final iteration has current == mypos: the kernel is skipped (it
		 * already ran in phase 1) but the flag release below still applies.
		 */
		current = mypos;
		do {
			current ++;
			if(current >= nthreads)
				current = 0;
			div_n = (range_n[current + 1]  - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE;
			for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++)
			{
				if (current != mypos)
				{
					START_RPCC();
					/* thread has to wait */
					/* (until `current` has published this chunk for us) */
					while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0)
						{YIELDING;}
					STOP_RPCC(waiting2);
					START_RPCC();
					KERNEL_OPERATION(min_i, MIN(range_n[current + 1]  - xxx,  div_n), min_l, alpha, sa,
						(float *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside], c, ldc, m_from, xxx);
					STOP_RPCC(kernel);
	#ifdef TIMING
					ops += 2 * min_i * MIN(range_n[current + 1]  - xxx,  div_n) * min_l;
	#endif
				}
				/*
				 * If the first row block already covers all of this thread's
				 * rows, the chunk is not needed again: release it by zeroing
				 * the flag.
				 * NOTE(review): unlike the release in the row-block loop
				 * below, no WMB follows this store - confirm whether that is
				 * intentional.
				 */
				if (m_to - m_from == min_i)
					job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
			}
		} while (current != mypos);

		/*
		 * Phase 3: remaining row blocks. Every B chunk (own and foreign) is
		 * already published at this point, so no waiting is done here.
		 */
		for(is = m_from + min_i; is < m_to; is += min_i)
		{
			min_i = m_to - is;
			if (min_i >= GEMM_P * 2)
				min_i = GEMM_P;
			else if (min_i > GEMM_P)
				min_i = ((min_i + 1) / 2 + GEMM_UNROLL_M - 1) & ~(GEMM_UNROLL_M - 1);
			START_RPCC();
			//printf("icopya%ld (%ld,%ld)%ld (%ld,%ld)\n", mypos, min_l, min_i, lda, ls, is);
			if(transa)
				ICOPYT_OPERATION(min_l, min_i, a, lda, ls, is, sa);
			else if(args->ks0)
				icopy_operation(min_l, min_i, args, ls, is, sa);
			else ICOPY_OPERATION(min_l, min_i, a, lda, ls, is, sa);
			STOP_RPCC(copy_A);

			/* Start with this thread's own chunks, then go round all others. */
			current = mypos;
			do {
				div_n = (range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE;
				for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++)
				{
					START_RPCC();
					KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), min_l, alpha, sa,
						(float *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside], c, ldc, is, xxx);
					STOP_RPCC(kernel);
#ifdef TIMING
					ops += 2 * min_i * MIN(range_n[current + 1]  - xxx, div_n) * min_l;
#endif
					/* Last row block: release the chunk after its final use. */
					if(is + min_i >= m_to)
					{
						/* Thread doesn't need this buffer any more */
						job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
						WMB;
					}
				}
				current ++;
				if(current >= nthreads)
					current = 0;
			} while (current != mypos);
		}
	}

	/*
	 * Do not return while any thread still holds one of this thread's B
	 * sub-buffers: spin until all DIVIDE_RATE flags of every thread are zero.
	 */
	START_RPCC();
	for (i = 0; i < nthreads; i++)
		for (xxx = 0; xxx < DIVIDE_RATE; xxx++)
			while (job[mypos].working[i][CACHE_LINE_SIZE * xxx] )
				{YIELDING;}
	STOP_RPCC(waiting3);
#ifdef TIMING
	/* Report each phase as a percentage of the total measured cycles. */
	long waiting = waiting1 + waiting2 + waiting3;
	long total = copy_A + copy_B + kernel + waiting;

	fprintf(stderr, "GEMM   [%2ld] Copy_A : %6.2f  Copy_B : %6.2f  Wait1 : %6.2f Wait2 : %6.2f Wait3 : %6.2f Kernel : %6.2f",
	mypos, (double)copy_A /(double)total * 100., (double)copy_B /(double)total * 100.,
	(double)waiting1 /(double)total * 100.,
	(double)waiting2 /(double)total * 100.,
	(double)waiting3 /(double)total * 100.,
	(double)ops/(double)kernel / 4. * 100.);

#if 0
	fprintf(stderr, "GEMM   [%2ld] Copy_A : %6.2ld  Copy_B : %6.2ld  Wait : %6.2ld\n",
	mypos, copy_A, copy_B, waiting);

	fprintf(stderr, "Waiting[%2ld] %6.2f %6.2f %6.2f\n", mypos,
		(double)waiting1/(double)waiting * 100.,
		(double)waiting2/(double)waiting * 100.,
		(double)waiting3/(double)waiting * 100.);
#endif
	fprintf(stderr, "\n");
#endif
	return 0;
}