예제 #1
0
/*
 * Per-thread worker of the threaded GEMM3M driver.
 *
 * args     problem description (matrices, leading dimensions, alpha/beta,
 *          thread count, and the shared job table in args->common).
 * range_m  optional {first, last+1} row range handled by this thread;
 *          the full M range is used when NULL.
 * range_n  column partition: thread t owns columns
 *          [range_n[t], range_n[t + 1]).
 * sa       scratch buffer receiving the packed block of A.
 * sb       scratch buffer receiving the packed blocks of B; it is split
 *          into DIVIDE_RATE consecutive sub-buffers.
 * mypos    index of the calling thread.
 *
 * Always returns 0.
 *
 * For every K panel the routine performs three passes that differ only in
 * the copy routines (ICOPYB/OCOPYB, ICOPYR/OCOPYR|OCOPYI,
 * ICOPYI/OCOPYI|OCOPYR) and in the kernel scaling constants
 * (ALPHA5/6, ALPHA11/12, ALPHA17/18).  Presumably these are the three real
 * products of the "3M" complex multiplication scheme -- confirm against
 * the macro definitions.
 *
 * Each pass:
 *   1. packs the first row block of A into sa;
 *   2. packs this thread's own column slice of B into the sub-buffers,
 *      applies the kernel to it, and publishes each sub-buffer address in
 *      job[mypos].working[t][CACHE_LINE_SIZE * side] for every thread t;
 *   3. walks round-robin over the other threads, spins until their
 *      sub-buffers are published, and applies the kernel to them;
 *   4. repeats the kernel for the remaining row blocks, writing 0 into a
 *      slot once the last row block has consumed that sub-buffer.
 *
 * A slot value of 0 therefore means "thread is done with / not yet given
 * this sub-buffer"; a non-zero value is the address of the packed B data.
 *
 * NOTE(review): range_n is dereferenced unconditionally in the round-robin
 * loops even though the setup code tolerates range_n == NULL; callers
 * presumably always pass it -- confirm.
 */
static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos){

  BLASLONG k, lda, ldb, ldc;
  BLASLONG m_from, m_to, n_from, n_to, N_from, N_to;

  FLOAT *alpha, *beta;
  FLOAT *a, *b, *c;
  job_t *job = (job_t *)args -> common;
  BLASLONG xxx, bufferside;
  FLOAT *buffer[DIVIDE_RATE];

  BLASLONG ls, min_l, jjs, min_jj;
  BLASLONG is, min_i, div_n;
  BLASLONG i, current;

#ifdef TIMING
  BLASLONG rpcc_counter;
  BLASLONG copy_A = 0;
  BLASLONG copy_B = 0;
  BLASLONG kernel = 0;
  BLASLONG waiting1 = 0;
  BLASLONG waiting2 = 0;
  BLASLONG waiting3 = 0;
  BLASLONG ops    = 0;
#endif

  k = K;

  a = (FLOAT *)A;
  b = (FLOAT *)B;
  c = (FLOAT *)C;

  lda = LDA;
  ldb = LDB;
  ldc = LDC;

  alpha = (FLOAT *)args -> alpha;
  beta  = (FLOAT *)args -> beta;

  /* Row range of C handled by this thread. */
  m_from = 0;
  m_to   = M;

  if (range_m) {
    m_from = range_m[0];
    m_to   = range_m[1];
  }

  /* n_from/n_to: columns packed by this thread;
     N_from/N_to: columns covered by all threads together. */
  n_from = 0;
  n_to   = N;

  N_from = 0;
  N_to   = N;

  if (range_n) {
    n_from = range_n[mypos + 0];
    n_to   = range_n[mypos + 1];

    N_from = range_n[0];
    N_to   = range_n[args -> nthreads];
  }

  /* Scale this thread's rows of C over the whole column range, unless
     beta is exactly (1, 0). */
  if (beta) {
    if ((beta[0] != ONE) || (beta[1] != ZERO))
      BETA_OPERATION(m_from, m_to, N_from, N_to, beta, c, ldc);
  }

  /* Nothing to accumulate. */
  if ((k == 0) || (alpha == NULL)) return 0;

  if ((alpha[0] == ZERO) && (alpha[1] == ZERO)) return 0;

#if 0
  fprintf(stderr, "Thread[%ld]  m_from : %ld m_to : %ld n_from : %ld n_to : %ld N_from : %ld N_to : %ld\n",
	  mypos, m_from, m_to, n_from, n_to, N_from, N_to);
#endif

  /* Width of one column sub-slice of this thread's own column range. */
  div_n = (n_to - n_from + DIVIDE_RATE - 1) / DIVIDE_RATE;

  /* Carve sb into DIVIDE_RATE sub-buffers, each large enough for
     GEMM3M_Q rows of div_n columns rounded up to GEMM3M_UNROLL_N. */
  buffer[0] = sb;
  for (i = 1; i < DIVIDE_RATE; i++) {
    buffer[i] = buffer[i - 1] + GEMM3M_Q * ((div_n + GEMM3M_UNROLL_N - 1) & ~(GEMM3M_UNROLL_N - 1));
  }

  for(ls = 0; ls < k; ls += min_l){

    /* Depth of the current K panel. */
    min_l = k - ls;
    if (min_l >= GEMM3M_Q * 2) {
      min_l = GEMM3M_Q;
    } else {
      if (min_l > GEMM3M_Q) {
        min_l = (min_l + 1) / 2;
      }
    }

    /* ------------------------------------------------------------------
     * Pass 1: ICOPYB / OCOPYB, scaled by ALPHA5 / ALPHA6.
     * ------------------------------------------------------------------ */

    /* Height of the first row block. */
    min_i = m_to - m_from;

    if (min_i >= GEMM3M_P * 2) {
      min_i = GEMM3M_P;
    } else {
      if (min_i > GEMM3M_P) {
        min_i = (min_i / 2 + GEMM3M_UNROLL_M - 1) & ~(GEMM3M_UNROLL_M - 1);
      }
    }

    START_RPCC();

    ICOPYB_OPERATION(min_l, min_i, a, lda, ls, m_from, sa);

    STOP_RPCC(copy_A);

    /* div_n was overwritten by the round-robin loops of the previous
       pass; restore the width for this thread's own slice. */
    div_n = (n_to - n_from + DIVIDE_RATE - 1) / DIVIDE_RATE;

    for (xxx = n_from, bufferside = 0; xxx < n_to; xxx += div_n, bufferside ++) {

      START_RPCC();

      /* Make sure no one is still using this sub-buffer. */
      for (i = 0; i < args -> nthreads; i++) {
        while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;};
      }

      STOP_RPCC(waiting1);

      for(jjs = xxx; jjs < MIN(n_to, xxx + div_n); jjs += min_jj){
        min_jj = MIN(n_to, xxx + div_n) - jjs;
        if (min_jj > GEMM3M_UNROLL_N) min_jj = GEMM3M_UNROLL_N;

        START_RPCC();

#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || defined(RN) || defined(RT) || defined(CN) || defined(CT)
        OCOPYB_OPERATION(min_l, min_jj, b, ldb, alpha[0],  alpha[1], ls, jjs, buffer[bufferside] + min_l * (jjs - xxx));
#else
        OCOPYB_OPERATION(min_l, min_jj, b, ldb, alpha[0], -alpha[1], ls, jjs, buffer[bufferside] + min_l * (jjs - xxx));
#endif

        STOP_RPCC(copy_B);

        START_RPCC();

        KERNEL_OPERATION(min_i, min_jj, min_l, ALPHA5, ALPHA6,
                         sa, buffer[bufferside] + min_l * (jjs - xxx),
                         c, ldc, m_from, jjs);

        STOP_RPCC(kernel);
#ifdef TIMING
        ops += 2 * min_i * min_jj * min_l;
#endif

      }

      /* Publish the packed sub-buffer to every thread (including self). */
      for (i = 0; i < args -> nthreads; i++) {
        job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside];
      }
    }

    current = mypos;

    /* Round-robin over the other threads' column slices; the final
       iteration (current == mypos) only handles the release below. */
    do {
      current ++;
      if (current >= args -> nthreads) current = 0;

      div_n = (range_n[current + 1]  - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE;

      for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) {

        if (current != mypos) {

          START_RPCC();

          /* Wait until the owning thread has published this sub-buffer. */
          while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;};

          STOP_RPCC(waiting2);

          START_RPCC();

          KERNEL_OPERATION(min_i, MIN(range_n[current + 1]  - xxx,  div_n), min_l, ALPHA5, ALPHA6,
                           sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside],
                           c, ldc, m_from, xxx);

          STOP_RPCC(kernel);
#ifdef TIMING
          ops += 2 * min_i * MIN(range_n[current + 1]  - xxx,  div_n) * min_l;
#endif
        }

        /* Single row block: this thread will not touch the buffer again. */
        if (m_to - m_from == min_i) {
          job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0;
        }
      }
    } while (current != mypos);

    /* Remaining row blocks reuse the already packed B sub-buffers. */
    for(is = m_from + min_i; is < m_to; is += min_i){
      min_i = m_to - is;
      if (min_i >= GEMM3M_P * 2) {
        min_i = GEMM3M_P;
      } else
        if (min_i > GEMM3M_P) {
          min_i = ((min_i + 1) / 2 + GEMM3M_UNROLL_M - 1) & ~(GEMM3M_UNROLL_M - 1);
        }

      START_RPCC();

      ICOPYB_OPERATION(min_l, min_i, a, lda, ls, is, sa);

      STOP_RPCC(copy_A);

      current = mypos;
      do {

        div_n = (range_n[current + 1]  - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE;

        for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) {

          START_RPCC();

          KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), min_l, ALPHA5, ALPHA6,
                           sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside],
                           c, ldc, is, xxx);

          STOP_RPCC(kernel);
#ifdef TIMING
          /* Count the columns actually passed to the kernel. */
          ops += 2 * min_i * MIN(range_n[current + 1] - xxx, div_n) * min_l;
#endif
          if (is + min_i >= m_to) {
            /* Thread doesn't need this buffer any more */
            job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0;
          }
        }

        current ++;
        if (current >= args -> nthreads) current = 0;

      } while (current != mypos);

    } /* end of is */

    /* ------------------------------------------------------------------
     * Pass 2: ICOPYR / OCOPYR|OCOPYI, scaled by ALPHA11 / ALPHA12.
     * ------------------------------------------------------------------ */

    /* The row loop above left min_i at the size of its last block;
       start this pass from a freshly computed first row block. */
    min_i = m_to - m_from;

    if (min_i >= GEMM3M_P * 2) {
      min_i = GEMM3M_P;
    } else {
      if (min_i > GEMM3M_P) {
        min_i = (min_i / 2 + GEMM3M_UNROLL_M - 1) & ~(GEMM3M_UNROLL_M - 1);
      }
    }

    START_RPCC();

    ICOPYR_OPERATION(min_l, min_i, a, lda, ls, m_from, sa);

    STOP_RPCC(copy_A);

    div_n = (n_to - n_from + DIVIDE_RATE - 1) / DIVIDE_RATE;

    for (xxx = n_from, bufferside = 0; xxx < n_to; xxx += div_n, bufferside ++) {

      START_RPCC();

      /* Make sure no one is still using this sub-buffer. */
      for (i = 0; i < args -> nthreads; i++) {
        while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;};
      }

      STOP_RPCC(waiting1);

      for(jjs = xxx; jjs < MIN(n_to, xxx + div_n); jjs += min_jj){
        min_jj = MIN(n_to, xxx + div_n) - jjs;
        if (min_jj > GEMM3M_UNROLL_N) min_jj = GEMM3M_UNROLL_N;

        START_RPCC();

#if   defined(NN) || defined(NT) || defined(TN) || defined(TT)
        OCOPYR_OPERATION(min_l, min_jj, b, ldb, alpha[0],  alpha[1], ls, jjs, buffer[bufferside] + min_l * (jjs - xxx));
#elif defined(RR) || defined(RC) || defined(CR) || defined(CC)
        OCOPYR_OPERATION(min_l, min_jj, b, ldb, alpha[0], -alpha[1], ls, jjs, buffer[bufferside] + min_l * (jjs - xxx));
#elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
        OCOPYI_OPERATION(min_l, min_jj, b, ldb, alpha[0],  alpha[1], ls, jjs, buffer[bufferside] + min_l * (jjs - xxx));
#else
        OCOPYI_OPERATION(min_l, min_jj, b, ldb, alpha[0], -alpha[1], ls, jjs, buffer[bufferside] + min_l * (jjs - xxx));
#endif

        STOP_RPCC(copy_B);

        START_RPCC();

        KERNEL_OPERATION(min_i, min_jj, min_l, ALPHA11, ALPHA12,
                         sa, buffer[bufferside] + min_l * (jjs - xxx),
                         c, ldc, m_from, jjs);

        STOP_RPCC(kernel);
#ifdef TIMING
        ops += 2 * min_i * min_jj * min_l;
#endif

      }

      /* Publish the packed sub-buffer to every thread (including self). */
      for (i = 0; i < args -> nthreads; i++) {
        job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside];
      }
    }

    current = mypos;

    do {
      current ++;
      if (current >= args -> nthreads) current = 0;

      div_n = (range_n[current + 1]  - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE;

      for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) {

        if (current != mypos) {

          START_RPCC();

          /* Wait until the owning thread has published this sub-buffer. */
          while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;};

          STOP_RPCC(waiting2);

          START_RPCC();

          KERNEL_OPERATION(min_i, MIN(range_n[current + 1]  - xxx,  div_n), min_l, ALPHA11, ALPHA12,
                           sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside],
                           c, ldc, m_from, xxx);

          STOP_RPCC(kernel);
#ifdef TIMING
          ops += 2 * min_i * MIN(range_n[current + 1]  - xxx,  div_n) * min_l;
#endif
        }

        /* Single row block: this thread will not touch the buffer again. */
        if (m_to - m_from == min_i) {
          job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0;
        }
      }
    } while (current != mypos);

    for(is = m_from + min_i; is < m_to; is += min_i){
      min_i = m_to - is;
      if (min_i >= GEMM3M_P * 2) {
        min_i = GEMM3M_P;
      } else
        if (min_i > GEMM3M_P) {
          min_i = ((min_i + 1) / 2 + GEMM3M_UNROLL_M - 1) & ~(GEMM3M_UNROLL_M - 1);
        }

      START_RPCC();

      ICOPYR_OPERATION(min_l, min_i, a, lda, ls, is, sa);

      STOP_RPCC(copy_A);

      current = mypos;
      do {

        div_n = (range_n[current + 1]  - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE;

        for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) {

          START_RPCC();

          KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), min_l, ALPHA11, ALPHA12,
                           sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside],
                           c, ldc, is, xxx);

          STOP_RPCC(kernel);
#ifdef TIMING
          /* Count the columns actually passed to the kernel. */
          ops += 2 * min_i * MIN(range_n[current + 1] - xxx, div_n) * min_l;
#endif
          if (is + min_i >= m_to) {
            /* Thread doesn't need this buffer any more */
            job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0;
          }
        }

        current ++;
        if (current >= args -> nthreads) current = 0;

      } while (current != mypos);

    } /* end of is */

    /* ------------------------------------------------------------------
     * Pass 3: ICOPYI / OCOPYI|OCOPYR, scaled by ALPHA17 / ALPHA18.
     * ------------------------------------------------------------------ */

    /* Again start from a freshly computed first row block. */
    min_i = m_to - m_from;

    if (min_i >= GEMM3M_P * 2) {
      min_i = GEMM3M_P;
    } else {
      if (min_i > GEMM3M_P) {
        min_i = (min_i / 2 + GEMM3M_UNROLL_M - 1) & ~(GEMM3M_UNROLL_M - 1);
      }
    }

    START_RPCC();

    ICOPYI_OPERATION(min_l, min_i, a, lda, ls, m_from, sa);

    STOP_RPCC(copy_A);

    div_n = (n_to - n_from + DIVIDE_RATE - 1) / DIVIDE_RATE;

    for (xxx = n_from, bufferside = 0; xxx < n_to; xxx += div_n, bufferside ++) {

      START_RPCC();

      /* Make sure no one is still using this sub-buffer. */
      for (i = 0; i < args -> nthreads; i++) {
        while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;};
      }

      STOP_RPCC(waiting1);

      for(jjs = xxx; jjs < MIN(n_to, xxx + div_n); jjs += min_jj){
        min_jj = MIN(n_to, xxx + div_n) - jjs;
        if (min_jj > GEMM3M_UNROLL_N) min_jj = GEMM3M_UNROLL_N;

        START_RPCC();

#if   defined(NN) || defined(NT) || defined(TN) || defined(TT) 
        OCOPYI_OPERATION(min_l, min_jj, b, ldb, alpha[0],  alpha[1], ls, jjs, buffer[bufferside] + min_l * (jjs - xxx));
#elif defined(RR) || defined(RC) || defined(CR) || defined(CC)
        OCOPYI_OPERATION(min_l, min_jj, b, ldb, alpha[0], -alpha[1], ls, jjs, buffer[bufferside] + min_l * (jjs - xxx));
#elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
        OCOPYR_OPERATION(min_l, min_jj, b, ldb, alpha[0],  alpha[1], ls, jjs, buffer[bufferside] + min_l * (jjs - xxx));
#else
        OCOPYR_OPERATION(min_l, min_jj, b, ldb, alpha[0], -alpha[1], ls, jjs, buffer[bufferside] + min_l * (jjs - xxx));
#endif

        STOP_RPCC(copy_B);

        START_RPCC();

        KERNEL_OPERATION(min_i, min_jj, min_l, ALPHA17, ALPHA18,
                         sa, buffer[bufferside] + min_l * (jjs - xxx),
                         c, ldc, m_from, jjs);

        STOP_RPCC(kernel);
#ifdef TIMING
        ops += 2 * min_i * min_jj * min_l;
#endif

      }

      /* Publish the packed sub-buffer to every thread (including self). */
      for (i = 0; i < args -> nthreads; i++) {
        job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside];
      }
    }

    current = mypos;

    do {
      current ++;
      if (current >= args -> nthreads) current = 0;

      div_n = (range_n[current + 1]  - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE;

      for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) {

        if (current != mypos) {

          START_RPCC();

          /* Wait until the owning thread has published this sub-buffer. */
          while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;};

          STOP_RPCC(waiting2);

          START_RPCC();

          KERNEL_OPERATION(min_i, MIN(range_n[current + 1]  - xxx,  div_n), min_l, ALPHA17, ALPHA18,
                           sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside],
                           c, ldc, m_from, xxx);

          STOP_RPCC(kernel);
#ifdef TIMING
          ops += 2 * min_i * MIN(range_n[current + 1]  - xxx,  div_n) * min_l;
#endif
        }

        /* Single row block: this thread will not touch the buffer again. */
        if (m_to - m_from == min_i) {
          job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0;
        }
      }
    } while (current != mypos);

    for(is = m_from + min_i; is < m_to; is += min_i){
      min_i = m_to - is;
      if (min_i >= GEMM3M_P * 2) {
        min_i = GEMM3M_P;
      } else
        if (min_i > GEMM3M_P) {
          min_i = ((min_i + 1) / 2 + GEMM3M_UNROLL_M - 1) & ~(GEMM3M_UNROLL_M - 1);
        }

      START_RPCC();

      ICOPYI_OPERATION(min_l, min_i, a, lda, ls, is, sa);

      STOP_RPCC(copy_A);

      current = mypos;
      do {

        div_n = (range_n[current + 1]  - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE;

        for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) {

          START_RPCC();

          KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), min_l, ALPHA17, ALPHA18,
                           sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside],
                           c, ldc, is, xxx);

          STOP_RPCC(kernel);
#ifdef TIMING
          /* Count the columns actually passed to the kernel. */
          ops += 2 * min_i * MIN(range_n[current + 1] - xxx, div_n) * min_l;
#endif
          if (is + min_i >= m_to) {
            /* Thread doesn't need this buffer any more */
            job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0;
          }
        }

        current ++;
        if (current >= args -> nthreads) current = 0;

      } while (current != mypos);

    } /* end of is */

  }

  START_RPCC();

  /* Do not return (and let sb be reused) until every thread has released
     every sub-buffer owned by this thread. */
  for (i = 0; i < args -> nthreads; i++) {
    for (xxx = 0; xxx < DIVIDE_RATE; xxx++) {
      while (job[mypos].working[i][CACHE_LINE_SIZE * xxx] ) {YIELDING;};
    }
  }

  STOP_RPCC(waiting3);

#ifdef TIMING
  BLASLONG waiting = waiting1 + waiting2 + waiting3;
  BLASLONG total = copy_A + copy_B + kernel + waiting;

  fprintf(stderr, "GEMM   [%2ld] Copy_A : %6.2f  Copy_B : %6.2f  Wait : %6.2f Kernel : %6.2f\n",
	  mypos, (double)copy_A /(double)total * 100., (double)copy_B /(double)total * 100.,
	  (double)waiting /(double)total * 100.,
	  (double)ops/(double)kernel / 2. * 100.);

  fprintf(stderr, "GEMM   [%2ld] Copy_A : %6.2ld  Copy_B : %6.2ld  Wait : %6.2ld\n",
	  mypos, copy_A, copy_B, waiting);

#if 0
  fprintf(stderr, "Waiting[%2ld] %6.2f %6.2f %6.2f\n",
	  mypos,
	  (double)waiting1/(double)waiting * 100.,
	  (double)waiting2/(double)waiting * 100.,
	  (double)waiting3/(double)waiting * 100.);
#endif
  fprintf(stderr, "\n");
#endif



  return 0;
}
예제 #2
0
/*
 * Blocked GEMM3M driver operating on one (row range, column range) tile
 * of C.
 *
 * args     problem description (matrices, leading dimensions, alpha/beta).
 * range_m  optional {first, last+1} row range; full M range when NULL.
 * range_n  optional {first, last+1} column range; full N range when NULL.
 * sa       scratch buffer receiving the packed block of A.
 * sb       scratch buffer receiving the packed panel of B.
 * dummy    unused.
 *
 * Always returns 0.
 *
 * C is first scaled by beta (skipped when beta is NULL or equal to one).
 * Then, for every column panel of at most GEMM3M_R columns and every K
 * panel, three passes are executed that differ only in the copy routines
 * (ICOPYB/OCOPYB, ICOPYR/OCOPYR|OCOPYI, ICOPYI/OCOPYI|OCOPYR) and in the
 * kernel scaling constants (ALPHA5/6, ALPHA11/12, ALPHA17/18).  Presumably
 * these are the three real products of the "3M" complex multiplication
 * scheme -- confirm against the macro definitions.
 *
 * In each pass the first row block is processed while B is being packed
 * into sb in strips of at most GEMM3M_UNROLL_N columns; the remaining row
 * blocks then reuse the fully packed sb.
 */
int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
		  FLOAT *sa, FLOAT *sb, BLASLONG dummy){
  BLASLONG k, lda, ldb, ldc;
  FLOAT *alpha, *beta;
  FLOAT *a, *b, *c;
  BLASLONG m_from, m_to, n_from, n_to;

  BLASLONG ls, is, js, jjs;
  BLASLONG min_l, min_i, min_j, min_jj;

#ifdef TIMING
  /* Cycle counters accumulated by STOP_RPCC(); zero-initialised here. */
  BLASULONG rpcc_counter;
  BLASULONG innercost  = 0;
  BLASULONG outercost  = 0;
  BLASULONG kernelcost = 0;
  double total;
#endif

  k = K;

  a = (FLOAT *)A;
  b = (FLOAT *)B;
  c = (FLOAT *)C;

  lda = LDA;
  ldb = LDB;
  ldc = LDC;

  alpha = (FLOAT *)args -> alpha;
  beta  = (FLOAT *)args -> beta;

  m_from = 0;
  m_to   = M;

  if (range_m) {
    m_from = range_m[0];
    m_to   = range_m[1];
  }

  n_from = 0;
  n_to   = N;

  if (range_n) {
    n_from = range_n[0];
    n_to   = range_n[1];
  }

  /* Scale the tile of C unless beta is exactly one. */
  if (beta) {
#ifndef COMPLEX
    if (beta[0] != ONE)
#else
    if ((beta[0] != ONE) || (beta[1] != ZERO))
#endif
      BETA_OPERATION(m_from, m_to, n_from, n_to, beta, c, ldc);
  }

  /* Nothing to accumulate. */
  if ((k == 0) || (alpha == NULL)) return 0;

  if ((alpha[0] == ZERO)
#ifdef COMPLEX
      && (alpha[1] == ZERO)
#endif
      ) return 0;

#if 0
  printf("GEMM: M_from : %ld  M_to : %ld  N_from : %ld  N_to : %ld  k : %ld\n", m_from, m_to, n_from, n_to, k);
  printf("GEMM: P = %4ld  Q = %4ld  R = %4ld\n", (BLASLONG)GEMM3M_P, (BLASLONG)GEMM3M_Q, (BLASLONG)GEMM3M_R);
  printf("GEMM: SA .. %p  SB .. %p\n", sa, sb);
#endif

  for(js = n_from; js < n_to; js += GEMM3M_R){
    /* Width of the current column panel. */
    min_j = n_to - js;
    if (min_j > GEMM3M_R) min_j = GEMM3M_R;

    for(ls = 0; ls < k; ls += min_l){
      /* Depth of the current K panel. */
      min_l = k - ls;

      if (min_l >= GEMM3M_Q * 2) {
        min_l = GEMM3M_Q;
      } else {
        if (min_l > GEMM3M_Q) {
          min_l = (min_l + 1) / 2;
#ifdef UNROLL_X
          min_l = (min_l + UNROLL_X - 1) & ~(UNROLL_X - 1);
#endif
        }
      }

      /* ----------------------------------------------------------------
       * Pass 1: ICOPYB / OCOPYB, scaled by ALPHA5 / ALPHA6.
       * ---------------------------------------------------------------- */

      /* Height of the first row block. */
      min_i = m_to - m_from;
      if (min_i >= GEMM3M_P * 2) {
        min_i = GEMM3M_P;
      } else {
        if (min_i > GEMM3M_P) {
          min_i = (min_i / 2 + GEMM3M_UNROLL_M - 1) & ~(GEMM3M_UNROLL_M - 1);
        }
      }

      START_RPCC();

      ICOPYB_OPERATION(min_l, min_i, a, lda, ls, m_from, sa);

      STOP_RPCC(innercost);

      /* Pack B strip by strip and apply the kernel to the first row block. */
      for(jjs = js; jjs < js + min_j; jjs += min_jj){
        min_jj = min_j + js - jjs;
        if (min_jj > GEMM3M_UNROLL_N) min_jj = GEMM3M_UNROLL_N;

        START_RPCC();

#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || defined(RN) || defined(RT) || defined(CN) || defined(CT)
        OCOPYB_OPERATION(min_l, min_jj, b, ldb, alpha[0],  alpha[1], ls, jjs, sb + min_l * (jjs - js));
#else
        OCOPYB_OPERATION(min_l, min_jj, b, ldb, alpha[0], -alpha[1], ls, jjs, sb + min_l * (jjs - js));
#endif

        STOP_RPCC(outercost);

        START_RPCC();

        KERNEL_OPERATION(min_i, min_jj, min_l, ALPHA5, ALPHA6,
                         sa, sb + min_l * (jjs - js), c, ldc, m_from, jjs);

        STOP_RPCC(kernelcost);

      }

      /* Remaining row blocks reuse the fully packed sb. */
      for(is = m_from + min_i; is < m_to; is += min_i){
        min_i = m_to - is;
        if (min_i >= GEMM3M_P * 2) {
          min_i = GEMM3M_P;
        } else
          if (min_i > GEMM3M_P) {
            min_i = (min_i / 2 + GEMM3M_UNROLL_M - 1) & ~(GEMM3M_UNROLL_M - 1);
          }

        START_RPCC();

        ICOPYB_OPERATION(min_l, min_i, a, lda, ls, is, sa);

        STOP_RPCC(innercost);

        START_RPCC();

        KERNEL_OPERATION(min_i, min_j, min_l, ALPHA5, ALPHA6, sa, sb, c, ldc, is, js);

        STOP_RPCC(kernelcost);
      }

      /* ----------------------------------------------------------------
       * Pass 2: ICOPYR / OCOPYR|OCOPYI, scaled by ALPHA11 / ALPHA12.
       * ---------------------------------------------------------------- */

      /* min_i was changed by the row loop above; recompute the first block. */
      min_i = m_to - m_from;
      if (min_i >= GEMM3M_P * 2) {
        min_i = GEMM3M_P;
      } else {
        if (min_i > GEMM3M_P) {
          min_i = (min_i / 2 + GEMM3M_UNROLL_M - 1) & ~(GEMM3M_UNROLL_M - 1);
        }
      }

      START_RPCC();

      ICOPYR_OPERATION(min_l, min_i, a, lda, ls, m_from, sa);

      STOP_RPCC(innercost);

      for(jjs = js; jjs < js + min_j; jjs += min_jj){
        min_jj = min_j + js - jjs;
        if (min_jj > GEMM3M_UNROLL_N) min_jj = GEMM3M_UNROLL_N;

        START_RPCC();

#if   defined(NN) || defined(NT) || defined(TN) || defined(TT)
        OCOPYR_OPERATION(min_l, min_jj, b, ldb, alpha[0],  alpha[1], ls, jjs, sb + min_l * (jjs - js));
#elif defined(RR) || defined(RC) || defined(CR) || defined(CC)
        OCOPYR_OPERATION(min_l, min_jj, b, ldb, alpha[0], -alpha[1], ls, jjs, sb + min_l * (jjs - js));
#elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
        OCOPYI_OPERATION(min_l, min_jj, b, ldb, alpha[0],  alpha[1], ls, jjs, sb + min_l * (jjs - js));
#else
        OCOPYI_OPERATION(min_l, min_jj, b, ldb, alpha[0], -alpha[1], ls, jjs, sb + min_l * (jjs - js));
#endif

        STOP_RPCC(outercost);

        START_RPCC();

        KERNEL_OPERATION(min_i, min_jj, min_l, ALPHA11, ALPHA12,
                         sa, sb + min_l * (jjs - js), c, ldc, m_from, jjs);

        STOP_RPCC(kernelcost);

      }

      for(is = m_from + min_i; is < m_to; is += min_i){
        min_i = m_to - is;
        if (min_i >= GEMM3M_P * 2) {
          min_i = GEMM3M_P;
        } else
          if (min_i > GEMM3M_P) {
            min_i = (min_i / 2 + GEMM3M_UNROLL_M - 1) & ~(GEMM3M_UNROLL_M - 1);
          }

        START_RPCC();

        ICOPYR_OPERATION(min_l, min_i, a, lda, ls, is, sa);

        STOP_RPCC(innercost);

        START_RPCC();

        KERNEL_OPERATION(min_i, min_j, min_l, ALPHA11, ALPHA12, sa, sb, c, ldc, is, js);

        STOP_RPCC(kernelcost);

      }

      /* ----------------------------------------------------------------
       * Pass 3: ICOPYI / OCOPYI|OCOPYR, scaled by ALPHA17 / ALPHA18.
       * ---------------------------------------------------------------- */

      min_i = m_to - m_from;
      if (min_i >= GEMM3M_P * 2) {
        min_i = GEMM3M_P;
      } else {
        if (min_i > GEMM3M_P) {
          min_i = (min_i / 2 + GEMM3M_UNROLL_M - 1) & ~(GEMM3M_UNROLL_M - 1);
        }
      }

      START_RPCC();

      ICOPYI_OPERATION(min_l, min_i, a, lda, ls, m_from, sa);

      STOP_RPCC(innercost);

      for(jjs = js; jjs < js + min_j; jjs += min_jj){
        min_jj = min_j + js - jjs;
        if (min_jj > GEMM3M_UNROLL_N) min_jj = GEMM3M_UNROLL_N;

        START_RPCC();

#if   defined(NN) || defined(NT) || defined(TN) || defined(TT) 
        OCOPYI_OPERATION(min_l, min_jj, b, ldb, alpha[0],  alpha[1], ls, jjs, sb + min_l * (jjs - js));
#elif defined(RR) || defined(RC) || defined(CR) || defined(CC)
        OCOPYI_OPERATION(min_l, min_jj, b, ldb, alpha[0], -alpha[1], ls, jjs, sb + min_l * (jjs - js));
#elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
        OCOPYR_OPERATION(min_l, min_jj, b, ldb, alpha[0],  alpha[1], ls, jjs, sb + min_l * (jjs - js));
#else
        OCOPYR_OPERATION(min_l, min_jj, b, ldb, alpha[0], -alpha[1], ls, jjs, sb + min_l * (jjs - js));
#endif

        STOP_RPCC(outercost);

        START_RPCC();

        KERNEL_OPERATION(min_i, min_jj, min_l, ALPHA17, ALPHA18,
                         sa, sb + min_l * (jjs - js), c, ldc, m_from, jjs);

        STOP_RPCC(kernelcost);

      }

      for(is = m_from + min_i; is < m_to; is += min_i){
        min_i = m_to - is;
        if (min_i >= GEMM3M_P * 2) {
          min_i = GEMM3M_P;
        } else
          if (min_i > GEMM3M_P) {
            min_i = (min_i / 2 + GEMM3M_UNROLL_M - 1) & ~(GEMM3M_UNROLL_M - 1);
          }

        START_RPCC();

        ICOPYI_OPERATION(min_l, min_i, a, lda, ls, is, sa);

        STOP_RPCC(innercost);

        START_RPCC();

        KERNEL_OPERATION(min_i, min_j, min_l, ALPHA17, ALPHA18, sa, sb, c, ldc, is, js);

        STOP_RPCC(kernelcost);

      }

    } /* end of ls */
  } /* end of js */


#ifdef TIMING
  total = (double)outercost + (double)innercost + (double)kernelcost;

  printf( "Copy A : %5.2f Copy  B: %5.2f  Kernel : %5.2f\n",
	   innercost / total * 100., outercost / total * 100.,
	  kernelcost / total * 100.);

  printf( " Total %10.3f%%  %10.3f MFlops\n",
	  ((double)(m_to - m_from) * (double)(n_to - n_from) * (double)k) / (double)kernelcost / 2 * 100,
	  2400. * (2. * (double)(m_to - m_from) * (double)(n_to - n_from) * (double)k) / (double)kernelcost);
#endif

  return 0;
}