Exemple #1
0
/*
 * Blocked level-3 driver: updates the m-by-n matrix b in place using the
 * matrix a, one panel of up to GEMM_R columns of b at a time.
 *
 * NOTE(review): the TRSM_* and GEMM_* helper names suggest this is the
 * right-side triangular solve; CNAME, the helpers, the blocking constants and
 * dm1 are all defined outside this block, so confirm against the build.
 *
 * Parameters:
 *   args    - provides m, n, a, b, lda, ldb and beta (all read below).
 *   range_m - optional {from, to} pair; when non-NULL only that range of the
 *             contiguous (row) index of b is processed.
 *   range_n - not referenced in this function.
 *   sa, sb  - work buffers: the *COPY helpers write into them and the kernels
 *             read from them (presumably packed operand storage).
 *   dummy   - not referenced in this function.
 *
 * Returns 0 on every path.
 */
int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG dummy) {

  BLASLONG m, n, lda, ldb;
  FLOAT *beta, *a, *b;
  BLASLONG ls, is, js;
  BLASLONG min_l, min_i, min_j;
  BLASLONG jjs, min_jj;

  m = args -> m;
  n = args -> n;

  a = (FLOAT *)args -> a;
  b = (FLOAT *)args -> b;

  lda = args -> lda;
  ldb = args -> ldb;

  beta  = (FLOAT *)args -> beta;

  /* Optional row sub-range: shrink m and advance b to the first row of it. */
  if (range_m) {
    BLASLONG m_from = *(((BLASLONG *)range_m) + 0);
    BLASLONG m_to   = *(((BLASLONG *)range_m) + 1);

    m = m_to - m_from;

    b += m_from * COMPSIZE;
  }

  /* GEMM_BETA is called with b as its only matrix operand whenever beta is
     not exactly one (presumably scaling b by beta).  When beta is exactly
     zero the function stops here.                                           */
  if (beta) {
#ifndef COMPLEX
    if (beta[0] != ONE)
      GEMM_BETA(m, n, 0, beta[0], NULL, 0, NULL, 0, b, ldb);
    if (beta[0] == ZERO) return 0;
#else
    if ((beta[0] != ONE) || (beta[1] != ZERO))
      GEMM_BETA(m, n, 0, beta[0], beta[1], NULL, 0, NULL, 0, b, ldb);
    if ((beta[0] == ZERO) && (beta[1] == ZERO)) return 0;
#endif
  }

#if (defined(UPPER) && !defined(TRANSA)) || (!defined(UPPER) && defined(TRANSA))
  /* Forward sweep: column panels [js, js + min_j) of b are visited in
     increasing order.                                                   */
  for(js = 0; js < n; js += GEMM_R){
    min_j = n - js;
    if (min_j > GEMM_R) min_j = GEMM_R;

    /* Phase 1: GEMM_KERNEL update of the current panel from every column
       block [ls, ls + min_l) of b that lies before the panel (ls < js).   */
    for(ls = 0; ls < js; ls += GEMM_Q){
      min_l = js - ls;
      if (min_l > GEMM_Q) min_l = GEMM_Q;
      min_i = m;
      if (min_i > GEMM_P) min_i = GEMM_P;

      /* First chunk of at most GEMM_P rows of column block ls goes to sa. */
      GEMM_ITCOPY(min_l, min_i, b + (ls * ldb) * COMPSIZE, ldb, sa);

      /* Copy the matching part of a into sb in slices of at most
         GEMM_UNROLL_N columns; each slice is applied to the first row chunk
         right after it has been copied.                                     */
      for(jjs = js; jjs < js + min_j; jjs += min_jj){
	min_jj = min_j + js - jjs;
	if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;

#ifndef TRANSA
	GEMM_ONCOPY(min_l, min_jj, a + (ls + jjs * lda) * COMPSIZE, lda, sb + min_l * (jjs - js) * COMPSIZE);
#else
	GEMM_OTCOPY(min_l, min_jj, a + (jjs + ls * lda) * COMPSIZE, lda, sb + min_l * (jjs - js) * COMPSIZE);
#endif

	GEMM_KERNEL(min_i, min_jj, min_l, dm1,
#ifdef COMPLEX
		    ZERO,
#endif
		    sa, sb + min_l * (jjs - js) * COMPSIZE,
		    b + (jjs * ldb) * COMPSIZE, ldb);
      }

      /* Remaining row chunks reuse the copy of a that is already in sb. */
      for(is = min_i; is < m; is += GEMM_P){
	min_i = m - is;
	if (min_i > GEMM_P) min_i = GEMM_P;

	GEMM_ITCOPY(min_l, min_i, b + (is + ls * ldb) * COMPSIZE, ldb, sa);

	GEMM_KERNEL(min_i, min_j, min_l, dm1,
#ifdef COMPLEX
		    ZERO,
#endif
		    sa, sb, b + (is + js * ldb) * COMPSIZE, ldb);
      }
    }

    /* Phase 2: walk the column blocks [ls, ls + min_l) inside the panel. */
    for(ls = js; ls < js + min_j; ls += GEMM_Q){
      min_l = js + min_j - ls;
      if (min_l > GEMM_Q) min_l = GEMM_Q;
      min_i = m;
      if (min_i > GEMM_P) min_i = GEMM_P;

      GEMM_ITCOPY(min_l, min_i, b + (ls * ldb) * COMPSIZE, ldb, sa);

      /* The min_l x min_l diagonal block of a at (ls, ls) is copied to the
         start of sb and TRSM_KERNEL writes column block ls of b.           */
#ifndef TRANSA
      TRSM_OUNCOPY(min_l, min_l, a + (ls + ls * lda) * COMPSIZE, lda, 0, sb);
#else
      TRSM_OLTCOPY(min_l, min_l, a + (ls + ls * lda) * COMPSIZE, lda, 0, sb);
#endif

      TRSM_KERNEL(min_i, min_l, min_l, dm1,
#ifdef COMPLEX
		  ZERO,
#endif
		  sa,
		  sb,
		  b + (ls * ldb) * COMPSIZE, ldb, 0);

      /* Panel columns after this block, [ls + min_l, js + min_j), get a
         GEMM_KERNEL update; the copied slices of a are stored in sb right
         behind the min_l * min_l diagonal block.                          */
      for(jjs = 0; jjs < min_j - min_l - ls + js; jjs += min_jj){
	min_jj = min_j - min_l - ls + js - jjs;
	if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;

#ifndef TRANSA
      GEMM_ONCOPY (min_l, min_jj, a + (ls + (ls + min_l + jjs) * lda) * COMPSIZE, lda,
		   sb + min_l * (min_l + jjs) * COMPSIZE);
#else
      GEMM_OTCOPY (min_l, min_jj, a + ((ls + min_l + jjs) + ls * lda) * COMPSIZE, lda,
		   sb + min_l * (min_l + jjs) * COMPSIZE);
#endif

      GEMM_KERNEL(min_i, min_jj, min_l, dm1,
#ifdef COMPLEX
		  ZERO,
#endif
		  sa,
		  sb + min_l * (min_l + jjs) * COMPSIZE,
		  b + (min_l + ls + jjs) * ldb * COMPSIZE, ldb);
      }

      /* Same two steps (TRSM_KERNEL on the block, GEMM_KERNEL on the columns
         after it) for the remaining row chunks, reusing the contents of sb.  */
      for(is = min_i; is < m; is += GEMM_P){
	min_i = m - is;
	if (min_i > GEMM_P) min_i = GEMM_P;

	GEMM_ITCOPY(min_l, min_i, b + (is + ls * ldb) * COMPSIZE, ldb, sa);

	TRSM_KERNEL(min_i, min_l, min_l, dm1,
#ifdef COMPLEX
		    ZERO,
#endif
		    sa,
		    sb,
		    b + (is + ls * ldb) * COMPSIZE, ldb, 0);

	GEMM_KERNEL(min_i, min_j - min_l + js - ls, min_l, dm1,
#ifdef COMPLEX
		    ZERO,
#endif
		    sa,
		    sb + min_l * min_l * COMPSIZE,
		    b + (is + ( min_l + ls) * ldb) * COMPSIZE, ldb);
      }
    }
  }

#else
  BLASLONG start_ls;

  /* Backward sweep: js is the end of the panel, which covers columns
     [js - min_j, js); panels are visited in decreasing order.         */
  for(js = n; js > 0; js -= GEMM_R){
    min_j = js;
    if (min_j > GEMM_R) min_j = GEMM_R;

    /* Phase 1: GEMM_KERNEL update of the current panel from every column
       block [ls, ls + min_l) of b that lies after the panel (ls >= js).   */
    for (ls = js; ls < n; ls += GEMM_Q) {
      min_l = n - ls;
      if (min_l > GEMM_Q) min_l = GEMM_Q;
      min_i = m;
      if (min_i > GEMM_P) min_i = GEMM_P;

      GEMM_ITCOPY(min_l, min_i, b + (ls * ldb) * COMPSIZE, ldb, sa);

      /* jjs runs over [js, js + min_j); the panel column actually addressed
         in a and b is jjs - min_j.                                          */
      for(jjs = js; jjs < js + min_j; jjs += min_jj){
	min_jj = min_j + js - jjs;
	if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;

#ifndef TRANSA
	GEMM_ONCOPY(min_l, min_jj, a + (ls + (jjs - min_j) * lda) * COMPSIZE, lda, sb + min_l * (jjs - js) * COMPSIZE);
#else
	GEMM_OTCOPY(min_l, min_jj, a + ((jjs - min_j) + ls * lda) * COMPSIZE, lda, sb + min_l * (jjs - js) * COMPSIZE);
#endif

	GEMM_KERNEL(min_i, min_jj, min_l, dm1,
#ifdef COMPLEX
		    ZERO,
#endif
		    sa, sb + min_l * (jjs - js) * COMPSIZE,
		    b + (jjs - min_j) * ldb * COMPSIZE, ldb);
      }

      /* Remaining row chunks reuse the copy of a that is already in sb. */
      for(is = min_i; is < m; is += GEMM_P){
	min_i = m - is;
	if (min_i > GEMM_P) min_i = GEMM_P;

	GEMM_ITCOPY(min_l, min_i, b + (is + ls * ldb) * COMPSIZE, ldb, sa);

	GEMM_KERNEL(min_i, min_j, min_l, dm1,
#ifdef COMPLEX
		    ZERO,
#endif
		    sa, sb, b + (is + (js - min_j) * ldb) * COMPSIZE, ldb);
      }
    }

    /* Find the start of the last GEMM_Q-sized block inside the panel so the
       blocks can be visited from the end of the panel towards its start.    */
    start_ls = js - min_j;
    while (start_ls + GEMM_Q < js) start_ls += GEMM_Q;

    /* Phase 2: walk the column blocks inside the panel in decreasing order. */
    for(ls = start_ls; ls >= js - min_j; ls -= GEMM_Q){
      min_l = js - ls;
      if (min_l > GEMM_Q) min_l = GEMM_Q;
      min_i = m;
      if (min_i > GEMM_P) min_i = GEMM_P;

      GEMM_ITCOPY(min_l, min_i, b + (ls * ldb) * COMPSIZE, ldb, sa);

      /* The diagonal block of a at (ls, ls) is stored in sb behind the space
         used for the (ls - (js - min_j)) panel columns that precede it.      */
#ifndef TRANSA
      TRSM_OLNCOPY(min_l, min_l,           a + (ls + ls * lda) * COMPSIZE, lda,
		   0, sb + min_l * (min_j - js + ls) * COMPSIZE);
#else
      TRSM_OUTCOPY(min_l, min_l,           a + (ls + ls * lda) * COMPSIZE, lda,
		   0, sb + min_l * (min_j - js + ls) * COMPSIZE);
#endif

      TRSM_KERNEL(min_i, min_l, min_l, dm1,
#ifdef COMPLEX
		  ZERO,
#endif
		  sa,
		  sb + min_l * (min_j - js + ls) * COMPSIZE,
		  b + (ls * ldb) * COMPSIZE, ldb, 0);

      /* Panel columns before this block, [js - min_j, ls), get a
         GEMM_KERNEL update; their slices of a are stored from the start
         of sb.                                                           */
      for(jjs = 0; jjs < min_j - js + ls; jjs += min_jj){
	min_jj = min_j - js + ls - jjs;
	if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;

#ifndef TRANSA
	GEMM_ONCOPY (min_l, min_jj, a + (ls + (js - min_j + jjs) * lda) * COMPSIZE, lda,
		     sb + min_l * jjs * COMPSIZE);
#else
	GEMM_OTCOPY (min_l, min_jj, a + ((js - min_j + jjs) + ls * lda) * COMPSIZE, lda,
		     sb + min_l * jjs * COMPSIZE);
#endif

	GEMM_KERNEL(min_i, min_jj, min_l, dm1,
#ifdef COMPLEX
		    ZERO,
#endif
		    sa,
		    sb + min_l * jjs * COMPSIZE,
		    b + (js - min_j + jjs) * ldb * COMPSIZE, ldb);
      }

      /* Same two steps for the remaining row chunks, reusing sb. */
      for(is = min_i; is < m; is += GEMM_P){
	min_i = m - is;
	if (min_i > GEMM_P) min_i = GEMM_P;

	GEMM_ITCOPY(min_l, min_i, b + (is + ls * ldb) * COMPSIZE, ldb, sa);

	TRSM_KERNEL(min_i, min_l, min_l, dm1,
#ifdef COMPLEX
		    ZERO,
#endif
		    sa,
		    sb + min_l * (min_j - js + ls) * COMPSIZE,
		    b + (is + ls * ldb) * COMPSIZE, ldb, 0);

	GEMM_KERNEL(min_i, min_j - js + ls, min_l, dm1,
#ifdef COMPLEX
		    ZERO,
#endif
		    sa,
		    sb,
		    b + (is + (js - min_j) * ldb) * COMPSIZE, ldb);
      }

    }
  }

#endif

  return 0;
}
Exemple #2
0
/*
 * Blocked level-3 driver: updates the m-by-n matrix b in place using the
 * m-by-m blocks of a, one panel of up to GEMM_R columns of b at a time and,
 * inside a panel, one block of up to GEMM_Q rows at a time.
 *
 * NOTE(review): the TRSM_* and GEMM_* helper names suggest this is the
 * left-side triangular solve; CNAME, the helpers, the blocking constants and
 * dm1 are all defined outside this block, so confirm against the build.
 *
 * Parameters:
 *   args    - provides m, n, a, b, lda, ldb and beta (all read below).
 *   range_m - not referenced in this function.
 *   range_n - optional {from, to} pair; when non-NULL only that column range
 *             of b is processed.
 *   sa, sb  - work buffers: the *COPY helpers write into them and the kernels
 *             read from them (presumably packed operand storage).
 *   dummy   - not referenced in this function.
 *
 * Returns 0 on every path.
 */
int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG dummy) {

  BLASLONG m, n, lda, ldb;
  FLOAT *beta, *a, *b;

  BLASLONG ls, is, js;
  BLASLONG min_l, min_i, min_j;
  BLASLONG jjs, min_jj;

  m = args -> m;
  n = args -> n;

  a = (FLOAT *)args -> a;
  b = (FLOAT *)args -> b;

  lda = args -> lda;
  ldb = args -> ldb;

  beta  = (FLOAT *)args -> beta;

  /* Optional column sub-range: shrink n and advance b to its first column. */
  if (range_n) {
    BLASLONG n_from = *(((BLASLONG *)range_n) + 0);
    BLASLONG n_to   = *(((BLASLONG *)range_n) + 1);

    n = n_to - n_from;

    b += n_from * ldb * COMPSIZE;
  }

  /* GEMM_BETA is called with b as its only matrix operand whenever beta is
     not exactly one (presumably scaling b by beta).  When beta is exactly
     zero the function stops here.                                           */
  if (beta) {
#ifndef COMPLEX
    if (beta[0] != ONE)
      GEMM_BETA(m, n, 0, beta[0], NULL, 0, NULL, 0, b, ldb);
    if (beta[0] == ZERO) return 0;
#else
    if ((beta[0] != ONE) || (beta[1] != ZERO))
      GEMM_BETA(m, n, 0, beta[0], beta[1], NULL, 0, NULL, 0, b, ldb);
    if ((beta[0] == ZERO) && (beta[1] == ZERO)) return 0;
#endif
  }

  /* Column panels [js, js + min_j) of b are handled one after another. */
  for(js = 0; js < n; js += GEMM_R){
    min_j = n - js;
    if (min_j > GEMM_R) min_j = GEMM_R;
    
#if (!defined(UPPER) && !defined(TRANSA)) || (defined(UPPER) && defined(TRANSA))
    /* Forward sweep over row blocks [ls, ls + min_l) of the panel. */
    for(ls = 0; ls < m; ls += GEMM_Q){
      min_l = m - ls;
      if (min_l > GEMM_Q) min_l = GEMM_Q;
      min_i = min_l;
      if (min_i > GEMM_P) min_i = GEMM_P;
      
      /* First chunk (at most GEMM_P rows) of the diagonal block of a at
         (ls, ls) goes to sa; the offset argument is 0 for this chunk.    */
#ifndef TRANSA
      TRSM_ILTCOPY(min_l, min_i, a + (ls + ls * lda) * COMPSIZE, lda, 0, sa);
#else
      TRSM_IUNCOPY(min_l, min_i, a + (ls + ls * lda) * COMPSIZE, lda, 0, sa);
#endif
      
      /* Copy rows [ls, ls + min_l) of the panel into sb in slices of at most
         GEMM_UNROLL_N columns and run TRSM_KERNEL on each slice as soon as
         it has been copied.                                                 */
      for(jjs = js; jjs < js + min_j; jjs += min_jj){
	min_jj = min_j + js - jjs;
	if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;

	GEMM_ONCOPY(min_l, min_jj, b + (ls + jjs * ldb) * COMPSIZE, ldb, sb + min_l * (jjs - js) * COMPSIZE);

	TRSM_KERNEL(min_i, min_jj, min_l, dm1,
#ifdef COMPLEX
		    ZERO,
#endif
		    sa, sb + min_l * (jjs - js) * COMPSIZE, 
		    b + (ls + jjs * ldb) * COMPSIZE, ldb, 0);
      }

      /* Remaining row chunks inside the diagonal block; is - ls is passed
         to both the copy helper and TRSM_KERNEL as the chunk's offset
         within the block.                                                */
      for(is = ls + min_i; is < ls + min_l; is += GEMM_P){
	min_i = ls + min_l - is;
	if (min_i > GEMM_P) min_i = GEMM_P;
	
#ifndef TRANSA
	TRSM_ILTCOPY(min_l, min_i, a + (is + ls * lda) * COMPSIZE, lda, is - ls, sa);
#else
	TRSM_IUNCOPY(min_l, min_i, a + (ls + is * lda) * COMPSIZE, lda, is - ls, sa);
#endif
	
	TRSM_KERNEL(min_i, min_j, min_l, dm1,
#ifdef COMPLEX
		    ZERO,
#endif
		    sa, sb, b + (is + js * ldb) * COMPSIZE, ldb, is - ls);
      }
      

      /* Rows after the block, [ls + min_l, m), receive a GEMM_KERNEL update
         that reuses the contents of sb.                                     */
      for(is = ls + min_l; is < m; is += GEMM_P){
	min_i = m - is;
	if (min_i > GEMM_P) min_i = GEMM_P;
	
#ifndef TRANSA
	GEMM_ITCOPY(min_l, min_i, a + (is + ls * lda) * COMPSIZE, lda, sa);
#else
	GEMM_INCOPY(min_l, min_i, a + (ls + is * lda) * COMPSIZE, lda, sa);
#endif
	
	GEMM_KERNEL(min_i, min_j, min_l, dm1, 
#ifdef COMPLEX
		    ZERO,
#endif
		    sa, sb, b + (is + js * ldb) * COMPSIZE, ldb); 
      }
    }
#else
    BLASLONG start_is;

    /* Backward sweep: ls is the end of the current row block, which covers
       rows [ls - min_l, ls).                                               */
    for(ls = m; ls > 0; ls -= GEMM_Q){
      min_l = ls;
      if (min_l > GEMM_Q) min_l = GEMM_Q;
      /* start_is is the start of the last GEMM_P-sized row chunk inside the
         block, so the chunks can be visited from the end of the block.      */
      start_is = ls - min_l;
      while (start_is + GEMM_P < ls) start_is += GEMM_P;
      min_i = ls - start_is;
      if (min_i > GEMM_P) min_i = GEMM_P;

      /* Last row chunk of the diagonal block goes to sa; its offset within
         the block, start_is - (ls - min_l), is passed to the copy helper.   */
#ifndef TRANSA
      TRSM_IUTCOPY(min_l, min_i, a + (start_is + (ls - min_l) * lda) * COMPSIZE, lda, start_is - (ls - min_l), sa);
#else
      TRSM_ILNCOPY(min_l, min_i, a + ((ls - min_l) + start_is * lda) * COMPSIZE, lda, start_is - (ls - min_l), sa);
#endif

      /* Copy rows [ls - min_l, ls) of the panel into sb slice by slice and
         run TRSM_KERNEL for the last row chunk on each slice.              */
      for(jjs = js; jjs < js + min_j; jjs += min_jj){
	min_jj = min_j + js - jjs;
	if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;

	GEMM_ONCOPY(min_l, min_jj, b + (ls - min_l + jjs * ldb) * COMPSIZE, ldb, sb + min_l * (jjs - js) * COMPSIZE);
	
	TRSM_KERNEL(min_i, min_jj, min_l, dm1,
#ifdef COMPLEX
		    ZERO,
#endif
		    sa, sb + min_l * (jjs - js) * COMPSIZE, 
		    b +  (start_is + jjs * ldb) * COMPSIZE, ldb,  start_is - ls + min_l);
      }
      
      /* Earlier row chunks of the diagonal block, visited in decreasing
         order and reusing the contents of sb.                           */
      for(is = start_is - GEMM_P; is >= ls - min_l; is -= GEMM_P){
	min_i = ls - is;
	if (min_i > GEMM_P) min_i = GEMM_P;
	
#ifndef TRANSA
	TRSM_IUTCOPY(min_l, min_i, a + (is + (ls - min_l) * lda) * COMPSIZE, lda, is - (ls - min_l), sa);
#else
	TRSM_ILNCOPY(min_l, min_i, a + ((ls - min_l) + is * lda) * COMPSIZE, lda, is - (ls - min_l), sa);
#endif
	TRSM_KERNEL(min_i, min_j, min_l, dm1,
#ifdef COMPLEX
		    ZERO,
#endif
		    sa, sb, 
		    b +  (is + js * ldb) * COMPSIZE, ldb,  + is - (ls - min_l) );
      }

      
      /* Rows before the block, [0, ls - min_l), receive a GEMM_KERNEL update
         that reuses the contents of sb.                                      */
      for(is = 0; is < ls - min_l; is += GEMM_P){
	min_i = ls - min_l - is;
	if (min_i > GEMM_P) min_i = GEMM_P;
	
#ifndef TRANSA
	GEMM_ITCOPY(min_l, min_i, a + (is + (ls - min_l) * lda) * COMPSIZE, lda, sa);
#else
	GEMM_INCOPY(min_l, min_i, a + ((ls - min_l) + is * lda) * COMPSIZE, lda, sa);
#endif

	GEMM_KERNEL(min_i, min_j, min_l, dm1, 
#ifdef COMPLEX
		    ZERO,
#endif
		    sa, sb, b + (is + js * ldb) * COMPSIZE, ldb); 
      }
    }

#endif
  }

  return 0;
}
Exemple #3
0
/*
 * Kernel for one m-by-n tile of c that may straddle the diagonal of a
 * triangular update (upper triangle unless LOWER is defined).
 *
 * The parts of the tile that lie entirely on the stored side of the diagonal
 * are handed to GEMM_KERNEL directly.  Each GEMM_UNROLL_MN-wide block on the
 * diagonal is, when flag is non-zero, first computed into a local buffer T
 * and then folded into c as T plus its conjugate transpose (real parts added,
 * imaginary parts subtracted, imaginary part of the diagonal set to ZERO).
 * NOTE(review): this matches a Hermitian rank-2k style update; CNAME and the
 * GEMM_* helpers are defined outside this block, so confirm against the build.
 *
 * Parameters:
 *   m, n, k          - tile is m rows by n columns; k is the inner dimension
 *                      passed to GEMM_KERNEL.
 *   alpha_r, alpha_i - real and imaginary part of the scalar passed to
 *                      GEMM_KERNEL.
 *   a, b             - operands for GEMM_KERNEL; row/column x of them is
 *                      addressed as base + x * k * COMPSIZE.
 *   c, ldc           - output tile and its leading dimension.
 *   offset           - position of the diagonal inside the tile: element
 *                      (i, j) of the tile is on the diagonal when
 *                      j - i == offset.
 *   flag             - when zero the diagonal blocks themselves are skipped
 *                      and only the off-diagonal rectangles are updated.
 *
 * Returns 0 on every path.
 */
int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i,
	   FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset, int flag){

  BLASLONG i, j;
  BLASLONG loop;
  FLOAT subbuffer[GEMM_UNROLL_MN * GEMM_UNROLL_MN * COMPSIZE];

  /* Whole tile is on the upper side of the diagonal. */
  if (m + offset < 0) {
#ifndef LOWER
    GEMM_KERNEL(m, n, k,
		alpha_r,
#ifdef COMPLEX
		alpha_i,
#endif
		a, b, c, ldc);
#endif
    return 0;
  }

  /* Whole tile is on the lower side of the diagonal. */
  if (n < offset) {
#ifdef LOWER
    GEMM_KERNEL(m, n, k,
		alpha_r,
#ifdef COMPLEX
		alpha_i,
#endif
		a, b, c, ldc);
#endif
    return 0;
  }

  /* The first `offset` columns are entirely below the diagonal: update them
     in full for LOWER, then drop them so the diagonal starts at column 0.   */
  if (offset > 0) {
#ifdef LOWER
    GEMM_KERNEL(m, offset, k,
		alpha_r,
#ifdef COMPLEX
		alpha_i,
#endif
		a, b, c, ldc);
#endif
    b += offset * k   * COMPSIZE;
    c += offset * ldc * COMPSIZE;
    n -= offset;
    offset = 0;

    if (n <= 0) return 0;
  }

  /* Columns from m + offset onwards are entirely above the diagonal: update
     them in full for UPPER, then trim n.                                    */
  if (n > m + offset) {
#ifndef LOWER
    GEMM_KERNEL(m, n - m - offset, k,
		alpha_r,
#ifdef COMPLEX
		alpha_i,
#endif
		a,
		b + (m + offset) * k   * COMPSIZE,
		c + (m + offset) * ldc * COMPSIZE, ldc);
#endif

    n = m + offset;
    if (n <= 0) return 0;
  }

  /* The first -offset rows are entirely above the diagonal: update them in
     full for UPPER, then drop them so the diagonal starts at row 0.         */
  if (offset < 0) {
#ifndef LOWER
    GEMM_KERNEL(-offset, n, k,
		alpha_r,
#ifdef COMPLEX
		alpha_i,
#endif
		a, b, c, ldc);
#endif
    a -= offset * k   * COMPSIZE;
    c -= offset       * COMPSIZE;
    m += offset;
    offset = 0;

    if (m <= 0) return 0;
  }

  /* Rows from n - offset onwards are entirely below the diagonal: update
     them in full for LOWER, then trim m.  offset is 0 whenever it was
     non-zero on entry, because both branches above reset it.              */
  if (m > n - offset) {
#ifdef LOWER
    GEMM_KERNEL(m - n + offset, n, k,
		alpha_r,
#ifdef COMPLEX
		alpha_i,
#endif
		a + (n - offset) * k * COMPSIZE,
		b,
		c + (n - offset)     * COMPSIZE, ldc);
#endif
    m = n + offset;
    if (m <= 0) return 0;
  }

  /* Walk the diagonal in blocks of GEMM_UNROLL_MN columns. */
  for (loop = 0; loop < n; loop += GEMM_UNROLL_MN) {

    int mm, nn;

    /* mm = number of rows above this diagonal block, nn = block width.
       Round down by division instead of masking with
       ~(GEMM_UNROLL_MN - 1): the mask only rounds correctly when
       GEMM_UNROLL_MN is a power of two.                                 */
    mm = (loop/GEMM_UNROLL_MN) * GEMM_UNROLL_MN;
    nn = MIN(GEMM_UNROLL_MN, n - loop);

#ifndef LOWER
    /* Rectangle above the diagonal block. */
    GEMM_KERNEL(mm, nn, k,
		  alpha_r,
#ifdef COMPLEX
		  alpha_i,
#endif
		  a, b + loop * k * COMPSIZE, c + loop * ldc * COMPSIZE, ldc);
#endif

    if (flag) {
      /* GEMM_BETA with ZERO on subbuffer (presumably clearing it), then the
         nn-by-nn diagonal block product is accumulated into subbuffer.      */
      GEMM_BETA(nn, nn, 0, ZERO,
#ifdef COMPLEX
		ZERO,
#endif
		NULL, 0, NULL, 0, subbuffer, nn);

      GEMM_KERNEL(nn, nn, k,
		    alpha_r,
#ifdef COMPLEX
		    alpha_i,
#endif
		    a + loop * k * COMPSIZE, b + loop * k * COMPSIZE, subbuffer, nn);


      /* Fold subbuffer plus its conjugate transpose into the stored triangle
         of the diagonal block; the diagonal's imaginary part is set to ZERO. */
#ifndef LOWER

      for (j = 0; j < nn; j ++) {
	for (i = 0; i <= j; i ++) {
	  c[(i + loop + (j + loop) * ldc) * 2 + 0] +=
	    subbuffer[(i + j * nn) * 2 + 0] + subbuffer[(j + i * nn) * 2 + 0];
	  if (i != j) {
	    c[(i + loop + (j + loop) * ldc) * 2 + 1] +=
	      subbuffer[(i + j * nn) * 2 + 1] - subbuffer[(j + i * nn) * 2 + 1];
	  } else {
	    c[(i + loop + (j + loop) * ldc) * 2 + 1] = ZERO;
	  }
	}
      }
#else
      for (j = 0; j < nn; j ++) {
	for (i = j; i < nn; i ++) {
	  c[(i + loop + (j + loop) * ldc) * 2 + 0] +=
	    subbuffer[(i + j * nn) * 2 + 0] + subbuffer[(j + i * nn) * 2 + 0];
	  if (i != j) {
	    c[(i + loop + (j + loop) * ldc) * 2 + 1] +=
	      subbuffer[(i + j * nn) * 2 + 1] - subbuffer[(j + i * nn) * 2 + 1];
	  } else {
	    c[(i + loop + (j + loop) * ldc) * 2 + 1]  = ZERO;
	  }
	}
      }
#endif
    }

#ifdef LOWER
    /* Rectangle below the diagonal block. */
    GEMM_KERNEL(m - mm - nn, nn, k,
		  alpha_r,
#ifdef COMPLEX
		  alpha_i,
#endif
		  a + (mm + nn) * k * COMPSIZE, b + loop * k * COMPSIZE,
		  c + (mm + nn + loop * ldc) * COMPSIZE, ldc);
#endif
  }

  return 0;
}
Exemple #4
0
/*
 * Kernel for one m-by-n tile of c that may straddle the diagonal of a
 * triangular update (upper triangle unless LOWER is defined).
 *
 * Rectangles that lie entirely on the stored side of the diagonal go straight
 * to GEMM_KERNEL.  Each GEMM_UNROLL_MN-wide block on the diagonal is computed
 * into a local buffer and only its stored triangle is added to c.
 *
 * Parameters:
 *   m, n, k  - tile is m rows by n columns; k is the inner dimension passed
 *              to GEMM_KERNEL.
 *   alpha_r  - scalar passed to GEMM_KERNEL (alpha_i as well when COMPLEX).
 *   a, b     - operands for GEMM_KERNEL; row/column x of them is addressed
 *              as base + x * k * COMPSIZE.
 *   c, ldc   - output tile and its leading dimension.
 *   offset   - position of the diagonal inside the tile: element (i, j) is
 *              on the diagonal when j - i == offset.
 *
 * Returns 0 on every path.
 */
int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r,
#ifdef COMPLEX
	   FLOAT alpha_i,
#endif
	   FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){

  BLASLONG row, col, row_lo, row_hi;
  BLASLONG pos, edge;
  FLOAT *dst, *src;
  FLOAT tile[GEMM_UNROLL_MN * (GEMM_UNROLL_MN + 1) * COMPSIZE];

  /* Entire tile sits on the upper side of the diagonal. */
  if (m + offset < 0) {
#ifndef LOWER
    GEMM_KERNEL(m, n, k, alpha_r,
#ifdef COMPLEX
		alpha_i,
#endif
		a, b, c, ldc);
#endif
    return 0;
  }

  /* Entire tile sits on the lower side of the diagonal. */
  if (n < offset) {
#ifdef LOWER
    GEMM_KERNEL(m, n, k, alpha_r,
#ifdef COMPLEX
		alpha_i,
#endif
		a, b, c, ldc);
#endif
    return 0;
  }

  /* Leading columns that are completely below the diagonal. */
  if (offset > 0) {
#ifdef LOWER
    GEMM_KERNEL(m, offset, k, alpha_r,
#ifdef COMPLEX
		alpha_i,
#endif
		a, b, c, ldc);
#endif
    /* Drop those columns so that the diagonal starts at column 0. */
    b += offset * k   * COMPSIZE;
    c += offset * ldc * COMPSIZE;
    n -= offset;
    offset = 0;

    if (n <= 0) return 0;
  }

  /* Trailing columns that are completely above the diagonal. */
  edge = m + offset;
  if (n > edge) {
#ifndef LOWER
    GEMM_KERNEL(m, n - edge, k, alpha_r,
#ifdef COMPLEX
		alpha_i,
#endif
		a,
		b + edge * k   * COMPSIZE,
		c + edge * ldc * COMPSIZE, ldc);
#endif

    n = edge;
    if (n <= 0) return 0;
  }

  /* Leading rows that are completely above the diagonal. */
  if (offset < 0) {
    edge = -offset;
#ifndef LOWER
    GEMM_KERNEL(edge, n, k, alpha_r,
#ifdef COMPLEX
		alpha_i,
#endif
		a, b, c, ldc);
#endif
    /* Drop those rows so that the diagonal starts at row 0. */
    a += edge * k * COMPSIZE;
    c += edge     * COMPSIZE;
    m -= edge;
    offset = 0;

    if (m <= 0) return 0;
  }

  /* Trailing rows that are completely below the diagonal. */
  edge = n - offset;
  if (m > edge) {
#ifdef LOWER
    GEMM_KERNEL(m - edge, n, k, alpha_r,
#ifdef COMPLEX
		alpha_i,
#endif
		a + edge * k * COMPSIZE,
		b,
		c + edge     * COMPSIZE, ldc);
#endif
    m = n + offset;

    if (m <= 0) return 0;
  }

  /* Step along the diagonal, GEMM_UNROLL_MN columns at a time. */
  for (pos = 0; pos < n; pos += GEMM_UNROLL_MN) {

    /* above = rows that precede the diagonal block, width = block size. */
    int above, width;

    above = (pos/GEMM_UNROLL_MN) * GEMM_UNROLL_MN;
    width = MIN(GEMM_UNROLL_MN, n - pos);

#ifndef LOWER
    /* Rectangle above the diagonal block. */
    GEMM_KERNEL(above, width, k, alpha_r,
#ifdef COMPLEX
		alpha_i,
#endif
		a, b + pos * k * COMPSIZE, c + pos * ldc * COMPSIZE, ldc);
#endif

    /* GEMM_BETA with ZERO on the local buffer (presumably clearing it), then
       the width-by-width diagonal product is accumulated into that buffer.   */
    GEMM_BETA(width, width, 0, ZERO,
#ifdef COMPLEX
	      ZERO,
#endif
	      NULL, 0, NULL, 0, tile, width);

    GEMM_KERNEL(width, width, k, alpha_r,
#ifdef COMPLEX
		alpha_i,
#endif
		a + pos * k * COMPSIZE, b + pos * k * COMPSIZE, tile, width);

    /* Add only the stored triangle of the buffer to the diagonal block of c:
       rows 0..col of each column for the upper case, rows col..width-1 for
       the lower case.                                                        */
    for (col = 0; col < width; col ++) {
#ifndef LOWER
      row_lo = 0;
      row_hi = col;
#else
      row_lo = col;
      row_hi = width - 1;
#endif
      dst = c + (pos + (pos + col) * ldc) * COMPSIZE;
      src = tile + col * width * COMPSIZE;

      for (row = row_lo; row <= row_hi; row ++) {
#ifndef COMPLEX
	dst[row] += src[row];
#else
	dst[row * 2 + 0] += src[row * 2 + 0];
	dst[row * 2 + 1] += src[row * 2 + 1];
#endif
      }
    }

#ifdef LOWER
    /* Rectangle below the diagonal block. */
    GEMM_KERNEL(m - above - width, width, k, alpha_r,
#ifdef COMPLEX
		alpha_i,
#endif
		a + (above + width) * k * COMPSIZE, b + pos * k * COMPSIZE,
		c + (above + width + pos * ldc) * COMPSIZE, ldc);
#endif

  }

  return 0;
}