/*
 * Blocked level-3 driver.  B is processed in column panels of width GEMM_R;
 * inside a panel the work is cut into diagonal blocks of width GEMM_Q and
 * row strips of height GEMM_P.  Off-diagonal contributions are applied with
 * GEMM_KERNEL (scaled by dm1, which is defined outside this block), and the
 * diagonal blocks of A are handled by TRSM_KERNEL.
 * NOTE(review): judging by the kernel names this is the right-hand-side
 * triangular-solve driver -- confirm against the build rules for CNAME.
 *
 * args     supplies m, n, a, lda, b, ldb and beta.
 * range_m  optional {from, to} pair; when non-NULL only rows [from, to) of
 *          b are touched.
 * range_n  not referenced here.
 * sa       scratch buffer receiving packed blocks of b.
 * sb       scratch buffer receiving packed blocks of a.
 * dummy    not referenced here.
 *
 * b is first scaled through GEMM_BETA when beta differs from one; a zero
 * beta ends the routine right after that scaling.  Always returns 0.
 */
int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG dummy) {

  BLASLONG m   = args -> m;
  BLASLONG n   = args -> n;
  BLASLONG lda = args -> lda;
  BLASLONG ldb = args -> ldb;

  FLOAT *a    = (FLOAT *)args -> a;
  FLOAT *b    = (FLOAT *)args -> b;
  FLOAT *beta = (FLOAT *)args -> beta;

  BLASLONG js, ls, is, jjs;
  BLASLONG min_j, min_l, min_i, min_jj;

  /* Restrict the work to the requested row window of b. */
  if (range_m) {
    BLASLONG m_from = range_m[0];
    BLASLONG m_to   = range_m[1];

    b += m_from * COMPSIZE;
    m  = m_to - m_from;
  }

  if (beta) {
#ifndef COMPLEX
    if (beta[0] != ONE) {
      GEMM_BETA(m, n, 0, beta[0], NULL, 0, NULL, 0, b, ldb);
    }
    if (beta[0] == ZERO) {
      return 0;
    }
#else
    if ((beta[0] != ONE) || (beta[1] != ZERO)) {
      GEMM_BETA(m, n, 0, beta[0], beta[1], NULL, 0, NULL, 0, b, ldb);
    }
    if ((beta[0] == ZERO) && (beta[1] == ZERO)) {
      return 0;
    }
#endif
  }

#if (defined(UPPER) && !defined(TRANSA)) || (!defined(UPPER) && defined(TRANSA))

  /* Panels are visited left to right. */
  for (js = 0; js < n; js += GEMM_R) {
    min_j = (n - js > GEMM_R) ? GEMM_R : (n - js);

    /* Apply the columns [0, js) handled by earlier panels to this panel. */
    for (ls = 0; ls < js; ls += GEMM_Q) {
      min_l = (js - ls > GEMM_Q) ? GEMM_Q : (js - ls);
      min_i = (m > GEMM_P) ? GEMM_P : m;

      GEMM_ITCOPY(min_l, min_i, b + (ls * ldb) * COMPSIZE, ldb, sa);

      /* The first row strip also packs the needed part of a into sb. */
      for (jjs = js; jjs < js + min_j; jjs += min_jj) {
        FLOAT *sb_part = sb + min_l * (jjs - js) * COMPSIZE;

        min_jj = min_j + js - jjs;
        if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;

#ifndef TRANSA
        GEMM_ONCOPY(min_l, min_jj, a + (ls + jjs * lda) * COMPSIZE, lda, sb_part);
#else
        GEMM_OTCOPY(min_l, min_jj, a + (jjs + ls * lda) * COMPSIZE, lda, sb_part);
#endif

        GEMM_KERNEL(min_i, min_jj, min_l, dm1,
#ifdef COMPLEX
                    ZERO,
#endif
                    sa, sb_part, b + (jjs * ldb) * COMPSIZE, ldb);
      }

      /* Remaining row strips reuse the packed sb. */
      for (is = min_i; is < m; is += GEMM_P) {
        min_i = (m - is > GEMM_P) ? GEMM_P : (m - is);

        GEMM_ITCOPY(min_l, min_i, b + (is + ls * ldb) * COMPSIZE, ldb, sa);

        GEMM_KERNEL(min_i, min_j, min_l, dm1,
#ifdef COMPLEX
                    ZERO,
#endif
                    sa, sb, b + (is + js * ldb) * COMPSIZE, ldb);
      }
    }

    /* Diagonal blocks of the current panel. */
    for (ls = js; ls < js + min_j; ls += GEMM_Q) {
      BLASLONG tail;   /* panel columns located after this diagonal block */

      min_l = (js + min_j - ls > GEMM_Q) ? GEMM_Q : (js + min_j - ls);
      min_i = (m > GEMM_P) ? GEMM_P : m;
      tail  = min_j - min_l - ls + js;

      GEMM_ITCOPY(min_l, min_i, b + (ls * ldb) * COMPSIZE, ldb, sa);

#ifndef TRANSA
      TRSM_OUNCOPY(min_l, min_l, a + (ls + ls * lda) * COMPSIZE, lda, 0, sb);
#else
      TRSM_OLTCOPY(min_l, min_l, a + (ls + ls * lda) * COMPSIZE, lda, 0, sb);
#endif

      TRSM_KERNEL(min_i, min_l, min_l, dm1,
#ifdef COMPLEX
                  ZERO,
#endif
                  sa, sb, b + (ls * ldb) * COMPSIZE, ldb, 0);

      /* Pack the part of a to the side of the block and update the tail. */
      for (jjs = 0; jjs < tail; jjs += min_jj) {
        FLOAT *sb_part = sb + min_l * (min_l + jjs) * COMPSIZE;

        min_jj = tail - jjs;
        if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;

#ifndef TRANSA
        GEMM_ONCOPY (min_l, min_jj, a + (ls + (ls + min_l + jjs) * lda) * COMPSIZE, lda, sb_part);
#else
        GEMM_OTCOPY (min_l, min_jj, a + ((ls + min_l + jjs) + ls * lda) * COMPSIZE, lda, sb_part);
#endif

        GEMM_KERNEL(min_i, min_jj, min_l, dm1,
#ifdef COMPLEX
                    ZERO,
#endif
                    sa, sb_part, b + (min_l + ls + jjs) * ldb * COMPSIZE, ldb);
      }

      /* Same two steps for every further row strip. */
      for (is = min_i; is < m; is += GEMM_P) {
        min_i = (m - is > GEMM_P) ? GEMM_P : (m - is);

        GEMM_ITCOPY(min_l, min_i, b + (is + ls * ldb) * COMPSIZE, ldb, sa);

        TRSM_KERNEL(min_i, min_l, min_l, dm1,
#ifdef COMPLEX
                    ZERO,
#endif
                    sa, sb, b + (is + ls * ldb) * COMPSIZE, ldb, 0);

        GEMM_KERNEL(min_i, tail, min_l, dm1,
#ifdef COMPLEX
                    ZERO,
#endif
                    sa, sb + min_l * min_l * COMPSIZE,
                    b + (is + (min_l + ls) * ldb) * COMPSIZE, ldb);
      }
    }
  }

#else
  BLASLONG start_ls;

  /* Panels are visited right to left; js is one past the panel's last column. */
  for (js = n; js > 0; js -= GEMM_R) {
    min_j = (js > GEMM_R) ? GEMM_R : js;

    /* Apply the columns [js, n) to the panel. */
    for (ls = js; ls < n; ls += GEMM_Q) {
      min_l = (n - ls > GEMM_Q) ? GEMM_Q : (n - ls);
      min_i = (m > GEMM_P) ? GEMM_P : m;

      GEMM_ITCOPY(min_l, min_i, b + (ls * ldb) * COMPSIZE, ldb, sa);

      for (jjs = js; jjs < js + min_j; jjs += min_jj) {
        FLOAT *sb_part = sb + min_l * (jjs - js) * COMPSIZE;

        min_jj = min_j + js - jjs;
        if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;

#ifndef TRANSA
        GEMM_ONCOPY(min_l, min_jj, a + (ls + (jjs - min_j) * lda) * COMPSIZE, lda, sb_part);
#else
        GEMM_OTCOPY(min_l, min_jj, a + ((jjs - min_j) + ls * lda) * COMPSIZE, lda, sb_part);
#endif

        GEMM_KERNEL(min_i, min_jj, min_l, dm1,
#ifdef COMPLEX
                    ZERO,
#endif
                    sa, sb_part, b + (jjs - min_j) * ldb * COMPSIZE, ldb);
      }

      for (is = min_i; is < m; is += GEMM_P) {
        min_i = (m - is > GEMM_P) ? GEMM_P : (m - is);

        GEMM_ITCOPY(min_l, min_i, b + (is + ls * ldb) * COMPSIZE, ldb, sa);

        GEMM_KERNEL(min_i, min_j, min_l, dm1,
#ifdef COMPLEX
                    ZERO,
#endif
                    sa, sb, b + (is + (js - min_j) * ldb) * COMPSIZE, ldb);
      }
    }

    /* Find the last GEMM_Q-aligned block start inside the panel. */
    start_ls = js - min_j;
    while (start_ls + GEMM_Q < js) start_ls += GEMM_Q;

    /* Diagonal blocks of the panel, last block first. */
    for (ls = start_ls; ls >= js - min_j; ls -= GEMM_Q) {
      BLASLONG lead;     /* panel columns located before this diagonal block */
      FLOAT   *sb_tri;   /* where the packed diagonal block lives inside sb  */

      min_l  = (js - ls > GEMM_Q) ? GEMM_Q : (js - ls);
      min_i  = (m > GEMM_P) ? GEMM_P : m;
      lead   = min_j - js + ls;
      sb_tri = sb + min_l * lead * COMPSIZE;

      GEMM_ITCOPY(min_l, min_i, b + (ls * ldb) * COMPSIZE, ldb, sa);

#ifndef TRANSA
      TRSM_OLNCOPY(min_l, min_l, a + (ls + ls * lda) * COMPSIZE, lda, 0, sb_tri);
#else
      TRSM_OUTCOPY(min_l, min_l, a + (ls + ls * lda) * COMPSIZE, lda, 0, sb_tri);
#endif

      TRSM_KERNEL(min_i, min_l, min_l, dm1,
#ifdef COMPLEX
                  ZERO,
#endif
                  sa, sb_tri, b + (ls * ldb) * COMPSIZE, ldb, 0);

      /* Pack the part of a in front of the block and update those columns. */
      for (jjs = 0; jjs < lead; jjs += min_jj) {
        FLOAT *sb_part = sb + min_l * jjs * COMPSIZE;

        min_jj = lead - jjs;
        if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;

#ifndef TRANSA
        GEMM_ONCOPY (min_l, min_jj, a + (ls + (js - min_j + jjs) * lda) * COMPSIZE, lda, sb_part);
#else
        GEMM_OTCOPY (min_l, min_jj, a + ((js - min_j + jjs) + ls * lda) * COMPSIZE, lda, sb_part);
#endif

        GEMM_KERNEL(min_i, min_jj, min_l, dm1,
#ifdef COMPLEX
                    ZERO,
#endif
                    sa, sb_part, b + (js - min_j + jjs) * ldb * COMPSIZE, ldb);
      }

      for (is = min_i; is < m; is += GEMM_P) {
        min_i = (m - is > GEMM_P) ? GEMM_P : (m - is);

        GEMM_ITCOPY(min_l, min_i, b + (is + ls * ldb) * COMPSIZE, ldb, sa);

        TRSM_KERNEL(min_i, min_l, min_l, dm1,
#ifdef COMPLEX
                    ZERO,
#endif
                    sa, sb_tri, b + (is + ls * ldb) * COMPSIZE, ldb, 0);

        GEMM_KERNEL(min_i, lead, min_l, dm1,
#ifdef COMPLEX
                    ZERO,
#endif
                    sa, sb, b + (is + (js - min_j) * ldb) * COMPSIZE, ldb);
      }
    }
  }

#endif

  return 0;
}
/*
 * Blocked level-3 driver.  B is processed in column panels of width GEMM_R;
 * for every panel the rows are cut into blocks of height GEMM_Q and strips
 * of height GEMM_P.  The diagonal block of A is packed with a TRSM_I*COPY
 * routine and applied with TRSM_KERNEL; the rows outside the block are then
 * updated with GEMM_KERNEL (scaled by dm1, defined outside this block).
 * NOTE(review): judging by the kernel names this is the left-hand-side
 * triangular-solve driver -- confirm against the build rules for CNAME.
 *
 * args     supplies m, n, a, lda, b, ldb and beta.
 * range_m  not referenced here.
 * range_n  optional {from, to} pair; when non-NULL only columns [from, to)
 *          of b are touched.
 * sa       scratch buffer receiving packed blocks of a.
 * sb       scratch buffer receiving packed blocks of b.
 * dummy    not referenced here.
 *
 * b is first scaled through GEMM_BETA when beta differs from one; a zero
 * beta ends the routine right after that scaling.  Always returns 0.
 */
int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG dummy) {

  BLASLONG m   = args -> m;
  BLASLONG n   = args -> n;
  BLASLONG lda = args -> lda;
  BLASLONG ldb = args -> ldb;

  FLOAT *a    = (FLOAT *)args -> a;
  FLOAT *b    = (FLOAT *)args -> b;
  FLOAT *beta = (FLOAT *)args -> beta;

  BLASLONG js, ls, is, jjs;
  BLASLONG min_j, min_l, min_i, min_jj;

  /* Restrict the work to the requested column window of b. */
  if (range_n) {
    BLASLONG n_from = range_n[0];
    BLASLONG n_to   = range_n[1];

    b += n_from * ldb * COMPSIZE;
    n  = n_to - n_from;
  }

  if (beta) {
#ifndef COMPLEX
    if (beta[0] != ONE) {
      GEMM_BETA(m, n, 0, beta[0], NULL, 0, NULL, 0, b, ldb);
    }
    if (beta[0] == ZERO) {
      return 0;
    }
#else
    if ((beta[0] != ONE) || (beta[1] != ZERO)) {
      GEMM_BETA(m, n, 0, beta[0], beta[1], NULL, 0, NULL, 0, b, ldb);
    }
    if ((beta[0] == ZERO) && (beta[1] == ZERO)) {
      return 0;
    }
#endif
  }

  for (js = 0; js < n; js += GEMM_R) {
    min_j = (n - js > GEMM_R) ? GEMM_R : (n - js);

#if (!defined(UPPER) && !defined(TRANSA)) || (defined(UPPER) && defined(TRANSA))

    /* Row blocks are visited top to bottom. */
    for (ls = 0; ls < m; ls += GEMM_Q) {
      min_l = (m - ls > GEMM_Q) ? GEMM_Q : (m - ls);
      min_i = (min_l > GEMM_P) ? GEMM_P : min_l;

#ifndef TRANSA
      TRSM_ILTCOPY(min_l, min_i, a + (ls + ls * lda) * COMPSIZE, lda, 0, sa);
#else
      TRSM_IUNCOPY(min_l, min_i, a + (ls + ls * lda) * COMPSIZE, lda, 0, sa);
#endif

      /* First strip of the block: pack b into sb slice by slice and solve. */
      for (jjs = js; jjs < js + min_j; jjs += min_jj) {
        FLOAT *b_part  = b + (ls + jjs * ldb) * COMPSIZE;
        FLOAT *sb_part = sb + min_l * (jjs - js) * COMPSIZE;

        min_jj = min_j + js - jjs;
        if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;

        GEMM_ONCOPY(min_l, min_jj, b_part, ldb, sb_part);

        TRSM_KERNEL(min_i, min_jj, min_l, dm1,
#ifdef COMPLEX
                    ZERO,
#endif
                    sa, sb_part, b_part, ldb, 0);
      }

      /* Remaining strips that still intersect the diagonal block. */
      for (is = ls + min_i; is < ls + min_l; is += GEMM_P) {
        min_i = (ls + min_l - is > GEMM_P) ? GEMM_P : (ls + min_l - is);

#ifndef TRANSA
        TRSM_ILTCOPY(min_l, min_i, a + (is + ls * lda) * COMPSIZE, lda, is - ls, sa);
#else
        TRSM_IUNCOPY(min_l, min_i, a + (ls + is * lda) * COMPSIZE, lda, is - ls, sa);
#endif

        TRSM_KERNEL(min_i, min_j, min_l, dm1,
#ifdef COMPLEX
                    ZERO,
#endif
                    sa, sb, b + (is + js * ldb) * COMPSIZE, ldb, is - ls);
      }

      /* Rows after the block receive a plain GEMM update. */
      for (is = ls + min_l; is < m; is += GEMM_P) {
        min_i = (m - is > GEMM_P) ? GEMM_P : (m - is);

#ifndef TRANSA
        GEMM_ITCOPY(min_l, min_i, a + (is + ls * lda) * COMPSIZE, lda, sa);
#else
        GEMM_INCOPY(min_l, min_i, a + (ls + is * lda) * COMPSIZE, lda, sa);
#endif

        GEMM_KERNEL(min_i, min_j, min_l, dm1,
#ifdef COMPLEX
                    ZERO,
#endif
                    sa, sb, b + (is + js * ldb) * COMPSIZE, ldb);
      }
    }

#else
    BLASLONG start_is;

    /* Row blocks are visited bottom to top; ls is one past the block's last row. */
    for (ls = m; ls > 0; ls -= GEMM_Q) {
      BLASLONG top;   /* first row of the current block */

      min_l = (ls > GEMM_Q) ? GEMM_Q : ls;
      top   = ls - min_l;

      /* Find the last GEMM_P-aligned strip start inside the block. */
      start_is = top;
      while (start_is + GEMM_P < ls) start_is += GEMM_P;

      min_i = (ls - start_is > GEMM_P) ? GEMM_P : (ls - start_is);

#ifndef TRANSA
      TRSM_IUTCOPY(min_l, min_i, a + (start_is + top * lda) * COMPSIZE, lda, start_is - top, sa);
#else
      TRSM_ILNCOPY(min_l, min_i, a + (top + start_is * lda) * COMPSIZE, lda, start_is - top, sa);
#endif

      /* Last strip of the block: pack b into sb slice by slice and solve. */
      for (jjs = js; jjs < js + min_j; jjs += min_jj) {
        FLOAT *sb_part = sb + min_l * (jjs - js) * COMPSIZE;

        min_jj = min_j + js - jjs;
        if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;

        GEMM_ONCOPY(min_l, min_jj, b + (top + jjs * ldb) * COMPSIZE, ldb, sb_part);

        TRSM_KERNEL(min_i, min_jj, min_l, dm1,
#ifdef COMPLEX
                    ZERO,
#endif
                    sa, sb_part, b + (start_is + jjs * ldb) * COMPSIZE, ldb, start_is - top);
      }

      /* Earlier strips that still intersect the diagonal block. */
      for (is = start_is - GEMM_P; is >= top; is -= GEMM_P) {
        min_i = (ls - is > GEMM_P) ? GEMM_P : (ls - is);

#ifndef TRANSA
        TRSM_IUTCOPY(min_l, min_i, a + (is + top * lda) * COMPSIZE, lda, is - top, sa);
#else
        TRSM_ILNCOPY(min_l, min_i, a + (top + is * lda) * COMPSIZE, lda, is - top, sa);
#endif

        TRSM_KERNEL(min_i, min_j, min_l, dm1,
#ifdef COMPLEX
                    ZERO,
#endif
                    sa, sb, b + (is + js * ldb) * COMPSIZE, ldb, is - top);
      }

      /* Rows before the block receive a plain GEMM update. */
      for (is = 0; is < top; is += GEMM_P) {
        min_i = (top - is > GEMM_P) ? GEMM_P : (top - is);

#ifndef TRANSA
        GEMM_ITCOPY(min_l, min_i, a + (is + top * lda) * COMPSIZE, lda, sa);
#else
        GEMM_INCOPY(min_l, min_i, a + (top + is * lda) * COMPSIZE, lda, sa);
#endif

        GEMM_KERNEL(min_i, min_j, min_l, dm1,
#ifdef COMPLEX
                    ZERO,
#endif
                    sa, sb, b + (is + js * ldb) * COMPSIZE, ldb);
      }
    }

#endif
  }

  return 0;
}
/*
 * Diagonal-aware update kernel for one m x n tile of c.
 *
 * The tile is first trimmed so that row 0 lines up with column 0 of the
 * diagonal: parts that lie completely on one side of the diagonal are sent
 * straight to GEMM_KERNEL (the side above the diagonal when LOWER is not
 * defined, the side below it when LOWER is defined) or skipped.  The
 * remaining square part is walked in blocks of GEMM_UNROLL_MN columns.
 *
 * For each diagonal block, when flag is non-zero, the product is computed
 * into a zeroed local tile S and folded into c as S plus its conjugate
 * transpose: real parts are added, imaginary parts subtracted, and the
 * imaginary part of each diagonal entry is forced to ZERO.  Only the
 * triangle selected by LOWER is written.
 * NOTE(review): this looks like the Hermitian rank-2k diagonal kernel --
 * confirm against the build rules that define CNAME.
 *
 * m, n, k           tile dimensions passed on to GEMM_KERNEL.
 * alpha_r, alpha_i  scalar forwarded to GEMM_KERNEL.
 * a, b              packed operands; advanced by multiples of k * COMPSIZE.
 * c, ldc            output tile and its leading dimension.
 * offset            position of the tile relative to the diagonal.
 * flag              when zero the diagonal blocks are left untouched.
 *
 * The element indexing below uses a stride of 2 per entry, i.e. it assumes
 * complex data.  Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i,
          FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset, int flag){

  BLASLONG i, j;
  BLASLONG loop;
  FLOAT subbuffer[GEMM_UNROLL_MN * GEMM_UNROLL_MN * COMPSIZE];

  /* Whole tile lies above the diagonal. */
  if (m + offset < 0) {
#ifndef LOWER
    GEMM_KERNEL(m, n, k,
                alpha_r,
#ifdef COMPLEX
                alpha_i,
#endif
                a, b, c, ldc);
#endif
    return 0;
  }

  /* Whole tile lies below the diagonal. */
  if (n < offset) {
#ifdef LOWER
    GEMM_KERNEL(m, n, k,
                alpha_r,
#ifdef COMPLEX
                alpha_i,
#endif
                a, b, c, ldc);
#endif
    return 0;
  }

  /* Leading columns that are entirely below the diagonal. */
  if (offset > 0) {
#ifdef LOWER
    GEMM_KERNEL(m, offset, k,
                alpha_r,
#ifdef COMPLEX
                alpha_i,
#endif
                a, b, c, ldc);
#endif
    b += offset * k   * COMPSIZE;
    c += offset * ldc * COMPSIZE;
    n -= offset;
    offset = 0;

    if (n <= 0) return 0;
  }

  /* Trailing columns that are entirely above the diagonal. */
  if (n > m + offset) {
#ifndef LOWER
    GEMM_KERNEL(m, n - m - offset, k,
                alpha_r,
#ifdef COMPLEX
                alpha_i,
#endif
                a,
                b + (m + offset) * k   * COMPSIZE,
                c + (m + offset) * ldc * COMPSIZE, ldc);
#endif

    n = m + offset;
    if (n <= 0) return 0;
  }

  /* Leading rows that are entirely above the diagonal. */
  if (offset < 0) {
#ifndef LOWER
    GEMM_KERNEL(-offset, n, k,
                alpha_r,
#ifdef COMPLEX
                alpha_i,
#endif
                a, b, c, ldc);
#endif
    a -= offset * k   * COMPSIZE;
    c -= offset       * COMPSIZE;
    m += offset;
    offset = 0;

    if (m <= 0) return 0;
  }

  /* Trailing rows that are entirely below the diagonal. */
  if (m > n - offset) {
#ifdef LOWER
    GEMM_KERNEL(m - n + offset, n, k,
                alpha_r,
#ifdef COMPLEX
                alpha_i,
#endif
                a + (n - offset) * k * COMPSIZE,
                b,
                c + (n - offset)     * COMPSIZE, ldc);
#endif

    m = n + offset;
    if (m <= 0) return 0;
  }

  for (loop = 0; loop < n; loop += GEMM_UNROLL_MN) {

    int mm, nn;

    /*
     * Number of rows above this diagonal block, rounded down to a multiple
     * of GEMM_UNROLL_MN.  Division is used rather than masking with
     * ~(GEMM_UNROLL_MN - 1): the mask only yields a multiple when the
     * unroll factor is a power of two, whereas this form is correct for any
     * unroll factor and gives the same value in the power-of-two case.
     */
    mm = (loop / GEMM_UNROLL_MN) * GEMM_UNROLL_MN;
    nn = MIN(GEMM_UNROLL_MN, n - loop);

#ifndef LOWER
    /* Rows above the diagonal block. */
    GEMM_KERNEL(mm, nn, k,
                alpha_r,
#ifdef COMPLEX
                alpha_i,
#endif
                a, b + loop * k * COMPSIZE, c + loop * ldc * COMPSIZE, ldc);
#endif

    if (flag) {
      /* Zero the local tile, then compute the diagonal block into it. */
      GEMM_BETA(nn, nn, 0, ZERO,
#ifdef COMPLEX
                ZERO,
#endif
                NULL, 0, NULL, 0, subbuffer, nn);

      GEMM_KERNEL(nn, nn, k,
                  alpha_r,
#ifdef COMPLEX
                  alpha_i,
#endif
                  a + loop * k * COMPSIZE, b + loop * k * COMPSIZE, subbuffer, nn);

#ifndef LOWER
      /* Upper triangle: c(i,j) += S(i,j) + conj(S(j,i)). */
      for (j = 0; j < nn; j ++) {
        for (i = 0; i <= j; i ++) {
          c[(i + loop + (j + loop) * ldc) * 2 + 0] +=
            subbuffer[(i + j * nn) * 2 + 0] + subbuffer[(j + i * nn) * 2 + 0];
          if (i != j) {
            c[(i + loop + (j + loop) * ldc) * 2 + 1] +=
              subbuffer[(i + j * nn) * 2 + 1] - subbuffer[(j + i * nn) * 2 + 1];
          } else {
            /* Diagonal entries are kept purely real. */
            c[(i + loop + (j + loop) * ldc) * 2 + 1]  = ZERO;
          }
        }
      }
#else
      /* Lower triangle: c(i,j) += S(i,j) + conj(S(j,i)). */
      for (j = 0; j < nn; j ++) {
        for (i = j; i < nn; i ++) {
          c[(i + loop + (j + loop) * ldc) * 2 + 0] +=
            subbuffer[(i + j * nn) * 2 + 0] + subbuffer[(j + i * nn) * 2 + 0];
          if (i != j) {
            c[(i + loop + (j + loop) * ldc) * 2 + 1] +=
              subbuffer[(i + j * nn) * 2 + 1] - subbuffer[(j + i * nn) * 2 + 1];
          } else {
            /* Diagonal entries are kept purely real. */
            c[(i + loop + (j + loop) * ldc) * 2 + 1]  = ZERO;
          }
        }
      }
#endif
    }

#ifdef LOWER
    /* Rows below the diagonal block. */
    GEMM_KERNEL(m - mm - nn, nn, k,
                alpha_r,
#ifdef COMPLEX
                alpha_i,
#endif
                a + (mm + nn) * k * COMPSIZE, b + loop * k * COMPSIZE,
                c + (mm + nn + loop * ldc) * COMPSIZE, ldc);
#endif
  }

  return 0;
}
/*
 * Diagonal-aware update kernel for one m x n tile of c.
 *
 * The tile is first trimmed against the diagonal: pieces that lie wholly on
 * one side are either handed to GEMM_KERNEL (the side above the diagonal
 * when LOWER is not defined, the side below it when LOWER is defined) or
 * skipped.  The square remainder is walked in steps of GEMM_UNROLL_MN
 * columns; each diagonal block is computed into a zeroed local tile and only
 * the triangle selected by LOWER is added into c.
 * NOTE(review): this looks like the symmetric rank-k diagonal kernel --
 * confirm against the build rules that define CNAME.
 *
 * m, n, k   tile dimensions passed on to GEMM_KERNEL.
 * alpha_r   scalar forwarded to GEMM_KERNEL (alpha_i too when COMPLEX).
 * a, b      packed operands; advanced by multiples of k * COMPSIZE.
 * c, ldc    output tile and its leading dimension.
 * offset    position of the tile relative to the diagonal.
 *
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r,
#ifdef COMPLEX
          FLOAT alpha_i,
#endif
          FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){

  BLASLONG row, col;
  BLASLONG diag;
  FLOAT tile[GEMM_UNROLL_MN * (GEMM_UNROLL_MN + 1) * COMPSIZE];

  /* Tile entirely above the diagonal. */
  if (m + offset < 0) {
#ifndef LOWER
    GEMM_KERNEL(m, n, k,
                alpha_r,
#ifdef COMPLEX
                alpha_i,
#endif
                a, b, c, ldc);
#endif
    return 0;
  }

  /* Tile entirely below the diagonal. */
  if (n < offset) {
#ifdef LOWER
    GEMM_KERNEL(m, n, k,
                alpha_r,
#ifdef COMPLEX
                alpha_i,
#endif
                a, b, c, ldc);
#endif
    return 0;
  }

  /* Peel off the leading columns that sit below the diagonal. */
  if (offset > 0) {
#ifdef LOWER
    GEMM_KERNEL(m, offset, k,
                alpha_r,
#ifdef COMPLEX
                alpha_i,
#endif
                a, b, c, ldc);
#endif
    b += offset * k   * COMPSIZE;
    c += offset * ldc * COMPSIZE;
    n -= offset;
    offset = 0;

    if (n <= 0) {
      return 0;
    }
  }

  /* Peel off the trailing columns that sit above the diagonal. */
  if (n > m + offset) {
#ifndef LOWER
    GEMM_KERNEL(m, n - m - offset, k,
                alpha_r,
#ifdef COMPLEX
                alpha_i,
#endif
                a,
                b + (m + offset) * k   * COMPSIZE,
                c + (m + offset) * ldc * COMPSIZE, ldc);
#endif
    n = m + offset;

    if (n <= 0) {
      return 0;
    }
  }

  /* Peel off the leading rows that sit above the diagonal. */
  if (offset < 0) {
#ifndef LOWER
    GEMM_KERNEL(-offset, n, k,
                alpha_r,
#ifdef COMPLEX
                alpha_i,
#endif
                a, b, c, ldc);
#endif
    a -= offset * k * COMPSIZE;
    c -= offset     * COMPSIZE;
    m += offset;
    offset = 0;

    if (m <= 0) {
      return 0;
    }
  }

  /* Peel off the trailing rows that sit below the diagonal. */
  if (m > n - offset) {
#ifdef LOWER
    GEMM_KERNEL(m - n + offset, n, k,
                alpha_r,
#ifdef COMPLEX
                alpha_i,
#endif
                a + (n - offset) * k * COMPSIZE,
                b,
                c + (n - offset)     * COMPSIZE, ldc);
#endif
    m = n + offset;

    if (m <= 0) {
      return 0;
    }
  }

  for (diag = 0; diag < n; diag += GEMM_UNROLL_MN) {

    int mm, nn;
    FLOAT *a_blk, *b_blk;

    /* mm: rows above the diagonal block; nn: width of the block. */
    mm = (diag/GEMM_UNROLL_MN) * GEMM_UNROLL_MN;
    nn = MIN(GEMM_UNROLL_MN, n - diag);

    a_blk = a + diag * k * COMPSIZE;
    b_blk = b + diag * k * COMPSIZE;

#ifndef LOWER
    /* Rows above the diagonal block. */
    GEMM_KERNEL(mm, nn, k,
                alpha_r,
#ifdef COMPLEX
                alpha_i,
#endif
                a, b_blk, c + diag * ldc * COMPSIZE, ldc);
#endif

    /* Zero the local tile, then compute the diagonal block into it. */
    GEMM_BETA(nn, nn, 0, ZERO,
#ifdef COMPLEX
              ZERO,
#endif
              NULL, 0, NULL, 0, tile, nn);

    GEMM_KERNEL(nn, nn, k,
                alpha_r,
#ifdef COMPLEX
                alpha_i,
#endif
                a_blk, b_blk, tile, nn);

    /* Add the selected triangle of the local tile into c, column by column. */
    for (col = 0; col < nn; col ++) {
      FLOAT *dst = c + (diag + (diag + col) * ldc) * COMPSIZE;
      FLOAT *src = tile + col * nn * COMPSIZE;
      BLASLONG first, last;

#ifndef LOWER
      first = 0;
      last  = col + 1;
#else
      first = col;
      last  = nn;
#endif

      for (row = first; row < last; row ++) {
#ifndef COMPLEX
        dst[row] += src[row];
#else
        dst[row * 2 + 0] += src[row * 2 + 0];
        dst[row * 2 + 1] += src[row * 2 + 1];
#endif
      }
    }

#ifdef LOWER
    /* Rows below the diagonal block. */
    GEMM_KERNEL(m - mm - nn, nn, k,
                alpha_r,
#ifdef COMPLEX
                alpha_i,
#endif
                a + (mm + nn) * k * COMPSIZE, b_blk,
                c + (mm + nn + diag * ldc) * COMPSIZE, ldc);
#endif
  }

  return 0;
}