/*
 * Blocked driver that updates the m x n matrix b in place from the matrix a.
 * b is walked in column panels of at most GEMM_R columns; operands are packed
 * into the caller-supplied scratch buffers sa and sb before each kernel call.
 *
 * NOTE(review): the TRSM_* copy/kernel calls on the diagonal blocks of a,
 * together with GEMM_KERNEL calls on the off-diagonal blocks, suggest this is
 * the "triangular matrix on the right" solve driver.  The concrete routine is
 * selected by build macros (CNAME, UPPER, TRANSA, COMPLEX) that are not
 * visible here -- confirm.
 *
 *   args           supplies m, n, a, lda, b, ldb and beta.
 *   range_m        optional {from, to} pair; when non-NULL only rows
 *                  [from, to) of b are processed.
 *   range_n, dummy not used by this function.
 *   sa, sb         scratch buffers handed to the copy and kernel routines.
 *
 * Returns 0 on every path.
 */
int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG dummy) {

  BLASLONG m   = args -> m;
  BLASLONG n   = args -> n;
  BLASLONG lda = args -> lda;
  BLASLONG ldb = args -> ldb;

  FLOAT *a    = (FLOAT *)args -> a;
  FLOAT *b    = (FLOAT *)args -> b;
  FLOAT *beta = (FLOAT *)args -> beta;

  BLASLONG js, ls, is, jjs;
  BLASLONG j_len, l_len, i_len, jj_len;
  FLOAT *packed;

  /* Narrow the work to the requested row range of b. */
  if (range_m) {
    m  = range_m[1] - range_m[0];
    b += range_m[0] * COMPSIZE;
  }

  /* GEMM_BETA is applied to b unless beta is exactly one; a zero beta ends the call. */
  if (beta) {
#ifndef COMPLEX
    if (beta[0] != ONE) {
      GEMM_BETA(m, n, 0, beta[0], NULL, 0, NULL, 0, b, ldb);
    }
    if (beta[0] == ZERO) return 0;
#else
    if ((beta[0] != ONE) || (beta[1] != ZERO)) {
      GEMM_BETA(m, n, 0, beta[0], beta[1], NULL, 0, NULL, 0, b, ldb);
    }
    if ((beta[0] == ZERO) && (beta[1] == ZERO)) return 0;
#endif
  }

#if (defined(UPPER) && !defined(TRANSA)) || (!defined(UPPER) && defined(TRANSA))

  BLASLONG rest;

  /* Column panels are visited left to right. */
  for (js = 0; js < n; js += GEMM_R) {
    j_len = (n - js > GEMM_R) ? GEMM_R : n - js;

    /* Columns [0, js) of b are combined with blocks of a into the current panel. */
    for (ls = 0; ls < js; ls += GEMM_Q) {
      l_len = (js - ls > GEMM_Q) ? GEMM_Q : js - ls;
      i_len = (m > GEMM_P) ? GEMM_P : m;

      GEMM_ITCOPY(l_len, i_len, b + (ls * ldb) * COMPSIZE, ldb, sa);

      for (jjs = js; jjs < js + j_len; jjs += jj_len) {
        jj_len = j_len + js - jjs;
        if (jj_len > GEMM_UNROLL_N) jj_len = GEMM_UNROLL_N;

        /* Slot of sb that holds the packed block of a for these columns. */
        packed = sb + l_len * (jjs - js) * COMPSIZE;

#ifndef TRANSA
        GEMM_ONCOPY(l_len, jj_len, a + (ls + jjs * lda) * COMPSIZE, lda, packed);
#else
        GEMM_OTCOPY(l_len, jj_len, a + (jjs + ls * lda) * COMPSIZE, lda, packed);
#endif

        GEMM_KERNEL(i_len, jj_len, l_len, dm1,
#ifdef COMPLEX
                    ZERO,
#endif
                    sa, packed,
                    b + (jjs * ldb) * COMPSIZE, ldb);
      }

      /* Remaining row blocks reuse the whole packed sb panel. */
      for (is = i_len; is < m; is += GEMM_P) {
        i_len = (m - is > GEMM_P) ? GEMM_P : m - is;

        GEMM_ITCOPY(l_len, i_len, b + (is + ls * ldb) * COMPSIZE, ldb, sa);

        GEMM_KERNEL(i_len, j_len, l_len, dm1,
#ifdef COMPLEX
                    ZERO,
#endif
                    sa, sb, b + (is + js * ldb) * COMPSIZE, ldb);
      }
    }

    /* Diagonal blocks of a inside the current panel. */
    for (ls = js; ls < js + j_len; ls += GEMM_Q) {
      l_len = (js + j_len - ls > GEMM_Q) ? GEMM_Q : js + j_len - ls;
      i_len = (m > GEMM_P) ? GEMM_P : m;

      /* Number of panel columns that lie after this diagonal block. */
      rest = j_len - l_len - ls + js;

      GEMM_ITCOPY(l_len, i_len, b + (ls * ldb) * COMPSIZE, ldb, sa);

#ifndef TRANSA
      TRSM_OUNCOPY(l_len, l_len, a + (ls + ls * lda) * COMPSIZE, lda, 0, sb);
#else
      TRSM_OLTCOPY(l_len, l_len, a + (ls + ls * lda) * COMPSIZE, lda, 0, sb);
#endif

      TRSM_KERNEL(i_len, l_len, l_len, dm1,
#ifdef COMPLEX
                  ZERO,
#endif
                  sa, sb, b + (ls * ldb) * COMPSIZE, ldb, 0);

      for (jjs = 0; jjs < rest; jjs += jj_len) {
        jj_len = rest - jjs;
        if (jj_len > GEMM_UNROLL_N) jj_len = GEMM_UNROLL_N;

        /* Packed after the l_len x l_len triangular block already in sb. */
        packed = sb + l_len * (l_len + jjs) * COMPSIZE;

#ifndef TRANSA
        GEMM_ONCOPY (l_len, jj_len, a + (ls + (ls + l_len + jjs) * lda) * COMPSIZE, lda, packed);
#else
        GEMM_OTCOPY (l_len, jj_len, a + ((ls + l_len + jjs) + ls * lda) * COMPSIZE, lda, packed);
#endif

        GEMM_KERNEL(i_len, jj_len, l_len, dm1,
#ifdef COMPLEX
                    ZERO,
#endif
                    sa, packed,
                    b + (l_len + ls + jjs) * ldb * COMPSIZE, ldb);
      }

      for (is = i_len; is < m; is += GEMM_P) {
        i_len = (m - is > GEMM_P) ? GEMM_P : m - is;

        GEMM_ITCOPY(l_len, i_len, b + (is + ls * ldb) * COMPSIZE, ldb, sa);

        TRSM_KERNEL(i_len, l_len, l_len, dm1,
#ifdef COMPLEX
                    ZERO,
#endif
                    sa, sb, b + (is + ls * ldb) * COMPSIZE, ldb, 0);

        GEMM_KERNEL(i_len, rest, l_len, dm1,
#ifdef COMPLEX
                    ZERO,
#endif
                    sa, sb + l_len * l_len * COMPSIZE,
                    b + (is + (l_len + ls) * ldb) * COMPSIZE, ldb);
      }
    }
  }

#else
  BLASLONG start_ls;
  BLASLONG before;
  FLOAT *tri;

  /* Column panels are visited right to left; the panel is [js - j_len, js). */
  for (js = n; js > 0; js -= GEMM_R) {
    j_len = (js > GEMM_R) ? GEMM_R : js;

    /* Columns [js, n) of b are combined with blocks of a into the current panel. */
    for (ls = js; ls < n; ls += GEMM_Q) {
      l_len = (n - ls > GEMM_Q) ? GEMM_Q : n - ls;
      i_len = (m > GEMM_P) ? GEMM_P : m;

      GEMM_ITCOPY(l_len, i_len, b + (ls * ldb) * COMPSIZE, ldb, sa);

      for (jjs = js; jjs < js + j_len; jjs += jj_len) {
        jj_len = j_len + js - jjs;
        if (jj_len > GEMM_UNROLL_N) jj_len = GEMM_UNROLL_N;

        packed = sb + l_len * (jjs - js) * COMPSIZE;

#ifndef TRANSA
        GEMM_ONCOPY(l_len, jj_len, a + (ls + (jjs - j_len) * lda) * COMPSIZE, lda, packed);
#else
        GEMM_OTCOPY(l_len, jj_len, a + ((jjs - j_len) + ls * lda) * COMPSIZE, lda, packed);
#endif

        GEMM_KERNEL(i_len, jj_len, l_len, dm1,
#ifdef COMPLEX
                    ZERO,
#endif
                    sa, packed,
                    b + (jjs - j_len) * ldb * COMPSIZE, ldb);
      }

      for (is = i_len; is < m; is += GEMM_P) {
        i_len = (m - is > GEMM_P) ? GEMM_P : m - is;

        GEMM_ITCOPY(l_len, i_len, b + (is + ls * ldb) * COMPSIZE, ldb, sa);

        GEMM_KERNEL(i_len, j_len, l_len, dm1,
#ifdef COMPLEX
                    ZERO,
#endif
                    sa, sb, b + (is + (js - j_len) * ldb) * COMPSIZE, ldb);
      }
    }

    /* Find the last GEMM_Q-aligned block start inside the panel. */
    for (start_ls = js - j_len; start_ls + GEMM_Q < js; start_ls += GEMM_Q) {
      /* empty */
    }

    /* Diagonal blocks of a inside the panel, from the last one backwards. */
    for (ls = start_ls; ls >= js - j_len; ls -= GEMM_Q) {
      l_len = (js - ls > GEMM_Q) ? GEMM_Q : js - ls;
      i_len = (m > GEMM_P) ? GEMM_P : m;

      /* Number of panel columns that lie before this diagonal block. */
      before = j_len - js + ls;

      /* The triangular block is packed behind the `before` rectangular columns. */
      tri = sb + l_len * before * COMPSIZE;

      GEMM_ITCOPY(l_len, i_len, b + (ls * ldb) * COMPSIZE, ldb, sa);

#ifndef TRANSA
      TRSM_OLNCOPY(l_len, l_len, a + (ls + ls * lda) * COMPSIZE, lda, 0, tri);
#else
      TRSM_OUTCOPY(l_len, l_len, a + (ls + ls * lda) * COMPSIZE, lda, 0, tri);
#endif

      TRSM_KERNEL(i_len, l_len, l_len, dm1,
#ifdef COMPLEX
                  ZERO,
#endif
                  sa, tri,
                  b + (ls * ldb) * COMPSIZE, ldb, 0);

      for (jjs = 0; jjs < before; jjs += jj_len) {
        jj_len = before - jjs;
        if (jj_len > GEMM_UNROLL_N) jj_len = GEMM_UNROLL_N;

        packed = sb + l_len * jjs * COMPSIZE;

#ifndef TRANSA
        GEMM_ONCOPY (l_len, jj_len, a + (ls + (js - j_len + jjs) * lda) * COMPSIZE, lda, packed);
#else
        GEMM_OTCOPY (l_len, jj_len, a + ((js - j_len + jjs) + ls * lda) * COMPSIZE, lda, packed);
#endif

        GEMM_KERNEL(i_len, jj_len, l_len, dm1,
#ifdef COMPLEX
                    ZERO,
#endif
                    sa, packed,
                    b + (js - j_len + jjs) * ldb * COMPSIZE, ldb);
      }

      for (is = i_len; is < m; is += GEMM_P) {
        i_len = (m - is > GEMM_P) ? GEMM_P : m - is;

        GEMM_ITCOPY(l_len, i_len, b + (is + ls * ldb) * COMPSIZE, ldb, sa);

        TRSM_KERNEL(i_len, l_len, l_len, dm1,
#ifdef COMPLEX
                    ZERO,
#endif
                    sa, tri,
                    b + (is + ls * ldb) * COMPSIZE, ldb, 0);

        GEMM_KERNEL(i_len, before, l_len, dm1,
#ifdef COMPLEX
                    ZERO,
#endif
                    sa, sb,
                    b + (is + (js - j_len) * ldb) * COMPSIZE, ldb);
      }
    }
  }

#endif

  return 0;
}
blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) { BLASLONG n, lda; FLOAT *a; BLASLONG info; BLASLONG bk, j, blocking; BLASLONG is, min_i; BLASLONG js, min_j; BLASLONG range_N[2]; FLOAT *sb2 = (FLOAT *)((((BLASLONG)sb + GEMM_PQ * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B); #ifdef SHARED_ARRAY FLOAT *aa; #endif n = args -> n; a = (FLOAT *)args -> a; lda = args -> lda; if (range_n) { n = range_n[1] - range_n[0]; a += range_n[0] * (lda + 1) * COMPSIZE; } if (n <= DTB_ENTRIES / 2) { info = POTF2_L(args, NULL, range_n, sa, sb, 0); return info; } blocking = GEMM_Q; if (n <= 4 * GEMM_Q) blocking = n / 4; for (j = 0; j < n; j += blocking) { bk = n - j; if (bk > blocking) bk = blocking; if (!range_n) { range_N[0] = j; range_N[1] = j + bk; } else { range_N[0] = range_n[0] + j; range_N[1] = range_n[0] + j + bk; } info = CNAME(args, NULL, range_N, sa, sb, 0); if (info) return info + j; if (n - j - bk > 0) { TRSM_OLTCOPY(bk, bk, a + (j + j * lda) * COMPSIZE, lda, 0, sb); /* First tile */ min_j = n - j - bk; if (min_j > REAL_GEMM_R) min_j = REAL_GEMM_R; for (is = j + bk; is < n; is += GEMM_P) { min_i = n - is; if (min_i > GEMM_P) min_i = GEMM_P; #ifdef SHARED_ARRAY if (is < j + bk + min_j) { aa = sb2 + bk * (is - j - bk) * COMPSIZE; } else { aa = sa; } GEMM_ITCOPY(bk, min_i, a + (is + j * lda) * COMPSIZE, lda, aa); TRSM_KERNEL(min_i, bk, bk, dm1, #ifdef COMPLEX ZERO, #endif aa, sb, a + (is + j * lda) * COMPSIZE, lda, 0); SYRK_KERNEL_L(min_i, min_j, bk, dm1, aa, sb2, a + (is + (j + bk) * lda) * COMPSIZE, lda, is - j - bk); #else GEMM_ITCOPY(bk, min_i, a + (is + j * lda) * COMPSIZE, lda, sa); TRSM_KERNEL(min_i, bk, bk, dm1, #ifdef COMPLEX ZERO, #endif sa, sb, a + (is + j * lda) * COMPSIZE, lda, 0); if (is < j + bk + min_j) { GEMM_OTCOPY(bk, min_i, a + (is + j * lda) * COMPSIZE, lda, sb2 + bk * (is - j - bk) * COMPSIZE); } SYRK_KERNEL_L(min_i, min_j, bk, dm1, sa, sb2, a + (is + (j + bk) * lda) * 
COMPSIZE, lda, is - j - bk); #endif } for(js = j + bk + min_j; js < n; js += REAL_GEMM_R){ min_j = n - js; if (min_j > REAL_GEMM_R) min_j = REAL_GEMM_R; GEMM_OTCOPY(bk, min_j, a + (js + j * lda) * COMPSIZE, lda, sb2); for (is = js; is < n; is += GEMM_P) { min_i = n - is; if (min_i > GEMM_P) min_i = GEMM_P; #ifdef SHARED_ARRAY if (is + min_i < js + min_j) { aa = sb2 + bk * (is - js) * COMPSIZE; } else { GEMM_ITCOPY(bk, min_i, a + (is + j * lda) * COMPSIZE, lda, sa); aa = sa; } SYRK_KERNEL_L(min_i, min_j, bk, dm1, aa, sb2, a + (is + js * lda) * COMPSIZE, lda, is - js); #else GEMM_ITCOPY(bk, min_i, a + (is + j * lda) * COMPSIZE, lda, sa); SYRK_KERNEL_L(min_i, min_j, bk, dm1, sa, sb2, a + (is + js * lda) * COMPSIZE, lda, - is + js); #endif } } } } return 0; }
/*
 * Blocked driver that updates the m x n matrix b in place from the matrix a.
 * b is walked in column panels of at most GEMM_R columns, and within each
 * panel in row blocks of at most GEMM_Q rows that line up with the diagonal
 * blocks of a.  Operands are packed into the scratch buffers sa (blocks of a)
 * and sb (blocks of b).
 *
 * NOTE(review): the TRSM_I*COPY calls on the diagonal blocks of a suggest
 * this is the "triangular matrix on the left" solve driver; the concrete
 * routine depends on build macros (CNAME, UPPER, TRANSA, COMPLEX) that are
 * not visible here -- confirm.
 *
 *   args           supplies m, n, a, lda, b, ldb and beta.
 *   range_n        optional {from, to} pair; when non-NULL only columns
 *                  [from, to) of b are processed.
 *   range_m, dummy not used by this function.
 *   sa, sb         scratch buffers handed to the copy and kernel routines.
 *
 * Returns 0 on every path.
 */
int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG dummy) {

  BLASLONG m   = args -> m;
  BLASLONG n   = args -> n;
  BLASLONG lda = args -> lda;
  BLASLONG ldb = args -> ldb;

  FLOAT *a    = (FLOAT *)args -> a;
  FLOAT *b    = (FLOAT *)args -> b;
  FLOAT *beta = (FLOAT *)args -> beta;

  BLASLONG js, ls, is, jjs;
  BLASLONG j_len, l_len, i_len, jj_len;
  FLOAT *packed;

  /* Narrow the work to the requested column range of b. */
  if (range_n) {
    n  = range_n[1] - range_n[0];
    b += range_n[0] * ldb * COMPSIZE;
  }

  /* GEMM_BETA is applied to b unless beta is exactly one; a zero beta ends the call. */
  if (beta) {
#ifndef COMPLEX
    if (beta[0] != ONE) {
      GEMM_BETA(m, n, 0, beta[0], NULL, 0, NULL, 0, b, ldb);
    }
    if (beta[0] == ZERO) return 0;
#else
    if ((beta[0] != ONE) || (beta[1] != ZERO)) {
      GEMM_BETA(m, n, 0, beta[0], beta[1], NULL, 0, NULL, 0, b, ldb);
    }
    if ((beta[0] == ZERO) && (beta[1] == ZERO)) return 0;
#endif
  }

  for (js = 0; js < n; js += GEMM_R) {
    j_len = (n - js > GEMM_R) ? GEMM_R : n - js;

#if (!defined(UPPER) && !defined(TRANSA)) || (defined(UPPER) && defined(TRANSA))

    /* Diagonal blocks of a are visited top to bottom. */
    for (ls = 0; ls < m; ls += GEMM_Q) {
      l_len = (m - ls > GEMM_Q) ? GEMM_Q : m - ls;
      i_len = (l_len > GEMM_P) ? GEMM_P : l_len;

#ifndef TRANSA
      TRSM_ILTCOPY(l_len, i_len, a + (ls + ls * lda) * COMPSIZE, lda, 0, sa);
#else
      TRSM_IUNCOPY(l_len, i_len, a + (ls + ls * lda) * COMPSIZE, lda, 0, sa);
#endif

      /* Pack the panel of b slice by slice and run the first row block on each slice. */
      for (jjs = js; jjs < js + j_len; jjs += jj_len) {
        jj_len = j_len + js - jjs;
        if (jj_len > GEMM_UNROLL_N) jj_len = GEMM_UNROLL_N;

        packed = sb + l_len * (jjs - js) * COMPSIZE;

        GEMM_ONCOPY(l_len, jj_len, b + (ls + jjs * ldb) * COMPSIZE, ldb, packed);

        TRSM_KERNEL(i_len, jj_len, l_len, dm1,
#ifdef COMPLEX
                    ZERO,
#endif
                    sa, packed,
                    b + (ls + jjs * ldb) * COMPSIZE, ldb, 0);
      }

      /* Remaining row blocks inside the diagonal block. */
      for (is = ls + i_len; is < ls + l_len; is += GEMM_P) {
        i_len = (ls + l_len - is > GEMM_P) ? GEMM_P : ls + l_len - is;

#ifndef TRANSA
        TRSM_ILTCOPY(l_len, i_len, a + (is + ls * lda) * COMPSIZE, lda, is - ls, sa);
#else
        TRSM_IUNCOPY(l_len, i_len, a + (ls + is * lda) * COMPSIZE, lda, is - ls, sa);
#endif

        TRSM_KERNEL(i_len, j_len, l_len, dm1,
#ifdef COMPLEX
                    ZERO,
#endif
                    sa, sb, b + (is + js * ldb) * COMPSIZE, ldb, is - ls);
      }

      /* Rows after the diagonal block are handled with GEMM_KERNEL. */
      for (is = ls + l_len; is < m; is += GEMM_P) {
        i_len = (m - is > GEMM_P) ? GEMM_P : m - is;

#ifndef TRANSA
        GEMM_ITCOPY(l_len, i_len, a + (is + ls * lda) * COMPSIZE, lda, sa);
#else
        GEMM_INCOPY(l_len, i_len, a + (ls + is * lda) * COMPSIZE, lda, sa);
#endif

        GEMM_KERNEL(i_len, j_len, l_len, dm1,
#ifdef COMPLEX
                    ZERO,
#endif
                    sa, sb, b + (is + js * ldb) * COMPSIZE, ldb);
      }
    }

#else
    BLASLONG start_is;
    BLASLONG l_base;

    /* Diagonal blocks of a are visited bottom to top; the block is [l_base, ls). */
    for (ls = m; ls > 0; ls -= GEMM_Q) {
      l_len  = (ls > GEMM_Q) ? GEMM_Q : ls;
      l_base = ls - l_len;

      /* Find the last GEMM_P-aligned row block start inside the diagonal block. */
      for (start_is = l_base; start_is + GEMM_P < ls; start_is += GEMM_P) {
        /* empty */
      }

      i_len = (ls - start_is > GEMM_P) ? GEMM_P : ls - start_is;

#ifndef TRANSA
      TRSM_IUTCOPY(l_len, i_len, a + (start_is + l_base * lda) * COMPSIZE, lda, start_is - l_base, sa);
#else
      TRSM_ILNCOPY(l_len, i_len, a + (l_base + start_is * lda) * COMPSIZE, lda, start_is - l_base, sa);
#endif

      /* Pack the panel of b slice by slice and run the last row block on each slice. */
      for (jjs = js; jjs < js + j_len; jjs += jj_len) {
        jj_len = j_len + js - jjs;
        if (jj_len > GEMM_UNROLL_N) jj_len = GEMM_UNROLL_N;

        packed = sb + l_len * (jjs - js) * COMPSIZE;

        GEMM_ONCOPY(l_len, jj_len, b + (l_base + jjs * ldb) * COMPSIZE, ldb, packed);

        TRSM_KERNEL(i_len, jj_len, l_len, dm1,
#ifdef COMPLEX
                    ZERO,
#endif
                    sa, packed,
                    b + (start_is + jjs * ldb) * COMPSIZE, ldb, start_is - l_base);
      }

      /* Earlier row blocks of the diagonal block, walking upwards. */
      for (is = start_is - GEMM_P; is >= l_base; is -= GEMM_P) {
        i_len = (ls - is > GEMM_P) ? GEMM_P : ls - is;

#ifndef TRANSA
        TRSM_IUTCOPY(l_len, i_len, a + (is + l_base * lda) * COMPSIZE, lda, is - l_base, sa);
#else
        TRSM_ILNCOPY(l_len, i_len, a + (l_base + is * lda) * COMPSIZE, lda, is - l_base, sa);
#endif

        TRSM_KERNEL(i_len, j_len, l_len, dm1,
#ifdef COMPLEX
                    ZERO,
#endif
                    sa, sb,
                    b + (is + js * ldb) * COMPSIZE, ldb,
                    is - l_base);
      }

      /* Rows before the diagonal block are handled with GEMM_KERNEL. */
      for (is = 0; is < l_base; is += GEMM_P) {
        i_len = (l_base - is > GEMM_P) ? GEMM_P : l_base - is;

#ifndef TRANSA
        GEMM_ITCOPY(l_len, i_len, a + (is + l_base * lda) * COMPSIZE, lda, sa);
#else
        GEMM_INCOPY(l_len, i_len, a + (l_base + is * lda) * COMPSIZE, lda, sa);
#endif

        GEMM_KERNEL(i_len, j_len, l_len, dm1,
#ifdef COMPLEX
                    ZERO,
#endif
                    sa, sb, b + (is + js * ldb) * COMPSIZE, ldb);
      }
    }

#endif
  }

  return 0;
}
blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) { BLASLONG n, lda; FLOAT *a; BLASLONG info; BLASLONG bk, blocking; BLASLONG is, min_i; BLASLONG jjs, min_jj; BLASLONG range_N[2]; BLASLONG j, js, min_j; #ifdef SHARED_ARRAY FLOAT *aa; #endif FLOAT *sb2 = (FLOAT *)((((BLASLONG)sb + GEMM_PQ * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B); n = args -> n; a = (FLOAT *)args -> a; lda = args -> lda; if (range_n) { n = range_n[1] - range_n[0]; a += range_n[0] * (lda + 1) * COMPSIZE; } if (n <= DTB_ENTRIES / 2) { info = POTF2_U(args, NULL, range_n, sa, sb, 0); return info; } blocking = GEMM_Q; if (n <= 4 * GEMM_Q) blocking = (n + 3) / 4; for (j = 0; j < n; j += blocking) { bk = n - j; if (bk > blocking) bk = blocking; if (!range_n) { range_N[0] = j; range_N[1] = j + bk; } else { range_N[0] = range_n[0] + j; range_N[1] = range_n[0] + j + bk; } info = CNAME(args, NULL, range_N, sa, sb, 0); if (info) return info + j; if (n - j - bk > 0) { TRSM_IUNCOPY(bk, bk, a + (j + j * lda) * COMPSIZE, lda, 0, sb); for(js = j + bk; js < n; js += REAL_GEMM_R) { min_j = n - js; if (min_j > REAL_GEMM_R) min_j = REAL_GEMM_R; for(jjs = js; jjs < js + min_j; jjs += GEMM_UNROLL_N){ min_jj = min_j + js - jjs; if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; GEMM_ONCOPY(bk, min_jj, a + (j + jjs * lda) * COMPSIZE, lda, sb2 + bk * (jjs - js) * COMPSIZE); for (is = 0; is < bk; is += GEMM_P) { min_i = bk - is; if (min_i > GEMM_P) min_i = GEMM_P; TRSM_KERNEL (min_i, min_jj, bk, dm1, #ifdef COMPLEX ZERO, #endif sb + bk * is * COMPSIZE, sb2 + bk * (jjs - js) * COMPSIZE, a + (j + is + jjs * lda) * COMPSIZE, lda, is); } } for (is = j + bk; is < js + min_j; is += min_i) { min_i = js + min_j - is; if (min_i >= GEMM_P * 2) { min_i = GEMM_P; } else if (min_i > GEMM_P) { min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1); } #ifdef SHARED_ARRAY if ((is >= js) && (is + min_i <= js + min_j)) { aa = sb2 + bk * (is - js) 
* COMPSIZE; } else { GEMM_INCOPY(bk, min_i, a + (j + is * lda) * COMPSIZE, lda, sa); aa = sa; } #else GEMM_INCOPY(bk, min_i, a + (j + is * lda) * COMPSIZE, lda, sa); #endif SYRK_KERNEL_U(min_i, min_j, bk, dm1, SA, sb2, a + (is + js * lda) * COMPSIZE, lda, is - js); } } } } return 0; }
/*
 * Per-thread worker.  Thread `mypos` owns the index range
 * [range_n[mypos], range_n[mypos + 1]) and works in three stages:
 *
 *   1. Pack its own range in up to DIVIDE_RATE chunks, run TRSM_KERNEL on
 *      each chunk against the k x k block packed into sb, and publish the
 *      chunk buffer address in job[mypos].working[i][...] for the threads
 *      that will read it.
 *   2. For each row block of its own range, run KERNEL_OPERATION against the
 *      chunk buffers published by this thread and by its partner threads
 *      (threads mypos..nthreads-1 without LOWER, mypos..0 with LOWER).  On
 *      the first row block it spins until a partner's buffer is published;
 *      on the last row block it clears the slot to release that buffer.
 *   3. Spin until every other thread has cleared the slots this thread
 *      published.
 *
 * K, A, C, LDA, S, KERNEL_OPERATION, ICOPY_OPERATION and OCOPY_OPERATION are
 * macros defined outside this function.
 *
 *   args     shared call arguments; args->common holds the job_t array.
 *   range_m  not used by this function.
 *   range_n  partition boundaries, indexed by thread position.
 *   sa, sb   per-thread scratch buffers.
 *   mypos    position of this thread.
 *
 * Always returns 0.
 */
static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos){

  FLOAT *buffer[DIVIDE_RATE];
  job_t *job = (job_t *)args -> common;

  BLASLONG k   = K;
  FLOAT   *a   = (FLOAT *)A;
  FLOAT   *c   = (FLOAT *)C;
  BLASLONG lda = LDA;

  FLOAT *alpha = (FLOAT *)args -> alpha;

  BLASLONG m_from = range_n[mypos + 0];
  BLASLONG m_to   = range_n[mypos + 1];

  BLASLONG xxx, bufferside;
  BLASLONG jjs, min_jj, chunk_end;
  BLASLONG is, min_i, div_n;
  BLASLONG i, current;
  int first_pass;
  FLOAT *panel;

#if 0
  fprintf(stderr, "Thread[%ld] m_from : %ld m_to : %ld\n", mypos, m_from, m_to);
#endif

  /* Chunk width: the own range split DIVIDE_RATE ways, rounded up to GEMM_UNROLL_MN. */
  div_n = ((m_to - m_from + DIVIDE_RATE - 1) / DIVIDE_RATE
           + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);

  /* Chunk buffers start after the k x k block kept at the front of sb. */
  buffer[0] = (FLOAT *)((((BLASULONG)(sb + k * k * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B);
  for (i = 1; i < DIVIDE_RATE; i++) {
    buffer[i] = buffer[i - 1] + GEMM_Q * div_n * COMPSIZE;
  }

#ifndef LOWER
  TRSM_IUNCOPY(k, k, (FLOAT *)S, lda, 0, sb);
#else
  TRSM_OLTCOPY(k, k, (FLOAT *)S, lda, 0, sb);
#endif

  /* Stage 1: pack and process the own range chunk by chunk, then publish each chunk. */
  for (xxx = m_from, bufferside = 0; xxx < m_to; xxx += div_n, bufferside ++) {

    chunk_end = MIN(m_to, xxx + div_n);

    for (jjs = xxx; jjs < chunk_end; jjs += min_jj) {

      min_jj = chunk_end - jjs;
      panel  = buffer[bufferside] + k * (jjs - xxx) * COMPSIZE;

#ifndef LOWER
      if (min_jj > GEMM_UNROLL_MN) min_jj = GEMM_UNROLL_MN;

      OCOPY_OPERATION (k, min_jj, a, lda, 0, jjs, panel);

      TRSM_KERNEL     (k, min_jj, k, dm1,
#ifdef COMPLEX
                       ZERO,
#endif
                       sb,
                       panel,
                       a + jjs * lda * COMPSIZE, lda, 0);
#else
      if (min_jj > GEMM_P) min_jj = GEMM_P;

      ICOPY_OPERATION (k, min_jj, a, lda, 0, jjs, panel);

      TRSM_KERNEL     (min_jj, k, k, dm1,
#ifdef COMPLEX
                       ZERO,
#endif
                       panel,
                       sb,
                       a + jjs * COMPSIZE, lda, 0);
#endif
    }

    /* Publish the chunk address to every thread that reads from this one. */
#ifndef LOWER
    for (i = 0; i <= mypos; i++)
#else
    for (i = mypos; i < args -> nthreads; i++)
#endif
    {
      job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside];
    }

    WMB;
  }

  /*
   * Stage 2: one pass per row block of the own range.  The body always runs
   * at least once (do/while), so the slots are waited on and cleared even
   * when the own range is empty.
   */
  is         = m_from;
  first_pass = 1;

  do {
    min_i = m_to - is;

    /* Full GEMM_P blocks while plenty is left; split the tail into two rounded halves. */
    if (min_i >= GEMM_P * 2) {
      min_i = GEMM_P;
    } else if (min_i > GEMM_P) {
      min_i = ((min_i + 1) / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
    }

#ifndef LOWER
    ICOPY_OPERATION(k, min_i, a, lda, 0, is, sa);
#else
    OCOPY_OPERATION(k, min_i, a, lda, 0, is, sa);
#endif

#ifndef LOWER
    for (current = mypos; current < args -> nthreads; current ++)
#else
    for (current = mypos; current >= 0; current --)
#endif
    {
      div_n = ((range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE
               + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);

      for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) {

        /* thread has to wait (only needed the first time a partner's buffer is touched) */
        if (first_pass && (current != mypos)) {
          while (job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;};
        }

        KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), k, alpha,
                         sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside],
                         c, lda, is, xxx);

        /* Last row block: this thread no longer needs the buffer, so clear the slot. */
        if (is + min_i >= m_to) {
          job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
          WMB;
        }
      }
    }

    first_pass = 0;
    is += min_i;
  } while (is < m_to);

  /* Stage 3: wait until every other thread has cleared the slots published above. */
  for (i = 0; i < args -> nthreads; i++) {
    if (i == mypos) continue;

    for (xxx = 0; xxx < DIVIDE_RATE; xxx++) {
      while (job[mypos].working[i][CACHE_LINE_SIZE * xxx] ) {YIELDING;};
    }
  }

  return 0;
}