blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) { BLASLONG m, n, lda, offset; BLASLONG j, js, jmin, is, imin, jc, jcmin; BLASLONG jjs, min_jj; blasint *ipiv, iinfo, info; BLASLONG jb, mn, blocking; FLOAT *a, *offsetA, *offsetB; BLASLONG range_N[2]; FLOAT *sbb; m = args -> m; n = args -> n; a = (FLOAT *)args -> a; lda = args -> lda; ipiv = (blasint *)args -> c; offset = 0; if (range_n) { m -= range_n[0]; n = range_n[1] - range_n[0]; offset = range_n[0]; a += range_n[0] * (lda + 1) * COMPSIZE; } if (m <= 0 || n <= 0) return 0; mn = MIN(m, n); blocking = (mn / 2 + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1); if (blocking > GEMM_Q) blocking = GEMM_Q; if (blocking <= GEMM_UNROLL_N * 2) { info = GETF2(args, NULL, range_n, sa, sb, 0); return info; } sbb = (FLOAT *)((((long)(sb + blocking * blocking * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B); info = 0; for (j = 0; j < mn; j += blocking) { jb = mn - j; if (jb > blocking) jb = blocking; offsetA = a + j * lda * COMPSIZE; offsetB = a + (j + jb) * lda * COMPSIZE; range_N[0] = offset + j; range_N[1] = offset + j + jb; iinfo = CNAME(args, NULL, range_N, sa, sb, 0); if (iinfo && !info) info = iinfo + j; if (j + jb < n) { TRSM_ILTCOPY(jb, jb, offsetA + j * COMPSIZE, lda, 0, sb); for (js = j + jb; js < n; js += REAL_GEMM_R){ jmin = n - js; if (jmin > REAL_GEMM_R) jmin = REAL_GEMM_R; for (jjs = js; jjs < js + jmin; jjs += GEMM_UNROLL_N){ min_jj = js + jmin - jjs; if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; #if 0 LASWP_PLUS(min_jj, j + offset + 1, j + jb + offset, ZERO, #ifdef COMPLEX ZERO, #endif a + (- offset + jjs * lda) * COMPSIZE, lda, NULL, 0 , ipiv, 1); GEMM_ONCOPY (jb, min_jj, a + (j + jjs * lda) * COMPSIZE, lda, sbb + jb * (jjs - js) * COMPSIZE); #else LASWP_NCOPY(min_jj, j + offset + 1, j + jb + offset, a + (- offset + jjs * lda) * COMPSIZE, lda, ipiv, sbb + jb * (jjs - js) * COMPSIZE); #endif for (jc = 0; jc < jb; jc += GEMM_P) { jcmin = jb - 
jc; if (jcmin > GEMM_P) jcmin = GEMM_P; TRSM_KERNEL_LT(jcmin, min_jj, jb, dm1, #ifdef COMPLEX ZERO, #endif sb + jb * jc * COMPSIZE, sbb + jb * (jjs - js) * COMPSIZE, a + (j + jc + jjs * lda) * COMPSIZE, lda, jc); } } for (is = j + jb; is < m; is += GEMM_P){ imin = m - is; if (imin > GEMM_P) imin = GEMM_P; GEMM_ITCOPY (jb, imin, offsetA + is * COMPSIZE, lda, sa); GEMM_KERNEL_N(imin, jmin, jb, dm1, #ifdef COMPLEX ZERO, #endif sa, sbb, a + (is + js * lda) * COMPSIZE, lda); } } } } for (j = 0; j < mn; j += jb) { jb = MIN(mn - j, blocking); LASWP_PLUS(jb, j + jb + offset + 1, mn + offset, ZERO, #ifdef COMPLEX ZERO, #endif a - (offset - j * lda) * COMPSIZE, lda, NULL, 0 , ipiv, 1); } return info; }
/*
 * Blocked level-3 driver operating in place on args->b (the real symbol
 * name is supplied through CNAME).  The TRSM_* copy routines and kernels
 * suggest a triangular solve with the triangular matrix args->a applied
 * from the right -- NOTE(review): confirm against the build rule.
 *
 * args     supplies m, n, a, b, lda, ldb and the scalar pointer args->beta.
 * range_m  optional; restricts the work to rows [range_m[0], range_m[1])
 *          of b.
 * range_n  not read by this routine.
 * sa, sb   packing buffers: sa receives blocks of b, sb blocks of a.
 * dummy    not read by this routine.
 *
 * Always returns 0.
 */
int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG dummy) {

  BLASLONG m   = args -> m;
  BLASLONG n   = args -> n;
  FLOAT   *a   = (FLOAT *)args -> a;
  FLOAT   *b   = (FLOAT *)args -> b;
  BLASLONG lda = args -> lda;
  BLASLONG ldb = args -> ldb;
  FLOAT   *beta = (FLOAT *)args -> beta;

  BLASLONG jpos, jlen;          /* current column panel: start / width   */
  BLASLONG lpos, llen;          /* current block inside or beside panel  */
  BLASLONG ipos, ilen;          /* current row block: start / height     */
  BLASLONG jsub, jsub_len;      /* column strip handed to one kernel call */
  BLASLONG off_cnt;             /* panel columns outside the diagonal block */

  if (range_m) {
    m  = range_m[1] - range_m[0];
    b += range_m[0] * COMPSIZE;
  }

  if (beta) {
#ifndef COMPLEX
    if (beta[0] != ONE) {
      GEMM_BETA(m, n, 0, beta[0], NULL, 0, NULL, 0, b, ldb);
    }
    if (beta[0] == ZERO) {
      return 0;
    }
#else
    if ((beta[0] != ONE) || (beta[1] != ZERO)) {
      GEMM_BETA(m, n, 0, beta[0], beta[1], NULL, 0, NULL, 0, b, ldb);
    }
    if ((beta[0] == ZERO) && (beta[1] == ZERO)) {
      return 0;
    }
#endif
  }

#if (defined(UPPER) && !defined(TRANSA)) || (!defined(UPPER) && defined(TRANSA))

  /* Column panels are visited left to right. */
  for (jpos = 0; jpos < n; jpos += GEMM_R) {

    jlen = (n - jpos > GEMM_R) ? GEMM_R : n - jpos;

    /* Rank update of the panel with every block to its left. */
    for (lpos = 0; lpos < jpos; lpos += GEMM_Q) {

      llen = (jpos - lpos > GEMM_Q) ? GEMM_Q : jpos - lpos;
      ilen = (m > GEMM_P) ? GEMM_P : m;

      GEMM_ITCOPY(llen, ilen, b + (lpos * ldb) * COMPSIZE, ldb, sa);

      /* First row block: pack each strip of a and use it immediately. */
      for (jsub = jpos; jsub < jpos + jlen; jsub += jsub_len) {

	jsub_len = jlen + jpos - jsub;
	if (jsub_len > GEMM_UNROLL_N) jsub_len = GEMM_UNROLL_N;

#ifndef TRANSA
	GEMM_ONCOPY(llen, jsub_len, a + (lpos + jsub * lda) * COMPSIZE, lda,
		    sb + llen * (jsub - jpos) * COMPSIZE);
#else
	GEMM_OTCOPY(llen, jsub_len, a + (jsub + lpos * lda) * COMPSIZE, lda,
		    sb + llen * (jsub - jpos) * COMPSIZE);
#endif

	GEMM_KERNEL(ilen, jsub_len, llen, dm1,
#ifdef COMPLEX
		    ZERO,
#endif
		    sa, sb + llen * (jsub - jpos) * COMPSIZE,
		    b + (jsub * ldb) * COMPSIZE, ldb);
      }

      /* Remaining row blocks reuse the copy of a already held in sb. */
      for (ipos = ilen; ipos < m; ipos += GEMM_P) {

	ilen = (m - ipos > GEMM_P) ? GEMM_P : m - ipos;

	GEMM_ITCOPY(llen, ilen, b + (ipos + lpos * ldb) * COMPSIZE, ldb, sa);

	GEMM_KERNEL(ilen, jlen, llen, dm1,
#ifdef COMPLEX
		    ZERO,
#endif
		    sa, sb, b + (ipos + jpos * ldb) * COMPSIZE, ldb);
      }
    }

    /* Diagonal blocks of the panel, walking forwards. */
    for (lpos = jpos; lpos < jpos + jlen; lpos += GEMM_Q) {

      llen = (jpos + jlen - lpos > GEMM_Q) ? GEMM_Q : jpos + jlen - lpos;
      ilen = (m > GEMM_P) ? GEMM_P : m;

      /* Panel columns that lie to the right of this diagonal block. */
      off_cnt = jlen - llen - lpos + jpos;

      GEMM_ITCOPY(llen, ilen, b + (lpos * ldb) * COMPSIZE, ldb, sa);

#ifndef TRANSA
      TRSM_OUNCOPY(llen, llen, a + (lpos + lpos * lda) * COMPSIZE, lda, 0, sb);
#else
      TRSM_OLTCOPY(llen, llen, a + (lpos + lpos * lda) * COMPSIZE, lda, 0, sb);
#endif

      TRSM_KERNEL(ilen, llen, llen, dm1,
#ifdef COMPLEX
		  ZERO,
#endif
		  sa, sb, b + (lpos * ldb) * COMPSIZE, ldb, 0);

      /* The off-diagonal strips are packed behind the llen x llen block. */
      for (jsub = 0; jsub < off_cnt; jsub += jsub_len) {

	jsub_len = off_cnt - jsub;
	if (jsub_len > GEMM_UNROLL_N) jsub_len = GEMM_UNROLL_N;

#ifndef TRANSA
	GEMM_ONCOPY (llen, jsub_len, a + (lpos + (lpos + llen + jsub) * lda) * COMPSIZE, lda,
		     sb + llen * (llen + jsub) * COMPSIZE);
#else
	GEMM_OTCOPY (llen, jsub_len, a + ((lpos + llen + jsub) + lpos * lda) * COMPSIZE, lda,
		     sb + llen * (llen + jsub) * COMPSIZE);
#endif

	GEMM_KERNEL(ilen, jsub_len, llen, dm1,
#ifdef COMPLEX
		    ZERO,
#endif
		    sa, sb + llen * (llen + jsub) * COMPSIZE,
		    b + (llen + lpos + jsub) * ldb * COMPSIZE, ldb);
      }

      for (ipos = ilen; ipos < m; ipos += GEMM_P) {

	ilen = (m - ipos > GEMM_P) ? GEMM_P : m - ipos;

	GEMM_ITCOPY(llen, ilen, b + (ipos + lpos * ldb) * COMPSIZE, ldb, sa);

	TRSM_KERNEL(ilen, llen, llen, dm1,
#ifdef COMPLEX
		    ZERO,
#endif
		    sa, sb, b + (ipos + lpos * ldb) * COMPSIZE, ldb, 0);

	GEMM_KERNEL(ilen, off_cnt, llen, dm1,
#ifdef COMPLEX
		    ZERO,
#endif
		    sa, sb + llen * llen * COMPSIZE,
		    b + (ipos + (llen + lpos) * ldb) * COMPSIZE, ldb);
      }
    }
  }

#else

  /* Column panels are visited right to left; jpos is one past the panel. */
  for (jpos = n; jpos > 0; jpos -= GEMM_R) {

    BLASLONG jbase;             /* first column of the panel            */
    BLASLONG lstart;            /* start of the last GEMM_Q block in it */

    jlen  = (jpos > GEMM_R) ? GEMM_R : jpos;
    jbase = jpos - jlen;

    /* Rank update with every block from jpos up to n. */
    for (lpos = jpos; lpos < n; lpos += GEMM_Q) {

      llen = (n - lpos > GEMM_Q) ? GEMM_Q : n - lpos;
      ilen = (m > GEMM_P) ? GEMM_P : m;

      GEMM_ITCOPY(llen, ilen, b + (lpos * ldb) * COMPSIZE, ldb, sa);

      for (jsub = jpos; jsub < jpos + jlen; jsub += jsub_len) {

	jsub_len = jlen + jpos - jsub;
	if (jsub_len > GEMM_UNROLL_N) jsub_len = GEMM_UNROLL_N;

#ifndef TRANSA
	GEMM_ONCOPY(llen, jsub_len, a + (lpos + (jsub - jlen) * lda) * COMPSIZE, lda,
		    sb + llen * (jsub - jpos) * COMPSIZE);
#else
	GEMM_OTCOPY(llen, jsub_len, a + ((jsub - jlen) + lpos * lda) * COMPSIZE, lda,
		    sb + llen * (jsub - jpos) * COMPSIZE);
#endif

	GEMM_KERNEL(ilen, jsub_len, llen, dm1,
#ifdef COMPLEX
		    ZERO,
#endif
		    sa, sb + llen * (jsub - jpos) * COMPSIZE,
		    b + (jsub - jlen) * ldb * COMPSIZE, ldb);
      }

      for (ipos = ilen; ipos < m; ipos += GEMM_P) {

	ilen = (m - ipos > GEMM_P) ? GEMM_P : m - ipos;

	GEMM_ITCOPY(llen, ilen, b + (ipos + lpos * ldb) * COMPSIZE, ldb, sa);

	GEMM_KERNEL(ilen, jlen, llen, dm1,
#ifdef COMPLEX
		    ZERO,
#endif
		    sa, sb, b + (ipos + jbase * ldb) * COMPSIZE, ldb);
      }
    }

    /* Locate the last GEMM_Q-aligned block inside the panel ... */
    lstart = jbase;
    while (lstart + GEMM_Q < jpos) lstart += GEMM_Q;

    /* ... and walk the diagonal blocks backwards from there. */
    for (lpos = lstart; lpos >= jbase; lpos -= GEMM_Q) {

      llen = (jpos - lpos > GEMM_Q) ? GEMM_Q : jpos - lpos;
      ilen = (m > GEMM_P) ? GEMM_P : m;

      /* Panel columns that lie to the left of this diagonal block. */
      off_cnt = jlen - jpos + lpos;

      GEMM_ITCOPY(llen, ilen, b + (lpos * ldb) * COMPSIZE, ldb, sa);

      /* The diagonal block is packed behind the off-diagonal strips. */
#ifndef TRANSA
      TRSM_OLNCOPY(llen, llen, a + (lpos + lpos * lda) * COMPSIZE, lda,
		   0, sb + llen * off_cnt * COMPSIZE);
#else
      TRSM_OUTCOPY(llen, llen, a + (lpos + lpos * lda) * COMPSIZE, lda,
		   0, sb + llen * off_cnt * COMPSIZE);
#endif

      TRSM_KERNEL(ilen, llen, llen, dm1,
#ifdef COMPLEX
		  ZERO,
#endif
		  sa, sb + llen * off_cnt * COMPSIZE,
		  b + (lpos * ldb) * COMPSIZE, ldb, 0);

      for (jsub = 0; jsub < off_cnt; jsub += jsub_len) {

	jsub_len = off_cnt - jsub;
	if (jsub_len > GEMM_UNROLL_N) jsub_len = GEMM_UNROLL_N;

#ifndef TRANSA
	GEMM_ONCOPY (llen, jsub_len, a + (lpos + (jbase + jsub) * lda) * COMPSIZE, lda,
		     sb + llen * jsub * COMPSIZE);
#else
	GEMM_OTCOPY (llen, jsub_len, a + ((jbase + jsub) + lpos * lda) * COMPSIZE, lda,
		     sb + llen * jsub * COMPSIZE);
#endif

	GEMM_KERNEL(ilen, jsub_len, llen, dm1,
#ifdef COMPLEX
		    ZERO,
#endif
		    sa, sb + llen * jsub * COMPSIZE,
		    b + (jbase + jsub) * ldb * COMPSIZE, ldb);
      }

      for (ipos = ilen; ipos < m; ipos += GEMM_P) {

	ilen = (m - ipos > GEMM_P) ? GEMM_P : m - ipos;

	GEMM_ITCOPY(llen, ilen, b + (ipos + lpos * ldb) * COMPSIZE, ldb, sa);

	TRSM_KERNEL(ilen, llen, llen, dm1,
#ifdef COMPLEX
		    ZERO,
#endif
		    sa, sb + llen * off_cnt * COMPSIZE,
		    b + (ipos + lpos * ldb) * COMPSIZE, ldb, 0);

	GEMM_KERNEL(ilen, off_cnt, llen, dm1,
#ifdef COMPLEX
		    ZERO,
#endif
		    sa, sb, b + (ipos + jbase * ldb) * COMPSIZE, ldb);
      }
    }
  }

#endif

  return 0;
}
/*
 * Worker that updates the part of the matrix behind an already processed
 * k-wide panel.  All blocks are addressed relative to args->b:
 *   b  rows k.. of the panel columns      (packed into sa for the GEMM step)
 *   c  rows 0..k-1 of the trailing columns (row-swapped, then TRSM_KERNEL_LT)
 *   d  rows k.. of the trailing columns    (updated by GEMM_KERNEL_N)
 *
 * args     supplies m, n, k, lda, the pivot array (args->c), the flag
 *          array (args->d) and the global row offset, which is passed in
 *          args->ldb.  When args->a is NULL this worker packs the k x k
 *          panel block itself; otherwise args->a is used as the packed
 *          block.
 * range_m  not read by this routine.
 * range_n  optional; restricts the work to trailing columns
 *          [range_n[0], range_n[1]).
 * sa, sb   packing buffers.
 * mypos    index of this worker's flag slot; no flag is written when it is
 *          negative.
 */
static void inner_basic_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos){

  BLASLONG is, min_i;
  BLASLONG js, min_j;
  BLASLONG jjs, min_jj;

  BLASLONG m = args -> m;
  BLASLONG n = args -> n;
  BLASLONG k = args -> k;

  BLASLONG lda = args -> lda;
  BLASLONG off = args -> ldb;

  FLOAT *b = (FLOAT *)args -> b + (k          ) * COMPSIZE;
  FLOAT *c = (FLOAT *)args -> b + (    k * lda) * COMPSIZE;
  FLOAT *d = (FLOAT *)args -> b + (k + k * lda) * COMPSIZE;
  FLOAT *sbb = sb;

  volatile BLASLONG *flag = (volatile BLASLONG *)args -> d;

  blasint *ipiv = (blasint *)args -> c;

  if (range_n) {
    n  = range_n[1] - range_n[0];
    c += range_n[0] * lda * COMPSIZE;
    d += range_n[0] * lda * COMPSIZE;
  }

  if (args -> a == NULL) {
    TRSM_ILTCOPY(k, k, (FLOAT *)args -> b, lda, 0, sb);
    /*
     * The second packing area starts after the k x k block, rounded up
     * with GEMM_ALIGN.  The address goes through BLASLONG rather than
     * long: long is only 32 bits wide on LLP64 targets (e.g. 64-bit
     * Windows), so a long cast would truncate the pointer.  BLASLONG is
     * assumed to be pointer-sized -- TODO confirm.  GEMM_ALIGN is widened
     * before it is complemented so the mask keeps the upper address bits.
     */
    sbb = (FLOAT *)((((BLASLONG)(sb + k * k * COMPSIZE) + GEMM_ALIGN) & ~(BLASLONG)GEMM_ALIGN) + GEMM_OFFSET_B);
  } else {
    sb  = (FLOAT *)args -> a;
  }

  for (js = 0; js < n; js += REAL_GEMM_R) {
    min_j = n - js;
    if (min_j > REAL_GEMM_R) min_j = REAL_GEMM_R;

    for (jjs = js; jjs < js + min_j; jjs += GEMM_UNROLL_N){
      min_jj = js + min_j - jjs;
      if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;

      if (GEMM_UNROLL_N <= 8) {
	/* Row interchange and packing in a single call. */
	LASWP_NCOPY(min_jj, off + 1, off + k,
		    c + (- off + jjs * lda) * COMPSIZE, lda,
		    ipiv, sbb + k * (jjs - js) * COMPSIZE);
      } else {
	/* Row interchange in place, followed by a separate packing step. */
	LASWP_PLUS(min_jj, off + 1, off + k, ZERO,
#ifdef COMPLEX
		   ZERO,
#endif
		   c + (- off + jjs * lda) * COMPSIZE, lda, NULL, 0, ipiv, 1);

	GEMM_ONCOPY (k, min_jj, c + jjs * lda * COMPSIZE, lda, sbb + (jjs - js) * k * COMPSIZE);
      }

      for (is = 0; is < k; is += GEMM_P) {
	min_i = k - is;
	if (min_i > GEMM_P) min_i = GEMM_P;

	TRSM_KERNEL_LT(min_i, min_jj, k, dm1,
#ifdef COMPLEX
		       ZERO,
#endif
		       sb  + k * is * COMPSIZE,
		       sbb + (jjs - js) * k * COMPSIZE,
		       c   + (is + jjs * lda) * COMPSIZE, lda, is);
      }
    }

    /* Clear this worker's flag once the last column slab has been through
       the row-swap / TRSM stage. */
    if ((js + REAL_GEMM_R >= n) && (mypos >= 0)) flag[mypos * CACHE_LINE_SIZE] = 0;

    for (is = 0; is < m; is += GEMM_P){
      min_i = m - is;
      if (min_i > GEMM_P) min_i = GEMM_P;

      GEMM_ITCOPY (k, min_i, b + is * COMPSIZE, lda, sa);

      GEMM_KERNEL_N(min_i, min_j, k, dm1,
#ifdef COMPLEX
		    ZERO,
#endif
		    sa, sbb, d + (is + js * lda) * COMPSIZE, lda);
    }
  }
}
/*
 * Cooperative worker for the trailing update.  Each worker first prepares
 * its own slice of columns (row swaps, packing, TRSM_KERNEL_LT) in up to
 * DIVIDE_RATE chunks and publishes each packed chunk's address in
 * job[mypos].working[*][]; it then applies its own row range to every
 * worker's chunks through KERNEL_OPERATION.
 *
 * args     supplies k, lda, nthreads, the job table (args->common), the
 *          pivot array (args->c), the flag array (args->d) and the global
 *          row offset, which is passed in args->ldb.  When args->a is NULL
 *          this worker packs the k x k panel block itself; otherwise
 *          args->a is used as the packed block.
 * range_m  rows [range_m[0], range_m[1]) handled by this worker; read
 *          unconditionally, so it must not be NULL.
 * range_n  column boundaries of all workers; worker t owns
 *          [range_n[t], range_n[t + 1]).
 * sa, sb   packing buffers.
 * mypos    index of this worker.
 *
 * Always returns 0.
 */
static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos){

  job_t *job = (job_t *)args -> common;

  BLASLONG xxx, bufferside;

  FLOAT *buffer[DIVIDE_RATE];

  BLASLONG jjs, min_jj, div_n;

  BLASLONG i, current;
  BLASLONG is, min_i;

  BLASLONG m, n_from, n_to;
  BLASLONG k = args -> k;

  BLASLONG lda = args -> lda;
  BLASLONG off = args -> ldb;

  FLOAT *a = (FLOAT *)args -> b + (k          ) * COMPSIZE;
  FLOAT *b = (FLOAT *)args -> b + (    k * lda) * COMPSIZE;
  FLOAT *c = (FLOAT *)args -> b + (k + k * lda) * COMPSIZE;
  FLOAT *sbb= sb;

  blasint *ipiv = (blasint *)args -> c;

  volatile BLASLONG *flag = (volatile BLASLONG *)args -> d;

  if (args -> a == NULL) {
    TRSM_ILTCOPY(k, k, (FLOAT *)args -> b, lda, 0, sb);
    /*
     * The chunk buffers start after the k x k block, rounded up with
     * GEMM_ALIGN.  The address goes through BLASLONG rather than long:
     * long is only 32 bits wide on LLP64 targets (e.g. 64-bit Windows),
     * so a long cast would truncate the pointer.  This matches the
     * (BLASLONG) pointer cast used below when a buffer is published.
     * GEMM_ALIGN is widened before it is complemented so the mask keeps
     * the upper address bits.
     */
    sbb = (FLOAT *)((((BLASLONG)(sb + k * k * COMPSIZE) + GEMM_ALIGN) & ~(BLASLONG)GEMM_ALIGN) + GEMM_OFFSET_B);
  } else {
    sb  = (FLOAT *)args -> a;
  }

  m      = range_m[1] - range_m[0];
  n_from = range_n[mypos + 0];
  n_to   = range_n[mypos + 1];

  a     += range_m[0] * COMPSIZE;
  c     += range_m[0] * COMPSIZE;

  /* Width of one chunk of this worker's column slice. */
  div_n = (n_to - n_from + DIVIDE_RATE - 1) / DIVIDE_RATE;

  buffer[0] = sbb;

  for (i = 1; i < DIVIDE_RATE; i++) {
    buffer[i] = buffer[i - 1] + GEMM_Q * ((div_n + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1)) * COMPSIZE;
  }

  for (xxx = n_from, bufferside = 0; xxx < n_to; xxx += div_n, bufferside ++) {

    /* Spin until every worker's slot for this chunk is clear. */
    for (i = 0; i < args -> nthreads; i++)
      while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {};

    for(jjs = xxx; jjs < MIN(n_to, xxx + div_n); jjs += min_jj){
      min_jj = MIN(n_to, xxx + div_n) - jjs;
      if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;

      if (GEMM_UNROLL_N <= 8) {
	/* Row interchange and packing in a single call. */
	LASWP_NCOPY(min_jj, off + 1, off + k,
		    b + (- off + jjs * lda) * COMPSIZE, lda,
		    ipiv, buffer[bufferside] + (jjs - xxx) * k * COMPSIZE);
      } else {
	/* Row interchange in place, followed by a separate packing step. */
	LASWP_PLUS(min_jj, off + 1, off + k, ZERO,
#ifdef COMPLEX
		   ZERO,
#endif
		   b + (- off + jjs * lda) * COMPSIZE, lda, NULL, 0, ipiv, 1);

	GEMM_ONCOPY (k, min_jj, b + jjs * lda * COMPSIZE, lda,
		     buffer[bufferside] + (jjs - xxx) * k * COMPSIZE);
      }

      for (is = 0; is < k; is += GEMM_P) {
	min_i = k - is;
	if (min_i > GEMM_P) min_i = GEMM_P;

	TRSM_KERNEL_LT(min_i, min_jj, k, dm1,
#ifdef COMPLEX
		       ZERO,
#endif
		       sb + k * is * COMPSIZE,
		       buffer[bufferside] + (jjs - xxx) * k * COMPSIZE,
		       b   + (is + jjs * lda) * COMPSIZE, lda, is);
      }
    }

    /* Publish the packed chunk: a non-zero slot holds the buffer address. */
    for (i = 0; i < args -> nthreads; i++)
      job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside];

  }

  flag[mypos * CACHE_LINE_SIZE] = 0;

  /* With no rows to update the loop below never runs, so this worker's own
     slots are cleared here instead. */
  if (m == 0) {
    for (xxx = 0; xxx < DIVIDE_RATE; xxx++) {
      job[mypos].working[mypos][CACHE_LINE_SIZE * xxx] = 0;
    }
  }

  for(is = 0; is < m; is += min_i){
    min_i = m - is;
    if (min_i >= GEMM_P * 2) {
      min_i = GEMM_P;
    } else if (min_i > GEMM_P) {
      /* Split the remainder into two blocks, rounded up to GEMM_UNROLL_M. */
      min_i = ((min_i + 1) / 2 + GEMM_UNROLL_M - 1) & ~(GEMM_UNROLL_M - 1);
    }

    ICOPY_OPERATION(k, min_i, a, lda, 0, is, sa);

    /* Visit every worker's chunks, starting with our own. */
    current = mypos;

    do {

      div_n = (range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE;

      for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) {

	/* On the first row block, spin until the owner has published it. */
	if ((current != mypos) && (!is)) {
	  while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {};
	}

	KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), k,
			 sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside],
			 c, lda, is, xxx);

	/* After the last row block, clear the slot for this chunk. */
	if (is + min_i >= m) {
	  job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0;
	}
      }

      current ++;
      if (current >= args -> nthreads) current = 0;

    } while (current != mypos);
  }

  /* Spin until every slot of our own chunks has been cleared. */
  for (i = 0; i < args -> nthreads; i++) {
    for (xxx = 0; xxx < DIVIDE_RATE; xxx++) {
      while (job[mypos].working[i][CACHE_LINE_SIZE * xxx] ) {};
    }
  }

  return 0;
}
blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) { BLASLONG n, lda; FLOAT *a; BLASLONG info; BLASLONG bk, blocking; BLASLONG is, min_i; BLASLONG jjs, min_jj; BLASLONG range_N[2]; BLASLONG j, js, min_j; #ifdef SHARED_ARRAY FLOAT *aa; #endif FLOAT *sb2 = (FLOAT *)((((BLASLONG)sb + GEMM_PQ * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B); n = args -> n; a = (FLOAT *)args -> a; lda = args -> lda; if (range_n) { n = range_n[1] - range_n[0]; a += range_n[0] * (lda + 1) * COMPSIZE; } if (n <= DTB_ENTRIES / 2) { info = POTF2_U(args, NULL, range_n, sa, sb, 0); return info; } blocking = GEMM_Q; if (n <= 4 * GEMM_Q) blocking = (n + 3) / 4; for (j = 0; j < n; j += blocking) { bk = n - j; if (bk > blocking) bk = blocking; if (!range_n) { range_N[0] = j; range_N[1] = j + bk; } else { range_N[0] = range_n[0] + j; range_N[1] = range_n[0] + j + bk; } info = CNAME(args, NULL, range_N, sa, sb, 0); if (info) return info + j; if (n - j - bk > 0) { TRSM_IUNCOPY(bk, bk, a + (j + j * lda) * COMPSIZE, lda, 0, sb); for(js = j + bk; js < n; js += REAL_GEMM_R) { min_j = n - js; if (min_j > REAL_GEMM_R) min_j = REAL_GEMM_R; for(jjs = js; jjs < js + min_j; jjs += GEMM_UNROLL_N){ min_jj = min_j + js - jjs; if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; GEMM_ONCOPY(bk, min_jj, a + (j + jjs * lda) * COMPSIZE, lda, sb2 + bk * (jjs - js) * COMPSIZE); for (is = 0; is < bk; is += GEMM_P) { min_i = bk - is; if (min_i > GEMM_P) min_i = GEMM_P; TRSM_KERNEL (min_i, min_jj, bk, dm1, #ifdef COMPLEX ZERO, #endif sb + bk * is * COMPSIZE, sb2 + bk * (jjs - js) * COMPSIZE, a + (j + is + jjs * lda) * COMPSIZE, lda, is); } } for (is = j + bk; is < js + min_j; is += min_i) { min_i = js + min_j - is; if (min_i >= GEMM_P * 2) { min_i = GEMM_P; } else if (min_i > GEMM_P) { min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1); } #ifdef SHARED_ARRAY if ((is >= js) && (is + min_i <= js + min_j)) { aa = sb2 + bk * (is - js) 
* COMPSIZE; } else { GEMM_INCOPY(bk, min_i, a + (j + is * lda) * COMPSIZE, lda, sa); aa = sa; } #else GEMM_INCOPY(bk, min_i, a + (j + is * lda) * COMPSIZE, lda, sa); #endif SYRK_KERNEL_U(min_i, min_j, bk, dm1, SA, sb2, a + (is + js * lda) * COMPSIZE, lda, is - js); } } } } return 0; }
/*
 * Blocked level-3 driver operating in place on args->b (the real symbol
 * name is supplied through CNAME).  The TRSM_* copy routines and kernels
 * suggest a triangular solve with the triangular matrix args->a applied
 * from the left -- NOTE(review): confirm against the build rule.
 *
 * args     supplies m, n, a, b, lda, ldb and the scalar pointer args->beta.
 * range_m  not read by this routine.
 * range_n  optional; restricts the work to columns [range_n[0], range_n[1])
 *          of b.
 * sa, sb   packing buffers: sa receives blocks of a, sb blocks of b.
 * dummy    not read by this routine.
 *
 * Always returns 0.
 */
int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG dummy) {

  BLASLONG m   = args -> m;
  BLASLONG n   = args -> n;
  FLOAT   *a   = (FLOAT *)args -> a;
  FLOAT   *b   = (FLOAT *)args -> b;
  BLASLONG lda = args -> lda;
  BLASLONG ldb = args -> ldb;
  FLOAT   *beta = (FLOAT *)args -> beta;

  BLASLONG jpos, jlen;          /* current column panel: start / width    */
  BLASLONG lpos, llen;          /* current diagonal block of a            */
  BLASLONG ipos, ilen;          /* current row block: start / height      */
  BLASLONG jsub, jsub_len;      /* column strip handed to one kernel call */

  if (range_n) {
    n  = range_n[1] - range_n[0];
    b += range_n[0] * ldb * COMPSIZE;
  }

  if (beta) {
#ifndef COMPLEX
    if (beta[0] != ONE) {
      GEMM_BETA(m, n, 0, beta[0], NULL, 0, NULL, 0, b, ldb);
    }
    if (beta[0] == ZERO) {
      return 0;
    }
#else
    if ((beta[0] != ONE) || (beta[1] != ZERO)) {
      GEMM_BETA(m, n, 0, beta[0], beta[1], NULL, 0, NULL, 0, b, ldb);
    }
    if ((beta[0] == ZERO) && (beta[1] == ZERO)) {
      return 0;
    }
#endif
  }

  for (jpos = 0; jpos < n; jpos += GEMM_R) {

    jlen = (n - jpos > GEMM_R) ? GEMM_R : n - jpos;

#if (!defined(UPPER) && !defined(TRANSA)) || (defined(UPPER) && defined(TRANSA))

    /* Diagonal blocks are visited top to bottom. */
    for (lpos = 0; lpos < m; lpos += GEMM_Q) {

      llen = (m - lpos > GEMM_Q) ? GEMM_Q : m - lpos;
      ilen = (llen > GEMM_P) ? GEMM_P : llen;

#ifndef TRANSA
      TRSM_ILTCOPY(llen, ilen, a + (lpos + lpos * lda) * COMPSIZE, lda, 0, sa);
#else
      TRSM_IUNCOPY(llen, ilen, a + (lpos + lpos * lda) * COMPSIZE, lda, 0, sa);
#endif

      /* First row block: pack each strip of b and use it immediately. */
      for (jsub = jpos; jsub < jpos + jlen; jsub += jsub_len) {

	jsub_len = jlen + jpos - jsub;
	if (jsub_len > GEMM_UNROLL_N) jsub_len = GEMM_UNROLL_N;

	GEMM_ONCOPY(llen, jsub_len, b + (lpos + jsub * ldb) * COMPSIZE, ldb,
		    sb + llen * (jsub - jpos) * COMPSIZE);

	TRSM_KERNEL(ilen, jsub_len, llen, dm1,
#ifdef COMPLEX
		    ZERO,
#endif
		    sa, sb + llen * (jsub - jpos) * COMPSIZE,
		    b + (lpos + jsub * ldb) * COMPSIZE, ldb, 0);
      }

      /* Remaining row blocks inside the diagonal block. */
      for (ipos = lpos + ilen; ipos < lpos + llen; ipos += GEMM_P) {

	ilen = (lpos + llen - ipos > GEMM_P) ? GEMM_P : lpos + llen - ipos;

#ifndef TRANSA
	TRSM_ILTCOPY(llen, ilen, a + (ipos + lpos * lda) * COMPSIZE, lda, ipos - lpos, sa);
#else
	TRSM_IUNCOPY(llen, ilen, a + (lpos + ipos * lda) * COMPSIZE, lda, ipos - lpos, sa);
#endif

	TRSM_KERNEL(ilen, jlen, llen, dm1,
#ifdef COMPLEX
		    ZERO,
#endif
		    sa, sb, b + (ipos + jpos * ldb) * COMPSIZE, ldb, ipos - lpos);
      }

      /* Rank update of every row below the diagonal block. */
      for (ipos = lpos + llen; ipos < m; ipos += GEMM_P) {

	ilen = (m - ipos > GEMM_P) ? GEMM_P : m - ipos;

#ifndef TRANSA
	GEMM_ITCOPY(llen, ilen, a + (ipos + lpos * lda) * COMPSIZE, lda, sa);
#else
	GEMM_INCOPY(llen, ilen, a + (lpos + ipos * lda) * COMPSIZE, lda, sa);
#endif

	GEMM_KERNEL(ilen, jlen, llen, dm1,
#ifdef COMPLEX
		    ZERO,
#endif
		    sa, sb, b + (ipos + jpos * ldb) * COMPSIZE, ldb);
      }
    }

#else

    /* Diagonal blocks are visited bottom to top; lpos is one past the block. */
    for (lpos = m; lpos > 0; lpos -= GEMM_Q) {

      BLASLONG lbase;           /* first row of the diagonal block      */
      BLASLONG istart;          /* start of the last GEMM_P block in it */

      llen  = (lpos > GEMM_Q) ? GEMM_Q : lpos;
      lbase = lpos - llen;

      /* Locate the last GEMM_P-aligned row block inside the diagonal block. */
      istart = lbase;
      while (istart + GEMM_P < lpos) istart += GEMM_P;

      ilen = (lpos - istart > GEMM_P) ? GEMM_P : lpos - istart;

#ifndef TRANSA
      TRSM_IUTCOPY(llen, ilen, a + (istart + lbase * lda) * COMPSIZE, lda, istart - lbase, sa);
#else
      TRSM_ILNCOPY(llen, ilen, a + (lbase + istart * lda) * COMPSIZE, lda, istart - lbase, sa);
#endif

      /* Last row block: pack each strip of b and use it immediately. */
      for (jsub = jpos; jsub < jpos + jlen; jsub += jsub_len) {

	jsub_len = jlen + jpos - jsub;
	if (jsub_len > GEMM_UNROLL_N) jsub_len = GEMM_UNROLL_N;

	GEMM_ONCOPY(llen, jsub_len, b + (lbase + jsub * ldb) * COMPSIZE, ldb,
		    sb + llen * (jsub - jpos) * COMPSIZE);

	TRSM_KERNEL(ilen, jsub_len, llen, dm1,
#ifdef COMPLEX
		    ZERO,
#endif
		    sa, sb + llen * (jsub - jpos) * COMPSIZE,
		    b + (istart + jsub * ldb) * COMPSIZE, ldb, istart - lbase);
      }

      /* Earlier row blocks of the diagonal block, walking upwards. */
      for (ipos = istart - GEMM_P; ipos >= lbase; ipos -= GEMM_P) {

	ilen = (lpos - ipos > GEMM_P) ? GEMM_P : lpos - ipos;

#ifndef TRANSA
	TRSM_IUTCOPY(llen, ilen, a + (ipos + lbase * lda) * COMPSIZE, lda, ipos - lbase, sa);
#else
	TRSM_ILNCOPY(llen, ilen, a + (lbase + ipos * lda) * COMPSIZE, lda, ipos - lbase, sa);
#endif

	TRSM_KERNEL(ilen, jlen, llen, dm1,
#ifdef COMPLEX
		    ZERO,
#endif
		    sa, sb, b + (ipos + jpos * ldb) * COMPSIZE, ldb, ipos - lbase);
      }

      /* Rank update of every row above the diagonal block. */
      for (ipos = 0; ipos < lbase; ipos += GEMM_P) {

	ilen = (lbase - ipos > GEMM_P) ? GEMM_P : lbase - ipos;

#ifndef TRANSA
	GEMM_ITCOPY(llen, ilen, a + (ipos + lbase * lda) * COMPSIZE, lda, sa);
#else
	GEMM_INCOPY(llen, ilen, a + (lbase + ipos * lda) * COMPSIZE, lda, sa);
#endif

	GEMM_KERNEL(ilen, jlen, llen, dm1,
#ifdef COMPLEX
		    ZERO,
#endif
		    sa, sb, b + (ipos + jpos * ldb) * COMPSIZE, ldb);
      }
    }

#endif

  }

  return 0;
}