/*
 * Blocked driver that updates the m x n matrix b in place using the
 * matrix a, one GEMM_R-wide column panel at a time.
 *
 * NOTE(review): judging by the TRSM_* and GEMM_* helpers it calls, this is
 * presumably the left-side triangular-solve driver; confirm against the
 * build rules that define CNAME, UPPER and TRANSA.
 *
 * args    : supplies m, n, a, lda, b, ldb and an optional scalar "beta".
 * range_m : not referenced by this routine.
 * range_n : optional {from, to} pair; when non-NULL only columns
 *           [from, to) of b are processed.
 * sa, sb  : scratch buffers written by the *COPY helpers and read back by
 *           the kernels.
 * dummy   : not referenced by this routine.
 *
 * Always returns 0.
 */
int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG dummy) {

  BLASLONG m   = args -> m;
  BLASLONG n   = args -> n;
  BLASLONG lda = args -> lda;
  BLASLONG ldb = args -> ldb;

  FLOAT *a    = (FLOAT *)args -> a;
  FLOAT *b    = (FLOAT *)args -> b;
  FLOAT *beta = (FLOAT *)args -> beta;

  BLASLONG col0, ncols;      /* current column panel of b: start and width   */
  BLASLONG col, ncol_sub;    /* sub-panel inside the column panel            */
  BLASLONG blk, blk_len;     /* current block along the m dimension          */
  BLASLONG row, nrows;       /* row chunk handled by one kernel call         */

  /* Restrict the work to the requested column range of b. */
  if (range_n) {
    BLASLONG *bounds   = (BLASLONG *)range_n;
    BLASLONG first_col = bounds[0];
    BLASLONG end_col   = bounds[1];

    n  = end_col - first_col;
    b += first_col * ldb * COMPSIZE;
  }

  /* Pre-scale b by beta; nothing is left to do when beta is zero. */
  if (beta) {
#ifndef COMPLEX
    if (beta[0] != ONE) {
      GEMM_BETA(m, n, 0, beta[0], NULL, 0, NULL, 0, b, ldb);
    }
    if (beta[0] == ZERO) {
      return 0;
    }
#else
    if (!((beta[0] == ONE) && (beta[1] == ZERO))) {
      GEMM_BETA(m, n, 0, beta[0], beta[1], NULL, 0, NULL, 0, b, ldb);
    }
    if ((beta[0] == ZERO) && (beta[1] == ZERO)) {
      return 0;
    }
#endif
  }

  for (col0 = 0; col0 < n; col0 += GEMM_R) {
    ncols = (n - col0 > GEMM_R) ? GEMM_R : (n - col0);

#if (!defined(UPPER) && !defined(TRANSA)) || (defined(UPPER) && defined(TRANSA))
    /* Blocks are visited from index 0 upwards. */
    for (blk = 0; blk < m; blk += GEMM_Q) {
      blk_len = (m - blk > GEMM_Q) ? GEMM_Q : (m - blk);
      nrows   = (blk_len > GEMM_P) ? GEMM_P : blk_len;

      /* First row chunk of the diagonal block of a goes into sa. */
#ifndef TRANSA
      TRSM_ILTCOPY(blk_len, nrows, a + (blk + blk * lda) * COMPSIZE, lda, 0, sa);
#else
      TRSM_IUNCOPY(blk_len, nrows, a + (blk + blk * lda) * COMPSIZE, lda, 0, sa);
#endif

      /* Copy b into sb one narrow sub-panel at a time and run the TRSM
         kernel on each sub-panel right after it has been copied. */
      for (col = col0; col < col0 + ncols; col += ncol_sub) {
        ncol_sub = ncols + col0 - col;
        if (ncol_sub > GEMM_UNROLL_N) ncol_sub = GEMM_UNROLL_N;

        FLOAT *b_tile  = b  + (blk + col * ldb) * COMPSIZE;
        FLOAT *sb_tile = sb + blk_len * (col - col0) * COMPSIZE;

        GEMM_ONCOPY(blk_len, ncol_sub, b_tile, ldb, sb_tile);

        TRSM_KERNEL(nrows, ncol_sub, blk_len, dm1,
#ifdef COMPLEX
                    ZERO,
#endif
                    sa, sb_tile, b_tile, ldb, 0);
      }

      /* Remaining row chunks of the same diagonal block. */
      for (row = blk + nrows; row < blk + blk_len; row += GEMM_P) {
        nrows = blk + blk_len - row;
        if (nrows > GEMM_P) nrows = GEMM_P;

#ifndef TRANSA
        TRSM_ILTCOPY(blk_len, nrows, a + (row + blk * lda) * COMPSIZE, lda, row - blk, sa);
#else
        TRSM_IUNCOPY(blk_len, nrows, a + (blk + row * lda) * COMPSIZE, lda, row - blk, sa);
#endif

        TRSM_KERNEL(nrows, ncols, blk_len, dm1,
#ifdef COMPLEX
                    ZERO,
#endif
                    sa, sb, b + (row + col0 * ldb) * COMPSIZE, ldb, row - blk);
      }

      /* GEMM update (scalar dm1) of the rows of b past this block. */
      for (row = blk + blk_len; row < m; row += GEMM_P) {
        nrows = m - row;
        if (nrows > GEMM_P) nrows = GEMM_P;

#ifndef TRANSA
        GEMM_ITCOPY(blk_len, nrows, a + (row + blk * lda) * COMPSIZE, lda, sa);
#else
        GEMM_INCOPY(blk_len, nrows, a + (blk + row * lda) * COMPSIZE, lda, sa);
#endif

        GEMM_KERNEL(nrows, ncols, blk_len, dm1,
#ifdef COMPLEX
                    ZERO,
#endif
                    sa, sb, b + (row + col0 * ldb) * COMPSIZE, ldb);
      }
    }
#else
    /* Blocks are visited from index m downwards; here "blk" is the
       exclusive upper end of the block and "blk_lo" its first index. */
    BLASLONG blk_lo;
    BLASLONG last_row;

    for (blk = m; blk > 0; blk -= GEMM_Q) {
      blk_len = (blk > GEMM_Q) ? GEMM_Q : blk;
      blk_lo  = blk - blk_len;

      /* Advance in GEMM_P steps to the start of the last row chunk. */
      for (last_row = blk_lo; last_row + GEMM_P < blk; last_row += GEMM_P) {
      }

      nrows = blk - last_row;
      if (nrows > GEMM_P) nrows = GEMM_P;

#ifndef TRANSA
      TRSM_IUTCOPY(blk_len, nrows, a + (last_row + blk_lo * lda) * COMPSIZE, lda, last_row - blk_lo, sa);
#else
      TRSM_ILNCOPY(blk_len, nrows, a + (blk_lo + last_row * lda) * COMPSIZE, lda, last_row - blk_lo, sa);
#endif

      /* Copy b into sb sub-panel by sub-panel and run the TRSM kernel on
         the last row chunk for each of them. */
      for (col = col0; col < col0 + ncols; col += ncol_sub) {
        ncol_sub = ncols + col0 - col;
        if (ncol_sub > GEMM_UNROLL_N) ncol_sub = GEMM_UNROLL_N;

        FLOAT *sb_tile = sb + blk_len * (col - col0) * COMPSIZE;

        GEMM_ONCOPY(blk_len, ncol_sub, b + (blk_lo + col * ldb) * COMPSIZE, ldb, sb_tile);

        TRSM_KERNEL(nrows, ncol_sub, blk_len, dm1,
#ifdef COMPLEX
                    ZERO,
#endif
                    sa, sb_tile,
                    b + (last_row + col * ldb) * COMPSIZE, ldb, last_row - blk_lo);
      }

      /* Earlier row chunks of the same diagonal block, walking backwards. */
      for (row = last_row - GEMM_P; row >= blk_lo; row -= GEMM_P) {
        nrows = blk - row;
        if (nrows > GEMM_P) nrows = GEMM_P;

#ifndef TRANSA
        TRSM_IUTCOPY(blk_len, nrows, a + (row + blk_lo * lda) * COMPSIZE, lda, row - blk_lo, sa);
#else
        TRSM_ILNCOPY(blk_len, nrows, a + (blk_lo + row * lda) * COMPSIZE, lda, row - blk_lo, sa);
#endif

        TRSM_KERNEL(nrows, ncols, blk_len, dm1,
#ifdef COMPLEX
                    ZERO,
#endif
                    sa, sb,
                    b + (row + col0 * ldb) * COMPSIZE, ldb,
                    row - blk_lo);
      }

      /* GEMM update (scalar dm1) of the rows of b before this block. */
      for (row = 0; row < blk_lo; row += GEMM_P) {
        nrows = blk_lo - row;
        if (nrows > GEMM_P) nrows = GEMM_P;

#ifndef TRANSA
        GEMM_ITCOPY(blk_len, nrows, a + (row + blk_lo * lda) * COMPSIZE, lda, sa);
#else
        GEMM_INCOPY(blk_len, nrows, a + (blk_lo + row * lda) * COMPSIZE, lda, sa);
#endif

        GEMM_KERNEL(nrows, ncols, blk_len, dm1,
#ifdef COMPLEX
                    ZERO,
#endif
                    sa, sb, b + (row + col0 * ldb) * COMPSIZE, ldb);
      }
    }
#endif
  }

  return 0;
}
blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) { BLASLONG n, lda; FLOAT *a; BLASLONG info; BLASLONG bk, blocking; BLASLONG is, min_i; BLASLONG jjs, min_jj; BLASLONG range_N[2]; BLASLONG j, js, min_j; #ifdef SHARED_ARRAY FLOAT *aa; #endif FLOAT *sb2 = (FLOAT *)((((BLASLONG)sb + GEMM_PQ * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B); n = args -> n; a = (FLOAT *)args -> a; lda = args -> lda; if (range_n) { n = range_n[1] - range_n[0]; a += range_n[0] * (lda + 1) * COMPSIZE; } if (n <= DTB_ENTRIES / 2) { info = POTF2_U(args, NULL, range_n, sa, sb, 0); return info; } blocking = GEMM_Q; if (n <= 4 * GEMM_Q) blocking = (n + 3) / 4; for (j = 0; j < n; j += blocking) { bk = n - j; if (bk > blocking) bk = blocking; if (!range_n) { range_N[0] = j; range_N[1] = j + bk; } else { range_N[0] = range_n[0] + j; range_N[1] = range_n[0] + j + bk; } info = CNAME(args, NULL, range_N, sa, sb, 0); if (info) return info + j; if (n - j - bk > 0) { TRSM_IUNCOPY(bk, bk, a + (j + j * lda) * COMPSIZE, lda, 0, sb); for(js = j + bk; js < n; js += REAL_GEMM_R) { min_j = n - js; if (min_j > REAL_GEMM_R) min_j = REAL_GEMM_R; for(jjs = js; jjs < js + min_j; jjs += GEMM_UNROLL_N){ min_jj = min_j + js - jjs; if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; GEMM_ONCOPY(bk, min_jj, a + (j + jjs * lda) * COMPSIZE, lda, sb2 + bk * (jjs - js) * COMPSIZE); for (is = 0; is < bk; is += GEMM_P) { min_i = bk - is; if (min_i > GEMM_P) min_i = GEMM_P; TRSM_KERNEL (min_i, min_jj, bk, dm1, #ifdef COMPLEX ZERO, #endif sb + bk * is * COMPSIZE, sb2 + bk * (jjs - js) * COMPSIZE, a + (j + is + jjs * lda) * COMPSIZE, lda, is); } } for (is = j + bk; is < js + min_j; is += min_i) { min_i = js + min_j - is; if (min_i >= GEMM_P * 2) { min_i = GEMM_P; } else if (min_i > GEMM_P) { min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1); } #ifdef SHARED_ARRAY if ((is >= js) && (is + min_i <= js + min_j)) { aa = sb2 + bk * (is - js) 
* COMPSIZE; } else { GEMM_INCOPY(bk, min_i, a + (j + is * lda) * COMPSIZE, lda, sa); aa = sa; } #else GEMM_INCOPY(bk, min_i, a + (j + is * lda) * COMPSIZE, lda, sa); #endif SYRK_KERNEL_U(min_i, min_j, bk, dm1, SA, sb2, a + (is + js * lda) * COMPSIZE, lda, is - js); } } } } return 0; }
/*
 * Per-thread worker.  Each thread owns the index range
 * [range_n[mypos], range_n[mypos + 1]) and cooperates with the other
 * threads through the job[].working[][] flag table found in args->common.
 *
 * Visible steps:
 *   1. Copy the k x k block S into sb with a TRSM copy helper.
 *   2. Split the thread's own range into at most DIVIDE_RATE chunks of
 *      div_n indices.  For every chunk, copy the matching part of a into
 *      buffer[chunk], run TRSM_KERNEL on it, then store the buffer address
 *      into job[mypos].working[i][...] for the threads i that will read it.
 *   3. For each row block of the thread's own range, copy it into sa and
 *      call KERNEL_OPERATION against every published buffer of the threads
 *      "current" (mypos upwards without LOWER, mypos downwards with LOWER).
 *      On the first row block the thread spins until the other thread's
 *      flag becomes non-zero; on the thread's last row block it clears the
 *      flag.
 *   4. Spin until every other thread has cleared the flags this thread
 *      published.
 *
 * K, A, C, LDA and S are macros defined outside this routine (presumably
 * fields of args -- confirm).  range_m is not referenced.  Always returns 0.
 *
 * NOTE(review): WMB is issued after the flag stores rather than between the
 * buffer writes and the flag stores, and the waiting side has no explicit
 * read barrier.  Whether that is sufficient depends on how WMB/YIELDING are
 * defined for the target -- confirm for weakly ordered CPUs.
 */
static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos){

  FLOAT *buffer[DIVIDE_RATE];

  BLASLONG k, lda;
  BLASLONG m_from, m_to;

  FLOAT *alpha;
  FLOAT *a, *c;
  job_t *job = (job_t *)args -> common;
  BLASLONG xxx, bufferside;

  BLASLONG jjs, min_jj;
  BLASLONG is, min_i, div_n;

  BLASLONG i, current;

  k = K;

  a = (FLOAT *)A;
  c = (FLOAT *)C;

  lda = LDA;

  alpha = (FLOAT *)args -> alpha;

  m_from = range_n[mypos + 0];
  m_to   = range_n[mypos + 1];

  /* Chunk width: ceil(range / DIVIDE_RATE), rounded up to a multiple of
     GEMM_UNROLL_MN.  Division is used instead of a bit mask so the result
     is a true multiple even when GEMM_UNROLL_MN is not a power of two; for
     powers of two the value is the same as with the mask. */
  div_n = (((m_to - m_from + DIVIDE_RATE - 1) / DIVIDE_RATE + GEMM_UNROLL_MN - 1) / GEMM_UNROLL_MN) * GEMM_UNROLL_MN;

  /* The chunk buffers start past the k x k copy of S held in sb. */
  buffer[0] = (FLOAT *)((((BLASULONG)(sb + k * k * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B);

  for (i = 1; i < DIVIDE_RATE; i++) {
    buffer[i] = buffer[i - 1] + GEMM_Q * div_n * COMPSIZE;
  }

#ifndef LOWER
  TRSM_IUNCOPY(k, k, (FLOAT *)S, lda, 0, sb);
#else
  TRSM_OLTCOPY(k, k, (FLOAT *)S, lda, 0, sb);
#endif

  /* Step 2: process and publish this thread's own chunks. */
  for (xxx = m_from, bufferside = 0; xxx < m_to; xxx += div_n, bufferside ++) {

    for(jjs = xxx; jjs < MIN(m_to, xxx + div_n); jjs += min_jj){

      min_jj = MIN(m_to, xxx + div_n) - jjs;

#ifndef LOWER
      if (min_jj > GEMM_UNROLL_MN) min_jj = GEMM_UNROLL_MN;
#else
      if (min_jj > GEMM_P)         min_jj = GEMM_P;
#endif

#ifndef LOWER
      OCOPY_OPERATION (k, min_jj, a, lda, 0, jjs, buffer[bufferside] + k * (jjs - xxx) * COMPSIZE);

      TRSM_KERNEL     (k, min_jj, k, dm1,
#ifdef COMPLEX
                       ZERO,
#endif
                       sb,
                       buffer[bufferside] + k * (jjs - xxx) * COMPSIZE,
                       a + jjs * lda * COMPSIZE, lda, 0);
#else
      ICOPY_OPERATION (k, min_jj, a, lda, 0, jjs, buffer[bufferside] + k * (jjs - xxx) * COMPSIZE);

      TRSM_KERNEL     (min_jj, k, k, dm1,
#ifdef COMPLEX
                       ZERO,
#endif
                       buffer[bufferside] + k * (jjs - xxx) * COMPSIZE,
                       sb,
                       a + jjs       * COMPSIZE, lda, 0);
#endif
    }

    /* Publish the chunk: a non-zero flag carries the buffer address. */
#ifndef LOWER
    for (i = 0; i <= mypos; i++)
      job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside];
#else
    for (i = mypos; i < args -> nthreads; i++)
      job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside];
#endif

    WMB;
  }

  /* Step 3, first row block of this thread's range. */
  min_i = m_to - m_from;

  if (min_i >= GEMM_P * 2) {
    min_i = GEMM_P;
  } else
    if (min_i > GEMM_P) {
      /* Roughly half of the remainder, rounded up to a multiple of
         GEMM_UNROLL_MN (same rounding as for div_n). */
      min_i = (((min_i + 1) / 2 + GEMM_UNROLL_MN - 1) / GEMM_UNROLL_MN) * GEMM_UNROLL_MN;
    }

#ifndef LOWER
  ICOPY_OPERATION(k, min_i, a, lda, 0, m_from, sa);
#else
  OCOPY_OPERATION(k, min_i, a, lda, 0, m_from, sa);
#endif

  current = mypos;

#ifndef LOWER
  while (current < args -> nthreads)
#else
  while (current >= 0)
#endif
    {
      /* Must reproduce the chunk width the owning thread computed. */
      div_n = (((range_n[current + 1]  - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE + GEMM_UNROLL_MN - 1) / GEMM_UNROLL_MN) * GEMM_UNROLL_MN;

      for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) {

        /* thread has to wait */
        if (current != mypos) while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;};

        KERNEL_OPERATION(min_i, MIN(range_n[current + 1]  - xxx, div_n), k, alpha,
                         sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside],
                         c, lda, m_from, xxx);

        /* Release the buffer if this was the only row block. */
        if (m_from + min_i >= m_to) {
          job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
          WMB;
        }
      }

#ifndef LOWER
      current ++;
#else
      current --;
#endif
    }

  /* Step 3, remaining row blocks; the buffers are already known to be
     published, so no waiting is done here. */
  for(is = m_from + min_i; is < m_to; is += min_i){
    min_i = m_to - is;

    if (min_i >= GEMM_P * 2) {
      min_i = GEMM_P;
    } else
      if (min_i > GEMM_P) {
        min_i = (((min_i + 1) / 2 + GEMM_UNROLL_MN - 1) / GEMM_UNROLL_MN) * GEMM_UNROLL_MN;
      }

#ifndef LOWER
    ICOPY_OPERATION(k, min_i, a, lda, 0, is, sa);
#else
    OCOPY_OPERATION(k, min_i, a, lda, 0, is, sa);
#endif

    current = mypos;

#ifndef LOWER
    while (current < args -> nthreads)
#else
    while (current >= 0)
#endif
      {
        div_n = (((range_n[current + 1]  - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE + GEMM_UNROLL_MN - 1) / GEMM_UNROLL_MN) * GEMM_UNROLL_MN;

        for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) {

          KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), k, alpha,
                           sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside],
                           c, lda, is, xxx);

          /* Release the buffer after this thread's last row block. */
          if (is + min_i >= m_to) {
            job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
            WMB;
          }
        }
#ifndef LOWER
        current ++;
#else
        current --;
#endif
      }
  }

  /* Step 4: do not return while another thread may still be reading the
     buffers this thread published. */
  for (i = 0; i < args -> nthreads; i++) {
    if (i != mypos) {
      for (xxx = 0; xxx < DIVIDE_RATE; xxx++) {
        while (job[mypos].working[i][CACHE_LINE_SIZE * xxx] ) {YIELDING;};
      }
    }
  }

  return 0;
}