blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos) { BLASLONG n, info; BLASLONG bk, i, blocking; int mode; BLASLONG lda, range_N[2]; blas_arg_t newarg; FLOAT *a; FLOAT alpha[2] = { ONE, ZERO}; FLOAT beta [2] = {-ONE, ZERO}; #ifndef COMPLEX #ifdef XDOUBLE mode = BLAS_XDOUBLE | BLAS_REAL; #elif defined(DOUBLE) mode = BLAS_DOUBLE | BLAS_REAL; #else mode = BLAS_SINGLE | BLAS_REAL; #endif #else #ifdef XDOUBLE mode = BLAS_XDOUBLE | BLAS_COMPLEX; #elif defined(DOUBLE) mode = BLAS_DOUBLE | BLAS_COMPLEX; #else mode = BLAS_SINGLE | BLAS_COMPLEX; #endif #endif n = args -> n; a = (FLOAT *)args -> a; lda = args -> lda; if (range_n) n = range_n[1] - range_n[0]; if (n <= DTB_ENTRIES) { info = TRTI2(args, NULL, range_n, sa, sb, 0); return info; } blocking = GEMM_Q; if (n < 4 * GEMM_Q) blocking = (n + 3) / 4; for (i = 0; i < n; i += blocking) { bk = n - i; if (bk > blocking) bk = blocking; range_N[0] = i; range_N[1] = i + bk; newarg.lda = lda; newarg.ldb = lda; newarg.ldc = lda; newarg.alpha = alpha; newarg.m = i; newarg.n = bk; newarg.a = a + (i + i * lda) * COMPSIZE; newarg.b = a + ( i * lda) * COMPSIZE; newarg.beta = beta; newarg.nthreads = args -> nthreads; gemm_thread_m(mode, &newarg, NULL, NULL, TRSM, sa, sb, args -> nthreads); newarg.m = bk; newarg.n = bk; newarg.a = a + (i + i * lda) * COMPSIZE; CNAME (&newarg, NULL, NULL, sa, sb, 0); newarg.m = i; newarg.n = n - i - bk; newarg.k = bk; newarg.a = a + ( i * lda) * COMPSIZE; newarg.b = a + (i + (i + bk) * lda) * COMPSIZE; newarg.c = a + ( (i + bk) * lda) * COMPSIZE; newarg.beta = NULL; gemm_thread_n(mode, &newarg, NULL, NULL, GEMM_NN, sa, sb, args -> nthreads); newarg.a = a + (i + i * lda) * COMPSIZE; newarg.b = a + (i + (i + bk) * lda) * COMPSIZE; newarg.m = bk; newarg.n = n - i - bk; gemm_thread_n(mode, &newarg, NULL, NULL, TRMM, sa, sb, args -> nthreads); } return 0; }
blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) { BLASLONG m, n, lda, offset; BLASLONG j, js, jmin, is, imin, jc, jcmin; BLASLONG jjs, min_jj; blasint *ipiv, iinfo, info; BLASLONG jb, mn, blocking; FLOAT *a, *offsetA, *offsetB; BLASLONG range_N[2]; FLOAT *sbb; m = args -> m; n = args -> n; a = (FLOAT *)args -> a; lda = args -> lda; ipiv = (blasint *)args -> c; offset = 0; if (range_n) { m -= range_n[0]; n = range_n[1] - range_n[0]; offset = range_n[0]; a += range_n[0] * (lda + 1) * COMPSIZE; } if (m <= 0 || n <= 0) return 0; mn = MIN(m, n); blocking = (mn / 2 + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1); if (blocking > GEMM_Q) blocking = GEMM_Q; if (blocking <= GEMM_UNROLL_N * 2) { info = GETF2(args, NULL, range_n, sa, sb, 0); return info; } sbb = (FLOAT *)((((long)(sb + blocking * blocking * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B); info = 0; for (j = 0; j < mn; j += blocking) { jb = mn - j; if (jb > blocking) jb = blocking; offsetA = a + j * lda * COMPSIZE; offsetB = a + (j + jb) * lda * COMPSIZE; range_N[0] = offset + j; range_N[1] = offset + j + jb; iinfo = CNAME(args, NULL, range_N, sa, sb, 0); if (iinfo && !info) info = iinfo + j; if (j + jb < n) { TRSM_ILTCOPY(jb, jb, offsetA + j * COMPSIZE, lda, 0, sb); for (js = j + jb; js < n; js += REAL_GEMM_R){ jmin = n - js; if (jmin > REAL_GEMM_R) jmin = REAL_GEMM_R; for (jjs = js; jjs < js + jmin; jjs += GEMM_UNROLL_N){ min_jj = js + jmin - jjs; if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; #if 0 LASWP_PLUS(min_jj, j + offset + 1, j + jb + offset, ZERO, #ifdef COMPLEX ZERO, #endif a + (- offset + jjs * lda) * COMPSIZE, lda, NULL, 0 , ipiv, 1); GEMM_ONCOPY (jb, min_jj, a + (j + jjs * lda) * COMPSIZE, lda, sbb + jb * (jjs - js) * COMPSIZE); #else LASWP_NCOPY(min_jj, j + offset + 1, j + jb + offset, a + (- offset + jjs * lda) * COMPSIZE, lda, ipiv, sbb + jb * (jjs - js) * COMPSIZE); #endif for (jc = 0; jc < jb; jc += GEMM_P) { jcmin = jb - 
jc; if (jcmin > GEMM_P) jcmin = GEMM_P; TRSM_KERNEL_LT(jcmin, min_jj, jb, dm1, #ifdef COMPLEX ZERO, #endif sb + jb * jc * COMPSIZE, sbb + jb * (jjs - js) * COMPSIZE, a + (j + jc + jjs * lda) * COMPSIZE, lda, jc); } } for (is = j + jb; is < m; is += GEMM_P){ imin = m - is; if (imin > GEMM_P) imin = GEMM_P; GEMM_ITCOPY (jb, imin, offsetA + is * COMPSIZE, lda, sa); GEMM_KERNEL_N(imin, jmin, jb, dm1, #ifdef COMPLEX ZERO, #endif sa, sbb, a + (is + js * lda) * COMPSIZE, lda); } } } } for (j = 0; j < mn; j += jb) { jb = MIN(mn - j, blocking); LASWP_PLUS(jb, j + jb + offset + 1, mn + offset, ZERO, #ifdef COMPLEX ZERO, #endif a - (offset - j * lda) * COMPSIZE, lda, NULL, 0 , ipiv, 1); } return info; }
blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) { BLASLONG n, bk, i, blocking, lda; BLASLONG info; int mode; blas_arg_t newarg; FLOAT *a; FLOAT alpha[2] = { -ONE, ZERO}; #ifndef COMPLEX #ifdef XDOUBLE mode = BLAS_XDOUBLE | BLAS_REAL; #elif defined(DOUBLE) mode = BLAS_DOUBLE | BLAS_REAL; #else mode = BLAS_SINGLE | BLAS_REAL; #endif #else #ifdef XDOUBLE mode = BLAS_XDOUBLE | BLAS_COMPLEX; #elif defined(DOUBLE) mode = BLAS_DOUBLE | BLAS_COMPLEX; #else mode = BLAS_SINGLE | BLAS_COMPLEX; #endif #endif if (args -> nthreads == 1) { info = POTRF_L_SINGLE(args, NULL, NULL, sa, sb, 0); return info; } n = args -> n; a = (FLOAT *)args -> a; lda = args -> lda; if (range_n) n = range_n[1] - range_n[0]; if (n <= GEMM_UNROLL_N * 4) { info = POTRF_L_SINGLE(args, NULL, range_n, sa, sb, 0); return info; } newarg.lda = lda; newarg.ldb = lda; newarg.ldc = lda; newarg.alpha = alpha; newarg.beta = NULL; newarg.nthreads = args -> nthreads; blocking = (n / 2 + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1); if (blocking > GEMM_Q) blocking = GEMM_Q; for (i = 0; i < n; i += blocking) { bk = n - i; if (bk > blocking) bk = blocking; newarg.m = bk; newarg.n = bk; newarg.a = a + (i + i * lda) * COMPSIZE; info = CNAME(&newarg, NULL, NULL, sa, sb, 0); if (info) return info + i; if (n - i - bk > 0) { newarg.m = n - i - bk; newarg.n = bk; newarg.a = a + (i + i * lda) * COMPSIZE; newarg.b = a + (i + bk + i * lda) * COMPSIZE; gemm_thread_m(mode | BLAS_RSIDE | BLAS_TRANSA_T | BLAS_UPLO, &newarg, NULL, NULL, (void *)TRSM_RCLN, sa, sb, args -> nthreads); newarg.n = n - i - bk; newarg.k = bk; newarg.a = a + (i + bk + i * lda) * COMPSIZE; newarg.c = a + (i + bk + (i + bk) * lda) * COMPSIZE; #ifndef USE_SIMPLE_THREADED_LEVEL3 HERK_THREAD_LN(&newarg, NULL, NULL, sa, sb, 0); #else syrk_thread(mode | BLAS_TRANSA_N | BLAS_TRANSB_T | BLAS_UPLO, &newarg, NULL, NULL, (void *)HERK_LN, sa, sb, args -> nthreads); #endif } } return 0; }
/*
 * Multi-threaded, recursive blocked driver built on LAUUM_U_SINGLE,
 * HERK_UN and TRMM_RCUN (by those names: the upper-triangular LAUUM
 * product).
 *
 * args     problem descriptor; n, a, lda and nthreads are read.
 * range_n  optional [begin, end) pair; only its length is used here.
 * sa, sb   scratch buffers forwarded to every callee.
 * range_m, myid  unused.
 *
 * With a single thread, or for at most 2 * GEMM_UNROLL_N columns, the job
 * is delegated to LAUUM_U_SINGLE.  Always returns 0.
 */
blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) {

  FLOAT pos_one[2] = { ONE, ZERO};

  blas_arg_t sub;
  FLOAT *base, *diag, *panel;
  BLASLONG order, ld;
  BLASLONG step, width, col;
  int mode;

  /* Precision bits first, then the real/complex bit. */
#ifdef XDOUBLE
  mode = BLAS_XDOUBLE;
#elif defined(DOUBLE)
  mode = BLAS_DOUBLE;
#else
  mode = BLAS_SINGLE;
#endif

#ifdef COMPLEX
  mode |= BLAS_COMPLEX;
#else
  mode |= BLAS_REAL;
#endif

  if (args -> nthreads == 1) {
    LAUUM_U_SINGLE(args, NULL, NULL, sa, sb, 0);
    return 0;
  }

  order = args -> n;
  base  = (FLOAT *)args -> a;
  ld    = args -> lda;

  if (range_n != NULL) {
    order = range_n[1] - range_n[0];
  }

  if (order <= GEMM_UNROLL_N * 2) {
    LAUUM_U_SINGLE(args, NULL, range_n, sa, sb, 0);
    return 0;
  }

  sub.lda      = ld;
  sub.ldb      = ld;
  sub.ldc      = ld;
  sub.alpha    = pos_one;
  sub.beta     = NULL;
  sub.nthreads = args -> nthreads;

  /* Half the order, rounded up to a multiple of GEMM_UNROLL_N, capped at GEMM_Q. */
  step = (order / 2 + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1);
  if (step > GEMM_Q) {
    step = GEMM_Q;
  }

  for (col = 0; col < order; col += step) {

    width = order - col;
    if (width > step) {
      width = step;
    }

    diag  = base + (col + col * ld) * COMPSIZE;  /* diagonal block        */
    panel = base + (      col * ld) * COMPSIZE;  /* block column above it */

    /* Rank-width update of the leading col x col block with the panel. */
    sub.n = col;
    sub.k = width;
    sub.a = panel;
    sub.c = base;

    syrk_thread(mode | BLAS_TRANSA_N | BLAS_TRANSB_T,
                &sub, NULL, NULL, (void *)HERK_UN, sa, sb, args -> nthreads);

    /* Triangular multiply of the panel by the diagonal block. */
    sub.m = col;
    sub.n = width;
    sub.a = diag;
    sub.b = panel;

    gemm_thread_m(mode | BLAS_TRANSA_T | BLAS_RSIDE,
                  &sub, NULL, NULL, (void *)TRMM_RCUN, sa, sb, args -> nthreads);

    /* Recurse on the diagonal block itself. */
    sub.m = width;
    sub.n = width;
    sub.a = diag;

    CNAME(&sub, NULL, NULL, sa, sb, 0);
  }

  return 0;
}
/*
 * Context-taking free entry point (presumably the C library's reentrant
 * allocator hook -- confirm).  The context `r` is ignored; the pointer is
 * released through CNAME( free ).
 */
void _free_r( struct _reent* r, void* ptr )
{
    (void) r;

    CNAME( free )( ptr );
}
/*
 * Context-taking realloc entry point.  The context `r` is ignored; the
 * request is forwarded unchanged to CNAME( realloc ) and its result is
 * returned as-is.
 */
void* _realloc_r( struct _reent* r, void* ptr, size_t size )
{
    void* resized;

    (void) r;

    resized = CNAME( realloc )( ptr, size );
    return resized;
}
/*
 * Context-taking malloc entry point.  The context `r` is ignored; the
 * request is forwarded unchanged to CNAME( malloc ) and its result is
 * returned as-is.
 */
void* _malloc_r( struct _reent* r, size_t size )
{
    void* block;

    (void) r;

    block = CNAME( malloc )( size );
    return block;
}
/*
 * Context-taking calloc entry point.  The context `r` is ignored; the
 * element count and element size are forwarded unchanged to
 * CNAME( calloc ) and its result is returned as-is.
 */
void* _calloc_r( struct _reent* r, size_t nelem, size_t elem_size )
{
    void* block;

    (void) r;

    block = CNAME( calloc )( nelem, elem_size );
    return block;
}
int main(int argc, char *argv[]) { srand(time(0)); int m = atoi(argv[1]); int n = atoi(argv[2]); int i, j; DOUBLE *x; DOUBLE *y; DOUBLE *A; DOUBLE *t; DOUBLE *buffer; int incx = 1; int incy = 1; DOUBLE alpha, beta; int lda = m; alpha = rand()/1.0/RAND_MAX - 0.5; beta = 1;//rand()/1.0/RAND_MAX - 0.5; x = (DOUBLE*)malloc(sizeof(DOUBLE)*n); y = (DOUBLE*)malloc(sizeof(DOUBLE)*m); t = (DOUBLE*)malloc(sizeof(DOUBLE)*m); A = (DOUBLE*)malloc(sizeof(DOUBLE)*m*n); buffer = (DOUBLE*)malloc(sizeof(DOUBLE)*m*n); for (i = 0; i < n; i++) x[i] = rand()/1.0/RAND_MAX - 0.5; for (i = 0; i < m; i++) t[i] = rand()/1.0/RAND_MAX - 0.5; for (i = 0; i < m*n; i++) A[i] = rand()/1.0/RAND_MAX - 0.5; //y = alpha*A*x + beta*y //m row //n col unsigned long long int t1,t2,t3,t4,t5; //printf("acm\n");//ACML version memcpy(y,t,sizeof(DOUBLE)*m); clock_gettime(CLOCK_MONOTONIC, &begin); sgemv('N', m, n, alpha, A, lda, x, incx, beta, y, incy); clock_gettime(CLOCK_MONOTONIC, &end); t1 = 1000000000L*(end.tv_sec - begin.tv_sec) + end.tv_nsec - begin.tv_nsec; //printf("one\n");//Native version with checksum memcpy(y,t,sizeof(DOUBLE)*m); clock_gettime(CLOCK_MONOTONIC, &begin); one('N', m, n, alpha, A, lda, x, incx, beta, y, incy); clock_gettime(CLOCK_MONOTONIC, &end); t2 = 1000000000L*(end.tv_sec - begin.tv_sec) + end.tv_nsec - begin.tv_nsec; //printf("opc\n");//OpenBLAS version with checksum memcpy(y,t,sizeof(DOUBLE)*m); clock_gettime(CLOCK_MONOTONIC, &begin); //CNAME('N', m, n, alpha, A, lda, x, incx, beta, y, incy); openCkm(m,n,0,alpha,A,lda,x,incx,y,incy,buffer); clock_gettime(CLOCK_MONOTONIC, &end); t3 = 1000000000L*(end.tv_sec - begin.tv_sec) + end.tv_nsec - begin.tv_nsec; //printf("opn\n");//OpenBLAS version memcpy(y,t,sizeof(DOUBLE)*m); clock_gettime(CLOCK_MONOTONIC, &begin); CNAME(m,n,0,alpha,A,lda,x,incx,y,incy,buffer); clock_gettime(CLOCK_MONOTONIC, &end); t4 = 1000000000L*(end.tv_sec - begin.tv_sec) + end.tv_nsec - begin.tv_nsec; /*printf("opo\n");//OpenBLAS version (with new kernel) 
memcpy(y,t,sizeof(DOUBLE)*m); clock_gettime(CLOCK_MONOTONIC, &begin); CNAME_3(m,n,0,alpha,A,lda,x,incx,y,incy,buffer); clock_gettime(CLOCK_MONOTONIC, &end); t5 = 1000000000L*(end.tv_sec - begin.tv_sec) + end.tv_nsec - begin.tv_nsec;*/ printf("acm%16lld\n",t1); printf("one%16lld\n",t2); printf("opc%16lld\n",t3); printf("opn%16lld\n",t4); //printf("opo%16lld\n",t5); return 0; }
blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) { BLASLONG m, n, mn, lda, offset; BLASLONG i, is, bk, init_bk, next_bk, range_n_new[2]; blasint *ipiv, iinfo, info; int mode; blas_arg_t newarg; FLOAT *a, *sbb; FLOAT dummyalpha[2] = {ZERO, ZERO}; blas_queue_t queue[MAX_CPU_NUMBER]; BLASLONG range[MAX_CPU_NUMBER + 1]; BLASLONG width, nn, num_cpu; volatile BLASLONG flag[MAX_CPU_NUMBER * CACHE_LINE_SIZE] __attribute__((aligned(128))); #ifndef COMPLEX #ifdef XDOUBLE mode = BLAS_XDOUBLE | BLAS_REAL; #elif defined(DOUBLE) mode = BLAS_DOUBLE | BLAS_REAL; #else mode = BLAS_SINGLE | BLAS_REAL; #endif #else #ifdef XDOUBLE mode = BLAS_XDOUBLE | BLAS_COMPLEX; #elif defined(DOUBLE) mode = BLAS_DOUBLE | BLAS_COMPLEX; #else mode = BLAS_SINGLE | BLAS_COMPLEX; #endif #endif m = args -> m; n = args -> n; a = (FLOAT *)args -> a; lda = args -> lda; ipiv = (blasint *)args -> c; offset = 0; if (range_n) { m -= range_n[0]; n = range_n[1] - range_n[0]; offset = range_n[0]; a += range_n[0] * (lda + 1) * COMPSIZE; } if (m <= 0 || n <= 0) return 0; newarg.c = ipiv; newarg.lda = lda; newarg.common = NULL; newarg.nthreads = args -> nthreads; mn = MIN(m, n); init_bk = (mn / 2 + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1); if (init_bk > GEMM_Q) init_bk = GEMM_Q; if (init_bk <= GEMM_UNROLL_N) { info = GETF2(args, NULL, range_n, sa, sb, 0); return info; } width = FORMULA1(m, n, 0, init_bk, args -> nthreads); width = (width + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1); if (width > n - init_bk) width = n - init_bk; if (width < init_bk) { long temp; temp = FORMULA2(m, n, 0, init_bk, args -> nthreads); temp = (temp + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1); if (temp < GEMM_UNROLL_N) temp = GEMM_UNROLL_N; if (temp < init_bk) init_bk = temp; } next_bk = init_bk; bk = init_bk; range_n_new[0] = offset; range_n_new[1] = offset + bk; info = CNAME(args, NULL, range_n_new, sa, sb, 0); TRSM_ILTCOPY(bk, bk, a, lda, 0, sb); is = 0; num_cpu = 0; sbb = (FLOAT 
*)((((long)(sb + GEMM_PQ * GEMM_PQ * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B); while (is < mn) { width = FORMULA1(m, n, is, bk, args -> nthreads); width = (width + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1); if (width < bk) { next_bk = FORMULA2(m, n, is, bk, args -> nthreads); next_bk = (next_bk + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1); if (next_bk > bk) next_bk = bk; #if 0 if (next_bk < GEMM_UNROLL_N) next_bk = MIN(GEMM_UNROLL_N, mn - bk - is); #else if (next_bk < GEMM_UNROLL_N) next_bk = MAX(GEMM_UNROLL_N, mn - bk - is); #endif width = next_bk; } if (width > mn - is - bk) { next_bk = mn - is - bk; width = next_bk; } nn = n - bk - is; if (width > nn) width = nn; if (num_cpu > 1) exec_blas_async_wait(num_cpu - 1, &queue[1]); range[0] = 0; range[1] = width; num_cpu = 1; nn -= width; newarg.a = sb; newarg.b = a + (is + is * lda) * COMPSIZE; newarg.d = (void *)flag; newarg.m = m - bk - is; newarg.n = n - bk - is; newarg.k = bk; newarg.ldb = is + offset; while (nn > 0){ width = blas_quickdivide(nn + args -> nthreads - num_cpu, args -> nthreads - num_cpu); nn -= width; if (nn < 0) width = width + nn; range[num_cpu + 1] = range[num_cpu] + width; queue[num_cpu].mode = mode; //queue[num_cpu].routine = inner_advanced_thread; queue[num_cpu].routine = (void *)inner_basic_thread; queue[num_cpu].args = &newarg; queue[num_cpu].range_m = NULL; queue[num_cpu].range_n = &range[num_cpu]; queue[num_cpu].sa = NULL; queue[num_cpu].sb = NULL; queue[num_cpu].next = &queue[num_cpu + 1]; flag[num_cpu * CACHE_LINE_SIZE] = 1; num_cpu ++; } queue[num_cpu - 1].next = NULL; is += bk; bk = n - is; if (bk > next_bk) bk = next_bk; range_n_new[0] = offset + is; range_n_new[1] = offset + is + bk; if (num_cpu > 1) { exec_blas_async(1, &queue[1]); #if 0 inner_basic_thread(&newarg, NULL, &range[0], sa, sbb, 0); iinfo = GETRF_SINGLE(args, NULL, range_n_new, sa, sbb, 0); #else if (range[1] >= bk * 4) { BLASLONG myrange[2]; myrange[0] = 0; myrange[1] = bk; inner_basic_thread(&newarg, 
NULL, &myrange[0], sa, sbb, -1); iinfo = GETRF_SINGLE(args, NULL, range_n_new, sa, sbb, 0); myrange[0] = bk; myrange[1] = range[1]; inner_basic_thread(&newarg, NULL, &myrange[0], sa, sbb, -1); } else { inner_basic_thread(&newarg, NULL, &range[0], sa, sbb, -1); iinfo = GETRF_SINGLE(args, NULL, range_n_new, sa, sbb, 0); } #endif for (i = 1; i < num_cpu; i ++) while (flag[i * CACHE_LINE_SIZE]) {}; TRSM_ILTCOPY(bk, bk, a + (is + is * lda) * COMPSIZE, lda, 0, sb); } else { inner_basic_thread(&newarg, NULL, &range[0], sa, sbb, -1); iinfo = GETRF_SINGLE(args, NULL, range_n_new, sa, sbb, 0); } if (iinfo && !info) info = iinfo + is; } next_bk = init_bk; bk = init_bk; is = 0; while (is < mn) { bk = mn - is; if (bk > next_bk) bk = next_bk; width = FORMULA1(m, n, is, bk, args -> nthreads); width = (width + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1); if (width < bk) { next_bk = FORMULA2(m, n, is, bk, args -> nthreads); next_bk = (next_bk + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1); if (next_bk > bk) next_bk = bk; #if 0 if (next_bk < GEMM_UNROLL_N) next_bk = MIN(GEMM_UNROLL_N, mn - bk - is); #else if (next_bk < GEMM_UNROLL_N) next_bk = MAX(GEMM_UNROLL_N, mn - bk - is); #endif } if (width > mn - is - bk) { next_bk = mn - is - bk; width = next_bk; } blas_level1_thread(mode, bk, is + bk + offset + 1, mn + offset, (void *)dummyalpha, a + (- offset + is * lda) * COMPSIZE, lda, NULL, 0, ipiv, 1, (void *)LASWP_PLUS, args -> nthreads); is += bk; } return info; }
blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) { BLASLONG m, n, mn, lda, offset; BLASLONG init_bk, next_bk, range_n_mine[2], range_n_new[2]; blasint *ipiv, iinfo, info; int mode; blas_arg_t newarg; FLOAT *a, *sbb; FLOAT dummyalpha[2] = {ZERO, ZERO}; blas_queue_t queue[MAX_CPU_NUMBER]; BLASLONG range_M[MAX_CPU_NUMBER + 1]; BLASLONG range_N[MAX_CPU_NUMBER + 1]; job_t job[MAX_CPU_NUMBER]; BLASLONG width, nn, mm; BLASLONG i, j, k, is, bk; BLASLONG num_cpu; volatile BLASLONG flag[MAX_CPU_NUMBER * CACHE_LINE_SIZE] __attribute__((aligned(128))); #ifndef COMPLEX #ifdef XDOUBLE mode = BLAS_XDOUBLE | BLAS_REAL; #elif defined(DOUBLE) mode = BLAS_DOUBLE | BLAS_REAL; #else mode = BLAS_SINGLE | BLAS_REAL; #endif #else #ifdef XDOUBLE mode = BLAS_XDOUBLE | BLAS_COMPLEX; #elif defined(DOUBLE) mode = BLAS_DOUBLE | BLAS_COMPLEX; #else mode = BLAS_SINGLE | BLAS_COMPLEX; #endif #endif m = args -> m; n = args -> n; a = (FLOAT *)args -> a; lda = args -> lda; ipiv = (blasint *)args -> c; offset = 0; if (range_n) { m -= range_n[0]; n = range_n[1] - range_n[0]; offset = range_n[0]; a += range_n[0] * (lda + 1) * COMPSIZE; } if (m <= 0 || n <= 0) return 0; newarg.c = ipiv; newarg.lda = lda; newarg.common = (void *)job; info = 0; mn = MIN(m, n); init_bk = (mn / 2 + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1); if (init_bk > GEMM_Q) init_bk = GEMM_Q; if (init_bk <= GEMM_UNROLL_N) { info = GETF2(args, NULL, range_n, sa, sb, 0); return info; } next_bk = init_bk; bk = mn; if (bk > next_bk) bk = next_bk; range_n_new[0] = offset; range_n_new[1] = offset + bk; iinfo = CNAME(args, NULL, range_n_new, sa, sb, 0); if (iinfo && !info) info = iinfo; TRSM_ILTCOPY(bk, bk, a, lda, 0, sb); sbb = (FLOAT *)((((long)(sb + bk * bk * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B); is = 0; num_cpu = 0; while (is < mn) { width = (FORMULA1(m, n, is, bk, args -> nthreads) + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1); if (width > mn - is - bk) width = mn - 
is - bk; if (width < bk) { next_bk = (FORMULA2(m, n, is, bk, args -> nthreads) + GEMM_UNROLL_N) & ~(GEMM_UNROLL_N - 1); if (next_bk > bk) next_bk = bk; width = next_bk; if (width > mn - is - bk) width = mn - is - bk; } if (num_cpu > 0) exec_blas_async_wait(num_cpu, &queue[0]); mm = m - bk - is; nn = n - bk - is; newarg.a = sb; newarg.b = a + (is + is * lda) * COMPSIZE; newarg.d = (void *)flag; newarg.m = mm; newarg.n = nn; newarg.k = bk; newarg.ldb = is + offset; nn -= width; range_n_mine[0] = 0; range_n_mine[1] = width; range_N[0] = width; range_M[0] = 0; num_cpu = 0; while (nn > 0){ if (mm >= nn) { width = blas_quickdivide(nn + args -> nthreads - num_cpu, args -> nthreads - num_cpu - 1); if (nn < width) width = nn; nn -= width; range_N[num_cpu + 1] = range_N[num_cpu] + width; width = blas_quickdivide(mm + args -> nthreads - num_cpu, args -> nthreads - num_cpu - 1); if (mm < width) width = mm; if (nn <= 0) width = mm; mm -= width; range_M[num_cpu + 1] = range_M[num_cpu] + width; } else { width = blas_quickdivide(mm + args -> nthreads - num_cpu, args -> nthreads - num_cpu - 1); if (mm < width) width = mm; mm -= width; range_M[num_cpu + 1] = range_M[num_cpu] + width; width = blas_quickdivide(nn + args -> nthreads - num_cpu, args -> nthreads - num_cpu - 1); if (nn < width) width = nn; if (mm <= 0) width = nn; nn -= width; range_N[num_cpu + 1] = range_N[num_cpu] + width; } queue[num_cpu].mode = mode; queue[num_cpu].routine = inner_advanced_thread; queue[num_cpu].args = &newarg; queue[num_cpu].range_m = &range_M[num_cpu]; queue[num_cpu].range_n = &range_N[0]; queue[num_cpu].sa = NULL; queue[num_cpu].sb = NULL; queue[num_cpu].next = &queue[num_cpu + 1]; flag[num_cpu * CACHE_LINE_SIZE] = 1; num_cpu ++; } newarg.nthreads = num_cpu; if (num_cpu > 0) { for (j = 0; j < num_cpu; j++) { for (i = 0; i < num_cpu; i++) { for (k = 0; k < DIVIDE_RATE; k++) { job[j].working[i][CACHE_LINE_SIZE * k] = 0; } } } } is += bk; bk = mn - is; if (bk > next_bk) bk = next_bk; range_n_new[0] = 
offset + is; range_n_new[1] = offset + is + bk; if (num_cpu > 0) { queue[num_cpu - 1].next = NULL; exec_blas_async(0, &queue[0]); inner_basic_thread(&newarg, NULL, range_n_mine, sa, sbb, -1); iinfo = GETRF_SINGLE(args, NULL, range_n_new, sa, sbb, 0); if (iinfo && !info) info = iinfo + is; for (i = 0; i < num_cpu; i ++) while (flag[i * CACHE_LINE_SIZE]) {}; TRSM_ILTCOPY(bk, bk, a + (is + is * lda) * COMPSIZE, lda, 0, sb); } else { inner_basic_thread(&newarg, NULL, range_n_mine, sa, sbb, -1); iinfo = GETRF_SINGLE(args, NULL, range_n_new, sa, sbb, 0); if (iinfo && !info) info = iinfo + is; } } next_bk = init_bk; is = 0; while (is < mn) { bk = mn - is; if (bk > next_bk) bk = next_bk; width = (FORMULA1(m, n, is, bk, args -> nthreads) + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1); if (width > mn - is - bk) width = mn - is - bk; if (width < bk) { next_bk = (FORMULA2(m, n, is, bk, args -> nthreads) + GEMM_UNROLL_N) & ~(GEMM_UNROLL_N - 1); if (next_bk > bk) next_bk = bk; } blas_level1_thread(mode, bk, is + bk + offset + 1, mn + offset, (void *)dummyalpha, a + (- offset + is * lda) * COMPSIZE, lda, NULL, 0, ipiv, 1, (void *)LASWP_PLUS, args -> nthreads); is += bk; } return info; }
blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) { BLASLONG n, lda; FLOAT *a; BLASLONG info; BLASLONG bk, j, blocking; BLASLONG is, min_i; BLASLONG js, min_j; BLASLONG range_N[2]; FLOAT *sb2 = (FLOAT *)((((BLASLONG)sb + GEMM_PQ * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B); #ifdef SHARED_ARRAY FLOAT *aa; #endif n = args -> n; a = (FLOAT *)args -> a; lda = args -> lda; if (range_n) { n = range_n[1] - range_n[0]; a += range_n[0] * (lda + 1) * COMPSIZE; } if (n <= DTB_ENTRIES / 2) { info = POTF2_L(args, NULL, range_n, sa, sb, 0); return info; } blocking = GEMM_Q; if (n <= 4 * GEMM_Q) blocking = n / 4; for (j = 0; j < n; j += blocking) { bk = n - j; if (bk > blocking) bk = blocking; if (!range_n) { range_N[0] = j; range_N[1] = j + bk; } else { range_N[0] = range_n[0] + j; range_N[1] = range_n[0] + j + bk; } info = CNAME(args, NULL, range_N, sa, sb, 0); if (info) return info + j; if (n - j - bk > 0) { TRSM_OLTCOPY(bk, bk, a + (j + j * lda) * COMPSIZE, lda, 0, sb); /* First tile */ min_j = n - j - bk; if (min_j > REAL_GEMM_R) min_j = REAL_GEMM_R; for (is = j + bk; is < n; is += GEMM_P) { min_i = n - is; if (min_i > GEMM_P) min_i = GEMM_P; #ifdef SHARED_ARRAY if (is < j + bk + min_j) { aa = sb2 + bk * (is - j - bk) * COMPSIZE; } else { aa = sa; } GEMM_ITCOPY(bk, min_i, a + (is + j * lda) * COMPSIZE, lda, aa); TRSM_KERNEL(min_i, bk, bk, dm1, #ifdef COMPLEX ZERO, #endif aa, sb, a + (is + j * lda) * COMPSIZE, lda, 0); SYRK_KERNEL_L(min_i, min_j, bk, dm1, aa, sb2, a + (is + (j + bk) * lda) * COMPSIZE, lda, is - j - bk); #else GEMM_ITCOPY(bk, min_i, a + (is + j * lda) * COMPSIZE, lda, sa); TRSM_KERNEL(min_i, bk, bk, dm1, #ifdef COMPLEX ZERO, #endif sa, sb, a + (is + j * lda) * COMPSIZE, lda, 0); if (is < j + bk + min_j) { GEMM_OTCOPY(bk, min_i, a + (is + j * lda) * COMPSIZE, lda, sb2 + bk * (is - j - bk) * COMPSIZE); } SYRK_KERNEL_L(min_i, min_j, bk, dm1, sa, sb2, a + (is + (j + bk) * lda) * 
COMPSIZE, lda, is - j - bk); #endif } for(js = j + bk + min_j; js < n; js += REAL_GEMM_R){ min_j = n - js; if (min_j > REAL_GEMM_R) min_j = REAL_GEMM_R; GEMM_OTCOPY(bk, min_j, a + (js + j * lda) * COMPSIZE, lda, sb2); for (is = js; is < n; is += GEMM_P) { min_i = n - is; if (min_i > GEMM_P) min_i = GEMM_P; #ifdef SHARED_ARRAY if (is + min_i < js + min_j) { aa = sb2 + bk * (is - js) * COMPSIZE; } else { GEMM_ITCOPY(bk, min_i, a + (is + j * lda) * COMPSIZE, lda, sa); aa = sa; } SYRK_KERNEL_L(min_i, min_j, bk, dm1, aa, sb2, a + (is + js * lda) * COMPSIZE, lda, is - js); #else GEMM_ITCOPY(bk, min_i, a + (is + j * lda) * COMPSIZE, lda, sa); SYRK_KERNEL_L(min_i, min_j, bk, dm1, sa, sb2, a + (is + js * lda) * COMPSIZE, lda, - is + js); #endif } } } } return 0; }
blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) { BLASLONG n, lda; FLOAT *a; BLASLONG info; BLASLONG bk, blocking; BLASLONG is, min_i; BLASLONG jjs, min_jj; BLASLONG range_N[2]; BLASLONG j, js, min_j; #ifdef SHARED_ARRAY FLOAT *aa; #endif FLOAT *sb2 = (FLOAT *)((((BLASLONG)sb + GEMM_PQ * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B); n = args -> n; a = (FLOAT *)args -> a; lda = args -> lda; if (range_n) { n = range_n[1] - range_n[0]; a += range_n[0] * (lda + 1) * COMPSIZE; } if (n <= DTB_ENTRIES / 2) { info = POTF2_U(args, NULL, range_n, sa, sb, 0); return info; } blocking = GEMM_Q; if (n <= 4 * GEMM_Q) blocking = (n + 3) / 4; for (j = 0; j < n; j += blocking) { bk = n - j; if (bk > blocking) bk = blocking; if (!range_n) { range_N[0] = j; range_N[1] = j + bk; } else { range_N[0] = range_n[0] + j; range_N[1] = range_n[0] + j + bk; } info = CNAME(args, NULL, range_N, sa, sb, 0); if (info) return info + j; if (n - j - bk > 0) { TRSM_IUNCOPY(bk, bk, a + (j + j * lda) * COMPSIZE, lda, 0, sb); for(js = j + bk; js < n; js += REAL_GEMM_R) { min_j = n - js; if (min_j > REAL_GEMM_R) min_j = REAL_GEMM_R; for(jjs = js; jjs < js + min_j; jjs += GEMM_UNROLL_N){ min_jj = min_j + js - jjs; if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; GEMM_ONCOPY(bk, min_jj, a + (j + jjs * lda) * COMPSIZE, lda, sb2 + bk * (jjs - js) * COMPSIZE); for (is = 0; is < bk; is += GEMM_P) { min_i = bk - is; if (min_i > GEMM_P) min_i = GEMM_P; TRSM_KERNEL (min_i, min_jj, bk, dm1, #ifdef COMPLEX ZERO, #endif sb + bk * is * COMPSIZE, sb2 + bk * (jjs - js) * COMPSIZE, a + (j + is + jjs * lda) * COMPSIZE, lda, is); } } for (is = j + bk; is < js + min_j; is += min_i) { min_i = js + min_j - is; if (min_i >= GEMM_P * 2) { min_i = GEMM_P; } else if (min_i > GEMM_P) { min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1); } #ifdef SHARED_ARRAY if ((is >= js) && (is + min_i <= js + min_j)) { aa = sb2 + bk * (is - js) 
* COMPSIZE; } else { GEMM_INCOPY(bk, min_i, a + (j + is * lda) * COMPSIZE, lda, sa); aa = sa; } #else GEMM_INCOPY(bk, min_i, a + (j + is * lda) * COMPSIZE, lda, sa); #endif SYRK_KERNEL_U(min_i, min_j, bk, dm1, SA, sb2, a + (is + js * lda) * COMPSIZE, lda, is - js); } } } } return 0; }