static inline int syrk_beta(BLASLONG m_from, BLASLONG m_to, BLASLONG n_from, BLASLONG n_to, FLOAT *alpha, FLOAT *c, BLASLONG ldc) { BLASLONG i; #ifndef LOWER if (m_from > n_from) n_from = m_from; if (m_to > n_to ) m_to = n_to; #else if (m_from < n_from) m_from = n_from; if (m_to < n_to ) n_to = m_to; #endif c += (m_from + n_from * ldc) * COMPSIZE; m_to -= m_from; n_to -= n_from; for (i = 0; i < n_to; i++){ #ifndef LOWER SCAL_K(MIN(i + n_from - m_from + 1, m_to), 0, 0, alpha[0], #ifdef COMPLEX alpha[1], #endif c, 1, NULL, 0, NULL, 0); c += ldc * COMPSIZE; #else SCAL_K(MIN(m_to - i + m_from - n_from, m_to), 0, 0, alpha[0], #ifdef COMPLEX alpha[1], #endif c, 1, NULL, 0, NULL, 0); if (i < m_from - n_from) { c += ldc * COMPSIZE; } else { c += (1 + ldc) * COMPSIZE; } #endif } return 0; }
int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT beta, FLOAT *b, BLASLONG ldb) { BLASLONG i; FLOAT *aptr,*bptr; if ( rows <= 0 ) return(0); if ( cols <= 0 ) return(0); aptr = a; bptr = b; if ( alpha == 0.0 ) { for ( i=0; i<cols ; i++ ) { SCAL_K(rows, 0,0, beta, bptr, 1, NULL, 0,NULL,0); bptr+=ldb; } return(0); } for (i = 0; i < cols; i++) { AXPBY_K(rows, alpha, aptr, 1, beta, bptr, 1); aptr += lda; bptr += ldb; } return(0); }
blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) { BLASLONG n, lda; FLOAT *a; FLOAT aii; BLASLONG i; n = args -> n; a = (FLOAT *)args -> a; lda = args -> lda; if (range_n) { n = range_n[1] - range_n[0]; a += range_n[0] * (lda + 1) * COMPSIZE; } for (i = 0; i < n; i++) { SCAL_K(i + 1, 0, 0, *(a + i + i * lda), a + i, lda, NULL, 0, NULL, 0); if (i < n - 1) { aii = DOTU_K(n - i - 1, a + i + 1 + i * lda, 1, a + i + 1 + i * lda, 1); *(a + i + i * lda) += aii; GEMV_T(n - i - 1, i, 0, dp1, a + (i + 1) , lda, a + (i + 1) + i * lda, 1, a + i , lda, sb); } } return 0; }
blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) { BLASLONG n, lda; FLOAT *a; FLOAT ajj_r, ajj_i; #ifndef UNIT FLOAT ratio, den; #endif BLASLONG j; n = args -> n; a = (FLOAT *)args -> a; lda = args -> lda; if (range_n) { n = range_n[1] - range_n[0]; a += range_n[0] * (lda + 1) * COMPSIZE; } for (j = 0; j < n; j++) { ajj_r = ONE; ajj_i = ZERO; #ifndef UNIT ajj_r = *(a + (j + j * lda) * COMPSIZE + 0); ajj_i = *(a + (j + j * lda) * COMPSIZE + 1); if (fabs(ajj_r) >= fabs(ajj_i)){ ratio = ajj_i / ajj_r; den = 1. / (ajj_r * ( 1 + ratio * ratio)); ajj_r = den; ajj_i = -ratio * den; } else { ratio = ajj_r / ajj_i; den = 1. /(ajj_i * ( 1 + ratio * ratio)); ajj_r = ratio * den; ajj_i = -den; } *(a + (j + j * lda) * COMPSIZE + 0) = ajj_r; *(a + (j + j * lda) * COMPSIZE + 1) = ajj_i; #endif ZTRMV (j, a , lda, a + j * lda * COMPSIZE, 1, sb); SCAL_K(j, 0, 0, -ajj_r, -ajj_i, a + j * lda * COMPSIZE, 1, NULL, 0, NULL, 0); } return 0; }
blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) { BLASLONG n, lda; FLOAT *a; FLOAT ajj[2]; FLOAT *aoffset; BLASLONG i, j; n = args -> n; a = (FLOAT *)args -> a; lda = args -> lda; if (range_n) { n = range_n[1] - range_n[0]; a += range_n[0] * (lda + 1) * COMPSIZE; } aoffset = a; for (j = 0; j < n; j++) { ajj[0] = DOTC_K(j, a + j * 2, lda, a + j * 2, lda); GET_IMAGE(ajj[1]); ajj[0] = *(aoffset + j * 2) - ajj[0]; if (ajj[0] <= 0){ *(aoffset + j * 2 + 0) = ajj[0]; *(aoffset + j * 2 + 1) = ZERO; return j + 1; } ajj[0] = SQRT(ajj[0]); *(aoffset + j * 2 + 0) = ajj[0]; *(aoffset + j * 2 + 1) = ZERO; i = n - j - 1; if (i > 0) { GEMV_O(i, j, 0, dm1, ZERO, a + (j + 1) * 2, lda, a + j * 2, lda, aoffset + (j + 1) * 2, 1, sb); SCAL_K(i, 0, 0, ONE / ajj[0], ZERO, aoffset + (j + 1) * 2, 1, NULL, 0, NULL, 0); } aoffset += lda * 2; } return 0; }
blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) { BLASLONG n, lda; FLOAT *a; FLOAT temp[2]; BLASLONG i; n = args -> n; a = (FLOAT *)args -> a; lda = args -> lda; if (range_n) { n = range_n[1] - range_n[0]; a += range_n[0] * (lda + 1) * COMPSIZE; } for (i = 0; i < n; i++) { SCAL_K(i + 1, 0, 0, *(a + (i + i * lda) * COMPSIZE + 0), ZERO, a + i * COMPSIZE, lda, NULL, 0, NULL, 0); if (i < n - 1) { temp[0] = DOTC_K(n - i - 1, a + (i + 1 + i * lda) * COMPSIZE, 1, a + (i + 1 + i * lda) * COMPSIZE, 1); GET_IMAGE(temp[1]); *(a + (i + i * lda) * COMPSIZE + 0) += temp[0]; *(a + (i + i * lda) * COMPSIZE + 1) = ZERO; GEMV_U(n - i - 1, i, 0, dp1, ZERO, a + ((i + 1) ) * COMPSIZE, lda, a + ((i + 1) + i * lda) * COMPSIZE, 1, a + ( i ) * COMPSIZE , lda, sb); } } return 0; }
blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) { BLASLONG n, lda; FLOAT *a; FLOAT ajj; BLASLONG j; n = args -> n; a = (FLOAT *)args -> a; lda = args -> lda; if (range_n) { n = range_n[1] - range_n[0]; a += range_n[0] * (lda + 1) * COMPSIZE; } for (j = 0; j < n; j++) { ajj = ONE; #ifndef UNIT ajj /= *(a + j + j * lda); *(a + j + j * lda) = ajj; #endif TRMV (j, a , lda, a + j * lda, 1, sb); SCAL_K(j, 0, 0, -ajj, a + j * lda, 1, NULL, 0, NULL, 0); } return 0; }
blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) { BLASLONG m, n, lda, offset; blasint *ipiv; FLOAT *a; FLOAT temp1, temp2, temp3, temp4, ratio, den; blasint i, j; blasint ip, jp; blasint info; BLASLONG len; FLOAT *b; m = args -> m; n = args -> n; a = (FLOAT *)args -> a; lda = args -> lda; ipiv = (blasint *)args -> c; offset = 0; if (range_n) { m -= range_n[0]; n = range_n[1] - range_n[0]; offset = range_n[0]; a += range_n[0] * (lda + 1) * COMPSIZE; } info = 0; b = a; for (j = 0; j < n; j++) { len = MIN(j, m); for (i = 0; i < len; i++) { ip = ipiv[i + offset] - 1 - offset; if (ip != i) { temp1 = *(b + i * 2 + 0); temp2 = *(b + i * 2 + 1); temp3 = *(b + ip * 2 + 0); temp4 = *(b + ip * 2 + 1); *(b + i * 2 + 0) = temp3; *(b + i * 2 + 1) = temp4; *(b + ip * 2 + 0) = temp1; *(b + ip * 2 + 1) = temp2; } } ZTRSV_NLU(len, a, lda, b, 1, sb); if (j < m) { GEMV_N(m - j, j, 0, dm1, ZERO, a + j * 2, lda, b, 1, b + j * 2, 1, sb); jp = j + IAMAX_K(m - j, b + j * 2, 1); ipiv[j + offset] = jp + offset; jp--; temp1 = *(b + jp * 2 + 0); temp2 = *(b + jp * 2 + 1); if ((temp1 != ZERO) || (temp2 != ZERO)) { if (jp != j) { SWAP_K(j + 1, 0, 0, ZERO, ZERO, a + j * 2, lda, a + jp * 2, lda, NULL, 0); } if (fabs(temp1) >= fabs(temp2)){ ratio = temp2 / temp1; den = dp1 /(temp1 * ( 1 + ratio * ratio)); temp3 = den; temp4 = -ratio * den; } else { ratio = temp1 / temp2; den = dp1 /(temp2 * ( 1 + ratio * ratio)); temp3 = ratio * den; temp4 = -den; } if (j + 1 < m) { SCAL_K(m - j - 1, 0, 0, temp3, temp4, b + (j + 1) * 2, 1, NULL, 0, NULL, 0); } } else { if (!info) info = j + 1; } } b += lda * 2; } return info; }
void NAME(blasint *N, FLOAT *ALPHA, FLOAT *x, blasint *INCX){ blasint n = *N; blasint incx = *INCX; FLOAT alpha = *ALPHA; #else void CNAME(blasint n, FLOAT alpha, FLOAT *x, blasint incx){ #endif #ifdef SMP int mode, nthreads; #endif #ifndef CBLAS PRINT_DEBUG_NAME; #else PRINT_DEBUG_CNAME; #endif if (incx <= 0 || n <= 0) return; if (alpha == ONE) return; IDEBUG_START; FUNCTION_PROFILE_START(); #ifdef SMP nthreads = num_cpu_avail(1); if (nthreads == 1) { #endif SCAL_K(n, 0, 0, alpha, x, incx, NULL, 0, NULL, 0); #ifdef SMP } else { #ifdef DOUBLE mode = BLAS_DOUBLE | BLAS_REAL; #else mode = BLAS_SINGLE | BLAS_REAL; #endif blas_level1_thread(mode, n, 0, 0, #ifndef CBLAS ALPHA, #else &alpha, #endif x, incx, NULL, 0, NULL, 0, (void *)SCAL_K, nthreads); } #endif FUNCTION_PROFILE_END(1, n, n); IDEBUG_END; return; }
static int tpmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *dummy1, FLOAT *buffer, BLASLONG pos){ FLOAT *a, *x, *y; BLASLONG incx; BLASLONG m_from, m_to; BLASLONG i; #ifdef TRANS #ifndef COMPLEX FLOAT result; #else OPENBLAS_COMPLEX_FLOAT result; #endif #endif #if defined(COMPLEX) && !defined(UNIT) FLOAT ar, ai, xr, xi; #endif a = (FLOAT *)args -> a; x = (FLOAT *)args -> b; y = (FLOAT *)args -> c; incx = args -> ldb; m_from = 0; m_to = args -> m; if (range_m) { m_from = *(range_m + 0); m_to = *(range_m + 1); } if (incx != 1) { #ifndef LOWER COPY_K(m_to, x, incx, buffer, 1); #else COPY_K(args -> m - m_from, x + m_from * incx * COMPSIZE, incx, buffer + m_from * COMPSIZE, 1); #endif x = buffer; buffer += ((COMPSIZE * args -> m + 1023) & ~1023); } #ifndef TRANS if (range_n) y += *range_n * COMPSIZE; #ifndef LOWER SCAL_K(m_to, 0, 0, ZERO, #ifdef COMPLEX ZERO, #endif y, 1, NULL, 0, NULL, 0); #else SCAL_K(args -> m - m_from, 0, 0, ZERO, #ifdef COMPLEX ZERO, #endif y + m_from * COMPSIZE, 1, NULL, 0, NULL, 0); #endif #else SCAL_K(m_to - m_from, 0, 0, ZERO, #ifdef COMPLEX ZERO, #endif y + m_from * COMPSIZE, 1, NULL, 0, NULL, 0); #endif #ifndef LOWER a += (m_from + 1) * m_from / 2 * COMPSIZE; #else a += (2 * args -> m - m_from - 1) * m_from / 2 * COMPSIZE; #endif for (i = m_from; i < m_to; i++) { #ifndef LOWER if (i > 0) { #ifndef TRANS MYAXPY(i, 0, 0, *(x + i * COMPSIZE + 0), #ifdef COMPLEX *(x + i * COMPSIZE + 1), #endif a, 1, y, 1, NULL, 0); #else result = MYDOT(i, a, 1, x, 1); #ifndef COMPLEX *(y + i * COMPSIZE + 0) += result; #else *(y + i * COMPSIZE + 0) += CREAL(result); *(y + i * COMPSIZE + 1) += CIMAG(result); #endif #endif } #endif #ifndef COMPLEX #ifdef UNIT *(y + i * COMPSIZE) += *(x + i * COMPSIZE); #else *(y + i * COMPSIZE) += *(a + i * COMPSIZE) * *(x + i * COMPSIZE); #endif #else #ifdef UNIT *(y + i * COMPSIZE + 0) += *(x + i * COMPSIZE + 0); *(y + i * COMPSIZE + 1) += *(x + i * COMPSIZE + 1); #else ar = *(a + i * COMPSIZE + 0); ai = *(a + i * COMPSIZE + 1); xr = *(x + i * COMPSIZE + 0); xi = *(x + i * COMPSIZE + 1); #if (TRANSA == 1) || (TRANSA == 2) *(y + i * COMPSIZE + 0) += ar * xr - ai * xi; *(y + i * COMPSIZE + 1) += ar * xi + ai * xr; #else *(y + i * COMPSIZE + 0) += ar * xr + ai * xi; *(y + i * COMPSIZE + 1) += ar * xi - ai * xr; #endif #endif #endif #ifdef LOWER if (args -> m > i + 1) { #ifndef TRANS MYAXPY(args -> m - i - 1, 0, 0, *(x + i * COMPSIZE + 0), #ifdef COMPLEX *(x + i * COMPSIZE + 1), #endif a + (i + 1 ) * COMPSIZE, 1, y + (i + 1) * COMPSIZE, 1, NULL, 0); #else result = MYDOT(args -> m - i - 1, a + (i + 1) * COMPSIZE, 1, x + (i + 1) * COMPSIZE, 1); #ifndef COMPLEX *(y + i * COMPSIZE + 0) += result; #else *(y + i * COMPSIZE + 0) += CREAL(result); *(y + i * COMPSIZE + 1) += CIMAG(result); #endif #endif } #endif #ifndef LOWER a += (i + 1) * COMPSIZE; #else a += (args -> m - i - 1) * COMPSIZE; #endif } return 0; }
void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, FLOAT *a, blasint *LDA, FLOAT *x, blasint *INCX, FLOAT *BETA, FLOAT *y, blasint *INCY){ char uplo_arg = *UPLO; blasint n = *N; FLOAT alpha = *ALPHA; blasint lda = *LDA; blasint incx = *INCX; FLOAT beta = *BETA; blasint incy = *INCY; int (*symv[])(BLASLONG, BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *) = { SYMV_U, SYMV_L, }; #ifdef SMP int (*symv_thread[])(BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, int) = { SYMV_THREAD_U, SYMV_THREAD_L, }; #endif blasint info; int uplo; FLOAT *buffer; #ifdef SMP int nthreads; #endif PRINT_DEBUG_NAME; TOUPPER(uplo_arg); uplo = -1; if (uplo_arg == 'U') uplo = 0; if (uplo_arg == 'L') uplo = 1; info = 0; if (incy == 0) info = 10; if (incx == 0) info = 7; if (lda < MAX(1, n)) info = 5; if (n < 0) info = 2; if (uplo < 0) info = 1; if (info != 0) { BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } #else void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT alpha, FLOAT *a, blasint lda, FLOAT *x, blasint incx, FLOAT beta, FLOAT *y, blasint incy) { FLOAT *buffer; int uplo; blasint info; #ifdef SMP int nthreads; #endif int (*symv[])(BLASLONG, BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *) = { SYMV_U, SYMV_L, }; #ifdef SMP int (*symv_thread[])(BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, int) = { SYMV_THREAD_U, SYMV_THREAD_L, }; #endif PRINT_DEBUG_CNAME; uplo = -1; info = 0; if (order == CblasColMajor) { if (Uplo == CblasUpper) uplo = 0; if (Uplo == CblasLower) uplo = 1; info = -1; if (incy == 0) info = 10; if (incx == 0) info = 7; if (lda < MAX(1, n)) info = 5; if (n < 0) info = 2; if (uplo < 0) info = 1; } if (order == CblasRowMajor) { if (Uplo == CblasUpper) uplo = 1; if (Uplo == CblasLower) uplo = 0; info = -1; if (incy == 0) info = 10; if (incx == 0) info = 7; if (lda < MAX(1, n)) info = 5; if (n < 0) info = 2; if (uplo < 0) info = 1; } if (info >= 0) { BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } #endif if (n == 0) return; if (beta != ONE) SCAL_K(n, 0, 0, beta, y, abs(incy), NULL, 0, NULL, 0); if (alpha == ZERO) return; IDEBUG_START; FUNCTION_PROFILE_START(); if (incx < 0 ) x -= (n - 1) * incx; if (incy < 0 ) y -= (n - 1) * incy; buffer = (FLOAT *)blas_memory_alloc(1); #ifdef SMP nthreads = num_cpu_avail(2); if (nthreads == 1) { #endif (symv[uplo])(n, n, alpha, a, lda, x, incx, y, incy, buffer); #ifdef SMP } else { (symv_thread[uplo])(n, alpha, a, lda, x, incx, y, incy, buffer, nthreads); } #endif blas_memory_free(buffer); FUNCTION_PROFILE_END(1, n * n / 2 + 2 * n, 2 * n * n); IDEBUG_END; return; }
void NAME(char *TRANS, blasint *M, blasint *N, FLOAT *ALPHA, FLOAT *a, blasint *LDA, FLOAT *x, blasint *INCX, FLOAT *BETA, FLOAT *y, blasint *INCY){ char trans = *TRANS; blasint m = *M; blasint n = *N; blasint lda = *LDA; blasint incx = *INCX; blasint incy = *INCY; FLOAT *buffer; #ifdef SMP int nthreads; #endif int (*gemv[])(BLASLONG, BLASLONG, BLASLONG, FLOAT, FLOAT, FLOAT *, BLASLONG, FLOAT * , BLASLONG, FLOAT *, BLASLONG, FLOAT *) = { GEMV_N, GEMV_T, GEMV_R, GEMV_C, GEMV_O, GEMV_U, GEMV_S, GEMV_D, }; blasint info; blasint lenx, leny; blasint i; PRINT_DEBUG_NAME; FLOAT alpha_r = *(ALPHA + 0); FLOAT alpha_i = *(ALPHA + 1); FLOAT beta_r = *(BETA + 0); FLOAT beta_i = *(BETA + 1); TOUPPER(trans); info = 0; i = -1; if (trans == 'N') i = 0; if (trans == 'T') i = 1; if (trans == 'R') i = 2; if (trans == 'C') i = 3; if (trans == 'O') i = 4; if (trans == 'U') i = 5; if (trans == 'S') i = 6; if (trans == 'D') i = 7; if (incy == 0) info = 11; if (incx == 0) info = 8; if (lda < MAX(1,m)) info = 6; if (n < 0) info = 3; if (m < 0) info = 2; if (i < 0) info = 1; trans = i; if (info != 0) { BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } #else void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint m, blasint n, FLOAT *ALPHA, FLOAT *a, blasint lda, FLOAT *x, blasint incx, FLOAT *BETA, FLOAT *y, blasint incy){ FLOAT *buffer; blasint lenx, leny; int trans; blasint info, t; #ifdef SMP int nthreads; #endif int (*gemv[])(BLASLONG, BLASLONG, BLASLONG, FLOAT, FLOAT, FLOAT *, BLASLONG, FLOAT * , BLASLONG, FLOAT *, BLASLONG, FLOAT *) = { GEMV_N, GEMV_T, GEMV_R, GEMV_C, GEMV_O, GEMV_U, GEMV_S, GEMV_D, }; PRINT_DEBUG_CNAME; FLOAT alpha_r = *(ALPHA + 0); FLOAT alpha_i = *(ALPHA + 1); FLOAT beta_r = *(BETA + 0); FLOAT beta_i = *(BETA + 1); trans = -1; info = 0; if (order == CblasColMajor) { if (TransA == CblasNoTrans) trans = 0; if (TransA == CblasTrans) trans = 1; if (TransA == CblasConjNoTrans) trans = 2; if (TransA == CblasConjTrans) trans = 3; info = -1; if (incy == 0) info = 11; if (incx == 0) info = 8; if (lda < MAX(1, m)) info = 6; if (n < 0) info = 3; if (m < 0) info = 2; if (trans < 0) info = 1; } if (order == CblasRowMajor) { if (TransA == CblasNoTrans) trans = 1; if (TransA == CblasTrans) trans = 0; if (TransA == CblasConjNoTrans) trans = 3; if (TransA == CblasConjTrans) trans = 2; info = -1; t = n; n = m; m = t; if (incy == 0) info = 11; if (incx == 0) info = 8; if (lda < MAX(1, m)) info = 6; if (n < 0) info = 3; if (m < 0) info = 2; if (trans < 0) info = 1; } if (info >= 0) { BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } #endif /* Quick return if possible. */ if (m == 0 || n == 0) return; lenx = n; leny = m; if (trans & 1) lenx = m; if (trans & 1) leny = n; if (beta_r != ONE || beta_i != ZERO) SCAL_K(leny, 0, 0, beta_r, beta_i, y, abs(incy), NULL, 0, NULL, 0); if (alpha_r == ZERO && alpha_i == ZERO) return; IDEBUG_START; FUNCTION_PROFILE_START(); if (incx < 0) x -= (lenx - 1) * incx * 2; if (incy < 0) y -= (leny - 1) * incy * 2; buffer = (FLOAT *)blas_memory_alloc(1); #ifdef SMP nthreads = num_cpu_avail(2); if (nthreads == 1) { #endif (gemv[(int)trans])(m, n, 0, alpha_r, alpha_i, a, lda, x, incx, y, incy, buffer); #ifdef SMP } else { (gemv_thread[(int)trans])(m, n, ALPHA, a, lda, x, incx, y, incy, buffer, nthreads); } #endif blas_memory_free(buffer); FUNCTION_PROFILE_END(4, m * n + m + n, 2 * m * n); IDEBUG_END; return; }
void NAME(char *TRANS, blasint *M, blasint *N, blasint *KU, blasint *KL, FLOAT *ALPHA, FLOAT *a, blasint *LDA, FLOAT *x, blasint *INCX, FLOAT *BETA, FLOAT *y, blasint *INCY){ char trans = *TRANS; blasint m = *M; blasint n = *N; blasint ku = *KU; blasint kl = *KL; blasint lda = *LDA; blasint incx = *INCX; blasint incy = *INCY; FLOAT *buffer; #ifdef SMP int nthreads; #endif FLOAT alpha_r = ALPHA[0]; FLOAT alpha_i = ALPHA[1]; FLOAT beta_r = BETA[0]; FLOAT beta_i = BETA[1]; blasint info; blasint lenx, leny; blasint i; PRINT_DEBUG_NAME; TOUPPER(trans); info = 0; i = -1; if (trans == 'N') i = 0; if (trans == 'T') i = 1; if (trans == 'R') i = 2; if (trans == 'C') i = 3; if (trans == 'O') i = 4; if (trans == 'U') i = 5; if (trans == 'S') i = 6; if (trans == 'D') i = 7; if (incy == 0) info = 13; if (incx == 0) info = 10; if (lda < kl + ku + 1) info = 8; if (kl < 0) info = 5; if (ku < 0) info = 4; if (n < 0) info = 3; if (m < 0) info = 2; if (i < 0) info = 1; trans = i; if (info != 0){ BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } #else void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint m, blasint n, blasint ku, blasint kl, FLOAT *ALPHA, FLOAT *a, blasint lda, FLOAT *x, blasint incx, FLOAT *BETA, FLOAT *y, blasint incy){ FLOAT alpha_r = ALPHA[0]; FLOAT alpha_i = ALPHA[1]; FLOAT beta_r = BETA[0]; FLOAT beta_i = BETA[1]; FLOAT *buffer; blasint lenx, leny; int trans; blasint info, t; #ifdef SMP int nthreads; #endif PRINT_DEBUG_CNAME; trans = -1; info = 0; if (order == CblasColMajor) { if (TransA == CblasNoTrans) trans = 0; if (TransA == CblasTrans) trans = 1; if (TransA == CblasConjNoTrans) trans = 2; if (TransA == CblasConjTrans) trans = 3; info = -1; if (incy == 0) info = 13; if (incx == 0) info = 10; if (lda < kl + ku + 1) info = 8; if (kl < 0) info = 5; if (ku < 0) info = 4; if (n < 0) info = 3; if (m < 0) info = 2; if (trans < 0) info = 1; } if (order == CblasRowMajor) { if (TransA == CblasNoTrans) trans = 1; if (TransA == CblasTrans) trans = 0; if (TransA == CblasConjNoTrans) trans = 3; if (TransA == CblasConjTrans) trans = 2; info = -1; t = n; n = m; m = t; t = ku; ku = kl; kl = t; if (incy == 0) info = 13; if (incx == 0) info = 10; if (lda < kl + ku + 1) info = 8; if (kl < 0) info = 5; if (ku < 0) info = 4; if (n < 0) info = 3; if (m < 0) info = 2; if (trans < 0) info = 1; } if (info >= 0) { BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } #endif if ((m==0) || (n==0)) return; lenx = n; leny = m; if (trans & 1) lenx = m; if (trans & 1) leny = n; if (beta_r != ONE || beta_i != ZERO) SCAL_K(leny, 0, 0, beta_r, beta_i, y, abs(incy), NULL, 0, NULL, 0); if (alpha_r == ZERO && alpha_i == ZERO) return; IDEBUG_START; FUNCTION_PROFILE_START(); if (incx < 0) x -= (lenx - 1) * incx * 2; if (incy < 0) y -= (leny - 1) * incy * 2; buffer = (FLOAT *)blas_memory_alloc(1); #ifdef SMP nthreads = num_cpu_avail(2); if (nthreads == 1) { #endif (gbmv[(int)trans])(m, n, kl, ku, alpha_r, alpha_i, a, lda, x, incx, y, incy, buffer); #ifdef SMP } else { (gbmv_thread[(int)trans])(m, n, kl, ku, ALPHA, a, lda, x, incx, y, incy, buffer, nthreads); } #endif blas_memory_free(buffer); FUNCTION_PROFILE_END(4, m * n / 2 + n, m * n); IDEBUG_END; return; }
static int sbmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *dummy1, FLOAT *buffer, BLASLONG pos){ FLOAT *a, *x, *y; BLASLONG lda, incx; BLASLONG n, k, n_from, n_to; BLASLONG i, length; #ifndef COMPLEX FLOAT result; #else FLOAT _Complex result; #endif a = (FLOAT *)args -> a; x = (FLOAT *)args -> b; lda = args -> lda; incx = args -> ldb; n = args -> n; k = args -> k; n_from = 0; n_to = n; //Use y as each thread's n* COMPSIZE elements in sb buffer y = buffer; buffer += ((COMPSIZE * n + 1023) & ~1023); if (range_m) { n_from = *(range_m + 0); n_to = *(range_m + 1); a += n_from * lda * COMPSIZE; } if (incx != 1) { COPY_K(n, x, incx, buffer, 1); x = buffer; buffer += ((COMPSIZE * n + 1023) & ~1023); } SCAL_K(n, 0, 0, ZERO, #ifdef COMPLEX ZERO, #endif y, 1, NULL, 0, NULL, 0); for (i = n_from; i < n_to; i++) { #ifndef LOWER length = i; if (length > k) length = k; MYAXPY(length, 0, 0, *(x + i * COMPSIZE + 0), #ifdef COMPLEX *(x + i * COMPSIZE + 1), #endif a + (k - length) * COMPSIZE, 1, y + (i - length) * COMPSIZE, 1, NULL, 0); #if !defined(HEMV) && !defined(HEMVREV) result = MYDOT(length + 1, a + (k - length) * COMPSIZE, 1, x + (i - length) * COMPSIZE, 1); #else result = MYDOT(length , a + (k - length) * COMPSIZE, 1, x + (i - length) * COMPSIZE, 1); #endif #ifndef COMPLEX *(y + i * COMPSIZE + 0) += result; #else #if !defined(HEMV) && !defined(HEMVREV) *(y + i * COMPSIZE + 0) += CREAL(result); *(y + i * COMPSIZE + 1) += CIMAG(result); #else *(y + i * COMPSIZE + 0) += CREAL(result) + *(a + k * COMPSIZE) * *(x + i * COMPSIZE + 0); *(y + i * COMPSIZE + 1) += CIMAG(result) + *(a + k * COMPSIZE) * *(x + i * COMPSIZE + 1); #endif #endif #else length = k; if (n - i - 1 < k) length = n - i - 1; MYAXPY(length, 0, 0, *(x + i * COMPSIZE + 0), #ifdef COMPLEX *(x + i * COMPSIZE + 1), #endif a + COMPSIZE, 1, y + (i + 1) * COMPSIZE, 1, NULL, 0); #if !defined(HEMV) && !defined(HEMVREV) result = MYDOT(length + 1, a, 1, x + i * COMPSIZE, 1); #else result = MYDOT(length , a + COMPSIZE, 1, x + (i + 1) * COMPSIZE, 1) ; #endif #ifndef COMPLEX *(y + i * COMPSIZE + 0) += result; #else #if !defined(HEMV) && !defined(HEMVREV) *(y + i * COMPSIZE + 0) += CREAL(result); *(y + i * COMPSIZE + 1) += CIMAG(result); #else *(y + i * COMPSIZE + 0) += CREAL(result) + *a * *(x + i * COMPSIZE + 0); *(y + i * COMPSIZE + 1) += CIMAG(result) + *a * *(x + i * COMPSIZE + 1); #endif #endif #endif a += lda * COMPSIZE; } return 0; }
void NAME(char *TRANS, blasint *M, blasint *N, FLOAT *ALPHA, FLOAT *a, blasint *LDA, FLOAT *x, blasint *INCX, FLOAT *BETA, FLOAT *y, blasint *INCY){ char trans = *TRANS; blasint m = *M; blasint n = *N; blasint lda = *LDA; blasint incx = *INCX; blasint incy = *INCY; FLOAT alpha = *ALPHA; FLOAT beta = *BETA; FLOAT *buffer; #ifdef SMP int nthreads; #endif int (*gemv[])(BLASLONG, BLASLONG, BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT * , BLASLONG, FLOAT *, BLASLONG, FLOAT *) = { GEMV_N, GEMV_T, }; blasint info; blasint lenx, leny; blasint i; PRINT_DEBUG_NAME; TOUPPER(trans); info = 0; i = -1; if (trans == 'N') i = 0; if (trans == 'T') i = 1; if (trans == 'R') i = 0; if (trans == 'C') i = 1; if (incy == 0) info = 11; if (incx == 0) info = 8; if (lda < MAX(1, m)) info = 6; if (n < 0) info = 3; if (m < 0) info = 2; if (i < 0) info = 1; trans = i; if (info != 0){ BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } #else void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint m, blasint n, FLOAT alpha, FLOAT *a, blasint lda, FLOAT *x, blasint incx, FLOAT beta, FLOAT *y, blasint incy){ FLOAT *buffer; blasint lenx, leny; int trans; blasint info, t; #ifdef SMP int nthreads; #endif int (*gemv[])(BLASLONG, BLASLONG, BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT * , BLASLONG, FLOAT *, BLASLONG, FLOAT *) = { GEMV_N, GEMV_T, }; PRINT_DEBUG_CNAME; trans = -1; info = 0; if (order == CblasColMajor) { if (TransA == CblasNoTrans) trans = 0; if (TransA == CblasTrans) trans = 1; if (TransA == CblasConjNoTrans) trans = 0; if (TransA == CblasConjTrans) trans = 1; info = -1; if (incy == 0) info = 11; if (incx == 0) info = 8; if (lda < MAX(1, m)) info = 6; if (n < 0) info = 3; if (m < 0) info = 2; if (trans < 0) info = 1; } if (order == CblasRowMajor) { if (TransA == CblasNoTrans) trans = 1; if (TransA == CblasTrans) trans = 0; if (TransA == CblasConjNoTrans) trans = 1; if (TransA == CblasConjTrans) trans = 0; info = -1; t = n; n = m; m = t; if (incy == 0) info = 11; if (incx == 0) info = 8; if (lda < MAX(1, m)) info = 6; if (n < 0) info = 3; if (m < 0) info = 2; if (trans < 0) info = 1; } if (info >= 0) { BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } #endif if ((m==0) || (n==0)) return; lenx = n; leny = m; if (trans) lenx = m; if (trans) leny = n; if (beta != ONE) SCAL_K(leny, 0, 0, beta, y, abs(incy), NULL, 0, NULL, 0); if (alpha == ZERO) return; IDEBUG_START; FUNCTION_PROFILE_START(); if (incx < 0) x -= (lenx - 1) * incx; if (incy < 0) y -= (leny - 1) * incy; buffer = (FLOAT *)blas_memory_alloc(1); #ifdef SMP int nthreads_max = num_cpu_avail(2); int nthreads_avail = nthreads_max; double MNK = (double) m * (double) n; if ( MNK <= (24.0 * 24.0 * (double) (GEMM_MULTITHREAD_THRESHOLD*GEMM_MULTITHREAD_THRESHOLD) ) ) nthreads_max = 1; if ( nthreads_max > nthreads_avail ) nthreads = nthreads_avail; else nthreads = nthreads_max; if (nthreads == 1) { #endif (gemv[(int)trans])(m, n, 0, alpha, a, lda, x, incx, y, incy, buffer); #ifdef SMP } else { (gemv_thread[(int)trans])(m, n, alpha, a, lda, x, incx, y, incy, buffer, nthreads); } #endif blas_memory_free(buffer); FUNCTION_PROFILE_END(1, m * n + m + n, 2 * m * n); IDEBUG_END; return; }
void NAME(char *TRANS, blasint *M, blasint *N, FLOAT *ALPHA, FLOAT *a, blasint *LDA, FLOAT *x, blasint *INCX, FLOAT *BETA, FLOAT *y, blasint *INCY){ char trans = *TRANS; blasint m = *M; blasint n = *N; blasint lda = *LDA; blasint incx = *INCX; blasint incy = *INCY; FLOAT alpha = *ALPHA; FLOAT beta = *BETA; FLOAT *buffer; #ifdef SMP int nthreads; int nthreads_max; int nthreads_avail; double MNK; #endif int (*gemv[])(BLASLONG, BLASLONG, BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT * , BLASLONG, FLOAT *, BLASLONG, FLOAT *) = { GEMV_N, GEMV_T, }; blasint info; blasint lenx, leny; blasint i; PRINT_DEBUG_NAME; TOUPPER(trans); info = 0; i = -1; if (trans == 'N') i = 0; if (trans == 'T') i = 1; if (trans == 'R') i = 0; if (trans == 'C') i = 1; if (incy == 0) info = 11; if (incx == 0) info = 8; if (lda < MAX(1, m)) info = 6; if (n < 0) info = 3; if (m < 0) info = 2; if (i < 0) info = 1; trans = i; if (info != 0){ BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } #else void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint m, blasint n, FLOAT alpha, FLOAT *a, blasint lda, FLOAT *x, blasint incx, FLOAT beta, FLOAT *y, blasint incy){ FLOAT *buffer; blasint lenx, leny; int trans; blasint info, t; #ifdef SMP int nthreads; int nthreads_max; int nthreads_avail; double MNK; #endif int (*gemv[])(BLASLONG, BLASLONG, BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT * , BLASLONG, FLOAT *, BLASLONG, FLOAT *) = { GEMV_N, GEMV_T, }; PRINT_DEBUG_CNAME; trans = -1; info = 0; if (order == CblasColMajor) { if (TransA == CblasNoTrans) trans = 0; if (TransA == CblasTrans) trans = 1; if (TransA == CblasConjNoTrans) trans = 0; if (TransA == CblasConjTrans) trans = 1; info = -1; if (incy == 0) info = 11; if (incx == 0) info = 8; if (lda < MAX(1, m)) info = 6; if (n < 0) info = 3; if (m < 0) info = 2; if (trans < 0) info = 1; } if (order == CblasRowMajor) { if (TransA == CblasNoTrans) trans = 1; if (TransA == CblasTrans) trans = 0; if (TransA == CblasConjNoTrans) trans = 1; if (TransA == CblasConjTrans) trans = 0; info = -1; t = n; n = m; m = t; if (incy == 0) info = 11; if (incx == 0) info = 8; if (lda < MAX(1, m)) info = 6; if (n < 0) info = 3; if (m < 0) info = 2; if (trans < 0) info = 1; } if (info >= 0) { BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } #endif //printf("m=%d, n=%d, trans=%d, incx=%d, incy=%d, alpha=%f, beta=%f\n", m, n, trans, incx, incy, alpha, beta); if ((m==0) || (n==0)) return; lenx = n; leny = m; if (trans) lenx = m; if (trans) leny = n; if (beta != ONE) SCAL_K(leny, 0, 0, beta, y, abs(incy), NULL, 0, NULL, 0); if (alpha == ZERO) return; IDEBUG_START; FUNCTION_PROFILE_START(); if (incx < 0) x -= (lenx - 1) * incx; if (incy < 0) y -= (leny - 1) * incy; #ifdef MAX_STACK_ALLOC // make it volatile because some gemv implementation (ex: dgemv_n.S) // do not restore all register volatile int stack_alloc_size = 0; //for gemv_n and gemv_t, try to allocate on stack stack_alloc_size = m + n; #ifdef ALIGNED_ACCESS stack_alloc_size += 3; #endif // if(stack_alloc_size < 128) //dgemv_n.S require a 128 bytes buffer // increasing instead of capping 128 // ABI STACK for windows 288 bytes stack_alloc_size += 288 / sizeof(FLOAT) ; if(stack_alloc_size > MAX_STACK_ALLOC / sizeof(FLOAT)) stack_alloc_size = 0; // stack overflow check volatile double stack_check = 3.14159265358979323846; FLOAT stack_buffer[stack_alloc_size]; buffer = stack_alloc_size ? stack_buffer : (FLOAT *)blas_memory_alloc(1); // printf("stack_alloc_size=%d\n", stack_alloc_size); #else //Original OpenBLAS/GotoBLAS codes. buffer = (FLOAT *)blas_memory_alloc(1); #endif #ifdef SMP nthreads_max = num_cpu_avail(2); nthreads_avail = nthreads_max; MNK = (double) m * (double) n; if ( MNK <= (24.0 * 24.0 * (double) (GEMM_MULTITHREAD_THRESHOLD*GEMM_MULTITHREAD_THRESHOLD) ) ) nthreads_max = 1; if ( nthreads_max > nthreads_avail ) nthreads = nthreads_avail; else nthreads = nthreads_max; if (nthreads == 1) { #endif (gemv[(int)trans])(m, n, 0, alpha, a, lda, x, incx, y, incy, buffer); #ifdef SMP } else { (gemv_thread[(int)trans])(m, n, alpha, a, lda, x, incx, y, incy, buffer, nthreads); } #endif // stack overflow check assert(stack_check==3.14159265358979323846); #ifdef MAX_STACK_ALLOC if(!stack_alloc_size){ blas_memory_free(buffer); } #else blas_memory_free(buffer); #endif FUNCTION_PROFILE_END(1, m * n + m + n, 2 * m * n); IDEBUG_END; return; }
void NAME(char *UPLO, blasint *N, blasint *K, FLOAT *ALPHA, FLOAT *a, blasint *LDA, FLOAT *x, blasint *INCX, FLOAT *BETA, FLOAT *y, blasint *INCY){ char uplo_arg = *UPLO; blasint n = *N; blasint k = *K; FLOAT alpha_r = ALPHA[0]; FLOAT alpha_i = ALPHA[1]; blasint lda = *LDA; blasint incx = *INCX; FLOAT beta_r = BETA[0]; FLOAT beta_i = BETA[1]; blasint incy = *INCY; blasint info; int uplo; FLOAT *buffer; #ifdef SMPBUG int nthreads; #endif PRINT_DEBUG_NAME; TOUPPER(uplo_arg); uplo = -1; if (uplo_arg == 'U') uplo = 0; if (uplo_arg == 'L') uplo = 1; if (uplo_arg == 'V') uplo = 2; if (uplo_arg == 'M') uplo = 3; info = 0; if (incy == 0) info = 11; if (incx == 0) info = 8; if (lda < k + 1) info = 6; if (k < 0) info = 3; if (n < 0) info = 2; if (uplo < 0) info = 1; if (info != 0) { BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } #else void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, blasint k, FLOAT *ALPHA, FLOAT *a, blasint lda, FLOAT *x, blasint incx, FLOAT *BETA, FLOAT *y, blasint incy){ FLOAT alpha_r = ALPHA[0]; FLOAT alpha_i = ALPHA[1]; FLOAT beta_r = BETA[0]; FLOAT beta_i = BETA[1]; FLOAT *buffer; int uplo; blasint info; #ifdef SMPBUG int nthreads; #endif PRINT_DEBUG_CNAME; uplo = -1; info = 0; if (order == CblasColMajor) { if (Uplo == CblasUpper) uplo = 0; if (Uplo == CblasLower) uplo = 1; info = -1; if (incy == 0) info = 11; if (incx == 0) info = 8; if (lda < k + 1) info = 6; if (k < 0) info = 3; if (n < 0) info = 2; if (uplo < 0) info = 1; } if (order == CblasRowMajor) { if (Uplo == CblasUpper) uplo = 3; if (Uplo == CblasLower) uplo = 2; info = -1; if (incy == 0) info = 11; if (incx == 0) info = 8; if (lda < k + 1) info = 6; if (k < 0) info = 3; if (n < 0) info = 2; if (uplo < 0) info = 1; } if (info >= 0) { BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } #endif if (n == 0) return; if ((beta_r != ONE) || (beta_i != ZERO)) SCAL_K(n, 0, 0, beta_r, beta_i, y, abs(incy), NULL, 0, NULL, 0); if ((alpha_r == ZERO) && (alpha_i == ZERO)) return; IDEBUG_START; FUNCTION_PROFILE_START(); if (incx < 0 ) x -= (n - 1) * incx * COMPSIZE; if (incy < 0 ) y -= (n - 1) * incy * COMPSIZE; buffer = (FLOAT *)blas_memory_alloc(1); #ifdef SMPBUG nthreads = num_cpu_avail(2); if (nthreads == 1) { #endif (hbmv[uplo])(n, k, alpha_r, alpha_i, a, lda, x, incx, y, incy, buffer); #ifdef SMPBUG } else { (hbmv_thread[uplo])(n, k, ALPHA, a, lda, x, incx, y, incy, buffer, nthreads); } #endif blas_memory_free(buffer); FUNCTION_PROFILE_END(4, n * k / 2 + n, n * k); IDEBUG_END; return; }
void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, FLOAT *a, blasint *LDA, FLOAT *b, blasint *INCX, FLOAT *BETA, FLOAT *c, blasint *INCY){ char uplo_arg = *UPLO; blasint n = *N; FLOAT alpha_r = ALPHA[0]; FLOAT alpha_i = ALPHA[1]; blasint lda = *LDA; blasint incx = *INCX; FLOAT beta_r = BETA[0]; FLOAT beta_i = BETA[1]; blasint incy = *INCY; int (*symv[])(BLASLONG, BLASLONG, FLOAT, FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *) = { SYMV_U, SYMV_L, }; #ifdef SMP int (*symv_thread[])(BLASLONG, FLOAT *, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, int) = { SYMV_THREAD_U, SYMV_THREAD_L, }; #endif blasint info; int uplo; FLOAT *buffer; #ifdef SMP int nthreads; #endif PRINT_DEBUG_NAME; TOUPPER(uplo_arg); uplo = -1; if (uplo_arg == 'U') uplo = 0; if (uplo_arg == 'L') uplo = 1; info = 0; if (incy == 0) info = 10; if (incx == 0) info = 7; if (lda < MAX(1, n)) info = 5; if (n < 0) info = 2; if (uplo < 0) info = 1; if (info != 0) { BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } if (n == 0) return; if ((beta_r != ONE) || (beta_i != ZERO)) SCAL_K(n, 0, 0, beta_r, beta_i, c, abs(incy), NULL, 0, NULL, 0); if ((alpha_r == ZERO) && (alpha_i == ZERO)) return; IDEBUG_START; FUNCTION_PROFILE_START(); if (incx < 0 ) b -= (n - 1) * incx * COMPSIZE; if (incy < 0 ) c -= (n - 1) * incy * COMPSIZE; buffer = (FLOAT *)blas_memory_alloc(1); #ifdef SMP nthreads = num_cpu_avail(2); if (nthreads == 1) { #endif (symv[uplo])(n, n, alpha_r, alpha_i, a, lda, b, incx, c, incy, buffer); #ifdef SMP } else { (symv_thread[uplo])(n, ALPHA, a, lda, b, incx, c, incy, buffer, nthreads); } #endif blas_memory_free(buffer); FUNCTION_PROFILE_END(4, n * n / 2 + 2 * n, 2 * n * n); IDEBUG_END; return; }
static int spmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *dummy1, FLOAT *buffer, BLASLONG pos){ FLOAT *a, *x, *y; BLASLONG incx; BLASLONG m_from, m_to, i; #ifndef COMPLEX FLOAT result; #else OPENBLAS_COMPLEX_FLOAT result; #endif a = (FLOAT *)args -> a; x = (FLOAT *)args -> b; y = (FLOAT *)args -> c; incx = args -> ldb; m_from = 0; m_to = args -> m; if (range_m) { m_from = *(range_m + 0); m_to = *(range_m + 1); } if (range_n) y += *range_n * COMPSIZE; if (incx != 1) { #ifndef LOWER COPY_K(m_to, x, incx, buffer, 1); #else COPY_K(args -> m - m_from, x + m_from * incx * COMPSIZE, incx, buffer + m_from * COMPSIZE, 1); #endif x = buffer; } #ifndef LOWER SCAL_K(m_to, 0, 0, ZERO, #ifdef COMPLEX ZERO, #endif y, 1, NULL, 0, NULL, 0); #else SCAL_K(args -> m - m_from, 0, 0, ZERO, #ifdef COMPLEX ZERO, #endif y + m_from * COMPSIZE, 1, NULL, 0, NULL, 0); #endif #ifndef LOWER a += (m_from + 1) * m_from / 2 * COMPSIZE; #else a += (2 * args -> m - m_from - 1) * m_from / 2 * COMPSIZE; #endif for (i = m_from; i < m_to; i++) { #ifndef LOWER #if !defined(HEMV) && !defined(HEMVREV) result = MYDOT(i + 1, a, 1, x, 1); #else result = MYDOT(i , a, 1, x, 1); #endif #ifndef COMPLEX *(y + i * COMPSIZE) += result; #else #if !defined(HEMV) && !defined(HEMVREV) *(y + i * COMPSIZE + 0) += CREAL(result); *(y + i * COMPSIZE + 1) += CIMAG(result); #else *(y + i * COMPSIZE + 0) += CREAL(result) + *(a + i * COMPSIZE) * *(x + i * COMPSIZE + 0); *(y + i * COMPSIZE + 1) += CIMAG(result) + *(a + i * COMPSIZE) * *(x + i * COMPSIZE + 1); #endif #endif MYAXPY(i, 0, 0, *(x + i * COMPSIZE + 0), #ifdef COMPLEX *(x + i * COMPSIZE + 1), #endif a, 1, y, 1, NULL, 0); a += (i + 1) * COMPSIZE; #else #if !defined(HEMV) && !defined(HEMVREV) result = MYDOT(args -> m - i , a + i * COMPSIZE, 1, x + i * COMPSIZE, 1); #else result = MYDOT(args -> m - i - 1, a + (i + 1) * COMPSIZE, 1, x + (i + 1) * COMPSIZE, 1); #endif #ifndef COMPLEX *(y + i * COMPSIZE) += result; #else #if !defined(HEMV) && !defined(HEMVREV) *(y + i * COMPSIZE + 0) += CREAL(result); *(y + i * COMPSIZE + 1) += CIMAG(result); #else *(y + i * COMPSIZE + 0) += CREAL(result) + *(a + i * COMPSIZE) * *(x + i * COMPSIZE + 0); *(y + i * COMPSIZE + 1) += CIMAG(result) + *(a + i * COMPSIZE) * *(x + i * COMPSIZE + 1); #endif #endif MYAXPY(args -> m - i - 1, 0, 0, *(x + i * COMPSIZE + 0), #ifdef COMPLEX *(x + i * COMPSIZE + 1), #endif a + (i + 1) * COMPSIZE, 1, y + (i + 1) * COMPSIZE, 1, NULL, 0); a += (args -> m - i - 1) * COMPSIZE; #endif } return 0; }
static int gbmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *dummy1, FLOAT *buffer, BLASLONG pos){ FLOAT *a, *x, *y; BLASLONG lda, incx; BLASLONG n_from, n_to; BLASLONG i, offset_l, offset_u, uu, ll, ku, kl; #ifdef TRANSA #ifndef COMPLEX FLOAT result; #else OPENBLAS_COMPLEX_FLOAT result; #endif #endif a = (FLOAT *)args -> a; x = (FLOAT *)args -> b; y = (FLOAT *)args -> c; lda = args -> lda; incx = args -> ldb; ku = args -> ldc; kl = args -> ldd; n_from = 0; n_to = args -> n; if (range_m) y += *range_m * COMPSIZE; if (range_n) { n_from = *(range_n + 0); n_to = *(range_n + 1); a += n_from * lda * COMPSIZE; } n_to = MIN(n_to, args -> m + ku); #ifdef TRANSA if (incx != 1) { COPY_K(args -> m, x, incx, buffer, 1); x = buffer; buffer += ((COMPSIZE * args -> m + 1023) & ~1023); } #endif SCAL_K( #ifndef TRANSA args -> m, #else args -> n, #endif 0, 0, ZERO, #ifdef COMPLEX ZERO, #endif y, 1, NULL, 0, NULL, 0); offset_u = ku - n_from; offset_l = ku - n_from + args -> m; #ifndef TRANSA x += n_from * incx * COMPSIZE; y -= offset_u * COMPSIZE; #else x -= offset_u * COMPSIZE; y += n_from * COMPSIZE; #endif for (i = n_from; i < n_to; i++) { uu = MAX(offset_u, 0); ll = MIN(offset_l, ku + kl + 1); #ifndef TRANSA MYAXPY(ll - uu, 0, 0, *(x + 0), #ifdef COMPLEX #ifndef XCONJ *(x + 1), #else -*(x + 1), #endif #endif a + uu * COMPSIZE, 1, y + uu * COMPSIZE, 1, NULL, 0); x += incx * COMPSIZE; #else result = MYDOT(ll - uu, a + uu * COMPSIZE, 1, x + uu * COMPSIZE, 1); #ifndef COMPLEX *y = result; #else *(y + 0) += CREAL(result); #ifndef XCONJ *(y + 1) += CIMAG(result); #else *(y + 1) -= CIMAG(result); #endif #endif x += COMPSIZE; #endif y += COMPSIZE; offset_u --; offset_l --; a += lda * COMPSIZE; } return 0; }
static int trmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *dummy1, FLOAT *buffer, BLASLONG pos){ FLOAT *a, *x, *y; BLASLONG lda, incx; BLASLONG m_from, m_to; BLASLONG i, is, min_i; #ifdef TRANS #ifndef COMPLEX FLOAT result; #else OPENBLAS_COMPLEX_FLOAT result; #endif #endif #if defined(COMPLEX) && !defined(UNIT) FLOAT ar, ai, xr, xi; #endif a = (FLOAT *)args -> a; x = (FLOAT *)args -> b; y = (FLOAT *)args -> c; lda = args -> lda; incx = args -> ldb; m_from = 0; m_to = args -> m; if (range_m) { m_from = *(range_m + 0); m_to = *(range_m + 1); } if (incx != 1) { #ifndef LOWER COPY_K(m_to, x, incx, buffer, 1); #else COPY_K(args -> m - m_from, x + m_from * incx * COMPSIZE, incx, buffer + m_from * COMPSIZE, 1); #endif x = buffer; buffer += ((COMPSIZE * args -> m + 3) & ~3); } #ifndef TRANS if (range_n) y += *range_n * COMPSIZE; #ifndef LOWER SCAL_K(m_to, 0, 0, ZERO, #ifdef COMPLEX ZERO, #endif y, 1, NULL, 0, NULL, 0); #else SCAL_K(args -> m - m_from, 0, 0, ZERO, #ifdef COMPLEX ZERO, #endif y + m_from * COMPSIZE, 1, NULL, 0, NULL, 0); #endif #else SCAL_K(m_to - m_from, 0, 0, ZERO, #ifdef COMPLEX ZERO, #endif y + m_from * COMPSIZE, 1, NULL, 0, NULL, 0); #endif for (is = m_from; is < m_to; is += DTB_ENTRIES){ min_i = MIN(m_to - is, DTB_ENTRIES); #ifndef LOWER if (is > 0){ MYGEMV(is, min_i, 0, ONE, #ifdef COMPLEX ZERO, #endif a + is * lda * COMPSIZE, lda, #ifndef TRANS x + is * COMPSIZE, 1, y, 1, #else x, 1, y + is * COMPSIZE, 1, #endif buffer); } #endif for (i = is; i < is + min_i; i++) { #ifndef LOWER if (i - is > 0) { #ifndef TRANS MYAXPY(i - is, 0, 0, *(x + i * COMPSIZE + 0), #ifdef COMPLEX *(x + i * COMPSIZE + 1), #endif a + (is + i * lda) * COMPSIZE, 1, y + is * COMPSIZE, 1, NULL, 0); #else result = MYDOT(i - is, a + (is + i * lda) * COMPSIZE, 1, x + is * COMPSIZE, 1); #ifndef COMPLEX *(y + i * COMPSIZE + 0) += result; #else *(y + i * COMPSIZE + 0) += CREAL(result); *(y + i * COMPSIZE + 1) += CIMAG(result); #endif #endif } #endif #ifndef COMPLEX #ifdef UNIT *(y + i * COMPSIZE) += *(x + i * COMPSIZE); #else *(y + i * COMPSIZE) += *(a + (i + i * lda) * COMPSIZE) * *(x + i * COMPSIZE); #endif #else #ifdef UNIT *(y + i * COMPSIZE + 0) += *(x + i * COMPSIZE + 0); *(y + i * COMPSIZE + 1) += *(x + i * COMPSIZE + 1); #else ar = *(a + (i + i * lda) * COMPSIZE + 0); ai = *(a + (i + i * lda) * COMPSIZE + 1); xr = *(x + i * COMPSIZE + 0); xi = *(x + i * COMPSIZE + 1); #if (TRANSA == 1) || (TRANSA == 2) *(y + i * COMPSIZE + 0) += ar * xr - ai * xi; *(y + i * COMPSIZE + 1) += ar * xi + ai * xr; #else *(y + i * COMPSIZE + 0) += ar * xr + ai * xi; *(y + i * COMPSIZE + 1) += ar * xi - ai * xr; #endif #endif #endif #ifdef LOWER if (is + min_i > i + 1) { #ifndef TRANS MYAXPY(is + min_i - i - 1, 0, 0, *(x + i * COMPSIZE + 0), #ifdef COMPLEX *(x + i * COMPSIZE + 1), #endif a + (i + 1 + i * lda) * COMPSIZE, 1, y + (i + 1) * COMPSIZE, 1, NULL, 0); #else result = MYDOT(is + min_i - i - 1, a + (i + 1 + i * lda) * COMPSIZE, 1, x + (i + 1) * COMPSIZE, 1); #ifndef COMPLEX *(y + i * COMPSIZE + 0) += result; #else *(y + i * COMPSIZE + 0) += CREAL(result); *(y + i * COMPSIZE + 1) += CIMAG(result); #endif #endif } #endif } #ifdef LOWER if (args -> m > is + min_i){ MYGEMV(args -> m - is - min_i, min_i, 0, ONE, #ifdef COMPLEX ZERO, #endif a + (is + min_i + is * lda) * COMPSIZE, lda, #ifndef TRANS x + is * COMPSIZE, 1, y + (is + min_i) * COMPSIZE, 1, #else x + (is + min_i) * COMPSIZE, 1, y + is * COMPSIZE, 1, #endif buffer); } #endif } return 0; }
blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) { BLASLONG m, n, lda; blasint *ipiv, offset; FLOAT *a; FLOAT temp1, temp2; blasint i, j; blasint ip, jp; blasint info; BLASLONG len; FLOAT *b; m = args -> m; n = args -> n; a = (FLOAT *)args -> a; lda = args -> lda; ipiv = (blasint *)args -> c; offset = 0; if (range_n) { m -= range_n[0]; n = range_n[1] - range_n[0]; offset = range_n[0]; a += range_n[0] * (lda + 1) * COMPSIZE; } info = 0; b = a; for (j = 0; j < n; j++) { len = MIN(j, m); for (i = 0; i < len; i++) { ip = ipiv[i + offset] - 1 - offset; if (ip != i) { temp1 = *(b + i); temp2 = *(b + ip); *(b + i) = temp2; *(b + ip) = temp1; } } for (i = 1; i < len; i++) { b[i] -= DOTU_K(i, a + i, lda, b, 1); } if (j < m) { GEMV_N(m - j, j, 0, dm1, a + j, lda, b, 1, b + j, 1, sb); jp = j + IAMAX_K(m - j, b + j, 1); ipiv[j + offset] = jp + offset; jp--; temp1 = *(b + jp); if (temp1 != ZERO) { temp1 = dp1 / temp1; if (jp != j) { SWAP_K(j + 1, 0, 0, ZERO, a + j, lda, a + jp, lda, NULL, 0); } if (j + 1 < m) { SCAL_K(m - j - 1, 0, 0, temp1, b + j + 1, 1, NULL, 0, NULL, 0); } } else { if (!info) info = j + 1; } } b += lda; } return info; }