inline void gemv (CBLAS_ORDER const Order, CBLAS_TRANSPOSE const TransA, int const M, int const N, traits::complex_d const& alpha, traits::complex_d const* A, int const lda, traits::complex_d const* X, int const incX, traits::complex_d const& beta, traits::complex_d* Y, int const incY) { cblas_zgemv (Order, TransA, M, N, static_cast<void const*> (&alpha), static_cast<void const*> (A), lda, static_cast<void const*> (X), incX, static_cast<void const*> (&beta), static_cast<void*> (Y), incY); }
void CORE_zgemv_quark(Quark *quark) { PLASMA_enum trans; int m, n, lda, incx, incy; PLASMA_Complex64_t alpha, beta; const PLASMA_Complex64_t *A, *x; PLASMA_Complex64_t *y; quark_unpack_args_11( quark, trans, m, n, alpha, A, lda, x, incx, beta, y, incy ); cblas_zgemv( CblasColMajor, (CBLAS_TRANSPOSE)trans, m, n, CBLAS_SADDR(alpha), A, lda, x, incx, CBLAS_SADDR(beta), y, incy); }
inline void gemv(CBLAS_ORDER const Order, CBLAS_TRANSPOSE const TransA, int const M, int const N, double alpha, std::complex<double> const *A, int const lda, std::complex<double> const *X, int const incX, double beta, std::complex<double> *Y, int const incY ) { std::complex<double> alphaArg(alpha,0); std::complex<double> betaArg(beta,0); cblas_zgemv(Order, TransA, M, N, reinterpret_cast<cblas_double_complex_type const *>(&alphaArg), reinterpret_cast<cblas_double_complex_type const *>(A), lda, reinterpret_cast<cblas_double_complex_type const *>(X), incX, reinterpret_cast<cblas_double_complex_type const *>(&betaArg), reinterpret_cast<cblas_double_complex_type *>(Y), incY); }
void phi_gemv(const int N, const Complex *alpha, const Complex *A, const Complex *X, const Complex *beta, Complex *Y){ #ifndef NOBLAS #ifdef SINGLEPRECISION cblas_cgemv(CblasColMajor,CblasNoTrans,N,N,alpha,A,N,X,1,beta,Y,1); #else cblas_zgemv(CblasColMajor,CblasNoTrans,N,N,alpha,A,N,X,1,beta,Y,1); #endif #else int i,n; //multiply beta: for(i = 0; i < N; ++i) Y[i] *= (*beta); for(i = 0; i < N; ++i) { for (n = 0; n < N; ++n) Y[i] += (*alpha)*A[n*N+i]*X[n]; } #endif }
int CORE_zttlqt(int M, int N, int IB, PLASMA_Complex64_t *A1, int LDA1, PLASMA_Complex64_t *A2, int LDA2, PLASMA_Complex64_t *T, int LDT, PLASMA_Complex64_t *TAU, PLASMA_Complex64_t *WORK) { static PLASMA_Complex64_t zone = 1.0; static PLASMA_Complex64_t zzero = 0.0; #ifdef COMPLEX static int ione = 1; #endif PLASMA_Complex64_t alpha; int i, j, l, ii, sb, mi, ni; /* Check input arguments */ if (M < 0) { coreblas_error(1, "Illegal value of M"); return -1; } if (N < 0) { coreblas_error(2, "Illegal value of N"); return -2; } if (IB < 0) { coreblas_error(3, "Illegal value of IB"); return -3; } if ((LDA2 < max(1,M)) && (M > 0)) { coreblas_error(7, "Illegal value of LDA2"); return -7; } /* Quick return */ if ((M == 0) || (N == 0) || (IB == 0)) return PLASMA_SUCCESS; /* TODO: Need to check why some cases require * this to not have uninitialized values */ CORE_zlaset( PlasmaUpperLower, IB, N, 0., 0., T, LDT); for(ii = 0; ii < M; ii += IB) { sb = min(M-ii, IB); for(i = 0; i < sb; i++) { j = ii + i; mi = sb-i-1; ni = min( j + 1, N); /* * Generate elementary reflector H( II*IB+I ) to annihilate A( II*IB+I, II*IB+I:M ). */ #ifdef COMPLEX LAPACKE_zlacgv_work(ni, &A2[j], LDA2); LAPACKE_zlacgv_work(ione, &A1[LDA1*j+j], LDA1); #endif LAPACKE_zlarfg_work(ni+1, &A1[LDA1*j+j], &A2[j], LDA2, &TAU[j]); if (mi > 0) { /* * Apply H( j-1 ) to A( j:II+IB-1, j-1:M ) from the right. */ cblas_zcopy( mi, &A1[LDA1*j+(j+1)], 1, WORK, 1); cblas_zgemv( CblasColMajor, (CBLAS_TRANSPOSE)PlasmaNoTrans, mi, ni, CBLAS_SADDR(zone), &A2[j+1], LDA2, &A2[j], LDA2, CBLAS_SADDR(zone), WORK, 1); alpha = -(TAU[j]); cblas_zaxpy( mi, CBLAS_SADDR(alpha), WORK, 1, &A1[LDA1*j+j+1], 1); cblas_zgerc( CblasColMajor, mi, ni, CBLAS_SADDR(alpha), WORK, 1, &A2[j], LDA2, &A2[j+1], LDA2); } /* * Calculate T. */ if (i > 0 ) { l = min(i, max(0, N-ii)); alpha = -(TAU[j]); CORE_zpemv( PlasmaNoTrans, PlasmaRowwise, i , min(j, N), l, alpha, &A2[ii], LDA2, &A2[j], LDA2, zzero, &T[LDT*j], 1, WORK); /* T(0:i-1, j) = T(0:i-1, ii:j-1) * T(0:i-1, j) */ cblas_ztrmv( CblasColMajor, (CBLAS_UPLO)PlasmaUpper, (CBLAS_TRANSPOSE)PlasmaNoTrans, (CBLAS_DIAG)PlasmaNonUnit, i, &T[LDT*ii], LDT, &T[LDT*j], 1); } #ifdef COMPLEX LAPACKE_zlacgv_work(ni, &A2[j], LDA2 ); LAPACKE_zlacgv_work(ione, &A1[LDA1*j+j], LDA1 ); #endif T[LDT*j+i] = TAU[j]; } /* Apply Q to the rest of the matrix to the right */ if (M > ii+sb) { mi = M-(ii+sb); ni = min(ii+sb, N); l = min(sb, max(0, ni-ii)); CORE_zparfb( PlasmaRight, PlasmaNoTrans, PlasmaForward, PlasmaRowwise, mi, IB, mi, ni, sb, l, &A1[LDA1*ii+ii+sb], LDA1, &A2[ii+sb], LDA2, &A2[ii], LDA2, &T[LDT*ii], LDT, WORK, M); } } return PLASMA_SUCCESS; }
int CORE_zttqrt(int M, int N, int IB, PLASMA_Complex64_t *A1, int LDA1, PLASMA_Complex64_t *A2, int LDA2, PLASMA_Complex64_t *T, int LDT, PLASMA_Complex64_t *TAU, PLASMA_Complex64_t *WORK) { static PLASMA_Complex64_t zone = 1.0; static PLASMA_Complex64_t zzero = 0.0; static int ione = 1; PLASMA_Complex64_t alpha; int i, j, ii, sb, mi, ni; /* Check input arguments */ if (M < 0) { coreblas_error(1, "Illegal value of M"); return -1; } if (N < 0) { coreblas_error(2, "Illegal value of N"); return -2; } if (IB < 0) { coreblas_error(3, "Illegal value of IB"); return -3; } if ((LDA2 < max(1,M)) && (M > 0)) { coreblas_error(7, "Illegal value of LDA2"); return -7; } /* Quick return */ if ((M == 0) || (N == 0) || (IB == 0)) return PLASMA_SUCCESS; for(ii = 0; ii < N; ii += IB) { sb = min(N-ii, IB); for(i = 0; i < sb; i++) { /* * Generate elementary reflector H( II*IB+I ) to annihilate * A( II*IB+I:mi, II*IB+I ). */ mi = ii + i + 1; LAPACKE_zlarfg_work(mi+1, &A1[LDA1*(ii+i)+ii+i], &A2[LDA2*(ii+i)], ione, &TAU[ii+i]); if (sb-i-1>0) { /* * Apply H( II*IB+I ) to A( II*IB+I:M, II*IB+I+1:II*IB+IB ) from the left. */ ni = sb-i-1; cblas_zcopy( ni, &A1[LDA1*(ii+i+1)+(ii+i)], LDA1, WORK, 1); #ifdef COMPLEX LAPACKE_zlacgv_work(ni, WORK, ione); #endif cblas_zgemv( CblasColMajor, (CBLAS_TRANSPOSE)PlasmaConjTrans, mi, ni, CBLAS_SADDR(zone), &A2[LDA2*(ii+i+1)], LDA2, &A2[LDA2*(ii+i)], 1, CBLAS_SADDR(zone), WORK, 1); #ifdef COMPLEX LAPACKE_zlacgv_work(ni, WORK, ione); #endif alpha = -conj(TAU[ii+i]); cblas_zaxpy( ni, CBLAS_SADDR(alpha), WORK, 1, &A1[LDA1*(ii+i+1)+ii+i], LDA1); #ifdef COMPLEX LAPACKE_zlacgv_work(ni, WORK, ione); #endif cblas_zgerc( CblasColMajor, mi, ni, CBLAS_SADDR(alpha), &A2[LDA2*(ii+i)], 1, WORK, 1, &A2[LDA2*(ii+i+1)], LDA2); } /* * Calculate T. */ if (i > 0 ) { cblas_zcopy(i, &A2[LDA2*(ii+i)+ii], 1, &WORK[ii], 1); cblas_ztrmv( CblasColMajor, (CBLAS_UPLO)PlasmaUpper, (CBLAS_TRANSPOSE)PlasmaConjTrans, (CBLAS_DIAG)PlasmaNonUnit, i, &A2[LDA2*ii+ii], LDA2, &WORK[ii], 1); alpha = -(TAU[ii+i]); for(j = 0; j < i; j++) { WORK[ii+j] = alpha * WORK[ii+j]; } if (ii > 0) { cblas_zgemv( CblasColMajor, (CBLAS_TRANSPOSE)PlasmaConjTrans, ii, i, CBLAS_SADDR(alpha), &A2[LDA2*ii], LDA2, &A2[LDA2*(ii+i)], 1, CBLAS_SADDR(zzero), WORK, 1); cblas_zaxpy(i, CBLAS_SADDR(zone), &WORK[ii], 1, WORK, 1); } cblas_zcopy(i, WORK, 1, &T[LDT*(ii+i)], 1); cblas_ztrmv( CblasColMajor, (CBLAS_UPLO)PlasmaUpper, (CBLAS_TRANSPOSE)PlasmaNoTrans, (CBLAS_DIAG)PlasmaNonUnit, i, &T[LDT*ii], LDT, &T[LDT*(ii+i)], 1); } T[LDT*(ii+i)+i] = TAU[ii+i]; } /* Apply Q' to the rest of the matrix to the left */ if (N > ii+sb) { CORE_zttrfb( PlasmaLeft, PlasmaConjTrans, PlasmaForward, PlasmaColumnwise, sb, N-(ii+sb), ii+sb, N-(ii+sb), sb, &A1[LDA1*(ii+sb)+ii], LDA1, &A2[LDA2*(ii+sb)], LDA2, &A2[LDA2*ii], LDA2, &T[LDT*ii], LDT, WORK, sb); } } return PLASMA_SUCCESS; }
void wrapper_cblas_zgemv(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const int M, const int N, const void *alpha, const void *A, const int lda, const void *X, const int incX, const void *beta, void *Y, const int incY) { cblas_zgemv(Order, TransA, M, N, alpha, A, lda, X, incX, beta, Y, incY); }
int CORE_ztsqrt(int M, int N, int IB, PLASMA_Complex64_t *A1, int LDA1, PLASMA_Complex64_t *A2, int LDA2, PLASMA_Complex64_t *T, int LDT, PLASMA_Complex64_t *TAU, PLASMA_Complex64_t *WORK) { static PLASMA_Complex64_t zone = 1.0; static PLASMA_Complex64_t zzero = 0.0; PLASMA_Complex64_t alpha; int i, ii, sb; /* Check input arguments */ if (M < 0) { coreblas_error(1, "Illegal value of M"); return -1; } if (N < 0) { coreblas_error(2, "Illegal value of N"); return -2; } if (IB < 0) { coreblas_error(3, "Illegal value of IB"); return -3; } if ((LDA2 < max(1,M)) && (M > 0)) { coreblas_error(8, "Illegal value of LDA2"); return -8; } /* Quick return */ if ((M == 0) || (N == 0) || (IB == 0)) return PLASMA_SUCCESS; for(ii = 0; ii < N; ii += IB) { sb = min(N-ii, IB); for(i = 0; i < sb; i++) { /* * Generate elementary reflector H( II*IB+I ) to annihilate * A( II*IB+I:M, II*IB+I ) */ LAPACKE_zlarfg_work(M+1, &A1[LDA1*(ii+i)+ii+i], &A2[LDA2*(ii+i)], 1, &TAU[ii+i]); if (ii+i+1 < N) { /* * Apply H( II*IB+I ) to A( II*IB+I:M, II*IB+I+1:II*IB+IB ) from the left */ alpha = -conj(TAU[ii+i]); cblas_zcopy( sb-i-1, &A1[LDA1*(ii+i+1)+(ii+i)], LDA1, WORK, 1); #ifdef COMPLEX LAPACKE_zlacgv_work(sb-i-1, WORK, 1); #endif cblas_zgemv( CblasColMajor, (CBLAS_TRANSPOSE)PlasmaConjTrans, M, sb-i-1, CBLAS_SADDR(zone), &A2[LDA2*(ii+i+1)], LDA2, &A2[LDA2*(ii+i)], 1, CBLAS_SADDR(zone), WORK, 1); #ifdef COMPLEX LAPACKE_zlacgv_work(sb-i-1, WORK, 1 ); #endif cblas_zaxpy( sb-i-1, CBLAS_SADDR(alpha), WORK, 1, &A1[LDA1*(ii+i+1)+ii+i], LDA1); #ifdef COMPLEX LAPACKE_zlacgv_work(sb-i-1, WORK, 1 ); #endif cblas_zgerc( CblasColMajor, M, sb-i-1, CBLAS_SADDR(alpha), &A2[LDA2*(ii+i)], 1, WORK, 1, &A2[LDA2*(ii+i+1)], LDA2); } /* * Calculate T */ alpha = -TAU[ii+i]; cblas_zgemv( CblasColMajor, (CBLAS_TRANSPOSE)PlasmaConjTrans, M, i, CBLAS_SADDR(alpha), &A2[LDA2*ii], LDA2, &A2[LDA2*(ii+i)], 1, CBLAS_SADDR(zzero), &T[LDT*(ii+i)], 1); cblas_ztrmv( CblasColMajor, (CBLAS_UPLO)PlasmaUpper, (CBLAS_TRANSPOSE)PlasmaNoTrans, (CBLAS_DIAG)PlasmaNonUnit, i, &T[LDT*ii], LDT, &T[LDT*(ii+i)], 1); T[LDT*(ii+i)+i] = TAU[ii+i]; } if (N > ii+sb) { CORE_ztsmqr( PlasmaLeft, PlasmaConjTrans, sb, N-(ii+sb), M, N-(ii+sb), IB, IB, &A1[LDA1*(ii+sb)+ii], LDA1, &A2[LDA2*(ii+sb)], LDA2, &A2[LDA2*ii], LDA2, &T[LDT*ii], LDT, WORK, sb); } } return PLASMA_SUCCESS; }
void test_gemv (void) { const double flteps = 1e-4, dbleps = 1e-6; { int order = 101; int trans = 111; int M = 1; int N = 1; int lda = 1; float alpha = 1.0f; float beta = -0.3f; float A[] = { -0.805f }; float X[] = { -0.965f }; int incX = -1; float Y[] = { 0.537f }; int incY = -1; float y_expected[] = { 0.615725f }; cblas_sgemv(order, trans, M, N, alpha, A, lda, X, incX, beta, Y, incY); { int i; for (i = 0; i < 1; i++) { gsl_test_rel(Y[i], y_expected[i], flteps, "sgemv(case 774)"); } }; }; { int order = 102; int trans = 111; int M = 1; int N = 1; int lda = 1; float alpha = 1.0f; float beta = -0.3f; float A[] = { -0.805f }; float X[] = { -0.965f }; int incX = -1; float Y[] = { 0.537f }; int incY = -1; float y_expected[] = { 0.615725f }; cblas_sgemv(order, trans, M, N, alpha, A, lda, X, incX, beta, Y, incY); { int i; for (i = 0; i < 1; i++) { gsl_test_rel(Y[i], y_expected[i], flteps, "sgemv(case 775)"); } }; }; { int order = 101; int trans = 112; int M = 1; int N = 1; int lda = 1; float alpha = 1.0f; float beta = 0.0f; float A[] = { -0.805f }; float X[] = { -0.965f }; int incX = -1; float Y[] = { 0.537f }; int incY = -1; float y_expected[] = { 0.776825f }; cblas_sgemv(order, trans, M, N, alpha, A, lda, X, incX, beta, Y, incY); { int i; for (i = 0; i < 1; i++) { gsl_test_rel(Y[i], y_expected[i], flteps, "sgemv(case 776)"); } }; }; { int order = 102; int trans = 112; int M = 1; int N = 1; int lda = 1; float alpha = 1.0f; float beta = 0.0f; float A[] = { -0.805f }; float X[] = { -0.965f }; int incX = -1; float Y[] = { 0.537f }; int incY = -1; float y_expected[] = { 0.776825f }; cblas_sgemv(order, trans, M, N, alpha, A, lda, X, incX, beta, Y, incY); { int i; for (i = 0; i < 1; i++) { gsl_test_rel(Y[i], y_expected[i], flteps, "sgemv(case 777)"); } }; }; { int order = 101; int trans = 111; int M = 1; int N = 1; int lda = 1; double alpha = -0.3; double beta = -1; double A[] = { -0.047 }; double X[] = { 0.672 }; int incX = -1; double Y[] = { 0.554 }; int incY = -1; double y_expected[] = { -0.5445248 }; cblas_dgemv(order, trans, M, N, alpha, A, lda, X, incX, beta, Y, incY); { int i; for (i = 0; i < 1; i++) { gsl_test_rel(Y[i], y_expected[i], dbleps, "dgemv(case 778)"); } }; }; { int order = 102; int trans = 111; int M = 1; int N = 1; int lda = 1; double alpha = -0.3; double beta = -1; double A[] = { -0.047 }; double X[] = { 0.672 }; int incX = -1; double Y[] = { 0.554 }; int incY = -1; double y_expected[] = { -0.5445248 }; cblas_dgemv(order, trans, M, N, alpha, A, lda, X, incX, beta, Y, incY); { int i; for (i = 0; i < 1; i++) { gsl_test_rel(Y[i], y_expected[i], dbleps, "dgemv(case 779)"); } }; }; { int order = 101; int trans = 112; int M = 1; int N = 1; int lda = 1; double alpha = -1; double beta = 1; double A[] = { -0.047 }; double X[] = { 0.672 }; int incX = -1; double Y[] = { 0.554 }; int incY = -1; double y_expected[] = { 0.585584 }; cblas_dgemv(order, trans, M, N, alpha, A, lda, X, incX, beta, Y, incY); { int i; for (i = 0; i < 1; i++) { gsl_test_rel(Y[i], y_expected[i], dbleps, "dgemv(case 780)"); } }; }; { int order = 102; int trans = 112; int M = 1; int N = 1; int lda = 1; double alpha = -1; double beta = 1; double A[] = { -0.047 }; double X[] = { 0.672 }; int incX = -1; double Y[] = { 0.554 }; int incY = -1; double y_expected[] = { 0.585584 }; cblas_dgemv(order, trans, M, N, alpha, A, lda, X, incX, beta, Y, incY); { int i; for (i = 0; i < 1; i++) { gsl_test_rel(Y[i], y_expected[i], dbleps, "dgemv(case 781)"); } }; }; { int order = 101; int trans = 111; int M = 1; int N = 1; int lda = 1; float alpha[2] = {0.0f, 0.1f}; float beta[2] = {0.0f, 1.0f}; float A[] = { 0.629f, 0.801f }; float X[] = { 0.778f, -0.073f }; int incX = -1; float Y[] = { -0.976f, -0.682f }; int incY = -1; float y_expected[] = { 0.624274f, -0.921216f }; cblas_cgemv(order, trans, M, N, alpha, A, lda, X, incX, beta, Y, incY); { int i; for (i = 0; i < 1; i++) { gsl_test_rel(Y[2*i], y_expected[2*i], flteps, "cgemv(case 782) real"); gsl_test_rel(Y[2*i+1], y_expected[2*i+1], flteps, "cgemv(case 782) imag"); }; }; }; { int order = 102; int trans = 111; int M = 1; int N = 1; int lda = 1; float alpha[2] = {0.0f, 0.1f}; float beta[2] = {0.0f, 1.0f}; float A[] = { 0.629f, 0.801f }; float X[] = { 0.778f, -0.073f }; int incX = -1; float Y[] = { -0.976f, -0.682f }; int incY = -1; float y_expected[] = { 0.624274f, -0.921216f }; cblas_cgemv(order, trans, M, N, alpha, A, lda, X, incX, beta, Y, incY); { int i; for (i = 0; i < 1; i++) { gsl_test_rel(Y[2*i], y_expected[2*i], flteps, "cgemv(case 783) real"); gsl_test_rel(Y[2*i+1], y_expected[2*i+1], flteps, "cgemv(case 783) imag"); }; }; }; { int order = 101; int trans = 112; int M = 1; int N = 1; int lda = 1; float alpha[2] = {0.0f, 1.0f}; float beta[2] = {-0.3f, 0.1f}; float A[] = { 0.629f, 0.801f }; float X[] = { 0.778f, -0.073f }; int incX = -1; float Y[] = { -0.976f, -0.682f }; int incY = -1; float y_expected[] = { -0.216261f, 0.654835f }; cblas_cgemv(order, trans, M, N, alpha, A, lda, X, incX, beta, Y, incY); { int i; for (i = 0; i < 1; i++) { gsl_test_rel(Y[2*i], y_expected[2*i], flteps, "cgemv(case 784) real"); gsl_test_rel(Y[2*i+1], y_expected[2*i+1], flteps, "cgemv(case 784) imag"); }; }; }; { int order = 102; int trans = 112; int M = 1; int N = 1; int lda = 1; float alpha[2] = {0.0f, 1.0f}; float beta[2] = {-0.3f, 0.1f}; float A[] = { 0.629f, 0.801f }; float X[] = { 0.778f, -0.073f }; int incX = -1; float Y[] = { -0.976f, -0.682f }; int incY = -1; float y_expected[] = { -0.216261f, 0.654835f }; cblas_cgemv(order, trans, M, N, alpha, A, lda, X, incX, beta, Y, incY); { int i; for (i = 0; i < 1; i++) { gsl_test_rel(Y[2*i], y_expected[2*i], flteps, "cgemv(case 785) real"); gsl_test_rel(Y[2*i+1], y_expected[2*i+1], flteps, "cgemv(case 785) imag"); }; }; }; { int order = 101; int trans = 113; int M = 1; int N = 1; int lda = 1; float alpha[2] = {0.0f, 0.1f}; float beta[2] = {-0.3f, 0.1f}; float A[] = { 0.629f, 0.801f }; float X[] = { 0.778f, -0.073f }; int incX = -1; float Y[] = { -0.976f, -0.682f }; int incY = -1; float y_expected[] = { 0.427909f, 0.150089f }; cblas_cgemv(order, trans, M, N, alpha, A, lda, X, incX, beta, Y, incY); { int i; for (i = 0; i < 1; i++) { gsl_test_rel(Y[2*i], y_expected[2*i], flteps, "cgemv(case 786) real"); gsl_test_rel(Y[2*i+1], y_expected[2*i+1], flteps, "cgemv(case 786) imag"); }; }; }; { int order = 102; int trans = 113; int M = 1; int N = 1; int lda = 1; float alpha[2] = {0.0f, 0.1f}; float beta[2] = {-0.3f, 0.1f}; float A[] = { 0.629f, 0.801f }; float X[] = { 0.778f, -0.073f }; int incX = -1; float Y[] = { -0.976f, -0.682f }; int incY = -1; float y_expected[] = { 0.427909f, 0.150089f }; cblas_cgemv(order, trans, M, N, alpha, A, lda, X, incX, beta, Y, incY); { int i; for (i = 0; i < 1; i++) { gsl_test_rel(Y[2*i], y_expected[2*i], flteps, "cgemv(case 787) real"); gsl_test_rel(Y[2*i+1], y_expected[2*i+1], flteps, "cgemv(case 787) imag"); }; }; }; { int order = 101; int trans = 111; int M = 1; int N = 1; int lda = 1; double alpha[2] = {0, 0.1}; double beta[2] = {1, 0}; double A[] = { 0.932, -0.724 }; double X[] = { 0.334, -0.317 }; int incX = -1; double Y[] = { 0.348, 0.07 }; int incY = -1; double y_expected[] = { 0.401726, 0.078178 }; cblas_zgemv(order, trans, M, N, alpha, A, lda, X, incX, beta, Y, incY); { int i; for (i = 0; i < 1; i++) { gsl_test_rel(Y[2*i], y_expected[2*i], dbleps, "zgemv(case 788) real"); gsl_test_rel(Y[2*i+1], y_expected[2*i+1], dbleps, "zgemv(case 788) imag"); }; }; }; { int order = 102; int trans = 111; int M = 1; int N = 1; int lda = 1; double alpha[2] = {0, 0.1}; double beta[2] = {1, 0}; double A[] = { 0.932, -0.724 }; double X[] = { 0.334, -0.317 }; int incX = -1; double Y[] = { 0.348, 0.07 }; int incY = -1; double y_expected[] = { 0.401726, 0.078178 }; cblas_zgemv(order, trans, M, N, alpha, A, lda, X, incX, beta, Y, incY); { int i; for (i = 0; i < 1; i++) { gsl_test_rel(Y[2*i], y_expected[2*i], dbleps, "zgemv(case 789) real"); gsl_test_rel(Y[2*i+1], y_expected[2*i+1], dbleps, "zgemv(case 789) imag"); }; }; }; { int order = 101; int trans = 112; int M = 1; int N = 1; int lda = 1; double alpha[2] = {-0.3, 0.1}; double beta[2] = {0, 1}; double A[] = { 0.932, -0.724 }; double X[] = { 0.334, -0.317 }; int incX = -1; double Y[] = { 0.348, 0.07 }; int incY = -1; double y_expected[] = { -0.040808, 0.517356 }; cblas_zgemv(order, trans, M, N, alpha, A, lda, X, incX, beta, Y, incY); { int i; for (i = 0; i < 1; i++) { gsl_test_rel(Y[2*i], y_expected[2*i], dbleps, "zgemv(case 790) real"); gsl_test_rel(Y[2*i+1], y_expected[2*i+1], dbleps, "zgemv(case 790) imag"); }; }; }; { int order = 102; int trans = 112; int M = 1; int N = 1; int lda = 1; double alpha[2] = {-0.3, 0.1}; double beta[2] = {0, 1}; double A[] = { 0.932, -0.724 }; double X[] = { 0.334, -0.317 }; int incX = -1; double Y[] = { 0.348, 0.07 }; int incY = -1; double y_expected[] = { -0.040808, 0.517356 }; cblas_zgemv(order, trans, M, N, alpha, A, lda, X, incX, beta, Y, incY); { int i; for (i = 0; i < 1; i++) { gsl_test_rel(Y[2*i], y_expected[2*i], dbleps, "zgemv(case 791) real"); gsl_test_rel(Y[2*i+1], y_expected[2*i+1], dbleps, "zgemv(case 791) imag"); }; }; }; { int order = 101; int trans = 113; int M = 1; int N = 1; int lda = 1; double alpha[2] = {1, 0}; double beta[2] = {0, 0}; double A[] = { 0.932, -0.724 }; double X[] = { 0.334, -0.317 }; int incX = -1; double Y[] = { 0.348, 0.07 }; int incY = -1; double y_expected[] = { 0.540796, -0.053628 }; cblas_zgemv(order, trans, M, N, alpha, A, lda, X, incX, beta, Y, incY); { int i; for (i = 0; i < 1; i++) { gsl_test_rel(Y[2*i], y_expected[2*i], dbleps, "zgemv(case 792) real"); gsl_test_rel(Y[2*i+1], y_expected[2*i+1], dbleps, "zgemv(case 792) imag"); }; }; }; { int order = 102; int trans = 113; int M = 1; int N = 1; int lda = 1; double alpha[2] = {1, 0}; double beta[2] = {0, 0}; double A[] = { 0.932, -0.724 }; double X[] = { 0.334, -0.317 }; int incX = -1; double Y[] = { 0.348, 0.07 }; int incY = -1; double y_expected[] = { 0.540796, -0.053628 }; cblas_zgemv(order, trans, M, N, alpha, A, lda, X, incX, beta, Y, incY); { int i; for (i = 0; i < 1; i++) { gsl_test_rel(Y[2*i], y_expected[2*i], dbleps, "zgemv(case 793) real"); gsl_test_rel(Y[2*i+1], y_expected[2*i+1], dbleps, "zgemv(case 793) imag"); }; }; }; }
// This dMList function creates a list of the dM values giving the probabilities of mutations static PyObject *dMList(PyObject *self, PyObject *args) { // Calling variables are (in order): uts, only_need, n_aa, length, grs, dmlist, residue_to_compute, iwt, brs PyObject *uts, *only_need, *grs, *r_grs_tuple, *gr_diag, *p, *p_inv, *dmlist, *brs, *brz; long n_aa, length, n_aa2, n_uts, residue_to_compute, i_ut, x, y, index, only_need_index, only_need_i, only_need_i_n_aa, index2, iwt, n_aa3, z; double *arr_dmlist, *cp_inv, *cp, *cgr_diag, *cexpd, *cbrz, *cvrz, *cvrz_p_inv; complex double *complex_cp_inv, *complex_cp, *complex_cgr_diag, *complex_cexpd, *complex_naa2_list, *complex_naa_list, *complex_cvrz, *complex_cvrz_p_inv, *complex_cbrz; double ut, exp_utdx, exp_utdy, dx, dy; complex double complex_exp_utdx, complex_exp_utdy, complex_dx, complex_dy; int array_type; #ifdef USE_ACCELERATE_CBLAS complex double complex_one = 1, complex_zero = 0; #else complex double complex_dmxy, complex_v_p_inv_xy; double dmxy, v_p_inv_xy; long yindex, irowcolumn; #endif // Parse the arguments. if (! PyArg_ParseTuple( args, "O!O!llO!O!llO!", &PyList_Type, &uts, &PyList_Type, &only_need, &n_aa, &length, &PyList_Type, &grs, &PyArray_Type, &dmlist, &residue_to_compute, &iwt, &PyList_Type, &brs)) { PyErr_SetString(PyExc_TypeError, "Invalid calling arguments to dMList."); return NULL; } // Error checking on arguments if (length < 1) { // length of the protein PyErr_SetString(PyExc_ValueError, "length is less than one."); return NULL; } if (n_aa < 1) { // number of amino acids. Normally will be 20. PyErr_SetString(PyExc_ValueError, "n_aa is less than one."); return NULL; } if (PyList_GET_SIZE(grs) != length) { // make sure grs is of the same size as length PyErr_SetString(PyExc_ValueError, "grs is not of the same size as length."); return NULL; } n_uts = PyList_GET_SIZE(uts); // number of entries in uts if (n_uts < 1) { // make sure there are entries in uts PyErr_SetString(PyExc_ValueError, "uts has no entries."); return NULL; } if (PyList_GET_SIZE(only_need) != n_uts * length) { // make sure only_need is of correct size PyErr_SetString(PyExc_ValueError, "only_need is of wrong size."); return NULL; } if (! ((residue_to_compute >= 0) && (residue_to_compute < length))) { PyErr_SetString(PyExc_ValueError, "Invalid value for residue_to_compute."); return NULL; } if (! ((iwt >= 0) && (iwt < n_aa))) { PyErr_SetString(PyExc_ValueError, "Invalid value for iwt."); return NULL; } if (PyList_GET_SIZE(brs) != n_aa) { // make sure brs has one entry for each amino acid PyErr_SetString(PyExc_ValueError, "brs is not of length equal to n_aa"); return NULL; } n_aa2 = n_aa * n_aa; // square of the number of amino acids n_aa3 = n_aa2 * n_aa; // cube of the number of amino acids // The results will be returned in a numpy ndarray 'float_' (C type double) array called dmlist. // This array will be of size length * n_uts * n_aa3. arr_dmlist = (double *) PyArray_DATA(dmlist); // this is the data array of dmlist long const sizeof_cexpd = n_aa2 * sizeof(double); long const complex_sizeof_cexpd = n_aa2 * sizeof(complex double); // gr_diag, p, and p_inv are the eigenvalues, left, and right diagonalizing matrices of gr r_grs_tuple = PyList_GET_ITEM(grs, residue_to_compute); gr_diag = PyTuple_GET_ITEM(r_grs_tuple, 0); p = PyTuple_GET_ITEM(r_grs_tuple, 1); p_inv = PyTuple_GET_ITEM(r_grs_tuple, 2); // Now begin filling arr_dmlist with the appropriate values index = 0; only_need_index = residue_to_compute * n_uts; // determine if these arrays are complex double or real doubles array_type = PyArray_TYPE(gr_diag); if (array_type == NPY_DOUBLE) { // array is of doubles, real not complex // Note that these next assignments assume that the arrays are C-style contiguous cp = PyArray_DATA(p); cp_inv = PyArray_DATA(p_inv); cgr_diag = PyArray_DATA(gr_diag); cexpd = (double *) malloc(sizeof_cexpd); // allocate memory cvrz = (double *) malloc(sizeof_cexpd); // allocate memory cvrz_p_inv = (double *) malloc(sizeof_cexpd); // allocate memory for (i_ut = 0; i_ut < n_uts; i_ut++) { // loop over ut values only_need_i = PyInt_AS_LONG(PyList_GET_ITEM(only_need, only_need_index)); // value of only_need only_need_index++; if (only_need_i == -1) { // we don't need to do anything for these entries in dmlist index += n_aa3; } else { // we need to compute at least some entries in dmlist ut = PyFloat_AS_DOUBLE(PyList_GET_ITEM(uts, i_ut)); // ut value // Entries of cexpd are defined by D_xy = (exp(ut d_x) - exp(ut d_y)) / (d_x - d_y) // for x != y, and D_yy = ut exp(d_x ut) index2 = 0; for (x = 0; x < n_aa; x++) { dx = cgr_diag[x]; exp_utdx = exp(ut * dx); for (y = 0; y < n_aa; y++) { if (y == x) { cexpd[index2] = ut * exp_utdx; } else { dy = cgr_diag[y]; exp_utdy = exp(ut * dy); cexpd[index2] = (exp_utdx - exp_utdy) / (dx - dy); } index2++; } } for (z = 0; z < n_aa; z++) { // compute derivative with respect to z if (z == iwt) { // don't compute values with respect to wildtype index += n_aa2; continue; } brz = PyList_GET_ITEM(brs, z); cbrz = PyArray_DATA(brz); // cvrz is the element-by-element product of cbrz and cexpd for (index2 = 0; index2 < n_aa2; index2++) { cvrz[index2] = cbrz[index2] * cexpd[index2]; } #ifdef USE_ACCELERATE_CBLAS // multiply the matrices using cblas_dgemm cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, n_aa, n_aa, n_aa, (double) 1.0, cvrz, n_aa, cp_inv, n_aa, (double) 0.0, cvrz_p_inv, n_aa); // multiply cvrz and cp_inv into cvrz_p_inv #else // multiply the matrices in pure C code // multiply cvrz and cp_inv into cvrz_p_inv index2 = 0; for (x = 0; x < n_aa2; x += n_aa) { for (y = 0; y < n_aa; y++) { v_p_inv_xy = 0.0; yindex = y; for (irowcolumn = x; irowcolumn < x + n_aa; irowcolumn++) { v_p_inv_xy += cvrz[irowcolumn] * cp_inv[yindex]; yindex += n_aa; } cvrz_p_inv[index2++] = v_p_inv_xy; } } #endif if (only_need_i == -2) { // we need to compute all of these entries in dmlist #ifdef USE_ACCELERATE_CBLAS cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, n_aa, n_aa, n_aa, (double) 1.0, cp, n_aa, cvrz_p_inv, n_aa, (double) 0.0, &arr_dmlist[index], n_aa); // multiply cp and cvrz_p_inv into arr_dmlist[index : index + n_aa2] index += n_aa2; #else // multiply the matrices in pure C code, and fill dmlist with the results // multiply cp and cvrz_p_inv into arr_dmlist[index : index + n_aa2] for (x = 0; x < n_aa2; x += n_aa) { for (y = 0; y < n_aa; y++) { dmxy = 0.0; yindex = y; for (irowcolumn = x; irowcolumn < x + n_aa; irowcolumn++) { dmxy += cp[irowcolumn] * cvrz_p_inv[yindex]; yindex += n_aa; } arr_dmlist[index++] = dmxy; } } #endif } else { // we need to compute entries in dmlist only for x = only_need_i only_need_i_n_aa = only_need_i * n_aa; index += only_need_i_n_aa; #ifdef USE_ACCELERATE_CBLAS // do the matrix vector multiplication using cblas, and put results in dmlist cblas_dgemv(CblasRowMajor, CblasTrans, n_aa, n_aa, (double) 1.0, cvrz_p_inv, n_aa, &cp[only_need_i_n_aa], 1, (double) 0.0, &arr_dmlist[index], 1); index += n_aa2 - only_need_i_n_aa; #else // do the matrix vector multiplication in pure C code, and put results in mlist for (y = 0; y < n_aa; y++) { dmxy = 0.0; yindex = y; for (irowcolumn = only_need_i_n_aa; irowcolumn < only_need_i_n_aa + n_aa; irowcolumn++) { dmxy += cp[irowcolumn] * cvrz_p_inv[yindex]; yindex += n_aa; } arr_dmlist[index++] = dmxy; } index += n_aa2 - only_need_i_n_aa - n_aa; #endif } } } } free(cexpd); free(cvrz); free(cvrz_p_inv); } else if (array_type == NPY_CDOUBLE) { // array is of complex doubles // Note that these next assignments assume that the arrays are C-style contiguous complex_cp = PyArray_DATA(p); complex_cp_inv = PyArray_DATA(p_inv); complex_cgr_diag = PyArray_DATA(gr_diag); complex_cexpd = (complex double *) malloc(complex_sizeof_cexpd); // allocate memory complex_cvrz = (complex double *) malloc(complex_sizeof_cexpd); // allocate memory complex_cvrz_p_inv = (complex double *) malloc(complex_sizeof_cexpd); // allocate memory complex_naa2_list = (complex double *) malloc(n_aa2 * sizeof(complex double)); // allocate memory complex_naa_list = (complex double *) malloc(n_aa * sizeof(complex double)); // allocate memory for (i_ut = 0; i_ut < n_uts; i_ut++) { // loop over ut values only_need_i = PyInt_AS_LONG(PyList_GET_ITEM(only_need, only_need_index)); // value of only_need only_need_index++; if (only_need_i == -1) { // we don't need to do anything for these entries in dmlist index += n_aa3; } else { // we need to compute at least some entries in dmlist ut = PyFloat_AS_DOUBLE(PyList_GET_ITEM(uts, i_ut)); // ut value // Entries of cexpd are defined by D_xy = (exp(ut d_x) - exp(ut d_y)) / (d_x - d_y) // for x != y, and D_yy = ut exp(d_x ut) index2 = 0; for (x = 0; x < n_aa; x++) { complex_dx = complex_cgr_diag[x]; complex_exp_utdx = cexp(ut * complex_dx); for (y = 0; y < n_aa; y++) { if (y == x) { complex_cexpd[index2] = ut * complex_exp_utdx; } else { complex_dy = complex_cgr_diag[y]; complex_exp_utdy = cexp(ut * complex_dy); complex_cexpd[index2] = (complex_exp_utdx - complex_exp_utdy) / (complex_dx - complex_dy); } index2++; } } for (z = 0; z < n_aa; z++) { // compute derivative with respect to z if (z == iwt) { // don't compute values with respect to wildtype index += n_aa2; continue; } brz = PyList_GET_ITEM(brs, z); complex_cbrz = PyArray_DATA(brz); // cvrz is the element-by-element product of cbrz and cexpd for (index2 = 0; index2 < n_aa2; index2++) { complex_cvrz[index2] = complex_cbrz[index2] * complex_cexpd[index2]; } #ifdef USE_ACCELERATE_CBLAS // multiply the matrices using cblas_zgemm cblas_zgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, n_aa, n_aa, n_aa, &complex_one, complex_cvrz, n_aa, complex_cp_inv, n_aa, &complex_zero, complex_cvrz_p_inv, n_aa); // multiply cvrz and cp_inv into cvrz_p_inv #else // multiply the matrices in pure C code // multiply cvrz and cp_inv into cvrz_p_inv index2 = 0; for (x = 0; x < n_aa2; x += n_aa) { for (y = 0; y < n_aa; y++) { complex_v_p_inv_xy = 0.0; yindex = y; for (irowcolumn = x; irowcolumn < x + n_aa; irowcolumn++) { complex_v_p_inv_xy += complex_cvrz[irowcolumn] * complex_cp_inv[yindex]; yindex += n_aa; } complex_cvrz_p_inv[index2++] = complex_v_p_inv_xy; } } #endif if (only_need_i == -2) { // we need to compute all of these entries in dmlist #ifdef USE_ACCELERATE_CBLAS cblas_zgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, n_aa, n_aa, n_aa, &complex_one, complex_cp, n_aa, complex_cvrz_p_inv, n_aa, &complex_zero, complex_naa2_list, n_aa); // multiply cp and cvrz_p_inv into arr_dmlist[index : index + n_aa2] for (index2 = 0; index2 < n_aa2; index2++) { arr_dmlist[index + index2] = creal(complex_naa2_list[index2]); } index += n_aa2; #else // multiply the matrices in pure C code, and fill dmlist with the results // multiply cp and cvrz_p_inv into arr_dmlist[index : index + n_aa2] for (x = 0; x < n_aa2; x += n_aa) { for (y = 0; y < n_aa; y++) { complex_dmxy = 0.0; yindex = y; for (irowcolumn = x; irowcolumn < x + n_aa; irowcolumn++) { complex_dmxy += complex_cp[irowcolumn] * complex_cvrz_p_inv[yindex]; yindex += n_aa; } arr_dmlist[index++] = creal(complex_dmxy); } } #endif } else { // we need to compute entries in dmlist only for x = only_need_i only_need_i_n_aa = only_need_i * n_aa; index += only_need_i_n_aa; #ifdef USE_ACCELERATE_CBLAS // do the matrix vector multiplication using cblas, and put results in dmlist cblas_zgemv(CblasRowMajor, CblasTrans, n_aa, n_aa, &complex_one, complex_cvrz_p_inv, n_aa, &complex_cp[only_need_i_n_aa], 1, &complex_zero, complex_naa_list, 1); for (index2 = 0; index2 < n_aa; index2++) { arr_dmlist[index + index2] = creal(complex_naa_list[index2]); } index += n_aa2 - only_need_i_n_aa; #else // do the matrix vector multiplication in pure C code, and put results in mlist for (y = 0; y < n_aa; y++) { complex_dmxy = 0.0; yindex = y; for (irowcolumn = only_need_i_n_aa; irowcolumn < only_need_i_n_aa + n_aa; irowcolumn++) { complex_dmxy += complex_cp[irowcolumn] * complex_cvrz_p_inv[yindex]; yindex += n_aa; } arr_dmlist[index++] = creal(complex_dmxy); } index += n_aa2 - only_need_i_n_aa - n_aa; #endif } } } } free(complex_cexpd); free(complex_cvrz); free(complex_cvrz_p_inv); free(complex_naa2_list); free(complex_naa_list); } else { // array is of neither real nor complex doubles PyErr_SetString(PyExc_ValueError, "matrices are neither double nor complex doubles."); return NULL; } return PyInt_FromLong((long) 1); }
// This MList function creates a list of the M values giving the probabilities of mutations static PyObject *MList(PyObject *self, PyObject *args) { // Calling variables are (in order): uts, only_need, n_aa, length, grs, mlist, residue_to_compute PyObject *uts, *only_need, *grs, *r_grs_tuple, *gr_diag, *p, *p_inv, *mlist; long n_aa, length, n_aa2, n_uts, residue_to_compute, i_ut, y, index, irowcolumn, only_need_index, only_need_i, only_need_i_n_aa; double *arr_mlist, *cp_inv, *cp, *cgr_diag, *cp_inv_exp; complex double *complex_cp_inv, *complex_cp, *complex_cgr_diag, *complex_cp_inv_exp, *complex_naa2_list, *complex_naa_list; double ut, exp_ut; complex double complex_exp_ut; int array_type; #ifdef USE_ACCELERATE_CBLAS complex double complex_one = 1, complex_zero = 0; long index2; #else complex double complex_mxy; double mxy; long x, yindex; #endif // Parse the arguments. if (! PyArg_ParseTuple( args, "O!O!llO!O!l", &PyList_Type, &uts, &PyList_Type, &only_need, &n_aa, &length, &PyList_Type, &grs, &PyArray_Type, &mlist, &residue_to_compute)) { PyErr_SetString(PyExc_TypeError, "Invalid calling arguments to MList."); return NULL; } // Error checking on arguments if (length < 1) { // length of the protein PyErr_SetString(PyExc_ValueError, "length is less than one."); return NULL; } if (n_aa < 1) { // number of amino acids. Normally will be 20. PyErr_SetString(PyExc_ValueError, "n_aa is less than one."); return NULL; } if (PyList_GET_SIZE(grs) != length) { // make sure grs is of the same size as length // PyErr_SetString(PyExc_ValueError, "grs is not of the same size as length."); char errstring[200]; sprintf(errstring, "grs is not of the same size as length: %ld, %ld", PyList_GET_SIZE(grs), length); PyErr_SetString(PyExc_ValueError, errstring); return NULL; } n_uts = PyList_GET_SIZE(uts); // number of entries in uts if (n_uts < 1) { // make sure there are entries in uts PyErr_SetString(PyExc_ValueError, "uts has no entries."); return NULL; } if (PyList_GET_SIZE(only_need) != n_uts * length) { // make sure only_need is of correct size PyErr_SetString(PyExc_ValueError, "only_need is of wrong size."); return NULL; } if (! ((residue_to_compute >= 0) && (residue_to_compute < length))) { char errstring[200]; sprintf(errstring, "Invalid value for residue_to_compute: %ld, %ld", residue_to_compute, length); PyErr_SetString(PyExc_ValueError, errstring); return NULL; } n_aa2 = n_aa * n_aa; // square of the number of amino acids // The results will be returned in a numpy ndarray 'float_' (C type double) array called mlist. // This array will be of size length * n_uts * n_aa2. arr_mlist = (double *) PyArray_DATA(mlist); // this is the data array of mlist long const sizeof_cp_inv = n_aa2 * sizeof(double); long const complex_sizeof_cp_inv = n_aa2 * sizeof(complex double); // Now begin filling arr_mlist with the appropriate values index = 0; only_need_index = residue_to_compute * n_uts; r_grs_tuple = PyList_GET_ITEM(grs, residue_to_compute); // gr_diag, p, and p_inv are the eigenvalues, left, and right diagonalizing matrices of gr gr_diag = PyTuple_GET_ITEM(r_grs_tuple, 0); p = PyTuple_GET_ITEM(r_grs_tuple, 1); p_inv = PyTuple_GET_ITEM(r_grs_tuple, 2); // determine if these arrays are complex double or real doubles array_type = PyArray_TYPE(gr_diag); if (array_type == NPY_DOUBLE) { // array is of doubles, real not complex cp_inv_exp = (double *) malloc(sizeof_cp_inv); // allocate memory // Note that these next assignments assume that the arrays are C-style contiguous cp = PyArray_DATA(p); cp_inv = PyArray_DATA(p_inv); cgr_diag = PyArray_DATA(gr_diag); for (i_ut = 0; i_ut < n_uts; i_ut++) { // loop over ut values only_need_i = PyInt_AS_LONG(PyList_GET_ITEM(only_need, only_need_index)); // value of only_need only_need_index++; if (only_need_i == -1) { // we don't need to do anything for these entries in mlist index += n_aa2; } else { // we need to compute at least some entries in mlist ut = PyFloat_AS_DOUBLE(PyList_GET_ITEM(uts, i_ut)); // ut value for (irowcolumn = 0; irowcolumn < n_aa; irowcolumn++) { exp_ut = exp(ut * cgr_diag[irowcolumn]); for (y = irowcolumn * n_aa; y < (irowcolumn + 1) * n_aa; y++) { cp_inv_exp[y] = cp_inv[y] * exp_ut; } } if (only_need_i == -2) { // we need to compute all of these entries in mlist #ifdef USE_ACCELERATE_CBLAS // multiply the matrices using cblas_dgemm, and fill mlist with the results cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, n_aa, n_aa, n_aa, (double) 1.0, cp, n_aa, cp_inv_exp, n_aa, (double) 0.0, &arr_mlist[index], n_aa); for (index2 = index; index2 < index + n_aa2; index2++) { arr_mlist[index2] = fabs(arr_mlist[index2]); } index += n_aa2; #else // multiply the matrices in pure C code, and fill mlist with the results for (x = 0; x < n_aa2; x += n_aa) { for (y = 0; y < n_aa; y++) { mxy = 0.0; yindex = y; for (irowcolumn = x; irowcolumn < x + n_aa; irowcolumn++) { mxy += cp[irowcolumn] * cp_inv_exp[yindex]; yindex += n_aa; } arr_mlist[index++] = fabs(mxy); } } #endif } else { // we need to compute entries in mlist only for x = only_need_i only_need_i_n_aa = only_need_i * n_aa; index += only_need_i_n_aa; #ifdef USE_ACCELERATE_CBLAS // do the matrix vector multiplication using cblas, and put results in mlist cblas_dgemv(CblasRowMajor, CblasTrans, n_aa, n_aa, (double) 1.0, cp_inv_exp, n_aa, &cp[only_need_i_n_aa], 1, (double) 0.0, &arr_mlist[index], 1); for (index2 = index; index2 < index + n_aa2 - only_need_i_n_aa; index2++) { arr_mlist[index2] = fabs(arr_mlist[index2]); } index += n_aa2 - only_need_i_n_aa; #else // do the matrix vector multiplication in pure C code, and put results in mlist for (y = 0; y < n_aa; y++) { mxy = 0.0; yindex = y; for (irowcolumn = only_need_i_n_aa; irowcolumn < only_need_i_n_aa + n_aa; irowcolumn++) { mxy += cp[irowcolumn] * cp_inv_exp[yindex]; yindex += n_aa; } arr_mlist[index++] = fabs(mxy); } index += n_aa2 - only_need_i_n_aa - n_aa; #endif } } } free(cp_inv_exp); } else if (array_type == NPY_CDOUBLE) { // array is complex doubles complex_cp_inv_exp = (complex double *) malloc(complex_sizeof_cp_inv); // allocate memory complex_naa2_list = (complex double *) malloc(n_aa2 * sizeof(complex double)); // allocate memory complex_naa_list = (complex double *) malloc(n_aa * sizeof(complex double)); // allocate memory // Note that these next assignments assume that the arrays are C-style contiguous complex_cp = PyArray_DATA(p); complex_cp_inv = PyArray_DATA(p_inv); complex_cgr_diag = PyArray_DATA(gr_diag); for (i_ut = 0; i_ut < n_uts; i_ut++) { // loop over ut values only_need_i = PyInt_AS_LONG(PyList_GET_ITEM(only_need, only_need_index)); // value of only_need only_need_index++; if (only_need_i == -1) { // we don't need to do anything for these entries in mlist index += n_aa2; } else { // we need to compute at least some entries in mlist ut = PyFloat_AS_DOUBLE(PyList_GET_ITEM(uts, i_ut)); // ut value for (irowcolumn = 0; irowcolumn < n_aa; irowcolumn++) { complex_exp_ut = cexp(ut * complex_cgr_diag[irowcolumn]); for (y = irowcolumn * n_aa; y < (irowcolumn + 1) * n_aa; y++) { complex_cp_inv_exp[y] = complex_cp_inv[y] * complex_exp_ut; } } if (only_need_i == -2) { // we need to compute all of these entries in mlist #ifdef USE_ACCELERATE_CBLAS // multiply the matrices using cblas_zgemm, and fill mlist with the results cblas_zgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, n_aa, n_aa, n_aa, &complex_one, complex_cp, n_aa, complex_cp_inv_exp, n_aa, &complex_zero, complex_naa2_list, n_aa); for (index2 = 0; index2 < n_aa2; index2++) { arr_mlist[index + index2] = cabs(complex_naa2_list[index2]); } index += n_aa2; #else // multiply the matrices in pure C code, and fill mlist with the results for (x = 0; x < n_aa2; x += n_aa) { for (y = 0; y < n_aa; y++) { complex_mxy = (complex double) 0.0; yindex = y; for (irowcolumn = x; irowcolumn < x + n_aa; irowcolumn++) { complex_mxy += complex_cp[irowcolumn] * complex_cp_inv_exp[yindex]; yindex += n_aa; } arr_mlist[index++] = cabs(complex_mxy); } } #endif } else { // we need to compute entries in mlist only for x = only_need_i only_need_i_n_aa = only_need_i * n_aa; index += only_need_i_n_aa; #ifdef USE_ACCELERATE_CBLAS // do the matrix vector multiplication using cblas, and put results in mlist cblas_zgemv(CblasRowMajor, CblasTrans, n_aa, n_aa, &complex_one, complex_cp_inv_exp, n_aa, &complex_cp[only_need_i_n_aa], 1, &complex_zero, complex_naa_list, 1); for (index2 = 0; index2 < n_aa; index2++) { arr_mlist[index + index2] = cabs(complex_naa_list[index2]); } index += n_aa2 - only_need_i_n_aa; #else // do the matrix vector multiplication in pure C code, and put results in mlist for (y = 0; y < n_aa; y++) { complex_mxy = (complex double) 0.0; yindex = y; for (irowcolumn = only_need_i_n_aa; irowcolumn < only_need_i_n_aa + n_aa; irowcolumn++) { complex_mxy += complex_cp[irowcolumn] * complex_cp_inv_exp[yindex]; yindex += n_aa; } arr_mlist[index++] = cabs(complex_mxy); } index += n_aa2 - only_need_i_n_aa - n_aa; #endif } } } free(complex_cp_inv_exp); free(complex_naa2_list); free(complex_naa_list); } else { // array is of neither real nor complex doubles PyErr_SetString(PyExc_ValueError, "gr entries are neither double nor complex doubles."); return NULL; } return PyInt_FromLong((long) 1); }
void StateSet::orthonormalize() throw(std::exception) { // Handle the trivial case N=1 separately if (N == 1) { ortho_timer.start(); const double norm = (*state_array)[0].norm(); (*state_array)[0] *= 1.0/norm; ortho_timer.stop(); return; } ortho_timer.start(); dot_timer.start(); // NOTE: Because Eigensolver uses LAPACK, the overlap matrix is stored in column-major format #pragma omp parallel for for (size_t i=0; i<N; i++) { for (size_t j=i; j<N; j++) { overlapmatrix[N*j+i] = dot(i,j); } } dot_time += dot_timer.stop(); // Solve eigenvalue problem for the overlap matrix eigensolve_timer.start(); ESolver.solve(overlapmatrix); for (size_t n=0; n<N; n++) { const double eval = ESolver.eigenvalue(n); // Check that eigenvalues are OK. If states are propagated "too much", // they can become linearly dependent (or close enough so), which // causes the overlap matrix to have non-positive eigenvalues and as a // result the orthonormalization will fail. if (eval <= 0) { ortho_time += ortho_timer.stop(); eigensolve_time += eigensolve_timer.stop(); throw(NonPositiveEigenvalue(n, eval, overlapmatrix, N)); } else if (std::fpclassify(eval) != FP_NORMAL) { ortho_time += ortho_timer.stop(); eigensolve_time += eigensolve_timer.stop(); throw(NonNormalEigenvalue(n, eval, overlapmatrix, N)); } // Scale eigenvectors with the eigenvalues ESolver.scale_eigenvector(overlapmatrix, n, 1/sqrt(eval)); } eigensolve_time += eigensolve_timer.stop(); // Form orthonormal states from linear combinations lincomb_timer.start(); const comp one = 1; const comp zero = 0; const int iN = static_cast<int>(N); const int iM = static_cast<int>(datalayout.N); comp* const statedata = state_array->get_dataptr(); switch (ortho_algorithm) { case Default: // This is the in-place version, which uses less memory #pragma omp parallel { const size_t required_size = N*omp_get_num_threads(); const size_t thread_offset = N*omp_get_thread_num(); #pragma omp single { // Check for enough space on tempstate if (tempstate.size() < required_size) tempstate.resize(required_size); } comp* const temp = tempstate.data() + thread_offset; comp c; #pragma omp for for (size_t t=0; t<datalayout.N; t++) { // Save old state values cblas_zcopy(iN, statedata+t, iM, temp, 1); // Note that now overlapmatrix holds the eigenvectors cblas_zgemv(CblasRowMajor, CblasNoTrans, iN, iN, &one, overlapmatrix, iN, temp, 1, &zero, statedata+t, iM); } } break; case HighMem: // This is the out-of-place version, where the formation of linear // combinations can be expressed simply as a product of two (very // large) matrices. comp* const other_statedata = other_state_array->get_dataptr(); assert(statedata != NULL); assert(other_statedata != NULL); cblas_zgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, iN, iM, iN, &one, overlapmatrix, iN, statedata, iM, &zero, other_statedata, iM); switch_state_arrays(); break; } lincomb_time += lincomb_timer.stop(); ortho_time += ortho_timer.stop(); }
void batchf_zgemv( const enum BBLAS_TRANS trans, const int m, const int n, const BBLAS_Complex64_t alpha, const BBLAS_Complex64_t **arrayA, const int lda, const BBLAS_Complex64_t **arrayx, const int incx, const BBLAS_Complex64_t beta, BBLAS_Complex64_t **arrayy, const int incy, const int batch_count, int info) { /* Local variables */ int first_index = 0; int batch_iter = 0; char func_name[15] = "batchf_zgemv"; /* Check input arguments */ if (batch_count < 0) { xerbla_batch(func_name, BBLAS_ERR_BATCH_COUNT, -1); } if ((trans != BblasTrans) && (trans != BblasNoTrans) && (trans != BblasConjTrans)) { xerbla_batch(func_name, BBLAS_ERR_TRANS, first_index); info = BBLAS_ERR_TRANS; } if (m < 0) { xerbla_batch(func_name, BBLAS_ERR_M, first_index); info = BBLAS_ERR_M; } if (n < 0) { xerbla_batch(func_name, BBLAS_ERR_N, first_index); info = BBLAS_ERR_N; } /* Column major */ if ((lda < 1) && (lda < m)) { xerbla_batch(func_name, BBLAS_ERR_LDA, first_index); info = BBLAS_ERR_LDA; } if (incx < 1) { xerbla_batch(func_name, BBLAS_ERR_INCX, first_index); info = BBLAS_ERR_INCX; } if (incy < 1) { xerbla_batch(func_name, BBLAS_ERR_INCY, first_index); info = BBLAS_ERR_INCY; } /* Call CBLAS */ for (batch_iter = 0; batch_iter < batch_count; batch_iter++) { cblas_zgemv( BblasColMajor, trans, m, n, CBLAS_SADDR( alpha ), arrayA[batch_iter], lda, arrayx[batch_iter], incx, CBLAS_SADDR( beta ), arrayy[batch_iter], incy); } /* End fixed size for loop */ /* Successful */ info = BBLAS_SUCCESS; }