void THBlas_(swap)(long n, real *x, long incx, real *y, long incy) { if(n == 1) { incx = 1; incy = 1; } #if defined(USE_BLAS) && (defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT)) if( (n <= INT_MAX) && (incx <= INT_MAX) && (incy <= INT_MAX) ) { int i_n = (int)n; int i_incx = (int)incx; int i_incy = (int)incy; #if defined(TH_REAL_IS_DOUBLE) cblas_dswap(i_n, x, i_incx, y, i_incy); #else cblas_sswap(i_n, x, i_incx, y, i_incy); #endif return; } #endif { long i; for(i = 0; i < n; i++) { real z = x[i*incx]; x[i*incx] = y[i*incy]; y[i*incy] = z; } } }
void THBlas_swap(long size, real *x, long xStride, real *y, long yStride) { if(size == 1) { xStride = 1; yStride = 1; } #if USE_CBLAS if( (size < INT_MAX) && (xStride < INT_MAX) && (yStride < INT_MAX) ) { #ifdef USE_DOUBLE cblas_dswap(size, x, xStride, y, yStride); #else cblas_sswap(size, x, xStride, y, yStride); #endif return; } #endif { long i; for(i = 0; i < size; i++) { real z = x[i*xStride]; x[i*xStride] = y[i*yStride]; y[i*yStride] = z; } } }
int main(int argc, char const *argv[]) { cblas_dswap(N, X, incx, Y, incy); printResult(); ContextDestroy(); return 0; }
JNIEXPORT void JNICALL Java_uncomplicate_neanderthal_CBLAS_dswap (JNIEnv *env, jclass clazz, jint N, jobject X, jint offsetX, jint incX, jobject Y, jint offsetY, jint incY) { double *cX = (double *) (*env)->GetDirectBufferAddress(env, X); double *cY = (double *) (*env)->GetDirectBufferAddress(env, Y); cblas_dswap(N, cX + offsetX, incX, cY + offsetY, incY); };
void bli_dswap( int n, double* x, int incx, double* y, int incy ) { #ifdef BLIS_ENABLE_CBLAS_INTERFACES cblas_dswap( n, x, incx, y, incy ); #else F77_dswap( &n, x, &incx, y, &incy ); #endif }
int CORE_dlaswp_ontile(PLASMA_desc descA, int i1, int i2, const int *ipiv, int inc) { int i, j, ip, it; double *A1; int lda1, lda2; /* Change i1 to C notation */ i1--; /* Check parameters */ if ( descA.nt > 1 ) { coreblas_error(1, "Illegal value of descA.nt"); return -1; } if ( i1 < 0 ) { coreblas_error(2, "Illegal value of i1"); return -2; } if ( (i2 <= i1) || (i2 > descA.m) ) { coreblas_error(3, "Illegal value of i2"); return -3; } if ( ! ( (i2 - i1 - i1%descA.mb -1) < descA.mb ) ) { coreblas_error(2, "Illegal value of i1,i2. They have to be part of the same block."); return -3; } if (inc > 0) { it = i1 / descA.mb; A1 = A(it, 0); lda1 = BLKLDD(descA, 0); for (j = i1; j < i2; ++j, ipiv+=inc) { ip = (*ipiv) - descA.i - 1; if ( ip != j ) { it = ip / descA.mb; i = ip % descA.mb; lda2 = BLKLDD(descA, it); cblas_dswap(descA.n, A1 + j, lda1, A(it, 0) + i, lda2 ); } } } else { it = (i2-1) / descA.mb; A1 = A(it, 0); lda1 = BLKLDD(descA, it); i1--; ipiv = &ipiv[(1-i2)*inc]; for (j = i2-1; j > i1; --j, ipiv+=inc) { ip = (*ipiv) - descA.i - 1; if ( ip != j ) { it = ip / descA.mb; i = ip % descA.mb; lda2 = BLKLDD(descA, it); cblas_dswap(descA.n, A1 + j, lda1, A(it, 0) + i, lda2 ); } } } return PLASMA_SUCCESS; }
static void CORE_dgetrf_rectil_rec(const PLASMA_desc A, int *IPIV, int *info, double *pivot, int thidx, int thcnt, int column, int width, int ft, int lt) { int ld, jp, n1, n2, lm, tmpM, piv_sf; int ip, j, it, i, ldft; int max_i, max_it, thwin; double zone = 1.0; double mzone = -1.0; double tmp1; double tmp2 = 0.; double pivval; double *Atop, *Atop2, *U, *L; double abstmp1; int offset = A.i; ldft = BLKLDD(A, 0); Atop = A(0, 0) + column * ldft; if ( width > 1 ) { /* Assumption: N = min( M, N ); */ n1 = width / 2; n2 = width - n1; Atop2 = Atop + n1 * ldft; CORE_dgetrf_rectil_rec( A, IPIV, info, pivot, thidx, thcnt, column, n1, ft, lt ); if ( *info != 0 ) return; if (thidx == 0) { /* Swap to the right */ int *lipiv = IPIV+column; int idxMax = column+n1; for (j = column; j < idxMax; ++j, ++lipiv) { ip = (*lipiv) - offset - 1; if ( ip != j ) { it = ip / A.mb; i = ip % A.mb; ld = BLKLDD(A, it); cblas_dswap(n2, Atop2 + j, ldft, A(it, 0) + (column+n1)*ld + i, ld ); } } /* Trsm on the upper part */ U = Atop2 + column; cblas_dtrsm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, CblasUnit, n1, n2, (zone), Atop + column, ldft, U, ldft ); /* SIgnal to other threads that they can start update */ CORE_dbarrier_thread( thidx, thcnt ); pivval = *pivot; if ( pivval == 0.0 ) { *info = column+n1; return; } else { if ( fabs(pivval) >= sfmin ) { piv_sf = 1; pivval = 1.0 / pivval; } else { piv_sf = 0; } } /* First tile */ { L = Atop + column + n1; tmpM = min(ldft, A.m) - column - n1; /* Scale last column of L */ if ( piv_sf ) { cblas_dscal( tmpM, (pivval), L+(n1-1)*ldft, 1 ); } else { int i; Atop2 = L+(n1-1)*ldft; for( i=0; i < tmpM; i++, Atop2++) *Atop2 = *Atop2 / pivval; } /* Apply the GEMM */ cblas_dgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, tmpM, n2, n1, (mzone), L, ldft, U, ldft, (zone), U + n1, ldft ); /* Search Max in first column of U+n1 */ tmp2 = U[n1]; max_it = ft; max_i = cblas_idamax( tmpM, U+n1, 1 ) + n1; tmp1 = U[max_i]; abstmp1 = fabs(tmp1); max_i += column; } } else { pivval = *pivot; if ( pivval == 0.0 ) { *info = column+n1; return; } else { if ( fabs(pivval) >= sfmin ) { piv_sf = 1; pivval = 1.0 / pivval; } else { piv_sf = 0; } } ld = BLKLDD( A, ft ); L = A( ft, 0 ) + column * ld; lm = ft == A.mt-1 ? A.m - ft * A.mb : A.mb; U = Atop2 + column; /* First tile */ /* Scale last column of L */ if ( piv_sf ) { cblas_dscal( lm, (pivval), L+(n1-1)*ld, 1 ); } else { int i; Atop2 = L+(n1-1)*ld; for( i=0; i < lm; i++, Atop2++) *Atop2 = *Atop2 / pivval; } /* Wait for pivoting and triangular solve to be finished * before to really start the update */ CORE_dbarrier_thread( thidx, thcnt ); /* Apply the GEMM */ cblas_dgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, lm, n2, n1, (mzone), L, ld, U, ldft, (zone), L + n1*ld, ld ); /* Search Max in first column of L+n1*ld */ max_it = ft; max_i = cblas_idamax( lm, L+n1*ld, 1 ); tmp1 = L[n1*ld+max_i]; abstmp1 = fabs(tmp1); } /* Update the other blocks */ for( it = ft+1; it < lt; it++) { ld = BLKLDD( A, it ); L = A( it, 0 ) + column * ld; lm = it == A.mt-1 ? A.m - it * A.mb : A.mb; /* Scale last column of L */ if ( piv_sf ) { cblas_dscal( lm, (pivval), L+(n1-1)*ld, 1 ); } else { int i; Atop2 = L+(n1-1)*ld; for( i=0; i < lm; i++, Atop2++) *Atop2 = *Atop2 / pivval; } /* Apply the GEMM */ cblas_dgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, lm, n2, n1, (mzone), L, ld, U, ldft, (zone), L + n1*ld, ld ); /* Search the max on the first column of L+n1*ld */ jp = cblas_idamax( lm, L+n1*ld, 1 ); if ( fabs( L[n1*ld+jp] ) > abstmp1 ) { tmp1 = L[n1*ld+jp]; abstmp1 = fabs(tmp1); max_i = jp; max_it = it; } } jp = offset + max_it*A.mb + max_i; CORE_damax1_thread( tmp1, thidx, thcnt, &thwin, &tmp2, pivot, jp + 1, IPIV + column + n1 ); if ( thidx == 0 ) { U[n1] = *pivot; /* all threads have the pivot element: no need for synchronization */ } if (thwin == thidx) { /* the thread that owns the best pivot */ if ( jp-offset != column+n1 ) /* if there is a need to exchange the pivot */ { ld = BLKLDD(A, max_it); Atop2 = A( max_it, 0 ) + (column + n1 )* ld + max_i; *Atop2 = tmp2; } } CORE_dgetrf_rectil_rec( A, IPIV, info, pivot, thidx, thcnt, column+n1, n2, ft, lt ); if ( *info != 0 ) return; if ( thidx == 0 ) { /* Swap to the left */ int *lipiv = IPIV+column+n1; int idxMax = column+width; for (j = column+n1; j < idxMax; ++j, ++lipiv) { ip = (*lipiv) - offset - 1; if ( ip != j ) { it = ip / A.mb; i = ip % A.mb; ld = BLKLDD(A, it); cblas_dswap(n1, Atop + j, ldft, A(it, 0) + column*ld + i, ld ); } } } } else if ( width == 1 ) { /* Search maximum for column 0 */ if ( column == 0 ) { if ( thidx == 0 ) tmp2 = Atop[column]; /* First tmp1 */ ld = BLKLDD(A, ft); Atop2 = A( ft, 0 ); lm = ft == A.mt-1 ? A.m - ft * A.mb : A.mb; max_it = ft; max_i = cblas_idamax( lm, Atop2, 1 ); tmp1 = Atop2[max_i]; abstmp1 = fabs(tmp1); /* Update */ for( it = ft+1; it < lt; it++) { Atop2= A( it, 0 ); lm = it == A.mt-1 ? A.m - it * A.mb : A.mb; jp = cblas_idamax( lm, Atop2, 1 ); if ( fabs(Atop2[jp]) > abstmp1 ) { tmp1 = Atop2[jp]; abstmp1 = fabs(tmp1); max_i = jp; max_it = it; } } jp = offset + max_it*A.mb + max_i; CORE_damax1_thread( tmp1, thidx, thcnt, &thwin, &tmp2, pivot, jp + 1, IPIV + column ); if ( thidx == 0 ) { Atop[0] = *pivot; /* all threads have the pivot element: no need for synchronization */ } if (thwin == thidx) { /* the thread that owns the best pivot */ if ( jp-offset != 0 ) /* if there is a need to exchange the pivot */ { Atop2 = A( max_it, 0 ) + max_i; *Atop2 = tmp2; } } } CORE_dbarrier_thread( thidx, thcnt ); /* If it is the last column, we just scale */ if ( column == (min(A.m, A.n))-1 ) { pivval = *pivot; if ( pivval != 0.0 ) { if ( thidx == 0 ) { if ( fabs(pivval) >= sfmin ) { pivval = 1.0 / pivval; /* * We guess than we never enter the function with m == A.mt-1 * because it means that there is only one thread */ lm = ft == A.mt-1 ? A.m - ft * A.mb : A.mb; cblas_dscal( lm - column - 1, (pivval), Atop+column+1, 1 ); for( it = ft+1; it < lt; it++) { ld = BLKLDD(A, it); Atop2 = A( it, 0 ) + column * ld; lm = it == A.mt-1 ? A.m - it * A.mb : A.mb; cblas_dscal( lm, (pivval), Atop2, 1 ); } } else { /* * We guess than we never enter the function with m == A.mt-1 * because it means that there is only one thread */ int i; Atop2 = Atop + column + 1; lm = ft == A.mt-1 ? A.m - ft * A.mb : A.mb; for( i=0; i < lm-column-1; i++, Atop2++) *Atop2 = *Atop2 / pivval; for( it = ft+1; it < lt; it++) { ld = BLKLDD(A, it); Atop2 = A( it, 0 ) + column * ld; lm = it == A.mt-1 ? A.m - it * A.mb : A.mb; for( i=0; i < lm; i++, Atop2++) *Atop2 = *Atop2 / pivval; } } } else { if ( fabs(pivval) >= sfmin ) { pivval = 1.0 / pivval; for( it = ft; it < lt; it++) { ld = BLKLDD(A, it); Atop2 = A( it, 0 ) + column * ld; lm = it == A.mt-1 ? A.m - it * A.mb : A.mb; cblas_dscal( lm, (pivval), Atop2, 1 ); } } else { /* * We guess than we never enter the function with m == A.mt-1 * because it means that there is only one thread */ int i; for( it = ft; it < lt; it++) { ld = BLKLDD(A, it); Atop2 = A( it, 0 ) + column * ld; lm = it == A.mt-1 ? A.m - it * A.mb : A.mb; for( i=0; i < lm; i++, Atop2++) *Atop2 = *Atop2 / pivval; } } } } else { *info = column + 1; return; } } } }
static inline void CORE_dgetrf_rectil_update(const PLASMA_desc A, int *IPIV, int column, int n1, int n2, int thidx, int thcnt, int ft, int lt) { int ld, lm, tmpM; int ip, j, it, i, ldft; double zone = 1.0; double mzone = -1.0; double *Atop, *Atop2, *U, *L; int offset = A.i; ldft = BLKLDD(A, 0); Atop = A(0, 0) + column * ldft; Atop2 = Atop + n1 * ldft; if (thidx == 0) { /* Swap to the right */ int *lipiv = IPIV+column; int idxMax = column+n1; for (j = column; j < idxMax; ++j, ++lipiv) { ip = (*lipiv) - offset - 1; if ( ip != j ) { it = ip / A.mb; i = ip % A.mb; ld = BLKLDD(A, it); cblas_dswap(n2, Atop2 + j, ldft, A(it, 0) + (column+n1)*ld + i, ld ); } } /* Trsm on the upper part */ U = Atop2 + column; cblas_dtrsm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, CblasUnit, n1, n2, (zone), Atop + column, ldft, U, ldft ); /* Signal to other threads that they can start update */ CORE_dbarrier_thread( thidx, thcnt ); /* First tile */ L = Atop + column + n1; tmpM = min(ldft, A.m) - column - n1; /* Apply the GEMM */ cblas_dgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, tmpM, n2, n1, (mzone), L, ldft, U, ldft, (zone), U + n1, ldft ); } else { ld = BLKLDD( A, ft ); L = A( ft, 0 ) + column * ld; lm = ft == A.mt-1 ? A.m - ft * A.mb : A.mb; U = Atop2 + column; /* Wait for pivoting and triangular solve to be finished * before to really start the update */ CORE_dbarrier_thread( thidx, thcnt ); /* First tile */ /* Apply the GEMM */ cblas_dgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, lm, n2, n1, (mzone), L, ld, U, ldft, (zone), L + n1*ld, ld ); } /* Update the other blocks */ for( it = ft+1; it < lt; it++) { ld = BLKLDD( A, it ); L = A( it, 0 ) + column * ld; lm = it == A.mt-1 ? A.m - it * A.mb : A.mb; /* Apply the GEMM */ cblas_dgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, lm, n2, n1, (mzone), L, ld, U, ldft, (zone), L + n1*ld, ld ); } }
void test_swap (void) { const double flteps = 1e-4, dbleps = 1e-6; { int N = 1; float X[] = { 0.539f }; int incX = 1; float Y[] = { -0.262f }; int incY = -1; float expected1[] = { -0.262f }; float expected2[] = { 0.539f }; cblas_sswap(N, X, incX, Y, incY); { int i; for (i = 0; i < 1; i++) { gsl_test_rel(X[i], expected1[i], flteps, "sswap(case 88)"); } }; { int i; for (i = 0; i < 1; i++) { gsl_test_rel(Y[i], expected2[i], flteps, "sswap(case 89)"); } }; }; { int N = 1; double X[] = { 0.906 }; int incX = 1; double Y[] = { 0.373 }; int incY = -1; double expected1[] = { 0.373 }; double expected2[] = { 0.906 }; cblas_dswap(N, X, incX, Y, incY); { int i; for (i = 0; i < 1; i++) { gsl_test_rel(X[i], expected1[i], dbleps, "dswap(case 90)"); } }; { int i; for (i = 0; i < 1; i++) { gsl_test_rel(Y[i], expected2[i], dbleps, "dswap(case 91)"); } }; }; { int N = 1; float X[] = { -0.316f, -0.529f }; int incX = 1; float Y[] = { -0.313f, 0.363f }; int incY = -1; float expected1[] = { -0.313f, 0.363f }; float expected2[] = { -0.316f, -0.529f }; cblas_cswap(N, X, incX, Y, incY); { int i; for (i = 0; i < 1; i++) { gsl_test_rel(X[2*i], expected1[2*i], flteps, "cswap(case 92) real"); gsl_test_rel(X[2*i+1], expected1[2*i+1], flteps, "cswap(case 92) imag"); }; }; { int i; for (i = 0; i < 1; i++) { gsl_test_rel(Y[2*i], expected2[2*i], flteps, "cswap(case 93) real"); gsl_test_rel(Y[2*i+1], expected2[2*i+1], flteps, "cswap(case 93) imag"); }; }; }; { int N = 1; double X[] = { 0.512, -0.89 }; int incX = 1; double Y[] = { -0.225, -0.511 }; int incY = -1; double expected1[] = { -0.225, -0.511 }; double expected2[] = { 0.512, -0.89 }; cblas_zswap(N, X, incX, Y, incY); { int i; for (i = 0; i < 1; i++) { gsl_test_rel(X[2*i], expected1[2*i], dbleps, "zswap(case 94) real"); gsl_test_rel(X[2*i+1], expected1[2*i+1], dbleps, "zswap(case 94) imag"); }; }; { int i; for (i = 0; i < 1; i++) { gsl_test_rel(Y[2*i], expected2[2*i], dbleps, "zswap(case 95) real"); gsl_test_rel(Y[2*i+1], expected2[2*i+1], dbleps, "zswap(case 95) imag"); }; }; }; { int N = 1; float X[] = { 0.336f }; int incX = -1; float Y[] = { -0.431f }; int incY = 1; float expected1[] = { -0.431f }; float expected2[] = { 0.336f }; cblas_sswap(N, X, incX, Y, incY); { int i; for (i = 0; i < 1; i++) { gsl_test_rel(X[i], expected1[i], flteps, "sswap(case 96)"); } }; { int i; for (i = 0; i < 1; i++) { gsl_test_rel(Y[i], expected2[i], flteps, "sswap(case 97)"); } }; }; { int N = 1; double X[] = { 0.764 }; int incX = -1; double Y[] = { -0.293 }; int incY = 1; double expected1[] = { -0.293 }; double expected2[] = { 0.764 }; cblas_dswap(N, X, incX, Y, incY); { int i; for (i = 0; i < 1; i++) { gsl_test_rel(X[i], expected1[i], dbleps, "dswap(case 98)"); } }; { int i; for (i = 0; i < 1; i++) { gsl_test_rel(Y[i], expected2[i], dbleps, "dswap(case 99)"); } }; }; { int N = 1; float X[] = { -0.239f, 0.361f }; int incX = -1; float Y[] = { 0.149f, 0.347f }; int incY = 1; float expected1[] = { 0.149f, 0.347f }; float expected2[] = { -0.239f, 0.361f }; cblas_cswap(N, X, incX, Y, incY); { int i; for (i = 0; i < 1; i++) { gsl_test_rel(X[2*i], expected1[2*i], flteps, "cswap(case 100) real"); gsl_test_rel(X[2*i+1], expected1[2*i+1], flteps, "cswap(case 100) imag"); }; }; { int i; for (i = 0; i < 1; i++) { gsl_test_rel(Y[2*i], expected2[2*i], flteps, "cswap(case 101) real"); gsl_test_rel(Y[2*i+1], expected2[2*i+1], flteps, "cswap(case 101) imag"); }; }; }; { int N = 1; double X[] = { -0.171, -0.936 }; int incX = -1; double Y[] = { 0.495, -0.835 }; int incY = 1; double expected1[] = { 0.495, -0.835 }; double expected2[] = { -0.171, -0.936 }; cblas_zswap(N, X, incX, Y, incY); { int i; for (i = 0; i < 1; i++) { gsl_test_rel(X[2*i], expected1[2*i], dbleps, "zswap(case 102) real"); gsl_test_rel(X[2*i+1], expected1[2*i+1], dbleps, "zswap(case 102) imag"); }; }; { int i; for (i = 0; i < 1; i++) { gsl_test_rel(Y[2*i], expected2[2*i], dbleps, "zswap(case 103) real"); gsl_test_rel(Y[2*i+1], expected2[2*i+1], dbleps, "zswap(case 103) imag"); }; }; }; { int N = 1; float X[] = { -0.405f }; int incX = -1; float Y[] = { -0.213f }; int incY = -1; float expected1[] = { -0.213f }; float expected2[] = { -0.405f }; cblas_sswap(N, X, incX, Y, incY); { int i; for (i = 0; i < 1; i++) { gsl_test_rel(X[i], expected1[i], flteps, "sswap(case 104)"); } }; { int i; for (i = 0; i < 1; i++) { gsl_test_rel(Y[i], expected2[i], flteps, "sswap(case 105)"); } }; }; { int N = 1; double X[] = { -0.761 }; int incX = -1; double Y[] = { -0.585 }; int incY = -1; double expected1[] = { -0.585 }; double expected2[] = { -0.761 }; cblas_dswap(N, X, incX, Y, incY); { int i; for (i = 0; i < 1; i++) { gsl_test_rel(X[i], expected1[i], dbleps, "dswap(case 106)"); } }; { int i; for (i = 0; i < 1; i++) { gsl_test_rel(Y[i], expected2[i], dbleps, "dswap(case 107)"); } }; }; { int N = 1; float X[] = { 0.853f, 0.146f }; int incX = -1; float Y[] = { 0.009f, -0.178f }; int incY = -1; float expected1[] = { 0.009f, -0.178f }; float expected2[] = { 0.853f, 0.146f }; cblas_cswap(N, X, incX, Y, incY); { int i; for (i = 0; i < 1; i++) { gsl_test_rel(X[2*i], expected1[2*i], flteps, "cswap(case 108) real"); gsl_test_rel(X[2*i+1], expected1[2*i+1], flteps, "cswap(case 108) imag"); }; }; { int i; for (i = 0; i < 1; i++) { gsl_test_rel(Y[2*i], expected2[2*i], flteps, "cswap(case 109) real"); gsl_test_rel(Y[2*i+1], expected2[2*i+1], flteps, "cswap(case 109) imag"); }; }; }; { int N = 1; double X[] = { -0.228, 0.386 }; int incX = -1; double Y[] = { 0.988, -0.084 }; int incY = -1; double expected1[] = { 0.988, -0.084 }; double expected2[] = { -0.228, 0.386 }; cblas_zswap(N, X, incX, Y, incY); { int i; for (i = 0; i < 1; i++) { gsl_test_rel(X[2*i], expected1[2*i], dbleps, "zswap(case 110) real"); gsl_test_rel(X[2*i+1], expected1[2*i+1], dbleps, "zswap(case 110) imag"); }; }; { int i; for (i = 0; i < 1; i++) { gsl_test_rel(Y[2*i], expected2[2*i], dbleps, "zswap(case 111) real"); gsl_test_rel(Y[2*i+1], expected2[2*i+1], dbleps, "zswap(case 111) imag"); }; }; }; }
int CORE_dtstrf(int M, int N, int IB, int NB, double *U, int LDU, double *A, int LDA, double *L, int LDL, int *IPIV, double *WORK, int LDWORK, int *INFO) { static double zzero = 0.0; static double mzone =-1.0; double alpha; int i, j, ii, sb; int im, ip; /* Check input arguments */ *INFO = 0; if (M < 0) { coreblas_error(1, "Illegal value of M"); return -1; } if (N < 0) { coreblas_error(2, "Illegal value of N"); return -2; } if (IB < 0) { coreblas_error(3, "Illegal value of IB"); return -3; } if ((LDU < max(1,NB)) && (NB > 0)) { coreblas_error(6, "Illegal value of LDU"); return -6; } if ((LDA < max(1,M)) && (M > 0)) { coreblas_error(8, "Illegal value of LDA"); return -8; } if ((LDL < max(1,IB)) && (IB > 0)) { coreblas_error(10, "Illegal value of LDL"); return -10; } /* Quick return */ if ((M == 0) || (N == 0) || (IB == 0)) return PLASMA_SUCCESS; /* Set L to 0 */ memset(L, 0, LDL*N*sizeof(double)); ip = 0; for (ii = 0; ii < N; ii += IB) { sb = min(N-ii, IB); for (i = 0; i < sb; i++) { im = cblas_idamax(M, &A[LDA*(ii+i)], 1); IPIV[ip] = ii+i+1; if (fabs(A[LDA*(ii+i)+im]) > fabs(U[LDU*(ii+i)+ii+i])) { /* * Swap behind. */ cblas_dswap(i, &L[LDL*ii+i], LDL, &WORK[im], LDWORK ); /* * Swap ahead. */ cblas_dswap(sb-i, &U[LDU*(ii+i)+ii+i], LDU, &A[LDA*(ii+i)+im], LDA ); /* * Set IPIV. */ IPIV[ip] = NB + im + 1; for (j = 0; j < i; j++) { A[LDA*(ii+j)+im] = zzero; } } if ((*INFO == 0) && (fabs(A[LDA*(ii+i)+im]) == zzero) && (fabs(U[LDU*(ii+i)+ii+i]) == zzero)) { *INFO = ii+i+1; } alpha = ((double)1. / U[LDU*(ii+i)+ii+i]); cblas_dscal(M, (alpha), &A[LDA*(ii+i)], 1); cblas_dcopy(M, &A[LDA*(ii+i)], 1, &WORK[LDWORK*i], 1); cblas_dger( CblasColMajor, M, sb-i-1, (mzone), &A[LDA*(ii+i)], 1, &U[LDU*(ii+i+1)+ii+i], LDU, &A[LDA*(ii+i+1)], LDA ); ip = ip+1; } /* * Apply the subpanel to the rest of the panel. */ if(ii+i < N) { for(j = ii; j < ii+sb; j++) { if (IPIV[j] <= NB) { IPIV[j] = IPIV[j] - ii; } } CORE_dssssm( NB, N-(ii+sb), M, N-(ii+sb), sb, sb, &U[LDU*(ii+sb)+ii], LDU, &A[LDA*(ii+sb)], LDA, &L[LDL*ii], LDL, WORK, LDWORK, &IPIV[ii]); for(j = ii; j < ii+sb; j++) { if (IPIV[j] <= NB) { IPIV[j] = IPIV[j] + ii; } } } } return PLASMA_SUCCESS; }
__cminpack_attr__ void __cminpack_func__(lmpar)(int n, real *r, int ldr, const int *ipvt, const real *diag, const real *qtb, real delta, real *par, real *x, real *sdiag, real *wa1, real *wa2) { /* Initialized data */ #define p1 .1 #define p001 .001 /* System generated locals */ real d1, d2; /* Local variables */ int j, l; real fp; real parc, parl; int iter; real temp, paru, dwarf; int nsing; real gnorm; real dxnorm; /* ********** */ /* subroutine lmpar */ /* given an m by n matrix a, an n by n nonsingular diagonal */ /* matrix d, an m-vector b, and a positive number delta, */ /* the problem is to determine a value for the parameter */ /* par such that if x solves the system */ /* a*x = b , sqrt(par)*d*x = 0 , */ /* in the least squares sense, and dxnorm is the euclidean */ /* norm of d*x, then either par is zero and */ /* (dxnorm-delta) .le. 0.1*delta , */ /* or par is positive and */ /* abs(dxnorm-delta) .le. 0.1*delta . */ /* this subroutine completes the solution of the problem */ /* if it is provided with the necessary information from the */ /* qr factorization, with column pivoting, of a. that is, if */ /* a*p = q*r, where p is a permutation matrix, q has orthogonal */ /* columns, and r is an upper triangular matrix with diagonal */ /* elements of nonincreasing magnitude, then lmpar expects */ /* the full upper triangle of r, the permutation matrix p, */ /* and the first n components of (q transpose)*b. on output */ /* lmpar also provides an upper triangular matrix s such that */ /* t t t */ /* p *(a *a + par*d*d)*p = s *s . */ /* s is employed within lmpar and may be of separate interest. */ /* only a few iterations are generally needed for convergence */ /* of the algorithm. if, however, the limit of 10 iterations */ /* is reached, then the output par will contain the best */ /* value obtained so far. */ /* the subroutine statement is */ /* subroutine lmpar(n,r,ldr,ipvt,diag,qtb,delta,par,x,sdiag, */ /* wa1,wa2) */ /* where */ /* n is a positive integer input variable set to the order of r. */ /* r is an n by n array. on input the full upper triangle */ /* must contain the full upper triangle of the matrix r. */ /* on output the full upper triangle is unaltered, and the */ /* strict lower triangle contains the strict upper triangle */ /* (transposed) of the upper triangular matrix s. */ /* ldr is a positive integer input variable not less than n */ /* which specifies the leading dimension of the array r. */ /* ipvt is an integer input array of length n which defines the */ /* permutation matrix p such that a*p = q*r. column j of p */ /* is column ipvt(j) of the identity matrix. */ /* diag is an input array of length n which must contain the */ /* diagonal elements of the matrix d. */ /* qtb is an input array of length n which must contain the first */ /* n elements of the vector (q transpose)*b. */ /* delta is a positive input variable which specifies an upper */ /* bound on the euclidean norm of d*x. */ /* par is a nonnegative variable. on input par contains an */ /* initial estimate of the levenberg-marquardt parameter. */ /* on output par contains the final estimate. */ /* x is an output array of length n which contains the least */ /* squares solution of the system a*x = b, sqrt(par)*d*x = 0, */ /* for the output par. */ /* sdiag is an output array of length n which contains the */ /* diagonal elements of the upper triangular matrix s. */ /* wa1 and wa2 are work arrays of length n. */ /* subprograms called */ /* minpack-supplied ... dpmpar,enorm,qrsolv */ /* fortran-supplied ... dabs,dmax1,dmin1,dsqrt */ /* argonne national laboratory. minpack project. march 1980. */ /* burton s. garbow, kenneth e. hillstrom, jorge j. more */ /* ********** */ /* dwarf is the smallest positive magnitude. */ dwarf = __cminpack_func__(dpmpar)(2); /* compute and store in x the gauss-newton direction. if the */ /* jacobian is rank-deficient, obtain a least squares solution. */ nsing = n; for (j = 0; j < n; ++j) { wa1[j] = qtb[j]; if (r[j + j * ldr] == 0. && nsing == n) { nsing = j; } if (nsing < n) { wa1[j] = 0.; } } # ifdef USE_CBLAS cblas_dtrsv(CblasColMajor, CblasUpper, CblasNoTrans, CblasNonUnit, nsing, r, ldr, wa1, 1); # else if (nsing >= 1) { int k; for (k = 1; k <= nsing; ++k) { j = nsing - k; wa1[j] /= r[j + j * ldr]; temp = wa1[j]; if (j >= 1) { int i; for (i = 0; i < j; ++i) { wa1[i] -= r[i + j * ldr] * temp; } } } } # endif for (j = 0; j < n; ++j) { l = ipvt[j]-1; x[l] = wa1[j]; } /* initialize the iteration counter. */ /* evaluate the function at the origin, and test */ /* for acceptance of the gauss-newton direction. */ iter = 0; for (j = 0; j < n; ++j) { wa2[j] = diag[j] * x[j]; } dxnorm = __cminpack_enorm__(n, wa2); fp = dxnorm - delta; if (fp <= p1 * delta) { goto TERMINATE; } /* if the jacobian is not rank deficient, the newton */ /* step provides a lower bound, parl, for the zero of */ /* the function. otherwise set this bound to zero. */ parl = 0.; if (nsing >= n) { for (j = 0; j < n; ++j) { l = ipvt[j]-1; wa1[j] = diag[l] * (wa2[l] / dxnorm); } # ifdef USE_CBLAS cblas_dtrsv(CblasColMajor, CblasUpper, CblasTrans, CblasNonUnit, n, r, ldr, wa1, 1); # else for (j = 0; j < n; ++j) { real sum = 0.; if (j >= 1) { int i; for (i = 0; i < j; ++i) { sum += r[i + j * ldr] * wa1[i]; } } wa1[j] = (wa1[j] - sum) / r[j + j * ldr]; } # endif temp = __cminpack_enorm__(n, wa1); parl = fp / delta / temp / temp; } /* calculate an upper bound, paru, for the zero of the function. */ for (j = 0; j < n; ++j) { real sum; # ifdef USE_CBLAS sum = cblas_ddot(j+1, &r[j*ldr], 1, qtb, 1); # else sum = 0.; int i; for (i = 0; i <= j; ++i) { sum += r[i + j * ldr] * qtb[i]; } # endif l = ipvt[j]-1; wa1[j] = sum / diag[l]; } gnorm = __cminpack_enorm__(n, wa1); paru = gnorm / delta; if (paru == 0.) { paru = dwarf / min(delta,(real)p1) /* / p001 ??? */; } /* if the input par lies outside of the interval (parl,paru), */ /* set par to the closer endpoint. */ *par = max(*par,parl); *par = min(*par,paru); if (*par == 0.) { *par = gnorm / dxnorm; } /* beginning of an iteration. */ for (;;) { ++iter; /* evaluate the function at the current value of par. */ if (*par == 0.) { /* Computing MAX */ d1 = dwarf, d2 = p001 * paru; *par = max(d1,d2); } temp = sqrt(*par); for (j = 0; j < n; ++j) { wa1[j] = temp * diag[j]; } __cminpack_func__(qrsolv)(n, r, ldr, ipvt, wa1, qtb, x, sdiag, wa2); for (j = 0; j < n; ++j) { wa2[j] = diag[j] * x[j]; } dxnorm = __cminpack_enorm__(n, wa2); temp = fp; fp = dxnorm - delta; /* if the function is small enough, accept the current value */ /* of par. also test for the exceptional cases where parl */ /* is zero or the number of iterations has reached 10. */ if (fabs(fp) <= p1 * delta || (parl == 0. && fp <= temp && temp < 0.) || iter == 10) { goto TERMINATE; } /* compute the newton correction. */ # ifdef USE_CBLAS for (j = 0; j < nsing; ++j) { l = ipvt[j]-1; wa1[j] = diag[l] * (wa2[l] / dxnorm); } for (j = nsing; j < n; ++j) { wa1[j] = 0.; } /* exchange the diagonal of r with sdiag */ cblas_dswap(n, r, ldr+1, sdiag, 1); /* solve lower(r).x = wa1, result id put in wa1 */ cblas_dtrsv(CblasColMajor, CblasLower, CblasNoTrans, CblasNonUnit, nsing, r, ldr, wa1, 1); /* exchange the diagonal of r with sdiag */ cblas_dswap( n, r, ldr+1, sdiag, 1); # else /* !USE_CBLAS */ for (j = 0; j < n; ++j) { l = ipvt[j]-1; wa1[j] = diag[l] * (wa2[l] / dxnorm); } for (j = 0; j < n; ++j) { wa1[j] /= sdiag[j]; temp = wa1[j]; if (n > j+1) { int i; for (i = j+1; i < n; ++i) { wa1[i] -= r[i + j * ldr] * temp; } } } # endif /* !USE_CBLAS */ temp = __cminpack_enorm__(n, wa1); parc = fp / delta / temp / temp; /* depending on the sign of the function, update parl or paru. */ if (fp > 0.) { parl = max(parl,*par); } if (fp < 0.) { paru = min(paru,*par); } /* compute an improved estimate for par. */ /* Computing MAX */ d1 = parl, d2 = *par + parc; *par = max(d1,d2); /* end of an iteration. */ } TERMINATE: /* termination. */ if (iter == 0) { *par = 0.; } /* last card of subroutine lmpar. */ } /* lmpar_ */
// flips the left-right ordering of the columns of a matrix stored in rowmajor format void flipcolslr(double * A, long m, long n) { for(long idx = 0; idx < n/2; idx = idx + 1) cblas_dswap(m, A + idx, n, A + (n - 1 - idx), n); }
void dswap_(int *n, double *X,int *incx, double *Y, int *incy) { Blasx_Debug_Output("Calling dswap_\n "); cblas_dswap(*n,X,*incx,Y,*incy); }
// // Overloaded function for dispatching to // * CBLAS backend, and // * double value-type. // inline void swap( const int n, double* x, const int incx, double* y, const int incy ) { cblas_dswap( n, x, incx, y, incy ); }
void F77_dswap( const int *N, double *X, const int *incX, double *Y, const int *incY) { cblas_dswap(*N,X,*incX,Y,*incY); return; }
void My_dswap(gsl_vector* x, gsl_vector* y) { cblas_dswap(x->size, x->data, x->stride, y->data, y->stride); }
void dlaswp( long n, double a[], long lda, long k1, long k2, long ipiv[], long incx ) { /** * -- LAPACK auxiliary routine (version 1.1) -- * Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., * Courant Institute, Argonne National Lab, and Rice University * October 31, 1992 * * .. Scalar Arguments ..*/ /** .. * .. Array Arguments ..*/ #undef ipiv_1 #define ipiv_1(a1) ipiv[a1-1] #undef a_2 #define a_2(a1,a2) a[a1-1+lda*(a2-1)] /** .. * * Purpose * ======= * * DLASWP performs a series of row interchanges on the matrix A. * One row interchange is initiated for each of rows K1 through K2 of A. * * Arguments * ========= * * N (input) INTEGER * The number of columns of the matrix A. * * A (input/output) DOUBLE PRECISION array, dimension (LDA,N) * On entry, the matrix of column dimension N to which the row * interchanges will be applied. * On exit, the permuted matrix. * * LDA (input) INTEGER * The leading dimension of the array A. * * K1 (input) INTEGER * The first element of IPIV for which a row interchange will * be done. * * K2 (input) INTEGER * The last element of IPIV for which a row interchange will * be done. * * IPIV (input) INTEGER array, dimension (M*abs(INCX)) * The vector of pivot indices. Only the elements in positions * K1 through K2 of IPIV are accessed. * IPIV(K) = L implies rows K and L are to be interchanged. * * INCX (input) INTEGER * The increment between successive values of IPIV. If IPIV * is negative, the pivots are applied in reverse order. * * ===================================================================== * * .. Local Scalars ..*/ long i, ip, ix; /** .. * .. Executable Statements .. * * Interchange row I with row IPIV(I) for each of rows K1 through K2. **/ /*-----implicit-declarations-----*/ /*-----end-of-declarations-----*/ if( incx==0 ) return; if( incx>0 ) { ix = k1; } else { ix = 1 + ( 1-k2 )*incx; } if( incx==1 ) { for (i=k1 ; i<=k2 ; i+=1) { ip = ipiv_1( i ); if( ip!=i ) cblas_dswap( n, &a_2( i, 1 ), lda, &a_2( ip, 1 ), lda ); } } else if( incx>1 ) { for (i=k1 ; i<=k2 ; i+=1) { ip = ipiv_1( ix ); if( ip!=i ) cblas_dswap( n, &a_2( i, 1 ), lda, &a_2( ip, 1 ), lda ); ix = ix + incx; } } else if( incx<0 ) { for (i=k2 ; i>=k1 ; i+=-1) { ip = ipiv_1( ix ); if( ip!=i ) cblas_dswap( n, &a_2( i, 1 ), lda, &a_2( ip, 1 ), lda ); ix = ix + incx; } } return; /** * End of DLASWP **/ }
void dgetf2( long m, long n, double a[], long lda, long ipiv[], long *info ) { /** * -- LAPACK routine (version 1.1) -- * Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., * Courant Institute, Argonne National Lab, and Rice University * June 30, 1992 * * .. Scalar Arguments ..*/ /** .. * .. Array Arguments ..*/ #undef ipiv_1 #define ipiv_1(a1) ipiv[a1-1] #undef a_2 #define a_2(a1,a2) a[a1-1+lda*(a2-1)] /** .. * * Purpose * ======= * * DGETF2 computes an LU factorization of a general m-by-n matrix A * using partial pivoting with row interchanges. * * The factorization has the form * A = P * L * U * where P is a permutation matrix, L is lower triangular with unit * diagonal elements (lower trapezoidal if m > n), and U is upper * triangular (upper trapezoidal if m < n). * * This is the right-looking Level 2 BLAS version of the algorithm. * * Arguments * ========= * * M (input) INTEGER * The number of rows of the matrix A. M >= 0. * * N (input) INTEGER * The number of columns of the matrix A. N >= 0. * * A (input/output) DOUBLE PRECISION array, dimension (LDA,N) * On entry, the m by n matrix to be factored. * On exit, the factors L and U from the factorization * A = P*L*U; the unit diagonal elements of L are not stored. * * LDA (input) INTEGER * The leading dimension of the array A. LDA >= max(1,M). * * IPIV (output) INTEGER array, dimension (min(M,N)) * The pivot indices; for 1 <= i <= min(M,N), row i of the * matrix was interchanged with row IPIV(i). * * INFO (output) INTEGER * = 0: successful exit * < 0: if INFO = -k, the k-th argument had an illegal value * > 0: if INFO = k, U(k,k) is exactly zero. The factorization * has been completed, but the factor U is exactly * singular, and division by zero will occur if it is used * to solve a system of equations. * * ===================================================================== * * .. Parameters ..*/ #undef one #define one 1.0e+0 #undef zero #define zero 0.0e+0 /** .. * .. Local Scalars ..*/ long j, jp; /** .. * .. Intrinsic Functions ..*/ /* intrinsic max, min;*/ /** .. * .. Executable Statements .. * * Test the input parameters. **/ /*-----implicit-declarations-----*/ /*-----end-of-declarations-----*/ *info = 0; if( m<0 ) { *info = -1; } else if( n<0 ) { *info = -2; } else if( lda<max( 1, m ) ) { *info = -4; } if( *info!=0 ) { xerbla( "dgetf2", -*info ); return; } /** * Quick return if possible **/ if( m==0 || n==0 ) return; for (j=1 ; j<=min( m, n ) ; j+=1) { /** * Find pivot and test for singularity. **/ jp = j + cblas_idamax( m-j+1, &a_2( j, j ), 1 ); ipiv_1( j ) = jp; if( a_2( jp, j )!=zero ) { /** * Apply the interchange to columns 1:N. **/ if( jp!=j ) cblas_dswap( n, &a_2( j, 1 ), lda, &a_2( jp, 1 ), lda ); /** * Compute elements J+1:M of J-th column. **/ if( j<m ) cblas_dscal( m-j, one / a_2( j, j ), &a_2( j+1, j ), 1 ); } else if( *info==0 ) { *info = j; } if( j<min( m, n ) ) { /** * Update trailing submatrix. **/ cblas_dger(CblasColMajor, m-j, n-j, -one, &a_2( j+1, j ), 1, &a_2( j, j+1 ), lda, &a_2( j+1, j+1 ), lda ); } } return; /** * End of DGETF2 **/ }
void STARPU_DSWAP(const int n, double *x, const int incx, double *y, const int incy) { cblas_dswap(n, x, incx, y, incy); }