void CBasePlayer::PostThink() { if( g_fGameOver ) goto pt_end; // intermission or finale if( !IsAlive() ) goto pt_end; // Handle Tank controlling if( m_pTank != NULL ) { // if they've moved too far from the gun, or selected a weapon, unuse the gun if( m_pTank->OnControls( this ) && !HasWeaponModelName() ) { m_pTank->Use( this, this, USE_SET, 2 ); // try fire the gun } else { // they've moved off the platform m_pTank->Use( this, this, USE_OFF, 0 ); m_pTank = NULL; } } // do weapon stuff ItemPostFrame(); // check to see if player landed hard enough to make a sound // falling farther than half of the maximum safe distance, but not as far a max safe distance will // play a bootscrape sound, and no damage will be inflicted. Fallling a distance shorter than half // of maximum safe distance will make no sound. Falling farther than max safe distance will play a // fallpain sound, and damage will be inflicted based on how far the player fell if( GetFlags().Any( FL_ONGROUND ) && ( GetHealth() > 0 ) && m_flFallVelocity >= PLAYER_FALL_PUNCH_THRESHHOLD ) { // ALERT ( at_console, "%f\n", m_flFallVelocity ); if( GetWaterType() == CONTENTS_WATER ) { // Did he hit the world or a non-moving entity? 
// BUG - this happens all the time in water, especially when // BUG - water has current force //CBaseEntity* pEntity = GetGroundEntity(); //if ( !pEntity || pEntity->GetAbsVelocity().z == 0 ) // EMIT_SOUND( this, CHAN_BODY, "player/pl_wade1.wav", 1, ATTN_NORM); } else if( m_flFallVelocity > PLAYER_MAX_SAFE_FALL_SPEED ) { // after this point, we start doing damage float flFallDamage = g_pGameRules->FlPlayerFallDamage( this ); if( flFallDamage > GetHealth() ) {//splat // note: play on item channel because we play footstep landing on body channel EMIT_SOUND( this, CHAN_ITEM, "common/bodysplat.wav", 1, ATTN_NORM ); } if( flFallDamage > 0 ) { TakeDamage( CWorld::GetInstance(), CWorld::GetInstance(), flFallDamage, DMG_FALL ); Vector vecPunchAngle = GetPunchAngle(); vecPunchAngle.x = 0; SetPunchAngle( vecPunchAngle ); } } if( IsAlive() ) { SetAnimation( PLAYER_WALK ); } } if( GetFlags().Any( FL_ONGROUND ) ) { if( m_flFallVelocity > 64 && !g_pGameRules->IsMultiplayer() ) { CSoundEnt::InsertSound( bits_SOUND_PLAYER, GetAbsOrigin(), m_flFallVelocity, 0.2 ); // ALERT( at_console, "fall %f\n", m_flFallVelocity ); } m_flFallVelocity = 0; } // select the proper animation for the player character if( IsAlive() ) { if( !GetAbsVelocity().x && !GetAbsVelocity().y ) SetAnimation( PLAYER_IDLE ); else if( ( GetAbsVelocity().x || GetAbsVelocity().y ) && ( GetFlags().Any( FL_ONGROUND ) ) ) SetAnimation( PLAYER_WALK ); else if( GetWaterLevel() > WATERLEVEL_FEET ) SetAnimation( PLAYER_WALK ); } StudioFrameAdvance(); CheckPowerups( this ); UpdatePlayerSound(); pt_end: #if defined( CLIENT_WEAPONS ) // Decay timers on weapons // go through all of the weapons and make a list of the ones to pack for( int i = 0; i < MAX_WEAPON_SLOTS; i++ ) { if( m_rgpPlayerItems[ i ] ) { CBasePlayerWeapon *pPlayerItem = m_rgpPlayerItems[ i ]; while( pPlayerItem ) { if( pPlayerItem->IsPredicted() ) { pPlayerItem->m_flNextPrimaryAttack = max( pPlayerItem->m_flNextPrimaryAttack - gpGlobals->frametime, -1.0f ); 
pPlayerItem->m_flNextSecondaryAttack = max( pPlayerItem->m_flNextSecondaryAttack - gpGlobals->frametime, -0.001f ); if( pPlayerItem->m_flTimeWeaponIdle != 1000 ) { pPlayerItem->m_flTimeWeaponIdle = max( pPlayerItem->m_flTimeWeaponIdle - gpGlobals->frametime, -0.001f ); } if( pPlayerItem->pev->fuser1 != 1000 ) { pPlayerItem->pev->fuser1 = max( pPlayerItem->pev->fuser1 - gpGlobals->frametime, -0.001f ); } pPlayerItem->DecrementTimers( gpGlobals->frametime ); // Only decrement if not flagged as NO_DECREMENT // if ( gun->m_flPumpTime != 1000 ) // { // gun->m_flPumpTime = max( gun->m_flPumpTime - gpGlobals->frametime, -0.001 ); // } } pPlayerItem = pPlayerItem->m_pNext; } } } m_flNextAttack -= gpGlobals->frametime; if( m_flNextAttack < -0.001 ) m_flNextAttack = -0.001; if( m_flNextAmmoBurn != 1000 ) { m_flNextAmmoBurn -= gpGlobals->frametime; if( m_flNextAmmoBurn < -0.001 ) m_flNextAmmoBurn = -0.001; } if( m_flAmmoStartCharge != 1000 ) { m_flAmmoStartCharge -= gpGlobals->frametime; if( m_flAmmoStartCharge < -0.001 ) m_flAmmoStartCharge = -0.001; } #endif // Track button info so we can detect 'pressed' and 'released' buttons next frame m_afButtonLast = GetButtons().Get(); }
// Message handler for the column editor dialog.
//
// WM_INITDIALOG : selects the text mode, checks the decimal radio button,
//                 centers the dialog and applies the themed dialog texture
//                 when the theme hook is available.
// WM_COMMAND    :
//   IDCANCEL            - hides the dialog.
//   IDOK                - inside a single undo action, inserts either the
//                         typed text or an incrementing number. With a
//                         rectangular or multiple selection the insertion goes
//                         through columnReplace(); otherwise it is inserted at
//                         the caret's column on every line from the caret line
//                         to the last line of the document, padding short
//                         lines with spaces.
//   IDC_COL_*_RADIO     - switches between the text and numeric controls.
//   anything else       - reported as handled.
//
// Returns TRUE when the message was handled, FALSE for every other message.
BOOL CALLBACK ColumnEditorDlg::run_dlgProc(UINT message, WPARAM wParam, LPARAM)
{
	switch (message)
	{
		case WM_INITDIALOG :
		{
			switchTo(activeText);
			::SendDlgItemMessage(_hSelf, IDC_COL_DEC_RADIO, BM_SETCHECK, TRUE, 0);
			goToCenter();

			NppParameters *pNppParam = NppParameters::getInstance();
			ETDTProc enableDlgTheme = (ETDTProc)pNppParam->getEnableThemeDlgTexture();
			if (enableDlgTheme)
			{
				enableDlgTheme(_hSelf, ETDT_ENABLETAB);
				redraw();
			}
			return TRUE;
		}

		case WM_COMMAND :
		{
			switch (wParam)
			{
				case IDCANCEL : // Close
					display(false);
					return TRUE;

				case IDOK :
				{
					// Everything inserted below is grouped into one undo step.
					(*_ppEditView)->execute(SCI_BEGINUNDOACTION);

					const int stringSize = 1024;
					TCHAR str[stringSize];

					bool isTextMode = (BST_CHECKED == ::SendDlgItemMessage(_hSelf, IDC_COL_TEXT_RADIO, BM_GETCHECK, 0, 0));

					if (isTextMode)
					{
						::SendDlgItemMessage(_hSelf, IDC_COL_TEXT_EDIT, WM_GETTEXT, stringSize, (LPARAM)str);

						display(false);

						if ((*_ppEditView)->execute(SCI_SELECTIONISRECTANGLE) || (*_ppEditView)->execute(SCI_GETSELECTIONS) > 1)
						{
							// Replace in position order, then restore the
							// selections in their original selection order.
							ColumnModeInfos colInfos = (*_ppEditView)->getColumnModeSelectInfo();
							std::sort(colInfos.begin(), colInfos.end(), SortInPositionOrder());
							(*_ppEditView)->columnReplace(colInfos, str);
							std::sort(colInfos.begin(), colInfos.end(), SortInSelectOrder());
							(*_ppEditView)->setMultiSelections(colInfos);
						}
						else
						{
							int cursorPos = (*_ppEditView)->execute(SCI_GETCURRENTPOS);
							int cursorCol = (*_ppEditView)->execute(SCI_GETCOLUMN, cursorPos);
							int cursorLine = (*_ppEditView)->execute(SCI_LINEFROMPOSITION, cursorPos);
							int endPos = (*_ppEditView)->execute(SCI_GETLENGTH);
							int endLine = (*_ppEditView)->execute(SCI_LINEFROMPOSITION, endPos);

							// Scratch buffer for one line, grown on demand.
							int lineAllocatedLen = 1024;
							TCHAR *line = new TCHAR[lineAllocatedLen];

							for (int i = cursorLine ; i <= endLine ; ++i)
							{
								int lineBegin = (*_ppEditView)->execute(SCI_POSITIONFROMLINE, i);
								int lineEnd = (*_ppEditView)->execute(SCI_GETLINEENDPOSITION, i);

								int lineEndCol = (*_ppEditView)->execute(SCI_GETCOLUMN, lineEnd);
								int lineLen = lineEnd - lineBegin + 1;

								if (lineLen > lineAllocatedLen)
								{
									delete [] line;
									line = new TCHAR[lineLen];
									// Remember the new capacity so later lines that
									// already fit do not force another reallocation.
									lineAllocatedLen = lineLen;
								}
								(*_ppEditView)->getGenericText(line, lineLen, lineBegin, lineEnd);
								generic_string s2r(line);

								if (lineEndCol < cursorCol)
								{
									// Line is shorter than the caret column: pad with
									// spaces, then append.
									generic_string s_space(cursorCol - lineEndCol, ' ');
									s2r.append(s_space);
									s2r.append(str);
								}
								else
								{
									int posAbs2Start = (*_ppEditView)->execute(SCI_FINDCOLUMN, i, cursorCol);
									int posRelative2Start = posAbs2Start - lineBegin;
									s2r.insert(posRelative2Start, str);
								}
								(*_ppEditView)->replaceTarget(s2r.c_str(), lineBegin, lineEnd);
							}
							delete [] line;
						}
					}
					else
					{
						int initialNumber = ::GetDlgItemInt(_hSelf, IDC_COL_INITNUM_EDIT, NULL, TRUE);
						int increaseNumber = ::GetDlgItemInt(_hSelf, IDC_COL_INCREASENUM_EDIT, NULL, TRUE);
						UCHAR format = getFormat();
						display(false);

						if ((*_ppEditView)->execute(SCI_SELECTIONISRECTANGLE) || (*_ppEditView)->execute(SCI_GETSELECTIONS) > 1)
						{
							ColumnModeInfos colInfos = (*_ppEditView)->getColumnModeSelectInfo();
							std::sort(colInfos.begin(), colInfos.end(), SortInPositionOrder());
							(*_ppEditView)->columnReplace(colInfos, initialNumber, increaseNumber, format);
							std::sort(colInfos.begin(), colInfos.end(), SortInSelectOrder());
							(*_ppEditView)->setMultiSelections(colInfos);
						}
						else
						{
							int cursorPos = (*_ppEditView)->execute(SCI_GETCURRENTPOS);
							int cursorCol = (*_ppEditView)->execute(SCI_GETCOLUMN, cursorPos);
							int cursorLine = (*_ppEditView)->execute(SCI_LINEFROMPOSITION, cursorPos);
							int endPos = (*_ppEditView)->execute(SCI_GETLENGTH);
							int endLine = (*_ppEditView)->execute(SCI_LINEFROMPOSITION, endPos);

							// Scratch buffer for one line, grown on demand.
							int lineAllocatedLen = 1024;
							TCHAR *line = new TCHAR[lineAllocatedLen];

							// Decode the numeric base and the zero-leading flag
							// from the format byte.
							UCHAR f = format & MASK_FORMAT;
							bool isZeroLeading = (MASK_ZERO_LEADING & format) != 0;

							int base = 10;
							if (f == BASE_16)
								base = 16;
							else if (f == BASE_08)
								base = 8;
							else if (f == BASE_02)
								base = 2;

							// Digit count is the larger of the counts for the
							// first and the last number that will be written.
							int nbLine = endLine - cursorLine + 1;
							int endNumber = initialNumber + increaseNumber * (nbLine - 1);
							int nbEnd = getNbDigits(endNumber, base);
							int nbInit = getNbDigits(initialNumber, base);
							int nb = max(nbInit, nbEnd);

							for (int i = cursorLine ; i <= endLine ; ++i)
							{
								int lineBegin = (*_ppEditView)->execute(SCI_POSITIONFROMLINE, i);
								int lineEnd = (*_ppEditView)->execute(SCI_GETLINEENDPOSITION, i);

								int lineEndCol = (*_ppEditView)->execute(SCI_GETCOLUMN, lineEnd);
								int lineLen = lineEnd - lineBegin + 1;

								if (lineLen > lineAllocatedLen)
								{
									delete [] line;
									line = new TCHAR[lineLen];
									// Remember the new capacity so later lines that
									// already fit do not force another reallocation.
									lineAllocatedLen = lineLen;
								}
								(*_ppEditView)->getGenericText(line, lineLen, lineBegin, lineEnd);
								generic_string s2r(line);

								// Render the current number into str, then advance
								// to the value for the next line.
								int2str(str, stringSize, initialNumber, base, nb, isZeroLeading);
								initialNumber += increaseNumber;

								if (lineEndCol < cursorCol)
								{
									generic_string s_space(cursorCol - lineEndCol, ' ');
									s2r.append(s_space);
									s2r.append(str);
								}
								else
								{
									int posAbs2Start = (*_ppEditView)->execute(SCI_FINDCOLUMN, i, cursorCol);
									int posRelative2Start = posAbs2Start - lineBegin;
									s2r.insert(posRelative2Start, str);
								}

								(*_ppEditView)->replaceTarget(s2r.c_str(), lineBegin, lineEnd);
							}
							delete [] line;
						}
					}
					(*_ppEditView)->execute(SCI_ENDUNDOACTION);
					(*_ppEditView)->getFocus();
					return TRUE;
				}

				case IDC_COL_TEXT_RADIO :
				case IDC_COL_NUM_RADIO :
				{
					switchTo((wParam == IDC_COL_TEXT_RADIO)? activeText : activeNumeric);
					return TRUE;
				}

				default :
				{
					// wParam carried a notification code in its high word
					// (or an unknown id); all of these are reported as handled.
					switch (HIWORD(wParam))
					{
						case EN_SETFOCUS :
						case BN_SETFOCUS :
							//updateLinesNumbers();
							return TRUE;
						default :
							return TRUE;
					}
					break;
				}
			}
		}

		default :
			return FALSE;
	}
	//return FALSE;
}
/*  ZPTTRS (LAPACK routine, version 3.2, November 2006)                      */
/*                                                                           */
/*  Solves the tridiagonal system  A * X = B  using the factorization        */
/*  A = U'*D*U or A = L*D*L' computed by ZPTTRF.                             */
/*                                                                           */
/*  UPLO  (input) CHARACTER*1                                                */
/*        'U': A = U'*D*U, E is the superdiagonal of U                       */
/*        'L': A = L*D*L', E is the subdiagonal of L                         */
/*  N     (input) INTEGER   order of A, N >= 0                               */
/*  NRHS  (input) INTEGER   number of columns of B, NRHS >= 0                */
/*  D     (input) DOUBLE PRECISION array, dimension (N)                      */
/*        diagonal of D from the factorization                               */
/*  E     (input) COMPLEX*16 array, dimension (N-1)                          */
/*        off-diagonal of the unit bidiagonal factor U or L                  */
/*  B     (input/output) COMPLEX*16 array, dimension (LDB,NRHS)              */
/*        right hand sides on entry, solution vectors X on exit              */
/*  LDB   (input) INTEGER   leading dimension of B, LDB >= max(1,N)          */
/*  INFO  (output) INTEGER                                                   */
/*        = 0: successful exit                                               */
/*        < 0: if INFO = -k, the k-th argument had an illegal value          */
/*                                                                           */
/*  The function itself always returns 0.                                    */
/* Subroutine */ int zpttrs_(char *uplo, integer *n, integer *nrhs,
	doublereal *d__, doublecomplex *e, doublecomplex *b, integer *ldb,
	integer *info)
{
    /* External routines */
    extern /* Subroutine */ int zptts2_(integer *, integer *, integer *,
	    doublereal *, doublecomplex *, doublecomplex *, integer *),
	    xerbla_(char *, integer *);
    extern integer ilaenv_(integer *, char *, char *, integer *, integer *,
	    integer *, integer *);

    /* Locals */
    unsigned char form;
    logical is_upper, is_lower;
    integer col_stride, bad_arg, tuned, block_cols, form_code;
    integer col, last_col, left, chunk;

    /* Column stride of B, captured once like the generated code did. */
    col_stride = *ldb;

    /* Test the input arguments. */
    *info = 0;
    form = *(unsigned char *)uplo;
    is_upper = form == 'U' || form == 'u';
    is_lower = form == 'L' || form == 'l';

    if (! is_upper && ! is_lower) {
	*info = -1;
    } else if (*n < 0) {
	*info = -2;
    } else if (*nrhs < 0) {
	*info = -3;
    } else if (*ldb < 1 || *ldb < *n) {
	*info = -7;
    }
    if (*info != 0) {
	bad_arg = -(*info);
	xerbla_("ZPTTRS", &bad_arg);
	return 0;
    }

    /* Quick return if possible */
    if (*n == 0 || *nrhs == 0) {
	return 0;
    }

    /* Determine the number of right-hand sides to solve at a time. */
    if (*nrhs == 1) {
	block_cols = 1;
    } else {
	tuned = ilaenv_(&c__1, "ZPTTRS", uplo, n, nrhs, &c_n1, &c_n1);
	block_cols = 1 >= tuned ? 1 : tuned;
    }

    /* Decode UPLO */
    form_code = is_upper ? 1 : 0;

    if (block_cols >= *nrhs) {
	/* Everything fits in one block. */
	zptts2_(&form_code, n, nrhs, d__, e, b, ldb);
    } else {
	/* Solve block_cols columns at a time; the last block may be short. */
	/* block_cols is at least 1, so col always advances.                */
	last_col = *nrhs;
	for (col = 1; col <= last_col; col += block_cols) {
	    left = *nrhs - col + 1;
	    chunk = left <= block_cols ? left : block_cols;
	    zptts2_(&form_code, n, &chunk, d__, e,
		    b + (col - 1) * col_stride, ldb);
	}
    }

    return 0;

/*     End of ZPTTRS */

} /* zpttrs_ */
/* CHBMV: y := alpha*A*x + beta*y for an n by n hermitian band matrix A     */
/* with k super-diagonals, stored in band form in the array a.              */
/* Invalid arguments are reported through xerbla_ with the 1-based argument */
/* position; the function itself always returns 0. uplo_len is not          */
/* referenced in the body.                                                  */
/* Subroutine */ int chbmv_(char *uplo, integer *n, integer *k, complex *
	alpha, complex *a, integer *lda, complex *x, integer *incx, complex *
	beta, complex *y, integer *incy, ftnlen uplo_len)
{
    /* System generated locals */
    integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5;
    real r__1;
    complex q__1, q__2, q__3, q__4;

    /* Builtin functions */
    void r_cnjg(complex *, complex *);

    /* Local variables */
    integer i__, j, l, ix, iy, jx, jy, kx, ky, info;
    complex temp1, temp2;
    extern logical lsame_(char *, char *, ftnlen, ftnlen);
    integer kplus1;
    extern /* Subroutine */ int xerbla_(char *, integer *, ftnlen);

/*     .. Scalar Arguments .. */
/*     .. */
/*     .. Array Arguments .. */
/*     .. */

/*  Purpose */
/*  ======= */

/*  CHBMV  performs the matrix-vector  operation */

/*     y := alpha*A*x + beta*y, */

/*  where alpha and beta are scalars, x and y are n element vectors and */
/*  A is an n by n hermitian band matrix, with k super-diagonals. */

/*  Arguments */
/*  ========== */

/*  UPLO   - CHARACTER*1. */
/*           On entry, UPLO specifies whether the upper or lower */
/*           triangular part of the band matrix A is being supplied as */
/*           follows: */

/*              UPLO = 'U' or 'u'   The upper triangular part of A is */
/*                                  being supplied. */

/*              UPLO = 'L' or 'l'   The lower triangular part of A is */
/*                                  being supplied. */

/*           Unchanged on exit. */

/*  N      - INTEGER. */
/*           On entry, N specifies the order of the matrix A. */
/*           N must be at least zero. */
/*           Unchanged on exit. */

/*  K      - INTEGER. */
/*           On entry, K specifies the number of super-diagonals of the */
/*           matrix A. K must satisfy  0 .le. K. */
/*           Unchanged on exit. */

/*  ALPHA  - COMPLEX         . */
/*           On entry, ALPHA specifies the scalar alpha. */
/*           Unchanged on exit. */

/*  A      - COMPLEX          array of DIMENSION ( LDA, n ). */
/*           Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) */
/*           by n part of the array A must contain the upper triangular */
/*           band part of the hermitian matrix, supplied column by */
/*           column, with the leading diagonal of the matrix in row */
/*           ( k + 1 ) of the array, the first super-diagonal starting at */
/*           position 2 in row k, and so on. The top left k by k triangle */
/*           of the array A is not referenced. */
/*           The following program segment will transfer the upper */
/*           triangular part of a hermitian band matrix from conventional */
/*           full matrix storage to band storage: */

/*                 DO 20, J = 1, N */
/*                    M = K + 1 - J */
/*                    DO 10, I = MAX( 1, J - K ), J */
/*                       A( M + I, J ) = matrix( I, J ) */
/*              10    CONTINUE */
/*              20 CONTINUE */

/*           Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) */
/*           by n part of the array A must contain the lower triangular */
/*           band part of the hermitian matrix, supplied column by */
/*           column, with the leading diagonal of the matrix in row 1 of */
/*           the array, the first sub-diagonal starting at position 1 in */
/*           row 2, and so on. The bottom right k by k triangle of the */
/*           array A is not referenced. */
/*           The following program segment will transfer the lower */
/*           triangular part of a hermitian band matrix from conventional */
/*           full matrix storage to band storage: */

/*                 DO 20, J = 1, N */
/*                    M = 1 - J */
/*                    DO 10, I = J, MIN( N, J + K ) */
/*                       A( M + I, J ) = matrix( I, J ) */
/*              10    CONTINUE */
/*              20 CONTINUE */

/*           Note that the imaginary parts of the diagonal elements need */
/*           not be set and are assumed to be zero. */
/*           Unchanged on exit. */

/*  LDA    - INTEGER. */
/*           On entry, LDA specifies the first dimension of A as declared */
/*           in the calling (sub) program. LDA must be at least */
/*           ( k + 1 ). */
/*           Unchanged on exit. */

/*  X      - COMPLEX          array of DIMENSION at least */
/*           ( 1 + ( n - 1 )*abs( INCX ) ). */
/*           Before entry, the incremented array X must contain the */
/*           vector x. */
/*           Unchanged on exit. */

/*  INCX   - INTEGER. */
/*           On entry, INCX specifies the increment for the elements of */
/*           X. INCX must not be zero. */
/*           Unchanged on exit. */

/*  BETA   - COMPLEX         . */
/*           On entry, BETA specifies the scalar beta. */
/*           Unchanged on exit. */

/*  Y      - COMPLEX          array of DIMENSION at least */
/*           ( 1 + ( n - 1 )*abs( INCY ) ). */
/*           Before entry, the incremented array Y must contain the */
/*           vector y. On exit, Y is overwritten by the updated vector y. */

/*  INCY   - INTEGER. */
/*           On entry, INCY specifies the increment for the elements of */
/*           Y. INCY must not be zero. */
/*           Unchanged on exit. */

/*  Further Details */
/*  =============== */

/*  Level 2 Blas routine. */

/*  -- Written on 22-October-1986. */
/*     Jack Dongarra, Argonne National Lab. */
/*     Jeremy Du Croz, Nag Central Office. */
/*     Sven Hammarling, Nag Central Office. */
/*     Richard Hanson, Sandia National Labs. */

/*  ===================================================================== */

/*     .. Parameters .. */
/*     .. */
/*     .. Local Scalars .. */
/*     .. */
/*     .. External Functions .. */
/*     .. */
/*     .. External Subroutines .. */
/*     .. */
/*     .. Intrinsic Functions .. */
/*     .. */

/*     Test the input parameters. */

    /* Parameter adjustments */
    /* Shift the array pointers so the 1-based indices used below */
    /* (a[i + j*a_dim1], x[i], y[i]) address the caller's storage. */
    a_dim1 = *lda;
    a_offset = 1 + a_dim1;
    a -= a_offset;
    --x;
    --y;

    /* Function Body */
    /* info holds the 1-based position of the first invalid argument. */
    info = 0;
    if (! lsame_(uplo, "U", (ftnlen)1, (ftnlen)1) && ! lsame_(uplo, "L", (
	    ftnlen)1, (ftnlen)1)) {
	info = 1;
    } else if (*n < 0) {
	info = 2;
    } else if (*k < 0) {
	info = 3;
    } else if (*lda < *k + 1) {
	info = 6;
    } else if (*incx == 0) {
	info = 8;
    } else if (*incy == 0) {
	info = 11;
    }
    if (info != 0) {
	xerbla_("CHBMV ", &info, (ftnlen)6);
	return 0;
    }

/*     Quick return if possible. */
/*     (n == 0, or alpha == 0 and beta == 1: y would be left unchanged.) */

    if (*n == 0 || (alpha->r == 0.f && alpha->i == 0.f && (beta->r == 1.f &&
	    beta->i == 0.f))) {
	return 0;
    }

/*     Set up the start points in  X  and  Y. */
/*     For a negative increment the start index is 1 - (n-1)*inc, so the */
/*     vector is walked from its far end. */

    if (*incx > 0) {
	kx = 1;
    } else {
	kx = 1 - (*n - 1) * *incx;
    }
    if (*incy > 0) {
	ky = 1;
    } else {
	ky = 1 - (*n - 1) * *incy;
    }

/*     Start the operations. In this version the elements of the array A */
/*     are accessed sequentially with one pass through A. */

/*     First form  y := beta*y. */
/*     beta == 0 stores zeros without reading y; otherwise each element */
/*     is replaced by the complex product beta*y, written out in real */
/*     and imaginary parts. */

    if (beta->r != 1.f || beta->i != 0.f) {
	if (*incy == 1) {
	    if (beta->r == 0.f && beta->i == 0.f) {
		i__1 = *n;
		for (i__ = 1; i__ <= i__1; ++i__) {
		    i__2 = i__;
		    y[i__2].r = 0.f, y[i__2].i = 0.f;
/* L10: */
		}
	    } else {
		i__1 = *n;
		for (i__ = 1; i__ <= i__1; ++i__) {
		    i__2 = i__;
		    i__3 = i__;
		    q__1.r = beta->r * y[i__3].r - beta->i * y[i__3].i,
			    q__1.i = beta->r * y[i__3].i + beta->i * y[i__3]
			    .r;
		    y[i__2].r = q__1.r, y[i__2].i = q__1.i;
/* L20: */
		}
	    }
	} else {
	    iy = ky;
	    if (beta->r == 0.f && beta->i == 0.f) {
		i__1 = *n;
		for (i__ = 1; i__ <= i__1; ++i__) {
		    i__2 = iy;
		    y[i__2].r = 0.f, y[i__2].i = 0.f;
		    iy += *incy;
/* L30: */
		}
	    } else {
		i__1 = *n;
		for (i__ = 1; i__ <= i__1; ++i__) {
		    i__2 = iy;
		    i__3 = iy;
		    q__1.r = beta->r * y[i__3].r - beta->i * y[i__3].i,
			    q__1.i = beta->r * y[i__3].i + beta->i * y[i__3]
			    .r;
		    y[i__2].r = q__1.r, y[i__2].i = q__1.i;
		    iy += *incy;
/* L40: */
		}
	    }
	}
    }
    /* With alpha == 0 only the scaling by beta above was required. */
    if (alpha->r == 0.f && alpha->i == 0.f) {
	return 0;
    }
    if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) {

/*        Form  y  when upper triangle of A is stored. */
/*        Per column j: temp1 = alpha*x(j); temp2 accumulates */
/*        conjg(A(i,j))*x(i) over the stored off-diagonal rows i < j. */

	kplus1 = *k + 1;
	if (*incx == 1 && *incy == 1) {
	    i__1 = *n;
	    for (j = 1; j <= i__1; ++j) {
		i__2 = j;
		q__1.r = alpha->r * x[i__2].r - alpha->i * x[i__2].i, q__1.i =
			 alpha->r * x[i__2].i + alpha->i * x[i__2].r;
		temp1.r = q__1.r, temp1.i = q__1.i;
		temp2.r = 0.f, temp2.i = 0.f;
		/* l + i is the band-storage row holding element (i, j). */
		l = kplus1 - j;
/* Computing MAX */
		i__2 = 1, i__3 = j - *k;
		i__4 = j - 1;
		for (i__ = max(i__2,i__3); i__ <= i__4; ++i__) {
		    i__2 = i__;
		    i__3 = i__;
		    i__5 = l + i__ + j * a_dim1;
		    q__2.r = temp1.r * a[i__5].r - temp1.i * a[i__5].i,
			    q__2.i = temp1.r * a[i__5].i + temp1.i * a[i__5]
			    .r;
		    q__1.r = y[i__3].r + q__2.r, q__1.i = y[i__3].i + q__2.i;
		    y[i__2].r = q__1.r, y[i__2].i = q__1.i;
		    r_cnjg(&q__3, &a[l + i__ + j * a_dim1]);
		    i__2 = i__;
		    q__2.r = q__3.r * x[i__2].r - q__3.i * x[i__2].i, q__2.i =
			     q__3.r * x[i__2].i + q__3.i * x[i__2].r;
		    q__1.r = temp2.r + q__2.r, q__1.i = temp2.i + q__2.i;
		    temp2.r = q__1.r, temp2.i = q__1.i;
/* L50: */
		}
		/* Diagonal term: only the real part of A(kplus1, j) is used. */
		i__4 = j;
		i__2 = j;
		i__3 = kplus1 + j * a_dim1;
		r__1 = a[i__3].r;
		q__3.r = r__1 * temp1.r, q__3.i = r__1 * temp1.i;
		q__2.r = y[i__2].r + q__3.r, q__2.i = y[i__2].i + q__3.i;
		q__4.r = alpha->r * temp2.r - alpha->i * temp2.i, q__4.i =
			alpha->r * temp2.i + alpha->i * temp2.r;
		q__1.r = q__2.r + q__4.r, q__1.i = q__2.i + q__4.i;
		y[i__4].r = q__1.r, y[i__4].i = q__1.i;
/* L60: */
	    }
	} else {
	    jx = kx;
	    jy = ky;
	    i__1 = *n;
	    for (j = 1; j <= i__1; ++j) {
		i__4 = jx;
		q__1.r = alpha->r * x[i__4].r - alpha->i * x[i__4].i, q__1.i =
			 alpha->r * x[i__4].i + alpha->i * x[i__4].r;
		temp1.r = q__1.r, temp1.i = q__1.i;
		temp2.r = 0.f, temp2.i = 0.f;
		ix = kx;
		iy = ky;
		l = kplus1 - j;
/* Computing MAX */
		i__4 = 1, i__2 = j - *k;
		i__3 = j - 1;
		for (i__ = max(i__4,i__2); i__ <= i__3; ++i__) {
		    i__4 = iy;
		    i__2 = iy;
		    i__5 = l + i__ + j * a_dim1;
		    q__2.r = temp1.r * a[i__5].r - temp1.i * a[i__5].i,
			    q__2.i = temp1.r * a[i__5].i + temp1.i * a[i__5]
			    .r;
		    q__1.r = y[i__2].r + q__2.r, q__1.i = y[i__2].i + q__2.i;
		    y[i__4].r = q__1.r, y[i__4].i = q__1.i;
		    r_cnjg(&q__3, &a[l + i__ + j * a_dim1]);
		    i__4 = ix;
		    q__2.r = q__3.r * x[i__4].r - q__3.i * x[i__4].i, q__2.i =
			     q__3.r * x[i__4].i + q__3.i * x[i__4].r;
		    q__1.r = temp2.r + q__2.r, q__1.i = temp2.i + q__2.i;
		    temp2.r = q__1.r, temp2.i = q__1.i;
		    ix += *incx;
		    iy += *incy;
/* L70: */
		}
		i__3 = jy;
		i__4 = jy;
		i__2 = kplus1 + j * a_dim1;
		r__1 = a[i__2].r;
		q__3.r = r__1 * temp1.r, q__3.i = r__1 * temp1.i;
		q__2.r = y[i__4].r + q__3.r, q__2.i = y[i__4].i + q__3.i;
		q__4.r = alpha->r * temp2.r - alpha->i * temp2.i, q__4.i =
			alpha->r * temp2.i + alpha->i * temp2.r;
		q__1.r = q__2.r + q__4.r, q__1.i = q__2.i + q__4.i;
		y[i__3].r = q__1.r, y[i__3].i = q__1.i;
		jx += *incx;
		jy += *incy;
		/* Once j exceeds k the first stored row moves down by one */
		/* per column, so the inner-loop start positions advance.  */
		if (j > *k) {
		    kx += *incx;
		    ky += *incy;
		}
/* L80: */
	    }
	}
    } else {

/*        Form  y  when lower triangle of A is stored. */
/*        The diagonal (band row 1) is applied first, then the stored */
/*        sub-diagonal rows i = j+1 .. min(n, j+k). */

	if (*incx == 1 && *incy == 1) {
	    i__1 = *n;
	    for (j = 1; j <= i__1; ++j) {
		i__3 = j;
		q__1.r = alpha->r * x[i__3].r - alpha->i * x[i__3].i, q__1.i =
			 alpha->r * x[i__3].i + alpha->i * x[i__3].r;
		temp1.r = q__1.r, temp1.i = q__1.i;
		temp2.r = 0.f, temp2.i = 0.f;
		i__3 = j;
		i__4 = j;
		i__2 = j * a_dim1 + 1;
		r__1 = a[i__2].r;
		q__2.r = r__1 * temp1.r, q__2.i = r__1 * temp1.i;
		q__1.r = y[i__4].r + q__2.r, q__1.i = y[i__4].i + q__2.i;
		y[i__3].r = q__1.r, y[i__3].i = q__1.i;
		l = 1 - j;
/* Computing MIN */
		i__4 = *n, i__2 = j + *k;
		i__3 = min(i__4,i__2);
		for (i__ = j + 1; i__ <= i__3; ++i__) {
		    i__4 = i__;
		    i__2 = i__;
		    i__5 = l + i__ + j * a_dim1;
		    q__2.r = temp1.r * a[i__5].r - temp1.i * a[i__5].i,
			    q__2.i = temp1.r * a[i__5].i + temp1.i * a[i__5]
			    .r;
		    q__1.r = y[i__2].r + q__2.r, q__1.i = y[i__2].i + q__2.i;
		    y[i__4].r = q__1.r, y[i__4].i = q__1.i;
		    r_cnjg(&q__3, &a[l + i__ + j * a_dim1]);
		    i__4 = i__;
		    q__2.r = q__3.r * x[i__4].r - q__3.i * x[i__4].i, q__2.i =
			     q__3.r * x[i__4].i + q__3.i * x[i__4].r;
		    q__1.r = temp2.r + q__2.r, q__1.i = temp2.i + q__2.i;
		    temp2.r = q__1.r, temp2.i = q__1.i;
/* L90: */
		}
		i__3 = j;
		i__4 = j;
		q__2.r = alpha->r * temp2.r - alpha->i * temp2.i, q__2.i =
			alpha->r * temp2.i + alpha->i * temp2.r;
		q__1.r = y[i__4].r + q__2.r, q__1.i = y[i__4].i + q__2.i;
		y[i__3].r = q__1.r, y[i__3].i = q__1.i;
/* L100: */
	    }
	} else {
	    jx = kx;
	    jy = ky;
	    i__1 = *n;
	    for (j = 1; j <= i__1; ++j) {
		i__3 = jx;
		q__1.r = alpha->r * x[i__3].r - alpha->i * x[i__3].i, q__1.i =
			 alpha->r * x[i__3].i + alpha->i * x[i__3].r;
		temp1.r = q__1.r, temp1.i = q__1.i;
		temp2.r = 0.f, temp2.i = 0.f;
		i__3 = jy;
		i__4 = jy;
		i__2 = j * a_dim1 + 1;
		r__1 = a[i__2].r;
		q__2.r = r__1 * temp1.r, q__2.i = r__1 * temp1.i;
		q__1.r = y[i__4].r + q__2.r, q__1.i = y[i__4].i + q__2.i;
		y[i__3].r = q__1.r, y[i__3].i = q__1.i;
		l = 1 - j;
		/* The inner walk starts one step past element j. */
		ix = jx;
		iy = jy;
/* Computing MIN */
		i__4 = *n, i__2 = j + *k;
		i__3 = min(i__4,i__2);
		for (i__ = j + 1; i__ <= i__3; ++i__) {
		    ix += *incx;
		    iy += *incy;
		    i__4 = iy;
		    i__2 = iy;
		    i__5 = l + i__ + j * a_dim1;
		    q__2.r = temp1.r * a[i__5].r - temp1.i * a[i__5].i,
			    q__2.i = temp1.r * a[i__5].i + temp1.i * a[i__5]
			    .r;
		    q__1.r = y[i__2].r + q__2.r, q__1.i = y[i__2].i + q__2.i;
		    y[i__4].r = q__1.r, y[i__4].i = q__1.i;
		    r_cnjg(&q__3, &a[l + i__ + j * a_dim1]);
		    i__4 = ix;
		    q__2.r = q__3.r * x[i__4].r - q__3.i * x[i__4].i, q__2.i =
			     q__3.r * x[i__4].i + q__3.i * x[i__4].r;
		    q__1.r = temp2.r + q__2.r, q__1.i = temp2.i + q__2.i;
		    temp2.r = q__1.r, temp2.i = q__1.i;
/* L110: */
		}
		i__3 = jy;
		i__4 = jy;
		q__2.r = alpha->r * temp2.r - alpha->i * temp2.i, q__2.i =
			alpha->r * temp2.i + alpha->i * temp2.r;
		q__1.r = y[i__4].r + q__2.r, q__1.i = y[i__4].i + q__2.i;
		y[i__3].r = q__1.r, y[i__3].i = q__1.i;
		jx += *incx;
		jy += *incy;
/* L120: */
	    }
	}
    }

    return 0;

/*     End of CHBMV . */

} /* chbmv_ */
doublereal dlange_(char *norm, integer * m, integer * n, doublereal * a,
	integer * lda, doublereal * work)
{
/*  -- LAPACK auxiliary routine (version 3.1) --
       Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd..
       November 2006

    Purpose
    =======

    DLANGE  returns the value of the one norm,  or the Frobenius norm, or
    the  infinity norm,  or the  element of  largest absolute value  of a
    real matrix A.

    Description
    ===========

    DLANGE returns the value

       DLANGE = ( max(abs(A(i,j))), NORM = 'M' or 'm'
                (
                ( norm1(A),         NORM = '1', 'O' or 'o'
                (
                ( normI(A),         NORM = 'I' or 'i'
                (
                ( normF(A),         NORM = 'F', 'f', 'E' or 'e'

    where  norm1  denotes the  one norm of a matrix (maximum column sum),
    normI  denotes the  infinity norm  of a matrix  (maximum row sum) and
    normF  denotes the  Frobenius norm of a matrix (square root of sum of
    squares).  Note that  max(abs(A(i,j)))  is not a consistent matrix norm.

    Arguments
    =========

    NORM    (input) CHARACTER*1
            Specifies the value to be returned in DLANGE as described
            above.

    M       (input) INTEGER
            The number of rows of the matrix A.  M >= 0.  When M = 0,
            DLANGE is set to zero.

    N       (input) INTEGER
            The number of columns of the matrix A.  N >= 0.  When N = 0,
            DLANGE is set to zero.

    A       (input) DOUBLE PRECISION array, dimension (LDA,N)
            The m by n matrix A.

    LDA     (input) INTEGER
            The leading dimension of the array A.  LDA >= max(M,1).

    WORK    (workspace) DOUBLE PRECISION array, dimension (MAX(1,LWORK)),
            where LWORK >= M when NORM = 'I'; otherwise, WORK is not
            referenced.

    Implementation notes
    ====================

    All locals are automatic, so the routine keeps no state between
    calls. An unrecognized NORM returns 0 (value is initialized to 0
    and no branch overwrites it).

   =====================================================================
*/
    /* Unit increment passed by address to dlassq_. */
    integer c__1 = 1;

    /* System generated locals */
    integer a_dim1, a_offset, i__1, i__2;
    doublereal ret_val, d__1, d__2, d__3;

    /* Builtin functions */
    double sqrt(doublereal);

    /* Local variables */
    integer i__, j;
    doublereal sum, scale;
    extern logical lsame_(char *, char *);
    doublereal value = 0.;
    extern /* Subroutine */ int dlassq_(integer *, doublereal *, integer *,
	    doublereal *, doublereal *);

    /* Parameter adjustments: allow 1-based a[i + j*a_dim1] and work[i]. */
    a_dim1 = *lda;
    a_offset = 1 + a_dim1;
    a -= a_offset;
    --work;

    /* Function Body */
    if (min(*m, *n) == 0) {
	value = 0.;
    } else if (lsame_(norm, "M")) {

/*        Find max(abs(A(i,j))). */

	value = 0.;
	i__1 = *n;
	for (j = 1; j <= i__1; ++j) {
	    i__2 = *m;
	    for (i__ = 1; i__ <= i__2; ++i__) {
/* Computing MAX */
		d__2 = value, d__3 = (d__1 = a[i__ + j * a_dim1], abs(d__1));
		value = max(d__2, d__3);
/* L10: */
	    }
/* L20: */
	}
    } else if (lsame_(norm, "O") || *(unsigned char *) norm == '1') {

/*        Find norm1(A): largest column sum of absolute values. */

	value = 0.;
	i__1 = *n;
	for (j = 1; j <= i__1; ++j) {
	    sum = 0.;
	    i__2 = *m;
	    for (i__ = 1; i__ <= i__2; ++i__) {
		sum += (d__1 = a[i__ + j * a_dim1], abs(d__1));
/* L30: */
	    }
	    value = max(value, sum);
/* L40: */
	}
    } else if (lsame_(norm, "I")) {

/*        Find normI(A): row sums are accumulated in work(1..m). */

	i__1 = *m;
	for (i__ = 1; i__ <= i__1; ++i__) {
	    work[i__] = 0.;
/* L50: */
	}
	i__1 = *n;
	for (j = 1; j <= i__1; ++j) {
	    i__2 = *m;
	    for (i__ = 1; i__ <= i__2; ++i__) {
		work[i__] += (d__1 = a[i__ + j * a_dim1], abs(d__1));
/* L60: */
	    }
/* L70: */
	}
	value = 0.;
	i__1 = *m;
	for (i__ = 1; i__ <= i__1; ++i__) {
/* Computing MAX */
	    d__1 = value, d__2 = work[i__];
	    value = max(d__1, d__2);
/* L80: */
	}
    } else if (lsame_(norm, "F") || lsame_(norm, "E")) {

/*        Find normF(A): dlassq_ updates scale and sum column by column; */
/*        the result is scale * sqrt(sum).                               */

	scale = 0.;
	sum = 1.;
	i__1 = *n;
	for (j = 1; j <= i__1; ++j) {
	    dlassq_(m, &a[j * a_dim1 + 1], &c__1, &scale, &sum);
/* L90: */
	}
	value = scale * sqrt(sum);
    }

    ret_val = value;
    return ret_val;

/*     End of DLANGE */

} /* dlange_ */
void str_cli(FILE * fd, int sockfd) { int maxfdp1, val, stdineof; ssize_t n, nwritten; fd_set rset, wset; char to[MAXLINE], fr[MAXLINE]; char *toiptr, *tooptr, *friptr, *froptr; val = Fcntl(sockfd, F_GETFL, 0); Fcntl(sockfd, F_SETFL, val | O_NONBLOCK); val = Fcntl(STDIN_FILENO, F_GETFL, 0); Fcntl(STDIN_FILENO, F_SETFL, val | O_NONBLOCK); val = Fcntl(STDOUT_FILENO, F_GETFL, 0); Fcntl(STDOUT_FILENO, F_SETFL, val | O_NONBLOCK); toiptr = tooptr = &to[MAXLINE]; friptr = froptr = &fr[MAXLINE]; stdineof = 0; maxfdp1 = max(max(STDIN_FILENO, STDOUT_FILENO), sockfd) + 1; for(;;) { FD_ZERO(&rset); FD_ZERO(&wset); if(stdineof == 0 && toiptr < &to[MAXLINE]) FD_SET(STDIN_FILENO, &rset); if( friptr < &fr[MAXLINE] ) FD_SET(STDOUT_FILENO, &rset); if(toiptr != tooptr) FD_SET(sockfd, &wset); if(friptr != froptr) FD_SET(STDOUT_FILENO, &wset); Select(maxfdp1, &rset, &wset, NULL, NULL); if (FD_ISSET(STDIN_FILENO, &rset)) { if( (n = read(STDIN_FILENO, toiptr, &to[MAXLINE] - toiptr)) < 0) { if( errno != EWOULDBLOCK ) err_sys("read error on stdin"); } else if(n == 0) { fprintf(stderr, "%s: EOF on stdin\n", gf_time()); stdineof = 1; if (toiptr == tooptr) Shutdown(sockfd, SHUT_WR); } else { fprintf(stderr, "%s: read %d bytes from stdin\n", gf_time(), n); toiptr += n; FD_SET(sockfd, &wset); } } if(FD_ISSET(sockfd, &rset)) { if( (n = read(STDOUT_FILENO, friptr, &fr[MAXLINE] - friptr)) < 0) { if( errno != EWOULDBLOCK ) err_sys("read error on socket"); } else if(n == 0) { fprintf(stderr, "%s: EOF on stdin\n", gf_time()); if(stdineof) return; /* normal termination */ else err_quit("str_cli server terminated prematurely"); } else { fprintf(stderr, "%s: read %d bytes from socket\n", gf_time(), n); friptr += n; FD_SET(STDOUT_FILENO, &wset); } } if(FD_ISSET(STDOUT_FILENO, &wset) && ( n = (friptr - froptr)) > 0) { if( (nwritten= write(STDOUT_FILENO, froptr, n)) < 0) { if(errno != EWOULDBLOCK) err_sys("write error to stdout"); } else { fprintf(stderr, "%s: wrote %d bytes to stdout\n", gf_time(), 
nwritten); froptr += n; if(froptr == friptr) froptr = friptr = fr; } } if(FD_ISSET(sockfd, &wset) && ( n = (toiptr - tooptr)) > 0) { if( (nwritten= write(STDOUT_FILENO, froptr, n)) < 0) { if(errno != EWOULDBLOCK) err_sys("write error to socket"); } else { fprintf(stderr, "%s: wrote %d bytes to socket\n", gf_time(), nwritten); tooptr += n; if(tooptr == toiptr) { tooptr = toiptr = to; if(stdineof) Shutdown(sockfd, SHUT_WR); } } } } }
/**
    Purpose
    -------
    DPOTRF computes the Cholesky factorization of a real symmetric
    positive definite matrix dA.

    The factorization has the form
        dA = U**H * U,   if UPLO = MagmaUpper, or
        dA = L  * L**H,  if UPLO = MagmaLower,
    where U is an upper triangular matrix and L is lower triangular.

    This is the block version of the algorithm, calling Level 3 BLAS.

    Arguments
    ---------
    @param[in]
    ngpu    INTEGER
            Number of GPUs to use. ngpu > 0.

    @param[in]
    uplo    magma_uplo_t
      -     = MagmaUpper:  Upper triangle of dA is stored;
      -     = MagmaLower:  Lower triangle of dA is stored.

    @param[in]
    n       INTEGER
            The order of the matrix dA.  N >= 0.

    @param[in,out]
    d_lA    DOUBLE PRECISION array of pointers on the GPU, dimension (ngpu)
            On entry, the symmetric matrix dA distributed over GPUs
            (d_lA[d] points to the local matrix on the d-th GPU).
            It is distributed in 1D block column or row cyclic (with the
            block size of nb) if UPLO = MagmaUpper or MagmaLower, respectively.
            If UPLO = MagmaUpper, the leading N-by-N upper triangular part
            of dA contains the upper triangular part of the matrix dA, and
            the strictly lower triangular part of dA is not referenced.
            If UPLO = MagmaLower, the leading N-by-N lower triangular part
            of dA contains the lower triangular part of the matrix dA, and
            the strictly upper triangular part of dA is not referenced.
    \n
            On exit, if INFO = 0, the factor U or L from the Cholesky
            factorization dA = U**H * U or dA = L * L**H.

    @param[in]
    ldda    INTEGER
            The leading dimension of the array dA.  LDDA >= max(1,N).
            To benefit from coalescent memory accesses LDDA must be
            divisible by 16.

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value
      -     > 0:  if INFO = i, the leading minor of order i is not
                  positive definite, and the factorization could not be
                  completed.

@ingroup magma_dposv_comp ********************************************************************/ extern "C" magma_int_t magma_dpotrf_mgpu_right( magma_int_t ngpu, magma_uplo_t uplo, magma_int_t n, magmaDouble_ptr d_lA[], magma_int_t ldda, magma_int_t *info ) { #define dlA(id, i, j) (d_lA[(id)] + (j) * ldda + (i)) #define dlP(id, i, j) (d_lP[(id)] + (j) * ldda + (i)) #define panel(j) (panel + (j)) #define tmppanel(j) (tmppanel + (j)) #define tmpprevpanel(j) (tmpprevpanel + (j)) #define STREAM_ID(i) (nqueue > 1 ? 1+((i)/nb)%(nqueue-1) : 0) double c_one = MAGMA_D_ONE; double c_neg_one = MAGMA_D_NEG_ONE; double d_one = 1.0; double d_neg_one = -1.0; const char* uplo_ = lapack_uplo_const( uplo ); magma_int_t j, nb, d, id, j_local, blkid, crosspoint, prevtrsmrows=0, nqueue = 5; double *panel, *tmppanel0, *tmppanel1, *tmppanel, *tmpprevpanel; double *d_lP[MagmaMaxGPUs], *dlpanel, *dlpanels[MagmaMaxGPUs]; magma_int_t rows, trsmrows, igpu, n_local[MagmaMaxGPUs], ldpanel; magma_queue_t queues[MagmaMaxGPUs][10]; *info = 0; if ( uplo != MagmaUpper && uplo != MagmaLower ) { *info = -1; } else if (n < 0) { *info = -2; } else if (ldda < max(1,n)) { *info = -4; } if (*info != 0) { magma_xerbla( __func__, -(*info) ); return *info; } magma_device_t orig_dev; magma_getdevice( &orig_dev ); magma_queue_t orig_stream; magmablasGetKernelStream( &orig_stream ); nb = magma_get_dpotrf_nb(n); ldpanel = ldda; magma_setdevice(0); if (MAGMA_SUCCESS != magma_dmalloc_pinned( &panel, 2 * nb * ldpanel )) { *info = MAGMA_ERR_HOST_ALLOC; return *info; } tmppanel0 = panel; tmppanel1 = tmppanel0 + nb * ldpanel; if ((nb <= 1) || (nb >= n)) { // Use unblocked code. 
magma_dgetmatrix( n, n, dlA(0, 0, 0), ldda, panel, ldpanel); lapackf77_dpotrf( uplo_, &n, panel, &ldpanel, info); magma_dsetmatrix( n, n, panel, ldpanel, dlA(0, 0, 0), ldda ); } else { for( d = 0; d < ngpu; d++ ) { // local-n and local-ld n_local[d] = ((n / nb) / ngpu) * nb; if (d < (n / nb) % ngpu) n_local[d] += nb; else if (d == (n / nb) % ngpu) n_local[d] += n % nb; magma_setdevice(d); magma_device_sync(); if (MAGMA_SUCCESS != magma_dmalloc( &d_lP[d], nb * ldda )) { for( j = 0; j < d; j++ ) { magma_setdevice(j); magma_free( d_lP[d] ); } *info = MAGMA_ERR_DEVICE_ALLOC; return *info; } for( j=0; j < nqueue; j++ ) { magma_queue_create( &queues[d][j] ); } } //#define ENABLE_TIMER #if defined (ENABLE_TIMER) real_Double_t therk[4], tmtc, tcchol, tctrsm, tctm, tmnp, tcnp; real_Double_t ttot_herk[4] = {0,0,0,0}, ttot_mtc = 0, ttot_cchol = 0, ttot_ctrsm = 0, ttot_ctm = 0, ttot_mnp = 0, ttot_cnp = 0; printf("\n\n %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s\n", "j", "nb", "row", "mtc", "CPU_np", "panel", "ctrsm", "CH+TRSM", "CPU", "dsyrk[0]", "dsyrk[1]", "dsyrk[2]", "dsyrk[3]", "ctm P", "gpu_np"); printf(" ====================================================================================================\n"); #endif // Use blocked code. 
if (uplo == MagmaUpper) { printf( " === not supported, yet ===\n" ); } else { blkid = -1; if (ngpu == 4) crosspoint = n; else if (ngpu == 3) crosspoint = n; else if (ngpu == 2) crosspoint = 20160; else crosspoint = 0; crosspoint = 0; //n; //n -- > gpu always does next panel, 0 --> cpu always does next panel crosspoint = n; #if defined (ENABLE_TIMER) real_Double_t tget = magma_wtime(), tset = 0.0, ttot = 0.0; #endif if ( n > nb ) { // send first panel to cpu magma_setdevice(0); tmppanel = tmppanel0; magma_dgetmatrix_async(n, nb, dlA(0, 0, 0), ldda, tmppanel(0), ldpanel, queues[0][0] ); } #if defined (ENABLE_TIMER) for( d=0; d < ngpu; d++ ) { magma_setdevice(d); magma_device_sync(); } tget = magma_wtime()-tget; #endif // Compute the Cholesky factorization A = L*L' for (j = 0; (j + nb) < n; j += nb) { #if defined (ENABLE_TIMER) therk[0] = therk[1] = therk[2] = therk[3] = tmtc = tcchol = tctrsm = tctm = tmnp = tcnp = 0.0; #endif blkid += 1; tmppanel = (blkid % 2 == 0) ? tmppanel0 : tmppanel1; // Set the gpu number that holds the current panel id = (j / nb) % ngpu; magma_setdevice(id); // Set the local index where the current panel is j_local = j / (nb * ngpu) * nb; rows = n - j; // Wait for the panel on cpu magma_queue_sync( queues[id][0] ); if (j > 0 && prevtrsmrows > crosspoint) { #if defined (ENABLE_TIMER) tcnp = magma_wtime(); #endif tmpprevpanel = ((blkid - 1) % 2) == 0 ? 
tmppanel0 : tmppanel1; blasf77_dgemm( MagmaNoTransStr, MagmaConjTransStr, &rows, &nb, &nb, &c_neg_one, tmpprevpanel(j), &ldpanel, tmpprevpanel(j), &ldpanel, &c_one, tmppanel(j), &ldpanel ); #if defined (ENABLE_TIMER) tcnp = magma_wtime() - tcnp; ttot_cnp += tcnp; #endif } #if defined (ENABLE_TIMER) tcchol = magma_wtime(); #endif lapackf77_dpotrf(MagmaLowerStr, &nb, tmppanel(j), &ldpanel, info); if (*info != 0) { *info = *info + j; break; } #if defined (ENABLE_TIMER) tcchol = magma_wtime() - tcchol; ttot_cchol += tcchol; tctrsm = magma_wtime(); #endif trsmrows = rows - nb; if (trsmrows > 0) { blasf77_dtrsm(MagmaRightStr, MagmaLowerStr, MagmaConjTransStr, MagmaNonUnitStr, &trsmrows, &nb, &c_one, tmppanel(j), &ldpanel, tmppanel(j + nb), &ldpanel); } #if defined (ENABLE_TIMER) tctrsm = magma_wtime() - tctrsm; ttot_ctrsm += tctrsm; tctm = magma_wtime(); #endif d = (id + 1) % ngpu; // send current panel to gpus for (igpu = 0; igpu < ngpu; igpu++, d = (d + 1) % ngpu ) { magma_int_t myrows = 0; magma_int_t row_offset = 0; if ( d == id ) { dlpanel = dlA(d, j, j_local); myrows = rows; row_offset = 0; } else { dlpanel = dlP(d, 0, 0); myrows = trsmrows; row_offset = nb; } if (myrows > 0) { magma_setdevice(d); magma_dsetmatrix_async(myrows, nb, tmppanel(j + row_offset), ldpanel, dlpanel, ldda, queues[d][0] ); } } /* make sure panel is on GPUs */ d = (id + 1) % ngpu; for (igpu = 0; igpu < ngpu; igpu++, d = (d + 1) % ngpu ) { magma_setdevice(d); magma_queue_sync( queues[d][0] ); } #if defined (ENABLE_TIMER) tctm = magma_wtime() - tctm; ttot_ctm += tctm; #endif if ( (j + nb) < n) { magma_int_t offset = 0; magma_int_t row_offset = 0; if (j + nb + nb < n) { d = (id + 1) % ngpu; magma_setdevice(d); magma_int_t j_local2 = (j + nb) / (nb * ngpu) * nb; if (trsmrows <= crosspoint) { #if defined (ENABLE_TIMER) tmnp = magma_wtime(); #endif // do gemm on look ahead panel if ( d == id ) { dlpanel = dlA(d, j + nb, j_local); } else { dlpanel = dlP(d, 0, 0); } magmablasSetKernelStream( 
queues[d][STREAM_ID(j_local2)] ); #define DSYRK_ON_DIAG #ifdef DSYRK_ON_DIAG magma_dsyrk( MagmaLower, MagmaNoTrans, nb, nb, d_neg_one, dlpanel, ldda, d_one, dlA(d, j + nb, j_local2), ldda); magma_dgemm( MagmaNoTrans, MagmaConjTrans, trsmrows-nb, nb, nb, c_neg_one, dlpanel+nb, ldda, dlpanel, ldda, c_one, dlA(d, j + nb +nb, j_local2), ldda); #else magma_dgemm( MagmaNoTrans, MagmaConjTrans, trsmrows, nb, nb, c_neg_one, dlpanel, ldda, dlpanel, ldda, c_one, dlA(d, j + nb, j_local2), ldda); #endif #if defined (ENABLE_TIMER) magma_device_sync(); tmnp = magma_wtime() - tmnp; ttot_mnp += tmnp; #endif } // send next panel to cpu magma_queue_sync( queues[d][STREAM_ID(j_local2)] ); // make sure lookahead is done tmppanel = ((blkid+1) % 2 == 0) ? tmppanel0 : tmppanel1; magma_dgetmatrix_async(rows-nb, nb, dlA(d, j+nb, j_local2), ldda, tmppanel(j+nb), ldpanel, queues[d][0] ); tmppanel = (blkid % 2 == 0) ? tmppanel0 : tmppanel1; offset = j + nb + nb; row_offset = nb; } else { offset = j + nb; row_offset = 0; } if (n - offset > 0) { // syrk on multiple gpu for (d = 0; d < ngpu; d++ ) { if ( d == id ) { dlpanels[d] = dlA(d, j + nb + row_offset, j_local); } else { dlpanels[d] = dlP(d, row_offset, 0); } } #if defined (ENABLE_TIMER) for( d=0; d < ngpu; d++ ) { therk[d] = magma_wtime(); } #endif //magmablasSetKernelStream( queues[d] ); //magma_dsyrk( MagmaLower, MagmaNoTrans, n - offset, nb, // d_neg_one, dlpanel, ldda, // d_one, &d_lA[d][offset + offset*ldda], ldda ); #ifdef DSYRK_ON_DIAG magma_dsyrk_mgpu #else magma_dsyrk_mgpu2 #endif (ngpu, MagmaLower, MagmaNoTrans, nb, n - offset, nb, d_neg_one, dlpanels, ldda, 0, d_one, d_lA, ldda, offset, nqueue, queues ); #if defined (ENABLE_TIMER) for( d=0; d < ngpu; d++ ) { magma_setdevice(d); magma_device_sync(); therk[d] = magma_wtime() - therk[d]; ttot_herk[d] += therk[d]; } #endif } prevtrsmrows = trsmrows; #if defined (ENABLE_TIMER) ttot += (tcnp+tcchol+tctrsm+therk[0]+therk[1]+therk[2]+tctm+tmnp); printf("%10d %10d %10d %10.3lf %10.3lf 
%10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf(%d) %10.3lf\n", j, nb, rows, tmtc, tcnp, // gemm tcchol, // potrf tctrsm, // trsm (tcchol + tctrsm), (tmtc+tcnp+tcchol+tctrsm), therk[0], therk[1], therk[2], therk[3], // syrk tctm, // copy panel to GPU tmnp, // lookahead on GPU (id + 1) % ngpu, (tcnp+tcchol+tctrsm+therk[0]+therk[1]+therk[2]+tctm+tmnp)); fflush(0); #endif } } for( d = 0; d < ngpu; d++ ) { magma_setdevice(d); for( id=0; id < nqueue; id++ ) { magma_queue_sync( queues[d][id] ); } } #if defined (ENABLE_TIMER) printf("\n%10d %10d %10d %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf(-) %10.3lf\n", n, n, 0, ttot_mtc, ttot_cnp, // gemm ttot_cchol, // potrf ttot_ctrsm, // trsm (ttot_cchol + ttot_ctrsm), (ttot_mtc+ttot_cnp+ttot_cchol+ttot_ctrsm), ttot_herk[0], ttot_herk[1], ttot_herk[2], ttot_herk[3], // syrk ttot_ctm, // copy panel to GPU ttot_mnp, // lookahead on GPU (ttot_cnp+ttot_cchol+ttot_ctrsm+ttot_herk[0]+ttot_herk[1]+ttot_herk[2]+ttot_ctm+ttot_mnp)); printf("%10d %10d %10d %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf %10.3lf(-) %10.3lf (ratio)\n", n, n, 0, ttot_mtc/ttot, ttot_cnp/ttot, // gemm ttot_cchol/ttot, // potrf ttot_ctrsm/ttot, // trsm (ttot_cchol + ttot_ctrsm)/ttot, (ttot_mtc+ttot_cnp+ttot_cchol+ttot_ctrsm)/ttot, ttot_herk[0]/ttot, ttot_herk[1]/ttot, ttot_herk[2]/ttot, ttot_herk[3]/ttot, // syrk ttot_ctm/ttot, // copy panel to GPU ttot_mnp/ttot, // lookahead on GPU (ttot_cnp+ttot_cchol+ttot_ctrsm+ttot_herk[0]+ttot_herk[1]+ttot_herk[2]+ttot_ctm+ttot_mnp)/ttot); #endif // cholesky for the last block if (j < n && *info == 0) { rows = n - j; id = (j / nb) % ngpu; // Set the local index where the current panel is j_local = j / (nb * ngpu) * nb; magma_setdevice(id); #if defined (ENABLE_TIMER) tset = magma_wtime(); #endif magma_dgetmatrix(rows, rows, dlA(id, j, j_local), ldda, panel(j), ldpanel); lapackf77_dpotrf(MagmaLowerStr, 
&rows, panel(j), &ldpanel, info); magma_dsetmatrix(rows, rows, panel(j), ldpanel, dlA(id, j, j_local), ldda); #if defined (ENABLE_TIMER) tset = magma_wtime() - tset; #endif } #if defined (ENABLE_TIMER) printf( " matrix_get,set: %10.3lf %10.3lf -> %10.3lf\n",tget,tset,ttot+tget+tset ); #endif } // end of else not upper // clean up for( d = 0; d < ngpu; d++ ) { magma_setdevice(d); for( j=0; j < nqueue; j++ ) { magma_queue_destroy( queues[d][j] ); } magma_free( d_lP[d] ); } } // end of not lapack // free workspace magma_free_pinned( panel ); magma_setdevice( orig_dev ); magmablasSetKernelStream( orig_stream ); return *info; } /* magma_dpotrf_mgpu_right */