void ConjugateGradientSolver::solveGpu(Vector3F * X, CudaCSRMatrix * stiffnessMatrix) { void * dFixed = m_deviceIsFixed->bufferOnDevice(); m_deviceX->hostToDevice(X); void * dX = m_deviceX->bufferOnDevice(); m_deviceRhs->hostToDevice(m_b); void * dRhs = m_deviceRhs->bufferOnDevice(); cuConjugateGradient_prevresidual((float3 *)m_devicePrev->bufferOnDevice(), (float3 *)m_deviceResidual->bufferOnDevice(), (mat33 *)stiffnessMatrix->deviceValue(), (uint *)stiffnessMatrix->deviceRowPtr(), (uint *)stiffnessMatrix->deviceColInd(), (uint *)dFixed, (float3 *)dX, (float3 *)dRhs, m_numRows); // ceglg.write("cg init"); // ceglg.writeVec3(m_deviceResidual, m_numRows, "cg residual", CudaDbgLog::FAlways); // ceglg.writeVec3(m_devicePrev, m_numRows, "cg prev", CudaDbgLog::FAlways); // m_devicePrev->deviceToHost(m_prev->data()); Vector3F * prev = (Vector3F *)m_prev->data(); for(unsigned k=0; k< 1; k++) { // std::cout<<" prev["<<k<<"] h "<<prev[k]; } for(unsigned i=0;i<i_max;i++) { cuConjugateGradient_Ax((float3 *)m_devicePrev->bufferOnDevice(), (float3 *)m_deviceUpdate->bufferOnDevice(), (float3 *)m_deviceResidual->bufferOnDevice(), (float *)m_deviceD->bufferOnDevice(), (float *)m_deviceD2->bufferOnDevice(), (mat33 *)stiffnessMatrix->deviceValue(), (uint *)stiffnessMatrix->deviceRowPtr(), (uint *)stiffnessMatrix->deviceColInd(), (uint *)dFixed, m_numRows); // ceglg.write("cg step "); // ceglg.write(i); // ceglg.writeVec3(m_deviceUpdate, m_numRows, "cg update", CudaDbgLog::FAlways); // ceglg.writeVec3(m_deviceResidual, m_numRows, "cg residual", CudaDbgLog::FAlways); // ceglg.writeVec3(m_devicePrev, m_numRows, "cg prev", CudaDbgLog::FAlways); float d =0; float d2=0; m_reduce->sumF(d, (float *)m_deviceD->bufferOnDevice(), m_numRows); m_reduce->sumF(d2, (float *)m_deviceD2->bufferOnDevice(), m_numRows); if(fabs(d2)< 1e-10f) d2 = 1e-10f; float d3 = d/d2; cuConjugateGradient_addX((float3 *)dX, (float3 *)m_deviceResidual->bufferOnDevice(), (float *)m_deviceD->bufferOnDevice(), (float3 *)m_devicePrev->bufferOnDevice(), (float3 *)m_deviceUpdate->bufferOnDevice(), d3, (uint *)dFixed, m_numRows); // ceglg.write("addX"); // ceglg.writeVec3(m_deviceUpdate, m_numRows, "cg update", CudaDbgLog::FAlways); // ceglg.writeVec3(m_deviceResidual, m_numRows, "cg residual", CudaDbgLog::FAlways); // ceglg.writeVec3(m_devicePrev, m_numRows, "cg prev", CudaDbgLog::FAlways); // float d1 = 0.f; m_reduce->sumF(d1, (float *)m_deviceD->bufferOnDevice(), m_numRows); // if(i>29) std::cout<<" d1["<<i<<"] "<<d1<<" "; if(i >= i_max && d1 < 0.001f) break; if(fabs(d)<1e-10f) d = 1e-10f; float d4 = d1/d; cuConjugateGradient_addResidual((float3 *)m_devicePrev->bufferOnDevice(), (float3 *)m_deviceResidual->bufferOnDevice(), d4, (uint *)dFixed, m_numRows); // ceglg.write("add residual"); // ceglg.writeVec3(m_deviceUpdate, m_numRows, "cg update", CudaDbgLog::FAlways); // ceglg.writeVec3(m_deviceResidual, m_numRows, "cg residual", CudaDbgLog::FAlways); // ceglg.writeVec3(m_devicePrev, m_numRows, "cg prev", CudaDbgLog::FAlways); // } cudaThreadSynchronize(); m_deviceX->deviceToHost(X); }
void CudaConjugateGradientSolver::solve(void * X, CudaCSRMatrix * A, void * fixed, float * error) { // cglg.writeVec3(m_rhs, m_dimension, "cg b", CudaDbgLog::FAlways); //cglg.writeMat33(A->valueBuf(), // A->numNonZero(), // " cg A ", CudaDbgLog::FAlways); cuConjugateGradient_prevresidual((float3 *)previous(), (float3 *)residual(), (mat33 *)A->deviceValue(), (uint *)A->deviceRowPtr(), (uint *)A->deviceColInd(), (uint *)fixed, (float3 *)X, (float3 *)rightHandSide(), m_dimension); for(int i=0; i<FemGlobal::CGSolverMaxNumIterations; i++) { cuConjugateGradient_Ax((float3 *)previous(), (float3 *)updated(), (float3 *)residual(), (float *)diff(), (float *)diff2(), (mat33 *)A->deviceValue(), (uint *)A->deviceRowPtr(), (uint *)A->deviceColInd(), (uint *)fixed, m_dimension); float d =0; float d2=0; m_reduce->sum<float>(d, (float *)diff(), m_dimension); m_reduce->sum<float>(d2, (float *)diff2(), m_dimension); if(fabs(d2)< 1e-10f) d2 = 1e-10f; float d3 = d/d2; cuConjugateGradient_addX((float3 *)X, (float3 *)residual(), (float *)diff(), (float3 *)previous(), (float3 *)updated(), d3, (uint *)fixed, m_dimension); float d1 = 0.f; m_reduce->sum<float>(d1, (float *)diff(), m_dimension); if(error) *error = d1; if(d1 < 0.01f) break; if(fabs(d)<1e-10f) d = 1e-10f; float d4 = d1/d; cuConjugateGradient_addResidual((float3 *)previous(), (float3 *)residual(), d4, (uint *)fixed, m_dimension); } }