/// Runs the conjugate-gradient iteration on the device and writes the result
/// back into X.
///
/// X is uploaded to m_deviceX and m_b to m_deviceRhs before the first kernel;
/// after the loop the contents of m_deviceX are copied back into X.
///
/// @param X                host array used as both input and output; m_numRows
///                         is the element count handed to every kernel.
/// @param stiffnessMatrix  CSR matrix whose device value / row-pointer /
///                         column-index buffers are passed to the kernels.
///
/// The loop runs at most i_max times and stops early once the sum reduced
/// from m_deviceD after cuConjugateGradient_addX falls below 0.001f.
void ConjugateGradientSolver::solveGpu(Vector3F * X, CudaCSRMatrix * stiffnessMatrix) 
{
    // Device pointers for the buffers that are (re)filled from the host.
    // Each pointer is fetched after its upload, as in the original call order.
    uint * const dFixed = (uint *)m_deviceIsFixed->bufferOnDevice();

    m_deviceX->hostToDevice(X);
    float3 * const dX = (float3 *)m_deviceX->bufferOnDevice();

    m_deviceRhs->hostToDevice(m_b);
    float3 * const dRhs = (float3 *)m_deviceRhs->bufferOnDevice();

    // Work buffers and matrix arrays: nothing in this function re-uploads
    // them, so look the pointers up once instead of on every iteration.
    float3 * const dPrev     = (float3 *)m_devicePrev->bufferOnDevice();
    float3 * const dResidual = (float3 *)m_deviceResidual->bufferOnDevice();
    float3 * const dUpdate   = (float3 *)m_deviceUpdate->bufferOnDevice();
    float  * const dD        = (float *)m_deviceD->bufferOnDevice();
    float  * const dD2       = (float *)m_deviceD2->bufferOnDevice();

    mat33 * const dValue  = (mat33 *)stiffnessMatrix->deviceValue();
    uint  * const dRowPtr = (uint *)stiffnessMatrix->deviceRowPtr();
    uint  * const dColInd = (uint *)stiffnessMatrix->deviceColInd();

    // Initialise the "prev" and "residual" buffers from X and the right-hand side.
    cuConjugateGradient_prevresidual(dPrev,
                            dResidual,
                            dValue,
                            dRowPtr,
                            dColInd,
                            dFixed,
                            dX,
                            dRhs,
                            m_numRows);

    for(unsigned i=0; i<i_max; i++) {
        cuConjugateGradient_Ax(dPrev,
                            dUpdate,
                            dResidual,
                            dD,
                            dD2,
                            dValue,
                            dRowPtr,
                            dColInd,
                            dFixed,
                            m_numRows);

        float d = 0.f;
        float d2 = 0.f;

        m_reduce->sumF(d, dD, m_numRows);
        m_reduce->sumF(d2, dD2, m_numRows);

        // Keep the divisor away from zero.
        if(fabs(d2) < 1e-10f)
            d2 = 1e-10f;

        const float d3 = d/d2;
        cuConjugateGradient_addX(dX,
                            dResidual,
                            dD,
                            dPrev,
                            dUpdate,
                            d3,
                            dFixed,
                            m_numRows);

        // dD is reduced a second time after addX has been given the buffer.
        float d1 = 0.f;
        m_reduce->sumF(d1, dD, m_numRows);

        // Converged. The test must not also require i >= i_max: that can
        // never hold inside this loop, which made the early exit dead code.
        if(d1 < 0.001f)
            break;

        // Keep the divisor away from zero.
        if(fabs(d) < 1e-10f)
            d = 1e-10f;

        const float d4 = d1/d;
        cuConjugateGradient_addResidual(dPrev,
                            dResidual,
                            d4,
                            dFixed,
                            m_numRows);
    }

    // NOTE(review): cudaThreadSynchronize() is deprecated in the CUDA runtime
    // in favour of cudaDeviceSynchronize(); switch when the file's includes
    // and minimum toolkit version are confirmed.
    cudaThreadSynchronize();
    m_deviceX->deviceToHost(X);
}
// Example no. 2
// 0
/// Drives the conjugate-gradient kernels for the system described by A.
///
/// @param X      buffer cast to float3* and passed to the kernels as the
///               solution vector (presumably a device pointer - confirm
///               against callers).
/// @param A      CSR matrix; its device value / row-pointer / column-index
///               buffers are forwarded to the kernels.
/// @param fixed  buffer cast to uint* and passed to every kernel (presumably
///               per-row "is fixed" flags on the device - confirm).
/// @param error  optional; when non-null it receives, on every iteration, the
///               sum reduced from diff() after cuConjugateGradient_addX.
///
/// Iterates up to FemGlobal::CGSolverMaxNumIterations times and leaves the
/// loop as soon as that reduced sum is below 0.01f.
void CudaConjugateGradientSolver::solve(void * X,
                                        CudaCSRMatrix * A,
                                        void * fixed,
                                        float * error)
{
    // Typed views of every buffer the kernels touch.
    // NOTE(review): assumes the accessors below return the same pointer on
    // every call, as plain getters would.
    float3 * const xVec     = (float3 *)X;
    uint   * const fixedVec = (uint *)fixed;
    float3 * const prevVec  = (float3 *)previous();
    float3 * const resVec   = (float3 *)residual();
    float3 * const updVec   = (float3 *)updated();
    float3 * const rhsVec   = (float3 *)rightHandSide();
    float  * const diffBuf  = (float *)diff();
    float  * const diff2Buf = (float *)diff2();
    mat33  * const aValue   = (mat33 *)A->deviceValue();
    uint   * const aRowPtr  = (uint *)A->deviceRowPtr();
    uint   * const aColInd  = (uint *)A->deviceColInd();

    cuConjugateGradient_prevresidual(prevVec, resVec,
                                     aValue, aRowPtr, aColInd,
                                     fixedVec, xVec, rhsVec,
                                     m_dimension);

    for(int iter = 0; iter < FemGlobal::CGSolverMaxNumIterations; ++iter) {
        cuConjugateGradient_Ax(prevVec, updVec, resVec,
                               diffBuf, diff2Buf,
                               aValue, aRowPtr, aColInd,
                               fixedVec,
                               m_dimension);

        float sumDiff = 0;
        float sumDiff2 = 0;
        m_reduce->sum<float>(sumDiff, diffBuf, m_dimension);
        m_reduce->sum<float>(sumDiff2, diff2Buf, m_dimension);

        // A near-zero divisor is replaced by 1e-10f before dividing.
        const float safeDiff2 = (fabs(sumDiff2) < 1e-10f) ? 1e-10f : sumDiff2;
        const float stepScale = sumDiff / safeDiff2;

        cuConjugateGradient_addX(xVec, resVec, diffBuf,
                                 prevVec, updVec,
                                 stepScale,
                                 fixedVec,
                                 m_dimension);

        // Second reduction of the same buffer, now after addX has run.
        float sumDiffAfter = 0.f;
        m_reduce->sum<float>(sumDiffAfter, diffBuf, m_dimension);

        if(error) {
            *error = sumDiffAfter;
        }

        if(sumDiffAfter < 0.01f) {
            break;
        }

        // Same near-zero guard for the pre-addX sum used as divisor here.
        const float safeDiff = (fabs(sumDiff) < 1e-10f) ? 1e-10f : sumDiff;
        const float residualScale = sumDiffAfter / safeDiff;

        cuConjugateGradient_addResidual(prevVec, resVec,
                                        residualScale,
                                        fixedVec,
                                        m_dimension);
    }
}