/* Least-squares solution of the over-determined system A*x = b (N equations,
 * 3 unknowns). Uses the normal-equations trick: solve (At*A)*x = At*b, where
 * At is the transpose of A. For N == 3 the system is exactly determined and
 * is solved directly without forming the normal equations. */
Vec3 leastSquares(size_t N, const Vec3* A, const real* b)
{
    if (N == 3) {
        // Exactly determined: pack the three rows into a flat 3x3 matrix
        // and solve A*x = b directly.
        const real A_mat[3*3] = {
            A[0].x, A[0].y, A[0].z,
            A[1].x, A[1].y, A[1].z,
            A[2].x, A[2].y, A[2].z,
        };
        return solve3x3(A_mat, b);
    }

    real At_A[3][3]; // normal matrix At*A (3x3, symmetric)
    real At_b[3];    // right-hand side At*b

    // Accumulate At*A and At*b row by row; the two accumulations are
    // independent, so they share the outer loop over the row index.
    for (int row = 0; row < 3; ++row) {
        for (int col = 0; col < 3; ++col) {
            real acc = 0;
            for (size_t n = 0; n < N; ++n) {
                acc += A[n][row] * A[n][col];
            }
            At_A[row][col] = acc;
        }
        real acc = 0;
        for (size_t n = 0; n < N; ++n) {
            acc += A[n][row] * b[n];
        }
        At_b[row] = acc;
    }

    /*
    // Improve conditioning:
    real offset = 0.0001;
    At_A[0][0] += offset;
    At_A[1][1] += offset;
    At_A[2][2] += offset;
    */

    // solve3x3 expects a flat array; 2D real[3][3] must be densely packed.
    static_assert(sizeof(At_A) == 9*sizeof(real), "pack");
    return solve3x3(&At_A[0][0], At_b);
}
// Frees whichever temporary device buffers this solve() call allocated itself.
// Buffers that were already on the GPU when we were called (on_gpu == true)
// belong to the caller and are left alone. cudaFree return codes are ignored
// here on purpose: this runs on error paths where the original failure code
// must be the one reported to the caller.
static void releaseSolveTemporaries(cuComplex* x_gpu, bool x_on_gpu,
                                    cuComplex* A_gpu, bool A_on_gpu,
                                    cuComplex* b_gpu, bool b_on_gpu)
{
    if (x_gpu != NULL && !x_on_gpu) cudaFree((void*)x_gpu);
    if (A_gpu != NULL && !A_on_gpu) cudaFree((void*)A_gpu);
    if (b_gpu != NULL && !b_on_gpu) cudaFree((void*)b_gpu);
}

/**
 * Solves `batch` independent N-by-N complex linear systems A*x = b on the GPU.
 *
 * x_in     - buffer holding the solutions (batch contiguous N-vectors)
 * A_in     - buffer holding the matrices (batch contiguous N*N matrices)
 * b_in     - buffer holding the right-hand sides (batch contiguous N-vectors)
 * N        - size of each linear system (1 <= N <= 72)
 * batch    - number of linear systems (>= 1)
 * x_on_gpu - true if x is already a device buffer and should remain on the gpu
 * A_on_gpu - true if A is already a device buffer
 * b_on_gpu - true if b is already a device buffer
 *
 * Returns cudaSuccess (0) on success, otherwise a cudaError code.
 * Any device buffers allocated internally are released on both the success
 * and the error paths (fixes the leak noted by the old TODO).
 *
 * Dispatch: N == 1 and N == 3 (DIRECT solver) use dedicated per-matrix
 * kernels; everything else goes through the batched zfsolve_batch solver.
 * (Per the old notes, a one-thread-per-matrix inverse could also pay off
 * up to ~10x10 — zfsolve_batch appears to select kernels by N already.)
 */
int Solver::solve(
    Complex<float>* &x_in, // buffer holding the solutions
    Complex<float>* &A_in, // buffer holding matrices
    Complex<float>* &b_in, // buffer holding the left sides
    int &N,                // size of each linear system
    int &batch,            // number of linear systems
    bool &x_on_gpu,        // true if x should remain on the gpu
    bool &A_on_gpu,        // true if R is already on the gpu
    bool &b_on_gpu         // true if b is already on the gpu
) {
    if (N < 1 || N > 72) {
        printIt("Error in Solver: Min N is 1 and max N is 72.\n");
        return cudaErrorLaunchFailure;
    }
    if (batch < 1) {
        // fixed typo: message used to read "Erroro in Solver..."
        printIt("Error in Solver: batch must be larger than 0.\n");
        return cudaErrorLaunchFailure;
    }

    cuComplex *x = (cuComplex*)x_in;
    cuComplex *A = (cuComplex*)A_in;
    cuComplex *b = (cuComplex*)b_in;

    cudaError e = cudaSuccess;

    // NULL marks "not allocated by this call" so the cleanup helper can tell
    // which buffers are ours to free.
    cuComplex* x_gpu = NULL;
    cuComplex* A_gpu = NULL;
    cuComplex* b_gpu = NULL;

    // Widen to size_t BEFORE multiplying: N*N*batch as plain int can
    // overflow for large batches (N up to 72 -> N*N up to 5184).
    const size_t x_buffer_size = (size_t)N * (size_t)batch * sizeof(cuComplex);
    const size_t A_buffer_size = (size_t)N * (size_t)N * (size_t)batch * sizeof(cuComplex);
    const size_t b_buffer_size = (size_t)N * (size_t)batch * sizeof(cuComplex);

    if (x_on_gpu) {
        x_gpu = x;
    } else {
        e = cudaMalloc<cuComplex>(&x_gpu, x_buffer_size);
        if (e != cudaSuccess) {
            x_gpu = NULL; // cudaMalloc may leave the pointer unspecified on failure
            return e;
        }
        // NOTE(review): x is the output buffer; uploading its initial host
        // contents is kept for backward compatibility — confirm whether any
        // of the solver kernels actually reads x before writing it.
        e = cudaMemcpy((void*)x_gpu, (void*)x, x_buffer_size, cudaMemcpyHostToDevice);
        if (e != cudaSuccess) {
            releaseSolveTemporaries(x_gpu, x_on_gpu, A_gpu, A_on_gpu, b_gpu, b_on_gpu);
            return e;
        }
    }

    if (A_on_gpu) {
        A_gpu = (cuComplex*) A; // reuse caller's device buffer
    } else {
        e = cudaMalloc<cuComplex>(&A_gpu, A_buffer_size);
        if (e != cudaSuccess) {
            A_gpu = NULL;
            releaseSolveTemporaries(x_gpu, x_on_gpu, A_gpu, A_on_gpu, b_gpu, b_on_gpu);
            return e;
        }
        e = cudaMemcpy((void*)A_gpu, (void*)A, A_buffer_size, cudaMemcpyHostToDevice);
        if (e != cudaSuccess) {
            releaseSolveTemporaries(x_gpu, x_on_gpu, A_gpu, A_on_gpu, b_gpu, b_on_gpu);
            return e;
        }
    }

    if (b_on_gpu) {
        b_gpu = (cuComplex*) b;
    } else {
        e = cudaMalloc<cuComplex>(&b_gpu, b_buffer_size);
        if (e != cudaSuccess) {
            b_gpu = NULL;
            releaseSolveTemporaries(x_gpu, x_on_gpu, A_gpu, A_on_gpu, b_gpu, b_on_gpu);
            return e;
        }
        e = cudaMemcpy((void*)b_gpu, (void*)b, b_buffer_size, cudaMemcpyHostToDevice);
        if (e != cudaSuccess) {
            releaseSolveTemporaries(x_gpu, x_on_gpu, A_gpu, A_on_gpu, b_gpu, b_on_gpu);
            return e;
        }
    }

    // Dispatch on system size; the dedicated small-N kernels avoid the
    // general batched factorization overhead.
    if (N == 1) {
        e = (cudaError) solve1x1(x_gpu, A_gpu, b_gpu, batch);
    } else if (N == 3 && this->solverType == Solver::DIRECT) {
        e = (cudaError) solve3x3(x_gpu, A_gpu, b_gpu, batch);
    } else {
        e = (cudaError) zfsolve_batch(A_gpu, b_gpu, x_gpu, N, batch);
    }
    if (e != cudaSuccess) {
        // Kernel failed: release our temporaries before reporting (this was
        // the old "What about allocated gpu memory if kernel fail?" TODO).
        releaseSolveTemporaries(x_gpu, x_on_gpu, A_gpu, A_on_gpu, b_gpu, b_on_gpu);
        return e;
    }

    // Copy the solutions back if the caller wants them on the host.
    if (!x_on_gpu) {
        e = cudaMemcpy((void*)x, (void*)x_gpu, x_buffer_size, cudaMemcpyDeviceToHost);
        if (e != cudaSuccess) {
            releaseSolveTemporaries(x_gpu, x_on_gpu, A_gpu, A_on_gpu, b_gpu, b_on_gpu);
            return e;
        }
    }

    // Success path: free all temporaries, remembering the first free error
    // (the old code returned on the first failure and leaked the rest).
    cudaError freeErr;
    if (!x_on_gpu) {
        freeErr = cudaFree((void*)x_gpu);
        if (e == cudaSuccess) e = freeErr;
    }
    if (!A_on_gpu) {
        freeErr = cudaFree((void*)A_gpu);
        if (e == cudaSuccess) e = freeErr;
    }
    if (!b_on_gpu) {
        freeErr = cudaFree((void*)b_gpu);
        if (e == cudaSuccess) e = freeErr;
    }
    return e;
}