Code example #1
0
File: Solver.cpp  Project: emilk/Dual-Contouring
	/*
	Solves A*x = b in the least-squares sense for over-determined systems.

	Uses the normal equations  At*A*x = At*b,  where At is the transpose of A.
	*/
	Vec3 leastSquares(size_t N, const Vec3* A, const real* b)
	{
		// A square 3x3 system needs no normal-equation trick: solve it directly.
		if (N == 3) {
			const real square[3*3] = {
				A[0].x, A[0].y, A[0].z,
				A[1].x, A[1].y, A[1].z,
				A[2].x, A[2].y, A[2].z,
			};
			return solve3x3(square, b);
		}

		// Accumulate both At*A (3x3, symmetric) and At*b (3-vector)
		// in a single pass over the N rows of A.
		real AtA[3][3] = { {0, 0, 0}, {0, 0, 0}, {0, 0, 0} };
		real Atb[3]    = { 0, 0, 0 };

		for (size_t k=0; k<N; ++k) {
			for (int i=0; i<3; ++i) {
				for (int j=0; j<3; ++j) {
					AtA[i][j] += A[k][i] * A[k][j];
				}
				Atb[i] += A[k][i] * b[k];
			}
		}

		/*
		// Improve conditioning:
		real offset = 0.0001;
		AtA[0][0] += offset;
		AtA[1][1] += offset;
		AtA[2][2] += offset;
		 */

		// solve3x3 reads the matrix as nine contiguous reals; make sure
		// the 2D array really is packed that way.
		static_assert(sizeof(AtA) == 9*sizeof(real), "pack");

		return solve3x3(&AtA[0][0], Atb);
	}
Code example #2
0
File: Solver.cpp  Project: jpaasen/cos
int Solver::solve(
   Complex<float>* &x_in,        // buffer holding the solutions
   Complex<float>* &A_in,  // buffer holding matrices
   Complex<float>* &b_in,  // buffer holding the left sides
   int &N,                 // size of each linear system
   int &batch,             // number of linear systems
   bool &x_on_gpu,         // true if x should remain on the gpu
   bool &A_on_gpu,         // true if R is already on the gpu
   bool &b_on_gpu          // true if b is already on the gpu
   )
{
   if (N < 1 || N > 72) {
      printIt("Error in Solver: Min N is 1 and max N is 72.\n");
      return cudaErrorLaunchFailure;
   }

   if (batch < 1) {
      printIt("Erroro in Solver: batch must be larger than 0.\n");
      return cudaErrorLaunchFailure;
   }

   cuComplex *x = (cuComplex*)x_in;
   cuComplex *A = (cuComplex*)A_in;
   cuComplex *b = (cuComplex*)b_in;

   // TODO: Call zfsolve_batch in solve.h
   cudaError e;

   cuComplex* x_gpu;
   cuComplex* A_gpu;
   cuComplex* b_gpu;

   size_t x_buffer_size = N*batch*sizeof(cuComplex);
   size_t A_buffer_size = N*N*batch*sizeof(cuComplex);
   size_t b_buffer_size = N*batch*sizeof(cuComplex);

   if (x_on_gpu) {
      x_gpu = x; 
   } else { // TODO: make a function for this...
      e = cudaMalloc<cuComplex>(&x_gpu, x_buffer_size);
      if (e != cudaSuccess) return e;
      e = cudaMemcpy((void*)x_gpu, (void*)x, x_buffer_size, cudaMemcpyHostToDevice);
      if (e != cudaSuccess) return e;
   }
   if (A_on_gpu) {
      A_gpu = (cuComplex*) A; // not good, but what to do?
   } else {
      e = cudaMalloc<cuComplex>(&A_gpu, A_buffer_size);
      if (e != cudaSuccess) return e;
      e = cudaMemcpy((void*)A_gpu, (void*)A, A_buffer_size, cudaMemcpyHostToDevice);
      if (e != cudaSuccess) return e;
   }
   if (b_on_gpu) {
      b_gpu = (cuComplex*) b;
   } else {
      e = cudaMalloc<cuComplex>(&b_gpu, b_buffer_size);
      if (e != cudaSuccess) return e;
      e = cudaMemcpy((void*)b_gpu, (void*)b, b_buffer_size, cudaMemcpyHostToDevice);
      if (e != cudaSuccess) return e;
   }

   // TODO:
   // For small matrices one should use a kernel evaluating the invers (one thread per matrix)
   // There is one such kernel for 3x3 in the RealTimeCapon folder, it should be possible to make kernels for 4x4 and 5x5 as well.
   // Tip from Nvidia: up to 10x10 it should be faster to go with a 1-matrix-per-thread appraoch.
   // This seems to be done in zfsolve_batch. There is different kernels beeing called depending on N.

   if (N == 1) {
      e = (cudaError) solve1x1(x_gpu, A_gpu, b_gpu, batch);
   } else if (N == 3 && this->solverType == Solver::DIRECT) {
      e = (cudaError) solve3x3(x_gpu, A_gpu, b_gpu, batch);
   } else {
      e = (cudaError) zfsolve_batch(A_gpu, b_gpu, x_gpu, N, batch);
   }

   if (e != cudaSuccess) return e; // TODO: What about allocated gpu memory if kernel fail?

   if (!x_on_gpu) {
      e = cudaMemcpy((void*)x, (void*)x_gpu, x_buffer_size, cudaMemcpyDeviceToHost);
      if (e != cudaSuccess) return e;
      e = cudaFree((void*)x_gpu);
      if (e != cudaSuccess) return e;
   }
   if (!A_on_gpu) {
      e = cudaFree((void*)A_gpu);
      if (e != cudaSuccess) return e;
   }
   if (!b_on_gpu) {
      e = cudaFree((void*)b_gpu);
      if (e != cudaSuccess) return e;
   }

   return e;
}