Ejemplo n.º 1
0
void testMatrix::testMatrices()
{
    Vector4 testTransV = Vector4(4,5,6,1);
    Matrix4x4 testTrans = Matrix4x4(1,0,0,-4,0,1,0,-5,0,0,1,-6,0,0,0,1);
    Matrix4x4 testRotZ90 = Matrix4x4(0.f,-1.f,0.f,0.f,1.f,0.f,0.f,0.f,0.f,0.f,1.f,0.f,0.f,0.f,0.f,1.f);
    Matrix4x4 testRotY90 = Matrix4x4(0.f,0.f,1.f,0.f,0.f,1.f,0.f,0.f,-1.f,0.f,0.f,0.f,0.f,0.f,0.f,1.f);
    Matrix4x4 testRotX90 = Matrix4x4(1.f,0.f,0.f,0.f,0.f,0.f,-1.f,0.f,0.f,1.f,0.f,0.f,0.f,0.f,0.f,1.f);

    //y
    Matrix4x4 testRot1 = getRotMat(Vector4(0,0,0,1),Vector4(0,1,0,1),90);
    //x
    Matrix4x4 testRot2 = getRotMat(Vector4(0,0,0,1),Vector4(1,0,0,1), 90);
    //z
    Matrix4x4 testRot3 = getRotMat(Vector4(0,0,0,1),Vector4(0,0,1,1), 90);


    Matrix4x4 trz90 = getRotZMat(M_PI/2);
    Matrix4x4 try90 = getRotYMat(M_PI/2);
    Matrix4x4 trx90 = getRotXMat(M_PI/2);
    Matrix4x4 trans = getTransMat(testTransV);


    //Test rotation on z axis
    compareMatrices(&testRotZ90,&trz90);

    //Z axis as arbitrary rotation
    compareMatrices(&testRotZ90,&testRot3);

    //inverses are equivalent:
    compareMatrices(&getInvRotZMat(M_PI/2),&getInvRotMat(Vector4(0,0,0,1),Vector4(0,0,1,1),90));

    //Test rotation on y axis
    compareMatrices(&testRotY90,&try90);

    //Y axis as arbitrary rotation
    compareMatrices(&testRotY90,&testRot1);

    //inverses are equivalent:
    compareMatrices(&getInvRotYMat(M_PI/2),&getInvRotMat(Vector4(0,0,0,1),Vector4(0,1,0,1),90));

    //Test rotation on x axis
    compareMatrices(&testRotX90,&trx90);

    //X axis as arbitrary rotation
    compareMatrices(&testRotX90,&testRot2);

    //inverses are equivalent:
    compareMatrices(&getInvRotXMat(M_PI/2),&getInvRotMat(Vector4(0,0,0,1),Vector4(1,0,0,1),90));

    //test translation
    compareMatrices(&testTrans,&trans);

}
Ejemplo n.º 2
0
TEST_F(ExoticaVelSolverTest, IFT) //!< Inverse-computation Function Test
{
  //!< For every registered velocity solver that has a test XML: create it, initialise it from
  //!< its XML, ask it to invert a known matrix and check that matrix * inverse is the identity.
  //!< Failures for one solver are recorded with ADD_FAILURE() and the loop moves on to the next.

  //!< Side length used for the weighting matrices and the expected inverse
  //!< (presumably also the size of the stored "InvertibleMatrix" - confirm against storage.xml)
  const int matrix_dim = 30;

  //!< Temporaries
  Eigen::MatrixXd test_matrix;
  tinyxml2::XMLDocument xml_document;
  boost::shared_ptr<tinyxml2::XMLHandle> xml_handle_ptr;

  //!< Load the Matrix for inversion testing
  ASSERT_EQ(tinyxml2::XML_NO_ERROR, xml_document.LoadFile((resource_path_ + std::string("../storage.xml")).c_str()));
  xml_handle_ptr.reset(new tinyxml2::XMLHandle(xml_document.RootElement()));
  ASSERT_NE(nullptr, xml_handle_ptr->FirstChildElement("InvertibleMatrix").ToElement());
  ASSERT_TRUE(exotica::getMatrix(*(xml_handle_ptr->FirstChildElement("InvertibleMatrix").ToElement()), test_matrix));

  //!< Now iterate through all the registered velocity solvers (unsigned index to match size())
  for (std::size_t i = 0; i < registered_types_.size(); ++i)
  {
    std::string xml_path;
    if (!exotica::TestRegistrar::Instance()->findXML(registered_types_[i], xml_path)) //!< No XML registered: this type is not to be tested
    {
      continue;
    }
    if (xml_document.LoadFile((resource_path_ + xml_path).c_str()) != tinyxml2::XML_NO_ERROR) //!< Attempt to load file
    {
      ADD_FAILURE() << " : Could not Load initialiser for " << registered_types_[i] << " (file: "<<resource_path_ + xml_path <<")."; //!< Non-fatal: skip this object but keep testing the others
      continue;//!< Go to next object
    }
    if (!(vel_solv_ptr_ = exotica::VelocitySolverCreator::Instance()->createObject(registered_types_[i], params_))) //!< If we could not create
    {
      ADD_FAILURE() << " : Could not create object of type " << registered_types_[i]; //!< Non-fatal: skip this object but keep testing the others
      continue;//!< Go to next object
    }
    xml_handle_ptr.reset(new tinyxml2::XMLHandle(xml_document.RootElement())); //!< Get handle to root element
    *xml_handle_ptr = xml_handle_ptr->FirstChildElement("VelocitySolver");//!< Locate the child
    if (!vel_solv_ptr_->initBase(*xml_handle_ptr))
    {
      ADD_FAILURE() << " : Could not initialise " << registered_types_[i];
      continue;
    }
    Eigen::MatrixXd temp_inverse;
    if (!vel_solv_ptr_->getInverse(test_matrix, Eigen::MatrixXd::Identity(matrix_dim,matrix_dim)*0.0000001, Eigen::MatrixXd::Identity(matrix_dim,matrix_dim), temp_inverse)) //!< Since we know matrix is perfectly invertible
    {
      ADD_FAILURE() << " : Could not compute inverse for " << registered_types_[i];
      continue;
    }
    if (temp_inverse.rows() == matrix_dim && temp_inverse.cols() == matrix_dim)
    {
      //!< A correct inverse multiplied with the original must give the identity (within tolerance)
      EXPECT_TRUE(compareMatrices(Eigen::MatrixXd::Identity(matrix_dim,matrix_dim), test_matrix*temp_inverse, TOLERANCE_L));
    }
    else
    {
      //!< The returned inverse has the wrong shape, so the product check above is not meaningful
      ADD_FAILURE() << " : Inverse computed for " << registered_types_[i] << " has incorrect dimensions";
    }
  }
}
Ejemplo n.º 3
0
	inline int compareAndSum(int val,const pair<MyMat,MyMat > &p)	{
		return val+compareMatrices(p.first,p.second);
	}
//////////////////////////////////////////////////////////////////////////////
// Main function
//////////////////////////////////////////////////////////////////////////////
// Benchmarks a matrix multiplication C = A * B on the host (matrixMulHost),
// with cblas_sgemm, and with four OpenCL kernels ("matrixMulKernel1".."4"),
// printing timing for each and checking every result against the host result.
//
// argv[1] (optional): 1-based index of the OpenCL device to use; defaults to 1.
// Returns 0 when all results match, 1 as soon as one comparison fails.
int main(int argc, char** argv) {
	// Create a context
	cl::Context context(CL_DEVICE_TYPE_GPU);

	// Get a device of the context
	// deviceNr is 1-based: it is validated against [1, number of devices] and
	// converted to a 0-based index when the device is looked up.
	int deviceNr = argc < 2 ? 1 : atoi(argv[1]);
	std::cout << "Using device " << deviceNr << " / " << context.getInfo<CL_CONTEXT_DEVICES>().size() << std::endl;
	ASSERT (deviceNr > 0);
	ASSERT ((size_t) deviceNr <= context.getInfo<CL_CONTEXT_DEVICES>().size());
	cl::Device device = context.getInfo<CL_CONTEXT_DEVICES>()[deviceNr - 1];
	std::vector<cl::Device> devices;
	devices.push_back(device);
	OpenCL::printDeviceInfo(std::cout, device);

	// Create a command queue
	// Profiling is enabled because the event timings are read back below.
	cl::CommandQueue queue(context, device, CL_QUEUE_PROFILING_ENABLE);

	// Declare some values
	// wgSize is the work-group edge length (work-groups are wgSize x wgSize).
	// countAX_BY is the shared dimension (width of A == height of B),
	// countAY the height of A, countBX the width of B.
	std::size_t wgSize = 16;
	std::size_t countAX_BY = 512;
	std::size_t countAY = 1024;
	std::size_t countBX = 768;

	// The result C has the width of B and the height of A.
	std::size_t countCX = countBX;
	std::size_t countCY = countAY;
	// Element counts and byte sizes of the three matrices.
	std::size_t countA = countAX_BY * countAY;
	std::size_t countB = countBX * countAX_BY;
	std::size_t countC = countCX * countCY;
	std::size_t sizeA = countA * sizeof (float);
	std::size_t sizeB = countB * sizeof (float);
	std::size_t sizeC = countC * sizeof (float);

	// Load the source code
	cl::Program program = OpenCL::loadProgramSource(context, "src/OpenCLExercise4_MatrixMultiplication.cl");
	// Compile the source code. This is similar to program.build(devices) but will print more detailed error messages
	// This will pass the value of wgSize as a preprocessor constant "WG_SIZE" to the OpenCL C compiler
	OpenCL::buildProgram(program, devices, "-DWG_SIZE=" + boost::lexical_cast<std::string>(wgSize));

	// Allocate space for output data from CPU and GPU on the host
	std::vector<float> h_inputA (countA);
	std::vector<float> h_inputB (countB);
	std::vector<float> h_outputCCpu (countC);
	std::vector<float> h_outputCAtlas (countC);
	std::vector<float> h_outputCGpu (countC);

	// Allocate space for input and output data on the device
	// The Image2D objects hold the same inputs for the image-based kernel (impl 4),
	// one float channel per texel.
	cl::Buffer d_inputA (context, CL_MEM_READ_WRITE, sizeA);
	cl::Buffer d_inputB (context, CL_MEM_READ_WRITE, sizeB);
	cl::Buffer d_outputC (context, CL_MEM_READ_WRITE, sizeC);
	cl::Image2D d_inputAImg (context, CL_MEM_READ_ONLY, cl::ImageFormat(CL_R, CL_FLOAT), countAX_BY, countAY);
	cl::Image2D d_inputBImg (context, CL_MEM_READ_ONLY, cl::ImageFormat(CL_R, CL_FLOAT), countBX, countAX_BY);

	// Initialize memory to 0xff (useful for debugging because otherwise GPU memory will contain information from last execution)
	memset(h_inputA.data(), 255, sizeA);
	memset(h_inputB.data(), 255, sizeB);
	memset(h_outputCCpu.data(), 255, sizeC);
	memset(h_outputCAtlas.data(), 255, sizeC);
	memset(h_outputCGpu.data(), 255, sizeC);
	//TODO: GPU
	// Push the 0xff pattern to the device buffers as well (second argument
	// 'true' requests a blocking write).
	queue.enqueueWriteBuffer(d_inputA, true, 0, sizeA, h_inputA.data());
	queue.enqueueWriteBuffer(d_inputB, true, 0, sizeB, h_inputB.data());
	queue.enqueueWriteBuffer(d_outputC, true, 0, sizeC, h_outputCGpu.data());

	//////// Generate input data ////////////////////////////////
	// Use random input data
	// Values are multiples of 0.2 in the range [-10.0, 9.8].
	for (std::size_t i = 0; i < countA; i++)
		h_inputA[i] = (rand() % 100) / 5.0f - 10.0f;
	for (std::size_t i = 0; i < countB; i++)
		h_inputB[i] = (rand() % 100) / 5.0f - 10.0f;
	// Use integer numbers as data
	/*
	for (std::size_t i = 0; i < countA; i++)
		h_inputA[i] = i;
	for (std::size_t i = 0; i < countB; i++)
		h_inputB[i] = (int)i - 5;
	*/

	// Do calculation on the host side
	Core::TimeSpan cpuStart = Core::getCurrentTime();
	matrixMulHost(h_inputA, h_inputB, h_outputCCpu, countAX_BY, countAY, countBX);
	Core::TimeSpan cpuEnd = Core::getCurrentTime();

	// Do calculation on using libatlas
	// Row-major, no transposition: M = countAY, N = countBX, K = countAX_BY,
	// alpha = 1 and beta = 0, i.e. C = A * B.
	Core::TimeSpan atlasStart = Core::getCurrentTime();
	cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, countAY, countBX, countAX_BY, 1.0, h_inputA.data(), countAX_BY, h_inputB.data(), countBX, 0.0, h_outputCAtlas.data(), countCX);
	Core::TimeSpan atlasEnd = Core::getCurrentTime();

	// The Atlas time is passed as the last argument to every printPerformance call
	// (presumably as the reference for a speedup column - confirm in printPerformance).
	Core::TimeSpan cpuTime = cpuEnd - cpuStart;
	Core::TimeSpan atlasTime = atlasEnd - atlasStart;
	printPerformanceHeader();
	printPerformance("CPU", cpuTime, atlasTime);
	printPerformance("Atlas", atlasTime, atlasTime);

	// Abort early if the two host-side implementations already disagree.
	if (!compareMatrices(h_outputCCpu, "CPU", h_outputCAtlas, "Atlas", countCX, countCY))
		return 1;

	// Copy input data to device
	// copy1/copy2 record the upload events so their duration can be reported
	// as part of the copy time of each implementation.
	cl::Event copy1;
	cl::Event copy2;
	queue.enqueueWriteBuffer(d_inputA, true, 0, sizeA, h_inputA.data(), NULL, &copy1);
	queue.enqueueWriteBuffer(d_inputB, true, 0, sizeB, h_inputB.data(), NULL, &copy2);

	// Iterate over all implementations (task 1 - 2)
	for (int impl = 1; impl <= 4; impl++) {
		// Reinitialize output memory to 0xff
		memset(h_outputCGpu.data(), 255, sizeC);
		queue.enqueueWriteBuffer(d_outputC, true, 0, sizeC, h_outputCGpu.data());

		// Create a kernel object
		// The kernel name is "matrixMulKernel" followed by the implementation number.
		std::string kernelName = "matrixMulKernel" + boost::lexical_cast<std::string> (impl);
		cl::Kernel matrixMulKernel(program, kernelName.c_str ());

		// Implementation 4 reads its inputs from images: upload A and B into the
		// image objects. This overwrites copy1/copy2, so the copy time reported
		// for implementation 4 is that of the image uploads.
		if (impl == 4) {
		cl::size_t<3> origin;
		origin[0] = origin[1] = origin[2] = 0;
		cl::size_t<3> region;
		region[0] = countAX_BY;
		region[1] = countAY;
		region[2] = 1;
		queue.enqueueWriteImage(d_inputAImg, true, origin, region, countAX_BY * sizeof (float), 0, h_inputA.data(), NULL, &copy1);
		region[0] = countBX;
		region[1] = countAX_BY;
		queue.enqueueWriteImage(d_inputBImg, true, origin, region, countBX * sizeof (float), 0, h_inputB.data(), NULL, &copy2);
		}

		// Launch kernel on the device
		// Arguments 0 and 1 are the inputs (images for impl 4, buffers otherwise),
		// argument 2 the output buffer, arguments 3-5 the matrix dimensions.
		cl::Event kernelExecution;
		if (impl == 4)
		matrixMulKernel.setArg<cl::Image2D>(0, d_inputAImg);
		else
		matrixMulKernel.setArg<cl::Buffer>(0, d_inputA);
		if (impl == 4)
		matrixMulKernel.setArg<cl::Image2D>(1, d_inputBImg);
		else
		matrixMulKernel.setArg<cl::Buffer>(1, d_inputB);
		matrixMulKernel.setArg<cl::Buffer>(2, d_outputC);
		matrixMulKernel.setArg<cl_uint>(3, countAX_BY);
		matrixMulKernel.setArg<cl_uint>(4, countAY);
		matrixMulKernel.setArg<cl_uint>(5, countBX);
		// Only implementation 3 takes a 7th argument: local memory sized for
		// two wgSize x wgSize float tiles.
		if (impl == 3)
			matrixMulKernel.setArg(6, cl::Local(2 * wgSize * wgSize * sizeof(float)));
		// One work-item per element of C, in work-groups of wgSize x wgSize.
		queue.enqueueNDRangeKernel(matrixMulKernel, cl::NullRange, cl::NDRange(countCX, countCY), cl::NDRange(wgSize, wgSize), NULL, &kernelExecution);

		// Copy output data back to host
		cl::Event copy3;
		queue.enqueueReadBuffer(d_outputC, true, 0, sizeC, h_outputCGpu.data(), NULL, &copy3);

		// Print performance data
		// Copy time = both input uploads plus the result download.
		Core::TimeSpan gpuTime = OpenCL::getElapsedTime(kernelExecution);
		Core::TimeSpan copyTime = OpenCL::getElapsedTime(copy1) + OpenCL::getElapsedTime(copy2) + OpenCL::getElapsedTime(copy3);
		printPerformance(kernelName, gpuTime, copyTime, atlasTime);

		// Check whether results are correct
		if (!compareMatrices(h_outputCCpu, "CPU", h_outputCGpu, "GPU", countCX, countCY))
			return 1;
	}

	std::cout << "Success" << std::endl;

	//dumpMatrix ("A", h_inputA, countAX_BY, countAY);
	//dumpMatrix ("B", h_inputB, countBX, countAX_BY);
	//dumpMatrix ("C", h_outputCCpu, countCX, countCY);

	return 0;
}
Ejemplo n.º 5
0
// Benchmark driver: reads MAX_TEST_CASES matrix pairs, multiplies them with the
// sequential, OpenMP and pthreads implementations, verifies that the parallel
// results equal the sequential ones and writes the timings to PATH_TO_TIMES.
//
// Command line: [-t <NUMBER_OF_THREADS>] (at least 2); without it the value
// already stored in the variable 'threads' (declared outside this function) is used.
// Exit status: 0 on success, -1 on a usage error, 1 on any runtime failure
// (parse error, output error, result mismatch, timing file not writable).
int main(int argc, char * argv[])
{

  //checking for desired number of threads
  if (argc != 1)
  {
    // argc is checked first so that argv[2] is never read when it is missing
    // (e.g. when only "-t" is given).
    if (argc < 3 || strcmp(argv[1],"-t") || (threads = atoi(argv[2])) < 2)
    {
      printf("Usage: %s [-t <NUMBER_OF_THREADS>]\n Default Number of Threads: 2\n"
        , argv[0]);
      exit(-1);
    }
    printf("Pthreads and OpenMP Calculations will be done with %i threads.\n", threads);
  }
  struct timeval start, end;
  matrix firsts[MAX_TEST_CASES];
  matrix seconds[MAX_TEST_CASES];
  matrix results[MAX_TEST_CASES];


  //parses all Testmatrices and stores them in two arrays
  printf("Reading matrices from InputFile...\n");
  for (int i = 0; i < MAX_TEST_CASES; ++i)
  {
    // a return value of 0 is treated as a parse failure
    int parsed = parseMatrices(PATH_TO_TESTS, i, &firsts[i], &seconds[i]);
    if (parsed == 0)
    {
      printf("Parsing went wrong\n");
      exit(1);
    }
  }

  FILE* performance;

  //first, sequential computation
  printf("Sequential calculation...\n");
  gettimeofday(&start, NULL);
  for (int i = 0; i < MAX_TEST_CASES; ++i)
  {
    sequential(&firsts[i], &seconds[i], &results[i]);
  }
  gettimeofday(&end, NULL);

  //prints the results of sequential computation to file
  printf("Printing results...\n");
  for (int i = 0; i < MAX_TEST_CASES; ++i)
  {
    if(!printMatrix(&results[i], PATH_TO_RESULTS))
    {
      printf("Theres a problem with the output stream.\n");
      exit(1);
    }
  }
  printf("Sequential calculation complete. Check %s for results\n"
    , PATH_TO_RESULTS);

  //the timing file is written to below, so it has to be open before continuing
  performance = fopen(PATH_TO_TIMES, "w");
  if (performance == NULL)
  {
    printf("Could not open %s for writing.\n", PATH_TO_TIMES);
    exit(1);
  }
  fprintf(performance,
    "Sequential Implementation took %.3lf seconds for all testcases.\n"
    , getDifference(start, end));

  //openMP Implementation is up next
  matrix ompresults[MAX_TEST_CASES];
  printf("OpenMP calculation...\n");
  gettimeofday(&start, NULL);
  for (int i = 0; i < MAX_TEST_CASES; ++i)
  {
    openMP(&(firsts[i]), &(seconds[i]), &(ompresults[i]), threads);
  }
  gettimeofday(&end, NULL);

  //the sequential results serve as the reference for the parallel ones
  for (int i = 0; i < MAX_TEST_CASES; ++i)
  {
      if(!compareMatrices(&results[i], &ompresults[i]))
      {
        printf("OMP-Implementation has faults!");
        exit(1);
      }
  }

  fprintf(performance,
    "OpenMP-Implementation took %.3lf seconds for all testcases.\n"
    , getDifference(start, end));

  //lastly, pthreads implementation
  matrix ptresults[MAX_TEST_CASES];
  printf("Posix-Threads calculation...\n");

  gettimeofday(&start, NULL);
  for (int i = 0; i < MAX_TEST_CASES; ++i)
  {
    multithreaded(&firsts[i], &seconds[i], &ptresults[i], threads);
  }
  gettimeofday(&end, NULL);

  for (int i = 0; i < MAX_TEST_CASES; ++i)
  {
      if(!compareMatrices(&results[i], &ptresults[i]))
      {
        printf("Pthread-Implementation has faults!");
        exit(1);
      }
  }

  fprintf(performance,
    "Pthreads-Implementation took %.3lf seconds for all testcases.\n"
    , getDifference(start, end));

  //cleaning up
  printf("Cleaning up...\n");
  fclose(performance);
  for (int i = 0; i < MAX_TEST_CASES; ++i)
  {
      free(firsts[i].values);
      free(seconds[i].values);
      free(results[i].values);
      free(ompresults[i].values);
      free(ptresults[i].values);
  }

  printf("All done, see %s for a summarization of each implementation's performance.\n"
    , PATH_TO_TIMES);
  exit(0);

}