// Performance test for cv::gpu::cartToPolar on single-channel 32-bit float
// inputs. Parameterized over the GPU device and the image size.
GPU_PERF_TEST(CartToPolar, cv::gpu::DeviceInfo, cv::Size)
{
    // Test parameters: (0) the device to run on, (1) the matrix dimensions.
    const cv::gpu::DeviceInfo device = GET_PARAM(0);
    const cv::Size imageSize = GET_PARAM(1);

    // Select the requested device before any GPU memory is touched.
    cv::gpu::setDevice(device.deviceID());

    // Host-side Cartesian components; fill() is a helper defined elsewhere
    // (presumably writes values in [-100, 100] -- confirm against its definition).
    cv::Mat hostX(imageSize, CV_32FC1);
    cv::Mat hostY(imageSize, CV_32FC1);

    fill(hostX, -100.0, 100.0);
    fill(hostY, -100.0, 100.0);

    // Device-side inputs are constructed from the host matrices.
    cv::gpu::GpuMat devX(hostX);
    cv::gpu::GpuMat devY(hostY);

    // Outputs start empty and are passed to cartToPolar as destinations.
    cv::gpu::GpuMat devMagnitude;
    cv::gpu::GpuMat devAngle;

    // The body of TEST_CYCLE() is the code under measurement.
    TEST_CYCLE()
    {
        cv::gpu::cartToPolar(devX, devY, devMagnitude, devAngle);
    }
}
int main(int argc, char** argv) { // timer struct timeval st, et; float gputime = 0.0, cputime = 0.0; // read Sparse Matrix from file or generate if (argc < 2 || argc > 4) { printf("Correct Usage: <executable> <input matrix file>\n"); exit(-1); } // init the network agile::NetworkEnvironment environment(argc, argv); // allocate a GPU typedef agile::GPUCommunicator<unsigned, float, float> communicator_type; communicator_type com; com.allocateGPU(); char spmfileName[256]; strcpy(spmfileName, argv[1]); if (!fileIsReadable(spmfileName)) { printf("Non-existent input matrix file\n"); exit(-1); } unsigned m_num_rows, m_num_cols; std::vector<unsigned> m_row_nnz; std::vector<unsigned> m_column_index; std::vector<float> m_data; // read in matrix from matrix-market file readSparseMatrix(spmfileName, 0, m_num_rows, m_num_cols, m_row_nnz, m_column_index, m_data); std::cout << m_num_rows << "\t" << m_num_cols << "\t"; /* PRINT_VEC("m_row_nnz", m_row_nnz); PRINT_VEC("m_column_index", m_column_index); PRINT_VEC("m_data", m_data); */ // init gpu matrix agile::GPUCSMatrix<float> A(m_row_nnz, m_column_index, m_data); // init random vector std::vector<float> x_host(m_num_cols, 0); srand(time(NULL)); for (unsigned i=0; i<m_num_cols; ++i) x_host[i] = rand() / (float)RAND_MAX; //PRINT_VEC("RANDOM X VECTOR", x_host); // init gpu vector agile::GPUVector<float> x(m_num_cols); x.assignFromHost(x_host.begin(), x_host.end()); // init result gpu vector: y agile::GPUVector<float> y(m_num_rows); // start time gettimeofday(&st, NULL); for (unsigned t=0; t<NUM_ITER; ++t) { // gpu multiplication agile::multiply(A, x, y); cudaThreadSynchronize(); } // stop time gettimeofday(&et, NULL); gputime = ((et.tv_sec-st.tv_sec)*1000.0 + (et.tv_usec - st.tv_usec)/1000.0)/NUM_ITER; // transfer GPU multiplication result back to cpu std::vector<float> y_host; y.copyToHost(y_host); //----------------- CPU computation from ibm demo --------------------------- SpMatrix m; readSparseMatrix(&m, spmfileName, 0); 
unsigned int numNonZeroElements = m.numNZEntries; unsigned int memSize_row = sizeof(float) * m_num_rows; // allocate host memory float* h_x = (float*) malloc(memSize_row); #if PADDED_CSR float *h_val; unsigned int *h_indices, *h_rowIndices; genPaddedCSRFormat(&m, &h_val, &h_rowIndices, &h_indices); #else float* h_val = (float*) malloc(sizeof(float)*numNonZeroElements); unsigned int* h_indices = (unsigned int*) malloc(sizeof(int)*numNonZeroElements); unsigned int* h_rowIndices = (unsigned int*) malloc(sizeof(int)*(m_num_rows+1)); genCSRFormat(&m, h_val, h_rowIndices, h_indices); #endif // CPU REFERENCE float* reference = (float*) malloc(memSize_row); #if EXEC_CPU #if TIMER gettimeofday(&st, NULL); #endif // compute reference solution #if BCSR float *val; unsigned int *rowIndices, *indices; unsigned int numblocks; genBCSRFormat(&m, &val, &rowIndices, &indices, &numblocks, BCSR_r, BCSR_c); computeSpMV_BCSR(reference, val, rowIndices, indices, &(x_host[0]), m_num_rows, m_num_cols, BCSR_r, BCSR_c); #else computeSpMV(reference, h_val, h_rowIndices, h_indices, &(x_host[0]), m_num_rows); #endif #if TIMER gettimeofday(&et, NULL); cputime = (et.tv_sec-st.tv_sec)*1000.0 + (et.tv_usec - st.tv_usec)/1000.0; #endif #endif float flops= ((numNonZeroElements * 2) / (gputime*1000000)); //printf("GPU (ms) \tCPU (ms) \tGFLOPS\n"); printf("%f\t%f\t%f\t", gputime, cputime, flops); #if VERIFY // check result float error_norm, ref_norm, diff; error_norm = 0; ref_norm = 0; for (unsigned i = 0; i < m_num_rows; ++i) { diff = reference[i] - y_host[i]; error_norm += diff * diff; ref_norm += reference[i] * reference[i]; } error_norm = (float)sqrt((double)error_norm); ref_norm = (float)sqrt((double)ref_norm); if (fabs(ref_norm) < 1e-7) printf ("Test FAILED"); else printf( "Test %s", ((error_norm / ref_norm) < 1e-6f) ? "PASSED" : "FAILED"); #endif free(reference); free(h_x); #if !PADDED_CSR free(h_val); free(h_indices); free(h_rowIndices); #endif return 0; }