/**
 * Driver for the HW4 sort assignment.
 *
 * Usage: ./HW4 input_file template_file [output_filename]
 *
 * Steps performed:
 *   1. preProcess() loads the inputs and hands back the four device buffers.
 *   2. your_sort() is timed with GpuTimer.
 *   3. postProcess() writes the result to output_file.
 *   4. The host reference_calculation() is run and its output is compared
 *      element-for-element against the device output.
 *
 * Returns 0 on success; exits with status 1 on bad arguments or if the timing
 * line cannot be printed.
 */
int main(int argc, char **argv) {
  unsigned int *inputVals;
  unsigned int *inputPos;
  unsigned int *outputVals;
  unsigned int *outputPos;

  size_t numElems;

  std::string input_file;
  std::string template_file;
  std::string output_file;
  // NOTE(review): reference_file is never assigned in this function, so
  // compareImages() below receives an empty file name - confirm intended.
  std::string reference_file;
  double perPixelError = 0.0;
  double globalError = 0.0;
  bool useEpsCheck = false;

  switch (argc) {
    case 3:
      input_file = std::string(argv[1]);
      template_file = std::string(argv[2]);
      output_file = "HW4_output.png";
      break;
    case 4:
      input_file = std::string(argv[1]);
      template_file = std::string(argv[2]);
      output_file = std::string(argv[3]);
      break;
    default:
      std::cerr << "Usage: ./HW4 input_file template_file [output_filename]" << std::endl;
      exit(1);
  }

  // Load the image and give us our input and output pointers.
  preProcess(&inputVals, &inputPos, &outputVals, &outputPos, numElems,
             input_file, template_file);

  GpuTimer timer;
  timer.Start();

  // Call the students' code.
  your_sort(inputVals, inputPos, outputVals, outputPos, numElems);

  timer.Stop();
  // Check the synchronize result itself: faults raised while the student's
  // kernels execute are reported here, not at launch time.
  checkCudaErrors(cudaDeviceSynchronize());
  checkCudaErrors(cudaGetLastError());

  printf("\n");
  int err = printf("Your code ran in: %f msecs.\n", timer.Elapsed());

  if (err < 0) {
    // Couldn't print! Probably the student closed stdout - bad news.
    std::cerr << "Couldn't print timing information! STDOUT Closed!" << std::endl;
    exit(1);
  }

  // Check results and output the red-eye corrected image.
  postProcess(outputVals, outputPos, numElems, output_file);

  // Check code moved from HW4.cu
  /****************************************************************************
   * You can use the code below to help with debugging, but make sure to
   * comment it out again before submitting your assignment for grading,
   * otherwise this code will take too much time and make it seem like your
   * GPU implementation isn't fast enough.
   *
   * This code MUST RUN BEFORE YOUR CODE in case you accidentally change
   * the input values when implementing your radix sort.
   * NOTE(review): in this file the check actually runs AFTER your_sort(), so
   * the reference is computed from whatever the input buffers contain at that
   * point - confirm this ordering is acceptable.
   *
   * This code performs the reference radix sort on the host and compares your
   * sorted values to the reference.
   *
   * Thrust containers are used for copying memory from the GPU.
   ****************************************************************************/
  thrust::device_ptr<unsigned int> d_inputVals(inputVals);
  thrust::device_ptr<unsigned int> d_inputPos(inputPos);

  // Copy the device inputs back to the host for the reference computation.
  thrust::host_vector<unsigned int> h_inputVals(d_inputVals, d_inputVals + numElems);
  thrust::host_vector<unsigned int> h_inputPos(d_inputPos, d_inputPos + numElems);

  thrust::host_vector<unsigned int> h_outputVals(numElems);
  thrust::host_vector<unsigned int> h_outputPos(numElems);

  reference_calculation(&h_inputVals[0], &h_inputPos[0],
                        &h_outputVals[0], &h_outputPos[0],
                        numElems);

  //postProcess(&h_outputVals[0], &h_outputPos[0], numElems, reference_file);

  compareImages(reference_file, output_file, useEpsCheck, perPixelError, globalError);

  thrust::device_ptr<unsigned int> d_outputVals(outputVals);
  thrust::device_ptr<unsigned int> d_outputPos(outputPos);

  // Copy the student's device results back to the host for comparison.
  thrust::host_vector<unsigned int> h_yourOutputVals(d_outputVals, d_outputVals + numElems);
  thrust::host_vector<unsigned int> h_yourOutputPos(d_outputPos, d_outputPos + numElems);

  checkResultsExact(&h_outputVals[0], &h_yourOutputVals[0], numElems);
  checkResultsExact(&h_outputPos[0], &h_yourOutputPos[0], numElems);

  checkCudaErrors(cudaFree(inputVals));
  checkCudaErrors(cudaFree(inputPos));
  checkCudaErrors(cudaFree(outputVals));
  checkCudaErrors(cudaFree(outputPos));

  return 0;
}
/**
 * Host driver: fills a random complex input and a random coefficient table,
 * runs the CPU reference_calculation() and gpu_code() on the same data, and
 * (when debug is on) prints the error value returned by reference_code().
 *
 * Optional positional arguments, each overriding the default shown:
 *   argv[1] nChannels   (1024)
 *   argv[2] NUM_THREADS (512)
 *   argv[3] nStreams    (2)
 *   argv[4] nTaps       (8)
 *   argv[5] seg_blocks  (5000)
 *   argv[6] count used as data_size = (count + nTaps - 1) * nChannels
 *
 * Returns 0 on success, 1 on invalid sizes or host allocation failure.
 */
int main(int argc, char **argv){
  int nTaps = 8;
  int nChannels = 1024;
  int nStreams = 2;
  int NUM_THREADS = 512;
  int seg_blocks = 5000;
  unsigned int data_size = 10000 + nTaps - 1;
  unsigned int nBlocks = 0;
  float error = 1.1f;
  bool debug = true;

  if (debug) printf("\t\tWelcome\n");

  Complex *h_signal, *h_spectra_pinned, *h_spectra_ref, *h_data_pinned;
  float *h_coeff;

  // atof is kept (rather than atoi) so inputs such as "1e3" parse as before.
  if (argc >= 2) nChannels   = atof(argv[1]);
  if (argc >= 3) NUM_THREADS = atof(argv[2]);
  if (argc >= 4) nStreams    = atof(argv[3]);
  if (argc >= 5) nTaps       = atof(argv[4]);
  if (argc >= 6) seg_blocks  = atof(argv[5]);
  if (argc >= 7) data_size   = (atof(argv[6]) + nTaps - 1) * nChannels;

  // nChannels is used as a divisor and both values size the coefficient
  // table, so reject non-positive values instead of dividing by zero.
  if (nChannels <= 0 || nTaps <= 0) {
    fprintf(stderr, "nChannels and nTaps must be positive\n");
    return 1;
  }

  nBlocks = data_size / nChannels;

  const size_t signal_bytes = (size_t)data_size * sizeof(Complex);
  const size_t coeff_count  = (size_t)nTaps * (size_t)nChannels;

  if (debug) printf("\nHost memory allocation...\t");
  // Buffers handed to gpu_code() are page-locked; the rest use plain malloc.
  checkCudaErrors(cudaMallocHost((void**)&h_spectra_pinned, signal_bytes));
  checkCudaErrors(cudaMallocHost((void**)&h_data_pinned, signal_bytes));
  h_signal      = (Complex *)malloc(signal_bytes);
  h_spectra_ref = (Complex *)malloc(signal_bytes);
  h_coeff       = (float *)malloc(coeff_count * sizeof(float));
  if (h_signal == NULL || h_spectra_ref == NULL || h_coeff == NULL) {
    fprintf(stderr, "Host memory allocation failed\n");
    free(h_signal);
    free(h_spectra_ref);
    free(h_coeff);
    checkCudaErrors(cudaFreeHost(h_spectra_pinned));
    checkCudaErrors(cudaFreeHost(h_data_pinned));
    return 1;
  }
  if (debug) printf("done.");

  if (debug) printf("\nHost memory memset...\t\t");
  memset(h_spectra_pinned, 0, signal_bytes);
  memset(h_spectra_ref, 0, signal_bytes);
  if (debug) printf("done.");

  if (debug) printf("\nLoad window coefficients...\t");
  //Load_window_data(h_coeff);
  // These rand() calls happen before srand() below, so the coefficients come
  // from the default seed while the signal is time-seeded.
  for (size_t i = 0; i < coeff_count; i++)
    h_coeff[i] = rand() / (float)RAND_MAX;
  if (debug) printf("done.");

  if (debug) printf("\nRandom data set...\t\t");
  srand(time(NULL));
  for (unsigned int i = 0; i < data_size; i++) {
    h_signal[i].x = rand() / (float)RAND_MAX;
    h_signal[i].y = rand() / (float)RAND_MAX;
  }
  // Mirror the input into the pinned buffer passed to gpu_code().
  for (unsigned int i = 0; i < data_size; i++) {
    h_data_pinned[i] = h_signal[i];
  }
  if (debug) printf("done.");

  if (debug) printf("\nReference calculation...\t");
  reference_calculation(h_signal, h_spectra_ref, h_coeff, nChannels, nBlocks, nTaps);
  if (debug) printf("done.\n");

  // (A disabled debug printf of selected h_spectra_ref entries used to live here.)

  gpu_code(h_data_pinned, h_spectra_pinned, h_coeff, nChannels, nBlocks,
           data_size, NUM_THREADS, nTaps, nStreams, seg_blocks);

  if (debug) {
    error = reference_code(h_spectra_ref, h_spectra_pinned, nChannels, nTaps, nBlocks);
    printf("error = %lf\n", error);
  }

  checkCudaErrors(cudaFreeHost(h_spectra_pinned));
  checkCudaErrors(cudaFreeHost(h_data_pinned));
  // These three came from malloc(), so they must be released with free();
  // delete[] on malloc'd memory is undefined behaviour.
  free(h_signal);
  free(h_spectra_ref);
  free(h_coeff);

  return 0;
}