void RunTest(ResultDatabase &resultDB, const int npasses, const int verbose, const int noPB, const float repeatF, ProgressBar &pb, const char* precision, const int micdev) { char sizeStr[128]; static __declspec(target(mic)) T *hostMem; int realRepeats = (int)round(repeatF*20); if (realRepeats < 2) realRepeats = 2; // Allocate host memory int halfNumFloats = 1024*1024; int numFloats = 2*halfNumFloats; hostMem = (T*)_mm_malloc(sizeof(T)*numFloats,64); sprintf (sizeStr, "Size:%07d", numFloats); float t = 0.0f; double TH; double flopCount; double gflop; for (int pass=0 ; pass<npasses ; ++pass) { ////////// Add1 ////////// InitData<T>(hostMem,numFloats); #pragma offload target(mic:micdev) in(hostMem:length(numFloats) free_if(0)) {} TH = curr_second(); #pragma offload target(mic:micdev) in(numFloats,realRepeats) nocopy(hostMem) { Add1_MIC<T>(numFloats,hostMem, realRepeats, 10.0); } t = curr_second()-TH; flopCount = (double)numFloats * realRepeats * omp_get_num_threads(); gflop = flopCount / (double)(t*1e9); resultDB.AddResult(string("Add1")+precision, sizeStr, "GFLOPS", gflop); #pragma offload target(mic:micdev) out(hostMem:length(numFloats) alloc_if(0)) {} CheckResults<T>(hostMem,numFloats); pb.addItersDone(); if (!verbose && !noPB)pb.Show(stdout); ////////// Add2 ////////// InitData<T>(hostMem,numFloats); #pragma offload target(mic:micdev) in(hostMem:length(numFloats) free_if(0)) {} TH = curr_second(); #pragma offload target(mic:micdev) in(numFloats,realRepeats) nocopy(hostMem) { Add2_MIC<T>(numFloats,hostMem, realRepeats, 10.0); } t = curr_second()-TH; flopCount = (double)numFloats * realRepeats * 120 * 2; gflop = flopCount / (double)(t*1e9); resultDB.AddResult(string("Add2")+precision, sizeStr, "GFLOPS", gflop); #pragma offload target(mic:micdev) out(hostMem:length(numFloats) alloc_if(0)) {} CheckResults<T>(hostMem,numFloats); pb.addItersDone(); if (!verbose && !noPB)pb.Show(stdout); ////////// Add4 ////////// InitData<T>(hostMem,numFloats); #pragma offload target(mic:micdev) in(hostMem:length(numFloats) free_if(0)) {} TH = curr_second(); #pragma offload target(mic:micdev) in(numFloats,realRepeats) nocopy(hostMem) { Add4_MIC<T>(numFloats,hostMem, realRepeats, 10.0); } t = curr_second()-TH; flopCount = (double)numFloats * realRepeats * 60 * 4; gflop = flopCount / (double)(t*1e9); resultDB.AddResult(string("Add4")+precision, sizeStr, "GFLOPS", gflop); #pragma offload target(mic:micdev) out(hostMem:length(numFloats) alloc_if(0)) {} CheckResults<T>(hostMem,numFloats); pb.addItersDone(); if (!verbose && !noPB)pb.Show(stdout); ////////// Add8 ////////// InitData<T>(hostMem,numFloats); #pragma offload target(mic:micdev) in(hostMem:length(numFloats) free_if(0)) {} TH = curr_second(); #pragma offload target(mic:micdev) in(numFloats,realRepeats) nocopy(hostMem) { Add8_MIC<T>(numFloats,hostMem, realRepeats, 10.0); } t = curr_second()-TH; flopCount = (double)numFloats * realRepeats * 80 * 3; gflop = flopCount / (double)(t*1e9); resultDB.AddResult(string("Add8")+precision, sizeStr, "GFLOPS", gflop); #pragma offload target(mic:micdev) out(hostMem:length(numFloats) alloc_if(0)) {} CheckResults<T>(hostMem,numFloats); pb.addItersDone(); if (!verbose && !noPB)pb.Show(stdout); ////////// Mul1 ////////// InitData<T>(hostMem,numFloats); #pragma offload target(mic:micdev) in(hostMem:length(numFloats) free_if(0)) {} TH = curr_second(); #pragma offload target(mic:micdev) in(numFloats,realRepeats) nocopy(hostMem) { Mul1_MIC<T>(numFloats,hostMem, realRepeats, 1.01); } t = curr_second()-TH; flopCount = (double)numFloats * 2 * realRepeats * 200; gflop = flopCount / (double)(t*1e9); resultDB.AddResult(string("Mul1")+precision, sizeStr, "GFLOPS", gflop); #pragma offload target(mic:micdev) out(hostMem:length(numFloats) alloc_if(0)) {} CheckResults<T>(hostMem,numFloats); pb.addItersDone(); if (!verbose && !noPB)pb.Show(stdout); ////////// Mul2 ////////// InitData<T>(hostMem,numFloats); #pragma offload target(mic:micdev) in(hostMem:length(numFloats) free_if(0)) {} TH = curr_second(); #pragma offload target(mic:micdev) in(numFloats,realRepeats) nocopy(hostMem) { Mul2_MIC<T>(numFloats,hostMem, realRepeats, 1.01); } t = curr_second()-TH; flopCount = (double)numFloats * 2 * realRepeats * 100 * 2; gflop = flopCount / (double)(t*1e9); resultDB.AddResult(string("Mul2")+precision, sizeStr, "GFLOPS", gflop); #pragma offload target(mic:micdev) out(hostMem:length(numFloats) alloc_if(0)) {} CheckResults<T>(hostMem,numFloats); pb.addItersDone(); if (!verbose && !noPB)pb.Show(stdout); ////////// Mul4 ////////// InitData<T>(hostMem,numFloats); #pragma offload target(mic:micdev) in(hostMem:length(numFloats) free_if(0)) {} TH = curr_second(); #pragma offload target(mic:micdev) in(numFloats,realRepeats) nocopy(hostMem) { Mul4_MIC<T>(numFloats,hostMem, realRepeats, 1.01); } t = curr_second()-TH; flopCount = (double)numFloats * 2 * realRepeats * 50 * 4; gflop = flopCount / (double)(t*1e9); resultDB.AddResult(string("Mul4")+precision, sizeStr, "GFLOPS", gflop); #pragma offload target(mic:micdev) out(hostMem:length(numFloats) alloc_if(0)) {} CheckResults<T>(hostMem,numFloats); pb.addItersDone(); if (!verbose && !noPB)pb.Show(stdout); ////////// Mul8 ////////// InitData<T>(hostMem,numFloats); #pragma offload target(mic:micdev) in(hostMem:length(numFloats) free_if(0)) {} TH = curr_second(); #pragma offload target(mic:micdev) in(numFloats,realRepeats) nocopy(hostMem) { Mul8_MIC<T>(numFloats,hostMem, realRepeats, 1.01); } t = curr_second()-TH; flopCount = (double)numFloats * 2 * realRepeats * 25 * 8; gflop = flopCount / (double)(t*1e9); resultDB.AddResult(string("Mul8")+precision, sizeStr, "GFLOPS", gflop); #pragma offload target(mic:micdev) out(hostMem:length(numFloats) alloc_if(0)) {} CheckResults<T>(hostMem,numFloats); pb.addItersDone(); if (!verbose && !noPB)pb.Show(stdout); ////////// MAdd1 ////////// InitData<T>(hostMem,numFloats); #pragma offload target(mic:micdev) in(hostMem:length(numFloats) free_if(0)) {} TH = curr_second(); #pragma offload target(mic:micdev) in(numFloats,realRepeats) nocopy(hostMem) { MAdd1_MIC<T>(numFloats,hostMem, realRepeats, 10.0, 0.9899); } t = curr_second()-TH; flopCount = (double)numFloats * 2 * realRepeats * omp_get_num_threads() * 1; gflop = flopCount / (double)(t*1e9); resultDB.AddResult(string("MAdd1")+precision, sizeStr, "GFLOPS", gflop); #pragma offload target(mic:micdev) out(hostMem:length(numFloats) alloc_if(0)) {} CheckResults<T>(hostMem,numFloats); pb.addItersDone(); if (!verbose && !noPB)pb.Show(stdout); ////////// MAdd2 ////////// InitData<T>(hostMem,numFloats); #pragma offload target(mic:micdev) in(hostMem:length(numFloats) free_if(0)) {} TH = curr_second(); #pragma offload target(mic:micdev) in(numFloats,realRepeats) nocopy(hostMem) { MAdd2_MIC<T>(numFloats,hostMem, realRepeats, 10.0, 0.9899); } t = curr_second()-TH; flopCount = (double)numFloats * 2 * realRepeats * 120 * 2; gflop = flopCount / (double)(t*1e9); resultDB.AddResult(string("MAdd2")+precision, sizeStr, "GFLOPS", gflop); #pragma offload target(mic:micdev) out(hostMem:length(numFloats) alloc_if(0)) {} CheckResults<T>(hostMem,numFloats); pb.addItersDone(); if (!verbose && !noPB)pb.Show(stdout); ////////// MAdd4 ////////// InitData<T>(hostMem,numFloats); #pragma offload target(mic:micdev) in(hostMem:length(numFloats) free_if(0)) {} TH = curr_second(); #pragma offload target(mic:micdev) in(numFloats,realRepeats) nocopy(hostMem) { MAdd4_MIC<T>(numFloats,hostMem, realRepeats, 10.0, 0.9899); } t = curr_second()-TH; flopCount = (double)numFloats * 2 * realRepeats * 60 * 4; gflop = flopCount / (double)(t*1e9); resultDB.AddResult(string("MAdd4")+precision, sizeStr, "GFLOPS", gflop); #pragma offload target(mic:micdev) out(hostMem:length(numFloats) alloc_if(0)) {} CheckResults<T>(hostMem,numFloats); pb.addItersDone(); if (!verbose && !noPB)pb.Show(stdout); ////////// MAdd8 ////////// InitData<T>(hostMem,numFloats); #pragma offload target(mic:micdev) in(hostMem:length(numFloats) free_if(0)) {} TH = curr_second(); #pragma offload target(mic:micdev) in(numFloats,realRepeats) nocopy(hostMem) { MAdd8_MIC<T>(numFloats,hostMem, realRepeats, 10.0, 0.9899); } t = curr_second()-TH; flopCount = (double)numFloats * 2 * realRepeats * 30 * 8; gflop = flopCount / (double)(t*1e9); resultDB.AddResult(string("MAdd8")+precision, sizeStr, "GFLOPS", gflop); #pragma offload target(mic:micdev) out(hostMem:length(numFloats) alloc_if(0)) {} CheckResults<T>(hostMem,numFloats); pb.addItersDone(); if (!verbose && !noPB)pb.Show(stdout); ////////// MulMAdd1 ////////// InitData<T>(hostMem,numFloats); #pragma offload target(mic:micdev) in(hostMem:length(numFloats) free_if(0)) {} TH = curr_second(); #pragma offload target(mic:micdev) in(numFloats,realRepeats) nocopy(hostMem) { MulMAdd1_MIC<T>(numFloats,hostMem, realRepeats, 3.75, 0.355); } t = curr_second()-TH; flopCount = (double)numFloats * 3 * realRepeats * 160 * 1; gflop = flopCount / (double)(t*1e9); resultDB.AddResult(string("MulMAdd1")+precision, sizeStr, "GFLOPS", gflop); #pragma offload target(mic:micdev) out(hostMem:length(numFloats) alloc_if(0)) {} CheckResults<T>(hostMem,numFloats); pb.addItersDone(); if (!verbose && !noPB)pb.Show(stdout); ////////// MulMAdd2 ////////// InitData<T>(hostMem,numFloats); #pragma offload target(mic:micdev) in(hostMem:length(numFloats) free_if(0)) {} TH = curr_second(); #pragma offload target(mic:micdev) in(numFloats,realRepeats) nocopy(hostMem) { MulMAdd2_MIC<T>(numFloats,hostMem, realRepeats, 3.75, 0.355); } t = curr_second()-TH; flopCount = (double)numFloats * 3 * realRepeats * 80 * 2; gflop = flopCount / (double)(t*1e9); resultDB.AddResult(string("MulMAdd2")+precision, sizeStr, "GFLOPS", gflop); #pragma offload target(mic:micdev) out(hostMem:length(numFloats) alloc_if(0)) {} CheckResults<T>(hostMem,numFloats); pb.addItersDone(); if (!verbose && !noPB)pb.Show(stdout); ////////// MulMAdd4 ////////// InitData<T>(hostMem,numFloats); #pragma offload target(mic:micdev) in(hostMem:length(numFloats) free_if(0)) {} TH = curr_second(); #pragma offload target(mic:micdev) in(numFloats,realRepeats) nocopy(hostMem) { MulMAdd4_MIC<T>(numFloats,hostMem, realRepeats, 3.75, 0.355); } t = curr_second()-TH; flopCount = (double)numFloats * 3 * realRepeats * 40 * 4; gflop = flopCount / (double)(t*1e9); resultDB.AddResult(string("MulMAdd4")+precision, sizeStr, "GFLOPS", gflop); #pragma offload target(mic:micdev) out(hostMem:length(numFloats) alloc_if(0)) {} CheckResults<T>(hostMem,numFloats); pb.addItersDone(); if (!verbose && !noPB)pb.Show(stdout); ////////// MulMAdd8 ////////// InitData<T>(hostMem,numFloats); #pragma offload target(mic:micdev) in(hostMem:length(numFloats) free_if(0)) {} TH = curr_second(); #pragma offload target(mic:micdev) in(numFloats,realRepeats) nocopy(hostMem) { MulMAdd8_MIC<T>(numFloats,hostMem, realRepeats, 3.75, 0.355); } t = curr_second()-TH; flopCount = (double)numFloats * 3 * realRepeats * 20 * 8; gflop = flopCount / (double)(t*1e9); resultDB.AddResult(string("MulMAdd8")+precision, sizeStr, "GFLOPS", gflop); #pragma offload target(mic:micdev) out(hostMem:length(numFloats) alloc_if(0)) {} CheckResults<T>(hostMem,numFloats); pb.addItersDone(); if (!verbose && !noPB)pb.Show(stdout); } _mm_free(hostMem); }
template <class T> void RunTest(cl_device_id id, cl_context ctx, cl_command_queue queue, ResultDatabase &resultDB, int npasses, int verbose, int quiet, float repeatF, size_t localWorkSize, ProgressBar &pb, const char* typeName, const char* precision, const char* pragmaText) { int err; cl_mem mem1; char sizeStr[128]; T *hostMem, *hostMem2; int aIdx = 0; while ((aTests!=0) && (aTests[aIdx].name!=0)) { ostringstream oss; struct _benchmark_type temp = aTests[aIdx]; // Calculate adjusted repeat factor int tentativeRepeats = (int)round(repeatF*temp.numRepeats); if (tentativeRepeats < 2) { tentativeRepeats = 2; double realRepeatF = ((double)tentativeRepeats) / temp.numRepeats; if (realRepeatF>8.0*repeatF) // do not cut the number of unrolls // by more than a factor of 8 realRepeatF = 8.0*repeatF; temp.numUnrolls = (int)round(repeatF*temp.numUnrolls/realRepeatF); } temp.numRepeats = tentativeRepeats; // Generate kernel source code generateKernel(oss, temp, typeName, pragmaText); std::string kernelCode(oss.str()); // If in verbose mode, print the kernel if (verbose) { cout << "Code for kernel " << temp.name << ":\n" + kernelCode << endl; } // Alloc host memory int halfNumFloatsMax = temp.halfBufSizeMax*1024; int numFloatsMax = 2*halfNumFloatsMax; hostMem = new T[numFloatsMax]; hostMem2 = new T[numFloatsMax]; // Allocate device memory mem1 = clCreateBuffer(ctx, CL_MEM_READ_WRITE, sizeof(T)*numFloatsMax, NULL, &err); CL_CHECK_ERROR(err); // Issue a copy to force device allocation err = clEnqueueWriteBuffer(queue, mem1, true, 0, numFloatsMax*sizeof(T), hostMem, 0, NULL, NULL); CL_CHECK_ERROR(err); // Create kernel program object const char* progSource[] = {kernelCode.c_str()}; cl_program prog = clCreateProgramWithSource(ctx, 1, progSource, NULL, &err); CL_CHECK_ERROR(err); // Compile the program err = clBuildProgram(prog, 1, &id, opts, NULL, NULL); CL_CHECK_ERROR(err); if (err != 0) { char log[5000]; size_t retsize = 0; err = clGetProgramBuildInfo(prog, id, CL_PROGRAM_BUILD_LOG, 5000*sizeof(char), log, &retsize); CL_CHECK_ERROR(err); cout << "Build error." << endl; cout << "Retsize: " << retsize << endl; cout << "Log: " << log << endl; return; } // Check if we have to dump the PTX (NVIDIA only) // Disabled by default // Set environment variable DUMP_PTX to enable char* dumpPtx = getenv("DUMP_PTX"); if (dumpPtx && !strcmp(dumpPtx, "1")) { // must dump the PTX dumpPTXCode(ctx, prog, temp.name); } // Extract out kernel cl_kernel kernel_madd = clCreateKernel(prog, temp.name, &err); CL_CHECK_ERROR(err); err = clSetKernelArg (kernel_madd, 0, sizeof(cl_mem), (void*)&mem1); CL_CHECK_ERROR (err); err = clSetKernelArg (kernel_madd, 1, sizeof(cl_int), (void*)&temp.numRepeats); CL_CHECK_ERROR (err); if (verbose) { cout << "Running kernel " << temp.name << endl; } for (int halfNumFloats=temp.halfBufSizeMin*1024 ; halfNumFloats<=temp.halfBufSizeMax*1024 ; halfNumFloats*=temp.halfBufSizeStride) { // Set up input memory, first half = second half int numFloats = 2*halfNumFloats; for (int j=0; j<halfNumFloats; ++j) { hostMem[j] = hostMem[numFloats-j-1] = (T)(drand48()*5.0); } size_t globalWorkSize = numFloats; for (int pas=0 ; pas<npasses ; ++pas) { err = clEnqueueWriteBuffer (queue, mem1, true, 0, numFloats*sizeof(T), hostMem, 0, NULL, NULL); CL_CHECK_ERROR(err); Event evKernel(temp.name); err = clEnqueueNDRangeKernel(queue, kernel_madd, 1, NULL, &globalWorkSize, &localWorkSize, 0, NULL, &evKernel.CLEvent()); CL_CHECK_ERROR(err); err = clWaitForEvents(1, &evKernel.CLEvent()); CL_CHECK_ERROR(err); evKernel.FillTimingInfo(); double flopCount = (double)numFloats * temp.flopCount * temp.numRepeats * temp.numUnrolls * temp.numStreams; double gflop = flopCount / (double)(evKernel.SubmitEndRuntime()); sprintf (sizeStr, "Size:%07d", numFloats); resultDB.AddResult(string(temp.name)+precision, sizeStr, "GFLOPS", gflop); // Zero out the test host memory for (int j=0 ; j<numFloats ; ++j) { hostMem2[j] = 0.0; } // Read the result device memory back to the host err = clEnqueueReadBuffer(queue, mem1, true, 0, numFloats*sizeof(T), hostMem2, 0, NULL, NULL); CL_CHECK_ERROR(err); // Check the result -- At a minimum the first half of memory // should match the second half exactly for (int j=0 ; j<halfNumFloats ; ++j) { if (hostMem2[j] != hostMem2[numFloats-j-1]) { cout << "Error; hostMem2[" << j << "]=" << hostMem2[j] << " is different from its twin element hostMem2[" << (numFloats-j-1) << "]=" << hostMem2[numFloats-j-1] <<"; stopping check\n"; break; } } // update progress bar pb.addItersDone(); if (!verbose && !quiet) pb.Show(stdout); } } err = clReleaseKernel (kernel_madd); CL_CHECK_ERROR(err); err = clReleaseProgram (prog); CL_CHECK_ERROR(err); err = clReleaseMemObject(mem1); CL_CHECK_ERROR(err); aIdx += 1; delete[] hostMem; delete[] hostMem2; } // Now, test hand-tuned custom kernels // 2D - width and height of input const int w = 2048, h = 2048; const int bytes = w * h * sizeof(T); // Allocate some device memory mem1 = clCreateBuffer(ctx, CL_MEM_READ_WRITE, bytes, NULL, &err); CL_CHECK_ERROR(err); // Get a couple non-zero random numbers float val1 = 0, val2 = 0; while (val1==0 || val2==0) { val1 = drand48(); val2 = drand48(); } // For each custom kernel for (int kCounter = 0; kCounter < 2; kCounter++) { // Calculate adjusted repeat factor int tentativeRepeats = (int)round(repeatF*5); int nUnrolls = 100; if (tentativeRepeats < 2) { tentativeRepeats = 2; double realRepeatF = ((double)tentativeRepeats) / 5; if (realRepeatF>8.0*repeatF) // do not cut the number of unrolls // by more than a factor of 8 realRepeatF = 8.0*repeatF; nUnrolls = (int)round(repeatF*100/realRepeatF); } // Double precision not currently supported string kSource = generateUKernel(kCounter, false, tentativeRepeats, nUnrolls, typeName, pragmaText); const char* progSource[] = {kSource.c_str()}; cl_program prog = clCreateProgramWithSource(ctx, 1, progSource, NULL, &err); CL_CHECK_ERROR(err); // Compile kernel err = clBuildProgram(prog, 1, &id, opts, NULL, NULL); CL_CHECK_ERROR(err); // Extract out kernel cl_kernel kernel_madd = clCreateKernel(prog, "peak", &err); // Calculate kernel launch parameters //size_t localWorkSize = maxGroupSize<128?maxGroupSize:128; size_t globalWorkSize = w * h; // Set the arguments err = clSetKernelArg(kernel_madd, 0, sizeof(cl_mem), (void*)&mem1); CL_CHECK_ERROR(err); err = clSetKernelArg(kernel_madd, 1, sizeof(T), (void*)&val1); CL_CHECK_ERROR(err); err = clSetKernelArg(kernel_madd, 2, sizeof(T), (void*)&val2); CL_CHECK_ERROR(err); // Event object for timing Event evKernel_madd("madd"); for (int passCounter=0; passCounter < npasses; passCounter++) { err = clEnqueueNDRangeKernel(queue, kernel_madd, 1, NULL, &globalWorkSize, &localWorkSize, 0, NULL, &evKernel_madd.CLEvent()); CL_CHECK_ERROR(err); // Wait for the kernel to finish err = clWaitForEvents(1, &evKernel_madd.CLEvent()); CL_CHECK_ERROR(err); evKernel_madd.FillTimingInfo(); // Calculate result and add to DB char atts[1024]; double nflopsPerItem = getUFlopCount(kCounter, false, tentativeRepeats, nUnrolls); sprintf(atts, "Size:%d", w*h); double gflops = (double) (nflopsPerItem*w*h) / (double) evKernel_madd.SubmitEndRuntime(); if (kCounter) { resultDB.AddResult(string("MulMAddU")+precision, atts, "GFLOPS", gflops); } else { resultDB.AddResult(string("MAddU")+precision, atts, "GFLOPS", gflops); } // update progress bar pb.addItersDone(); if (!verbose && !quiet) { pb.Show(stdout); } } err = clReleaseKernel(kernel_madd); CL_CHECK_ERROR(err); err = clReleaseProgram(prog); CL_CHECK_ERROR(err); } err = clReleaseMemObject(mem1); CL_CHECK_ERROR(err); }