int main(int argc, char** argv) { /* OpenCL 1.1 data structures */ cl_platform_id* platforms; cl_program program; cl_device_id device; cl_context context; cl_command_queue queue; cl_uint numOfPlatforms; cl_int error; cl_mem matrixAMemObj; // input matrix A mem buffer cl_mem matrixBMemObj; // input matrix B mem buffer cl_mem matrixCMemObj; // input matrix C mem buffer cl_int* matrixA; // input matrix A cl_int* matrixB; // input matrix B cl_int* matrixC; // input matrix C cl_uint widthA = WIDTH_G; cl_uint heightA = HEIGHT_G; cl_uint widthB = WIDTH_G; cl_uint heightB = HEIGHT_G; { // allocate memory for input and output matrices // based on whatever matrix theory i know. matrixA = (cl_int*)malloc(widthA * heightA * sizeof(cl_int)); matrixB = (cl_int*)malloc(widthB * heightB * sizeof(cl_int)); matrixC = (cl_int*)malloc(widthB * heightA * sizeof(cl_int)); memset(matrixA, 0, widthA * heightA * sizeof(cl_int)); memset(matrixB, 0, widthB * heightB * sizeof(cl_int)); memset(matrixC, 0, widthB * heightA * sizeof(cl_int)); fillRandom(matrixA, widthA, heightA, 643); fillRandom(matrixB, widthB, heightB, 991); } /* Get the number of platforms Remember that for each vendor's SDK installed on the computer, the number of available platform also increased. */ error = clGetPlatformIDs(0, NULL, &numOfPlatforms); if(error != CL_SUCCESS) { perror("Unable to find any OpenCL platforms"); exit(1); } platforms = (cl_platform_id*) alloca(sizeof(cl_platform_id) * numOfPlatforms); printf("Number of OpenCL platforms found: %d\n", numOfPlatforms); error = clGetPlatformIDs(numOfPlatforms, platforms, NULL); if(error != CL_SUCCESS) { perror("Unable to find any OpenCL platforms"); exit(1); } // Search for a GPU device through the installed platforms // Build a OpenCL program and do not run it. for(cl_int i = 0; i < numOfPlatforms; i++ ) { // Get the GPU device error = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_GPU, 1, &device, NULL); if(error != CL_SUCCESS) { perror("Can't locate a OpenCL compliant device i.e. GPU"); exit(1); } /* Create a context */ context = clCreateContext(NULL, 1, &device, NULL, NULL, &error); if(error != CL_SUCCESS) { perror("Can't create a valid OpenCL context"); exit(1); } /* Load the two source files into temporary datastores */ const char *file_names[] = {"mmult.cl"}; const int NUMBER_OF_FILES = 1; char* buffer[NUMBER_OF_FILES]; size_t sizes[NUMBER_OF_FILES]; loadProgramSource(file_names, NUMBER_OF_FILES, buffer, sizes); /* Create the OpenCL program object */ program = clCreateProgramWithSource(context, NUMBER_OF_FILES, (const char**)buffer, sizes, &error); if(error != CL_SUCCESS) { perror("Can't create the OpenCL program object"); exit(1); } /* Build OpenCL program object and dump the error message, if any */ char *program_log; const char options[] = ""; size_t log_size; error = clBuildProgram(program, 1, &device, options, NULL, NULL); if(error != CL_SUCCESS) { // If there's an error whilst building the program, dump the log clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size); program_log = (char*) malloc(log_size+1); program_log[log_size] = '\0'; clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, log_size+1, program_log, NULL); printf("\n=== ERROR ===\n\n%s\n=============\n", program_log); free(program_log); exit(1); } // Queue is created with profiling enabled cl_command_queue_properties props; props |= CL_QUEUE_PROFILING_ENABLE; queue = clCreateCommandQueue(context, device, props, &error); cl_kernel kernel = clCreateKernel(program, "mmmult", &error); matrixAMemObj = clCreateBuffer(context, CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR, widthA * heightA * sizeof(cl_int), matrixA, &error); matrixBMemObj = clCreateBuffer(context, CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR, widthB * heightB * sizeof(cl_int), matrixB, &error); matrixCMemObj = clCreateBuffer(context, CL_MEM_WRITE_ONLY|CL_MEM_ALLOC_HOST_PTR, widthB * heightA * sizeof(cl_int), 0, &error); clSetKernelArg(kernel, 0, sizeof(cl_int),(void*)&widthB); clSetKernelArg(kernel, 1, sizeof(cl_int),(void*)&heightA); clSetKernelArg(kernel, 2, sizeof(cl_mem),(void*)&matrixAMemObj); clSetKernelArg(kernel, 3, sizeof(cl_mem),(void*)&matrixBMemObj); clSetKernelArg(kernel, 4, sizeof(cl_mem),(void*)&matrixCMemObj); size_t globalThreads[] = {heightA}; size_t localThreads[] = {256}; cl_event exeEvt; cl_ulong executionStart, executionEnd; error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, globalThreads, localThreads, 0, NULL, &exeEvt); clWaitForEvents(1, &exeEvt); if(error != CL_SUCCESS) { printf("Kernel execution failure!\n"); exit(-22); } // let's understand how long it took? clGetEventProfilingInfo(exeEvt, CL_PROFILING_COMMAND_START, sizeof(executionStart), &executionStart, NULL); clGetEventProfilingInfo(exeEvt, CL_PROFILING_COMMAND_END, sizeof(executionEnd), &executionEnd, NULL); clReleaseEvent(exeEvt); printf("Execution the matrix-matrix multiplication took %lu.%lu s\n", (executionEnd - executionStart)/1000000000, (executionEnd - executionStart)%1000000000); printf("Execution the matrix-matrix multiplication took %lu s\n", (executionEnd - executionStart)); clEnqueueReadBuffer(queue, matrixCMemObj, CL_TRUE, 0, widthB * heightA * sizeof(cl_int), matrixC, 0, NULL, NULL); if (compare(matrixC, matrixA, matrixB, heightA, widthA, widthB)) printf("Passed!\n"); else printf("Failed!\n"); /* Clean up */ for(i=0; i< NUMBER_OF_FILES; i++) { free(buffer[i]); } clReleaseProgram(program); clReleaseContext(context); clReleaseMemObject(matrixAMemObj); clReleaseMemObject(matrixBMemObj); clReleaseMemObject(matrixCMemObj); } free(matrixA); free(matrixB); free(matrixC); }
XdevLComputeDeviceBufferCL::~XdevLComputeDeviceBufferCL() { if(nullptr != m_memory) { clReleaseMemObject(m_memory); } }
int main(int argc, char *argv[]) { int iGlobalSize = 1; int iCheck1, iCheck2, iCheck3, iCheck4; size_t iGlobalWorkSize = -1; size_t iLocalWorkSize = -1; if (argc > 1) // Size of input vector { iCheck1 = atoi(argv[1]); if (iCheck1 != 0) { iGlobalSize = iCheck1; } } int iNoReps = 100; // Number of repetitions. if (argc > 2) { iCheck2 = atoi(argv[2]); if (iCheck2 != 0) { iNoReps = iCheck2; } } /* if (argc > 3) // Global work size { iCheck3 = atoi(argv[3]); if (iCheck3 != 0) { iGlobalWorkSize = iCheck3; } } if (argc > 4) // Local work size { iCheck4 = atoi(argv[4]); if (iCheck4 != 0) { iLocalWorkSize = iCheck4; } } */ int bPrint = 0; if (argc > 3) // Originally 5. { bPrint = 1; } // printf("The global size is %d, the global work size is %ld, and the local work size is %ld. \n", iGlobalSize, iGlobalWorkSize, iLocalWorkSize); /* size_t * ipGlobalWorkParam = NULL; if (iGlobalWorkSize != -1) { ipGlobalWorkParam = &iGlobalWorkSize; } size_t * ipLocalWorkParam = NULL; if (iLocalWorkSize != -1) { ipLocalWorkParam = &iLocalWorkSize; } */ GCAQ * TheGCAQ = GCAQSetup(); if (TheGCAQ == NULL) { return 1; } #if BIGFLOAT const char *szFloatOpt = "-DBIGFLOAT"; #else const char *szFloatOpt = NULL; #endif const int iNoKernels = 1; char *ourKernelStrings[6] = { szDotProduct, szReduce, szDotProduct2, szReduce2, szDotProduct4, szReduce4}; GPAK *TheGPAK = GPAKSetup(TheGCAQ, iNoKernels, ourKernelStrings, szFloatOpt); if (TheGPAK == NULL) { GCAQShutdown(TheGCAQ); return 2; } INTG iTypicalWorkgroupNo = TheGPAK->TheMaxWorkGroupSizes[0]; INTG iExpOutputSize = ioutsize(iGlobalSize, iTypicalWorkgroupNo); FLPT * fExpDotProdResult = (FLPT *) malloc(iExpOutputSize * sizeof(FLPT)); FLPT * fExpReduceResult = (FLPT *) malloc(iExpOutputSize * sizeof(FLPT)); fdotprodexpresult(iGlobalSize, iTypicalWorkgroupNo, fExpDotProdResult); freduceexpresult(iGlobalSize, iTypicalWorkgroupNo, fExpReduceResult); // printvector("dot prod", iExpOutputSize, fExpDotProdResult); // printvector("reduce", iExpOutputSize, fExpReduceResult); FLPT* inputDataF = (FLPT *) malloc(iGlobalSize * sizeof(FLPT)); SetFIncrease(iGlobalSize, inputDataF); // For the dot product. FLPT* outputDataD = (FLPT *) malloc(iGlobalSize * sizeof(FLPT)); SetFNull(iGlobalSize, outputDataD); // For the reduction. FLPT* outputDataR = (FLPT *) malloc(iGlobalSize * sizeof(FLPT)); SetFNull(iGlobalSize, outputDataR); struct timespec start[iNoKernels]; struct timespec end[iNoKernels]; // create buffers for the input and ouput int err; cl_mem inputF, outputF, outputAll; inputF = clCreateBuffer(TheGCAQ->TheContext, CL_MEM_READ_ONLY, iGlobalSize * sizeof(FLPT), NULL, &err); if (err != CL_SUCCESS) { printf("Error allocating for F"); return 3; } outputF = clCreateBuffer(TheGCAQ->TheContext, CL_MEM_WRITE_ONLY, iGlobalSize * sizeof(float), NULL, &err); if (err != CL_SUCCESS) { printf("Error allocating for output 7"); return 9; } outputAll = clCreateBuffer(TheGCAQ->TheContext, CL_MEM_WRITE_ONLY, iGlobalSize * sizeof(float), NULL, &err); if (err != CL_SUCCESS) { printf("Error allocating for output 8"); return 9; } clEnqueueWriteBuffer(TheGCAQ->TheQueue, inputF, CL_TRUE, 0, iGlobalSize * sizeof(FLPT), inputDataF, 0, NULL, NULL); int iRep; int iKernel; int i; int iLengthTotal = iGlobalSize; size_t iGlobalWorkThing = iGlobalSize; int iSomething = 1; for (iKernel = 0; iKernel < iNoKernels; iKernel++) { for (i = 0; i < iLengthTotal; i++) { outputDataD[i] = 0.0; outputDataR[i] = 0.0; } clock_gettime(CLOCK_MONOTONIC, &(start[iKernel])); for (iRep = 0; iRep < iNoReps; iRep++) { clSetKernelArg(TheGPAK->TheKernels[iKernel], 0, sizeof(int), &iLengthTotal); clSetKernelArg(TheGPAK->TheKernels[iKernel], 1, sizeof(cl_mem), &inputF); clSetKernelArg(TheGPAK->TheKernels[iKernel], 2, iSomething * iLocalWorkSize * sizeof(float), NULL); // Was 3 clSetKernelArg(TheGPAK->TheKernels[iKernel], 3, sizeof(cl_mem), &outputAll); // Was 4 clEnqueueNDRangeKernel(TheGCAQ->TheQueue, TheGPAK->TheKernels[iKernel], 1, NULL, &iGlobalWorkThing, &(TheGPAK->TheMaxWorkGroupSizes[iKernel]), 0, NULL, NULL); clFinish(TheGCAQ->TheQueue); // copy the results from out of the output buffer if (iKernel % 2 == 0) { clEnqueueReadBuffer(TheGCAQ->TheQueue, outputAll, CL_TRUE, 0, iExpOutputSize * sizeof(float), outputDataD, 0, NULL, NULL); } else { clEnqueueReadBuffer(TheGCAQ->TheQueue, outputAll, CL_TRUE, 0, iExpOutputSize * sizeof(float), outputDataR, 0, NULL, NULL); } } clock_gettime(CLOCK_MONOTONIC, &(end[iKernel])); if (bPrint) { for (i = 0; i < iExpOutputSize; i++) { if (iKernel % 2 == 0) { if (outputDataD[i] != fExpDotProdResult[i]) { printf ("A problem at kernel %d and iteration %d for actual value %f but expected value %f!\n", iKernel, i, outputDataD[i], fExpDotProdResult[i]); break; } } else { if (outputDataR[i] != fExpReduceResult[i]) { printf ("A problem at kernel %d and iteration %d for actual value %f but expected value %f!\n", iKernel, i, outputDataR[i], fExpReduceResult[i]); break; } } } } // if ((iKernel % 2) == 1) // { // iLengthTotal = iLengthTotal / 2; // iSomething = iSomething * 2; // iGlobalWorkThing = iGlobalWorkThing / 2; // } } clReleaseMemObject(inputF); clReleaseMemObject(outputF); clReleaseMemObject(outputAll); // print the results // if (bPrint) // { // printf("output %d: \n", iGlobalSize); // for(i=0;i<iExpOutputSize; i++) // { // printf("%d - %f - %f\n", i, outputDataD[i], outputDataR[i]); // } // } // cleanup - release OpenCL resources free(inputDataF); free(outputDataD); free(outputDataR); GPAKShutdown(TheGPAK); GCAQShutdown (TheGCAQ); printf("%d - ", iGlobalSize); for (iKernel = 0; iKernel < iNoKernels; iKernel++) { printf("%f - ", (1.0 * TLPERS * iGlobalSize * iNoReps) / (MEGAHERTZ * timespecDiff(&(end[iKernel]), &(start[iKernel])))); } printf("\n"); return 0; }
int main(void) { cl_int err; cl_platform_id platform = 0; cl_device_id device = 0; cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 }; cl_context ctx = 0; cl_command_queue queue = 0; cl_mem bufX, bufY, bufParam; cl_event event = NULL; int ret = 0; int lenX = 1 + (N-1)*abs(incx); int lenY = 1 + (N-1)*abs(incy); int lenParam = 5; /* Setup OpenCL environment. */ err = clGetPlatformIDs(1, &platform, NULL); if (err != CL_SUCCESS) { printf( "clGetPlatformIDs() failed with %d\n", err ); return 1; } err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL); if (err != CL_SUCCESS) { printf( "clGetDeviceIDs() failed with %d\n", err ); return 1; } props[1] = (cl_context_properties)platform; ctx = clCreateContext(props, 1, &device, NULL, NULL, &err); if (err != CL_SUCCESS) { printf( "clCreateContext() failed with %d\n", err ); return 1; } queue = clCreateCommandQueue(ctx, device, 0, &err); if (err != CL_SUCCESS) { printf( "clCreateCommandQueue() failed with %d\n", err ); clReleaseContext(ctx); return 1; } /* Setup clblas. */ err = clblasSetup(); if (err != CL_SUCCESS) { printf("clblasSetup() failed with %d\n", err); clReleaseCommandQueue(queue); clReleaseContext(ctx); return 1; } /* Prepare OpenCL memory objects and place matrices inside them. */ bufX = clCreateBuffer(ctx, CL_MEM_READ_WRITE, (lenX*sizeof(cl_float)), NULL, &err); bufY = clCreateBuffer(ctx, CL_MEM_READ_WRITE, (lenY*sizeof(cl_float)), NULL, &err); bufParam = clCreateBuffer(ctx, CL_MEM_READ_WRITE, (lenParam*sizeof(cl_float)), NULL, &err); err = clEnqueueWriteBuffer(queue, bufX, CL_TRUE, 0, (lenX*sizeof(cl_float)), X, 0, NULL, NULL); err = clEnqueueWriteBuffer(queue, bufY, CL_TRUE, 0, (lenY*sizeof(cl_float)), Y, 0, NULL, NULL); err = clEnqueueWriteBuffer(queue, bufParam, CL_TRUE, 0, (lenParam*sizeof(cl_float)), SPARAM, 0, NULL, NULL); /* Call clblas function. */ err = clblasSrotm(N, bufX, 0, incx, bufY, 0, incy, bufParam, 0, 1, &queue, 0, NULL, &event); if (err != CL_SUCCESS) { printf("clblasSrotm() failed with %d\n", err); ret = 1; } else { /* Wait for calculations to be finished. */ err = clWaitForEvents(1, &event); /* Fetch results of calculations from GPU memory. */ err = clEnqueueReadBuffer(queue, bufY, CL_TRUE, 0, (lenY*sizeof(cl_float)), Y, 0, NULL, NULL); err = clEnqueueReadBuffer(queue, bufX, CL_TRUE, 0, (lenX*sizeof(cl_float)), X, 0, NULL, NULL); /* At this point you will get the result of SROTM placed in vector Y. */ printResult(); } /* Release OpenCL events. */ clReleaseEvent(event); /* Release OpenCL memory objects. */ clReleaseMemObject(bufY); clReleaseMemObject(bufX); clReleaseMemObject(bufParam); /* Finalize work with clblas. */ clblasTeardown(); /* Release OpenCL working objects. */ clReleaseCommandQueue(queue); clReleaseContext(ctx); return ret; }
int MemoryOptimizations::cleanup() { /* Releases OpenCL resources (Context, Memory etc.) */ cl_int status; for(int i = 0; i < NUM_KERNELS; i++) { status = clReleaseKernel(kernel[i]); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clReleaseKernel failed.")) return SDK_FAILURE; } status = clReleaseProgram(program); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clReleaseProgram failed.")) return SDK_FAILURE; status = clReleaseMemObject(inputBuffer); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clReleaseMemObject failed.")) return SDK_FAILURE; status = clReleaseMemObject(outputBuffer); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clReleaseMemObject failed.")) return SDK_FAILURE; status = clReleaseCommandQueue(commandQueue); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clReleaseCommandQueue failed.")) return SDK_FAILURE; status = clReleaseContext(context); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clReleaseContext failed.")) return SDK_FAILURE; /* release program resources (input memory etc.) */ if(input) free(input); if(output) free(output); /* release device list */ if(devices) free(devices); if(maxWorkItemSizes) free(maxWorkItemSizes); return SDK_SUCCESS; }
int main() { // START:context cl_platform_id platform; clGetPlatformIDs(1, &platform, NULL); cl_device_id device; clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL); cl_context context = clCreateContext(NULL, 1, &device, NULL, NULL, NULL); // END:context // START:queue cl_command_queue queue = clCreateCommandQueue(context, device, 0, NULL); // END:queue // START:kernel char* source = read_source("multiply_arrays.cl"); cl_program program = clCreateProgramWithSource(context, 1, (const char**)&source, NULL, NULL); free(source); clBuildProgram(program, 0, NULL, NULL, NULL, NULL); cl_kernel kernel = clCreateKernel(program, "multiply_arrays", NULL); // END:kernel // START:buffers cl_float a[NUM_ELEMENTS], b[NUM_ELEMENTS]; random_fill(a, NUM_ELEMENTS); random_fill(b, NUM_ELEMENTS); cl_mem inputA = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(cl_float) * NUM_ELEMENTS, a, NULL); cl_mem inputB = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(cl_float) * NUM_ELEMENTS, b, NULL); cl_mem output = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(cl_float) * NUM_ELEMENTS, NULL, NULL); // END:buffers // START:execute clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputA); clSetKernelArg(kernel, 1, sizeof(cl_mem), &inputB); clSetKernelArg(kernel, 2, sizeof(cl_mem), &output); size_t work_units = NUM_ELEMENTS; clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &work_units, NULL, 0, NULL, NULL); // END:execute // START:results cl_float results[NUM_ELEMENTS]; clEnqueueReadBuffer(queue, output, CL_TRUE, 0, sizeof(cl_float) * NUM_ELEMENTS, results, 0, NULL, NULL); // END:results // START:cleanup clReleaseMemObject(inputA); clReleaseMemObject(inputB); clReleaseMemObject(output); clReleaseKernel(kernel); clReleaseProgram(program); clReleaseCommandQueue(queue); clReleaseContext(context); // END:cleanup for (int i = 0; i < NUM_ELEMENTS; ++i) { printf("%f * %f = %f\n", a[i], b[i], results[i]); } return 0; }
void shmoo(int minN, int maxN, int maxThreads, int maxBlocks, ReduceType datatype) { // create random input data on CPU unsigned int bytes = maxN * sizeof(T); T* h_idata = (T*)malloc(bytes); for(int i = 0; i < maxN; i++) { // Keep the numbers small so we don't get truncation error in the sum if (datatype == REDUCE_INT) h_idata[i] = (T)(rand() & 0xFF); else h_idata[i] = (rand() & 0xFF) / (T)RAND_MAX; } int maxNumBlocks = MIN( maxN / maxThreads, MAX_BLOCK_DIM_SIZE); // allocate mem for the result on host side T* h_odata = (T*) malloc(maxNumBlocks*sizeof(T)); // allocate device memory and data cl_mem d_idata = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, bytes, h_idata, NULL); cl_mem d_odata = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE, maxNumBlocks * sizeof(T), NULL, NULL); int testIterations = 100; double dTotalTime = 0.0; // print headers shrLog("Time in seconds for various numbers of elements for each kernel\n"); shrLog("\n\n"); shrLog("Kernel"); for (int i = minN; i <= maxN; i *= 2) { shrLog(", %d", i); } for (int kernel = 0; kernel < 7; kernel++) { shrLog("\n"); shrLog("%d", kernel); for (int i = minN; i <= maxN; i *= 2) { int numBlocks = 0; int numThreads = 0; getNumBlocksAndThreads(kernel, i, maxBlocks, maxThreads, numBlocks, numThreads); double reduceTime; if( numBlocks <= MAX_BLOCK_DIM_SIZE ) { profileReduce(datatype, i, numThreads, numBlocks, maxThreads, maxBlocks, kernel, testIterations, false, 1, &dTotalTime, h_odata, d_idata, d_odata); reduceTime = dTotalTime/(double)testIterations; } else { reduceTime = -1.0; } shrLog(", %.4f m", reduceTime); } } // cleanup free(h_idata); free(h_odata); clReleaseMemObject(d_idata); clReleaseMemObject(d_odata); }
END_TEST START_TEST (test_read_write_rect) { cl_device_id device; cl_context ctx; cl_command_queue queue; cl_int result; cl_mem buf, buf_part; cl_platform_id platform = 0; cl_uint num_platforms = 0; clGetPlatformIDs(1, &platform, &num_platforms); // Grid xyz = (5 x 7 x 2) unsigned char grid[70] = { 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 2, 2, 2, 1, 1, 2, 3, 2, 1, 1, 2, 2, 2, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 2, 0, 0, 0, 1, 3, 1, 0, 0, 2, 3, 2, 0, 1, 3, 3, 3, 1, 2, 3, 3, 3, 2, 3, 3, 3, 3, 3 }; // Middle of the "image" : 3 x 3 x 2 centered at (3, 3) unsigned char part[18] = { 2, 2, 2, 2, 3, 2, 2, 2, 2, 1, 3, 1, 2, 3, 2, 3, 3, 3 }; unsigned char buffer[70], buffer_part[18]; size_t host_origin[3] = {0, 0, 0}; size_t buf_origin[3] = {0, 0, 0}; size_t region[3] = {5, 7, 2}; result = clGetDeviceIDs(platform, CL_DEVICE_TYPE_DEFAULT, 1, &device, 0); fail_if( result != CL_SUCCESS, "unable to get the default device" ); ctx = clCreateContext(0, 1, &device, 0, 0, &result); fail_if( result != CL_SUCCESS || ctx == 0, "unable to create a valid context" ); queue = clCreateCommandQueue(ctx, device, 0, &result); fail_if( result != CL_SUCCESS || queue == 0, "cannot create a command queue" ); buf = clCreateBuffer(ctx, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, sizeof(buffer), buffer, &result); fail_if( result != CL_SUCCESS, "cannot create a valid CL_MEM_USE_HOST_PTR read-write buffer" ); buf_part = clCreateBuffer(ctx, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, sizeof(buffer_part), buffer_part, &result); fail_if( result != CL_SUCCESS, "cannot create a buffer for the part that will be read" ); // Write grid into buffer result = clEnqueueWriteBufferRect(queue, buf, 1, buf_origin, host_origin, region, 0, 0, 0, 0, grid, 0, 0, 0); fail_if( result != CL_SUCCESS, "cannot enqueue a blocking write buffer rect event with pitches guessed" ); fail_if( std::memcmp(buffer, grid, sizeof(buffer)) != 0, "buffer doesn't contain the data" ); // Read it back into a temporary region buf_origin[0] = 1; buf_origin[1] = 2; buf_origin[2] = 0; // host_origin remains (0, 0, 0) region[0] = 3; region[1] = 3; region[2] = 2; result = clEnqueueReadBufferRect(queue, buf, 1, buf_origin, host_origin, region, 5, 5*7, 0, 0, buffer_part, 0, 0, 0); fail_if( result != CL_SUCCESS, "unable to queue a blocking write buffer rect event with host pitches guessed" ); fail_if( std::memcmp(buffer_part, part, sizeof(part)) != 0, "the part of the buffer was not correctly read" ); // Clear the temporary region and re-read into it using buf_part std::memset(buffer_part, 0, sizeof(buffer_part)); cl_event event; result = clEnqueueCopyBufferRect(queue, buf, buf_part, buf_origin, host_origin, region, 5, 5*7, 0, 0, 0, 0, &event); fail_if( result != CL_SUCCESS, "unable to queue a copy buffer rect event" ); result = clWaitForEvents(1, &event); fail_if( result != CL_SUCCESS, "unable to wait for the event" ); fail_if( std::memcmp(buffer_part, part, sizeof(part)) != 0, "the part of the buffer was not correctly read using a buffer" ); clReleaseEvent(event); clReleaseMemObject(buf_part); clReleaseMemObject(buf); clReleaseCommandQueue(queue); clReleaseContext(ctx); }
END_TEST START_TEST (test_copy_buffer) { cl_device_id device; cl_context ctx; cl_command_queue queue; cl_int result; cl_mem src_buf, dst_buf; cl_event event; cl_platform_id platform = 0; cl_uint num_platforms = 0; clGetPlatformIDs(1, &platform, &num_platforms); char src[] = "This is the data."; char dst[] = "Overwrite this..."; result = clGetDeviceIDs(platform, CL_DEVICE_TYPE_DEFAULT, 1, &device, 0); fail_if( result != CL_SUCCESS, "unable to get the default device" ); ctx = clCreateContext(0, 1, &device, 0, 0, &result); fail_if( result != CL_SUCCESS || ctx == 0, "unable to create a valid context" ); queue = clCreateCommandQueue(ctx, device, 0, &result); fail_if( result != CL_SUCCESS || queue == 0, "cannot create a command queue" ); src_buf = clCreateBuffer(ctx, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, sizeof(src), src, &result); fail_if( result != CL_SUCCESS, "cannot create the source buffer" ); dst_buf = clCreateBuffer(ctx, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, sizeof(dst), dst, &result); fail_if( result != CL_SUCCESS, "cannot create the destination buffer" ); result = clEnqueueCopyBuffer(queue, src_buf, dst_buf, 0, 0, sizeof(src), 0, 0, &event); fail_if( result != CL_SUCCESS, "unable to queue a copy buffer event" ); result = clWaitForEvents(1, &event); fail_if( result != CL_SUCCESS, "unable to wait for the event" ); fail_if( std::memcmp(src, dst, sizeof(src)) != 0, "the buffer wasn't copied" ); clReleaseEvent(event); clReleaseMemObject(src_buf); clReleaseMemObject(dst_buf); clReleaseCommandQueue(queue); clReleaseContext(ctx); }
void JNIContext::dispose(JNIEnv *jenv, Config* config) { //fprintf(stdout, "dispose()\n"); cl_int status = CL_SUCCESS; jenv->DeleteGlobalRef(kernelObject); jenv->DeleteGlobalRef(kernelClass); if (context != 0){ status = clReleaseContext(context); //fprintf(stdout, "dispose context %0lx\n", context); CLException::checkCLError(status, "clReleaseContext()"); context = (cl_context)0; } if (commandQueue != 0){ if (config->isTrackingOpenCLResources()){ commandQueueList.remove((cl_command_queue)commandQueue, __LINE__, __FILE__); } status = clReleaseCommandQueue((cl_command_queue)commandQueue); //fprintf(stdout, "dispose commandQueue %0lx\n", commandQueue); CLException::checkCLError(status, "clReleaseCommandQueue()"); commandQueue = (cl_command_queue)0; } if (program != 0){ status = clReleaseProgram((cl_program)program); //fprintf(stdout, "dispose program %0lx\n", program); CLException::checkCLError(status, "clReleaseProgram()"); program = (cl_program)0; } if (kernel != 0){ status = clReleaseKernel((cl_kernel)kernel); //fprintf(stdout, "dispose kernel %0lx\n", kernel); CLException::checkCLError(status, "clReleaseKernel()"); kernel = (cl_kernel)0; } if (argc > 0){ for (int i=0; i< argc; i++){ KernelArg *arg = args[i]; if (!arg->isPrimitive()){ if (arg->arrayBuffer != NULL){ if (arg->arrayBuffer->mem != 0){ if (config->isTrackingOpenCLResources()){ memList.remove((cl_mem)arg->arrayBuffer->mem, __LINE__, __FILE__); } status = clReleaseMemObject((cl_mem)arg->arrayBuffer->mem); //fprintf(stdout, "dispose arg %d %0lx\n", i, arg->arrayBuffer->mem); CLException::checkCLError(status, "clReleaseMemObject()"); arg->arrayBuffer->mem = (cl_mem)0; } if (arg->arrayBuffer->javaArray != NULL) { jenv->DeleteWeakGlobalRef((jweak) arg->arrayBuffer->javaArray); } delete arg->arrayBuffer; arg->arrayBuffer = NULL; } } if (arg->name != NULL){ free(arg->name); arg->name = NULL; } if (arg->javaArg != NULL ) { jenv->DeleteGlobalRef((jobject) arg->javaArg); } delete arg; arg=args[i]=NULL; } delete[] args; args=NULL; // do we need to call clReleaseEvent on any of these that are still retained.... delete[] readEvents; readEvents = NULL; delete[] writeEvents; writeEvents = NULL; delete[] executeEvents; executeEvents = NULL; if (config->isProfilingEnabled()) { if (config->isProfilingCSVEnabled()) { if (profileFile != NULL && profileFile != stderr) { fclose(profileFile); } } delete[] readEventArgs; readEventArgs=0; delete[] writeEventArgs; writeEventArgs=0; } } if (config->isTrackingOpenCLResources()){ fprintf(stderr, "after dispose{ \n"); commandQueueList.report(stderr); memList.report(stderr); readEventList.report(stderr); executeEventList.report(stderr); writeEventList.report(stderr); fprintf(stderr, "}\n"); } }
int main(int argc, char *argv[]) { //FILE *fp; cl_platform_id platform_id[2]; cl_uint ret_num_devices; cl_uint ret_num_platforms; cl_int ret_code; cl_mem image_in_mem = NULL; cl_mem image_out_mem = NULL; cl_mem twiddle_factors_mem = NULL; cl_float2 *image_in_host; cl_float2 *twiddle_factors_host; cl_kernel kernel_twiddle_factors; cl_kernel kernel_matriz_transpose; cl_kernel kernel_lowpass_filter; pgm_t ipgm; pgm_t opgm; image_file_t *image_filename; char *output_filename; FILE *fp; const char *kernel_filename = C_NOME_ARQ_KERNEL; size_t source_size; char *source_str; cl_int i, j,n ,m; cl_int raio = 0; size_t global_wg[2]; size_t local_wg[2]; float *image_amplitudes; size_t log_size; char *log_file; cl_event kernels_events_out_fft[4]; cl_ulong kernel_runtime = (cl_ulong) 0; cl_ulong kernel_start_time = (cl_ulong) 0; cl_ulong kernel_end_time = (cl_ulong) 0; cl_event write_host_dev_event; cl_ulong write_host_dev_start_time = (cl_ulong) 0; cl_ulong write_host_dev_end_time = (cl_ulong) 0; cl_ulong write_host_dev_run_time = (cl_ulong) 0; cl_event read_dev_host_event; cl_ulong read_dev_host_start_time = (cl_ulong) 0; cl_ulong read_dev_host_end_time = (cl_ulong) 0; cl_ulong read_dev_host_run_time = (cl_ulong) 0; unsigned __int64 image_tam; unsigned __int64 MEGA_BYTES = 1048576; // 1024*1024 double image_tam_MB; double tempo_total; struct event_in_fft_t *fft_events; //=== Timer count start ============================================================================== timer_reset(); timer_start(); //=================================================================================================== if (argc < 2) { printf("**Erro: O arquivo de entrada eh necessario.\n"); exit(EXIT_FAILURE); } image_filename = (image_file_t *) malloc(sizeof(image_file_t)); split_image_filename(image_filename, argv[1]); output_filename = (char *) malloc(40*sizeof(char)); sprintf(output_filename, "%d.%d.%s.%s.%s", image_filename->res, image_filename->num, ENV_TYPE, APP_TYPE, EXTENSAO); fp = fopen(kernel_filename, "r"); if (!fp) { fprintf(stderr, "Failed to load kernel.\n"); exit(EXIT_FAILURE); } source_str = (char *)malloc(MAX_SOURCE_SIZE); source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp); fclose( fp ); //=================================================================================================== /* Abrindo imagem do arquivo para objeto de memoria local*/ if( ler_pgm(&ipgm, argv[1]) == -1) exit(EXIT_FAILURE); n = ipgm.width; raio = n/8; m = (cl_int)(log((double)n)/log(2.0)); image_in_host = (cl_float2 *)malloc((n*n)*sizeof(cl_float2)); twiddle_factors_host = (cl_float2 *)malloc(n / 2 * sizeof(cl_float2)); for (i = 0; i < n; i++) { for (j = 0; j < n; j++) { image_in_host[n*i + j].s[0] = (float)ipgm.buf[n*i + j]; image_in_host[n*i + j].s[1] = (float)0; } } fft_events = (struct event_in_fft_t *)malloc(MAX_CALL_FFT*sizeof(struct event_in_fft_t)); kernel_butter_events = (cl_event *)malloc(MAX_CALL_FFT*m*sizeof(cl_event)); //=================================================================================================== CL_CHECK(clGetPlatformIDs(MAX_PLATFORM_ID, platform_id, &ret_num_platforms)); if (ret_num_platforms == 0 ) { fprintf(stderr,"[Erro] Não existem plataformas OpenCL\n"); exit(2); } //=================================================================================================== CL_CHECK(clGetDeviceIDs( platform_id[0], CL_DEVICE_TYPE_GPU, 1, &device_id, &ret_num_devices)); //print_platform_info(&platform_id[1]); //=================================================================================================== context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret_code); //=================================================================================================== cmd_queue = clCreateCommandQueue(context, device_id, CL_QUEUE_PROFILING_ENABLE, &ret_code); //=================================================================================================== image_in_mem = clCreateBuffer(context, CL_MEM_READ_WRITE, n*n*sizeof(cl_float2), NULL, &ret_code); image_out_mem = clCreateBuffer(context, CL_MEM_READ_WRITE, n*n*sizeof(cl_float2), NULL, &ret_code); twiddle_factors_mem = clCreateBuffer(context, CL_MEM_READ_WRITE, (n/2)*sizeof(cl_float2), NULL, &ret_code); //=================================================================================================== /* Transfer data to memory buffer */ CL_CHECK(clEnqueueWriteBuffer(cmd_queue, image_in_mem, CL_TRUE, 0, n*n*sizeof(cl_float2), image_in_host, 0, NULL, &write_host_dev_event)); image_tam = n*n*sizeof(cl_float2); //=================================================================================================== program = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &ret_code); //=================================================================================================== ret_code = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL); //=================================================================================================== if (ret_code != CL_SUCCESS) { // Determine the size of the log clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size); //=================================================================================================== // Allocate memory for the log log_file = (char *) malloc(log_size); // Get the log clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, log_size, log_file, NULL); printf("%s\n", log_file); system("pause"); exit(0); } kernel_twiddle_factors = clCreateKernel(program, "twiddle_factors", &ret_code); kernel_matriz_transpose = clCreateKernel(program, "matrix_trasponse", &ret_code); kernel_lowpass_filter = clCreateKernel(program, "lowpass_filter", &ret_code); /* Processa os fatores Wn*/ //=================================================================================================== CL_CHECK(clSetKernelArg(kernel_twiddle_factors, 0, sizeof(cl_mem), (void *)&twiddle_factors_mem)); CL_CHECK(clSetKernelArg(kernel_twiddle_factors, 1, sizeof(cl_int), (void *)&n)); config_workgroup_size(global_wg, local_wg, n/2, 1); CL_CHECK(clEnqueueNDRangeKernel(cmd_queue, kernel_twiddle_factors, 1, NULL, global_wg, local_wg, 0, NULL, &kernels_events_out_fft[0])); //=================================================================================================== /* Executa a FFT em N/2 */ fft_main(image_out_mem, image_in_mem, twiddle_factors_mem, m, direta, &fft_events[0]); //=================================================================================================== /* Realiza a transposta da Matriz (imagem) */ CL_CHECK(clSetKernelArg(kernel_matriz_transpose, 0, sizeof(cl_mem), (void *)&image_in_mem)); CL_CHECK(clSetKernelArg(kernel_matriz_transpose, 1, sizeof(cl_mem), (void *)&image_out_mem)); CL_CHECK(clSetKernelArg(kernel_matriz_transpose, 2, sizeof(cl_int), (void *)&n)); config_workgroup_size(global_wg, local_wg, n, n); CL_CHECK(clEnqueueNDRangeKernel(cmd_queue, kernel_matriz_transpose, 2, NULL, global_wg, local_wg, 0, NULL, &kernels_events_out_fft[1])); //=================================================================================================== /* Executa a FFT N/2 */ fft_main(image_out_mem, image_in_mem, twiddle_factors_mem, m, direta, &fft_events[1]); //=================================================================================================== /* Processa o filtro passa baixa */ CL_CHECK(clSetKernelArg(kernel_lowpass_filter, 0, sizeof(cl_mem), (void *)&image_out_mem)); CL_CHECK(clSetKernelArg(kernel_lowpass_filter, 1, sizeof(cl_int), (void *)&n)); CL_CHECK(clSetKernelArg(kernel_lowpass_filter, 2, sizeof(cl_int), (void *)&raio)); config_workgroup_size(global_wg, local_wg, n, n); CL_CHECK(clEnqueueNDRangeKernel(cmd_queue, kernel_lowpass_filter, 2, NULL, global_wg, local_wg, 0, NULL, &kernels_events_out_fft[2])); //=================================================================================================== /* Obtem a FFT inversa*/ fft_main(image_in_mem, image_out_mem, twiddle_factors_mem, m, inversa, &fft_events[2]); //=================================================================================================== /* Realiza a transposta da Matriz (imagem) */ CL_CHECK(clSetKernelArg(kernel_matriz_transpose, 0, sizeof(cl_mem), (void *)&image_out_mem)); CL_CHECK(clSetKernelArg(kernel_matriz_transpose, 1, sizeof(cl_mem), (void *)&image_in_mem)); CL_CHECK(clSetKernelArg(kernel_matriz_transpose, 2, sizeof(cl_int), (void *)&n)); config_workgroup_size(global_wg, local_wg, n, n); CL_CHECK(clEnqueueNDRangeKernel(cmd_queue, kernel_matriz_transpose, 2, NULL, global_wg, local_wg, 0, NULL, &kernels_events_out_fft[3])); //=================================================================================================== fft_main(image_in_mem, image_out_mem, twiddle_factors_mem, m, inversa, &fft_events[3]); //=================================================================================================== CL_CHECK(clEnqueueReadBuffer(cmd_queue, image_in_mem, CL_TRUE, 0, n*n*sizeof(cl_float2), image_in_host, 0, NULL, &read_dev_host_event)); //=================================================================================================== //== Total time elapsed ============================================================================ timer_stop(); tempo_total = get_elapsed_time(); //================================================================================================== //====== Get time of Profile Info ================================================================== // Write data time CL_CHECK(clGetEventProfilingInfo(write_host_dev_event, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &write_host_dev_start_time, NULL)); CL_CHECK(clGetEventProfilingInfo(write_host_dev_event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &write_host_dev_end_time, NULL)); // Read data time CL_CHECK(clGetEventProfilingInfo(read_dev_host_event, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &read_dev_host_start_time, NULL)); CL_CHECK(clGetEventProfilingInfo(read_dev_host_event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &read_dev_host_end_time, NULL)); for (i = 0; i < MAX_CALL_FFT; i++) { kernel_start_time = (cl_long) 0; kernel_end_time = (cl_long) 0; CL_CHECK(clGetEventProfilingInfo(kernels_events_out_fft[i], CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &kernel_start_time, NULL)); CL_CHECK(clGetEventProfilingInfo(kernels_events_out_fft[i], CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &kernel_end_time, NULL)); kernel_runtime += (kernel_end_time - kernel_start_time); kernel_start_time = (cl_long) 0; kernel_end_time = (cl_long) 0; CL_CHECK(clGetEventProfilingInfo(fft_events[i].kernel_bitsrev, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &kernel_start_time, NULL)); CL_CHECK(clGetEventProfilingInfo(fft_events[i].kernel_bitsrev, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &kernel_end_time, NULL)); kernel_runtime += (kernel_end_time - kernel_start_time); kernel_start_time = (cl_long) 0; kernel_end_time = (cl_long) 0; if (fft_events[i].kernel_normalize != NULL) { CL_CHECK(clGetEventProfilingInfo(fft_events[i].kernel_normalize, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &kernel_start_time, NULL)); CL_CHECK(clGetEventProfilingInfo(fft_events[i].kernel_normalize, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &kernel_end_time, NULL)); kernel_runtime += (kernel_end_time - kernel_start_time); } } for (j=0; j < MAX_CALL_FFT*m; j++){ kernel_start_time = (cl_long) 0; kernel_end_time = (cl_long) 0; CL_CHECK(clGetEventProfilingInfo(kernel_butter_events[j], CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &kernel_start_time, NULL)); CL_CHECK(clGetEventProfilingInfo(kernel_butter_events[j], CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &kernel_end_time, NULL)); kernel_runtime += (kernel_end_time - kernel_start_time); } write_host_dev_run_time = write_host_dev_end_time - write_host_dev_start_time; read_dev_host_run_time = read_dev_host_end_time - read_dev_host_start_time; /* save_log_debug(write_host_dev_run_time,fp); save_log_debug(read_dev_host_run_time,fp); close_log_debug(fp); */ image_tam_MB = (double) (((double) image_tam)/(double) MEGA_BYTES); //================================================================================================== save_log_gpu(image_filename, kernel_runtime, (double) (image_tam_MB/( (double) read_dev_host_run_time/(double) NANOSECONDS)), (double) (image_tam_MB/ ((double) write_host_dev_run_time/ (double) NANOSECONDS)), tempo_total, LOG_NAME); //=================================================================================================== image_amplitudes = (float*)malloc(n*n*sizeof(float)); for (i=0; i < n; i++) { for (j=0; j < n; j++) { image_amplitudes[n*j + i] = (float) (AMP(((float*)image_in_host)[(2*n*j)+2*i], ((float*)image_in_host)[(2*n*j)+2*i+1])); } } //clFlush(cmd_queue); //clFinish(cmd_queue); opgm.width = n; opgm.height = n; normalizar_pgm(&opgm, image_amplitudes); escrever_pgm(&opgm, output_filename); //=================================================================================================== clFinish(cmd_queue); clReleaseKernel(kernel_twiddle_factors); clReleaseKernel(kernel_matriz_transpose); clReleaseKernel(kernel_lowpass_filter); clReleaseProgram(program); clReleaseMemObject(image_in_mem); clReleaseMemObject(image_out_mem); clReleaseMemObject(twiddle_factors_mem); clReleaseCommandQueue(cmd_queue); clReleaseContext(context); clReleaseEvent(read_dev_host_event); clReleaseEvent(write_host_dev_event); clReleaseEvent(kernels_events_out_fft[0]); clReleaseEvent(kernels_events_out_fft[1]); clReleaseEvent(kernels_events_out_fft[2]); clReleaseEvent(kernels_events_out_fft[3]); destruir_pgm(&ipgm); destruir_pgm(&opgm); free(image_amplitudes); free(source_str); free(image_in_host); free(image_filename); free(twiddle_factors_host); free(output_filename); free(fft_events); free(kernel_butter_events); //_CrtDumpMemoryLeaks(); return 0; }
double gpu_cgm_image(uint32_t* aList, uint32_t* bList, int aLength, int bLength, int keyLength, uint32_t** matches, char* clFile, int x, int y) { int gap = 0, myoffset = 0; cl_platform_id *platforms; cl_uint num_platforms = 0; cl_device_id *devices; cl_uint num_devices = 0; cl_context context; cl_command_queue command_queue; cl_image_format imgFormat; cl_mem aImg; cl_mem bImg; cl_mem res_buf; cl_program program; cl_kernel kernel; cl_uint *results; FILE *prgm_fptr; struct stat prgm_sbuf; char *prgm_data; size_t prgm_size; size_t offset; size_t count; const size_t global_work_size[] = { x, y }; const size_t origin[] = { 0, 0, 0 }; const size_t region[] = { aLength, 1, 1 }; cl_int ret; cl_uint i; cl_bool imageSupport; struct timeval t1, t2; double elapsedTime; results = malloc(sizeof(cl_uint) * aLength); imgFormat.image_channel_order = CL_RGBA; imgFormat.image_channel_data_type = CL_UNSIGNED_INT32; /* figure out how many CL platforms are available */ ret = clGetPlatformIDs(0, NULL, &num_platforms); if (CL_SUCCESS != ret) { print_error ("Error getting the number of platform IDs: %d", ret); exit(EXIT_FAILURE); } if (0 == num_platforms) { print_error ("No CL platforms were found."); exit(EXIT_FAILURE); } /* allocate space for each available platform ID */ if (NULL == (platforms = malloc((sizeof *platforms) * num_platforms))) { print_error ("Out of memory"); exit(EXIT_FAILURE); } /* get all of the platform IDs */ ret = clGetPlatformIDs(num_platforms, platforms, NULL); if (CL_SUCCESS != ret) { print_error ("Error getting platform IDs: %d", ret); exit(EXIT_FAILURE); } /* find a platform that supports given device type */ // print_error ("Number of platforms found: %d", num_platforms); for (i = 0; i < num_platforms; i++) { ret = clGetDeviceIDs(platforms[i], getDeviceType(), 0, NULL, &num_devices); if (CL_SUCCESS != ret) continue; if (0 < num_devices) break; } /* make sure at least one device was found */ if (num_devices == 0) { print_error ("No CL device found that supports device type: %s.", ((getDeviceType() == CL_DEVICE_TYPE_CPU) ? "CPU" : "GPU")); exit(EXIT_FAILURE); } /* only one device is necessary... */ num_devices = 1; if (NULL == (devices = malloc((sizeof *devices) * num_devices))) { print_error ("Out of memory"); exit(EXIT_FAILURE); } /* get one device id */ ret = clGetDeviceIDs(platforms[i], getDeviceType(), num_devices, devices, NULL); if (CL_SUCCESS != ret) { print_error ("Error getting device IDs: %d", ret); exit(EXIT_FAILURE); } ret = clGetDeviceInfo(*devices, CL_DEVICE_IMAGE_SUPPORT, sizeof(cl_bool), &imageSupport, NULL); if (CL_SUCCESS != ret) { print_error ("Failed to get Device Info: %d", ret); exit(EXIT_FAILURE); } if(imageSupport == CL_FALSE) { print_error ("Failure: Images are not supported!"); exit(EXIT_FAILURE); } /* create a context for the CPU device that was found earlier */ context = clCreateContext(NULL, num_devices, devices, NULL, NULL, &ret); if (NULL == context || CL_SUCCESS != ret) { print_error ("Failed to create context: %d", ret); exit(EXIT_FAILURE); } /* create a command queue for the CPU device */ command_queue = clCreateCommandQueue(context, devices[0], 0, &ret); if (NULL == command_queue || CL_SUCCESS != ret) { print_error ("Failed to create a command queue: %d", ret); exit(EXIT_FAILURE); } /* create buffers on the CL device */ aImg = clCreateImage2D(context, CL_MEM_READ_ONLY, &imgFormat, aLength, 1, 0, NULL, &ret); if (NULL == aImg || CL_SUCCESS != ret) { print_error ("Failed to create a image: %d", ret); exit(EXIT_FAILURE); } bImg = clCreateImage2D(context, CL_MEM_READ_ONLY, &imgFormat, aLength, 1, 0, NULL, &ret); if (NULL == bImg || CL_SUCCESS != ret) { print_error ("Failed to create b image: %d", ret); exit(EXIT_FAILURE); } int res_bufSize = aLength; res_buf = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(cl_uint) * res_bufSize, NULL, &ret); if (NULL == res_buf || CL_SUCCESS != ret) { print_error ("Failed to create b buffer: %d", ret); exit(EXIT_FAILURE); } /* read the opencl program code into a string */ prgm_fptr = fopen(clFile, "r"); if (NULL == prgm_fptr) { print_error ("%s", strerror (errno)); exit(EXIT_FAILURE); } if (0 != stat(clFile, &prgm_sbuf)) { print_error ("%s", strerror (errno)); exit(EXIT_FAILURE); } prgm_size = prgm_sbuf.st_size; prgm_data = malloc(prgm_size); if (NULL == prgm_data) { print_error ("Out of memory"); exit(EXIT_FAILURE); } /* make sure all data is read from the file (just in case fread returns * short) */ offset = 0; while (prgm_size - offset != (count = fread(prgm_data + offset, 1, prgm_size - offset, prgm_fptr))) offset += count; if (0 != fclose(prgm_fptr)) { print_error ("%s", strerror (errno)); exit(EXIT_FAILURE); } /* create a 'program' from the source */ program = clCreateProgramWithSource(context, 1, (const char **) &prgm_data, &prgm_size, &ret); if (NULL == program || CL_SUCCESS != ret) { print_error ("Failed to create program with source: %d", ret); exit(EXIT_FAILURE); } /* compile the program.. (it uses llvm or something) */ ret = clBuildProgram(program, num_devices, devices, NULL, NULL, NULL); if (CL_SUCCESS != ret) { size_t size; char *log = calloc(1, 4000); if (NULL == log) { print_error ("Out of memory"); exit(EXIT_FAILURE); } print_error ("Failed to build program: %d", ret); ret = clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, 4096, log, &size); if (CL_SUCCESS != ret) { print_error ("Failed to get program build info: %d", ret); exit(EXIT_FAILURE); } fprintf(stderr, "Begin log:\n%s\nEnd log.\n", log); exit(EXIT_FAILURE); } /* pull out a reference to your kernel */ kernel = clCreateKernel(program, "cgm_kernel", &ret); if (NULL == kernel || CL_SUCCESS != ret) { print_error ("Failed to create kernel: %d", ret); exit(EXIT_FAILURE); } gettimeofday(&t1, NULL); /* write data to these buffers */ clEnqueueWriteImage(command_queue, aImg, CL_FALSE, origin, region, 0, 0, (void*) aImg, 0, NULL, NULL); clEnqueueWriteImage(command_queue, bImg, CL_FALSE, origin, region, 0, 0, (void*) bImg, 0, NULL, NULL); /* set your kernel's arguments */ ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), &aImg); if (CL_SUCCESS != ret) { print_error ("Failed to set kernel argument: %d", ret); exit(EXIT_FAILURE); } ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), &bImg); if (CL_SUCCESS != ret) { print_error ("Failed to set kernel argument: %d", ret); exit(EXIT_FAILURE); } ret = clSetKernelArg(kernel, 4, sizeof(int), &gap); if (CL_SUCCESS != ret) { print_error ("Failed to set kernel argument: %d", ret); exit(EXIT_FAILURE); } ret = clSetKernelArg(kernel, 5, sizeof(int), &myoffset); if (CL_SUCCESS != ret) { print_error ("Failed to set kernel argument: %d", ret); exit(EXIT_FAILURE); } ret = clSetKernelArg(kernel, 6, sizeof(int), &keyLength); if (CL_SUCCESS != ret) { print_error ("Failed to set kernel argument: %d", ret); exit(EXIT_FAILURE); } ret = clSetKernelArg(kernel, 7, sizeof(cl_mem), &res_buf); if (CL_SUCCESS != ret) { print_error ("Failed to set kernel argument: %d", ret); exit(EXIT_FAILURE); } /* make sure buffers have been written before executing */ ret = clEnqueueBarrier(command_queue); if (CL_SUCCESS != ret) { print_error ("Failed to enqueue barrier: %d", ret); exit(EXIT_FAILURE); } /* enque this kernel for execution... */ ret = clEnqueueNDRangeKernel(command_queue, kernel, 2, NULL, global_work_size, NULL, 0, NULL, NULL); if (CL_SUCCESS != ret) { print_error ("Failed to enqueue kernel: %d", ret); exit(EXIT_FAILURE); } /* wait for the kernel to finish executing */ ret = clEnqueueBarrier(command_queue); if (CL_SUCCESS != ret) { print_error ("Failed to enqueue barrier: %d", ret); exit(EXIT_FAILURE); } /* copy the contents of dev_buf from the CL device to the host (CPU) */ ret = clEnqueueReadBuffer(command_queue, res_buf, true, 0, sizeof(cl_uint) * aLength, results, 0, NULL, NULL); gettimeofday(&t2, NULL); elapsedTime = (t2.tv_sec - t1.tv_sec) * 1000.0; // sec to ms elapsedTime += (t2.tv_usec - t1.tv_usec) / 1000.0; // us to ms if (CL_SUCCESS != ret) { print_error ("Failed to copy data from device to host: %d", ret); exit(EXIT_FAILURE); } ret = clEnqueueBarrier(command_queue); if (CL_SUCCESS != ret) { print_error ("Failed to enqueue barrier: %d", ret); exit(EXIT_FAILURE); } /* make sure the content of the buffer are what we expect */ //for (i = 0; i < aLength; i++) // printf("%d\n", results[i]); /* free up resources */ ret = clReleaseKernel(kernel); if (CL_SUCCESS != ret) { print_error ("Failed to release kernel: %d", ret); exit(EXIT_FAILURE); } ret = clReleaseProgram(program); if (CL_SUCCESS != ret) { print_error ("Failed to release program: %d", ret); exit(EXIT_FAILURE); } ret = clReleaseMemObject(aImg); if (CL_SUCCESS != ret) { print_error ("Failed to release memory object: %d", ret); exit(EXIT_FAILURE); } ret = clReleaseMemObject(bImg); if (CL_SUCCESS != ret) { print_error ("Failed to release memory object: %d", ret); exit(EXIT_FAILURE); } ret = clReleaseMemObject(res_buf); if (CL_SUCCESS != ret) { print_error ("Failed to release memory object: %d", ret); exit(EXIT_FAILURE); } if (CL_SUCCESS != (ret = clReleaseCommandQueue(command_queue))) { print_error ("Failed to release command queue: %d", ret); exit(EXIT_FAILURE); } if (CL_SUCCESS != (ret = clReleaseContext(context))) { print_error ("Failed to release context: %d", ret); exit(EXIT_FAILURE); } matches = &results; return elapsedTime; }
static void clrpc_client_test2(void) { int err; int size = 1024; cl_uint nplatforms = 0; cl_platform_id* platforms = 0; cl_uint nplatforms_ret; clGetPlatformIDs(nplatforms,platforms,&nplatforms_ret); printf( "after call one i get nplatforms_ret = %d", nplatforms_ret); if (nplatforms_ret == 0) exit(1); nplatforms = nplatforms_ret; platforms = (cl_platform_id*)calloc(nplatforms,sizeof(cl_platform_id)); clGetPlatformIDs(nplatforms,platforms,&nplatforms_ret); int i; for(i=0;i<nplatforms;i++) { clrpc_dptr* tmp = ((_xobj_t*)platforms[i])->obj; int is_rpc; if ( clGetPlatformInfo(platforms[i],999,sizeof(cl_int),&is_rpc,0)==CL_SUCCESS) { printf( "platforms[%d] local=%p remote=%p\n", i,(void*)tmp->local, (void*)tmp->remote); } else { printf( "platforms[%d] not RPC\n",i); } } char buffer[1024]; size_t sz; cl_platform_id rpc_platform = 0; for(i=0;i<nplatforms;i++) { clGetPlatformInfo(platforms[i],CL_PLATFORM_NAME,1023,buffer,&sz); printf( "\n [%d] CL_PLATFORM_NAME|%ld:%s|\n",i,sz,buffer); } int iplat; for(iplat=0;iplat<nplatforms;iplat++) { printf("\n******************\nTEST PLATFORM %d\n*************\n\n",iplat); cl_uint ndevices = 0; cl_device_id* devices = 0; cl_uint ndevices_ret; clGetDeviceIDs(platforms[iplat],CL_DEVICE_TYPE_ALL, ndevices,devices,&ndevices_ret); printf( "after call one i get ndevices_ret = %d\n", ndevices_ret); if (ndevices_ret > 10) exit(-1); ndevices = ndevices_ret; devices = (cl_device_id*)calloc(ndevices,sizeof(cl_device_id)); clGetDeviceIDs(platforms[iplat],CL_DEVICE_TYPE_ALL, ndevices,devices,&ndevices_ret); if (!ndevices_ret) { //printf("no devices, stopping.\n"); //exit(1); printf("no devices, skipping.\n"); continue; } for(i=0;i<ndevices;i++) { clrpc_dptr* tmp = ((_xobj_t*)devices[i])->obj; clGetDeviceInfo(devices[i],CL_DEVICE_NAME,1023,buffer,&sz); printf( "CL_DEVICE_NAME |%s|\n",buffer); cl_platform_id tmpid; clGetDeviceInfo(devices[i],CL_DEVICE_PLATFORM,sizeof(tmpid),&tmpid,&sz); printf("%p\n",platforms[iplat]); fflush(stdout); printf("%p\n",tmpid); fflush(stdout); clGetPlatformInfo(tmpid,CL_PLATFORM_NAME,1023,buffer,&sz); printf( "\n [%d] CL_PLATFORM_NAME|%ld:%s|\n",i,sz,buffer); } cl_context_properties ctxprop[] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platforms[iplat], 0 }; printf("i am setting this: prop[%d] %p\n",iplat,platforms[iplat]); cl_context ctx = clCreateContext(ctxprop,ndevices,devices, 0,0,&err); cl_command_queue* cmdq = (cl_command_queue*) calloc(ndevices,sizeof(cl_command_queue)); for(i=0;i<ndevices;i++) { cmdq[i] = clCreateCommandQueue(ctx,devices[i],0,&err); printf( "cmdq %d %p",i,cmdq[i]); } cl_mem a_buf = clCreateBuffer(ctx,CL_MEM_READ_WRITE,size*sizeof(int), 0,&err); cl_mem b_buf = clCreateBuffer(ctx,CL_MEM_READ_WRITE,size*sizeof(int), 0,&err); cl_mem c_buf = clCreateBuffer(ctx,CL_MEM_READ_WRITE,size*sizeof(int), 0,&err); cl_mem d_buf = clCreateBuffer(ctx,CL_MEM_READ_WRITE,size*sizeof(int), 0,&err); int* a = (int*)malloc(1024*sizeof(int)); int* b = (int*)malloc(1024*sizeof(int)); int* c = (int*)malloc(1024*sizeof(int)); int* d = (int*)malloc(1024*sizeof(int)); char* prgsrc[] = { "__kernel void my_kern( int n, __global int* a, __global int* b )\n" " { int i = get_global_id(0); int tmp = 0; int j; for(j=0;j<n;j++) tmp += a[i] * a[j]; b[i] = tmp; }\n" }; size_t prgsrc_sz = strlen(prgsrc[0]) + 1; cl_program prg = clCreateProgramWithSource(ctx,1, (const char**)prgsrc,&prgsrc_sz,&err); clBuildProgram(prg,ndevices,devices,0,0,0); cl_kernel krn = clCreateKernel(prg,"my_kern",&err); int idev; for(idev=0;idev<ndevices;idev++) { printf("\n******************\nTEST DEVICE %d(%d)\n*************\n\n",idev,iplat); for(i=0;i<size;i++) a[i] = i*10; for(i=0;i<size;i++) b[i] = i*10+1; for(i=0;i<size;i++) c[i] = 0; for(i=0;i<size;i++) d[i] = 0; cl_event ev[8]; for(i=0;i<32;i++) printf("%d/",a[i]); printf("\n"); for(i=0;i<32;i++) printf("%d/",b[i]); printf("\n"); clEnqueueWriteBuffer(cmdq[idev],a_buf,CL_FALSE,0,size*sizeof(int),a, 0,0,&ev[0]); clEnqueueWriteBuffer(cmdq[idev],b_buf,CL_FALSE,0,size*sizeof(int),b, 1,ev,&ev[1]); clEnqueueWriteBuffer(cmdq[idev],c_buf,CL_FALSE,0,size*sizeof(int),c, 2,ev,&ev[2]); clEnqueueWriteBuffer(cmdq[idev],d_buf,CL_FALSE,0,size*sizeof(int),d, 3,ev,&ev[3]); size_t offset = 0; size_t gwsz = 128; size_t lwsz = 16; clSetKernelArg(krn,0,sizeof(int),&size); clSetKernelArg(krn,1,sizeof(cl_mem),&a_buf); clSetKernelArg(krn,2,sizeof(cl_mem),&c_buf); clEnqueueNDRangeKernel(cmdq[idev],krn,1,&offset,&gwsz,&lwsz,4,ev,&ev[4]); clSetKernelArg(krn,1,sizeof(cl_mem),&b_buf); clSetKernelArg(krn,2,sizeof(cl_mem),&d_buf); clEnqueueNDRangeKernel(cmdq[idev],krn,1,&offset,&gwsz,&lwsz,5,ev,&ev[5]); clEnqueueReadBuffer(cmdq[idev],c_buf,CL_FALSE,0,size*sizeof(int),c, 6,ev,&ev[6]); clEnqueueReadBuffer(cmdq[idev],d_buf,CL_FALSE,0,size*sizeof(int),d, 7,ev,&ev[7]); clFlush(cmdq[idev]); clWaitForEvents(8,ev); for(i=0;i<32;i++) printf("%d/",c[i]); printf("\n"); for(i=0;i<32;i++) printf("%d/",d[i]); printf("\n"); for(i=0;i<8;i++) clReleaseEvent(ev[i]); } clReleaseKernel(krn); clReleaseProgram(prg); clReleaseMemObject(a_buf); clReleaseMemObject(b_buf); clReleaseMemObject(c_buf); clReleaseMemObject(d_buf); clReleaseCommandQueue(cmdq[0]); clReleaseContext(ctx); // printf("sleeping ...\n"); // sleep(1); } // clrpc_final(); }
//////////////////////////////////////////////////////////////////////////////////// // Measure the local memoy to local memoy bandwidth. //////////////////////////////////////////////////////////////////////////////////// int measureLocalMemory(cl_device_id device_id, cl_context context, cl_command_queue commands, unsigned int type, int f4, unsigned int elements, unsigned int iterations, bool larg, double time_taken[2]) { cl_int err = CL_SUCCESS; const char* source_path = "mem_streaming.cl"; char buf[512]; int elementsToAlloc = elements; size_t local, global; for(size_t ws = 0; ws <= 1; ++ws) { if(ws == 0) { // Execute the kernel using just one single workitem local = 1; global = 1; } else { // Execute the kernel using the max number of threads on each processor _DEVICE_INFO* info = get_device_info(device_id); size_t* tmp = info->max_work_item_sizes; local = tmp[0]; free(tmp); global = info->max_compute_units; while(local > elements) local /= 2; global *= local; } if(type == 1) elementsToAlloc = (elements + local-1)/local; if(f4 == 0) sprintf(buf, "#define dtype float\n"); else sprintf(buf, "#define dtype float%d\n", (int)pow(2.0, f4)); sprintf(buf+strlen(buf), "#define VEC %d\n#define ELEMENTS %d\n#define localRange %lu\n", f4, elementsToAlloc, local); if(larg) sprintf(buf+strlen(buf), "#define LARG\n"); cl_program program = load_kernel(source_path, context, buf); if(!program) { fprintf(stderr, "Error: Failed to create compute program!\n"); return 1; } // Build the program executable err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); if(err != CL_SUCCESS) { size_t len; char buffer[8096]; fprintf(stderr, "Error: Failed to build program executable!\n"); clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len); fprintf(stderr, "%s\n", buffer); return 1; } // Create the compute kernel cl_kernel kernel; switch(type) { case 1: kernel = clCreateKernel(program, "private_mem", &err); break; case 2: kernel = clCreateKernel(program, "global_mem", &err); break; default: kernel = clCreateKernel(program, "local_mem", &err); } if (!kernel || err != CL_SUCCESS) { fprintf(stderr, "Error: Failed to create compute kernel!\n"); return 1; } float* hOutput = (float*)malloc(global * sizeof(float)); memset(hOutput, 0, global * sizeof(float)); cl_mem output = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(float) * global, hOutput, NULL); if (!output || err != CL_SUCCESS) { fprintf(stderr, "Error: Failed to allocate device memory!\n"); return 1; } // Set the arguments to our compute kernel err = CL_SUCCESS; err |= clSetKernelArg(kernel, 0, sizeof(cl_mem), &output); cl_mem g1, g2; switch(type) { case 1: break; case 2: switch(f4) { case(1): g1 = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_float2) * elements, NULL, NULL); g2 = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_float2) * elements*2, NULL, NULL); break; case(2): g1 = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_float4) * elements, NULL, NULL); g2 = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_float4) * elements*2, NULL, NULL); break; case(3): g1 = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_float8) * elements, NULL, NULL); g2 = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_float8) * elements*2, NULL, NULL); break; case(4): g1 = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_float16) * elements, NULL, NULL); g2 = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_float16) * elements*2, NULL, NULL); break; default: g1 = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_float) * elements, NULL, NULL); g2 = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_float) * elements*2, NULL, NULL); break; break; } err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &g1); err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &g2); break; default: if(larg) switch(f4) { case(1): err |= clSetKernelArg(kernel, 1, sizeof(cl_float2)*elements, NULL); err |= clSetKernelArg(kernel, 2, sizeof(cl_float2)*elements*2, NULL); break; case(2): err |= clSetKernelArg(kernel, 1, sizeof(cl_float4)*elements, NULL); err |= clSetKernelArg(kernel, 2, sizeof(cl_float4)*elements*2, NULL); break; case(3): err |= clSetKernelArg(kernel, 1, sizeof(cl_float8)*elements, NULL); err |= clSetKernelArg(kernel, 2, sizeof(cl_float8)*elements*2, NULL); break; case(4): err |= clSetKernelArg(kernel, 1, sizeof(cl_float8)*elements, NULL); err |= clSetKernelArg(kernel, 2, sizeof(cl_float8)*elements*2, NULL); break; default: err |= clSetKernelArg(kernel, 1, sizeof(cl_float)*elements, NULL); err |= clSetKernelArg(kernel, 2, sizeof(cl_float)*elements*2, NULL); break; break; } } if (err != CL_SUCCESS) { fprintf(stderr, "Error: Failed to set kernel arguments! %d\n", err); return 1; } // warmup for(unsigned i = 0; i < WARMUP_CYCLES; ++i) { err = clEnqueueNDRangeKernel(commands, kernel, 1, NULL, &global, &local, 0, NULL, NULL); clFinish(commands); } // start actual measurement unsigned long start_time = current_msecs(); for(unsigned i = 0; i < iterations; ++i) { err = clEnqueueNDRangeKernel(commands, kernel, 1, NULL, &global, &local, 0, NULL, NULL); if (err) { fprintf(stderr, "Error %i: Failed to execute kernel!\n%s\n", err, oclErrorString(err)); return 1; } clFlush(commands); } clFinish(commands); time_taken[ws] = elapsed_msecs(start_time) / 1000.0; /* cl_event read; err = clEnqueueReadBuffer(commands, output, CL_FALSE, 0, global*sizeof(float), hOutput, 0, NULL, &read); if (err) { fprintf(stderr, "Error %i: Failed read buffer!\n%s\n", err, oclErrorString(err)); return 1; } clWaitForEvents(1, &read); for(size_t i = 0; i < global; ++i) printf(", %d %f ", i, hOutput[i]); printf("\n\n"); */ free(hOutput); clReleaseMemObject(output); if(type == 2) { clReleaseMemObject(g1); clReleaseMemObject(g2); } clReleaseProgram(program); clReleaseKernel(kernel); } return err; }
inline void vector_sum(const int arraySize, const double* inputA, const double* inputB, double* output) { /* Allocate memory buffers */ /* * Ask the OpenCL implementation to allocate buffers for the data. * We ask the OpenCL implemenation to allocate memory rather than * allocating it on the CPU to avoid having to copy the data later. * The read/write flags relate to accesses to the memory from within * the kernel. */ bool createMemoryObjectSuccess = true; int numberOfMemoryObjects = 3; cl_mem memoryObjects[3] = {0, 0, 0}; int errorNumber = 0; int bufferSize = arraySize*sizeof(double); memoryObjects[0] = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, bufferSize, (void*)inputA, &errorNumber); checkErr(errorNumber, "Failed to create buffer, 1."); memoryObjects[1] = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, bufferSize, (void*)inputB, &errorNumber); checkErr(errorNumber, "Failed to create buffer, 2."); memoryObjects[2] = clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, bufferSize, output, &errorNumber); checkErr(errorNumber, "Failed to create buffer, 3."); /* Enqueue commands and kernels */ /* Enqueue to the command queues the commands that control the sequence * and synchronization of kernel execution, reading and writing of data, * and manipulation of memory objects */ /* Execute a kernel function */ /* Call clSetKernelArg() for each parameter in the kernel */ bool setKernelArgumentsSuccess = true; setKernelArgumentsSuccess &= checkSuccess(clSetKernelArg(kernel, 0, sizeof(cl_mem), &memoryObjects[0])); setKernelArgumentsSuccess &= checkSuccess(clSetKernelArg(kernel, 1, sizeof(cl_mem), &memoryObjects[1])); setKernelArgumentsSuccess &= checkSuccess(clSetKernelArg(kernel, 2, sizeof(cl_mem), &memoryObjects[2])); if (not setKernelArgumentsSuccess) { cleanUpOpenCL(); std::cerr << "Failed setting OpenCL kernel arguments. " << __FILE__ << ":"<< __LINE__ << std::endl; exit(1); } /* Determine the work-group size and index space for the kernel */ const size_t globalWorkSize[1] = {arraySize}; const size_t localWorkSize[1] = { 1 }; /* Enqueue the kernel for execution in the command queue */ //for (int j = 0; j < ITER; j++) { if (not checkSuccess(clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL))) { cleanUpOpenCL(); std::cerr << "Failed enqueuing the kernel. " << __FILE__ << ":" << __LINE__ <<std::endl; exit(1); } //} /* Get a pointer to the output data */ output = (double*)clEnqueueMapBuffer(commandQueue, memoryObjects[2], CL_TRUE, CL_MAP_READ, 0, arraySize, 0, NULL, NULL, &errorNumber); if (not checkSuccess(errorNumber)) { cleanUpOpenCL(); std::cerr << "Failed to map buffer " << __FILE__ << ":" << __LINE__ << std::endl; exit(1); } /* Wait for kernel execution */ if (not checkSuccess(clFinish(commandQueue))) { cleanUpOpenCL(); std::cerr << "Failed waiting for kernel execution to finish. " << __FILE__ << ":"<< __LINE__ << std::endl; exit(1); } /* Unmap the memory objects as we finished using them in the CPU */ if (not checkSuccess(clReleaseMemObject(memoryObjects[0]))) { cleanUpOpenCL(); std::cerr << "Unmapping memory objects failed " << __FILE__ << ":" << __LINE__ << std::endl; exit(1); } if (not checkSuccess(clReleaseMemObject(memoryObjects[1]))) { cleanUpOpenCL(); std::cerr << "Unmapping memory objects failed " << __FILE__ << ":" << __LINE__ << std::endl; exit(1); } if (not checkSuccess(clEnqueueUnmapMemObject(commandQueue, memoryObjects[2], output, 0, NULL, NULL))) { cleanUpOpenCL(); std::cerr << "Unmapping memory objects failed " << __FILE__ << ":" << __LINE__ << std::endl; exit(1); } }
END_TEST START_TEST (test_read_write_image) { cl_device_id device; cl_context ctx; cl_command_queue queue; cl_mem image2d, part2d; cl_int result; cl_platform_id platform = 0; cl_uint num_platforms = 0; clGetPlatformIDs(1, &platform, &num_platforms); unsigned char image2d_data_24bpp[3*3*4] = { 255, 0, 0, 0, 0, 255, 0, 0, 128, 128, 128, 0, 0, 0, 255, 0, 255, 255, 0, 0, 0, 128, 0, 0, 255, 128, 0, 0, 128, 0, 255, 0, 0, 0, 0, 0 }; unsigned char image2d_part_24bpp[2*2*4] = { 255, 0, 0, 0, 0, 255, 0, 0, 0, 0, 255, 0, 255, 255, 0, 0 }; unsigned char image2d_buffer[3*3*4]; unsigned char image2d_part[2*2*4]; cl_image_format fmt; fmt.image_channel_data_type = CL_UNORM_INT8; fmt.image_channel_order = CL_RGBA; size_t origin[3] = {0, 0, 0}; size_t region[3] = {3, 3, 1}; result = clGetDeviceIDs(platform, CL_DEVICE_TYPE_DEFAULT, 1, &device, 0); fail_if( result != CL_SUCCESS, "unable to get the default device" ); ctx = clCreateContext(0, 1, &device, 0, 0, &result); fail_if( result != CL_SUCCESS || ctx == 0, "unable to create a valid context" ); queue = clCreateCommandQueue(ctx, device, 0, &result); fail_if( result != CL_SUCCESS || queue == 0, "cannot create a command queue" ); image2d = clCreateImage2D(ctx, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, &fmt, 3, 3, 0, image2d_buffer, &result); fail_if( result != CL_SUCCESS || image2d == 0, "cannot create a valid 3x3 image2D" ); part2d = clCreateImage2D(ctx, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, &fmt, 2, 2, 0, image2d_part, &result); fail_if( result != CL_SUCCESS || image2d == 0, "cannot create a valid 2x2 image2D" ); // Write data in buffer result = clEnqueueWriteImage(queue, image2d, 1, origin, region, 0, 0, image2d_data_24bpp, 0, 0, 0); fail_if( result != CL_SUCCESS, "cannot enqueue a blocking write image event" ); // Read it back region[0] = 2; region[1] = 2; result = clEnqueueReadImage(queue, image2d, 1, origin, region, 0, 0, image2d_part, 0, 0, 0); fail_if( result != CL_SUCCESS, "cannot enqueue a blocking read image event" ); // Compare #if 0 // images not supported fail_if( std::memcmp(image2d_part, image2d_part_24bpp, sizeof(image2d_part)) != 0, "reading and writing images doesn't produce the correct result" ); #endif // Read it back using a buffer cl_event event; std::memset(image2d_part, 0, sizeof(image2d_part)); result = clEnqueueCopyImage(queue, image2d, part2d, origin, origin, region, 0, 0, &event); fail_if( result != CL_SUCCESS, "unable to enqueue a copy image event" ); result = clWaitForEvents(1, &event); fail_if( result != CL_SUCCESS, "unable to wait for events" ); // Compare #if 0 // images not supported fail_if( std::memcmp(image2d_part, image2d_part_24bpp, sizeof(image2d_part)) != 0, "copying images doesn't produce the correct result" ); #endif clReleaseEvent(event); clReleaseMemObject(part2d); clReleaseMemObject(image2d); clReleaseCommandQueue(queue); clReleaseContext(ctx); }
int main(int argc, char **argv) { /* test name */ char name[] = "test_sampler_address_clamp"; size_t global_work_size[1] = { 1 }, local_work_size[1]= { 1 }; size_t srcdir_length, name_length, filename_size; char *filename = NULL; char *source = NULL; cl_device_id devices[1]; cl_context context = NULL; cl_command_queue queue = NULL; cl_program program = NULL; cl_kernel kernel = NULL; cl_int result; int retval = -1; /* image parameters */ cl_uchar4 *imageData; cl_image_format image_format; cl_image_desc image_desc; printf("Running test %s...\n", name); memset(&image_desc, 0, sizeof(cl_image_desc)); image_desc.image_type = CL_MEM_OBJECT_IMAGE2D; image_desc.image_width = 4; image_desc.image_height = 4; image_format.image_channel_order = CL_RGBA; image_format.image_channel_data_type = CL_UNSIGNED_INT8; imageData = (cl_uchar4*)malloc (4 * 4 * sizeof(cl_uchar4)); if (imageData == NULL) { puts("out of host memory\n"); goto error; } memset (imageData, 1, 4*4*sizeof(cl_uchar4)); /* determine file name of kernel source to load */ srcdir_length = strlen(SRCDIR); name_length = strlen(name); filename_size = srcdir_length + name_length + 16; filename = (char *)malloc(filename_size + 1); if (!filename) { puts("out of memory"); goto error; } snprintf(filename, filename_size, "%s/%s.cl", SRCDIR, name); /* read source code */ source = poclu_read_file (filename); TEST_ASSERT (source != NULL && "Kernel .cl not found."); /* setup an OpenCL context and command queue using default device */ context = poclu_create_any_context(); if (!context) { puts("clCreateContextFromType call failed\n"); goto error; } result = clGetContextInfo(context, CL_CONTEXT_DEVICES, sizeof(cl_device_id), devices, NULL); if (result != CL_SUCCESS) { puts("clGetContextInfo call failed\n"); goto error; } queue = clCreateCommandQueue(context, devices[0], 0, NULL); if (!queue) { puts("clCreateCommandQueue call failed\n"); goto error; } /* Create image */ cl_mem image = clCreateImage (context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, &image_format, &image_desc, imageData, &result); if (result != CL_SUCCESS) { puts("image creation failed\n"); goto error; } /* create and build program */ program = clCreateProgramWithSource (context, 1, (const char **)&source, NULL, NULL); if (!program) { puts("clCreateProgramWithSource call failed\n"); goto error; } result = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); if (result != CL_SUCCESS) { puts("clBuildProgram call failed\n"); goto error; } /* execute the kernel with give name */ kernel = clCreateKernel(program, name, NULL); if (!kernel) { puts("clCreateKernel call failed\n"); goto error; } result = clSetKernelArg( kernel, 0, sizeof(cl_mem), &image); if (result) { puts("clSetKernelArg failed\n"); goto error; } result = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, NULL); if (result != CL_SUCCESS) { puts("clEnqueueNDRangeKernel call failed\n"); goto error; } result = clFinish(queue); if (result == CL_SUCCESS) retval = 0; error: if (image) { clReleaseMemObject (image); } if (kernel) { clReleaseKernel(kernel); } if (program) { clReleaseProgram(program); } if (queue) { clReleaseCommandQueue(queue); } if (context) { clUnloadCompiler (); clReleaseContext (context); } if (source) { free(source); } if (filename) { free(filename); } if (imageData) { free(imageData); } if (retval) { printf("FAIL\n"); return 1; } printf("OK\n"); return 0; }
END_TEST START_TEST (test_copy_image_buffer) { cl_device_id device; cl_context ctx; cl_command_queue queue; cl_mem image, buffer; cl_int result; cl_event event; cl_platform_id platform = 0; cl_uint num_platforms = 0; clGetPlatformIDs(1, &platform, &num_platforms); unsigned char image_buffer[3*3*4] = { 255, 0, 0, 0, 0, 255, 0, 0, 0, 0, 255, 0, 128, 0, 0, 0, 0, 128, 0, 0, 0, 0, 128, 0, 64, 0, 0, 0, 0, 64, 0, 0, 0, 0, 64, 0 }; // Square that will be put in image_buffer at (1, 0) unsigned char buffer_buffer[2*2*4+1] = { 33, // Oh, a padding ! 255, 255, 255, 0, 255, 0, 255, 0, 0, 255, 255, 0, 255, 255, 0, 0 }; // What we must get once re-reading 2x2 rect at (1, 1) unsigned char correct_data[2*2*4] = { 0, 255, 255, 0, 255, 255, 0, 0, 0, 64, 0, 0, 0, 0, 64, 0 }; cl_image_format fmt; fmt.image_channel_data_type = CL_UNORM_INT8; fmt.image_channel_order = CL_RGBA; size_t origin[3] = {1, 0, 0}; size_t region[3] = {2, 2, 1}; result = clGetDeviceIDs(platform, CL_DEVICE_TYPE_DEFAULT, 1, &device, 0); fail_if( result != CL_SUCCESS, "unable to get the default device" ); ctx = clCreateContext(0, 1, &device, 0, 0, &result); fail_if( result != CL_SUCCESS || ctx == 0, "unable to create a valid context" ); queue = clCreateCommandQueue(ctx, device, 0, &result); fail_if( result != CL_SUCCESS || queue == 0, "cannot create a command queue" ); image = clCreateImage2D(ctx, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, &fmt, 3, 3, 0, image_buffer, &result); fail_if( result != CL_SUCCESS, "unable to create a 3x3 bgra image" ); buffer = clCreateBuffer(ctx, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, sizeof(buffer_buffer), buffer_buffer, &result); fail_if( result != CL_SUCCESS, "unable to create a buffer object" ); // Write buffer in image result = clEnqueueCopyBufferToImage(queue, buffer, image, 1, origin, region, 0, 0, &event); fail_if( result != CL_SUCCESS, "unable to queue a copy buffer to image event, buffer offset 1, image 2x2 @ (1, 0)" ); result = clWaitForEvents(1, &event); fail_if( result != CL_SUCCESS, "cannot wait for event" ); clReleaseEvent(event); // Read it back into buffer, again with an offset origin[1] = 1; result = clEnqueueCopyImageToBuffer(queue, image, buffer, origin, region, 1, 0, 0, &event); fail_if( result != CL_SUCCESS, "unable to queue a copy image to buffer event, buffer offset 1, image 2x2 @ (1, 1)" ); result = clWaitForEvents(1, &event); fail_if( result != CL_SUCCESS, "cannot wait for event" ); #if 0 // images not supported fail_if( std::memcmp(buffer_buffer + 1, correct_data, sizeof(correct_data)) != 0, "copying data around isn't working the expected way" ); #endif // Map the image and check pointers unsigned char *mapped; size_t row_pitch; origin[0] = 0; origin[1] = 0; origin[2] = 0; mapped = (unsigned char *)clEnqueueMapImage(queue, image, 1, CL_MAP_READ, origin, region, &row_pitch, 0, 0, 0, 0, &result); fail_if( result != CL_SUCCESS, "unable to map an image" ); #if 0 // images not supported fail_if( mapped != image_buffer, "mapped aread doesn't match host ptr" ); #endif clReleaseEvent(event); clReleaseMemObject(image); clReleaseMemObject(buffer); clReleaseCommandQueue(queue); clReleaseContext(ctx); }
static cl_int opencl_plugin_init_mesh_buffers(opencl_plugin plugin, cl_int mesh_data_count, mesh_data *mesh_data_list) { cl_int err; cl_int i; cl_mem new_vertex_buffer = NULL, new_triangle_buffer = NULL; cl_int total_num_vertices = 0, total_num_triangles = 0; assert(plugin != NULL); assert(mesh_data_count >= 0); assert(mesh_data_list != NULL); for (i = 0; i < mesh_data_count; i++) { total_num_vertices += mesh_data_list[i].num_vertices; total_num_triangles += mesh_data_list[i].num_triangles; } if (total_num_vertices > plugin->vertex_buffer_capacity) { /* Current buffer not big enough, free old buffer first */ if (plugin->vertex_buffer) { clReleaseMemObject(plugin->vertex_buffer); plugin->vertex_buffer = NULL; } plugin->vertex_buffer_capacity = 0; /* TODO: Maybe better dynamic resizing (factor = 1.5)? */ new_vertex_buffer = clCreateBuffer(plugin->context, CL_MEM_READ_ONLY, sizeof(float) * 3 * total_num_vertices, NULL, &err); CHECK_CL_ERROR(err); plugin->vertex_buffer_capacity = total_num_vertices; plugin->vertex_buffer = new_vertex_buffer; new_vertex_buffer = NULL; } if (total_num_triangles > plugin->triangle_buffer_capacity) { /* Current buffer not big enough, free old buffer first */ if (plugin->triangle_buffer) { clReleaseMemObject(plugin->triangle_buffer); plugin->triangle_buffer = NULL; } plugin->triangle_buffer_capacity = 0; /* TODO: Maybe better dynamic resizing (factor = 1.5)? */ new_triangle_buffer = clCreateBuffer(plugin->context, CL_MEM_READ_ONLY, sizeof(cl_int) * 3 * total_num_triangles, NULL, &err); CHECK_CL_ERROR(err); plugin->triangle_buffer_capacity = total_num_triangles; plugin->triangle_buffer = new_triangle_buffer; new_triangle_buffer = NULL; } total_num_vertices = 0; total_num_triangles = 0; for (i = 0; i < mesh_data_count; i++) { mesh_data *mesh_data = &mesh_data_list[i]; err = clEnqueueWriteBuffer( plugin->queue, plugin->vertex_buffer, CL_FALSE, sizeof(float) * 3 * total_num_vertices, sizeof(float) * 3 * mesh_data->num_vertices, mesh_data->vertices, 0, NULL, NULL); CHECK_CL_ERROR(err); err = clEnqueueWriteBuffer( plugin->queue, plugin->triangle_buffer, CL_FALSE, sizeof(cl_int) * 3 * total_num_triangles, sizeof(cl_int) * 3 * mesh_data->num_triangles, mesh_data->triangles, 0, NULL, NULL); CHECK_CL_ERROR(err); total_num_vertices += mesh_data_list[i].num_vertices; total_num_triangles += mesh_data_list[i].num_triangles; } /* Wait for all buffer writes to finish, TODO: investigate this further */ err = clFinish(plugin->queue); CHECK_CL_ERROR(err); return 0; error: if (new_vertex_buffer) clReleaseMemObject(new_vertex_buffer); if (new_triangle_buffer) clReleaseMemObject(new_triangle_buffer); return -1; }
/** * @brief Main principal * @param argc El número de argumentos del programa * @param argv Cadenas de argumentos del programa * @return Nada si es correcto o algún número negativo si es incorrecto */ int main( int argc, char** argv ) { if(argc != 2) return -1; // Medimos tiempo para el programa const double start_time = getCurrentTimestamp(); FILE *kernels; char *source_str; size_t source_size, work_items; // OpenCL runtime configuration unsigned num_devices; cl_platform_id platform_ids[3]; cl_uint ret_num_platforms; cl_device_id device_id; cl_context context = NULL; cl_command_queue command_queue; cl_program program = NULL; cl_int ret; cl_kernel kernelINIT; cl_event kernel_event, finish_event; cl_mem objPARTICULAS; // Abrimos el fichero que contiene el kernel fopen_s(&kernels, "initparticulasCPU.cl", "r"); if (!kernels) { fprintf(stderr, "Fallo al cargar el kernel\n"); exit(-1); } source_str = (char *) malloc(0x100000); source_size = fread(source_str, 1, 0x100000, kernels); fclose(kernels); // Obtenemos los IDs de las plataformas disponibles if( clGetPlatformIDs(3, platform_ids, &ret_num_platforms) != CL_SUCCESS) { printf("No se puede obtener id de la plataforma"); return -1; } // Intentamos obtener un dispositivo CPU soportado if( clGetDeviceIDs(platform_ids[1], CL_DEVICE_TYPE_CPU, 1, &device_id, &num_devices) != CL_SUCCESS) { printf("No se puede obtener id del dispositivo"); return -1; } clGetDeviceInfo(device_id, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), &work_items, NULL); // Creación de un contexto OpenCL context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret); // Creación de una cola de comandos command_queue = clCreateCommandQueue(context, device_id, CL_QUEUE_PROFILING_ENABLE, &ret); // Creación de un programa kernel desde un fichero de código program = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &ret); ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL); if (ret != CL_SUCCESS) { size_t len; char buffer[2048]; printf("Error: ¡Fallo al construir el programa ejecutable!\n"); clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len); printf("%s", buffer); exit(-1); } // Creación del kernel OpenCL kernelINIT = clCreateKernel(program, "calc_particles_init", &ret); // Creamos el buffer para las partÃculas y reservamos espacio ALINEADO para los datos size_t N = atoi(argv[1]); particle *particulas = (particle*) _aligned_malloc(N * sizeof(particle), 64); objPARTICULAS = clCreateBuffer(context, CL_MEM_WRITE_ONLY, N * sizeof(particle), NULL, &ret); const size_t global = 4; const size_t local_work_size = 1; // Transferimos el frame al dispositivo cl_event write_event; ret = clEnqueueWriteBuffer(command_queue, objPARTICULAS, CL_FALSE, 0, N * sizeof(particle), particulas, 0, NULL, &write_event); // Establecemos los argumentos del kernel ret = clSetKernelArg(kernelINIT, 0, sizeof(cl_mem), &objPARTICULAS); ret = clSetKernelArg(kernelINIT, 1, sizeof(int), &N); // Ejecutamos el kernel. Un work-item por cada work-group o unidad de cómputo ret = clEnqueueNDRangeKernel(command_queue, kernelINIT, 1, NULL, &global, &local_work_size, 1, &write_event, &kernel_event); // Leemos los resultados ret = clEnqueueReadBuffer(command_queue, objPARTICULAS, CL_FALSE, 0, N * sizeof(particle), particulas, 1, &kernel_event, &finish_event); // Esperamos a que termine de leer los resultados clWaitForEvents(1, &finish_event); // Obtenemos el tiempo del kernel y de las transferencias CPU-RAM cl_ulong totalKernel = getStartEndTime(kernel_event); cl_ulong totalRam = getStartEndTime(write_event) + getStartEndTime(finish_event); const double end_time = getCurrentTimestamp(); // Obtenemos el tiempo consumido por el programa, el kernel y las transferencias de memoria printf("\nTiempo total del programa: %0.3f ms\n", (end_time - start_time) * 1e3); printf("Tiempo total consumido por el kernel: %0.3f ms\n", double(totalKernel) * 1e-6); printf("Tiempo total consumido en transferencias CPU-RAM: %0.3f ms\n", double(totalRam) * 1e-6); // Liberamos todos los recursos usados (kernels y objetos OpenCL) clReleaseEvent(kernel_event); clReleaseEvent(finish_event); clReleaseEvent(write_event); clReleaseMemObject(objPARTICULAS); clReleaseKernel(kernelINIT); clReleaseCommandQueue(command_queue); clReleaseProgram(program); clReleaseContext(context); }
bool runTest( int argc, const char** argv, ReduceType datatype) { int size = 1<<24; // number of elements to reduce int maxThreads; cl_kernel reductionKernel = getReductionKernel(datatype, 0, 64, 1); clReleaseKernel(reductionKernel); if (smallBlock) maxThreads = 64; // number of threads per block else maxThreads = 128; int whichKernel = 6; int maxBlocks = 64; bool cpuFinalReduction = false; int cpuFinalThreshold = 1; shrGetCmdLineArgumenti( argc, (const char**) argv, "n", &size); shrGetCmdLineArgumenti( argc, (const char**) argv, "threads", &maxThreads); shrGetCmdLineArgumenti( argc, (const char**) argv, "kernel", &whichKernel); shrGetCmdLineArgumenti( argc, (const char**) argv, "maxblocks", &maxBlocks); shrLog(" %d elements\n", size); shrLog(" %d threads (max)\n", maxThreads); cpuFinalReduction = (shrCheckCmdLineFlag( argc, (const char**) argv, "cpufinal") == shrTRUE); shrGetCmdLineArgumenti( argc, (const char**) argv, "cputhresh", &cpuFinalThreshold); bool runShmoo = (shrCheckCmdLineFlag(argc, (const char**) argv, "shmoo") == shrTRUE); #ifdef GPU_PROFILING if (runShmoo) { shmoo<T>(1, 33554432, maxThreads, maxBlocks, datatype); return true; } else #endif { // create random input data on CPU unsigned int bytes = size * sizeof(T); T* h_idata = (T*)malloc(bytes); for(int i=0; i<size; i++) { // Keep the numbers small so we don't get truncation error in the sum if (datatype == REDUCE_INT) h_idata[i] = (T)(rand() & 0xFF); else h_idata[i] = (rand() & 0xFF) / (T)RAND_MAX; } int numBlocks = 0; int numThreads = 0; getNumBlocksAndThreads(whichKernel, size, maxBlocks, maxThreads, numBlocks, numThreads); if (numBlocks == 1) cpuFinalThreshold = 1; shrLog(" %d blocks\n\n", numBlocks); // allocate mem for the result on host side T* h_odata = (T*)malloc(numBlocks * sizeof(T)); // allocate device memory and data cl_mem d_idata = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, bytes, h_idata, NULL); cl_mem d_odata = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE, numBlocks * sizeof(T), NULL, NULL); int testIterations = 100; double dTotalTime = 0.0; T gpu_result = 0; gpu_result = profileReduce<T>(datatype, size, numThreads, numBlocks, maxThreads, maxBlocks, whichKernel, testIterations, cpuFinalReduction, cpuFinalThreshold, &dTotalTime, h_odata, d_idata, d_odata); #ifdef GPU_PROFILING double reduceTime = dTotalTime/(double)testIterations; shrLogEx(LOGBOTH | MASTER, 0, "oclReduction, Throughput = %.4f GB/s, Time = %.5f s, Size = %u Elements, NumDevsUsed = %d, Workgroup = %u\n", 1.0e-9 * ((double)bytes)/reduceTime, reduceTime, size, 1, numThreads); #endif // compute reference solution shrLog("\nComparing against Host/C++ computation...\n"); T cpu_result = reduceCPU<T>(h_idata, size); if (datatype == REDUCE_INT) { shrLog(" GPU result = %d\n", gpu_result); shrLog(" CPU result = %d\n\n", cpu_result); shrLog("%s\n\n", (gpu_result == cpu_result) ? "PASSED" : "FAILED"); } else { shrLog(" GPU result = %.9f\n", gpu_result); shrLog(" CPU result = %.9f\n\n", cpu_result); double threshold = (datatype == REDUCE_FLOAT) ? 1e-8 * size : 1e-12; double diff = abs((double)gpu_result - (double)cpu_result); shrLog("%s\n\n", (diff < threshold) ? "PASSED" : "FAILED"); } // cleanup free(h_idata); free(h_odata); clReleaseMemObject(d_idata); clReleaseMemObject(d_odata); return (gpu_result == cpu_result); } }
int main() { int i,j,k; // nb of operations: const int dsize = 512; int nthreads = 1; int nbOfAverages = 1e2; int opsMAC = 2; // operations per MAC cl_short4 *in, *out; cl_half *ck; double tops; //total ops #define NQUEUES 1 cl_int err; cl_platform_id platform = 0; cl_device_id device = 0; cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 }; cl_context ctx = 0; cl_command_queue queues[NQUEUES]; cl_mem bufin, bufck, bufout; cl_event event = NULL; cl_program program; cl_kernel kernel; size_t global[2], local[2]; size_t param[5]; char version[300]; // allocate matrices in = (cl_short4 *) calloc(dsize*dsize, sizeof(*in)); out = (cl_short4 *) calloc(dsize*dsize, sizeof(*out)); ck = (cl_half *) calloc(9*9, sizeof(*ck)); in[0].x = 0x3c00; in[1].x = 0x4000; in[dsize].x = 0x4100; ck[0] = 0x3c00; ck[1] = 0x4000; ck[9] = 0x3000; /* Setup OpenCL environment. */ err = clGetPlatformIDs( 1, &platform, NULL ); err = clGetDeviceIDs( platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL ); props[1] = (cl_context_properties)platform; ctx = clCreateContext( props, 1, &device, NULL, NULL, &err ); for(i = 0; i < NQUEUES; i++) queues[i] = clCreateCommandQueue( ctx, device, 0, &err ); // Print some info about the system clGetDeviceInfo(device, CL_DEVICE_VERSION, sizeof(version), version, NULL); printf("CL_DEVICE_VERSION=%s\n", version); clGetDeviceInfo(device, CL_DRIVER_VERSION, sizeof(version), version, NULL); printf("CL_DRIVER_VERSION=%s\n", version); program = clCreateProgramWithSource(ctx, 1, (const char **)&source, NULL, &err); clGetDeviceInfo(device, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(param[0]), param, NULL); printf("CL_DEVICE_LOCAL_MEM_SIZE=%d\n", (int)param[0]); clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(param[0]), param, NULL); printf("CL_DEVICE_MAX_WORK_GROUP_SIZE=%d\n", (int)param[0]); clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(param[0]), param, NULL); printf("CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS=%d\n", (int)param[0]); j = param[0]; clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(param[0])*j, param, NULL); printf("CL_DEVICE_MAX_WORK_ITEM_SIZES="); for(i = 0; i < j; i++) printf("%d ", (int)param[i]); printf("\n"); clGetDeviceInfo(device, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, sizeof(param[0]), param, NULL); printf("CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE=%d\n", (int)param[0]); program = clCreateProgramWithSource(ctx, 1, (const char **)&source, NULL, &err); if(!program) { printf("Error creating program\n"); return -1; } err = clBuildProgram(program, 0, 0, 0, 0, 0); if(err != CL_SUCCESS) { char buffer[20000]; size_t len; clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len); puts(buffer); return -1; } kernel = clCreateKernel(program, "conv9x9", &err); if(!kernel || err != CL_SUCCESS) { printf("Error creating kernel\n"); return -1; } /* Prepare OpenCL memory objects and place matrices inside them. */ cl_image_format fmt = {CL_RGBA, CL_HALF_FLOAT}; cl_int rc; bufin = clCreateImage2D(ctx, CL_MEM_READ_ONLY, &fmt, dsize, dsize, 0, 0, &rc); bufout = clCreateImage2D(ctx, CL_MEM_WRITE_ONLY, &fmt, dsize, dsize, 0, 0, &rc); bufck = clCreateBuffer( ctx, CL_MEM_READ_ONLY, 9 * 9 * sizeof(*ck), NULL, &err ); size_t origin[3] = {0,0,0}; size_t region[3] = {dsize, dsize, 1}; err = clEnqueueWriteImage(queues[0], bufin, CL_TRUE, origin, region, dsize * sizeof(*in), 0, in, 0, NULL, NULL ); err = clEnqueueWriteBuffer( queues[0], bufck, CL_TRUE, 0, 9 * 9 * sizeof( *ck ), ck, 0, NULL, NULL ); clSetKernelArg(kernel, 0, sizeof(int), &dsize); clSetKernelArg(kernel, 1, sizeof(cl_mem), &bufin); clSetKernelArg(kernel, 2, sizeof(cl_mem), &bufck); clSetKernelArg(kernel, 3, sizeof(cl_mem), &bufout); local[0] = 8; local[1] = 8; global[0] = global[1] = dsize-32; usleep(100000); struct timeval start,end; gettimeofday(&start, NULL); for (k=0; k<nthreads; k++) { //printf("Hello from thread %d, nthreads %d\n", omp_get_thread_num(), omp_get_num_threads()); for(i=0;i<nbOfAverages;i++) { // do the 2D convolution err = clEnqueueNDRangeKernel(queues[0], kernel, 2, NULL, global, local, 0, NULL, NULL); if(err != CL_SUCCESS) { printf("clEnqueueNDRangeKernel error %d\n", err); return -1; } } } clFinish(queues[0]); gettimeofday(&end, NULL); double t = ((double) (end.tv_sec - start.tv_sec)) + ((double) (end.tv_usec - start.tv_usec)) / 1e6; //reports time in [s] - verified! /* Wait for calculations to be finished. */ /* Fetch results of calculations from GPU memory. */ err = clEnqueueReadImage(queues[0], bufout, CL_TRUE, origin, region, dsize * sizeof(*out), 0, out, 0, NULL, NULL ); clFinish(queues[0]); printf("%x %x %x %x\n", out[0].x, out[1].x, out[dsize].x, out[dsize+1].x); /* Release OpenCL memory objects. */ clReleaseMemObject( bufin ); clReleaseMemObject( bufck ); clReleaseMemObject( bufout ); /* Release OpenCL working objects. */ for(i = 0; i < NQUEUES; i++) clReleaseCommandQueue( queues[i] ); clReleaseContext( ctx ); // report performance: tops = 4 * nthreads * opsMAC * (dsize-32)*(dsize-32)*9*9; // total ops printf("Total M ops = %.0lf, # of threads = %d", nbOfAverages*tops*1e-6, nthreads); printf("\nTime in s: %lf:", t); printf("\nTest performance [G OP/s] %lf:", tops*nbOfAverages/t*1e-9); printf("\n"); return(0); }
int main(int argc, char **argv) { int start,end; unsigned long p[64], c[64], k[56]; unsigned long res; build_samples (p, c, k, 0); set_low_keys(k); cl_platform_id cpPlatform; clGetPlatformIDs(1, &cpPlatform, NULL); cl_device_id cdDevice; clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 1, &cdDevice, NULL); char cBuffer[1024]; clGetDeviceInfo(cdDevice, CL_DEVICE_NAME, sizeof(cBuffer), &cBuffer, NULL); printf("CL_DEVICE_NAME:\t\t%s\n", cBuffer); clGetDeviceInfo(cdDevice, CL_DRIVER_VERSION, sizeof(cBuffer), &cBuffer, NULL); printf("CL_DRIVER_VERSION:\t%s\n\n", cBuffer); cl_uint compute_units; clGetDeviceInfo(cdDevice, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(compute_units), &compute_units, NULL); printf("CL_DEVICE_MAX_COMPUTE_UNITS:\t%u\n", compute_units); size_t workitem_dims; clGetDeviceInfo(cdDevice, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(workitem_dims), &workitem_dims, NULL); printf("CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS:\t%u\n", workitem_dims); size_t workitem_size[3]; clGetDeviceInfo(cdDevice, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(workitem_size), &workitem_size, NULL); printf("CL_DEVICE_MAX_WORK_ITEM_SIZES:\t%u / %u / %u \n", workitem_size[0], workitem_size[1], workitem_size[2]); size_t workgroup_size; clGetDeviceInfo(cdDevice, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(workgroup_size), &workgroup_size, NULL); printf("CL_DEVICE_MAX_WORK_GROUP_SIZE:\t%u\n", workgroup_size); cl_uint clock_frequency; clGetDeviceInfo(cdDevice, CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(clock_frequency), &clock_frequency, NULL); printf("CL_DEVICE_MAX_CLOCK_FREQUENCY:\t%u MHz\n", clock_frequency); cl_context GPUContext = clCreateContext(0, 1, &cdDevice, NULL, NULL, NULL); cl_command_queue cqCommandQueue = clCreateCommandQueue(GPUContext, cdDevice, 0, NULL); cl_mem GPUVector1 = clCreateBuffer(GPUContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, sizeof(unsigned long) * 64, p, NULL); cl_mem GPUVector2 = clCreateBuffer(GPUContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, sizeof(unsigned long) * 64, c, NULL); cl_mem GPUVector3 = clCreateBuffer(GPUContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, sizeof(unsigned long) * 56, k, NULL); cl_mem GPUOutputVector = clCreateBuffer(GPUContext, CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR, sizeof(unsigned long), &res, NULL); size_t szKernelLength; char* cSourceCL = oclLoadProgSource("ocl_deseval.cl", "", &szKernelLength); cl_program OpenCLProgram = clCreateProgramWithSource(GPUContext, 1, (const char **)&cSourceCL, &szKernelLength, NULL); if (clBuildProgram(OpenCLProgram, 0, NULL, NULL, NULL, NULL)!=CL_SUCCESS) { char cBuffer[2048]; if(clGetProgramBuildInfo(OpenCLProgram,cdDevice,CL_PROGRAM_BUILD_LOG,sizeof(cBuffer),cBuffer,NULL)==CL_SUCCESS); printf("Build error:\n%s\n",cBuffer); exit(1); } cl_kernel OpenCLVectorAdd = clCreateKernel(OpenCLProgram, "keysearch", NULL); clSetKernelArg(OpenCLVectorAdd, 0, sizeof(cl_mem), (void*)&GPUOutputVector); clSetKernelArg(OpenCLVectorAdd, 1, sizeof(cl_mem), (void*)&GPUVector1); clSetKernelArg(OpenCLVectorAdd, 2, sizeof(cl_mem), (void*)&GPUVector2); clSetKernelArg(OpenCLVectorAdd, 3, sizeof(cl_mem), (void*)&GPUVector3); size_t WorkSize[1] = {1024}; start=clock(); for (int i=0; i<1024; i++) { //clEnqueueWriteBuffer(cqCommandQueue, GPUOutputVector, CL_TRUE, 0, // 56 * sizeof(unsigned long), k, 0, NULL, NULL); clEnqueueNDRangeKernel(cqCommandQueue, OpenCLVectorAdd, 1, NULL, WorkSize, NULL, 0, NULL, NULL); //clEnqueueReadBuffer(cqCommandQueue, GPUOutputVector, CL_TRUE, 0, // sizeof(unsigned long), &res, 0, NULL, NULL); if(res!=0) { printf("Key found\n"); //key_found(res,k); break; } increment_key (k); } end=clock(); clReleaseKernel(OpenCLVectorAdd); clReleaseProgram(OpenCLProgram); clReleaseCommandQueue(cqCommandQueue); clReleaseContext(GPUContext); clReleaseMemObject(GPUVector1); clReleaseMemObject(GPUVector2); clReleaseMemObject(GPUOutputVector); printf ("Searched %i keys in %.3f seconds\n", 1000000, ((double)(end-start))/CLOCKS_PER_SEC); return 0; }
int CommandGenerate::execute(const std::vector<std::string>& p_args) { if(p_args.size() < 10) { help(); return -1; } unsigned int platformId = atol(p_args[1].c_str()); unsigned int deviceId = atol(p_args[2].c_str()); unsigned int staggerSize = atol(p_args[3].c_str()); unsigned int threadsNumber = atol(p_args[4].c_str()); unsigned int hashesNumber = atol(p_args[5].c_str()); unsigned int nonceSize = PLOT_SIZE * staggerSize; std::cerr << "Threads number: " << threadsNumber << std::endl; std::cerr << "Hashes number: " << hashesNumber << std::endl; unsigned int numjobs = (p_args.size() - 5)/4; std::cerr << numjobs << " plot(s) to do." << std::endl; unsigned int staggerMbSize = staggerSize / 4; std::cerr << "Non-GPU memory usage: " << staggerMbSize*numjobs << "MB" << std::endl; std::vector<std::string> paths(numjobs); std::vector<std::ofstream *> out_files(numjobs); std::vector<unsigned long long> addresses(numjobs); std::vector<unsigned long long> startNonces(numjobs); std::vector<unsigned long long> endNonces(numjobs); std::vector<unsigned int> noncesNumbers(numjobs); std::vector<unsigned char*> buffersCpu(numjobs); std::vector<bool> saving_thread_flags(numjobs); std::vector<std::future<void>> save_threads(numjobs); unsigned long long maxNonceNumber = 0; unsigned long long totalNonces = 0; int returnCode = 0; try { for (unsigned int i = 0; i < numjobs; i++) { std::cerr << "----" << std::endl; std::cerr << "Job number " << i << std::endl; unsigned int argstart = 6 + i*4; paths[i] = std::string(p_args[argstart]); addresses[i] = strtoull(p_args[argstart+1].c_str(), NULL, 10); startNonces[i] = strtoull(p_args[argstart+2].c_str(), NULL, 10); noncesNumbers[i] = atol(p_args[argstart+3].c_str()); maxNonceNumber = std::max(maxNonceNumber, (long long unsigned int)noncesNumbers[i]); totalNonces += noncesNumbers[i]; std::ostringstream outFile; outFile << paths[i] << "/" << addresses[i] << "_" << startNonces[i] << "_" << \ noncesNumbers[i] << "_" << staggerSize; std::ios_base::openmode file_mode = std::ios::out | std::ios::binary | std::ios::trunc; out_files[i] = new std::ofstream(outFile.str(), file_mode); assert(out_files[i]); if(noncesNumbers[i] % staggerSize != 0) { noncesNumbers[i] -= noncesNumbers[i] % staggerSize; noncesNumbers[i] += staggerSize; } endNonces[i] = startNonces[i] + noncesNumbers[i]; unsigned int noncesGbSize = noncesNumbers[i] / 4 / 1024; std::cerr << "Path: " << outFile.str() << std::endl; std::cerr << "Nonces: " << startNonces[i] << " to " << endNonces[i] << " (" << noncesGbSize << " GB)" << std::endl; std::cerr << "Creating CPU buffer" << std::endl; buffersCpu[i] = new unsigned char[nonceSize]; if(!buffersCpu[i]) { throw std::runtime_error("Unable to create the CPU buffer (probably out of host memory.)"); } saving_thread_flags[i] = false; std::cerr << "----" << std::endl; } cl_platform_id platforms[4]; cl_uint platformsNumber; cl_device_id devices[32]; cl_uint devicesNumber; cl_context context = 0; cl_command_queue commandQueue = 0; cl_mem bufferGpuGen = 0; cl_mem bufferGpuScoops = 0; cl_program program = 0; cl_kernel kernelStep1 = 0; cl_kernel kernelStep2 = 0; cl_kernel kernelStep3 = 0; int error; std::cerr << "Retrieving OpenCL platforms" << std::endl; error = clGetPlatformIDs(4, platforms, &platformsNumber); if(error != CL_SUCCESS) { throw OpenclError(error, "Unable to retrieve the OpenCL platforms"); } if(platformId >= platformsNumber) { throw std::runtime_error("No platform found with the provided id"); } std::cerr << "Retrieving OpenCL GPU devices" << std::endl; error = clGetDeviceIDs(platforms[platformId], CL_DEVICE_TYPE_CPU | CL_DEVICE_TYPE_GPU, 32, devices, &devicesNumber); if(error != CL_SUCCESS) { throw OpenclError(error, "Unable to retrieve the OpenCL devices"); } if(deviceId >= devicesNumber) { throw std::runtime_error("No device found with the provided id"); } std::cerr << "Creating OpenCL context" << std::endl; context = clCreateContext(0, 1, &devices[deviceId], NULL, NULL, &error); if(error != CL_SUCCESS) { throw OpenclError(error, "Unable to create the OpenCL context"); } std::cerr << "Creating OpenCL command queue" << std::endl; commandQueue = clCreateCommandQueue(context, devices[deviceId], 0, &error); if(error != CL_SUCCESS) { throw OpenclError(error, "Unable to create the OpenCL command queue"); } std::cerr << "Creating OpenCL GPU generation buffer" << std::endl; bufferGpuGen = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_uchar) * GEN_SIZE * staggerSize, 0, &error); if(error != CL_SUCCESS) { throw OpenclError(error, "Unable to create the OpenCL GPU generation buffer"); } std::cerr << "Creating OpenCL GPU scoops buffer" << std::endl; bufferGpuScoops = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(cl_uchar) * nonceSize, 0, &error); if(error != CL_SUCCESS) { throw OpenclError(error, "Unable to create the OpenCL GPU scoops buffer"); } std::cerr << "Creating OpenCL program" << std::endl; std::string source = loadSource("kernel/nonce.cl"); const char* sources[] = {source.c_str()}; size_t sourcesLength[] = {source.length()}; program = clCreateProgramWithSource(context, 1, sources, sourcesLength, &error); if(error != CL_SUCCESS) { throw OpenclError(error, "Unable to create the OpenCL program"); } std::cerr << "Building OpenCL program" << std::endl; error = clBuildProgram(program, 1, &devices[deviceId], "-I kernel", 0, 0); if(error != CL_SUCCESS) { size_t logSize; clGetProgramBuildInfo(program, devices[deviceId], CL_PROGRAM_BUILD_LOG, 0, 0, &logSize); char* log = new char[logSize]; clGetProgramBuildInfo(program, devices[deviceId], CL_PROGRAM_BUILD_LOG, logSize, (void*)log, 0); std::cerr << log << std::endl; delete[] log; throw OpenclError(error, "Unable to build the OpenCL program"); } std::cerr << "Creating OpenCL step1 kernel" << std::endl; kernelStep1 = clCreateKernel(program, "nonce_step1", &error); if(error != CL_SUCCESS) { throw OpenclError(error, "Unable to create the OpenCL kernel"); } std::cerr << "Setting OpenCL step1 kernel static arguments" << std::endl; error = clSetKernelArg(kernelStep1, 2, sizeof(cl_mem), (void*)&bufferGpuGen); if(error != CL_SUCCESS) { throw OpenclError(error, "Unable to set the OpenCL kernel arguments"); } std::cerr << "Creating OpenCL step2 kernel" << std::endl; kernelStep2 = clCreateKernel(program, "nonce_step2", &error); if(error != CL_SUCCESS) { throw OpenclError(error, "Unable to create the OpenCL kernel"); } std::cerr << "Setting OpenCL step2 kernel static arguments" << std::endl; error = clSetKernelArg(kernelStep2, 1, sizeof(cl_mem), (void*)&bufferGpuGen); if(error != CL_SUCCESS) { throw OpenclError(error, "Unable to set the OpenCL kernel arguments"); } std::cerr << "Creating OpenCL step3 kernel" << std::endl; kernelStep3 = clCreateKernel(program, "nonce_step3", &error); if(error != CL_SUCCESS) { throw OpenclError(error, "Unable to create the OpenCL kernel"); } std::cerr << "Setting OpenCL step3 kernel static arguments" << std::endl; error = clSetKernelArg(kernelStep3, 0, sizeof(cl_uint), (void*)&staggerSize); error = clSetKernelArg(kernelStep3, 1, sizeof(cl_mem), (void*)&bufferGpuGen); error = clSetKernelArg(kernelStep3, 2, sizeof(cl_mem), (void*)&bufferGpuScoops); if(error != CL_SUCCESS) { throw OpenclError(error, "Unable to set the OpenCL kernel arguments"); } size_t globalWorkSize = staggerSize; size_t localWorkSize = (staggerSize < threadsNumber) ? staggerSize : threadsNumber; time_t startTime = time(0); unsigned int totalNoncesCompleted = 0; for (unsigned long long nonce_ordinal = 0; nonce_ordinal < maxNonceNumber; nonce_ordinal += staggerSize) { for (unsigned int jobnum = 0; jobnum < paths.size(); jobnum += 1) { unsigned long long nonce = startNonces[jobnum] + nonce_ordinal; if (nonce > endNonces[jobnum]) { break; } std::cout << "Running with start nonce " << nonce << std::endl; // Is a cl_ulong always an unsigned long long? unsigned int error = 0; error = clSetKernelArg(kernelStep1, 0, sizeof(cl_ulong), (void*)&addresses[jobnum]); if(error != CL_SUCCESS) { throw OpenclError(error, "Unable to set the OpenCL step1 kernel arguments"); } error = clSetKernelArg(kernelStep1, 1, sizeof(cl_ulong), (void*)&nonce); if(error != CL_SUCCESS) { throw OpenclError(error, "Unable to set the OpenCL step1 kernel arguments"); } error = clEnqueueNDRangeKernel(commandQueue, kernelStep1, 1, 0, &globalWorkSize, &localWorkSize, 0, 0, 0); if(error != CL_SUCCESS) { throw OpenclError(error, "Error in step1 kernel launch"); } unsigned int hashesSize = hashesNumber * HASH_SIZE; for(int hashesOffset = PLOT_SIZE ; hashesOffset > 0 ; hashesOffset -= hashesSize) { error = clSetKernelArg(kernelStep2, 0, sizeof(cl_ulong), (void*)&nonce); error = clSetKernelArg(kernelStep2, 2, sizeof(cl_uint), (void*)&hashesOffset); error = clSetKernelArg(kernelStep2, 3, sizeof(cl_uint), (void*)&hashesNumber); if(error != CL_SUCCESS) { throw OpenclError(error, "Unable to set the OpenCL step2 kernel arguments"); } error = clEnqueueNDRangeKernel(commandQueue, kernelStep2, 1, 0, &globalWorkSize, &localWorkSize, 0, 0, 0); if(error != CL_SUCCESS) { throw OpenclError(error, "Error in step2 kernel launch"); } error = clFinish(commandQueue); if(error != CL_SUCCESS) { throw OpenclError(error, "Error in step2 kernel finish"); } } totalNoncesCompleted += staggerSize; double percent = 100.0 * (double)totalNoncesCompleted / totalNonces; time_t currentTime = time(0); double speed = (double)totalNoncesCompleted / difftime(currentTime, startTime) * 60.0; double estimatedTime = (double)(totalNonces - totalNoncesCompleted) / speed; std::cerr << "\r" << percent << "% (" << totalNoncesCompleted << "/" << totalNonces << " nonces)"; std::cerr << ", " << speed << " nonces/minutes"; std::cerr << ", ETA: " << ((int)estimatedTime / 60) << "h" << ((int)estimatedTime % 60) << "m" << ((int)(estimatedTime * 60.0) % 60) << "s"; std::cerr << "... "; error = clEnqueueNDRangeKernel(commandQueue, kernelStep3, 1, 0, &globalWorkSize, &localWorkSize, 0, 0, 0); if(error != CL_SUCCESS) { throw OpenclError(error, "Error in step3 kernel launch"); } if (saving_thread_flags[jobnum]) { save_threads[jobnum].wait(); // Wait for last job to finish saving_thread_flags[jobnum] = false; } error = clEnqueueReadBuffer(commandQueue, bufferGpuScoops, CL_TRUE, 0, sizeof(cl_uchar) * nonceSize, buffersCpu[jobnum], 0, 0, 0); if(error != CL_SUCCESS) { throw OpenclError(error, "Error in synchronous read"); } saving_thread_flags[jobnum] = true; save_threads[jobnum] = std::async(std::launch::async, save_nonces, nonceSize, out_files[jobnum], buffersCpu[jobnum]); } } //Clean up for (unsigned int i = 0; i < paths.size(); i += 1) { if (saving_thread_flags[i]) { std::cerr << "waiting for final save to " << paths[i] << " to finish" << std::endl; save_threads[i].wait(); saving_thread_flags[i] = false; std::cerr << "done waiting for final save" << std::endl; if (buffersCpu[i]) { delete[] buffersCpu[i]; } } } if(kernelStep3) { clReleaseKernel(kernelStep3); } if(kernelStep2) { clReleaseKernel(kernelStep2); } if(kernelStep1) { clReleaseKernel(kernelStep1); } if(program) { clReleaseProgram(program); } if(bufferGpuGen) { clReleaseMemObject(bufferGpuGen); } if(bufferGpuScoops) { clReleaseMemObject(bufferGpuScoops); } if(commandQueue) { clReleaseCommandQueue(commandQueue); } if(context) { clReleaseContext(context); } time_t currentTime = time(0); double elapsedTime = difftime(currentTime, startTime) / 60.0; double speed = (double)totalNonces / elapsedTime; std::cerr << "\r100% (" << totalNonces << "/" << totalNonces << " nonces)"; std::cerr << ", " << speed << " nonces/minutes"; std::cerr << ", " << ((int)elapsedTime / 60) << "h" << ((int)elapsedTime % 60) << "m" << ((int)(elapsedTime * 60.0) % 60) << "s"; std::cerr << " " << std::endl; } catch(const OpenclError& ex) { std::cerr << "[ERROR] [" << ex.getCode() << "] " << ex.what() << std::endl; returnCode = -1; } catch(const std::exception& ex) { std::cerr << "[ERROR] " << ex.what() << std::endl; returnCode = -1; } return returnCode; }
unsigned int kernel_launch ( cl_kernel kernel, cl_context context, cl_command_queue cmd_queue, unsigned int n, int * a, int * b, int * c ) { int error = 1; int * aVec = calloc( n , sizeof ( int ) ); int * bVec = calloc( n , sizeof ( int ) ); int * cVec = calloc( n , sizeof ( int ) ); cl_int err; if( aVec == NULL || bVec == NULL || cVec == NULL ) { return ( 0 ); } /* Here it is not needed */ fill_aVec ( n, a, aVec ); fill_bVec ( n, b, bVec ); cl_mem aVec_device = malloc_device (context, n * sizeof( int ), &error); if(error) { return ( 0 ); } init_device_mem_int (context, cmd_queue, aVec_device, aVec, n, &error); if(error) { return ( 0 ); } cl_mem bVec_device = malloc_device (context, n * sizeof( int ), &error); if(error) { return ( 0 ); } init_device_mem_int (context, cmd_queue, bVec_device, bVec, n, &error); if(error) { return ( 0 ); } cl_mem cVec_device = malloc_device (context, n * sizeof( int ), &error); if(error) { return ( 0 ); } init_device_mem_int (context, cmd_queue, cVec_device, cVec, n, &error); if(error) { return ( 0 ); } err = clFinish( cmd_queue ); if( err != CL_SUCCESS ) { return ( 0 ); } set_kernel_arguments ( kernel, cmd_queue, aVec_device, bVec_device, cVec_device ); err = clFinish( cmd_queue ); if( err != CL_SUCCESS ) { return ( 0 ); } size_t WorkSizeGlobal[] = {n}; size_t WorkSizeLocal[] = {1}; err = clEnqueueNDRangeKernel( cmd_queue, kernel, 1, NULL, WorkSizeGlobal, WorkSizeLocal, 0, NULL, NULL); if( err != CL_SUCCESS ) { return ( 0 ); } err=clFinish( cmd_queue ); if( err != CL_SUCCESS ) { return ( 0 ); } read_device_mem_int (cmd_queue, n, c, cVec_device, &error); if( error ) { return ( 0 ); } /*Here c should contain the result */ free (aVec); free (bVec); free (cVec); clReleaseMemObject(aVec_device); clReleaseMemObject(bVec_device); clReleaseMemObject(cVec_device); return ( 1 ); }
int main(int argc, char **argv) { printf("enter demo main\n"); fflush(stdout); putenv("POCL_VERBOSE=1"); putenv("POCL_DEVICES=basic"); putenv("POCL_LEAVE_TEMP_DIRS=1"); putenv("POCL_LEAVE_KERNEL_COMPILER_TEMP_FILES=1"); putenv("POCL_TEMP_DIR=pocl"); putenv("POCL_CACHE_DIR=pocl"); putenv("POCL_WORK_GROUP_METHOD=spmd"); if(argc >= 2){ printf("argv[1]:%s:\n",argv[1]); if(!strcmp(argv[1], "h")) putenv("POCL_WORK_GROUP_METHOD=spmd"); if(!strcmp(argv[1], "c")) putenv("POCL_CROSS_COMPILE=1"); } if(argc >= 3){ printf("argv[2]:%s:\n",argv[2]); if(!strcmp(argv[2], "h")) putenv("POCL_WORK_GROUP_METHOD=spmd"); if(!strcmp(argv[2], "c")) putenv("POCL_CROSS_COMPILE=1"); } //putenv("LD_LIBRARY_PATH=/scratch/colins/build/linux/fs/lib"); //putenv("LTDL_LIBRARY_PATH=/scratch/colins/build/linux/fs/lib"); //lt_dlsetsearchpath("/scratch/colins/build/linux/fs/lib"); //printf("SEARCH_PATH:%s\n",lt_dlgetsearchpath()); cl_platform_id platforms[100]; cl_uint platforms_n = 0; CL_CHECK(clGetPlatformIDs(100, platforms, &platforms_n)); printf("=== %d OpenCL platform(s) found: ===\n", platforms_n); for (int i=0; i<platforms_n; i++) { char buffer[10240]; printf(" -- %d --\n", i); CL_CHECK(clGetPlatformInfo(platforms[i], CL_PLATFORM_PROFILE, 10240, buffer, NULL)); printf(" PROFILE = %s\n", buffer); CL_CHECK(clGetPlatformInfo(platforms[i], CL_PLATFORM_VERSION, 10240, buffer, NULL)); printf(" VERSION = %s\n", buffer); CL_CHECK(clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, 10240, buffer, NULL)); printf(" NAME = %s\n", buffer); CL_CHECK(clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR, 10240, buffer, NULL)); printf(" VENDOR = %s\n", buffer); CL_CHECK(clGetPlatformInfo(platforms[i], CL_PLATFORM_EXTENSIONS, 10240, buffer, NULL)); printf(" EXTENSIONS = %s\n", buffer); } if (platforms_n == 0) return 1; cl_device_id devices[100]; cl_uint devices_n = 0; // CL_CHECK(clGetDeviceIDs(NULL, CL_DEVICE_TYPE_ALL, 100, devices, &devices_n)); CL_CHECK(clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_GPU, 100, devices, &devices_n)); printf("=== %d OpenCL device(s) found on platform:\n", devices_n); for (int i=0; i<devices_n; i++) { char buffer[10240]; cl_uint buf_uint; cl_ulong buf_ulong; printf(" -- %d --\n", i); CL_CHECK(clGetDeviceInfo(devices[i], CL_DEVICE_NAME, sizeof(buffer), buffer, NULL)); printf(" DEVICE_NAME = %s\n", buffer); CL_CHECK(clGetDeviceInfo(devices[i], CL_DEVICE_VENDOR, sizeof(buffer), buffer, NULL)); printf(" DEVICE_VENDOR = %s\n", buffer); CL_CHECK(clGetDeviceInfo(devices[i], CL_DEVICE_VERSION, sizeof(buffer), buffer, NULL)); printf(" DEVICE_VERSION = %s\n", buffer); CL_CHECK(clGetDeviceInfo(devices[i], CL_DRIVER_VERSION, sizeof(buffer), buffer, NULL)); printf(" DRIVER_VERSION = %s\n", buffer); CL_CHECK(clGetDeviceInfo(devices[i], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(buf_uint), &buf_uint, NULL)); printf(" DEVICE_MAX_COMPUTE_UNITS = %u\n", (unsigned int)buf_uint); CL_CHECK(clGetDeviceInfo(devices[i], CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(buf_uint), &buf_uint, NULL)); printf(" DEVICE_MAX_CLOCK_FREQUENCY = %u\n", (unsigned int)buf_uint); CL_CHECK(clGetDeviceInfo(devices[i], CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(buf_ulong), &buf_ulong, NULL)); printf(" DEVICE_GLOBAL_MEM_SIZE = %llu\n", (unsigned long long)buf_ulong); } if (devices_n == 0) return 1; cl_context context; context = CL_CHECK_ERR(clCreateContext(NULL, 1, devices+1, &pfn_notify, NULL, &_err)); cl_command_queue queue; queue = CL_CHECK_ERR(clCreateCommandQueue(context, devices[1], CL_QUEUE_PROFILING_ENABLE, &_err)); cl_kernel kernel = 0; cl_mem memObjects[2] = {0,0}; // Create OpenCL program - first attempt to load cached binary. // If that is not available, then create the program from source // and store the binary for future use. std::cout << "Attempting to create program from binary..." << std::endl; cl_program program = CreateProgramFromBinary(context, devices[1], "kernel.cl.bin"); if (program == NULL) { std::cout << "Binary not loaded, create from source..." << std::endl; program = CreateProgram(context, devices[1], "kernel.cl"); if (program == NULL) { Cleanup(context, queue, program, kernel, memObjects); return 1; } std::cout << "Save program binary for future run..." << std::endl; if (SaveProgramBinary(program, devices[1], "kernel.cl.bin") == false) { std::cerr << "Failed to write program binary" << std::endl; Cleanup(context, queue, program, kernel, memObjects); return 1; } } else { std::cout << "Read program from binary." << std::endl; } printf("attempting to create input buffer\n"); fflush(stdout); cl_mem input_buffer; input_buffer = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(double)*NUM_DATA, NULL, &_err)); printf("attempting to create output buffer\n"); fflush(stdout); cl_mem output_buffer; output_buffer = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(double)*NUM_DATA, NULL, &_err)); memObjects[0] = input_buffer; memObjects[1] = output_buffer; double factor = ((double)rand()/(double)(RAND_MAX)) * 100.0;; printf("attempting to create kernel\n"); fflush(stdout); kernel = CL_CHECK_ERR(clCreateKernel(program, "daxpy", &_err)); printf("setting up kernel args cl_mem:%lx \n",input_buffer); fflush(stdout); CL_CHECK(clSetKernelArg(kernel, 0, sizeof(input_buffer), &input_buffer)); CL_CHECK(clSetKernelArg(kernel, 1, sizeof(output_buffer), &output_buffer)); CL_CHECK(clSetKernelArg(kernel, 2, sizeof(factor), &factor)); printf("attempting to enqueue write buffer\n"); fflush(stdout); for (int i=0; i<NUM_DATA; i++) { double in = ((double)rand()/(double)(RAND_MAX)) * 100.0;; CL_CHECK(clEnqueueWriteBuffer(queue, input_buffer, CL_TRUE, i*sizeof(double), 8, &in, 0, NULL, NULL)); } cl_event kernel_completion; size_t global_work_size[1] = { NUM_DATA }; printf("attempting to enqueue kernel\n"); fflush(stdout); CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 1, NULL, global_work_size, NULL, 0, NULL, &kernel_completion)); printf("Enqueue'd kerenel\n"); fflush(stdout); cl_ulong time_start, time_end; CL_CHECK(clWaitForEvents(1, &kernel_completion)); CL_CHECK(clGetEventProfilingInfo(kernel_completion, CL_PROFILING_COMMAND_START, sizeof(time_start), &time_start, NULL)); CL_CHECK(clGetEventProfilingInfo(kernel_completion, CL_PROFILING_COMMAND_END, sizeof(time_end), &time_end, NULL)); double elapsed = time_end - time_start; printf("time(ns):%lg\n",elapsed); CL_CHECK(clReleaseEvent(kernel_completion)); printf("Result:"); for (int i=0; i<NUM_DATA; i++) { double data; CL_CHECK(clEnqueueReadBuffer(queue, output_buffer, CL_TRUE, i*sizeof(double), 8, &data, 0, NULL, NULL)); //printf(" %lg", data); } printf("\n"); CL_CHECK(clReleaseMemObject(memObjects[0])); CL_CHECK(clReleaseMemObject(memObjects[1])); CL_CHECK(clReleaseKernel(kernel)); CL_CHECK(clReleaseProgram(program)); CL_CHECK(clReleaseContext(context)); return 0; }
int main(int argc, char* argv[]) { struct pb_Parameters *parameters; parameters = pb_ReadParameters(&argc, argv); if (!parameters) return -1; if(!parameters->inpFiles[0]){ fputs("Input file expected\n", stderr); return -1; } struct pb_TimerSet timers; char oclOverhead[] = "OCL Overhead"; char intermediates[] = "IntermediatesKernel"; char finals[] = "FinalKernel"; pb_InitializeTimerSet(&timers); pb_AddSubTimer(&timers, oclOverhead, pb_TimerID_KERNEL); pb_AddSubTimer(&timers, intermediates, pb_TimerID_KERNEL); pb_AddSubTimer(&timers, finals, pb_TimerID_KERNEL); pb_SwitchToTimer(&timers, pb_TimerID_IO); int numIterations; if (argc >= 2){ numIterations = atoi(argv[1]); } else { fputs("Expected at least one command line argument\n", stderr); return -1; } unsigned int img_width, img_height; unsigned int histo_width, histo_height; FILE* f = fopen(parameters->inpFiles[0],"rb"); int result = 0; result += fread(&img_width, sizeof(unsigned int), 1, f); result += fread(&img_height, sizeof(unsigned int), 1, f); result += fread(&histo_width, sizeof(unsigned int), 1, f); result += fread(&histo_height, sizeof(unsigned int), 1, f); if (result != 4){ fputs("Error reading input and output dimensions from file\n", stderr); return -1; } unsigned int* img = (unsigned int*) malloc (img_width*img_height*sizeof(unsigned int)); unsigned char* histo = (unsigned char*) calloc (histo_width*histo_height, sizeof(unsigned char)); result = fread(img, sizeof(unsigned int), img_width*img_height, f); fclose(f); if (result != img_width*img_height){ fputs("Error reading input array from file\n", stderr); return -1; } cl_int ciErrNum; pb_Context* pb_context; pb_context = pb_InitOpenCLContext(parameters); if (pb_context == NULL) { fprintf (stderr, "Error: No OpenCL platform/device can be found."); return -1; } cl_device_id clDevice = (cl_device_id) pb_context->clDeviceId; cl_platform_id clPlatform = (cl_platform_id) pb_context->clPlatformId; cl_context clContext = (cl_context) pb_context->clContext; cl_command_queue clCommandQueue; cl_program clProgram[2]; cl_kernel histo_intermediates_kernel; cl_kernel histo_final_kernel; cl_mem input; cl_mem ranges; cl_mem sm_mappings; cl_mem global_subhisto; cl_mem global_overflow; cl_mem final_histo; clCommandQueue = clCreateCommandQueue(clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &ciErrNum); OCL_ERRCK_VAR(ciErrNum); pb_SetOpenCL(&clContext, &clCommandQueue); pb_SwitchToSubTimer(&timers, oclOverhead, pb_TimerID_KERNEL); cl_uint workItemDimensions; OCL_ERRCK_RETVAL( clGetDeviceInfo(clDevice, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(cl_uint), &workItemDimensions, NULL) ); size_t workItemSizes[workItemDimensions]; OCL_ERRCK_RETVAL( clGetDeviceInfo(clDevice, CL_DEVICE_MAX_WORK_ITEM_SIZES, workItemDimensions*sizeof(size_t), workItemSizes, NULL) ); size_t program_length[2]; const char *source_path[2] = { "src/opencl_mxpa/histo_intermediates.cl", "src/opencl_mxpa/histo_final.cl"}; char *source[4]; for (int i = 0; i < 2; ++i) { // Dynamically allocate buffer for source source[i] = oclLoadProgSource(source_path[i], "", &program_length[i]); if(!source[i]) { fprintf(stderr, "Could not load program source\n"); exit(1); } clProgram[i] = clCreateProgramWithSource(clContext, 1, (const char **)&source[i], &program_length[i], &ciErrNum); OCL_ERRCK_VAR(ciErrNum); free(source[i]); } for (int i = 0; i < 2; ++i) { //fprintf(stderr, "Building Program #%d...\n", i); OCL_ERRCK_RETVAL ( clBuildProgram(clProgram[i], 1, &clDevice, NULL, NULL, NULL) ); /* char *build_log; size_t ret_val_size; ciErrNum = clGetProgramBuildInfo(clProgram[i], clDevice, CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size); OCL_ERRCK_VAR(ciErrNum); build_log = (char *)malloc(ret_val_size+1); ciErrNum = clGetProgramBuildInfo(clProgram[i], clDevice, CL_PROGRAM_BUILD_LOG, ret_val_size, build_log, NULL); OCL_ERRCK_VAR(ciErrNum); // to be carefully, terminate with \0 // there's no information in the reference whether the string is 0 terminated or not build_log[ret_val_size] = '\0'; fprintf(stderr, "%s\n", build_log ); */ } histo_intermediates_kernel = clCreateKernel(clProgram[0], "histo_intermediates_kernel", &ciErrNum); OCL_ERRCK_VAR(ciErrNum); histo_final_kernel = clCreateKernel(clProgram[1], "histo_final_kernel", &ciErrNum); OCL_ERRCK_VAR(ciErrNum); pb_SwitchToTimer(&timers, pb_TimerID_COPY); input = clCreateBuffer(clContext, CL_MEM_READ_WRITE, img_width*img_height*sizeof(unsigned int), NULL, &ciErrNum); OCL_ERRCK_VAR(ciErrNum); ranges = clCreateBuffer(clContext, CL_MEM_READ_WRITE, 2*sizeof(unsigned int), NULL, &ciErrNum); OCL_ERRCK_VAR(ciErrNum); sm_mappings = clCreateBuffer(clContext, CL_MEM_READ_WRITE, img_width*img_height*4*sizeof(unsigned char), NULL, &ciErrNum); OCL_ERRCK_VAR(ciErrNum); global_subhisto = clCreateBuffer(clContext, CL_MEM_READ_WRITE, histo_width*histo_height*sizeof(unsigned int), NULL, &ciErrNum); OCL_ERRCK_VAR(ciErrNum); global_overflow = clCreateBuffer(clContext, CL_MEM_READ_WRITE, histo_width*histo_height*sizeof(unsigned int), NULL, &ciErrNum); OCL_ERRCK_VAR(ciErrNum); final_histo = clCreateBuffer(clContext, CL_MEM_READ_WRITE, histo_width*histo_height*sizeof(unsigned char), NULL, &ciErrNum); OCL_ERRCK_VAR(ciErrNum); // Must dynamically allocate. Too large for stack unsigned int *zeroData; zeroData = (unsigned int *) calloc(img_width*histo_height, sizeof(unsigned int)); if (zeroData == NULL) { fprintf(stderr, "Failed to allocate %ld bytes of memory on host!\n", sizeof(unsigned int) * img_width * histo_height); exit(1); } for (int y=0; y < img_height; y++){ OCL_ERRCK_RETVAL( clEnqueueWriteBuffer(clCommandQueue, input, CL_TRUE, y*img_width*sizeof(unsigned int), // Offset in bytes img_width*sizeof(unsigned int), // Size of data to write &img[y*img_width], // Host Source 0, NULL, NULL) ); } pb_SwitchToSubTimer(&timers, oclOverhead, pb_TimerID_KERNEL); unsigned int img_dim = img_height*img_width; OCL_ERRCK_RETVAL( clSetKernelArg(histo_intermediates_kernel, 0, sizeof(cl_mem), (void *)&input) ); OCL_ERRCK_RETVAL( clSetKernelArg(histo_intermediates_kernel, 1, sizeof(unsigned int), &img_width) ); OCL_ERRCK_RETVAL( clSetKernelArg(histo_intermediates_kernel, 2, sizeof(cl_mem), (void *)&global_subhisto) ); OCL_ERRCK_RETVAL( clSetKernelArg(histo_final_kernel, 0, sizeof(unsigned int), &histo_height) ); OCL_ERRCK_RETVAL( clSetKernelArg(histo_final_kernel, 1, sizeof(unsigned int), &histo_width) ); OCL_ERRCK_RETVAL( clSetKernelArg(histo_final_kernel, 2, sizeof(cl_mem), (void *)&global_subhisto) ); OCL_ERRCK_RETVAL( clSetKernelArg(histo_final_kernel, 3, sizeof(cl_mem), (void *)&final_histo) ); size_t inter_localWS[1] = { workItemSizes[0] }; size_t inter_globalWS[1] = { img_height * inter_localWS[0] }; size_t final_localWS[1] = { workItemSizes[0] }; size_t final_globalWS[1] = {(((int)(histo_height*histo_width+(final_localWS[0]-1))) / (int)final_localWS[0])*(int)final_localWS[0] }; pb_SwitchToTimer(&timers, pb_TimerID_KERNEL); for (int iter = 0; iter < numIterations; iter++) { unsigned int ranges_h[2] = {UINT32_MAX, 0}; // how about something like // __global__ unsigned int ranges[2]; // ...kernel // __shared__ unsigned int s_ranges[2]; // if (threadIdx.x == 0) {s_ranges[0] = ranges[0]; s_ranges[1] = ranges[1];} // __syncthreads(); // Although then removing the blocking cudaMemcpy's might cause something about // concurrent kernel execution. // If kernel launches are synchronous, then how can 2 kernels run concurrently? different host threads? OCL_ERRCK_RETVAL( clEnqueueWriteBuffer(clCommandQueue, ranges, CL_TRUE, 0, // Offset in bytes 2*sizeof(unsigned int), // Size of data to write ranges_h, // Host Source 0, NULL, NULL) ); OCL_ERRCK_RETVAL( clEnqueueWriteBuffer(clCommandQueue, global_subhisto, CL_TRUE, 0, // Offset in bytes histo_width*histo_height*sizeof(unsigned int), // Size of data to write zeroData, // Host Source 0, NULL, NULL) ); pb_SwitchToSubTimer(&timers, intermediates, pb_TimerID_KERNEL); OCL_ERRCK_RETVAL ( clEnqueueNDRangeKernel(clCommandQueue, histo_intermediates_kernel /*histo_intermediates_kernel*/, 1, 0, inter_globalWS, inter_localWS, 0, 0, 0) ); pb_SwitchToSubTimer(&timers, finals, pb_TimerID_KERNEL); OCL_ERRCK_RETVAL ( clEnqueueNDRangeKernel(clCommandQueue, histo_final_kernel, 1, 0, final_globalWS, final_localWS, 0, 0, 0) ); } pb_SwitchToTimer(&timers, pb_TimerID_IO); OCL_ERRCK_RETVAL( clEnqueueReadBuffer(clCommandQueue, final_histo, CL_TRUE, 0, // Offset in bytes histo_height*histo_width*sizeof(unsigned char), // Size of data to read histo, // Host Source 0, NULL, NULL) ); OCL_ERRCK_RETVAL ( clReleaseKernel(histo_intermediates_kernel) ); OCL_ERRCK_RETVAL ( clReleaseKernel(histo_final_kernel) ); OCL_ERRCK_RETVAL ( clReleaseProgram(clProgram[0]) ); OCL_ERRCK_RETVAL ( clReleaseProgram(clProgram[1]) ); OCL_ERRCK_RETVAL ( clReleaseMemObject(input) ); OCL_ERRCK_RETVAL ( clReleaseMemObject(ranges) ); OCL_ERRCK_RETVAL ( clReleaseMemObject(sm_mappings) ); OCL_ERRCK_RETVAL ( clReleaseMemObject(global_subhisto) ); OCL_ERRCK_RETVAL ( clReleaseMemObject(global_overflow) ); OCL_ERRCK_RETVAL ( clReleaseMemObject(final_histo) ); if (parameters->outFile) { dump_histo_img(histo, histo_height, histo_width, parameters->outFile); } pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); free(zeroData); free(img); free(histo); pb_SwitchToTimer(&timers, pb_TimerID_NONE); printf("\n"); pb_PrintTimerSet(&timers); pb_FreeParameters(parameters); pb_DestroyTimerSet(&timers); OCL_ERRCK_RETVAL ( clReleaseCommandQueue(clCommandQueue) ); OCL_ERRCK_RETVAL ( clReleaseContext(clContext) ); return 0; }
//////////////////////////////////////////////////////////////////////////////// // Main program //////////////////////////////////////////////////////////////////////////////// int main(int argc, char **argv) { cl_platform_id cpPlatform; //OpenCL platform cl_device_id cdDevice; //OpenCL device cl_context cxGPUContext; //OpenCL context cl_command_queue cqCommandQueue; //OpenCL command que cl_mem d_Input, d_Output; //OpenCL memory buffer objects cl_int ciErrNum; float *h_Input, *h_OutputCPU, *h_OutputGPU; const uint imageW = 2048, imageH = 2048, stride = 2048; const int dir = DCT_FORWARD; shrQAStart(argc, argv); // set logfile name and start logs shrSetLogFileName ("oclDCT8x8.txt"); shrLog("%s Starting...\n\n", argv[0]); shrLog("Allocating and initializing host memory...\n"); h_Input = (float *)malloc(imageH * stride * sizeof(float)); h_OutputCPU = (float *)malloc(imageH * stride * sizeof(float)); h_OutputGPU = (float *)malloc(imageH * stride * sizeof(float)); srand(2009); for(uint i = 0; i < imageH; i++) for(uint j = 0; j < imageW; j++) h_Input[i * stride + j] = (float)rand() / (float)RAND_MAX; shrLog("Initializing OpenCL...\n"); //Get the NVIDIA platform ciErrNum = oclGetPlatformID(&cpPlatform); oclCheckError(ciErrNum, CL_SUCCESS); //Get a GPU device ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 1, &cdDevice, NULL); oclCheckError(ciErrNum, CL_SUCCESS); //Create the context cxGPUContext = clCreateContext(0, 1, &cdDevice, NULL, NULL, &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); //Create a command-queue cqCommandQueue = clCreateCommandQueue(cxGPUContext, cdDevice, CL_QUEUE_PROFILING_ENABLE, &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); shrLog("Initializing OpenCL DCT 8x8...\n"); initDCT8x8(cxGPUContext, cqCommandQueue, (const char **)argv); shrLog("Creating OpenCL memory objects...\n"); d_Input = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, imageH * stride * sizeof(cl_float), h_Input, &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); d_Output = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, imageH * stride * sizeof(cl_float), NULL, &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); shrLog("Performing DCT8x8 of %u x %u image...\n\n", imageH, imageW); //Just a single iteration or a warmup iteration DCT8x8( cqCommandQueue, d_Output, d_Input, stride, imageH, imageW, dir ); #define GPU_PROFILING 1 #ifdef GPU_PROFILING const int numIterations = 16; cl_event startMark, endMark; ciErrNum = clEnqueueMarker(cqCommandQueue, &startMark); ciErrNum |= clFinish(cqCommandQueue); shrCheckError(ciErrNum, CL_SUCCESS); shrDeltaT(0); for(int iter = 0; iter < numIterations; iter++) DCT8x8( NULL, d_Output, d_Input, stride, imageH, imageW, dir ); ciErrNum = clEnqueueMarker(cqCommandQueue, &endMark); ciErrNum |= clFinish(cqCommandQueue); shrCheckError(ciErrNum, CL_SUCCESS); //Calculate performance metrics by wallclock time double gpuTime = shrDeltaT(0) / (double)numIterations; shrLogEx(LOGBOTH | MASTER, 0, "oclDCT8x8, Throughput = %.4f MPixels/s, Time = %.5f s, Size = %u Pixels, NumDevsUsed = %i, Workgroup = %u\n", (1.0e-6 * (double)(imageW * imageH)/ gpuTime), gpuTime, (imageW * imageH), 1, 0); //Get profiler time cl_ulong startTime = 0, endTime = 0; ciErrNum = clGetEventProfilingInfo(startMark, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &startTime, NULL); ciErrNum |= clGetEventProfilingInfo(endMark, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endTime, NULL); shrCheckError(ciErrNum, CL_SUCCESS); shrLog("\nOpenCL time: %.5f s\n\n", 1.0e-9 * ((double)endTime - (double)startTime) / (double)numIterations); #endif shrLog("Reading back OpenCL results...\n"); ciErrNum = clEnqueueReadBuffer(cqCommandQueue, d_Output, CL_TRUE, 0, imageH * stride * sizeof(cl_float), h_OutputGPU, 0, NULL, NULL); oclCheckError(ciErrNum, CL_SUCCESS); shrLog("Comparing against Host/C++ computation...\n"); DCT8x8CPU(h_OutputCPU, h_Input, stride, imageH, imageW, dir); double sum = 0, delta = 0; double L2norm; for(uint i = 0; i < imageH; i++) for(uint j = 0; j < imageW; j++){ sum += h_OutputCPU[i * stride + j] * h_OutputCPU[i * stride + j]; delta += (h_OutputGPU[i * stride + j] - h_OutputCPU[i * stride + j]) * (h_OutputGPU[i * stride + j] - h_OutputCPU[i * stride + j]); } L2norm = sqrt(delta / sum); shrLog("Relative L2 norm: %.3e\n\n", L2norm); shrLog("Shutting down...\n"); //Release kernels and program closeDCT8x8(); //Release other OpenCL objects ciErrNum = clReleaseMemObject(d_Output); ciErrNum |= clReleaseMemObject(d_Input); ciErrNum |= clReleaseCommandQueue(cqCommandQueue); ciErrNum |= clReleaseContext(cxGPUContext); oclCheckError(ciErrNum, CL_SUCCESS); //Release host buffers free(h_OutputGPU); free(h_OutputCPU); free(h_Input); //Finish shrQAFinishExit(argc, (const char **)argv, (L2norm < 1E-3) ? QA_PASSED : QA_FAILED); }
int main(int argc, char const *argv[]) { /* Get platform */ cl_platform_id platform; cl_uint num_platforms; cl_int ret = clGetPlatformIDs(1, &platform, &num_platforms); if (ret != CL_SUCCESS) { printf("error: call to 'clGetPlatformIDs' failed\n"); exit(1); } printf("Number of platforms: %d\n", num_platforms); printf("platform=%p\n", platform); /* Get platform name */ char platform_name[100]; ret = clGetPlatformInfo(platform, CL_PLATFORM_NAME, sizeof(platform_name), platform_name, NULL); if (ret != CL_SUCCESS) { printf("error: call to 'clGetPlatformInfo' failed\n"); exit(1); } printf("platform.name='%s'\n\n", platform_name); /* Get device */ cl_device_id device; cl_uint num_devices; ret = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, &num_devices); if (ret != CL_SUCCESS) { printf("error: call to 'clGetDeviceIDs' failed\n"); exit(1); } printf("Number of devices: %d\n", num_devices); printf("device=%p\n", device); /* Get device name */ char device_name[100]; ret = clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(device_name), device_name, NULL); if (ret != CL_SUCCESS) { printf("error: call to 'clGetDeviceInfo' failed\n"); exit(1); } printf("device.name='%s'\n", device_name); printf("\n"); /* Create a Context Object */ cl_context context; context = clCreateContext(NULL, 1, &device, NULL, NULL, &ret); if (ret != CL_SUCCESS) { printf("error: call to 'clCreateContext' failed\n"); exit(1); } printf("context=%p\n", context); /* Create a Command Queue Object*/ cl_command_queue command_queue; command_queue = clCreateCommandQueue(context, device, 0, &ret); if (ret != CL_SUCCESS) { printf("error: call to 'clCreateCommandQueue' failed\n"); exit(1); } printf("command_queue=%p\n", command_queue); printf("\n"); /* Program binary */ unsigned char *bin; size_t bin_len; cl_int bin_ret; /* Read program binary */ if (argc == 2) bin = read_buffer((char *)argv[1], &bin_len); else { printf("error: No binary specified\n"); exit(1); } /* Create a program */ cl_program program; program = clCreateProgramWithBinary(context, 1, &device, &bin_len, (const unsigned char **)&bin, &bin_ret, &ret); if (ret != CL_SUCCESS) { printf("error: call to 'clCreateProgramWithBinary' failed\n"); exit(1); } if (bin_ret != CL_SUCCESS) { printf("error: Invalid binary for device\n"); exit(1); } printf("program=%p\n", program); /* Free binary */ free(bin); printf("program binary loaded\n"); printf("\n"); ret = clBuildProgram(program, 1, &device, NULL, NULL, NULL); if (ret != CL_SUCCESS ) { size_t size; char *log; /* Get log size */ clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,0, NULL, &size); /* Allocate log and print */ log = malloc(size); clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,size, log, NULL); printf("error: call to 'clBuildProgram' failed:\n%s\n", log); /* Free log and exit */ free(log); exit(1); } printf("program built\n"); printf("\n"); /* Create a Kernel Object*/ cl_kernel kernel; kernel = clCreateKernel(program, "native_tan_float4", &ret); if (ret != CL_SUCCESS) { printf("error: call to 'clCreateKernel' failed\n"); exit(1); } /* Create and allocate host buffers */ size_t num_elem = 10; /* Create and init host side src buffer 0 */ cl_float4 *src_0_host_buffer; src_0_host_buffer = malloc(num_elem * sizeof(cl_float4)); for (int i = 0; i < num_elem; i++) src_0_host_buffer[i] = (cl_float4){{2.0, 2.0, 2.0, 2.0}}; /* Create and init device side src buffer 0 */ cl_mem src_0_device_buffer; src_0_device_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY, num_elem * sizeof(cl_float4), NULL, &ret); if (ret != CL_SUCCESS) { printf("error: could not create source buffer\n"); exit(1); } ret = clEnqueueWriteBuffer(command_queue, src_0_device_buffer, CL_TRUE, 0, num_elem * sizeof(cl_float4), src_0_host_buffer, 0, NULL, NULL); if (ret != CL_SUCCESS) { printf("error: call to 'clEnqueueWriteBuffer' failed\n"); exit(1); } /* Create host dst buffer */ cl_float4 *dst_host_buffer; dst_host_buffer = malloc(num_elem * sizeof(cl_float4)); memset((void *)dst_host_buffer, 1, num_elem * sizeof(cl_float4)); /* Create device dst buffer */ cl_mem dst_device_buffer; dst_device_buffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, num_elem *sizeof(cl_float4), NULL, &ret); if (ret != CL_SUCCESS) { printf("error: could not create dst buffer\n"); exit(1); } /* Set kernel arguments */ ret = CL_SUCCESS; ret |= clSetKernelArg(kernel, 0, sizeof(cl_mem), &src_0_device_buffer); ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &dst_device_buffer); if (ret != CL_SUCCESS) { printf("error: call to 'clSetKernelArg' failed\n"); exit(1); } /* Launch the kernel */ size_t global_work_size = num_elem; size_t local_work_size = num_elem; ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_work_size, &local_work_size, 0, NULL, NULL); if (ret != CL_SUCCESS) { printf("error: call to 'clEnqueueNDRangeKernel' failed\n"); exit(1); } /* Wait for it to finish */ clFinish(command_queue); /* Read results from GPU */ ret = clEnqueueReadBuffer(command_queue, dst_device_buffer, CL_TRUE,0, num_elem * sizeof(cl_float4), dst_host_buffer, 0, NULL, NULL); if (ret != CL_SUCCESS) { printf("error: call to 'clEnqueueReadBuffer' failed\n"); exit(1); } /* Dump dst buffer to file */ char dump_file[100]; sprintf((char *)&dump_file, "%s.result", argv[0]); write_buffer(dump_file, (const char *)dst_host_buffer, num_elem * sizeof(cl_float4)); printf("Result dumped to %s\n", dump_file); /* Free host dst buffer */ free(dst_host_buffer); /* Free device dst buffer */ ret = clReleaseMemObject(dst_device_buffer); if (ret != CL_SUCCESS) { printf("error: call to 'clReleaseMemObject' failed\n"); exit(1); } /* Free host side src buffer 0 */ free(src_0_host_buffer); /* Free device side src buffer 0 */ ret = clReleaseMemObject(src_0_device_buffer); if (ret != CL_SUCCESS) { printf("error: call to 'clReleaseMemObject' failed\n"); exit(1); } /* Release kernel */ ret = clReleaseKernel(kernel); if (ret != CL_SUCCESS) { printf("error: call to 'clReleaseKernel' failed\n"); exit(1); } /* Release program */ ret = clReleaseProgram(program); if (ret != CL_SUCCESS) { printf("error: call to 'clReleaseProgram' failed\n"); exit(1); } /* Release command queue */ ret = clReleaseCommandQueue(command_queue); if (ret != CL_SUCCESS) { printf("error: call to 'clReleaseCommandQueue' failed\n"); exit(1); } /* Release context */ ret = clReleaseContext(context); if (ret != CL_SUCCESS) { printf("error: call to 'clReleaseContext' failed\n"); exit(1); } return 0; }
int main(void) { cl_int err; cl_platform_id platforms[MAX_PLATFORMS]; cl_uint nplatforms; cl_device_id devices[MAX_DEVICES]; cl_uint ndevices; cl_uint i, j; err = clGetPlatformIDs(MAX_PLATFORMS, platforms, &nplatforms); if (err != CL_SUCCESS) return EXIT_FAILURE; for (i = 0; i < nplatforms; i++) { err = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, MAX_DEVICES, devices, &ndevices); if (err != CL_SUCCESS) return EXIT_FAILURE; for (j = 0; j < ndevices; j++) { cl_context context = clCreateContext(NULL, 1, &devices[j], NULL, NULL, &err); if (err != CL_SUCCESS) return EXIT_FAILURE; cl_command_queue queue = clCreateCommandQueue(context, devices[j], 0, &err); if (err != CL_SUCCESS) return EXIT_FAILURE; const int buf_size = 1024; cl_int host_buf[buf_size]; cl_mem buf = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_int) * buf_size, NULL, &err); if (err != CL_SUCCESS) return EXIT_FAILURE; cl_event buf_event; if (clEnqueueReadBuffer(queue, buf, CL_TRUE, 0, sizeof(cl_int) * buf_size, &host_buf, 0, NULL, &buf_event) != CL_SUCCESS) return EXIT_FAILURE; clFinish(queue); cl_command_queue event_command_queue; size_t param_val_size_ret; if (clGetEventInfo(buf_event, CL_EVENT_COMMAND_QUEUE, sizeof(cl_command_queue), &event_command_queue, ¶m_val_size_ret) != CL_SUCCESS) return EXIT_FAILURE; if (param_val_size_ret != sizeof(cl_command_queue) || event_command_queue != queue) return EXIT_FAILURE; cl_command_type command_type; if (clGetEventInfo(buf_event, CL_EVENT_COMMAND_TYPE, sizeof(cl_command_type), &command_type, ¶m_val_size_ret) != CL_SUCCESS) return EXIT_FAILURE; if (param_val_size_ret != sizeof(cl_command_type) || command_type != CL_COMMAND_READ_BUFFER) return EXIT_FAILURE; cl_int execution_status; if (clGetEventInfo(buf_event, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &execution_status, ¶m_val_size_ret) != CL_SUCCESS) return EXIT_FAILURE; if (param_val_size_ret != sizeof(cl_int) || execution_status != CL_COMPLETE) return EXIT_FAILURE; cl_uint ref_count; if (clGetEventInfo(buf_event, CL_EVENT_REFERENCE_COUNT, sizeof(cl_uint), &ref_count, ¶m_val_size_ret) != CL_SUCCESS) return EXIT_FAILURE; if (param_val_size_ret != sizeof(cl_uint) || ref_count != 1) { printf("FAIL: expected refcount 1, got %d\n", ref_count); return EXIT_FAILURE; } clReleaseEvent(buf_event); clReleaseMemObject(buf); clReleaseCommandQueue(queue); } } return EXIT_SUCCESS; }