// Entry point: reads a universal set and two subsets X and Y from the user,
// then prints the sets and the results of the basic set operations on them.
// The set helpers (inputSet, printSet, printUnion, ...) are defined elsewhere.
int main() {
    int nU, nX, nY; // Sizes of the universal set and of sets X and Y
    printf("Введіть універсальну множину");   // prompt: "Enter the universal set"
    int* universal = inputSet(nU);
    printf("\nВведіть множину Х");            // prompt: "Enter set X"
    int* x = inputSet(nX, universal, nU);
    printf("\nВведіть множину У");            // prompt: "Enter set Y"
    int *y = inputSet(nY, universal, nU);
    printf("\nОтримані множини:");            // header: "Resulting sets:"
    printSet(universal, nU);
    printSet(x, nX);
    printSet(y, nY);
    // Binary operations on X and Y.
    printUnion(x, nX, y, nY);
    printIntersect(x, nX, y, nY);
    printDiff(x, nX, y, nY);   // X \ Y
    printDiff(y, nY, x, nX);   // Y \ X
    // Complements relative to the universal set.
    printInverse(x, nX, universal, nU);
    printInverse(y, nY, universal, nU);
    printLinearMultiply(x, nX, y, nY);
    // inputSet presumably allocates with malloc — freed here; TODO confirm.
    free(universal);
    free(x);
    free(y);
}
// Evaluates `op(a, b)` for the test framework. On success only the per-thread
// pass counter is bumped; on failure the expression text "aName opText bName"
// is rebuilt, both operands are stringified via operator<<, and the failure is
// recorded and printed while holding the master lock.
// `fatal` is forwarded to handleFailure (presumably aborts/stops the test on
// a fatal failure — TODO confirm against handleFailure).
// Returns true iff the comparison passed.
bool TestMaster::compare(const char *file, uint32_t line,
                         const char *aName, const char *bName,
                         const char *opText,
                         const A &a, const B &b, const OP &op, bool fatal)
{
    if (op(a,b)) {
        ++threadState().passCnt;
        return true;
    }
    // Reconstruct the textual form of the failed expression, e.g. "x == y".
    std::string str;
    str += aName;
    str += opText;
    str += bName;
    // Render both operands so the diff between them can be shown.
    std::ostringstream lhs;
    std::ostringstream rhs;
    lhs << a;
    rhs << b;
    {
        // All failure bookkeeping happens under the master lock; the guard is
        // also passed down so callees can require the lock to be held.
        vespalib::LockGuard guard(_lock);
        checkFailed(guard, file, line, str.c_str());
        printDiff(guard, str, file, line, lhs.str(), rhs.str());
        handleFailure(guard, fatal);
    }
    return false;
}
int run() { if (isMongos()) { toolError() << "mongotop only works on instances of mongod." << std::endl; return EXIT_FAILURE; } NamespaceStats prev = getData(); while ( true ) { sleepsecs(mongoTopGlobalParams.sleep); NamespaceStats now; try { now = getData(); } catch ( std::exception& e ) { toolError() << "can't get data: " << e.what() << std::endl; continue; } if ( now.size() == 0 ) return -2; try { printDiff( prev , now ); } catch ( AssertionException& e ) { toolError() << "\nerror: " << e.what() << std::endl; } prev = now; } return 0; }
int main(int argc, char *argv[]) { //INITILIZATION MPIInit(argc, argv); PTHREADInit(argc, argv); if (rank == 0) gettimeofday(&tvalBefore, NULL); //READFILE int fileArray[1000]; for (int ix = 0; ix < 10; ix++) { int file = fileArray[ix]; //Open new file with suffix ix openWriteFile((char *)nameGenerate("dummy", ix).c_str(), &file); //Print 0 to each file int num = 0; if (rank == 0) { lseek(file, 0, SEEK_SET); write(file, &num, sizeof(int)); } //Generate lock parameter struct flock lock; memset(&lock, 0, sizeof(lock)); MPIBarrier(); //Lock file lock.l_type = F_WRLCK; fcntl(file,F_SETLKW, &lock); //Read from file num lseek(file, 0, SEEK_SET); read(file, &num, sizeof(int)); num++; //Overwrite num+1 to file lseek(file, 0, SEEK_SET); write(file, &num,sizeof(int)); printf("(Process %2d) ix = %4d | num = %4d\n",rank, ix, num); //Unlock the file lock.l_type = F_UNLCK; fcntl(file, F_SETLK, &lock); } //FINALIZATION MPIBarrier(); if (rank == 0) { gettimeofday(&tvalAfter, NULL); printDiff(tvalBefore, tvalAfter); } MPIFinalize(); PTHREADFinalize(); return 0; }
int main() { int i, j, k, ii, jj, kk; int TTI, TTJ, TTK; int mini, minj, mink; TTI = 2000/8; TTJ = 64/2; TTK = 64/(2*2); for(i=0;i<2000;i++){ for(j=0;j<2000;j++){ B[i][j] = 1; } } for(i=0;i<2000;i++){ for(j=0;j<2000;j++){ C[i][j] = 1; } } // codigo no tiling for(i=0;i<2000;i+=1){ for(k=0;k<2000;k+=1){ for(j=0;j<2000;j+=1){ A[i][j] += B[i][k]*C[k][j]; } } } // codigo tiling for(ii=0;ii<2000;ii+=TTI){ for(kk=0;kk<2000;kk+=TTK){ for(jj=0;jj<2000;jj+=TTJ){ mini = MIN(ii+TTI,2000); for(i=ii;i<mini;i++){ mink = MIN(kk+TTK,2000); for(k=kk;k<mink;k++){ minj = MIN(jj+TTJ,2000); for(j=jj;j<minj;j++){ // indice mas externo se encuentra en la dimension contigua AA[i][j] += B[i][k]*C[k][j]; } } } } } } printDiff(A, AA, 2000, 2000, 100, 1.0e-3f); }
// Debug/report helper for a P*A*Q diagonalization: prints the diagonalized
// matrix, the transforms P and Q with their inverses, then recomputes
// P*Acopy*Q, P*Pinv and Q*Qinv as sanity checks.
// NOTE(review): the "diag:" label prints A — presumably A already holds the
// diagonalized matrix while Acopy holds the original; confirm with the caller.
void printAll(float A[][M][maxDegree+1], float Acopy[][M][maxDegree+1],
              float P[][N][maxDegree+1], float Pinv[][N][maxDegree+1],
              float Q[][M][maxDegree+1], float Qinv[][M][maxDegree+1],
              float PAtest[][M][maxDegree+1], float diagTest[][N][maxDegree+1],
              float PPinvTest[][N][maxDegree+1], float QQinvTest[][M][maxDegree+1]) {
    printf("diag: ");
    print2ArrayM(A, N);
    printf("P: ");
    print2ArrayN(P, N);
    printf("Q: ");
    print2ArrayM(Q, M);
    printf("Pinv: ");
    print2ArrayN(Pinv, N);
    printf("Qinv: ");
    print2ArrayM(Qinv, M);
    // Recompute P * Acopy * Q; it should reproduce the diagonal form.
    matNNxmatNM(P, Acopy, PAtest);
    matNMxmatMM(PAtest, Q, diagTest);
    printf("diagTest: ");
    print2ArrayM(diagTest, N);
    // P * Pinv should be the identity; scrub numerical noise before printing.
    matNNxmatNN(P, Pinv, PPinvTest);
    int n1, n2;
    for(n1 = 0; n1 < N; ++n1) {
        for(n2 = 0; n2 < N; ++n2) {
            clearZeroes2(PPinvTest[n1][n2]);
        }
    }
    printf("PPinvTest: ");
    print2ArrayM(PPinvTest, N);
    // Q * Qinv should likewise be the identity.
    matNNxmatNN(Q, Qinv, QQinvTest);
    int m1, m2;
    for(m1 = 0; m1 < M; ++m1) {
        for(m2 = 0; m2 < M; ++m2) {
            clearZeroes2(QQinvTest[m1][m2]);
        }
    }
    printf("QQinvTest: ");
    print2ArrayM(QQinvTest, M);
    // With more equations than unknowns (N > M) the system is consistent
    // only if the extra rows satisfy the conditions printed here.
    if (N > M) {
        printf("Given this matrix, the following are conditions that must be satisfied so this system of differential equations is consistent:\n\n");
        printDiff(P, M);
    }
}
int run() { _sleep = getParam( "sleep" , _sleep ); auth(); BSONObj prev = getData(); while ( true ) { sleepsecs( _sleep ); BSONObj now; try { now = getData(); } catch ( std::exception& e ) { cout << "can't get data: " << e.what() << endl; continue; } if ( now.isEmpty() ) return -2; try { printDiff( prev , now ); } catch ( AssertionException& e ) { cout << "\nerror: " << e.what() << "\n" << now << endl; } prev = now; } return 0; }
//////////////////////////////////////////////////////////////////////////////// //! Run a simple test for //////////////////////////////////////////////////////////////////////////////// int runTest(int argc, const char** argv) { cl_platform_id cpPlatform = NULL; cl_uint ciDeviceCount = 0; cl_device_id *cdDevices = NULL; cl_int ciErrNum = CL_SUCCESS; //Get the NVIDIA platform ciErrNum = oclGetPlatformID(&cpPlatform); if (ciErrNum != CL_SUCCESS) { shrLog("Error: Failed to create OpenCL context!\n"); return ciErrNum; } //Get the devices ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 0, NULL, &ciDeviceCount); cdDevices = (cl_device_id *)malloc(ciDeviceCount * sizeof(cl_device_id) ); ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, ciDeviceCount, cdDevices, NULL); if (ciErrNum != CL_SUCCESS) { shrLog("Error: Failed to create OpenCL context!\n"); return ciErrNum; } //Create the context cxGPUContext = clCreateContext(0, ciDeviceCount, cdDevices, NULL, NULL, &ciErrNum); if (ciErrNum != CL_SUCCESS) { shrLog("Error: Failed to create OpenCL context!\n"); return ciErrNum; } if(shrCheckCmdLineFlag(argc, (const char**)argv, "device")) { // User specified GPUs char* deviceList; char* deviceStr; char* next_token; shrGetCmdLineArgumentstr(argc, (const char**)argv, "device", &deviceList); #ifdef WIN32 deviceStr = strtok_s (deviceList," ,.-", &next_token); #else deviceStr = strtok (deviceList," ,.-"); #endif ciDeviceCount = 0; while(deviceStr != NULL) { // get and print the device for this queue cl_device_id device = oclGetDev(cxGPUContext, atoi(deviceStr)); if( device == (cl_device_id) -1 ) { shrLog(" Device %s does not exist!\n", deviceStr); return -1; } shrLog("Device %s: ", deviceStr); oclPrintDevName(LOGBOTH, device); shrLog("\n"); // create command queue commandQueue[ciDeviceCount] = clCreateCommandQueue(cxGPUContext, device, CL_QUEUE_PROFILING_ENABLE, &ciErrNum); if (ciErrNum != CL_SUCCESS) { shrLog(" Error %i in clCreateCommandQueue call !!!\n\n", 
ciErrNum); return ciErrNum; } ++ciDeviceCount; #ifdef WIN32 deviceStr = strtok_s (NULL," ,.-", &next_token); #else deviceStr = strtok (NULL," ,.-"); #endif } free(deviceList); } else { // Find out how many GPU's to compute on all available GPUs size_t nDeviceBytes; ciErrNum |= clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, 0, NULL, &nDeviceBytes); ciDeviceCount = (cl_uint)nDeviceBytes/sizeof(cl_device_id); if (ciErrNum != CL_SUCCESS) { shrLog(" Error %i in clGetDeviceIDs call !!!\n\n", ciErrNum); return ciErrNum; } else if (ciDeviceCount == 0) { shrLog(" There are no devices supporting OpenCL (return code %i)\n\n", ciErrNum); return -1; } // create command-queues for(unsigned int i = 0; i < ciDeviceCount; ++i) { // get and print the device for this queue cl_device_id device = oclGetDev(cxGPUContext, i); shrLog("Device %d: ", i); oclPrintDevName(LOGBOTH, device); shrLog("\n"); // create command queue commandQueue[i] = clCreateCommandQueue(cxGPUContext, device, CL_QUEUE_PROFILING_ENABLE, &ciErrNum); if (ciErrNum != CL_SUCCESS) { shrLog(" Error %i in clCreateCommandQueue call !!!\n\n", ciErrNum); return ciErrNum; } } } // Optional Command-line multiplier for matrix sizes shrGetCmdLineArgumenti(argc, (const char**)argv, "sizemult", &iSizeMultiple); iSizeMultiple = CLAMP(iSizeMultiple, 1, 10); uiWA = WA * iSizeMultiple; uiHA = HA * iSizeMultiple; uiWB = WB * iSizeMultiple; uiHB = HB * iSizeMultiple; uiWC = WC * iSizeMultiple; uiHC = HC * iSizeMultiple; shrLog("\nUsing Matrix Sizes: A(%u x %u), B(%u x %u), C(%u x %u)\n", uiWA, uiHA, uiWB, uiHB, uiWC, uiHC); // allocate host memory for matrices A and B unsigned int size_A = uiWA * uiHA; unsigned int mem_size_A = sizeof(float) * size_A; float* h_A_data = (float*)malloc(mem_size_A); unsigned int size_B = uiWB * uiHB; unsigned int mem_size_B = sizeof(float) * size_B; float* h_B_data = (float*)malloc(mem_size_B); // initialize host memory srand(2006); shrFillArray(h_A_data, size_A); shrFillArray(h_B_data, size_B); // 
allocate host memory for result unsigned int size_C = uiWC * uiHC; unsigned int mem_size_C = sizeof(float) * size_C; float* h_C = (float*) malloc(mem_size_C); // create OpenCL buffer pointing to the host memory cl_mem h_A = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, mem_size_A, h_A_data, &ciErrNum); if (ciErrNum != CL_SUCCESS) { shrLog("Error: clCreateBuffer\n"); return ciErrNum; } // Program Setup size_t program_length; const char* header_path = shrFindFilePath("matrixMul.h", argv[0]); oclCheckError(header_path != NULL, shrTRUE); char* header = oclLoadProgSource(header_path, "", &program_length); if(!header) { shrLog("Error: Failed to load the header %s!\n", header_path); return -1000; } const char* source_path = shrFindFilePath("matrixMul.cl", argv[0]); oclCheckError(source_path != NULL, shrTRUE); char *source = oclLoadProgSource(source_path, header, &program_length); if(!source) { shrLog("Error: Failed to load compute program %s!\n", source_path); return -2000; } // create the program cl_program cpProgram = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&source, &program_length, &ciErrNum); if (ciErrNum != CL_SUCCESS) { shrLog("Error: Failed to create program\n"); return ciErrNum; } free(header); free(source); // build the program ciErrNum = clBuildProgram(cpProgram, 0, NULL, "-cl-fast-relaxed-math", NULL, NULL); if (ciErrNum != CL_SUCCESS) { // write out standard error, Build Log and PTX, then return error shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR); oclLogBuildInfo(cpProgram, oclGetFirstDev(cxGPUContext)); oclLogPtx(cpProgram, oclGetFirstDev(cxGPUContext), "oclMatrixMul.ptx"); return ciErrNum; } // write out PTX if requested on the command line if(shrCheckCmdLineFlag(argc, argv, "dump-ptx") ) { oclLogPtx(cpProgram, oclGetFirstDev(cxGPUContext), "oclMatrixMul.ptx"); } // Create Kernel for(unsigned int i = 0; i < ciDeviceCount; ++i) { multiplicationKernel[i] = clCreateKernel(cpProgram, "matrixMul", &ciErrNum); if 
(ciErrNum != CL_SUCCESS) { shrLog("Error: Failed to create kernel\n"); return ciErrNum; } } // Run multiplication on 1..deviceCount GPUs to compare improvement shrLog("\nRunning Computations on 1 - %d GPU's...\n\n", ciDeviceCount); for(unsigned int k = 1; k <= ciDeviceCount; ++k) { matrixMulGPU(k, h_A, h_B_data, mem_size_B, h_C); } // compute reference solution shrLog("Comparing results with CPU computation... \n\n"); float* reference = (float*) malloc(mem_size_C); computeGold(reference, h_A_data, h_B_data, uiHA, uiWA, uiWB); // check result shrBOOL res = shrCompareL2fe(reference, h_C, size_C, 1.0e-6f); if (res != shrTRUE) { printDiff(reference, h_C, uiWC, uiHC, 100, 1.0e-5f); } // clean up OCL resources ciErrNum = clReleaseMemObject(h_A); for(unsigned int k = 0; k < ciDeviceCount; ++k) { ciErrNum |= clReleaseKernel( multiplicationKernel[k] ); ciErrNum |= clReleaseCommandQueue( commandQueue[k] ); } ciErrNum |= clReleaseProgram(cpProgram); ciErrNum |= clReleaseContext(cxGPUContext); if(ciErrNum != CL_SUCCESS) { shrLog("Error: Failure releasing OpenCL resources: %d\n", ciErrNum); return ciErrNum; } // clean up memory free(h_A_data); free(h_B_data); free(h_C); free(reference); return ((shrTRUE == res) ? CL_SUCCESS : -3000); }
// Entry point: delegates entirely to printDiff(); the command-line arguments
// are accepted but unused.
int main(int argc, char** argv) {
    (void)argc;
    (void)argv;
    printDiff();
    return 0;
}
////////////////////////////////////////////////////////////////////////////////
//! Run a simple test matrix multiply using CUBLAS.
//! Fills A and B with random data, times nIter cublasSgemm calls with CUDA
//! events, reports GFlop/s, and verifies the device result against a
//! single-threaded CPU reference (matrixMulCPU).
//! Returns EXIT_SUCCESS (0) when GPU and CPU results match, else EXIT_FAILURE.
////////////////////////////////////////////////////////////////////////////////
int matrixMultiply(int argc, char **argv, int devID, sMatrixSize &matrix_size)
{
    cudaDeviceProp deviceProp;
    checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID));

    // use a larger block size for Fermi and above
    int block_size = (deviceProp.major < 2) ? 16 : 32;

    // set seed for rand()
    srand(2006);

    // allocate host memory for matrices A and B
    unsigned int size_A = matrix_size.uiWA * matrix_size.uiHA;
    unsigned int mem_size_A = sizeof(float) * size_A;
    float *h_A = (float *)malloc(mem_size_A);
    unsigned int size_B = matrix_size.uiWB * matrix_size.uiHB;
    unsigned int mem_size_B = sizeof(float) * size_B;
    float *h_B = (float *)malloc(mem_size_B);

    // set seed for rand()
    srand(2006);

    // initialize host memory
    randomInit(h_A, size_A);
    randomInit(h_B, size_B);

    // allocate device memory
    float *d_A, *d_B, *d_C;
    unsigned int size_C = matrix_size.uiWC * matrix_size.uiHC;
    unsigned int mem_size_C = sizeof(float) * size_C;

    // allocate host memory for the result
    // (h_C is allocated but never written here — only h_CUBLAS receives
    //  the device result; h_C is just freed below)
    float *h_C = (float *) malloc(mem_size_C);
    float *h_CUBLAS = (float *) malloc(mem_size_C);

    checkCudaErrors(cudaMalloc((void **) &d_A, mem_size_A));
    checkCudaErrors(cudaMalloc((void **) &d_B, mem_size_B));
    checkCudaErrors(cudaMemcpy(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice));
    checkCudaErrors(cudaMemcpy(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice));
    checkCudaErrors(cudaMalloc((void **) &d_C, mem_size_C));

    // setup execution parameters
    dim3 threads(block_size, block_size);
    dim3 grid(matrix_size.uiWC / threads.x, matrix_size.uiHC / threads.y);

    // create and start timer
    printf("Computing result using CUBLAS...");

    // execute the kernel
    int nIter = 30;

    // CUBLAS version 2.0
    {
        const float alpha = 1.0f;
        const float beta = 0.0f;
        cublasHandle_t handle;
        cudaEvent_t start, stop;
        checkCudaErrors(cublasCreate(&handle));

        // Perform warmup operation with cublas
        checkCudaErrors(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, matrix_size.uiWB, matrix_size.uiHA, matrix_size.uiWA, &alpha, d_B, matrix_size.uiWB, d_A, matrix_size.uiWA, &beta, d_C, matrix_size.uiWA));

        // Allocate CUDA events that we'll use for timing
        checkCudaErrors(cudaEventCreate(&start));
        checkCudaErrors(cudaEventCreate(&stop));

        // Record the start event
        checkCudaErrors(cudaEventRecord(start, NULL));

        for (int j = 0; j < nIter; j++)
        {
            // note cublas is column primary! need to transpose the order
            checkCudaErrors(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, matrix_size.uiWB, matrix_size.uiHA, matrix_size.uiWA, &alpha, d_B, matrix_size.uiWB, d_A, matrix_size.uiWA, &beta, d_C, matrix_size.uiWA));
        }

        printf("done.\n");

        // Record the stop event
        checkCudaErrors(cudaEventRecord(stop, NULL));

        // Wait for the stop event to complete
        checkCudaErrors(cudaEventSynchronize(stop));

        float msecTotal = 0.0f;
        checkCudaErrors(cudaEventElapsedTime(&msecTotal, start, stop));

        // Compute and print the performance
        float msecPerMatrixMul = msecTotal / nIter;
        double flopsPerMatrixMul = 2.0 * (double)matrix_size.uiWA * (double)matrix_size.uiHA * (double)matrix_size.uiWB;
        double gigaFlops = (flopsPerMatrixMul * 1.0e-9f) / (msecPerMatrixMul / 1000.0f);
        printf(
            "Performance= %.2f GFlop/s, Time= %.3f msec, Size= %.0f Ops\n",
            gigaFlops,
            msecPerMatrixMul,
            flopsPerMatrixMul);

        // copy result from device to host
        checkCudaErrors(cudaMemcpy(h_CUBLAS, d_C, mem_size_C, cudaMemcpyDeviceToHost));

        // Destroy the handle
        checkCudaErrors(cublasDestroy(handle));
    }

    // compute reference solution
    printf("Computing result using host CPU...");
    float *reference = (float *)malloc(mem_size_C);
    matrixMulCPU(reference, h_A, h_B, matrix_size.uiHA, matrix_size.uiWA, matrix_size.uiWB);
    printf("done.\n");

    // check result (CUBLAS)
    bool resCUBLAS = sdkCompareL2fe(reference, h_CUBLAS, size_C, 1.0e-6f);

    if (resCUBLAS != true)
    {
        printDiff(reference, h_CUBLAS, matrix_size.uiWC, matrix_size.uiHC, 100, 1.0e-5f);
    }

    printf("Comparing CUBLAS Matrix Multiply with CPU results: %s\n", (true == resCUBLAS) ? "PASS" : "FAIL");

    printf("\nNOTE: The CUDA Samples are not meant for performance measurements. Results may vary when GPU Boost is enabled.\n");

    // clean up memory
    free(h_A);
    free(h_B);
    free(h_C);
    free(reference);
    checkCudaErrors(cudaFree(d_A));
    checkCudaErrors(cudaFree(d_B));
    checkCudaErrors(cudaFree(d_C));

    // cudaDeviceReset causes the driver to clean up all state. While
    // not mandatory in normal operation, it is good practice. It is also
    // needed to ensure correct operation when the application is being
    // profiled. Calling cudaDeviceReset causes all profile data to be
    // flushed before the application exits
    cudaDeviceReset();

    if (resCUBLAS == true)
    {
        return EXIT_SUCCESS;    // EXIT_SUCCESS is 0 (the original comment wrongly said 1)
    }
    else
    {
        return EXIT_FAILURE;    // EXIT_FAILURE is 1 (the original comment wrongly said 0)
    }
}
// Exclusive prefix sum over a device array of `arrayLength` uints.
// Pads the problem size N up to what the scan kernels require, allocates an
// output buffer, scans, frees the caller's input buffer and replaces
// *d_Input with the output. Returns the output pointer, or 0 when
// arrayLength falls outside the supported ranges.
// NOTE(review): start/stop only bracket initScan(), so the printed
// "inside scan time" does not include the scan itself — confirm intent.
CUdeviceptr presum(CUdeviceptr *d_Input, uint arrayLength)
{
    uint N = 0;
    CUdeviceptr d_Output;
    struct timeval start, stop;

    gettimeofday(&start, NULL);
    initScan();
    gettimeofday(&stop, NULL);

    if (arrayLength <= MAX_SHORT_ARRAY_SIZE && arrayLength > MIN_SHORT_ARRAY_SIZE) {
        // Round N up to the next power of two >= arrayLength (starting at 4).
        for (uint i = 4; i <= MAX_SHORT_ARRAY_SIZE; i <<= 1) {
            if (arrayLength <= i) {
                N = i;
                break;
            }
        }
        checkCudaErrors(cudaMalloc((void **)&d_Output, N * sizeof(uint)));
        checkCudaErrors(cudaDeviceSynchronize());
        scanExclusiveShort((uint *)d_Output, (uint *)(*d_Input), N);
        //szWorkgroup = scanExclusiveShort((uint *)d_Output, (uint *)d_Input, 1, N);
        checkCudaErrors(cudaDeviceSynchronize());
    } else if (arrayLength <= MAX_LARGE_ARRAY_SIZE) {
        // Round N up to a whole multiple of the short-scan size.
        N = MAX_SHORT_ARRAY_SIZE * iDivUp(arrayLength, MAX_SHORT_ARRAY_SIZE);
        checkCudaErrors(cudaMalloc((void **)&d_Output, N * sizeof(uint)));
        checkCudaErrors(cudaDeviceSynchronize());
        scanExclusiveLarge((uint *)d_Output, (uint *)(*d_Input), N);
        checkCudaErrors(cudaDeviceSynchronize());
    } else if (arrayLength <= MAX_LL_SIZE) {
        // Round N up to a whole multiple of the large-scan size.
        N = MAX_LARGE_ARRAY_SIZE * iDivUp(arrayLength, MAX_LARGE_ARRAY_SIZE);
        printf("N = %d\n", N);
        checkCudaErrors(cudaMalloc((void **)&d_Output, N * sizeof(uint)));
        checkCudaErrors(cudaDeviceSynchronize());
        scanExclusiveLL((uint *)d_Output, (uint *)(*d_Input), N);
        checkCudaErrors(cudaDeviceSynchronize());
    } else {
        // Unsupported size. BUGFIX: the original called cuMemFree(d_Output)
        // here even though d_Output was never allocated on this path (UB).
        closeScan();
        return 0;
    }

    closeScan();
    // The caller's input buffer is consumed and replaced by the scan result.
    cuMemFree(*d_Input);
    *d_Input = d_Output;
    printf("inside scan time:\n");
    printDiff(start, stop);
    return d_Output;
}
//////////////////////////////////////////////////////////////////////////////// //! Run a simple test matrix multiply using CUBLAS //////////////////////////////////////////////////////////////////////////////// int matrixMultiply(int argc, char **argv, int devID, sMatrixSize &matrix_size) { cudaDeviceProp deviceProp; cudaError_t error; error = cudaGetDeviceProperties(&deviceProp, devID); if (error != cudaSuccess) { printf("cudaGetDeviceProperties returned error code %d, line(%d)\n", error, __LINE__); exit(EXIT_FAILURE); } // use a larger block size for Fermi and above int block_size = (deviceProp.major < 2) ? 16 : 32; // set seed for rand() srand(2006); // allocate host memory for matrices A and B unsigned int size_A = matrix_size.uiWA * matrix_size.uiHA; unsigned int mem_size_A = sizeof(float) * size_A; float *h_A = (float *)malloc(mem_size_A); unsigned int size_B = matrix_size.uiWB * matrix_size.uiHB; unsigned int mem_size_B = sizeof(float) * size_B; float *h_B = (float *)malloc(mem_size_B); // set seed for rand() srand(2006); // initialize host memory randomInit(h_A, size_A); randomInit(h_B, size_B); // allocate device memory float *d_A, *d_B, *d_C; unsigned int size_C = matrix_size.uiWC * matrix_size.uiHC; unsigned int mem_size_C = sizeof(float) * size_C; // allocate host memory for the result float *h_C = (float *) malloc(mem_size_C); float *h_CUBLAS = (float *) malloc(mem_size_C); error = cudaMalloc((void **) &d_A, mem_size_A); if (error != cudaSuccess) { printf("cudaMalloc d_A returned error code %d, line(%d)\n", error, __LINE__); exit(EXIT_FAILURE); } error = cudaMalloc((void **) &d_B, mem_size_B); if (error != cudaSuccess) { printf("cudaMalloc d_B returned error code %d, line(%d)\n", error, __LINE__); exit(EXIT_FAILURE); } error = cudaMalloc((void **) &d_C, mem_size_C); if (error != cudaSuccess) { printf("cudaMalloc d_C returned error code %d, line(%d)\n", error, __LINE__); exit(EXIT_FAILURE); } // create and start timer StopWatchInterface *timerMemIn = 
NULL; sdkCreateTimer(&timerMemIn); // start the timer sdkStartTimer(&timerMemIn); // copy host memory to device error = cudaMemcpy(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice); if (error != cudaSuccess) { printf("cudaMemcpy d_A h_A returned error code %d, line(%d)\n", error, __LINE__); exit(EXIT_FAILURE); } error = cudaMemcpy(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice); if (error != cudaSuccess) { printf("cudaMemcpy d_B h_B returned error code %d, line(%d)\n", error, __LINE__); exit(EXIT_FAILURE); } sdkStopTimer(&timerMemIn); printf("\nMemory H2D Transferring time: %f (ms)\n", sdkGetTimerValue(&timerMemIn)); sdkDeleteTimer(&timerMemIn); // setup execution parameters dim3 threads(block_size, block_size); dim3 grid(matrix_size.uiWC / threads.x, matrix_size.uiHC / threads.y); // create and start timer printf("Computing result using CUBLAS..."); // execute the kernel int nIter = 30; // CUBLAS version 2.0 { cublasHandle_t handle; cublasStatus_t ret; ret = cublasCreate(&handle); if (ret != CUBLAS_STATUS_SUCCESS) { printf("cublasCreate returned error code %d, line(%d)\n", ret, __LINE__); exit(EXIT_FAILURE); } const float alpha = 1.0f; const float beta = 0.0f; //Perform warmup operation with cublas ret = cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, matrix_size.uiWB, matrix_size.uiHA, matrix_size.uiWA, &alpha, d_B, matrix_size.uiWB, d_A, matrix_size.uiWA, &beta, d_C, matrix_size.uiWA); if (ret != CUBLAS_STATUS_SUCCESS) { printf("cublasSgemm returned error code %d, line(%d)\n", ret, __LINE__); exit(EXIT_FAILURE); } // Allocate CUDA events that we'll use for timing cudaEvent_t start; error = cudaEventCreate(&start); if (error != cudaSuccess) { fprintf(stderr, "Failed to create start event (error code %s)!\n", cudaGetErrorString(error)); exit(EXIT_FAILURE); } cudaEvent_t stop; error = cudaEventCreate(&stop); if (error != cudaSuccess) { fprintf(stderr, "Failed to create stop event (error code %s)!\n", cudaGetErrorString(error)); exit(EXIT_FAILURE); } // Record the start 
event error = cudaEventRecord(start, NULL); if (error != cudaSuccess) { fprintf(stderr, "Failed to record start event (error code %s)!\n", cudaGetErrorString(error)); exit(EXIT_FAILURE); } for (int j = 0; j < nIter; j++) { //note cublas is column primary! //need to transpose the order ret = cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, matrix_size.uiWB, matrix_size.uiHA, matrix_size.uiWA, &alpha, d_B, matrix_size.uiWB, d_A, matrix_size.uiWA, &beta, d_C, matrix_size.uiWA); if (ret != CUBLAS_STATUS_SUCCESS) { printf("cublasSgemm returned error code %d, line(%d)\n", ret, __LINE__); exit(EXIT_FAILURE); } } printf("done.\n"); // Record the stop event error = cudaEventRecord(stop, NULL); if (error != cudaSuccess) { fprintf(stderr, "Failed to record stop event (error code %s)!\n", cudaGetErrorString(error)); exit(EXIT_FAILURE); } // Wait for the stop event to complete error = cudaEventSynchronize(stop); if (error != cudaSuccess) { fprintf(stderr, "Failed to synchronize on the stop event (error code %s)!\n", cudaGetErrorString(error)); exit(EXIT_FAILURE); } float msecTotal = 0.0f; error = cudaEventElapsedTime(&msecTotal, start, stop); if (error != cudaSuccess) { fprintf(stderr, "Failed to get time elapsed between events (error code %s)!\n", cudaGetErrorString(error)); exit(EXIT_FAILURE); } // Compute and print the performance float msecPerMatrixMul = msecTotal / nIter; double flopsPerMatrixMul = 2.0 * (double)matrix_size.uiWA * (double)matrix_size.uiHA * (double)matrix_size.uiWB; double gigaFlops = (flopsPerMatrixMul * 1.0e-9f) / (msecPerMatrixMul / 1000.0f); printf( "Performance= %.2f GFlop/s, Time= %.3f msec, Size= %.0f Ops\n", gigaFlops, msecPerMatrixMul, flopsPerMatrixMul); // create and start timer StopWatchInterface *timerMemOut = NULL; sdkCreateTimer(&timerMemOut); // start the timer sdkStartTimer(&timerMemOut); // copy result from device to host error = cudaMemcpy(h_CUBLAS, d_C, mem_size_C, cudaMemcpyDeviceToHost); sdkStopTimer(&timerMemOut); printf("\Memory D2H 
Transferring time: %f (ms)\n", sdkGetTimerValue(&timerMemOut)); sdkDeleteTimer(&timerMemOut); if (error != cudaSuccess) { printf("cudaMemcpy h_CUBLAS d_C returned error code %d, line(%d)\n", error, __LINE__); exit(EXIT_FAILURE); } checkError(cublasDestroy(handle), "cublasDestroy() error!\n"); } // compute reference solution printf("Computing result using host CPU..."); float *reference = (float *)malloc(mem_size_C); // create and start timer StopWatchInterface *timer = NULL; sdkCreateTimer(&timer); // start the timer sdkStartTimer(&timer); matrixMulCPU(reference, h_A, h_B, matrix_size.uiHA, matrix_size.uiWA, matrix_size.uiWB); sdkStopTimer(&timer); printf("\nCPU Processing time: %f (ms)\n", sdkGetTimerValue(&timer)); sdkDeleteTimer(&timer); printf("done.\n"); // check result (CUBLAS) bool resCUBLAS = sdkCompareL2fe(reference, h_CUBLAS, size_C, 1.0e-6f); if (resCUBLAS != true) { printDiff(reference, h_CUBLAS, matrix_size.uiWC, matrix_size.uiHC, 100, 1.0e-5f); } printf("Comparing CUBLAS Matrix Multiply with CPU results: %s\n", (true == resCUBLAS) ? "PASS" : "FAIL"); // clean up memory free(h_A); free(h_B); free(h_C); free(reference); cudaFree(d_A); cudaFree(d_B); cudaFree(d_C); cudaDeviceReset(); if (resCUBLAS == true) { return EXIT_SUCCESS; // return value = 1 } else { return EXIT_FAILURE; // return value = 0 } }
int main(int argc, const char** argv) { cl_uint platform_count; cl_platform_id platforms[5]; cl_int err = CL_SUCCESS; unsigned int i, p; cl_device_type dev_type = CL_DEVICE_TYPE_ALL; void * ptrs[BLOCKS]; cl_command_queue cqs[BLOCKS]; cl_mem d_A[BLOCKS]; cl_mem d_C[BLOCKS]; cl_mem d_B[BLOCKS]; cl_event GPUDone[BLOCKS]; cl_event GPUExecution[BLOCKS]; struct timeval start, end; int workOffset[BLOCKS]; int workSize[BLOCKS]; unsigned int sizePerGPU = HC / BLOCKS; unsigned int sizeMod = HC % BLOCKS; size_t A_size = WA * HA; size_t A_mem_size = sizeof(TYPE) * A_size; TYPE* A_data; size_t B_size = WB * HB; size_t B_mem_size = sizeof(TYPE) * B_size; TYPE* B_data; size_t C_size = WC * HC; size_t C_mem_size = sizeof(TYPE) * C_size; TYPE* C_data; parse_args(argc, argv); check(clGetPlatformIDs(5, platforms, &platform_count)); if (platform_count == 0) { printf("No platform found\n"); exit(77); } cl_uint device_count; cl_uint devs[platform_count]; cl_device_id * devices[platform_count]; cl_context ctx[platform_count]; cl_command_queue * commandQueue[platform_count]; device_count = 0; for (p=0; p<platform_count; p++) { cl_platform_id platform = platforms[p]; err = clGetDeviceIDs(platform, dev_type, 0, NULL, &devs[p]); if (err == CL_DEVICE_NOT_FOUND) { devs[p] = 0; continue; } if (devs[p] == 0) { printf("No OpenCL device found\n"); exit(77); } if (err != CL_SUCCESS) { fprintf(stderr, "OpenCL Error (%d) in clGetDeviceIDs()\n", err); exit(EXIT_FAILURE); } if (devs[p] == 0) continue; devices[p] = (cl_device_id*)malloc(sizeof(cl_device_id) * devs[p]); commandQueue[p] = (cl_command_queue*)malloc(sizeof(cl_command_queue) * devs[p]); check(clGetDeviceIDs(platform, dev_type, devs[p], devices[p], NULL)); cl_context_properties properties[] = {CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0}; check2(ctx[p] = clCreateContext(properties, devs[p], devices[p], NULL, NULL, &err)); for(i = 0; i < devs[p]; ++i) { cl_device_id device = devices[p][i]; char name[2048]; name[0] = '\0'; 
clGetDeviceInfo(device, CL_DEVICE_NAME, 2048, name, NULL); printf("Device %d: %s\n", i, name); check2(commandQueue[p][i] = clCreateCommandQueue(ctx[p], device, CL_QUEUE_PROFILING_ENABLE | CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &err)); } device_count += devs[p]; } if (device_count == 0) error("No device found\n"); cl_kernel multiplicationKernel[platform_count]; printf("\nUsing Matrix Sizes: A(%lu x %lu), B(%lu x %lu), C(%lu x %lu)\n", (unsigned long)WA, (unsigned long)HA, (unsigned long)WB, (unsigned long)HB, (unsigned long)WC, (unsigned long)HC); // allocate host memory for matrices A, B and C A_data = (TYPE*)malloc(A_mem_size); if (A_data == NULL) { perror("malloc"); } B_data = (TYPE*)malloc(B_mem_size); if (B_data == NULL) { perror("malloc"); } C_data = (TYPE*) malloc(C_mem_size); if (C_data == NULL) { perror("malloc"); } cl_program program[platform_count]; for (p=0; p<platform_count; p++) { if (devs[p] == 0) continue; check2(program[p] = clCreateProgramWithSource(ctx[p], 1, (const char **)&code, NULL, &err)); check(clBuildProgram(program[p], 0, NULL, NULL, NULL, NULL)); check2(multiplicationKernel[p] = clCreateKernel(program[p], "sgemmNN", &err)); } printf("Initializing data...\n"); srand(2008); fillArray(A_data, A_size); fillArray(B_data, B_size); memset(C_data, 0, C_size); printf("Computing...\n"); workOffset[0] = 0; gettimeofday(&start, NULL); size_t localWorkSize[] = {BLOCK_SIZE, BLOCK_SIZE}; int c = 0; for (p=0; p<platform_count;p++) { for (i=0; i<devs[p]; i++) { check2(d_B[c] = clCreateBuffer(ctx[p], CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, HB * WB * sizeof(TYPE), B_data, &err)); c++; } } for(i=0; i < BLOCKS; ++i) { int d = i % device_count; cl_uint platform = 0; // determine device platform int dev = d; for (platform = 0; platform < platform_count; platform++) { if ((cl_int)(dev - devs[platform]) < 0) break; dev -= devs[platform]; } workSize[i] = (i < sizeMod) ? 
sizePerGPU+1 : sizePerGPU; check2(d_A[i] = clCreateBuffer(ctx[platform], CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, workSize[i] * WA * sizeof(TYPE), &A_data[workOffset[i] * WA], &err)); check2(d_C[i] = clCreateBuffer(ctx[platform], CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, workSize[i] * WC * sizeof(TYPE), &C_data[workOffset[i] * WC], &err)); check(clSetKernelArg(multiplicationKernel[platform], 0, sizeof(cl_int), &workSize[i])); check(clSetKernelArg(multiplicationKernel[platform], 1, sizeof(cl_int), &workSize[i])); check(clSetKernelArg(multiplicationKernel[platform], 2, sizeof(cl_int), &workSize[i])); check(clSetKernelArg(multiplicationKernel[platform], 3, sizeof(cl_mem), (void *) &d_A[i])); check(clSetKernelArg(multiplicationKernel[platform], 4, sizeof(cl_mem), (void *) &d_B[d])); check(clSetKernelArg(multiplicationKernel[platform], 5, sizeof(cl_mem), (void *) &d_C[i])); size_t globalWorkSize[] = {roundUp(BLOCK_SIZE,WC), roundUp(BLOCK_SIZE,workSize[i])}; check(clEnqueueNDRangeKernel(commandQueue[platform][dev], multiplicationKernel[platform], 2, NULL, globalWorkSize, localWorkSize, 0, NULL, &GPUExecution[i])); // Non-blocking copy of result from device to host cqs[i] = commandQueue[platform][dev]; check2(ptrs[i] = clEnqueueMapBuffer(cqs[i], d_C[i], CL_FALSE, CL_MAP_READ, 0, WC * sizeof(TYPE) * workSize[i], 1, &GPUExecution[i], &GPUDone[i], &err)); if(i+1 < BLOCKS) workOffset[i + 1] = workOffset[i] + workSize[i]; } // CPU sync with GPU for (p=0; p<platform_count;p++) { cl_uint dev; for (dev=0; dev<devs[p]; dev++) { clFinish(commandQueue[p][dev]); } } gettimeofday(&end, NULL); double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec)); double dSeconds = timing/1000/1000; double dNumOps = 2.0 * (double)WA * (double)HA * (double)WB; double gflops = 1.0e-9 * dNumOps/dSeconds; printf("Throughput = %.4f GFlops/s, Time = %.5f s, Size = %.0f, NumDevsUsed = %d, Blocks = %ld, Workgroup = %zu\n", gflops, dSeconds, dNumOps, device_count, BLOCKS, 
localWorkSize[0] * localWorkSize[1]); // compute reference solution if (check) { printf("Comparing results with CPU computation... "); TYPE* reference = (TYPE*)malloc(C_mem_size); computeReference(reference, A_data, B_data, HA, WA, WB); // check result int res = shrCompareL2fe(reference, C_data, C_size, 1.0e-6f); if (res == 0) { printf("\n\n"); printDiff(reference, C_data, WC, HC, 100, 1.0e-5f); } else printf("PASSED\n\n"); free(reference); } for(i = 0; i < BLOCKS; i++) { clEnqueueUnmapMemObject(cqs[i], d_C[i], ptrs[i], 0, NULL, NULL); } for(i = 0; i < BLOCKS; i++) { clFinish(cqs[i]); } for (i=0; i<device_count; i++) { clReleaseMemObject(d_B[i]); } for(i = 0; i < BLOCKS; i++) { clReleaseMemObject(d_A[i]); clReleaseMemObject(d_C[i]); clReleaseEvent(GPUExecution[i]); clReleaseEvent(GPUDone[i]); } for (p=0; p<platform_count;p++) { if (devs[p] == 0) continue; check(clReleaseKernel(multiplicationKernel[p])); check(clReleaseProgram(program[p])); check(clReleaseContext(ctx[p])); cl_uint k; for(k = 0; k < devs[p]; ++k) { check(clReleaseCommandQueue(commandQueue[p][k])); } } free(A_data); free(B_data); free(C_data); return 0; }