Пример #1
0
int main() {
    int nU, nX, nY; // Довжина універсалу, множин X, Y
    printf("Введіть універсальну множину");
    int* universal = inputSet(nU);
    printf("\nВведіть множину Х");
    int* x = inputSet(nX, universal, nU);
    printf("\nВведіть множину У");
    int *y = inputSet(nY, universal, nU);

    printf("\nОтримані множини:");
    printSet(universal, nU);
    printSet(x, nX);
    printSet(y, nY);

    printUnion(x, nX, y, nY);
    printIntersect(x, nX, y, nY);
    printDiff(x, nX, y, nY);
    printDiff(y, nY, x, nX);
    printInverse(x, nX, universal, nU);
    printInverse(y, nY, universal, nU);
    printLinearMultiply(x, nX, y, nY);

    free(universal);
    free(x);
    free(y);
}
Пример #2
0
bool
TestMaster::compare(const char *file, uint32_t line,
                    const char *aName, const char *bName,
                    const char *opText,
                    const A &a, const B &b, const OP &op, bool fatal)
{
    if (op(a,b)) {
        ++threadState().passCnt;
        return true;
    }
    std::string str;
    str += aName;
    str += opText;
    str += bName;
    std::ostringstream lhs;
    std::ostringstream rhs;
    lhs << a;
    rhs << b;
    {
        vespalib::LockGuard guard(_lock);
        checkFailed(guard, file, line, str.c_str());
        printDiff(guard, str, file, line, lhs.str(), rhs.str());
        handleFailure(guard, fatal);
    }
    return false;
}
Пример #3
0
        int run() {
            if (isMongos()) {
                toolError() << "mongotop only works on instances of mongod." << std::endl;
                return EXIT_FAILURE;
            }

            NamespaceStats prev = getData();

            while ( true ) {
                sleepsecs(mongoTopGlobalParams.sleep);
                
                NamespaceStats now;
                try {
                    now = getData();
                }
                catch ( std::exception& e ) {
                    toolError() << "can't get data: " << e.what() << std::endl;
                    continue;
                }

                if ( now.size() == 0 )
                    return -2;
                
                try {
                    printDiff( prev , now );
                }
                catch ( AssertionException& e ) {
                    toolError() << "\nerror: " << e.what() << std::endl;
                }

                prev = now;
            }

            return 0;
        }
Пример #4
0
int main(int argc, char *argv[])
{
  //INITILIZATION
  MPIInit(argc, argv);
  PTHREADInit(argc, argv);
  if (rank == 0) gettimeofday(&tvalBefore, NULL);

  //READFILE
  int fileArray[1000];

  for (int ix = 0; ix < 10; ix++) {
    int file = fileArray[ix];

    //Open new file with suffix ix
    openWriteFile((char *)nameGenerate("dummy", ix).c_str(), &file);

    //Print 0 to each file
    int num = 0;
    if (rank == 0) {
      lseek(file, 0, SEEK_SET);
      write(file, &num, sizeof(int));
    }

    //Generate lock parameter
    struct flock lock;
    memset(&lock, 0, sizeof(lock));

    MPIBarrier();

    //Lock file
    lock.l_type = F_WRLCK;
    fcntl(file,F_SETLKW, &lock);

    //Read from file num
    lseek(file, 0, SEEK_SET);
    read(file, &num, sizeof(int));
    num++;

    //Overwrite num+1 to file
    lseek(file, 0, SEEK_SET);
    write(file, &num,sizeof(int));
    printf("(Process %2d) ix = %4d | num = %4d\n",rank, ix, num);

    //Unlock the file
    lock.l_type = F_UNLCK;
    fcntl(file, F_SETLK, &lock);
  }

  //FINALIZATION
  MPIBarrier();
  if (rank == 0) {
    gettimeofday(&tvalAfter, NULL);
    printDiff(tvalBefore, tvalAfter);
  }
  MPIFinalize();
  PTHREADFinalize();

  return 0;
}
Пример #5
0
int
main()
{
	int i, j, k, ii, jj, kk;
	int TTI, TTJ, TTK;
	int mini, minj, mink;
	
	TTI = 2000/8;
	TTJ = 64/2;
	TTK = 64/(2*2);
	
	for(i=0;i<2000;i++){
		for(j=0;j<2000;j++){
			B[i][j] = 1;
		}
	}
	
	for(i=0;i<2000;i++){
		for(j=0;j<2000;j++){
			C[i][j] = 1;
		}
	}
	
	// codigo no tiling
	
	for(i=0;i<2000;i+=1){
		for(k=0;k<2000;k+=1){
			for(j=0;j<2000;j+=1){
				A[i][j] += B[i][k]*C[k][j];
			}
		}
	}
	
	// codigo tiling
	for(ii=0;ii<2000;ii+=TTI){
		for(kk=0;kk<2000;kk+=TTK){
			for(jj=0;jj<2000;jj+=TTJ){
				mini = MIN(ii+TTI,2000);
				for(i=ii;i<mini;i++){
					mink = MIN(kk+TTK,2000);
					for(k=kk;k<mink;k++){
						minj = MIN(jj+TTJ,2000);
						for(j=jj;j<minj;j++){
							// indice mas externo se encuentra en la dimension contigua
							AA[i][j] += B[i][k]*C[k][j];
						}
					}
				}
			}
		}
	}
	
	printDiff(A, AA, 2000, 2000, 100, 1.0e-3f);
}
Пример #6
0
void printAll(float A[][M][maxDegree+1], float Acopy[][M][maxDegree+1], 
			  float P[][N][maxDegree+1], float Pinv[][N][maxDegree+1], 
			  float Q[][M][maxDegree+1], float Qinv[][M][maxDegree+1], 
			  float PAtest[][M][maxDegree+1], float diagTest[][N][maxDegree+1], 
			  float PPinvTest[][N][maxDegree+1], float QQinvTest[][M][maxDegree+1]) {

	printf("diag: ");
	print2ArrayM(A, N);

	printf("P: ");
	print2ArrayN(P, N);

	printf("Q: ");
	print2ArrayM(Q, M);

	printf("Pinv: ");
	print2ArrayN(Pinv, N);
	
	printf("Qinv: ");
	print2ArrayM(Qinv, M);

	matNNxmatNM(P, Acopy, PAtest);
	matNMxmatMM(PAtest, Q, diagTest);
	printf("diagTest: ");
	print2ArrayM(diagTest, N);

	matNNxmatNN(P, Pinv, PPinvTest);
	int n1, n2;
	for(n1 = 0; n1 < N; ++n1) {
		for(n2 = 0; n2 < N; ++n2) {
			clearZeroes2(PPinvTest[n1][n2]);
		}
	}
	printf("PPinvTest: ");
	print2ArrayM(PPinvTest, N);

	matNNxmatNN(Q, Qinv, QQinvTest);
	int m1, m2;
	for(m1 = 0; m1 < M; ++m1) {
		for(m2 = 0; m2 < M; ++m2) {
			clearZeroes2(QQinvTest[m1][m2]);
		}
	}

	printf("QQinvTest: ");
	print2ArrayM(QQinvTest, M);

	if (N > M) {
		printf("Given this matrix, the following are conditions that must be satisfied so this system of differential equations is consistent:\n\n");
		printDiff(P, M);
	}
	
}
Пример #7
0
        int run() {
            _sleep = getParam( "sleep" , _sleep );

            auth();
            
            BSONObj prev = getData();

            while ( true ) {
                sleepsecs( _sleep );
                
                BSONObj now;
                try {
                    now = getData();
                }
                catch ( std::exception& e ) {
                    cout << "can't get data: " << e.what() << endl;
                    continue;
                }

                if ( now.isEmpty() )
                    return -2;
                
                try {
                    printDiff( prev , now );
                }
                catch ( AssertionException& e ) {
                    cout << "\nerror: " << e.what() << "\n"
                         << now
                         << endl;
                }


                prev = now;
            }

            return 0;
        }
Пример #8
0
////////////////////////////////////////////////////////////////////////////////
//! Run a simple test for 
////////////////////////////////////////////////////////////////////////////////
int runTest(int argc, const char** argv)
{
    cl_platform_id cpPlatform = NULL;
    cl_uint ciDeviceCount = 0;
    cl_device_id *cdDevices = NULL;
    cl_int ciErrNum = CL_SUCCESS;

    //Get the NVIDIA platform
    ciErrNum = oclGetPlatformID(&cpPlatform);
    if (ciErrNum != CL_SUCCESS)
    {
        shrLog("Error: Failed to create OpenCL context!\n");
        return ciErrNum;
    }

    //Get the devices
    ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 0, NULL, &ciDeviceCount);
    cdDevices = (cl_device_id *)malloc(ciDeviceCount * sizeof(cl_device_id) );
    ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, ciDeviceCount, cdDevices, NULL);
    if (ciErrNum != CL_SUCCESS)
    {
        shrLog("Error: Failed to create OpenCL context!\n");
        return ciErrNum;
    }

    //Create the context
    cxGPUContext = clCreateContext(0, ciDeviceCount, cdDevices, NULL, NULL, &ciErrNum);
    if (ciErrNum != CL_SUCCESS)
    {
        shrLog("Error: Failed to create OpenCL context!\n");
        return ciErrNum;
    }

    if(shrCheckCmdLineFlag(argc, (const char**)argv, "device"))
    {
        // User specified GPUs
        char* deviceList;
        char* deviceStr;
        char* next_token;
        shrGetCmdLineArgumentstr(argc, (const char**)argv, "device", &deviceList);

        #ifdef WIN32
            deviceStr = strtok_s (deviceList," ,.-", &next_token);
        #else
            deviceStr = strtok (deviceList," ,.-");
        #endif   
        ciDeviceCount = 0;
        while(deviceStr != NULL) 
        {
            // get and print the device for this queue
            cl_device_id device = oclGetDev(cxGPUContext, atoi(deviceStr));
			if( device == (cl_device_id) -1  ) {
				shrLog(" Device %s does not exist!\n", deviceStr);
				return -1;
			}
			
			shrLog("Device %s: ", deviceStr);
            oclPrintDevName(LOGBOTH, device);            
            shrLog("\n");
           
            // create command queue
            commandQueue[ciDeviceCount] = clCreateCommandQueue(cxGPUContext, device, CL_QUEUE_PROFILING_ENABLE, &ciErrNum);
            if (ciErrNum != CL_SUCCESS)
            {
                shrLog(" Error %i in clCreateCommandQueue call !!!\n\n", ciErrNum);
                return ciErrNum;
            }
                
            ++ciDeviceCount;

            #ifdef WIN32
                deviceStr = strtok_s (NULL," ,.-", &next_token);
            #else            
                deviceStr = strtok (NULL," ,.-");
            #endif
        }

        free(deviceList);
    } 
    else 
    {
        // Find out how many GPU's to compute on all available GPUs
	    size_t nDeviceBytes;
	    ciErrNum |= clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, 0, NULL, &nDeviceBytes);
	    ciDeviceCount = (cl_uint)nDeviceBytes/sizeof(cl_device_id);

        if (ciErrNum != CL_SUCCESS)
        {
            shrLog(" Error %i in clGetDeviceIDs call !!!\n\n", ciErrNum);
            return ciErrNum;
        }
        else if (ciDeviceCount == 0)
        {
            shrLog(" There are no devices supporting OpenCL (return code %i)\n\n", ciErrNum);
            return -1;
        } 

        // create command-queues
        for(unsigned int i = 0; i < ciDeviceCount; ++i) 
        {
            // get and print the device for this queue
            cl_device_id device = oclGetDev(cxGPUContext, i);
            shrLog("Device %d: ", i);
            oclPrintDevName(LOGBOTH, device);            
            shrLog("\n");

            // create command queue
            commandQueue[i] = clCreateCommandQueue(cxGPUContext, device, CL_QUEUE_PROFILING_ENABLE, &ciErrNum);
            if (ciErrNum != CL_SUCCESS)
            {
                shrLog(" Error %i in clCreateCommandQueue call !!!\n\n", ciErrNum);
                return ciErrNum;
            }
        }
    }

    // Optional Command-line multiplier for matrix sizes
    shrGetCmdLineArgumenti(argc, (const char**)argv, "sizemult", &iSizeMultiple); 
    iSizeMultiple = CLAMP(iSizeMultiple, 1, 10);
    uiWA = WA * iSizeMultiple;
    uiHA = HA * iSizeMultiple;
    uiWB = WB * iSizeMultiple;
    uiHB = HB * iSizeMultiple;
    uiWC = WC * iSizeMultiple;
    uiHC = HC * iSizeMultiple;
    shrLog("\nUsing Matrix Sizes: A(%u x %u), B(%u x %u), C(%u x %u)\n", 
            uiWA, uiHA, uiWB, uiHB, uiWC, uiHC);

    // allocate host memory for matrices A and B
    unsigned int size_A = uiWA * uiHA;
    unsigned int mem_size_A = sizeof(float) * size_A;
    float* h_A_data = (float*)malloc(mem_size_A);
    unsigned int size_B = uiWB * uiHB;
    unsigned int mem_size_B = sizeof(float) * size_B;
    float* h_B_data = (float*)malloc(mem_size_B);

    // initialize host memory
    srand(2006);
    shrFillArray(h_A_data, size_A);
    shrFillArray(h_B_data, size_B);

    // allocate host memory for result
    unsigned int size_C = uiWC * uiHC;
    unsigned int mem_size_C = sizeof(float) * size_C;
    float* h_C = (float*) malloc(mem_size_C);

    // create OpenCL buffer pointing to the host memory
    cl_mem h_A = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
				    mem_size_A, h_A_data, &ciErrNum);
    if (ciErrNum != CL_SUCCESS)
    {
        shrLog("Error: clCreateBuffer\n");
        return ciErrNum;
    }

    // Program Setup
    size_t program_length;
    const char* header_path = shrFindFilePath("matrixMul.h", argv[0]);
    oclCheckError(header_path != NULL, shrTRUE);
    char* header = oclLoadProgSource(header_path, "", &program_length);
    if(!header)
    {
        shrLog("Error: Failed to load the header %s!\n", header_path);
        return -1000;
    }
    const char* source_path = shrFindFilePath("matrixMul.cl", argv[0]);
    oclCheckError(source_path != NULL, shrTRUE);
    char *source = oclLoadProgSource(source_path, header, &program_length);
    if(!source)
    {
        shrLog("Error: Failed to load compute program %s!\n", source_path);
        return -2000;
    }

    // create the program
    cl_program cpProgram = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&source, 
                                                    &program_length, &ciErrNum);
    if (ciErrNum != CL_SUCCESS)
    {
        shrLog("Error: Failed to create program\n");
        return ciErrNum;
    }
    free(header);
    free(source);
    
    // build the program
    ciErrNum = clBuildProgram(cpProgram, 0, NULL, "-cl-fast-relaxed-math", NULL, NULL);
    if (ciErrNum != CL_SUCCESS)
    {
        // write out standard error, Build Log and PTX, then return error
        shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR);
        oclLogBuildInfo(cpProgram, oclGetFirstDev(cxGPUContext));
        oclLogPtx(cpProgram, oclGetFirstDev(cxGPUContext), "oclMatrixMul.ptx");
        return ciErrNum;
    }

    // write out PTX if requested on the command line
    if(shrCheckCmdLineFlag(argc, argv, "dump-ptx") )
    {
        oclLogPtx(cpProgram, oclGetFirstDev(cxGPUContext), "oclMatrixMul.ptx");
    }

    // Create Kernel
    for(unsigned int i = 0; i < ciDeviceCount; ++i) {
        multiplicationKernel[i] = clCreateKernel(cpProgram, "matrixMul", &ciErrNum);
        if (ciErrNum != CL_SUCCESS)
        {
            shrLog("Error: Failed to create kernel\n");
            return ciErrNum;
        }
    }
        
    // Run multiplication on 1..deviceCount GPUs to compare improvement
    shrLog("\nRunning Computations on 1 - %d GPU's...\n\n", ciDeviceCount);
    for(unsigned int k = 1; k <= ciDeviceCount; ++k) 
    {
        matrixMulGPU(k, h_A, h_B_data, mem_size_B, h_C);
    }

    // compute reference solution
    shrLog("Comparing results with CPU computation... \n\n");
    float* reference = (float*) malloc(mem_size_C);
    computeGold(reference, h_A_data, h_B_data, uiHA, uiWA, uiWB);

    // check result
    shrBOOL res = shrCompareL2fe(reference, h_C, size_C, 1.0e-6f);
    if (res != shrTRUE) 
    {
        printDiff(reference, h_C, uiWC, uiHC, 100, 1.0e-5f);
    }

    // clean up OCL resources
    ciErrNum = clReleaseMemObject(h_A);
    for(unsigned int k = 0; k < ciDeviceCount; ++k) 
    {
        ciErrNum |= clReleaseKernel( multiplicationKernel[k] );
        ciErrNum |= clReleaseCommandQueue( commandQueue[k] );
    }
    ciErrNum |= clReleaseProgram(cpProgram);
    ciErrNum |= clReleaseContext(cxGPUContext);
    if(ciErrNum != CL_SUCCESS)
    {
        shrLog("Error: Failure releasing OpenCL resources: %d\n", ciErrNum);
        return ciErrNum;
    }

    // clean up memory
    free(h_A_data);
    free(h_B_data);
    free(h_C);
    free(reference);
    
    return ((shrTRUE == res) ? CL_SUCCESS : -3000);
}
Пример #9
0
int main(int argc, char** argv)
{
  printDiff();
  return 0;
}
////////////////////////////////////////////////////////////////////////////////
//! Run a simple test matrix multiply using CUBLAS
////////////////////////////////////////////////////////////////////////////////
int matrixMultiply(int argc, char **argv, int devID, sMatrixSize &matrix_size)
{
    cudaDeviceProp deviceProp;

    checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID));

    // use a larger block size for Fermi and above
    int block_size = (deviceProp.major < 2) ? 16 : 32;

    // set seed for rand()
    srand(2006);

    // allocate host memory for matrices A and B
    unsigned int size_A = matrix_size.uiWA * matrix_size.uiHA;
    unsigned int mem_size_A = sizeof(float) * size_A;
    float *h_A = (float *)malloc(mem_size_A);
    unsigned int size_B = matrix_size.uiWB * matrix_size.uiHB;
    unsigned int mem_size_B = sizeof(float) * size_B;
    float *h_B = (float *)malloc(mem_size_B);

    // set seed for rand()
    srand(2006);

    // initialize host memory
    randomInit(h_A, size_A);
    randomInit(h_B, size_B);

    // allocate device memory
    float *d_A, *d_B, *d_C;
    unsigned int size_C = matrix_size.uiWC * matrix_size.uiHC;
    unsigned int mem_size_C = sizeof(float) * size_C;

    // allocate host memory for the result
    float *h_C      = (float *) malloc(mem_size_C);
    float *h_CUBLAS = (float *) malloc(mem_size_C);

    checkCudaErrors(cudaMalloc((void **) &d_A, mem_size_A));
    checkCudaErrors(cudaMalloc((void **) &d_B, mem_size_B));
    checkCudaErrors(cudaMemcpy(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice));
    checkCudaErrors(cudaMemcpy(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice));
    checkCudaErrors(cudaMalloc((void **) &d_C, mem_size_C));

    // setup execution parameters
    dim3 threads(block_size, block_size);
    dim3 grid(matrix_size.uiWC / threads.x, matrix_size.uiHC / threads.y);

    // create and start timer
    printf("Computing result using CUBLAS...");

    // execute the kernel
    int nIter = 30;

    // CUBLAS version 2.0
    {
        const float alpha = 1.0f;
        const float beta  = 0.0f;
        cublasHandle_t handle;
        cudaEvent_t start, stop;

        checkCudaErrors(cublasCreate(&handle));

        //Perform warmup operation with cublas
        checkCudaErrors(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, matrix_size.uiWB, matrix_size.uiHA, matrix_size.uiWA, &alpha, d_B, matrix_size.uiWB, d_A, matrix_size.uiWA, &beta, d_C, matrix_size.uiWA));

        // Allocate CUDA events that we'll use for timing
        checkCudaErrors(cudaEventCreate(&start));
        checkCudaErrors(cudaEventCreate(&stop));

        // Record the start event
        checkCudaErrors(cudaEventRecord(start, NULL));

        for (int j = 0; j < nIter; j++)
        {
            //note cublas is column primary!
            //need to transpose the order
            checkCudaErrors(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, matrix_size.uiWB, matrix_size.uiHA, matrix_size.uiWA, &alpha, d_B, matrix_size.uiWB, d_A, matrix_size.uiWA, &beta, d_C, matrix_size.uiWA));

        }

        printf("done.\n");

        // Record the stop event
        checkCudaErrors(cudaEventRecord(stop, NULL));

        // Wait for the stop event to complete
        checkCudaErrors(cudaEventSynchronize(stop));

        float msecTotal = 0.0f;
        checkCudaErrors(cudaEventElapsedTime(&msecTotal, start, stop));

        // Compute and print the performance
        float msecPerMatrixMul = msecTotal / nIter;
        double flopsPerMatrixMul = 2.0 * (double)matrix_size.uiWA * (double)matrix_size.uiHA * (double)matrix_size.uiWB;
        double gigaFlops = (flopsPerMatrixMul * 1.0e-9f) / (msecPerMatrixMul / 1000.0f);
        printf(
            "Performance= %.2f GFlop/s, Time= %.3f msec, Size= %.0f Ops\n",
            gigaFlops,
            msecPerMatrixMul,
            flopsPerMatrixMul);

        // copy result from device to host
        checkCudaErrors(cudaMemcpy(h_CUBLAS, d_C, mem_size_C, cudaMemcpyDeviceToHost));

        // Destroy the handle
        checkCudaErrors(cublasDestroy(handle));
    }

    // compute reference solution
    printf("Computing result using host CPU...");
    float *reference = (float *)malloc(mem_size_C);
    matrixMulCPU(reference, h_A, h_B, matrix_size.uiHA, matrix_size.uiWA, matrix_size.uiWB);
    printf("done.\n");

    // check result (CUBLAS)
    bool resCUBLAS = sdkCompareL2fe(reference, h_CUBLAS, size_C, 1.0e-6f);

    if (resCUBLAS != true)
    {
        printDiff(reference, h_CUBLAS, matrix_size.uiWC, matrix_size.uiHC, 100, 1.0e-5f);
    }

    printf("Comparing CUBLAS Matrix Multiply with CPU results: %s\n", (true == resCUBLAS) ? "PASS" : "FAIL");

    printf("\nNOTE: The CUDA Samples are not meant for performance measurements. Results may vary when GPU Boost is enabled.\n");

    // clean up memory
    free(h_A);
    free(h_B);
    free(h_C);
    free(reference);
    checkCudaErrors(cudaFree(d_A));
    checkCudaErrors(cudaFree(d_B));
    checkCudaErrors(cudaFree(d_C));

    // cudaDeviceReset causes the driver to clean up all state. While
    // not mandatory in normal operation, it is good practice.  It is also
    // needed to ensure correct operation when the application is being
    // profiled. Calling cudaDeviceReset causes all profile data to be
    // flushed before the application exits
    cudaDeviceReset();

    if (resCUBLAS == true)
    {
        return EXIT_SUCCESS;    // return value = 1
    }
    else
    {
        return EXIT_FAILURE;     // return value = 0
    }
}
Пример #11
0
CUdeviceptr presum(CUdeviceptr *d_Input, uint arrayLength)
{

    uint N = 0;
    CUdeviceptr d_Output;
    struct timeval start,stop;
    gettimeofday(&start, NULL);
    initScan();
    gettimeofday(&stop, NULL);

    if(arrayLength <= MAX_SHORT_ARRAY_SIZE && arrayLength > MIN_SHORT_ARRAY_SIZE)
      {    
        for(uint i = 4; i<=MAX_SHORT_ARRAY_SIZE ; i<<=1){
          if(arrayLength <= i){
            N = i;
            break;
          }
        }
        checkCudaErrors(cudaMalloc((void **)&d_Output, N * sizeof(uint)));

        checkCudaErrors(cudaDeviceSynchronize());

        scanExclusiveShort((uint *)d_Output, (uint *)(*d_Input), N);
        //szWorkgroup = scanExclusiveShort((uint *)d_Output, (uint *)d_Input, 1, N);

        checkCudaErrors(cudaDeviceSynchronize());

    }else if(arrayLength <= MAX_LARGE_ARRAY_SIZE)
    {

      N = MAX_SHORT_ARRAY_SIZE * iDivUp(arrayLength,MAX_SHORT_ARRAY_SIZE);

      checkCudaErrors(cudaMalloc((void **)&d_Output, N * sizeof(uint)));      
      
      checkCudaErrors(cudaDeviceSynchronize());

      scanExclusiveLarge((uint *)d_Output, (uint *)(*d_Input), N);
      
      checkCudaErrors(cudaDeviceSynchronize());

    }else if(arrayLength <= MAX_LL_SIZE)
      {


        N = MAX_LARGE_ARRAY_SIZE * iDivUp(arrayLength,MAX_LARGE_ARRAY_SIZE);

        printf("N = %d\n",N);

        checkCudaErrors(cudaMalloc((void **)&d_Output, N * sizeof(uint)));      

        checkCudaErrors(cudaDeviceSynchronize());

        scanExclusiveLL((uint *)d_Output, (uint *)(*d_Input), N);
        
        checkCudaErrors(cudaDeviceSynchronize());

      }else{
      cuMemFree(d_Output);
      closeScan();

      return NULL;      
    }

    closeScan();

    cuMemFree(*d_Input);
    *d_Input = d_Output;

    
    printf("inside scan time:\n");
    printDiff(start,stop);

    return d_Output;
}
Пример #12
0
////////////////////////////////////////////////////////////////////////////////
//! Run a simple test matrix multiply using CUBLAS
////////////////////////////////////////////////////////////////////////////////
int matrixMultiply(int argc, char **argv, int devID, sMatrixSize &matrix_size)
{
    cudaDeviceProp deviceProp;
    cudaError_t error;

    error = cudaGetDeviceProperties(&deviceProp, devID);

    if (error != cudaSuccess)
    {
        printf("cudaGetDeviceProperties returned error code %d, line(%d)\n", error, __LINE__);
        exit(EXIT_FAILURE);
    }

    // use a larger block size for Fermi and above
    int block_size = (deviceProp.major < 2) ? 16 : 32;

    // set seed for rand()
    srand(2006);

    // allocate host memory for matrices A and B
    unsigned int size_A = matrix_size.uiWA * matrix_size.uiHA;
    unsigned int mem_size_A = sizeof(float) * size_A;
    float *h_A = (float *)malloc(mem_size_A);
    unsigned int size_B = matrix_size.uiWB * matrix_size.uiHB;
    unsigned int mem_size_B = sizeof(float) * size_B;
    float *h_B = (float *)malloc(mem_size_B);

    // set seed for rand()
    srand(2006);

    // initialize host memory
    randomInit(h_A, size_A);
    randomInit(h_B, size_B);

    // allocate device memory
    float *d_A, *d_B, *d_C;
    unsigned int size_C = matrix_size.uiWC * matrix_size.uiHC;
    unsigned int mem_size_C = sizeof(float) * size_C;

    // allocate host memory for the result
    float *h_C      = (float *) malloc(mem_size_C);
    float *h_CUBLAS = (float *) malloc(mem_size_C);

    error = cudaMalloc((void **) &d_A, mem_size_A);

    if (error != cudaSuccess)
    {
        printf("cudaMalloc d_A returned error code %d, line(%d)\n", error, __LINE__);
        exit(EXIT_FAILURE);
    }

    error = cudaMalloc((void **) &d_B, mem_size_B);

    if (error != cudaSuccess)
    {
        printf("cudaMalloc d_B returned error code %d, line(%d)\n", error, __LINE__);
        exit(EXIT_FAILURE);
    }

	error = cudaMalloc((void **) &d_C, mem_size_C);

	if (error != cudaSuccess)
	{
		printf("cudaMalloc d_C returned error code %d, line(%d)\n", error, __LINE__);
		exit(EXIT_FAILURE);
	}

	// create and start timer
	StopWatchInterface *timerMemIn = NULL;
	sdkCreateTimer(&timerMemIn);
	// start the timer
	sdkStartTimer(&timerMemIn);

    // copy host memory to device
    error = cudaMemcpy(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice);

    if (error != cudaSuccess)
    {
        printf("cudaMemcpy d_A h_A returned error code %d, line(%d)\n", error, __LINE__);
        exit(EXIT_FAILURE);
    }

    error = cudaMemcpy(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice);

    if (error != cudaSuccess)
    {
        printf("cudaMemcpy d_B h_B returned error code %d, line(%d)\n", error, __LINE__);
        exit(EXIT_FAILURE);
    }

	sdkStopTimer(&timerMemIn);
	printf("\nMemory H2D Transferring time: %f (ms)\n", sdkGetTimerValue(&timerMemIn));
	sdkDeleteTimer(&timerMemIn);

    // setup execution parameters
    dim3 threads(block_size, block_size);
    dim3 grid(matrix_size.uiWC / threads.x, matrix_size.uiHC / threads.y);

    // create and start timer
    printf("Computing result using CUBLAS...");

    // execute the kernel
    int nIter = 30;

    // CUBLAS version 2.0
    {
        cublasHandle_t handle;

        cublasStatus_t ret;

        ret = cublasCreate(&handle);

        if (ret != CUBLAS_STATUS_SUCCESS)
        {
            printf("cublasCreate returned error code %d, line(%d)\n", ret, __LINE__);
            exit(EXIT_FAILURE);
        }

        const float alpha = 1.0f;
        const float beta  = 0.0f;
        //Perform warmup operation with cublas
        ret = cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, matrix_size.uiWB, matrix_size.uiHA, matrix_size.uiWA, &alpha, d_B, matrix_size.uiWB, d_A, matrix_size.uiWA, &beta, d_C, matrix_size.uiWA);

        if (ret != CUBLAS_STATUS_SUCCESS)
        {
            printf("cublasSgemm returned error code %d, line(%d)\n", ret, __LINE__);
            exit(EXIT_FAILURE);
        }

        // Allocate CUDA events that we'll use for timing
        cudaEvent_t start;
        error = cudaEventCreate(&start);

        if (error != cudaSuccess)
        {
            fprintf(stderr, "Failed to create start event (error code %s)!\n", cudaGetErrorString(error));
            exit(EXIT_FAILURE);
        }

        cudaEvent_t stop;
        error = cudaEventCreate(&stop);

        if (error != cudaSuccess)
        {
            fprintf(stderr, "Failed to create stop event (error code %s)!\n", cudaGetErrorString(error));
            exit(EXIT_FAILURE);
        }

        // Record the start event
        error = cudaEventRecord(start, NULL);

        if (error != cudaSuccess)
        {
            fprintf(stderr, "Failed to record start event (error code %s)!\n", cudaGetErrorString(error));
            exit(EXIT_FAILURE);
        }

        for (int j = 0; j < nIter; j++)
        {
            //note cublas is column primary!
            //need to transpose the order
            ret = cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, matrix_size.uiWB, matrix_size.uiHA, matrix_size.uiWA, &alpha, d_B, matrix_size.uiWB, d_A, matrix_size.uiWA, &beta, d_C, matrix_size.uiWA);

            if (ret != CUBLAS_STATUS_SUCCESS)
            {
                printf("cublasSgemm returned error code %d, line(%d)\n", ret, __LINE__);
                exit(EXIT_FAILURE);
            }
        }

        printf("done.\n");

        // Record the stop event
        error = cudaEventRecord(stop, NULL);

        if (error != cudaSuccess)
        {
            fprintf(stderr, "Failed to record stop event (error code %s)!\n", cudaGetErrorString(error));
            exit(EXIT_FAILURE);
        }

        // Wait for the stop event to complete
        error = cudaEventSynchronize(stop);

        if (error != cudaSuccess)
        {
            fprintf(stderr, "Failed to synchronize on the stop event (error code %s)!\n", cudaGetErrorString(error));
            exit(EXIT_FAILURE);
        }

        float msecTotal = 0.0f;
        error = cudaEventElapsedTime(&msecTotal, start, stop);

        if (error != cudaSuccess)
        {
            fprintf(stderr, "Failed to get time elapsed between events (error code %s)!\n", cudaGetErrorString(error));
            exit(EXIT_FAILURE);
        }

        // Compute and print the performance
        float msecPerMatrixMul = msecTotal / nIter;
        double flopsPerMatrixMul = 2.0 * (double)matrix_size.uiWA * (double)matrix_size.uiHA * (double)matrix_size.uiWB;
        double gigaFlops = (flopsPerMatrixMul * 1.0e-9f) / (msecPerMatrixMul / 1000.0f);
        printf(
            "Performance= %.2f GFlop/s, Time= %.3f msec, Size= %.0f Ops\n",
            gigaFlops,
            msecPerMatrixMul,
            flopsPerMatrixMul);

		// create and start timer
		StopWatchInterface *timerMemOut = NULL;
		sdkCreateTimer(&timerMemOut);
		// start the timer
		sdkStartTimer(&timerMemOut);

        // copy result from device to host
        error = cudaMemcpy(h_CUBLAS, d_C, mem_size_C, cudaMemcpyDeviceToHost);

		sdkStopTimer(&timerMemOut);
		printf("\Memory D2H Transferring time: %f (ms)\n", sdkGetTimerValue(&timerMemOut));
		sdkDeleteTimer(&timerMemOut);

        if (error != cudaSuccess)
        {
            printf("cudaMemcpy h_CUBLAS d_C returned error code %d, line(%d)\n", error, __LINE__);
            exit(EXIT_FAILURE);
        }

        checkError(cublasDestroy(handle), "cublasDestroy() error!\n");
    }

    // compute reference solution
    printf("Computing result using host CPU...");
    float *reference = (float *)malloc(mem_size_C);

	// create and start timer
	StopWatchInterface *timer = NULL;
	sdkCreateTimer(&timer);
	// start the timer
	sdkStartTimer(&timer);

    matrixMulCPU(reference, h_A, h_B, matrix_size.uiHA, matrix_size.uiWA, matrix_size.uiWB);

	sdkStopTimer(&timer);
	printf("\nCPU Processing time: %f (ms)\n", sdkGetTimerValue(&timer));
	sdkDeleteTimer(&timer);

    printf("done.\n");

    // check result (CUBLAS)
    bool resCUBLAS = sdkCompareL2fe(reference, h_CUBLAS, size_C, 1.0e-6f);

    if (resCUBLAS != true)
    {
        printDiff(reference, h_CUBLAS, matrix_size.uiWC, matrix_size.uiHC, 100, 1.0e-5f);
    }

    printf("Comparing CUBLAS Matrix Multiply with CPU results: %s\n", (true == resCUBLAS) ? "PASS" : "FAIL");

    // clean up memory
    free(h_A);
    free(h_B);
    free(h_C);
    free(reference);
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);

    cudaDeviceReset();

    if (resCUBLAS == true)
    {
        return EXIT_SUCCESS;    // return value = 1
    }
    else
    {
        return EXIT_FAILURE;     // return value = 0
    }
}
Пример #13
0
int main(int argc, const char** argv) {
	cl_uint platform_count;
	cl_platform_id platforms[5];

	cl_int err = CL_SUCCESS;
	unsigned int i, p;

	cl_device_type dev_type = CL_DEVICE_TYPE_ALL;

	void * ptrs[BLOCKS];
	cl_command_queue cqs[BLOCKS];
	cl_mem d_A[BLOCKS];
	cl_mem d_C[BLOCKS];
	cl_mem d_B[BLOCKS];

	cl_event GPUDone[BLOCKS];
	cl_event GPUExecution[BLOCKS];
	struct timeval start, end;

	int workOffset[BLOCKS];
	int workSize[BLOCKS];

	unsigned int sizePerGPU = HC / BLOCKS;
	unsigned int sizeMod = HC % BLOCKS;

	size_t A_size = WA * HA;
	size_t A_mem_size = sizeof(TYPE) * A_size;
	TYPE* A_data;

	size_t B_size = WB * HB;
	size_t B_mem_size = sizeof(TYPE) * B_size;
	TYPE* B_data;

	size_t C_size = WC * HC;
	size_t C_mem_size = sizeof(TYPE) * C_size;
	TYPE* C_data;

	parse_args(argc, argv);

	check(clGetPlatformIDs(5, platforms, &platform_count));
	if (platform_count == 0) {
		printf("No platform found\n");
		exit(77);
	}

	cl_uint device_count;
	cl_uint devs[platform_count];
	cl_device_id * devices[platform_count];
	cl_context ctx[platform_count];
	cl_command_queue * commandQueue[platform_count];

	device_count = 0;
	for (p=0; p<platform_count; p++) {
		cl_platform_id platform = platforms[p];

		err = clGetDeviceIDs(platform, dev_type, 0, NULL, &devs[p]);
		if (err == CL_DEVICE_NOT_FOUND) {
			devs[p] = 0;
			continue;
		}
		if (devs[p] == 0) {
		     printf("No OpenCL device found\n");
		     exit(77);
		}
		if (err != CL_SUCCESS) {
			fprintf(stderr, "OpenCL Error (%d) in clGetDeviceIDs()\n", err);
			exit(EXIT_FAILURE);
		}
		if (devs[p] == 0)
			continue;

		devices[p] = (cl_device_id*)malloc(sizeof(cl_device_id) * devs[p]);
		commandQueue[p] = (cl_command_queue*)malloc(sizeof(cl_command_queue) * devs[p]);

		check(clGetDeviceIDs(platform, dev_type, devs[p], devices[p], NULL));

		cl_context_properties properties[] = {CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0};
		check2(ctx[p] = clCreateContext(properties, devs[p], devices[p], NULL, NULL, &err));

		for(i = 0; i < devs[p]; ++i)
		{
			cl_device_id device = devices[p][i];
			char name[2048];
			name[0] = '\0';
			clGetDeviceInfo(device, CL_DEVICE_NAME, 2048, name, NULL);
			printf("Device %d: %s\n", i, name);

			check2(commandQueue[p][i] = clCreateCommandQueue(ctx[p], device, CL_QUEUE_PROFILING_ENABLE | CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &err));
		}

		device_count += devs[p];
	}

	if (device_count == 0)
		error("No device found\n");



	cl_kernel multiplicationKernel[platform_count];

	printf("\nUsing Matrix Sizes: A(%lu x %lu), B(%lu x %lu), C(%lu x %lu)\n",
			(unsigned long)WA, (unsigned long)HA, (unsigned long)WB, (unsigned long)HB, (unsigned long)WC, (unsigned long)HC);

	// allocate host memory for matrices A, B and C
	A_data = (TYPE*)malloc(A_mem_size);
	if (A_data == NULL) {
		perror("malloc");
	}

	B_data = (TYPE*)malloc(B_mem_size);
	if (B_data == NULL) {
		perror("malloc");
	}

	C_data = (TYPE*) malloc(C_mem_size);
	if (C_data == NULL) {
		perror("malloc");
	}

	cl_program program[platform_count];

	for (p=0; p<platform_count; p++) {
		if (devs[p] == 0)
			continue;

		check2(program[p] = clCreateProgramWithSource(ctx[p], 1, (const char **)&code, NULL, &err));

		check(clBuildProgram(program[p], 0, NULL, NULL, NULL, NULL));

		check2(multiplicationKernel[p] = clCreateKernel(program[p], "sgemmNN", &err));
	}

	printf("Initializing data...\n");
	srand(2008);
	fillArray(A_data, A_size);
	fillArray(B_data, B_size);
	memset(C_data, 0, C_size);


	printf("Computing...\n");
	workOffset[0] = 0;
	gettimeofday(&start, NULL);

	size_t localWorkSize[] = {BLOCK_SIZE, BLOCK_SIZE};
	int c = 0;
	for (p=0; p<platform_count;p++) {
		for (i=0; i<devs[p]; i++) {
			check2(d_B[c] = clCreateBuffer(ctx[p], CL_MEM_READ_ONLY  | CL_MEM_USE_HOST_PTR, HB * WB * sizeof(TYPE), B_data, &err));
			c++;
		}
	}

	for(i=0; i < BLOCKS; ++i)
	{
		int d = i % device_count;
		cl_uint platform = 0;

		// determine device platform
		int dev = d;
		for (platform = 0; platform < platform_count; platform++) {
			if ((cl_int)(dev - devs[platform]) < 0)
				break;
			dev -= devs[platform];
		}

		workSize[i] = (i < sizeMod) ? sizePerGPU+1 : sizePerGPU;

		check2(d_A[i] = clCreateBuffer(ctx[platform], CL_MEM_READ_ONLY  | CL_MEM_USE_HOST_PTR, workSize[i] * WA * sizeof(TYPE), &A_data[workOffset[i] * WA], &err));
		check2(d_C[i] = clCreateBuffer(ctx[platform], CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, workSize[i] * WC * sizeof(TYPE), &C_data[workOffset[i] * WC], &err));

		check(clSetKernelArg(multiplicationKernel[platform], 0, sizeof(cl_int), &workSize[i]));
		check(clSetKernelArg(multiplicationKernel[platform], 1, sizeof(cl_int), &workSize[i]));
		check(clSetKernelArg(multiplicationKernel[platform], 2, sizeof(cl_int), &workSize[i]));
		check(clSetKernelArg(multiplicationKernel[platform], 3, sizeof(cl_mem), (void *) &d_A[i]));
		check(clSetKernelArg(multiplicationKernel[platform], 4, sizeof(cl_mem), (void *) &d_B[d]));
		check(clSetKernelArg(multiplicationKernel[platform], 5, sizeof(cl_mem), (void *) &d_C[i]));

		size_t globalWorkSize[] = {roundUp(BLOCK_SIZE,WC), roundUp(BLOCK_SIZE,workSize[i])};

		check(clEnqueueNDRangeKernel(commandQueue[platform][dev], multiplicationKernel[platform], 2, NULL, globalWorkSize, localWorkSize, 0, NULL, &GPUExecution[i]));

		// Non-blocking copy of result from device to host
		cqs[i] = commandQueue[platform][dev];
		check2(ptrs[i] = clEnqueueMapBuffer(cqs[i], d_C[i], CL_FALSE, CL_MAP_READ, 0, WC * sizeof(TYPE) * workSize[i], 1, &GPUExecution[i], &GPUDone[i], &err));

		if(i+1 < BLOCKS)
			workOffset[i + 1] = workOffset[i] + workSize[i];
	}


	// CPU sync with GPU
	for (p=0; p<platform_count;p++) {
		cl_uint dev;
		for (dev=0; dev<devs[p]; dev++) {
			clFinish(commandQueue[p][dev]);
		}
	}

	gettimeofday(&end, NULL);
	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));

	double dSeconds = timing/1000/1000;
	double dNumOps = 2.0 * (double)WA * (double)HA * (double)WB;
	double gflops = 1.0e-9 * dNumOps/dSeconds;

	printf("Throughput = %.4f GFlops/s, Time = %.5f s, Size = %.0f, NumDevsUsed = %d, Blocks = %ld, Workgroup = %zu\n",
			gflops, dSeconds, dNumOps, device_count, BLOCKS, localWorkSize[0] * localWorkSize[1]);

	// compute reference solution
	if (check) {
		printf("Comparing results with CPU computation... ");
		TYPE* reference = (TYPE*)malloc(C_mem_size);
		computeReference(reference, A_data, B_data, HA, WA, WB);

		// check result
		int res = shrCompareL2fe(reference, C_data, C_size, 1.0e-6f);
		if (res == 0) {
			printf("\n\n");
			printDiff(reference, C_data, WC, HC, 100, 1.0e-5f);
		}
		else printf("PASSED\n\n");
		free(reference);
	}

	for(i = 0; i < BLOCKS; i++)
	{
		clEnqueueUnmapMemObject(cqs[i], d_C[i], ptrs[i], 0, NULL, NULL);
	}

	for(i = 0; i < BLOCKS; i++)
	{
		clFinish(cqs[i]);
	}

	for (i=0; i<device_count; i++) {
		clReleaseMemObject(d_B[i]);
	}

	for(i = 0; i < BLOCKS; i++)
	{
		clReleaseMemObject(d_A[i]);
		clReleaseMemObject(d_C[i]);
		clReleaseEvent(GPUExecution[i]);
		clReleaseEvent(GPUDone[i]);
	}


	for (p=0; p<platform_count;p++) {
		if (devs[p] == 0)
			continue;

		check(clReleaseKernel(multiplicationKernel[p]));
		check(clReleaseProgram(program[p]));
		check(clReleaseContext(ctx[p]));
		cl_uint k;
		for(k = 0; k < devs[p]; ++k)
		{
			check(clReleaseCommandQueue(commandQueue[p][k]));
		}
	}

	free(A_data);
	free(B_data);
	free(C_data);

	return 0;
}