void *computeInDevice(void *arg) { cl_device_id clDeviceID = (cl_device_id) arg; cl_uint numDevices; cl_uint numPlatforms; cl_context clContext; cl_kernel clKernel; cl_command_queue clCommandQueue; cl_program clProgram; cl_int err; cl_mem AObj; cl_mem BObj; cl_mem CObj; FILE *fp; char *kernelSrc; size_t kernelSrcSize; double timeStart, timeEnd; int n = N; float alpha = ALPHA; float beta = BETA; size_t localWorkSize[2], globalWorkSize[2]; fp = fopen("gemm.cl", "r"); kernelSrc = (char*)malloc(KERNEL_SRC_SIZE); kernelSrcSize = fread(kernelSrc, 1, KERNEL_SRC_SIZE, fp); fclose(fp); // err = clGetDeviceIDs(clPlatformID, CL_DEVICE_TYPE_CPU, 1, &clDeviceID, &numDevices); // if(err != CL_SUCCESS) printf("Error: clGetDeviceIDs\n"); clContext = clCreateContext(NULL, 1, &clDeviceID, NULL, NULL, &err); if(err != CL_SUCCESS) printf("Error: clCreateContext\n"); clCommandQueue = clCreateCommandQueue(clContext, clDeviceID, 0, &err); if(err != CL_SUCCESS) printf("Error: clCreateCommandQueue\n"); AObj = clCreateBuffer(clContext, CL_MEM_READ_ONLY, sizeof(float) * N * N, NULL, NULL); BObj = clCreateBuffer(clContext, CL_MEM_READ_ONLY, sizeof(float) * N * N, NULL, NULL); CObj = clCreateBuffer(clContext, CL_MEM_READ_WRITE, sizeof(float) * N * N, NULL, NULL); if(!AObj || !BObj || !CObj) printf("Error: clCreateBuffer\n"); err = clEnqueueWriteBuffer(clCommandQueue, AObj, CL_TRUE, 0, sizeof(float) * N * N, A, 0, NULL, NULL); err = clEnqueueWriteBuffer(clCommandQueue, BObj, CL_TRUE, 0, sizeof(float) * N * N, B, 0, NULL, NULL); err = clEnqueueWriteBuffer(clCommandQueue, CObj, CL_TRUE, 0, sizeof(float) * N * N, C, 0, NULL, NULL); if(err != CL_SUCCESS)printf("Error: clEnqueueWriteBuffer\n"); clProgram = clCreateProgramWithSource(clContext, 1, (const char **)&kernelSrc, (const size_t *)&kernelSrcSize, &err); if(err != CL_SUCCESS) printf("Error: clCreateProgramWithSource\n"); err = clBuildProgram(clProgram, 1, &clDeviceID, NULL, NULL, NULL); if(err != CL_SUCCESS) printf("Error: clBuildProgram\n"); clKernel = clCreateKernel(clProgram, "gemm", &err); if(err != CL_SUCCESS) printf("Error: clCreateKernel\n"); localWorkSize[0] = LOCAL_WORK_GROUP_X; localWorkSize[1] = LOCAL_WORK_GROUP_Y; globalWorkSize[0] = (size_t)ceil(((float)N) / ((float)LOCAL_WORK_GROUP_X)) * LOCAL_WORK_GROUP_X; globalWorkSize[1] = (size_t)ceil(((float)N) / ((float)LOCAL_WORK_GROUP_Y)) * LOCAL_WORK_GROUP_Y; printf("localWorkSize: %zu %zu\n",localWorkSize[0], localWorkSize[1]); printf("globalWorkSize: %zu %zu\n",globalWorkSize[0], globalWorkSize[1]); err = clSetKernelArg(clKernel, 0, sizeof(cl_mem), (void *)&AObj); err |= clSetKernelArg(clKernel, 1, sizeof(cl_mem), (void *)&BObj); err |= clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&CObj); err |= clSetKernelArg(clKernel, 3, sizeof(float), (void *)&alpha); err |= clSetKernelArg(clKernel, 4, sizeof(float), (void *)&beta); err |= clSetKernelArg(clKernel, 5, sizeof(int), (void *)&n); if(err != CL_SUCCESS) printf("Error: clSetKernelArg\n"); timeStart = getTime(); err = clEnqueueNDRangeKernel(clCommandQueue, clKernel, 2, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL); if(err != CL_SUCCESS) printf("Error: clEnqueueNDRangeKernel\n"); // printf("%s\n", clGetErrMsg(err)); clFinish(clCommandQueue); float* deviceOutput; deviceOutput = (float*)malloc(N*N*sizeof(float)); err = clEnqueueReadBuffer(clCommandQueue, CObj, CL_TRUE, 0, N*N*sizeof(float), deviceOutput, 0, NULL, NULL); if(err != CL_SUCCESS) printf("Error: clEnqueueReadBuffer\n"); timeEnd = getTime(); printf("Exe. Time: \t%0.2fs\n", timeEnd - timeStart); // printArray(deviceOutput); correctnessCheck(deviceOutput, seqOutput); free(deviceOutput); err = clFlush(clCommandQueue); err = clReleaseKernel(clKernel); err = clReleaseProgram(clProgram); err = clReleaseMemObject(AObj); err = clReleaseMemObject(BObj); err = clReleaseMemObject(CObj); err = clReleaseCommandQueue(clCommandQueue); err = clReleaseContext(clContext); if(err != CL_SUCCESS) printf("Error: clRelease\n"); return NULL; }
int main(int argc, char **argv) { if (argc < 2) { fprintf(stderr, "Usage: reordering_test matrix_in_matrix_market_format\n"); return -1; } CSR *A = new CSR(argv[1], 0, true /* force-symmetric */); int nnz = A->getNnz(); double flops = 2*nnz; double bytes = (sizeof(double) + sizeof(int))*nnz + sizeof(double)*(A->m + A->n); printf("original bandwidth %d\n", A->getBandwidth()); double *x = MALLOC(double, A->m); double *y = MALLOC(double, A->m); // allocate a large buffer to flush out cache bufToFlushLlc = (double *)_mm_malloc(LLC_CAPACITY, 64); const int REPEAT = 128; double times[REPEAT]; for (int i = 0; i < REPEAT; ++i) { flushLlc(); double t = omp_get_wtime(); A->multiplyWithVector(y, x); times[i] = omp_get_wtime() - t; } correctnessCheck(A, y); printf("SpMV BW"); printEfficiency(times, REPEAT, flops, bytes); #ifdef MKL for (int i = 0; i < REPEAT; ++i) { flushLlc(); double t = omp_get_wtime(); mkl_cspblas_dcsrgemv( "N", &A->m, A->values, A->rowptr, A->colidx, x, y); times[i] = omp_get_wtime() - t; } correctnessCheck(A, y); printf("MKL SpMV BW"); printEfficiency(times, REPEAT, flops, bytes); #endif int *perm = MALLOC(int, A->m); int *inversePerm = MALLOC(int, A->m); for (int o = BFS; o <= RCM; ++o) { Option option = (Option)o; switch (option) { case BFS: printf("\nBFS reordering\n"); break; case RCM_WO_SOURCE_SELECTION: printf("\nRCM reordering w/o source selection heuristic\n"); break; case RCM: printf("\nRCM reordering\n"); break; default: assert(false); break; } double t = -omp_get_wtime(); switch (option) { case BFS: A->getBFSPermutation(perm, inversePerm); break; case RCM_WO_SOURCE_SELECTION: A->getRCMPermutation(perm, inversePerm, false); break; case RCM: A->getRCMPermutation(perm, inversePerm); break; default: assert(false); break; } t += omp_get_wtime(); printf( "Constructing permutation takes %g (%.2f gbps)\n", t, nnz*4/t/1e9); isPerm(perm, A->m); isPerm(inversePerm, A->m); t = -omp_get_wtime(); CSR *APerm = A->permute(perm, inversePerm); t += omp_get_wtime(); printf("Permute takes %g (%.2f gbps)\n", t, bytes/t/1e9); printf("Permuted bandwidth %d\n", APerm->getBandwidth()); for (int i = 0; i < REPEAT; ++i) { flushLlc(); t = omp_get_wtime(); APerm->multiplyWithVector(y, x); times[i] = omp_get_wtime() - t; } printf("SpMV BW"); printEfficiency(times, REPEAT, flops, bytes); delete APerm; } FREE(x); FREE(y); delete A; }