コード例 #1
0
ファイル: gemm.cpp プロジェクト: xlpiao/code
void *computeInDevice(void *arg)
{
    cl_device_id clDeviceID = (cl_device_id) arg;
    cl_uint numDevices;
    cl_uint numPlatforms;
    cl_context clContext;
    cl_kernel clKernel;
    cl_command_queue clCommandQueue;
    cl_program clProgram;

    cl_int err;

    cl_mem AObj;
    cl_mem BObj;
    cl_mem CObj;

    FILE *fp;
    char *kernelSrc;
    size_t kernelSrcSize;
    double timeStart, timeEnd;

    int n = N;
    float alpha = ALPHA;
    float beta = BETA;
    size_t localWorkSize[2], globalWorkSize[2];

    fp = fopen("gemm.cl", "r");

    kernelSrc = (char*)malloc(KERNEL_SRC_SIZE);
    kernelSrcSize = fread(kernelSrc, 1, KERNEL_SRC_SIZE, fp);
    fclose(fp);

    // err = clGetDeviceIDs(clPlatformID, CL_DEVICE_TYPE_CPU, 1, &clDeviceID, &numDevices);
    // if(err != CL_SUCCESS) printf("Error: clGetDeviceIDs\n");

    clContext = clCreateContext(NULL, 1, &clDeviceID, NULL, NULL, &err);
    if(err != CL_SUCCESS) printf("Error: clCreateContext\n");

    clCommandQueue = clCreateCommandQueue(clContext, clDeviceID, 0, &err);
    if(err != CL_SUCCESS) printf("Error: clCreateCommandQueue\n");

    AObj = clCreateBuffer(clContext, CL_MEM_READ_ONLY, sizeof(float) * N * N, NULL, NULL);
    BObj = clCreateBuffer(clContext, CL_MEM_READ_ONLY, sizeof(float) * N * N, NULL, NULL);
    CObj = clCreateBuffer(clContext, CL_MEM_READ_WRITE, sizeof(float) * N * N, NULL, NULL);
    if(!AObj || !BObj || !CObj) printf("Error: clCreateBuffer\n");

    err = clEnqueueWriteBuffer(clCommandQueue, AObj, CL_TRUE, 0, sizeof(float) * N * N, A, 0, NULL, NULL);
    err = clEnqueueWriteBuffer(clCommandQueue, BObj, CL_TRUE, 0, sizeof(float) * N * N, B, 0, NULL, NULL);
    err = clEnqueueWriteBuffer(clCommandQueue, CObj, CL_TRUE, 0, sizeof(float) * N * N, C, 0, NULL, NULL);
    if(err != CL_SUCCESS)printf("Error: clEnqueueWriteBuffer\n");

    clProgram = clCreateProgramWithSource(clContext, 1, (const char **)&kernelSrc, (const size_t *)&kernelSrcSize, &err);
    if(err != CL_SUCCESS) printf("Error: clCreateProgramWithSource\n");

    err = clBuildProgram(clProgram, 1, &clDeviceID, NULL, NULL, NULL);
    if(err != CL_SUCCESS) printf("Error: clBuildProgram\n");

    clKernel = clCreateKernel(clProgram, "gemm", &err);
    if(err != CL_SUCCESS) printf("Error: clCreateKernel\n");

    localWorkSize[0] = LOCAL_WORK_GROUP_X;
    localWorkSize[1] = LOCAL_WORK_GROUP_Y;
    globalWorkSize[0] = (size_t)ceil(((float)N) / ((float)LOCAL_WORK_GROUP_X)) * LOCAL_WORK_GROUP_X;
    globalWorkSize[1] = (size_t)ceil(((float)N) / ((float)LOCAL_WORK_GROUP_Y)) * LOCAL_WORK_GROUP_Y;
    printf("localWorkSize: %zu  %zu\n",localWorkSize[0], localWorkSize[1]);
    printf("globalWorkSize: %zu  %zu\n",globalWorkSize[0], globalWorkSize[1]);

    err =  clSetKernelArg(clKernel, 0, sizeof(cl_mem), (void *)&AObj);
    err |= clSetKernelArg(clKernel, 1, sizeof(cl_mem), (void *)&BObj);
    err |= clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&CObj);
    err |= clSetKernelArg(clKernel, 3, sizeof(float), (void *)&alpha);
    err |= clSetKernelArg(clKernel, 4, sizeof(float), (void *)&beta);
    err |= clSetKernelArg(clKernel, 5, sizeof(int), (void *)&n);

    if(err != CL_SUCCESS) printf("Error: clSetKernelArg\n");

    timeStart = getTime();

    err = clEnqueueNDRangeKernel(clCommandQueue, clKernel, 2, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL);
    if(err != CL_SUCCESS) printf("Error: clEnqueueNDRangeKernel\n");
    // printf("%s\n", clGetErrMsg(err));
    clFinish(clCommandQueue);

    float* deviceOutput;
    deviceOutput = (float*)malloc(N*N*sizeof(float));
    err = clEnqueueReadBuffer(clCommandQueue, CObj, CL_TRUE, 0, N*N*sizeof(float), deviceOutput, 0, NULL, NULL);
    if(err != CL_SUCCESS) printf("Error: clEnqueueReadBuffer\n");

    timeEnd = getTime();
    printf("Exe. Time: \t%0.2fs\n", timeEnd - timeStart);

    // printArray(deviceOutput);

    correctnessCheck(deviceOutput, seqOutput);
    free(deviceOutput);

    err = clFlush(clCommandQueue);
    err = clReleaseKernel(clKernel);
    err = clReleaseProgram(clProgram);
    err = clReleaseMemObject(AObj);
    err = clReleaseMemObject(BObj);
    err = clReleaseMemObject(CObj);
    err = clReleaseCommandQueue(clCommandQueue);
    err = clReleaseContext(clContext);
    if(err != CL_SUCCESS) printf("Error: clRelease\n");

    return NULL;
}
コード例 #2
0
ファイル: reordering_test.cpp プロジェクト: ttnghia/SpMP
int main(int argc, char **argv)
{
  if (argc < 2) {
    fprintf(stderr, "Usage: reordering_test matrix_in_matrix_market_format\n");
    return -1;
  }

  CSR *A = new CSR(argv[1], 0, true /* force-symmetric */);
  int nnz = A->getNnz();
  double flops = 2*nnz;
  double bytes = (sizeof(double) + sizeof(int))*nnz + sizeof(double)*(A->m + A->n);

  printf("original bandwidth %d\n", A->getBandwidth());

  double *x = MALLOC(double, A->m);
  double *y = MALLOC(double, A->m);

  // allocate a large buffer to flush out cache
  bufToFlushLlc = (double *)_mm_malloc(LLC_CAPACITY, 64);

  const int REPEAT = 128;
  double times[REPEAT];

  for (int i = 0; i < REPEAT; ++i) {
    flushLlc();

    double t = omp_get_wtime();
    A->multiplyWithVector(y, x);
    times[i] = omp_get_wtime() - t;
  }
  correctnessCheck(A, y);

  printf("SpMV BW");
  printEfficiency(times, REPEAT, flops, bytes);

#ifdef MKL
  for (int i = 0; i < REPEAT; ++i) {
    flushLlc();

    double t = omp_get_wtime();
    mkl_cspblas_dcsrgemv(
      "N", &A->m, A->values, A->rowptr, A->colidx, x, y);
    times[i] = omp_get_wtime() - t;
  }
  correctnessCheck(A, y);

  printf("MKL SpMV BW");
  printEfficiency(times, REPEAT, flops, bytes);
#endif

  int *perm = MALLOC(int, A->m);
  int *inversePerm = MALLOC(int, A->m);

  for (int o = BFS; o <= RCM; ++o) {
    Option option = (Option)o;

    switch (option) {
    case BFS:
      printf("\nBFS reordering\n");
      break;
    case RCM_WO_SOURCE_SELECTION:
      printf("\nRCM reordering w/o source selection heuristic\n");
      break;
    case RCM:
      printf("\nRCM reordering\n");
      break;
    default: assert(false); break;
    }

    double t = -omp_get_wtime();
    switch (option) {
    case BFS:
      A->getBFSPermutation(perm, inversePerm);
      break;
    case RCM_WO_SOURCE_SELECTION:
      A->getRCMPermutation(perm, inversePerm, false);
      break;
    case RCM:
      A->getRCMPermutation(perm, inversePerm);
      break;
    default: assert(false); break;
    }
    t += omp_get_wtime();

    printf(
      "Constructing permutation takes %g (%.2f gbps)\n",
      t, nnz*4/t/1e9);

    isPerm(perm, A->m);
    isPerm(inversePerm, A->m);

    t = -omp_get_wtime();
    CSR *APerm = A->permute(perm, inversePerm);
    t += omp_get_wtime();

    printf("Permute takes %g (%.2f gbps)\n", t, bytes/t/1e9);
    printf("Permuted bandwidth %d\n", APerm->getBandwidth());

    for (int i = 0; i < REPEAT; ++i) {
      flushLlc();

      t = omp_get_wtime();
      APerm->multiplyWithVector(y, x);
      times[i] = omp_get_wtime() - t;
    }
    printf("SpMV BW");
    printEfficiency(times, REPEAT, flops, bytes);

    delete APerm;
  }

  FREE(x);
  FREE(y);

  delete A;
}