Beispiel #1
0
/*
 * Entry point of the OpenCL LU-decomposition (LUD) benchmark driver.
 *
 * Steps, in order:
 *   1. parse the command line (-i input_file, -v to verify; -s is rejected),
 *   2. load the matrix named by -i,
 *   3. pick a device, shrink BLOCK_SIZE until BLOCK_SIZE^2 fits the device's
 *      first-dimension work-item limit, and build "lud_kernel.cl",
 *   4. run the diagonal / perimeter / internal kernels block by block,
 *   5. read the matrix back, optionally verify it, release everything.
 *
 * Returns EXIT_SUCCESS on completion.  Usage errors, a missing or unreadable
 * input/kernel file and a kernel build failure end the program with
 * EXIT_FAILURE.
 */
int
main ( int argc, char *argv[] )
{
    int matrix_dim = 32; /* default matrix_dim */
    int opt, option_index=0;
    func_ret_t ret;
    const char *input_file = NULL;
    float *m, *mm;
    stopwatch sw;

    cl_device_id clDevice;
    cl_context clContext;
    cl_command_queue clCommands;
    cl_program clProgram;
    cl_kernel clKernel_diagonal;
    cl_kernel clKernel_perimeter;
    cl_kernel clKernel_internal;
    cl_int dev_type;

    cl_int errcode;

    FILE *kernelFile;
    char *kernelSource;
    size_t kernelLength;
    long kernelFileSize;

    cl_mem d_m;

    ocd_init(&argc, &argv, NULL);
    ocd_options opts = ocd_get_options();
    platform_id = opts.platform_id;
    device_id = opts.device_id;


    while ((opt = getopt_long(argc, argv, "::vs:i:",
                              long_options, &option_index)) != -1 ) {
        switch(opt) {
        case 'i':
            input_file = optarg;
            break;
        case 'v':
            do_verify = 1;
            break;
        case 's':
            /* Generating a matrix from a size is not implemented: report and quit. */
            matrix_dim = atoi(optarg);
            fprintf(stderr, "Currently not supported, use -i instead\n");
            fprintf(stderr, "Usage: %s [-v] [-s matrix_size|-i input_file|-p platform|-d device]\n", argv[0]);
            exit(EXIT_FAILURE);
        case '?':
            fprintf(stderr, "invalid option\n");
            break;
        case ':':
            fprintf(stderr, "missing argument\n");
            break;
        default:
            fprintf(stderr, "Usage: %s [-v] [-s matrix_size|-i input_file||-p platform|-d device]\n",
                    argv[0]);
            exit(EXIT_FAILURE);
        }
    }

    /* Reject stray positional arguments and the "no arguments at all" case. */
    if ( (optind < argc) || (optind == 1)) {
        fprintf(stderr, "Usage: %s [-v] [-s matrix_size|-i input_file|-p platform|-d device]\n", argv[0]);
        exit(EXIT_FAILURE);
    }

    if (input_file) {
        printf("Reading matrix from file %s\n", input_file);
        ret = create_matrix_from_file(&m, input_file, &matrix_dim);
        if (ret != RET_SUCCESS) {
            m = NULL;
            fprintf(stderr, "error create matrix from file %s\n", input_file);
            exit(EXIT_FAILURE);
        }
    } else {
        printf("No input file specified!\n");
        exit(EXIT_FAILURE);
    }

    if (do_verify) {
        printf("Before LUD\n");
        print_matrix(m, matrix_dim);
        /* Keep a copy of the input so lud_verify() can compare against it later. */
        matrix_duplicate(m, &mm, matrix_dim);
    }

#ifdef USEGPU
    dev_type = CL_DEVICE_TYPE_GPU;
#elif defined(USE_AFPGA)
    dev_type = CL_DEVICE_TYPE_ACCELERATOR;
#else
    dev_type = CL_DEVICE_TYPE_CPU;
#endif


    clDevice = GetDevice(platform_id, device_id,dev_type);
    size_t max_worksize[3];
    errcode = clGetDeviceInfo(clDevice, CL_DEVICE_MAX_WORK_ITEM_SIZES,sizeof(size_t)*3, &max_worksize, NULL);
    CHECKERR(errcode);
    /* Halve BLOCK_SIZE until BLOCK_SIZE^2 no longer exceeds max_worksize[0]. */
    while(BLOCK_SIZE*BLOCK_SIZE>max_worksize[0]) {
        BLOCK_SIZE = BLOCK_SIZE/2;
    }

    clContext = clCreateContext(NULL, 1, &clDevice, NULL, NULL, &errcode);
    CHECKERR(errcode);

    clCommands = clCreateCommandQueue(clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &errcode);
    CHECKERR(errcode);

    /*
     * Load the kernel source.  Every step is checked: previously a missing
     * "lud_kernel.cl" led straight to fseek(NULL, ...).
     */
    kernelFile = fopen("lud_kernel.cl", "r");
    if (kernelFile == NULL) {
        fprintf(stderr, "error: unable to open kernel file lud_kernel.cl\n");
        exit(EXIT_FAILURE);
    }
    kernelFileSize = -1;
    if (fseek(kernelFile, 0, SEEK_END) == 0) {
        kernelFileSize = ftell(kernelFile);
    }
    if (kernelFileSize <= 0) {
        fprintf(stderr, "error: unable to determine size of kernel file lud_kernel.cl\n");
        fclose(kernelFile);
        exit(EXIT_FAILURE);
    }
    kernelSource = (char *) malloc(sizeof(char)*(size_t) kernelFileSize);
    if (kernelSource == NULL) {
        fprintf(stderr, "error: out of memory while loading kernel file lud_kernel.cl\n");
        fclose(kernelFile);
        exit(EXIT_FAILURE);
    }
    rewind(kernelFile);
    /*
     * Read byte-wise and use the number of bytes actually delivered as the
     * source length; in text mode this can be smaller than the ftell() size.
     */
    kernelLength = fread((void *) kernelSource, 1, (size_t) kernelFileSize, kernelFile);
    fclose(kernelFile);
    if (kernelLength == 0) {
        fprintf(stderr, "error: unable to read kernel file lud_kernel.cl\n");
        free(kernelSource);
        exit(EXIT_FAILURE);
    }

    clProgram = clCreateProgramWithSource(clContext, 1, (const char **) &kernelSource, &kernelLength, &errcode);
    CHECKERR(errcode);

    free(kernelSource);
    char arg[100];
    snprintf(arg, sizeof(arg), "-D BLOCK_SIZE=%d", (int)BLOCK_SIZE);
    errcode = clBuildProgram(clProgram, 1, &clDevice, arg, NULL, NULL);
    if (errcode == CL_BUILD_PROGRAM_FAILURE)
    {
        char *log;
        size_t logLength = 0;
        errcode = clGetProgramBuildInfo(clProgram, clDevice, CL_PROGRAM_BUILD_LOG, 0, NULL, &logLength);
        /* Zero-filled and one byte larger than reported, so it is always a valid C string. */
        log = (char *) calloc(logLength + 1, sizeof(char));
        if (log == NULL) {
            fprintf(stderr, "Kernel build error! (out of memory while fetching the build log)\n");
            return EXIT_FAILURE;
        }
        errcode = clGetProgramBuildInfo(clProgram, clDevice, CL_PROGRAM_BUILD_LOG, logLength, (void *) log, NULL);
        fprintf(stderr, "Kernel build error! Log:\n%s", log);
        free(log);
        /* A failed kernel build is an error; do not report success to the caller. */
        return EXIT_FAILURE;
    }
    CHECKERR(errcode);

    clKernel_diagonal = clCreateKernel(clProgram, "lud_diagonal", &errcode);
    CHECKERR(errcode);
    clKernel_perimeter = clCreateKernel(clProgram, "lud_perimeter", &errcode);
    CHECKERR(errcode);
    clKernel_internal = clCreateKernel(clProgram, "lud_internal", &errcode);
    CHECKERR(errcode);

    d_m = clCreateBuffer(clContext, CL_MEM_READ_WRITE, matrix_dim*matrix_dim*sizeof(float), NULL, &errcode);
    CHECKERR(errcode);

    /* beginning of timing point */
    stopwatch_start(&sw);

    errcode = clEnqueueWriteBuffer(clCommands, d_m, CL_TRUE, 0, matrix_dim*matrix_dim*sizeof(float), (void *) m, 0, NULL, &ocdTempEvent);

    clFinish(clCommands);
    START_TIMER(ocdTempEvent, OCD_TIMER_H2D, "Matrix Copy", ocdTempTimer)
    END_TIMER(ocdTempTimer)
    CHECKERR(errcode);

    int i=0;
    size_t localWorkSize[2];
    size_t globalWorkSize[2];
#ifdef START_POWER
    for( int iter = 0; iter < 1000; iter++)
#endif
        /* One pass per block column/row except the last one, which is handled after the loop. */
        for (i=0; i < matrix_dim-BLOCK_SIZE; i += BLOCK_SIZE) {
            /* Diagonal kernel: a single work-group of BLOCK_SIZE work-items at offset i. */
            errcode = clSetKernelArg(clKernel_diagonal, 0, sizeof(cl_mem), (void *) &d_m);
            errcode |= clSetKernelArg(clKernel_diagonal, 1, sizeof(int), (void *) &matrix_dim);
            errcode |= clSetKernelArg(clKernel_diagonal, 2, sizeof(int), (void *) &i);
            CHECKERR(errcode);

            localWorkSize[0] = BLOCK_SIZE;
            globalWorkSize[0] = BLOCK_SIZE;

            errcode = clEnqueueNDRangeKernel(clCommands, clKernel_diagonal, 1, NULL, globalWorkSize, localWorkSize, 0, NULL, &ocdTempEvent);
            clFinish(clCommands);
            START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "Diagonal Kernels", ocdTempTimer)
            END_TIMER(ocdTempTimer)
            CHECKERR(errcode);

            /* Perimeter kernel: (matrix_dim-i)/BLOCK_SIZE-1 work-groups of 2*BLOCK_SIZE work-items. */
            errcode = clSetKernelArg(clKernel_perimeter, 0, sizeof(cl_mem), (void *) &d_m);
            errcode |= clSetKernelArg(clKernel_perimeter, 1, sizeof(int), (void *) &matrix_dim);
            errcode |= clSetKernelArg(clKernel_perimeter, 2, sizeof(int), (void *) &i);
            CHECKERR(errcode);
            localWorkSize[0] = BLOCK_SIZE*2;
            globalWorkSize[0] = ((matrix_dim-i)/BLOCK_SIZE-1)*localWorkSize[0];

            errcode = clEnqueueNDRangeKernel(clCommands, clKernel_perimeter, 1, NULL, globalWorkSize, localWorkSize, 0, NULL, &ocdTempEvent);
            clFinish(clCommands);
            START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "Perimeter Kernel", ocdTempTimer)
            CHECKERR(errcode);
            END_TIMER(ocdTempTimer)

            /* Internal kernel: 2-D grid of BLOCK_SIZE x BLOCK_SIZE work-groups over the remaining blocks. */
            errcode = clSetKernelArg(clKernel_internal, 0, sizeof(cl_mem), (void *) &d_m);
            errcode |= clSetKernelArg(clKernel_internal, 1, sizeof(int), (void *) &matrix_dim);
            errcode |= clSetKernelArg(clKernel_internal, 2, sizeof(int), (void *) &i);
            CHECKERR(errcode);
            localWorkSize[0] = BLOCK_SIZE;
            localWorkSize[1] = BLOCK_SIZE;
            globalWorkSize[0] = ((matrix_dim-i)/BLOCK_SIZE-1)*localWorkSize[0];
            globalWorkSize[1] = ((matrix_dim-i)/BLOCK_SIZE-1)*localWorkSize[1];

            errcode = clEnqueueNDRangeKernel(clCommands, clKernel_internal, 2, NULL, globalWorkSize, localWorkSize, 0, NULL, &ocdTempEvent);
            clFinish(clCommands);
            START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "Internal Kernel", ocdTempTimer)
            END_TIMER(ocdTempTimer)
            CHECKERR(errcode);
        }

    /* i now holds the offset at which the loop stopped: run the diagonal kernel once more there. */
    errcode = clSetKernelArg(clKernel_diagonal, 0, sizeof(cl_mem), (void *) &d_m);
    errcode |= clSetKernelArg(clKernel_diagonal, 1, sizeof(int), (void *) &matrix_dim);
    errcode |= clSetKernelArg(clKernel_diagonal, 2, sizeof(int), (void *) &i);
    CHECKERR(errcode);
    localWorkSize[0] = BLOCK_SIZE;
    globalWorkSize[0] = BLOCK_SIZE;

    errcode = clEnqueueNDRangeKernel(clCommands, clKernel_diagonal, 1, NULL, globalWorkSize, localWorkSize, 0, NULL, &ocdTempEvent);
    clFinish(clCommands);
    START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "Diagonal Kernels", ocdTempTimer)
    CHECKERR(errcode);

    END_TIMER(ocdTempTimer)

    errcode = clEnqueueReadBuffer(clCommands, d_m, CL_TRUE, 0, matrix_dim*matrix_dim*sizeof(float), (void *) m, 0, NULL, &ocdTempEvent);
    clFinish(clCommands);
    START_TIMER(ocdTempEvent, OCD_TIMER_D2H, "Matrix copy", ocdTempTimer)
    END_TIMER(ocdTempTimer)
    /* The read-back result was previously ignored; check it like every other enqueue. */
    CHECKERR(errcode);
    /* end of timing point */
    stopwatch_stop(&sw);
    printf("Time consumed(ms): %lf\n", 1000*get_interval_by_sec(&sw));

    clReleaseMemObject(d_m);

    if (do_verify) {
        printf("After LUD\n");
        print_matrix(m, matrix_dim);
        printf(">>>Verify<<<<\n");
        printf("matrix_dim: %d\n",matrix_dim);
        lud_verify(mm, m, matrix_dim);
        free(mm);
    }

    clReleaseKernel(clKernel_diagonal);
    clReleaseKernel(clKernel_perimeter);
    clReleaseKernel(clKernel_internal);
    clReleaseProgram(clProgram);
    clReleaseCommandQueue(clCommands);
    clReleaseContext(clContext);

    free(m);
    ocd_finalize();
    return EXIT_SUCCESS;
}				/* ----------  end of function main  ---------- */
Beispiel #2
0
/*
 * Entry point of the serial, self-checking LUD benchmark.
 *
 * Options: -v enables verification through lud_verify(), -s sets the matrix
 * dimension.  A random matrix of that dimension is generated, decomposed by
 * lud_base() under the stopwatch and, for dimension 1024 only, compared
 * against 100 stored reference values.  A one-line JSON summary is printed
 * on success.
 */
int
main ( int argc, char *argv[] )
{
    int matrix_dim = 32; /* default matrix_dim */
    int opt, option_index = 0, error = 0;
    func_ret_t ret;
    double *m, *mm;
    stopwatch sw;
    int i, j;

    while ((opt = getopt_long(argc, argv, ":vs:i:",
                              long_options, &option_index)) != -1 ) {
        if (opt == 'v') {
            do_verify = 1;
        } else if (opt == 's') {
            matrix_dim = atoi(optarg);
        } else {
            /* Anything else is an error; only '?' and ':' carry a message. */
            if (opt == '?') {
                fprintf(stderr, "invalid option\n");
            } else if (opt == ':') {
                fprintf(stderr, "missing argument\n");
            }
            error = 1;
        }
    }

    /* Bad option, no arguments at all, or leftover positional arguments. */
    if (error || (optind == 1) || (optind < argc)) {
        fprintf(stderr, "Usage: %s [-v] [-s matrix_size]\n", argv[0]);
        exit(EXIT_FAILURE);
    }

    if (matrix_dim <= 1) {
        fprintf(stderr, "No input file or valid matrix size specified!\n");
        exit(EXIT_FAILURE);
    }

    fprintf(stderr, "Generating matrix of size %d x %d\n", matrix_dim, matrix_dim);
    ret = create_matrix_from_random(&m, matrix_dim);
    if (ret != RET_SUCCESS) {
        m = NULL;
        fprintf(stderr, "error could not generate random matrix of size %d x %d!\n", matrix_dim, matrix_dim);
        exit(EXIT_FAILURE);
    }

    /* Keep an untouched copy of the input for the later verification pass. */
    if (do_verify) {
        matrix_duplicate(m, &mm, matrix_dim);
    }

    stopwatch_start(&sw);
    lud_base(m, matrix_dim);
    stopwatch_stop(&sw);

    if (matrix_dim != 1024) {
        fprintf(stderr, "WARNING: No self-checking step for dimension '%d'\n", matrix_dim);
    } else {
        for (i = 0; i < 100; ++i) {
            const double actual = m[expected_row_indices[i]*matrix_dim + expected_col_indices[i]];

            if (actual == expected_values[i]) {
                continue;
            }

            /*
             * A precision of 21 prints enough significant decimal digits to
             * obtain the same floating-point number when read back.
             */
            fprintf(stderr, "ERROR: value at index (%d,%d) = '%.*f' is different from the expected value '%.*f'\n", 
                expected_row_indices[i],
                expected_col_indices[i],
                21, actual,
                21, expected_values[i]
            );

            /* Dump all 100 sampled values before giving up. */
            fprintf(stderr, "Received values:\n");
            for (j = 0; j < 100; ++j) {
                fprintf(stderr, "%.*f, ", 21, m[expected_row_indices[j]*matrix_dim + expected_col_indices[j]]);
            }
            fprintf(stderr, "\n");
            exit(1);
        }
    }

    if (do_verify) {
        fprintf(stderr, ">>>Verify<<<<\n");
        lud_verify(mm, m, matrix_dim);
        free(mm);
    }

    free(m);
    printf("{ \"status\": %d, \"options\": \"-s %d\", \"time\": %f }\n", 1, matrix_dim, get_interval_by_sec(&sw));
    return EXIT_SUCCESS;
}				/* ----------  end of function main  ---------- */
Beispiel #3
0
/*
 * Entry point of the OpenACC LUD benchmark.
 *
 * Options: -i input_file, -s matrix_size, -v (verify), -x / -y (values
 * forwarded to lud_oacc() as grid_x / grid_y).  The matrix is read from the
 * input file when one is given, otherwise created internally with
 * matrix_dim; lud_oacc() is timed with the stopwatch.
 *
 * No check for stray positional arguments is performed.
 */
int
main ( int argc, char *argv[] )
{
  int matrix_dim = 32; /* default size */
  int opt, option_index = 0;
  func_ret_t ret;
  const char *input_file = NULL;
  float *m, *mm;
  stopwatch sw;
  int grid_x = 0;
  int grid_y = 0;

  while ((opt = getopt_long(argc, argv, "::vs:i:x:y:", 
                            long_options, &option_index)) != -1 ) {
    switch (opt) {
    case 'v':
      do_verify = 1;
      break;
    case 'i':
      input_file = optarg;
      break;
    case 's':
      matrix_dim = atoi(optarg);
      break;
    case 'x':
      grid_x = atoi(optarg);
      break;
    case 'y':
      grid_y = atoi(optarg);
      break;
    case ':':
      fprintf(stderr, "missing argument\n");
      break;
    case '?':
      fprintf(stderr, "invalid option\n");
      break;
    default:
      fprintf(stderr, "1Usage: %s [-v] [-s matrix_size|-i input_file]\n",
	      argv[0]);
      exit(EXIT_FAILURE);
    }
  }

  /* Neither a file nor a non-zero dimension: nothing to decompose. */
  if (!input_file && !matrix_dim) {
    printf("No input file specified!\n");
    exit(EXIT_FAILURE);
  }

  if (input_file) {
    ret = create_matrix_from_file(&m, input_file, &matrix_dim);
    if (ret != RET_SUCCESS) {
      m = NULL;
      fprintf(stderr, "error create matrix from file %s\n", input_file);
      exit(EXIT_FAILURE);
    }
  } else {
    ret = create_matrix(&m, matrix_dim);
    if (ret != RET_SUCCESS) {
      m = NULL;
      fprintf(stderr, "error create matrix internally size=%d\n", matrix_dim);
      exit(EXIT_FAILURE);
    }
  }

  /* Keep a copy of the input for lud_verify(). */
  if (do_verify) {
    matrix_duplicate(m, &mm, matrix_dim);
  }

  /* NOTE(review): wul() is defined elsewhere; its purpose is not visible here. */
  wul();

  stopwatch_start(&sw);
  lud_oacc(m, matrix_dim, grid_x, grid_y);
  stopwatch_stop(&sw);
  printf("Time consumed(ms): %lf\n", 1000*get_interval_by_sec(&sw));

  if (do_verify) {
    printf("After LUD\n");
    printf(">>>Verify<<<<\n");
    lud_verify(mm, m, matrix_dim); 
    free(mm);
  }

  free(m);

  return EXIT_SUCCESS;
}				/* ----------  end of function main  ---------- */
/*
 * Entry point of the OpenCL LUD benchmark (variant using local-memory
 * kernel arguments).
 *
 * Options: -i input_file, -s matrix_size, -v (verify).  The matrix is read
 * from the input file when one is given, otherwise created internally.  The
 * kernels "lud_diagonal", "lud_perimeter" and "lud_internal" are built from
 * "./lud_kernel.cl" and launched block by block; the time from the
 * host-to-device copy to the device-to-host copy is reported.
 *
 * Returns EXIT_SUCCESS on completion and -1 on any setup or OpenCL error;
 * usage errors and matrix-creation failures exit with EXIT_FAILURE.
 */
int
main ( int argc, char *argv[] )
{
	printf("WG size of kernel = %d X %d\n", BLOCK_SIZE, BLOCK_SIZE);
	int matrix_dim = 32; /* default matrix_dim */
	int opt, option_index=0;
	func_ret_t ret;
	const char *input_file = NULL;
	float *m, *mm;
	stopwatch sw;

	while ((opt = getopt_long(argc, argv, "::vs:i:", 
                            long_options, &option_index)) != -1 ) {
		switch(opt){
		case 'i':
			input_file = optarg;
			break;
		case 'v':
			do_verify = 1;
			break;
		case 's':
			matrix_dim = atoi(optarg);
			printf("Generate input matrix internally, size =%d\n", matrix_dim);
			break;
		case '?':
			fprintf(stderr, "invalid option\n");
			break;
		case ':':
			fprintf(stderr, "missing argument\n");
			break;
		default:
			fprintf(stderr, "Usage: %s [-v] [-s matrix_size|-i input_file]\n",
                  argv[0]);
			exit(EXIT_FAILURE);
		}
	}

	/* Reject stray positional arguments and the "no arguments at all" case. */
	if ( (optind < argc) || (optind == 1)) {
		fprintf(stderr, "Usage: %s [-v] [-s matrix_size|-i input_file]\n", argv[0]);
		exit(EXIT_FAILURE);
	}

	if (input_file) {
		printf("Reading matrix from file %s\n", input_file);
		ret = create_matrix_from_file(&m, input_file, &matrix_dim);
		if (ret != RET_SUCCESS) {
			m = NULL;
			fprintf(stderr, "error create matrix from file %s\n", input_file);
			exit(EXIT_FAILURE);
		}
	}
	else if (matrix_dim) {
		printf("Creating matrix internally size=%d\n", matrix_dim);
		ret = create_matrix(&m, matrix_dim);
		if (ret != RET_SUCCESS) {
			m = NULL;
			fprintf(stderr, "error create matrix internally size=%d\n", matrix_dim);
			exit(EXIT_FAILURE);
		}
	}
	else {
		printf("No input file specified!\n");
		exit(EXIT_FAILURE);
	}

	if (do_verify){
		printf("Before LUD\n");
		/* Keep a copy of the input for lud_verify(). */
		matrix_duplicate(m, &mm, matrix_dim);
	}

	/*
	 * Load the kernel source into a zero-filled buffer.  At most
	 * sourcesize-1 bytes are read so the final byte stays '\0': the buffer
	 * is handed to clCreateProgramWithSource() with NULL lengths and must
	 * therefore be NUL-terminated.
	 */
	const char *kernel_path = "./lud_kernel.cl";
	int sourcesize = 1024*1024;
	char * source = (char *)calloc(sourcesize, sizeof(char)); 
	if(!source) { printf("ERROR: calloc(%d) failed\n", sourcesize); return -1; }

	const char * kernel_lud_diag   = "lud_diagonal";
	const char * kernel_lud_peri   = "lud_perimeter";
	const char * kernel_lud_inter  = "lud_internal";
	FILE * fp = fopen(kernel_path, "rb"); 
	/* The format string previously had a %s with no matching argument. */
	if(!fp) { printf("ERROR: unable to open '%s'\n", kernel_path); free(source); return -1; }
	size_t nread = fread(source, 1, (size_t)sourcesize - 1, fp);
	/* Buffer full and more data pending means the file does not fit. */
	int truncated = (nread == (size_t)sourcesize - 1) && (fgetc(fp) != EOF);
	fclose(fp);
	if(nread == 0) { printf("ERROR: unable to read '%s'\n", kernel_path); free(source); return -1; }
	if(truncated) { printf("ERROR: '%s' does not fit in the %d byte source buffer\n", kernel_path, sourcesize); free(source); return -1; }

	// Use 1: GPU  0: CPU
	int use_gpu = 1;
	// OpenCL initialization
	if(initialize(use_gpu)) { free(source); return -1; }
	// compile kernel
	cl_int err = 0;
	const char * slist[2] = { source, 0 };
	cl_program prog = clCreateProgramWithSource(context, 1, slist, NULL, &err);
	/* The program object holds its own copy of the source text. */
	free(source);
	if(err != CL_SUCCESS) { printf("ERROR: clCreateProgramWithSource() => %d\n", err); return -1; }
	char clOptions[110] = " ";
#ifdef BLOCK_SIZE
	snprintf(clOptions + strlen(clOptions), sizeof(clOptions) - strlen(clOptions), " -DBLOCK_SIZE=%d", BLOCK_SIZE);
#endif

	err = clBuildProgram(prog, 0, NULL, clOptions, NULL, NULL);
	if(err != CL_SUCCESS) { printf("ERROR: clBuildProgram() => %d\n", err); return -1; }

	/* Check every kernel creation; previously only the last result was tested. */
	cl_kernel diagnal;
	cl_kernel perimeter;
	cl_kernel internal;
	diagnal   = clCreateKernel(prog, kernel_lud_diag, &err);  
	if(err != CL_SUCCESS) { printf("ERROR: clCreateKernel(%s) => %d\n", kernel_lud_diag, err); return -1; }
	perimeter = clCreateKernel(prog, kernel_lud_peri, &err);  
	if(err != CL_SUCCESS) { printf("ERROR: clCreateKernel(%s) => %d\n", kernel_lud_peri, err); return -1; }
	internal  = clCreateKernel(prog, kernel_lud_inter, &err);  
	if(err != CL_SUCCESS) { printf("ERROR: clCreateKernel(%s) => %d\n", kernel_lud_inter, err); return -1; }
	clReleaseProgram(prog);

	cl_mem d_m;
	d_m = clCreateBuffer(context, CL_MEM_READ_WRITE, matrix_dim*matrix_dim * sizeof(float), NULL, &err );
	if(err != CL_SUCCESS) { printf("ERROR: clCreateBuffer d_m (size:%d) => %d\n", matrix_dim*matrix_dim, err); return -1;} 

	/* beginning of timing point */
	stopwatch_start(&sw);
	err = clEnqueueWriteBuffer(cmd_queue, d_m, 1, 0, matrix_dim*matrix_dim*sizeof(float), m, 0, 0, 0);
	if(err != CL_SUCCESS) { printf("ERROR: clEnqueueWriteBuffer d_m (size:%d) => %d\n", matrix_dim*matrix_dim, err); return -1; }

	int i=0;
	/* One pass per block except the last one, which is handled after the loop. */
	for (i=0; i < matrix_dim-BLOCK_SIZE; i += BLOCK_SIZE) {

		/* Diagonal kernel: one work-group of BLOCK_SIZE work-items; arg 1 is a local buffer. */
		clSetKernelArg(diagnal, 0, sizeof(void *), (void*) &d_m);
		clSetKernelArg(diagnal, 1, sizeof(float) * BLOCK_SIZE * BLOCK_SIZE, (void*)NULL );
		clSetKernelArg(diagnal, 2, sizeof(cl_int), (void*) &matrix_dim);
		clSetKernelArg(diagnal, 3, sizeof(cl_int), (void*) &i);

		size_t global_work1[3]  = {BLOCK_SIZE, 1, 1};
		size_t local_work1[3]  = {BLOCK_SIZE, 1, 1};

		err = clEnqueueNDRangeKernel(cmd_queue, diagnal, 2, NULL, global_work1, local_work1, 0, 0, 0);
		if(err != CL_SUCCESS) { printf("ERROR:  diagnal clEnqueueNDRangeKernel()=>%d failed\n", err); return -1; }	

		/* Perimeter kernel: (matrix_dim-i)/BLOCK_SIZE-1 work-groups of 2*BLOCK_SIZE work-items. */
		clSetKernelArg(perimeter, 0, sizeof(void *), (void*) &d_m);
		clSetKernelArg(perimeter, 1, sizeof(float) * BLOCK_SIZE * BLOCK_SIZE, (void*)NULL );
		clSetKernelArg(perimeter, 2, sizeof(float) * BLOCK_SIZE * BLOCK_SIZE, (void*)NULL );
		clSetKernelArg(perimeter, 3, sizeof(float) * BLOCK_SIZE * BLOCK_SIZE, (void*)NULL );
		clSetKernelArg(perimeter, 4, sizeof(cl_int), (void*) &matrix_dim);
		clSetKernelArg(perimeter, 5, sizeof(cl_int), (void*) &i);

		size_t global_work2[3] = {BLOCK_SIZE * 2 * ((matrix_dim-i)/BLOCK_SIZE-1), 1, 1};
		size_t local_work2[3]  = {BLOCK_SIZE * 2, 1, 1};

		err = clEnqueueNDRangeKernel(cmd_queue, perimeter, 2, NULL, global_work2, local_work2, 0, 0, 0);
		if(err != CL_SUCCESS) { printf("ERROR:  perimeter clEnqueueNDRangeKernel()=>%d failed\n", err); return -1; }	

		/* Internal kernel: 2-D grid of BLOCK_SIZE x BLOCK_SIZE work-groups over the remaining blocks. */
		clSetKernelArg(internal, 0, sizeof(void *), (void*) &d_m);
		clSetKernelArg(internal, 1, sizeof(float) * BLOCK_SIZE * BLOCK_SIZE, (void*)NULL );
		clSetKernelArg(internal, 2, sizeof(float) * BLOCK_SIZE * BLOCK_SIZE, (void*)NULL );
		clSetKernelArg(internal, 3, sizeof(cl_int), (void*) &matrix_dim);
		clSetKernelArg(internal, 4, sizeof(cl_int), (void*) &i);

		size_t global_work3[3] = {BLOCK_SIZE * ((matrix_dim-i)/BLOCK_SIZE-1), BLOCK_SIZE * ((matrix_dim-i)/BLOCK_SIZE-1), 1};
		size_t local_work3[3] = {BLOCK_SIZE, BLOCK_SIZE, 1};

		err = clEnqueueNDRangeKernel(cmd_queue, internal, 2, NULL, global_work3, local_work3, 0, 0, 0);
		if(err != CL_SUCCESS) { printf("ERROR:  internal clEnqueueNDRangeKernel()=>%d failed\n", err); return -1; }	
	}

	/* i now holds the offset at which the loop stopped: run the diagonal kernel once more there. */
	clSetKernelArg(diagnal, 0, sizeof(void *), (void*) &d_m);
	clSetKernelArg(diagnal, 1, sizeof(float) * BLOCK_SIZE * BLOCK_SIZE, (void*)NULL );
	clSetKernelArg(diagnal, 2, sizeof(cl_int), (void*) &matrix_dim);
	clSetKernelArg(diagnal, 3, sizeof(cl_int), (void*) &i);

	size_t global_work1[3]  = {BLOCK_SIZE, 1, 1};
	size_t local_work1[3]  = {BLOCK_SIZE, 1, 1};
	err = clEnqueueNDRangeKernel(cmd_queue, diagnal, 2, NULL, global_work1, local_work1, 0, 0, 0);
	if(err != CL_SUCCESS) { printf("ERROR:  diagnal clEnqueueNDRangeKernel()=>%d failed\n", err); return -1; }	

	err = clEnqueueReadBuffer(cmd_queue, d_m, 1, 0, matrix_dim*matrix_dim*sizeof(float), m, 0, 0, 0);
	if(err != CL_SUCCESS) { printf("ERROR: clEnqueueReadBuffer  d_m (size:%d) => %d\n", matrix_dim*matrix_dim, err); return -1; }
	clFinish(cmd_queue);
	/* end of timing point */
	stopwatch_stop(&sw);
	printf("Time consumed(ms): %lf\n", 1000*get_interval_by_sec(&sw));

	clReleaseMemObject(d_m);

	if (do_verify){
		printf("After LUD\n");
		printf(">>>Verify<<<<\n");
		lud_verify(mm, m, matrix_dim); 
		free(mm);
	}

	free(m);

	if(shutdown()) return -1;

	return EXIT_SUCCESS;
}				
Beispiel #5
0
/*
 * Entry point of the CUDA driver-API LUD benchmark.
 *
 * Options: -c cubin file (required), -i input_file (required), -v (print
 * and verify); -s is rejected as unsupported.  The matrix is copied to the
 * device, lud_launch() is run on it and the result is copied back, with the
 * copies and the launch bracketed by time_measure_start()/time_measure_end().
 *
 * Returns EXIT_SUCCESS on completion and -1 when a CUDA call fails; usage
 * and input errors exit with EXIT_FAILURE.
 */
int main (int argc, char *argv[])
{
	int matrix_dim = 32; /* default matrix_dim */
	int opt, option_index = 0;
	func_ret_t ret;
	const char *input_file = NULL;
	const char *cubin_file = NULL;
	float *m, *mm;
	struct timeval timer;
	CUdeviceptr d_m;
	CUcontext ctx;
	CUmodule mod;
	CUresult status;
	size_t matrix_bytes;

	while ((opt = getopt_long(argc, argv, "::vs:i:c:", 
							  long_options, &option_index)) != -1 ) {
		if (opt == 'c') {
			cubin_file = optarg;
		} else if (opt == 'i') {
			input_file = optarg;
		} else if (opt == 'v') {
			do_verify = 1;
		} else if (opt == '?') {
			fprintf(stderr, "invalid option\n");
		} else if (opt == ':') {
			fprintf(stderr, "missing argument\n");
		} else {
			/* -s is recognised but unsupported; like any other option it ends in the usage text. */
			if (opt == 's') {
				matrix_dim = atoi(optarg);
				fprintf(stderr, "Currently not supported, use -i instead\n");
			}
			fprintf(stderr, 
					"Usage: %s [-v] [-s matrix_size|-i input_file|-c cubin]\n",
					argv[0]);
			exit(EXIT_FAILURE);
		}
	}

	/* No arguments at all, or leftover positional arguments. */
	if ((optind == 1) || (optind < argc)) {
		fprintf(stderr, 
				"Usage: %s [-v] [-s matrix_size|-i input_file|-c cubin]\n",
				argv[0]);
		exit(EXIT_FAILURE);
	}

	if (cubin_file == NULL) {
		printf("No cubin file specified!\n");
		exit(EXIT_FAILURE);
	}

	if (input_file == NULL) {
		printf("No input file specified!\n");
		exit(EXIT_FAILURE);
	}

	printf("Reading matrix from file %s\n", input_file);
	ret = create_matrix_from_file(&m, input_file, &matrix_dim);
	if (ret != RET_SUCCESS) {
		m = NULL;
		fprintf(stderr, "error create matrix from file %s\n", input_file);
		exit(EXIT_FAILURE);
	}

	if (do_verify) {
		print_matrix(m, matrix_dim);

		/* Keep a copy of the input for lud_verify(). */
		matrix_duplicate(m, &mm, matrix_dim);
	}

	/* Size of the matrix in bytes, shared by the allocation and both copies. */
	matrix_bytes = matrix_dim * matrix_dim * sizeof(float);

	/* Common CUDA initialization helper: sets up ctx and mod from the cubin file. */
	status = cuda_driver_api_init(&ctx, &mod, cubin_file);
	if (status != CUDA_SUCCESS) {
		printf("cuda_driver_api_init failed: res = %u\n", status);
		return -1;
	}

	status = cuMemAlloc(&d_m, matrix_bytes);
	if (status != CUDA_SUCCESS) {
		printf("cuMemAlloc failed\n");
		return -1;
	}

	/* Measurement starts here. */
	time_measure_start(&timer);

	status = cuMemcpyHtoD(d_m, m, matrix_bytes);
	if (status != CUDA_SUCCESS) {
		printf("cuMemcpyHtoD (a) failed: res = %u\n", status);
		return -1;
	}

	lud_launch(mod, d_m, matrix_dim);

	status = cuMemcpyDtoH(m, d_m, matrix_bytes);
	if (status != CUDA_SUCCESS) {
		printf("cuMemcpyDtoH failed: res = %u\n", status);
		return -1;
	}

	/* Measurement ends here; per the original note this call prints the time. */
	time_measure_end(&timer);

	status = cuMemFree(d_m);
	if (status != CUDA_SUCCESS) {
		printf("cuMemFree failed: res = %u\n", status);
		return -1;
	}

	status = cuda_driver_api_exit(ctx, mod);
	if (status != CUDA_SUCCESS) {
		printf("cuda_driver_api_exit faild: res = %u\n", status);
		return -1;
	}

	if (do_verify) {
		print_matrix(m, matrix_dim);
		printf(">>>Verify<<<<\n");
		lud_verify(mm, m, matrix_dim); 
		free(mm);
	}

	free(m);

	return EXIT_SUCCESS;
}				/* ----------  end of function main  ---------- */