Ejemplo n.º 1
0
  void SXFunctionInternal::evaluateOpenCL() {
    // OpenCL return flag
    cl_int ret;

    // Set OpenCL Kernel Parameters
    int kernel_arg = 0;

    // Pass inputs
    for (int i=0; i<nIn(); ++i) {
      ret = clSetKernelArg(kernel_, kernel_arg++,
                           sizeof(cl_mem), static_cast<void *>(&input_memobj_[i]));
      casadi_assert(ret == CL_SUCCESS);
    }

    // Pass outputs
    for (int i=0; i<nOut(); ++i) {
      ret = clSetKernelArg(kernel_, kernel_arg++, sizeof(cl_mem),
                           static_cast<void *>(&output_memobj_[i]));
      casadi_assert(ret == CL_SUCCESS);
    }

    // Execute OpenCL Kernel
    executeKernel(kernel_);

    // Get outputs
    for (int i=0; i<output_memobj_.size(); ++i) {
      ret = clEnqueueReadBuffer(sparsity_propagation_kernel_.command_queue, output_memobj_[i],
                                CL_TRUE, 0,
                                outputNoCheck(i).size() * sizeof(cl_double),
                                reinterpret_cast<void*>(outputNoCheck(i).ptr()), 0, NULL, NULL);
      casadi_assert(ret == CL_SUCCESS);
    }
  }
// copy image and process using OpenCL
//*****************************************************************************
void processImage()
{
    // activate destination buffer
    glBindBuffer(GL_PIXEL_PACK_BUFFER_ARB, pbo_source);

    //// read data into pbo. note: use BGRA format for optimal performance
    glReadPixels(0, 0, image_width, image_height, GL_BGRA, GL_UNSIGNED_BYTE, NULL); 

    if (bPostprocess)
    {
        if (iProcFlag == 0) 
        {
            pboRegister();
            executeKernel(blur_radius);
            pboUnregister();
        } 
        else 
        {
            // map the PBOs
            glBindBufferARB(GL_PIXEL_PACK_BUFFER_ARB, pbo_source);    
            glBindBufferARB(GL_PIXEL_UNPACK_BUFFER_ARB, pbo_dest);    
            
            unsigned int* source_ptr = (unsigned int*)glMapBufferARB(GL_PIXEL_PACK_BUFFER_ARB,
                                                                     GL_READ_ONLY_ARB);
            
            unsigned int* dest_ptr = (unsigned int*)glMapBufferARB(GL_PIXEL_UNPACK_BUFFER_ARB,
                                                                   GL_WRITE_ONLY_ARB);
            
            // Postprocessing on the CPU
            postprocessHost(source_ptr, dest_ptr, image_width, image_height, 0, blur_radius, 0.8f, 4.0f);
            
            // umap the PBOs
            glUnmapBufferARB(GL_PIXEL_PACK_BUFFER_ARB);
            glBindBufferARB(GL_PIXEL_PACK_BUFFER_ARB, 0);
            glUnmapBufferARB(GL_PIXEL_UNPACK_BUFFER_ARB); 
            glBindBufferARB(GL_PIXEL_UNPACK_BUFFER_ARB, 0);
        }

        // download texture from PBO
        glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, pbo_dest);
        glBindTexture(GL_TEXTURE_2D, tex_screen);
        glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, 
                        image_width, image_height, 
                        GL_BGRA, GL_UNSIGNED_BYTE, NULL);

    } 
    else 
    {
        // download texture from PBO
        glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, pbo_source);
        glBindTexture(GL_TEXTURE_2D, tex_screen);
        glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, 
                        image_width, image_height, 
                        GL_BGRA, GL_UNSIGNED_BYTE, NULL);
        
    }
}
Ejemplo n.º 3
0
Archivo: boot.c Proyecto: GemBit/PICOS
void boot(void) {
    bootData.totalSectors = diskParameter.totalSectors;
    bootData.sectorSize = diskParameter.sectorSize;
    bootData.mbr = (MBR *) codeStart;

    /* 内存探测应该这最先,这要用于分配缓冲大小 */
    detectMemory();

    /* 加载内核 */
    loadKernel();

    /* 设置高分辨率模式应该在最后,因为前面有文字打印操作 */
    setVbeMode(getPreferredResolution());

    executeKernel();
}
Ejemplo n.º 4
0
  void SXFunctionInternal::spEvaluateOpenCL(bool fwd) {
    // OpenCL return flag
    cl_int ret;

    // Select a kernel
    cl_kernel kernel = fwd ? sp_fwd_kernel_ : sp_adj_kernel_;

    // Set OpenCL Kernel Parameters
    int kernel_arg = 0;

    // Pass inputs
    for (int i=0; i<nIn(); ++i) {
      ret = clSetKernelArg(kernel, kernel_arg++,
                           sizeof(cl_mem), static_cast<void *>(&sp_input_memobj_[i]));
      casadi_assert(ret == CL_SUCCESS);
    }

    // Pass outputs
    for (int i=0; i<nOut(); ++i) {
      ret = clSetKernelArg(kernel, kernel_arg++,
                           sizeof(cl_mem), static_cast<void *>(&sp_output_memobj_[i]));
      casadi_assert(ret == CL_SUCCESS);
    }

    // Execute OpenCL Kernel
    executeKernel(kernel);

    // Get inputs
    for (int i=0; i<sp_input_memobj_.size(); ++i) {
      ret = clEnqueueReadBuffer(sparsity_propagation_kernel_.command_queue,
                                sp_input_memobj_[i], CL_TRUE, 0,
                                inputNoCheck(i).size() * sizeof(cl_ulong),
                                reinterpret_cast<void*>(inputNoCheck(i).ptr()), 0, NULL, NULL);
      casadi_assert(ret == CL_SUCCESS);
    }

    // Get outputs
    for (int i=0; i<sp_output_memobj_.size(); ++i) {
      ret = clEnqueueReadBuffer(sparsity_propagation_kernel_.command_queue,
                                sp_output_memobj_[i], CL_TRUE, 0,
                                outputNoCheck(i).size() * sizeof(cl_ulong),
                                reinterpret_cast<void*>(outputNoCheck(i).ptr()), 0, NULL, NULL);
      casadi_assert(ret == CL_SUCCESS);
    }
  }
// Run a test sequence without any GL 
//*****************************************************************************
void TestNoGL()
{
    // execute OpenCL kernel without GL interaction
    cl_pbos[0] = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, 4 * image_width * image_height, NULL, &ciErrNum);
    cl_pbos[1] = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, 4 * image_width * image_height, NULL, &ciErrNum);

    // set the args values
    ciErrNum |= clSetKernelArg(ckKernel, 0, sizeof(cl_mem), (void *) &(cl_pbos[0]));
    ciErrNum |= clSetKernelArg(ckKernel, 1, sizeof(cl_mem), (void *) &(cl_pbos[1]));
    ciErrNum |= clSetKernelArg(ckKernel, 2, sizeof(image_width), &image_width);
    ciErrNum |= clSetKernelArg(ckKernel, 3, sizeof(image_width), &image_height);
    
    // Start timer 0 and process n loops on the GPU 
    executeKernel(blur_radius);

    // Cleanup and exit
    Cleanup(EXIT_SUCCESS);
}
Ejemplo n.º 6
0
void scan(clContext *clCxt,cl_mem &ginput,cl_mem &goutput,Plan *plan,int elemnum)
{
    cl_mem gadsys;
    int steplength =  ((plan->registergroup + plan->localmemgroup) * plan->cta) / plan->vectorlength; 
    int taillen = elemnum - steplength * plan->vectorlength * plan->localthread * plan->workgroup, tailgroup;
    tailgroup = (taillen+2047)>>11;
    int adsyslen = plan->workgroup+tailgroup+2;
    int *adsys = new int[adsyslen];
     
    memset(adsys,0,adsyslen*sizeof(int));
    create(clCxt,&gadsys,adsyslen*sizeof(int)); 
    upload(clCxt,(void *)adsys,&gadsys,adsyslen * sizeof(int));
    int registersize = plan->registergroup * plan->cta;
    int localmemsize = plan->localmemgroup * plan->cta;
    char build_options[200];
    if(plan->coalesced==1){
  	    sprintf(build_options ,
        "-D NB_VEC_%d -D NB_L%d -D NB_G%d -D NB_CTA_%d -D STEP_NUM=%d -D NB_REG_GRP=%d -D NB_REG_SIZE=%d -D NB_LOCAL_GRP=%d -D NB_LOCAL_SIZE=%d",
        plan->vectorlength,plan->localthread,plan->workgroup,plan->cta,steplength,plan->registergroup,
        registersize,plan->localmemgroup,localmemsize + 1);
    }
    else{
  	    sprintf(build_options ,
        "-D NB_VEC_%d -D NB_L%d -D NB_G%d -D NB_CTA_%d -D STEP_NUM=%d -D NB_REG_GRP=%d -D NB_REG_SIZE=%d -D NB_LOCAL_GRP=%d -D NB_LOCAL_SIZE=%d",
        plan->vectorlength,plan->localthread,plan->workgroup,plan->cta,steplength,plan->registergroup,
        registersize,plan->localmemgroup,localmemsize);
    }
    cout<<build_options<<endl;
    vector<pair<size_t ,const void *> > args;
    args.push_back( make_pair( sizeof(cl_mem) , (void *)&ginput ));
    args.push_back( make_pair( sizeof(cl_mem) , (void *)&gadsys ));
    args.push_back( make_pair( sizeof(cl_mem) , (void *)&goutput ));
    args.push_back( make_pair( sizeof(cl_int) , (void *)&plan->workgroup ));
    size_t globalthreads[3] = {plan->localthread * plan->workgroup,1,1};
    size_t localthreads[3] = {plan->localthread,1,1};
    
    timeRcd.kerneltime = 0;
    timeRcd.totaltime = 0;
    executeKernel("scan.cl","scan",args,globalthreads,localthreads,build_options,clCxt);

    if(taillen!=0){
  	    sprintf(build_options ,
        "-D NB_VEC_TAIL -D NB_L64 -D NB_G%d -D NB_CTA_16 -D STEP_NUM=32 -D NB_REG_GRP=1 -D NB_REG_SIZE=16 -D NB_LOCAL_GRP=1 -D NB_LOCAL_SIZE=17",
        tailgroup);
        args.clear();
        args.push_back( make_pair( sizeof(cl_mem) , (void *)&ginput));
        args.push_back( make_pair( sizeof(cl_mem) , (void *)&gadsys));
        args.push_back( make_pair( sizeof(cl_mem) , (void *)&goutput));
        args.push_back( make_pair( sizeof(cl_int) , (void *)&elemnum));
        args.push_back( make_pair( sizeof(cl_int) , (void *)&plan->workgroup));
        args.push_back( make_pair( sizeof(cl_int) , (void *)&tailgroup));
        args.push_back( make_pair( sizeof(cl_int) , (void *)&taillen));
        globalthreads[0] = 64 * tailgroup;
        localthreads[0] = 64;
        //timeRcd.kerneltime = 0;
        //timeRcd.totaltime = 0;
        executeKernel("scan.cl","scantail",args,globalthreads,localthreads,build_options,clCxt);
    }

//#define PRINT_A    
#if defined PRINT_A   
    download(clCxt,&gadsys,(void *)adsys,adsyslen*sizeof(int));
    for(int i=0,k=1;i< adsyslen;k++)
    {
        cout <<setiosflags(ios::left) << "line:"<<k<<"   ";
        for(int j=0;j < 10 && i< adsyslen;j++,i++)
           cout << setiosflags(ios::left) << setw(10) << adsys[i];
        cout << endl;
    }
#endif    
    delete adsys; 
    clReleaseMemObject(gadsys);

}
Ejemplo n.º 7
0
Archivo: hd5GPU.c Proyecto: leobago/fti
int main ( int argc, char *argv[]){
  int i;
  int state;
  int sizeOfDimension;
  int success = 1;
  int FTI_APP_RANK;
  herr_t status;
  threeD ***ptr = allocateLinearMemory(XSIZE, YSIZE, ZSIZE );
  threeD *devPtr;
  int result;
  MPI_Init(&argc, &argv);
  result = FTI_Init(argv[1], MPI_COMM_WORLD);
  if (result == FTI_NREC) {
    exit(RECOVERY_FAILED);
  }
  int crash = atoi(argv[2]);
  int level = atoi(argv[3]);



  memset(&ptr[0][0][0],0, sizeof(threeD) * (XSIZE * YSIZE * ZSIZE));

  int numGpus = getProperties();
  MPI_Comm_rank(FTI_COMM_WORLD,&FTI_APP_RANK);

  setDevice(FTI_APP_RANK%numGpus);

  dictionary *ini = iniparser_load( argv[1] );
  int grank;    
  MPI_Comm_rank(MPI_COMM_WORLD,&grank);
  int nbHeads = (int)iniparser_getint(ini, "Basic:head", -1); 
  int finalTag = (int)iniparser_getint(ini, "Advanced:final_tag", 3107);
  int nodeSize = (int)iniparser_getint(ini, "Basic:node_size", -1);
  int headRank = grank - grank%nodeSize;

  FTIT_complexType coordinateDef;
  FTIT_type threeDType;
  FTI_AddSimpleField( &coordinateDef, &FTI_INTG, offsetof( threeD, x),0, "X"); 
  FTI_AddSimpleField( &coordinateDef, &FTI_INTG, offsetof( threeD, y),1, "y"); 
  FTI_AddSimpleField( &coordinateDef, &FTI_INTG, offsetof( threeD, z),2, "z"); 
  FTI_AddSimpleField( &coordinateDef, &FTI_INTG, offsetof( threeD, id),3, "id"); 
  FTI_InitComplexType(&threeDType, &coordinateDef, 4 , sizeof(threeD), "ThreeD", NULL);
  

  if ( (nbHeads<0) || (nodeSize<0) ) {
    printf("wrong configuration (for head or node-size settings)! %d %d\n",nbHeads, nodeSize);
    MPI_Abort(MPI_COMM_WORLD, -1);
  }

  allocateMemory((void **) &devPtr, (XSIZE * YSIZE * ZSIZE*sizeof(threeD)));
  FTI_Protect(0, devPtr,  (XSIZE * YSIZE * ZSIZE),threeDType);
  int dimLength[3] = {ZSIZE,YSIZE,XSIZE};
  if (grank == 0)
    for ( i =0 ; i < 3; i++){
      printf("Dimension is %d size is %d\n", dimLength[i], XSIZE*YSIZE*ZSIZE*sizeof(threeDType) / (1024*1024));
    }
  FTI_DefineDataset(0, 3, dimLength , "GPU TOPOLOGY" , NULL);
  state = FTI_Status();
  if ( state == INIT ){
    executeKernel(devPtr);
    FTI_Checkpoint(1,level);
    if ( crash ) {
      if( nbHeads > 0 ) { 
        int value = FTI_ENDW;
        MPI_Send(&value, 1, MPI_INT, headRank, finalTag, MPI_COMM_WORLD);
        MPI_Barrier(MPI_COMM_WORLD);
      }
      MPI_Finalize();
      exit(0);
    }
  }else{
    result = FTI_Recover();
    if (result != FTI_SCES) {
      exit(RECOVERY_FAILED);
    }
    hostCopy(devPtr, &ptr[0][0][0],(XSIZE * YSIZE * ZSIZE*sizeof(threeD)));
  }
  threeD ***validationMemory= allocateLinearMemory(XSIZE, YSIZE, ZSIZE );
  initData(&validationMemory[0][0][0]);

  if (state == RESTART || state == KEEP) {
    int tmp;
    result =  memcmp(&validationMemory[0][0][0], &ptr[0][0][0],(XSIZE * YSIZE * ZSIZE*sizeof(threeD)));
    MPI_Allreduce(&result, &tmp, 1, MPI_INT, MPI_SUM, FTI_COMM_WORLD);
    result = tmp;

  }

  deallocateLinearMemory(ZSIZE , ptr);
  deallocateLinearMemory(ZSIZE , validationMemory);
  freeCuda(devPtr);

  if (FTI_APP_RANK == 0 && (state == RESTART || state == KEEP)) {
    if (result == 0) {
      printf("[SUCCESSFUL]\n");
    } else {
      printf("[NOT SUCCESSFUL]\n");
      success=0;
    }
  }

  MPI_Barrier(FTI_COMM_WORLD);
  FTI_Finalize();
  MPI_Finalize();

  if (success == 1)
    return 0;
  else
    exit(DATA_CORRUPT);
}