Example #1
0
int main()
{
    std_setup();
    
    int n = 256;
    double x0 = 10;
    
    double tf = 10.0;
    double dt = 0.05;
    int expansion_order = 7;
    int hdaf_order = 8;
    
    grid2D<double,double,double > grid( n,-10,10.0, n,-10,10.0 ),u,v,C2,damping;
    grid = 0.0;
    
    u = grid;
    v = grid;
    C2 = grid;
    damping = grid;
    
    u = u_0_func();
    C2 = C2_func();
    damping = damping_func();
    
    sprintf(fname,"/workspace/output/acoustic_propagate_2d/out_dat/C2.dat" );
    writeFile(C2,fname);
    
    
    void * pdata = 0;
    method1_init( &pdata, grid.n1, grid.n2, grid.dx1(), grid.dx2(), C2.array, damping.array, 7, hdaf_order, hdaf_order, 0.8, 0.8 );
    
    //acoustic_propagator P;
    //P.init( grid, 7, C2, damping );
    
    double t = 0.0;
    
    cout << "step: " << t << "\t" << L2norm(u) << endl;
    
    n=0;
    double * b_times = new double [ int(tf/dt+5) ];
    
    while (t <= tf)
    {
    
        output(t,u,v);
        
        double t1 = get_real_time();
        
        method1_execute( pdata, dt, u.array,v.array,u.array,v.array );
        //P( dt, u,v,u,v );
        
        double t2 = get_real_time();
        
        b_times[n] = t2-t1;
        double mean = 0;
        for (int j=0; j<n; j++)
            mean += b_times[j]/n;
        cout << mean << endl;
        n++;
        
        
        double mag = L2norm(u);
        cout << "step: " << t << "\t" << t2-t1 << "\t" << mag << endl;
        
        t += dt;
        
        if ( mag > 1E10 ) break;
    }
       
    output(t,u,v);
}
CUresult  cudaLaunchNV12toARGBDrv(CUdeviceptr d_srcNV12, size_t nSourcePitch,
                                  CUdeviceptr d_dstARGB, size_t nDestPitch,
                                  uint32 width,          uint32 height,
                                  CUfunction fpFunc, CUstream streamID)
{
    CUresult status;
    // Each thread will output 2 pixels at a time.  The grid size width is half
    // as large because of this
    dim3 block(32,16,1);
    dim3 grid((width+(2*block.x-1))/(2*block.x), (height+(block.y-1))/block.y, 1);

#if __CUDA_API_VERSION >= 4000
    // This is the new CUDA 4.0 API for Kernel Parameter passing and Kernel Launching (simpler method)
    void *args[] = { &d_srcNV12, &nSourcePitch,
                     &d_dstARGB, &nDestPitch,
                     &width, &height
                   };

    // new CUDA 4.0 Driver API Kernel launch call
    status = cuLaunchKernel(fpFunc, grid.x, grid.y, grid.z,
                            block.x, block.y, block.z,
                            0, streamID,
                            args, NULL);
#else
    // This is the older Driver API launch method from CUDA (V1.0 to V3.2)
    checkCudaErrors(cuFuncSetBlockShape(fpFunc, block.x, block.y, 1));
    int offset = 0;

    // This method calls cuParamSetv() to pass device pointers also allows the ability to pass 64-bit device pointers

    // device pointer for Source Surface
    checkCudaErrors(cuParamSetv(fpFunc, offset, &d_srcNV12,    sizeof(d_srcNV12)));
    offset += sizeof(d_srcNV12);

    // set the Source pitch
    checkCudaErrors(cuParamSetv(fpFunc, offset, &nSourcePitch, sizeof(nSourcePitch)));
    offset += sizeof(nSourcePitch);

    // device pointer for Destination Surface
    checkCudaErrors(cuParamSetv(fpFunc, offset, &d_dstARGB,    sizeof(d_dstARGB)));
    offset += sizeof(d_dstARGB);

    //  set the Destination Pitch
    checkCudaErrors(cuParamSetv(fpFunc, offset, &nDestPitch,   sizeof(nDestPitch)));
    offset += sizeof(nDestPitch);

    // set the width of the image
    ALIGN_OFFSET(offset, __alignof(width));
    checkCudaErrors(cuParamSeti(fpFunc, offset, width));
    offset += sizeof(width);

    // set the height of the image
    ALIGN_OFFSET(offset, __alignof(height));
    checkCudaErrors(cuParamSeti(fpFunc, offset, height));
    offset += sizeof(height);

    checkCudaErrors(cuParamSetSize(fpFunc, offset));

    // Launching the kernel, we need to pass in the grid dimensions
    CUresult status = cuLaunchGridAsync(fpFunc, grid.x, grid.y, streamID);
#endif

    if (CUDA_SUCCESS != status)
    {
        fprintf(stderr, "cudaLaunchNV12toARGBDrv() failed to launch Kernel Function %p, retval = %d\n", fpFunc, status);
        return status;
    }

    return status;
}
Example #3
0
int main(int argc, char **argv) {
   OptionParser opts;

   string mapFile, evidFile;//interactFile,ignoreFile;

   int factor;

   opts.addOption(new StringOption("map", 
            "--map <filename>                 : map file",
            "../input/grid.bmp", mapFile, false));

   opts.addOption(new StringOption("evidence", 
            "--evidence <filename>            : evidence file",
            "", evidFile, true));
   opts.addOption(new IntOption("factor", 
            "--factor <int>                   : scaling factor",
            1, factor, true));


   opts.parse(argc,argv);

   cout << "Loading Map File"<<endl;
   BMPFile bmpFile(mapFile); 
   Grid grid(bmpFile, black);
//   cout << "xdim: "<<grid.dims().first<<" yDim: "<<grid.dims().second<<endl;
   cout << "Loading Evidence"<<endl;
   //Evidence trainSet(evidFile, grid, factor);
   /* used when need to train two seperate models
   Evidence evid_int(interactFile, grid, factor);
   Evidence evid_ig(ignoreFile, grid, factor);
   Evidence train_int(grid),test_int(grid),train_ig(grid), test_ig(grid);
   evid_int.split(train_int, test_int, 0.05);
   evid_ig.split(train_ig, test_ig, 0.05);
   */
   Evidence evid(evidFile,grid,factor);
   Evidence trainSet(grid),testSet(grid);
   evid.split(trainSet,testSet,0.05);
   cout<<"Optimize over "<<trainSet.size()<<" examples"<<endl;
#if 0 
   for (int i=0; i < evid.size(); i++) {
      cout << "Evid "<<i<<endl;
      vector<pair<int, int> > traj = evid.at(i);
      vector<double> timestamps = evid.getTimes(i);

      cout << timestamps.size()<<"  "<<traj.size()<<endl;

      for (int j=0; j < traj.size(); j++) {
         cout << timestamps.at(j)<<"  "<<traj.at(j).first
            << "  "<<traj.at(j).second<<endl;
      } 
   }
#endif
//   testSet.write("testTraj.data");

   cout << "Generating Feature Set"<<endl;

   vector<PosFeature> features;

   cout << "   Constant Feature"<<endl;

   ConstantFeature constFeat(grid);
   features.push_back(constFeat);

   cout << "   Obstacle Feature"<<endl;

   ObstacleFeature obsFeat(grid);
   features.push_back(obsFeat);
	

   for (int i=1; i < 5; i++) {
      cout << "   Blur Feature "<<i<<endl;
      ObstacleBlurFeature blurFeat(grid, 5*i);
      features.push_back(blurFeat);
   }

   /*
   cout << "    Robot Feature"<<endl;
   RobotGlobalFeature robglobal(grid,snackbot,factor);
   features.push_back(robglobal);
   //  robot local blurres features
   for (int i=1; i < 5; i++) {
      cout << "  RobotBlur Feature "<<i<<endl;
      RobotLocalBlurFeature robblur(grid,snackbot,5*i,factor);
      features.push_back(robblur);
   }
	
   */
 
   /* 
   cout << "   Creating feature array"<<endl;
   FeatureArray featArray2(features);

   cout << "   Creating lower resolution feature array"<<endl;
   FeatureArray featArray(featArray2, factor);
   */

   cout << " Speed Feature"<<endl;
   vector<double> speedTable(2,0.0);
   speedTable.at(1) = 0.75;
   //speedTable.at(2) = 1.1;
   DisVecSeqFeature speedfeat(speedTable);


   /* Robset training weights: 
	* -3.83 -8.35991 -2.6512 -5.43475 -3.15203 -3.29758
	*  0.596987 0.439284
	* 0.589445 -0.82448
	* Non-robot-ending trainng weights:
	* -4.57257  -6.2 -0.3537 -2.7385 -0.9357 -0.2797
	* -0.495205 -0.2863
	* -1.2225 0.43993
	*/
   vector<double> weights(6+2+2, -0.0);
   weights.at(0) = -25;	
   weights.at(1) = -8.36;
   weights.at(2) = -2.65;
   weights.at(3) = -5.43;
   weights.at(4) = -3.17;
   weights.at(5) = -3.34;
   
   weights.at(6) = 0.5; // robot feature
   weights.at(7) = 0.3; // robot feature
  
   weights.at(8) = -0.29;  // velocity feature
   weights.at(9) = -1.11; // velocity feature

   //weights.push_back(1.5);//the last parameter is for velocity feature
   Parameters params(weights);

   DisSeqOrderInferEngine engine(8,InferenceEngine::GRID8);

   trajOptimizerplus optimizer(grid,trainSet,features,speedfeat,engine);

   optimizer.optimize(params,0.005,1000,1.0,OPT_EXP);

   return 0;

}
Example #4
0
extern "C" magma_int_t
magma_slarft_batched(magma_int_t n, magma_int_t k, magma_int_t stair_T, 
                float **v_array, magma_int_t ldv,
                float **tau_array, float **T_array, magma_int_t ldt, 
                float **work_array, magma_int_t lwork, 
                magma_int_t batchCount, magma_queue_t queue)
{
    float c_one  = MAGMA_S_ONE;
    float c_zero = MAGMA_S_ZERO;

    if ( k <= 0) return 0;
    if ( stair_T > 0 && k <= stair_T) return 0;

    magma_int_t maxnb = max_shared_bsiz;

    magma_int_t info = 0;
    if (stair_T > 0 && stair_T > maxnb) {
        info = -3;
    }
    else if (lwork < k*ldt) {
        info = -10;
    }
    if (info != 0) {
        magma_xerbla( __func__, -(info) );
        return info;
    }

    magma_int_t DEBUG=0;
    magma_int_t nb = stair_T == 0 ? min(k,maxnb) : stair_T;

    magma_int_t i, j, prev_n, mycol, rows;

    float **dW1_displ  = NULL;
    float **dW2_displ  = NULL;
    float **dW3_displ  = NULL;
    float **dTstep_array  = NULL;

    magma_malloc((void**)&dW1_displ,  batchCount * sizeof(*dW1_displ));
    magma_malloc((void**)&dW2_displ,  batchCount * sizeof(*dW2_displ));
    magma_malloc((void**)&dW3_displ,  batchCount * sizeof(*dW3_displ));
    magma_malloc((void**)&dTstep_array,  batchCount * sizeof(*dTstep_array));

    //float *Tstep =  k > nb ? work : T;
    if (k > nb)
    {
        magma_sdisplace_pointers(dTstep_array, work_array, lwork, 0, 0, batchCount, queue);
    }
    else
    {
        magma_sdisplace_pointers(dTstep_array, T_array, ldt, 0, 0, batchCount, queue);
    }

    //magma_int_t ldtstep = k > nb ? k : ldt;
    magma_int_t ldtstep = ldt; //a enlever
    // stair_T = 0 meaning all T
    // stair_T > 0 meaning the triangular portion of T has been computed. 
    //                    the value of stair_T is the nb of these triangulars
   

    //GEMV compute the whole triangular upper portion of T (phase 1)
    // TODO addcublas to check perf

    magma_sgemm_batched( MagmaConjTrans, MagmaNoTrans, 
                         k, k, n, 
                         c_one,  v_array, ldv, 
                                 v_array, ldv, 
                         c_zero, dTstep_array, ldtstep, 
                         batchCount, queue );

    magmablas_slaset_batched( MagmaLower, k, k, MAGMA_S_ZERO, MAGMA_S_ZERO, dTstep_array, ldtstep, batchCount, queue );
    // no need for it as T is expected to be lower zero
    //if (k > nb) magmablas_slaset_batched( MagmaLower, k, k, MAGMA_S_ZERO, MAGMA_S_ZERO, dTstep_array, ldtstep, batchCount, queue );
    

    //TRMV
    //T(1:i-1,i) := T(1:i-1,1:i-1) * W(1:i-1) i=[1:k]
    // TRMV is split over block of column of size nb 
    // the update should be done from top to bottom so:
    // 1- a gemm using the previous computed columns
    //    of T to update rectangular upper protion above 
    //    the triangle of my columns 
    // 2- the columns need to be updated by a serial 
    //    loop over of gemv over itself. since we limit the
    //    shared memory to nb, this nb column 
    //    are split vertically by chunk of nb rows

    dim3 grid(1, 1, batchCount);

    for (j=0; j < k; j += nb)
    {
        prev_n =  j;
        mycol  =  min(nb, k-j);
        // note that myrow = prev_n + mycol;
        if (prev_n > 0 && mycol > 0) {
            if (DEBUG == 3) {
                printf("doing gemm on the rectangular portion of size %lld %lld of T(%lld,%lld)\n",
                        (long long) prev_n, (long long) mycol, (long long) 0, (long long) j );
            }

            magma_sdisplace_pointers(dW1_displ, dTstep_array, ldtstep, 0, j, batchCount, queue);
            magma_sdisplace_pointers(dW2_displ, T_array,     ldt, 0, j, batchCount, queue);
            magma_sgemm_batched( MagmaNoTrans, MagmaNoTrans, 
                                 prev_n, mycol, prev_n, 
                                 c_one,  T_array, ldt, 
                                         dW1_displ, ldtstep, 
                                 c_zero, dW2_displ, ldt, 
                                 batchCount, queue );

            // update my rectangular portion (prev_n,mycol) using sequence of gemv 
            magma_sdisplace_pointers(dW1_displ, dTstep_array, ldtstep, j, j, batchCount, queue);
            magma_sdisplace_pointers(dW3_displ, tau_array,  1, j, 0, batchCount, queue);

            for (i=0; i < prev_n; i += nb)
            {
                rows = min(nb,prev_n-i);
                if (DEBUG == 3) {
                    printf("        doing recstrmv on the rectangular portion of size %lld %lld of T(%lld,%lld)\n",
                            (long long) rows, (long long) mycol, (long long) i, (long long) j );
                }

                if (rows > 0 && mycol > 0)
                {
                    magma_sdisplace_pointers(dW2_displ, T_array,     ldt, i, j, batchCount, queue);
                    magmablas_slarft_recstrmv_sm32x32_batched(rows, mycol, dW3_displ, dW2_displ, ldt, dW1_displ, ldtstep, batchCount, queue);
                }
            }
        }

        // the upper rectangular protion is updated, now if needed update the triangular portion
        if (stair_T == 0) {
            if (DEBUG == 3) {
                printf("doing strmv on the triangular portion of size %lld %lld of T(%lld,%lld)\n",
                        (long long) mycol, (long long) mycol, (long long) j, (long long) j );
            }

            if (mycol > 0)
            {
                magma_sdisplace_pointers(dW1_displ, dTstep_array, ldtstep, j, j, batchCount, queue);
                magma_sdisplace_pointers(dW3_displ, tau_array,  1, j, 0, batchCount, queue);
                magma_sdisplace_pointers(dW2_displ, T_array,     ldt, j, j, batchCount, queue);
                magmablas_slarft_strmv_sm32x32_batched(mycol, mycol, dW3_displ, dW1_displ, ldtstep, dW2_displ, ldt, batchCount, queue);
            }
        }
    }// end of j

    magma_free(dW1_displ);
    magma_free(dW2_displ);
    magma_free(dW3_displ);
    magma_free(dTstep_array);

    return 0;
}