int main() { std_setup(); int n = 256; double x0 = 10; double tf = 10.0; double dt = 0.05; int expansion_order = 7; int hdaf_order = 8; grid2D<double,double,double > grid( n,-10,10.0, n,-10,10.0 ),u,v,C2,damping; grid = 0.0; u = grid; v = grid; C2 = grid; damping = grid; u = u_0_func(); C2 = C2_func(); damping = damping_func(); sprintf(fname,"/workspace/output/acoustic_propagate_2d/out_dat/C2.dat" ); writeFile(C2,fname); void * pdata = 0; method1_init( &pdata, grid.n1, grid.n2, grid.dx1(), grid.dx2(), C2.array, damping.array, 7, hdaf_order, hdaf_order, 0.8, 0.8 ); //acoustic_propagator P; //P.init( grid, 7, C2, damping ); double t = 0.0; cout << "step: " << t << "\t" << L2norm(u) << endl; n=0; double * b_times = new double [ int(tf/dt+5) ]; while (t <= tf) { output(t,u,v); double t1 = get_real_time(); method1_execute( pdata, dt, u.array,v.array,u.array,v.array ); //P( dt, u,v,u,v ); double t2 = get_real_time(); b_times[n] = t2-t1; double mean = 0; for (int j=0; j<n; j++) mean += b_times[j]/n; cout << mean << endl; n++; double mag = L2norm(u); cout << "step: " << t << "\t" << t2-t1 << "\t" << mag << endl; t += dt; if ( mag > 1E10 ) break; } output(t,u,v); }
/*
 * Launches the NV12 -> ARGB conversion kernel `fpFunc` through the CUDA
 * Driver API on stream `streamID`.
 *
 * Parameters passed on to the kernel, in this order:
 *   d_srcNV12, nSourcePitch  - source surface device pointer and its pitch
 *   d_dstARGB, nDestPitch    - destination surface device pointer and its pitch
 *   width, height            - image dimensions
 *
 * Launch configuration: 32x16 thread blocks. Each thread outputs 2 pixels,
 * so the grid covers width/2 columns (rounded up) and `height` rows
 * (rounded up).
 *
 * Returns the CUresult of the launch call; on failure a diagnostic is also
 * written to stderr.
 */
CUresult cudaLaunchNV12toARGBDrv(CUdeviceptr d_srcNV12, size_t nSourcePitch,
                                 CUdeviceptr d_dstARGB, size_t nDestPitch,
                                 uint32 width, uint32 height,
                                 CUfunction fpFunc, CUstream streamID)
{
    CUresult status;

    // Each thread will output 2 pixels at a time. The grid size width is half
    // as large because of this
    dim3 block(32,16,1);
    dim3 grid((width+(2*block.x-1))/(2*block.x), (height+(block.y-1))/block.y, 1);

#if __CUDA_API_VERSION >= 4000
    // This is the new CUDA 4.0 API for Kernel Parameter passing and Kernel Launching (simpler method)
    void *args[] = { &d_srcNV12, &nSourcePitch,
                     &d_dstARGB, &nDestPitch,
                     &width, &height };

    // new CUDA 4.0 Driver API Kernel launch call
    status = cuLaunchKernel(fpFunc, grid.x, grid.y, grid.z,
                            block.x, block.y, block.z,
                            0, streamID,
                            args, NULL);
#else
    // This is the older Driver API launch method from CUDA (V1.0 to V3.2)
    checkCudaErrors(cuFuncSetBlockShape(fpFunc, block.x, block.y, 1));
    int offset = 0;

    // This method calls cuParamSetv() to pass device pointers also allows the ability to pass 64-bit device pointers

    // device pointer for Source Surface
    checkCudaErrors(cuParamSetv(fpFunc, offset, &d_srcNV12, sizeof(d_srcNV12)));
    offset += sizeof(d_srcNV12);

    // set the Source pitch
    checkCudaErrors(cuParamSetv(fpFunc, offset, &nSourcePitch, sizeof(nSourcePitch)));
    offset += sizeof(nSourcePitch);

    // device pointer for Destination Surface
    checkCudaErrors(cuParamSetv(fpFunc, offset, &d_dstARGB, sizeof(d_dstARGB)));
    offset += sizeof(d_dstARGB);

    // set the Destination Pitch
    checkCudaErrors(cuParamSetv(fpFunc, offset, &nDestPitch, sizeof(nDestPitch)));
    offset += sizeof(nDestPitch);

    // set the width of the image
    ALIGN_OFFSET(offset, __alignof(width));
    checkCudaErrors(cuParamSeti(fpFunc, offset, width));
    offset += sizeof(width);

    // set the height of the image
    ALIGN_OFFSET(offset, __alignof(height));
    checkCudaErrors(cuParamSeti(fpFunc, offset, height));
    offset += sizeof(height);

    checkCudaErrors(cuParamSetSize(fpFunc, offset));

    // Launching the kernel, we need to pass in the grid dimensions.
    // Assign to the `status` declared at the top of the function: declaring
    // it again here would be a redefinition in the same scope and break the
    // build of this legacy path.
    status = cuLaunchGridAsync(fpFunc, grid.x, grid.y, streamID);
#endif

    if (CUDA_SUCCESS != status)
    {
        fprintf(stderr, "cudaLaunchNV12toARGBDrv() failed to launch Kernel Function %p, retval = %d\n", fpFunc, status);
    }

    return status;
}
int main(int argc, char **argv) { OptionParser opts; string mapFile, evidFile;//interactFile,ignoreFile; int factor; opts.addOption(new StringOption("map", "--map <filename> : map file", "../input/grid.bmp", mapFile, false)); opts.addOption(new StringOption("evidence", "--evidence <filename> : evidence file", "", evidFile, true)); opts.addOption(new IntOption("factor", "--factor <int> : scaling factor", 1, factor, true)); opts.parse(argc,argv); cout << "Loading Map File"<<endl; BMPFile bmpFile(mapFile); Grid grid(bmpFile, black); // cout << "xdim: "<<grid.dims().first<<" yDim: "<<grid.dims().second<<endl; cout << "Loading Evidence"<<endl; //Evidence trainSet(evidFile, grid, factor); /* used when need to train two seperate models Evidence evid_int(interactFile, grid, factor); Evidence evid_ig(ignoreFile, grid, factor); Evidence train_int(grid),test_int(grid),train_ig(grid), test_ig(grid); evid_int.split(train_int, test_int, 0.05); evid_ig.split(train_ig, test_ig, 0.05); */ Evidence evid(evidFile,grid,factor); Evidence trainSet(grid),testSet(grid); evid.split(trainSet,testSet,0.05); cout<<"Optimize over "<<trainSet.size()<<" examples"<<endl; #if 0 for (int i=0; i < evid.size(); i++) { cout << "Evid "<<i<<endl; vector<pair<int, int> > traj = evid.at(i); vector<double> timestamps = evid.getTimes(i); cout << timestamps.size()<<" "<<traj.size()<<endl; for (int j=0; j < traj.size(); j++) { cout << timestamps.at(j)<<" "<<traj.at(j).first << " "<<traj.at(j).second<<endl; } } #endif // testSet.write("testTraj.data"); cout << "Generating Feature Set"<<endl; vector<PosFeature> features; cout << " Constant Feature"<<endl; ConstantFeature constFeat(grid); features.push_back(constFeat); cout << " Obstacle Feature"<<endl; ObstacleFeature obsFeat(grid); features.push_back(obsFeat); for (int i=1; i < 5; i++) { cout << " Blur Feature "<<i<<endl; ObstacleBlurFeature blurFeat(grid, 5*i); features.push_back(blurFeat); } /* cout << " Robot Feature"<<endl; RobotGlobalFeature 
robglobal(grid,snackbot,factor); features.push_back(robglobal); // robot local blurres features for (int i=1; i < 5; i++) { cout << " RobotBlur Feature "<<i<<endl; RobotLocalBlurFeature robblur(grid,snackbot,5*i,factor); features.push_back(robblur); } */ /* cout << " Creating feature array"<<endl; FeatureArray featArray2(features); cout << " Creating lower resolution feature array"<<endl; FeatureArray featArray(featArray2, factor); */ cout << " Speed Feature"<<endl; vector<double> speedTable(2,0.0); speedTable.at(1) = 0.75; //speedTable.at(2) = 1.1; DisVecSeqFeature speedfeat(speedTable); /* Robset training weights: * -3.83 -8.35991 -2.6512 -5.43475 -3.15203 -3.29758 * 0.596987 0.439284 * 0.589445 -0.82448 * Non-robot-ending trainng weights: * -4.57257 -6.2 -0.3537 -2.7385 -0.9357 -0.2797 * -0.495205 -0.2863 * -1.2225 0.43993 */ vector<double> weights(6+2+2, -0.0); weights.at(0) = -25; weights.at(1) = -8.36; weights.at(2) = -2.65; weights.at(3) = -5.43; weights.at(4) = -3.17; weights.at(5) = -3.34; weights.at(6) = 0.5; // robot feature weights.at(7) = 0.3; // robot feature weights.at(8) = -0.29; // velocity feature weights.at(9) = -1.11; // velocity feature //weights.push_back(1.5);//the last parameter is for velocity feature Parameters params(weights); DisSeqOrderInferEngine engine(8,InferenceEngine::GRID8); trajOptimizerplus optimizer(grid,trainSet,features,speedfeat,engine); optimizer.optimize(params,0.005,1000,1.0,OPT_EXP); return 0; }
/*
 * Batched formation of the T factors: for every matrix in the batch, builds
 * the upper triangular T from the corresponding V and tau, block column by
 * block column.
 *
 * Arguments (as used by the code below):
 *   n, k         dimensions handed to the initial V^H * V batched gemm, which
 *                produces a k x k result with inner dimension n.
 *                NOTE(review): presumably n = rows of V and k = number of
 *                reflectors, as in LAPACK slarft -- confirm.
 *   stair_T      0  : compute all of T, including the diagonal triangles.
 *                >0 : the triangular portions of T are already computed and
 *                     stair_T is their block size; only the rectangular
 *                     parts above them are updated. Must not exceed
 *                     max_shared_bsiz.
 *   v_array/ldv  per-matrix V pointers and leading dimension.
 *   tau_array    per-matrix tau pointers.
 *   T_array/ldt  per-matrix T pointers and leading dimension (output).
 *   work_array   per-matrix workspace pointers, used as the intermediate
 *                "Tstep" buffer when k exceeds the block size.
 *   lwork        workspace size per matrix; must be >= k*ldt.
 *   batchCount   number of matrices in the batch.
 *   queue        queue on which all batched operations are issued.
 *
 * Returns 0 on success (or when there is nothing to do), -3 / -10 for an
 * illegal stair_T / lwork, or MAGMA_ERR_DEVICE_ALLOC when the temporary
 * pointer arrays cannot be allocated.
 */
extern "C" magma_int_t
magma_slarft_batched(magma_int_t n, magma_int_t k, magma_int_t stair_T,
                float **v_array, magma_int_t ldv,
                float **tau_array, float **T_array, magma_int_t ldt,
                float **work_array, magma_int_t lwork,
                magma_int_t batchCount, magma_queue_t queue)
{
    float c_one  = MAGMA_S_ONE;
    float c_zero = MAGMA_S_ZERO;

    // Quick returns: nothing to build, or everything is already covered by
    // the precomputed triangle.
    if ( k <= 0) return 0;
    if ( stair_T > 0 && k <= stair_T) return 0;

    magma_int_t maxnb = max_shared_bsiz;

    magma_int_t info = 0;
    if (stair_T > 0 && stair_T > maxnb) {
        info = -3;
    }
    else if (lwork < k*ldt) {
        info = -10;
    }
    if (info != 0) {
        magma_xerbla( __func__, -(info) );
        return info;
    }

    magma_int_t DEBUG=0;
    magma_int_t nb = stair_T == 0 ? min(k,maxnb) : stair_T;

    magma_int_t i, j, prev_n, mycol, rows;

    float **dW1_displ  = NULL;
    float **dW2_displ  = NULL;
    float **dW3_displ  = NULL;
    float **dTstep_array  = NULL;

    // Allocate the temporary pointer arrays and verify every allocation.
    // A pointer whose allocation failed is reset to NULL so that the cleanup
    // below can free all four unconditionally.
    magma_int_t alloc_failed = 0;
    if (MAGMA_SUCCESS != magma_malloc((void**)&dW1_displ, batchCount * sizeof(*dW1_displ))) {
        dW1_displ = NULL;
        alloc_failed = 1;
    }
    if (MAGMA_SUCCESS != magma_malloc((void**)&dW2_displ, batchCount * sizeof(*dW2_displ))) {
        dW2_displ = NULL;
        alloc_failed = 1;
    }
    if (MAGMA_SUCCESS != magma_malloc((void**)&dW3_displ, batchCount * sizeof(*dW3_displ))) {
        dW3_displ = NULL;
        alloc_failed = 1;
    }
    if (MAGMA_SUCCESS != magma_malloc((void**)&dTstep_array, batchCount * sizeof(*dTstep_array))) {
        dTstep_array = NULL;
        alloc_failed = 1;
    }
    if (alloc_failed) {
        magma_free(dW1_displ);
        magma_free(dW2_displ);
        magma_free(dW3_displ);
        magma_free(dTstep_array);
        info = MAGMA_ERR_DEVICE_ALLOC;
        magma_xerbla( __func__, -(info) );
        return info;
    }

    //float *Tstep =  k > nb ? work : T;
    if (k > nb) {
        magma_sdisplace_pointers(dTstep_array, work_array, lwork, 0, 0, batchCount, queue);
    }
    else {
        magma_sdisplace_pointers(dTstep_array, T_array, ldt, 0, 0, batchCount, queue);
    }

    //magma_int_t ldtstep = k > nb ? k : ldt;
    magma_int_t ldtstep = ldt; // to be removed
    // stair_T = 0 meaning all T
    // stair_T > 0 meaning the triangular portion of T has been computed.
    //                    the value of stair_T is the nb of these triangulars

    //GEMV compute the whole triangular upper portion of T (phase 1)
    // TODO addcublas to check perf

    magma_sgemm_batched( MagmaConjTrans, MagmaNoTrans,
                         k, k, n,
                         c_one,  v_array, ldv,
                                 v_array, ldv,
                         c_zero, dTstep_array, ldtstep,
                         batchCount, queue );

    magmablas_slaset_batched( MagmaLower, k, k, MAGMA_S_ZERO, MAGMA_S_ZERO, dTstep_array, ldtstep, batchCount, queue );
    // no need for it as T is expected to be lower zero
    //if (k > nb) magmablas_slaset_batched( MagmaLower, k, k, MAGMA_S_ZERO, MAGMA_S_ZERO, dTstep_array, ldtstep, batchCount, queue );

    //TRMV
    //T(1:i-1,i) := T(1:i-1,1:i-1) * W(1:i-1) i=[1:k]
    // TRMV is split over block of column of size nb
    // the update should be done from top to bottom so:
    // 1- a gemm using the previous computed columns
    //    of T to update rectangular upper portion above
    //    the triangle of my columns
    // 2- the columns need to be updated by a serial
    //    loop over of gemv over itself. since we limit the
    //    shared memory to nb, this nb column
    //    are split vertically by chunk of nb rows

    for (j=0; j < k; j += nb) {
        prev_n =  j;
        mycol  =  min(nb, k-j);
        // note that myrow = prev_n + mycol;
        if (prev_n > 0 && mycol > 0) {
            if (DEBUG == 3) {
                printf("doing gemm on the rectangular portion of size %lld %lld of T(%lld,%lld)\n",
                        (long long) prev_n, (long long) mycol, (long long) 0, (long long) j );
            }

            magma_sdisplace_pointers(dW1_displ, dTstep_array, ldtstep, 0, j, batchCount, queue);
            magma_sdisplace_pointers(dW2_displ, T_array,     ldt, 0, j, batchCount, queue);
            magma_sgemm_batched( MagmaNoTrans, MagmaNoTrans,
                                 prev_n, mycol, prev_n,
                                 c_one,  T_array, ldt,
                                         dW1_displ, ldtstep,
                                 c_zero, dW2_displ, ldt,
                                 batchCount, queue );

            // update my rectangular portion (prev_n,mycol) using sequence of gemv
            magma_sdisplace_pointers(dW1_displ, dTstep_array, ldtstep, j, j, batchCount, queue);
            magma_sdisplace_pointers(dW3_displ, tau_array,  1, j, 0, batchCount, queue);

            for (i=0; i < prev_n; i += nb) {
                rows = min(nb,prev_n-i);
                if (DEBUG == 3) {
                    printf(" doing recstrmv on the rectangular portion of size %lld %lld of T(%lld,%lld)\n",
                            (long long) rows, (long long) mycol, (long long) i, (long long) j );
                }

                if (rows > 0 && mycol > 0) {
                    magma_sdisplace_pointers(dW2_displ, T_array,     ldt, i, j, batchCount, queue);
                    magmablas_slarft_recstrmv_sm32x32_batched(rows, mycol, dW3_displ, dW2_displ, ldt, dW1_displ, ldtstep, batchCount, queue);
                }
            }
        }

        // the upper rectangular portion is updated, now if needed update the triangular portion
        if (stair_T == 0) {
            if (DEBUG == 3) {
                printf("doing strmv on the triangular portion of size %lld %lld of T(%lld,%lld)\n",
                        (long long) mycol, (long long) mycol, (long long) j, (long long) j );
            }

            if (mycol > 0) {
                magma_sdisplace_pointers(dW1_displ, dTstep_array, ldtstep, j, j, batchCount, queue);
                magma_sdisplace_pointers(dW3_displ, tau_array,  1, j, 0, batchCount, queue);
                magma_sdisplace_pointers(dW2_displ, T_array,     ldt, j, j, batchCount, queue);
                magmablas_slarft_strmv_sm32x32_batched(mycol, mycol, dW3_displ, dW1_displ, ldtstep, dW2_displ, ldt, batchCount, queue);
            }
        }
    }// end of j

    magma_free(dW1_displ);
    magma_free(dW2_displ);
    magma_free(dW3_displ);
    magma_free(dTstep_array);

    return 0;
}