void conv3d_blas_cpu::fprop() { create_Y(); init_convmat(); init_u(); // iterate over each training instance mwSize N = getVolN(X); for (mwSize i = 0; i < N; i++) { // make phiX: the convolution matrix vol_to_convmat(X, i); // convolution: Y_ = phiX * F_ matw F_ = make_F_(); matw Y_ = make_Y_(i); AxBtoC(convmat, F_, Y_, true); // overwrite Y_ // plus the bias: Y_ += u * B matw B_ = make_B_(); AxBtoC(u, B_, Y_, false); // accumulation on Y_ } free_u(); free_convmat(); }
void conv3d_blas_cpu::bprop() { create_dX(); create_dF(); create_dB(); init_convmat(); init_u(); mwSize N = getVolN(X); matw dF_ = make_dF_(); matw dB_ = make_dB_(); for (mwSize i = 0; i < N; ++i) { // make phiX: the convolution matrix vol_to_convmat(X, i); // dF += phiX' * dY_ matw dY_ = make_dY_(i); ATxBtoC(convmat, dY_, dF_, false); // accumulation on dF_ // dB += u' * dY ATxBtoC(u, dY_, dB_, false); // accumulation on dB_ // dphiX = dY * F' matw F_ = make_F_(); // safe to reuse convmat memory, remember to overwrite it! AxBTtoC(dY_, F_, convmat, true); // dX(:,:,:,:,i) <-- dphiX vol_from_convmat(dX, i); } free_u(); free_convmat(); }
void find_control_points() { int L = qpoints-3, k = 6; float b, a; Point H, I, J, P, Q, R; init_u(); // Calcula os parametros control_points[0] = ds[0]; control_points[1] = ds[1]; a = d_u(1)/(d_u(1) + d_u(2)); b = d_u(2)/(d_u(1) + d_u(2)); H.x = ds[1].x * b + ds[2].x * a; H.y = ds[1].y * b + ds[2].y * a; control_points[2] = H; a = d_u(L-1)/(d_u(L-1) + d_u(L)); b = d_u(L)/(d_u(L-1) + d_u(L)); I.x = ds[L].x * b + ds[L+1].x * a; I.y = ds[L].y * b + ds[L+1].y * a; control_points[(3*L)-2] = I; a = d_u(L-1)/(d_u(L-1) + d_u(L)); b = d_u(L)/(d_u(L-1) + d_u(L)); J.x = (control_points[(3*L) - 4].x * b) + (control_points[(3*L) - 2].x * a); J.y = (control_points[(3*L) - 4].y * b) + (control_points[(3*L) - 2].y * a); control_points[(3*L)-3] = J; control_points[(3*L) - 1] = ds[L+1]; control_points[3*L] = ds[L+2]; for(int i = 1; i <= L - 2; i++){ a = d_u(i)/(d_u(i) + d_u(i+1)); // a = 1-b b = d_u(i+1)/(d_u(i) + d_u(i+1)); P.x = (control_points[(3*i) - 1].x * b) + (control_points[(3*i) + 1].x * a); P.y = (control_points[(3*i) - 1].y * b) + (control_points[(3*i) + 1].y * a); control_points[3*i] = P; a = d_u(i)/(d_u(i) + d_u(i+1) + d_u(i+2)); // a = 1-b b = (d_u(i+1) + d_u(i+2))/(d_u(i) + d_u(i+1) + d_u(i+2)); Q.x = (ds[i+1].x * b) + (ds[i+2].x * a); Q.y = (ds[i+1].y * b) + (ds[i+2].y * a); control_points[(3*i)+1] = Q; a = (d_u(i) + d_u(i+1))/(d_u(i) + d_u(i+1) + d_u(i+2)); // a = 1-b b = d_u(i+2)/(d_u(i) + d_u(i+1) + d_u(i+2)); R.x = (ds[i+1].x * b) + (ds[i+2].x * a); R.y = (ds[i+1].y * b) + (ds[i+2].y * a); control_points[(3*i)+2] = R; k += 3; } }
int main(int argc, char** argv){ load_conf(argv[1]); //Initialisiere Felder vector<vector<double> > u_0; vector<vector<double> > v_0; vector<vector<double> > T; init_u(u_0, v_0); init_T(T); //Erstelle den Temperaturvektor std::vector<double> T_Vec = reshape_vector(T); //Berechne die BTCS-Matrix std::vector<std::vector<double> > M = BCTS_implicit_Matrix(u_0,v_0); //Berechne die untere Dreiecksmatrix std::vector<std::vector<double> > LD = triangularize(M); //Integration long int t_start;//TIME LOG time(&t_start); cout << "W = " << omega<< endl; int i_t = 0;//Zähler für die Snapshots for(int n=0; n*dt<t_fin; n++){ //Füge Dirichlet-Randbedingungen in den Vektor ein impose_dirichlet(T_Vec,u_0,v_0); //Löse das Gleichungssystem SOR(T_Vec,M,LD, omega, r_end); //Snapshots if( (n+1)*dt >= t_snap[i_t] && (n+1)*dt<t_snap[i_t+1]){ ostringstream snap_name; snap_name <<dirname<< (n+1)*dt << "_" << Pe << "_"<< Nx<<"_"<<Ny<<"_"<<dt<<"_"<<b_Q<<".txt"; save_data(shape_back(T_Vec, T_unten,T_oben), snap_name.str().c_str()); } if((n+1)*dt>=t_snap[i_t]){ i_t+=1; } } cout<<endl; T=shape_back(T_Vec, T_unten, T_oben); //TIME LOG long int t_finished; time(&t_finished); cout << "\n\n"<< t_finished-t_start<<endl; //print_array(T); save_data(T,"aktuell.txt"); return 0; }
int main(void) { /* Local scalars */ char uplo, uplo_i; lapack_int n, n_i; lapack_int ncvt, ncvt_i; lapack_int nru, nru_i; lapack_int ncc, ncc_i; lapack_int ldvt, ldvt_i; lapack_int ldvt_r; lapack_int ldu, ldu_i; lapack_int ldu_r; lapack_int ldc, ldc_i; lapack_int ldc_r; lapack_int info, info_i; lapack_int i; int failed; /* Local arrays */ float *d = NULL, *d_i = NULL; float *e = NULL, *e_i = NULL; lapack_complex_float *vt = NULL, *vt_i = NULL; lapack_complex_float *u = NULL, *u_i = NULL; lapack_complex_float *c = NULL, *c_i = NULL; float *work = NULL, *work_i = NULL; float *d_save = NULL; float *e_save = NULL; lapack_complex_float *vt_save = NULL; lapack_complex_float *u_save = NULL; lapack_complex_float *c_save = NULL; lapack_complex_float *vt_r = NULL; lapack_complex_float *u_r = NULL; lapack_complex_float *c_r = NULL; /* Iniitialize the scalar parameters */ init_scalars_cbdsqr( &uplo, &n, &ncvt, &nru, &ncc, &ldvt, &ldu, &ldc ); ldvt_r = ncvt+2; ldu_r = n+2; ldc_r = ncc+2; uplo_i = uplo; n_i = n; ncvt_i = ncvt; nru_i = nru; ncc_i = ncc; ldvt_i = ldvt; ldu_i = ldu; ldc_i = ldc; /* Allocate memory for the LAPACK routine arrays */ d = (float *)LAPACKE_malloc( n * sizeof(float) ); e = (float *)LAPACKE_malloc( n * sizeof(float) ); vt = (lapack_complex_float *) LAPACKE_malloc( ldvt*ncvt * sizeof(lapack_complex_float) ); u = (lapack_complex_float *) LAPACKE_malloc( ldu*n * sizeof(lapack_complex_float) ); c = (lapack_complex_float *) LAPACKE_malloc( ldc*ncc * sizeof(lapack_complex_float) ); work = (float *)LAPACKE_malloc( 4*n * sizeof(float) ); /* Allocate memory for the C interface function arrays */ d_i = (float *)LAPACKE_malloc( n * sizeof(float) ); e_i = (float *)LAPACKE_malloc( n * sizeof(float) ); vt_i = (lapack_complex_float *) LAPACKE_malloc( ldvt*ncvt * sizeof(lapack_complex_float) ); u_i = (lapack_complex_float *) LAPACKE_malloc( ldu*n * sizeof(lapack_complex_float) ); c_i = (lapack_complex_float *) LAPACKE_malloc( ldc*ncc * sizeof(lapack_complex_float) ); work_i = (float *)LAPACKE_malloc( 4*n * sizeof(float) ); /* Allocate memory for the backup arrays */ d_save = (float *)LAPACKE_malloc( n * sizeof(float) ); e_save = (float *)LAPACKE_malloc( n * sizeof(float) ); vt_save = (lapack_complex_float *) LAPACKE_malloc( ldvt*ncvt * sizeof(lapack_complex_float) ); u_save = (lapack_complex_float *) LAPACKE_malloc( ldu*n * sizeof(lapack_complex_float) ); c_save = (lapack_complex_float *) LAPACKE_malloc( ldc*ncc * sizeof(lapack_complex_float) ); /* Allocate memory for the row-major arrays */ vt_r = (lapack_complex_float *) LAPACKE_malloc( n*(ncvt+2) * sizeof(lapack_complex_float) ); u_r = (lapack_complex_float *) LAPACKE_malloc( nru*(n+2) * sizeof(lapack_complex_float) ); c_r = (lapack_complex_float *) LAPACKE_malloc( n*(ncc+2) * sizeof(lapack_complex_float) ); /* Initialize input arrays */ init_d( n, d ); init_e( n, e ); init_vt( ldvt*ncvt, vt ); init_u( ldu*n, u ); init_c( ldc*ncc, c ); init_work( 4*n, work ); /* Backup the ouptut arrays */ for( i = 0; i < n; i++ ) { d_save[i] = d[i]; } for( i = 0; i < n; i++ ) { e_save[i] = e[i]; } for( i = 0; i < ldvt*ncvt; i++ ) { vt_save[i] = vt[i]; } for( i = 0; i < ldu*n; i++ ) { u_save[i] = u[i]; } for( i = 0; i < ldc*ncc; i++ ) { c_save[i] = c[i]; } /* Call the LAPACK routine */ cbdsqr_( &uplo, &n, &ncvt, &nru, &ncc, d, e, vt, &ldvt, u, &ldu, c, &ldc, work, &info ); /* Initialize input data, call the column-major middle-level * interface to LAPACK routine and check the results */ for( i = 0; i < n; i++ ) { d_i[i] = d_save[i]; } for( i = 0; i < n; i++ ) { e_i[i] = e_save[i]; } for( i = 0; i < ldvt*ncvt; i++ ) { vt_i[i] = vt_save[i]; } for( i = 0; i < ldu*n; i++ ) { u_i[i] = u_save[i]; } for( i = 0; i < ldc*ncc; i++ ) { c_i[i] = c_save[i]; } for( i = 0; i < 4*n; i++ ) { work_i[i] = work[i]; } info_i = LAPACKE_cbdsqr_work( LAPACK_COL_MAJOR, uplo_i, n_i, ncvt_i, nru_i, ncc_i, d_i, e_i, vt_i, ldvt_i, u_i, ldu_i, c_i, ldc_i, work_i ); failed = compare_cbdsqr( d, d_i, e, e_i, vt, vt_i, u, u_i, c, c_i, info, info_i, ldc, ldu, ldvt, n, ncc, ncvt, nru ); if( failed == 0 ) { printf( "PASSED: column-major middle-level interface to cbdsqr\n" ); } else { printf( "FAILED: column-major middle-level interface to cbdsqr\n" ); } /* Initialize input data, call the column-major high-level * interface to LAPACK routine and check the results */ for( i = 0; i < n; i++ ) { d_i[i] = d_save[i]; } for( i = 0; i < n; i++ ) { e_i[i] = e_save[i]; } for( i = 0; i < ldvt*ncvt; i++ ) { vt_i[i] = vt_save[i]; } for( i = 0; i < ldu*n; i++ ) { u_i[i] = u_save[i]; } for( i = 0; i < ldc*ncc; i++ ) { c_i[i] = c_save[i]; } for( i = 0; i < 4*n; i++ ) { work_i[i] = work[i]; } info_i = LAPACKE_cbdsqr( LAPACK_COL_MAJOR, uplo_i, n_i, ncvt_i, nru_i, ncc_i, d_i, e_i, vt_i, ldvt_i, u_i, ldu_i, c_i, ldc_i ); failed = compare_cbdsqr( d, d_i, e, e_i, vt, vt_i, u, u_i, c, c_i, info, info_i, ldc, ldu, ldvt, n, ncc, ncvt, nru ); if( failed == 0 ) { printf( "PASSED: column-major high-level interface to cbdsqr\n" ); } else { printf( "FAILED: column-major high-level interface to cbdsqr\n" ); } /* Initialize input data, call the row-major middle-level * interface to LAPACK routine and check the results */ for( i = 0; i < n; i++ ) { d_i[i] = d_save[i]; } for( i = 0; i < n; i++ ) { e_i[i] = e_save[i]; } for( i = 0; i < ldvt*ncvt; i++ ) { vt_i[i] = vt_save[i]; } for( i = 0; i < ldu*n; i++ ) { u_i[i] = u_save[i]; } for( i = 0; i < ldc*ncc; i++ ) { c_i[i] = c_save[i]; } for( i = 0; i < 4*n; i++ ) { work_i[i] = work[i]; } if( ncvt != 0 ) { LAPACKE_cge_trans( LAPACK_COL_MAJOR, n, ncvt, vt_i, ldvt, vt_r, ncvt+2 ); } if( nru != 0 ) { LAPACKE_cge_trans( LAPACK_COL_MAJOR, nru, n, u_i, ldu, u_r, n+2 ); } if( ncc != 0 ) { LAPACKE_cge_trans( LAPACK_COL_MAJOR, n, ncc, c_i, ldc, c_r, ncc+2 ); } info_i = LAPACKE_cbdsqr_work( LAPACK_ROW_MAJOR, uplo_i, n_i, ncvt_i, nru_i, ncc_i, d_i, e_i, vt_r, ldvt_r, u_r, ldu_r, c_r, ldc_r, work_i ); if( ncvt != 0 ) { LAPACKE_cge_trans( LAPACK_ROW_MAJOR, n, ncvt, vt_r, ncvt+2, vt_i, ldvt ); } if( nru != 0 ) { LAPACKE_cge_trans( LAPACK_ROW_MAJOR, nru, n, u_r, n+2, u_i, ldu ); } if( ncc != 0 ) { LAPACKE_cge_trans( LAPACK_ROW_MAJOR, n, ncc, c_r, ncc+2, c_i, ldc ); } failed = compare_cbdsqr( d, d_i, e, e_i, vt, vt_i, u, u_i, c, c_i, info, info_i, ldc, ldu, ldvt, n, ncc, ncvt, nru ); if( failed == 0 ) { printf( "PASSED: row-major middle-level interface to cbdsqr\n" ); } else { printf( "FAILED: row-major middle-level interface to cbdsqr\n" ); } /* Initialize input data, call the row-major high-level * interface to LAPACK routine and check the results */ for( i = 0; i < n; i++ ) { d_i[i] = d_save[i]; } for( i = 0; i < n; i++ ) { e_i[i] = e_save[i]; } for( i = 0; i < ldvt*ncvt; i++ ) { vt_i[i] = vt_save[i]; } for( i = 0; i < ldu*n; i++ ) { u_i[i] = u_save[i]; } for( i = 0; i < ldc*ncc; i++ ) { c_i[i] = c_save[i]; } for( i = 0; i < 4*n; i++ ) { work_i[i] = work[i]; } /* Init row_major arrays */ if( ncvt != 0 ) { LAPACKE_cge_trans( LAPACK_COL_MAJOR, n, ncvt, vt_i, ldvt, vt_r, ncvt+2 ); } if( nru != 0 ) { LAPACKE_cge_trans( LAPACK_COL_MAJOR, nru, n, u_i, ldu, u_r, n+2 ); } if( ncc != 0 ) { LAPACKE_cge_trans( LAPACK_COL_MAJOR, n, ncc, c_i, ldc, c_r, ncc+2 ); } info_i = LAPACKE_cbdsqr( LAPACK_ROW_MAJOR, uplo_i, n_i, ncvt_i, nru_i, ncc_i, d_i, e_i, vt_r, ldvt_r, u_r, ldu_r, c_r, ldc_r ); if( ncvt != 0 ) { LAPACKE_cge_trans( LAPACK_ROW_MAJOR, n, ncvt, vt_r, ncvt+2, vt_i, ldvt ); } if( nru != 0 ) { LAPACKE_cge_trans( LAPACK_ROW_MAJOR, nru, n, u_r, n+2, u_i, ldu ); } if( ncc != 0 ) { LAPACKE_cge_trans( LAPACK_ROW_MAJOR, n, ncc, c_r, ncc+2, c_i, ldc ); } failed = compare_cbdsqr( d, d_i, e, e_i, vt, vt_i, u, u_i, c, c_i, info, info_i, ldc, ldu, ldvt, n, ncc, ncvt, nru ); if( failed == 0 ) { printf( "PASSED: row-major high-level interface to cbdsqr\n" ); } else { printf( "FAILED: row-major high-level interface to cbdsqr\n" ); } /* Release memory */ if( d != NULL ) { LAPACKE_free( d ); } if( d_i != NULL ) { LAPACKE_free( d_i ); } if( d_save != NULL ) { LAPACKE_free( d_save ); } if( e != NULL ) { LAPACKE_free( e ); } if( e_i != NULL ) { LAPACKE_free( e_i ); } if( e_save != NULL ) { LAPACKE_free( e_save ); } if( vt != NULL ) { LAPACKE_free( vt ); } if( vt_i != NULL ) { LAPACKE_free( vt_i ); } if( vt_r != NULL ) { LAPACKE_free( vt_r ); } if( vt_save != NULL ) { LAPACKE_free( vt_save ); } if( u != NULL ) { LAPACKE_free( u ); } if( u_i != NULL ) { LAPACKE_free( u_i ); } if( u_r != NULL ) { LAPACKE_free( u_r ); } if( u_save != NULL ) { LAPACKE_free( u_save ); } if( c != NULL ) { LAPACKE_free( c ); } if( c_i != NULL ) { LAPACKE_free( c_i ); } if( c_r != NULL ) { LAPACKE_free( c_r ); } if( c_save != NULL ) { LAPACKE_free( c_save ); } if( work != NULL ) { LAPACKE_free( work ); } if( work_i != NULL ) { LAPACKE_free( work_i ); } return 0; }
int main() { int enable_profiling = 0; #ifdef DO_TIMING enable_profiling = 1; #endif //print_platforms_devices(); cl_context ctx; cl_command_queue queue; create_context_on("NVIDIA", NULL, 0, &ctx, &queue, enable_profiling); // -------------------------------------------------------------------------- // load kernels // -------------------------------------------------------------------------- // read the cl file char buf[100]; sprintf(buf, "mg-kernel-ver%d.cl", VERSION); char *knl_text = read_file(buf); //get work group dimensions and gflop info. int wg_dims , wg_x, wg_y, wg_z, z_div, fetch_per_pt, flops_per_pt; if (sscanf(knl_text, "// workgroup: (%d,%d,%d) z_div:%d fetch_per_pt:%d flops_per_pt:%d", &wg_x, &wg_y, &wg_z, &z_div, &fetch_per_pt, &flops_per_pt) == 6) { wg_dims = 3; } else if (sscanf(knl_text, "// workgroup: (%d,%d) fetch_per_pt:%d flops_per_pt:%d", &wg_x, &wg_y, &fetch_per_pt, &flops_per_pt) == 4) { wg_dims = 2; wg_z = -1; z_div = -1; } else { perror("reading workgroup spec"); abort(); } #ifdef USE_DOUBLE char *compile_opt = "-DFTYPE=double"; #else char *compile_opt = "-DFTYPE=float"; #endif // creation of the kernel cl_kernel poisson_knl = kernel_from_string(ctx, knl_text, "fd_update", compile_opt); free(knl_text); // my compiler complains about this one. OJO!! // -------------------------------------------------------------------------- // set up grid // -------------------------------------------------------------------------- const unsigned points = POINTS; const ftype minus_bdry = -1, plus_bdry = 1; // We're dividing into (points-1) intervals. ftype dx = (plus_bdry-minus_bdry)/(points-1); // -------------------------------------------------------------------------- // allocate and initialize CPU memory // -------------------------------------------------------------------------- int use_alignment; unsigned dim_other = points; //if order 2 then 1 point extra on each side #ifdef USE_ALIGNMENT // adjusts dimension so that the next row starts in a number divisible by 16 unsigned dim_x = ((dim_other + 15) / 16) * 16; unsigned field_start = 0; use_alignment = 1; #else unsigned dim_x = dim_other; unsigned field_start = 0;// this one puts me right at the beginning use_alignment = 0; #endif // --------Allocate forcing uexact, r and u vectors ------------------------- const size_t field_size = 0+dim_x*dim_x*dim_x; // extra large to fit the 2^n constrain in GPU ftype *f = malloc(field_size*sizeof(ftype)); CHECK_SYS_ERROR(!f, "allocating f"); ftype *u = malloc (field_size*sizeof(ftype)); CHECK_SYS_ERROR(!u, "allocating u"); ftype *uexact = malloc (field_size*sizeof(ftype)); CHECK_SYS_ERROR(!uexact, "allocating uexact"); ftype *r = malloc(field_size * sizeof(ftype)); CHECK_SYS_ERROR(!r, "allocating residual r"); // -------------------------------------------------------------------------- // initialize // -------------------------------------------------------------------------- // zero out (necessary to initialize everything bec. I measure norms) for (size_t i = 0; i < field_size; ++i){ f[i] = 0; u[i] = 0; uexact[i] = 0; r[i] = 0; } // set up the forcing field init_f (points, f, dx, field_start, dim_x, dim_other, minus_bdry); // Initialize u with initial boundary conditions init_u ( points, u , minus_bdry, plus_bdry, dx, field_start, dim_x, dim_other); // Initialize the exact solution init_uexact(points, u, uexact, dx, field_size, field_start, dim_x, dim_other); // -------------------------------------------------------------------------- // Setup the v-cycles // -------------------------------------------------------------------------- unsigned n1, n2, n3, ncycles; n1 = 50; n2 = 60; n3 = 1; ncycles = 2; ftype *sweeps = malloc (ncycles*sizeof(ftype)); ftype *rnorm = malloc (ncycles*sizeof(ftype)); ftype *enorm = malloc (ncycles*sizeof(ftype)); ftype rtol = 1.0e-05; // Find the norm of the residual (choose your method) sweeps[0] =0; resid (r, f, u, dx, field_size, field_start, dim_x, dim_other); rnorm[0] = norm( r , field_size) * dx; U_error(u, uexact, r, field_size); enorm[0] = norm( r, field_size ) * dx; for(unsigned icycle = 1; icycle <= ncycles; icycle++){ mgv(f, u, dx, n1, n2, n3, field_size, points, use_alignment, dim_x, ctx, queue, poisson_knl, wg_dims , wg_x, wg_y, wg_z, z_div, fetch_per_pt, flops_per_pt); //update u through a v-cycle sweeps[icycle] = sweeps[icycle -1] + (4 * (n1 + n2)/3); resid (r, f, u, dx, field_size, field_start, dim_x, dim_other); rnorm[icycle] = norm( r, field_size ) * dx; U_error(u, uexact, r, field_size); enorm[icycle] = norm( r, field_size ) * dx; //cfacts = (rnorm(icycle)/rnorm(icycle - 1))^(1 / (n1 + n2)) not necessary //disp something here if I want to. //printf("norm of the cycle %f", enorm[icycle]); if(rnorm[icycle] <= rtol * rnorm[0]) break; } #ifdef DO_TIMING printf(" ftype:%d ver:%d align:%d pts:%d\tgflops:%.1f\tmcells:%.1f\tgbytes:%.1f [/sec]\tout_gflops:%.6f\n", (int) sizeof(ftype), VERSION, use_alignment, points, gflops_performed/seconds_taken, mcells_updated/seconds_taken, gbytes_accessed/seconds_taken, gflops_performed/tot_secs); #endif // -------------------------------------------------------------------------- // clean up // -------------------------------------------------------------------------- CALL_CL_GUARDED(clReleaseKernel, (poisson_knl)); CALL_CL_GUARDED(clReleaseCommandQueue, (queue)); CALL_CL_GUARDED(clReleaseContext, (ctx)); }