void ScharrDerivative(const RowMatrixXf& image, const int x_deg, const int y_deg, const int size, const bool normalize, RowMatrixXf* out) { const int sigma = size * 2 + 1; Eigen::RowVectorXf kernel1(sigma); kernel1.setZero(); kernel1(0) = -1.0; kernel1(sigma - 1) = 1.0; Eigen::RowVectorXf kernel2(sigma); kernel2.setZero(); if (!normalize) { kernel2(0) = 3; kernel2(sigma / 2) = 10; kernel2(sigma - 1) = 3; } else { float w = 10.0 / 3.0; float norm = 1.0 / (2.0 * size * (w + 2.0)); kernel2(0) = norm; kernel2(sigma / 2) = w * norm; kernel2(sigma - 1) = norm; } if (x_deg == 1) { SeparableConvolution2d(image, kernel1, kernel2, REFLECT, out); } else { SeparableConvolution2d(image, kernel2, kernel1, REFLECT, out); } return; }
void test_vmul_drv() { int n = 216 * 216 * 216; int repeat = 20; std::vector<double_complex> v1(n); std::vector<double> v2(n); mdarray<double_complex, 1> f1(n); mdarray<double, 1> f2(n); for (int i = 0; i < n; i++) { v1[i] = 1.0 / double_complex(i + 1, i + 1); v2[i] = 2.0; f1[i] = 1.0 / double_complex(i + 1, i + 1); f2[i] = 2.0; } std::cout << "vector size: " << n << std::endl; double t = kernel1(repeat, n, v1, v2); std::cout << "kernel1 time: " << t << " speed: " << double(n * (2 * sizeof(double_complex) + sizeof(double)) * repeat) / t / (1 << 30) << " GBs" << std::endl; t = kernel2(repeat, n, f1, f2); std::cout << "kernel2 time: " << t << " speed: " << double(n * (2 * sizeof(double_complex) + sizeof(double)) * repeat) / t / (1 << 30) << " GBs" << std::endl; t = kernel3(repeat, n, f1, v2); std::cout << "kernel3 time: " << t << " speed: " << double(n * (2 * sizeof(double_complex) + sizeof(double)) * repeat) / t / (1 << 30) << " GBs" << std::endl; }
// Evolves one cluster, selected by (cce_order, index), and returns the
// observables computed from the result.
//
// Steps, in order:
//   1. fetch the cluster's spin list and cluster index from _my_clusters;
//   2. build one Hamiltonian per member of _state_pair;
//   3. interleave the two Hamiltonians with riffle() in both orders, using
//      _pulse_num;
//   4. obtain the time segments from Pulse_Interval(_pulse_name, _pulse_num);
//   5. build two piecewise evolution kernels from the same initial state and
//      give both the (_t0, _t1, _nTime) time sequence;
//   6. run a ClusterCoherenceEvolution on each kernel;
//   7. return calc_observables() over the two kernels.
vec SingleSampleCCE::cluster_evolution(int cce_order, int index)
{/*{{{*/
    vector<cSPIN> cluster_spins = _my_clusters.getCluster(cce_order, index);
    cClusterIndex cluster_index = _my_clusters.getClusterIndex(cce_order, index);

    // One Hamiltonian for each of the two states in _state_pair.
    Hamiltonian h_first  = create_spin_hamiltonian(_center_spin, _state_pair.first,  cluster_spins, cluster_index);
    Hamiltonian h_second = create_spin_hamiltonian(_center_spin, _state_pair.second, cluster_spins, cluster_index);

    // The same two operators, interleaved starting from either one.
    vector<QuantumOperator> sequence_a = riffle((QuantumOperator) h_first,  (QuantumOperator) h_second, _pulse_num);
    vector<QuantumOperator> sequence_b = riffle((QuantumOperator) h_second, (QuantumOperator) h_first,  _pulse_num);

    vector<double> segments = Pulse_Interval(_pulse_name, _pulse_num);
    PureState initial_state = create_cluster_state(cluster_index);

    PiecewiseFullMatrixVectorEvolution evolution_a(sequence_a, segments, initial_state);
    PiecewiseFullMatrixVectorEvolution evolution_b(sequence_b, segments, initial_state);
    evolution_a.setTimeSequence( _t0, _t1, _nTime);
    evolution_b.setTimeSequence( _t0, _t1, _nTime);

    ClusterCoherenceEvolution run_a(&evolution_a);
    ClusterCoherenceEvolution run_b(&evolution_b);
    run_a.run();
    run_b.run();

    return calc_observables(&evolution_a, &evolution_b);
}/*}}}*/
int main(void) { char c = 1; int * data1 = (int*)malloc(SIZE*sizeof(int)); auto acc = hc::accelerator(); int * data1_d = (int*)hc::am_alloc(SIZE*sizeof(int), acc, 0); grid_launch_parm lp; grid_launch_init(&lp); lp.grid_dim = gl_dim3(GRID_SIZE, 1); lp.group_dim = gl_dim3(TILE_SIZE, 1); hc::completion_future cf; lp.cf = &cf; kernel1(lp, data1_d, c); lp.cf->wait(); static hc::accelerator_view av = acc.get_default_view(); av.copy(data1_d, data1, SIZE*sizeof(int)); bool ret = 0; for(int i = 0; i < SIZE; ++i) { if((data1[i] != (int)c)) { ret = 1; break; } } hc::am_free(data1); free(data1); return ret; }
int main(void) { int *data1 = (int *)malloc(SIZE*sizeof(int)); auto acc = hc::accelerator(); int* data1_d = (int*)hc::am_alloc(SIZE*sizeof(int), acc, 0); grid_launch_parm lp; grid_launch_init(&lp); lp.gridDim.x = GRID_SIZE; lp.groupDim.x = TILE_SIZE; hc::completion_future cf; lp.cf = &cf; kernel1(lp, data1_d); lp.cf->wait(); hc::am_copy(data1, data1_d, SIZE*sizeof(int)); bool ret = 0; for(int i = 0; i < SIZE; ++i) { if(data1[i] != i) { ret = 1; break; } } hc::am_free(data1_d); free(data1); return ret; }
int main(void) { Foo data1(5); Bar* data2 = (Bar*)malloc(SIZE*sizeof(Bar)); auto acc = hc::accelerator(); Bar* data2_d = (Bar*)hc::am_alloc(SIZE*sizeof(Bar), acc, 0); grid_launch_parm lp; grid_launch_init(&lp); lp.grid_dim = gl_dim3(GRID_SIZE, 1); lp.group_dim = gl_dim3(TILE_SIZE, 1); hc::completion_future cf; lp.cf = &cf; kernel1(lp, data1, data2_d); lp.cf->wait(); hc::am_copy(data2, data2_d, SIZE*sizeof(Bar)); bool ret = 0; for(int i = 0; i < SIZE; ++i) { if((data2[i].x != i + data1.y)) { ret = 1; break; } } hc::am_free(data2_d); free(data2); return ret; }
int main(void) { Foo* data1 = (Foo*)malloc(SIZE*sizeof(Foo)); Bar* data2 = (Bar*)malloc(SIZE*sizeof(Bar)); constStructconst* data3 = (constStructconst*)malloc(SIZE*sizeof(constStructconst)); for(int i = 0; i < SIZE; ++i) { data3[i].x = i; } auto acc = hc::accelerator(); Foo* data1_d = (Foo*)hc::am_alloc(SIZE*sizeof(Foo), acc, 0); Bar* data2_d = (Bar*)hc::am_alloc(SIZE*sizeof(Bar), acc, 0); constStructconst* data3_d = (constStructconst*)hc::am_alloc(SIZE*sizeof(constStructconst), acc, 0); hc::am_copy(data3_d, data3, SIZE*sizeof(constStructconst)); grid_launch_parm lp; grid_launch_init(&lp); lp.gridDim = gl_dim3(GRID_SIZE, 1); lp.groupDim = gl_dim3(TILE_SIZE, 1); hc::completion_future cf; lp.cf = &cf; kernel1(lp, data1_d, data2_d, data3_d); lp.cf->wait(); hc::am_copy(data1, data1_d, SIZE*sizeof(Foo)); hc::am_copy(data2, data2_d, SIZE*sizeof(Bar)); bool ret = 0; for(int i = 0; i < SIZE; ++i) { if((data1[i].x != i) || (data2[i].x != i + data3[i].x)) { ret = 1; break; } } hc::am_free(data1_d); hc::am_free(data2_d); hc::am_free(data3_d); free(data1); free(data2); free(data3); return ret; }
int main() { check_offloading(); int cpuExec = 0; #pragma omp target map(tofrom: cpuExec) { cpuExec = omp_is_initial_device(); } int max_teams = 256; int gpu_threads = 256; int cpu_threads = 32; int max_threads = cpuExec ? cpu_threads : gpu_threads; a = (double *) malloc(MAX_N * sizeof(double)); a_h = (double *) malloc(MAX_N * sizeof(double)); b = (double *) malloc(MAX_N * sizeof(double)); c = (double *) malloc(MAX_N * sizeof(double)); #pragma omp target enter data map(to:a[:MAX_N],b[:MAX_N],c[:MAX_N]) for (int n = 32 ; n < MAX_N ; n+=5000) { int t = 0; reset_input(a, a_h, b, c); #pragma omp target update to(a[:n],b[:n],c[:n]) for (int ths = 1; ths <= 1024; ths *= 3) { for(int sch = 1 ; sch <= n ; sch *= 1200) { t+=4; #pragma omp target #pragma omp parallel { add_f1(a, b, c, n, sch); add_f2(a, b, c, n, sch); add_f3(a, b, c, n, sch); add_f4(a, b, c, n, sch); } } } // check results for each 'n' for (int times = 0 ; times < t ; times++) for (int i = 0; i < n; ++i) a_h[i] += b[i] + c[i]; #pragma omp target update from(a[:n]) for (int i = 0; i < n; ++i) { if (a_h[i] != a[i]) { printf("Error at n = %d, i = %d: host = %lf, device = %lf\n", n, i, a_h[i], a[i]); return 1; } } } // loop 'n' printf("Succeeded\n"); for (int n = 32 ; n < MAX_N ; n+=5000) { int t = 0; reset_input(a, a_h, b, c); #pragma omp target update to(a[:n],b[:n],c[:n]) for (int ths = 1; ths <= 1024; ths *= 3) { for(int sch = 1 ; sch <= n ; sch *= 1200) { t+=4; #pragma omp target parallel num_threads(1024) { add_f1(a, b, c, n, sch); add_f2(a, b, c, n, sch); add_f3(a, b, c, n, sch); add_f4(a, b, c, n, sch); } } } // check results for each 'n' for (int times = 0 ; times < t ; times++) for (int i = 0; i < n; ++i) a_h[i] += b[i] + c[i]; #pragma omp target update from(a[:n]) for (int i = 0; i < n; ++i) { if (a_h[i] != a[i]) { printf("Error at n = %d, i = %d: host = %lf, device = %lf\n", n, i, a_h[i], a[i]); return 1; } } } // loop 'n' printf("Succeeded\n"); for (int n = 32 ; n < MAX_N ; n+=5000) { int t = 0; 
reset_input(a, a_h, b, c); #pragma omp target update to(a[:n],b[:n],c[:n]) for (int tms = 1 ; tms <= 256 ; tms *= 2) { // 8 times for (int ths = 32 ; ths <= 1024 ; ths *= 2) { // 6 times for(int sch = 1 ; sch <= n ; sch *= 1200) { t+=4; #pragma omp target teams num_teams(tms) thread_limit(ths) { tadd_dpf1<double>(a, b, c, n, sch); add_dpf2(a, b, c, n, sch); add_dpf3(a, b, c, n, sch); add_dpf4(a, b, c, n, sch); } } // loop 'sch' } // loop 'ths' } // loop 'tms' // check results for each 'n' for (int times = 0 ; times < t ; times++) for (int i = 0; i < n; ++i) a_h[i] += b[i] + c[i]; #pragma omp target update from(a[:n]) for (int i = 0; i < n; ++i) { if (a_h[i] != a[i]) { printf("Error at n = %d, i = %d: host = %lf, device = %lf\n", n, i, a_h[i], a[i]); return 1; } } } // loop 'n' printf("Succeeded\n"); #pragma omp target exit data map(release:a[:MAX_N],b[:MAX_N],c[:MAX_N]) #define N (957*3) double Ad[N], Bd[N], Cd[N]; #define INIT() { \ INIT_LOOP(N, { \ Ad[i] = 1 << 16; \ Bd[i] = i << 16; \ Cd[i] = -(i << 16); \ }) \ } INIT(); double RESULT[256]; int VALID[256]; long long EXPECTED[7]; EXPECTED[0] = 34; EXPECTED[1] = 2311; EXPECTED[2] = 4795; EXPECTED[3] = 7532; EXPECTED[4] = 10468; EXPECTED[5] = 12999; EXPECTED[6] = 15345; unsigned e = 0; for (int t = 2; t <= max_threads; t+=39) { long long OUT = 0; int num_threads = t; int num_tests = 0; #pragma omp target teams map(tofrom: OUT, num_tests) num_teams(1) thread_limit(max_threads) { #pragma omp parallel num_threads(num_threads) { for (int offset = 0; offset < 32; offset++) { for (int factor = 1; factor < 33; factor++) { kernel1(num_threads, RESULT, VALID, offset, factor, N, Ad, Bd, Cd, &OUT, &num_tests); } } } } if (OUT + num_tests != EXPECTED[e++]) printf ("Failed test with num_threads = %d, OUT + num_tests = %ld\n", t, OUT + num_tests); else printf ("Succeeded\n"); } if (cpuExec) { DUMP_SUCCESS(6); } e = 0; for (int t = 2; t <= max_threads; t+=39) { long long OUT = 0; int num_threads = t; int num_tests = 0; #pragma 
omp target parallel map(tofrom: OUT, num_tests) num_threads(num_threads) { for (int offset = 0; offset < 32; offset++) { for (int factor = 1; factor < 33; factor++) { kernel1(num_threads, RESULT, VALID, offset, factor, N, Ad, Bd, Cd, &OUT, &num_tests); } } } if (OUT + num_tests != EXPECTED[e++]) printf ("Failed test with num_threads = %d, OUT + num_tests = %ld\n", t, OUT + num_tests); else printf ("Succeeded\n"); } if (cpuExec) { DUMP_SUCCESS(6); } long long OUT = 0; int num_tests = 0; #pragma omp target map(tofrom: OUT, num_tests) { kernel1(1, RESULT, VALID, 0, 1, N, Ad, Bd, Cd, &OUT, &num_tests); } if (OUT + num_tests != 1) printf ("Failed test with OUT + num_tests = %ld\n", OUT + num_tests); else printf ("Succeeded\n"); return 0; }