C++ (Cpp) kernel1示例

示例#1

0

显示文件

文件： convolution.cpp 项目： HarveyLiuFly/TheiaSfM

void ScharrDerivative(const RowMatrixXf& image,
                      const int x_deg,
                      const int y_deg,
                      const int size,
                      const bool normalize,
                      RowMatrixXf* out) {
  const int sigma = size * 2 + 1;
  Eigen::RowVectorXf kernel1(sigma);
  kernel1.setZero();
  kernel1(0) = -1.0;
  kernel1(sigma - 1) = 1.0;

  Eigen::RowVectorXf kernel2(sigma);
  kernel2.setZero();
  if (!normalize) {
    kernel2(0) = 3;
    kernel2(sigma / 2) = 10;
    kernel2(sigma - 1) = 3;
  } else {
    float w = 10.0 / 3.0;
    float norm = 1.0 / (2.0 * size * (w + 2.0));
    kernel2(0) = norm;
    kernel2(sigma / 2) = w * norm;
    kernel2(sigma - 1) = norm;
  }

  if (x_deg == 1) {
    SeparableConvolution2d(image, kernel1, kernel2, REFLECT, out);
  } else {
    SeparableConvolution2d(image, kernel2, kernel1, REFLECT, out);
  }
  return;
}

示例#2

0

显示文件

文件： test_vmult.cpp 项目： cocteautwins/SIRIUS-develop

void test_vmul_drv()
{
    int n = 216 * 216 * 216;
    int repeat = 20;

    std::vector<double_complex> v1(n);
    std::vector<double> v2(n);

    mdarray<double_complex, 1> f1(n);
    mdarray<double, 1> f2(n);
    for (int i = 0; i < n; i++)
    {
        v1[i] = 1.0 / double_complex(i + 1, i + 1);
        v2[i] = 2.0;
        f1[i] = 1.0 / double_complex(i + 1, i + 1);
        f2[i] = 2.0;
    }

    std::cout << "vector size: " << n << std::endl;

    double t = kernel1(repeat, n, v1, v2);
    std::cout << "kernel1 time: " << t << " speed: " << double(n * (2 * sizeof(double_complex) + sizeof(double)) * repeat) / t / (1 << 30) << " GBs" << std::endl;

    t = kernel2(repeat, n, f1, f2);
    std::cout << "kernel2 time: " << t << " speed: " << double(n * (2 * sizeof(double_complex) + sizeof(double)) * repeat) / t / (1 << 30) << " GBs" << std::endl;

    t = kernel3(repeat, n, f1, v2);
    std::cout << "kernel3 time: " << t << " speed: " << double(n * (2 * sizeof(double_complex) + sizeof(double)) * repeat) / t / (1 << 30) << " GBs" << std::endl;

}

示例#3

0

显示文件

文件： cce.cpp 项目： HoldenGao/oops

// Evolves one cluster, selected by (cce_order, index), under two piecewise
// Hamiltonian sequences and returns calc_observables() of the two results.
//
// Two Hamiltonians are built for the cluster, one for each member of
// _state_pair. riffle() combines them into two sequences with the operands in
// opposite order; both sequences are evolved from the same initial cluster
// state over the same pulse intervals and the same time grid (_t0, _t1, _nTime).
vec SingleSampleCCE::cluster_evolution(int cce_order, int index)
{/*{{{*/
    // Spins and index record of the requested cluster.
    vector<cSPIN> cluster_spins = _my_clusters.getCluster(cce_order, index);
    cClusterIndex cluster_id = _my_clusters.getClusterIndex(cce_order, index);

    // One Hamiltonian per entry of the state pair.
    Hamiltonian h_first = create_spin_hamiltonian(_center_spin, _state_pair.first, cluster_spins, cluster_id);
    Hamiltonian h_second = create_spin_hamiltonian(_center_spin, _state_pair.second, cluster_spins, cluster_id);

    // Two operator sequences with the Hamiltonians passed in opposite order.
    vector<QuantumOperator> forward_ops = riffle((QuantumOperator) h_first, (QuantumOperator) h_second, _pulse_num);
    vector<QuantumOperator> reverse_ops = riffle((QuantumOperator) h_second, (QuantumOperator) h_first, _pulse_num);
    vector<double> intervals = Pulse_Interval(_pulse_name, _pulse_num);

    PureState initial_state = create_cluster_state(cluster_id);

    // Each kernel gets its own operator sequence; intervals, state and time
    // grid are shared.
    PiecewiseFullMatrixVectorEvolution forward_kernel(forward_ops, intervals, initial_state);
    PiecewiseFullMatrixVectorEvolution reverse_kernel(reverse_ops, intervals, initial_state);
    forward_kernel.setTimeSequence( _t0, _t1, _nTime);
    reverse_kernel.setTimeSequence( _t0, _t1, _nTime);

    ClusterCoherenceEvolution forward_run(&forward_kernel);
    ClusterCoherenceEvolution reverse_run(&reverse_kernel);
    forward_run.run();
    reverse_run.run();

    return calc_observables(&forward_kernel, &reverse_kernel);
}/*}}}*/

示例#4

0

显示文件

文件： char_argument.cpp 项目： jvesely/hcc

// Launches kernel1 with a `char` argument and checks that every element of
// the buffer copied back from the accelerator equals that char's integer value.
//
// Returns 0 when all SIZE elements match, 1 on the first mismatch.
int main(void) {

  char c = 1;

  // Host buffer and accelerator buffer, both SIZE ints.
  int * data1 = (int*)malloc(SIZE*sizeof(int));
  auto acc = hc::accelerator();
  int * data1_d = (int*)hc::am_alloc(SIZE*sizeof(int), acc, 0);

  grid_launch_parm lp;
  grid_launch_init(&lp);

  lp.grid_dim = gl_dim3(GRID_SIZE, 1);
  lp.group_dim = gl_dim3(TILE_SIZE, 1);

  // Attach a completion future so the launch can be waited on.
  hc::completion_future cf;
  lp.cf = &cf;
  kernel1(lp, data1_d, c);
  lp.cf->wait();

  // NOTE(review): assumes accelerator_view::copy takes (src, dst, bytes),
  // i.e. this copies the accelerator buffer into the host buffer — confirm.
  static hc::accelerator_view av = acc.get_default_view();
  av.copy(data1_d, data1, SIZE*sizeof(int));

  bool ret = 0;
  for(int i = 0; i < SIZE; ++i) {
    if((data1[i] != (int)c)) {
      ret = 1;
      break;
    }
  }

  // Release the am_alloc'ed buffer with am_free and the malloc'ed buffer with
  // free. The original passed the host pointer `data1` to am_free, which left
  // `data1_d` unreleased and handed a malloc pointer to the wrong deallocator.
  hc::am_free(data1_d);
  free(data1);

  return ret;
}

示例#5

0

显示文件

文件： completion_future.cpp 项目： aunzat/hcc

// Launches kernel1 with a completion_future attached to the launch
// parameters, waits on it, then checks that element i of the buffer copied
// back to the host equals i.
//
// Returns 0 when all SIZE elements match, 1 otherwise.
int main(void) {

  // Host-side result buffer.
  int *data1 = (int *)malloc(SIZE*sizeof(int));

  // Matching buffer obtained from the accelerator allocator.
  auto acc = hc::accelerator();
  int* data1_d = (int*)hc::am_alloc(SIZE*sizeof(int), acc, 0);

  grid_launch_parm lp;
  grid_launch_init(&lp);

  lp.gridDim.x = GRID_SIZE;
  lp.groupDim.x = TILE_SIZE;

  // The launch is waited on through the future stored in lp.cf.
  hc::completion_future cf;
  lp.cf = &cf;
  kernel1(lp, data1_d);
  lp.cf->wait();

  hc::am_copy(data1, data1_d, SIZE*sizeof(int));

  // Advance to the first element that differs from its own index.
  int pos = 0;
  while(pos < SIZE && data1[pos] == pos) {
    ++pos;
  }
  const bool mismatch = pos < SIZE;

  hc::am_free(data1_d);
  free(data1);

  return mismatch;
}

示例#6

0

显示文件

文件： customtype_byval.cpp 项目： UncleHandsome/hcc

// Passes a Foo object by value to kernel1 together with an accelerator-side
// Bar buffer, then checks that element i of the copied-back buffer has
// x == i + data1.y.
//
// Returns 0 when all SIZE elements match, 1 otherwise.
int main(void) {

  Foo data1(5);

  // Host-side Bar buffer for the results.
  Bar* data2 = (Bar*)malloc(SIZE*sizeof(Bar));

  // Matching buffer obtained from the accelerator allocator.
  auto acc = hc::accelerator();
  Bar* data2_d = (Bar*)hc::am_alloc(SIZE*sizeof(Bar), acc, 0);

  grid_launch_parm lp;
  grid_launch_init(&lp);

  lp.grid_dim = gl_dim3(GRID_SIZE, 1);
  lp.group_dim = gl_dim3(TILE_SIZE, 1);

  // The launch is waited on through the future stored in lp.cf.
  hc::completion_future cf;
  lp.cf = &cf;
  kernel1(lp, data1, data2_d);
  lp.cf->wait();

  hc::am_copy(data2, data2_d, SIZE*sizeof(Bar));

  // Scan for the first element that breaks the expected relation.
  bool mismatch = false;
  for(int idx = 0; idx < SIZE && !mismatch; ++idx) {
    mismatch = (data2[idx].x != idx + data1.y);
  }

  hc::am_free(data2_d);
  free(data2);

  return mismatch;
}

示例#7

0

显示文件

文件： customtype_byptr.cpp 项目： arodrigx7/hcc

// Passes three custom-type buffers to kernel1 by pointer. data3 is filled on
// the host (x = i) and copied to the accelerator as input; data1 and data2
// are copied back afterwards and checked:
//   data1[i].x == i   and   data2[i].x == i + data3[i].x
//
// Returns 0 when both relations hold for all SIZE elements, 1 otherwise.
int main(void) {

  // Host-side buffers.
  Foo* data1 = (Foo*)malloc(SIZE*sizeof(Foo));
  Bar* data2 = (Bar*)malloc(SIZE*sizeof(Bar));
  constStructconst* data3 = (constStructconst*)malloc(SIZE*sizeof(constStructconst));
  for(int k = 0; k < SIZE; ++k) {
    data3[k].x = k;
  }

  // Accelerator-side counterparts; only data3 is uploaded before the launch.
  auto acc = hc::accelerator();
  Foo* data1_d = (Foo*)hc::am_alloc(SIZE*sizeof(Foo), acc, 0);
  Bar* data2_d = (Bar*)hc::am_alloc(SIZE*sizeof(Bar), acc, 0);
  constStructconst* data3_d = (constStructconst*)hc::am_alloc(SIZE*sizeof(constStructconst), acc, 0);
  hc::am_copy(data3_d, data3, SIZE*sizeof(constStructconst));

  grid_launch_parm lp;
  grid_launch_init(&lp);

  lp.gridDim = gl_dim3(GRID_SIZE, 1);
  lp.groupDim = gl_dim3(TILE_SIZE, 1);

  // The launch is waited on through the future stored in lp.cf.
  hc::completion_future cf;
  lp.cf = &cf;
  kernel1(lp, data1_d, data2_d, data3_d);
  lp.cf->wait();

  hc::am_copy(data1, data1_d, SIZE*sizeof(Foo));
  hc::am_copy(data2, data2_d, SIZE*sizeof(Bar));

  // Scan for the first element that breaks either expected relation.
  bool mismatch = false;
  for(int idx = 0; idx < SIZE && !mismatch; ++idx) {
    mismatch = (data1[idx].x != idx) || (data2[idx].x != idx + data3[idx].x);
  }

  hc::am_free(data1_d);
  hc::am_free(data2_d);
  hc::am_free(data3_d);
  free(data1);
  free(data2);
  free(data3);

  return mismatch;
}

示例#8

0

显示文件

文件： main.cpp 项目： clang-ykt/openmp

// OpenMP offloading test driver.
//
// Part 1 (three sections): for a range of problem sizes n, repeatedly runs
// four add_* routines inside a target region, replays the same number of
// accumulations on the host copy a_h, and compares a against a_h element by
// element. Any mismatch prints an error and returns 1.
//
// Part 2 (three sections): calls kernel1 inside target regions and compares
// OUT + num_tests against expected values, printing "Succeeded" or a failure
// message per check (failures here do not change the exit code).
//
// Returns 0 unless a Part 1 comparison fails.
int main() {
  check_offloading();

  // Detect whether target regions actually execute on the host.
  int cpuExec = 0;
  #pragma omp target map(tofrom: cpuExec)
  {
    cpuExec = omp_is_initial_device();
  }
  int max_teams = 256;
  int gpu_threads = 256;
  int cpu_threads = 32;
  int max_threads = cpuExec ? cpu_threads : gpu_threads;

  a = (double *) malloc(MAX_N * sizeof(double));
  a_h = (double *) malloc(MAX_N * sizeof(double));
  b = (double *) malloc(MAX_N * sizeof(double));
  c = (double *) malloc(MAX_N * sizeof(double));

  // Map the three device-visible arrays for the whole of Part 1.
#pragma omp target enter data map(to:a[:MAX_N],b[:MAX_N],c[:MAX_N])

  // Section 1: separate `target` and `parallel` directives.
  for (int n = 32 ; n < MAX_N ; n+=5000) {
    // t counts how many accumulations were issued so the host can replay them.
    int t = 0;
    reset_input(a, a_h, b, c);

#pragma omp target update to(a[:n],b[:n],c[:n])

    // NOTE(review): `ths` only controls the repetition count here; it is not
    // passed to any clause of the directives below.
    for (int ths = 1; ths <= 1024; ths *= 3) {
      for(int sch = 1 ; sch <= n ; sch *= 1200) {
        t+=4;
#pragma omp target
#pragma omp parallel
        {
          add_f1(a, b, c, n, sch);
          add_f2(a, b, c, n, sch);
          add_f3(a, b, c, n, sch);
          add_f4(a, b, c, n, sch);
        }
      }
    }

    // check results for each 'n'
    for (int times = 0 ; times < t ; times++)
      for (int i = 0; i < n; ++i)
        a_h[i] += b[i] + c[i];

#pragma omp target update from(a[:n])

    for (int i = 0; i < n; ++i) {
      if (a_h[i] != a[i]) {
        printf("Error at n = %d, i = %d: host = %lf, device = %lf\n", n, i, a_h[i], a[i]);
        return 1;
      }
    }
  } // loop 'n'
  printf("Succeeded\n");


  // Section 2: combined `target parallel` with a fixed num_threads(1024).
  for (int n = 32 ; n < MAX_N ; n+=5000) {
    int t = 0;
    reset_input(a, a_h, b, c);

#pragma omp target update to(a[:n],b[:n],c[:n])

    // NOTE(review): as in section 1, `ths` is not used by the directive.
    for (int ths = 1; ths <= 1024; ths *= 3) {
      for(int sch = 1 ; sch <= n ; sch *= 1200) {
        t+=4;
#pragma omp target parallel num_threads(1024)
        {
          add_f1(a, b, c, n, sch);
          add_f2(a, b, c, n, sch);
          add_f3(a, b, c, n, sch);
          add_f4(a, b, c, n, sch);
        }
      }
    }

    // check results for each 'n'
    for (int times = 0 ; times < t ; times++)
      for (int i = 0; i < n; ++i)
        a_h[i] += b[i] + c[i];

#pragma omp target update from(a[:n])

    for (int i = 0; i < n; ++i) {
      if (a_h[i] != a[i]) {
        printf("Error at n = %d, i = %d: host = %lf, device = %lf\n", n, i, a_h[i], a[i]);
        return 1;
      }
    }
  } // loop 'n'
  printf("Succeeded\n");


  // Section 3: `target teams` with varying num_teams / thread_limit.
  for (int n = 32 ; n < MAX_N ; n+=5000) {
    int t = 0;
    reset_input(a, a_h, b, c);

#pragma omp target update to(a[:n],b[:n],c[:n])

    for (int tms = 1 ; tms <= 256 ; tms *= 2) { // 9 times (1, 2, 4, ..., 256)
      for (int ths = 32 ; ths <= 1024 ; ths *= 2) { // 6 times
        for(int sch = 1 ; sch <= n ; sch *= 1200) {
          t+=4;
#pragma omp target teams num_teams(tms) thread_limit(ths)
          {
            tadd_dpf1<double>(a, b, c, n, sch);
            add_dpf2(a, b, c, n, sch);
            add_dpf3(a, b, c, n, sch);
            add_dpf4(a, b, c, n, sch);
          }
        } // loop 'sch'
      } // loop 'ths'
    } // loop 'tms'

    // check results for each 'n'
    for (int times = 0 ; times < t ; times++)
      for (int i = 0; i < n; ++i)
        a_h[i] += b[i] + c[i];

#pragma omp target update from(a[:n])

    for (int i = 0; i < n; ++i) {
      if (a_h[i] != a[i]) {
        printf("Error at n = %d, i = %d: host = %lf, device = %lf\n", n, i, a_h[i], a[i]);
        return 1;
      }
    }
  } // loop 'n'
  printf("Succeeded\n");

#pragma omp target exit data map(release:a[:MAX_N],b[:MAX_N],c[:MAX_N])



  // Part 2 works on fixed-size local arrays initialised by the INIT macro.
  #define N (957*3)
  double Ad[N], Bd[N], Cd[N];

  #define INIT() { \
    INIT_LOOP(N, { \
      Ad[i] = 1 << 16; \
      Bd[i] = i << 16; \
      Cd[i] = -(i << 16); \
    }) \
  }

  INIT();

  double RESULT[256];
  int VALID[256];
  // Expected OUT + num_tests for each thread count tried below. With
  // max_threads == 256 the loops visit t = 2, 41, ..., 236 (7 values); with
  // max_threads == 32 only t = 2 is visited.
  long long EXPECTED[7];
  EXPECTED[0] = 34; EXPECTED[1] = 2311; EXPECTED[2] = 4795;
  EXPECTED[3] = 7532; EXPECTED[4] = 10468; EXPECTED[5] = 12999;
  EXPECTED[6] = 15345;
  unsigned e = 0;
  // Section 4: kernel1 inside `parallel` nested in a single-team `target teams`.
  for (int t = 2; t <= max_threads; t+=39) {
    long long OUT = 0;
    int num_threads = t;
    int num_tests = 0;
    #pragma omp target teams map(tofrom: OUT, num_tests) num_teams(1) thread_limit(max_threads)
    {
      #pragma omp parallel num_threads(num_threads)
      {
        for (int offset = 0; offset < 32; offset++) {
          for (int factor = 1; factor < 33; factor++) {
            kernel1(num_threads, RESULT, VALID, offset, factor,
                    N, Ad, Bd, Cd, &OUT, &num_tests);
          }
        }
      }
    }

    // OUT + num_tests is a long long, so it must be printed with %lld.
    if (OUT + num_tests != EXPECTED[e++])
      printf ("Failed test with num_threads = %d, OUT + num_tests = %lld\n",
              t, OUT + num_tests);
    else
      printf ("Succeeded\n");
  }
  // NOTE(review): DUMP_SUCCESS is defined elsewhere; presumably it accounts
  // for the 6 iterations that do not run when executing on the host — confirm.
  if (cpuExec) {
    DUMP_SUCCESS(6);
  }


  // Section 5: the same check using the combined `target parallel` directive.
  e = 0;
  for (int t = 2; t <= max_threads; t+=39) {
    long long OUT = 0;
    int num_threads = t;
    int num_tests = 0;
    #pragma omp target parallel map(tofrom: OUT, num_tests) num_threads(num_threads)
    {
        for (int offset = 0; offset < 32; offset++) {
          for (int factor = 1; factor < 33; factor++) {
            kernel1(num_threads, RESULT, VALID, offset, factor,
                    N, Ad, Bd, Cd, &OUT, &num_tests);
          }
        }
    }

    if (OUT + num_tests != EXPECTED[e++])
      printf ("Failed test with num_threads = %d, OUT + num_tests = %lld\n",
              t, OUT + num_tests);
    else
      printf ("Succeeded\n");
  }
  if (cpuExec) {
    DUMP_SUCCESS(6);
  }


  // Section 6: a single kernel1 call in a plain `target` region; the expected
  // value of OUT + num_tests is 1.
  long long OUT = 0;
  int num_tests = 0;
  #pragma omp target map(tofrom: OUT, num_tests)
  {
    kernel1(1, RESULT, VALID, 0, 1,
            N, Ad, Bd, Cd, &OUT, &num_tests);
  }

  if (OUT + num_tests != 1)
    printf ("Failed test with OUT + num_tests = %lld\n",
            OUT + num_tests);
  else
    printf ("Succeeded\n");

  return 0;
}