void print_run_summary(sampler *samp){
    /*
     Print a short report about a finished sampling run.

     Input:
          sampler *samp        Pointer to an initialized sampler structure.
                               Sampling must already have been performed: this routine
                               only reads the timestamps, counters and host-side sample
                               buffer stored in the structure.

     Output:
          Prints step and sample counts, ldim/gdim, acceptance rate and the
          sample rates (relative to kernel time and to total time) to stdout.
          Prints the mean and standard deviation of every saved component.
     */

    // Intervals between the timestamp pairs stored in the sampler.
    const double kernel_seconds = timestamp_diff_in_seconds(samp->time1, samp->time2);
    const double total_seconds  = timestamp_diff_in_seconds(samp->time1_total, samp->time2_total);

    // --------------------------------------------------------------------------
    // run counters and throughput
    // --------------------------------------------------------------------------

    printf("Time steps = %d\n", samp->M);
    printf("Total samples = %d\n", samp->M * samp->K);
    printf("ldim = %d\tgdim = %d\n", (int) samp->ldim[0], (int) samp->gdim[0]);
    printf("Total accepted = %lu\n", samp->accepted_total);
    printf("Acceptance rate = %f\n",
           (cl_float) samp->accepted_total / ((cl_float) (samp->M * samp->K)));
    printf("Time for kernel runs = %f\n", kernel_seconds);
    printf("Sample rate, kernel time only = %f million samples / s\n",
           samp->M * samp->K * 1e-6 / kernel_seconds);

    printf("Total time = %f\n", total_seconds);
    printf("Sample rate, total time = %f million samples / s\n",
           samp->M * samp->K * 1e-6 / total_seconds);
    printf("\n");

    // --------------------------------------------------------------------------
    // per-component mean and standard deviation
    // --------------------------------------------------------------------------

    // Scratch buffer that receives one component of the chain at a time.
    float *component = (float *) malloc(samp->total_samples * sizeof(float));
    if(component == NULL){ perror("Allocation failure basic stats"); abort(); }

    double mu, sd;
    int comp = 0;
    while(comp < samp->num_to_save){

        // samples_host is read with stride num_to_save: entry (comp + s * num_to_save)
        // is the value of saved component `comp` in sample `s`.
        for(int s=0; s<samp->total_samples; s++)
            component[s] = samp->samples_host[comp + s * (samp->num_to_save)];

        compute_mean_stddev(component, &mu, &sd, samp->total_samples);

        printf("Statistics for X_%d:\t", samp->indices_to_save_host[comp]);
        printf("Mean = %f,\tsigma = %f\n", mu, sd);

        comp++;
    }
    printf("\n");
    free(component);
}
示例#2
0
int main(int argc, char** argv)
{
  /*
   * Read n integers from a file, sort them with radix_sort() and report the
   * elapsed time.
   *
   *   argv[1]  input file handed to init_array()
   *   argv[2]  number of elements n (must be positive)
   *   argv[3]  optional; a non-zero value prints the sorted array
   *
   * Returns 0 on success and EXIT_FAILURE on bad arguments, allocation
   * failure or when init_array() reports an error.
   */
  int print_results = 0;

  // check for correct number of arguments
  if (argc < 3) {
    usage();
    return EXIT_FAILURE;
  } else if (argc > 3) {
    print_results = atoi(argv[3]);
  }

  // A non-positive count would turn into a huge or zero-sized allocation.
  const int n = atoi(argv[2]);
  if (n < 1) {
    printf("Number of elements must be a positive integer!\n");
    return EXIT_FAILURE;
  }

  // allocate the local array
  int* a = malloc(sizeof *a * (size_t) n);
  if (a == NULL) {
    perror("malloc");
    return EXIT_FAILURE;
  }

  // initialize local array
  if (init_array(argv[1], 0, n, &a[0]) != EXIT_SUCCESS) {
    printf("File %s could not be opened!\n", argv[1]);
    free(a);  // do not leak the buffer on the error path
    return EXIT_FAILURE;
  }

  // take a timestamp before the sort starts
  timestamp_type time1, time2;
  get_timestamp(&time1);

  // sort elements
  radix_sort(&a[0], n);

  // take a timestamp after the process finished sorting
  get_timestamp(&time2);

  // report elapsed time and sorting throughput
  double elapsed = timestamp_diff_in_seconds(time1,time2);
  printf("%f s\n", elapsed);
  printf("%d elements sorted\n", n);
  printf("%f elements/s\n", n / elapsed);

  // print sorted results
  if (print_results) {
    print_array(&a[0], n);
  }

  // release resources no longer used
  free(a);

  return 0;
}
示例#3
0
double measure_access(void *x, size_t array_size, size_t ntrips)
{
  /*
   * Time a strided read-modify-write sweep over a byte buffer.
   *
   *   x           buffer of at least array_size bytes; its contents are modified
   *   array_size  number of bytes visited per sweep
   *   ntrips      number of sweeps
   *
   * In each sweep, step j increments the byte at offset (j * 1009) % array_size.
   * Returns the time between the two timestamps, in seconds.
   */
  char *bytes = (char *) x;
  timestamp_type start, stop;

  get_timestamp(&start);

  for (size_t trip = 0; trip != ntrips; ++trip)
  {
    size_t step = 0;
    while (step < array_size)
    {
      const size_t offset = (step * 1009) % array_size;
      bytes[offset] += 1;
      ++step;
    }
  }

  get_timestamp(&stop);

  return timestamp_diff_in_seconds(start, stop);
}
示例#4
0
int main()
{
  /*
   * Micro-benchmark: times 1000 passes of b[i] = 2*a[i] over two 64-byte
   * aligned float arrays of 1024*1024 elements and prints the elapsed time.
   *
   * Returns a value derived from a[] (to keep the compiler from discarding
   * the loads), or EXIT_FAILURE if an allocation fails.
   *
   * NOTE(review): a[] is never written because the initialisation loop below
   * is commented out, so the timed loop and the final sum read indeterminate
   * values. Re-enable the loop if defined contents are required.
   */
  int result = 0;
  const int n = 1024*1024;

  // posix_memalign returns the error code instead of setting errno, so it is
  // copied into errno for perror().
  float *allocation = NULL;
  if ((errno = posix_memalign((void **) &allocation, 64, n*sizeof(float) + 64)) != 0)
  {
    perror("allocating a");
    return EXIT_FAILURE;
  }

  // b is allocated exactly once; a preceding malloc() whose result was
  // immediately overwritten used to leak here.
  float __attribute__ ((aligned (1))) *b = NULL;
  if ((errno = posix_memalign((void **) &b, 64, n*sizeof(float) + 64)) != 0)
  {
    perror("allocating b");
    free(allocation);
    return EXIT_FAILURE;
  }

  // Changing the byte offset (currently 0) deliberately misaligns a.
  float __attribute__ ((aligned (64))) *a = (float *) (((char *) allocation) + 0);

  /*
  puts("write");
  for (int i = 0; i<n; ++i)
    a[i] = i;
    */

  timestamp_type t1;
  get_timestamp(&t1);

  for (int ntrips = 0; ntrips < 1000; ++ntrips)
  {
    for (int i = 0; i<n; ++i)
      b[i] = 2*a[i];
  }

  timestamp_type t2;
  get_timestamp(&t2);

  printf("elapsed time: %g s\n",
      timestamp_diff_in_seconds(t1, t2));

  // fake a dependency on a
  for (int i = 0; i<n; ++i)
    result += a[i];

  free(b);
  free(allocation);
  return result;
}
示例#5
0
int main(int argc, char** argv){
    /*
     Driver for the OpenMP Jacobi iteration.

     Arguments:
          argv[1]   number of points N (must be positive)
          argv[2]   number of iterations passed to jacobi_iteration()

     Allocates and zeroes u_k, lets every thread of a parallel region call
     jacobi_iteration(), and prints the elapsed time (initialization included).
     */

    timestamp_type time1, time2;

    if (argc != 3) {
        printf("USAGE: ./jacobi-omp.o <Number of points (N)> <Num Iter>\n");
        abort();
    }
    int N = atoi(argv[1]);
    int numIter = atoi(argv[2]);
    if (N < 1) {
        printf("Number of points (N) must be a positive integer.\n");
        abort();
    }

    double* u_k = (double*) malloc((size_t) N * sizeof(double));
    if (u_k == NULL) {
        perror("Allocation failure u_k");
        abort();
    }

    get_timestamp(&time1);
    //Initialize u_k
    for (int i = 0; i < N; i++) {
        u_k[i] = 0.0;
    }

    #pragma omp parallel
    {
        // Declared inside the region so each thread has its own copy; a single
        // shared variable written by all threads would be a data race.
        const int nthreads = omp_get_num_threads();
        int tid = omp_get_thread_num();
        printf("(%d) starting jacobi iteration. \n", tid);
        #pragma omp barrier
        jacobi_iteration(u_k, N, nthreads, numIter);
    }

    get_timestamp(&time2);
    double elapsed = timestamp_diff_in_seconds(time1,time2);
    printf("Time elapsed is %f seconds.\n", elapsed);
//    print_solution(u_k, N);
    free(u_k);

    return 0;
}
示例#6
0
int main(int argc, char **argv)
{
  /*
   * Master/worker vector addition z = x + y over MPI.
   *
   *   argv[1]  total vector length n (must be divisible by the worker count)
   *   argv[2]  number of times each worker repeats the addition
   *
   * Rank 0 creates x and y, sends one contiguous slice of each to every other
   * rank, receives the corresponding slices of z, prints bandwidth and flop
   * rates and verifies the result. Every other rank receives its slices,
   * performs the addition ntrips times and sends the result back.
   * At least two ranks are required.
   */
  int rank_count, my_rank, worker_count;
  // FIXME kill
  MPI_Init(&argc,&argv);
  MPI_Comm_size(MPI_COMM_WORLD, &rank_count);
  MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);

  worker_count = rank_count-1;

  if (argc != 3)
  {
    fprintf(stderr, "need two arguments!\n");
    MPI_Abort(MPI_COMM_WORLD, 1);
  }

  // With a single rank there are no workers and the divisibility test below
  // would divide by zero.
  if (worker_count < 1)
  {
    fprintf(stderr, "need at least two ranks (one master, one worker)\n");
    MPI_Abort(MPI_COMM_WORLD, 1);
  }

  const long n = atol(argv[1]);
  const int ntrips = atoi(argv[2]);

  // FIXME kill
  if (n % worker_count != 0)
  {
    fprintf(stderr, "size not divisible\n");
    MPI_Abort(MPI_COMM_WORLD, 1);
  }
  long divided_n = n / worker_count;

  printf("rank %d/%d reporting for duty\n", my_rank, rank_count);

  const int tag = 0;
  if (my_rank == 0)
  {
    printf("doing %d trips...\n", ntrips);

    double *x = (double *) malloc(sizeof(double) * n);
    if (!x) { perror("alloc x"); MPI_Abort(MPI_COMM_WORLD, 1); }
    double *y = (double *) malloc(sizeof(double) * n);
    if (!y) { perror("alloc y"); MPI_Abort(MPI_COMM_WORLD, 1); }
    double *z = (double *) malloc(sizeof(double) * n);
    if (!z) { perror("alloc z"); MPI_Abort(MPI_COMM_WORLD, 1); }

    // n is a long, so the index must be a long as well.
    for (long i = 0; i < n; ++i)
    {
      x[i] = i;
      y[i] = 2*i;
    }

    timestamp_type time1, time2;
    get_timestamp(&time1);

    // Worker w (rank w+1) owns the slice starting at element w*divided_n.
    for (int i = 0; i < worker_count; ++i)
    {
      printf("before send %d\n", i);
      MPI_Send(x + i*divided_n, divided_n, MPI_DOUBLE, i+1, tag, MPI_COMM_WORLD);
      MPI_Send(y + i*divided_n, divided_n, MPI_DOUBLE, i+1, tag, MPI_COMM_WORLD);
      printf("after send %d\n", i);
    }
    for (int i = 0; i < worker_count; ++i)
    {
      printf("before recv %d\n", i);
      MPI_Recv(z + i*divided_n, divided_n, MPI_DOUBLE, i+1, tag, MPI_COMM_WORLD,
          MPI_STATUS_IGNORE);
      printf("after recv %d\n", i);
    }

    get_timestamp(&time2);
    double elapsed = timestamp_diff_in_seconds(time1,time2)/ntrips;
    printf("%f GB/s\n",
        3*n*sizeof(double)/1e9/elapsed);
    printf("%f GFlops/s\n",
        n/1e9/elapsed);

    for (long i = 0; i < n; ++i)
    {
      if (z[i] != x[i] + y[i])
      {
        printf("bad %ld\n", i);
        MPI_Abort(MPI_COMM_WORLD, 1);
      }
    }

    free(x);
    free(y);
    free(z);
  }
  else
  {
    double *xbuf = (double *) malloc(sizeof(double) * divided_n);
    if (!xbuf) { perror("alloc xbuf"); MPI_Abort(MPI_COMM_WORLD, 1); }
    double *ybuf = (double *) malloc(sizeof(double) * divided_n);
    if (!ybuf) { perror("alloc ybuf"); MPI_Abort(MPI_COMM_WORLD, 1); }
    double *zbuf = (double *) malloc(sizeof(double) * divided_n);
    if (!zbuf) { perror("alloc zbuf"); MPI_Abort(MPI_COMM_WORLD, 1); }

    MPI_Recv(xbuf, divided_n, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD,
        MPI_STATUS_IGNORE);
    MPI_Recv(ybuf, divided_n, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD,
        MPI_STATUS_IGNORE);

    printf("start rank %d\n", my_rank);
    for (int trip = 0; trip < ntrips; ++trip)
    {
      for (long i = 0; i < divided_n; ++i)
      {
        zbuf[i] = xbuf[i] + ybuf[i];
      }
    }
    printf("done rank %d\n", my_rank);

    MPI_Send(zbuf, divided_n, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD);
    printf("done rank %d\n", my_rank);

    free(xbuf);
    free(ybuf);
    free(zbuf);
  }

  MPI_Finalize();

  return 0;
}
示例#7
0
int main( int argc, char **argv) {
  /*
   * Driver for the MPI k-means program.
   *
   * Parses the command line into `opt`, opens opt.filename collectively,
   * lets mpi_read_data() load this rank's rows, gathers the per-rank row
   * counts, runs kmeans() and, on rank 0, prints the result summary and the
   * timings collected in `timer`.
   *
   * Exits with status 1 if the input file cannot be opened, returns 0 otherwise.
   */

  srand(time(NULL));

  mytimer timer;
  timer.total_time = timer.init_time = timer.comp_time = timer.comm_time = 0.0;
  timestamp_type time_s, time_e;

  int mpi_rank, mpi_size;
  int i, r, rows, cols, total_rows, err;
  int *points_per_proc, *buffer;
  MPI_File filename;
  
  MPI_Init(&argc, &argv);
  MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank);
  MPI_Comm_size(MPI_COMM_WORLD, &mpi_size);

  options opt;
  parse_command_line(argc, argv, &opt);

  err = MPI_File_open(MPI_COMM_WORLD, opt.filename, MPI_MODE_RDONLY, MPI_INFO_NULL, &filename);
  if (err) {
    // Report the name that was actually passed to MPI_File_open; argv[1] is
    // not necessarily the file name.
    if (mpi_rank == 0) fprintf(stderr, "Couldn't open file %s\n", opt.filename);
    MPI_Finalize();
    exit(1);
  }

  double **data = mpi_read_data(&filename, &rows, &cols, mpi_rank, mpi_size, opt.overlap);

  // Every rank contributes its own row count in slot mpi_rank of a zeroed
  // buffer; the sum-reduction therefore yields the full per-rank table.
  points_per_proc = (int*) calloc(mpi_size, sizeof(int));
  check(points_per_proc); 
  buffer = (int*) calloc(mpi_size, sizeof(int));
  check(buffer);
  buffer[mpi_rank] = rows;
  MPI_Barrier(MPI_COMM_WORLD);

  MPI_Allreduce(buffer, points_per_proc, mpi_size, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
  free(buffer);
  MPI_Allreduce(&rows, &total_rows, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);

  opt.n_points = total_rows;
  opt.dimensions = cols;
  opt.local_rows = rows;

  if(mpi_rank == 0 && opt.verbose > 1) {
    printf("Total rows: %d\n", opt.n_points);
    for(r = 0; r < mpi_size; r++)
      printf("proc %d has %d\n", r, points_per_proc[r]);
  }
  // One rank per loop turn prints its rows; the barrier is executed by every
  // rank on every turn.
  for(r=0; r < mpi_size; r++) {
    MPI_Barrier(MPI_COMM_WORLD);
    if(mpi_rank == r && opt.verbose > 2) {
      for(i=0; i < rows; i++) {
        printf("proc %d: %d --- ", mpi_rank, i);
        print_vec(data[i], cols);
      }
    }
  }

  // allocate centroids, everyone gets their own copy
  double **centroids = (double**) alloc2d(opt.n_centroids, opt.dimensions);
  // allocate  cluster memberships
  // only track the ones this process is responsible for
  int *membership = (int*) malloc(opt.local_rows * sizeof(int));
  check(membership);

  double inertia = DBL_MAX;
  int total_iterations = 0;
  get_timestamp(&time_s);
  total_iterations = kmeans(data, centroids, membership, &inertia, mpi_rank, mpi_size, points_per_proc, &timer, opt);
  get_timestamp(&time_e);
  timer.total_time = timestamp_diff_in_seconds(time_s, time_e);

  if(mpi_rank == 0 && opt.verbose > 0) { 
    print_vecs(centroids, opt, "centroids");
  }

  if(mpi_rank == 0) {
    printf("\nMPI K-MEANS\n");
    printf("%dx%d data, %d clusters, %d trials, %d cores\n", opt.n_points, opt.dimensions, opt.n_centroids, opt.trials, mpi_size);
    printf("Inertia: %f\n", inertia);
    printf("Total Iterations: %d\n", total_iterations);
    printf("Runtime: %fs\n", timer.total_time);
    printf("Initialization time: %fs\n", timer.init_time);
    printf("Computation time: %fs\n", timer.comp_time);
    printf("Communication time: %fs\n", timer.comm_time);
  }

  MPI_File_close(&filename);

  // NOTE(review): freeing *data / *centroids before the pointer arrays assumes
  // both 2D arrays use one contiguous payload block - confirm against
  // mpi_read_data() and alloc2d().
  free(points_per_proc);
  free(*data);
  free(data);
  free(*centroids);
  free(centroids);
  free(membership);

  MPI_Finalize();

  return 0;
}
示例#8
0
/*
 * One complete k-means run distributed over MPI_COMM_WORLD.
 *
 *   data        rows held by this rank (opt.local_rows rows are visited)
 *   centroids   opt.n_centroids x opt.dimensions array, overwritten with the
 *               final centres; its rows must be contiguous because
 *               *centroids is used as one flat MPI buffer
 *   membership  per-row cluster index; entries are compared with the new
 *               assignment before being written, so the caller's initial
 *               contents influence the first delta
 *   inertia     out: sum over all ranks of the distances reported by
 *               find_nearest_centroid() in the last iteration
 *   rank, size  this process' rank and the communicator size
 *   ppp         per-rank row counts, forwarded to initialize()
 *   t           timing accumulator (init_time, comp_time, comm_time)
 *   opt         run options (tol, max_iter, n_points, verbose, ...)
 *
 * Iterates while the fraction of changed memberships exceeds opt.tol and
 * fewer than opt.max_iter iterations have run. Returns the iteration count.
 */
int _kmeans(double **data, double **centroids, int *membership,
            double *inertia, int rank, int size, int *ppp, mytimer *t, options opt) {

  timestamp_type time_is, time_ie;
  timestamp_type time_cs, time_ce;
  #ifdef TIME_ALL
    timestamp_type comm_s, comm_e;
  #endif

  // delta starts at n_points so the loop condition holds on entry
  // (as long as opt.tol < 1).
  double dist, total_inertia, total_delta, delta = (double) opt.n_points;
  int i, center, iters = 0;

  // allocate for new centroids that will be computed
  double **new_centers = (double**) alloc2d(opt.n_centroids, opt.dimensions);
  memset(*new_centers, 0, opt.n_centroids * opt.dimensions * sizeof(double));

  // allocate array to count points in each cluster, initialize to 0
  int *count_centers = (int*) calloc(opt.n_centroids, sizeof(int));
  check(count_centers);
  int *new_count_centers = (int*) calloc(opt.n_centroids, sizeof(int));
  check(new_count_centers);

  // (A scratch `point` buffer used to be allocated here; it was never read
  // or written, so the allocation has been removed.)

  get_timestamp(&time_is);
  t->comm_time += initialize(data, centroids, ppp, rank, size, opt);
  get_timestamp(&time_ie);

  // Rank 0 chose the initial centroids; distribute them to everyone.
  #ifdef TIME_ALL
    get_timestamp(&comm_s);
  #endif
  MPI_Bcast(*centroids, opt.n_centroids*opt.dimensions, MPI_DOUBLE, 0, MPI_COMM_WORLD);
  #ifdef TIME_ALL
    get_timestamp(&comm_e);
    t->comm_time += timestamp_diff_in_seconds(comm_s, comm_e);
  #endif

  get_timestamp(&time_cs);
  while (delta / ((double) opt.n_points) > opt.tol && iters < opt.max_iter) {
    // MPI_Barrier(MPI_COMM_WORLD);
    delta = 0.0;
    *inertia = 0.0;

    // Assignment step on the local rows; accumulate per-cluster sums/counts.
    for(i = 0; i < opt.local_rows; i++){
        find_nearest_centroid(data[i], centroids, opt, &center, &dist);
        *inertia += dist;
        if (membership[i] != center) {
            delta++;
            membership[i] = center;
        }
        add(new_centers[center], data[i], opt);
        new_count_centers[center]++;
    }

    // Combine the partial sums and counts of all ranks.
    #ifdef TIME_ALL
      get_timestamp(&comm_s);
    #endif
    MPI_Allreduce(*new_centers, *centroids, opt.n_centroids * opt.dimensions,
                  MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
    MPI_Allreduce(new_count_centers, count_centers, opt.n_centroids,
                  MPI_INT, MPI_SUM, MPI_COMM_WORLD);
    #ifdef TIME_ALL
      get_timestamp(&comm_e);
      t->comm_time += timestamp_diff_in_seconds(comm_s, comm_e);
    #endif

    for(i = 0; i < opt.n_centroids; i++) {
        if(count_centers[i] == 0) {
          // Empty cluster: rank 0 adds one of its own rows, picked with
          // randint(), onto the reduced sum for this centroid.
          if(rank == 0){
            add(centroids[i], data[randint(opt.local_rows)], opt);
          }
          // broadcast this new point to everyone
          #ifdef TIME_ALL
            get_timestamp(&comm_s);
          #endif
          MPI_Bcast(centroids[i], opt.dimensions, MPI_DOUBLE, 0, MPI_COMM_WORLD);
          #ifdef TIME_ALL
            get_timestamp(&comm_e);
            t->comm_time += timestamp_diff_in_seconds(comm_s, comm_e);
          #endif
          // add to delta to ensure we dont stop after this
          delta += opt.tol * opt.local_rows + 1.0;
        }
        // all good to divide, count is not 0
        else {
          // calculate the new center
          div_by(centroids[i], count_centers[i], opt);
        }
    }

    // sum up the number of cluster assignments that changed
    #ifdef TIME_ALL
      get_timestamp(&comm_s);
    #endif
    MPI_Allreduce(&delta, &total_delta, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
    delta = total_delta;
    // sum up the inertias
    MPI_Allreduce(inertia, &total_inertia, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
    *inertia = total_inertia;
    #ifdef TIME_ALL
      get_timestamp(&comm_e);
      t->comm_time += timestamp_diff_in_seconds(comm_s, comm_e);
    #endif

    // zero out new_centers and count_centers
    memset(*new_centers, 0, opt.n_centroids * opt.dimensions * sizeof(double));
    memset(new_count_centers, 0, opt.n_centroids * sizeof(int));
    memset(count_centers, 0, opt.n_centroids * sizeof(int));

    iters++;
    if(opt.verbose > 1 && rank == 0) {
        printf("\n\titers: %d\n", iters);
        printf("\tdelta: %d\n", (int)   delta);
        printf("\teps: %f\n", delta / ((double) opt.n_points));
        printf("\tinertia: %f\n", *inertia);
    }
  }
  get_timestamp(&time_ce);
  t->init_time += timestamp_diff_in_seconds(time_is, time_ie);
  t->comp_time += timestamp_diff_in_seconds(time_cs, time_ce);


  free(*new_centers);
  free(new_centers);
  free(count_centers);
  free(new_count_centers);

  if(iters == opt.max_iter && rank == 0 && opt.verbose > 0) {
      printf("HIT MAX ITERS\n");
  }

  return iters;
} 
示例#9
0
/*
 * Choose the initial centroids.
 *
 * Rank 0 draws opt.n_centroids distinct indices with randint(opt.n_points),
 * maps each one to its owning rank with get_owner() and copies the matching
 * row into centroids[]: straight from `data` when rank 0 owns it, otherwise
 * by sending the index to the owner (tag 999) and receiving the row back.
 * Finally rank 0 sends -1 to every other rank as the stop signal.
 *
 * All other ranks answer index requests from rank 0 until they receive -1.
 *
 * Returns the communication time measured on rank 0 when TIME_ALL is
 * defined, and 0.0 otherwise (and always on the other ranks).
 */
double initialize(double **data, double **centroids, int *ppp, int rank, int size, options opt) {
  MPI_Status status;
  double comm_time = 0.0;

  if(rank != 0) {
    // Worker side: serve row requests until the stop signal arrives.
    int request;
    for(;;) {
      MPI_Recv(&request, 1, MPI_INT, 0, 999, MPI_COMM_WORLD, &status);
      if(request == -1)
        break;
      MPI_Send(data[request], opt.dimensions, MPI_DOUBLE, 0, 999, MPI_COMM_WORLD);
    }
    return comm_time;
  }

  #ifdef TIME_ALL
    timestamp_type comm_s, comm_e;  
  #endif
  int c, idx, owner, dest;

  // Indices already chosen, used to reject duplicates.
  int *chosen = (int*) malloc(opt.n_centroids * sizeof(int));
  check(chosen);

  // Receive buffer for rows that live on another rank.
  double *remote_row = (double*) malloc(opt.dimensions * sizeof(double));
  check(remote_row);

  for(c = 0; c < opt.n_centroids; c++){
    const double *src;

    // Redraw until the index is not among the first c chosen ones.
    while(In(idx = randint(opt.n_points), chosen, c)) { }
    chosen[c] = idx;

    // get_owner receives idx by address, so it may modify it.
    owner = get_owner(&idx, ppp);
    if(owner == 0) {
      src = data[idx];
    }
    else {
      #ifdef TIME_ALL
        get_timestamp(&comm_s);
      #endif
      MPI_Send(&idx, 1, MPI_INT, owner, 999, MPI_COMM_WORLD);
      MPI_Recv(remote_row, opt.dimensions, MPI_DOUBLE, owner, 999, MPI_COMM_WORLD, &status);
      #ifdef TIME_ALL
        get_timestamp(&comm_e);
        comm_time += timestamp_diff_in_seconds(comm_s, comm_e);
      #endif
      src = remote_row;
    }
    // printf("%d owned by %d at %d ", chosen[c], owner, idx);
    // print_vec(src, opt.dimensions);
    memcpy(centroids[c], src, opt.dimensions * sizeof(double));
  }

  // Tell every other rank that no more rows will be requested.
  idx = -1;
  #ifdef TIME_ALL
    get_timestamp(&comm_s);
  #endif
  for(dest = 1; dest < size; dest++)
    MPI_Send(&idx, 1, MPI_INT, dest, 999, MPI_COMM_WORLD);
  #ifdef TIME_ALL
    get_timestamp(&comm_e);
    comm_time += timestamp_diff_in_seconds(comm_s, comm_e);
  #endif

  free(chosen);
  free(remote_row);

  return comm_time;
}
int main(int argc, char** argv)
{
    /*
     * Benchmark driver for BlockedQR2().
     *
     *   argv[1]  m, number of rows of A
     *   argv[2]  n, number of columns of A
     *   argv[3]  number of timed iterations (>= 1)
     *   argv[4]  non-zero to run the correctness tests after every factorization
     *
     * Each iteration refills A with pseudo-random values in [0,10), factorizes
     * it and optionally tests the result. The elapsed time and a rate figure
     * are written with writetofile2().
     */
    if (argc != 5)
    {
        fprintf(stderr, "Need four arguments, m, n, iterations, and a (0,1) to indicate if testing is desired.\n");
        abort();
    }
    /* Size of Matrix */
    int m = atoi(argv[1]);
    int n = atoi(argv[2]);
    int iterations = atoi(argv[3]);

    int testing = atoi(argv[4]);
    if(iterations < 1) {
        printf("\nIterations must be non-zero positive #.\n");
        abort();
    }
    if(m < 1 || n < 1) {
        printf("\nMatrix dimensions m and n must be positive.\n");
        abort();
    }

    /* Element counts are computed in size_t: m * n evaluated in int overflows
       for large matrices. */
    const size_t mn = (size_t) m * (size_t) n;
    const size_t mm = (size_t) m * (size_t) m;

    /* Initilize matrices A, Atest, and Q */
    double * A = malloc(mn * sizeof(double));
    double * Atest = malloc(mn * sizeof(double));
    double * Q = malloc(mm * sizeof(double));
    if(A == NULL || Atest == NULL || Q == NULL) {
        perror("Allocation failure");
        abort();
    }

    /* Fixed seed so every run factorizes the same matrices */
    srand ( 1 );

    timestamp_type time1, time2;
    get_timestamp(&time1);

    for(int i2 = 0; i2 < iterations; i2++) {
        /* Fill up matrix A with elements from [0,10)*/
        for(size_t i = 0; i < mn; i++) {
            A[i] = (double) (rand() %1000)/100;
            Atest[i] = A[i];
        }
        /* BlockedQR replaces A with R, which is why we needed to copy A in order to test code */
        BlockedQR2(A, m, n, Q);

        /* Test Code */
        if(testing) {
            printf(" Testing Code ..... \n");
            double *Qt = malloc(mm * sizeof(double));
            if(Qt == NULL) {
                perror("Allocation failure");
                abort();
            }
            MatrixTranspose(Q, m, m, Qt);
            testUpperTriangular(A, m, n);
            printf(" R is Upper Triangular! \n");
            testOrthogonal(Q, Qt, m);
            printf(" Q is Orthogonal! \n");
            IsQRequalToA(Q, A, Atest, m, m, m, n);
            printf(" A = QR! QR factorization was sucessful!\n");
            free(Qt);
        }
    }
    get_timestamp(&time2);
    double elapsed = timestamp_diff_in_seconds(time1,time2);
    //  double gbs = m * n * iterations / elapsed / 1e9;
    /* Promote to double before multiplying: m * n * n overflows int already
       for m = n = 1300. */
    double gfps = (double) m * n * n / elapsed / 1000000000;

    writetofile2("blockedQR2_8_time.txt", m, n, elapsed);
    writetofile2("blockedQR2_8_gfps.txt", m, n, gfps);

    if(verbose) printf("QR2: Time elasped = %f s over %d iterations\n", elapsed, iterations);

    free(A);
    free(Atest);
    free(Q);


    return 0;
}
示例#11
0
int main(int argc, char *argv[])
{
  /*
   * Serial Jacobi iteration driver.
   *
   *   argv[1]  N, number of interior grid points (the arrays hold N + 2 entries)
   *   argv[2]  iteration limit
   *
   * Iterates jacobi_laplace() until the residual has dropped by the factor
   * STOP_ITER_RAT relative to the initial residual or the iteration limit is
   * exceeded, printing the residual after every sweep and the total time.
   * Returns 1 on missing arguments or allocation failure, 0 otherwise.
   */

  if (argc < 3){
    printf("Arguments required, Quitting...\n");
    return 1;
  }
  int N = atoi(argv[1]);
  int max_iter = atoi(argv[2]);
  double h2 = 1.0/(N+1)/(N+1);
  double *u, *uc, *f;

  // allocate arrays
  int n_per_proc =  N + 2;
  
  u = (double *) malloc(n_per_proc*sizeof (double));
  uc = (double *) malloc(n_per_proc*sizeof (double));
  f = (double *) malloc(n_per_proc*sizeof (double));
  if (u == NULL || uc == NULL || f == NULL) {
    printf("Allocation failed, Quitting...\n");
    free(f);
    free(u);
    free(uc);
    return 1;
  }

  // initialize f, u and uc over the WHOLE array. The loop used to stop one
  // element early, leaving the last entries of f and u uninitialized when the
  // initial residual was computed; uc was never initialized at all.
  int i;
  for (i = 0; i < n_per_proc; i++) {
    f[i] = 1.0;
    u[i] = 0.0;
    uc[i] = 0.0;
  }

  // Begin iterations
  double resid_init, resid_cur;
  resid_init = calc_resid(n_per_proc, h2, f, u);
  resid_cur = resid_init;
  printf("%f\n", resid_init);
  
  int iter = 0;

  timestamp_type t1, t2;

  get_timestamp(&t1);
  while (resid_cur / resid_init > STOP_ITER_RAT){
    
    /* resid_cur = 0.0; */
    // re-impose the zero boundary values before every sweep
    u[0] = 0.0;
    u[n_per_proc - 1] = 0.0;
    jacobi_laplace(n_per_proc, h2, f, u, uc);
    
    resid_cur = calc_resid(n_per_proc, h2, f, u);
    printf("Resid is %f\n", resid_cur );
    
    if (++iter > max_iter) break;

  }
  get_timestamp(&t2);


  printf("Total time: %f\n", timestamp_diff_in_seconds(t1,t2));
  // deallocate
  free(f);
  free(u);
  free(uc);
  
  return 0;
}
示例#12
0
int main(int argc, char** argv) 
{
  /*
   * Benchmark driver for the WY() QR factorization.
   *
   *   argv[1]  m, height of the matrix
   *   argv[2]  n, width of the matrix
   *   argv[3]  number of timed iterations (>= 1)
   *   argv[4]  non-zero to run the correctness tests after every factorization
   *
   * Each iteration refills A with pseudo-random values in [0,10), calls WY()
   * and optionally tests Q, Qt and R. Timing figures are written with
   * writetofile2().
   */
  if (argc != 5)
  {
    fprintf(stderr, "Need four arguments, m, n, iterations, and a (0,1) to indicate if testing is desired.\n");
    abort(); 
  } 
 
  /* Size of Matrix */  
  int m = atoi(argv[1]);
  int n = atoi(argv[2]);
  int iterations = atoi(argv[3]);
  int testing = atoi(argv[4]);
  if(iterations < 1){
    printf("\nIterations must be non-zero positive #.\n");
    abort();
  }
  if(m < 1 || n < 1){
    printf("\nMatrix dimensions m and n must be positive.\n");
    abort();
  }

  /* Element counts are computed in size_t: m * n evaluated in int overflows
     for large matrices. */
  const size_t mn = (size_t) m * (size_t) n;
  const size_t mm = (size_t) m * (size_t) m;

  /* Initilize matrices A, Atest, Q, Qt and R */
  double * A = malloc(mn * sizeof(double));
  double * Atest = malloc(mn * sizeof(double));
  double * Q = malloc(mm * sizeof(double));
  double * Qt = malloc(mm * sizeof(double));
  double * R = malloc(mn * sizeof(double));
  if(A == NULL || Atest == NULL || Q == NULL || Qt == NULL || R == NULL){
    perror("Allocation failure");
    abort();
  }

  /* Fixed seed so every run factorizes the same matrices */
  srand ( 1 );	 

  timestamp_type time1, time2;
  get_timestamp(&time1); 

  for(int i2 = 0; i2 < iterations; i2++){
    /* Fill up matrix A with elements from [0,10)*/
    for(size_t i = 0; i < mn; i++){
      A[i] = (double) (rand() %1000)/100;
      Atest[i] = A[i];
    }

    WY(A, m, n, Q, Qt, R);
    if(testing){
      testUpperTriangular(R, m, n);
      printf(" R is Upper Triangular! \n");
      testOrthogonal(Q, Qt, m);
      printf(" Q is Orthogonal! \n");
      IsQRequalToA(Q, R, Atest, m, m, m, n);  	
      printf(" A = QR! QR factorization was sucessful!\n");
    }
  }
  get_timestamp(&time2);
  double elapsed = timestamp_diff_in_seconds(time1,time2);
  //double gbs = m * n * 8 * iterations / elapsed / 1e9;
  /* Promote to double before multiplying: m * n * n overflows int already
     for m = n = 1300. */
  double gflops = (double) m * n * n / elapsed / 1000000000; //fabs(m * n * n  - n * n * n / 3); 
  writetofile2("wy_time.txt", m, n, elapsed);
  writetofile2("wy_gfps.txt", m, n, gflops);
  writetofile2("wy_mbyn.txt", m, n, (elapsed/(double)iterations));
  if(verbose) printf("Time elasped = %f s over %d iterations\n", elapsed, iterations);
  
  free(A);
  free(Atest);
  free(Q);
  free(Qt);
  free(R);

  return 0;
}
示例#13
0
void SplitNode(Node *node, double **data, int n, int first, int level) {

/* Creates two branches of the decision tree on the array data. End condition
 * creates leaf if the purity of the node is small or if there are few
 * samples on the branch of node
 *
 * node  = pointer to node in decision tree; node->parent must already be set
 *         (it is read when the positive and negative weights tie)
 * data  = table of unsorted data with features and labels (with the label in
 *         column D-1 and the sample weight in column D); the rows
 *         first..first+n-1 are re-sorted in place
 * n     = length of table (# of rows/samples) on branch of node
 * first = first index of samples on branch of node
 * level = the depth of node in the tree
 *
 * On return node is either a leaf (left == right == NULL, index == -1) or an
 * inner node with index/threshold set and two freshly allocated children
 * that have been split recursively. If the children cannot be allocated the
 * node is left as a leaf.
 */
    
    timestamp_type sort_start, sort_stop, split_start, split_stop;
    double sort_time = 0.;
    double split_time = 0.;

    int max_level = 3;
    int min_points = 6;

    node->left = NULL;
    node->right = NULL;
    node->index = -1;

    //Get initial counts for positive/negative labels
    int i;
    int pos = 0;
    double pos_w = 0;//positive weight
    double tot = 0;//total weight
    for (i = 0; i < n; ++i) {
        tot += data[first+i][D];

        if (data[first+i][D-1] > 0){
            pos += 1;
            pos_w += data[first+i][D];
        }
    }
    int neg = n - pos;
    double neg_w = tot - pos_w;
    
    //Declare class for node in case of pruning on child
    if (pos_w > neg_w)
        node->label = 1;
    else if (pos_w < neg_w)
        node->label = -1;
    else if (node->parent)
        node->label = node->parent->label;
    else {
        //printf("Root node is evenly balanced.\n");
        node->label = 0;
    }

    //If branch is small or almost pure, make leaf
    if (n < min_points) {
        //printf("small branch: %d points\n", n, level);
        return;
    }
    else if (level == max_level) {
        //printf("leaf node: level = max\n");
        return;
    }
    else if (pos == 0 || neg == 0) {
        //printf("pure node\n");
        return;
    }

    ///////////////TEST//////////////////
    //printf("LEVEL: %d\n", level);
    //printf("pos=%d, neg=%d, posw=%f, negw=%f, lab=%f\n", pos, neg, pos_w, neg_w, node->label);
    //printf("GINI: %f\n", GINI(pos_w, tot));
    /////////////////////////////////////


    int col;
    int row; //best row to split at for particular column/feature
    int localrow; //first + localrow = row; receives BestSplit which returns integer in [-1, n-1]
    double threshold; //best threshold to split at for column/feature
    double impurity; //impurity for best split in feature/column
    int bestcol = -1; //feature with best split
    int bestrow = first+n-1; //best row to split for best feature
    double bestthresh = 0.; //threshold split for best feature (data[bestrow][bestcol])
    double Pmin = GINI(pos_w, tot); //minimum impurity seen so far

    //Sort table. Then find best column/feature, threshold, and impurity
    for (col = 0; col < D-1; ++col) {
        //printf("\r%5d/%5d", col, D);
        //fflush(stdout);
        get_timestamp(&sort_start);
        Sort(data, first, first+n-1, col);
        get_timestamp(&sort_stop);
        get_timestamp(&split_start);
        localrow = WeightedBestSplit(data, n, first, col, pos_w, tot, &impurity);
        get_timestamp(&split_stop);
        sort_time += timestamp_diff_in_seconds(sort_start, sort_stop);
        split_time += timestamp_diff_in_seconds(split_start, split_stop);

        //A negative localrow points in front of this branch: data[first-1] is
        //out of bounds when first == 0 and splitting there would create an
        //empty left child, so such a column offers no usable split.
        if (localrow < 0)
            continue;

        row = first + localrow;
        threshold = data[row][col];

        //If current column has better impurity, save col, thresh, and Pmin
        if (impurity < Pmin) {
            bestcol = col;
            bestrow = row;
            bestthresh = threshold;
            Pmin = impurity;
        }
    }
    //printf("\r           \r");
    //printf("Sort  time: %f sec\nSplit time: %f sec\n", sort_time, split_time);

    //If splitting doesn't improve purity (best split is at the end) stop
    if (bestrow == first+n-1) {
        //printf("no improvement\n");
        return;
    }

    //Create right and left children; stay a leaf if that is impossible
    Node *l = malloc(sizeof(Node));
    Node *r = malloc(sizeof(Node));
    if (l == NULL || r == NULL) {
        fprintf(stderr, "SplitNode: could not allocate child nodes\n");
        free(l);
        free(r);
        return;
    }

    //The table was last sorted by column D-2; restore the order of the best feature
    Sort(data, first, first+n-1, bestcol);

    //For feature, threshold with best impurity, save to node attributes
    node->index = bestcol;
    node->threshold = bestthresh;

    printf("Best feature: %d, Best thresh: %f, Impurity: %f\n", node->index, node->threshold, Pmin);

    l->parent = node;
    r->parent = node;
    l->right = NULL;
    l->left = NULL;
    r->right = NULL;
    r->left = NULL;
    
    node->left = l;
    node->right = r;

    //Rows first..bestrow go left, rows bestrow+1..first+n-1 go right
    int first_r = bestrow+1;
    int n_l = first_r - first;
    int n_r = n - n_l;

    //printf("LEFT\n");
    SplitNode(l, data, n_l, first, level+1);
    //printf("RIGHT\n");
    SplitNode(r, data, n_r, first_r, level+1);

    return;
}
示例#14
0
int
main (int argv, char **argc)
{
  /////////////////////////
  ////// SAME IN EVERY FILE
  /////////////////////////

  // create context and command queue
  cl_context       __sheets_context;
  cl_command_queue __sheets_queue;
  int              _i;
  cl_int           __cl_err;
  
  create_context_on(SHEETS_PLAT_NAME,
		    SHEETS_DEV_NAME,
		    0,		/* choose the first (only) available device */
		    &__sheets_context,
		    &__sheets_queue,
		    0);

  // compile kernels
  for (_i = 0; _i < NKERNELS; _i++) {
    compiled_kernels[_i] = kernel_from_string(__sheets_context,
					      kernel_strings[_i],
					      kernel_names[_i],
					      SHEETS_KERNEL_COMPILE_OPTS);
  }

  ////// [END]

  size_t __SIZE_wav = atoi(argc[1]);

  float wav[__SIZE_wav];
  const char *file_name = "mytune.wav";
  int in_thrsh_cnt = 0;

  timestamp_type st;
  timestamp_type end;

  get_timestamp(&st);		
  for (_i = 0; _i < __SIZE_wav; _i++) {
    wav[_i] = (float) rand() / RAND_MAX;
    if (in_thrsh(wav[_i], 0.1112, 0.7888))
      in_thrsh_cnt++;
  }
  get_timestamp(&end);

  printf("cpu execution took %f seconds\n", timestamp_diff_in_seconds(st, end));

  get_timestamp(&st);

  /////////////////
  ////// GFUNC CALL
  /////////////////

  /// create variables for function arguments given as literals
  float __PRIM_band_restrict_ARG2 = 0.1112f;
  float __PRIM_band_restrict_ARG3 = 0.7888f;

  /// return array (always arg0)
  cl_mem __CLMEM_band_restrict_ARG0 = clCreateBuffer(__sheets_context, 
						     CL_MEM_WRITE_ONLY, 
						     sizeof(float) * __SIZE_wav, 
						     NULL, 
						     &__cl_err);
  CHECK_CL_ERROR(__cl_err, "clCreateBuffer");
					      
  /// input arrays
  cl_mem __CLMEM_band_restrict_ARG1 = clCreateBuffer(__sheets_context, 
						     CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, 
						     sizeof(float) * __SIZE_wav, 
						     (void *) wav, 
						     &__cl_err);
  CHECK_CL_ERROR(__cl_err, "clCreateBuffer");

  /// write to device memory
  CALL_CL_GUARDED(clEnqueueWriteBuffer,
		 (__sheets_queue,
		  __CLMEM_band_restrict_ARG1,
		  CL_TRUE,	/* blocking write */
		  0, 		/* no offset */
		  sizeof(float) * __SIZE_wav,
		  wav,
		  0,		/* no wait list */
		  NULL,
		  NULL)
		  );
  
  /// set up kernel arguments
  SET_4_KERNEL_ARGS(compiled_kernels[0],
		    __CLMEM_band_restrict_ARG0,
		    __CLMEM_band_restrict_ARG1,
		    __PRIM_band_restrict_ARG2,
		    __PRIM_band_restrict_ARG3);

  /// enqueue kernel
  cl_event __CLEVENT_band_restrict_CALL;
  CALL_CL_GUARDED(clEnqueueNDRangeKernel,
		  (__sheets_queue,
		   compiled_kernels[0],
		   1,		/* 1 dimension */
		   0,		/* 0 offset */
		   &__SIZE_wav,
		   NULL,	/* let OpenCL break things up */
		   0,		/* no events in wait list */
		   NULL,	/* empty wait list */
		   &__CLEVENT_band_restrict_CALL)
		  );

  /// allocate space for cpu return array
  float out[__SIZE_wav];
  
  CALL_CL_GUARDED(clEnqueueReadBuffer,
		  (__sheets_queue,
		   __CLMEM_band_restrict_ARG0,
		   CL_TRUE,	 /* blocking read */
		   0,		 /* 0 offset */
		   sizeof(float) * __SIZE_wav, 	 /* read whole buffer */
		   (void *) out, /* host pointer */
		   1,		 /* wait for gfunc to finish */
		   &__CLEVENT_band_restrict_CALL, /* "" */
		   NULL)			  /* no need to wait for this call though */
		  );
  
  ////// [END] GFUNC CALL

  get_timestamp(&end);

  printf("gfunc call took %f seconds\n", timestamp_diff_in_seconds(st, end));

  ////// Validate call
  int c = 0;

  for (_i = 0; _i < __SIZE_wav; _i++) {
    if (in_thrsh(out[_i], 0.1112, 0.7888)) {
      c++;
    } else if(out[_i]) {
      exit(1);
    }
  }

  printf("\n");
	 
  assert(in_thrsh_cnt == c);

  //////////////
  ////// CLEANUP
  //////////////

  CALL_CL_GUARDED(clReleaseMemObject, (__CLMEM_band_restrict_ARG0));
  CALL_CL_GUARDED(clReleaseMemObject, (__CLMEM_band_restrict_ARG1));
  for (_i = 0; _i < NKERNELS; _i++) {
    CALL_CL_GUARDED(clReleaseKernel, (compiled_kernels[_i]));
  }
  CALL_CL_GUARDED(clReleaseCommandQueue, (__sheets_queue));
  CALL_CL_GUARDED(clReleaseContext, (__sheets_context));

  return 0;
}
示例#15
0
/*
 * Vector-addition benchmark on an OpenCL device.
 *
 * Usage: <prog> size_of_vector num_adds
 *
 * Fills a[n] = n and b[n] = 2n on the host, uploads both, launches the "sum"
 * kernel from vec-add-kernel.cl num_adds times, prints the average time per
 * launch and the resulting bandwidth, then reads c back and verifies that
 * c[i] == 3*i for every element.
 *
 * Aborts on bad arguments, allocation failure, or a wrong result; returns 0
 * on success.
 */
int main (int argc, char *argv[])
{
  double *a = NULL, *b = NULL, *c = NULL;

  if (argc != 3)
  {
    fprintf(stderr, "Usage: %s size_of_vector num_adds\n", argv[0]);
    abort();
  }

  const cl_long N = (cl_long) atol(argv[1]);
  const int num_adds = atoi(argv[2]);

  // A non-positive size yields an empty launch, and zero launches would turn
  // the per-launch average below into a division by zero while leaving buf_c
  // unwritten before the verification pass.
  if (N <= 0 || num_adds <= 0)
  {
    fprintf(stderr, "size_of_vector and num_adds must be positive\n");
    abort();
  }

  cl_context ctx;
  cl_command_queue queue;
  create_context_on(CHOOSE_INTERACTIVELY, CHOOSE_INTERACTIVELY, 0, &ctx, &queue, 0);

  print_device_info_from_queue(queue);

  // --------------------------------------------------------------------------
  // load kernels
  // --------------------------------------------------------------------------
  char *knl_text = read_file("vec-add-kernel.cl");
  cl_kernel knl = kernel_from_string(ctx, knl_text, "sum", NULL);
  free(knl_text);

  // --------------------------------------------------------------------------
  // allocate and initialize CPU memory
  // --------------------------------------------------------------------------
  // posix_memalign reports failure through its return value; the output
  // pointer is not guaranteed to be NULL on failure, so test the return code.
  if (posix_memalign((void**)&a, 32, N*sizeof(double)) != 0)
  { fprintf(stderr, "alloc a"); abort(); }
  if (posix_memalign((void**)&b, 32, N*sizeof(double)) != 0)
  { fprintf(stderr, "alloc b"); abort(); }
  if (posix_memalign((void**)&c, 32, N*sizeof(double)) != 0)
  { fprintf(stderr, "alloc c"); abort(); }

  for(cl_long n = 0; n < N; ++n)
  {
    a[n] = n;
    b[n] = 2*n;
  }

  // --------------------------------------------------------------------------
  // allocate device memory
  // --------------------------------------------------------------------------
  cl_int status;
  cl_mem buf_a = clCreateBuffer(ctx, CL_MEM_READ_WRITE,
      sizeof(double) * N, 0, &status);
  CHECK_CL_ERROR(status, "clCreateBuffer");

  cl_mem buf_b = clCreateBuffer(ctx, CL_MEM_READ_WRITE,
      sizeof(double) * N, 0, &status);
  CHECK_CL_ERROR(status, "clCreateBuffer");

  cl_mem buf_c = clCreateBuffer(ctx, CL_MEM_READ_WRITE,
      sizeof(double) * N, 0, &status);
  CHECK_CL_ERROR(status, "clCreateBuffer");

  // --------------------------------------------------------------------------
  // transfer to device
  // --------------------------------------------------------------------------
  CALL_CL_SAFE(clEnqueueWriteBuffer(
        queue, buf_a, /*blocking*/ CL_TRUE, /*offset*/ 0,
        N * sizeof(double), a,
        0, NULL, NULL));

  CALL_CL_SAFE(clEnqueueWriteBuffer(
        queue, buf_b, /*blocking*/ CL_TRUE, /*offset*/ 0,
        N * sizeof(double), b,
        0, NULL, NULL));

  // --------------------------------------------------------------------------
  // run code on device
  // --------------------------------------------------------------------------

  // Drain the queue so the uploads are not counted in the timed region.
  CALL_CL_SAFE(clFinish(queue));

  timestamp_type tic, toc;
  get_timestamp(&tic);
  for(int add = 0; add < num_adds; ++add)
  {
    SET_4_KERNEL_ARGS(knl, N, buf_a, buf_b, buf_c);
    size_t  local_size[] = { 128 };
    // Round the global size up to a whole number of work-groups.
    size_t global_size[] = { ((N + local_size[0] - 1)/local_size[0])*
                             local_size[0] };
    CALL_CL_SAFE(clEnqueueNDRangeKernel(queue, knl, 1, NULL,
          global_size, local_size, 0, NULL, NULL));
  }
  CALL_CL_SAFE(clFinish(queue));
  get_timestamp(&toc);

  double elapsed = timestamp_diff_in_seconds(tic,toc)/num_adds;
  printf("%f s\n", elapsed);
  // Three arrays of N doubles are touched per launch (two read, one written).
  printf("%f GB/s\n", 3*N*sizeof(double)/1e9/elapsed);

  // --------------------------------------------------------------------------
  // transfer back & check
  // --------------------------------------------------------------------------
  CALL_CL_SAFE(clEnqueueReadBuffer(
        queue, buf_c, /*blocking*/ CL_TRUE, /*offset*/ 0,
        N * sizeof(double), c,
        0, NULL, NULL));

  for(cl_long i = 0; i < N; ++i)
  {
    if(c[i] != 3*i)
    {
      printf("BAD %ld\n", (long)i);
      abort();
    }
  }
  printf("GOOD\n");

  // --------------------------------------------------------------------------
  // clean up
  // --------------------------------------------------------------------------
  CALL_CL_SAFE(clReleaseMemObject(buf_a));
  CALL_CL_SAFE(clReleaseMemObject(buf_b));
  CALL_CL_SAFE(clReleaseMemObject(buf_c));
  CALL_CL_SAFE(clReleaseKernel(knl));
  CALL_CL_SAFE(clReleaseCommandQueue(queue));
  CALL_CL_SAFE(clReleaseContext(ctx));

  free(a);
  free(b);
  free(c);

  return 0;
}
void main(int argc, char** argv)
{
	//int k = atoi(argv[1]);	
	//int  N  = pow(2,k);
	int	N=1024;
	int	k=10;
	
	float * a = (float *) malloc(sizeof(float)*N* N * 2);
	float * b = (float *) malloc(sizeof(float) *N*N * 2);
	float * c = (float *) malloc(sizeof(float) * N*N* 2);
	float p = 2*M_PI ;	
	for (int i =0; i< N*N; i++)
	{
		a[2*i] = 1;
		a[2*i+1] = 0;
		b[2*i] = 1;
		b[2*i+1] = 0;
	}
#if 0 
	srand(1);
	for(int i =0;i<N*N;i++)
	{	
		a[2*i]=sin(i%N *2 *M_PI);
		//printf("%f\n",uu[2*i]);
		a[2*i+1] =0 ;
	}
#endif
	print_platforms_devices();

	cl_context ctx;
	cl_command_queue queue;
	create_context_on("NVIDIA","GeForce GTX 590",0,&ctx,&queue,0);

	cl_int status;
	cl_mem buf_a = clCreateBuffer(ctx, CL_MEM_READ_WRITE, 
	sizeof(float) *N *N* 2 , 0, &status);
	CHECK_CL_ERROR(status, "clCreateBuffer");

	cl_mem buf_b = clCreateBuffer(ctx, CL_MEM_READ_WRITE, 
	sizeof(float)  * N *N* 2 , 0, &status);
	CHECK_CL_ERROR(status, "clCreateBuffer");
	
	cl_mem buf_c = clCreateBuffer(ctx, CL_MEM_READ_WRITE, 
	sizeof(float) * N *N* 2 , 0, &status);
	CHECK_CL_ERROR(status, "clCreateBuffer");

	cl_mem buf_d = clCreateBuffer(ctx, CL_MEM_READ_WRITE, 
	sizeof(float)*N *N* 2 , 0, &status);
	CHECK_CL_ERROR(status, "clCreateBuffer");
	cl_mem buf_e = clCreateBuffer(ctx, CL_MEM_READ_WRITE, 
	sizeof(float) *N *N* 2 , 0, &status);
	CHECK_CL_ERROR(status, "clCreateBuffer");

	cl_mem buf_f = clCreateBuffer(ctx, CL_MEM_READ_WRITE, 
	sizeof(float) *N *N* 2 , 0, &status);
	CHECK_CL_ERROR(status, "clCreateBuffer");

	cl_mem buf_g = clCreateBuffer(ctx, CL_MEM_READ_WRITE, 
	sizeof(float) *N *N* 2 , 0, &status);
	CHECK_CL_ERROR(status, "clCreateBuffer");

	CALL_CL_GUARDED(clEnqueueWriteBuffer, (
	queue, buf_a, /*blocking*/ CL_TRUE, /*offset*/ 0,
	sizeof(float) *N*N*2, a,
	0, NULL, NULL));

	CALL_CL_GUARDED(clEnqueueWriteBuffer, (
	queue, buf_b, /*blocking*/ CL_TRUE, /*offset*/ 0,
	sizeof(float) *N *N* 2, b,
	0, NULL, NULL));

	CALL_CL_GUARDED(clEnqueueWriteBuffer, (
	queue, buf_c, /*blocking*/ CL_TRUE, /*offset*/ 0,
	sizeof(float)  *N* N*2, c,
	0, NULL, NULL));

	char *knl_text = read_file("vec_add.cl");
	cl_kernel vec_add = kernel_from_string(ctx, knl_text, "sum", NULL);
	free(knl_text);

	knl_text = read_file("mat_etr_mul.cl");
	cl_kernel mat_etr_mul = kernel_from_string(ctx, knl_text, "mult", NULL);
	free(knl_text);


	knl_text = read_file("radix-4-float.cl");
	cl_kernel fft1D = kernel_from_string(ctx, knl_text, "fft1D", NULL);
	free(knl_text);

	knl_text = read_file("radix-4-init.cl");
	cl_kernel fft_init = kernel_from_string(ctx, knl_text, "fft1D_init", NULL);
	free(knl_text);

	knl_text = read_file("radix-4-interm.cl");
	cl_kernel fft_interm = kernel_from_string(ctx, knl_text, "fft1D", NULL);
	free(knl_text);

	knl_text = read_file("transpose-soln-gpu.cl");
	cl_kernel mat_trans = kernel_from_string(ctx, knl_text, "transpose", NULL);
	free(knl_text);

	knl_text = read_file("radix-4-modi.cl");
	cl_kernel fft_init_w = kernel_from_string(ctx, knl_text, "fft1D_init", NULL);
	free(knl_text);

	knl_text = read_file("vec_zero.cl");
	cl_kernel vec_zero = kernel_from_string(ctx, knl_text, "zero", NULL);
	free(knl_text);

	knl_text = read_file("reduction.cl");
	cl_kernel reduct_mul = kernel_from_string(ctx, knl_text, "reduction_mult", NULL);
	free(knl_text);

	knl_text = read_file("reduction1D.cl");
	cl_kernel reduct = kernel_from_string(ctx, knl_text, "reduction", NULL);
	free(knl_text);

	knl_text = read_file("reduction-init.cl");
	cl_kernel reduct_init = kernel_from_string(ctx, knl_text, "reduction_init", NULL);
	free(knl_text);


	knl_text = read_file("reduct-energy.cl");
	cl_kernel reduct_eng = kernel_from_string(ctx, knl_text, "reduction_eng", NULL);
	free(knl_text);

	knl_text = read_file("resid.cl");
	cl_kernel resid = kernel_from_string(ctx, knl_text, "resid", NULL);
	free(knl_text);

	knl_text = read_file("resid-init.cl");
	cl_kernel resid_init = kernel_from_string(ctx, knl_text, "resid_init", NULL);
	free(knl_text);


	knl_text = read_file("radix-4-big.cl");
	cl_kernel fft_big = kernel_from_string(ctx, knl_text, "fft1D_big", NULL);
	free(knl_text);
	knl_text = read_file("radix-4-big-clean.cl");
	cl_kernel fft_clean = kernel_from_string(ctx, knl_text, "fft1D_clean", NULL);
	free(knl_text);

	knl_text = read_file("radix-4-2D.cl");
	cl_kernel fft_2D = kernel_from_string(ctx, knl_text, "fft2D_big", NULL);
	free(knl_text);

	knl_text = read_file("radix-4-2D-clean.cl");
	cl_kernel fft_2D_clean = kernel_from_string(ctx, knl_text, "fft2D_clean", NULL);
	free(knl_text);


	knl_text = read_file("mat-trans-3D.cl");
	cl_kernel mat_trans_3D = kernel_from_string(ctx, knl_text, "transpose_3D", NULL);
	free(knl_text);
	int Ns =1 ;
	int direction = 1;
	timestamp_type time1, time2;
	
	struct parameter param;

	param.N = N;
	param.epsilon = 0.1;
	param.s =1;
	
	float kk =1e-4;



	param.h = 2*PI/N;
	param.N = N;
	
 	param.maxCG = 1000;
	param.maxN = 5;
	
	//Minimum and starting time step
	float mink = 1e-7;
	float startk = 1e-4;

	// Tolerances
	param.Ntol = 1e-4;
	param.cgtol = 1e-7;
	float ksafety = 0.8;
	float kfact = 1.3;
	float kfact2 = 1/1.3;
	float Nfact = 0.7;
	float CGfact = 0.7;
	double elapsed ;

	CALL_CL_GUARDED(clFinish, (queue));
	get_timestamp(&time1);

//for(int s=0;s<100;s++)
	//fft_1D_big(buf_a,buf_b,buf_c,N,fft_big,fft_clean,mat_trans,queue,direction,0);
	//fft_1D_new(buf_a,buf_b,buf_c,N,fft_init,fft_interm, fft1D,queue,direction,0);
	//fft_1D(buf_a,buf_b,buf_c,N,fft_init, fft1D,queue,direction,0);
	//fft2D(buf_a,buf_b,buf_c,buf_d,N,fft_init,fft1D,mat_trans,queue, 1);
	//fft2D_new(buf_a,buf_b,buf_c,buf_d,N,fft_init,fft_interm,fft1D,mat_trans,queue, 1);
	//fft2D_big(buf_a,buf_b,buf_c,buf_d,N,fft_big,fft_clean,mat_trans,queue,direction);
	//fft2D_big_new(buf_a,buf_b,buf_c,buf_d,N,fft_2D,fft_2D_clean,
			//mat_trans,mat_trans_3D,queue,direction);
	//fft_w(buf_a,buf_b,buf_c,buf_d,buf_e,N,0.1,0,1,fft_init_w,fft_init,fft1D,mat_trans,queue);
#if 0
	frhs(buf_a,buf_b,buf_c,buf_d,buf_e,&param,fft1D_init,fft1D,mat_trans,
		 vec_add, queue);
#endif
#if 0	
	float E1 = energy(buf_a, buf_b, buf_c,buf_d, buf_e,buf_f,1e-4, 
				&param, fft_init,fft1D,mat_trans,reduct_eng,
				reduct,queue);
#endif

	
	//float reside = residual(buf_a,buf_b,resid,resid_init,queue,N*N);
	/*fft_d_q(buf_a,buf_b,buf_c,buf_d, N,0.1,k ,1, 
		 fft1D_init,
		fft1D,mat_trans,queue);*/
	//for(int j= 0;j<N;j++)
	//{
		//fft_1D_w_orig(buf_a,buf_b,buf_c,N,fft1D_init,fft1D,queue,1,j);
	//}
	//fft_shar(buf_a,buf_b,buf_c,buf_d,N,0.1,0,1,fft1D_init,fft1D,mat_trans,queue);
	//mat__trans(buf_a,buf_b,N,mat_trans,queue,4,0.1,0,1);
	//double elapsed = reduction_mult(buf_a, buf_b,buf_c,N*N,reduct_mul,reduct,queue);

	CALL_CL_GUARDED(clFinish, (queue));
	get_timestamp(&time1);
	fft_1D_big(buf_a,buf_b,buf_c,N*N,fft_big,fft_clean,mat_trans,queue,direction,0);
	CALL_CL_GUARDED(clFinish, (queue));
	get_timestamp(&time2);
	elapsed = timestamp_diff_in_seconds(time1,time2);
	printf("Hierarchy 1D FFT of size %d  array  on gpu takes %f s\n", N*N,elapsed);
	printf("achieve %f GFLOPS \n",6*2*N*N*k/elapsed*1e-9);
	printf("---------------------------------------------\n");

	


	CALL_CL_GUARDED(clFinish, (queue));

	
	get_timestamp(&time1);
	fft2D(buf_a,buf_b,buf_c,buf_d,N,fft_init,fft1D,mat_trans,queue, 1);
	CALL_CL_GUARDED(clFinish, (queue));
	get_timestamp(&time2);
	elapsed = timestamp_diff_in_seconds(time1,time2);
	printf("Navie 2D FFT of size %d * %d matrix  on gpu takes %f s\n", N,N,elapsed);
	printf("achieve %f GFLOPS \n",6*2*N*N*k/elapsed*1e-9);
	printf("---------------------------------------------\n");
	//printf("data access from global achieve %f GB/s\n",sizeof(float)*2*16*N*N/elapsed*1e-9);
	CALL_CL_GUARDED(clFinish, (queue));
	get_timestamp(&time1);
	fft2D_new(buf_a,buf_b,buf_c,buf_d,N,fft_init,fft_interm,fft1D,mat_trans,queue, 1);
	CALL_CL_GUARDED(clFinish, (queue));
	get_timestamp(&time2);
	elapsed = timestamp_diff_in_seconds(time1,time2);
	printf("local data exchange 2D FFT of size %d * %d matrix  on gpu takes %f s\n", N,N,elapsed);
	printf("achieve %f GFLOPS \n",6*2*N*N*k/elapsed*1e-9);
	printf("---------------------------------------------\n");


	CALL_CL_GUARDED(clFinish, (queue));
	get_timestamp(&time1);
	fft2D_big(buf_a,buf_b,buf_c,buf_d,N,fft_big,fft_clean,mat_trans,queue,direction);
	CALL_CL_GUARDED(clFinish, (queue));
	get_timestamp(&time2);
	elapsed = timestamp_diff_in_seconds(time1,time2);
	printf("Hierarchy 2D FFT of size %d * %d matrix  on gpu takes %f s\n", N,N,elapsed);
	printf("achieve %f GFLOPS \n",6*2*N*N*k/elapsed*1e-9);
	printf("---------------------------------------------\n");


	CALL_CL_GUARDED(clFinish, (queue));
	get_timestamp(&time1);
	fft2D_big_new(buf_a,buf_b,buf_c,buf_d,N,fft_2D,fft_2D_clean,
			mat_trans,mat_trans_3D,queue,direction);
	CALL_CL_GUARDED(clFinish, (queue));
	get_timestamp(&time2);
	elapsed = timestamp_diff_in_seconds(time1,time2);
	printf("Using 2D kernel 2D FFT of size %d * %d matrix  on gpu takes %f s\n", N,N,elapsed);
	printf("achieve %f GFLOPS \n",6*2*N*N*k/elapsed*1e-9);
	printf("---------------------------------------------\n");



	get_timestamp(&time1);






	direction = -1;
	//fft_1D(buf_b,buf_c,buf_d,N,fft_init, fft1D,queue,direction,0);
	fft2D(buf_b,buf_c,buf_d,buf_e,N,fft_init,fft1D,mat_trans,queue, direction);
	//fft2D_new(buf_b,buf_c,buf_e,buf_d,N,fft_init,fft_interm,fft1D,mat_trans,queue, -1);
	//fft2D_big(buf_b,buf_c,buf_d,buf_e,N,fft_big,fft_clean,mat_trans,queue,direction);
	CALL_CL_GUARDED(clFinish, (queue));
	get_timestamp(&time2);
	elapsed = timestamp_diff_in_seconds(time1,time2);
	//printf("1D inverse %f s\n", elapsed);
	#if 0
	float test;
	CALL_CL_GUARDED(clFinish, (queue));
	CALL_CL_GUARDED(clEnqueueReadBuffer, (
        	queue, buf_b, /*blocking*/ CL_TRUE, /*offset*/ 0,
       		sizeof(float), &test,
        	0, NULL, NULL));
	

		printf("test success and %f \n",test);		
	

	#endif
	#if 0
	CALL_CL_GUARDED(clFinish, (queue));
	CALL_CL_GUARDED(clEnqueueReadBuffer, (
        	queue, buf_c, /*blocking*/ CL_TRUE, /*offset*/ 0,
       		2*N*N* sizeof(float), c,
        	0, NULL, NULL));
	

	/*for(int i =0; i<  N; i++)
	{
		printf("a%f+ i*",a[2*i]);		
		printf("%f\n",a[2*i+1]);
	}*/
	int T = 10<N? 10:N ;
	for(int i =0; i<  T; i++)
	{
		printf("%f + i*",a[2*i]);		
		printf("%f\t",a[2*i+1]);
		printf("%f + i*",c[2*i]);		
		printf("%f\n",c[2*i+1]);
	}

	#endif 
/*	for( Ns = 1;Ns < N; Ns *= 2 )
	{
		for (int j = 0; j<N/2; j++)
		{
			fftiteration(j,N,Ns,a,b);
		}
		float * d;
		d = a ;
		a = b;
		b = d;
		//printf("ok\n");

	}

*/


	
	CALL_CL_GUARDED(clReleaseMemObject, (buf_a));
	CALL_CL_GUARDED(clReleaseMemObject, (buf_b));
	CALL_CL_GUARDED(clReleaseMemObject, (buf_c));
	CALL_CL_GUARDED(clReleaseMemObject, (buf_d));
	CALL_CL_GUARDED(clReleaseMemObject, (buf_e));
	CALL_CL_GUARDED(clReleaseKernel, (fft1D));
	CALL_CL_GUARDED(clReleaseKernel, (fft_init));
	CALL_CL_GUARDED(clReleaseKernel, (vec_add));
	CALL_CL_GUARDED(clReleaseKernel, (reduct_mul));
	CALL_CL_GUARDED(clReleaseKernel, (reduct));
	CALL_CL_GUARDED(clReleaseKernel, (mat_trans));
	CALL_CL_GUARDED(clReleaseCommandQueue, (queue));
	CALL_CL_GUARDED(clReleaseContext, (ctx));

}
示例#17
0
文件: serial.c 项目: gungun1010/GPUAV
/*
 * Serial (CPU-only) signature scanner.
 *
 * Lists every regular file under MOUNT into filesToScan.txt using find(1),
 * then, for each listed file, loads it into memory and searches it for every
 * hex-encoded signature in mainCPUsig5k.ndb with boyer_moore(). Each
 * (file, signature) hit is reported and counted. Prints the total elapsed
 * time and the hit count at the end.
 *
 * Exits with EXIT_FAILURE if the find command cannot be built or run, or if
 * the file list or the signature database cannot be opened.
 */
int main(int argc, char **argv)
{
    char cmd[1000];
    FILE * fp;
    FILE * sigDb;
    char * fileName = NULL;
    char * sigPattern = NULL;
    size_t len = 0;      /* getline capacity for fileName */
    size_t sigCap = 0;   /* getline capacity for sigPattern */
    size_t sigLen = 0;   /* decoded signature length in bytes */
    ssize_t readFile;
    ssize_t readSig;
    uint8_t *fileBuf;
    uint8_t *sigBuf;
    size_t sizeFb;
    uint8_t *found;
    int count=0;

    /* Bounded formatting: an overly long MOUNT must not overflow cmd. */
    int cmdLen = snprintf(cmd, sizeof cmd,
                          "find %s -type f > filesToScan.txt", MOUNT);
    if (cmdLen < 0 || (size_t)cmdLen >= sizeof cmd) {
        fprintf(stderr, "find command does not fit in %zu bytes\n", sizeof cmd);
        exit(EXIT_FAILURE);
    }

    if (system(cmd) == -1) {
        perror("system");
        exit(EXIT_FAILURE);
    }

    fp = fopen("filesToScan.txt", "r");
    if (fp == NULL)
        exit(EXIT_FAILURE);

    timestamp_type time1, time2;
    get_timestamp(&time1);

    while ((readFile = getline(&fileName, &len, fp)) != -1) {
        /* fileName still carries its trailing newline here. */
        printf("scaning: %s", fileName);

        remove_char_from_string('\n',fileName);
        loadFile(fileName, &fileBuf, &sizeFb);

        /* The signature database is re-read from the start for every file. */
        sigDb = fopen("mainCPUsig5k.ndb","r");
        if (sigDb == NULL) {
            perror("mainCPUsig5k.ndb");
            exit(EXIT_FAILURE);
        }

        /* sigCap is owned by getline; the decoded length is tracked in
           sigLen so getline's view of its buffer size is never clobbered. */
        while ((readSig = getline(&sigPattern, &sigCap, sigDb)) != -1){
            remove_char_from_string('\n',sigPattern);
            /* Two hex digits encode one signature byte. */
            sigLen = strlen(sigPattern)/2;

            hex2data(&sigBuf, sigPattern);

            found = boyer_moore(fileBuf,sizeFb, sigBuf, sigLen);
            if(found != NULL){
                printf("    found virus in %s\n", fileName);
                count++;
            }
            free(sigBuf);

        }

        fclose(sigDb);
        printf("\n");
        free(fileBuf);
    }

    fclose(fp);
    get_timestamp(&time2);
    double elapsed = timestamp_diff_in_seconds(time1,time2);
    printf("%f s\n", elapsed);
    printf("virus count: %d\n", count);

    /* getline allocated these line buffers; release them. */
    free(fileName);
    free(sigPattern);

    return 0;
}
示例#18
0
/*
 * OpenMP Jacobi iteration for a 1D problem with N interior points.
 *
 * Usage: <prog> N MAX_ITER
 *
 * u and unew hold N+2 values (indices 0 and N+1 are boundary values that
 * stay 0). Each iteration computes
 *     unew[j] = (h2*f + u[j-1] + u[j+1]) / 2      for j = 1..N
 * in parallel, then copies unew back into u in parallel. Exactly MAX_ITER
 * iterations are run; no residual is computed. Prints the iteration count
 * and the elapsed wall time.
 *
 * Exits with EXIT_FAILURE on bad arguments or allocation failure.
 */
int main (int argc, char *argv[]){
	
	int N, i, j; 
	double  h, h2, f;
	double *u, *unew;
	int MAX_ITER;

	if (argc != 3){
		fprintf(stderr, "must input discretization size N and # of iterations\n");
		// A usage error is a failure; do not report success to the caller.
		exit(EXIT_FAILURE);
	}
	
	N = atoi( argv[1] );
	MAX_ITER = atoi( argv[2] );
	if (N < 1 || MAX_ITER < 0){
		fprintf(stderr, "N must be >= 1 and # of iterations must be >= 0\n");
		exit(EXIT_FAILURE);
	}

	h = 1./(N+1); h2 = h*h;
	f = 1.;

	// allocate u, unew (interior points plus the two boundary points)
	u    = (double *) malloc( ((size_t) N + 2) * sizeof(double));
	unew = (double *) malloc( ((size_t) N + 2) * sizeof(double));
	if (!u || !unew){
		fprintf(stderr, "failed to allocate u/unew\n");
		free(u); free(unew);
		exit(EXIT_FAILURE);
	}

 	// fill arrays
	for(j = 0; j<=N+1; j++){
		u[j]    = 0.;
		unew[j] = 0.;
	}

	#pragma omp parallel
	{
		printf("Hello, I am thread %d of %d\n", 
			omp_get_thread_num(), 
			omp_get_num_threads());
	}

	timestamp_type start_t, stop_t;	
	get_timestamp(&start_t);

	for(i = 0; i < MAX_ITER; i++ ){

		// jacobi iteration: reads only u, writes only unew
#pragma omp parallel for default(none) \
			 schedule(static) \
			 shared(u,unew,h2,f,N)
		for(j= 1; j <=N ; j++){
			unew[j] = (h2*f + u[j-1] + u[j+1] ) * 0.5;
		}

		// copy work: publish the new iterate before the next sweep
#pragma omp parallel for default(none) \
			 schedule(static) \
			 shared(u,unew,h2,f,N)
		for(j= 1; j <=N ; j++){
			u[j] = unew[j];
		}
	}

	get_timestamp(&stop_t);
	double elapsed = timestamp_diff_in_seconds(start_t, stop_t);
	
	printf("Total number of iterations is %d\n", MAX_ITER);
	printf("Time elapsed is %f seconds.\n", elapsed);
	
	free(u); free(unew);	
	
	return 0;
}
示例#19
0
/*
 * Sum-reduction benchmark on an OpenCL device.
 *
 * Usage: <prog> N nloops
 *
 * Fills a[] with N pseudo-random doubles (drand48, fixed seed), uploads it,
 * and nloops times reduces it on the device with the "reduction" kernel from
 * full_reduction.cl: one pass over a[], then repeated passes that ping-pong
 * between two partial-sum buffers until one work-group's worth of output
 * remains. The remaining partial sums are read back and added on the host.
 * Prints the average time per loop, the bandwidth, and the CPU and GPU sums
 * with their relative difference.
 *
 * Aborts on bad arguments or allocation failure; returns 0 on success.
 */
int main (int argc, char *argv[])
{
  double *a = NULL, *a_reduced = NULL;

  if (argc != 3)
  {
    fprintf(stderr, "Usage: %s N nloops\n", argv[0]);
    abort();
  }

  const cl_long N = (cl_long) atol(argv[1]);
  const int nloops = atoi(argv[2]);

  // Zero loops would make the per-loop average a division by zero, and a
  // non-positive N yields zero-sized buffers and launches.
  if (N <= 0 || nloops <= 0)
  {
    fprintf(stderr, "N and nloops must be positive\n");
    abort();
  }

  // Number of work-groups for the first pass: ceil(N/LDIM), then ceil(./8).
  cl_long Ngroups = (N + LDIM  - 1)/LDIM;
  Ngroups = (Ngroups + 8  - 1)/8;

  cl_context ctx;
  cl_command_queue queue;
  create_context_on(CHOOSE_INTERACTIVELY, CHOOSE_INTERACTIVELY, 0, &ctx, &queue, 0);

  print_device_info_from_queue(queue);

  // --------------------------------------------------------------------------
  // load kernels
  // --------------------------------------------------------------------------
  char *knl_text = read_file("full_reduction.cl");
  cl_kernel knl = kernel_from_string(ctx, knl_text, "reduction",
      "-DLDIM=" STRINGIFY(LDIM));
  free(knl_text);

  // --------------------------------------------------------------------------
  // allocate and initialize CPU memory
  // --------------------------------------------------------------------------
  // posix_memalign reports failure through its return value; the output
  // pointer is not guaranteed to be NULL on failure, so test the return code.
  if (posix_memalign((void**)&a, 32, N*sizeof(double)) != 0)
  { fprintf(stderr, "alloc a"); abort(); }
  if (posix_memalign((void**)&a_reduced, 32, Ngroups*sizeof(double)) != 0)
  { fprintf(stderr, "alloc a_reduced"); abort(); }

  srand48(8);
  for(cl_long n = 0; n < N; ++n)
    a[n] = (double)drand48();

  // --------------------------------------------------------------------------
  // allocate device memory
  // --------------------------------------------------------------------------
  cl_int status;
  cl_mem buf_a = clCreateBuffer(ctx, CL_MEM_READ_WRITE, N*sizeof(double),
      0, &status);
  CHECK_CL_ERROR(status, "clCreateBuffer");

  // Two partial-sum buffers; successive passes alternate between them.
  cl_mem buf_a_reduced[2];
  buf_a_reduced[0] = clCreateBuffer(ctx, CL_MEM_READ_WRITE,
      Ngroups*sizeof(double), 0, &status);
  // Check each creation separately: the next call overwrites status.
  CHECK_CL_ERROR(status, "clCreateBuffer");
  buf_a_reduced[1] = clCreateBuffer(ctx, CL_MEM_READ_WRITE,
      Ngroups*sizeof(double), 0, &status);
  CHECK_CL_ERROR(status, "clCreateBuffer");

  // --------------------------------------------------------------------------
  // transfer to device
  // --------------------------------------------------------------------------
  CALL_CL_SAFE(clEnqueueWriteBuffer(
        queue, buf_a, /*blocking*/ CL_TRUE, /*offset*/ 0,
        N*sizeof(double), a,
        0, NULL, NULL));

  timestamp_type tic, toc;
  double elapsed;

  // --------------------------------------------------------------------------
  // run reduction_simple on device
  // --------------------------------------------------------------------------

  printf("Simple Reduction\n");
  double sum_gpu = 0.0;
  CALL_CL_SAFE(clFinish(queue));
  get_timestamp(&tic);
  for(int loop = 0; loop < nloops; ++loop)
  {
    int r = 0;  // index of the partial-sum buffer holding the latest output
    size_t Ngroups_loop = Ngroups;
    SET_3_KERNEL_ARGS(knl, N, buf_a, buf_a_reduced[r]);

    size_t local_size[] = { LDIM };
    size_t global_size[] = { Ngroups_loop*LDIM };

    CALL_CL_SAFE(clEnqueueNDRangeKernel(queue, knl, 1, NULL,
          global_size, local_size, 0, NULL, NULL));

    // Keep reducing the partial sums until a single work-group's output is left.
    while(Ngroups_loop > 1)
    {
      cl_long N_reduce = Ngroups_loop;
      Ngroups_loop = (N_reduce + LDIM  - 1)/LDIM;
      Ngroups_loop = (Ngroups_loop + 8  - 1)/8;

      size_t local_size[] = { LDIM };
      size_t global_size[] = { Ngroups_loop*LDIM };

      SET_3_KERNEL_ARGS(knl, N_reduce, buf_a_reduced[r], buf_a_reduced[(r+1)%2]);

      CALL_CL_SAFE(clEnqueueNDRangeKernel(queue, knl, 1, NULL,
            global_size, local_size, 0, NULL, NULL));

      r = (r+1)%2;
    }

    CALL_CL_SAFE(clEnqueueReadBuffer(
          queue, buf_a_reduced[r], /*blocking*/ CL_TRUE, /*offset*/ 0,
          Ngroups_loop*sizeof(double), a_reduced, 0, NULL, NULL));

    // Finish the reduction on the host.
    sum_gpu = 0.0;
    for(size_t n = 0; n < Ngroups_loop; ++n)
      sum_gpu += a_reduced[n];
  }
  CALL_CL_SAFE(clFinish(queue));
  get_timestamp(&toc);

  elapsed = timestamp_diff_in_seconds(tic,toc)/nloops;
  printf("%f s\n", elapsed);
  printf("%f GB/s\n", N*sizeof(double)/1e9/elapsed);

  double sum_cpu = 0.0;
  for(cl_long n = 0; n < N; ++n)
    sum_cpu += a[n];

  printf("Sum CPU: %e\n", sum_cpu);

  printf("Sum GPU: %e\n", sum_gpu);

  printf("Relative Error: %e\n", fabs(sum_cpu-sum_gpu)/sum_gpu);

  // --------------------------------------------------------------------------
  // clean up
  // --------------------------------------------------------------------------
  CALL_CL_SAFE(clReleaseMemObject(buf_a));
  CALL_CL_SAFE(clReleaseMemObject(buf_a_reduced[0]));
  CALL_CL_SAFE(clReleaseMemObject(buf_a_reduced[1]));
  CALL_CL_SAFE(clReleaseKernel(knl));
  CALL_CL_SAFE(clReleaseCommandQueue(queue));
  CALL_CL_SAFE(clReleaseContext(ctx));

  free(a);
  free(a_reduced);

  return 0;
}
示例#20
0
int main(int argc, char *argv[])
{
  int error, xsize, ysize, rgb_max;
  int *r, *b, *g;

  float *gray, *congray, *congray_cl;

  // identity kernel
  // float filter[] = {
  //   0,0,0,0,0,0,0,
  //   0,0,0,0,0,0,0,
  //   0,0,0,0,0,0,0,
  //   0,0,0,1,0,0,0,
  //   0,0,0,0,0,0,0,
  //   0,0,0,0,0,0,0,
  //   0,0,0,0,0,0,0,
  // };

  // 45 degree motion blur
  float filter[] =
    {0,      0,      0,      0,      0, 0.0145,      0,
     0,      0,      0,      0, 0.0376, 0.1283, 0.0145,
     0,      0,      0, 0.0376, 0.1283, 0.0376,      0,
     0,      0, 0.0376, 0.1283, 0.0376,      0,      0,
     0, 0.0376, 0.1283, 0.0376,      0,      0,      0,
0.0145, 0.1283, 0.0376,      0,      0,      0,      0,
     0, 0.0145,      0,      0,      0,      0,      0};

  // mexican hat kernel
  // float filter[] = {
  //   0, 0,-1,-1,-1, 0, 0,
  //   0,-1,-3,-3,-3,-1, 0,
  //  -1,-3, 0, 7, 0,-3,-1,
  //  -1,-3, 7,24, 7,-3,-1,
  //  -1,-3, 0, 7, 0,-3,-1,
  //   0,-1,-3,-3,-3,-1, 0,
  //   0, 0,-1,-1,-1, 0, 0
  // };


  if(argc != 3)
  {
    fprintf(stderr, "Usage: %s image.ppm num_loops\n", argv[0]);
    abort();
  }

  const char* filename = argv[1];
  const int num_loops = atoi(argv[2]);


  // --------------------------------------------------------------------------
  // load image
  // --------------------------------------------------------------------------
  printf("Reading ``%s''\n", filename);
  ppma_read(filename, &xsize, &ysize, &rgb_max, &r, &g, &b);
  printf("Done reading ``%s'' of size %dx%d\n", filename, xsize, ysize);

  // --------------------------------------------------------------------------
  // allocate CPU buffers
  // --------------------------------------------------------------------------
  posix_memalign((void**)&gray, 32, xsize*ysize*sizeof(float));
  if(!gray) { fprintf(stderr, "alloc gray"); abort(); }
  posix_memalign((void**)&congray, 32, xsize*ysize*sizeof(float));
  if(!congray) { fprintf(stderr, "alloc gray"); abort(); }
  posix_memalign((void**)&congray_cl, 32, xsize*ysize*sizeof(float));
  if(!congray_cl) { fprintf(stderr, "alloc gray"); abort(); }

  // --------------------------------------------------------------------------
  // convert image to grayscale
  // --------------------------------------------------------------------------
  for(int n = 0; n < xsize*ysize; ++n)
    gray[n] = (0.21f*r[n])/rgb_max + (0.72f*g[n])/rgb_max + (0.07f*b[n])/rgb_max;

  // --------------------------------------------------------------------------
  // execute filter on cpu
  // --------------------------------------------------------------------------
  for(int i = HALF_FILTER_WIDTH; i < ysize - HALF_FILTER_WIDTH; ++i)
  {
    for(int j = HALF_FILTER_WIDTH; j < xsize - HALF_FILTER_WIDTH; ++j)
    {
      float sum = 0;
      for(int k = -HALF_FILTER_WIDTH; k <= HALF_FILTER_WIDTH; ++k)
      {
        for(int l = -HALF_FILTER_WIDTH; l <= HALF_FILTER_WIDTH; ++l)
        {
          sum += gray[(i+k)*xsize + (j+l)] *
            filter[(k+HALF_FILTER_WIDTH)*FILTER_WIDTH + (l+HALF_FILTER_WIDTH)];
        }
      }
      congray[i*xsize + j] = sum;
    }
  }

  // --------------------------------------------------------------------------
  // output cpu filtered image
  // --------------------------------------------------------------------------
  printf("Writing cpu filtered image\n");
  for(int n = 0; n < xsize*ysize; ++n)
    r[n] = g[n] = b[n] = (int)(congray[n] * rgb_max);
  error = ppma_write("output_cpu.ppm", xsize, ysize, r, g, b);
  if(error) { fprintf(stderr, "error writing image"); abort(); }

  // --------------------------------------------------------------------------
  // get an OpenCL context and queue
  // --------------------------------------------------------------------------
  cl_context ctx;
  cl_command_queue queue;
  create_context_on(CHOOSE_INTERACTIVELY, CHOOSE_INTERACTIVELY, 0, &ctx, &queue, 0);
  print_device_info_from_queue(queue);

  // --------------------------------------------------------------------------
  // load kernels
  // --------------------------------------------------------------------------
  char *knl_text = read_file("convolution.cl");
  cl_kernel knl = kernel_from_string(ctx, knl_text, "convolution", NULL);
  free(knl_text);

#ifdef NON_OPTIMIZED
  int deviceWidth = xsize;
#else
  int deviceWidth = ((xsize + WGX - 1)/WGX)* WGX;
#endif
  int deviceHeight = ysize;
  size_t deviceDataSize = deviceHeight*deviceWidth*sizeof(float);

  // --------------------------------------------------------------------------
  // allocate device memory
  // --------------------------------------------------------------------------
  cl_int status;
  cl_mem buf_gray = clCreateBuffer(ctx, CL_MEM_READ_ONLY,
     deviceDataSize, 0, &status);
  CHECK_CL_ERROR(status, "clCreateBuffer");

  cl_mem buf_congray = clCreateBuffer(ctx, CL_MEM_WRITE_ONLY,
      deviceDataSize, 0, &status);
  CHECK_CL_ERROR(status, "clCreateBuffer");

  cl_mem buf_filter = clCreateBuffer(ctx, CL_MEM_READ_ONLY,
     FILTER_WIDTH*FILTER_WIDTH*sizeof(float), 0, &status);
  CHECK_CL_ERROR(status, "clCreateBuffer");

  // --------------------------------------------------------------------------
  // transfer to device
  // --------------------------------------------------------------------------
#ifdef NON_OPTIMIZED
  CALL_CL_SAFE(clEnqueueWriteBuffer(
        queue, buf_gray, /*blocking*/ CL_TRUE, /*offset*/ 0,
        deviceDataSize, gray, 0, NULL, NULL));
#else
  size_t buffer_origin[3] = {0,0,0};
  size_t host_origin[3] = {0,0,0};
  size_t region[3] = {deviceWidth*sizeof(float), ysize, 1};
  clEnqueueWriteBufferRect(queue, buf_gray, CL_TRUE,
                           buffer_origin, host_origin, region,
                           deviceWidth*sizeof(float), 0, xsize*sizeof(float), 0,
                           gray, 0, NULL, NULL);
#endif

  CALL_CL_SAFE(clEnqueueWriteBuffer(
        queue, buf_filter, /*blocking*/ CL_TRUE, /*offset*/ 0,
        FILTER_WIDTH*FILTER_WIDTH*sizeof(float), filter, 0, NULL, NULL));

  // --------------------------------------------------------------------------
  // run code on device
  // --------------------------------------------------------------------------

  cl_int rows = ysize;
  cl_int cols = xsize;
  cl_int filterWidth = FILTER_WIDTH;
  cl_int paddingPixels = 2*HALF_FILTER_WIDTH;

  size_t local_size[] = { WGX, WGY };
  size_t global_size[] = {
    ((xsize-paddingPixels + local_size[0] - 1)/local_size[0])* local_size[0],
    ((ysize-paddingPixels + local_size[1] - 1)/local_size[1])* local_size[1],
  };

  cl_int localWidth = local_size[0] + paddingPixels;
  cl_int localHeight = local_size[1] + paddingPixels;
  size_t localMemSize = localWidth * localHeight * sizeof(float);

  CALL_CL_SAFE(clSetKernelArg(knl, 0, sizeof(buf_gray), &buf_gray));
  CALL_CL_SAFE(clSetKernelArg(knl, 1, sizeof(buf_congray), &buf_congray));
  CALL_CL_SAFE(clSetKernelArg(knl, 2, sizeof(buf_filter), &buf_filter));
  CALL_CL_SAFE(clSetKernelArg(knl, 3, sizeof(rows), &rows));
  CALL_CL_SAFE(clSetKernelArg(knl, 4, sizeof(cols), &cols));
  CALL_CL_SAFE(clSetKernelArg(knl, 5, sizeof(filterWidth), &filterWidth));
  CALL_CL_SAFE(clSetKernelArg(knl, 6, localMemSize, NULL));
  CALL_CL_SAFE(clSetKernelArg(knl, 7, sizeof(localHeight), &localHeight));
  CALL_CL_SAFE(clSetKernelArg(knl, 8, sizeof(localWidth), &localWidth));

  // --------------------------------------------------------------------------
  // print kernel info
  // --------------------------------------------------------------------------
  print_kernel_info(queue, knl);

  CALL_CL_SAFE(clFinish(queue));
  timestamp_type tic, toc;
  get_timestamp(&tic);
  for(int loop = 0; loop < num_loops; ++loop)
  {
    CALL_CL_SAFE(clEnqueueNDRangeKernel(queue, knl, 2, NULL,
          global_size, local_size, 0, NULL, NULL));

    // Edit: Copy the blurred image to input buffer
#ifdef NON_OPTIMIZED
    CALL_CL_SAFE(clEnqueueCopyBuffer(queue, buf_congray, buf_gray, 0, 0,
          deviceDataSize, 0, NULL, NULL));
#else
    clEnqueueCopyBufferRect(queue, buf_congray, buf_gray,
                            buffer_origin, host_origin, region,
                            deviceWidth*sizeof(float), 0,
                            xsize*sizeof(float), 0,
                            0, NULL, NULL);
#endif
  }
  CALL_CL_SAFE(clFinish(queue));
  get_timestamp(&toc);

  double elapsed = timestamp_diff_in_seconds(tic,toc)/num_loops;
  printf("%f s\n", elapsed);
  printf("%f MPixels/s\n", xsize*ysize/1e6/elapsed);
  printf("%f GBit/s\n", 2*xsize*ysize*sizeof(float)/1e9/elapsed);
  printf("%f GFlop/s\n", (xsize-HALF_FILTER_WIDTH)*(ysize-HALF_FILTER_WIDTH)
	 *FILTER_WIDTH*FILTER_WIDTH/1e9/elapsed);

  // --------------------------------------------------------------------------
  // transfer back & check
  // --------------------------------------------------------------------------
#ifdef NON_OPTIMIZED
  CALL_CL_SAFE(clEnqueueReadBuffer(
        queue, buf_congray, /*blocking*/ CL_TRUE, /*offset*/ 0,
        xsize * ysize * sizeof(float), congray_cl,
        0, NULL, NULL));
#else
  buffer_origin[0] = 3*sizeof(float);
  buffer_origin[1] = 3;
  buffer_origin[2] = 0;

  host_origin[0] = 3*sizeof(float);
  host_origin[1] = 3;
  host_origin[2] = 0;

  region[0] = (xsize-paddingPixels)*sizeof(float);
  region[1] = (ysize-paddingPixels);
  region[2] = 1;

  clEnqueueReadBufferRect(queue, buf_congray, CL_TRUE,
      buffer_origin, host_origin, region,
      deviceWidth*sizeof(float), 0, xsize*sizeof(float), 0,
      congray_cl, 0, NULL, NULL);
#endif

  // --------------------------------------------------------------------------
  // output OpenCL filtered image
  // --------------------------------------------------------------------------
  printf("Writing OpenCL filtered image\n");

  // Edit: Keep pixel value in the interval [0, 255] to reduce boundary effect
  for(int n = 0; n < xsize*ysize; ++n) {
    int color = (int)(congray_cl[n] * rgb_max);

    if (color < 0) {
      color = 0;
    } else if (color > 255) {
      color = 255;
    }

    r[n] = g[n] = b[n] = color;
  }
  error = ppma_write("output_cl.ppm", xsize, ysize, r, g, b);
  if(error) { fprintf(stderr, "error writing image"); abort(); }

  // --------------------------------------------------------------------------
  // clean up
  // --------------------------------------------------------------------------
  CALL_CL_SAFE(clReleaseMemObject(buf_congray));
  CALL_CL_SAFE(clReleaseMemObject(buf_gray));
  CALL_CL_SAFE(clReleaseMemObject(buf_filter));
  CALL_CL_SAFE(clReleaseKernel(knl));
  CALL_CL_SAFE(clReleaseCommandQueue(queue));
  CALL_CL_SAFE(clReleaseContext(ctx));
  free(gray);
  free(congray);
  free(congray_cl);
  free(r);
  free(b);
  free(g);
}
示例#21
0
/*
 Benchmark driver for the "transpose" kernel loaded from transpose-soln.cl.

 Usage:
      <program> n ntrips

      n        Side length of the square matrix (n*n elements are allocated).
      ntrips   Number of kernel launches that are timed and averaged.

 Steps:
      1. Create an OpenCL context/queue and build the kernel.
      2. Allocate host arrays a and b (optionally as mapped CL_MEM_ALLOC_HOST_PTR
         buffers when USE_PINNED is defined) and fill a[i + j*n] = i + j*n.
      3. Upload a and b, timing the transfer.
      4. Launch the kernel ntrips times with 16x16 work groups, timing the runs.
      5. Read buf_b back and verify a[i + j*n] == b[j + i*n] for all i, j.

 Any failure (bad arguments, allocation failure, mismatch) ends in abort().
 */
int main(int argc, char **argv)
{
  if (argc != 3)
  {
    fprintf(stderr, "need two arguments!\n");
    abort();
  }

  const long n = atol(argv[1]);
  const long size = n*n;
  const int ntrips = atoi(argv[2]);

  // Reject non-positive inputs up front: a negative n would turn into a huge
  // unsigned loop bound / allocation size, and ntrips == 0 would make the
  // per-trip average a division by zero.
  if (n <= 0 || ntrips <= 0)
  {
    fprintf(stderr, "n and ntrips must both be positive!\n");
    abort();
  }

  // Unsigned copy of n for index arithmetic, so the loops below do not mix
  // signed and unsigned operands in their comparisons.
  const size_t un = (size_t) n;

  cl_context ctx;
  cl_command_queue queue;
  create_context_on(CHOOSE_INTERACTIVELY, CHOOSE_INTERACTIVELY, 0, &ctx, &queue, 0);

  cl_int status;

  // --------------------------------------------------------------------------
  // load kernels 
  // --------------------------------------------------------------------------
  char *knl_text = read_file("transpose-soln.cl");
  cl_kernel knl = kernel_from_string(ctx, knl_text, "transpose", NULL);
  free(knl_text);

  // --------------------------------------------------------------------------
  // allocate and initialize CPU memory
  // --------------------------------------------------------------------------
#ifdef USE_PINNED
  cl_mem buf_a_host = clCreateBuffer(ctx,
      CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
      sizeof(value_type) * size, 0, &status);
  CHECK_CL_ERROR(status, "clCreateBuffer");
  cl_mem buf_b_host = clCreateBuffer(ctx,
      CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
      sizeof(value_type) * size, 0, &status);
  CHECK_CL_ERROR(status, "clCreateBuffer");

  // NOTE(review): these mappings are never explicitly unmapped before the
  // buffers are released at the end of main -- confirm this is intended.
  value_type *a = (value_type *) clEnqueueMapBuffer(queue, buf_a_host,
      /*blocking*/ CL_TRUE, CL_MAP_WRITE_INVALIDATE_REGION, 
      /*offs*/ 0, sizeof(value_type)*size, 0, NULL, NULL, &status);
  CHECK_CL_ERROR(status, "clEnqueueMapBuffer");
  value_type *b = (value_type *) clEnqueueMapBuffer(queue, buf_b_host,
      /*blocking*/ CL_TRUE, CL_MAP_WRITE_INVALIDATE_REGION, 
      /*offs*/ 0, sizeof(value_type)*size, 0, NULL, NULL, &status);
  CHECK_CL_ERROR(status, "clEnqueueMapBuffer");

#else
  value_type *a = (value_type *) malloc(sizeof(value_type) * size);
  if (!a) { perror("alloc x"); abort(); }
  value_type *b = (value_type *) malloc(sizeof(value_type) * size);
  if (!b) { perror("alloc y"); abort(); }
#endif

  // Each entry of a holds its own linear index, so the transpose check at the
  // end can compare a and b element by element.
  for (size_t j = 0; j < un; ++j)
    for (size_t i = 0; i < un; ++i)
      a[i + j*un] = i + j*un;

  // --------------------------------------------------------------------------
  // allocate device memory
  // --------------------------------------------------------------------------
  cl_mem buf_a = clCreateBuffer(ctx, CL_MEM_READ_WRITE, 
      sizeof(value_type) * size, 0, &status);
  CHECK_CL_ERROR(status, "clCreateBuffer");

  cl_mem buf_b = clCreateBuffer(ctx, CL_MEM_READ_WRITE,
      sizeof(value_type) * size, 0, &status);
  CHECK_CL_ERROR(status, "clCreateBuffer");

  // --------------------------------------------------------------------------
  // transfer to device
  // --------------------------------------------------------------------------
  CALL_CL_GUARDED(clFinish, (queue));

  timestamp_type time1, time2;
  get_timestamp(&time1);

  CALL_CL_GUARDED(clEnqueueWriteBuffer, (
        queue, buf_a, /*blocking*/ CL_FALSE, /*offset*/ 0,
        size * sizeof(value_type), a,
        0, NULL, NULL));

  // b's contents are not initialized by this program; it is uploaded only so
  // that the transfer benchmark moves two buffers.
  CALL_CL_GUARDED(clEnqueueWriteBuffer, (
        queue, buf_b, /*blocking*/ CL_FALSE, /*offset*/ 0,
        size * sizeof(value_type), b,
        0, NULL, NULL));

  // Both writes are non-blocking: wait for them to complete before stopping
  // the clock, otherwise only the enqueue overhead would be measured.
  CALL_CL_GUARDED(clFinish, (queue));

  get_timestamp(&time2);
  double elapsed = timestamp_diff_in_seconds(time1,time2);
  printf("transfer: %f s\n", elapsed);
  printf("transfer: %f GB/s\n",
      2*size*sizeof(value_type)/1e9/elapsed);


  // --------------------------------------------------------------------------
  // run code on device
  // --------------------------------------------------------------------------

  CALL_CL_GUARDED(clFinish, (queue));

  get_timestamp(&time1);

  for (int trip = 0; trip < ntrips; ++trip)
  {
    SET_3_KERNEL_ARGS(knl, buf_a, buf_b, n);
    // Work groups are 16x16; the global size is n x n, so n is expected to
    // be a multiple of 16 (otherwise the guarded enqueue reports the error).
    size_t ldim[] = { 16, 16 };
    size_t gdim[] = { un, un };
    CALL_CL_GUARDED(clEnqueueNDRangeKernel,
        (queue, knl,
         /*dimensions*/ 2, NULL, gdim, ldim,
         0, NULL, NULL));
  }

  CALL_CL_GUARDED(clFinish, (queue));

  get_timestamp(&time2);
  elapsed = timestamp_diff_in_seconds(time1,time2)/ntrips;
  printf("%f s\n", elapsed);
  printf("%f GB/s\n",
      2*size*sizeof(value_type)/1e9/elapsed);

  // --------------------------------------------------------------------------
  // transfer back & check
  // --------------------------------------------------------------------------
  CALL_CL_GUARDED(clEnqueueReadBuffer, (
        queue, buf_b, /*blocking*/ CL_FALSE, /*offset*/ 0,
        size * sizeof(value_type), b,
        0, NULL, NULL));

  // The read above is non-blocking; b is only valid after the queue drains.
  CALL_CL_GUARDED(clFinish, (queue));

  for (size_t i = 0; i < un; ++i)
    for (size_t j = 0; j < un; ++j)
      if (a[i + j*un] != b[j + i*un])
      {
        // %zu is the matching conversion for size_t.
        printf("bad %zu %zu\n", i, j);
        abort();
      }

  // --------------------------------------------------------------------------
  // clean up
  // --------------------------------------------------------------------------
  CALL_CL_GUARDED(clFinish, (queue));
  CALL_CL_GUARDED(clReleaseMemObject, (buf_a));
  CALL_CL_GUARDED(clReleaseMemObject, (buf_b));
  CALL_CL_GUARDED(clReleaseKernel, (knl));
  CALL_CL_GUARDED(clReleaseCommandQueue, (queue));
  CALL_CL_GUARDED(clReleaseContext, (ctx));

#ifdef USE_PINNED
  CALL_CL_GUARDED(clReleaseMemObject, (buf_a_host));
  CALL_CL_GUARDED(clReleaseMemObject, (buf_b_host));
#else
  free(a);
  free(b);
#endif
  return 0;
}
示例#22
0
/*
 Driver comparing the parallel and sequential bidiagonalisation routines.

 Usage:
      <program> m n

      m, n     Dimensions of the random test matrix (m*n doubles).

 The matrix A is filled with rand_d(a,b) values using a fixed seed, copied to
 A2, then reduced in place by bidiag_par (on A) and bidiag_seq (on A2).
 Both calls are timed, and the number of entries where A and A2 differ by more
 than tol is printed as "ERRORS".

 Any failure (bad arguments, allocation failure) ends in abort().
 */
int main(int argc, char ** argv){

	// check input
	if (argc != 3)
	{
	  fprintf(stderr, "in main: need two arguments!\n");
	  abort();
	}

	// seed the random number generator
	//srand( (int) time(0));
	srand( (int) 4);

	// parameters
	const long m = atol(argv[1]);  
	const long n = atol(argv[2]);  
	// Non-positive dimensions would produce negative (i.e. huge unsigned)
	// allocation sizes below, so reject them here.
	if (m <= 0 || n <= 0)
	{
	  fprintf(stderr, "in main: m and n must both be positive!\n");
	  abort();
	}
	long mn = 0;	// min of m,n
	long len_beta = 0;
	if ( m < n){
		mn = m;
		len_beta = mn;
	}
	else{
		mn = n;
		len_beta = mn-1;
	}
	double a = 1;
	double b = 2;
	double tol = 1.0e-9;

	// big matrix storage
	double *A = (double *) malloc(sizeof(double) *m*n);
	if(!A) { fprintf(stderr,"in main: failed to allocate A\n"); abort();}
	double *A2 = (double *) malloc(sizeof(double) *m*n);
	if(!A2) { fprintf(stderr,"in main: failed to allocate A2\n"); abort();}
	double *B = (double *) malloc(sizeof(double) *m*n);
	if(!B) { fprintf(stderr,"in main: failed to allocate B\n"); abort();}
	double *A_Copy = (double *) malloc(sizeof(double) *m*n);
	if(!A_Copy) { fprintf(stderr,"in main: failed to allocate A_Copy\n"); abort();}
	double *A_Result = (double *) malloc(sizeof(double) *m*n);
	if(!A_Result) { fprintf(stderr,"in main: failed to allocate A_Result\n"); abort();}
	double *temp = (double *) malloc(sizeof(double) *m*n);
	if(!temp) { fprintf(stderr,"in main: failed to allocate temp\n"); abort();}
	double *temp2 = (double *) malloc(sizeof(double) *m*m);
	if(!temp2) { fprintf(stderr,"in main: failed to allocate temp2\n"); abort();}
	double *temp3 = (double *) malloc(sizeof(double) *n*n);
	if(!temp3) { fprintf(stderr,"in main: failed to allocate temp3\n"); abort();}
	double *U = (double *) malloc(sizeof(double) *m*m);
	if(!U) { fprintf(stderr,"in main: failed to allocate U\n"); abort();}
	double *UT = (double *) malloc(sizeof(double) *m*m);
	if(!UT) { fprintf(stderr,"in main: failed to allocate UT\n"); abort();}
	double *V = (double *) malloc(sizeof(double) *n*n);
	if(!V) { fprintf(stderr,"in main: failed to allocate V\n"); abort();}
	double *VT = (double *) malloc(sizeof(double) *n*n);
	if(!VT) { fprintf(stderr,"in main: failed to allocate VT\n"); abort();}
	
	// diagonal component storage
	double *alpha = (double *) malloc(sizeof(double) *mn);
	if(!alpha) { fprintf(stderr,"in main: failed to allocate alpha\n"); abort();}
	double *beta = (double *) malloc(sizeof(double) *len_beta);
	if(!beta) { fprintf(stderr,"in main: failed to allocate beta\n"); abort();}
	// Each check must test the pointer that was just allocated.
	double *alpha2 = (double *) malloc(sizeof(double) *mn);
	if(!alpha2) { fprintf(stderr,"in main: failed to allocate alpha2\n"); abort();}
	double *beta2 = (double *) malloc(sizeof(double) *len_beta);
	if(!beta2) { fprintf(stderr,"in main: failed to allocate beta2\n"); abort();}
	
	// fill A, A_Copy
	// The index is long to match the type of m*n and avoid int overflow
	// on large matrices.
	for (long i=0; i<m*n; i++){
		A[i] = rand_d(a,b);
		A_Copy[i] = A[i];
		A2[i] = A[i];
	}
	
	timestamp_type time1, time2;
	
	// compute the bidiagonal form
	get_timestamp(&time1);
	bidiag_par(m,n,A,alpha,beta);
	get_timestamp(&time2);
	double elapsed_par = timestamp_diff_in_seconds(time1,time2);
	printf("time_par = %g\n",elapsed_par);
	get_timestamp(&time1);
	// NOTE(review): the sequential run writes into the same alpha/beta as the
	// parallel run; alpha2/beta2 are allocated but never used -- confirm
	// whether they were meant to be passed here.
	bidiag_seq(m,n,A2,alpha,beta);
	get_timestamp(&time2);
	double elapsed_seq = timestamp_diff_in_seconds(time1,time2);
	printf("time_seq = %g\n",elapsed_seq);
	// form the orthogonal matrices
	//form_u_par(m,n,A,U);
	//form_v_par(m,n,A,V);
	//form_bidiag(m,n,alpha,beta,B);
	//transpose(n,n,V,VT);
	//transpose(m,m,U,UT);
	
	
	// check the result of A_Result = U * B * V^T
	//dgemm_simple(m,n,n,B,VT,temp);
	//dgemm_simple(m,n,m,U,temp,A_Result);
	//dgemm_simple(n,n,n,VT,V,temp3);
	//dgemm_simple(m,m,m,UT,U,temp2);
	
	// count entries where the parallel and sequential results disagree
	int errors = 0;
	for (long i=0; i < m*n; i++){
		if ( fabs(A[i]-A2[i]) > tol ){
			errors++;
		}
	}
	printf("ERRORS = %d\n",errors);
	
	
	//print_matrix(A,m,n,"A = ");
	//print_matrix(A2,m,n,"A2 = ");
	//print_matrix(A_Copy,m,n,"A_Copy = ");
	//print_matrix(A_Result,m,n,"A_Result = ");
	//print_matrix(B,m,n,"B = ");
	//print_matrix(U,m,m,"U = ");
	//print_matrix(V,n,n,"V = ");
	//print_matrix(temp2,m,m,"temp2 = ");
	//print_matrix(temp3,n,n,"temp3 = ");
		
	free(A);
	free(A2);
	free(A_Copy);
	free(A_Result);
	free(B);
	free(temp);
	free(temp2);
	free(temp3);
	free(U);
	free(UT);
	free(V);
	free(VT);
	free(alpha);
	free(alpha2);
	free(beta);
	free(beta2);
	

	return 0;
	
}
示例#23
0
void runTimings(int use_gpu){
  /*
   Time every simulation stage, first through the OpenCL code path and then
   through the host code path, and print one tab-separated row per stage.

   Input:
        int use_gpu          Forwarded unchanged to init_opencl().

   Output:
        For each stage and each code path ("GPU" / "CPU"), a row holding the
        device name, the NXxNYxNZ grid size, the average seconds per trip and
        the resulting MegaCells/s rate.
   */

  // Stages, listed in the order in which their rows are printed.
  enum {
    STAGE_ADVECT_VELOCITY,
    STAGE_ADVECT_DENSITY,
    STAGE_DIVERGENCE,
    STAGE_PROJECT_JACOBI,
    STAGE_PROJECT_CG,
    STAGE_PRESSURE_APPLY,
    STAGE_COUNT
  };
  static const char *const stage_label[STAGE_COUNT] = {
    "Advection Velocity",
    "Advection Density",
    "Divergence",
    "Projection Jacobi",
    "Projection Conjugate Gradient",
    "Pressure Apply"
  };

  const int trips = 10;
  char device_name[256];
  double gpu_seconds[STAGE_COUNT];
  double cpu_seconds[STAGE_COUNT];
  timestamp_type tic, toc;

  ////////////////////////////////////////////////////
  ///GPU TIMINGS
  ////////////////////////////////////////////////////

  init_opencl(use_gpu);
  load_cl_kernels(&clData);
  allocate_cl_buffers(&clData);

  print_device_info_from_queue(clData.queue);
  get_device_name_from_queue(clData.queue, device_name, 256);

  // The upload is issued twice, exactly as in the original sequence.
  transfer_buffers_to_gpu();
  transfer_buffers_to_gpu();

  // velocity advection
  get_timestamp(&tic);
  for(int trip = 0; trip < trips; ++trip)
  {
    run_cl_advect_velocity(&clData, dt);
  }
  flush_cl_queue();
  get_timestamp(&toc);
  gpu_seconds[STAGE_ADVECT_VELOCITY] = timestamp_diff_in_seconds(tic,toc)/trips;

  // divergence
  get_timestamp(&tic);
  for(int trip = 0; trip < trips; ++trip)
  {
    run_cl_calculate_divergence(&clData, dt);
  }
  flush_cl_queue();
  get_timestamp(&toc);
  gpu_seconds[STAGE_DIVERGENCE] = timestamp_diff_in_seconds(tic,toc)/trips;

  transfer_buffers_to_cpu();
  flush_cl_queue();

  // conjugate-gradient projection
  // Accurate numbers would need a different divergence field on every trip:
  // from the second call onwards the solver detects that the system is
  // already solved and exits after one matrix.
  // The timed region includes the blocking device<->host copies of the
  // pressure and divergence buffers around each solve.
  get_timestamp(&tic);
  for(int trip = 0; trip < trips; ++trip)
  {
    transfer_cl_float_buffer_from_device(&clData,clData.buf_pressure,g_pressure,clData.n,true);
    transfer_cl_float_buffer_from_device(&clData,clData.buf_divergence,g_divergence,clData.n,true);

    run_cl_cg_no_mtx(&clData,g_pressure, g_divergence,  g_cg_r, g_cg_d, g_cg_q, clData.n, 10, 0.0001f);
    flush_cl_queue();

    transfer_cl_float_buffer_to_device(&clData,clData.buf_pressure,g_pressure,clData.n,true);
  }
  flush_cl_queue();
  get_timestamp(&toc);
  gpu_seconds[STAGE_PROJECT_CG] = timestamp_diff_in_seconds(tic,toc)/trips;

  // Jacobi projection: 20 solver sweeps per timed trip
  get_timestamp(&tic);
  for(int trip = 0; trip < trips; ++trip)
  {
    for(int sweep = 0; sweep < 20; ++sweep)
    {
      run_cl_pressure_solve(&clData, dt);
    }
  }
  flush_cl_queue();
  get_timestamp(&toc);
  gpu_seconds[STAGE_PROJECT_JACOBI] = timestamp_diff_in_seconds(tic,toc)/trips;

  // pressure apply
  get_timestamp(&tic);
  for(int trip = 0; trip < trips; ++trip)
  {
    run_cl_pressure_apply(&clData, dt);
  }
  flush_cl_queue();
  get_timestamp(&toc);
  gpu_seconds[STAGE_PRESSURE_APPLY] = timestamp_diff_in_seconds(tic,toc)/trips;

  // density advection
  get_timestamp(&tic);
  for(int trip = 0; trip < trips; ++trip)
  {
    run_cl_advect_density(&clData, dt);
  }
  flush_cl_queue();
  get_timestamp(&toc);
  gpu_seconds[STAGE_ADVECT_DENSITY] = timestamp_diff_in_seconds(tic,toc)/trips;

  for(int stage = 0; stage < STAGE_COUNT; ++stage)
  {
    printf("%s\t%dx%dx%d\t%s\t%s\t %3.6f\ts\t", device_name,NX,NY,NZ,"GPU",stage_label[stage],gpu_seconds[stage]);
    printf("%.3f\tMegaCells/s\n",(NX*NY*NZ)*1e-6/gpu_seconds[stage]);
  }

  cleanup_cl(&clData);

  ////////////////////////////////////////////////////
  ///CPU TIMINGS
  ////////////////////////////////////////////////////

  // velocity advection
  get_timestamp(&tic);
  for(int trip = 0; trip < trips; ++trip)
  {
    advect_velocity_RK2(dt, g_u, g_v, g_w, g_u_prev, g_v_prev, g_w_prev);
  }
  get_timestamp(&toc);
  cpu_seconds[STAGE_ADVECT_VELOCITY] = timestamp_diff_in_seconds(tic,toc)/trips;

  // divergence
  get_timestamp(&tic);
  for(int trip = 0; trip < trips; ++trip)
  {
    calculate_divergence(g_divergence, g_u, g_v, g_w, dt);
  }
  get_timestamp(&toc);
  cpu_seconds[STAGE_DIVERGENCE] = timestamp_diff_in_seconds(tic,toc)/trips;

  // conjugate-gradient projection
  // Same caveat as for the GPU path: after the first trip the solver sees an
  // already-solved system and exits early.
  get_timestamp(&tic);
  for(int trip = 0; trip < trips; ++trip)
  {
    pressure_solve_cg_no_matrix(g_pressure, g_divergence, g_cg_r, g_cg_d, g_cg_q);
  }
  get_timestamp(&toc);
  cpu_seconds[STAGE_PROJECT_CG] = timestamp_diff_in_seconds(tic,toc)/trips;

  // Jacobi projection
  get_timestamp(&tic);
  for(int trip = 0; trip < trips; ++trip)
  {
    pressure_solve(g_pressure,g_pressure_prev, g_divergence, dt);
  }
  get_timestamp(&toc);
  cpu_seconds[STAGE_PROJECT_JACOBI] = timestamp_diff_in_seconds(tic,toc)/trips;

  // pressure apply
  get_timestamp(&tic);
  for(int trip = 0; trip < trips; ++trip)
  {
    pressure_apply(g_u, g_v, g_w, g_pressure, dt);
  }
  get_timestamp(&toc);
  cpu_seconds[STAGE_PRESSURE_APPLY] = timestamp_diff_in_seconds(tic,toc)/trips;

  // density advection
  get_timestamp(&tic);
  for(int trip = 0; trip < trips; ++trip)
  {
    advectRK2(dt,g_dens,g_dens_prev, g_u, g_v, g_w);
  }
  get_timestamp(&toc);
  cpu_seconds[STAGE_ADVECT_DENSITY] = timestamp_diff_in_seconds(tic,toc)/trips;

  // The CPU rows reuse the device name queried during the GPU pass.
  for(int stage = 0; stage < STAGE_COUNT; ++stage)
  {
    printf("%s\t%dx%dx%d\t%s\t%s\t %3.6f\ts\t", device_name,NX,NY,NZ,"CPU",stage_label[stage],cpu_seconds[stage]);
    printf("%.3f\tMegaCells/s\n",(NX*NY*NZ)*1e-6/cpu_seconds[stage]);
  }

}