/*
 * Print a short summary of a completed sampling run.
 *
 * Input:
 *      sampler *samp     Pointer to a sampler structure which has been
 *                        initialized. Sampling must be performed before
 *                        running this routine: the recorded timestamps, the
 *                        acceptance counter and the host-side samples are read.
 *
 * Output (stdout):
 *      - step and sample counts, ldim[0]/gdim[0], acceptance count and rate,
 *      - elapsed time and sample rate for the kernel-only and total timers,
 *      - mean and standard deviation of every saved component.
 *
 * This routine only reads timestamps already stored in `samp`; it does not
 * stop any timer itself. It aborts if the scratch buffer used for the
 * statistics cannot be allocated.
 */
void print_run_summary(sampler *samp){

    double elapsed_sample = timestamp_diff_in_seconds(samp->time1, samp->time2);
    double elapsed_total  = timestamp_diff_in_seconds(samp->time1_total, samp->time2_total);

    // Number of proposals, formed in double so that neither the product nor
    // the rates below suffer integer overflow or single-precision rounding
    // (a float cannot represent counts above 2^24 exactly).
    const double n_proposals = (double) samp->M * (double) samp->K;

    // --------------------------------------------------------------------------
    // check output
    // --------------------------------------------------------------------------
    printf("Time steps = %d\n", samp->M);
    printf("Total samples = %d\n", samp->M * samp->K);
    printf("ldim = %d\tgdim = %d\n", (int) samp->ldim[0], (int) samp->gdim[0]);
    printf("Total accepted = %lu\n", samp->accepted_total);
    printf("Acceptance rate = %f\n", (double) samp->accepted_total / n_proposals);
    printf("Time for kernel runs = %f\n", elapsed_sample);
    printf("Sample rate, kernel time only = %f million samples / s\n", n_proposals * 1e-6 / elapsed_sample);
    printf("Total time = %f\n", elapsed_total);
    printf("Sample rate, total time = %f million samples / s\n", n_proposals * 1e-6 / elapsed_total);
    printf("\n");

    // Basic numerical estimate of mean and standard deviation of each
    // component in the chain.
    double mean, sigma;

    float *X = (float *) malloc(samp->total_samples * sizeof(float));
    if(!X){
        perror("Allocation failure basic stats");
        abort();
    }

    for(int i = 0; i < samp->num_to_save; i++){
        // samples_host is indexed as [sample j][saved component i]; gather
        // component i into a contiguous scratch array. The index is formed in
        // size_t so that long chains do not overflow int arithmetic.
        for(int j = 0; j < samp->total_samples; j++)
            X[j] = samp->samples_host[(size_t) i + (size_t) j * (size_t) samp->num_to_save];

        compute_mean_stddev(X, &mean, &sigma, samp->total_samples);
        printf("Statistics for X_%d:\t", samp->indices_to_save_host[i]);
        printf("Mean = %f,\tsigma = %f\n", mean, sigma);
    }
    printf("\n");

    free(X);
}
int main(int argc, char** argv) { int print_results = 0; // check for correct number of arguments if (argc < 3) { usage(); return EXIT_FAILURE; } else if (argc > 3) { print_results = atoi(argv[3]); } // initialize vars and allocate memory const int n = atoi(argv[2]); int* a = malloc(sizeof(int) * n); // initialize local array if (init_array(argv[1], 0, n, &a[0]) != EXIT_SUCCESS) { printf("File %s could not be opened!\n", argv[1]); return EXIT_FAILURE; } // take a timestamp before the sort starts timestamp_type time1, time2; get_timestamp(&time1); // sort elements radix_sort(&a[0], n); // take a timestamp after the process finished sorting get_timestamp(&time2); // calculate fish updates per second double elapsed = timestamp_diff_in_seconds(time1,time2); printf("%f s\n", elapsed); printf("%d elements sorted\n", n); printf("%f elements/s\n", n / elapsed); // print sorted resutls if (print_results) { print_array(&a[0], n); } // release resources no longer used free(a); return 0; }
/*
 * Time a strided read-modify-write sweep over a byte buffer.
 *
 *   x           buffer of at least array_size bytes; its contents are modified
 *   array_size  number of bytes touched per sweep
 *   ntrips      number of sweeps
 *
 * Each sweep performs array_size increments; step j hits the byte at offset
 * (j * 1009) % array_size. Returns the elapsed time in seconds for all sweeps.
 */
double measure_access(void *x, size_t array_size, size_t ntrips)
{
  char *bytes = (char *) x;
  timestamp_type start, stop;

  get_timestamp(&start);

  size_t trip = 0;
  while (trip < ntrips)
  {
    for (size_t step = 0; step != array_size; ++step)
    {
      const size_t where = (step * 1009) % array_size;
      bytes[where] += 1;
    }
    ++trip;
  }

  get_timestamp(&stop);

  return timestamp_diff_in_seconds(start, stop);
}
/*
 * Micro-benchmark: time 1000 passes of b[i] = 2*a[i] over two 64-byte aligned
 * arrays of 1M floats and print the elapsed time.
 *
 * Returns EXIT_FAILURE if either buffer cannot be allocated; otherwise returns
 * the integer sum of a[] (used only to keep the compiler from discarding the
 * loads of a).
 */
int main(void)
{
  int result = 0;

  const int n = 1024*1024;
  const size_t nbytes = n*sizeof(float) + 64;

  // posix_memalign() reports failure through its return value; perror() needs
  // that code in errno.
  float *allocation = NULL;
  if ((errno = posix_memalign((void **) &allocation, 64, nbytes)) != 0)
  {
    perror("allocating a");
    return EXIT_FAILURE;
  }

  // b is obtained from posix_memalign() only. The original malloc()ed a buffer
  // first and then overwrote the pointer, leaking that buffer.
  float __attribute__ ((aligned (1))) *b = NULL;
  if ((errno = posix_memalign((void **) &b, 64, nbytes)) != 0)
  {
    perror("allocating b");
    free(allocation);
    return EXIT_FAILURE;
  }

  // Offset 0 keeps a 64-byte aligned. NOTE(review): presumably the offset is
  // edited by hand to study misaligned access -- confirm.
  float __attribute__ ((aligned (64))) *a = (float *) (((char *) allocation) + 0);

  /*
  puts("write");
  for (int i = 0; i<n; ++i)
    a[i] = i;
    */

  // NOTE(review): a[] is never written (the loop above is disabled), so the
  // timed loop and the checksum below read indeterminate values. Confirm this
  // is intentional before relying on `result`.

  timestamp_type t1;
  get_timestamp(&t1);

  for (int ntrips = 0; ntrips < 1000; ++ntrips)
  {
    for (int i = 0; i<n; ++i)
      b[i] = 2*a[i];
  }

  timestamp_type t2;
  get_timestamp(&t2);

  printf("elapsed time: %g s\n", timestamp_diff_in_seconds(t1, t2));

  // fake a dependency on a
  for (int i = 0; i<n; ++i)
    result += a[i];

  free(b);
  free(allocation);

  return result;
}
/*
 * Driver for the OpenMP Jacobi iteration.
 *
 * Command line:
 *   argv[1]  number of points N (must be positive)
 *   argv[2]  number of iterations
 *
 * Allocates and zeroes u_k, then has every thread of an OpenMP team call
 * jacobi_iteration() on the shared array, and prints the elapsed time
 * (initialization included). Aborts on a usage error; returns EXIT_FAILURE on
 * an invalid N or an allocation failure.
 */
int main(int argc, char** argv)
{
    timestamp_type time1, time2;

    if (argc != 3) {
        printf("USAGE: ./jacobi-omp.o <Number of points (N)> <Num Iter>\n");
        abort();
    }

    int N = atoi(argv[1]);
    int numIter = atoi(argv[2]);
    if (N <= 0) {
        fprintf(stderr, "Number of points must be a positive integer, got \"%s\"\n", argv[1]);
        return EXIT_FAILURE;
    }

    double *u_k = malloc((size_t) N * sizeof *u_k);
    if (!u_k) {
        perror("Allocation failure for u_k");
        return EXIT_FAILURE;
    }

    get_timestamp(&time1);

    // Initialize u_k
    for (int i = 0; i < N; i++) {
        u_k[i] = 0.0;
    }

    #pragma omp parallel
    {
        // Declared inside the region so each thread owns its copy. A single
        // variable declared outside would be shared, and every thread
        // assigning it at once is a data race.
        int nthreads = omp_get_num_threads();
        int tid = omp_get_thread_num();
        printf("(%d) starting jacobi iteration. \n", tid);
        #pragma omp barrier

        jacobi_iteration(u_k, N, nthreads, numIter);
    }

    get_timestamp(&time2);
    double elapsed = timestamp_diff_in_seconds(time1, time2);
    printf("Time elapsed is %f seconds.\n", elapsed);

    free(u_k);
    return 0;
}
int main(int argc, char **argv) { int rank_count, my_rank, worker_count; // FIXME kill MPI_Init(&argc,&argv); MPI_Comm_size(MPI_COMM_WORLD, &rank_count); MPI_Comm_rank(MPI_COMM_WORLD, &my_rank); worker_count = rank_count-1; if (argc != 3) { fprintf(stderr, "need two arguments!\n"); abort(); } const long n = atol(argv[1]); const int ntrips = atoi(argv[2]); // FIXME kill if (n % worker_count != 0) { fprintf(stderr, "size not divisible\n"); MPI_Abort(MPI_COMM_WORLD, 1); } long divided_n = n / worker_count; printf("rank %d/%d reporting for duty\n", my_rank, rank_count); const int tag = 0; if (my_rank == 0) { printf("doing %d trips...\n", ntrips); double *x = (double *) malloc(sizeof(double) * n); if (!x) { perror("alloc x"); MPI_Abort(MPI_COMM_WORLD, 1); } double *y = (double *) malloc(sizeof(double) * n); if (!y) { perror("alloc y"); MPI_Abort(MPI_COMM_WORLD, 1); } double *z = (double *) malloc(sizeof(double) * n); if (!z) { perror("alloc z"); MPI_Abort(MPI_COMM_WORLD, 1); } for (int i = 0; i < n; ++i) { x[i] = i; y[i] = 2*i; } timestamp_type time1, time2; get_timestamp(&time1); for (int i = 0; i < worker_count; ++i) { printf("before send %d\n", i); MPI_Send(x + i*divided_n, divided_n, MPI_DOUBLE, i+1, tag, MPI_COMM_WORLD); MPI_Send(y + i*divided_n, divided_n, MPI_DOUBLE, i+1, tag, MPI_COMM_WORLD); printf("after send %d\n", i); } for (int i = 0; i < worker_count; ++i) { printf("before recv %d\n", i); MPI_Recv(z + i*divided_n, divided_n, MPI_DOUBLE, i+1, tag, MPI_COMM_WORLD, MPI_STATUS_IGNORE); printf("after recv %d\n", i); } get_timestamp(&time2); double elapsed = timestamp_diff_in_seconds(time1,time2)/ntrips; printf("%f GB/s\n", 3*n*sizeof(double)/1e9/elapsed); printf("%f GFlops/s\n", n/1e9/elapsed); for (int i = 0; i < n; ++i) { if (z[i] != x[i] + y[i]) { printf("bad %d\n", i); MPI_Abort(MPI_COMM_WORLD, 1); } } } else { double *xbuf = (double *) malloc(sizeof(double) * divided_n); if (!xbuf) { perror("alloc xbuf"); MPI_Abort(MPI_COMM_WORLD, 1); } double *ybuf = 
(double *) malloc(sizeof(double) * divided_n); if (!ybuf) { perror("alloc ybuf"); MPI_Abort(MPI_COMM_WORLD, 1); } double *zbuf = (double *) malloc(sizeof(double) * divided_n); if (!zbuf) { perror("alloc zbuf"); MPI_Abort(MPI_COMM_WORLD, 1); } MPI_Recv(xbuf, divided_n, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, MPI_STATUS_IGNORE); MPI_Recv(ybuf, divided_n, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, MPI_STATUS_IGNORE); printf("start rank %d\n", my_rank); for (int trip = 0; trip < ntrips; ++trip) { for (int i = 0; i < divided_n; ++i) { zbuf[i] = xbuf[i] + ybuf[i]; } } printf("done rank %d\n", my_rank); MPI_Send(zbuf, divided_n, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD); printf("done rank %d\n", my_rank); } MPI_Finalize(); return 0; }
int main( int argc, char **argv) { srand(time(NULL)); mytimer timer; timer.total_time = timer.init_time = timer.comp_time = timer.comm_time = 0.0; timestamp_type time_s, time_e; int mpi_rank, mpi_size; int i, r, rows, cols, total_rows, err; int *points_per_proc, *buffer; MPI_File filename; MPI_Init(&argc, &argv); MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); MPI_Comm_size(MPI_COMM_WORLD, &mpi_size); options opt; parse_command_line(argc, argv, &opt); err = MPI_File_open(MPI_COMM_WORLD, opt.filename, MPI_MODE_RDONLY, MPI_INFO_NULL, &filename); if (err) { if (mpi_rank == 0) fprintf(stderr, "Couldn't open file %s\n", argv[1]); MPI_Finalize(); exit(1); } double **data = mpi_read_data(&filename, &rows, &cols, mpi_rank, mpi_size, opt.overlap); points_per_proc = (int*) calloc(mpi_size, sizeof(int)); check(points_per_proc); buffer = (int*) calloc(mpi_size, sizeof(int)); check(buffer); buffer[mpi_rank] = rows; MPI_Barrier(MPI_COMM_WORLD); MPI_Allreduce(buffer, points_per_proc, mpi_size, MPI_INT, MPI_SUM, MPI_COMM_WORLD); free(buffer); MPI_Allreduce(&rows, &total_rows, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); opt.n_points = total_rows; opt.dimensions = cols; opt.local_rows = rows; if(mpi_rank == 0 && opt.verbose > 1) { printf("Total rows: %d\n", opt.n_points); for(r = 0; r < mpi_size; r++) printf("proc %d has %d\n", r, points_per_proc[r]); } for(r=0; r < mpi_size; r++) { MPI_Barrier(MPI_COMM_WORLD); if(mpi_rank == r && opt.verbose > 2) { for(i=0; i < rows; i++) { printf("proc %d: %d --- ", mpi_rank, i); print_vec(data[i], cols); } } } // allocate centroids, everyone gets their own copy double **centroids = (double**) alloc2d(opt.n_centroids, opt.dimensions); // allocate cluster memberships // only track the ones this process is responsible for int *membership = (int*) malloc(opt.local_rows * sizeof(int)); check(membership); double inertia = DBL_MAX; int total_iterations = 0; get_timestamp(&time_s); total_iterations = kmeans(data, centroids, membership, &inertia, mpi_rank, mpi_size, 
points_per_proc, &timer, opt); get_timestamp(&time_e); timer.total_time = timestamp_diff_in_seconds(time_s, time_e); if(mpi_rank == 0 && opt.verbose > 0) { print_vecs(centroids, opt, "centroids"); } if(mpi_rank == 0) { printf("\nMPI K-MEANS\n"); printf("%dx%d data, %d clusters, %d trials, %d cores\n", opt.n_points, opt.dimensions, opt.n_centroids, opt.trials, mpi_size); printf("Inertia: %f\n", inertia); printf("Total Iterations: %d\n", total_iterations); printf("Runtime: %fs\n", timer.total_time); printf("Initialization time: %fs\n", timer.init_time); printf("Computation time: %fs\n", timer.comp_time); printf("Communication time: %fs\n", timer.comm_time); } MPI_File_close(&filename); free(points_per_proc); free(*data); free(data); free(*centroids); free(centroids); free(membership); MPI_Finalize(); return 0; }
/*
 * One complete k-means run distributed over MPI ranks.
 *
 *   data        rows held by this rank (opt.local_rows of them)
 *   centroids   opt.n_centroids x opt.dimensions array, filled by initialize()
 *               on rank 0, broadcast, then updated every iteration
 *   membership  cluster index of each local row; read and updated in place
 *   inertia     out: sum over all ranks of the distances reported by
 *               find_nearest_centroid() in the last iteration
 *   rank, size  MPI rank of the caller and number of ranks
 *   ppp         per-rank row counts, forwarded to initialize()
 *   t           timer accumulators; init_time, comp_time and comm_time are
 *               incremented
 *   opt         run parameters (tol, max_iter, verbose, sizes)
 *
 * Iterates until the fraction of points that changed cluster drops to
 * opt.tol or below, or opt.max_iter iterations were done. Returns the number
 * of iterations performed.
 */
int _kmeans(double **data, double **centroids, int *membership,
            double *inertia, int rank, int size, int *ppp, mytimer *t, options opt)
{
    timestamp_type time_is, time_ie;
    timestamp_type time_cs, time_ce;
#ifdef TIME_ALL
    timestamp_type comm_s, comm_e;
#endif

    // delta starts at n_points so the loop condition holds on entry
    double dist, total_inertia, total_delta, delta = (double) opt.n_points;
    int i, center, iters = 0;

    // allocate for new centroids that will be computed
    // NOTE(review): the memset/free on *new_centers assume alloc2d() returns
    // one contiguous block behind the row pointers -- confirm.
    double **new_centers = (double**) alloc2d(opt.n_centroids, opt.dimensions);
    memset(*new_centers, 0, opt.n_centroids * opt.dimensions * sizeof(double));

    // allocate array to count points in each cluster, initialize to 0
    int *count_centers = (int*) calloc(opt.n_centroids, sizeof(int));
    check(count_centers);
    int *new_count_centers = (int*) calloc(opt.n_centroids, sizeof(int));
    check(new_count_centers);

    get_timestamp(&time_is);
    t->comm_time += initialize(data, centroids, ppp, rank, size, opt);
    get_timestamp(&time_ie);

#ifdef TIME_ALL
    get_timestamp(&comm_s);
#endif
    MPI_Bcast(*centroids, opt.n_centroids*opt.dimensions, MPI_DOUBLE, 0, MPI_COMM_WORLD);
#ifdef TIME_ALL
    get_timestamp(&comm_e);
    t->comm_time += timestamp_diff_in_seconds(comm_s, comm_e);
#endif

    get_timestamp(&time_cs);
    while (delta / ((double) opt.n_points) > opt.tol && iters < opt.max_iter) {

        delta = 0.0;
        *inertia = 0.0;

        // assignment step over the local rows: accumulate per-cluster sums
        // and counts, and count how many rows changed cluster
        for(i = 0; i < opt.local_rows; i++){
            find_nearest_centroid(data[i], centroids, opt, &center, &dist);
            *inertia += dist;
            if (membership[i] != center) {
                delta++;
                membership[i] = center;
            }
            add(new_centers[center], data[i], opt);
            new_count_centers[center]++;
        }

        // combine the partial sums and counts of all ranks
#ifdef TIME_ALL
        get_timestamp(&comm_s);
#endif
        MPI_Allreduce(*new_centers, *centroids, opt.n_centroids * opt.dimensions,
                      MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
        MPI_Allreduce(new_count_centers, count_centers, opt.n_centroids, MPI_INT,
                      MPI_SUM, MPI_COMM_WORLD);
#ifdef TIME_ALL
        get_timestamp(&comm_e);
        t->comm_time += timestamp_diff_in_seconds(comm_s, comm_e);
#endif

        for(i = 0; i < opt.n_centroids; i++) {
            if(count_centers[i] == 0) {
                // Empty cluster: its reduced sum is all zeros, so adding one
                // of rank 0's rows re-seeds the centroid with that row.
                if(rank == 0){
                    add(centroids[i], data[randint(opt.local_rows)], opt);
                }
                // broadcast this new point to everyone
#ifdef TIME_ALL
                get_timestamp(&comm_s);
#endif
                MPI_Bcast(centroids[i], opt.dimensions, MPI_DOUBLE, 0, MPI_COMM_WORLD);
#ifdef TIME_ALL
                get_timestamp(&comm_e);
                t->comm_time += timestamp_diff_in_seconds(comm_s, comm_e);
#endif
                // add to delta to ensure we dont stop after this
                delta += opt.tol * opt.local_rows + 1.0;
            }
            // all good to divide, count is not 0
            else {
                // calculate the new center
                div_by(centroids[i], count_centers[i], opt);
            }
        }

        // sum up the number of cluster assignments that changed
#ifdef TIME_ALL
        get_timestamp(&comm_s);
#endif
        MPI_Allreduce(&delta, &total_delta, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
        delta = total_delta;

        // sum up the inertias
        MPI_Allreduce(inertia, &total_inertia, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
        *inertia = total_inertia;
#ifdef TIME_ALL
        get_timestamp(&comm_e);
        t->comm_time += timestamp_diff_in_seconds(comm_s, comm_e);
#endif

        // zero out new_centers and count_centers
        memset(*new_centers, 0, opt.n_centroids * opt.dimensions * sizeof(double));
        memset(new_count_centers, 0, opt.n_centroids * sizeof(int));
        memset(count_centers, 0, opt.n_centroids * sizeof(int));

        iters++;

        if(opt.verbose > 1 && rank == 0) {
            printf("\n\titers: %d\n", iters);
            printf("\tdelta: %d\n", (int) delta);
            printf("\teps: %f\n", delta / ((double) opt.n_points));
            printf("\tinertia: %f\n", *inertia);
        }
    }
    get_timestamp(&time_ce);

    t->init_time += timestamp_diff_in_seconds(time_is, time_ie);
    t->comp_time += timestamp_diff_in_seconds(time_cs, time_ce);

    free(*new_centers);
    free(new_centers);
    free(count_centers);
    free(new_count_centers);

    if(iters == opt.max_iter && rank == 0 && opt.verbose > 0) {
        printf("HIT MAX ITERS\n");
    }
    return iters;
}
/*
 * Pick the initial centroids: opt.n_centroids distinct random point indices,
 * copied into `centroids` on rank 0.
 *
 * Rank 0 draws the indices and maps each one to its owning rank with
 * get_owner(). Rows owned by rank 0 are copied directly; for any other owner
 * the index is sent to that rank (tag 999) and the row is received back.
 * A final index of -1 tells every worker to stop serving requests.
 *
 * All other ranks answer index requests from rank 0 until they receive -1.
 *
 * Returns the communication time measured on rank 0 when TIME_ALL is defined,
 * and 0.0 otherwise (and always on the other ranks).
 */
double initialize(double **data, double **centroids, int *ppp, int rank, int size, options opt)
{
    MPI_Status status;
    double comm_time = 0.0;

    if(rank != 0) {
        // worker side: serve rows on request until the stop marker arrives
        int request;
        for(;;) {
            MPI_Recv(&request, 1, MPI_INT, 0, 999, MPI_COMM_WORLD, &status);
            if(request == -1)
                break;
            MPI_Send(data[request], opt.dimensions, MPI_DOUBLE, 0, 999, MPI_COMM_WORLD);
        }
        return comm_time;
    }

    // root side
#ifdef TIME_ALL
    timestamp_type comm_s, comm_e;
#endif
    int c, idx, owner;

    int *chosen = (int*) malloc(opt.n_centroids * sizeof(int));
    check(chosen);
    double *recv_buf = (double*) malloc(opt.dimensions * sizeof(double));
    check(recv_buf);

    for(c = 0; c < opt.n_centroids; c++) {
        // redraw until the index is not among the c indices chosen so far
        while(In(idx = randint(opt.n_points), chosen, c)) {
        }
        chosen[c] = idx;

        // idx is passed by address and is used as the row index on the owner
        owner = get_owner(&idx, ppp);

        const double *row;
        if(owner == 0) {
            row = data[idx];
        }
        else {
#ifdef TIME_ALL
            get_timestamp(&comm_s);
#endif
            MPI_Send(&idx, 1, MPI_INT, owner, 999, MPI_COMM_WORLD);
            MPI_Recv(recv_buf, opt.dimensions, MPI_DOUBLE, owner, 999, MPI_COMM_WORLD, &status);
#ifdef TIME_ALL
            get_timestamp(&comm_e);
            comm_time += timestamp_diff_in_seconds(comm_s, comm_e);
#endif
            row = recv_buf;
        }

        memcpy(centroids[c], row, opt.dimensions * sizeof(double));
    }

    // tell every worker that no more rows are needed
    idx = -1;
#ifdef TIME_ALL
    get_timestamp(&comm_s);
#endif
    for(c = 1; c < size; c++)
        MPI_Send(&idx, 1, MPI_INT, c, 999, MPI_COMM_WORLD);
#ifdef TIME_ALL
    get_timestamp(&comm_e);
    comm_time += timestamp_diff_in_seconds(comm_s, comm_e);
#endif

    free(chosen);
    free(recv_buf);

    return comm_time;
}
int main(int argc, char** argv) { if (argc != 5) { fprintf(stderr, "Need four arguments, m, n, iterations, and a (0,1) to indicate if testing is desired.\n"); abort(); } /* Size of Matrix */ int m = atoi(argv[1]); int n = atoi(argv[2]); int iterations = atoi(argv[3]); int testing = atoi(argv[4]); if(iterations < 1) { printf("\nIterations must be non-zero positive #.\n"); abort(); } /* Initilize matrices A, Atest, and Q */ double * A = malloc(m * n * sizeof(double)); double * Atest = malloc( m * n * sizeof(double)); double * Q = malloc(m * m * sizeof(double)); /* Fill up matrix A with elements from [0,10)*/ int i = 0; srand ( 1 ); timestamp_type time1, time2; get_timestamp(&time1); for(int i2 = 0; i2 < iterations; i2++) { for(i=0; i< (m*n) ; i++) { A[i] = (double) (rand() %1000)/100; Atest[i] = A[i]; } /* BlockedQR replaces A with R, which is why we needed to copy A in order to test code */ BlockedQR2(A, m, n, Q); /* Test Code */ if(testing) { printf(" Testing Code ..... \n"); double *Qt = malloc( m* m* sizeof(double)); MatrixTranspose(Q, m, m, Qt); testUpperTriangular(A, m, n); printf(" R is Upper Triangular! \n"); testOrthogonal(Q, Qt, m); printf(" Q is Orthogonal! \n"); IsQRequalToA(Q, A, Atest, m, m, m, n); printf(" A = QR! QR factorization was sucessful!\n"); free(Qt); } } get_timestamp(&time2); double elapsed = timestamp_diff_in_seconds(time1,time2); // double gbs = m * n * iterations / elapsed / 1e9; double gfps = m * n * n / elapsed / 1000000000; writetofile2("blockedQR2_8_time.txt", m, n, elapsed); writetofile2("blockedQR2_8_gfps.txt", m, n, gfps); if(verbose) printf("QR2: Time elasped = %f s over %d iterations\n", elapsed, iterations); free(A); free(Atest); free(Q); return 0; }
int main(int argc, char *argv[]) { if (argc < 3){ printf("Arguments required, Quitting...\n"); return 1; } int N = atoi(argv[1]); int max_iter = atoi(argv[2]); double h2 = 1.0/(N+1)/(N+1); double *u, *uc, *f; // allocate arrays int n_per_proc = N + 2; u = (double *) malloc(n_per_proc*sizeof (double)); uc = (double *) malloc(n_per_proc*sizeof (double)); f = (double *) malloc(n_per_proc*sizeof (double)); // initialize f and u int i; for (i = 0; i < n_per_proc-1; i++) { f[i] = 1.0; u[i] = 0.0; } // Begin iterations double resid_init, resid_cur; resid_init = calc_resid(n_per_proc, h2, f, u); resid_cur = resid_init; printf("%f\n", resid_init); int iter = 0; timestamp_type t1, t2; get_timestamp(&t1); while (resid_cur / resid_init > STOP_ITER_RAT){ /* resid_cur = 0.0; */ u[0] = 0.0; u[n_per_proc - 1] = 0.0; jacobi_laplace(n_per_proc, h2, f, u, uc); resid_cur = calc_resid(n_per_proc, h2, f, u); printf("Resid is %f\n", resid_cur ); if (++iter > max_iter) break; } get_timestamp(&t2); printf("Total time: %f\n", timestamp_diff_in_seconds(t1,t2)); // deallocate free(f); free(u); free(uc); return 0; }
int main(int argc, char** argv) { /* Check for two arguemnts, m = height of matrix, n = width of matrix k=iterations */ if (argc != 5) { fprintf(stderr, "Need four arguments, m, n, iterations, and a (0,1) to indicate if testing is desired.\n"); abort(); } /* Size of Matrix */ int m = atoi(argv[1]); int n = atoi(argv[2]); int iterations = atoi(argv[3]); int testing = atoi(argv[4]); if(iterations < 1){ printf("\nIterations must be non-zero positive #.\n"); abort(); } /* Initilize matrices A, Atest, and Q */ double * A = malloc(m * n * sizeof(double)); double * Atest = malloc( m * n * sizeof(double)); double * Q = malloc(m * m * sizeof(double)); double * Qt = malloc(m * m * sizeof(double)); double * R = malloc(m * n * sizeof(double)); /* Fill up matrix A with elements from [0,10)*/ int i = 0; srand ( 1 ); timestamp_type time1, time2; get_timestamp(&time1); for(int i2 = 0; i2 < iterations; i2++){ for(i=0; i< (m*n) ; i++){ A[i] = (double) (rand() %1000)/100; Atest[i] = A[i]; } WY(A, m, n, Q, Qt, R); if(testing){ testUpperTriangular(R, m, n); printf(" R is Upper Triangular! \n"); testOrthogonal(Q, Qt, m); printf(" Q is Orthogonal! \n"); IsQRequalToA(Q, R, Atest, m, m, m, n); printf(" A = QR! QR factorization was sucessful!\n"); } } get_timestamp(&time2); double elapsed = timestamp_diff_in_seconds(time1,time2); //double gbs = m * n * 8 * iterations / elapsed / 1e9; double gflops = m * n * n / elapsed / 1000000000; //fabs(m * n * n - n * n * n / 3); writetofile2("wy_time.txt", m, n, elapsed); writetofile2("wy_gfps.txt", m, n, gflops); writetofile2("wy_mbyn.txt", m, n, (elapsed/(double)iterations)); if(verbose) printf("Time elasped = %f s over %d iterations\n", elapsed, iterations); free(A); free(Atest); free(Q); free(Qt); free(R); return 0; }
/*
 * Recursively grow the decision tree below `node`.
 *
 *   node   node to fill in; node->parent must already be set by the caller
 *          (NULL for the root)
 *   data   table of samples. As read by this function, columns 0..D-2 are
 *          features, column D-1 is the label (positive if > 0) and column D
 *          is the sample weight. The rows first..first+n-1 are re-sorted in
 *          place.
 *   n      number of rows on this branch
 *   first  index of the first row on this branch
 *   level  depth of `node` in the tree
 *
 * The node's label is the weighted majority class (inherited from the parent
 * on a tie, 0 for a tied root). The node stays a leaf (index == -1, no
 * children) when the branch has fewer than 6 rows, the depth limit of 3 is
 * reached, the branch is pure, no split lowers the GINI impurity, or the
 * children cannot be allocated. Otherwise the best feature and threshold are
 * stored and both children are split recursively.
 */
void SplitNode(Node *node, double **data, int n, int first, int level)
{
    const int max_level = 3;    /* depth at which nodes always become leaves */
    const int min_points = 6;   /* branches smaller than this are not split */

    node->left = NULL;
    node->right = NULL;
    node->index = -1;

    /* Get initial counts for positive/negative labels */
    int pos = 0;
    double pos_w = 0;   /* positive weight */
    double tot = 0;     /* total weight */
    for (int i = 0; i < n; ++i) {
        tot += data[first+i][D];
        if (data[first+i][D-1] > 0) {
            pos += 1;
            pos_w += data[first+i][D];
        }
    }
    int neg = n - pos;
    double neg_w = tot - pos_w;

    /* Declare class for node in case of pruning on child */
    if (pos_w > neg_w)
        node->label = 1;
    else if (pos_w < neg_w)
        node->label = -1;
    else if (node->parent)
        node->label = node->parent->label;
    else
        node->label = 0;    /* root node is evenly balanced */

    /* If branch is small, at maximum depth, or pure, make leaf */
    if (n < min_points || level == max_level || pos == 0 || neg == 0)
        return;

    int bestcol = -1;               /* feature with best split */
    int bestrow = first+n-1;        /* best row to split for best feature */
    double bestthresh = 0.0;        /* data[bestrow][bestcol] */
    double Pmin = GINI(pos_w, tot); /* minimum impurity seen so far */

    /* Sort table by each feature in turn and keep the best split. */
    for (int col = 0; col < D-1; ++col) {
        double impurity;    /* impurity of the best split in this column */

        Sort(data, first, first+n-1, col);
        /* offset of the split row relative to `first`, in [-1, n-1] */
        int localrow = WeightedBestSplit(data, n, first, col, pos_w, tot, &impurity);

        /* -1 means no row of this branch is a split point; first-1 would
           index a row outside the branch (or before the table). */
        if (localrow < 0)
            continue;

        int row = first + localrow;

        /* If current column has better impurity, save col, thresh, and Pmin */
        if (impurity < Pmin) {
            bestcol = col;
            bestrow = row;
            bestthresh = data[row][col];
            Pmin = impurity;
        }
    }

    /* If splitting doesn't improve purity (best split is at the end) stop */
    if (bestcol < 0 || bestrow == first+n-1)
        return;

    /* Restore the ordering by the winning feature so that rows
       first..bestrow form the left branch. */
    Sort(data, first, first+n-1, bestcol);

    /* Create right and left children */
    Node *l = malloc(sizeof *l);
    Node *r = malloc(sizeof *r);
    if (!l || !r) {
        /* keep this node a valid leaf instead of dereferencing NULL */
        perror("Allocation failure for child nodes");
        free(l);
        free(r);
        return;
    }

    /* For feature, threshold with best impurity, save to node attributes */
    node->index = bestcol;
    node->threshold = bestthresh;
    printf("Best feature: %d, Best thresh: %f, Impurity: %f\n", node->index, node->threshold, Pmin);

    l->parent = node;
    r->parent = node;
    l->right = NULL;
    l->left = NULL;
    r->right = NULL;
    r->left = NULL;
    node->left = l;
    node->right = r;

    int first_r = bestrow+1;
    int n_l = first_r - first;
    int n_r = n - n_l;

    SplitNode(l, data, n_l, first, level+1);
    SplitNode(r, data, n_r, first_r, level+1);
}
int main (int argv, char **argc) { ///////////////////////// ////// SAME IN EVERY FILE ///////////////////////// // create context and command queue cl_context __sheets_context; cl_command_queue __sheets_queue; int _i; cl_int __cl_err; create_context_on(SHEETS_PLAT_NAME, SHEETS_DEV_NAME, 0, /* choose the first (only) available device */ &__sheets_context, &__sheets_queue, 0); // compile kernels for (_i = 0; _i < NKERNELS; _i++) { compiled_kernels[_i] = kernel_from_string(__sheets_context, kernel_strings[_i], kernel_names[_i], SHEETS_KERNEL_COMPILE_OPTS); } ////// [END] size_t __SIZE_wav = atoi(argc[1]); float wav[__SIZE_wav]; const char *file_name = "mytune.wav"; int in_thrsh_cnt = 0; timestamp_type st; timestamp_type end; get_timestamp(&st); for (_i = 0; _i < __SIZE_wav; _i++) { wav[_i] = (float) rand() / RAND_MAX; if (in_thrsh(wav[_i], 0.1112, 0.7888)) in_thrsh_cnt++; } get_timestamp(&end); printf("cpu execution took %f seconds\n", timestamp_diff_in_seconds(st, end)); get_timestamp(&st); ///////////////// ////// GFUNC CALL ///////////////// /// create variables for function arguments given as literals float __PRIM_band_restrict_ARG2 = 0.1112f; float __PRIM_band_restrict_ARG3 = 0.7888f; /// return array (always arg0) cl_mem __CLMEM_band_restrict_ARG0 = clCreateBuffer(__sheets_context, CL_MEM_WRITE_ONLY, sizeof(float) * __SIZE_wav, NULL, &__cl_err); CHECK_CL_ERROR(__cl_err, "clCreateBuffer"); /// input arrays cl_mem __CLMEM_band_restrict_ARG1 = clCreateBuffer(__sheets_context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(float) * __SIZE_wav, (void *) wav, &__cl_err); CHECK_CL_ERROR(__cl_err, "clCreateBuffer"); /// write to device memory CALL_CL_GUARDED(clEnqueueWriteBuffer, (__sheets_queue, __CLMEM_band_restrict_ARG1, CL_TRUE, /* blocking write */ 0, /* no offset */ sizeof(float) * __SIZE_wav, wav, 0, /* no wait list */ NULL, NULL) ); /// set up kernel arguments SET_4_KERNEL_ARGS(compiled_kernels[0], __CLMEM_band_restrict_ARG0, __CLMEM_band_restrict_ARG1, 
__PRIM_band_restrict_ARG2, __PRIM_band_restrict_ARG3); /// enqueue kernel cl_event __CLEVENT_band_restrict_CALL; CALL_CL_GUARDED(clEnqueueNDRangeKernel, (__sheets_queue, compiled_kernels[0], 1, /* 1 dimension */ 0, /* 0 offset */ &__SIZE_wav, NULL, /* let OpenCL break things up */ 0, /* no events in wait list */ NULL, /* empty wait list */ &__CLEVENT_band_restrict_CALL) ); /// allocate space for cpu return array float out[__SIZE_wav]; CALL_CL_GUARDED(clEnqueueReadBuffer, (__sheets_queue, __CLMEM_band_restrict_ARG0, CL_TRUE, /* blocking read */ 0, /* 0 offset */ sizeof(float) * __SIZE_wav, /* read whole buffer */ (void *) out, /* host pointer */ 1, /* wait for gfunc to finish */ &__CLEVENT_band_restrict_CALL, /* "" */ NULL) /* no need to wait for this call though */ ); ////// [END] GFUNC CALL get_timestamp(&end); printf("gfunc call took %f seconds\n", timestamp_diff_in_seconds(st, end)); ////// Validate call int c = 0; for (_i = 0; _i < __SIZE_wav; _i++) { if (in_thrsh(out[_i], 0.1112, 0.7888)) { c++; } else if(out[_i]) { exit(1); } } printf("\n"); assert(in_thrsh_cnt == c); ////////////// ////// CLEANUP ////////////// CALL_CL_GUARDED(clReleaseMemObject, (__CLMEM_band_restrict_ARG0)); CALL_CL_GUARDED(clReleaseMemObject, (__CLMEM_band_restrict_ARG1)); for (_i = 0; _i < NKERNELS; _i++) { CALL_CL_GUARDED(clReleaseKernel, (compiled_kernels[_i])); } CALL_CL_GUARDED(clReleaseCommandQueue, (__sheets_queue)); CALL_CL_GUARDED(clReleaseContext, (__sheets_context)); return 0; }
int main (int argc, char *argv[]) { double *a, *b, *c; if (argc != 3) { fprintf(stderr, "Usage: %s size_of_vector num_adds\n", argv[0]); abort(); } const cl_long N = (cl_long) atol(argv[1]); const int num_adds = atoi(argv[2]); cl_context ctx; cl_command_queue queue; create_context_on(CHOOSE_INTERACTIVELY, CHOOSE_INTERACTIVELY, 0, &ctx, &queue, 0); print_device_info_from_queue(queue); // -------------------------------------------------------------------------- // load kernels // -------------------------------------------------------------------------- char *knl_text = read_file("vec-add-kernel.cl"); cl_kernel knl = kernel_from_string(ctx, knl_text, "sum", NULL); free(knl_text); // -------------------------------------------------------------------------- // allocate and initialize CPU memory // -------------------------------------------------------------------------- posix_memalign((void**)&a, 32, N*sizeof(double)); if (!a) { fprintf(stderr, "alloc a"); abort(); } posix_memalign((void**)&b, 32, N*sizeof(double)); if (!b) { fprintf(stderr, "alloc b"); abort(); } posix_memalign((void**)&c, 32, N*sizeof(double)); if (!c) { fprintf(stderr, "alloc c"); abort(); } for(cl_long n = 0; n < N; ++n) { a[n] = n; b[n] = 2*n; } // -------------------------------------------------------------------------- // allocate device memory // -------------------------------------------------------------------------- cl_int status; cl_mem buf_a = clCreateBuffer(ctx, CL_MEM_READ_WRITE, sizeof(double) * N, 0, &status); CHECK_CL_ERROR(status, "clCreateBuffer"); cl_mem buf_b = clCreateBuffer(ctx, CL_MEM_READ_WRITE, sizeof(double) * N, 0, &status); CHECK_CL_ERROR(status, "clCreateBuffer"); cl_mem buf_c = clCreateBuffer(ctx, CL_MEM_READ_WRITE, sizeof(double) * N, 0, &status); CHECK_CL_ERROR(status, "clCreateBuffer"); // -------------------------------------------------------------------------- // transfer to device // -------------------------------------------------------------------------- 
CALL_CL_SAFE(clEnqueueWriteBuffer( queue, buf_a, /*blocking*/ CL_TRUE, /*offset*/ 0, N * sizeof(double), a, 0, NULL, NULL)); CALL_CL_SAFE(clEnqueueWriteBuffer( queue, buf_b, /*blocking*/ CL_TRUE, /*offset*/ 0, N * sizeof(double), b, 0, NULL, NULL)); // -------------------------------------------------------------------------- // run code on device // -------------------------------------------------------------------------- CALL_CL_SAFE(clFinish(queue)); timestamp_type tic, toc; get_timestamp(&tic); for(int add = 0; add < num_adds; ++add) { SET_4_KERNEL_ARGS(knl, N, buf_a, buf_b, buf_c); size_t local_size[] = { 128 }; size_t global_size[] = { ((N + local_size[0] - 1)/local_size[0])* local_size[0] }; CALL_CL_SAFE(clEnqueueNDRangeKernel(queue, knl, 1, NULL, global_size, local_size, 0, NULL, NULL)); } CALL_CL_SAFE(clFinish(queue)); get_timestamp(&toc); double elapsed = timestamp_diff_in_seconds(tic,toc)/num_adds; printf("%f s\n", elapsed); printf("%f GB/s\n", 3*N*sizeof(double)/1e9/elapsed); // -------------------------------------------------------------------------- // transfer back & check // -------------------------------------------------------------------------- CALL_CL_SAFE(clEnqueueReadBuffer( queue, buf_c, /*blocking*/ CL_TRUE, /*offset*/ 0, N * sizeof(double), c, 0, NULL, NULL)); for(cl_long i = 0; i < N; ++i) if(c[i] != 3*i) { printf("BAD %ld\n", (long)i); abort(); } printf("GOOD\n"); // -------------------------------------------------------------------------- // clean up // -------------------------------------------------------------------------- CALL_CL_SAFE(clReleaseMemObject(buf_a)); CALL_CL_SAFE(clReleaseMemObject(buf_b)); CALL_CL_SAFE(clReleaseMemObject(buf_c)); CALL_CL_SAFE(clReleaseKernel(knl)); CALL_CL_SAFE(clReleaseCommandQueue(queue)); CALL_CL_SAFE(clReleaseContext(ctx)); free(a); free(b); free(c); return 0; }
void main(int argc, char** argv) { //int k = atoi(argv[1]); //int N = pow(2,k); int N=1024; int k=10; float * a = (float *) malloc(sizeof(float)*N* N * 2); float * b = (float *) malloc(sizeof(float) *N*N * 2); float * c = (float *) malloc(sizeof(float) * N*N* 2); float p = 2*M_PI ; for (int i =0; i< N*N; i++) { a[2*i] = 1; a[2*i+1] = 0; b[2*i] = 1; b[2*i+1] = 0; } #if 0 srand(1); for(int i =0;i<N*N;i++) { a[2*i]=sin(i%N *2 *M_PI); //printf("%f\n",uu[2*i]); a[2*i+1] =0 ; } #endif print_platforms_devices(); cl_context ctx; cl_command_queue queue; create_context_on("NVIDIA","GeForce GTX 590",0,&ctx,&queue,0); cl_int status; cl_mem buf_a = clCreateBuffer(ctx, CL_MEM_READ_WRITE, sizeof(float) *N *N* 2 , 0, &status); CHECK_CL_ERROR(status, "clCreateBuffer"); cl_mem buf_b = clCreateBuffer(ctx, CL_MEM_READ_WRITE, sizeof(float) * N *N* 2 , 0, &status); CHECK_CL_ERROR(status, "clCreateBuffer"); cl_mem buf_c = clCreateBuffer(ctx, CL_MEM_READ_WRITE, sizeof(float) * N *N* 2 , 0, &status); CHECK_CL_ERROR(status, "clCreateBuffer"); cl_mem buf_d = clCreateBuffer(ctx, CL_MEM_READ_WRITE, sizeof(float)*N *N* 2 , 0, &status); CHECK_CL_ERROR(status, "clCreateBuffer"); cl_mem buf_e = clCreateBuffer(ctx, CL_MEM_READ_WRITE, sizeof(float) *N *N* 2 , 0, &status); CHECK_CL_ERROR(status, "clCreateBuffer"); cl_mem buf_f = clCreateBuffer(ctx, CL_MEM_READ_WRITE, sizeof(float) *N *N* 2 , 0, &status); CHECK_CL_ERROR(status, "clCreateBuffer"); cl_mem buf_g = clCreateBuffer(ctx, CL_MEM_READ_WRITE, sizeof(float) *N *N* 2 , 0, &status); CHECK_CL_ERROR(status, "clCreateBuffer"); CALL_CL_GUARDED(clEnqueueWriteBuffer, ( queue, buf_a, /*blocking*/ CL_TRUE, /*offset*/ 0, sizeof(float) *N*N*2, a, 0, NULL, NULL)); CALL_CL_GUARDED(clEnqueueWriteBuffer, ( queue, buf_b, /*blocking*/ CL_TRUE, /*offset*/ 0, sizeof(float) *N *N* 2, b, 0, NULL, NULL)); CALL_CL_GUARDED(clEnqueueWriteBuffer, ( queue, buf_c, /*blocking*/ CL_TRUE, /*offset*/ 0, sizeof(float) *N* N*2, c, 0, NULL, NULL)); char *knl_text = 
read_file("vec_add.cl"); cl_kernel vec_add = kernel_from_string(ctx, knl_text, "sum", NULL); free(knl_text); knl_text = read_file("mat_etr_mul.cl"); cl_kernel mat_etr_mul = kernel_from_string(ctx, knl_text, "mult", NULL); free(knl_text); knl_text = read_file("radix-4-float.cl"); cl_kernel fft1D = kernel_from_string(ctx, knl_text, "fft1D", NULL); free(knl_text); knl_text = read_file("radix-4-init.cl"); cl_kernel fft_init = kernel_from_string(ctx, knl_text, "fft1D_init", NULL); free(knl_text); knl_text = read_file("radix-4-interm.cl"); cl_kernel fft_interm = kernel_from_string(ctx, knl_text, "fft1D", NULL); free(knl_text); knl_text = read_file("transpose-soln-gpu.cl"); cl_kernel mat_trans = kernel_from_string(ctx, knl_text, "transpose", NULL); free(knl_text); knl_text = read_file("radix-4-modi.cl"); cl_kernel fft_init_w = kernel_from_string(ctx, knl_text, "fft1D_init", NULL); free(knl_text); knl_text = read_file("vec_zero.cl"); cl_kernel vec_zero = kernel_from_string(ctx, knl_text, "zero", NULL); free(knl_text); knl_text = read_file("reduction.cl"); cl_kernel reduct_mul = kernel_from_string(ctx, knl_text, "reduction_mult", NULL); free(knl_text); knl_text = read_file("reduction1D.cl"); cl_kernel reduct = kernel_from_string(ctx, knl_text, "reduction", NULL); free(knl_text); knl_text = read_file("reduction-init.cl"); cl_kernel reduct_init = kernel_from_string(ctx, knl_text, "reduction_init", NULL); free(knl_text); knl_text = read_file("reduct-energy.cl"); cl_kernel reduct_eng = kernel_from_string(ctx, knl_text, "reduction_eng", NULL); free(knl_text); knl_text = read_file("resid.cl"); cl_kernel resid = kernel_from_string(ctx, knl_text, "resid", NULL); free(knl_text); knl_text = read_file("resid-init.cl"); cl_kernel resid_init = kernel_from_string(ctx, knl_text, "resid_init", NULL); free(knl_text); knl_text = read_file("radix-4-big.cl"); cl_kernel fft_big = kernel_from_string(ctx, knl_text, "fft1D_big", NULL); free(knl_text); knl_text = read_file("radix-4-big-clean.cl"); 
cl_kernel fft_clean = kernel_from_string(ctx, knl_text, "fft1D_clean", NULL); free(knl_text); knl_text = read_file("radix-4-2D.cl"); cl_kernel fft_2D = kernel_from_string(ctx, knl_text, "fft2D_big", NULL); free(knl_text); knl_text = read_file("radix-4-2D-clean.cl"); cl_kernel fft_2D_clean = kernel_from_string(ctx, knl_text, "fft2D_clean", NULL); free(knl_text); knl_text = read_file("mat-trans-3D.cl"); cl_kernel mat_trans_3D = kernel_from_string(ctx, knl_text, "transpose_3D", NULL); free(knl_text); int Ns =1 ; int direction = 1; timestamp_type time1, time2; struct parameter param; param.N = N; param.epsilon = 0.1; param.s =1; float kk =1e-4; param.h = 2*PI/N; param.N = N; param.maxCG = 1000; param.maxN = 5; //Minimum and starting time step float mink = 1e-7; float startk = 1e-4; // Tolerances param.Ntol = 1e-4; param.cgtol = 1e-7; float ksafety = 0.8; float kfact = 1.3; float kfact2 = 1/1.3; float Nfact = 0.7; float CGfact = 0.7; double elapsed ; CALL_CL_GUARDED(clFinish, (queue)); get_timestamp(&time1); //for(int s=0;s<100;s++) //fft_1D_big(buf_a,buf_b,buf_c,N,fft_big,fft_clean,mat_trans,queue,direction,0); //fft_1D_new(buf_a,buf_b,buf_c,N,fft_init,fft_interm, fft1D,queue,direction,0); //fft_1D(buf_a,buf_b,buf_c,N,fft_init, fft1D,queue,direction,0); //fft2D(buf_a,buf_b,buf_c,buf_d,N,fft_init,fft1D,mat_trans,queue, 1); //fft2D_new(buf_a,buf_b,buf_c,buf_d,N,fft_init,fft_interm,fft1D,mat_trans,queue, 1); //fft2D_big(buf_a,buf_b,buf_c,buf_d,N,fft_big,fft_clean,mat_trans,queue,direction); //fft2D_big_new(buf_a,buf_b,buf_c,buf_d,N,fft_2D,fft_2D_clean, //mat_trans,mat_trans_3D,queue,direction); //fft_w(buf_a,buf_b,buf_c,buf_d,buf_e,N,0.1,0,1,fft_init_w,fft_init,fft1D,mat_trans,queue); #if 0 frhs(buf_a,buf_b,buf_c,buf_d,buf_e,¶m,fft1D_init,fft1D,mat_trans, vec_add, queue); #endif #if 0 float E1 = energy(buf_a, buf_b, buf_c,buf_d, buf_e,buf_f,1e-4, ¶m, fft_init,fft1D,mat_trans,reduct_eng, reduct,queue); #endif //float reside = 
residual(buf_a,buf_b,resid,resid_init,queue,N*N); /*fft_d_q(buf_a,buf_b,buf_c,buf_d, N,0.1,k ,1, fft1D_init, fft1D,mat_trans,queue);*/ //for(int j= 0;j<N;j++) //{ //fft_1D_w_orig(buf_a,buf_b,buf_c,N,fft1D_init,fft1D,queue,1,j); //} //fft_shar(buf_a,buf_b,buf_c,buf_d,N,0.1,0,1,fft1D_init,fft1D,mat_trans,queue); //mat__trans(buf_a,buf_b,N,mat_trans,queue,4,0.1,0,1); //double elapsed = reduction_mult(buf_a, buf_b,buf_c,N*N,reduct_mul,reduct,queue); CALL_CL_GUARDED(clFinish, (queue)); get_timestamp(&time1); fft_1D_big(buf_a,buf_b,buf_c,N*N,fft_big,fft_clean,mat_trans,queue,direction,0); CALL_CL_GUARDED(clFinish, (queue)); get_timestamp(&time2); elapsed = timestamp_diff_in_seconds(time1,time2); printf("Hierarchy 1D FFT of size %d array on gpu takes %f s\n", N*N,elapsed); printf("achieve %f GFLOPS \n",6*2*N*N*k/elapsed*1e-9); printf("---------------------------------------------\n"); CALL_CL_GUARDED(clFinish, (queue)); get_timestamp(&time1); fft2D(buf_a,buf_b,buf_c,buf_d,N,fft_init,fft1D,mat_trans,queue, 1); CALL_CL_GUARDED(clFinish, (queue)); get_timestamp(&time2); elapsed = timestamp_diff_in_seconds(time1,time2); printf("Navie 2D FFT of size %d * %d matrix on gpu takes %f s\n", N,N,elapsed); printf("achieve %f GFLOPS \n",6*2*N*N*k/elapsed*1e-9); printf("---------------------------------------------\n"); //printf("data access from global achieve %f GB/s\n",sizeof(float)*2*16*N*N/elapsed*1e-9); CALL_CL_GUARDED(clFinish, (queue)); get_timestamp(&time1); fft2D_new(buf_a,buf_b,buf_c,buf_d,N,fft_init,fft_interm,fft1D,mat_trans,queue, 1); CALL_CL_GUARDED(clFinish, (queue)); get_timestamp(&time2); elapsed = timestamp_diff_in_seconds(time1,time2); printf("local data exchange 2D FFT of size %d * %d matrix on gpu takes %f s\n", N,N,elapsed); printf("achieve %f GFLOPS \n",6*2*N*N*k/elapsed*1e-9); printf("---------------------------------------------\n"); CALL_CL_GUARDED(clFinish, (queue)); get_timestamp(&time1); 
fft2D_big(buf_a,buf_b,buf_c,buf_d,N,fft_big,fft_clean,mat_trans,queue,direction); CALL_CL_GUARDED(clFinish, (queue)); get_timestamp(&time2); elapsed = timestamp_diff_in_seconds(time1,time2); printf("Hierarchy 2D FFT of size %d * %d matrix on gpu takes %f s\n", N,N,elapsed); printf("achieve %f GFLOPS \n",6*2*N*N*k/elapsed*1e-9); printf("---------------------------------------------\n"); CALL_CL_GUARDED(clFinish, (queue)); get_timestamp(&time1); fft2D_big_new(buf_a,buf_b,buf_c,buf_d,N,fft_2D,fft_2D_clean, mat_trans,mat_trans_3D,queue,direction); CALL_CL_GUARDED(clFinish, (queue)); get_timestamp(&time2); elapsed = timestamp_diff_in_seconds(time1,time2); printf("Using 2D kernel 2D FFT of size %d * %d matrix on gpu takes %f s\n", N,N,elapsed); printf("achieve %f GFLOPS \n",6*2*N*N*k/elapsed*1e-9); printf("---------------------------------------------\n"); get_timestamp(&time1); direction = -1; //fft_1D(buf_b,buf_c,buf_d,N,fft_init, fft1D,queue,direction,0); fft2D(buf_b,buf_c,buf_d,buf_e,N,fft_init,fft1D,mat_trans,queue, direction); //fft2D_new(buf_b,buf_c,buf_e,buf_d,N,fft_init,fft_interm,fft1D,mat_trans,queue, -1); //fft2D_big(buf_b,buf_c,buf_d,buf_e,N,fft_big,fft_clean,mat_trans,queue,direction); CALL_CL_GUARDED(clFinish, (queue)); get_timestamp(&time2); elapsed = timestamp_diff_in_seconds(time1,time2); //printf("1D inverse %f s\n", elapsed); #if 0 float test; CALL_CL_GUARDED(clFinish, (queue)); CALL_CL_GUARDED(clEnqueueReadBuffer, ( queue, buf_b, /*blocking*/ CL_TRUE, /*offset*/ 0, sizeof(float), &test, 0, NULL, NULL)); printf("test success and %f \n",test); #endif #if 0 CALL_CL_GUARDED(clFinish, (queue)); CALL_CL_GUARDED(clEnqueueReadBuffer, ( queue, buf_c, /*blocking*/ CL_TRUE, /*offset*/ 0, 2*N*N* sizeof(float), c, 0, NULL, NULL)); /*for(int i =0; i< N; i++) { printf("a%f+ i*",a[2*i]); printf("%f\n",a[2*i+1]); }*/ int T = 10<N? 
10:N ; for(int i =0; i< T; i++) { printf("%f + i*",a[2*i]); printf("%f\t",a[2*i+1]); printf("%f + i*",c[2*i]); printf("%f\n",c[2*i+1]); } #endif /* for( Ns = 1;Ns < N; Ns *= 2 ) { for (int j = 0; j<N/2; j++) { fftiteration(j,N,Ns,a,b); } float * d; d = a ; a = b; b = d; //printf("ok\n"); } */ CALL_CL_GUARDED(clReleaseMemObject, (buf_a)); CALL_CL_GUARDED(clReleaseMemObject, (buf_b)); CALL_CL_GUARDED(clReleaseMemObject, (buf_c)); CALL_CL_GUARDED(clReleaseMemObject, (buf_d)); CALL_CL_GUARDED(clReleaseMemObject, (buf_e)); CALL_CL_GUARDED(clReleaseKernel, (fft1D)); CALL_CL_GUARDED(clReleaseKernel, (fft_init)); CALL_CL_GUARDED(clReleaseKernel, (vec_add)); CALL_CL_GUARDED(clReleaseKernel, (reduct_mul)); CALL_CL_GUARDED(clReleaseKernel, (reduct)); CALL_CL_GUARDED(clReleaseKernel, (mat_trans)); CALL_CL_GUARDED(clReleaseCommandQueue, (queue)); CALL_CL_GUARDED(clReleaseContext, (ctx)); }
int main(int argc, char **argv) { char cmd[1000]; FILE * fp; FILE * sigDb; char * fileName = NULL; char * sigPattern = NULL; size_t len = 0; size_t sigLen = 0; ssize_t readFile; ssize_t readSig; uint8_t *fileBuf; uint8_t *sigBuf; size_t sizeFb; uint8_t *found; int count=0; strcpy(cmd, "find "); strcat(cmd, MOUNT); strcat(cmd, " -type f > filesToScan.txt"); system(cmd); fp = fopen("filesToScan.txt", "r"); if (fp == NULL) exit(EXIT_FAILURE); //sigDb = fopen("mainCPUsig.ndb","r"); timestamp_type time1, time2; get_timestamp(&time1); while ((readFile = getline(&fileName, &len, fp)) != -1) { printf("scaning: %s", fileName); remove_char_from_string('\n',fileName); loadFile(fileName, &fileBuf, &sizeFb); sigDb = fopen("mainCPUsig5k.ndb","r"); while ((readSig = getline(&sigPattern, &sigLen, sigDb)) != -1){ remove_char_from_string('\n',sigPattern); sigLen = strlen(sigPattern)/2; hex2data(&sigBuf, sigPattern); found = boyer_moore(fileBuf,sizeFb, sigBuf, sigLen); if(found != NULL){ printf(" found virus in %s\n", fileName); count++; } free(sigBuf); } fclose(sigDb); printf("\n"); free(fileBuf); } fclose(fp); get_timestamp(&time2); double elapsed = timestamp_diff_in_seconds(time1,time2); printf("%f s\n", elapsed); printf("virus count: %d\n", count); //loadFile(argv[1], &fileBuf, &sizeFb); }
int main (int argc, char *argv[]){ int N, i, j; double h, h2, f, r, r0, tol, rt; double *u, *unew; int MAX_ITER; if (argc != 3){ fprintf(stderr, "must input discretization size N and # of iterations\n"); exit(0); } N = atoi( argv[1] ); MAX_ITER = atoi( argv[2] ); h = 1./(N+1); h2 = h*h; f = 1.; // allocate u, unew u = (double *) malloc( (N+2) * sizeof(double)); unew = (double *) malloc( (N+2) * sizeof(double)); // fill arrays for(j = 0; j<=N+1; j++){ u[j] = 0.; unew[j] = 0.; } /* // initial residual r0 = 0.0; for(j=1; j<=N; j++){ rt = (-u[j-1] + 2.*u[j] - u[j+1]) / h2 - f; r0 += rt*rt; } r0 = sqrt(r0/N); */ #pragma omp parallel { printf("Hello, I am thread %d of %d\n", omp_get_thread_num(), omp_get_num_threads()); } timestamp_type start_t, stop_t; get_timestamp(&start_t); /* r = r0; while (r/r0 > tol){ */ for(i = 0; i < MAX_ITER; i++ ){ #pragma omp parallel for default(none) \ schedule(static) \ shared(u,unew,h2,f,N) //jacobi iteration for(j= 1; j <=N ; j++){ unew[j] = (h2*f + u[j-1] + u[j+1] ) * 0.5; } // printf("Thread %d done\n", omp_get_thread_num()); #pragma omp parallel for default(none) \ schedule(static) \ shared(u,unew,h2,f,N) // copy work for(j= 1; j <=N ; j++){ u[j] = unew[j]; } /* r = 0.0; // compute residual for(j=1; j <= N ; j++){ rt = (-u[j-1] + 2.*u[j] - u[j+1]) / h2 - f; r = r + rt*rt; } r = sqrt(r/N); */ // printf("the residual at %dth iteration is %.14f\n", i+1, r); } get_timestamp(&stop_t); double elapsed = timestamp_diff_in_seconds(start_t, stop_t); printf("Total number of iterations is %d\n", MAX_ITER); printf("Time elapsed is %f seconds.\n", elapsed); // printf("the residual at %dth iteration is %.14f\n", MAX_ITER, r); free(u); free(unew); return 0; }
int main (int argc, char *argv[]) { double *a, *a_reduced; if (argc != 3) { fprintf(stderr, "Usage: %s N nloops\n", argv[0]); abort(); } const cl_long N = (cl_long) atol(argv[1]); const int nloops = atoi(argv[2]); cl_long Ngroups = (N + LDIM - 1)/LDIM; Ngroups = (Ngroups + 8 - 1)/8; cl_context ctx; cl_command_queue queue; create_context_on(CHOOSE_INTERACTIVELY, CHOOSE_INTERACTIVELY, 0, &ctx, &queue, 0); print_device_info_from_queue(queue); // -------------------------------------------------------------------------- // load kernels // -------------------------------------------------------------------------- char *knl_text = read_file("full_reduction.cl"); cl_kernel knl = kernel_from_string(ctx, knl_text, "reduction", "-DLDIM=" STRINGIFY(LDIM)); free(knl_text); // -------------------------------------------------------------------------- // allocate and initialize CPU memory // -------------------------------------------------------------------------- posix_memalign((void**)&a, 32, N*sizeof(double)); if (!a) { fprintf(stderr, "alloc a"); abort(); } posix_memalign((void**)&a_reduced, 32, Ngroups*sizeof(double)); if (!a_reduced) { fprintf(stderr, "alloc a_reduced"); abort(); } srand48(8); for(cl_long n = 0; n < N; ++n) a[n] = (double)drand48(); // a[n] = n; // -------------------------------------------------------------------------- // allocate device memory // -------------------------------------------------------------------------- cl_int status; cl_mem buf_a = clCreateBuffer(ctx, CL_MEM_READ_WRITE, N*sizeof(double), 0, &status); CHECK_CL_ERROR(status, "clCreateBuffer"); cl_mem buf_a_reduced[2]; buf_a_reduced[0] = clCreateBuffer(ctx, CL_MEM_READ_WRITE, Ngroups*sizeof(double), 0, &status); buf_a_reduced[1] = clCreateBuffer(ctx, CL_MEM_READ_WRITE, Ngroups*sizeof(double), 0, &status); CHECK_CL_ERROR(status, "clCreateBuffer"); // -------------------------------------------------------------------------- // transfer to device // 
-------------------------------------------------------------------------- CALL_CL_SAFE(clEnqueueWriteBuffer( queue, buf_a, /*blocking*/ CL_TRUE, /*offset*/ 0, N*sizeof(double), a, 0, NULL, NULL)); timestamp_type tic, toc; double elapsed; // -------------------------------------------------------------------------- // run reduction_simple on device // -------------------------------------------------------------------------- printf("Simple Reduction\n"); double sum_gpu = 0.0; CALL_CL_SAFE(clFinish(queue)); get_timestamp(&tic); for(int loop = 0; loop < nloops; ++loop) { int r = 0; size_t Ngroups_loop = Ngroups; SET_3_KERNEL_ARGS(knl, N, buf_a, buf_a_reduced[r]); size_t local_size[] = { LDIM }; size_t global_size[] = { Ngroups_loop*LDIM }; CALL_CL_SAFE(clEnqueueNDRangeKernel(queue, knl, 1, NULL, global_size, local_size, 0, NULL, NULL)); while(Ngroups_loop > 1) { cl_long N_reduce = Ngroups_loop; Ngroups_loop = (N_reduce + LDIM - 1)/LDIM; Ngroups_loop = (Ngroups_loop + 8 - 1)/8; size_t local_size[] = { LDIM }; size_t global_size[] = { Ngroups_loop*LDIM }; SET_3_KERNEL_ARGS(knl, N_reduce, buf_a_reduced[r], buf_a_reduced[(r+1)%2]); CALL_CL_SAFE(clEnqueueNDRangeKernel(queue, knl, 1, NULL, global_size, local_size, 0, NULL, NULL)); r = (r+1)%2; } CALL_CL_SAFE(clEnqueueReadBuffer( queue, buf_a_reduced[r], /*blocking*/ CL_TRUE, /*offset*/ 0, Ngroups_loop*sizeof(double), a_reduced, 0, NULL, NULL)); sum_gpu = 0.0; for(cl_long n = 0; n < Ngroups_loop; ++n) sum_gpu += a_reduced[n]; } CALL_CL_SAFE(clFinish(queue)); get_timestamp(&toc); elapsed = timestamp_diff_in_seconds(tic,toc)/nloops; printf("%f s\n", elapsed); printf("%f GB/s\n", N*sizeof(double)/1e9/elapsed); double sum_cpu = 0.0; for(cl_long n = 0; n < N; ++n) sum_cpu += a[n]; printf("Sum CPU: %e\n", sum_cpu); printf("Sum GPU: %e\n", sum_gpu); printf("Relative Error: %e\n", fabs(sum_cpu-sum_gpu)/sum_gpu); // -------------------------------------------------------------------------- // clean up // 
-------------------------------------------------------------------------- CALL_CL_SAFE(clReleaseMemObject(buf_a)); CALL_CL_SAFE(clReleaseMemObject(buf_a_reduced[0])); CALL_CL_SAFE(clReleaseMemObject(buf_a_reduced[1])); CALL_CL_SAFE(clReleaseKernel(knl)); CALL_CL_SAFE(clReleaseCommandQueue(queue)); CALL_CL_SAFE(clReleaseContext(ctx)); free(a); free(a_reduced); return 0; }
int main(int argc, char *argv[]) { int error, xsize, ysize, rgb_max; int *r, *b, *g; float *gray, *congray, *congray_cl; // identity kernel // float filter[] = { // 0,0,0,0,0,0,0, // 0,0,0,0,0,0,0, // 0,0,0,0,0,0,0, // 0,0,0,1,0,0,0, // 0,0,0,0,0,0,0, // 0,0,0,0,0,0,0, // 0,0,0,0,0,0,0, // }; // 45 degree motion blur float filter[] = {0, 0, 0, 0, 0, 0.0145, 0, 0, 0, 0, 0, 0.0376, 0.1283, 0.0145, 0, 0, 0, 0.0376, 0.1283, 0.0376, 0, 0, 0, 0.0376, 0.1283, 0.0376, 0, 0, 0, 0.0376, 0.1283, 0.0376, 0, 0, 0, 0.0145, 0.1283, 0.0376, 0, 0, 0, 0, 0, 0.0145, 0, 0, 0, 0, 0}; // mexican hat kernel // float filter[] = { // 0, 0,-1,-1,-1, 0, 0, // 0,-1,-3,-3,-3,-1, 0, // -1,-3, 0, 7, 0,-3,-1, // -1,-3, 7,24, 7,-3,-1, // -1,-3, 0, 7, 0,-3,-1, // 0,-1,-3,-3,-3,-1, 0, // 0, 0,-1,-1,-1, 0, 0 // }; if(argc != 3) { fprintf(stderr, "Usage: %s image.ppm num_loops\n", argv[0]); abort(); } const char* filename = argv[1]; const int num_loops = atoi(argv[2]); // -------------------------------------------------------------------------- // load image // -------------------------------------------------------------------------- printf("Reading ``%s''\n", filename); ppma_read(filename, &xsize, &ysize, &rgb_max, &r, &g, &b); printf("Done reading ``%s'' of size %dx%d\n", filename, xsize, ysize); // -------------------------------------------------------------------------- // allocate CPU buffers // -------------------------------------------------------------------------- posix_memalign((void**)&gray, 32, xsize*ysize*sizeof(float)); if(!gray) { fprintf(stderr, "alloc gray"); abort(); } posix_memalign((void**)&congray, 32, xsize*ysize*sizeof(float)); if(!congray) { fprintf(stderr, "alloc gray"); abort(); } posix_memalign((void**)&congray_cl, 32, xsize*ysize*sizeof(float)); if(!congray_cl) { fprintf(stderr, "alloc gray"); abort(); } // -------------------------------------------------------------------------- // convert image to grayscale // 
-------------------------------------------------------------------------- for(int n = 0; n < xsize*ysize; ++n) gray[n] = (0.21f*r[n])/rgb_max + (0.72f*g[n])/rgb_max + (0.07f*b[n])/rgb_max; // -------------------------------------------------------------------------- // execute filter on cpu // -------------------------------------------------------------------------- for(int i = HALF_FILTER_WIDTH; i < ysize - HALF_FILTER_WIDTH; ++i) { for(int j = HALF_FILTER_WIDTH; j < xsize - HALF_FILTER_WIDTH; ++j) { float sum = 0; for(int k = -HALF_FILTER_WIDTH; k <= HALF_FILTER_WIDTH; ++k) { for(int l = -HALF_FILTER_WIDTH; l <= HALF_FILTER_WIDTH; ++l) { sum += gray[(i+k)*xsize + (j+l)] * filter[(k+HALF_FILTER_WIDTH)*FILTER_WIDTH + (l+HALF_FILTER_WIDTH)]; } } congray[i*xsize + j] = sum; } } // -------------------------------------------------------------------------- // output cpu filtered image // -------------------------------------------------------------------------- printf("Writing cpu filtered image\n"); for(int n = 0; n < xsize*ysize; ++n) r[n] = g[n] = b[n] = (int)(congray[n] * rgb_max); error = ppma_write("output_cpu.ppm", xsize, ysize, r, g, b); if(error) { fprintf(stderr, "error writing image"); abort(); } // -------------------------------------------------------------------------- // get an OpenCL context and queue // -------------------------------------------------------------------------- cl_context ctx; cl_command_queue queue; create_context_on(CHOOSE_INTERACTIVELY, CHOOSE_INTERACTIVELY, 0, &ctx, &queue, 0); print_device_info_from_queue(queue); // -------------------------------------------------------------------------- // load kernels // -------------------------------------------------------------------------- char *knl_text = read_file("convolution.cl"); cl_kernel knl = kernel_from_string(ctx, knl_text, "convolution", NULL); free(knl_text); #ifdef NON_OPTIMIZED int deviceWidth = xsize; #else int deviceWidth = ((xsize + WGX - 1)/WGX)* WGX; #endif int 
deviceHeight = ysize; size_t deviceDataSize = deviceHeight*deviceWidth*sizeof(float); // -------------------------------------------------------------------------- // allocate device memory // -------------------------------------------------------------------------- cl_int status; cl_mem buf_gray = clCreateBuffer(ctx, CL_MEM_READ_ONLY, deviceDataSize, 0, &status); CHECK_CL_ERROR(status, "clCreateBuffer"); cl_mem buf_congray = clCreateBuffer(ctx, CL_MEM_WRITE_ONLY, deviceDataSize, 0, &status); CHECK_CL_ERROR(status, "clCreateBuffer"); cl_mem buf_filter = clCreateBuffer(ctx, CL_MEM_READ_ONLY, FILTER_WIDTH*FILTER_WIDTH*sizeof(float), 0, &status); CHECK_CL_ERROR(status, "clCreateBuffer"); // -------------------------------------------------------------------------- // transfer to device // -------------------------------------------------------------------------- #ifdef NON_OPTIMIZED CALL_CL_SAFE(clEnqueueWriteBuffer( queue, buf_gray, /*blocking*/ CL_TRUE, /*offset*/ 0, deviceDataSize, gray, 0, NULL, NULL)); #else size_t buffer_origin[3] = {0,0,0}; size_t host_origin[3] = {0,0,0}; size_t region[3] = {deviceWidth*sizeof(float), ysize, 1}; clEnqueueWriteBufferRect(queue, buf_gray, CL_TRUE, buffer_origin, host_origin, region, deviceWidth*sizeof(float), 0, xsize*sizeof(float), 0, gray, 0, NULL, NULL); #endif CALL_CL_SAFE(clEnqueueWriteBuffer( queue, buf_filter, /*blocking*/ CL_TRUE, /*offset*/ 0, FILTER_WIDTH*FILTER_WIDTH*sizeof(float), filter, 0, NULL, NULL)); // -------------------------------------------------------------------------- // run code on device // -------------------------------------------------------------------------- cl_int rows = ysize; cl_int cols = xsize; cl_int filterWidth = FILTER_WIDTH; cl_int paddingPixels = 2*HALF_FILTER_WIDTH; size_t local_size[] = { WGX, WGY }; size_t global_size[] = { ((xsize-paddingPixels + local_size[0] - 1)/local_size[0])* local_size[0], ((ysize-paddingPixels + local_size[1] - 1)/local_size[1])* local_size[1], }; cl_int 
localWidth = local_size[0] + paddingPixels; cl_int localHeight = local_size[1] + paddingPixels; size_t localMemSize = localWidth * localHeight * sizeof(float); CALL_CL_SAFE(clSetKernelArg(knl, 0, sizeof(buf_gray), &buf_gray)); CALL_CL_SAFE(clSetKernelArg(knl, 1, sizeof(buf_congray), &buf_congray)); CALL_CL_SAFE(clSetKernelArg(knl, 2, sizeof(buf_filter), &buf_filter)); CALL_CL_SAFE(clSetKernelArg(knl, 3, sizeof(rows), &rows)); CALL_CL_SAFE(clSetKernelArg(knl, 4, sizeof(cols), &cols)); CALL_CL_SAFE(clSetKernelArg(knl, 5, sizeof(filterWidth), &filterWidth)); CALL_CL_SAFE(clSetKernelArg(knl, 6, localMemSize, NULL)); CALL_CL_SAFE(clSetKernelArg(knl, 7, sizeof(localHeight), &localHeight)); CALL_CL_SAFE(clSetKernelArg(knl, 8, sizeof(localWidth), &localWidth)); // -------------------------------------------------------------------------- // print kernel info // -------------------------------------------------------------------------- print_kernel_info(queue, knl); CALL_CL_SAFE(clFinish(queue)); timestamp_type tic, toc; get_timestamp(&tic); for(int loop = 0; loop < num_loops; ++loop) { CALL_CL_SAFE(clEnqueueNDRangeKernel(queue, knl, 2, NULL, global_size, local_size, 0, NULL, NULL)); // Edit: Copy the blurred image to input buffer #ifdef NON_OPTIMIZED CALL_CL_SAFE(clEnqueueCopyBuffer(queue, buf_congray, buf_gray, 0, 0, deviceDataSize, 0, NULL, NULL)); #else clEnqueueCopyBufferRect(queue, buf_congray, buf_gray, buffer_origin, host_origin, region, deviceWidth*sizeof(float), 0, xsize*sizeof(float), 0, 0, NULL, NULL); #endif } CALL_CL_SAFE(clFinish(queue)); get_timestamp(&toc); double elapsed = timestamp_diff_in_seconds(tic,toc)/num_loops; printf("%f s\n", elapsed); printf("%f MPixels/s\n", xsize*ysize/1e6/elapsed); printf("%f GBit/s\n", 2*xsize*ysize*sizeof(float)/1e9/elapsed); printf("%f GFlop/s\n", (xsize-HALF_FILTER_WIDTH)*(ysize-HALF_FILTER_WIDTH) *FILTER_WIDTH*FILTER_WIDTH/1e9/elapsed); // -------------------------------------------------------------------------- // 
transfer back & check // -------------------------------------------------------------------------- #ifdef NON_OPTIMIZED CALL_CL_SAFE(clEnqueueReadBuffer( queue, buf_congray, /*blocking*/ CL_TRUE, /*offset*/ 0, xsize * ysize * sizeof(float), congray_cl, 0, NULL, NULL)); #else buffer_origin[0] = 3*sizeof(float); buffer_origin[1] = 3; buffer_origin[2] = 0; host_origin[0] = 3*sizeof(float); host_origin[1] = 3; host_origin[2] = 0; region[0] = (xsize-paddingPixels)*sizeof(float); region[1] = (ysize-paddingPixels); region[2] = 1; clEnqueueReadBufferRect(queue, buf_congray, CL_TRUE, buffer_origin, host_origin, region, deviceWidth*sizeof(float), 0, xsize*sizeof(float), 0, congray_cl, 0, NULL, NULL); #endif // -------------------------------------------------------------------------- // output OpenCL filtered image // -------------------------------------------------------------------------- printf("Writing OpenCL filtered image\n"); // Edit: Keep pixel value in the interval [0, 255] to reduce boundary effect for(int n = 0; n < xsize*ysize; ++n) { int color = (int)(congray_cl[n] * rgb_max); if (color < 0) { color = 0; } else if (color > 255) { color = 255; } r[n] = g[n] = b[n] = color; } error = ppma_write("output_cl.ppm", xsize, ysize, r, g, b); if(error) { fprintf(stderr, "error writing image"); abort(); } // -------------------------------------------------------------------------- // clean up // -------------------------------------------------------------------------- CALL_CL_SAFE(clReleaseMemObject(buf_congray)); CALL_CL_SAFE(clReleaseMemObject(buf_gray)); CALL_CL_SAFE(clReleaseMemObject(buf_filter)); CALL_CL_SAFE(clReleaseKernel(knl)); CALL_CL_SAFE(clReleaseCommandQueue(queue)); CALL_CL_SAFE(clReleaseContext(ctx)); free(gray); free(congray); free(congray_cl); free(r); free(b); free(g); }
int main(int argc, char **argv) { if (argc != 3) { fprintf(stderr, "need two arguments!\n"); abort(); } const long n = atol(argv[1]); const long size = n*n; const int ntrips = atoi(argv[2]); cl_context ctx; cl_command_queue queue; create_context_on(CHOOSE_INTERACTIVELY, CHOOSE_INTERACTIVELY, 0, &ctx, &queue, 0); cl_int status; // -------------------------------------------------------------------------- // load kernels // -------------------------------------------------------------------------- char *knl_text = read_file("transpose-soln.cl"); cl_kernel knl = kernel_from_string(ctx, knl_text, "transpose", NULL); free(knl_text); // -------------------------------------------------------------------------- // allocate and initialize CPU memory // -------------------------------------------------------------------------- #ifdef USE_PINNED cl_mem buf_a_host = clCreateBuffer(ctx, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, sizeof(value_type) * size, 0, &status); CHECK_CL_ERROR(status, "clCreateBuffer"); cl_mem buf_b_host = clCreateBuffer(ctx, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, sizeof(value_type) * size, 0, &status); CHECK_CL_ERROR(status, "clCreateBuffer"); value_type *a = (value_type *) clEnqueueMapBuffer(queue, buf_a_host, /*blocking*/ CL_TRUE, CL_MAP_WRITE_INVALIDATE_REGION, /*offs*/ 0, sizeof(value_type)*size, 0, NULL, NULL, &status); CHECK_CL_ERROR(status, "clEnqueueMapBuffer"); value_type *b = (value_type *) clEnqueueMapBuffer(queue, buf_b_host, /*blocking*/ CL_TRUE, CL_MAP_WRITE_INVALIDATE_REGION, /*offs*/ 0, sizeof(value_type)*size, 0, NULL, NULL, &status); CHECK_CL_ERROR(status, "clEnqueueMapBuffer"); #else value_type *a = (value_type *) malloc(sizeof(value_type) * size); if (!a) { perror("alloc x"); abort(); } value_type *b = (value_type *) malloc(sizeof(value_type) * size); if (!b) { perror("alloc y"); abort(); } #endif for (size_t j = 0; j < n; ++j) for (size_t i = 0; i < n; ++i) a[i + j*n] = i + j*n; // 
-------------------------------------------------------------------------- // allocate device memory // -------------------------------------------------------------------------- cl_mem buf_a = clCreateBuffer(ctx, CL_MEM_READ_WRITE, sizeof(value_type) * size, 0, &status); CHECK_CL_ERROR(status, "clCreateBuffer"); cl_mem buf_b = clCreateBuffer(ctx, CL_MEM_READ_WRITE, sizeof(value_type) * size, 0, &status); CHECK_CL_ERROR(status, "clCreateBuffer"); // -------------------------------------------------------------------------- // transfer to device // -------------------------------------------------------------------------- CALL_CL_GUARDED(clFinish, (queue)); timestamp_type time1, time2; get_timestamp(&time1); CALL_CL_GUARDED(clEnqueueWriteBuffer, ( queue, buf_a, /*blocking*/ CL_FALSE, /*offset*/ 0, size * sizeof(value_type), a, 0, NULL, NULL)); CALL_CL_GUARDED(clEnqueueWriteBuffer, ( queue, buf_b, /*blocking*/ CL_FALSE, /*offset*/ 0, size * sizeof(value_type), b, 0, NULL, NULL)); get_timestamp(&time2); double elapsed = timestamp_diff_in_seconds(time1,time2); printf("transfer: %f s\n", elapsed); printf("transfer: %f GB/s\n", 2*size*sizeof(value_type)/1e9/elapsed); // -------------------------------------------------------------------------- // run code on device // -------------------------------------------------------------------------- CALL_CL_GUARDED(clFinish, (queue)); get_timestamp(&time1); for (int trip = 0; trip < ntrips; ++trip) { SET_3_KERNEL_ARGS(knl, buf_a, buf_b, n); size_t ldim[] = { 16, 16 }; size_t gdim[] = { n, n }; CALL_CL_GUARDED(clEnqueueNDRangeKernel, (queue, knl, /*dimensions*/ 2, NULL, gdim, ldim, 0, NULL, NULL)); } CALL_CL_GUARDED(clFinish, (queue)); get_timestamp(&time2); elapsed = timestamp_diff_in_seconds(time1,time2)/ntrips; printf("%f s\n", elapsed); printf("%f GB/s\n", 2*size*sizeof(value_type)/1e9/elapsed); CALL_CL_GUARDED(clEnqueueReadBuffer, ( queue, buf_b, /*blocking*/ CL_FALSE, /*offset*/ 0, size * sizeof(value_type), b, 0, NULL, 
NULL)); CALL_CL_GUARDED(clFinish, (queue)); for (size_t i = 0; i < n; ++i) for (size_t j = 0; j < n; ++j) if (a[i + j*n] != b[j + i*n]) { printf("bad %d %d\n", i, j); abort(); } // -------------------------------------------------------------------------- // clean up // -------------------------------------------------------------------------- CALL_CL_GUARDED(clFinish, (queue)); CALL_CL_GUARDED(clReleaseMemObject, (buf_a)); CALL_CL_GUARDED(clReleaseMemObject, (buf_b)); CALL_CL_GUARDED(clReleaseKernel, (knl)); CALL_CL_GUARDED(clReleaseCommandQueue, (queue)); CALL_CL_GUARDED(clReleaseContext, (ctx)); #ifdef USE_PINNED CALL_CL_GUARDED(clReleaseMemObject, (buf_a_host)); CALL_CL_GUARDED(clReleaseMemObject, (buf_b_host)); #else free(a); free(b); #endif return 0; }
int main(int argc, char ** argv){ // check input if (argc != 3) { fprintf(stderr, "in main: need two arguments!\n"); abort(); } // seed the random number generator //srand( (int) time(0)); srand( (int) 4); // parameters const long m = atol(argv[1]); const long n = atol(argv[2]); long mn = 0; // min of m,n long len_beta = 0; if ( m < n){ mn = m; len_beta = mn; } else{ mn = n; len_beta = mn-1; } double a = 1; double b = 2; double tol = 1.0e-9; // big matrix storage double *A = (double *) malloc(sizeof(double) *m*n); if(!A) { fprintf(stderr,"in main: failed to allocate A\n"); abort();} double *A2 = (double *) malloc(sizeof(double) *m*n); if(!A2) { fprintf(stderr,"in main: failed to allocate A2\n"); abort();} double *B = (double *) malloc(sizeof(double) *m*n); if(!B) { fprintf(stderr,"in main: failed to allocate B\n"); abort();} double *A_Copy = (double *) malloc(sizeof(double) *m*n); if(!A_Copy) { fprintf(stderr,"in main: failed to allocate A_Copy\n"); abort();} double *A_Result = (double *) malloc(sizeof(double) *m*n); if(!A_Result) { fprintf(stderr,"in main: failed to allocate A_Result\n"); abort();} double *temp = (double *) malloc(sizeof(double) *m*n); if(!temp) { fprintf(stderr,"in main: failed to allocate temp\n"); abort();} double *temp2 = (double *) malloc(sizeof(double) *m*m); if(!temp2) { fprintf(stderr,"in main: failed to allocate temp2\n"); abort();} double *temp3 = (double *) malloc(sizeof(double) *n*n); if(!temp3) { fprintf(stderr,"in main: failed to allocate temp3\n"); abort();} double *U = (double *) malloc(sizeof(double) *m*m); if(!U) { fprintf(stderr,"in main: failed to allocate U\n"); abort();} double *UT = (double *) malloc(sizeof(double) *m*m); if(!UT) { fprintf(stderr,"in main: failed to allocate UT\n"); abort();} double *V = (double *) malloc(sizeof(double) *n*n); if(!V) { fprintf(stderr,"in main: failed to allocate V\n"); abort();} double *VT = (double *) malloc(sizeof(double) *n*n); if(!VT) { fprintf(stderr,"in main: failed to allocate VT\n"); 
abort();} // diagonal component storage double *alpha = (double *) malloc(sizeof(double) *mn); if(!alpha) { fprintf(stderr,"in main: failed to allocate alpha\n"); abort();} double *beta = (double *) malloc(sizeof(double) *len_beta); if(!beta) { fprintf(stderr,"in main: failed to allocate beta\n"); abort();} double *alpha2 = (double *) malloc(sizeof(double) *mn); if(!alpha) { fprintf(stderr,"in main: failed to allocate alpha2\n"); abort();} double *beta2 = (double *) malloc(sizeof(double) *len_beta); if(!beta) { fprintf(stderr,"in main: failed to allocate beta2\n"); abort();} // fill A, A_Copy for (int i=0; i<m*n; i++){ A[i] = rand_d(a,b); A_Copy[i] = A[i]; A2[i] = A[i]; } timestamp_type time1, time2; // compute the bidiagonal form get_timestamp(&time1); bidiag_par(m,n,A,alpha,beta); get_timestamp(&time2); double elapsed_par = timestamp_diff_in_seconds(time1,time2); printf("time_par = %g\n",elapsed_par); get_timestamp(&time1); bidiag_seq(m,n,A2,alpha,beta); get_timestamp(&time2); double elapsed_seq = timestamp_diff_in_seconds(time1,time2); printf("time_seq = %g\n",elapsed_seq); // form the orthogonal matrices //form_u_par(m,n,A,U); //form_v_par(m,n,A,V); //form_bidiag(m,n,alpha,beta,B); //transpose(n,n,V,VT); //transpose(m,m,U,UT); // check the result of A_Result = U * B * V^T //dgemm_simple(m,n,n,B,VT,temp); //dgemm_simple(m,n,m,U,temp,A_Result); //dgemm_simple(n,n,n,VT,V,temp3); //dgemm_simple(m,m,m,UT,U,temp2); int errors = 0; for (int i=0; i < m*n; i++){ if ( fabs(A[i]-A2[i]) > tol ){ errors++; } } printf("ERRORS = %d\n",errors); //print_matrix(A,m,n,"A = "); //print_matrix(A2,m,n,"A2 = "); //print_matrix(A_Copy,m,n,"A_Copy = "); //print_matrix(A_Result,m,n,"A_Result = "); //print_matrix(B,m,n,"B = "); //print_matrix(U,m,m,"U = "); //print_matrix(V,n,n,"V = "); //print_matrix(temp2,m,m,"temp2 = "); //print_matrix(temp3,n,n,"temp3 = "); free(A); free(A2); free(A_Copy); free(A_Result); free(B); free(temp); free(temp2); free(temp3); free(U); free(UT); free(V); 
free(VT); free(alpha); free(alpha2); free(beta); free(beta2); return 0; }
void runTimings(int use_gpu){ int ntrips = 10; char device_name[256]; timestamp_type time1, time2; //////////////////////////////////////////////////// ///GPU TIMINGS //////////////////////////////////////////////////// init_opencl(use_gpu); load_cl_kernels(&clData); allocate_cl_buffers(&clData); print_device_info_from_queue(clData.queue); get_device_name_from_queue(clData.queue, device_name, 256); transfer_buffers_to_gpu(); double advectionVelocityTimeGPU, advectionDensityTimeGPU, divergenceTimeGPU, projectJacobiTimeGPU, projectCGTimeGPU, pressureApplyTimeGPU; transfer_buffers_to_gpu(); get_timestamp(&time1); for(int i = 0; i < ntrips; ++i) { run_cl_advect_velocity(&clData, dt); } flush_cl_queue(); get_timestamp(&time2); advectionVelocityTimeGPU = timestamp_diff_in_seconds(time1,time2)/ntrips; get_timestamp(&time1); for(int i = 0; i < ntrips; ++i) { run_cl_calculate_divergence(&clData, dt); } flush_cl_queue(); get_timestamp(&time2); divergenceTimeGPU = timestamp_diff_in_seconds(time1,time2)/ntrips; transfer_buffers_to_cpu(); flush_cl_queue(); //This needs ntrips different divergence matrices to get accurate timings. 
//This is because by the time the second time it is called it will detect //the system is solved and exit after one matrix get_timestamp(&time1); for(int i = 0; i < ntrips; ++i) { transfer_cl_float_buffer_from_device(&clData,clData.buf_pressure,g_pressure,clData.n,true); transfer_cl_float_buffer_from_device(&clData,clData.buf_divergence,g_divergence,clData.n,true); run_cl_cg_no_mtx(&clData,g_pressure, g_divergence, g_cg_r, g_cg_d, g_cg_q, clData.n, 10, 0.0001f); flush_cl_queue(); transfer_cl_float_buffer_to_device(&clData,clData.buf_pressure,g_pressure,clData.n,true); } flush_cl_queue(); get_timestamp(&time2); projectCGTimeGPU = timestamp_diff_in_seconds(time1,time2)/ntrips; get_timestamp(&time1); for(int i = 0; i < ntrips; ++i) { for(int i = 0; i < 20; ++i) { run_cl_pressure_solve(&clData, dt); } } flush_cl_queue(); get_timestamp(&time2); projectJacobiTimeGPU = timestamp_diff_in_seconds(time1,time2)/ntrips; get_timestamp(&time1); for(int i = 0; i < ntrips; ++i) { run_cl_pressure_apply(&clData, dt); } flush_cl_queue(); get_timestamp(&time2); pressureApplyTimeGPU = timestamp_diff_in_seconds(time1,time2)/ntrips; get_timestamp(&time1); for(int i = 0; i < ntrips; ++i) { run_cl_advect_density(&clData, dt); } flush_cl_queue(); get_timestamp(&time2); advectionDensityTimeGPU = timestamp_diff_in_seconds(time1,time2)/ntrips; printf("%s\t%dx%dx%d\t%s\t%s\t %3.6f\ts\t", device_name,NX,NY,NZ,"GPU","Advection Velocity",advectionVelocityTimeGPU); printf("%.3f\tMegaCells/s\n",(NX*NY*NZ)*1e-6/advectionVelocityTimeGPU); printf("%s\t%dx%dx%d\t%s\t%s\t %3.6f\ts\t", device_name,NX,NY,NZ,"GPU","Advection Density",advectionDensityTimeGPU); printf("%.3f\tMegaCells/s\n",(NX*NY*NZ)*1e-6/advectionDensityTimeGPU); printf("%s\t%dx%dx%d\t%s\t%s\t %3.6f\ts\t", device_name,NX,NY,NZ,"GPU", "Divergence",divergenceTimeGPU); printf("%.3f\tMegaCells/s\n",(NX*NY*NZ)*1e-6/divergenceTimeGPU); printf("%s\t%dx%dx%d\t%s\t%s\t %3.6f\ts\t", device_name,NX,NY,NZ,"GPU", "Projection 
Jacobi",projectJacobiTimeGPU); printf("%.3f\tMegaCells/s\n",(NX*NY*NZ)*1e-6/projectJacobiTimeGPU); printf("%s\t%dx%dx%d\t%s\t%s\t %3.6f\ts\t",device_name,NX,NY,NZ,"GPU", "Projection Conjugate Gradient",projectCGTimeGPU); printf("%.3f\tMegaCells/s\n",(NX*NY*NZ)*1e-6/projectCGTimeGPU); printf("%s\t%dx%dx%d\t%s\t%s\t %3.6f\ts\t", device_name,NX,NY,NZ,"GPU","Pressure Apply",pressureApplyTimeGPU); printf("%.3f\tMegaCells/s\n",(NX*NY*NZ)*1e-6/pressureApplyTimeGPU); cleanup_cl(&clData); //////////////////////////////////////////////////// ///CPU TIMINGS //////////////////////////////////////////////////// double advectionVelocityTimeCPU, advectionDensityTimeCPU, divergenceTimeCPU, projectJacobiTimeCPU, projectCGTimeCPU, pressureApplyTimeCPU; get_timestamp(&time1); for(int i = 0; i < ntrips; ++i) { advect_velocity_RK2(dt, g_u, g_v, g_w, g_u_prev, g_v_prev, g_w_prev); } get_timestamp(&time2); advectionVelocityTimeCPU = timestamp_diff_in_seconds(time1,time2)/ntrips; //project(dt,g_u,g_v, g_w, g_divergence, g_pressure, g_pressure_prev, g_laplacian_matrix,useCG); get_timestamp(&time1); for(int i = 0; i < ntrips; ++i) { calculate_divergence(g_divergence, g_u, g_v, g_w, dt); } get_timestamp(&time2); divergenceTimeCPU = timestamp_diff_in_seconds(time1,time2)/ntrips; //This needs ntrips different divergence matrices to get accurate timings. 
//This is because by the time the second time it is called it will detect //the system is solved and exit after one matrix get_timestamp(&time1); for(int i = 0; i < ntrips; ++i) { pressure_solve_cg_no_matrix(g_pressure, g_divergence, g_cg_r, g_cg_d, g_cg_q); } get_timestamp(&time2); projectCGTimeCPU = timestamp_diff_in_seconds(time1,time2)/ntrips; get_timestamp(&time1); for(int i = 0; i < ntrips; ++i) { pressure_solve(g_pressure,g_pressure_prev, g_divergence, dt); } get_timestamp(&time2); projectJacobiTimeCPU = timestamp_diff_in_seconds(time1,time2)/ntrips; get_timestamp(&time1); for(int i = 0; i < ntrips; ++i) { pressure_apply(g_u, g_v, g_w, g_pressure, dt); } get_timestamp(&time2); pressureApplyTimeCPU = timestamp_diff_in_seconds(time1,time2)/ntrips; get_timestamp(&time1); for(int i = 0; i < ntrips; ++i) { advectRK2(dt,g_dens,g_dens_prev, g_u, g_v, g_w); } get_timestamp(&time2); advectionDensityTimeCPU = timestamp_diff_in_seconds(time1,time2)/ntrips; printf("%s\t%dx%dx%d\t%s\t%s\t %3.6f\ts\t", device_name,NX,NY,NZ,"CPU","Advection Velocity",advectionVelocityTimeCPU); printf("%.3f\tMegaCells/s\n",(NX*NY*NZ)*1e-6/advectionVelocityTimeCPU); printf("%s\t%dx%dx%d\t%s\t%s\t %3.6f\ts\t", device_name,NX,NY,NZ,"CPU","Advection Density",advectionDensityTimeCPU); printf("%.3f\tMegaCells/s\n",(NX*NY*NZ)*1e-6/advectionDensityTimeCPU); printf("%s\t%dx%dx%d\t%s\t%s\t %3.6f\ts\t", device_name,NX,NY,NZ,"CPU","Divergence",divergenceTimeCPU); printf("%.3f\tMegaCells/s\n",(NX*NY*NZ)*1e-6/divergenceTimeCPU); printf("%s\t%dx%dx%d\t%s\t%s\t %3.6f\ts\t", device_name,NX,NY,NZ,"CPU","Projection Jacobi",projectJacobiTimeCPU); printf("%.3f\tMegaCells/s\n",(NX*NY*NZ)*1e-6/projectJacobiTimeCPU); printf("%s\t%dx%dx%d\t%s\t%s\t %3.6f\ts\t", device_name,NX,NY,NZ,"CPU","Projection Conjugate Gradient",projectCGTimeCPU); printf("%.3f\tMegaCells/s\n",(NX*NY*NZ)*1e-6/projectCGTimeCPU); printf("%s\t%dx%dx%d\t%s\t%s\t %3.6f\ts\t", device_name,NX,NY,NZ,"CPU","Pressure Apply",pressureApplyTimeCPU); 
printf("%.3f\tMegaCells/s\n",(NX*NY*NZ)*1e-6/pressureApplyTimeCPU); }