//This function would create a matrix A[1024][512] and B[512][1024]. For matrix storing AXB the size would be C[1024][1024]. Where C is initialized with 0 and A and B with random values. static void initialize_matrix(void) { unsigned i,j; A=(float *) malloc(zdim*ydim*sizeof(float)); B=(float *) malloc(xdim*zdim*sizeof(float)); C=(float *) malloc(xdim*ydim*sizeof(float)); srand(2013); for (j=0;j<ydim;j++) { for (i=0;i<zdim;i++) { A[j+i*ydim] = (float)(starpu_drand48()); } } for (j=0;j<zdim;j++) { for (i=0;i<xdim;i++) { B[j+i*zdim] = (float)(starpu_drand48()); } } for (j=0;j<ydim;j++) { for (i=0;i<xdim;i++) { C[j+i*ydim]=(float)(0); } } }
static void init_problem_data(void) { unsigned i,j; #ifndef STARPU_SIMGRID starpu_malloc((void **)&A, zdim*ydim*sizeof(TYPE)); starpu_malloc((void **)&B, xdim*zdim*sizeof(TYPE)); starpu_malloc((void **)&C, xdim*ydim*sizeof(TYPE)); /* fill the A and B matrices */ for (j=0; j < ydim; j++) { for (i=0; i < zdim; i++) { A[j+i*ydim] = (TYPE)(starpu_drand48()); } } for (j=0; j < zdim; j++) { for (i=0; i < xdim; i++) { B[j+i*zdim] = (TYPE)(starpu_drand48()); } } for (j=0; j < ydim; j++) { for (i=0; i < xdim; i++) { C[j+i*ydim] = (TYPE)(0); } } #endif }
/* Scheduling-component push method: forward `task` to one child component
 * chosen at random, with probability proportional to each eligible child's
 * relative speedup.
 * Returns 1 when the chosen child is a worker (the worker is only woken up
 * via can_pull and is expected to pull the task itself), the child's own
 * push_task return value otherwise, or -ENODEV when no child can execute
 * the task at all. */
static int random_push_task(struct starpu_sched_component * component, struct starpu_task * task)
{
	STARPU_ASSERT(component->nchildren > 0);

	/* First phase: memoize which children can execute the task.
	 * indexes_components holds the `size` indexes (into component->children)
	 * of the children that can run it. */
	int indexes_components[component->nchildren];
	int size=0;

	/* speedup[i] is only meaningful for i within the first `size` entries,
	 * i.e. for the children recorded in indexes_components. */
	double speedup[component->nchildren];

	double alpha_sum = 0.0;

	int i;
	for(i = 0; i < component->nchildren ; i++)
	{
		if(starpu_sched_component_can_execute_task(component->children[i],task))
		{
			speedup[size] = compute_relative_speedup(component->children[i]);
			alpha_sum += speedup[size];
			indexes_components[size] = i;
			size++;
		}
	}
	if(size == 0)
		return -ENODEV;

	/* Second phase: draw a uniform value in [0, alpha_sum) and walk the
	 * prefix sums of the speedups until the drawn value is covered.
	 * NOTE: not fully certain this is correct in all corner cases because
	 * of the rounding behavior of double arithmetic (original author's
	 * caveat). */
	double random = starpu_drand48()*alpha_sum;
	double alpha = 0.0;
	struct starpu_sched_component * select = NULL;

	for(i = 0; i < size ; i++)
	{
		int index = indexes_components[i];
		if(alpha + speedup[i] >= random)
		{
			select = component->children[index];
			break;
		}
		alpha += speedup[i];
	}
	STARPU_ASSERT(select != NULL);

	if(starpu_sched_component_is_worker(select))
	{
		/* Workers are not pushed to directly: wake the worker so it
		 * pulls the task itself. */
		select->can_pull(select);
		return 1;
	}

	int ret_val = select->push_task(select,task);
	return ret_val;
}
static int run(struct starpu_sched_policy *policy) { int ret; struct starpu_conf conf; int i; starpu_conf_init(&conf); conf.sched_policy = policy; ret = starpu_init(&conf); if (ret != 0) exit(STARPU_TEST_SKIPPED); starpu_profiling_status_set(1); struct starpu_codelet clA = { .cpu_funcs = {A}, .nbuffers = 0 }; struct starpu_codelet clB = { .cpu_funcs = {B}, .nbuffers = 0 }; starpu_srand48(0); for (i = 0; i < NTASKS; i++) { struct starpu_task *task = starpu_task_create(); if (((int)(starpu_drand48()*2))%2) { task->cl = &clA; task->priority=STARPU_MIN_PRIO; } else { task->cl = &clB; task->priority=STARPU_MAX_PRIO; } task->detach=1; ret = starpu_task_submit(task); if (ret == -ENODEV) goto enodev; STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit"); } starpu_task_wait_for_all(); FPRINTF(stdout,"\n"); starpu_shutdown(); return 0; enodev: starpu_shutdown(); return -ENODEV; }
/* Fill one square sub-block of the matrix with pseudo-random values.
 * blockptr points at a contiguous block of block_size*block_size elements,
 * where block_size = psize/pnblocks (global matrix size divided by the
 * number of blocks per dimension). */
static void fill_block_with_random(TYPE *blockptr, unsigned psize, unsigned pnblocks)
{
	const unsigned block_size = (psize/pnblocks);
	unsigned row, col;

	for (row = 0; row < block_size; row++)
		for (col = 0; col < block_size; col++)
			blockptr[col + row*block_size] = (TYPE)starpu_drand48();
}
/* Compute the dot product of two random float vectors _x and _y, split
 * into _nblocks blocks, using one StarPU task per block that accumulates
 * into the shared variable _dot through StarPU's reduction mechanism,
 * then compare against a sequentially-computed reference value.
 * Returns EXIT_SUCCESS/EXIT_FAILURE on a result check, or 77 (skip)
 * when the test cannot run in this environment. */
int main(int argc, char **argv)
{
	int ret;

	/* Not supported yet */
	if (starpu_get_env_number_default("STARPU_GLOBAL_ARBITER", 0) > 0)
		return 77;

	ret = starpu_init(NULL);
	if (ret == -ENODEV)
		return 77;
	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");

#ifdef STARPU_USE_OPENCL
	ret = starpu_opencl_load_opencl_from_file("examples/reductions/dot_product_opencl_kernels.cl",
						  &_opencl_program, NULL);
	STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_load_opencl_from_file");
#endif

#ifdef STARPU_USE_CUDA
	/* cublasSdot has synchronization issues when using a non-blocking stream */
	cublasGetVersion(&cublas_version);
	if (cublas_version >= 7050)
		starpu_cublas_init();
#endif

	unsigned long nelems = _nblocks*_entries_per_block;
	size_t size = nelems*sizeof(float);

	_x = (float *) malloc(size);
	_y = (float *) malloc(size);
	_x_handles = (starpu_data_handle_t *) calloc(_nblocks, sizeof(starpu_data_handle_t));
	_y_handles = (starpu_data_handle_t *) calloc(_nblocks, sizeof(starpu_data_handle_t));

	/* NOTE(review): only the two float buffers are checked; the calloc'd
	 * handle arrays are used unchecked below. */
	assert(_x && _y);

	/* Fill the vectors and compute the reference dot product serially,
	 * accumulating in DOT_TYPE precision. */
	starpu_srand48(0);

	DOT_TYPE reference_dot = 0.0;

	unsigned long i;
	for (i = 0; i < nelems; i++)
	{
		_x[i] = (float)starpu_drand48();
		_y[i] = (float)starpu_drand48();

		reference_dot += (DOT_TYPE)_x[i]*(DOT_TYPE)_y[i];
	}

	/* Register one StarPU vector handle per block of _x and _y. */
	unsigned block;
	for (block = 0; block < _nblocks; block++)
	{
		starpu_vector_data_register(&_x_handles[block], STARPU_MAIN_RAM,
					    (uintptr_t)&_x[_entries_per_block*block], _entries_per_block, sizeof(float));
		starpu_vector_data_register(&_y_handles[block], STARPU_MAIN_RAM,
					    (uintptr_t)&_y[_entries_per_block*block], _entries_per_block, sizeof(float));
	}

	starpu_variable_data_register(&_dot_handle, STARPU_MAIN_RAM, (uintptr_t)&_dot, sizeof(DOT_TYPE));

	/*
	 *	Compute dot product with StarPU
	 */
	starpu_data_set_reduction_methods(_dot_handle, &redux_codelet, &init_codelet);

	for (block = 0; block < _nblocks; block++)
	{
		struct starpu_task *task = starpu_task_create();
		task->cl = &dot_codelet;
		task->destroy = 1;

		task->handles[0] = _x_handles[block];
		task->handles[1] = _y_handles[block];
		task->handles[2] = _dot_handle;

		ret = starpu_task_submit(task);
		if (ret == -ENODEV)
			goto enodev;
		STARPU_ASSERT(!ret);
	}

	/* Unregistering waits for the tasks and synchronizes the handles
	 * back into the user-provided buffers (including the reduced _dot). */
	for (block = 0; block < _nblocks; block++)
	{
		starpu_data_unregister(_x_handles[block]);
		starpu_data_unregister(_y_handles[block]);
	}
	starpu_data_unregister(_dot_handle);

	FPRINTF(stderr, "Reference : %e vs. %e (Delta %e)\n", reference_dot, _dot, reference_dot - _dot);

#ifdef STARPU_USE_CUDA
	if (cublas_version >= 7050)
		starpu_cublas_shutdown();
#endif
#ifdef STARPU_USE_OPENCL
	ret = starpu_opencl_unload_opencl(&_opencl_program);
	STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_unload_opencl");
#endif

	starpu_shutdown();

	free(_x);
	free(_y);
	free(_x_handles);
	free(_y_handles);

	/* Success when the StarPU result matches the reference within a
	 * relative tolerance of 1e-6. */
	if (fabs(reference_dot - _dot) < reference_dot * 1e-6)
		return EXIT_SUCCESS;
	else
		return EXIT_FAILURE;

enodev:
	fprintf(stderr, "WARNING: No one can execute this task\n");
	/* yes, we do not perform the computation but we did detect that no one
	 * could perform the kernel, so this is not an error from StarPU */
	return 77;
}
/* FFT benchmark driver: run the same forward transform (1D with one size
 * argument, 2D with two) through StarPU-FFT and, when compiled in, through
 * plain FFTW and through cuFFT.  Each backend is timed, and the numerical
 * results are compared against the FFTW output; the program exits with
 * EXIT_FAILURE when the relative error exceeds a per-precision threshold. */
int main(int argc, char *argv[])
{
	int i;
	struct timeval begin, end;
	int size;
	size_t bytes;
	int n = 0, m = 0;
	STARPUFFT(plan) plan;
#ifdef STARPU_HAVE_FFTW
	_FFTW(plan) fftw_plan;
#endif
#ifdef STARPU_USE_CUDA
	cufftHandle cuda_plan;
	cudaError_t cures;
#endif
	double timing;

	if (argc < 2 || argc > 3)
	{
		fprintf(stderr,"need one or two size of vector\n");
		exit(EXIT_FAILURE);
	}

	starpu_init(NULL);

	if (argc == 2)
	{
		n = atoi(argv[1]);
		/* 1D */
		size = n;
	}
	else if (argc == 3)
	{
		n = atoi(argv[1]);
		m = atoi(argv[2]);
		/* 2D */
		size = n * m;
	}
	else
	{
		assert(0);
	}

	bytes = size * sizeof(STARPUFFT(complex));

	/* Input signal: random complex values from a fixed seed. */
	STARPUFFT(complex) *in = STARPUFFT(malloc)(size * sizeof(*in));
	starpu_srand48(0);
	for (i = 0; i < size; i++)
		in[i] = starpu_drand48() + I * starpu_drand48();

	STARPUFFT(complex) *out = STARPUFFT(malloc)(size * sizeof(*out));

#ifdef STARPU_HAVE_FFTW
	STARPUFFT(complex) *out_fftw = STARPUFFT(malloc)(size * sizeof(*out_fftw));
#endif

#ifdef STARPU_USE_CUDA
	STARPUFFT(complex) *out_cuda = malloc(size * sizeof(*out_cuda));
#endif

	/* Build one plan per backend for the requested transform. */
	if (argc == 2)
	{
		plan = STARPUFFT(plan_dft_1d)(n, SIGN, 0);
#ifdef STARPU_HAVE_FFTW
		fftw_plan = _FFTW(plan_dft_1d)(n, in, out_fftw, SIGN, FFTW_ESTIMATE);
#endif
#ifdef STARPU_USE_CUDA
		if (cufftPlan1d(&cuda_plan, n, _CUFFT_C2C, 1) != CUFFT_SUCCESS)
			printf("erf\n");
#endif
	}
	else if (argc == 3)
	{
		plan = STARPUFFT(plan_dft_2d)(n, m, SIGN, 0);
#ifdef STARPU_HAVE_FFTW
		fftw_plan = _FFTW(plan_dft_2d)(n, m, in, out_fftw, SIGN, FFTW_ESTIMATE);
#endif
#ifdef STARPU_USE_CUDA
		STARPU_ASSERT(cufftPlan2d(&cuda_plan, n, m, _CUFFT_C2C) == CUFFT_SUCCESS);
#endif
	}
	else
	{
		assert(0);
	}

#ifdef STARPU_HAVE_FFTW
	/* Reference run: plain FFTW, timed with gettimeofday (microseconds). */
	gettimeofday(&begin, NULL);
	_FFTW(execute)(fftw_plan);
	gettimeofday(&end, NULL);
	_FFTW(destroy_plan)(fftw_plan);
	timing = (double)((end.tv_sec - begin.tv_sec)*1000000 + (end.tv_usec - begin.tv_usec));
	printf("FFTW took %2.2f ms (%2.2f MB/s)\n\n", timing/1000, bytes/timing);
#endif

#ifdef STARPU_USE_CUDA
	/* cuFFT run. */
	gettimeofday(&begin, NULL);
	if (cufftExecC2C(cuda_plan, (cufftComplex*) in, (cufftComplex*) out_cuda, CUFFT_FORWARD) != CUFFT_SUCCESS)
		printf("erf2\n");
	if ((cures = cudaThreadSynchronize()) != cudaSuccess)
		STARPU_CUDA_REPORT_ERROR(cures);
	gettimeofday(&end, NULL);
	cufftDestroy(cuda_plan);
	timing = (double)((end.tv_sec - begin.tv_sec)*1000000 + (end.tv_usec - begin.tv_usec));
	printf("CUDA took %2.2f ms (%2.2f MB/s)\n\n", timing/1000, bytes/timing);
#endif

	/* StarPU-FFT run; showstats prints its own statistics. */
	STARPUFFT(execute)(plan, in, out);
	STARPUFFT(showstats)(stdout);
	STARPUFFT(destroy_plan)(plan);
	printf("\n");

#if 0
	for (i = 0; i < 16; i++)
		printf("(%f,%f) ", cimag(in[i]), creal(in[i]));
	printf("\n\n");
	for (i = 0; i < 16; i++)
		printf("(%f,%f) ", cimag(out[i]), creal(out[i]));
	printf("\n\n");
#ifdef STARPU_HAVE_FFTW
	for (i = 0; i < 16; i++)
		printf("(%f,%f) ", cimag(out_fftw[i]), creal(out_fftw[i]));
	printf("\n\n");
#endif
#endif

#ifdef STARPU_HAVE_FFTW
	/* Compare the StarPU output against the FFTW reference. */
	{
		double max = 0., tot = 0., norm = 0., normdiff = 0.;
		for (i = 0; i < size; i++)
		{
			double diff = cabs(out[i]-out_fftw[i]);
			double diff2 = diff * diff;
			/* NOTE(review): this local `size` shadows the outer int
			 * size for the rest of the loop body. */
			double size = cabs(out_fftw[i]);
			double size2 = size * size;
			if (diff > max)
				max = diff;
			tot += diff;
			normdiff += diff2;
			norm += size2;
		}
		fprintf(stderr, "\nmaximum difference %g\n", max);
		fprintf(stderr, "average difference %g\n", tot / size);
		fprintf(stderr, "difference norm %g\n", sqrt(normdiff));
		double relmaxdiff = max / sqrt(norm);
		fprintf(stderr, "relative maximum difference %g\n", relmaxdiff);
		double relavgdiff = (tot / size) / sqrt(norm);
		fprintf(stderr, "relative average difference %g\n", relavgdiff);
		/* Tolerance depends on the precision suffix: "f" single, "" double. */
		if (!strcmp(TYPE, "f") && (relmaxdiff > 1e-8 || relavgdiff > 1e-8))
			return EXIT_FAILURE;
		if (!strcmp(TYPE, "") && (relmaxdiff > 1e-16 || relavgdiff > 1e-16))
			return EXIT_FAILURE;
	}
#endif

#ifdef STARPU_USE_CUDA
	/* Compare the cuFFT output against the FFTW reference.
	 * NOTE(review): this block reads out_fftw, which only exists when
	 * STARPU_HAVE_FFTW is also defined — a CUDA-without-FFTW build would
	 * not compile; confirm the build system enforces that combination. */
	{
		double max = 0., tot = 0., norm = 0., normdiff = 0.;
		for (i = 0; i < size; i++)
		{
			double diff = cabs(out_cuda[i]-out_fftw[i]);
			double diff2 = diff * diff;
			double size = cabs(out_fftw[i]);
			double size2 = size * size;
			if (diff > max)
				max = diff;
			tot += diff;
			normdiff += diff2;
			norm += size2;
		}
		fprintf(stderr, "\nmaximum difference %g\n", max);
		fprintf(stderr, "average difference %g\n", tot / size);
		fprintf(stderr, "difference norm %g\n", sqrt(normdiff));
		double relmaxdiff = max / sqrt(norm);
		fprintf(stderr, "relative maximum difference %g\n", relmaxdiff);
		double relavgdiff = (tot / size) / sqrt(norm);
		fprintf(stderr, "relative average difference %g\n", relavgdiff);
		if (!strcmp(TYPE, "f") && (relmaxdiff > 1e-8 || relavgdiff > 1e-8))
			return EXIT_FAILURE;
		if (!strcmp(TYPE, "") && (relmaxdiff > 1e-16 || relavgdiff > 1e-16))
			return EXIT_FAILURE;
	}
#endif

	STARPUFFT(free)(in);
	STARPUFFT(free)(out);
#ifdef STARPU_HAVE_FFTW
	STARPUFFT(free)(out_fftw);
#endif
#ifdef STARPU_USE_CUDA
	free(out_cuda);
#endif

	starpu_shutdown();

	return EXIT_SUCCESS;
}
/* Allocate the A (zdim x ydim), B (xdim x zdim) and C (xdim x ydim)
 * matrices and initialize them: A and B either with a deterministic
 * pattern (norandom) or with pseudo-random values, C with zeroes.
 * Storage is column-major: entry (row j, column i) is at [j + i*ld].
 * With CUDA and `pin` set, pinned (page-locked) memory is requested to
 * speed up host<->device transfers; otherwise page-aligned or plain
 * malloc'd memory is used. */
static void init_problem_data(void)
{
	unsigned i,j;

#ifdef STARPU_USE_CUDA
	if (pin)
	{
		starpu_data_malloc_pinned_if_possible((void **)&A, zdim*ydim*sizeof(float));
		starpu_data_malloc_pinned_if_possible((void **)&B, xdim*zdim*sizeof(float));
		starpu_data_malloc_pinned_if_possible((void **)&C, xdim*ydim*sizeof(float));
	}
	else
#endif
	{
#ifdef STARPU_HAVE_POSIX_MEMALIGN
		/* 4096-byte alignment (page-aligned buffers).
		 * NOTE(review): return values are unchecked — an allocation
		 * failure would only surface at first use. */
		posix_memalign((void **)&A, 4096, zdim*ydim*sizeof(float));
		posix_memalign((void **)&B, 4096, xdim*zdim*sizeof(float));
		posix_memalign((void **)&C, 4096, xdim*ydim*sizeof(float));
#else
		A = malloc(zdim*ydim*sizeof(float));
		B = malloc(xdim*zdim*sizeof(float));
		C = malloc(xdim*ydim*sizeof(float));
#endif
	}

	/* fill the A and B matrices */
	if (norandom)
	{
		/* Deterministic pattern: A varies with the column index,
		 * B with the row index. */
		for (j=0; j < ydim; j++)
		{
			for (i=0; i < zdim; i++)
			{
				A[j+i*ydim] = (float)(i);
			}
		}

		for (j=0; j < zdim; j++)
		{
			for (i=0; i < xdim; i++)
			{
				B[j+i*zdim] = (float)(j);
			}
		}
	}
	else
	{
#ifdef NORANDOM
		/* NORANDOM builds must never reach this random path. */
		srand(2008);
		STARPU_ABORT();
#endif
		for (j=0; j < ydim; j++)
		{
			for (i=0; i < zdim; i++)
			{
				A[j+i*ydim] = (float)(starpu_drand48());
			}
		}

		for (j=0; j < zdim; j++)
		{
			for (i=0; i < xdim; i++)
			{
				B[j+i*zdim] = (float)(starpu_drand48());
			}
		}
	}

	/* Clear the result matrix. */
	for (j=0; j < ydim; j++)
	{
		for (i=0; i < xdim; i++)
		{
			C[j+i*ydim] = (float)(0);
		}
	}

	display_memory_consumption();
}
int main(int argc, char *argv[]) { int i; struct timeval begin, end; int size; size_t bytes; int n = 0, m = 0; _FFTW(plan) fftw_plan; double timing; char *num; int num_threads = 1; _FFTW(init_threads)(); num = getenv("NUM_THREADS"); if (num) num_threads = atoi(num); _FFTW(plan_with_nthreads)(num_threads); if (argc < 2 || argc > 3) { fprintf(stderr,"need one or two size of vector\n"); exit(EXIT_FAILURE); } if (argc == 2) { n = atoi(argv[1]); /* 1D */ size = n; } else if (argc == 3) { n = atoi(argv[1]); m = atoi(argv[2]); /* 2D */ size = n * m; } else { assert(0); } bytes = size * sizeof(_FFTW(complex)); _FFTW(complex) *in = _FFTW(malloc)(size * sizeof(*in)); starpu_srand48(0); for (i = 0; i < size; i++) in[i] = starpu_drand48() + I * starpu_drand48(); _FFTW(complex) *out_fftw = _FFTW(malloc)(size * sizeof(*out_fftw)); if (argc == 2) { fftw_plan = _FFTW(plan_dft_1d)(n, in, out_fftw, SIGN, FFTW_ESTIMATE); } else if (argc == 3) { fftw_plan = _FFTW(plan_dft_2d)(n, m, in, out_fftw, SIGN, FFTW_ESTIMATE); } else { assert(0); } gettimeofday(&begin, NULL); _FFTW(execute)(fftw_plan); gettimeofday(&end, NULL); _FFTW(destroy_plan)(fftw_plan); timing = (double)((end.tv_sec - begin.tv_sec)*1000000 + (end.tv_usec - begin.tv_usec)); printf("FFTW with %d threads took %2.2f ms (%2.2f MB/s)\n\n", num_threads, timing/1000, bytes/(timing*num_threads)); printf("\n"); return EXIT_SUCCESS; }
/* 5-point stencil over an X x Y matrix distributed across MPI nodes with
 * StarPU-MPI.  The matrix is first distributed according to my_distrib(),
 * iterated niter times, then migrated to the my_distrib2() distribution
 * and iterated niter more times, before the data is brought back to its
 * original owners and unregistered. */
int main(int argc, char **argv)
{
	int my_rank, size, x, y, loop;
	float mean=0;
	float matrix[X][Y];
	starpu_data_handle_t data_handles[X][Y];

	int ret = starpu_init(NULL);
	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
	starpu_mpi_init(&argc, &argv, 1);
	MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
	MPI_Comm_size(MPI_COMM_WORLD, &size);

	parse_args(argc, argv);

	/* Initial data values.
	 * NOTE(review): seeding with time(NULL) means every rank draws its
	 * own, different values; only the owning rank's value of each entry
	 * is actually used by the computation. */
	starpu_srand48((long int)time(NULL));
	for(x = 0; x < X; x++)
	{
		for (y = 0; y < Y; y++)
		{
			matrix[x][y] = (float)starpu_drand48();
			mean += matrix[x][y];
		}
	}
	mean /= (X*Y);

	if (display)
	{
		FPRINTF_MPI(stdout, "mean=%2.2f\n", mean);
		for(x = 0; x < X; x++)
		{
			fprintf(stdout, "[%d] ", my_rank);
			for (y = 0; y < Y; y++)
			{
				fprintf(stdout, "%2.2f ", matrix[x][y]);
			}
			fprintf(stdout, "\n");
		}
	}

	/* Initial distribution: register each entry with a local buffer on
	 * its owner, as an allocation-less handle on direct neighbours (they
	 * will receive the value over MPI), and not at all elsewhere. */
	for(x = 0; x < X; x++)
	{
		for (y = 0; y < Y; y++)
		{
			int mpi_rank = my_distrib(x, y, size);
			if (mpi_rank == my_rank)
			{
				//FPRINTF(stderr, "[%d] Owning data[%d][%d]\n", my_rank, x, y);
				starpu_variable_data_register(&data_handles[x][y], 0, (uintptr_t)&(matrix[x][y]), sizeof(float));
			}
			else if (my_rank == my_distrib(x+1, y, size) || my_rank == my_distrib(x-1, y, size)
				 || my_rank == my_distrib(x, y+1, size) || my_rank == my_distrib(x, y-1, size))
			{
				/* I don't own that index, but will need it for my computations */
				//FPRINTF(stderr, "[%d] Neighbour of data[%d][%d]\n", my_rank, x, y);
				starpu_variable_data_register(&data_handles[x][y], -1, (uintptr_t)NULL, sizeof(float));
			}
			else
			{
				/* I know it's useless to allocate anything for this */
				data_handles[x][y] = NULL;
			}
			if (data_handles[x][y])
			{
				/* The tag (y*X)+x uniquely identifies this entry
				 * across all nodes. */
				starpu_mpi_data_register(data_handles[x][y], (y*X)+x, mpi_rank);
			}
		}
	}

	/* First computation with initial distribution */
	for(loop=0 ; loop<niter; loop++)
	{
		for (x = 1; x < X-1; x++)
		{
			for (y = 1; y < Y-1; y++)
			{
				starpu_mpi_task_insert(MPI_COMM_WORLD, &stencil5_cl,
						       STARPU_RW, data_handles[x][y],
						       STARPU_R, data_handles[x-1][y],
						       STARPU_R, data_handles[x+1][y],
						       STARPU_R, data_handles[x][y-1],
						       STARPU_R, data_handles[x][y+1],
						       0);
			}
		}
	}
	FPRINTF(stderr, "Waiting ...\n");
	starpu_task_wait_for_all();

	/* Now migrate data to a new distribution */
	/* First register newly needed data */
	for(x = 0; x < X; x++)
	{
		for (y = 0; y < Y; y++)
		{
			int mpi_rank = my_distrib2(x, y, size);
			if (!data_handles[x][y] && (mpi_rank == my_rank
						    || my_rank == my_distrib2(x+1, y, size) || my_rank == my_distrib2(x-1, y, size)
						    || my_rank == my_distrib2(x, y+1, size) || my_rank == my_distrib2(x, y-1, size)))
			{
				/* Register newly-needed data */
				starpu_variable_data_register(&data_handles[x][y], -1, (uintptr_t)NULL, sizeof(float));
				starpu_mpi_data_register(data_handles[x][y], (y*X)+x, mpi_rank);
			}
			if (data_handles[x][y] && mpi_rank != starpu_mpi_data_get_rank(data_handles[x][y]))
			{
				/* Migrate the data */
				starpu_mpi_get_data_on_node_detached(MPI_COMM_WORLD, data_handles[x][y], mpi_rank, NULL, NULL);
				/* And register new rank of the matrix */
				starpu_mpi_data_set_rank(data_handles[x][y], mpi_rank);
			}
		}
	}

	/* Second computation with new distribution */
	for(loop=0 ; loop<niter; loop++)
	{
		for (x = 1; x < X-1; x++)
		{
			for (y = 1; y < Y-1; y++)
			{
				starpu_mpi_task_insert(MPI_COMM_WORLD, &stencil5_cl,
						       STARPU_RW, data_handles[x][y],
						       STARPU_R, data_handles[x-1][y],
						       STARPU_R, data_handles[x+1][y],
						       STARPU_R, data_handles[x][y-1],
						       STARPU_R, data_handles[x][y+1],
						       0);
			}
		}
	}
	FPRINTF(stderr, "Waiting ...\n");
	starpu_task_wait_for_all();

	/* Unregister data */
	for(x = 0; x < X; x++)
	{
		for (y = 0; y < Y; y++)
		{
			if (data_handles[x][y])
			{
				int mpi_rank = my_distrib(x, y, size);
				/* Get back data to original place where the user-provided buffer is. */
				starpu_mpi_get_data_on_node_detached(MPI_COMM_WORLD, data_handles[x][y], mpi_rank, NULL, NULL);
				/* Register original rank of the matrix (although useless) */
				starpu_mpi_data_set_rank(data_handles[x][y], mpi_rank);
				/* And unregister it */
				starpu_data_unregister(data_handles[x][y]);
			}
		}
	}

	starpu_mpi_shutdown();
	starpu_shutdown();

	if (display)
	{
		FPRINTF(stdout, "[%d] mean=%2.2f\n", my_rank, mean);
		for(x = 0; x < X; x++)
		{
			FPRINTF(stdout, "[%d] ", my_rank);
			for (y = 0; y < Y; y++)
			{
				FPRINTF(stdout, "%2.2f ", matrix[x][y]);
			}
			FPRINTF(stdout, "\n");
		}
	}

	return 0;
}
/* Distributed LU factorization driver (StarPU + MPI): initialize MPI and
 * StarPU, build the per-rank matrix blocks, run the factorization through
 * STARPU_PLU(plu_main), reduce the per-rank timings onto rank 0 for the
 * performance report, and optionally check || A - LU ||. */
int main(int argc, char **argv)
{
	int rank;
	int world_size;

	/*
	 *	Initialization
	 */
	int thread_support;
	if (MPI_Init_thread(&argc, &argv, MPI_THREAD_SERIALIZED, &thread_support) != MPI_SUCCESS)
	{
		fprintf(stderr,"MPI_Init_thread failed\n");
		exit(1);
	}
	if (thread_support == MPI_THREAD_FUNNELED)
		fprintf(stderr,"Warning: MPI only has funneled thread support, not serialized, hoping this will work\n");
	if (thread_support < MPI_THREAD_FUNNELED)
		fprintf(stderr,"Warning: MPI does not have thread support!\n");
	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
	MPI_Comm_size(MPI_COMM_WORLD, &world_size);

	starpu_srand48((long int)time(NULL));

	parse_args(rank, argc, argv);

	int ret = starpu_init(NULL);
	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");

	/* We disable sequential consistency in this example */
	starpu_data_set_default_sequential_consistency_flag(0);

	starpu_mpi_init(NULL, NULL, 0);

	/* The p x q process grid must exactly cover the MPI world. */
	STARPU_ASSERT(p*q == world_size);

	starpu_cublas_init();

	int barrier_ret = MPI_Barrier(MPI_COMM_WORLD);
	STARPU_ASSERT(barrier_ret == MPI_SUCCESS);

	/*
	 *	Problem Init
	 */
	init_matrix(rank);

	fprintf(stderr, "Rank %d: allocated (%d + %d) MB = %d MB\n", rank,
		(int)(allocated_memory/(1024*1024)),
		(int)(allocated_memory_extra/(1024*1024)),
		(int)((allocated_memory+allocated_memory_extra)/(1024*1024)));

	display_grid(rank, nblocks);

	TYPE *a_r = NULL;
//	STARPU_PLU(display_data_content)(a_r, size);

	TYPE *x, *y;

	if (check)
	{
		/* Keep a right-hand side vector and a result vector around
		 * for the post-factorization correctness check.
		 * NOTE(review): x, y and a_r are never freed before exit. */
		x = calloc(size, sizeof(TYPE));
		STARPU_ASSERT(x);

		y = calloc(size, sizeof(TYPE));
		STARPU_ASSERT(y);

		if (rank == 0)
		{
			unsigned ind;
			for (ind = 0; ind < size; ind++)
				x[ind] = (TYPE)starpu_drand48();
		}

		a_r = STARPU_PLU(reconstruct_matrix)(size, nblocks);

		if (rank == 0)
			STARPU_PLU(display_data_content)(a_r, size);

//		STARPU_PLU(compute_ax)(size, x, y, nblocks, rank);
	}

	barrier_ret = MPI_Barrier(MPI_COMM_WORLD);
	STARPU_ASSERT(barrier_ret == MPI_SUCCESS);

	double timing = STARPU_PLU(plu_main)(nblocks, rank, world_size);

	/*
	 *	Report performance
	 */
	int reduce_ret;
	double min_timing = timing;
	double max_timing = timing;
	double sum_timing = timing;

	reduce_ret = MPI_Reduce(&timing, &min_timing, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);
	STARPU_ASSERT(reduce_ret == MPI_SUCCESS);

	reduce_ret = MPI_Reduce(&timing, &max_timing, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
	STARPU_ASSERT(reduce_ret == MPI_SUCCESS);

	reduce_ret = MPI_Reduce(&timing, &sum_timing, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
	STARPU_ASSERT(reduce_ret == MPI_SUCCESS);

	if (rank == 0)
	{
		fprintf(stderr, "Computation took: %f ms\n", max_timing/1000);
		fprintf(stderr, "\tMIN : %f ms\n", min_timing/1000);
		fprintf(stderr, "\tMAX : %f ms\n", max_timing/1000);
		fprintf(stderr, "\tAVG : %f ms\n", sum_timing/(world_size*1000));

		/* LU flop count: (2/3) n^3. */
		unsigned n = size;
		double flop = (2.0f*n*n*n)/3.0f;
		fprintf(stderr, "Synthetic GFlops : %2.2f\n", (flop/max_timing/1000.0f));
	}

	/*
	 *	Test Result Correctness
	 */
	if (check)
	{
		/*
		 *	Compute || A - LU ||
		 */
		STARPU_PLU(compute_lu_matrix)(size, nblocks, a_r);

#if 0
		/*
		 *	Compute || Ax - LUx ||
		 */
		unsigned ind;

		y2 = calloc(size, sizeof(TYPE));
		STARPU_ASSERT(y);

		if (rank == 0)
		{
			for (ind = 0; ind < size; ind++)
			{
				y2[ind] = (TYPE)0.0;
			}
		}

		STARPU_PLU(compute_lux)(size, x, y2, nblocks, rank);

		/* Compute y2 = y2 - y */
		CPU_AXPY(size, -1.0, y, 1, y2, 1);

		TYPE err = CPU_ASUM(size, y2, 1);
		int max = CPU_IAMAX(size, y2, 1);

		fprintf(stderr, "(A - LU)X Avg error : %e\n", err/(size*size));
		fprintf(stderr, "(A - LU)X Max error : %e\n", y2[max]);
#endif
	}

	/*
	 *	Termination
	 */
	barrier_ret = MPI_Barrier(MPI_COMM_WORLD);
	STARPU_ASSERT(barrier_ret == MPI_SUCCESS);

	starpu_cublas_shutdown();
	starpu_mpi_shutdown();
	starpu_shutdown();
#if 0
	MPI_Finalize();
#endif

	return 0;
}
/* Compute the minimum and maximum of a random vector of nelems TYPE
 * values with StarPU, using one task per block of the input and a
 * reduction on a 2-element variable handle where _minmax[0] holds the
 * current minimum and _minmax[1] the current maximum.  Returns 0 on
 * success, 77 (skip) when the environment cannot run the test. */
int main(int argc, char **argv)
{
	unsigned long i;
	int ret;

	/* Not supported yet */
	if (starpu_get_env_number_default("STARPU_GLOBAL_ARBITER", 0) > 0)
		return 77;

	ret = starpu_init(NULL);
	if (ret == -ENODEV)
		return 77;
	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");

	unsigned long nelems = _nblocks*_entries_per_bock;
	size_t size = nelems*sizeof(TYPE);

	_x = (TYPE *) malloc(size);
	_x_handles = (starpu_data_handle_t *) calloc(_nblocks, sizeof(starpu_data_handle_t));

	assert(_x && _x_handles);

	/* Initialize the vector with random values */
	starpu_srand48(0);
	for (i = 0; i < nelems; i++)
		_x[i] = (TYPE)starpu_drand48();

	/* Register one StarPU vector handle per block of the input. */
	unsigned block;
	for (block = 0; block < _nblocks; block++)
	{
		uintptr_t block_start = (uintptr_t)&_x[_entries_per_bock*block];
		starpu_vector_data_register(&_x_handles[block], STARPU_MAIN_RAM, block_start,
					    _entries_per_bock, sizeof(TYPE));
	}

	/* Initialize current min */
	_minmax[0] = TYPE_MAX;

	/* Initialize current max */
	_minmax[1] = TYPE_MIN;

	starpu_variable_data_register(&_minmax_handle, STARPU_MAIN_RAM, (uintptr_t)_minmax, 2*sizeof(TYPE));

	/* Set the methods to define neutral elements and to perform the reduction operation */
	starpu_data_set_reduction_methods(_minmax_handle, &minmax_redux_codelet, &minmax_init_codelet);

	for (block = 0; block < _nblocks; block++)
	{
		struct starpu_task *task = starpu_task_create();
		task->cl = &minmax_codelet;
		task->handles[0] = _x_handles[block];
		task->handles[1] = _minmax_handle;

		ret = starpu_task_submit(task);
		if (ret)
		{
			/* NOTE(review): this early return skips handle
			 * unregistration, the frees and starpu_shutdown();
			 * acceptable for a skipped test, but not clean. */
			STARPU_ASSERT(ret == -ENODEV);
			FPRINTF(stderr, "This test can only run on CPUs, but there are no CPU workers (this is not a bug).\n");
			return 77;
		}
	}

	/* Unregistering waits for the tasks and synchronizes the reduced
	 * result back into the _minmax buffer. */
	for (block = 0; block < _nblocks; block++)
	{
		starpu_data_unregister(_x_handles[block]);
	}
	starpu_data_unregister(_minmax_handle);

	FPRINTF(stderr, "Min : %e\n", _minmax[0]);
	FPRINTF(stderr, "Max : %e\n", _minmax[1]);

	STARPU_ASSERT(_minmax[0] <= _minmax[1]);

	free(_x);
	free(_x_handles);

	starpu_shutdown();

	return 0;
}