static void init_problem_data(void) { unsigned i,j; #ifndef STARPU_SIMGRID starpu_malloc((void **)&A, zdim*ydim*sizeof(TYPE)); starpu_malloc((void **)&B, xdim*zdim*sizeof(TYPE)); starpu_malloc((void **)&C, xdim*ydim*sizeof(TYPE)); /* fill the A and B matrices */ for (j=0; j < ydim; j++) { for (i=0; i < zdim; i++) { A[j+i*ydim] = (TYPE)(starpu_drand48()); } } for (j=0; j < zdim; j++) { for (i=0; i < xdim; i++) { B[j+i*zdim] = (TYPE)(starpu_drand48()); } } for (j=0; j < ydim; j++) { for (i=0; i < xdim; i++) { C[j+i*ydim] = (TYPE)(0); } } #endif }
void matrix_init(float ****bmat, int rank, int nodes, int alloc_everywhere) { unsigned i,j,x,y; *bmat = malloc(nblocks * sizeof(float **)); for(x=0 ; x<nblocks ; x++) { (*bmat)[x] = malloc(nblocks * sizeof(float *)); for(y=0 ; y<nblocks ; y++) { int mpi_rank = my_distrib(x, y, nodes); if (alloc_everywhere || (mpi_rank == rank)) { starpu_malloc((void **)&(*bmat)[x][y], BLOCKSIZE*BLOCKSIZE*sizeof(float)); for (i = 0; i < BLOCKSIZE; i++) { for (j = 0; j < BLOCKSIZE; j++) { (*bmat)[x][y][j +i*BLOCKSIZE] = (1.0f/(1.0f+(i+(x*BLOCKSIZE)+j+(y*BLOCKSIZE)))) + ((i+(x*BLOCKSIZE) == j+(y*BLOCKSIZE))?1.0f*size:0.0f); //mat[j +i*size] = ((i == j)?1.0f*size:0.0f); } } } } } }
int main(int argc, char **argv) { int ret; ret = starpu_initialize(NULL, &argc, &argv); if (ret == -ENODEV) return STARPU_TEST_SKIPPED; STARPU_CHECK_RETURN_VALUE(ret, "starpu_init"); starpu_malloc((void**)&data, sizeof(*data)); *data = 42; /* register a piece of data */ starpu_vector_data_register(&handle, STARPU_MAIN_RAM, (uintptr_t)data, 1, sizeof(unsigned)); struct starpu_task *task = starpu_task_create(); task->cl = &wrong_codelet; task->handles[0] = handle; task->use_tag = 1; task->tag_id = TAG; task->callback_func = wrong_callback; task->detach = 0; ret = starpu_task_submit(task); if (ret == -ENODEV) goto enodev; STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit"); ret = starpu_tag_wait(TAG); STARPU_CHECK_RETURN_VALUE(ret, "starpu_tag_wait"); /* This call is valid as it is done by the application outside a * callback */ ret = starpu_data_acquire(handle, STARPU_RW); STARPU_CHECK_RETURN_VALUE(ret, "starpu_data_acquire"); starpu_data_release(handle); ret = starpu_task_wait(task); STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_wait"); starpu_data_unregister(handle); starpu_free(data); starpu_shutdown(); return EXIT_SUCCESS; enodev: fprintf(stderr, "WARNING: No one can execute this task\n"); /* yes, we do not perform the computation but we did detect that no one * could perform the kernel, so this is not an error from StarPU */ starpu_shutdown(); return STARPU_TEST_SKIPPED; }
static int initialize_system(float **A, unsigned dim, unsigned pinned) { int ret; #ifdef STARPU_HAVE_MAGMA magma_init(); #endif ret = starpu_init(NULL); if (ret == -ENODEV) return 77; STARPU_CHECK_RETURN_VALUE(ret, "starpu_init"); #ifdef STARPU_USE_CUDA initialize_chol_model(&chol_model_11,"chol_model_11",cpu_chol_task_11_cost,cuda_chol_task_11_cost); initialize_chol_model(&chol_model_21,"chol_model_21",cpu_chol_task_21_cost,cuda_chol_task_21_cost); initialize_chol_model(&chol_model_22,"chol_model_22",cpu_chol_task_22_cost,cuda_chol_task_22_cost); #else initialize_chol_model(&chol_model_11,"chol_model_11",cpu_chol_task_11_cost,NULL); initialize_chol_model(&chol_model_21,"chol_model_21",cpu_chol_task_21_cost,NULL); initialize_chol_model(&chol_model_22,"chol_model_22",cpu_chol_task_22_cost,NULL); #endif starpu_cublas_init(); #ifndef STARPU_SIMGRID if (pinned) { starpu_malloc((void **)A, (size_t)dim*dim*sizeof(float)); } else { *A = malloc(dim*dim*sizeof(float)); } #endif return 0; }
int main(int argc, char **argv) { int i, j, ret; ret = starpu_initialize(NULL, &argc, &argv); if (ret == -ENODEV) return STARPU_TEST_SKIPPED; STARPU_CHECK_RETURN_VALUE(ret, "starpu_initialize"); float *data; starpu_malloc((void**)&data, sizeof(*data) * NB_BUNDLE); float factors[NB_BUNDLE]; starpu_data_handle_t handles[NB_BUNDLE]; struct starpu_task *task[NB_ITERATION]; starpu_task_bundle_t bundles[NB_BUNDLE]; for (i = 0; i < NB_BUNDLE; i++) { data[i] = i + 1; factors[i] = NB_BUNDLE - i; } for (i = 0; i < NB_BUNDLE; i++) starpu_variable_data_register(&handles[i], STARPU_MAIN_RAM, (uintptr_t)&data[i], sizeof(float)); FPRINTF(stderr, "VALUES:"); for (i = 0; i < NB_BUNDLE; i++) FPRINTF(stderr, " %f (%f)", data[i], factors[i]); FPRINTF(stderr, "\n"); for (i = 0; i < NB_BUNDLE; i++) { starpu_task_bundle_create(&bundles[i]); for (j = 0; j < NB_ITERATION; j++) { task[j] = starpu_task_create(); task[j]->cl = &codelet; task[j]->cl_arg = &factors[i]; task[j]->cl_arg_size = sizeof(float); task[j]->handles[0] = handles[i]; ret = starpu_task_bundle_insert(bundles[i], task[j]); STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert"); } ret = starpu_task_bundle_remove(bundles[i], task[NB_ITERATION / 2]); STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_remove"); for (j = 0; j < NB_ITERATION; j++) { ret = starpu_task_submit(task[j]); if (ret == -ENODEV) goto enodev; STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit"); } starpu_task_bundle_close(bundles[i]); } ret = starpu_task_wait_for_all(); STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_wait_for_all"); for(i = 0; i < NB_BUNDLE ; i++) { ret = starpu_data_acquire(handles[i], STARPU_R); STARPU_CHECK_RETURN_VALUE(ret, "starpu_data_acquire"); } FPRINTF(stderr, "VALUES:"); for (i = 0; i < NB_BUNDLE; i++) FPRINTF(stderr, " %f (%f)", data[i], factors[i]); FPRINTF(stderr, "\n"); for(i = 0; i < NB_BUNDLE ; i++) { starpu_data_release(handles[i]); starpu_data_unregister(handles[i]); } starpu_free(data); starpu_shutdown(); return EXIT_SUCCESS; enodev: starpu_shutdown(); fprintf(stderr, "WARNING: No one can execute this task\n"); /* yes, we do not perform the computation but we did detect that no one * could perform the kernel, so this is not an error from StarPU */ return STARPU_TEST_SKIPPED; }
int main(int argc, char **argv) { int ret; #ifdef STARPU_QUICK_CHECK nbuffers /= 4; niter /= 4; vectorsize /= 8; #endif ret = starpu_initialize(NULL, &argc, &argv); if (ret == -ENODEV) return STARPU_TEST_SKIPPED; STARPU_CHECK_RETURN_VALUE(ret, "starpu_init"); /* Allocate all buffers and register them to StarPU */ int b; for (b = 0; b < nbuffers; b++) { ret = starpu_malloc((void **)&buffer[b], vectorsize); STARPU_CHECK_RETURN_VALUE(ret, "starpu_malloc"); starpu_vector_data_register(&v_handle[b], STARPU_MAIN_RAM, (uintptr_t)buffer[b], vectorsize, sizeof(char)); } int iter; for (iter = 0; iter < niter; iter++) { /* Use the buffers on the different workers so that it may not * be in main memory anymore */ for (b = 0; b < nbuffers; b++) { ret = use_handle(v_handle[b]); if (ret == -ENODEV) goto enodev; STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit"); } ret = starpu_task_wait_for_all(); STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_wait_for_all"); /* Grab the different pieces of data into main memory */ for (b = 0; b < nbuffers; b++) { ret = starpu_data_acquire(v_handle[b], STARPU_RW); STARPU_CHECK_RETURN_VALUE(ret, "starpu_data_acquire"); } /* Release them */ for (b = 0; b < nbuffers; b++) starpu_data_release(v_handle[b]); } /* do some cleanup */ for (b = 0; b < nbuffers; b++) { starpu_data_unregister(v_handle[b]); starpu_free(buffer[b]); } starpu_shutdown(); return EXIT_SUCCESS; enodev: fprintf(stderr, "WARNING: No one can execute this task\n"); /* yes, we do not perform the computation but we did detect that no one * could perform the kernel, so this is not an error from StarPU */ starpu_shutdown(); return STARPU_TEST_SKIPPED; }
int main(int argc, char **argv) { int ret, exit_value = 0; /* Initialize StarPU */ ret = starpu_init(NULL); if (ret == -ENODEV) return 77; STARPU_CHECK_RETURN_VALUE(ret, "starpu_init"); #ifdef STARPU_USE_OPENCL ret = starpu_opencl_load_opencl_from_file("examples/axpy/axpy_opencl_kernel.cl", &opencl_program, NULL); STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_load_opencl_from_file"); #endif starpu_cublas_init(); /* This is equivalent to vec_a = malloc(N*sizeof(TYPE)); vec_b = malloc(N*sizeof(TYPE)); */ starpu_malloc((void **)&_vec_x, N*sizeof(TYPE)); assert(_vec_x); starpu_malloc((void **)&_vec_y, N*sizeof(TYPE)); assert(_vec_y); unsigned i; for (i = 0; i < N; i++) { _vec_x[i] = 1.0f; /*(TYPE)starpu_drand48(); */ _vec_y[i] = 4.0f; /*(TYPE)starpu_drand48(); */ } FPRINTF(stderr, "BEFORE x[0] = %2.2f\n", _vec_x[0]); FPRINTF(stderr, "BEFORE y[0] = %2.2f\n", _vec_y[0]); /* Declare the data to StarPU */ starpu_vector_data_register(&_handle_x, STARPU_MAIN_RAM, (uintptr_t)_vec_x, N, sizeof(TYPE)); starpu_vector_data_register(&_handle_y, STARPU_MAIN_RAM, (uintptr_t)_vec_y, N, sizeof(TYPE)); /* Divide the vector into blocks */ struct starpu_data_filter block_filter = { .filter_func = starpu_vector_filter_block, .nchildren = NBLOCKS }; starpu_data_partition(_handle_x, &block_filter); starpu_data_partition(_handle_y, &block_filter); double start; double end; start = starpu_timing_now(); unsigned b; for (b = 0; b < NBLOCKS; b++) { struct starpu_task *task = starpu_task_create(); task->cl = &axpy_cl; task->cl_arg = &_alpha; task->cl_arg_size = sizeof(_alpha); task->handles[0] = starpu_data_get_sub_data(_handle_x, 1, b); task->handles[1] = starpu_data_get_sub_data(_handle_y, 1, b); task->tag_id = b; ret = starpu_task_submit(task); if (ret == -ENODEV) { exit_value = 77; goto enodev; } STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit"); } starpu_task_wait_for_all(); enodev: starpu_data_unpartition(_handle_x, STARPU_MAIN_RAM); starpu_data_unpartition(_handle_y, STARPU_MAIN_RAM); starpu_data_unregister(_handle_x); starpu_data_unregister(_handle_y); end = starpu_timing_now(); double timing = end - start; FPRINTF(stderr, "timing -> %2.2f us %2.2f MB/s\n", timing, 3*N*sizeof(TYPE)/timing); FPRINTF(stderr, "AFTER y[0] = %2.2f (ALPHA = %2.2f)\n", _vec_y[0], _alpha); if (exit_value != 77) exit_value = check(); starpu_free((void *)_vec_x); starpu_free((void *)_vec_y); #ifdef STARPU_USE_OPENCL ret = starpu_opencl_unload_opencl(&opencl_program); STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_unload_opencl"); #endif /* Stop StarPU */ starpu_shutdown(); return exit_value; }
static void init_matrix(int rank) { #ifdef STARPU_HAVE_LIBNUMA if (numa) { fprintf(stderr, "Using INTERLEAVE policy\n"); unsigned long nodemask = ((1<<0)|(1<<1)); int ret = set_mempolicy(MPOL_INTERLEAVE, &nodemask, 3); if (ret) perror("set_mempolicy failed"); } #endif /* Allocate a grid of data handles, not all of them have to be allocated later on */ dataA_handles = calloc(nblocks*nblocks, sizeof(starpu_data_handle_t)); dataA = calloc(nblocks*nblocks, sizeof(TYPE *)); allocated_memory_extra += nblocks*nblocks*(sizeof(starpu_data_handle_t) + sizeof(TYPE *)); size_t blocksize = (size_t)(size/nblocks)*(size/nblocks)*sizeof(TYPE); /* Allocate all the blocks that belong to this mpi node */ unsigned long i,j; for (j = 0; j < nblocks; j++) { for (i = 0; i < nblocks; i++) { TYPE **blockptr = &dataA[j+i*nblocks]; // starpu_data_handle_t *handleptr = &dataA_handles[j+nblocks*i]; starpu_data_handle_t *handleptr = &dataA_handles[j+nblocks*i]; if (get_block_rank(i, j) == rank) { /* This blocks should be treated by the current MPI process */ /* Allocate and fill it */ starpu_malloc((void **)blockptr, blocksize); allocated_memory += blocksize; //fprintf(stderr, "Rank %d : fill block (i = %d, j = %d)\n", rank, i, j); fill_block_with_random(*blockptr, size, nblocks); //fprintf(stderr, "Rank %d : fill block (i = %d, j = %d)\n", rank, i, j); if (i == j) { unsigned tmp; for (tmp = 0; tmp < size/nblocks; tmp++) { (*blockptr)[tmp*((size/nblocks)+1)] += (TYPE)10*nblocks; } } /* Register it to StarPU */ starpu_matrix_data_register(handleptr, STARPU_MAIN_RAM, (uintptr_t)*blockptr, size/nblocks, size/nblocks, size/nblocks, sizeof(TYPE)); } else { *blockptr = STARPU_POISON_PTR; *handleptr = STARPU_POISON_PTR; } } } /* Allocate the temporary buffers required for the distributed algorithm */ unsigned k; /* tmp buffer 11 */ #ifdef SINGLE_TMP11 starpu_malloc((void **)&tmp_11_block, blocksize); allocated_memory_extra += blocksize; starpu_matrix_data_register(&tmp_11_block_handle, STARPU_MAIN_RAM, (uintptr_t)tmp_11_block, size/nblocks, size/nblocks, size/nblocks, sizeof(TYPE)); #else tmp_11_block_handles = calloc(nblocks, sizeof(starpu_data_handle_t)); tmp_11_block = calloc(nblocks, sizeof(TYPE *)); allocated_memory_extra += nblocks*(sizeof(starpu_data_handle_t) + sizeof(TYPE *)); for (k = 0; k < nblocks; k++) { if (tmp_11_block_is_needed(rank, nblocks, k)) { starpu_malloc((void **)&tmp_11_block[k], blocksize); allocated_memory_extra += blocksize; STARPU_ASSERT(tmp_11_block[k]); starpu_matrix_data_register(&tmp_11_block_handles[k], STARPU_MAIN_RAM, (uintptr_t)tmp_11_block[k], size/nblocks, size/nblocks, size/nblocks, sizeof(TYPE)); } } #endif /* tmp buffers 12 and 21 */ #ifdef SINGLE_TMP1221 tmp_12_block_handles = calloc(nblocks, sizeof(starpu_data_handle_t)); tmp_21_block_handles = calloc(nblocks, sizeof(starpu_data_handle_t)); tmp_12_block = calloc(nblocks, sizeof(TYPE *)); tmp_21_block = calloc(nblocks, sizeof(TYPE *)); allocated_memory_extra += 2*nblocks*(sizeof(starpu_data_handle_t) + sizeof(TYPE *)); #else for (i = 0; i < 2; i++) { tmp_12_block_handles[i] = calloc(nblocks, sizeof(starpu_data_handle_t)); tmp_21_block_handles[i] = calloc(nblocks, sizeof(starpu_data_handle_t)); tmp_12_block[i] = calloc(nblocks, sizeof(TYPE *)); tmp_21_block[i] = calloc(nblocks, sizeof(TYPE *)); allocated_memory_extra += 2*nblocks*(sizeof(starpu_data_handle_t) + sizeof(TYPE *)); } #endif for (k = 0; k < nblocks; k++) { #ifdef SINGLE_TMP1221 if (tmp_12_block_is_needed(rank, nblocks, k)) { starpu_malloc((void **)&tmp_12_block[k], blocksize); allocated_memory_extra += blocksize; STARPU_ASSERT(tmp_12_block[k]); starpu_matrix_data_register(&tmp_12_block_handles[k], STARPU_MAIN_RAM, (uintptr_t)tmp_12_block[k], size/nblocks, size/nblocks, size/nblocks, sizeof(TYPE)); } if (tmp_21_block_is_needed(rank, nblocks, k)) { starpu_malloc((void **)&tmp_21_block[k], blocksize); allocated_memory_extra += blocksize; STARPU_ASSERT(tmp_21_block[k]); starpu_matrix_data_register(&tmp_21_block_handles[k], STARPU_MAIN_RAM, (uintptr_t)tmp_21_block[k], size/nblocks, size/nblocks, size/nblocks, sizeof(TYPE)); } #else for (i = 0; i < 2; i++) { if (tmp_12_block_is_needed(rank, nblocks, k)) { starpu_malloc((void **)&tmp_12_block[i][k], blocksize); allocated_memory_extra += blocksize; STARPU_ASSERT(tmp_12_block[i][k]); starpu_matrix_data_register(&tmp_12_block_handles[i][k], STARPU_MAIN_RAM, (uintptr_t)tmp_12_block[i][k], size/nblocks, size/nblocks, size/nblocks, sizeof(TYPE)); } if (tmp_21_block_is_needed(rank, nblocks, k)) { starpu_malloc((void **)&tmp_21_block[i][k], blocksize); allocated_memory_extra += blocksize; STARPU_ASSERT(tmp_21_block[i][k]); starpu_matrix_data_register(&tmp_21_block_handles[i][k], STARPU_MAIN_RAM, (uintptr_t)tmp_21_block[i][k], size/nblocks, size/nblocks, size/nblocks, sizeof(TYPE)); } } #endif } //display_all_blocks(nblocks, size/nblocks); }
int main(int argc, char **argv) { int ret; unsigned part; double timing; double start, end; unsigned row, pos; unsigned ind; /* CSR matrix description */ float *nzval; uint32_t nnz; uint32_t *colind; uint32_t *rowptr; /* Input and Output vectors */ float *vector_in_ptr; float *vector_out_ptr; /* * Parse command-line arguments */ parse_args(argc, argv); /* * Launch StarPU */ ret = starpu_init(NULL); if (ret == -ENODEV) return 77; STARPU_CHECK_RETURN_VALUE(ret, "starpu_init"); /* * Create a 3-band sparse matrix as input example */ nnz = 3*size-2; starpu_malloc((void **)&nzval, nnz*sizeof(float)); starpu_malloc((void **)&colind, nnz*sizeof(uint32_t)); starpu_malloc((void **)&rowptr, (size+1)*sizeof(uint32_t)); assert(nzval && colind && rowptr); /* fill the matrix */ for (row = 0, pos = 0; row < size; row++) { rowptr[row] = pos; if (row > 0) { nzval[pos] = 1.0f; colind[pos] = row-1; pos++; } nzval[pos] = 5.0f; colind[pos] = row; pos++; if (row < size - 1) { nzval[pos] = 1.0f; colind[pos] = row+1; pos++; } } STARPU_ASSERT(pos == nnz); rowptr[size] = nnz; /* initiate the 2 vectors */ starpu_malloc((void **)&vector_in_ptr, size*sizeof(float)); starpu_malloc((void **)&vector_out_ptr, size*sizeof(float)); assert(vector_in_ptr && vector_out_ptr); /* fill them */ for (ind = 0; ind < size; ind++) { vector_in_ptr[ind] = 2.0f; vector_out_ptr[ind] = 0.0f; } /* * Register the CSR matrix and the 2 vectors */ starpu_csr_data_register(&sparse_matrix, STARPU_MAIN_RAM, nnz, size, (uintptr_t)nzval, colind, rowptr, 0, sizeof(float)); starpu_vector_data_register(&vector_in, STARPU_MAIN_RAM, (uintptr_t)vector_in_ptr, size, sizeof(float)); starpu_vector_data_register(&vector_out, STARPU_MAIN_RAM, (uintptr_t)vector_out_ptr, size, sizeof(float)); /* * Partition the CSR matrix and the output vector */ csr_f.nchildren = nblocks; vector_f.nchildren = nblocks; starpu_data_partition(sparse_matrix, &csr_f); starpu_data_partition(vector_out, &vector_f); /* * If we use OpenCL, we need to compile the SpMV kernel */ #ifdef STARPU_USE_OPENCL compile_spmv_opencl_kernel(); #endif start = starpu_timing_now(); /* * Create and submit StarPU tasks */ for (part = 0; part < nblocks; part++) { struct starpu_task *task = starpu_task_create(); task->cl = &spmv_cl; task->handles[0] = starpu_data_get_sub_data(sparse_matrix, 1, part); task->handles[1] = vector_in; task->handles[2] = starpu_data_get_sub_data(vector_out, 1, part); ret = starpu_task_submit(task); if (STARPU_UNLIKELY(ret == -ENODEV)) { FPRINTF(stderr, "No worker may execute this task\n"); exit(0); } } starpu_task_wait_for_all(); end = starpu_timing_now(); /* * Unregister the CSR matrix and the output vector */ starpu_data_unpartition(sparse_matrix, STARPU_MAIN_RAM); starpu_data_unpartition(vector_out, STARPU_MAIN_RAM); /* * Unregister data */ starpu_data_unregister(sparse_matrix); starpu_data_unregister(vector_in); starpu_data_unregister(vector_out); /* * Display the result */ for (row = 0; row < STARPU_MIN(size, 16); row++) { FPRINTF(stdout, "%2.2f\t%2.2f\n", vector_in_ptr[row], vector_out_ptr[row]); } starpu_free(nzval); starpu_free(colind); starpu_free(rowptr); starpu_free(vector_in_ptr); starpu_free(vector_out_ptr); /* * Stop StarPU */ starpu_shutdown(); timing = end - start; FPRINTF(stderr, "Computation took (in ms)\n"); FPRINTF(stdout, "%2.2f\n", timing/1000); return 0; }