/*
 * Allocate the global matrices A, B and C and set their initial contents.
 *
 * A receives zdim*ydim floats, B xdim*zdim floats and C xdim*ydim floats.
 * A and B are filled with values drawn from starpu_drand48(); C is cleared
 * to 0.  Elements are addressed as [j + i*ld], with ld = ydim for A and C and
 * ld = zdim for B.  The process is aborted if any allocation fails.
 */
static void initialize_matrix(void)
{
  unsigned i,j;

  /* Widen to size_t before multiplying so that large dimensions cannot wrap
   * around in unsigned arithmetic. */
  A=(float *) malloc((size_t)zdim*ydim*sizeof(float));
  B=(float *) malloc((size_t)xdim*zdim*sizeof(float));
  C=(float *) malloc((size_t)xdim*ydim*sizeof(float));
  if (!A || !B || !C)
    abort();

  /* The values below come from starpu_drand48(), so that is the generator
   * which has to be seeded; srand() only affects rand(). */
  starpu_srand48(2013);

  for (j=0;j<ydim;j++)
  {
    for (i=0;i<zdim;i++)
    {
      A[j+i*ydim] = (float)(starpu_drand48());
    }
  }

  for (j=0;j<zdim;j++)
  {
    for (i=0;i<xdim;i++)
    {
      B[j+i*zdim] = (float)(starpu_drand48());
    }
  }

  for (j=0;j<ydim;j++)
  {
    for (i=0;i<xdim;i++)
    {
      C[j+i*ydim]=(float)(0);
    }
  }
}
Ejemplo n.º 2
0
/*
 * Allocate the global matrices A, B and C with starpu_malloc() and set their
 * initial contents: A (zdim*ydim elements) and B (xdim*zdim elements) are
 * filled with values from starpu_drand48(), C (xdim*ydim elements) is zeroed.
 *
 * A failed allocation is reported through STARPU_CHECK_RETURN_VALUE().
 * When STARPU_SIMGRID is defined the function does nothing.
 */
static void init_problem_data(void)
{
#ifndef STARPU_SIMGRID
	unsigned i,j;
	int ret;

	/* Widen to size_t before multiplying so that large dimensions cannot
	 * wrap around in unsigned arithmetic. */
	ret = starpu_malloc((void **)&A, (size_t)zdim*ydim*sizeof(TYPE));
	STARPU_CHECK_RETURN_VALUE(ret, "starpu_malloc");
	ret = starpu_malloc((void **)&B, (size_t)xdim*zdim*sizeof(TYPE));
	STARPU_CHECK_RETURN_VALUE(ret, "starpu_malloc");
	ret = starpu_malloc((void **)&C, (size_t)xdim*ydim*sizeof(TYPE));
	STARPU_CHECK_RETURN_VALUE(ret, "starpu_malloc");

	/* fill the A and B matrices */
	for (j=0; j < ydim; j++)
	{
		for (i=0; i < zdim; i++)
		{
			A[j+i*ydim] = (TYPE)(starpu_drand48());
		}
	}

	for (j=0; j < zdim; j++)
	{
		for (i=0; i < xdim; i++)
		{
			B[j+i*zdim] = (TYPE)(starpu_drand48());
		}
	}

	/* C starts out as all zeroes */
	for (j=0; j < ydim; j++)
	{
		for (i=0; i < xdim; i++)
		{
			C[j+i*ydim] = (TYPE)(0);
		}
	}
#endif
}
Ejemplo n.º 3
0
/*
 * Hand `task` to one child of `component`, picked at random with a
 * probability proportional to compute_relative_speedup() among the children
 * that can execute the task.
 *
 * Returns -ENODEV when no child can execute the task, 1 when the chosen child
 * is a worker (only its can_pull() hook is invoked in that case), and
 * otherwise the value returned by the chosen child's push_task().
 */
static int random_push_task(struct starpu_sched_component * component, struct starpu_task * task)
{
	STARPU_ASSERT(component->nchildren > 0);

	/* indexes_components[0..size-1] memoizes the indexes of the children
	 * that can execute the task, as found during the first pass.
	 */
	int indexes_components[component->nchildren];
	int size=0;

	/* speedup[k] is the relative speedup of child indexes_components[k];
	 * only the first `size` entries are meaningful.
	 */
	double speedup[component->nchildren];

	double alpha_sum = 0.0;

	int i;
	for(i = 0; i < component->nchildren ; i++)
	{
		if(starpu_sched_component_can_execute_task(component->children[i],task))
		{
			speedup[size] = compute_relative_speedup(component->children[i]);
			alpha_sum += speedup[size];
			indexes_components[size] = i;
			size++;
		}
	}
	if(size == 0)
		return -ENODEV;

	/* Draw a point in [0, alpha_sum) and walk the cumulative speedups
	 * until the running sum reaches it.
	 */
	double random = starpu_drand48()*alpha_sum;
	double alpha = 0.0;
	struct starpu_sched_component * select  = NULL;

	for(i = 0; i < size ; i++)
	{
		if(alpha + speedup[i] >= random)
			break;
		alpha += speedup[i];
	}

	/* Floating-point rounding may leave the running sum marginally below
	 * the drawn value; fall back to the last eligible child instead of
	 * ending up with no selection at all.
	 */
	if(i == size)
		i = size - 1;
	select = component->children[indexes_components[i]];

	STARPU_ASSERT(select != NULL);
	if(starpu_sched_component_is_worker(select))
	{
		select->can_pull(select);
		return 1;
	}

	int ret_val = select->push_task(select,task);
	return ret_val;
}
Ejemplo n.º 4
0
/*
 * Initialise StarPU with the scheduling policy `policy`, submit NTASKS
 * detached tasks that randomly use either codelet A (minimum priority) or
 * codelet B (maximum priority), wait for all of them and shut StarPU down.
 *
 * Exits the process with STARPU_TEST_SKIPPED if StarPU cannot be initialised.
 * Returns 0 on success and -ENODEV if a task submission reports -ENODEV.
 */
static int
run(struct starpu_sched_policy *policy)
{
    struct starpu_conf conf;
    int ret;
    int submitted;

    starpu_conf_init(&conf);
    conf.sched_policy = policy;
    ret = starpu_init(&conf);
    if (ret != 0)
        exit(STARPU_TEST_SKIPPED);
    starpu_profiling_status_set(1);

    /* Two buffer-less CPU codelets, one per kernel. */
    struct starpu_codelet clA = { .cpu_funcs = {A}, .nbuffers = 0 };
    struct starpu_codelet clB = { .cpu_funcs = {B}, .nbuffers = 0 };

    starpu_srand48(0);

    submitted = 0;
    while (submitted < NTASKS)
    {
        struct starpu_task *task = starpu_task_create();
        /* Coin flip deciding which codelet/priority pair this task gets. */
        int use_a = ((int)(starpu_drand48()*2))%2;

        task->cl = use_a ? &clA : &clB;
        task->priority = use_a ? STARPU_MIN_PRIO : STARPU_MAX_PRIO;
        task->detach=1;

        ret = starpu_task_submit(task);
        if (ret == -ENODEV)
        {
            /* Nobody can run the task: shut down and report it. */
            starpu_shutdown();
            return -ENODEV;
        }
        STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");

        submitted++;
    }

    starpu_task_wait_for_all();
    FPRINTF(stdout,"\n");

    starpu_shutdown();
    return 0;
}
Ejemplo n.º 5
0
/*
 * Fill a square block of (psize/pnblocks) x (psize/pnblocks) elements,
 * stored contiguously at `blockptr`, with values from starpu_drand48().
 */
static void fill_block_with_random(TYPE *blockptr, unsigned psize, unsigned pnblocks)
{
	const unsigned side = psize / pnblocks;
	unsigned row = 0;

	while (row < side)
	{
		unsigned col;

		for (col = 0; col < side; col++)
			blockptr[col + row*side] = (TYPE)starpu_drand48();

		row++;
	}
}
/*
 * Dot-product example: fills two float vectors with values from
 * starpu_drand48(), computes a reference dot product sequentially, then
 * computes it again with one StarPU task per block accumulating into
 * _dot_handle, on which reduction methods are set.
 *
 * Returns 77 when the run has to be skipped (global arbiter requested, no
 * device at init, or a submission reporting -ENODEV), EXIT_SUCCESS when the
 * two results agree within a relative 1e-6, EXIT_FAILURE otherwise.
 */
int main(int argc, char **argv)
{
	int ret;

	/* Not supported yet */
	if (starpu_get_env_number_default("STARPU_GLOBAL_ARBITER", 0) > 0)
		return 77;

	ret = starpu_init(NULL);
	if (ret == -ENODEV)
		return 77;
	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");

#ifdef STARPU_USE_OPENCL
	ret = starpu_opencl_load_opencl_from_file("examples/reductions/dot_product_opencl_kernels.cl",
						  &_opencl_program, NULL);
	STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_load_opencl_from_file");
#endif

#ifdef STARPU_USE_CUDA
	/* cublasSdot has synchronization issues when using a non-blocking stream */
	cublasGetVersion(&cublas_version);
	if (cublas_version >= 7050)
		starpu_cublas_init();
#endif

	/* Widen before multiplying so the element count cannot wrap around in
	 * a narrower type. */
	unsigned long nelems = (unsigned long)_nblocks*_entries_per_block;
	size_t size = nelems*sizeof(float);

	_x = (float *) malloc(size);
	_y = (float *) malloc(size);

	_x_handles = (starpu_data_handle_t *) calloc(_nblocks, sizeof(starpu_data_handle_t));
	_y_handles = (starpu_data_handle_t *) calloc(_nblocks, sizeof(starpu_data_handle_t));

	/* The handle arrays are dereferenced below just like the vectors,
	 * so all four allocations have to be checked. */
	assert(_x && _y && _x_handles && _y_handles);

        starpu_srand48(0);

	DOT_TYPE reference_dot = 0.0;

	/* Fill both vectors and accumulate the sequential reference result. */
	unsigned long i;
	for (i = 0; i < nelems; i++)
	{
		_x[i] = (float)starpu_drand48();
		_y[i] = (float)starpu_drand48();

		reference_dot += (DOT_TYPE)_x[i]*(DOT_TYPE)_y[i];
	}

	/* Register one vector handle per block of each input vector. */
	unsigned block;
	for (block = 0; block < _nblocks; block++)
	{
		size_t offset = (size_t)_entries_per_block*block;

		starpu_vector_data_register(&_x_handles[block], STARPU_MAIN_RAM,
			(uintptr_t)&_x[offset], _entries_per_block, sizeof(float));
		starpu_vector_data_register(&_y_handles[block], STARPU_MAIN_RAM,
			(uintptr_t)&_y[offset], _entries_per_block, sizeof(float));
	}

	starpu_variable_data_register(&_dot_handle, STARPU_MAIN_RAM, (uintptr_t)&_dot, sizeof(DOT_TYPE));

	/*
	 *	Compute dot product with StarPU
	 */
	starpu_data_set_reduction_methods(_dot_handle, &redux_codelet, &init_codelet);

	for (block = 0; block < _nblocks; block++)
	{
		struct starpu_task *task = starpu_task_create();

		task->cl = &dot_codelet;
		task->destroy = 1;

		task->handles[0] = _x_handles[block];
		task->handles[1] = _y_handles[block];
		task->handles[2] = _dot_handle;

		ret = starpu_task_submit(task);
		if (ret == -ENODEV) goto enodev;
		STARPU_ASSERT(!ret);
	}

	for (block = 0; block < _nblocks; block++)
	{
		starpu_data_unregister(_x_handles[block]);
		starpu_data_unregister(_y_handles[block]);
	}
	starpu_data_unregister(_dot_handle);

	FPRINTF(stderr, "Reference : %e vs. %e (Delta %e)\n", reference_dot, _dot, reference_dot - _dot);

#ifdef STARPU_USE_CUDA
	if (cublas_version >= 7050)
		starpu_cublas_shutdown();
#endif

#ifdef STARPU_USE_OPENCL
        ret = starpu_opencl_unload_opencl(&_opencl_program);
        STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_unload_opencl");
#endif
	starpu_shutdown();

	free(_x);
	free(_y);
	free(_x_handles);
	free(_y_handles);

	/* Accept a relative error of 1e-6 with respect to the reference. */
	if (fabs(reference_dot - _dot) < reference_dot * 1e-6)
		return EXIT_SUCCESS;
	else
		return EXIT_FAILURE;

enodev:
	fprintf(stderr, "WARNING: No one can execute this task\n");
	/* yes, we do not perform the computation but we did detect that no one
 	 * could perform the kernel, so this is not an error from StarPU */
	return 77;
}
Ejemplo n.º 7
0
#ifdef STARPU_HAVE_FFTW
/*
 * Print on stderr how far `got` is from the reference `ref` (both hold `size`
 * elements): maximum, average and norm of the differences, plus the maximum
 * and average relative to the norm of `ref`.
 *
 * Returns 1 when a relative difference exceeds the threshold associated with
 * TYPE (1e-8 for "f", 1e-16 for ""), 0 otherwise.
 */
static int fft_report_difference(const STARPUFFT(complex) *got, const STARPUFFT(complex) *ref, int size)
{
	double max = 0., tot = 0., norm = 0., normdiff = 0.;
	int i;

	for (i = 0; i < size; i++) {
		double diff = cabs(got[i]-ref[i]);
		double mag = cabs(ref[i]);
		if (diff > max)
			max = diff;
		tot += diff;
		normdiff += diff * diff;
		norm += mag * mag;
	}
	fprintf(stderr, "\nmaximum difference %g\n", max);
	fprintf(stderr, "average difference %g\n", tot / size);
	fprintf(stderr, "difference norm %g\n", sqrt(normdiff));
	double relmaxdiff = max / sqrt(norm);
	fprintf(stderr, "relative maximum difference %g\n", relmaxdiff);
	double relavgdiff = (tot / size) / sqrt(norm);
	fprintf(stderr, "relative average difference %g\n", relavgdiff);
	if (!strcmp(TYPE, "f") && (relmaxdiff > 1e-8 || relavgdiff > 1e-8))
		return 1;
	if (!strcmp(TYPE, "") && (relmaxdiff > 1e-16 || relavgdiff > 1e-16))
		return 1;
	return 0;
}
#endif

/*
 * Run a 1D (one size argument) or 2D (two size arguments) transform with
 * StarPU-FFT and, when built with FFTW and/or CUDA support, time those
 * back-ends on the same input and compare the results against FFTW.
 *
 * Exits with EXIT_FAILURE on bad arguments or on an initialisation or
 * allocation failure.  Returns EXIT_FAILURE when a comparison exceeds its
 * threshold, EXIT_SUCCESS otherwise; buffers are released and StarPU is shut
 * down in both cases.
 */
int main(int argc, char *argv[]) {
	int i;
	struct timeval begin, end;
	int size;
	size_t bytes;
	int n = 0, m = 0;
	int status = EXIT_SUCCESS;
	STARPUFFT(plan) plan;
#ifdef STARPU_HAVE_FFTW
	_FFTW(plan) fftw_plan;
#endif
#ifdef STARPU_USE_CUDA
	cufftHandle cuda_plan;
	cudaError_t cures;
#endif
	double timing;

	if (argc < 2 || argc > 3) {
		fprintf(stderr,"need one or two size of vector\n");
		exit(EXIT_FAILURE);
	}

	n = atoi(argv[1]);
	if (argc == 3)
		m = atoi(argv[2]);

	/* atoi() yields 0 for non-numeric input; a zero or negative dimension
	 * cannot be planned or allocated, so reject it up front. */
	if (n <= 0 || (argc == 3 && m <= 0)) {
		fprintf(stderr,"sizes must be positive integers\n");
		exit(EXIT_FAILURE);
	}

	/* 1D with a single size argument, 2D with two */
	size = (argc == 2) ? n : n * m;

	if (starpu_init(NULL) != 0) {
		fprintf(stderr,"starpu_init failed\n");
		exit(EXIT_FAILURE);
	}

	bytes = size * sizeof(STARPUFFT(complex));

	STARPUFFT(complex) *in = STARPUFFT(malloc)(size * sizeof(*in));
	STARPUFFT(complex) *out = STARPUFFT(malloc)(size * sizeof(*out));
	if (!in || !out) {
		fprintf(stderr,"allocation failed\n");
		exit(EXIT_FAILURE);
	}

	starpu_srand48(0);
	for (i = 0; i < size; i++)
		in[i] = starpu_drand48() + I * starpu_drand48();

#ifdef STARPU_HAVE_FFTW
	STARPUFFT(complex) *out_fftw = STARPUFFT(malloc)(size * sizeof(*out_fftw));
	if (!out_fftw) {
		fprintf(stderr,"allocation failed\n");
		exit(EXIT_FAILURE);
	}
#endif

#ifdef STARPU_USE_CUDA
	STARPUFFT(complex) *out_cuda = malloc(size * sizeof(*out_cuda));
	if (!out_cuda) {
		fprintf(stderr,"allocation failed\n");
		exit(EXIT_FAILURE);
	}
#endif

	if (argc == 2) {
		plan = STARPUFFT(plan_dft_1d)(n, SIGN, 0);
#ifdef STARPU_HAVE_FFTW
		fftw_plan = _FFTW(plan_dft_1d)(n, in, out_fftw, SIGN, FFTW_ESTIMATE);
#endif
#ifdef STARPU_USE_CUDA
		if (cufftPlan1d(&cuda_plan, n, _CUFFT_C2C, 1) != CUFFT_SUCCESS)
			printf("erf\n");
#endif

	} else {
		plan = STARPUFFT(plan_dft_2d)(n, m, SIGN, 0);
#ifdef STARPU_HAVE_FFTW
		fftw_plan = _FFTW(plan_dft_2d)(n, m, in, out_fftw, SIGN, FFTW_ESTIMATE);
#endif
#ifdef STARPU_USE_CUDA
		/* Keep the call outside of an assertion macro so the plan is
		 * still created when assertions are compiled out. */
		if (cufftPlan2d(&cuda_plan, n, m, _CUFFT_C2C) != CUFFT_SUCCESS) {
			fprintf(stderr,"cufftPlan2d failed\n");
			exit(EXIT_FAILURE);
		}
#endif
	}

#ifdef STARPU_HAVE_FFTW
	gettimeofday(&begin, NULL);
	_FFTW(execute)(fftw_plan);
	gettimeofday(&end, NULL);
	_FFTW(destroy_plan)(fftw_plan);
	/* elapsed time in microseconds, hence bytes/timing is in MB/s */
	timing = (double)((end.tv_sec - begin.tv_sec)*1000000 + (end.tv_usec - begin.tv_usec));
	printf("FFTW took %2.2f ms (%2.2f MB/s)\n\n", timing/1000, bytes/timing);
#endif
#ifdef STARPU_USE_CUDA
	gettimeofday(&begin, NULL);
	if (cufftExecC2C(cuda_plan, (cufftComplex*) in, (cufftComplex*) out_cuda, CUFFT_FORWARD) != CUFFT_SUCCESS)
		printf("erf2\n");
	if ((cures = cudaThreadSynchronize()) != cudaSuccess)
		STARPU_CUDA_REPORT_ERROR(cures);
	gettimeofday(&end, NULL);
	cufftDestroy(cuda_plan);
	timing = (double)((end.tv_sec - begin.tv_sec)*1000000 + (end.tv_usec - begin.tv_usec));
	printf("CUDA took %2.2f ms (%2.2f MB/s)\n\n", timing/1000, bytes/timing);
#endif

	STARPUFFT(execute)(plan, in, out);

	STARPUFFT(showstats)(stdout);
	STARPUFFT(destroy_plan)(plan);

	printf("\n");
#if 0
	for (i = 0; i < 16; i++)
		printf("(%f,%f) ", cimag(in[i]), creal(in[i]));
	printf("\n\n");
	for (i = 0; i < 16; i++)
		printf("(%f,%f) ", cimag(out[i]), creal(out[i]));
	printf("\n\n");
#ifdef STARPU_HAVE_FFTW
	for (i = 0; i < 16; i++)
		printf("(%f,%f) ", cimag(out_fftw[i]), creal(out_fftw[i]));
	printf("\n\n");
#endif
#endif

#ifdef STARPU_HAVE_FFTW
	/* StarPU-FFT result against the FFTW reference */
	if (fft_report_difference(out, out_fftw, size))
		status = EXIT_FAILURE;
#endif

#if defined(STARPU_USE_CUDA) && defined(STARPU_HAVE_FFTW)
	/* CUFFT result against the FFTW reference; this needs out_fftw and is
	 * therefore only possible when FFTW support is built in as well. */
	if (fft_report_difference(out_cuda, out_fftw, size))
		status = EXIT_FAILURE;
#endif

	STARPUFFT(free)(in);
	STARPUFFT(free)(out);

#ifdef STARPU_HAVE_FFTW
	STARPUFFT(free)(out_fftw);
#endif

#ifdef STARPU_USE_CUDA
	free(out_cuda);
#endif

	starpu_shutdown();

	return status;
}
Ejemplo n.º 8
0
/*
 * Allocate the global float matrices A (zdim*ydim), B (xdim*zdim) and
 * C (xdim*ydim), fill A and B, zero C and call display_memory_consumption().
 *
 * Allocation uses starpu_data_malloc_pinned_if_possible() when built with
 * CUDA support and `pin` is set; otherwise posix_memalign() with a 4096-byte
 * alignment when available, or malloc().  The process is aborted through
 * STARPU_ABORT() when an allocation fails.
 *
 * With `norandom` set, A[j+i*ydim] = i and B[j+i*zdim] = j; otherwise both
 * are filled with values from starpu_drand48().
 */
static void init_problem_data(void)
{
	unsigned i,j;

	/* Widen to size_t before multiplying so that large dimensions cannot
	 * wrap around in unsigned arithmetic. */
	const size_t a_bytes = (size_t)zdim*ydim*sizeof(float);
	const size_t b_bytes = (size_t)xdim*zdim*sizeof(float);
	const size_t c_bytes = (size_t)xdim*ydim*sizeof(float);

#ifdef STARPU_USE_CUDA
	if (pin) {
		starpu_data_malloc_pinned_if_possible((void **)&A, a_bytes);
		starpu_data_malloc_pinned_if_possible((void **)&B, b_bytes);
		starpu_data_malloc_pinned_if_possible((void **)&C, c_bytes);
	} else
#endif
	{
#ifdef STARPU_HAVE_POSIX_MEMALIGN
		/* posix_memalign() reports failure through its return value
		 * and gives no guarantee about the pointer in that case. */
		if (posix_memalign((void **)&A, 4096, a_bytes) != 0
		    || posix_memalign((void **)&B, 4096, b_bytes) != 0
		    || posix_memalign((void **)&C, 4096, c_bytes) != 0)
			STARPU_ABORT();
#else
		A = malloc(a_bytes);
		B = malloc(b_bytes);
		C = malloc(c_bytes);
#endif
	}

	/* The loops below dereference all three buffers unconditionally. */
	if (!A || !B || !C)
		STARPU_ABORT();

	/* fill the A and B matrices */
	if (norandom) {
		for (j=0; j < ydim; j++) {
			for (i=0; i < zdim; i++) {
				A[j+i*ydim] = (float)(i);
			}
		}
	
		for (j=0; j < zdim; j++) {
			for (i=0; i < xdim; i++) {
				B[j+i*zdim] = (float)(j);
			}
		}
	} 
	else {
#ifdef NORANDOM
		/* Builds defining NORANDOM abort here instead of producing
		 * random input. */
		srand(2008);
		STARPU_ABORT();
#endif
		for (j=0; j < ydim; j++) {
			for (i=0; i < zdim; i++) {
				A[j+i*ydim] = (float)(starpu_drand48());
			}
		}
	
		for (j=0; j < zdim; j++) {
			for (i=0; i < xdim; i++) {
				B[j+i*zdim] = (float)(starpu_drand48());
			}
		}
	}

	/* C starts out as all zeroes */
	for (j=0; j < ydim; j++) {
		for (i=0; i < xdim; i++) {
			C[j+i*ydim] = (float)(0);
		}
	}

	display_memory_consumption();
}
int main(int argc, char *argv[])
{
	int i;
	struct timeval begin, end;
	int size;
	size_t bytes;
	int n = 0, m = 0;
	_FFTW(plan) fftw_plan;
	double timing;
	char *num;
	int num_threads = 1;

	_FFTW(init_threads)();

	num = getenv("NUM_THREADS");
	if (num)
		num_threads = atoi(num);
	_FFTW(plan_with_nthreads)(num_threads);

	if (argc < 2 || argc > 3)
	{
		fprintf(stderr,"need one or two size of vector\n");
		exit(EXIT_FAILURE);
	}

	if (argc == 2)
	{
		n = atoi(argv[1]);

		/* 1D */
		size = n;
	}
	else if (argc == 3)
	{
		n = atoi(argv[1]);
		m = atoi(argv[2]);

		/* 2D */
		size = n * m;
	}
	else
	{
		assert(0);
	}

	bytes = size * sizeof(_FFTW(complex));

	_FFTW(complex) *in = _FFTW(malloc)(size * sizeof(*in));
	starpu_srand48(0);
	for (i = 0; i < size; i++)
		in[i] = starpu_drand48() + I * starpu_drand48();

	_FFTW(complex) *out_fftw = _FFTW(malloc)(size * sizeof(*out_fftw));

	if (argc == 2)
	{
		fftw_plan = _FFTW(plan_dft_1d)(n, in, out_fftw, SIGN, FFTW_ESTIMATE);

	}
	else if (argc == 3)
	{
		fftw_plan = _FFTW(plan_dft_2d)(n, m, in, out_fftw, SIGN, FFTW_ESTIMATE);
	}
	else
	{
		assert(0);
	}

	gettimeofday(&begin, NULL);
	_FFTW(execute)(fftw_plan);
	gettimeofday(&end, NULL);
	_FFTW(destroy_plan)(fftw_plan);
	timing = (double)((end.tv_sec - begin.tv_sec)*1000000 + (end.tv_usec - begin.tv_usec));
	printf("FFTW with %d threads took %2.2f ms (%2.2f MB/s)\n\n", num_threads, timing/1000, bytes/(timing*num_threads));

	printf("\n");

	return EXIT_SUCCESS;
}
Ejemplo n.º 10
0
/*
 * Five-point stencil over an X x Y matrix of floats distributed across MPI
 * ranks with StarPU-MPI.
 *
 * Steps, in order:
 *  1. fill the local matrix with values from starpu_drand48() and compute
 *     their mean;
 *  2. register one variable handle per element this rank owns or is adjacent
 *     to, according to my_distrib();
 *  3. run `niter` stencil sweeps over the interior elements;
 *  4. move every element to the owner given by my_distrib2() and run `niter`
 *     more sweeps;
 *  5. bring the data back to the my_distrib() owner, unregister and shut down.
 *
 * Always returns 0.
 */
int main(int argc, char **argv)
{
	int my_rank, size, x, y, loop;
	float mean=0;
	float matrix[X][Y];
	starpu_data_handle_t data_handles[X][Y];

	int ret = starpu_init(NULL);
	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
	starpu_mpi_init(&argc, &argv, 1);
	MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
	MPI_Comm_size(MPI_COMM_WORLD, &size);

	parse_args(argc, argv);

	/* Initial data values */
	/* The seed is time-based, so the values differ from run to run. */
	starpu_srand48((long int)time(NULL));
	for(x = 0; x < X; x++)
	{
		for (y = 0; y < Y; y++)
		{
			matrix[x][y] = (float)starpu_drand48();
			mean += matrix[x][y];
		}
	}
	/* mean is computed from the initial values only; it is not updated
	 * after the stencil sweeps below. */
	mean /= (X*Y);

	if (display)
	{
		FPRINTF_MPI(stdout, "mean=%2.2f\n", mean);
		for(x = 0; x < X; x++)
		{
			fprintf(stdout, "[%d] ", my_rank);
			for (y = 0; y < Y; y++)
			{
				fprintf(stdout, "%2.2f ", matrix[x][y]);
			}
			fprintf(stdout, "\n");
		}
	}

	/* Initial distribution */
	for(x = 0; x < X; x++)
	{
		for (y = 0; y < Y; y++)
		{
			int mpi_rank = my_distrib(x, y, size);
			if (mpi_rank == my_rank)
			{
				/* Owned element: register it on top of the local matrix storage. */
				//FPRINTF(stderr, "[%d] Owning data[%d][%d]\n", my_rank, x, y);
				starpu_variable_data_register(&data_handles[x][y], 0, (uintptr_t)&(matrix[x][y]), sizeof(float));
			}
			else if (my_rank == my_distrib(x+1, y, size) || my_rank == my_distrib(x-1, y, size)
				 || my_rank == my_distrib(x, y+1, size) || my_rank == my_distrib(x, y-1, size))
			{
				/* I don't own that index, but will need it for my computations */
				/* Registered with node argument -1 and no user buffer. */
				//FPRINTF(stderr, "[%d] Neighbour of data[%d][%d]\n", my_rank, x, y);
				starpu_variable_data_register(&data_handles[x][y], -1, (uintptr_t)NULL, sizeof(float));
			}
			else
			{
				/* I know it's useless to allocate anything for this */
				data_handles[x][y] = NULL;
			}
			if (data_handles[x][y])
			{
				/* (y*X)+x is distinct for every (x, y); presumably it is the
				 * MPI tag of the data -- confirm against the StarPU-MPI
				 * version in use. */
				starpu_mpi_data_register(data_handles[x][y], (y*X)+x, mpi_rank);
			}
		}
	}

	/* First computation with initial distribution */
	/* Only interior elements are updated; each task reads the four direct
	 * neighbours of the element it writes. */
	for(loop=0 ; loop<niter; loop++)
	{
		for (x = 1; x < X-1; x++)
		{
			for (y = 1; y < Y-1; y++)
			{
				starpu_mpi_task_insert(MPI_COMM_WORLD, &stencil5_cl, STARPU_RW, data_handles[x][y],
						       STARPU_R, data_handles[x-1][y], STARPU_R, data_handles[x+1][y],
						       STARPU_R, data_handles[x][y-1], STARPU_R, data_handles[x][y+1],
						       0);
			}
		}
	}
	FPRINTF(stderr, "Waiting ...\n");
	starpu_task_wait_for_all();

	/* Now migrate data to a new distribution */

	/* First register newly needed data */
	for(x = 0; x < X; x++)
	{
		for (y = 0; y < Y; y++)
		{
			int mpi_rank = my_distrib2(x, y, size);
			/* Elements that had no handle so far but are owned by, or adjacent
			 * to, this rank under the new distribution get one now. */
			if (!data_handles[x][y] && (mpi_rank == my_rank
				 || my_rank == my_distrib2(x+1, y, size) || my_rank == my_distrib2(x-1, y, size)
				 || my_rank == my_distrib2(x, y+1, size) || my_rank == my_distrib2(x, y-1, size)))
			{
				/* Register newly-needed data */
				starpu_variable_data_register(&data_handles[x][y], -1, (uintptr_t)NULL, sizeof(float));
				starpu_mpi_data_register(data_handles[x][y], (y*X)+x, mpi_rank);
			}
			if (data_handles[x][y] && mpi_rank != starpu_mpi_data_get_rank(data_handles[x][y]))
			{
				/* Migrate the data */
				starpu_mpi_get_data_on_node_detached(MPI_COMM_WORLD, data_handles[x][y], mpi_rank, NULL, NULL);
				/* And register new rank of the matrix */
				starpu_mpi_data_set_rank(data_handles[x][y], mpi_rank);
			}
		}
	}

	/* Second computation with new distribution */
	for(loop=0 ; loop<niter; loop++)
	{
		for (x = 1; x < X-1; x++)
		{
			for (y = 1; y < Y-1; y++)
			{
				starpu_mpi_task_insert(MPI_COMM_WORLD, &stencil5_cl, STARPU_RW, data_handles[x][y],
						       STARPU_R, data_handles[x-1][y], STARPU_R, data_handles[x+1][y],
						       STARPU_R, data_handles[x][y-1], STARPU_R, data_handles[x][y+1],
						       0);
			}
		}
	}
	FPRINTF(stderr, "Waiting ...\n");
	starpu_task_wait_for_all();

	/* Unregister data */
	for(x = 0; x < X; x++)
	{
		for (y = 0; y < Y; y++)
		{
			if (data_handles[x][y])
			{
				int mpi_rank = my_distrib(x, y, size);
				/* Get back data to original place where the user-provided buffer is. */
				starpu_mpi_get_data_on_node_detached(MPI_COMM_WORLD, data_handles[x][y], mpi_rank, NULL, NULL);
				/* Register original rank of the matrix (although useless) */
				starpu_mpi_data_set_rank(data_handles[x][y], mpi_rank);
				/* And unregister it */
				starpu_data_unregister(data_handles[x][y]);
			}
		}
	}

	starpu_mpi_shutdown();
	starpu_shutdown();

	/* Print the local matrix again after the computation; the mean printed
	 * here is still the one computed from the initial values. */
	if (display)
	{
		FPRINTF(stdout, "[%d] mean=%2.2f\n", my_rank, mean);
		for(x = 0; x < X; x++)
		{
			FPRINTF(stdout, "[%d] ", my_rank);
			for (y = 0; y < Y; y++)
			{
				FPRINTF(stdout, "%2.2f ", matrix[x][y]);
			}
			FPRINTF(stdout, "\n");
		}
	}

	return 0;
}
Ejemplo n.º 11
0
int main(int argc, char **argv)
{
	int rank;
	int world_size;

	/*
	 *	Initialization
	 */
	int thread_support;
	if (MPI_Init_thread(&argc, &argv, MPI_THREAD_SERIALIZED, &thread_support) != MPI_SUCCESS) {
		fprintf(stderr,"MPI_Init_thread failed\n");
		exit(1);
	}
	if (thread_support == MPI_THREAD_FUNNELED)
		fprintf(stderr,"Warning: MPI only has funneled thread support, not serialized, hoping this will work\n");
	if (thread_support < MPI_THREAD_FUNNELED)
		fprintf(stderr,"Warning: MPI does not have thread support!\n");

	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
	MPI_Comm_size(MPI_COMM_WORLD, &world_size);

	starpu_srand48((long int)time(NULL));

	parse_args(rank, argc, argv);

	int ret = starpu_init(NULL);
	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");

	/* We disable sequential consistency in this example */
	starpu_data_set_default_sequential_consistency_flag(0);

	starpu_mpi_init(NULL, NULL, 0);

	STARPU_ASSERT(p*q == world_size);

	starpu_cublas_init();

	int barrier_ret = MPI_Barrier(MPI_COMM_WORLD);
	STARPU_ASSERT(barrier_ret == MPI_SUCCESS);

	/*
	 * 	Problem Init
	 */

	init_matrix(rank);

	fprintf(stderr, "Rank %d: allocated (%d + %d) MB = %d MB\n", rank,
                        (int)(allocated_memory/(1024*1024)),
			(int)(allocated_memory_extra/(1024*1024)),
                        (int)((allocated_memory+allocated_memory_extra)/(1024*1024)));

	display_grid(rank, nblocks);

	TYPE *a_r = NULL;
//	STARPU_PLU(display_data_content)(a_r, size);

	TYPE *x, *y;

	if (check)
	{
		x = calloc(size, sizeof(TYPE));
		STARPU_ASSERT(x);

		y = calloc(size, sizeof(TYPE));
		STARPU_ASSERT(y);

		if (rank == 0)
		{
			unsigned ind;
			for (ind = 0; ind < size; ind++)
				x[ind] = (TYPE)starpu_drand48();
		}

		a_r = STARPU_PLU(reconstruct_matrix)(size, nblocks);

		if (rank == 0)
			STARPU_PLU(display_data_content)(a_r, size);

//		STARPU_PLU(compute_ax)(size, x, y, nblocks, rank);
	}

	barrier_ret = MPI_Barrier(MPI_COMM_WORLD);
	STARPU_ASSERT(barrier_ret == MPI_SUCCESS);

	double timing = STARPU_PLU(plu_main)(nblocks, rank, world_size);

	/*
	 * 	Report performance
	 */

	int reduce_ret;
	double min_timing = timing;
	double max_timing = timing;
	double sum_timing = timing;

	reduce_ret = MPI_Reduce(&timing, &min_timing, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);
	STARPU_ASSERT(reduce_ret == MPI_SUCCESS);

	reduce_ret = MPI_Reduce(&timing, &max_timing, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
	STARPU_ASSERT(reduce_ret == MPI_SUCCESS);

	reduce_ret = MPI_Reduce(&timing, &sum_timing, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
	STARPU_ASSERT(reduce_ret == MPI_SUCCESS);

	if (rank == 0)
	{
		fprintf(stderr, "Computation took: %f ms\n", max_timing/1000);
		fprintf(stderr, "\tMIN : %f ms\n", min_timing/1000);
		fprintf(stderr, "\tMAX : %f ms\n", max_timing/1000);
		fprintf(stderr, "\tAVG : %f ms\n", sum_timing/(world_size*1000));

		unsigned n = size;
		double flop = (2.0f*n*n*n)/3.0f;
		fprintf(stderr, "Synthetic GFlops : %2.2f\n", (flop/max_timing/1000.0f));
	}

	/*
	 *	Test Result Correctness
	 */

	if (check)
	{
		/*
		 *	Compute || A - LU ||
		 */

		STARPU_PLU(compute_lu_matrix)(size, nblocks, a_r);

#if 0
		/*
		 *	Compute || Ax - LUx ||
		 */

		unsigned ind;

		y2 = calloc(size, sizeof(TYPE));
		STARPU_ASSERT(y);

		if (rank == 0)
		{
			for (ind = 0; ind < size; ind++)
			{
				y2[ind] = (TYPE)0.0;
			}
		}

		STARPU_PLU(compute_lux)(size, x, y2, nblocks, rank);

		/* Compute y2 = y2 - y */
		CPU_AXPY(size, -1.0, y, 1, y2, 1);

		TYPE err = CPU_ASUM(size, y2, 1);
		int max = CPU_IAMAX(size, y2, 1);

		fprintf(stderr, "(A - LU)X Avg error : %e\n", err/(size*size));
		fprintf(stderr, "(A - LU)X Max error : %e\n", y2[max]);
#endif
	}

	/*
	 * 	Termination
	 */

	barrier_ret = MPI_Barrier(MPI_COMM_WORLD);
	STARPU_ASSERT(barrier_ret == MPI_SUCCESS);

	starpu_cublas_shutdown();
	starpu_mpi_shutdown();
	starpu_shutdown();

#if 0
	MPI_Finalize();
#endif

	return 0;
}
Ejemplo n.º 12
0
/*
 * Min/max reduction example: fills a vector with values from
 * starpu_drand48(), registers it block by block, and submits one task per
 * block that accesses the block and the shared _minmax pair, on which
 * reduction methods are set.  The resulting minimum and maximum are printed
 * on stderr.
 *
 * Returns 77 when the run has to be skipped (global arbiter requested, no
 * device at init, or a task submission failing), 0 otherwise.
 */
int main(int argc, char **argv)
{
	unsigned long k;
	unsigned b;
	int ret;

	/* Not supported yet */
	if (starpu_get_env_number_default("STARPU_GLOBAL_ARBITER", 0) > 0)
		return 77;

	ret = starpu_init(NULL);
	if (ret == -ENODEV)
		return 77;
	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");

	unsigned long nelems = _nblocks*_entries_per_bock;
	size_t size = nelems*sizeof(TYPE);

	_x = (TYPE *) malloc(size);
	_x_handles = (starpu_data_handle_t *) calloc(_nblocks, sizeof(starpu_data_handle_t));

	assert(_x && _x_handles);

	/* Random input, generated from a fixed seed */
	starpu_srand48(0);
	k = 0;
	while (k < nelems)
	{
		_x[k] = (TYPE)starpu_drand48();
		k++;
	}

	/* One vector handle per block of the input */
	for (b = 0; b < _nblocks; b++)
		starpu_vector_data_register(&_x_handles[b], STARPU_MAIN_RAM,
					    (uintptr_t)&_x[_entries_per_bock*b],
					    _entries_per_bock, sizeof(TYPE));

	/* Start from the largest possible minimum and the smallest possible
	 * maximum */
	_minmax[0] = TYPE_MAX;
	_minmax[1] = TYPE_MIN;

	starpu_variable_data_register(&_minmax_handle, STARPU_MAIN_RAM, (uintptr_t)_minmax, 2*sizeof(TYPE));

	/* Codelets providing the neutral element and the reduction operation */
	starpu_data_set_reduction_methods(_minmax_handle, &minmax_redux_codelet, &minmax_init_codelet);

	/* One task per block */
	for (b = 0; b < _nblocks; b++)
	{
		struct starpu_task *task = starpu_task_create();

		task->cl = &minmax_codelet;
		task->handles[0] = _x_handles[b];
		task->handles[1] = _minmax_handle;

		ret = starpu_task_submit(task);
		if (ret != 0)
		{
			STARPU_ASSERT(ret == -ENODEV);
			FPRINTF(stderr, "This test can only run on CPUs, but there are no CPU workers (this is not a bug).\n");
			return 77;
		}
	}

	for (b = 0; b < _nblocks; b++)
		starpu_data_unregister(_x_handles[b]);
	starpu_data_unregister(_minmax_handle);

	FPRINTF(stderr, "Min : %e\n", _minmax[0]);
	FPRINTF(stderr, "Max : %e\n", _minmax[1]);

	STARPU_ASSERT(_minmax[0] <= _minmax[1]);

	free(_x);
	free(_x_handles);
	starpu_shutdown();

	return 0;
}