C++ (Cpp) mpi_allreduce Exemples

Exemple #1

0

Afficher le fichier

Fichier : evectmatrix.c Projet : aitatanit/mpb

/* Form U = adjoint(X) * X.  S is caller-supplied scratch storage: the
   product is assembled in S and then reduced with MPI_SUM over
   mpb_comm into U. */
void evectmatrix_XtX(sqmatrix U, evectmatrix X, sqmatrix S)
{
     int row, col;

     CHECK(X.p == U.p && U.p <= S.alloc_p, "matrices not conformant");

     /* The result is Hermitian, so herk is asked for the upper
	triangle only; S is cleared first so the remaining entries
	start at zero.  (A plain gemm('C', 'N', ...) of X with itself
	was the earlier, now disabled, way of doing this.) */
     memset(S.data, 0, sizeof(scalar) * (U.p * U.p));
     blasglue_herk('U', 'C', X.p, X.n, 1.0, X.data, X.p, 0.0, S.data, U.p);
     evectmatrix_flops += X.N * X.c * X.p * (X.p - 1);

     /* Mirror the strict upper triangle of S into its lower triangle,
	conjugating each entry, so that S holds the full matrix. */
     for (row = 0; row < U.p; ++row) {
	  for (col = row + 1; col < U.p; ++col) {
	       ASSIGN_CONJ(S.data[col * U.p + row], S.data[row * U.p + col]);
	  }
     }

     mpi_allreduce(S.data, U.data, U.p * U.p * SCALAR_NUMVALS,
		   real, SCALAR_MPI_TYPE, MPI_SUM, mpb_comm);
}

Exemple #2

0

Afficher le fichier

Fichier : evectmatrix.c Projet : aitatanit/mpb

/* Diagonal-only variant of adjoint(X) * X.  matrix_XtX_diag_real writes
   this process's values into scratch_diag; X.p reals are then reduced
   with MPI_SUM over mpb_comm into diag. */
void evectmatrix_XtX_diag_real(evectmatrix X, real *diag, real *scratch_diag)
{
     real *local_part = scratch_diag;   /* filled before the reduction */
     real *reduced = diag;              /* receives the reduced values */

     matrix_XtX_diag_real(X.data, X.n, X.p, local_part);

     /* NOTE(review): this flop estimate scales with X.p * (2*X.p),
	whereas the XtY diagonal routine uses X.p * 2 -- confirm which
	one is intended. */
     evectmatrix_flops += X.N * X.c * X.p * (2*X.p);

     mpi_allreduce(local_part, reduced, X.p,
		   real, SCALAR_MPI_TYPE, MPI_SUM, mpb_comm);
}

Exemple #3

0

Afficher le fichier

Fichier : evectmatrix.c Projet : aitatanit/mpb

/* Diagonal-only variant of adjoint(X) * Y.  matrix_XtY_diag writes this
   process's values into scratch_diag (same size as diag);
   X.p * SCALAR_NUMVALS reals are then reduced with MPI_SUM over
   mpb_comm into diag.

   NOTE(review): X and Y are not checked for conformance here, unlike
   in evectmatrix_traceXtY -- confirm callers guarantee matching
   dimensions. */
void evectmatrix_XtY_diag(evectmatrix X, evectmatrix Y, scalar *diag,
			  scalar *scratch_diag)
{
     scalar *local_part = scratch_diag;   /* filled before the reduction */
     scalar *reduced = diag;              /* receives the reduced values */

     matrix_XtY_diag(X.data, Y.data, X.n, X.p, local_part);
     evectmatrix_flops += X.N * X.c * X.p * 2;

     mpi_allreduce(local_part, reduced, X.p * SCALAR_NUMVALS,
		   real, SCALAR_MPI_TYPE, MPI_SUM, mpb_comm);
}

Exemple #4

0

Afficher le fichier

Fichier : mib_timer.c Projet : LLNL/mib

/* Set up the timer state.  After a barrier on MPI_COMM_WORLD this
   process samples mpi_wtime(); the minimum sample over all processes is
   stored in _zero, this process's offset from that minimum in _skew,
   and _initialized is set. */
void
init_timer()
{
  double local_now;

  mpi_barrier(MPI_COMM_WORLD);

  local_now = mpi_wtime();
  mpi_allreduce(&local_now, &_zero, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);

  _skew = local_now - _zero;
  _initialized = 1;
}

Exemple #5

0

Afficher le fichier

Fichier : evectmatrix.c Projet : aitatanit/mpb

/* Return trace(adjoint(X) * Y), reduced with MPI_SUM over mpb_comm.
   X and Y must agree in both p and n. */
scalar evectmatrix_traceXtY(evectmatrix X, evectmatrix Y)
{
     scalar local_trace, global_trace;

     CHECK(X.p == Y.p && X.n == Y.n, "matrices not conformant");

     /* A single dotc call over all X.n * X.p entries of each matrix. */
     local_trace = blasglue_dotc(X.n * X.p, X.data, 1, Y.data, 1);
     evectmatrix_flops += X.N * X.c * X.p * (2*X.p) + X.p;

     mpi_allreduce(&local_trace, &global_trace, SCALAR_NUMVALS,
		   real, SCALAR_MPI_TYPE, MPI_SUM, mpb_comm);

     return global_trace;
}

Exemple #6

0

Afficher le fichier

Fichier : evectmatrix.c Projet : aitatanit/mpb

/* Multiply the adjoint of a p-column slice of X (starting at column ix)
   by a p-column slice of Y (starting at column iy).  The product is
   assembled in the scratch matrix S and reduced with MPI_SUM over
   mpb_comm into U, which must be p by p. */
void evectmatrix_XtY_slice(sqmatrix U, evectmatrix X, evectmatrix Y,
			   int ix, int iy, int p, sqmatrix S)
{
     CHECK(ix + p <= X.p && iy + p <= Y.p && ix >= 0 && iy >= 0 && X.n == Y.n
	   && p == U.p && p <= S.alloc_p,
	   "invalid arguments to XtY_slice");

     /* Start from a zeroed scratch matrix. */
     memset(S.data, 0, sizeof(scalar) * (U.p * U.p));

     /* The slices are addressed by offsetting the data pointers by
	ix / iy while keeping the full widths X.p / Y.p as leading
	dimensions. */
     blasglue_gemm('C', 'N', p, p, X.n,
		   1.0, X.data + ix, X.p,
		   Y.data + iy, Y.p,
		   0.0, S.data, U.p);
     evectmatrix_flops += X.N * X.c * p * (2*p);

     mpi_allreduce(S.data, U.data, U.p * U.p * SCALAR_NUMVALS,
		   real, SCALAR_MPI_TYPE, MPI_SUM, mpb_comm);
}

Exemple #7

0

Afficher le fichier

Fichier : evectmatrix.c Projet : aitatanit/mpb

/* Compute adjoint(X) * Y and store it as a submatrix of U whose first
   entry is at linear offset Uoffset within U.data.  S is scratch space
   and must be at least Y.p by Y.p. */
void evectmatrixXtY_sub(sqmatrix U, int Uoffset, evectmatrix X, evectmatrix Y,
			sqmatrix S)
{
     int row;

     CHECK(X.p == Y.p && X.n == Y.n && U.p >= Y.p, "matrices not conformant");
     CHECK(Uoffset + (Y.p-1)*U.p + Y.p <= U.p*U.p,
	   "submatrix exceeds matrix bounds");
     CHECK(Y.p <= S.alloc_p, "scratch matrix too small");

     /* The product goes into S, packed with leading dimension Y.p. */
     memset(S.data, 0, sizeof(scalar) * (Y.p * Y.p));
     blasglue_gemm('C', 'N', X.p, X.p, X.n,
		   1.0, X.data, X.p, Y.data, Y.p, 0.0, S.data, Y.p);
     evectmatrix_flops += X.N * X.c * X.p * (2*X.p);

     /* Rows of S are Y.p apart but rows of U are U.p apart, so each
	row of the submatrix is reduced into U separately. */
     row = 0;
     while (row < Y.p) {
	  mpi_allreduce(S.data + row*Y.p, U.data + Uoffset + row*U.p,
			Y.p * SCALAR_NUMVALS,
			real, SCALAR_MPI_TYPE, MPI_SUM, mpb_comm);
	  ++row;
     }
}

Exemple #8

0

Afficher le fichier

Fichier : train.c Projet : jinshubai/nomad-kranksvm

/* Training driver: initialise MPI and OpenMP, read this rank's problem,
   sum the instance counts over all ranks, train, save the model and
   release the buffers. */
int main(int argc, char **argv)
{
	/* MPI must grant MPI_THREAD_MULTIPLE; otherwise give up at once. */
	int provided_level;
	MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &provided_level);
	if(provided_level != MPI_THREAD_MULTIPLE)
	{
		printf("MPI multiple thread isn't provided!\n");
		fflush(stdout);
		mpi_exit(1);
	}

	int my_rank = mpi_get_rank();
	int world_size = mpi_get_size();
	param.nr_ranks = world_size;

	/* Report where this rank is running. */
	char host[1024];
	int host_len;
	MPI_Get_processor_name(host, &host_len);
	printf("processor name: %s, number of processed: %d, rank: %d\n", host, world_size, my_rank);
	fflush(stdout);

	char input_file_name[1024];
	char model_file_name[1024];
	parse_command_line(argc, argv, input_file_name, model_file_name);

	/* Thread count for the shared-memory part, capped by what OpenMP
	   reports as its maximum.
	   NOTE(review): only the upper bound is enforced although the
	   message advertises 1~max; a value below 1 reaches
	   omp_set_num_threads unchecked -- confirm parse_command_line
	   rejects it. */
	int requested_threads = param.thread_count;
	int thread_limit = omp_get_max_threads();
	if(requested_threads > thread_limit)
	{
		printf("[rank %d], please enter the correct number of threads: 1~%d\n", my_rank, thread_limit);
		mpi_exit(1);
	}
	omp_set_num_threads(requested_threads);

	/* (A disabled experiment used to pin each OpenMP thread to the CPU
	   with the same index via sched_setaffinity at this point.) */

	/* Load the problem and validate the parameters against it. */
	read_problem(input_file_name);
	const char *error_msg = rksvm_check_parameter(&prob,&param);
	if(error_msg)
	{
		fprintf(stderr,"ERROR: %s\n",error_msg);
		mpi_exit(1);
	}

	/* Sum (MPI_SUM) the per-rank instance counts, in place. */
	int total_l = prob.l;
	mpi_allreduce(&total_l, 1, MPI_INT, MPI_SUM);
	prob.global_l = total_l;

	printf("#local instances = %d, #global instances = %d\n", prob.l, prob.global_l);
	fflush(stdout);

	if(my_rank==0)
	{
		puts("Start to train!");
	}

	model = rksvm_train(&prob,&param);
	if(rksvm_save_model(model_file_name,model))
	{
		fprintf(stderr,"[rank %d] can't save model to file %s\n",mpi_get_rank(), model_file_name);
		mpi_exit(1);
	}

	/* Tear down: the model first, then the problem buffers. */
	rksvm_free_and_destroy_model(&model);
	free(prob.y);
	free(prob.x);
	free(prob.query);
	free(x_space);
	free(prob.length_of_each_rksvm_node);
	free(line);

	MPI_Finalize();
	return 0;
}

Exemple #9

0

Afficher le fichier

Fichier : rksvm.cpp Projet : jinshubai/nomad-kranksvm

// Evaluate the objective at w and return it (also stored in si->obj).
//
// Steps, as visible in this body:
//   1. Qv(w, z) fills the local vector z (l entries).
//   2. MPI_Allgatherv gathers every rank's z into gz, using local_l as
//      the per-rank counts and start_ptr as the displacements.
//   3. For each subset, the (id, value) pairs are sorted and two passes
//      over a selectiontree fill l_minus/gamma_minus and
//      l_plus/gamma_plus.
//   4. f = sum_i w[i]*gz[i] over global_l entries; reg is accumulated
//      from the local entries, summed across ranks, and the result is
//      f/2 + reg.
//
// w is indexed up to global_l (see step 4).
double l2r_rank_fun::fun(double *w)// w is with the size of global_l 
{
	int i,j,k;
	double f = 0.0;
	double reg = 0.0;
	int l=prob->l;
	selectiontree *T;
	Qv(w,z);
	// Gather each rank's l local z entries into the global vector gz.
	MPI_Allgatherv((void*)z, l, MPI_DOUBLE, (void*)gz, local_l, start_ptr, MPI_DOUBLE, MPI_COMM_WORLD);

	// Subsets are processed in parallel; i, j, k and T are per-thread.
#pragma omp parallel for default(shared) private(i,j,k,T)
	for (i=0;i<nr_subset;i++)
	{
		// Collect this subset's (id, value) pairs; values come from the
		// local z, indexed through perm.
		for (j=0;j<count[i];j++)
		{
			pi[i][j].id= perm[j+start[i]];
			pi[i][j].value = z[perm[j+start[i]]];
		}
		qsort(pi[i], count[i], sizeof(id_and_value), compare_id_and_value);

		// Forward pass: k advances while 1 - value[j] + value[k] > 0,
		// inserting those entries into the tree; count_smaller then
		// fills l_minus / gamma_minus for entry j.
		T=new selectiontree(nr_class[i]);
		k=0;
		for (j=0;j<count[i];j++)
		{
			while (k<count[i]&&(1-pi[i][j].value+pi[i][k].value>0))
			{
				T->insert_node(int_y[pi[i][k].id],pi[i][k].value);

				k++;
			}
			T->count_smaller(int_y[pi[i][j].id],&l_minus[pi[i][j].id], &gamma_minus[pi[i][j].id]);
		}
		delete T;
		k=count[i]-1;

		// Backward pass: same idea from the other end, with the
		// condition 1 + value[j] - value[k] > 0 and count_larger
		// filling l_plus / gamma_plus.
		T = new selectiontree(nr_class[i]);
		for (j=count[i]-1;j>=0;j--)
		{
			while (k>=0&&(1+pi[i][j].value-pi[i][k].value>0))
			{
				T->insert_node(int_y[pi[i][k].id],pi[i][k].value);
				k--;
			}
			T->count_larger(int_y[pi[i][j].id],&l_plus[pi[i][j].id], &gamma_plus[pi[i][j].id]);
		}
		delete T;
	}

	// Dot product of w with the gathered vector, over all global_l
	// entries (runs serially; the OpenMP reduction below is disabled).
//#pragma omp parallel for default(shared) private(i) reduction(+:f) schedule(dynamic)
	for(i=0;i<global_l;i++)
	{
		f += w[i]*gz[i];
	}

	// Per-instance terms for the local entries; gz is read at this
	// rank's offset start_ptr[current_rank].
#pragma omp parallel for default(shared) private(i)
	for (i=0;i<l;i++)
	{
		ATe[i] = l_minus[i] - l_plus[i];
		ATAQb[i] = (l_plus[i]+l_minus[i])*gz[i+start_ptr[current_rank]]-gamma_plus[i]-gamma_minus[i];
	}

	// Local part of reg (serial; the pragma below is disabled).  This
	// loop declares its own i, which shadows the function-level one.
//#pragma omp parallel for default(shared) //private(i) //reduction(+:reg) schedule(runtime)
	for (int i=0;i<l;i++)
	{
		//#pragma omp atomic
		reg += C*(gz[i+start_ptr[current_rank]]*(ATAQb[i] - 2 * ATe[i]) + l_minus[i]);
	}

	// Sum reg over all ranks (in place), then combine: f/2 + reg.
	mpi_allreduce(&reg, 1, MPI_DOUBLE, MPI_SUM);	
	f /= 2.0;
	f += reg;
	si->obj=f;
	return(f);
}

Exemple #10

0

Afficher le fichier

Fichier : field-smob.c Projet : victorliu/mpb

/* Compute the integral of f(r, {fields}) over the cell.

   f      -- function applied (via gh_apply) at every grid point to the
             argument list (position, field values...).
   fields -- list of field smobs; all must conform to the first one.
             If the list is empty, the grid dimensions are taken from
             mdata instead.

   The integrand values are summed over the locally visited grid
   points, scaled by Vol / (n1 * n2 * n3), and the (re, im) pair is then
   reduced with MPI_SUM over MPI_COMM_WORLD; that reduced value is
   returned.

   The loop headers are selected by the preprocessor (SCALAR_COMPLEX x
   HAVE_MPI, four variants) and all share one loop body, which is why
   the opening brace of the loop is inside the #if branches and the
   closing brace comes after the shared body. */
cnumber integrate_fieldL(function f, SCM_list fields)
{
     int i, j, k, n1, n2, n3, n_other, n_last, rank, last_dim;
#ifdef HAVE_MPI
     int local_n2, local_y_start, local_n3;
#endif
     real s1, s2, s3, c1, c2, c3;
     int ifield;
     field_smob **pf;
     cnumber integral = {0,0};

     /* Resolve every list item to a field smob and verify that it
	conforms to the first one. */
     CHK_MALLOC(pf, field_smob *, fields.num_items);
     for (ifield = 0; ifield < fields.num_items; ++ifield) {
          pf[ifield] = assert_field_smob(fields.items[ifield]);
          CHECK(fields_conform(pf[0], pf[ifield]),
                "fields for integrate-fields must conform");
     }

     /* Grid dimensions come from the first field, or from mdata when
	no fields were given. */
     if (fields.num_items > 0) {
	  n1 = pf[0]->nx; n2 = pf[0]->ny; n3 = pf[0]->nz;
	  n_other = pf[0]->other_dims;
	  n_last = pf[0]->last_dim_size 
	       / (sizeof(scalar_complex)/sizeof(scalar));
	  last_dim = pf[0]->last_dim;
     }
     else {
	  n1 = mdata->nx; n2 = mdata->ny; n3 = mdata->nz;
	  n_other = mdata->other_dims;
	  n_last = mdata->last_dim_size 
	       / (sizeof(scalar_complex)/sizeof(scalar));
	  last_dim = mdata->last_dim;
     }
     /* Dimensionality: 1, 2 or 3 depending on which of n2, n3 equal 1.
	Only the non-MPI real-scalar loop below consults it. */
     rank = (n3 == 1) ? (n2 == 1 ? 1 : 2) : 3;

     /* s* = grid spacing per direction; c* = half the lattice size
	(0 for a direction with a single point), subtracted from each
	coordinate below. */
     s1 = geometry_lattice.size.x / n1;
     s2 = geometry_lattice.size.y / n2;
     s3 = geometry_lattice.size.z / n3;
     c1 = n1 <= 1 ? 0 : geometry_lattice.size.x * 0.5;
     c2 = n2 <= 1 ? 0 : geometry_lattice.size.y * 0.5;
     c3 = n3 <= 1 ? 0 : geometry_lattice.size.z * 0.5;

     /* Here we have different loops over the coordinates, depending
	upon whether we are using complex or real and serial or
        parallel transforms.  Each loop must define, in its body,
        variables (i2,j2,k2) describing the coordinate of the current
        point, and "index" describing the corresponding index in 
	the curfield array.

        This was all stolen from maxwell_eps.c...it would be better
        if we didn't have to cut and paste, sigh. */

#ifdef SCALAR_COMPLEX

#  ifndef HAVE_MPI
     
     /* complex scalars, serial: plain row-major n1 x n2 x n3 sweep */
     for (i = 0; i < n1; ++i)
	  for (j = 0; j < n2; ++j)
	       for (k = 0; k < n3; ++k)
     {
	  int i2 = i, j2 = j, k2 = k;
	  int index = ((i * n2 + j) * n3 + k);

#  else /* HAVE_MPI */

     /* complex scalars, MPI: only the local_n2 rows starting at
	local_y_start are visited */
     if (fields.num_items > 0) {
	  local_n2 = pf[0]->local_ny;
	  local_y_start = pf[0]->local_y_start;
     }
     else {
	  local_n2 = mdata->local_ny;
	  local_y_start = mdata->local_y_start;
     }

     /* first two dimensions are transposed in MPI output: */
     for (j = 0; j < local_n2; ++j)
          for (i = 0; i < n1; ++i)
	       for (k = 0; k < n3; ++k)
     {
	  int i2 = i, j2 = j + local_y_start, k2 = k;
	  int index = ((j * n1 + i) * n3 + k);

#  endif /* HAVE_MPI */

#else /* not SCALAR_COMPLEX */

#  ifndef HAVE_MPI

     /* real scalars, serial: the array is swept as n_other x n_last and
	(i2, j2, k2) are recovered from (i, j) according to rank */
     for (i = 0; i < n_other; ++i)
	  for (j = 0; j < n_last; ++j)
     {
	  int index = i * n_last + j;
	  int i2, j2, k2;
	  switch (rank) {
	      case 2: i2 = i; j2 = j; k2 = 0; break;
	      case 3: i2 = i / n2; j2 = i % n2; k2 = j; break;
	      default: i2 = j; j2 = k2 = 0;  break;
	  }

#  else /* HAVE_MPI */

     if (fields.num_items > 0) {
	  local_n2 = pf[0]->local_ny;
	  local_y_start = pf[0]->local_y_start;
     }
     else {
	  local_n2 = mdata->local_ny;
	  local_y_start = mdata->local_y_start;
     }

     /* For a real->complex transform, the last dimension is cut in
	half.  For a 2d transform, this is taken into account in local_ny
	already, but for a 3d transform we must compute the new n3: */
     if (n3 > 1) {
	  if (fields.num_items > 0)
	       local_n3 = pf[0]->last_dim_size / 2;
	  else
	       local_n3 = mdata->last_dim_size / 2;
     }
     else
	  local_n3 = 1;
     
     /* first two dimensions are transposed in MPI output: */
     for (j = 0; j < local_n2; ++j)
          for (i = 0; i < n1; ++i)
	       for (k = 0; k < local_n3; ++k)
     {
	  /* In this variant i2 and k2 are macros aliasing i and k
	     rather than variables.
	     NOTE(review): no matching #undef is visible in this
	     function, so the macros stay defined for whatever follows
	     in the file -- confirm that is intended. */
#         define i2 i
	  int j2 = j + local_y_start;
#         define k2 k
	  int index = ((j * n1 + i) * local_n3 + k);

#  endif /* HAVE_MPI */

#endif /* not SCALAR_COMPLEX */

	  /* Shared loop body: build (position, field values...) and add
	     f's value at this grid point to the running sum. */
	  {
	       list arg_list = SCM_EOL;
	       cnumber integrand;
	       vector3 p;

	       p.x = i2 * s1 - c1; p.y = j2 * s2 - c2; p.z = k2 * s3 - c3;

	       /* Cons the field values from last to first so that the
		  finished list is in the original field order. */
	       for (ifield = fields.num_items - 1; ifield >= 0; --ifield) {
		    SCM item = SCM_EOL;
		    switch (pf[ifield]->type) {
			case RSCALAR_FIELD_SMOB:
			     item = ctl_convert_number_to_scm(pf[ifield]->f.rs[index]);
			     break;
			case CSCALAR_FIELD_SMOB:
			     item = cnumber2scm(cscalar2cnumber(
				  pf[ifield]->f.cs[index]));
			     break;
			case CVECTOR_FIELD_SMOB:
                        item = cvector32scm(cscalar32cvector3(
			     pf[ifield]->f.cv+3*index));
                        break;
		    }
		    arg_list = gh_cons(item, arg_list);
	       }
	       arg_list = gh_cons(vector32scm(p), arg_list);
	       integrand = ctl_convert_cnumber_to_c(gh_apply(f, arg_list));
	       integral.re += integrand.re;
	       integral.im += integrand.im;

#ifndef SCALAR_COMPLEX
	       /* Real scalars only: unless the index along the halved
		  dimension is 0 or exactly last_dim/2, f is evaluated a
		  second time at the mirrored coordinates (n - i2,
		  n - j2, n - k2), with vector fields conjugated.
		  Presumably this accounts for the half of the grid that
		  the real->complex layout does not store -- confirm. */
	       {
		    int last_index;
#  ifdef HAVE_MPI
		    if (n3 == 1)
			 last_index = j + local_y_start;
		    else
			 last_index = k;
#  else
		    last_index = j;
#  endif
		    
		    if (last_index != 0 && 2*last_index != last_dim) {
			 int i2c, j2c, k2c;
			 i2c = i2 ? (n1 - i2) : 0;
                         j2c = j2 ? (n2 - j2) : 0;
			 k2c = k2 ? (n3 - k2) : 0;
                         p.x = i2c * s1 - c1;
                         p.y = j2c * s2 - c2;
			 p.z = k2c * s3 - c3;
			 arg_list = SCM_EOL;
			 for (ifield = fields.num_items - 1; 
			      ifield >= 0; --ifield) {
			      SCM item = SCM_UNDEFINED;
			      switch (pf[ifield]->type) {
				  case RSCALAR_FIELD_SMOB:
				       item = ctl_convert_number_to_scm(
					    pf[ifield]->f.rs[index]);
				       break;
				  /* NOTE(review): the complex scalar is
				     passed through without conjugation
				     here, while the complex vector case
				     below is conjugated -- confirm this
				     asymmetry is intended. */
				  case CSCALAR_FIELD_SMOB:
				       item = cnumber2scm(cscalar2cnumber(
					    pf[ifield]->f.cs[index]));
				       break;
				  case CVECTOR_FIELD_SMOB:
				       item = cvector32scm(
					    cvector3_conj(cscalar32cvector3(
						 pf[ifield]->f.cv+3*index)));
				       break;
			      }
			      arg_list = gh_cons(item, arg_list);
			 }
			 arg_list = gh_cons(vector32scm(p), arg_list);
			 integrand = 
			      ctl_convert_cnumber_to_c(gh_apply(f, arg_list));
			 integral.re += integrand.re;
			 integral.im += integrand.im;
		    }
	       }
#endif
	  }
     } /* closes the loop body opened in whichever #if branch was used */

     free(pf);

     /* Scale the raw sum by the volume per grid point. */
     integral.re *= Vol / (n1 * n2 * n3);
     integral.im *= Vol / (n1 * n2 * n3);
     /* Reduce the (re, im) pair with MPI_SUM and return the result. */
     {
	  cnumber integral_sum;
	  mpi_allreduce(&integral, &integral_sum, 2, number, 
			MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
	  return integral_sum;
     }
}