/*
 * Test driver that prints the banner "ARMCI Strided DLA Accumulate Test".
 *
 * Visible steps: initialise MPI, obtain two XDIM*YDIM double buffers through
 * ARMCI_Malloc, fill both with (1.0 + rank), time ITERATIONS operations that
 * reference buf_bvec[peer] (peer is the next rank), validate the local buffer
 * against a closed-form expected value, sum the per-rank error counts with
 * MPI_Allreduce and return 0 when the total is zero, 1 otherwise.
 *
 * NOTE(review): this listing is incomplete -- several closing braces and the
 * head of the call inside the ITERATIONS loop are missing, so it does not
 * compile as shown.  The comments below describe only what the visible lines
 * establish.
 */
int main(int argc, char **argv) {
    int i, j, rank, nranks, peer, bufsize, errors, total_errors;
    double **buf_bvec, **src_bvec, *src_buf;
    int count[2], src_stride, trg_stride, stride_level;
    double scaling, time;

    MPI_Init(&argc, &argv);

    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &nranks);

    /* One pointer slot per rank; these arrays are handed to ARMCI_Malloc and
     * later indexed by rank and by peer.  NOTE(review): the malloc results
     * are not checked for NULL. */
    buf_bvec = (double **) malloc(sizeof(double *) * nranks);
    src_bvec = (double **) malloc(sizeof(double *) * nranks);

    /* Size in bytes of one XDIM x YDIM array of doubles. */
    bufsize = XDIM * YDIM * sizeof(double);
    ARMCI_Malloc((void **) buf_bvec, bufsize);
    ARMCI_Malloc((void **) src_bvec, bufsize);
    src_buf = src_bvec[rank];

    if (rank == 0)
        printf("ARMCI Strided DLA Accumulate Test:\n");


    /* Both the local target buffer and the local source buffer start out
     * filled with (1.0 + rank).
     * NOTE(review): the closing brace of this loop is missing here. */
    for (i = 0; i < XDIM*YDIM; i++) {
        *(buf_bvec[rank] + i) = 1.0 + rank;
        *(src_buf + i) = 1.0 + rank;


    scaling = 2.0;

    /* Source and target strides are both one row of XDIM doubles, in bytes. */
    src_stride = XDIM * sizeof(double);
    trg_stride = XDIM * sizeof(double);
    stride_level = 1;

    /* count[0] is the byte length of one row, count[1] the number of rows. */
    count[1] = YDIM;
    count[0] = XDIM * sizeof(double);

    time = MPI_Wtime();

    /* Each rank addresses its right-hand neighbour, wrapping at nranks. */
    peer = (rank+1) % nranks;

    /* NOTE(review): the call made in this loop is truncated -- only the
     * scaling argument and the buf_bvec[peer] argument survive.  Presumably
     * this is the strided accumulate the banner refers to; confirm against
     * the original test. */
    for (i = 0; i < ITERATIONS; i++) {

          (void *) &scaling,
          (void *) buf_bvec[peer],

    time = MPI_Wtime() - time;

    if (rank == 0) printf("Time: %f sec\n", time);

    /* Validation: every local element is expected to equal this rank's
     * initial value plus, for each of the ITERATIONS rounds, scaling times
     * the initial value (1.0 + r) of the previous rank r = (rank+nranks-1)%nranks.
     * NOTE(review): the test `actual - expected > 1e-10` is one-sided, so an
     * element that is too small is not reported -- confirm whether an
     * absolute difference was intended.
     * NOTE(review): no line that increments `errors` is visible here, and the
     * closing braces of the if and of both loops are missing. */
    for (i = errors = 0; i < XDIM; i++) {
      for (j = 0; j < YDIM; j++) {
        const double actual   = *(buf_bvec[rank] + i + j*XDIM);
        const double expected = (1.0 + rank) + scaling * (1.0 + ((rank+nranks-1)%nranks)) * (ITERATIONS);
        if (actual - expected > 1e-10) {
          printf("%d: Data validation failed at [%d, %d] expected=%f actual=%f\n",
              rank, j, i, expected, actual);

    /* Sum the per-rank error counts so every rank sees the same total. */
    MPI_Allreduce(&errors, &total_errors, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);

    /* Release this rank's slices of the two ARMCI allocations.
     * NOTE(review): no free() of buf_bvec/src_bvec and no finalize calls are
     * visible in this listing. */
    ARMCI_Free((void *) buf_bvec[rank]);
    ARMCI_Free((void *) src_bvec[rank]);



    /* Exit status reflects the global error count; only rank 0 prints. */
    if (total_errors == 0) {
      if (rank == 0) printf("Success.\n");
      return 0;
    } else {
      if (rank == 0) printf("Fail.\n");
      return 1;
Example #2
/*
 * Master-side collection loop: receives, per result, a vector of base_dim
 * unsigned exponents followed by a byte buffer holding (A + s), stores them in
 * `exponents` / `As`, and finally sends a stop signal to ranks
 * 1 .. comm_size-1.
 *
 *   base_dim    number of unsigned ints received per exponent vector
 *   max_fact    together with base_dim gives the loop bound max_fact + base_dim
 *   exponents   matrix written through set_matrix(exponents, row, col, value)
 *   As          array of mpz_t; As[fact_count] receives the imported value
 *   comm_size   number of ranks in MPI_COMM_WORLD as seen by the caller
 *   print_fact  progress is printed when fact_count % print_fact == 0
 *
 * Returns fact_count.
 *
 * NOTE(review): the listing is incomplete.  The first MPI_Recv lacks its
 * source and tag arguments, the closing braces of the progress `if` and of
 * the while loop are missing, and no visible line increments fact_count or
 * fact_per_rank[] (so the per-rank table would print zeros as shown).
 * NOTE(review): print_fact == 0 would make the modulo below a division by
 * zero -- confirm callers never pass 0.
 */
unsigned int master(unsigned int base_dim, unsigned int max_fact, 
		    unsigned int** exponents, mpz_t * As,
		    int comm_size, unsigned int print_fact) {

  unsigned int fact_count = 0;

  MPI_Status status;

  int count;
  int source;
  /* Buffer for receiving the exponents */
  unsigned int* buffer_exp;
  /* Buffer for receiving (A + s) */
  unsigned char buffer_As[BUFFER_DIM];
  init_vector(& buffer_exp, base_dim);

  /* t1 is the start time; t2 holds the elapsed time at each progress print. */
  double t1 = MPI_Wtime();
  double t2;

  /* Per-rank counters, zeroed here and printed at the end. */
  int fact_per_rank[comm_size];
  for(int i = 0; i < comm_size; ++i)
    fact_per_rank[i] = 0;

  while(fact_count < max_fact + base_dim) {
    /* Receive the exponent vector */
    MPI_Recv(buffer_exp, base_dim, MPI_UNSIGNED,
	     MPI_COMM_WORLD, &status);
    /* The second message is requested from whichever rank sent the first. */
    source = status.MPI_SOURCE;
    for(unsigned int i = 0; i < base_dim; ++i) 
      set_matrix(exponents, fact_count, i, buffer_exp[i]);
    /* Receive the mpz containing (A + s) */
    MPI_Recv(buffer_As, BUFFER_DIM, MPI_UNSIGNED_CHAR, source, 
	     AS_TAG, MPI_COMM_WORLD, &status);
    /* The actual byte count of the message decides how much is imported. */
    MPI_Get_count(&status, MPI_UNSIGNED_CHAR, &count);
    mpz_import(As[fact_count], count, 1, 1, 1, 0, buffer_As);

    /* NOTE(review): fact_count and the bound are unsigned but printed with
     * %d below. */
    if(fact_count % print_fact == 0) {
      t2 = MPI_Wtime() - t1;
      printf("#%d/%d in %.6f seconds\n", fact_count, max_fact + base_dim, t2);
  /* Send '1' to the slaves to signal termination */
  char stop_signal = '1';
  /* NOTE(review): i is unsigned while comm_size is int, so the comparison is
   * done in unsigned arithmetic. */
  for(unsigned int i = 1; i < comm_size; ++i)
    MPI_Send(&stop_signal, 1, MPI_CHAR, i, 0, MPI_COMM_WORLD);
  printf("#Sending stop_signal\n");

  /* Print the rank numbers 1..comm_size-1, then the counter of each. */
  printf("#Fattorizzazioni per ranks:\n#");
  for(int i = 1; i < comm_size; ++i)
    printf("%d \t", i);
  for(int i = 1; i < comm_size; ++i)
    printf("%d \t", fact_per_rank[i]);
  return fact_count;
Example #3
/* Estimate the reciprocal space part error of the SPME Ewald sum.
 *
 * The visible code accumulates three terms: e_rec1 and e_rec2 in a triple
 * loop over reciprocal grid indices (nx, ny, nz), and e_rec3 in a loop over
 * charges (all of them, or a random sample when 0 < info->fracself < 1).
 * When PAR(cr) is true the nx range and the charge range are split into
 * per-rank chunks of x_per_core and the three terms are summed with
 * gmx_sum().  The last visible lines form sqrt(e_rec1+e_rec2+e_rec3) and
 * return it multiplied by ONE_4PI_EPS0.
 *
 * Fields of `info` read here (all at index [0] where indexed): nkx, nky, nkz,
 * recipbox, ewald_beta, volume, pme_order, fracself.  *nsamples is set to the
 * number of charge positions used for the third term (xtot).
 *
 * NOTE(review): this listing is incomplete -- the opening brace of the body,
 * most block braces, the `else` keywords and the #endif lines are missing,
 * so it does not compile as shown and the branch structure has to be read
 * from the indentation and the existing comments.
 */
static real estimate_reciprocal(
        t_inputinfo       *info,
        rvec               x[], /* array of particles */
        real               q[], /* array of charges */
        int                nr,  /* number of charges = size of the charge array */
        FILE  gmx_unused  *fp_out,
        gmx_bool           bVerbose,
        unsigned int       seed,     /* The seed for the random number generator */
        int               *nsamples, /* Return the number of samples used if Monte Carlo
                                      * algorithm is used for self energy error estimate */
        t_commrec         *cr)
    real     e_rec   = 0; /* reciprocal error estimate */
    real     e_rec1  = 0; /* Error estimate term 1*/
    real     e_rec2  = 0; /* Error estimate term 2*/
    real     e_rec3  = 0; /* Error estimate term 3 */
    real     e_rec3x = 0; /* part of Error estimate term 3 in x */
    real     e_rec3y = 0; /* part of Error estimate term 3 in y */
    real     e_rec3z = 0; /* part of Error estimate term 3 in z */
    int      i, ci;
    int      nx, ny, nz;  /* grid coordinates */
    real     q2_all = 0;  /* sum of squared charges */
    rvec     gridpx;      /* reciprocal grid point in x direction*/
    rvec     gridpxy;     /* reciprocal grid point in x and y direction*/
    rvec     gridp;       /* complete reciprocal grid point in 3 directions*/
    rvec     tmpvec;      /* template to create points from basis vectors */
    rvec     tmpvec2;     /* template to create points from basis vectors */
    real     coeff  = 0;  /* variable to compute coefficients of the error estimate */
    real     coeff2 = 0;  /* variable to compute coefficients of the error estimate */
    real     tmp    = 0;  /* variables to compute different factors from vectors */
    real     tmp1   = 0;
    real     tmp2   = 0;
    gmx_bool bFraction;

    /* Random number generator */
    gmx_rng_t rng     = NULL;
    int      *numbers = NULL;

    /* Index variables for parallel work distribution */
    int startglobal, stopglobal;
    int startlocal, stoplocal;
    int x_per_core;
    int xtot;

    double t0 = 0.0;
    double t1 = 0.0;

    rng = gmx_rng_init(seed);


    /* Sum of squared charges, used as a prefactor in terms 1 and 2. */
    for (i = 0; i < nr; i++)
        q2_all += q[i]*q[i];

    /* Calculate indices for work distribution */
    /* The global nx range is [-nkx/2, nkx/2], i.e. xtot = 2*(nkx/2)+1 values.
     * Parallel case: each rank takes a contiguous chunk of x_per_core values,
     * clipped at stopglobal.  The three assignments after the clipping are
     * presumably the serial (else) branch covering the whole range -- the
     * `else` itself is missing from this listing. */
    startglobal = -info->nkx[0]/2;
    stopglobal  = info->nkx[0]/2;
    xtot        = stopglobal*2+1;
    if (PAR(cr))
        x_per_core = static_cast<int>(ceil(static_cast<real>(xtot) / cr->nnodes));
        startlocal = startglobal + x_per_core*cr->nodeid;
        stoplocal  = startlocal + x_per_core -1;
        if (stoplocal > stopglobal)
            stoplocal = stopglobal;
        startlocal = startglobal;
        stoplocal  = stopglobal;
        x_per_core = xtot;
   #ifdef GMX_LIB_MPI

#ifdef GMX_LIB_MPI
    if (MASTER(cr))
        t0 = MPI_Wtime();

    if (MASTER(cr))

        fprintf(stderr, "Calculating reciprocal error part 1 ...");


    /* Part 1: loop over this rank's nx values and all ny, nz values. */
    for (nx = startlocal; nx <= stoplocal; nx++)
        svmul(nx, info->recipbox[XX], gridpx);
        for (ny = -info->nky[0]/2; ny < info->nky[0]/2+1; ny++)
            svmul(ny, info->recipbox[YY], tmpvec);
            rvec_add(gridpx, tmpvec, gridpxy);
            for (nz = -info->nkz[0]/2; nz < info->nkz[0]/2+1; nz++)
                /* NOTE(review): the statement guarded by this test is missing
                 * from the listing.  tmp = norm2(gridp) is used as a divisor
                 * below, so presumably the origin is skipped -- confirm. */
                if (0 == nx &&  0 == ny &&  0 == nz)
                svmul(nz, info->recipbox[ZZ], tmpvec);
                rvec_add(gridpxy, tmpvec, gridp);
                tmp    = norm2(gridp);
                coeff  = exp(-1.0 * M_PI * M_PI * tmp / info->ewald_beta[0] / info->ewald_beta[0] );
                coeff /= 2.0 * M_PI * info->volume * tmp;
                coeff2 = tmp;

                /* NOTE(review): the ny and nz calls below pass info->nkx[0],
                 * whereas every other per-direction call in this function
                 * passes nky / nkz for ny / nz -- confirm whether that is
                 * intended. */
                tmp  = eps_poly2(nx, info->nkx[0], info->pme_order[0]);
                tmp += eps_poly2(ny, info->nkx[0], info->pme_order[0]);
                tmp += eps_poly2(nz, info->nkx[0], info->pme_order[0]);

                /* Pairwise products of eps_poly1 for (x,y), (z,y) and (z,x),
                 * each added with a factor of two. */
                tmp1 = eps_poly1(nx, info->nkx[0], info->pme_order[0]);
                tmp2 = eps_poly1(ny, info->nky[0], info->pme_order[0]);

                tmp += 2.0 * tmp1 * tmp2;

                tmp1 = eps_poly1(nz, info->nkz[0], info->pme_order[0]);
                tmp2 = eps_poly1(ny, info->nky[0], info->pme_order[0]);

                tmp += 2.0 * tmp1 * tmp2;

                tmp1 = eps_poly1(nz, info->nkz[0], info->pme_order[0]);
                tmp2 = eps_poly1(nx, info->nkx[0], info->pme_order[0]);

                tmp += 2.0 * tmp1 * tmp2;

                /* Square of the sum of eps_poly1 over the three directions. */
                tmp1  = eps_poly1(nx, info->nkx[0], info->pme_order[0]);
                tmp1 += eps_poly1(ny, info->nky[0], info->pme_order[0]);
                tmp1 += eps_poly1(nz, info->nkz[0], info->pme_order[0]);

                tmp += tmp1 * tmp1;

                e_rec1 += 32.0 * M_PI * M_PI * coeff * coeff * coeff2 * tmp  * q2_all * q2_all / nr;

                /* Second term: eps_poly3 contributions projected on each
                 * reciprocal box vector, scaled by the grid size ... */
                tmp1  = eps_poly3(nx, info->nkx[0], info->pme_order[0]);
                tmp1 *= info->nkx[0];
                tmp2  = iprod(gridp, info->recipbox[XX]);

                tmp = tmp1*tmp2;

                tmp1  = eps_poly3(ny, info->nky[0], info->pme_order[0]);
                tmp1 *= info->nky[0];
                tmp2  = iprod(gridp, info->recipbox[YY]);

                tmp += tmp1*tmp2;

                tmp1  = eps_poly3(nz, info->nkz[0], info->pme_order[0]);
                tmp1 *= info->nkz[0];
                tmp2  = iprod(gridp, info->recipbox[ZZ]);

                tmp += tmp1*tmp2;

                tmp *= 4.0 * M_PI;

                /* ... plus eps_poly4 contributions scaled by the squared norm
                 * of each box vector and the squared grid size. */
                tmp1  = eps_poly4(nx, info->nkx[0], info->pme_order[0]);
                tmp1 *= norm2(info->recipbox[XX]);
                tmp1 *= info->nkx[0] * info->nkx[0];

                tmp += tmp1;

                tmp1  = eps_poly4(ny, info->nky[0], info->pme_order[0]);
                tmp1 *= norm2(info->recipbox[YY]);
                tmp1 *= info->nky[0] * info->nky[0];

                tmp += tmp1;

                tmp1  = eps_poly4(nz, info->nkz[0], info->pme_order[0]);
                tmp1 *= norm2(info->recipbox[ZZ]);
                tmp1 *= info->nkz[0] * info->nkz[0];

                tmp += tmp1;

                e_rec2 += 4.0 * coeff * coeff * tmp * q2_all * q2_all / nr;

        /* Progress in percent of this rank's nx chunk, master only. */
        if (MASTER(cr))
            fprintf(stderr, "\rCalculating reciprocal error part 1 ... %3.0f%%", 100.0*(nx-startlocal+1)/(x_per_core));


    if (MASTER(cr))
        fprintf(stderr, "\n");

    /* Use just a fraction of all charges to estimate the self energy error term? */
    bFraction =  (info->fracself > 0.0) && (info->fracself < 1.0);

    if (bFraction)
        /* Here xtot is the number of samples taken for the Monte Carlo calculation
         * of the average of term IV of equation 35 in Wang2010. Round up to a
         * number of samples that is divisible by the number of nodes */
        x_per_core  = static_cast<int>(ceil(info->fracself * nr / cr->nnodes));
        xtot        = x_per_core * cr->nnodes;
        /* In this case we use all nr particle positions */
        xtot       = nr;
        x_per_core = static_cast<int>(ceil(static_cast<real>(xtot) / cr->nnodes));

    /* From here on startlocal/stoplocal index charges (or samples), not nx. */
    startlocal = x_per_core *  cr->nodeid;
    stoplocal  = std::min(startlocal + x_per_core, xtot);  /* min needed if xtot == nr */

    if (bFraction)
        /* Make sure we get identical results in serial and parallel. Therefore,
         * take the sample indices from a single, global random number array that
         * is constructed on the master node and that only depends on the seed */
        snew(numbers, xtot);
        if (MASTER(cr))
            for (i = 0; i < xtot; i++)
                numbers[i] = static_cast<int>(floor(gmx_rng_uniform_real(rng) * nr));
        /* Broadcast the random number array to the other nodes */
        if (PAR(cr))
            nblock_bc(cr, xtot, numbers);

        if (bVerbose && MASTER(cr))
            fprintf(stdout, "Using %d sample%s to approximate the self interaction error term",
                    xtot, xtot == 1 ? "" : "s");
            if (PAR(cr))
                fprintf(stdout, " (%d sample%s per rank)", x_per_core, x_per_core == 1 ? "" : "s");
            fprintf(stdout, ".\n");

    /* Return the number of positions used for the Monte Carlo algorithm */
    *nsamples = xtot;

    /* Part 2: for each charge (or sampled charge) in this rank's range, sum
     * coeff*eps_self over the full reciprocal grid in each direction. */
    for (i = startlocal; i < stoplocal; i++)
        e_rec3x = 0;
        e_rec3y = 0;
        e_rec3z = 0;

        if (bFraction)
            /* Randomly pick a charge */
            ci = numbers[i];
            /* Use all charges */
            ci = i;

        /* for(nx=startlocal; nx<=stoplocal; nx++)*/
        for (nx = -info->nkx[0]/2; nx < info->nkx[0]/2+1; nx++)
            svmul(nx, info->recipbox[XX], gridpx);
            for (ny = -info->nky[0]/2; ny < info->nky[0]/2+1; ny++)
                svmul(ny, info->recipbox[YY], tmpvec);
                rvec_add(gridpx, tmpvec, gridpxy);
                for (nz = -info->nkz[0]/2; nz < info->nkz[0]/2+1; nz++)

                    /* NOTE(review): as in part 1, the guarded statement is
                     * missing; tmp is a divisor two lines below. */
                    if (0 == nx && 0 == ny && 0 == nz)

                    svmul(nz, info->recipbox[ZZ], tmpvec);
                    rvec_add(gridpxy, tmpvec, gridp);
                    tmp      = norm2(gridp);
                    coeff    = exp(-1.0 * M_PI * M_PI * tmp / info->ewald_beta[0] / info->ewald_beta[0] );
                    coeff   /= tmp;
                    e_rec3x += coeff*eps_self(nx, info->nkx[0], info->recipbox[XX], info->pme_order[0], x[ci]);
                    e_rec3y += coeff*eps_self(ny, info->nky[0], info->recipbox[YY], info->pme_order[0], x[ci]);
                    e_rec3z += coeff*eps_self(nz, info->nkz[0], info->recipbox[ZZ], info->pme_order[0], x[ci]);



        /* Combine the three directional sums into one vector in tmpvec2.
         * NOTE(review): tmpvec2 is only ever incremented in the visible code;
         * no line that clears it before these rvec_inc calls is present --
         * confirm it is reset per charge in the original. */
        svmul(e_rec3x, info->recipbox[XX], tmpvec);
        rvec_inc(tmpvec2, tmpvec);
        svmul(e_rec3y, info->recipbox[YY], tmpvec);
        rvec_inc(tmpvec2, tmpvec);
        svmul(e_rec3z, info->recipbox[ZZ], tmpvec);
        rvec_inc(tmpvec2, tmpvec);

        e_rec3 += q[ci]*q[ci]*q[ci]*q[ci]*norm2(tmpvec2) / ( xtot * M_PI * info->volume * M_PI * info->volume);
        /* NOTE(review): the argument list of this progress fprintf is
         * truncated in the listing. */
        if (MASTER(cr))
            fprintf(stderr, "\rCalculating reciprocal error part 2 ... %3.0f%%",


    if (MASTER(cr))
        fprintf(stderr, "\n");

#ifdef GMX_LIB_MPI
    if (MASTER(cr))
        t1 = MPI_Wtime() - t0;
        fprintf(fp_out, "Recip. err. est. took   : %lf s\n", t1);

#ifdef DEBUG
    if (PAR(cr))
        fprintf(stderr, "Rank %3d: nx=[%3d...%3d]  e_rec3=%e\n",
                cr->nodeid, startlocal, stoplocal, e_rec3);

    /* Sum the three partial terms over all ranks.
     * NOTE(review): the block comment that starts right after this is never
     * closed in the listing, so the final sqrt and return lines read as
     * comment text here. */
    if (PAR(cr))
        gmx_sum(1, &e_rec1, cr);
        gmx_sum(1, &e_rec2, cr);
        gmx_sum(1, &e_rec3, cr);

    /* e_rec1*=8.0 * q2_all / info->volume / info->volume / nr ;
       e_rec2*=  q2_all / M_PI / M_PI / info->volume / info->volume / nr ;
       e_rec3/= M_PI * M_PI * info->volume * info->volume * nr ;
    e_rec = sqrt(e_rec1+e_rec2+e_rec3);

    return ONE_4PI_EPS0 * e_rec;
Example #4
/*
 * Test of MPI_Group_translate_ranks: first a correctness check, then a
 * relative timing comparison performed by rank 0 only.
 *
 * Setup: commrev is MPI_COMM_WORLD split with key (size-rank-1), i.e. with
 * the rank order reversed; grev, gself and gworld are the groups of commrev,
 * MPI_COMM_SELF and MPI_COMM_WORLD.
 *
 * NOTE(review): this listing is incomplete -- the opening brace of the body
 * and many closing braces are missing, the body of `if (rank != 0)` is
 * absent, and no visible line increments `errs` or finalises the test.
 */
int main( int argc, char *argv[] )
    int errs = 0;
    int *ranks;
    int *ranksout;
    MPI_Group gworld, grev, gself;
    MPI_Comm  comm;
    MPI_Comm  commrev;
    int rank, size, i;
    double start, end, time1, time2;

    MTest_Init( &argc, &argv );

    comm = MPI_COMM_WORLD;

    MPI_Comm_size( comm, &size );
    MPI_Comm_rank( comm, &rank );

    /* Input and output rank arrays, one entry per process. */
    ranks    = malloc(size*sizeof(int));
    ranksout = malloc(size*sizeof(int));
    if (!ranks || !ranksout) {
        fprintf(stderr, "out of memory\n");
        MPI_Abort(MPI_COMM_WORLD, 1);

    /* generate a comm with the rank order reversed */
    MPI_Comm_split(comm, 0, (size-rank-1), &commrev);
    MPI_Comm_group(commrev, &grev);
    MPI_Comm_group(MPI_COMM_SELF, &gself);
    MPI_Comm_group(comm, &gworld);

    /* sanity check correctness first */
    /* ranks[] is the identity 0..size-1; ranksout[] is pre-filled with -1 so
     * an entry the call does not write is detectable. */
    for (i=0; i < size; i++) {
        ranks[i] = i;
        ranksout[i] = -1;
    /* Rank i of the reversed group must map to rank size-i-1 in gworld. */
    MPI_Group_translate_ranks(grev, size, ranks, gworld, ranksout);
    for (i=0; i < size; i++) {
        if (ranksout[i] != (size-i-1)) {
            /* NOTE(review): the message prints (size-rank-1) as the expected
             * value while the check above compares against (size-i-1). */
            if (rank == 0)
                printf("%d: (gworld) expected ranksout[%d]=%d, got %d\n", rank, i, (size-rank-1), ranksout[i]);
    /* Against gself only the calling process exists: its reversed rank
     * (size-rank-1) must map to 0, every other entry to MPI_UNDEFINED. */
    MPI_Group_translate_ranks(grev, size, ranks, gself, ranksout);
    for (i=0; i < size; i++) {
        int expected = (i == (size-rank-1) ? 0 : MPI_UNDEFINED);
        if (ranksout[i] != expected) {
            if (rank == 0)
                printf("%d: (gself) expected ranksout[%d]=%d, got %d\n", rank, i, expected, ranksout[i]);

    /* now compare relative performance */

    /* we need lots of procs to get a group large enough to have meaningful
     * numbers.  On most testing machines this means that we're oversubscribing
     * cores in a big way, which might perturb the timing results.  So we make
     * sure everyone started up and then everyone but rank 0 goes to sleep to
     * let rank 0 do all the timings. */

    if (rank != 0) {
    else /* rank==0 */ {
        sleep(1); /* try to avoid timing while everyone else is making syscalls */

        /* time1: NUM_LOOPS translations of grev into gworld. */
        MPI_Group_translate_ranks(grev, size, ranks, gworld, ranksout); /*throwaway iter*/
        start = MPI_Wtime();
        for (i = 0; i < NUM_LOOPS; ++i) {
            MPI_Group_translate_ranks(grev, size, ranks, gworld, ranksout);
        end = MPI_Wtime();
        time1 = end - start;

        /* time2: NUM_LOOPS translations of grev into gself. */
        MPI_Group_translate_ranks(grev, size, ranks, gself, ranksout); /*throwaway iter*/
        start = MPI_Wtime();
        for (i = 0; i < NUM_LOOPS; ++i) {
            MPI_Group_translate_ranks(grev, size, ranks, gself, ranksout);
        end = MPI_Wtime();
        time2 = end - start;

        /* complain if the "gworld" time exceeds 2x the "gself" time */
        /* NOTE(review): the condition actually tests |time1 - time2| against
         * 2*time2, which for time1 > time2 means time1 > 3*time2.  For
         * time1 < time2 and time2 > 0 the difference cannot exceed time2, so
         * the nested (time1 < time2) message looks unreachable -- confirm. */
        if (fabs(time1 - time2) > (2.00 * time2)) {
            printf("too much difference in MPI_Group_translate_ranks performance:\n");
            printf("time1=%f time2=%f\n", time1, time2);
            printf("(fabs(time1-time2)/time2)=%f\n", (fabs(time1-time2)/time2));
            if (time1 < time2) {
                printf("also, (time1<time2) is surprising...\n");





    return 0;
Example #5
/*
 * Determine the index from the lowest modes in the two chirality sectors
 * (wording taken from the messages this routine prints).
 *
 * Visible flow:
 *   1. For intsign = 0 (positive) and 1 (negative), compute first_blocksize
 *      lowest modes per sector, retrying once with max_iter = 140 when the
 *      first attempt with max_iter = 70 does not converge.
 *   2. If the two lowest modes agree to within 10% of the larger one, report
 *      "Index is 0" and write the modes to text and binary files.
 *   3. Otherwise pick the sector with the smaller lowest mode and compute
 *      further modes there in blocks of three until the index is determined
 *      or *nr_of_eigenvalues_ov modes have converged, then write the results.
 *
 *   nr_of_eigenvalues_ov  pointer to the maximum number of modes; sizes the
 *                         eval, idx, eigenvalues_ov and eigenvector buffers
 *   max_iterations        appears only inside commented-out code here
 *   precision_ov          initial value of prec; its square is ap_eps_sq
 *   conf_filename, nstore used to build the output file names
 *   method                not referenced in the visible lines
 *
 * NOTE(review): this listing is heavily truncated.  #else / #endif lines,
 * comment delimiters, most closing braces and the heads of the solver and
 * random-spinor calls are missing, so it does not compile as shown.  `shift`
 * is assigned but not declared in the visible lines, and no visible line
 * writes lowestmodes[] before it is read.
 */
void index_jd(int * nr_of_eigenvalues_ov, 
	      const int max_iterations, const double precision_ov, char *conf_filename, 
	      const int nstore, const int method){
  complex *eval;
  spinor  *eigenvectors_ov, *eigenvectors_ov_;
  spinor  *lowvectors, *lowvectors_;
  int i=0 , k=0, returncode=0, index = 0, determined = 0, signed_index = 0;
  char filename[120];
  FILE * ifs = NULL;
  matrix_mult Operator[2];
  double absdifference;
  const int N2 = VOLUMEPLUSRAND;

#ifdef MPI
  double atime, etime;
  /* lowestmodes[0 .. first_blocksize-1] is read as the positive sector and
   * lowestmodes[first_blocksize ..] as the negative sector further down. */
  double lowestmodes[20];
  int intsign, max_iter, first_blocksize = 1;
  int * idx = NULL;

  /* NOTE(review): the next line looks like the text of a banner comment
   * whose delimiters were lost in this listing. */
   * For Jacobi-Davidson 
  int verbosity = 3, converged = 0, blocksize = 1, blockwise = 0;
  int solver_it_max = 50, j_max, j_min, v0dim = 0;
  double * eigenvalues_ov = NULL;
  double decay_min = 1.7, threshold_min = 1.e-3, prec;

  WRITER *writer=NULL;
  spinor *s;
  double sqnorm;
  paramsPropagatorFormat *propagatorFormat = NULL;
  double ap_eps_sq;
  int switch_on_adaptive_precision = 0;
  double ov_s = 0;

   * General variables                                                    

  eval= calloc((*nr_of_eigenvalues_ov),sizeof(complex));
  shift = 0.0;

  //  ov_s = 0.5*(1./g_kappa - 8.) - 1.;
  ap_eps_sq = precision_ov*precision_ov; 

  /* SSE builds over-allocate by one spinor and round the pointer up with
   * (p + ALIGN_BASE) & ~ALIGN_BASE; presumably ALIGN_BASE is an alignment
   * mask of the form 2^n - 1 -- confirm.  The four plain assignments that
   * follow are presumably the non-SSE branch (the #else is missing). */
#if (defined SSE || defined SSE2 )
  eigenvectors_ov_= calloc(VOLUMEPLUSRAND*(*nr_of_eigenvalues_ov)+1, sizeof(spinor)); 
  eigenvectors_ov = (spinor *)(((unsigned long int)(eigenvectors_ov_)+ALIGN_BASE)&~ALIGN_BASE);
  lowvectors_ = calloc(2*first_blocksize*VOLUMEPLUSRAND+1, sizeof(spinor));
  lowvectors = (spinor *)(((unsigned long int)(lowvectors_)+ALIGN_BASE)&~ALIGN_BASE);
  //  eigenvectors_ov_ = calloc(VOLUMEPLUSRAND*(*nr_of_eigenvalues_ov), sizeof(spinor));
  eigenvectors_ov_ = calloc(VOLUMEPLUSRAND*(*nr_of_eigenvalues_ov), sizeof(spinor));
  lowvectors_ = calloc(2*first_blocksize*VOLUMEPLUSRAND, sizeof(spinor));
  eigenvectors_ov = eigenvectors_ov_;
  lowvectors = lowvectors_;

  //  idx = malloc((*nr_of_eigenvalues_ov)*sizeof(int));
  idx = malloc((*nr_of_eigenvalues_ov)*sizeof(int));
  if(g_proc_id == g_stdio_proc){
    printf("Computing first the two lowest modes in the positive and negative chirality sector, respectively\n");
    if(switch_on_adaptive_precision == 1) {
      printf("We have switched on adaptive precision with ap_eps_sq = %e!\n", ap_eps_sq);
    printf("We have set the mass to zero within this computation!\n");

  prec = precision_ov; 
  j_min = 8; j_max = 16;
  /* max_iter == 70 marks a first attempt; 140 marks the single retry. */
  max_iter = 70;

#ifdef MPI
  atime = MPI_Wtime();

  v0dim = first_blocksize;
  blocksize = v0dim;
  /* Stage 1: one pass per chirality sector (0 = positive, 1 = negative). */
  for(intsign = 0; intsign < 2; intsign++){
    converged = 0;
    if(g_proc_id == g_stdio_proc){
      printf("%s chirality sector: \n", intsign ? "negative" : "positive");
    if(max_iter == 70){
       * We need random start spinor fields, but they must be half zero,
       * that's why we apply the Projektor once
      for(i = 0; i < first_blocksize; i++) {
	     &lowvectors[(first_blocksize*intsign+i)*VOLUMEPLUSRAND],N2, intsign);

    /* NOTE(review): the head of the solver call is missing; only its
     * trailing arguments are visible below. */
	  shift, prec, blocksize, j_max, j_min, 
	  max_iter, blocksize, blockwise, v0dim, (complex*) &lowvectors[first_blocksize*intsign*VOLUMEPLUSRAND],
	  CG, solver_it_max,
	  threshold_min, decay_min, verbosity,
	  &converged, (complex*) &lowvectors[first_blocksize*intsign*VOLUMEPLUSRAND], 
	  &returncode, JD_MINIMAL, 1,

    /* Not all requested modes converged on the first attempt: retry with
     * twice the iteration budget.  How the same sector is revisited is not
     * visible in this listing. */
    if(converged != blocksize && max_iter == 70){
      if(g_proc_id == g_stdio_proc){
	printf("Restarting %s chirality sector with more iterations!\n", intsign ? "negative" : "positive");
      max_iter = 140;
    else {
      max_iter = 70;
      /* Save the already computed eigenvectors_ov */
      for(i = 0; i< first_blocksize; i++) {
	sprintf(filename, "eigenvector_of_D%s.%.2d.%s.%.4d",((intsign==0)?"plus":"minus"),i , conf_filename, nstore);

	  construct_writer(&writer, filename, 0);
	  /* todo write propagator format */
	  propagatorFormat = construct_paramsPropagatorFormat(64, 1);
	  write_propagator_format(writer, propagatorFormat);

	  /* NOTE(review): s and sqnorm are not assigned in the visible lines
	   * before being used here. */
	  write_spinor(writer, &s,NULL, 1, 64);
	  printf(" wrote eigenvector of overlap operator !!! | |^2 = %e \n",sqnorm);


#ifdef MPI
  etime = MPI_Wtime();
  if(g_proc_id == g_stdio_proc){
    printf("It took %f sec to determine the sector with zero modes, if any!\n", etime-atime);

  /*Compare the two lowest modes */
  /* "Equal" means the absolute difference is below 10% of the larger one. */
  absdifference = fabs(lowestmodes[0]-lowestmodes[first_blocksize]);
  if(absdifference < 0.1*max(lowestmodes[0],lowestmodes[first_blocksize])){
    /* They are equal within the errors */
    if(g_proc_id == g_stdio_proc){
      printf("Index is 0!\n");
      sprintf(filename, "eigenvalues_of_overlap_proj.%s.%.4d", conf_filename, nstore);
      /* NOTE(review): the fopen results in this function are used without a
       * visible NULL check, and no fclose is visible. */
      ifs = fopen(filename, "w");  
      printf("\nThe following lowest modes have been computed:\n");
      fprintf(ifs, "Index is 0\n\n");
      fprintf(ifs, "Sector with positive chirality:\n");
      /* Values are rescaled in place by 2*(1+ov_s) before being printed. */
      for(i = 0; i < first_blocksize; i++) {
	lowestmodes[i] = 2.*(1.+ov_s)*lowestmodes[i];
	fprintf(ifs, "%d %e positive\n", i, lowestmodes[i]);
	printf("%d %e positive\n", i, lowestmodes[i]);
      fprintf(ifs, "Sector with negative chirality:\n");
      for(i = 0; i < first_blocksize; i++) {
	lowestmodes[i+first_blocksize] = 2.*(1.+ov_s)*lowestmodes[i+first_blocksize];
	fprintf(ifs, "%d %e negative\n", i, lowestmodes[i+first_blocksize]);
	printf("%d %e negative\n", i, lowestmodes[i+first_blocksize]);
      /* Binary files, one per sector: count, index (0), then the modes.
       * NOTE(review): the element written is selected with intsign, not with
       * the loop variable k, so as far as the visible code shows both files
       * receive the same values -- confirm whether k was intended. */
      for(k = 0; k < 2; k++) {
	sprintf(filename, "eigenvalues_of_D%s.%s.%.4d", 
		k ? "minus" : "plus", conf_filename, nstore);
	ifs = fopen(filename, "w");
	fwrite(&first_blocksize, sizeof(int), 1, ifs);
	index = 0;
	fwrite(&index, sizeof(int), 1, ifs);
	for(i = 0; i < first_blocksize; i++) {
	  fwrite(&lowestmodes[((intsign+1)%2)*first_blocksize+i], sizeof(double), 1, ifs);
    /* they are not equal */
    /* determine the sector with not trivial topology */
    /* The sector with the smaller lowest mode is selected; the second
     * assignment is presumably the else branch (the `else` is missing). */
    if(lowestmodes[0] < lowestmodes[first_blocksize]){
      intsign = 0;
      intsign = 1;
    if(g_proc_id == g_stdio_proc){
      printf("Computing now up to %d modes in the sector with %s chirality\n", 
	     (*nr_of_eigenvalues_ov), intsign ? "negative" : "positive");

    /* Here we set the (absolute) precision to be  */
    /* such that we can compare to the lowest mode */
    /* in the other sector                         */

    prec = (lowestmodes[first_blocksize*((intsign+1)%2)])*1.e-1;

    eigenvalues_ov = (double*)malloc((*nr_of_eigenvalues_ov)*sizeof(double));

    /* Copy the already computed eigenvectors_ov */
    /* NOTE(review): the destination is indexed as eigenvectors_ov[i], while
     * other accesses use [i*VOLUMEPLUSRAND]; the two coincide only for i == 0
     * (first_blocksize is 1 here). */
    for(i = 0; i < first_blocksize; i++) { 
      assign(&eigenvectors_ov[i], &lowvectors[(first_blocksize*intsign+i)*VOLUMEPLUSRAND],N2);
      eigenvalues_ov[i] = lowestmodes[first_blocksize*intsign+i];

#ifdef MPI
    atime = MPI_Wtime();

    /* Stage 2: extend the set of modes in the chosen sector, three at a time. */
    blocksize = 3;
    j_min = 8; j_max = 16;
    converged = first_blocksize;
    for(i = first_blocksize; i < (*nr_of_eigenvalues_ov); i+=3) { 

      /* Shrink the last block so it does not run past the requested count. */
      if((i + blocksize) > (*nr_of_eigenvalues_ov) ) {
	blocksize = (*nr_of_eigenvalues_ov) - i;

      /* Fill up the rest with random spinor fields  */
      /* and project it to the corresponding sector  */
      for(v0dim = i; v0dim < i+blocksize; v0dim++){
	Proj(&eigenvectors_ov[v0dim*VOLUMEPLUSRAND], &eigenvectors_ov[v0dim*VOLUMEPLUSRAND],N2, intsign);
      v0dim = blocksize;
      returncode = 0;

      /* compute minimal eigenvalues */
#ifdef MPI
      /*      pjdher(VOLUME*sizeof(spinor)/sizeof(complex), VOLUMEPLUSRAND*sizeof(spinor)/sizeof(complex),
	     shift, prec, omega, n_omega, ev_tr,
	     i+blocksize, j_max, j_min, 
	     max_iterations, blocksize, blockwise, v0dim, (complex*)(&eigenvectors_ov[i*VOLUMEPLUSRAND]),
	     CG, solver_it_max,
	     threshold_min, decay_min, verbosity,
	     &converged, (complex*) eigenvectors_ov, eigenvalues_ov,
	     &returncode, JD_MINIMAL, 1, use_AV,
	    shift, prec, blocksize, j_max, j_min,
	    max_iter, blocksize, blockwise, v0dim, (complex*) &eigenvectors_ov[i*VOLUMEPLUSRAND],
	    CG, solver_it_max,
	    threshold_min, decay_min, verbosity,
	    &converged, (complex*) eigenvectors_ov,
	    &returncode, JD_MINIMAL, 1,
      /* Save eigenvectors_ov temporary    */
      /* in order to be able to restart */
      for (k=i; k < converged; k++){
	if(intsign == 0){
	  sprintf(filename, "eigenvector_of_Dplus.%.2d.%s.%.4d", k, conf_filename, nstore);
	  sprintf(filename, "eigenvector_of_Dminus.%.2d.%s.%.4d", k, conf_filename, nstore);
	/*	write_spinorfield(&eigenvectors_ov[k*VOLUMEPLUSRAND], filename);*/

      /* order the eigenvalues_ov and vectors */
      /* idx is set to the identity; the sort call is commented out. */
      for(k = 0; k < converged; k++) {
	idx[k] = k;
      /*      quicksort(converged, eigenvalues_ov, idx);*/

      /* Check whether the index is determined */
      /* A mode within 10% of the other sector's lowest mode is treated as
       * the first non-zero mode.  NOTE(review): the lines that update
       * `index` inside this loop are missing from the listing. */
      index = 0;
      for(k = 0; k < converged; k++) { 
	absdifference = fabs(lowestmodes[first_blocksize*((intsign+1)%2)] - eigenvalues_ov[k]);
	if(absdifference < 0.1*lowestmodes[first_blocksize*((intsign+1)%2)]) {
	  /* We have found the first non zero */
	  if(k < converged-1) {
	    determined = 1;
	  else {
	    blocksize = 1;
	    shift = eigenvalues_ov[converged-1];
	else {
      /* If we have determined the index or */
      /* hit the maximal number of ev       */
      if(determined == 1 || converged == (*nr_of_eigenvalues_ov)) {
      else if(g_proc_id == g_stdio_proc) {
	if(blocksize != 1) {
	  printf("Index %s (or equal) than %s%d, continuing!\n\n", 
		 intsign ? "lower" : "bigger", 
		 intsign ? "-" : "+", index);
	  fflush( stdout );
	else {
	  printf("Index is %s%d, one non zero is missing, continuing!\n\n", 
		 intsign ? "-" : "+", index);
	  fflush( stdout );

#ifdef MPI
    etime = MPI_Wtime();

    /* Save the eigenvectors_ov */
    /* eval[] gets the rescaled eigenvalue as its real part, imaginary part 0. */
    for(i = 0; i < converged; i++){
      eval[i].re = 2.*(1.+ov_s)*eigenvalues_ov[i];
      eval[i].im = 0.;
      if(intsign == 0){
	sprintf(filename, "eigenvector_of_Dplus.%.2d.%s.%.4d", i, conf_filename, nstore);
	sprintf(filename, "eigenvector_of_Dminus.%.2d.%s.%.4d", i, conf_filename, nstore);
      /*      write_spinorfield(&eigenvectors_ov[idx[i]*VOLUMEPLUSRAND], filename);*/

    /* Some Output */
    if(g_proc_id == g_stdio_proc) {
      printf("Index is %s%d!\n", intsign ? "-" : "+", index);
#ifdef MPI
      printf("Zero modes determined in %f sec!\n", etime-atime);
    /* NOTE(review): this test uses rank 0 where the rest of the function
     * uses g_stdio_proc -- confirm whether the difference is intended. */
    if(g_proc_id == 0) {
      sprintf(filename, "eigenvalues_of_overlap_proj.%s.%.4d", conf_filename, nstore);
      ifs = fopen(filename, "w");
      printf("\nThe following lowest modes have been computed:\n");
      fprintf(ifs, "Index is %s%d!\n\n", intsign ? "-" : "+", index);
      /* Text listing: the chosen sector prints eval[], the other sector
       * prints its rescaled lowestmodes[] entries. */
      for(k = 0; k < 2; k++) {
	if(k == intsign) {
	  for (i=0; i < converged; i++) {
	    fprintf(ifs, "%d %e %s\n", i, eval[i].re, intsign ? "negative" : "positive");
	    printf("%d %e %s\n", i, eval[i].re, intsign ? "negative" : "positive");
	else {
	  for(i = 0; i < first_blocksize; i++) {
	    lowestmodes[((intsign+1)%2)*first_blocksize+i] = 2.*(1.+ov_s)*lowestmodes[((intsign+1)%2)*first_blocksize+i];
	    fprintf(ifs, "%d %e %s\n", i, lowestmodes[((intsign+1)%2)*first_blocksize+i], intsign ? "positive" : "negative");
	    printf("%d %e %s\n", i, lowestmodes[((intsign+1)%2)*first_blocksize+i], intsign ? "positive" : "negative");
      /* The index is stored negated when the negative sector was chosen. */
      if(intsign != 0) signed_index = -index;
      else signed_index = index;
      /* Binary files, one per sector: count, signed index, then the modes.
       * For the chosen sector the loop starts at `index`, so fewer than
       * `converged` values follow the header when index > 0. */
      for(k = 0; k < 2; k++) {
	sprintf(filename, "eigenvalues_of_D%s.%s.%.4d", 
		k ? "minus" : "plus", conf_filename, nstore);
	ifs = fopen(filename, "w");
	if(k == intsign) {
	  fwrite(&converged, sizeof(int), 1, ifs);
	  fwrite(&signed_index, sizeof(int), 1, ifs);
	  for (i=index; i < converged; i++) {
	    fwrite(&eval[i].re, sizeof(double), 1, ifs);
	else {
	  fwrite(&first_blocksize, sizeof(int), 1, ifs);
	  fwrite(&signed_index, sizeof(int), 1, ifs);
	  for(i = 0; i < first_blocksize; i++) {
	    fwrite(&lowestmodes[((intsign+1)%2)*first_blocksize+i], sizeof(double), 1, ifs);

  switch_on_adaptive_precision = 0; 
  /* Free memory */
Example #6
/*
 * Ping-pong timing driver between two MPI ranks.
 *
 * NOTE(review): this listing is a fragment - several closing braces and at
 * least the MPI initialization call are not present in this view, so the
 * nesting described in the comments is inferred from indentation only.
 *
 * For every message length in k[] the exchange loop is timed maxTest times
 * with MPI_Wtime(); the smallest elapsed time is kept in tLoop and printed.
 */
int main(int argc, char** argv) {
  const int PING_PONG_LIMIT = 10;
  double t_start, t_end, t_total, tLoop, t_tick;
  /* Local prototype; normally supplied by the MPI header (TODO confirm it is needed). */
  double MPI_Wtime(void);
  int tests, maxTest = 10, i;
  /* Message lengths in ints: 4^0 .. 4^8. */
  int k[9] = {1,4,16,64,256,1024,4096,16384,65536};

  // Initialize the MPI environment
  // NOTE(review): no MPI_Init call is visible in this view.
  // Find out rank, size
  int world_rank;
  MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
  int world_size;
  MPI_Comm_size(MPI_COMM_WORLD, &world_size);

  // Exactly two processes are required: any other world size aborts the job
  if (world_size != 2) {
    fprintf(stderr, "World size must be two for %s\n", argv[0]);
    MPI_Abort(MPI_COMM_WORLD, 1);

  // Get the name of the processor
  char processor_name[MPI_MAX_PROCESSOR_NAME];
  int name_len;
  MPI_Get_processor_name(processor_name, &name_len);

  // Outer loop over the message lengths stored in k[] (1, 4, 16, ..., 65536)
  for (int p = 0; p < sizeof(k)/sizeof(k[0]); p++) {
  int A[k[p]]; // Vector of integers (variable-length array of k[p] ints)

  // Populate A with ints: A[i] = i
  for (i = 0; i < k[p]; i++) {
    A[i] = i;

  // tLoop tracks the minimum elapsed time over the maxTest repetitions
  tLoop = 1.0e10;
  for (tests = 0; tests < maxTest; tests++) { // begin timing
  t_start = MPI_Wtime();
  // t_tick = MPI_Wtick();
  int ping_pong_count = 0;
  // With two ranks the partner of 0 is 1 and vice versa
  int partner_rank = (world_rank + 1) % 2;
  while (ping_pong_count < PING_PONG_LIMIT) {
    // The rank whose id matches the parity of the counter is the sender
    if (world_rank == ping_pong_count % 2) {
      // Increment the ping pong count before you send it
      // NOTE(review): no increment of ping_pong_count is visible in this view.
      // NOTE(review): the buffer is the address of a single int but the count
      // is k[p]; for k[p] > 1 this transfers memory beyond ping_pong_count.
      // The array A is filled above but is not the buffer used here - confirm.
      MPI_Send(&ping_pong_count, k[p], MPI_INT, partner_rank, 0, MPI_COMM_WORLD);
      printf("World rank %d sent and incremented ping_pong_count %d to partner rank %d\n",
             world_rank, ping_pong_count, partner_rank);
             // NOTE(review): sizeof yields size_t; %lu is only correct where
             // size_t is unsigned long (%zu is the portable specifier).
             printf("The size of A is: %lu\n", sizeof(A));
             printf("P is: %lu\n", sizeof(k));
    } else {
      // Same buffer/count concern as for the send above
      MPI_Recv(&ping_pong_count, k[p], MPI_INT, partner_rank, 0, MPI_COMM_WORLD,
      printf("World rank %d received ping_pong_count %d from partner rank %d\n",
             world_rank, ping_pong_count, partner_rank);
// Stop the clock for this repetition and keep the fastest one
t_end = MPI_Wtime();
t_total = t_end - t_start;
if (t_total < tLoop) tLoop = t_total;

  // Report the best time for the current message length
  printf("That took %f seconds\n", tLoop);
  printf("Number of processes in MPI_COMM_WORLD: %d\n", world_size);
  printf("Name of processor %s\n", processor_name);
  printf("The size of A is: %lu\n", sizeof(A));
  printf("P is: %lu\n", sizeof(k));
  } // ends the outer loop over p (message lengths)


  // return 0;
/*
 * Distributed pattern search driver.
 *
 * Usage as read from the code: argv[1] = target file path, argv[2] = pattern,
 * argv[3] = number of chunks handed to each worker rank.
 *
 * The target is cut into overlapping chunks; rank 0 answers requests from
 * the other ranks by sending them chunks, the workers call do_search() on
 * each chunk, and the per-rank counts are summed onto rank 0 with MPI_Reduce.
 *
 * NOTE(review): this listing is a fragment - braces and some statements are
 * not present in this view, so branch extents are inferred from indentation.
 */
int32_t main(int32_t argc, char *argv[])
	int32_t rankID = 0, nRanks = 1;
	char rankName[MAX_STRING_LENGTH];
	gethostname(rankName, MAX_STRING_LENGTH);
	MPI_Init(&argc, &argv);
	MPI_Comm_rank(MPI_COMM_WORLD, &rankID);
	MPI_Comm_size(MPI_COMM_WORLD, &nRanks);

	/* --- Target: every rank queries the file size; the indented lines under
	 * the rankID == 0 test read the file contents and time the read. --- */
	char *target_path = argv[1], *target = NULL;
	int64_t target_length = 0;
	target_length = get_filesize(target_path);
	if(target_length < 0)
		printf("\nError: Cannot read target file [ %s ]\n", target_path);
	if(rankID == 0)
		printf("- Read target: [ %s ]\n", target_path);
		target = (char*)malloc(sizeof(char)*target_length);
		double read_time = 0;
		/* subtract-then-add idiom: read_time ends up as the elapsed interval */
		read_time -= MPI_Wtime();
		read_targetfile(target, target_length, target_path);
		read_time += MPI_Wtime();
		/* NOTE(review): %ld with int64_t is only correct where long is 64 bits. */
		printf("- Target length: %ld (read time: %lf secs)\n", target_length, read_time);

	/* --- Pattern and its two shift tables (BCS and GSS are filled by
	 * make_BCS / make_GSS, defined elsewhere). --- */
	char *pattern = argv[2];
	int64_t pattern_length = 0;
	if(pattern == NULL)
		/* NOTE(review): this prints a NULL pointer through %s. */
		printf("\nError: Cannot read pattern [ %s ]\n", pattern);
	pattern_length = strlen(pattern);
	if(rankID == 0)
		printf("- Pattern: [ %s ]\n", pattern);
		printf("- Pattern length: %ld\n", pattern_length);
	int32_t* BCS = (int32_t*)malloc(ALPHABET_LEN * sizeof(int32_t));
	int32_t* GSS = (int32_t*)malloc(pattern_length * sizeof(int32_t));;
	make_BCS(BCS, pattern, pattern_length);
	make_GSS(GSS, pattern, pattern_length);

	int64_t found_count = 0;
	double search_time = 0;
	if(rankID == 0)
		search_time -= MPI_Wtime();

	int64_t mpi_found_count = 0;
	char* chunk = NULL;
	if(argv[3] == NULL)
		printf("\nError: Check chunk size [ %s ]\n", argv[3]);

	if(rankID == 0) printf("\ttarget_length = %ld\n", target_length);
	int64_t nChunksPerRank = atoi(argv[3]);
	// Decide how many string chunks each rank is given.
	int64_t nTotalChunks = (nRanks-1) * nChunksPerRank; 
	// Rank 0 does not search, hence nRanks - 1.
	// The string is split into nTotalChunks pieces in total.
	if(rankID == 0) printf("\tnTotalChunks = %ld\n", nTotalChunks);
	int64_t overlap_length = (pattern_length - 1) * (nTotalChunks - 1);
	// The last chunk has no overlapping part, hence nTotalChunks - 1.
	// In the worst case the first character of the pattern lands on one core
	// and the remaining characters on another, hence pattern_length - 1.

	if(rankID == 0) printf("\toverlap_length = %ld\n", overlap_length);
	int64_t quotient = (target_length + overlap_length) / nTotalChunks; 
	// To guard against that worst case, pattern_length - 1 is added per chunk,
	// so the total becomes target_length + overlap_length; it is divided by
	// nTotalChunks so that nChunksPerRank chunks can be dealt to each core.

	if(rankID == 0) printf("\tquotient = %ld\n", quotient);
	int64_t remainder = (target_length + overlap_length) - (quotient * nTotalChunks);
	// The division may not be exact, so the remainder is handled separately.
	if(rankID == 0) printf("\tremainder = %ld\n\n", remainder);

	int64_t chunkID = 0;
	int64_t* chunk_length = (int64_t*)malloc((nTotalChunks+1)*sizeof(int64_t)); 
	int64_t* chunk_start_idx = (int64_t*)malloc((nTotalChunks+1)*sizeof(int64_t)); 
	// One extra entry (nTotalChunks + 1) is allocated, for the remainder case.
	int64_t i;
	// Every chunk gets quotient characters; the first remainder chunks get one more.
	for(i=0; i<nTotalChunks; i++)
		chunk_length[i] = quotient;
	for(i=0; i<remainder; i++)
		chunk_length[i] += 1;
	chunk_start_idx[0] = 0;
	for(i=1; i<nTotalChunks; i++)
		chunk_start_idx[i] = chunk_start_idx[i-1] + chunk_length[i-1] - (pattern_length-1); 
	// Subtracting (pattern_length - 1) means that every chunk except the first
	// starts that many characters before the end of the previous chunk, so it
	// always re-reads the tail of the previous chunk. (The original comment
	// speaks of 4 characters; presumably a pattern of length 5 - TODO confirm.)

	chunk_start_idx[nTotalChunks] = 0;
	chunk_length[nTotalChunks] = 0;

	// To mark that the chunks are exhausted, the start index and the length of
	// the extra entry at index nTotalChunks are both set to 0.

	MPI_Request MPI_req[2];
	MPI_Status MPI_stat[2];
	int32_t MPI_tag = 0;
	int32_t request_rankID = -1;
	/* --- Rank 0: dispatcher loop. --- */
	if(rankID == 0)
		int64_t nFinishRanks = 0;
		while(nFinishRanks < nRanks-1)
			MPI_Recv(&request_rankID, 1, MPI_INT32_T, MPI_ANY_SOURCE, MPI_tag, MPI_COMM_WORLD, &MPI_stat[0]);
			// Rank 0 waits for a request sent by any other rank.
			// The received value is the id of the rank that is ready for work.
			MPI_Isend(&target[chunk_start_idx[chunkID]], chunk_length[chunkID], MPI_CHAR, request_rankID, chunkID, MPI_COMM_WORLD, &MPI_req[1]);
			// The chunk_length assigned to each chunk was computed above.
			// Note that chunkID is passed as the message tag.
			// The whole file was read into target; the start position of the
			// chunk is selected here and the data is sent back to the requester.
			// Note that only chunk_length[chunkID] characters from that
			// position are sent.
			// This send is what lets the requesting rank start searching.
			// NOTE(review): no wait/test on MPI_req[1] is visible in this view.
			printf("\trequest_rankID = %d\n", request_rankID);
			printf("\tchunkID = %ld\n", chunkID);
			printf("\tchunk_start_idx[chunkID] = %ld\n", chunk_start_idx[chunkID]);
			printf("\ttarget[chunk_start_idx[chunkID] = %c\n\n", target[chunk_start_idx[chunkID]]);
			if(chunkID < nTotalChunks)
		/* --- Other ranks: request / receive / search loop (the branch
		 * opening is not visible in this view). --- */
		chunk = (char *)malloc(chunk_length[0] * sizeof(char));
		// Remember that chunk is a character string.
		int64_t chunk_found_count = 0;
		int64_t call_count = 0;
		while(chunkID < nTotalChunks)
			// chunkID is 0 when it is declared.
			MPI_Isend(&rankID, 1, MPI_INT32_T, 0, MPI_tag, MPI_COMM_WORLD, &MPI_req[0]);
			// Tell rank 0 which rank is asking for work.
			// MPI_req[0] tracks this ready-to-search request.
			MPI_Recv(chunk, chunk_length[0], MPI_CHAR, 0, MPI_ANY_TAG, MPI_COMM_WORLD, &MPI_stat[1]);
			// Chunk sizes come from an even split plus 1 for the first
			// remainder chunks, so chunk_length[0] is the largest size and is
			// used as the fixed receive size.
			// The tag of the received message carries the chunk id.
			chunkID = MPI_stat[1].MPI_TAG;
			// NOTE(review): chunkID is int64_t but is printed with %d, and chunk
			// is printed with %s although no terminating NUL is written here.
			printf("\trank = %d chunk = %s chunkID = %d\n", rankID, chunk, chunkID);
			if(chunkID < nTotalChunks)
				// NOTE(review): target_length is passed as the second argument
				// although only a chunk was received - confirm against do_search.
				chunk_found_count = do_search(chunk, target_length, 0, chunk_length[chunkID], pattern, pattern_length, BCS, GSS);
				// NOTE(review): this tests found_count, not chunk_found_count - confirm.
				if(found_count < 0)
				mpi_found_count += chunk_found_count;
		printf("- [%02d: %s] call_count: %ld\n", rankID, rankName, call_count);
	MPI_Reduce(&mpi_found_count, &found_count, 1, MPI_INT64_T, MPI_SUM, 0, MPI_COMM_WORLD);
	// Sum the mpi_found_count of every process into found_count on rank 0.


	if(rankID == 0)
		search_time += MPI_Wtime();
		printf("- Found_count: %ld\n", found_count);
		printf("- Time: %lf secs\n", search_time);


	return 0;
Example #8
/*
 * LAS metadata scan and global grid setup (C++: uses new / delete[]).
 *
 * Visible steps: list the input files, give each rank a contiguous slice of
 * them, accumulate local min/max coordinates and point counts, reduce those
 * over all ranks, then build a global Grid and a BlockScheme from the extent.
 *
 * NOTE(review): this listing is a fragment - closing braces and some
 * statements are missing in this view (see the garbled for-loop header).
 */
int main(int argc, char* argv[]) {
	/*   Parameter declaration                          */
	// Index params
	// NOTE(review): only k is initialized by this declaration.
	int i,j,k = 0;
	// Directory path params
	char input_path[1024] = "/projects/isgs/lidar/champaign/las";
	char scratch_path[1024] = "/gpfs_scratch/ncasler/data/tmp";
	char out_path[1024] = "";
	char tmp_path[1024] = "";

	// MPI Params
	int world_size, world_rank, mpi_err;
	MPI_Comm world_comm = MPI_COMM_WORLD;
	MPI_Info info = MPI_INFO_NULL;
	MPI_Status status;
	MPI_Request request;
	MPI_Init(&argc, &argv);
	MPI_Comm_size(world_comm, &world_size);
	MPI_Comm_rank(world_comm, &world_rank);
	// Ask MPI to return error codes instead of aborting
	MPI_Errhandler_set(world_comm, MPI_ERRORS_RETURN);
	double starttime, endtime;
	// Set memory limit for grid allocations
	// NOTE(review): 8000000000 does not fit in a 32-bit long.
	long buffer_lim = 8000000000;
	// Limit expressed as a number of Pixel elements
	size_t max_size = buffer_lim / sizeof(Pixel);
	// File specific parameters
	// NOTE(review): only file_end is initialized by this declaration.
	int n_files, file_off, file_blk, file_end = 0;
	char ext[5] = ".las";

	// Metadata
	// Minima start at +DBL_MAX and maxima at -DBL_MAX so any real value replaces them
	double g_mins[3] = {DBL_MAX,DBL_MAX,DBL_MAX}; // Global min coord
	double g_maxs[3] = {-DBL_MAX,-DBL_MAX,-DBL_MAX}; // Global max coord
	double l_mins[3] = {DBL_MAX,DBL_MAX,DBL_MAX}; // Local min coord
	double l_maxs[3] = {-DBL_MAX,-DBL_MAX,-DBL_MAX}; // Local max coord

	// File specific params
	FileCollection *g_files = NULL; // Global file list
	int g_n_files = 0; // Global file count
	int l_n_files = 0; // Local file count
	BBox* g_file_bbox = NULL; // Global array of file bboxes
	BBox* l_file_bbox = NULL; // Local array of file bboxes
	// Grid specific params
	struct Point* origin = new Point();
	DType datatype = DT_Float32;
	struct Grid* g_grid = NULL; // Global grid
	int g_cols = 0; // Global grid column count
	int g_rows = 0; // Global grid row count
	//Specify resolution > should be parameterized in prod
	double res = 5.0f;
	// Block scheme params
	struct BlockScheme* blk_scheme = NULL;
	int *l_blks = NULL; // Local block array
	int *g_blks = NULL; // Global block array
	int l_n_blks = 0; // Local block count
	int g_n_blks = 0; // Global block count
	int *blk_n_files = NULL; // Array holding block specific file counts
	// Las specific params
	las_file las;
	long long g_n_pts = 0; // Global point count
	long long l_n_pts = 0; // Local point count

	/*       Begin first file scan                    */
	starttime = MPI_Wtime();
	g_files = new FileCollection(&input_path[0], &ext[0]);
	// Check the number of available LAS files
	g_n_files = g_files->countFiles();
	g_file_bbox = new BBox[g_n_files];
	// NOTE(review): l_n_files is still 0 here (it is assigned further down),
	// so this allocates a zero-length array that is later indexed with j.
	l_file_bbox = new BBox[l_n_files];
	// Create tif output dir
	sprintf(&tmp_path[0], "%s/blocks", scratch_path);
	// NOTE(review): the format has two conversions (%i, %s) but only tmp_path
	// is passed; a rank argument appears to be missing.
	printf("[%i] Using tmp dir: %s\n", tmp_path);
	struct stat st = {0};
	// Create the directory only when stat reports failure
	if (stat(tmp_path, &st) == -1) 
		mkdir(tmp_path, 0700);
	// Set file Block size
	// Files per rank, rounded up; each rank starts at file_blk * world_rank
	file_blk = ceil((float)g_n_files /(float)world_size);
	file_off = file_blk * world_rank;
	// NOTE(review): an else between the next two assignments is presumably
	// missing in this view. Also, the first assignment yields a count while the
	// second yields an end index, and file_end is used as an end index below.
	if (file_off + file_blk > g_n_files)
		file_end = g_n_files - file_off;
		file_end = file_off + file_blk;

	// Read subset of file paths from dir
	l_n_files = g_files->getMetadata(file_off, file_end);

	// Scan metadata from files
	// NOTE(review): the loop header below is garbled in this view; the call
	// that opens each file into las is not recoverable from here.
	for (i = file_off; i < file_end; i++) {>fileList[i]);
		l_n_pts = l_n_pts + (long long) las.points_count();
		compareMin(l_mins, las.minimums());
		compareMax(l_maxs, las.maximums());
		double *tmp_min = las.minimums();
		double *tmp_max = las.maximums();
		// j is the index of this file within the local slice
		j = i - file_off;
		l_file_bbox[j].updateMin(tmp_min[0], tmp_min[1], tmp_min[2]);
		l_file_bbox[j].updateMax(tmp_max[0], tmp_max[1], tmp_max[2]);
	endtime = MPI_Wtime();
	printf("[%i] Metadata gathered in %f seconds\n", world_rank, 
			endtime - starttime);
	/*         Gather global min/max point count         */
	/*                 COMMUNICATIONS                    */
	// Element-wise min / max of the three coordinates and sum of point counts
	MPI_Allreduce(&l_mins[0], &g_mins[0], 3, MPI_DOUBLE, MPI_MIN, world_comm);
	MPI_Allreduce(&l_maxs[0], &g_maxs[0], 3, MPI_DOUBLE, MPI_MAX, world_comm);
	MPI_Allreduce(&l_n_pts, &g_n_pts, 1, MPI_LONG_LONG, MPI_SUM, world_comm);
	// Gather the bounding box values for each file
	printf("[%i] Gather bbox values\n", world_rank);
	// NOTE(review): the buffers are addresses of the pointer variables rather
	// than the arrays, and the receive count / receive type arguments are
	// l_n_files and a byte count where a count and an MPI_Datatype are
	// expected - verify against the MPI_Allgather signature.
	MPI_Allgather(&l_file_bbox, l_n_files*sizeof(BBox), MPI_BYTE, &g_file_bbox, l_n_files, l_n_files*sizeof(BBox), world_comm);
	double io_time = MPI_Wtime();
	printf("[%i] Communication finished in %f seconds\n", world_rank, io_time - endtime);

	// Create origin for global grid
	// NOTE(review): the second argument is g_maxs[0]; presumably a y value
	// (index 1) was intended - confirm against Point::update.
	origin->update(g_mins[0], g_maxs[0], g_mins[2]);
	// Grid dimensions: extent divided by the resolution, rounded up
	g_cols = (int) ceil((g_maxs[0] - g_mins[0]) / res);
	g_rows = (int) ceil((g_maxs[1] - g_mins[1]) / res);
	// Create global grid
	g_grid = new Grid(origin, g_cols, g_rows, datatype, res, res);
	// Create global block scheme
	blk_scheme = new BlockScheme(g_grid, max_size, datatype, world_size);
	l_n_blks = blk_scheme->getBlockCount(world_rank); // Local block count
	g_n_blks = blk_scheme->cols * blk_scheme->rows; // Global block count
	l_blks = blk_scheme->getBlocks(world_rank); // Local block id array
	// NOTE(review): five %i conversions but four arguments; the leading rank
	// argument appears to be missing.
	printf("[%i] Block Total: %i,Local: %i, first: %i, last: %i \n", g_n_blks, l_n_blks, l_blks[0], l_blks[l_n_blks-1]);
	/*                 Get file list for block                 */

	/*                 Read blocks                             */

	/*                Write                                    */

	/*        Clean up                                         */
	// NOTE(review): only the two bbox arrays are released in the visible code;
	// no MPI_Finalize call is visible in this view.
	printf("[%i]Cleaning up\n", world_rank);
	delete[] l_file_bbox;
	delete[] g_file_bbox;

	return 0;
Example #9
File: 10.c Project: arkuzmin/ppp
/*
 * Master/worker matrix multiplication with tracing hooks (MPI_Pcontrol).
 *
 * Visible behaviour: rank 0 allocates a, b, c and a reference result c0,
 * computes c0 sequentially, broadcasts b, sends slices of a to the other
 * ranks, collects slices of c and compares c with c0. The later part of the
 * listing receives a slice of a, and sends a slice of c back to rank 0.
 *
 * NOTE(review): this listing is a fragment - braces, the else branch opening,
 * MPI initialization and several loop bodies are missing in this view. The
 * dimensions am, an and bm are defined elsewhere.
 */
int main (int argc,char **argv)
   double time,time_seq, time_par;
   int rank, size;

   char *tracefile;

   /* Trace file name comes from the TVTRACE environment variable; the second
    * MPI_Pcontrol call below presumably is the fallback branch (its else is
    * not visible in this view). */
   tracefile = getenv("TVTRACE");
   if( tracefile != NULL ){
      printf( "tv tracefile=%s\n", tracefile );
      MPI_Pcontrol(TRACEFILES, NULL, tracefile, 0);      
      MPI_Pcontrol(TRACEFILES, NULL, "trace", 0);
   MPI_Pcontrol(TRACELEVEL, 1, 1, 1);
   MPI_Pcontrol(TRACENODE, 1000000, 1, 1);

   MPI_Comm_rank (MPI_COMM_WORLD,&rank);
   MPI_Comm_size (MPI_COMM_WORLD,&size);

   /* ---- rank 0 ---- */
   if( !rank ){
      double *a,*b,*c, *c0;
      int i,i1,j,k;
      /* NOTE(review): no assignment to this ann is visible in the rank 0 part
       * of this view although it is used in the send and receive loops. */
      int ann;
      MPI_Status *st;
      MPI_Request *rq,rq1;
      /* One request / status slot per non-root rank */
      rq = (MPI_Request*) malloc( (size-1)*sizeof(MPI_Request) );
      st = (MPI_Status*) malloc( (size-1)*sizeof(MPI_Status) );

//      printf("[%d]ann=%d\n", rank, ann );

      /* a holds am*an doubles, b am*bm, c and c0 an*bm each.
       * NOTE(review): no initialization of a or b is visible in this view. */
      a=(double*) malloc(am*an*sizeof(double));
      b=(double*) malloc(am*bm*sizeof(double));
      c=(double*) malloc(an*bm*sizeof(double));
      printf( "Data ready [%d]\n", rank );
      c0 = (double*)malloc(an*bm*sizeof(double));

      /* Sequential reference product: c0[i][j] = sum over k of a[i][k]*b[k][j] */
      time = MPI_Wtime();  
      for (i=0; i<an; i++)
         for (j=0; j<bm; j++)
            double s = 0.0;
            for (k=0; k<am; k++)
              s+= a[i*am+k]*b[k*bm+j];
            c0[i*bm+j] = s;
      time = MPI_Wtime() - time;
      printf("Time seq[%d] = %lf\n", rank, time );
      time_seq = time;

      MPI_Barrier( MPI_COMM_WORLD );

      MPI_Bcast( b, am*bm, MPI_DOUBLE, 0, MPI_COMM_WORLD);
      printf( "Data Bcast [%d]\n", rank );

      /* Send ann*am doubles of a to each rank j >= 1, advancing i1 by the same amount */
      for( i1=0, j=1; j<size; j++, i1+=ann*am ){
         /* NOTE(review): a pointer is printed with %016x; %p is the matching specifier. */
         printf( "Data to Send [%d] %016x[%4d] =>> %d\n", rank, a+i1, i1, j );
         MPI_Isend( a+i1, ann*am, MPI_DOUBLE, j, 101, MPI_COMM_WORLD, &rq1 );
         /* The request is released immediately; completion is not waited for here */
         MPI_Request_free( &rq1 ); 
         printf( "Data Send [%d] =>> %d\n", rank, j );
      printf( "Data Send [%d]\n", rank );
      /* NOTE(review): a one-element send from rank 0 to rank 0; no matching
       * receive is visible in this view. */
      MPI_Isend( a+i1, 1, MPI_DOUBLE, 0, 101, MPI_COMM_WORLD, &rq1 );
      MPI_Request_free( &rq1 ); 
      printf( "Data Send [%d] =>> %d\n", rank, j );

          double s=0.0;

      printf( "Job done  [%d]\n", rank );
      /* Post one receive per non-root rank for its slice of c */
      for( i1=0, j=1; j<size; j++, i1+=(ann*bm) ){
         printf( "Data to Recv [%d] %016x[%4d] =>> %d\n", rank, c+i1, i1/bm, j );
         /* NOTE(review): the count is ann*am while the offset advances by
          * ann*bm and the sender below sends ann*bm - confirm. */
         MPI_Irecv( c+i1, ann*am, MPI_DOUBLE, j, 102, MPI_COMM_WORLD, rq+(j-1) );
      MPI_Waitall( size-1, rq, st );
      printf("time [%d]=%12.8lf\n",rank,time);
      time_par = time;

      /* Verify the collected result against the sequential reference.
       * NOTE(review): exact != comparison of doubles; the loop header over i
       * is not visible in this view. */
      printf( "Data collected [%d]\n", rank );
      int ok = 1;
        if( c[i] != c0[i] ){
           ok = 0;
           printf( "Fail [%d %d] %lf != %lf\n", i/bm, i%bm, c[i], c0[i] );
      if( ok ){
        printf( "Data verifeid [%d] time = %lf\n", rank, time );
        printf( "SpeedUp S(%d) = %14.10lf\n", size, time_seq/time_par );
        printf( "Efitncy E(%d) = %14.10lf\n", size, time_seq/(time_par*size) );
      /* ---- presumably the non-root branch from here on (its opening is not
       * visible in this view) ---- */
      int ann;
      double *a,*b,*c;
      MPI_Status st;
      int i,j,k;

      MPI_Pcontrol(TRACEEVENT, "entry", 0, 0, "");

      /* Rows per rank: an / size, rounded up */
      ann= an/size + ((an%size)?1:0);
//      if(rank==1)
//        printf("[%d]ann=%d = %d / %d \n", rank, ann, an, size );
      /* NOTE(review): the allocations announced here are not visible in this view. */
      printf( "Mem allocated [%d]\n", rank );

      MPI_Barrier( MPI_COMM_WORLD );
      MPI_Pcontrol(TRACEEVENT, "exit", 0, 0, "");
      time = MPI_Wtime();

      MPI_Pcontrol(TRACEEVENT, "entry", 1, 0, "");
      printf( "Data Bcast [%d]\n", rank );
      /* Receive this rank slice of a (tag 101) from rank 0 */
      MPI_Recv( a, ann*am, MPI_DOUBLE, 0, 101, MPI_COMM_WORLD, &st);
      printf( "Data Recv [%d]\n", rank );
      MPI_Pcontrol(TRACEEVENT, "exit", 1, 0, "");
      MPI_Pcontrol(TRACEEVENT, "entry", 2, 0, "");
      /* Local product loop; the inner statements are missing in this view */
      for( i=0; i<ann; i++ )
            double s=0.0;
            for( k=0; k<am; k++ ){
                  printf( "c[%d<%d %d] = %lf\n", i,ann,j, s );
      printf( "Job done  [%d]\n", rank );
      MPI_Pcontrol(TRACEEVENT, "exit", 2, 0, "");

      MPI_Pcontrol(TRACEEVENT, "entry", 3, 0, "");
      /* Return ann*bm doubles of c (tag 102) to rank 0 */
      MPI_Send( c, ann*bm,  MPI_DOUBLE, 0, 102, MPI_COMM_WORLD);
      printf( "Data returned [%d]\n", rank );
      MPI_Pcontrol(TRACEEVENT, "exit", 3, 0, "");

      printf("time [%d]=%12.8lf\n",rank,time);
   return 0;
Example #10
/*
 * Small two-process matrix example (C++: prints with std::cout).
 *
 * NOTE(review): this listing is a fragment - braces, the loop bodies and the
 * sending side of the communication are missing in this view, so only the
 * visible statements are described.
 */
int main(int argc, char *argv[])

const int PNUM = 2; //number of processes
const int MSIZE = 4; //matrix size

int rank,value,size;
int namelen;

double time1,time2;


MPI_Init(&argc, &argv);

// Timing starts right after MPI initialization
time1 = MPI_Wtime();

char processor_name[MPI_MAX_PROCESSOR_NAME];

MPI_Comm_size(MPI_COMM_WORLD, &size);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
MPI_Status status;

int B[MSIZE];
int C[MSIZE];


		int a=0;
        // Double loop over an MSIZE x MSIZE index space; body not visible in this view
        for(int i=0;i<MSIZE;i++)
            for(int j=0;j<MSIZE;j++)

	int F=0;
    // Rows handled by this rank start at rank*MSIZE/PNUM.
    // NOTE(review): the row count 2 is hard-coded; it equals MSIZE/PNUM only
    // for the current constants.
    for(int i=rank*MSIZE/PNUM;i<(rank*MSIZE/PNUM)+2;i++)
		 for(int j=0;j<MSIZE;j++)

	time2 = MPI_Wtime();
	std::cout<<"\nElasped Time: "<<time2-time1;

    // Collect MSIZE/2 ints from each rank 1..PNUM-1 into its slot of C
    // (MSIZE/2 equals MSIZE/PNUM only while PNUM is 2)
    for(int i=1;i<PNUM;i++)  MPI_Recv(&C[i*MSIZE/PNUM],MSIZE/2,MPI_INT,i,0,MPI_COMM_WORLD,&status);

        // Print the assembled result vector
        for(int i=0;i<MSIZE;i++)
		std::cout<<C[i]<<" ";



return 0;
/*
 * Main evolution loop of the node server.
 *
 * Visible behaviour: optionally shifts velocities when scf is set, regrids,
 * then loops while current_time < opts.stop_time: writes output when the
 * rotation count crosses the next output interval, obtains dt from
 * timestep_driver(), derives rotation diagnostics for the DWD problem, logs
 * one line per step, regrids every refinement_freq() steps and records
 * benchmark timings with MPI_Wtime(). A final output is written at the end.
 *
 * NOTE(review): this listing is a fragment - many closing braces and some
 * statements are missing in this view.
 */
void node_server::start_run(bool scf) {
	integer output_cnt;

	if (!hydro_on) {
	printf("%e %e\n", grid::get_A(), grid::get_B());
	if (scf) {
		printf("Adjusting velocities:\n");
		auto diag = diagnostics();
		space_vector dv;
		// dv = -(summed s_x, s_y, s_z) / (summed rho), component by component
		// NOTE(review): the use of dv is not visible in this view.
		dv[XDIM] = -diag.grid_sum[sx_i] / diag.grid_sum[rho_i];
		dv[YDIM] = -diag.grid_sum[sy_i] / diag.grid_sum[rho_i];
		dv[ZDIM] = -diag.grid_sum[sz_i] / diag.grid_sum[rho_i];

	int ngrids = regrid(me.get_gid(), false);

	real output_dt = opts.output_dt;

	printf("OMEGA = %e, output_dt = %e\n", grid::get_omega(), output_dt);
	// t aliases current_time
	real& t = current_time;
	integer step_num = 0;

	auto fut_ptr = me.get_ptr();
	node_server* root_ptr = fut_ptr.get();

	// Index of the next output, derived from the current rotation count
	output_cnt = root_ptr->get_rotation_count() / output_dt;
	hpx::future<void> diag_fut = hpx::make_ready_future();
	hpx::future<void> step_fut = hpx::make_ready_future();
	profiler_output (stdout);
	real bench_start, bench_stop;
	while (current_time < opts.stop_time) {
		auto time_start = std::chrono::high_resolution_clock::now();
		// Time for another output once the rotation count reaches output_cnt intervals
		if (root_ptr->get_rotation_count() / output_dt >= output_cnt) {
			//	if (step_num != 0) {

			char* fname;

			// NOTE(review): the asprintf return value is used directly as the
			// condition; the bodies of both ifs are not visible in this view.
			if (asprintf(&fname, "X.%i.chk", int(output_cnt))) {
			if (asprintf(&fname, "X.%i.silo", int(output_cnt))) {
			output(fname, output_cnt, false);
			//	SYSTEM(std::string("cp *.dat ./dat_back/\n"));
			//	}

		// Benchmark clock starts on the first step
		if (step_num == 0) {
			bench_start = MPI_Wtime();

		//	break;
		// Run the timestep driver asynchronously and wait for the resulting dt
		auto ts_fut = hpx::async([=]() {return timestep_driver();});
		real dt = ts_fut.get();
		real omega_dot = 0.0, omega = 0.0, theta = 0.0, theta_dot = 0.0;
		if (opts.problem == DWD) {
			auto diags = diagnostics();

			// Separation vector between secondary and primary centres of mass
			// (x and y components) and its time derivative
			const real dx = diags.secondary_com[XDIM] - diags.primary_com[XDIM];
			const real dy = diags.secondary_com[YDIM] - diags.primary_com[YDIM];
			const real dx_dot = diags.secondary_com_dot[XDIM] - diags.primary_com_dot[XDIM];
			const real dy_dot = diags.secondary_com_dot[YDIM] - diags.primary_com_dot[YDIM];
			// Angle of the separation vector and its rate relative to omega
			theta = atan2(dy, dx);
			omega = grid::get_omega();
			theta_dot = (dy_dot * dx - dx_dot * dy) / (dx * dx + dy * dy) - omega;
			// w0 is 100 times the current omega
			const real w0 = grid::get_omega() * 100.0;
			const real theta_dot_dot = (2.0 * w0 * theta_dot + w0 * w0 * theta);
			omega_dot = theta_dot_dot;
			omega += omega_dot * dt;
//		omega_dot += theta_dot_dot*dt;
		double time_elapsed = std::chrono::duration_cast<std::chrono::duration<double>>(std::chrono::high_resolution_clock::now() - time_start).count();
		// Append one line per step to step.dat (the call wrapping this lambda
		// is not visible in this view).
		// NOTE(review): rotational_time is not declared in the visible code.
		step_fut =
				[=]() {
					FILE* fp = fopen( "step.dat", "at");
					fprintf(fp, "%i %e %e %e %e %e %e %e %e %i\n", int(step_num), double(t), double(dt), time_elapsed, rotational_time, theta, theta_dot, omega, omega_dot, int(ngrids));
		printf("%i %e %e %e %e %e %e %e %e\n", int(step_num), double(t), double(dt), time_elapsed, rotational_time, theta, theta_dot, omega, omega_dot);

//		t += dt;

		// Regrid every refinement_freq() steps
		if (step_num % refinement_freq() == 0) {
			ngrids = regrid(me.get_gid(), false);
			FILE* fp = fopen("profile.txt", "wt");
			//		set_omega_and_pivot();
			bench_stop = MPI_Wtime();
			if (scf || opts.bench) {
				printf("Total time = %e s\n", double(bench_stop - bench_start));
				// Append locality count and elapsed benchmark time to bench.dat
				FILE* fp = fopen("bench.dat", "at");
				fprintf(fp, "%i %e\n", int(hpx::find_all_localities().size()), double(bench_stop - bench_start));

		//		set_omega_and_pivot();
		if (scf) {
			bench_stop = MPI_Wtime();
			printf("Total time = %e s\n", double(bench_stop - bench_start));
			//	FILE* fp = fopen( "bench.dat", "at" );
			//	fprintf( fp, "%i %e\n", int(hpx::find_all_localities().size()), double(bench_stop - bench_start));
			//	fclose(fp);
	// Final snapshot after the loop
	output("final.silo", output_cnt, true);
/*
 * Point-to-point message timing / bandwidth test between ranks 0 and 1.
 *
 * NOTE(review): this listing is heavily truncated - MPI setup, the timing
 * loop, the bandwidth computation and all braces are missing in this view.
 * Only the visible statements are described; MAXLENGTH and REPS are defined
 * elsewhere.
 */
int main(int argc, char* argv[])
	int i,n, length;
	int *inmsg,*outmsg;
	int mypid,mysize;
	int rc;
	int sint;
	double start, finish,time;
	double bw;
	MPI_Status status;



		fprintf(stderr,"now we only test message passing time between 2 processes\n");


	/* Receive and send buffers of MAXLENGTH ints each */
	inmsg=(int *) malloc(MAXLENGTH*sizeof(int));
	outmsg=(int *) malloc(MAXLENGTH*sizeof(int));

	//synchronize the processes: MPI_Barrier returns only once all processes have called it
	//NOTE(review): the barrier call itself is not visible in this view.

        	/* Report the parameters of the current test; sint presumably holds
        	 * the size of an int in bytes - TODO confirm. */
        	printf("\n\nDoing time test for:\n");
        	printf("Message length=%d int value\n",length);
        	printf("Message size =%d Bytes\n",sint*length);
        	printf("Number of Reps=%d\n",REPS);

        		/* Receive length ints from rank 1 (tag 0) */
        		rc= MPI_Recv(&inmsg[0],length,MPI_INT,1,0,MPI_COMM_WORLD,&status);


        	//calculate the average time and bandwidth

        	printf("***delivering message avg= %f Sec for lengthSize=%d\n",time/REPS,length);

            printf("*** bandwidth=%f Byte/Sec\n",bw);

            //increase the length 
    //task 1 processing now
      		/* Send length ints to rank 0 (tag 0) */
      		rc = MPI_Send(&outmsg[0],length,MPI_INT,0,0,MPI_COMM_WORLD);


Example #13
/*
 * Banded matrix-vector product, rows distributed over MPI ranks.
 *
 * N is the global matrix order and b the bandwidth (argv[1], argv[2], with
 * defaults 50 and 1). Each rank owns n consecutive rows starting at global
 * row p, stored row-major in A (n rows of N entries).
 *
 * NOTE(review): this listing is a fragment - braces, MPI setup and the
 * product itself are missing in this view; np and me are not declared in the
 * visible code.
 */
int main(int argc, char **argv) 
  int i, j, n, N=50, b=1, p;
  double *A, *v, *w;

  /* Initialize MPI */
  /* Argument extraction */
  if (argc > 1) { /* The user has given the value of N; negative values fall back to 50 */
     if ((N = atoi(argv[1])) < 0) N = 50;
  if (argc > 2) { /* The user has given the value of b; negative values fall back to 1 */
     if ((b = atoi(argv[2])) < 0) b = 1;
  if (b>=N/np) { /* Invalid value of b: the bandwidth must be below N/np */
    printf("Error: ancho de banda excesivo, N/np=%d, b=%d\n", N/np, b);

  /* Row distribution: every rank gets N/np rows and the first N%np ranks one
   * more; the loop stops at this rank, leaving n = local row count and
   * p = index of the first local row. */
  p = 0;
  for (i=0; i<np; i++) {
    n = N/np;
    if (i<N%np) n++;
    if (i==me) break;
    p += n;
  printf("[Proc %d] tamaño local: n=%d, fila inicial: p=%d\n", me, n, p);

  /* Memory allocation (calloc zero-fills) */
  A = (double*)calloc(N*n,sizeof(double));
  v = (double*)calloc(n+b,sizeof(double)); // size only grows by b
  w = (double*)calloc(n+b,sizeof(double)); // size only grows by b

  /* Initialize data */
  /* Diagonal entries (global column i+p of local row i) are 2*b */
  for(i=0; i<n; i++) A[i*N+(i+p)] = 2*b;
  /* Entries above the diagonal within distance b are -1; the condition
   * i+p<j means entries below the diagonal are left at zero. */
  for(i=0; i<n; i++) {
    for(j=0; j<N; j++) {
      if (i+p<j && abs(i+p-j)<=b) A[i*N+j] = -1.0;
  for(i=0; i<n; i++) v[i] = 1.0;

  /* Matrix multiplication */
  /* NOTE(review): the timed computation between the two clock reads is not
   * visible in this view. */
  double t1,t2;
  t1 = MPI_Wtime();
  t2 = MPI_Wtime();  
  if (me==0) printf("Tiempo transcurrido: %f s.\n", t2-t1);
  /* Print solution */
  for(i=0; i<n; i++) printf("w[%d] = %g\n", i+p, w[i]); // Only w is printed, without the upper part.


  return 0;
Example #14
/*
 * All-pairs shortest paths driver using MPI-IO.
 *
 * Visible behaviour: read the matrix order n from the start of the input
 * file, read this rank block of nlocal rows (n ints each), run
 * floyd_all_pairs_sp_1d() on it while timing the call, then write n and the
 * block back through the file handle.
 *
 * NOTE(review): this listing is a fragment - the file open calls, several
 * braces and the cleanup are missing in this view.
 */
int main(int argc, char *argv[]) {

	int i, n, nlocal;
	int numprocs, myrank;
	MPI_File f; char* filename = "input/8";
	MPI_Status status;

	MPI_Init(&argc, &argv);
	MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
	MPI_Comm_rank(MPI_COMM_WORLD, &myrank);

		/* Error path of the input open (the open call is not visible in this view) */
		fprintf(stderr, "Cannot open file %s\n", filename);
		return 1;
	/* The first int of the file is the matrix order n */
	MPI_File_seek(f, 0, MPI_SEEK_SET);
	MPI_File_read(f, &n, 1, MPI_INT, &status);
	/* Rows per rank: n/numprocs, with the last rank also taking the remainder */
	nlocal = n/numprocs; if(myrank == numprocs - 1) nlocal = nlocal + n % numprocs;

	int *a = (int *)malloc(nlocal * n * sizeof(int));
	/* Skip the leading int (n) and the rows of the lower ranks.
	 * NOTE(review): the offset multiplies myrank by this rank own nlocal; on
	 * the last rank nlocal includes the remainder, so when n % numprocs != 0
	 * the offset differs from the rows actually held by the lower ranks. */
	MPI_File_seek(f, (myrank * nlocal * n + 1) * sizeof(int), MPI_SEEK_SET);
	MPI_File_read(f, &a[0], nlocal * n, MPI_INT, &status);

// 	int j;
//	if(myrank == 3) {
//		for(i = 0; i < nlocal; i++) {
//			for(j = 0; j < n; j++) {
//				printf("%d ", a[i * n +j]);
//			}
//			printf("\n");
//		}
//	}

	/* Time the solver call only */
	double start = MPI_Wtime();
	floyd_all_pairs_sp_1d(n, nlocal, a);
	double stop = MPI_Wtime();
	printf("[%d] Completed in %1.3f seconds\n", myrank, stop-start);

//	if(myrank == 3) {
//		for(i = 0; i < nlocal; i++) {
//			for(j = 0; j < n; j++) {
//				printf("%d ", a[i * n +j]);
//			}
//			printf("\n");
//		}
//	}
			/* Error path of the output open (the open call is not visible in this view) */
			printf("Cannot open file %s\n", "out");
			return 1;
	/* Rank 0 writes the header int n */
	if(myrank == 0) {
		MPI_File_seek(f, 0, MPI_SEEK_SET);
		MPI_File_write(f, &n, 1, MPI_INT, &status);
	/* NOTE(review): the loop index i is not used in the body, so every
	 * iteration seeks to the same offset and writes the same nlocal * n ints. */
	for(i = 0; i < nlocal; i++) {
		MPI_File_seek(f, (myrank * nlocal * n + 1) * sizeof(int), MPI_SEEK_SET);
		MPI_File_write(f, &a[0], nlocal * n, MPI_INT, &status);


	return 0;
Example #15
/*
 * Distributed lasso solver based on ADMM, using MPI and GSL.
 *
 * Each rank reads its own data/A<rank+1>.dat and data/b<rank+1>.dat, iterates
 * the u-, x- and z-updates for at most MAX_ITER iterations, combines the
 * per-rank vectors with MPI_Allreduce, and rank 0 prints progress and writes
 * z to data/solution.dat.
 *
 * NOTE(review): this listing is a fragment - closing braces, some statements
 * and the delimiters of several block comments are missing in this view. The
 * lines starting with a lone asterisk read like the interior of such block
 * comments and are kept exactly as found.
 */
int main(int argc, char **argv) {
	const int MAX_ITER  = 50;
	const double RELTOL = 1e-2;
	const double ABSTOL = 1e-4;

	/* NOTE(review): the next four lines look like comment text whose opening
	 * and closing delimiters were lost. */
	 * Some bookkeeping variables for MPI. The 'rank' of a process is its numeric id
	 * in the process pool. For example, if we run a program via `mpirun -np 4 foo', then
	 * the process ranks are 0 through 3. Here, N and size are the total number of processes 
	 * running (in this example, 4).

	int rank;
	int size;

	MPI_Init(&argc, &argv);               // Initialize the MPI execution environment
	MPI_Comm_rank(MPI_COMM_WORLD, &rank); // Determine current running process
	MPI_Comm_size(MPI_COMM_WORLD, &size); // Total number of processes
	double N = (double) size;             // Number of subsystems/slaves for ADMM

	/* Read in local data */

	int skinny;           // A flag indicating whether the matrix A is fat or skinny
	FILE *f;
	int m, n;
	int row, col;
	double entry;

	/* NOTE(review): comment text with lost delimiters follows. */
	 * Subsystem n will look for files called An.dat and bn.dat 
	 * in the current directory; these are its local data and do not need to be
	 * visible to any other processes. Note that
	 * m and n here refer to the dimensions of the *local* coefficient matrix.

	/* Read A */
	/* File names are numbered from 1, hence rank + 1 */
	char s[20];
	sprintf(s, "data/A%d.dat", rank + 1);
	printf("[%d] reading %s\n", rank, s);

	f = fopen(s, "r");
	if (f == NULL) {
		printf("[%d] ERROR: %s does not exist, exiting.\n", rank, s);
	mm_read_mtx_array_size(f, &m, &n);	
	gsl_matrix *A = gsl_matrix_calloc(m, n);
	/* Entries arrive column by column: entry i goes to row i % m, column i / m.
	 * NOTE(review): the fscanf return value is not checked. */
	for (int i = 0; i < m*n; i++) {
		row = i % m;
		col = floor(i/m);
		fscanf(f, "%lf", &entry);
		gsl_matrix_set(A, row, col, entry);

	/* Read b */
	sprintf(s, "data/b%d.dat", rank + 1);
	printf("[%d] reading %s\n", rank, s);

	f = fopen(s, "r");
	if (f == NULL) {
		printf("[%d] ERROR: %s does not exist, exiting.\n", rank, s);
	mm_read_mtx_array_size(f, &m, &n);
	gsl_vector *b = gsl_vector_calloc(m);
	for (int i = 0; i < m; i++) {
		fscanf(f, "%lf", &entry);
		gsl_vector_set(b, i, entry);

	/* Restore m and n to the dimensions of A (they were overwritten while
	 * reading b); skinny means at least as many rows as columns. */
	m = A->size1;
	n = A->size2;
	skinny = (m >= n);

	/* NOTE(review): comment text with lost delimiters follows. */
	 * These are all variables related to ADMM itself. There are many
	 * more variables than in the Matlab implementation because we also
	 * require vectors and matrices to store various intermediate results.
	 * The naming scheme follows the Matlab version of this solver.

	double rho = 1.0;

	gsl_vector *x      = gsl_vector_calloc(n);
	gsl_vector *u      = gsl_vector_calloc(n);
	gsl_vector *z      = gsl_vector_calloc(n);
	gsl_vector *y      = gsl_vector_calloc(n);
	gsl_vector *r      = gsl_vector_calloc(n);
	gsl_vector *zprev  = gsl_vector_calloc(n);
	gsl_vector *zdiff  = gsl_vector_calloc(n);

	gsl_vector *q      = gsl_vector_calloc(n);
	gsl_vector *w      = gsl_vector_calloc(n);
	gsl_vector *Aq     = gsl_vector_calloc(m);
	gsl_vector *p      = gsl_vector_calloc(m);

	gsl_vector *Atb    = gsl_vector_calloc(n);

	double send[3]; // an array used to aggregate 3 scalars at once
	double recv[3]; // used to receive the results of these aggregations

	double nxstack  = 0;
	double nystack  = 0;
	double prires   = 0;
	double dualres  = 0;
	double eps_pri  = 0;
	double eps_dual = 0;

	/* Precompute and cache factorizations */

	gsl_blas_dgemv(CblasTrans, 1, A, b, 0, Atb); // Atb = A^T b

	/* NOTE(review): comment text with lost delimiters follows. */
	 * The lasso regularization parameter here is just hardcoded
	 * to 0.5 for simplicity. Using the lambda_max heuristic would require 
	 * network communication, since it requires looking at the *global* A^T b.

	double lambda = 0.5;
	if (rank == 0) {
		printf("using lambda: %.4f\n", lambda);

	gsl_matrix *L;

	/* Use the matrix inversion lemma for efficiency; see section 4.2 of the paper */
	/* NOTE(review): in both branches the identity-like matrices (rhoI, eye)
	 * are zero-filled by calloc, and neither a set-identity call nor the
	 * Cholesky decomposition call is visible in this view. */
	if (skinny) {
		/* L = chol(AtA + rho*I) */
		L = gsl_matrix_calloc(n,n);

		gsl_matrix *AtA = gsl_matrix_calloc(n,n);
		gsl_blas_dsyrk(CblasLower, CblasTrans, 1, A, 0, AtA);

		gsl_matrix *rhoI = gsl_matrix_calloc(n,n);
		gsl_matrix_scale(rhoI, rho);

		gsl_matrix_memcpy(L, AtA);
		gsl_matrix_add(L, rhoI);

	} else {
		/* L = chol(I + 1/rho*AAt) */
		L = gsl_matrix_calloc(m,m);

		gsl_matrix *AAt = gsl_matrix_calloc(m,m);
		gsl_blas_dsyrk(CblasLower, CblasNoTrans, 1, A, 0, AAt);
		gsl_matrix_scale(AAt, 1/rho);

		gsl_matrix *eye = gsl_matrix_calloc(m,m);

		gsl_matrix_memcpy(L, AAt);
		gsl_matrix_add(L, eye);


	/* Main ADMM solver loop */

	int iter = 0;
	/* Column headings for the per-iteration progress table */
	if (rank == 0) {
		printf("%3s %10s %10s %10s %10s %10s\n", "#", "r norm", "eps_pri", "s norm", "eps_dual", "objective");		
    double startAllTime, endAllTime;
	startAllTime = MPI_Wtime();
	/* NOTE(review): the increment of iter is not visible in this view. */
	while (iter < MAX_ITER) {

        /* u-update: u = u + x - z */
        /* NOTE(review): gsl_vector_sub stores its result in its first
         * argument, so x itself holds x - z after this call - confirm intended. */
		gsl_vector_sub(x, z);
		gsl_vector_add(u, x);

		/* x-update: x = (A^T A + rho I) \ (A^T b + rho z - y) */
		gsl_vector_memcpy(q, z);
		gsl_vector_sub(q, u);
		gsl_vector_scale(q, rho);
		gsl_vector_add(q, Atb);   // q = A^T b + rho*(z - u)

        /* NOTE(review): tmp and tmpq are computed but not used in the visible code. */
        double tmp, tmpq;
		gsl_blas_ddot(x, x, &tmp);
		gsl_blas_ddot(q, q, &tmpq);

		if (skinny) {
			/* x = U \ (L \ q) */
			gsl_linalg_cholesky_solve(L, q, x);
		} else {
			/* x = q/rho - 1/rho^2 * A^T * (U \ (L \ (A*q))) */
			gsl_blas_dgemv(CblasNoTrans, 1, A, q, 0, Aq);
			gsl_linalg_cholesky_solve(L, Aq, p);
			gsl_blas_dgemv(CblasTrans, 1, A, p, 0, x); /* now x = A^T * (U \ (L \ (A*q)) */
			gsl_vector_scale(x, -1/(rho*rho));
			gsl_vector_scale(q, 1/rho);
			gsl_vector_add(x, q);

		/* NOTE(review): comment text with lost delimiters follows. */
		 * Message-passing: compute the global sum over all processors of the
		 * contents of w and t. Also, update z.

		gsl_vector_memcpy(w, x);
		gsl_vector_add(w, u);   // w = x + u

		/* Squared norms of r, x and u (the last one divided by rho^2) are
		 * packed into send[] so that one reduction sums all three. */
		gsl_blas_ddot(r, r, &send[0]); 
		gsl_blas_ddot(x, x, &send[1]);
		gsl_blas_ddot(u, u, &send[2]);
		send[2] /= pow(rho, 2);

		gsl_vector_memcpy(zprev, z);

		// could be reduced to a single Allreduce call by concatenating send to w
		MPI_Allreduce(w->data, z->data,  n, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
		MPI_Allreduce(send,    recv,     3, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);

		prires  = sqrt(recv[0]);  /* sqrt(sum ||r_i||_2^2) */
		nxstack = sqrt(recv[1]);  /* sqrt(sum ||x_i||_2^2) */
		nystack = sqrt(recv[2]);  /* sqrt(sum ||y_i||_2^2) */

		/* z-update: average of the summed w over the N ranks, then soft thresholding */
		gsl_vector_scale(z, 1/N);
		soft_threshold(z, lambda/(N*rho));

		/* Termination checks */

		/* dual residual */
		gsl_vector_memcpy(zdiff, z);
		gsl_vector_sub(zdiff, zprev);
		dualres = sqrt(N) * rho * gsl_blas_dnrm2(zdiff); /* ||s^k||_2^2 = N rho^2 ||z - zprev||_2^2 */

		/* compute primal and dual feasibility tolerances */
		eps_pri  = sqrt(n*N)*ABSTOL + RELTOL * fmax(nxstack, sqrt(N)*gsl_blas_dnrm2(z));
		eps_dual = sqrt(n*N)*ABSTOL + RELTOL * nystack;

		if (rank == 0) {
			printf("%3d %10.4f %10.4f %10.4f %10.4f %10.4f\n", iter, 
					prires, eps_pri, dualres, eps_dual, objective(A, b, lambda, z));

		/* Converged when both residuals are within tolerance (the body of
		 * this test is not visible in this view). */
		if (prires <= eps_pri && dualres <= eps_dual) {

		/* Compute residual: r = x - z */
		gsl_vector_memcpy(r, x);
		gsl_vector_sub(r, z);


	/* Have the master write out the results to disk */
	if (rank == 0) { 
        endAllTime = MPI_Wtime();
        printf("Elapsed time is: %lf \n", endAllTime - startAllTime);

		/* NOTE(review): the fopen result is not checked before use. */
		f = fopen("data/solution.dat", "w");
		gsl_vector_fprintf(f, z, "%lf");

	MPI_Finalize(); /* Shut down the MPI execution environment */

	/* Clear memory */

Example #16
/*
 * Ping-pong timing benchmark between the two halves of MPI_COMM_WORLD.
 *
 * Command line: argv[1] = number of messages, argv[2] = message size in
 * bytes (both parsed with sscanf "%d").
 *
 * Ranks below p/2 send to and then receive from rank my_id+p/2, time the
 * whole loop and print the latency/bandwidth figures; the remaining ranks
 * receive from and then send back to rank my_id-p/2.
 *
 * NOTE(review): this listing looks truncated -- closing braces, the last
 * argument of both MPI_Recv calls and the EXIT label targeted by the goto
 * are not visible.  The comments below describe only the visible statements.
 */
int main(int argc, char **argv)
  int my_id;		/* process id */
  int p;		/* number of processes */
  char* message_s, *message_r;	/* storage for the message */
  int i, max_msgs, msg_size;
  MPI_Status status;	/* return status for receive */
  double elapsed_time_sec;
  double bandwidth;
  double startTime = 0;
  MPI_Init( &argc, &argv );
  MPI_Comm_rank( MPI_COMM_WORLD, &my_id );
  MPI_Comm_size( MPI_COMM_WORLD, &p );
  /* NOTE(review): argv[1] and argv[2] are read without a visible argc check. */
  if ((sscanf (argv[1], "%d", &max_msgs) < 1) ||
      (sscanf (argv[2], "%d", &msg_size) < 1)) {
    fprintf (stderr, "need msg count and msg size as params\n");
    goto EXIT;

  /* Send and receive buffers of msg_size bytes each.  The visible code never
     fills them and does not check the malloc results. */
  message_s = (char*)malloc (msg_size);  
  message_r = (char*)malloc (msg_size);

  /* don't start timer until everybody is ok */
  if( my_id < p/2 ) {
    /* Lower half: initiate each round trip with the partner rank my_id+p/2. */
    startTime = MPI_Wtime();
    for(i=0; i<max_msgs; i++){
      MPI_Send(message_s, msg_size, MPI_CHAR, my_id+p/2, 0, MPI_COMM_WORLD);
      MPI_Recv(message_r, msg_size, MPI_CHAR, my_id+p/2, 0, MPI_COMM_WORLD, 

    /* Wall-clock time for all max_msgs round trips. */
    elapsed_time_sec = MPI_Wtime() - startTime; 

    fprintf(stdout, "Totaltime: %8.3f s\n",elapsed_time_sec);
    elapsed_time_sec /= 2;  /* We want the ping performance not round-trip. */
    elapsed_time_sec /= max_msgs; /* time for each message */
    bandwidth = msg_size / elapsed_time_sec; /* bandwidth */
    /* Printed as microseconds per message and bytes/second scaled by 1e6. */
    fprintf (stdout, "%5d %7d\t ", max_msgs, msg_size);
    fprintf (stdout,"%8.3f us\t %8.3f MB/sec\n",
	     elapsed_time_sec * 1e6, bandwidth / 1e6);
  else {
    /* Upper half: echo every message back to the partner rank my_id-p/2. */
    for(i=0; i<max_msgs; i++){
      MPI_Recv(message_r, msg_size, MPI_CHAR, my_id-p/2, 0, MPI_COMM_WORLD, 
      MPI_Send(message_s, msg_size, MPI_CHAR, my_id-p/2, 0, MPI_COMM_WORLD);
  return 0;
Example #17
NhlErrorTypes NclGetWTime(double *time)
	*time = MPI_Wtime();
	return NhlNOERROR;
/*
 * Ring halo-exchange benchmark built on one-sided MPI_Put.
 *
 * Every rank exposes two receive buffers through the windows
 * win_rcv_buf_left and win_rcv_buf_right.  For each message size it
 * repeatedly puts snd_buf_left into the left neighbour's win_rcv_buf_right
 * and snd_buf_right into the right neighbour's win_rcv_buf_left, bracketed
 * by MPI_Win_fence calls, compares a few marker elements against the values
 * the neighbours should have sent, and lets rank 0 print the per-message
 * time and bandwidth.
 *
 * max_length, start_length, length_factor, number_of_messages and
 * number_package_sizes are defined outside this listing.
 *
 * NOTE(review): loop and if braces are missing from this listing, so the
 * exact nesting has to be confirmed against the complete file.
 */
int main(int argc, char *argv[])
    int i, j, length, my_rank, left, right, size, test_value, mid;
    double start, finish, transfer_time;
    float snd_buf_left[max_length], snd_buf_right[max_length];
    float *rcv_buf_left, *rcv_buf_right;

    MPI_Win win_rcv_buf_left, win_rcv_buf_right;

    /* Naming conventions                                                                */
    /* Processes:                                                                        */
    /*     my_rank-1                        my_rank                         my_rank+1    */
    /* "left neighbor"                     "myself"                     "right neighbor" */
    /*   ...    rcv_buf_right <--- snd_buf_left snd_buf_right ---> rcv_buf_left    ...   */
    /*   ... snd_buf_right ---> rcv_buf_left       rcv_buf_right <--- snd_buf_left ...   */
    /*                        |                                  |                       */
    /*              halo-communication                 halo-communication                */

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    /* Neighbours on a periodic ring; "+size" keeps the left index non-negative. */
    right = (my_rank+1)      % size;
    left  = (my_rank-1+size) % size;

    /* Receive buffers come from MPI_Alloc_mem and are exposed as RMA windows
       with a displacement unit of sizeof(float). */
    MPI_Alloc_mem((MPI_Aint)(max_length*sizeof(float)), MPI_INFO_NULL, &rcv_buf_left );
    MPI_Alloc_mem((MPI_Aint)(max_length*sizeof(float)), MPI_INFO_NULL, &rcv_buf_right);
    MPI_Win_create(rcv_buf_left,  (MPI_Aint)(max_length*sizeof(float)), sizeof(float), MPI_INFO_NULL, MPI_COMM_WORLD, &win_rcv_buf_left );
    MPI_Win_create(rcv_buf_right, (MPI_Aint)(max_length*sizeof(float)), sizeof(float), MPI_INFO_NULL, MPI_COMM_WORLD, &win_rcv_buf_right);

    if (my_rank == 0) printf("    message size      transfertime  duplex bandwidth per process and neighbor\n");

    length = start_length;

    /* Outer loop: one pass per message size; length is multiplied by
       length_factor at the end of each pass. */
    for (j = 1; j <= number_package_sizes; j++)

        /* The loop runs number_of_messages+1 times; the timer is started at
           i == 1, so iteration i == 0 is not included in the measurement. */
        for (i = 0; i <= number_of_messages; i++)
            if(i==1) start = MPI_Wtime();

            /* Marker values encode (j, i, my_rank) so the receiver can tell
               which rank and iteration produced the data. */
            test_value = j*1000000 + i*10000 + my_rank*10 ;
            mid = (length-1)/number_of_messages*i;

            snd_buf_left[0]=test_value+1  ;
            snd_buf_left[mid]=test_value+2  ;
            snd_buf_right[0]=test_value+6 ;
            snd_buf_right[mid]=test_value+7 ;

            /* Open the access epoch on both windows. */
            MPI_Win_fence(MPI_MODE_NOSTORE + MPI_MODE_NOPRECEDE, win_rcv_buf_left );
            MPI_Win_fence(MPI_MODE_NOSTORE + MPI_MODE_NOPRECEDE, win_rcv_buf_right);

            /* My left data lands in the left neighbour's rcv_buf_right and my
               right data in the right neighbour's rcv_buf_left. */
            MPI_Put(snd_buf_left,  length, MPI_FLOAT, left,  (MPI_Aint)0, length, MPI_FLOAT, win_rcv_buf_right);
            MPI_Put(snd_buf_right, length, MPI_FLOAT, right, (MPI_Aint)0, length, MPI_FLOAT, win_rcv_buf_left );

            /* Close the epoch; the received data is read after these fences. */
            MPI_Win_fence(MPI_MODE_NOSTORE + MPI_MODE_NOPUT + MPI_MODE_NOSUCCEED, win_rcv_buf_left );
            MPI_Win_fence(MPI_MODE_NOSTORE + MPI_MODE_NOPUT + MPI_MODE_NOSUCCEED, win_rcv_buf_right);

            /*    ...snd_buf_... is used to store the values that were stored in snd_buf_... in the neighbor process */
            test_value = j*1000000 + i*10000 + left*10  ;
            mid = (length-1)/number_of_messages*i;
            snd_buf_right[0]=test_value+6 ;
            snd_buf_right[mid]=test_value+7 ;
            test_value = j*1000000 + i*10000 + right*10 ;
            mid = (length-1)/number_of_messages*i;
            snd_buf_left[0]=test_value+1  ;
            snd_buf_left[mid]=test_value+2  ;
            /* NOTE(review): index length-1 is compared below, but the visible
               code only ever assigns indices 0 and mid of the send buffers. */
            if ((rcv_buf_left[0] != snd_buf_right[0]) || (rcv_buf_left[mid] != snd_buf_right[mid]) ||
                    (rcv_buf_left[length-1] != snd_buf_right[length-1])) {
                printf("%d: j=%d, i=%d --> snd_buf_right[0,%d,%d]=(%f,%f,%f)\n",
                       my_rank, j, i, mid, length-1, snd_buf_right[0], snd_buf_right[mid], snd_buf_right[length-1]);
                printf("%d:     is not identical to rcv_buf_left[0,%d,%d]=(%f,%f,%f)\n",
                       my_rank,       mid, length-1, rcv_buf_left[0],  rcv_buf_left[mid],  rcv_buf_left[length-1]);
            if ((rcv_buf_right[0] != snd_buf_left[0]) || (rcv_buf_right[mid] != snd_buf_left[mid]) ||
                    (rcv_buf_right[length-1] != snd_buf_left[length-1])) {
                printf("%d: j=%d, i=%d --> snd_buf_left[0,%d,%d]=(%f,%f,%f)\n",
                       my_rank, j, i, mid, length-1, snd_buf_left[0],  snd_buf_left[mid],  snd_buf_left[length-1]);
                printf("%d:     is not identical to rcv_buf_right[0,%d,%d]=(%f,%f,%f)\n",
                       my_rank,       mid, length-1, rcv_buf_right[0], rcv_buf_right[mid], rcv_buf_right[length-1]);

        finish = MPI_Wtime();

        /* Rank 0 reports the average time per message; the bandwidth counts
           2*length floats per message (both directions). */
        if (my_rank == 0)
            transfer_time = (finish - start) / number_of_messages;
            printf("%10i bytes %12.3f usec %13.3f MB/s\n",
                   length*(int)sizeof(float), transfer_time*1e6, 1.0e-6*2*length*sizeof(float) / transfer_time);

        length = length * length_factor;
    /* NOTE(review): only the left window and buffer are released in the
       visible lines; the matching calls for win_rcv_buf_right/rcv_buf_right
       and MPI_Finalize are not shown here. */
    MPI_Win_free(&win_rcv_buf_left );
    MPI_Free_mem(rcv_buf_left );

Example #19
/*---< main() >-------------------------------------------------------------*/
/*
 * Driver for the MPI k-means clustering program.
 *
 * Visible flow: parse the command-line options, read the data objects with
 * mpi_read, let rank 0 pick the first numClusters objects as initial
 * centres and broadcast them, run mpi_kmeans, write the result with
 * mpi_write, and optionally print the maximum I/O and clustering times
 * over all ranks.
 *
 * _debug, usage, mpi_read, mpi_kmeans and mpi_write are defined outside
 * this listing.
 *
 * NOTE(review): this listing has lost lines (closing braces, the tail of
 * the mpi_read call, statements inside the switch); the comments describe
 * only what is visible.
 */
int main(int argc, char **argv) {
           int     opt;
    extern char   *optarg;
    extern int     optind;
           int     i, j;
           int     isInFileBinary, isOutFileBinary;
           int     is_output_timing, is_print_usage;

           int     numClusters, numCoords, numObjs, totalNumObjs;
           int    *membership;    /* [numObjs] */
           char   *filename;
           float **objects;       /* [numObjs][numCoords] data objects */
           float **clusters;      /* [numClusters][numCoords] cluster center */
           float   threshold;
           double  timing, io_timing, clustering_timing;

           int        rank, nproc, mpi_namelen;
           char       mpi_name[MPI_MAX_PROCESSOR_NAME];
           MPI_Status status;

    MPI_Init(&argc, &argv);

    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &nproc);

    /* some default values */
    _debug           = 0;
    threshold        = 0.001;
    numClusters      = 0;
    isInFileBinary   = 0;
    isOutFileBinary  = 0;
    is_output_timing = 0;
    is_print_usage   = 0;
    filename         = NULL;

    /* NOTE(review): no break statements are visible between the cases; as
       listed, every option would fall through to `default` and set
       is_print_usage.  The option string also accepts 'p:' and 'a', which
       have no visible case.  Confirm against the complete file. */
    while ( (opt=getopt(argc,argv,"p:i:n:t:abdorh"))!= EOF) {
        switch (opt) {
            case 'i': filename=optarg;
            case 'b': isInFileBinary = 1;
            case 'r': isOutFileBinary = 1;
            case 't': threshold=atof(optarg);
            case 'n': numClusters = atoi(optarg);
            case 'o': is_output_timing = 1;
            case 'd': _debug = 1;
            case 'h': is_print_usage = 1;
            default: is_print_usage = 1;

    /* An input file and at least two clusters are required. */
    if (filename == 0 || numClusters <= 1 || is_print_usage == 1) {
        if (rank == 0) usage(argv[0], threshold);

    /* NOTE(review): mpi_name is printed here, but the call that fills it
       (and mpi_namelen) is not visible in this listing. */
    if (_debug) printf("Proc %d of %d running on %s\n", rank, nproc, mpi_name);

    io_timing = MPI_Wtime();

    /* read data points from file ------------------------------------------*/
    objects = mpi_read(isInFileBinary, filename, &numObjs, &numCoords,

    if (_debug) { /* print the first 4 objects' coordinates */
        int num = (numObjs < 4) ? numObjs : 4;
        for (i=0; i<num; i++) {
            char strline[1024], strfloat[16];
            sprintf(strline,"%d: objects[%d]= ",rank,i);
            /* NOTE(review): strfloat is appended without a visible statement
               that writes it, and strline's 1024-byte capacity is not
               checked while concatenating. */
            for (j=0; j<numCoords; j++) {
                strcat(strline, strfloat);
            strcat(strline, "\n");

    /* Close the read-time measurement and start the clustering timer. */
    timing            = MPI_Wtime();
    io_timing         = timing - io_timing;
    clustering_timing = timing;

    /* allocate a 2D space for clusters[] (coordinates of cluster centers)
       this array should be the same across all processes                  */
    clusters    = (float**) malloc(numClusters *             sizeof(float*));
    assert(clusters != NULL);
    /* One contiguous numClusters*numCoords block; the row pointers below
       index into it, which lets MPI_Bcast send it as clusters[0]. */
    clusters[0] = (float*)  malloc(numClusters * numCoords * sizeof(float));
    assert(clusters[0] != NULL);
    for (i=1; i<numClusters; i++)
        clusters[i] = clusters[i-1] + numCoords;

    MPI_Allreduce(&numObjs, &totalNumObjs, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);

    /* pick first numClusters elements in feature[] as initial cluster centers*/
    /* NOTE(review): this reads objects[0..numClusters-1] on rank 0 without a
       visible check that rank 0 holds at least numClusters objects. */
    if (rank == 0) {
        for (i=0; i<numClusters; i++)
            for (j=0; j<numCoords; j++)
                clusters[i][j] = objects[i][j];
    MPI_Bcast(clusters[0], numClusters*numCoords, MPI_FLOAT, 0, MPI_COMM_WORLD);

    /* membership: the cluster id for each data object */
    membership = (int*) malloc(numObjs * sizeof(int));
    assert(membership != NULL);

    /* start the core computation -------------------------------------------*/
    mpi_kmeans(objects, numCoords, numObjs, numClusters, threshold, membership,
               clusters, MPI_COMM_WORLD);


    /* Close the clustering measurement; `timing` now marks the start of the
       output phase. */
    timing            = MPI_Wtime();
    clustering_timing = timing - clustering_timing;

    /* output: the coordinates of the cluster centres ----------------------*/
    mpi_write(isOutFileBinary, filename, numClusters, numObjs, numCoords,
              clusters, membership, totalNumObjs, MPI_COMM_WORLD);


    /*---- output performance numbers ---------------------------------------*/
    if (is_output_timing) {
        double max_io_timing, max_clustering_timing;

        /* Add the write time to the read time measured earlier. */
        io_timing += MPI_Wtime() - timing;

        /* get the max timing measured among all processes */
        MPI_Reduce(&io_timing, &max_io_timing, 1, MPI_DOUBLE,
                   MPI_MAX, 0, MPI_COMM_WORLD);
        MPI_Reduce(&clustering_timing, &max_clustering_timing, 1, MPI_DOUBLE,
                   MPI_MAX, 0, MPI_COMM_WORLD);

        if (rank == 0) {
            printf("\nPerforming **** Simple Kmeans  (MPI) ****\n");
            printf("Num of processes = %d\n", nproc);
            printf("Input file:        %s\n", filename);
            printf("numObjs          = %d\n", totalNumObjs);
            printf("numCoords        = %d\n", numCoords);
            printf("numClusters      = %d\n", numClusters);
            printf("threshold        = %.4f\n", threshold);

            printf("I/O time           = %10.4f sec\n", max_io_timing);
            printf("Computation timing = %10.4f sec\n", max_clustering_timing);

/*
 * Complete *request with MPI_Wait and report how long the call blocked.
 *
 * request: the request handed to MPI_Wait.
 * status:  passed through to MPI_Wait unchanged.
 *
 * Returns the difference between the MPI_Wtime() readings taken
 * immediately after and immediately before the MPI_Wait call.  The return
 * code of MPI_Wait itself is not inspected.
 */
double mpi_wait_time(MPI_Request * request, MPI_Status * status)
{
    const double t_start = MPI_Wtime();

    MPI_Wait(request, status);
    return MPI_Wtime() - t_start;
}
Example #21
/*
 * Parallel prime counting with a block-decomposed sieve.
 *
 * Command line: argv[1] = n, the number of integers to sieve.
 *
 * Visible flow: every rank sieves the numbers up to ceil(sqrt(n)) locally
 * to collect seed primes, takes the block [low_value, high_value] given by
 * the BLOCK_LOW/BLOCK_HIGH/BLOCK_SIZE macros (defined outside this
 * listing), marks multiples of the seed primes in its block, and the local
 * counts are summed onto rank 0 with MPI_Reduce.  Rank 0 prints the total
 * and the elapsed time.
 *
 * NOTE(review): this listing has lost many lines (closing braces, an
 * `else`, loop increments, the body of the counting loop).  The comments
 * describe only what is visible; suspicious spots are flagged, not fixed.
 */
int main(int argc, char * argv[])
    /* Constant Declarations */
    //long const 	SET_SIZE = 7920;

    /* Variable Declarations */
    int		count = 0;				// local count
    double 	elapsed_time = 0.00;			// time elapsed
    int		first;					// index of first multiple
    int 	global_count = 1;			// global count
    int 	high_value;				// highest value on processor
    char 	hostname[MPI_MAX_PROCESSOR_NAME];	// host process is running on
    int	 	i;					// counter variable
    int 	id;					// process id number
    int		index;
    int 	init_status;			// initialization error status flag
    bool  	initialized = false;		// mpi initialized flag
    int 	len;				// hostname length
    int 	low_value;			// lowest value on the processor
    char*	marked;				// portion of 2 to n that is marked
    int		n;			// number of elements to sieve
    int		n_sqrt;			// square root of n
    int 	p;			// number of processes
    int		prime;
    int		proc0_size;		// size of process 0's subarray
    int		size;			// elements in marked
    int*	sqrt_primes;		// primes up to the square root
    int		sqrt_primes_index;	// index in the square root primes array
    char*	sqrt_primes_marked;	// numbers up to sqrt marked prime or not
    int		sqrt_primes_size;	// size of square root primes array

    /* Function Declarations */
    //int is_prime( int );

    /* Initialization */
    /* NOTE(review): `initialized` is declared bool, while MPI_Initialized is
       specified to take an int pointer -- confirm the types match.  As
       listed, init_status is also overwritten with MPI_SUCCESS right after
       MPI_Init; an `else` was probably lost from this listing. */
    MPI_Initialized( &initialized );                     // set initialized flag
    if( !initialized )                                  // if MPI is not initialized
        init_status = MPI_Init( &argc, &argv );        // Initialize MPI
        init_status = MPI_SUCCESS;   	               // otherwise set init_status to success
    if( init_status != MPI_SUCCESS ) {     	       // if not successfully initialized
        printf ("Error starting MPI program. Terminating.\n");      // print error message
        MPI_Abort(MPI_COMM_WORLD, init_status);                     // abort
    MPI_Get_processor_name( hostname, &len );                       // set hostname

    MPI_Comm_rank( MPI_COMM_WORLD, &id );                           // set process rank
    MPI_Comm_size( MPI_COMM_WORLD, &p );                            // set size of comm group
    //printf("Process rank %d started on %s.\n", id, hostname);     // print start message
    //MPI_Barrier(MPI_COMM_WORLD );

    /* Start Timer */
    /* elapsed_time holds -start here; the stop time is added further down. */
    MPI_Barrier( MPI_COMM_WORLD );                                  // synchronize
    elapsed_time = - MPI_Wtime();                                   // start time

    /* Check that a set size was passed into the program */
    if(argc != 2) {
        if(id==0) {
            printf("Command line: %s <m>\n", argv[0]);

    n = atoi(argv[1]);
    n_sqrt = ceil(sqrt((double)n));
    //	printf("square root: %i\n", n_sqrt);
    // debug
    //if(id==0) {
	//printf("n sqrt: %i\n", n_sqrt);

    /* Flag array over 0..n_sqrt (n_sqrt + 1 entries): 1 = not prime.
       The malloc result is not checked in the visible code. */
    sqrt_primes_marked = (char *) malloc(n_sqrt + 1);
    sqrt_primes_marked[0] = 1;
    sqrt_primes_marked[1] =1;

    for(i = 2; i <= n_sqrt; ++i) {
	sqrt_primes_marked[i] = 0;

    prime = 2;
    sqrt_primes_size = n_sqrt;
    //printf("sqrt primes size: %i\n", sqrt_primes_size);

    /* NOTE(review): the marking loop stops at i < n_sqrt although the array
       is later scanned up to and including n_sqrt, and the statement that
       advances `prime` inside this do/while is not visible. */
    do {
	for(i = prime * prime; i < n_sqrt; i+=prime) {
	     sqrt_primes_marked[i] = 1;
    } while (prime * prime <= n_sqrt);
    //printf("sqrt primes size: %i\n", sqrt_primes_size);
    /* NOTE(review): malloc receives sqrt_primes_size as a byte count, but
       the buffer is used as an int array below -- no sizeof(int) factor is
       visible. */
    sqrt_primes = (int *) malloc(sqrt_primes_size);
    sqrt_primes_index = 0;

    //sqrt_primes_size = 0;

    /* Collect the unmarked values starting from 3.  The increment of
       sqrt_primes_index is not visible in this listing. */
    for(i = 3; i <= n_sqrt; ++i) {
	if(!sqrt_primes_marked[i]) {
	    sqrt_primes[sqrt_primes_index] = i;
	   // printf("%i, ", sqrt_primes[sqrt_primes_index]);

    sqrt_primes_size = sqrt_primes_index;

    //printf("sqrt primes size: %i\n", sqrt_primes_size);

    /* Set process's array share and first and last elements */
    low_value = 2 + BLOCK_LOW(id,p,n-1);
    high_value = 2 + BLOCK_HIGH(id,p,n-1);
    size = BLOCK_SIZE(id,p,n-1);

    //printf("Process %i block low: %i\n", id, low_value);
    //printf("Process %i block high: %i\n", id, high_value);
    //printf("Block size: %i\n", size);

    /* Roughly halve the block size depending on the parity of its bounds;
       presumably only odd candidates are stored -- TODO confirm. */
    if(low_value % 2 == 0) {
	if(high_value % 2 == 0) {
	     size = (int)floor((double)size / 2.0);
	else {
	    size = size / 2;
    else {
	if(high_value % 2 == 0) {
	     size = size / 2;
	else {
	     size = (int)ceil((double)size / 2.0);

    //printf("Process %i block low: %i\n", id, low_value);
    //printf("Process %i block high: %i\n", id, high_value);
    //printf("Block size: %i\n", size);

    //proc0_size = (n-1)/p;

    /* if process 0 doesn't have all the primes for sieving, then bail*/
    /*if((2+proc0_size) < (int)sqrt((double)n)) {
        if(id==0) {
            printf("Too many processes\n");

    /* Allocate share of array */
    marked = (char *) malloc(size);

    if(marked == NULL) {
        printf("Cannot allocate enough memory\n");

    /* Run Sieve */

    //printf("made it to sieve\n");

    for(i = 0; i < size; i++)
	marked[i] = 0;

	first = 0;
    sqrt_primes_index = 0;
    /* NOTE(review): sqrt_primes[0] is read even if no seed prime was stored
       (sqrt_primes_size == 0). */
    prime = sqrt_primes[sqrt_primes_index];

    //printf("first prime: %i\n", prime);

    //for(i = 0; i < sqrt_primes_size; i++) {

      //              printf("%i,", sqrt_primes[i]);
        //            fflush(stdout);


    /* For each seed prime, find the index of its first multiple in this
       block (`first`) and mark from there in steps of `prime`. */
    do {
	if(prime >= low_value)
	    first = ((prime - low_value) / 2) + prime;
	else if(prime * prime > low_value) {
		first = (prime * prime - low_value) / 2;
	else {
	    if(low_value % prime == 0)
		first = 0;
	    else {
		first = 1;
		while ((low_value + (2 * first)) % prime != 0)

	//printf("first: %i\n", first);

	for(i = first; i < size; i += (prime))
		marked[i] = 1;

	//printf("made it to prime assignment\n");
	/* NOTE(review): the next seed is loaded before the loop condition
	   checks sqrt_primes_index against sqrt_primes_size, so the final
	   read can be one element past the stored seeds. */
	prime = sqrt_primes[++sqrt_primes_index];

	//printf("prime: %i\n", prime);

    } while(prime * prime <= n && sqrt_primes_index < sqrt_primes_size);

    count = 0;

    /* The body of this counting loop is not visible in this listing. */
    for(i = 0; i < size; i++) {

    //printf("size: %i\ncount: %i\n", size, count);

//    for( i=id; i<SET_SIZE; i+=p )                                                       // interleaved allocation
//        count += is_prime( i );                                                             // check if prime w/ sieve of eratosthenes

    /* Reduce Sum */
    /* global_count is only meaningful on rank 0 (the reduce root); its
       initial value of 1 is overwritten there by the reduction result. */
    MPI_Reduce( &count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD );        // reduce the primes count, root: proces 0

    /* Stop Timer */
    elapsed_time += MPI_Wtime();                                                        // end time

    //printf("Process %i found %i primes.\n", id, count);

    //printf("Process %d is done in %d, running on %s.\n", id, elapsed_time, hostname);   // print process done message
    if( id == 0 ) {                                                                     // rank 0 prints global count
        printf("There are %d primes in the first %i integers.\nExecution took %10.6f.\n",
               global_count, n, elapsed_time);
//	printf("Debug:\n");
//	fflush(stdout);
//	printf("sqrt primes size: %i\n", sqrt_primes_size);
//        fflush(stdout);
	for(i = 0; i < sqrt_primes_size; i++) {
		    printf("%i,", sqrt_primes[i]);


  //  printf("rank: %i\nlow value: %i\nhigh value: %i\ncount: %i\n", id, low_value, high_value, count);

    /* None of the malloc'ed arrays is freed in the visible code. */
    MPI_Finalize();                                                                     // finalize
    return 0;
Example #22
/*
 * Driver of an MPI tree-based N-body simulation (cells with eight children).
 *
 * Visible flow: build an MPI struct datatype for leaf_t, read the run
 * parameters (nbodies, steps, dt, eps, tol) from the file named by argv[1],
 * broadcast the leaf array from rank 0, and then on every step rebuild the
 * tree with insert(), call compute_force() for this rank's share of the
 * bodies and exchange the leaves with MPI_Allgather.  Rank 0 also times
 * the step loop and appends the result to an output file chosen by nbodies.
 *
 * bodies, leafs, local_leafs, nbodies, steps, dt, eps, tol, dthf, epssq,
 * itolsq, root, center, diameter, curr, type, insert, compute_force and
 * quicksort are defined outside this listing.
 *
 * NOTE(review): the listing is heavily truncated (closing braces, loop
 * bodies, the body-reading code); the comments describe only what is
 * visible, and suspicious spots are flagged rather than fixed.
 */
int main(int argc, char* argv[]) {

	int* bodies_off;
	int* n_bodies_split;
	int n_local_bodies;
	const MPI_Comm comm = MPI_COMM_WORLD;
	FILE *inputf;
	FILE *outputf;
	double clockStart, clockEnd;
	int rc, n_proc, rank;

	rc = MPI_Init(&argc, &argv);
	if (rc != MPI_SUCCESS) {
		puts("MPI_Init failed");

	MPI_Comm_size(comm, &n_proc);
	MPI_Comm_rank(comm, &rank);

	// create the datatype for MPI!
	MPI_Datatype bodytype;
	int block_len[6] = {1, 1, 3, 3, 3, 1};
	MPI_Aint disp[6];
	leaf_t example[2];

	/* Field addresses of a sample element; the last entry is taken from
	   example[1].  The role of that sixth block depends on the `type`
	   array, which is not visible here. */
	MPI_Get_address(&example[0], &disp[0]);
	MPI_Get_address(&(example[0].mass), &disp[1]);
	MPI_Get_address(&(example[0].pos), &disp[2]);
	MPI_Get_address(&(example[0].vel), &disp[3]);
	MPI_Get_address(&(example[0].acc), &disp[4]);
	MPI_Get_address(&(example[1].acc), &disp[5]);
//	int i;
//	for(i = 6; i >= 0; --i)
//		disp[i] -= disp[0];

	/* Rebase displacements 1..5 on the start of example[0].
	   NOTE(review): disp[0] itself keeps the absolute address in the
	   visible code, and no MPI_Type_commit call is visible after the
	   type is created. */
	disp[1] = disp[1] - disp[0];
	disp[2] = disp[2] - disp[0];
	disp[3] = disp[3] - disp[0];
	disp[4] = disp[4] - disp[0];
	disp[5] = disp[5] - disp[0];

	MPI_Type_create_struct(6, block_len, disp, type, &bodytype);

	/* NOTE(review): bodies and leafs are sized with nbodies before nbodies
	   is read from the input file further down -- confirm the intended
	   order.  None of the malloc results is checked in the visible code. */
	bodies_off = malloc((n_proc + 1) * sizeof(int));
	n_bodies_split = malloc((n_proc) * sizeof(int));
	bodies = malloc(nbodies * sizeof(node_t*));
	leafs = malloc(nbodies * sizeof(leaf_t));
	/* NOTE(review): argv[1] is used without a visible argc check. */
	char* inputfile = argv[1];
	inputf = fopen(inputfile, "r");

	if (inputf == NULL) {
		printf("impossibile leggere da file");

	/* Run parameters, in file order: body count, step count, time step,
	   eps and tol.  The fscanf return values are not checked. */
	fscanf(inputf, "%d", &nbodies);
	fscanf(inputf, "%d", &steps);
	fscanf(inputf, "%lf", &dt);
	fscanf(inputf, "%lf", &eps);
	fscanf(inputf, "%lf", &tol);


	/* ---- rank 0: sorts, broadcasts, simulates, times and writes ---- */
	if (rank == 0) {
		int i;


		quicksort(0, nbodies - 1);

		//	bublesort();
		//	int i = 0;
		//	for (i = 0; i < nbodies; i++) {
		//		printf("%lf, %lf, %lf \n", bodies[i]->pos[0], bodies[i]->pos[1],
		//				bodies[i]->pos[2]);
		//	}
		/* Integer division: any remainder of nbodies / n_proc is not
		   assigned to a rank in the visible code. */
		n_local_bodies = nbodies / n_proc;

		// split the particles following the shark & fish scheme
		//		split_bodies(n_proc, bodies_off, n_bodies_split);
		//		n_local_bodies = n_bodies_split[rank];
		//		MPI_Bcast(n_bodies_split, n_proc, MPI_INT, 0, comm);

		MPI_Bcast(leafs, nbodies, bodytype, 0, comm);

		dthf = 0.5 * dt;
		epssq = eps * eps;
		itolsq = 1.0 / (tol * tol);

		clockStart = MPI_Wtime();
		int step = 0;
		root = NULL;
		for (step = 0; step < steps; step++) {

			/* A fresh root cell is allocated on every step; no matching
			   free is visible in this listing. */
			root = malloc(sizeof(struct node_t)); // "new" is like "malloc"
			double mass_root = 0.0;

			root->type = 1;
			root->mass = &mass_root;
			root->pos = center;
			root->cell.childs[0] = NULL;
			root->cell.childs[1] = NULL;
			root->cell.childs[2] = NULL;
			root->cell.childs[3] = NULL;
			root->cell.childs[4] = NULL;
			root->cell.childs[5] = NULL;
			root->cell.childs[6] = NULL;
			root->cell.childs[7] = NULL;

			double radius = diameter * 0.5;

			int i = 0;
			for (i = 0; i < nbodies; i++) {
				insert(root, bodies[i], radius); // this is how the data is passed by reference, i.e. by handing over the address of the structure the pointer refers to
			curr = 0;

			/* Rank 0 handles the first n_local_bodies bodies. */
			for (i = 0; i < n_local_bodies; i++) {
				compute_force(&(*root), &(*bodies[i]), diameter, step);
			//		for (i = 0; i < nbodies; i++) {
			//		}


			// insert the all-gather here
			/* NOTE(review): leafs is passed as both send and receive
			   buffer; MPI normally requires MPI_IN_PLACE for that --
			   confirm. */
			MPI_Allgather(leafs, n_local_bodies, bodytype, leafs,
					n_local_bodies, bodytype, comm);

			for (i = 0; i < nbodies; i++) {

			//		int p = 0;
			//		for (p = 0; p < nbodies; p++)
			//			printf("%lf, %lf, %lf \n", bodies[p]->pos[0], bodies[p]->pos[1],
			//					bodies[p]->pos[2]);
			//		printf("*************************************** \n");
		//	int i = 0;
		// after the execution!!
		//		int proc_rec = 1;
		//		while (proc_rec < n_proc) {
		//			MPI_Status status;
		//			int proc_rank;
		//			int cap = nbodies / n_proc;
		//			node_t temp[cap];
		//			MPI_Recv(temp, cap, bodytype, MPI_ANY_SOURCE, MPI_ANY_TAG, comm,
		//					&status);
		//			proc_rank = status.MPI_SOURCE;
		//			int idx = 0;
		//			for (idx = proc_rec * (cap); idx < cap; idx++)
		//				*bodies[idx] = temp[idx];
		//			proc_rec++;
		//		}
		clockEnd = MPI_Wtime();
		/* The four branches below differ only in the output file name; each
		   appends the host name (via the shell), the elapsed time and all
		   body positions.  The fopen results are not checked and no fclose
		   is visible. */
		if (nbodies == 16384) {
			system("echo 'Host:' `hostname` >> output16384 ");
			outputf = fopen("output16384", "a");
			fprintf(outputf, "Tempo di esecuzione: %lf \n", clockEnd
					- clockStart);
			for (i = 0; i < nbodies; i++) {
				fprintf(outputf, "%lf, %lf, %lf \n", bodies[i]->pos[0],
						bodies[i]->pos[1], bodies[i]->pos[2]);
		} else if (nbodies == 32768) {
			system("echo 'Host:' `hostname` >> output32768 ");
			outputf = fopen("output32768", "a");
			fprintf(outputf, "Tempo di esecuzione: %lf \n", clockEnd
					- clockStart);
			for (i = 0; i < nbodies; i++) {
				fprintf(outputf, "%lf, %lf, %lf \n", bodies[i]->pos[0],
						bodies[i]->pos[1], bodies[i]->pos[2]);
		} else if (nbodies == 65536) {
			system("echo 'Host:' `hostname` >> output65536 ");
			outputf = fopen("output65536", "a");
			fprintf(outputf, "Tempo di esecuzione: %lf \n", clockEnd
					- clockStart);
			for (i = 0; i < nbodies; i++) {
				fprintf(outputf, "%lf, %lf, %lf \n", bodies[i]->pos[0],
						bodies[i]->pos[1], bodies[i]->pos[2]);
		} else {
			system("echo 'Host:' `hostname` >> output ");
			outputf = fopen("output", "a");
			fprintf(outputf, "Tempo di esecuzione: %lf \n", clockEnd
					- clockStart);
			for (i = 0; i < nbodies; i++) {
				fprintf(outputf, "%lf, %lf, %lf \n", bodies[i]->pos[0],
						bodies[i]->pos[1], bodies[i]->pos[2]);

		printf("Esecuzione completata\n");

	/* ---- all other ranks: receive the leaves and simulate their share ---- */
	} else {

		int low = 1, up = 0;
		int i;
		dthf = 0.5 * dt;
		epssq = eps * eps;
		itolsq = 1.0 / (tol * tol);

		//	if (PAPI_library_init(PAPI_VER_CURRENT) != PAPI_VER_CURRENT) {
		//		printf("Inizializzazione della libreria di papi fallita \n");
		//		exit(1);
		//	}
		//	if (PAPI_create_eventset(&event_set) != PAPI_OK) {
		//		printf("E' andata a male la creazione dell'eventSet \n");
		//		exit(1);
		//	}
		//	if (PAPI_add_events(event_set, events, 2) != PAPI_OK) {
		//		printf("E' andata a male l'aggiunta degli eventi\n");
		//		exit(1);
		//	}

		n_local_bodies = nbodies / n_proc;
		MPI_Bcast(leafs, nbodies, bodytype, 0, comm);
		int step = 0;
		root = NULL;

		/* This rank's half-open index range [low, up).
		   NOTE(review): low starts at 1, so the range is shifted by one
		   relative to rank 0's [0, n_local_bodies), and on the last rank
		   `up` reaches nbodies + 1 when nbodies divides evenly -- confirm
		   the intended offsets. */
		low += (rank * n_local_bodies);

		up = low + n_local_bodies;

		//	PAPI_start(event_set);
		//	clockStart = PAPI_get_real_usec();
		for (step = 0; step < steps; step++) {

			root = malloc(sizeof(struct node_t)); // "new" is like "malloc"

			root->type = 1;
			/* NOTE(review): root->mass is dereferenced here, but unlike the
			   rank-0 branch no assignment to root->mass is visible after
			   the malloc. */
			*(root->mass) = 0.0;
			root->pos = center;
			root->cell.childs[0] = NULL;
			root->cell.childs[1] = NULL;
			root->cell.childs[2] = NULL;
			root->cell.childs[3] = NULL;
			root->cell.childs[4] = NULL;
			root->cell.childs[5] = NULL;
			root->cell.childs[6] = NULL;
			root->cell.childs[7] = NULL;

			double radius = diameter * 0.5;

			/* Wrap every received leaf in a newly allocated node and insert
			   it into the tree; the nodes are allocated again on each step
			   and no free is visible. */
			for (i = 0; i < nbodies; i++) {
				bodies[i] = malloc(sizeof(node_t));
				bodies[i]->cell.leaf = &leafs[i];
				bodies[i]->mass = &leafs[i].mass;
				bodies[i]->pos = leafs[i].pos;
				insert(&(*root), &(*bodies[i]), radius); // this is how the data is passed by reference, i.e. by handing over the address of the structure the pointer refers to
			curr = 0;

			for (i = low; i < up; i++) {
				compute_force(&(*root), &(*bodies[i]), diameter, step);
			//		for (i = 0; i < nbodies; i++) {
			//		}


			/* Send this rank's slice of leafs and gather everyone's slices
			   back into leafs.
			   NOTE(review): the send buffer lies inside the receive
			   buffer -- confirm this aliasing is acceptable. */
			local_leafs = &leafs[low];
			// insert the all_gather here
			MPI_Allgather(local_leafs, up - low, bodytype, leafs, up - low,
					bodytype, comm);

			for (i = 0; i < nbodies; i++) {
			//		int p = 0;
			//		for (p = 0; p < nbodies; p++)
			//			printf("%lf, %lf, %lf \n", bodies[p]->pos[0], bodies[p]->pos[1],
			//					bodies[p]->pos[2]);
			//		printf("*************************************** \n");
		//	clockEnd = PAPI_get_real_usec();
		//	PAPI_stop(event_set, values);
		//	int i = 0;
		//		MPI_Send(bodies[low], up - low + 1, bodytype, 0, MPI_ANY_TAG, comm);


	/* No MPI_Finalize call is visible before the return in this listing. */
	return 0;
Example #23
 /*@@
   @routine    PUGH_Sync
   @date       Mon Jun 05 2000
   @author     Thomas Radke
               Finally synchronizes a variable or group of variables
               according to a given comm structure.
   @calls      PUGH_SyncSingleProc
@@*/
/*
 * Synchronise the variable group described by `comm`.
 *
 * pughGH: grid hierarchy; nprocs, commmodel, comm_time and variables are
 *         read from it.
 * comm:   communication descriptor; first_var, sync_timelevel, n_vars and
 *         the rreq/sreq request arrays are read from it.
 *
 * With a single processor the work is delegated to PUGH_SyncSingleProc and
 * its result is returned.  Otherwise, for every dimension of the first
 * variable's grid array, receives and sends are posted for both faces
 * (2*dir and 2*dir+1) and then completed according to pughGH->commmodel.
 * That path returns 0.
 *
 * NOTE(review): this listing has lost lines -- braces, at least one `else`,
 * a comment terminator and possibly preprocessor lines (CCTK_MPI is opened
 * twice but closed once).  The comments describe only what is visible.
 */
static int PUGH_Sync(pGH *pughGH,
                     pComm *comm)
#ifdef CCTK_MPI
  int dir;
  pGA *GA;
  MPI_Status mss;
  int i;
  MPI_Request *sr;
  double t1, t2;

  /* single-processor case in handled in separate routine */
  if (pughGH->nprocs == 1)
    return (PUGH_SyncSingleProc (pughGH, comm));

#ifdef CCTK_MPI

  /* start the timer for communication time */
  if (pughGH->comm_time >= 0)
    CCTK_TimerStartI (pughGH->comm_time);

  /* The first variable of the group supplies the dimension count and the
     name used in the trace output below. */
  GA = (pGA *) pughGH->variables [comm->first_var][comm->sync_timelevel];

  /* NOTE(review): the malloc result is not checked and no matching free of
     sr is visible in this listing. */
  if (pughGH->commmodel == PUGH_DERIVEDTYPES) 
    /* 2 faces, send and receive is the 2 * 2 */
    sr = (MPI_Request *) malloc(comm->n_vars * 2 * 2 * sizeof(MPI_Request));

  printf (" PUGH_Sync: syncing group of %d vars with first var '%s'\n",
          comm->n_vars, GA->name);
  fflush (stdout);

  /* One exchange per dimension; each phase is timed with MPI_Wtime and the
     durations are printed as PR (post receive), PS (post send) and FR. */
  for (dir = 0; dir < GA->extras->dim; dir ++) 

    t1 = MPI_Wtime();

    PostReceiveGA(pughGH, 2*dir, comm);
    PostReceiveGA(pughGH, 2*dir+1, comm);
    t2 = MPI_Wtime();
    printf("PR : %f\n",t2-t1);

    PostSendGA(pughGH, 2*dir, comm);
    PostSendGA(pughGH, 2*dir+1, comm);

    t1 = MPI_Wtime();
    printf("PS : %f\n",t1-t2);

    /* Now comes the big difference between derived types and
       allocated buffers. With derived types, we now have to
       wait on all our recieve AND SEND buffers so we can
       keep on using the send buffers ( as communications are
       in-place). With the allocated we have to wait on each
       recieve, but not on the send, since we don't need the
       send buffer until we pack a send again (above)
    if (pughGH->commmodel == PUGH_ALLOCATEDBUFFERS) 
      /* Do a wait any on the receives */
      MPI_Wait(&comm->rreq[2*dir], &mss);
      FinishReceiveGA(pughGH, 2*dir, comm);
      MPI_Wait(&comm->rreq[2*dir+1], &mss);
      FinishReceiveGA(pughGH, 2*dir+1, comm);
    else if (pughGH->commmodel == PUGH_DERIVEDTYPES) 
      /* Load up the thing for the waitall */
      /* Four request slots per variable: send/receive for face 2*dir and
         send/receive for face 2*dir+1.  The visible lines also store
         MPI_REQUEST_NULL in the same slots; the second half of each `if`
         condition and the `else` lines are missing from this listing. */
      for (i = 0; i < comm->n_vars; i++)
        int id = i * 2 * 2;
        pGA *GA = (pGA *) pughGH->variables [i][comm->sync_timelevel];

        if (GA->comm->docomm[2*dir] &&
          sr[id] = GA->comm->sreq[2*dir];
          sr[id+1] = GA->comm->rreq[2*dir];
          sr[id] = MPI_REQUEST_NULL;
          sr[id+1] = MPI_REQUEST_NULL;

        if (GA->comm->docomm[2*dir+1] &&
          sr[id+2] = GA->comm->sreq[2*dir+1];
          sr[id+3] = GA->comm->rreq[2*dir+1];
          sr[id+2] = MPI_REQUEST_NULL;
          sr[id+3] = MPI_REQUEST_NULL;
      /* Now do a waitall */
      /* NOTE(review): &mss points at a single MPI_Status, while MPI_Waitall
         is specified to fill one status per request (or to be given
         MPI_STATUSES_IGNORE) -- confirm. */
      MPI_Waitall(4*comm->n_vars, sr, &mss);

    t2 = MPI_Wtime();
    printf("FR : %f\n",t2-t1);


  /* NOTE(review): the condition below tests PUGH_DERIVEDTYPES although the
     explanation earlier in this routine says outstanding sends remain only
     in the allocated-buffers model -- confirm against the complete file.
     The argument list of this call is truncated in this listing. */
  if (pughGH->commmodel == PUGH_DERIVEDTYPES) 
    /* wait for MPI to finish all outstanding send requests */
    CACTUS_MPI_ERROR (MPI_Waitall (2 * GA->extras->dim, comm->sreq,

  /* get the time spent in communication */
  if (pughGH->comm_time >= 0)

#endif /* CCTK_MPI */

  return (0);
Example #24
/*
 * Per-process driver for an elimination over a 2D block partition of a matrix,
 * followed by a determinant reduction.
 *
 * For every global index k in [0, n) the routine exchanges the pivot element and
 * row/column data with other ranks (send_to, receive_from_left, MPI_Recv),
 * updates the local block A, and adds the elapsed wall-clock time of each phase
 * to comm_time or proc_time. After the loop, ranks whose two local_coords
 * entries are equal multiply their local diagonal entries together; MPI_Reduce
 * with MPI_PROD combines the per-rank products on rank 0 of *comm_new, and the
 * result is printed when computerStats is zero and local_rank is 0.
 *
 * Parameters:
 *   comm_new   - communicator pointer passed to send_to / receive_from_left and
 *                dereferenced for MPI_Recv and MPI_Reduce.
 *   A          - local block, indexed below as A[row * numRows + col].
 *   local_rank - only used to decide which rank prints the determinant.
 *   num_procs  - process count; sqrt(num_procs) is used as the grid side p.
 *
 * n, local_coords, comm_time, proc_time, computerStats, send_to,
 * receive_from_left and process_row_and_column are not declared in this
 * function; presumably they are file-scope names -- confirm against the full file.
 *
 * NOTE(review): this copy of the text does not show the closing braces of most
 * inner blocks; the comments below describe the visible statements only.
 */
void two_d_partitioning(MPI_Comm *comm_new, float *A, int local_rank, int num_procs) {
  MPI_Status status;
  int k, i, j, startingRow, endingRow, numRows, startingColumn, endingColumn, numColumns;
  // NOTE(review): n_startingRow, n_startingColumn and n_local_coords are not referenced below.
  int n_startingRow, n_startingColumn, n_local_coords[2];
  //long double determinant;
  double start, end, dt;
  // Side length of the process grid (truncated square root of num_procs).
  int p = (int) sqrt(num_procs);
  // NOTE(review): dis, the *_rank variables and req are not referenced below.
  int dis, left_rank, right_rank, up_rank, down_rank;
  MPI_Request req;
  // Each rank owns a square numRows x numRows block.
  numRows = n / p;
  numColumns = numRows;

  // Global row range [startingRow, endingRow) of this block, from local_coords[1].
  startingRow = local_coords[1] * numRows;
  endingRow = startingRow + numRows;

  // Global column range [startingColumn, endingColumn) of this block, from local_coords[0].
  startingColumn = local_coords[0] * numRows;
  endingColumn = startingColumn + numColumns;

  // This timestamp is overwritten at the top of every loop iteration.
  start = MPI_Wtime();
  for( k = 0; k < n; k++ ) {
    float Akk[1];
    // Index of k inside a block.
    int local_k = k % numRows;
    // Send A(k,k) to the right
    start = MPI_Wtime();
    if( k >= startingColumn && k < endingColumn && k >= startingRow && k < endingRow ) {
      // This block contains (k,k): send it and keep a local copy.
      send_to(comm_new, 0, A, 1, local_k, local_k, numRows);
      Akk[0] = A[local_k * numRows + local_k];
    } else if( k < startingColumn && k >= startingRow && k < endingRow ) {
      // k lies in this block's rows but left of its columns: receive A(k,k).
      receive_from_left(comm_new, 0, Akk, 1, 0, 0, numRows, k);
    end = MPI_Wtime();
    dt = end - start;
    comm_time += dt;

    // Now calculate the row
    start = MPI_Wtime();
    if( k >= startingColumn && k < endingColumn && k >= startingRow && k < endingRow ) {
      // Diagonal block: divide the entries right of the pivot by A(k,k).
      for( j = local_k + 1; j < numColumns; j++ ) {
        A[local_k * numRows + j] /= Akk[0];
    } else if( k >= startingRow && k < endingRow && k < startingColumn ) {
      // Block to the right of the pivot column: divide the whole local row.
      for( j = 0; j < numColumns; j++ ) {
        A[local_k * numRows + j] /= Akk[0];
    end = MPI_Wtime();
    dt = end - start;
    proc_time += dt;

    // Now calculate the box
    // bOutside stays 1 unless one of the branches below sets it to -1.
    int m, bOutside = 1; 
    // Variable-length buffer holding (part of) row k for this block's columns.
    float top_row[numRows]; 

    start = MPI_Wtime();
    // k is West of this Partition
    // NOTE(review): the condition mixes '&' with '&&'. Both '&' operands are 0/1
    // comparison results, so the value equals the '&&' form, but '&&' was
    // probably intended -- confirm.
    if( k >= startingRow && k < endingRow & k < startingColumn ) {
      send_to(comm_new, 1, A, numColumns, local_k, 0, numRows);
      for( m = 0; m < numColumns; m++ ) {
        top_row[m] = A[local_k * numRows + m];
      bOutside = -1;
    // k is in this BOX
    else if( k >= startingRow && k < endingRow && k >= startingColumn && k < endingColumn ) {
      // Number of entries to the right of the pivot inside this block.
      int size = numColumns - (local_k + 1);
      if( size != 0 ) {
        send_to(comm_new, 1, A, size, local_k, local_k + 1, numRows);

        for( m = 0; m < size; m++ ) {
          top_row[m] = A[local_k * numRows + local_k + 1 + m];
        bOutside = -1;
    } // k is NW of this box 
    else if( k < startingRow && k < startingColumn ) {
      int sender_row = k / numRows;
      // NOTE(review): sender_column is computed but not used in this branch.
      int sender_column = k / numColumns;
      int sender_rank = local_coords[0] * sqrt(num_procs) + sender_row;
      MPI_Recv(top_row, numColumns, MPI_FLOAT, sender_rank, 0, *comm_new, &status);
      bOutside = -1;
    // k is N of this box
    else if( k < startingRow && k >= startingColumn && k < endingColumn ) {
      int sender_row = k / numRows;
      int sender_column = k / numColumns;
      int sender_rank = sender_column * sqrt(num_procs) + sender_row;
      int size = numColumns - (local_k + 1);
      if( size != 0 ) { 
        //top_row = (float *)malloc(sizeof(float) * numberToReceive);
        //printf("%d Waiting to receive from:%d\n", local_rank, sender_rank);
        MPI_Recv(top_row, size, MPI_FLOAT, sender_rank, 0, *comm_new, &status);
        bOutside = -1;
    // Variable-length buffer holding (part of) column k for this block's rows.
    float left_row[numRows];
    // k is N of this Box
    // NOTE(review): same '&' vs '&&' mix as above -- confirm intent.
    if( k >= startingColumn && k < endingColumn & k < startingRow ) {
      // Gather column local_k of the local block into left_row and send it.
      for(m = 0; m < numRows; m++ ) {
        left_row[m] = A[m * numColumns + local_k];
      send_to(comm_new, 0, left_row, numRows, 0, 0, 0);
      bOutside = -1;
    // k is IN this box 
    else if( k >= startingRow && k < endingRow && k >= startingColumn && k < endingColumn ) {
      //int local_k = k % numRows;
      int size = numColumns - (local_k + 1);
      if( size != 0 ) {
        // NOTE(review): the index on the right-hand side does not depend on m, so
        // every left_row[m] receives the same element; (local_k + 1 + m) may
        // have been intended -- confirm.
        for(m = 0; m < size; m++ ) {
          left_row[m] = A[(local_k + 1) * numColumns + local_k];
        send_to(comm_new, 0, left_row, size, 0, 0, 0);
        bOutside = -1;
    // k is SW from this box
    else if( k < startingRow && k < startingColumn ) {
      int sender_row = k / numRows;
      int sender_column = k / numColumns;
      int sender_rank = sender_column * sqrt(num_procs) + local_coords[1];
      MPI_Recv(left_row, numColumns, MPI_FLOAT, sender_rank, 0, *comm_new, &status);
      bOutside = -1;
    // k is W of this box
    else if( k < startingColumn && k >= startingRow && k < endingRow ) {
      int sender_row = k / numRows;
      int sender_column = k / numColumns;
      int sender_rank = sender_column * sqrt(num_procs) + local_coords[1];
      // Shadows the outer local_k with the same value.
      int local_k = k % numRows;
      int numberToReceive = numColumns - (local_k + 1);
      if( numberToReceive != 0 ) { 
        MPI_Recv(left_row, numberToReceive, MPI_FLOAT, sender_rank, 0, *comm_new, &status);
        bOutside = -1;
    end = MPI_Wtime();
    dt = end - start;
    comm_time += dt;

    // Now process the box
    // Only update the local block when one of the branches above marked it (bOutside == -1).
    if( bOutside < 0 ) {
      start = MPI_Wtime();
      process_row_and_column(A, left_row, top_row, k, startingRow, endingRow, startingColumn, endingColumn, numRows, numColumns, local_k);
      end = MPI_Wtime();
      dt = end - start;
      proc_time += dt;
  } // end for

  // One-element buffers for the MPI_PROD reduction; the local factor defaults to 1.
  float determinant[1];
  float result[1];
  determinant[0] = 1;
  // Ranks with equal grid coordinates multiply their local diagonal entries.
  if( local_coords[0] == local_coords[1] ) {
    start = MPI_Wtime();
    for(i = 0; i < numRows; i++ ) {
      determinant[0] *= A[i * numRows + i];
    end = MPI_Wtime();
    dt = end - start;
    proc_time += dt;
  start = MPI_Wtime();
  // Product of all per-rank factors ends up in result[0] on rank 0 of *comm_new.
  MPI_Reduce(determinant, result, 1, MPI_FLOAT, MPI_PROD, 0, *comm_new);
  end = MPI_Wtime();
  dt = end - start;
  comm_time += dt;
  if( !computerStats && local_rank == 0 ) {
    printf("Determinant is %f\n", result[0]);
Example #25
/*
 * Two-process bidirectional bandwidth benchmark.
 *
 * For each message size (doubling from 1 up to MAX_MSG_SIZE) both ranks post
 * window_size non-blocking receives followed by window_size non-blocking sends
 * to each other and wait for all of them, repeated loop + skip times. Rank 0
 * takes its start timestamp when the iteration counter reaches skip, and prints
 * size / 1e6 * loop * window_size * 2 divided by the elapsed time under the
 * "Bi-Bandwidth (MB/s)" column.
 *
 * s_buf1, r_buf1, loop, skip, window_size, their *_large counterparts,
 * large_message_size, send_request, recv_request and reqstat are not declared
 * in this function; presumably they are file-scope names -- confirm.
 *
 * NOTE(review): this copy of the text does not show the function's opening
 * brace, the closing braces of its inner blocks, or any MPI_Finalize call;
 * confirm against the full source.
 */
int main(int argc, char *argv[])
    int myid, numprocs, i, j;
    int size, align_size;
    char *s_buf, *r_buf;
    double t_start = 0.0, t_end = 0.0, t = 0.0;

    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
    MPI_Comm_rank(MPI_COMM_WORLD, &myid);

    align_size = getpagesize();
    assert(align_size <= MAX_ALIGNMENT);

    /* round s_buf1 / r_buf1 up to the next multiple of the page size */
    s_buf =
        (char *) (((unsigned long) s_buf1 + (align_size - 1)) /
                  align_size * align_size);
    r_buf =
        (char *) (((unsigned long) r_buf1 + (align_size - 1)) /
                  align_size * align_size);

    /* the exchange pattern below is hard-wired to ranks 0 and 1 */
    if(numprocs != 2) {
        if(myid == 0) {
            fprintf(stderr, "This test requires exactly two processes\n");


        return EXIT_FAILURE;

    if(myid == 0) {
        fprintf(stdout, HEADER);
        fprintf(stdout, "%-*s%*s\n", 10, "# Size", FIELD_WIDTH,
                "Bi-Bandwidth (MB/s)");

    for(size = 1; size <= MAX_MSG_SIZE; size *= 2) {
        /* touch the data */
        for(i = 0; i < size; i++) {
            s_buf[i] = 'a';
            r_buf[i] = 'b';

        /* switch to the "large message" iteration parameters */
        if(size > large_message_size) {
            loop = loop_large;
            skip = skip_large;
            window_size = window_size_large;

        if(myid == 0) {
            for(i = 0; i < loop + skip; i++) {
                /* start timing once the first `skip` iterations are done */
                if(i == skip) {
                    t_start = MPI_Wtime();

                /* rank 0 receives with tag 10 and sends with tag 100 */
                for(j = 0; j < window_size; j++) {
                    MPI_Irecv(r_buf, size, MPI_CHAR, 1, 10, MPI_COMM_WORLD,
                            recv_request + j);

                for(j = 0; j < window_size; j++) {
                    MPI_Isend(s_buf, size, MPI_CHAR, 1, 100, MPI_COMM_WORLD,
                            send_request + j);

                /* both waits write their statuses into the same reqstat array */
                MPI_Waitall(window_size, send_request, reqstat);
                MPI_Waitall(window_size, recv_request, reqstat);

            t_end = MPI_Wtime();
            t = t_end - t_start;


        else if(myid == 1) {
            for(i = 0; i < loop + skip; i++) {
                /* rank 1 mirrors rank 0: receives tag 100, sends tag 10 */
                for(j = 0; j < window_size; j++) {
                    MPI_Irecv(r_buf, size, MPI_CHAR, 0, 100, MPI_COMM_WORLD,
                            recv_request + j);

                for (j = 0; j < window_size; j++) {
                    MPI_Isend(s_buf, size, MPI_CHAR, 0, 10, MPI_COMM_WORLD,
                            send_request + j);

                MPI_Waitall(window_size, send_request, reqstat);
                MPI_Waitall(window_size, recv_request, reqstat);

        if(myid == 0) {
            /* megabytes moved in both directions during the timed iterations */
            double tmp = size / 1e6 * loop * window_size * 2;

            fprintf(stdout, "%-*d%*.*f\n", 10, size, FIELD_WIDTH,
                    FLOAT_PRECISION, tmp / t);


    return EXIT_SUCCESS;
Example #26
/*
 * Contiguous read of `count` elements of `datatype` using pread().
 *
 * The byte length is datatype_size * count. When file_ptr_type is
 * ADIO_INDIVIDUAL the read starts at fd->fp_ind instead of the `offset`
 * argument. Data is read in a loop, each pread() capped at INT_MAX bytes, until
 * `len` bytes have been transferred. Afterwards fd->fp_sys_posn is set to
 * offset + bytes_xfered, fd->fp_ind is advanced for ADIO_INDIVIDUAL, the byte
 * count is stored in `status` (when err != -1) and *error_code is set.
 * When gpfsmpio_timing is non-zero, the data size and elapsed times are added
 * to gpfsmpio_prof_cr[].
 *
 * Parameters:
 *   fd            - file handle; fields null_fd, fd_sys, fp_ind and fp_sys_posn
 *                   are read and/or written here.
 *   buf           - destination buffer (see the note on `p` below).
 *   count         - number of elements of `datatype` to read.
 *   datatype      - element type; its size comes from MPI_Type_size_x.
 *   file_ptr_type - ADIO_INDIVIDUAL selects the individual file pointer.
 *   offset        - starting offset when the individual pointer is not used.
 *   status        - receives the number of bytes read.
 *   error_code    - receives the resulting error code.
 *
 * NOTE(review): this copy of the text does not show the function's opening
 * brace, several closing braces, or possibly some statements (else / break /
 * return); the notes below mark the places where that matters.
 */
void ADIOI_GEN_ReadContig(ADIO_File fd, void *buf, int count, 
			  MPI_Datatype datatype, int file_ptr_type,
			  ADIO_Offset offset, ADIO_Status *status,
			  int *error_code)
    ssize_t err = -1;
    MPI_Count datatype_size;
    ADIO_Offset len, bytes_xfered=0;
    size_t rd_count;
    static char myname[] = "ADIOI_GEN_READCONTIG";
    double io_time=0;
    char *p;

    MPE_Log_event (5034, 0, NULL);
    MPI_Type_size_x(datatype, &datatype_size);
    /* total number of bytes requested */
    len = datatype_size * (ADIO_Offset)count;

    io_time = MPI_Wtime();
    if (gpfsmpio_timing) {
	gpfsmpio_prof_cr[ GPFSMPIO_CIO_DATA_SIZE ] += len;

    if (file_ptr_type == ADIO_INDIVIDUAL) {
	offset = fd->fp_ind;

    /* NOTE(review): no assignment to p is visible before this loop, and buf is
       not referenced in the visible body; confirm p is initialised (presumably
       from buf) in the full source. */
    while (bytes_xfered < len) {
	MPE_Log_event( ADIOI_MPE_read_a, 0, NULL );
	rd_count = len - bytes_xfered;
	/* stupid FreeBSD and Darwin do not like a count larger than a signed
           int, even though size_t is eight bytes... */
        if (rd_count > INT_MAX)
            rd_count = INT_MAX;
	/* NOTE(review): as shown, both pread() calls run back to back; an `else`
	   between them was presumably lost from this copy -- confirm. */
	if (gpfsmpio_devnullio)
	    err = pread(fd->null_fd, p, rd_count, offset+bytes_xfered);
	    err = pread(fd->fd_sys, p, rd_count, offset+bytes_xfered);
	if (err == -1) {
	    *error_code = MPIO_Err_create_code(MPI_SUCCESS,
		    myname, __LINE__,
		    MPI_ERR_IO, "**io",
		    "**io %s", strerror(errno));
	    fd->fp_sys_posn = -1;
	    /* NOTE(review): no return/break is visible after this error path;
	       as shown, *error_code is later overwritten with MPI_SUCCESS -- confirm. */
	if (err == 0) {
	    /* end of file */

	MPE_Log_event( ADIOI_MPE_read_b, 0, NULL );
	/* advance by the number of bytes pread() reported */
	bytes_xfered += err;
	p += err;
    if (gpfsmpio_timing) gpfsmpio_prof_cr[ GPFSMPIO_CIO_T_POSI_RW ] += (MPI_Wtime() - io_time);
    fd->fp_sys_posn = offset + bytes_xfered;

    if (file_ptr_type == ADIO_INDIVIDUAL) {
	fd->fp_ind += bytes_xfered; 

    /* what if we only read half a datatype? */
    /* bytes_xfered could be larger than int */
    if (err != -1) MPIR_Status_set_bytes(status, datatype, bytes_xfered);

    *error_code = MPI_SUCCESS;
    MPE_Log_event (5035, 0, NULL);
    if (gpfsmpio_timing) gpfsmpio_prof_cr[ GPFSMPIO_CIO_T_MPIO_RW ] += (MPI_Wtime() - io_time);
Example #27
/*
 * MPI quadratic-sieve driver.
 *
 * Visible steps: a probable-prime check on N; a sieve of Eratosthenes over
 * [2, n) whose hits are compacted into `primes`; a trivial factorisation
 * attempt; construction of the factor base; a master/worker sieving phase
 * (rank 0 runs master(), the other ranks call smart_sieve() on successive
 * intervals until it returns non-zero and then return IM_A_SLAVE); conversion
 * of the collected exponents to a bit matrix and to mpz values; Gaussian
 * elimination; and a final factorization() call that stores a factor in m.
 * The time spent in the factor-base, sieving and Gaussian-elimination phases
 * is printed.
 *
 * Parameters:
 *   N          - number to factor.
 *   n          - upper bound of the prime search; also the length of `primes`.
 *   interval   - width of the interval handed to smart_sieve() per call.
 *   max_fact   - passed to init_matrix/init_vector_mpz sizes, master() and
 *                smart_sieve().
 *   block_size - passed through to smart_sieve().
 *   m          - output: receives the factor that was found.
 *   print_fact - passed through to master().
 *
 * Returns NUM_PRIMO when N is a probable prime, IM_A_SLAVE on non-zero ranks,
 * and OK when a factor is found on rank 0. The remaining return path is past
 * the end of the visible text.
 *
 * NOTE(review): this copy of the text does not show several closing braces and
 * ends inside the final `else`. No mpz_init call is visible for `s`, `begin` or
 * `counter`, and no free/clear calls are visible for the allocations; confirm
 * against the full source.
 */
unsigned long quadratic_sieve(mpz_t N, 
			      unsigned int n, 
			      unsigned interval,
			      unsigned int max_fact,
			      unsigned int block_size,
			      mpz_t m,
			      unsigned int print_fact) {
  double t1, t2;
  int rank;
  int comm_size;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, & comm_size);

  /* Check with the Rabin pseudoprimality test */
  if(mpz_probab_prime_p(N, 25)) {
    return NUM_PRIMO;

  /* Integer square root of N */
  mpz_t s;
  mpz_sqrt(s, N); 

  t1 = MPI_Wtime();
  /* Find the primes in [2, n] */
  unsigned int * primes = malloc(sizeof(unsigned int) * n);  
  eratosthenes_sieve(primes, n);
  /* Compact the prime numbers within primes */
  unsigned j = 0;
  for(int i = 2; i < n; ++i)
    if(primes[i] == 1) {
      primes[j++] = i;

  unsigned int n_all_primes = j;
  /* Factorization performed by every rank; the slaves return
     IM_A_SLAVE while the main rank returns the factor */
  unsigned int simple_factor = trivial_fact(N, primes, n_all_primes);
  if(simple_factor != 0) {
    mpz_set_ui(m, simple_factor);
    return rank == 0 ? OK : IM_A_SLAVE;

  /* Compute the factor base and the solutions of the equation x^2 = N mod p */
  pair * solutions = malloc(sizeof(pair) * n_all_primes);
  /* factor_base aliases primes: the factor base is written over the same array */
  unsigned int * factor_base = primes;

  unsigned n_primes = base_fattori(N, s, factor_base, solutions,
				  primes, n_all_primes);
  t2 = MPI_Wtime();
  double t_base = t2 - t1;
  if(rank == 0)
    printf("#Dimensione base di fattori: %d\n", n_primes);

  /* Vector of exponents in Z */
  unsigned int ** exponents;
  /* Vector of the (Ai + s) values */
  mpz_t * As;
  /* Sieve part: find the k+n complete factorizations */
  unsigned int n_fatt;

  t1 = MPI_Wtime();
  if(rank == 0){
    /* Initialize the vectors */
    init_matrix(& exponents, n_primes + max_fact, n_primes);
    init_vector_mpz(& As, n_primes + max_fact);

    /* Master procedure that receives the complete factorizations */
    n_fatt = master(n_primes, max_fact, exponents, As, comm_size, print_fact);
  } else {
    mpz_t begin;
    mpz_t counter;

    /* Worker `rank` starts at interval * (rank - 1) */
    mpz_set_ui(begin, interval * (rank - 1));
    //gmp_printf("%d) begin=%Zd interval=%d\n", rank, begin, interval);

    int stop_flag = 0;
    do {
      //gmp_printf("\t%d) [%Zd, %Zd+%d] - (flag=%d)\n", rank, begin, begin, interval, flag);
      stop_flag = smart_sieve(N, factor_base, n_primes, solutions,
		  begin, interval,
		  block_size, max_fact);
      /* Skip ahead past the intervals handled by the other comm_size-1 workers */
      mpz_add_ui(begin, begin, interval * (comm_size-1));
    } while(!stop_flag);

    //printf("#%d) Termina\n", rank);

    return IM_A_SLAVE;
  t2 = MPI_Wtime();
  double t_sieve = t2 - t1;
  printf("#Numero fattorizzazioni complete trovate: %d\n", n_fatt);
  t1 = MPI_Wtime();
  /* Matrix of exponents in Z_2 organized in blocks of bits */ 
  word ** M;
  /* Number of bit blocks to use */
  unsigned long n_blocchi = n_primes / N_BITS + 1;
  /* Initialize the exponents mod 2 */
  init_matrix_l(& M, n_fatt, n_blocchi);
  for(int i = 0; i < n_fatt; ++i)
    for(int j = 0; j < n_primes; ++j) {
      unsigned int a = get_matrix(exponents, i, j);
      set_k_i(M, i, j, a);

  /* Vector with the info (rightmost bit and number of bits set to 1) about M */
  struct row_stats * wt = malloc(sizeof(struct row_stats) * n_fatt);
  for(int i = 0; i < n_fatt; ++i)
    get_wt_k(M, i, n_primes, & wt[i]);

  /* In the Gaussian step the summed exponents can overflow,
     so they are converted to mpz */
  mpz_t ** exponents_mpz;
  mpz_t temp;
  mpz_init_set_ui(temp, 2);

  unsigned int a; 
  init_matrix_mpz(& exponents_mpz, n_fatt, n_primes);
  for(unsigned i = 0; i < n_fatt; ++i)
    for(unsigned j = 0; j < n_primes; ++j) {
      a = get_matrix(exponents, i, j);
      mpz_set_ui(temp, a);
      set_matrix_mpz(exponents_mpz, i, j, temp);
  /* Gaussian elimination */
  gaussian_elimination(exponents_mpz, M, As, N, 
		       n_fatt, n_primes, n_blocchi, wt);
  t2 = MPI_Wtime();
  double t_gauss = t2 - t1;

  /* A non-trivial factor of N is returned in m */
  unsigned int n_fact_non_banali = factorization(N, factor_base, 
						 M, exponents_mpz, 
						 As, wt, n_fatt, 
						 n_primes, m);
  /* Report the per-phase timings and their sum */
  printf("#time_base time_sieve time_gauss time_totale\n");
  printf("%.6f ", t_base);
  printf("%.6f ", t_sieve);
  printf("%.6f ", t_gauss);
  printf("%.6f\n", t_base + t_gauss + t_sieve);
  if(n_fact_non_banali > 0) {
    return OK;
  else {
Example #28
/*
 * LU decomposition with partial pivoting on a block-distributed matrix.
 *
 * For every diagonal index `diag` in [0, info->n):
 *   1. the process column pivot_h = diag / blksz selects, among local rows with
 *      reorder[i] > diag, the entry of largest absolute value in the pivot
 *      column and combines the candidates with MPI_Allreduce (best_pivot_op)
 *      over info->vcomm;
 *   2. the chosen pivot is forwarded with pipeline_right();
 *   3. the process row owning pivot.row records reorder[...] = diag;
 *   4. multipliers m[r] = X(r, pivot_c) / pivot.value are computed on the pivot
 *      column, stored back into X and forwarded with pipeline_right();
 *   5. for each remaining local column the pivot-row value `a` is passed from
 *      process to process on info->vcomm (receive from `up`, send to `down`)
 *      and every row with reorder[r] > diag is updated by X(r,c) -= m[r] * a.
 *
 * Parameters:
 *   info          - problem description; fields blksz, n, sqp, coords[] and
 *                   vcomm are read here.
 *   X             - local matrix block, accessed through the CELL macro.
 *   reorder       - per-local-row pivot order; rows not yet used as a pivot
 *                   compare greater than `diag`.
 *   pivot_type    - MPI datatype describing struct pivot.
 *   best_pivot_op - MPI reduction operator selecting the best pivot.
 *
 * pivot_time, pivot_allr_time, pivot_bcast_time, m_bcast_time and a_bcast_time
 * are not declared here; presumably they are file-scope accumulators -- confirm.
 *
 * NOTE(review): this copy of the text does not show the function's opening
 * brace or most closing braces. No free(m) and no wait on req_spiv / req_sm is
 * visible; confirm against the full source.
 */
void LU_decomp(struct problem *info, struct fmatrix *X, int *reorder, MPI_Datatype pivot_type, MPI_Op best_pivot_op)
	MPI_Request req_spiv, req_sa, req_sm;
	MPI_Status status;
	/* one multiplier per local row */
	number_type *m = malloc(info->blksz * sizeof(*m));
	int diag;
	for (diag = 0; diag < info->n; diag++) {
		/* we do partial pivoting, so the proc with the pivot is on this column: */
		int pivot_h = diag / info->blksz;
		int r, c, i;
		double start_time = MPI_Wtime();
		double start_time2;

		struct pivot pivot = { -1, 0. };
		/* choose pivot across the column */
		if (info->coords[HDIM] == pivot_h) {
			/* column with pivot in block */
			int pivot_c = diag % info->blksz;
			/* Argo doesn't want aliasing in allreduce */
			struct pivot pivot_cand = { -1, 0. };
			for (i = 0; i < info->blksz; i++) {
				if (reorder[i] > diag && fabs(CELL(X, i, pivot_c)) > fabs(pivot_cand.value)) {
					/* store the global row index of the candidate */
					pivot_cand.row = info->blksz*info->coords[VDIM] + i;
					pivot_cand.value = CELL(X, i, pivot_c);
			start_time2 = MPI_Wtime();
			MPI_Allreduce(&pivot_cand, &pivot, 1, pivot_type, best_pivot_op, info->vcomm);
			pivot_allr_time += MPI_Wtime() - start_time2;
		/* broadcast pivot choice across row towards the right */
		start_time2 = MPI_Wtime();
		pipeline_right(info, pivot_h, &pivot, 1, pivot_type, 45, &req_spiv);
		pivot_bcast_time += MPI_Wtime() - start_time2;
		pivot_time += MPI_Wtime() - start_time;
		/* find rank of proc with pivot on the vertical communicator */
		int pivot_v = pivot.row / info->blksz;
		/* fill in reorder */
		if (info->coords[VDIM] == pivot_v) {
			reorder[pivot.row % info->blksz] = diag;
		/* calculate and distribute the ms */
		for (r = 0; r < info->blksz; r++) {
			if (reorder[r] > diag) {
				if (info->coords[HDIM] == pivot_h) {
					int pivot_c = diag % info->blksz;
					m[r] = CELL(X, r, pivot_c) / pivot.value;
					/* the multiplier is stored in place of the eliminated entry */
					CELL(X, r, pivot_c) = m[r];
				/* broadcast m towards right */
				start_time = MPI_Wtime();
				pipeline_right(info, pivot_h, &m[r], 1, MPI_number_type, 64, &req_sm);
				m_bcast_time += MPI_Wtime() - start_time;
		/* distribute the pivot row and eliminate */
		/* first local column to update: all of them by default, the ones after
		   diag on the pivot column, none on process columns left of the pivot */
		int startc = 0;
		if (info->coords[HDIM] == pivot_h) startc = (diag+1) % info->blksz;
		if (info->coords[HDIM] < pivot_h) startc = info->blksz;
		/* elimination */
		for (c = startc; c < info->blksz; c++) {
			number_type a;
			if (info->coords[VDIM] == pivot_v) {
				a = CELL(X, pivot.row % info->blksz, c);
			start_time = MPI_Wtime();
			/* neighbours on the vertical communicator, with wrap-around */
			int up = (info->coords[VDIM]+info->sqp-1)%info->sqp;
			int down = (info->coords[VDIM]+1)%info->sqp;
			if (info->coords[VDIM] != pivot_v) {
				MPI_Recv(&a, 1, MPI_number_type, up, 78, info->vcomm, &status);
			/* stop forwarding once the value would return to the pivot owner */
			if (down != pivot_v) {
				MPI_Isend(&a, 1, MPI_number_type, down, 78, info->vcomm, &req_sa);
			a_bcast_time += MPI_Wtime() - start_time;
			for (r = 0; r < info->blksz; r++) {
				if (reorder[r] > diag) {
					CELL(X, r,c) -= m[r]*a;
			/* complete the send before `a` goes out of scope */
			if (down != pivot_v) MPI_Wait(&req_sa, &status);
Example #29
/*
 * Times scattering n contiguous doubles from src into dest at a spacing of
 * `stride` elements, first with a hand-written loop and then with MPI_Unpack
 * using an MPI_Type_vector(n, 1, stride, MPI_DOUBLE) datatype.
 *
 * Each variant is timed NTRIALS times, every trial running N_REPS repetitions.
 * On success *avgTimeUser and *avgTimeMPI receive mean(t, NTRIALS) / N_REPS for
 * the respective variant. If noise(t, NTRIALS) exceeds VARIANCE_THRESHOLD for
 * either variant, both outputs are set to 0.
 *
 * Parameters:
 *   n           - number of doubles to copy.
 *   stride      - spacing, in elements, between consecutive writes into dest.
 *   avgTimeUser - output: per-repetition time of the hand-written loop.
 *   avgTimeMPI  - output: per-repetition time of the MPI_Unpack variant.
 *   dest        - strided destination buffer.
 *   src         - contiguous source buffer.
 *
 * Returns 0 on every visible path.
 *
 * verbose, noise() and mean() are not declared here; presumably they are
 * defined elsewhere in the file -- confirm.
 *
 * NOTE(review): this copy of the text does not show the function's opening
 * brace, its closing braces, or any MPI_Type_commit / MPI_Type_free call for
 * vectype; confirm against the full source.
 */
int TestVecUnPackDouble(int n, int stride,
                        double *avgTimeUser, double *avgTimeMPI, double *dest, const double *src)
    double *restrict d_dest;
    const double *restrict d_src;
    register int i, j;
    int rep, position;
    double t1, t2, t[NTRIALS];
    MPI_Datatype vectype;

    /* User code */
    if (verbose)
        printf("TestVecUnPackDouble (USER): ");
    for (j = 0; j < NTRIALS; j++) {
        t1 = MPI_Wtime();
        for (rep = 0; rep < N_REPS; rep++) {
            i = n;
            d_dest = dest;
            d_src = src;
            /* read src contiguously, write dest every `stride` elements */
            while (i--) {
                *d_dest = *d_src++;
                d_dest += stride;
        /* elapsed time of this trial (all N_REPS repetitions) */
        t2 = MPI_Wtime() - t1;
        t[j] = t2;
        if (verbose)
            printf("%.3f ", t[j]);
    if (verbose)
        printf("[%.3f]\n", noise(t, NTRIALS));
    /* If there is too much noise, discard the test */
    if (noise(t, NTRIALS) > VARIANCE_THRESHOLD) {
        *avgTimeUser = 0;
        *avgTimeMPI = 0;
        if (verbose)
            printf("Too much noise; discarding measurement\n");
        return 0;
    *avgTimeUser = mean(t, NTRIALS) / N_REPS;

    /* MPI Vector code */
    MPI_Type_vector(n, 1, stride, MPI_DOUBLE, &vectype);

    if (verbose)
        printf("TestVecUnPackDouble (MPI): ");
    for (j = 0; j < NTRIALS; j++) {
        t1 = MPI_Wtime();
        for (rep = 0; rep < N_REPS; rep++) {
            /* restart from the beginning of the packed buffer each repetition */
            position = 0;
            MPI_Unpack((void *) src, n * sizeof(double),
                       &position, dest, 1, vectype, MPI_COMM_SELF);
        t2 = MPI_Wtime() - t1;
        t[j] = t2;
        if (verbose)
            printf("%.3f ", t[j]);
    if (verbose)
        printf("[%.3f]\n", noise(t, NTRIALS));
    /* If there is too much noise, discard the test */
    if (noise(t, NTRIALS) > VARIANCE_THRESHOLD) {
        *avgTimeUser = 0;
        *avgTimeMPI = 0;
        if (verbose)
            printf("Too much noise; discarding measurement\n");
    } else {
        *avgTimeMPI = mean(t, NTRIALS) / N_REPS;


    return 0;
Example #30
int main(int argc, char *argv[])
  int rc, i, j = 0, rid, ret;
  armci_ckpt_ds_t ckptds;
  ARMCI_Group grp;

  ARMCI_Init_args(&argc, &argv);
  nproc = armci_msg_nproc();
  me = armci_msg_me();

  if (me == 0) {
    if (nproc > MAXPROCS) {
      ARMCI_Error("nproc > MAXPROCS", nproc);
    else {
      printf("ARMCI test program (%d processes)\n", nproc);

  size = SIZE_;
  rc = ARMCI_Malloc((void **)ptr_arr, size * 8);
  printf("ARMCI test program (%d processes)\n", nproc);
  for (size = 1; size <= SIZE_; size *= 2) {
    t1 = MPI_Wtime();
    for (i = 0; i < 5; i++) {
      for (rc = 0; rc < 15; rc++) {
    time_array[j++] = MPI_Wtime() - t1;
    printf("%d:done for size %ld\n", me, size);

  (void)ARMCI_Ckpt_create_ds(&ckptds, 1);
  ckptds.ptr_arr[0] = ptr_arr[me];[0] = SIZE_ * 8;
  rid = ARMCI_Ckpt_init(NULL, &grp, 1, 0, &ckptds);
  printf("%d: After ARMCI_Ckpt_init(): \n", me);

  j = 0;
  for (size = 128; size <= SIZE_; size *= 2) {

    int rc;
    int simulate_restart = 1;
    t1 = MPI_Wtime();

    ret = ARMCI_Ckpt(rid);
    if (ret == ARMCI_CKPT) {
      printf("%d: Performed CHECKPOINT @ size=%ld\n", me, size);
    else if (ret == ARMCI_RESTART) {
      simulate_restart = 0;
      printf("%d: Performed RESTART @ size=%ld\n", me, size);

    for (i = 0; i < 5; i++) {
      for (rc = 0; rc < 15; rc++)
        if (i == 3 && rc == 10) {

    time_array1[j++] = MPI_Wtime() - t1;

    if (simulate_restart && size == FAILURE_SIZE_) {
      printf("%d: Simulating FAILURE @ size = %d\n", me, size);
      ARMCI_Restart_simulate(rid, 1);

    printf("%d: DONE for size=%ld regular=%f withckpt=%f\n\n",
           me, size, time_array[j-1], time_array1[j-1]);



  printf("Before Finalize()\n");