Code Example #1
void myConvKernel_naive()
{
    float *filterOutput_buf = (float*) _mm_malloc(sizeof(float) * outputSize, 512); 
    assert(filterOutput_buf != NULL);
    
    memset(outputPlanes, 0, sizeof(float) * outputSize * nOutputPlanes);
    
    #pragma omp parallel
    {
        int tid = omp_get_thread_num();
        int nthreads = omp_get_num_threads();
        
        int ioHeight_spos = BLOCK_LOW(tid, nthreads, ioHeight);
        int ioHeight_epos = BLOCK_LOW(tid + 1, nthreads, ioHeight);
        
        int oS_spos = ioHeight_spos * ioWidth;
        int oS_size = (ioHeight_epos - ioHeight_spos) * ioWidth;
      
        for (int opIndex = 0; opIndex < nOutputPlanes; opIndex++)
        {
            float *filterOutput = filterOutput_buf;                    
            float *outputPlane = outputPlanes + opIndex * outputSize; 
            
            for (int ipIndex = 0; ipIndex < nInputPlanes; ipIndex++)
            {
                int wMatIndex = nInputPlanes * opIndex + ipIndex;
                float *inputPlane = inputPlanes + ipIndex * paddedInSize;
                float *weightMatrix = weights + wMatIndex * wSize;
                
                convolve3x3withPad(
                    inputPlane, filterOutput, weightMatrix,
                    ioHeight_spos, ioHeight_epos
                );

                addVec(oS_size, filterOutput + oS_spos, outputPlane + oS_spos);
            }
        }
        
        #pragma omp barrier
        
        #pragma omp for
        for (int opIndex = 0; opIndex < nOutputPlanes; opIndex++)
        {
            int wMatIndex = nInputPlanes * opIndex;
            float *outputPlane = outputPlanes + opIndex * outputSize;    
            addBias(outputSize, (float)(biases[opIndex]), outputPlane); 
            scaleIfLessThanX(outputSize, outputPlane, 0.0, 0.1);  
        }
    }

    _mm_free(filterOutput_buf);
}
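The helper routines called in this example (convolve3x3withPad, addVec, addBias, scaleIfLessThanX) are not part of the listing. As a rough guide to what they do, here is a minimal, hypothetical sketch of the three element-wise helpers, with signatures inferred from the call sites above; the project's real versions are presumably vectorized:

/* Hypothetical sketches of the element-wise helpers used in Code Example #1.
 * Signatures are inferred from the call sites; the originals are likely
 * SIMD-optimized. */
static void addVec(int n, const float *src, float *dst)
{
    for (int i = 0; i < n; i++)
        dst[i] += src[i];                /* accumulate one filter's output */
}

static void addBias(int n, float bias, float *dst)
{
    for (int i = 0; i < n; i++)
        dst[i] += bias;                  /* add the per-plane bias */
}

/* Leaky-ReLU-style post-processing: values below x are scaled by factor. */
static void scaleIfLessThanX(int n, float *dst, float x, float factor)
{
    for (int i = 0; i < n; i++)
        if (dst[i] < x)
            dst[i] *= factor;
}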
Code Example #2
File: test-vecvecmul.c   Project: gpaulsen/ompi-www
void *VectMul (void *th)
{
	long Tid = (long) th;

	// interleaving communication with computation

	if (Tid == NCORES-1) // this thread handles communication
	{
		if (id == 0)
		{
			MPI_Isend (v3[BLOCK_LOW(id,p,VLEN)], BLOCK_SIZE(id,p,VLEN), mpntype1, id^1, 0, MPI_COMM_WORLD, &Srqst);
			MPI_Irecv (v3[BLOCK_LOW(id^1,p,VLEN)], BLOCK_SIZE(id^1,p,VLEN), mpntype1, id^1, 1, MPI_COMM_WORLD, &Rrqst);
		}
		else if (id == 1)
		{
			MPI_Isend (v3[BLOCK_LOW(id,p,VLEN)], BLOCK_SIZE(id,p,VLEN), mpntype1, id^1, 1, MPI_COMM_WORLD, &Srqst);
			MPI_Irecv (v3[BLOCK_LOW(id^1,p,VLEN)], BLOCK_SIZE(id^1,p,VLEN), mpntype1, id^1, 0, MPI_COMM_WORLD, &Rrqst);
		}
	}
	else if (Tid < NCORES-1) // these threads handle computation
	{
		vectvectmul (v1, v2, Tid); // product of two vectors

		// reduction within a node using threads
		pthread_mutex_lock (&mutexvv);
		mpn_add_n (fin_sum, fin_sum, temp_sum[Tid], 2*LIMBS+1);
		pthread_mutex_unlock (&mutexvv);

		// each thread waits for every other thread to finish reduction
		wait (NCORES-1);

		// reduction across nodes using MPI
		if (Tid == 0)
		{
			MPI_Allreduce (fin_sum, result, 1, mpntype0, mpn_sum, MPI_COMM_WORLD);
			mpn_tdiv_qr (tquo, cnum, 0, result, 2*LIMBS+1, q, LIMBS);
		}
		wait (NCORES-1);
	}

	if (Tid == NCORES-1)  // waiting for the communications to terminate
	{
		MPI_Wait (&Srqst, &Sstatus);
		MPI_Wait (&Rrqst, &Rstatus);
	}
}
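The wait() call in this example is not the POSIX wait(); it is a project-specific thread barrier that blocks until the given number of threads have reached it. A minimal sketch of such a barrier, assuming a shared mutex, condition variable, and generation counter declared alongside the other globals, could look like this:

/* Hypothetical sketch of the wait() thread barrier used in Code Example #2.
 * A generation counter guards against spurious wakeups and allows reuse. */
#include <pthread.h>

static pthread_mutex_t barrier_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  barrier_cond  = PTHREAD_COND_INITIALIZER;
static int barrier_count = 0;
static int barrier_generation = 0;

void wait (int nthreads)
{
	pthread_mutex_lock (&barrier_mutex);
	int gen = barrier_generation;
	if (++barrier_count == nthreads) {
		barrier_count = 0;            /* reset for the next barrier round */
		barrier_generation++;         /* release everyone waiting on this round */
		pthread_cond_broadcast (&barrier_cond);
	} else {
		while (gen == barrier_generation)
			pthread_cond_wait (&barrier_cond, &barrier_mutex);
	}
	pthread_mutex_unlock (&barrier_mutex);
}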
Code Example #3
File: test-vecvecmul.c   Project: gpaulsen/ompi-www
int main(int argc, char *argv[])
{
	unsigned char str[154];
	unsigned int arr[] = {9,2,5,8,4,2,4,1,6,9,1,8,9,9,6,1,5,7,0,7,7,4,3,7,6,3,9,5,4,2,3,0,4,4,1,5,3,3,7,2,3,3,7,0,9,4,5,2,8,4,6,\
                              2,1,3,4,1,4,2,6,0,8,5,1,7,3,1,4,4,7,0,5,3,4,4,8,9,1,1,9,8,3,5,1,8,3,4,4,8,3,2,8,1,2,8,7,4,1,8,1,8,0,4,\
                              8,4,2,4,4,5,4,9,1,8,3,4,9,5,6,3,3,1,4,6,4,1,0,2,0,2,5,1,4,8,5,9,9,6,9,4,0,3,6,5,5,9,5,4,2,2,3,7,8,5,9,7};

	long i;

	double t1, t2, Itime;
	int provided;

	/* Allocation */

	v1 = (vector *) malloc (VLEN * sizeof (vector));
	v2 = (vector *) malloc (VLEN * sizeof (vector));
	v3 = (vector *) malloc (VLEN * sizeof (vector));

	fin_sum = (mp_limb_t *) malloc ((2*LIMBS+1) * sizeof (mp_limb_t));
	result = (mp_limb_t *) malloc ((2*LIMBS+1) * sizeof (mp_limb_t));
	q = (mp_limb_t *) malloc (LIMBS * sizeof (mp_limb_t));

	MPI_Init_thread (&argc, &argv, MPI_THREAD_MULTIPLE, &provided);
	MPI_Comm_rank (MPI_COMM_WORLD, &id);
	MPI_Comm_size (MPI_COMM_WORLD, &p);

	MPI_Type_contiguous (2*LIMBS+1, MPI_UNSIGNED_LONG_LONG, &mpntype0);
	MPI_Type_commit (&mpntype0);

	MPI_Type_contiguous (LIMBS, MPI_UNSIGNED_LONG_LONG, &mpntype1);
	MPI_Type_commit (&mpntype1);

	MPI_Op_create ((MPI_User_function *)addmpn, 1, &mpn_sum);

	for (i=0; i<154; ++i)	str[i] = (unsigned char)arr[i];
	mpn_set_str (q, str, 154, 10);
	//if (!id) gmp_printf ("Modulus: %Nd\n", q, LIMBS);

	MPI_Barrier (MPI_COMM_WORLD);

	/* Setting limits for 2 MPI nodes */

	VOffset = BLOCK_LOW(id,p,VLEN);
	VChunk  = BLOCK_SIZE(id,p,VLEN);

	/* Setting limits for NCORES-1 threads */

	for (i=0; i<NCORES-1; ++i)
	{
		VStart[i] = VOffset + BLOCK_LOW(i,NCORES-1,VChunk);
		VEnd[i]   = VOffset + BLOCK_HIGH(i,NCORES-1,VChunk);
	}

	for (i=0; i<VLEN; ++i)	mpn_random (v1[i], LIMBS);
	for (i=0; i<VLEN; ++i)	mpn_random (v2[i], LIMBS);
	for (i=BLOCK_LOW(id,p,VLEN); i<=BLOCK_HIGH(id,p,VLEN); ++i)	mpn_random (v3[i], LIMBS);
		
	MPI_Barrier (MPI_COMM_WORLD);

	t1 = MPI_Wtime ();

	for (i=0; i<NCORES; ++i)
		pthread_create(&threads[i], &attr, VectMul, (void *) i);

	for (i=0; i<NCORES; ++i)
		pthread_join (threads[i], NULL);

	t2 = MPI_Wtime ();
	Itime = t2 - t1;
	if (!id) printf ("Total time taken: %lf\n",Itime);
	
	if (!id) gmp_printf ("Result: %Nd\n", cnum, LIMBS);

	MPI_Op_free(&mpn_sum);
	MPI_Request_free (&Rrqst);
	MPI_Request_free (&Srqst);
	MPI_Finalize ();

	return 0;	
}
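The globals used by Code Examples #2 and #3 (the vectors, GMP buffers, MPI handles, and pthread objects) are declared outside the excerpted functions. A hypothetical set of declarations consistent with how they are used above might be; the actual definitions in test-vecvecmul.c may differ:

/* Hypothetical global declarations assumed by Code Examples #2 and #3. */
typedef mp_limb_t vector[LIMBS];          /* one multi-precision vector element    */

vector *v1, *v2, *v3;                     /* operand vectors and result vector      */
mp_limb_t *fin_sum, *result, *q;          /* node-local sum, global result, modulus */
mp_limb_t temp_sum[NCORES][2*LIMBS+1];    /* per-thread partial sums                */
mp_limb_t tquo[LIMBS+2], cnum[LIMBS];     /* quotient and remainder buffers         */

pthread_t threads[NCORES];
pthread_attr_t attr;
pthread_mutex_t mutexvv = PTHREAD_MUTEX_INITIALIZER;

int id, p;                                /* MPI rank and communicator size         */
int VOffset, VChunk, VStart[NCORES], VEnd[NCORES];
MPI_Datatype mpntype0, mpntype1;
MPI_Op mpn_sum;
MPI_Request Srqst, Rrqst;
MPI_Status Sstatus, Rstatus;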
Code Example #4
File: scatter1c.c   Project: jluisacosta/SCtest
int BLOCK_HIGH(int id,int p,int n) {return BLOCK_LOW(id+1,p,n)-1;}
Code Example #5
File: scatter1c.c   Project: jluisacosta/SCtest
int BLOCK_SIZE(int id,int p,int n) {return BLOCK_HIGH(id,p,n)-BLOCK_LOW(id,p,n)+1;}
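BLOCK_LOW itself does not appear in this project's excerpt, but it is the third member of the same block-decomposition family (id is the process rank, p the number of processes, n the number of items to distribute). A sketch consistent with the two functions above, plus the BLOCK_OWNER helper used in Code Example #10, would be; this is not necessarily the exact code in scatter1c.c:

/* Sketch of the remaining block-decomposition helpers, consistent with
   BLOCK_HIGH and BLOCK_SIZE above. */
int BLOCK_LOW(int id,int p,int n) {return (id*n)/p;}
/* Maps a global index back to the rank that owns it. */
int BLOCK_OWNER(int index,int p,int n) {return (p*(index+1)-1)/n;}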
Code Example #6
int main (int argc, char *argv[])
{
    int count;              /// local prime count
    double elapsed_time;    /// execution time
    int first;              /// index of the first multiple of prime in this block
    int global_count;       /// global prime count
    int high_value;         /// value of the last element in this block
    int i;                  /// loop counter
    int id;                 /// process ID
    int index;              /// array index used by process 0
    int low_value;          /// value of the first element in this block
    char * marked;          /// pointer to the local array
    int n;                  /// upper bound of the prime search
    int p;                  /// number of processes
    int proc0_size;         /// size of process 0's array
    int prime;              /// next prime whose multiples will be marked off
    int size;               /// number of elements in the local array
    MPI_Init (&argc, &argv);    /// initialize MPI
    MPI_Barrier(MPI_COMM_WORLD);/// all processes start together
    elapsed_time = -MPI_Wtime();/// record the negated start time
    MPI_Comm_rank (MPI_COMM_WORLD, &id);/// get this process's rank in the communicator
    MPI_Comm_size (MPI_COMM_WORLD, &p);/// get the size of the communicator group
    /// check that the number of command-line arguments is correct; terminate otherwise
    if (argc != 2)
    {
        if (!id) printf ("Command line: %s <m>\n", argv[0]);
        MPI_Finalize();
        exit (1);
    }
    n = atoi(argv[1]);/// upper bound n taken from the command-line argument
    low_value = 3 + 2*(BLOCK_LOW(id,p,(n-1)/2));/// smallest value in this process's block
    high_value = 3 + 2*(BLOCK_HIGH(id,p,(n-1)/2));/// largest value in this process's block
    size = BLOCK_SIZE(id,p,(n-1)/2);/// number of array elements held by this process
    proc0_size = ((n-1)/2)/p;/// number of array elements held by process 0
    /// if the largest number controlled by process 0 is below sqrt(n), exit with an error
    if ((3 + 2*proc0_size) < (int) sqrt((double) n))
    {
        if (!id) printf ("Too many processes\n");
        MPI_Finalize();
        exit (1);
    }
    marked = (char *) malloc (size);/// allocate the local array
    /// if the allocation fails, exit with an error
    if (marked == NULL)
    {
        printf ("Cannot allocate enough memory\n");
        MPI_Finalize();
        exit (1);
    }
    for (i = 0; i < size; i++) marked[i] = 0;/// initialize the array; 0 means unmarked
    if (!id) index = 0;/// initialize index; only process 0 uses this variable
    prime = 3;/// the first prime is 3 (even numbers are already excluded)
    /************************* core algorithm *****************************/
    do
    {
        if (prime * prime > low_value)
            first = (prime * prime - low_value)/2;
        else
        {
            if (!(low_value % prime)) first = 0;
            else
                first = (prime - low_value % prime+1)/2+((prime-1)/2)*((prime - low_value % prime)%2);
        }
        for (i = first; i < size; i += prime)
        {
            marked[i] = 1;
        }
        if (!id)
        {
            while (marked[++index]);
            prime = 2*index + 3;
        }
        MPI_Bcast (&prime,  1, MPI_INT, 0, MPI_COMM_WORLD);
    }
    while (prime * prime <= n);
    /*************************************************************/
    /// count the unmarked (prime) entries in the local array
    count = 0;
    for (i = 0; i < size; i++)
        if (!marked[i])
        {
            count++;
        }
    /// sum all local prime counts and deliver the total to process 0
    MPI_Reduce (&count, &global_count, 1, MPI_INT, MPI_SUM,
                0, MPI_COMM_WORLD);
    elapsed_time += MPI_Wtime();/// compute the elapsed time
    /// process 0 prints the results
    if (!id)
    {
        printf ("%d primes are less than or equal to %d\n",
                global_count+1, n);
        printf ("Total elapsed time: %10.6f\n", elapsed_time);
    }
    MPI_Finalize ();/// shut down MPI
    return 0;
}
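The expression that computes first in the odd-only sieve above is terse; a small standalone check, using assumed sample values, illustrates what it yields. With low_value = 17 and prime = 7, the first odd multiple of 7 that is >= 17 is 21, which sits at offset (21 - 17) / 2 = 2 in the block of odd values:

/* Standalone sanity check (assumed sample values) for the 'first' index
 * computation used in Code Example #6. */
#include <assert.h>

int main (void)
{
    int low_value = 17, prime = 7, first;

    if (!(low_value % prime)) first = 0;
    else
        first = (prime - low_value % prime+1)/2+((prime-1)/2)*((prime - low_value % prime)%2);

    assert(first == 2);   /* marked[2] corresponds to the value 21 = 3*7 */
    return 0;
}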
Code Example #7
File: sieve_0.c   Project: trevorwhitney/csci563
int main (int argc, char *argv[])
{
  //define variables
  int n;
  double elapsed_time;
  int p;
  int id;
  int low_value;
  int high_value;
  int size;
  int proc0_size;
  char* marked;
  int index;
  int prime;
  int first;
  int count;
  int global_count;
  int i;

  //Initialize MPI
  MPI_Init (&argc, &argv);
  MPI_Barrier(MPI_COMM_WORLD);
  elapsed_time = -MPI_Wtime();
  MPI_Comm_rank (MPI_COMM_WORLD, &id);
  MPI_Comm_size (MPI_COMM_WORLD, &p);
  
  //Check for proper command line parameters, must include N
  if (argc != 2) {
      if (!id) printf ("Command line: %s <m>\n", argv[0]);
      MPI_Finalize();
      exit(1);
  }

  //Convert parameter string to integer
  //N represents the number up to which we need to calculate primes
  n = atoi(argv[1]);

  //Low and high values for each processor
  low_value = 2 + BLOCK_LOW(id,p,n-1);
  high_value = 2 + BLOCK_HIGH(id,p,n-1);
  size = BLOCK_SIZE(id,p,n-1);
  
  //The largest prime needed for sieving is sqrt(n), so process 0 must hold
  //every value up to sqrt(n). Check that we don't have more processes than
  //that allows.
  proc0_size = (n-1)/p;
  if ((2 + proc0_size) < (int) sqrt((double) n)) {
    if (!id) printf ("Too many processes\n");
    MPI_Finalize();
    exit (1);
  }

  //allocate memory for block, error if unable to
  marked = (char *) malloc (size);
  if (marked == NULL) {
    printf ("Cannot allocate enough memory\n");
    MPI_Finalize();
    exit (1);
  }

  /* Begin Sieve of Eratosthenes Algorithm */
  //First fill marked[] with zero/false for all items in block
  for (i = 0; i < size; i++) marked[i] = 0;
  
  if (!id) index = 0;
  
  //first prime is 2
  prime = 2;
  do {
    if (prime * prime > low_value)
       first = prime * prime - low_value;
    else {
       if (!(low_value % prime)) first = 0;
       else first = prime - (low_value % prime);
    }
    //increment by prime, marking the non-primes with 1, or true
    for (i = first; i < size; i += prime) marked[i] = 1;
    if (!id) {
       while (marked[++index]);
       prime = index + 2;
    }
    MPI_Bcast (&prime,  1, MPI_INT, 0, MPI_COMM_WORLD);
  } while (prime * prime <= n);
  /* End Sieve of Eratosthenes Algorithm */

  /*Begin count of primes */
  count = 0;

  //for all elements in block, if prime is 1/true, increment count
  for (i = 0; i < size; i++)
    if (!marked[i]) count++;
  
  //Sum count of primes from each process
  MPI_Reduce (&count, &global_count, 1, MPI_INT, MPI_SUM,
    0, MPI_COMM_WORLD);
  elapsed_time += MPI_Wtime();
  
  //print results on main processor
  if (!id) {
    printf ("%d primes are less than or equal to %d\n",
       global_count, n);
    printf ("Total elapsed time: %10.6f\n", elapsed_time);
  }
  MPI_Finalize();
  return 0;
}
Code Example #8
File: grid.c   Project: tuxfan/ska
grid *grid_create(double startx, double endx, int nx,
                  double starty, double endy, int ny,
                  double startz, double endz, int nz)
{
    MPI_Comm cart_comm;
    int periodics[3];
    int i, j, k;
    int ind;
    int np, rank;

    grid *grd = (grid*) malloc(sizeof(grid));

    MPI_Comm_size(MPI_COMM_WORLD, &np);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    grd->id = 0;

    if (nz > 0)
        grd->nd = 3;
    else
        grd->nd = 2;

    grd->num_global[0] = nx;
    grd->num_global[1] = ny;
    grd->num_global[2] = nz;

    grid_decomp(grd, np);

    periodics[0] = periodics[1] = periodics[2] = 0;
    MPI_Cart_create(MPI_COMM_WORLD, grd->nd, grd->num_procs, periodics, 1, &cart_comm);
    MPI_Comm_rank(cart_comm, &rank);
    MPI_Cart_coords(cart_comm, rank, grd->nd, grd->cart_coord);

    grd->nx = BLOCK_SIZE(grd->cart_coord[0], grd->num_procs[0], nx);
    grd->ny = BLOCK_SIZE(grd->cart_coord[1], grd->num_procs[1], ny);

    grd->num_local[0] = grd->nx;
    grd->num_local[1] = grd->ny;

    grd->num_pts = grd->nx * grd->ny;
    grd->is[0] = BLOCK_LOW(grd->cart_coord[0], grd->num_procs[0], nx) + 1;
    grd->ie[0] = BLOCK_HIGH(grd->cart_coord[0], grd->num_procs[0], nx) + 1;
    grd->is[1] = BLOCK_LOW(grd->cart_coord[1], grd->num_procs[1], ny) + 1;
    grd->ie[1] = BLOCK_HIGH(grd->cart_coord[1], grd->num_procs[1], ny) + 1;

    if (grd->nd == 3) {
        grd->nz = BLOCK_SIZE(grd->cart_coord[2], grd->num_procs[2], nz);
        grd->num_local[2] = grd->nz;
        grd->num_pts = grd->num_pts * grd->nz;
        grd->is[2] = BLOCK_LOW(grd->cart_coord[2], grd->num_procs[2], nz) + 1;
        grd->ie[2] = BLOCK_HIGH(grd->cart_coord[2], grd->num_procs[2], nz) + 1;
    }

    grd->comm = MPI_COMM_WORLD;

    grd->xyz = (double*) malloc(grd->num_pts*3*sizeof(double));
    grd->x = &grd->xyz[0];
    grd->y = &grd->xyz[grd->num_pts];
    grd->z = &grd->xyz[grd->num_pts*2];

    grd->hx = (endx - startx) / (grd->num_global[0] - 1);
    grd->hy = (endy - starty) / (grd->num_global[1] - 1);
    grd->hz = 0.0;

    startx = startx + (grd->is[0]-1)*grd->hx;
    starty = starty + (grd->is[1]-1)*grd->hy;

    /* hz and the z offset only apply to 3D grids (grd->is[2] is unset in 2D) */
    if (grd->nd == 3) {
        grd->hz = (endz - startz) / (grd->num_global[2] - 1);
        startz = startz + (grd->is[2]-1)*grd->hz;
    }

    if (grd->nd == 3) {
        for (k = 0; k < grd->nz; k++) {
            for (j = 0; j < grd->ny; j++) {
                for (i = 0; i < grd->nx; i++) {
                    ind = k*grd->nx*grd->ny + j*grd->nx + i;
                    grd->x[ind] = startx + i*grd->hx;
                    grd->y[ind] = starty + j*grd->hy;
                    grd->z[ind] = startz + k*grd->hz;
                }
            }
        }
    } else {
        for (j = 0; j < grd->ny; j++) {
            for (i = 0; i < grd->nx; i++) {
                ind = j*grd->nx + i;
                grd->x[ind] = startx + i*grd->hx;
                grd->y[ind] = starty + j*grd->hy;
            }
        }
    }

    for (i = 0; i < grd->nd; i++) {
        grd->periodic[i] = 0;
    }

    return grd;
}
Code Example #9
/* the optimization loop */
int main(int argc, char **argv) {
  cmaes_t evo; /* a CMA-ES type struct or "object" */
  double *arFunvals,  *xfinal, *const*pop;
  int i,j;
  int numberDipoles;
  int id;  //Rank
  int p;  //Number processors
  double elapsed_time;//Time from beginning.
  double bestValue;
  int lambda;
  int maxLambda;
  int * sendCnts; //For MPI_Alltoallv for arFunVals
  int * sdispls;  //For MPI_Alltoallv for arFunVals
  int * recvCnts; //For MPI_Alltoallv for arFunVals
  int * rdispls; //For MPI_Alltoallv for arFunVals
  int * sendCntsPop; //For MPI_Alltoallv for pop
  int * sdisplsPop; //For MPI_Alltoallv for pop
  int * recvCntsPop; //For MPI_Alltoallv for pop
  int * rdisplsPop; //For MPI_Alltoallv for pop
  int canTerminate;
  int canTerminateBuffer;

  //Start MPI
  MPI_Init(&argc, &argv);
  MPI_Barrier(MPI_COMM_WORLD);
  elapsed_time = -MPI_Wtime(); //Set initial time.

  MPI_Comm_rank(MPI_COMM_WORLD, &id); //Set id
  MPI_Comm_size(MPI_COMM_WORLD, &p); //set p



  for (i=0;i<32;i++)
  {
    observations[i]/=1000.0;
  }

  //Set number of dipoles, either first argument or default value of 2.
  numberDipoles=2; 
  if (argc>=2)
  {
    numberDipoles=atoi(argv[1]);
  }

  //Set lambda based on entry, default of 40
  maxLambda=40;
  if (argc>=3)
  {
    maxLambda=atoi(argv[2]);
  }

  if (id==0)
  {
    printf("Dipoles:%d MaxLambda:%d\n",numberDipoles,maxLambda);
  }

  //Allocate lambda pieces to each processor, based on the size of maxLambda and the number of processors.
  lambda = BLOCK_SIZE(id,p,maxLambda);

  printf("Id:%d Lambda:%d\n",id,lambda);

  //Setup send and receive buffers for function evaluations and populations that resulted in those evaluation.
  sendCnts = malloc(p*sizeof(int));
  sdispls = malloc(p*sizeof(int));
  recvCnts = malloc(p*sizeof(int));
  rdispls = malloc(p*sizeof(int));
  sendCntsPop = malloc(p*sizeof(int));
  sdisplsPop = malloc(p*sizeof(int));
  recvCntsPop = malloc(p*sizeof(int));
  rdisplsPop = malloc(p*sizeof(int));

  for (i=0;i<p;i++)
  {

    sendCnts[i]=lambda;//Same for all others
    sdispls[i] = BLOCK_LOW(id,p,maxLambda);//Same for all others
    recvCnts[i] = BLOCK_SIZE(i,p,maxLambda);//Depends on which we receive from.
    rdispls[i] = BLOCK_LOW(i,p,maxLambda);

    sendCntsPop[i]=lambda*((numberDipoles*6+2));//Same for all others
    sdisplsPop[i] = BLOCK_LOW(id,p,maxLambda)*(numberDipoles*6+2);//Same for all others
    recvCntsPop[i] = BLOCK_SIZE(i,p,maxLambda)*(numberDipoles*6+2);//Depends on which we receive from.
    rdisplsPop[i] = BLOCK_LOW(i,p,maxLambda)*(numberDipoles*6+2);

  }

  for (i=0;i<p;i++)
  {

    printf("Id: %d recvCnts[%d]=%d\n",id,i,recvCnts[i]);
    printf("Id: %d rdispls[%d]=%d\n",id,i,rdispls[i]);

    printf("Id: %d recvCntsPop[%d]=%d\n",id,i,recvCntsPop[i]);
    printf("Id: %d rdisplsPop[%d]=%d\n",id,i,rdisplsPop[i]);

  }
  





  /* Initialize everything into the struct evo, 0 means default */
  //arFunvals = cmaes_init(&evo, 0, NULL, NULL, 0, 0, "initials.par"); 

//  printf("0\n");
  
  //The maxLambda parameter has been added so all of them will have enough space to store the results
  arFunvals = reinit(&evo, maxLambda, numberDipoles);

  //outputCMAES_t(evo,1);

//  printf("1\n");

  resetSignals(&evo, numberDipoles);  /* write header and initial values */

  //Reset the seed value based on processor rank (so they don't all come out the same!)
  evo.sp.seed=evo.sp.seed*(id+1)/p;
  printf("proc: %d seed: %d\n",id,evo.sp.seed);


  //outputCMAES_t(evo,0);

//  printf("2\n");

//  printf("%s\n", cmaes_SayHello(&evo));
//  i=40;

//  for (i=32;i<40;i*=2)
//  { 

//    arFunvals = reinit(&evo, i);
    //outputCMAES_t(evo);


  evo.sp.lambda=lambda;
  canTerminate = (0==1);
  /* Iterate until stop criterion holds */
  while(!canTerminate)
    { 
      /* generate lambda new search points, sample population */
      pop = cmaes_SamplePopulation(&evo); /* do not change content of pop */

      /* Here you may resample each solution point pop[i] until it
	 becomes feasible, e.g. for box constraints (variable
	 boundaries). function is_feasible(...) needs to be
	 user-defined.  
	 Assumptions: the feasible domain is convex, the optimum is
	 not on (or very close to) the domain boundary, initialX is
	 feasible and initialStandardDeviations are sufficiently small
	 to prevent quasi-infinite looping.
      */
      /*for (i = 0; i < lambda; ++i) 
      {
          cmaes_ReSampleSingle(&evo, i); 
      }*/
      for (i = 0; i < lambda; ++i) 
      {
	   while (!is_feasible(evo.rgrgx[i],(int) cmaes_Get(&evo, "dim"))) 
	   {
             cmaes_ReSampleSingle(&evo, i); 
           }
      }

      for (i=0;i<lambda;i++)
      {
         for(j=0;j<(6*numberDipoles)+2;j++)
         {
	   evo.rgrgx[BLOCK_LOW(id,p,maxLambda)+i][j]=evo.rgrgx[i][j];
         }
      }
 
      /* evaluate the new search points using fitfun from above */ 
      for (i = BLOCK_LOW(id,p,maxLambda); i <= BLOCK_HIGH(id,p,maxLambda); ++i) {
	arFunvals[i] = fitfun(evo.rgrgx[i], (int) cmaes_Get(&evo, "dim"));
        //printf("ID:%d, arFunvals[%d]=%lf\n",id,i,arFunvals[i]);
      }

      



      //Now communicate the arFunvals around
      MPI_Alltoallv(arFunvals,sendCnts,sdispls,MPI_DOUBLE,arFunvals,recvCnts,rdispls,MPI_DOUBLE,MPI_COMM_WORLD);


      //Now communicate the populations being looked at around
      MPI_Alltoallv(&evo.rgrgx[0][0],sendCntsPop,sdisplsPop,MPI_DOUBLE,&evo.rgrgx[0][0],recvCntsPop,rdisplsPop,MPI_DOUBLE,MPI_COMM_WORLD);


      /* update the search distribution used for cmaes_SampleDistribution() */
      cmaes_UpdateDistribution(&evo, arFunvals);  


      //Test for any that can terminate.
      canTerminate = cmaes_TestForTermination(&evo);
      if (canTerminate)
      {
	printf("id:%d can terminate for reason:%s\n",id,cmaes_TestForTermination(&evo));
      }
      MPI_Allreduce(&canTerminate,&canTerminateBuffer,1,MPI_INT,MPI_MAX,MPI_COMM_WORLD);//Get the max, if any are >0, then someone has terminated.
      canTerminate = canTerminateBuffer;//Reset so the loop will exit.

      /* read instructions for printing output or changing termination conditions */ 
//      cmaes_ReadSignals(&evo, "signals.par");   
//      fflush(stdout); /* useful in MinGW */
    }
//  printf("Stop:\n%s\n",  cmaes_TestForTermination(&evo)); /* print termination reason */

//  cmaes_WriteToFile(&evo, "all", "allcmaes.dat");         /* write final results */


  elapsed_time += MPI_Wtime();



  /* get best estimator for the optimum, xmean */
  xfinal = cmaes_GetNew(&evo, "xmean"); /* "xbestever" might be used as well */
  bestValue = fitfun(xfinal, (int) cmaes_Get(&evo, "dim"));
  printf("Proccesor:%d has last mean of:%lf elapsedTime:%lf\n",id,bestValue,elapsed_time);
  for (i=0;i<6*numberDipoles;i++)
  {
    printf("(%d:%d:%lf)\n",id,i,xfinal[i]);
  }

//  cmaes_exit(&evo); /* release memory */ 
  /* do something with final solution and finally release memory */
  free(xfinal); 
  free(sendCnts);
  free(sdispls);
  free(recvCnts);
  free(rdispls);
  free(sendCntsPop);
  free(sdisplsPop);
  free(recvCntsPop);
  free(rdisplsPop);

  MPI_Finalize();

//}

  return 0;
}
Code Example #10
File: backup-floyd-parallel.c   Project: liuhuac/MPI
int main (int argc, char *argv[]) 
{
        int opt;
	char *ifile=NULL;
	char *ofile=NULL;
        while((opt=getopt(argc, argv, "i:o:"))!=-1){
                switch(opt){
                        case 'i':
                                ifile=strdup(optarg);
                                break;
                        case 'o':
                                ofile=strdup(optarg);
                                break;
			case '?':
                        case ':':
                        default :
                                usage();
                                break;
                }
        }

        if(optind!=argc){
                printf("Unknow argument '%s'\n",argv[optind]);
                usage();
        } else if(strcmp(argv[optind-1], "--")==0){
                printf("Unknow argument '%s'\n",argv[optind-1]);
                usage();
        }

        if(ifile==NULL){
                ifile=strdup("default-make-graph-file.dat");
        }
        if(ofile==NULL){
                ofile=strdup("default-make-graph-file.seq");
        }

	int rank, size; /* rank is your pid, starting with 0 */
	/* size, is the number of processes you */
	/* run the program with */
	/* never make MPI calls before this and */
	/* never touch argc and argv before doing this */ 
	MPI_Init (&argc, &argv);
	/* get current process id */
	MPI_Comm_rank (MPI_COMM_WORLD, &rank); 
	/* get number of processes */
	MPI_Comm_size (MPI_COMM_WORLD, &size); 

	MPI_Barrier(MPI_COMM_WORLD);
	double entire_start=MPI_Wtime();

	int n; /*vector length*/
	void *subvector; /*subvector*/
	void **subs; /*2D array*/
	void *storage; /*Array elements*/

	int dim[2], period[2], reorder;
	dim[0]=size;
	dim[1]=1;
	period[0]=0;
	period[1]=0;
	reorder=1;
	MPI_Comm comm;
	MPI_Comm rowcomm;
	MPI_Comm colcomm;
	int remain_dims[2];

	MPI_Cart_create(MPI_COMM_WORLD, 2, dim, period, reorder, &comm);
	remain_dims[0]=1;
	remain_dims[1]=0;
	MPI_Cart_sub(comm, remain_dims, &rowcomm);
	remain_dims[0]=0;
	remain_dims[1]=1;
	MPI_Cart_sub(comm, remain_dims, &colcomm);

	read_checkerboard_matrix_square (
		ifile,		/* IN - File name */
		&subs,         	/* OUT - 2D array */
		&storage,       /* OUT - Array elements */
		MPI_INT,   	/* IN - Element type */
		&n,	        /* OUT - Array dimension */
		comm);		/* IN - Communicator */

	print_checkerboard_matrix (
		subs,           /* IN -2D matrix */
		MPI_INT,        /* IN -Matrix element type */
		n,            	/* IN -Matrix rows */
		n,            	/* IN -Matrix columns */
		comm);    	/* IN - Communicator */
	

	MPI_Barrier(MPI_COMM_WORLD);
	double comp_start=MPI_Wtime();

	int coord[2];
	MPI_Cart_coords(comm, rank, 2, coord);
	local_rows = BLOCK_SIZE(coord[0],dim[0],n);
	local_cols = BLOCK_SIZE(coord[1],dim[1],n);

	int i,j,k;
	int ii;
	for(k=0;k<n;k++){

		xk_coord[0]=coord[0];
		xk_coord[1]=BLOCK_OWNER(k,dim[1],n);
		ky_coord[0]=BLOCK_OWNER(k,dim[0],n);
		ky_coord[1]=coord[1];
		MPI_Cart_rank(comm, xk_coord, &xk_rank);
		MPI_Cart_rank(comm, ky_coord, &ky_rank);
		row_offset=BLOCK_LOW(k,dim[0],n);
		col_offset=BLOCK_LOW(k,dim[1],n);

		if(rank==xk_rank){
			xk_rows=BLOCK_SIZE(xk_coord[0],dim[0],n);
			for(ii=0;ii<xk_rows;ii++){
				xk_storage[ii]=subs[ii][col_offset];
			}
			
			MPI_Send(xk_storage, xk_rows, MPI_INT, dest, tag, comm);
		} else {
Code Example #11
/* Function to be executed by the workers */
void slave(Params * p)
{  
    srand(2);
    /* Timer */
    double start, end;
    int i;
    int * array;
    array = (int *) malloc(sizeof(int) * p->array_size);
    MPI_Status status;
    int count = 0;
    int done = 0;
    int temp = 0;
    int low = BLOCK_LOW(p->rank-1,p->size,p->max_num+1);
    int high = BLOCK_HIGH(p->rank-1,p->size,p->max_num+1);
    start = MPI_Wtime();
    // Repeat until done
    while(!done)
    {
        // Master work
        if(p->rank == 0)
        {
            // Increase count until array_size
            if(count++ < p->array_size)
            {
                // Add random number to list, send it up the pipeline to rank 1
                temp = rand() % p->max_num;
                MPI_Send(&temp, 1, MPI_INT, 1, NUM_TAG, MPI_COMM_WORLD);
            }
            else
            {
                // Signal end of the list
                MPI_Send(&temp, 1, MPI_INT, 1, TERM_TAG, MPI_COMM_WORLD);
                done = 1;
            }
        }
        // Slave work
        else
        {
            // receive a number from the previous stage
            MPI_Recv(&temp, 1, MPI_INT, p->rank-1, MPI_ANY_TAG, MPI_COMM_WORLD, &status);
            // check for termination
            if(status.MPI_TAG == TERM_TAG)
            {
                // make sure I'm sending to a valid rank and forward termination
                done = 1;
                if(p->rank != p->size)
                {
                    MPI_Send(&temp, 1, MPI_INT, p->rank+1, TERM_TAG, MPI_COMM_WORLD);
                }
            }
            // got number
            else if(status.MPI_TAG == NUM_TAG)
            {
                // check to see if I keep it
                if(temp >= low && temp <= high)
                    array[count++] = temp;
                else
                    // pass it on
                    MPI_Send(&temp, 1, MPI_INT, p->rank+1, NUM_TAG,MPI_COMM_WORLD); 
            }

        }
    }
    MPI_Barrier(MPI_COMM_WORLD);
    int * sizes = (int *) calloc(p->size+1,sizeof(int));
    MPI_Gather(&count, 1, MPI_INT, sizes, 1, MPI_INT, 0 , MPI_COMM_WORLD);
    int * disp = (int *) calloc(p->size+1,sizeof(int));
    if(p->rank == 0)
    {
        sizes[0] = 0;
        disp[0] = 0;
        for(i = 1; i < p->size+1; ++i)
        {
            disp[i] = disp[i-1] + sizes[i-1];
        }

    }
    // sort the array, except for the master
    if(p->rank != 0) 
        qsort(array,count,sizeof(int),compare);
    if(p->rank == 0) count = 0;
    // gather results
    MPI_Gatherv(array, count, MPI_INT, &array[0], sizes, disp, MPI_INT,0,MPI_COMM_WORLD);
    // stop the clock, print results
    end = MPI_Wtime();
    if(p->rank ==0) fprintf(stderr,"[%d] Elapsed time: %f\n",p->rank,end-start);
    return;
}
Code Example #12
File: parameters.c   Project: pandasasa/PLSA-MP
bool processOptions (int argc, char *argv[], INFO *info) {
  int c = 0;

  char *base_fn = NULL;
  char *co_fn = NULL;
  unsigned int num_clusters = 0;
  unsigned int seed = UINT_MAX;
  unsigned int maxiter = 0;
  unsigned int snapshot = UINT_MAX;
  bool verbose = false;
  bool debug = false;
  bool textio = false;
  bool rounding = false;
  bool no_output = false;

  /*  Usage information if no arguments  */
  if (argc == 1) {
    usage (argv[0]);
  }

  while (1) {
    int option_index = 0;
    static struct option long_options[] = {
      {"base", 1, 0, 0},
      {"cooccur", 1, 0, 0},
      {"clusters", 1, 0, 0},
      {"seed", 1, 0, 0},
      {"maxiter", 1, 0, 0},
      {"snapshot", 1, 0, 0},
      {"openmp", 1, 0, 0},
      {"verbose", 0, 0, 0},
      {"debug", 0, 0, 0},
      {"text", 0, 0, 0},
      {"rounding", 0, 0, 0},
      {"nooutput", 0, 0, 0},
      {0, 0, 0, 0}
    };

    c = getopt_long (argc, argv, "", long_options, &option_index);
    if (c == -1) {
      break;
    }

    switch (c) {
      case 0:
        if (strcmp (long_options[option_index].name, "cooccur") == 0) {
          co_fn = wmalloc (strlen (optarg) + 1);
          co_fn = strcpy (co_fn, optarg);
        }
        else if (strcmp (long_options[option_index].name, "clusters") == 0) {
          num_clusters = atoi (optarg);
        }
        else if (strcmp (long_options[option_index].name, "seed") == 0) {
          seed = atoi (optarg);
        }
        else if (strcmp (long_options[option_index].name, "base") == 0) {
          base_fn = wmalloc (strlen (optarg) + 1);
          base_fn = strcpy (base_fn, optarg);
        }
        else if (strcmp (long_options[option_index].name, "maxiter") == 0) {
          maxiter = atoi (optarg);
        }
        else if (strcmp (long_options[option_index].name, "snapshot") == 0) {
          snapshot = atoi (optarg);
        }
        else if (strcmp (long_options[option_index].name, "openmp") == 0) {
#if HAVE_OPENMP
          /*  Check the previously set value, which is the maximum for the system  */
          if (atoi (optarg) > info -> threads) {
            fprintf (stderr, "==\tError:  The number of threads requested exceeds the number available in the system (%u).\n", info -> threads);
            exit (-1);
          }
          info -> threads = atoi (optarg);
          omp_set_num_threads (info -> threads);
#else
          fprintf (stderr, "==\tError:  OpenMP is not enabled; --openmp meaningless.\n");
          exit (-1);
#endif
        }
        else if (strcmp (long_options[option_index].name, "verbose") == 0) {
          verbose = true;
        }
        else if (strcmp (long_options[option_index].name, "debug") == 0) {
          debug = true;
        }
        else if (strcmp (long_options[option_index].name, "text") == 0) {
          textio = true;
        }
        else if (strcmp (long_options[option_index].name, "rounding") == 0) {
          rounding = true;
        }
        else if (strcmp (long_options[option_index].name, "nooutput") == 0) {
          no_output = true;
        }
        break;
      default:
        printf ("?? getopt returned character code 0%o ??\n", c);
        exit (EXIT_FAILURE);
    }
  }

  info -> base_fn = base_fn;
  info -> co_fn = co_fn;
  info -> num_clusters = num_clusters;
  info -> seed = seed;
  info -> maxiter = maxiter;
  info -> snapshot = snapshot;
  info -> verbose = verbose;
  info -> debug = debug;
  info -> textio = textio;
  info -> rounding = rounding;
  info -> no_output = no_output;

  /*  Set the range of clusters this process will handle  */
  info -> block_start = BLOCK_LOW (info ->  world_id, info -> world_size, info -> num_clusters);
  info -> block_end = BLOCK_HIGH (info ->  world_id, info -> world_size, info -> num_clusters);
  info -> block_size = BLOCK_SIZE (info ->  world_id, info -> world_size, info -> num_clusters);

  return true;
}
Code Example #13
File: sieve.c   Project: peteraleksa/mpi-prime-sieve
int main(int argc, char * argv[])
{
    /* Constant Declarations */
    //long const 	SET_SIZE = 7920;

    /* Variable Declarations */
    int		count = 0;				// local count
    double 	elapsed_time = 0.00;			// time elapsed
    int		first;					// index of first multiple
    int 	global_count = 1;			// global count
    int 	high_value;				// highest value on processor
    char 	hostname[MPI_MAX_PROCESSOR_NAME];	// host process is running on
    int	 	i;					// counter variable
    int 	id;					// process id number
    int		index;
    int 	init_status;			// initialization error status flag
    bool  	initialized = false;		// mpi initialized flag
    int 	len;				// hostname length
    int 	low_value;			// lowest value on the processor
    char*	marked;				// portion of 2 to n that is marked
    int		n;			// number of elements to sieve
    int		n_sqrt;			// square root of n
    int 	p;			// number of processes
    int		prime;
    int		proc0_size;		// size of process 0's subarray
    int		size;			// elements in marked
    int*	sqrt_primes;		// primes up to the square root
    int		sqrt_primes_index;	// index in the square root primes array
    char*	sqrt_primes_marked;	// numbers up to sqrt marked prime or not
    int		sqrt_primes_size;	// size of square root primes array

    /* Function Declarations */
    //int is_prime( int );

    /* Initialization */
    MPI_Initialized( &initialized );                     // set initialized flag
    if( !initialized )                                  // if MPI is not initialized
        init_status = MPI_Init( &argc, &argv );        // Initialize MPI
    else
        init_status = MPI_SUCCESS;   	               // otherwise set init_status to success
    if( init_status != MPI_SUCCESS ) {     	       // if not successfully initialized
        printf ("Error starting MPI program. Terminating.\n");      // print error message
        fflush(stdout);
        MPI_Abort(MPI_COMM_WORLD, init_status);                     // abort
    }
    MPI_Get_processor_name( hostname, &len );                       // set hostname

    MPI_Comm_rank( MPI_COMM_WORLD, &id );                           // set process rank
    MPI_Comm_size( MPI_COMM_WORLD, &p );                            // set size of comm group
    //printf("Process rank %d started on %s.\n", id, hostname);     // print start message
    //fflush(stdout);
    //MPI_Barrier(MPI_COMM_WORLD );

    /* Start Timer */
    MPI_Barrier( MPI_COMM_WORLD );                                  // synchronize
    elapsed_time = - MPI_Wtime();                                   // start time

    /* Check that a set size was passed into the program */
    if(argc != 2) {
        if(id==0) {
            printf("Command line: %s <m>\n", argv[0]);
            fflush(stdout);
	}
        MPI_Finalize();
        exit(1);
    }

    n = atoi(argv[1]);
    n_sqrt = ceil(sqrt((double)n));
    //if(id==0)
    //	printf("square root: %i\n", n_sqrt);
    // debug
    //if(id==0) {
	//printf("n sqrt: %i\n", n_sqrt);
   	//fflush(stdout);
    //}

    sqrt_primes_marked = (char *) malloc(n_sqrt + 1);
    sqrt_primes_marked[0] = 1;
    sqrt_primes_marked[1] =1;

    for(i = 2; i <= n_sqrt; ++i) {
	sqrt_primes_marked[i] = 0;
    }

    prime = 2;
    sqrt_primes_size = n_sqrt;
    //printf("sqrt primes size: %i\n", sqrt_primes_size);

    do {
	for(i = prime * prime; i < n_sqrt; i+=prime) {
	     sqrt_primes_marked[i] = 1;
	     //sqrt_primes_size--;
	}
	while(sqrt_primes_marked[++prime]);    
    } while (prime * prime <= n_sqrt);
    //printf("sqrt primes size: %i\n", sqrt_primes_size);
    sqrt_primes = (int *) malloc(sqrt_primes_size * sizeof(int));
    sqrt_primes_index = 0;

    //sqrt_primes_size = 0;

    for(i = 3; i <= n_sqrt; ++i) {
	if(!sqrt_primes_marked[i]) {
	    
	    sqrt_primes[sqrt_primes_index] = i;
	   // printf("%i, ", sqrt_primes[sqrt_primes_index]);
	    sqrt_primes_index++;
                
        }
    }

    sqrt_primes_size = sqrt_primes_index;

    //printf("sqrt primes size: %i\n", sqrt_primes_size);
    //fflush(stdout);

    /* Set process's array share and first and last elements */
    low_value = 2 + BLOCK_LOW(id,p,n-1);
    high_value = 2 + BLOCK_HIGH(id,p,n-1);
    size = BLOCK_SIZE(id,p,n-1);

    //printf("Process %i block low: %i\n", id, low_value);
    //fflush(stdout);
    //printf("Process %i block high: %i\n", id, high_value);
    //fflush(stdout);
    //printf("Block size: %i\n", size);
    //fflush(stdout);

    if(low_value % 2 == 0) {
	if(high_value % 2 == 0) {
	     size = (int)floor((double)size / 2.0);
	     high_value--;
	}
	else {
	    size = size / 2;
	}
	low_value++;
    }
    else {
	if(high_value % 2 == 0) {
	     size = size / 2;
	     high_value--;
	}
	else {
	     size = (int)ceil((double)size / 2.0);
	}
    }

    //printf("Process %i block low: %i\n", id, low_value);
    //fflush(stdout);
    //printf("Process %i block high: %i\n", id, high_value);
    //fflush(stdout);
    //printf("Block size: %i\n", size);
    //fflush(stdout);

    //proc0_size = (n-1)/p;

    /* if process 0 doesn't have all the primes for sieving, then bail*/
    /*if((2+proc0_size) < (int)sqrt((double)n)) {
        if(id==0) {
            printf("Too many processes\n");
            fflush(stdout);
        }
        MPI_Finalize();
        exit(1);
    }
    */

    /* Allocate share of array */
    marked = (char *) malloc(size);

    if(marked == NULL) {
        printf("Cannot allocate enough memory\n");
        fflush(stdout);
	MPI_Finalize();
        exit(1);
    }

    /* Run Sieve */

    //printf("made it to sieve\n");
    //fflush(stdout);

    for(i = 0; i < size; i++)
	marked[i] = 0;

    if(id==0)
	first = 0;
    
    sqrt_primes_index = 0;
    prime = sqrt_primes[sqrt_primes_index];

    //printf("first prime: %i\n", prime);
    //fflush(stdout);

    //for(i = 0; i < sqrt_primes_size; i++) {

      //              printf("%i,", sqrt_primes[i]);
        //            fflush(stdout);

        //}
     

    do {
	if(prime >= low_value)
	    first = ((prime - low_value) / 2) + prime;
	else if(prime * prime > low_value) {
		first = (prime * prime - low_value) / 2;
	}
	else {
	    if(low_value % prime == 0)
		first = 0;
	    else {
		first = 1;
		while ((low_value + (2 * first)) % prime != 0)
			++first;
	    }
	}

	//printf("first: %i\n", first);
	//fflush(stdout);

	for(i = first; i < size; i += (prime))
		marked[i] = 1;

	//printf("made it to prime assignment\n");
	prime = sqrt_primes[++sqrt_primes_index];

	//printf("prime: %i\n", prime);
	//fflush(stdout);

    } while(prime * prime <= n && sqrt_primes_index < sqrt_primes_size);

    count = 0;

    for(i = 0; i < size; i++) {
	if(!marked[i])
	    count++;
    }

    //printf("size: %i\ncount: %i\n", size, count);

//    for( i=id; i<SET_SIZE; i+=p )                                                       // interleaved allocation
//        count += is_prime( i );                                                             // check if prime w/ sieve of eratosthenes

    /* Reduce Sum */
    MPI_Reduce( &count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD );        // reduce the primes count, root: proces 0

    /* Stop Timer */
    elapsed_time += MPI_Wtime();                                                        // end time

    //printf("Process %i found %i primes.\n", id, count);
    //fflush(stdout);

    //printf("Process %d is done in %d, running on %s.\n", id, elapsed_time, hostname);   // print process done message
    if( id == 0 ) {                                                                     // rank 0 prints global count
        printf("There are %d primes in the first %i integers.\nExecution took %10.6f.\n",
               global_count, n, elapsed_time);
	fflush(stdout);
	
//	printf("Debug:\n");
//	fflush(stdout);
//	printf("sqrt primes size: %i\n", sqrt_primes_size);
//        fflush(stdout);
	for(i = 0; i < sqrt_primes_size; i++) {
		if(!sqrt_primes[i]){
		    printf("%i,", sqrt_primes[i]);
		    fflush(stdout);
		}
	}
    }

    MPI_Barrier(MPI_COMM_WORLD);

  //  printf("rank: %i\nlow value: %i\nhigh value: %i\ncount: %i\n", id, low_value, high_value, count);

    //fflush(stdout);
    MPI_Finalize();                                                                     // finalize
    return 0;
}
Code Example #14
File: sieve3.c   Project: laterDays/CSC718
int main (int argc, char *argv[])
{
	int count;		/* local prime count				*/
	double elapsed_time; 	/* execution time				*/
	int first;		/* index of the first sieve			*/
	int global_count;	/* global count of prime numbers 		*/
	int high_value; 	/* highest value assigned to this process 	*/
	int i;			/* loop counter					*/
	int id;			/* this process id				*/
	int index;		/* index of the current sieve			*/
	int low_value;		/* lowest value assigned to this process 	*/
	int *marked;		/* array elements to be  marked			*/
	int n;			/* value of the largest number			*/
	int p; 			/* number of processes				*/
	int proc0_size;		/* number of elements assigned to process zero 	*/
				/* this is to find if process zero has all primes */
	int prime;		/* current prime or sieve			*/
	int size;		/* elements in marked array 			*/
	int seed_size;
	char cpu_name[MPI_MAX_PROCESSOR_NAME];
	int namelen;

	MPI_Init (&argc, &argv);
	/* start timer */
	MPI_Barrier(MPI_COMM_WORLD);
	elapsed_time = -MPI_Wtime();
	MPI_Comm_rank (MPI_COMM_WORLD, &id);
	MPI_Comm_size (MPI_COMM_WORLD, &p);
	MPI_Get_processor_name(cpu_name, &namelen);

	if (argc != 2)
	{
		if (!id) printf ("Command line: %s <m>\n", argv[0]);
		MPI_Finalize();
		exit (1);
	}

	n = atoi(argv[1]);

	/* find how many elements are assigned to this process */
	low_value = BLOCK_LOW(id,p,n);
	high_value = BLOCK_HIGH(id,p,n);
	size = BLOCK_SIZE(id,p,n);
	seed_size = SEED_SIZE(n);
	proc0_size = (n-1)/(2*p);

	// - main loop works only for prime * prime <= n
	// - this means it only runs as long as
	//	prime <= sqrt(n)
	// - In this setup, the program will exit if proc0 doesn't hold all starting primes (i.e.
	// it will exit if a starting prime will need to be chosen from another process - which this
	// program is not prepared to do.
	/*
	if ((OFFSET + proc0_size) < (int) sqrt((double) n))
	{	
		if (!id) printf ("Too many processes\n");
		MPI_Finalize();
		exit (1);
	}*/
	// There is too many processes when we cannot split
	// up what is left of the numbers after taking out
	// the SEED section (which encloses sqrt(n))
	if (BLOCK_SXN_SIZE(n) < p)
	{
		if (!id) printf ("Too many processes\n");
		MPI_Finalize();
		exit (1);	
	}

	marked = (int *) malloc ((seed_size + size) * sizeof(int));

	if (marked == NULL)
	{
		printf ("Cannot allocate enough memory\n");
		MPI_Finalize();
		exit (1);
	}

	for (i = 0; i < (seed_size + size); i++) marked[i] = 0;

	index = 0;

	prime = OFFSET;
	if(!id) printf("[%d-%s] SEED:  low[     0]=     3, high[%6d]=%6d (%d)\n", id, cpu_name, (SEED_SIZE(n)-1), SEED_HIGH(n), seed_size);
	MPI_Barrier(MPI_COMM_WORLD);	
	printf("[%d-%s] ARRAY: low[%6d]=%6d, high[%6d]=%6d (%d)\n", id, cpu_name, seed_size, low_value, seed_size + size-1, high_value, size);
	do {
		//printf("[%d] *prime: %d\n", id, prime);
		// SEED marking - mark the multiples of the seed - within
		// the seed block. Start marking from the value of 
		// the prime * prime, which is at the position
		// index + prime
		for (i = index + prime; i < seed_size; i += prime) 
		{
			//printf("[%d] marked: %d\n", id, (OFFSET + (i * 2)));
			marked[i] = 1;
		}

		// Now, we need to continue to the block section, and keep 
		// marking. But we first need to find out where to start.
		// For each process, we need to find the first element to mark.
		// The first number that we would have to mark is prime * prime

		// If the first number to mark (prime * prime) is at least 
		// above the low bound of this process ...
		if (prime * prime > low_value) 
		{
			// ... Then the first index is that number (prime * prime)
			// - the low_value. E.g:
			// prime = 7
			// low_value = 41
			// first = [(7*7) - 41]/2 = (49 - 41)/2 = 8/2 = 4
			// marked[4] will be marked first
			first = (prime * prime - low_value)/2;
			//printf("[%d] first(a): %d(%d)\n", id, first, low_value + (2 * first));
		

		}
		else 
		{
			// This section is for "run-on" arrays, e.g.
			// prime=3
			// p0 [ 3| 5] 3*3 > 3
			// p1 [ 7| 9] 3*3 !> 7 "run-on", need mark 9
			// p2 [11|13] 3*3 !> 11 "run-on"
		 	if (!(low_value % prime)) 
			{
				// If it is, then the first element of the
				// array will be the starting place			
				first = 0;
				//printf("[%d] first(b): %d(%d)\n", id, first, low_value + (2 * first));
			}		 	
			else 
			{
				// (3 - (11 % 3))/2
				// (3 - (2))/2s
				int tmp = prime - (low_value % prime);
				first = tmp % 2 == 0 ? tmp/2 : (tmp + prime)/2;
				//printf("[%d] first(c): %d(%d)\n", id, first, low_value + (2 * first));
			}
		}
		//  0         sqrt(n)
		// [SEED             ][     BLOCK_SXN ]
		// SEED_SIZE -------> ---> first
		// ----------------------> first + SEED_SIZE
		// Now, mark all multiples of the prime. 
		// 'first' is a multiple of the prime, so += prime
		// is also a multiple.
		for (i = first; i < size; i += prime)
		{
			//printf("[%d] marked: %d\n", id, (low_value + (i * 2)));
			marked[i + seed_size] = 1;
		}

		// Increase 'index' which pointed to the last
		// smallest prime, until it reaches the next
		// smallest prime.
		while (marked[++index]);

		// Remember, each 'index' in process 0 
		// represents the number = index + 2
		prime = (2 * index) + OFFSET;
	} while (prime * prime <= n);

	count = 0;

	// Total up the amount of items that are not marked
	// i.e. the number of local primes

	// REMOVE LATER!!!!
	//sleep(1);
	MPI_Barrier(MPI_COMM_WORLD);
	//

	
	//printf("[%d] ", id);
	if (!id)
	{
		for (i = 0; i < seed_size; i++) 
		{
			if (!marked[i]) 
			{
				//printf("%d,", ((2*i)+low_value));
				count++;
			}
		}
	}
	for (i = 0; i < size; i++) 
	{
		if (!marked[i + seed_size]) 
		{
			//printf("%d,", ((2*i)+low_value));
			count++;
		}
	}
	//printf("\n");

	
	// process 0 will receive the sum of the number
	// of local primes
	MPI_Reduce (&count, &global_count, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);

	elapsed_time += MPI_Wtime();

	if (!id) {
		global_count++; 	// To account for 2

		printf ("%d primes are less than or equal to %d\n",
		global_count, n);
		printf ("Total elapsed time: %10.6f\n", elapsed_time);
	}

	MPI_Finalize ();

	return 0;
}