int parallelMax(int* A, int s) {
	int blocksize, localMax, i, limit;
	int* maxs;	
	blocksize = ceil((double)n/p); //get block size
	
	i = blocksize * s;
	limit = MIN(n, blocksize * (s+1));
	
	localMax = A[i];
	for(i += 1; i < limit;i++) {
		localMax = MAX(localMax, A[i]);
	}
	
	maxs = (int*) malloc(sizeof(int) * p);
	bsp_push_reg(maxs, sizeof(int) * p); //Make maxs visible globally
	bsp_sync(); //sync
	
	bsp_put(0, &localMax, maxs, s * sizeof(int), sizeof(int)); //send localMax to P0
	bsp_sync();
	
	if(s == 0) {
		localMax = maxs[0];
		for(i = 1; i < p; i++) {
			localMax = MAX(localMax, maxs[i]);
		}
	}
	
	bsp_push_reg(&localMax, sizeof(int)); //Make localMax visible globally
	bsp_sync();
	
	bsp_get(0, &localMax, 0, &localMax, sizeof(int)); //each processor gets the min
	bsp_sync();
	
	return localMax;
}
Beispiel #2
0
// Registers two one-byte variables (one registration per superstep) and then
// lets processor 0 write the character 'y' into both of them on processor 3
// via bsp_hpput.
int main()
{
    bsp_begin();

    int pid = bsp_pid();

    char first = 0, second = 0, payload = 0;

    bsp_push_reg(&first, sizeof(char));
    bsp_sync();
    bsp_push_reg(&second, sizeof(char));
    bsp_sync();

    if (pid == 0)
    {
        payload = 'y';

        // Same destination processor and offset for both registered variables.
        char* targets[2] = { &first, &second };
        for (int k = 0; k < 2; ++k)
        {
            bsp_hpput(3, &payload, targets[k], 0, sizeof(char));
        }
    }

    bsp_end();

    return 0;
}
Beispiel #3
0
double bspip(int p, int s, int n, double *x, double *y){
    /* Compute inner product of vectors x and y of length n>=0.

       p: number of processors taking part.
       s: id of the calling processor; its partial sum is stored in slot s.
       x, y: local vector parts; elements 0 .. nloc(p,s,n)-1 are read.
       Returns the sum of the partial inner products of all p processors.
       Must be called by all processors together (two bsp_sync() calls). */

    int nloc(int p, int s, int n);
    double inprod, *Inprod, alpha;
    int i, t, nl;

    Inprod= vecallocd(p); bsp_push_reg(Inprod,p*SZDBL);
    bsp_sync();

    /* The local length does not change inside the loop, so compute it once
       instead of calling nloc in every loop test. */
    nl= nloc(p,s,n);
    inprod= 0.0;
    for (i=0; i<nl; i++){
        inprod += x[i]*y[i];
    }

    /* Send the local partial sum to every processor, including this one. */
    for (t=0; t<p; t++){
        bsp_put(t,&inprod,Inprod,s*SZDBL,SZDBL);
    }
    bsp_sync();

    /* Every processor adds up all p partial sums itself. */
    alpha= 0.0;
    for (t=0; t<p; t++){
        alpha += Inprod[t];
    }
    bsp_pop_reg(Inprod); vecfreed(Inprod);

    return alpha;

} /* end bspip */
Beispiel #4
0
    void Simulation::report()
    {
        size_t total_p = bsp_nprocs();
        size_t s = bsp_pid();
        double *densities = new double[total_p]();
        double current_density = 0;
        bsp_push_reg(densities,total_p * sizeof(double));
        bsp_sync();

        for (auto node : d_domain->nodes)
            current_density += density(d_domain->set, node);

        // send density to each processor
        for (size_t t = 0; t < total_p; t++)
            bsp_put(t, &current_density, densities, s * sizeof(double), sizeof(double));

        bsp_sync();
        // now calculate the total density
        double total_density = 0;
        for (size_t t = 0; t < total_p; t++)
            total_density += densities[t];

        bsp_pop_reg(densities);

        if (s == 0)
            std::cout << "Total density: " << total_density << '\n';

        delete[] densities;
    }
Beispiel #5
0
// Demonstrates bsp_put with buffers that are stored in a std::map.
//
// Each of the four processes allocates two zeroed int[3] buffers ('m' and
// 's') and registers 's'. Even-numbered processes fill 's' with values that
// depend on their id and put it into 's' of the next process. Finally every
// process prints its 's' buffer. The loops over i together with the
// bsp_sync() per iteration let the processes take turns.
void withPut() {
    bsp_begin( 4 );
    std::map<char, int*> m;
    for(int i = 0; i < 4; ++i) {
        if(i == bsp_pid()) {
            // Init
            m['m'] = new int[3];
            memset(m['m'], 0, sizeof(int)*3);
            m['s'] = new int[3];
            memset(m['s'], 0, sizeof(int)*3);

            if(0 == bsp_pid() % 2) {
                m['s'][0] = bsp_pid()*5+1;
                m['s'][1] = bsp_pid()*10+1;
                m['s'][2] = bsp_pid()*15+1;
                //std::cout << "proc " << bsp_pid() << " unregistered "
                //          << m['m'] << std::endl << std::flush;
            }
            //if(1 == bsp_pid() % 2) {
                std::cout << "proc " << bsp_pid() << " registering "
                          << m['s'] << std::endl << std::flush;
                bsp_push_reg(m['s'], 3*sizeof(int));
            //}
        }
        bsp_sync();
    }

    bsp_sync();
    for(int i = 0; i < 4; ++i) {
        if(i == bsp_pid()) {
            if(0 == i % 2) {
                std::cout << "proc " << bsp_pid() << " puts to proc "
                         << bsp_pid() + 1 << "data from " << m['s']
                         << " to " << m['s']
                         << std::endl << std::flush;
                bsp_put(bsp_pid() + 1,
                        m['s'], 
                        m['s'], 0, 3 * sizeof(int));
            }
        }
        bsp_sync();
    }
    
    bsp_sync();
    // print values
    for(int i = 0; i < 4; ++i) {
        if(i == bsp_pid()) {
            bsp_pop_reg(m['s']);
            std::cout << "Proc {" << bsp_pid() << "} contains"
                      << std::endl << std::flush;
            // Separate index name so the turn counter i is not shadowed.
            for(int k = 0; k < 3; ++k)
                std::cout << m['s'][k] << " "; 
            std::cout << std::endl << std::flush;
        }
        bsp_sync();
    }

    // Release the buffers; the registration of 's' was popped above and at
    // least one bsp_sync() has been executed since.
    delete[] m['m'];
    delete[] m['s'];
    bsp_end();
}
Beispiel #6
0
// Test program for registration and bsp_hpput error handling. Some of the
// calls below are wrong on purpose; the expectation comments after them
// record the message each step should produce.
int main() {
    bsp_begin();
    int var = bsp_pid();
    // Address that is never passed to bsp_push_reg.
    int* unregistered_var = (int*)0x7000;
    char teststr[] = "Default test string!";
    char goodstr[] = "Replacement string.";
    bsp_push_reg(&var, sizeof(int));

    // Every core except core 2 syncs between the two registrations.
    if (bsp_pid() != 2)
        bsp_sync();

    // Registered with sizeof(int) bytes, not with the length of the string.
    bsp_push_reg(teststr, sizeof(int));

    // Only core 2 will do both registrations in the same sync
    if (bsp_pid() == 2)
        bsp_sync();
    // expect: ($02: BSP ERROR: multiple bsp_push_reg calls within one sync)

    if (bsp_pid() == 1) {
        // Writes core 1's var (value 1) into var on core 0.
        bsp_hpput(0, &var, &var, 0, sizeof(int));
        bsp_hpput(0, &var, unregistered_var, 0, sizeof(int)); // Error
        // expect: ($01: BSP ERROR: could not find bsp var 0x7000)
    }
    if (bsp_pid() == 0) {
        // Copies the 19 characters of goodstr over the start of teststr on
        // core 1; the 20th character of teststr ('!') is left in place.
        bsp_hpput(1, goodstr, teststr, 0, 19 * sizeof(char));
    }

    bsp_sync();

    if (bsp_pid() == 0)
        ebsp_message("%d", var);
    // expect: ($00: 1)
    bsp_sync();
    if (bsp_pid() == 1)
        ebsp_message(teststr);
    // expect: ($01: Replacement string.!)

    bsp_end();
    return 0;
}
Beispiel #7
0
// Allocates the per-process storage: registers the rows kept in over_ with
// the BSP library and allocates the L_ chunks of every chunk row i with
// i % n_ == id_.
void
DLargestCommonSubSequence::distributedInit() {

    // Number of rows to register: chunkStride_ / n_, rounded up.
    int rowsToExportPerProc = chunkStride_/n_;
    if(0 != chunkStride_%n_) {
        ++rowsToExportPerProc;
    }

    for(int r = 0; r < rowsToExportPerProc; ++r) {
        Row curr;
        curr.resize(chunkLength_);
        over_.push_back(curr);
        bsp_push_reg(over_.back().data(), sizeof(int)*chunkLength_);
    }

    for(int i = 0; i < chunkStride_; ++i) {
        // The test does not depend on j, so whole chunk rows are skipped.
        if(id_ != i % n_) {
            continue;
        }

        for(int j = 0; j < chunkStride_; ++j) {
            // Full chunkLength_ x chunkLength_ chunk (i,j), zero-filled.
            int** chunk = new int*[chunkLength_];
            for(int k = 0; k < chunkLength_; ++k) {
                chunk[k] = new int[chunkLength_]();
            }
            L_[getCPair(i,j)] = chunk;

            if(i == 0) {
                continue;
            }

            // For the chunk above, (i-1,j), only the last row is allocated
            // (zero-filled); all other row pointers stay NULL.
            int** above = new int*[chunkLength_];
            for(int k = 0; k < chunkLength_; ++k) {
                above[k] = NULL;
            }
            if(chunkLength_ > 0) {
                above[chunkLength_-1] = new int[chunkLength_]();
            }
            L_[getCPair(i-1,j)] = above;
        }
    }
}
void bspinprod(){
    
    double bspip(int p, int s, int n, double *x, double *y);
    int nloc(int p, int s, int n);
    double *x, alpha, time0, time1;
    int p, s, n, nl, i, iglob;
    
    bsp_begin(P);
    p= bsp_nprocs(); /* p = number of processors obtained */ 
    s= bsp_pid();    /* s = processor number */ 
    if (s==0){
        printf("Please enter n:\n"); fflush(stdout);
        scanf("%d",&n);
        if(n<0)
            bsp_abort("Error in input: n is negative");
    }
    bsp_push_reg(&n,SZINT);
    bsp_sync();

    bsp_get(0,&n,0,&n,SZINT);
    bsp_sync();
    bsp_pop_reg(&n);

    nl= nloc(p,s,n);
    x= vecallocd(nl);
    for (i=0; i<nl; i++){
        iglob= i*p+s;
        x[i]= iglob+1;
    }
    bsp_sync(); 
    time0=bsp_time();

    alpha= bspip(p,s,n,x,x);
    bsp_sync();  
    time1=bsp_time();

    printf("Processor %d: sum of squares up to %d*%d is %.lf\n",
            s,n,n,alpha); fflush(stdout);
    if (s==0){
        printf("This took only %.6lf seconds.\n", time1-time0);
        fflush(stdout);
    }

    vecfreed(x);
    bsp_end();

} /* end bspinprod */
Beispiel #9
0
// Demonstrates bsp_get on std::vector storage: every process registers its
// vector b, odd-numbered processes then read two elements (starting at
// element 1) of b on process 0 into the front of their vector a, and all
// processes print a in turn.
void spmd() {
    bsp_begin( 4 );

    std::vector<unsigned int> a(3);
    std::vector<unsigned int> b;

    // Only even-numbered processes fill the vectors; on the others a keeps
    // its three zeros and b stays empty.
    const bool even = (0 == bsp_pid() % 2);
    if(even) {
        a = {bsp_pid()*5+1, bsp_pid()*10+1, bsp_pid()*15+1};
        b = a;
    }

    bsp_sync();

    bsp_push_reg(b.data(), b.size() * sizeof(unsigned int));

    bsp_sync();

    if(!even) {
        bsp_get(0,
                b.data(),
                1 * sizeof(unsigned int),
                a.data(),
                2 * sizeof(unsigned int));
    }

    bsp_sync();

    // One process prints per superstep.
    for(int turn = 0; turn < 4; ++turn) {
        if(turn == bsp_pid()) {
            std::cout << "Proc {" << bsp_pid() << "} contains"
                      << std::endl << std::flush;
            for(unsigned int value : a) {
                std::cout << value << " ";
            }
            std::cout << std::endl << std::flush;
        }
        bsp_sync();
    }

    bsp_pop_reg(b.data());
    bsp_end();
}
Beispiel #10
0
void withMaps() {
    bsp_begin( 4 );
    // Init
    std::map<char, int*> n;
    std::map<char, std::vector<unsigned int> > m;
    std::vector<unsigned int> a;
    a.resize(3);
    m['m'] = a;
    bsp_push_reg(m['m'].data(), m['m'].capacity()*sizeof(unsigned int));

    if(0 == bsp_pid() % 2) {
        m['m'] = {bsp_pid()*5+1, bsp_pid()*10+1, bsp_pid()*15+1};
    }

    bsp_sync();
    // getting values of even into odd
    if(1 == bsp_pid() % 2) {
        bsp_get(bsp_pid() - 1, m['m'].data(), 0,
                m['m'].data(), 3 * sizeof(unsigned int));
    }
    
    bsp_sync();
    // print values
    for(int i = 0; i < 4; ++i) {
        if(i == bsp_pid()) {
            std::cout << "Test " << n['n'] <<std::endl;
            std::cout << "Proc {" << bsp_pid() << "} contains"
                      << std::endl << std::flush;
            for(std::vector<unsigned int>::const_iterator it = m['m'].cbegin();
                it != m['m'].cend(); ++it)
                std::cout << *it << " "; 
            std::cout << std::endl << std::flush;
        }
        bsp_sync();
    }
    bsp_pop_reg(m['m'].data());
    bsp_end();
}
Beispiel #11
0
//initialisation function for ip
void ip_init( double **ip_buffer ) {
	const size_t size = bsp_nprocs() * sizeof(double);
	*ip_buffer = malloc( size );
	bsp_push_reg( *ip_buffer, size );
}
void countSort() {
	int s, i, j, min, max, blocksize, start, limit, localCount, index;
	double time, time0, time1;
	int *A, *C, *localB, *sizes;
	
	bsp_begin(p); //Begin Parallel
	
    s = bsp_pid(); //Current Processor Number
	
	A = (int*)malloc(sizeof(int) * n);
	
	if(s == 0) {
		for(i = 0; i < n; i++) { //Generating the array
			A[i] = rand() % 100;
		}
	}
	
	bsp_sync(); //sync
	
	time0 = bsp_time();
	
	bsp_push_reg(A, sizeof(int) * n); //push the array
	bsp_sync(); //sync

    bsp_get(0, A, 0, A, sizeof(int)* n); //Get the array
	bsp_sync(); //sync
	
	min = parallelMin(A, s); //find min of A
	max = parallelMax(A, s); //find max of A
	
	blocksize = ceil((double)(max - min + 1)/p); //get block size
	
	start = blocksize * s; //start index in C
	limit = MIN(max - min + 1, blocksize * (s+1)); //end index in C
	
	C = (int*) malloc(sizeof(int) * blocksize); //init C to 0
	for(i = 0; i < blocksize; i++) {
		C[i] = 0;
	}
	
	localCount = 0;
	for(i = 0; i < n;i++) { //fill C for values in Range[start, limit[
		int tmp = A[i] - min;
		if(start <= tmp && tmp < limit) {
			C[tmp - start] += 1;
			localCount++;
		}
	}
	
	if(localCount > 0) {
		localB = (int*)malloc(sizeof(int) * localCount);
		
		int tmp = limit - start;
		int j = 0;
		for(i = 0; i < tmp; i++) { //Generate localB from C
			if(C[i] > 0) {
				localB[j] = i + start + min;
				C[i] = C[i] - 1;
				i--;
				j++;
			}
		}
	}
	
	sizes = (int*) malloc(sizeof(int) * p);
	bsp_push_reg(sizes, sizeof(int) * p);
	bsp_sync(); //sync
	
	bsp_put(0, &localCount, sizes, s * sizeof(int), sizeof(int)); //send localCount to P0
	bsp_sync(); //sync
	
	index = 0;
	bsp_push_reg(&index, sizeof(int));
	bsp_sync(); //sync
	
	if(s == 0) { //Processor 0 sends start index to all processors	
		int tmp = 0;
		for(i = 0; i < p; i++) {
			bsp_put(i, &tmp, &index, 0, sizeof(int));
			tmp += sizes[i];
		}
	}
	
	bsp_sync(); //sync
	
	bsp_put(0, localB, A, index * sizeof(int), sizeof(int) * localCount); //put localB in its place in A
	bsp_sync(); //A now contains the final sorted array
	
	time1 = bsp_time();
	time = time1 - time0;
	
	if(s == 0) { //printing Result
		printf("Number of processors: %d \t Input Size: %d \t Time Taken: %.8f", p, n, time);
	}
	
	bsp_end();
}
Beispiel #13
0
void bspParSort(){

  int Log2(int x);
  void mergeSort(int x, int *temp1);
  void merge2(int *arr1, int *arr2, int size);

  int *localArr; /* local array in each processor */
  int i,j,k; /* index variables */
  int n_divide_p; /* Avoid multiple computation */
  int n; /* Number of elements to be sorted */
  int szLocalArray; /* Size of local array */
  double time0, time1; /* Time */
  FILE *ifp = 0; /* Reader to read sequence of numbers to be sorted */

  bsp_begin(P);
  int p= bsp_nprocs(); /* Number of processors obtained */ 
  int s= bsp_pid();    /* Processor number */ 

  //Get number of elements to be sorted
  if(s==0){
    ifp = fopen("sort","r");
    if(ifp == NULL){
      fprintf(stderr, "Can't open input file!\n");
      exit(1);
    }
    fscanf(ifp, "%i", &n);
  }

  // Make sure every processor knows everything
  bsp_push_reg(&n,sizeof(int));
  bsp_sync();
  bsp_get(0,&n,0,&n,sizeof(int));
  bsp_sync();
  bsp_pop_reg(&n);

  //Setup distribution 
  n_divide_p = n/p;
  szLocalArray = n/pow(2,ceil(Log2(s+1)));
  localArr = vecalloci(szLocalArray);
  bsp_push_reg(localArr,sizeof(int)*szLocalArray);

  if(s==0){ 
    printf("Distribution start\n"); fflush(stdout); 
  }

  bsp_sync();
  int value;
  if(s==0){
    //allocate to array on proc 0
    for(i=0; i< n_divide_p; i++){
      fscanf(ifp, "%i", &value);
      localArr[i]=value;      
    }
    //Send to arrays on other processors
    for(i=1; i< p; i++){
      for(j=0;j<n_divide_p;j++){
        fscanf(ifp, "%i", &value);
        bsp_put(i,&value,localArr,j*sizeof(int),sizeof(int));
      }
    }
    fclose(ifp);
  }
  bsp_sync();
  if(s==0){ 
    printf("Distribution done\n"); fflush(stdout); 
  }

  //Distribution done and we can start time measurement 
  if(s==0){
    printf("Time start\n"); fflush(stdout);
  }
  time0 = bsp_time();

  //Locally sort each array
  if(s==0){
    printf("Local sort\n"); fflush(stdout);
  }
  mergeSort(n_divide_p, localArr);
  bsp_sync();

  //Merging 
  int *temp = malloc(sizeof(int)*pow(2,Log2(p))*n_divide_p);
  for(j=1;j<Log2(p)+1;j++){
    if(s<p/pow(2,j)){
      for(k=0;k<pow(2,j-1)*n_divide_p;k++){
        bsp_get(s+(p/pow(2,j)),localArr,k*sizeof(int),&(temp[k]),sizeof(int));
      }
    }
    bsp_sync();

    if(s<p/pow(2,j)){
      merge2(localArr, temp, n_divide_p*pow(2,j-1));
    }

    bsp_sync();
    if(s==0){ 
      printf("Round %i out of %i rounds of merging done (on proc 0)\n",j,Log2(p)); fflush(stdout); 
    }
  }
  if(s==0){
    printf("Sorting done\n"); fflush(stdout);
  }
  bsp_sync();
 
  //Print sorted array - expensive if sample is big
  /*
  if(s==0){
    printf("Sorted sequence is:\n");
    for(i=0; i<szLocalArray; i++){
      printf("%i ",localArr[i]); fflush(stdout);
    }
    printf("\n"); fflush(stdout);
  }
  */

  //Parallel algorithm ends
  time1 = bsp_time();
  if(s==0){
    printf("Time stop\n"); fflush(stdout);
  }

  //Report time to user
  if(s==0){
    printf("Sorting took %.6lf seconds.\n", time1-time0); fflush(stdout);
  }
  
  //Clean up
  free(temp);
  bsp_pop_reg(localArr); free(localArr);

  bsp_end();
} /* End bspParSort */
/*
 * Iterative five-point averaging of an N x N grid with a fixed border,
 * distributed over a sqroot x sqroot grid of processes.
 *
 * An (N+2) x (N+2) matrix of pseudo-random values is created (fixed seed).
 * Processor 0 hands every process its size x size block (size = N/sqroot)
 * and the fixed border values next to that block. Each iteration first
 * exchanges the edge rows/columns with the four neighbouring processes,
 * then replaces every cell by 0.2 * (cell + its four neighbours) and
 * accumulates the absolute change. The loop ends when the summed change of
 * all processes divided by N*N drops below TOL.
 *
 * Uses the names N and TOL that are defined outside this function.
 */
void parallel_part()
{
    int i, j;
    srand(1452764);

    //Matrix initialization
    float **matrix = (float**)calloc(N+2, sizeof(float*));
    for (i=0; i<N+2; i++) {
        matrix[i] = (float*)calloc(N+2, sizeof(float));
    }
    for (i=0; i<N+2; i++) {
        for (j=0; j<N+2; j++) {
            matrix[i][j] = (float)rand()/(float)RAND_MAX;
            //printf("row %d, coloum %d, element: %f\n", i, j, matrix[i][j]);
        }
    }

    //Parallel part
    bsp_begin(bsp_nprocs());
    int pid, x, y, done;
    pid=x=y=done=0;
    int sqroot = (int)(sqrt(bsp_nprocs()));
    int size = (int)(N/sqroot);    //side
    float Ai_jm1, Aim1_j, Ai_jp1, Aip1_j;
    Ai_jm1 = Aim1_j = Ai_jp1 = Aip1_j = 0.0;
    float temp, diff, convergence, total_diff;
    temp = convergence = 0.0;
    float *diffs = (float*)calloc(bsp_nprocs(), sizeof(float));

    //(N/sqrt(p)) is an integer assurance
    if ( N%sqroot!=0) {
        bsp_abort("N/sqrt(p) is not an integer.\nProgram Aborted.\n");
    }

    //Initialize a piece of matrix in decomposition
    float **sub_martix = (float**)calloc(size, sizeof(float*));
    for (i=0; i<size; i++) {
        sub_martix[i] = (float*) calloc(size, sizeof(float));
    }
    //Initialize borders
    float *upper = (float*)calloc(size, sizeof(float));
    float *lower = (float*)calloc(size, sizeof(float));
    float *left = (float*)calloc(size, sizeof(float));
    float *right = (float*)calloc(size, sizeof(float));
    float *overlap = (float*)calloc(size, sizeof(float));

    bsp_push_reg(&diff, sizeof(float));
    bsp_push_reg(upper, size*sizeof(float));
    bsp_push_reg(lower, size*sizeof(float));
    bsp_push_reg(left, size*sizeof(float));
    bsp_push_reg(right, size*sizeof(float));

    //Make each matrix and border available globally
    for (i=0; i<size; i++) {
        bsp_push_reg(sub_martix[i], size*sizeof(float));
    }
    bsp_sync();
    /*Processor 0 distributes the data*/
    if (bsp_pid()==0) {
        //Processor 0 uses its own sub_martix as staging buffer. The
        //processes are visited in descending order so that the last block
        //copied into the buffer is processor 0's own block.
        for (pid = bsp_nprocs()-1; pid>=0; pid--) {
            //Determine which part of the original matrix
            x = pid/sqroot;
            y = pid%sqroot;
            //Then the processor 0 copy the data to each processor
            for (i=0; i<size; i++) {
                for (j=0; j<size; j++) {
                    sub_martix[i][j] = matrix[x*size+i+1][y*size+j+1];
                }
            }
            if (pid!=0) {
                for (i=0; i<size; i++) {
                    bsp_put(pid, sub_martix[i], sub_martix[i], 0, size*sizeof(float));
                }
            }
        }
    }
    bsp_sync();

    if (bsp_pid()==0) {
        //Same staging scheme for the border arrays: descending order leaves
        //processor 0's own upper/left border in place at the end.
        for (pid = bsp_nprocs()-1; pid>=0; pid--) {
            x=pid/sqroot;   //block row of process pid
            y=pid%sqroot;   //block column of process pid

            //if the part is in 1st row
            if (x==0) {
                for (i=0; i<size; i++) {
                    upper[i] = matrix[0][y*size+1+i];
                }
            }
            //if the part is in leftmost column
            if (y==0) {
                for (i=0; i<size; i++) {
                    left[i] = matrix[x*size+1+i][0];
                }
            }
            //if the part is in last row
            if (x==sqroot-1) {
                for (i=0; i<size; i++) {
                    lower[i] = matrix[N+1][y*size+1+i];
                }
            }
            //if the part is in rightmost column
            if (y==sqroot-1) {
                for (i=0; i<size; i++) {
                    right[i] = matrix[x*size+1+i][N+1];
                }
            }

            if (pid!=0) {
                bsp_put(pid, upper, upper, 0, size*sizeof(float));
                bsp_put(pid, lower, lower, 0, size*sizeof(float));
                bsp_put(pid, left, left, 0, size*sizeof(float));
                bsp_put(pid, right, right, 0, size*sizeof(float));
            }
        }
    }
    bsp_sync();

    /* Computation */
    while (!done) {
        pid = bsp_pid();
        diff=0.0;
        total_diff=0.0;
        x = pid/sqroot;
        y = pid%sqroot;

        //Halo exchange: send each edge of the own block to the neighbour
        //that needs it. bsp_put copies overlap at call time, so the buffer
        //can be reused for the next edge.
        if (x<sqroot-1) {
            for (i=0; i<size; i++) {
                overlap[i] = sub_martix[size-1][i];
            }
            bsp_put(bsp_pid()+sqroot, overlap, upper, 0, size*sizeof(float));
        }
        if (y<sqroot-1) {
            for (i=0; i<size; i++) {
                overlap[i]=sub_martix[i][size-1];
            }
            bsp_put(bsp_pid()+1, overlap, left, 0, size*sizeof(float));
        }
        if (x>0) {
            for (i=0; i<size; i++) {
                overlap[i]=sub_martix[0][i];
            }
            bsp_put(bsp_pid()-sqroot, overlap, lower, 0, size*sizeof(float));
        }
        if (y>0) {
            for (i=0; i<size; i++) {
                overlap[i]=sub_martix[i][0];
            }
            bsp_put(bsp_pid()-1, overlap, right, 0, size*sizeof(float));
        }
        bsp_sync();

        for (i=0; i<size; i++) {
            for (j=0; j<size; j++) {
                temp = sub_martix[i][j];
                //Neighbour above: halo/border row at the top edge
                if (i-1<0) {
                    Aim1_j=upper[j];
                }
                else {
                    Aim1_j=sub_martix[i-1][j];
                }
                //Neighbour below
                if (i+1>size-1) {
                    Aip1_j=lower[j];
                }
                else {
                    Aip1_j=sub_martix[i+1][j];
                }
                //Left neighbour: left[] holds one value per row, either the
                //fixed border or the neighbouring block's last column, so it
                //is indexed by the row i in both cases.
                if (j-1<0) {
                    Ai_jm1 = left[i];
                }
                else {
                    Ai_jm1 = sub_martix[i][j-1];
                }
                //Right neighbour: right[] is indexed by the row i as well
                if (j+1>size-1) {
                    Ai_jp1 = right[i];
                }
                else {
                    Ai_jp1 = sub_martix[i][j+1];
                }
                sub_martix[i][j] = 0.2*(sub_martix[i][j]
                                        + Ai_jm1
                                        + Aim1_j
                                        + Ai_jp1
                                        + Aip1_j);
                //printf("data is %f\n", sub_martix[i][j]);
                diff += fabs(sub_martix[i][j]-temp);
            }
        }

        //printf("Result from pid: %d: difference= %f \n", bsp_pid(), diff);
        bsp_sync();

        //Every process collects the local change of all processes
        for (i=0; i<bsp_nprocs(); i++) {
            bsp_get(i, &diff, 0, &diffs[i], sizeof(float));
        }
        bsp_sync();

        for (i=0; i<bsp_nprocs(); i++) {
            total_diff += diffs[i];
        }
        bsp_sync();
        convergence = (total_diff)/(float)(N*N);
        //printf("Current Convergence is %f\n", convergence);
        if (convergence<TOL) {
            done = 1;
        }
        bsp_sync();
    }

    for (i=0; i<size; i++) {
        bsp_pop_reg(sub_martix[i]);
    }
    bsp_pop_reg(&diff);
    bsp_pop_reg(lower);
    bsp_pop_reg(upper);
    bsp_pop_reg(left);
    bsp_pop_reg(right);
    bsp_sync();

    for (i=0; i<size; i++) {
        free(sub_martix[i]);
    }
    free(sub_martix);
    free(diffs);
    free(lower);
    free(upper);
    free(left);
    free(right);
    free(overlap);
    bsp_sync();
    bsp_end();
    for (i=0; i<N+2; i++) {
        free(matrix[i]);
    }
    free(matrix);

}
Beispiel #15
0
/* Test driver for bspfft.
 *
 * Processor 0 reads the vector length n and sends it to the other
 * processors. Every processor then fills its local part of a complex test
 * vector (real and imaginary parts interleaved; local element j has global
 * index j*p+s), runs NITERS table initializations and NITERS
 * forward/backward transform pairs, and measures how far the result has
 * moved away from the initial vector. The largest error of all processors
 * is determined on processor 0, which prints timings, the computing rate
 * and the error; every processor prints its first elements.
 */
void bspfft_test()
{
    void bspfft( double * x, int n, int p, int s, int sign, double * w0,
                 double * w, double * tw, int *rho_np, int *rho_p );
    void bspfft_init( int n, int p, int s, double * w0,
                      double * w, double * tw, int *rho_np, int *rho_p );
    int k1_init( int n, int p );

    int p, s, n, q, np, k1, j, jglob, it, *rho_np, *rho_p;
    double time0, time1, time2, ffttime, nflops,
           max_error, error_re, error_im, error,
           *Error, *x, *w0, *w, *tw;

    bsp_begin( P );
    p = bsp_nprocs();
    s = bsp_pid();

    /* Register n and the array that collects one error value per processor. */
    bsp_push_reg( &n, SZINT );
    Error = vecallocd( p );
    bsp_push_reg( Error, p * SZDBL );
    bsp_sync();

    if ( s == 0 )
    {
        printf( "Please enter length n: \n" );

#ifdef _WIN32
        scanf_s( "%d", &n );
#else
        scanf( "%d", &n );
#endif

        if ( n < 2 * p )
        {
            bsp_abort( "Error in input: n < 2p" );
        }

        /* Send n to all other processors. */
        for ( q = 1; q < p; q++ )
        {
            bsp_put( q, &n, &n, 0, SZINT );
        }
    }

    bsp_sync();

    if ( s == 0 )
    {
        printf( "FFT of vector of length %d using %d processors\n", n, p );
        printf( "performing %d forward and %d backward transforms\n",
                NITERS, NITERS );
    }

    /* Allocate, register,  and initialize vectors */
    np = n / p;   /* number of local complex elements */
    x = vecallocd( 2 * np );
    bsp_push_reg( x, 2 * np * SZDBL );
    k1 = k1_init( n, p );
    w0 = vecallocd( k1 );
    w =  vecallocd( np );
    tw = vecallocd( 2 * np + p );
    rho_np = vecalloci( np );
    rho_p =  vecalloci( p );

    /* Test vector: real part = global index, imaginary part = 1. */
    for ( j = 0; j < np; j++ )
    {
        jglob = j * p + s;
        x[2 * j] = ( double )jglob;
        x[2 * j + 1] = 1.0;
    }

    bsp_sync();
    time0 = bsp_time();

    /* Initialize the weight and bit reversal tables */
    for ( it = 0; it < NITERS; it++ )
    {
        bspfft_init( n, p, s, w0, w, tw, rho_np, rho_p );
    }

    bsp_sync();
    time1 = bsp_time();

    /* Perform the FFTs */
    for ( it = 0; it < NITERS; it++ )
    {
        bspfft( x, n, p, s, 1, w0, w, tw, rho_np, rho_p );
        bspfft( x, n, p, s, -1, w0, w, tw, rho_np, rho_p );
    }

    bsp_sync();
    time2 = bsp_time();

    /* Compute the accuracy */
    max_error = 0.0;

    /* Largest distance between a local element and its initial value. */
    for ( j = 0; j < np; j++ )
    {
        jglob = j * p + s;
        error_re = fabs( x[2 * j] - ( double )jglob );
        error_im = fabs( x[2 * j + 1] - 1.0 );
        error = sqrt( error_re * error_re + error_im * error_im );

        if ( error > max_error )
        {
            max_error = error;
        }
    }

    /* Store the local maximum in slot s of Error on processor 0. */
    bsp_put( 0, &max_error, Error, s * SZDBL, SZDBL );
    bsp_sync();

    if ( s == 0 )
    {
        max_error = 0.0;

        /* Maximum over the values of all processors. */
        for ( q = 0; q < p; q++ )
        {
            if ( Error[q] > max_error )
            {
                max_error = Error[q];
            }
        }
    }

    /* Every processor prints at most NPRINT of its local elements. */
    for ( j = 0; j < NPRINT && j < np; j++ )
    {
        jglob = j * p + s;
        printf( "proc=%d j=%d Re= %f Im= %f \n", s, jglob, x[2 * j], x[2 * j + 1] );
    }

    fflush( stdout );
    bsp_sync();

    if ( s == 0 )
    {
        printf( "Time per initialization = %lf sec \n",
                ( time1 - time0 ) / NITERS );
        /* Each iteration performed two transforms (forward and backward). */
        ffttime = ( time2 - time1 ) / ( 2.0 * NITERS );
        printf( "Time per FFT = %lf sec \n", ffttime );
        /* Flop count used for the rate: 5 n log2(n) + 2 n. */
        nflops = 5 * n * log( ( double )n ) / log( 2.0 ) + 2 * n;
        printf( "Computing rate in FFT = %lf Mflop/s \n",
                nflops / ( MEGA * ffttime ) );
        printf( "Absolute error= %e \n", max_error );
        printf( "Relative error= %e \n\n", max_error / n );
    }


    /* Deregister in reverse order of registration, then free the memory. */
    bsp_pop_reg( x );
    bsp_pop_reg( Error );
    bsp_pop_reg( &n );
    bsp_sync();

    vecfreei( rho_p );
    vecfreei( rho_np );
    vecfreed( tw );
    vecfreed( w );
    vecfreed( w0 );
    vecfreed( x );
    vecfreed( Error );
    bsp_end();

} /* end bspfft_test */
Beispiel #16
0
/* Benchmark that measures three quantities on the p processors obtained:
 *   r    - the computing rate, from timed DAXPY loops of growing length n;
 *   g, l - from least-squares fits (leastsquares) to the measured times of
 *          h-relations for h = 0 .. MAXH.
 * Processor 0 prints all results.
 */
void bspbench(){
    void leastsquares(int h0, int h1, double *t, double *g, double *l);
    int p, s, proc, rep, i, n, h, destproc[MAXH], destindex[MAXH];
    double alpha, beta, x[MAXN], y[MAXN], z[MAXN], src[MAXH], *dest,
           time0, time1, time, *Time, mintime, maxtime,
           nflops, r, g0, l0, g, l, t[MAXH+1];

    /**** Determine p ****/
    bsp_begin(P);
    p= bsp_nprocs(); /* p = number of processors obtained */
    s= bsp_pid();    /* s = processor number */
    Time= vecallocd(p);
    bsp_push_reg(Time,p*SZDBL);
    dest= vecallocd(2*MAXH+p);
    bsp_push_reg(dest,(2*MAXH+p)*SZDBL);
    bsp_sync();

    /**** Determine r ****/
    for (n=1; n <= MAXN; n *= 2){
        /* Initialize scalars and vectors */
        alpha= 1.0/3.0;
        beta= 4.0/9.0;
        for (i=0; i<n; i++){
            z[i]= y[i]= x[i]= (double)i;
        }

        /* Measure time of 2*NITERS DAXPY operations of length n */
        time0=bsp_time();
        for (rep=0; rep<NITERS; rep++){
            for (i=0; i<n; i++){
                y[i] += alpha*x[i];
            }
            for (i=0; i<n; i++){
                z[i] -= beta*x[i];
            }
        }
        time1= bsp_time();
        time= time1-time0;
        bsp_put(0,&time,Time,s*SZDBL,SZDBL);
        bsp_sync();

        /* Processor 0 determines minimum, maximum, average
           computing rate */
        if (s==0){
            mintime= maxtime= Time[0];
            for (proc=1; proc<p; proc++){
                mintime= MIN(mintime,Time[proc]);
                maxtime= MAX(maxtime,Time[proc]);
            }
            if (mintime>0.0){
                /* Compute r = average computing rate in flop/s */
                nflops= 4*NITERS*n;
                r= 0.0;
                for (proc=0; proc<p; proc++){
                    r += nflops/Time[proc];
                }
                r /= p;
                printf("n= %5d min= %7.3lf max= %7.3lf av= %7.3lf Mflop/s ",
                       n, nflops/(maxtime*MEGA), nflops/(mintime*MEGA), r/MEGA);
                fflush(stdout);
                /* Output for fooling benchmark-detecting compilers */
                printf(" fool=%7.1lf\n",y[n-1]+z[n-1]);
            } else {
                printf("minimum time is 0\n");
            }
            /* Flushed in both cases. */
            fflush(stdout);
        }
    }

    /**** Determine g and l ****/
    for (h=0; h<=MAXH; h++){
        /* Initialize communication pattern */
        for (i=0; i<h; i++){
            src[i]= (double)i;
            if (p==1){
                destproc[i]=0;
                destindex[i]=i;
            } else {
                /* destination processor is one of the p-1 others */
                destproc[i]= (s+1 + i%(p-1)) %p;
                /* destination index is in my own part of dest */
                destindex[i]= s + (i/(p-1))*p;
            }
        }

        /* Measure time of NITERS h-relations */
        bsp_sync();
        time0= bsp_time();
        for (rep=0; rep<NITERS; rep++){
            for (i=0; i<h; i++){
                bsp_put(destproc[i],&src[i],dest,destindex[i]*SZDBL,SZDBL);
            }
            bsp_sync();
        }
        time1= bsp_time();
        time= time1-time0;

        /* Compute time of one h-relation */
        if (s==0){
            t[h]= (time*r)/NITERS;
            printf("Time of %5d-relation= %lf sec= %8.0lf flops\n",
                   h, time/NITERS, t[h]);
            fflush(stdout);
        }
    }

    if (s==0){
        printf("size of double = %d bytes\n",(int)SZDBL);
        leastsquares(0,p,t,&g0,&l0);
        printf("Range h=0 to p : g= %.1lf, l= %.1lf\n",g0,l0);
        leastsquares(p,MAXH,t,&g,&l);
        printf("Range h=p to HMAX: g= %.1lf, l= %.1lf\n",g,l);
        printf("The bottom line for this BSP computer is:\n");
        printf("p= %d, r= %.3lf Mflop/s, g= %.1lf, l= %.1lf\n",
               p,r/MEGA,g,l);
        fflush(stdout);
    }

    bsp_pop_reg(dest);
    vecfreed(dest);
    bsp_pop_reg(Time);
    vecfreed(Time);
    bsp_end();
} /* end bspbench */
Beispiel #17
0
/*
 * bspbench: SPMD benchmark of one component described by the global tnode.
 *
 * One BSP process is started per entry of tnode->sons and pinned to that
 * entry's index.  The function then
 *   1. measures the computing rate r (flop/s) with DAXPY loops of growing
 *      length n,
 *   2. times NITERS h-relations for h = 0..MAXH, storing the cost of one
 *      h-relation in flop units in t[h] (processor 0 only),
 *   3. lets processor 0 fit g and l with leastsquares() and print them.
 */
void bspbench(){
    void leastsquares(int h0, int h1, double *t, double *g, double *l);
    int p, s, s1, iter, i, n, h, destproc[MAXH], destindex[MAXH];
    double alpha, beta, x[MAXN], y[MAXN], z[MAXN], src[MAXH], *dest,
           time0, time1, time, *Time, mintime, maxtime,
           nflops, r, g0, l0, g, l, t[MAXH+1];

    size_t pin[100];

    // Pin one process per son of tnode, then start that many processes.
    // NOTE(review): pin has room for 100 entries and tnode->length is not
    // checked against that bound here -- confirm the caller guarantees it.
    for (i=0; i< tnode->length; i++) pin[i] = tnode->sons[i]->index;
    mcbsp_set_pinning( pin, tnode->length );
    bsp_begin(tnode->length);

    p= bsp_nprocs(); // p = number of processors obtained
    s= bsp_pid();    // s = processor number

    Time= vecallocd(p); bsp_push_reg(Time,p*SZDBL);
    dest= vecallocd(2*(MAXH+p)); bsp_push_reg(dest,(2*(MAXH+p))*SZDBL);
    bsp_sync();

    // r is read further down even if no measurement ever had mintime > 0,
    // so give it a defined value up front.
    r= 0.0;

    // Determine r
    for (n=1; n < MAXN; n *= 2){
        // Initialize scalars and vectors
        alpha= 1.0/3.0;
        beta= 4.0/9.0;
        for (i=0; i<n; i++){
            z[i]= y[i]= x[i]= (double)i;
        }
        // Measure time of 2*NITERS DAXPY operations of length n
        time0=bsp_time();
        for (iter=0; iter<NITERS; iter++){
            for (i=0; i<n; i++)
                y[i] += alpha*x[i];
            for (i=0; i<n; i++)
                z[i] -= beta*x[i];
        }
        time1= bsp_time();
        time= time1-time0;
        // Every processor reports its own time to processor 0
        bsp_put(0,&time,Time,s*SZDBL,SZDBL);
        bsp_sync();

        // Processor 0 determines minimum, maximum, average computing rate
        if (s==0){
            mintime= maxtime= Time[0];
            for(s1=1; s1<p; s1++){
                mintime= MIN(mintime,Time[s1]);
                maxtime= MAX(maxtime,Time[s1]);
            }
            if (mintime>0.0){
                // Compute r = average computing rate in flop/s
                nflops= 4*NITERS*n;
                r= 0.0;
                for(s1=0; s1<p; s1++)
                    r += nflops/Time[s1];
                r /= p;

                // The per-n rate report is not printed in this variant.
                // Output for fooling benchmark-detecting compilers
                printf( "", y[n-1]+z[n-1] );
            }
        }
    }

    // Determine g and l
    for (h=0; h<=MAXH; h++){
        // Initialize communication pattern
        for (i=0; i<h; i++){
            src[i]= (double)i;
            if (p==1){
                destproc[i]=0;
                destindex[i]=i;
            } else {
                // destination processor is one of the p-1 others
                destproc[i]= (s+1 + i%(p-1)) %p;
                // destination index is in my own part of dest
                destindex[i]= s + (i/(p-1))*p;
            }
        }

        // Measure time of NITERS h-relations
        bsp_sync();

        time0= bsp_time();
        for (iter=0; iter<NITERS; iter++){
            for (i=0; i<h; i++) {
                bsp_put(destproc[i], &src[i], dest, destindex[i]*SZDBL, SZDBL);
            }
            bsp_sync();
        }

        time1= bsp_time();
        time= time1-time0;

        // Compute time of one h-relation
        if (s==0){
            t[h]= (time*r)/NITERS;
//#define SEHLOC_BENCH_VERBOSE
#ifdef SEHLOC_BENCH_VERBOSE
            // Build " idx idx ..." without passing strnodes to sprintf as
            // both source and destination (overlapping copy is undefined).
            // The string is assembled but not printed by the code below.
            char strnodes[256];
            int off = 0;
            strnodes[0] = '\0';
            for (i=0; i<tnode->length && off < (int)sizeof(strnodes); i++) {
                int w = snprintf(strnodes+off, sizeof(strnodes)-off, " %d", tnode->sons[i]->index);
                if (w < 0) break;
                off += w;
            }
            printf("SEH# Level%d %5d %lf %8.0lf\n", tnode->level, h, time/NITERS, t[h]); fflush(stdout);
#endif
        }
    }

    if (s==0){
        leastsquares(0,p,t,&g0,&l0);
        printf("Range h=0 to p   : g= %.1lf, l= %.1lf\n",g0,l0);
        leastsquares(p,MAXH,t,&g,&l);
        // Fall back to twice the short-range g when the fit is not positive
        g=(g>0)? g: g0*2;
        printf("Range h=p to HMAX: g= %.1lf, l= %.1lf\n",g,l);
        printf("The bottom line for this MultiBSP component is:\n");
        printf("<p= %d, r= %.3lf Mflop/s, g= %.1lf, l= %.1lf>\n",
               p,r/MEGA,g,l);
        fflush(stdout);
    }
    bsp_pop_reg(dest); vecfreed(dest);
    bsp_pop_reg(Time); vecfreed(Time);
    bsp_end();

} /* end bspbench */
Beispiel #18
0
Datei: bsp.c Projekt: jong42/git
/*
 * spmd: SPMD body of a library self-test, run on three processes.
 *
 * Each section exercises one group of primitives (push_reg/pop_reg, put,
 * get, direct_get, send/move, tag handling and, when
 * MCBSP_ALLOW_MULTIPLE_REGS is defined, repeated registrations of the same
 * address).  After most calls the test also inspects the per-thread
 * bookkeeping reached through mcbsp_internal_thread_data; any mismatch is
 * reported on stderr and followed by mcbsp_util_fatal().
 * The checks depend on the exact order of calls and bsp_sync()s below.
 */
void spmd( void ) {
	//parallel over three processes
	bsp_begin( 3 );

	//test bsp_push_reg (results in next superstep)
	size_t localInt;
	bsp_push_reg( &localInt, sizeof( size_t ) );
	//remember the registered address so it can be compared after the sync
	checkLocalIntAddress[ bsp_pid() ] = &localInt;

	//check pid/nprocs, both using primitives as well as manually
	checkPcount[ bsp_pid() ] = (size_t)(bsp_nprocs());
	pthread_mutex_lock( &test_mutex );
	check++;
	checkP[ bsp_pid() ] = true;
	pthread_mutex_unlock( &test_mutex );

	//nobody should have reached superstep 1 yet (it is only set after the first sync)
	if( superstep == 1 )
		superstepOK = false;

	//test barrier synchronisation
	bsp_sync();

	//note someone is at superstep 1
	superstep = 1;

	//check bsp_time
	if( bsp_time() <= 0 )
		bsp_abort( "FAILURE \t bsp_time returned 0 or less!\n" );

	//set up a pop_reg, but should only take effect after the next sync
	//(testing the push_reg after this statement thus provides a free test)
	bsp_pop_reg( &localInt );
	struct mcbsp_thread_data * const data = pthread_getspecific( mcbsp_internal_thread_data );
	//expect exactly one pending removal, stored in a stack of capacity 16
	if( data->localsToRemove.top != 1 || data->localsToRemove.cap != 16 ||
		*((void**)(data->localsToRemove.array)) != (void*)&localInt ) {
		fprintf( stderr, "FAILURE \t bsp_pop_reg did not push entry on the to-remove stack (%p != %p)!\n",
			*((void**)(data->localsToRemove.array)), (void*)&localInt );
		mcbsp_util_fatal();
	}

	//check push_reg
	for( unsigned char i=0; i<3; ++i ) {
		if( checkLocalIntAddress[ i ] != mcbsp_util_address_table_get( &(data->init->global2local), 0, i )->address ) {
			fprintf( stderr, "FAILURE \t bsp_push_reg did not register correct address!\n" );
			mcbsp_util_fatal();
		}
	}

	bsp_sync();

	//check pop_reg
	for( unsigned char i=0; i<3; ++i ) {
		//NOTE(review): if the table entry is NULL but data->localC != 0, the
		//fprintf below dereferences that NULL entry -- confirm and guard.
		if( mcbsp_util_address_table_get( &(data->init->global2local), 0, i ) != NULL ||
			data->localC != 0 ) {
			fprintf( stderr, "FAILURE \t bsp_pop_reg did not de-register correctly (entry=%p)!\n",
				mcbsp_util_address_table_get( &(data->init->global2local), 0, i )->address );
			mcbsp_util_fatal();
		}
		//localInt = *(size_t*)mcbsp_util_stack_pop( &(data->removedGlobals) );
	}

	bsp_sync();

	//going to test communication primitives on the following area
	size_t commTest[ 3 ];
	commTest[ 0 ] = commTest[ 1 ] = ((size_t)bsp_pid());
	commTest[ 2 ] = (size_t)(bsp_nprocs());
	bsp_push_reg( &commTest, 3 * sizeof( size_t ) );

	//make push valid
	bsp_sync();

	//after this put, commTest[ 0 ] should equal bsp_pid, commTest[ 1, 2 ] should equal bsp_pid-1 mod bsp_nprocs
	bsp_put( (bsp_pid() + 1) % bsp_nprocs(), &commTest, &commTest, sizeof( size_t ), 2*sizeof( size_t) );
	commTest[ 2 ] = ULONG_MAX; //this should not influence the result after sync.

	//test behind-the-scenes
	//(the outgoing queue towards the right neighbour should hold the two payload words followed by one message header)
	const struct mcbsp_util_stack queue = data->queues[ (bsp_pid() + 1) % bsp_nprocs() ];
	size_t predicted_cap = predictCap( sizeof( struct mcbsp_message ) + 2 * sizeof( size_t) );
	if( queue.cap != predicted_cap || queue.top != sizeof( struct mcbsp_message ) + 2 * sizeof( size_t) || queue.size != sizeof( struct mcbsp_message ) ) {
		fprintf( stderr, "FAILURE \t bsp_put did not adapt the communication queue as expected!\n(cap = %ld, top = %ld, size = %ld)\n",
			(size_t)queue.cap, (size_t)queue.top, (size_t)queue.size );
		mcbsp_util_fatal();
	}
	//the header is read from the last sizeof( struct mcbsp_message ) bytes below queue.top
	const struct mcbsp_message request = *((struct mcbsp_message*) ((char*)queue.array + queue.top - sizeof( struct mcbsp_message )) );
	if( request.length != 2 * sizeof( size_t) ) {
		fprintf( stderr, "FAILURE \t bsp_put did not push a request of the expected length!\n(length = %ld)\n", (size_t)request.length );
		mcbsp_util_fatal();
	}
	//the buffered payload sits directly in front of that header
	const size_t * const chk_array = (size_t*) ((char*)queue.array + queue.top - sizeof( struct mcbsp_message ) - 2 * sizeof( size_t ));
	if( chk_array[ 0 ] != ((size_t)bsp_pid()) || chk_array[ 1 ] != ((size_t)bsp_pid()) ) {
		fprintf( stderr, "FAILURE \t bsp_put did not push an expected communication request!\n" );
		mcbsp_util_fatal();
	}
	//note there is no easy way to check request.destination; the top-level BSP test will handle that one

	bsp_sync();

	//test for the above expectation after bsp_put, namely
	//commTest[ 0 ] should equal bsp_pid, commTest[ 1, 2 ] should equal bsp_pid-1 mod bsp_nprocs
	if( commTest[ 0 ] != ((size_t)bsp_pid()) || 
		commTest[ 1 ] != (size_t)((bsp_pid()+bsp_nprocs()-1)%bsp_nprocs()) ||
		commTest[ 2 ] != (size_t)((bsp_pid()+bsp_nprocs()-1)%bsp_nprocs())
	) {
		fprintf( stderr, "FAILURE \t array after bsp_put is not as expected! (%d: %ld %ld %ld))\n", bsp_pid(), commTest[ 0 ], commTest[ 1 ], commTest[ 2 ] );
		mcbsp_util_fatal();
	}
	
	//do a get on the next processor on the last element of commTest
	bsp_get( (bsp_pid() + 1) % bsp_nprocs(), &commTest, 2 * sizeof( size_t ), &(commTest[ 2 ]), sizeof( size_t ) );

	//fill the expected value after the get to test non-buffering
	commTest[ 2 ] = ((size_t)bsp_pid());

	//communicate
	bsp_sync();

	//commTest[ 0 ] should equal bsp_pid, commTest[ 1 ] should equal bsp_pid-1, commTest[ 2 ] should be bsp_pid+1
	if( commTest[ 0 ] != ((size_t)bsp_pid()) || 
		commTest[ 1 ] != (size_t)((bsp_pid()+bsp_nprocs() - 1)%bsp_nprocs())
	) {
		fprintf( stderr, "FAILURE \t start of array after bsp_get changed! (%d: %ld %ld %ld))\n", bsp_pid(), commTest[ 0 ], commTest[ 1 ], commTest[ 2 ] );
		mcbsp_util_fatal();
	}
	if( commTest[ 2 ] != (size_t)((bsp_pid()+bsp_nprocs() + 1)%bsp_nprocs()) ) {
		fprintf( stderr, "FAILURE \t last element of array after bsp_get erroneous! (%d: %ld %ld %ld))\n", bsp_pid(), commTest[ 0 ], commTest[ 1 ], commTest[ 2 ] );
		mcbsp_util_fatal();
	}

	bsp_sync();

	//test direct_get functionality
	size_t commTest2[ 3 ];
	commTest2[ 0 ] = commTest[ 0 ];

	//get commTest[1] from right neighbour
	bsp_direct_get( (bsp_pid() + 1) % bsp_nprocs(), &commTest, sizeof( size_t ), &(commTest2[ 1 ]), sizeof( size_t ) );

	//get commTest[2] from left neighbour
	bsp_direct_get( (bsp_pid() + bsp_nprocs() - 1) % bsp_nprocs(), &commTest, 2 * sizeof( size_t ), &(commTest2[ 2 ]), sizeof( size_t ) );

	//now everything should equal bsp_pid
	if( commTest2[ 0 ] != ((size_t)bsp_pid()) || 
		commTest2[ 1 ] != ((size_t)bsp_pid()) || 
		commTest2[ 2 ] != ((size_t)bsp_pid())
	) {
		fprintf( stderr, "FAILURE \t direct_get does not function properly! (%d: [%ld %ld %ld])\n", bsp_pid(), commTest2[ 0 ], commTest2[ 1 ], commTest2[ 2 ] );
		mcbsp_util_fatal();
	}

	//now test single BSMP message
	bsp_send( (bsp_pid() + 1) % bsp_nprocs(), NULL, &commTest, sizeof( size_t ) );
	
	//check messages
	const struct mcbsp_util_stack queue1 = data->queues[ (bsp_pid() + 1) % bsp_nprocs() ];
	const size_t new_predicted_cap = predictCap( sizeof( struct mcbsp_message ) + sizeof( size_t ) );
	//the expected capacity is the larger of the earlier put prediction and this one
	predicted_cap = predicted_cap > new_predicted_cap ? predicted_cap : new_predicted_cap;
	if( queue1.cap != predicted_cap || queue1.size != sizeof( struct mcbsp_message ) || queue1.top != sizeof( struct mcbsp_message ) + sizeof( size_t ) ) {
		fprintf( stderr, "FAILURE \t bsp_send did not adapt the communication queue as expected!\n(cap = %ld, size = %ld, top = %ld; prediction was %ld, %ld, %ld)\n",
			(size_t)queue1.cap, (size_t)queue1.size, (size_t)queue1.top,
			(size_t)predicted_cap, (size_t)(sizeof( struct mcbsp_message )), (size_t)(sizeof( struct mcbsp_message ) + sizeof( size_t )) );
		mcbsp_util_fatal();
	}
	const struct mcbsp_message request2 = *(struct mcbsp_message*) ((char*)queue1.array + queue1.top - sizeof( struct mcbsp_message ));
	if( request2.destination != NULL ||
		request2.length != sizeof( size_t ) || // assumes tagSize = 0
		*(size_t *)queue1.array != ((size_t)bsp_pid()) ) {
		fprintf( stderr, "FAILURE \t bsp_send did not push the expected communication request!\n(top = %ld, destination = %p, length = %ld, payload = %ld\n",
			(size_t)queue1.top, request2.destination, (size_t)request2.length, *(size_t *)queue1.array );
		mcbsp_util_fatal();
	}

	bsp_sync();

	//inspect incoming BSMP queue (assuming tagSize = 0)
	predicted_cap = predictCap( sizeof( size_t ) + sizeof( size_t ) );
	if( data->bsmp.cap != predicted_cap || data->bsmp.top != sizeof( size_t ) + sizeof( size_t ) || data->bsmp.size != sizeof( size_t ) ) {
		fprintf( stderr, "FAILURE \t BSMP queue after superstep with sends is not as expected!\n(cap = %ld, top = %ld, size = %ld; prediction was %ld, %ld, %ld)\n",
			(size_t)data->bsmp.cap, (size_t)data->bsmp.top, (size_t)data->bsmp.size,
			(size_t)predicted_cap, (size_t)(8 + sizeof( size_t )), (size_t)(data->bsmp.size) );
		mcbsp_util_fatal();
	}
	//the payload was sent by the left neighbour, so it should carry that neighbour's pid
	if( *(size_t*)(data->bsmp.array) != (size_t)((bsp_pid() + bsp_nprocs() - 1) % bsp_nprocs()) ) {
		fprintf( stderr, "FAILURE \t Value in BSMP queue is not correct!\n" );
		mcbsp_util_fatal();
	}
	
	//inspect using primitives
	MCBSP_NUMMSG_TYPE   packets;
	MCBSP_BYTESIZE_TYPE packetSize;
	bsp_qsize( &packets, &packetSize );
	if( packets != 1 || packetSize != sizeof( size_t ) ) {
		fprintf( stderr, "FAILURE \t bsp_qsize does not function correctly!\n" );
		mcbsp_util_fatal();
	}
	bsp_move( &commTest, sizeof( size_t ) );
	if( commTest[ 0 ] != (size_t)(( bsp_pid() + bsp_nprocs() - 1 ) % bsp_nprocs()) ) {
		fprintf( stderr, "FAILURE \t bsp_move does not function correctly!\n" );
		mcbsp_util_fatal();
	}
	
	//check set_tagsize
	//(the call overwrites tsz; the test expects to read back 0 here)
	MCBSP_BYTESIZE_TYPE tsz = sizeof( size_t );
	bsp_set_tagsize( &tsz );
	if( tsz != 0 ) {
		fprintf( stderr, "FAILURE \t return value of bsp_set_tagsize is incorrect!\n" );
		mcbsp_util_fatal();
	}

	bsp_sync();

	//check set_tagsize
	if( data->init->tagSize != sizeof( size_t ) ) {
		fprintf( stderr, "FAILURE \t bsp_set_tagsize failed!\n" );
		mcbsp_util_fatal();
	}
	
	//send a tagged message to every process: tag = own pid, payload = { 3, 8 + pid }
	commTest[ 0 ] = ((size_t)bsp_pid());
	commTest[ 1 ] = 3;
	commTest[ 2 ] = 8 + ((size_t)bsp_pid());
	for( unsigned char i = 0; i < bsp_nprocs(); ++i ) {
		bsp_send( i, commTest, &(commTest[1]), 2 * sizeof( size_t ) );
		//the tag is read from the sizeof( size_t ) bytes just in front of the message header
		char * const test = (char*)(data->queues[ (size_t)i ].array) + data->queues[ (size_t)i ].top - sizeof( struct mcbsp_message ) - sizeof( size_t );
		if( *(size_t*)test != *commTest ) {
			fprintf( stderr, "FAILURE \t BSMP tag did not get pushed correctly (reads %ld instead of %ld)!\n", *(size_t*)test, *commTest );
			mcbsp_util_fatal();
		}
	}

	bsp_sync();

	//receive one message per process; the tag selects the commTest slot to fill
	MCBSP_BYTESIZE_TYPE status;
	size_t tag;
	for( unsigned char i = 0; i < bsp_nprocs(); ++i ) {
		bsp_get_tag( &status, &tag );
		if( tag >= ((size_t)bsp_nprocs()) || status != 2 * sizeof( size_t ) ) {
			fprintf( stderr, "FAILURE \t error in BSMP tag handling! (tag=%ld, status=%ld)\n", tag, (size_t)status );
			mcbsp_util_fatal();
		}
		size_t *p_tag, *msg;
		//NOTE(review): unlike the other checks, this failure is reported but
		//not followed by mcbsp_util_fatal() -- confirm whether that is intended.
		if( bsp_hpmove( (void**)&p_tag, (void**)&msg ) != 2 * sizeof( size_t ) ) {
			fprintf( stderr, "FAILURE \t bsp_hpmove does not return correct payload length." );
		}
		if( msg[ 0 ] != 3 || *p_tag != tag ) {
			fprintf( stderr, "FAILURE \t bsp_hpmove does not contain correct message (tag=%ld, payload = %ld) which should be (%ld, 3).\n", *p_tag, msg[ 0 ], tag );
			mcbsp_util_fatal();
		}
		commTest[ tag ] = msg[ 1 ];
	}
	//after all messages are handled, slot i should hold 8 + i
	for( unsigned short int i = 0; i < bsp_nprocs(); ++i ) {
		if( commTest[ i ] != (unsigned int)(8 + i) ) {
			fprintf( stderr, "FAILURE \t error in bsp_tag / bsp_(hp)move combination!\n" );
			mcbsp_util_fatal();
		}
	}

	bsp_sync();

#ifdef MCBSP_ALLOW_MULTIPLE_REGS
	//test multiple regs
	double mreg[17];
	bsp_push_reg( &(mreg[0]), 7*sizeof( double ) );

	bsp_sync();

	//put into element 6, the last one covered by the 7-element registration
	double mregs = 1.3;
	bsp_put( (bsp_pid() + 1) % bsp_nprocs(), &mregs, &mreg, 6 * sizeof( double ), sizeof( double ) );
	bsp_push_reg( &(mreg[0]), 17*sizeof( double ) );

	bsp_sync();

	//put into element 16, which only the 17-element registration covers
	bsp_push_reg( &(mreg[0]), 13*sizeof( double ) );
	bsp_put( (bsp_pid() + 1) % bsp_nprocs(), &mregs, &mreg, 16 * sizeof( double ), sizeof( double ) );

	bsp_sync();

	if( mreg[ 6 ] != mreg[ 16 ] ||  mreg[ 6 ] != mregs ) {
		fprintf( stderr, "FAILURE \t error in bsp_put + multiple bsp_push_reg calls (%f,%f,%f,...,%f,%f)\n", mreg[ 5 ], mreg[ 6 ], mreg[ 7 ], mreg[ 15 ], mreg[ 16 ] );
		mcbsp_util_fatal();
	}
	//two of the three registrations are popped; the put below still targets mreg
	bsp_pop_reg( &(mreg[0]) );
	bsp_pop_reg( &(mreg[0]) );

	bsp_sync();

	bsp_put( (bsp_pid() + 1) % bsp_nprocs(), &mregs, &mreg, 2 * sizeof( double ), sizeof( double ) );

	bsp_sync();

	if( mreg[ 2 ] != mregs ) {
		fprintf( stderr, "FAILURE \t error in bsp_put + multiple bsp_push_reg + multiple bsp_pop_reg calls\n" );
		mcbsp_util_fatal();
	}
#endif

	bsp_end();
}
Beispiel #19
0
/**
 * \brief Main function.
 *
 * Command line, in the order the arguments are read:
 *   argv[1]  number of BSP processes (also passed to bsp_begin),
 *   argv[2]  file holding sequence s (first token: its length),
 *   argv[3]  file holding sequence t (first token: its length),
 *   argv[4]  P, the number of similarity values collected by the last
 *            process.
 *
 * The processes form a pipeline: process 0 reads a block of s and forwards
 * it (together with the border row b) to the next process after calling
 * Similarity(); every process reads its own blocks of t, continuing from
 * the file position handed over by its predecessor.
 *
 * \return 0 on success, 1 when the command line is unusable.
 */
int main(int argc, char **argv)
{
  char            *s,*t;
  int             size,sizes,sizet;
  int             i,j,k,P;
  int             cond;
  int             *simi = NULL,res,Paux;
  int             *a,*b;
  FILE            *f,*f2;
  fpos_t          filepos;
  int             set;
  struct timeval  ini, fi;
  struct timezone tz;

  /* argv[1]..argv[4] are all dereferenced below */
  if (argc < 5)
  {
    fprintf(stderr,"Usage: %s <processes> <file s> <file t> <P>\n",
            argc > 0 ? argv[0] : "align");
    return 1;
  }

  /* size is used as a divisor below, so it has to be positive */
  size = atoi(argv[1]);
  if (size <= 0)
  {
    fprintf(stderr,"Error: the number of processes must be positive\n");
    return 1;
  }

  bsp_begin(size);

  f=fopen(argv[2],"r");
  if (f==NULL) Exit("Error: File %s not found\n",argv[2]);
  if (fscanf(f,"%d",&sizes) != 1)
    Exit("Error: Could not read the sequence size from %s\n",argv[2]);

  if (sizes%size != 0)
    Exit("Error: The sequences have to have multiple of "
         "processes quantity size");

  f2=fopen(argv[3],"r");

  if (f2==NULL) Exit("Error: File %s not found\n",argv[3]);

  if (fscanf(f2,"%d",&sizet) != 1)
    Exit("Error: Could not read the sequence size from %s\n",argv[3]);

  if (bsp_pid() == 0)
    if (sizet%size != 0)
      Exit("Error: The sequences have to have multiple of "
         "processes quantity size");

  P = atoi(argv[4]);

  if (bsp_pid() == 0)
    printf("align %d %s %s %d\n",size,argv[2],argv[3],P);

  /* from here on sizes/sizet are the per-process block lengths */
  sizes /= size;
  sizet /= size;

  s = (char*) malloc (sizes*sizeof(char));
  t = (char*) malloc (sizet*sizeof(char));

  if (s == NULL || t == NULL)
    Exit("No memory\n");

  a = (int*)malloc ((sizet+1)*sizeof(int));
  b = (int*)malloc ((sizes+1)*sizeof(int));

  if (a == NULL || b == NULL)
    Exit("No memory\n");

  /* only the last process stores the results */
  if (bsp_pid() == size-1)
  {
    simi = (int*) malloc(P*sizeof(int));
    if (simi == NULL) Exit("No memory\n");
  }

  Paux = 0;

  bsp_push_reg(s,sizes*sizeof(char));
  bsp_push_reg(b,(sizes+1)*sizeof(int));
  /* filepos is an fpos_t: register and transfer the whole object */
  bsp_push_reg(&filepos,sizeof(filepos));
  bsp_push_reg(&i,sizeof(int));

  bsp_sync();

  gettimeofday(&ini,&tz);

  for (k = 0; k < P*size + size -1; k++)
  {
    /* process pid takes part in steps pid .. P*size+pid-1 */
    if (k >= bsp_pid() && k <= P*size+bsp_pid()-1)
      cond = 1;
    else
      cond = 0;

    set = 0;
    if (cond==1 && (k-bsp_pid())%size == 0)/*start of a reading*/
    {
      /* Fetch the position where the predecessor stopped reading t.
         Process 0 has no predecessor during its first round, so nothing
         is fetched then and the stream position must be left alone. */
      if (bsp_pid() == 0)
      {
        if (k >= size)
        {
          bsp_get(size-1,&filepos,0,&filepos,sizeof(filepos));
          set = 1;
        }
      }
      else
      {
        bsp_get(bsp_pid()-1,&filepos,0,&filepos,sizeof(filepos));
        set = 1;
      }
    }

    bsp_sync();

    if (cond==1 && (k-bsp_pid())%size == 0)/*start of a reading*/
    {
      if (set == 1) fsetpos(f2,&filepos);
      for (i = 0; i < sizet; i++)
      {
        /* fscanf reports end of file through its return value; the
           character read with %c is never EOF */
        if (fscanf(f2,"%c",&t[i]) != 1)
          Exit("Error: End of file reached without "
               "read all sequence in %s\n",argv[3]);
        /* anything that is not a nucleotide is skipped: retry this slot */
        if (!(t[i] == 'A' || t[i] == 'T' || t[i] == 'C' || t[i] == 'G'))
          i--;
      }
      fgetpos(f2,&filepos);
      for (i = 0; i <= sizet; i++)
        a[i] = (i+bsp_pid()*sizet)*gap;
    }

    if (cond==1)
    {
      if (bsp_pid() == 0)
      {
        for (i = 0; i < sizes; i++)
        {
          if (fscanf(f,"%c",&s[i]) != 1)
            Exit("Error: End of file reached without "
                 "read all sequence in %s\n",argv[2]);
          if (!(s[i] == 'A' || s[i] == 'T' || s[i] == 'C' || s[i] == 'G'))
            i--;
        }
        for (j = 0; j <= sizes; j++)
          b[j] = (j + (k%size)*sizes)*gap;
      }

      res = Similarity (s, sizes, t, sizet, a, b);

      if (bsp_pid() == size-1 && (k-bsp_pid()+1)%size == 0)
      {
        simi[Paux++] = res;
      }

      /* hand the block of s and the border row to the next process */
      if (bsp_pid() != size -1)
      {
        bsp_put(bsp_pid()+1,s,s,0,sizes*sizeof(char));
        bsp_put(bsp_pid()+1,b,b,0,(sizes+1)*sizeof(int));
      }
    }
    bsp_sync();
  }

  gettimeofday(&fi,&tz);

  printf("process %d ended\n",bsp_pid());

  fclose(f);
  fclose(f2);

  if (bsp_pid() == size-1)
  {
    printf("Similarities: ");
    for (i = 0; i < P; i++)
      printf("%d ",simi[i]);
    printf("\n");
  }
  if (bsp_pid() == 0)
  {
    /* elapsed wall-clock time, divided by 60 */
    printf("Computation time: %f\n", (fi.tv_sec - ini.tv_sec + (double)(fi.tv_usec -
ini.tv_usec)/1000000)/60);
  }

  /* undo every registration made above, then leave the SPMD part */
  bsp_pop_reg(&i);
  bsp_pop_reg(&filepos);
  bsp_pop_reg(b);
  bsp_pop_reg(s);
  bsp_sync();

  free(simi);
  free(b);
  free(a);
  free(t);
  free(s);

  bsp_end();

  return 0;
}
Beispiel #20
0
/*
 * bspsieve: SPMD part of a parallel prime sieve over the numbers 0..N.
 *
 * Every processor owns a block of candidates.  In each round all
 * processors mark the multiples of the current prime in their block and
 * propose a next prime; processor 0 gathers the proposals, takes the
 * minimum and every processor fetches that value from processor 0.
 * At the end each processor prints how many non-zero entries remain in its
 * block and processor 0 prints the elapsed time.
 */
void bspsieve(){

    double tStart, tStop;
    ulong *cand;      // this processor's block of candidates
    ulong *proposals; // on processor 0 only: one proposed value per processor
    ulong n,          // number of candidates overall (N+1)
          blockLen,   // number of candidates in the local block
          idx;        // loop index
    int   pid,
          nprocs;
    ulong cur;        // the current largest sure-prime
    ulong primeCount;

    // Work on a local copy of the global N, one larger so that the largest
    // global index is N itself.  The value is fetched again from
    // processor 0 below.
    n = N+1;

    bsp_begin(P);
    nprocs= bsp_nprocs(); /* number of processors obtained */
    printf("Now we have %d processors.\n", nprocs);
    pid= bsp_pid();       /* this processor's number */
    if (pid==0){
        // NOTE(review): if ulong is an unsigned type this test can never
        // be true -- confirm against the typedef.
        if(n<0)
            bsp_abort("Error in input: n is negative");
        proposals = vecalloculi(nprocs);
    }

    // Everyone reads n from processor 0.
    bsp_push_reg(&n,SZULL);
    bsp_sync();

    bsp_get(0,&n,0,&n,SZULL);
    bsp_sync();
    bsp_pop_reg(&n);

    blockLen= blockSize(nprocs,pid,n);
    printf("P(%d) tries to alloc vec of %lld ulongs", pid, blockLen);
    printf(", size would be = %lld Mb\n", blockLen*SZULL/1024/1024);
    cand= vecalloculi(blockLen);

    // Start by storing every candidate's global index in its slot; a slot
    // set to 0 counts as "not prime" in the final tally.
    idx= 0;
    while (idx<blockLen){
        cand[idx]= globalIdx(nprocs,pid,n,idx);
        idx++;
    }
    // Processor 0 strikes out the entry at local index 1.
    // NOTE(review): presumably the number 1 -- verify against globalIdx.
    if(pid==0)
        cand[1]=0;
    bsp_sync();
    tStart=bsp_time();

    // begin work
    for (cur = 2; cur*cur <= n; )
    {
        bspmarkmultiples(nprocs,pid,n,cur,cand);
        cur = nextPrime(nprocs,pid,n,cur,cand);

        bsp_push_reg(&cur, SZULL);
        bsp_sync();

        // Processor 0 collects every processor's proposal.
        if(pid==0)
        {
            proposals[0] = cur; // its own proposal
            idx=1;
            while (idx<nprocs)
            {
                bsp_get(idx, &cur, 0, &proposals[idx], SZULL);
                idx++;
            }
        }

        bsp_sync();

        if(pid==0)
        {
            cur = findMinimum(nprocs,proposals);
        }
        bsp_sync();

        // Everyone fetches the agreed value from processor 0.
        bsp_get(0,&cur,0,&cur,SZULL);
        bsp_sync();

        bsp_pop_reg(&cur);
    }

    // end work
    bsp_sync();
    tStop=bsp_time();

    // Count the surviving entries; they are not printed individually.
    primeCount= 0;
    idx = 0;
    while (idx < blockSize(nprocs,pid,n)){
        if (cand[idx] != 0)
            primeCount++;
        idx++;
    }
    printf("proc %d finds %lld primes.\n", pid, primeCount);

    fflush(stdout);
    if (pid==0){
        printf("This took only %.6lf seconds.\n", tStop-tStart);
        fflush(stdout);
        vecfreeuli(proposals);
    }

    vecfreeuli(cand);
    bsp_end();

} /* end bspsieve */
/*
 * Put/get test.  Three variables (a, b and the 16-int array c) are
 * registered, one per superstep.  Every core then writes its own pid into
 * a of core s+1, into b of core s+2 and into slot s of c on every core,
 * prints the received values, reads them back with gets and prints those.
 *
 * NOTE(review): the "// test:" and "// expect_for_pid:" comments look like
 * expectations read by an external test runner -- confirm before editing
 * them.  Several "test:" descriptions are identical across sections even
 * though the printed variable differs.
 */
int main() {
    bsp_begin();
    int s = bsp_pid();
    int p = bsp_nprocs();

    // Each registration is followed by its own sync.
    int a = 0;
    bsp_push_reg(&a, sizeof(int));
    bsp_sync();

    int b = 0;
    bsp_push_reg(&b, sizeof(int));
    bsp_sync();

    int c[16] = {0};
    bsp_push_reg(&c, 16 * sizeof(int));
    bsp_sync();

    // first we test puts
    int data = s;
    bsp_hpput((s + 1) % p, &data, &a, 0, sizeof(int));
    bsp_hpput((s + 2) % p, &data, &b, 0, sizeof(int));
    // slot s of c on every core receives this core's pid
    for (int t = 0; t < p; ++t) {
        bsp_hpput(t, &data, &c, sizeof(int) * s, sizeof(int));
    }
    ebsp_barrier();

    // test: can set and get tagsize from core
    EBSP_MSG_ORDERED("%i", a);
    // expect_for_pid: ((pid - 1) % 16)

    // test: can put register multiple vars, and put multiple times
    EBSP_MSG_ORDERED("%i", b);
    // expect_for_pid: ((pid - 2) % 16)

    // test: support for larger variables
    EBSP_MSG_ORDERED("%i", c[5]);
    // expect_for_pid: ("5")

    // next we test gets

    // a of core s+1 and b of core s+2 were both written with this core's pid above
    int core_num_next = 0;
    int core_num_next_next = 0;
    bsp_hpget((s + 1) % p, &a, 0, &core_num_next, sizeof(int));
    bsp_hpget((s + 2) % p, &b, 0, &core_num_next_next, sizeof(int));
    // slot 4 of c on any core was written by core 4
    bsp_hpget((s + 3) % p, &c, 4 * sizeof(int), &data, sizeof(int));
    ebsp_barrier();

    // test: can set and get tagsize from core
    EBSP_MSG_ORDERED("%i", core_num_next);
    // expect_for_pid: (pid)

    // test: can put register multiple vars, and put multiple times
    EBSP_MSG_ORDERED("%i", core_num_next_next);
    // expect_for_pid: (pid)

    // test: support for larger variables
    EBSP_MSG_ORDERED("%i", data);
    // expect_for_pid: ("4")

    bsp_end();

    return 0;
}
Beispiel #22
0
/* Collapse runs of equal values in the sorted array v[0..n-1] in place and
   return how many distinct values are left at the front. */
static int compact_sorted(int *v, int n){
	int rd, wr;

	if (n <= 0)
		return 0;
	wr = 1;
	for (rd = 1; rd < n; rd++){
		if (v[rd] != v[wr-1])
			v[wr++] = v[rd];
	}
	return wr;
}

/*
 * mainloop: SPMD part of an all-pairs shortest path computation by repeated
 * squaring of an N x N matrix produced by gen_graph().
 *
 * Each processor keeps its share of the matrix in lpart.  In every round it
 * fetches the complete global rows and columns its elements need with
 * bsp_get, replaces each local element by the minimum over k of
 * row[k] + column[k], and doubles j until j >= N-1.  Processor 0 prints the
 * elapsed time.
 */
void mainloop(){

int nlr,nlc,s,t,i,j,k,li,lsize,tsize0,tsize1,tempp,tempoff,rpos,cpos,
*lpart,*linter,*gindx,*lcol,*lrow,*lsrow,*lscol,*ltrow,*ltcol;

int* init = gen_graph(N, 0.05);

bsp_begin(bsp_nprocs());

/**********Initialization SuperStep 0***************/

int nprocs = bsp_nprocs();

/* Compute 2D processor numbering from 1D numbering; when the processors do
   not form a square pm x pm grid, fall back to a 1D cyclic distribution
   of the columns. */
int pm = sqrt(nprocs);
int pn = nprocs/pm;
if ( pn != pm ){
	pn = nprocs;
	pm = 1;
	t = bsp_pid();
	s = 0;
}else{
	s= bsp_pid()%pm;  /* 0 <= s < pm */
	t= bsp_pid()/pn;  /* 0 <= t < pn */
}

nlr=  nloc(pm,s,N); /* number of local rows */
nlc=  nloc(pn,t,N); /* number of local columns */

lsize = nlr*nlc;            //number of local elements
lpart = vecalloci(lsize);   //local part of the matrix
linter = vecalloci(lsize);  //intermediate result of one "multiplication"
gindx = vecalloci(lsize);   //global (flattened) index of every local element
lcol  = vecalloci(lsize);   //global column index of every local element
lrow  = vecalloci(lsize);   //global row index of every local element
bsp_push_reg(lpart,lsize*SZINT);

//Distribute the data.  Rows are filtered by s as well as columns by t, so
//that exactly nlr*nlc elements are stored; without the row filter the
//arrays overflow whenever pm > 1.
//NOTE(review): the bsp_get calls below locate element g on processor
//g % nprocs at offset g / nprocs; confirm that this matches the layout
//built here when the pm x pm grid is in use.
li=0;
for ( i= 0; i < N; i++){
	if ((i % pm) != s)
		continue;
	for ( j= 0; j < N; j++){
		if ((j % pn) == t){
			lpart[li] = init[N*i+j];
			lrow[li] = i;
			lcol[li] = j;
			gindx[li] = N*i+j;
			li++;
		}
	}
}

vecfreei(init);//out of the shared space

//Reduce lrow and lcol to the sorted sets of distinct global rows and
//columns of this processor: sort, then drop repeated neighbours.
qsort (lrow, lsize, sizeof(int), compare_int);
tsize0 = compact_sorted(lrow, lsize);
qsort (lcol, lsize, sizeof(int), compare_int);
tsize1 = compact_sorted(lcol, lsize);

//keep the unique global rows and columns in arrays of their own and
//allocate room for the N elements of each of those rows and columns
lscol  = vecalloci(tsize1);
lsrow  = vecalloci(tsize0);
ltcol  = vecalloci(N*tsize1);
ltrow  = vecalloci(N*tsize0);

for(i=0;i < tsize0;i++){
	lsrow[i] = lrow[i];
}
for(i=0;i < tsize1;i++){
	lscol[i] = lcol[i];
}

vecfreei(lcol);//not needed from this point on
vecfreei(lrow);//we use lscol, lsrow, ltrow, ltcol

bsp_sync();
/**********End Initialization SuperStep 0***************/

double time0= bsp_time();
/*********Repeated Squaring loop start*************/
j=1;
while ((N-1) > j) {

	/*************Comm. SuperStep j0*************/
	//fetch every needed global column ...
	for(i=0;i < tsize1;i++){
		for(k=0; k<N;k++){
			int g = N*k+lscol[i];
			tempp = g % nprocs;
			tempoff = g / nprocs;
			bsp_get(tempp, &lpart[0],tempoff*SZINT, &ltcol[N*i+k],SZINT);
		}
	}

	//... and every needed global row
	for(i=0;i < tsize0;i++){
		for(k=0; k<N;k++){
			int g = N*lsrow[i]+k;
			tempp = g % nprocs;
			tempoff = g / nprocs;
			bsp_get(tempp, &lpart[0],tempoff*SZINT, &ltrow[N*i+k],SZINT);
		}
	}
	bsp_sync();
	/*************End Comm. SuperStep j0*************/

	/*************Comp. SuperStep j1*************/
	for ( i=0; i<lsize; i++) {

		int gcol = gindx[i] % N; //global column index of current element
		int grow = gindx[i]/N;   //global row index of current element
		int best = 1000;         //starting value of the minimum

		//locate the fetched row and column; the element size handed to
		//bsearch is that of one int, not that of the pointer
		int *rp = (int*) bsearch (&grow, lsrow, tsize0, sizeof(int), compare_int);
		int *cp = (int*) bsearch (&gcol, lscol, tsize1, sizeof(int), compare_int);
		if (rp == NULL || cp == NULL)
			bsp_abort("mainloop: global row %d or column %d not found locally\n", grow, gcol);
		rpos = rp - lsrow;
		cpos = cp - lscol;

		//this is where the update is done
		for(k=0;k<N;k++){
			int via = ltrow[N*rpos + k]+ltcol[N*cpos + k];
			if (via < best)
				best = via;
		}
		linter[i] = best;
	}

	memcpy(lpart,linter,lsize*SZINT);
	j = 2*j;
	bsp_sync();
	/*************End Comp. SuperStep j1*************/

}
/*********Repeated Squaring loop end*************/
double time1= bsp_time();
bsp_sync();
/*********display time*********/
if(bsp_pid()==0){
	printf( " \n Block Cyclic Distr  calculation of APSP took: %f seconds \n", time1-time0 ); 
}
printf("\n ");

//clean up
bsp_pop_reg(lpart);
vecfreei(lpart);
vecfreei(linter);
vecfreei(lscol);
vecfreei(lsrow);
vecfreei(ltcol);
vecfreei(ltrow);
vecfreei(gindx);

bsp_end();   
}
Beispiel #23
0
/*
 * SPMD body: all-pairs shortest paths (APSP) on an N x N distance matrix by
 * repeated min-plus squaring, with the matrix distributed over the BSP
 * processors in contiguous blocks of rows.
 *
 * Each processor owns `lsize` consecutive rows (stored row-major in `lrow`).
 * For every global column lj the processors fetch that column from the owners
 * of its rows with bsp_get, then update column lj of their local result block
 * `linter`. The matrix is squared while j < N-1, with j doubling each round.
 *
 * The value 1000 is used as "infinity" (the identity of the min operation),
 * so results are capped at 1000.
 *
 * Processor 0 prints the elapsed time of the squaring loop.
 */
void mainloop(){

/* Example of a small 5x5 input matrix (1000 plays the role of infinity),
 * kept for reference:
 * int init[N*N] = {0,3,8,1000,-4, 1000,0,1000,1,7, 1000,4,0,1000,1000,
 *                  2,1000,-5,0,1000, 1000,1000,1000,6,0};
 */

const int inf = 1000;   /* "no path yet" marker; identity for the min below */
int i,j,v,t,g,p,s,cand;
int lsize,*lsize_m,*lrow,*lcol,*linter,*startrow_m;
int li,lj,lk,startrow;

/* NOTE(review): the graph is generated before bsp_begin. Whether every
 * process ends up with the same graph depends on gen_graph and on the BSP
 * implementation -- confirm. */
int* init = gen_graph(N, 0.05);

bsp_begin(bsp_nprocs());

p = bsp_nprocs();   /* number of processors, fixed for the whole run */
s = bsp_pid();      /* rank of this processor */


/**********Initialization***************/

/*******Comp. Superstep 0******/

lsize = nloc(p,s,N);            /* number of rows owned by this processor */
lrow = vecalloci(lsize*N);      /* local block of rows, row-major */
lcol = vecalloci(N);            /* one full global column, refilled per lj */
startrow_m = vecalloci(p);      /* first global row of every processor */
lsize_m = vecalloci(p);         /* number of rows of every processor */
linter = vecalloci(lsize*N);    /* result block of the current squaring */

bsp_push_reg(startrow_m,p*SZINT);
bsp_push_reg(lsize_m,p*SZINT);
bsp_push_reg(lrow,lsize*N*SZINT);

/* First global row of this processor = total number of rows owned by the
 * lower-ranked processors. The column gather below walks the processors in
 * rank order and takes lsize_m[g] rows from each, so the row blocks must be
 * contiguous in exactly this order. (Multiplying the rank by the *local*
 * lsize is only correct when all lower ranks own the same number of rows.)
 * Assumes the nloc values of all processors add up to N -- the gather into
 * lcol[0..N-1] relies on that as well. */
startrow = 0;
for(g=0; g<s; g++){
	startrow += nloc(p,g,N);
}

/* Copy this processor's block of rows out of the full matrix */
for(li=0; li<lsize; li++){
	for(lj=0; lj<N; lj++){
		lrow[N*li+lj] = init[N*(startrow+li)+lj];
	}
}
vecfreei(init); /* the full matrix is no longer needed */

/* Clear the tables that the other processors will fill in */
for(g=0; g<p; g++){
	startrow_m[g] = 0;
	lsize_m[g] = 0;
}

bsp_sync();
/*******End Comp. Superstep 0******/


/*********Comm. Superstep 1********/
/* Tell every processor this processor's first row and row count */
for(g=0; g<p; g++){
	bsp_put(g,&startrow,startrow_m,s*SZINT,SZINT);
	bsp_put(g,&lsize,lsize_m,s*SZINT,SZINT);
}
/*********End Comm. Superstep 1*****/
bsp_sync();
/**********End Initialization***************/

double time0= bsp_time();
/*********Repeated Squaring loop start*************/
j=1;
while ((N-1) > j) {

	/****Comp. Superstep j0****/
	/* Reset the result block and the column buffer */
	for(i=0; i<N*lsize; i++){
		linter[i] = inf;
	}
	for(i=0; i<N; i++){
		lcol[i] = 0;
	}
	bsp_sync();
	/****End Comp. Superstep j0****/

	for(lj=0; lj<N; lj++){
		/***Comm. SuperStep jlj0*******/
		/* Gather global column lj: element lj of local row v of every
		 * processor g, stored in rank order so that lcol[t] belongs to
		 * global row t. */
		t=0;
		for(g=0; g<p; g++){
			for(v=0; v<lsize_m[g]; v++){
				bsp_get(g,lrow,(lj+v*N)*SZINT,&lcol[t],SZINT);
				t++;
			}
		}
		bsp_sync();
		/***End Comm. SuperStep jlj0***/

		/***Comp. SuperStep jlj1*******/
		/* linter[li][lj] = min(linter[li][lj], min over lk of
		 * lrow[li][lk] + lcol[lk]); plain integer comparison, no detour
		 * through floating point. */
		for(li=0; li<lsize; li++){
			for(lk=0; lk<N; lk++){
				cand = lrow[N*li+lk] + lcol[lk];
				if(cand < linter[N*li+lj]){
					linter[N*li+lj] = cand;
				}
			}
		}
		bsp_sync();
		/***End Comp. SuperStep jlj1***/
	}

	/****Comp. Superstep j1****/
	/* The squared block becomes the input of the next round */
	memcpy(lrow,linter,N*lsize*SZINT);
	j=2*j;
	bsp_sync();
	/****End Comp. Superstep j1****/
}
/*********Repeated Squaring loop end*************/
double time1= bsp_time();
bsp_sync();

/*********display time*********/
if(s==0){
	printf( " \n Block Row Distr (need to know basis) calculation of APSP took: %f seconds \n", time1-time0 ); 
}

/* Debug aid: print the local result block of every processor, one processor
 * per superstep.
 *
 * for(g=0; g<p; g++){
 *     if(s==g){
 *         printf("\n i am proc %d and i have APSP Mat \n",s);
 *         for(li=0; li<lsize; li++){
 *             printf("\n");
 *             for(lj=0; lj<N; lj++){
 *                 printf("\t %d",lrow[N*li+lj]);
 *             }
 *             printf("\n \n ");
 *         }
 *     }
 *     bsp_sync();
 * }
 */


/* Clean up: deregister the shared arrays, then release all local storage */
bsp_pop_reg(startrow_m);
bsp_pop_reg(lsize_m);
bsp_pop_reg(lrow);

vecfreei(lrow);
vecfreei(lcol);
vecfreei(startrow_m);
vecfreei(lsize_m);
vecfreei(linter);

bsp_end();   
}