//#define Debug
/*
 * OpenMP benchmark driver.
 *
 * Usage: test {N} {num_threads}
 *
 * Fills an N x N int matrix with GenMatrix, runs ST_APSP on one copy and
 * OMP_APSP on another, prints the wall-clock time of each call in
 * microseconds and reports whether CmpArray considers both results equal.
 *
 * Returns 0 on completion; exits with -1 on bad arguments or when the
 * matrices cannot be allocated.
 */
int main(int argc, char **argv)
{
	if(argc != 3)
	{
		printf("Usage: test {N} {num_threads}\n");
		exit(-1);
	}

	// Parse into int first: a negative or non-numeric argument would
	// otherwise wrap to a huge size_t and drive the allocations below.
	int n_arg = atoi(argv[1]);
	int npro = atoi(argv[2]);
	if(n_arg <= 0 || npro <= 0)
	{
		printf("N and the thread count must be positive integers\n");
		exit(-1);
	}
	size_t N = (size_t)n_arg;
	size_t bytes = sizeof(int)*N*N;

	omp_set_num_threads(npro);

	// This region runs even when Debug is not defined.
	// NOTE(review): it presumably also starts the thread team before the
	// timed section below -- confirm before removing it.
	int iam = 0, np = 1;
	#pragma omp parallel private(iam, np)
	{
		np = omp_get_num_threads();
		iam = omp_get_thread_num();
#ifdef Debug
		printf("Hello from thread %d out of %d\n", iam, np);
#endif
	}

	struct timeval tv1,tv2;
	printf("Size is %zu,numP is %d\n",N,npro);

	// one input matrix plus one working copy per implementation
	int *mat = (int*)malloc(bytes);
	int *ref = (int*)malloc(bytes);
	int *result = (int*)malloc(bytes);
	if(mat == NULL || ref == NULL || result == NULL)
	{
		printf("Out of memory for %zu x %zu matrices\n", N, N);
		free(mat);
		free(ref);
		free(result);
		exit(-1);
	}

	// generate a random matrix.
	GenMatrix(mat, N);

	// compute the reference result.
	memcpy(ref, mat, bytes);
	gettimeofday(&tv1,NULL);
	ST_APSP(ref, N);
	gettimeofday(&tv2,NULL);
	printf("Sequential time = %ld usecs\n",
	       (long)((tv2.tv_sec-tv1.tv_sec)*1000000+tv2.tv_usec-tv1.tv_usec));

	// compute your results
	memcpy(result, mat, bytes);
	gettimeofday(&tv1,NULL);
	OMP_APSP(result,N);
	gettimeofday(&tv2,NULL);
	printf("OpenMp time = %ld usecs\n",
	       (long)((tv2.tv_sec-tv1.tv_sec)*1000000+tv2.tv_usec-tv1.tv_usec));

	// compare your result with reference result
	if(CmpArray(result, ref, N*N))
		printf("Your result is correct.\n");
	else
		printf("Your result is wrong.\n");

	free(mat);
	free(ref);
	free(result);
	return 0;
}
/*
 * MPI benchmark driver with the parallel relaxation written inline.
 *
 * Usage: test {N}
 *
 * Rank 0 generates an N x N matrix (GenMatrix), times ST_APSP on a copy as
 * the reference, then the matrix is scattered in contiguous blocks of
 * N/npes rows. For every k the rank owning row k broadcasts it and each
 * rank relaxes its own rows against it (Floyd-Warshall-style update; an
 * entry of -1 is treated as "no path"). The blocks are gathered on rank 0,
 * which prints the elapsed time and compares against the reference with
 * CmpArray.
 *
 * N must be a positive multiple of the number of processes: the row owner
 * is computed as k / (N/npes), which is only a valid rank in that case.
 */
int main(int argc, char **argv)
{
	if(argc != 2)
	{
		printf("Usage: test {N}\n");
		exit(-1);
	}

	// Parse into int first so that negative / non-numeric input is rejected
	// instead of wrapping to a huge size_t.
	int n = atoi(argv[1]);
	if(n <= 0)
	{
		printf("N must be a positive integer\n");
		exit(-1);
	}
	size_t N = (size_t)n;

	// matrix related variables; the full matrices only exist on rank 0 and
	// stay NULL elsewhere (the collectives ignore them on non-root ranks)
	int *mat = NULL, *ref = NULL, *result = NULL;
	int *part = NULL, *k_row = NULL;
	int rows, k, root;
	int i, j, vij, vik, vkj;
	int npes, rank;
	struct timeval tv1, tv2;
	MPI_Init(&argc, &argv);
	MPI_Comm_size(MPI_COMM_WORLD, &npes);
	MPI_Comm_rank(MPI_COMM_WORLD, &rank);

	// Without this, rows could be 0 (division by zero below) or k/rows
	// could reach npes, which is not a valid broadcast root.
	if(n % npes != 0){
		if(rank == 0)
			printf("N (%d) must be divisible by the number of processes (%d)\n", n, npes);
		MPI_Finalize();
		return -1;
	}

	if(rank == 0){
		// generate matrix and compute sequential
		mat = (int*)malloc(sizeof(int)*N*N);
		ref = (int*)malloc(sizeof(int)*N*N);
		result = (int*)malloc(sizeof(int)*N*N);
		if(mat == NULL || ref == NULL || result == NULL){
			printf("Rank 0: out of memory for %zu x %zu matrices\n", N, N);
			MPI_Abort(MPI_COMM_WORLD, 1);
		}
		GenMatrix(mat, N);
		memcpy(ref, mat, sizeof(int)*N*N);
		gettimeofday(&tv1, NULL);
		ST_APSP(ref, N);
		gettimeofday(&tv2, NULL);
		printf("Sequential: %ld usecs\n",
		       (long)((tv2.tv_sec-tv1.tv_sec)*1000000+tv2.tv_usec-tv1.tv_usec));
		// start of the parallel timing window (ends after the gather)
		gettimeofday(&tv1, NULL);
	}

	// scatter the matrix: each rank receives `rows` consecutive rows
	rows = n/npes;
	part = (int*)malloc(sizeof(int)*N*rows);
	k_row = (int*)malloc(sizeof(int)*N);
	if(part == NULL || k_row == NULL){
		printf("Rank %d: out of memory for local rows\n", rank);
		MPI_Abort(MPI_COMM_WORLD, 1);
	}
	MPI_Scatter(mat, n*rows, MPI_INT, part, n*rows, MPI_INT, 0, MPI_COMM_WORLD);

	// parallel computing
	for(k = 0; k < n; k++){
		// the owner of global row k publishes it to every rank
		root = k/rows;
		if(rank == root){
			memcpy(k_row, part + N*(size_t)(k - rows*root), sizeof(int)*N);
		}
		MPI_Bcast(k_row, n, MPI_INT, root, MPI_COMM_WORLD);

		for(i = 0; i < rows; i++){
			int *row = part + N*(size_t)i;
			for(j = 0; j < n; j++){
				vij = row[j];
				// re-read on every j: row[k] itself may be rewritten when j == k
				vik = row[k];
				vkj = k_row[j];
				if(vik != -1 && vkj != -1){
					if(vij == -1 || vij > vik+vkj){
						row[j] = vik + vkj;
					}
				}
			}
		}
	}

	// gather the matrix
	MPI_Gather(part, n*rows, MPI_INT, result, n*rows, MPI_INT, 0, MPI_COMM_WORLD);

	//compare your result with reference result
	if(rank == 0){
		gettimeofday(&tv2, NULL);
		printf("Parallel: %ld usecs\n",
		       (long)((tv2.tv_sec-tv1.tv_sec)*1000000+tv2.tv_usec-tv1.tv_usec));
		if(CmpArray(result, ref, N*N))
			printf("Your result is correct.\n");
		else
			printf("Your result is wrong.\n");
	}

	// free memory (free(NULL) is a no-op on the non-root ranks)
	free(mat);
	free(ref);
	free(result);
	free(part);
	free(k_row);
	MPI_Finalize();
	return 0;
}
/* Prints the 20-character '~' rule that frames this program's output. */
static void print_tilde_rule(void)
{
	for (int i = 0; i < 20; i++)
		printf("~");
	printf("\n");
}

/*
 * MPI benchmark driver that also accounts for communication time.
 *
 * Usage: test {mat_size}
 *
 * Rank 0 calls prepare_data and starts the parallel timer. The `result`
 * matrix is scattered in blocks of mat_size/total_rank rows; for every k the
 * owner of row k broadcasts it and each rank relaxes its rows against it
 * (Floyd-Warshall-style update; -1 is treated as "no path"). The blocks are
 * gathered back into `result` on rank 0, which prints the total time, the
 * time rank 0 spent in the scatter/broadcast/gather calls, the speed-up and
 * the CmpArray verdict.
 *
 * my_rank, total_rank, the timers and the time counters are file-scope
 * objects defined outside this block.
 *
 * Returns 0 on success, -1 (after MPI_Finalize) on bad arguments.
 */
int main(int argc, char **argv) {
	MPI_Init(NULL, NULL);

	MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
	MPI_Comm_size(MPI_COMM_WORLD, &total_rank);

	// MPI is already initialised here, so every early exit must finalize;
	// only rank 0 reports to avoid one copy of the message per process.
	if(argc != 2) {
		if (my_rank == 0)
			printf("Usage: test {mat_size}\n");
		MPI_Finalize();
		return -1;
	}

	// Parse into int first so negative input cannot wrap to a huge size_t.
	int size_arg = atoi(argv[1]);
	if (size_arg <= 0) {
		if (my_rank == 0)
			printf("mat_size must be a positive integer\n");
		MPI_Finalize();
		return -1;
	}
	size_t mat_size = (size_t)size_arg;

	if (mat_size % total_rank) {
		if (my_rank == 0) {
			print_tilde_rule();
			printf("Using MPT_*, only support divisible number of vertex...\n");
			print_tilde_rule();
		}
		MPI_Finalize();
		return -1;
	}

	// Only rank 0 fills these in; they stay NULL on the other ranks, where
	// the collectives below ignore the root-side buffer.
	int* mat = NULL;
	int* ref = NULL;
	int* result = NULL;
	if (my_rank == 0) {
		print_tilde_rule();
		printf("Using MPT_*, only support divisible number of vertex...\n");
		printf("Input size: %zu\n", mat_size);
		printf("Total process: %d\n", total_rank);

		// NOTE(review): prepare_data presumably allocates the three matrices
		// and records time_used_sequential -- confirm against its definition.
		prepare_data(&mat, &ref, &result, mat_size);

		// start the timer
		gettimeofday(&timer_parallel, NULL);
	}

	MPI_Barrier(MPI_COMM_WORLD);

	// know the set of rows I am working on according to my_rank
	int rows_in_charge = size_arg / total_rank;

	// should this be included in the timer?
	int* my_rows = (int*)malloc(sizeof(int) * mat_size * rows_in_charge); //rows the current process have
	int* k_to_j = (int*)malloc(sizeof(int) * mat_size); // copy of global row k
	if (my_rows == NULL || k_to_j == NULL) {
		printf("Rank %d: out of memory for local rows\n", my_rank);
		MPI_Abort(MPI_COMM_WORLD, 1);
	}

	if (my_rank == 0)
		gettimeofday(&timer_comm, NULL);
	// divide the matrix for each process
	// send rows to each process using scatter, sendbuf:*result, recvbuf:*my_rows
	int sendrecvcount = size_arg * rows_in_charge;
	MPI_Scatter(
		result,
		sendrecvcount,
		MPI_INT,
		my_rows,
		sendrecvcount,
		MPI_INT,
		0,
		MPI_COMM_WORLD);
	if (my_rank == 0)
		time_comm += get_time_and_replace(&timer_comm);

	for (int k = 0; k < size_arg; k++) {
		if (my_rank == 0)
			gettimeofday(&timer_comm, NULL);
		// broadcast k-th row to other process if I am the owner
		int owner_of_k_row = k / rows_in_charge;
		if (my_rank == owner_of_k_row)
			memcpy(k_to_j, my_rows + mat_size * (k % rows_in_charge), sizeof(int) * mat_size);
		MPI_Bcast(k_to_j, size_arg, MPI_INT, owner_of_k_row, MPI_COMM_WORLD);
		if (my_rank == 0)
			time_comm += get_time_and_replace(&timer_comm);

		for (int i = 0; i < rows_in_charge; i++) {
			int* row = my_rows + mat_size * (size_t)i;
			for (int j = 0; j < size_arg; j++) {
				// row[k] is re-read on every j because it may itself be
				// rewritten when j == k
				if (row[k] != -1 && k_to_j[j] != -1) {
					int ikj = row[k] + k_to_j[j];
					if (row[j] == -1 || row[j] > ikj)
						row[j] = ikj;
				}
			}
		}
	}

	if (my_rank == 0)
		gettimeofday(&timer_comm, NULL);
	// collect result to process 0
	MPI_Gather(
		my_rows,
		sendrecvcount,
		MPI_INT,
		result,
		sendrecvcount,
		MPI_INT,
		0,
		MPI_COMM_WORLD);
	if (my_rank == 0)
		time_comm += get_time_and_replace(&timer_comm);

	if (my_rank == 0) {
		//stop the timer
		time_used_parallel = get_time_and_replace(&timer_parallel);
		printf("Time used (parallel  ): %8ld usecs\n", time_used_parallel);
		printf("Time used (parallel  ) comm : %6ld usecs (%2.3lf%%) \n", time_comm, time_comm / (double)time_used_parallel * 100);
		printf("Speed up (sequential / parallel): %.3lf\n", time_used_sequential / (double)time_used_parallel);

		//compare your result with reference result
		if(CmpArray(result, ref, mat_size * mat_size))
			printf("Your result is correct.\n");
		else
			printf("Your result is wrong.\n");
		print_tilde_rule();
	}

	// NOTE(review): mat/ref/result are not released here because their
	// ownership is decided by prepare_data, which is not visible in this block.
	free(my_rows);
	free(k_to_j);

	// Finalize the MPI environment.
	MPI_Finalize();
	return 0;
}
/*
 * MPI benchmark driver around MT_APSP.
 *
 * Usage: <program> {N}
 *
 * Rank 0 generates an N x N matrix (GenMatrix) and times ST_APSP on a copy.
 * The matrix is then scattered in blocks of N/P rows, every rank calls
 * MT_APSP on its block, and the blocks are gathered back into `mat` on
 * rank 0, which prints the elapsed time between the two barriers. When the
 * `test` macro is defined the gathered matrix is also compared against the
 * reference with CmpArray.
 *
 * N must be a positive multiple of the number of processes; otherwise the
 * trailing N % P rows would never be scattered or gathered.
 */
int main(int argc, char *argv[]) {
	if(argc != 2)
	{
		printf("Missing Argument\n");
		exit(-1);
	}

	// Parse into int first so negative input cannot wrap to a huge size_t.
	int n_arg = atoi(argv[1]);
	if(n_arg <= 0)
	{
		printf("N must be a positive integer\n");
		exit(-1);
	}
	size_t N = (size_t)n_arg; //matrix size

	// The full matrices exist on rank 0 only and stay NULL elsewhere, where
	// the collectives ignore the root-side buffer.
	int *mat = NULL, *ref = NULL;
	int P, myrank;
	struct timeval tv1,tv2;
	MPI_Init(&argc, &argv);
	MPI_Comm_size(MPI_COMM_WORLD, &P);
	MPI_Comm_rank(MPI_COMM_WORLD, &myrank);

	if(n_arg % P != 0) {
		if(myrank == 0)
			printf("N (%d) must be divisible by the number of processes (%d)\n", n_arg, P);
		MPI_Finalize();
		return -1;
	}

	if(myrank == 0) {
		// generate a random matrix.
		printf("input Size is %zu; number of process is %d \n",N,P);
		mat = (int*)malloc(sizeof(int)*N*N);
		ref = (int*)malloc(sizeof(int)*N*N);
		if(mat == NULL || ref == NULL) {
			printf("Rank 0: out of memory for %zu x %zu matrices\n", N, N);
			MPI_Abort(MPI_COMM_WORLD, 1);
		}
		GenMatrix(mat, N);

		// compute the reference result.
		memcpy(ref, mat, sizeof(int)*N*N);

		gettimeofday(&tv1, NULL);
		ST_APSP(ref, N);
		gettimeofday(&tv2, NULL);
		printf("Sequential time = %ld usecs\n",
		       (long)((tv2.tv_sec-tv1.tv_sec)*1000000+tv2.tv_usec-tv1.tv_usec));
	}

	int strip = n_arg/P;
	int *part = (int*)malloc(sizeof(int)*N*strip); /*array holding the part for this processor*/
	if(part == NULL) {
		printf("Rank %d: out of memory for local rows\n", myrank);
		MPI_Abort(MPI_COMM_WORLD, 1);
	}

	// Scatter data to all processors; the barrier lines the ranks up so the
	// timer on rank 0 starts once everyone is ready.
	MPI_Barrier(MPI_COMM_WORLD);
	if(myrank == 0) {
		gettimeofday(&tv1,NULL);
	}
	MPI_Scatter(mat, n_arg*strip, MPI_INT, part, n_arg*strip, MPI_INT, 0, MPI_COMM_WORLD);

	// Compute matrix in parallel
	MT_APSP (part, MPI_COMM_WORLD, myrank, N, P);

	//Gather the results (written back over the input matrix on rank 0)
	MPI_Gather(part, n_arg*strip, MPI_INT, mat, n_arg*strip, MPI_INT, 0, MPI_COMM_WORLD);
	MPI_Barrier(MPI_COMM_WORLD);
	if(myrank == 0) {
		gettimeofday(&tv2, NULL);
		printf("Parallel time = %ld usecs\n\n",
		       (long)((tv2.tv_sec-tv1.tv_sec)*1000000+tv2.tv_usec-tv1.tv_usec));
#ifdef test
		// compare your result with reference result
		if(CmpArray(mat, ref, N*N))
			printf("Your result is correct.\n");
		else
			printf("Your result is wrong.\n");
#endif
	}

	// free(NULL) is a no-op on the non-root ranks
	free(mat);
	free(ref);
	free(part);
	MPI_Finalize();
	return 0;
}