/*
 * Smoke test for gauss_jordan_solver.
 *
 * The coefficient matrix is the 3x3 anti-diagonal permutation matrix, so the
 * solution of A * X = B is B with its entries in reverse order. The solver is
 * called as (A, X, B) and X is then compared against that expected vector.
 */
int main(void)
{
  value coefficient_entries[] = { 0, 0, 1,
                                  0, 1, 0,
                                  1, 0, 0 };
  matrix *coefficients = create_matrix(3, 3);
  insert_array(coefficient_entries, coefficients);

  value rhs_entries[] = { 0, 4, 3 };
  matrix *rhs = create_matrix(3, 1);
  insert_array(rhs_entries, rhs);

  /* Receives the solution computed by the solver. */
  matrix *unknowns = create_matrix(3, 1);
  gauss_jordan_solver(coefficients, unknowns, rhs);

  /* The values the solver is expected to produce. */
  value expected_entries[] = { 3, 4, 0 };
  matrix *expected = create_matrix(3, 1);
  insert_array(expected_entries, expected);

  assert(compare_matrices(unknowns, expected));

  free_matrix(coefficients);
  free_matrix(rhs);
  free_matrix(unknowns);
  free_matrix(expected);

  return 0;
}
Esempio n. 2
0
/*
 * Checks compare_matrices on two 2x2 matrices: they must compare equal right
 * after creation and unequal once different elements have been modified.
 *
 * Returns 0 on success. On failure the mu_assert macro is expected to return
 * its message string (minunit convention), which is why both matrices are
 * destroyed BEFORE the assertions run: an early return can then no longer
 * leak them.
 */
static char *test_compare_matrices() 
{
	// Newly created matrices are expected to have all elements equal to 0,
	// so a and b should compare equal at this point.
	Matrix *a = create_matrix(2, 2);
	Matrix *b = create_matrix(2, 2);

	int equal_when_fresh = (compare_matrices(a, b) == 1);

	// Change a different element in each matrix so they no longer match.
	a->value[1][1] = 2;
	b->value[0][0] = 1;

	int different_when_modified = (compare_matrices(a, b) == 0);

	// Release everything first; only plain ints are needed from here on.
	destroy_matrix(a);
	destroy_matrix(b);

	mu_assert("a != b", equal_when_fresh);
	mu_assert("a == b, should be different", different_when_modified);

	return 0;
}
Esempio n. 3
0
/*
 * Timed check of conjugate_gradient on a small 4x4 system.
 *
 * The dense matrix is converted to its sparse form, the solver is run with a
 * zero-initialised result vector, and the result is asserted to match the
 * expected vector (1, 5, 2, 10). The elapsed CPU time, measured with clock(),
 * is printed at the end.
 */
int main(){
  clock_t started = clock();

  /* Dense form of the system matrix; 8 of its 16 entries are non-zero. */
  value system_entries[] = {1, 0, 1, 0,
                            0, 2, 0, 1,
                            1, 0, 2, 0,
                            0, 1, 0, 1};
  matrix* dense_system = create_matrix(4, 4);
  insert_array(system_entries, dense_system);

  /* NOTE(review): the 8 is presumably the number of non-zero entries --
     confirm against create_sparse_matrix. */
  sparse_matrix* sparse_system = create_sparse_matrix(dense_system, 8);

  /* Right-hand side of the system. */
  value rhs_entries[] = {3, 20, 5, 15};
  matrix* rhs = create_matrix(4, 1);
  insert_array(rhs_entries, rhs);

  /* The result the solver is expected to produce. */
  value expected_entries[] = {1, 5, 2, 10};
  matrix* expected = create_matrix(4, 1);
  insert_array(expected_entries, expected);

  /* Result vector handed to the solver, created as a zero matrix. */
  matrix* estimate = create_zero_matrix(4, 1);
  conjugate_gradient(sparse_system, estimate, rhs);

  assert(compare_matrices(estimate, expected));

  free_matrix(dense_system);
  free_matrix(rhs);
  free_matrix(estimate);
  free_matrix(expected);
  free_sparse_matrix(sparse_system);

  /* The timed region covers setup, the solve and the clean-up above. */
  double elapsed_seconds = (double)(clock() - started) / CLOCKS_PER_SEC;
  printf("time taken was: %f \n", elapsed_seconds);

  return 0;
}
Esempio n. 4
0
/*
 * Checks add_matrices on two freshly created 2x2 matrices: the sum of two
 * unmodified matrices is expected to compare equal to either operand.
 *
 * Returns 0 on success. On failure the mu_assert macro is expected to return
 * its message string (minunit convention), so the matrices are destroyed
 * BEFORE the assertion runs to avoid leaking them on the failing path.
 */
static char *test_add_matrices()
{
	// Two freshly created (and never modified) operands.
	Matrix *a = create_matrix(2, 2);
	Matrix *b = create_matrix(2, 2);
	// The result of the addition is stored in a third matrix c. It is passed
	// by address; whether add_matrices replaces *c is not visible from here.
	Matrix *c = create_matrix(2, 2);

	add_matrices(a, b, &c);

	// a and b are identical, so comparing c against either one is enough.
	int sum_matches_operand = (compare_matrices(a, c) != 0);

	// Release everything first; only a plain int is needed from here on.
	destroy_matrix(a);
	destroy_matrix(b);
	destroy_matrix(c);

	mu_assert("c != a or b", sum_matches_operand);

	return 0;
}
Esempio n. 5
0
/*
 * Checks compute_inverse on the 3x3 matrix
 *
 *     1 2 3
 *     4 4 6
 *     7 8 9
 *
 * whose determinant is 12 and whose inverse is
 *
 *     -1    1/2   0
 *     1/2   -1    1/2
 *     1/3   1/2   -1/3
 *
 * Returns 0 on success. On failure the mu_assert macro is expected to return
 * its message string (minunit convention), so both matrices are destroyed
 * BEFORE the assertion runs to avoid leaking them on the failing path.
 */
static char *test_compute_inverse()
{
	Matrix *a = create_matrix(3, 3);
	// b holds the expected inverse of a.
	Matrix *b = create_matrix(3, 3);
	
	b->value[0][0] = -1;
	b->value[0][1] = 0.5;
	b->value[0][2] = 0;
	b->value[1][0] = 0.5;
	b->value[1][1] = -1;
	b->value[1][2] = 0.5;
	// The thirds are written as exact quotients rather than truncated
	// decimals, so the expectation is as close to the true value as the
	// element type allows.
	b->value[2][0] = 1.0 / 3.0;
	b->value[2][1] = 0.5;
	b->value[2][2] = -1.0 / 3.0;

	// Fill a with 1..9 row by row, except that the centre element is 4
	// instead of 5: the plain 1..9 matrix has determinant 0 and therefore
	// no inverse.
	for(int i = 0, k = 1; i < 3; i++) {
		for(int j = 0; j < 3; j++, k++) {
			a->value[i][j] = (k != 5) ? k : 4;
		}
	}
	// The determinant is stored on the matrix before compute_inverse is
	// called (presumably compute_inverse reads it -- confirm).
	a->determinant = get_determinant(a);
	compute_inverse(a);

	int inverse_matches = (compare_matrices(a->inverse, b) != 0);
		
	// Release everything first; only a plain int is needed from here on.
	destroy_matrix(a);
	destroy_matrix(b);

	mu_assert("The invers is not equal to the matrix b", inverse_matches);
	
	return 0;
}
Esempio n. 6
0
/*
 * Timed check of quadopt_solver on a two-variable problem.
 *
 * A problem is assembled from a 2x2 matrix Q, a 2x1 vector q, a 5x2 matrix F
 * with its 5x1 vector g, and a starting point z; the solver is run and its
 * solution is asserted to match (1.4, 1.7). The elapsed CPU time, measured
 * with clock(), is printed at the end.
 */
int main(){
  clock_t started = clock();

  /* 2x2 matrix passed as the first argument of create_problem. */
  value quad_entries[] = {2, 0,
                          0, 2};
  matrix* Q = create_matrix(2,2);
  insert_array(quad_entries, Q);

  /* 2x1 vector passed as the second argument of create_problem. */
  value lin_entries[] = {-2, -5};
  matrix* q = create_matrix(2, 1);
  insert_array(lin_entries, q);

  /* Five rows over the two variables, paired with g below. */
  value F_entries[] = { 1, -2,
                       -1, -2,
                       -1,  2,
                        1,  0,
                        0,  1};
  matrix* F = create_matrix(5, 2);
  insert_array(F_entries, F);

  value g_entries[] = {-2, -6, -2, 0, 0};
  matrix* g = create_matrix(5, 1);
  insert_array(g_entries, g);

  /* starting point */
  value start_entries[] = {1, 1};
  matrix* z = create_matrix(2,1);
  insert_array(start_entries, z);

  /* NOTE(review): the two NULLs presumably stand for an absent second
     constraint pair, and the meaning of the trailing zeros is not visible
     here -- confirm against create_problem. */
  problem* qp = create_problem(Q, q, NULL, NULL, F, g, z, 0, 0);

  quadopt_solver(qp);

  value expected_entries[] = {1.4, 1.7};
  matrix* expected = create_matrix(2, 1);
  insert_array(expected_entries, expected);

  assert(compare_matrices(qp->solution, expected));

  /* Q, q, F, g and z are not freed individually; presumably free_problem
     releases them -- confirm. */
  free_matrix(expected);
  free_problem(qp);

  double elapsed_seconds = (double)(clock() - started) / CLOCKS_PER_SEC;
  printf("time taken was: %f \n", elapsed_seconds);

  return 0;
}
Esempio n. 7
0
/*
 * Tiled matrix multiplication C = A * B annotated with `#pragma hicuda`
 * directives, followed by a comparison against a reference result.
 *
 * A, B, C, reference and the size constants HA, WA, WB and TILE_SZ are
 * defined outside this function. argc and argv are not used.
 *
 * Always returns 0; whatever compare_matrices reports is not reflected in
 * the exit status.
 */
int main(int argc, char **argv)
{
    int i, j, k, kk;

    // Randomly init A and B.
    // The seed is a fixed constant (presumably so that runs are repeatable,
    // assuming randomInitArr draws from rand() -- confirm).
    srand(2008);
    randomInitArr((float*)A, HA*WA);
    randomInitArr((float*)B, WA*WB);

    // The directives below are order-sensitive; do not move code across them.
#pragma hicuda global alloc A[*][*] copyin
#pragma hicuda global alloc B[*][*] copyin
#pragma hicuda global alloc C[*][*]

    // Record the start time.
    // The timed region ends after the copyout/free directives further down,
    // so it covers more than the multiplication loops alone.
    struct timeval start_time;
    gettimeofday(&start_time, NULL);

    // C = A * B
#pragma hicuda kernel matrixMul tblock(64,64) thread(16,16)

#pragma hicuda loop_partition over_tblock over_thread
    for (i = 0; i < HA; ++i)
    {
#pragma hicuda loop_partition over_tblock over_thread
        for (j = 0; j < WB; ++j)
        {
            // Dot product of row i of A and column j of B.
            float sum = 0;

            // Walk the shared dimension in steps of TILE_SZ. The inner loop
            // always runs TILE_SZ iterations, so kk+k stays below WA only if
            // WA is a multiple of TILE_SZ.
            for (kk = 0; kk < WA; kk += TILE_SZ) {
                // NOTE(review): the literal 15 is presumably TILE_SZ - 1,
                // i.e. these ranges assume TILE_SZ == 16 -- confirm.
#pragma hicuda shared alloc A[i][kk:kk+15] copyin
#pragma hicuda shared alloc B[kk:kk+15][j] copyin
#pragma hicuda barrier
                for (k = 0; k < TILE_SZ; ++k) {
                    sum += A[i][kk+k] * B[kk+k][j];
                }
#pragma hicuda barrier
#pragma hicuda shared remove A B
            }
            C[i][j] = sum;
        }
    }

#pragma hicuda kernel_end

#pragma hicuda global copyout C[*][*]

#pragma hicuda global free A B C

    // Record the end time.
    struct timeval end_time;
    gettimeofday(&end_time, NULL);

    // The format string labels the value as milliseconds; get_time_diff is
    // defined elsewhere.
    printf("Time elapsed: %6f ms\n", get_time_diff(&start_time, &end_time));

    // Compute reference solution.
    computeGold((float*)reference, (float*)A, (float*)B, HA, WA, WB);

    // Check result.
    // C and reference are compared as flat arrays of HA*WB floats; the
    // outcome is not inspected here.
    compare_matrices((float*)C, (float*)reference, HA*WB);

    // printMatrix((float*)C, HA, WB);

    return 0;
}
Esempio n. 8
0
/*
 * Self-checking test program for the basic matrix routines: bounds checks,
 * element access, add/subtract/multiply, transpose, scalar multiply, row
 * vectors and sub-matrices. The elapsed CPU time is printed at the end.
 *
 * The assertions encode 1-based row/column indices: for the 4x4 matrices used
 * here, indices 1..4 are accepted and 0, 5 and negative values are rejected.
 *
 * NOTE: most library calls are made inside assert(). Building with NDEBUG
 * defined would remove those calls entirely and leave the later checks
 * without their set-up. The sections also rely on the state left behind by
 * earlier sections, so their order matters.
 */
int main(void) {
  clock_t begin, end;
  double time_spent;
  begin = clock();

  // Reference data; temp_a is reloaded into a several times below.
  matrix* a = create_matrix(4, 4);
  value temp_a[16] = { 18, 60, 57, 96,
		       41, 24, 99, 58,
		       14, 30, 97, 66,
		       51, 13, 19, 85 };
  insert_array(temp_a, a);

  // b starts out with the same contents as a.
  matrix* b = create_matrix(4, 4);
  assert(insert_array(temp_a, b));


  //tests check_boundaries
  assert(check_boundaries(1,1,a));
  assert(check_boundaries(4,4,a));
  assert(!check_boundaries(4,5,a));
  assert(!check_boundaries(5,4,a));
  assert(!check_boundaries(0,1,a));
  assert(!check_boundaries(1,0,a));
  assert(!check_boundaries(-1,1,a));
  assert(!check_boundaries(1,-1,a));


  //tests compare_matrices,insert_value and get_value
  //insert_value takes (value, row, column, matrix); element (1,1) of b is
  //changed and then restored to 18.
  assert(compare_matrices(a,b));
  assert(insert_value(10,1,1,b));
  assert(!compare_matrices(a,b));
  assert(get_value(1,1,b)==10);
  assert(insert_value(18,1,1,b));
  assert(compare_matrices(a,b));


  //tests is_matrix
  //c is an alias of a (the same object, not a copy), while b only has equal
  //contents; is_matrix is expected to tell the two cases apart.
  matrix* c=a;
  assert(compare_matrices(a,c));
  assert(!is_matrix(a,b));
  assert(is_matrix(a,c));


  //tests insert_value by trying to go outside the matrix
  //Writes through c also change a, because c aliases a.
  assert(insert_value(1,1,1,c));
  assert(insert_value(2,2,2,c));
  assert(insert_value(3,3,3,c));
  assert(insert_value(4,4,4,c));
  assert(!insert_value(5,5,5,c));
  assert(!insert_value(-1,-1,-1,c));
  assert(!insert_value(-1,-1,1,c));
  assert(!insert_value(-1,1,-1,c));

  //test get_value
  //Out-of-range reads are expected to yield 0.
  assert(get_value(1,1,c)==1);
  assert(get_value(2,2,c)==2);
  assert(get_value(3,3,c)==3);
  assert(get_value(4,4,c)==4);
  assert(get_value(0,0,c)==0);
  assert(get_value(1,-1,c)==0);
  assert(get_value(-1,1,c)==0);
  assert(get_value(5,5,c)==0);

  //tests insert and get without boundary checks
  insert_value_without_check(4,1,1,c);
  insert_value_without_check(3,2,2,c);
  insert_value_without_check(2,3,3,c);
  insert_value_without_check(1,4,4,c);
  assert(get_value_without_check(1,1,c)==4);
  assert(get_value_without_check(2,2,c)==3);
  assert(get_value_without_check(3,3,c)==2);
  assert(get_value_without_check(4,4,c)==1);

  //tests add_matrices
  //temp_b is temp_a doubled; it overwrites a (and with it the diagonal
  //edits made through c), and b + b is expected to match it.
  value temp_b[16]={
    36,120,114,192,
    82,48,198,116,
    28, 60, 194,132,
    102,26,38,170};
  assert(insert_array(temp_b,a));
  matrix* d = create_matrix(4, 4);
  assert(add_matrices(b,b,d));
  assert(compare_matrices(d,a));

  //tests subtract_matrices
  //b - b is expected to be the all-zero matrix.
  value temp_c[16]={
    0,0,0,0,
    0,0,0,0,
    0, 0, 0,0,
    0,0,0,0};
  assert(insert_array(temp_c,a));
  assert(subtract_matrices(b,b,d));
  assert(compare_matrices(d,a));

  //tests sum_of_row
  //a is reset to temp_a first; out-of-range rows are expected to sum to 0.
  assert(insert_array(temp_a,a));
  assert(sum_of_row(1,a)==231);
  assert(sum_of_row(4,a)==168);
  assert(sum_of_row(0,a)==0);
  assert(sum_of_row(5,a)==0);

  //tests sum_of_column
  assert(sum_of_column(1,a)==124);
  assert(sum_of_column(4,a)==305);
  assert(sum_of_column(0,a)==0);
  assert(sum_of_column(5,a)==0);

  //tests get_row_vector
  //e holds the expected first row of a; f receives the extracted row.
  matrix* e = create_matrix(1, 4);
  value temp_d[4] = { 18, 60, 57, 96};
  assert(insert_array(temp_d,e));
  matrix* f = create_matrix(1, 4);
  assert(!get_row_vector(0,a,f));
  assert(!get_row_vector(5,a,f));
  assert(get_row_vector(1,a,f));
  assert(compare_matrices(e,f));

  //NOTE(review): this section was labelled "tests get_column_vector", but it
  //only repeats the get_row_vector checks with a second 1x4 matrix h; no
  //column-vector routine is called. g is created here and not used again
  //until it is freed. h is reused below as a deliberately mis-sized operand.
  matrix* g = create_matrix(4, 1);
  assert(insert_array(temp_d,e));
  matrix* h = create_matrix(1, 4);
  assert(!get_row_vector(0,a,h));
  assert(!get_row_vector(5,a,h));
  assert(get_row_vector(1,a,h));
  assert(compare_matrices(e,h));

  //tests multiply_matrices
  //temp_f is the expected product a * a. The last two calls are expected to
  //fail: h (1x4) is used first as an operand and then as the result matrix.
  assert(multiply_matrices(a,a,b));
  value temp_f[16]={8478,5478,14319,17130,
		    6066,6760,15418,16792,
		    6206,5328,14431,15096,
		    6052,5047,7652,14129.00};
  assert(insert_array(temp_f,d));
  assert(compare_matrices(b,d));
  assert(!multiply_matrices(a,h,b));
  assert(!multiply_matrices(a,a,h));

  //tests transpose_matrix
  //temp_g is temp_a transposed; transposing to or from the 1x4 matrix e with
  //a 4x4 counterpart is expected to fail.
  value temp_g[16]={18,41,14,51,
		    60,24,30,13,
		    57,99,97,19,
		    96,58,66,85};
  assert(insert_array(temp_g,d));
  assert(transpose_matrix(a,b));
  assert(compare_matrices(b,d));
  assert(!transpose_matrix(e,b));
  assert(!transpose_matrix(a,e));

  //tests multiply_matrix_with_scalar
  //temp_h is temp_a doubled; a is scaled by 2 and compared against it.
  value temp_h[16] = { 36, 120, 114, 192,
		       82, 48, 198, 116,
		       28, 60, 194, 132,
		       102, 26, 38, 170 };
  assert(insert_array(temp_h,b));
  multiply_matrix_with_scalar(2,a);
  assert(compare_matrices(a,b));

  //test get_sub_matrix
  //a is reset to temp_a. The top-left and bottom-right 2x2 blocks are
  //extracted into i and compared with j; ranges that do not yield a 2x2
  //block or that leave the matrix are expected to fail.
  matrix* i=create_matrix(2,2);
  assert(insert_array(temp_a,a));
  assert(get_sub_matrix(1,2,1,2,a,i));
  matrix* j=create_matrix(2,2);
  value temp_i[4] = { 18, 60, 41, 24};
  assert(insert_array(temp_i,j));
  assert(compare_matrices(j,i));
  value temp_j[4] = { 97, 66, 19, 85};
  assert(insert_array(temp_j,j));
  assert(get_sub_matrix(3,4,3,4,a,i));
  assert(compare_matrices(j,i));
  assert(!get_sub_matrix(2,4,3,4,a,i));
  assert(!get_sub_matrix(3,4,2,4,a,i));
  assert(!get_sub_matrix(4,5,4,5,a,i));
  assert(!get_sub_matrix(0,1,0,1,a,i));

  //test insert_row_vector
  //temp_k is temp_a with its second row replaced by the first row (temp_d).
  assert(insert_array(temp_a,a));
  value temp_k[16] = { 18, 60, 57, 96,
		       18, 60, 57, 96,
		       14, 30, 97, 66,
		       51, 13, 19, 85 };
  assert(insert_array(temp_k,b));
  assert(insert_array(temp_d,e));
  assert(insert_row_vector(2,e,a));
  assert(compare_matrices(a,b));

  //The timing stops before the clean-up below, so freeing is not measured.
  end = clock();
  time_spent = (double)(end - begin) / CLOCKS_PER_SEC;
  printf("time taken was: %f \n",time_spent);
  //c is not freed separately because it aliases a.
  free_matrix(a);
  free_matrix(b);
  free_matrix(d);
  free_matrix(e);
  free_matrix(f);
  free_matrix(g);
  free_matrix(h);
  free_matrix(i);
  free_matrix(j);

  return 0;
}
Esempio n. 9
0
/**
 * Runs two HOG descriptor implementations side by side on random windows of
 * the given images and reports how far their outputs differ.
 *
 * Two passes are made over image_list, each drawing num_experiments windows
 * per image:
 *  - "partial difference": after every stage (normalization, gradient,
 *    descriptor) the outputs are compared and then the output of hog1 is
 *    copied over that of hog2, so each stage is compared on identical input;
 *  - "complete difference": both pipelines run independently from the same
 *    window and only the final descriptors are compared.
 *
 * Every value returned by compare_matrices is collected and passed to
 * report_statistics under the label "normalized euclidian distance".
 *
 * Images that cannot be read are reported on std::cerr and skipped.
 *
 * @param hog1, hog2         descriptors under comparison; they also allocate
 *                           the working buffers through alloc_buffer.
 * @param hog_name1, hog_name2  names used in the printed header only.
 * @param image_list         paths of the images to load.
 * @param img_size           size used to allocate the input image buffers.
 *                           NOTE(review): the window position range is
 *                           derived from the allocated buffer, so images are
 *                           presumably expected to have this size -- confirm.
 * @param window_size        size of the windows cut out of each image.
 * @param num_experiments    number of windows drawn per image and per pass.
 * @param random_gen         generator for the window positions; taken by
 *                           value, so the caller's generator is not advanced.
 */
void measure_difference(ghog::lib::HogDescriptor* hog1,
	ghog::lib::HogDescriptor* hog2,
	std::string hog_name1,
	std::string hog_name2,
	std::vector< std::string > image_list,
	cv::Size img_size,
	cv::Size window_size,
	int num_experiments,
	boost::random::mt19937 random_gen)
{
	std::cout << "Running difference experiment on descriptors " << hog_name1
		<< " and " << hog_name2 << ", with " << image_list.size()
		<< " images, using " << num_experiments << " windows of size "
		<< window_size << std::endl;

	cv::Size descriptor_size(hog1->get_descriptor_size(), 1);

	// Working buffers of the first descriptor.
	cv::Mat input_img1;
	cv::Mat normalized_img1;
	cv::Mat grad_mag1;
	cv::Mat grad_phase1;
	cv::Mat descriptor1;

	hog1->alloc_buffer(img_size, CV_32FC3, input_img1);
	hog1->alloc_buffer(window_size, CV_32FC3, normalized_img1);
	hog1->alloc_buffer(window_size, CV_32FC1, grad_mag1);
	hog1->alloc_buffer(window_size, CV_32FC1, grad_phase1);
	hog1->alloc_buffer(descriptor_size, CV_32FC1, descriptor1);

	// Working buffers of the second descriptor. Both descriptor buffers use
	// the descriptor size reported by hog1.
	cv::Mat input_img2;
	cv::Mat normalized_img2;
	cv::Mat grad_mag2;
	cv::Mat grad_phase2;
	cv::Mat descriptor2;

	hog2->alloc_buffer(img_size, CV_32FC3, input_img2);
	hog2->alloc_buffer(window_size, CV_32FC3, normalized_img2);
	hog2->alloc_buffer(window_size, CV_32FC1, grad_mag2);
	hog2->alloc_buffer(window_size, CV_32FC1, grad_phase2);
	hog2->alloc_buffer(descriptor_size, CV_32FC1, descriptor2);

	// Top-left corner of a window, drawn so that the window stays inside
	// the allocated input buffer.
	boost::random::uniform_smallint< int > dist_w(1,
		input_img1.cols - window_size.width - 2);
	boost::random::uniform_smallint< int > dist_h(1,
		input_img1.rows - window_size.height - 2);

	std::vector< double > errors_normalization;
	std::vector< double > magnitude_similarity;
	std::vector< double > phase_similarity;
	std::vector< double > descriptor_similarity;
	std::vector< double > total_similarity;

	std::cout << "Calculating partial difference" << std::endl;

	for(size_t i = 0; i < image_list.size(); ++i)
	{
		// Decode the file once and convert it into both input buffers.
		cv::Mat loaded = cv::imread(image_list[i], CV_LOAD_IMAGE_COLOR);
		if(loaded.empty())
		{
			std::cerr << "Could not read image " << image_list[i]
				<< ", skipping it" << std::endl;
			continue;
		}
		loaded.convertTo(input_img1, CV_32FC3);
		loaded.convertTo(input_img2, CV_32FC3);
		input_img1 /= 256.0;
		input_img2 /= 256.0;

		for(int j = 0; j < num_experiments; ++j)
		{
			int pos_x = dist_w(random_gen);
			int pos_y = dist_h(random_gen);

			input_img1.rowRange(pos_y, pos_y + window_size.height).colRange(
				pos_x, pos_x + window_size.width).copyTo(normalized_img1);
			input_img2.rowRange(pos_y, pos_y + window_size.height).colRange(
				pos_x, pos_x + window_size.width).copyTo(normalized_img2);

			hog1->image_normalization_sync(normalized_img1);
			hog2->image_normalization_sync(normalized_img2);
			errors_normalization.push_back(
				compare_matrices(normalized_img1, normalized_img2));

			// Feed both gradient stages the same input.
			normalized_img1.copyTo(normalized_img2);

			hog1->calc_gradient_sync(normalized_img1, grad_mag1, grad_phase1);
			hog2->calc_gradient_sync(normalized_img2, grad_mag2, grad_phase2);
			magnitude_similarity.push_back(
				compare_matrices(grad_mag1, grad_mag2));
			phase_similarity.push_back(
				compare_matrices(grad_phase1, grad_phase2));

			// Feed both descriptor stages the same input.
			grad_mag1.copyTo(grad_mag2);
			grad_phase1.copyTo(grad_phase2);

			hog1->create_descriptor_sync(grad_mag1, grad_phase1, descriptor1);
			hog2->create_descriptor_sync(grad_mag2, grad_phase2, descriptor2);

			descriptor_similarity.push_back(
				compare_matrices(descriptor1, descriptor2));
		}
	}

	std::cout << "Error on image normalization:" << std::endl;
	report_statistics(errors_normalization, 1, "normalized euclidian distance");
	std::cout << "Error on magnitude calculation:" << std::endl;
	report_statistics(magnitude_similarity, 1, "normalized euclidian distance");
	std::cout << "Error on phase calculation:" << std::endl;
	report_statistics(phase_similarity, 1, "normalized euclidian distance");
	std::cout << "Error on descriptor calculation:" << std::endl;
	report_statistics(descriptor_similarity, 1,
		"normalized euclidian distance");

	std::cout << "Calculating complete difference" << std::endl;

	for(size_t i = 0; i < image_list.size(); ++i)
	{
		cv::Mat loaded = cv::imread(image_list[i], CV_LOAD_IMAGE_COLOR);
		if(loaded.empty())
		{
			std::cerr << "Could not read image " << image_list[i]
				<< ", skipping it" << std::endl;
			continue;
		}
		loaded.convertTo(input_img1, CV_32FC3);
		loaded.convertTo(input_img2, CV_32FC3);
		input_img1 /= 256.0;
		input_img2 /= 256.0;

		for(int j = 0; j < num_experiments; ++j)
		{
			int pos_x = dist_w(random_gen);
			int pos_y = dist_h(random_gen);

			input_img1.rowRange(pos_y, pos_y + window_size.height).colRange(
				pos_x, pos_x + window_size.width).copyTo(normalized_img1);
			input_img2.rowRange(pos_y, pos_y + window_size.height).colRange(
				pos_x, pos_x + window_size.width).copyTo(normalized_img2);

			// Normalize the extracted window, exactly as the partial pass
			// does. Normalizing the whole input image here would leave the
			// window untouched and would re-normalize the source image on
			// every experiment.
			hog1->image_normalization_sync(normalized_img1);
			hog1->calc_gradient_sync(normalized_img1, grad_mag1, grad_phase1);
			hog1->create_descriptor_sync(grad_mag1, grad_phase1, descriptor1);

			hog2->image_normalization_sync(normalized_img2);
			hog2->calc_gradient_sync(normalized_img2, grad_mag2, grad_phase2);
			hog2->create_descriptor_sync(grad_mag2, grad_phase2, descriptor2);

			total_similarity.push_back(
				compare_matrices(descriptor1, descriptor2));
		}
	}

	std::cout << "Total similarity :" << std::endl;
	report_statistics(total_similarity, 1, "normalized euclidian distance");
	std::cout << std::endl;
}
/*
 * Allocate room for `count` doubles.
 *
 * Sets *failed to 1 when the request cannot be satisfied. A NULL result for
 * a zero-sized request is not treated as a failure, because a process may
 * legitimately own an empty local block.
 */
static double *alloc_doubles( size_t count, int *failed )
{
  double *buf;

  /* Reject requests whose byte size would overflow size_t. */
  if ( count > (size_t) -1 / sizeof( double ) ){
    *failed = 1;
    return NULL;
  }

  buf = ( double * ) malloc( sizeof( double ) * count );
  if ( buf == NULL && count > 0 )
    *failed = 1;

  return buf;
}

/*
 * Driver that compares a parallel matrix-matrix multiply on an
 * nprows x npcols process mesh against a sequential dgemm_ call.
 *
 * Process 0 reads the matrix size n and the mesh shape from standard input
 * and broadcasts them. Random n x n matrices A, B and C are created on
 * process 0 and broadcast; every process copies out its local part, runs
 * ParallelMMult, runs dgemm_ on the full matrices (alpha = beta = 1), and
 * compares its local part of both results. The per-process differences are
 * summed over all processes and printed by process 0.
 *
 * Error handling:
 *  - unreadable or non-positive input ends every process after
 *    MPI_Finalize with a failure status;
 *  - a mesh that does not match the number of processes prints
 *    "mesh not of right size" (from process 0 only), finalizes MPI and
 *    exits with status 0, the status this case has always used;
 *  - an allocation failure on any process is agreed on collectively, so
 *    that all processes shut down together instead of some of them blocking
 *    in a later collective call.
 */
int main(int argc, char **argv )
{
  int 
    me,              /* holds the index of "this" process */
    nprocs,          /* holds the number of processes involved */
    nprows, npcols,  /* mesh sizes */
    myrow, mycol,    /* this node's mesh coordinates */
    n,               /* global matrix size */
    local_m,         /* local row size of A, B, C */
    local_n,         /* local column size of A, B, C */
    alloc_failed = 0,      /* set when an allocation failed on this node */
    any_alloc_failed = 0,  /* number of nodes on which one failed */
    status = EXIT_FAILURE; /* exit status, set to 0 once the run completed */

  size_t
    global_count,    /* number of elements of a global matrix */
    local_count;     /* number of elements of a local block */

  double
    *global_A = NULL,    /* array in which to hold matrix A */
    *global_B = NULL,    /* array in which to hold matrix B */
    *global_C = NULL,    /* array in which to hold matrix C */
    *local_A = NULL,     /* local part of matrix A */
    *local_B = NULL,     /* local part of matrix B */
    *local_C = NULL,     /* local part of matrix C (parallel result) */
    *local_C_ref = NULL, /* local part of matrix C (sequential result) */
    local_diff,          /* difference between sequential and parallel */
    diff,                /*   result: on this node, and summed over all */
    d_one = 1.0;         /* double precision one, to pass by address */

  MPI_Comm
    comm_row, comm_col;  /* communicators for the row and col in which this 
			    node exists */

  /* Initialize MPI, passing in the command-line parameters so that MPI can
     strip out the ones meant for it. */
  MPI_Init( &argc, &argv );

  /* Inquire how many processes were started up */
  MPI_Comm_size( MPI_COMM_WORLD, &nprocs );

  /* Inquire the index (rank) of "this" process */
  MPI_Comm_rank( MPI_COMM_WORLD, &me );

  /* Process 0 accepts input.  Anything that cannot be read is replaced by
     0, which the validation below rejects on every process, so no
     uninitialized value is ever broadcast. */
  if ( me == 0 ){
    printf("enter matrix size n:");
    fflush( stdout );
    if ( scanf( "%d", &n ) != 1 )
      n = 0;
    printf("enter nprows, npcols:");
    fflush( stdout );
    if ( scanf( "%d%d", &nprows, &npcols ) != 2 )
      nprows = npcols = 0;
  }

  /* share parameters with all nodes */
  MPI_Bcast( &n, 1, MPI_INT, 0, MPI_COMM_WORLD );
  MPI_Bcast( &nprows, 1, MPI_INT, 0, MPI_COMM_WORLD );
  MPI_Bcast( &npcols, 1, MPI_INT, 0, MPI_COMM_WORLD );

  /* Every process sees the same values, so every process takes the same
     branch here and MPI can be finalized cleanly. */
  if ( n <= 0 || nprows <= 0 || npcols <= 0 ){
    if ( me == 0 )
      fprintf( stderr, "n, nprows and npcols must be positive integers\n" );
    MPI_Finalize();
    exit( EXIT_FAILURE );
  }

  /* Same test as nprows * npcols != nprocs, written without the product so
     that large inputs cannot overflow. */
  if ( nprocs % nprows != 0 || nprocs / nprows != npcols ){
    if ( me == 0 )
      printf( "mesh not of right size\n" );
    MPI_Finalize();
    exit( 0 );
  }
  
  /* Figure out what my index is */
  mycol = me / nprows;
  myrow = me % nprows;

  /* create a communicator for the row of which I am part */
  MPI_Comm_split( MPI_COMM_WORLD, myrow, mycol, &comm_row );

  /* create a communicator for the column of which I am part */
  MPI_Comm_split( MPI_COMM_WORLD, mycol, myrow, &comm_col );

  /* compute local matrix sizes: the remainder rows/columns go to the first
     n % nprows mesh rows and n % npcols mesh columns */
  local_m   = n / nprows + ( myrow < n % nprows ? 1 : 0 );
  local_n   = n / npcols + ( mycol < n % npcols ? 1 : 0 );

  global_count = (size_t) n * (size_t) n;
  local_count  = (size_t) local_m * (size_t) local_n;

  /* create buffers for the global A, B, C (everyone will have a copy) and
     for the local parts, all up front so one check covers them */
  global_A    = alloc_doubles( global_count, &alloc_failed );
  global_B    = alloc_doubles( global_count, &alloc_failed );
  global_C    = alloc_doubles( global_count, &alloc_failed );
  local_A     = alloc_doubles( local_count, &alloc_failed );
  local_B     = alloc_doubles( local_count, &alloc_failed );
  local_C     = alloc_doubles( local_count, &alloc_failed );
  local_C_ref = alloc_doubles( local_count, &alloc_failed );

  /* Agree collectively on whether any node ran out of memory. */
  MPI_Allreduce( &alloc_failed, &any_alloc_failed, 1, MPI_INT, MPI_SUM,
		 MPI_COMM_WORLD );
  if ( any_alloc_failed != 0 ){
    if ( me == 0 )
      fprintf( stderr, "out of memory on %d process(es)\n", any_alloc_failed );
    goto cleanup;
  }

  /* create random matrices on node zero and share with all nodes */
  if ( me == 0 ){
    random_matrix( n, n, global_A, n );
    random_matrix( n, n, global_B, n );
    random_matrix( n, n, global_C, n );
  }
  MPI_Bcast( global_A, n*n, MPI_DOUBLE, 0, MPI_COMM_WORLD );
  MPI_Bcast( global_B, n*n, MPI_DOUBLE, 0, MPI_COMM_WORLD );
  MPI_Bcast( global_C, n*n, MPI_DOUBLE, 0, MPI_COMM_WORLD );

  /* copy the local parts */
  CopyMatrixGlobalToLocal( n, n, 
			   global_A, n, 
			   local_A, local_m, 
			   comm_row, comm_col );
  CopyMatrixGlobalToLocal( n, n, 
			   global_B, n, 
			   local_B, local_m, 
			   comm_row, comm_col );
  CopyMatrixGlobalToLocal( n, n, 
			   global_C, n, 
			   local_C, local_m, 
			   comm_row, comm_col );

  /* Compute parallel matrix-matrix multiply */
  ParallelMMult( n, n, n, 
		 local_A, local_m, 
		 local_B, local_m, 
		 local_C, local_m, 
		 comm_row, comm_col );

  /* Compute sequential matrix-matrix multiply on all nodes; with
     alpha = beta = 1 this is C := A * B + C in BLAS terms */
  dgemm_( "N", "N", &n, &n, &n,
  	  &d_one, global_A, &n, global_B, &n,
  	  &d_one, global_C, &n );

  /* extract this node's part of the sequential result */
  CopyMatrixGlobalToLocal( n, n, 
			   global_C, n, 
			   local_C_ref, local_m, 
			   comm_row, comm_col );

  local_diff = compare_matrices( local_m, local_n, 
				 local_C, local_m, 
				 local_C_ref, local_m );

  /* sum the per-node differences over all nodes */
  MPI_Allreduce( &local_diff, &diff, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD );

  if ( me == 0 )
    printf("\ndiff = %le\n", diff );

  status = 0;

 cleanup:
  /* free( NULL ) is a no-op, so this is safe after a failed allocation */
  free( global_A );
  free( global_B );
  free( global_C );
  free( local_A );
  free( local_B );
  free( local_C );
  free( local_C_ref);

  MPI_Comm_free( &comm_row );
  MPI_Comm_free( &comm_col );

  /* Clean up the MPI environment */
  MPI_Finalize();

  exit( status );
}