Example #1
0
/*
 * Runs two matrix-multiplication routines on the same operands and reports
 * whether their results agree.
 *
 * alg1, alg2  routines invoked as alg(A, B, C, size), writing their result to C
 * A, B        input matrices (not modified here)
 * size        passed unchanged to both routines and to matrixEqual
 *
 * Returns whatever matrixEqual reports for the two results, or 0 if the
 * scratch buffers could not be allocated.
 *
 * NOTE(review): other callers in this code base pass the dimension N of an
 * NxN matrix as `size` and allocate size*size elements for the result, so the
 * scratch buffers are sized size*size here.  The previous code allocated only
 * `size` elements, which overflows the heap under that convention.  If `size`
 * is really the total element count, this merely over-allocates.
 */
int compare(matrixMultAlg alg1, matrixMultAlg alg2, 
	    const int *A, const int *B, const int size)
{
  const size_t dim = size > 0 ? (size_t)size : 0;
  const size_t cells = dim * dim;
  /* Request at least one element so a NULL result always means failure. */
  const size_t count = cells ? cells : 1;
  /* Zero-filled: the multiply helpers used elsewhere are fed zeroed outputs
     (presumably they accumulate into C); harmless if they overwrite instead. */
  int *C1 = (int*)calloc(count, sizeof(int));
  int *C2 = (int*)calloc(count, sizeof(int));
  int result = 0;

  if (C1 != NULL && C2 != NULL) {
    alg1(A,B,C1,size);
    alg2(A,B,C2,size);
    result = matrixEqual(C1,C2,size);
  }

  /* free(NULL) is a no-op, so this covers the partial-failure case too. */
  free(C1);
  free(C2);
  return result;
}
// Flattens `commands` into the renderer's single command list (_renderCommands).
//
// - ARBITRARY_VERTEX_COMMANDs have their vertex/index data copied into the
//   renderer's shared buffers and are grouped into vertex batches; a pooled
//   command is appended to _renderCommands only when `newCommand` ends up set.
// - GROUP_COMMANDs are expanded by recursing into the matching _renderGroups
//   entry; the group command itself is not appended.
// - Every other command is appended unchanged.
//
// `commands` is received by value, so each call (including the recursive one)
// works on its own copy of the vector.
void Renderer::makeSingleRenderCommandList(std::vector<RenderCommand*> commands) {
	// Position counter; only the commented-out reserveElements calls refer to it.
	int j = 0;
	// dont use with push_back_resize: some weird realloc error occurs
	//_renderCommands->reserveElements(commands.size());

	for (auto i = commands.cbegin(); i < commands.cend(); i++, j++) {
		auto type = (*i)->getType();

		if (type == RenderCommand::Type::ARBITRARY_VERTEX_COMMAND) {
			ArbitraryVertexCommand* avc = (ArbitraryVertexCommand*)(*i);

			// The first vertex command after any non-vertex command always emits a new command.
			bool newCommand = _lastWasFlushCommand;

			// Local snapshots of the command's fields (data and modelView are copies).
			Material2D* currMaterial = avc->_material2d;
			bool transformOnCpu = avc->_transformOnCpu;
			ArbitraryVertexCommand::Data data = avc->_data;
			Mat4 modelView = avc->_mv;
			ssize_t vertexDataSize = avc->getVertexDataSize();

			_lastWasFlushCommand = false;

			// process batching

			// check if buffer limit is exceeded
			// NOTE(review): the only guard is an assertion; execution continues to the
			// memcpy below, so if CCASSERT compiles out the copy would run past the
			// buffers - confirm.
			if (_currentVertexBufferOffset + vertexDataSize > ARBITRARY_VBO_SIZE ||
				_currentIndexBufferOffset + data.indexCount > ARBITRARY_INDEX_VBO_SIZE) {
				CCASSERT(false, "Exceeding the index or vertex buffer size");
			}

			if (_firstAVC) {
				// Very first vertex command: open the initial batch.
				_vertexBatches->push_back_resize(VertexBatch());
				_currentVertexBatch->material = currMaterial;
				_currentVertexBatch->indexed = avc->_isIndexed;
				// NOTE(review): this combines the two conditions with &&, while
				// currMaterial_skipBatching below uses || - confirm which is intended.
				// This is also the only assignment to _lastMaterial_skipBatching in this function.
				_lastMaterial_skipBatching = currMaterial->_skipBatching && currMaterial->_id == MATERIAL_ID_DO_NOT_BATCH;
				newCommand = true;
				_firstAVC = false;
			}
			else {

				bool needsFilledVertexReset = _filledVertex + data.vertexCount > 0xFFFF; // meaning no index(short) could address it anymore

				if (_isBufferSlicing) {
					// Bytes written since the last slice start, including this command.
					bool vboFull = ((_currentVertexBufferOffset + vertexDataSize) - _lastVertexBufferSlicePos) > _vboByteSlice;
					needsFilledVertexReset |= vboFull;

					if (vboFull) {
						CCASSERT(vertexDataSize < _vboByteSlice, "commands vertex data is too big for slicing");
						// The new slice starts where this command's data will be written.
						_lastVertexBufferSlicePos = _currentVertexBufferOffset;
					}
				}

				bool currMaterial_skipBatching = currMaterial->_skipBatching || currMaterial->_id == MATERIAL_ID_DO_NOT_BATCH;
				bool needFlushDueToDifferentMatrix = false;

				bool indexedStateDiffers = avc->_isIndexed != _lastCommandWasIndexed;

				// Switching between indexed and non-indexed also restarts vertex numbering.
				needsFilledVertexReset |= indexedStateDiffers;

				// check if there need to be new batch due to different transform mode:
				// last command was cpu-transform and new one isnt -> new batch
				// last command was non-cpu-transform and new one is -> new batch
				// last command and new command are both non-cpu-transformed, but dont share the same modelview -> new batch
				// (_lastAVC_was_NCT is stored below as !transformOnCpu of the previous command)
				if (_lastAVC_was_NCT) {
					// do/while(0) only exists so the first case can break out early.
					do {
						if (transformOnCpu) {
							needFlushDueToDifferentMatrix = true;
							break;
						}
						if (!matrixEqual(&_lastAVC_NCT_Matrix, &modelView)) {
							needFlushDueToDifferentMatrix = true;
							_lastAVC_NCT_Matrix = modelView;
						}
					} while (0);
				}
				else if (!transformOnCpu) {
					needFlushDueToDifferentMatrix = true;
					_lastAVC_NCT_Matrix = modelView;
				}

				// check if:
				// curr material id differs from previous?
				// either curr or prev materials skipped batching?
				// there needs to be a _filledVertex reset
				// the above check returned new batch
				if (currMaterial->_id != _currentMaterial2dId ||
					currMaterial_skipBatching ||
					_lastMaterial_skipBatching ||
					needsFilledVertexReset ||
					needFlushDueToDifferentMatrix)
				{
					// set the previous vertex batch end render command index
					_currentVertexBatch->endRCIndex = _currentAVCommandCount;
					// go to next vertex batch
					nextVertexBatch();
					// set material and starting render command index
					_currentVertexBatch->material = currMaterial;
					_currentVertexBatch->indexed = avc->_isIndexed;
					_currentVertexBatch->indexBufferHandle = 0;
					_currentVertexBatch->vertexBufferHandle = 0;
					_currentVertexBatch->startingRCIndex = _currentAVCommandCount;
					if (needsFilledVertexReset || _lastArbitraryCommand->_material2d->_vertexStreamAttributes.id != currMaterial->_vertexStreamAttributes.id) {
						// if needsFilledVertexReset is set or the vertex attrib format from the previous material is different from the current use new vertex offset
						_filledVertex = 0;
						_currentVertexBatch->indexBufferOffset = _currentIndexBufferOffset;
						_currentVertexBatch->vertexBufferOffset = _currentVertexBufferOffset;
					}
					else {
						// use the offsets from the previous one
						_currentVertexBatch->indexBufferOffset = _previousVertexBatch->indexBufferOffset;
						_currentVertexBatch->vertexBufferOffset = _previousVertexBatch->vertexBufferOffset;
					}
					// The previous batch's usage range ends where the new batch's begins.
					_previousVertexBatch->indexBufferUsageEnd = _currentVertexBatch->indexBufferUsageStart = _currentIndexBufferOffset;
					_previousVertexBatch->vertexBufferUsageEnd = _currentVertexBatch->vertexBufferUsageStart = _currentVertexBufferOffset;
					newCommand = true;
				}
			}
			// Remember this command's properties for the comparison on the next iteration.
			_lastAVC_was_NCT = !transformOnCpu;
			_lastCommandWasIndexed = avc->_isIndexed;
			_currentMaterial2dId = currMaterial->_id;

			// data copying logic
			memcpy(_currentVertexBuffer, data.vertexData, vertexDataSize);
			if (transformOnCpu) {
				// treat the first 12 byte (3 floats) as a Vec3 and transform it using the modelView
				// (done in place on the copy, one vertex every `stride` bytes)
				byte* ptr = _currentVertexBuffer;
				byte* endPtr = ptr + vertexDataSize;
				int stride = currMaterial->_vertexStreamAttributes.stride;
				while (ptr < endPtr) {
					Vec3* vec = reinterpret_cast<Vec3*>(ptr);
					modelView.transformPoint(vec);
					ptr += stride;
				}
			}
			if (data.indexCount != 0) {
				// copy index data
				if (_filledVertex == 0) {
					// special case when the vertex buffer offset is 0
					// (indices need no rebasing, so they are copied verbatim)
					// NOTE(review): byte count uses sizeof(short) while the other branch
					// treats the indices as GLushort - presumably the same size; confirm.
					memcpy(_currentIndexBuffer, data.indexData, sizeof(short) * data.indexCount);
				}
				else {
					// Rebase every index by the number of vertices already in the batch.
					GLushort* ptr = _currentIndexBuffer;
					GLushort* endPtr = ptr + data.indexCount;

					GLushort* srcPtr = (GLushort*)data.indexData;

					while (ptr < endPtr) {
						*(ptr++) = *(srcPtr++) + _filledVertex;
					}
				}
			}

			// adjust buffers and offset
			// (the index pointer advances in index elements, the vertex pointer in bytes)
			_currentIndexBuffer += data.indexCount;
			_currentVertexBuffer += vertexDataSize;

			_currentVertexBufferOffset += vertexDataSize;
			_currentIndexBufferOffset += data.indexCount;

			_filledVertex += data.vertexCount;

			// if newCommand is set create a new avc and init it
			if (newCommand) {
				// NOTE: this `avc` shadows the outer `avc` for the rest of this scope.
				ArbitraryVertexCommand* avc = _avcPool1->pop();

				// the data value doesnt really matters here
				avc->init(0, currMaterial, data, modelView, transformOnCpu, 0);

				_currentAVCommandCount++;
				// NOTE(review): this stores the pooled command, but the assignment after the
				// if/else overwrites it with the original command - confirm which is intended.
				_lastArbitraryCommand = avc;
				_renderCommands->push_back_resize(avc);

				_avcPool2->push(avc);
			}
			else {
				// do nothing
			}
			// Refers to the outer (original) command, not the pooled one.
			_lastArbitraryCommand = avc;
		}
		else {
			// Any non-vertex command forces the next vertex command to emit a new command.
			_lastWasFlushCommand = true;
			if (type == RenderCommand::Type::GROUP_COMMAND) {
				// Expand the group's own queue in place (the argument is copied by value).
				makeSingleRenderCommandList(_renderGroups[reinterpret_cast<GroupCommand*>(*i)->getRenderQueueID()]);
				//_renderCommands->reserveElements(commands.size() - j);
				continue;
			}
			_renderCommands->push_back_resize(*i);
			continue;
		}
	}
}
Example #3
0
int main(int argc, char * argv[]) {

  int rank_grid, rank_row, rank_col;
  int coordinates[2];
  int node_total_size;
  int node_dim_size;
  int elem_dim_size;
  int subelem_dim_size;
  int * scatter_sendcount;
  int * scatter_displacement;
  int gridinit_num_dims = 2;
  int gridinit_dims[2] = {0,0};
  int gridinit_periods[2] = {0,0};
  int gridinit_reorder = 1;

  MPI_Comm mpi_comm_grid, mpi_comm_row, mpi_comm_col;
  MPI_Datatype mpi_type_submatrix, mpi_type_submatrix_vector;

  MPI_Request fox_send_request, fox_recv_request;
  int fox_sendto, fox_recfrom, fox_sendtag, fox_rectag;
  int fox_broadcaster;

  double *mat_a, *mat_b, *mat_c;
  double *A_mine, *B_old, *B_new, *C_mine, *A_bcast;
  double *mat_verify;

  int i, j, k;

  int verify = 0;
  int verbose = 0;

  MPI_Init(&argc, &argv);
  MPI_Comm_size(MPI_COMM_WORLD, &node_total_size);
  double starttime, endtime;
  starttime = MPI_Wtime();
  
  // Set up cartesian coordinate grid
  MPI_Dims_create(node_total_size, 
		  gridinit_num_dims, 
		  gridinit_dims);
  MPI_Cart_create(MPI_COMM_WORLD, 
		  gridinit_num_dims, 
		  gridinit_dims, 
		  gridinit_periods, 
		  gridinit_reorder, 
		  &mpi_comm_grid);

  // ** Get the grid coordinates of this process.
  MPI_Comm_rank(mpi_comm_grid, &rank_grid);
  MPI_Cart_coords(mpi_comm_grid, rank_grid, gridinit_num_dims, coordinates);

  // ** Set up column communicators.
  MPI_Comm_split(mpi_comm_grid, coordinates[1], coordinates[0], &mpi_comm_col);
  MPI_Comm_rank(mpi_comm_col, &rank_col); 

  // ** Set up row communicators  
  MPI_Comm_split(mpi_comm_grid, coordinates[0], coordinates[1], &mpi_comm_row);
  MPI_Comm_rank(mpi_comm_row, &rank_row); 

  // Get the number of processors per dimension in grid.
  MPI_Comm_size(mpi_comm_row, &node_dim_size);
  
  // ********************************************
  // ** CHECK SANITY OF AND SET UP ENVIRONMENT **
  // ********************************************

  // Check that number of parameters is sane.
  if(argc < 2) {
    if(rank_grid == 0)
      printf("Usage: foxmatrix N\n  N = randomize NxN matrices.\n");
    MPI_Finalize();
    return -1;
  }

  // Get the number of elements per dimension in matrices from arguments.
  elem_dim_size = atoi(argv[1]);

  // Check that number of processors is sane.
  if(sqrt(node_total_size) != (double) ((int) sqrt(node_total_size))) {
    if(rank_grid == 0)
      printf("Not a square number of processors.\n");
    MPI_Finalize();
    return -1;
  }

  // Check that it is possible to split matrix over the processors.
  if(elem_dim_size % node_dim_size != 0) {
    if(rank_grid == 0)
      printf("Cannot split elements evenly over processors.\n");
    MPI_Finalize();
    return -1;
  }

  // Calculate the size (in one dimension) of the submatrices.
  subelem_dim_size = elem_dim_size / node_dim_size;

  // Check if the user has given the verify/verbose commands.
  if(argc == 3 && strcmp(argv[2], "verify") == 0)
    verify = 1;
  else if(argc == 3 && strcmp(argv[2], "verbose") == 0)
    verbose = 1;
  else if(argc == 4 && 
	  strcmp(argv[2], "verbose") == 0 && 
	  strcmp(argv[3], "verify") == 0) {
    verbose = 1;
    verify = 1;
  } else if(argc == 4 && 
	  strcmp(argv[2], "verify") == 0 && 
	  strcmp(argv[3], "verbose") == 0) {
    verbose = 1;
    verify = 1;
  }
  
  // Create datatype used for transmitting submatrices.
  // Idea of using vector+struct taken from http://www.mcs.anl.gov/research/projects/mpi/tutorial/mpiexmpl/src4/scatter/C/solution.html.

  MPI_Type_vector(subelem_dim_size,
		  subelem_dim_size,
		  elem_dim_size, 
		  MPI_DOUBLE,
		  &mpi_type_submatrix_vector);
  int sm_blocklength[2] = {1, 1};
  MPI_Aint sm_displacement[2] = {0, subelem_dim_size * sizeof(double)};
  MPI_Datatype sm_types[2] = {mpi_type_submatrix_vector, MPI_UB};
  MPI_Type_struct(2, 
		  sm_blocklength, 
		  sm_displacement, 
		  sm_types, 
		  &mpi_type_submatrix);
  MPI_Type_commit(&mpi_type_submatrix);

  // ** CREATE MATRICES AND SET UP SCATTERV/GATHERV VARIABLES **
  
  if(rank_grid == 0) {
    // Create matrices on rank 0.
    mat_a = (double *) malloc(elem_dim_size * elem_dim_size * sizeof(double));
    mat_b = (double *) malloc(elem_dim_size * elem_dim_size * sizeof(double));
    mat_c = (double *) malloc(elem_dim_size * elem_dim_size * sizeof(double));
    
    // Randomize matrix contents.
    randomMatrixInit();
    randomMatrix(mat_a, elem_dim_size);
    randomMatrix(mat_b, elem_dim_size);

    // Allocate memory for storing scattering information.
    scatter_sendcount = (int *) malloc(node_total_size * sizeof(int));
    scatter_displacement = (int *) malloc(node_total_size * sizeof(int));
    
    // Set up scatter/gather arguments.
    int sit;
    for(sit = 0; sit < node_total_size; sit++) {
      scatter_sendcount[sit] = 1;
      if(sit == 0)
	scatter_displacement[sit] = 0;
      else {
	scatter_displacement[sit] = scatter_displacement[sit - 1] + 1;
	if(sit % node_dim_size == 0)
	  // At end of line, go to start of next submatrix.
	  scatter_displacement[sit] += node_dim_size * (subelem_dim_size - 1);
      }      
    }
  }

  A_mine = (double *) malloc(subelem_dim_size*subelem_dim_size*sizeof(double));
  A_bcast = (double *) malloc(subelem_dim_size*subelem_dim_size*sizeof(double));
  B_old = (double *) malloc(subelem_dim_size*subelem_dim_size*sizeof(double));
  B_new = (double *) malloc(subelem_dim_size*subelem_dim_size*sizeof(double));
  C_mine = (double *) malloc(subelem_dim_size*subelem_dim_size*sizeof(double));
  
  zeroMatrix(C_mine, subelem_dim_size);
  
  // ** DISTRIBUTE THE SUBMATRICES TO THE GRID NODES **
  MPI_Scatterv(mat_a,
	       scatter_sendcount,
	       scatter_displacement,
	       mpi_type_submatrix,
	       A_mine,
	       subelem_dim_size * subelem_dim_size,
	       MPI_DOUBLE,
	       0,
	       mpi_comm_grid);
  
  MPI_Scatterv(mat_b,
	       scatter_sendcount,
	       scatter_displacement,
	       mpi_type_submatrix,
	       B_new,
	       subelem_dim_size * subelem_dim_size,
	       MPI_DOUBLE,
	       0,
	       mpi_comm_grid);

  // ** PERFORM FOX'S ALGORITHM FOR MATRIX MULTIPLICATION **
 
  for(k = 0; k < node_dim_size; k++) {
    
    // **** BROADCAST A **** //
   
    // Decide who broadcasts this iteration.
    fox_broadcaster = (k + rank_col) % node_dim_size;
    
    // Copy matrix to the broadcast variable of the node that shall broadcast.
    if(rank_row == fox_broadcaster)
      copyMatrix(A_bcast, A_mine, subelem_dim_size);
    
    // Perform the broadcasting of the A matrix.
    MPI_Bcast(A_bcast, 
	      subelem_dim_size * subelem_dim_size, 
	      MPI_DOUBLE, 
	      fox_broadcaster,
	      mpi_comm_row);

    // **** CREATE COPY OF B **** //
    
    // Wait for everyone to get their new B. If k = 0 everyone has it scattered.
    if(k != 0)
      MPI_Wait(&fox_recv_request, MPI_STATUS_IGNORE);
    
    // Make a copy of B so we can overwrite the old one.
    copyMatrix(B_old, B_new, subelem_dim_size);

    // **** SHIFT B **** //
    
    // Find which node to send to, and which to recieve from (B matrix).
    fox_recfrom = ((rank_col + 1) % node_dim_size);
    fox_sendto = ((rank_col - 1) % node_dim_size);
    if(fox_sendto < 0)
      fox_sendto = node_dim_size - 1;
    
    fox_sendtag = 1000 + fox_sendto;
    fox_rectag = 1000 + rank_col;

    // Send the B matrix.
    MPI_Isend(B_old,
	      subelem_dim_size * subelem_dim_size,
	      MPI_DOUBLE,
	      fox_sendto,
	      fox_sendtag,
	      mpi_comm_col,
	      &fox_send_request);

    // Receive the B matrix.
    MPI_Irecv(B_new,
	      subelem_dim_size * subelem_dim_size,
	      MPI_DOUBLE,
	      fox_recfrom,
	      fox_rectag,
	      mpi_comm_col,
	      &fox_recv_request);
    
    // Perform matrix multiplication on the local submatrix.
    naiveMatrixMult(A_bcast, B_old, C_mine, subelem_dim_size);

  }

  // ** GATHER DATA **

  MPI_Barrier(MPI_COMM_WORLD);

  // Collect C from submatrices.
  MPI_Gatherv(C_mine,
	      subelem_dim_size * subelem_dim_size,
	      MPI_DOUBLE,
	      mat_c,
	      scatter_sendcount,
	      scatter_displacement,
	      mpi_type_submatrix,
	      0,
	      mpi_comm_grid);

  // ** PRESENT DATA **
 
  if(verbose && !rank_grid) {
    printf("** Will print matrix C from 0:\n");
    printMatrix(mat_c, elem_dim_size);
  }
  
  // ** VERIFICATION OF CORRECTNESS **

  if(verify && !rank_grid) {

      // Allocate memory for verification matrix.
      mat_verify = (double *) 
	malloc(elem_dim_size * elem_dim_size * sizeof(double));
      
      // Initialize verification matrix to zeroes.
      zeroMatrix(mat_verify, elem_dim_size);

      // Do the naive multiplication.
      naiveMatrixMult(mat_a, mat_b, mat_verify, elem_dim_size);
      
      // Print the correct matrix.
      if(verbose) {
	printf("** Print correct matrix from 0:\n");
	printMatrix(mat_verify, elem_dim_size);
      }
      
      // Check equality between matrices.
      if(matrixEqual(mat_c, mat_verify, elem_dim_size))
	printf("\n   Ok!\n\n");
      else
	printf("\n   FAIL!\n\n");
       
      // Free the memory used by the verification matrix.
      free(mat_verify);
  }

  // ** FINALIZE MPI **

  MPI_Barrier(MPI_COMM_WORLD);
  
  if(rank_grid==0)
  {
    endtime = MPI_Wtime();
    printf("%f\n", endtime - starttime);
  } 
  MPI_Finalize();

  // ** CLEANUP **
  
  if(A_mine)
    free(A_mine);
  if(A_bcast)
     free(A_bcast);
  if(B_old)
     free(B_old);
  if(B_new)
    free(B_new);
  if(C_mine)
    free(C_mine);
  
  // Local rank 0 cleanup.
  if(rank_grid == 0) {
    free(mat_a);
    free(mat_b);
    free(mat_c);
  }

  return 0;
}
Example #4
0
// Per-tick control routine: refreshes both spheres' state, handles aiming,
// picture taking/uploading and mirror use, then acts on the state chosen by
// setState().
void loop(){
    DEBUG(("%d", api.getTime()));

    // Refresh our own state and the opponent's.
    api.getMyZRState(me);
    api.getOtherZRState(other);

    // aboveOtherPos follows the opponent's first two coordinates.
    for (int axis = 0; axis < 2; ++axis) {
        aboveOtherPos[axis] = other[axis];
    }

    // Unless memory holds exactly two pictures, aim at the other sphere.
    if (!(game.getMemoryFilled() == 2)) {
        mathVecSubtract(vecBtwnSph, other, me, 3);
        mathVecNormalize(vecBtwnSph, 3);
        api.setAttitudeTarget(vecBtwnSph);
    }

    // Take a picture only with enough energy, a positive pay-off, and when
    // canTakePic() allows it (checked in exactly that order).
    if (game.getEnergy() > 3) {
        if (game.getPicPoints() > 0) {
            if (canTakePic()) {
                game.takePic();
            }
        }
    }

    // With two pictures stored, turn towards earth and upload once aligned.
    if (game.getMemoryFilled() == 2) {
        api.setAttitudeTarget(earth);

        if (matrixEqual(me+6, earth)) {
            if (game.getEnergy() >= 3) {
                game.uploadPics();
            }
        }
    }

    state = setState();

    // Use a mirror as soon as one is held.
    if (game.getNumMirrorsHeld() > 0) {
        game.useMirror();
    }

    // Act on the selected state; any other value does nothing.
    if (state == 0) {
        // Get items: first the mirror for our colour, then one score item.
        // The second mirror and the bottom/top score items are deliberately
        // not collected (judged not worth the effort/energy).
        DEBUG(("State 0"));

        if (sphColor) {
            if (game.hasItem(7) == -1) {
                moveFast(mir1);
            }
            else if (game.hasItem(5) == -1) {
                // Hold position when energy is too low to travel.
                if (game.getEnergy() > 3) {
                    moveFast(score3);
                }
                else {
                    api.setPositionTarget(me);
                }
            }
        }
        else {
            if (game.hasItem(8) == -1) {
                moveFast(mir2);
            }
            else if (game.hasItem(4) == -1) {
                // Hold position when energy is too low to travel.
                if (game.getEnergy() > 3) {
                    moveFast(score2);
                }
                else {
                    api.setPositionTarget(me);
                }
            }
        }
    }
    else if (state == 1) {
        // Stay at top: hold position unless the opponent is more than 0.5 away.
        DEBUG(("State 1"));

        if (!(dist(me, other) > 0.5)) {
            api.setPositionTarget(me);
        }
        else {
            api.setPositionTarget(aboveOtherPos);
        }
    }
    else if (state == 2) {
        // Spam upload: upload whenever there is something stored and energy allows.
        DEBUG(("State 2"));

        if (game.getMemoryFilled() != 0) {
            if (game.getEnergy() > 2) {
                game.uploadPics();
            }
        }
    }
}
Example #5
0
// Cell-partitioned parallel matrix multiply.
//
// Usage: <program> N [verify]
//   N       dimension of the random NxN operands (positive integer)
//   verify  rank 0 recomputes the product with naiveMatrixMult and compares
//
// The N*N result cells are split into contiguous runs, one per participating
// rank; when the cells do not divide evenly, the highest ranks take one extra
// cell each. Rank 0 computes its own run and collects the others.
//
// Returns 0 on success, 1 when N is missing or not positive. Allocation
// failures abort the whole job via MPI_Abort.
int main(int argc, char *argv[])  {
  int numtasks, rank, tag=1;  
  MPI_Status Stat;
  
  // Seed random number generator.
  randomMatrixInit();

  MPI_Init(&argc,&argv);
  MPI_Comm_size(MPI_COMM_WORLD, &numtasks);
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  
  // MPI is already initialised here, so every exit path must finalize it.
  if(argc==1)
    {
      if(rank==0)
	printf("Specify matrix size\n");
      MPI_Finalize();
      return 1;
    }
  
  int size=atoi(argv[1]);

  // atoi yields 0 for non-numeric input; size <= 0 would make numtasks 0
  // below and divide by zero.
  if(size<=0)
    {
      if(rank==0)
	printf("Matrix size must be a positive integer\n");
      MPI_Finalize();
      return 1;
    }

  const int totalCells=size*size;

  double *A = (double*)malloc(sizeof(double)*totalCells);
  double *B = (double*)malloc(sizeof(double)*totalCells);
  if(A==NULL || B==NULL)
    {
      fprintf(stderr, "Out of memory for input matrices\n");
      MPI_Abort(MPI_COMM_WORLD, 1);
    }

  // Rank 0 is the single source of the operands. Broadcasting them means the
  // result does not depend on every rank's random generator producing the
  // same sequence. All ranks take part, including ones that stay idle later.
  if(rank==0)
    {
      randomMatrix(A,size);
      randomMatrix(B,size);
    }
  MPI_Bcast(A, totalCells, MPI_DOUBLE, 0, MPI_COMM_WORLD);
  MPI_Bcast(B, totalCells, MPI_DOUBLE, 0, MPI_COMM_WORLD);
  
  // Never use more ranks than there are cells; surplus ranks do nothing.
  if(numtasks>totalCells)
    numtasks=totalCells;
  
  const int minCells=totalCells/numtasks;
  const int extra=totalCells-minCells*numtasks;
  
  if (rank == 0) {
    double *C = (double*)malloc(sizeof(double)*totalCells);
    // Large enough for the biggest share any rank can send (minCells+1).
    double *ret = (double*)malloc(sizeof(double)*(minCells+1));
    if(C==NULL || ret==NULL)
      {
	fprintf(stderr, "Out of memory on rank 0\n");
	MPI_Abort(MPI_COMM_WORLD, 1);
      }
    
    // Rank 0's own share: cells [0, minCells).
    for(int cell=0; cell<minCells; cell++)
      {
        C[cell]=0;
        for(int j=0; j<size; j++)
	  C[cell]+=A[(cell/size)*size+j]*B[j*size+cell%size];
      }
    
    // Collect every other rank's share, in rank order.
    for(int i=1; i<numtasks; i++)
      {
        int noCells=minCells;
        int pad=0;
        // The last `extra` ranks hold one more cell each, which shifts their
        // starting cell by the number of such ranks before them.
        if(i>=numtasks-extra)
	  {
            pad=i-numtasks+extra;
            noCells++;
	  }
	
        MPI_Recv(ret, noCells, MPI_DOUBLE, i, tag, MPI_COMM_WORLD, &Stat);

        const int firstCell=minCells*i+pad;
        for(int j=0; j<noCells; j++)
	  C[firstCell+j]=ret[j];
      }

    if(argc==3 && strcmp(argv[2],"verify") == 0)
      {
	// Zero-filled: elsewhere in this code base the output of
	// naiveMatrixMult is zeroed before the call (it is also used to
	// accumulate partial products), so an uninitialised buffer is unsafe.
	double *D = (double*)calloc(totalCells, sizeof(double));
	if(D==NULL)
	  {
	    fprintf(stderr, "Out of memory for verification matrix\n");
	    MPI_Abort(MPI_COMM_WORLD, 1);
	  }
	naiveMatrixMult(A,B,D,size);
	
	if(matrixEqual(C,D,size))
	  printf("\n   Ok!\n\n");
	else
	  printf("\n   FAIL!\n\n");
	free(D);
      }
    free(ret);
    free(C);
  } 
  
  else if (rank < numtasks) {
    // Worker: compute this rank's run of cells and send it to rank 0.
    int noCells=minCells;
    int pad=0;
    if(rank>=numtasks-extra)
      {
        pad=rank-numtasks+extra;
        noCells++;
      }
    
    double *ret = (double*)calloc(noCells,sizeof(double));
    if(ret==NULL)
      {
	fprintf(stderr, "Out of memory on rank %d\n", rank);
	MPI_Abort(MPI_COMM_WORLD, 1);
      }
    
    for(int i=0; i<noCells; i++)
      {
        const int cell=minCells*rank+pad+i;
	
        // ret[i] starts at 0 (calloc) and accumulates one dot product.
        for(int j=0; j<size; j++)
	  ret[i]+=A[(cell/size)*size+j]*B[j*size+cell%size];
      }
    MPI_Send(ret, noCells, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD);
    free(ret);
  }
  
  // Ranks beyond the clamped numtasks have no cells to compute.

  free(A);
  free(B);
  
  MPI_Finalize();
  return 0;
}