Exemple #1
0
/* Function: ModEuler
 * Description: Advances every vertex of phyzxObj->model by one time step with a
 *				modified Euler scheme: the velocity is updated first, and the position
 *				update then uses that new velocity.
 *				vi(t + h) = vi(t) + (ALPHA / h) * (gi(t) - xi(t)) + (h / mi) * Fext(t)
 *				xi(t + h) = xi(t) + h * vi(t + h)
 *				After the velocity update a damping term (-delta * v) is applied.
 * Input: phyzxObj   - simulation state; goal, velocity, extForce, avgVel and
 *					   model->vertices are updated in place
 *		  mIndex     - index of this model; compared against iMouseModel and
 *					   forwarded to CheckForCollision
 *		  deformMode - 3 selects the quadratic-deformation goal positions, any
 *					   other value uses the rotation stored in phyzxObj->R
 * Output: None
 */
void ModEuler(phyzx *phyzxObj, int mIndex, int deformMode)
{
	point vertex, velocity, extVel, position, velDamp;
	point vDiff, velTotal, newPos, temp;
	matrix R, matTemp;
	const int useQuadratic = (deformMode == 3);

	memset((void*)&temp, 0, sizeof(temp));
	memset((void*)&extVel, 0, sizeof(point));
	memset((void*)&velocity, 0, sizeof(point));
	memset((void*)&position, 0, sizeof(point));
	memset((void*)&vDiff, 0, sizeof(point));
	memset((void*)&velTotal, 0, sizeof(point));
	memset((void*)&newPos, 0, sizeof(point));
	memset((void*)&phyzxObj->avgVel, 0, sizeof(point));

	matInit(&R, 0, 0);
	matInit(&matTemp, 0, 0);

	if (useQuadratic)
	{
		quadDeformRot(&R, phyzxObj);
	} //end if

	// Vertex indices run from STARTFROM to numvertices inclusive, as in the original loop.
	for (unsigned int index = STARTFROM; index <= phyzxObj->model->numvertices; index++)
	{
		if (useQuadratic)
		{
			// Compute Quadratic Deformation Goal Positions
			matMult(R, phyzxObj->q[index], &matTemp);						// R(q)
			temp = matToPoint(matTemp);										// Data type conversion
			pSUM(temp, phyzxObj->cmDeformed, phyzxObj->goal[index]);		// g = R(q) + xcm
		} //end if
		else
		{
			// Compute Goal Positions
			matMult3331(phyzxObj->R, phyzxObj->relStableLoc[index], &temp);	// R(xi0 - xcm0)
			pSUM(temp, phyzxObj->cmDeformed, phyzxObj->goal[index]);		// g = R(xi0 - xcm0) + xcm
		} //end else

		vertex.x = phyzxObj->model->vertices[3*index];
		vertex.y = phyzxObj->model->vertices[3*index + 1];
		vertex.z = phyzxObj->model->vertices[3*index + 2];

		// With a sticky floor, vertices at or below the floor plane are frozen:
		// their goal is still refreshed above, but they are not integrated.
		if (stickyFloor == 1 && vertex.y <= -WALLDIST)
		{
			continue;
		} //end if

		// Add user force (applied to every vertex of the model being dragged)
		if (mIndex == iMouseModel && lMouseVal == 2 && objectName != -1)
		{
			pSUM(phyzxObj->extForce[index], userForce, phyzxObj->extForce[index]);
		} //end if

		// Explicit Euler Integrator for velocity -> vi(t + h)
		pDIFFERENCE(phyzxObj->goal[index], vertex, vDiff);										// gi(t) - xi(t)
		pMULTIPLY(vDiff, (phyzxObj->alpha / phyzxObj->h), velocity);							// (ALPHA / h) * (gi(t) - xi(t))
		pMULTIPLY(phyzxObj->extForce[index], (phyzxObj->h / phyzxObj->mass[index]), extVel);	// (h / mi) * Fext(t)
		pSUM(velocity, extVel, velTotal);														// total velocity change for this step

		pSUM(phyzxObj->velocity[index], velTotal, phyzxObj->velocity[index]);					// vi(t + h) = vi(t) + vi(h)

		// Velocity Damping: v <- v - delta * v
		pMULTIPLY(phyzxObj->velocity[index], -phyzxObj->delta, velDamp);
		pSUM(phyzxObj->velocity[index], velDamp, phyzxObj->velocity[index]);

		// Position update using the freshly computed velocity
		pMULTIPLY(phyzxObj->velocity[index], phyzxObj->h, position);							// xi(h) = h * vi(t + h)
		pSUM(vertex, position, newPos);															// xi(t + h) = xi(t) + xi(h)

		// Store new position into data structure
		phyzxObj->model->vertices[3*index] = newPos.x;
		phyzxObj->model->vertices[3*index + 1] = newPos.y;
		phyzxObj->model->vertices[3*index + 2] = newPos.z;

		pSUM(phyzxObj->avgVel, phyzxObj->velocity[index], phyzxObj->avgVel);

		CheckForCollision(index, phyzxObj, mIndex);
	} //end for

	// Turn the accumulated velocity into an average. An empty model keeps the
	// zeroed avgVel instead of dividing by zero.
	// NOTE(review): frozen sticky-floor vertices are skipped in the sum but still
	// counted in the divisor, as in the original - confirm that is intended.
	if (phyzxObj->model->numvertices > 0)
	{
		pMULTIPLY(phyzxObj->avgVel, 1.0 / phyzxObj->model->numvertices, phyzxObj->avgVel);
	} //end if

	delete[] R.data;
	delete[] matTemp.data;
} //end ModEuler()
Exemple #2
0
// WARNING mxSize is the matrixSize here
/*
 * Runs one matrix-multiplication round trip between the GPP and the DSP.
 *
 * Steps, in order:
 *   1. wait for the DSP's first message,
 *   2. generate matrixA and matrixB with matrixGen,
 *   3. send both matrices to the DSP as fixed-width (STRING_SIZE) ASCII fields,
 *      waiting for an acknowledge after every message except the last,
 *   4. receive the product into matrixC, acknowledging every message except the last,
 *   5. recompute the product locally with matMult and compare element by element.
 *
 * mxSize      - dimension of the square matrices
 * processorId - not used by this function
 * matrixA/B   - filled by matrixGen, then transmitted
 * matrixC     - receives the product reported by the DSP
 *
 * Returns the status of the last MSGQ call. Any MSGQ failure is logged and
 * returned immediately, because continuing would touch a message buffer that
 * is either unset or no longer owned by this side.
 */
NORMAL_API DSP_STATUS helloDSP_Execute(IN Uint32 mxSize, Uint8 processorId, Uint32* matrixA, Uint32* matrixB, Uint32* matrixC)
{
	DSP_STATUS status = DSP_SOK;
	Uint16 msgId = 0;
	Uint32 i, j;
	ControlMsg *msg = NULL;
	Uint8 flag = 0;

	// Reference product computed on the GPP.
	// NOTE(review): this is a stack VLA of mxSize*mxSize words; large sizes may exhaust the stack.
	Uint32 matrixD[mxSize * mxSize];

	Uint32 numElements, numMessages, elementCount, messageCount;
	Uint32 sizeElements, numProdMessages, matrixCount, prodElements;
	Char8 ascii_string[STRING_SIZE + 1];
	Char8 null_string[STRING_SIZE + 1] = {'\0'};

	myStrcpy(ascii_string, null_string);

	SYSTEM_0Print("Entered helloDSP_Execute ()\n");

	// Wait for the first "DSP is awake" message
	status = MSGQ_get(SampleGppMsgq, WAIT_FOREVER, (MsgqMsg *) &msg);
	if (DSP_FAILED(status))
	{
		SYSTEM_1Print("MSGQ_get () failed. Status = [0x%x]\n", status);
		return status;	// msg was never set; nothing to free
	}

	// TODO possibly verify the data?

	SYSTEM_1Print("Received message: %s\n", (Uint32) msg->arg1);

	SYSTEM_0Print("Generated matrices:\n");
	// Generate the matrices after the DSPLink is established
	matrixGen(mxSize, matrixA, matrixB);

	// The matrix elements are sent as fixed-width strings, so work out how many
	// characters (and therefore messages) are needed in each direction.
	prodElements = (mxSize * mxSize);
	numElements = (mxSize * mxSize * 2);
	sizeElements = numElements * STRING_SIZE;
	numMessages = ((sizeElements - 1) / ARG_MSG) + 1;
	numProdMessages = (((sizeElements / 2) - 1) / ARG_MSG) + 1;

	// WARNING Sending STRING_SIZE Char8 per element
	// Send matrixA followed by matrixB to the DSP
	for (messageCount = 0, elementCount = 0; messageCount < numMessages; messageCount++)
	{
#if defined (PROFILE)
		SYSTEM_GetStartTimeDspEnc();
#endif
		// NOTE(review): the loop bound counts elements while the write index below
		// counts characters (elementCount * STRING_SIZE); left exactly as in the
		// original because it must match the DSP side - confirm against ARG_MSG.
		for (; (elementCount - (messageCount * ARG_MSG)) < ARG_MSG && elementCount < numElements; elementCount++)
		{
			// The first prodElements entries come from A, the rest from B
			if (elementCount < prodElements)
			{
				SYSTEM_itoa(matrixA[elementCount], ascii_string, 10);
			}
			else
			{
				SYSTEM_itoa(matrixB[elementCount - prodElements], ascii_string, 10);
			}

			// Copy the fixed-width field into the message payload
			for (i = 0; i < STRING_SIZE; i++)
			{
				msg->arg1[((elementCount * STRING_SIZE) - (messageCount * ARG_MSG)) + i] = ascii_string[i];
			}
			// clean the string
			myStrcpy(ascii_string, null_string);
		}

#if defined (PROFILE)
		SYSTEM_GetEndTimeDspEnc();
		SYSTEM_GetStartTimeDspMes();
#endif

		// Send the filled message to the DSP
		msgId = MSGQ_getMsgId(msg);
		MSGQ_setMsgId(msg, msgId);
		// TODO set the command flag of the msg to distinguish
		status = MSGQ_put(SampleDspMsgq, (MsgqMsg) msg);
		if (DSP_FAILED(status))
		{
			MSGQ_free((MsgqMsg) msg);
			SYSTEM_1Print("MSGQ_put () failed. Status = [0x%x]\n", status);
			return status;	// msg is freed; it must not be reused or freed again
		}
#if defined (PROFILE)
		SYSTEM_GetEndTimeDspMes();
		SYSTEM_GetStartTimeDspCalc();
#endif

		// If it is the last message, don't wait for an acknowledge
		if (messageCount + 1 < numMessages)
		{
			// Wait for a response of the DSP before sending the next message
			status = MSGQ_get(SampleGppMsgq, WAIT_FOREVER, (MsgqMsg *) &msg);
			if (DSP_FAILED(status))
			{
				SYSTEM_1Print("MSGQ_get () failed. Status = [0x%x]\n", status);
				return status;	// the previous buffer was handed to the DSP by MSGQ_put
			}
		}
	}

	// Receive the product matrix
	// WARNING wait and acknowledge except for last loop
	for (messageCount = 0, elementCount = 0, matrixCount = 0; messageCount < numProdMessages; messageCount++)
	{
		status = MSGQ_get(SampleGppMsgq, WAIT_FOREVER, (MsgqMsg *) &msg);
		if (DSP_FAILED(status))
		{
			SYSTEM_1Print("MSGQ_get () failed. Status = [0x%x]\n", status);
			return status;
		}

		if (messageCount == 0) {
#if defined (PROFILE)
			SYSTEM_GetEndTimeDspCalc();
#endif
			SYSTEM_0Print("\nProduct matrix on DSP:\n");
		}

		// Put the received message in the matrixC
		// NOTE(review): the read index is not rebased by messageCount * ARG_MSG the
		// way the send side is; kept as in the original - confirm for multi-message products.
		for (; matrixCount < prodElements && (elementCount - (messageCount * ARG_MSG)) < ARG_MSG && elementCount < (prodElements * STRING_SIZE); matrixCount++, elementCount += STRING_SIZE)
		{
			// Extract one fixed-width field and terminate it for atoi
			for (i = 0; i < STRING_SIZE; i++)
			{
				ascii_string[i] = msg->arg1[elementCount + i];
			}
			ascii_string[STRING_SIZE] = '\0';

			// Put it in the matrixC
			matrixC[matrixCount] = atoi(ascii_string);
			// Start a new output line at the beginning of every matrix row
			if (matrixCount % mxSize == 0)
			{
				SYSTEM_0Print("\n");
			}
			SYSTEM_1Print("%d ", matrixC[matrixCount]);

			// Clean the string
			myStrcpy(ascii_string, null_string);
		}

		// If this is not the last message, send an acknowledge
		if (messageCount + 1 < numProdMessages)
		{
			// Send back the same message received in the MSGQ_get () call above
			msgId = MSGQ_getMsgId(msg);
			MSGQ_setMsgId(msg, msgId);
			status = MSGQ_put(SampleDspMsgq, (MsgqMsg) msg);
			if (DSP_FAILED(status))
			{
				MSGQ_free((MsgqMsg) msg);
				SYSTEM_1Print("MSGQ_put () failed. Status = [0x%x]\n", status);
				return status;
			}
		}
	}

	SYSTEM_0Print("\n");

	// The last received message was not sent back, so it is still ours to free
	MSGQ_free((MsgqMsg) msg);

	SYSTEM_0Print("\nProduct matrix on GPP:\n");

#if defined (PROFILE)
	SYSTEM_GetStartTimeGpp();
#endif

	matMult(matrixA, matrixB, matrixD, mxSize);

#if defined (PROFILE)
	SYSTEM_GetEndTimeGpp();
#endif

	// Compare the DSP result with the GPP result; the break only leaves the
	// current row, so at most one mismatch is reported per row.
	for (i = 0; i < mxSize; i++)
	{
		for (j = 0; j < mxSize; j++)
		{
			if (matrixC[i * mxSize + j] != matrixD[i * mxSize + j])
			{
				SYSTEM_2Print("Matrices are not equal row: %d, column: %d\n", i, j);
				flag = 1;
				break;
			}
		}
	}
	if (flag == 0)
	{
		SYSTEM_0Print("\nMatrix products are equal\n");
	}

	SYSTEM_0Print("Leaving helloDSP_Execute ()\n");

#if defined (PROFILE)
	if (DSP_SUCCEEDED(status))
	{
		SYSTEM_GetProfileInfoGpp();
		SYSTEM_GetProfileInfoDsp(numMessages); //is numProdMessages interesting?
	}
#endif

	return status;
}
/*
 * Benchmark driver for the naive and optimized matrix-matrix / matrix-vector
 * multiply routines.
 *
 * Command line:
 *   -N <N>          problem size (default PROBSIZE, must be positive)
 *   -operation <ID> 0 = matrix-matrix, 1 = matrix-vector
 *   -verif          compare naive and optimized results (needs both to run)
 *   -disp           print operands and results
 *   -naive / -OPT   run only one of the two implementations
 *   -help           print usage
 *
 * Returns 0 after a run or after printing usage, 1 when allocation fails.
 */
int main(int argc, const char *argv[])
{
    // Seed the random number generator using time
    srand48((unsigned int) time(NULL));

    int N = PROBSIZE;    // Dimension of the operation with default value
    int opr = 0;         // Operation: 0 MatMult; 1 MatVecMult
    int verif = 0;       // Whether to verify the result or not
    int disp = 0;        // Whether to display the result or not
    int execNaive = 1;   // Whether to call the naive implementation
    int execOPT = 1;     // Whether to call the optimized implementation

    // Parse command line
    {
        int arg_index = 1;
        int print_usage = 0;

        while (arg_index < argc && !print_usage)
        {
            const char *arg = argv[arg_index];

            if (strcmp(arg, "-N") == 0 || strcmp(arg, "-operation") == 0)
            {
                // Both options take a value; refuse to read past the end of argv.
                if (arg_index + 1 >= argc)
                {
                    printf("***Missing value for argument: %s\n", arg);
                    print_usage = 1;
                }
                else
                {
                    int value = atoi(argv[arg_index + 1]);
                    if (strcmp(arg, "-N") == 0)
                        N = value;
                    else
                        opr = value;
                    arg_index += 2;
                }
            }
            else if (strcmp(arg, "-help") == 0)
            {
                print_usage = 1;
            }
            else if (strcmp(arg, "-verif") == 0)
            {
                arg_index++;
                verif = 1;
                if (execNaive == 0 || execOPT == 0)
                {
                    printf("***Must call both naive and optimized when running verification\n");
                    print_usage = 1;
                }
            }
            else if (strcmp(arg, "-disp") == 0)
            {
                arg_index++;
                disp = 1;
            }
            else if (strcmp(arg, "-naive") == 0 || strcmp(arg, "-OPT") == 0)
            {
                arg_index++;
                if (strcmp(arg, "-naive") == 0)
                {
                    execNaive = 1;
                    execOPT   = 0;
                }
                else
                {
                    execOPT   = 1;
                    execNaive = 0;
                }
                if (verif == 1)
                {
                    printf("***Must call both naive and optimized when running verification\n");
                    print_usage = 1;
                }
            }
            else
            {
                printf("***Invalid argument: %s\n", arg);
                print_usage = 1;
            }
        }

        if (!print_usage && N <= 0)
        {
            printf("***Problem size must be positive\n");
            print_usage = 1;
        }

        if (print_usage)
        {
            printf("\n");
            printf("Usage: %s [<options>]\n", argv[0]);
            printf("\n");
            printf("  -N <N>          : problem size (default: %d)\n", PROBSIZE);
            printf("  -operation <ID> : Operation ID = 0 for MatMult or ID = 1 for MatVecMult\n");
            printf("  -verif          : Activate verification\n");
            printf("  -disp           : Display result (use only for small N!)\n");
            printf("  -naive          : Run only naive implementation\n");
            printf("  -OPT            : Run only optimized implementation\n");
            printf("  -help           : Display this message\n");
            printf("\n");
            return 0;
        }
    }

    // Element count of an N x N matrix, computed in size_t for the allocations.
    const size_t count = (size_t) N * (size_t) N;

    // Perform operation
    switch (opr)
    {
        case 0: /* Matrix-matrix multiply */
            {
                printf("Performing matrix-matrix multiply operation\n");
                double *matA, *matB;
                double *matC1 = NULL, *matC2 = NULL;

                // Allocate memory; result buffers are zeroed so they are defined
                // even if the corresponding kernel does not fill them.
                matA = malloc(count * sizeof *matA);
                matB = malloc(count * sizeof *matB);
                if (execNaive) matC1 = calloc(count, sizeof *matC1);
                if (execOPT)   matC2 = calloc(count, sizeof *matC2);

                if (matA == NULL || matB == NULL ||
                    (execNaive && matC1 == NULL) || (execOPT && matC2 == NULL))
                {
                    printf("Out of memory\n");
                    free(matA);
                    free(matB);
                    free(matC1);
                    free(matC2);
                    return 1;
                }

                // Initialize matrix values
                randInitialize(N*N, matA);
                randInitialize(N*N, matB);

                clock_t tic, toc;
                double tm;

                if (execNaive)
                {
                    // Perform naive matA x matB = matC1
                    tic = clock();
                    matMult(N, matA, matB, matC1);
                    toc = clock();
                    tm = (double)(toc - tic) / CLOCKS_PER_SEC;
                    printf("Elapsed time for naive mat-mat mult.: %f seconds\n", tm);
                }

                if (execOPT)
                {
                    // Perform optimized matA x matB = matC2
                    // NOTE(review): the optimized kernel call is disabled, so matC2
                    // stays all zeros and verification will report a mismatch.
                    tic = clock();
                    //matMult_opt(N,matA,matB,matC2);
                    toc = clock();
                    tm = (double)(toc - tic) / CLOCKS_PER_SEC;
                    printf("Elapsed time for optimized mat-mat mult.: %f seconds\n", tm);
                }

                // Verify results (option parsing guarantees both buffers exist here)
                if (verif)
                    compareVecs(N*N, matC2, matC1);

                // Display results (don't use for large matrices); only show the
                // result matrices that were actually computed.
                if (disp)
                {
                    displayMat(N, N, matA);
                    printf("\n");
                    displayMat(N, N, matB);
                    printf("\n");
                    if (execNaive)
                    {
                        displayMat(N, N, matC1);
                        printf("\n");
                    }
                    if (execOPT)
                        displayMat(N, N, matC2);
                }

                // Free memory
                free(matA);
                free(matB);
                free(matC1);
                free(matC2);
            }
            break;

        case 1: /* Matrix-vector multiply */
            {
                printf("Performing matrix-vector multiply operation\n");
                double *matA, *vecB;
                double *vecC1 = NULL, *vecC2 = NULL;

                // Allocate memory.
                // NOTE(review): the vectors only appear to need N elements; the
                // original N*N size is kept because the helpers' contracts are not visible here.
                matA = malloc(count * sizeof *matA);
                vecB = malloc(count * sizeof *vecB);
                if (execNaive) vecC1 = calloc(count, sizeof *vecC1);
                if (execOPT)   vecC2 = calloc(count, sizeof *vecC2);

                if (matA == NULL || vecB == NULL ||
                    (execNaive && vecC1 == NULL) || (execOPT && vecC2 == NULL))
                {
                    printf("Out of memory\n");
                    free(matA);
                    free(vecB);
                    free(vecC1);
                    free(vecC2);
                    return 1;
                }

                // Initialize values
                randInitialize(N*N, matA);
                randInitialize(N, vecB);

                clock_t tic, toc;
                double tm;

                if (execNaive)
                {
                    // Perform naive matA x vecB = vecC1
                    tic = clock();
                    matVecMult(N, matA, vecB, vecC1);
                    toc = clock();
                    tm = (double)(toc - tic) / CLOCKS_PER_SEC;
                    printf("Elapsed time for naive mat-vec mult.: %f seconds\n", tm);
                }

                if (execOPT)
                {
                    // Perform optimized matA x vecB = vecC2
                    tic = clock();
                    matVecMult_opt(N, matA, vecB, vecC2);
                    toc = clock();
                    tm = (double)(toc - tic) / CLOCKS_PER_SEC;
                    printf("Elapsed time for optimized mat-vec mult.: %f seconds\n", tm);
                }

                // Verify results
                if (verif)
                    compareVecs(N, vecC2, vecC1);

                // Display results (don't use for large matrices)
                if (disp)
                {
                    displayMat(N, N, matA);
                    printf("\n");
                    displayVec(N, vecB);
                    printf("\n");
                    if (execNaive)
                    {
                        displayVec(N, vecC1);
                        printf("\n");
                    }
                }

                // Free memory
                free(matA);
                free(vecB);
                free(vecC1);
                free(vecC2);
            }
            break;

        default:
            printf(" Invalid operation ID\n");
            return 0;
    }

    return 0;
}
/*
 * Times one matMult call on two generated int16 matrices.
 *
 * argv[1] is the matrix dimension (0..512). The operands are filled with
 * mat1[i][j] = i + 2*j and mat2[i][j] = i + 3*j, the product buffer is
 * zero-initialised, and the elapsed time is reported through the Timer helpers.
 *
 * Returns 0 on success, -1 on a bad argument or allocation failure.
 */
int main(int argc, char** argv)
{
	int i, j;
	Timer neonTime;
	int16_t *mat1, *mat2;

	/* Get argument size */
	if (argc < 2)
	{
		printf("Usage: %s <matrix_size>\n", argc > 0 ? argv[0] : "matmult");
		return -1;
	}
	matrix_size = atoi(argv[1]);
	if (matrix_size < 0 || matrix_size > 512)
	{
		printf("Matrix size must be between 0 and 512.\n");
		return -1;
	}

	/* Initialize timer */
	initTimer(&neonTime, "NEON Time");

	/* Allocate matrices only after matrix_size is known. A size of 0 still
	 * allocates one element so the pointers (and the row type below) are valid. */
	const size_t dim = matrix_size > 0 ? (size_t) matrix_size : 1;
	mat1 = malloc(dim * dim * sizeof *mat1);
	mat2 = malloc(dim * dim * sizeof *mat2);
	/* Product: dim rows of dim int32_t each, zero-initialised on the heap */
	int32_t (*prod)[dim] = calloc(dim, sizeof *prod);

	if (mat1 == NULL || mat2 == NULL || prod == NULL) {
		printf("Out of memory\n");
		free(mat1);
		free(mat2);
		free(prod);
		return -1;
	}

	/* Initialize matrices */
	for (i = 0; i < matrix_size; i++)
	{
		for (j = 0; j < matrix_size; j++)
		{
			mat1[i*matrix_size + j] = i+j*2;
		}
	}

	for (i = 0; i < matrix_size; i++)
	{
		for (j = 0; j < matrix_size; j++)
		{
			mat2[i*matrix_size + j] = i+j*3;
		}
	}

	/* Run the multiplication */
	startTimer(&neonTime);
	matMult(mat1,mat2,prod);
	stopTimer(&neonTime);
	printTimer(&neonTime);

	/* Debug aid: uncomment to print the product.
	for (i = 0; i < matrix_size; i++)
	{
		printf("\n");
		for (j = 0; j < matrix_size; j++)
		{
			printf("\t%d ", prod[i][j]);
		}
	}
	printf("\nDone !!! \n");
	*/

	free(mat1);
	free(mat2);
	free(prod);
	return 0;
}
int main (int argc, char *argv[])
{
  int n1, n2, n3, n4;
  int numberOfPermutations;
  int i, j, k, l, p, ii, jj, m; 
  char *a; 
  int *iordre;
  int *corder;
  double *b;
  double *c;
  double *CAprime;
  double ssqeigvB;
  double ssqeigvC;
  double tracetot;
  double temp, prob;
  double *traceper;
  double dACC;
  FILE *f, *outf; 
  long idum; 
  int iGE, iGEF1, iGEF2, dummyINT;
  char fileNameA[1024], 
    fileNameB[1024], 
    fileNameC[1024],
    outFileName[1024]; 
  double F1, F2, trace0, prob1, prob2, F1per, F2per;
  double time;
  MPI_Status msgStatus; 
  int jobMsg[3];
  double resultMsg[7];
  int jobID;
  int nonZero;
  parameters *params;

  MPI_Init(&argc, &argv);
  MPI_Comm_rank(MPI_COMM_WORLD, &processID);
  MPI_Comm_size(MPI_COMM_WORLD, &numOfWorkers);

  params = (parameters *)malloc(sizeof(parameters));

  nonZero = 0;

  get_args(argc, argv, params, processID);

  numberOfPermutations = params->permutations;
  n1 = params->n1;
  n2 = params->n2;
  n3 = params->n3;
  n4 = params->n4;
  strcpy(fileNameA, params->fileNameA);
  strcpy(fileNameB, params->fileNameB);
  strcpy(fileNameC, params->fileNameC);
  strcpy(outFileName, params->outFileName);

  if(processID == 0)
    {
      time = gettime();  
      outf = fopen(outFileName, "w");  
      fprintf(outf, "Permutations: %d N1 %d N2 %d, N3 %d N4 %d\n", numberOfPermutations - 1, n1, n2, n3, n4); 
    }
  
  traceper = (double *)malloc(sizeof(double) * numberOfPermutations);

  iordre = (int *) malloc (sizeof(int) * n2);
  corder = (int *) malloc (sizeof(int) * n2);
  
  a = (char *)malloc(sizeof(char) * n1 * n2); 

  b = (double *)malloc(sizeof(double) * n4 * n1);
  
  CAprime = (double *)malloc(sizeof(double) * n3 * n1);  

  c = (double *)malloc(sizeof(double) * n3 * n2);

  /******** READ DATA ***********************************************/

  f = fopen(fileNameA, "r");
    
  for(i = 0; i < n1; i++)    
    for(j = 0; j < n2; j++)	      
      {
	int d, v;
	v = fscanf(f, "%d", &d);
	if(v == 0)
	  {
	    printf("Format Conversion Error while reading Matrix A(%s) at position A[%d][%d]\n", fileNameA, i, j);
	    exit(-1);
	  }
	if(v == EOF)
	  {
	    printf("End of File reached while reading Matrix A(%s) at position A[%d][%d]\n", fileNameA, i, j);
	    exit(-1);
	  }
	a[i * n2 + j] = (char)d;
	if(a[i * n2 + j] != 0)
	  nonZero++;	
      }
  
  fclose(f);   
 
  f = fopen(fileNameB, "r");
  
  for(i = 0; i < n1; i++)   
    for(j = 0; j < n4; j++)      
      {
	int v;
#ifdef ROWS
	v = fscanf(f, "%lf",&b[j * n1 + i]);
#else
	v = fscanf(f, "%lf",&b[i * n4 + j]);
#endif
	if(v == 0)
	  {
	    printf("Format Conversion Error while reading Matrix B(%s) at position B[%d][%d]\n", fileNameB, i, j);
	    exit(-1);
	  }
	if(v == EOF)
	  {
	    printf("End of File reached while reading Matrix B(%s) at position B[%d][%d]\n", fileNameB, i, j);
	    exit(-1);
	  }
      }
  

  fclose(f);

  f = fopen(fileNameC, "r");
  
  for(i = 0; i < n3; i++)        
    for(j = 0; j < n2; j++)	
      {
	int v;
	v = fscanf(f, "%lf",&c[i * n2 + j]);    
	if(v == 0)
	  {
	    printf("Format Conversion Error while reading Matrix C(%s) at position C[%d][%d]\n", fileNameC, i, j);
	    exit(-1);
	  }
	if(v == EOF)
	  {
	    printf("End of File reached while reading Matrix C(%s) at position C[%d][%d]\n", fileNameC, i, j);
	    exit(-1);
	  }
      }
	
  fclose(f);       
  ssqeigvB = 0.0; 

#ifdef ROWS
  for(i = 0; i < n4; i++)
    {
      temp = 0.0;
      for(j = 0; j < n1; j++)     
	temp += b[i * n1 + j] * b[i * n1 + j];      
      ssqeigvB += temp * temp;
    }
#else
  for(i = 0; i < n4; i++)
    {
      temp = 0.0;
      for(j = 0; j < n1; j++)     
	temp += b[j * n4 + i] * b[j * n4 + i];      
      ssqeigvB += temp * temp;
    }
#endif

  ssqeigvC = 0.0;

  for(i = 0; i < n3; i++)
    {
      temp = 0.0;
      for(j = 0; j < n2; j++)
	temp += c[i * n2 + j] * c[i * n2 + j];
      ssqeigvC += temp * temp;
    }    
 
  if(processID == 0)
    {
      fprintf(outf, "Sum of squared PCoA eigenvalues of B = %1.5f\n\n", ssqeigvB);
      fprintf(outf, "Sum of squared PCoA eigenvalues of C = %1.5f\n\n", ssqeigvC);
    }

  if(ssqeigvC > ssqeigvB) 
    tracetot = ssqeigvC;
  else
    tracetot = ssqeigvB;

  if(processID == 0)
    fprintf(outf, "TraceTot = %1.5f\n\n", tracetot);

  {
    FILE *t;
    int readCount;

    if(processID == 0)
      printf("READING trace file %s\n",  params->externalTraceFileName);

    t = fopen(params->externalTraceFileName, "r");      
    readCount = fread(((void *)traceper), sizeof(double), numberOfPermutations, t);       
    fclose(t);           

    if(readCount < numberOfPermutations)
      {
	printf("Error, external tracefile %s contains only %d entries but %d are required\n", params->externalTraceFileName, readCount, numberOfPermutations);
	exit(-1);
      }
      
    iGE = 1;

    for(p = 1; p < numberOfPermutations; p++)
      {                            	 	  	  
	if(traceper[p] >= traceper[0])
	  iGE++;          	 
      }
      
    prob = (double)(iGE) / (double)(numberOfPermutations);

    if(processID == 0)
      {
	fprintf(outf, " Global test of cospeciation:                     ParaFitGlobal = %1.5f   Prob  = %1.5f\n\n", traceper[0], prob);       
	fprintf(outf, " Test of individual host-parasite links:\n\n");  
	fprintf(outf, "                        F1 = ParaFitLink1                  F2 = ParaFitLink2\n\n\n");     
	printf("Global test of cospeciation:                     ParaFitGlobal = %1.5f   Prob  = %1.5f\n\n", traceper[0], prob);
      }
  }
  
  if(processID == 0)
    {
      int count;
      jobQueue *jobs;     
      int jobsSent, jobsReceived;
      resultVector *results;
      char *received;
      int resultCounter = 0;
      int lastFlush = 0;      
      
      jobs = (jobQueue *)malloc(nonZero * sizeof(jobQueue));
      results = (resultVector *)malloc(nonZero * sizeof(resultVector));
      received = (char *)malloc(sizeof(char) * nonZero);
      count = 0;
      for(i = 0; i < n1; i++)
	for(j = 0; j < n2; j++)
	  {
	    if(a[i * n2 + j] != 0)
	      {
		jobs[count].i = i;
		jobs[count].j = j;
		count++;
	      }
	  }

      for(i = 0; i < nonZero; i++)
	received[i] = 0;
      
      jobsReceived = nonZero;
      jobsSent = 0;
      while(jobsReceived > 0)
	{
	  MPI_Probe(MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, &msgStatus);
	  switch(msgStatus.MPI_TAG)
	    {
	    case JOB_REQUEST: 
	      MPI_Recv(&dummyINT, 1, MPI_INT, msgStatus.MPI_SOURCE, JOB_REQUEST, MPI_COMM_WORLD, &msgStatus);	     
	      if(jobsSent < nonZero)
		 {
		   jobMsg[0] = jobsSent;
		   jobMsg[1] = jobs[jobsSent].i;
		   jobMsg[2] = jobs[jobsSent].j;		   
		   MPI_Send(jobMsg, 3, MPI_INT, msgStatus.MPI_SOURCE, COMPUTE, MPI_COMM_WORLD);
		   jobsSent++;
		 }
	       break;
	    case RESULT:
	      MPI_Recv(resultMsg, 7, MPI_DOUBLE, msgStatus.MPI_SOURCE, RESULT, MPI_COMM_WORLD, &msgStatus);	     
	      jobsReceived--;
	      jobID                = (int)resultMsg[0];
	      results[jobID].ii    = (int)resultMsg[1];
	      results[jobID].jj    = (int)resultMsg[2];
	      results[jobID].F1    = resultMsg[3];
	      results[jobID].prob1 = resultMsg[4];
	      results[jobID].F2    = resultMsg[5];
	      results[jobID].prob2 = resultMsg[6];	     	      	      
	      received[jobID] = 1;
	      resultCounter++;
	      
	      if((resultCounter % (2 * numOfWorkers)) == 0)
		{
		  while(lastFlush < nonZero && received[lastFlush] == 1)
		    {
		      printf("Parasite  %d  Host %d   F1 =   %1.5f   Prob1 =  %1.5f   F2 =    %1.5f   Prob2 =  %1.5f\n", 
			     results[lastFlush].ii + 1,  results[lastFlush].jj + 1,  results[lastFlush].F1,  results[lastFlush].prob1,  
			     results[lastFlush].F2,  results[lastFlush].prob2);
		      fprintf(outf, "Parasite  %d  Host %d   F1 =   %1.5f   Prob1 =  %1.5f   F2 =    %1.5f   Prob2 =  %1.5f\n", 
			      results[lastFlush].ii + 1,  results[lastFlush].jj + 1,  results[lastFlush].F1,  results[lastFlush].prob1,  
			      results[lastFlush].F2,  results[lastFlush].prob2);
		      lastFlush++;
		    }
		}

	      if(jobsSent < nonZero)
		{
		  jobMsg[0] = jobsSent;
		  jobMsg[1] = jobs[jobsSent].i;
		  jobMsg[2] = jobs[jobsSent].j;
		  MPI_Send(jobMsg, 3, MPI_INT, msgStatus.MPI_SOURCE, COMPUTE, MPI_COMM_WORLD);
		  jobsSent++;
		}
	      break;	      
	    }
	}

      while(lastFlush < nonZero && received[lastFlush] == 1)
	{
	  printf("Parasite  %d  Host %d   F1 =   %1.5f   Prob1 =  %1.5f   F2 =    %1.5f   Prob2 =  %1.5f\n", 
		 results[lastFlush].ii + 1,  results[lastFlush].jj + 1,  results[lastFlush].F1,  results[lastFlush].prob1,  
		 results[lastFlush].F2,  results[lastFlush].prob2);
	  fprintf(outf, "Parasite  %d  Host %d   F1 =   %1.5f   Prob1 =  %1.5f   F2 =    %1.5f   Prob2 =  %1.5f\n", 
		 results[lastFlush].ii + 1,  results[lastFlush].jj + 1,  results[lastFlush].F1,  results[lastFlush].prob1,  
		 results[lastFlush].F2,  results[lastFlush].prob2);
	  lastFlush++;
	}


      printf("There are %d host-parasite links in matrix A\n", nonZero);
      fprintf(outf, "There are %d host-parasite links in matrix A\n", nonZero);    

      for(i = 1; i < numOfWorkers; i++)
	{
	  MPI_Send(&dummyINT, 1, MPI_INT, i, FINALIZE, MPI_COMM_WORLD);
	}
      fclose(outf);

      printf("TIME %f\n", gettime() - time);
      goto FINISH;
    }
  else
    {
      MPI_Send(&dummyINT, 1, MPI_INT, 0, JOB_REQUEST, MPI_COMM_WORLD);

      while(1)
	{		  
	  MPI_Probe(0, MPI_ANY_TAG, MPI_COMM_WORLD, &msgStatus);	  
	  switch(msgStatus.MPI_TAG)
	    {
	    case COMPUTE:
	      MPI_Recv(jobMsg, 3, MPI_INT, 0, COMPUTE, MPI_COMM_WORLD, &msgStatus);	     
	      jobID = jobMsg[0];
	      ii = jobMsg[1];
	      jj = jobMsg[2];

	      a[ii * n2 + jj] = 0;
	      	     
	      makeCAprime(n3, n1, n2, a, c, CAprime, corder);
	      dACC = matMult(n3, n4, n1, CAprime, b);	    

	      F1 = (traceper[0] - dACC);
	      F2 = (traceper[0] - dACC)/(tracetot - traceper[0]);	  

	      for(i = 0; i < n2; i++)
		iordre[i] = i;
	      idum = -1;   
	      for(i = 0; i < NTURN; i++)
		ran2(&idum);

	      iGEF1 = 1;
	      iGEF2 = 1;
	      
	      for(p = 1; p < numberOfPermutations; p++)
		{		 
		  permuteCAprime(n3, n1, n2, a, c, CAprime, &idum, iordre, corder);
		  dACC = matMult(n3, n4, n1, CAprime, b);		  		 

		  F1per = traceper[p] - dACC;
		  F2per = (traceper[p] - dACC) / (tracetot - traceper[p]);		 

		  if(F1per >= F1)
		    iGEF1++;
		  if(F2per >= F2)
		    iGEF2++;		 				 
		}
	      prob1 = (double)(iGEF1) / (double)(numberOfPermutations);
	      prob2 = (double)(iGEF2) / (double)(numberOfPermutations);	     	      

	      a[ii * n2 + jj]    = 1; 
	      resultMsg[0] = (double)jobID;
	      resultMsg[1] = (double)ii;
	      resultMsg[2] = (double)jj;
	      resultMsg[3] = F1;
	      resultMsg[4] = prob1;
	      resultMsg[5] = F2;
	      resultMsg[6] = prob2;
	     	    
	      MPI_Send(resultMsg, 7, MPI_DOUBLE, 0, RESULT, MPI_COMM_WORLD);
	      break;
	    case FINALIZE:
	      MPI_Recv(&dummyINT, 1, MPI_INT, 0, FINALIZE, MPI_COMM_WORLD, &msgStatus);
	      goto FINISH;
	    }
	}
    }
 FINISH:
  MPI_Finalize();   	        
}