void pure_g_ecef(void)
{
	int i;
	double rad1, rad2;
	double vv[3], vv1[3], r_vec[3];

	rad1 = (r0 * r0) / (pure_R * pure_R);
	rad2 = pure_ecef_pos[2] / pure_R;

	pure_ecef_gravity[0] = -(mue / (r0 * r0)) * (pure_ecef_pos[0] / pure_R) * ((rad1 * (1 + epsilon)) + (jconst2 * (rad1 * rad1)) * (1 - 5 * (rad2 * rad2)));
	pure_ecef_gravity[1] = -(mue / (r0 * r0)) * (pure_ecef_pos[1] / pure_R) * ((rad1 * (1 + epsilon)) + (jconst2 * (rad1 * rad1)) * (1 - 5 * (rad2 * rad2)));
	pure_ecef_gravity[2] = -(mue / (r0 * r0)) * (pure_ecef_pos[2] / pure_R) * ((rad1 * (1 + epsilon)) + (jconst2 * (rad1 * rad1)) * (3 - 5 * (rad2 * rad2)));


	for (i = 0; i < 3; i++)
	{
		r_vec[i] = pure_ecef_pos[i];
		vv[i] = 0.0;
	}

	cross(omega, r_vec, vv);
	cross(omega, vv, vv1);
	matsub(3,1,(double*)pure_ecef_gravity, (double*)vv1, (double*)pure_ecef_gravity);

	pure_g_ecef_mag = sqrt((pure_ecef_gravity[0] * pure_ecef_gravity[0]) + (pure_ecef_gravity[1] * pure_ecef_gravity[1]) + (pure_ecef_gravity[2] * pure_ecef_gravity[2]));

	matmulint(3,1,(double*)pure_ecef_gravity,eight_delt,(double*)pure_ecef_gravity);
}	//end of g_ecef()
void pure_vel_update(void)
{
	int i;
	double temp[3];

	matmul(3, 3, (double *)p_dcm, 3, 1, (double *)p_velo_20ms, (double *)pure_vel);

	//earth rate correction
	cross(omg_dub, pure_v_old, temp);
	matmulint(3,1,(double *)temp, eight_delt, (double *)temp);
	matsub(3,1,(double *)pure_vel, (double *)temp, (double *)pure_vel);
	matadd(3,1,(double *)pure_vel, (double *)pure_ecef_gravity, (double *)pure_vel);

	matadd(3,1,(double *)pure_vel, (double *)pure_v_old, (double *)pure_vel);

	matadd(3,1,(double *)pure_vel, (double *)pure_v_old, (double *)pure_vav);
	matmulint(3,1,(double *)pure_vav, 0.5, (double *)pure_vav);

	for (i = 0; i < 3; i++)
		pure_v_old[i] = pure_vel[i];


}
Example #3
0
File: mgfas.c Project: gnovak/bin
void mgfas(double **u, int n, int maxcyc)
{
	double anorm2(double **a, int n);
	void copy(double **aout, double **ain, int n);
	void interp(double **uf, double **uc, int nf);
	void lop(double **out, double **u, int n);
	void matadd(double **a, double **b, double **c, int n);
	void matsub(double **a, double **b, double **c, int n);
	void relax2(double **u, double **rhs, int n);
	void rstrct(double **uc, double **uf, int nc);
	void slvsm2(double **u, double **rhs);
	unsigned int j,jcycle,jj,jm1,jpost,jpre,nf,ng=0,ngrid,nn;
	double **irho[NGMAX+1],**irhs[NGMAX+1],**itau[NGMAX+1],
		**itemp[NGMAX+1],**iu[NGMAX+1];
	double res,trerr;

	nn=n;
	while (nn >>= 1) ng++;
	if (n != 1+(1L << ng)) nrerror("n-1 must be a power of 2 in mgfas.");
	if (ng > NGMAX) nrerror("increase NGMAX in mglin.");
	nn=n/2+1;
	ngrid=ng-1;
	irho[ngrid]=dmatrix(1,nn,1,nn);
	rstrct(irho[ngrid],u,nn);
	while (nn > 3) {
		nn=nn/2+1;
		irho[--ngrid]=dmatrix(1,nn,1,nn);
		rstrct(irho[ngrid],irho[ngrid+1],nn);
	}
	nn=3;
	iu[1]=dmatrix(1,nn,1,nn);
	irhs[1]=dmatrix(1,nn,1,nn);
	itau[1]=dmatrix(1,nn,1,nn);
	itemp[1]=dmatrix(1,nn,1,nn);
	slvsm2(iu[1],irho[1]);
	free_dmatrix(irho[1],1,nn,1,nn);
	ngrid=ng;
	for (j=2;j<=ngrid;j++) {
		nn=2*nn-1;
		iu[j]=dmatrix(1,nn,1,nn);
		irhs[j]=dmatrix(1,nn,1,nn);
		itau[j]=dmatrix(1,nn,1,nn);
		itemp[j]=dmatrix(1,nn,1,nn);
		interp(iu[j],iu[j-1],nn);
		copy(irhs[j],(j != ngrid ? irho[j] : u),nn);
		for (jcycle=1;jcycle<=maxcyc;jcycle++) {
		nf=nn;
			for (jj=j;jj>=2;jj--) {
				for (jpre=1;jpre<=NPRE;jpre++)
					relax2(iu[jj],irhs[jj],nf);
				lop(itemp[jj],iu[jj],nf);
				nf=nf/2+1;
				jm1=jj-1;
				rstrct(itemp[jm1],itemp[jj],nf);
				rstrct(iu[jm1],iu[jj],nf);
				lop(itau[jm1],iu[jm1],nf);
				matsub(itau[jm1],itemp[jm1],itau[jm1],nf);
				if (jj == j)
					trerr=ALPHA*anorm2(itau[jm1],nf);
				rstrct(irhs[jm1],irhs[jj],nf);
				matadd(irhs[jm1],itau[jm1],irhs[jm1],nf);
			}
			slvsm2(iu[1],irhs[1]);
			nf=3;
			for (jj=2;jj<=j;jj++) {
			jm1=jj-1;
			rstrct(itemp[jm1],iu[jj],nf);
			matsub(iu[jm1],itemp[jm1],itemp[jm1],nf);
			nf=2*nf-1;
			interp(itau[jj],itemp[jm1],nf);
			matadd(iu[jj],itau[jj],iu[jj],nf);
			for (jpost=1;jpost<=NPOST;jpost++)
				relax2(iu[jj],irhs[jj],nf);
			}
			lop(itemp[j],iu[j],nf);
			matsub(itemp[j],irhs[j],itemp[j],nf);
			res=anorm2(itemp[j],nf);
			if (res < trerr) break;
		}
	}
	copy(u,iu[ngrid],n);
	for (nn=n,j=ng;j>=1;j--,nn=nn/2+1) {
		free_dmatrix(itemp[j],1,nn,1,nn);
		free_dmatrix(itau[j],1,nn,1,nn);
		free_dmatrix(irhs[j],1,nn,1,nn);
		free_dmatrix(iu[j],1,nn,1,nn);
		if (j != ng && j != 1) free_dmatrix(irho[j],1,nn,1,nn);
	}
}
int main(int argc, char *argv[])
{
	e_epiphany_t Epiphany, *pEpiphany;
	e_mem_t      DRAM,     *pDRAM;
	unsigned int msize;
	float        seed;
	unsigned int addr; //, clocks;
	size_t       sz;
	double       tdiff[4];
	int          result, rerval;
	
	pEpiphany = &Epiphany;
	pDRAM     = &DRAM;
	msize     = 0x00400000;

	get_args(argc, argv);


	fo = stderr;
	fi = stdin;

	printf("\nMatrix: C[%d][%d] = A[%d][%d] * B[%d][%d]\n\n", _Smtx, _Smtx, _Smtx, _Smtx, _Smtx, _Smtx);
	printf("Using %d x %d cores\n\n", _Nside, _Nside);
	seed = 0.0;
	printf("Seed = %f\n", seed);



	// Connect to device for communicating with the Epiphany system
	// Prepare device
	e_set_host_verbosity(H_D0);
	e_init(NULL);
	e_reset_system();

	if (e_alloc(pDRAM, 0x00000000, msize))
	{
		printf("\nERROR: Can't allocate Epiphany DRAM!\n\n");
		exit(1);
	}
	if (e_open(pEpiphany, 0, 0, e_platform.chip[0].rows, e_platform.chip[0].cols))
	{
		printf("\nERROR: Can't establish connection to Epiphany device!\n\n");
		exit(1);
	}

	// Initialize Epiphany "Ready" state
	addr = offsetof(shared_buf_t, core.ready);
	Mailbox.core.ready = 0;
	e_write(pDRAM, 0, 0, addr, &Mailbox.core.ready, sizeof(Mailbox.core.ready));

	printf("Loading program on Epiphany chip...\n");
	e_set_loader_verbosity(ar.verbose);
	result = e_load_group(ar.srecFile, pEpiphany, 0, 0, pEpiphany->rows, pEpiphany->cols, ar.run_target);
	if (result == E_ERR) {
		printf("Error loading Epiphany program.\n");
		exit(1);
	}


	// Generate operand matrices based on a provided seed
	matrix_init(seed);


#ifdef __WIPE_OUT_RESULT_MATRIX__
	// Wipe-out any previous remains in result matrix (for verification)
	addr = offsetof(shared_buf_t, C[0]);
	sz = sizeof(Mailbox.C);
	printf("Writing C[%uB] to address %08x...\n", sz, addr);
	e_write(pDRAM, 0, 0, addr, (void *) Mailbox.C, sz);
#endif

	clock_gettime(CLOCK_MONOTONIC, &timer[0]);

	// Copy operand matrices to Epiphany system
	addr = offsetof(shared_buf_t, A[0]);
	sz = sizeof(Mailbox.A);
	printf("Writing A[%uB] to address %08x...\n", sz, addr);
	e_write(pDRAM, 0, 0, addr, (void *) Mailbox.A, sz);
	
	addr = offsetof(shared_buf_t, B[0]);
	sz = sizeof(Mailbox.B);
	printf("Writing B[%uB] to address %08x...\n", sz, addr);
	e_write(pDRAM, 0, 0, addr, (void *) Mailbox.B, sz);


	// Call the Epiphany matmul() function
	printf("GO Epiphany! ...   ");
	clock_gettime(CLOCK_MONOTONIC, &timer[1]);
	matmul_go(pDRAM);
	clock_gettime(CLOCK_MONOTONIC, &timer[2]);
	printf("Finished calculating Epiphany result.\n");


	// Read result matrix and timing
	addr = offsetof(shared_buf_t, C[0]);
	sz = sizeof(Mailbox.C);
	printf("Reading result from address %08x...\n", addr);
	e_read(pDRAM, 0, 0, addr, (void *) Mailbox.C, sz);

	clock_gettime(CLOCK_MONOTONIC, &timer[3]);


	// Calculate a reference result
	printf("Calculating result on Host ...   ");
	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &timer[4]);
#ifndef __DO_STRASSEN__
	matmul(Mailbox.A, Mailbox.B, Cref, _Smtx);
#else
	matmul_strassen(Mailbox.A, Mailbox.B, Cref, _Smtx);
#endif
	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &timer[5]);
	printf("Finished calculating Host result.\n");


	addr = offsetof(shared_buf_t, core.clocks);
	sz = sizeof(Mailbox.core.clocks);
	printf("Reading time from address %08x...\n", addr);
	e_read(pDRAM,0, 0, addr, &Mailbox.core.clocks, sizeof(Mailbox.core.clocks));
//	clocks = Mailbox.core.clocks;


	// Calculate the difference between the Epiphany result and the reference result
	printf("\n*** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***\n");
	printf("Verifying result correctness ...   ");
	matsub(Mailbox.C, Cref, Cdiff, _Smtx);

	tdiff[0] = (timer[2].tv_sec - timer[1].tv_sec) * 1000 + ((double) (timer[2].tv_nsec - timer[1].tv_nsec) / 1000000.0);//total
	tdiff[1] = (timer[1].tv_sec - timer[0].tv_sec) * 1000 + ((double) (timer[1].tv_nsec - timer[0].tv_nsec) / 1000000.0);//write
	tdiff[2] = (timer[3].tv_sec - timer[2].tv_sec) * 1000 + ((double) (timer[3].tv_nsec - timer[2].tv_nsec) / 1000000.0);//read
	tdiff[3] = (timer[5].tv_sec - timer[4].tv_sec) * 1000 + ((double) (timer[5].tv_nsec - timer[4].tv_nsec) / 1000000.0);//ref


	// If the difference is 0, then the matrices are identical and the
	// calculation was correct
	if (iszero(Cdiff, _Smtx))
	{
		printf("C_epiphany == C_host\n");
		rerval = 0;
	} else {
		printf("\n\nERROR: C_epiphany is different from C_host !!!\n");
		rerval = 1;
	}
	printf("*** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***\n");
	printf("\n");
	printf("Epiphany (compute):  %9.1f msec  (@ %03d MHz)\n"   , tdiff[0], eMHz);
	printf("         (write)  :  %9.1f msec \n"                , tdiff[1]);
	printf("         (read)   :  %9.1f msec\n"                 , tdiff[2]);
	printf("         (*total*):  %9.1f msec\n\n"               , tdiff[2]+tdiff[1]+tdiff[0]);
	printf("Host     (*total*):  %9.1f msec  (@ %03d MHz)\n"   , tdiff[3], aMHz);


#ifdef __DUMP_MATRICES__
	printf("\n\n\n");
	printf("A[][] = \n");
	matprt(Mailbox.A, _Smtx);
	printf("B[][] = \n");
	matprt(Mailbox.B, _Smtx);
	printf("C[][] = \n");
	matprt(Mailbox.C, _Smtx);
	printf("Cref[][] = \n");
	matprt(Cref, _Smtx);

	int i, j;
	for (i=0; i<_Nside; i++)
		for (j=0; j<_Nside; j++)
		{
			e_read(pEpiphany, i, j, 0x2000+0*sizeof(float), &Aepi[(i*_Score+0)*_Smtx + j*_Score], 2*sizeof(float));
			e_read(pEpiphany, i, j, 0x2000+2*sizeof(float), &Aepi[(i*_Score+1)*_Smtx + j*_Score], 2*sizeof(float));
			e_read(pEpiphany, i, j, 0x4000+0*sizeof(float), &Bepi[(i*_Score+0)*_Smtx + j*_Score], 2*sizeof(float));
			e_read(pEpiphany, i, j, 0x4000+2*sizeof(float), &Bepi[(i*_Score+1)*_Smtx + j*_Score], 2*sizeof(float));
		}
	printf("Aepi[][] = \n");
	matprt(Aepi, _Smtx);
	printf("Bepi[][] = \n");
	matprt(Bepi, _Smtx);
#endif

	printf("\n* * *   EPIPHANY FTW !!!   * * *\n");


	// Close connection to device
	if (e_close(pEpiphany))
	{
		printf("\nERROR: Can't close connection to Epiphany device!\n\n");
		exit(1);
	}
	if (e_free(pDRAM))
	{
		printf("\nERROR: Can't release Epiphany DRAM!\n\n");
		exit(1);
	}

	e_finalize();

	return rerval;
}
Example #5
0
void matmul_strassen(double* a, double* b, double* c, int n)
{
    double* tmp1 = (double*) malloc((n*n/4)*sizeof(double));
    double* tmp2 = (double*) malloc((n*n/4)*sizeof(double));
    
    double* a11 = (double*) malloc((n*n/4)*sizeof(double));
    double* a12 = (double*) malloc((n*n/4)*sizeof(double));
    double* a21 = (double*) malloc((n*n/4)*sizeof(double));
    double* a22 = (double*) malloc((n*n/4)*sizeof(double));
    
    double* b11 = (double*) malloc((n*n/4)*sizeof(double));
    double* b12 = (double*) malloc((n*n/4)*sizeof(double));
    double* b21 = (double*) malloc((n*n/4)*sizeof(double));
    double* b22 = (double*) malloc((n*n/4)*sizeof(double));
    
    double* c11 = (double*) malloc((n*n/4)*sizeof(double));
    double* c12 = (double*) malloc((n*n/4)*sizeof(double));
    double* c21 = (double*) malloc((n*n/4)*sizeof(double));
    double* c22 = (double*) malloc((n*n/4)*sizeof(double));
    

    double* m1 = (double*) malloc((n*n/4)*sizeof(double));
    double* m2 = (double*) malloc((n*n/4)*sizeof(double));
    double* m3 = (double*) malloc((n*n/4)*sizeof(double));
    double* m4 = (double*) malloc((n*n/4)*sizeof(double));
    double* m5 = (double*) malloc((n*n/4)*sizeof(double));
    double* m6 = (double*) malloc((n*n/4)*sizeof(double));
    double* m7 = (double*) malloc((n*n/4)*sizeof(double));
	int i, j, k, ii, jj, kk, N;
	double tmp;
    

    // partition A and B
    N = n/2;
    for (i=0; i< N; i++) 
    {

        for (j=0; j< N; j++) 
        {
            a11[i*N +j ] = a[i*n+j];
            b11[i*N +j ] = b[i*n+j];
            
            a12[i*N +j ] = a[i*n+j+N];
            b12[i*N +j ] = b[i*n+j+N];
            
            a21[i*N +j ] = a[(i+N)*n+j];
            b21[i*N +j ] = b[(i+N)*n+j];

            a22[i*N +j ] = a[(i+N)*n+j+N];
            b22[i*N +j ] = b[(i+N)*n+j+N];
        }
    }
    //print(a, n);
    //print(a11, N);
    //print(a12, N);
    //print(a21, N);
    //print(a22, N);
    
    //form m1 = (a11 + a22)(b11 + b22)
    matadd(a11, a22, tmp1, N);
    matadd(b11, b22, tmp2, N);
    matmul(tmp1, tmp2, m1, N);
    
    //form m2 = (a21 + a22)b11
    matadd(a21, a22, tmp1, N);
    matmul(tmp1, b11, m2, N);

    //form m3 = a11(b12 - b22)
    matsub(b12, b22, tmp1, N);
    matmul(a11, tmp1, m3, N);

    //form m4 = a22(b21 - b11)
    matsub(b21, b11, tmp1, N);
    matmul(a22, tmp1, m4, N);
    
    //form m5 = (a11 +a12)b22
    matadd(a11, a12, tmp1, N);
    matmul(tmp1, b22, m5, N);
    
    //form m6 = (a21 -a11)(b11 + b12)
    matsub(a21, a11, tmp1, N);
    matadd(b11, b12, tmp2, N);
    matmul(tmp1, tmp2, m6, N);

    //form m7 = (a12 -a22)(b21 + b22)
    matsub(a12, a22, tmp1, N);
    matadd(b21, b22, tmp2, N);
    matmul(tmp1, tmp2, m7, N);    



    //============================
    //form c11 = m1 + m4 - m5 + m7
    matadd(m1, m4, tmp1, N);
    matsub(tmp1, m5, tmp2, N);
    matadd(tmp2, m7, c11, N);

    //form c12 = m3 + m5
    matadd(m3, m5, c12, N);

    //form c21 = m2 + m4
    matadd(m2, m4, c21, N);

    //fomr c22 = m1 - m2 + m3 + m6
    matsub(m1, m2, tmp1, N);
    matadd(tmp1, m3, tmp2, N);
    matadd(tmp2, m6, c22, N);

    for (i=0; i< N; i++) 
    {

        for (j=0; j< N; j++) 
        {
            c[i*n+j] = c11[i*N +j ];
            c[i*n+j+N] = c12[i*N +j ];
            c[(i+N)*n+j] = c21[i*N +j ];
            c[(i+N)*n+j+N] = c22[i*N +j ]; 
        }
    }
    free(tmp1);
    free(tmp2);

    free(a11);
    free(a12);
    free(a21);
    free(a22);
    
    free(b11);
    free(b12);
    free(b21);
    free(b22);

    free(c11);
    free(c12);
    free(c21);
    free(c22);

    free(m1);
    free(m2);
    free(m3);
    free(m4);
    free(m5);
    free(m6);
    free(m7);


}
Example #6
0
int main(int argc, char *argv[])
{
	p_mem_t shared_mem, results_mem;
	uint32_t eram_base;
	char results[1024] = { '\0' };
	int device_cols, device_rows, nside;
	p_dev_t dev;
	p_prog_t prog;
	p_team_t team;
	p_coords_t size;
	p_coords_t start = { .row = 0, .col = 0 };

	unsigned int msize;
	float        seed;
	unsigned int addr; //, clocks;
	size_t       sz;
	int          verbose=0;
	double       tdiff[3];
	int          result, retval = 0;

	msize     = 0x00400000;

	get_args(argc, argv);

	fo = stderr;
	fi = stdin;
	printf( "------------------------------------------------------------\n");
	printf( "Calculating:   C[%d][%d] = A[%d][%d] * B[%d][%d]\n", _Smtx, _Smtx, _Smtx, _Smtx, _Smtx, _Smtx);
	seed = 0.0;
	if(verbose){
	  printf( "Seed = %f\n", seed);
	}

	dev = p_init(P_DEV_EPIPHANY, 0);
	if (p_error(dev)) {
		fprintf(stderr, "Error initializing PAL\n");
		return p_error(dev);
	}

	device_cols = p_query(dev, P_PROP_COLS);
	device_rows = p_query(dev, P_PROP_ROWS);

	// Use min size
	nside = device_cols > device_rows ? device_cols : device_rows;

	if (nside < 4) {
		fprintf(stderr, "Error: Too small device, need at least 4x4\n");
		return 1;
	}

	// Either 1024, 256, 64, or 16 cores (side must be power of two),
	nside = nside >= 32 ? 32 : nside >= 16 ? 16 : nside >= 8 ? 8 : 4;

	size.row = nside;
	size.col = nside;
	team = p_open4(dev, P_TOPOLOGY_2D, &start, &size);
	printf("Using team of size %d\n", p_team_size(team));
	if (p_error(team)) {
		fprintf(stderr, "Error opening team\n");
		return p_error(team);
	}

	prog = p_load(dev, ar.elfFile, 0);

	eram_base = (unsigned) p_query(dev, P_PROP_MEMBASE);
	shared_mem = p_map(dev, eram_base, msize);

	// Clear mailbox contents
	memset(&Mailbox, 0, sizeof(Mailbox));
	p_write(&shared_mem, &Mailbox, 0, sizeof(Mailbox), 0);

	// Generate operand matrices based on a provided seed
	matrix_init((int)seed);

#ifdef __WIPE_OUT_RESULT_MATRIX__
	// Wipe-out any previous remains in result matrix (for verification)
	addr = offsetof(shared_buf_t, C[0]);
	sz = sizeof(Mailbox.C);
	if(verbose){
	  printf( "Writing C[%uB] to address %08x...\n", (unsigned) sz, addr);
	}
	p_write(&shared_mem, (void *) Mailbox.C, addr, sz, 0);
#endif

	/* Wallclock time */
	clock_gettime(CLOCK_MONOTONIC, &timer[0]);
	/* Clock CPUTIME too. We don't want to indicate failure just
	 * because the system was under high load. */
	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &timer[4]);

	// Copy operand matrices to Epiphany system
	addr = offsetof(shared_buf_t, A[0]);
	sz = sizeof(Mailbox.A);
	if(verbose){
	  printf( "Writing A[%uB] to address %08x...\n", (unsigned) sz, addr);
	}
	p_write(&shared_mem, (void *) Mailbox.A, addr, sz, 0);

	addr = offsetof(shared_buf_t, B[0]);
	sz = sizeof(Mailbox.B);
	if(verbose){
	  printf( "Writing B[%uB] to address %08x...\n", (unsigned) sz, addr);
	}
	p_write(&shared_mem, (void *) Mailbox.B, addr, sz, 0);
	// Call the Epiphany matmul() function

	if(verbose){
	  printf( "GO Epiphany! ...   ");
	}
	if(verbose){
	  printf("Loading program on Epiphany chip...\n");
	}

	p_arg_t args[] = { &nside, sizeof(nside), true };
	if (p_run(prog, "matmul", team, 0, p_team_size(team), 1, args, 0)) {
		fprintf(stderr, "Error loading Epiphany program.\n");
		exit(1);
	}

	// Read result matrix and timing
	addr = offsetof(shared_buf_t, C[0]);
	sz = sizeof(Mailbox.C);
	if(verbose){
	  printf( "Reading result from address %08x...\n", addr);
	}
	p_read(&shared_mem, (void *) Mailbox.C, addr, sz, 0);

	clock_gettime(CLOCK_MONOTONIC, &timer[1]);
	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &timer[5]);


	// Calculate a reference result
	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &timer[2]);
#ifndef __DO_STRASSEN__
	matmul(Mailbox.A, Mailbox.B, Cref, _Smtx);
#else
	matmul_strassen(Mailbox.A, Mailbox.B, Cref, _Smtx);
#endif
	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &timer[3]);
	addr = offsetof(shared_buf_t, core.clocks);
	sz = sizeof(Mailbox.core.clocks);
	if(verbose){
	  printf( "Reading time from address %08x...\n", addr);
	}
	p_read(&shared_mem, &Mailbox.core.clocks, addr, sizeof(Mailbox.core.clocks), 0);
//	clocks = Mailbox.core.clocks;





	// Calculate the difference between the Epiphany result and the reference result
	matsub(Mailbox.C, Cref, Cdiff, _Smtx);

	tdiff[0] = (timer[1].tv_sec - timer[0].tv_sec) * 1000 + ((double) (timer[1].tv_nsec - timer[0].tv_nsec) / 1000000.0);
//	tdiff[0] = ((double) clocks) / eMHz * 1000;
	tdiff[1] = (timer[3].tv_sec - timer[2].tv_sec) * 1000 + ((double) (timer[3].tv_nsec - timer[2].tv_nsec) / 1000000.0);
	tdiff[2] = (timer[5].tv_sec - timer[4].tv_sec) * 1000 + ((double) (timer[5].tv_nsec - timer[4].tv_nsec) / 1000000.0);


	// If the difference is 0, then the matrices are identical and the
	// calculation was correct
	if (iszero(Cdiff, _Smtx))
	  {

	    printf( "Epiphany(time) %9.1f msec  (@ %03d MHz)\n", tdiff[0], eMHz);
	    printf( "Host(time)     %9.1f msec  (@ %03d MHz)\n", tdiff[1], aMHz);
	    printf( "------------------------------------------------------------\n");
	    printf( "TEST \"matmul-16\" PASSED\n");
	    retval = 0;
	} else {
	  printf( "\n\nERROR: C_epiphany is different from C_host !!!\n");
	  printf( "TEST \"matmul-16\" FAILED\n");
	  retval = 1;
	}

#if 0
#ifdef __DUMP_MATRICES__
	printf( "\n\n\n");
	printf( "A[][] = \n");
	matprt(Mailbox.A, _Smtx);
	printf( "B[][] = \n");
	matprt(Mailbox.B, _Smtx);
	printf( "C[][] = \n");
	matprt(Mailbox.C, _Smtx);
	printf( "Cref[][] = \n");
	matprt(Cref, _Smtx);

	int i, j;
	for (i=0; i<_Nside; i++)
		for (j=0; j<_Nside; j++)
		{
			e_read(pEpiphany, i, j, 0x2000+0*sizeof(float), &Aepi[(i*_Score+0)*_Smtx + j*_Score], 2*sizeof(float));
			e_read(pEpiphany, i, j, 0x2000+2*sizeof(float), &Aepi[(i*_Score+1)*_Smtx + j*_Score], 2*sizeof(float));
			e_read(pEpiphany, i, j, 0x4000+0*sizeof(float), &Bepi[(i*_Score+0)*_Smtx + j*_Score], 2*sizeof(float));
			e_read(pEpiphany, i, j, 0x4000+2*sizeof(float), &Bepi[(i*_Score+1)*_Smtx + j*_Score], 2*sizeof(float));
		}
	printf( "Aepi[][] = \n");
	matprt(Aepi, _Smtx);
	printf( "Bepi[][] = \n");
	matprt(Bepi, _Smtx);
#endif
#endif



	// p_unmap ...
	p_close(team);
	p_finalize(dev);

	return retval;
}


// Initialize operand matrices
void matrix_init(int seed)
{
	int i, j, p;

	p = 0;
	for (i=0; i<_Smtx; i++)
		for (j=0; j<_Smtx; j++)
			Mailbox.A[p++] = (i + j + seed) % _MAX_MEMBER_;

	p = 0;
	for (i=0; i<_Smtx; i++)
		for (j=0; j<_Smtx; j++)
			Mailbox.B[p++] = ((i + j) * 2 + seed) % _MAX_MEMBER_;

	p = 0;
	for (i=0; i<_Smtx; i++)
		for (j=0; j<_Smtx; j++)
			Mailbox.C[p++] = 0x8dead;

	return;
}