void TopologyMatrix::calculateForThreeAtoms( const unsigned& iat, const Vector& d1, const double& d1_len, HistogramBead& bead, multicolvar::AtomValuePack& myatoms ) const { // Calculate if there are atoms in the cylinder (can use delta here as pbc are done in atom setup) Vector d2 = getSeparation( myatoms.getPosition(0), myatoms.getPosition(iat) ); // Now calculate projection of d2 on d1 double proj=dotProduct(d2,d1); // This tells us if we are outside the end of the cylinder double excess = proj - d1_len; // Return if we are outside of the cylinder as calculated based on excess if( excess>low_sf( getBaseColvarNumber( myatoms.getIndex(0) ), getBaseColvarNumber( myatoms.getIndex(1) ) ).get_dmax() ) return; // Find the length of the cylinder double binw = binw_mat( getBaseColvarNumber( myatoms.getIndex(0) ), getBaseColvarNumber( myatoms.getIndex(1) ) ); double lcylinder = (std::floor( d1_len / binw ) + 1)*binw; // Return if the projection is outside the length of interest if( proj<-bead.getCutoff() || proj>(lcylinder+bead.getCutoff()) ) return; // Calculate the excess swiching function double edf, eval = low_sf( getBaseColvarNumber( myatoms.getIndex(0) ), getBaseColvarNumber( myatoms.getIndex(1) ) ).calculate( excess, edf ); // Calculate the projection on the perpendicular distance from the center of the tube double cm = d2.modulo2() - proj*proj; // Now calculate the density in the cylinder if( cm<cylinder_sw( getBaseColvarNumber( myatoms.getIndex(0) ), getBaseColvarNumber( myatoms.getIndex(1) ) ).get_dmax2() ) { double dfuncr, val = cylinder_sw( getBaseColvarNumber( myatoms.getIndex(0) ), getBaseColvarNumber( myatoms.getIndex(1) ) ).calculateSqr( cm, dfuncr ); double cellv = cell_volume( getBaseColvarNumber( myatoms.getIndex(0) ), getBaseColvarNumber( myatoms.getIndex(1) ) ); Vector dc1, dc2, dc3, dd1, dd2, dd3, de1, de2, de3; if( !doNotCalculateDerivatives() ) { Tensor d1_a1; // Derivative of director connecting atom1 - atom2 wrt the position of atom 1 d1_a1(0,0) = 
( -(d1[1]*d1[1]+d1[2]*d1[2])/d1_len ); // dx/dx d1_a1(0,1) = ( d1[0]*d1[1]/d1_len ); // dx/dy d1_a1(0,2) = ( d1[0]*d1[2]/d1_len ); // dx/dz d1_a1(1,0) = ( d1[1]*d1[0]/d1_len ); // dy/dx d1_a1(1,1) = ( -(d1[0]*d1[0]+d1[2]*d1[2])/d1_len ); // dy/dy d1_a1(1,2) = ( d1[1]*d1[2]/d1_len ); d1_a1(2,0) = ( d1[2]*d1[0]/d1_len ); d1_a1(2,1) = ( d1[2]*d1[1]/d1_len ); d1_a1(2,2) = ( -(d1[1]*d1[1]+d1[0]*d1[0])/d1_len ); // Calculate derivatives of dot product dd1 = matmul(d2, d1_a1) - d1; dd2 = matmul(d2, -d1_a1); dd3 = d1; // Calculate derivatives of cross product dc1 = dfuncr*( -d2 - proj*dd1 ); dc2 = dfuncr*( -proj*dd2 ); dc3 = dfuncr*( d2 - proj*dd3 ); // Calculate derivatives of excess de1 = edf*excess*( dd1 + d1 ); de2 = edf*excess*( dd2 - d1 ); de3 = edf*excess*dd3; } Vector pos1 = myatoms.getPosition(0) + d1_len*d1; Vector pos2 = myatoms.getPosition(0) + d2; Vector g1derivf,g2derivf,lderivf; Tensor vir; for(unsigned bin=0; bin<maxbins; ++bin) { bead.set( bin*binw, (bin+1)*binw, sigma ); if( proj<(bin*binw-bead.getCutoff()) || proj>binw*(bin+1)+bead.getCutoff() ) continue; double der, contr=bead.calculateWithCutoff( proj, der ) / cellv; der /= cellv; myatoms.addValue( 2+bin, contr*val*eval ); if( !doNotCalculateDerivatives() ) { g1derivf=contr*eval*dc1 + val*eval*der*dd1 + contr*val*de1; addAtomDerivatives( 2+bin, 0, g1derivf, myatoms ); g2derivf=contr*eval*dc2 + val*eval*der*dd2 + contr*val*de2; addAtomDerivatives( 2+bin, 1, g2derivf, myatoms ); lderivf=contr*eval*dc3 + val*eval*der*dd3 + contr*val*de3; addAtomDerivatives( 2+bin, iat, lderivf, myatoms ); // Virial vir = -Tensor( myatoms.getPosition(0), g1derivf ) - Tensor( pos1, g2derivf ) - Tensor( pos2, lderivf ); myatoms.addBoxDerivatives( 2+bin, vir ); } } } }
/* Refract a direction vector at a planar interface (vector form of Snell's law).
 *
 *   dir_in   - incoming direction cosines
 *   normal   - unit normal of the interface
 *   n_in     - refractive index of the medium the ray is leaving
 *   n_out    - refractive index of the medium the ray is entering
 *   dir_out  - receives the refracted direction cosines (must not alias dir_in)
 *
 * The incoming direction is split into a part along the normal and a unit
 * vector in the interface plane; the in-plane magnitude is scaled by
 * n_in/n_out and the normal part is rebuilt, always pointing against `normal`.
 */
static void ray_tracing_v2_refract (const double dir_in[3], const double normal[3],
                                    double n_in, double n_out, double dir_out[3])
{
    double along, across, radicand, len, tang[3];
    int k;

    along = (dir_in[0]*normal[0] + dir_in[1]*normal[1] + dir_in[2]*normal[2]);

    for (k = 0; k < 3; k++)
        tang[k] = dir_in[k] - normal[k]*along;

    len = sqrt (tang[0]*tang[0] + tang[1]*tang[1] + tang[2]*tang[2]);
    if (len == 0) len = 1.0;            /* normal incidence: no in-plane part */
    for (k = 0; k < 3; k++)
        tang[k] = tang[k]/len;

    /* Rounding can push |along| marginally above 1 for near-normal rays;
       clamp so that sqrt() does not return NaN in that case. */
    radicand = 1 - along*along;
    if (radicand < 0) radicand = 0;

    across = sqrt (radicand);
    across = across * n_in/n_out;       /* interface parallel */
    /* NOTE: across > 1 (total internal reflection) still yields NaN here,
       exactly as before; that case is not physically a refraction. */
    along = -sqrt (1 - across*across);  /* interface normal */

    for (k = 0; k < 3; k++)
        dir_out[k] = across*tang[k] + along*normal[k];
}

/* ray-tracing, see HOEHLE and Manual of Photogrammetry
 *
 * Traces the ray of image point (x, y) from the projective centre through the
 * first interface (medium n1 -> n2[0]) and the layer of thickness mm.d[0] to
 * the second interface (n2[0] -> n3).  The interface plane is described by
 * the vector G (its direction is the plane normal, its length the plane
 * offset), so the plane does not have to be horizontal.
 *
 *   x, y           - image coordinates
 *   Ex, I          - exterior orientation (x0, y0, z0, dm) and interior (cc)
 *   G              - glass vector (vec_x, vec_y, vec_z)
 *   mm             - multimedia parameters (n1, n2[0], n3, d[0])
 *   Xb2, Yb2, Zb2  - out: point where the ray leaves the layer
 *   a3, b3, c3     - out: direction cosines of the ray in medium n3
 *
 * A zero-length G or a ray parallel to the interface leads to a division by
 * zero; neither case is handled.
 */
void ray_tracing_v2 (double x, double y, Exterior Ex, Interior I, Glass G, mm_np mm,
                     double *Xb2, double *Yb2, double* Zb2,
                     double *a3, double *b3, double *c3)
{
    double vect1[3], vect2[3], s2;
    double a[3], b[3], base2[3], c, dummy;
    double Xb1, Yb1, Zb1, d1, d2;
    double dir2[3], dir3[3];

    s2 = sqrt (x*x + y*y + I.cc*I.cc);

    /* direction cosines in image coordinate system */
    vect1[0] = x/s2;  vect1[1] = y/s2;  vect1[2] = -I.cc/s2;

    /* direction cosines in space coordinate system, medium n1 */
    matmul (vect2, Ex.dm, vect1, 3,3,1);

    /* line-plane intersection (after Jakob Mann):
       a + b*((pl.c - dot(pl.base[2],a))/dot(pl.base[2],b)) */
    a[0] = Ex.x0;    a[1] = Ex.y0;    a[2] = Ex.z0;
    b[0] = vect2[0]; b[1] = vect2[1]; b[2] = vect2[2];

    c = sqrt (G.vec_x*G.vec_x + G.vec_y*G.vec_y + G.vec_z*G.vec_z);
    base2[0] = G.vec_x/c;  base2[1] = G.vec_y/c;  base2[2] = G.vec_z/c;

    c = c + mm.d[0];
    dummy = base2[0]*a[0] + base2[1]*a[1] + base2[2]*a[2];
    dummy = dummy - c;
    d1 = -dummy/(base2[0]*b[0] + base2[1]*b[1] + base2[2]*b[2]);

    /* point on the interface between n1,n2 */
    Xb1 = a[0] + b[0]*d1;
    Yb1 = a[1] + b[1]*d1;
    Zb1 = a[2] + b[2]*d1;

    /* direction cosines in space coordinate system, medium n2 */
    ray_tracing_v2_refract (b, base2, mm.n1, mm.n2[0], dir2);

    d2 = mm.d[0]/fabs ((base2[0]*dir2[0] + base2[1]*dir2[1] + base2[2]*dir2[2]));

    /* point on the interface between n2,n3 */
    *Xb2 = Xb1 + d2*dir2[0];
    *Yb2 = Yb1 + d2*dir2[1];
    *Zb2 = Zb1 + d2*dir2[2];

    /* direction cosines in space coordinate system, medium mm.n3 */
    ray_tracing_v2_refract (dir2, base2, mm.n2[0], mm.n3, dir3);

    *a3 = dir3[0];
    *b3 = dir3[1];
    *c3 = dir3[2];
}
/// Returns the product of the transposed invBox with d.
/// Presumably this maps a real-space vector onto scaled (box-relative)
/// coordinates, as the name says - the definition of invBox is not visible here.
Vector Pbc::realToScaled(const Vector&d)const {
  Vector scaled=matmul(invBox.transpose(),d);
  return scaled;
}
/* raw_orient() iteratively adjusts the six exterior orientation parameters
 * (x0, y0, z0, omega, phi, kappa) of a camera from a few known points, with
 * all additional (distortion) parameters neutralised.
 *
 * Arguments:
 *   Calibration *cal  - calibration to adjust; added_par is reset and ext_par
 *                       is updated in place.
 *   control_par *cpar - control parameters (pixel metrics, multimedia data).
 *   int nfix          - number of points, 1..5.  Each point contributes two
 *                       observation rows and the design matrix holds 10 rows.
 *   vec3d fix[]       - known 3D positions, in the same order as pix[].
 *   target pix[]      - matching image targets (pixel coordinates).
 *
 * Returns:
 *   1 if all corrections of the last iteration were within the threshold,
 *   0 if the iteration limit was reached first, or if nfix is outside 1..5
 *   (in which case cal is left untouched).
 */
int raw_orient (Calibration* cal, control_par *cpar, int nfix, vec3d fix[], target pix[])
{
    /* Sizes of the normal-equation system and iteration limits. */
    enum {
        RAW_ORIENT_MAX_PTS  = 5,    /* two rows per point -> 10 rows */
        RAW_ORIENT_NUM_PAR  = 6,
        RAW_ORIENT_MAX_ITER = 20
    };
    const double beta_limit = 0.1;  /* largest correction accepted as converged */

    double X[2*RAW_ORIENT_MAX_PTS][RAW_ORIENT_NUM_PAR], y[2*RAW_ORIENT_MAX_PTS];
    double XPX[RAW_ORIENT_NUM_PAR][RAW_ORIENT_NUM_PAR];
    double XPy[RAW_ORIENT_NUM_PAR], beta[RAW_ORIENT_NUM_PAR];
    int i, j, n, itnum, stopflag;
    double dm = 0.0001, drad = 0.0001;
    double xp, yp, xc, yc;
    vec3d pos;

    /* More than RAW_ORIENT_MAX_PTS points would write past the end of X and y;
       no points at all would hand an all-zero system to matinv(). */
    if (nfix < 1 || nfix > RAW_ORIENT_MAX_PTS)
        return 0;

    /* init X, y (set to zero) */
    for (i = 0; i < 2*RAW_ORIENT_MAX_PTS; i++) {
        for (j = 0; j < RAW_ORIENT_NUM_PAR; j++)
            X[i][j] = 0;
        y[i] = 0;
    }

    /* neutral additional parameters */
    cal->added_par.k1 = 0;
    cal->added_par.k2 = 0;
    cal->added_par.k3 = 0;
    cal->added_par.p1 = 0;
    cal->added_par.p2 = 0;
    cal->added_par.scx = 1;
    cal->added_par.she = 0;

    /* main loop, program runs through it until none of the beta values
       comes over the threshold, or the iteration limit is hit */
    itnum = 0;
    stopflag = 0;
    while ((stopflag == 0) && (itnum < RAW_ORIENT_MAX_ITER)) {
        ++itnum;

        for (i = 0, n = 0; i < nfix; i++) {
            /* we do not check the order - trust the user to click the points
               in the correct order of appearance in man_ori and in the
               calibration parameters GUI */
            pixel_to_metric (&xc, &yc, pix[i].x, pix[i].y, cpar);

            /* no corrections as additional parameters are neglected
               correct_brown_affin (xc, yc, cal->added_par, &xc, &yc); */

            /* every calibration dot is projected to the mm position, xp, yp */
            vec_set(pos, fix[i][0], fix[i][1], fix[i][2]);
            rotation_matrix(&(cal->ext_par));
            img_coord (pos, cal, cpar->mm, &xp, &yp);

            /* numeric derivatives wrt the exterior parameters fill two rows */
            num_deriv_exterior(cal, cpar, dm, drad, pos, X[n], X[n + 1]);

            y[n] = xc - xp;
            y[n+1] = yc - yp;

            n += 2;
        }

        /* Gauss Markoff Model */
        ata ((double *) X, (double *) XPX, n, RAW_ORIENT_NUM_PAR, RAW_ORIENT_NUM_PAR);
        matinv ((double *) XPX, RAW_ORIENT_NUM_PAR, RAW_ORIENT_NUM_PAR);
        atl ((double *) XPy, (double *) X, y, n, RAW_ORIENT_NUM_PAR, RAW_ORIENT_NUM_PAR);
        matmul ((double *) beta, (double *) XPX, (double *) XPy,
                RAW_ORIENT_NUM_PAR, RAW_ORIENT_NUM_PAR, 1,
                RAW_ORIENT_NUM_PAR, RAW_ORIENT_NUM_PAR);

        stopflag = 1;
        for (i = 0; i < RAW_ORIENT_NUM_PAR; i++) {
            if (fabs (beta[i]) > beta_limit)
                stopflag = 0;
        }

        cal->ext_par.x0 += beta[0];
        cal->ext_par.y0 += beta[1];
        cal->ext_par.z0 += beta[2];
        cal->ext_par.omega += beta[3];
        cal->ext_par.phi += beta[4];
        cal->ext_par.kappa += beta[5];
    }

    /* refresh the rotation matrix for the final angles on success */
    if (stopflag) {
        rotation_matrix(&(cal->ext_par));
    }

    return stopflag;
}
int main() { double *a; double *b; double *c; int i = 0, j = 0, k = 0; int *events; // Array of events long long *values; // Array of values events int EventSet = PAPI_NULL; // Handle for a PAPI event set as created by PAPI_create_eventset (3) int retval; // Test fail function int num_event = 0; // Number of events int max_event; // Number of available events int EventCode = 0; // Event code PAPI_event_info_t pset; // PAPI_event_info_t Struct Reference char evname[PAPI_MAX_STR_LEN]; // Symbol event /* Memory asignament to matrixs*/ if((a = (double *)malloc(mrows * ncolumns * sizeof(double))) == NULL) printf("Error malloc matrix a[%d]\n",mrows * ncolumns); if((b = (double *)malloc(ncolumns * pcolumns * sizeof(double))) == NULL) printf("Error malloc matrix b[%d]\n",mrows * ncolumns); if((c = (double *)malloc(mrows * pcolumns * sizeof(double))) == NULL) printf("Error malloc matrix c[%d]\n",mrows * ncolumns); /* Initialize the Matrix arrays */ initmat(a, b, mrows, ncolumns, pcolumns); /* Initialize the PAPI library */ retval = PAPI_library_init(PAPI_VER_CURRENT); if (retval != PAPI_VER_CURRENT) test_fail( __FILE__, __LINE__, "PAPI_library_init", retval ); /* Enable and initialize multiplex support */ retval = PAPI_multiplex_init(); if ( retval != PAPI_OK ) test_fail( __FILE__, __LINE__, "PAPI_multiplex_init", retval ); /* Create an EventSet */ retval = PAPI_create_eventset(&EventSet); if ( retval != PAPI_OK ) test_fail( __FILE__, __LINE__, "PAPI_create_eventset", retval ); /* Assign it to the CPU component */ retval = PAPI_assign_eventset_component(EventSet, 0); if ( retval != PAPI_OK ) test_fail( __FILE__, __LINE__, "PAPI_assign_eventset_component", retval ); /* Convert the EventSet to a multiplexed event set */ retval = PAPI_set_multiplex(EventSet); if ( retval != PAPI_OK ) test_fail( __FILE__, __LINE__, "PAPI_set_multiplex", retval ); /* Obtaining the number of available events */ max_event = PAPI_get_opt( PAPI_MAX_MPX_CTRS, NULL ); printf("\nNumber of available 
events: %d", max_event ); /* Fill up the event set with as many non-derived events as we can */ EventCode = PAPI_PRESET_MASK; do { if ( PAPI_get_event_info( EventCode, &pset ) == PAPI_OK ) { if ( pset.count && ( strcmp( pset.derived, "NOT_DERIVED" ) == 0 ) ) { retval = PAPI_add_event( EventSet, ( int ) pset.event_code ); if ( retval != PAPI_OK ) test_fail( __FILE__, __LINE__, "PAPI_add_event", retval ); else { //printf( "Added %s\n", pset.symbol ); num_event++; } } } } while ( ( PAPI_enum_event( &EventCode, PAPI_PRESET_ENUM_AVAIL ) == PAPI_OK ) && ( num_event < max_event ) ); /* Memory asignament to values and events*/ events = ( int * ) malloc( ( size_t ) num_event * sizeof ( int ) ); if ( events == NULL ) test_fail( __FILE__, __LINE__, "Error malloc events", 0 ); values = ( long long * ) malloc( ( size_t ) num_event * sizeof ( long long ) ); if ( values == NULL ) test_fail( __FILE__, __LINE__, "Erro malloc values", 0 ); /* Start counting events */ if ((retval=PAPI_start(EventSet)) != PAPI_OK) test_fail(__FILE__, __LINE__, "PAPI_start", retval); /* Matrix-Matrix multiply */ matmul(a, b, c, mrows, ncolumns, pcolumns); /* Read the counters */ if ((retval=PAPI_read( EventSet, values )) != PAPI_OK) test_fail(__FILE__, __LINE__, "PAPI_read_counters", retval); /* Stop counting events */ if ((retval=PAPI_stop( EventSet, values )) != PAPI_OK) test_fail(__FILE__, __LINE__, "PAPI_stop_counters", retval); /* List the events in the event set */ retval = PAPI_list_events( EventSet, events, &num_event ); if ( retval != PAPI_OK ) test_fail( __FILE__, __LINE__, "PAPI_list_events", retval ); /* Print results */ printf("\nNumber of non-zero events: %d\n", num_event ); printf( "\nCounts of non-zero available events........................................................\n" ); printf("Name: \t\t\t Value: \t Description:\n"); for ( i = 0; i < num_event; i++ ) { PAPI_event_code_to_name( events[i], evname ); // Obtaining name of available events PAPI_get_event_info(events[i], &pset); if 
( values[i] != 0 ) printf("%s \t %15lld \t %s\n", evname, values[i], pset.long_descr); } printf( "\nCounts of zero available events............................................................\n" ); printf("Name: \t\t\t Value: \t Description:\n"); for ( i = 0; i < num_event; i++ ) { PAPI_event_code_to_name( events[i], evname ); // Obtaining name of available events PAPI_get_event_info(events[i], &pset); if ( values[i] == 0 ) printf("%s \t %15lld \t %s\n", evname, values[i], pset.long_descr); } /* Check if counter pair(s) had identical values */ for ( i = 0; i < num_event; i++ ) { for ( i = j+1; j < num_event; j++ ) { if ( ( i != j ) && ( values[i] == values[j] ) ) k++; } } if ( k != 0 ) { printf( "\nCaution: %d counter pair(s) had identical values\n", k ); } printf("\n"); /* Free memory */ free( events ); free( values ); free( a ); free( b ); free( c ); /* Cleaning events */ retval = PAPI_cleanup_eventset( EventSet ); if ( retval != PAPI_OK ) test_fail( __FILE__, __LINE__, "PAPI_cleanup_eventset", retval ); /* Destroying events */ retval = PAPI_destroy_eventset( &EventSet ); if ( retval != PAPI_OK ) test_fail( __FILE__, __LINE__, "PAPI_destroy_eventset", retval ); return 0; }
void varinit(void) { int i; /* * Resetting all flags */ Intr1_Cnt=0; Intr2_Cnt=0; IRQ1Flag = 1; IRQ2Flag = 1; WSZ = 34; TA_cnt =0; count = 0; qcnt = 0; velcnt = 0; rtime = 0.0; rcnt = 0; cnt_10ms = 0; latm = MasterLat; longm = MasterLon; epsilon = 0.0; four_delt = 4.0 * del_t; eight_delt = 8.0 * del_t; cdr_delt = cdr * del_t; cdr_delt_ms = cdr_delt / 3600; for(i=0;i<32;i++){ Array_SA[i] = 0; } for(i=0;i<3;i++) { velo_ref_y[i] = 0.0; velo_ref_yold[i] = 0.0;; velo_ref_x[i] = 0.0; velo_ref_xold[i] = 0.0; pure_vel[i] = 0.0; p_velo_20ms[i] = 0.0; p_velo[i] = 0.0; pure_v_old[i] = 0.0; p_Ang[i] = 0.0; pure_gyro_drift[i] = 0.0; pure_acc_residu[i] = 0.0; } #if 0 /* these are known misalignment angles between M and S - * Measured w.r.t Master to give DCM from slave to Master. * Beware they are not between slave to NED */ known_si = 0.0 * cdr; known_theta = 0.0 * cdr; known_phi = 0.0 * cdr; euler2dcm_stp(0, 0, 0, (double*)CSkew_est); transpose(3, 3, (double*)CSkew_est, (double*)CSkew_est_T); euler2dcm_stp(known_si, known_theta, known_phi, (double*)CS2M_K); transpose(3, 3, (double*)CS2M_K, (double*)CM2S_K); euler2dcm_stp(THDG, PITCH, ROLL, (double*)Cb2ned_M); matmul(3, 3, (double*)Cb2ned_M, 3, 3, (double*)CS2M_K, (double*)Cb2ned_S); if(ta_flag==1 && nav_flag==1) { dcm2quat((double*)Cb2ned_S, (double *)p_q_body2ned); } else if(ta_flag ==0 && level_flag==1) #endif { euler2quat_spt(mdl_si,mdl_phi,mdl_theta,(double *)p_q_body2ned); p_si = mdl_si; p_phi = mdl_phi; p_theta = mdl_theta; } ned2ecef_q(latm, longm,(double*) q_ned2ecef); quat_mult((double*)q_ned2ecef,(double*)p_q_body2ned, (double*)p_q_body2ecef); /* * Modification after Manjit discussion */ quat2dcm((double *)p_q_body2ecef,(double*)p_dcm); quat2dcm((double *)q_ned2ecef,(double*)p_dcm_n); matmul(3,3, (double*)p_dcm_n,3,1,(double*)MasterVel,(double*)pure_vel); pure_v_old[0] = pure_vel[0]; pure_v_old[1] = pure_vel[1]; pure_v_old[2] = pure_vel[2]; init(0.0, 0.0, 0.0, p_velo_20ms); init(0.0, 0.0, 0.0, p_velo); 
init(0.0,0.0,0.0,pure_gyro_drift); init(0.0,0.0,0.0,pure_acc_residu); for (i = 0; i < 3; i++) { p_alp1[i] = 0.0; p_alp2[i] = 0.0; p_alp3[i] = 0.0; p_alp4[i] = 0.0; } for (i = 0; i < 3; i++) Delta_Angle[i] = 0.0; for (i = 0; i < 6; i++) accum1[i] = 0.0; init(0.0, 0.0, earth_rate, omega); //earth rate vector ECEF //used in levelling Ned_omega[0] = earth_rate * cos(latm); Ned_omega[1] = 0.0; Ned_omega[2] = -earth_rate *sin(latm); for (i = 0; i < 3; i++) omg_dub[i] = 2.0 * omega[i]; r_init = r0 * (1.0 - eccen * (sin(latm) * sin(latm))); pure_R = r_init + MasterAlt; // altitude; lla2ecef(latm,longm,MasterAlt,(double *)pure_ecef_pos); //input is geodetic pure_g_ecef(); /**** for epsilon estimation ****/ init(0.0, 0.0, -pure_g_ecef_mag, Ned_gravity_detic); } //end of varinit()
int main(int argc, char *argv[]) { e_epiphany_t Epiphany, *pEpiphany; e_mem_t DRAM, *pDRAM; unsigned int msize; float seed; unsigned int addr; //, clocks; size_t sz; double tdiff[4]; int result, rerval; pEpiphany = &Epiphany; pDRAM = &DRAM; msize = 0x00400000; get_args(argc, argv); fo = stderr; fi = stdin; printf("\nMatrix: C[%d][%d] = A[%d][%d] * B[%d][%d]\n\n", _Smtx, _Smtx, _Smtx, _Smtx, _Smtx, _Smtx); printf("Using %d x %d cores\n\n", _Nside, _Nside); seed = 0.0; printf("Seed = %f\n", seed); // Connect to device for communicating with the Epiphany system // Prepare device e_set_host_verbosity(H_D0); e_init(NULL); e_reset_system(); if (e_alloc(pDRAM, 0x00000000, msize)) { printf("\nERROR: Can't allocate Epiphany DRAM!\n\n"); exit(1); } if (e_open(pEpiphany, 0, 0, e_platform.chip[0].rows, e_platform.chip[0].cols)) { printf("\nERROR: Can't establish connection to Epiphany device!\n\n"); exit(1); } // Initialize Epiphany "Ready" state addr = offsetof(shared_buf_t, core.ready); Mailbox.core.ready = 0; e_write(pDRAM, 0, 0, addr, &Mailbox.core.ready, sizeof(Mailbox.core.ready)); printf("Loading program on Epiphany chip...\n"); e_set_loader_verbosity(ar.verbose); result = e_load_group(ar.srecFile, pEpiphany, 0, 0, pEpiphany->rows, pEpiphany->cols, ar.run_target); if (result == E_ERR) { printf("Error loading Epiphany program.\n"); exit(1); } // Generate operand matrices based on a provided seed matrix_init(seed); #ifdef __WIPE_OUT_RESULT_MATRIX__ // Wipe-out any previous remains in result matrix (for verification) addr = offsetof(shared_buf_t, C[0]); sz = sizeof(Mailbox.C); printf("Writing C[%uB] to address %08x...\n", sz, addr); e_write(pDRAM, 0, 0, addr, (void *) Mailbox.C, sz); #endif clock_gettime(CLOCK_MONOTONIC, &timer[0]); // Copy operand matrices to Epiphany system addr = offsetof(shared_buf_t, A[0]); sz = sizeof(Mailbox.A); printf("Writing A[%uB] to address %08x...\n", sz, addr); e_write(pDRAM, 0, 0, addr, (void *) Mailbox.A, sz); addr = offsetof(shared_buf_t, 
B[0]); sz = sizeof(Mailbox.B); printf("Writing B[%uB] to address %08x...\n", sz, addr); e_write(pDRAM, 0, 0, addr, (void *) Mailbox.B, sz); // Call the Epiphany matmul() function printf("GO Epiphany! ... "); clock_gettime(CLOCK_MONOTONIC, &timer[1]); matmul_go(pDRAM); clock_gettime(CLOCK_MONOTONIC, &timer[2]); printf("Finished calculating Epiphany result.\n"); // Read result matrix and timing addr = offsetof(shared_buf_t, C[0]); sz = sizeof(Mailbox.C); printf("Reading result from address %08x...\n", addr); e_read(pDRAM, 0, 0, addr, (void *) Mailbox.C, sz); clock_gettime(CLOCK_MONOTONIC, &timer[3]); // Calculate a reference result printf("Calculating result on Host ... "); clock_gettime(CLOCK_THREAD_CPUTIME_ID, &timer[4]); #ifndef __DO_STRASSEN__ matmul(Mailbox.A, Mailbox.B, Cref, _Smtx); #else matmul_strassen(Mailbox.A, Mailbox.B, Cref, _Smtx); #endif clock_gettime(CLOCK_THREAD_CPUTIME_ID, &timer[5]); printf("Finished calculating Host result.\n"); addr = offsetof(shared_buf_t, core.clocks); sz = sizeof(Mailbox.core.clocks); printf("Reading time from address %08x...\n", addr); e_read(pDRAM,0, 0, addr, &Mailbox.core.clocks, sizeof(Mailbox.core.clocks)); // clocks = Mailbox.core.clocks; // Calculate the difference between the Epiphany result and the reference result printf("\n*** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***\n"); printf("Verifying result correctness ... 
"); matsub(Mailbox.C, Cref, Cdiff, _Smtx); tdiff[0] = (timer[2].tv_sec - timer[1].tv_sec) * 1000 + ((double) (timer[2].tv_nsec - timer[1].tv_nsec) / 1000000.0);//total tdiff[1] = (timer[1].tv_sec - timer[0].tv_sec) * 1000 + ((double) (timer[1].tv_nsec - timer[0].tv_nsec) / 1000000.0);//write tdiff[2] = (timer[3].tv_sec - timer[2].tv_sec) * 1000 + ((double) (timer[3].tv_nsec - timer[2].tv_nsec) / 1000000.0);//read tdiff[3] = (timer[5].tv_sec - timer[4].tv_sec) * 1000 + ((double) (timer[5].tv_nsec - timer[4].tv_nsec) / 1000000.0);//ref // If the difference is 0, then the matrices are identical and the // calculation was correct if (iszero(Cdiff, _Smtx)) { printf("C_epiphany == C_host\n"); rerval = 0; } else { printf("\n\nERROR: C_epiphany is different from C_host !!!\n"); rerval = 1; } printf("*** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***\n"); printf("\n"); printf("Epiphany (compute): %9.1f msec (@ %03d MHz)\n" , tdiff[0], eMHz); printf(" (write) : %9.1f msec \n" , tdiff[1]); printf(" (read) : %9.1f msec\n" , tdiff[2]); printf(" (*total*): %9.1f msec\n\n" , tdiff[2]+tdiff[1]+tdiff[0]); printf("Host (*total*): %9.1f msec (@ %03d MHz)\n" , tdiff[3], aMHz); #ifdef __DUMP_MATRICES__ printf("\n\n\n"); printf("A[][] = \n"); matprt(Mailbox.A, _Smtx); printf("B[][] = \n"); matprt(Mailbox.B, _Smtx); printf("C[][] = \n"); matprt(Mailbox.C, _Smtx); printf("Cref[][] = \n"); matprt(Cref, _Smtx); int i, j; for (i=0; i<_Nside; i++) for (j=0; j<_Nside; j++) { e_read(pEpiphany, i, j, 0x2000+0*sizeof(float), &Aepi[(i*_Score+0)*_Smtx + j*_Score], 2*sizeof(float)); e_read(pEpiphany, i, j, 0x2000+2*sizeof(float), &Aepi[(i*_Score+1)*_Smtx + j*_Score], 2*sizeof(float)); e_read(pEpiphany, i, j, 0x4000+0*sizeof(float), &Bepi[(i*_Score+0)*_Smtx + j*_Score], 2*sizeof(float)); e_read(pEpiphany, i, j, 0x4000+2*sizeof(float), &Bepi[(i*_Score+1)*_Smtx + j*_Score], 2*sizeof(float)); } printf("Aepi[][] = \n"); matprt(Aepi, _Smtx); printf("Bepi[][] = \n"); matprt(Bepi, _Smtx); 
#endif printf("\n* * * EPIPHANY FTW !!! * * *\n"); // Close connection to device if (e_close(pEpiphany)) { printf("\nERROR: Can't close connection to Epiphany device!\n\n"); exit(1); } if (e_free(pDRAM)) { printf("\nERROR: Can't release Epiphany DRAM!\n\n"); exit(1); } e_finalize(); return rerval; }
/*
 * ixform: update both matrices held in space t.
 * t->t is combined with m through matmul() and t->tinv is combined with inv
 * through matmulr().  Presumably inv is the inverse of m, so that t->tinv
 * stays the inverse of t->t; this function does not check that.
 */
void ixform(Space *t, Matrix m, Matrix inv){ matmul(t->t, m); matmulr(t->tinv, inv); }
/*
 * One level of Strassen's algorithm: c = a * b for n x n row-major matrices.
 *
 * a, b and c are split into four (n/2) x (n/2) quadrants, seven quadrant
 * products m1..m7 are formed with the plain matmul(), and the quadrants of c
 * are assembled from them.
 *
 *   a, b - operands, n*n doubles each
 *   c    - result, n*n doubles, overwritten
 *   n    - matrix order; nothing is done for n <= 0
 *
 * The quadrant split only works for even n.  For odd n, and when the scratch
 * memory cannot be allocated, the product is computed by matmul() directly.
 */
void matmul_strassen(double* a, double* b, double* c, int n)
{
    enum { NUM_SCRATCH = 21 };      /* tmp1, tmp2, 4+4+4 quadrants, m1..m7 */
    double *pool;
    double *tmp1, *tmp2;
    double *a11, *a12, *a21, *a22;
    double *b11, *b12, *b21, *b22;
    double *c11, *c12, *c21, *c22;
    double *m1, *m2, *m3, *m4, *m5, *m6, *m7;
    size_t quad;
    int i, j, N;

    if (n <= 0)
        return;

    /* An odd order cannot be partitioned into equal quadrants */
    if (n % 2 != 0) {
        matmul(a, b, c, n);
        return;
    }

    N = n/2;
    quad = (size_t) N * (size_t) N;     /* elements per quadrant, no int overflow */

    /* One allocation for all scratch matrices: a single failure point */
    pool = (double*) malloc(NUM_SCRATCH * quad * sizeof(double));
    if (pool == NULL) {
        matmul(a, b, c, n);
        return;
    }

    tmp1 = pool;
    tmp2 = tmp1 + quad;
    a11 = tmp2 + quad;  a12 = a11 + quad;  a21 = a12 + quad;  a22 = a21 + quad;
    b11 = a22 + quad;   b12 = b11 + quad;  b21 = b12 + quad;  b22 = b21 + quad;
    c11 = b22 + quad;   c12 = c11 + quad;  c21 = c12 + quad;  c22 = c21 + quad;
    m1 = c22 + quad;    m2 = m1 + quad;    m3 = m2 + quad;    m4 = m3 + quad;
    m5 = m4 + quad;     m6 = m5 + quad;    m7 = m6 + quad;

    /* partition A and B */
    for (i = 0; i < N; i++) {
        for (j = 0; j < N; j++) {
            a11[i*N + j] = a[i*n + j];
            b11[i*N + j] = b[i*n + j];

            a12[i*N + j] = a[i*n + j + N];
            b12[i*N + j] = b[i*n + j + N];

            a21[i*N + j] = a[(i+N)*n + j];
            b21[i*N + j] = b[(i+N)*n + j];

            a22[i*N + j] = a[(i+N)*n + j + N];
            b22[i*N + j] = b[(i+N)*n + j + N];
        }
    }

    /* form m1 = (a11 + a22)(b11 + b22) */
    matadd(a11, a22, tmp1, N);
    matadd(b11, b22, tmp2, N);
    matmul(tmp1, tmp2, m1, N);

    /* form m2 = (a21 + a22)b11 */
    matadd(a21, a22, tmp1, N);
    matmul(tmp1, b11, m2, N);

    /* form m3 = a11(b12 - b22) */
    matsub(b12, b22, tmp1, N);
    matmul(a11, tmp1, m3, N);

    /* form m4 = a22(b21 - b11) */
    matsub(b21, b11, tmp1, N);
    matmul(a22, tmp1, m4, N);

    /* form m5 = (a11 + a12)b22 */
    matadd(a11, a12, tmp1, N);
    matmul(tmp1, b22, m5, N);

    /* form m6 = (a21 - a11)(b11 + b12) */
    matsub(a21, a11, tmp1, N);
    matadd(b11, b12, tmp2, N);
    matmul(tmp1, tmp2, m6, N);

    /* form m7 = (a12 - a22)(b21 + b22) */
    matsub(a12, a22, tmp1, N);
    matadd(b21, b22, tmp2, N);
    matmul(tmp1, tmp2, m7, N);

    /* form c11 = m1 + m4 - m5 + m7 */
    matadd(m1, m4, tmp1, N);
    matsub(tmp1, m5, tmp2, N);
    matadd(tmp2, m7, c11, N);

    /* form c12 = m3 + m5 */
    matadd(m3, m5, c12, N);

    /* form c21 = m2 + m4 */
    matadd(m2, m4, c21, N);

    /* form c22 = m1 - m2 + m3 + m6 */
    matsub(m1, m2, tmp1, N);
    matadd(tmp1, m3, tmp2, N);
    matadd(tmp2, m6, c22, N);

    /* assemble C from its quadrants */
    for (i = 0; i < N; i++) {
        for (j = 0; j < N; j++) {
            c[i*n + j]         = c11[i*N + j];
            c[i*n + j + N]     = c12[i*N + j];
            c[(i+N)*n + j]     = c21[i*N + j];
            c[(i+N)*n + j + N] = c22[i*N + j];
        }
    }

    free(pool);
}
void StrandSPLam::lhsDisFluxJacobian(const int& npts, const double* A, const double* xv, const double* ql, const double* qr, double* M) { // absolute Jacobian matrix (Roe) int iq,iA,iM; double Ax,Ay,ds,Nx,Ny,Tx,Ty,rl,rul,rvl,re,rqq,p,rhl,rr,rur,rvr,rhr, dd,u,v,h,qq,cc,ccr,ccr2,c,ut,un,l[4],a,dlim=1.,R[nq*nq],S[nq*nq]; for (int n=0; n<npts; n++){ iq = nq *n; iA = ndim*n; iM = nq*nq*n; Ax = A[iA ]; Ay = A[iA+1]; ds = sqrt(Ax*Ax+Ay*Ay); Nx = Ax/ds; Ny = Ay/ds; Tx = Ny; Ty =-Nx; rl = ql[iq ]; rul = ql[iq+1]; rvl = ql[iq+2]; re = ql[iq+3]; rqq =(rul*rul+rvl*rvl)/rl; p = gm1*(re-.5*rqq); rhl = re+p; rr = qr[iq ]; rur = qr[iq+1]; rvr = qr[iq+2]; re = qr[iq+3]; rqq =(rur*rur+rvr*rvr)/rr; p = gm1*(re-.5*rqq); rhr = re+p; rl = sqrt(rl); rr = sqrt(rr); dd = 1./(rl+rr); rl = 1./rl; rr = 1./rr; u =(rul*rl+rur*rr)*dd; v =(rvl*rl+rvr*rr)*dd; h =(rhl*rl+rhr*rr)*dd; qq = .5*(u*u+v*v); cc = gm1*(h-qq); ccr = 1./cc; ccr2 = .5*ccr; c = sqrt(cc); ut = u*Tx+v*Ty; un = u*Nx+v*Ny-xv[n]/ds; l[0] = ds*fabs(un ); l[1] = ds*fabs(un ); l[2] = ds*fabs(un+c); l[3] = ds*fabs(un-c); a = dlim*c*ds; for (int k=0; k<nq; k++) if (l[k] < a) l[k] = .5*(a+l[k]*l[k]/a); R[0 ] = l[0]; R[1 ] = 0.; R[2 ] = l[2]; R[3 ] = l[3]; R[4 ] = l[0]*u; R[5 ] = l[1]*Tx; R[6 ] = l[2]*(u+Nx*c); R[7 ] = l[3]*(u-Nx*c); R[8 ] = l[0]*v; R[9 ] = l[1]*Ty; R[10] = l[2]*(v+Ny*c); R[11] = l[3]*(v-Ny*c); R[12] = l[0]*qq; R[13] = l[1]*ut; R[14] = l[2]*(h+un*c); R[15] = l[3]*(h-un*c); S[0 ] =-gm1*ccr*qq+1.; S[1 ] = gm1*ccr*u; S[2 ] = gm1*ccr*v; S[3 ] =-gm1*ccr; S[4 ] =-ut; S[5 ] = Tx; S[6 ] = Ty; S[7 ] = 0.; S[8 ] = ccr2*(gm1*qq-c*un); S[9 ] =-ccr2*(gm1*u -c*Nx); S[10] =-ccr2*(gm1*v -c*Ny); S[11] = ccr2* gm1; S[12] = ccr2*(gm1*qq+c*un); S[13] =-ccr2*(gm1*u +c*Nx); S[14] =-ccr2*(gm1*v +c*Ny); S[15] = ccr2* gm1; matmul(nq,nq,nq,&R[0],&S[0],&M[iM]); M[iM+0 ] = ds*(fabs(un)+c); M[iM+1 ] = 0.; M[iM+2 ] = 0.; M[iM+3 ] = 0.; M[iM+4 ] = 0.; M[iM+5 ] = ds*(fabs(un)+c); M[iM+6 ] = 0.; M[iM+7 ] = 0.; M[iM+8 ] = 0.; M[iM+9 ] = 0.; M[iM+10] = 
ds*(fabs(un)+c); M[iM+11] = 0.; M[iM+12] = 0.; M[iM+13] = 0.; M[iM+14] = 0.; M[iM+15] = ds*(fabs(un)+c); } }
/* Adapter with a three-argument signature around matmul(): forwards out, v and
 * matrix unchanged and fixes matmul()'s fourth argument to 1.
 * Presumably that argument selects the transposed product (hence the "t" in
 * the name) - confirm against matmul()'s definition. */
static void hbhankel_tmatmul(double* out,
                             const double* v,
                             const void* matrix) {
  const int flag = 1;

  matmul(out, v, matrix, flag);
}
void ERMSD::calcMat(const std::vector<Vector> & positions,const Pbc& pbc, std::vector<Vector4d> &mat, std::vector<TensorGeneric<4,3> > &Gderi) { std::vector<Vector3d> pos; pos.resize(3*nresidues); std::vector<Tensor3d> deri; deri.resize(nresidues*9); std::vector<Vector> centers; centers.resize(nresidues); unsigned idx_deri = 0; Tensor da_dxa = (2./3.)*Tensor::identity(); Tensor da_dxb = -(1./3.)*Tensor::identity(); Tensor da_dxc = -(1./3.)*Tensor::identity(); Tensor db_dxa = -(1./3.)*Tensor::identity(); Tensor db_dxb = (2./3.)*Tensor::identity(); Tensor db_dxc = -(1./3.)*Tensor::identity(); // Form factors - should this be somewhere else? double w = 1./3.; Vector form_factor = Vector(2.0,2.0,1.0/0.3); for(unsigned res_idx=0; res_idx<natoms/3; res_idx++) { const unsigned at_idx = 3*res_idx; //center for (unsigned j=0; j<3; j++) { centers[res_idx] += w*positions[at_idx+j]; } Vector3d a = delta(centers[res_idx],positions[at_idx]); Vector3d b = delta(centers[res_idx],positions[at_idx+1]); Vector3d d = crossProduct(a,b); double ianorm = 1./a.modulo(); double idnorm = 1./d.modulo(); // X vector: COM-C2 pos[at_idx] = a*ianorm; // Z versor: C2 x (COM-C4/C6) pos[at_idx+2] = d*idnorm; // Y versor: Z x Y pos[at_idx+1] = crossProduct(pos[at_idx+2],pos[at_idx]); // Derivatives //////// Tensor3d t1 = ianorm*(Tensor::identity()-extProduct(pos[at_idx],pos[at_idx])); // dv1/dxa deri[idx_deri] = (2./3. 
)*t1; // dv1/dxb deri[idx_deri+3] = -(1./3.)*t1; // dv1/dxc deri[idx_deri+6] = -(1./3.)*t1; Tensor dd_dxa = VcrossTensor(a,db_dxa) -VcrossTensor(b,da_dxa); Tensor dd_dxb = VcrossTensor(a,db_dxb)-VcrossTensor(b,da_dxb); Tensor dd_dxc = VcrossTensor(a,db_dxc)-VcrossTensor(b,da_dxc); // dv3/dxa deri[idx_deri+2] = deriNorm(d,dd_dxa); // dv3/dxb deri[idx_deri+5] = deriNorm(d,dd_dxb); // dv3/dxc deri[idx_deri+8] = deriNorm(d,dd_dxc); // dv2/dxa = dv3/dxa cross v1 + v3 cross dv1/dxa deri[idx_deri+1] = (VcrossTensor(deri[idx_deri+2],pos[at_idx]) + \ VcrossTensor(pos[at_idx+2],deri[idx_deri])); // dv2/dxb deri[idx_deri+4] = (VcrossTensor(deri[idx_deri+5],pos[at_idx]) + \ VcrossTensor(pos[at_idx+2],deri[idx_deri+3])); // dv2/dxc deri[idx_deri+7] = (VcrossTensor(deri[idx_deri+8],pos[at_idx]) + \ VcrossTensor(pos[at_idx+2],deri[idx_deri+6])); idx_deri += 9; // End derivatives /////// } // Initialization (unnecessary?) for (unsigned i1=0; i1<nresidues*nresidues; i1++) { for (unsigned i2=0; i2<4; i2++) { mat[i1][i2] = 0.0; } } double maxdist = cutoff/form_factor[0]; double gamma = pi/cutoff; unsigned idx; unsigned idx1 = 0; // Calculate mat for (unsigned i=0; i<nresidues; i++) { for (unsigned j=0; j<nresidues; j++) { // skip i==j if(inPair(i,j) and i != j) { //if(i!=j){ // Calculate normal distance first Vector diff = delta(centers[i],centers[j]); double d1 = diff.modulo(); //std::cout << inPair(i,j) << " " << i << " " << j << " "<< d1 <<"\n"; //std::cout << inPair(i,j) << " " << i << " " << j << " "<< d1 <<"\n"; if(d1<maxdist) { // calculate r_tilde_ij Vector3d rtilde; for (unsigned k=0; k<3; k++) { for (unsigned l=0; l<3; l++) { rtilde[l] += pos[3*i+l][k]*diff[k]*form_factor[l]; } } double rtilde_norm = rtilde.modulo(); double irnorm = 1./rtilde_norm; // ellipsoidal cutoff if(rtilde_norm < cutoff) { idx = i*nresidues + j; //std::cout << i << " " << j << " " << rtilde_norm << " " << idx <<"\n"; // fill 4d matrix double dummy = sin(gamma*rtilde_norm)/(rtilde_norm*gamma); 
mat[idx][0] = dummy*rtilde[0]; mat[idx][1] = dummy*rtilde[1]; mat[idx][2] = dummy*rtilde[2]; mat[idx][3] = (1.+ cos(gamma*rtilde_norm))/gamma; // Derivative (drtilde_dx) std::vector<Tensor3d> drtilde_dx; drtilde_dx.resize(6); unsigned pos_idx = 3*i; unsigned deri_idx = 9*i; for (unsigned at=0; at<3; at++) { for (unsigned l=0; l<3; l++) { Vector3d rvec = form_factor[l]*((pos[pos_idx+l])/3.); Vector3d vvec = form_factor[l]*(matmul(deri[deri_idx+3*at+l],diff)); drtilde_dx[at].setRow(l,vvec-rvec); drtilde_dx[at+3].setRow(l,rvec); } } //std::vector<TensorGeneric<4,3> > dG_dx; //dG_dx.resize(6); double dummy1 = (cos(gamma*rtilde_norm) - dummy); idx1 = i*nresidues*6 + j*6; for (unsigned l=0; l<6; l++) { //std::cout << i << " " << j << " " << idx1 << " " << idx1+l << "\n"; // components 1,2,3 // sin(gamma*|rtilde|)/gamma*|rtilde|*d_rtilde + // + ((d_rtilde*r_tilde/r_tilde^2) out r_tilde)* // (cos(gamma*|rtilde| - sin(gamma*|rtilde|)/gamma*|rtilde|)) Vector3d rdr = matmul(rtilde,drtilde_dx[l]); Tensor tt = dummy*drtilde_dx[l] + (dummy1*irnorm*irnorm)*Tensor(rtilde,rdr); for (unsigned m=0; m<3; m++) { // Transpose here //dG_dx[l].setRow(m,tt.getRow(m)); Gderi[idx1+l].setRow(m,tt.getRow(m)); } // component 4 // - sin(gamma*|rtilde|)/|rtilde|*(r_tilde*d_rtilde) //dG_dx[l].setRow(3,-dummy*gamma*rdr); Gderi[idx1+l].setRow(3,-dummy*gamma*rdr); } } } } } } }
int main(int argc, char *argv[]) { p_mem_t shared_mem, results_mem; uint32_t eram_base; char results[1024] = { '\0' }; int device_cols, device_rows, nside; p_dev_t dev; p_prog_t prog; p_team_t team; p_coords_t size; p_coords_t start = { .row = 0, .col = 0 }; unsigned int msize; float seed; unsigned int addr; //, clocks; size_t sz; int verbose=0; double tdiff[3]; int result, retval = 0; msize = 0x00400000; get_args(argc, argv); fo = stderr; fi = stdin; printf( "------------------------------------------------------------\n"); printf( "Calculating: C[%d][%d] = A[%d][%d] * B[%d][%d]\n", _Smtx, _Smtx, _Smtx, _Smtx, _Smtx, _Smtx); seed = 0.0; if(verbose){ printf( "Seed = %f\n", seed); } dev = p_init(P_DEV_EPIPHANY, 0); if (p_error(dev)) { fprintf(stderr, "Error initializing PAL\n"); return p_error(dev); } device_cols = p_query(dev, P_PROP_COLS); device_rows = p_query(dev, P_PROP_ROWS); // Use min size nside = device_cols > device_rows ? device_cols : device_rows; if (nside < 4) { fprintf(stderr, "Error: Too small device, need at least 4x4\n"); return 1; } // Either 1024, 256, 64, or 16 cores (side must be power of two), nside = nside >= 32 ? 32 : nside >= 16 ? 16 : nside >= 8 ? 
8 : 4; size.row = nside; size.col = nside; team = p_open4(dev, P_TOPOLOGY_2D, &start, &size); printf("Using team of size %d\n", p_team_size(team)); if (p_error(team)) { fprintf(stderr, "Error opening team\n"); return p_error(team); } prog = p_load(dev, ar.elfFile, 0); eram_base = (unsigned) p_query(dev, P_PROP_MEMBASE); shared_mem = p_map(dev, eram_base, msize); // Clear mailbox contents memset(&Mailbox, 0, sizeof(Mailbox)); p_write(&shared_mem, &Mailbox, 0, sizeof(Mailbox), 0); // Generate operand matrices based on a provided seed matrix_init((int)seed); #ifdef __WIPE_OUT_RESULT_MATRIX__ // Wipe-out any previous remains in result matrix (for verification) addr = offsetof(shared_buf_t, C[0]); sz = sizeof(Mailbox.C); if(verbose){ printf( "Writing C[%uB] to address %08x...\n", (unsigned) sz, addr); } p_write(&shared_mem, (void *) Mailbox.C, addr, sz, 0); #endif /* Wallclock time */ clock_gettime(CLOCK_MONOTONIC, &timer[0]); /* Clock CPUTIME too. We don't want to indicate failure just * because the system was under high load. */ clock_gettime(CLOCK_THREAD_CPUTIME_ID, &timer[4]); // Copy operand matrices to Epiphany system addr = offsetof(shared_buf_t, A[0]); sz = sizeof(Mailbox.A); if(verbose){ printf( "Writing A[%uB] to address %08x...\n", (unsigned) sz, addr); } p_write(&shared_mem, (void *) Mailbox.A, addr, sz, 0); addr = offsetof(shared_buf_t, B[0]); sz = sizeof(Mailbox.B); if(verbose){ printf( "Writing B[%uB] to address %08x...\n", (unsigned) sz, addr); } p_write(&shared_mem, (void *) Mailbox.B, addr, sz, 0); // Call the Epiphany matmul() function if(verbose){ printf( "GO Epiphany! ... 
"); } if(verbose){ printf("Loading program on Epiphany chip...\n"); } p_arg_t args[] = { &nside, sizeof(nside), true }; if (p_run(prog, "matmul", team, 0, p_team_size(team), 1, args, 0)) { fprintf(stderr, "Error loading Epiphany program.\n"); exit(1); } // Read result matrix and timing addr = offsetof(shared_buf_t, C[0]); sz = sizeof(Mailbox.C); if(verbose){ printf( "Reading result from address %08x...\n", addr); } p_read(&shared_mem, (void *) Mailbox.C, addr, sz, 0); clock_gettime(CLOCK_MONOTONIC, &timer[1]); clock_gettime(CLOCK_THREAD_CPUTIME_ID, &timer[5]); // Calculate a reference result clock_gettime(CLOCK_THREAD_CPUTIME_ID, &timer[2]); #ifndef __DO_STRASSEN__ matmul(Mailbox.A, Mailbox.B, Cref, _Smtx); #else matmul_strassen(Mailbox.A, Mailbox.B, Cref, _Smtx); #endif clock_gettime(CLOCK_THREAD_CPUTIME_ID, &timer[3]); addr = offsetof(shared_buf_t, core.clocks); sz = sizeof(Mailbox.core.clocks); if(verbose){ printf( "Reading time from address %08x...\n", addr); } p_read(&shared_mem, &Mailbox.core.clocks, addr, sizeof(Mailbox.core.clocks), 0); // clocks = Mailbox.core.clocks; // Calculate the difference between the Epiphany result and the reference result matsub(Mailbox.C, Cref, Cdiff, _Smtx); tdiff[0] = (timer[1].tv_sec - timer[0].tv_sec) * 1000 + ((double) (timer[1].tv_nsec - timer[0].tv_nsec) / 1000000.0); // tdiff[0] = ((double) clocks) / eMHz * 1000; tdiff[1] = (timer[3].tv_sec - timer[2].tv_sec) * 1000 + ((double) (timer[3].tv_nsec - timer[2].tv_nsec) / 1000000.0); tdiff[2] = (timer[5].tv_sec - timer[4].tv_sec) * 1000 + ((double) (timer[5].tv_nsec - timer[4].tv_nsec) / 1000000.0); // If the difference is 0, then the matrices are identical and the // calculation was correct if (iszero(Cdiff, _Smtx)) { printf( "Epiphany(time) %9.1f msec (@ %03d MHz)\n", tdiff[0], eMHz); printf( "Host(time) %9.1f msec (@ %03d MHz)\n", tdiff[1], aMHz); printf( "------------------------------------------------------------\n"); printf( "TEST \"matmul-16\" PASSED\n"); retval = 0; } 
else { printf( "\n\nERROR: C_epiphany is different from C_host !!!\n"); printf( "TEST \"matmul-16\" FAILED\n"); retval = 1; } #if 0 #ifdef __DUMP_MATRICES__ printf( "\n\n\n"); printf( "A[][] = \n"); matprt(Mailbox.A, _Smtx); printf( "B[][] = \n"); matprt(Mailbox.B, _Smtx); printf( "C[][] = \n"); matprt(Mailbox.C, _Smtx); printf( "Cref[][] = \n"); matprt(Cref, _Smtx); int i, j; for (i=0; i<_Nside; i++) for (j=0; j<_Nside; j++) { e_read(pEpiphany, i, j, 0x2000+0*sizeof(float), &Aepi[(i*_Score+0)*_Smtx + j*_Score], 2*sizeof(float)); e_read(pEpiphany, i, j, 0x2000+2*sizeof(float), &Aepi[(i*_Score+1)*_Smtx + j*_Score], 2*sizeof(float)); e_read(pEpiphany, i, j, 0x4000+0*sizeof(float), &Bepi[(i*_Score+0)*_Smtx + j*_Score], 2*sizeof(float)); e_read(pEpiphany, i, j, 0x4000+2*sizeof(float), &Bepi[(i*_Score+1)*_Smtx + j*_Score], 2*sizeof(float)); } printf( "Aepi[][] = \n"); matprt(Aepi, _Smtx); printf( "Bepi[][] = \n"); matprt(Bepi, _Smtx); #endif #endif // p_unmap ... p_close(team); p_finalize(dev); return retval; } // Initialize operand matrices void matrix_init(int seed) { int i, j, p; p = 0; for (i=0; i<_Smtx; i++) for (j=0; j<_Smtx; j++) Mailbox.A[p++] = (i + j + seed) % _MAX_MEMBER_; p = 0; for (i=0; i<_Smtx; i++) for (j=0; j<_Smtx; j++) Mailbox.B[p++] = ((i + j) * 2 + seed) % _MAX_MEMBER_; p = 0; for (i=0; i<_Smtx; i++) for (j=0; j<_Smtx; j++) Mailbox.C[p++] = 0x8dead; return; }
/* fix narrow-lane ambiguity by ILS ------------------------------------------
* Builds the float narrow-lane ambiguities for the given satellite pairs,
* resolves them with lambda() and, if the ratio test passes, hands the
* resulting iono-free ambiguities to fix_sol().
* args   : rtk_t *rtk      IO  rtk control/result struct (sol.ratio is updated)
*          int   *sat1,*sat2 IO satellite pairs (compacted in place to the
*                               pairs that were actually used)
*          int   *NW        IO  wide-lane ambiguities (compacted in place)
*          int    n         I   number of satellite pairs
* return : status returned by fix_sol(), or 0 if fewer than 3 pairs remain,
*          lambda() fails, or the ratio test is not passed
* notes  : all work matrices are released on every exit path
*-----------------------------------------------------------------------------*/
static int fix_amb_ILS(rtk_t *rtk, int *sat1, int *sat2, int *NW, int n)
{
    double C1,C2,*B1,*N1,*NC,*D,*E,*Q,s[2],lam_NL=lam_LC(1,1,0),lam1,lam2;
    int i,j,k,m=0,info,stat=0,flgs[MAXSAT]={0},max_flg=0;

    lam1=lam_carr[0];
    lam2=lam_carr[1];

    C1= SQR(lam2)/(SQR(lam2)-SQR(lam1));
    C2=-SQR(lam1)/(SQR(lam2)-SQR(lam1));

    B1=zeros(n,1);
    N1=zeros(n,2);
    D=zeros(rtk->nx,n);
    E=mat(n,rtk->nx);
    Q=mat(n,n);
    NC=mat(n,1);

    for (i=0;i<n;i++) {

        /* check linear independency */
        if (!is_depend(sat1[i],sat2[i],flgs,&max_flg)) continue;

        j=IB(sat1[i],&rtk->opt);
        k=IB(sat2[i],&rtk->opt);

        /* float narrow-lane ambiguity (cycle) */
        B1[m]=(rtk->x[j]-rtk->x[k]+C2*lam2*NW[i])/lam_NL;
        N1[m]=ROUND(B1[m]);

        /* validation of narrow-lane ambiguity */
        if (fabs(N1[m]-B1[m])>rtk->opt.thresar[2]) continue;

        /* narrow-lane ambiguity transformation matrix */
        D[j+m*rtk->nx]= 1.0/lam_NL;
        D[k+m*rtk->nx]=-1.0/lam_NL;

        /* keep accepted pairs packed at the front of the arrays */
        sat1[m]=sat1[i];
        sat2[m]=sat2[i];
        NW[m++]=NW[i];
    }
    if (m<3) goto cleanup;

    /* covariance of narrow-lane ambiguities */
    matmul("TN",m,rtk->nx,rtk->nx,1.0,D,rtk->P,0.0,E);
    matmul("NN",m,m,rtk->nx,1.0,E,D,0.0,Q);

    /* integer least square */
    if ((info=lambda(m,2,B1,Q,N1,s))) {
        trace(2,"lambda error: info=%d\n",info);
        goto cleanup;
    }
    if (s[0]<=0.0) goto cleanup;

    rtk->sol.ratio=(float)(MIN(s[1]/s[0],999.9));

    /* validation by ratio-test */
    if (rtk->opt.thresar[0]>0.0&&rtk->sol.ratio<rtk->opt.thresar[0]) {
        trace(2,"varidation error: n=%2d ratio=%8.3f\n",m,rtk->sol.ratio);
        goto cleanup;
    }
    trace(2,"varidation ok: %s n=%2d ratio=%8.3f\n",time_str(rtk->sol.time,0),m,
          rtk->sol.ratio);

    /* narrow-lane to iono-free ambiguity */
    for (i=0;i<m;i++) {
        NC[i]=C1*lam1*N1[i]+C2*lam2*(N1[i]-NW[i]);
    }
    /* fixed solution */
    stat=fix_sol(rtk,sat1,sat2,NC,m);

cleanup:
    /* single release point so that no early exit leaks the work matrices */
    free(B1); free(N1); free(D); free(E); free(Q); free(NC);

    return stat;
}
/* Try various ways to do matmul and time them.  Tiled algorithms
 * running serially; multi-threaded QUARK runtime with tiled
 * algorithms; and direct serial computation over standard layout.
 *
 * NB      - tile edge length (must be > 0)
 * N       - matrix edge length (must be > 0)
 * THREADS - thread count handed to QUARK_New()
 *
 * Returns 0 after a completed run and -1 if the sizes are invalid or a
 * matrix cannot be allocated.
 *
 * NOTE(review): the tile loops cover BB = N/NB tiles per dimension, so N is
 * presumably expected to be a multiple of NB - confirm against the callers.
 */
int main_algorithm(int NB, int N, int THREADS)
{
    int i, j, k, nerr = 0, status = 0;
    int BB;
    size_t nbytes, tile, idx;
    double *A = NULL, *Ablk = NULL, *B = NULL, *Bblk = NULL;
    double *C_direct = NULL, *C = NULL, *Cblk = NULL;
    double *C_quark = NULL, *C_quark_blk = NULL;
    Quark *quark;
    struct timeval tstart, tend, tdiff;
    double t_blk=0, t_quark=0, t_direct=0;

    /* NB == 0 would divide by zero below; negative sizes are meaningless */
    if (NB <= 0 || N <= 0) {
        fprintf(stderr, "main_algorithm: invalid sizes NB=%d N=%d\n", NB, N);
        return -1;
    }

    BB = N/NB;
    /* Do the size arithmetic in size_t so that large N cannot overflow int */
    nbytes = (size_t)N * (size_t)N * sizeof(double);
    tile = (size_t)NB * (size_t)NB;

    A = (double*)malloc(nbytes);
    Ablk = (double*)malloc(nbytes);
    B = (double*)malloc(nbytes);
    Bblk = (double*)malloc(nbytes);
    C_direct = (double*)malloc(nbytes);
    C = (double*)malloc(nbytes);
    Cblk = (double*)malloc(nbytes);
    C_quark = (double*)malloc(nbytes);
    C_quark_blk = (double*)malloc(nbytes);
    if (!A || !Ablk || !B || !Bblk || !C_direct || !C || !Cblk || !C_quark || !C_quark_blk) {
        fprintf(stderr, "main_algorithm: unable to allocate %dx%d matrices\n", N, N);
        status = -1;
        goto cleanup;
    }

    // Initialize
    for (i = 0; i < N; i++) {
        for (j = 0; j < N; j++) {
            idx = (size_t)i + (size_t)j * (size_t)N;
            A[idx] = (double)1.0+i;
            B[idx] = (double)2.0+i+j;
            C_quark[idx] = C_direct[idx] = C[idx] = 3.0;
        }
    }

    matrix_print("Printing A", A, N);
    matrix_print("Printing B", B, N);
    matrix_print("Printing C before computation", C, N);

    // Move from F77 to BDL
    std_to_bdl( A, Ablk, N, NB );
    std_to_bdl( B, Bblk, N, NB );
    std_to_bdl( C, Cblk, N, NB );
    std_to_bdl( C_quark, C_quark_blk, N, NB );

    /* ORIGINAL TILED ROUTINE */
    /* This is the code for the serial tile-by-tile multiplication */
    printf("Doing matrix multiplication using serial tile-by-tile algorithm\n");
    gettimeofday( &tstart, NULL );
    for (i = 0; i < BB; i++)
        for (j = 0; j < BB; j++)
            for (k = 0; k < BB; k++)
                matmul ( &Ablk[tile*i + tile*BB*k], &Bblk[tile*k + tile*BB*j], &Cblk[tile*i + tile*BB*j], NB);
    gettimeofday( &tend, NULL );
    t_blk = timeval_subtract( &tdiff, &tend, &tstart );
    printf("Time taken: %f\n", tdiff.tv_sec + (double)tdiff.tv_usec/1000000 );
    bdl_to_std( C, Cblk, N, NB );
    matrix_print("Printing C produced by serial tile-algorithm after computation", C, N);
    printf("\n");

    /* QUARK PARALLEL TILED ROUTINE */
    /* This is the code for the QUARK runtime to do the parallel multi-threaded tile-by-tile algorithm */
    printf("Doing matrix multiplication using the multi-threaded QUARK runtime for a tile based algorithm\n");
    quark = QUARK_New(THREADS);
    gettimeofday( &tstart, NULL );
    for (i = 0; i < BB; i++)
        for (j = 0; j < BB; j++)
            for (k = 0; k < BB; k++)
                matmul_quark_call ( quark, &Ablk[tile*i + tile*BB*k], &Bblk[tile*k + tile*BB*j], &C_quark_blk[tile*i + tile*BB*j], NB);
    /* Wait for all queued tile tasks before stopping the clock */
    QUARK_Barrier( quark );
    gettimeofday( &tend, NULL );
    t_quark = timeval_subtract( &tdiff, &tend, &tstart );
    printf("Time taken: %f\n", tdiff.tv_sec + (double)tdiff.tv_usec/1000000 );
    QUARK_Delete(quark);
    bdl_to_std( C_quark, C_quark_blk, N, NB );
    matrix_print("Printing C produced by QUARK runtime after computation", C_quark, N);
    printf("\n");

    /* DIRECT COMPUTATION OVER STANDARD LAYOUT */
    /* Compute direct C if desired */
    printf("Doing matrix multiplication using direct loops (ie, view matrix as one big tile)\n");
    gettimeofday( &tstart, NULL );
    matmul ( A, B, C_direct, N );
    gettimeofday( &tend, NULL );
    t_direct = timeval_subtract( &tdiff, &tend, &tstart );
    printf("Time taken: %f\n", (double)(tdiff.tv_sec + (double)tdiff.tv_usec/1000000) );
    matrix_print("Printing C produced by direct matmul after computation", C_direct, N);
    printf("\n");

    /* Check for errors */
    printf("Comparing result matrices (direct versus QUARK)\n");
    nerr = matrix_compare( C_direct, C_quark, N );
    printf("Number of differences: %d\n", nerr);
    printf("\n");

    printf("Summary of time taken\n");
    printf("Direct       SerialBlock  QUARK(%d threads)\n", THREADS);
    printf("%-12.5f %-12.5f %-12.5f\n", t_direct, t_blk, t_quark);

cleanup:
    /* free(NULL) is a no-op, so this is safe after a partial allocation */
    free(A); free(Ablk);
    free(B); free(Bblk);
    free(C); free(Cblk);
    free(C_direct);
    free(C_quark); free(C_quark_blk);

    return status;
}
double RMSD::optimalAlignment(const std::vector<double> & align, const std::vector<double> & displace, const std::vector<Vector> & positions, const std::vector<Vector> & reference , std::vector<Vector> & derivatives, bool squared) { plumed_massert(displace==align,"OPTIMAL_FAST version of RMSD can only be used when displace weights are same as align weights"); double dist(0); double norm(0); const unsigned n=reference.size(); // This is the trace of positions*positions + reference*reference double sum00w(0); // This is positions*reference Tensor sum01w; derivatives.resize(n); Vector cpositions; Vector creference; // first expensive loop: compute centers for(unsigned iat=0;iat<n;iat++){ double w=align[iat]; norm+=w; cpositions+=positions[iat]*w; creference+=reference[iat]*w; } double invnorm=1.0/norm; cpositions*=invnorm; creference*=invnorm; // second expensive loop: compute second moments wrt centers for(unsigned iat=0;iat<n;iat++){ double w=align[iat]; sum00w+=(dotProduct(positions[iat]-cpositions,positions[iat]-cpositions) +dotProduct(reference[iat]-creference,reference[iat]-creference))*w; sum01w+=Tensor(positions[iat]-cpositions,reference[iat]-creference)*w; } double rr00=sum00w*invnorm; Tensor rr01=sum01w*invnorm; Matrix<double> m=Matrix<double>(4,4); m[0][0]=rr00+2.0*(-rr01[0][0]-rr01[1][1]-rr01[2][2]); m[1][1]=rr00+2.0*(-rr01[0][0]+rr01[1][1]+rr01[2][2]); m[2][2]=rr00+2.0*(+rr01[0][0]-rr01[1][1]+rr01[2][2]); m[3][3]=rr00+2.0*(+rr01[0][0]+rr01[1][1]-rr01[2][2]); m[0][1]=2.0*(-rr01[1][2]+rr01[2][1]); m[0][2]=2.0*(+rr01[0][2]-rr01[2][0]); m[0][3]=2.0*(-rr01[0][1]+rr01[1][0]); m[1][2]=2.0*(-rr01[0][1]-rr01[1][0]); m[1][3]=2.0*(-rr01[0][2]-rr01[2][0]); m[2][3]=2.0*(-rr01[1][2]-rr01[2][1]); m[1][0] = m[0][1]; m[2][0] = m[0][2]; m[2][1] = m[1][2]; m[3][0] = m[0][3]; m[3][1] = m[1][3]; m[3][2] = m[2][3]; vector<double> eigenvals; Matrix<double> eigenvecs; int diagerror=diagMat(m, eigenvals, eigenvecs ); if (diagerror!=0){ string sdiagerror; 
Tools::convert(diagerror,sdiagerror); string msg="DIAGONALIZATION FAILED WITH ERROR CODE "+sdiagerror; plumed_merror(msg); } dist=eigenvals[0]; Matrix<double> ddist_dm(4,4); Vector4d q(eigenvecs[0][0],eigenvecs[0][1],eigenvecs[0][2],eigenvecs[0][3]); // This is the rotation matrix that brings reference to positions // i.e. matmul(rotation,reference[iat])+shift is fitted to positions[iat] Tensor rotation; rotation[0][0]=q[0]*q[0]+q[1]*q[1]-q[2]*q[2]-q[3]*q[3]; rotation[1][1]=q[0]*q[0]-q[1]*q[1]+q[2]*q[2]-q[3]*q[3]; rotation[2][2]=q[0]*q[0]-q[1]*q[1]-q[2]*q[2]+q[3]*q[3]; rotation[0][1]=2*(+q[0]*q[3]+q[1]*q[2]); rotation[0][2]=2*(-q[0]*q[2]+q[1]*q[3]); rotation[1][2]=2*(+q[0]*q[1]+q[2]*q[3]); rotation[1][0]=2*(-q[0]*q[3]+q[1]*q[2]); rotation[2][0]=2*(+q[0]*q[2]+q[1]*q[3]); rotation[2][1]=2*(-q[0]*q[1]+q[2]*q[3]); double prefactor=2.0*invnorm; Vector shift=cpositions-matmul(rotation,creference); if(!squared) prefactor*=0.5/sqrt(dist); // if "safe", recompute dist here to a better accuracy if(safe) dist=0.0; // If safe is set to "false", MSD is taken from the eigenvalue of the M matrix // If safe is set to "true", MSD is recomputed from the rotational matrix // For some reason, this last approach leads to less numerical noise but adds an overhead // third expensive loop: derivatives for(unsigned iat=0;iat<n;iat++){ // there is no need for derivatives of rotation and shift here as it is by construction zero // (similar to Hellman-Feynman forces) Vector d(positions[iat]-shift - matmul(rotation,reference[iat])); derivatives[iat]= prefactor*align[iat]*d; if(safe) dist+=align[iat]*invnorm*modulo2(d); } if(!squared) dist=sqrt(dist); return dist; }
int main(int argc, char *argv[]) { int testN = 1; bool check_correctness = false; if (argc > 1) { testN = atoi(argv[1]); } if (argc > 2) { check_correctness = atoi(argv[2]); } std::cout << std::endl << "----------" << std::endl; std::cout << "Running sequential MM benchmark: testN: " << testN << ", check correctness: " << check_correctness << ", size: (" << S0 << ", " << S1 << ", " << S2 << ", " << S3 << ")" << std::endl; auto t1 = std::chrono::high_resolution_clock::now(); auto t2 = t1; float *A = (float*) malloc(S0 * S1 * sizeof(float)); float *B = (float*) malloc(S1 * S2 * sizeof(float)); float *C = (float*) malloc(S2 * S3 * sizeof(float)); // Initialize matrices with random values: for (int i = 0; i < S0 * S1; i++) A[i] = std::rand() % 10; for (int i = 0; i < S1 * S2; i++) B[i] = std::rand() % 10; for (int i = 0; i < S2 * S3; i++) C[i] = std::rand() % 10; std::cout << "Buffers initialized" << std::endl << std::flush; // Note that indices are flipped (see tutorial 2) Halide::Buffer<DATA_TYPE> A_buf(A, {S1, S0}); Halide::Buffer<DATA_TYPE> B_buf(B, {S2, S1}); Halide::Buffer<DATA_TYPE> C_buf(C, {S3, S2}); Halide::Buffer<DATA_TYPE> O_buf(S3, S0); // Make a dummy call to set up GPU (initalization takes time) matmul(A_buf.raw_buffer(), B_buf.raw_buffer(), C_buf.raw_buffer(), O_buf.raw_buffer()); // CPU Multiplication for correctness check if (check_correctness) { // Reference matrix multiplication std::cout << "Running CPU multiplication.." 
<< std::endl; Halide::Buffer<DATA_TYPE> O_val_buf(S3, S0); Halide::Buffer<DATA_TYPE> T_val_buf(S2, S0); t1 = std::chrono::high_resolution_clock::now(); for (int i = 0; i < S0; i++) { for (int k = 0; k < S2; k++) { // Note that indices are flipped (see tutorial 2) T_val_buf(k, i) = 0; } } for (int i = 0; i < S0; i++) { for (int l = 0; l < S3; l++) { // Note that indices are flipped (see tutorial 2) O_val_buf(l, i) = 0; } } for (int j = 0; j < S1; j++) { for (int i = 0; i < S0; i++) { for (int k = 0; k < S2; k++) { // Note that indices are flipped (see tutorial 2) T_val_buf(k, i) += A_buf(j, i) * B_buf(k, j); } } } for (int k = 0; k < S2; k++) { for (int i = 0; i < S0; i++) { for (int l = 0; l < S3; l++) { // Note that indices are flipped (see tutorial 2) O_val_buf(l, i) += T_val_buf(k, i) * C_buf(l, k); } } } t2 = std::chrono::high_resolution_clock::now(); std::cout << "CPU matmul done: " << (std::chrono::duration<double,std::milli>(t2 - t1)).count() << "ms" << std::endl << std::flush; compare_buffers("matmul", O_buf, O_val_buf); } // GPU Multiplication t1 = std::chrono::high_resolution_clock::now(); for (int i = 0; i < testN; i++) { matmul(A_buf.raw_buffer(), B_buf.raw_buffer(), C_buf.raw_buffer(), O_buf.raw_buffer()); } t2 = std::chrono::high_resolution_clock::now(); std::cout << "GPU matmul done: " << (std::chrono::duration<double,std::milli>(t2 - t1)).count() / testN << "ms" << std::endl << std::flush; // CUBLAS SGEMM // Transposed copies for cublas float *A_T = (float*) malloc(S0 * S1 * sizeof(float)); float *B_T = (float*) malloc(S1 * S2 * sizeof(float)); float *C_T = (float*) malloc(S2 * S3 * sizeof(float)); float *O_T = (float*) malloc(S0 * S3 * sizeof(float)); // Transpose for (int i = 0; i < S0; i++) for (int j = 0; j < S1; j++) A_T[i + j * S0] = A[i * S1 + j]; for (int i = 0; i < S1; i++) for (int j = 0; j < S2; j++) B_T[i + j * S1] = B[i * S2 + j]; for (int i = 0; i < S2; i++) for (int j = 0; j < S3; j++) C_T[i + j * S2] = C[i * S3 + j]; // Excluding 
handle creation which is time consuming cublasHandle_t handle; cublasCreate(&handle); t1 = std::chrono::high_resolution_clock::now(); for (int i = 0; i < testN; i++) { float *d_A; float *d_B; float *d_C; float *d_T; float *d_O; cudaMalloc((void**)&d_A, S0 * S1 * sizeof(*A)); cudaMalloc((void**)&d_B, S1 * S2 * sizeof(*A)); cudaMalloc((void**)&d_C, S2 * S3 * sizeof(*A)); cudaMalloc((void**)&d_T, S0 * S2 * sizeof(*A)); cudaMalloc((void**)&d_O, S0 * S3 * sizeof(*A)); cublasSetMatrix(S0, S1, sizeof(*A), A_T, S0, d_A, S0); cublasSetMatrix(S1, S2, sizeof(*B), B_T, S1, d_B, S1); cublasSetMatrix(S2, S3, sizeof(*C), C_T, S2, d_C, S2); float alpha_var = 1; float beta_var = 0; cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, S0, S2, S1, &alpha_var, d_A, S0, d_B, S1, &beta_var, d_T, S0); cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, S0, S3, S2, &alpha_var, d_T, S0, d_C, S2, &beta_var, d_O, S0); cublasGetMatrix(S0, S3, sizeof(*C), d_O, S0, O_T, S0); cudaFree(d_A); cudaFree(d_B); cudaFree(d_C); cudaFree(d_T); cudaFree(d_O); } t2 = std::chrono::high_resolution_clock::now(); std::cout << "cublas matmul done (excluding cublasHandle creation): " << (std::chrono::duration<double,std::milli>(t2 - t1) / testN).count() << "ms" << std::endl << std::flush; cublasDestroy(handle); bool check_cublas_difference = false; if (check_cublas_difference) { bool flag = true; for (int i = 0; i < S0 && flag; i++) { for (int j = 0; j < S3; j++) { if (O_buf(j, i) != O_T[i + j * S0]) { std::cout << "cublas validation mismatch:" << std::endl; std::cout << i << " " << j << " " << O_T[i + j * S0] << " " << O_buf(j, i) << std::endl; } } } if (flag) { std::cout << "cublas and validation match" << std::endl; } } free(A); free(B); free(C); free(A_T); free(B_T); free(C_T); free(O_T); std::cout << "----------" << std::endl << std::endl; return 0; }
// Least-squares solve of a * X = b for a non-square `a`, using a QR
// factorization (magma_geqrf3_gpu) followed by a triangular solve (gpu_trsm).
//
// M and N are the first two dimensions of `a`, K is the second dimension of
// `b`. Two cases are handled:
//   M < N : QR is performed on the transpose of `a` (see the comment in the
//           branch for the algebra).
//   M > N : QR is performed on a copy of `a`.
// When M == N neither branch runs and the empty array created at the top is
// returned unchanged.
//
// NOTE(review): `info` is written by the MAGMA calls but never inspected, and
// `event` is handed to gpu_trsm but never used afterwards in this function.
Array<T> leastSquares(const Array<T> &a, const Array<T> &b)
{
    int M = a.dims()[0];
    int N = a.dims()[1];
    int K = b.dims()[1];
    int MN = std::min(M, N);

    Array<T> B = createEmptyArray<T>(dim4());
    trsm_func<T> gpu_trsm;

    cl_event event;
    cl_command_queue queue = getQueue()();

    if (M < N) {

#define UNMQR 0 // FIXME: UNMQR == 1 should be faster but does not work

        // Least squres for this case is solved using the following
        // solve(A, B) == matmul(Q, Xpad);
        // Where:
        // Xpad == pad(Xt, N - M, 1);
        // Xt   == tri_solve(R1, B);
        // R1   == R(seq(M), seq(M));
        // transpose(A) == matmul(Q, R);

        // QR is performed on the transpose of A
        Array<T> A = transpose<T>(a, true);

#if UNMQR
        B = padArray<T, T>(b, dim4(N, K), scalar<T>(0));
        B.resetDims(dim4(M, K));
#else
        B = copyArray<T>(b);
#endif

        // Block size and scratch-buffer size for the factorization
        int NB = magma_get_geqrf_nb<T>(A.dims()[1]);
        int NUM = (2*MN + ((M+31)/32)*32)*NB;
        Array<T> tmp = createEmptyArray<T>(dim4(NUM));

        std::vector<T> h_tau(MN);

        int info = 0;
        cl::Buffer *dA = A.get();
        cl::Buffer *dT = tmp.get();
        cl::Buffer *dB = B.get();

        magma_geqrf3_gpu<T>(A.dims()[0], A.dims()[1],
                            (*dA)(), A.getOffset(), A.strides()[1],
                            &h_tau[0], (*dT)(), tmp.getOffset(), getQueue()(), &info);

        // View A as M x M for the triangular solve
        A.resetDims(dim4(M, M));

        // Swap blocks between A and the scratch buffer before the solve ...
        magmablas_swapdblk<T>(MN-1, NB,
                              (*dA)(), A.getOffset(), A.strides()[1], 1,
                              (*dT)(), tmp.getOffset() + MN * NB, NB, 0, queue);

        gpu_trsm(clblasColumnMajor,
                 clblasLeft, clblasUpper,
                 clblasConjTrans, clblasNonUnit,
                 B.dims()[0], B.dims()[1],
                 scalar<T>(1),
                 (*dA)(), A.getOffset(), A.strides()[1],
                 (*dB)(), B.getOffset(), B.strides()[1],
                 1, &queue, 0, nullptr, &event);

        // ... and swap them back afterwards (same call with the two buffers
        // exchanged)
        magmablas_swapdblk<T>(MN - 1, NB,
                              (*dT)(), tmp.getOffset() + MN * NB, NB, 0,
                              (*dA)(), A.getOffset(), A.strides()[1], 1, queue);

#if UNMQR
        int lwork = (B.dims()[0]-A.dims()[0]+NB)*(B.dims()[1]+2*NB);
        std::vector<T> h_work(lwork);
        B.resetDims(dim4(N, K));
        magma_unmqr_gpu<T>(MagmaLeft, MagmaNoTrans,
                           B.dims()[0], B.dims()[1], A.dims()[0],
                           (*dA)(), A.getOffset(), A.strides()[1],
                           &h_tau[0],
                           (*dB)(), B.getOffset(), B.strides()[1],
                           &h_work[0], lwork,
                           (*dT)(), tmp.getOffset(), NB, queue, &info);
#else
        // Active path: build Q explicitly in A (viewed as N x M) and multiply
        A.resetDims(dim4(N, M));
        magma_ungqr_gpu<T>(A.dims()[0], A.dims()[1], std::min(M, N),
                           (*dA)(), A.getOffset(), A.strides()[1],
                           &h_tau[0],
                           (*dT)(), tmp.getOffset(), NB, queue, &info);

        B = matmul(A, B, AF_MAT_NONE, AF_MAT_NONE);
#endif
    } else if (M > N) {
        // Least squres for this case is solved using the following
        // solve(A, B) == tri_solve(R1, Bt);
        // Where:
        // R1 == R(seq(N), seq(N));
        // Bt == matmul(transpose(Q1), B);
        // Q1 == Q(span, seq(N));
        // A  == matmul(Q, R);

        Array<T> A = copyArray<T>(a);
        B = copyArray(b);

        // Shadows the outer MN; both hold std::min(M, N)
        int MN = std::min(M, N);
        int NB = magma_get_geqrf_nb<T>(M);

        int NUM = (2*MN + ((N+31)/32)*32)*NB;
        Array<T> tmp = createEmptyArray<T>(dim4(NUM));

        std::vector<T> h_tau(NUM);

        int info = 0;
        cl::Buffer *A_buf = A.get();
        cl::Buffer *B_buf = B.get();
        cl::Buffer *dT = tmp.get();

        magma_geqrf3_gpu<T>(M, N,
                            (*A_buf)(), A.getOffset(), A.strides()[1],
                            &h_tau[0], (*dT)(), tmp.getOffset(), getQueue()(), &info);

        int NRHS = B.dims()[1];
        int lhwork = (M - N + NB) * (NRHS + NB) + NRHS * NB;

        std::vector<T> h_work(lhwork);
        // The first workspace element is set to the workspace length
        h_work[0] = scalar<T>(lhwork);

        magma_unmqr_gpu<T>(MagmaLeft, MagmaConjTrans,
                           M, NRHS, N,
                           (*A_buf)(), A.getOffset(), A.strides()[1],
                           &h_tau[0],
                           (*B_buf)(), B.getOffset(), B.strides()[1],
                           &h_work[0], lhwork,
                           (*dT)(), tmp.getOffset(), NB,
                           queue, &info);

        magmablas_swapdblk<T>(MN - 1, NB,
                              (*A_buf)(), A.getOffset(), A.strides()[1], 1,
                              (*dT)(), tmp.getOffset() + NB * MN,
                              NB, 0, queue);

        // Platform-specific solve: on platforms whose name contains "NVIDIA"
        // the solve runs on an explicitly transposed copy of A
        // (lower / conjugate-transposed); elsewhere it runs on A directly
        // (upper / not transposed).
        std::string pName = getPlatformName(getDevice());
        if(pName.find("NVIDIA") != std::string::npos)
        {
            Array<T> AT = transpose<T>(A, true);
            cl::Buffer* AT_buf = AT.get();
            gpu_trsm(clblasColumnMajor,
                     clblasLeft, clblasLower, clblasConjTrans, clblasNonUnit,
                     N, NRHS, scalar<T>(1),
                     (*AT_buf)(), AT.getOffset(), AT.strides()[1],
                     (*B_buf)(), B.getOffset(), B.strides()[1],
                     1, &queue, 0, nullptr, &event);
        } else {
            gpu_trsm(clblasColumnMajor,
                     clblasLeft, clblasUpper, clblasNoTrans, clblasNonUnit,
                     N, NRHS, scalar<T>(1),
                     (*A_buf)(), A.getOffset(), A.strides()[1],
                     (*B_buf)(), B.getOffset(), B.strides()[1],
                     1, &queue, 0, nullptr, &event);
        }
        // The result is exposed with dimensions N x K
        B.resetDims(dim4(N, K));
    }

    return B;
}
//Умножение матриц Matrix operator *(const Matrix &A, const Matrix &B) throw (int) { return matmul(A, B); }
void matmul_mpi(float* A, float* B, float* C, int n){ int rank, nodes_n, used_nodes_n, row_per_process, i, j, k, start_row, end_row, child_start_row, child_end_row, *counts, *displs; float* my_A; MPI_Status status; MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &nodes_n); row_per_process = n / nodes_n; if(n % nodes_n)row_per_process ++; used_nodes_n = n / row_per_process; if(n % row_per_process)used_nodes_n ++; start_row = rank % used_nodes_n * row_per_process; end_row = start_row + row_per_process; if(end_row >= n)end_row = n; if(rank == 0) { // 0 ~ row_per_process are mine counts = (int *)malloc(sizeof(int) * nodes_n); displs = (int *)malloc(sizeof(int) * nodes_n); for(i = 0 ; i <nodes_n;i++){ child_start_row = i % used_nodes_n * row_per_process; child_end_row = child_start_row + row_per_process; if(child_end_row >= n) child_end_row = n; displs[i] = child_start_row * n; counts[i] = (child_end_row - child_start_row) * n; } } my_A = (float *)malloc(sizeof(float) * row_per_process * n); if(rank > 0) B = (float *)malloc(sizeof(float) * n * n); MPI_Scatterv(A, counts, displs, MPI_FLOAT, my_A, row_per_process * n, MPI_FLOAT, 0, MPI_COMM_WORLD); MPI_Bcast(B, n * n, MPI_FLOAT, 0, MPI_COMM_WORLD); //자기것 계산하기 if(C == NULL) { C = (float *)malloc(sizeof(float) * (end_row - start_row) * n); memset(C, 0, sizeof(float) * (end_row - start_row) * n); } else { memset(C, 0, sizeof(float) * n * n); } if(end_row-start_row > 0) matmul(my_A, B, C, end_row-start_row, n, n); // 계산 완료 MPI_Gatherv(C, (end_row - start_row) * n, MPI_FLOAT, C, counts, displs, MPI_FLOAT, 0, MPI_COMM_WORLD); free(my_A); if(rank == 0){ free(counts); free(displs); } else { free(B); free(C); } }
/*
 * Solve the overconstrained linear system Ma = b using a least squares error
 * (pseudo inverse) approach: a = inverse(Mt * M) * Mt * b.
 *
 * M : coefficient matrix; must not have more columns than rows
 * a : receives the solution
 * b : right-hand side
 *
 * Returns 0 on success or a negative code identifying the failing step
 * (-1 shape check, -2/-4/-8 allocation, -3 transpose, -5/-9/-10 matrix
 * product, -6 determinant too small, -7 inversion error). Errors from the
 * matrix helpers are detected through errno. Every temporary matrix that was
 * allocated is released on all exit paths.
 */
int solve_system (dmat M, dmat a,dmat b)
{
    dmat Mt, MtM, Mdag;
    double aa;
    int status = 0;

    if ((M.ub1 - M.lb1) < (M.ub2 - M.lb2)) {
        fprintf (stderr, "solve_system: matrix M has more columns than rows\n");
        return (-1);
    }

    Mt = newdmat (M.lb2, M.ub2, M.lb1, M.ub1, &errno);
    if (errno) {
        fprintf (stderr, "solve_system: unable to allocate matrix M_transpose\n");
        return (-2);
    }

    transpose (M, Mt);
    if (errno) {
        fprintf (stderr, "solve_system: unable to transpose matrix M\n");
        status = -3;
        goto free_Mt;
    }

    MtM = newdmat (M.lb2, M.ub2, M.lb2, M.ub2, &errno);
    if (errno) {
        fprintf (stderr, "solve_system: unable to allocate matrix M_transpose_M\n");
        status = -4;
        goto free_Mt;
    }

    matmul (Mt, M, MtM);
    if (errno) {
        fprintf (stderr, "solve_system: unable to compute matrix product of M_transpose and M\n");
        status = -5;
        goto free_MtM;
    }

    /* matinvert() inverts MtM in place; the magnitude of its return value is
       used as the determinant in the test below. That test deliberately
       precedes the errno check so the returned codes keep their established
       priority. */
    aa = fabs (matinvert (MtM));
    if (aa < 0.001) {
        fprintf (stderr, "solve_system: determinant of matrix M_transpose_M is too small\n");
        status = -6;
        goto free_MtM;
    }
    if (errno) {
        fprintf (stderr, "solve_system: error during matrix inversion\n");
        status = -7;
        goto free_MtM;
    }

    /* Mdag is the pseudo-inverse: inverse(Mt * M) * Mt */
    Mdag = newdmat (M.lb2, M.ub2, M.lb1, M.ub1, &errno);
    if (errno) {
        fprintf (stderr, "solve_system: unable to allocate matrix M_diag\n");
        status = -8;
        goto free_MtM;
    }

    matmul (MtM, Mt, Mdag);
    if (errno) {
        fprintf (stderr, "solve_system: unable to compute matrix product of M_transpose_M and M_transpose\n");
        status = -9;
        goto free_Mdag;
    }

    matmul (Mdag, b, a);
    if (errno) {
        fprintf (stderr, "solve_system: unable to compute matrix product of M_diag and b\n");
        status = -10;
        goto free_Mdag;
    }

    /* Staged cleanup: each label releases one matrix and falls through to the
       ones allocated before it. */
free_Mdag:
    freemat (Mdag);
free_MtM:
    freemat (MtM);
free_Mt:
    freemat (Mt);

    return status;
}
//
// Measure Whisker Segment Features
// --------------------------------
// <face_axis> indicates the orientation of the mouse head with respect to
//             the image.
// <face_axis> == 'x' --> horizontally (along x axis)
// <face_axis> == 'y' --> vertically   (along y axis)
//
// Inputs
//   w            segment to measure; reads w->x, w->y, w->scores and w->len.
//                NOTE: w->scores is sorted in place by the median computation.
//   facex,facey  forwarded to _side() to decide which end is the follicle.
//   face_axis    'x'/'h' or 'y'/'v'; any other value goes to error().
// Output
//   dest[0..7] = path length, median score, root angle (degrees),
//                mean curvature, follicle x, follicle y, tip x, tip y.
//
// The function keeps several static scratch buffers between calls (cumlen,
// t, xd, yd, workspace, evalnum, evalden), all obtained through
// request_storage().
//
void Whisker_Seg_Measure( Whisker_Seg *w, double *dest, int facex, int facey, char face_axis )
{ float path_length,     //
        median_score,    //
        root_angle_deg,  // side  poly
        mean_curvature,  //(side) poly quad?  (depends on side for sign)
        follicle_x,      // side
        follicle_y,      // side
        tip_x,           // side
        tip_y;           // side
  float *x = w->x,
        *y = w->y,
        *s = w->scores;
  int len = w->len,
      idx_follicle,
      idx_tip;
  float dx;
  static double *cumlen = NULL;
  static size_t  cumlen_size = 0;

  cumlen = request_storage( cumlen, &cumlen_size, sizeof(double), len, "measure: cumlen");
  cumlen[0] = 0.0;

  // path length
  // -----------
  // XXX: an alternate approach would be to compute the polynomial fit
  //      and do quadrature on that.  Might be more precise.
  //      Although, need cumlen (a.k.a cl) for polyfit anyway
  //
  // cumlen[i] accumulates the straight-line distance between consecutive
  // points; after the loop cl[-1] is the last entry written, i.e. the total.
  { float *ax = x + 1,
          *ay = y + 1,
          *bx = x,
          *by = y;
    double *cl  = cumlen + 1,
           *clm = cumlen;
    while( ax < x + len )
      *cl++ = (*clm++) + hypotf( (*ax++) - (*bx++), (*ay++) - (*by++) );
    path_length = cl[-1];
  }

  // median score
  // ------------
  // Sorts the segment's own score array (s aliases w->scores), so the
  // per-point order of the scores is lost after this call.
  { qsort( s, len, sizeof(float), _score_cmp );
    if(len&1) // odd
      median_score = s[ (len-1)/2 ];
    else      //even
      median_score = ( s[len/2 - 1] + s[len/2] )/2.0;
  }

  // Follicle and root positions
  // ---------------------------
  // _side() reports which point indices are the follicle and the tip; its
  // return value dx is used below as a sign factor.
  dx = _side( w, facex, facey, &idx_follicle, &idx_tip );

  follicle_x = x[ idx_follicle ];
  follicle_y = y[ idx_follicle ];
  tip_x = x[ idx_tip ];
  tip_y = y[ idx_tip ];

  // Polynomial based measurements
  // (Curvature and angle)
  // -----------------------------
  // NOTE(review): mul1/mul2/num/den hold 2*DEGREE doubles, while the product
  // of two (DEGREE+1)-coefficient polynomials can have 2*DEGREE+1
  // coefficients -- confirm how many entries polymul() writes.
  { double px[  MEASURE_POLY_FIT_DEGREE+1 ],
           py[  MEASURE_POLY_FIT_DEGREE+1 ],
           xp[  MEASURE_POLY_FIT_DEGREE+1 ],
           yp[  MEASURE_POLY_FIT_DEGREE+1 ],
           xpp[ MEASURE_POLY_FIT_DEGREE+1 ],
           ypp[ MEASURE_POLY_FIT_DEGREE+1 ],
           mul1[ 2*MEASURE_POLY_FIT_DEGREE ],
           mul2[ 2*MEASURE_POLY_FIT_DEGREE ],
           num[  2*MEASURE_POLY_FIT_DEGREE ],
           den[  2*MEASURE_POLY_FIT_DEGREE ];
    static double *t = NULL;
    static size_t  t_size = 0;
    static double *xd = NULL;
    static size_t  xd_size = 0;
    static double *yd = NULL;
    static size_t  yd_size = 0;
    static double *workspace = NULL;
    static size_t  workspace_size = 0;
    int i;
    // number of points skipped at each end of the segment for the fit
    const int pad = MIN( MEASURE_POLY_END_PADDING, len/4 );

    // parameter for parametric polynomial representation
    t  = request_storage(t , &t_size , sizeof(double), len, "measure");
    xd = request_storage(xd, &xd_size, sizeof(double), len, "measure");
    yd = request_storage(yd, &yd_size, sizeof(double), len, "measure");
    { int i = len; // convert floats to doubles
      while(i--)
      { xd[i] = x[i];
        yd[i] = y[i];
      }
    }

    // normalised arc length; NOTE(review): divides by zero when path_length
    // is 0 (e.g. a single-point segment) -- confirm callers exclude that.
    for( i=0; i<len; i++ )
      t[i] = cumlen[i] / path_length; // [0 to 1]
#ifdef DEBUG_MEASURE_POLYFIT_ERROR
    assert(t[0] == 0.0 );
    assert( (t[len-1] - 1.0)<1e-6 );
#endif

    // polynomial fit
    // x(t) and y(t) are fitted separately over the un-padded interior;
    // polyfit_reuse() is handed the same workspace that polyfit() just used.
    workspace = request_storage( workspace,
                                &workspace_size,
                                 sizeof(double),
                                 polyfit_size_workspace( len, 2*MEASURE_POLY_FIT_DEGREE ), //need 2*degree for curvature eval later
                                 "measure: polyfit workspace" );
    polyfit( t+pad, xd+pad, len-2*pad, MEASURE_POLY_FIT_DEGREE, px, workspace );
    polyfit_reuse(  yd+pad, len-2*pad, MEASURE_POLY_FIT_DEGREE, py, workspace );

#ifdef DEBUG_MEASURE_POLYFIT_ERROR
    // NOTE(review): the loop stops at len-2*pad although the fit covers
    // indices pad .. len-pad-1, and the sum is divided by len -- debug only.
    { double err = 0.0;
      int i;
      for( i=pad; i<len-2*pad; i++ )
        err += hypot( xd[i] - polyval( px, MEASURE_POLY_FIT_DEGREE, t[i] ),
                      yd[i] - polyval( py, MEASURE_POLY_FIT_DEGREE, t[i] ) );
      err /= ((float)len);
      debug("Polyfit root mean squared residual: %f\n", err );
      assert( err < 1.0 );
    }
#endif

    // first derivative
    memcpy( xp, px, sizeof(double) * ( MEASURE_POLY_FIT_DEGREE+1 ) );
    memcpy( yp, py, sizeof(double) * ( MEASURE_POLY_FIT_DEGREE+1 ) );
    polyder_ip( xp, MEASURE_POLY_FIT_DEGREE+1, 1 );
    polyder_ip( yp, MEASURE_POLY_FIT_DEGREE+1, 1 );

    // second derivative
    memcpy( xpp, xp, sizeof(double) * ( MEASURE_POLY_FIT_DEGREE+1 ) );
    memcpy( ypp, yp, sizeof(double) * ( MEASURE_POLY_FIT_DEGREE+1 ) );
    polyder_ip( xpp, MEASURE_POLY_FIT_DEGREE+1, 1 );
    polyder_ip( ypp, MEASURE_POLY_FIT_DEGREE+1, 1 );

    // Root angle
    // ----------
    // The tangent is evaluated at the first or last fitted sample, whichever
    // lies at the follicle end. The two cases differ only in the order of
    // the atan2 arguments.
    // NOTE(review): if error() returns, root_angle_deg is left unset in the
    // default case.
    { double teval = (idx_follicle == 0) ? t[pad] : t[len-pad-1];
      static const double rad2deg = 180.0/M_PI;
      switch(face_axis)
      { case 'h':
        case 'x':
          root_angle_deg = atan2( dx*polyval(yp, MEASURE_POLY_FIT_DEGREE, teval ),
                                  dx*polyval(xp, MEASURE_POLY_FIT_DEGREE, teval ) ) * rad2deg;
          break;
        case 'v':
        case 'y':
          root_angle_deg = atan2( dx*polyval(xp, MEASURE_POLY_FIT_DEGREE, teval ),
                                  dx*polyval(yp, MEASURE_POLY_FIT_DEGREE, teval ) ) * rad2deg;
          break;
        default:
          error("In Whisker_Seg_Measure\n"
                "\tParameter <face_axis> must take on a value of 'x' or 'y'\n"
                "\tGot value %c\n",face_axis);
      }
    }

    // Mean curvature
    // --------------
    // Use the most naive of integration schemes
    //
    // kappa(t) = (x'y'' - y'x'') / (x'^2 + y'^2)^(3/2); numerator and
    // denominator polynomials are built with polymul/polysub/polyadd and
    // evaluated at the fitted samples through a Vandermonde matrix product.
    { double *V = workspace; // done with workspace, so reuse it for vandermonde matrix (just alias it here)
      static double *evalnum = NULL,
                    *evalden = NULL;
      static size_t evalnum_size = 0,
                    evalden_size = 0;
      size_t npoints = len-2*pad;

      evalnum = request_storage( evalnum, &evalnum_size, sizeof(double), npoints, "numerator" );
      evalden = request_storage( evalden, &evalden_size, sizeof(double), npoints, "denominator" );

      Vandermonde_Build( t+pad, npoints, 2*MEASURE_POLY_FIT_DEGREE, V ); // used for polynomial evaluation

      // numerator
      memset( mul1, 0, 2*MEASURE_POLY_FIT_DEGREE*sizeof(double) );
      memset( mul2, 0, 2*MEASURE_POLY_FIT_DEGREE*sizeof(double) );
      polymul( xp, MEASURE_POLY_FIT_DEGREE+1,
               ypp, MEASURE_POLY_FIT_DEGREE+1,
               mul1 );
      polymul( yp, MEASURE_POLY_FIT_DEGREE+1,
               xpp, MEASURE_POLY_FIT_DEGREE+1,
               mul2 );
      polysub( mul1, 2*MEASURE_POLY_FIT_DEGREE,
               mul2, 2*MEASURE_POLY_FIT_DEGREE,
               num );

      // denominator
      memset( mul1, 0, 2*MEASURE_POLY_FIT_DEGREE*sizeof(double) );
      memset( mul2, 0, 2*MEASURE_POLY_FIT_DEGREE*sizeof(double) );
      polymul( xp, MEASURE_POLY_FIT_DEGREE+1,
               xp, MEASURE_POLY_FIT_DEGREE+1,
               mul1 );
      polymul( yp, MEASURE_POLY_FIT_DEGREE+1,
               yp, MEASURE_POLY_FIT_DEGREE+1,
               mul2 );
      polyadd( mul1, 2*MEASURE_POLY_FIT_DEGREE,
               mul2, 2*MEASURE_POLY_FIT_DEGREE,
               den );

      // Eval
      matmul( V,   npoints,                   MEASURE_POLY_FIT_DEGREE*2,
              num, MEASURE_POLY_FIT_DEGREE*2, 1,
              evalnum );
      matmul( V,   npoints,                   MEASURE_POLY_FIT_DEGREE*2,
              den, MEASURE_POLY_FIT_DEGREE*2, 1,
              evalden );

      // compute kappa at each t
      // NOTE(review): evalnum[i] was evaluated at t[pad+i], but the step
      // widths below use t[i]-t[i-1] counted from the start of t -- confirm
      // this offset is intended. The signed int i is also compared against
      // the size_t npoints.
      { int i;
        for(i=0; i<npoints; i++ )
          evalnum[i] /= pow( evalden[i], 3.0/2.0 )*dx; //dx is 1 or -1 so dx = 1/dx;
        mean_curvature = evalnum[0] * (t[1]-t[0]);
        for(i=1; i<npoints; i++ )
          mean_curvature += evalnum[i] * ( t[i]-t[i-1] );
      }
    }
  }

  // fill in fields
  dest[0] = path_length;
  dest[1] = median_score;
  dest[2] = root_angle_deg;
  dest[3] = mean_curvature;
  dest[4] = follicle_x;
  dest[5] = follicle_y;
  dest[6] = tip_x;
  dest[7] = tip_y;
}
void matmul_mpi(float* A, float* B, float* C, int n){ int rank, nodes_n, row_per_process, i, j, k, start_row, end_row, child_start_row, child_end_row; MPI_Request request_A[32]; MPI_Request request_B[32]; MPI_Request request_C[32]; MPI_Status status; MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &nodes_n); row_per_process = n / nodes_n; if(n % nodes_n)row_per_process ++; nodes_n = n / row_per_process; if(n % row_per_process)nodes_n ++; start_row = rank * row_per_process; end_row = (rank + 1) * row_per_process; if(start_row >= n) return; if(end_row > n)end_row = n; if(rank == 0) { // 0 ~ row_per_process are mine for(i = 1; i < nodes_n;i++){ child_start_row = i * row_per_process; child_end_row = (i+1) * row_per_process; if(child_end_row > n) child_end_row = n; MPI_Isend(A + child_start_row * n, (child_end_row - child_start_row) * n, MPI_FLOAT, i, SEND_DATA_TAG, MPI_COMM_WORLD, &request_A[i]); MPI_Isend(B, n * n, MPI_FLOAT, i, SEND_DATA_TAG, MPI_COMM_WORLD, &request_B[i]); fprintf(stderr, "0: %d 요청 시작\n", i); } } else { A = (float *)malloc(sizeof(float) * (end_row - start_row) * n); B = (float *)malloc(sizeof(float) * n * n); MPI_Recv(A, (end_row - start_row) * n, MPI_FLOAT, 0, SEND_DATA_TAG, MPI_COMM_WORLD, &status); MPI_Recv(B, n * n, MPI_FLOAT, 0, SEND_DATA_TAG, MPI_COMM_WORLD, &status); fprintf(stderr, "%d: 0으로 부터 받음\n", rank); } //자기것 계산하기 if(C == NULL) { C = (float *)malloc(sizeof(float) * (end_row - start_row) * n); memset(C, 0, sizeof(float) * (end_row - start_row) * n); } else { memset(C, 0, sizeof(float) * n * n); } matmul(A, B, C, end_row-start_row, n, n); /* for(i = 0; i < end_row - start_row; ++i){ for(j = 0 ; j < n ;++j){ for(k = 0 ; k < n ;++k){ C[i * n + j] += A[i * n + k] * B[k * n + j]; } } } */ // 계산 완료 if(rank == 0) { // 호스트면 보낸거 확인하고 데이터 합치기 for(i = 1; i < nodes_n ; i++){ child_start_row = i * row_per_process; MPI_Wait(&request_A[i], &status); MPI_Wait(&request_B[i], &status); fprintf(stderr, "0: %d 로 보내기 완료\n", i); } for(i = 
1; i < nodes_n ; i++){ child_start_row = i * row_per_process; child_end_row = (i+1) * row_per_process; if(child_end_row > n) child_end_row = n; MPI_Irecv(C + child_start_row * n, (child_end_row - child_start_row) * n, MPI_FLOAT, i, SEND_ANSWER_TAG, MPI_COMM_WORLD, &request_C[i]); //fprintf(stderr, "%d\n",(child_end_row - child_start_row) * n); fprintf(stderr, "0: %d 로부터 답변 받기 요청 완료\n", i); } long t = clock(); for(i = 1; i < nodes_n ; i++){ MPI_Wait(&request_C[i], &status); fprintf(stderr, "0: %d 로부터 답변 받기 완료\n", i); } fprintf(stderr, "%f\n", ((double) clock() - t) / CLOCKS_PER_SEC); fprintf(stderr, "0: 계산 및 답변받기 모두 완료\n"); } else { // 아니면 보내기 //fprintf(stderr, "%d\n",(end_row - start_row) * n); MPI_Send(C, (end_row - start_row) * n, MPI_FLOAT, 0, SEND_ANSWER_TAG, MPI_COMM_WORLD); free(A); free(B); free(C); fprintf(stderr, "%d: 계산 및 보내기 모두 완료\n", rank); } }
// Returns the product of the transposed box tensor with the vector d,
// which (as the name indicates) maps scaled coordinates to real ones.
Vector Pbc::scaledToReal(const Vector&d)const {
  const auto boxTransposed = box.transpose();
  return matmul(boxTransposed, d);
}
/* orient() calculates orientation of the camera, updating its calibration structure using the definitions and algorithms well described in [1]. Arguments: Calibration* cal_in - camera calibration object control_par *cpar - control parameters int nfix - number of 3D known points vec3d fix[] - each of nfix items is one 3D position of known point on the calibration object. target pix[] - image coordinates corresponding to each point in ``fix``. can be obtained from the set of detected 2D points using sortgrid(). The points which are associated with fix[] have real pointer (.pnr attribute), others have -999. orient_par flags - structure of all the flags of the parameters to be (un)changed, read from orient.par parameter file using read_orient_par(), defaults are zeros except for x_scale which is by default 1. Output: Calibration *cal_in - if the orientation routine converged, this structure is updated, otherwise, returned untouched. The routine works on a copy of the calibration structure, cal. double sigmabeta[] - array of deviations for each of the interior and exterior parameters and glass interface vector (19 in total). Returns: On success, a pointer to an array of residuals. For each observation point i = 0..n-1, residual 2*i is the Gauss-Markof residual for the x coordinate and residual 2*i + 1 is for the y. Then come 10 cells with the delta between initial guess and final solution for internal and distortion parameters, which are also part of the G-M model and described in it. On failure returns NULL. 
*/ double* orient (Calibration* cal_in, control_par *cpar, int nfix, vec3d fix[], target pix[], orient_par *flags, double sigmabeta[20]) { int i,j,n, itnum, stopflag, n_obs=0, maxsize; double ident[IDT], XPX[NPAR][NPAR], XPy[NPAR], beta[NPAR], omega=0; double xp, yp, xpd, ypd, xc, yc, r, qq, p, sumP; int numbers; double al,be,ga,nGl,e1_x,e1_y,e1_z,e2_x,e2_y,e2_z,safety_x,safety_y,safety_z; double *P, *y, *yh, *Xbeta, *resi; vec3d glass_dir, tmp_vec, e1, e2; Calibration *cal; /* small perturbation for translation/rotation in meters and in radians */ double dm = 0.00001, drad = 0.0000001; cal = malloc (sizeof (Calibration)); memcpy(cal, cal_in, sizeof (Calibration)); maxsize = nfix*2 + IDT; P = (double *) calloc(maxsize, sizeof(double)); y = (double *) calloc(maxsize, sizeof(double)); yh = (double *) calloc(maxsize, sizeof(double)); Xbeta = (double *) calloc(maxsize, sizeof(double)); resi = (double *) calloc(maxsize, sizeof(double)); double (*X)[NPAR] = malloc(sizeof (*X) * maxsize); double (*Xh)[NPAR] = malloc(sizeof (*Xh) * maxsize); for(i = 0; i < maxsize; i++) { for(j = 0; j < NPAR; j++) { X[i][j] = 0.0; Xh[i][j] = 0.0; } y[i] = 0; P[i] = 1; } for(i = 0; i < NPAR; i++) sigmabeta[j] = 0.0; if(flags->interfflag){ numbers = 18; } else{ numbers = 16; } vec_set(glass_dir, cal->glass_par.vec_x, cal->glass_par.vec_y, cal->glass_par.vec_z); nGl = vec_norm(glass_dir); e1_x = 2*cal->glass_par.vec_z - 3*cal->glass_par.vec_x; e1_y = 3*cal->glass_par.vec_x - 1*cal->glass_par.vec_z; e1_z = 1*cal->glass_par.vec_y - 2*cal->glass_par.vec_y; vec_set(tmp_vec, e1_x, e1_y, e1_z); unit_vector(tmp_vec, e1); e2_x = e1_y*cal->glass_par.vec_z - e1_z*cal->glass_par.vec_x; e2_y = e1_z*cal->glass_par.vec_x - e1_x*cal->glass_par.vec_z; e2_z = e1_x*cal->glass_par.vec_y - e1_y*cal->glass_par.vec_y; vec_set(tmp_vec, e2_x, e2_y, e2_z); unit_vector(tmp_vec, e2); al = 0; be = 0; ga = 0; /* init identities */ ident[0] = cal->int_par.cc; ident[1] = cal->int_par.xh; ident[2] = cal->int_par.yh; 
ident[3] = cal->added_par.k1; ident[4] = cal->added_par.k2; ident[5] = cal->added_par.k3; ident[6] = cal->added_par.p1; ident[7] = cal->added_par.p2; ident[8] = cal->added_par.scx; ident[9] = cal->added_par.she; safety_x = cal->glass_par.vec_x; safety_y = cal->glass_par.vec_y; safety_z = cal->glass_par.vec_z; /* main loop, program runs through it, until none of the beta values comes over a threshold and no more points are thrown out because of their residuals */ itnum = 0; stopflag = 0; while ((stopflag == 0) && (itnum < NUM_ITER)) { itnum++; for (i = 0, n = 0; i < nfix; i++) { /* check for correct correspondence note that we do not use anymore pointer in fix, the points are read by the order of appearance and if we want to use every other point we use 'i', just check it is not -999 */ if(pix[i].pnr != i) continue; switch (flags->useflag) { case 1: if ((i % 2) == 0) continue; break; case 2: if ((i % 2) != 0) continue; break; case 3: if ((i % 3) == 0) continue; break; } /* get metric flat-image coordinates of the detected point */ pixel_to_metric (&xc, &yc, pix[i].x, pix[i].y, cpar); correct_brown_affin (xc, yc, cal->added_par, &xc, &yc); /* Projected 2D position on sensor of corresponding known point */ rotation_matrix(&(cal->ext_par)); img_coord (fix[i], cal, cpar->mm, &xp, &yp); /* derivatives of distortion parameters */ r = sqrt (xp*xp + yp*yp); X[n][7] = cal->added_par.scx; X[n+1][7] = sin(cal->added_par.she); X[n][8] = 0; X[n+1][8] = 1; X[n][9] = cal->added_par.scx * xp * r*r; X[n+1][9] = yp * r*r; X[n][10] = cal->added_par.scx * xp * pow(r,4.0); X[n+1][10] = yp * pow(r,4.0); X[n][11] = cal->added_par.scx * xp * pow(r,6.0); X[n+1][11] = yp * pow(r,6.0); X[n][12] = cal->added_par.scx * (2*xp*xp + r*r); X[n+1][12] = 2 * xp * yp; X[n][13] = 2 * cal->added_par.scx * xp * yp; X[n+1][13] = 2*yp*yp + r*r; qq = cal->added_par.k1*r*r; qq += cal->added_par.k2*pow(r,4.0); qq += cal->added_par.k3*pow(r,6.0); qq += 1; X[n][14] = xp * qq + cal->added_par.p1 * (r*r + 
2*xp*xp) + \ 2*cal->added_par.p2*xp*yp; X[n+1][14] = 0; X[n][15] = -cos(cal->added_par.she) * yp; X[n+1][15] = -sin(cal->added_par.she) * yp; /* numeric derivatives of projection coordinates over external parameters, 3D position and the angles */ num_deriv_exterior(cal, cpar, dm, drad, fix[i], X[n], X[n + 1]); /* Num. deriv. of projection coords over sensor distance from PP */ cal->int_par.cc += dm; rotation_matrix(&(cal->ext_par)); img_coord (fix[i], cal, cpar->mm, &xpd, &ypd); X[n][6] = (xpd - xp) / dm; X[n+1][6] = (ypd - yp) / dm; cal->int_par.cc -= dm; /* ditto, over water-glass-air interface position vector */ al += dm; cal->glass_par.vec_x += e1[0]*nGl*al; cal->glass_par.vec_y += e1[1]*nGl*al; cal->glass_par.vec_z += e1[2]*nGl*al; img_coord (fix[i], cal, cpar->mm, &xpd, &ypd); X[n][16] = (xpd - xp) / dm; X[n+1][16] = (ypd - yp) / dm; al -= dm; cal->glass_par.vec_x = safety_x; cal->glass_par.vec_y = safety_y; cal->glass_par.vec_z = safety_z; be += dm; cal->glass_par.vec_x += e2[0]*nGl*be; cal->glass_par.vec_y += e2[1]*nGl*be; cal->glass_par.vec_z += e2[2]*nGl*be; img_coord (fix[i], cal, cpar->mm, &xpd, &ypd); X[n][17] = (xpd - xp) / dm; X[n+1][17] = (ypd - yp) / dm; be -= dm; cal->glass_par.vec_x = safety_x; cal->glass_par.vec_y = safety_y; cal->glass_par.vec_z = safety_z; ga += dm; cal->glass_par.vec_x += cal->glass_par.vec_x*nGl*ga; cal->glass_par.vec_y += cal->glass_par.vec_y*nGl*ga; cal->glass_par.vec_z += cal->glass_par.vec_z*nGl*ga; img_coord (fix[i], cal, cpar->mm, &xpd, &ypd); X[n][18] = (xpd - xp) / dm; X[n+1][18] = (ypd - yp) / dm; ga -= dm; cal->glass_par.vec_x = safety_x; cal->glass_par.vec_y = safety_y; cal->glass_par.vec_z = safety_z; y[n] = xc - xp; y[n+1] = yc - yp; n += 2; } n_obs = n; /* identities */ for (i = 0; i < IDT; i++) X[n_obs + i][6 + i] = 1; y[n_obs+0] = ident[0] - cal->int_par.cc; y[n_obs+1] = ident[1] - cal->int_par.xh; y[n_obs+2] = ident[2] - cal->int_par.yh; y[n_obs+3] = ident[3] - cal->added_par.k1; y[n_obs+4] = ident[4] - 
cal->added_par.k2; y[n_obs+5] = ident[5] - cal->added_par.k3; y[n_obs+6] = ident[6] - cal->added_par.p1; y[n_obs+7] = ident[7] - cal->added_par.p2; y[n_obs+8] = ident[8] - cal->added_par.scx; y[n_obs+9] = ident[9] - cal->added_par.she; /* weights */ for (i = 0; i < n_obs; i++) P[i] = 1; P[n_obs+0] = ( ! flags->ccflag) ? POS_INF : 1; P[n_obs+1] = ( ! flags->xhflag) ? POS_INF : 1; P[n_obs+2] = ( ! flags->yhflag) ? POS_INF : 1; P[n_obs+3] = ( ! flags->k1flag) ? POS_INF : 1; P[n_obs+4] = ( ! flags->k2flag) ? POS_INF : 1; P[n_obs+5] = ( ! flags->k3flag) ? POS_INF : 1; P[n_obs+6] = ( ! flags->p1flag) ? POS_INF : 1; P[n_obs+7] = ( ! flags->p2flag) ? POS_INF : 1; P[n_obs+8] = ( ! flags->scxflag) ? POS_INF : 1; P[n_obs+9] = ( ! flags->sheflag) ? POS_INF : 1; n_obs += IDT; sumP = 0; for (i = 0; i < n_obs; i++) { /* homogenize */ p = sqrt (P[i]); for (j = 0; j < NPAR; j++) Xh[i][j] = p * X[i][j]; yh[i] = p * y[i]; sumP += P[i]; } /* Gauss Markoff Model it is the least square adjustment of the redundant information contained both in the spatial intersection and the resection, see [1], eq. 23 */ ata ((double *) Xh, (double *) XPX, n_obs, numbers, NPAR ); matinv ((double *) XPX, numbers, NPAR); atl ((double *) XPy, (double *) Xh, yh, n_obs, numbers, NPAR); matmul ((double *) beta, (double *) XPX, (double *) XPy, numbers, numbers,1, NPAR, NPAR); stopflag = 1; for (i = 0; i < numbers; i++) { if (fabs (beta[i]) > CONVERGENCE) stopflag = 0; } if ( ! flags->ccflag) beta[6] = 0.0; if ( ! flags->xhflag) beta[7] = 0.0; if ( ! flags->yhflag) beta[8] = 0.0; if ( ! flags->k1flag) beta[9] = 0.0; if ( ! flags->k2flag) beta[10] = 0.0; if ( ! flags->k3flag) beta[11] = 0.0; if ( ! flags->p1flag) beta[12] = 0.0; if ( ! flags->p2flag) beta[13] = 0.0; if ( ! flags->scxflag)beta[14] = 0.0; if ( ! 
flags->sheflag) beta[15] = 0.0; cal->ext_par.x0 += beta[0]; cal->ext_par.y0 += beta[1]; cal->ext_par.z0 += beta[2]; cal->ext_par.omega += beta[3]; cal->ext_par.phi += beta[4]; cal->ext_par.kappa += beta[5]; cal->int_par.cc += beta[6]; cal->int_par.xh += beta[7]; cal->int_par.yh += beta[8]; cal->added_par.k1 += beta[9]; cal->added_par.k2 += beta[10]; cal->added_par.k3 += beta[11]; cal->added_par.p1 += beta[12]; cal->added_par.p2 += beta[13]; cal->added_par.scx += beta[14]; cal->added_par.she += beta[15]; if (flags->interfflag) { cal->glass_par.vec_x += e1[0]*nGl*beta[16]; cal->glass_par.vec_y += e1[1]*nGl*beta[16]; cal->glass_par.vec_z += e1[2]*nGl*beta[16]; cal->glass_par.vec_x += e2[0]*nGl*beta[17]; cal->glass_par.vec_y += e2[1]*nGl*beta[17]; cal->glass_par.vec_z += e2[2]*nGl*beta[17]; } } /* compute residuals etc. */ matmul ( (double *) Xbeta, (double *) X, (double *) beta, n_obs, numbers, 1, n_obs, NPAR); omega = 0; for (i = 0; i < n_obs; i++) { resi[i] = Xbeta[i] - y[i]; omega += resi[i] * P[i] * resi[i]; } sigmabeta[NPAR] = sqrt (omega / (n_obs - numbers)); for (i = 0; i < numbers; i++) { sigmabeta[i] = sigmabeta[NPAR] * sqrt(XPX[i][i]); } free(X); free(P); free(y); free(Xbeta); free(Xh); if (stopflag){ rotation_matrix(&(cal->ext_par)); memcpy(cal_in, cal, sizeof (Calibration)); return resi; } else { free(resi); return NULL; } }