/*
 * Dense N x N matrix-multiply benchmark.
 *
 * Usage: <program> MatrixSize
 *
 * A is filled with a diagonal holding 0..N-1 and B with the identity, so
 * the product C = A * B is again diagonal. The multiplication is timed with
 * the tick()/tack() pair and, for N < 30, the result is printed.
 *
 * Exits with EXIT_FAILURE on a missing/invalid size or when memory cannot be
 * allocated, EXIT_SUCCESS otherwise.
 */
int main(int argc, char **argv)
{
    double *A, *B, *C;
    int N;
    size_t n, i, j, k;
    double elapsed;

    /* Input data: the size argument is mandatory. */
    if (argc < 2) {
        printf("Usage: %s MatrixSize\n", (argc > 0) ? argv[0] : "matmul");
        exit(EXIT_FAILURE);
    }
    N = atoi(argv[1]);
    if (N <= 0) {
        printf("MatrixSize must be a positive integer!\n");
        exit(EXIT_FAILURE);
    }
    n = (size_t) N;

    /* Do the size arithmetic in size_t; N * N in int overflows for large N. */
    A = (double *) malloc(n * n * sizeof(double));
    B = (double *) malloc(n * n * sizeof(double));
    C = (double *) malloc(n * n * sizeof(double));
    if ((A == NULL) || (B == NULL) || (C == NULL)) {
        printf("Running out of memory!\n");
        exit(EXIT_FAILURE);
    }

    /*
     * Fill matrices: A gets the diagonal 0..N-1, B is the identity, so C
     * should end up with a single major diagonal.
     */
    for (i = 0; i < n; i++) {
        for (j = 0; j < n; j++) {
            A[i + n * j] = (i == j) ? (double) i : 0.0;
            B[i + n * j] = (i == j) ? 1.0 : 0.0;
            C[i + n * j] = 0.0;
        }
    }

    tick();
    /* Hand-written row-major triple loop: C = A * B. */
    for (i = 0; i < n; i++) {
        for (j = 0; j < n; j++) {
            C[j + i * n] = 0.0;
            for (k = 0; k < n; k++) {
                C[j + i * n] += A[k + i * n] * B[j + k * n];
            }
        }
    }
    elapsed = tack();

    printf("%f sec\n", elapsed);

    /* Only small results are worth dumping to the terminal. */
    if (N < 30) {
        printf("C ... \n");
        for (i = 0; i < n; i++) {
            for (j = 0; j < n; j++) {
                printf("%3.1f ", C[i + n * j]);
            }
            printf("\n");
        }
    }

    free(A);
    free(B);
    free(C);
    exit(EXIT_SUCCESS);
}
int main(int argc, char *argv[]) { long i; graph G; double runtime; inputCheck(argc, argv); if(N == 1){ generateTestGraph(&G); } else { generateGraph(N, randInit, &G, 0); } tick(); dijkstra(&G, 0, 0); runtime = tack(); // char *b; b = malloc(G.N * 5); if(b == NULL) {perror("malloc"); exit(EXIT_FAILURE); } sprintf(b,"\nLowest distances!\nD=["); for(i = 0; i<G.N; i++){ sprintf(&b[strlen(b)], "%d,", G.D[i]); } printf("%s]\n", b); printf("Was working for [%f] sec.\n",runtime); return EXIT_SUCCESS; }
void test_get_diff(){ struct timestamp *ts; time_t diff_sec; long diff_nsec; ts = init_timestamp(); /*test < 1sec*/ printf("Test in 500msec\n"); tick(ts); usleep(500000); /*500000usec = 500msec*/ tack(ts); diff_sec = get_diff_sec(ts); diff_nsec = get_diff_nanosec(ts); printf("start=%d:%ld end=%d:%ld\n", (int)(ts->start.tv_sec), ts->start.tv_nsec, (int)(ts->end.tv_sec), ts->end.tv_nsec); printf("diff: %dsec + %ld nanosec\n", (int)diff_sec, diff_nsec); /*test 1-2sec*/ printf("Test in 1500msec\n"); tick(ts); usleep(1000000 + 500000); /*1500000usec = 1500msec*/ tack(ts); diff_sec = get_diff_sec(ts); diff_nsec = get_diff_nanosec(ts); printf("start=%d:%ld end=%d:%ld\n", (int)(ts->start.tv_sec), ts->start.tv_nsec, (int)(ts->end.tv_sec), ts->end.tv_nsec); printf("diff: %dsec + %ld nanosec\n", (int)diff_sec, diff_nsec); /*test > 2sec*/ printf("Test in 3800msec\n"); tick(ts); usleep(3000000 + 800000); /*3800000usec = 3800msec*/ tack(ts); diff_sec = get_diff_sec(ts); diff_nsec = get_diff_nanosec(ts); printf("start=%d:%ld end=%d:%ld\n", (int)(ts->start.tv_sec), ts->start.tv_nsec, (int)(ts->end.tv_sec), ts->end.tv_nsec); printf("diff: %dsec + %ld nanosec\n", (int)diff_sec, diff_nsec); dispose_timestamp(ts); }
int main(int argc, char** argv) { point* points; distance solution; double elapsedTime; points = NULL; inputCheck(argc, argv); printf("Generating [%d] points\n", np); if( generatePoints(np, &points) != EXIT_SUCCESS ){ printf("Generating Points failed!\n"); exit(EXIT_FAILURE); } tick(); mpiInit(argc, argv); if( prepareMPIComm() ) { if(mpi_id == 0) { printf("Starting search ..."); } multiSearch(np, points, &solution); elapsedTime = tack(); //printf("Found Solution a[%f,%f] , b[%f,%f] distance [%0.10f]\n", solution.a.x, solution.a.y, solution.b.x, solution.b.y, solution.d); if(mpi_id == 0){ printf("Completed Search and found closest points at [%g, %g] , [%g, %g] with a distance of [%g]\n", mpi_id\ , solution.a.x ,solution.a.y, solution.b.x, solution.b.y , solution.d); printf("Operation took %f seconds \n", elapsedTime); free(points); } } else { } mpiFinish(); exit(EXIT_SUCCESS); }
/*
 * Runs dijkstra() on G from node 0 with whatever OpenMP schedule is
 * currently active. Only the process with mpi_id 0 starts/stops the timer
 * and prints the elapsed time.
 */
static void runTimedDijkstra(graph *G, char debug)
{
    double seconds;

    if (mpi_id == 0) {
        tick();
    }

    dijkstra(G, 0, debug);

    if (mpi_id == 0) {
        seconds = tack();
        printf("working for [%f] sec.\n", seconds);
    }
}

/*
 * Times dijkstra() under the static, dynamic and guided OpenMP schedules,
 * in that order, each with a chunk size of G->N/100. The graph is reset
 * before every run and the thread count is capped at nThreads.
 */
void testScheduler(int nThreads, graph* G, char debug)
{
    /* Set max nThreads */
    omp_set_num_threads(nThreads);

    /* static schedule */
    if (mpi_id == 0) {
        printf("Scheduler (Static, %d)", G->N/100 );
    }
    resetGraph(G);
    omp_set_schedule(omp_sched_static, G->N/100);
    runTimedDijkstra(G, debug);

    /* dynamic schedule */
    if (mpi_id == 0) {
        printf("Scheduler (dynamic, %d)", G->N/100 );
    }
    resetGraph(G);
    omp_set_schedule(omp_sched_dynamic, G->N/100);
    runTimedDijkstra(G, debug);

    /* guided schedule */
    if (mpi_id == 0) {
        printf("Scheduler (guided, %d)", G->N/100 );
    }
    resetGraph(G);
    omp_set_schedule(omp_sched_guided, G->N/100);
    runTimedDijkstra(G, debug);
}
/*
 * Recursively enumerates every index combination described by `n`, an
 * array of counts terminated by a 0 entry.
 *
 * For each level, every i in [0, n[0]) is handed to tack(b, i) and the
 * pointer it returns becomes the write position for the next level
 * (presumably tack() writes a rendering of i at b -- confirm against its
 * definition). When the terminating 0 is reached the string is closed at
 * `b` and the file-level buffer `buf` is printed.
 */
static void loop( int *n, char *b )
{
    int idx = 0;

    if (n[0] != 0) {
        while (idx < n[0]) {
            loop(n + 1, tack(b, idx));
            idx++;
        }
        return;
    }

    /* End of the count list: terminate and emit the assembled line. */
    *b = '\0';
    puts(buf);
}
/*
 * Smoke test for tick()/tack(): takes a stamp around a 1.5 second sleep and
 * prints the recorded start and end times. The whole-second difference is
 * computed but not checked or printed.
 */
void test_tick_tack(void){
    struct timestamp *stamp = init_timestamp();
    time_t diff;

    assert(stamp != NULL);

    tick(stamp);
    usleep(1000000 + 500000);
    tack(stamp);

    diff = stamp->end.tv_sec - stamp->start.tv_sec;

    printf("start=%d:%ld end=%d:%ld\n",
           (int)(stamp->start.tv_sec), stamp->start.tv_nsec,
           (int)(stamp->end.tv_sec), stamp->end.tv_nsec);

    dispose_timestamp(stamp);
}
main (int argc, char *argv[]) { struct em_file inputdata1; struct em_file inputdata2; struct em_file inputdata3; struct em_file inputdata4; struct em_file outputdata; fftw_real *Vol_tmpl_sort, *Volume, *e3, *PointCorr, *sqconv; fftw_complex *C3, *PointVolume, *PointSq; rfftwnd_plan p3, pi3, r3, ri3; fftw_real scale; struct tm *zeit; struct tm start; char name[200]; int Rx_max, Ry_max, Rz_max; int Rx_min, Ry_min, Rz_min; int Vx_min, Vy_min, Vz_min; int Vx_max, Vy_max, Vz_max; float Phi, Psi, Theta, winkel_lauf; float *Rot_tmpl, *Vol_tmpl; int i, j, k, tmpx, tmpy, tmpz,lauf_pe, ksub; int ijk; int lauf, n; float max, eps; time_t lt; float Ctmp, Ctmpim, Dtmp, Dtmpim; int dim_fft; int sub[3],range[3],range_sub[3],subc[3],offset[3],dimarray[3]; int FullVolume_dims[3]; int nr[3]; int area[3]; /* MPI Variablen */ int winkel_max, winkel_min; int winkel_max_pe, winkel_min_pe; int winkel_step_pe; int Phi_max, Psi_max, Theta_max; int Phi_min, Psi_min, Theta_min; int Phi_step, Psi_step, Theta_step; int Theta_winkel_start, Psi_winkel_start, Phi_winkel_start; int Theta_winkel_nr, Psi_winkel_nr, Phi_winkel_nr; int Theta_winkel_end, Psi_winkel_end, Phi_winkel_end; int Theta_steps, Psi_steps, Phi_steps; float Theta_winkel_rest_nr, Psi_winkel_rest_nr, Phi_winkel_rest_nr; int in_max; float rms_wedge, tempccf; float *Ergebnis, *conv; float cycles; int cycle; /* MPI Variablen Ende*/ if (argc < 15) { printf ("\n\n"); printf (" 'OSCAR' is an Optimized SCanning AlgoRithm for \n"); printf (" local correlation.\n"); printf (" All files in EM-V-int4 format !!!\n\n"); printf (" Input: Volume to be searched, Template mask for local \n "); printf (" correlation, pointspread function and angular search \n"); printf (" range. \n"); printf (" Output: locally normalized X-Correlation Function Out.ccf.norm, \n"); printf (" non-normalized X-Correlation Function Out.ccf, and Out.ang \n"); printf (" with the corresponding angles.\n\n"); printf (" usage: oscar Volume Template Out ...\n"); printf (" ... 
Phi_min Phi_max Phi_step Psi_min Psi_max Psi_step The_min The_max The_step\n"); printf (" ... Poinspread-function mask-file dim_of_fft\n\n"); printf (" with Message Passing Interface (MPI)\n"); printf (" the total number of angles must be modulo\n"); printf (" of used processors!\n\n"); printf (" Linux: 1.'lamboot' to start MPI\n"); printf (" 2.'mpirun -np 2 oscar Volume Templ Out 30 180 30 30 180 30 30 180 30 Poinspread-function mask-file 256'\n\n"); printf (" In this version asymmetric masks can be used ! \n"); printf (" last revision , 11.11.03, Friedrich Foerster"); printf (" \n\n"); exit (1); } MPI_Init (&argc, &argv); MPI_Comm_size (MPI_COMM_WORLD, &mysize); MPI_Comm_rank (MPI_COMM_WORLD, &myrank); /* Dimensionen auslesen */ // Dimension of fft dim_fft = atoi (argv[15]); nr[0]=1; nr[1]=1; nr[2]=1; area[0]=dim_fft; area[1]=dim_fft; area[2]=dim_fft; read_em_header(argv[1], &inputdata1); /* Searchvolume */ read_em (argv[2], &inputdata2); /* Template */ FullVolume_dims[0]=inputdata1.dims[0]; FullVolume_dims[1]=inputdata1.dims[1]; FullVolume_dims[2]=inputdata1.dims[2]; Rx_min = 1; Ry_min = 1; Rz_min = 1; Rx_max = (inputdata2.dims[0]); Ry_max = (inputdata2.dims[1]); Rz_max = (inputdata2.dims[2]); Vx_min = 1; Vy_min = 1; Vz_min = 1; Vx_max = dim_fft; Vy_max = dim_fft; Vz_max = dim_fft; p3 = rfftw3d_create_plan (Vx_max, Vy_max, Vz_max, FFTW_REAL_TO_COMPLEX, FFTW_MEASURE | FFTW_IN_PLACE); /*FFTW_ESTIMATE FFTW_MEASURE */ pi3 = rfftw3d_create_plan (Vx_max, Vy_max, Vz_max, FFTW_COMPLEX_TO_REAL, FFTW_MEASURE | FFTW_IN_PLACE); r3 = rfftw3d_create_plan (Rx_max, Rx_max, Rx_max, FFTW_REAL_TO_COMPLEX, FFTW_MEASURE | FFTW_IN_PLACE); /*FFTW_ESTIMATE FFTW_MEASURE */ ri3 = rfftw3d_create_plan (Rx_max, Rx_max, Rx_max, FFTW_COMPLEX_TO_REAL, FFTW_MEASURE | FFTW_IN_PLACE); if (myrank == 0) { printf("Plans for FFTW created \n");fflush(stdout); } Volume = (fftw_real *) calloc (Vx_max * Vx_max * 2 * (Vx_max / 2 + 1),sizeof (fftw_real) ); Rot_tmpl = (float *) malloc (sizeof (float) * 
Rx_max * Ry_max * Rz_max); Vol_tmpl = (float *) malloc (sizeof (float) * Vx_max * Vy_max * Vz_max); conv = (float *) malloc (sizeof (float) * Vx_max * Vy_max * Vz_max); sqconv = (fftw_real *) calloc(Vz_max * Vy_max * 2 * (Vx_max / 2 + 1), sizeof (fftw_real)); if (! (inputdata1.floatdata = (float *) malloc (sizeof (float) * Vx_max * Vy_max * Vz_max))) { printf ("Memory allocation failure in inputdata1.floatdata!!!"); fflush (stdout); exit (1); } if (! (outputdata.floatdata = (float *) malloc (sizeof (float) * Vx_max * Vy_max * Vz_max))) { printf ("Memory allocation failure in outputdata.floatdata!!!"); fflush (stdout); exit (1); } if (! (Vol_tmpl_sort = (fftw_real *) calloc (Vz_max*Vy_max*2*(Vx_max / 2 + 1), sizeof (fftw_real) ))) { printf ("Memory allocation failure in Volume_tmpl_sort!!!"); printf ("Nx = %i, Ny = %i, Nz = %i, bytes = %i \n",2 *(Vx_max / 2 + 1),Vy_max, Vz_max, sizeof (fftw_real)); fflush (stdout); exit (1); } Ergebnis = (float *) calloc (Vz_max * Vy_max * Vx_max, sizeof (float)); /* Winkelraum */ Phi_min = atof (argv[4]); Phi_max = atof (argv[5]); Phi_step = atof (argv[6]); Psi_min = atof (argv[7]); Psi_max = atof (argv[8]); Psi_step = atof (argv[9]); Theta_min = atof (argv[10]); Theta_max = atof (argv[11]); Theta_step = atof (argv[12]); /* Pointspread Function*/ read_em (argv[13], &inputdata3); /* mask function */ read_em (argv[14], &inputdata4); Phi_steps = (Phi_max - Phi_min) / Phi_step + 1; Psi_steps = (Psi_max - Psi_min) / Psi_step + 1; Theta_steps = (Theta_max - Theta_min) / Theta_step + 1; winkel_max = Phi_steps * Psi_steps * Theta_steps; winkel_min = 0; range[0]=dim_fft-1; range[1]=dim_fft-1; range[2]=dim_fft-1; range_sub[0]=range[0]-Rx_max; range_sub[1]=range[1]-Rx_max; range_sub[2]=range[2]-Rx_max; sub[0]=1; sub[1]=1; sub[2]=1; cycles=(int)(FullVolume_dims[2]/(dim_fft-Rx_max)+0.5); cycles=(int)(FullVolume_dims[1]/(dim_fft-Rx_max)+0.5)*cycles; cycles=(int)(FullVolume_dims[0]/(dim_fft-Rx_max)+0.5)*cycles; cycle=0; if (myrank == 0) { printf 
("\n oscar starts to run ... ");tack (&start);fflush (stdout); /* prepare Output */ strcpy (name, argv[3]); strcat (name, ".ccf"); printf ("\nCreate outputfile: %s ... \n", name);fflush(stdout); create_em (name, FullVolume_dims); strcpy (name, argv[3]); strcat (name, ".ang"); printf ("Create outputfile: %s ... \n", name);fflush(stdout); create_em (name, FullVolume_dims); strcpy (name, argv[3]); strcat (name, ".ccf.norm"); printf ("Create outputfile: %s ... \n", name);fflush(stdout); create_em (name, FullVolume_dims); } for (sub[2]=1; sub[2] < FullVolume_dims[2]-Rz_max;sub[2]=sub[2]+dim_fft-Rz_max) { if (myrank == 0) { tack (&start); printf ("%f%%..", (float) (cycle / cycles * 100)); fflush (stdout); } for (sub[1]=1; sub[1] < FullVolume_dims[1]-Ry_max;sub[1]=sub[1]+dim_fft-Ry_max) { for (sub[0]=1; sub[0] < FullVolume_dims[0]-Rx_max;sub[0]=sub[0]+dim_fft-Rx_max) { cycle=cycle+1; subc[0]=sub[0]; subc[1]=sub[1]; subc[2]=sub[2]; if (sub[2] + range[2] > FullVolume_dims[2]) subc[2]=FullVolume_dims[2]-range[2]; /* we are at the corner ?!*/ if (sub[1] + range[1] > FullVolume_dims[1]) subc[1]=FullVolume_dims[1]-range[1]; /* we are at the corner ?!*/ if (sub[0] + range[0] > FullVolume_dims[0]) subc[0]=FullVolume_dims[0]-range[0]; /* we are at the corner ?!*/ read_em_subregion (argv[1], &inputdata1,subc,range); read_em_subregion (argv[1], &outputdata,subc,range); /* Umsortieren der Daten */ lauf = 0; for (k = 0; k < Vz_max; k++) { for (j = 0; j < Vy_max; j++) { for (i = 0; i < Vx_max; i++) { /* square - needed for normalization */ sqconv[i + 2 * (Vx_max / 2 + 1) * (j + Vy_max * k)] = inputdata1.floatdata[lauf]*inputdata1.floatdata[lauf]; Volume[i + 2 * (Vx_max / 2 + 1) * (j + Vy_max * k)] = inputdata1.floatdata[lauf]; inputdata1.floatdata[lauf] = -1.0; /* kleine Zahl wg Max-Op , hier kommen die CCFs rein*/ outputdata.floatdata[lauf] = -1.0; /* hier kommen die Winkel rein*/ lauf++; } } } rfftwnd_one_real_to_complex (p3, &Volume[0], NULL); /* einmalige fft von Suchvolumen */ 
rfftwnd_one_real_to_complex (p3, &sqconv[0], NULL); /* FFT of square*/ winkel_step_pe = (int) winkel_max / mysize; winkel_min_pe = myrank * winkel_step_pe; winkel_max_pe = winkel_min_pe + winkel_step_pe; Theta_winkel_nr = (int) winkel_min_pe / (Psi_steps * Phi_steps); Theta_winkel_rest_nr = winkel_min_pe - Theta_winkel_nr * (Psi_steps * Phi_steps); Psi_winkel_nr = (int) Theta_winkel_rest_nr / (Phi_steps); Psi_winkel_rest_nr = Theta_winkel_rest_nr - Psi_winkel_nr * (Phi_steps); Phi_winkel_nr = (int) Psi_winkel_rest_nr; Theta = Theta_winkel_nr * Theta_step + Theta_min; Phi = Phi_winkel_nr * Phi_step + Phi_min - Phi_step; Psi = Psi_winkel_nr * Psi_step + Psi_min; eps = 0.001; n = 0; //Friedrich -> Zaehlung der voxels n = countvoxel(inputdata4.dims[0], inputdata4.floatdata, eps); eps = 0.001; for (winkel_lauf = winkel_min_pe; winkel_lauf < winkel_max_pe;winkel_lauf++) { if (Phi < Phi_max) Phi = Phi + Phi_step; else { Phi = Phi_min; Psi = Psi + Psi_step; } if (Psi > Psi_max) { Psi = Psi_min; Theta = Theta + Theta_step; } tom_rotate3d (&Rot_tmpl[0], &inputdata2.floatdata[0], Phi, Psi, Theta, Rx_max, Ry_max, Rz_max); /*calculate Ref variance */ rms_wedge = energizer (Rx_min, Rx_max, n, &Rot_tmpl[0], &inputdata3.floatdata[0], &inputdata4.floatdata[0], r3, ri3); pastes (&Rot_tmpl[0], &Vol_tmpl[0], 1, 1, 1, Rx_max, Ry_max, Rz_max, Vx_max); scale = 1.0 / ((double)Vx_max * (double)Vy_max * (double)Vz_max * ((double) rms_wedge) ); //printf("hippo1: scale = %.10f \n",scale); sort4fftw(&Vol_tmpl_sort[0],&Vol_tmpl[0],Vx_max, Vy_max, Vz_max); rfftwnd_one_real_to_complex (p3, &Vol_tmpl_sort[0], NULL); PointVolume = (fftw_complex *) & Volume[0]; C3 = (fftw_complex *) & Vol_tmpl_sort[0]; /* Correlation */ correl(&PointVolume[0], &C3[0], Vx_max, Vy_max, Vz_max, scale); /* back to real space */ rfftwnd_one_complex_to_real (pi3, &C3[0], NULL); PointCorr = (fftw_real *) & C3[0]; /* Umsortieren der Daten */ sortback4fftw( &PointCorr[0], &Ergebnis[0], Vx_max, Vy_max, Vz_max); // crossen 
cross(&Ergebnis[0], Vx_max); /* 3rd: divide */ lauf = 0; for (k = 0 ; k < Vz_max ; k++) { for (j = 0; j < Vy_max; j++) { for (i = 0; i < Vx_max; i++) { if (inputdata1.floatdata[lauf] < Ergebnis[lauf] ) { inputdata1.floatdata[lauf] = Ergebnis[lauf]; outputdata.floatdata[lauf] = (int) winkel_lauf; } lauf++; } } } } /* Ende winkel_lauf */ //FF MPI_Barrier (MPI_COMM_WORLD); /* Ergebnisse einsammeln (myrank 0)*/ if (myrank == 0) { for (lauf_pe = 1; lauf_pe < mysize; lauf_pe++) { MPI_Recv (&Ergebnis[0], Vx_max * Vy_max * Vz_max, MPI_FLOAT, lauf_pe, 99, MPI_COMM_WORLD, &status); MPI_Recv (&conv[0], Vx_max * Vy_max * Vz_max, MPI_FLOAT, lauf_pe, 98, MPI_COMM_WORLD, &status); /* use conv as temporary memory for angles */ for (lauf = 0; lauf < Vx_max * Vy_max * Vz_max; lauf++) { if (inputdata1.floatdata[lauf] < Ergebnis[lauf]) { inputdata1.floatdata[lauf] = Ergebnis[lauf]; outputdata.floatdata[lauf] = conv[lauf]; } } } /*Ergebnisse eingesammelt */ } // myrank > 0: Ergebnisse senden else { MPI_Send (inputdata1.floatdata, Vx_max * Vy_max * Vz_max, MPI_FLOAT, 0, 99, MPI_COMM_WORLD); MPI_Send (outputdata.floatdata, Vx_max * Vy_max * Vz_max, MPI_FLOAT, 0, 98, MPI_COMM_WORLD); } MPI_Barrier (MPI_COMM_WORLD); // nicht normalisiertes Volumen und Winkel rausschreiben subc[0]=subc[0]+Rx_max/2; subc[1]=subc[1]+Rx_max/2; subc[2]=subc[2]+Rx_max/2; if (myrank==0) { offset[0]=Rx_max/2; offset[1]=Rx_max/2; offset[2]=Rx_max/2; dimarray[0]=dim_fft; dimarray[1]=dim_fft; dimarray[2]=dim_fft; strcpy (name, argv[3]); strcat (name, ".ccf"); write_em_subsubregion (name, &inputdata1,subc,range_sub,offset,dimarray); strcpy (name, argv[3]); strcat (name, ".ang"); write_em_subsubregion (name, &outputdata,subc,range_sub,offset,dimarray); /* ------------------- normalization - here only PE 0 ---------- */ pastes (&inputdata4.floatdata[0], &Vol_tmpl[0], 1, 1, 1, Rx_max, Ry_max, Rz_max, Vx_max); /* paste mask into zero volume*/ /* 1st local mean */ sort4fftw(&Vol_tmpl_sort[0], &Vol_tmpl[0], Vx_max, Vy_max, 
Vz_max); rfftwnd_one_real_to_complex (p3, &Vol_tmpl_sort[0], NULL); C3 = (fftw_complex *) & Vol_tmpl_sort[0]; /* Convolution of volume and mask */ scale = 1.0 / ((double)Vx_max * (double)Vy_max * (double)Vz_max ); convolve( &PointVolume[0], &C3[0], Vx_max, Vy_max, Vz_max, scale); rfftwnd_one_complex_to_real (pi3, &C3[0], NULL); PointCorr = (fftw_real *) & C3[0]; /* Umsortieren der Daten */ sortback4fftw( &PointCorr[0], &conv[0], Vx_max, Vy_max, Vz_max); /* 2nd : convolution of square and resorting*/ pastes (&inputdata4.floatdata[0], &Vol_tmpl[0], 1, 1, 1, Rx_max, Ry_max, Rz_max, Vx_max); /* paste mask into zero volume*/ sort4fftw( &Vol_tmpl_sort[0], &Vol_tmpl[0], Vx_max, Vy_max, Vz_max); rfftwnd_one_real_to_complex (p3, &Vol_tmpl_sort[0], NULL); C3 = (fftw_complex *) & Vol_tmpl_sort[0]; PointSq = (fftw_complex *) & sqconv[0];// set pointer to FFT of square convolve( &PointSq[0], &C3[0], Vx_max, Vy_max, Vz_max, scale); rfftwnd_one_complex_to_real (pi3, &C3[0], NULL); PointCorr = (fftw_real *) &C3[0]; //FF lauf = 0; for (k = 0; k < Vz_max; k++) { for (j = 0; j < Vy_max; j++) { for (i = 0; i < Vx_max; i++) { conv[lauf] = sqrt(PointCorr[i + 2 * (Vx_max / 2 + 1) * (j + Vy_max * k)] - conv[lauf]*conv[lauf]/((float) n) ) ;/*local variance*/ lauf++; } } } cross(&conv[0], Vx_max); /* perform division */ for (lauf = 0; k < Vz_max*Vy_max*Vz_max; lauf++) { if (conv[lauf] > eps) { inputdata1[lauf].floatdata = inputdata1[lauf].floatdata/conv[lauf]; } else { inputdata1[lauf].floatdata = 0; } } strcpy (name, argv[3]); strcat (name, ".ccf.norm"); write_em_subsubregion (name, &inputdata1,subc,range_sub,offset,dimarray); } MPI_Barrier (MPI_COMM_WORLD); } } /* these are the new brackets from the subregion_read , SN */ } free(Ergebnis); free(inputdata1.floatdata); free(inputdata2.floatdata); free(inputdata3.floatdata); free(inputdata4.floatdata); rfftwnd_destroy_plan(p3); rfftwnd_destroy_plan(pi3); rfftwnd_destroy_plan(r3); rfftwnd_destroy_plan(ri3); free(Volume); free(sqconv); 
free(conv); free(Rot_tmpl); free(Vol_tmpl_sort); free(outputdata.floatdata); if (myrank==0) { printf ("oscar finished. "); tack (&start); fflush(stdout); } MPI_Finalize(); /* end main */ }
int main(int argc, char **argv) { int N; int nThreads; int nColumns; int i,j,k; double *A,*Bi,*C,*Ci; int BiRows, BiColumns; CompressedMatrix *cBi; CompressedMatrix *cCi; double elapsed; char printDebug; //************ Check Input **************/ if(argc < 3){ printf("Usage: %s MaxtrixSize NumberOfThreads\n" , argv[0] ); exit(EXIT_FAILURE); } N = atoi(argv[1]); if( N <= 1){ printf("MatrixSize must be bigger than 1!"); exit(EXIT_FAILURE); } nThreads = atoi(argv[2]); if( nThreads <= 1){ printf("NumberOfThreads must be bigger than 1!"); exit(EXIT_FAILURE); } omp_set_num_threads(nThreads); omp_set_schedule(omp_sched_dynamic, N/10); MPI_Init(&argc, &argv); MPI_Comm_rank(MPI_COMM_WORLD, &mpi_id); MPI_Comm_size(MPI_COMM_WORLD, &mpi_size); nColumns = N / mpi_size; //For the moment depend on N being a multiple the number of MPI nodes //************ Prepare Matrix **************/ A = (double *) malloc( N*N * sizeof(double) ); if((A == NULL) ){ printf("Running out of memory!\n"); exit(EXIT_FAILURE); } // if(mpi_id != 0){ // MPI_Finalize(); // exit(0); // } if(mpi_id == 0) { printDebug = 0; if(printDebug) printf("[%d] Generating A ...",mpi_id); //Fill matrixes. 
Generate Identity like matrix for A and B , So C should result in an matrix with a single major diagonal for(i=0; i < N; i++ ){ for(j=0; j < N; j++){ A[i+N*j] = (i==j)?i:0.0; // //Sparse Matrix with 10% population // A[i+N*j] = rand()%10; // if(A[i+N*j] == 0) // A[i+N*j] = rand()%10; // else // A[i+N*j] = 0; } } // printMatrix(A, N, nColumns); // cA = compressMatrix(A, N, nColumns); // printCompressedMatrix(cA); // uncompressMatrix(cA, &Bi, &i, &j); // printMatrix(Bi, i, j); // // MPI_Finalize(); // exit(0); tick(); if(printDebug) printf("[%d] Broadcasting A ...",mpi_id); MPI_Bcast( A, N*N, MPI_DOUBLE, 0, MPI_COMM_WORLD); if(printDebug) printf("[%d] Generating B ...",mpi_id); double* B; CompressedMatrix* cB; B = (double *) malloc( N*N * sizeof(double) ); for(i=0; i < N; i++ ){ for(j=0; j < N; j++){ B[j+N*i] = (i==j)?1.0:0.0; } } if(printDebug) printf("[%d] Compressing and distributing Bi ...",mpi_id); cB = compressMatrix(B, N, N); for(i=1; i < mpi_size; i++){ mpiSendCompressedMatrix(cB, i*nColumns, (i+1)*nColumns, i); } //Fake shorten cB free(B); cB->columns = nColumns; uncompressMatrix(cB, &Bi, &BiRows, &BiColumns); Ci = MatrixMultiply(A, N, N, Bi, nColumns); if(printDebug) printf("[%d] Ci = A x Bi ...", mpi_id); if(printDebug) printMatrix(Ci, N, nColumns); cCi = compressMatrix(Ci, N, nColumns); if(printDebug) printf("cCi ...\n"); if(printDebug) printCompressedMatrix(cCi); MPI_Barrier(MPI_COMM_WORLD); if(printDebug) printf("[%d] Receiving Ci fragments ...\n", mpi_id); CompressedMatrix** Cii; Cii = (CompressedMatrix**) malloc(sizeof(CompressedMatrix*) * mpi_size); if(Cii == NULL){ perror("malloc"); exit(EXIT_FAILURE); } Cii[0] = cCi; for(i=1; i < mpi_size; i++){ Cii[i] = mpiRecvCompressedMatrix(N,nColumns, i); } if(printDebug) printf("[%d] Joining Cii ...\n", mpi_id); CompressedMatrix *cC; cC = joinCompressedMatrices(Cii, mpi_size); if(printDebug) printCompressedMatrix(cC); elapsed = tack(); printf("[%d] C ...\n", mpi_id); uncompressMatrix(cC, &C, &i,&j); if(i <= 
20){ printMatrix(C, i,j); } else { if(i < 1000){ printf("C is too big, only printing first diagonal %d.\n[",j); for(k=0; (k < i) && (k < j); k++){ printf("%3.2f ",C[k + k*j]); } printf("]\n"); } else { printf("C is just too big!"); } } printf("Took [%f] seconds!\n",elapsed); } else { printDebug = 0; if(printDebug) printf("[%d] Waiting for A ...",mpi_id); MPI_Bcast( A, N*N, MPI_DOUBLE, 0, MPI_COMM_WORLD); if(printDebug) printf("[%d] Received A ...\n", mpi_id); if(printDebug) printMatrix(A, N, N); if(printDebug) printf("[%d] Waiting for Bi ...",mpi_id); cBi = mpiRecvCompressedMatrix(N, nColumns, 0); uncompressMatrix(cBi, &Bi, &BiRows, &BiColumns); if(printDebug) printf("[%d] Received Bi ...",mpi_id); if(printDebug) printMatrix(Bi,BiRows, BiColumns); assert( (BiRows == N) && "Number or Rows in Bi is not right!"); assert( (BiColumns == nColumns) && "Number or Columns in Bi is not right!"); Ci = MatrixMultiply(A, N, N, Bi, BiColumns); if(printDebug) printf("[%d] Ci = A x Bi ...", mpi_id); if(printDebug) printMatrix(Ci, N, nColumns); cCi = compressMatrix(Ci, N, nColumns); if(printDebug) printCompressedMatrix(cCi); MPI_Barrier(MPI_COMM_WORLD); if(printDebug) printf("[%d] Returning Ci ...\n", mpi_id); mpiSendCompressedMatrix(cCi, 0, nColumns, 0); } MPI_Finalize(); // NxM = NxN * NxM exit(EXIT_SUCCESS); }