Example #1
0
File: demo.c Project: psava/cwp12
/*
 * Benchmarks a multi-threaded, single-precision 3-D real-to-complex FFT.
 *
 * Steps: initialise FFTW threading, allocate the buffers, time plan creation,
 * fill the input with pseudo-random values in [0,1], time `niter` executions
 * of the plan and report the total and per-transform wall time on stderr.
 *
 * Returns 0 on success, 1 if FFTW threading, allocation or planning fails.
 */
int main(){
    const int nthreads = 4;
    const int niter = 1001;          /* number of timed transforms */
    omp_set_num_threads(nthreads);
    #pragma omp parallel
        fprintf(stderr,"nthreads %d \n", omp_get_num_threads());

    const int n3 = 128;
    const int n2 = 128;
    const int n1 = 128;
    const size_t ntotal = (size_t)n3*n2*n1;

    /* Thread support has to be set up before any other FFTW routine is used. */
    if (fftwf_init_threads() == 0) {
        fprintf(stderr,"something went wrong with fftw\n");
        return 1;
    }

    float *array = fftwf_alloc_real(ntotal);
    fftwf_complex* cout = fftwf_alloc_complex(ntotal);
    if (array == NULL || cout == NULL) {
        fprintf(stderr,"could not allocate the FFT buffers\n");
        if (cout != NULL) fftwf_free(cout);
        if (array != NULL) fftwf_free(array);
        fftwf_cleanup_threads();
        return 1;
    }

    fprintf(stderr,"Got here\n");

    /* omp_get_wtime() already returns seconds; it must not be scaled by the
     * timer resolution returned by omp_get_wtick(). */
    double start,end;
    start = omp_get_wtime();
    fftwf_plan_with_nthreads(nthreads);
    fftwf_plan plan =  fftwf_plan_dft_r2c_3d(
                                    n1,n2,n3,
                                    array,cout,
                                    FFTW_MEASURE);
    end = omp_get_wtime();
    fprintf(stderr,"elapsed time: %f %f %f\n",end,start,end-start);

    if (plan == NULL) {
        fprintf(stderr,"could not create the FFT plan\n");
        fftwf_free(cout);
        fftwf_free(array);
        fftwf_cleanup_threads();
        return 1;
    }

    /* The input is filled only after planning: FFTW_MEASURE may overwrite the
     * arrays while it tries out candidate algorithms. The cast avoids the
     * integer division that would make nearly every sample 0. */
    for(size_t i = 0; i < ntotal; ++i)
        array[i] = (float)rand()/(float)RAND_MAX;

    start = omp_get_wtime();

    for(int i=0; i < niter; ++i)
        fftwf_execute(plan);

    end = omp_get_wtime();
    /* Divide by the number of transforms that were actually executed. */
    fprintf(stderr,"elapsed time: %f time/calc %f\n",
        end-start,(end-start)/(double)niter);

    /* Plans must be destroyed before the library-wide cleanup; after the
     * cleanup calls every existing plan is undefined. */
    fftwf_destroy_plan(plan);
    fftwf_free(cout);
    fftwf_free(array);
    fftwf_cleanup_threads();
    fftwf_cleanup();
    return 0;

}
Example #2
0
/* Returns 1 when the timer resolution reported by omp_get_wtick() lies
 * strictly between 0 and 0.01 seconds, 0 otherwise. */
int test_omp_get_wtick()
{
  const double resolution = omp_get_wtick ();

  if (!(resolution > 0.0))
    return 0;
  return (resolution < 0.01) ? 1 : 0;
}
Example #3
0
/**
 * Plays `options.number_of_games` games in sequence and keeps a running
 * tally of wins per player name (plus "draw").
 *
 * After every game the tally so far is printed. A SIGINT is handled by
 * `sigint_handler`; once `sigint_already_caught` is set the loop stops after
 * the current game's summary has been printed.
 *
 * @return always 0.
 */
int main(int argc, char* argv[])
{
    signal(SIGINT, sigint_handler);
#if !defined(NDEBUG)
    std::cout << "\t> Running in DEBUG mode" << std::endl;
#endif

#if defined(OPENMP_FOUND)
    omp_set_nested(true);
    std::cout << "\t> Running using OPENMP " << std::endl;
    std::cout << "\t\t> " << omp_get_max_threads() << " threads max" << std::endl;
    // omp_get_wtick() is in seconds; scale to nanoseconds for display.
    std::cout << "\t\t> " << omp_get_wtick()*1e9 << "ns tick" << std::endl;
    assert( omp_get_nested() );
#endif

//     test_random();
    Rng rng;
    rng.seed(rand());

    Options options = parse_options(argc, argv);

    typedef std::map<std::string, int> Wins;
    Wins wins;

    for (int kk=0; kk<options.number_of_games; kk++)
    {
        std::cout << std::endl << std::endl;
        std::cout << "****************************************" << std::endl;
        std::cout << "game " << kk << "/" << options.number_of_games << std::endl;

        const Game& game = play_game(options, rng);

        // A negative winner index is counted as a draw.
        const int winner = game.state.get_winner();
        if (winner < 0) wins["draw"]++;
        else {
            // Heroes that are not real bots are all tallied under "bot".
            std::string winner_name = "bot";
            if (game.hero_infos[winner].is_real_bot())
                winner_name = game.hero_infos[winner].name;
            wins[winner_name]++;
        }

        // Report the number of games played so far, not the requested total:
        // this summary is printed after every game.
        const int games_played = kk + 1;
        std::cout << std::endl;
        std::cout << "after " << games_played << " games" << std::endl;
        for (Wins::const_iterator wi=wins.begin(), wie=wins.end(); wi!=wie; ++wi)
        {
            if (wi->first == "draw")
            {
                std::cout << "  " << wi->second << " draw" << std::endl;
                continue;
            }
            std::cout << "  " << wi->second << " victory for " << wi->first << std::endl;
        }

        if (sigint_already_caught) break;
    }

    return 0;
}
/*
 * Benchmark driver: allocates work arrays of nmax elements and, for each
 * length n = nmin, 2*nmin, ... <= nmax, runs the code textually included
 * from "benchfuncs.c".
 *
 * The included file is pasted into the loop body, so it sees every local
 * declared here (n, cf, flop, mem, time1, sum, c and the array pointers);
 * the local names are therefore part of its interface.
 *
 * NOTE(review): v3, v4, v5, vp2, vp3, vp4, h1, h2, hp1 and m4 are declared
 * but never allocated in this function. If benchfuncs.c uses them they are
 * uninitialised here -- confirm against that file.
 * NOTE(review): none of the buffers is released in the visible code.
 */
int
main(int argc, char *argv[])
{
  QLA_Real sum, *r1;
  QLA_Complex *c1;
  QLA_ColorVector *v1, *v2, *v3, *v4, *v5;
  QLA_ColorVector **vp1, **vp2, **vp3, **vp4;
  QLA_HalfFermion *h1, *h2, **hp1;
  QLA_DiracFermion *d1, *d2, **dp1;
  QLA_ColorMatrix *m1, *m2, *m3, *m4, **mp1;
  double flop, mem, time1;
  int nmin, nmax, c, nthreads=1;

  printf("QLA_Precision = %c\n", QLA_Precision);
#ifdef _OPENMP
  /* With OpenMP, report the thread count and the timer resolution. */
  nthreads = omp_get_max_threads();
  printf("OMP THREADS = %i\n", nthreads);
  printf("omp_get_wtick = %g\n", omp_get_wtick());
#ifdef CPU_ZERO
  /* Each thread restricts its own affinity mask to the CPU whose index
     equals its OpenMP thread number. The return value is not checked. */
#pragma omp parallel
  {
    int tid = omp_get_thread_num();
    cpu_set_t set;
    CPU_ZERO(&set);
    CPU_SET(tid, &set);
    sched_setaffinity(0, sizeof(set), &set);
  }
#endif
#endif

  /* Vector lengths scale with the number of threads. */
  nmin = 64*nthreads;
  nmax = 256*1024*nthreads;

  /* All buffers are sized for the largest length and reused for every n. */
  r1 = myalloc(QLA_Real, nmax);
  c1 = myalloc(QLA_Complex, nmax);
  v1 = myalloc(QLA_ColorVector, nmax);
  v2 = myalloc(QLA_ColorVector, nmax);
  vp1 = myalloc(QLA_ColorVector *, nmax);
  d1 = myalloc(QLA_DiracFermion, nmax);
  d2 = myalloc(QLA_DiracFermion, nmax);
  dp1 = myalloc(QLA_DiracFermion *, nmax);
  m1 = myalloc(QLA_ColorMatrix, nmax);
  m2 = myalloc(QLA_ColorMatrix, nmax);
  m3 = myalloc(QLA_ColorMatrix, nmax);
  mp1 = myalloc(QLA_ColorMatrix *, nmax);

  /* Lengths double on every pass. */
  for(int n=nmin; n<=nmax; n*=2) {
    printf("len = %i\n", n);
    printf("len/thread = %i\n", n/nthreads);
    /* cf is inversely proportional to n; presumably consumed by
       benchfuncs.c as a repetition/work factor -- TODO confirm. */
    double cf = 9.e9/n;

#include "benchfuncs.c"

  }

  return 0;
}
/*
 * Times NUM_TRIALS runs of SimdMul and of NonSimdMul over the arrays A, B
 * and C (NUM elements each) and prints the results selected by the
 * PRINT_SIMD, PRINT_NOSIMD, PRINT_DIFFERENCE and GNUPLOT switches.
 */
int main(int argc, char *argv[ ]) {

    /* Timer resolution; only needed by the disabled diagnostic below. */
    double prec = omp_get_wtick();
    //fprintf( stderr, "Clock precision = %g\n", prec );

    /* Fill both operands with values drawn from Ranf(-10, 10). */
    int idx = 0;
    while (idx < NUM) {
        A[idx] = Ranf(-10.f, 10.f);
        B[idx] = Ranf(-10.f, 10.f);
        ++idx;
    }

    /* ---- SIMD version ---- */
    double simdBegin = Timer();
    int trial;
    for (trial = 0; trial < NUM_TRIALS; ++trial)
        SimdMul(A, B, C, NUM);
    double simdEnd = Timer();

    /* Average time of one call. */
    double dts = (simdEnd - simdBegin) / (float) NUM_TRIALS;
    if (PRINT_SIMD == 1) {
        if (GNUPLOT != 0) {
            /* gnuplot row: element count, MFLOPS */
            printf("%d %8.3f\n", NUM, ((float) NUM / dts) / 1000000.f);
        } else {
            printf("Average SIMD Elapsed time = %g\n", dts);
            printf("SIMD speed = %8.3f MFLOPS\n", ((float) NUM / dts) / 1000000.f);
        }
    }

    /* ---- non-SIMD version ---- */
    double scalarBegin = Timer();
    for (trial = 0; trial < NUM_TRIALS; ++trial)
        NonSimdMul(A, B, C, NUM);
    double scalarEnd = Timer();

    double dtn = (scalarEnd - scalarBegin) / (float) NUM_TRIALS;
    if (PRINT_NOSIMD == 1) {
        if (GNUPLOT != 0) {
            /* gnuplot row: element count, MFLOPS */
            printf("%d %8.3f\n", NUM, ((float) NUM / dtn) / 1000000.f);
        } else {
            printf("Average Non-SIMD Elapsed time = %g\n", dtn);
            printf("Non-SIMD speed = %8.3f MFLOPS\n", ((float) NUM / dtn) / 1000000.f);
            //printf("Speed-up = %g\n", dtn / dts);
        }
    }

    /* Element count followed by (NUM / dtn) divided by the ratio dtn/dts. */
    if (PRINT_DIFFERENCE == 1)
        printf("%d %g\n", NUM, ((float) NUM / dtn) / (dtn/dts));

    return 0;
}
Example #6
0
/*
 * Prints a handful of OpenMP runtime values: team size and processor count
 * before any parallel region, the per-iteration thread id inside a
 * two-thread parallel loop, and finally the wall clock and its tick.
 */
int main() {
    const int initialTeam = omp_get_num_threads();
    printf("omp_get_num_threads() [default value] = %d\n", initialTeam);
    const int maxThreads = omp_get_max_threads();
    printf("omp_get_max_threads() = %d \n", maxThreads);
    const int procCount = omp_get_num_procs();
    printf("omp_get_num_procs() = %d\n", procCount);
    putchar('\n');

    /* Requesting two threads also changes what omp_get_max_threads() reports. */
    omp_set_num_threads(2);
    #pragma omp parallel for ordered
    for (int iter = 0; iter < omp_get_max_threads(); ++iter) {
        const int me = omp_get_thread_num();
        const int team = omp_get_num_threads();
        printf("Thread %d of total %d thread\n", me, team);
    }
    const int serialTeam = omp_get_num_threads();
    printf("omp_get_num_threads() = %d (Always one in the sequencial part)\n", serialTeam);

    putchar('\n');
    const double now = omp_get_wtime();
    printf("omp_get_wtime() = %f\n", now);
    const double tick = omp_get_wtick();
    printf("omp_get_wtick() = %f\n", tick);
    return 0;
}
Example #7
0
/*
 * Solves the linear system stored in the global augmented matrix MA
 * (M rows, M+1 columns; the last column is the right-hand side) by Gaussian
 * elimination with a per-column pivot search, using OpenMP worksharing
 * inside every elimination step, followed by serial back substitution.
 *
 * Input:  M*(M+1) long double values read from stdin, row by row.
 * Output: the matrix is printed through printMatrix() at several stages,
 *         the elapsed time goes to stderr, and the residuals
 *         (A*X - b, computed from the untouched copy MA2) go to stdout
 *         separated by '+'.
 *
 * NOTE(review): the locals v and p and the parameter argv are unused.
 */
int main(int args, char **argv){
    int size, MyP, i, j, v, k, d, p, J;

    int * mas;
    long double MAX;
    double wtime1, wtime2, wtick;


/* (Translated from Russian.) Each branch generates its own strip of matrix A
 * and its own segment of the right-hand-side vector, which is attached to A
 * as an extra column. Branch zero generates strip zero, branch one generates
 * strip one, and so on. (The diagonal of the original matrix holds 2, all
 * other entries hold 1.)
 * NOTE(review): this describes a generated matrix, but the code below reads
 * the matrix from stdin instead -- the comment looks stale. */
    /* Seed rand() with the number of timer ticks elapsed between two
     * back-to-back clock reads. */
    wtime1 = omp_get_wtime();
    wtick = omp_get_wtick();
    wtime2 = omp_get_wtime();
    srand((int)((wtime2-wtime1)/wtick));

    /* Read the augmented matrix; the fscanf result is not checked. */
    for (i = 0; i < M; i++){
        for (j = 0; j < M+1; j++){
            fscanf(stdin, "%Lf", &MA[i][j]);
        }
    }

    printMatrix();

    /* Keep an unmodified copy for the residual check at the end. */
    memcpy(MA2, MA, sizeof(long double)*M*(M+1));

    /* OTV is filled with 0..M-1; it is not read again in this function. */
    for (i = 0; i < M; i++ )
        OTV[i] = i;

    wtime1 = omp_get_wtime();
    /* One elimination step per column i; a new parallel region is opened
     * for every step. */
    for (i = 0; i < M; i++){


        #pragma omp parallel shared(mas, size, i, MA, MAD) private(j, MyP, MAX)
        {
            MyP = omp_get_thread_num();
            /* One thread allocates the per-thread candidate array; the other
             * threads wait at the end of the single construct. */
            #pragma omp single
            {
                size = omp_get_num_threads();
                mas = malloc(sizeof(int)*size);
            }

            /* Every thread starts from the diagonal element as its candidate. */
            MAX = fabsl(MA[i][i]);
            mas[MyP] = i;

            /* Each thread scans its share of rows below i and remembers the
             * row with the largest |MA[j][i]| it has seen. */
            #pragma omp for
            for (j = i+1; j < M; j++){
                if (fabsl(MA[j][i]) > MAX){
                    MAX = fabsl(MA[j][i]);
                    mas[MyP] = j;
                }
            }

            /* One thread reduces the per-thread candidates to the pivot row J
             * and, if needed, swaps rows i and J from column i onward using
             * the scratch row V. */
            #pragma omp single
            {
                J = i;
                MAX = fabsl(MA[J][i]);
                for (j = 0; j < size; j++){
                    if (fabsl(MA[mas[j]][i]) > MAX){
                        J = mas[j];
                        MAX = fabsl(MA[J][i]);
                    }
                }
                if (J != i){
                    memcpy(V, &MA[i][i], sizeof(long double)*(M+1-i));
                    memcpy(&MA[i][i], &MA[J][i], sizeof(long double)*(M+1-i));
                    memcpy(&MA[J][i], V, sizeof(long double)*(M+1-i));
                }
                free(mas);
                printMatrix();
            }

            /* Normalise the pivot row: columns M down to i+1 are divided by
             * the pivot. MA[i][i] itself is left alone here so that every
             * iteration still reads the original pivot value. */
            #pragma omp for
            for (j = M; j > i; j--){

                if (MA[i][i] != 0){
                    //printf("%d: MA[%d][%d] = %.2f\n", MyP, i, j, MA[i][j]);
                    MA[i][j] /= MA[i][i];
                    //printf("%d: MA[%d][%d] = %.2f\n", MyP, i, j, MA[i][j]);
                }else
                    printf("ERROR DIV BY ZERO %d: MA[%d][%d] = %.2Lf\n", MyP, i, j, MA[i][j]);
            }

            /* Only now is the pivot set to 1.
             * NOTE(review): the master construct has no barrier of its own,
             * so the printMatrix() in the following single may run on another
             * thread while this store is in flight -- confirm that is
             * acceptable. */
            #pragma omp master
                MA[i][i] = 1;
            
            #pragma omp single
            {
                printMatrix();
            }
 
            /* Eliminate column i from every row below the pivot row. d runs
             * from M down to i so that MA[k][i] is still intact while the
             * other columns are updated and is cleared last. */
            #pragma omp for private(d)
            for (k = i+1; k < M; k++){
                for (d = M; d >= i; d--){
                    //printf("%d: %d %d\n", MyP, k, d);
                    //printf("%d: MA[%d][%d] = %.2f -= MA[%d][%d] = %.2f * MA[%d][%d] = %.2f\n", MyP, k, d, MA[k][d], k, i, MA[k][i], i, d, MA[i][d]);
                    MA[k][d] -= MA[k][i]*MA[i][d];
                }
            }

            #pragma omp single
            {
                printMatrix();
            }
        }
    }

    /* Start back substitution from the right-hand-side column.
     * NOTE(review): this "omp for" is not enclosed in a parallel region in
     * this function, so it presumably runs on a single thread. */
    #pragma omp for
    for ( i = 0; i < M; i++ )
        X[i] = MA[i][M];

    /* Serial back substitution, last row first (the diagonal is 1). */
    for ( i = M - 2; i >= 0; i-- )
        for ( j = i + 1; j < M; j++ )
            X[i] -= X[j] * MA[i][j];
    wtime2 = omp_get_wtime();

    fprintf(stderr, "Время работы программы %.9f\n", wtime2-wtime1);

    /* Residual of every equation, computed from the saved original matrix. */
    for (i = 0; i < M; i++){
        MAD = 0;
        for (j = 0; j < M; j ++){
            MAD += MA2[i][j]*X[j];
        }
        MAD -= MA2[i][M];
        if (i < M-1)
            printf("%.12Lf+", MAD);
        else
            printf("%.12Lf\n", MAD);
    }
    //printf("\n");


return 0;
}
Example #8
0
File: lib-1.c Project: 0day-ci/gcc
/* Runtime self-test of the OpenMP library routines.  Each group of calls
   below checks one family of routines and calls abort () as soon as an
   observed result differs from the expected one; the program returns 0
   only if every check passes.  The order of the calls matters: most
   checks depend on the state left behind by the preceding call.  */
int
main (void)
{
  double d, e;
  int l;
  omp_lock_t lck;
  omp_nest_lock_t nlck;

  /* Remember the start time for the monotonicity check at the end.  */
  d = omp_get_wtime ();

  /* Simple lock: testing a held lock must fail, testing a free lock must
     succeed (and acquire it), and testing it again must fail.  */
  omp_init_lock (&lck);
  omp_set_lock (&lck);
  if (omp_test_lock (&lck))
    abort ();
  omp_unset_lock (&lck);
  if (! omp_test_lock (&lck))
    abort ();
  if (omp_test_lock (&lck))
    abort ();
  omp_unset_lock (&lck);
  omp_destroy_lock (&lck);

  /* Nestable lock: omp_test_nest_lock is expected to return the nesting
     count after the acquisition (1, then 3 after one more set, then 2
     after two unsets).  */
  omp_init_nest_lock (&nlck);
  if (omp_test_nest_lock (&nlck) != 1)
    abort ();
  omp_set_nest_lock (&nlck);
  if (omp_test_nest_lock (&nlck) != 3)
    abort ();
  omp_unset_nest_lock (&nlck);
  omp_unset_nest_lock (&nlck);
  if (omp_test_nest_lock (&nlck) != 2)
    abort ();
  omp_unset_nest_lock (&nlck);
  omp_unset_nest_lock (&nlck);
  omp_destroy_nest_lock (&nlck);

  /* The dynamic-adjustment setting must read back as written.  */
  omp_set_dynamic (1);
  if (! omp_get_dynamic ())
    abort ();
  omp_set_dynamic (0);
  if (omp_get_dynamic ())
    abort ();

  /* The nested-parallelism setting must read back as written.  */
  omp_set_nested (1);
  if (! omp_get_nested ())
    abort ();
  omp_set_nested (0);
  if (omp_get_nested ())
    abort ();

  /* Outside a parallel region the team size is 1 and the thread number is
     0, while omp_get_max_threads follows omp_set_num_threads.  */
  omp_set_num_threads (5);
  if (omp_get_num_threads () != 1)
    abort ();
  if (omp_get_max_threads () != 5)
    abort ();
  if (omp_get_thread_num () != 0)
    abort ();
  omp_set_num_threads (3);
  if (omp_get_num_threads () != 1)
    abort ();
  if (omp_get_max_threads () != 3)
    abort ();
  if (omp_get_thread_num () != 0)
    abort ();
  /* Inside a parallel region: l collects, OR-ed over all threads, whether
     any thread saw a team size other than 3, a thread number outside
     [0, 3), or a master thread whose number is not 0.  */
  l = 0;
#pragma omp parallel reduction (|:l)
  {
    l = omp_get_num_threads () != 3;
    l |= omp_get_thread_num () < 0;
    l |= omp_get_thread_num () >= 3;
#pragma omp master
    l |= omp_get_thread_num () != 0;
  }
  if (l)
    abort ();

  if (omp_get_num_procs () <= 0)
    abort ();
  /* omp_in_parallel must be false here and true in every thread of the
     two parallel regions below (l is still 0 when they start).  */
  if (omp_in_parallel ())
    abort ();
#pragma omp parallel reduction (|:l)
  l = ! omp_in_parallel ();
#pragma omp parallel reduction (|:l) if (1)
  l = ! omp_in_parallel ();
  if (l)
    abort ();

  /* The wall clock must not have gone backwards since the start.  */
  e = omp_get_wtime ();
  if (d > e)
    abort ();
  d = omp_get_wtick ();
  /* Negative precision is definitely wrong,
     bigger than 1s clock resolution is also strange.  */
  if (d <= 0 || d > 1)
    abort ();

  return 0;
}
Example #9
0
/* Thin wrapper with a trailing-underscore name that forwards to
   omp_get_wtick () and hands back its result unchanged.  */
double
omp_get_wtick_ (void)
{
  const double tick = omp_get_wtick ();
  return tick;
}
Example #10
0
/* Returns the timer resolution, taken directly from omp_get_wtick ().  */
double
timer_getres (void)
{
  const double resolution = omp_get_wtick ();
  return resolution;
}
Example #11
0
int main(int argc, char *argv[]) {
	int i;
	double timeS1, timeS2, timeP1, timeP2, wtick;
	wtick = omp_get_wtick();
	int n = pow(2, atoi(argv[1]));
	//printf("n=%d; size=%f MB\n", n, (float)((n*sizeof(float))/1000000.0f));
	myRNG rng;

	/** SEQUENTIAL SORT *************************************************/
	// initialize random array
	float *a = (float *)malloc(sizeof(float)*n);
	int *index = (int *)malloc(sizeof(int)*n);
	int *rank;
	rng.resetSeed(10215);
	for(i=0; i<n; i++) {
		*(a + i) = rng.next();
		*(index + i) = i;
	}
	// if option is selected, print starting array
	if(atoi(argv[2]) == 1) {
		printf("[%f, ", *(a + 0));
		for(i=1; i<(n-1); i++) {
			printf("%f, ", *(a + i));
		}
		printf("%f]\n", *(a + n - 1));
	}

	// sort array (sequentially)
	timeS1 = omp_get_wtime();
	seqShellSort(a, index, n);
	timeS2 = omp_get_wtime();
	// check array
	i=0;
	while(*(a + i) <= *(a + i + 1) && i<n) {
		i++;
	}
	if(i==(n-1)) {
		//printf("Seq :: Array sorted SUCCESSFULLY in %f seconds.\n", (timeS2-timeS1));
	} else {
		printf("Seq :: Array is NOT sorted.  See index %d.\n", i);
	}
	// if option is selected, print ending array
	if(atoi(argv[3]) == 1) {
		printf("[%f, ", *(a + 0));
		for(i=1; i<(n-1); i++) {
			printf("%f, ", *(a + i));
		}
		printf("%f]\n", *(a + n - 1));
	}


	/** SAMPLE SORT ******************************************************/
	// reinitialize random array
	free(a);
	free(index);
	a = (float *)malloc(sizeof(float)*n);
	index = (int *)malloc(sizeof(int)*n);
	rng.resetSeed(10215);
	for(i=0; i<n; i++) {
		*(a + i) = rng.next();
		*(index + i) = i;
	}
	// if option is selected, print starting array
	if(atoi(argv[2]) == 1) {
		printf("[%f, ", *(a + 0));
		for(i=1; i<(n-1); i++) {
			printf("%f, ", *(a + i));
		}
		printf("%f]\n", *(a + n - 1));

		printf("[%f, ", (float)*(index + 0));
		for(i=1; i<(n-1); i++) {
			printf("%f, ", (float)*(index + i));
		}
		printf("%f]\n", (float)*(index + n - 1));
	}
	
	// sort array (in parallel using Merge)
	timeP1 = omp_get_wtime();
	rank = sampleSort(a, index, n);
	timeP2 = omp_get_wtime();
	// check array
	i=0;
	while(*(a + i) <= *(a + i + 1) && i<n) {
		i++;
	}
	if(i==(n-1)) {
		//printf("Par (Sample) :: Array sorted SUCCESSFULLY in %f seconds.\n", (timeP2-timeP1));
	} else {
		printf("Par (Sample) :: Array is NOT sorted.  See index %d.\n", i);
	}

	// if option is selected, print ending array
	if(atoi(argv[3]) == 1) {
		printf("[%f, ", *(a + 0));
		for(i=1; i<(n-1); i++) {
			printf("%f, ", *(a + i));
		}
		printf("%f]\n", *(a + n - 1));

		printf("[%f, ", (float)*(index + 0));
		for(i=1; i<(n-1); i++) {
			printf("%f, ", (float)*(index + i));
		}
		printf("%f]\n", (float)*(index + n - 1));

		printf("[%d, ", *(rank + 0));
		for(i=1; i<(n-1); i++) {
			printf("%d, ", *(rank + i));
		}
		printf("%d]\n", *(rank + n - 1));
	}
	
	printf("%d, %d, %f, %f, %f\n", omp_get_num_threads(), n, (float)((n*sizeof(float))/1000000.0f), (timeS2-timeS1), (timeP2-timeP1));
	return 0;
}
Example #12
0
 /* Returns omp_get_wtick() converted to long.
    NOTE(review): the tick is a double number of seconds, so the conversion
    to long discards the fractional part -- confirm callers expect this. */
 long get_wtick()
 {
   const double tick = omp_get_wtick();
   return tick;
 }
Example #13
0
/*
 * Benchmark driver: allocates work arrays of nmax elements and, for each
 * length n = nmin, 2*nmin, ... <= nmax, runs the code textually included
 * from "benchfuncs.c".
 *
 * Optional arguments:
 *   argv[1]  minimum length per thread (default 64)
 *   argv[2]  maximum length per thread (default 256*1024)
 *   argv[3]  multiplier applied to the base factor cf0 = 1e9
 *
 * The included file is pasted into the loop body, so it sees every local
 * declared here (n, cf, flop, mem, time1, sum, c, the array pointers, vpa
 * and ma); the local names are therefore part of its interface.
 *
 * NOTE(review): none of the buffers is released in the visible code.
 */
int
main(int argc, char *argv[])
{
  QLA_Real sum, *r1;
  QLA_Complex *c1;
  QLA_ColorVector *v1, *v2, *v3, *v4, *v5;
  QLA_ColorVector **vp1, **vp2, **vp3, **vp4;
  QLA_HalfFermion *h1, *h2, **hp1;
  QLA_DiracFermion *d1, *d2, **dp1;
  QLA_ColorMatrix *m1, *m2, *m3, *m4, **mp1;
  double cf0, flop, mem, time1;
  int nmin, nmax, c, nthreads=1;

  /* Defaults, each overridable from the command line. */
  nmin = 64;
  if(argc>1) nmin = atoi(argv[1]);
  nmax = 256*1024;
  if(argc>2) nmax = atoi(argv[2]);
  cf0 = 1e9;
  if(argc>3) cf0 *= atof(argv[3]);

  printf("QLA version %s (%i)\n", QLA_version_str(), QLA_version_int());
  printf("QLA_Precision = %c\n", QLA_Precision);
  printf("QLA_Nc = %i\n", QLA_Nc);

#ifdef _OPENMP
  /* With OpenMP, report the thread count and the timer resolution. */
  nthreads = omp_get_max_threads();
  printf("OMP threads = %i\n", nthreads);
  printf("omp_get_wtick = %g\n", omp_get_wtick());
#ifdef CPU_ZERO
  /* Each thread restricts its own affinity mask to the CPU whose index
     equals its OpenMP thread number. The return value is not checked. */
#pragma omp parallel
  {
    int tid = omp_get_thread_num();
    cpu_set_t set;
    CPU_ZERO(&set);
    CPU_SET(tid, &set);
    sched_setaffinity(0, sizeof(set), &set);
  }
#endif
#endif

  /* The per-thread lengths become total lengths. */
  nmin *= nthreads;
  nmax *= nthreads;

  /* All buffers are sized for the largest length and reused for every n. */
  r1 = myalloc(QLA_Real, nmax);
  c1 = myalloc(QLA_Complex, nmax);
  v1 = myalloc(QLA_ColorVector, nmax);
  v2 = myalloc(QLA_ColorVector, nmax);
  v3 = myalloc(QLA_ColorVector, nmax);
  v4 = myalloc(QLA_ColorVector, nmax);
  v5 = myalloc(QLA_ColorVector, nmax);
  vp1 = myalloc(QLA_ColorVector *, nmax);
  vp2 = myalloc(QLA_ColorVector *, nmax);
  vp3 = myalloc(QLA_ColorVector *, nmax);
  vp4 = myalloc(QLA_ColorVector *, nmax);
  h1 = myalloc(QLA_HalfFermion, nmax);
  h2 = myalloc(QLA_HalfFermion, nmax);
  hp1 = myalloc(QLA_HalfFermion *, nmax);
  d1 = myalloc(QLA_DiracFermion, nmax);
  d2 = myalloc(QLA_DiracFermion, nmax);
  dp1 = myalloc(QLA_DiracFermion *, nmax);
  m1 = myalloc(QLA_ColorMatrix, nmax);
  m2 = myalloc(QLA_ColorMatrix, nmax);
  m3 = myalloc(QLA_ColorMatrix, nmax);
  m4 = myalloc(QLA_ColorMatrix, nmax);
  mp1 = myalloc(QLA_ColorMatrix *, nmax);
  /* Grouped views of the pointer-vector and matrix buffers; presumably
     consumed by benchfuncs.c -- TODO confirm. */
  //QLA_ColorVector *va[4] = { v2, v3, v4, v5 };
  QLA_ColorVector **vpa[4] = { vp1, vp2, vp3, vp4 };
  QLA_ColorMatrix *ma[4] = { m1, m2, m3, m4 };

  /* Lengths double on every pass. */
  for(int n=nmin; n<=nmax; n*=2) {
    printf("len = %i\n", n);
    printf("len/thread = %i\n", n/nthreads);
    /* cf grows with the thread count and shrinks with n; presumably used
       by benchfuncs.c as a repetition/work factor -- TODO confirm. */
    double cf = cf0*nthreads/n;

#include "benchfuncs.c"

  }

  return 0;
}
Example #14
0
/*
 * Dense matrix-multiplication benchmark.
 *
 * For every size N = size_initial, size_initial + size_increment, ...
 * up to size_final, two N x N matrices are filled with random values in
 * [0,1], multiplied with an OpenMP-parallel triple loop, and the wall time,
 * MFLOPS and MFLOPS per thread are printed.
 *
 * Arguments: argv[1] size_initial, argv[2] size_final, argv[3] size_increment
 *            (sizes are limited to 1..32767, the increment must be positive).
 *
 * Returns EXIT_SUCCESS on success, -1 on bad arguments and EXIT_FAILURE when
 * a matrix cannot be allocated.
 */
int main(int argc, char *argv[])
{
	/* Upper bound for sizes and increment; keeps mat_size + incN and
	 * mat_size * mat_size far away from overflow. */
	const int kMaxSize = 32767;
	int nprocs = 0, mat_size;
	double *mat1, *mat2, *sol, mflops;

	if(argc<4) {
		printf("\n\nUSAGE: %s size_initial size_final size_increment\n\n",argv[0]);
		return -1;
	}

	const int iN = atoi(argv[1]);
	const int fN = atoi(argv[2]);
	const int incN = atoi(argv[3]);

	/* A non-positive increment would never terminate the size loop. */
	if(iN < 1 || incN < 1 || iN > kMaxSize || fN > kMaxSize || incN > kMaxSize) {
		printf("\nSizes must be in 1..%d and the increment must be positive.\n", kMaxSize);
		return -1;
	}

	for(mat_size=iN; mat_size<=fN; mat_size+=incN) {
		const size_t mat_size2 = (size_t)mat_size * (size_t)mat_size;

		// Allocating memory to the three matrix.
		mat1 = (double *) malloc(sizeof(double)*mat_size2);
		mat2 = (double *) malloc(sizeof(double)*mat_size2);
		sol = (double *) malloc(sizeof(double)*mat_size2);
		if(mat1==NULL || mat2==NULL || sol==NULL) {
			printf("\nError in Matrix allocation. Ask Morpheus.\n");
			free(mat1);
			free(mat2);
			free(sol);
			return EXIT_FAILURE;
		}

		// Generating random values between 0 and 1 for mat1 and mat2.
		// About sol, we will simply fill it with zeroes.
		// This stays serial on purpose: rand() keeps hidden global state
		// and is not safe to call from several threads.
		for (size_t idx = 0; idx < mat_size2; idx++) {
			mat1[idx] = (double)rand()/RAND_MAX;
			mat2[idx] = (double)rand()/RAND_MAX;
			sol[idx] = 0;
		}

		// Ask a real parallel region how many threads it gets; one thread
		// stores the answer for the report below.
		#pragma omp parallel shared(nprocs)
		{
			#pragma omp single
			nprocs=omp_get_num_threads();
		}

		// Clock START. omp_get_wtime() returns a double; storing it in a
		// float would throw away the digits that carry the elapsed time.
		const double start = omp_get_wtime();

		// Rows are distributed over the threads. Every variable written
		// inside the loop is declared inside it, so each thread has its own
		// copy and there is no data race; the innermost loop stays serial
		// because a nested parallel region per output element costs far
		// more than it gains.
		#pragma omp parallel for schedule(runtime)
		for (int i = 0; i < mat_size; i++) {
			const size_t row = (size_t)i * (size_t)mat_size;
			for (int j = 0; j < mat_size; j++) {
				double acc = 0.0;
				for (int k = 0; k < mat_size; k++)
					acc += mat1[row+k] * mat2[(size_t)k*(size_t)mat_size+j];
				sol[row+j] = acc;
			}
		}

		// Clock STOP.
		const double finish = omp_get_wtime();

		const double time = finish - start;

		if(time==0.) {
			printf("\nNot enough precission.\n");
		}
		else {
			// >>> mflops = (operations/time)/1000000
			// Each of the mat_size^2 output elements needs mat_size
			// multiplications and mat_size additions.
			mflops = ((double)mat_size2*(2.*(double)mat_size)/time)/1000000.;
			printf("\n>>> Threads = %d\t\tSize = %d\t\tSeconds = %.6lf", nprocs, mat_size, time);
			printf("\n> Mflops = %.6f\t\tMflops/thread = %.6f",mflops,mflops/nprocs);
			printf("\n> Precision omp_get_wtick = %lf\n",omp_get_wtick());
		}

		// Free memory like a boss.
		free(mat1);
		free(mat2);
		free(sol);
	}

	return EXIT_SUCCESS;
}