F_VOID_FUNC blacs_gridinit_(int *ConTxt, F_CHAR order, int *nprow, int *npcol)
#endif
{
   int *grid, *cursor;
   int row, col, k, ntotal;

/*
 * Build a temporary nprow x npcol process map in natural order and pass it
 * to the gridmap routine, which does the real grid setup.  The map is
 * stored column by column and handed over with nprow as its leading
 * dimension.
 *
 *   order 'c'/'C' : entry k of the map is simply k.
 *   anything else : the entry for (row, col) is row*npcol + col.
 *
 * The temporary map is released before returning.
 */
   grid = (int*) malloc( Mpval(nprow)*Mpval(npcol)*sizeof(*grid) );
   cursor = grid;
   if (Mlowcase(F2C_CharTrans(order)) != 'c')
   {
      /* Walk the map one column at a time, filling consecutive slots. */
      for (col=0; col < Mpval(npcol); col++)
      {
         for (row=0; row < Mpval(nprow); row++)
         {
            *cursor++ = row * Mpval(npcol) + col;
         }
      }
   }
   else
   {
      /* Identity map: slot k holds k. */
      ntotal = Mpval(npcol) * Mpval(nprow);
      for (k=0; k < ntotal; k++)
      {
         grid[k] = k;
      }
   }
#if (INTFACE == C_CALL)
   Cblacs_gridmap(ConTxt, grid, nprow, nprow, npcol);
#else
   blacs_gridmap_(ConTxt, grid, nprow, nprow, npcol);
#endif
   free(grid);
}
/* Example #2 */
/* 0 */
/*
 * Benchmark driver for the project's pdgemm().
 *
 * Steps:
 *   1. Start MPI and require at least two processes.
 *   2. Create a PROC_NODE x PROC_NODE BLACS grid; MAJOR selects between two
 *      hand-written process maps (1, 2) and Cblacs_gridinit with "R" (0).
 *   3. Allocate this rank's blocks of A (m x k), B (k x n) and C (m x n),
 *      build their descriptors, fill A and B with RndMatrix and zero C.
 *   4. Time one pdgemm() call between barriers using MPI_Wtime.
 *   5. On rank SHOW1, print the elapsed time and the first
 *      DISP_SIZE*DISP_SIZE entries of the local C buffer.
 *
 * When MEASURE is non-zero, rank 0 also runs the external power-bench
 * client before and after the timed region.
 *
 * Returns 0 on success and 1 when fewer than two processes are available.
 * An allocation failure aborts the whole MPI job.
 */
int main (int argc, char **argv)
{
   int A_descrip[DESC_SIZE];
   int B_descrip[DESC_SIZE];
   int C_descrip[DESC_SIZE];

   int nproc_rows;
   int nproc_cols;
   int m, n, k;
   int blacs_grid;

   int myproc, nprocs;
   char myname[MPI_MAX_PROCESSOR_NAME];
   double *a, *b, *c;

   /* Get input parameters */
   m = GLOBAL_M;
   n = GLOBAL_N;
   k = GLOBAL_K; // 32

   MPI_Init(&argc, &argv);
   MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
   MPI_Comm_rank(MPI_COMM_WORLD, &myproc);

   /* Ensure we have at least two processors */
   if (nprocs < 2)
   {
       printf("Too few processors!\n");
       /* Every rank takes this path, so a regular shutdown is safe. */
       MPI_Finalize();
       return 1;
   }

   if (gethostname(myname, MPI_MAX_PROCESSOR_NAME) != 0)
   {
      printf("Error: gethostname failed!\n");
   }
   else
   {
      /* A truncated host name is not guaranteed to be NUL-terminated. */
      myname[MPI_MAX_PROCESSOR_NAME - 1] = '\0';
      if (HELLO)
      {
         printf("Hello from %2d of %2d on %s\n", myproc, nprocs, myname);
      }
   }

   /* Set to HIGH frequency */
   mapping(myproc%7, DVFS_HIGH);

   Cblacs_get(0, 0, &blacs_grid);

   int ldumap=PROC_NODE;
   nproc_rows=PROC_NODE;
   nproc_cols=PROC_NODE;

   /*
    * NOTE(review): both user maps below hold exactly 64 entries, so they
    * presumably assume PROC_NODE == 8 -- confirm.  No grid is created when
    * MAJOR is not 0, 1 or 2.
    */
   /* ROW MAJOR TILING */
   if(MAJOR==1)
   {
      int usermap[64]=  {0,  1,  8,  9,  16, 17, 24, 25,
                         2,  3,  10, 11, 18, 19, 26, 27,
                         4,  5,  12, 13, 20, 21, 28, 29,
                         6,  7,  14, 15, 22, 23, 30, 31,
                         32, 33, 40, 41, 48, 49, 56, 57,
                         34, 35, 42, 43, 50, 51, 58, 59,
                         36, 37, 44, 45, 52, 53, 60, 61,
                         38, 39, 46, 47, 54, 55, 62, 63};

      Cblacs_gridmap(&blacs_grid, usermap, ldumap, nproc_rows, nproc_cols);
   }
   /* COLUMN MAJOR TILING */
   else if (MAJOR==2)
   {
      int usermap[64]={0,  1,  2,  3,  8,  9,  10, 11,
                       4,  5,  6,  7,  12, 13, 14, 15,
                       16, 17, 18, 19, 24, 25, 26, 27,
                       20, 21, 22, 23, 28, 29, 30, 31,
                       32, 33, 34, 35, 40, 41, 42, 43,
                       36, 37, 38, 39, 44, 45, 46, 47,
                       48, 49, 50, 51, 56, 57, 58, 59,
                       52, 53, 54, 55, 60, 61, 62, 63};

      Cblacs_gridmap(&blacs_grid, usermap, ldumap, nproc_rows, nproc_cols);
   }
   else if(MAJOR==0)
   {
      Cblacs_gridinit(&blacs_grid, "R", nproc_rows, nproc_cols);
   }

   /* Per-process block sizes (integer division; any remainder is dropped). */
   int local_m = m/nproc_rows;
   int local_n = n/nproc_cols;
   int local_k = k/nproc_cols;

   if(myproc==SHOW1)
   {
      printf("local m n k = %d %d %d\n",local_m, local_n, local_k);
   }

   /* Widen to size_t before multiplying so large blocks cannot overflow int. */
   a = (double *) malloc ((size_t)local_m * (size_t)local_k * sizeof(double));
   b = (double *) malloc ((size_t)local_k * (size_t)local_n * sizeof(double));
   c = (double *) malloc ((size_t)local_m * (size_t)local_n * sizeof(double));

   if(!a||!b||!c)
   {
      printf("out of memory!\n");
      fflush(stdout);
      /*
       * Only some ranks may fail here; abort the whole job so the others
       * are not left waiting in a later barrier.
       */
      MPI_Abort(MPI_COMM_WORLD, 1);
      exit(-1);   /* fallback in case MPI_Abort returns */
   }

   Build_descrip(myproc, "A", A_descrip, m, k, local_m, local_k, blacs_grid, local_m);
   Build_descrip(myproc, "B", B_descrip, k, n, local_k, local_n, blacs_grid, local_k);
   Build_descrip(myproc, "C", C_descrip, m, n, local_m, local_n, blacs_grid, local_m);

   if(myproc==SHOW1)
   {
      printf("\nA_descrip = [ %d, %d, %d, %d, %d, %d, %d, %d, %d]\n",
             A_descrip[0], A_descrip[1], A_descrip[2], A_descrip[3], A_descrip[4],
             A_descrip[5], A_descrip[6], A_descrip[7], A_descrip[8]);
      printf("\nB_descrip = [ %d, %d, %d, %d, %d, %d, %d, %d, %d]\n",
             B_descrip[0], B_descrip[1], B_descrip[2], B_descrip[3], B_descrip[4],
             B_descrip[5], B_descrip[6], B_descrip[7], B_descrip[8]);
      printf("\nC_descrip = [ %d, %d, %d, %d, %d, %d, %d, %d, %d]\n\n",
             C_descrip[0], C_descrip[1], C_descrip[2], C_descrip[3], C_descrip[4],
             C_descrip[5], C_descrip[6], C_descrip[7], C_descrip[8]);
   }

   int ij = 1;
   char tran = 'N';
   double alpha = 1.0, beta = 1.0;
   double exetime=0;

   MPI_Barrier(MPI_COMM_WORLD);

   /* Start the external power measurement session (rank 0 only). */
   if(MEASURE && myproc==0)
   {
      system("/apps/power-bench/mclient -H 10.1.255.100 -d /tmp");
      system("/apps/power-bench/mclient -H 10.1.255.100 -l pdgemm.ptr");
      system("/apps/power-bench/mclient -H 10.1.255.100 -s pdgemm");
   }

   MPI_Barrier(MPI_COMM_WORLD);

   if(!myproc)
   {
      printf("\nM = %d, N = %d, K = %d\n", m, n, k);
   }

   /* Input data: A and B are filled per rank, C starts at zero. */
   RndMatrix(a, local_m, local_k, myproc);
   RndMatrix(b, local_k, local_n, myproc);
   Zeros(c, local_m, local_n);

   /* Timed region: barriers on both sides so all ranks are included. */
   MPI_Barrier(MPI_COMM_WORLD);
   exetime = -MPI_Wtime();
   // My pdgemm
   pdgemm(&tran, &tran, &m, &n, &k, &alpha, a, &ij, &ij, A_descrip, b, &ij, &ij, B_descrip, &beta, c, &ij, &ij, C_descrip);
   MPI_Barrier(MPI_COMM_WORLD);
   exetime += MPI_Wtime();

   /* Close the external power measurement session (rank 0 only). */
   if(MEASURE && myproc==0)
   {
      system("/apps/power-bench/mclient -H 10.1.255.100 -e session");
      system("/apps/power-bench/mclient -H 10.1.255.100 -e log");
   }

   mapping(myproc%7, DVFS_LOW);
   mapping(0, DVFS_HIGH);

   if(myproc == SHOW1)
   {
       sleep(1);
       printf("Total execution time of pdgemm is %.3f.\n", exetime);

       /* Dump the first DISP_SIZE*DISP_SIZE entries of c, DISP_SIZE per line. */
       printf("PDGEMM ID CCC = %d :\n",myproc);
       for (int i = 0; i < DISP_SIZE; i++)
       {
          for (int j = 0; j < DISP_SIZE; j++)
          {
             printf("%10.5lf\t", c[i*DISP_SIZE+j]);
          }
          printf("\n");
       }
   }

   free(a); free(b); free(c);
   Cblacs_exit(1);
   /* Clean-up and close down */
   MPI_Barrier(MPI_COMM_WORLD);
   MPI_Finalize();
   return 0;
}