Пример #1
0
main(int nargs, char *args[])
{
   char ln[127], pre='d', ln2[127], ch;
   int i, nb=28;
   double mflops[3];
   double mf1, mf4;
   FILE *fpin, *fp;

   for (i=1; i < nargs; i++)
   {
      if (args[i][0] != '-') PrintUsage(args[0]);
      switch(args[i][1])
      {
      case 'p':
         ch = args[++i][0];
         pre = Mlowcase(ch);
         break;
      case 'n':
         nb = atoi(args[++i]);
         break;
      default:
         PrintUsage(args[0]);
      }
   }
   sprintf(ln, "res/M%cNB%d_4x1x1_0-1.mflop", pre, nb);
   if (!FileExists(ln))
   {
      sprintf(ln2, "make %chc_cases nb=%d\n", pre, nb);
      assert(system(ln2) == 0);
   }
   fpin = fopen(ln, "r");
   assert(fpin);
   for (i=0; i < 3; i++) assert( fscanf(fpin, "%lf", mflops+i) );
   mf1 = GetAvg(3, 1.2, mflops);
   fclose(fpin);

   sprintf(ln, "res/M%cNB%d_4x4x1_0-1.mflop", pre, nb);
   if (!FileExists(ln))
   {
      sprintf(ln2, "make %chc_cases nb=%d\n", pre, nb);
      if (system(ln2) != 0)
      {
         sprintf(ln, "rm -f res/M%cNB*\n", pre);
         system(ln);
         fprintf(stderr, "Error in command: %s", ln2);
         exit(-1);
      }
   }
   fpin = fopen(ln, "r");
   assert(fpin);
   for (i=0; i < 3; i++) assert( fscanf(fpin, "%lf", mflops+i) );
   mf4 = GetAvg(3, 1.2, mflops);
   fclose(fpin);

   sprintf(ln, "res/%cmmcase.h", pre);
   fp = fopen(ln, "w");
   assert(fp);
   fprintf(fp, "#ifndef %cMMCASE_H\n", Mupcase(pre));
   fprintf(fp, "   #define %cMMCASE_H\n", Mupcase(pre));
   if (mf1 >= mf4) fprintf(fp, "   #define Use4x1\n");
   else fprintf(fp, "   #define Use4x4\n");
   fprintf(fp, "#endif\n");
   fclose(fp);
   sprintf(ln, "res/%cHCRES", pre);
   fp = fopen(ln, "w");
   assert(fp);
   fprintf(fp, "%f\n", mf1);
   fprintf(fp, "%f\n", mf4);
   fclose(fp);

   exit(0);
}
void pzgecopy_hd( F_CHAR_T TRANS, int *m_in, int *n_in,
    double *A, int *ia_in, int *ja_in, int *descA,
    double *dB, int *ib_in, int *jb_in, int *descB )
{
/*
 Copy m by n distributed submatrix  from host to GPU
 */

int m  = *m_in;
int n  = *n_in;

int ia = *ia_in;
int ja = *ja_in;
int ib = *ib_in;
int jb = *jb_in;

int nprow = 0;
int npcol = 0;
int myprow = 0;
int mypcol = 0;
int mmb = 0;
int nnb = 0;

int istart = 0;
int iend = 0;
int isize = 0;

int Locp = 0;
int Locq = 0;
int lld = 0;

int jstart = 0;
int jend = 0;
int jsize = 0;

int iib = 0;
int jjb = 0;

cuDoubleComplex *dBptr = 0;
double *Btmp = 0;
F_CHAR_T NoTrans = "N";

int descBtmp[DLEN_];
int elmSize = sizeof(cuDoubleComplex);

double z_one[2];
double z_zero[2];

/*
 Tuneable parameters
 */
int mfactor = 4;
int nfactor = 4;


int rsrc = 0;
int csrc = 0;
int irsrc = 0;
int jcsrc = 0;
int info = 0;

int notran = 0;
int TrA = 0;

int lrindx = 0;
int lcindx = 0;
int nrow = 0;
int ncol = 0;
int mm = 0;
int nn = 0;
int iia = 0;
int jja = 0;

cublasStatus cu_status;

z_one[REAL_PART] = 1;
z_one[IMAG_PART] = 0;
z_zero[REAL_PART] = 0;
z_zero[IMAG_PART] = 0;

Cblacs_gridinfo( descA[CTXT_], &nprow, &npcol, &myprow, &mypcol );

notran = ( ( TrA = Mupcase( F2C_CHAR( TRANS )[0] ) ) == CNOTRAN );


/*
 * check arguments
 */
if ((m <= 0) || (n <= 0)) {
   return;
   };

assert( (1 <= ia) && ((ia + m-1) <= descA[M_] ) );
assert( (1 <= ja) && ((ja + n-1) <= descA[N_] ) );
assert( (1 <= ib) && ((ib + m-1) <= descB[M_] ) );
assert( (1 <= jb) && ((jb + n-1) <= descB[N_] ) );

/*
 * Create a temp matrix that is aligned to descB.
 * Assume size is   mmb by nnb
 */

if (notran) {
  mmb = MIN( m, descB[MB_] * nprow * mfactor);
  nnb = MIN( n, descB[NB_] * npcol * nfactor);
  }
else {
  nnb = MIN( m, descB[MB_] * nprow * mfactor);
  mmb = MIN( n, descB[NB_] * npcol * nfactor);
  };
  

mmb = MAX( mmb,1);
nnb = MAX( nnb,1);

rsrc = indxg2p_( &ib, &descB[MB_], &myprow, &descB[RSRC_], &nprow);
csrc = indxg2p_( &jb, &descB[NB_], &mypcol, &descB[CSRC_], &npcol);


Locp = numroc_( &mmb, &descB[MB_], &myprow, &rsrc, &nprow );
Locq = numroc_( &nnb, &descB[NB_], &mypcol, &csrc, &npcol );

Btmp = (double*) malloc( MAX(1,(Locp * Locq))*elmSize );
assert( Btmp != 0 );


lld = MAX(1, Locp);
descinit_( descBtmp, &mmb, &nnb, &descB[MB_], &descB[NB_],
            &rsrc, &csrc, &descB[CTXT_], &lld, &info );

assert( info == 0);



for( jstart=ja; jstart <= ja + n-1; jstart = jend + 1) {
  jend = MIN( ja + n -1, jstart + nnb - 1);
  jsize = jend - jstart + 1;

  for( istart=ia; istart <= ia + m-1; istart = iend + 1) {
     iend = MIN( ia + m-1, istart + mmb -1);
     isize = iend - istart + 1;

     iia = ia + (istart-1);
     jja = ja + (jstart-1);


     iib = 1;
     jjb = 1;
     

     if (notran) {
        mm = isize;
        nn = jsize;
        }
    else {
        mm = jsize;
        nn = isize;
        };


     pzgeadd_( TRAN,  &mm, &nn,   z_one, A, &iia, &jja, descA,  
                         z_zero, Btmp, &iib, &jjb, descBtmp );

     /* 
      * find local extent
      */

     
     if (notran) {
       iib = ib + (istart-1);
       jjb = jb + (jstart-1);
       }
     else {
       iib = ib + (jstart-1);
       jjb = jb + (istart-1);
       };

     if (notran) {
       nrow = numroc_( &isize, &descB[MB_], &myprow, &rsrc, &nprow);
       ncol = numroc_( &jsize, &descB[NB_], &mypcol, &csrc, &npcol);
       }
     else {
       nrow = numroc_( &jsize, &descB[MB_], &myprow, &rsrc, &nprow);
       ncol = numroc_( &isize, &descB[NB_], &mypcol, &csrc, &npcol);

     };

     /*
      Perform global
      dB( iib:(iib+isize-1), jjb:(jjb+jsize-1)) <- B(1:isize,1:jsize)

      Perform local
      dB( lrindx:(lrindx+nrow-1), lcindx:(lcindx+ncol-1)) <-
              B(1:nrow, 1:ncol)
      */


     infog2l_( &iib, &jjb, descB, &nprow, &npcol, &myprow, &mypcol,
               &lrindx, &lcindx,  &irsrc, &jcsrc );


     dBptr = (cuDoubleComplex *) dB;
     dBptr = dBptr + INDX2F( lrindx,lcindx, descB[LLD_]);


     cu_status = cublasSetMatrix( nrow,ncol,elmSize,
               (cuDoubleComplex *) Btmp,  descBtmp[LLD_],
                                   dBptr, descB[LLD_] );
     assert( cu_status == CUBLAS_STATUS_SUCCESS );

     };
  };

  free( Btmp );

  return;
}