/*
 * Python-callable wrapper around the PBLAS distributed matrix-matrix multiply.
 *
 * Expected Python arguments (in order):
 *   m, n, k        : global problem dimensions (int)
 *   alpha          : complex scalar (real part used for the real-valued path)
 *   a, b           : distributed local blocks (numpy arrays)
 *   beta           : complex scalar
 *   c              : distributed local output block (numpy array)
 *   desca/b/c      : ScaLAPACK array descriptors (int arrays)
 *   transa, transb : 'N'/'T'/'C' transpose flags (single characters)
 *
 * Dispatches on the dtype of c: double -> pdgemm_, otherwise pzgemm_.
 * Returns None; returns NULL (with a Python exception set) on bad arguments.
 */
PyObject* pblas_gemm(PyObject *self, PyObject *args)
{
    char transa, transb;
    int m, n, k;
    Py_complex alpha, beta;
    PyArrayObject *a, *b, *c;
    PyArrayObject *desca, *descb, *descc;
    int ione = 1;  /* PBLAS submatrix offsets ia/ja/... are 1-based */

    if (!PyArg_ParseTuple(args, "iiiDOODOOOOcc",
                          &m, &n, &k, &alpha, &a, &b, &beta, &c,
                          &desca, &descb, &descc, &transa, &transb))
        return NULL;

    /* NOTE(review): no check of the descriptor context here; presumably the
     * caller guarantees this rank is on the BLACS grid — confirm. */
    if (c->descr->type_num == PyArray_DOUBLE) {
        /* Real path: only the real parts of alpha/beta are meaningful. */
        pdgemm_(&transa, &transb, &m, &n, &k,
                &(alpha.real),
                DOUBLEP(a), &ione, &ione, INTP(desca),
                DOUBLEP(b), &ione, &ione, INTP(descb),
                &(beta.real),
                DOUBLEP(c), &ione, &ione, INTP(descc));
    } else {
        /* Complex path: pass the full Py_complex scalars through. */
        pzgemm_(&transa, &transb, &m, &n, &k,
                &alpha,
                (void*)COMPLEXP(a), &ione, &ione, INTP(desca),
                (void*)COMPLEXP(b), &ione, &ione, INTP(descb),
                &beta,
                (void*)COMPLEXP(c), &ione, &ione, INTP(descc));
    }
    Py_RETURN_NONE;
}
// Builds, in parallel, the distributed dense matrix D = lambda*I + T*T' (stored
// block-cyclically per DESCD/Dmat) and the sparse coupling matrix
// B' = [X'T ; Z'T], of which each process finally receives the row slice BT_i
// and column slice B_j matching its local part of D.
//
// Relies on file-scope state not visible in this block (assumed globals —
// confirm against the rest of the file): filenameX/Z/T, m, k, l, n, blocksize,
// dims, position, Dblocks, Drows, Dcols, lld_D, lambda, iam, size, status,
// ICTXT2D, DLEN_, i_zero, i_one, d_one.
//
// Returns 0 on success, -1 (or a descinit_ info code) on failure. NOTE(review):
// early error returns leak DESCT/Tblock and leave fT open — acceptable only if
// callers abort the whole run on failure.
int set_up_BD ( int * DESCD, double * Dmat, CSRdouble& BT_i, CSRdouble& B_j, CSRdouble& Btsparse )
{
    // Read-in of matrices X, Z and T from file (filename[X,Z,T])
    // X and Z are read in entrely by every process
    // T is read in strip by strip (number of rows in each process is at maximum = blocksize)
    // D is constructed directly in a distributed way
    // B is first assembled sparse in root process and afterwards the necessary parts
    // for constructing the distributed Schur complement are sent to each process

    FILE *fT;
    int ni, i,j, info;
    int *DESCT;                      // ScaLAPACK descriptor for a strip of T'
    double *Tblock, *temp;
    int nTblocks, nstrips, pTblocks, stripcols, lld_T, pcol, colcur,rowcur;
    CSRdouble Xtsparse, Ztsparse,XtT_sparse,ZtT_sparse,XtT_temp, ZtT_temp;

    // X and Z are loaded in full on every process, then transposed (X', Z').
    Xtsparse.loadFromFile ( filenameX );
    Ztsparse.loadFromFile ( filenameZ );
    Xtsparse.transposeIt ( 1 );
    Ztsparse.transposeIt ( 1 );

    // Local accumulators for X'T (m x k) and Z'T (l x k), start empty.
    XtT_sparse.allocate ( m,k,0 );
    ZtT_sparse.allocate ( l,k,0 );

    pcol= * ( position+1 );          // this process's column in the grid

    // Matrix T is read in by strips of size (blocksize * *(dims+1), k)
    // Strips of T are read in row-wise and thus it is as if we store strips of T' (transpose) column-wise with dimensions (k, blocksize * *(dims+1))
    // However we must then also transpose the process grid to distribute T' correctly

    // number of strips in which we divide matrix T'
    nstrips= n % ( blocksize * * ( dims+1 ) ) ==0 ?  n / ( blocksize * * ( dims+1 ) ) : ( n / ( blocksize * * ( dims+1 ) ) ) +1;

    //the number of columns of T' included in each strip
    stripcols= blocksize * * ( dims+1 );

    //number of blocks necessary to store complete column of T'
    nTblocks= k%blocksize==0 ? k/blocksize : k/blocksize +1;

    //number of blocks necessary in this process to store complete column of T'
    pTblocks= ( nTblocks - *position ) % *dims == 0 ? ( nTblocks- *position ) / *dims : ( nTblocks- *position ) / *dims +1;
    pTblocks= pTblocks <1? 1:pTblocks;

    //local leading dimension of the strip of T' (different from process to process)
    lld_T=pTblocks*blocksize;

    // Initialisation of descriptor of strips of matrix T'
    DESCT= ( int* ) malloc ( DLEN_ * sizeof ( int ) );
    if ( DESCT==NULL ) {
        printf ( "unable to allocate memory for descriptor for Z\n" );
        return -1;
    }
    // strip of T (k,stripcols) is distributed across ICTXT2D starting in process (0,0) in blocks of size (blocksize,blocksize)
    // the local leading dimension in this process is lld_T
    descinit_ ( DESCT, &k, &stripcols, &blocksize, &blocksize, &i_zero, &i_zero, &ICTXT2D, &lld_T, &info );
    if ( info!=0 ) {
        printf ( "Descriptor of matrix Z returns info: %d\n",info );
        return info;
    }

    // Allocation of memory for the strip of T' in all processes
    Tblock= ( double* ) calloc ( pTblocks*blocksize*blocksize, sizeof ( double ) );
    if ( Tblock==NULL ) {
        printf ( "Error in allocating memory for a strip of Z in processor (%d,%d)",*position,* ( position+1 ) );
        return -1;
    }

    // Initialisation of matrix D (all diagonal elements of D equal to lambda)
    // Walks the global diagonal block by block; only the process owning block i
    // writes lambda on that block's diagonal. The last (possibly partial)
    // diagonal block is zero-padded past Ddim % blocksize.
    temp=Dmat;
    for ( i=0,rowcur=0,colcur=0; i<Dblocks; ++i, ++colcur, ++rowcur ) {
        if ( rowcur==*dims ) {
            rowcur=0;
            temp += blocksize;               // advance one local block row
        }
        if ( colcur==* ( dims+1 ) ) {
            colcur=0;
            temp += blocksize*lld_D;         // advance one local block column
        }
        if ( *position==rowcur && * ( position+1 ) == colcur ) {
            for ( j=0; j<blocksize; ++j ) {
                * ( temp + j * lld_D +j ) =lambda;
            }
            if ( i==Dblocks-1 && Ddim % blocksize != 0 ) {
                for ( j=blocksize-1; j>= Ddim % blocksize; --j ) {
                    * ( temp + j * lld_D + j ) =0.0;
                }
            }
        }
    }

    fT=fopen ( filenameT,"rb" );
    if ( fT==NULL ) {
        printf ( "Error opening file\n" );
        return -1;
    }

    // Set up of matrix D and B per strip of T'
    for ( ni=0; ni<nstrips; ++ni ) {
        // The last strip may be partial: re-zero the buffer so stale values
        // from the previous strip do not leak into the products below.
        if ( ni==nstrips-1 ) {
            if(Tblock != NULL)
                free ( Tblock );
            Tblock=NULL;
            Tblock= ( double* ) calloc ( pTblocks*blocksize*blocksize, sizeof ( double ) );
            if ( Tblock==NULL ) {
                printf ( "Error in allocating memory for a strip of Z in processor (%d,%d)\n",*position,* ( position+1 ) );
                return -1;
            }
        }
        //Each process only reads in a part of the strip of T'
        //When k is not a multiple of blocksize, read-in of the last elements of the rows of T is tricky
        if ( ( nTblocks-1 ) % *dims == *position && k%blocksize !=0 ) {
            // This process owns the final, partial block of each row of T.
            if ( ni==0 ) {
                // First strip: seek from the start of the file to this
                // process column's first row of T.
                info=fseek ( fT, ( long ) ( pcol * blocksize * ( k ) * sizeof ( double ) ),SEEK_SET );
                if ( info!=0 ) {
                    printf ( "Error in setting correct begin position for reading Z file\nprocessor (%d,%d), error: %d \n", *position,pcol,info );
                    return -1;
                }
            } else {
                // Subsequent strips: skip the rows belonging to the other
                // process columns of the grid.
                info=fseek ( fT, ( long ) ( blocksize * ( * ( dims+1 )-1 ) * ( k ) * sizeof ( double ) ),SEEK_CUR );
                if ( info!=0 ) {
                    printf ( "Error in setting correct begin position for reading Z file\nprocessor (%d,%d), error: %d \n", *position,pcol,info );
                    return -1;
                }
            }
            for ( i=0; i<blocksize; ++i ) {
                // Skip the leading blocks owned by process rows above ours.
                info=fseek ( fT, ( long ) ( blocksize * *position * sizeof ( double ) ),SEEK_CUR );
                if ( info!=0 ) {
                    printf ( "Error in setting correct begin position for reading Z file\nprocessor (%d,%d), error: %d \n", *position,pcol,info );
                    return -1;
                }
                for ( j=0; j < pTblocks-1; ++j ) {
                    fread ( Tblock + i*pTblocks*blocksize + j*blocksize,sizeof ( double ),blocksize,fT );
                    // Skip blocks owned by the other process rows.
                    info=fseek ( fT, ( long ) ( ( ( *dims ) -1 ) * blocksize * sizeof ( double ) ),SEEK_CUR );
                    if ( info!=0 ) {
                        printf ( "Error in setting correct begin position for reading Z file\nprocessor (%d,%d), error: %d \n", *position,pcol,info );
                        return -1;
                    }
                }
                // Final partial block: only k%blocksize elements remain, and
                // the file position then sits exactly at the end of this row
                // of T, so no trailing seek is needed.
                fread ( Tblock + i*pTblocks*blocksize + j*blocksize,sizeof ( double ),k%blocksize,fT );
            }
            //Normal read-in of the strips of T from a binary file (each time blocksize elements are read in)
        } else {
            if ( ni==0 ) {
                info=fseek ( fT, ( long ) ( pcol * blocksize * ( k ) * sizeof ( double ) ),SEEK_SET );
                if ( info!=0 ) {
                    printf ( "Error in setting correct begin position for reading Z file\nprocessor (%d,%d), error: %d \n", *position,pcol,info );
                    return -1;
                }
            } else {
                info=fseek ( fT, ( long ) ( blocksize * ( * ( dims+1 )-1 ) * ( k ) * sizeof ( double ) ),SEEK_CUR );
                if ( info!=0 ) {
                    printf ( "Error in setting correct begin position for reading Z file\nprocessor (%d,%d), error: %d \n", *position,pcol,info );
                    return -1;
                }
            }
            for ( i=0; i<blocksize; ++i ) {
                info=fseek ( fT, ( long ) ( blocksize * *position * sizeof ( double ) ),SEEK_CUR );
                if ( info!=0 ) {
                    printf ( "Error in setting correct begin position for reading Z file\nprocessor (%d,%d), error: %d \n", *position,pcol,info );
                    return -1;
                }
                for ( j=0; j < pTblocks-1; ++j ) {
                    fread ( Tblock + i*pTblocks*blocksize + j*blocksize,sizeof ( double ),blocksize,fT );
                    info=fseek ( fT, ( long ) ( ( * ( dims )-1 ) * blocksize * sizeof ( double ) ),SEEK_CUR );
                    if ( info!=0 ) {
                        printf ( "Error in setting correct begin position for reading Z file\nprocessor (%d,%d), error: %d \n", *position,pcol,info );
                        return -1;
                    }
                }
                fread ( Tblock + i*pTblocks*blocksize + j*blocksize,sizeof ( double ),blocksize,fT );
                // Skip to the end of this row of T (past the blocks owned by
                // the remaining process rows).
                info=fseek ( fT, ( long ) ( ( k - blocksize * ( ( pTblocks-1 ) * *dims + *position +1 ) ) * sizeof ( double ) ),SEEK_CUR );
                if ( info!=0 ) {
                    printf ( "Error in setting correct begin position for reading Z file\nprocessor (%d,%d), error: %d \n", *position,pcol,info );
                    return -1;
                }
            }
        }
        blacs_barrier_ ( &ICTXT2D,"A" );
        // End of read-in

        // Matrix D is the sum of the multiplications of all strips of T' by their transpose
        // Up unitl now, the entire matrix is stored, not only upper/lower triangular, which is possible since D is symmetric
        // Be aware, that you akways have to allocate memory for the enitre matrix, even when only dealing with the upper/lower triangular part
        pdgemm_ ( "N","T",&k,&k,&stripcols,&d_one, Tblock,&i_one, &i_one,DESCT, Tblock,&i_one, &i_one,DESCT, &d_one, Dmat, &i_one, &i_one, DESCD ); //Z'Z
        //pdsyrk_ ( "U","N",&k,&stripcols,&d_one, Tblock,&i_one, &i_one,DESCT, &d_one, Dmat, &t_plus, &t_plus, DESCD );

        // Matrix B consists of X'T and Z'T, since each process only has some parts of T at its disposal,
        // we need to make sure that the correct columns of Z and X are multiplied with the correct columns of T.
        for ( i=0; i<pTblocks; ++i ) {
            XtT_temp.ncols=k;
            //This function multiplies the correct columns of X' with the blocks of T at the disposal of the process
            // The result is also stored immediately at the correct positions of X'T. (see src/tools.cpp)
            XtT_temp.clear();
            mult_colsA_colsC ( Xtsparse, Tblock+i*blocksize, lld_T, ( * ( dims+1 ) * ni + pcol ) *blocksize, blocksize, ( *dims * i + *position ) *blocksize, blocksize, XtT_temp, 0 );
            if ( XtT_temp.nonzeros>0 ) {
                if ( XtT_sparse.nonzeros==0 ){
                    // First nonzero contribution: take over XtT_temp's arrays.
                    XtT_sparse.clear();
                    XtT_sparse.make2 ( XtT_temp.nrows,XtT_temp.ncols,XtT_temp.nonzeros,XtT_temp.pRows,XtT_temp.pCols,XtT_temp.pData );
                } else {
                    XtT_sparse.addBCSR ( XtT_temp );
                }
            }
        }
        //Same as above for calculating Z'T
        for ( i=0; i<pTblocks; ++i ) {
            ZtT_temp.ncols=k;
            ZtT_temp.clear();
            mult_colsA_colsC ( Ztsparse, Tblock+i*blocksize, lld_T, ( * ( dims+1 ) * ni + pcol ) *blocksize, blocksize, blocksize * ( *dims * i + *position ), blocksize, ZtT_temp, 0 );
            if ( ZtT_temp.nonzeros>0 ) {
                if ( ZtT_sparse.nonzeros==0 ){
                    ZtT_sparse.clear();
                    ZtT_sparse.make2 ( ZtT_temp.nrows,ZtT_temp.ncols,ZtT_temp.nonzeros,ZtT_temp.pRows,ZtT_temp.pCols,ZtT_temp.pData );
                } else
                    ZtT_sparse.addBCSR ( ZtT_temp );
            }
        }
        blacs_barrier_ ( &ICTXT2D,"A" );
    }

    // Release per-strip resources; T is no longer needed.
    XtT_temp.clear();
    ZtT_temp.clear();
    Xtsparse.clear();
    Ztsparse.clear();
    if(DESCT != NULL)
        free ( DESCT );
    DESCT=NULL;
    if(Tblock != NULL)
        free ( Tblock );
    Tblock=NULL;
    //printf("T read in\n");
    info=fclose ( fT );
    if ( info!=0 ) {
        printf ( "Error in closing open streams" );
        return -1;
    }
    if(filenameT != NULL)
        free(filenameT);
    filenameT=NULL;

    //Each process only has calculated some parts of B
    //All parts are collected by the root process (iam==0), which assembles B
    //Each process then receives BT_i and B_j corresponding to the D_ij available to the process
    if ( iam!=0 ) {
        //Each process other than root sends its X' * T and Z' * T to the root process.
        // Tag scheme: tag = iam + t*size selects which of the 8 CSR fields is
        // in flight (0..3 for X'T, 4..7 for Z'T / B_j below).
        MPI_Send ( & ( XtT_sparse.nonzeros ),1, MPI_INT,0,iam,MPI_COMM_WORLD );
        MPI_Send ( & ( XtT_sparse.pRows[0] ),XtT_sparse.nrows + 1, MPI_INT,0,iam+size,MPI_COMM_WORLD );
        MPI_Send ( & ( XtT_sparse.pCols[0] ),XtT_sparse.nonzeros, MPI_INT,0,iam+2*size,MPI_COMM_WORLD );
        MPI_Send ( & ( XtT_sparse.pData[0] ),XtT_sparse.nonzeros, MPI_DOUBLE,0,iam+3*size,MPI_COMM_WORLD );
        XtT_sparse.clear();
        MPI_Send ( & ( ZtT_sparse.nonzeros ),1, MPI_INT,0,iam,MPI_COMM_WORLD );
        MPI_Send ( & ( ZtT_sparse.pRows[0] ),ZtT_sparse.nrows + 1, MPI_INT,0,4*size + iam,MPI_COMM_WORLD );
        MPI_Send ( & ( ZtT_sparse.pCols[0] ),ZtT_sparse.nonzeros, MPI_INT,0,iam+ 5*size,MPI_COMM_WORLD );
        MPI_Send ( & ( ZtT_sparse.pData[0] ),ZtT_sparse.nonzeros, MPI_DOUBLE,0,iam+6*size,MPI_COMM_WORLD );
        ZtT_sparse.clear();
        //printf("Process %d sent ZtT and XtT\n",iam);

        // And eventually receives the necessary BT_i and B_j
        // Blocking sends are used, which is why the order of the receives is critical depending on the coordinates of the process
        int nonzeroes;
        if (*position >= pcol) {
            // Lower-or-diagonal processes: BT_i arrives before B_j (matches
            // the root's send order in the rowproc/colproc loops below).
            MPI_Recv ( &nonzeroes,1,MPI_INT,0,iam,MPI_COMM_WORLD,&status );
            BT_i.allocate ( blocksize*Drows,m+l,nonzeroes );
            MPI_Recv ( & ( BT_i.pRows[0] ),blocksize*Drows + 1, MPI_INT,0,iam + size,MPI_COMM_WORLD,&status );
            int count;
            // Actual row count may be smaller than the allocated maximum.
            MPI_Get_count(&status,MPI_INT,&count);
            BT_i.nrows=count-1;
            MPI_Recv ( & ( BT_i.pCols[0] ),nonzeroes, MPI_INT,0,iam+2*size,MPI_COMM_WORLD,&status );
            MPI_Recv ( & ( BT_i.pData[0] ),nonzeroes, MPI_DOUBLE,0,iam+3*size,MPI_COMM_WORLD,&status );
            MPI_Recv ( &nonzeroes,1, MPI_INT,0,iam+4*size,MPI_COMM_WORLD,&status );
            B_j.allocate ( blocksize*Dcols,m+l,nonzeroes );
            MPI_Recv ( & ( B_j.pRows[0] ),blocksize*Dcols + 1, MPI_INT,0,iam + 5*size,MPI_COMM_WORLD,&status );
            MPI_Get_count(&status,MPI_INT,&count);
            B_j.nrows=count-1;
            MPI_Recv ( & ( B_j.pCols[0] ),nonzeroes, MPI_INT,0,iam+6*size,MPI_COMM_WORLD,&status );
            MPI_Recv ( & ( B_j.pData[0] ),nonzeroes, MPI_DOUBLE,0,iam+7*size,MPI_COMM_WORLD,&status );
            //Actually BT_j is sent, so it still needs to be transposed
            B_j.transposeIt ( 1 );
        } else {
            // Upper-triangle processes: B_j arrives before BT_i.
            MPI_Recv ( &nonzeroes,1, MPI_INT,0,iam+4*size,MPI_COMM_WORLD,&status );
            B_j.allocate ( blocksize*Dcols,m+l,nonzeroes );
            MPI_Recv ( & ( B_j.pRows[0] ),blocksize*Dcols + 1, MPI_INT,0,iam + 5*size,MPI_COMM_WORLD,&status );
            int count;
            MPI_Get_count(&status,MPI_INT,&count);
            B_j.nrows=count-1;
            MPI_Recv ( & ( B_j.pCols[0] ),nonzeroes, MPI_INT,0,iam+6*size,MPI_COMM_WORLD,&status );
            MPI_Recv ( & ( B_j.pData[0] ),nonzeroes, MPI_DOUBLE,0,iam+7*size,MPI_COMM_WORLD,&status );
            B_j.transposeIt ( 1 );
            MPI_Recv ( &nonzeroes,1,MPI_INT,0,iam,MPI_COMM_WORLD,&status );
            BT_i.allocate ( blocksize*Drows,m+l,nonzeroes );
            MPI_Recv ( & ( BT_i.pRows[0] ),blocksize*Drows + 1, MPI_INT,0,iam + size,MPI_COMM_WORLD,&status );
            MPI_Get_count(&status,MPI_INT,&count);
            BT_i.nrows=count-1;
            MPI_Recv ( & ( BT_i.pCols[0] ),nonzeroes, MPI_INT,0,iam+2*size,MPI_COMM_WORLD,&status );
            MPI_Recv ( & ( BT_i.pData[0] ),nonzeroes, MPI_DOUBLE,0,iam+3*size,MPI_COMM_WORLD,&status );
        }
    } else {
        for ( i=1; i<size; ++i ) {
            // The root process receives parts of X' * T and Z' * T sequentially from all processes and directly adds them together.
            int nonzeroes;
            MPI_Recv ( &nonzeroes,1,MPI_INT,i,i,MPI_COMM_WORLD,&status );
            if(nonzeroes>0) {
                XtT_temp.allocate ( m,k,nonzeroes );
                MPI_Recv ( & ( XtT_temp.pRows[0] ),m + 1, MPI_INT,i,i+size,MPI_COMM_WORLD,&status );
                MPI_Recv ( & ( XtT_temp.pCols[0] ),nonzeroes, MPI_INT,i,i+2*size,MPI_COMM_WORLD,&status );
                MPI_Recv ( & ( XtT_temp.pData[0] ),nonzeroes, MPI_DOUBLE,i,i+3*size,MPI_COMM_WORLD,&status );
                XtT_sparse.addBCSR ( XtT_temp );
                XtT_temp.clear();
            }
            MPI_Recv ( &nonzeroes,1, MPI_INT,i,i,MPI_COMM_WORLD,&status );
            if(nonzeroes>0) {
                ZtT_temp.allocate ( l,k,nonzeroes );
                MPI_Recv ( & ( ZtT_temp.pRows[0] ),l + 1, MPI_INT,i,4*size + i,MPI_COMM_WORLD,&status );
                MPI_Recv ( & ( ZtT_temp.pCols[0] ),nonzeroes, MPI_INT,i,i+ 5*size,MPI_COMM_WORLD,&status );
                MPI_Recv ( & ( ZtT_temp.pData[0] ),nonzeroes, MPI_DOUBLE,i,i+6*size,MPI_COMM_WORLD,&status );
                ZtT_sparse.addBCSR ( ZtT_temp );
                ZtT_temp.clear();
            }
        }
        XtT_sparse.transposeIt ( 1 );
        ZtT_sparse.transposeIt ( 1 );

        // B' is created by concatening blocks X'T and Z'T
        create1x2BlockMatrix ( XtT_sparse, ZtT_sparse,Btsparse );
        XtT_sparse.clear();
        ZtT_sparse.clear();
        /*Btsparse.transposeIt(1);
        Btsparse.writeToFile("B_sparse.csr");
        Btsparse.transposeIt(1);*/

        // For each process row i BT_i is created which is also sent to processes in column i to become B_j.
        // Iterates rowproc downwards so the root's own slice (rowproc==0) is
        // built last and kept in BT_i after the loop.
        for ( int rowproc= *dims - 1; rowproc>= 0; --rowproc ) {
            BT_i.ncols=Btsparse.ncols;
            BT_i.nrows=0;
            BT_i.nonzeros=0;
            int Drows_rowproc;    // number of D block-rows owned by this process row
            if (rowproc!=0) {
                Drows_rowproc= ( Dblocks - rowproc ) % *dims == 0 ? ( Dblocks- rowproc ) / *dims : ( Dblocks- rowproc ) / *dims +1;
                Drows_rowproc= Drows_rowproc<1? 1 : Drows_rowproc;
            } else
                Drows_rowproc=Drows;
            for ( i=0; i<Drows_rowproc; ++i ) {
                //Each process in row i can hold several blocks of contiguous rows of D for which we need the corresponding rows of B_T
                // Therefore we use the function extendrows to create BT_i (see src/tools.cpp)
                BT_i.extendrows ( Btsparse, ( i * *dims + rowproc ) * blocksize,blocksize );
            }
            // rowproc==0 skips colproc==0 (the root keeps its own copy locally).
            for ( int colproc= ( rowproc==0 ? 1 : 0 ); colproc < * ( dims+1 ); ++colproc ) {
                int rankproc;
                // Send BT_i to process (rowproc, colproc) ...
                rankproc= blacs_pnum_ (&ICTXT2D, &rowproc,&colproc);
                MPI_Send ( & ( BT_i.nonzeros ),1, MPI_INT,rankproc,rankproc,MPI_COMM_WORLD );
                MPI_Send ( & ( BT_i.pRows[0] ),BT_i.nrows + 1, MPI_INT,rankproc,rankproc+size,MPI_COMM_WORLD );
                MPI_Send ( & ( BT_i.pCols[0] ),BT_i.nonzeros, MPI_INT,rankproc,rankproc+2*size,MPI_COMM_WORLD );
                MPI_Send ( & ( BT_i.pData[0] ),BT_i.nonzeros, MPI_DOUBLE,rankproc,rankproc+3*size,MPI_COMM_WORLD );
                //printf("BT_i's sent to processor %d\n",rankproc);
                // ... and the same data to the transposed coordinate
                // (colproc, rowproc), where it becomes B_j.
                rankproc= blacs_pnum_ (&ICTXT2D, &colproc,&rowproc);
                MPI_Send ( & ( BT_i.nonzeros ),1, MPI_INT,rankproc,rankproc+4*size,MPI_COMM_WORLD );
                MPI_Send ( & ( BT_i.pRows[0] ),BT_i.nrows + 1, MPI_INT,rankproc,rankproc+5*size,MPI_COMM_WORLD );
                MPI_Send ( & ( BT_i.pCols[0] ),BT_i.nonzeros, MPI_INT,rankproc,rankproc+6*size,MPI_COMM_WORLD );
                MPI_Send ( & ( BT_i.pData[0] ),BT_i.nonzeros, MPI_DOUBLE,rankproc,rankproc+7*size,MPI_COMM_WORLD );
                //printf("B_j's sent to processor %d\n",rankproc);
            }
        }
        // Root's own B_j is a copy of its BT_i, transposed.
        B_j.make2 ( BT_i.nrows,BT_i.ncols,BT_i.nonzeros,BT_i.pRows,BT_i.pCols,BT_i.pData );
        B_j.transposeIt ( 1 );
    }
    return 0;
}
/*
 * Demo driver: distributed C = A*B with PBLAS (pdgemm_) on a 2x2 BLACS grid.
 * Rank 0 generates global A (m x k) and B (k x n), the matrices are scattered
 * block-cyclically via pdgeadd_, multiplied, and the result gathered back and
 * printed on rank 0. Relies on file-scope globals not visible in this block
 * (assumed: i_negone, i_zero, i_one, one, zero, trans, MAX, MDESC, gen_a,
 * gen_b, print) — confirm against the rest of the file.
 */
int main()
{
    /* Global problem sizes and block-cyclic distribution parameters. */
    const MKL_INT m = 1000;
    const MKL_INT k = 100000;
    const MKL_INT n = 1000;
    const MKL_INT nb = 100;          /* square block size */
    const MKL_INT nprow = 2;         /* process grid rows */
    const MKL_INT npcol = 2;         /* process grid cols */
    MKL_INT iam, nprocs, ictxt, myrow, mycol;
    MDESC descA, descB, descC, descA_local, descB_local, descC_local;
    MKL_INT info;
    MKL_INT a_m_local, a_n_local, b_m_local, b_n_local, c_m_local, c_n_local;
    MKL_INT a_lld, b_lld, c_lld;

    /* Set up the BLACS process grid (row-major ordering). */
    blacs_pinfo_( &iam, &nprocs );
    blacs_get_( &i_negone, &i_zero, &ictxt );
    blacs_gridinit_( &ictxt, "R", &nprow, &npcol );
    blacs_gridinfo_( &ictxt, &nprow, &npcol, &myrow, &mycol );

    /* Only rank 0 holds the global matrices; other ranks keep NULL, which is
     * fine because the "local" descriptors below place the data on (0,0). */
    double *a = 0;
    double *b = 0;
    double *c = 0;
    if (iam==0) {
        a = gen_a(m, k);
        b = gen_b(k, n);
        c = (double*)calloc(m*n, sizeof(double));
        puts("a=");
        print(a, m, k);
        puts("b=");
        print(b, k, n);
    }

    /* Local tile sizes of the block-cyclic distribution on this process. */
    a_m_local = numroc_( &m, &nb, &myrow, &i_zero, &nprow );
    a_n_local = numroc_( &k, &nb, &mycol, &i_zero, &npcol );
    b_m_local = numroc_( &k, &nb, &myrow, &i_zero, &nprow );
    b_n_local = numroc_( &n, &nb, &mycol, &i_zero, &npcol );
    c_m_local = numroc_( &m, &nb, &myrow, &i_zero, &nprow );
    c_n_local = numroc_( &n, &nb, &mycol, &i_zero, &npcol );

    double *A = (double*) calloc( a_m_local * a_n_local, sizeof( double ) );
    double *B = (double*) calloc( b_m_local * b_n_local, sizeof( double ) );
    double *C = (double*) calloc( c_m_local * c_n_local, sizeof( double ) );

    /* Leading dimensions must be >= 1 even when the local tile is empty. */
    a_lld = MAX( a_m_local, 1 );
    b_lld = MAX( b_m_local, 1 );
    c_lld = MAX( c_m_local, 1 );
    if (iam==0) {
        printf("a_m_local = %d\ta_n_local = %d\tb_m_local = %d\tb_n_local = %d\tc_m_local = %d\tc_n_local = %d\n",
            a_m_local, a_n_local, b_m_local, b_n_local, c_m_local, c_n_local);
        printf("a_lld = %d\tb_lld = %d\tc_lld = %d\n", a_lld, b_lld, c_lld);
    }

    /* "Local" descriptors describe the full matrices living on rank (0,0)
     * (one block of the full size); the others describe the distributed
     * copies. NOTE(review): info from descinit_ is never checked. */
    descinit_( descA_local, &m, &k, &m, &k, &i_zero, &i_zero, &ictxt, &m, &info );
    descinit_( descB_local, &k, &n, &k, &n, &i_zero, &i_zero, &ictxt, &k, &info );
    descinit_( descC_local, &m, &n, &m, &n, &i_zero, &i_zero, &ictxt, &m, &info );
    descinit_( descA, &m, &k, &nb, &nb, &i_zero, &i_zero, &ictxt, &a_lld, &info );
    descinit_( descB, &k, &n, &nb, &nb, &i_zero, &i_zero, &ictxt, &b_lld, &info );
    descinit_( descC, &m, &n, &nb, &nb, &i_zero, &i_zero, &ictxt, &c_lld, &info );

    printf("Rank %d: start distribute data\n", iam);
    /* Scatter: distributed = 1.0 * global + 0.0 * distributed. */
    pdgeadd_( &trans, &m, &k, &one, a, &i_one, &i_one, descA_local, &zero, A, &i_one, &i_one, descA );
    pdgeadd_( &trans, &k, &n, &one, b, &i_one, &i_one, descB_local, &zero, B, &i_one, &i_one, descB );
    printf("Rank %d: finished distribute data\n", iam);
    if (iam==0) {
        puts("a");
        print(A, a_m_local, a_n_local);
        puts("b");
        print(B, b_m_local, b_n_local);
    }

    /* Distributed multiply: C = A * B. */
    pdgemm_( "N", "N", &m, &n, &k, &one, A, &i_one, &i_one, descA, B, &i_one, &i_one, descB,
             &zero, C, &i_one, &i_one, descC );
    printf("Rank %d: finished dgemm\n", iam);
    if (iam == 0) {
        puts("c");
        print(C, c_m_local, c_n_local);
    }

    /* Gather the distributed C back into the global c on rank 0. */
    pdgeadd_( &trans, &m, &n, &one, C, &i_one, &i_one, descC, &zero, c, &i_one, &i_one, descC_local);
    if (iam==0) {
        puts("global c");
        print(c, m, n);
    }

    free(A);
    free(B);
    free(C);
    if (iam==0) {
        free(a);
        free(b);
        free(c);
    }
    blacs_gridexit_( &ictxt );
    blacs_exit_( &i_zero );
}
/// The computed result (the inverse) is stored in matrix ai.
/// n_global: the order of the matrix
/// Benchmarks distributed matrix inversion: LU-factorize a random N x N matrix
/// with pdgetrf_, invert it with pdgetri_ (after a workspace query), then
/// verify via A*A^{-1} - I and report timing/GFlops on rank 0.
static void inv_driver(blas_idx_t n_global)
{
    auto grid = std::make_shared<blacs_grid_t>();

    //// self code
    //n_global = 3;
    //double *aaa = new double(n_global*n_global);
    //for (int i = 0; i < 9; i++)
    //{
    //	aaa[i] = i + 1;
    //}
    //aaa[8] = 10;
    //auto a = block_cyclic_mat_t::createWithArray(grid, n_global, n_global, aaa);

    // Create a NxN random matrix A
    auto a = block_cyclic_mat_t::random(grid, n_global, n_global);

    // Create a NxN matrix to hold A^{-1}
    auto ai = block_cyclic_mat_t::constant(grid, n_global, n_global);

    // Copy A to A^{-1} since it will be overwritten during factorization
    std::copy_n(a->local_data(), a->local_size(), ai->local_data());

    MPI_Barrier (MPI_COMM_WORLD);
    double t0 = MPI_Wtime();

    // Factorize A
    blas_idx_t ia = 1, ja = 1;  // 1-based global offsets of the submatrix
    // Pivot array sized local rows + block size (+100 slack — presumably
    // defensive padding; TODO confirm against pdgetrf_ requirements).
    std::vector<blas_idx_t> ipiv(a->local_rows() + a->row_block_size() + 100);
    blas_idx_t info;
    // The name means D-GE-TRF:
    //   D  - the matrix is double precision
    //   GE - the matrix is of General type
    //   TRF - triangular factorization, i.e. the usual LU decomposition.
    pdgetrf_(n_global, n_global, ai->local_data(), ia, ja, ai->descriptor(), ipiv.data(), info);
    assert(info == 0);
    double t_factor = MPI_Wtime() - t0;

    // Compute A^{-1} based on the LU factorization

    // Compute workspace for double and integer work arrays on each process
    blas_idx_t lwork  = 10;
    blas_idx_t liwork = 10;
    std::vector<double>     work (lwork);
    std::vector<blas_idx_t> iwork(liwork);

    lwork = liwork = -1;   // -1 triggers the LAPACK-style workspace query

    // Query the required values of lwork and liwork
    pdgetri_(n_global, ai->local_data(), ia, ja, ai->descriptor(), ipiv.data(),
        work.data(), lwork, iwork.data(), liwork, info);
    assert(info == 0);
    lwork  = static_cast<blas_idx_t>(work[0]);
    // NOTE(review): cast goes through size_t before narrowing to blas_idx_t —
    // works, but a direct static_cast<blas_idx_t> would be clearer.
    liwork = static_cast<size_t>(iwork[0]);
    work.resize(lwork);
    iwork.resize(liwork);

    // Now compute the inverse
    t0 = MPI_Wtime();
    pdgetri_(n_global, ai->local_data(), ia, ja, ai->descriptor(), ipiv.data(),
        work.data(), lwork, iwork.data(), liwork, info);
    assert(info == 0);
    double t_solve = MPI_Wtime() - t0;

    // Verify that the inverse is correct using A*A^{-1} = I
    auto identity = block_cyclic_mat_t::diagonal(grid, n_global, n_global);

    // Compute I = A * A^{-1} - I and verify that the ||I|| is small
    char nein = 'N';
    double alpha = 1.0, beta = -1.0;
    pdgemm_(nein, nein, n_global, n_global, n_global, alpha,
        a->local_data() , ia, ja, a->descriptor(),
        ai->local_data(), ia, ja, ai->descriptor(),
        beta,
        identity->local_data(), ia, ja, identity->descriptor());

    // Compute 1-norm of the result
    char norm='1';
    work.resize(identity->local_cols());
    double err = pdlange_(norm, n_global, n_global,
        identity->local_data(), ia, ja, identity->descriptor(), work.data());

    double t_total = t_factor + t_solve;
    double t_glob;
    // Report the slowest process's time as the wall time of the run.
    MPI_Reduce(&t_total, &t_glob, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);

    if (grid->iam() == 0)
    {
        double gflops = getri_flops(n_global)/t_glob/grid->nprocs();
        printf("\n"
            "MATRIX INVERSE BENCHMARK SUMMARY\n"
            "================================\n"
            "N = %d\tNP = %d\tNP_ROW = %d\tNP_COL = %d\n"
            "Time for PxGETRF + PxGETRI = %10.7f seconds\tGflops/Proc = %10.7f, Error = %f\n",
            n_global, grid->nprocs(), grid->nprows(), grid->npcols(),
            t_glob, gflops, err);fflush(stdout);
    }
}
// Benchmarks distributed C = A*B with pdgemm_ on block-cyclic matrices and
// reports timing/GFlops on rank 0, then prints a 10x10 corner of the local
// result. A is random m x k; B is built from a test array (see notes below);
// C starts random and is fully overwritten (beta = 0).
static void dgemm_driver(blas_idx_t m_global, blas_idx_t n_global, blas_idx_t k_global)
{
    auto grid = std::make_shared<blacs_grid_t>();
    auto a = block_cyclic_mat_t::random(grid, m_global, k_global);
    // auto b = block_cyclic_mat_t::random(grid, k_global, n_global);
    auto c = block_cyclic_mat_t::random(grid, m_global, n_global);

    //for test TODO
    // NOTE(review): b is created with dimensions (m_global, k_global), but the
    // multiply below expects B to be (k_global, n_global) — this only lines up
    // when m_global == k_global == n_global. Confirm intent; compare the
    // commented-out random(grid, k_global, n_global) line above.
    // NOTE(review): dd is never freed here — leak unless createWithArray takes
    // ownership of the buffer; verify block_cyclic_mat_t's contract.
    double *dd = new double[m_global*k_global];
    for (int i = 0; i < m_global*k_global; i++)
    {
        dd[i] = i;
    }
    auto b = block_cyclic_mat_t::createWithArray(grid, m_global, k_global, dd);

    // The matrix computation starts here
    MPI_Barrier(MPI_COMM_WORLD);
    double alpha = 1.0, beta = 0.0;
    double t0 = MPI_Wtime();

    char NEIN = 'N';  // 'N' means no transpose
    blas_idx_t ia = 1, ja = 1, ib = 1, jb = 1, ic = 1, jc = 1;
    // sub(C) = alpha*op(sub(A))*op(sub(B)) + beta*sub(C)
    pdgemm_ (NEIN, NEIN, m_global, n_global, k_global,
        alpha,
        a->local_data(), ia, ja, a->descriptor(),
        b->local_data(), ib, jb, b->descriptor(),
        beta,
        c->local_data(), ic, jc, c->descriptor()
        );

    double t1 = MPI_Wtime() - t0;
    double t_glob;
    // Take the maximum elapsed time over all processes
    MPI_Reduce(&t1, &t_glob, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);

    if (grid->iam() == 0)  // process with rank 0
    {
        double gflops = gemm_flops(m_global, n_global, k_global)/t_glob/grid->nprocs();
        printf("\n"
            "MATRIX MULTIPLY BENCHMARK SUMMARY\n"
            "=================================\n"
            "M = %d\tN = %d\tK = %d\tNP = %d\tNP_ROW = %d\tNP_COL = %d\n"
            "Time for PxGEMM = %10.7f seconds\tGFlops/Proc = %10.7f\n",
            m_global, n_global, k_global,
            grid->nprocs(), grid->nprows(), grid->npcols(),
            t_glob, gflops);
        fflush(stdout);
        // NOTE(review): indexing with stride k_global treats the local buffer
        // as row-major with k_global columns; c is an m x n distributed matrix
        // whose local tile presumably uses its own (column-major) leading
        // dimension — verify this debug print against block_cyclic_mat_t's
        // storage layout.
        for (int i = 0; i < 10; i++)
        {
            for (int j = 0; j < 10; j++)
            {
                printf("%f ", c->local_data()[i*k_global + j]);
            }
            printf("\n");
        }
        fflush(stdout);
    }
}
/*==== MAIN FUNCTION =================================================*/
/*
 * PBLAS level-3 example: builds an orthonormal matrix A and a random matrix B
 * on process 0, distributes both, computes C = A*B and the residual
 * B - A'*C (A' = inv(A) since A is orthonormal), and checks the scaled
 * residual against a threshold read from the input file.
 * Relies on file-scope constants not visible here (assumed: i_negone, i_zero,
 * i_one, i_four, one, two, zero, negone, trans, MAX, MDESC) — confirm
 * against the rest of the file.
 * Returns 0 on success, 1 on residual failure, 2 on input error.
 */
int main( int argc, char *argv[] ){

/*  ==== Declarations =================================================== */

/*  File variables */
    FILE    *fin;

/*  Matrix descriptors */
    MDESC   descA, descB, descC, descA_local, descB_local;

/*  Local scalars */
    MKL_INT iam, nprocs, ictxt, myrow, mycol, nprow, npcol;
    MKL_INT n, nb, mp, nq, lld, lld_local;
    MKL_INT i, j, info;
    int     n_int, nb_int, nprow_int, npcol_int;
    double  thresh, diffnorm, anorm, bnorm, residual, eps;

/*  Local arrays */
    double  *A_local, *B_local, *A, *B, *C, *work;
    MKL_INT iwork[ 4 ];

/*  ==== Executable statements ========================================== */

/*  Get information about how many processes are used for program execution
    and number of current process */
    blacs_pinfo_( &iam, &nprocs );

/*  Init temporary 1D process grid */
    blacs_get_( &i_negone, &i_zero, &ictxt );
    blacs_gridinit_( &ictxt, "C", &nprocs, &i_one );

/*  Open input file */
    if ( iam == 0 ) {
        fin = fopen( "../in/pblas3ex.in", "r" );
        if ( fin == NULL ) {
            printf( "Error while open input file." );
            return 2;
        }
    }

/*  Read data and send it to all processes */
    if ( iam == 0 ) {

/*      Read parameters.
        NOTE(review): fscanf return values are not checked — malformed input
        leaves n_int etc. uninitialized. */
        fscanf( fin, "%d n, dimension of vectors, must be > 0 ", &n_int );
        fscanf( fin, "%d nb, size of blocks, must be > 0 ", &nb_int );
        fscanf( fin, "%d p, number of rows in the process grid, must be > 0", &nprow_int );
        fscanf( fin, "%d q, number of columns in the process grid, must be > 0, p*q = number of processes", &npcol_int );
        fscanf( fin, "%lf threshold for residual check (to switch off check set it < 0.0) ", &thresh );
        n = (MKL_INT) n_int;
        nb = (MKL_INT) nb_int;
        nprow = (MKL_INT) nprow_int;
        npcol = (MKL_INT) npcol_int;

/*      Check if all parameters are correct */
        if( ( n<=0 )||( nb<=0 )||( nprow<=0 )||( npcol<=0 )||( nprow*npcol != nprocs ) ) {
            printf( "One or several input parameters has incorrect value. Limitations:\n" );
            printf( "n > 0, nb > 0, p > 0, q > 0 - integer\n" );
            printf( "p*q = number of processes\n" );
            printf( "threshold - double (set to negative to swicth off check)\n");
            return 2;
        }

/*      Pack data into array and send it to other processes
        (BLACS broadcast over the temporary 1D grid). */
        iwork[ 0 ] = n;
        iwork[ 1 ] = nb;
        iwork[ 2 ] = nprow;
        iwork[ 3 ] = npcol;
        igebs2d_( &ictxt, "All", " ", &i_four, &i_one, iwork, &i_four );
        dgebs2d_( &ictxt, "All", " ", &i_one, &i_one, &thresh, &i_one );
    } else {

/*      Recieve and unpack data */
        igebr2d_( &ictxt, "All", " ", &i_four, &i_one, iwork, &i_four, &i_zero, &i_zero );
        dgebr2d_( &ictxt, "All", " ", &i_one, &i_one, &thresh, &i_one, &i_zero, &i_zero );
        n = iwork[ 0 ];
        nb = iwork[ 1 ];
        nprow = iwork[ 2 ];
        npcol = iwork[ 3 ];
    }
    if ( iam == 0 ) { fclose( fin ); }

/*  Destroy temporary process grid */
    blacs_gridexit_( &ictxt );

/*  Init workind 2D process grid */
    blacs_get_( &i_negone, &i_zero, &ictxt );
    blacs_gridinit_( &ictxt, "R", &nprow, &npcol );
    blacs_gridinfo_( &ictxt, &nprow, &npcol, &myrow, &mycol );

/*  Create on process 0 two matrices: A - orthonormal, B -random */
    if ( ( myrow == 0 ) && ( mycol == 0 ) ){

/*      Allocate arrays */
        A_local = (double*) calloc( n*n, sizeof( double ) );
        B_local = (double*) calloc( n*n, sizeof( double ) );

/*      Set arrays: B is random with a diagonal shift of +2 (diagonal
        dominance); A is a Householder-like orthonormal matrix built
        column by column from q_i = sqrt(i^2+i). */
        for ( i=0; i<n; i++ ){
            for ( j=0; j<n; j++ ){
                B_local[ i+n*j ] = one*rand()/RAND_MAX;
            }
            B_local[ i+n*i ] += two;
        }
        for ( j=0; j<n; j++ ){
            for ( i=0; i<n; i++ ){
                if ( j < n-1 ){
                    if ( i <= j ){
                        A_local[ i+n*j ] = one / sqrt( ( double )( (j+1)*(j+2) ) );
                    } else if ( i == j+1 ) {
                        A_local[ i+n*j ] = -one / sqrt( one + one/( double )(j+1) );
                    } else {
                        A_local[ i+n*j ] = zero;
                    }
                } else {
                    A_local[ i+n*(n-1) ] = one / sqrt( ( double )n );
                }
            }
        }

/*      Print information of task */
        printf( "=== START OF EXAMPLE ===================\n" );
        printf( "Matrix-matrix multiplication: A*B = C\n\n" );
        printf( "/  1/q_1 ........   1/q_n-1   1/q_n  \\ \n" );
        printf( "|        .                           | \n" );
        printf( "|         `.       :         :       | \n" );
        printf( "| -1/q_1    `.     :         :       | \n" );
        printf( "|        .    `.   :         :       |  =  A \n" );
        printf( "|   0     `.    `                    | \n" );
        printf( "|   : `.    `.      1/q_n-1   1/q_n  | \n" );
        printf( "|   :   `.    `.                     | \n" );
        printf( "\\   0 .... 0    -(n-1)/q_n-1  1/q_n  / \n\n" );
        printf( "q_i = sqrt( i^2 + i ), i=1..n-1, q_n = sqrt( n )\n\n" );
        printf( "A - n*n real matrix (orthonormal) \n" );
        printf( "B - random n*n real matrix\n\n" );
        printf( "n = %d, nb = %d; %dx%d - process grid\n\n", n, nb, nprow, npcol );
        printf( "=== PROGRESS ===========================\n" );
    } else {

/*      Other processes don't contain parts of initial arrays */
        A_local = NULL;
        B_local = NULL;
    }

/*  Compute precise length of local pieces and allocate array on each process
    for parts of distributed vectors */
    mp = numroc_( &n, &nb, &myrow, &i_zero, &nprow );
    nq = numroc_( &n, &nb, &mycol, &i_zero, &npcol );
    A = (double*) calloc( mp*nq, sizeof( double ) );
    B = (double*) calloc( mp*nq, sizeof( double ) );
    C = (double*) calloc( mp*nq, sizeof( double ) );

/*  Compute leading dimensions */
    lld_local = MAX( numroc_( &n, &n, &myrow, &i_zero, &nprow ), 1 );
    lld = MAX( mp, 1 );

/*  Initialize descriptors for initial arrays located on 0 process */
    descinit_( descA_local, &n, &n, &n, &n, &i_zero, &i_zero, &ictxt, &lld_local, &info );
    descinit_( descB_local, &n, &n, &n, &n, &i_zero, &i_zero, &ictxt, &lld_local, &info );

/*  Initialize descriptors for distributed arrays */
    descinit_( descA, &n, &n, &nb, &nb, &i_zero, &i_zero, &ictxt, &lld, &info );
    descinit_( descB, &n, &n, &nb, &nb, &i_zero, &i_zero, &ictxt, &lld, &info );
    descinit_( descC, &n, &n, &nb, &nb, &i_zero, &i_zero, &ictxt, &lld, &info );

/*  Distribute matrices from 0 process over process grid
    (p?geadd with alpha=1, beta=0 acts as a scatter). */
    pdgeadd_( &trans, &n, &n, &one, A_local, &i_one, &i_one, descA_local, &zero, A, &i_one, &i_one, descA );
    pdgeadd_( &trans, &n, &n, &one, B_local, &i_one, &i_one, descB_local, &zero, B, &i_one, &i_one, descB );
    if( iam == 0 ){ printf( ".. Arrays are distributed ( p?geadd ) ..\n" ); }

/*  Destroy arrays on 0 process - they are not necessary anymore */
    if( ( myrow == 0 ) && ( mycol == 0 ) ){
        free( A_local );
        free( B_local );
    }

/*  Compute norm of A and B */
    work = (double*) calloc( mp, sizeof( double ) );
    anorm = pdlange_( "I", &n, &n, A, &i_one, &i_one, descA, work );
    bnorm = pdlange_( "I", &n, &n, B, &i_one, &i_one, descB, work );
    if( iam == 0 ){ printf( ".. Norms of A and B are computed ( p?lange ) ..\n" ); }

/*  Compute product C = A*B */
    pdgemm_( "N", "N", &n, &n, &n, &one, A, &i_one, &i_one, descA, B, &i_one, &i_one, descB,
             &zero, C, &i_one, &i_one, descC );
    if( iam == 0 ){ printf( ".. Multiplication A*B=C is done ( p?gemm ) ..\n" ); }

/*  Compute difference  B - inv_A*C (inv_A = transpose(A) because A is orthonormal) */
    pdgemm_( "T", "N", &n, &n, &n, &one, A, &i_one, &i_one, descA, C, &i_one, &i_one, descC,
             &negone, B, &i_one, &i_one, descB );
    if( iam == 0 ){ printf( ".. Difference is computed ( p?gemm ) ..\n" ); }

/*  Compute norm of B - inv_A*C (which is contained in B) */
    diffnorm = pdlange_( "I", &n, &n, B, &i_one, &i_one, descB, work );
    free( work );
    if( iam == 0 ){ printf( ".. Norms of the difference B-inv_A*C is computed ( p?lange ) ..\n" ); }

/*  Print results */
    if( iam == 0 ){
        printf( ".. Solutions are compared ..\n" );
        printf( "== Results ==\n" );
        printf( "||A|| = %03.11f\n", anorm );
        printf( "||B|| = %03.11f\n", bnorm );
        printf( "=== END OF EXAMPLE =====================\n" );
    }

/*  Compute machine epsilon */
    eps = pdlamch_( &ictxt, "e" );

/*  Compute residual */
    residual = diffnorm /( two*anorm*bnorm*eps );

/*  Destroy arrays */
    free( A );
    free( B );
    free( C );

/*  Destroy process grid */
    blacs_gridexit_( &ictxt );
    blacs_exit_( &i_zero );

/*  Check if residual passed or failed the threshold */
    if ( ( iam == 0 ) && ( thresh >= zero ) && !( residual <= thresh ) ){
        printf( "FAILED. Residual = %05.16f\n", residual );
        return 1;
    } else {
        return 0;
    }

/*========================================================================
  ====== End of PBLAS Level 3 example ====================================
  ======================================================================*/
}
/* Test program
 * created 23/09/2014
 * author Alex Bombrun
 *
 * icc -O1 -o eigen.exe lapackReadStore.c mpiutil.c normals.c matrixBlockStore.c -mkl
 * ./eigen.exe 4 4
 *
 */

/*
 * Assembles the reduced normal matrix from its stored diagonal/off-diagonal
 * blocks into a 2D block-cyclic ScaLAPACK distribution, applies rank-1
 * updates A <- A + k_j k_j' for the 6 kernel columns, computes the Cholesky
 * factor (pdpotrf_) and solves A x = b (pdpotrs_), printing diagnostic norms.
 *
 * argv[1] : number of diagonal blocks of the stored matrix
 * argv[2] : scalapack block-count parameter (local dim must be divisible by it)
 *
 * Returns 0 on success; exits with -1 on bad usage.
 *
 * BUG FIX: every "&notrans" argument of pdgemv_/pdgemm_ had been corrupted
 * into the single character '¬' followed by "rans" (HTML-entity mangling of
 * "&not"), which cannot compile; restored. Also fixed swapped calloc
 * argument order and freed leaked buffers at the end.
 */
int main(int argc, char **argv) {
    FILE* store;            // block store being read
    FILE* scaStore;         // store receiving the local Cholesky factor
    int N , M;              // global matrix dimensions (square: M == N)
    int i, j;
    int n_blocks;           // number of diagonal blocks in the store
    int scalapack_size;     // number of scalapack blocks per local dimension
    int NB, MB;             // dimensions of the block read from file
    int i_block, j_block;
    int dim[4];             // global index range [row0,row1,col0,col1) of a stored block
    double * mat;           // local matrix block used for reading
    double * matK;          // the kernel matrix, N x 6
    int t_block;
    const char* profileG_file_name= "./data/NormalsG/profile.txt";
    const char* store_location = "./data/ReducedNormals";
    const char* scaStore_location ="./data/CholeskyReducedNormals";
    const char* kernel_file_name ="./data/kernel.txt";

    int mp;                 // number of rows in the processor grid
    int mla;                // number of rows in the local array
    int mb;                 // number of rows in a block
    int np;                 // number of columns in the processor grid
    int nla;                // number of columns in the local array
    int nb;                 // number of columns in a block

    int mype,npe;           // rank and total number of processes

    int idescal[9];         // descriptor of the distributed normal matrix
    double *la;             // local part of the distributed normal matrix
    int idescaal[9];
    double *laa;            // kept for the commented-out debugging section
    int idescbl[9];
    double *lb;             // local part of the right-hand side b
    double normb;
    int idescxl[9];
    double *lx;             // local part of the reference solution x
    double normx;
    int idesck1l[9];
    double *lk1;            // local part of one kernel column
    double normk1;
    int idesczl[9];         // kept for the commented-out eigenvalue section
    double *lz;
    double *w;

    int ierr;               // ScaLAPACK error/info output
    int mp_ret, np_ret, myrow, mycol; // grid info as returned by BLACS
    int zero=0;             // constant used for descriptor initialization
    int one=1;              // constant used for descriptor initialization
    int m,n;                // matrix A dimensions
    double norm, cond;      // kept for the commented-out pdpocon_ section
    double *work = NULL;
    double * work2 = NULL;
    int *iwork = NULL;
    int lwork, liwork;

    float ll,mm,cr,cc;      // block-cyclic mapping helpers
    int ii,jj,pr,pc,h,g;    // ii,jj coordinates of local array element
    int rsrc=0,csrc=0;      // assume that the (0,0) element is stored in the (0,0) process
    int n_b = 1;
    int index;
    int icon;               // scalapack cblacs context
    char normJob, jobz, uplo, trans, notrans, diag;
    double MPIt1, MPIt2, MPIelapsed; // kept for the commented-out timing code

    jobz= 'N';
    uplo='U';
    Cblacs_pinfo( &mype, &npe );

    if (argc == 3) {
        //printf("%s %s %s\n", argv[0], argv[1], argv[2]);
        n_blocks= (int) strtol(argv[1], NULL, 10);
        scalapack_size= (int) strtol(argv[2], NULL, 10);
    } else {
        printf("Usage: expect 2 integers \n");
        printf(" 1 : the number of diagonal blocks \n");
        printf(" 2 : scalapack number to define block size (assume n is divisible by sqrt(p) and that n/sqrt(p) is divisible by this number)\n");
        exit( -1);
    }

    printf("%d/%d: read store\n",mype,npe);
    N = getNumberOfLine(profileG_file_name); // the dimension of the matrix
    M = N;   // square matrix
    m=M;     //mla*mp;
    n=N;     //nla*np;

    np = isqrt(npe);          // assume that the number of processes is a square
    mp = np;                  // square grid
    mla = m/mp;               // assume the matrix dimension is a multiple of the grid dimension
    nla = n/np;
    mb = mla/scalapack_size;  // assume the local dimension is a multiple of scalapack_size
    nb = nla/scalapack_size;

    // init CBLACS
    Cblacs_get( -1, 0, &icon );
    Cblacs_gridinit( &icon,"c", mp, np );
    Cblacs_gridinfo( icon, &mp_ret, &np_ret, &myrow, &mycol);

    // read blocks and set the reduced normal matrix in the scalapack grid
    la=malloc(sizeof(double)*mla*nla); // local part of the distributed matrix
    printf("%d/%d: full matrix (%d,%d), local matrix (%d,%d), processor grid (%d,%d), block (%d,%d) \n", mype, npe, m, n, mla, nla, np, mp, mb, nb);
    for(i_block=0;i_block<n_blocks;i_block++){
        printf("%d/%d: process store block %d \n", mype, npe, i_block);
        readStore(&store,i_block,store_location);
        t_block = 0;
        while(readNextBlockDimension(dim,store)!=-1) { // loop B over all block tasks
            j_block = mpi_get_diag_block_id(i_block, t_block, n_blocks);
            mat = malloc((dim[1]-dim[0])*(dim[3]-dim[2]) * sizeof(double));
            readNextBlock(dim[0],dim[1],dim[2],dim[3],mat,store);
            // printf("%d/%d: read block (%d,%d) with global indices (%d,%d,%d,%d) \n",mype, npe, i_block,j_block,dim[0],dim[1],dim[2],dim[3]);
            NB = dim[1]-dim[0];
            MB = dim[3]-dim[2];
            for(i = dim[0];i<dim[1];i++){
                for(j = dim[2];j<dim[3];j++){
                    //matA[i*M+j] = mat[(i-dim[0])*MB+(j-dim[2])];
                    // finding out which pe gets this i,j element
                    cr = (float)( i/mb );
                    h = rsrc+(int)(cr);
                    pr = h%np;
                    cc = (float)( j/mb ); // NOTE(review): uses mb, not nb — only valid while mb == nb; confirm
                    g = csrc+(int)(cc);
                    pc = g%mp;
                    // check if this process should get this element
                    if (myrow == pr && mycol==pc){
                        // ii = x + l*mb
                        // jj = y + m*nb
                        ll = (float)( ( i/(np*mb) ) ); // things seem mixed up; does not matter as long as the matrix, the block and the grid are symmetric
                        mm = (float)( ( j/(mp*nb) ) );
                        ii = i%mb + (int)(ll)*mb;
                        jj = j%nb + (int)(mm)*nb;
                        index=jj*mla+ii; // seems to be the transpose !?
                        //if(index<0) printf("%d/%d: negative index (%d,%d) \n",mype,npe,i,j);
                        //if(index>=mla*nla) printf("%d/%d: too large index (%d,%d) \n",mype,npe,i,j);
                        la[index] = mat[(i-dim[0])*MB+(j-dim[2])];
                    }
                }
            }
            // transpose: an off-diagonal block also contributes its (j,i) mirror
            if(j_block != i_block){
                for(i = dim[0];i<dim[1];i++){
                    for(j = dim[2];j<dim[3];j++){
                        //matA[j*M+i] = mat[(i-dim[0])*MB+(j-dim[2])];
                        // finding out which pe gets this j,i element
                        cr = (float)( j/mb );
                        h = rsrc+(int)(cr);
                        pr = h%np;
                        cc = (float)( i/mb );
                        g = csrc+(int)(cc);
                        pc = g%mp;
                        // check if this process should get this element
                        if (myrow == pr && mycol==pc){
                            // ii = x + l*mb
                            // jj = y + m*nb
                            ll = (float)( ( j/(np*mb) ) );
                            mm = (float)( ( i/(mp*nb) ) );
                            ii = j%mb + (int)(ll)*mb;
                            jj = i%nb + (int)(mm)*nb;
                            index=jj*mla+ii; // seems to be the transpose !?
                            //if(index<0) printf("%d/%d: negative index (%d,%d) \n",mype,npe,i,j);
                            //if(index>=mla*nla) printf("%d/%d: too large index (%d,%d) \n",mype,npe,i,j);
                            la[index] = mat[(i-dim[0])*MB+(j-dim[2])];
                        }
                    }
                }
            }
            free(mat);
            t_block++;
        }
        closeStore(store);
    }

    printf("%d/%d: finished scaterring the matrix \n",mype,npe);

    // read the kernel matrix
    printf("%d/%d: set kernel \n",mype,npe);
    matK = malloc(N * 6* sizeof(double));
    readMatrixDouble(matK,kernel_file_name);

    // set k1: distribute the first kernel column
    lk1 = calloc(mla, sizeof(double)); // FIX: calloc takes (nmemb, size)
    for(i = 0; i<N; i++) {
        // finding out which pe gets element i
        cr = (float)( i/mb );
        h = rsrc+(int)(cr);
        pr = h%np;
        // check if this process should get this element
        if (myrow == pr) {
            // ii = x + l*mb
            ll = (float)( ( i/(np*mb) ) );
            ii = i%mb + (int)(ll)*mb;
            lk1[ii] = matK[N*0+i];
        }
    }
    printf("%d/%d: finished scaterring the kernel vector \n",mype,npe);

    printf("%d/%d: start computing \n",mype,npe);
    // set the matrix descriptors (process grid ids start at 0)
    ierr=0;
    descinit_(idescal, &m, &n , &mb, &nb , &zero, &zero, &icon, &mla, &ierr);
    if (mype==0) saveMatrixDescriptor(idescal, scaStore_location);

    ierr=0;
    descinit_(idescaal, &m, &n , &mb, &nb , &zero, &zero, &icon, &mla, &ierr);

    ierr=0;
    // NOTE(review): lld is nla while lb holds mla entries — harmless only because m == n here; confirm
    descinit_(idescbl, &m, &one , &mb, &nb , &zero, &zero, &icon, &nla, &ierr);
    lb = calloc(mla, sizeof(double));

    ierr=0;
    // set x, the constant reference solution 1/m
    descinit_(idescxl, &n, &one , &mb, &nb , &zero, &zero, &icon, &nla, &ierr);
    lx = calloc(mla, sizeof(double));
    for(i=0;i<mla;i++){
        lx[i] = 1.0/m;
    }
    pddot_(&n,&normx,lx,&one,&one,idescxl,&one,lx,&one,&one,idescxl,&one); // normx <- x'x
    if (mype==0) printf("%d/%d: normx2 %E \n",mype,npe,normx);

    ierr=0;
    // set k1 descriptor
    descinit_(idesck1l, &n, &one , &mb, &nb , &zero, &zero, &icon, &nla, &ierr);
    pddot_(&n,&normk1,lk1,&one,&one,idesck1l,&one,lk1,&one,&one,idesck1l,&one); // normk1 <- k1'k1
    if (mype==0) printf("%d/%d: normk1 square %E \n",mype,npe,normk1);

    ierr=0;
    // set b <- A k1 (expected ~0 if k1 spans the kernel of A)
    double alpha =1.0;
    double beta =0.0;
    notrans = 'N';
    pdgemv_(&notrans,&m,&n,&alpha,la,&one,&one,idescal,lk1,&one,&one,idesck1l,&one,&beta,lb,&one,&one,idescbl,&one); // b <- A k1
    pddot_(&n,&normb,lb,&one,&one,idescbl,&one,lb,&one,&one,idescbl,&one); // normb <- b'b
    if (mype==0) printf("%d/%d: is kernel, normb square %E \n",mype,npe,normb);

    ierr=0;
    // set b <- A x, the right-hand side matching the reference solution
    alpha =1.0;
    beta =0.0;
    notrans = 'N';
    pdgemv_(&notrans,&m,&n,&alpha,la,&one,&one,idescal,lx,&one,&one,idescxl,&one,&beta,lb,&one,&one,idescbl,&one); // b <- A x
    pddot_(&n,&normb,lb,&one,&one,idescbl,&one,lb,&one,&one,idescbl,&one); // normb <- b'b
    if (mype==0) printf("%d/%d: normb2 %E \n",mype,npe,normb);

    // set aa: regularize A with rank-1 updates A <- A + k_j k_j'
    printf("%d/%d: start setting aa with k1 x k1\' \n",mype,npe);
    alpha = 1.0;
    beta = 1.0;
    trans = 'T';
    notrans = 'N';
    // laa=malloc(sizeof(double)*mla*nla); // for debugging
    //pdgemm(transa, transb,
    //       m, n, k,
    //       alpha, a, ia , ja , desca,
    //       b, ib , jb ,descb,
    //       beta, c, ic , jc , descc)
    // A = k1 (Nx1)
    // B = k1 (Nx1)
    // C = aa (MxN) ... M=N
    // C <- C + A x B'
    pdgemm_(&notrans, &trans, &N, &N, &one, &alpha, lk1, &one, &one, idesck1l, lk1, &one, &one, idesck1l, &beta, la, &one, &one, idescal );
    for(j = 1; j<6; j++) {
        // distribute kernel column j into lk1, then apply the same rank-1 update
        for(i = 0; i<N; i++) {
            cr = (float)( i/mb );
            h = rsrc+(int)(cr);
            pr = h%np;
            if (myrow == pr) {
                ll = (float)( ( i/(np*mb) ) );
                ii = i%mb + (int)(ll)*mb;
                lk1[ii] = matK[N*j+i];
            }
        }
        pdgemm_(&notrans, &trans, &N, &N, &one, &alpha, lk1, &one, &one, idesck1l, lk1, &one, &one, idesck1l, &beta, la, &one, &one, idescal );
    }

    ierr = 0;
    // compute norm 1 of the reduced normal matrix
    /* DO NOT WORK
    lwork = 2*mla+2*nla;
    work = malloc(sizeof(double)*lwork);
    normJob = '1';
    norm = pdlansy_(&normJob, &uplo, &n, la, &one, &one, idescal, work); // matrix index starts at one
    printf("%d/%d: norm %f \n",mype,npe,norm);
    free(work);
    */

    ierr = 0;
    // compute the Cholesky decomposition in place in la
    printf("%d/%d: start computing cholesky factor\n",mype,npe);
    pdpotrf_(&uplo,&n,la,&one,&one,idescal,&ierr);
    printf("%d/%d: finish computing cholesky factor\n",mype,npe);
    openScalapackStore(&scaStore,myrow,mycol,scaStore_location);
    saveLocalMatrix(la,nla,mla,scaStore);

    double test=0.0;
    for(i=0;i<nla*mla;i++){
        test += la[i]*la[i];
    }
    printf("%d/%d: finished computing cholesky, test=%f \n",mype,npe,test);

    ierr =0;
    // assume x and b are set and the Cholesky factor is available; solve A x = b
    diag = 'N';
    printf("%d/%d: start solving\n",mype,npe);
    //pdpptrs_(&uplo, &trans , &diag , &n , &one , la , &one , &one , idescal , lb , &one , &one , idescbl , &ierr); // solve triangular system
    //pdtrtrs (&uplo, &trans , &diag , &n , &n , la , &one , &one , idescal , lb , &one , &one , idescbl , &ierr);
    pdpotrs_(&uplo, &n , &one , la , &one , &one , idescal , lb , &one , &one , idescbl , &ierr); // b<- A-1 b

    alpha = -1.0;
    normb=0;
    pdaxpy_(&n,&alpha,lx,&one,&one,idescxl,&one,lb,&one,&one,idescbl,&one); // b<-b-x
    pddot_(&n,&normb,lb,&one,&one,idescbl,&one,lb,&one,&one,idescbl,&one); // norm <- b'b
    if (mype==0) printf("%d/%d: finish solving, norm2(sol-true) %E \n",mype,npe,normb);

    ierr = 0;
    /*
    // compute the eigen values
    jobz= 'N'; uplo='U'; // with N z is ignored
    descinit_(idesczl, &m, &n , &mb, &nb , &zero, &zero, &icon, &mla, &ierr);
    lz = malloc(sizeof(double)*mla*nla);
    w = malloc(sizeof(double)*m);
    lwork = -1;
    work = malloc(sizeof(double)*2);
    pdsyev_( &jobz, &uplo, &n, la, &one, &one, idescal, w, lz, &one, &one, idesczl, work, &lwork, &ierr); // only compute lwork
    //pdsyev_( &jobz, &uplo, &n, A, &ione, &ione, descA, W, Z, &ione, &ione, descZ, work, &lwork, &info );
    lwork= (int) work[0];
    free(work);
    work = (double *)calloc(lwork,sizeof(double)) ;
    //MPIt1 = MPI_Wtime();
    pdsyev_( &jobz, &uplo, &n, la, &one, &one, idescal, w, lz, &one, &one, idesczl, work, &lwork, &ierr); // compute the eigen values
    //MPIt2 = MPI_Wtime();
    //MPIelapsed=MPIt2-MPIt1;
    if (mype == 0) {
        saveMatrix(n,w,"eigenvalues.txt");
        //printf("%d/%d: finished job in %8.2fs\n",mype,npe,MPIelapsed); // not working
    }
    */

    ierr = 0;
    // compute the condition number, assuming norm and Cholesky factor are available
    /* DO NOT WORK
    lwork = 2*mla+3*nla;
    printf("%d/%d: lwork=%d @%p\n",mype,npe,lwork,&lwork);
    work2 = malloc(sizeof(double)*lwork);
    liwork = 2*mla+3*nla;
    iwork = malloc(sizeof(int)*liwork);
    pdpocon_(&uplo,&n,la,&one,&one,idescal,&norm,&cond,work2,&lwork,iwork,&liwork,&ierr);
    printf("%d/%d: condition number %f \n",mype,npe,cond);
    */

    // release local buffers (matK/lk1/lb/lx leaked in the original)
    free(matK);
    free(lk1);
    free(lb);
    free(lx);
    free(la);
    Cblacs_gridexit(icon);
    Cblacs_exit( 0 );
    return 0;
}
///
/// Slave-side entry point for a distributed matrix multiply: unpacks the
/// argument/data buffers sent by the master, joins the ScaLAPACK process
/// grid, patches the array descriptors with the freshly created BLACS
/// context, and runs pdgemm_ in place on buffer C.
///
/// @param bufs   buffers from the master: [0] PdgemmArgs, [1] A, [2] B, [3] C
/// @param sizes  byte size of each buffer in bufs
/// @param count  number of buffers supplied (must be >= NUM_BUFS)
/// @return INFO = the status of the pdgemm_() (always 0: pdgemm_ itself
///         reports no INFO; see TODO near the end)
///
slpp::int_t pdgemmSlave(void* bufs[], size_t sizes[], unsigned count)
{
    // symbolic indices into bufs[]/sizes[]
    enum dummy {BUF_ARGS=0, BUF_A, BUF_B, BUF_C, NUM_BUFS };

    for(size_t i=0; i < count; i++) {
        if(DBG) {
            std::cerr << "pdgemmSlave: buffer at:"<< bufs[i] << std::endl;
            std::cerr << "pdgemmSlave: bufsize =" << sizes[i] << std::endl;
        }
    }

    // hard failure if the master did not send all required buffers
    if(count < NUM_BUFS) {
        std::cerr << "pdgemmSlave: master sent " << count
                  << " buffers, but " << NUM_BUFS << " are required." << std::endl;
        ::exit(99); // something that does not look like a signal
    }

    // take a COPY of args (because we will have to patch DESC.CTXT)
    scidb::PdgemmArgs args = *reinterpret_cast<PdgemmArgs*>(bufs[BUF_ARGS]) ;
    if(DBG) {
        std::cerr << "pdgemmSlave: args {" << std::endl ;
        std::cerr << args << std::endl;
        std::cerr << "}" << std::endl ;
    }

    // set up the scalapack grid
    if(DBG) std::cerr << "pdgemmSlave: NPROW:"<<args.NPROW<<" NPCOL:"<<args.NPCOL<<std::endl;
    slpp::int_t ICTXT=-1; // will be overwritten by sl_init

    // call scalapack tools routine to initialize a scalapack grid and give us its
    // context
    sl_init_(ICTXT/*out*/, args.NPROW/*in*/, args.NPCOL/*in*/);

    // query the actual grid and compare with what the master expects
    slpp::int_t NPROW=-1, NPCOL=-1, MYPROW=-1, MYPCOL=-1, MYPNUM=-1; // illegal vals
    getSlaveBLACSInfo(ICTXT/*in*/, NPROW, NPCOL, MYPROW, MYPCOL, MYPNUM);

    // NOTE(review): a mismatch is only logged (under DBG), never acted upon —
    // confirm whether this should be a hard error.
    if(NPROW != args.NPROW || NPCOL != args.NPCOL ||
       MYPROW != args.MYPROW || MYPCOL != args.MYPCOL || MYPNUM != args.MYPNUM){
        if(DBG) {
            std::cerr << "scalapack general parameter mismatch" << std::endl;
            std::cerr << "args NPROW:"<<args.NPROW<<" NPCOL:"<<args.NPCOL
                      << "MYPROW:"<<args.MYPROW<<" MYPCOL:"<<args.MYPCOL<<"MYPNUM:"<<MYPNUM
                      << std::endl;
            std::cerr << "ScaLAPACK NPROW:"<<NPROW<<" NPCOL:"<<NPCOL
                      << "MYPROW:"<<MYPROW<<" MYPCOL:"<<MYPCOL<<"MYPNUM:"<<MYPNUM
                      << std::endl;
        }
    }

    // local trailing dimension (number of locally owned columns, at least 1)
    // of each distributed array, via the standard numroc_ tool
    const slpp::int_t one = 1 ;
    const slpp::int_t LTD_A = std::max(one, numroc_( args.A.DESC.N, args.A.DESC.NB, MYPCOL, /*CSRC_A*/0, NPCOL ));
    const slpp::int_t LTD_B = std::max(one, numroc_( args.B.DESC.N, args.B.DESC.NB, MYPCOL, /*CSRC_B*/0, NPCOL ));
    const slpp::int_t LTD_C = std::max(one, numroc_( args.C.DESC.N, args.C.DESC.NB, MYPCOL, /*CSRC_C*/0, NPCOL ));

    if(DBG) {
        std::cerr << "##################################################" << std::endl;
        std::cerr << "####pdgemmSlave##################################" << std::endl;
        std::cerr << "one:" << one << std::endl;
        std::cerr << "args.A.DESC.MB:" << args.A.DESC.MB << std::endl;
        std::cerr << "MYPROW:" << MYPROW << std::endl;
        std::cerr << "NPROW:" << NPROW << std::endl;
    }

    // size check args
    SLAVE_ASSERT_ALWAYS( sizes[BUF_ARGS] >= sizeof(PdgemmArgs));

    // size check A,B,C -- debugs first
    // expected local element count = local leading dimension * local columns
    slpp::int_t SIZE_A = args.A.DESC.LLD * LTD_A ;
    slpp::int_t SIZE_B = args.B.DESC.LLD * LTD_B ;
    slpp::int_t SIZE_C = args.C.DESC.LLD * LTD_C ;
    if(DBG) {
        if(sizes[BUF_A] != SIZE_A *sizeof(double)) {
            std::cerr << "sizes[BUF_A]:" << sizes[BUF_A]
                      << " != args.A.DESC.LLD:" << args.A.DESC.LLD
                      << "* LTD_A" << LTD_A << "*" << sizeof(double) << std::endl;
        }
        if(sizes[BUF_B] != SIZE_B *sizeof(double)) {
            std::cerr << "sizes[BUF_B]:" << sizes[BUF_B]
                      << " != args.B.DESC.LLD:" << args.B.DESC.LLD
                      << "* LTD_B" << LTD_B << "*" << sizeof(double) << std::endl;
        }
        if(sizes[BUF_C] != SIZE_C *sizeof(double)) {
            std::cerr << "sizes[BUF_C]:" << sizes[BUF_C]
                      << " != args.C.DESC.LLD:" << args.C.DESC.LLD
                      << "* LTD_C" << LTD_C << "*" << sizeof(double) << std::endl;
        }
    }
    SLAVE_ASSERT_ALWAYS(sizes[BUF_A] >= SIZE_A * sizeof(double));
    SLAVE_ASSERT_ALWAYS(sizes[BUF_B] >= SIZE_B * sizeof(double));
    SLAVE_ASSERT_ALWAYS(sizes[BUF_C] >= SIZE_C * sizeof(double));

    // sizes are correct, give the pointers their names
    double* A = reinterpret_cast<double*>(bufs[BUF_A]) ;
    double* B = reinterpret_cast<double*>(bufs[BUF_B]) ;
    double* C = reinterpret_cast<double*>(bufs[BUF_C]) ;

    // debug that the input is readable and show its contents
    if(DBG) {
        for(int ii=0; ii < SIZE_A; ii++) {
            std::cerr << "Pgrid("<< MYPROW << "," << MYPCOL << ") A["<<ii<<"] = " << A[ii] << std::endl;
        }
        for(int ii=0; ii < SIZE_B; ii++) {
            std::cerr << "Pgrid("<< MYPROW << "," << MYPCOL << ") B["<<ii<<"] = " << B[ii] << std::endl;
        }
        for(int ii=0; ii < SIZE_C; ii++) {
            std::cerr << "Pgrid("<< MYPROW << "," << MYPCOL << ") C["<<ii<<"] = " << C[ii] << std::endl;
        }
    }

    // ScaLAPACK: the DESCS are complete except for the correct context
    args.A.DESC.CTXT= ICTXT ;   // (no DESC for S)
    args.B.DESC.CTXT= ICTXT ;
    args.C.DESC.CTXT= ICTXT ;

    if(true || DBG) { // we'll leave this on in Cheshire.0 and re-evaluate later
        std::cerr << "pdgemmSlave: argsBuf is: {" << std::endl;
        std::cerr << args << std::endl;
        std::cerr << "}" << std::endl << std::endl;

        std::cerr << "pdgemmSlave: calling pdgemm_ for computation, with args:" << std::endl ;
        std::cerr << "TRANSA: " << args.TRANSA << ", TRANSB: " << args.TRANSB
                  << ", M: " << args.M << ", N: " << args.N << ", K: " << args.K << std::endl;
        std::cerr << "ALPHA: " << args.ALPHA << std::endl;
        std::cerr << "A: " << (void*)(A) << ", A.I: " << args.A.I << ", A.J: " << args.A.J << std::endl;
        std::cerr << ", A.DESC: " << args.A.DESC << std::endl;
        std::cerr << "B: " << (void*)(B) << ", B.I: " << args.B.I << ", B.J: " << args.B.J << std::endl;
        std::cerr << ", B.DESC: " << args.B.DESC << std::endl;
        std::cerr << "BETA: " << args.BETA << std::endl;
        std::cerr << "C: " << (void*)(C) << ", C.I: " << args.C.I << ", C.J: " << args.C.J << std::endl;
        std::cerr << ", C.DESC: " << args.C.DESC << std::endl;
    }

    //////////////////////////////////////////////////////////////////////
    // the actual distributed multiply: C <- alpha*op(A)*op(B) + beta*C  //
    //////////////////////////////////////////////////////////////////////
    pdgemm_( args.TRANSA, args.TRANSB, args.M, args.N, args.K,
             &args.ALPHA,
             A, args.A.I, args.A.J, args.A.DESC,
             B, args.B.I, args.B.J, args.B.DESC,
             &args.BETA,
             C, args.C.I, args.C.J, args.C.DESC);

    if(true || DBG) { // we'll leave this on in Cheshire.0 and re-evaluate later
        std::cerr << "pdgemmSlave: pdgemm_ complete (pdgemm_ has no result INFO)" << std::endl;
    }

    if (DBG) {
        std::cerr << "pdgemmSlave outputs: {" << std::endl;
        // debug prints of the outputs:
        for(int ii=0; ii < SIZE_C; ii++) {
            std::cerr << " C["<<ii<<"] = " << C[ii] << std::endl;
        }
        std::cerr << "}" << std::endl;
    }

    // TODO: what is the check on the pdgemm_ (pblas call) for successful completion?
    if (DBG) std::cerr << "pdgemmSlave returning successfully:" << std::endl;
    slpp::int_t INFO = 0 ;
    return INFO ;
}
int MAIN__(int argc, char** argv) { int num; // number of data int dim; // dimension of each data int nprow=4; // number of row int npcol=1; // number of columnn int zero=0, one=1; // constant value int ictxt,myrow,mycol,pnum,pdim,info; char ifilename[LEN_FILENAME]; char ofilename[LEN_FILENAME]; int myproc, nprocs; Cblacs_pinfo(&myproc, &nprocs); Cblacs_setup(&myproc, &nprocs); Cblacs_get(-1,0,&ictxt); nprow = nprocs; npcol = 1; // fixed char order[] = "Row"; Cblacs_gridinit(&ictxt, order, nprow, npcol); Cblacs_gridinfo(ictxt, &nprow, &npcol, &myrow, &mycol); if (DEBUG_MODE) { printf("ConTxt = %d\n", ictxt); printf("nprocs=%d, nprow=%d, npcol=%d\n", nprocs, nprow, npcol); printf("nprocs=%d, myrow=%d, mycol=%d\n", nprocs, myrow, mycol); } get_option(argc, argv, ifilename, ofilename, &num, &dim); // 0. cosinedist(ij) = 1 - V(i)V(j)/(Length(V(i))*Length(V(j))) // 1. calculate submatrix size int bsize = num / nprow; // blocking factor pnum = num / nprow; pdim = dim; if ( myrow < (num/bsize)%nprow) { pnum += bsize; } else if ( myrow == (num/bsize)%nprow) { pnum += (num % bsize); } else { } if(DEBUG_MODE) printf("myproc=%d: pnum=%d, pdim=%d, bsize=%d\n", myproc, pnum, pdim, bsize); int desc_input[9], desc_v[9], desc_ip[9], desc_n[9], desc_result[9]; descinit_(desc_input, &num, &dim, &num, &dim, &zero, &zero, &ictxt, &num, &info); descinit_(desc_v, &num, &dim, &bsize, &pdim, &zero, &zero, &ictxt, &pnum, &info); descinit_(desc_ip, &num, &num, &bsize, &num, &zero, &zero, &ictxt, &pnum, &info); descinit_(desc_n, &num, &one, &bsize, &one, &zero, &zero, &ictxt, &pnum, &info); descinit_(desc_result, &num, &num, &num, &num, &zero, &zero, &ictxt, &num, &info); // 2. read input data double* input; if (myproc == 0) { input = (double*)malloc(sizeof(double)*num*dim); memset(input, 0, sizeof(double)*num*dim); read_data(ifilename, num, dim, input); printArray("input", myproc, input, num, dim); } // 3. 
distribute input data array double* V = (double*)malloc(sizeof(double)*pnum*pdim); memset(V, 0, sizeof(double)*pnum*pdim); Cpdgemr2d(num, dim, input, 1, 1, desc_input, V, 1, 1, desc_v, ictxt); printArray("V", myproc, V, pnum, pdim); // 4. InnerProduct = VV' double* InnerProduct = (double*)malloc(sizeof(double)*pnum*num); memset(InnerProduct, 0, sizeof(double)*pnum*num); char transa = 'N', transb = 'T'; int m = num, n = num, k = dim; int lda = num, ldb = num, ldc = num; double alpha = 1.0f, beta = 0.0f; pdgemm_(&transa, &transb, &m, &n, &k, &alpha, V, &one, &one, desc_v, V, &one, &one, desc_v, &beta, InnerProduct, &one, &one, desc_ip); printArray("InnerProduct", myproc, InnerProduct, pnum, num); // 5. Norm of each vector double* Norm = (double*)malloc(sizeof(double)*pnum); for (int i = 0; i < pnum; i++) { int n = ((myproc*bsize)+(i/bsize)*(nprocs-1)*bsize+i)*pnum + i; Norm[i] = sqrt(InnerProduct[n]); } printArray("Norm", myproc, Norm, 1, pnum); // 6. Norm product matrix double* NormProduct = (double*)malloc(sizeof(double)*pnum*num); memset(NormProduct, 0, sizeof(double)*pnum*num); char uplo = 'U'; n = num; alpha = 1.0f; int incx = 1; lda = num; pdsyr_(&uplo, &n, &alpha, Norm, &one, &one, desc_n, &incx, NormProduct, &one, &one, desc_ip); printArray("NormProduct", myproc, NormProduct, pnum, num); // 7. CosineDistance(ij) = 1-InnerProduct(ij)/NormProduct(ij) double* CosineDistance = (double*)malloc(sizeof(double)*pnum*num); memset(CosineDistance, 0, sizeof(double)*pnum*num); for (int j = 0; j < num; j++) { for (int i = 0; i < pnum; i++) { int n = ((myproc*bsize)+i+(i/bsize)*(nprocs-1)*bsize)*pnum+i; int p = i+j*pnum; if (p<=n) { CosineDistance[p] = 0.0; } else { CosineDistance[p] = 1 - InnerProduct[p]/NormProduct[p]; } } } printArray("CosineDistance", myproc, CosineDistance, pnum, num); // 8. 
gather result double* result; if ( myproc == 0 ) { result = (double*)malloc(sizeof(double)*num*num); memset(result, 0, sizeof(double)*num*num); } Cpdgemr2d(num, num, CosineDistance, 1, 1, desc_ip, result, 1, 1, desc_result, ictxt); // 9. output to file if ( myproc == 0 ) { output_results(ofilename, result, num, num); } // a. cleanup memory free(V); free(InnerProduct); free(Norm); free(NormProduct); free(CosineDistance); if ( myproc == 0 ) { free(input); free(result); } blacs_exit_(&zero); return 0; }
/*
 * One timed, barrier-synchronised distributed multiply:
 *   C <- alpha*op(A)*op(B) + beta*C  with alpha=1, beta=0.
 * Barriers before and after ensure the measured interval covers only the
 * pdgemm_ call across all grid processes.
 * Returns the elapsed wall-clock time in seconds.
 */
static double timed_pdgemm(int ictxt, char *transa, char *transb, int nside,
                           double *A, int *descA, double *B, int *descB,
                           double *C, int *descC)
{
    struct timeval st, et;
    int one = 1;
    double alpha = 1.0, beta = 0.0;

    Cblacs_barrier(ictxt, "A");
    gettimeofday(&st, NULL);
    pdgemm_(transa, transb, &nside, &nside, &nside, &alpha,
            A, &one, &one, descA,
            B, &one, &one, descB,
            &beta, C, &one, &one, descC);
    Cblacs_barrier(ictxt, "A");
    gettimeofday(&et, NULL);
    return (double)((et.tv_sec - st.tv_sec) + (et.tv_usec - st.tv_usec) * 1e-6);
}

/*
 * pdgemm_ benchmark: times C = op(A)*op(B) for the four transpose
 * combinations (NN, NT, TN, TT) on an ngrid x ngrid BLACS grid with
 * random nside x nside matrices, and appends the GFlops-per-core numbers
 * to the output file on rank 0.
 *
 * argv: nside ngrid nblock nthread outfile
 * Returns 0 on success; exits with -3 on bad usage.
 *
 * BUG FIXES vs original: Cblacs_gridexit was passed the literal 0 instead
 * of the grid context; A/B/C were leaked; fopen result was unchecked;
 * missing explicit return.
 */
int main(int argc, char **argv) {
    int ictxt, nside, ngrid, nblock, nthread;
    int rank, size;
    int ic, ir, nc, nr;            // grid coordinates / extents from BLACS
    int i, j;
    char *fname;                   // output file for the benchmark figures
    int info, ZERO = 0;
    double dtnn, dtnt, dttn, dttt; // elapsed seconds per variant
    double gfpc_nn, gfpc_nt, gfpc_tn, gfpc_tt; // GFlops per core per variant

    /* Initialising MPI stuff */
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    printf("Process %i of %i.\n", rank, size);

    /* Parsing arguments */
    if(argc < 6) {
        exit(-3);
    }
    nside = atoi(argv[1]);
    ngrid = atoi(argv[2]);
    nblock = atoi(argv[3]);
    nthread = atoi(argv[4]);
    fname = argv[5];

    if(rank == 0) {
        printf("Multiplying matrices of size %i x %i\n", nside, nside);
        printf("Process grid size %i x %i\n", ngrid, ngrid);
        printf("Block size %i x %i\n", nblock, nblock);
        printf("Using %i OpenMP threads\n", nthread);
    }

#ifdef _OPENMP
    if(rank == 0) printf("Setting OMP_NUM_THREADS=%i\n", nthread);
    omp_set_num_threads(nthread);
#endif

    /* Setting up BLACS */
    Cblacs_pinfo( &rank, &size ) ;
    Cblacs_get(-1, 0, &ictxt );
    Cblacs_gridinit(&ictxt, "Row", ngrid, ngrid);
    Cblacs_gridinfo(ictxt, &nr, &nc, &ir, &ic);

    int descA[9], descB[9], descC[9];

    /* Fetch local array sizes */
    int Ar, Ac, Br, Bc, Cr, Cc;
    Ar = numroc_( &nside, &nblock, &ir, &ZERO, &nr);
    Ac = numroc_( &nside, &nblock, &ic, &ZERO, &nc);
    Br = numroc_( &nside, &nblock, &ir, &ZERO, &nr);
    Bc = numroc_( &nside, &nblock, &ic, &ZERO, &nc);
    Cr = numroc_( &nside, &nblock, &ir, &ZERO, &nr);
    Cc = numroc_( &nside, &nblock, &ic, &ZERO, &nc);
    printf("Local array section %i x %i\n", Ar, Ac);

    /* Set descriptors */
    descinit_(descA, &nside, &nside, &nblock, &nblock, &ZERO, &ZERO, &ictxt, &Ar, &info);
    descinit_(descB, &nside, &nside, &nblock, &nblock, &ZERO, &ZERO, &ictxt, &Br, &info);
    descinit_(descC, &nside, &nside, &nblock, &nblock, &ZERO, &ZERO, &ictxt, &Cr, &info);

    /* Initialise and fill arrays (column-major local storage) */
    double *A = (double *)malloc(Ar*Ac*sizeof(double));
    double *B = (double *)malloc(Br*Bc*sizeof(double));
    double *C = (double *)malloc(Cr*Cc*sizeof(double));
    for(i = 0; i < Ar; i++) {
        for(j = 0; j < Ac; j++) {
            A[j*Ar + i] = drand48();
            B[j*Br + i] = drand48();
            C[j*Cr + i] = 0.0;
        }
    }

    /* Benchmark the four transpose combinations.
       GFlops/core = 2*n^3 / (t * 1e9 * cores), cores = ngrid^2 * nthread. */
    if(rank == 0) printf("Starting multiplication (NN).\n");
    dtnn = timed_pdgemm(ictxt, "N", "N", nside, A, descA, B, descB, C, descC);
    gfpc_nn = 2.0*pow(nside, 3) / (dtnn * 1e9 * ngrid * ngrid * nthread);
    if(rank == 0) printf("Done.\n=========\nTime taken: %g s\nGFlops per core: %g\n=========\n", dtnn, gfpc_nn);

    if(rank == 0) printf("Starting multiplication (NT).\n");
    dtnt = timed_pdgemm(ictxt, "N", "T", nside, A, descA, B, descB, C, descC);
    gfpc_nt = 2.0*pow(nside, 3) / (dtnt * 1e9 * ngrid * ngrid * nthread);
    if(rank == 0) printf("Done.\n=========\nTime taken: %g s\nGFlops per core: %g\n=========\n", dtnt, gfpc_nt);

    if(rank == 0) printf("Starting multiplication (TN).\n");
    dttn = timed_pdgemm(ictxt, "T", "N", nside, A, descA, B, descB, C, descC);
    gfpc_tn = 2.0*pow(nside, 3) / (dttn * 1e9 * ngrid * ngrid * nthread);
    if(rank == 0) printf("Done.\n=========\nTime taken: %g s\nGFlops per core: %g\n=========\n", dttn, gfpc_tn);

    if(rank == 0) printf("Starting multiplication (TT).\n");
    dttt = timed_pdgemm(ictxt, "T", "T", nside, A, descA, B, descB, C, descC);
    gfpc_tt = 2.0*pow(nside, 3) / (dttt * 1e9 * ngrid * ngrid * nthread);
    if(rank == 0) printf("Done.\n=========\nTime taken: %g s\nGFlops per core: %g\n=========\n", dttt, gfpc_tt);

    /* Rank 0 records the results */
    if(rank == 0) {
        FILE * fd = fopen(fname, "w");
        if (fd == NULL) {
            fprintf(stderr, "Could not open output file %s\n", fname);
        } else {
            fprintf(fd, "%g %g %g %g %i %i %i %i %g %g %g %g\n",
                    gfpc_nn, gfpc_nt, gfpc_tn, gfpc_tt,
                    nside, ngrid, nblock, nthread,
                    dtnn, dtnt, dttn, dttt);
            fclose(fd);
        }
    }

    /* Release resources (A/B/C leaked in the original) */
    free(A);
    free(B);
    free(C);
    Cblacs_gridexit(ictxt); /* FIX: was Cblacs_gridexit(0) — must pass the context */
    MPI_Finalize();
    return 0;
}