int CRSF_tcrossprod(int dim[], int iMyRank) { int iMemSize; int localRowSizeOfA; int localColSizeOfA; int localColSizeOfB; int localRowSizeOfC; int localColSizeOfC; double *dpWork = NULL; int ipZero[] = { 0, 1, 2, 3 }; int NPRow = dim[6]; int NPCol = dim[7]; int MyRow = iMyRank / NPCol; int MyCol = iMyRank % NPCol; int rowOfA = dim[0]; int colOfA = dim[1]; int colOfB = rowOfA; int rowBlockSize = dim[4]; int colBlockSize = dim[5]; int iLLD_A = 0; int iLLD_C = 0; localRowSizeOfA = F77_CALL(numroc)(&rowOfA, &rowBlockSize, &MyRow, ipZero, &NPRow); localColSizeOfA = F77_CALL(numroc)(&colOfA, &colBlockSize, &MyCol, ipZero, &NPCol); localColSizeOfB = F77_CALL(numroc)(&colOfB, &colBlockSize, &MyCol, ipZero, &NPCol); localRowSizeOfC = localRowSizeOfA; localColSizeOfC = localColSizeOfB; if ( localColSizeOfA > 0 ) iLLD_A = max( 1, localRowSizeOfA ); else iLLD_A = 1; if ( localColSizeOfC > 0 ) iLLD_C = max( 1, localRowSizeOfC ); else iLLD_C = 1; /* I am calculating memory for only two matrices A & C(output). */ iMemSize = iLLD_A * localColSizeOfA + iLLD_C * localColSizeOfC + colBlockSize + 1; D_Rprintf (( "%d/%d: Mem size for me = %d\n", MyRow, MyCol, iMemSize )); dpWork = (double *) malloc(sizeof(double) * iMemSize); memset(dpWork, 0xcc, sizeof(double) * iMemSize); D_Rprintf (("%d: Allocated a work vector of size %d bytes\n", iMyRank, sizeof(double) * iMemSize)); F77_CALL(callpdgetcp)(dim, dpWork, &iMemSize); free(dpWork); return 0; }
/* **** CRSF_qr **** * This function is a C interface to the fortran implemented * scalapack driver function "callpdgeqrf" that performs * qr decomposition on the input data. */ int CRSF_qr(int dim[], int iMyRank) { int iMemSize = 0; double *dpWork = NULL; int ipZero[] = { 0, 1, 2, 3 }; int NPRow = dim[6]; int NPCol = dim[7]; int MyRow = iMyRank / NPCol; int MyCol = iMyRank % NPCol; int rowOfA = dim[0]; int colOfA = dim[1]; int rowBlockSize = dim[4]; int colBlockSize = dim[5]; /* Calculate required memory size */ int localRowSizeOfA = F77_CALL(numroc)(&rowOfA, &rowBlockSize, &MyRow, ipZero, &NPRow); int localColSizeOfA = F77_CALL(numroc)(&colOfA, &colBlockSize, &MyCol, ipZero, &NPCol); int localSizeOfA = localRowSizeOfA * localColSizeOfA; int tauParam = min (rowOfA, colOfA); int lTau = F77_CALL(numroc)(&tauParam, &colBlockSize, &MyCol, ipZero, &NPCol); int iZero = 0; int iA = 1; int jA = 1; int iROff = rowBlockSize; int iCOff = colBlockSize; int iARow = F77_CALL(indxg2p)(&iA, &rowBlockSize, &MyRow,&iZero, &NPRow); int iACol = F77_CALL(indxg2p)(&jA, &colBlockSize, &MyCol,&iZero, &NPCol); int mp0Param = rowOfA + iROff; int mP0 = F77_CALL(numroc)(&mp0Param, &rowBlockSize, &MyRow, &iARow, &NPRow); int nQ0Param = colOfA + iCOff; int nQ0 = F77_CALL(numroc)(&nQ0Param, &colBlockSize, &MyCol, &iACol, &NPCol); int lWork = colBlockSize * (mP0 + nQ0 + colBlockSize); iMemSize = localSizeOfA + lTau + lWork; dpWork = (double *) malloc(sizeof(double) * iMemSize); memset(dpWork, 0xcc, sizeof(double) * iMemSize); D_Rprintf (("After allocating memory .. \n ")); F77_CALL(callpdgeqrf)(dim, dpWork, &iMemSize); D_Rprintf (("AFTER FORTRAN FUNCTION EXECUTION \n ")); free (dpWork); return 0; }
/* **** CRSF_solve ****
 * C interface to the Fortran-implemented ScaLAPACK driver "callpdgesv"
 * that solves the equations A * X = B for X.  This routine sizes the work
 * buffer (local A, local B, pivot vector, small workspace), allocates it,
 * and calls the Fortran driver.
 * Returns 0 on success, non-zero on allocation failure.
 */
int CRSF_solve(int dim[], int iMyRank) {
    int iMemSize = 0;
    double *dpWork = NULL;
    int ipZero[] = { 0, 1, 2, 3 };   /* only ipZero[0] (source process 0) is consumed by numroc */
    int NPRow = dim[6];
    int NPCol = dim[7];
    int MyRow = iMyRank / NPCol;
    int MyCol = iMyRank % NPCol;
    int rowOfA = dim[0];
    int colOfA = dim[1];
    int rowOfB = dim[2];
    int colOfB = dim[3];
    int rowBlockSize = dim[4];
    int colBlockSize = dim[5];

    /* Calculate the MemSize. */
    int localRowSizeOfA = F77_CALL(numroc)(&rowOfA, &rowBlockSize, &MyRow, ipZero, &NPRow);
    int localColSizeOfA = F77_CALL(numroc)(&colOfA, &colBlockSize, &MyCol, ipZero, &NPCol);
    int localRowSizeOfB = F77_CALL(numroc)(&rowOfB, &rowBlockSize, &MyRow, ipZero, &NPRow);
    int localColSizeOfB = F77_CALL(numroc)(&colOfB, &colBlockSize, &MyCol, ipZero, &NPCol);
    int localSizeOfA = localRowSizeOfA * localColSizeOfA;
    int localSizeOfB = localRowSizeOfB * localColSizeOfB;
    int localSizeOfPiv = localRowSizeOfA + rowBlockSize;   /* pivot vector storage */
    int localWorkSize = colBlockSize;

    D_Rprintf(("Child: inside CRSF_solve \n "));
    iMemSize = localSizeOfA + localSizeOfB + localSizeOfPiv + localWorkSize;

    dpWork = (double *) malloc(sizeof(double) * iMemSize);
    if (dpWork == NULL) {
        /* FIX: the original memset a failed allocation (undefined behaviour). */
        Rprintf("%d: CRSF_solve: failed to allocate %d doubles\n", iMyRank, iMemSize);
        return -1;
    }
    memset(dpWork, 0xcc, sizeof(double) * iMemSize);
    D_Rprintf(("Child: After allocating memory .. \n "));

    F77_CALL(callpdgesv)(dim, dpWork, &iMemSize);
    D_Rprintf(("Child: AFTER FORTRAN FUNCTION EXECUTION \n "));

    free(dpWork);
    return 0;
}
/* **** PA_Init **** * Check to see if MPI is initialized, and if it is not, then initialize it. * Returns 0 for success or non-zero on error. */ int PA_Init() { int flag; #if OMPI dlopen("libmpi.so.0", RTLD_GLOBAL | RTLD_LAZY); #endif if( MPI_Initialized(&flag) != MPI_SUCCESS){ Rprintf("ERROR[1]: Failed in call MPI_Initialized \n"); return 1; } if (flag){ D_Rprintf((" MPI has already been Initialized \n")); return 0; } else { D_Rprintf(("PA: ***************** Initializing MPI\n")); MPI_Init(NULL, NULL); MPI_Comm_set_errhandler(MPI_COMM_WORLD, MPI_ERRORS_RETURN); return 0; } }
/* **** CR_CallScalapackFn ****
 * Dispatch on the function ID (ipGridAndDims[8]) and invoke the matching
 * C wrapper around the Fortran ScaLAPACK driver.  ID 0 is grid management
 * (sla.gridInit / sla.gridExit) and needs no computation here.
 * Returns 0 on success, -1 on an unknown function ID.
 */
int CR_CallScalapackFn (int *ipGridAndDims, int iMyRank) {
    int iFunction = ipGridAndDims[8];

    if (iFunction == 0) {
        /* sla.gridInit, sla.gridExit, etc. -- nothing to compute. */
    } else if (iFunction == 1) {          /* Solve */
        CRSF_solve (ipGridAndDims, iMyRank);
    } else if (iFunction == 2) {          /* SVD */
        CRSF_svd (ipGridAndDims, iMyRank);
    } else if (iFunction == 3) {          /* QR Decomposition */
        CRSF_qr(ipGridAndDims, iMyRank);
    } else if (iFunction == 4) {          /* Choleski Factorization */
        CRSF_chol(ipGridAndDims, iMyRank);
    } else if (iFunction == 5) {          /* Invert from Choleski decomposition */
        CRSF_chol2inv(ipGridAndDims, iMyRank);
    } else if (iFunction == 6) {          /* Eigen Value */
        CRSF_eigen(ipGridAndDims, iMyRank);
    } else if (iFunction == 7) {          /* Matrix Multiplication */
        CRSF_multiply(ipGridAndDims, iMyRank);
    } else if (iFunction == 8) {          /* tcrossprod */
        D_Rprintf(("Got every thing, Need to run multiplication alg.\n"));
        CRSF_tcrossprod(ipGridAndDims, iMyRank);
    } else {
        Rprintf("%d:%s:%d: ASSERT ERROR: Reached unreachable default in switch statement!\n",
                iMyRank, __FILE__, __LINE__);
        return -1;
    }

    return 0;
}
/* **** CRSF_chol2inv **** * This function is a C interface to the fortran implemented * scalapack driver function "callpdpotri" that performs * inverting a matrix from its Choleski Factorization */ int CRSF_chol2inv(int dim[], int iMyRank) { int iMemSize = 0; double *dpWork = NULL; int ipZero[] = { 0, 1, 2, 3 }; int NPRow = dim[6]; int NPCol = dim[7]; int MyRow = iMyRank / NPCol; int MyCol = iMyRank % NPCol; int rowOfA = dim[0]; int colOfA = dim[1]; int rowBlockSize = dim[4]; int colBlockSize = dim[5]; /* Calculate required memory size */ int localRowSizeOfA = F77_CALL(numroc)(&rowOfA, &rowBlockSize, &MyRow, ipZero, &NPRow); int localColSizeOfA = F77_CALL(numroc)(&colOfA, &colBlockSize, &MyCol, ipZero, &NPCol); int localSizeOfA = localRowSizeOfA * localColSizeOfA; int workSpace = max (rowBlockSize, colBlockSize); iMemSize = localSizeOfA + workSpace; dpWork = (double *) malloc(sizeof(double) * iMemSize); memset(dpWork, 0xcc, sizeof(double) * iMemSize); D_Rprintf (("After allocating memory .. \n ")); F77_CALL(callpdpotri)(dim, dpWork, &iMemSize); D_Rprintf (("AFTER FORTRAN FUNCTION EXECUTION \n ")); free (dpWork); return 0; }
/* **** PA_SendData **** * This function sends the parameters to child 0 and then distributes the * data to all of the child processes. The data is distributed in the * 2D block/cyclic pattern required by the ScaLAPACK library. */ int PA_SendData(int ipDims[], double dpA[], double dpB[]) { int iFunction; MPI_Comm intercomm; iFunction = ipDims[8]; /* Broadcast the Data Dimension to the child Processes */ PA_ErrorHandler(MPI_Intercomm_merge(childComm, 0, &intercomm)); PA_ErrorHandler(MPI_Bcast(ipDims,10,MPI_INT, 0, intercomm)); /* If the function was sla.gridInit or sla.gridExit, then there is * no data to distribute. */ if (iFunction == 0) { D_Rprintf(("PA: iFunction = 0, Just before returning\n")); return 0; } else { /* Otherwise, distribute data as usual. */ /* Check if ready to run */ if ( PA_CheckFaultPriorRun () != 0){ printf(" Memory related problems in one/all of Spawned Processes \n"); printf(" Report the bug to: [email protected] \n"); return -1; } /* Distribute the first matrix*/ PAdistData (dpA,ipDims, ipDims[0], ipDims[1]); if (ipDims[2] != 0 && ipDims[8] != 2){ /* Distribute the Second matrix*/ PAdistData (dpB,ipDims, ipDims[2], ipDims[3]); } return 0; } }
/* **** CR_GetInputParams ****
 * Receive the input data dimensions and process-grid specification from
 * the parent as a broadcast of 10 ints:
 *   [0] rows of A        [1] cols of A
 *   [2] rows of B        [3] cols of B
 *   [4] MB row block     [5] NB col block
 *   [6] NPROW            [7] NPCOL
 *   [8] function id      [9] release flag
 * Returns 0 on success, non-zero on failure.
 */
int CR_GetInputParams(MPI_Comm mcParent, int *ipGridAndDims) {
    MPI_Comm parent;

    /* NB: the parent communicator is re-fetched here; the mcParent argument
     * itself is not consulted. */
    if (MPI_Comm_get_parent(&parent) != MPI_SUCCESS) {
        Rprintf("ERROR[2]: Getting Parent Comm ... FAILED .. EXITING !!\n");
        return AsInt(2);
    }

    /* Merge with the parent (children go in the high group), then receive
     * the broadcast rooted at the parent. */
    if (MPI_Intercomm_merge(parent, 1, &intercomm) != MPI_SUCCESS)
        return -1;

    if (MPI_Bcast(ipGridAndDims, 10, MPI_INT, 0, intercomm) != MPI_SUCCESS) {
        D_Rprintf(("Child: Broadcast error\n"));
        return -2;
    }
    return 0;
}
/* **** PA_Exec ****
 * The ParallelAgent's equivalent of "main".  This function spawns the
 * child processes, sends them the data, and gets back the results.
 * Returns an R object with the results, or R_NilValue on error.
 */
SEXP PA_Exec(SEXP scriptLocn, SEXP sxInputVector) {
    int iFunction;
    int iSpawnFlag = 1;
    int iNumProcs;
    int ipDims[10]= { 0,0,0,0,0,0,0,0,0,0 };
    double *dpA = NULL;
    double *dpB = NULL;
    int iStatus;
    int returnValue;
    SEXP sRet;

#ifndef DONT_SPAWN_R
    /* Spawn a full R in BATCH mode running the child script.
     * NOTE(review): in this build cpProgram points at a string literal, yet
     * the error paths below call free(cpProgram) -- undefined behaviour.
     * Confirm whether this branch is ever built. */
    char *cpProgram = "R"; /* Program to call; Usually "R" */
    char *child_args[] = { "BATCH", "--no-save", CHAR(STRING_ELT((scriptLocn),0)), "abc.out", NULL };
#else
    /* Spawn the CRDriver binary directly (preferred/current method); the
     * original source documents three other historical spawn strategies
     * (R script, dfCRDriver.sh wrapper, sh with LD_LIBRARY_PATH). */
    char *cpProgram; /* =R_PACKAGE_DIR"/exec/dfCRDriver.sh"; */
    char *child_args[] = { NULL, NULL };
    int iLength;

    /* The scriptLocn (script location) variable contains the path to the
     * executable directory (followed by the script name).  Extract the
     * path, and use it for the executable's path. */
    cpProgram = (char *) (CHAR(STRING_ELT((scriptLocn), 0)));
    iLength = strrchr(cpProgram, '/') - cpProgram;   /* NOTE(review): UB if no '/' present -- strrchr returns NULL */
    if (iLength < 0) {
        Rprintf("Path to script is not complete. "
                "Unable to continue.\n");
        return R_NilValue;
    }
    cpProgram = (char *) malloc(sizeof(char) * (iLength + 12));   /* iLength + strlen("/CRDriver") + NUL + slack */
    if (cpProgram == NULL) {
        /* NOTE(review): %d with a size_t argument -- should be %lu/%zu. */
        Rprintf("Memory allocation (%d bytes) failed!\n", sizeof(char) * (iLength + 12));
        return R_NilValue;
    }
    *(cpProgram) = '\0';
    strncat(cpProgram, CHAR(STRING_ELT((scriptLocn),0)), iLength);
    strncat(cpProgram, "/CRDriver", 10);
    D_Rprintf(("Child process: \"%s\" \"%s\" \"%s\"\n", cpProgram, child_args[0], child_args[1]));
#endif /* Endof If DONT_SPAWN_R is defined */

    /* Begin by unpacking the input vector into all of the separate
     * variables that get their values from it. */
    if (PA_UnpackInput(sxInputVector, ipDims, &dpA, &dpB, &iNumProcs, &iFunction, &iSpawnFlag) != 0) {
        free(cpProgram);
        return R_NilValue;
    }

#if 0 // Guru
    int ig;
    for ( ig = 0; ig < 10; ig++ ) {
        fprintf(stdout, "ipDims[%d] = %d\n", ig, ipDims[ig] );fflush(stdout);
    }
#endif

    /* Initialize MPI (if it is already initialized, it won't be
     * initialized again). */
    if (PA_Init() != 0){
        Rprintf(" ERROR[1]: Failed while intializing MPI \n");
        free(cpProgram);
        return R_NilValue;
    }

    /* Grid-state sanity checks.
     * NOTE(review): cpProgram is not freed on either of these error paths
     * (leak in the DONT_SPAWN_R build). */
    if (iSpawnFlag != 0 && iGlobalNumChildren != 0) {
        Rprintf(" Error: Attempt to spawn a new grid without releasing the previous grid.\n");
        return R_NilValue;
    }
    if(iSpawnFlag == 0 && iGlobalNumChildren == 0){
        Rprintf(" Error: Process Grid not present and Spawn option is set FALSE \n");
        return R_NilValue;
    }

    /* NOTE(review): ipErrcodes is never Free'd -- leaks on every call. */
    int *ipErrcodes = (int *)Calloc(iNumProcs, int);

    /* Begin: Spawn the child processes */
    if ( iSpawnFlag != 0 ) {
        D_Rprintf(("PA: Preparing to spawn %d child processes.\n", iNumProcs));
        fflush(stdout);
        iStatus = MPI_Comm_spawn(cpProgram, child_args, iNumProcs, MPI_INFO_NULL, 0, MPI_COMM_WORLD, &childComm, ipErrcodes);
        free(cpProgram);   /* NOTE(review): UB when cpProgram is the "R" string literal (non-DONT_SPAWN_R build) */
        if (iStatus != MPI_SUCCESS) {
            Rprintf(" ERROR: Failed to spawn (%d) child processes.\n", iNumProcs);
            return R_NilValue;
        }
        D_Rprintf(("SPAWNING SUCCESSFUL\n"));
        fflush(stdout);
        /* End: Spawn the child processes; record the live grid shape. */
        iGlobalNumChildren = iNumProcs;
        iNprows = ipDims[6];
        iNpcols = ipDims[7];
    }

    /* SPECIAL for SVD: the child process needs the nu,nv parameters, which
     * travel in the (otherwise unused) B-dimension slots. */
    if (iFunction == 2) {
        ipDims[2] = (int) dpB[0];
        ipDims[3] = (int) dpB[1];
    }

    /* DATA DISTRIBUTION: the data is distributed by the PA to all of the
     * child processes. */
    if ((returnValue = PA_SendData(ipDims, dpA, dpB)) == 0) {
        D_Rprintf(("PA: DATA SENT TO CHILD PROCESSES.\n"));
        fflush(stdout);
    } else {
        /* The send data failed; drop the grid bookkeeping. */
        Rprintf("ERROR [1] : DATA COULD NOT BE SENT TO CHILD PROCESSES.\n");
        iGlobalNumChildren = 0;
        iNprows = 0;
        iNpcols = 0;
        return R_NilValue;
    }
    D_Rprintf(("PA: back to exec.\n"));
    fflush(stdout);

    /* If the release flag == 1, then the grid will be (or was) released. */
    if (ipDims[9] == 1) {
        iGlobalNumChildren = 0;
        iNprows = 0;
        iNpcols = 0;
    }

    /* If the function is sla.gridInit or sla.gridExit, just return. */
    if (iFunction == 0) {
        // MPI_Comm_free(&intercomm);
        return R_NilValue;
    }

    /* GET BACK THE RESULT */
    sRet = PA_RecvResult(ipDims);
    return sRet;
}
/* **** PA_RecvResult **** * After the computations are done, receive the result back from the children * processes. The parameters of the return values are sent by child 0, and * the data is collected (from the 2D block/cyclic distribution) from all * child processes. */ SEXP PA_RecvResult(int ipDims[]) { int iStatus; int iNumMatrices, iWorksize; int ipOutDims[3]; SEXP sxRet = R_NilValue; SEXP sxTmp; int i; iWorksize = ipDims[5]; D_Rprintf(("Preparing to receive .... ")); /* First, get the number of matrices in the return object. */ iStatus = MPI_Recv(&iNumMatrices,1,MPI_INT,0,NUMMATRICES_TAG,childComm,MPINULL); if (iStatus != MPI_SUCCESS) { PA_ErrorHandler(iStatus); return R_NilValue; } D_Rprintf(("%d matrices\n", iNumMatrices)); if (iNumMatrices == 0){ return R_NilValue; } /* Setup the R vector to hold the R matrix objects */ PROTECT( sxRet = allocVector(VECSXP, iNumMatrices) ); /* Main loop */ for (i = 0; i < iNumMatrices; i++) { /* Get the dimensions of matrix i */ iStatus = MPI_Recv(ipOutDims, 3, MPI_INT, 0, RECVRESULT_TAG+i, childComm, MPINULL); if (iStatus != MPI_SUCCESS) { PA_ErrorHandler(iStatus); UNPROTECT( 1 ); return R_NilValue; } D_Rprintf(("Dimensions of matrix %d: [%d, %d], tag=%d\n", i, ipOutDims[1], ipOutDims[2], ipOutDims[0])); /* If the matrix is empty (size zero), handle the case seperately. * SVD can return empty matrices. */ if (ipOutDims[1] == 0 || ipOutDims[2] == 0) { sxTmp = coerceVector(R_NilValue, NILSXP); SET_VECTOR_ELT(sxRet, i, sxTmp); continue; } /* If the matrix is actually a vector, handle that case. 
*/ if (ipOutDims[1] == 0) ipOutDims[1] = 1; if (ipOutDims[2] == 0) ipOutDims[2] = 1; /* Make an R array to hold the matrix */ PROTECT( sxTmp = allocVector(REALSXP, ipOutDims[1]*ipOutDims[2]) ); D_Rprintf(("Preparing to receive...")); /* Get matrix i */ /* ipOutDims[0] is a flag saying whether to receive the matrix via * a normal MPI call or via the PACollectData function */ if (ipOutDims[0] == 1) { iStatus = MPI_Recv(REAL(sxTmp), ipOutDims[1]*ipOutDims[2], MPI_DOUBLE, 0, RECVVECORMAT_TAG+i, childComm, MPINULL); if (iStatus != MPI_SUCCESS) { PA_ErrorHandler(iStatus); return R_NilValue; } } else { /*F77_CALL(pacollectdata)(REAL(sxTmp), ipOutDims+1, ipOutDims+2, ipDims, &iWorksize);*/ PAcollectData(REAL(sxTmp), ipDims, ipOutDims[1], ipOutDims[2]); } /* Set the dimensions of the matrix */ PA_SetDim(sxTmp, 2, ipOutDims+1); /* Save the matrix into the vector */ SET_VECTOR_ELT(sxRet, i, sxTmp); UNPROTECT( 1 ); } UNPROTECT( 1 ); return sxRet; }
/* **** PA_UnpackInput ****
 * The input parameters/data to the ParallelAgent arrive as an R list
 * (generic vector).  This function unpacks the list into the individual
 * output variables:
 *   [0] matrix A (doubles)           -> *dppA, ipDims[0..1]
 *   [1] matrix B (doubles)           -> *dppB, ipDims[2..3]
 *   [2] number of process rows (int) -> ipDims[6] = NPROW
 *   [3] number of process cols (int) -> ipDims[7] = NPCOL
 *   [4] block size (int)             -> ipDims[4] = ipDims[5] = MB
 *   [5] function id (int)            -> ipDims[8], *ipFunction
 *   [6] release flag (int)           -> ipDims[9]
 *   [7] spawn flag (int)             -> *ipSpawnFlag
 * Returns 0 on success, a negative code on validation failure.
 */
int PA_UnpackInput(SEXP sxInputVector, int *ipDims, double **dppA, double **dppB, int *ipNumProcs, int *ipFunction, int *ipSpawnFlag) {
    SEXP s;
    int iMB;
    int ipReleaseFlag;

    /* First parameter is the first matrix -- populates ipDims[0..1].
     * FIX: VECTOR_PTR is defunct in modern R; use VECTOR_ELT throughout. */
    s = VECTOR_ELT(sxInputVector, 0);
    if (TYPEOF(s) != REALSXP) {
        Rprintf("1st parameter (Matrix A) is not an array of doubles.\n");
        return -1;
    }
    if (PA_GetTwoDims(s, ipDims) > 2) {
        Rprintf("1st parameter (Matrix A) has too many dimensions.\n");
        return -2;
    }
    /* If the object is one dimensional, set the second dimension to length 1. */
    if (ipDims[1] == 0) ipDims[1] = 1;
    /* Save a pointer to the first matrix. */
    *dppA = REAL(s);

    /* Second parameter is the second matrix -- populates ipDims[2..3]. */
    s = VECTOR_ELT(sxInputVector, 1);
    if (TYPEOF(s) != REALSXP) {
        Rprintf("2nd parameter (Matrix B) is not an array of doubles.\n");
        return -3;
    }
    if (PA_GetTwoDims(s, ipDims + 2) > 2) {
        Rprintf("2nd parameter (Matrix B) has too many dimensions.\n");
        return -4;
    }
    /* One-dimensional -> second dimension 1, unless the length is zero
     * (i.e. there is no second matrix). */
    if (ipDims[3] == 0 && LENGTH(s) != 0) ipDims[3] = 1;
    /* Save a pointer to the second matrix. */
    *dppB = REAL(s);

    /* Third parameter is the number of process rows -- ipDims[6] = NPROWS. */
    s = VECTOR_ELT(sxInputVector, 2);
    if (TYPEOF(s) != INTSXP) {
        Rprintf("Third parameter (number of row processors) is not an integer.\n");
        return -5;
    }
    if (LENGTH(s) != 1) {
        /* FIX: message previously said "First parameter". */
        Rprintf("Third parameter (number of row processors) is not a single number.\n");
        return -6;
    }
    ipDims[6] = INTEGER(s)[0];

    /* Fourth parameter is the number of process cols -- ipDims[7] = NPCOLS. */
    s = VECTOR_ELT(sxInputVector, 3);
    if (TYPEOF(s) != INTSXP) {
        Rprintf("Fourth parameter (number of col processors) is not an integer.\n");
        return -7;
    }
    if (LENGTH(s) != 1) {
        Rprintf("Fourth parameter (number of col processors) is not a single number.\n");
        return -8;
    }
    ipDims[7] = INTEGER(s)[0];
    *ipNumProcs = ipDims[6] * ipDims[7]; /* iNumProcs = iNPRows * iNPCols */

    /* Fifth parameter is the block size -- ipDims[4] = ipDims[5] = MB. */
    s = VECTOR_ELT(sxInputVector, 4);
    if (TYPEOF(s) != INTSXP) {
        Rprintf("Fifth parameter (row block size of LHS matrix) is not an integer.\n");
        return -9;
    }
    if (LENGTH(s) != 1) {
        Rprintf("Fifth parameter (row block size of LHS matrix) is not a single number.\n");
        return -10;
    }
    iMB = INTEGER(s)[0];
    /* If the block size is larger than every input dimension, reduce it to
     * the largest input dimension.  Needed for the eigen function. */
    if (iMB > ipDims[0] && iMB > ipDims[1] && iMB > ipDims[2] && iMB > ipDims[3])
        iMB = max(ipDims[0], max(ipDims[1], max(ipDims[2], ipDims[3])));
    /* Once upon a time, the block dimensions were taken as two inputs.
     * Now the block is forced to be square. */
    ipDims[4] = ipDims[5] = iMB;

    /* Sixth parameter is the function identifier -- ipDims[8]. */
    s = VECTOR_ELT(sxInputVector, 5);
    if (TYPEOF(s) != INTSXP) {
        Rprintf("Sixth parameter (function) is not an integer.\n");
        return -11;
    }
    if (LENGTH(s) != 1) {
        Rprintf("Sixth parameter (function) is not a single number.\n");
        return -12;
    }
    *ipFunction = INTEGER(s)[0];
    /* FIX: the valid range is 0..8 -- id 8 (tcrossprod) is dispatched in
     * CR_CallScalapackFn but was rejected here by the old "> 7" check. */
    if (*ipFunction < 0 || *ipFunction > 8) {
        Rprintf("Error: Unknown function ID (%d).\n", *ipFunction);
        return -13;
    }
    ipDims[8]= *ipFunction;

    /* Seventh parameter: instruction to release the grid or not --
     * ipDims[9] = ReleaseFlag.  Populating ipDims is complete after this. */
    s = VECTOR_ELT(sxInputVector, 6);
    if (TYPEOF(s) != INTSXP) {
        /* FIX: message previously said "(function)". */
        Rprintf("Seventh parameter (release flag) is not an integer.\n");
        return -11;
    }
    ipReleaseFlag = INTEGER(s)[0];
    if(ipReleaseFlag != 0 && ipReleaseFlag != 1) {
        Rprintf ("Warning: Proper value for Release Flag= %d not used \n \t Release Flag is set to 1 \n", ipReleaseFlag );
        ipReleaseFlag = 1;
    }
    ipDims[9]= ipReleaseFlag;

    /* Eighth parameter: instruction to spawn a new grid or reuse the
     * existing processes. */
    s = VECTOR_ELT(sxInputVector, 7);
    if (TYPEOF(s) != INTSXP) {
        /* FIX: message previously said "Sixth parameter (function)". */
        Rprintf("Eighth parameter (spawn flag) is not an integer.\n");
        return -11;
    }
    *ipSpawnFlag = INTEGER(s)[0];
    D_Rprintf(("spawn flag : %d \n", *ipSpawnFlag));

    return 0;
}
/* **** CRSF_svd ****
 * This function is a C interface to the fortran implemented scalapack
 * driver function "callpdgesvd" that performs singular value
 * decomposition.  Rank 0 computes the required workspace size and
 * broadcasts it; every rank then allocates the work buffer (local A +
 * singular values + local U + local VT + workspace) and calls the driver.
 */
int CRSF_svd(int dim[], int iMyRank) {
    int iMemSize = 0;
    double *dpWork = NULL;
    int ipZero[] = { 0, 1, 2, 3 };   /* only ipZero[0] (source process 0) is consumed by numroc */
    int NPRow = dim[6];
    int NPCol = dim[7];
    int MyRow = iMyRank / NPCol;
    int MyCol = iMyRank % NPCol;
    int rowOfA = dim[0];
    int colOfA = dim[1];
    int rowBlockSize = dim[4];
    int colBlockSize = dim[5];
    /* Calculate the required mem size */
    int localRowSizeOfA = 0;
    int localColSizeOfA = 0;
    int localSizeOfA = 0;
    int singularVectorSize = 0;
    int localSizeOfU = 0;
    int localSizeOfVT = 0;
    int localWorkSpaceSize = 0;

    D_Rprintf ((" In CRSF_svd func .. 1 \n"));

    /* Local block sizes of the distributed input matrix A. */
    localRowSizeOfA = F77_CALL(numroc)(&rowOfA, &rowBlockSize, &MyRow, ipZero, &NPRow);
    localColSizeOfA = F77_CALL(numroc)(&colOfA, &colBlockSize, &MyCol, ipZero, &NPCol);
    localSizeOfA = localRowSizeOfA * localColSizeOfA;
    singularVectorSize = colOfA;
    localSizeOfU = localRowSizeOfA * localRowSizeOfA ;
    localSizeOfVT = localColSizeOfA * localColSizeOfA ;

    if (iMyRank == 0) {
        /* Calculate workspace size on rank 0 and Broadcast it below. */
        int wATOBD =0;
        int ictxt;
        int mp0 = 0, nq0 = 0;
        int temp1=-1, temp2=0;
        int lldA = max (1, localRowSizeOfA);
        int wBDTOSVD;
        int size = min (rowOfA, colOfA);
        int sizeP = F77_CALL(numroc)(&size, &rowBlockSize, &MyRow, ipZero, &NPRow);
        /* NOTE(review): last argument is &NPRow, but for a column-distributed
         * quantity &NPCol would be expected -- verify against the PDGESVD
         * workspace documentation. */
        int sizeQ = F77_CALL(numroc)(&size, &colBlockSize, &MyCol, ipZero, &NPRow);
        int wANTU = 1;
        int NRU = localRowSizeOfA;
        int wANTVT = 1;
        int NCVT = localColSizeOfA;
        int wBDSQR;
        int wPDORMBRQLN;
        int wPDORMBRPRT;

        D_Rprintf ((" Process 0 ... in cntrl statement ..after init \n"));
#if MPICH
        F77_CALL(blacs_get)(&temp1, &temp2, &ictxt);
#else
        /* Both branches are identical; the #if MPICH split appears historical. */
        F77_CALL(blacs_get)(&temp1, &temp2, &ictxt);
#endif
        /* Calculate the value for WATOBD.
         * NOTE(review): these two numroc calls pass &ictxt and &lldA in the
         * ISRCPROC argument position, and nq0 uses rowOfA with NPRow where
         * colOfA/NPCol would be expected -- this looks like an argument
         * mix-up; verify against the PDGESVD LWORK formula before changing. */
        D_Rprintf ((" Process 0 ... in cntrl statement ..1 \n"));
        mp0 = F77_CALL(numroc)(&rowOfA, &rowBlockSize, &MyRow, &ictxt , &NPRow);
        nq0 = F77_CALL(numroc)(&rowOfA, &colBlockSize, &MyCol, &lldA , &NPRow);
        wATOBD = rowBlockSize * (mp0 + nq0 + 1) + nq0 ;
        D_Rprintf ((" Process 0 ... in cntrl statement ..2 \n"));

        /* Calculate the value for WBDTOSVD. */
        wBDSQR = max (1, 2*size + (2*size - 4) * max (wANTU, wANTVT)) ;
        wPDORMBRQLN = max (colBlockSize *(colBlockSize -1)/2 , (sizeQ + mp0)*colBlockSize) + colBlockSize * colBlockSize;
        wPDORMBRPRT= max (rowBlockSize *(rowBlockSize -1)/2 , (sizeP + nq0)*rowBlockSize) + rowBlockSize * rowBlockSize;
        wBDTOSVD = size * (wANTU * NRU + wANTVT * NCVT) + max (wBDSQR,max (wANTU * wPDORMBRQLN , wANTVT * wPDORMBRPRT));
        localWorkSpaceSize = 2 + 6* max( rowOfA, colOfA) + max (wATOBD, wBDTOSVD);
        D_Rprintf ((" Process 0 ... in cntrl statement ..3 \n"));
    }
    D_Rprintf ((" after cntrl statement ..rank = %d \n", iMyRank));

    /* Every rank learns the workspace size computed on rank 0. */
    if (MPI_Bcast (&localWorkSpaceSize, 1, MPI_INT, 0, MPI_COMM_WORLD) != MPI_SUCCESS){
        Rprintf ("SVD: Failed during MPI_Bcast call ... Rank: %d Exiting \n", iMyRank);
        /* NOTE(review): exit() in a spawned MPI child bypasses MPI_Abort /
         * MPI_Finalize -- confirm this is the intended failure mode. */
        exit (-1);
    }

    iMemSize = localSizeOfA + singularVectorSize + localSizeOfU + localSizeOfVT + localWorkSpaceSize;
    D_Rprintf ((" After Broadcasting iMemSize = %d .. localWspace = %d ... rank = %d \n", iMemSize,localWorkSpaceSize, iMyRank));

    /* NOTE(review): the malloc result is not checked before memset. */
    dpWork = (double *) malloc(sizeof(double) * iMemSize);
    memset(dpWork, 0xcc, sizeof(double) * iMemSize);
    D_Rprintf (("After allocating memory .. \n "));

    F77_CALL(callpdgesvd)(dim, dpWork, &iMemSize);
    D_Rprintf (("AFTER FORTRAN FUNCTION EXECUTION \n "));

    free(dpWork);
    return 0;
}
/* **** CRSF_eigen **** * This function is a C interface to the fortran implemented * scalapack driver function "callpdsyevd" that performs * eigen value decomposition or Spectral decomposition */ int CRSF_eigen(int dim[], int iMyRank) { int iMemSize = 0; double *dpWork = NULL; int ipZero[] = { 0, 1, 2, 3 }; int NPRow = dim[6]; int NPCol = dim[7]; int MyRow = iMyRank / NPCol; int MyCol = iMyRank % NPCol; int rowOfA = dim[0]; int colOfA = dim[1]; int rowBlockSize = dim[4]; int colBlockSize = dim[5]; /* Calculate required memory size */ int sizeOfLWORK; int tRILWMIN; int np, nq; int localRowSizeOfA = F77_CALL(numroc)(&rowOfA, &rowBlockSize, &MyRow, ipZero, &NPRow); int localColSizeOfA = F77_CALL(numroc)(&colOfA, &colBlockSize, &MyCol, ipZero, &NPCol); int localSizeOfA = localRowSizeOfA * localColSizeOfA; int localSizeOfZ = localRowSizeOfA * localColSizeOfA; int localSizeOfW = colOfA; int sizeOfLIWORK = 7 * colOfA + 8 * NPCol +2; np = F77_CALL(numroc)(&colOfA, &colBlockSize, &MyRow, ipZero, &NPRow); nq = F77_CALL(numroc)(&colOfA, &colBlockSize, &MyCol, ipZero, &NPCol); tRILWMIN = 3 * colOfA + max ( colBlockSize * (np +1) , 3 * colBlockSize); sizeOfLWORK = max (1 + 6 * colOfA + 2*np*nq, tRILWMIN ) + 2 * colOfA; #if 0 fprintf(stdout, "%d: localSizeOfA = %d, localSizeOfZ=%d, localSizeOfW = %d, sizeOfLIWORK=%d\n", iMyRank, localSizeOfA, localSizeOfZ, localSizeOfW, sizeOfLIWORK );fflush(stdout); #endif iMemSize = localSizeOfA + localSizeOfZ + localSizeOfW + sizeOfLIWORK + sizeOfLWORK ; dpWork = (double *) malloc(sizeof(double) * iMemSize); if (dpWork == NULL) { fprintf(stdout, "%s:%d - Error allocating memory.\n", __FILE__, __LINE__ ); fflush(stdout); } memset(dpWork, 0xcc, sizeof(double) * iMemSize); D_Rprintf (("After allocating memory .. \n ")); F77_CALL(callpdsyevd)(dim, dpWork, &iMemSize); D_Rprintf (("AFTER FORTRAN FUNCTION EXECUTION \n ")); free (dpWork); return 0; }