int conpar_mpi_wrapper(integer *nov, integer *na, integer *nra, integer *nca, doublereal ***a, integer *ncb, doublereal ***b, integer *nbc, integer *nrc, doublereal ***c, doublereal *d, integer *irf, integer *icf) { integer loop_start,loop_end; integer loop_start_tmp,loop_end_tmp; int i,comm_size; int *a_counts,*a_displacements; int *b_counts,*b_displacements; int *c_counts,*c_displacements; int *irf_counts,*irf_displacements; int *icf_counts,*icf_displacements; MPI_Comm_size(MPI_COMM_WORLD,&comm_size); a_counts=(int *)MALLOC(sizeof(int)*comm_size); a_displacements=(int *)MALLOC(sizeof(int)*comm_size); b_counts=(int *)MALLOC(sizeof(int)*comm_size); b_displacements=(int *)MALLOC(sizeof(int)*comm_size); c_counts=(int *)MALLOC(sizeof(int)*comm_size); c_displacements=(int *)MALLOC(sizeof(int)*comm_size); irf_counts=(int *)MALLOC(sizeof(int)*comm_size); irf_displacements=(int *)MALLOC(sizeof(int)*comm_size); icf_counts=(int *)MALLOC(sizeof(int)*comm_size); icf_displacements=(int *)MALLOC(sizeof(int)*comm_size); a_counts[0] = 0; a_displacements[0] = 0; b_counts[0] = 0; b_displacements[0] = 0; c_counts[0] = 0; c_displacements[0] = 0; irf_counts[0] = 0; irf_displacements[0] = 0; icf_counts[0] = 0; icf_displacements[0] = 0; for(i=1;i<comm_size;i++){ /*Send message to get worker into conpar mode*/ { int message=AUTO_MPI_CONPAR_MESSAGE; MPI_Send(&message,1,MPI_INT,i,0,MPI_COMM_WORLD); } loop_start = ((i-1)*(*na))/(comm_size - 1); loop_end = ((i)*(*na))/(comm_size - 1); a_counts[i] = (*nca)*(*nra)*(loop_end-loop_start); a_displacements[i] = (*nca)*(*nra)*loop_start; b_counts[i] = (*ncb)*(*nra)*(loop_end-loop_start); b_displacements[i] = (*ncb)*(*nra)*loop_start; c_counts[i] = (*nca)*(*nrc)*(loop_end-loop_start); c_displacements[i] = (*nca)*(*nrc)*loop_start; irf_counts[i] = (*nra)*(loop_end-loop_start); irf_displacements[i] = (*nra)*loop_start; icf_counts[i] = (*nca)*(loop_end-loop_start); icf_displacements[i] = (*nca)*loop_start; loop_start_tmp = 0; loop_end_tmp = loop_end-loop_start; MPI_Send(&loop_start_tmp ,1,MPI_LONG,i,0,MPI_COMM_WORLD); MPI_Send(&loop_end_tmp ,1,MPI_LONG,i,0,MPI_COMM_WORLD); } { integer params[6]; params[0]=*nov; params[1]=*nra; params[2]=*nca; params[3]=*ncb; params[4]=*nbc; params[5]=*nrc; MPI_Bcast(params ,6,MPI_LONG,0,MPI_COMM_WORLD); } MPI_Scatterv(irf,irf_counts,irf_displacements,MPI_LONG, NULL,0,MPI_LONG, 0,MPI_COMM_WORLD); MPI_Scatterv(icf,icf_counts,icf_displacements,MPI_LONG, NULL,0,MPI_LONG, 0,MPI_COMM_WORLD); /* Worker is running now */ { /*I create a temporary send buffer for the MPI_Reduce command. This is because there isn't an asymmetric version (like MPI_Scatterv).*/ double **dtemp; dtemp = DMATRIX(*nrc,*ncb); for(i=0;i<(*nrc);i++) for(j=0;i<(*ncb);i++) dtemp[i][j]=d[i][j]; MPI_Reduce(dtemp[0],d[0],(*ncb)*(*nrc),MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD); FREE_DMATRIX(dtemp); } MPI_Gatherv(NULL,0,MPI_DOUBLE, a[0][0],a_counts,a_displacements,MPI_DOUBLE, 0,MPI_COMM_WORLD); MPI_Gatherv(NULL,0,MPI_DOUBLE, b[0][0],b_counts,b_displacements,MPI_DOUBLE, 0,MPI_COMM_WORLD); MPI_Gatherv(NULL,0,MPI_DOUBLE, c[0][0],c_counts,c_displacements,MPI_DOUBLE, 0,MPI_COMM_WORLD); MPI_Gatherv(NULL,0,MPI_LONG, irf,irf_counts,irf_displacements,MPI_LONG, 0,MPI_COMM_WORLD); MPI_Gatherv(NULL,0,MPI_LONG, icf,icf_counts,icf_displacements,MPI_LONG, 0,MPI_COMM_WORLD); return 0; }
/* Uses Data info to initialize certain aspects of special point, so make sure Data is initialized and populated with proper parameter values before calling this function. */ int CreateSpecialPoint(AutoData *Data, integer itp, integer lab, doublereal *u, integer npar, integer *ipar, doublereal *par, integer *icp, doublereal *ups, doublereal *udotps, doublereal *rldot) { integer i, j, k, nsp; nsp = Data->num_sp; if (nsp == Data->sp_len) { Data->sp_len += 1; Data->sp = (AutoSPData *)REALLOC(Data->sp, Data->sp_len*sizeof(AutoSPData)); Data->sp[nsp].u = (doublereal *)MALLOC((Data->iap.ndim+1)*sizeof(doublereal)); Data->sp[nsp].ups = NULL; Data->sp[nsp].udotps = NULL; Data->sp[nsp].rldot = NULL; Data->sp[nsp].icp = NULL; Data->sp[nsp].a1 = NULL; Data->sp[nsp].a2 = NULL; } Data->sp[nsp].ibr = 1; Data->sp[nsp].mtot = 1; Data->sp[nsp].itp = itp; Data->sp[nsp].lab = lab; Data->sp[nsp].nfpr = Data->iap.nicp; Data->sp[nsp].isw = Data->iap.isw; Data->sp[nsp].ntpl = Data->iap.ncol*Data->iap.ntst+1; Data->sp[nsp].nar = Data->iap.ndim+1; Data->sp[nsp].nrowpr = 0; Data->sp[nsp].ntst = Data->iap.ntst; Data->sp[nsp].ncol = Data->iap.ncol; Data->sp[nsp].nparx = NPARX; Data->npar = npar-1; // Number of parameters in model (subtract period T) // u for (i=0; i<Data->iap.ndim+1; i++) Data->sp[nsp].u[i] = u[i]; // par for (i=0; i<NPARX; i++) Data->sp[nsp].par[i] = 0.0; for (i=0; i<npar; i++) Data->sp[nsp].par[ipar[i]] = par[i]; // icp if (icp != NULL) { Data->sp[nsp].icp = (integer *)MALLOC(Data->iap.nicp*sizeof(integer)); for (i=0; i<Data->iap.nicp; i++) Data->sp[nsp].icp[i] = icp[i]; } // ups if (ups != NULL) { Data->sp[nsp].ups = DMATRIX(Data->iap.ncol*Data->iap.ntst+1, Data->iap.ndim+1); for (i=0; i<Data->iap.ncol*Data->iap.ntst+1; i++) for (j=0; j<Data->iap.ndim+1; j++) Data->sp[nsp].ups[i][j] = ups[i*(Data->iap.ndim+1)+j]; } // udotps if (udotps != NULL) { Data->sp[nsp].udotps = DMATRIX(Data->iap.ncol*Data->iap.ntst+1, Data->iap.ndim); for (i=0; i<Data->iap.ncol*Data->iap.ntst+1; i++) for (j=0; j<Data->iap.ndim; j++) Data->sp[nsp].udotps[i][j] = udotps[i*(Data->iap.ndim)+j]; } // rldot if (rldot != NULL) { Data->sp[nsp].rldot = (doublereal *)MALLOC((Data->iap.nicp)*sizeof(doublereal)); for (i=0; i<Data->iap.nicp; i++) Data->sp[nsp].rldot[i] = rldot[i]; } // a1 and a2 if (Data->sflow) { Data->sp[nsp].a1 = DMATRIX_3D(Data->iap.ntst, Data->iap.ndim, Data->iap.ndim); Data->sp[nsp].a2 = DMATRIX_3D(Data->iap.ntst, Data->iap.ndim, Data->iap.ndim); } Data->num_sp += 1; }
void *conpar_process(void * arg) { integer icf_dim1, irf_dim1; /* Local variables */ integer ipiv, jpiv, itmp; doublereal tpiv; integer i, j, l, k1, k2, m1, m2, ic, ir; doublereal rm; integer ir1, irp; doublereal piv; integer icp1; integer *nov, *nra, *nca; doublereal ***a; integer *ncb; doublereal ***b; integer *nbc, *nrc; doublereal ***c, **d; integer *irf, *icf; integer loop_start,loop_end; #ifdef PTHREADS doublereal **dlocal; #endif #ifdef USAGE struct rusage *conpar_process_usage; usage_start(&conpar_process_usage); #endif nov = ((conpar_parallel_arglist *)arg)->nov; nra = ((conpar_parallel_arglist *)arg)->nra; nca = ((conpar_parallel_arglist *)arg)->nca; a = ((conpar_parallel_arglist *)arg)->a; ncb = ((conpar_parallel_arglist *)arg)->ncb; b = ((conpar_parallel_arglist *)arg)->b; nbc = ((conpar_parallel_arglist *)arg)->nbc; nrc = ((conpar_parallel_arglist *)arg)->nrc; c = ((conpar_parallel_arglist *)arg)->c; d = ((conpar_parallel_arglist *)arg)->d; irf = ((conpar_parallel_arglist *)arg)->irf; icf = ((conpar_parallel_arglist *)arg)->icf; loop_start = ((conpar_parallel_arglist *)arg)->loop_start; loop_end = ((conpar_parallel_arglist *)arg)->loop_end; #ifdef PTHREADS dlocal=DMATRIX(*nrc, *ncb); #endif /* In the default case we don't need to do anything special */ if(global_conpar_type == CONPAR_DEFAULT) { ; } /* In the message passing case we set d to be 0.0, do a sum here, and then do the final sum (with the true copy of d) in the master */ else if (global_conpar_type == CONPAR_MPI) { for(i=0;i<*nrc;i++) for (j=0; j<*ncb;j++) d[i][j]=0.0; } /* In the shared memory case we create a local variable for doing this threads part of the sum, then we do a final sum into shared memory at the end */ else if (global_conpar_type == CONPAR_PTHREADS) { #ifdef PTHREADS for(i=0;i<*nrc;i++) for (j=0; j<*ncb;j++) dlocal[i][j]=0.0; #else ; #endif } /* Note that the summation of the adjacent overlapped part of C */ /* is delayed until REDUCE, in order to merge it with other communications.*/ /* NA is the local NTST. */ irf_dim1 = *nra; icf_dim1 = *nca; /* Condensation of parameters (Elimination of local variables). */ m1 = *nov + 1; m2 = *nca - *nov; for (i = loop_start;i < loop_end; i++) { for (ic = m1; ic <= m2; ++ic) { ir1 = ic - *nov + 1; irp = ir1 - 1; icp1 = ic + 1; /* **Search for pivot (Complete pivoting) */ piv = 0.0; ipiv = irp; jpiv = ic; for (k1 = irp; k1 <= *nra; ++k1) { int irf_k1_i = irf[-1 + k1 + i*irf_dim1]; for (k2 = ic; k2 <= m2; ++k2) { int icf_k2_i = icf[-1 + k2 + i*icf_dim1]; tpiv = a[i][-1 + irf_k1_i][-1 + icf_k2_i]; if (tpiv < 0.0) { tpiv = -tpiv; } if (piv < tpiv) { piv = tpiv; ipiv = k1; jpiv = k2; } } } /* **Move indices */ itmp = icf[-1 + ic + i*icf_dim1]; icf[-1 + ic + i*icf_dim1] = icf[-1 + jpiv + i*icf_dim1]; icf[-1 + jpiv + i*icf_dim1] = itmp; itmp = irf[-1 + irp + i*irf_dim1]; irf[-1 + irp + i*irf_dim1] = irf[-1 + ipiv + i*irf_dim1]; irf[-1 + ipiv + i*irf_dim1] = itmp; { int icf_ic_i = icf[-1 + ic + i*icf_dim1]; int irf_irp_i = irf[-1 + irp + i*irf_dim1]; doublereal *a_offset2 = a[i][-1 + irf_irp_i]; doublereal *b_offset2 = b[i][-1 + irf_irp_i]; /* **End of pivoting; elimination starts here */ for (ir = ir1; ir <= *nra; ++ir) { int irf_ir_i = irf[-1 + ir + i*irf_dim1]; doublereal *a_offset1 = a[i][-1 + irf_ir_i]; doublereal *b_offset1 = b[i][-1 + irf_ir_i]; rm = a_offset1[-1 + icf_ic_i]/a_offset2[-1 + icf_ic_i]; a_offset1[-1 + icf_ic_i] = rm; if (rm != (double)0.) { for (l = 0; l < *nov; ++l) { a_offset1[l] -= rm * a_offset2[l]; } for (l = icp1 -1; l < *nca; ++l) { int icf_l_i = icf[l + i*icf_dim1]; a_offset1[-1 + icf_l_i] -= rm * a_offset2[-1 + icf_l_i]; } for (l = 0; l < *ncb; ++l) { b_offset1[l] -= rm * b_offset2[l]; } } } for (ir = *nbc + 1; ir <= *nrc; ++ir) { doublereal *c_offset1 = c[i][-1 + ir]; doublereal *d_offset1 = d[-1 + ir]; rm = c_offset1[-1 + icf_ic_i]/a_offset2[-1 + icf_ic_i]; c_offset1[-1 + icf_ic_i]=rm; if (rm != (double)0.) { for (l = 0; l < *nov; ++l) { c_offset1[l] -= rm * a_offset2[l]; } for (l = icp1 -1 ; l < *nca; ++l) { int icf_l_i = icf[l + i*icf_dim1]; c_offset1[-1 + icf_l_i] -= rm * a_offset2[-1 + icf_l_i]; } for (l = 0; l < *ncb; ++l) { /* A little explanation of what is going on here is in order I believe. This array is created by a summation across all workers, hence it needs a mutex to avoid concurrent writes (in the shared memory case) or a summation in the master (in the message passing case). Since mutex's can be somewhat slow, we will do the summation into a local variable, and then do a final summation back into global memory when the main loop is done. */ /* Nothing special for the default case */ if(global_conpar_type == CONPAR_DEFAULT) { d_offset1[l] -= rm * b_offset2[l]; } /* In the message passing case we sum into d, which is a local variable initialized to 0.0. We then sum our part with the masters part in the master. */ else if (global_conpar_type == CONPAR_MPI) { d_offset1[l] -= rm * b_offset2[l]; } /* In the shared memory case we sum into a local variable our contribution, and then sum into shared memory at the end (inside a mutex */ else if (global_conpar_type == CONPAR_PTHREADS) { #ifdef PTHREADS dlocal[-1 + ir][l] -= rm * b_offset2[l]; #else ; #endif } } } } } } } #ifdef PTHREADS /* This is were we sum into the global copy of the d array, in the shared memory case */ if(global_conpar_type == CONPAR_PTHREADS) { #ifdef PTHREADS pthread_mutex_lock(&mutex_for_d); for(i=0;i<*nrc;i++) for (j=0; j<*ncb;j++) d[i][j] += dlocal[i][j]; pthread_mutex_unlock(&mutex_for_d); FREE_DMATRIX(dlocal); #else ; #endif } #endif #ifdef USAGE usage_end(conpar_process_usage,"in conpar worker"); #endif return NULL; }
int AUTO(AutoData *Data) { struct timeval *time0,*time1; integer icp[NPARX2]; doublereal par[NPARX2], thl[NPARX]; iap_type *iap; rap_type *rap; doublereal *thu; integer *iuz; doublereal *vuz; function_list list; integer i, j, k; // Initialize structures and constants gData = Data; iap = &(Data->iap); rap = &(Data->rap); Data->sp_len = Data->num_sp + (1 + floor(iap->nmx/iap->npr)); Data->sp_inc = 5; #ifdef USAGE struct rusage *init_usage,*total_usage; usage_start(&init_usage); usage_start(&total_usage); #endif #ifdef FLOATING_POINT_TRAP trapfpe(); #endif #ifdef PTHREADS global_conpar_type = CONPAR_PTHREADS; global_setubv_type = SETUBV_PTHREADS; global_reduce_type = REDUCE_PTHREADS; #endif fp9 = fopen("fort.9","w"); if(fp9 == NULL) { fprintf(stderr,"Error: Could not open fort.9\n"); exit(1); } /* Initialization : */ iap->mynode = mynode(); iap->numnodes = numnodes(); if (iap->numnodes > 1) { iap->parallel_flag = 1; } else { iap->parallel_flag = 0; } /* NOTE: thu is allocated inside this function, and the pointer is passed back. I know this is ugly, but this function does a bit of work to get thu setup correctly, as well as figuring out the size the array should be. What really should happen is to have one function which reads fort.2 and another fuction which initializes the array. That way the allocation could happen between the two calls. */ init0(iap, rap, par, icp, thl, &thu, &iuz, &vuz); /* Find restart label and determine type of restart point. */ if (iap->irs > 0) { logical found = FALSE_; findlb(iap, rap, iap->irs, &(iap->nfpr), &found); if (! found) { if (iap->mynode == 0) { fprintf(stderr,"\nRestart label %4ld not found\n",iap->irs); } exit(0); } } set_function_pointers(*iap,&list); init1(iap, rap, icp, par); chdim(iap); /* Create the allocations for the global structures used in autlib3.c and autlib5.c. These are purely an efficiency thing. The allocation and deallocation of these scratch areas takes up a nontrivial amount of time if done directly in the wrapper functions in autlib3.c*/ allocate_global_memory(*iap); /* ---------------------------------------------------------- */ /* ---------------------------------------------------------- */ /* One-parameter continuations */ /* ---------------------------------------------------------- */ /* ---------------------------------------------------------- */ #ifdef USAGE usage_end(init_usage,"main initialization"); #endif if (Data->print_input) PrintInput(Data, par, icp); // Initialize output variables if(list.type==AUTOAE) Data->u = DMATRIX(iap->nmx, iap->ndim); else { // Solution measures Data->usm = (doublereal ***)MALLOC((2+(int)(log2(Data->nsm)))*sizeof(doublereal **)); Data->usm[0] = DMATRIX(iap->nmx, iap->ndim); // MAX Data->usm[1] = DMATRIX(iap->nmx, iap->ndim); // MIN for (i=0; i<(int)(log2(Data->nsm)); i++) Data->usm[2+i] = DMATRIX(iap->nmx, iap->ndim); // Jacobian of flow if (Data->sjac) { Data->c0 = DMATRIX_3D(iap->nmx, iap->ndim, iap->ndim); Data->c1 = DMATRIX_3D(iap->nmx, iap->ndim, iap->ndim); } // Jacobian of flow along cycles (temporary storage) if (Data->sflow) { Data->a1 = DMATRIX_3D(iap->ntst, iap->ndim, iap->ndim); Data->a2 = DMATRIX_3D(iap->ntst, iap->ndim, iap->ndim); } } Data->par = DMATRIX(iap->nmx, iap->nicp); if (iap->isp >= 1) { Data->ev = DCMATRIX(iap->nmx, iap->ndim); for (i=0; i<iap->nmx; i++) { for (j=0; j<iap->ndim; j++) { Data->ev[i][j].r = NAN; // This is a flag for bad floquet multipliers Data->ev[i][j].i = NAN; } } } Data->num_u = 0; if (Data->sp == NULL) Data->num_sp = 0; Data->sp = (AutoSPData *)REALLOC(Data->sp, (Data->sp_len)*sizeof(AutoSPData)); for (i=Data->num_sp; i<Data->sp_len; i++) { Data->sp[i].u = NULL; Data->sp[i].icp = NULL; Data->sp[i].ups = NULL; Data->sp[i].udotps = NULL; Data->sp[i].rldot = NULL; Data->sp[i].a1 = NULL; Data->sp[i].a2 = NULL; } if(list.type==AUTOAE) autoae(iap, rap, par, icp, list.aelist.funi, list.aelist.stpnt, list.aelist.pvli, thl, thu, iuz, vuz); if(list.type==AUTOBV) autobv(iap, rap, par, icp, list.bvlist.funi, list.bvlist.bcni, list.bvlist.icni, list.bvlist.stpnt, list.bvlist.pvli, thl, thu, iuz, vuz); // Testing output if (Data->print_output) PrintOutput(Data); #ifdef USAGE usage_end(total_usage,"total"); #endif //time_end(time0,"Total Time ",fp9); fprintf(fp9,"----------------------------------------------"); fprintf(fp9,"----------------------------------------------\n"); //time_end(time1,"",stdout); //} FREE(thu); FREE(iuz); FREE(vuz); fclose(fp9); // Clean up special solution points that were allocated and not used Data->sp = (AutoSPData *)REALLOC(Data->sp, (Data->num_sp)*sizeof(AutoSPData)); assert(Data->sp); Data->sp_len = Data->num_sp; return 1; }