main () { thds = omp_get_max_threads (); if (thds == 1) { printf ("should be run this program on multi threads.\n"); exit (0); } omp_set_dynamic (0); prvt.i = MAGICNO; prvt.d = MAGICNO+1; #pragma omp parallel firstprivate (prvt) { int id = omp_get_thread_num (); if (prvt.i != MAGICNO) { #pragma omp critical errors += 1; } if (prvt.d != MAGICNO+1) { #pragma omp critical errors += 1; } prvt.i = id; prvt.d = id-1; #pragma omp barrier if (prvt.i != id) { #pragma omp critical errors += 1; } if (prvt.d != id-1) { #pragma omp critical errors += 1; } if (sizeof(prvt) != sizeof(struct x)) { #pragma omp critical errors += 1; } } prvt.i = MAGICNO*2; prvt.d = MAGICNO*2+1; #pragma omp parallel firstprivate (prvt) func1 (MAGICNO*2, &prvt); prvt.i = MAGICNO*3; prvt.d = MAGICNO*3+1; #pragma omp parallel firstprivate (prvt) func2 (MAGICNO*3); if (errors == 0) { printf ("firstprivate 013 : SUCCESS\n"); return 0; } else { printf ("firstprivate 013 : FAILED\n"); return 1; } }
/*
 * Builds the adjacency representation of G from the edge tuples in SDGdata.
 *
 * For every edge i the code records pos[i], the rank of that edge among the
 * edges sharing the same start vertex, and counts degree[u].  prefix_sums()
 * then fills numEdges from degree (presumably an exclusive prefix sum --
 * confirm in prefix_sums), and every edge is written to slot
 * numEdges[u] + pos[i] of endV / w.
 *
 * On return G->n, G->m, G->numEdges (n+1 entries), G->endV and G->weight
 * (m entries each) are set; G owns those arrays.  The three SDGdata arrays
 * (startVertex, endVertex, weight) are freed here.
 *
 * N and M are file-level globals giving the vertex and edge counts.
 *
 * Returns the wall-clock time spent, as measured by get_seconds().
 */
double computeGraph(graph* G, graphSDG* SDGdata) {

    VERT_T* endV;
    LONG_T *degree, *numEdges, *pos, *pSums;
    WEIGHT_T* w;
    double elapsed_time;

#ifdef _OPENMP
    omp_lock_t *vLock;
    LONG_T chunkSize;
#endif

    elapsed_time = get_seconds();

#ifdef _OPENMP
    omp_set_num_threads(NUM_THREADS);
#endif

#ifdef _OPENMP
#pragma omp parallel
#endif
{
    LONG_T i, j, u, n, m, tid, nthreads;
#ifdef DIAGNOSTIC
    double elapsed_time_part;
#endif

#ifdef _OPENMP
    nthreads = omp_get_num_threads();
    tid = omp_get_thread_num();
#else
    tid = 0;
    nthreads = 1;
#endif

    n = N;
    m = M;

    /* Thread 0 allocates the shared arrays; the others wait at the barrier
       below before touching them. */
    if (tid == 0) {
#ifdef _OPENMP
        vLock = (omp_lock_t *) malloc(n*sizeof(omp_lock_t));
        assert(vLock != NULL);
        chunkSize = n/nthreads;
        /* schedule(static, chunk) requires a positive chunk size; n/nthreads
           is 0 whenever there are fewer vertices than threads. */
        if (chunkSize < 1) {
            chunkSize = 1;
        }
#endif
        pos = (LONG_T *) malloc(m*sizeof(LONG_T));
        assert(pos != NULL);
        degree = (LONG_T *) calloc(n, sizeof(LONG_T));
        assert(degree != NULL);
    }

#ifdef DIAGNOSTIC
    if (tid == 0) {
        elapsed_time_part = get_seconds();
    }
#endif

#ifdef _OPENMP
#pragma omp barrier

    /* one lock per vertex protects degree[u] */
#pragma omp for schedule(static, chunkSize)
    for (i=0; i<n; i++) {
        omp_init_lock(&vLock[i]);
    }

#pragma omp barrier

#ifdef DIAGNOSTIC
    if (tid == 0) {
        elapsed_time_part = get_seconds() - elapsed_time_part;
        fprintf(stderr, "Lock initialization time: %lf seconds\n",
                elapsed_time_part);
        elapsed_time_part = get_seconds();
    }
#endif

#pragma omp for
#endif
    for (i=0; i<m; i++) {
        u = SDGdata->startVertex[i];
#ifdef _OPENMP
        omp_set_lock(&vLock[u]);
#endif
        /* rank of edge i among the edges leaving u, and degree count */
        pos[i] = degree[u]++;
#ifdef _OPENMP
        omp_unset_lock(&vLock[u]);
#endif
    }

#ifdef DIAGNOSTIC
    if (tid == 0) {
        elapsed_time_part = get_seconds() - elapsed_time_part;
        fprintf(stderr, "Degree computation time: %lf seconds\n",
                elapsed_time_part);
        elapsed_time_part = get_seconds();
    }
#endif

#ifdef _OPENMP
#pragma omp barrier

#pragma omp for schedule(static, chunkSize)
    for (i=0; i<n; i++) {
        omp_destroy_lock(&vLock[i]);
    }

    /* the worksharing loop above ends with an implicit barrier, so every
       lock is destroyed before the array is released */
    if (tid == 0)
        free(vLock);
#endif

#ifdef DIAGNOSTIC
    if (tid == 0) {
        elapsed_time_part = get_seconds() - elapsed_time_part;
        fprintf(stderr, "Lock destruction time: %lf seconds\n",
                elapsed_time_part);
        elapsed_time_part = get_seconds();
    }
#endif

    if (tid == 0) {
        numEdges = (LONG_T *) malloc((n+1)*sizeof(LONG_T));
        assert(numEdges != NULL);
        pSums = (LONG_T *) malloc(nthreads*sizeof(LONG_T));
        assert(pSums != NULL);
    }

#ifdef _OPENMP
#pragma omp barrier
#endif

    /* called by every thread of the team */
    prefix_sums(degree, numEdges, pSums, n);

#ifdef DIAGNOSTIC
    if (tid == 0) {
        elapsed_time_part = get_seconds() - elapsed_time_part;
        fprintf(stderr, "Prefix sums time: %lf seconds\n",
                elapsed_time_part);
        elapsed_time_part = get_seconds();
    }
#endif

#ifdef _OPENMP
#pragma omp barrier
#endif

    if (tid == 0) {
        free(degree);
        free(pSums);
        w = (WEIGHT_T *) malloc(m*sizeof(WEIGHT_T));
        assert(w != NULL);
        endV = (VERT_T *) malloc(m* sizeof(VERT_T));
        assert(endV != NULL);
    }

#ifdef _OPENMP
#pragma omp barrier

#pragma omp for
#endif
    for (i=0; i<m; i++) {
        u = SDGdata->startVertex[i];
        /* slot of edge i inside the block of edges that start at u */
        j = numEdges[u] + pos[i];
        endV[j] = SDGdata->endVertex[i];
        w[j] = SDGdata->weight[i];
    }

#ifdef DIAGNOSTIC
    if (tid == 0) {
        elapsed_time_part = get_seconds() - elapsed_time_part;
        fprintf(stderr, "Edge data structure construction time: %lf seconds\n",
                elapsed_time_part);
        elapsed_time_part = get_seconds();
    }
#endif

    if (tid == 0) {
        free(pos);
        G->n = n;
        G->m = m;
        G->numEdges = numEdges;
        G->endV = endV;
        G->weight = w;
    }

}

    /* Verification */
#if 0
    fprintf(stderr, "SDG data:\n");
    for (int i=0; i<SDGdata->m; i++) {
        fprintf(stderr, "[%ld %ld %ld] ", SDGdata->startVertex[i],
                SDGdata->endVertex[i], SDGdata->weight[i]);
    }

    fprintf(stderr, "\n");

    for (int i=0; i<G->n + 1; i++) {
        fprintf(stderr, "[%ld] ", G->numEdges[i]);
    }

    fprintf(stderr, "\nGraph:\n");
    for (int i=0; i<G->n; i++) {
        for (int j=G->numEdges[i]; j<G->numEdges[i+1]; j++) {
            fprintf(stderr, "[%ld %ld %ld] ", i, G->endV[j], G->weight[j]);
        }
    }
#endif

    /* the tuple arrays are no longer needed once G has been built */
    free(SDGdata->startVertex);
    free(SDGdata->endVertex);
    free(SDGdata->weight);

    elapsed_time = get_seconds() - elapsed_time;
    return elapsed_time;
}
/** Purpose ------- SLAEX3 finds the roots of the secular equation, as defined by the values in D, W, and RHO, between 1 and K. It makes the appropriate calls to SLAED4 and then updates the eigenvectors by multiplying the matrix of eigenvectors of the pair of eigensystems being combined by the matrix of eigenvectors of the K-by-K system which is solved here. It is used in the last step when only a part of the eigenvectors is required. It compute only the required part of the eigenvectors and the rest is not used. This code makes very mild assumptions about floating point arithmetic. It will work on machines with a guard digit in add/subtract, or on those binary machines without guard digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or Cray-2. It could conceivably fail on hexadecimal or decimal machines without guard digits, but we know of none. Arguments --------- @param[in] k INTEGER The number of terms in the rational function to be solved by SLAED4. K >= 0. @param[in] n INTEGER The number of rows and columns in the Q matrix. N >= K (deflation may result in N > K). @param[in] n1 INTEGER The location of the last eigenvalue in the leading submatrix. min(1,N) <= N1 <= N/2. @param[out] d REAL array, dimension (N) D(I) contains the updated eigenvalues for 1 <= I <= K. @param[out] Q REAL array, dimension (LDQ,N) Initially the first K columns are used as workspace. On output the columns ??? to ??? contain the updated eigenvectors. @param[in] ldq INTEGER The leading dimension of the array Q. LDQ >= max(1,N). @param[in] rho REAL The value of the parameter in the rank one update equation. RHO >= 0 required. @param[in,out] dlamda REAL array, dimension (K) The first K elements of this array contain the old roots of the deflated updating problem. These are the poles of the secular equation. May be changed on output by having lowest order bit set to zero on Cray X-MP, Cray Y-MP, Cray-2, or Cray C-90, as described above. 
@param[in] Q2 REAL array, dimension (LDQ2, N) The first K columns of this matrix contain the non-deflated eigenvectors for the split problem. TODO what is LDQ2? @param[in] indx INTEGER array, dimension (N) The permutation used to arrange the columns of the deflated Q matrix into three groups (see SLAED2). The rows of the eigenvectors found by SLAED4 must be likewise permuted before the matrix multiply can take place. @param[in] ctot INTEGER array, dimension (4) A count of the total number of the various types of columns in Q, as described in INDX. The fourth column type is any column which has been deflated. @param[in,out] w REAL array, dimension (K) The first K elements of this array contain the components of the deflation-adjusted updating vector. Destroyed on output. @param s (workspace) REAL array, dimension (N1 + 1)*K Will contain the eigenvectors of the repaired matrix which will be multiplied by the previously accumulated eigenvectors to update the system. @param[out] indxq INTEGER array, dimension (N) On exit, the permutation which will reintegrate the subproblems back into sorted order, i.e. D( INDXQ( I = 1, N ) ) will be in ascending order. @param dwork (workspace) REAL array, dimension (3*N*N/2+3*N) @param[in] range magma_range_t - = MagmaRangeAll: all eigenvalues will be found. - = MagmaRangeV: all eigenvalues in the half-open interval (VL,VU] will be found. - = MagmaRangeI: the IL-th through IU-th eigenvalues will be found. TODO verify range, vl, vu, il, iu -- copied from slaex1. @param[in] vl REAL @param[in] vu REAL if RANGE=MagmaRangeV, the lower and upper bounds of the interval to be searched for eigenvalues. VL < VU. Not referenced if RANGE = MagmaRangeAll or MagmaRangeI. @param[in] il INTEGER @param[in] iu INTEGER if RANGE=MagmaRangeI, the indices (in ascending order) of the smallest and largest eigenvalues to be returned. 1 <= IL <= IU <= N, if N > 0; IL = 1 and IU = 0 if N = 0. Not referenced if RANGE = MagmaRangeAll or MagmaRangeV. 
@param[out] info INTEGER - = 0: successful exit. - < 0: if INFO = -i, the i-th argument had an illegal value. - > 0: if INFO = 1, an eigenvalue did not converge Further Details --------------- Based on contributions by Jeff Rutter, Computer Science Division, University of California at Berkeley, USA Modified by Francoise Tisseur, University of Tennessee. @ingroup magma_ssyev_aux ********************************************************************/ extern "C" magma_int_t magma_slaex3( magma_int_t k, magma_int_t n, magma_int_t n1, float *d, float *Q, magma_int_t ldq, float rho, float *dlamda, float *Q2, magma_int_t *indx, magma_int_t *ctot, float *w, float *s, magma_int_t *indxq, magmaFloat_ptr dwork, magma_range_t range, float vl, float vu, magma_int_t il, magma_int_t iu, magma_int_t *info ) { #define Q(i_,j_) (Q + (i_) + (j_)*ldq) #define dQ(i_,j_) (dQ + (i_) + (j_)*lddq) #define dQ2(i_,j_) (dQ2 + (i_) + (j_)*lddq) #define dS(i_,j_) (dS + (i_) + (j_)*lddq) float d_one = 1.; float d_zero = 0.; magma_int_t ione = 1; magma_int_t ineg_one = -1; magma_int_t iil, iiu, rk; magma_int_t lddq = n/2 + 1; magmaFloat_ptr dQ2 = dwork; magmaFloat_ptr dS = dQ2 + n*lddq; magmaFloat_ptr dQ = dS + n*lddq; magma_int_t i, iq2, j, n12, n2, n23, tmp, lq2; float temp; magma_int_t alleig, valeig, indeig; alleig = (range == MagmaRangeAll); valeig = (range == MagmaRangeV); indeig = (range == MagmaRangeI); *info = 0; if (k < 0) *info=-1; else if (n < k) *info=-2; else if (ldq < max(1,n)) *info=-6; else if (! (alleig || valeig || indeig)) *info = -15; else { if (valeig) { if (n > 0 && vu <= vl) *info = -17; } else if (indeig) { if (il < 1 || il > max(1,n)) *info = -18; else if (iu < min(n,il) || iu > n) *info = -19; } } if (*info != 0) { magma_xerbla(__func__, -(*info)); return *info; } // Quick return if possible if (k == 0) return *info; /* Modify values DLAMDA(i) to make sure all DLAMDA(i)-DLAMDA(j) can be computed with high relative accuracy (barring over/underflow). 
This is a problem on machines without a guard digit in add/subtract (Cray XMP, Cray YMP, Cray C 90 and Cray 2). The following code replaces DLAMDA(I) by 2*DLAMDA(I)-DLAMDA(I), which on any of these machines zeros out the bottommost bit of DLAMDA(I) if it is 1; this makes the subsequent subtractions DLAMDA(I)-DLAMDA(J) unproblematic when cancellation occurs. On binary machines with a guard digit (almost all machines) it does not change DLAMDA(I) at all. On hexadecimal and decimal machines with a guard digit, it slightly changes the bottommost bits of DLAMDA(I). It does not account for hexadecimal or decimal machines without guard digits (we know of none). We use a subroutine call to compute 2*DLAMBDA(I) to prevent optimizing compilers from eliminating this code.*/ n2 = n - n1; n12 = ctot[0] + ctot[1]; n23 = ctot[1] + ctot[2]; iq2 = n1 * n12; lq2 = iq2 + n2 * n23; magma_queue_t queue; magma_device_t cdev; magma_getdevice( &cdev ); magma_queue_create( cdev, &queue ); magma_ssetvector_async( lq2, Q2, 1, dQ2(0,0), 1, queue ); #ifdef _OPENMP ///////////////////////////////////////////////////////////////////////////////// //openmp implementation ///////////////////////////////////////////////////////////////////////////////// //magma_timer_t time=0; //timer_start( time ); #pragma omp parallel private(i, j, tmp, temp) { magma_int_t id = omp_get_thread_num(); magma_int_t tot = omp_get_num_threads(); magma_int_t ib = ( id * k) / tot; //start index of local loop magma_int_t ie = ((id+1) * k) / tot; //end index of local loop magma_int_t ik = ie - ib; //number of local indices for (i = ib; i < ie; ++i) dlamda[i]=lapackf77_slamc3(&dlamda[i], &dlamda[i]) - dlamda[i]; for (j = ib; j < ie; ++j) { magma_int_t tmpp=j+1; magma_int_t iinfo = 0; lapackf77_slaed4(&k, &tmpp, dlamda, w, Q(0,j), &rho, &d[j], &iinfo); // If the zero finder fails, the computation is terminated. 
if (iinfo != 0) { #pragma omp critical (info) *info=iinfo; break; } } #pragma omp barrier if (*info == 0) { #pragma omp single { //Prepare the INDXQ sorting permutation. magma_int_t nk = n - k; lapackf77_slamrg( &k, &nk, d, &ione, &ineg_one, indxq); //compute the lower and upper bound of the non-deflated eigenvectors if (valeig) { magma_svrange(k, d, &iil, &iiu, vl, vu); } else if (indeig) { magma_sirange(k, indxq, &iil, &iiu, il, iu); } else { iil = 1; iiu = k; } rk = iiu - iil + 1; } if (k == 2) { #pragma omp single { for (j = 0; j < k; ++j) { w[0] = *Q(0,j); w[1] = *Q(1,j); i = indx[0] - 1; *Q(0,j) = w[i]; i = indx[1] - 1; *Q(1,j) = w[i]; } } } else if (k != 1) { // Compute updated W. blasf77_scopy( &ik, &w[ib], &ione, &s[ib], &ione); // Initialize W(I) = Q(I,I) tmp = ldq + 1; blasf77_scopy( &ik, Q(ib,ib), &tmp, &w[ib], &ione); for (j = 0; j < k; ++j) { magma_int_t i_tmp = min(j, ie); for (i = ib; i < i_tmp; ++i) w[i] = w[i] * ( *Q(i, j) / ( dlamda[i] - dlamda[j] ) ); i_tmp = max(j+1, ib); for (i = i_tmp; i < ie; ++i) w[i] = w[i] * ( *Q(i, j) / ( dlamda[i] - dlamda[j] ) ); } for (i = ib; i < ie; ++i) w[i] = copysign( sqrt( -w[i] ), s[i]); #pragma omp barrier //reduce the number of used threads to have enough S workspace tot = min(n1, omp_get_num_threads()); if (id < tot) { ib = ( id * rk) / tot + iil - 1; ie = ((id+1) * rk) / tot + iil - 1; ik = ie - ib; } else { ib = -1; ie = -1; ik = -1; } // Compute eigenvectors of the modified rank-1 modification. 
for (j = ib; j < ie; ++j) { for (i = 0; i < k; ++i) s[id*k + i] = w[i] / *Q(i,j); temp = magma_cblas_snrm2( k, s+id*k, 1 ); for (i = 0; i < k; ++i) { magma_int_t iii = indx[i] - 1; *Q(i,j) = s[id*k + iii] / temp; } } } } } // end omp parallel if (*info != 0) return *info; //timer_stop( time ); //timer_printf( "eigenvalues/vector D+zzT = %6.2f\n", time ); #else ///////////////////////////////////////////////////////////////////////////////// // Non openmp implementation ///////////////////////////////////////////////////////////////////////////////// // magma_timer_t time=0; // timer_start( time ); for (i = 0; i < k; ++i) dlamda[i]=lapackf77_slamc3(&dlamda[i], &dlamda[i]) - dlamda[i]; for (j = 0; j < k; ++j) { magma_int_t tmpp=j+1; magma_int_t iinfo = 0; lapackf77_slaed4(&k, &tmpp, dlamda, w, Q(0,j), &rho, &d[j], &iinfo); // If the zero finder fails, the computation is terminated. if (iinfo != 0) *info=iinfo; } if (*info != 0) return *info; //Prepare the INDXQ sorting permutation. magma_int_t nk = n - k; lapackf77_slamrg( &k, &nk, d, &ione, &ineg_one, indxq); //compute the lower and upper bound of the non-deflated eigenvectors if (valeig) { magma_svrange(k, d, &iil, &iiu, vl, vu); } else if (indeig) { magma_sirange(k, indxq, &iil, &iiu, il, iu); } else { iil = 1; iiu = k; } rk = iiu - iil + 1; if (k == 2) { for (j = 0; j < k; ++j) { w[0] = *Q(0,j); w[1] = *Q(1,j); i = indx[0] - 1; *Q(0,j) = w[i]; i = indx[1] - 1; *Q(1,j) = w[i]; } } else if (k != 1) { // Compute updated W. blasf77_scopy( &k, w, &ione, s, &ione); // Initialize W(I) = Q(I,I) tmp = ldq + 1; blasf77_scopy( &k, Q, &tmp, w, &ione); for (j = 0; j < k; ++j) { for (i = 0; i < j; ++i) w[i] = w[i] * ( *Q(i, j) / ( dlamda[i] - dlamda[j] ) ); for (i = j+1; i < k; ++i) w[i] = w[i] * ( *Q(i, j) / ( dlamda[i] - dlamda[j] ) ); } for (i = 0; i < k; ++i) w[i] = copysign( sqrt( -w[i] ), s[i]); // Compute eigenvectors of the modified rank-1 modification. 
for (j = iil-1; j < iiu; ++j) { for (i = 0; i < k; ++i) s[i] = w[i] / *Q(i,j); temp = magma_cblas_snrm2( k, s, 1 ); for (i = 0; i < k; ++i) { magma_int_t iii = indx[i] - 1; *Q(i,j) = s[iii] / temp; } } } //timer_stop( time ); //timer_printf( "eigenvalues/vector D+zzT = %6.2f\n", time ); #endif //_OPENMP // Compute the updated eigenvectors. //timer_start( time ); //magma_queue_sync( queue ); // previously, needed to setvector finished. Now all on same queue, so not needed? if (rk != 0) { if ( n23 != 0 ) { if (rk < magma_get_slaed3_k()) { lapackf77_slacpy("A", &n23, &rk, Q(ctot[0],iil-1), &ldq, s, &n23); blasf77_sgemm("N", "N", &n2, &rk, &n23, &d_one, &Q2[iq2], &n2, s, &n23, &d_zero, Q(n1,iil-1), &ldq ); } else { magma_ssetmatrix( n23, rk, Q(ctot[0],iil-1), ldq, dS(0,0), n23, queue ); magma_sgemm( MagmaNoTrans, MagmaNoTrans, n2, rk, n23, d_one, dQ2(iq2,0), n2, dS(0,0), n23, d_zero, dQ(0,0), lddq, queue ); magma_sgetmatrix( n2, rk, dQ(0,0), lddq, Q(n1,iil-1), ldq, queue ); } } else lapackf77_slaset("A", &n2, &rk, &d_zero, &d_zero, Q(n1,iil-1), &ldq); if ( n12 != 0 ) { if (rk < magma_get_slaed3_k()) { lapackf77_slacpy("A", &n12, &rk, Q(0,iil-1), &ldq, s, &n12); blasf77_sgemm("N", "N", &n1, &rk, &n12, &d_one, Q2, &n1, s, &n12, &d_zero, Q(0,iil-1), &ldq); } else { magma_ssetmatrix( n12, rk, Q(0,iil-1), ldq, dS(0,0), n12, queue ); magma_sgemm( MagmaNoTrans, MagmaNoTrans, n1, rk, n12, d_one, dQ2(0,0), n1, dS(0,0), n12, d_zero, dQ(0,0), lddq, queue ); magma_sgetmatrix( n1, rk, dQ(0,0), lddq, Q(0,iil-1), ldq, queue ); } } else lapackf77_slaset("A", &n1, &rk, &d_zero, &d_zero, Q(0,iil-1), &ldq); } //timer_stop( time ); //timer_printf( "gemms = %6.2f\n", time ); magma_queue_destroy( queue ); return *info; } /* magma_slaex3 */
// update the tree, do pruning virtual void Update(const std::vector<bst_gpair> &gpair, IFMatrix *p_fmat, const BoosterInfo &info, const std::vector<RegTree*> &trees) { if (trees.size() == 0) return; // number of threads // thread temporal space std::vector< std::vector<TStats> > stemp; std::vector<RegTree::FVec> fvec_temp; // setup temp space for each thread int nthread; #pragma omp parallel { nthread = omp_get_num_threads(); } fvec_temp.resize(nthread, RegTree::FVec()); stemp.resize(nthread, std::vector<TStats>()); #pragma omp parallel { int tid = omp_get_thread_num(); int num_nodes = 0; for (size_t i = 0; i < trees.size(); ++i) { num_nodes += trees[i]->param.num_nodes; } stemp[tid].resize(num_nodes, TStats(param)); std::fill(stemp[tid].begin(), stemp[tid].end(), TStats(param)); fvec_temp[tid].Init(trees[0]->param.num_feature); } // if it is C++11, use lazy evaluation for Allreduce, // to gain speedup in recovery #if __cplusplus >= 201103L auto lazy_get_stats = [&]() #endif { // start accumulating statistics utils::IIterator<RowBatch> *iter = p_fmat->RowIterator(); iter->BeforeFirst(); while (iter->Next()) { const RowBatch &batch = iter->Value(); utils::Check(batch.size < std::numeric_limits<unsigned>::max(), "too large batch size "); const bst_omp_uint nbatch = static_cast<bst_omp_uint>(batch.size); #pragma omp parallel for schedule(static) for (bst_omp_uint i = 0; i < nbatch; ++i) { RowBatch::Inst inst = batch[i]; const int tid = omp_get_thread_num(); const bst_uint ridx = static_cast<bst_uint>(batch.base_rowid + i); RegTree::FVec &feats = fvec_temp[tid]; feats.Fill(inst); int offset = 0; for (size_t j = 0; j < trees.size(); ++j) { AddStats(*trees[j], feats, gpair, info, ridx, BeginPtr(stemp[tid]) + offset); offset += trees[j]->param.num_nodes; } feats.Drop(inst); } } // aggregate the statistics int num_nodes = static_cast<int>(stemp[0].size()); #pragma omp parallel for schedule(static) for (int nid = 0; nid < num_nodes; ++nid) { for (int tid = 1; tid < 
nthread; ++tid) { stemp[0][nid].Add(stemp[tid][nid]); } } }; #if __cplusplus >= 201103L reducer.Allreduce(BeginPtr(stemp[0]), stemp[0].size(), lazy_get_stats); #else reducer.Allreduce(BeginPtr(stemp[0]), stemp[0].size()); #endif // rescale learning rate according to size of trees float lr = param.learning_rate; param.learning_rate = lr / trees.size(); int offset = 0; for (size_t i = 0; i < trees.size(); ++i) { for (int rid = 0; rid < trees[i]->param.num_roots; ++rid) { this->Refresh(BeginPtr(stemp[0]) + offset, rid, trees[i]); } offset += trees[i]->param.num_nodes; } // set learning rate back param.learning_rate = lr; }
int runMe(int argc, char *argv[]) { ArgProcessor args(argc, argv); if (args.isArgSet("--help") || (!(args.isArgSet("--reads") && args.isArgSet("--kmers")))) { cerr << usage(args) << endl << endl; exit(1); } string reads_fasta_file = args.getStringVal("--reads"); string kmers_fasta_file = args.getStringVal("--kmers"); bool is_DS = (!args.isArgSet("--SS")); if (args.isArgSet("--kmer_size")) { KMER_SIZE = args.getIntVal("--kmer_size"); if (KMER_SIZE < 20) { cerr << "Error, min kmer size is 20"; exit(2); } } if (args.isArgSet("--monitor")) { IRKE_COMMON::MONITOR = args.getIntVal("--monitor"); } if (args.isArgSet("--num_threads")) { int num_threads = args.getIntVal("--num_threads"); if (num_threads < MAX_THREADS) { omp_set_num_threads(num_threads); } else { // set to max omp_set_num_threads(MAX_THREADS); } } if (omp_get_max_threads() > MAX_THREADS) { omp_set_num_threads(MAX_THREADS); } KmerCounter kcounter(KMER_SIZE, is_DS); populate_kmer_counter(kcounter, kmers_fasta_file); Fasta_reader fasta_reader(reads_fasta_file); bool write_coverage_info = args.isArgSet("--capture_coverage_info"); int start_time = time(NULL); #pragma omp parallel while (true) { if (!fasta_reader.hasNext()) break; int myTid = omp_get_thread_num(); Fasta_entry fe = fasta_reader.getNext(); string sequence = fe.get_sequence(); if (sequence == "") continue; string header = fe.get_header(); vector<unsigned int> kmer_coverage = compute_kmer_coverage(sequence, kcounter); unsigned int median_cov = median_coverage(kmer_coverage); float mean_cov = mean(kmer_coverage); float stdev = stDev(kmer_coverage); float pct_stdev_of_avg = stdev / mean_cov * 100; stringstream stats_text; stats_text << median_cov << "\t" << mean_cov << "\t" << stdev << "\t" << pct_stdev_of_avg << "\t" << fe.get_accession(); stats_text << "\tthread:" << myTid; if (write_coverage_info) { // add the coverage info stats_text << "\t"; for (size_t i = 0; i < kmer_coverage.size(); i++) { stats_text << kmer_coverage[i]; if (i != 
kmer_coverage.size() - 1) { stats_text << ","; } } } stats_text << endl; #pragma omp critical { cout << stats_text.str(); } if (mean_cov < 0) { cerr << "ERROR, cannot have negative coverage!!" << endl; exit(1); } } int end_time = time(NULL); cerr << "STATS_GENERATION_TIME: " << (end_time - start_time) << " seconds." << endl; return (0); }
static uint32_t PerformImprovementStep2(const CGraph* graph, CommunityPartition* partition, const double64_t alfa) { std::vector<Movement>* movements = new std::vector<Movement>[num_threads]; uint32_t N = graph->GetNumNodes(); #pragma omp parallel for schedule(SCD_SCHEDULING,SCD_THREAD_BLOCK_SIZE) for (uint32_t i = 0; i < N; i++) { int thread = omp_get_thread_num(); if (i % 100000 == 0) { printf("Thread %d: Checked movements of %d nodes.\n", thread, i); } Movement movement; movement = CheckForBestMovement(graph, i, partition, alfa); if (movement.m_MovementType != E_NO_MOVEMENT) { movements[thread].push_back(movement); } } printf("All movements checked\n"); for( uint32_t i = 0; i < N; i++) { std::sort((movements[i]).begin(), (movements[i]).end(),CompareMovements); } uint32_t* tempNodeLabels = new uint32_t[partition->m_NumNodes]; memcpy(&tempNodeLabels[0], &partition->m_NodeLabels[0], sizeof (uint32_t) * partition->m_NumNodes); uint32_t totalMovements = 0; //uint32_t nextLabel = partition->m_NumCommunities; uint32_t removeMovements = 0; uint32_t removeAndInsertMovements = 0; uint32_t insertMovements = 0; #pragma omp parallel for schedule(static,1) for (uint32_t thread = 0; thread < num_threads; thread++) { uint32_t numMovements = movements[thread].size(); totalMovements += numMovements; uint32_t nextLabelThread = partition->m_NumCommunities + numMovements * thread; uint32_t previousCommunity = 100000000; for (uint32_t i = 0; i < numMovements; i++) { Movement movement = (movements[thread])[i]; if(movement.m_Community != previousCommunity) { previousCommunity = movement.m_Community; switch (movement.m_MovementType) { case E_REMOVE: tempNodeLabels[movement.m_NodeId] = nextLabelThread; removeMovements++; nextLabelThread++; break; case E_REMOVE_AND_INSERT: tempNodeLabels[movement.m_NodeId] = movement.m_Community; if (partition->m_Communities[partition->m_CommunityIndices[partition->m_NodeLabels[movement.m_NodeId]]] == 1) { insertMovements++; } else { 
removeAndInsertMovements++; } break; } } } } delete [] movements; printf(" Number of removes performed: %d\n", removeMovements); printf(" Number of remove and insert performed: %d\n", removeAndInsertMovements); printf(" Number of insert performed: %d\n", insertMovements); FreeResources(partition); if (InitializeFromLabelsArray(graph, partition, tempNodeLabels, alfa)) { printf("Error initializing from label array.\n"); return 1; } delete [] tempNodeLabels; return 0; }
/* Prints which OpenMP thread is currently running function C. */
void
function_c (void)
{
  int tid = omp_get_thread_num();

  printf ("Thread %d is executing function C. \n", tid);
}
SEXP spSVCPredictJoint(SEXP m_r, SEXP n_r, SEXP KDiag_r, SEXP obsD_r, SEXP predObsD_r, SEXP predD_r, SEXP q_r, SEXP samples_r, SEXP wSamples_r, SEXP nSamples_r, SEXP AIndx_r, SEXP phiIndx_r, SEXP nuIndx_r, SEXP covModel_r, SEXP verbose_r, SEXP nReport_r, SEXP nThreads_r){ /***************************************** Common variables *****************************************/ int i, j, k, l, b, s, h, info, nProtect=0; char const *lower = "L"; char const *upper = "U"; char const *nUnit = "N"; char const *yUnit = "U"; char const *ntran = "N"; char const *ytran = "T"; char const *rside = "R"; char const *lside = "L"; const double one = 1.0; const double negOne = -1.0; const double zero = 0.0; const int incOne = 1; /***************************************** Set-up *****************************************/ double *obsD = REAL(obsD_r); double *predObsD = REAL(predObsD_r); double *predD = REAL(predD_r); int m = INTEGER(m_r)[0]; int mm = m*m; int n = INTEGER(n_r)[0]; int nn = n*n; int nm = n*m; int nmnm = nm*nm; int q = INTEGER(q_r)[0];//number of prediction locations int qm = q*m; int qmnm = qm*nm; int qmqm = qm*qm; bool KDiag = static_cast<bool>(INTEGER(KDiag_r)[0]); int nLTr = m*(m-1)/2+m; double *samples = REAL(samples_r); double *wSamples = REAL(wSamples_r); int nSamples = INTEGER(nSamples_r)[0]; int AIndx = INTEGER(AIndx_r)[0]; int phiIndx = INTEGER(phiIndx_r)[0]; int nuIndx = INTEGER(nuIndx_r)[0]; std::string covModel = CHAR(STRING_ELT(covModel_r,0)); int verbose = INTEGER(verbose_r)[0]; int nReport = INTEGER(nReport_r)[0]; int nThreads = INTEGER(nThreads_r)[0]; /***************************************** Set-up MCMC alg. vars. matrices etc. 
*****************************************/ SEXP wPredSamples_r; PROTECT(wPredSamples_r = allocMatrix(REALSXP, qm, nSamples)); nProtect++; int status=1; double *A = (double *) R_alloc(mm, sizeof(double)); zeros(A, mm); //to simplify a future move to the more general cross-cov model double *K = (double *) R_alloc(nmnm, sizeof(double)); double *B = (double *) R_alloc(qmnm, sizeof(double)); double *C = (double *) R_alloc(qmqm, sizeof(double)); double *tmp_nltr = (double *) R_alloc(nLTr, sizeof(double)); double *tmp_qmnm = (double *) R_alloc(qmnm, sizeof(double)); double *tmp_qm = (double *) R_alloc(qm, sizeof(double)); double *tmp_qmqm = (double *) R_alloc(qmqm, sizeof(double)); double *phi = (double *) R_alloc(m, sizeof(double)); double *nu = (double *) R_alloc(m, sizeof(double)); zeros(nu, m); //this just remains empty of not matern double maxNu = 0; //needed for thread safe bessel if(covModel == "matern"){ for(s = 0; s < nSamples; s++){ for(i = 0; i < m; i++){ if(samples[(nuIndx+i)*nSamples+s] > maxNu){ maxNu = samples[(nuIndx+i)*nSamples+s]; } } } } int threadID = 0; int bessel_ws_inc = static_cast<int>(1.0+maxNu); double *bessel_ws = (double *) R_alloc(nThreads*bessel_ws_inc, sizeof(double)); #ifdef _OPENMP omp_set_num_threads(nThreads); if(verbose){ Rprintf("Source compiled with OpenMP, posterior sampling is using %i thread(s).\n", nThreads); } #else if(nThreads > 1){ warning("n.omp.threads = %i, but source not compiled with OpenMP support.", nThreads); nThreads = 1; } #endif if(verbose){ Rprintf("-------------------------------------------------\n"); Rprintf("\tJoint sampling of predicted w\n"); Rprintf("-------------------------------------------------\n"); #ifdef Win32 R_FlushConsole(); #endif } GetRNGstate(); for(s = 0; s < nSamples; s++){ if(KDiag == false){ dcopy_(&nLTr, &samples[AIndx*nSamples+s], &nSamples, tmp_nltr, &incOne); covExpand(tmp_nltr, A, m);//note this is K, so we need chol F77_NAME(dpotrf)(lower, &m, A, &m, &info); if(info != 0){error("c++ 
error: dpotrf failed 1\n");} clearUT(A, m); //make sure upper tri is clear } for(k = 0; k < m; k++){ if(KDiag){ A[k*m+k] = sqrt(samples[(AIndx+k)*nSamples+s]); } phi[k] = samples[(phiIndx+k)*nSamples+s]; if(covModel == "matern"){ nu[k] = samples[(nuIndx+k)*nSamples+s]; } } //construct covariance matrix #ifdef _OPENMP #pragma omp parallel for private(i, k, l, h, threadID) #endif for(j = 0; j < n; j++){ #ifdef _OPENMP threadID = omp_get_thread_num(); #endif for(i = 0; i < n; i++){ for(k = 0; k < m; k++){ for(l = 0; l < m; l++){ K[(k+j*m)*nm+(i*m+l)] = 0.0; for(h = 0; h < m; h++){ K[(k+j*m)*nm+(i*m+l)] += A[k+m*h]*A[l+m*h]*spCorTS(obsD[j*n+i], phi[h], nu[h], covModel, &bessel_ws[threadID*bessel_ws_inc]); } } } } } #ifdef _OPENMP #pragma omp parallel for private(i, k, l, h, threadID) #endif for(j = 0; j < n; j++){ #ifdef _OPENMP threadID = omp_get_thread_num(); #endif for(i = 0; i < q; i++){ for(k = 0; k < m; k++){ for(l = 0; l < m; l++){ B[(k+j*m)*qm+(i*m+l)] = 0.0; for(h = 0; h < m; h++){ B[(k+j*m)*qm+(i*m+l)] += A[k+m*h]*A[l+m*h]*spCorTS(predObsD[j*q+i], phi[h], nu[h], covModel, &bessel_ws[threadID*bessel_ws_inc]); } } } } } //printMtrx(B, qm, nm); #ifdef _OPENMP #pragma omp parallel for private(i, k, l, h, threadID) #endif for(j = 0; j < q; j++){ #ifdef _OPENMP threadID = omp_get_thread_num(); #endif for(i = 0; i < q; i++){ for(k = 0; k < m; k++){ for(l = 0; l < m; l++){ C[(k+j*m)*qm+(i*m+l)] = 0.0; for(h = 0; h < m; h++){ C[(k+j*m)*qm+(i*m+l)] += A[k+m*h]*A[l+m*h]*spCorTS(predD[j*q+i], phi[h], nu[h], covModel, &bessel_ws[threadID*bessel_ws_inc]); } } } } } F77_NAME(dpotrf)(lower, &nm, K, &nm, &info); if(info != 0){error("c++ error: dpotrf failed 1\n");} F77_NAME(dpotri)(lower, &nm, K, &nm, &info); if(info != 0){error("c++ error: dpotri failed\n");} F77_NAME(dsymm)(rside, lower, &qm, &nm, &one, K, &nm, B, &qm, &zero, tmp_qmnm, &qm); //mu F77_NAME(dgemv)(ntran, &qm, &nm, &one, tmp_qmnm, &qm, &wSamples[s*nm], &incOne, &zero, tmp_qm, &incOne); //var 
F77_NAME(dgemm)(ntran, ytran, &qm, &qm, &nm, &one, tmp_qmnm, &qm, B, &qm, &zero, tmp_qmqm, &qm); for(i = 0; i < qmqm; i++){ C[i] = C[i] - tmp_qmqm[i]; } F77_NAME(dpotrf)(lower, &qm, C, &qm, &info); if(info != 0){error("c++ error: dpotrf failed 2\n");} mvrnorm(&REAL(wPredSamples_r)[s*qm], tmp_qm, C, qm, false); //report if(verbose){ if(status == nReport){ Rprintf("Sampled: %i of %i, %3.2f%%\n", s, nSamples, 100.0*s/nSamples); #ifdef Win32 R_FlushConsole(); #endif status = 0; } } status++; R_CheckUserInterrupt(); }//end sample loop PutRNGstate(); //make return object SEXP result_r, resultName_r; int nResultListObjs = 1; PROTECT(result_r = allocVector(VECSXP, nResultListObjs)); nProtect++; PROTECT(resultName_r = allocVector(VECSXP, nResultListObjs)); nProtect++; //samples SET_VECTOR_ELT(result_r, 0, wPredSamples_r); SET_VECTOR_ELT(resultName_r, 0, mkChar("p.w.predictive.samples")); namesgets(result_r, resultName_r); //unprotect UNPROTECT(nProtect); return(result_r); }
void make_graph(int log_numverts, int64_t desired_nedges, uint64_t userseed1, uint64_t userseed2, const double initiator[4], int64_t* nedges_ptr, int64_t** result_ptr) { int64_t N, M; N = (int64_t)pow(GRAPHGEN_INITIATOR_SIZE, log_numverts); M = desired_nedges; /* Spread the two 64-bit numbers into five nonzero values in the correct * range. */ uint_fast32_t seed[5]; make_mrg_seed(userseed1, userseed2, seed); int64_t nedges = compute_edge_array_size(0, 1, M); *nedges_ptr = nedges; #ifdef GRAPHGEN_KEEP_MULTIPLICITIES generated_edge* edges = (generated_edge*)xcalloc(nedges, sizeof(generated_edge)); /* multiplicity set to 0 for unused edges */ #else int64_t* edges = (int64_t*)xmalloc(2 * nedges * sizeof(int64_t)); #endif #pragma omp parallel { int rank = omp_get_thread_num(), size = omp_get_num_threads(); generate_kronecker(rank, size, seed, log_numverts, M, initiator, edges); } int64_t* vertex_perm = (int64_t*)xmalloc(N * sizeof(int64_t)); int64_t* result; #ifdef GRAPHGEN_KEEP_MULTIPLICITIES result = (int64_t*)xmalloc(2 * nedges * sizeof(int64_t)); #else result = edges; #endif *result_ptr = result; mrg_state state; mrg_seed(&state, seed); rand_sort_shared(&state, N, vertex_perm); int64_t i; /* Apply vertex permutation to graph, optionally copying into user's result * array. */ #ifdef GRAPHGEN_KEEP_MULTIPLICITIES #pragma omp parallel for for (i = 0; i < nedges; ++i) { if (edges[i].multiplicity != 0) { int64_t v1 = vertex_perm[edges[i].src]; int64_t v2 = vertex_perm[edges[i].tgt]; /* Sort these since otherwise the directions of the permuted edges would * give away the unscrambled vertex order. */ result[i * 2] = (v1 < v2) ? v1 : v2; result[i * 2 + 1] = (v1 < v2) ? 
v2 : v1; } else { result[i * 2] = result[i * 2 + 1] = (int64_t)(-1); } } free(edges); #else #pragma omp parallel for for (i = 0; i < 2 * nedges; i += 2) { if (edges[i] != (int64_t)(-1)) { int64_t v1 = vertex_perm[edges[i]]; int64_t v2 = vertex_perm[edges[i + 1]]; /* Sort these since otherwise the directions of the permuted edges would * give away the unscrambled vertex order. */ edges[i] = (v1 < v2) ? v1 : v2; edges[i + 1] = (v1 < v2) ? v2 : v1; } } #endif free(vertex_perm); /* Randomly mix up the order of the edges. */ scramble_edges_shared(userseed1, userseed2, nedges, edges); }
void run_graph_program(GraphProgram<T,U,V>* gp, Graph<V>& g, int iterations=1, struct run_graph_program_temp_structure<T,U,V>* rgpts=NULL) { //iterations = -1 ==> until convergence int it = 0; int converged = 1; unsigned long long int init_start = __rdtsc(); auto act = gp->getActivity(); SparseInVector<T>* px; SparseOutVector<U>* py; if (rgpts == NULL) { px = new SparseInVector<T>(g.nvertices); py = new SparseOutVector<U>(g.nvertices); } SparseInVector<T>&x = (rgpts==NULL)?(*px):*(rgpts->px); SparseOutVector<U>& y = (rgpts==NULL)?(*py):*(rgpts->py); #ifdef __TIMING printf("Nvertices = %d numints = %d \n", g.nvertices, y.numInts); #endif unsigned long long int start, end; int* start_vertex = new int[nthreads+1]; //divide numInts to start_vertex //divide the active vertices in each into start_index start_vertex[nthreads] = g.nvertices; #pragma omp parallel num_threads(nthreads) { int tid = omp_get_thread_num(); int ints_per_th = (y.numInts/nthreads)*32; int sv = ints_per_th*tid; sv = (((sv/32)/4)*4)*32; //sv is multiple of 32 and sv/32 is a multiple of 4 sv = (((sv/32)/SIMD_WIDTH)*SIMD_WIDTH)*32; //sv is multiple of 32 and sv/32 is a multiple of SIMD_WIDTH if (sv >= g.nvertices) sv = g.nvertices; if (sv == 0) sv = 0; start_vertex[tid] = sv; } unsigned long long int init_end = __rdtsc(); #ifdef __TIMING printf("GraphMat init time = %f ms \n", (init_end-init_start)/(CPU_FREQ)*1e3); #endif while(1) { unsigned long long int iteration_start = __rdtsc(); x.clear(); y.clear(); converged = 1; start = __rdtsc(); //check active vector and set message vector int count = 0; #pragma omp parallel num_threads(nthreads) reduction(+:count) { int tid = omp_get_thread_num(); for (int i = start_vertex[tid]; i < start_vertex[tid+1]; i++){ if (g.active[i]) { T message; bool msg_opt = gp->send_message(g.vertexproperty[i], message); if (msg_opt) { x.set(i, message); count++; } } } } x.length = count; #ifdef __TIMING printf("x.length = %d \n", x.length); #endif end = __rdtsc(); #ifdef 
__TIMING printf("Send message time = %.3f ms \n", (end-start)/(CPU_FREQ)*1e3); #endif start = __rdtsc(); //do SpMV if (gp->getOrder() == OUT_EDGES) { SpMTSpV(g, gp, x, y); } else if (gp->getOrder() == IN_EDGES) { SpMSpV(g, gp, x, y); } else if (gp->getOrder() == ALL_EDGES) { SpMTSpV(g, gp, x, y); SpMSpV(g, gp, x, y); } else { printf("Unrecognized option \n"); exit(1); } end = __rdtsc(); #ifdef __TIMING printf("SPMV time = %.3f ms \n", (end-start)/(CPU_FREQ)*1e3); #endif start = __rdtsc(); g.setAllInactive(); //update state and activity and check for convergence if needed int nout = 0; int total_search = 0; converged = 1; #pragma omp parallel num_threads(nthreads) reduction(+:nout) reduction(&:converged) reduction(+:total_search) //schedule(static) { int zero = 0; SIMDINTTYPE xmm_zero = _MM_SET1(zero); int tid = omp_get_thread_num(); int count_ones = 0; int end_of_numInts = start_vertex[tid+1]/32; if (tid == nthreads-1) end_of_numInts = y.numInts; for (int ii = start_vertex[tid]/32; ii < end_of_numInts; ii+=SIMD_WIDTH) { __m128i xmm_local_bitvec = _mm_loadu_si128((__m128i*)(y.bitvector + ii)); __m128 xmm_cmp_mask = _mm_castsi128_ps(_mm_cmpeq_epi32((xmm_local_bitvec), (xmm_zero))); int mask_value_0 = _mm_movemask_ps(xmm_cmp_mask); if(mask_value_0 == 15) { continue; } for(int i = ii; i < ii+SIMD_WIDTH; i++) { unsigned int value = y.bitvector[i]; while (value != 0) { int last_bit = _bit_scan_forward(value); int idx = i*32 + last_bit; V old_prop; old_prop = g.vertexproperty[idx]; gp->apply(y.value[idx], g.vertexproperty[idx]); nout++; if (old_prop != g.vertexproperty[idx]) { g.setActive(idx); count_ones++; converged = 0; total_search++; } value &= (~(1<<last_bit)); } } } } if (act == ALL_VERTICES) { g.setAllActive(); } #ifdef __TIMING printf("Number of vertices that changed state = %d \n", total_search); #endif end = __rdtsc(); #ifdef __TIMING printf("Apply time = %.3f ms \n", (end-start)/(CPU_FREQ)*1e3); #endif gp->do_every_iteration(it); unsigned long long int 
iteration_end = __rdtsc(); #ifdef __TIMING printf("Iteration %d :: %f msec :: updated %d vertices \n", it, (iteration_end-iteration_start)/(CPU_FREQ)*1e3, nout); #endif it++; if (it == iterations) { break; } if (iterations <= 0 && converged == 1) { break; } } unsigned long long int clear_start = __rdtsc(); delete [] start_vertex; if (rgpts == NULL) { delete px; delete py; } unsigned long long int clear_end = __rdtsc(); #ifdef __TIMING printf("GraphMat clear time = %f msec \n", (clear_end-clear_start)/(CPU_FREQ)*1e3); #endif printf("Completed %d iterations \n", it); }
int Preprocessor::autoWork() { buildWorkGroup(); int groupNum = static_cast<int>(workPool.size()); string tarSrc = para.auto_src, autoDst = para.auto_dst, sysDel = para.sys_del, tarDst = autoDst + sysDel + "src", monSrc = tarDst, monDst = autoDst + sysDel + "montage", proSrc = monDst, proDst = autoDst + sysDel + "project", resSrc = monDst, resDst = autoDst + sysDel + "resize", tarPre = para.tar_file_pre, tarPost = para.tar_file_post, monInPre = para.mon_in_pre, monInPost = para.mon_in_post; u_int xBeg = para.x_beg, xEnd = para.x_end, yBeg = para.y_beg, yEnd = para.y_end, blockWidth = para.block_width, blockHeight = para.block_height; u_int monWidth = blockWidth * (xEnd - xBeg + 1); u_int monHeght = blockHeight * (yEnd - yBeg + 1); if(_access(autoDst.c_str(), 0) == -1){ _mkdir(autoDst.c_str()); _mkdir(tarDst.c_str()); _mkdir(monDst.c_str()); _mkdir(proDst.c_str()); _mkdir(resDst.c_str()); }else{ if(_access(tarDst.c_str(), 0) == -1){ _mkdir(tarDst.c_str()); } if(_access(monDst.c_str(), 0) == -1){ _mkdir(monDst.c_str()); } if(_access(proDst.c_str(), 0) == -1){ _mkdir(proDst.c_str()); } if(_access(resDst.c_str(), 0) == -1){ _mkdir(resDst.c_str()); } } #pragma omp parallel for num_threads(para.thread_num) for(int i = 0; i < groupNum; ++i){ vector<unsigned int> &tmpGroup = workPool.at(i); int groupSize = static_cast<int>(tmpGroup.size()); u_int proIndex = 1; string startSerial, endSerial; for(int j = 0; j < groupSize; ++j){ if(proIndex > para.pro_thick){ proIndex = 1; } string tmpSerial; stringstream tmpStream; tmpStream<<setw(5)<<setfill('0')<<tmpGroup.at(j); tmpStream>>tmpSerial; tmpStream.clear(); cv::Mat inImage, proImage, resImage, monImage, blockImage, inverseRowBlockImage; if(para.image_depth == 8){ monImage = cv::Mat::zeros(monHeght, monWidth, CV_8UC1); inverseRowBlockImage = cv::Mat::zeros(blockHeight, blockWidth, CV_8UC1); }else if(para.image_depth == 16){ monImage = cv::Mat::zeros(monHeght, monWidth, CV_16UC1); inverseRowBlockImage = 
cv::Mat::zeros(blockHeight, blockWidth, CV_16UC1); } if(proIndex == 1){ startSerial = tmpSerial; if(para.image_depth == 8){ proImage = cv::Mat::zeros(monHeght, monWidth, CV_8UC1); }else if(para.image_depth == 16){ proImage = cv::Mat::zeros(monHeght, monWidth, CV_16UC1); } } string tarName = tarSrc + sysDel + tarPre + tmpSerial + tarPost; Tar tmpTar(tarName); bool tarFlag = tmpTar.untar(tarDst); if(tarFlag){ cout<<tarName<<" "<<omp_get_thread_num()<<endl; for(u_int x = xBeg; x <= xEnd; ++x){ for(u_int y = yBeg; y <= yEnd; ++y){ string monInName; tmpStream<<monSrc<<sysDel<<tmpSerial<<sysDel<<monInPre<<tmpSerial<<"_"<<setw(2)<<setfill('0')<<x<<"_"<<setw(2)<<setfill('0')<<y<<monInPost; tmpStream>>monInName; tmpStream.clear(); blockImage = cv::imread(monInName, CV_LOAD_IMAGE_UNCHANGED); if(!blockImage.data){ cout<<"----------------------"<<endl; cout<<"Image Loaded Error!"<<endl; cout<<monInName<<endl; cout<<"----------------------"<<endl; if(para.image_depth == 8){ blockImage = cv::Mat::zeros(blockHeight, blockWidth, CV_8UC1); }else if(para.image_depth == 16){ blockImage = cv::Mat::zeros(blockHeight, blockWidth, CV_16UC1); } } for(int col = 0; col < blockImage.cols; ++col){ blockImage.col(col).copyTo(inverseRowBlockImage.col(blockImage.cols - col - 1)); } cv::Rect blockRoi((x - xBeg) * blockWidth, (y - yBeg) * blockHeight, blockWidth, blockHeight); inverseRowBlockImage.copyTo(monImage(blockRoi)); } } string monOutName = monDst + sysDel + para.mon_out_pre + tmpSerial + para.mon_out_post; string resOutName = resDst + sysDel + para.res_out_pre + tmpSerial + para.res_out_post; cv::imwrite(monOutName, monImage); cout<<monOutName<<" "<<omp_get_thread_num()<<endl; cv::resize(monImage, resImage, cv::Size(), para.res_fx, para.res_fy, cv::INTER_AREA); cv::imwrite(resOutName, resImage); cout<<resOutName<<" "<<omp_get_thread_num()<<endl; proImage = cv::max(monImage, proImage); if(proIndex == para.pro_thick || j == groupSize - 1){ endSerial = tmpSerial; string proOutName = proDst + 
sysDel + para.pro_out_pre + startSerial + "-" + endSerial + para.pro_out_post; cv::imwrite(proOutName, proImage); cout<<proOutName<<" "<<omp_get_thread_num()<<endl; } }else{ cout<<"Tar Error!"<<endl; } ++proIndex; } }
int Preprocessor::montage(){ string src = para.mon_src; string dst = para.mon_dst; string sys_del = para.sys_del; unsigned int serial_beg = para.serial_beg, serial_end = para.serial_end, x_beg = para.x_beg, x_end = para.x_end, y_beg = para.y_beg, y_end = para.y_end, block_width = para.block_width, block_height = para.block_height, serial_bits = para.serial_bits, thread_num = para.thread_num; unsigned int image_depth = para.image_depth; string image_pre = para.mon_in_pre, image_post = para.mon_in_post; string out_pre = para.mon_out_pre, out_post = para.mon_out_post; #pragma omp parallel for num_threads(thread_num) for(int serial_num = int(serial_beg); serial_num <= int(serial_end); ++ serial_num){ stringstream string_buffer; string serial_string(""); cv::Mat out_image, block_image, re_block_image; cout<<serial_num<<": "<<omp_get_thread_num()<<endl; if(image_depth == 16){ out_image = cv::Mat((y_end - y_beg + 1) * block_height, (x_end - x_beg + 1) * block_width, CV_16UC1, cv::Scalar(0, 0, 0)); //cout<<out_image.rows<<" "<<out_image.cols<<" "<<out_image.depth()<<endl; block_image = cv::Mat(block_height, block_width, CV_16UC1, cv::Scalar(0, 0, 0)); re_block_image = cv::Mat(block_height, block_width, CV_16UC1, cv::Scalar(0, 0, 0)); }else{ out_image = cv::Mat((y_end - y_beg + 1) * block_height, (x_end - x_beg + 1) * block_width, CV_8UC1, cv::Scalar(0, 0, 0)); //cout<<out_image.rows<<" "<<out_image.cols<<" "<<out_image.depth()<<endl; block_image = cv::Mat(block_height, block_width, CV_8UC1, cv::Scalar(0, 0, 0)); re_block_image = cv::Mat(block_height, block_width, CV_8UC1, cv::Scalar(0, 0, 0)); } string x_str, y_str, image_str, out_image_name; string_buffer<<setw(serial_bits)<<setfill('0')<<serial_num; string_buffer>>serial_string; string_buffer.clear(); //cout<<serial_string<<endl; for(unsigned int x_in = x_beg; x_in <= x_end; ++ x_in){ for(unsigned int y_in = y_beg; y_in <= y_end; ++ y_in){ string_buffer<<setw(2)<<setfill('0')<<x_in; string_buffer>>x_str; 
string_buffer.clear(); string_buffer<<setw(2)<<setfill('0')<<y_in; string_buffer>>y_str; string_buffer.clear(); image_str = src + sys_del + serial_string + sys_del +image_pre + serial_string + "_" + x_str + "_" + y_str + image_post; block_image = cv::imread(image_str.c_str(), CV_LOAD_IMAGE_UNCHANGED); if(!block_image.data){ cout<<"----------------------"<<endl; cout<<"Image Loaded Error!"<<endl; cout<<image_str<<endl; cout<<"----------------------"<<endl; if(image_depth == 8){ block_image = cv::Mat::zeros(block_height, block_width, CV_8UC1); }else if(image_depth == 16){ block_image = cv::Mat::zeros(block_height, block_width, CV_16UC1); } } for(int y_block = 0; y_block < block_image.cols; ++ y_block){ block_image.col(y_block).copyTo(re_block_image.col(block_image.cols - y_block - 1)); } cv::Rect sub_roi((x_in - x_beg) * block_width, (y_in - y_beg) * block_height, block_width, block_height); cv::Mat sub_image(out_image, sub_roi); re_block_image.clone().copyTo(sub_image); } } out_image_name = dst + sys_del + out_pre + serial_string + out_post; cout<<out_image_name<<endl; cv::imwrite(out_image_name, out_image); } return 0; }
void Solver::TripleStates_Parallel(){ // %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% // %%%%%%%%%%%%%%% SETTING UP K_h AND K_pph %%%%%%%%%%%%%%%%%%%%%%% // %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% Kh = zeros<mat>(0,2); Khpp = zeros<mat>(0,4); int n=0; for (int i=0; i<Nholes; i++){ int Nx = basis.States(i,1); // Combining x-momentum int Ny = basis.States(i,2); // Combining y-momentum int Nz = basis.States(i,3); // Combining z-momentum int Sz = basis.States(i,4); // Combining spin // Adding a new two-hole-state configuration to matrix. (i, j, Identifier) Kh.insert_rows(n,1); Kh(n,0) = i; Kh(n,1) = Identifier(Nx,Ny,Nz,Sz); n++; } #pragma omp parallel { int id = omp_get_thread_num(); int nthreads = omp_get_num_threads(); // Setting up size for the partial Holes matrix. This size is more deeply explained in the thesis. int size = floor( Nparticles/nthreads) * Nholes*(Nparticles-1); if ( id < Nparticles%nthreads) size += Nholes*(Nparticles-1); mat partialStates = zeros<mat>(size,4); int n=0; for (int aa=id; aa<Nparticles; aa += nthreads){ for (int i=0; i<Nholes; i++){ for (int bb=0; bb<Nparticles; bb++){ if (aa != bb){ int a=aa+Nholes; int b=bb+Nholes; int Nx = basis.States(a,1) + basis.States(b,1) - basis.States(i,1); int Ny = basis.States(a,2) + basis.States(b,2) - basis.States(i,2); int Nz = basis.States(a,3) + basis.States(b,3) - basis.States(i,3); int Sz = basis.States(a,4) + basis.States(b,4) - basis.States(i,4); partialStates(n,0) = i; partialStates(n,1) = a; partialStates(n,2) = b; partialStates(n,3) = Identifier(Nx,Ny,Nz,Sz); n++; } } } } #pragma omp critical { Khpp.insert_rows(0,partialStates); } } NKh3 = Khpp.n_rows; NKh = Kh.n_rows; // %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% // %%%%%%%%%%%%%%%%% SETTING UP K_p AND K_phh STATES %%%%%%%%%%%%%%%%%%%%%% // %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% Kp = zeros<mat>(0,2); Kphh = zeros<mat>(0,4); n=0; for (int 
aa=0; aa<Nparticles; aa++){ int a = aa+Nholes; int Nx = basis.States(a,1); // Combining x-momentum int Ny = basis.States(a,2); // Combining y-momentum int Nz = basis.States(a,3); // Combining z-momentum int Sz = basis.States(a,4); // Combining spin // Adding a new two-hole-state configuration to matrix. (i, j, Identifier) Kp.insert_rows(n,1); Kp(n,0) = a; Kp(n,1) = Identifier(Nx,Ny,Nz,Sz); n++; } #pragma omp parallel { int id = omp_get_thread_num(); int nthreads = omp_get_num_threads(); // Setting up size for the partial Holes matrix. This size is more deeply explained in the thesis. int size = floor( Nholes/nthreads) * Nparticles*(Nholes-1); if ( id < Nholes%nthreads) size += Nparticles*(Nholes-1); mat partialStates = zeros<mat>(size,4); int n=0; for (int i=id; i<Nholes; i+=nthreads){ for (int j=0; j<Nholes; j++){ for (int aa=0; aa<Nparticles; aa++){ if (i != j){ int a=aa+Nholes; int Nx = basis.States(i,1) + basis.States(j,1) - basis.States(a,1); int Ny = basis.States(i,2) + basis.States(j,2) - basis.States(a,2); int Nz = basis.States(i,3) + basis.States(j,3) - basis.States(a,3); int Sz = basis.States(i,4) + basis.States(j,4) - basis.States(a,4); partialStates(n,0) = a; partialStates(n,1) = i; partialStates(n,2) = j; partialStates(n,3) = Identifier(Nx,Ny,Nz,Sz); n++; } } } } #pragma omp critical { Kphh.insert_rows(0,partialStates); } } NKp3 = Kphh.n_rows; NKp = Kp.n_rows; }
void Solver::DirectStates_Parallel(){ // %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% // %%%%%%%%%%%% SETTING UP DIRECT STATES %%%%%%%%%%%%%%%%%%%%%%% // %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% Holes = zeros<mat>(0,3); #pragma omp parallel { int id = omp_get_thread_num(); int nthreads = omp_get_num_threads(); // Setting up size for the partial Holes matrix. This size is more deeply explained in the thesis. int size = floor( Nholes/nthreads) * (Nholes-1); if ( id < Nholes%nthreads) size += Nholes - 1; mat partialStates = zeros<mat>(size,3); int n=0; // n will count how many two-state combinations we find. Used as indice in the matrix for (int i=id; i<Nholes; i += nthreads){ for (int j=0; j<Nholes; j++){ if (i != j){ // Pauli principle demands that the particles must be unequal // Setting up direct channels for holes // Two-hole momentum and spin int Nx = basis.States(i,1) + basis.States(j,1); // Combining x-momentum int Ny = basis.States(i,2) + basis.States(j,2); // Combining y-momentum int Nz = basis.States(i,3) + basis.States(j,3); // Combining z-momentum int Sz = basis.States(i,4) + basis.States(j,4); // Combining spin // Adding a new two-hole-state configuration to matrix. (i, j, Identifier) partialStates(n,0) = i; partialStates(n,1) = j; partialStates(n,2) = Identifier(Nx,Ny,Nz,Sz); n++; } } } #pragma omp critical Holes.insert_rows(0,partialStates); } Particles = zeros<mat>(0,3); #pragma omp parallel { int id = omp_get_thread_num(); int nthreads = omp_get_num_threads(); // Setting up size for the partial Holes matrix. This size is more deeply explained in the thesis. int size = floor( Nparticles/nthreads) * (Nparticles-1); if ( id < Nparticles%nthreads) size += Nparticles - 1; mat partialStates = zeros<mat>(size,3); int n=0; // n will count how many two-state combinations we find. 
Used as indice in the matrix for (int aa=id; aa<Nparticles; aa+=nthreads){ for (int bb=0; bb<Nparticles; bb++){ if (aa != bb){ int a=aa+Nholes; int b=bb+Nholes; int Nx = basis.States(a,1) + basis.States(b,1); int Ny = basis.States(a,2) + basis.States(b,2); int Nz = basis.States(a,3) + basis.States(b,3); int Sz = basis.States(a,4) + basis.States(b,4); partialStates(n,0) = a; partialStates(n,1) = b; partialStates(n,2) = Identifier(Nx,Ny,Nz,Sz); n++; } } } #pragma omp critical Particles.insert_rows(0,partialStates); } NPARTICLES = Particles.n_rows; NHOLES = Holes.n_rows; }
int blake2sp( void *out, size_t outlen, const void *in, size_t inlen, const void *key, size_t keylen ) { uint8_t hash[PARALLELISM_DEGREE][BLAKE2S_OUTBYTES]; blake2s_state S[PARALLELISM_DEGREE][1]; blake2s_state FS[1]; size_t i; /* Verify parameters */ if ( NULL == in && inlen > 0 ) return -1; if ( NULL == out ) return -1; if ( NULL == key && keylen > 0) return -1; if( !outlen || outlen > BLAKE2S_OUTBYTES ) return -1; if( keylen > BLAKE2S_KEYBYTES ) return -1; for( i = 0; i < PARALLELISM_DEGREE; ++i ) if( blake2sp_init_leaf( S[i], outlen, keylen, i ) < 0 ) return -1; S[PARALLELISM_DEGREE - 1]->last_node = 1; /* mark last node */ if( keylen > 0 ) { uint8_t block[BLAKE2S_BLOCKBYTES]; memset( block, 0, BLAKE2S_BLOCKBYTES ); memcpy( block, key, keylen ); for( i = 0; i < PARALLELISM_DEGREE; ++i ) blake2s_update( S[i], block, BLAKE2S_BLOCKBYTES ); secure_zero_memory( block, BLAKE2S_BLOCKBYTES ); /* Burn the key from stack */ } #if defined(_OPENMP) #pragma omp parallel shared(S,hash), num_threads(PARALLELISM_DEGREE) #else for( i = 0; i < PARALLELISM_DEGREE; ++i ) #endif { #if defined(_OPENMP) size_t i = omp_get_thread_num(); #endif size_t inlen__ = inlen; const unsigned char *in__ = ( const unsigned char * )in; in__ += i * BLAKE2S_BLOCKBYTES; while( inlen__ >= PARALLELISM_DEGREE * BLAKE2S_BLOCKBYTES ) { blake2s_update( S[i], in__, BLAKE2S_BLOCKBYTES ); in__ += PARALLELISM_DEGREE * BLAKE2S_BLOCKBYTES; inlen__ -= PARALLELISM_DEGREE * BLAKE2S_BLOCKBYTES; } if( inlen__ > i * BLAKE2S_BLOCKBYTES ) { const size_t left = inlen__ - i * BLAKE2S_BLOCKBYTES; const size_t len = left <= BLAKE2S_BLOCKBYTES ? left : BLAKE2S_BLOCKBYTES; blake2s_update( S[i], in__, len ); } blake2s_final( S[i], hash[i], BLAKE2S_OUTBYTES ); } if( blake2sp_init_root( FS, outlen, keylen ) < 0 ) return -1; FS->last_node = 1; for( i = 0; i < PARALLELISM_DEGREE; ++i ) blake2s_update( FS, hash[i], BLAKE2S_OUTBYTES ); return blake2s_final( FS, out, outlen ); }
inline void K_point::generate_fv_states() { PROFILE_WITH_TIMER("sirius::K_point::generate_fv_states"); if (!ctx_.full_potential()) { return; } mdarray<double_complex, 2> pw_coeffs; mdarray<double_complex, 2> mt_coeffs; int nbnd_loc; /* in both cases eigen-vectors are redistributed to the same "full column" storage */ if (ctx_.iterative_solver_input_section().type_ == "exact") { fv_eigen_vectors_->remap_forward(0, ctx_.num_fv_states()); /* local number of bands */ nbnd_loc = fv_eigen_vectors_->spl_num_col().local_size(); if (nbnd_loc) { pw_coeffs = mdarray<double_complex, 2>(fv_eigen_vectors_->extra().at<CPU>(), gklo_basis_size(), nbnd_loc); mt_coeffs = mdarray<double_complex, 2>(fv_eigen_vectors_->extra().at<CPU>(num_gkvec(), 0), gklo_basis_size(), nbnd_loc); } } else { fv_eigen_vectors_slab_->remap_to_full_column_distr(ctx_.num_fv_states()); assert(fv_eigen_vectors_slab_->pw_coeffs().spl_num_col().local_size() == fv_eigen_vectors_slab_->mt_coeffs().spl_num_col().local_size()); /* local number of bands */ nbnd_loc = fv_eigen_vectors_slab_->pw_coeffs().spl_num_col().local_size(); if (nbnd_loc) { pw_coeffs = mdarray<double_complex, 2>(fv_eigen_vectors_slab_->pw_coeffs().extra().at<CPU>(), num_gkvec(), nbnd_loc); mt_coeffs = mdarray<double_complex, 2>(fv_eigen_vectors_slab_->mt_coeffs().extra().at<CPU>(), unit_cell_.mt_lo_basis_size(), nbnd_loc); } } #ifdef __GPU if (ctx_.processing_unit() == GPU) { pw_coeffs.allocate(memory_t::device); pw_coeffs.copy_to_device(); } #endif fv_states().prepare_full_column_distr(ctx_.num_fv_states()); assert(nbnd_loc == fv_states().pw_coeffs().spl_num_col().local_size()); assert(nbnd_loc == fv_states().mt_coeffs().spl_num_col().local_size()); #pragma omp parallel { /* get thread id */ #ifdef __GPU int tid = omp_get_thread_num(); #endif mdarray<double_complex, 2> alm(num_gkvec(), unit_cell_.max_mt_aw_basis_size(), memory_t::host_pinned); mdarray<double_complex, 2> tmp; #ifdef __GPU if (ctx_.processing_unit() == GPU) { 
alm.allocate(memory_t::device); tmp = mdarray<double_complex, 2>(unit_cell_.max_mt_aw_basis_size(), nbnd_loc, memory_t::device); } #endif #pragma omp for for (int ia = 0; ia < unit_cell_.num_atoms(); ia++) { /* number of alm coefficients for atom */ int mt_aw_size = unit_cell_.atom(ia).mt_aw_basis_size(); /* offset in wave-function */ int offset_wf = unit_cell_.atom(ia).offset_mt_coeffs(); /* generate matching coefficients for all G-vectors */ alm_coeffs_->generate(ia, alm); /* compute F(lm, i) = A(lm, G)^{T} * evec(G, i) for a single atom */ if (ctx_.processing_unit() == CPU) { /* multiply eigen-vectors and matching coefficients */ linalg<CPU>::gemm(1, 0, mt_aw_size, nbnd_loc, num_gkvec(), alm.at<CPU>(), alm.ld(), pw_coeffs.at<CPU>(), pw_coeffs.ld(), fv_states().mt_coeffs().extra().at<CPU>(offset_wf, 0), fv_states().mt_coeffs().extra().ld()); } #ifdef __GPU if (ctx_.processing_unit() == GPU) { /* multiply eigen-vectors and matching coefficients */ alm.async_copy_to_device(tid); linalg<GPU>::gemm(1, 0, mt_aw_size, nbnd_loc, num_gkvec(), alm.at<GPU>(), alm.ld(), pw_coeffs.at<GPU>(), pw_coeffs.ld(), tmp.at<GPU>(), tmp.ld(), tid); acc::copyout(fv_states().mt_coeffs().extra().at<CPU>(offset_wf, 0), fv_states().mt_coeffs().extra().ld(), tmp.at<GPU>(), tmp.ld(), mt_aw_size, nbnd_loc, tid); acc::sync_stream(tid); } #endif for (int i = 0; i < nbnd_loc; i++) { /* lo block */ std::memcpy(fv_states().mt_coeffs().extra().at<CPU>(offset_wf + mt_aw_size, i), mt_coeffs.at<CPU>(unit_cell_.atom(ia).offset_lo(), i), unit_cell_.atom(ia).mt_lo_basis_size() * sizeof(double_complex)); } } #pragma omp for for (int i = 0; i < nbnd_loc; i++) { /* G+k block */ std::memcpy(fv_states().pw_coeffs().extra().at<CPU>(0, i), pw_coeffs.at<CPU>(0, i), num_gkvec() * sizeof(double_complex)); } } fv_states().remap_to_prime_distr(ctx_.num_fv_states()); }
int main(int argc,char **argv) { PetscErrorCode ierr; PetscInt i,j,k,N=100,**counters,tsize; PetscInitialize(&argc,&argv,(char *)0,help); ierr = PetscThreadCommView(PETSC_COMM_WORLD,PETSC_VIEWER_STDOUT_WORLD); CHKERRQ(ierr); ierr = PetscOptionsGetInt(PETSC_NULL,"-N",&N,PETSC_NULL); CHKERRQ(ierr); ierr = PetscThreadCommGetNThreads(PETSC_COMM_WORLD,&tsize); CHKERRQ(ierr); ierr = PetscMalloc(tsize*sizeof(*counters),&counters); CHKERRQ(ierr); ierr = PetscThreadCommRunKernel(PETSC_COMM_WORLD,(PetscThreadKernel)CounterInit_kernel,1,counters); CHKERRQ(ierr); for (i=0; i<10; i++) { PetscReal t0,t1; ierr = PetscThreadCommBarrier(PETSC_COMM_WORLD); CHKERRQ(ierr); ierr = PetscGetTime(&t0); CHKERRQ(ierr); for (j=0; j<N; j++) { /* ierr = PetscThreadCommRunKernel(PETSC_COMM_WORLD,(PetscThreadKernel)CounterIncrement_kernel,1,counters);CHKERRQ(ierr); */ ierr = PetscThreadCommRunKernel1(PETSC_COMM_WORLD,(PetscThreadKernel)CounterIncrement_kernel,counters); CHKERRQ(ierr); } ierr = PetscThreadCommBarrier(PETSC_COMM_WORLD); CHKERRQ(ierr); ierr = PetscGetTime(&t1); CHKERRQ(ierr); ierr = PetscPrintf(PETSC_COMM_WORLD,"Time per kernel: %g us\n",1e6*(t1-t0)/N); CHKERRQ(ierr); } for (i=0; i<10; i++) { PetscReal t0,t1; ierr = PetscThreadCommBarrier(PETSC_COMM_WORLD); CHKERRQ(ierr); ierr = PetscGetTime(&t0); CHKERRQ(ierr); for (j=0; j<N; j++) { #pragma omp parallel num_threads(tsize) { PetscInt trank = omp_get_thread_num(); CounterIncrement_kernel(trank,counters); } } ierr = PetscThreadCommBarrier(PETSC_COMM_WORLD); CHKERRQ(ierr); ierr = PetscGetTime(&t1); CHKERRQ(ierr); ierr = PetscPrintf(PETSC_COMM_WORLD,"OpenMP inline time per kernel: %g us\n",1e6*(t1-t0)/N); CHKERRQ(ierr); } for (i=0; i<10; i++) { PetscReal t0,t1; ierr = PetscGetTime(&t0); CHKERRQ(ierr); for (j=0; j<N; j++) { CounterIncrement_kernel(0,counters); } ierr = PetscGetTime(&t1); CHKERRQ(ierr); ierr = PetscPrintf(PETSC_COMM_WORLD,"Serial inline time per single kernel: %g us\n",1e6*(t1-t0)/N); CHKERRQ(ierr); } for (i=0; i<10; 
i++) { PetscReal t0,t1; ierr = PetscGetTime(&t0); CHKERRQ(ierr); for (j=0; j<N; j++) { for (k=0; k<tsize; k++) CounterIncrement_kernel(k,counters); } ierr = PetscGetTime(&t1); CHKERRQ(ierr); ierr = PetscPrintf(PETSC_COMM_WORLD,"Serial inline time per kernel: %g us\n",1e6*(t1-t0)/N); CHKERRQ(ierr); } ierr = PetscThreadCommRunKernel(PETSC_COMM_WORLD,(PetscThreadKernel)CounterFree_kernel,1,counters); CHKERRQ(ierr); ierr = PetscThreadCommBarrier(PETSC_COMM_WORLD); CHKERRQ(ierr); ierr = PetscFree(counters); CHKERRQ(ierr); PetscFinalize(); return 0; }
/** * @brief Computes the total source (fission and scattering) in each FSR. * @details This method computes the total source in each FSR based on * this iteration's current approximation to the scalar flux. A * residual for the source with respect to the source compute on * the previous iteration is computed and returned. The residual * is determined as follows: * \f$ res = \sqrt{\frac{\displaystyle\sum \displaystyle\sum * \left(\frac{Q^i - Q^{i-1}}{Q^i}\right)^2}{\# FSRs}} \f$ * * @return the residual between this source and the previous source */ FP_PRECISION CPUSolver::computeFSRSources() { int tid; Material* material; FP_PRECISION scatter_source; FP_PRECISION fission_source; FP_PRECISION fsr_fission_source; FP_PRECISION* nu_sigma_f; FP_PRECISION* sigma_s; FP_PRECISION* sigma_t; FP_PRECISION* chi; FP_PRECISION source_residual = 0.0; FP_PRECISION inverse_k_eff = 1.0 / _k_eff; /* For all FSRs, find the source */ #pragma omp parallel for private(tid, material, nu_sigma_f, chi, \ sigma_s, sigma_t, fission_source, scatter_source, fsr_fission_source) \ schedule(guided) for (int r=0; r < _num_FSRs; r++) { tid = omp_get_thread_num(); material = _FSR_materials[r]; nu_sigma_f = material->getNuSigmaF(); chi = material->getChi(); sigma_s = material->getSigmaS(); sigma_t = material->getSigmaT(); /* Initialize the source residual to zero */ _source_residuals[r] = 0.; fsr_fission_source = 0.0; /* Compute fission source for each group */ if (material->isFissionable()) { for (int e=0; e < _num_groups; e++) _fission_sources(r,e) = _scalar_flux(r,e) * nu_sigma_f[e]; fission_source = pairwise_sum<FP_PRECISION>(&_fission_sources(r,0), _num_groups); fission_source *= inverse_k_eff; } else fission_source = 0.0; /* Compute total scattering source for group G */ for (int G=0; G < _num_groups; G++) { scatter_source = 0; for (int g=0; g < _num_groups; g++) _scatter_sources(tid,g) = material->getSigmaSByGroupInline(g,G) * _scalar_flux(r,g); 
scatter_source=pairwise_sum<FP_PRECISION>(&_scatter_sources(tid,0), _num_groups); /* Set the fission source for FSR r in group G */ fsr_fission_source += fission_source * chi[G]; /* Set the reduced source for FSR r in group G */ _reduced_sources(r,G) = (fission_source * chi[G] + scatter_source) * ONE_OVER_FOUR_PI / sigma_t[G]; } /* Compute the norm of residual of the source in the FSR */ if (fsr_fission_source > 0.0) _source_residuals[r] = pow((fsr_fission_source - _old_fission_sources[r]) / fsr_fission_source, 2); /* Update the old source */ _old_fission_sources[r] = fsr_fission_source; } /* Sum up the residuals from each FSR */ source_residual = pairwise_sum<FP_PRECISION>(_source_residuals, _num_FSRs); source_residual = sqrt(source_residual \ / (_num_fissionable_FSRs * _num_groups)); return source_residual; }
int main(int argc, char** argv) { set_program_options(opts, argc, argv); ///Carica l'opportuna struttura di adiacenza, selezionata da linea di comando if (opts.topologia == TORO_2D) topologia = adiacenza_toroidal_lattice(opts.lato); else if (opts.topologia == LINEARE){ opts.partition_type = LINEAR_PARTITION; topologia = adiacenza_simple_line(opts.seq_len); } else { printf("Not supported topology\n"); exit(1); } opts.seq_len = topologia.N; //logarithm lookup table, 6x program speedup mylog = new double[3 * opts.seq_len + 10]; for (int i = 1; i < 3 * opts.seq_len + 10; i++) mylog[i] = log(i); mylog[0] = 0; myexp = new double[100]; for (int i = 0; i < 100; i++) myexp[i] = exp(- opts.beta[0] * i); double media_globale = 0; double media_globale_n2 = 0; double media_rid_globale = 0; double media_rid_globale_n2 = 0; int n_estrazioni = 100; int runs = 0; // <editor-fold defaultstate="collapsed" desc="Sequenze monodimensionali"> if (opts.topologia == LINEARE) #pragma omp parallel { linear_partition *partitions = new linear_partition[opts.n_seq]; int *buf_sequenze = new int[opts.n_seq * opts.seq_len]; distance d(opts.seq_len); RandMT generatore; for (int L = 0; L < n_estrazioni; L++) { // Generazione di un nuovo vettore J_ij random // e di opts.n_seq sequenze che hanno quel J double media_locale = 0; double media_locale_n2 = 0; double media_rid_locale = 0; double media_rid_locale_n2 = 0; ising_entries_jnorm(opts, buf_sequenze, generatore); //riempi le partizioni, a partire dalle sequenze date for (int i = 0; i < opts.n_seq; i++) partitions[i].fill(&buf_sequenze[i * opts.seq_len], opts.seq_len); //media delle distanze tra le coppie di sequenze generate //#pragma omp parallel for firstprivate(d) schedule(dynamic,10) reduction(+: media_n, media_n2) for (int i = 0; i < opts.n_seq; i++) { for (int j = i + 1; j < opts.n_seq; j++) { d.dist(partitions[i], partitions[j]); media_locale += d.dist_shan; media_rid_locale += d.dist_shan_r; media_locale_n2 += (d.dist_shan)*(d.dist_shan); 
media_rid_locale_n2 += (d.dist_shan_r)*(d.dist_shan_r); } } #pragma omp critical { media_globale += media_locale; media_globale_n2 += media_locale_n2; media_rid_globale += media_rid_locale; media_rid_globale_n2 += media_rid_locale_n2; runs += 1; } } }// </editor-fold> // <editor-fold defaultstate="collapsed" desc="Reticoli bidimensionali"> if (opts.topologia == TORO_2D) { std::clock_t start = std::clock(); double time_diff; double completed_ratio; #pragma omp parallel num_threads(opts.threads) { general_partition *partitions = new general_partition[opts.n_seq]; distance d(opts.seq_len); RandMT generatore; for (int L = 0; L < n_estrazioni; L++) { // Generazione di un nuovo vettore J_ij random // e di opts.n_seq sequenze che hanno quel J double media_locale = 0; double media_locale_n2 = 0; double media_rid_locale = 0; double media_rid_locale_n2 = 0; ising_lattice(opts, generatore, partitions); //media delle distanze tra le coppie di sequenze generate //#pragma omp parallel for firstprivate(d) schedule(dynamic,10) reduction(+: media_n, media_n2) for (int i = 0; i < opts.n_seq; i++) { for (int j = i + 1; j < opts.n_seq; j++) { d(partitions[i], partitions[j]); media_locale += d.dist_shan; media_rid_locale += d.dist_shan_r; media_locale_n2 += (d.dist_shan)*(d.dist_shan); media_rid_locale_n2 += (d.dist_shan_r)*(d.dist_shan_r); } } #pragma omp critical { media_globale += media_locale; media_globale_n2 += media_locale_n2; media_rid_globale += media_rid_locale; media_rid_globale_n2 += media_rid_locale_n2; runs += 1; } #ifdef _OPENMP int this_thread = omp_get_thread_num(); if (this_thread) continue; double time_ratio = omp_get_num_threads(); #else double time_ratio = 1.0; #endif fprintf(stderr, "\r"); time_diff = (std::clock() - start) / (double) CLOCKS_PER_SEC / time_ratio; completed_ratio = (L + 1.0) / n_estrazioni; fprintf(stderr, "%.1f%% done, ETA %.0fs ", completed_ratio * 100, ceil(time_diff * (1 / completed_ratio - 1))); fflush(stderr); } } time_diff = (std::clock() - 
start) / (double) CLOCKS_PER_SEC; fprintf(stderr, "\r100%% done in %.1f seconds of CPU time\n", time_diff); }// </editor-fold> double varianza_n, varianza_r; int Nd = runs * (opts.n_seq * (opts.n_seq - 1)) / 2; media_globale /= Nd; media_globale_n2 /= Nd; media_rid_globale /= Nd; media_rid_globale_n2 /= Nd; varianza_n = media_globale_n2 - media_globale*media_globale; varianza_r = media_rid_globale_n2 - media_rid_globale*media_rid_globale; int lunghezza; if(opts.topologia == TORO_2D) lunghezza=opts.lato; else lunghezza=opts.seq_len; printf("%d %f %f %f %f\n", lunghezza, media_globale, varianza_n, media_rid_globale, varianza_r); //fprintf(stderr, "%d %f %f\n", opts.seq_len, media_globale, varianza_n); return 0; }
/** * @brief Compute \f$ k_{eff} \f$ from the total, fission and scattering * reaction rates and leakage. * @details This method computes the current approximation to the * multiplication factor on this iteration as follows: * \f$ k_{eff} = \frac{\displaystyle\sum_{i \in I} * \displaystyle\sum_{g \in G} \nu \Sigma^F_g \Phi V_{i}} * {\displaystyle\sum_{i \in I} * \displaystyle\sum_{g \in G} (\Sigma^T_g \Phi V_{i} - * \Sigma^S_g \Phi V_{i} - L_{i,g})} \f$ */ void CPUSolver::computeKeff() { int tid; Material* material; FP_PRECISION* sigma; FP_PRECISION volume; FP_PRECISION total = 0.0; FP_PRECISION fission = 0.0; FP_PRECISION scatter = 0.0; FP_PRECISION* FSR_rates = new FP_PRECISION[_num_FSRs]; FP_PRECISION* group_rates = new FP_PRECISION[_num_threads * _num_groups]; /* Loop over all FSRs and compute the volume-weighted total rates */ #pragma omp parallel for private(tid, volume, \ material, sigma) schedule(guided) for (int r=0; r < _num_FSRs; r++) { tid = omp_get_thread_num() * _num_groups; volume = _FSR_volumes[r]; material = _FSR_materials[r]; sigma = material->getSigmaT(); for (int e=0; e < _num_groups; e++) group_rates[tid+e] = sigma[e] * _scalar_flux(r,e); FSR_rates[r]=pairwise_sum<FP_PRECISION>(&group_rates[tid], _num_groups); FSR_rates[r] *= volume; } /* Reduce total rates across FSRs */ total = pairwise_sum<FP_PRECISION>(FSR_rates, _num_FSRs); /* Loop over all FSRs and compute the volume-weighted fission rates */ #pragma omp parallel for private(tid, volume, \ material, sigma) schedule(guided) for (int r=0; r < _num_FSRs; r++) { tid = omp_get_thread_num() * _num_groups; volume = _FSR_volumes[r]; material = _FSR_materials[r]; sigma = material->getNuSigmaF(); for (int e=0; e < _num_groups; e++) group_rates[tid+e] = sigma[e] * _scalar_flux(r,e); FSR_rates[r]=pairwise_sum<FP_PRECISION>(&group_rates[tid], _num_groups); FSR_rates[r] *= volume; } /* Reduce fission rates across FSRs */ fission = pairwise_sum<FP_PRECISION>(FSR_rates, _num_FSRs); /* Loop over all FSRs 
and compute the volume-weighted scattering rates */ #pragma omp parallel for private(tid, volume, \ material) schedule(guided) for (int r=0; r < _num_FSRs; r++) { tid = omp_get_thread_num() * _num_groups; volume = _FSR_volumes[r]; material = _FSR_materials[r]; FSR_rates[r] = 0.; for (int G=0; G < _num_groups; G++) { for (int g=0; g < _num_groups; g++) group_rates[tid+g] = material->getSigmaSByGroupInline(g,G) * _scalar_flux(r,g); FSR_rates[r]+=pairwise_sum<FP_PRECISION>(&group_rates[tid], _num_groups); } FSR_rates[r] *= volume; } /* Reduce scattering rates across FSRs */ scatter = pairwise_sum<FP_PRECISION>(FSR_rates, _num_FSRs); /* Reduce leakage array across Tracks, energy groups, polar angles */ int size = 2 * _tot_num_tracks * _polar_times_groups; _leakage = pairwise_sum<FP_PRECISION>(_boundary_leakage, size) * 0.5; _k_eff = fission / (total - scatter + _leakage); log_printf(DEBUG, "tot = %f, fiss = %f, scatt = %f, leakage = %f," "k_eff = %f", total, fission, scatter, _leakage, _k_eff); delete [] FSR_rates; delete [] group_rates; return; }
/* Elapsed wall-clock time between two gettimeofday() samples, in seconds. */
static double elapsed_seconds(const struct timeval *start,
                              const struct timeval *end)
{
    return 1.0e-6*(end->tv_usec - start->tv_usec) +
           (end->tv_sec - start->tv_sec);
}

/*
 * False-sharing demonstration.
 *
 * Two OpenMP sections repeatedly update disjoint elements of the same
 * array. In the first experiment each section owns one contiguous half of
 * the array; in the second the sections own the even and the odd indices
 * respectively, so their elements are interleaved. Each section prints its
 * own elapsed time, and the sum of the array is printed after each
 * experiment.
 *
 * The const locals `n` and `nr_runs` are passed with firstprivate: under
 * default(none), compilers following OpenMP 4.0 or later reject const
 * variables that are referenced without a data-sharing clause, while older
 * compilers reject them in a shared clause; firstprivate is accepted by both.
 */
int main()
{
    const int nr_threads = 2;
    const int n = N;
    const int nr_runs = 20000000;
    double a[n], sum = 0.0;
    int j;

    omp_set_dynamic(0);
    omp_set_num_threads(nr_threads);

    /* Experiment 1: each section works on its own contiguous half */
#pragma omp parallel default(none) shared(a) firstprivate(n, nr_runs)
    {
#pragma omp sections
        {
#pragma omp section
            {
                struct timeval tv1, tv2;
                int i, run_nr;
                int thread_nr = omp_get_thread_num();
                for (i = 0; i < n/2; i += 1)
                    a[i] = 0.0;
                gettimeofday(&tv1, NULL);
                for (run_nr = 0; run_nr < nr_runs; run_nr++)
                    for (i = 0; i < n/2; i += 1)
                        a[i] += i;
                gettimeofday(&tv2, NULL);
                printf("thread %d: %.6f\n", thread_nr,
                       elapsed_seconds(&tv1, &tv2));
            }
#pragma omp section
            {
                struct timeval tv1, tv2;
                int i, run_nr;
                int thread_nr = omp_get_thread_num();
                for (i = n/2; i < n; i += 1)
                    a[i] = 0.0;
                gettimeofday(&tv1, NULL);
                for (run_nr = 0; run_nr < nr_runs; run_nr++)
                    for (i = n/2; i < n; i += 1)
                        a[i] += i;
                gettimeofday(&tv2, NULL);
                printf("thread %d: %.6f\n", thread_nr,
                       elapsed_seconds(&tv1, &tv2));
            }
        }
    }
    sum = 0.0;
    for (j = 0; j < n; j++)
        sum += a[j];
    printf("no false sharing: %.1lf\n", sum);

    /* Experiment 2: the sections interleave (even vs. odd indices) */
#pragma omp parallel default(none) shared(a) firstprivate(n, nr_runs)
    {
#pragma omp sections
        {
#pragma omp section
            {
                struct timeval tv1, tv2;
                int i, run_nr;
                int thread_nr = omp_get_thread_num();
                for (i = 0; i < n; i += 2)
                    a[i] = 0.0;
                gettimeofday(&tv1, NULL);
                for (run_nr = 0; run_nr < nr_runs; run_nr++)
                    for (i = 0; i < n; i += 2)
                        a[i] += i;
                gettimeofday(&tv2, NULL);
                printf("thread %d: %.6f\n", thread_nr,
                       elapsed_seconds(&tv1, &tv2));
            }
#pragma omp section
            {
                struct timeval tv1, tv2;
                int i, run_nr;
                int thread_nr = omp_get_thread_num();
                for (i = 1; i < n; i += 2)
                    a[i] = 0.0;
                gettimeofday(&tv1, NULL);
                for (run_nr = 0; run_nr < nr_runs; run_nr++)
                    for (i = 1; i < n; i += 2)
                        a[i] += i;
                gettimeofday(&tv2, NULL);
                printf("thread %d: %.6f\n", thread_nr,
                       elapsed_seconds(&tv1, &tv2));
            }
        }
    }
    sum = 0.0;
    for (j = 0; j < n; j++)
        sum += a[j];
    printf("false sharing: %.1lf\n", sum);

    return EXIT_SUCCESS;
}
/** * @brief This method performs one transport sweep of all azimuthal angles, * Tracks, Track segments, polar angles and energy groups. * @details The method integrates the flux along each Track and updates the * boundary fluxes for the corresponding output Track, while updating * the scalar flux in each flat source region. */ void CPUSolver::transportSweep() { int tid; int min_track, max_track; Track* curr_track; int azim_index; int num_segments; segment* curr_segment; segment* segments; FP_PRECISION* track_flux; log_printf(DEBUG, "Transport sweep with %d OpenMP threads", _num_threads); /* Initialize flux in each FSr to zero */ flattenFSRFluxes(0.0); if (_cmfd != NULL && _cmfd->isFluxUpdateOn()) zeroSurfaceCurrents(); /* Loop over azimuthal angle halfspaces */ for (int i=0; i < 2; i++) { /* Compute the minimum and maximum Track IDs corresponding to * this azimuthal angular halfspace */ min_track = i * (_tot_num_tracks / 2); max_track = (i + 1) * (_tot_num_tracks / 2); /* Loop over each thread within this azimuthal angle halfspace */ #pragma omp parallel for private(curr_track, azim_index, num_segments, \ curr_segment, segments, track_flux, tid) schedule(guided) for (int track_id=min_track; track_id < max_track; track_id++) { tid = omp_get_thread_num(); /* Use local array accumulator to prevent false sharing*/ FP_PRECISION* thread_fsr_flux; thread_fsr_flux = new FP_PRECISION[_num_groups]; /* Initialize local pointers to important data structures */ curr_track = _tracks[track_id]; azim_index = curr_track->getAzimAngleIndex(); num_segments = curr_track->getNumSegments(); segments = curr_track->getSegments(); track_flux = &_boundary_flux(track_id,0,0,0); /* Loop over each Track segment in forward direction */ for (int s=0; s < num_segments; s++) { curr_segment = &segments[s]; scalarFluxTally(curr_segment, azim_index, track_flux, thread_fsr_flux, true); } /* Transfer boundary angular flux to outgoing Track */ transferBoundaryFlux(track_id, azim_index, true, 
track_flux); /* Loop over each Track segment in reverse direction */ track_flux += _polar_times_groups; for (int s=num_segments-1; s > -1; s--) { curr_segment = &segments[s]; scalarFluxTally(curr_segment, azim_index, track_flux, thread_fsr_flux, false); } delete thread_fsr_flux; /* Transfer boundary angular flux to outgoing Track */ transferBoundaryFlux(track_id, azim_index, false, track_flux); } } return; }
/**
 * @brief Initializes the SpringApp instance
 *
 * Runs the start-up sequence in a fixed order: platform threading setup,
 * class system and crash handler, command line parsing, thread naming,
 * watchdog, file system, window creation, input handlers, global
 * structures, OpenGL/GUI/font setup, sound and joystick, CPU affinity,
 * and finally Startup().
 *
 * @return whether initialization was successful; false is returned when
 *         XInitThreads() fails (X11 builds only) or when InitWindow() fails
 */
bool SpringApp::Initialize()
{
#if !(defined(WIN32) || defined(__APPLE__) || defined(HEADLESS))
	//! this MUST run before any other X11 call (esp. those by SDL!)
	//! we need it to make calls to X11 threadsafe
	if (!XInitThreads()) {
		LOG_L(L_FATAL, "Xlib is not thread safe");
		return false;
	}
#endif

#if defined(_WIN32) && defined(__GNUC__)
	// load QTCreator's gdb helper dll; a variant of this should also work on other OSes
	{
		// don't display a dialog box if gdb helpers aren't found;
		// the previous error mode is restored right after the load attempt
		UINT olderrors = SetErrorMode(SEM_FAILCRITICALERRORS);
		if (LoadLibrary("gdbmacros.dll")) {
			LOG("QT Creator's gdbmacros.dll loaded");
		}
		SetErrorMode(olderrors);
	}
#endif

	// Initialize class system
	creg::System::InitializeClasses();

	// Initialize crash reporting
	CrashHandler::Install();

	globalRendering = new CGlobalRendering();

	ParseCmdLine();
	CMyMath::Init();
	good_fpu_control_registers("::Run");

	// log OS version
	LOG("OS: %s", Platform::GetOS().c_str());
	if (Platform::Is64Bit())
		LOG("OS: 64bit native mode");
	else if (Platform::Is32BitEmulation())
		LOG("OS: emulated 32bit mode");
	else
		LOG("OS: 32bit native mode");

	// Rename Threads
	// We give the process itself the name `unknown`, htop & co. will still show the binary's name.
	// But all child threads copy by default the name of their parent, so all threads that don't set
	// their name themselves will show up as 'unknown'.
	Threading::SetThreadName("unknown");
#ifdef _OPENMP
	// Name every OpenMP thread except thread 0 "omp<N>", N being its thread number
	#pragma omp parallel
	{
		int i = omp_get_thread_num();
		if (i != 0) { // 0 is the source thread
			std::ostringstream buf;
			buf << "omp" << i;
			Threading::SetThreadName(buf.str().c_str());
		}
	}
#endif

	// Install Watchdog
	Watchdog::Install();
	Watchdog::RegisterThread(WDT_MAIN, true);

	FileSystemInitializer::Initialize();

	// Create Window; on failure SDL is shut down and initialization aborts
	if (!InitWindow(("Spring " + SpringVersion::GetSync()).c_str())) {
		SDL_Quit();
		return false;
	}

	mouseInput = IMouseInput::GetInstance();
	keyInput = KeyInput::GetInstance();
	input.AddHandler(boost::bind(&SpringApp::MainEventHandler, this, _1));

	// Global structures
	gs = new CGlobalSynced();
	gu = new CGlobalUnsynced();

	// Initialize GLEW
	LoadExtensions();

	//! check if FSAA init worked fine; FSAA is switched off when verification fails
	if (globalRendering->FSAA && !MultisampleVerify())
		globalRendering->FSAA = 0;

	InitOpenGL();
	agui::InitGui();
	LoadFonts();

	globalRendering->PostInit();

	// Initialize named texture handler
	CNamedTextures::Init();

	// Initialize Lua GL
	LuaOpenGL::Init();

	// Sound & Input
	ISound::Initialize();
	InitJoystick();

	// Multithreading & Affinity
	// The configured mask is applied and the mask reported back is logged.
	// NOTE(review): the `cpuMask == 0` error branch is only reachable when
	// `affinity` is 0 as well, because `cpuMask != affinity` is tested first;
	// if 0 is the failure value of SetAffinity() the branch order looks
	// inverted - confirm against Threading::SetAffinity.
	// NOTE(review): both values are uint32_t but are logged with %d.
	LOG("CPU Cores: %d", Threading::GetAvailableCores());
	const uint32_t affinity = configHandler->GetUnsigned("SetCoreAffinity");
	const uint32_t cpuMask  = Threading::SetAffinity(affinity);
	if (cpuMask == 0xFFFFFF) {
		LOG("CPU affinity not set");
	}
	else if (cpuMask != affinity) {
		LOG("CPU affinity mask set: %d (config is %d)", cpuMask, affinity);
	}
	else if (cpuMask == 0) {
		LOG_L(L_ERROR, "Failed to CPU affinity mask <%d>", affinity);
	}
	else {
		LOG("CPU affinity mask set: %d", cpuMask);
	}

	// Create CGameSetup and CPreGame objects
	Startup();

	return true;
}
/** * @brief Computes the contribution to the FSR scalar flux from a Track segment. * @details This method integrates the angular flux for a Track segment across * energy groups and polar angles, and tallies it into the FSR * scalar flux, and updates the Track's angular flux. * @param curr_segment a pointer to the Track segment of interest * @param azim_index a pointer to the azimuthal angle index for this segment * @param track_flux a pointer to the Track's angular flux * @param fsr_flux a pointer to the temporary FSR flux buffer * @param fwd */ void CPUSolver::scalarFluxTally(segment* curr_segment, int azim_index, FP_PRECISION* track_flux, FP_PRECISION* fsr_flux, bool fwd){ int tid = omp_get_thread_num(); int fsr_id = curr_segment->_region_id; FP_PRECISION length = curr_segment->_length; FP_PRECISION* sigma_t = curr_segment->_material->getSigmaT(); /* The change in angular flux along this Track segment in the FSR */ FP_PRECISION delta_psi; FP_PRECISION exponential; /* Set the FSR scalar flux buffer to zero */ memset(fsr_flux, 0.0, _num_groups * sizeof(FP_PRECISION)); /* Loop over energy groups */ for (int e=0; e < _num_groups; e++) { /* Loop over polar angles */ for (int p=0; p < _num_polar; p++){ exponential = computeExponential(sigma_t[e], length, p); delta_psi = (track_flux(p,e)-_reduced_sources(fsr_id,e))*exponential; fsr_flux[e] += delta_psi * _polar_weights(azim_index,p); track_flux(p,e) -= delta_psi; } } if (_cmfd != NULL && _cmfd->isFluxUpdateOn()){ if (curr_segment->_cmfd_surface_fwd != -1 && fwd){ int pe = 0; /* Atomically increment the Cmfd Mesh surface current from the * temporary array using mutual exclusion locks */ omp_set_lock(&_cmfd_surface_locks[curr_segment->_cmfd_surface_fwd]); /* Loop over energy groups */ for (int e = 0; e < _num_groups; e++) { /* Loop over polar angles */ for (int p = 0; p < _num_polar; p++){ /* Increment current (polar and azimuthal weighted flux, group) */ _surface_currents(curr_segment->_cmfd_surface_fwd,e) += 
track_flux(p,e)*_polar_weights(azim_index,p)/2.0; pe++; } } /* Release Cmfd Mesh surface mutual exclusion lock */ omp_unset_lock(&_cmfd_surface_locks[curr_segment->_cmfd_surface_fwd]); } else if (curr_segment->_cmfd_surface_bwd != -1 && !fwd){ int pe = 0; /* Atomically increment the Cmfd Mesh surface current from the * temporary array using mutual exclusion locks */ omp_set_lock(&_cmfd_surface_locks[curr_segment->_cmfd_surface_bwd]); /* Loop over energy groups */ for (int e = 0; e < _num_groups; e++) { /* Loop over polar angles */ for (int p = 0; p < _num_polar; p++){ /* Increment current (polar and azimuthal weighted flux, group) */ _surface_currents(curr_segment->_cmfd_surface_bwd,e) += track_flux(p,e)*_polar_weights(azim_index,p)/2.0; pe++; } } /* Release Cmfd Mesh surface mutual exclusion lock */ omp_unset_lock(&_cmfd_surface_locks[curr_segment->_cmfd_surface_bwd]); } } /* Atomically increment the FSR scalar flux from the temporary array */ omp_set_lock(&_FSR_locks[fsr_id]); { for (int e=0; e < _num_groups; e++) _scalar_flux(fsr_id,e) += fsr_flux[e]; } omp_unset_lock(&_FSR_locks[fsr_id]); return; }
/*
 * Returns (by value) the NODE that leaf[k] points to, where k is the calling
 * thread's OpenMP thread number divided by two (integer division). Threads
 * 2k and 2k+1 therefore read the same entry.
 */
NODE getfromBarrier()
{
    return *leaf[omp_get_thread_num() / 2];
}
/**
 * Allocates the storage of a CPU particle list.
 *
 * Records the particle count and the number of cores, allocates one array of
 * `nptcls_in` elements for every float and int particle attribute, the
 * auxiliary per-particle arrays (buffer, sub-cycle and Picard counters,
 * cluster ids, particle indices) and one timer of each kind per core. The
 * counter arrays are zero-filled. When `pdata->plot_flag` is set a gnuplot
 * handle is opened as well.
 *
 * @param pdata      simulation parameters; `plot_flag` and `num_cores` are read
 * @param nptcls_in  number of particles to allocate room for
 */
void ParticleListCPUSorted::allocate(PlasmaData* pdata,int nptcls_in)
{
	if(pdata->plot_flag)
		plot = gnuplot_init();

	// Allocate memory for particles
	nptcls_allocated = nptcls_in;
	nptcls = nptcls_in;
	num_cores = pdata->num_cores;

	// Allocate realkind arrays
	for(int i=0;i<ParticleList_nfloats;i++)
	{
		*get_float(i) = (realkind*)malloc(nptcls_allocated*sizeof(realkind));
	}

	// Allocate int arrays
	// NOTE(review): sized with sizeof(realkind), not sizeof(int). Left
	// unchanged because other code may rely on the larger element size -
	// confirm before shrinking.
	for(int i=0;i<ParticleList_nints;i++)
	{
		*get_int(i) = (int*)malloc(nptcls_allocated*sizeof(realkind));
	}

	buffer = (realkind*)malloc(nptcls_allocated*sizeof(realkind));

	// Per-particle counters, zero-initialized.
	// NOTE(review): the element sizes used below (realkind for an int array,
	// double for realkind arrays) do not match the pointer types; they are
	// kept as-is so no allocation becomes smaller than before.
	num_subcycles = (int*)malloc(nptcls_allocated*sizeof(realkind));
	memset(num_subcycles,0,nptcls_allocated*sizeof(int));

	num_piccard = (realkind*)malloc(nptcls_allocated*sizeof(double));
	memset(num_piccard,0,nptcls_allocated*sizeof(double));

	num_piccard2 = (realkind*)malloc(nptcls_allocated*sizeof(double));
	memset(num_piccard2,0,nptcls_allocated*sizeof(double));

	// Allocate ints for cluster id's and particle indices
	cluster_id = (int*)malloc(nptcls_allocated*sizeof(int));
	ptcl_index = (int*)malloc(nptcls_allocated*sizeof(int));

	// One timer of each kind per core
	piccard_timer = (CPUTimer*)malloc(pdata->num_cores*sizeof(CPUTimer));
	accel_timer = (CPUTimer*)malloc(pdata->num_cores*sizeof(CPUTimer));
	tally_timer = (CPUTimer*)malloc(pdata->num_cores*sizeof(CPUTimer));
	crossing_timer = (CPUTimer*)malloc(pdata->num_cores*sizeof(CPUTimer));
	dtau_est_timer = (CPUTimer*)malloc(pdata->num_cores*sizeof(CPUTimer));
	tally_timer2 = (CPUTimer*)malloc(pdata->num_cores*sizeof(CPUTimer));
	load_store_timer = (CPUTimer*)malloc(pdata->num_cores*sizeof(CPUTimer));

	int tid;

	omp_set_num_threads(pdata->num_cores);

	// Every thread initializes its own timer slots. The slots are assigned
	// from a temporary instead of from `*(new CPUTimer())`, which left the
	// heap-allocated object unreachable (one leak per timer per thread).
#pragma omp parallel private(tid) default(shared) num_threads(pdata->num_cores)
	{
		tid = omp_get_thread_num();

		piccard_timer[tid] = CPUTimer();
		accel_timer[tid] = CPUTimer();
		tally_timer[tid] = CPUTimer();
		crossing_timer[tid] = CPUTimer();
		dtau_est_timer[tid] = CPUTimer();
		tally_timer2[tid] = CPUTimer();
		load_store_timer[tid] = CPUTimer();
	}

	push_timer = new CPUTimer();
}
/**
 * Runs the moving-least-squares projection for every index in indices_ and
 * appends the projected points (and, when compute_normals_ is set, their
 * normals) to the output, then hands the output to performUpsampling().
 *
 * With OpenMP each thread collects its results in its own slot of the
 * temporary vectors, and the slots are concatenated after the loop; without
 * OpenMP the results of each query point are appended right away.
 */
template <typename PointInT, typename PointOutT> void
pcl::MovingLeastSquares<PointInT, PointOutT>::performProcessing (PointCloudOut &output)
{
  // Compute the number of coefficients
  nr_coeff_ = (order_ + 1) * (order_ + 2) / 2;

#ifdef _OPENMP
  // (Maximum) number of threads
  const unsigned int threads = threads_ == 0 ? 1 : threads_;

  // One result container per thread, so the loop needs no synchronization
  typename PointCloudOut::CloudVectorType projected_points (threads);
  typename NormalCloud::CloudVectorType projected_points_normals (threads);
  std::vector<PointIndices> corresponding_input_indices (threads);
#endif

  // For all points
#ifdef _OPENMP
#pragma omp parallel for schedule (dynamic,1000) num_threads (threads)
#endif
  for (int cp = 0; cp < static_cast<int> (indices_->size ()); ++cp)
  {
    // Results of the nearest neighbor search for this query point
    std::vector<int> nn_indices;
    std::vector<float> nn_sqr_dists;

    // Skip query points whose neighborhood could not be determined ...
    if (!searchForNeighbors ((*indices_)[cp], nn_indices, nn_sqr_dists))
      continue;

    // ... or is too small (fewer than 3 neighbors)
    if (nn_indices.size () < 3)
      continue;

#ifdef _OPENMP
    // This thread's ID (range 0 to threads-1)
    const int tn = omp_get_thread_num ();
    // Size of projected points before computeMLSPointNormal () adds points
    size_t pp_size = projected_points[tn].size ();
#else
    PointCloudOut projected_points;
    NormalCloud projected_points_normals;
#endif

    // Index of the query point; it also selects the MLS result slot when
    // results are cached, otherwise slot 0 serves as a dummy location
    const int index = (*indices_)[cp];
    const size_t mls_result_index = cache_mls_results_ ? static_cast<size_t> (index) : 0;

#ifdef _OPENMP
    computeMLSPointNormal (index, nn_indices, projected_points[tn], projected_points_normals[tn], corresponding_input_indices[tn], mls_results_[mls_result_index]);

    // Copy all information from the input cloud to the output points (not doing any interpolation)
    for (size_t pp = pp_size; pp < projected_points[tn].size (); ++pp)
      copyMissingFields (input_->points[(*indices_)[cp]], projected_points[tn][pp]);
#else
    computeMLSPointNormal (index, nn_indices, projected_points, projected_points_normals, *corresponding_input_indices_, mls_results_[mls_result_index]);

    // Append projected points to output
    output.insert (output.end (), projected_points.begin (), projected_points.end ());
    if (compute_normals_)
      normals_->insert (normals_->end (), projected_points_normals.begin (), projected_points_normals.end ());
#endif
  }

#ifdef _OPENMP
  // Combine all threads' results into the output vectors
  for (unsigned int tn = 0; tn < threads; ++tn)
  {
    output.insert (output.end (), projected_points[tn].begin (), projected_points[tn].end ());
    corresponding_input_indices_->indices.insert (corresponding_input_indices_->indices.end (),
        corresponding_input_indices[tn].indices.begin (), corresponding_input_indices[tn].indices.end ());
    if (compute_normals_)
      normals_->insert (normals_->end (), projected_points_normals[tn].begin (), projected_points_normals[tn].end ());
  }
#endif

  // Perform the distinct-cloud or voxel-grid upsampling
  performUpsampling (output);
}
long long int ParticleListCPUSorted::pushT(PlasmaData* pdata, FieldData* fields, HOMoments* moments) { int tid; int nthreads = pdata->num_cores; int stride = (nptcls+nthreads-1)/nthreads; long long int nSubSteps_proc[nthreads]; omp_set_num_threads(nthreads); // for(int i=0;i<pdata->nx;i++) // { // realkind temp; // temp = fields->intrpE(0.5,0,0,i,0,0,0,FieldData_deriv_f); // printf("fields[%i] on cpu = %f\n",i,temp); // } //printf("particles ") //printf("nthreads = %i with vector length = %i\n",nthreads,VEC_LENGTH); // Start the parallel loop #pragma omp parallel private(tid,nthreads,stride) default(shared) num_threads(nthreads) { nthreads = omp_get_num_threads(); //printf("nthreads = %i with vector length = %i\n",nthreads,VEC_LENGTH); //nthreads = 1; stride = (nptcls+nthreads-1)/nthreads; tid = omp_get_thread_num(); //tid = 0; // auto cpu = sched_getcpu(); // std::ostringstream os; // os<<"\nThread "<<omp_get_thread_num()<<" on cpu "<<sched_getcpu()<<std::endl; // std::cout<<os.str()<<std::flush; PlasmaData pdata_local = *pdata; // Each thread gets a separate copy of the accumulation arrays HOMoments* my_moment = moments+tid; // Initialize the moment values //printf("Initializing moment values\n"); my_moment->set_vals(0); int nSubcycle_max = pdata->nSubcycle_max; int ptcl_start,ptcl_end; int nptcls_process; int nptcls_left; int ishrink = 0; int nptcl_replacements = 0; int nptcl_done; //int iptcl_max; int iptcl_new_v[VEC_LENGTH]; int iptcl_v[VEC_LENGTH]; int iter_array_v[VEC_LENGTH]; int* iptcl_new = iptcl_new_v; int* iptcl = iptcl_v; int* iter_array = iter_array_v; long long int nSubSteps_done = 0; ptcl_start = stride*tid; ptcl_end = fmin(stride*(tid+1)-1,nptcls-1); nptcls_process = ptcl_end-ptcl_start+1; //printf("Thread %i starting at %i to %i with %i ptcls\n", // tid,ptcl_start,ptcl_end,nptcls_process); ParticleObjNT<VEC_LENGTH,nSpatial,nVel,iEM> particle(iptcl); // Populate the timers particle.piccard_timer = piccard_timer+tid; particle.accel_timer = 
accel_timer+tid; particle.tally_timer = tally_timer+tid; particle.crossing_timer = crossing_timer+tid; particle.dtau_est_timer = dtau_est_timer+tid; // ParticleObjN<VEC_LENGTH> particle(iptcl); typevecN<int,VEC_LENGTH> iter; iter = 0; for(int i=0;i<VEC_LENGTH;i++) iter_array[i] = 0; CurrentTally currents(&my_moment->get_val(0,0,0,ispecies,HOMoments_currentx), &my_moment->get_val(0,0,0,ispecies,HOMoments_currenty), &my_moment->get_val(0,0,0,ispecies,HOMoments_currentz), make_int3(moments->pdata->nx,moments->pdata->ny,moments->pdata->nz), moments->pdata->dxdi,moments->pdata->dydi,moments->pdata->dzdi, moments->pdata->ndimensions); ChargeTally charge(&my_moment->get_val(0,0,0,ispecies,HOMoments_charge), make_int3(moments->pdata->nx,moments->pdata->ny,moments->pdata->nz), moments->pdata->dxdi,moments->pdata->dydi,moments->pdata->dzdi, moments->pdata->ndimensions); StressTally stress(&my_moment->get_val(0,0,0,ispecies,HOMoments_S2xx), make_int3(moments->pdata->nx,moments->pdata->ny,moments->pdata->nz), moments->pdata->dxdi,moments->pdata->dydi,moments->pdata->dzdi, moments->pdata->ndimensions); for(int i=0;i<VEC_LENGTH;i++) iptcl[i] = ptcl_start+i; nptcl_done = 0; load_store_timer[tid].start(); particle = *this; //for(int i=0;i<VEC_LENGTH;i++) // particle.dt_finished(i) = 0; // Each thread loops over its own particles // In order to avoid SIMD divergence we loop until // all particles in the threads work que have been // pushed. 
Anytime a particle finishes a subcycle // it is written back to the main list and a new particle // takes its slot while(nptcl_done < nptcls_process) { nptcls_left = nptcls_process-nptcl_done; //printf("nptcls_left = %i, ntpcl_done = %i\n",nptcls_left,nptcl_done); if((nptcls_left <= VEC_LENGTH)&&(VEC_LENGTH > 1)) { if(ishrink == 0) { for(int j=0;j<VEC_LENGTH;j++) { //printf("iptcl[%i] = %i\n",j,iptcl[0][j]); particle.write_back(*this,j); } int k = 0; for(int l=0;l<VEC_LENGTH;l++) { bool idone = 0; //printf("iter2(%i) = %f\n",j,particles2.dt_finished(j)); if(particle.dt_finished(l) >= pdata->dt) { idone = 1; } else if(iter(l) >= pdata->nSubcycle_max) { idone = 1; // printf("warning particle finished before time step was finished dt_left[%i] = %e\n",iptcl[l],pdata->dt-particle.dt_finished(l)); } else if(iptcl[l] > ptcl_end) idone = 1; else idone = 0; if(idone) { nSubSteps_done += iter(l); num_subcycles[iptcl[l]] += iter(l); iter(l) = 0; // Accumulate Charge and S2 moment } else { iptcl[k] = iptcl[l]; iter_array[k] = iter(l); k++; } } nptcl_done = nptcls_process - k ; nptcls_left = k; ishrink = 1; } // Hack to compile all versions of ParticleObjN template shrink_pushT<VEC_LENGTH,nSpatial,nVel,iEM>(pdata,fields,¤ts,this, &iter_array,&iptcl,&iptcl_new, nptcls_left,nptcl_done,nptcls_process,nSubSteps_done); // shrink_push<VEC_LENGTH>(pdata,fields,¤ts,this, // &iter_array,&iptcl,&iptcl_new, // nptcls_left,nptcl_done,nptcls_process,nSubSteps_done); } else { // for(int j=0;j<VEC_LENGTH;j++) // printf("particle %i done = %f, %f, %f, %i, %i, %i, %f, %f, %f\n", // iptcl[j],particle.px(j),particle.py(j),particle.pz(j), // particle.ix(j),particle.iy(j),particle.iz(j), // particle.vx(j),particle.vy(j),particle.vz(j)); // Here our particle vector size is the same // size as our system vector size, and won't // change from step to step particle.push(pdata,fields,¤ts,iter,nSubcycle_max); // Replace the particle (or particles) that // have finished their subcycle steps //int k = 0; 
for(int j=0;j<VEC_LENGTH;j++) { bool idone = 0; if(particle.dt_finished(j) >= pdata->dt) { idone = 1; } else if(iter(j) >= pdata->nSubcycle_max) { idone = 1; // printf("warning particle finished before time step was finished dt_left[%i] = %e\n",iptcl[j],pdata->dt-particle.dt_finished(j)); } if(idone) { // Accumulate Charge and S2 moment // printf("particle %i done = %f, %f, %f, %i, %i, %i, %f, %f, %f\n", // iptcl[j],particle.px(j),particle.py(j),particle.pz(j), // particle.ix(j),particle.iy(j),particle.iz(j), // particle.vx(j),particle.vy(j),particle.vz(j)); // Write results, and get a new particle from the list particle.write_back(*this,j); num_subcycles[iptcl[j]] += iter(j); iptcl[j] = ptcl_start + nptcl_done + VEC_LENGTH; nptcl_done++; if(nptcls_process-nptcl_done > 0) { particle.copy_in(*this,j); } nSubSteps_done += iter(j); iter(j) = 0; particle.dt_finished(j) = 0.0f; } } /* for(int j=0;j<nptcls_left;j++) */ //printf("nptcls_left = %i, ntpcl_done = %i\n",nptcls_left,nptcl_done); } /* else */ nptcl_replacements++; } /* while(nptcl_done < nptcls_process) */ load_store_timer[tid].stop(); tally_timer2[tid].start(); // accumulate charge and s2 moment for(int i=ptcl_start;i<=ptcl_end;i++) { charge.tally(px[i],py[i],pz[i], ix[i],iy[i],iz[i], 1.0); stress.tally1d1v(px[i], vx[i], ix[i], 1.0f); //if(fabs(dt_finished[i] - pdata->dt) > 1.0e-5) // printf("particle %i dt_finished = %e\n",i,dt_finished[i]); dt_finished[i] = 0.0f; } tally_timer2[tid].stop(); //nSubSteps_proc[0] = nSubSteps_done; nSubSteps_proc[tid] = nSubSteps_done; // printf("average particles processed per replacement: %f\n",nptcls_process/((double)nptcl_replacements)); } /* pragma omp parallel */ for(int i=1;i<nthreads;i++) nSubSteps_proc[0] += nSubSteps_proc[i]; //printf("nsteps avg = %i\n",nSubSteps_proc[0]); return nSubSteps_proc[0]; }
/*
 * Computes the 2-norm of a vector.
 *
 * The squares of the vx->n entries of vx->value are summed and the square
 * root of the sum is stored in *value. With OpenMP every thread stores its
 * partial sum in its own padded slot of lis_vec_tmp and the slots of
 * omp_get_max_threads() threads are added up serially. With MPI the local
 * sums are combined across vx->comm with MPI_Allreduce before the square
 * root is taken.
 *
 * Returns LIS_SUCCESS.
 */
LIS_INT lis_vector_nrm2(LIS_VECTOR vx, LIS_REAL *value)
{
	LIS_INT    idx, len;
	LIS_SCALAR sum_sq;
	LIS_SCALAR *elems;
	#ifdef _OPENMP
		LIS_INT nthreads;
	#endif
	#ifdef USE_MPI
		LIS_SCALAR global_sum_sq;
		MPI_Comm   comm;
	#endif

	LIS_DEBUG_FUNC_IN;

	len   = vx->n;
	elems = vx->value;
	#ifdef USE_MPI
		comm = vx->comm;
	#endif

	#ifdef _OPENMP
		nthreads = omp_get_max_threads();
		#pragma omp parallel private(idx)
		{
			/* Block-local variables are private to each thread */
			LIS_INT    rank    = omp_get_thread_num();
			LIS_SCALAR partial = 0.0;

			#ifdef USE_VEC_COMP
			#pragma cdir nodep
			#endif
			#pragma omp for
			for(idx=0; idx<len; idx++)
			{
				partial += elems[idx]*elems[idx];
			}
			/* Slots are LIS_VEC_TMP_PADD entries apart */
			lis_vec_tmp[rank*LIS_VEC_TMP_PADD] = partial;
		}
		sum_sq = 0.0;
		for(idx=0; idx<nthreads; idx++)
		{
			sum_sq += lis_vec_tmp[idx*LIS_VEC_TMP_PADD];
		}
	#else
		sum_sq = 0.0;
		#ifdef USE_VEC_COMP
		#pragma cdir nodep
		#endif
		for(idx=0; idx<len; idx++)
		{
			sum_sq += elems[idx]*elems[idx];
		}
	#endif

	#ifdef USE_MPI
		MPI_Allreduce(&sum_sq,&global_sum_sq,1,MPI_DOUBLE,MPI_SUM,comm);
		*value = sqrt(global_sum_sq);
	#else
		*value = sqrt(sum_sq);
	#endif

	LIS_DEBUG_FUNC_OUT;
	return LIS_SUCCESS;
}
int main(int argc, char const *argv[]) { char* s; std::srand(std::time(0)); //use current time as seed for random generator int r = rand() % 1000; for(int i = 0; i < r; i++) { rand(); } if(argc < 3) { int forestSize = strtol(argv[1], &s, 10); for(int i = 0 ; i < forestSize ; i++) { printf("%lf\n",fRand(1,std::sqrt(10))); } return 1; } int forestSize = strtol(argv[1], &s, 10); int iterations = strtol(argv[2], &s, 10); double SIDE = std::sqrt(forestSize); SIDE = fRand(std::sqrt(SIDE),std::sqrt(2)*SIDE); double R = 1; double begin, end; std::vector<int> empty; std::vector<Tree*> Forest; std::vector< std::vector<int> > neighbors(forestSize,empty); std::vector< std::vector<double> > metrics(iterations,std::vector<double>(forestSize,0.0)); //Parallel variables int num_threads; std::vector<int> order; begin = omp_get_wtime(); #pragma omp parallel shared(Forest,neighbors,metrics,forestSize,iterations,order) { #pragma omp master { // INIT VARIABLES num_threads = omp_get_num_threads(); std::vector<Point> positions; std::cout << "Running " << forestSize << " trees for " << iterations << " iterations on " << num_threads << " processors" << std::endl; printf("SIDE = %lf, R = %lf\n",SIDE,R); for(int i = 0; i < forestSize; i++) { // double x = std::fabs((SIDE-1)*std::sin(i)); // double y = std::fabs(SIDE*std::cos(i*i)); double x = fRand(0,SIDE); double y = fRand(0,SIDE); Point p = {x,y}; Tree *T = new MonopodialTree(); Forest.push_back(T); positions.push_back(p); for(int j = 0 ; j < i ; j++) { Point q = positions[j]; if(pointDistance(p,q) < R) { neighbors[j].push_back(i); neighbors[i].push_back(j); } } } order = get_order(neighbors); for(int i = 0; i < order.size(); i++) std::cout << order[i] << " "; std::cout << std::endl; } #pragma omp barrier int thread_num = omp_get_thread_num(); // ITERATE int N = forestSize; int T = iterations; int P = omp_get_num_threads(); int x = thread_num; int y = 0; while( x+N*y < N*T) { int i = order[x]; // printf("%d (%d, %d)\n",thread_num,y,i ); 
while(Forest[i]->iteration < y); bool ready = false; while(!ready) { ready = true; for(int k = 0; k < neighbors[i].size() ; k++) { if( Forest[ neighbors[i][k] ]->iteration < y) { ready = false; break; } } } if(y > 0) { Forest[i]->updateMetric(metrics[y],neighbors[i]); } Forest[i]->next(); double metric = Forest[i]->calculateMetric(); #pragma omp critical(metrics) { metrics[y][i] = metric; } x+=P; if(x >= N) { x -= N; y++; } } } end = omp_get_wtime(); print_forest(Forest, neighbors, metrics[iterations-1]); std::vector< std::vector<int> > connected_components = get_connected_components(neighbors); print_connected_components( connected_components); char buffer[80]; FILE *f = fopen("Results_lookahead.txt", "a"); if(f != NULL) { fprintf(f, "%s\n", gettime(buffer)); fprintf(f,"%d threads\n",num_threads); fprintf(f,"%d trees\n",forestSize); fprintf(f,"%d iterations\n",iterations); for(int i = 0; i < connected_components.size(); i++) { fprintf(f, "%d ", connected_components[i].size()); } fprintf(f, "\n"); fprintf(f,"Time : %f seconds\n", end-begin); fprintf(f,"\n=====================\n"); } for(int i = 0; i < Forest.size() ; i++) { delete Forest[i]; } return 0; }