int main ()
{
  thds = omp_get_max_threads ();
  if (thds == 1) {
    printf ("should be run this program on multi threads.\n");
    exit (0);
  }
  omp_set_dynamic (0);


  prvt.i = MAGICNO;
  prvt.d = MAGICNO+1;
  #pragma omp parallel firstprivate (prvt)
  {
    int	id = omp_get_thread_num ();

    if (prvt.i != MAGICNO) {
      #pragma omp critical
      errors += 1;
    }
    if (prvt.d != MAGICNO+1) {
      #pragma omp critical
      errors += 1;
    }

    prvt.i = id;
    prvt.d = id-1;

    #pragma omp barrier
    if (prvt.i != id) {
      #pragma omp critical
      errors += 1;
    }
    if (prvt.d != id-1) {
      #pragma omp critical
      errors += 1;
    }
    if (sizeof(prvt) != sizeof(struct x)) {
      #pragma omp critical
      errors += 1;
    }
  }


  prvt.i = MAGICNO*2;
  prvt.d = MAGICNO*2+1;
  #pragma omp parallel firstprivate (prvt)
  func1 (MAGICNO*2, &prvt);


  prvt.i = MAGICNO*3;
  prvt.d = MAGICNO*3+1;
  #pragma omp parallel firstprivate (prvt)
  func2 (MAGICNO*3);


  if (errors == 0) {
    printf ("firstprivate 013 : SUCCESS\n");
    return 0;
  } else {
    printf ("firstprivate 013 : FAILED\n");
    return 1;
  }
}
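For reference, a minimal stand-alone sketch of the behaviour the test above exercises: with firstprivate, every thread enters the parallel region with its own copy of the variable, initialized from the value it held just before the region, and writes to that copy never reach the original. (struct y, the initial values, and the printf calls below are illustrative only, not part of the test suite.)

#include <omp.h>
#include <stdio.h>

struct y { int i; double d; };

int main (void)
{
  struct y v = { 42, 43.0 };                 /* value captured by firstprivate */

  #pragma omp parallel firstprivate (v)
  {
    /* every thread sees the pre-region values 42 / 43.0 ... */
    printf ("thread %d sees i=%d d=%g\n", omp_get_thread_num (), v.i, v.d);
    /* ... and modifications stay local to the thread's copy */
    v.i = omp_get_thread_num ();
  }

  /* the original v is untouched: still 42 / 43.0 */
  printf ("after the region: i=%d d=%g\n", v.i, v.d);
  return 0;
}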
double computeGraph(graph* G, graphSDG* SDGdata) {

    VERT_T* endV;
    LONG_T *degree, *numEdges, *pos, *pSums;
    WEIGHT_T* w;
    double elapsed_time;

#ifdef _OPENMP
    omp_lock_t *vLock;
    LONG_T chunkSize;
#endif

    elapsed_time = get_seconds();

#ifdef _OPENMP
    omp_set_num_threads(NUM_THREADS);
#endif

#ifdef _OPENMP
#pragma omp parallel
#endif    
{
    LONG_T i, j, u, n, m, tid, nthreads;
#ifdef DIAGNOSTIC
    double elapsed_time_part;
#endif
    
#ifdef _OPENMP    
    nthreads = omp_get_num_threads();
    tid = omp_get_thread_num();
#else
    tid = 0;
    nthreads = 1;
#endif

    n = N;
    m = M;
    
    if (tid == 0) {
#ifdef _OPENMP
        vLock = (omp_lock_t *) malloc(n*sizeof(omp_lock_t));
        assert(vLock != NULL);
        chunkSize = n/nthreads;
#endif
        pos = (LONG_T *) malloc(m*sizeof(LONG_T));
        assert(pos != NULL);
        degree = (LONG_T *) calloc(n, sizeof(LONG_T));
        assert(degree != NULL);
    }
  
#ifdef DIAGNOSTIC
    if (tid == 0) {
        elapsed_time_part = get_seconds();
    }
#endif
    
#ifdef _OPENMP    
#pragma omp barrier
    
    #pragma omp for schedule(static, chunkSize)
    for (i=0; i<n; i++) {
        omp_init_lock(&vLock[i]);
    }

    #pragma omp barrier
  
#ifdef DIAGNOSTIC
    if (tid == 0) {
        elapsed_time_part = get_seconds() - elapsed_time_part;
        fprintf(stderr, "Lock initialization time: %lf seconds\n",
                elapsed_time_part);
        elapsed_time_part = get_seconds();

    }
#endif
 
    #pragma omp for
#endif
    for (i=0; i<m; i++) {
        u = SDGdata->startVertex[i];
#ifdef _OPENMP        
        omp_set_lock(&vLock[u]);
#endif
        pos[i] = degree[u]++;
#ifdef _OPENMP
        omp_unset_lock(&vLock[u]);
#endif
    } 
   
#ifdef DIAGNOSTIC
    if (tid == 0) {
        elapsed_time_part = get_seconds() - elapsed_time_part;
        fprintf(stderr, "Degree computation time: %lf seconds\n",
                elapsed_time_part);
        elapsed_time_part = get_seconds();

    }
#endif
   
#ifdef _OPENMP
#pragma omp barrier

#pragma omp for schedule(static, chunkSize)
    for (i=0; i<n; i++) {
        omp_destroy_lock(&vLock[i]);
    }

    if (tid == 0) 
        free(vLock);
#endif
    
#ifdef DIAGNOSTIC
    if (tid == 0) {
        elapsed_time_part = get_seconds() - elapsed_time_part;
        fprintf(stderr, "Lock destruction time: %lf seconds\n",
                elapsed_time_part);
        elapsed_time_part = get_seconds();

    }
#endif
   
    if (tid == 0) {
        numEdges = (LONG_T *) malloc((n+1)*sizeof(LONG_T));
        pSums = (LONG_T *) malloc(nthreads*sizeof(LONG_T));
   }

#ifdef _OPENMP
#pragma omp barrier
#endif

    prefix_sums(degree, numEdges, pSums, n); 
    
#ifdef DIAGNOSTIC
    if (tid == 0) {
        elapsed_time_part = get_seconds() - elapsed_time_part;
        fprintf(stderr, "Prefix sums time: %lf seconds\n",
                elapsed_time_part);
        elapsed_time_part = get_seconds();

    }
#endif
 
#ifdef _OPENMP
#pragma omp barrier
#endif

    if (tid == 0) {
        free(degree);
        free(pSums);
        w = (WEIGHT_T *) malloc(m*sizeof(WEIGHT_T));
        endV = (VERT_T *) malloc(m* sizeof(VERT_T));
    }

#ifdef _OPENMP
    #pragma omp barrier

    #pragma omp for
#endif
    for (i=0; i<m; i++) {
        u = SDGdata->startVertex[i];
        j = numEdges[u] + pos[i];
        endV[j] = SDGdata->endVertex[i];
        w[j] = SDGdata->weight[i]; 
    }
    
#ifdef DIAGNOSTIC
    if (tid == 0) {
        elapsed_time_part = get_seconds() - elapsed_time_part;
        fprintf(stderr, "Edge data structure construction time: %lf seconds\n",
                elapsed_time_part);
        elapsed_time_part = get_seconds();

    }
#endif
 
    if (tid == 0) {
        free(pos);
        G->n = n;
        G->m = m;
        G->numEdges = numEdges;
        G->endV = endV;
        G->weight = w;
    }
#ifdef _OPENMP    
#endif
}
    /* Verification */
#if 0 
    fprintf(stderr, "SDG data:\n");
    for (int i=0; i<SDGdata->m; i++) {
        fprintf(stderr, "[%ld %ld %ld] ", SDGdata->startVertex[i], 
                SDGdata->endVertex[i], SDGdata->weight[i]);
    }
 
    fprintf(stderr, "\n");

    for (int i=0; i<G->n + 1; i++) {
        fprintf(stderr, "[%ld] ", G->numEdges[i]);
    }
    
    fprintf(stderr, "\nGraph:\n");
    for (int i=0; i<G->n; i++) {
        for (int j=G->numEdges[i]; j<G->numEdges[i+1]; j++) {
            fprintf(stderr, "[%ld %ld %ld] ", i, G->endV[j], G->weight[j]);
        }
    }
#endif 
    
    free(SDGdata->startVertex);
    free(SDGdata->endVertex);
    free(SDGdata->weight);
    
    elapsed_time = get_seconds() - elapsed_time; 
    
    return elapsed_time;
}
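The degree-counting loop above guards degree[u]++ with one OpenMP lock per vertex. Where OpenMP 3.1 or later is available, the same fetch-and-increment can be written with an atomic capture and no lock array; a minimal sketch under that assumption (count_degrees and the use of plain long in place of LONG_T/VERT_T are illustrative):

#ifdef _OPENMP
#include <omp.h>
#endif

/* For each of the m edges, atomically record its slot within the adjacency
 * list of its start vertex.  degree[] must be zeroed by the caller. */
void count_degrees(long m, const long *startVertex, long *degree, long *pos)
{
    long i;
    #pragma omp parallel for
    for (i = 0; i < m; i++) {
        long u = startVertex[i];
        long slot;
        #pragma omp atomic capture
        slot = degree[u]++;          /* fetch-and-increment in one atomic step */
        pos[i] = slot;
    }
}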
Example #3
/**
    Purpose
    -------
    SLAEX3 finds the roots of the secular equation, as defined by the
    values in D, W, and RHO, between 1 and K.  It makes the
    appropriate calls to SLAED4 and then updates the eigenvectors by
    multiplying the matrix of eigenvectors of the pair of eigensystems
    being combined by the matrix of eigenvectors of the K-by-K system
    which is solved here.

    It is used in the last step when only a part of the eigenvectors
    is required. It computes only the required part of the eigenvectors;
    the rest is not used.

    This code makes very mild assumptions about floating point
    arithmetic. It will work on machines with a guard digit in
    add/subtract, or on those binary machines without guard digits
    which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or Cray-2.
    It could conceivably fail on hexadecimal or decimal machines
    without guard digits, but we know of none.

    Arguments
    ---------
    @param[in]
    k       INTEGER
            The number of terms in the rational function to be solved by
            SLAED4.  K >= 0.

    @param[in]
    n       INTEGER
            The number of rows and columns in the Q matrix.
            N >= K (deflation may result in N > K).

    @param[in]
    n1      INTEGER
            The location of the last eigenvalue in the leading submatrix.
            min(1,N) <= N1 <= N/2.

    @param[out]
    d       REAL array, dimension (N)
            D(I) contains the updated eigenvalues for
            1 <= I <= K.

    @param[out]
    Q       REAL array, dimension (LDQ,N)
            Initially the first K columns are used as workspace.
            On output the columns ??? to ??? contain
            the updated eigenvectors.

    @param[in]
    ldq     INTEGER
            The leading dimension of the array Q.  LDQ >= max(1,N).

    @param[in]
    rho     REAL
            The value of the parameter in the rank one update equation.
            RHO >= 0 required.

    @param[in,out]
    dlamda  REAL array, dimension (K)
            The first K elements of this array contain the old roots
            of the deflated updating problem.  These are the poles
            of the secular equation. May be changed on output by
            having lowest order bit set to zero on Cray X-MP, Cray Y-MP,
            Cray-2, or Cray C-90, as described above.

    @param[in]
    Q2      REAL array, dimension (LDQ2, N)
            The first K columns of this matrix contain the non-deflated
            eigenvectors for the split problem.
            TODO what is LDQ2?

    @param[in]
    indx    INTEGER array, dimension (N)
            The permutation used to arrange the columns of the deflated
            Q matrix into three groups (see SLAED2).
            The rows of the eigenvectors found by SLAED4 must be likewise
            permuted before the matrix multiply can take place.

    @param[in]
    ctot    INTEGER array, dimension (4)
            A count of the total number of the various types of columns
            in Q, as described in INDX.  The fourth column type is any
            column which has been deflated.

    @param[in,out]
    w       REAL array, dimension (K)
            The first K elements of this array contain the components
            of the deflation-adjusted updating vector. Destroyed on
            output.

    @param
    s       (workspace) REAL array, dimension (N1 + 1)*K
            Will contain the eigenvectors of the repaired matrix which
            will be multiplied by the previously accumulated eigenvectors
            to update the system.

    @param[out]
    indxq   INTEGER array, dimension (N)
            On exit, the permutation which will reintegrate the
            subproblems back into sorted order,
            i.e. D( INDXQ( I = 1, N ) ) will be in ascending order.

    @param
    dwork   (workspace) REAL array, dimension (3*N*N/2+3*N)

    @param[in]
    range   magma_range_t
      -     = MagmaRangeAll: all eigenvalues will be found.
      -     = MagmaRangeV:   all eigenvalues in the half-open interval (VL,VU]
                             will be found.
      -     = MagmaRangeI:   the IL-th through IU-th eigenvalues will be found.
            TODO verify range, vl, vu, il, iu -- copied from slaex1.

    @param[in]
    vl      REAL
    @param[in]
    vu      REAL
            if RANGE=MagmaRangeV, the lower and upper bounds of the interval to
            be searched for eigenvalues. VL < VU.
            Not referenced if RANGE = MagmaRangeAll or MagmaRangeI.

    @param[in]
    il      INTEGER
    @param[in]
    iu      INTEGER
            if RANGE=MagmaRangeI, the indices (in ascending order) of the
            smallest and largest eigenvalues to be returned.
            1 <= IL <= IU <= N, if N > 0; IL = 1 and IU = 0 if N = 0.
            Not referenced if RANGE = MagmaRangeAll or MagmaRangeV.

    @param[out]
    info    INTEGER
      -     = 0:  successful exit.
      -     < 0:  if INFO = -i, the i-th argument had an illegal value.
      -     > 0:  if INFO = 1, an eigenvalue did not converge

    Further Details
    ---------------
    Based on contributions by
    Jeff Rutter, Computer Science Division, University of California
    at Berkeley, USA
    Modified by Francoise Tisseur, University of Tennessee.

    @ingroup magma_ssyev_aux
    ********************************************************************/
extern "C" magma_int_t
magma_slaex3(
    magma_int_t k, magma_int_t n, magma_int_t n1,
    float *d,
    float *Q, magma_int_t ldq, float rho,
    float *dlamda, float *Q2, magma_int_t *indx,
    magma_int_t *ctot, float *w, float *s, magma_int_t *indxq,
    magmaFloat_ptr dwork,
    magma_range_t range, float vl, float vu, magma_int_t il, magma_int_t iu,
    magma_int_t *info )
{
    #define   Q(i_,j_) (Q   + (i_) + (j_)*ldq)
    #define  dQ(i_,j_) (dQ  + (i_) + (j_)*lddq)
    #define dQ2(i_,j_) (dQ2 + (i_) + (j_)*lddq)
    #define  dS(i_,j_) (dS  + (i_) + (j_)*lddq)

    float d_one  = 1.;
    float d_zero = 0.;
    magma_int_t ione = 1;
    magma_int_t ineg_one = -1;

    magma_int_t iil, iiu, rk;

    magma_int_t lddq = n/2 + 1;
    magmaFloat_ptr dQ2 = dwork;
    magmaFloat_ptr dS  = dQ2  + n*lddq;
    magmaFloat_ptr dQ  = dS   + n*lddq;

    magma_int_t i, iq2, j, n12, n2, n23, tmp, lq2;
    float temp;
    magma_int_t alleig, valeig, indeig;

    alleig = (range == MagmaRangeAll);
    valeig = (range == MagmaRangeV);
    indeig = (range == MagmaRangeI);

    *info = 0;

    if (k < 0)
        *info=-1;
    else if (n < k)
        *info=-2;
    else if (ldq < max(1,n))
        *info=-6;
    else if (! (alleig || valeig || indeig))
        *info = -15;
    else {
        if (valeig) {
            if (n > 0 && vu <= vl)
                *info = -17;
        }
        else if (indeig) {
            if (il < 1 || il > max(1,n))
                *info = -18;
            else if (iu < min(n,il) || iu > n)
                *info = -19;
        }
    }


    if (*info != 0) {
        magma_xerbla(__func__, -(*info));
        return *info;
    }

    // Quick return if possible
    if (k == 0)
        return *info;
    /*
     Modify values DLAMDA(i) to make sure all DLAMDA(i)-DLAMDA(j) can
     be computed with high relative accuracy (barring over/underflow).
     This is a problem on machines without a guard digit in
     add/subtract (Cray XMP, Cray YMP, Cray C 90 and Cray 2).
     The following code replaces DLAMDA(I) by 2*DLAMDA(I)-DLAMDA(I),
     which on any of these machines zeros out the bottommost
     bit of DLAMDA(I) if it is 1; this makes the subsequent
     subtractions DLAMDA(I)-DLAMDA(J) unproblematic when cancellation
     occurs. On binary machines with a guard digit (almost all
     machines) it does not change DLAMDA(I) at all. On hexadecimal
     and decimal machines with a guard digit, it slightly
     changes the bottommost bits of DLAMDA(I). It does not account
     for hexadecimal or decimal machines without guard digits
     (we know of none). We use a subroutine call to compute
     2*DLAMDA(I) to prevent optimizing compilers from eliminating
     this code.*/
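    /* Concretely, both branches below implement this as
           dlamda[i] = lapackf77_slamc3( &dlamda[i], &dlamda[i] ) - dlamda[i];
       i.e. 2*DLAMDA(I) is formed through the SLAMC3 function call (which
       returns the computed sum of its two arguments) so that an optimizing
       compiler cannot fold the whole expression back into DLAMDA(I). */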

    n2 = n - n1;

    n12 = ctot[0] + ctot[1];
    n23 = ctot[1] + ctot[2];

    iq2 = n1 * n12;
    lq2 = iq2 + n2 * n23;
    
    magma_queue_t queue;
    magma_device_t cdev;
    magma_getdevice( &cdev );
    magma_queue_create( cdev, &queue );

    magma_ssetvector_async( lq2, Q2, 1, dQ2(0,0), 1, queue );

#ifdef _OPENMP
    /////////////////////////////////////////////////////////////////////////////////
    //openmp implementation
    /////////////////////////////////////////////////////////////////////////////////
    //magma_timer_t time=0;
    //timer_start( time );

    #pragma omp parallel private(i, j, tmp, temp)
    {
        magma_int_t id = omp_get_thread_num();
        magma_int_t tot = omp_get_num_threads();

        magma_int_t ib = (  id   * k) / tot; //start index of local loop
        magma_int_t ie = ((id+1) * k) / tot; //end index of local loop
        magma_int_t ik = ie - ib;           //number of local indices

        for (i = ib; i < ie; ++i)
            dlamda[i]=lapackf77_slamc3(&dlamda[i], &dlamda[i]) - dlamda[i];

        for (j = ib; j < ie; ++j) {
            magma_int_t tmpp=j+1;
            magma_int_t iinfo = 0;
            lapackf77_slaed4(&k, &tmpp, dlamda, w, Q(0,j), &rho, &d[j], &iinfo);
            // If the zero finder fails, the computation is terminated.
            if (iinfo != 0) {
                #pragma omp critical (info)
                *info=iinfo;
                break;
            }
        }

        #pragma omp barrier

        if (*info == 0) {
            #pragma omp single
            {
                //Prepare the INDXQ sorting permutation.
                magma_int_t nk = n - k;
                lapackf77_slamrg( &k, &nk, d, &ione, &ineg_one, indxq);

                //compute the lower and upper bound of the non-deflated eigenvectors
                if (valeig) {
                    magma_svrange(k, d, &iil, &iiu, vl, vu);
                }
                else if (indeig) {
                    magma_sirange(k, indxq, &iil, &iiu, il, iu);
                }
                else {
                    iil = 1;
                    iiu = k;
                }
                rk = iiu - iil + 1;
            }

            if (k == 2) {
                #pragma omp single
                {
                    for (j = 0; j < k; ++j) {
                        w[0] = *Q(0,j);
                        w[1] = *Q(1,j);

                        i = indx[0] - 1;
                        *Q(0,j) = w[i];
                        i = indx[1] - 1;
                        *Q(1,j) = w[i];
                    }
                }
            }
            else if (k != 1) {
                // Compute updated W.
                blasf77_scopy( &ik, &w[ib], &ione, &s[ib], &ione);

                // Initialize W(I) = Q(I,I)
                tmp = ldq + 1;
                blasf77_scopy( &ik, Q(ib,ib), &tmp, &w[ib], &ione);

                for (j = 0; j < k; ++j) {
                    magma_int_t i_tmp = min(j, ie);
                    for (i = ib; i < i_tmp; ++i)
                        w[i] = w[i] * ( *Q(i, j) / ( dlamda[i] - dlamda[j] ) );
                    i_tmp = max(j+1, ib);
                    for (i = i_tmp; i < ie; ++i)
                        w[i] = w[i] * ( *Q(i, j) / ( dlamda[i] - dlamda[j] ) );
                }

                for (i = ib; i < ie; ++i)
                    w[i] = copysign( sqrt( -w[i] ), s[i]);

                #pragma omp barrier

                //reduce the number of used threads to have enough S workspace
                tot = min(n1, omp_get_num_threads());

                if (id < tot) {
                    ib = (  id   * rk) / tot + iil - 1;
                    ie = ((id+1) * rk) / tot + iil - 1;
                    ik = ie - ib;
                }
                else {
                    ib = -1;
                    ie = -1;
                    ik = -1;
                }

                // Compute eigenvectors of the modified rank-1 modification.
                for (j = ib; j < ie; ++j) {
                    for (i = 0; i < k; ++i)
                        s[id*k + i] = w[i] / *Q(i,j);
                    temp = magma_cblas_snrm2( k, s+id*k, 1 );
                    for (i = 0; i < k; ++i) {
                        magma_int_t iii = indx[i] - 1;
                        *Q(i,j) = s[id*k + iii] / temp;
                    }
                }
            }
        }
    }  // end omp parallel
    if (*info != 0)
        return *info;

    //timer_stop( time );
    //timer_printf( "eigenvalues/vector D+zzT = %6.2f\n", time );

#else
    /////////////////////////////////////////////////////////////////////////////////
    // Non openmp implementation
    /////////////////////////////////////////////////////////////////////////////////
   // magma_timer_t time=0;
   // timer_start( time );

    for (i = 0; i < k; ++i)
        dlamda[i]=lapackf77_slamc3(&dlamda[i], &dlamda[i]) - dlamda[i];

    for (j = 0; j < k; ++j) {
        magma_int_t tmpp=j+1;
        magma_int_t iinfo = 0;
        lapackf77_slaed4(&k, &tmpp, dlamda, w, Q(0,j), &rho, &d[j], &iinfo);
        // If the zero finder fails, the computation is terminated.
        if (iinfo != 0)
            *info=iinfo;
    }
    if (*info != 0)
        return *info;

    //Prepare the INDXQ sorting permutation.
    magma_int_t nk = n - k;
    lapackf77_slamrg( &k, &nk, d, &ione, &ineg_one, indxq);

    //compute the lower and upper bound of the non-deflated eigenvectors
    if (valeig) {
        magma_svrange(k, d, &iil, &iiu, vl, vu);
    }
    else if (indeig) {
        magma_sirange(k, indxq, &iil, &iiu, il, iu);
    }
    else {
        iil = 1;
        iiu = k;
    }
    rk = iiu - iil + 1;

    if (k == 2) {
        for (j = 0; j < k; ++j) {
            w[0] = *Q(0,j);
            w[1] = *Q(1,j);

            i = indx[0] - 1;
            *Q(0,j) = w[i];
            i = indx[1] - 1;
            *Q(1,j) = w[i];
        }
    }
    else if (k != 1) {
        // Compute updated W.
        blasf77_scopy( &k, w, &ione, s, &ione);

        // Initialize W(I) = Q(I,I)
        tmp = ldq + 1;
        blasf77_scopy( &k, Q, &tmp, w, &ione);

        for (j = 0; j < k; ++j) {
            for (i = 0; i < j; ++i)
                w[i] = w[i] * ( *Q(i, j) / ( dlamda[i] - dlamda[j] ) );
            for (i = j+1; i < k; ++i)
                w[i] = w[i] * ( *Q(i, j) / ( dlamda[i] - dlamda[j] ) );
        }

        for (i = 0; i < k; ++i)
            w[i] = copysign( sqrt( -w[i] ), s[i]);

        // Compute eigenvectors of the modified rank-1 modification.
        for (j = iil-1; j < iiu; ++j) {
            for (i = 0; i < k; ++i)
                s[i] = w[i] / *Q(i,j);
            temp = magma_cblas_snrm2( k, s, 1 );
            for (i = 0; i < k; ++i) {
                magma_int_t iii = indx[i] - 1;
                *Q(i,j) = s[iii] / temp;
            }
        }
    }

    //timer_stop( time );
    //timer_printf( "eigenvalues/vector D+zzT = %6.2f\n", time );

#endif //_OPENMP
    // Compute the updated eigenvectors.

    //timer_start( time );
    //magma_queue_sync( queue );  // previously, needed to setvector finished. Now all on same queue, so not needed?

    if (rk != 0) {
        if ( n23 != 0 ) {
            if (rk < magma_get_slaed3_k()) {
                lapackf77_slacpy("A", &n23, &rk, Q(ctot[0],iil-1), &ldq, s, &n23);
                blasf77_sgemm("N", "N", &n2, &rk, &n23, &d_one, &Q2[iq2], &n2,
                              s, &n23, &d_zero, Q(n1,iil-1), &ldq );
            } else {
                magma_ssetmatrix( n23, rk, Q(ctot[0],iil-1), ldq, dS(0,0), n23, queue );
                magma_sgemm( MagmaNoTrans, MagmaNoTrans, n2, rk, n23,
                             d_one,  dQ2(iq2,0), n2,
                                     dS(0,0), n23,
                             d_zero, dQ(0,0), lddq, queue );
                magma_sgetmatrix( n2, rk, dQ(0,0), lddq, Q(n1,iil-1), ldq, queue );
            }
        } else
            lapackf77_slaset("A", &n2, &rk, &d_zero, &d_zero, Q(n1,iil-1), &ldq);

        if ( n12 != 0 ) {
            if (rk < magma_get_slaed3_k()) {
                lapackf77_slacpy("A", &n12, &rk, Q(0,iil-1), &ldq, s, &n12);
                blasf77_sgemm("N", "N", &n1, &rk, &n12, &d_one, Q2, &n1,
                              s, &n12, &d_zero, Q(0,iil-1), &ldq);
            } else {
                magma_ssetmatrix( n12, rk, Q(0,iil-1), ldq, dS(0,0), n12, queue );
                magma_sgemm( MagmaNoTrans, MagmaNoTrans, n1, rk, n12,
                             d_one,  dQ2(0,0), n1,
                                     dS(0,0), n12,
                             d_zero, dQ(0,0), lddq, queue );
                magma_sgetmatrix( n1, rk, dQ(0,0), lddq, Q(0,iil-1), ldq, queue );
            }
        } else
            lapackf77_slaset("A", &n1, &rk, &d_zero, &d_zero, Q(0,iil-1), &ldq);
    }
    //timer_stop( time );
    //timer_printf( "gemms = %6.2f\n", time );

    magma_queue_destroy( queue );

    return *info;
} /* magma_slaex3 */
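The OpenMP branch of magma_slaex3 above does not rely on a worksharing loop for the secular-equation roots; instead each thread computes an explicit half-open index range [ib, ie) from its thread id and reuses that range across several phases. A minimal stand-alone sketch of that partitioning pattern (process_all, work, and K are placeholder names, not MAGMA symbols):

#include <omp.h>

void work(int j);                         /* placeholder for the per-index computation */

void process_all(int K)
{
    #pragma omp parallel
    {
        int id  = omp_get_thread_num();
        int tot = omp_get_num_threads();
        int ib  = ( id      * K) / tot;   /* first index owned by this thread */
        int ie  = ((id + 1) * K) / tot;   /* one past the last owned index */
        for (int j = ib; j < ie; ++j)
            work(j);                      /* every j in [0, K) is handled exactly once */
    }
}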
  // update the tree, do pruning
  virtual void Update(const std::vector<bst_gpair> &gpair,
                      IFMatrix *p_fmat,
                      const BoosterInfo &info,
                      const std::vector<RegTree*> &trees) {
    if (trees.size() == 0) return;
    // number of threads
    // thread temporal space
    std::vector< std::vector<TStats> > stemp;
    std::vector<RegTree::FVec> fvec_temp;
    // setup temp space for each thread
    int nthread;
    #pragma omp parallel
    {
      nthread = omp_get_num_threads();
    }
    fvec_temp.resize(nthread, RegTree::FVec());
    stemp.resize(nthread, std::vector<TStats>());
    #pragma omp parallel
    {
      int tid = omp_get_thread_num();
      int num_nodes = 0;
      for (size_t i = 0; i < trees.size(); ++i) {
        num_nodes += trees[i]->param.num_nodes;
      }
      stemp[tid].resize(num_nodes, TStats(param));
      std::fill(stemp[tid].begin(), stemp[tid].end(), TStats(param));
      fvec_temp[tid].Init(trees[0]->param.num_feature);
    }
    // if it is C++11, use lazy evaluation for Allreduce,
    // to gain speedup in recovery
#if __cplusplus >= 201103L
    auto lazy_get_stats = [&]()
#endif
    {
      // start accumulating statistics
      utils::IIterator<RowBatch> *iter = p_fmat->RowIterator();
      iter->BeforeFirst();
      while (iter->Next()) {
        const RowBatch &batch = iter->Value();
        utils::Check(batch.size < std::numeric_limits<unsigned>::max(),
                     "too large batch size ");
        const bst_omp_uint nbatch = static_cast<bst_omp_uint>(batch.size);
        #pragma omp parallel for schedule(static)
        for (bst_omp_uint i = 0; i < nbatch; ++i) {
          RowBatch::Inst inst = batch[i];
          const int tid = omp_get_thread_num();
          const bst_uint ridx = static_cast<bst_uint>(batch.base_rowid + i);
          RegTree::FVec &feats = fvec_temp[tid];
          feats.Fill(inst);
          int offset = 0;
          for (size_t j = 0; j < trees.size(); ++j) {
            AddStats(*trees[j], feats, gpair, info, ridx,
                     BeginPtr(stemp[tid]) + offset);
            offset += trees[j]->param.num_nodes;
          }
          feats.Drop(inst);
        }
      }
      // aggregate the statistics
      int num_nodes = static_cast<int>(stemp[0].size());
      #pragma omp parallel for schedule(static)
      for (int nid = 0; nid < num_nodes; ++nid) {
        for (int tid = 1; tid < nthread; ++tid) {
          stemp[0][nid].Add(stemp[tid][nid]);
        }
      }
    };
#if __cplusplus >= 201103L
    reducer.Allreduce(BeginPtr(stemp[0]), stemp[0].size(), lazy_get_stats);
#else
    reducer.Allreduce(BeginPtr(stemp[0]), stemp[0].size());
#endif
    // rescale learning rate according to size of trees
    float lr = param.learning_rate;
    param.learning_rate = lr / trees.size();
    int offset = 0;
    for (size_t i = 0; i < trees.size(); ++i) {
      for (int rid = 0; rid < trees[i]->param.num_roots; ++rid) {
        this->Refresh(BeginPtr(stemp[0]) + offset, rid, trees[i]);
      }
      offset += trees[i]->param.num_nodes;
    }
    // set learning rate back
    param.learning_rate = lr;
  }
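The Update method above uses a common OpenMP idiom: give every thread its own statistics buffer (stemp[tid]), let the parallel loop fill the buffers without any locking, then merge buffers 1..nthread-1 into buffer 0 in a second parallel loop over nodes. A reduced sketch of the same idiom with plain doubles instead of TStats (accumulate, values, and node_of are illustrative names):

#include <omp.h>
#include <vector>

// Sum values[i] into per-node accumulators, one scratch buffer per thread,
// then fold all thread buffers into buffer 0 and return it.
std::vector<double> accumulate(const std::vector<double>& values,
                               const std::vector<int>& node_of, int num_nodes)
{
    int nthread = 1;
    #pragma omp parallel
    {
        nthread = omp_get_num_threads();
    }
    std::vector< std::vector<double> > buf(nthread, std::vector<double>(num_nodes, 0.0));

    #pragma omp parallel for schedule(static)
    for (int i = 0; i < static_cast<int>(values.size()); ++i) {
        const int tid = omp_get_thread_num();
        buf[tid][node_of[i]] += values[i];      // no locking: each thread owns buf[tid]
    }

    #pragma omp parallel for schedule(static)
    for (int nid = 0; nid < num_nodes; ++nid) {
        for (int tid = 1; tid < nthread; ++tid) {
            buf[0][nid] += buf[tid][nid];       // merge into buffer 0
        }
    }
    return buf[0];
}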
int runMe(int argc, char *argv[])
{

    ArgProcessor args(argc, argv);
    if (args.isArgSet("--help") ||
        (!(args.isArgSet("--reads") && args.isArgSet("--kmers")))) {
        cerr << usage(args) << endl << endl;
        exit(1);
    }
    string reads_fasta_file = args.getStringVal("--reads");
    string kmers_fasta_file = args.getStringVal("--kmers");
    bool is_DS = (!args.isArgSet("--SS"));
    if (args.isArgSet("--kmer_size")) {
        KMER_SIZE = args.getIntVal("--kmer_size");
        if (KMER_SIZE < 20) {
            cerr << "Error, min kmer size is 20";
            exit(2);
        }
    }
    if (args.isArgSet("--monitor")) {
        IRKE_COMMON::MONITOR = args.getIntVal("--monitor");
    }
    if (args.isArgSet("--num_threads")) {
        int num_threads = args.getIntVal("--num_threads");
        if (num_threads < MAX_THREADS) {
            omp_set_num_threads(num_threads);
        }
        else {
            // set to max
            omp_set_num_threads(MAX_THREADS);
        }
    }

    if (omp_get_max_threads() > MAX_THREADS) {
        omp_set_num_threads(MAX_THREADS);
    }
    KmerCounter kcounter(KMER_SIZE, is_DS);
    populate_kmer_counter(kcounter, kmers_fasta_file);
    Fasta_reader fasta_reader(reads_fasta_file);
    bool write_coverage_info = args.isArgSet("--capture_coverage_info");

    int start_time = time(NULL);

#pragma omp parallel
    while (true) {

        if (!fasta_reader.hasNext())
            break;

        int myTid = omp_get_thread_num();

        Fasta_entry fe = fasta_reader.getNext();
        string sequence = fe.get_sequence();
        if (sequence == "")
            continue;

        string header = fe.get_header();
        vector<unsigned int> kmer_coverage = compute_kmer_coverage(sequence, kcounter);
        unsigned int median_cov = median_coverage(kmer_coverage);
        float mean_cov = mean(kmer_coverage);
        float stdev = stDev(kmer_coverage);
        float pct_stdev_of_avg = stdev / mean_cov * 100;
        stringstream stats_text;

        stats_text << median_cov << "\t"
            << mean_cov << "\t"
            << stdev << "\t"
            << pct_stdev_of_avg << "\t"
            << fe.get_accession();

        stats_text << "\tthread:" << myTid;

        if (write_coverage_info) {
            // add the coverage info
            stats_text << "\t";
            for (size_t i = 0; i < kmer_coverage.size(); i++) {
                stats_text << kmer_coverage[i];
                if (i != kmer_coverage.size() - 1) {
                    stats_text << ",";
                }
            }
        }
        stats_text << endl;

#pragma omp critical
        {
            cout << stats_text.str();
        }

        if (mean_cov < 0) {
            cerr << "ERROR, cannot have negative coverage!!" << endl;
            exit(1);
        }

    }

    int end_time = time(NULL);

    cerr << "STATS_GENERATION_TIME: " << (end_time - start_time) << " seconds." << endl;

    return (0);
}
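runMe above places a while loop directly inside #pragma omp parallel, so every thread keeps pulling reads from the shared reader until it is exhausted, and each thread assembles its whole output line privately before printing it inside a critical section. A compressed sketch of that work-pulling pattern (Reader, next, and the item strings are stand-ins, not symbols from the program above):

#include <omp.h>
#include <iostream>
#include <sstream>
#include <string>

struct Reader {                              // stand-in for the shared Fasta_reader
    int remaining;
    Reader() : remaining(100) {}
    bool next(std::string& item) {           // hand out one work item at a time
        bool ok;
        #pragma omp critical (reader)
        {
            ok = (remaining > 0);
            if (ok) {
                std::ostringstream name;
                name << "item-" << remaining--;
                item = name.str();
            }
        }
        return ok;
    }
};

int main()
{
    Reader reader;
    #pragma omp parallel
    while (true) {
        std::string item;
        if (!reader.next(item)) break;       // no more work: this thread leaves the loop

        std::ostringstream line;             // build the full record privately ...
        line << omp_get_thread_num() << "\t" << item << "\n";

        #pragma omp critical (output)        // ... then emit it in one shot
        std::cout << line.str();
    }
    return 0;
}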
Example #6
    static uint32_t PerformImprovementStep2(const CGraph* graph, CommunityPartition* partition, const double64_t alfa) {
        std::vector<Movement>* movements = new std::vector<Movement>[num_threads];
        uint32_t N = graph->GetNumNodes();

#pragma omp parallel for schedule(SCD_SCHEDULING,SCD_THREAD_BLOCK_SIZE) 
        for (uint32_t i = 0; i < N; i++) {
            int thread = omp_get_thread_num();
            if (i % 100000 == 0) {
                printf("Thread %d: Checked movements of %d nodes.\n", thread, i);
            }
            Movement movement;
            movement = CheckForBestMovement(graph, i, partition, alfa);
            if (movement.m_MovementType != E_NO_MOVEMENT) {
                movements[thread].push_back(movement);
            }
        }
        printf("All movements checked\n");

        // sort each thread's own movement list (there is one list per thread)
        for( uint32_t i = 0; i < num_threads; i++) {
            std::sort((movements[i]).begin(), (movements[i]).end(),CompareMovements);
        }

        uint32_t* tempNodeLabels = new uint32_t[partition->m_NumNodes];
        memcpy(&tempNodeLabels[0], &partition->m_NodeLabels[0], sizeof (uint32_t) * partition->m_NumNodes);
        uint32_t totalMovements = 0;

        //uint32_t nextLabel = partition->m_NumCommunities;
        uint32_t removeMovements = 0;
        uint32_t removeAndInsertMovements = 0;
        uint32_t insertMovements = 0;

        
#pragma omp parallel for schedule(static,1) reduction(+:totalMovements,removeMovements,removeAndInsertMovements,insertMovements)
        for (uint32_t thread = 0; thread < num_threads; thread++) {
            uint32_t numMovements = movements[thread].size();
            totalMovements += numMovements;
            uint32_t nextLabelThread = partition->m_NumCommunities + numMovements * thread;            

            uint32_t previousCommunity = 100000000;
            for (uint32_t i = 0; i < numMovements; i++) {
                Movement movement = (movements[thread])[i];
                if(movement.m_Community != previousCommunity) {
                    previousCommunity = movement.m_Community;
                    switch (movement.m_MovementType) {
                        case E_REMOVE:
                            tempNodeLabels[movement.m_NodeId] = nextLabelThread;
                            removeMovements++;
                            nextLabelThread++;
                            break;
                        case E_REMOVE_AND_INSERT:
                            tempNodeLabels[movement.m_NodeId] = movement.m_Community;
                            if (partition->m_Communities[partition->m_CommunityIndices[partition->m_NodeLabels[movement.m_NodeId]]] == 1) {
                                insertMovements++;
                            } else {
                                removeAndInsertMovements++;
                            }
                            break;
                    }
                }
            }
        }
        delete [] movements;
        printf(" Number of removes performed: %d\n", removeMovements);
        printf(" Number of remove and insert performed: %d\n", removeAndInsertMovements);
        printf(" Number of insert performed: %d\n", insertMovements);
        FreeResources(partition);

        if (InitializeFromLabelsArray(graph, partition, tempNodeLabels, alfa)) {
            printf("Error initializing from label array.\n");
            return 1;
        }
        delete [] tempNodeLabels;

        return 0;
    }
Example #7
void 
function_c (void)
{							
    printf ("Thread %d is executing function C. \n", omp_get_thread_num());
}
Example #8
  SEXP spSVCPredictJoint(SEXP m_r, SEXP n_r, SEXP KDiag_r, SEXP obsD_r, SEXP predObsD_r, SEXP predD_r, SEXP q_r,
			 SEXP samples_r, SEXP wSamples_r, SEXP nSamples_r, 
			 SEXP AIndx_r, SEXP phiIndx_r, SEXP nuIndx_r, 	   
			 SEXP covModel_r, 
			 SEXP verbose_r, SEXP nReport_r, SEXP nThreads_r){

    /*****************************************
                Common variables
    *****************************************/
    int i, j, k, l, b, s, h, info, nProtect=0;
    char const *lower = "L";
    char const *upper = "U";
    char const *nUnit = "N";
    char const *yUnit = "U";
    char const *ntran = "N";
    char const *ytran = "T";
    char const *rside = "R";
    char const *lside = "L";
    const double one = 1.0;
    const double negOne = -1.0;
    const double zero = 0.0;
    const int incOne = 1;
    
    /*****************************************
                     Set-up
    *****************************************/
    double *obsD = REAL(obsD_r);
    double *predObsD = REAL(predObsD_r);
    double *predD = REAL(predD_r);
    int m = INTEGER(m_r)[0];
    int mm = m*m;
    int n = INTEGER(n_r)[0];
    int nn = n*n;
    int nm = n*m;
    int nmnm = nm*nm;
    int q = INTEGER(q_r)[0];//number of prediction locations
    int qm = q*m;
    int qmnm = qm*nm;
    int qmqm = qm*qm;
    bool KDiag = static_cast<bool>(INTEGER(KDiag_r)[0]);
    int nLTr = m*(m-1)/2+m;
	
    double *samples = REAL(samples_r);
    double *wSamples = REAL(wSamples_r);
    int nSamples = INTEGER(nSamples_r)[0];

    int AIndx = INTEGER(AIndx_r)[0]; 
    int phiIndx = INTEGER(phiIndx_r)[0]; 
    int nuIndx  = INTEGER(nuIndx_r)[0]; 

    std::string covModel = CHAR(STRING_ELT(covModel_r,0));
    int verbose = INTEGER(verbose_r)[0];
    int nReport = INTEGER(nReport_r)[0];
    int nThreads = INTEGER(nThreads_r)[0];
    
    /*****************************************
       Set-up MCMC alg. vars. matrices etc.
    *****************************************/
    SEXP wPredSamples_r;
    PROTECT(wPredSamples_r = allocMatrix(REALSXP, qm, nSamples)); nProtect++; 

    int status=1;
    double *A = (double *) R_alloc(mm, sizeof(double)); zeros(A, mm); //to simplify a future move to the more general cross-cov model
    double *K = (double *) R_alloc(nmnm, sizeof(double)); 
    double *B = (double *) R_alloc(qmnm, sizeof(double)); 
    double *C = (double *) R_alloc(qmqm, sizeof(double));
    double *tmp_nltr = (double *) R_alloc(nLTr, sizeof(double)); 
    double *tmp_qmnm = (double *) R_alloc(qmnm, sizeof(double)); 
    double *tmp_qm = (double *) R_alloc(qm, sizeof(double));
    double *tmp_qmqm = (double *) R_alloc(qmqm, sizeof(double));
    double *phi = (double *) R_alloc(m, sizeof(double));
    double *nu = (double *) R_alloc(m, sizeof(double)); zeros(nu, m); //this just remains empty if not matern

    double maxNu = 0; //needed for thread safe bessel
    
    if(covModel == "matern"){
      for(s = 0; s < nSamples; s++){
	for(i = 0; i < m; i++){
	  if(samples[(nuIndx+i)*nSamples+s] > maxNu){
	    maxNu = samples[(nuIndx+i)*nSamples+s];
	  }
	}
      }
    }

    int threadID = 0;
    int bessel_ws_inc = static_cast<int>(1.0+maxNu);
    double *bessel_ws = (double *) R_alloc(nThreads*bessel_ws_inc, sizeof(double));

#ifdef _OPENMP
    omp_set_num_threads(nThreads);
    if(verbose){
      Rprintf("Source compiled with OpenMP, posterior sampling is using %i thread(s).\n", nThreads);
    }
#else
    if(nThreads > 1){
      warning("n.omp.threads = %i, but source not compiled with OpenMP support.", nThreads);
      nThreads = 1;
    }
#endif  
    
    if(verbose){
	Rprintf("-------------------------------------------------\n");
	Rprintf("\tJoint sampling of predicted w\n");
	Rprintf("-------------------------------------------------\n");
      #ifdef Win32
      R_FlushConsole();
      #endif
    }
    
    GetRNGstate();
    
    for(s = 0; s < nSamples; s++){
      
      if(KDiag == false){
	dcopy_(&nLTr, &samples[AIndx*nSamples+s], &nSamples, tmp_nltr, &incOne);
      	covExpand(tmp_nltr, A, m);//note this is K, so we need chol
      	F77_NAME(dpotrf)(lower, &m, A, &m, &info); if(info != 0){error("c++ error: dpotrf failed 1\n");} 
      	clearUT(A, m); //make sure upper tri is clear
      }

      for(k = 0; k < m; k++){

	if(KDiag){
	  A[k*m+k] = sqrt(samples[(AIndx+k)*nSamples+s]);
	}
	
	phi[k] = samples[(phiIndx+k)*nSamples+s]; 
	
	if(covModel == "matern"){
	  nu[k] = samples[(nuIndx+k)*nSamples+s]; 
	}
	
      }
      
      //construct covariance matrix
#ifdef _OPENMP
#pragma omp parallel for private(i, k, l, h, threadID)
#endif
      for(j = 0; j < n; j++){
#ifdef _OPENMP
	threadID = omp_get_thread_num();
#endif
	for(i = 0; i < n; i++){	
	  for(k = 0; k < m; k++){
	    for(l = 0; l < m; l++){
	      K[(k+j*m)*nm+(i*m+l)] = 0.0; 
	      for(h = 0; h < m; h++){
		K[(k+j*m)*nm+(i*m+l)] += A[k+m*h]*A[l+m*h]*spCorTS(obsD[j*n+i], phi[h], nu[h], covModel, &bessel_ws[threadID*bessel_ws_inc]);
	      }
	    }
	  }
	}
      }
      
#ifdef _OPENMP
#pragma omp parallel for private(i, k, l, h, threadID)
#endif
      for(j = 0; j < n; j++){
#ifdef _OPENMP
	threadID = omp_get_thread_num();
#endif	
	for(i = 0; i < q; i++){	
	  for(k = 0; k < m; k++){
	    for(l = 0; l < m; l++){
	      B[(k+j*m)*qm+(i*m+l)] = 0.0; 
	      for(h = 0; h < m; h++){
		B[(k+j*m)*qm+(i*m+l)] += A[k+m*h]*A[l+m*h]*spCorTS(predObsD[j*q+i], phi[h], nu[h], covModel, &bessel_ws[threadID*bessel_ws_inc]);
	      }
	    }
	  }
	}
      }
      
      //printMtrx(B, qm, nm);
      
#ifdef _OPENMP
#pragma omp parallel for private(i, k, l, h, threadID)
#endif
      for(j = 0; j < q; j++){
#ifdef _OPENMP
	threadID = omp_get_thread_num();
#endif
	for(i = 0; i < q; i++){	
	  for(k = 0; k < m; k++){
	    for(l = 0; l < m; l++){
	      C[(k+j*m)*qm+(i*m+l)] = 0.0; 
	      for(h = 0; h < m; h++){
		C[(k+j*m)*qm+(i*m+l)] += A[k+m*h]*A[l+m*h]*spCorTS(predD[j*q+i], phi[h], nu[h], covModel, &bessel_ws[threadID*bessel_ws_inc]);
	      }
	    }
	  }
	}
      }
         
      F77_NAME(dpotrf)(lower, &nm, K, &nm, &info); if(info != 0){error("c++ error: dpotrf failed 1\n");}
      F77_NAME(dpotri)(lower, &nm, K, &nm, &info); if(info != 0){error("c++ error: dpotri failed\n");}     
      F77_NAME(dsymm)(rside, lower, &qm, &nm, &one, K, &nm, B, &qm, &zero, tmp_qmnm, &qm);
      
      //mu
      F77_NAME(dgemv)(ntran, &qm, &nm, &one, tmp_qmnm, &qm, &wSamples[s*nm], &incOne, &zero, tmp_qm, &incOne);

      //var
      F77_NAME(dgemm)(ntran, ytran, &qm, &qm, &nm, &one, tmp_qmnm, &qm, B, &qm, &zero, tmp_qmqm, &qm);

      for(i = 0; i < qmqm; i++){
	C[i] = C[i] - tmp_qmqm[i];
      }
            
      F77_NAME(dpotrf)(lower, &qm, C, &qm, &info); if(info != 0){error("c++ error: dpotrf failed 2\n");}
      
      mvrnorm(&REAL(wPredSamples_r)[s*qm], tmp_qm, C, qm, false);
      
      //report
      if(verbose){
      	if(status == nReport){
	  Rprintf("Sampled: %i of %i, %3.2f%%\n", s, nSamples, 100.0*s/nSamples);
          #ifdef Win32
      	  R_FlushConsole();
          #endif
      	  status = 0;
      	}
      }
      status++;
      R_CheckUserInterrupt();
    }//end sample loop
    
    PutRNGstate();
    
    //make return object
    SEXP result_r, resultName_r;
    int nResultListObjs = 1;

    PROTECT(result_r = allocVector(VECSXP, nResultListObjs)); nProtect++;
    PROTECT(resultName_r = allocVector(VECSXP, nResultListObjs)); nProtect++;
    
    //samples
    SET_VECTOR_ELT(result_r, 0, wPredSamples_r);
    SET_VECTOR_ELT(resultName_r, 0, mkChar("p.w.predictive.samples")); 

    namesgets(result_r, resultName_r);
    
    //unprotect
    UNPROTECT(nProtect);
    
    return(result_r);

    }
Example #9
void make_graph(int log_numverts, int64_t desired_nedges, uint64_t userseed1, uint64_t userseed2, const double initiator[4], int64_t* nedges_ptr, int64_t** result_ptr) {
  int64_t N, M;

  N = (int64_t)pow(GRAPHGEN_INITIATOR_SIZE, log_numverts);
  M = desired_nedges;

  /* Spread the two 64-bit numbers into five nonzero values in the correct
   * range. */
  uint_fast32_t seed[5];
  make_mrg_seed(userseed1, userseed2, seed);

  int64_t nedges = compute_edge_array_size(0, 1, M);
  *nedges_ptr = nedges;
#ifdef GRAPHGEN_KEEP_MULTIPLICITIES
  generated_edge* edges = (generated_edge*)xcalloc(nedges, sizeof(generated_edge)); /* multiplicity set to 0 for unused edges */
#else
  int64_t* edges = (int64_t*)xmalloc(2 * nedges * sizeof(int64_t));
#endif

#pragma omp parallel
  {
    int rank = omp_get_thread_num(), size = omp_get_num_threads();
    generate_kronecker(rank, size, seed, log_numverts, M, initiator, edges);
  }

  int64_t* vertex_perm = (int64_t*)xmalloc(N * sizeof(int64_t));
  int64_t* result;

#ifdef GRAPHGEN_KEEP_MULTIPLICITIES
  result = (int64_t*)xmalloc(2 * nedges * sizeof(int64_t));
#else
  result = edges;
#endif
  *result_ptr = result;

  mrg_state state;
  mrg_seed(&state, seed);
  rand_sort_shared(&state, N, vertex_perm);
  int64_t i;
  /* Apply vertex permutation to graph, optionally copying into user's result
   * array. */
#ifdef GRAPHGEN_KEEP_MULTIPLICITIES
#pragma omp parallel for
  for (i = 0; i < nedges; ++i) {
    if (edges[i].multiplicity != 0) {
      int64_t v1 = vertex_perm[edges[i].src];
      int64_t v2 = vertex_perm[edges[i].tgt];
      /* Sort these since otherwise the directions of the permuted edges would
       * give away the unscrambled vertex order. */
      result[i * 2] = (v1 < v2) ? v1 : v2;
      result[i * 2 + 1] = (v1 < v2) ? v2 : v1;
    } else {
      result[i * 2] = result[i * 2 + 1] = (int64_t)(-1);
    }
  }
  free(edges);
#else
#pragma omp parallel for
  for (i = 0; i < 2 * nedges; i += 2) {
    if (edges[i] != (int64_t)(-1)) {
      int64_t v1 = vertex_perm[edges[i]];
      int64_t v2 = vertex_perm[edges[i + 1]];
      /* Sort these since otherwise the directions of the permuted edges would
       * give away the unscrambled vertex order. */
      edges[i] = (v1 < v2) ? v1 : v2;
      edges[i + 1] = (v1 < v2) ? v2 : v1;
    }
  }
#endif

  free(vertex_perm);

  /* Randomly mix up the order of the edges. */
  scramble_edges_shared(userseed1, userseed2, nedges, result);
}
Example #10
void run_graph_program(GraphProgram<T,U,V>* gp, Graph<V>& g, int iterations=1, struct run_graph_program_temp_structure<T,U,V>* rgpts=NULL) { //iterations = -1 ==> until convergence
  int it = 0;
  int converged = 1;

  unsigned long long int init_start = __rdtsc();

  auto act = gp->getActivity();

  SparseInVector<T>* px;
  SparseOutVector<U>* py;

  if (rgpts == NULL) {
    px  = new SparseInVector<T>(g.nvertices);
    py  = new SparseOutVector<U>(g.nvertices);
  }

  SparseInVector<T>&x = (rgpts==NULL)?(*px):*(rgpts->px);
  SparseOutVector<U>& y = (rgpts==NULL)?(*py):*(rgpts->py);

  #ifdef __TIMING
  printf("Nvertices = %d numints = %d \n", g.nvertices, y.numInts);
  #endif

  unsigned long long int start, end;
  int* start_vertex = new int[nthreads+1];

  //divide the numInts words of the bit vector among the threads:
  //start_vertex[tid] is the first vertex owned by thread tid (chosen so start_vertex[tid]/32 is a multiple of SIMD_WIDTH)
  start_vertex[nthreads] = g.nvertices;
  #pragma omp parallel num_threads(nthreads)
  {
    int tid = omp_get_thread_num();
    int ints_per_th = (y.numInts/nthreads)*32;
    int sv  = ints_per_th*tid;
    sv = (((sv/32)/4)*4)*32; //sv is multiple of 32 and sv/32 is a multiple of 4
    sv = (((sv/32)/SIMD_WIDTH)*SIMD_WIDTH)*32; //sv is multiple of 32 and sv/32 is a multiple of SIMD_WIDTH
    if (sv >= g.nvertices) sv = g.nvertices;
    if (sv == 0) sv = 0;
    start_vertex[tid] = sv;
  }

  unsigned long long int init_end = __rdtsc();
  #ifdef __TIMING
  printf("GraphMat init time = %f ms \n", (init_end-init_start)/(CPU_FREQ)*1e3);
  #endif

  while(1) {
    unsigned long long int iteration_start = __rdtsc();
    x.clear();
    y.clear();
    converged = 1;

    start = __rdtsc();

    //check active vector and set message vector
    int count = 0;
    #pragma omp parallel num_threads(nthreads) reduction(+:count)
    {
    int tid = omp_get_thread_num();
    for (int i = start_vertex[tid]; i < start_vertex[tid+1]; i++){
      if (g.active[i]) {
        T message;
        bool msg_opt = gp->send_message(g.vertexproperty[i], message);
        if (msg_opt) {
          x.set(i, message);
          count++;
        }
      }
    }
    }
    x.length = count;

    #ifdef __TIMING
    printf("x.length = %d \n", x.length);
    #endif
    end = __rdtsc();
    #ifdef __TIMING
    printf("Send message time = %.3f ms \n", (end-start)/(CPU_FREQ)*1e3);
    #endif

    start = __rdtsc();
    
    //do SpMV
    if (gp->getOrder() == OUT_EDGES) {

      SpMTSpV(g, gp, x, y);

    } else if (gp->getOrder() == IN_EDGES) {

      SpMSpV(g, gp, x, y);

    } else if (gp->getOrder() == ALL_EDGES) {

      SpMTSpV(g, gp, x, y);
      SpMSpV(g, gp, x, y);

    } else {
      printf("Unrecognized option \n");
      exit(1);
    }
    end = __rdtsc();
    #ifdef __TIMING
    printf("SPMV time = %.3f ms \n", (end-start)/(CPU_FREQ)*1e3);
    #endif
    
    start = __rdtsc();
    g.setAllInactive();

    //update state and activity and check for convergence if needed
    int nout = 0;
    int total_search = 0;
    converged = 1;
    #pragma omp parallel num_threads(nthreads) reduction(+:nout) reduction(&:converged) reduction(+:total_search) //schedule(static)
    {
      int zero = 0;
      SIMDINTTYPE xmm_zero = _MM_SET1(zero);
      int tid = omp_get_thread_num();
      int count_ones = 0;
    int end_of_numInts = start_vertex[tid+1]/32;
    if (tid == nthreads-1) end_of_numInts = y.numInts;
    for (int ii = start_vertex[tid]/32; ii < end_of_numInts; ii+=SIMD_WIDTH) {

      __m128i xmm_local_bitvec = _mm_loadu_si128((__m128i*)(y.bitvector + ii));
      __m128 xmm_cmp_mask = _mm_castsi128_ps(_mm_cmpeq_epi32((xmm_local_bitvec), (xmm_zero)));
      int mask_value_0 = _mm_movemask_ps(xmm_cmp_mask);
      if(mask_value_0 == 15)
      {
        continue;
      }
      for(int i = ii; i < ii+SIMD_WIDTH; i++)
      {
        unsigned int value = y.bitvector[i];
        while (value != 0) {
          int last_bit = _bit_scan_forward(value);
          int idx = i*32 + last_bit;

          V old_prop;
            old_prop = g.vertexproperty[idx];
      
          gp->apply(y.value[idx], g.vertexproperty[idx]);
          nout++;

            if (old_prop != g.vertexproperty[idx]) {
	      g.setActive(idx);
              count_ones++;
              converged = 0;
              total_search++;
            }

          value &= (~(1<<last_bit));
        }
      }
    }
    
    }
    if (act == ALL_VERTICES) {
      g.setAllActive();
    }

    #ifdef __TIMING
    printf("Number of vertices that changed state = %d \n", total_search);
    #endif

    end = __rdtsc();
    #ifdef __TIMING
    printf("Apply time = %.3f ms \n", (end-start)/(CPU_FREQ)*1e3);
    #endif
    
    gp->do_every_iteration(it);

    unsigned long long int iteration_end = __rdtsc();
    #ifdef __TIMING
    printf("Iteration %d :: %f msec :: updated %d vertices \n", it, (iteration_end-iteration_start)/(CPU_FREQ)*1e3, nout);
    #endif

    it++;
    if (it == iterations) {
      break;
    }
    if (iterations <= 0 && converged == 1) {
      break;
    }
  }

  unsigned long long int clear_start = __rdtsc();
  delete [] start_vertex;

  if (rgpts == NULL) {
    delete px;
    delete py;
  }

  unsigned long long int clear_end = __rdtsc();
  #ifdef __TIMING
  printf("GraphMat clear time = %f msec \n", (clear_end-clear_start)/(CPU_FREQ)*1e3);
  #endif

  printf("Completed %d iterations \n", it);

}
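The apply phase above skips empty words of the output bitvector with an SSE compare and then visits only the set bits of each remaining word by repeatedly taking the lowest set bit. A small stand-alone sketch of that bit-scanning loop, using GCC/Clang's __builtin_ctz in place of _bit_scan_forward (visit and scan_bits are placeholder names):

#include <stdint.h>

void visit(int idx);                               /* placeholder per-vertex callback */

/* Call visit(i) for the index of every set bit in an array of 32-bit words. */
void scan_bits(const uint32_t *bitvector, int num_words)
{
    for (int i = 0; i < num_words; i++) {
        uint32_t value = bitvector[i];
        while (value != 0) {
            int last_bit = __builtin_ctz(value);   /* position of the lowest set bit */
            visit(i * 32 + last_bit);
            value &= ~(1u << last_bit);            /* clear that bit and continue */
        }
    }
}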
Example #11
int Preprocessor::autoWork()
{
    buildWorkGroup();
    int groupNum = static_cast<int>(workPool.size());
    string tarSrc = para.auto_src, autoDst = para.auto_dst, sysDel = para.sys_del,
             tarDst = autoDst +  sysDel + "src", monSrc = tarDst, monDst = autoDst + sysDel + "montage",
             proSrc = monDst, proDst = autoDst + sysDel + "project", resSrc = monDst, resDst = autoDst + sysDel + "resize",
             tarPre = para.tar_file_pre, tarPost = para.tar_file_post, monInPre = para.mon_in_pre, monInPost = para.mon_in_post;
    u_int xBeg = para.x_beg, xEnd = para.x_end, yBeg = para.y_beg, yEnd = para.y_end, blockWidth = para.block_width, blockHeight = para.block_height;
    u_int monWidth = blockWidth * (xEnd - xBeg + 1);
    u_int monHeght = blockHeight * (yEnd - yBeg + 1);
    if(_access(autoDst.c_str(), 0) == -1){
        _mkdir(autoDst.c_str());
        _mkdir(tarDst.c_str());
        _mkdir(monDst.c_str());
        _mkdir(proDst.c_str());
        _mkdir(resDst.c_str());
    }else{
        if(_access(tarDst.c_str(), 0) == -1){
            _mkdir(tarDst.c_str());
        }
        if(_access(monDst.c_str(), 0) == -1){
            _mkdir(monDst.c_str());
        }
        if(_access(proDst.c_str(), 0) == -1){
            _mkdir(proDst.c_str());
        }
        if(_access(resDst.c_str(), 0) == -1){
            _mkdir(resDst.c_str());
        }
    }
#pragma omp parallel for num_threads(para.thread_num)
    for(int i = 0; i < groupNum; ++i){
        vector<unsigned int> &tmpGroup = workPool.at(i);
        int groupSize = static_cast<int>(tmpGroup.size());
        u_int proIndex = 1;
        string startSerial, endSerial;
        for(int j = 0; j < groupSize; ++j){
            if(proIndex > para.pro_thick){
                proIndex = 1;
            }
            string tmpSerial;
            stringstream tmpStream;   
            tmpStream<<setw(5)<<setfill('0')<<tmpGroup.at(j);
            tmpStream>>tmpSerial;
            tmpStream.clear();
            cv::Mat inImage, proImage, resImage, monImage, blockImage, inverseRowBlockImage;
            if(para.image_depth == 8){
                monImage = cv::Mat::zeros(monHeght, monWidth, CV_8UC1);
                inverseRowBlockImage = cv::Mat::zeros(blockHeight, blockWidth, CV_8UC1);
            }else if(para.image_depth == 16){
                monImage = cv::Mat::zeros(monHeght, monWidth, CV_16UC1);
                inverseRowBlockImage = cv::Mat::zeros(blockHeight, blockWidth, CV_16UC1);
            }
            if(proIndex == 1){
                startSerial = tmpSerial;
                if(para.image_depth == 8){
                    proImage = cv::Mat::zeros(monHeght, monWidth, CV_8UC1);
                }else if(para.image_depth == 16){
                    proImage = cv::Mat::zeros(monHeght, monWidth, CV_16UC1);
                }
            }
            
            string tarName = tarSrc + sysDel + tarPre + tmpSerial + tarPost;
            Tar tmpTar(tarName);
            bool tarFlag = tmpTar.untar(tarDst);
            if(tarFlag){
                cout<<tarName<<" "<<omp_get_thread_num()<<endl;
                for(u_int x = xBeg; x <= xEnd; ++x){
                    for(u_int y = yBeg; y <= yEnd; ++y){
                        string monInName;
                        tmpStream<<monSrc<<sysDel<<tmpSerial<<sysDel<<monInPre<<tmpSerial<<"_"<<setw(2)<<setfill('0')<<x<<"_"<<setw(2)<<setfill('0')<<y<<monInPost;
                        tmpStream>>monInName;
                        tmpStream.clear();
                        blockImage = cv::imread(monInName, CV_LOAD_IMAGE_UNCHANGED);
                        if(!blockImage.data){
                            cout<<"----------------------"<<endl;
                            cout<<"Image Loaded Error!"<<endl;
                            cout<<monInName<<endl;
                            cout<<"----------------------"<<endl;
                            if(para.image_depth == 8){
                                blockImage = cv::Mat::zeros(blockHeight, blockWidth, CV_8UC1);
                            }else if(para.image_depth == 16){
                                blockImage = cv::Mat::zeros(blockHeight, blockWidth, CV_16UC1);
                            }
                        }
                        for(int col = 0; col < blockImage.cols; ++col){
                            blockImage.col(col).copyTo(inverseRowBlockImage.col(blockImage.cols - col - 1));
                        }
                        cv::Rect blockRoi((x - xBeg) * blockWidth, (y - yBeg) * blockHeight, blockWidth, blockHeight);
                        inverseRowBlockImage.copyTo(monImage(blockRoi));
                    }  
                }
                string monOutName = monDst + sysDel + para.mon_out_pre + tmpSerial + para.mon_out_post;
                string resOutName = resDst + sysDel + para.res_out_pre + tmpSerial + para.res_out_post;
                cv::imwrite(monOutName, monImage);
                cout<<monOutName<<" "<<omp_get_thread_num()<<endl;
                cv::resize(monImage, resImage, cv::Size(), para.res_fx, para.res_fy, cv::INTER_AREA);
                cv::imwrite(resOutName, resImage);
                cout<<resOutName<<" "<<omp_get_thread_num()<<endl;
                proImage = cv::max(monImage, proImage);
                if(proIndex == para.pro_thick || j == groupSize - 1){
                    endSerial = tmpSerial;
                    string proOutName = proDst + sysDel + para.pro_out_pre + startSerial + "-" + endSerial + para.pro_out_post;
                    cv::imwrite(proOutName, proImage);
                    cout<<proOutName<<" "<<omp_get_thread_num()<<endl;
                }
                
            }else{
                cout<<"Tar Error!"<<endl;
            }
            ++proIndex;
        }
    }
    return 0;
}
Example #12
int Preprocessor::montage(){
    string src = para.mon_src;
    string dst = para.mon_dst;
    string sys_del = para.sys_del;
    unsigned int serial_beg = para.serial_beg, serial_end = para.serial_end, 
                        x_beg = para.x_beg, x_end = para.x_end, 
                        y_beg = para.y_beg, y_end = para.y_end, 
                        block_width = para.block_width, block_height = para.block_height, serial_bits = para.serial_bits,
                        thread_num = para.thread_num;
    unsigned int image_depth = para.image_depth;
    string image_pre = para.mon_in_pre, image_post = para.mon_in_post;
    string out_pre = para.mon_out_pre, out_post = para.mon_out_post; 
#pragma omp parallel for num_threads(thread_num) 
    for(int serial_num = int(serial_beg); serial_num <= int(serial_end); ++ serial_num){
        stringstream string_buffer;
        string serial_string("");
        cv::Mat out_image, block_image, re_block_image;
        cout<<serial_num<<": "<<omp_get_thread_num()<<endl;
        if(image_depth ==  16){
            out_image = cv::Mat((y_end - y_beg + 1) * block_height, (x_end - x_beg + 1) * block_width, CV_16UC1, cv::Scalar(0, 0, 0));
            //cout<<out_image.rows<<" "<<out_image.cols<<" "<<out_image.depth()<<endl;
            block_image = cv::Mat(block_height, block_width, CV_16UC1, cv::Scalar(0, 0, 0));
            re_block_image = cv::Mat(block_height, block_width, CV_16UC1, cv::Scalar(0, 0, 0));        
        }else{
            out_image = cv::Mat((y_end - y_beg + 1) * block_height, (x_end - x_beg + 1) * block_width, CV_8UC1, cv::Scalar(0, 0, 0));
            //cout<<out_image.rows<<" "<<out_image.cols<<" "<<out_image.depth()<<endl;
            block_image = cv::Mat(block_height, block_width, CV_8UC1, cv::Scalar(0, 0, 0));
            re_block_image = cv::Mat(block_height, block_width, CV_8UC1, cv::Scalar(0, 0, 0));    
        
        }
        string x_str, y_str, image_str, out_image_name;       
        string_buffer<<setw(serial_bits)<<setfill('0')<<serial_num;    
        string_buffer>>serial_string;
        string_buffer.clear();
        //cout<<serial_string<<endl;
        for(unsigned int x_in = x_beg; x_in <= x_end; ++ x_in){
            for(unsigned int y_in = y_beg; y_in <= y_end; ++ y_in){
                string_buffer<<setw(2)<<setfill('0')<<x_in;
                string_buffer>>x_str;
                string_buffer.clear();
                string_buffer<<setw(2)<<setfill('0')<<y_in;
                string_buffer>>y_str;
                string_buffer.clear();
                image_str = src + sys_del + serial_string + sys_del +image_pre + serial_string + "_" + x_str + "_" + y_str + image_post;
                block_image = cv::imread(image_str.c_str(), CV_LOAD_IMAGE_UNCHANGED);
                if(!block_image.data){
                    cout<<"----------------------"<<endl;
                    cout<<"Image Load Error!"<<endl;
                    cout<<image_str<<endl;
                    cout<<"----------------------"<<endl;
                    if(image_depth == 8){
                        block_image = cv::Mat::zeros(block_height, block_width, CV_8UC1);
                    }else if(image_depth == 16){
                        block_image = cv::Mat::zeros(block_height, block_width, CV_16UC1);
                    }
                    
                }
                for(int y_block = 0; y_block < block_image.cols; ++ y_block){
                    block_image.col(y_block).copyTo(re_block_image.col(block_image.cols - y_block - 1));       
                }
                cv::Rect sub_roi((x_in - x_beg) * block_width, (y_in - y_beg) * block_height, block_width, block_height);                 
                cv::Mat sub_image(out_image, sub_roi);
                re_block_image.clone().copyTo(sub_image);
            }
                        
        }
        
        out_image_name = dst + sys_del + out_pre + serial_string + out_post;
        cout<<out_image_name<<endl;
        cv::imwrite(out_image_name, out_image);            
    }
    
    
    
    return 0;
} 
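The montage() example above works because everything that changes per serial number — the stringstream, the cv::Mat buffers, the file names — is declared inside the body of the #pragma omp parallel for loop and is therefore private to each iteration; only the read-only parameters are shared. Below is a minimal, self-contained sketch of that per-iteration-private pattern (OpenCV is left out and the file-name formatting stands in for the real image work; it assumes a compiler built with OpenMP support):

#include <iomanip>
#include <iostream>
#include <sstream>
#include <omp.h>

int main() {
    const int serial_beg = 1, serial_end = 8, serial_bits = 4;

#pragma omp parallel for num_threads(4)
    for (int serial_num = serial_beg; serial_num <= serial_end; ++serial_num) {
        // Everything declared inside the loop body is private to the
        // iteration, and therefore to the thread executing it.
        std::ostringstream name;
        name << "mon_" << std::setw(serial_bits) << std::setfill('0')
             << serial_num << ".tif";

        // Stand-in for the per-serial image work done in montage().
        #pragma omp critical
        std::cout << name.str() << " handled by thread "
                  << omp_get_thread_num() << std::endl;
    }
    return 0;
}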
Example #13
void Solver::TripleStates_Parallel(){
    // %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
    // %%%%%%%%%%%%%%% SETTING UP K_h AND K_pph %%%%%%%%%%%%%%%%%%%%%%%
    // %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
    Kh = zeros<mat>(0,2); Khpp = zeros<mat>(0,4);

    int n=0;
    for (int i=0; i<Nholes; i++){

        int Nx = basis.States(i,1); // Combining x-momentum
        int Ny = basis.States(i,2); // Combining y-momentum
        int Nz = basis.States(i,3); // Combining z-momentum
        int Sz = basis.States(i,4); // Combining spin

        // Adding a new one-hole-state configuration to the matrix. (i, Identifier)
        Kh.insert_rows(n,1);
        Kh(n,0) = i; Kh(n,1) = Identifier(Nx,Ny,Nz,Sz);
        n++;
    }

    #pragma omp parallel
    {
        int id = omp_get_thread_num();
        int nthreads = omp_get_num_threads();

        // Setting up size for the partial Holes matrix. This size is more deeply explained in the thesis.
        int size = floor( Nparticles/nthreads) * Nholes*(Nparticles-1);
        if ( id < Nparticles%nthreads) size += Nholes*(Nparticles-1);

        mat partialStates = zeros<mat>(size,4);

        int n=0;
        for (int aa=id; aa<Nparticles; aa += nthreads){
            for (int i=0; i<Nholes; i++){
                for (int bb=0; bb<Nparticles; bb++){

                    if (aa != bb){
                        int a=aa+Nholes; int b=bb+Nholes;
                        int Nx = basis.States(a,1) + basis.States(b,1) - basis.States(i,1);
                        int Ny = basis.States(a,2) + basis.States(b,2) - basis.States(i,2);
                        int Nz = basis.States(a,3) + basis.States(b,3) - basis.States(i,3);
                        int Sz = basis.States(a,4) + basis.States(b,4) - basis.States(i,4);

                        partialStates(n,0) = i; partialStates(n,1) = a; partialStates(n,2) = b; partialStates(n,3) = Identifier(Nx,Ny,Nz,Sz);
                        n++;
                    }
                }
            }
        }
        #pragma omp critical
        {
            Khpp.insert_rows(0,partialStates);
        }
    }

    NKh3 = Khpp.n_rows; NKh = Kh.n_rows;

    // %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
    // %%%%%%%%%%%%%%%%% SETTING UP K_p AND K_phh STATES %%%%%%%%%%%%%%%%%%%%%%
    // %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

    Kp = zeros<mat>(0,2); Kphh = zeros<mat>(0,4);
    n=0;
    for (int aa=0; aa<Nparticles; aa++){
        int a = aa+Nholes;

        int Nx = basis.States(a,1); // Combining x-momentum
        int Ny = basis.States(a,2); // Combining y-momentum
        int Nz = basis.States(a,3); // Combining z-momentum
        int Sz = basis.States(a,4); // Combining spin

        // Adding a new one-particle-state configuration to the matrix. (a, Identifier)
        Kp.insert_rows(n,1);
        Kp(n,0) = a; Kp(n,1) = Identifier(Nx,Ny,Nz,Sz);
        n++;
    }

    #pragma omp parallel
    {
        int id = omp_get_thread_num();
        int nthreads = omp_get_num_threads();

        // Setting up size for the partial Holes matrix. This size is more deeply explained in the thesis.
        int size = floor( Nholes/nthreads) * Nparticles*(Nholes-1);
        if ( id < Nholes%nthreads) size += Nparticles*(Nholes-1);

        mat partialStates = zeros<mat>(size,4);
        int n=0;
        for (int i=id; i<Nholes; i+=nthreads){
            for (int j=0; j<Nholes; j++){
                for (int aa=0; aa<Nparticles; aa++){

                    if (i != j){
                        int a=aa+Nholes;
                        int Nx = basis.States(i,1) + basis.States(j,1) - basis.States(a,1);
                        int Ny = basis.States(i,2) + basis.States(j,2) - basis.States(a,2);
                        int Nz = basis.States(i,3) + basis.States(j,3) - basis.States(a,3);
                        int Sz = basis.States(i,4) + basis.States(j,4) - basis.States(a,4);

                        partialStates(n,0) = a; partialStates(n,1) = i; partialStates(n,2) = j; partialStates(n,3) = Identifier(Nx,Ny,Nz,Sz);
                        n++;
                    }
                }
            }
        }
        #pragma omp critical
        {
            Kphh.insert_rows(0,partialStates);
        }
    }

    NKp3 = Kphh.n_rows; NKp = Kp.n_rows;

}
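TripleStates_Parallel() has each thread build its own pre-sized partialStates block over a cyclic slice of the loop (aa = id, id + nthreads, ...) and only touches the shared matrix once, inside #pragma omp critical. A stripped-down sketch of that partition-then-merge idiom, with std::vector standing in for the Armadillo matrices and a trivial computation standing in for Identifier():

#include <cstdio>
#include <vector>
#include <omp.h>

int main() {
    const int N = 100;
    std::vector<int> shared_rows;                 // plays the role of Khpp / Kphh

#pragma omp parallel
    {
        int id = omp_get_thread_num();
        int nthreads = omp_get_num_threads();

        // Each thread collects its rows locally, so the inner loops run
        // without any locking.
        std::vector<int> partial;
        for (int i = id; i < N; i += nthreads)    // cyclic distribution over threads
            partial.push_back(i * i);             // stand-in for Identifier(...)

        // One merge per thread; block order is unspecified, just as with
        // insert_rows(0, partialStates) in the code above.
        #pragma omp critical
        shared_rows.insert(shared_rows.end(), partial.begin(), partial.end());
    }

    std::printf("rows collected: %zu\n", shared_rows.size());
    return 0;
}

Because each thread merges a whole block at a time, the critical section is entered only once per thread rather than once per row.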
Example #14
void Solver::DirectStates_Parallel(){

    // %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
    // %%%%%%%%%%%% SETTING UP DIRECT STATES %%%%%%%%%%%%%%%%%%%%%%%
    // %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

    Holes = zeros<mat>(0,3);

    #pragma omp parallel
    {
        int id = omp_get_thread_num();
        int nthreads = omp_get_num_threads();

        // Setting up size for the partial Holes matrix. This size is more deeply explained in the thesis.
        int size = floor( Nholes/nthreads) * (Nholes-1);
        if ( id < Nholes%nthreads) size += Nholes - 1;

        mat partialStates = zeros<mat>(size,3);

        int n=0; // n will count how many two-state combinations we find. Used as an index into the matrix
        for (int i=id; i<Nholes; i += nthreads){
            for (int j=0; j<Nholes; j++){

                if (i != j){ // Pauli principle demands that the particles must be unequal

                    // Setting up direct channels for holes

                    // Two-hole momentum and spin
                    int Nx = basis.States(i,1) + basis.States(j,1); // Combining x-momentum
                    int Ny = basis.States(i,2) + basis.States(j,2); // Combining y-momentum
                    int Nz = basis.States(i,3) + basis.States(j,3); // Combining z-momentum
                    int Sz = basis.States(i,4) + basis.States(j,4); // Combining spin

                    // Adding a new two-hole-state configuration to matrix. (i, j, Identifier)
                    partialStates(n,0) = i; partialStates(n,1) = j; partialStates(n,2) = Identifier(Nx,Ny,Nz,Sz);

                    n++;
                }
            }
        }
        #pragma omp critical
        Holes.insert_rows(0,partialStates);
    }


    Particles = zeros<mat>(0,3);

    #pragma omp parallel
    {
        int id = omp_get_thread_num();
        int nthreads = omp_get_num_threads();

        // Setting up size for the partial Holes matrix. This size is more deeply explained in the thesis.
        int size = floor( Nparticles/nthreads) * (Nparticles-1);
        if ( id < Nparticles%nthreads) size += Nparticles - 1;

        mat partialStates = zeros<mat>(size,3);

        int n=0; // n will count how many two-state combinations we find. Used as an index into the matrix
        for (int aa=id; aa<Nparticles; aa+=nthreads){
            for (int bb=0; bb<Nparticles; bb++){

                if (aa != bb){
                    int a=aa+Nholes; int b=bb+Nholes;

                    int Nx = basis.States(a,1) + basis.States(b,1);
                    int Ny = basis.States(a,2) + basis.States(b,2);
                    int Nz = basis.States(a,3) + basis.States(b,3);
                    int Sz = basis.States(a,4) + basis.States(b,4);

                    partialStates(n,0) = a; partialStates(n,1) = b; partialStates(n,2) = Identifier(Nx,Ny,Nz,Sz);
                    n++;
                }
            }
        }
        #pragma omp critical
        Particles.insert_rows(0,partialStates);
    }
    NPARTICLES = Particles.n_rows;
    NHOLES = Holes.n_rows;
}
Example #15
int blake2sp( void *out, size_t outlen, const void *in, size_t inlen, const void *key, size_t keylen )
{
  uint8_t hash[PARALLELISM_DEGREE][BLAKE2S_OUTBYTES];
  blake2s_state S[PARALLELISM_DEGREE][1];
  blake2s_state FS[1];
  size_t i;

  /* Verify parameters */
  if ( NULL == in && inlen > 0 ) return -1;

  if ( NULL == out ) return -1;

  if ( NULL == key && keylen > 0) return -1;

  if( !outlen || outlen > BLAKE2S_OUTBYTES ) return -1;

  if( keylen > BLAKE2S_KEYBYTES ) return -1;

  for( i = 0; i < PARALLELISM_DEGREE; ++i )
    if( blake2sp_init_leaf( S[i], outlen, keylen, i ) < 0 ) return -1;

  S[PARALLELISM_DEGREE - 1]->last_node = 1; /* mark last node */

  if( keylen > 0 )
  {
    uint8_t block[BLAKE2S_BLOCKBYTES];
    memset( block, 0, BLAKE2S_BLOCKBYTES );
    memcpy( block, key, keylen );

    for( i = 0; i < PARALLELISM_DEGREE; ++i )
      blake2s_update( S[i], block, BLAKE2S_BLOCKBYTES );

    secure_zero_memory( block, BLAKE2S_BLOCKBYTES ); /* Burn the key from stack */
  }

#if defined(_OPENMP)
  #pragma omp parallel shared(S,hash), num_threads(PARALLELISM_DEGREE)
#else

  for( i = 0; i < PARALLELISM_DEGREE; ++i )
#endif
  {
#if defined(_OPENMP)
    size_t      i = omp_get_thread_num();
#endif
    size_t inlen__ = inlen;
    const unsigned char *in__ = ( const unsigned char * )in;
    in__ += i * BLAKE2S_BLOCKBYTES;

    while( inlen__ >= PARALLELISM_DEGREE * BLAKE2S_BLOCKBYTES )
    {
      blake2s_update( S[i], in__, BLAKE2S_BLOCKBYTES );
      in__ += PARALLELISM_DEGREE * BLAKE2S_BLOCKBYTES;
      inlen__ -= PARALLELISM_DEGREE * BLAKE2S_BLOCKBYTES;
    }

    if( inlen__ > i * BLAKE2S_BLOCKBYTES )
    {
      const size_t left = inlen__ - i * BLAKE2S_BLOCKBYTES;
      const size_t len = left <= BLAKE2S_BLOCKBYTES ? left : BLAKE2S_BLOCKBYTES;
      blake2s_update( S[i], in__, len );
    }

    blake2s_final( S[i], hash[i], BLAKE2S_OUTBYTES );
  }

  if( blake2sp_init_root( FS, outlen, keylen ) < 0 )
    return -1;

  FS->last_node = 1;

  for( i = 0; i < PARALLELISM_DEGREE; ++i )
    blake2s_update( FS, hash[i], BLAKE2S_OUTBYTES );

  return blake2s_final( FS, out, outlen );
}
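blake2sp() reuses one block of code for two build modes: with _OPENMP defined the block is the body of a parallel region and the lane index i comes from omp_get_thread_num(); without it, the same block becomes the body of an ordinary for loop over i. A minimal sketch of that preprocessor idiom, independent of BLAKE2 (PARALLELISM_DEGREE and the per-lane work here are placeholders):

#include <cstddef>
#include <cstdio>
#if defined(_OPENMP)
#include <omp.h>
#endif

#define PARALLELISM_DEGREE 4

int main() {
    int partial[PARALLELISM_DEGREE];

#if defined(_OPENMP)
    /* OpenMP build: the block below is a parallel region, one thread per lane. */
    #pragma omp parallel num_threads(PARALLELISM_DEGREE)
#else
    /* Serial build: exactly the same block becomes the body of this loop. */
    for (std::size_t i = 0; i < PARALLELISM_DEGREE; ++i)
#endif
    {
#if defined(_OPENMP)
        std::size_t i = omp_get_thread_num();
#endif
        partial[i] = static_cast<int>(i) * 10;   /* stand-in for hashing lane i */
    }

    for (int j = 0; j < PARALLELISM_DEGREE; ++j)
        std::printf("lane %d -> %d\n", j, partial[j]);
    return 0;
}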
Example #16
inline void K_point::generate_fv_states()
{
    PROFILE_WITH_TIMER("sirius::K_point::generate_fv_states");
    
    if (!ctx_.full_potential()) {
        return;
    }

    mdarray<double_complex, 2> pw_coeffs;
    mdarray<double_complex, 2> mt_coeffs;
    
    int nbnd_loc;
    /* in both cases eigen-vectors are redistributed to the same "full column" storage */
    if (ctx_.iterative_solver_input_section().type_ == "exact") {
        fv_eigen_vectors_->remap_forward(0, ctx_.num_fv_states());
        /* local number of bands */
        nbnd_loc = fv_eigen_vectors_->spl_num_col().local_size();
        
        if (nbnd_loc) {
            pw_coeffs = mdarray<double_complex, 2>(fv_eigen_vectors_->extra().at<CPU>(), gklo_basis_size(), nbnd_loc);
            mt_coeffs = mdarray<double_complex, 2>(fv_eigen_vectors_->extra().at<CPU>(num_gkvec(), 0), gklo_basis_size(), nbnd_loc);
        }

    } else {
        fv_eigen_vectors_slab_->remap_to_full_column_distr(ctx_.num_fv_states());
        assert(fv_eigen_vectors_slab_->pw_coeffs().spl_num_col().local_size() ==
               fv_eigen_vectors_slab_->mt_coeffs().spl_num_col().local_size());
        /* local number of bands */
        nbnd_loc = fv_eigen_vectors_slab_->pw_coeffs().spl_num_col().local_size();
        if (nbnd_loc) {
            pw_coeffs = mdarray<double_complex, 2>(fv_eigen_vectors_slab_->pw_coeffs().extra().at<CPU>(), num_gkvec(), nbnd_loc);
            mt_coeffs = mdarray<double_complex, 2>(fv_eigen_vectors_slab_->mt_coeffs().extra().at<CPU>(), unit_cell_.mt_lo_basis_size(), nbnd_loc);
        }
    }

    #ifdef __GPU
    if (ctx_.processing_unit() == GPU) {
        pw_coeffs.allocate(memory_t::device);
        pw_coeffs.copy_to_device();
    }
    #endif

    fv_states().prepare_full_column_distr(ctx_.num_fv_states());

    assert(nbnd_loc == fv_states().pw_coeffs().spl_num_col().local_size());
    assert(nbnd_loc == fv_states().mt_coeffs().spl_num_col().local_size());

    #pragma omp parallel
    {
        /* get thread id */
        #ifdef __GPU
        int tid = omp_get_thread_num();
        #endif
        mdarray<double_complex, 2> alm(num_gkvec(), unit_cell_.max_mt_aw_basis_size(), memory_t::host_pinned);
        mdarray<double_complex, 2> tmp;

        #ifdef __GPU
        if (ctx_.processing_unit() == GPU) {
            alm.allocate(memory_t::device);
            tmp = mdarray<double_complex, 2>(unit_cell_.max_mt_aw_basis_size(), nbnd_loc, memory_t::device);
        }
        #endif
        
        #pragma omp for
        for (int ia = 0; ia < unit_cell_.num_atoms(); ia++) {
            /* number of alm coefficients for atom */
            int mt_aw_size = unit_cell_.atom(ia).mt_aw_basis_size();
            /* offset in wave-function */
            int offset_wf = unit_cell_.atom(ia).offset_mt_coeffs();
            /* generate matching coefficients for all G-vectors */
            alm_coeffs_->generate(ia, alm);
            
            /* compute F(lm, i) = A(lm, G)^{T} * evec(G, i) for a single atom */
            if (ctx_.processing_unit() == CPU) {
                /* multiply eigen-vectors and matching coefficients */
                linalg<CPU>::gemm(1, 0, mt_aw_size, nbnd_loc, num_gkvec(),
                                  alm.at<CPU>(), alm.ld(),
                                  pw_coeffs.at<CPU>(), pw_coeffs.ld(),
                                  fv_states().mt_coeffs().extra().at<CPU>(offset_wf, 0), fv_states().mt_coeffs().extra().ld());
            }
            #ifdef __GPU
            if (ctx_.processing_unit() == GPU) {
                /* multiply eigen-vectors and matching coefficients */
                alm.async_copy_to_device(tid);
                linalg<GPU>::gemm(1, 0, mt_aw_size, nbnd_loc, num_gkvec(),
                                  alm.at<GPU>(), alm.ld(),
                                  pw_coeffs.at<GPU>(), pw_coeffs.ld(),
                                  tmp.at<GPU>(), tmp.ld(),
                                  tid);
                acc::copyout(fv_states().mt_coeffs().extra().at<CPU>(offset_wf, 0), fv_states().mt_coeffs().extra().ld(),
                             tmp.at<GPU>(), tmp.ld(),
                             mt_aw_size, nbnd_loc, tid);
                acc::sync_stream(tid);
            }
            #endif

            for (int i = 0; i < nbnd_loc; i++) {
                /* lo block */
                std::memcpy(fv_states().mt_coeffs().extra().at<CPU>(offset_wf + mt_aw_size, i),
                            mt_coeffs.at<CPU>(unit_cell_.atom(ia).offset_lo(), i),
                            unit_cell_.atom(ia).mt_lo_basis_size() * sizeof(double_complex));
            }
        }
        #pragma omp for
        for (int i = 0; i < nbnd_loc; i++) {
            /* G+k block */
            std::memcpy(fv_states().pw_coeffs().extra().at<CPU>(0, i),
                        pw_coeffs.at<CPU>(0, i), num_gkvec() * sizeof(double_complex));
        }
    }

    fv_states().remap_to_prime_distr(ctx_.num_fv_states());
}
Example #17
int main(int argc,char **argv)
{
    PetscErrorCode ierr;
    PetscInt       i,j,k,N=100,**counters,tsize;

    PetscInitialize(&argc,&argv,(char *)0,help);

    ierr = PetscThreadCommView(PETSC_COMM_WORLD,PETSC_VIEWER_STDOUT_WORLD);
    CHKERRQ(ierr);
    ierr = PetscOptionsGetInt(PETSC_NULL,"-N",&N,PETSC_NULL);
    CHKERRQ(ierr);

    ierr = PetscThreadCommGetNThreads(PETSC_COMM_WORLD,&tsize);
    CHKERRQ(ierr);
    ierr = PetscMalloc(tsize*sizeof(*counters),&counters);
    CHKERRQ(ierr);
    ierr = PetscThreadCommRunKernel(PETSC_COMM_WORLD,(PetscThreadKernel)CounterInit_kernel,1,counters);
    CHKERRQ(ierr);

    for (i=0; i<10; i++) {
        PetscReal t0,t1;
        ierr = PetscThreadCommBarrier(PETSC_COMM_WORLD);
        CHKERRQ(ierr);
        ierr = PetscGetTime(&t0);
        CHKERRQ(ierr);
        for (j=0; j<N; j++) {
            /*      ierr = PetscThreadCommRunKernel(PETSC_COMM_WORLD,(PetscThreadKernel)CounterIncrement_kernel,1,counters);CHKERRQ(ierr); */
            ierr = PetscThreadCommRunKernel1(PETSC_COMM_WORLD,(PetscThreadKernel)CounterIncrement_kernel,counters);
            CHKERRQ(ierr);
        }
        ierr = PetscThreadCommBarrier(PETSC_COMM_WORLD);
        CHKERRQ(ierr);
        ierr = PetscGetTime(&t1);
        CHKERRQ(ierr);
        ierr = PetscPrintf(PETSC_COMM_WORLD,"Time per kernel: %g us\n",1e6*(t1-t0)/N);
        CHKERRQ(ierr);
    }

    for (i=0; i<10; i++) {
        PetscReal t0,t1;
        ierr = PetscThreadCommBarrier(PETSC_COMM_WORLD);
        CHKERRQ(ierr);
        ierr = PetscGetTime(&t0);
        CHKERRQ(ierr);
        for (j=0; j<N; j++) {
            #pragma omp parallel num_threads(tsize)
            {
                PetscInt trank = omp_get_thread_num();
                CounterIncrement_kernel(trank,counters);
            }
        }
        ierr = PetscThreadCommBarrier(PETSC_COMM_WORLD);
        CHKERRQ(ierr);
        ierr = PetscGetTime(&t1);
        CHKERRQ(ierr);
        ierr = PetscPrintf(PETSC_COMM_WORLD,"OpenMP inline time per kernel: %g us\n",1e6*(t1-t0)/N);
        CHKERRQ(ierr);
    }

    for (i=0; i<10; i++) {
        PetscReal t0,t1;
        ierr = PetscGetTime(&t0);
        CHKERRQ(ierr);
        for (j=0; j<N; j++) {
            CounterIncrement_kernel(0,counters);
        }
        ierr = PetscGetTime(&t1);
        CHKERRQ(ierr);
        ierr = PetscPrintf(PETSC_COMM_WORLD,"Serial inline time per single kernel: %g us\n",1e6*(t1-t0)/N);
        CHKERRQ(ierr);
    }

    for (i=0; i<10; i++) {
        PetscReal t0,t1;
        ierr = PetscGetTime(&t0);
        CHKERRQ(ierr);
        for (j=0; j<N; j++) {
            for (k=0; k<tsize; k++) CounterIncrement_kernel(k,counters);
        }
        ierr = PetscGetTime(&t1);
        CHKERRQ(ierr);
        ierr = PetscPrintf(PETSC_COMM_WORLD,"Serial inline time per kernel: %g us\n",1e6*(t1-t0)/N);
        CHKERRQ(ierr);
    }

    ierr = PetscThreadCommRunKernel(PETSC_COMM_WORLD,(PetscThreadKernel)CounterFree_kernel,1,counters);
    CHKERRQ(ierr);
    ierr = PetscThreadCommBarrier(PETSC_COMM_WORLD);
    CHKERRQ(ierr);
    ierr = PetscFree(counters);
    CHKERRQ(ierr);
    PetscFinalize();
    return 0;
}
Example #18
/**
 * @brief Computes the total source (fission and scattering) in each FSR.
 * @details This method computes the total source in each FSR based on
 *          this iteration's current approximation to the scalar flux. A
 *          residual for the source with respect to the source computed on
 *          the previous iteration is computed and returned. The residual
 *          is determined as follows:
 *          \f$ res = \sqrt{\frac{\displaystyle\sum \displaystyle\sum
 *                    \left(\frac{Q^i - Q^{i-1}}{Q^i}\right)^2}{\# FSRs}} \f$
 *
 * @return the residual between this source and the previous source
 */
FP_PRECISION CPUSolver::computeFSRSources() {

  int tid;
  Material* material;
  FP_PRECISION scatter_source;
  FP_PRECISION fission_source;
  FP_PRECISION fsr_fission_source;
  FP_PRECISION* nu_sigma_f;
  FP_PRECISION* sigma_s;
  FP_PRECISION* sigma_t;
  FP_PRECISION* chi;

  FP_PRECISION source_residual = 0.0;

  FP_PRECISION inverse_k_eff = 1.0 / _k_eff;

  /* For all FSRs, find the source */
  #pragma omp parallel for private(tid, material, nu_sigma_f, chi, \
    sigma_s, sigma_t, fission_source, scatter_source, fsr_fission_source) \
    schedule(guided)
  for (int r=0; r < _num_FSRs; r++) {

    tid = omp_get_thread_num();
    material = _FSR_materials[r];
    nu_sigma_f = material->getNuSigmaF();
    chi = material->getChi();
    sigma_s = material->getSigmaS();
    sigma_t = material->getSigmaT();

    /* Initialize the source residual to zero */
    _source_residuals[r] = 0.;
    fsr_fission_source = 0.0;

    /* Compute fission source for each group */
    if (material->isFissionable()) {
      for (int e=0; e < _num_groups; e++)
        _fission_sources(r,e) = _scalar_flux(r,e) * nu_sigma_f[e];

      fission_source = pairwise_sum<FP_PRECISION>(&_fission_sources(r,0),
                                                  _num_groups);
      fission_source *= inverse_k_eff;
    }

    else
      fission_source = 0.0;

    /* Compute total scattering source for group G */
    for (int G=0; G < _num_groups; G++) {
      scatter_source = 0;

      for (int g=0; g < _num_groups; g++)
        _scatter_sources(tid,g) = material->getSigmaSByGroupInline(g,G)
                      * _scalar_flux(r,g);

      scatter_source=pairwise_sum<FP_PRECISION>(&_scatter_sources(tid,0),
                                                _num_groups);

      /* Set the fission source for FSR r in group G */
      fsr_fission_source += fission_source * chi[G];

      /* Set the reduced source for FSR r in group G */
      _reduced_sources(r,G) = (fission_source * chi[G] + scatter_source) *
                      ONE_OVER_FOUR_PI / sigma_t[G];
    }

    /* Compute the norm of residual of the source in the FSR */
    if (fsr_fission_source > 0.0)
      _source_residuals[r] = pow((fsr_fission_source - _old_fission_sources[r])
                                 / fsr_fission_source, 2);

    /* Update the old source */
    _old_fission_sources[r] = fsr_fission_source;
  }

  /* Sum up the residuals from each FSR */
  source_residual = pairwise_sum<FP_PRECISION>(_source_residuals, _num_FSRs);
  source_residual = sqrt(source_residual \
                         / (_num_fissionable_FSRs * _num_groups));

  return source_residual;
}
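The OpenMP detail worth noting in computeFSRSources() is _scatter_sources(tid,g): each thread writes into its own row of a shared scratch array, selected by omp_get_thread_num(), so the inner group loop needs no locking. A self-contained sketch of that per-thread scratch-row pattern (the array sizes and the arithmetic are made up for illustration):

#include <cstdio>
#include <vector>
#include <omp.h>

int main() {
    const int num_rows = 1000, num_groups = 8;
    const int num_threads = omp_get_max_threads();

    // One scratch row per thread, stored as a flat [thread][group] array,
    // loosely mirroring the _scatter_sources(tid,g) idea.
    std::vector<double> scratch(num_threads * num_groups);
    std::vector<double> row_sum(num_rows);

#pragma omp parallel for schedule(guided)
    for (int r = 0; r < num_rows; ++r) {
        double* my_row = &scratch[omp_get_thread_num() * num_groups];

        for (int g = 0; g < num_groups; ++g)
            my_row[g] = 0.5 * (r + 1) + g;        // stand-in for sigma_s * flux

        double sum = 0.0;                         // stand-in for pairwise_sum
        for (int g = 0; g < num_groups; ++g)
            sum += my_row[g];
        row_sum[r] = sum;
    }

    std::printf("row_sum[0] = %g, row_sum[%d] = %g\n",
                row_sum[0], num_rows - 1, row_sum[num_rows - 1]);
    return 0;
}

Adjacent rows of such a flat scratch array can still share a cache line; that is the false-sharing effect demonstrated by the sections example later in this collection, and padding each row to a cache line is the usual remedy.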
Example #19
int main(int argc, char** argv) {


    set_program_options(opts, argc, argv);
///Load the appropriate adjacency structure, selected from the command line
    if (opts.topologia == TORO_2D) 
        topologia = adiacenza_toroidal_lattice(opts.lato);        
    else if (opts.topologia == LINEARE){
        opts.partition_type = LINEAR_PARTITION;
        topologia = adiacenza_simple_line(opts.seq_len);
    }
    else {
        printf("Not supported topology\n");
        exit(1);
    }
    opts.seq_len = topologia.N;
    
    //logarithm lookup table, 6x program speedup
    mylog = new double[3 * opts.seq_len + 10];
    for (int i = 1; i < 3 * opts.seq_len + 10; i++)
        mylog[i] = log(i);
    mylog[0] = 0;
    myexp = new double[100];
    for (int i = 0; i < 100; i++)
        myexp[i] = exp(- opts.beta[0] * i);

    double media_globale = 0;
    double media_globale_n2 = 0;
    double media_rid_globale = 0;
    double media_rid_globale_n2 = 0;
    int n_estrazioni = 100;
    int runs = 0;

    
    // <editor-fold defaultstate="collapsed" desc="One-dimensional sequences">
    if (opts.topologia == LINEARE)
#pragma omp parallel
    {
        linear_partition *partitions = new linear_partition[opts.n_seq];
        int *buf_sequenze = new int[opts.n_seq * opts.seq_len];
        distance d(opts.seq_len);
        RandMT generatore;

        for (int L = 0; L < n_estrazioni; L++) {
            // Generate a new random J_ij vector
            // and opts.n_seq sequences that share that J

            double media_locale = 0;
            double media_locale_n2 = 0;
            double media_rid_locale = 0;
            double media_rid_locale_n2 = 0;
            ising_entries_jnorm(opts, buf_sequenze, generatore);

            //fill the partitions from the given sequences
            for (int i = 0; i < opts.n_seq; i++)
                partitions[i].fill(&buf_sequenze[i * opts.seq_len], opts.seq_len);


            //average distance over all pairs of generated sequences
            //#pragma omp parallel for firstprivate(d) schedule(dynamic,10) reduction(+: media_n, media_n2)
            for (int i = 0; i < opts.n_seq; i++) {
                for (int j = i + 1; j < opts.n_seq; j++) {
                    d.dist(partitions[i], partitions[j]);
                    media_locale += d.dist_shan;
                    media_rid_locale += d.dist_shan_r;
                    media_locale_n2 += (d.dist_shan)*(d.dist_shan);
                    media_rid_locale_n2 += (d.dist_shan_r)*(d.dist_shan_r);
                }
            }
#pragma omp critical
            {
                media_globale += media_locale;
                media_globale_n2 += media_locale_n2;
                media_rid_globale += media_rid_locale;
                media_rid_globale_n2 += media_rid_locale_n2;
                runs += 1;
            }


        }
    }// </editor-fold>

    // <editor-fold defaultstate="collapsed" desc="Two-dimensional lattices">
    if (opts.topologia == TORO_2D) {
        std::clock_t start = std::clock();
        double time_diff;
        double completed_ratio;
#pragma omp parallel num_threads(opts.threads)
        {
            general_partition *partitions = new general_partition[opts.n_seq];
            distance d(opts.seq_len);
            RandMT generatore;


            for (int L = 0; L < n_estrazioni; L++) {
                // Generate a new random J_ij vector
                // and opts.n_seq sequences that share that J

                double media_locale = 0;
                double media_locale_n2 = 0;
                double media_rid_locale = 0;
                double media_rid_locale_n2 = 0;

                ising_lattice(opts, generatore, partitions);

                //average distance over all pairs of generated sequences
                //#pragma omp parallel for firstprivate(d) schedule(dynamic,10) reduction(+: media_n, media_n2)
                for (int i = 0; i < opts.n_seq; i++) {
                    for (int j = i + 1; j < opts.n_seq; j++) {
                        d(partitions[i], partitions[j]);
                        media_locale += d.dist_shan;
                        media_rid_locale += d.dist_shan_r;
                        media_locale_n2 += (d.dist_shan)*(d.dist_shan);
                        media_rid_locale_n2 += (d.dist_shan_r)*(d.dist_shan_r);
                    }
                }
#pragma omp critical
                {
                    media_globale += media_locale;
                    media_globale_n2 += media_locale_n2;
                    media_rid_globale += media_rid_locale;
                    media_rid_globale_n2 += media_rid_locale_n2;
                    runs += 1;
                }


#ifdef _OPENMP
                int this_thread = omp_get_thread_num();
                if (this_thread)
                    continue;
                double time_ratio = omp_get_num_threads();
#else
                double time_ratio = 1.0;
#endif
                fprintf(stderr, "\r");
                time_diff = (std::clock() - start) / (double) CLOCKS_PER_SEC / time_ratio;
                completed_ratio = (L + 1.0) / n_estrazioni;
                fprintf(stderr, "%.1f%% done, ETA %.0fs    ",
                        completed_ratio * 100, ceil(time_diff * (1 / completed_ratio - 1)));
                fflush(stderr);

            }
        }
        time_diff = (std::clock() - start) / (double) CLOCKS_PER_SEC;
        fprintf(stderr, "\r100%% done in %.1f seconds of CPU time\n", time_diff);


    }// </editor-fold>


    double varianza_n, varianza_r;
    int Nd = runs * (opts.n_seq * (opts.n_seq - 1)) / 2;
    media_globale /= Nd;
    media_globale_n2 /= Nd;
    media_rid_globale /= Nd;
    media_rid_globale_n2 /= Nd;

    varianza_n = media_globale_n2 - media_globale*media_globale;
    varianza_r = media_rid_globale_n2 - media_rid_globale*media_rid_globale;
    int lunghezza;
    if(opts.topologia == TORO_2D)
        lunghezza=opts.lato;
    else
        lunghezza=opts.seq_len;
    printf("%d %f %f %f %f\n", lunghezza, media_globale, varianza_n, media_rid_globale, varianza_r);
    //fprintf(stderr, "%d %f %f\n", opts.seq_len, media_globale, varianza_n);


    return 0;
}
Example #20
/**
 * @brief Compute \f$ k_{eff} \f$ from the total, fission and scattering
 *        reaction rates and leakage.
 * @details This method computes the current approximation to the
 *          multiplication factor on this iteration as follows:
 *          \f$ k_{eff} = \frac{\displaystyle\sum_{i \in I}
 *                        \displaystyle\sum_{g \in G} \nu \Sigma^F_g \Phi V_{i}}
 *                        {\displaystyle\sum_{i \in I}
 *                        \displaystyle\sum_{g \in G} (\Sigma^T_g \Phi V_{i} -
 *                        \Sigma^S_g \Phi V_{i} - L_{i,g})} \f$
 */
void CPUSolver::computeKeff() {

  int tid;
  Material* material;
  FP_PRECISION* sigma;
  FP_PRECISION volume;

  FP_PRECISION total = 0.0;
  FP_PRECISION fission = 0.0;
  FP_PRECISION scatter = 0.0;

  FP_PRECISION* FSR_rates = new FP_PRECISION[_num_FSRs];
  FP_PRECISION* group_rates = new FP_PRECISION[_num_threads * _num_groups];

  /* Loop over all FSRs and compute the volume-weighted total rates */
  #pragma omp parallel for private(tid, volume, \
    material, sigma) schedule(guided)
  for (int r=0; r < _num_FSRs; r++) {

    tid = omp_get_thread_num() * _num_groups;
    volume = _FSR_volumes[r];
    material = _FSR_materials[r];
    sigma = material->getSigmaT();

    for (int e=0; e < _num_groups; e++)
      group_rates[tid+e] = sigma[e] * _scalar_flux(r,e);

    FSR_rates[r]=pairwise_sum<FP_PRECISION>(&group_rates[tid], _num_groups);
    FSR_rates[r] *= volume;
  }

  /* Reduce total rates across FSRs */
  total = pairwise_sum<FP_PRECISION>(FSR_rates, _num_FSRs);

  /* Loop over all FSRs and compute the volume-weighted fission rates */
  #pragma omp parallel for private(tid, volume, \
    material, sigma) schedule(guided)
  for (int r=0; r < _num_FSRs; r++) {

    tid = omp_get_thread_num() * _num_groups;
    volume = _FSR_volumes[r];
    material = _FSR_materials[r];
    sigma = material->getNuSigmaF();

    for (int e=0; e < _num_groups; e++)
      group_rates[tid+e] = sigma[e] * _scalar_flux(r,e);

    FSR_rates[r]=pairwise_sum<FP_PRECISION>(&group_rates[tid], _num_groups);
    FSR_rates[r] *= volume;
  }

  /* Reduce fission rates across FSRs */
  fission = pairwise_sum<FP_PRECISION>(FSR_rates, _num_FSRs);

  /* Loop over all FSRs and compute the volume-weighted scattering rates */
  #pragma omp parallel for private(tid, volume, \
    material) schedule(guided)
  for (int r=0; r < _num_FSRs; r++) {

    tid = omp_get_thread_num() * _num_groups;
    volume = _FSR_volumes[r];
    material = _FSR_materials[r];

    FSR_rates[r] = 0.;

    for (int G=0; G < _num_groups; G++) {
      for (int g=0; g < _num_groups; g++)
        group_rates[tid+g] = material->getSigmaSByGroupInline(g,G)
                             * _scalar_flux(r,g);

      FSR_rates[r]+=pairwise_sum<FP_PRECISION>(&group_rates[tid], _num_groups);
    }

    FSR_rates[r] *= volume;
  }

  /* Reduce scattering rates across FSRs */
  scatter = pairwise_sum<FP_PRECISION>(FSR_rates, _num_FSRs);

  /* Reduce leakage array across Tracks, energy groups, polar angles */
  int size = 2 * _tot_num_tracks * _polar_times_groups;
  _leakage = pairwise_sum<FP_PRECISION>(_boundary_leakage, size) * 0.5;

  _k_eff = fission / (total - scatter + _leakage);

  log_printf(DEBUG, "tot = %f, fiss = %f, scatt = %f, leakage = %f,"
             "k_eff = %f", total, fission, scatter, _leakage, _k_eff);

  delete [] FSR_rates;
  delete [] group_rates;

  return;
}
Example #21
int main() {
    const int nr_threads = 2;
    const int n = N;
    const int nr_runs = 20000000;
    double a[n], sum = 0.0;
    int j;
    omp_set_dynamic(0);
    omp_set_num_threads(nr_threads);

#pragma omp parallel default(none) shared(a)
    {
#pragma omp sections
        {
#pragma omp section
            {
                struct timeval tv1, tv2;
                int i, run_nr;
                int thread_nr = omp_get_thread_num();
                for (i = 0; i < n/2; i += 1)
                    a[i] = 0.0;
                gettimeofday(&tv1, NULL);
                for (run_nr = 0; run_nr < nr_runs; run_nr++)
                    for (i = 0; i < n/2 ;i += 1)
                        a[i] += i;
                gettimeofday(&tv2, NULL);
                printf("thread %d: %.6f\n", thread_nr,
                       1.0e-6*(tv2.tv_usec - tv1.tv_usec) +
                       (tv2.tv_sec - tv1.tv_sec));
            }
#pragma omp section
            {
                struct timeval tv1, tv2;
                int thread_nr = omp_get_thread_num();
                int i, run_nr;
                for (i = n/2; i < n; i += 1)
                    a[i] = 0.0;
                gettimeofday(&tv1, NULL);
                for (run_nr = 0; run_nr < nr_runs; run_nr++)
                    for (i = n/2; i < n ;i += 1)
                        a[i] += i;
                gettimeofday(&tv2, NULL);
                printf("thread %d: %.6f\n", thread_nr,
                       1.0e-6*(tv2.tv_usec - tv1.tv_usec) +
                       (tv2.tv_sec - tv1.tv_sec));
            }
        }
    }
    sum = 0.0;
    for (j = 0; j < n; j++)
        sum += a[j];
    printf("no false sharing: %.1lf\n", sum);

#pragma omp parallel default(none) shared(a)
    {
#pragma omp sections
        {
#pragma omp section
            {
                struct timeval tv1, tv2;
                int i, run_nr;
                int thread_nr = omp_get_thread_num();
                for (i = 0; i < n; i += 2)
                    a[i] = 0.0;
                gettimeofday(&tv1, NULL);
                for (run_nr = 0; run_nr < nr_runs; run_nr++)
                    for (i = 0; i < n ;i += 2)
                        a[i] += i;
                gettimeofday(&tv2, NULL);
                printf("thread %d: %.6f\n", thread_nr,
                       1.0e-6*(tv2.tv_usec - tv1.tv_usec) +
                       (tv2.tv_sec - tv1.tv_sec));
            }
#pragma omp section
            {
                struct timeval tv1, tv2;
                int i, run_nr;
                int thread_nr = omp_get_thread_num();
                for (i = 1; i < n; i += 2)
                    a[i] = 0.0;
                gettimeofday(&tv1, NULL);
                for (run_nr = 0; run_nr < nr_runs; run_nr++)
                    for (i = 1; i < n ;i += 2)
                        a[i] += i;
                gettimeofday(&tv2, NULL);
                printf("thread %d: %.6f\n", thread_nr,
                       1.0e-6*(tv2.tv_usec - tv1.tv_usec) +
                       (tv2.tv_sec - tv1.tv_sec));
            }
        }
    }
    sum = 0.0;
    for (j = 0; j < n; j++)
        sum += a[j];
    printf("false sharing: %.1lf\n", sum);

    return EXIT_SUCCESS;
}
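The second pair of sections above is slow because the two threads keep writing interleaved elements of a, so their stores repeatedly invalidate each other's cache lines. When per-thread accumulators are needed, one common hedge is to align each accumulator to its own cache line; a minimal sketch, assuming the typical (but not guaranteed) 64-byte line size:

#include <cstdio>
#include <omp.h>

// One accumulator per thread, each on its own cache line, so the two
// threads never invalidate each other's line while accumulating.
struct alignas(64) PaddedCounter {
    double value;
};

int main() {
    const int nr_threads = 2;
    PaddedCounter acc[nr_threads] = {};

    omp_set_dynamic(0);
    omp_set_num_threads(nr_threads);

#pragma omp parallel
    {
        const int t = omp_get_thread_num();
        for (int i = 0; i < 10000000; ++i)
            acc[t].value += i;
    }

    double sum = 0.0;
    for (int t = 0; t < nr_threads; ++t)
        sum += acc[t].value;
    std::printf("sum = %.1f\n", sum);
    return 0;
}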
Example #22
/**
 * @brief This method performs one transport sweep of all azimuthal angles,
 *        Tracks, Track segments, polar angles and energy groups.
 * @details The method integrates the flux along each Track and updates the
 *          boundary fluxes for the corresponding output Track, while updating
 *          the scalar flux in each flat source region.
 */
void CPUSolver::transportSweep() {

  int tid;
  int min_track, max_track;
  Track* curr_track;
  int azim_index;
  int num_segments;
  segment* curr_segment;
  segment* segments;
  FP_PRECISION* track_flux;

  log_printf(DEBUG, "Transport sweep with %d OpenMP threads", _num_threads);

  /* Initialize flux in each FSR to zero */
  flattenFSRFluxes(0.0);

  if (_cmfd != NULL && _cmfd->isFluxUpdateOn())
    zeroSurfaceCurrents();

  /* Loop over azimuthal angle halfspaces */
  for (int i=0; i < 2; i++) {

    /* Compute the minimum and maximum Track IDs corresponding to
     * this azimuthal angular halfspace */
    min_track = i * (_tot_num_tracks / 2);
    max_track = (i + 1) * (_tot_num_tracks / 2);

    /* Loop over each thread within this azimuthal angle halfspace */
    #pragma omp parallel for private(curr_track, azim_index, num_segments, \
      curr_segment, segments, track_flux, tid) schedule(guided)
    for (int track_id=min_track; track_id < max_track; track_id++) {

      tid = omp_get_thread_num();

      /* Use local array accumulator to prevent false sharing*/
      FP_PRECISION* thread_fsr_flux;
      thread_fsr_flux = new FP_PRECISION[_num_groups];

      /* Initialize local pointers to important data structures */
      curr_track = _tracks[track_id];
      azim_index = curr_track->getAzimAngleIndex();
      num_segments = curr_track->getNumSegments();
      segments = curr_track->getSegments();
      track_flux = &_boundary_flux(track_id,0,0,0);

      /* Loop over each Track segment in forward direction */
      for (int s=0; s < num_segments; s++) {
        curr_segment = &segments[s];
        scalarFluxTally(curr_segment, azim_index, track_flux,
                        thread_fsr_flux, true);
      }

      /* Transfer boundary angular flux to outgoing Track */
      transferBoundaryFlux(track_id, azim_index, true, track_flux);

      /* Loop over each Track segment in reverse direction */
      track_flux += _polar_times_groups;

      for (int s=num_segments-1; s > -1; s--) {
        curr_segment = &segments[s];
        scalarFluxTally(curr_segment, azim_index, track_flux,
                        thread_fsr_flux, false);
      }
      delete [] thread_fsr_flux;

      /* Transfer boundary angular flux to outgoing Track */
      transferBoundaryFlux(track_id, azim_index, false, track_flux);
    }
  }

  return;
}
Example #23
/**
 * @brief Initializes the SpringApp instance
 * @return whether initialization was successful
 */
bool SpringApp::Initialize()
{
#if !(defined(WIN32) || defined(__APPLE__) || defined(HEADLESS))
	//! this MUST run before any other X11 call (esp. those by SDL!)
	//! we need it to make calls to X11 threadsafe
	if (!XInitThreads()) {
		LOG_L(L_FATAL, "Xlib is not thread safe");
		return false;
	}
#endif

#if defined(_WIN32) && defined(__GNUC__)
	// load QTCreator's gdb helper dll; a variant of this should also work on other OSes
	{
		// don't display a dialog box if gdb helpers aren't found
		UINT olderrors = SetErrorMode(SEM_FAILCRITICALERRORS);
		if (LoadLibrary("gdbmacros.dll")) {
			LOG("QT Creator's gdbmacros.dll loaded");
		}
		SetErrorMode(olderrors);
	}
#endif

	// Initialize class system
	creg::System::InitializeClasses();

	// Initialize crash reporting
	CrashHandler::Install();

	globalRendering = new CGlobalRendering();

	ParseCmdLine();
	CMyMath::Init();
	good_fpu_control_registers("::Run");

	// log OS version
	LOG("OS: %s", Platform::GetOS().c_str());
	if (Platform::Is64Bit())
		LOG("OS: 64bit native mode");
	else if (Platform::Is32BitEmulation())
		LOG("OS: emulated 32bit mode");
	else
		LOG("OS: 32bit native mode");

	// Rename Threads
	// We give the process itself the name `unknown`, htop & co. will still show the binary's name.
	// But all child threads copy by default the name of their parent, so all threads that don't set
	// their name themselves will show up as 'unknown'.
	Threading::SetThreadName("unknown");
#ifdef _OPENMP
	#pragma omp parallel
	{
		int i = omp_get_thread_num();
		if (i != 0) { // 0 is the source thread
			std::ostringstream buf;
			buf << "omp" << i;
			Threading::SetThreadName(buf.str().c_str());
		}
	}
#endif

	// Install Watchdog
	Watchdog::Install();
	Watchdog::RegisterThread(WDT_MAIN, true);

	FileSystemInitializer::Initialize();

	// Create Window
	if (!InitWindow(("Spring " + SpringVersion::GetSync()).c_str())) {
		SDL_Quit();
		return false;
	}

	mouseInput = IMouseInput::GetInstance();
	keyInput = KeyInput::GetInstance();
	input.AddHandler(boost::bind(&SpringApp::MainEventHandler, this, _1));

	// Global structures
	gs = new CGlobalSynced();
	gu = new CGlobalUnsynced();

	// Initialize GLEW
	LoadExtensions();

	//! check if FSAA init worked fine
	if (globalRendering->FSAA && !MultisampleVerify())
		globalRendering->FSAA = 0;

	InitOpenGL();
	agui::InitGui();
	LoadFonts();

	globalRendering->PostInit();

	// Initialize named texture handler
	CNamedTextures::Init();

	// Initialize Lua GL
	LuaOpenGL::Init();

	// Sound & Input
	ISound::Initialize();
	InitJoystick();

	// Multithreading & Affinity
	LOG("CPU Cores: %d", Threading::GetAvailableCores());
	const uint32_t affinity = configHandler->GetUnsigned("SetCoreAffinity");
	const uint32_t cpuMask  = Threading::SetAffinity(affinity);
	if (cpuMask == 0xFFFFFF) {
		LOG("CPU affinity not set");
	}
	else if (cpuMask != affinity) {
		LOG("CPU affinity mask set: %d (config is %d)", cpuMask, affinity);
	}
	else if (cpuMask == 0) {
		LOG_L(L_ERROR, "Failed to set CPU affinity mask <%d>", affinity);
	}
	else {
		LOG("CPU affinity mask set: %d", cpuMask);
	}

	// Create CGameSetup and CPreGame objects
	Startup();

	return true;
}
Example #24
/**
 * @brief Computes the contribution to the FSR scalar flux from a Track segment.
 * @details This method integrates the angular flux for a Track segment across
 *          energy groups and polar angles, and tallies it into the FSR
 *          scalar flux, and updates the Track's angular flux.
 * @param curr_segment a pointer to the Track segment of interest
 * @param azim_index the azimuthal angle index for this segment
 * @param track_flux a pointer to the Track's angular flux
 * @param fsr_flux a pointer to the temporary FSR flux buffer
 * @param fwd whether the segment is traversed in the forward direction
 */
void CPUSolver::scalarFluxTally(segment* curr_segment,
                                int azim_index,
                                FP_PRECISION* track_flux,
                                FP_PRECISION* fsr_flux,
                                bool fwd){

  int tid = omp_get_thread_num();
  int fsr_id = curr_segment->_region_id;
  FP_PRECISION length = curr_segment->_length;
  FP_PRECISION* sigma_t = curr_segment->_material->getSigmaT();

  /* The change in angular flux along this Track segment in the FSR */
  FP_PRECISION delta_psi;
  FP_PRECISION exponential;

  /* Set the FSR scalar flux buffer to zero */
  memset(fsr_flux, 0, _num_groups * sizeof(FP_PRECISION));

  /* Loop over energy groups */
  for (int e=0; e < _num_groups; e++) {

    /* Loop over polar angles */
    for (int p=0; p < _num_polar; p++){
      exponential = computeExponential(sigma_t[e], length, p);
      delta_psi = (track_flux(p,e)-_reduced_sources(fsr_id,e))*exponential;
      fsr_flux[e] += delta_psi * _polar_weights(azim_index,p);
      track_flux(p,e) -= delta_psi;
    }
  }

  if (_cmfd != NULL && _cmfd->isFluxUpdateOn()){
    if (curr_segment->_cmfd_surface_fwd != -1 && fwd){

      int pe = 0;

      /* Atomically increment the Cmfd Mesh surface current from the
       * temporary array using mutual exclusion locks */
      omp_set_lock(&_cmfd_surface_locks[curr_segment->_cmfd_surface_fwd]);

      /* Loop over energy groups */
      for (int e = 0; e < _num_groups; e++) {

        /* Loop over polar angles */
        for (int p = 0; p < _num_polar; p++){

          /* Increment current (polar and azimuthal weighted flux, group) */
          _surface_currents(curr_segment->_cmfd_surface_fwd,e) +=
              track_flux(p,e)*_polar_weights(azim_index,p)/2.0;
          pe++;
        }
      }

      /* Release Cmfd Mesh surface mutual exclusion lock */
      omp_unset_lock(&_cmfd_surface_locks[curr_segment->_cmfd_surface_fwd]);

    }
    else if (curr_segment->_cmfd_surface_bwd != -1 && !fwd){

      int pe = 0;

      /* Atomically increment the Cmfd Mesh surface current from the
       * temporary array using mutual exclusion locks */
      omp_set_lock(&_cmfd_surface_locks[curr_segment->_cmfd_surface_bwd]);

      /* Loop over energy groups */
      for (int e = 0; e < _num_groups; e++) {

        /* Loop over polar angles */
        for (int p = 0; p < _num_polar; p++){

          /* Increment current (polar and azimuthal weighted flux, group) */
          _surface_currents(curr_segment->_cmfd_surface_bwd,e) +=
              track_flux(p,e)*_polar_weights(azim_index,p)/2.0;
          pe++;
        }
      }

      /* Release Cmfd Mesh surface mutual exclusion lock */
      omp_unset_lock(&_cmfd_surface_locks[curr_segment->_cmfd_surface_bwd]);
    }
  }

  /* Atomically increment the FSR scalar flux from the temporary array */
  omp_set_lock(&_FSR_locks[fsr_id]);
  {
    for (int e=0; e < _num_groups; e++)
      _scalar_flux(fsr_id,e) += fsr_flux[e];
  }
  omp_unset_lock(&_FSR_locks[fsr_id]);

  return;
}
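scalarFluxTally() serializes its updates with one omp_lock_t per FSR and per CMFD surface (_FSR_locks, _cmfd_surface_locks); the code that creates those lock arrays lives elsewhere in the solver and is not shown in this excerpt. A hedged, self-contained sketch of the per-element lock-array pattern it relies on (the names and sizes below are illustrative only):

#include <cstdio>
#include <vector>
#include <omp.h>

int main() {
    const int num_regions = 64;
    std::vector<double> tally(num_regions, 0.0);

    // One lock per region: threads only contend when they really
    // touch the same region, not on a single global lock.
    std::vector<omp_lock_t> locks(num_regions);
    for (int r = 0; r < num_regions; ++r)
        omp_init_lock(&locks[r]);

#pragma omp parallel for
    for (int i = 0; i < 100000; ++i) {
        int r = i % num_regions;             // stand-in for the segment's region id
        omp_set_lock(&locks[r]);
        tally[r] += 1.0;                     // stand-in for the scalar flux update
        omp_unset_lock(&locks[r]);
    }

    for (int r = 0; r < num_regions; ++r)
        omp_destroy_lock(&locks[r]);

    std::printf("tally[0] = %g\n", tally[0]);
    return 0;
}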
Example #25
NODE getfromBarrier()
{
    int index = omp_get_thread_num()/2;
    return (*leaf[index]);
}
Example #26
void ParticleListCPUSorted::allocate(PlasmaData* pdata,int nptcls_in)
{
	//printf("Allocating Particle List on the CPU\n");

	if(pdata->plot_flag)
		plot = gnuplot_init();

	//gnuplot_cmd(plot,"set pointsize 0.1");
	// Allocate memory for particles
	nptcls_allocated = nptcls_in;

	nptcls = nptcls_in;

	num_cores = pdata->num_cores;

	// Allocate realkind arrays
	for(int i=0;i<ParticleList_nfloats;i++)
	{
		*get_float(i) = (realkind*)malloc(nptcls_allocated*sizeof(realkind));
	}

	// Allocate int arrays
	for(int i=0;i<ParticleList_nints;i++)
	{
		*get_int(i) = (int*)malloc(nptcls_allocated*sizeof(int));
	}

	buffer = (realkind*)malloc(nptcls_allocated*sizeof(realkind));

	num_subcycles = (int*)malloc(nptcls_allocated*sizeof(int));
	memset(num_subcycles,0,nptcls_allocated*sizeof(int));

	num_piccard = (realkind*)malloc(nptcls_allocated*sizeof(double));
	memset(num_piccard,0,nptcls_allocated*sizeof(double));

	num_piccard2 = (realkind*)malloc(nptcls_allocated*sizeof(double));
	memset(num_piccard2,0,nptcls_allocated*sizeof(double));


	// allocate short ints for cluster id's
	cluster_id = (int*)malloc(nptcls_allocated*sizeof(int));
	ptcl_index = (int*)malloc(nptcls_allocated*sizeof(int));

	piccard_timer = (CPUTimer*)malloc(pdata->num_cores*sizeof(CPUTimer));
	accel_timer = (CPUTimer*)malloc(pdata->num_cores*sizeof(CPUTimer));
	tally_timer = (CPUTimer*)malloc(pdata->num_cores*sizeof(CPUTimer));
	crossing_timer = (CPUTimer*)malloc(pdata->num_cores*sizeof(CPUTimer));
	dtau_est_timer = (CPUTimer*)malloc(pdata->num_cores*sizeof(CPUTimer));
	tally_timer2 = (CPUTimer*)malloc(pdata->num_cores*sizeof(CPUTimer));
	load_store_timer = (CPUTimer*)malloc(pdata->num_cores*sizeof(CPUTimer));

	int tid;
	omp_set_num_threads(pdata->num_cores);
#pragma omp parallel private(tid) default(shared) num_threads(pdata->num_cores)
	{
		tid = omp_get_thread_num();
		piccard_timer[tid] = *(new CPUTimer());
		accel_timer[tid] = *(new CPUTimer());
		tally_timer[tid] = *(new CPUTimer());
		crossing_timer[tid] = *(new CPUTimer());
		dtau_est_timer[tid] = *(new CPUTimer());

		tally_timer2[tid] = *(new CPUTimer());
		load_store_timer[tid] = *(new CPUTimer());
	}

	push_timer = new CPUTimer();


}
Example #27
template <typename PointInT, typename PointOutT> void
pcl::MovingLeastSquares<PointInT, PointOutT>::performProcessing (PointCloudOut &output)
{
  // Compute the number of coefficients
  nr_coeff_ = (order_ + 1) * (order_ + 2) / 2;

#ifdef _OPENMP
  // (Maximum) number of threads
  const unsigned int threads = threads_ == 0 ? 1 : threads_;
  // Create temporaries for each thread in order to avoid synchronization
  typename PointCloudOut::CloudVectorType projected_points (threads);
  typename NormalCloud::CloudVectorType projected_points_normals (threads);
  std::vector<PointIndices> corresponding_input_indices (threads);
#endif

  // For all points
#ifdef _OPENMP
#pragma omp parallel for schedule (dynamic,1000) num_threads (threads)
#endif
  for (int cp = 0; cp < static_cast<int> (indices_->size ()); ++cp)
  {
    // Allocate enough space to hold the results of nearest neighbor searches
    // \note resize is irrelevant for a radiusSearch ().
    std::vector<int> nn_indices;
    std::vector<float> nn_sqr_dists;

    // Get the initial estimates of point positions and their neighborhoods
    if (searchForNeighbors ((*indices_)[cp], nn_indices, nn_sqr_dists))
    {
      // Check the number of nearest neighbors for normal estimation (and later for polynomial fit as well)
      if (nn_indices.size () >= 3)
      {
        // This thread's ID (range 0 to threads-1)
#ifdef _OPENMP
        const int tn = omp_get_thread_num ();
        // Size of projected points before computeMLSPointNormal () adds points
        size_t pp_size = projected_points[tn].size ();
#else
        PointCloudOut projected_points;
        NormalCloud projected_points_normals;
#endif

        // Get a plane approximating the local surface's tangent and project point onto it
        const int index = (*indices_)[cp];

        size_t mls_result_index = 0;
        if (cache_mls_results_)
          mls_result_index = index; // otherwise we give it a dummy location.

#ifdef _OPENMP
        computeMLSPointNormal (index, nn_indices, projected_points[tn], projected_points_normals[tn], corresponding_input_indices[tn], mls_results_[mls_result_index]);

        // Copy all information from the input cloud to the output points (not doing any interpolation)
        for (size_t pp = pp_size; pp < projected_points[tn].size (); ++pp)
          copyMissingFields (input_->points[(*indices_)[cp]], projected_points[tn][pp]);
#else
        computeMLSPointNormal (index, nn_indices, projected_points, projected_points_normals, *corresponding_input_indices_, mls_results_[mls_result_index]);

        // Append projected points to output
        output.insert (output.end (), projected_points.begin (), projected_points.end ());
        if (compute_normals_)
          normals_->insert (normals_->end (), projected_points_normals.begin (), projected_points_normals.end ());
#endif
      }
    }
  }

#ifdef _OPENMP
  // Combine all threads' results into the output vectors
  for (unsigned int tn = 0; tn < threads; ++tn)
  {
    output.insert (output.end (), projected_points[tn].begin (), projected_points[tn].end ());
    corresponding_input_indices_->indices.insert (corresponding_input_indices_->indices.end (),
                                                  corresponding_input_indices[tn].indices.begin (), corresponding_input_indices[tn].indices.end ());
    if (compute_normals_)
      normals_->insert (normals_->end (), projected_points_normals[tn].begin (), projected_points_normals[tn].end ());
  }
#endif

  // Perform the distinct-cloud or voxel-grid upsampling
  performUpsampling (output);
}
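performProcessing() gives every thread its own output cloud, normal cloud and index list, selected by omp_get_thread_num(), and concatenates them serially after the parallel loop, so the shared output never needs a lock. The same idea in minimal form, with plain std::vector buckets instead of point clouds:

#include <cstdio>
#include <vector>
#include <omp.h>

int main() {
    const int num_points = 10000;
    const unsigned int threads = omp_get_max_threads();

    // One result bucket per thread; only the owning thread appends to it.
    std::vector<std::vector<int> > per_thread(threads);

#pragma omp parallel for schedule(dynamic, 1000) num_threads(threads)
    for (int cp = 0; cp < num_points; ++cp) {
        const int tn = omp_get_thread_num();
        if (cp % 2 == 0)                      // stand-in for "neighbours found"
            per_thread[tn].push_back(cp);
    }

    // Serial merge into the final output, as performProcessing() does
    // after its parallel loop.
    std::vector<int> output;
    for (unsigned int tn = 0; tn < threads; ++tn)
        output.insert(output.end(), per_thread[tn].begin(), per_thread[tn].end());

    std::printf("kept %zu of %d points\n", output.size(), num_points);
    return 0;
}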
Example #28
long long int ParticleListCPUSorted::pushT(PlasmaData* pdata, FieldData* fields, HOMoments* moments)
{

	int tid;
	int nthreads = pdata->num_cores;
	int stride = (nptcls+nthreads-1)/nthreads;

	long long int nSubSteps_proc[nthreads];

	omp_set_num_threads(nthreads);

//	for(int i=0;i<pdata->nx;i++)
//	{
//		realkind temp;
//		temp = fields->intrpE(0.5,0,0,i,0,0,0,FieldData_deriv_f);
//		printf("fields[%i] on cpu = %f\n",i,temp);
//	}

	//printf("particles ")

	//printf("nthreads = %i with vector length = %i\n",nthreads,VEC_LENGTH);


	// Start the parallel loop
#pragma omp parallel private(tid,nthreads,stride) default(shared) num_threads(nthreads)
	{
		nthreads = omp_get_num_threads();
		//printf("nthreads = %i with vector length = %i\n",nthreads,VEC_LENGTH);
		//nthreads = 1;
		stride = (nptcls+nthreads-1)/nthreads;

		tid = omp_get_thread_num();
		//tid = 0;

//	    auto cpu = sched_getcpu();
//	    std::ostringstream os;
//	        os<<"\nThread "<<omp_get_thread_num()<<" on cpu "<<sched_getcpu()<<std::endl;
//	        std::cout<<os.str()<<std::flush;

		PlasmaData pdata_local = *pdata;

		// Each thread gets a separate copy of the accumulation arrays
		HOMoments* my_moment = moments+tid;

		// Initialize the moment values
		//printf("Initializing moment values\n");
		my_moment->set_vals(0);

		int nSubcycle_max = pdata->nSubcycle_max;

		int ptcl_start,ptcl_end;
		int nptcls_process;
		int nptcls_left;
		int ishrink = 0;
		int nptcl_replacements = 0;

		int nptcl_done;
		//int iptcl_max;
		int iptcl_new_v[VEC_LENGTH];
		int iptcl_v[VEC_LENGTH];
		int iter_array_v[VEC_LENGTH];

		int* iptcl_new = iptcl_new_v;
		int* iptcl = iptcl_v;
		int* iter_array = iter_array_v;

		long long int nSubSteps_done = 0;

		ptcl_start = stride*tid;
		ptcl_end = fmin(stride*(tid+1)-1,nptcls-1);

		nptcls_process = ptcl_end-ptcl_start+1;

		//printf("Thread %i starting at %i to %i with %i ptcls\n",
			//	tid,ptcl_start,ptcl_end,nptcls_process);


		ParticleObjNT<VEC_LENGTH,nSpatial,nVel,iEM> particle(iptcl);

		// Populate the timers
		particle.piccard_timer = piccard_timer+tid;
		particle.accel_timer = accel_timer+tid;
		particle.tally_timer = tally_timer+tid;
		particle.crossing_timer = crossing_timer+tid;
		particle.dtau_est_timer = dtau_est_timer+tid;

//		ParticleObjN<VEC_LENGTH> particle(iptcl);

		typevecN<int,VEC_LENGTH> iter;


		iter = 0;
		for(int i=0;i<VEC_LENGTH;i++)
			iter_array[i] = 0;

		CurrentTally currents(&my_moment->get_val(0,0,0,ispecies,HOMoments_currentx),
							  &my_moment->get_val(0,0,0,ispecies,HOMoments_currenty),
							  &my_moment->get_val(0,0,0,ispecies,HOMoments_currentz),
							  make_int3(moments->pdata->nx,moments->pdata->ny,moments->pdata->nz),
							  moments->pdata->dxdi,moments->pdata->dydi,moments->pdata->dzdi,
							  moments->pdata->ndimensions);

		ChargeTally charge(&my_moment->get_val(0,0,0,ispecies,HOMoments_charge),
							  make_int3(moments->pdata->nx,moments->pdata->ny,moments->pdata->nz),
							  moments->pdata->dxdi,moments->pdata->dydi,moments->pdata->dzdi,
							  moments->pdata->ndimensions);

		StressTally stress(&my_moment->get_val(0,0,0,ispecies,HOMoments_S2xx),
							  make_int3(moments->pdata->nx,moments->pdata->ny,moments->pdata->nz),
							  moments->pdata->dxdi,moments->pdata->dydi,moments->pdata->dzdi,
							  moments->pdata->ndimensions);

		for(int i=0;i<VEC_LENGTH;i++)
			iptcl[i] = ptcl_start+i;

		nptcl_done = 0;


		load_store_timer[tid].start();
		particle = *this;




		//for(int i=0;i<VEC_LENGTH;i++)
		//	particle.dt_finished(i) = 0;

		// Each thread loops over its own particles
		// In order to avoid SIMD divergence we loop until
		// all particles in the threads work que have been
		// pushed. Anytime a particle finishes a subcycle
		// it is written back to the main list and a new particle
		// takes its slot
		while(nptcl_done < nptcls_process)
		{
			nptcls_left = nptcls_process-nptcl_done;

			//printf("nptcls_left = %i, ntpcl_done = %i\n",nptcls_left,nptcl_done);

			if((nptcls_left <= VEC_LENGTH)&&(VEC_LENGTH > 1))
			{
				if(ishrink == 0)
				{
					for(int j=0;j<VEC_LENGTH;j++)
					{
						//printf("iptcl[%i] = %i\n",j,iptcl[0][j]);
						particle.write_back(*this,j);
					}

					int k = 0;
					for(int l=0;l<VEC_LENGTH;l++)
					{

						bool idone = 0;

						//printf("iter2(%i) = %f\n",j,particles2.dt_finished(j));
						if(particle.dt_finished(l) >= pdata->dt)
						{
							idone = 1;
						}
						else if(iter(l) >= pdata->nSubcycle_max)
						{
							idone = 1;
//							printf("warning particle finished before time step was finished dt_left[%i] = %e\n",iptcl[l],pdata->dt-particle.dt_finished(l));
						}
						else if(iptcl[l] > ptcl_end)
							idone = 1;
						else
							idone = 0;


						if(idone)
						{
							nSubSteps_done += iter(l);
							num_subcycles[iptcl[l]] += iter(l);
							iter(l) = 0;

							// Accumulate Charge and S2 moment

						}
						else
						{
							iptcl[k] = iptcl[l];
							iter_array[k] = iter(l);


							k++;
						}
					}

					nptcl_done = nptcls_process - k ;
					nptcls_left = k;

					ishrink = 1;
				}

// Hack to compile all versions of ParticleObjN template
				shrink_pushT<VEC_LENGTH,nSpatial,nVel,iEM>(pdata,fields,&currents,this,
									&iter_array,&iptcl,&iptcl_new,
									nptcls_left,nptcl_done,nptcls_process,nSubSteps_done);
//				shrink_push<VEC_LENGTH>(pdata,fields,&currents,this,
//									&iter_array,&iptcl,&iptcl_new,
//									nptcls_left,nptcl_done,nptcls_process,nSubSteps_done);


			}
			else
			{
//				for(int j=0;j<VEC_LENGTH;j++)
//					printf("particle %i done = %f, %f, %f, %i, %i, %i, %f, %f, %f\n",
//							iptcl[j],particle.px(j),particle.py(j),particle.pz(j),
//							particle.ix(j),particle.iy(j),particle.iz(j),
//							particle.vx(j),particle.vy(j),particle.vz(j));

				// Here the particle vector is the same width as the
				// system vector, so it does not change from step to step.

				particle.push(pdata,fields,&currents,iter,nSubcycle_max);



				// Replace the particle (or particles) that
				// have finished their subcycle steps
				//int k = 0;
				for(int j=0;j<VEC_LENGTH;j++)
				{
					bool idone = 0;

					if(particle.dt_finished(j) >= pdata->dt)
					{
						idone = 1;
					}
					else if(iter(j) >= pdata->nSubcycle_max)
					{
						idone = 1;

//						printf("warning particle finished before time step was finished dt_left[%i] = %e\n",iptcl[j],pdata->dt-particle.dt_finished(j));
					}

					if(idone)
					{
						// Accumulate Charge and S2 moment

//						printf("particle %i done = %f, %f, %f, %i, %i, %i, %f, %f, %f\n",
//								iptcl[j],particle.px(j),particle.py(j),particle.pz(j),
//								particle.ix(j),particle.iy(j),particle.iz(j),
//								particle.vx(j),particle.vy(j),particle.vz(j));

						// Write results, and get a new particle from the list
						particle.write_back(*this,j);

						num_subcycles[iptcl[j]] += iter(j);

						iptcl[j] = ptcl_start + nptcl_done + VEC_LENGTH;
						nptcl_done++;

						if(nptcls_process-nptcl_done > 0)
						{
							particle.copy_in(*this,j);
						}

						nSubSteps_done += iter(j);

						iter(j) = 0;
						particle.dt_finished(j) = 0.0f;


					}
				} /* for(int j=0;j<nptcls_left;j++) */
				//printf("nptcls_left = %i, ntpcl_done = %i\n",nptcls_left,nptcl_done);

			} /* else */

			nptcl_replacements++;

		} /* while(nptcl_done < nptcls_process) */

		load_store_timer[tid].stop();

		tally_timer2[tid].start();
		// accumulate charge and s2 moment
		for(int i=ptcl_start;i<=ptcl_end;i++)
		{
			charge.tally(px[i],py[i],pz[i],
					ix[i],iy[i],iz[i],
					1.0);

			stress.tally1d1v(px[i],
					vx[i],
					ix[i],
					1.0f);


			//if(fabs(dt_finished[i] - pdata->dt) > 1.0e-5)
			//	printf("particle %i dt_finished = %e\n",i,dt_finished[i]);

			dt_finished[i] = 0.0f;

		}
		tally_timer2[tid].stop();

		//nSubSteps_proc[0] = nSubSteps_done;

		nSubSteps_proc[tid] = nSubSteps_done;

//		printf("average particles processed per replacement: %f\n",nptcls_process/((double)nptcl_replacements));



	} /* pragma omp parallel */

	for(int i=1;i<nthreads;i++)
		nSubSteps_proc[0] += nSubSteps_proc[i];

	//printf("nsteps avg = %i\n",nSubSteps_proc[0]);

	return nSubSteps_proc[0];


}
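
The comment at the top of the subcycle loop above describes the key idea: finished SIMD lanes are written back and immediately refilled, so the vector working set stays full until only the tail remains. The following stand-alone sketch models only that refill pattern; Lane, VEC_LEN, and push_one_step are hypothetical stand-ins, not the ParticleObjN code above.

// Minimal sketch of the lane-refill pattern described above. The names
// Lane, VEC_LEN, and push_one_step are hypothetical stand-ins.
#include <cstdio>
#include <cstdlib>

const int VEC_LEN = 4;

struct Lane {
    int id;         // which particle currently occupies this lane (-1 = parked)
    double t_done;  // time integrated so far for that particle
};

// Advance one lane by one substep of pseudo-random length.
static void push_one_step(Lane &l) {
    l.t_done += 0.1 + 0.3 * (std::rand() / (double)RAND_MAX);
}

int main() {
    const int nptcls = 20;   // particles assigned to this "thread"
    const double dt = 1.0;   // full time step each particle must cover

    Lane lanes[VEC_LEN];
    int next = 0;            // next particle waiting for a lane
    int done = 0;            // particles that have completed dt

    for (int j = 0; j < VEC_LEN; j++)
        lanes[j] = {next++, 0.0};

    while (done < nptcls) {
        // "Vector" push: all lanes advance together, so there is no divergence.
        for (int j = 0; j < VEC_LEN; j++)
            if (lanes[j].id >= 0)
                push_one_step(lanes[j]);

        // Write back finished lanes and refill them immediately.
        for (int j = 0; j < VEC_LEN; j++) {
            if (lanes[j].id >= 0 && lanes[j].t_done >= dt) {
                std::printf("particle %d finished\n", lanes[j].id);
                done++;
                if (next < nptcls)
                    lanes[j] = {next++, 0.0};   // new particle takes the slot
                else
                    lanes[j].id = -1;           // nothing left: park the lane
            }
        }
    }
    return 0;
}

The tail case, where fewer than VEC_LEN particles remain, is handled here by parking empty lanes; the code above instead compacts the iptcl/iter_array indices and dispatches a smaller vector width through shrink_pushT.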
Example #29
LIS_INT lis_vector_nrm2(LIS_VECTOR vx, LIS_REAL *value)
{
	LIS_INT i,n;
	LIS_SCALAR dot;
	LIS_SCALAR *x;
	LIS_SCALAR tmp;
	#ifdef _OPENMP
		LIS_INT nprocs,my_rank;
	#endif
	#ifdef USE_MPI
		MPI_Comm comm;
	#endif

	LIS_DEBUG_FUNC_IN;

	n      = vx->n;

	x      = vx->value;
	#ifdef USE_MPI
		comm   = vx->comm;
	#endif
	#ifdef _OPENMP
		nprocs = omp_get_max_threads();
		#pragma omp parallel private(i,tmp,my_rank)
		{
			my_rank = omp_get_thread_num();
			tmp     = 0.0;
			#ifdef USE_VEC_COMP
		    #pragma cdir nodep
			#endif
			#pragma omp for
			for(i=0; i<n; i++)
			{
				tmp += x[i]*x[i];
			}
			lis_vec_tmp[my_rank*LIS_VEC_TMP_PADD] = tmp;
		}
		dot = 0.0;
		for(i=0;i<nprocs;i++)
		{
			dot += lis_vec_tmp[i*LIS_VEC_TMP_PADD];
		}
	#else
		dot  = 0.0;
		#ifdef USE_VEC_COMP
	    #pragma cdir nodep
		#endif
		for(i=0; i<n; i++)
		{
			dot += x[i]*x[i];
		}
	#endif
	#ifdef USE_MPI
		MPI_Allreduce(&dot,&tmp,1,MPI_DOUBLE,MPI_SUM,comm);
		*value = sqrt(tmp);
	#else
		*value = sqrt(dot);
	#endif

	LIS_DEBUG_FUNC_OUT;
	return LIS_SUCCESS;
}
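
In the OpenMP branch above, each thread writes its partial sum to lis_vec_tmp[my_rank*LIS_VEC_TMP_PADD]; the padding stride presumably keeps each slot on its own cache line to avoid false sharing, and the serial loop after the parallel region then adds the slots together. The same sum of squares can be expressed more compactly with an OpenMP reduction clause. The sketch below is a generic, self-contained version using plain doubles; it does not use the LIS types or the MPI branch.

// Minimal sketch: 2-norm of an array via an OpenMP reduction clause,
// equivalent in effect to the padded per-thread partial sums used above.
#include <cmath>
#include <cstdio>
#include <vector>

static double nrm2(const std::vector<double>& x)
{
    double dot = 0.0;
    #pragma omp parallel for reduction(+:dot)
    for (long i = 0; i < (long)x.size(); i++)
        dot += x[i] * x[i];
    return std::sqrt(dot);
}

int main()
{
    std::vector<double> x(1000, 1.0);       // ||x|| should be sqrt(1000)
    std::printf("nrm2 = %f\n", nrm2(x));
    return 0;
}

The reduction form is shorter; the explicit per-thread array used above makes the padding against false sharing visible and keeps the partial sums available after the loop.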
Example #30
int main(int argc, char const *argv[])
{
	char* s;
	std::srand(std::time(0)); //use current time as seed for random generator
	int r = rand() % 1000;
	for(int i = 0; i < r; i++)
	{
		rand();
	}
	if(argc < 3)
	{
		if(argc < 2)
		{
			fprintf(stderr, "usage: %s forestSize [iterations]\n", argv[0]);
			return 1;
		}
		int forestSize = strtol(argv[1], &s, 10);
		for(int i = 0 ; i < forestSize ; i++)
		{
			printf("%lf\n",fRand(1,std::sqrt(10)));
		}
		return 1;
	}
	
	
	int forestSize = strtol(argv[1], &s, 10);
	int iterations = strtol(argv[2], &s, 10);

	double SIDE = std::sqrt(forestSize);
	SIDE = fRand(std::sqrt(SIDE),std::sqrt(2)*SIDE);
	double R = 1;

	double begin, end;

	std::vector<int> empty;

	std::vector<Tree*> Forest;
	std::vector< std::vector<int> > neighbors(forestSize,empty);
	std::vector< std::vector<double> > metrics(iterations,std::vector<double>(forestSize,0.0));

	//Parallel variables
	int num_threads;
	std::vector<int> order;

	begin = omp_get_wtime();
	#pragma omp parallel shared(Forest,neighbors,metrics,forestSize,iterations,order)
	{
		
		#pragma omp master
		{
			// INIT: the master thread builds the forest, tree positions, and
			// neighbour lists while the other threads wait at the barrier below
			num_threads = omp_get_num_threads();
			std::vector<Point> positions;
			std::cout << "Running " << forestSize << " trees for " << iterations << " iterations on " << num_threads << " processors" << std::endl;
			printf("SIDE = %lf, R = %lf\n",SIDE,R);
			
			for(int i = 0; i < forestSize; i++)
			{
				// double x = std::fabs((SIDE-1)*std::sin(i));
				// double y = std::fabs(SIDE*std::cos(i*i));
				double x = fRand(0,SIDE);
				double y = fRand(0,SIDE);
				Point p = {x,y};
				Tree *T = new MonopodialTree();
				Forest.push_back(T);
				positions.push_back(p);
				for(int j = 0 ; j < i ; j++)
				{
					Point q = positions[j];
					if(pointDistance(p,q) < R)
					{
						neighbors[j].push_back(i);
						neighbors[i].push_back(j);
					}
				}
			}

			order = get_order(neighbors);
			for(int i = 0; i < order.size(); i++)
				std::cout << order[i] << " ";
			std::cout << std::endl;
		}

		#pragma omp barrier

		int thread_num = omp_get_thread_num();
		// ITERATE
		

		int N = forestSize;
		int T = iterations;
		int P = omp_get_num_threads();
		int x = thread_num;
		int y = 0;

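		// Cyclic work distribution: this thread handles trees order[x] for
		// x = thread_num, thread_num+P, ...; when x wraps past N it moves on
		// to the next iteration y.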
		while( x+N*y < N*T)
		{
			int i = order[x];
			// printf("%d (%d, %d)\n",thread_num,y,i );

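			// Lookahead wait: spin until tree i (and, below, all of its
			// neighbours) has reached iteration y. These are plain reads of a
			// shared field with no atomic/flush, so they rely on the compiler
			// re-loading Forest[...]->iteration; see the sketch after this
			// example for a synchronized variant.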
			while(Forest[i]->iteration < y);
			bool ready = false;
			while(!ready)
			{
				ready = true;
				for(int k = 0; k < neighbors[i].size() ; k++)
				{
					if( Forest[ neighbors[i][k] ]->iteration < y)
					{
						ready = false;
						break;
					}
				}
			}

			if(y > 0)
			{
				Forest[i]->updateMetric(metrics[y],neighbors[i]);
			}
			Forest[i]->next();
			double metric = Forest[i]->calculateMetric();
			#pragma omp critical(metrics)
			{
				metrics[y][i] = metric;
			}

			x+=P;
			if(x >= N)
			{
				x -= N;
				y++;
			}
			
			
		}
		

	}

	end = omp_get_wtime();
	
	print_forest(Forest, neighbors, metrics[iterations-1]);

	std::vector< std::vector<int> > connected_components = get_connected_components(neighbors);
	print_connected_components( connected_components);

	char buffer[80];

	FILE *f = fopen("Results_lookahead.txt", "a");
	if(f != NULL)
	{
	    fprintf(f, "%s\n", gettime(buffer));
	    fprintf(f,"%d threads\n",num_threads);
	    fprintf(f,"%d trees\n",forestSize);
	    fprintf(f,"%d iterations\n",iterations);
	    for(int i = 0; i < connected_components.size(); i++)
	    {
	    	fprintf(f, "%d ", connected_components[i].size());
	    }
	    fprintf(f, "\n");
	    fprintf(f,"Time : %f seconds\n", end-begin);
	    fprintf(f,"\n=====================\n");
	    fclose(f);
	}

	for(int i = 0; i < Forest.size() ; i++)
	{
		delete Forest[i];
	}

	return 0;
}
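
The two wait loops inside the parallel region above spin on Forest[...]->iteration with plain reads and no atomic, flush, or critical section, so their correctness depends on the compiler actually re-loading the field each pass. The sketch below is a stand-alone illustration of the same lookahead idea using OpenMP atomics; progress, NSTEPS, and the two-thread layout are hypothetical stand-ins, not a patch for the Tree class.

// Stand-alone sketch of a lookahead wait using OpenMP atomics. The names
// progress and NSTEPS are hypothetical; this is not a patch for Tree/Forest.
// (A seq_cst qualifier or explicit flush can be added for stricter ordering.)
#include <cstdio>
#include <omp.h>

int main()
{
    const int NSTEPS = 5;
    int progress[2] = {0, 0};       // per-thread "iteration finished" counters

    #pragma omp parallel num_threads(2) shared(progress)
    {
        int nthr  = omp_get_num_threads();
        int me    = omp_get_thread_num();
        int other = 1 - me;

        for (int step = 1; step <= NSTEPS; step++) {
            if (nthr == 2) {
                // Spin until the neighbour has published the previous step.
                int seen;
                do {
                    #pragma omp atomic read
                    seen = progress[other];
                } while (seen < step - 1);
            }

            // ... this step's work would go here ...

            // Publish our own progress so the neighbour's wait can proceed.
            #pragma omp atomic write
            progress[me] = step;
        }
    }
    std::printf("progress: %d %d\n", progress[0], progress[1]);
    return 0;
}

Reading the counter with atomic read and publishing it with atomic write keeps the wait well-defined even at high optimization levels; a Tree-style class could expose its iteration counter through the same kind of accessors.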