Пример #1
/* Points func_timer at func_randomc_helper.
   NOTE(review): the function's braces are not present in this listing. */
void func_randomc(void)
	func_timer = &func_randomc_helper;
int main(int argc, char *argv[])
    int num_threads;
    int vec_size;
    int custom_reduce_method;
    double accu;
    double time;
    double *vec;

    if (argc < 4) {
        printf("Usage: %s num_threads vec_size [reduce method]\n", argv[0]);
        printf(" - num_threads: number of threads to use for simulation, "
                "should be >=1\n");
        printf(" - vec_size: size of the vector to do reduction on 10^n"
                "should be >=10\n");
        printf(" - [reduce_method]: custom | sequential.");

        return EXIT_FAILURE;

    num_threads = atoi(argv[1]);
    vec_size = pow(10, atoi(argv[2]));
    if (num_threads < 1) {
        printf("argument error: num_threads should be >=1.\n");
        return EXIT_FAILURE;

    if (vec_size < 4) {
        printf("argument error: vec_size should be >=4.\n");
        return EXIT_FAILURE;

    if (strcmp(argv[3], "sequential") == 0) {
        custom_reduce_method = 0;
    } else {
        custom_reduce_method = 1;

    vec = (double*)malloc(vec_size * sizeof(double));

    /* Fill a vector with a sinus values */
    #pragma omp parallel for
    for (int i = 0; i < vec_size; i++) {
        vec[i] = sin(i);


    /* Start timing the effeciency of openMP */

    /* Calculate a reduced vector with openMP */
    if (custom_reduce_method) {
        accu = reduce2(fun, vec, vec_size, 0);
        /* accu = sum(vec, vec_size); */
    } else {
        accu = reduce(fun, vec, vec_size, 0);
    /* Stop timing */

    time = timer_end();
    printf("%d, %d, %g\n",num_threads, vec_size, time);

    vec = NULL;

    return EXIT_SUCCESS;
Пример #3
    DSYEVD computes all eigenvalues and, optionally, eigenvectors of
    a real symmetric matrix A.  If eigenvectors are desired, it uses a
    divide and conquer algorithm.

    The divide and conquer algorithm makes very mild assumptions about
    floating point arithmetic. It will work on machines with a guard
    digit in add/subtract, or on those binary machines without guard
    digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
    Cray-2. It could conceivably fail on hexadecimal or decimal machines
    without guard digits, but we know of none.

    jobz    magma_vec_t
      -     = MagmaNoVec:  Compute eigenvalues only;
      -     = MagmaVec:    Compute eigenvalues and eigenvectors.

    uplo    magma_uplo_t
      -     = MagmaUpper:  Upper triangle of A is stored;
      -     = MagmaLower:  Lower triangle of A is stored.

    n       INTEGER
            The order of the matrix A.  N >= 0.

    A       DOUBLE_PRECISION array, dimension (LDA, N)
            On entry, the symmetric matrix A.  If UPLO = MagmaUpper, the
            leading N-by-N upper triangular part of A contains the
            upper triangular part of the matrix A.  If UPLO = MagmaLower,
            the leading N-by-N lower triangular part of A contains
            the lower triangular part of the matrix A.
            On exit, if JOBZ = MagmaVec, then if INFO = 0, A contains the
            orthonormal eigenvectors of the matrix A.
            If JOBZ = MagmaNoVec, then on exit the lower triangle (if UPLO=MagmaLower)
            or the upper triangle (if UPLO=MagmaUpper) of A, including the
            diagonal, is destroyed.

    lda     INTEGER
            The leading dimension of the array A.  LDA >= max(1,N).

    w       DOUBLE PRECISION array, dimension (N)
            If INFO = 0, the eigenvalues in ascending order.

    work    (workspace) DOUBLE_PRECISION array, dimension (MAX(1,LWORK))
            On exit, if INFO = 0, WORK[0] returns the optimal LWORK.

    lwork   INTEGER
            The length of the array WORK.
            If N <= 1,                      LWORK >= 1.
            If JOBZ = MagmaNoVec and N > 1, LWORK >= 2*N + N*NB.
            If JOBZ = MagmaVec   and N > 1, LWORK >= max( 2*N + N*NB, 1 + 6*N + 2*N**2 ).
            NB can be obtained through magma_get_dsytrd_nb(N).
            If LWORK = -1, then a workspace query is assumed; the routine
            only calculates the optimal sizes of the WORK and IWORK
            arrays, returns these values as the first entries of the WORK
            and IWORK arrays, and no error message related to LWORK or
            LIWORK is issued by XERBLA.

    iwork   (workspace) INTEGER array, dimension (MAX(1,LIWORK))
            On exit, if INFO = 0, IWORK[0] returns the optimal LIWORK.

    liwork  INTEGER
            The dimension of the array IWORK.
            If N <= 1,                      LIWORK >= 1.
            If JOBZ = MagmaNoVec and N > 1, LIWORK >= 1.
            If JOBZ = MagmaVec   and N > 1, LIWORK >= 3 + 5*N.
            If LIWORK = -1, then a workspace query is assumed; the
            routine only calculates the optimal sizes of the WORK and
            IWORK arrays, returns these values as the first entries of
            the WORK and IWORK arrays, and no error message related to
            LWORK or LIWORK is issued by XERBLA.

    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value
      -     > 0:  if INFO = i and JOBZ = MagmaNoVec, then the algorithm failed
                  to converge; i off-diagonal elements of an intermediate
                  tridiagonal form did not converge to zero;
                  if INFO = i and JOBZ = MagmaVec, then the algorithm failed
                  to compute an eigenvalue while working on the submatrix
                  lying in rows and columns INFO/(N+1) through
                  mod(INFO,N+1).

    Further Details
    Based on contributions by
       Jeff Rutter, Computer Science Division, University of California
       at Berkeley, USA

    Modified description of INFO. Sven, 16 Feb 05.

    @ingroup magma_dsyev_driver
/* Symmetric eigenvalue driver: validates the arguments, publishes the
   minimal workspace sizes in work[0] / iwork[0], hands small problems
   (n <= 128) to LAPACK, and otherwise runs dsytrd followed by dsterf
   (eigenvalues only) or dstedx + dormtr (eigenvectors).
   NOTE(review): block braces and #endif lines are not present in this
   listing; the comments below describe the statements as they appear. */
extern "C" magma_int_t
magma_dsyevd(magma_vec_t jobz, magma_uplo_t uplo,
             magma_int_t n,
             double *A, magma_int_t lda,
             double *w,
             double *work, magma_int_t lwork,
             magma_int_t *iwork, magma_int_t liwork,
             magma_int_t *info)
    const char* uplo_ = lapack_uplo_const( uplo );
    const char* jobz_ = lapack_vec_const( jobz );
    magma_int_t ione = 1;
    magma_int_t izero = 0;
    double d_one = 1.;

    double d__1;

    double eps;
    magma_int_t inde;
    double anrm;
    double rmin, rmax;
    double sigma;
    magma_int_t iinfo, lwmin;
    magma_int_t lower;
    magma_int_t wantz;
    magma_int_t indwk2, llwrk2;
    magma_int_t iscale;
    double safmin;
    double bignum;
    magma_int_t indtau;
    magma_int_t indwrk, liwmin;
    magma_int_t llwork;
    double smlnum;
    magma_int_t lquery;

    double* dwork;

    wantz = (jobz == MagmaVec);
    lower = (uplo == MagmaLower);
    /* Either size being -1 turns the call into a workspace query. */
    lquery = (lwork == -1 || liwork == -1);

    *info = 0;

    /* Argument checks; the negative code is the 1-based position of the
       offending argument (jobz=1, uplo=2, n=3, lda=5). */
    if (! (wantz || (jobz == MagmaNoVec))) {
        *info = -1;
    } else if (! (lower || (uplo == MagmaUpper))) {
        *info = -2;
    } else if (n < 0) {
        *info = -3;
    } else if (lda < max(1,n)) {
        *info = -5;

    /* Minimal workspace sizes for the three cases: trivial n, with
       eigenvectors, eigenvalues only. */
    magma_int_t nb = magma_get_dsytrd_nb( n );
    if ( n <= 1 ) {
        lwmin  = 1;
        liwmin = 1;
    else if ( wantz ) {
        lwmin  = max( 2*n + n*nb, 1 + 6*n + 2*n*n );
        liwmin = 3 + 5*n;
    else {
        lwmin  = 2*n + n*nb;
        liwmin = 1;
    // multiply by 1+eps (in Double!) to ensure length gets rounded up,
    // if it cannot be exactly represented in floating point.
    real_Double_t one_eps = 1. + lapackf77_dlamch("Epsilon");
    /* The sizes are written before the lwork/liwork checks below, so they
       are reported on a query and on a too-small workspace alike. */
    work[0]  = lwmin * one_eps;
    iwork[0] = liwmin;

    /* lwork is argument 8 and liwork is argument 10. */
    if ((lwork < lwmin) && !lquery) {
        *info = -8;
    } else if ((liwork < liwmin) && ! lquery) {
        *info = -10;

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    else if (lquery) {
        return *info;

    /* Quick return if possible */
    if (n == 0) {
        return *info;

    /* 1x1 matrix: the eigenvalue is the single entry and the eigenvector is 1. */
    if (n == 1) {
        w[0] = A[0];
        if (wantz) {
            A[0] = 1.;
        return *info;
    /* Check if matrix is very small then just call LAPACK on CPU, no need for GPU */
    if (n <= 128) {
        /* NOTE(review): no #endif for this #ifdef is visible in the listing. */
        #ifdef ENABLE_DEBUG
        printf("  warning matrix too small N=%d NB=%d, calling lapack on CPU  \n", (int) n, (int) nb);
        lapackf77_dsyevd(jobz_, uplo_,
                         &n, A, &lda,
                         w, work, &lwork,
                         iwork, &liwork, info);
        return *info;

    /* Get machine constants. */
    safmin = lapackf77_dlamch("Safe minimum");
    eps    = lapackf77_dlamch("Precision");
    smlnum = safmin / eps;
    bignum = 1. / smlnum;
    rmin = magma_dsqrt(smlnum);
    rmax = magma_dsqrt(bignum);

    /* Scale matrix to allowable range, if necessary. */
    anrm = lapackf77_dlansy("M", uplo_, &n, A, &lda, work );
    iscale = 0;
    /* sigma is only assigned when iscale is set to 1. */
    if (anrm > 0. && anrm < rmin) {
        iscale = 1;
        sigma = rmin / anrm;
    } else if (anrm > rmax) {
        iscale = 1;
        sigma = rmax / anrm;
    if (iscale == 1) {
        lapackf77_dlascl(uplo_, &izero, &izero, &d_one, &sigma, &n, &n, A,
                &lda, info);

    /* Call DSYTRD to reduce symmetric matrix to tridiagonal form. */
    // dsytrd work: e (n) + tau (n) + llwork (n*nb)  ==>  2n + n*nb
    // dstedx work: e (n) + tau (n) + z (n*n) + llwrk2 (1 + 4*n + n^2)  ==>  1 + 6n + 2n^2
    /* Offsets into work: e at inde, tau at indtau, then an n*n region at
       indwrk, then the remaining scratch at indwk2. */
    inde   = 0;
    indtau = inde   + n;
    indwrk = indtau + n;
    indwk2 = indwrk + n*n;
    llwork = lwork - indwrk;
    llwrk2 = lwork - indwk2;

    magma_timer_t time=0;
    timer_start( time );

    /* NOTE(review): iinfo returned here (and by magma_dormtr below) is not
       inspected anywhere in the visible code. */
    magma_dsytrd(uplo, n, A, lda, w, &work[inde],
                 &work[indtau], &work[indwrk], llwork, &iinfo);

    timer_stop( time );
    timer_printf( "time dsytrd = %6.2f\n", time );

    /* For eigenvalues only, call DSTERF.  For eigenvectors, first call
       DSTEDC to generate the eigenvector matrix, WORK(INDWRK), of the
       tridiagonal matrix, then call DORMTR to multiply it to the Householder
       transformations represented as Householder vectors in A. */
    if (! wantz) {
        lapackf77_dsterf(&n, w, &work[inde], info);
    else {
        timer_start( time );

        /* Temporary device buffer handed to magma_dstedx; released right
           after that call. */
        if (MAGMA_SUCCESS != magma_dmalloc( &dwork, 3*n*(n/2 + 1) )) {
            *info = MAGMA_ERR_DEVICE_ALLOC;
            return *info;

        // TTT Possible bug for n < 128
        magma_dstedx(MagmaRangeAll, n, 0., 0., 0, 0, w, &work[inde],
                     &work[indwrk], n, &work[indwk2],
                     llwrk2, iwork, liwork, dwork, info);

        magma_free( dwork );

        timer_stop( time );
        timer_printf( "time dstedx = %6.2f\n", time );
        timer_start( time );

        magma_dormtr(MagmaLeft, uplo, MagmaNoTrans, n, n, A, lda, &work[indtau],
                     &work[indwrk], n, &work[indwk2], llwrk2, &iinfo);

        /* Copy the n-by-n result held at work[indwrk] back into A. */
        lapackf77_dlacpy("A", &n, &n, &work[indwrk], &n, A, &lda);

        timer_stop( time );
        timer_printf( "time dormtr + copy = %6.2f\n", time );

    /* If matrix was scaled, then rescale eigenvalues appropriately. */
    if (iscale == 1) {
        d__1 = 1. / sigma;
        blasf77_dscal(&n, &d__1, w, &ione);

    /* Re-publish the workspace sizes, since work/iwork were used as scratch. */
    work[0]  = lwmin * one_eps;  // round up
    iwork[0] = liwmin;

    return *info;
} /* magma_dsyevd */
Пример #4
    SGETRF_m computes an LU factorization of a general M-by-N matrix A
    using partial pivoting with row interchanges.  This version does not
    require work space on the GPU passed as input. GPU memory is allocated
    in the routine. The matrix may exceed the GPU memory.

    The factorization has the form
       A = P * L * U
    where P is a permutation matrix, L is lower triangular with unit
    diagonal elements (lower trapezoidal if m > n), and U is upper
    triangular (upper trapezoidal if m < n).

    This is the right-looking Level 3 BLAS version of the algorithm.

    Note: The factorization of big panel is done calling multiple-gpu-interface.
    Pivots are applied on GPU within the big panel.

    ngpu    INTEGER
            Number of GPUs to use. ngpu > 0.

    m       INTEGER
            The number of rows of the matrix A.  M >= 0.

    n       INTEGER
            The number of columns of the matrix A.  N >= 0.

    A       REAL array, dimension (LDA,N)
            On entry, the M-by-N matrix to be factored.
            On exit, the factors L and U from the factorization
            A = P*L*U; the unit diagonal elements of L are not stored.
            Higher performance is achieved if A is in pinned memory, e.g.
            allocated using magma_malloc_pinned.

    lda     INTEGER
            The leading dimension of the array A.  LDA >= max(1,M).

    ipiv    INTEGER array, dimension (min(M,N))
            The pivot indices; for 1 <= i <= min(M,N), row i of the
            matrix was interchanged with row IPIV(i).

    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value
                  or another error occurred, such as memory allocation failed.
      -     > 0:  if INFO = i, U(i,i) is exactly zero. The factorization
                  has been completed, but the factor U is exactly
                  singular, and division by zero will occur if it is used
                  to solve a system of equations.

    @ingroup magma_getrf
/* Out-of-GPU-memory LU factorization driver: splits A into "big panels" of
   NB columns sized from the free device memory, updates each panel with
   the previously factored ones, factors it with magma_sgetrf2_mgpu and
   copies it back, then calls magma_sgetrf_piv.
   NOTE(review): the line carrying the function name, the block braces,
   #endif lines and some call continuation lines are not present in this
   listing; the closing comment names the routine magma_sgetrf_m. */
extern "C" magma_int_t
    magma_int_t ngpu,
    magma_int_t m, magma_int_t n,
    float *A, magma_int_t lda, magma_int_t *ipiv,
    magma_int_t *info)
/* A(i,j):     pointer to entry (i,j) of A, stored with leading dimension lda.
   dAT(d,i,j): pointer into dAT[d] at block (i,j), blocks of size nb and
               row stride ldn_local.
   dPT(d,i,j): pointer into dPT[d], offset i*nb*nb + j*nb*maxm. */
#define     A(i,j) (A      + (j)*lda + (i))
#define dAT(d,i,j) (dAT[d] + (i)*nb*ldn_local + (j)*nb)
#define dPT(d,i,j) (dPT[d] + (i)*nb*nb + (j)*nb*maxm)

    magma_timer_t time=0, time_total=0, time_alloc=0, time_set=0, time_get=0, time_comp=0;
    timer_start( time_total );
    //real_Double_t flops;

    float c_one     = MAGMA_S_ONE;
    float c_neg_one = MAGMA_S_NEG_ONE;
    float *dAT[MagmaMaxGPUs], *dA[MagmaMaxGPUs], *dPT[MagmaMaxGPUs];
    magma_int_t        iinfo = 0, nb, nbi, maxm, n_local[MagmaMaxGPUs], ldn_local;
    magma_int_t        N, M, NB, NBk, I, d, ngpu0 = ngpu;
    magma_int_t        ii, jj, h, offset, ib, rows;
    magma_queue_t queues[MagmaMaxGPUs][2];
    magma_event_t  event[MagmaMaxGPUs][2];

    /* Argument checks.
       NOTE(review): the codes (-1 for m, -2 for n, -4 for lda) do not count
       ngpu as the first argument - confirm the intended numbering. */
    *info = 0;
    if (m < 0)
        *info = -1;
    else if (n < 0)
        *info = -2;
    else if (lda < max(1,m))
        *info = -4;

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;

    /* Quick return if possible */
    if (m == 0 || n == 0)
        return *info;

    magma_device_t orig_dev;
    magma_getdevice( &orig_dev );
    /* initialize nb */
    nb = magma_get_sgetrf_nb( m, n );
    maxm = magma_roundup( m, 32 );

    /* figure out NB */
    size_t freeMem, totalMem;
    cudaMemGetInfo( &freeMem, &totalMem );
    /* Convert the free byte count into a count of floats. */
    freeMem /= sizeof(float);
    /* number of columns in the big panel */
    h = 1+(2+ngpu0);
    /* Budget 80% of the free memory, minus h panels of nb columns. */
    NB = (magma_int_t)(0.8*freeMem/maxm-h*nb);
    /* The MAGMA_NGR_NB environment variable can cap NB (never below nb). */
    const char* ngr_nb_char = getenv("MAGMA_NGR_NB");
    if ( ngr_nb_char != NULL )
        NB = max( nb, min( NB, atoi(ngr_nb_char) ) );
    //NB = 5*max(nb,32);

    /* Do not use more GPUs than there are nb-wide block columns in NB. */
    if ( ngpu0 > magma_ceildiv( NB, nb )) {
        ngpu = magma_ceildiv( NB, nb );
        h = 1+(2+ngpu);
        NB = (magma_int_t)(0.8*freeMem/maxm-h*nb);
    } else {
        ngpu = ngpu0;
    /* NOTE(review): no #endif is visible for the #ifdef lines below, and the
       third one tests CHECK_SGETRF_OO while the first two test
       CHECK_SGETRF_OOC. */
    if ( ngpu*NB >= n ) {
        #ifdef CHECK_SGETRF_OOC
        printf( "      * still fit in GPU memory.\n" );
        NB = n;
    } else {
        #ifdef CHECK_SGETRF_OOC
        printf( "      * don't fit in GPU memory.\n" );
        NB = ngpu*NB;
        NB = max( nb, (NB / nb) * nb); /* making sure it's divisible by nb (x64) */

    #ifdef CHECK_SGETRF_OO
    if ( NB != n ) printf( "      * running in out-core mode (n=%lld, NB=%lld, nb=%lld, freeMem=%.2e).\n", (long long) n, (long long) NB, (long long) nb, (float) freeMem );
    else           printf( "      * running in in-core mode  (n=%lld, NB=%lld, nb=%lld, freeMem=%.2e).\n", (long long) n, (long long) NB, (long long) nb, (float) freeMem );

    if ( (nb <= 1) || (nb >= min(m,n)) ) {
        /* Use CPU code for scalar of one tile. */
        lapackf77_sgetrf(&m, &n, A, &lda, ipiv, info);
    } else {
        /* Use hybrid blocked code. */

        /* allocate memory on GPU to store the big panel */
        timer_start( time_alloc );
        n_local[0] = (NB/nb)/ngpu;
        /* NOTE(review): as listed, the multiplication below is the body of
           this if; a statement appears to be missing in between. */
        if ( NB%(nb*ngpu) != 0 )
        n_local[0] *= nb;
        ldn_local = magma_roundup( n_local[0], 32 );
        for( d=0; d < ngpu; d++ ) {
            /* NOTE(review): this early return does not release the buffers,
               queues and events created for earlier values of d. */
            if (MAGMA_SUCCESS != magma_smalloc( &dA[d], (ldn_local+h*nb)*maxm )) {
                *info = MAGMA_ERR_DEVICE_ALLOC;
                return *info;
            dPT[d] = dA[d] + nb*maxm;      /* for storing the previous panel from CPU */
            dAT[d] = dA[d] + h*nb*maxm;    /* for storing the big panel               */
            magma_queue_create( d, &queues[d][0] );
            magma_queue_create( d, &queues[d][1] );
            magma_event_create( &event[d][0] );
            magma_event_create( &event[d][1] );
        timer_stop( time_alloc );
        /* Walk over A in big panels of NB columns; I is the first column of
           the current panel. */
        for( I=0; I < n; I += NB ) {
            M = m;
            N = min( NB, n-I );       /* number of columns in this big panel             */
            //s = min( max(m-I,0), N )/nb; /* number of small block-columns in this big panel */
            maxm = magma_roundup( M, 32 );
            if ( ngpu0 > magma_ceildiv( NB, nb ) ) {
                ngpu = magma_ceildiv( NB, nb );
            } else {
                ngpu = ngpu0;
            /* Distribute the N columns over the GPUs in blocks of nb; the
               GPU with index (N/nb)%ngpu also receives the N%nb remainder. */
            for( d=0; d < ngpu; d++ ) {
                n_local[d] = ((N/nb)/ngpu)*nb;
                if (d < (N/nb)%ngpu)
                    n_local[d] += nb;
                else if (d == (N/nb)%ngpu)
                    n_local[d] += N%nb;
            ldn_local = magma_roundup( n_local[0], 32 );
            /* upload the next big panel into GPU, transpose (A->A'), and pivot it */
            timer_start( time );
            magmablas_ssetmatrix_transpose_mgpu(ngpu, M, N, nb, A(0,I), lda,
                                                dAT, ldn_local, dA, maxm, queues);
            for( d=0; d < ngpu; d++ ) {
                magma_queue_sync( queues[d][0] );
                magma_queue_sync( queues[d][1] );
            time_set += timer_stop( time );
            timer_start( time );
            /* --------------------------------------------------------------- */
            /* loop around the previous big-panels to update the new big-panel */
            for( offset = 0; offset < min(m,I); offset += NB ) {
                NBk = min( m-offset, NB );
                /* start sending the first tile from the previous big-panels to gpus */
                for( d=0; d < ngpu; d++ ) {
                    nbi  = min( nb, NBk );
                    magma_ssetmatrix_async( (M-offset), nbi,
                                            A(offset,offset), lda,
                                            dA[d],            (maxm-offset), queues[d][0] );
                    /* make sure the previous update finished */
                    //magma_queue_sync( queues[d][1] );
                    magma_queue_wait_event( queues[d][0], event[d][0] );
                    /* transpose */
                    /* NOTE(review): the remainder of this call is not present
                       in the listing. */
                    magmablas_stranspose( M-offset, nbi, dA[d], maxm-offset, dPT(d,0,0), nb,
                /* applying the pivot from the previous big-panel */
                for( d=0; d < ngpu; d++ ) {
                    magmablas_slaswp_q( ldn_local, dAT(d,0,0), ldn_local, offset+1, offset+NBk, ipiv, 1, 
                                        queues[d][1] );
                /* going through each block-column of previous big-panels */
                for( jj=0, ib=offset/nb; jj < NBk; jj += nb, ib++ ) {
                    ii   = offset+jj;
                    rows = maxm - ii;
                    nbi  = min( nb, NBk-jj );
                    for( d=0; d < ngpu; d++ ) {
                        /* wait for a block-column on GPU */
                        magma_queue_sync( queues[d][0] );
                        /* start sending next column */
                        if ( jj+nb < NBk ) {
                            magma_ssetmatrix_async( (M-ii-nb), min(nb,NBk-jj-nb),
                                                    A(ii+nb,ii+nb), lda,
                                                    dA[d],          (rows-nb), 
                                                    queues[d][0] );
                            /* make sure the previous update finished */
                            //magma_queue_sync( queues[d][1] );
                            /* The two events and the two dPT slots alternate
                               with the parity of the block-column index jj/nb. */
                            magma_queue_wait_event( queues[d][0], event[d][(1+jj/nb)%2] );
                            /* transpose next column */
                            /* NOTE(review): the remainder of this call is not
                               present in the listing. */
                            magmablas_stranspose( M-ii-nb, nb, dA[d], rows-nb, dPT(d,0,(1+jj/nb)%2), nb,
                        /* update with the block column */
                        /* NOTE(review): the strsm and sgemm calls below are
                           also cut off before their final arguments. */
                        magma_strsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit,
                                     n_local[d], nbi, c_one, dPT(d,0,(jj/nb)%2), nb, dAT(d,ib,0), ldn_local,
                        if ( M > ii+nb ) {
                            magma_sgemm( MagmaNoTrans, MagmaNoTrans,
                                         n_local[d], M-(ii+nb), nbi, c_neg_one, dAT(d,ib,0), ldn_local,
                                         dPT(d,1,(jj/nb)%2), nb, c_one, dAT(d,ib+1,0), ldn_local,
                        magma_event_record( event[d][(jj/nb)%2], queues[d][1] );
                    } /* end of for each block-columns in a big-panel */
            } /* end of for each previous big-panels */
            for( d=0; d < ngpu; d++ ) {
                magma_queue_sync( queues[d][0] );
                magma_queue_sync( queues[d][1] );
            /* calling magma-gpu interface to panel-factorize the big panel */
            if ( M > I ) {
                magma_sgetrf2_mgpu(ngpu, M-I, N, nb, I, dAT, ldn_local, ipiv+I, dA, A(0,I), lda,
                                   queues, &iinfo);
                /* NOTE(review): I already advances in steps of NB, so the
                   I * NB term below looks suspicious - confirm whether
                   iinfo + I was intended. */
                if ( iinfo < 0 ) {
                    *info = iinfo;
                } else if ( iinfo != 0 ) {
                    *info = iinfo + I * NB;
                /* adjust pivots */
                for( ii=I; ii < min(I+N,m); ii++ )
                    ipiv[ii] += I;
            time_comp += timer_stop( time );
            /* get the current big panel to CPU from devices */
            timer_start( time );
            magmablas_sgetmatrix_transpose_mgpu(ngpu, M, N, nb, dAT, ldn_local,
                                                A(0,I), lda, dA, maxm, queues);
            for( d=0; d < ngpu; d++ ) {
                magma_queue_sync( queues[d][0] );
                magma_queue_sync( queues[d][1] );
            time_get += timer_stop( time );
        } /* end of for */
        //timer_stop( time_total );
        //flops = FLOPS_SGETRF( m, n ) / 1e9;
        //timer_printf(" memory-allocation time: %e\n", time_alloc );
        //timer_printf(" NB=%lld nb=%lld\n", (long long) NB, (long long) nb );
        //timer_printf(" memcopy and transpose %e seconds\n", time_set );
        //timer_printf(" total time %e seconds\n", time_total );
        //timer_printf(" Performance %f GFlop/s, %f seconds without htod and dtoh\n",     flops / (time_comp),               time_comp               );
        //timer_printf(" Performance %f GFlop/s, %f seconds with    htod\n",              flops / (time_comp + time_set),    time_comp + time_set    );
        //timer_printf(" Performance %f GFlop/s, %f seconds with    dtoh\n",              flops / (time_comp + time_get),    time_comp + time_get    );
        //timer_printf(" Performance %f GFlop/s, %f seconds without memory-allocation\n", flops / (time_total - time_alloc), time_total - time_alloc );
        /* NOTE(review): this loop runs to ngpu0, while the allocation loop
           above ran to ngpu, which may have been reduced below ngpu0. */
        for( d=0; d < ngpu0; d++ ) {
            magma_free( dA[d] );
            magma_event_destroy( event[d][0] );
            magma_event_destroy( event[d][1] );
            magma_queue_destroy( queues[d][0] );
            magma_queue_destroy( queues[d][1] );
        magma_setdevice( orig_dev );
    /* Unless a negative error code was recorded, finish by calling
       magma_sgetrf_piv with the big-panel width NB. */
    if ( *info >= 0 )
        magma_sgetrf_piv(m, n, NB, A, lda, ipiv, info);
    return *info;
} /* magma_sgetrf_m */
Пример #5
/* Entry point of the RIP server: handles the optional --verbose flag,
   creates and loads the route table, then loops forever sending periodic
   responses, processing received packets and running the garbage collector.
   NOTE(review): closing braces are not present in this listing, so the
   exact nesting of some statements cannot be confirmed from here. */
int main(int argc, char * argv[]) {


    /* Register `interrupt` as the SIGINT handler. */
    signal(SIGINT, interrupt);

    ipv4_addr_t multiaddress;

    /* Convert the textual RIP multicast address into multiaddress. */
    ipv4_str_addr (RIP_MULTICAST_IPv4, multiaddress);

    if(argc == 2) {
        if( !strcmp(argv[1], "--verbose")) {
            is_verbose = 1;
            print_warning("(Debug mode ON) \n");
        printf("(Run with --verbose to print more info)\n");

    bold ("Starting RIP Server... \t\t\t\t\t");
    print_success("[ OK ]\n");

    table = rip_table_create ();
    if (initialize_rip (table, RIP_PORT) == -1) {
        /* Already printed advert inside function */
        return -1;
    /* K is used below as the number of routes loaded from the file. */
    int K = rip_route_table_read ( RIP_TABLE_TXT, table );

    /* set inf timer to routes read from file */
    int k;
    for( k = 0; k < K; k++ ) {

        timerms_reset(&table->routes[k]->time, INFINITE_TIMER);

    rip_table_t * table_aux;

    ipv4_addr_t src;
    /* Random offset added to UPDATE_TIME for each periodic update. */
    long int r = random_number(-15, 15)*1000;
    timerms_t update_timer = timer_start ( UPDATE_TIME + r );


    rip_route_table_print ( table );

    for ( ;; ) {

        if (timer_ended (update_timer)) {

            /* The update timer has expired: send the table to the multicast
               address and restart the timer with a new random offset. */
            send_rip_response (multiaddress, message_ptr, table, RIP_PORT);
            r = random_number(-15, 15)*1000;
            if ( is_verbose ) printf("(update_time set to %ld)\n", r +UPDATE_TIME);
            update_timer = timer_start (UPDATE_TIME + r);

            bold ("\nCurrent table:\n");
            rip_route_table_print ( table );

        int src_port;

        /* Receive into message_ptr; the sender's address and port are
           stored in src and src_port. */
        int bytes_received = rip_recv (src, message_ptr, MIN_TIMER, &src_port);

        if (bytes_received>0) {

            /* NOTE(review): no release of table_aux (or of table_send below)
               is visible in this loop - confirm ownership of these tables. */
            table_aux = convert_message_table (message_ptr, rip_number_entries(bytes_received));

            if ( is_verbose ) {

                print_notice ("\nReceived packet\n");
                print_packet (message_ptr, rip_number_entries(bytes_received));

            /* Command 2: presumably a RIP response - merge it into the table
               and send an update if compare_tables asks for one. */
            if (message_ptr->command == 2) {

                //number of entries in the received message
                int num_entries = rip_number_entries (bytes_received);

                int trig_update_flag = 0; //by default, when receiving, do not send update

                trig_update_flag = compare_tables (table, table_aux, num_entries, src);

                if (trig_update_flag) {

                    send_rip_response (multiaddress, message_ptr, table, RIP_PORT);

                bold ("\nCurrent table:\n");
                rip_route_table_print ( table );

            /* Command 1 (request) whose first entry has an infinite metric:
               reply to the sender with the whole table. */
            if (message_ptr->command == 1 &&
                    metric_is_inf(ntohl(message_ptr->entry[0].metric))) {

                print_warning("Received a request for single entry, sending whole table\n");

                send_rip_response (src, message_ptr, table, src_port);

            }  else if (message_ptr->command == 1) {

                /* Any other request: reply with the table built by
                   table_to_send from table and table_aux. */
                print_warning("Received a request for specific entries\n");
                rip_table_t * table_send = table_to_send (table, table_aux);
                send_rip_response (src, message_ptr, table_send, src_port);


        /* Run both garbage-collection steps on every loop iteration. */
        int is_garbage_on = garbage_collector_start (table, garbage_collector_timers);
        int garbage_collected = garbage_collector (table, garbage_collector_timers);

        if (garbage_collected || is_garbage_on) {

            if(garbage_collected)  print_notice("Garbage Collected \n");
            else if(is_garbage_on) print_notice("Garbage countdown ON\n");
            if( is_verbose ) rip_route_table_print(table);

    /* Follows the for ( ;; ) loop, which has no visible break. */
    return 0;
Пример #6
    ZGETRF computes an LU factorization of a general M-by-N matrix A
    using partial pivoting with row interchanges.

    The factorization has the form
       A = P * L * U
    where P is a permutation matrix, L is lower triangular with unit
    diagonal elements (lower trapezoidal if m > n), and U is upper
    triangular (upper trapezoidal if m < n).

    This is the right-looking Level 3 BLAS version of the algorithm.
    Uses two buffers to send panels.

    ngpu    INTEGER
            Number of GPUs to use. ngpu > 0.

    m       INTEGER
            The number of rows of the matrix A.  M >= 0.

    n       INTEGER
            The number of columns of the matrix A.  N >= 0.

    d_lAT   COMPLEX_16 array of pointers on the GPU, dimension (ngpu).
            On entry, the M-by-N matrix A distributed over GPUs
            (d_lAT[d] points to the local matrix on d-th GPU).
            It uses a 1D block column cyclic format (with the block size
            nb), and each local matrix is stored by row.
            On exit, the factors L and U from the factorization
            A = P*L*U; the unit diagonal elements of L are not stored.

    lddat   INTEGER
            The leading dimension of the array d_lAT[d]. LDDA >= max(1,M).

    ipiv    INTEGER array, dimension (min(M,N))
            The pivot indices; for 1 <= i <= min(M,N), row i of the
            matrix was interchanged with row IPIV(i).

    @param (workspace) on device
    d_lAP   COMPLEX_16 array of pointers on the GPU, dimension (ngpu).
            d_lAP[d] is the workspace on d-th GPU. Each local workspace
            must be of size (3+ngpu)*nb*maxm, where maxm is m rounded
            up to a multiple of 32 and nb is the block size.

    @param (workspace)
    W       COMPLEX_16 array, dimension (ngpu*nb*maxm).
            It is used to store panel on CPU.

    ldw     INTEGER
            The leading dimension of the workspace w.

    queues  magma_queue_t
            queues[d] points to the streams for the d-th GPU to execute
            in. Each GPU requires two streams.

    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value
                  or another error occurred, such as memory allocation failed.
      -     > 0:  if INFO = i, U(i,i) is exactly zero. The factorization
                  has been completed, but the factor U is exactly
                  singular, and division by zero will occur if it is used
                  to solve a system of equations.

    @ingroup magma_zgesv_comp
/*
 * Multi-GPU LU factorization driver (the closing comment names it
 * magma_zgetrf2_mgpu).  Panels of width nb are factored on the CPU with
 * lapackf77_zgetrf, sent to every GPU, the row interchanges are applied
 * with magmablas_zlaswp_q, and the trailing matrix is updated on the GPUs
 * with magma_ztrsm / magma_zgemm.  The return value is *info.
 *
 * NOTE(review): this copy of the routine looks truncated -- the line that
 * carries the function name and the opening brace, most closing braces and
 * a few argument lines are absent.  Restore them from the upstream source
 * before compiling.
 */
extern "C" magma_int_t
    magma_int_t ngpu,
    magma_int_t m, magma_int_t n, magma_int_t nb, magma_int_t offset,
    magmaDoubleComplex_ptr d_lAT[], magma_int_t lddat, magma_int_t *ipiv,
    magmaDoubleComplex_ptr d_lAP[],
    magmaDoubleComplex *W, magma_int_t ldw,
    magma_queue_t queues[][2],
    magma_int_t *info)
/* dAT(id,i,j): address of block (i,j) inside d_lAT[id], in units of nb,
 *              shifted by `offset` rows of length lddat.
 * W(j):        CPU buffer for panel j; the buffers are reused cyclically,
 *              one slot of nb*ldw elements per GPU. */
#define dAT(id,i,j)  (d_lAT[(id)] + ((offset)+(i)*nb)*lddat + (j)*nb)
#define W(j) (W + ((j)%ngpu)*nb*ldw)

    magmaDoubleComplex c_one     = MAGMA_Z_ONE;
    magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE;

    magma_int_t block_size = 32;
    magma_int_t iinfo, n_local[MagmaMaxGPUs];
    magma_int_t maxm, mindim;
    magma_int_t i, j, d, dd, rows, cols, s, ldpan[MagmaMaxGPUs];
    /* h is the number of nb*maxm panel slots cycled through in d_lAP[d];
     * the slot starting at h*nb*maxm is used as d_panel[d] below. */
    magma_int_t id, j_local, j_local2, nb0, nb1, h = 2+ngpu;
    magmaDoubleComplex *d_panel[MagmaMaxGPUs], *panel_local[MagmaMaxGPUs];

    /* Check arguments */
    *info = 0;
    if (m < 0)
        *info = -2;
    else if (n < 0)
        *info = -3;
    else if (ngpu*lddat < max(1,n))
        *info = -5;

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;

    /* Quick return if possible */
    if (m == 0 || n == 0)
        return *info;

    /* Function Body */
    mindim = min(m, n);
    /* More GPUs than column blocks is rejected with info = -1; note that
     * this path returns without calling magma_xerbla. */
    if ( ngpu > ceil((double)n/nb) ) {
        *info = -1;
        return *info;

    /* remember the caller's device and kernel stream; both are restored
     * in the clean-up section at the end */
    magma_device_t orig_dev;
    magma_getdevice( &orig_dev );
    magma_queue_t orig_stream;
    magmablasGetKernelStream( &orig_stream );
    /* Use hybrid blocked code. */
    /* maxm is m rounded up to a multiple of block_size */
    maxm  = ((m + block_size-1)/block_size)*block_size;

    /* some initializations */
    for (d=0; d < ngpu; d++) {
        /* columns held by GPU d: an equal share of the full nb-wide blocks,
         * one extra block for the first (n/nb)%ngpu GPUs, and the n%nb
         * leftover columns for the GPU right after those */
        n_local[d] = ((n/nb)/ngpu)*nb;
        if (d < (n/nb) % ngpu)
            n_local[d] += nb;
        else if (d == (n/nb) % ngpu)
            n_local[d] += n % nb;
        /* workspaces */
        d_panel[d] = &(d_lAP[d][h*nb*maxm]);   /* temporary panel storage */
    trace_init( 1, ngpu, 2, (CUstream_st**)queues );

    /* start sending the panel to cpu */
    nb0 = min(mindim, nb);
    trace_gpu_start( 0, 1, "comm", "get" );
    magmablas_ztranspose( nb0, m, dAT(0,0,0), lddat, d_lAP[0], maxm );
    magma_zgetmatrix_async( m, nb0,
                            d_lAP[0], maxm,
                            W(0),     ldw, queues[0][1] );
    trace_gpu_end( 0, 1 );

    /* ------------------------------------------------------------------------------------- */
    magma_timer_t time=0;
    timer_start( time );

    /* s = number of full nb-wide panels; the remainder is handled after
     * the loop */
    s = mindim / nb;
    for( j=0; j < s; j++ ) {
        /* Set the GPU number that holds the current panel */
        id = j % ngpu;
        /* Set the local index where the current panel is */
        j_local = j/ngpu;
        cols  = maxm - j*nb;
        rows  = m - j*nb;
        /* synchronize j-th panel from id-th gpu into work */
        magma_queue_sync( queues[id][1] );
        /* j-th panel factorization */
        trace_cpu_start( 0, "getrf", "getrf" );
        lapackf77_zgetrf( &rows, &nb, W(j), &ldw, ipiv+j*nb, &iinfo);
        /* keep only the first failure, shifted by the panel's row offset */
        if ( (*info == 0) && (iinfo > 0) ) {
            *info = iinfo + j*nb;
        trace_cpu_end( 0 );
        /* start sending the panel to all the gpus */
        d = (j+1) % ngpu;
        for( dd=0; dd < ngpu; dd++ ) {
            trace_gpu_start( 0, 1, "comm", "set" );
            magma_zsetmatrix_async( rows, nb,
                                    W(j),     ldw,
                                    &d_lAP[d][(j%h)*nb*maxm], cols,
                                    queues[d][1] );
            trace_gpu_end( 0, 1 );
            d = (d+1) % ngpu;
        /* apply the pivoting */
        d = (j+1) % ngpu;
        for( dd=0; dd < ngpu; dd++ ) {
            trace_gpu_start( d, 1, "pivot", "pivot" );
            /* on the first pass only, shift this panel's pivot indices by
             * the panel's row offset j*nb */
            if ( dd == 0 ) {
                for( i=j*nb; i < j*nb + nb; ++i ) {
                    ipiv[i] += j*nb;
            magmablas_zlaswp_q( lddat, dAT(d,0,0), lddat, j*nb + 1, j*nb + nb, ipiv, 1, queues[d][0] );
            trace_gpu_end( d, 1 );
            d = (d+1) % ngpu;
        /* update the trailing-matrix/look-ahead */
        d = (j+1) % ngpu;
        for( dd=0; dd < ngpu; dd++ ) {
            /* storage for panel */
            if ( d == id ) {
                /* the panel belongs to this gpu */
                panel_local[d] = dAT(d,j,j_local);
                ldpan[d] = lddat;
                /* next column */
                j_local2 = j_local+1;
            } else {
                /* the panel belongs to another gpu */
                panel_local[d] = d_panel[d];
                ldpan[d] = nb;
                /* next column */
                j_local2 = j_local;
                if ( d < id ) j_local2 ++;
            /* the size of the next column */
            if ( s > (j+1) ) {
                nb0 = nb;
            } else {
                nb0 = n_local[d]-nb*(s/ngpu);
                if ( d < s % ngpu ) nb0 -= nb;
            if ( d == (j+1) % ngpu) {
                /* owns the next column, look-ahead the column */
                nb1 = nb0;
                /* make sure all the pivoting has been applied */
                trace_gpu_start( d, 1, "gemm", "gemm" );
                /* transpose panel on GPU */
                magmablas_ztranspose( rows, nb, &d_lAP[d][(j%h)*nb*maxm], cols, panel_local[d], ldpan[d] );
                /* synch for remaining update */
            } else {
                /* update the entire trailing matrix */
                nb1 = n_local[d] - j_local2*nb;
                /* synchronization to make sure panel arrived on gpu */
                trace_gpu_start( d, 0, "gemm", "gemm" );
                /* transpose panel on GPU */
                magmablas_ztranspose( rows, nb, &d_lAP[d][(j%h)*nb*maxm], cols, panel_local[d], ldpan[d] );
            /* gpu updating the trailing matrix */
            magma_ztrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit,
                         nb1, nb, c_one,
                         panel_local[d],       ldpan[d],
                         dAT(d, j, j_local2), lddat);
            magma_zgemm( MagmaNoTrans, MagmaNoTrans,
                         nb1, m-(j+1)*nb, nb,
                         c_neg_one, dAT(d, j,   j_local2),         lddat,
                                    &(panel_local[d][nb*ldpan[d]]), ldpan[d],
                         c_one,     dAT(d, j+1, j_local2),         lddat );
            if ( d == (j+1) % ngpu ) {
                /* Set the local index where the current panel is */
                /* NOTE: j_local and cols declared here shadow the outer
                 * variables of the same name for the rest of this branch */
                int loff    = j+1;
                int j_local = (j+1)/ngpu;
                int ldda    = maxm - (j+1)*nb;
                int cols    = m - (j+1)*nb;
                nb0 = min(nb, mindim - (j+1)*nb); /* size of the diagonal block */
                trace_gpu_end( d, 1 );
                if ( nb0 > 0 ) {
                    /* transpose the panel for sending it to cpu */
                    trace_gpu_start( d, 1, "comm", "get" );
                    magmablas_ztranspose( nb0, m-(j+1)*nb, dAT(d,loff,j_local), lddat, &d_lAP[d][((j+1)%h)*nb*maxm], ldda );
                    /* send the panel to cpu */
                    magma_zgetmatrix_async( cols, nb0,
                                            &d_lAP[d][((j+1)%h)*nb*maxm], ldda,
                                            W(j+1), ldw, queues[d][1] );

                    trace_gpu_end( d, 1 );
            } else {
                trace_gpu_end( d, 0 );
            d = (d+1) % ngpu;
        /* update the remaining matrix by gpu owning the next panel */
        if ( (j+1) < s ) {
            /* NOTE: these declarations shadow the outer j_local and rows */
            int j_local = (j+1)/ngpu;
            int rows  = m - (j+1)*nb;
            d = (j+1) % ngpu;
            trace_gpu_start( d, 0, "gemm", "gemm" );

            magma_ztrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit,
                         n_local[d] - (j_local+1)*nb, nb,
                         c_one, panel_local[d],       ldpan[d],
                                dAT(d,j,j_local+1),  lddat );
            magma_zgemm( MagmaNoTrans, MagmaNoTrans,
                         n_local[d]-(j_local+1)*nb, rows, nb,
                         c_neg_one, dAT(d,j,j_local+1),            lddat,
                                    &(panel_local[d][nb*ldpan[d]]), ldpan[d],
                         c_one,     dAT(d,j+1,  j_local+1),        lddat );
            trace_gpu_end( d, 0 );
    } /* end of for j=1..s */
    /* ------------------------------------------------------------------------------ */
    /* Set the GPU number that holds the last panel */
    id = s % ngpu;
    /* Set the local index where the last panel is */
    j_local = s/ngpu;
    /* size of the last diagonal-block */
    nb0 = min(m - s*nb, n - s*nb);
    rows = m    - s*nb;
    cols = maxm - s*nb;
    if ( nb0 > 0 ) {
        /* wait for the last panel on cpu */
        magma_queue_sync( queues[id][1] );
        /* factor on cpu */
        lapackf77_zgetrf( &rows, &nb0, W(s), &ldw, ipiv+s*nb, &iinfo);
        if ( (*info == 0) && (iinfo > 0) )
            *info = iinfo + s*nb;
        /* send the factor to gpus */
        for( d=0; d < ngpu; d++ ) {
            j_local2 = j_local;
            if ( d < id ) j_local2 ++;
            if ( d == id || n_local[d] > j_local2*nb ) {
                /* NOTE(review): this call has fewer arguments than the
                 * magma_zsetmatrix_async call in the panel loop; the
                 * destination pointer preceding `cols` appears to be
                 * missing -- confirm against the upstream source. */
                magma_zsetmatrix_async( rows, nb0,
                                        W(s),     ldw,
                                        cols, queues[d][1] );
        for( d=0; d < ngpu; d++ ) {
            /* shift the last panel's pivot indices by s*nb, once only */
            if ( d == 0 ) {
                for( i=s*nb; i < s*nb + nb0; ++i ) {
                    ipiv[i] += s*nb;
            magmablas_zlaswp_q( lddat, dAT(d,0,0), lddat, s*nb + 1, s*nb + nb0, ipiv, 1, queues[d][0] );
        for( d=0; d < ngpu; d++ ) {
            /* wait for the pivoting to be done */
            magma_queue_sync( queues[d][0] );
            j_local2 = j_local;
            if ( d < id ) j_local2++;
            if ( d == id ) {
                /* the panel belongs to this gpu */
                panel_local[d] = dAT(d,s,j_local);
                /* next column */
                nb1 = n_local[d] - j_local*nb-nb0;
                magmablas_ztranspose( rows, nb0, &d_lAP[d][(s%h)*nb*maxm], cols, panel_local[d], lddat );
                if ( nb1 > 0 ) {
                    magma_ztrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit,
                                 nb1, nb0, c_one,
                                 panel_local[d],        lddat,
                                 dAT(d,s,j_local)+nb0, lddat);
            } else if ( n_local[d] > j_local2*nb ) {
                /* the panel belongs to another gpu */
                panel_local[d] = d_panel[d];
                /* next column */
                nb1 = n_local[d] - j_local2*nb;
                magmablas_ztranspose( rows, nb0, &d_lAP[d][(s%h)*nb*maxm], cols, panel_local[d], nb );
                magma_ztrsm( MagmaRight, MagmaUpper, MagmaNoTrans, MagmaUnit,
                             nb1, nb0, c_one,
                             panel_local[d],     nb,
                             dAT(d,s,j_local2), lddat);
    } /* if ( nb0 > 0 ) */
    /* clean up */
    trace_finalize( "zgetrf_mgpu.svg","trace.css" );
    for( d=0; d < ngpu; d++ ) {
        magma_queue_sync( queues[d][0] );
        magma_queue_sync( queues[d][1] );
    magma_setdevice( orig_dev );
    magmablasSetKernelStream( orig_stream );
    /* NOTE(review): the timer is started a second time right before `time`
     * is used as the elapsed time below; a timer_stop( time ) is probably
     * intended here -- confirm, otherwise the reported rate is meaningless. */
    timer_start( time );
    timer_printf("\n Performance %f GFlop/s\n", FLOPS_ZGETRF(m,n) / 1e9 / time );

    return *info;
} /* magma_zgetrf2_mgpu */
Пример #7
/*
 * Prints the values produced by integral_recur and integral_gen side by
 * side for n = nminn..nmaxx, then times each routine.
 *
 * Each timing loop repeats the call `count` times, reports the average time
 * per call and the total time, and lets adjust_rep_count pick a new `count`
 * until the total time lies within [tmin, tmax] seconds.
 *
 * Returns 0.
 */
int main(void) {

	#define NMAX 100 /* array size for storing the values of I_n */

	int nminn = 0;    /* min value of n to be calculated for integrals */
	int nmaxx = NMAX; /* max value of n to be calculated for integrals */

	double vals1[NMAX + 1], vals2[NMAX + 1];

	integral_recur(nminn, nmaxx, vals1);
	integral_gen(nminn, nmaxx, vals2);

	/* print the values of the two methods next to each other */
	for (int i = 0, j = nminn; i <= nmaxx - nminn; i++, j++) {
		printf("%.18f\t%.18f\t%d\n", vals1[i], vals2[i], j);
	}

	int count = 10;
	double elapsed;
	double per_call;
	double tmin = 1;
	double tmax = 2;

	/*
	 * Time integral_recur.  The loop runs exactly `count` calls so that
	 * elapsed / count really is the average time per call (the previous
	 * `k <= count` bound executed one call more than it averaged over).
	 */
	do {
		timer_start();

		for (int k = 0; k < count; k++) {
			integral_recur(nminn, nmaxx, vals1);
		}

		elapsed = timer_stop();
		per_call = elapsed / count;

		printf(" %10.2f usec     %10.6f sec    %10d\n",
		       per_call * 1.e6, elapsed, count);

		/*
		 * adjust count such that cpu time is between
		 * tmin and tmax
		 */
		count = adjust_rep_count(count, elapsed, tmin, tmax);

	} while ((elapsed > tmax) || (elapsed < tmin));

	/* Time integral_gen the same way, starting again from 10 calls. */
	count = 10;

	do {
		timer_start();

		for (int k = 0; k < count; k++) {
			integral_gen(nminn, nmaxx, vals2);
		}

		elapsed = timer_stop();
		per_call = elapsed / count;

		printf(" %10.2f usec     %10.6f sec    %10d\n",
		       per_call * 1.e6, elapsed, count);

		/*
		 * adjust count such that cpu time is between
		 * tmin and tmax
		 */
		count = adjust_rep_count(count, elapsed, tmin, tmax);

	} while ((elapsed > tmax) || (elapsed < tmin));

	return 0;
}
Пример #8
// This function copies the face values of a variable defined on a set
// of cells to the overlap locations of the adjacent sets of cells.
// Because a set of cells interfaces in each direction with exactly one
// other set, we only need to fill six different buffers. We could try to
// overlap communication with computation, by computing
// some internal values while communicating boundary values, but this
// adds so much overhead that it's not clearly useful.
//
// Visible structure: three phases, each bracketed by timers when `timeron`
// is set -- pack (copy_faces1..3 kernels), exchange (six buffer copies per
// device towards successor/predecessor in each of three directions), and
// unpack (copy_faces4..6 kernels).  Kernels are enqueued on cmd_queue[i*2],
// buffer copies on cmd_queue[x*2 + 1].
//
// NOTE(review): this copy of the routine looks truncated -- the opening
// brace, most closing braces, the body of the early-exit `if`, and the
// kernel/buffer arguments of the clEnqueue* calls are absent.  Restore them
// from the original source before compiling.
void copy_faces()
  int c, i;
  cl_int ecode = 0;

  // exit immediately if there are no faces to be copied           
  // NOTE(review): the statement inside this `if` (presumably a return) is
  // not visible here.
  if (num_devices == 1) {

  // because the difference stencil for the diagonalized scheme is 
  // orthogonal, we do not have to perform the staged copying of faces, 
  // but can send all face information simultaneously to the neighboring 
  // cells in all directions          
  if (timeron) timer_start(t_bpack);

  // pack phase: for every cell, launch the three packing kernels on each
  // device's even-numbered queue, then wait for those queues
  for (c = 0; c < ncells; c++) {
    for (i = 0; i < num_devices; i++) {
      ecode = clEnqueueNDRangeKernel(cmd_queue[i * 2],
                                     COPY_FACES1_DIM, NULL,
                                     0, NULL, NULL);
      clu_CheckError(ecode, "clEnqueueNDRange() for copy_faces1");

    for (i = 0; i < num_devices; i++) {
      ecode = clEnqueueNDRangeKernel(cmd_queue[i * 2],
                                     COPY_FACES2_DIM, NULL,
                                     0, NULL, NULL);
      clu_CheckError(ecode, "clEnqueueNDRange() for copy_faces2");

      ecode = clEnqueueNDRangeKernel(cmd_queue[i * 2],
                                     COPY_FACES3_DIM, NULL,
                                     0, NULL, NULL);
      clu_CheckError(ecode, "clEnqueueNDRange() for copy_faces3");

    for (i = 0; i < num_devices; i++) {
      CHECK_FINISH(i * 2);


  if (timeron) timer_stop(t_bpack);

  // exchange phase: copy the packed buffers to the neighbouring devices
  // using the odd-numbered queue of the receiving neighbour
  // NOTE(review): ecode from the clEnqueueCopyBuffer calls is not checked
  // in the lines visible here.
  if (timeron) timer_start(t_exch);
  for (i = 0; i < num_devices; i++) {
    CHECK_FINISH(i * 2);

    ecode = clEnqueueCopyBuffer(cmd_queue[successor[i][0] * 2 + 1],
                                0, NULL, NULL);
        CHECK_FINISH(successor[i][0] * 2 + 1);

  for (i = 0; i < num_devices; i++) {
    ecode = clEnqueueCopyBuffer(cmd_queue[predecessor[i][0] * 2 + 1],
                                0, NULL, NULL);

        CHECK_FINISH(predecessor[i][0] * 2 + 1);
    ecode = clEnqueueCopyBuffer(cmd_queue[successor[i][1] * 2 + 1],
                                0, NULL, NULL);

        CHECK_FINISH(successor[i][1] * 2 + 1);
    ecode = clEnqueueCopyBuffer(cmd_queue[predecessor[i][1] * 2 + 1],
                                0, NULL, NULL);

        CHECK_FINISH(predecessor[i][1] * 2 + 1);
    ecode = clEnqueueCopyBuffer(cmd_queue[successor[i][2] * 2 + 1],
                                0, NULL, NULL);

        CHECK_FINISH(successor[i][2] * 2 + 1);
    ecode = clEnqueueCopyBuffer(cmd_queue[predecessor[i][2] * 2 + 1],
                                0, NULL, NULL);
        CHECK_FINISH(predecessor[i][2] * 2 + 1);
  if (timeron) timer_stop(t_exch);

  // unpack the data that has just been received;             
  if (timeron) timer_start(t_bpack);

  for (c = 0; c < ncells; c++) {
    for (i = 0; i < num_devices; i++) {
      // before the first cell is unpacked, wait for the copy queue of
      // device i
      if (c == 0) CHECK_FINISH(i * 2 + 1);

      ecode = clEnqueueNDRangeKernel(cmd_queue[i * 2],
                                     COPY_FACES4_DIM, NULL,
                                     0, NULL, NULL);
      clu_CheckError(ecode, "clEnqueueNDRange() for copy_faces4");

    for (i = 0; i < num_devices; i++) {
      ecode = clEnqueueNDRangeKernel(cmd_queue[i * 2],
                                     COPY_FACES5_DIM, NULL,
                                     0, NULL, NULL);
      clu_CheckError(ecode, "clEnqueueNDRange() for copy_faces5");

      ecode = clEnqueueNDRangeKernel(cmd_queue[i * 2],
                                     COPY_FACES6_DIM, NULL,
                                     0, NULL, NULL);
      clu_CheckError(ecode, "clEnqueueNDRange() for copy_faces6");

    for (i = 0; i < num_devices; i++) {
      CHECK_FINISH(i * 2);

  if (timeron) timer_stop(t_bpack);

  for (i = 0; i < num_devices; i++)
    CHECK_FINISH(i * 2);

  // now that we have all the data, compute the rhs
Пример #9
/*
 * Thread entry point that receives player state updates over UDP.
 *
 * parameters: cast to `struct playerInfo *` and indexed as an array of
 *             players.
 *
 * Each datagram is parsed as 15 comma-separated integers into `moveInfo`,
 * copied into player[moveInfo.player], and the globals redPoints and
 * bluePoints are overwritten.  After every datagram, each player whose
 * timer reports more than 500 ticks is marked as not connected.
 *
 * NOTE(review): this copy of the routine looks truncated -- braces and the
 * body of the timer-initialisation loop are absent.
 */
void * recive_udp_data(void * parameters)
    struct timerInfo timeOut[MAX_PLAYERS];
    int i;
    char buffer[128];
    struct stcInfo moveInfo;
    struct playerInfo * player = (struct playerInfo*) parameters;
    //Init the timer
    // NOTE(review): the loop body that initialises timeOut[i] is not
    // visible here.
    for (i = 0; i < MAX_PLAYERS; i++)
    while (1)
        // NOTE(review): the return value of recvfrom is ignored, so the
        // buffer is parsed even on error and is not guaranteed to be
        // NUL-terminated.  The last argument is passed as a size value;
        // POSIX recvfrom expects a pointer to a socklen_t there -- confirm.
        recvfrom(udpInfo.udpSd, buffer, sizeof(buffer), 0, udpInfo.serverIp, sizeof(udpInfo.serverIp));
        //printf("recived from server: %s \n", buffer);
        // NOTE(review): the sscanf result is not checked; on a short or
        // malformed datagram some moveInfo fields keep stale values.
        sscanf(buffer, "%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d", &moveInfo.team,&moveInfo.x, &moveInfo.y , &moveInfo.player, &moveInfo.mouseX, &moveInfo.mouseY,&moveInfo.fire, &moveInfo.bulletX, &moveInfo.bulletY, &moveInfo.tankAngle, &moveInfo.cannonAngle, &moveInfo.dead, &moveInfo.healthPoints, &moveInfo.redPoints, &moveInfo.bluePoints);
        //Saves the incoming data in the players struct.
        // NOTE(review): moveInfo.player comes from the network and is used
        // as an array index without a range check against MAX_PLAYERS.
        player[moveInfo.player].slot = moveInfo.player;
        player[moveInfo.player].team = moveInfo.team;

        player[moveInfo.player].xCord = moveInfo.x;
        player[moveInfo.player].yCord = moveInfo.y;
        player[moveInfo.player].mouseX = moveInfo.mouseX;
        player[moveInfo.player].mouseY = moveInfo.mouseY;
        player[moveInfo.player].bulletX = moveInfo.bulletX;
        player[moveInfo.player].bulletY = moveInfo.bulletY;
        player[moveInfo.player].fire = moveInfo.fire;
        // both angles are stored with their sign flipped
        player[moveInfo.player].tankAngle = -1*moveInfo.tankAngle;
        player[moveInfo.player].cannonAngle = -1*moveInfo.cannonAngle;
        player[moveInfo.player].healthPoints = moveInfo.healthPoints;
        player[moveInfo.player].dead = moveInfo.dead;
        player[moveInfo.player].connected = 1;
        redPoints = moveInfo.redPoints;
        bluePoints = moveInfo.bluePoints;

        //If no new udp from a client it disconnects after 0.5 sec
        for (i = 0; i < MAX_PLAYERS; i++)
            if (timer_get_ticks(&timeOut[i]) > 500)
                player[i].connected = 0;
Пример #10
/*
 * Mandelbrot driver: computes an nx-by-ny grid of iteration counts (one
 * unsigned char per point) in an OpenMP parallel loop, writes the raw
 * buffer to stdout and progress/timing messages to stderr.
 *
 * Accepts no arguments, only a thread count, or a thread count followed by
 * yMin yMax xMin xMax dxy; any other argument count prints the usage text
 * and returns -2.  Returns EXIT_FAILURE if the buffer cannot be allocated
 * and 0 on success.
 *
 * NOTE(review): this copy of the routine looks truncated -- closing braces,
 * the statement controlled by `if (argc > 1)`, the start of the timer and
 * any increment of `n` inside the iteration loop are absent.
 */
int main(int argc, char ** argv) {

    // enhanced usage, useful for testing
    if (argc != 1 && argc != 2 && argc != 7) {
        fprintf(stderr, "Usage: %s [[threads] yMin yMax xMin xMax dxy]\n", argv[0]);
        fprintf(stderr, "Either specify no args, or only threads, or all args.\n");
        return -2;

    // determine amount of threads
    // NOTE(review): the statement that consumes argv[1] is not visible here.
    if (argc > 1)

    // set constants if supplied
    // (yMin, yMax, xMin, xMax and dxy are not declared in this function)
    if (argc == 7) {
        yMin = atof(argv[2]);
        yMax = atof(argv[3]);
        xMin = atof(argv[4]);
        xMax = atof(argv[5]);
        dxy  = atof(argv[6]);
    double time;
    double cx, cy;
    double zx, zy, new_zx;
    unsigned char n;
    int nx, ny;

    // The Mandelbrot calculation is to iterate the equation
    // z = z*z + c, where z and c are complex numbers, z is initially
    // zero, and c is the coordinate of the point being tested. If
    // the magnitude of z remains less than 2 for ever, then the point
    // c is in the Mandelbrot set. We write out the number of iterations
    // before the magnitude of z exceeds 2, or UCHAR_MAX, whichever is
    // smaller.

    nx = 0;
    ny = 0;
    // grid size: extent divided by the step, truncated to int
    nx = (xMax - xMin) / dxy;
    ny = (yMax - yMin) / dxy;
    int i, j;
    unsigned char * buffer = malloc(nx * ny * sizeof(unsigned char));
    if (buffer == NULL) {
      fprintf (stderr, "Couldn't malloc buffer!\n");
      return EXIT_FAILURE;
    // do the calculations parallel
    #pragma omp parallel for private(i, j, cx, zx, zy, n, new_zx, cy)
    for (i = 0; i < ny; i++) {
        // the sampled coordinates start one step below yMin / xMin
        cy = yMin - dxy + i * dxy;
        for (j = 0; j < nx; j++) {
            cx = xMin - dxy + j * dxy;
            zx = 0.0; 
            zy = 0.0; 
            n = 0;
            // NOTE(review): no increment of n is visible in this loop; as
            // shown it would never terminate for points that stay bounded.
            while ((zx*zx + zy*zy < 4.0) && (n != UCHAR_MAX)) {
                new_zx = zx*zx - zy*zy + cx;
                zy = 2.0*zx*zy + cy;
                zx = new_zx;
            buffer[i * nx + j] = n;
    // NOTE(review): no matching timer start is visible before this call.
    time = timer_end();
    fprintf (stderr, "Took %g seconds.\nNow writing file...\n", time);
    // NOTE(review): the fwrite result is not checked and buffer is not
    // freed in the lines visible here.
    fwrite(buffer, sizeof(unsigned char), nx * ny, stdout);

    fprintf (stderr, "All done! To process the image: convert -depth 8 -size " \
             "%dx%d gray:output out.jpg\n", nx, ny);
    return 0;
Пример #11
/*
 * Runs AnalyseAllPlaysPBN over `number` deals in batches and compares each
 * result with the expected trace.
 *
 * bop, playsp, solvedplp: batch input/output structures that are refilled
 *                         for every batch.
 * deal_list, play_list:   the deals and play traces to analyse.
 * trace_list:             expected results, compared with compare_TRACE.
 * number:                 total number of entries in the lists.
 *
 * The batch size is input_number (not declared in this function); the last
 * batch is shortened to the remaining entries.  A non-RETURN_NO_FAULT
 * result and every mismatch are reported with printf.  Always returns true.
 *
 * NOTE(review): this copy of the routine looks truncated -- braces, the
 * start of the timer and the end of the progress printf are absent.
 */
bool loop_play(
  boardsPBN * bop,
  playTracesPBN * playsp,
  solvedPlays * solvedplp,
  dealPBN * deal_list,
  playTracePBN * play_list,
  solvedPlay * trace_list,
  int number)
  printf("%8s %24s\n", "Hand no.", "Time");

  for (int i = 0; i < number; i += input_number)
    // size of this batch: input_number, or whatever is left at the end
    int count = (i + input_number > number ? number - i : input_number);

    bop->noOfBoards = count;
    playsp->noOfBoards = count;

    // copy the batch into the solver's input structures; target, solutions
    // and mode are set to the fixed values 0, 3 and 1 for every board
    for (int j = 0; j < count; j++)
      bop->deals[j] = deal_list[i + j];
      bop->target[j] = 0;
      bop->solutions[j] = 3;
      bop->mode[j] = 1;

      playsp->plays[j] = play_list[i + j];

    int ret;
    if ((ret = AnalyseAllPlaysPBN(bop, playsp, solvedplp, 1))
        != RETURN_NO_FAULT)
      printf("loop_play i %i: Return %d\n", i, ret);
    // NOTE(review): tu is not declared in this function and no matching
    // timer start is visible before this call.
    tu = timer_end();

    // NOTE(review): the format has three conversions but only two arguments
    // are visible; the final argument (presumably tu) and the closing
    // parenthesis are missing from this copy.
    printf("%8d (%5.1f%%) %15d\n",
           i + count,
           100. * (i + count) / static_cast<double>(number),

    // compare every solved play of the batch with the expected trace
    for (int j = 0; j < count; j++)
      if (! compare_TRACE(&solvedplp->solved[j], &trace_list[i + j]))
        printf("loop_play i %d, j %d: Difference\n", i, j);
        // printf("trace_list[%d]: \n", i+j);
        // print_TRACE(&trace_list[i+j]);
        // printf("solvedplp[%d]: \n", j);
        // print_TRACE(&solvedplp->solved[j]);


  return true;
Пример #12
    DSYGVDX computes selected eigenvalues and, optionally, eigenvectors
    of a real generalized symmetric-definite eigenproblem, of the form
    A*x=(lambda)*B*x,  A*B*x=(lambda)*x,  or B*A*x=(lambda)*x.  Here A and
    B are assumed to be symmetric and B is also positive definite.
    Eigenvalues and eigenvectors can be selected by specifying either a
    range of values or a range of indices for the desired eigenvalues.
    If eigenvectors are desired, it uses a divide and conquer algorithm.

    The divide and conquer algorithm makes very mild assumptions about
    floating point arithmetic. It will work on machines with a guard
    digit in add/subtract, or on those binary machines without guard
    digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
    Cray-2. It could conceivably fail on hexadecimal or decimal machines
    without guard digits, but we know of none.

    itype   INTEGER
            Specifies the problem type to be solved:
            = 1:  A*x = (lambda)*B*x
            = 2:  A*B*x = (lambda)*x
            = 3:  B*A*x = (lambda)*x

    range   magma_range_t
      -     = MagmaRangeAll: all eigenvalues will be found.
      -     = MagmaRangeV:   all eigenvalues in the half-open interval (VL,VU]
                   will be found.
      -     = MagmaRangeI:   the IL-th through IU-th eigenvalues will be found.

    jobz    magma_vec_t
      -     = MagmaNoVec:  Compute eigenvalues only;
      -     = MagmaVec:    Compute eigenvalues and eigenvectors.

    uplo    magma_uplo_t
      -     = MagmaUpper:  Upper triangles of A and B are stored;
      -     = MagmaLower:  Lower triangles of A and B are stored.

    n       INTEGER
            The order of the matrices A and B.  N >= 0.

    A       DOUBLE PRECISION array, dimension (LDA, N)
            On entry, the symmetric matrix A.  If UPLO = MagmaUpper, the
            leading N-by-N upper triangular part of A contains the
            upper triangular part of the matrix A.  If UPLO = MagmaLower,
            the leading N-by-N lower triangular part of A contains
            the lower triangular part of the matrix A.
            On exit, if JOBZ = MagmaVec, then if INFO = 0, A contains the
            matrix Z of eigenvectors.  The eigenvectors are normalized
            as follows:
            if ITYPE = 1 or 2, Z**T *   B    * Z = I;
            if ITYPE = 3,      Z**T * inv(B) * Z = I.
            If JOBZ = MagmaNoVec, then on exit the upper triangle (if UPLO=MagmaUpper)
            or the lower triangle (if UPLO=MagmaLower) of A, including the
            diagonal, is destroyed.

    lda     INTEGER
            The leading dimension of the array A.  LDA >= max(1,N).

    B       DOUBLE PRECISION array, dimension (LDB, N)
            On entry, the symmetric matrix B.  If UPLO = MagmaUpper, the
            leading N-by-N upper triangular part of B contains the
            upper triangular part of the matrix B.  If UPLO = MagmaLower,
            the leading N-by-N lower triangular part of B contains
            the lower triangular part of the matrix B.
            On exit, if INFO <= N, the part of B containing the matrix is
            overwritten by the triangular factor U or L from the Cholesky
            factorization B = U**T * U or B = L * L**T.

    ldb     INTEGER
            The leading dimension of the array B.  LDB >= max(1,N).

    vl      DOUBLE PRECISION
    vu      DOUBLE PRECISION
            If RANGE=MagmaRangeV, the lower and upper bounds of the interval to
            be searched for eigenvalues. VL < VU.
            Not referenced if RANGE = MagmaRangeAll or MagmaRangeI.

    il      INTEGER
    iu      INTEGER
            If RANGE=MagmaRangeI, the indices (in ascending order) of the
            smallest and largest eigenvalues to be returned.
            1 <= IL <= IU <= N, if N > 0; IL = 1 and IU = 0 if N = 0.
            Not referenced if RANGE = MagmaRangeAll or MagmaRangeV.

    mout    INTEGER
            The total number of eigenvalues found.  0 <= MOUT <= N.
            If RANGE = MagmaRangeAll, MOUT = N, and if RANGE = MagmaRangeI, MOUT = IU-IL+1.
    w       DOUBLE PRECISION array, dimension (N)
            If INFO = 0, the eigenvalues in ascending order.

    work    (workspace) DOUBLE PRECISION array, dimension (MAX(1,LWORK))
            On exit, if INFO = 0, WORK[0] returns the optimal LWORK.

    work    (workspace) DOUBLE PRECISION array, dimension (MAX(1,LWORK))
            On exit, if INFO = 0, WORK[0] returns the optimal LWORK.

    lwork   INTEGER
            The length of the array WORK.
            If N <= 1,                      LWORK >= 1.
            If JOBZ = MagmaNoVec and N > 1, LWORK >= 2*N + N*NB.
            If JOBZ = MagmaVec   and N > 1, LWORK >= max( 2*N + N*NB, 1 + 6*N + 2*N**2 ).
            NB can be obtained through magma_get_dsytrd_nb(N).
            If LWORK = -1, then a workspace query is assumed; the routine
            only calculates the optimal sizes of the WORK and IWORK
            arrays, returns these values as the first entries of the WORK
            and IWORK arrays, and no error message related to LWORK or
            LIWORK is issued by XERBLA.

    iwork   (workspace) INTEGER array, dimension (MAX(1,LIWORK))
            On exit, if INFO = 0, IWORK[0] returns the optimal LIWORK.

    liwork  INTEGER
            The dimension of the array IWORK.
            If N <= 1,                      LIWORK >= 1.
            If JOBZ = MagmaNoVec and N > 1, LIWORK >= 1.
            If JOBZ = MagmaVec   and N > 1, LIWORK >= 3 + 5*N.
            If LIWORK = -1, then a workspace query is assumed; the
            routine only calculates the optimal sizes of the WORK and
            IWORK arrays, returns these values as the first entries of
            the WORK and IWORK arrays, and no error message related to
            LWORK or LIWORK is issued by XERBLA.

    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value
      -     > 0:  DPOTRF or DSYEVD returned an error code:
               <= N:  if INFO = i and JOBZ = MagmaNoVec, then the algorithm
                      failed to converge; i off-diagonal elements of an
                      intermediate tridiagonal form did not converge to
                      zero;
                      if INFO = i and JOBZ = MagmaVec, then the algorithm
                      failed to compute an eigenvalue while working on
                      the submatrix lying in rows and columns INFO/(N+1)
                      through mod(INFO,N+1);
               > N:   if INFO = N + i, for 1 <= i <= N, then the leading
                      minor of order i of B is not positive definite.
                      The factorization of B could not be completed and
                      no eigenvalues or eigenvectors were computed.

    Further Details
    Based on contributions by
       Mark Fahey, Department of Mathematics, Univ. of Kentucky, USA

    Modified so that no backsubstitution is performed if DSYEVD fails to
    converge (NEIG in old code could be greater than N causing out of
    bounds reference to A - reported by Ralf Meyer).  Also corrected the
    description of INFO and the test on ITYPE. Sven, 16 Feb 05.

    @ingroup magma_dsygv_driver
extern "C" magma_int_t
    magma_int_t itype, magma_vec_t jobz, magma_range_t range, magma_uplo_t uplo, magma_int_t n,
    double *A, magma_int_t lda,
    double *B, magma_int_t ldb,
    double vl, double vu, magma_int_t il, magma_int_t iu,
    magma_int_t *mout, double *w,
    double *work, magma_int_t lwork,
    #ifdef COMPLEX
    double *rwork, magma_int_t lrwork,
    magma_int_t *iwork, magma_int_t liwork,
    magma_int_t *info)
    const char* uplo_  = lapack_uplo_const( uplo  );
    const char* jobz_  = lapack_vec_const( jobz  );

    double d_one = MAGMA_D_ONE;

    double *dA=NULL, *dB=NULL;
    magma_int_t ldda = magma_roundup( n, 32 );
    magma_int_t lddb = ldda;

    magma_int_t lower;
    magma_trans_t trans;
    magma_int_t wantz, lquery;
    magma_int_t alleig, valeig, indeig;

    magma_int_t lwmin, liwmin;

    wantz  = (jobz  == MagmaVec);
    lower  = (uplo  == MagmaLower);
    alleig = (range == MagmaRangeAll);
    valeig = (range == MagmaRangeV);
    indeig = (range == MagmaRangeI);
    lquery = (lwork == -1 || liwork == -1);

    *info = 0;
    if (itype < 1 || itype > 3) {
        *info = -1;
    } else if (! (alleig || valeig || indeig)) {
        *info = -2;
    } else if (! (wantz || (jobz == MagmaNoVec))) {
        *info = -3;
    } else if (! (lower || (uplo == MagmaUpper))) {
        *info = -4;
    } else if (n < 0) {
        *info = -5;
    } else if (lda < max(1,n)) {
        *info = -7;
    } else if (ldb < max(1,n)) {
        *info = -9;
    } else {
        if (valeig) {
            if (n > 0 && vu <= vl) {
                *info = -11;
        } else if (indeig) {
            if (il < 1 || il > max(1,n)) {
                *info = -12;
            } else if (iu < min(n,il) || iu > n) {
                *info = -13;

    magma_int_t nb = magma_get_dsytrd_nb( n );
    if ( n <= 1 ) {
        lwmin  = 1;
        liwmin = 1;
    else if ( wantz ) {
        lwmin  = max( 2*n + n*nb, 1 + 6*n + 2*n*n );
        liwmin = 3 + 5*n;
    else {
        lwmin  = 2*n + n*nb;
        liwmin = 1;
    work[0]  = magma_dmake_lwork( lwmin );
    iwork[0] = liwmin;

    if (lwork < lwmin && ! lquery) {
        *info = -17;
    } else if (liwork < liwmin && ! lquery) {
        *info = -19;

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    else if (lquery) {
        return *info;

    /* Quick return if possible */
    if (n == 0) {
        return *info;
    /* If matrix is very small, then just call LAPACK on CPU, no need for GPU */
    if (n <= 128) {
        lapackf77_dsygvd( &itype, jobz_, uplo_,
                          &n, A, &lda, B, &ldb,
                          w, work, &lwork,
                          iwork, &liwork, info );
        *mout = n;
        return *info;

    if (MAGMA_SUCCESS != magma_dmalloc( &dA, n*ldda ) ||
        MAGMA_SUCCESS != magma_dmalloc( &dB, n*lddb )) {
        magma_free( dA );
        magma_free( dB );
        *info = MAGMA_ERR_DEVICE_ALLOC;
        return *info;

    magma_queue_t queue;
    magma_device_t cdev;
    magma_getdevice( &cdev );
    magma_queue_create( cdev, &queue );

    /* Form a Cholesky factorization of B. */
    magma_dsetmatrix( n, n, B, ldb, dB, lddb, queue );
    magma_dsetmatrix_async( n, n,
                            A,  lda,
                            dA, ldda, queue );

    magma_timer_t time=0;
    timer_start( time );

    magma_dpotrf_gpu( uplo, n, dB, lddb, info );
    if (*info != 0) {
        *info = n + *info;
        return *info;

    timer_stop( time );
    timer_printf( "time dpotrf_gpu = %6.2f\n", time );

    magma_queue_sync( queue );
    magma_dgetmatrix_async( n, n,
                            dB, lddb,
                            B,  ldb, queue );

    timer_start( time );

    /* Transform problem to standard eigenvalue problem and solve. */
    magma_dsygst_gpu( itype, uplo, n, dA, ldda, dB, lddb, info );

    timer_stop( time );
    timer_printf( "time dsygst_gpu = %6.2f\n", time );

    /* simple fix to be able to run bigger size.
     * set dB=NULL so we know to re-allocate below
     * TODO: have dwork here that will be used as dB and then passed to  dsyevd.
    if (n > 5000) {
        magma_queue_sync( queue );
        magma_free( dB );  dB=NULL;

    timer_start( time );
    magma_dsyevdx_gpu( jobz, range, uplo, n, dA, ldda, vl, vu, il, iu, mout, w, A, lda,
                       work, lwork, iwork, liwork, info );
    timer_stop( time );
    timer_printf( "time dsyevdx_gpu = %6.2f\n", time );

    if (wantz && *info == 0) {
        timer_start( time );
        /* allocate and copy dB back */
        if (dB == NULL) {
            if (MAGMA_SUCCESS != magma_dmalloc( &dB, n*lddb ) ) {
                magma_free( dA );  dA=NULL;
                *info = MAGMA_ERR_DEVICE_ALLOC;
                return *info;
            magma_dsetmatrix( n, n, B, ldb, dB, lddb, queue );
        /* Backtransform eigenvectors to the original problem. */
        if (itype == 1 || itype == 2) {
            /* For A*x=(lambda)*B*x and A*B*x=(lambda)*x;
               backtransform eigenvectors: x = inv(L)'*y or inv(U)*y */
            if (lower) {
                trans = MagmaTrans;
            } else {
                trans = MagmaNoTrans;
            magma_dtrsm( MagmaLeft, uplo, trans, MagmaNonUnit,
                         n, *mout, d_one, dB, lddb, dA, ldda, queue );
        else if (itype == 3) {
            /* For B*A*x=(lambda)*x;
               backtransform eigenvectors: x = L*y or U'*y */
            if (lower) {
                trans = MagmaNoTrans;
            } else {
                trans = MagmaTrans;
            magma_dtrmm( MagmaLeft, uplo, trans, MagmaNonUnit,
                         n, *mout, d_one, dB, lddb, dA, ldda, queue );
        magma_dgetmatrix( n, *mout, dA, ldda, A, lda, queue );
        timer_stop( time );
        timer_printf( "time dtrsm/mm + getmatrix = %6.2f\n", time );

    magma_queue_sync( queue );
    magma_queue_destroy( queue );

    work[0]  = magma_dmake_lwork( lwmin );
    iwork[0] = liwmin;

    magma_free( dA );  dA=NULL;
    magma_free( dB );  dB=NULL;

    return *info;
} /* magma_dsygvd */
Пример #13
c   This is the serial version of the APP Benchmark 1,
c   the "embarrassingly parallel" benchmark.
c   M is the Log_2 of the number of complex pairs of uniform (0, 1) random
c   numbers.  MK is the Log_2 of the size of each batch of uniform random
c   numbers.  MK can be set for convenience on a given system, since it does
c   not affect the results.
/*
 * EP ("embarrassingly parallel") benchmark driver, OpenACC C version.
 *
 * As far as the visible code shows, main():
 *   - allocates the work arrays x (2*NK doubles), q (NQ doubles) and the
 *     row-pointer tables xx / qq (NN rows each, rows carved out of one
 *     contiguous allocation);
 *   - prints the banner and the number of random numbers, 2^(M+1);
 *   - runs three OpenACC parallel loops: per-batch initialisation,
 *     random-number generation plus Gaussian-pair tallying, and the
 *     reduction of the per-batch counts qq into q / gc;
 *   - compares the sums sx, sy against reference values for selected M;
 *   - prints the results and calls c_print_results().
 *
 * NOTE(review): this listing appears to have lost its brace-only lines and
 * possibly other statements, so block boundaries below are inferred from
 * indentation only -- confirm against the original file.
 */
int main(int argc, char **argv) {

    double *x, **xx, *q, **qq;

    double Mops, t1, t2, t3, t4, x1, x2, sx, sy, tm, an, tt, gc;
    double dum[3] = { 1.0, 1.0, 1.0 };
    const int TRANSFER_X = 1;
    int np, nn, ierr, node, no_nodes, i, l, k, nit, ierrcode,
    no_large_nodes, np_add, k_offset, j;
    double loc_x,loc_t1,loc_t2,loc_t3,loc_t4;
    double loc_a1,loc_a2,loc_x1,loc_x2,loc_z;
    boolean verified;
    char size[13+1];	/* character*13 */
/*     Allocate working memory       */
/*     NOTE(review): none of the malloc() results below are checked. */

    x = (double*) malloc(sizeof(double) * 2*NK);
    xx = (double**) malloc(sizeof(double*) * NN);
    /* xx[0] owns one contiguous NN x (2*NK) slab; xx[i] point into it. */
    xx[0] = (double*) malloc(sizeof(double) * NN * 2*NK);
    for (i = 1; i < NN; i++) xx[i] = xx[i-1] + (2*NK);
    q = (double*) malloc(sizeof(double) * NQ);
    qq = (double**) malloc(sizeof(double*) * NN);
    /* qq[0] owns one contiguous NN x NQ slab; qq[i] point into it. */
    qq[0] = (double*) malloc(sizeof(double) * NN * NQ);
    for (i = 1; i < NN; i++) qq[i] = qq[i-1] + NQ;

c   Because the size of the problem is too large to store in a 32-bit
c   integer for some classes, we put it into a string (for printing).
c   Have to strip off the decimal point put in there by the floating
c   point print statement (internal file)

    printf("\n\n NAS Parallel Benchmarks 2.3 OpenACC C version"
	   " - EP Benchmark\n");
    /* Format 2^(M+1) into size, then blank out any '.' characters.
       NOTE(review): the scan starts at j = 13, the last element of
       size[14]; confirm that sprintf always initialises that element. */
    sprintf(size, "%12.0f", pow(2.0, M+1));
    for (j = 13; j >= 1; j--) {
	if (size[j] == '.') size[j] = ' ';
    printf(" Number of random numbers generated: %13s\n", size);

    verified = FALSE;

c   Compute the number of "batches" of random number pairs generated 
c   per processor. Adjust if the number of processors does not evenly 
c   divide the total number
    np = NN;

c   Call the random number generator functions and initialize
c   the x-array to reduce the effects of paging on the timings.
c   Also, call all mathematical functions that are used. Make
c   sure these initializations cannot be eliminated as dead code.
/* OpenACC data region with create() clauses for qq, x and xx.
   NOTE(review): the directive ends in a line-continuation backslash, so a
   continuation line (and the opening brace) seems to be missing here. */
#pragma acc data create(qq[0:NN][0:NQ],x[0:2*NK],xx[0:NN][0:2*NK]) \
    vranlc(0, &(dum[0]), dum[1], &(dum[2]));
    dum[0] = randlc(&(dum[1]), dum[2]);
    for (i = 0; i < 2*NK; i++) x[i] = -1.0e99;
    Mops = log(sqrt(fabs(max(1.0, 1.0))));


    vranlc(0, &t1, A, x);
    /* Push the host-initialised x to the device copy. */
    #pragma acc update device(x[0:2*NK])

/*   Compute AN = A ^ (2 * NK) (mod 2^46). */

    t1 = A;

    /* MK+1 calls of randlc(&t1, t1); the result left in t1 becomes an. */
    for ( i = 1; i <= MK+1; i++) {
      t2 = randlc(&t1, t1);

    an = t1;
    tt = S;
    gc = 0.0;
    sx = 0.0;
    sy = 0.0;
    /* Per-batch setup: zero qq[k] and copy x into xx[k] for k in [0, np). */
    #pragma acc parallel loop
    for (k = 0; k < np; k++) {
      /* Initialize private q (qq) */
      #pragma acc loop
      for (i = 0; i < NQ; i++)
          qq[k][i] = 0.0;
      /* Initialize private x (xx)  */
      #pragma acc loop
      for (i = 0; i < 2*NK; i++)
          xx[k][i] = x[i];
c   Each instance of this loop may be performed independently. We compute
c   the k offsets separately to take into account the fact that some nodes
c   have more numbers to generate than others
    k_offset = -1;

    /* NOTE(review): t1..x2, i and l are declared a second time here; this
       presumably sat inside a nested block whose braces are missing. */
    double t1, t2, t3, t4, x1, x2;
    int kk, i, ik, l;
    double psx, psy;

    /* Main batch loop: k runs 1..np, sx and sy are OpenACC sum reductions. */
    #pragma acc parallel loop reduction(+:sx,sy)
    for (k = 1; k <= np; k++) {
      kk = k_offset + k;
      t1 = S;
      t2 = an;

/*      Find starting seed t1 for this kk. */

      /* Repeatedly halves kk (at most 100 iterations), calling RANDLC on
         t1 when kk is odd and squaring-style RANDLC(&t2, t2) otherwise. */
      #pragma acc loop seq
      for (i = 1; i <= 100; i++) {
          ik = kk / 2;
          if (2 * ik != kk) t3 = RANDLC(&t1, t2);
          if (ik == 0) break;
          t3 = RANDLC(&t2, t2);
          kk = ik;

/*      Compute uniform pseudorandom numbers. */

      /* Split A into loc_a1 / loc_a2 using the r23 / t23 constants. */
      loc_t1 = r23 * A;
      loc_a1 = (int)loc_t1;
      loc_a2 = A - t23 * loc_a1;
      loc_x = t1;

      /* Inlined generator: each iteration updates loc_x and stores
         r46 * loc_x into xx[k-1][i-1], filling 2*NK entries. */
      #pragma acc loop seq
      for (i = 1; i <= 2*NK; i++) {
          loc_t1 = r23 * loc_x;
          loc_x1 = (int)loc_t1;
          loc_x2 = loc_x - t23 * loc_x1;
          loc_t1 = loc_a1 * loc_x2 + loc_a2 * loc_x1;
          loc_t2 = (int)(r23 * loc_t1);
          loc_z = loc_t1 - t23 * loc_t2;
          loc_t3 = t23 * loc_z + loc_a2 * loc_x2;
          loc_t4 = (int)(r46 * loc_t3);
          loc_x = loc_t3 - t46 * loc_t4;
          xx[k-1][i-1] = r46 * loc_x;
      t1 = loc_x;

c       Compute Gaussian deviates by acceptance-rejection method and 
c       tally counts in concentric square annuli.  This loop is not 
c       vectorizable.
      psx = psy = 0.0;

      /* Pairs (x1, x2) are mapped from [0,1) to [-1,1); only pairs with
         x1^2 + x2^2 <= 1 are accepted and tallied into qq[k-1][l]. */
      #pragma acc loop reduction(+:psx,psy)
      for ( i = 0; i < NK; i++) {
          x1 = 2.0 * xx[k-1][2*i] - 1.0;
          x2 = 2.0 * xx[k-1][2*i+1] - 1.0;
          t1 = pow2(x1) + pow2(x2);
          if (t1 <= 1.0) {
            t2 = sqrt(-2.0 * log(t1) / t1);
            t3 = (x1 * t2);             /* Xi */
            t4 = (x2 * t2);             /* Yi */
            l = max(fabs(t3), fabs(t4));
            qq[k-1][l] += 1.0;                      /* counts */
            psx = psx + t3;  /* sum of Xi */
            psy = psy + t4;               /* sum of Yi */

      sx += psx;
      sy += psy;
/*      Reduce private qq to q          */
    /* q[i] is the column sum of qq[*][i]; gc accumulates all counts. */
    #pragma acc parallel loop reduction(+:gc)
    for ( i = 0; i < NQ; i++ ) {
      double sumq = 0.0;
      #pragma acc loop reduction(+:sumq)
      for (k = 0; k < np; k++)
          sumq = sumq + qq[k][i];
      q[i] = sumq;
      gc += sumq;

} /* end acc data */

    /* NOTE(review): timer 1 is read here, but no timer_start/timer_stop
       call is visible in this listing. */
    tm = timer_read(1);

    /* Verification: relative error of sx and sy against reference sums for
       M = 24, 25, 28, 30, 32; verified stays FALSE for any other M. */
    nit = 0;
    if (M == 24) {
	if((fabs((sx- (-3.247834652034740e3))/sx) <= EPSILON) &&
	   (fabs((sy- (-6.958407078382297e3))/sy) <= EPSILON)) {
	    verified = TRUE;
    } else if (M == 25) {
	if ((fabs((sx- (-2.863319731645753e3))/sx) <= EPSILON) &&
	    (fabs((sy- (-6.320053679109499e3))/sy) <= EPSILON)) {
	    verified = TRUE;
    } else if (M == 28) {
	if ((fabs((sx- (-4.295875165629892e3))/sx) <= EPSILON) &&
	    (fabs((sy- (-1.580732573678431e4))/sy) <= EPSILON)) {
	    verified = TRUE;
    } else if (M == 30) {
	if ((fabs((sx- (4.033815542441498e4))/sx) <= EPSILON) &&
	    (fabs((sy- (-2.660669192809235e4))/sy) <= EPSILON)) {
	    verified = TRUE;
    } else if (M == 32) {
	if ((fabs((sx- (4.764367927995374e4))/sx) <= EPSILON) &&
	    (fabs((sy- (-8.084072988043731e4))/sy) <= EPSILON)) {
	    verified = TRUE;

    /* 2^(M+1) numbers divided by the elapsed time tm, scaled by 1e6. */
    Mops = pow(2.0, M+1)/tm/1000000.0;

    /* NOTE(review): the format string is followed directly by tm with no
       comma; a trailing format line seems to be missing from the listing. */
    printf("EP Benchmark Results: \n"
	   "CPU Time = %10.4f\n"
	   "N = 2^%5d\n"
	   "No. Gaussian Pairs = %15.0f\n"
	   "Sums = %25.15e %25.15e\n"
	   tm, M, gc, sx, sy);
    /* One line per annulus index with its accumulated count q[i]. */
    for (i = 0; i  <= NQ-1; i++) {
	printf("%3d %15.0f\n", i, q[i]);
    c_print_results("EP", CLASS, M+1, 0, 0, nit,
          tm, Mops, "Random numbers generated",
		  CS1, CS2, CS3, CS4, CS5, CS6, CS7);

    return 0;
Пример #14
/*
 * Driver for the single-processor Gram-Schmidt ("up_gs") test.
 *
 * Visible behaviour: records node information (nr_nodes, initial_node,
 * next_node, nr_migrations), parses the command line, accumulates the
 * values returned by timer_stop(&start) over numloops iterations, computes
 * err = orth_err(&mat), compares it with ERROR, and prints the average
 * time when verbose >= 1.
 *
 * NOTE(review): brace-only lines and several statements (the timed call,
 * the matching timer start, an else, the save_matrix body) appear to be
 * missing from this listing -- block structure below is inferred from
 * indentation only.
 */
int main(int argc, char *argv[])
    double err = 0.0;
    struct timeval start;
    float time = 0.0;
    int n;

    /* Node bookkeeping; these names are not declared in this function. */
    nr_nodes =  get_nr_nodes();
    initial_node = get_node_id () ;
    next_node = (initial_node + 1) % nr_nodes ;
    nr_migrations = nr_nodes + 1 ;

    parse_args(argc, argv);

    if (verbose>=2)
	printf("gram-schmidt 'uni-proc' for %dx%d in", numcols, numrows);

    time = 0 ;
    for (n=0 ; n<numloops ; n++)
	/* NOTE(review): start is passed to timer_stop() but no statement that
	   initialises it is visible here. */
	time += timer_stop(&start);
	if (verbose > 2)
	  printf ("up_gs finished, starting orth_err\n");

	/* Return to the initial node before measuring the error. */
	if (migration == 1 &&
	    get_node_id() != initial_node)
	  migrate_self (initial_node);
	err = orth_err(&mat);
	printf ("\n   Err = %g\n", err);
	/* NOTE(review): the FAILED branch below is indented like an else
	   body, but no else keyword is visible. */
	if ( err < ERROR )
	  printf ("-- MGS test : PASSED --\n\n\n");
	    printf ("-- MGS test : FAILED --\n\n\n");
	    exit(-1) ;

    /* migration here = problem */

    if (verbose>2)
      printf ("up-gs : terminé\n");

    /* Average time per loop, together with the last computed error. */
    if (verbose >= 1)
        printf("\ntime :");
	printf(" %g s (err: %g)\n", time/(float)numloops, err);

    /* NOTE(review): the body of this if is not visible in the listing. */
    if (save_matrix)

    if (verbose>2)
      printf ("up_gs termine\n");
    return 0;
Пример #15
/*
 * Main program: demonstrates and times the sorting routines.
 *
 * Part 1 fills a[N_ITEMS] with random keys and, for quicksort, mergesort0,
 * mergesort, radixsort and heapsort in turn, copies a into test_array,
 * prints it, sorts it and prints it again.
 * Part 2 reads n_samples, max_n and step_n from stdin and, for array
 * lengths step_n, 2*step_n, ... up to max_n, runs qsort, quicksort,
 * mergesort, mergesort0 and heapsort n_samples times each on fresh copies
 * of the same random array.
 * Part 3 runs radixsort on an array of max_n items for radix = 2, 4, 8, ...
 * up to max_n.
 *
 * NOTE(review): brace-only lines and some statements (timer calls, result
 * printing, the end of two calls) appear to be missing from this listing;
 * block structure is inferred from indentation only.
 */
int main(void)
    int i, j, n_samples, max_n, step_n;
    int array_size;
    int radix;
    test_item_t a[N_ITEMS], test_array[N_ITEMS];
    test_item_t *timing_array, *copy_array;
    timing_t *t;
    /* Assign random values to the key of each array element. */
    rand_array(a, N_ITEMS, 100);

    /* Now test quicksort(). */
    memcpy(test_array, a, sizeof(test_array));
    printf("array before quicksort\n = ");
    print_array(test_array, N_ITEMS);
    quicksort(test_array, N_ITEMS, sizeof(test_item_t), item_cmp);
    printf("array after quicksort\n = ");
    print_array(test_array, N_ITEMS);

    /* Now test mergesort0(). */
    memcpy(test_array, a, sizeof(test_array));
    printf("array before mergesort0\n = ");
    print_array(test_array, N_ITEMS);
    mergesort0(test_array, N_ITEMS, sizeof(test_item_t), item_cmp);
    printf("array after mergesort0\n = ");
    print_array(test_array, N_ITEMS);

    /* Now test mergesort(). */
    memcpy(test_array, a, sizeof(test_array));
    printf("array before mergesort\n = ");
    print_array(test_array, N_ITEMS);
    mergesort(test_array, N_ITEMS, sizeof(test_item_t), item_cmp);
    printf("array after mergesort\n = ");
    print_array(test_array, N_ITEMS);
    /* Now test radix sort (called with get_value and radix 10). */
    memcpy(test_array, a, sizeof(test_array));
    printf("array before radixsort\n = ");
    print_array(test_array, N_ITEMS);
    radixsort(test_array, N_ITEMS, sizeof(test_item_t), get_value, 10);
    printf("array after radixsort\n = ");
    print_array(test_array, N_ITEMS);

    /* Now test heapsort. */
    memcpy(test_array, a, sizeof(test_array));
    printf("array before heapsort\n = ");
    print_array(test_array, N_ITEMS);
    heapsort(test_array, N_ITEMS, sizeof(test_item_t), item_cmp);
    printf("array after heapsort\n = ");
    print_array(test_array, N_ITEMS);
    /* Time the quicksort and mergesort sorting functions. */

    /* NOTE(review): the scanf() return values are not checked, so
       n_samples, max_n and step_n may be used uninitialised on bad input. */
    printf("Enter the number of samples to use: ");
    scanf("%d", &n_samples);
    printf("Enter the maximum array length to sort: ");
    scanf("%d", &max_n);
    printf("Enter the step size for array lengths: ");
    scanf("%d", &step_n);

    t = timing_alloc(5);  /* Five different sorting algorithms. */
    /* NOTE(review): the printf call below is not closed in this listing. */
    printf("\nResults (n, qsort, quicksort, mergesort, mergesort0, heapsort) (msec)\n"
    for(i = step_n; i <= max_n; i += step_n) {
	/* Two buffers of i items: copy_array holds the random input and
	   timing_array is overwritten with it before every sort.
	   NOTE(review): malloc() results are unchecked and no free() is
	   visible for either buffer. */
	array_size = i * sizeof(test_item_t);
        timing_array = malloc(array_size);
	copy_array = malloc(array_size);
	rand_array(copy_array, i, MAX_VALUE);

	for(j = 0; j < n_samples; j++) {
	    memcpy(timing_array, copy_array, array_size);
	    qsort(timing_array, i, sizeof(test_item_t), item_cmp);

	    memcpy(timing_array, copy_array, array_size);
	    quicksort(timing_array, i, sizeof(test_item_t), item_cmp);

	    memcpy(timing_array, copy_array, array_size);
	    mergesort(timing_array, i, sizeof(test_item_t), item_cmp);

	    memcpy(timing_array, copy_array, array_size);
	    mergesort0(timing_array, i, sizeof(test_item_t), item_cmp);

	    memcpy(timing_array, copy_array, array_size);
	    heapsort(timing_array, i, sizeof(test_item_t), item_cmp);
	printf("%d", i);


    /* Time radix sort on the largest array, using different radix sizes. */
    printf("\nRadix Sort Results.  Using n = %d\n", max_n);
    printf("(radix, time)\n");
    array_size = max_n * sizeof(test_item_t);
    timing_array = malloc(array_size);
    copy_array = malloc(array_size);
    rand_array(copy_array, max_n, MAX_VALUE);
    /* radix doubles each pass: 2, 4, 8, ... while radix <= max_n. */
    for(radix = 2; radix <= max_n; radix <<= 1) {

	for(j = 0; j < n_samples; j++) {
	    memcpy(timing_array, copy_array, array_size);
	    /* NOTE(review): the argument list of this call is cut off. */
	    radixsort(timing_array, max_n, sizeof(test_item_t), get_value,
	printf("%d", radix);
	timer_print("\t%.2f", n_samples);
    return 0;
Пример #16
/*
 * Client entry point for the networked SDL tank game.
 *
 * Visible behaviour: initialises the player table and SDL, connects to the
 * server using argv[1] and argv[2], loads images and fonts, reads the
 * player's id from the TCP socket, starts the UDP receive thread, then runs
 * the render loop (input handling, map, own tank, other tanks, bullets,
 * text UI, screen flip, frame-rate cap) while run is non-zero.
 *
 * NOTE(review): brace-only lines and several statements (else branches,
 * the timer start, draw calls for other tanks) appear to be missing from
 * this listing; block structure is inferred from indentation only.
 */
int main(int argc, char *argv[])
    //Network vars
    int udpSd, tcpSd;
    char buffer[5];
    int myId;
    //SDL vars
    SDL_Event event;
    SDL_Surface* screen = NULL;
    SDL_Surface* blueTank = NULL;
    SDL_Surface* blueCannon = NULL;
    SDL_Surface* redTank = NULL;
    SDL_Surface* redCannon = NULL;
    SDL_Surface* bullet = NULL;
    SDL_Surface* worldMap = NULL;
    //rotation Images
    //SDL_Surface* rotatedTank[6] = {NULL,NULL,NULL,NULL,NULL,NULL};
    //SDL_Surface* rotatedCannon[6] = {NULL,NULL,NULL,NULL,NULL,NULL};
    SDL_Surface* rotatedBullet[6] = {NULL,NULL,NULL,NULL,NULL,NULL};
    //SDL_Rect Tankoffset[6] = {400,300,0,0};
    //SDL_Rect Cannonoffset[6] = {400,300,0,0};

    //UserInterface vars
    SDL_Surface* UIhealth;
    SDL_Surface* UIreload;
    SDL_Surface* UIredPoints;
    SDL_Surface* UIbluePoints;
    TTF_Font *font = NULL;
    TTF_Font *reloadFont = NULL;
    SDL_Color textColor = { 255, 255, 255 };
    char textBuffer[32];
    char reload[32] = "FIRE: ";
    //Thread vars
    pthread_t reciveUdpData;

    //Game vars
    // NOTE(review): run is tested by the main loop below, but no statement
    // that sets it to a non-zero value is visible in this listing.
    int run;
    struct playerInfo player[6];
    struct timerInfo fps;
    struct cameraInfo camera;
    int bulletAngle[6];
    //int oldCannonAngle[6];
    //int oldTankAngle[6];
    //Other vars
    int i;
    //Initialises the player struct
    // NOTE(review): the arrays above have 6 entries while the loops run to
    // MAX_PLAYERS -- assumes MAX_PLAYERS <= 6, TODO confirm.
    for (i = 0; i < MAX_PLAYERS; i++)
        player[i].slot = -1;
        player[i].connected = 0;
    //Initialises SDL and opens the screen
    screen = init_sdl();
    if(screen == 0)
        printf("Error initializing SDL\n");
        return 0;
    //Makes the connection to the server
    // NOTE(review): argv[1] and argv[2] are used without a visible argc check.
    if(!(init_udp_tcp(&udpSd, &tcpSd, argv[1], argv[2])))
        printf("Error making connection\n");
        return 0;
    //load the images (Function maybe)
    blueTank = load_image( "./images/blueTank.bmp" );
    blueCannon = load_image( "./images/blueCannon.bmp" );
    redTank = load_image( "./images/redTank.bmp" );
    redCannon = load_image( "./images/redCannon.bmp" );
    worldMap = load_image( "./images/worldMap.bmp" );
    bullet = load_image( "./images/bullet.bmp" );

    //Load the fonts
    font = TTF_OpenFont( "./fonts/Army.ttf", 24 );
    reloadFont = TTF_OpenFont( "./fonts/Armyfat.ttf", 24 );
    //Moves udp info to global var
    udpInfo.udpSd = udpSd;
    strcpy(udpInfo.serverIp, argv[1]);

    //Receives the first information from the server
    // NOTE(review): the recv() result is ignored and buffer is passed to
    // atoi() without being NUL-terminated here -- confirm the server always
    // sends a terminated string of at most 5 bytes.
    recv(tcpSd, buffer, sizeof(buffer), 0);
    myId = atoi(buffer);
    //Starts the receive-data thread, handing it the player table
    pthread_create( &reciveUdpData, NULL, recive_udp_data, &(player));

    while (run)
        //Start the timer

        //Drain pending events; quit on window close or the Escape key
        while( SDL_PollEvent( &event ) )
            if( event.type == SDL_QUIT || event.key.keysym.sym == SDLK_ESCAPE)
                run = FALSE;
            handel_input(&event, tcpSd );
        //Camera is the negated position of the local player
        camera.xCord = -player[myId].xCord;
        camera.yCord = -player[myId].yCord;
        //From here we draw stuff on the screen
        SDL_FillRect(screen,NULL, 0x000000);

        //Draws the world map
        draw_map(player[myId].xCord,player[myId].yCord, worldMap, screen);

        //DISPLAYS YOUR TANK+++++++++++++++++++++++++++++
        // Team 1 uses the blue images; the red pair below presumably belongs
        // to an else branch that is not visible in this listing.
        if (player[myId].team == 1)
            draw_tank_self(player[myId].tankAngle, blueTank, screen);
            draw_cannon_self(player[myId].cannonAngle, blueCannon, screen);
            draw_tank_self(player[myId].tankAngle, redTank, screen);
            draw_cannon_self(player[myId].cannonAngle, redCannon, screen);

        //DISPLAYS OTHER TANKS+++++++++++++++++++++++++++++
        // NOTE(review): the bodies of the conditions below are not visible.
        for (i = 0; i < MAX_PLAYERS; i++)
            if (player[i].slot == myId)
            if (player[i].slot > -1 && player[i].connected == 1)
                if (player[i].team == 1)
                    if (player[i].dead == 0)
                    if (player[i].dead == 0)
        //DRAWS ALL THE BULLETS ON THE SCREEN+++++++++++++++++++++++++++
        // bulletAngle[i] is used as a flag: when it is 0 the rotated bullet
        // surface is rebuilt from the current cannon angle, then it is set to 1.
        // NOTE(review): bulletAngle is read here without a visible initialiser.
        for (i = 0; i < MAX_PLAYERS; i++)
            if (player[i].slot > -1 && player[i].connected == 1)
                if (player[i].fire > 0)
                    if (bulletAngle[i] == 0)
                        SDL_FreeSurface( rotatedBullet[i] );
                        rotatedBullet[i] = rotozoomSurface(bullet,player[i].cannonAngle,1.0,0);
                    draw_bullet(&player[i], &camera, rotatedBullet[i], screen );
                    bulletAngle[i] = 1;
                if (player[i].fire == 0)
        //DRAWS THE USER INTERFACE ON SCREEN+++++++++++++++++++++++++++++
        // NOTE(review): a new text surface is rendered every frame and no
        // SDL_FreeSurface call for these UI surfaces is visible.
        sprintf(textBuffer, "BLUE POINTS: %d", bluePoints);
        UIbluePoints = TTF_RenderText_Solid( font, textBuffer, textColor );
        draw_UI( 10, 10, UIbluePoints, screen);
        sprintf(textBuffer, "RED POINTS: %d", redPoints);
        UIredPoints = TTF_RenderText_Solid( font, textBuffer, textColor );
        draw_UI( 600, 10, UIredPoints, screen);
        sprintf(textBuffer, "HP: %d", player[myId].healthPoints);
        UIhealth = TTF_RenderText_Solid( font, textBuffer, textColor );
        draw_UI( 20, 570, UIhealth, screen);
        // fire == 0 shows "READY"; the "RELOADING" pair presumably belongs
        // to an else branch that is not visible in this listing.
        if (player[myId].fire == 0)
            strcpy(reload, "FIRE: READY");
            UIreload = TTF_RenderText_Solid( font, reload, textColor );
            strcpy(reload, "FIRE: RELOADING");
            UIreload = TTF_RenderText_Solid( font, reload, textColor );
        draw_UI( 580, 570, UIreload, screen);

        //Update Screen
        SDL_Flip( screen );

        //Cap the frame rate: sleep for the remainder of the 1000 / FPS ms slot
        if( timer_get_ticks(&fps) < 1000 / FPS )
            SDL_Delay( ( 1000 / FPS ) - timer_get_ticks(&fps) );

    return 0;
Пример #17
/*
 * Benchmark driver for the dictionary containers.
 *
 * Usage: <appname> [type] [input], where type selects the container and
 * input is a text file of newline-separated keys.
 *
 * Visible behaviour: creates the container, loads the keys into words[],
 * then runs and reports these phases in order: insert, forward iteration,
 * reverse iteration, search for every key, a second search pass labelled
 * "bad search", and removal. After each phase it prints the user CPU time
 * taken from the rusage end value, the comparison and hash counts, and for
 * tree containers the rotation count; totals are printed at the end.
 *
 * NOTE(review): brace-only lines and a number of statements (timer_start
 * calls, the counting loop body, a rewind, some printf arguments) appear to
 * be missing from this listing; block structure is inferred from
 * indentation only.
 */
main(int argc, char **argv)
    bool shuffle_keys = true;

    if (argc != 3) {
	fprintf(stderr, "usage: %s [type] [input]\n", appname);
	fprintf(stderr, "type: specifies the dictionary type:\n");
	fprintf(stderr, "   h: height-balanced tree\n");
	fprintf(stderr, "   p: path-reduction tree\n");
	fprintf(stderr, "   r: red-black tree\n");
	fprintf(stderr, "   t: treap\n");
	fprintf(stderr, "   s: splay tree\n");
	fprintf(stderr, "   w: weight-balanced tree\n");
	fprintf(stderr, "   S: skiplist\n");
	fprintf(stderr, "   H: hashtable\n");
	fprintf(stderr, "   2: hashtable 2\n");
	fprintf(stderr, "input: text file consisting of newline-separated keys\n");


    /* Route the library's allocations through xmalloc. */
    dict_malloc_func = xmalloc;

    const char type = argv[1][0];
    const char *container_name = NULL;
    dict *dct = create_dictionary(type, &container_name);
    if (!dct)
	quit("can't create container");

    ASSERT(comp_count == 0);
    ASSERT(hash_count == 0);

    /* Value of malloced right after the container was created; reported
       later as the container's own size. */
    const size_t malloced_save = malloced;

    FILE *fp = fopen(argv[2], "r");
    if (fp == NULL)
	quit("cant open file '%s': %s", argv[2], strerror(errno));

    /* First pass over the file.
       NOTE(review): the body of this loop (presumably the nwords
       increment) and any rewind before the second pass are not visible. */
    size_t nwords = 0;
    char buf[512];
    while (fgets(buf, sizeof(buf), fp))

    if (!nwords)
	quit("nothing read from file");

    /* Second pass: strip the newline and duplicate each line into words[]. */
    char **words = xmalloc(sizeof(*words) * nwords);
    size_t words_read = 0;
    while (words_read < nwords && fgets(buf, sizeof(buf), fp)) {
	strtok(buf, "\n");
	words[words_read++] = xstrdup(buf);
    if (words_read < nwords)
	quit("Only read %zu/%zu words!", words_read, nwords);
    printf("Loaded %zu keys from %s.\n", nwords, argv[2]);

    /* Discard the allocations made for words[] from the memory tally. */
    malloced = malloced_save;
    size_t total_comp = 0, total_hash = 0, total_rotations = 0;

    struct rusage start, end;
    struct timeval total = { 0, 0 };

    /* Phase: insert every key, storing the key pointer as its own datum.
       NOTE(review): i is unsigned but is printed with %d below. */
    for (unsigned i = 0; i < nwords; i++) {
	dict_insert_result result = dict_insert(dct, words[i]);
	if (!result.inserted)
	    quit("insert #%d failed for '%s'", i, words[i]);
	ASSERT(result.datum_ptr != NULL);
	ASSERT(*result.datum_ptr == NULL);
	*result.datum_ptr = words[i];
    timer_end(&start, &end, &total);
    printf("    %s container: %.02fkB\n", container_name, malloced_save * 1e-3);
    printf("       %s memory: %.02fkB\n", container_name, malloced * 1e-3);
    /* NOTE(review): the format starts with %s but the first visible
       argument is the time expression; an argument line seems missing. */
    printf("       %s insert: %6.03fs %9zu cmp (%.02f/insert)",
	   (end.ru_utime.tv_sec * 1000000 + end.ru_utime.tv_usec) * 1e-6,
	   comp_count, comp_count / (double) nwords);
    if (hash_count)
	printf(" %9zu hash", hash_count);
    total_comp += comp_count; comp_count = 0;
    total_hash += hash_count; hash_count = 0;
    /* Sorted containers other than the skiplist are treated as trees and
       report path-length statistics; the skiplist reports a histogram. */
    if (dict_is_sorted(dct) && type != 'S') {
	tree_base *tree = dict_private(dct);
	printf(" min path length: %zu\n", tree_min_path_length(tree));
	printf(" max path length: %zu\n", tree_max_path_length(tree));
	printf(" tot path length: %zu\n", tree_total_path_length(tree));
	printf("insert rotations: %zu\n", tree->rotation_count);
	total_rotations += tree->rotation_count;
	tree->rotation_count = 0;
    } else if (type == 'S') {
	size_t counts[16] = { 0 };
	size_t num_counts = skiplist_link_count_histogram(dict_private(dct), counts, sizeof(counts) / sizeof(counts[0]));
	size_t count_sum = 0;
	/* NOTE(review): the bound is inclusive (i <= num_counts); confirm
	   num_counts can never reach 16, the size of counts. */
	for (size_t i = 0; i <= num_counts; ++i) {
	    printf("skiplist %zu-node(s): %zu\n", i, counts[i]);
	    count_sum += counts[i];
	ASSERT(count_sum == nwords);

    comp_count = hash_count = 0; /* Ignore comparisons/hashes incurred by dict_verify() */

    /* NOTE(review): n and nwords are size_t but are printed with %u here
       and in the warn()/quit() calls further down. */
    size_t n = dict_count(dct);
    if (n != nwords)
	quit("bad count (%u - should be %u)!", n, nwords);

    dict_itor *itor = dict_itor_new(dct);

    /* Phase: forward iteration, checking that each key is its own datum.
       NOTE(review): neither the iterator positioning call nor the
       increment of n is visible in this listing. */
    n = 0;
    do {
	ASSERT(dict_itor_key(itor) == *dict_itor_datum(itor));
    } while (dict_itor_next(itor));
    timer_end(&start, &end, &total);
    printf("  %s fwd iterate: %6.03fs\n",
	   (end.ru_utime.tv_sec * 1000000 + end.ru_utime.tv_usec) * 1e-6);
    if (n != nwords)
	warn("Fwd iteration returned %u items - should be %u", n, nwords);

    comp_count = hash_count = 0; /* Ignore comparisons/hashes incurred by dict_verify() */

    /* Phase: reverse iteration with the same key/datum check. */
    n = 0;
    do {
	ASSERT(dict_itor_key(itor) == *dict_itor_datum(itor));
    } while (dict_itor_prev(itor));
    timer_end(&start, &end, &total);
    printf("  %s rev iterate: %6.03fs\n",
	   (end.ru_utime.tv_sec * 1000000 + end.ru_utime.tv_usec) * 1e-6);
    if (n != nwords)
	warn("Rev iteration returned %u items - should be %u", n, nwords);


    if (shuffle_keys) shuffle(words, nwords);

    comp_count = hash_count = 0; /* Ignore comparisons/hashes incurred by dict_verify() */

    /* Phase: look up every key and check the stored datum is the key.
       NOTE(review): the first failure message prints buf, which still
       holds the last line read from the file, not words[i]. */
    for (unsigned i = 0; i < nwords; i++) {
	void **p = dict_search(dct, words[i]);
	if (!p)
	    quit("lookup failed for '%s'", buf);
	if (*p != words[i])
	    quit("bad data for '%s', got '%s' instead", words[i], *(char **)p);
    timer_end(&start, &end, &total);
    printf("  %s good search: %6.03fs %9zu cmp (%.02f/search)",
	   (end.ru_utime.tv_sec * 1000000 + end.ru_utime.tv_usec) * 1e-6,
	   comp_count, comp_count / (double) nwords);
    if (hash_count)
	printf(" %9zu hash", hash_count);
    total_comp += comp_count; comp_count = 0;
    total_hash += hash_count; hash_count = 0;
    /* Every type except the two hashtables and the skiplist is a tree. */
    if (type != 'H' && type != '2' && type != 'S') {
	tree_base *tree = dict_private(dct);
	printf("search rotations: %zu\n", tree->rotation_count);
	total_rotations += tree->rotation_count;
	tree->rotation_count = 0;

    comp_count = hash_count = 0; /* Ignore comparisons/hashes incurred by dict_verify() */

    /* Phase labelled "bad search".
       NOTE(review): rv picks a random index into the key but is not used
       in the visible code, and strlen() == 0 would make the modulo divide
       by zero; the statements that alter the key seem to be missing. */
    for (unsigned i = 0; i < nwords; i++) {
	unsigned rv = dict_rand() % strlen(words[i]);
	dict_search(dct, words[i]);
    timer_end(&start, &end, &total);
    printf("   %s bad search: %6.03fs %9zu cmp (%.02f/search)",
	   (end.ru_utime.tv_sec * 1000000 + end.ru_utime.tv_usec) * 1e-6,
	   comp_count, comp_count / (double) nwords);
    if (hash_count)
	printf(" %9zu hash", hash_count);
    total_comp += comp_count; comp_count = 0;
    total_hash += hash_count; hash_count = 0;

    comp_count = hash_count = 0; /* Ignore comparisons/hashes incurred by dict_verify() */

    if (shuffle_keys) shuffle(words, nwords);

    /* Phase: remove every key and check the returned key and datum. */
    for (unsigned i = 0; i < nwords; i++) {
	dict_remove_result result = dict_remove(dct, words[i]);
	if (!result.removed)
	    quit("removing #%d '%s' failed!\n", i, words[i]);
	ASSERT(result.key == words[i]);
	ASSERT(result.datum == words[i]);
    timer_end(&start, &end, &total);
    printf("       %s remove: %6.03fs %9zu cmp (%.2f/remove)",
	   (end.ru_utime.tv_sec * 1000000 + end.ru_utime.tv_usec) * 1e-6,
	   comp_count, comp_count / (double)nwords);
    if (hash_count)
	printf(" %9zu hash", hash_count);
    total_comp += comp_count; comp_count = 0;
    total_hash += hash_count; hash_count = 0;
    if (type != 'H' && type != '2' && type != 'S') {
	tree_base *tree = dict_private(dct);
	printf("remove rotations: %zu\n", tree->rotation_count);
	total_rotations += tree->rotation_count;
	tree->rotation_count = 0;

    comp_count = hash_count = 0; /* Ignore comparisons/hashes incurred by dict_verify() */

    /* The container must be empty once every key has been removed. */
    if ((n = dict_count(dct)) != 0)
	quit("error - count not zero (%u)!", n);

    dict_free(dct, key_str_free);

    /* Totals accumulated by timer_end() into total and by the phases above.
       NOTE(review): this printf call is cut off after its second line. */
    printf("        %s total: %6.03fs %9zu cmp",
	   (total.tv_sec * 1000000 + total.tv_usec) * 1e-6,
    if (total_hash)
	printf(" %9zu hash", total_hash);

    if (type != 'H' && type != '2' && type != 'S') {
	printf(" total rotations: %zu\n", total_rotations);


Пример #18
    SSYEVD_GPU computes all eigenvalues and, optionally, eigenvectors of
    a real symmetric matrix A.  If eigenvectors are desired, it uses a
    divide and conquer algorithm.

    The divide and conquer algorithm makes very mild assumptions about
    floating point arithmetic. It will work on machines with a guard
    digit in add/subtract, or on those binary machines without guard
    digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
    Cray-2. It could conceivably fail on hexadecimal or decimal machines
    without guard digits, but we know of none.

    jobz    magma_vec_t
      -     = MagmaNoVec:  Compute eigenvalues only;
      -     = MagmaVec:    Compute eigenvalues and eigenvectors.

    uplo    magma_uplo_t
      -     = MagmaUpper:  Upper triangle of A is stored;
      -     = MagmaLower:  Lower triangle of A is stored.

    n       INTEGER
            The order of the matrix A.  N >= 0.

    dA      REAL array on the GPU,
            dimension (LDDA, N).
            On entry, the symmetric matrix A.  If UPLO = MagmaUpper, the
            leading N-by-N upper triangular part of A contains the
            upper triangular part of the matrix A.  If UPLO = MagmaLower,
            the leading N-by-N lower triangular part of A contains
            the lower triangular part of the matrix A.
            On exit, if JOBZ = MagmaVec, then if INFO = 0, A contains the
            orthonormal eigenvectors of the matrix A.
            If JOBZ = MagmaNoVec, then on exit the lower triangle (if UPLO=MagmaLower)
            or the upper triangle (if UPLO=MagmaUpper) of A, including the
            diagonal, is destroyed.

    ldda    INTEGER
            The leading dimension of the array DA.  LDDA >= max(1,N).

    w       REAL array, dimension (N)
            If INFO = 0, the eigenvalues in ascending order.

    wA      (workspace) REAL array, dimension (LDWA, N)

    ldwa    INTEGER
            The leading dimension of the array wA.  LDWA >= max(1,N).

    work    (workspace) REAL array, dimension (MAX(1,LWORK))
            On exit, if INFO = 0, WORK[0] returns the optimal LWORK.

    lwork   INTEGER
            The length of the array WORK.
            If N <= 1,                      LWORK >= 1.
            If JOBZ = MagmaNoVec and N > 1, LWORK >= 2*N + N*NB.
            If JOBZ = MagmaVec   and N > 1, LWORK >= max( 2*N + N*NB, 1 + 6*N + 2*N**2 ).
            NB can be obtained through magma_get_ssytrd_nb(N).
            If LWORK = -1, then a workspace query is assumed; the routine
            only calculates the optimal sizes of the WORK and IWORK
            arrays, returns these values as the first entries of the WORK
            and IWORK arrays, and no error message related to LWORK or
            LIWORK is issued by XERBLA.

    iwork   (workspace) INTEGER array, dimension (MAX(1,LIWORK))
            On exit, if INFO = 0, IWORK[0] returns the optimal LIWORK.

    liwork  INTEGER
            The dimension of the array IWORK.
            If N <= 1,                       LIWORK >= 1.
            If JOBZ = MagmaNoVec and N > 1, LIWORK >= 1.
            If JOBZ  = MagmaVec   and N > 1, LIWORK >= 3 + 5*N.
            If LIWORK = -1, then a workspace query is assumed; the
            routine only calculates the optimal sizes of the WORK and
            IWORK arrays, returns these values as the first entries of
            the WORK and IWORK arrays, and no error message related to
            LWORK or LIWORK is issued by XERBLA.

    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value
      -     > 0:  if INFO = i and JOBZ = MagmaNoVec, then the algorithm failed
                  to converge; i off-diagonal elements of an intermediate
                  tridiagonal form did not converge to zero;
                  if INFO = i and JOBZ = MagmaVec, then the algorithm failed
                  to compute an eigenvalue while working on the submatrix
                  lying in rows and columns INFO/(N+1) through mod(INFO,N+1).

    Further Details
    Based on contributions by
       Jeff Rutter, Computer Science Division, University of California
       at Berkeley, USA

    Modified description of INFO. Sven, 16 Feb 05.

    @ingroup magma_ssyev_driver
/*
 * Body of the single-precision symmetric eigensolver documented above
 * (the closing comment names it magma_ssyevd_gpu).
 *
 * Flow, as visible in this excerpt:
 *   1. Validate jobz/uplo/n/ldda and compute the minimum work/iwork sizes.
 *   2. Store those sizes in work[0]/iwork[0]; return early on an invalid
 *      argument or on a workspace query (lwork == -1 or liwork == -1).
 *   3. For n <= 128, copy dA to a host buffer, run lapackf77_ssyevd on it,
 *      copy the buffer back to dA and return.
 *   4. Otherwise scale dA when its max-norm is below rmin or above rmax,
 *      reduce it to tridiagonal form, then call lapackf77_ssterf
 *      (eigenvalues only) or magma_sstedx + magma_sormtr_gpu (eigenvectors).
 *   5. Undo the scaling on w, store the minimum sizes again, return *info.
 *
 * NOTE(review): the line carrying the function name, the brace-only lines
 * and the #else/#endif lines that should pair with the #ifdef lines below
 * are not present in this excerpt; restore them from the upstream source
 * before attempting to compile.
 */
extern "C" magma_int_t
    magma_vec_t jobz, magma_uplo_t uplo,
    magma_int_t n,
    magmaFloat_ptr dA, magma_int_t ldda,
    float *w,
    float *wA,  magma_int_t ldwa,
    float *work, magma_int_t lwork,
    #ifdef COMPLEX
    float *rwork, magma_int_t lrwork,
    magma_int_t *iwork, magma_int_t liwork,
    magma_int_t *info)
    magma_int_t ione = 1;

    float d__1;

    float eps;
    magma_int_t inde;
    float anrm;
    float rmin, rmax;
    float sigma;
    magma_int_t iinfo, lwmin;
    magma_int_t lower;
    magma_int_t wantz;
    magma_int_t indwk2, llwrk2;
    magma_int_t iscale;
    float safmin;
    float bignum;
    magma_int_t indtau;
    magma_int_t indwrk, liwmin;
    magma_int_t llwork;
    float smlnum;
    magma_int_t lquery;

    magmaFloat_ptr dwork;
    // dwork is addressed with the same leading dimension as dA.
    magma_int_t lddc = ldda;

    wantz = (jobz == MagmaVec);
    lower = (uplo == MagmaLower);
    // Either length set to -1 turns the call into a workspace query.
    lquery = (lwork == -1 || liwork == -1);

    // A negative *info is the 1-based position of the first bad argument
    // (-1 jobz, -2 uplo, -3 n, -5 ldda).
    *info = 0;
    if (! (wantz || (jobz == MagmaNoVec))) {
        *info = -1;
    } else if (! (lower || (uplo == MagmaUpper))) {
        *info = -2;
    } else if (n < 0) {
        *info = -3;
    } else if (ldda < max(1,n)) {
        *info = -5;

    // Minimum workspace sizes; these mirror the formulas in the lwork/liwork
    // documentation above.
    magma_int_t nb = magma_get_ssytrd_nb( n );
    if ( n <= 1 ) {
        lwmin  = 1;
        liwmin = 1;
    else if ( wantz ) {
        lwmin  = max( 2*n + n*nb, 1 + 6*n + 2*n*n );
        liwmin = 3 + 5*n;
    else {
        lwmin  = 2*n + n*nb;
        liwmin = 1;
    work[0]  = magma_smake_lwork( lwmin );
    iwork[0] = liwmin;

    // -10 and -12 are the positions of lwork and liwork when the
    // COMPLEX-only parameters are not counted.
    if ((lwork < lwmin) && !lquery) {
        *info = -10;
    } else if ((liwork < liwmin) && ! lquery) {
        *info = -12;

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    else if (lquery) {
        return *info;

    magma_queue_t queue;
    magma_device_t cdev;
    magma_getdevice( &cdev );
    magma_queue_create( cdev, &queue );

    /* If matrix is very small, then just call LAPACK on CPU, no need for GPU */
    if (n <= 128) {
        magma_int_t lda = n;
        float *A;
        // NOTE(review): the result of magma_smalloc_cpu is not checked here.
        magma_smalloc_cpu( &A, lda*n );
        magma_sgetmatrix( n, n, dA, ldda, A, lda, queue );
        lapackf77_ssyevd( lapack_vec_const(jobz), lapack_uplo_const(uplo),
                          &n, A, &lda,
                          w, work, &lwork,
                          iwork, &liwork, info );
        magma_ssetmatrix( n, n, A, lda, dA, ldda, queue );
        magma_free_cpu( A );
        magma_queue_destroy( queue );
        return *info;

    // ssytrd2_gpu requires ldda*ceildiv(n,64) + 2*ldda*nb
    // sormtr_gpu  requires lddc*n
    // slansy      requires n
    magma_int_t ldwork = max( ldda*magma_ceildiv(n,64) + 2*ldda*nb, lddc*n );
    ldwork = max( ldwork, n );
    if ( wantz ) {
        // sstedx requires 3n^2/2
        ldwork = max( ldwork, 3*n*(n/2 + 1) );
    // NOTE(review): no magma_queue_destroy( queue ) is visible on this
    // failure path, although the queue was created above.
    if (MAGMA_SUCCESS != magma_smalloc( &dwork, ldwork )) {
        *info = MAGMA_ERR_DEVICE_ALLOC;
        return *info;

    /* Get machine constants. */
    safmin = lapackf77_slamch("Safe minimum");
    eps    = lapackf77_slamch("Precision");
    smlnum = safmin / eps;
    bignum = 1. / smlnum;
    rmin = magma_ssqrt( smlnum );
    rmax = magma_ssqrt( bignum );

    /* Scale matrix to allowable range, if necessary. */
    anrm = magmablas_slansy( MagmaMaxNorm, uplo, n, dA, ldda, dwork, ldwork, queue );
    iscale = 0;
    sigma  = 1;
    if (anrm > 0. && anrm < rmin) {
        iscale = 1;
        sigma = rmin / anrm;
    } else if (anrm > rmax) {
        iscale = 1;
        sigma = rmax / anrm;
    if (iscale == 1) {
        magmablas_slascl( uplo, 0, 0, 1., sigma, n, n, dA, ldda, queue, info );

    /* Call SSYTRD to reduce symmetric matrix to tridiagonal form. */
    // ssytrd work: e (n) + tau (n) + llwork (n*nb)  ==>  2n + n*nb
    // sstedx work: e (n) + tau (n) + z (n*n) + llwrk2 (1 + 4*n + n^2)  ==>  1 + 6n + 2n^2
    // Offsets into work[]: e at inde, tau at indtau, scratch / z at indwrk,
    // and the sstedx scratch at indwk2.
    inde   = 0;
    indtau = inde   + n;
    indwrk = indtau + n;
    indwk2 = indwrk + n*n;
    llwork = lwork - indwrk;
    llwrk2 = lwork - indwk2;

    magma_timer_t time=0;
    timer_start( time );

#ifdef FAST_SYMV
    magma_ssytrd2_gpu( uplo, n, dA, ldda, w, &work[inde],
                       &work[indtau], wA, ldwa, &work[indwrk], llwork,
                       dwork, ldwork, &iinfo );
    magma_ssytrd_gpu(  uplo, n, dA, ldda, w, &work[inde],
                       &work[indtau], wA, ldwa, &work[indwrk], llwork,
                       &iinfo );

    timer_stop( time );
    #ifdef FAST_SYMV
    timer_printf( "time ssytrd2 = %6.2f\n", time );
    timer_printf( "time ssytrd = %6.2f\n", time );

    /* For eigenvalues only, call SSTERF.  For eigenvectors, first call
       SSTEDC to generate the eigenvector matrix, WORK(INDWRK), of the
       tridiagonal matrix, then call SORMTR to multiply it to the Householder
       transformations represented as Householder vectors in A. */
    // NOTE(review): the eigenvector branch below calls magma_sstedx, not
    // SSTEDC as the comment above says.
    if (! wantz) {
        lapackf77_ssterf( &n, w, &work[inde], info );
    else {
        timer_start( time );

        magma_sstedx( MagmaRangeAll, n, 0., 0., 0, 0, w, &work[inde],
                      &work[indwrk], n, &work[indwk2],
                      llwrk2, iwork, liwork, dwork, info );

        timer_stop( time );
        timer_printf( "time sstedx = %6.2f\n", time );
        timer_start( time );

        // Upload the n-by-n matrix stored at work[indwrk] into dwork, apply
        // magma_sormtr_gpu to it, then copy dwork over dA.
        magma_ssetmatrix( n, n, &work[indwrk], n, dwork, lddc, queue );

        magma_sormtr_gpu( MagmaLeft, uplo, MagmaNoTrans, n, n, dA, ldda, &work[indtau],
                          dwork, lddc, wA, ldwa, &iinfo );

        magma_scopymatrix( n, n, dwork, lddc, dA, ldda, queue );

        timer_stop( time );
        timer_printf( "time sormtr + copy = %6.2f\n", time );

    /* If matrix was scaled, then rescale eigenvalues appropriately. */
    if (iscale == 1) {
        d__1 = 1. / sigma;
        blasf77_sscal( &n, &d__1, w, &ione );

    // Store the minimum workspace sizes again before returning.
    work[0]  = magma_smake_lwork( lwmin );
    iwork[0] = liwmin;

    magma_queue_destroy( queue );
    magma_free( dwork );

    return *info;
} /* magma_ssyevd_gpu */
Пример #19
/*
 * Driver of the EP benchmark (the banner printed below reads
 * "NAS Parallel Benchmarks (NPB3.3-SER-C) - EP Benchmark").
 *
 * For each of np batches it fills x[] with 2*NK uniform numbers via vranlc,
 * turns accepted pairs into Gaussian deviates, accumulates their sums in
 * sx/sy and tallies them in q[].  The sums are then compared against
 * per-M reference values and the results are printed.
 *
 * x, q, M, NN, NK, MK, NQ, A, S, EPSILON, CLASS and CS2..CS7 are not defined
 * in this block; they come from elsewhere in the original file.
 *
 * NOTE(review): brace-only lines are missing from this excerpt, so block
 * boundaries have to be inferred from indentation.
 */
int main() 
  double Mops, t1, t2, t3, t4, x1, x2;
  double sx, sy, tm, an, tt, gc;
  double sx_verify_value, sy_verify_value, sx_err, sy_err;
  int    np;
  int    i, ik, kk, l, k, nit;
  int    k_offset, j;
  logical verified, timers_enabled;

  double dum[3] = {1.0, 1.0, 1.0};
  char   size[16];

  FILE *fp;

  // Per-phase timers are enabled only when a file named "timer.flag" can be
  // opened.  NOTE(review): fp is not closed anywhere in the visible code.
  if ((fp = fopen("timer.flag", "r")) == NULL) {
    timers_enabled = false;
  } else {
    timers_enabled = true;

  //  Because the size of the problem is too large to store in a 32-bit
  //  integer for some classes, we put it into a string (for printing).
  //  Have to strip off the decimal point put in there by the floating
  //  point print statement (internal file)

  sprintf(size, "%15.0lf", pow(2.0, M+1));
  j = 14;
  if (size[j] == '.') j--;
  size[j+1] = '\0';
  printf("\n\n NAS Parallel Benchmarks (NPB3.3-SER-C) - EP Benchmark\n");
  printf("\n Number of random numbers generated: %15s\n", size);

  verified = false;

  //  Compute the number of "batches" of random number pairs generated 
  //  per processor. Adjust if the number of processors does not evenly 
  //  divide the total number

  np = NN; 

  //  Call the random number generator functions and initialize
  //  the x-array to reduce the effects of paging on the timings.
  //  Also, call all mathematical functions that are used. Make
  //  sure these initializations cannot be eliminated as dead code.

  vranlc(0, &dum[0], dum[1], &dum[2]);
  dum[0] = randlc(&dum[1], dum[2]);
  for (i = 0; i < 2 * NK; i++) {
    x[i] = -1.0e99;
  Mops = log(sqrt(fabs(MAX(1.0, 1.0))));   


  t1 = A;
  vranlc(0, &t1, A, x);

  //  Compute AN = A ^ (2 * NK) (mod 2^46).

  t1 = A;

  for (i = 0; i < MK + 1; i++) {
    t2 = randlc(&t1, t1);

  an = t1;
  tt = S;
  gc = 0.0;
  sx = 0.0;
  sy = 0.0;

  for (i = 0; i < NQ; i++) {
    q[i] = 0.0;

  //  Each instance of this loop may be performed independently. We compute
  //  the k offsets separately to take into account the fact that some nodes
  //  have more numbers to generate than others

  k_offset = -1;

  for (k = 1; k <= np; k++) {
    kk = k_offset + k; 
    t1 = S;
    t2 = an;

    // Find starting seed t1 for this kk.

    for (i = 1; i <= 100; i++) {
      ik = kk / 2;
      if ((2 * ik) != kk) t3 = randlc(&t1, t2);
      if (ik == 0) break;
      t3 = randlc(&t2, t2);
      kk = ik;

    //  Compute uniform pseudorandom numbers.
    if (timers_enabled) timer_start(2);
    vranlc(2 * NK, &t1, A, x);
    if (timers_enabled) timer_stop(2);

    //  Compute Gaussian deviates by acceptance-rejection method and
    //  tally counts in concentric square annuli.  This loop is not
    //  vectorizable.
    if (timers_enabled) timer_start(1);

    for (i = 0; i < NK; i++) {
      // Map the pair from x[] to (x1, x2) = 2*x - 1 and keep it only when
      // x1^2 + x2^2 <= 1.
      x1 = 2.0 * x[2*i] - 1.0;
      x2 = 2.0 * x[2*i+1] - 1.0;
      t1 = x1 * x1 + x2 * x2;
      if (t1 <= 1.0) {
        t2   = sqrt(-2.0 * log(t1) / t1);
        t3   = (x1 * t2);
        t4   = (x2 * t2);
        // The double result is converted to int on assignment; l is the
        // index of the q[] counter to bump.
        l    = MAX(fabs(t3), fabs(t4));
        q[l] = q[l] + 1.0;
        sx   = sx + t3;
        sy   = sy + t4;

    if (timers_enabled) timer_stop(1);

  // gc is the total number of accepted pairs (sum of all q[] counters).
  for (i = 0; i < NQ; i++) {
    gc = gc + q[i];

  // NOTE(review): no timer_start(0)/timer_stop(0) call is visible in this
  // excerpt, so what timer 0 holds here cannot be confirmed from this block.
  tm = timer_read(0);

  nit = 0;
  verified = true;
  // Reference sums for the supported values of M; any other M is reported
  // as unverified.
  if (M == 24) {
    sx_verify_value = -3.247834652034740e+3;
    sy_verify_value = -6.958407078382297e+3;
  } else if (M == 25) {
    sx_verify_value = -2.863319731645753e+3;
    sy_verify_value = -6.320053679109499e+3;
  } else if (M == 28) {
    sx_verify_value = -4.295875165629892e+3;
    sy_verify_value = -1.580732573678431e+4;
  } else if (M == 30) {
    sx_verify_value =  4.033815542441498e+4;
    sy_verify_value = -2.660669192809235e+4;
  } else if (M == 32) {
    sx_verify_value =  4.764367927995374e+4;
    sy_verify_value = -8.084072988043731e+4;
  } else if (M == 36) {
    sx_verify_value =  1.982481200946593e+5;
    sy_verify_value = -1.020596636361769e+5;
  } else if (M == 40) {
    sx_verify_value = -5.319717441530e+05;
    sy_verify_value = -3.688834557731e+05;
  } else {
    verified = false;

  // Verification passes when both relative errors are within EPSILON.
  if (verified) {
    sx_err = fabs((sx - sx_verify_value) / sx_verify_value);
    sy_err = fabs((sy - sy_verify_value) / sy_verify_value);
    verified = ((sx_err <= EPSILON) && (sy_err <= EPSILON));

  // NOTE(review): this divides by tm before the tm <= 0.0 guard that appears
  // further down (and that guard only runs when timers are enabled).
  Mops = pow(2.0, M+1) / tm / 1000000.0;

  printf("\nEP Benchmark Results:\n\n");
  printf("CPU Time =%10.4lf\n", tm);
  printf("N = 2^%5d\n", M);
  printf("No. Gaussian Pairs = %15.0lf\n", gc);
  printf("Sums = %25.15lE %25.15lE\n", sx, sy);
  printf("Counts: \n");
  for (i = 0; i < NQ; i++) {
    printf("%3d%15.0lf\n", i, q[i]);

  print_results("EP", CLASS, M+1, 0, 0, nit,
      tm, Mops, 
      "Random numbers generated",
      CS2, CS3, CS4, CS5, CS6, CS7);

  // Optional breakdown: each timer is printed with its share of tm.
  if (timers_enabled) {
    if (tm <= 0.0) tm = 1.0;
    tt = timer_read(0);
    printf("\nTotal time:     %9.3lf (%6.2lf)\n", tt, tt*100.0/tm);
    tt = timer_read(1);
    printf("Gaussian pairs: %9.3lf (%6.2lf)\n", tt, tt*100.0/tm);
    tt = timer_read(2);
    printf("Random numbers: %9.3lf (%6.2lf)\n", tt, tt*100.0/tm);

  return 0;
Пример #20
/*
 * Matrix-vector multiplication demo: builds an n-by-n matrix and a vector,
 * transposes the matrix, multiplies via MatVecMultiply, prints a one-line
 * summary, and depending on `trace` either dumps the data (trace == 1) or
 * checks the result against multiply_CPU (trace == 2).
 *
 * n, range and trace are not declared in this block; presumably they are
 * file-level settings that parse_args may change - confirm against the rest
 * of the file.
 *
 * NOTE(review): brace-only lines are missing from this excerpt, so block
 * boundaries have to be inferred from indentation.
 */
int main(int argc, char * argv[])
	if (argc > 1){
		parse_args(argc, argv);

	const int size = n * n;

	// NOTE(review): none of these allocations is checked for NULL, and no
	// matching free() is visible in this excerpt.
	float *mat = malloc(size * sizeof(float));
	float *vec = malloc(n * sizeof(float));
	float *output = malloc(n * sizeof(float));
	float *expected = malloc(n * sizeof(float));
	float *mat_transposed = malloc(n * n * sizeof(float));

	generate_matrix(n, mat, range);
	generate_vector(n, vec, range);

	// NOTE(review): no visible statement writes to timer1 before its fields
	// are printed below; the timing calls may be among the missing lines.
	timing_t timer1;

	transpose(n, mat, mat_transposed);
	MatVecMultiply(size, n, mat_transposed, vec, output);

	float sum = sum_vec(n, output);

	// Summary line: n, sum of the output vector, then the two timer fields.
	printf("%d %f %ld %ld\n", n, sum, timer1.realtime, timer1.cputime);

	if (trace == 1) {

		printf("\nInput matrix\n");

		// mat is indexed row-major: element (i, j) is mat[i*n+j].
		for (int i=0; i<n; i++){
			for (int j=0; j<n; j++){
				printf("%f " , mat[i*n+j]);
		printf("\nInput vector \n");

		for (int i=0; i<n; i++){
			printf("%f " , vec[i]);

		for (int i=0; i<n; i++){
			printf("%f " , output[i]);
	else if (trace == 2) {
		// Compare against the multiply_CPU result; the check() status is
		// also used as the process exit code.
		multiply_CPU(n, mat, vec, expected);
		int status = check(n, output, expected);
		// NOTE(review): an `else` between the two printf calls appears to
		// have been lost; as written both lines follow the `if`.
		if (status)
			printf("Test failed.\n");
			printf("Test passed OK!\n");
		return status;


	return 0;
Пример #21
    DSYEVDX computes selected eigenvalues and, optionally, eigenvectors
    of a real symmetric matrix A. Eigenvalues and eigenvectors can
    be selected by specifying either a range of values or a range of
    indices for the desired eigenvalues.
    If eigenvectors are desired, it uses a divide and conquer algorithm.

    The divide and conquer algorithm makes very mild assumptions about
    floating point arithmetic. It will work on machines with a guard
    digit in add/subtract, or on those binary machines without guard
    digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
    Cray-2. It could conceivably fail on hexadecimal or decimal machines
    without guard digits, but we know of none.

    jobz    magma_vec_t
      -     = MagmaNoVec:  Compute eigenvalues only;
      -     = MagmaVec:    Compute eigenvalues and eigenvectors.

    range   magma_range_t
      -     = MagmaRangeAll: all eigenvalues will be found.
      -     = MagmaRangeV:   all eigenvalues in the half-open interval (VL,VU]
                   will be found.
      -     = MagmaRangeI:   the IL-th through IU-th eigenvalues will be found.

    uplo    magma_uplo_t
      -     = MagmaUpper:  Upper triangle of A is stored;
      -     = MagmaLower:  Lower triangle of A is stored.

    n       INTEGER
            The order of the matrix A.  N >= 0.

    dA      DOUBLE_PRECISION array on the GPU,
            dimension (LDDA, N).
            On entry, the symmetric matrix A.  If UPLO = MagmaUpper, the
            leading N-by-N upper triangular part of A contains the
            upper triangular part of the matrix A.  If UPLO = MagmaLower,
            the leading N-by-N lower triangular part of A contains
            the lower triangular part of the matrix A.
            On exit, if JOBZ = MagmaVec, then if INFO = 0, the first m columns
            of A contains the required
            orthonormal eigenvectors of the matrix A.
            If JOBZ = MagmaNoVec, then on exit the lower triangle (if UPLO=MagmaLower)
            or the upper triangle (if UPLO=MagmaUpper) of A, including the
            diagonal, is destroyed.

    ldda    INTEGER
            The leading dimension of the array DA.  LDDA >= max(1,N).

    vl      DOUBLE PRECISION
    vu      DOUBLE PRECISION
            If RANGE=MagmaRangeV, the lower and upper bounds of the interval to
            be searched for eigenvalues. VL < VU.
            Not referenced if RANGE = MagmaRangeAll or MagmaRangeI.

    il      INTEGER
    iu      INTEGER
            If RANGE=MagmaRangeI, the indices (in ascending order) of the
            smallest and largest eigenvalues to be returned.
            1 <= IL <= IU <= N, if N > 0; IL = 1 and IU = 0 if N = 0.
            Not referenced if RANGE = MagmaRangeAll or MagmaRangeV.

    m       INTEGER
            The total number of eigenvalues found.  0 <= M <= N.
            If RANGE = MagmaRangeAll, M = N, and if RANGE = MagmaRangeI, M = IU-IL+1.

    w       DOUBLE PRECISION array, dimension (N)
            If INFO = 0, the required m eigenvalues in ascending order.

    wA      (workspace) DOUBLE PRECISION array, dimension (LDWA, N)

    ldwa    INTEGER
            The leading dimension of the array wA.  LDWA >= max(1,N).

    work    (workspace) DOUBLE_PRECISION array, dimension (MAX(1,LWORK))
            On exit, if INFO = 0, WORK[0] returns the optimal LWORK.

    lwork   INTEGER
            The length of the array WORK.
            If N <= 1,                      LWORK >= 1.
            If JOBZ = MagmaNoVec and N > 1, LWORK >= 2*N + N*NB.
            If JOBZ = MagmaVec   and N > 1, LWORK >= max( 2*N + N*NB, 1 + 6*N + 2*N**2 ).
            NB can be obtained through magma_get_dsytrd_nb(N).
            If LWORK = -1, then a workspace query is assumed; the routine
            only calculates the optimal sizes of the WORK and IWORK
            arrays, returns these values as the first entries of the WORK
            and IWORK arrays, and no error message related to LWORK or
            LIWORK is issued by XERBLA.

    iwork   (workspace) INTEGER array, dimension (MAX(1,LIWORK))
            On exit, if INFO = 0, IWORK[0] returns the optimal LIWORK.

    liwork  INTEGER
            The dimension of the array IWORK.
            If N <= 1,                      LIWORK >= 1.
            If JOBZ = MagmaNoVec and N > 1, LIWORK >= 1.
            If JOBZ = MagmaVec   and N > 1, LIWORK >= 3 + 5*N.
            If LIWORK = -1, then a workspace query is assumed; the
            routine only calculates the optimal sizes of the WORK and
            IWORK arrays, returns these values as the first entries of
            the WORK and IWORK arrays, and no error message related to
            LWORK or LIWORK is issued by XERBLA.

    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value
      -     > 0:  if INFO = i and JOBZ = MagmaNoVec, then the algorithm failed
                  to converge; i off-diagonal elements of an intermediate
                  tridiagonal form did not converge to zero;
                  if INFO = i and JOBZ = MagmaVec, then the algorithm failed
                  to compute an eigenvalue while working on the submatrix
                  lying in rows and columns INFO/(N+1) through mod(INFO,N+1).

    Further Details
    Based on contributions by
       Jeff Rutter, Computer Science Division, University of California
       at Berkeley, USA

    Modified description of INFO. Sven, 16 Feb 05.

    @ingroup magma_dsyev_driver
/*
 * Double-precision symmetric eigensolver for a selected range of
 * eigenvalues, operating on a matrix passed as dA (see the parameter
 * documentation above).
 *
 * Flow, as visible in this excerpt:
 *   1. Validate the arguments and compute the minimum work/iwork sizes.
 *   2. Store those sizes in work[0]/iwork[0]; return early on an invalid
 *      argument or on a workspace query (lwork == -1 or liwork == -1).
 *   3. For n <= 128, copy dA to a host buffer and call lapackf77_dsyevd.
 *   4. Otherwise scale dA when its max-norm is below rmin or above rmax,
 *      reduce it to tridiagonal form, then call lapackf77_dsterf
 *      (eigenvalues only) or magma_dstedx + magma_dormtr_gpu (eigenvectors),
 *      using magma_dmove_eig with the requested range in both cases.
 *   5. Undo the scaling on w, store the minimum sizes again, return *info.
 *
 * NOTE(review): brace-only lines, the #else/#endif lines that should pair
 * with the #ifdef lines below, and the tail of the magma_dsytrd_gpu call are
 * not present in this excerpt; restore them from the upstream source before
 * attempting to compile.
 */
extern "C" magma_int_t
magma_dsyevdx_gpu(magma_vec_t jobz, magma_range_t range, magma_uplo_t uplo,
                  magma_int_t n,
                  double *dA, magma_int_t ldda,
                  double vl, double vu, magma_int_t il, magma_int_t iu,
                  magma_int_t *m, double *w,
                  double *wA,  magma_int_t ldwa,
                  double *work, magma_int_t lwork,
                  magma_int_t *iwork, magma_int_t liwork,
                  magma_int_t *info)
    magma_int_t ione = 1;

    double d__1;

    double eps;
    magma_int_t inde;
    double anrm;
    double rmin, rmax;
    double sigma;
    magma_int_t iinfo, lwmin;
    magma_int_t lower;
    magma_int_t wantz;
    magma_int_t indwk2, llwrk2;
    magma_int_t iscale;
    double safmin;
    double bignum;
    magma_int_t indtau;
    magma_int_t indwrk, liwmin;
    magma_int_t llwork;
    double smlnum;
    magma_int_t lquery;
    magma_int_t alleig, valeig, indeig;

    double *dwork;
    // dwork is addressed with the same leading dimension as dA.
    magma_int_t lddc = ldda;

    wantz = (jobz == MagmaVec);
    lower = (uplo == MagmaLower);

    alleig = (range == MagmaRangeAll);
    valeig = (range == MagmaRangeV);
    indeig = (range == MagmaRangeI);

    // Either length set to -1 turns the call into a workspace query.
    lquery = (lwork == -1 || liwork == -1);

    // A negative *info is the 1-based position of the first bad argument in
    // the parameter list above (e.g. -6 ldda, -8 vu, -9 il, -10 iu, -14 ldwa).
    *info = 0;
    if (! (wantz || (jobz == MagmaNoVec))) {
        *info = -1;
    } else if (! (alleig || valeig || indeig)) {
        *info = -2;
    } else if (! (lower || (uplo == MagmaUpper))) {
        *info = -3;
    } else if (n < 0) {
        *info = -4;
    } else if (ldda < max(1,n)) {
        *info = -6;
    } else if (ldwa < max(1,n)) {
        *info = -14;
    } else {
        if (valeig) {
            if (n > 0 && vu <= vl) {
                *info = -8;
        } else if (indeig) {
            if (il < 1 || il > max(1,n)) {
                *info = -9;
            } else if (iu < min(n,il) || iu > n) {
                *info = -10;

    // Minimum workspace sizes; these mirror the formulas in the lwork/liwork
    // documentation above.
    magma_int_t nb = magma_get_dsytrd_nb( n );
    if ( n <= 1 ) {
        lwmin  = 1;
        liwmin = 1;
    else if ( wantz ) {
        lwmin  = max( 2*n + n*nb, 1 + 6*n + 2*n*n );
        liwmin = 3 + 5*n;
    else {
        lwmin  = 2*n + n*nb;
        liwmin = 1;
    // multiply by 1+eps (in Double!) to ensure length gets rounded up,
    // if it cannot be exactly represented in floating point.
    real_Double_t one_eps = 1. + lapackf77_dlamch("Epsilon");
    work[0]  = lwmin * one_eps;
    iwork[0] = liwmin;

    // -16 and -18 are the positions of lwork and liwork.
    if ((lwork < lwmin) && !lquery) {
        *info = -16;
    } else if ((liwork < liwmin) && ! lquery) {
        *info = -18;

    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    else if (lquery) {
        return *info;

    /* Check if matrix is very small then just call LAPACK on CPU, no need for GPU */
    // NOTE(review): on this path, in the visible code, (a) range/vl/vu/il/iu
    // are not consulted and *m is never assigned, (b) the result of
    // magma_dmalloc_cpu is not checked, and (c) A is not released with
    // magma_free_cpu.
    if (n <= 128) {
        #ifdef ENABLE_DEBUG
        printf("  warning matrix too small N=%d NB=%d, calling lapack on CPU  \n", (int) n, (int) nb);
        const char* jobz_ = lapack_vec_const( jobz );
        const char* uplo_ = lapack_uplo_const( uplo );
        double *A;
        magma_dmalloc_cpu( &A, n*n );
        magma_dgetmatrix(n, n, dA, ldda, A, n);
        lapackf77_dsyevd(jobz_, uplo_,
                         &n, A, &n,
                         w, work, &lwork,
                         iwork, &liwork, info);
        magma_dsetmatrix( n, n, A, n, dA, ldda);
        return *info;

    magma_queue_t stream;
    magma_queue_create( &stream );

    // n*lddc for dsytrd2_gpu
    // n for dlansy
    magma_int_t ldwork = n*lddc;
    if ( wantz ) {
        // need 3n^2/2 for dstedx
        ldwork = max( ldwork, 3*n*(n/2 + 1));
    // NOTE(review): no magma_queue_destroy( stream ) is visible on this
    // failure path, although the queue was created above.
    if (MAGMA_SUCCESS != magma_dmalloc( &dwork, ldwork )) {
        *info = MAGMA_ERR_DEVICE_ALLOC;
        return *info;

    /* Get machine constants. */
    safmin = lapackf77_dlamch("Safe minimum");
    eps    = lapackf77_dlamch("Precision");
    smlnum = safmin / eps;
    bignum = 1. / smlnum;
    rmin = magma_dsqrt(smlnum);
    rmax = magma_dsqrt(bignum);

    /* Scale matrix to allowable range, if necessary. */
    anrm = magmablas_dlansy(MagmaMaxNorm, uplo, n, dA, ldda, dwork);
    iscale = 0;
    sigma  = 1;
    if (anrm > 0. && anrm < rmin) {
        iscale = 1;
        sigma = rmin / anrm;
    } else if (anrm > rmax) {
        iscale = 1;
        sigma = rmax / anrm;
    if (iscale == 1) {
        magmablas_dlascl(uplo, 0, 0, 1., sigma, n, n, dA, ldda, info);

    /* Call DSYTRD to reduce symmetric matrix to tridiagonal form. */
    // dsytrd work: e (n) + tau (n) + llwork (n*nb)  ==>  2n + n*nb
    // dstedx work: e (n) + tau (n) + z (n*n) + llwrk2 (1 + 4*n + n^2)  ==>  1 + 6n + 2n^2
    // Offsets into work[]: e at inde, tau at indtau, scratch / z at indwrk,
    // and the dstedx scratch at indwk2.
    inde   = 0;
    indtau = inde   + n;
    indwrk = indtau + n;
    indwk2 = indwrk + n*n;
    llwork = lwork - indwrk;
    llwrk2 = lwork - indwk2;

    magma_timer_t time=0;
    timer_start( time );

#ifdef FAST_SYMV
    magma_dsytrd2_gpu(uplo, n, dA, ldda, w, &work[inde],
                      &work[indtau], wA, ldwa, &work[indwrk], llwork,
                      dwork, n*lddc, &iinfo);
    magma_dsytrd_gpu(uplo, n, dA, ldda, w, &work[inde],
                     &work[indtau], wA, ldwa, &work[indwrk], llwork,

    timer_stop( time );
    timer_printf( "time dsytrd = %6.2f\n", time );

    /* For eigenvalues only, call DSTERF.  For eigenvectors, first call
       DSTEDC to generate the eigenvector matrix, WORK(INDWRK), of the
       tridiagonal matrix, then call DORMTR to multiply it to the Householder
       transformations represented as Householder vectors in A. */
    // NOTE(review): the eigenvector branch below calls magma_dstedx, not
    // DSTEDC as the comment above says.

    if (! wantz) {
        lapackf77_dsterf(&n, w, &work[inde], info);

        magma_dmove_eig(range, n, w, &il, &iu, vl, vu, m);
    else {
        timer_start( time );

        magma_dstedx(range, n, vl, vu, il, iu, w, &work[inde],
                     &work[indwrk], n, &work[indwk2],
                     llwrk2, iwork, liwork, dwork, info);

        timer_stop( time );
        timer_printf( "time dstedx = %6.2f\n", time );
        timer_start( time );

        magma_dmove_eig(range, n, w, &il, &iu, vl, vu, m);

        // Upload *m columns of the matrix stored at work[indwrk], starting
        // at column il-1, into dwork; apply magma_dormtr_gpu to them; then
        // copy the n-by-*m result over dA.
        magma_dsetmatrix( n, *m, &work[indwrk + n* (il-1) ], n, dwork, lddc );

        magma_dormtr_gpu(MagmaLeft, uplo, MagmaNoTrans, n, *m, dA, ldda, &work[indtau],
                         dwork, lddc, wA, ldwa, &iinfo);

        magma_dcopymatrix( n, *m, dwork, lddc, dA, ldda );

        timer_stop( time );
        timer_printf( "time dormtr + copy = %6.2f\n", time );

    /* If matrix was scaled, then rescale eigenvalues appropriately. */
    if (iscale == 1) {
        d__1 = 1. / sigma;
        blasf77_dscal(&n, &d__1, w, &ione);

    work[0]  = lwmin * one_eps;  // round up
    iwork[0] = liwmin;

    magma_queue_destroy( stream );
    magma_free( dwork );

    return *info;
} /* magma_dsyevdx_gpu */
Пример #22
    SLAEX0 computes all eigenvalues and the chosen eigenvectors of a
    symmetric tridiagonal matrix using the divide and conquer method.

    n       INTEGER
            The dimension of the symmetric tridiagonal matrix.  N >= 0.
    d       REAL array, dimension (N)
            On entry, the main diagonal of the tridiagonal matrix.
            On exit, its eigenvalues.
    e       REAL array, dimension (N-1)
            The off-diagonal elements of the tridiagonal matrix.
            On exit, E has been destroyed.
    Q       REAL array, dimension (LDQ, N)
            On entry, Q will be the identity matrix.
            On exit, Q contains the eigenvectors of the
            tridiagonal matrix.
    ldq     INTEGER
            The leading dimension of the array Q.  If eigenvectors are
            desired, then  LDQ >= max(1,N).  In any case,  LDQ >= 1.
    work    (workspace) REAL array,
            the dimension of WORK >= 4*N + N**2.
    iwork   (workspace) INTEGER array,
            the dimension of IWORK >= 3 + 5*N.
    dwork   (workspace) REAL array, dimension (3*N*N/2+3*N)
    range   magma_range_t
      -     = MagmaRangeAll: all eigenvalues will be found.
      -     = MagmaRangeV:   all eigenvalues in the half-open interval (VL,VU]
                             will be found.
      -     = MagmaRangeI:   the IL-th through IU-th eigenvalues will be found.
    vl      REAL
    vu      REAL
            If RANGE=MagmaRangeV, the lower and upper bounds of the interval to
            be searched for eigenvalues. VL < VU.
            Not referenced if RANGE = MagmaRangeAll or MagmaRangeI.
    il      INTEGER
    iu      INTEGER
            If RANGE=MagmaRangeI, the indices (in ascending order) of the
            smallest and largest eigenvalues to be returned.
            1 <= IL <= IU <= N, if N > 0; IL = 1 and IU = 0 if N = 0.
            Not referenced if RANGE = MagmaRangeAll or MagmaRangeV.
    info    INTEGER
      -     = 0:  successful exit.
      -     < 0:  if INFO = -i, the i-th argument had an illegal value.
      -     > 0:  The algorithm failed to compute an eigenvalue while
                  working on the submatrix lying in rows and columns
                  INFO/(N+1) through mod(INFO,N+1).

    Further Details
    Based on contributions by
       Jeff Rutter, Computer Science Division, University of California
       at Berkeley, USA

    @ingroup magma_ssyev_aux
/*
 * Divide-and-conquer driver for the symmetric tridiagonal eigenproblem
 * documented above (the closing comment names it magma_slaex0).
 *
 * Flow, as visible in this excerpt:
 *   1. Validate n and ldq (info -1 / -5) and return immediately for n == 0.
 *   2. Repeatedly halve the problem until the subproblem sizes are at most
 *      smlsiz, recording the split points in the leading entries of iwork.
 *   3. Apply a correction to d at every split point.
 *   4. Solve each leaf subproblem with lapackf77_ssteqr.
 *   5. Merge neighbouring subproblems pairwise with magma_slaex1 until one
 *      problem remains.
 *   6. Reorder the eigenvalues and eigenvector columns through the index
 *      array stored at iwork[indxq ...].
 *
 * NOTE(review): the line carrying the function name and the brace-only
 * lines are not present in this excerpt.  tlvls, k and curlvl are
 * initialised but never advanced in the visible lines; their increment
 * statements may be among the missing lines.
 */
extern "C" magma_int_t
    magma_int_t n,
    float *d, float *e,
    float *Q, magma_int_t ldq,
    float *work, magma_int_t *iwork,
    magmaFloat_ptr dwork,
    magma_range_t range, float vl, float vu,
    magma_int_t il, magma_int_t iu,
    magma_int_t *info)
// Address of element (i_, j_) of Q with leading dimension ldq.
#define Q(i_,j_) (Q + (i_) + (j_)*ldq)

    magma_int_t ione = 1;
    magma_range_t range2;
    magma_int_t curlvl, i, indxq;
    magma_int_t j, k, matsiz, msd2, smlsiz;
    magma_int_t submat, subpbs, tlvls;

    // Test the input parameters.
    *info = 0;

    if ( n < 0 )
        *info = -1;
    else if ( ldq < max(1, n) )
        *info = -5;
    if ( *info != 0 ) {
        magma_xerbla( __func__, -(*info) );
        return *info;

    // Quick return if possible
    if (n == 0)
        return *info;

    smlsiz = magma_get_smlsize_divideconquer();

    // Determine the size and placement of the submatrices, and save in
    // the leading elements of IWORK.
    iwork[0] = n;
    subpbs= 1;
    tlvls = 0;
    while (iwork[subpbs - 1] > smlsiz) {
        // Split every current size into its two halves, the larger half
        // stored second.
        for (j = subpbs; j > 0; --j) {
            iwork[2*j - 1] = (iwork[j-1]+1)/2;
            iwork[2*j - 2] = iwork[j-1]/2;
        subpbs *= 2;
    // Turn the sizes into running totals, so iwork[j] is the index one past
    // the end of submatrix j.
    for (j=1; j < subpbs; ++j)
        iwork[j] += iwork[j-1];

    // Divide the matrix into SUBPBS submatrices of size at most SMLSIZ+1
    // using rank-1 modifications (cuts).
    for (i=0; i < subpbs-1; ++i) {
        submat = iwork[i];
        d[submat-1] -= MAGMA_S_ABS(e[submat-1]);
        d[submat] -= MAGMA_S_ABS(e[submat-1]);

    // Offset into iwork where the index array used by the merge steps and
    // by the final reordering starts.
    indxq = 4*n + 3;

    // Solve each submatrix eigenproblem at the bottom of the divide and
    // conquer tree.
    magma_timer_t time=0;
    timer_start( time );

    for (i = 0; i < subpbs; ++i) {
        if (i == 0) {
            submat = 0;
            matsiz = iwork[0];
        } else {
            submat = iwork[i-1];
            matsiz = iwork[i] - iwork[i-1];
        lapackf77_ssteqr("I", &matsiz, &d[submat], &e[submat],
                         Q(submat, submat), &ldq, work, info);  // change to edc?
        // On failure *info is re-encoded so that INFO/(N+1) and
        // mod(INFO,N+1) give the first and last row/column of the failing
        // submatrix, as described in the documentation above.
        if (*info != 0) {
            printf("info: %d\n, submat: %d\n", (int) *info, (int) submat);
            *info = (submat+1)*(n+1) + submat + matsiz;
            printf("info: %d\n", (int) *info);
            return *info;
        k = 1;
        for (j = submat; j < iwork[i]; ++j) {
            iwork[indxq+j] = k;

    timer_stop( time );
    timer_printf( "  for: ssteqr = %6.2f\n", time );
    // Successively merge eigensystems of adjacent submatrices
    // into eigensystem for the corresponding larger matrix.
    curlvl = 1;
    while (subpbs > 1) {
        timer_start( time );
        for (i=0; i < subpbs-1; i += 2) {
            if (i == 0) {
                submat = 0;
                matsiz = iwork[1];
                msd2 = iwork[0];
            } else {
                submat = iwork[i-1];
                matsiz = iwork[i+1] - iwork[i-1];
                msd2 = matsiz / 2;

            // Merge lower order eigensystems (of size MSD2 and MATSIZ - MSD2)
            // into an eigensystem of size MATSIZ.
            // SLAEX1 is used only for the full eigensystem of a tridiagonal
            // matrix.
            // NOTE(review): an `else` between the two range2 assignments
            // appears to have been lost from this excerpt.
            if (matsiz == n)
                range2 = range;
                // We need all the eigenvectors if it is not last step
                range2 = MagmaRangeAll;

            magma_slaex1(matsiz, &d[submat], Q(submat, submat), ldq,
                         &iwork[indxq+submat], e[submat+msd2-1], msd2,
                         work, &iwork[subpbs], dwork,
                         range2, vl, vu, il, iu, info);

            if (*info != 0) {
                *info = (submat+1)*(n+1) + submat + matsiz;
                return *info;
            // Keep the end index of the merged pair as entry i/2 for the
            // next level.
            iwork[i/2]= iwork[i+1];
        subpbs /= 2;
        timer_stop( time );
        timer_printf("%d: time: %6.2f\n", (int) curlvl, time );

    // Re-merge the eigenvalues/vectors which were deflated at the final
    // merge step.
    // iwork[indxq+i] holds a 1-based source index: eigenvalue d[j] goes to
    // work[i] and column j of Q goes to the i-th n-vector stored after the
    // first n entries of work.
    for (i = 0; i < n; ++i) {
        j = iwork[indxq+i] - 1;
        work[i] = d[j];
        blasf77_scopy(&n, Q(0, j), &ione, &work[ n*(i+1) ], &ione);
    // Copy the reordered eigenvalues back into d and the reordered n-by-n
    // block back into Q.
    blasf77_scopy(&n, work, &ione, d, &ione);
    lapackf77_slacpy( "A", &n, &n, &work[n], &n, Q, &ldq );

    return *info;
} /* magma_slaex0 */
Пример #23
// run_martet: The game loop. Returns final score.
//
// screen and board are the SDL surfaces handed to the status bar, menu,
// pause and drawing helpers called below.
// NOTE(review): this excerpt lacks closing braces and a few statement bodies
// (e.g. after the KEYEVENT_EXIT test); the comments cover only visible lines.
int run_martet(SDL_Surface* screen, SDL_Surface* board) {
    Tetromino* active_tetromino;
    Tetromino* next_tetromino;
    // Create ingame menu
    struct Menu* ingame_menu = menu_create();
    menu_addelement(ingame_menu, "Continue");
    menu_addelement(ingame_menu, "Quit Current Game");
    // The first element added ("Continue") starts out as the active one.
    ingame_menu->active_element = &ingame_menu->elements[0];
    ingame_menu->active_element->active = 1;
    int score = 0;
    // Seed rand() from the current time.
    srand((unsigned) time(NULL));
    // The status bar is given the upcoming tetromino and the current score.
    next_tetromino   = tetcreaterand();
    update_status_bar(next_tetromino, screen, score);
    active_tetromino = tetcreaterand();
    active_tetromino->position[0] = 4;
    // The timer's alarm interval starts at GAME_SPEED.
    struct Timer* timer = create_timer();
    timer_change_alarm_interval(timer, GAME_SPEED);
    bool running = true;
    while (running){
        int event = process_key_events(active_tetromino, tetaction);
        // NOTE(review): the statement for KEYEVENT_EXIT is not visible here.
        if (event == KEYEVENT_EXIT)
        else if (event == KEYEVENT_MENU) {
            // A return value of 1 from the in-game menu ends the loop.
            if (ingame_menu_martet(screen, board, ingame_menu) == 1)
                running = false;
        else if (event == KEYEVENT_PAUSE)
            pause_martet(screen, board);
        // Everything in this branch runs only when timer_update() is true.
        if (timer_update(timer)) {
            // If collision and tetromino not deleted
            if ( tetmove('d', active_tetromino) == 0
                && active_tetromino->color != TETROMINO_DELETE) {
                // Mark the tetromino; the else-branch below handles it on a
                // later pass through this timer branch.
                active_tetromino->color = TETROMINO_DELETE;
            else if ( active_tetromino->color == TETROMINO_DELETE ) {
                // The upcoming tetromino becomes active and a new one is drawn.
                active_tetromino = next_tetromino;
                next_tetromino   = tetcreaterand();
                // NOTE(review): the call that receives the expression below
                // is missing from this excerpt.
                if (check_rows(&score)) { // If score has increased.
                                                GAME_SPEED /
                                                sqrt( (double) score/5) + 1);
            if ( next_tetromino == NULL ) // If game over
            update_status_bar(next_tetromino, screen, score);
        // Redraw: clear the board, draw the active piece and its ghost, then
        // draw the board onto the screen at (0, 0).
        clear_surface(board, NULL);
        draw_tetromino(board, active_tetromino);
        draw_ghost_tetromino(board, active_tetromino);
        draw_surface(0, 0, board, screen, NULL);
    // NOTE(review): the body of this test is not visible in this excerpt.
    if (active_tetromino) {
    return score;
Пример #24
/*
 * Exercise timer device number `num`.
 *
 * Returns 0 when timer_init() fails, when `set` is still 0 after the channel
 * loop, or when a recorded callback argument differs from the expected value;
 * returns 1 otherwise.
 *
 * NOTE(review): braces and several statements (for example whatever updates
 * `set`, and the calls between the "stopped"/"starting" messages) are not
 * visible in this excerpt.
 */
static int test_timer(unsigned num)
    int set = 0;

    /* reset state */
    sw_count = 0;
    fired = 0;
    for (unsigned i = 0; i < MAX_CHANNELS; i++) {
        timeouts[i] = 0;
        args[i] = UINT_MAX;

    /* initialize and halt timer */
    /* COOKIE * num is handed to timer_init together with cb; the result
     * check further down compares args[] against values derived from it. */
    if (timer_init(TIMER_DEV(num), TIM_SPEED, cb, (void *)(COOKIE * num)) < 0) {
        printf("TIMER_%u: ERROR on initialization - skipping\n\n", num);
        return 0;
    else {
        printf("TIMER_%u: initialization successful\n", num);
    printf("TIMER_%u: stopped\n", num);
    /* set each available channel */
    for (unsigned i = 0; i < MAX_CHANNELS; i++) {
        /* channel i gets the timeout (i + 1) * CHAN_OFFSET */
        unsigned timeout = ((i + 1) * CHAN_OFFSET);
        if (timer_set(TIMER_DEV(num), i, timeout) < 0) {
        else {
            printf("TIMER_%u: set channel %u to %u\n", num, i, timeout);
    if (set == 0) {
        printf("TIMER_%u: ERROR setting any channel\n\n", num);
        return 0;
    /* start the timer */
    printf("TIMER_%u: starting\n", num);
    /* wait for all channels to fire */
    /* loops until `fired` equals `set`; the loop body is not visible here */
    do {
    } while (fired != set);
    /* collect results */
    for (int i = 0; i < fired; i++) {
        /* entry i is expected to hold (COOKIE * num) + i */
        if (args[i] != ((COOKIE * num) + i)) {
            printf("TIMER_%u: ERROR callback argument mismatch\n\n", num);
            return 0;
        printf("TIMER_%u: channel %i fired at SW count %8u",
               num, i, (unsigned)timeouts[i]);
        /* first entry prints its own value, later ones the difference to the
         * previous entry */
        if (i == 0) {
            printf(" - init: %8u\n", (unsigned)timeouts[i]);
        else {
            printf(" - diff: %8u\n", (unsigned)(timeouts[i] - timeouts[i - 1]));
    return 1;
Пример #25
/*
 * Driver for the IS benchmark run: copies the verification tables selected by
 * CLASS, calls create_seq(), performs one untimed rank(1) call and then times
 * MAX_ITERATIONS calls to rank() with timer 0.
 *
 * NOTE(review): braces, any `break` statements between the switch cases and
 * part of the final c_print_results() call are not present in this excerpt;
 * confirm details against the complete source.
 */
int main(int argc, char** argv )

    int             i, iteration, itemp;
    int             nthreads = 1;
    double          timecounter, maxtime;

/*  Initialize the verification arrays if a valid class */
    for( i=0; i<TEST_ARRAY_SIZE; i++ )
        switch( CLASS )
            case 'S':
                test_index_array[i] = S_test_index_array[i];
                test_rank_array[i]  = S_test_rank_array[i];
            case 'A':
                test_index_array[i] = A_test_index_array[i];
                test_rank_array[i]  = A_test_rank_array[i];
            case 'W':
                test_index_array[i] = W_test_index_array[i];
                test_rank_array[i]  = W_test_rank_array[i];
            case 'B':
                test_index_array[i] = B_test_index_array[i];
                test_rank_array[i]  = B_test_rank_array[i];
            case 'C':
                test_index_array[i] = C_test_index_array[i];
                test_rank_array[i]  = C_test_rank_array[i];

/*  Printout initial NPB info */
    printf( "\n\n NAS Parallel Benchmarks 2.3 OpenMP C version"
            " - IS Benchmark\n\n" );
    printf( " Size:  %d  (class %c)\n", TOTAL_KEYS, CLASS );
    printf( " Iterations:   %d\n", MAX_ITERATIONS );

/*  Initialize timer  */
    timer_clear( 0 );

/*  Generate random number sequence and subsequent keys on all procs */
    create_seq( 314159265.00,                    /* Random number gen seed */
                1220703125.00 );                 /* Random number gen mult */

/*  Do one iteration for free (i.e., untimed) to guarantee initialization of
    all data and code pages and respective tables */
#pragma omp parallel
    rank( 1 );

/*  Start verification counter */
    passed_verification = 0;

    if( CLASS != 'S' ) printf( "\n   iteration\n" );

/*  Start timer  */
    timer_start( 0 );

/*  This is the main iteration */

/*  Each thread gets its own copy of `iteration`; the iteration number is
    printed under an `omp master` construct */
#pragma omp parallel private(iteration)
    for( iteration=1; iteration<=MAX_ITERATIONS; iteration++ )
#pragma omp master
        if( CLASS != 'S' ) printf( "        %d\n", iteration );

        rank( iteration );

/*  Record the thread count only when compiled with OpenMP; otherwise
    nthreads keeps its initial value of 1 */
#if defined(_OPENMP)
#pragma omp master
        nthreads = omp_get_num_threads();
#endif /* _OPENMP */

/*  End of timing, obtain maximum time of all processors */
    timer_stop( 0 );
    timecounter = timer_read( 0 );

/*  This tests that keys are in sequence: sorting of last ranked key seq
    occurs here, but is an untimed operation                             */

/*  The final printout  */
/*  Any count other than 5*MAX_ITERATIONS + 1 resets passed_verification
    to 0 */
    if( passed_verification != 5*MAX_ITERATIONS + 1 )
        passed_verification = 0;
    c_print_results( "IS",
                     ((double) (MAX_ITERATIONS*TOTAL_KEYS))
                     "keys ranked",

  return 0;
}        /*  E N D  P R O G R A M  */
Пример #26
// main: allocates three buffers of MSZ*MSZ floats, runs the loop nests below
// and prints the elapsed seconds and a GFlops/s figure.
//
// NOTE(review): the argc branch, the loop bodies, the timer start and the
// closing braces are not visible in this excerpt; comments cover only the
// lines that are.
int main(int argc, char **argv) {

  int i,j,k,iad;
  int aad,bad,cad;
  int N=16;

  if(argc>1) {

  // NOTE(review): no assignment to i is visible before this use.
  if(N>(i*4)) N=(i+1)*4;

  // il2 = ceil(log2(N)); MSZ = 2^il2, i.e. N rounded up to a power of two.
  double l2=log(N)/log(2.0);
  int il2=(int)l2;
  if(il2<l2) il2++;
  int MSZ=pow(2,il2);
  int N4=N/4;

  printf("N=%d; N/4=%d; msize=%d\n",N,N4,MSZ);

// actual memory allocation must be power of 2 although N can be any multiple of 4
// The second _mm_malloc argument (128) is the requested alignment.
  float *a=(float *)_mm_malloc(MSZ*MSZ*sizeof(float),128);
  float *b=(float *)_mm_malloc(MSZ*MSZ*sizeof(float),128);
  float *c=(float *)_mm_malloc(MSZ*MSZ*sizeof(float),128);

  int iv=0;
#pragma omp parallel for private (i,j,iv,iad)
  for(i=0; i<N; i++) {
    for(j=0; j<N; j++) {

  tv ts;

  // Loop nest over N4 = N/4 steps in each of i, j, k; the j loop is the one
  // carrying the parallel-for pragma.
  for(i=0; i<N4; i++) {
#pragma omp parallel for private (aad,bad,cad,i,j,k)
    for(j=0; j<N4; j++) {
      for(k=0; k<N4; k++) {
  printf("sec: %lf\n",timer_sec(ts));
  float fN=N;
  // Rate printed as N*N*(2N-1) operations divided by seconds and by 1e9.
  printf("GFlops/s: %lf\n",(double)(fN*fN*(2*fN-1))/timer_sec(ts)/1000000000.0);

//  pmat(N,a);
//  pmat(N,b);
//  pmat(N,c);


Пример #27
    CHEEVDX computes selected eigenvalues and, optionally, eigenvectors
    of a complex Hermitian matrix A. Eigenvalues and eigenvectors can
    be selected by specifying either a range of values or a range of
    indices for the desired eigenvalues.
    If eigenvectors are desired, it uses a divide and conquer algorithm.

    The divide and conquer algorithm makes very mild assumptions about
    floating point arithmetic. It will work on machines with a guard
    digit in add/subtract, or on those binary machines without guard
    digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
    Cray-2. It could conceivably fail on hexadecimal or decimal machines
    without guard digits, but we know of none.

    jobz    magma_vec_t
      -     = MagmaNoVec:  Compute eigenvalues only;
      -     = MagmaVec:    Compute eigenvalues and eigenvectors.

    range   magma_range_t
      -     = MagmaRangeAll: all eigenvalues will be found.
      -     = MagmaRangeV:   all eigenvalues in the half-open interval (VL,VU]
                   will be found.
      -     = MagmaRangeI:   the IL-th through IU-th eigenvalues will be found.

    uplo    magma_uplo_t
      -     = MagmaUpper:  Upper triangle of A is stored;
      -     = MagmaLower:  Lower triangle of A is stored.

    n       INTEGER
            The order of the matrix A.  N >= 0.

    A       COMPLEX array, dimension (LDA, N)
            On entry, the Hermitian matrix A.  If UPLO = MagmaUpper, the
            leading N-by-N upper triangular part of A contains the
            upper triangular part of the matrix A.  If UPLO = MagmaLower,
            the leading N-by-N lower triangular part of A contains
            the lower triangular part of the matrix A.
            On exit, if JOBZ = MagmaVec, then if INFO = 0, the first m columns
            of A contain the required orthonormal eigenvectors of the matrix A.
            If JOBZ = MagmaNoVec, then on exit the lower triangle (if UPLO=MagmaLower)
            or the upper triangle (if UPLO=MagmaUpper) of A, including the
            diagonal, is destroyed.

    lda     INTEGER
            The leading dimension of the array A.  LDA >= max(1,N).

    vl      REAL
    vu      REAL
            If RANGE=MagmaRangeV, the lower and upper bounds of the interval to
            be searched for eigenvalues. VL < VU.
            Not referenced if RANGE = MagmaRangeAll or MagmaRangeI.

    il      INTEGER
    iu      INTEGER
            If RANGE=MagmaRangeI, the indices (in ascending order) of the
            smallest and largest eigenvalues to be returned.
            1 <= IL <= IU <= N, if N > 0; IL = 1 and IU = 0 if N = 0.
            Not referenced if RANGE = MagmaRangeAll or MagmaRangeV.

    m       INTEGER
            The total number of eigenvalues found.  0 <= M <= N.
            If RANGE = MagmaRangeAll, M = N, and if RANGE = MagmaRangeI, M = IU-IL+1.

    w       REAL array, dimension (N)
            If INFO = 0, the required m eigenvalues in ascending order.

    work    (workspace) COMPLEX array, dimension (MAX(1,LWORK))
            On exit, if INFO = 0, WORK[0] returns the optimal LWORK.

    lwork   INTEGER
            The length of the array WORK.
            If N <= 1,                      LWORK >= 1.
            If JOBZ = MagmaNoVec and N > 1, LWORK >= N + N*NB.
            If JOBZ = MagmaVec   and N > 1, LWORK >= max( N + N*NB, 2*N + N**2 ).
            NB can be obtained through magma_get_chetrd_nb(N).
            If LWORK = -1, then a workspace query is assumed; the routine
            only calculates the optimal sizes of the WORK, RWORK and
            IWORK arrays, returns these values as the first entries of
            the WORK, RWORK and IWORK arrays, and no error message
            related to LWORK or LRWORK or LIWORK is issued by XERBLA.

    rwork   (workspace) REAL array, dimension (LRWORK)
            On exit, if INFO = 0, RWORK[0] returns the optimal LRWORK.

    lrwork  INTEGER
            The dimension of the array RWORK.
            If N <= 1,                      LRWORK >= 1.
            If JOBZ = MagmaNoVec and N > 1, LRWORK >= N.
            If JOBZ = MagmaVec   and N > 1, LRWORK >= 1 + 5*N + 2*N**2.
            If LRWORK = -1, then a workspace query is assumed; the
            routine only calculates the optimal sizes of the WORK, RWORK
            and IWORK arrays, returns these values as the first entries
            of the WORK, RWORK and IWORK arrays, and no error message
            related to LWORK or LRWORK or LIWORK is issued by XERBLA.

    iwork   (workspace) INTEGER array, dimension (MAX(1,LIWORK))
            On exit, if INFO = 0, IWORK[0] returns the optimal LIWORK.

    liwork  INTEGER
            The dimension of the array IWORK.
            If N <= 1,                      LIWORK >= 1.
            If JOBZ = MagmaNoVec and N > 1, LIWORK >= 1.
            If JOBZ = MagmaVec   and N > 1, LIWORK >= 3 + 5*N.
            If LIWORK = -1, then a workspace query is assumed; the
            routine only calculates the optimal sizes of the WORK, RWORK
            and IWORK arrays, returns these values as the first entries
            of the WORK, RWORK and IWORK arrays, and no error message
            related to LWORK or LRWORK or LIWORK is issued by XERBLA.

    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value
      -     > 0:  if INFO = i and JOBZ = MagmaNoVec, then the algorithm failed
                  to converge; i off-diagonal elements of an intermediate
                  tridiagonal form did not converge to zero;
                  if INFO = i and JOBZ = MagmaVec, then the algorithm failed
                  to compute an eigenvalue while working on the submatrix
                  lying in rows and columns INFO/(N+1) through mod(INFO,N+1).

    Further Details
    Based on contributions by
       Jeff Rutter, Computer Science Division, University of California
       at Berkeley, USA

    Modified description of INFO. Sven, 16 Feb 05.

    @ingroup magma_cheev_driver
/*
 * magma_cheevdx (name taken from the closing comment of this definition).
 * The argument contract is the one described in the text that precedes this
 * definition.
 *
 * NOTE(review): the line carrying the function name, the opening brace and
 * several closing braces / #endif lines are absent from this excerpt.
 */
extern "C" magma_int_t
    magma_vec_t jobz, magma_range_t range, magma_uplo_t uplo,
    magma_int_t n,
    magmaFloatComplex *A, magma_int_t lda,
    float vl, float vu, magma_int_t il, magma_int_t iu,
    magma_int_t *m, float *w,
    magmaFloatComplex *work, magma_int_t lwork,
    #ifdef COMPLEX
    float *rwork, magma_int_t lrwork,
    magma_int_t *iwork, magma_int_t liwork,
    magma_int_t *info)
    const char* uplo_  = lapack_uplo_const( uplo  );
    const char* jobz_  = lapack_vec_const( jobz  );
    magma_int_t ione = 1;
    magma_int_t izero = 0;
    float d_one = 1.;

    float d__1;

    float eps;
    magma_int_t inde;
    float anrm;
    magma_int_t imax;
    float rmin, rmax;
    float sigma;
    magma_int_t iinfo, lwmin;
    magma_int_t lower;
    magma_int_t llrwk;
    magma_int_t wantz;
    magma_int_t indwk2, llwrk2;
    magma_int_t iscale;
    float safmin;
    float bignum;
    magma_int_t indtau;
    magma_int_t indrwk, indwrk, liwmin;
    magma_int_t lrwmin, llwork;
    float smlnum;
    magma_int_t lquery;
    magma_int_t alleig, valeig, indeig;

    float* dwork;

    wantz = (jobz == MagmaVec);
    lower = (uplo == MagmaLower);

    alleig = (range == MagmaRangeAll);
    valeig = (range == MagmaRangeV);
    indeig = (range == MagmaRangeI);

    /* Any of the three length arguments equal to -1 makes this a workspace query. */
    lquery = (lwork == -1 || lrwork == -1 || liwork == -1);

    /* Argument checks: *info becomes minus the position of the offending argument. */
    *info = 0;
    if (! (wantz || (jobz == MagmaNoVec))) {
        *info = -1;
    } else if (! (alleig || valeig || indeig)) {
        *info = -2;
    } else if (! (lower || (uplo == MagmaUpper))) {
        *info = -3;
    } else if (n < 0) {
        *info = -4;
    } else if (lda < max(1,n)) {
        *info = -6;
    } else {
        if (valeig) {
            if (n > 0 && vu <= vl) {
                *info = -8;
        } else if (indeig) {
            if (il < 1 || il > max(1,n)) {
                *info = -9;
            } else if (iu < min(n,il) || iu > n) {
                *info = -10;

    /* Minimum workspace sizes for the three cases: n <= 1, eigenvectors
       wanted, eigenvalues only. */
    magma_int_t nb = magma_get_chetrd_nb( n );
    if ( n <= 1 ) {
        lwmin  = 1;
        lrwmin = 1;
        liwmin = 1;
    else if ( wantz ) {
        lwmin  = max( n + n*nb, 2*n + n*n );
        lrwmin = 1 + 5*n + 2*n*n;
        liwmin = 3 + 5*n;
    else {
        lwmin  = n + n*nb;
        lrwmin = n;
        liwmin = 1;
    /* Store the minimum sizes in the first entry of each workspace array. */
    work[0]  = magma_cmake_lwork( lwmin );
    rwork[0] = magma_smake_lwork( lrwmin );
    iwork[0] = liwmin;

    /* A too-small workspace is an error only when this is not a query. */
    if ((lwork < lwmin) && !lquery) {
        *info = -14;
    } else if ((lrwork < lrwmin) && ! lquery) {
        *info = -16;
    } else if ((liwork < liwmin) && ! lquery) {
        *info = -18;

    /* Errors are reported through magma_xerbla; a query returns right after
       the sizes have been stored. */
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    else if (lquery) {
        return *info;

    /* Quick return if possible */
    if (n == 0) {
        return *info;

    /* 1x1 case: the eigenvalue is the real part of A[0]; A[0] is set to one
       when eigenvectors are wanted. */
    if (n == 1) {
        w[0] = MAGMA_C_REAL(A[0]);
        if (wantz) {
            A[0] = MAGMA_C_ONE;
        return *info;
    /* Check if matrix is very small then just call LAPACK on CPU, no need for GPU */
    /* NOTE(review): range, vl, vu, il and iu are not passed to this call and
       no assignment to *m is visible on this path - confirm against the
       complete source. */
    if (n <= 128) {
        #ifdef ENABLE_DEBUG
        printf("  warning matrix too small N=%d NB=%d, calling lapack on CPU  \n", (int) n, (int) nb);
        lapackf77_cheevd(jobz_, uplo_,
                         &n, A, &lda,
                         w, work, &lwork,
                         #ifdef COMPLEX
                         rwork, &lrwork,
                         iwork, &liwork, info);
        return *info;
    /* Get machine constants. */
    safmin = lapackf77_slamch("Safe minimum");
    eps    = lapackf77_slamch("Precision");
    smlnum = safmin / eps;
    bignum = 1. / smlnum;
    rmin = magma_ssqrt(smlnum);
    rmax = magma_ssqrt(bignum);

    /* Scale matrix to allowable range, if necessary. */
    /* sigma is chosen so that sigma * anrm equals rmin (norm too small) or
       rmax (norm too large); iscale records that scaling took place. */
    anrm = lapackf77_clanhe("M", uplo_, &n, A, &lda, rwork);
    iscale = 0;
    if (anrm > 0. && anrm < rmin) {
        iscale = 1;
        sigma = rmin / anrm;
    } else if (anrm > rmax) {
        iscale = 1;
        sigma = rmax / anrm;
    if (iscale == 1) {
        lapackf77_clascl(uplo_, &izero, &izero, &d_one, &sigma, &n, &n, A,
                         &lda, info);

    /* Call CHETRD to reduce Hermitian matrix to tridiagonal form. */
    // chetrd rwork: e (n)
    // cstedx rwork: e (n) + llrwk (1 + 4*N + 2*N**2)  ==>  1 + 5n + 2n^2
    inde   = 0;
    indrwk = inde + n;
    llrwk  = lrwork - indrwk;

    // chetrd work: tau (n) + llwork (n*nb)  ==>  n + n*nb
    // cstedx work: tau (n) + z (n^2)
    // cunmtr work: tau (n) + z (n^2) + llwrk2 (n or n*nb)  ==>  2n + n^2, or n + n*nb + n^2
    indtau = 0;
    indwrk = indtau + n;
    indwk2 = indwrk + n*n;
    llwork = lwork - indwrk;
    llwrk2 = lwork - indwk2;

    magma_timer_t time=0;
    timer_start( time );

    magma_chetrd(uplo, n, A, lda, w, &rwork[inde],
                 &work[indtau], &work[indwrk], llwork, &iinfo);

    timer_stop( time );
    timer_printf( "time chetrd = %6.2f\n", time );

    /* For eigenvalues only, call SSTERF.  For eigenvectors, first call
     CSTEDC to generate the eigenvector matrix, WORK(INDWRK), of the
     tridiagonal matrix, then call CUNMTR to multiply it to the Householder
     transformations represented as Householder vectors in A. */
    if (! wantz) {
        lapackf77_ssterf(&n, w, &rwork[inde], info);

        magma_smove_eig(range, n, w, &il, &iu, vl, vu, m);
    else {
        timer_start( time );

        /* dwork holds 3*n*(n/2 + 1) floats obtained from magma_smalloc; a
           failed allocation is reported as MAGMA_ERR_DEVICE_ALLOC. */
        if (MAGMA_SUCCESS != magma_smalloc( &dwork, 3*n*(n/2 + 1) )) {
            *info = MAGMA_ERR_DEVICE_ALLOC;
            return *info;

        magma_cstedx(range, n, vl, vu, il, iu, w, &rwork[inde],
                     &work[indwrk], n, &rwork[indrwk],
                     llrwk, iwork, liwork, dwork, info);

        magma_free( dwork );

        timer_stop( time );
        timer_printf( "time cstedx = %6.2f\n", time );
        timer_start( time );

        /* il and iu are passed by address here; below, column il-1 of the
           matrix stored at work[indwrk] is the first of the *m columns that
           are multiplied by cunmtr and copied back into A. */
        magma_smove_eig(range, n, w, &il, &iu, vl, vu, m);

        magma_cunmtr(MagmaLeft, uplo, MagmaNoTrans, n, *m, A, lda, &work[indtau],
                     &work[indwrk + n * (il-1) ], n, &work[indwk2], llwrk2, &iinfo);

        lapackf77_clacpy("A", &n, m, &work[indwrk + n * (il-1)], &n, A, &lda);

        timer_stop( time );
        timer_printf( "time cunmtr + copy = %6.2f\n", time );

    /* If matrix was scaled, then rescale eigenvalues appropriately. */
    /* The first imax entries of w are multiplied by 1/sigma: all n when
       *info == 0, otherwise *info - 1. */
    if (iscale == 1) {
        if (*info == 0) {
            imax = n;
        } else {
            imax = *info - 1;
        d__1 = 1. / sigma;
        blasf77_sscal(&imax, &d__1, w, &ione);

    /* Store the minimum workspace sizes again before returning. */
    work[0]  = magma_cmake_lwork( lwmin );
    rwork[0] = magma_smake_lwork( lrwmin );
    iwork[0] = liwmin;

    return *info;
} /* magma_cheevdx */
Пример #28
Файл: test_2.c Проект: 8l/rose
/*
 * Calls kernel_0 .. kernel_3 in turn, each on an array freshly obtained from
 * create_array(n, m), and prints timer->delta after each call.
 *
 * NOTE(review): no timer start/stop calls and no release of `a` are visible
 * in this excerpt (lines may have been dropped). The type of timer->delta is
 * not shown here, so the "%d" conversion should be checked against it.
 */
int main() {

  /* Both array dimensions are 16 * 1024. */
  const int n = 16 * 1024;
  const int m = 16 * 1024;

  float ** a;

  test_timer_t timer = timer_build();


    /* Run 0 */
    a = create_array(n, m);

    kernel_0(n, m, a, 3.5);


    printf("#0 : %d\n", timer->delta);


    /* Run 1: `a` is reassigned to a new array from create_array() */
    a = create_array(n, m);

    kernel_1(n, m, a, 3.5);


    printf("#1 : %d\n", timer->delta);


    /* Run 2 */
    a = create_array(n, m);

    kernel_2(n, m, a, 3.5);


    printf("#2 : %d\n", timer->delta);


    /* Run 3 */
    a = create_array(n, m);

    kernel_3(n, m, a, 3.5);


    printf("#3 : %d\n", timer->delta);

  return 0;
Пример #29
    CHEEVD computes all eigenvalues and, optionally, eigenvectors of a
    complex Hermitian matrix A.  If eigenvectors are desired, it uses a
    divide and conquer algorithm.

    The divide and conquer algorithm makes very mild assumptions about
    floating point arithmetic. It will work on machines with a guard
    digit in add/subtract, or on those binary machines without guard
    digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
    Cray-2. It could conceivably fail on hexadecimal or decimal machines
    without guard digits, but we know of none.

    jobz    magma_vec_t
      -     = MagmaNoVec:  Compute eigenvalues only;
      -     = MagmaVec:    Compute eigenvalues and eigenvectors.

    uplo    magma_uplo_t
      -     = MagmaUpper:  Upper triangle of A is stored;
      -     = MagmaLower:  Lower triangle of A is stored.

    n       INTEGER
            The order of the matrix A.  N >= 0.

    A       COMPLEX array, dimension (LDA, N)
            On entry, the Hermitian matrix A.  If UPLO = MagmaUpper, the
            leading N-by-N upper triangular part of A contains the
            upper triangular part of the matrix A.  If UPLO = MagmaLower,
            the leading N-by-N lower triangular part of A contains
            the lower triangular part of the matrix A.
            On exit, if JOBZ = MagmaVec, then if INFO = 0, A contains the
            orthonormal eigenvectors of the matrix A.
            If JOBZ = MagmaNoVec, then on exit the lower triangle (if UPLO=MagmaLower)
            or the upper triangle (if UPLO=MagmaUpper) of A, including the
            diagonal, is destroyed.

    lda     INTEGER
            The leading dimension of the array A.  LDA >= max(1,N).

    w       REAL array, dimension (N)
            If INFO = 0, the eigenvalues in ascending order.

    work    (workspace) COMPLEX array, dimension (MAX(1,LWORK))
            On exit, if INFO = 0, WORK[0] returns the optimal LWORK.

    lwork   INTEGER
            The length of the array WORK.
            If N <= 1,                      LWORK >= 1.
            If JOBZ = MagmaNoVec and N > 1, LWORK >= N + N*NB.
            If JOBZ = MagmaVec   and N > 1, LWORK >= max( N + N*NB, 2*N + N**2 ).
            NB can be obtained through magma_get_chetrd_nb(N).
            If LWORK = -1, then a workspace query is assumed; the routine
            only calculates the optimal sizes of the WORK, RWORK and
            IWORK arrays, returns these values as the first entries of
            the WORK, RWORK and IWORK arrays, and no error message
            related to LWORK or LRWORK or LIWORK is issued by XERBLA.

    rwork   (workspace) REAL array, dimension (LRWORK)
            On exit, if INFO = 0, RWORK[0] returns the optimal LRWORK.

    lrwork  INTEGER
            The dimension of the array RWORK.
            If N <= 1,                      LRWORK >= 1.
            If JOBZ = MagmaNoVec and N > 1, LRWORK >= N.
            If JOBZ = MagmaVec   and N > 1, LRWORK >= 1 + 5*N + 2*N**2.
            If LRWORK = -1, then a workspace query is assumed; the
            routine only calculates the optimal sizes of the WORK, RWORK
            and IWORK arrays, returns these values as the first entries
            of the WORK, RWORK and IWORK arrays, and no error message
            related to LWORK or LRWORK or LIWORK is issued by XERBLA.

    iwork   (workspace) INTEGER array, dimension (MAX(1,LIWORK))
            On exit, if INFO = 0, IWORK[0] returns the optimal LIWORK.

    liwork  INTEGER
            The dimension of the array IWORK.
            If N <= 1,                      LIWORK >= 1.
            If JOBZ = MagmaNoVec and N > 1, LIWORK >= 1.
            If JOBZ = MagmaVec   and N > 1, LIWORK >= 3 + 5*N.
            If LIWORK = -1, then a workspace query is assumed; the
            routine only calculates the optimal sizes of the WORK, RWORK
            and IWORK arrays, returns these values as the first entries
            of the WORK, RWORK and IWORK arrays, and no error message
            related to LWORK or LRWORK or LIWORK is issued by XERBLA.

    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value
      -     > 0:  if INFO = i and JOBZ = MagmaNoVec, then the algorithm failed
                  to converge; i off-diagonal elements of an intermediate
                  tridiagonal form did not converge to zero;
                  if INFO = i and JOBZ = MagmaVec, then the algorithm failed
                  to compute an eigenvalue while working on the submatrix
                  lying in rows and columns INFO/(N+1) through mod(INFO,N+1).

    Further Details
    Based on contributions by
       Jeff Rutter, Computer Science Division, University of California
       at Berkeley, USA

    Modified description of INFO. Sven, 16 Feb 05.

    @ingroup magma_cheev_driver
/*
 * magma_cheevd (name taken from the closing comment of this definition).
 * The argument contract is the one described in the text that precedes this
 * definition.
 *
 * NOTE(review): the line carrying the function name, the opening brace and
 * several closing braces / #endif lines are absent from this excerpt.
 */
extern "C" magma_int_t
    magma_vec_t jobz, magma_uplo_t uplo,
    magma_int_t n,
    magmaFloatComplex *A, magma_int_t lda,
    float *w,
    magmaFloatComplex *work, magma_int_t lwork,
    #ifdef COMPLEX
    float *rwork, magma_int_t lrwork,
    magma_int_t *iwork, magma_int_t liwork,
    magma_int_t *info)
    const char* uplo_ = lapack_uplo_const( uplo );
    const char* jobz_ = lapack_vec_const( jobz );
    magma_int_t ione = 1;
    magma_int_t izero = 0;
    float d_one = 1.;

    float d__1;

    float eps;
    magma_int_t inde;
    float anrm;
    magma_int_t imax;
    float rmin, rmax;
    float sigma;
    magma_int_t iinfo, lwmin;
    magma_int_t lower;
    magma_int_t llrwk;
    magma_int_t wantz;
    magma_int_t indwk2, llwrk2;
    magma_int_t iscale;
    float safmin;
    float bignum;
    magma_int_t indtau;
    magma_int_t indrwk, indwrk, liwmin;
    magma_int_t lrwmin, llwork;
    float smlnum;
    magma_int_t lquery;

    float* dwork;

    wantz = (jobz == MagmaVec);
    lower = (uplo == MagmaLower);
    /* Any of the three length arguments equal to -1 makes this a workspace query. */
    lquery = (lwork == -1 || lrwork == -1 || liwork == -1);

    /* Argument checks: *info becomes minus the position of the offending argument. */
    *info = 0;

    if (! (wantz || (jobz == MagmaNoVec))) {
        *info = -1;
    } else if (! (lower || (uplo == MagmaUpper))) {
        *info = -2;
    } else if (n < 0) {
        *info = -3;
    } else if (lda < max(1,n)) {
        *info = -5;

    /* Minimum workspace sizes for the three cases: n <= 1, eigenvectors
       wanted, eigenvalues only. */
    magma_int_t nb = magma_get_chetrd_nb( n );
    if ( n <= 1 ) {
        lwmin  = 1;
        lrwmin = 1;
        liwmin = 1;
    else if ( wantz ) {
        lwmin  = max( n + n*nb, 2*n + n*n );
        lrwmin = 1 + 5*n + 2*n*n;
        liwmin = 3 + 5*n;
    else {
        lwmin  = n + n*nb;
        lrwmin = n;
        liwmin = 1;
    // multiply by 1+eps (in Double!) to ensure length gets rounded up,
    // if it cannot be exactly represented in floating point.
    real_Double_t one_eps = 1. + lapackf77_slamch("Epsilon");
    work[0]  = MAGMA_C_MAKE( lwmin * one_eps, 0 );
    rwork[0] = lrwmin * one_eps;
    iwork[0] = liwmin;

    /* A too-small workspace is an error only when this is not a query. */
    if ((lwork < lwmin) && !lquery) {
        *info = -8;
    } else if ((lrwork < lrwmin) && ! lquery) {
        *info = -10;
    } else if ((liwork < liwmin) && ! lquery) {
        *info = -12;

    /* Errors are reported through magma_xerbla; a query returns right after
       the sizes have been stored. */
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    else if (lquery) {
        return *info;

    /* Quick return if possible */
    if (n == 0) {
        return *info;

    /* 1x1 case: the eigenvalue is the real part of A[0]; A[0] is set to one
       when eigenvectors are wanted. */
    if (n == 1) {
        w[0] = MAGMA_C_REAL( A[0] );
        if (wantz) {
            A[0] = MAGMA_C_ONE;
        return *info;

    /* If matrix is very small, then just call LAPACK on CPU, no need for GPU */
    if (n <= 128) {
        lapackf77_cheevd( jobz_, uplo_,
                          &n, A, &lda,
                          w, work, &lwork,
                          #ifdef COMPLEX
                          rwork, &lrwork,
                          iwork, &liwork, info );
        return *info;

    /* Get machine constants. */
    safmin = lapackf77_slamch("Safe minimum");
    eps    = lapackf77_slamch("Precision");
    smlnum = safmin / eps;
    bignum = 1. / smlnum;
    rmin = magma_ssqrt( smlnum );
    rmax = magma_ssqrt( bignum );

    /* Scale matrix to allowable range, if necessary. */
    /* sigma is chosen so that sigma * anrm equals rmin (norm too small) or
       rmax (norm too large); iscale records that scaling took place. */
    anrm = lapackf77_clanhe( "M", uplo_, &n, A, &lda, rwork );
    iscale = 0;
    if (anrm > 0. && anrm < rmin) {
        iscale = 1;
        sigma = rmin / anrm;
    } else if (anrm > rmax) {
        iscale = 1;
        sigma = rmax / anrm;
    if (iscale == 1) {
        lapackf77_clascl( uplo_, &izero, &izero, &d_one, &sigma, &n, &n, A, &lda, info );

    /* Call CHETRD to reduce Hermitian matrix to tridiagonal form. */
    // chetrd rwork: e (n)
    // cstedx rwork: e (n) + llrwk (1 + 4*N + 2*N**2)  ==>  1 + 5n + 2n^2
    inde   = 0;
    indrwk = inde + n;
    llrwk  = lrwork - indrwk;

    // chetrd work: tau (n) + llwork (n*nb)  ==>  n + n*nb
    // cstedx work: tau (n) + z (n^2)
    // cunmtr work: tau (n) + z (n^2) + llwrk2 (n or n*nb)  ==>  2n + n^2, or n + n*nb + n^2
    indtau = 0;
    indwrk = indtau + n;
    indwk2 = indwrk + n*n;
    llwork = lwork - indwrk;
    llwrk2 = lwork - indwk2;

    magma_timer_t time=0;
    timer_start( time );

    magma_chetrd( uplo, n, A, lda, w, &rwork[inde],
                  &work[indtau], &work[indwrk], llwork, &iinfo );

    timer_stop( time );
    timer_printf( "time chetrd = %6.2f\n", time );

    /* For eigenvalues only, call SSTERF.  For eigenvectors, first call
     * CSTEDC to generate the eigenvector matrix, WORK(INDWRK), of the
     * tridiagonal matrix, then call CUNMTR to multiply it to the Householder
     * transformations represented as Householder vectors in A. */
    if (! wantz) {
        lapackf77_ssterf( &n, w, &rwork[inde], info );
    else {
        timer_start( time );

        /* dwork holds 3*n*(n/2 + 1) floats obtained from magma_smalloc; a
           failed allocation is reported as MAGMA_ERR_DEVICE_ALLOC. */
        if (MAGMA_SUCCESS != magma_smalloc( &dwork, 3*n*(n/2 + 1) )) {
            *info = MAGMA_ERR_DEVICE_ALLOC;
            return *info;

        /* Called with MagmaRangeAll; the value and index bounds are passed as zeros. */
        magma_cstedx( MagmaRangeAll, n, 0., 0., 0, 0, w, &rwork[inde],
                      &work[indwrk], n, &rwork[indrwk], llrwk,
                      iwork, liwork, dwork, info );

        magma_free( dwork );

        timer_stop( time );
        timer_printf( "time cstedx = %6.2f\n", time );
        timer_start( time );

        /* All n columns stored at work[indwrk] are multiplied by cunmtr and
           then copied back into A. */
        magma_cunmtr( MagmaLeft, uplo, MagmaNoTrans, n, n, A, lda, &work[indtau],
                      &work[indwrk], n, &work[indwk2], llwrk2, &iinfo );

        lapackf77_clacpy( "A", &n, &n, &work[indwrk], &n, A, &lda );

        timer_stop( time );
        timer_printf( "time cunmtr + copy = %6.2f\n", time );

    /* If matrix was scaled, then rescale eigenvalues appropriately. */
    /* The first imax entries of w are multiplied by 1/sigma: all n when
       *info == 0, otherwise *info - 1. */
    if (iscale == 1) {
        if (*info == 0) {
            imax = n;
        } else {
            imax = *info - 1;
        d__1 = 1. / sigma;
        blasf77_sscal( &imax, &d__1, w, &ione );

    work[0]  = MAGMA_C_MAKE( lwmin * one_eps, 0 );  // round up
    rwork[0] = lrwmin * one_eps;
    iwork[0] = liwmin;

    return *info;
} /* magma_cheevd */
Пример #30
/*
 * Loads argv[1] with sdl_load(), computes a colour for every pixel in the
 * order left by shuffle_pixels(), puts each pixel on display_surface, writes
 * the buffer to "ray.ppm" and then waits for SDL events.
 *
 * Returns 1 when no argument is given, when sdl_load() returns NULL or when
 * init_SDL() fails; returns 0 on an SDL_QUIT event or at the end.
 *
 * NOTE(review): braces, the timer stop, the rest of the printf call and the
 * SDL_VIDEOEXPOSE handling are not visible in this excerpt. `config`, `scene`
 * and `display_surface` are not declared in the visible lines.
 */
int main(int argc, char **argv)
	Timer *render_timer;
	Sdl *sdl;
	FILE *out;
	Colour *buffer;
	Pixel *pixels;
	int num_pixels;

	SDL_Event event = {0};

	if (argc < 2)
		return 1;

	sdl = sdl_load(argv[1]);
	if (sdl == NULL)
		return 1;

	if (!init_SDL())
		return 1;

	/* One Colour and one Pixel entry per image pixel.
	 * NOTE(review): the calloc results are used without a NULL check in the
	 * visible lines. */
	num_pixels = config->width * config->height;
	buffer = calloc(num_pixels, sizeof(Colour));
	pixels = calloc(num_pixels, sizeof(Pixel));
	/* Entry j*width + i receives the coordinates (i, j). */
	for (int j = 0; j < config->height; j++)
	for (int i = 0; i < config->width; i++)
		pixels[j*config->width + i].x = i;
		pixels[j*config->width + i].y = j;
	shuffle_pixels(pixels, config->width, config->height);

	/* START */
	render_timer = timer_start("Rendering");

	for (int i = 0; i < num_pixels; i++)
		Camera *cam = scene->camera;
		Colour c;
		Ray r;
		int x = pixels[i].x, y = pixels[i].y;

		/* The last parameter is the near plane, which is irrelevant for
		 * the moment. */
		r = camera_ray(cam, x, y, 1);

		c = ray_colour(r, 0);

		/* Store the colour in row-major position and show it at once. */
		buffer[config->width*y + x] = c;
		put_pixel(display_surface, x, y, c);
		/* Events are polled whenever i is a multiple of the image width;
		 * SDL_QUIT returns 0 immediately. */
		if (i % config->width == 0)
			while (SDL_PollEvent(&event))
				if (event.type == SDL_QUIT)
					return 0;

	/* STOP */
	printf("%.2f kilopixels per second\n",

	/* NOTE(review): the fopen result is passed on without a NULL check and no
	 * fclose is visible in this excerpt. */
	out = fopen("ray.ppm", "w");
	ppm_write(buffer, config->width, config->height, out);

		while (SDL_WaitEvent(&event))
			if (event.type == SDL_QUIT)
				return 0;
			else if (event.type == SDL_VIDEOEXPOSE)

	return 0;