Beispiel #1
0
/***************************************************************************//**
 *  Parallel Reduction from BAND tridiagonal to the final condensed form - dynamic scheduler
 **/
void plasma_pzhbrdt_quark(PLASMA_enum uplo,
                          PLASMA_desc A, double *D, double *E, PLASMA_desc T,
                          PLASMA_sequence *sequence, PLASMA_request *request)
{
    plasma_context_t *plasma;
    Quark_Task_Flags task_flags = Quark_Task_Flags_Initializer;

#ifdef COMPLEX
    static PLASMA_Complex64_t zone  = (PLASMA_Complex64_t) 1.0;
    static double             dzero = (double) 0.0;
    PLASMA_Complex64_t ztmp;
    double absztmp;
#endif

    PLASMA_Complex64_t *C, *S;
    int blksweep, lcsweep, blkid, lcNB;
    int N, NB, NT, grsiz, lcgrsiz;
    int i;
    size_t eltsize = plasma_element_size(A.dtyp);

    plasma = plasma_context_self();
    if (sequence->status != PLASMA_SUCCESS)
        return;

    QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);

    NT = A.nt;
    N  = A.m;
    NB = A.mb;

    /* Quick return */
    if (N == 0){
        return;
    }

    if (NB == 0) {
        memset(D, 0,     N*sizeof(double));
        memset(E, 0, (N-1)*sizeof(double));
#ifdef COMPLEX
        for (i=0; i<N; i++)
            D[i]  = cabs(*A(i,i));
#else
        for (i=0; i<N; i++)
          D[i]  = *A(i,i);
#endif
        return;
    }

    /*
     * Barrier is used because the bulge have to wait until
     * the reduction to band has been finish.
     * otherwise, I can remove this BARRIER when I integrate
     * the function dependencies link inside the reduction to
     * band. Keep in min the case when NB=1, where no bulge-chasing.
     */
    /***************************************************************/
    QUARK_Barrier(plasma->quark);
    tblg   = -Wtimming();
    /***************************************************************/

    /*
     * Case NB=1 ==> matrix is already Bidiagonal. no need to bulge.
     * Make diagonal and superdiagonal elements real, storing them in
     * D and E. if PlasmaLower, first transform lower bidiagonal form
     * to upper bidiagonal by applying plane rotations/ Householder
     * from the left, overwriting superdiagonal elements then make
     * elements real of the resulting upper Bidiagonal. if PlasmaUpper
     * then make its elements real.  For Q, PT: ZSCAL should be done
     * in case of WANTQ.
     */
    if (NB == 1){
        memset(D, 0, N    *sizeof(double));
        memset(E, 0, (N-1)*sizeof(double));
#ifdef COMPLEX
        if(uplo==PlasmaLower){
            for (i=0; i<N; i++)
            {
                D[i] = creal( *A(i, i) );               /* diag value */
                if( i < (N-1)) {                            /* lower off-diag value */
                    ztmp        = *A((i+1),i);
                    absztmp     = cabs(ztmp);
                    *A((i+1),i) = absztmp;
                    E[i]        = absztmp;
                    if(absztmp != dzero)
                        ztmp = (PLASMA_Complex64_t) (ztmp / absztmp);
                    else
                        ztmp = zone;
                    if(i<(N-2)) *A((i+2),(i+1)) = *A((i+2),(i+1)) * ztmp;
                    /* for Q: ZSCAL should be done in case of WANTQ */
                }
            }
        } else { /* PlasmaUpper */
            for (i=0; i<N; i++)
            {
                D[i]  =  creal( *A(i,i) );               /* diag value*/
                if(i<(N-1)) {                            /* lower off-diag value */
                    ztmp        = *A(i, (i+1));
                    absztmp     = cabs(ztmp);
                    *A(i,(i+1)) = absztmp;
                    E[i]        = absztmp;
                    if(absztmp != dzero)
                        ztmp = (PLASMA_Complex64_t) (ztmp / absztmp);
                    else
                        ztmp = zone;
                    if(i<(N-2)) *A((i+1),(i+2)) = *A((i+1),(i+2)) * ztmp;
                    /* for Q: ZSCAL should be done in case of WANTQ. HERE NEED THE multiply by CONJ(T) */
                }
            }
        } /* end PlasmaUpper*/
#else
       if( uplo == PlasmaLower ){
           for (i=0; i < N-1; i++) {
               D[i] = *A(i,   i);
               E[i] = *A(i+1, i);
           }
           D[i] = *A(i, i);
       } else {
           for (i=0; i < N-1; i++) {
               D[i] = *A(i, i  );
               E[i] = *A(i, i+1);
           }
           D[i] = *A(i, i);
       }
#endif
       return;
    }

    /* Case N<NB ==> matrix is very small and better to call lapack XHETRD. */
    if( N <= 0 ) /* this will be removed we don t need it. */
    {
        PLASMA_Complex64_t *work, *TTau;
        int info, ldwork = N*N;
        work = (PLASMA_Complex64_t *) plasma_shared_alloc(plasma, ldwork, PlasmaComplexDouble);
        TTau = (PLASMA_Complex64_t *) plasma_shared_alloc(plasma, N,   PlasmaComplexDouble);

        info = LAPACKE_zhetrd_work(LAPACK_COL_MAJOR, lapack_const(uplo), N,
                                   A(0,0), A.lm, D, E, TTau, work, ldwork);
        plasma_shared_free(plasma, (void*) work);
        plasma_shared_free(plasma, (void*) TTau);

        if( info == 0 )
            sequence->status = PLASMA_SUCCESS;
        else
            plasma_sequence_flush(plasma->quark, sequence, request, info);
        return;
    }

    /* General case NB > 1 && N > NB */
    C   = (PLASMA_Complex64_t *) plasma_shared_alloc(plasma, N,   PlasmaComplexDouble);
    S   = (PLASMA_Complex64_t *) plasma_shared_alloc(plasma, N,   PlasmaComplexDouble);

    /***************************************************************************
     *                       START BULGE CHASING CODE
     **************************************************************************/
    /*
     * Initialisation of local parameter. those parameter should be
     * input or tuned parameter.
     */
    grsiz = 1;
    if( NB > 160 ) {
        grsiz = 1;
    }
    else if( NB > 100 ) {
        grsiz = 1;
        /*
        if( N < 5000 )
            grsiz = 1;
        else
            grsiz = 2;
        */
    } else {
        grsiz = 2;
    }
    grsiz = max(1, grsiz);

    /*grsiz=1;*/
    /*printf("  Version -dp- N %5d   NB %5d    lcNB %5d  grsiz %5d    A.ln %5d   A.nb %5d \n",N,NB,lcNB,grsiz,A.ln,A.nb);*/
    for (blksweep = 0; blksweep<NT; blksweep++){
       lcNB =  blksweep == NT-1 ? A.n-blksweep*A.nb : A.nb;
       /*printf("  Version -dp- N %5d   NB %5d    lcNB %5d  grsiz %5d blksweep%5d     NT %5d \n",N,NB,lcNB,grsiz,blksweep,NT);*/
       for (lcsweep = 0; lcsweep<lcNB; lcsweep++){
          for (blkid = blksweep; blkid<NT; blkid=blkid+grsiz){
              lcgrsiz = (blkid+1) < NT ? grsiz : NT-blkid;
              /*printf("  Version -dp- N %5d   NB %5d    lcNB %5d  grsiz %5d lcgrsiz %5d  blkid %5d \n",N,NB,lcNB,grsiz,lcgrsiz,blkid);*/
              QUARK_CORE_ztrdalg_v2(
                        plasma->quark, &task_flags,
                        uplo, &A, C, S,
                        lcgrsiz, lcsweep, blkid, blksweep);
             }
       }
    }
    /*
     * Barrier used only for now, to be sure that everything
     * is done before copying the D and E and free workspace.
     * this will be removed later when D and E are directly filled
     * during the bulge process.
     */
    QUARK_Barrier(plasma->quark);
    tblg   += Wtimming();
    printf("   done with bulge %lf \n\n\n",tblg);

    plasma_shared_free(plasma, (void*) C);
    plasma_shared_free(plasma, (void*) S);

    /*
     * STORE THE RESULTING diagonal/off-diagonal in D AND E
     */
    memset(D, 0,  N   *sizeof(double));
    memset(E, 0, (N-1)*sizeof(double));
    /* Make diagonal and superdiagonal elements real,
     * storing them in D and E
     */
    /* In complex case, the off diagonal element are
     * not necessary real. we have to make off-diagonal
     * elements real and copy them to E.
     * When using HouseHolder elimination,
     * the ZLARFG give us a real as output so, all the
     * diagonal/off-diagonal element except the last one are already
     * real and thus we need only to take the abs of the last
     * one.
     *  */
#ifdef COMPLEX
    if(uplo==PlasmaLower){
       for (i=0; i < N-1 ; i++)
       {
          D[i] = creal( *A(i,i) );
          /*
           * Alternative for Householder case, all off-diag
           * are real except the last off-diag, where we
           * have to take the abs
           */
          if(i<(N-2))
              E[i] = creal(*A(i+1, i));
          else
              E[i] = cabs( *A(i+1, i));
       }
        D[i] = creal( *A(i, i) );
    } else { /* PlasmaUpper */
        for (i=0; i<N-1; i++)
        {
            D[i]  =  creal( *A(i,i) );
            /*
             * Alternative for Householder case, all off-diag
             * are real except the last off-diag, where we
             * have to take the abs
             */
            if( i < (N-2) )
                E[i] = creal(*A(i, (i+1)));
            else
                E[i] = cabs(*A(i, (i+1)));
        }
        D[i] = creal( *A(i, i) );
    } /* end PlasmaUpper */
#else
    if( uplo == PlasmaLower ){
        for (i=0; i < N-1; i++) {
            D[i] = *A(i,   i);
            E[i] = *A(i+1, i);
        }
        D[i] = *A(i, i);
    } else {
        for (i=0; i < N-1; i++) {
            D[i] = *A(i, i  );
            E[i] = *A(i, i+1);
        }
        D[i] = *A(i, i);
    }
#endif

} /* END FUNCTION */
Beispiel #2
0
/***************************************************************************//**
 *
 * @ingroup PLASMA_Complex64_t_Tile_Async
 *
 *  PLASMA_zlansy_Tile_Async - Non-blocking equivalent of PLASMA_zlansy_Tile().
 *  May return before the computation is finished.
 *  Allows for pipelining of operations at runtime.
 *
 *******************************************************************************
 *
 * @param[in] sequence
 *          Identifies the sequence of function calls that this call belongs to
 *          (for completion checks and exception handling purposes).
 *
 * @param[out] request
 *          Identifies this function call (for exception handling purposes).
 *
 *******************************************************************************
 *
 * @sa PLASMA_zlansy
 * @sa PLASMA_zlansy_Tile
 * @sa PLASMA_clansy_Tile_Async
 * @sa PLASMA_dlansy_Tile_Async
 * @sa PLASMA_slansy_Tile_Async
 *
 ******************************************************************************/
int PLASMA_zlansy_Tile_Async(PLASMA_enum norm, PLASMA_enum uplo, PLASMA_desc *A, double *value,
                             PLASMA_sequence *sequence, PLASMA_request *request)
{
    PLASMA_desc descA;
    double *work = NULL;
    plasma_context_t *plasma;

    plasma = plasma_context_self();
    if (plasma == NULL) {
        plasma_fatal_error("PLASMA_zlansy_Tile", "PLASMA not initialized");
        return PLASMA_ERR_NOT_INITIALIZED;
    }
    if (sequence == NULL) {
        plasma_fatal_error("PLASMA_zlansy_Tile", "NULL sequence");
        return PLASMA_ERR_UNALLOCATED;
    }
    if (request == NULL) {
        plasma_fatal_error("PLASMA_zlansy_Tile", "NULL request");
        return PLASMA_ERR_UNALLOCATED;
    }
    /* Check sequence status */
    if (sequence->status == PLASMA_SUCCESS)
        request->status = PLASMA_SUCCESS;
    else
        return plasma_request_fail(sequence, request, PLASMA_ERR_SEQUENCE_FLUSHED);

    /* Check descriptors for correctness */
    if (plasma_desc_check(A) != PLASMA_SUCCESS) {
        plasma_error("PLASMA_zlansy_Tile", "invalid descriptor");
        return plasma_request_fail(sequence, request, PLASMA_ERR_ILLEGAL_VALUE);
    } else {
        descA = *A;
    }
    /* Check input arguments */
    if (descA.nb != descA.mb) {
        plasma_error("PLASMA_zlansy_Tile", "only square tiles supported");
        return plasma_request_fail(sequence, request, PLASMA_ERR_ILLEGAL_VALUE);
    }
    if ( (norm != PlasmaMaxNorm) && (norm != PlasmaOneNorm)
         && (norm != PlasmaInfNorm) && (norm != PlasmaFrobeniusNorm) ) {
        plasma_error("PLASMA_zlansy_Tile", "illegal value of norm");
        return plasma_request_fail(sequence, request, PLASMA_ERR_ILLEGAL_VALUE);
    }
    if ( (uplo != PlasmaUpper) && (uplo != PlasmaLower) ) {
        plasma_error("PLASMA_zlansy_Tile", "illegal value of uplo");
        return plasma_request_fail(sequence, request, PLASMA_ERR_ILLEGAL_VALUE);
    }
    /* Quick return */
    if ( descA.m == 0) {
        *value = 0.0;
        return PLASMA_SUCCESS;
    }

    if (PLASMA_SCHEDULING == PLASMA_STATIC_SCHEDULING) {
        if (norm == PlasmaFrobeniusNorm) {
            work = plasma_shared_alloc(plasma, 2*PLASMA_SIZE, PlasmaRealDouble );
        } else {
            work = plasma_shared_alloc(plasma,   PLASMA_SIZE, PlasmaRealDouble );
        }
    }

    plasma_parallel_call_7(plasma_pzlansy,
        PLASMA_enum, norm,
        PLASMA_enum, uplo,
        PLASMA_desc, descA,
        double*, work,
        double*, value,
        PLASMA_sequence*, sequence,
        PLASMA_request*, request);

    if (work != NULL)
        plasma_shared_free( plasma, work );

    return PLASMA_SUCCESS;
}
Beispiel #3
0
/***************************************************************************//**
 *  Parallel Reduction from BAND tridiagonal to the final condensed form - dynamic scheduler
 **/
void plasma_pdsbrdt_quark(PLASMA_enum uplo,
                          PLASMA_desc A, double *D, double *E, PLASMA_desc T,
                          PLASMA_sequence *sequence, PLASMA_request *request)
{
    plasma_context_t *plasma;
    Quark_Task_Flags task_flags = Quark_Task_Flags_Initializer;

#ifdef COMPLEX
    static double zone  = (double) 1.0;
    static double             dzero = (double) 0.0;
    double ztmp;
    double absztmp;
#endif

    double *C, *S;
    int N, NB, INgrsiz, INthgrsiz, BAND;
    int myid, grsiz, shift=3, stt, st, ed, stind, edind;
    int blklastind, colpt, PCOL, ACOL, MCOL;
    int stepercol, mylastid, grnb, grid;
    int *DEP,*MAXID;
    int i, j, m;
    int thgrsiz, thgrnb, thgrid, thed;
    size_t eltsize = plasma_element_size(A.dtyp);

    plasma = plasma_context_self();
    if (sequence->status != PLASMA_SUCCESS)
        return;

    QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);

    N  = A.m;
    NB = A.mb;

    /* Quick return */
    if (N == 0){
        return;
    }
    
    if (NB == 0) {
        memset(D, 0,     N*sizeof(double));
        memset(E, 0, (N-1)*sizeof(double));
#ifdef COMPLEX
        for (i=0; i<N; i++)
            D[i]  = fabs(*A(i,i));
#else
        for (i=0; i<N; i++)
          D[i]  = *A(i,i);
#endif
        return;
    }

    /*
     * Barrier is used because the bulge have to wait until
     * the reduction to band has been finish.
     * otherwise, I can remove this BARRIER when I integrate
     * the function dependencies link inside the reduction to
     * band. Keep in min the case when NB=1, where no bulge-chasing.
     */
    /***************************************************************/
    QUARK_Barrier(plasma->quark);
    tblg   = -Wtimming();
    /***************************************************************/

    /*
     * Case NB=1 ==> matrix is already Bidiagonal. no need to bulge.
     * Make diagonal and superdiagonal elements real, storing them in
     * D and E. if PlasmaLower, first transform lower bidiagonal form
     * to upper bidiagonal by applying plane rotations/ Householder
     * from the left, overwriting superdiagonal elements then make
     * elements real of the resulting upper Bidiagonal. if PlasmaUpper
     * then make its elements real.  For Q, PT: ZSCAL should be done
     * in case of WANTQ.
     */
    if (NB == 1){
        memset(D, 0, N    *sizeof(double));
        memset(E, 0, (N-1)*sizeof(double));
#ifdef COMPLEX
        if(uplo==PlasmaLower){
            for (i=0; i<N; i++)
            {
                D[i] = ( *A(i, i) );               /* diag value */
                if( i < (N-1)) {                            /* lower off-diag value */
                    ztmp        = *A((i+1),i);
                    absztmp     = fabs(ztmp);
                    *A((i+1),i) = absztmp;
                    E[i]        = absztmp;
                    if(absztmp != dzero)
                        ztmp = (double) (ztmp / absztmp);
                    else
                        ztmp = zone;
                    if(i<(N-2)) *A((i+2),(i+1)) = *A((i+2),(i+1)) * ztmp;
                    /* for Q: ZSCAL should be done in case of WANTQ */
                }
            }
        } else { /* PlasmaUpper */
            for (i=0; i<N; i++)
            {
                D[i]  =  ( *A(i,i) );               /* diag value*/
                if(i<(N-1)) {                            /* lower off-diag value */
                    ztmp        = *A(i, (i+1));
                    absztmp     = fabs(ztmp);
                    *A(i,(i+1)) = absztmp;
                    E[i]        = absztmp;
                    if(absztmp != dzero)
                        ztmp = (double) (ztmp / absztmp);
                    else
                        ztmp = zone;
                    if(i<(N-2)) *A((i+1),(i+2)) = *A((i+1),(i+2)) * ztmp;
                    /* for Q: ZSCAL should be done in case of WANTQ. HERE NEED THE multiply by CONJ(T) */
                }
            }
        } /* end PlasmaUpper*/
#else
       if( uplo == PlasmaLower ){
           for (i=0; i < N-1; i++) {
               D[i] = *A(i,   i);
               E[i] = *A(i+1, i);
           }
           D[i] = *A(i, i);
       } else {
           for (i=0; i < N-1; i++) {
               D[i] = *A(i, i  );
               E[i] = *A(i, i+1);
           }
           D[i] = *A(i, i);
       }
#endif
       return;
    }

    /* Case N<NB ==> matrix is very small and better to call lapack XHETRD. */
    if( N <= 0 ) /* this will be removed we don t need it. */
    {
        double *work, *TTau;
        int info, ldwork = N*N;
        work = (double *) plasma_shared_alloc(plasma, ldwork, PlasmaRealDouble);
        TTau = (double *) plasma_shared_alloc(plasma, N,   PlasmaRealDouble);
        
        info = LAPACKE_dsytrd_work(LAPACK_COL_MAJOR, lapack_const(uplo), N,
                                   A(0,0), A.lm, D, E, TTau, work, ldwork);
        plasma_shared_free(plasma, (void*) work);
        plasma_shared_free(plasma, (void*) TTau);

        if( info == 0 )
            sequence->status = PLASMA_SUCCESS;
        else
            plasma_sequence_flush(plasma->quark, sequence, request, info);
        return;
    }

    /* General case NB > 1 && N > NB */
    DEP   = (int *)                plasma_shared_alloc(plasma, N+1, PlasmaInteger      );
    MAXID = (int *)                plasma_shared_alloc(plasma, N+1, PlasmaInteger      );
    C     = (double *) plasma_shared_alloc(plasma, N,   PlasmaRealDouble);
    S     = (double *) plasma_shared_alloc(plasma, N,   PlasmaRealDouble);
    memset(MAXID,0,(N+1)*sizeof(int));

    /***************************************************************************
     *                       START BULGE CHASING CODE
     **************************************************************************/
    /* 
     * Initialisation of local parameter. those parameter should be
     * input or tuned parameter.
     */
    INgrsiz = 1;
    if( NB > 160 ) {
        INgrsiz = 2;
    }
    else if( NB > 100 ) {
        if( N < 5000 )
            INgrsiz = 2;
        else
            INgrsiz = 4;
    } else {
        INgrsiz = 6;
    }
    INthgrsiz = N;
    BAND      = 0;

    grsiz   = INgrsiz;
    thgrsiz = INthgrsiz;
    if( grsiz   == 0 ) grsiz   = 6;
    if( thgrsiz == 0 ) thgrsiz = N;

    i = shift/grsiz;
    stepercol =  i*grsiz == shift ? i:i+1;

    i       = (N-2)/thgrsiz;
    thgrnb  = i*thgrsiz == (N-2) ? i:i+1;

    for (thgrid = 1; thgrid<=thgrnb; thgrid++){
        stt  = (thgrid-1)*thgrsiz+1;
        thed = min( (stt + thgrsiz -1), (N-2));
        for (i = stt; i <= N-2; i++){
            ed=min(i,thed);
            if(stt>ed)break;
            for (m = 1; m <=stepercol; m++){
                st=stt;
                for (j = st; j <=ed; j++){
                    /* PCOL:  dependency on the ID of the master of the group of the previous column.  (Previous Column:PCOL). */
                    /* ACOL:  dependency on the ID of the master of the previous group of my column.   (Acctual  Column:ACOL). (it is 0(NULL) for myid=1) */
                    /* MCOL:  OUTPUT dependency on the my ID, to be used by the next ID. (My Column: MCOL). I am the master of this group. */
                    myid     = (i-j)*(stepercol*grsiz) +(m-1)*grsiz + 1;
                    mylastid = myid+grsiz-1;
                    PCOL     = mylastid+shift-1;  /* to know the dependent ID of the previous column. need to know the master of its group */
                    MAXID[j] = myid;
                    PCOL     = min(PCOL,MAXID[j-1]); /* for the last columns, we might do only 1 or 2 kernel, so the PCOL will be wrong. this is to force it to the last ID of the previous col.*/
                    grnb     = PCOL/grsiz;
                    grid     = grnb*grsiz == PCOL ? grnb:grnb+1;
                    PCOL     = (grid-1)*grsiz +1; /* give me the ID of the master of the group of the previous column. */
                    ACOL     = myid-grsiz;
                    if(myid==1)ACOL=0;
                    MCOL     = myid;
                    
                    QUARK_CORE_dtrdalg(
                        plasma->quark, &task_flags,
                        uplo, N, NB,
                        &A, C, S, i, j, m, grsiz, BAND,
                        DEP(PCOL), DEP(ACOL), DEP(MCOL) );
                    
                    if(mylastid%2 ==0){
                        blklastind      = (mylastid/2)*NB+1+j-1;
                    }else{
                        colpt      = ((mylastid+1)/2)*NB + 1 +j -1 ;
                        stind      = colpt-NB+1;
                        edind      = min(colpt,N);
                        if( (stind>=edind-1) && (edind==N) )
                            blklastind=N;
                        else
                            blklastind=0;
                    }
                    if(blklastind >= (N-1))  stt=stt+1;
                } /* END for j=st:ed    */
            } /* END for m=1:stepercol */
        } /* END for i=1:MINMN-2      */
    } /* END for thgrid=1:thgrnb     */
    
    /*
     * Barrier used only for now, to be sure that everything
     * is done before copying the D and E and free workspace.
     * this will be removed later when D and E are directly filled
     * during the bulge process.
     */
    QUARK_Barrier(plasma->quark);
    tblg   += Wtimming();
    //printf("   done with bulge %lf \n\n\n",tblg);

    plasma_shared_free(plasma, (void*) DEP);
    plasma_shared_free(plasma, (void*) MAXID);
    plasma_shared_free(plasma, (void*) C);
    plasma_shared_free(plasma, (void*) S);

    /*
     * STORE THE RESULTING diagonal/off-diagonal in D AND E
     */
    memset(D, 0,  N   *sizeof(double));
    memset(E, 0, (N-1)*sizeof(double));
    /* Make diagonal and superdiagonal elements real,
     * storing them in D and E
     */
    /* In complex case, the off diagonal element are
     * not necessary real. we have to make off-diagonal
     * elements real and copy them to E.
     * When using HouseHolder elimination,
     * the ZLARFG give us a real as output so, all the
     * diagonal/off-diagonal element except the last one are already
     * real and thus we need only to take the abs of the last
     * one.
     *  */
#ifdef COMPLEX
    if(uplo==PlasmaLower){
       for (i=0; i < N-1 ; i++)
       {
          D[i] = ( *A(i,i) );               
          /*
           * Alternative for Householder case, all off-diag
           * are real except the last off-diag, where we
           * have to take the abs
           */
          if(i<(N-2))
              E[i] = (*A(i+1, i));
          else
              E[i] = fabs( *A(i+1, i));
       }
        D[i] = ( *A(i, i) );
    } else { /* PlasmaUpper */
        for (i=0; i<N-1; i++)
        {
            D[i]  =  ( *A(i,i) );               
            /*
             * Alternative for Householder case, all off-diag
             * are real except the last off-diag, where we
             * have to take the abs
             */
            if( i < (N-2) )
                E[i] = (*A(i, (i+1)));
            else
                E[i] = fabs(*A(i, (i+1)));
        }
        D[i] = ( *A(i, i) );
    } /* end PlasmaUpper */
#else
    if( uplo == PlasmaLower ){
        for (i=0; i < N-1; i++) {
            D[i] = *A(i,   i);
            E[i] = *A(i+1, i);
        }
        D[i] = *A(i, i);
    } else {
        for (i=0; i < N-1; i++) {
            D[i] = *A(i, i  );
            E[i] = *A(i, i+1);
        }
        D[i] = *A(i, i);
    }
#endif

} /* END FUNCTION */
Beispiel #4
0
/** ****************************************************************************
 *
 * @ingroup InPlaceTransformation
 *
 *  plasma_dgetmi2 Implementation of inplace transposition
 *    based on the GKK algorithm by Gustavson, Karlsson, Kagstrom.
 *    This algorithm shift some cycles to transpose the matrix.
 *
 *******************************************************************************
 *
 * @param[in] m
 *         Number of rows of matrix A
 *
 * @param[in] n
 *         Number of columns of matrix A
 *
 * @param[in,out] A
 *         Matrix of size L*m*n
 *
 * @param[in] nprob
 *         Number of parallel and independant problems
 *
 * @param[in] me
 *         Number of rows of the problem
 *
 * @param[in] ne
 *         Number of columns in the problem
 *
 * @param[in] L
 *         Size of chunk to use for transformation
 *
 ******************************************************************************/
int plasma_dshift(plasma_context_t *plasma, int m, int n, double *A,
                  int nprob, int me, int ne, int L,
                  PLASMA_sequence *sequence, PLASMA_request *request) 
{
    int *leaders = NULL;
    int ngrp, thrdbypb, thrdtot, nleaders;

    /* Check Plasma context */
    thrdtot  = PLASMA_SIZE;
    thrdbypb = PLASMA_GRPSIZE;
    ngrp = thrdtot/thrdbypb;

    /* check input */
    if( (nprob * me * ne * L) != (m * n) ) {
        plasma_error(__func__, "problem size does not match matrix size");
        /*printf("m=%d,  n=%d,  nprob=%d,  me=%d,  ne=%d, L=%d\n", m, n, nprob, me, ne, L);*/
        return plasma_request_fail(sequence, request, PLASMA_ERR_ILLEGAL_VALUE);
    }
    if( thrdbypb > thrdtot ) {
        plasma_error(__func__, "number of thread per problem must be less or equal to total number of threads");
        return plasma_request_fail(sequence, request, PLASMA_ERR_ILLEGAL_VALUE);
    }
    if( (thrdtot % thrdbypb) != 0 ) {
        plasma_error(__func__, "number of thread per problem must divide the total number of thread");
        return plasma_request_fail(sequence, request, PLASMA_ERR_ILLEGAL_VALUE);
    }

    /* quick return */
    if( (me < 2) || (ne < 2) || (nprob < 1) ) {
        return PLASMA_SUCCESS;
    }

    GKK_getLeaderNbr(me, ne, &nleaders, &leaders);
    nleaders *= 3;

    if (PLASMA_SCHEDULING == PLASMA_STATIC_SCHEDULING) {
        int *Tp      = NULL;
        int i, ipb;
        int q, owner;

        q = me*ne - 1;
    
        Tp = (int *)plasma_shared_alloc(plasma, thrdtot, PlasmaInteger);
        for (i=0; i<thrdtot; i++)
            Tp[i] = 0;

        ipb = 0;
        
        /* First part with coarse parallelism */
        if (nprob > ngrp) {
            ipb = (nprob / ngrp)*ngrp;
        
            /* loop over leader */
            if (thrdbypb > 1) {
                for (i=0; i<nleaders; i+=3) {
                    /* assign this cycle to a thread */
                    owner = minloc(thrdbypb, Tp);
                
                    /* assign it to owner */
                    Tp[owner] = Tp[owner] + leaders[i+1] * L;
                    leaders[i+2] = owner;
                }
            
                GKK_BalanceLoad(thrdbypb, Tp, leaders, nleaders, L);
            }
            else {
                for (i=0; i<nleaders; i+=3) {
                    Tp[0] = Tp[0] + leaders[i+1] * L;
                    leaders[i+2] = 0;
                }
            }

            /* shift in parallel */
            for (i=0; i< (nprob/ngrp); i++) {
                plasma_static_call_9(plasma_pdshift,
                                     int,                 me,
                                     int,                 ne,
                                     int,                 L,
                                     double*, &(A[i*ngrp*me*ne*L]),
                                     int *,               leaders,
                                     int,                 nleaders,
                                     int,                 thrdbypb,
                                     PLASMA_sequence*,    sequence,
                                     PLASMA_request*,     request);
            }
        }