Exemple #1
0
void warmup(Quark *q){
    int NB = 200;
    double *H = (double*) malloc(NB*NB*OOC_NTHREADS*sizeof(double));
    double *D = (double*) offload_Alloc(NB*NB*OOC_NTHREADS*sizeof(double), 0);
    
    {
        Quark_Task_Flags tflags = Quark_Task_Flags_Initializer;
//      for(int r = 0; r < OOC_NTHREADS; r++){
        for(int r = 0; r < 2; r++){
            QUARK_Task_Flag_Set(&tflags, TASK_LOCK_TO_THREAD, r);
//          QUARK_Task_Flag_Set(&tflags, THREAD_SET_TO_MANUAL_SCHEDULING, (r==0)||(r==1));
            QUARK_Insert_Task(q, CORE_H2D, &tflags,
                sizeof(int),                &NB,        VALUE,
                sizeof(int),                &NB,        VALUE,
                sizeof(double),             H+r*NB*NB,  INPUT,
                sizeof(int),                &NB,        VALUE,
                sizeof(double),             D+r*NB*NB,  OUTPUT,
                sizeof(int),                &NB,        VALUE,
                0);
            QUARK_Insert_Task(q, CORE_D2H, &tflags,
                sizeof(int),                &NB,        VALUE,
                sizeof(int),                &NB,        VALUE,
                sizeof(double),             D+r*NB*NB,  INPUT,
                sizeof(int),                &NB,        VALUE,
                sizeof(double),             H+r*NB*NB,  OUTPUT,
                sizeof(int),                &NB,        VALUE,
                0);
        }
    }
    QUARK_Barrier(q);
    offload_Free(D, 0);
    free(H);
}
Exemple #2
0
/***************************************************************************//**
 *  Parallel Reduction from BAND tridiagonal to the final condensed form - dynamic scheduler
 **/
void plasma_pzhbrdt_quark(PLASMA_enum uplo,
                          PLASMA_desc A, double *D, double *E, PLASMA_desc T,
                          PLASMA_sequence *sequence, PLASMA_request *request)
{
    plasma_context_t *plasma;
    Quark_Task_Flags task_flags = Quark_Task_Flags_Initializer;

#ifdef COMPLEX
    static PLASMA_Complex64_t zone  = (PLASMA_Complex64_t) 1.0;
    static double             dzero = (double) 0.0;
    PLASMA_Complex64_t ztmp;
    double absztmp;
#endif

    PLASMA_Complex64_t *C, *S;
    int blksweep, lcsweep, blkid, lcNB;
    int N, NB, NT, grsiz, lcgrsiz;
    int i;
    size_t eltsize = plasma_element_size(A.dtyp);

    plasma = plasma_context_self();
    if (sequence->status != PLASMA_SUCCESS)
        return;

    QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);

    NT = A.nt;
    N  = A.m;
    NB = A.mb;

    /* Quick return */
    if (N == 0){
        return;
    }

    if (NB == 0) {
        memset(D, 0,     N*sizeof(double));
        memset(E, 0, (N-1)*sizeof(double));
#ifdef COMPLEX
        for (i=0; i<N; i++)
            D[i]  = cabs(*A(i,i));
#else
        for (i=0; i<N; i++)
          D[i]  = *A(i,i);
#endif
        return;
    }

    /*
     * Barrier is used because the bulge have to wait until
     * the reduction to band has been finish.
     * otherwise, I can remove this BARRIER when I integrate
     * the function dependencies link inside the reduction to
     * band. Keep in min the case when NB=1, where no bulge-chasing.
     */
    /***************************************************************/
    QUARK_Barrier(plasma->quark);
    tblg   = -Wtimming();
    /***************************************************************/

    /*
     * Case NB=1 ==> matrix is already Bidiagonal. no need to bulge.
     * Make diagonal and superdiagonal elements real, storing them in
     * D and E. if PlasmaLower, first transform lower bidiagonal form
     * to upper bidiagonal by applying plane rotations/ Householder
     * from the left, overwriting superdiagonal elements then make
     * elements real of the resulting upper Bidiagonal. if PlasmaUpper
     * then make its elements real.  For Q, PT: ZSCAL should be done
     * in case of WANTQ.
     */
    if (NB == 1){
        memset(D, 0, N    *sizeof(double));
        memset(E, 0, (N-1)*sizeof(double));
#ifdef COMPLEX
        if(uplo==PlasmaLower){
            for (i=0; i<N; i++)
            {
                D[i] = creal( *A(i, i) );               /* diag value */
                if( i < (N-1)) {                            /* lower off-diag value */
                    ztmp        = *A((i+1),i);
                    absztmp     = cabs(ztmp);
                    *A((i+1),i) = absztmp;
                    E[i]        = absztmp;
                    if(absztmp != dzero)
                        ztmp = (PLASMA_Complex64_t) (ztmp / absztmp);
                    else
                        ztmp = zone;
                    if(i<(N-2)) *A((i+2),(i+1)) = *A((i+2),(i+1)) * ztmp;
                    /* for Q: ZSCAL should be done in case of WANTQ */
                }
            }
        } else { /* PlasmaUpper */
            for (i=0; i<N; i++)
            {
                D[i]  =  creal( *A(i,i) );               /* diag value*/
                if(i<(N-1)) {                            /* lower off-diag value */
                    ztmp        = *A(i, (i+1));
                    absztmp     = cabs(ztmp);
                    *A(i,(i+1)) = absztmp;
                    E[i]        = absztmp;
                    if(absztmp != dzero)
                        ztmp = (PLASMA_Complex64_t) (ztmp / absztmp);
                    else
                        ztmp = zone;
                    if(i<(N-2)) *A((i+1),(i+2)) = *A((i+1),(i+2)) * ztmp;
                    /* for Q: ZSCAL should be done in case of WANTQ. HERE NEED THE multiply by CONJ(T) */
                }
            }
        } /* end PlasmaUpper*/
#else
       if( uplo == PlasmaLower ){
           for (i=0; i < N-1; i++) {
               D[i] = *A(i,   i);
               E[i] = *A(i+1, i);
           }
           D[i] = *A(i, i);
       } else {
           for (i=0; i < N-1; i++) {
               D[i] = *A(i, i  );
               E[i] = *A(i, i+1);
           }
           D[i] = *A(i, i);
       }
#endif
       return;
    }

    /* Case N<NB ==> matrix is very small and better to call lapack XHETRD. */
    if( N <= 0 ) /* this will be removed we don t need it. */
    {
        PLASMA_Complex64_t *work, *TTau;
        int info, ldwork = N*N;
        work = (PLASMA_Complex64_t *) plasma_shared_alloc(plasma, ldwork, PlasmaComplexDouble);
        TTau = (PLASMA_Complex64_t *) plasma_shared_alloc(plasma, N,   PlasmaComplexDouble);

        info = LAPACKE_zhetrd_work(LAPACK_COL_MAJOR, lapack_const(uplo), N,
                                   A(0,0), A.lm, D, E, TTau, work, ldwork);
        plasma_shared_free(plasma, (void*) work);
        plasma_shared_free(plasma, (void*) TTau);

        if( info == 0 )
            sequence->status = PLASMA_SUCCESS;
        else
            plasma_sequence_flush(plasma->quark, sequence, request, info);
        return;
    }

    /* General case NB > 1 && N > NB */
    C   = (PLASMA_Complex64_t *) plasma_shared_alloc(plasma, N,   PlasmaComplexDouble);
    S   = (PLASMA_Complex64_t *) plasma_shared_alloc(plasma, N,   PlasmaComplexDouble);

    /***************************************************************************
     *                       START BULGE CHASING CODE
     **************************************************************************/
    /*
     * Initialisation of local parameter. those parameter should be
     * input or tuned parameter.
     */
    grsiz = 1;
    if( NB > 160 ) {
        grsiz = 1;
    }
    else if( NB > 100 ) {
        grsiz = 1;
        /*
        if( N < 5000 )
            grsiz = 1;
        else
            grsiz = 2;
        */
    } else {
        grsiz = 2;
    }
    grsiz = max(1, grsiz);

    /*grsiz=1;*/
    /*printf("  Version -dp- N %5d   NB %5d    lcNB %5d  grsiz %5d    A.ln %5d   A.nb %5d \n",N,NB,lcNB,grsiz,A.ln,A.nb);*/
    for (blksweep = 0; blksweep<NT; blksweep++){
       lcNB =  blksweep == NT-1 ? A.n-blksweep*A.nb : A.nb;
       /*printf("  Version -dp- N %5d   NB %5d    lcNB %5d  grsiz %5d blksweep%5d     NT %5d \n",N,NB,lcNB,grsiz,blksweep,NT);*/
       for (lcsweep = 0; lcsweep<lcNB; lcsweep++){
          for (blkid = blksweep; blkid<NT; blkid=blkid+grsiz){
              lcgrsiz = (blkid+1) < NT ? grsiz : NT-blkid;
              /*printf("  Version -dp- N %5d   NB %5d    lcNB %5d  grsiz %5d lcgrsiz %5d  blkid %5d \n",N,NB,lcNB,grsiz,lcgrsiz,blkid);*/
              QUARK_CORE_ztrdalg_v2(
                        plasma->quark, &task_flags,
                        uplo, &A, C, S,
                        lcgrsiz, lcsweep, blkid, blksweep);
             }
       }
    }
    /*
     * Barrier used only for now, to be sure that everything
     * is done before copying the D and E and free workspace.
     * this will be removed later when D and E are directly filled
     * during the bulge process.
     */
    QUARK_Barrier(plasma->quark);
    tblg   += Wtimming();
    printf("   done with bulge %lf \n\n\n",tblg);

    plasma_shared_free(plasma, (void*) C);
    plasma_shared_free(plasma, (void*) S);

    /*
     * STORE THE RESULTING diagonal/off-diagonal in D AND E
     */
    memset(D, 0,  N   *sizeof(double));
    memset(E, 0, (N-1)*sizeof(double));
    /* Make diagonal and superdiagonal elements real,
     * storing them in D and E
     */
    /* In complex case, the off diagonal element are
     * not necessary real. we have to make off-diagonal
     * elements real and copy them to E.
     * When using HouseHolder elimination,
     * the ZLARFG give us a real as output so, all the
     * diagonal/off-diagonal element except the last one are already
     * real and thus we need only to take the abs of the last
     * one.
     *  */
#ifdef COMPLEX
    if(uplo==PlasmaLower){
       for (i=0; i < N-1 ; i++)
       {
          D[i] = creal( *A(i,i) );
          /*
           * Alternative for Householder case, all off-diag
           * are real except the last off-diag, where we
           * have to take the abs
           */
          if(i<(N-2))
              E[i] = creal(*A(i+1, i));
          else
              E[i] = cabs( *A(i+1, i));
       }
        D[i] = creal( *A(i, i) );
    } else { /* PlasmaUpper */
        for (i=0; i<N-1; i++)
        {
            D[i]  =  creal( *A(i,i) );
            /*
             * Alternative for Householder case, all off-diag
             * are real except the last off-diag, where we
             * have to take the abs
             */
            if( i < (N-2) )
                E[i] = creal(*A(i, (i+1)));
            else
                E[i] = cabs(*A(i, (i+1)));
        }
        D[i] = creal( *A(i, i) );
    } /* end PlasmaUpper */
#else
    if( uplo == PlasmaLower ){
        for (i=0; i < N-1; i++) {
            D[i] = *A(i,   i);
            E[i] = *A(i+1, i);
        }
        D[i] = *A(i, i);
    } else {
        for (i=0; i < N-1; i++) {
            D[i] = *A(i, i  );
            E[i] = *A(i, i+1);
        }
        D[i] = *A(i, i);
    }
#endif

} /* END FUNCTION */
Exemple #3
0
/***************************************************************************//**
 *  Parallel Reduction from BAND tridiagonal to the final condensed form - dynamic scheduler
 **/
void plasma_pdsbrdt_quark(PLASMA_enum uplo,
                          PLASMA_desc A, double *D, double *E, PLASMA_desc T,
                          PLASMA_sequence *sequence, PLASMA_request *request)
{
    plasma_context_t *plasma;
    Quark_Task_Flags task_flags = Quark_Task_Flags_Initializer;

#ifdef COMPLEX
    static double zone  = (double) 1.0;
    static double             dzero = (double) 0.0;
    double ztmp;
    double absztmp;
#endif

    double *C, *S;
    int N, NB, INgrsiz, INthgrsiz, BAND;
    int myid, grsiz, shift=3, stt, st, ed, stind, edind;
    int blklastind, colpt, PCOL, ACOL, MCOL;
    int stepercol, mylastid, grnb, grid;
    int *DEP,*MAXID;
    int i, j, m;
    int thgrsiz, thgrnb, thgrid, thed;
    size_t eltsize = plasma_element_size(A.dtyp);

    plasma = plasma_context_self();
    if (sequence->status != PLASMA_SUCCESS)
        return;

    QUARK_Task_Flag_Set(&task_flags, TASK_SEQUENCE, (intptr_t)sequence->quark_sequence);

    N  = A.m;
    NB = A.mb;

    /* Quick return */
    if (N == 0){
        return;
    }
    
    if (NB == 0) {
        memset(D, 0,     N*sizeof(double));
        memset(E, 0, (N-1)*sizeof(double));
#ifdef COMPLEX
        for (i=0; i<N; i++)
            D[i]  = fabs(*A(i,i));
#else
        for (i=0; i<N; i++)
          D[i]  = *A(i,i);
#endif
        return;
    }

    /*
     * Barrier is used because the bulge have to wait until
     * the reduction to band has been finish.
     * otherwise, I can remove this BARRIER when I integrate
     * the function dependencies link inside the reduction to
     * band. Keep in min the case when NB=1, where no bulge-chasing.
     */
    /***************************************************************/
    QUARK_Barrier(plasma->quark);
    tblg   = -Wtimming();
    /***************************************************************/

    /*
     * Case NB=1 ==> matrix is already Bidiagonal. no need to bulge.
     * Make diagonal and superdiagonal elements real, storing them in
     * D and E. if PlasmaLower, first transform lower bidiagonal form
     * to upper bidiagonal by applying plane rotations/ Householder
     * from the left, overwriting superdiagonal elements then make
     * elements real of the resulting upper Bidiagonal. if PlasmaUpper
     * then make its elements real.  For Q, PT: ZSCAL should be done
     * in case of WANTQ.
     */
    if (NB == 1){
        memset(D, 0, N    *sizeof(double));
        memset(E, 0, (N-1)*sizeof(double));
#ifdef COMPLEX
        if(uplo==PlasmaLower){
            for (i=0; i<N; i++)
            {
                D[i] = ( *A(i, i) );               /* diag value */
                if( i < (N-1)) {                            /* lower off-diag value */
                    ztmp        = *A((i+1),i);
                    absztmp     = fabs(ztmp);
                    *A((i+1),i) = absztmp;
                    E[i]        = absztmp;
                    if(absztmp != dzero)
                        ztmp = (double) (ztmp / absztmp);
                    else
                        ztmp = zone;
                    if(i<(N-2)) *A((i+2),(i+1)) = *A((i+2),(i+1)) * ztmp;
                    /* for Q: ZSCAL should be done in case of WANTQ */
                }
            }
        } else { /* PlasmaUpper */
            for (i=0; i<N; i++)
            {
                D[i]  =  ( *A(i,i) );               /* diag value*/
                if(i<(N-1)) {                            /* lower off-diag value */
                    ztmp        = *A(i, (i+1));
                    absztmp     = fabs(ztmp);
                    *A(i,(i+1)) = absztmp;
                    E[i]        = absztmp;
                    if(absztmp != dzero)
                        ztmp = (double) (ztmp / absztmp);
                    else
                        ztmp = zone;
                    if(i<(N-2)) *A((i+1),(i+2)) = *A((i+1),(i+2)) * ztmp;
                    /* for Q: ZSCAL should be done in case of WANTQ. HERE NEED THE multiply by CONJ(T) */
                }
            }
        } /* end PlasmaUpper*/
#else
       if( uplo == PlasmaLower ){
           for (i=0; i < N-1; i++) {
               D[i] = *A(i,   i);
               E[i] = *A(i+1, i);
           }
           D[i] = *A(i, i);
       } else {
           for (i=0; i < N-1; i++) {
               D[i] = *A(i, i  );
               E[i] = *A(i, i+1);
           }
           D[i] = *A(i, i);
       }
#endif
       return;
    }

    /* Case N<NB ==> matrix is very small and better to call lapack XHETRD. */
    if( N <= 0 ) /* this will be removed we don t need it. */
    {
        double *work, *TTau;
        int info, ldwork = N*N;
        work = (double *) plasma_shared_alloc(plasma, ldwork, PlasmaRealDouble);
        TTau = (double *) plasma_shared_alloc(plasma, N,   PlasmaRealDouble);
        
        info = LAPACKE_dsytrd_work(LAPACK_COL_MAJOR, lapack_const(uplo), N,
                                   A(0,0), A.lm, D, E, TTau, work, ldwork);
        plasma_shared_free(plasma, (void*) work);
        plasma_shared_free(plasma, (void*) TTau);

        if( info == 0 )
            sequence->status = PLASMA_SUCCESS;
        else
            plasma_sequence_flush(plasma->quark, sequence, request, info);
        return;
    }

    /* General case NB > 1 && N > NB */
    DEP   = (int *)                plasma_shared_alloc(plasma, N+1, PlasmaInteger      );
    MAXID = (int *)                plasma_shared_alloc(plasma, N+1, PlasmaInteger      );
    C     = (double *) plasma_shared_alloc(plasma, N,   PlasmaRealDouble);
    S     = (double *) plasma_shared_alloc(plasma, N,   PlasmaRealDouble);
    memset(MAXID,0,(N+1)*sizeof(int));

    /***************************************************************************
     *                       START BULGE CHASING CODE
     **************************************************************************/
    /* 
     * Initialisation of local parameter. those parameter should be
     * input or tuned parameter.
     */
    INgrsiz = 1;
    if( NB > 160 ) {
        INgrsiz = 2;
    }
    else if( NB > 100 ) {
        if( N < 5000 )
            INgrsiz = 2;
        else
            INgrsiz = 4;
    } else {
        INgrsiz = 6;
    }
    INthgrsiz = N;
    BAND      = 0;

    grsiz   = INgrsiz;
    thgrsiz = INthgrsiz;
    if( grsiz   == 0 ) grsiz   = 6;
    if( thgrsiz == 0 ) thgrsiz = N;

    i = shift/grsiz;
    stepercol =  i*grsiz == shift ? i:i+1;

    i       = (N-2)/thgrsiz;
    thgrnb  = i*thgrsiz == (N-2) ? i:i+1;

    for (thgrid = 1; thgrid<=thgrnb; thgrid++){
        stt  = (thgrid-1)*thgrsiz+1;
        thed = min( (stt + thgrsiz -1), (N-2));
        for (i = stt; i <= N-2; i++){
            ed=min(i,thed);
            if(stt>ed)break;
            for (m = 1; m <=stepercol; m++){
                st=stt;
                for (j = st; j <=ed; j++){
                    /* PCOL:  dependency on the ID of the master of the group of the previous column.  (Previous Column:PCOL). */
                    /* ACOL:  dependency on the ID of the master of the previous group of my column.   (Acctual  Column:ACOL). (it is 0(NULL) for myid=1) */
                    /* MCOL:  OUTPUT dependency on the my ID, to be used by the next ID. (My Column: MCOL). I am the master of this group. */
                    myid     = (i-j)*(stepercol*grsiz) +(m-1)*grsiz + 1;
                    mylastid = myid+grsiz-1;
                    PCOL     = mylastid+shift-1;  /* to know the dependent ID of the previous column. need to know the master of its group */
                    MAXID[j] = myid;
                    PCOL     = min(PCOL,MAXID[j-1]); /* for the last columns, we might do only 1 or 2 kernel, so the PCOL will be wrong. this is to force it to the last ID of the previous col.*/
                    grnb     = PCOL/grsiz;
                    grid     = grnb*grsiz == PCOL ? grnb:grnb+1;
                    PCOL     = (grid-1)*grsiz +1; /* give me the ID of the master of the group of the previous column. */
                    ACOL     = myid-grsiz;
                    if(myid==1)ACOL=0;
                    MCOL     = myid;
                    
                    QUARK_CORE_dtrdalg(
                        plasma->quark, &task_flags,
                        uplo, N, NB,
                        &A, C, S, i, j, m, grsiz, BAND,
                        DEP(PCOL), DEP(ACOL), DEP(MCOL) );
                    
                    if(mylastid%2 ==0){
                        blklastind      = (mylastid/2)*NB+1+j-1;
                    }else{
                        colpt      = ((mylastid+1)/2)*NB + 1 +j -1 ;
                        stind      = colpt-NB+1;
                        edind      = min(colpt,N);
                        if( (stind>=edind-1) && (edind==N) )
                            blklastind=N;
                        else
                            blklastind=0;
                    }
                    if(blklastind >= (N-1))  stt=stt+1;
                } /* END for j=st:ed    */
            } /* END for m=1:stepercol */
        } /* END for i=1:MINMN-2      */
    } /* END for thgrid=1:thgrnb     */
    
    /*
     * Barrier used only for now, to be sure that everything
     * is done before copying the D and E and free workspace.
     * this will be removed later when D and E are directly filled
     * during the bulge process.
     */
    QUARK_Barrier(plasma->quark);
    tblg   += Wtimming();
    //printf("   done with bulge %lf \n\n\n",tblg);

    plasma_shared_free(plasma, (void*) DEP);
    plasma_shared_free(plasma, (void*) MAXID);
    plasma_shared_free(plasma, (void*) C);
    plasma_shared_free(plasma, (void*) S);

    /*
     * STORE THE RESULTING diagonal/off-diagonal in D AND E
     */
    memset(D, 0,  N   *sizeof(double));
    memset(E, 0, (N-1)*sizeof(double));
    /* Make diagonal and superdiagonal elements real,
     * storing them in D and E
     */
    /* In complex case, the off diagonal element are
     * not necessary real. we have to make off-diagonal
     * elements real and copy them to E.
     * When using HouseHolder elimination,
     * the ZLARFG give us a real as output so, all the
     * diagonal/off-diagonal element except the last one are already
     * real and thus we need only to take the abs of the last
     * one.
     *  */
#ifdef COMPLEX
    if(uplo==PlasmaLower){
       for (i=0; i < N-1 ; i++)
       {
          D[i] = ( *A(i,i) );               
          /*
           * Alternative for Householder case, all off-diag
           * are real except the last off-diag, where we
           * have to take the abs
           */
          if(i<(N-2))
              E[i] = (*A(i+1, i));
          else
              E[i] = fabs( *A(i+1, i));
       }
        D[i] = ( *A(i, i) );
    } else { /* PlasmaUpper */
        for (i=0; i<N-1; i++)
        {
            D[i]  =  ( *A(i,i) );               
            /*
             * Alternative for Householder case, all off-diag
             * are real except the last off-diag, where we
             * have to take the abs
             */
            if( i < (N-2) )
                E[i] = (*A(i, (i+1)));
            else
                E[i] = fabs(*A(i, (i+1)));
        }
        D[i] = ( *A(i, i) );
    } /* end PlasmaUpper */
#else
    if( uplo == PlasmaLower ){
        for (i=0; i < N-1; i++) {
            D[i] = *A(i,   i);
            E[i] = *A(i+1, i);
        }
        D[i] = *A(i, i);
    } else {
        for (i=0; i < N-1; i++) {
            D[i] = *A(i, i  );
            E[i] = *A(i, i+1);
        }
        D[i] = *A(i, i);
    }
#endif

} /* END FUNCTION */
Exemple #4
0
double Cholesky(Quark *quark, double *A, int N, int NB, int LDA, size_t memsize) 
{
    #define A(ib,jb) A[(size_t)(jb)*NB*LDA+(ib)*NB]

    #ifndef USE_MIC
    cublasStatus cu_status;
    #endif
         
    int bb = (N + NB - 1) / NB;
    int YM, YN;
    int Ym, Yn;
    int JB;
    int jb, jjb;
    int memBlock = memsize/sizeof(double)/NB/NB;
    double *X, *Y;

    #ifdef USE_MIC
        Y = (double*) offload_Alloc((size_t)memBlock*NB*NB*sizeof(double), 0);
        assert(Y != NULL);
    #else
        #ifdef USE_CUBLASV2
        {
            cudaError_t ierr;
            ierr = cudaMalloc((void **) &Y, (size_t) memBlock*NB*NB*sizeof(double));
            assert(ierr == cudaSuccess);
        }
        #else
            cu_status = cublasAlloc((size_t) memBlock*NB*NB, sizeof(double), (void **) &Y);
            CHKERR(cu_status);
        #endif
    #endif
    
    double t1;
    double llttime = MPI_Wtime();

    
    /*--------------------------------------*/   

    /*     The main Ypanel loop     */

//  QUARK_Barrier(quark);
    for (JB = 0, jb = 0; JB < N; JB+=YN, jb+=Yn)
    {
        //determine size of Ypanel
        Ym = bb - jb;
        Yn = find_Yn(bb, memBlock, jb);
        YM = N - JB;
        YN = MIN((jb+Yn)*NB, N) - jb*NB;
        X = Y + (size_t)(memBlock-Ym)*NB*NB;
        printf("bb %d jb %d YM %d YN %d Ym %d Yn %d Y %p X %p\n", bb, jb, YM, YN, Ym, Yn, Y, X);

        /* Copy in data */
        A2Y(quark, &A(jb,jb), Y, LDA, NB, YM, YN);

        /* Left-looking */
        for(jjb = 0; jjb < jb; jjb++){
            /* copy from A to X */
            A2X(quark, &A(jb,jjb), LDA, X, NB, YM);
            ooc_syrk(quark, X, Y, YM, YN, NB);
        }

        /* incore factorization */
        ooc_incore(quark, &A(jb,jb), Y, LDA, NB, YM, YN);
    
        /* Copy out data */
//      Y2A(quark, Y, &A(jb,jb), LDA, NB, YM, YN);
//      QUARK_Barrier(quark); // reduce parallelism
//      goto oasdfh; // early stop

    }
oasdfh:
    QUARK_Barrier(quark);
    llttime = MPI_Wtime() - llttime;
    printf("llt time %lf %lf\n", llttime, MPI_Wtime());
    printf("%lf %lf\n", A[(N-1)*LDA+N-1], MPI_Wtime());
    /*--------------------------------------*/   

    #ifdef USE_MIC
        offload_Free(Y,0);
    #else
        #ifdef USE_CUBLASV2
        {
            cudaError_t ierr;
            ierr = cudaFree((void *) Y);
            assert(ierr == cudaSuccess);
            Y = 0;
        }
        #else
            cu_status = cublasFree(Y);
            CHKERR(cu_status);
        #endif
    #endif
    return llttime;
    #undef A
} 
Exemple #5
0
extern "C" magma_int_t 
magma_zgetrf_mc(magma_context *cntxt,
        int *m, int *n,
        cuDoubleComplex *a, int *lda,
        int *ipiv, int *info)
{
/*  -- MAGMA (version 1.6.1) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       @date January 2015

    Purpose   
    =======   
    ZGETRF computes an LU factorization of a general COMPLEX_16 
    M-by-N matrix A using partial pivoting with row interchanges.   

    The factorization has the form   
       A = P * L * U   
    where P is a permutation matrix, L is lower triangular with unit   
    diagonal elements (lower trapezoidal if m > n), and U is upper   
    triangular (upper trapezoidal if m < n).   

    This is the right-looking Level 3 BLAS version of the algorithm.   

    Arguments   
    =========   
    CNTXT   (input) MAGMA_CONTEXT
            CNTXT specifies the MAGMA hardware context for this routine.   

    M       (input) INTEGER   
            The number of rows of the matrix A.  M >= 0.   

    N       (input) INTEGER   
            The number of columns of the matrix A.  N >= 0.   

    A       (input/output) COMPLEX_16 array, dimension (LDA,N)   
            On entry, the M-by-N matrix to be factored.   
            On exit, the factors L and U from the factorization   
            A = P*L*U; the unit diagonal elements of L are not stored.   

    LDA     (input) INTEGER   
            The leading dimension of the array A.  LDA >= max(1,M).   

    IPIV    (output) INTEGER array, dimension (min(M,N))   
            The pivot indices; for 1 <= i <= min(M,N), row i of the   
            matrix was interchanged with row IPIV(i).   

    INFO    (output) INTEGER   
            = 0:  successful exit   
            < 0:  if INFO = -i, the i-th argument had an illegal value   
            > 0:  if INFO = i, U(i,i) is exactly zero. The factorization   
                  has been completed, but the factor U is exactly   
                  singular, and division by zero will occur if it is used   
                  to solve a system of equations.   
    =====================================================================    */

    if (cntxt->num_cores == 1 && cntxt->num_gpus == 1)
      {
    //int result = magma_zgetrf(*m, *n, a, *lda, ipiv, info);
    //return result;
      }
    
    int EN_BEE   = cntxt->nb;
    Quark* quark = cntxt->quark;

    int i,j,l;
    int ii,jj,ll;

    void *fakedep;

    int ione=1;

    cuDoubleComplex fone = MAGMA_Z_ONE;
    cuDoubleComplex mone = MAGMA_Z_NEG_ONE;

    int M,N,MM,NN,MMM,K;

    int priority=0;

    *info = 0;
    
    int nb = (EN_BEE==-1)? magma_get_zpotrf_nb(*n): EN_BEE;

    /* Check arguments */
    if (*m < 0) {
      *info = -1;
    } else if (*n < 0) {
      *info = -2;
    } else if (*lda < max(1,*m)) {
      *info = -4;
    }
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return MAGMA_ERR_ILLEGAL_VALUE;
    }
    
    int k = min(*m,*n);

    int iinfo[2];
    iinfo[1] = 0;

    char label[10000];
    
    ii = -1;
    
    /* Loop across diagonal blocks */
    for (i = 0; i < k; i += nb) 
      {
    ii++;

    jj = -1;

    priority = 10000 - ii;

    /* Update panels in left looking fashion */
    for (j = 0; j < i; j += nb) 
      { 
        jj++;

        NN=min(nb,(*n)-i);
        MM=min(nb,(*m)-j);

        l = j + nb;

        MMM = min(nb,(*m)-l);

        sprintf(label, "UPDATE %d %d", ii, jj);
        
        QUARK_Insert_Task(quark, SCHED_panel_update, 0,
                  sizeof(int),             &NN,      VALUE,
                  sizeof(cuDoubleComplex)*(*m)*(*n), A(j,i),   INOUT,
                  sizeof(int),             lda,        VALUE,
                  sizeof(int),             &MM,      VALUE,
                  sizeof(cuDoubleComplex)*nb,        &ipiv[j], INPUT,
                  sizeof(cuDoubleComplex)*(*m)*(*n), A(j,j),   INPUT,
                  sizeof(int),             &MMM,     VALUE,
                  sizeof(int),             &nb,      VALUE,
                  sizeof(cuDoubleComplex)*(*m)*(*n), A(l,j),   INPUT,
                  sizeof(cuDoubleComplex)*(*m)*(*n), A(l,i),   INOUT,
                  sizeof(int),             &priority,VALUE | TASK_PRIORITY,
                  sizeof(cuDoubleComplex)*(*m)*(*n), A(i,i),   OUTPUT,
                  strlen(label)+1,         label,    VALUE | TASKLABEL,
                  5,                       "cyan",   VALUE | TASKCOLOR,
                  0);
        
        ll = jj + 1;
        
        /* Split gemm into tiles */
        for (l = j + (2*nb); l < (*m); l += nb) 
          {
        ll++;
        
        MMM = min(nb,(*m)-l);
        
        fakedep = (void *)(intptr_t)(j+1);
        
        sprintf(label, "GEMM %d %d %d", ii, jj, ll);
        
        QUARK_Insert_Task(quark, SCHED_zgemm, 0,
                  sizeof(int),             &MMM,     VALUE,
                  sizeof(int),             &NN,      VALUE,
                  sizeof(int),             &nb,      VALUE,
                  sizeof(cuDoubleComplex)*(*m)*(*n), A(l,j),   INPUT,
                  sizeof(int),             lda,        VALUE,
                  sizeof(cuDoubleComplex)*(*m)*(*n), A(j,i),   INPUT,
                  sizeof(cuDoubleComplex)*(*m)*(*n), A(l,i),   INOUT,
                  sizeof(int),             &priority,VALUE | TASK_PRIORITY,
                  sizeof(cuDoubleComplex)*(*m)*(*n), A(i,i),   OUTPUT | GATHERV,
                  sizeof(void*),           fakedep,  OUTPUT | GATHERV,
                  strlen(label)+1,         label,    VALUE | TASKLABEL,
                  5,                       "blue",   VALUE | TASKCOLOR,
                  0);
        
          }  
        
      }
    
    M=(*m)-i;
    N=min(nb,(*n)-i);
    
    iinfo[0] = i;
    
    sprintf(label, "GETRF %d", ii);
    
    QUARK_Insert_Task(quark, SCHED_zgetrf, 0,
              sizeof(int),             &M,       VALUE,
              sizeof(int),             &N,       VALUE,
              sizeof(cuDoubleComplex)*(*m)*(*n), A(i,i),   INOUT,
              sizeof(int),             lda,        VALUE,
              sizeof(cuDoubleComplex)*nb,        &ipiv[i], OUTPUT,
              sizeof(int),             iinfo,    OUTPUT,
              sizeof(int),             &priority,VALUE | TASK_PRIORITY,
              strlen(label)+1,         label,    VALUE | TASKLABEL,
              6,                       "green",  VALUE | TASKCOLOR,
              0);
    
      }
    
    K = (*m)/nb;
    
    if ((K*nb)==(*m)) {
      ii = K - 1;
      K = *m;
    } else {
      ii = k;
      K = (K+1)*nb;
    }
    
    priority = 0;
    
    /* If n > m */
    for (i = K; i < (*n); i += nb) 
      {
    ii++;
    
    jj = -1;
    
    /* Update remaining panels in left looking fashion */
    for (j = 0; j < (*m); j += nb) 
      { 
        jj++;

        NN=min(nb,(*n)-i);
        MM=min(nb,(*m)-j);
        
        l = j + nb;
        
        MMM = min(nb,(*m)-l);
        
        sprintf(label, "UPDATE %d %d", ii, jj);
        
        QUARK_Insert_Task(quark, SCHED_panel_update, 0,
                  sizeof(int),             &NN,      VALUE,
                  sizeof(cuDoubleComplex)*(*m)*(*n), A(j,i),   INOUT,
                  sizeof(int),             lda,        VALUE,
                  sizeof(int),             &MM,      VALUE,
                  sizeof(cuDoubleComplex)*nb,        &ipiv[j], INPUT,
                  sizeof(cuDoubleComplex)*(*m)*(*n), A(j,j),   INPUT,
                  sizeof(int),             &MMM,     VALUE,
                  sizeof(int),             &nb,      VALUE,
                  sizeof(cuDoubleComplex)*(*m)*(*n), A(l,j),   INPUT,
                  sizeof(cuDoubleComplex)*(*m)*(*n), A(l,i),   INOUT,
                  sizeof(int),             &priority,VALUE | TASK_PRIORITY,
                  sizeof(cuDoubleComplex)*(*m)*(*n), A(i,i),   OUTPUT,
                  strlen(label)+1,         label,    VALUE | TASKLABEL,
                  5,                       "cyan",   VALUE | TASKCOLOR,
                  0);
        
        ll = jj + 1;
        
        /* Split gemm into tiles */
        for (l = j + (2*nb); l < (*m); l += nb) {
          
          ll++;
          
          MMM = min(nb,(*m)-l);
          
          fakedep = (void *)(intptr_t)(j+1);
          
          sprintf(label, "GEMM %d %d %d", ii, jj, ll);
          
          QUARK_Insert_Task(quark, SCHED_zgemm, 0,
                sizeof(int),             &MMM,     VALUE,
                sizeof(int),             &NN,      VALUE,
                sizeof(int),             &nb,      VALUE,
                sizeof(cuDoubleComplex)*(*m)*(*n), A(l,j),   INPUT,
                sizeof(int),             lda,        VALUE,
                sizeof(cuDoubleComplex)*(*m)*(*n), A(j,i),   INPUT,
                sizeof(cuDoubleComplex)*(*m)*(*n), A(l,i),   INOUT,
                sizeof(int),             &priority,VALUE | TASK_PRIORITY,
                sizeof(cuDoubleComplex)*(*m)*(*n), A(i,i),   OUTPUT | GATHERV,
                sizeof(void*),           fakedep,  OUTPUT | GATHERV,
                strlen(label)+1,         label,    VALUE | TASKLABEL,
                5,                       "blue",   VALUE | TASKCOLOR,
                0);
          
        }
        
      }
    
      }
    
    ii = -1;
    
    /* Swap behinds */
    for (i = 0; i < k; i += nb) {
      
      ii++;
      
      jj = -1;
      
      MM = min(nb,(*m)-i);
      MM = min(MM,(*n)-i);
      
      for (j = 0; j < i; j += nb) {
    
    jj++;
    
    fakedep = (void *)(intptr_t)(j+1);
    
    sprintf(label, "LASWPF %d %d", ii, jj);
    
    QUARK_Insert_Task(quark, SCHED_zlaswp, 0,
              sizeof(int),             &nb,       VALUE,
              sizeof(cuDoubleComplex)*(*m)*(*n), A(i,j),    INOUT,
              sizeof(int),             lda,         VALUE,
              sizeof(int),             &MM,       VALUE,
              sizeof(cuDoubleComplex)*nb,        &ipiv[i],  INPUT,
              sizeof(int),             &priority, VALUE | TASK_PRIORITY,
              sizeof(void*),           fakedep,   INPUT,
              sizeof(cuDoubleComplex)*(*m)*(*n), A(i+nb,j), OUTPUT,
              strlen(label)+1,         label,     VALUE | TASKLABEL,
              7,                       "purple",  VALUE | TASKCOLOR,
              0);
    
      }
      
    }
    
    /* Synchronization point */
    QUARK_Barrier(quark);
    
    /* Fix pivot */
    ii = -1;

    for (i = 0; i < k; i +=nb) {
      ii++;
      for (j = 0; j < min(nb,(k-i)); j++) {
    ipiv[ii*nb+j] += ii*nb;
      } 
    } 
    
    QUARK_Barrier(quark);
    
}
Exemple #6
0
extern "C" magma_int_t 
magma_zpotrf_mc(magma_context *cntxt, char *uplo,
        magma_int_t *n,
        cuDoubleComplex *a, magma_int_t *lda,
        magma_int_t *info)
{
/*  -- MAGMA (version 1.5.0) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       @date September 2014

    Purpose   
    =======   
    ZPOTRF computes the Cholesky factorization of a Hermitian   
    positive definite matrix A.   

    The factorization has the form   
       A = U**T * U,  if UPLO = 'U', or   
       A = L  * L**T,  if UPLO = 'L',   
    where U is an upper triangular matrix and L is lower triangular.   

    This is the block version of the algorithm, calling Level 3 BLAS.   

    Arguments   
    =========   
    CNTXT   (input) MAGMA_CONTEXT
            CNTXT specifies the MAGMA hardware context for this routine.   

    UPLO    (input) CHARACTER*1   
            = 'U':  Upper triangle of A is stored;   
            = 'L':  Lower triangle of A is stored.   

    N       (input) INTEGER   
            The order of the matrix A.  N >= 0.   

    A       (input/output) COMPLEX_16 array, dimension (LDA,N)   
            On entry, the Hermitian matrix A.  If UPLO = 'U', the leading   
            N-by-N upper triangular part of A contains the upper   
            triangular part of the matrix A, and the strictly lower   
            triangular part of A is not referenced.  If UPLO = 'L', the   
            leading N-by-N lower triangular part of A contains the lower   
            triangular part of the matrix A, and the strictly upper   
            triangular part of A is not referenced.   

            On exit, if INFO = 0, the factor U or L from the Cholesky   
            factorization A = U**T*U or A = L*L**T.   

    LDA     (input) INTEGER   
            The leading dimension of the array A.  LDA >= max(1,N).   

    INFO    (output) INTEGER   
            = 0:  successful exit   
            < 0:  if INFO = -i, the i-th argument had an illegal value   
            > 0:  if INFO = i, the leading minor of order i is not   
                  positive definite, and the factorization could not be   
                  completed.   
    =====================================================================   */

  if (cntxt->num_cores == 1 && cntxt->num_gpus == 1)
  {
    //magma_int_t result = magma_zpotrf(*uplo, *n, a, *lda, info);
    //return result;
  }

  // check arguments
  magma_int_t upper = (magma_int_t) lsame_(uplo, "U");                                          
  *info = 0;
  if (! upper && ! lsame_(uplo, "L")) {
    *info = -1;
  } else if (*n < 0) {
    *info = -2;
  } else if (*lda < max(1,*n)) {
    *info = -4;
  }
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return MAGMA_ERR_ILLEGAL_VALUE;
    }

  Quark* quark = cntxt->quark;

  // get block size
  magma_int_t nb = (cntxt->nb ==-1)? magma_get_zpotrf_nb(*n): cntxt->nb;

  magma_int_t i,j,k;
  magma_int_t ii,jj,kk;
  magma_int_t temp,temp2,temp3;

  char label[10000];

  magma_int_t iinfo[2];
  iinfo[1] = 0;
  ii = -1;

  // traverse diagonal blocks
  for (i = 0; i < (*n); i += nb) {
    ii++;
    temp2 = min(nb,(*n)-i);
 
    // if not first block
    if (i > 0) {

      // first do large syrk, then split
      if (i < (*n)/2) {

        sprintf(label, "SYRK %d", ii);

        if (upper) {

          QUARK_Insert_Task(quark, SCHED_zsyrk, 0,
            sizeof(magma_int_t),             &upper,    VALUE,
            sizeof(magma_int_t),             &temp2,    VALUE,
            sizeof(magma_int_t),             &i,        VALUE,
            sizeof(cuDoubleComplex)*(*n)*(*n), A(0,i),    INPUT,
            sizeof(magma_int_t),             lda,       VALUE,
            sizeof(cuDoubleComplex)*(*n)*(*n), A(i,i),    INOUT,
            sizeof(cuDoubleComplex)*(*n)*(*n), A(i-nb,i), INPUT,
            strlen(label)+1,         label,     VALUE | TASKLABEL,
            6,                       "green",   VALUE | TASKCOLOR,
            0);

        } else {

          QUARK_Insert_Task(quark, SCHED_zsyrk, 0,
            sizeof(magma_int_t),             &upper,    VALUE,
            sizeof(magma_int_t),             &temp2,    VALUE,
            sizeof(magma_int_t),             &i,        VALUE,
            sizeof(cuDoubleComplex)*(*n)*(*n), A(i,0),    INPUT,
            sizeof(magma_int_t),             lda,       VALUE,
            sizeof(cuDoubleComplex)*(*n)*(*n), A(i,i),    INOUT,
            sizeof(cuDoubleComplex)*(*n)*(*n), A(i,i-nb), INPUT,
            strlen(label)+1,         label,     VALUE | TASKLABEL,
            6,                       "green",   VALUE | TASKCOLOR,
            0);
        }   

      } else {

        jj = -1;

        // split syrk into tiles
        for (j = 0; j < i; j += nb) {
          jj++;

          sprintf(label, "SYRK %d %d", ii, jj);

          if (upper) {

            QUARK_Insert_Task(quark, SCHED_zsyrk, 0,
              sizeof(magma_int_t),             &upper,    VALUE,
              sizeof(magma_int_t),             &temp2,    VALUE,
              sizeof(magma_int_t),             &nb,       VALUE,
              sizeof(cuDoubleComplex)*(*n)*(*n), A(j,i),    INPUT,
              sizeof(magma_int_t),             lda,       VALUE,
              sizeof(cuDoubleComplex)*(*n)*(*n), A(i,i),    INOUT,
              strlen(label)+1,         label,     VALUE | TASKLABEL,
              6,                       "green",   VALUE | TASKCOLOR,
              0);

          } else {

            QUARK_Insert_Task(quark, SCHED_zsyrk, 0,
              sizeof(magma_int_t),             &upper,    VALUE,
              sizeof(magma_int_t),             &temp2,    VALUE,
              sizeof(magma_int_t),             &nb,       VALUE,
              sizeof(cuDoubleComplex)*(*n)*(*n), A(i,j),    INPUT,
              sizeof(magma_int_t),             lda,       VALUE,
              sizeof(cuDoubleComplex)*(*n)*(*n), A(i,i),    INOUT,
              strlen(label)+1,         label,     VALUE | TASKLABEL,
              6,                       "green",   VALUE | TASKCOLOR,
              0);
          }
        }
      }

      // if not last block
      if (i < ((*n)-nb)) {

        jj = -1;

        // split gemm into tiles
        for (j = i+nb; j < (*n); j += nb){
      jj++;
          kk = -1;

          for (k = 0; k < i; k += nb) {
            kk++;
            temp = min(nb,(*n)-j);

            sprintf(label, "GEMM %d %d %d", ii, jj, kk);

            if (upper) {
              QUARK_Insert_Task(quark, SCHED_zgemm, 0,
                sizeof(magma_int_t),             &upper,    VALUE,
                sizeof(magma_int_t),             &nb,       VALUE,
                sizeof(magma_int_t),             &temp,     VALUE,
                sizeof(magma_int_t),             &nb,       VALUE,
                sizeof(cuDoubleComplex)*(*n)*(*n), A(k,i), INPUT,
                sizeof(magma_int_t),             lda,       VALUE,
                sizeof(cuDoubleComplex)*(*n)*(*n), A(k,j),    INPUT,
                sizeof(cuDoubleComplex)*(*n)*(*n), A(i,j), INOUT,
                strlen(label)+1,         label,     VALUE | TASKLABEL,
                5,                       "blue",    VALUE | TASKCOLOR,
                0);

            } else {

              QUARK_Insert_Task(quark, SCHED_zgemm, 0,
                sizeof(magma_int_t),             &upper,    VALUE,
                sizeof(magma_int_t),             &temp,     VALUE,
                sizeof(magma_int_t),             &nb,       VALUE,
                sizeof(magma_int_t),             &nb,       VALUE,
                sizeof(cuDoubleComplex)*(*n)*(*n), A(j,k), INPUT,
                sizeof(magma_int_t),             lda,       VALUE,
                sizeof(cuDoubleComplex)*(*n)*(*n), A(i,k),    INPUT,
                sizeof(cuDoubleComplex)*(*n)*(*n), A(j,i), INOUT,
                strlen(label)+1,         label,     VALUE | TASKLABEL,
                5,                       "blue",    VALUE | TASKCOLOR,
                0);
            }
          }
        }
      }  
    }

    iinfo[0] = i;

    sprintf(label, "POTRF %d", ii);

    QUARK_Insert_Task(quark, SCHED_zpotrf, 0,
      sizeof(magma_int_t),             &upper,    VALUE,
      sizeof(magma_int_t),             &temp2,    VALUE,
      sizeof(cuDoubleComplex)*(*n)*(*n), A(i,i),    INOUT,
      sizeof(magma_int_t),             lda,       VALUE,
      sizeof(magma_int_t),             iinfo,     OUTPUT,
      strlen(label)+1,         label,     VALUE | TASKLABEL,
      5,                       "cyan",    VALUE | TASKCOLOR,
      0);

    // if not last block
    if (i < ((*n)-nb)) {

      // split trsm into tiles
      for (j = i + nb; j < (*n); j += nb) {

        temp = min(nb,(*n)-j);

        sprintf(label, "TRSM %d", ii);

        if (upper) {

          QUARK_Insert_Task(quark, SCHED_ztrsm, 0,
            sizeof(magma_int_t),             &upper,    VALUE,
            sizeof(magma_int_t),             &nb,       VALUE,
            sizeof(magma_int_t),             &temp,     VALUE,
            sizeof(cuDoubleComplex)*(*n)*(*n), A(i,i),    INPUT,
            sizeof(magma_int_t),             lda,       VALUE,
            sizeof(cuDoubleComplex)*(*n)*(*n), A(i,j),    INOUT,
            strlen(label)+1,         label,     VALUE | TASKLABEL,
            4,                       "red",     VALUE | TASKCOLOR,
            0);

        } else {

          QUARK_Insert_Task(quark, SCHED_ztrsm, 0,
            sizeof(magma_int_t),             &upper,    VALUE,
            sizeof(magma_int_t),             &temp,     VALUE,
            sizeof(magma_int_t),             &nb,       VALUE,
            sizeof(cuDoubleComplex)*(*n)*(*n), A(i,i),    INPUT,
            sizeof(magma_int_t),             lda,       VALUE,
            sizeof(cuDoubleComplex)*(*n)*(*n), A(j,i),    INOUT,
            strlen(label)+1,         label,     VALUE | TASKLABEL,
            4,                       "red",     VALUE | TASKCOLOR,
            0);
        }
      } 
    }
  }
  
  QUARK_Barrier(quark);
}
/* Try various ways to do matmul and time them.  Tiled algorithms
 * running serially; multi-threaded QUARK runtime with tiled
 * algorithms; and direct serial computation over standard layout. */
int main_algorithm(int NB, int N, int THREADS)
{
    int i, j, k, nerr=0;
    int BB = N/NB;
    double *A = (double*)malloc(N*N*sizeof(double));
    double *Ablk = (double*)malloc(N*N*sizeof(double));
    double *B = (double*)malloc(N*N*sizeof(double));
    double *Bblk = (double*)malloc(N*N*sizeof(double));
    double *C_direct = (double*)malloc(N*N*sizeof(double));
    double *C = (double*)malloc(N*N*sizeof(double));
    double *Cblk = (double*)malloc(N*N*sizeof(double));
    double *C_quark = (double*)malloc(N*N*sizeof(double));
    double *C_quark_blk = (double*)malloc(N*N*sizeof(double));
    struct timeval tstart, tend, tdiff;
    double t_blk=0, t_quark=0, t_direct=0;

    // Initialize
    for (i = 0; i < N; i++) {
        for (j = 0; j < N; j++) {
            A[i+j*N] = (double)1.0+i;
            B[i+j*N] = (double)2.0+i+j;
            C_quark[i+j*N] = C_direct[i+j*N] = C[i+j*N] = 3.0;
        }
    }

    matrix_print("Printing A", A, N);
    matrix_print("Printing B", B, N);
    matrix_print("Printing C before computation", C, N);

    // Move from F77 to BDL
    std_to_bdl( A, Ablk, N, NB );
    std_to_bdl( B, Bblk, N, NB );
    std_to_bdl( C, Cblk, N, NB );
    std_to_bdl( C_quark, C_quark_blk, N, NB );

    /* ORIGINAL TILED ROUTINE */
    /* This is the code for the serial tile-by-tile multiplication */
    printf("Doing matrix multiplication using serial tile-by-tile algorithm\n");
    gettimeofday( &tstart, NULL );
    for (i = 0; i < BB; i++)
        for (j = 0; j < BB; j++)
            for (k = 0; k < BB; k++)
                matmul ( &Ablk[NB*NB*i + NB*NB*BB*k], &Bblk[NB*NB*k + NB*NB*BB*j], &Cblk[NB*NB*i + NB*NB*BB*j], NB);
    gettimeofday( &tend, NULL );
    t_blk = timeval_subtract( &tdiff, &tend, &tstart );
    printf("Time taken: %f\n", tdiff.tv_sec + (double)tdiff.tv_usec/1000000 );
    bdl_to_std( C, Cblk, N, NB );
    matrix_print("Printing C produced by serial tile-algorithm after computation", C, N);
    printf("\n");

    /* QUARK PARALLEL TILED ROUTINE */
    /* This is the code for the QUARK runtime do do the parallel multi-threaded tile-by-tile algorithm */
    printf("Doing matrix multiplication using the multi-threaded QUARK runtime for a tile based algorithm\n");
    Quark *quark = QUARK_New(THREADS);
    gettimeofday( &tstart, NULL );
    for (i = 0; i < BB; i++)
        for (j = 0; j < BB; j++)
            for (k = 0; k < BB; k++) 
                matmul_quark_call ( quark, &Ablk[NB*NB*i + NB*NB*BB*k], &Bblk[NB*NB*k + NB*NB*BB*j], &C_quark_blk[NB*NB*i + NB*NB*BB*j], NB);
    QUARK_Barrier( quark );
    gettimeofday( &tend, NULL );
    t_quark = timeval_subtract( &tdiff, &tend, &tstart );
    printf("Time taken: %f\n", tdiff.tv_sec + (double)tdiff.tv_usec/1000000 );
    QUARK_Delete(quark);
    bdl_to_std( C_quark, C_quark_blk, N, NB );    
    matrix_print("Printing C produced by QUARK runtime after computation", C_quark, N);
    printf("\n");

    /* DIRECT COMPUTATION OVER STANDARD LAYOUT */
    /* Compute direct C if desired */
    printf("Doing matrix multiplication using direct loops (ie, view matrix as one big tile)\n");
    gettimeofday( &tstart, NULL );
    matmul ( A, B, C_direct, N );
    gettimeofday( &tend, NULL );
    t_direct = timeval_subtract( &tdiff, &tend, &tstart );
    printf("Time taken: %f\n", (double)(tdiff.tv_sec + (double)tdiff.tv_usec/1000000) );
    matrix_print("Printing C produced by direct matmul after computation", C_direct, N);
    printf("\n");

    /* Check for errors */
    printf("Comparing result matrices (direct versus QUARK)\n");
    nerr = matrix_compare( C_direct, C_quark, N );
    printf("Number of differences: %d\n", nerr);    
    printf("\n");    

    printf("Summary of time taken\n");
    printf("Direct       SerialBlock  QUARK(%d threads)\n", THREADS);
    printf("%-12.5f %-12.5f %-12.5f\n", t_direct, t_blk, t_quark);
    
    free(A); free(Ablk); free(B); free(Bblk); free(C); free(Cblk); free(C_direct); free(C_quark); free(C_quark_blk);
    return 0;
}