Пример #1
0
extern "C" magma_int_t
magma_zgeqrf_batched(
    magma_int_t m, magma_int_t n,
    magmaDoubleComplex **dA_array,
    magma_int_t ldda,
    magmaDoubleComplex **tau_array,
    magma_int_t *info_array, magma_int_t batchCount, magma_queue_t queue)
{
#define dA(i, j)  (dA + (i) + (j)*ldda)   // A(i, j) means at i row, j column

    magma_int_t min_mn = min(m, n);
    cudaMemset(info_array, 0, batchCount*sizeof(magma_int_t));

    /* Check arguments */
    magma_int_t arginfo = 0;
    if (m < 0)
        arginfo = -1;
    else if (n < 0)
        arginfo = -2;
    else if (ldda < max(1,m))
        arginfo = -4;

    if (arginfo != 0) {
        magma_xerbla( __func__, -(arginfo) );
        return arginfo;
    }

    /* Quick return if possible */
    if (m == 0 || n == 0)
        if(min_mn == 0 ) return arginfo;

    if( m >  2048 || n > 2048 ) {
        printf("=========================================================================================\n");
        printf("   WARNING batched routines are designed for small sizes it might be better to use the\n   Native/Hybrid classical routines if you want performance\n");
        printf("=========================================================================================\n");
    }


    magma_int_t nb = 32;
    magma_int_t nnb = 8;
    magma_int_t i, k, ib=nb, jb=nnb;
    magma_int_t ldw, ldt, ldr, offset;

    cublasHandle_t myhandle;
    cublasCreate_v2(&myhandle);


    magmaDoubleComplex **dW0_displ = NULL;
    magmaDoubleComplex **dW1_displ = NULL;
    magmaDoubleComplex **dW2_displ = NULL;
    magmaDoubleComplex **dW3_displ = NULL;
    magmaDoubleComplex **dW4_displ = NULL;
    magmaDoubleComplex **dW5_displ = NULL;

    magmaDoubleComplex *dwork = NULL;
    magmaDoubleComplex *dT   = NULL;
    magmaDoubleComplex *dR   = NULL;
    magmaDoubleComplex **dR_array = NULL;
    magmaDoubleComplex **dT_array = NULL;
    magmaDoubleComplex **cpuAarray = NULL;
    magmaDoubleComplex **cpuTarray = NULL;

    magma_malloc((void**)&dW0_displ, batchCount * sizeof(*dW0_displ));
    magma_malloc((void**)&dW1_displ, batchCount * sizeof(*dW1_displ));
    magma_malloc((void**)&dW2_displ, batchCount * sizeof(*dW2_displ));
    magma_malloc((void**)&dW3_displ, batchCount * sizeof(*dW3_displ));
    magma_malloc((void**)&dW4_displ, batchCount * sizeof(*dW4_displ));  // used in zlarfb
    magma_malloc((void**)&dW5_displ, batchCount * sizeof(*dW5_displ));
    magma_malloc((void**)&dR_array, batchCount * sizeof(*dR_array));
    magma_malloc((void**)&dT_array, batchCount * sizeof(*dT_array));

    ldt = ldr = min(nb, min_mn);
    magma_zmalloc(&dwork,  (2 * nb * n) * batchCount);
    magma_zmalloc(&dR,  ldr * n   * batchCount);
    magma_zmalloc(&dT,  ldt * ldt * batchCount);
    magma_malloc_cpu((void**) &cpuAarray, batchCount*sizeof(magmaDoubleComplex*));
    magma_malloc_cpu((void**) &cpuTarray, batchCount*sizeof(magmaDoubleComplex*));

    /* check allocation */
    if ( dW0_displ == NULL || dW1_displ == NULL || dW2_displ == NULL || dW3_displ == NULL ||
            dW4_displ == NULL || dW5_displ == NULL || dR_array  == NULL || dT_array  == NULL ||
            dR == NULL || dT == NULL || dwork == NULL || cpuAarray == NULL || cpuTarray == NULL ) {
        magma_free(dW0_displ);
        magma_free(dW1_displ);
        magma_free(dW2_displ);
        magma_free(dW3_displ);
        magma_free(dW4_displ);
        magma_free(dW5_displ);
        magma_free(dR_array);
        magma_free(dT_array);
        magma_free(dR);
        magma_free(dT);
        magma_free(dwork);
        free(cpuAarray);
        free(cpuTarray);
        magma_int_t info = MAGMA_ERR_DEVICE_ALLOC;
        magma_xerbla( __func__, -(info) );
        return info;
    }


    magmablas_zlaset_q(MagmaFull, ldr, n*batchCount  , MAGMA_Z_ZERO, MAGMA_Z_ZERO, dR, ldr, queue);
    magmablas_zlaset_q(MagmaFull, ldt, ldt*batchCount, MAGMA_Z_ZERO, MAGMA_Z_ZERO, dT, ldt, queue);
    zset_pointer(dR_array, dR, 1, 0, 0, ldr*min(nb, min_mn), batchCount, queue);
    zset_pointer(dT_array, dT, 1, 0, 0, ldt*min(nb, min_mn), batchCount, queue);


    magma_queue_t cstream;
    magmablasGetKernelStream(&cstream);
    magma_int_t streamid;
    const magma_int_t nbstreams=32;
    magma_queue_t stream[nbstreams];
    for(i=0; i<nbstreams; i++) {
        magma_queue_create( &stream[i] );
    }
    magma_getvector( batchCount, sizeof(magmaDoubleComplex*), dA_array, 1, cpuAarray, 1);
    magma_getvector( batchCount, sizeof(magmaDoubleComplex*), dT_array, 1, cpuTarray, 1);


    magmablasSetKernelStream(NULL);

    for(i=0; i<min_mn; i+=nb)
    {
        ib = min(nb, min_mn-i);

        //===============================================
        // panel factorization
        //===============================================

        magma_zdisplace_pointers(dW0_displ, dA_array, ldda, i, i, batchCount, queue);
        magma_zdisplace_pointers(dW2_displ, tau_array, 1, i, 0, batchCount, queue);


        //dwork is used in panel factorization and trailing matrix update
        //dW4_displ, dW5_displ are used as workspace and configured inside
        magma_zgeqrf_panel_batched(m-i, ib, jb,
                                   dW0_displ, ldda,
                                   dW2_displ,
                                   dT_array, ldt,
                                   dR_array, ldr,
                                   dW1_displ,
                                   dW3_displ,
                                   dwork,
                                   dW4_displ, dW5_displ,
                                   info_array,
                                   batchCount, myhandle, queue);

        //===============================================
        // end of panel
        //===============================================

        //direct panel matrix V in dW0_displ,
        magma_zdisplace_pointers(dW0_displ, dA_array, ldda, i, i, batchCount, queue);
        // copy the upper part of V into dR
        zgeqrf_copy_upper_batched(ib, jb, dW0_displ, ldda, dR_array, ldr, batchCount, queue);

        //===============================================
        // update trailing matrix
        //===============================================

        //dwork is used in panel factorization and trailing matrix update
        //reset dW4_displ
        ldw = nb;
        zset_pointer(dW4_displ, dwork, 1, 0, 0,  ldw*n, batchCount, queue );
        offset = ldw*n*batchCount;
        zset_pointer(dW5_displ, dwork + offset, 1, 0, 0,  ldw*n, batchCount, queue );

        if( (n-ib-i) > 0)
        {

            // set the diagonal of v as one and the upper triangular part as zero
            magmablas_zlaset_batched(MagmaUpper, ib, ib, MAGMA_Z_ZERO, MAGMA_Z_ONE, dW0_displ, ldda, batchCount, queue);
            magma_zdisplace_pointers(dW2_displ, tau_array, 1, i, 0, batchCount, queue);

            // it is faster since it is using BLAS-3 GEMM routines, different from lapack implementation
            magma_zlarft_batched(m-i, ib, 0,
                                 dW0_displ, ldda,
                                 dW2_displ,
                                 dT_array, ldt,
                                 dW4_displ, nb*ldt,
                                 batchCount, myhandle, queue);


            // perform C = (I-V T^H V^H) * C, C is the trailing matrix
            //-------------------------------------------
            //          USE STREAM  GEMM
            //-------------------------------------------
            if( (m-i) > 100  && (n-i-ib) > 100)
            {
                // But since the code use the NULL stream everywhere,
                // so I don't need it, because the NULL stream do the sync by itself
                //magma_device_sync();
                for(k=0; k<batchCount; k++)
                {
                    streamid = k%nbstreams;
                    magmablasSetKernelStream(stream[streamid]);

                    // the stream gemm must take cpu pointer
                    magma_zlarfb_gpu_gemm(MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise,
                                          m-i, n-i-ib, ib,
                                          cpuAarray[k] + i + i * ldda, ldda,
                                          cpuTarray[k], ldt,
                                          cpuAarray[k] + i + (i+ib) * ldda, ldda,
                                          dwork + nb * n * k, -1,
                                          dwork + nb * n * batchCount + nb * n * k, -1);

                }

                // need to synchronise to be sure that panel does not start before
                // finishing the update at least of the next panel
                // BUT no need for it as soon as the other portion of the code
                // use the NULL stream which do the sync by itself
                //magma_device_sync();
                magmablasSetKernelStream(NULL);
            }
            //-------------------------------------------
            //          USE BATCHED GEMM
            //-------------------------------------------
            else
            {
                //direct trailing matrix in dW1_displ
                magma_zdisplace_pointers(dW1_displ, dA_array, ldda, i, i+ib, batchCount, queue);

                magma_zlarfb_gemm_batched(
                    MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise,
                    m-i, n-i-ib, ib,
                    (const magmaDoubleComplex**)dW0_displ, ldda,
                    (const magmaDoubleComplex**)dT_array, ldt,
                    dW1_displ,  ldda,
                    dW4_displ,  ldw,
                    dW5_displ, ldw,
                    batchCount, myhandle, queue);

            }

        }// update the trailing matrix
        //===============================================

        // copy dR back to V after the trailing matrix update
        magmablas_zlacpy_batched(MagmaUpper, ib, ib, dR_array, ldr, dW0_displ, ldda, batchCount, queue);

    }

    for(k=0; k<nbstreams; k++) {
        magma_queue_destroy( stream[k] );
    }
    magmablasSetKernelStream(cstream);
    cublasDestroy_v2(myhandle);

    magma_free(dW0_displ);
    magma_free(dW1_displ);
    magma_free(dW2_displ);
    magma_free(dW3_displ);
    magma_free(dW4_displ);
    magma_free(dW5_displ);
    magma_free(dR_array);
    magma_free(dT_array);
    magma_free(dR);
    magma_free(dT);
    magma_free(dwork);
    free(cpuAarray);
    free(cpuTarray);

    return arginfo;

}
Пример #2
0
extern "C" magma_int_t
magma_zgeqrf_panel_batched(
        magma_int_t m, magma_int_t n, magma_int_t nb,    
        magmaDoubleComplex** dA_array,    magma_int_t ldda,
        magmaDoubleComplex** tau_array, 
        magmaDoubleComplex** dT_array, magma_int_t ldt, 
        magmaDoubleComplex** dR_array, magma_int_t ldr,
        magmaDoubleComplex** dW0_displ, 
        magmaDoubleComplex** dW1_displ,
        magmaDoubleComplex   *dwork,  
        magmaDoubleComplex** dW2_displ, 
        magmaDoubleComplex** dW3_displ,
        magma_int_t *info_array,
        magma_int_t batchCount, magma_queue_t queue)
{
    magma_int_t j, jb;
    magma_int_t ldw = nb; 
    magma_int_t minmn = min(m,n); 

    for( j=0; j < minmn; j += nb)
    {
        jb = min(nb, minmn-j);

        magma_zdisplace_pointers(dW0_displ, dA_array, ldda, j, j, batchCount, queue); 
        magma_zdisplace_pointers(dW2_displ, tau_array, 1, j, 0, batchCount, queue);
        magma_zdisplace_pointers(dW3_displ, dR_array, ldr, j, j, batchCount, queue); // 

        //sub-panel factorization 
        magma_zgeqr2_batched(
                m-j, jb,
                dW0_displ, ldda,      
                dW2_displ, 
                info_array, 
                batchCount,
                queue);

        //copy th whole rectangular n,jb from of dA to dR (it's lower portion (which is V's) will be set to zero if needed at the end)
        magma_zdisplace_pointers(dW0_displ, dA_array, ldda, 0, j, batchCount, queue); 
        magma_zdisplace_pointers(dW3_displ, dR_array, ldr, 0, j, batchCount, queue); 
        magmablas_zlacpy_batched( MagmaFull, minmn, jb, dW0_displ, ldda, dW3_displ, ldr, batchCount, queue );

        //set the upper jbxjb portion of V dA(j,j) to 1/0s (note that the rectangular on the top of this triangular of V still non zero but has been copied to dR).
        magma_zdisplace_pointers(dW0_displ, dA_array, ldda, j, j, batchCount, queue); 
        magmablas_zlaset_batched( MagmaUpper, jb, jb, MAGMA_Z_ZERO, MAGMA_Z_ONE, dW0_displ, ldda, batchCount, queue ); 


        if ( (n-j-jb) > 0) //update the trailing matrix inside the panel
        {
            magma_zlarft_sm32x32_batched(m-j, jb,
                    dW0_displ, ldda,
                    dW2_displ,
                    dT_array, ldt, 
                    batchCount, queue);

            magma_zdisplace_pointers( dW1_displ, dA_array, ldda, j, j + jb, batchCount, queue );
            magma_zset_pointer( dW2_displ,  dwork, 1, 0, 0,  ldw*n, batchCount, queue );
            magma_zset_pointer( dW3_displ, dwork + ldw*n*batchCount, 1, 0, 0,  ldw*n, batchCount, queue );

            magma_zlarfb_gemm_batched( 
                    MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise,
                    m-j, n-j-jb, jb,
                    (const magmaDoubleComplex**)dW0_displ, ldda,
                    (const magmaDoubleComplex**)dT_array, ldt,
                    dW1_displ,  ldda,
                    dW2_displ,  ldw, 
                    dW3_displ, ldw,
                    batchCount, queue );
        }
    }

    // copy the remaining portion of dR from dA in case m < n
    if ( m < n )
    {
        magma_zdisplace_pointers(dW0_displ, dA_array, ldda, 0, minmn, batchCount, queue); 
        magma_zdisplace_pointers(dW3_displ, dR_array, ldr, 0, minmn, batchCount, queue); 
        magmablas_zlacpy_batched( MagmaFull, minmn, n-minmn, dW0_displ, ldda, dW3_displ, ldr, batchCount, queue );
    }
    // to be consistent set the whole upper nbxnb of V to 0/1s, in this case no need to set it inside zgeqrf_batched
    magma_zdisplace_pointers(dW0_displ, dA_array, ldda, 0, 0, batchCount, queue); 
    magmablas_zlaset_batched( MagmaUpper, minmn, n, MAGMA_Z_ZERO, MAGMA_Z_ONE, dW0_displ, ldda, batchCount, queue ); 


    return MAGMA_SUCCESS;
}