/* Fortran-callable shim forwarding to the C implementation.
 * NOTE(review): the result of magma_get_dbulge_gcperf() is discarded here;
 * confirm the Fortran interface is really meant to be void (wrappers of
 * value-returning tuning queries usually return or output the value). */
void magmaf_get_dbulge_gcperf( )
{
    magma_get_dbulge_gcperf( );
}
/* Apply the second-stage orthogonal factor from the left to the
 * eigenvector matrix Z:
 *     dZ = (I - V2*T2*V2') * Z
 * The ne columns are split between the GPU (columns 0 .. n_gpu-1) and the
 * CPU worker threads (columns n_gpu .. ne-1); the split point comes from a
 * tuned GPU-over-CPU performance ratio (magma_get_dbulge_gcperf()).
 *
 * Parameters (NOTE(review): roles inferred from use in this routine --
 * confirm against the MAGMA headers):
 *   uplo        - unused here; presumably kept for API symmetry -- confirm
 *   n           - number of rows of Z
 *   nb, Vblksiz - band / V-block sizes used when V, TAU, T were generated
 *   ne          - number of eigenvector columns to update
 *   Z, ldz      - host eigenvector matrix and its leading dimension
 *   dZ, lddz    - device matrix receiving the full result
 *   V, ldv      - Householder reflector blocks
 *   TAU, T, ldt - scalar factors and block T factors of the reflectors
 *   info        - error flag, forwarded to magma_dbulge_applyQ_v2
 *
 * Returns MAGMA_SUCCESS unconditionally; failures surface via *info only.
 */
extern "C" magma_int_t magma_dbulge_back( magma_uplo_t uplo, magma_int_t n, magma_int_t nb, magma_int_t ne, magma_int_t Vblksiz, double *Z, magma_int_t ldz, magmaDouble_ptr dZ, magma_int_t lddz, double *V, magma_int_t ldv, double *TAU, double *T, magma_int_t ldt, magma_int_t* info)
{
    magma_int_t threads = magma_get_parallel_numthreads();

    /* Save the caller's LAPACK/BLAS thread count and force single-threaded
       BLAS while this routine does its own pthread-level parallelism;
       restored at the bottom. */
    magma_int_t mklth = magma_get_lapack_numthreads();
    magma_set_lapack_numthreads(1);

    real_Double_t timeaplQ2=0.0;   // wall-clock time of the apply-Q phase
    double f= 1.;                  // fraction of columns assigned to the GPU
    magma_int_t n_gpu = ne;        // default: all columns on the GPU

    // Historical hard-coded ratios, superseded by magma_get_dbulge_gcperf():
    //#if defined(PRECISION_s) || defined(PRECISION_d)
    //double gpu_cpu_perf = 50;  // gpu over cpu performance  //100% ev // SandyB. - Kepler (K20c)
    //double gpu_cpu_perf = 16;  // gpu over cpu performance  //100% ev // SandyB. - Fermi (M2090)
    //#else
    //double gpu_cpu_perf = 27.5;  // gpu over cpu performance  //100% ev // Westmere - Fermi (M2090)
    //double gpu_cpu_perf = 37;  // gpu over cpu performance  //100% ev // SandyB. - Kepler (K20c)
    //double gpu_cpu_perf = 130;  // gpu over cpu performance  //100% ev // Bulldozer - Kepler (K20X)
    //#endif
    magma_int_t gpu_cpu_perf = magma_get_dbulge_gcperf();

    /* With extra CPU threads available, give the CPU a column share
       proportional to its aggregate speed relative to the GPU. */
    if (threads > 1) {
        f = 1. / (1. + (double)(threads-1)/ ((double)gpu_cpu_perf) );
        n_gpu = (magma_int_t)(f*ne);
    }

    /****************************************************
     *  apply V2 from left to the eigenvectors Z:
     *      dZ = (I - V2*T2*V2') * Z
     * **************************************************/
    //n_gpu=ne;   // debug override: force the all-GPU path

    timeaplQ2 = magma_wtime();

    /*============================
     *  use GPU + CPU's
     *==========================*/
    if (n_gpu < ne) {
        // define the size of Q to be done on CPU's and the size on GPU's
        // note that GPU uses Q(1:N_GPU) and CPU uses Q(N_GPU+1:N)
        #ifdef ENABLE_DEBUG
        printf("---> calling GPU + CPU(if N_CPU > 0) to apply V2 to Z with NE %d N_GPU %d N_CPU %d\n",ne, n_gpu, ne-n_gpu);
        #endif

        // Shared work descriptor read by all worker threads.
        magma_dapplyQ_data data_applyQ;
        magma_dapplyQ_data_init(&data_applyQ, threads, n, ne, n_gpu, nb, Vblksiz, Z, ldz, V, ldv, TAU, T, ldt, dZ, lddz);

        magma_dapplyQ_id_data* arg;
        magma_malloc_cpu((void**) &arg, threads*sizeof(magma_dapplyQ_id_data));

        pthread_t* thread_id;
        magma_malloc_cpu((void**) &thread_id, threads*sizeof(pthread_t));
        pthread_attr_t thread_attr;

        // ===============================
        // relaunch threads to apply Q
        // ===============================
        // Set one thread per core (system contention scope).
        pthread_attr_init(&thread_attr);
        pthread_attr_setscope(&thread_attr, PTHREAD_SCOPE_SYSTEM);
        pthread_setconcurrency(threads);

        // Launch workers 1..threads-1; worker 0 runs on the calling thread.
        for (magma_int_t thread = 1; thread < threads; thread++) {
            magma_dapplyQ_id_data_init(&(arg[thread]), thread, &data_applyQ);
            pthread_create(&thread_id[thread], &thread_attr, magma_dapplyQ_parallel_section, &arg[thread]);
        }
        magma_dapplyQ_id_data_init(&(arg[0]), 0, &data_applyQ);
        magma_dapplyQ_parallel_section(&arg[0]);

        // Wait for completion of all launched workers.
        for (magma_int_t thread = 1; thread < threads; thread++) {
            void *exitcodep;
            pthread_join(thread_id[thread], &exitcodep);
        }
        magma_free_cpu(thread_id);
        magma_free_cpu(arg);
        magma_dapplyQ_data_destroy(&data_applyQ);

        /* Upload the CPU-updated columns [n_gpu, ne) so the device matrix dZ
           holds the complete result. */
        magma_dsetmatrix(n, ne-n_gpu, Z + n_gpu*ldz, ldz, dZ + n_gpu*ldz, lddz);

    /*============================
     *  use only GPU
     *==========================*/
    } else {
        magma_dsetmatrix(n, ne, Z, ldz, dZ, lddz);
        magma_dbulge_applyQ_v2(MagmaLeft, ne, n, nb, Vblksiz, dZ, lddz, V, ldv, T, ldt, info);
        magma_device_sync();
    }

    timeaplQ2 = magma_wtime()-timeaplQ2;   // NOTE(review): measured but never reported

    magma_set_lapack_numthreads(mklth);    // restore the caller's BLAS thread count
    return MAGMA_SUCCESS;
}
/* Multi-GPU variant of magma_dbulge_back: applies the second-stage factor
 * from the left to the eigenvectors Z,
 *     dZ = (I - V2*T2*V2') * Z
 * across nrgpu GPUs plus CPU worker threads.  Unlike the single-GPU
 * routine, Z stays on the host (the multi-GPU apply takes host Z directly).
 *
 * Parameters (NOTE(review): roles inferred from use in this routine --
 * confirm against the MAGMA headers):
 *   nrgpu       - number of GPUs to use
 *   threads     - number of CPU worker threads to launch
 *   uplo        - unused here; presumably kept for API symmetry -- confirm
 *   n, ne       - rows of Z / number of eigenvector columns to update
 *   nb, Vblksiz - band / V-block sizes used when V, TAU, T were generated
 *   Z, ldz      - host eigenvector matrix (updated in place on the CPU side)
 *   V, ldv, TAU, T, ldt - Householder reflectors and their T factors
 *   info        - error flag, forwarded to magma_dbulge_applyQ_v2_m
 *
 * Returns MAGMA_SUCCESS unconditionally; failures surface via *info only.
 *
 * NOTE(review): this routine calls magma_setlapack_numthreads() while the
 * single-GPU sibling calls magma_set_lapack_numthreads(), and it restores
 * the thread count to the `threads` parameter instead of a value saved on
 * entry (the sibling saves/restores `mklth`).  Confirm both differences
 * are intentional -- as written, the caller's original BLAS thread setting
 * is not preserved.
 */
extern "C" magma_int_t magma_dbulge_back_m(magma_int_t nrgpu, magma_int_t threads, char uplo, magma_int_t n, magma_int_t nb, magma_int_t ne, magma_int_t Vblksiz, double *Z, magma_int_t ldz, double *V, magma_int_t ldv, double *TAU, double *T, magma_int_t ldt, magma_int_t* info)
{
    /* Force single-threaded BLAS while doing pthread-level parallelism. */
    magma_setlapack_numthreads(1);

    double timeaplQ2=0.0;   // wall-clock time of the apply-Q phase
    double f= 1.;           // fraction of columns assigned to the GPUs
    magma_int_t n_gpu = ne; // default: all columns on the GPUs

    // Historical hard-coded ratio, superseded by magma_get_dbulge_gcperf():
    //#if defined(PRECISION_s) || defined(PRECISION_d)
    //double gpu_cpu_perf = 32;  // gpu over cpu performance
    //#else
    //double gpu_cpu_perf = 32;  // gpu over cpu performance
    //#endif

    /* Multi-GPU scaling model: each additional GPU contributes a factor
       perf_temp (0.85) of the previous one, so aggregate GPU speed scales
       by the geometric series (1 - perf_temp^nrgpu) / (1 - perf_temp). */
    double perf_temp= .85;
    double perf_temp2= perf_temp;
    for (magma_int_t itmp=1; itmp<nrgpu; ++itmp)
        perf_temp2*=perf_temp;   // perf_temp2 = perf_temp^nrgpu

    magma_int_t gpu_cpu_perf = magma_get_dbulge_gcperf();

    /* With extra CPU threads available, give the CPU a column share
       proportional to its speed relative to the aggregate GPU speed. */
    if (threads>1) {
        f = 1. / (1. + (double)(threads-1)/ ((double)gpu_cpu_perf*(1.-perf_temp2)/(1.-perf_temp)));
        n_gpu = (magma_int_t)(f*ne);
    }

    /****************************************************
     *  apply V2 from left to the eigenvectors Z:
     *      dZ = (I - V2*T2*V2') * Z
     * **************************************************/
    timeaplQ2 = magma_wtime();

    /*============================
     *  use GPU + CPU's
     *==========================*/
    //n_gpu = ne;   // debug override: force the all-GPU path
    if (n_gpu < ne) {
        // define the size of Q to be done on CPU's and the size on GPU's
        // note that GPU uses Q(1:N_GPU) and CPU uses Q(N_GPU+1:N)
        #ifdef ENABLE_DEBUG
        printf("---> calling GPU + CPU(if N_CPU>0) to apply V2 to Z with NE %d N_GPU %d N_CPU %d\n",ne, n_gpu, ne-n_gpu);
        #endif

        // Shared work descriptor read by all worker threads.
        magma_dapplyQ_m_data data_applyQ(nrgpu, threads, n, ne, n_gpu, nb, Vblksiz, Z, ldz, V, ldv, TAU, T, ldt);

        magma_dapplyQ_m_id_data* arg;
        magma_malloc_cpu((void**) &arg, threads*sizeof(magma_dapplyQ_m_id_data));

        pthread_t* thread_id;
        magma_malloc_cpu((void**) &thread_id, threads*sizeof(pthread_t));
        pthread_attr_t thread_attr;

        // ===============================
        // relaunch threads to apply Q
        // ===============================
        // Set one thread per core (system contention scope).
        pthread_attr_init(&thread_attr);
        pthread_attr_setscope(&thread_attr, PTHREAD_SCOPE_SYSTEM);
        pthread_setconcurrency(threads);

        // Launch workers 1..threads-1; worker 0 runs on the calling thread.
        for (magma_int_t thread = 1; thread < threads; thread++) {
            arg[thread] = magma_dapplyQ_m_id_data(thread, &data_applyQ);
            pthread_create(&thread_id[thread], &thread_attr, magma_dapplyQ_m_parallel_section, &arg[thread]);
        }
        arg[0] = magma_dapplyQ_m_id_data(0, &data_applyQ);
        magma_dapplyQ_m_parallel_section(&arg[0]);

        // Wait for completion of all launched workers.
        for (magma_int_t thread = 1; thread < threads; thread++) {
            void *exitcodep;
            pthread_join(thread_id[thread], &exitcodep);
        }
        magma_free_cpu(thread_id);
        magma_free_cpu(arg);

    /*============================
     *  use only GPU
     *==========================*/
    } else {
        magma_dbulge_applyQ_v2_m(nrgpu, 'L', ne, n, nb, Vblksiz, Z, ldz, V, ldv, T, ldt, info);
        magma_device_sync();
    }

    timeaplQ2 = magma_wtime()-timeaplQ2;   // NOTE(review): measured but never reported

    // NOTE(review): restores to `threads`, not a saved-on-entry value -- see header note.
    magma_setlapack_numthreads(threads);
    return MAGMA_SUCCESS;
}