예제 #1
0
/**
 * Applies V2 from the left to the eigenvectors Z:
 *
 *     dZ = (I - V2*T2*V2') * Z
 *
 * The ne columns of Z are split between the GPU and the CPU threads. The GPU
 * handles columns [0, n_gpu) and the CPU threads handle [n_gpu, ne), where
 * n_gpu is derived from the GPU/CPU performance ratio returned by
 * magma_get_cbulge_gcperf(). When the split leaves nothing for the CPU
 * (n_gpu == ne, which is always the case for threads <= 1) the whole update
 * runs through magma_cbulge_applyQ_v2 on dZ.
 *
 * @param threads  Number of CPU threads to use (thread 0 is the calling thread).
 * @param uplo     Not referenced by this routine.
 * @param n        Number of rows of Z / dZ.
 * @param nb       Forwarded to the apply-Q routines.
 * @param ne       Number of columns (eigenvectors) of Z / dZ.
 * @param Vblksiz  Forwarded to the apply-Q routines.
 * @param Z        Source matrix, leading dimension ldz.
 * @param ldz      Leading dimension of Z.
 * @param dZ       Destination matrix filled through magma_csetmatrix,
 *                 leading dimension lddz.
 * @param lddz     Leading dimension of dZ.
 * @param V        Forwarded to the apply-Q routines, leading dimension ldv.
 * @param ldv      Leading dimension of V.
 * @param TAU      Forwarded to the hybrid apply-Q worker data.
 * @param T        Forwarded to the apply-Q routines, leading dimension ldt.
 * @param ldt      Leading dimension of T.
 * @param info     Cleared to 0 on entry (if non-NULL); receives the allocation
 *                 error code if workspace allocation fails, or whatever
 *                 magma_cbulge_applyQ_v2 stores in it on the GPU-only path.
 *
 * @return MAGMA_SUCCESS, or the error code returned by magma_malloc_cpu.
 */
extern "C" magma_int_t
magma_cbulge_back(magma_int_t threads, char uplo,
                  magma_int_t n, magma_int_t nb,
                  magma_int_t ne, magma_int_t Vblksiz,
                  magmaFloatComplex *Z, magma_int_t ldz,
                  magmaFloatComplex *dZ, magma_int_t lddz,
                  magmaFloatComplex *V, magma_int_t ldv,
                  magmaFloatComplex *TAU,
                  magmaFloatComplex *T, magma_int_t ldt,
                  magma_int_t* info)
{
    // The hybrid path never writes *info itself; clear it up front so the
    // caller does not read a stale value after a successful return.
    if (info != NULL) {
        *info = 0;
    }

    // Keep LAPACK single-threaded while this routine drives its own pthreads.
    magma_setlapack_numthreads(1);

    // Fraction of the columns given to the GPU: one GPU is treated as being
    // worth gpu_cpu_perf CPU threads, competing against (threads-1) workers.
    float f = 1.;
    magma_int_t n_gpu = ne;
    magma_int_t gpu_cpu_perf = magma_get_cbulge_gcperf();
    if (threads > 1) {
        f = 1. / (1. + (float)(threads-1) / ((float)gpu_cpu_perf));
        n_gpu = (magma_int_t)(f*ne);
    }

    if (n_gpu < ne)
    {
        /*============================
         *  use GPU+CPU's
         *  GPU works on Q(1:N_GPU), CPUs work on Q(N_GPU+1:NE)
         *==========================*/
#ifdef ENABLE_DEBUG
        // Cast explicitly: magma_int_t is not guaranteed to match %d.
        printf("---> calling GPU + CPU(if N_CPU>0) to apply V2 to Z with NE %d     N_GPU %d   N_CPU %d\n",
               (int) ne, (int) n_gpu, (int) (ne-n_gpu));
#endif
        magma_capplyQ_data data_applyQ(threads, n, ne, n_gpu, nb, Vblksiz, Z, ldz, V, ldv, TAU, T, ldt, dZ, lddz);

        // Per-thread argument records and thread handles.
        magma_capplyQ_id_data* arg = NULL;
        pthread_t* thread_id = NULL;

        magma_int_t status = magma_malloc_cpu((void**) &arg, threads*sizeof(magma_capplyQ_id_data));
        if (status == MAGMA_SUCCESS) {
            status = magma_malloc_cpu((void**) &thread_id, threads*sizeof(pthread_t));
            if (status != MAGMA_SUCCESS) {
                magma_free_cpu(arg);
            }
        }
        if (status != MAGMA_SUCCESS) {
            // Nothing has been launched yet: report and restore threading.
            if (info != NULL) {
                *info = status;
            }
            magma_setlapack_numthreads(threads);
            return status;
        }

        // ===============================
        // relaunch thread to apply Q
        // ===============================
        // Set one thread per core
        pthread_attr_t thread_attr;
        pthread_attr_init(&thread_attr);
        pthread_attr_setscope(&thread_attr, PTHREAD_SCOPE_SYSTEM);
        pthread_setconcurrency(threads);

        // Launch the workers; the calling thread acts as worker 0.
        // NOTE(review): pthread_create's result is not checked. Whether the
        // parallel section tolerates a missing worker cannot be determined
        // from this routine alone -- confirm before adding recovery logic.
        for (magma_int_t thread = 1; thread < threads; thread++)
        {
            arg[thread] = magma_capplyQ_id_data(thread, &data_applyQ);
            pthread_create(&thread_id[thread], &thread_attr, magma_capplyQ_parallel_section, &arg[thread]);
        }
        // The attribute object is no longer needed once the threads exist.
        pthread_attr_destroy(&thread_attr);

        arg[0] = magma_capplyQ_id_data(0, &data_applyQ);
        magma_capplyQ_parallel_section(&arg[0]);

        // Wait for completion
        for (magma_int_t thread = 1; thread < threads; thread++)
        {
            void *exitcodep;
            pthread_join(thread_id[thread], &exitcodep);
        }

        magma_free_cpu(thread_id);
        magma_free_cpu(arg);

        // Upload the CPU-computed columns. The destination column offset must
        // use dZ's own leading dimension (lddz), not the host one (ldz).
        magma_csetmatrix(n, ne-n_gpu, Z + n_gpu*ldz, ldz, dZ + n_gpu*lddz, lddz);
    }
    else
    {
        /*============================
         *  use only GPU
         *==========================*/
        magma_csetmatrix(n, ne, Z, ldz, dZ, lddz);
        magma_cbulge_applyQ_v2('L', ne, n, nb, Vblksiz, dZ, lddz, V, ldv, T, ldt, info);
        magma_device_sync();
    }

    // Hand the requested thread count back to LAPACK.
    magma_setlapack_numthreads(threads);
    return MAGMA_SUCCESS;
}
예제 #2
0
/**
 * Applies V2 from the left to the eigenvectors Z:
 *
 *     dZ = (I - V2*T2*V2') * Z
 *
 * The ne columns of Z are divided between the GPU (columns [0, n_gpu)) and
 * the CPU threads (columns [n_gpu, ne)). The GPU share is a fixed fraction
 * chosen from the thread count and rounded down to a multiple of 64. With
 * threads <= 1 (or when the split leaves no CPU columns) everything is done
 * through magma_cbulge_applyQ_v2 on dZ.
 *
 * @param threads  Number of CPU threads (thread 0 is the calling thread).
 * @param uplo     Not referenced by this routine.
 * @param n        Number of rows of Z / dZ.
 * @param nb       Forwarded to the apply-Q routines.
 * @param ne       Number of columns (eigenvectors) of Z / dZ.
 * @param Vblksiz  Forwarded to the apply-Q routines.
 * @param Z        Source matrix, leading dimension ldz.
 * @param ldz      Leading dimension of Z.
 * @param dZ       Destination matrix filled through magma_csetmatrix,
 *                 leading dimension lddz.
 * @param lddz     Leading dimension of dZ.
 * @param V        Forwarded to the apply-Q routines, leading dimension ldv.
 * @param ldv      Leading dimension of V.
 * @param TAU      Forwarded to the hybrid apply-Q worker data.
 * @param T        Forwarded to the apply-Q routines, leading dimension ldt.
 * @param ldt      Leading dimension of T.
 * @param info     Cleared to 0 on entry (if non-NULL); on the GPU-only path it
 *                 is additionally passed to magma_cbulge_applyQ_v2.
 *
 * @return MAGMA_SUCCESS.
 */
extern "C" magma_int_t magma_cbulge_back(magma_int_t threads, char uplo, magma_int_t n, magma_int_t nb, magma_int_t ne, magma_int_t Vblksiz,
                                         cuFloatComplex *Z, magma_int_t ldz, cuFloatComplex *dZ, magma_int_t lddz,
                                         cuFloatComplex *V, magma_int_t ldv, cuFloatComplex *TAU, cuFloatComplex *T, magma_int_t ldt, magma_int_t* info)
{
    // The hybrid branch never writes *info; make sure the caller sees 0
    // rather than whatever the variable held before the call.
    if (info != NULL) {
        *info = 0;
    }

    // Force the BLAS backend to one thread while we run our own pthreads.
#if defined(USEMKL)
    mkl_set_num_threads(1);
#endif
#if defined(USEACML)
    omp_set_num_threads(1);
#endif

    // Hand-tuned share of the columns assigned to the GPU; more CPU threads
    // means a smaller GPU share. Real and complex precisions use different
    // values for the middle tiers.
    float f = 1.;
    if (threads > 40) {
        f = 0.5;
    }
    else if (threads > 10) {
#if (defined(PRECISION_s) || defined(PRECISION_d))
        f = 0.68;
#else
        f = 0.72;
#endif
    }
    else if (threads > 5) {
#if (defined(PRECISION_s) || defined(PRECISION_d))
        f = 0.82;
#else
        f = 0.86;
#endif
    }
    else if (threads > 1) {
        f = 0.96;
    }

    // Round the GPU share down to a multiple of 64 columns. A single thread
    // keeps every column on the GPU.
    magma_int_t n_gpu = ne;
    if (threads > 1) {
        n_gpu = (magma_int_t)(f*ne)/64*64;
    }

    if (n_gpu < ne)
    {
        /*============================
         *  use GPU+CPU's
         *  GPU works on Q(1:N_GPU), CPUs work on Q(N_GPU+1:NE)
         *==========================*/
#ifdef ENABLE_DEBUG
        // Diagnostic only: a library routine should not write to stdout in
        // normal operation. Casts keep the arguments in line with %d even
        // when magma_int_t is wider than int.
        printf("---> calling GPU + CPU(if N_CPU>0) to apply V2 to Z with NE %d     N_GPU %d   N_CPU %d\n",
               (int) ne, (int) n_gpu, (int) (ne-n_gpu));
#endif

        magma_capplyQ_data data_applyQ(threads, n, ne, n_gpu, nb, Vblksiz, Z, ldz, V, ldv, TAU, T, ldt, dZ, lddz);

        magma_capplyQ_id_data* arg = new magma_capplyQ_id_data[threads];
        pthread_t* thread_id = new pthread_t[threads];

        // ===============================
        // relaunch thread to apply Q
        // ===============================
        // Set one thread per core
        pthread_attr_t thread_attr;
        pthread_attr_init(&thread_attr);
        pthread_attr_setscope(&thread_attr, PTHREAD_SCOPE_SYSTEM);
        pthread_setconcurrency(threads);

        // Launch workers 1..threads-1; the calling thread is worker 0.
        // NOTE(review): pthread_create's result is not checked. Whether the
        // parallel section tolerates a missing worker cannot be determined
        // from this routine alone -- confirm before adding recovery logic.
        for (magma_int_t thread = 1; thread < threads; thread++)
        {
            arg[thread] = magma_capplyQ_id_data(thread, &data_applyQ);
            pthread_create(&thread_id[thread], &thread_attr, magma_capplyQ_parallel_section, &arg[thread]);
        }
        // Release the attribute object now that all threads are created.
        pthread_attr_destroy(&thread_attr);

        arg[0] = magma_capplyQ_id_data(0, &data_applyQ);
        magma_capplyQ_parallel_section(&arg[0]);

        // Wait for completion
        for (magma_int_t thread = 1; thread < threads; thread++)
        {
            void *exitcodep;
            pthread_join(thread_id[thread], &exitcodep);
        }

        delete[] thread_id;
        delete[] arg;

        // Upload the CPU-computed columns. The destination column offset is
        // computed with dZ's leading dimension (lddz); using ldz here would
        // misplace the data whenever the two leading dimensions differ.
        magma_csetmatrix(n, ne-n_gpu, Z + n_gpu*ldz, ldz, dZ + n_gpu*lddz, lddz);
    }
    else
    {
        /*============================
         *  use only GPU
         *==========================*/
        magma_csetmatrix(n, ne, Z, ldz, dZ, lddz);
        magma_cbulge_applyQ_v2('L', ne, n, nb, Vblksiz, dZ, lddz, V, ldv, T, ldt, info);
        magma_device_sync();
    }

    // Give the requested thread count back to the BLAS backend.
#if defined(USEMKL)
    mkl_set_num_threads(threads);
#endif
#if defined(USEACML)
    omp_set_num_threads(threads);
#endif

    return MAGMA_SUCCESS;
}