/**
 * Applies the second-stage reflectors V2 from the left to the eigenvectors,
 * dZ = (I - V2*T2*V2') * Z, leaving the n-by-ne result in the device buffer dZ.
 *
 * When threads > 1 the ne columns are split between the GPU and the CPU:
 * the GPU takes columns 1..n_gpu and the CPU threads take columns n_gpu+1..ne.
 * The share is derived from magma_get_cbulge_gcperf(), the GPU-over-CPU
 * performance ratio. Values that were once hard-coded here: 50 and 37
 * (SandyBridge + Kepler K20c), 16 (SandyBridge + Fermi M2090),
 * 27.5 (Westmere + Fermi M2090), 130 (Bulldozer + Kepler K20X).
 *
 * @param threads  Number of participating threads (the caller counts as one).
 * @param uplo     Not used by this routine.
 * @param n        Number of rows of Z.
 * @param nb       Forwarded unchanged to the apply-Q routines.
 * @param ne       Number of columns (eigenvectors) of Z to update.
 * @param Vblksiz  Forwarded unchanged to the apply-Q routines.
 * @param Z        Host matrix, leading dimension ldz.
 * @param ldz      Leading dimension of Z.
 * @param dZ       Device matrix receiving the result, leading dimension lddz.
 * @param lddz     Leading dimension of dZ.
 * @param V        Reflector storage, forwarded with ldv.
 * @param ldv      Leading dimension of V.
 * @param TAU      Forwarded to the hybrid (GPU + CPU) path only.
 * @param T        Forwarded with ldt.
 * @param ldt      Leading dimension of T.
 * @param info     Passed to magma_cbulge_applyQ_v2 on the GPU-only path;
 *                 set to MAGMA_ERR_HOST_ALLOC if a workspace allocation fails.
 * @return MAGMA_SUCCESS, or MAGMA_ERR_HOST_ALLOC on host allocation failure.
 */
extern "C" magma_int_t
magma_cbulge_back(magma_int_t threads, char uplo,
                  magma_int_t n, magma_int_t nb,
                  magma_int_t ne, magma_int_t Vblksiz,
                  magmaFloatComplex *Z, magma_int_t ldz,
                  magmaFloatComplex *dZ, magma_int_t lddz,
                  magmaFloatComplex *V, magma_int_t ldv,
                  magmaFloatComplex *TAU,
                  magmaFloatComplex *T, magma_int_t ldt,
                  magma_int_t* info)
{
    float timeaplQ2 = 0.0;
    float f = 1.;
    magma_int_t n_gpu = ne;

    // Share of the columns given to the GPU: f = 1 / (1 + (threads-1)/perf).
    magma_int_t gpu_cpu_perf = magma_get_cbulge_gcperf();
    if (threads > 1) {
        f = 1. / (1. + (float)(threads-1) / ((float)gpu_cpu_perf));
        n_gpu = (magma_int_t)(f*ne);
    }

    /****************************************************
     *  apply V2 from left to the eigenvectors Z. dZ = (I-V2*T2*V2')*Z
     * **************************************************/
    timeaplQ2 = magma_wtime();

    if (n_gpu < ne) {
        /*============================
         *  use GPU+CPU's
         *==========================*/
        // define the size of Q to be done on CPU's and the size on GPU's
        // note that GPU use Q(1:N_GPU) and CPU use Q(N_GPU+1:N)
        // magma_int_t may be wider than int, so cast to match the %d conversions.
        printf("---> calling GPU + CPU(if N_CPU>0) to apply V2 to Z with NE %d     N_GPU %d   N_CPU %d\n",
               (int) ne, (int) n_gpu, (int) (ne-n_gpu));
        magma_capplyQ_data data_applyQ(threads, n, ne, n_gpu, nb, Vblksiz, Z, ldz, V, ldv, TAU, T, ldt, dZ, lddz);

        magma_capplyQ_id_data* arg = NULL;
        if (MAGMA_SUCCESS != magma_malloc_cpu((void**) &arg, threads*sizeof(magma_capplyQ_id_data))) {
            *info = MAGMA_ERR_HOST_ALLOC;
            return *info;
        }

        pthread_t* thread_id = NULL;
        if (MAGMA_SUCCESS != magma_malloc_cpu((void**) &thread_id, threads*sizeof(pthread_t))) {
            magma_free_cpu(arg);
            *info = MAGMA_ERR_HOST_ALLOC;
            return *info;
        }

        // ===============================
        // relaunch thread to apply Q
        // ===============================
        // The attribute object must be initialised before any setter or
        // pthread_create may use it.
        pthread_attr_t thread_attr;
        pthread_attr_init(&thread_attr);
        // Set one thread per core
        pthread_attr_setscope(&thread_attr, PTHREAD_SCOPE_SYSTEM);

        // Launch threads 1..threads-1; slot 0 belongs to the calling thread.
        for (magma_int_t thread = 1; thread < threads; thread++) {
            arg[thread] = magma_capplyQ_id_data(thread, &data_applyQ);
            pthread_create(&thread_id[thread], &thread_attr, magma_capplyQ_parallel_section, &arg[thread]);
        }

        // The calling thread takes part as id 0. The text under review filled
        // arg[0] and then never used it.
        // NOTE(review): confirm that id 0 is meant to run the section inline.
        arg[0] = magma_capplyQ_id_data(0, &data_applyQ);
        magma_capplyQ_parallel_section(&arg[0]);

        // Wait for completion
        for (magma_int_t thread = 1; thread < threads; thread++) {
            void *exitcodep;
            pthread_join(thread_id[thread], &exitcodep);
        }

        pthread_attr_destroy(&thread_attr);
        magma_free_cpu(thread_id);
        magma_free_cpu(arg);

        // Upload the CPU-side columns n_gpu+1..ne. Column offsets into dZ
        // must use its own leading dimension, lddz, not the host's ldz.
        magma_csetmatrix(n, ne-n_gpu, Z + n_gpu*ldz, ldz, dZ + n_gpu*lddz, lddz);
    } else {
        /*============================
         *  use only GPU
         *==========================*/
        magma_csetmatrix(n, ne, Z, ldz, dZ, lddz);
        magma_cbulge_applyQ_v2('L', ne, n, nb, Vblksiz, dZ, lddz, V, ldv, T, ldt, info);
    }

    // Elapsed time of the apply-Q phase; kept for instrumentation only.
    timeaplQ2 = magma_wtime()-timeaplQ2;
    (void) timeaplQ2;
    (void) uplo;

    return MAGMA_SUCCESS;
}
// Second definition of magma_cbulge_back in this span, declared with
// cuFloatComplex buffers. Like the version above, it applies V2 from the left
// to the eigenvectors Z and leaves the result in dZ, splitting the ne columns
// between the GPU and CPU threads when n_gpu < ne.
//
// NOTE(review): the text below looks truncated. There are no braces around the
// function body or the loops, the first `if` of the thread-count ladder is
// absent, and the USEMKL/USEACML/PRECISION conditionals have neither bodies
// nor #else/#endif lines. Restore the missing lines before building.
extern "C" magma_int_t magma_cbulge_back(magma_int_t threads, char uplo, magma_int_t n, magma_int_t nb, magma_int_t ne, magma_int_t Vblksiz,
                                         cuFloatComplex *Z, magma_int_t ldz, cuFloatComplex *dZ, magma_int_t lddz,
                                         cuFloatComplex *V, magma_int_t ldv, cuFloatComplex *TAU, cuFloatComplex *T, magma_int_t ldt, magma_int_t* info)
    // Copy of the thread count; not referenced again in the visible text.
    magma_int_t mklth = threads;
    float timeaplQ2=0.0;
    // NOTE(review): the bodies of these two conditionals are not visible here.
#if defined(USEMKL)
#if defined(USEACML)
            // f is the share of the ne columns given to the GPU; n_gpu starts
            // at ne, i.e. everything on the GPU.
            float f= 1.;
            magma_int_t n_gpu = ne;
                // NOTE(review): the condition guarding this first rung is missing.
                f = 0.5;
                // Integer divide-then-multiply rounds n_gpu down to a multiple of 64.
                n_gpu = (magma_int_t)(f*ne)/64*64;
            else if(threads>10){
                // NOTE(review): two assignments in a row; presumably an #else
                // and #endif once separated them - confirm against the original.
#if (defined(PRECISION_s) || defined(PRECISION_d))
                f = 0.68;
                f = 0.72;
                n_gpu = (magma_int_t)(f*ne)/64*64;
            else if(threads>5){
#if (defined(PRECISION_s) || defined(PRECISION_d))
                f = 0.82;
                f = 0.86;
                n_gpu = (magma_int_t)(f*ne)/64*64;
            else if(threads>1){
                // Both visible assignments on this rung use the same value, 0.96.
#if (defined(PRECISION_s) || defined(PRECISION_d))
                f = 0.96;
                f = 0.96;
                n_gpu = (magma_int_t)(f*ne)/64*64;
             // NOTE(review): the two lines below are the tail of a block
             // comment whose opening line is missing.
             *  apply V2 from left to the eigenvectors Z. dZ = (I-V2*T2*V2')*Z
             * **************************************************/

            // Start of the timed region.
            timeaplQ2 = magma_wtime();
             *  use GPU+CPU's
            // Hybrid path: taken when some columns are left for the CPU.
            if(n_gpu < ne)
                // define the size of Q to be done on CPU's and the size on GPU's
                // note that GPU use Q(1:N_GPU) and CPU use Q(N_GPU+1:N)

                // NOTE(review): %d is used with magma_int_t arguments; confirm
                // that magma_int_t is int in every build configuration.
                printf("---> calling GPU + CPU(if N_CPU>0) to apply V2 to Z with NE %d     N_GPU %d   N_CPU %d\n",ne, n_gpu, ne-n_gpu); 
                magma_capplyQ_data data_applyQ(threads, n, ne, n_gpu, nb, Vblksiz, Z, ldz, V, ldv, TAU, T, ldt, dZ, lddz);
                // One argument record and one thread handle per thread; slot 0
                // of thread_id is never passed to pthread_create.
                magma_capplyQ_id_data* arg = new magma_capplyQ_id_data[threads];
                pthread_t* thread_id = new pthread_t[threads];
                // NOTE(review): no pthread_attr_init call is visible before
                // thread_attr is used below.
                pthread_attr_t thread_attr;
                // ===============================
                // relaunch thread to apply Q
                // ===============================
                // Set one thread per core
                pthread_attr_setscope(&thread_attr, PTHREAD_SCOPE_SYSTEM);
                // Launch threads
                for (magma_int_t thread = 1; thread < threads; thread++)
                    arg[thread] = magma_capplyQ_id_data(thread, &data_applyQ);
                    pthread_create(&thread_id[thread], &thread_attr, magma_capplyQ_parallel_section, &arg[thread]);
                // NOTE(review): arg[0] is filled in, but no call that consumes
                // it is visible; presumably the calling thread ran the section
                // itself on a line that is now missing.
                arg[0] = magma_capplyQ_id_data(0, &data_applyQ);
                // Wait for completion
                for (magma_int_t thread = 1; thread < threads; thread++)
                    void *exitcodep;
                    pthread_join(thread_id[thread], &exitcodep);
                delete[] thread_id;
                delete[] arg;
                // Copy the n x (ne-n_gpu) block of Z starting at column n_gpu
                // to the device.
                // NOTE(review): the device offset is n_gpu*ldz although dZ has
                // leading dimension lddz; the two agree only if ldz == lddz.
                magma_csetmatrix(n, ne-n_gpu, Z + n_gpu*ldz, ldz, dZ + n_gpu*ldz, lddz);
                 *  use only GPU
                // GPU-only path: upload all of Z, then apply Q on the device.
                // NOTE(review): the `else` that selected this path is missing.
                magma_csetmatrix(n, ne, Z, ldz, dZ, lddz);
                magma_cbulge_applyQ_v2('L', ne, n, nb, Vblksiz, dZ, lddz, V, ldv, T, ldt, info);

            // Elapsed time of the timed region; not read again in the visible text.
            timeaplQ2 = magma_wtime()-timeaplQ2;
    // NOTE(review): as at the top, the bodies of these conditionals are missing.
#if defined(USEMKL)
#if defined(USEACML)
    return MAGMA_SUCCESS;