extern "C" magma_int_t magma_zbulge_back_m( magma_int_t ngpu, magma_uplo_t uplo, magma_int_t n, magma_int_t nb, magma_int_t ne, magma_int_t Vblksiz, magmaDoubleComplex *Z, magma_int_t ldz, magmaDoubleComplex *V, magma_int_t ldv, magmaDoubleComplex *TAU, magmaDoubleComplex *T, magma_int_t ldt, magma_int_t* info) { magma_int_t threads = magma_get_parallel_numthreads(); magma_int_t mklth = magma_get_lapack_numthreads(); magma_set_lapack_numthreads(1); real_Double_t timeaplQ2=0.0; double f= 1.; magma_int_t n_gpu = ne; //#if defined(PRECISION_s) || defined(PRECISION_d) // double gpu_cpu_perf = 32; //gpu over cpu performance //#else // double gpu_cpu_perf = 32; // gpu over cpu performance //#endif double perf_temp= .85; double perf_temp2= perf_temp; for (magma_int_t itmp=1; itmp < ngpu; ++itmp) perf_temp2 *= perf_temp; magma_int_t gpu_cpu_perf = magma_get_zbulge_gcperf(); if (threads > 1) { f = 1. / (1. + (double)(threads-1)/ ((double)gpu_cpu_perf*(1.-perf_temp2)/(1.-perf_temp))); n_gpu = (magma_int_t)(f*ne); } /**************************************************** * apply V2 from left to the eigenvectors Z. dZ = (I-V2*T2*V2')*Z * **************************************************/ timeaplQ2 = magma_wtime(); /*============================ * use GPU+CPU's *==========================*/ //n_gpu = ne; if (n_gpu < ne) { // define the size of Q to be done on CPU's and the size on GPU's // note that GPU use Q(1:N_GPU) and CPU use Q(N_GPU+1:N) #ifdef ENABLE_DEBUG printf("---> calling GPU + CPU(if N_CPU > 0) to apply V2 to Z with NE %d N_GPU %d N_CPU %d\n",ne, n_gpu, ne-n_gpu); #endif magma_zapplyQ_m_data data_applyQ(ngpu, threads, n, ne, n_gpu, nb, Vblksiz, Z, ldz, V, ldv, TAU, T, ldt); magma_zapplyQ_m_id_data* arg; magma_malloc_cpu((void**) &arg, threads*sizeof(magma_zapplyQ_m_id_data)); pthread_t* thread_id; magma_malloc_cpu((void**) &thread_id, threads*sizeof(pthread_t)); pthread_attr_t thread_attr; // =============================== // relaunch thread to apply Q // =============================== // Set one thread per core pthread_attr_init(&thread_attr); pthread_attr_setscope(&thread_attr, PTHREAD_SCOPE_SYSTEM); pthread_setconcurrency(threads); // Launch threads for (magma_int_t thread = 1; thread < threads; thread++) { arg[thread] = magma_zapplyQ_m_id_data(thread, &data_applyQ); pthread_create(&thread_id[thread], &thread_attr, magma_zapplyQ_m_parallel_section, &arg[thread]); } arg[0] = magma_zapplyQ_m_id_data(0, &data_applyQ); magma_zapplyQ_m_parallel_section(&arg[0]); // Wait for completion for (magma_int_t thread = 1; thread < threads; thread++) { void *exitcodep; pthread_join(thread_id[thread], &exitcodep); } magma_free_cpu(thread_id); magma_free_cpu(arg); /*============================ * use only GPU *==========================*/ } else { magma_zbulge_applyQ_v2_m(ngpu, MagmaLeft, ne, n, nb, Vblksiz, Z, ldz, V, ldv, T, ldt, info); magma_device_sync(); } timeaplQ2 = magma_wtime()-timeaplQ2; magma_set_lapack_numthreads(mklth); return MAGMA_SUCCESS; }
extern "C" magma_int_t magma_zbulge_back(magma_uplo_t uplo, magma_int_t n, magma_int_t nb, magma_int_t ne, magma_int_t Vblksiz, magmaDoubleComplex *Z, magma_int_t ldz, magmaDoubleComplex *dZ, magma_int_t lddz, magmaDoubleComplex *V, magma_int_t ldv, magmaDoubleComplex *TAU, magmaDoubleComplex *T, magma_int_t ldt, magma_int_t* info) { magma_int_t threads = magma_get_parallel_numthreads(); magma_int_t mklth = magma_get_lapack_numthreads(); magma_set_lapack_numthreads(1); real_Double_t timeaplQ2=0.0; double f= 1.; magma_int_t n_gpu = ne; //#if defined(PRECISION_s) || defined(PRECISION_d) //double gpu_cpu_perf = 50; // gpu over cpu performance //100% ev // SandyB. - Kepler (K20c) //double gpu_cpu_perf = 16; // gpu over cpu performance //100% ev // SandyB. - Fermi (M2090) //#else // double gpu_cpu_perf = 27.5; // gpu over cpu performance //100% ev // Westmere - Fermi (M2090) //double gpu_cpu_perf = 37; // gpu over cpu performance //100% ev // SandyB. - Kepler (K20c) // double gpu_cpu_perf = 130; // gpu over cpu performance //100% ev // Bulldozer - Kepler (K20X) //#endif magma_int_t gpu_cpu_perf = magma_get_zbulge_gcperf(); if (threads > 1) { f = 1. / (1. + (double)(threads-1)/ ((double)gpu_cpu_perf) ); n_gpu = (magma_int_t)(f*ne); } /**************************************************** * apply V2 from left to the eigenvectors Z. dZ = (I-V2*T2*V2')*Z * **************************************************/ //$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$ //$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$ //$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$ //n_gpu=ne; //$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$ //$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$ //$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$ //$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$ timeaplQ2 = magma_wtime(); /*============================ * use GPU+CPU's *==========================*/ if (n_gpu < ne) { // define the size of Q to be done on CPU's and the size on GPU's // note that GPU use Q(1:N_GPU) and CPU use Q(N_GPU+1:N) #ifdef ENABLE_DEBUG printf("---> calling GPU + CPU(if N_CPU > 0) to apply V2 to Z with NE %d N_GPU %d N_CPU %d\n",ne, n_gpu, ne-n_gpu); #endif magma_zapplyQ_data data_applyQ; magma_zapplyQ_data_init(&data_applyQ, threads, n, ne, n_gpu, nb, Vblksiz, Z, ldz, V, ldv, TAU, T, ldt, dZ, lddz); magma_zapplyQ_id_data* arg; magma_malloc_cpu((void**) &arg, threads*sizeof(magma_zapplyQ_id_data)); pthread_t* thread_id; magma_malloc_cpu((void**) &thread_id, threads*sizeof(pthread_t)); pthread_attr_t thread_attr; // =============================== // relaunch thread to apply Q // =============================== // Set one thread per core pthread_attr_init(&thread_attr); pthread_attr_setscope(&thread_attr, PTHREAD_SCOPE_SYSTEM); pthread_setconcurrency(threads); // Launch threads for (magma_int_t thread = 1; thread < threads; thread++) { magma_zapplyQ_id_data_init(&(arg[thread]), thread, &data_applyQ); pthread_create(&thread_id[thread], &thread_attr, magma_zapplyQ_parallel_section, &arg[thread]); } magma_zapplyQ_id_data_init(&(arg[0]), 0, &data_applyQ); magma_zapplyQ_parallel_section(&arg[0]); // Wait for completion for (magma_int_t thread = 1; thread < threads; thread++) { void *exitcodep; pthread_join(thread_id[thread], &exitcodep); } magma_free_cpu(thread_id); magma_free_cpu(arg); magma_zapplyQ_data_destroy(&data_applyQ); magma_zsetmatrix(n, ne-n_gpu, Z + n_gpu*ldz, ldz, dZ + n_gpu*ldz, lddz); /*============================ * use only GPU *==========================*/ } else { magma_zsetmatrix(n, ne, Z, ldz, dZ, lddz); magma_zbulge_applyQ_v2(MagmaLeft, ne, n, nb, Vblksiz, dZ, lddz, V, ldv, T, ldt, info); magma_device_sync(); } timeaplQ2 = magma_wtime()-timeaplQ2; magma_set_lapack_numthreads(mklth); return MAGMA_SUCCESS; }