/**
 * Apply the reflectors V2 from the left to the eigenvectors Z and leave the
 * result on the device:
 *
 *     dZ = (I - V2*T2*V2') * Z
 *
 * The columns of Z are split between the GPU and the CPU threads.  The GPU
 * share is n_gpu = f*ne with f = 1 / (1 + (threads-1)/gpu_cpu_perf), where
 * gpu_cpu_perf comes from magma_get_cbulge_gcperf().  When threads <= 1 (or
 * the split leaves nothing for the CPU) everything is done on the GPU.
 *
 * Historical hand-tuned values of the GPU-over-CPU performance ratio, kept for
 * reference: real precisions 50 (SandyBridge/K20c), 16 (SandyBridge/M2090);
 * complex precisions 27.5 (Westmere/M2090), 37 (SandyBridge/K20c),
 * 130 (Bulldozer/K20X).
 *
 * @param threads  Number of workers for the hybrid path: threads-1 pthreads are
 *                 created and the calling thread acts as worker 0.  LAPACK is
 *                 forced to one thread during the call and set to `threads`
 *                 before returning.
 * @param uplo     Not used by this routine.
 * @param n        Number of rows of Z / dZ that are transferred.
 * @param nb       Forwarded to the apply-Q routines (presumably the band
 *                 width -- confirm against magma_cbulge_applyQ_v2).
 * @param ne       Number of columns (eigenvectors) of Z.
 * @param Vblksiz  Forwarded to the apply-Q routines (presumably the blocking
 *                 of V -- confirm against magma_cbulge_applyQ_v2).
 * @param Z        Host matrix, leading dimension ldz.
 * @param ldz      Leading dimension of Z.
 * @param dZ       Device matrix receiving the result, leading dimension lddz.
 * @param lddz     Leading dimension of dZ.
 * @param V        Forwarded to the apply-Q routines, leading dimension ldv.
 * @param ldv      Leading dimension of V.
 * @param TAU      Forwarded to the hybrid worker data only.
 * @param T        Forwarded to the apply-Q routines, leading dimension ldt.
 * @param ldt      Leading dimension of T.
 * @param info     GPU-only path: written by magma_cbulge_applyQ_v2.
 *                 Hybrid path: written only when a host allocation fails.
 *
 * @return MAGMA_SUCCESS, or the status returned by magma_malloc_cpu when a
 *         host work array cannot be allocated.
 */
extern "C" magma_int_t magma_cbulge_back(magma_int_t threads, char uplo, magma_int_t n, magma_int_t nb, magma_int_t ne, magma_int_t Vblksiz,
                                         magmaFloatComplex *Z, magma_int_t ldz, magmaFloatComplex *dZ, magma_int_t lddz,
                                         magmaFloatComplex *V, magma_int_t ldv, magmaFloatComplex *TAU, magmaFloatComplex *T, magma_int_t ldt, magma_int_t* info)
{
    // The workers below provide the parallelism; keep LAPACK/BLAS sequential.
    magma_setlapack_numthreads(1);

    // Decide how many columns the GPU handles.
    magma_int_t n_gpu = ne;
    magma_int_t gpu_cpu_perf = magma_get_cbulge_gcperf();
    if (threads > 1) {
        float f = 1. / (1. + (float)(threads-1) / ((float)gpu_cpu_perf));
        n_gpu = (magma_int_t)(f*ne);
    }

    /****************************************************
     *  apply V2 from left to the eigenvectors Z.
     *  dZ = (I-V2*T2*V2')*Z
     ****************************************************/
    if (n_gpu < ne) {
        /*============================
         *  use GPU+CPU's
         *==========================*/
        // GPU uses Q(1:N_GPU) and the CPUs use Q(N_GPU+1:N).
#ifdef ENABLE_DEBUG
        // Arguments are narrowed to int so they match %d even when
        // magma_int_t is a 64-bit type.
        printf("---> calling GPU + CPU(if N_CPU>0) to apply V2 to Z with NE %d  N_GPU %d   N_CPU %d\n",
               (int) ne, (int) n_gpu, (int) (ne-n_gpu));
#endif
        magma_capplyQ_data data_applyQ(threads, n, ne, n_gpu, nb, Vblksiz, Z, ldz, V, ldv, TAU, T, ldt, dZ, lddz);

        magma_capplyQ_id_data* arg;
        magma_int_t status = magma_malloc_cpu((void**) &arg, threads*sizeof(magma_capplyQ_id_data));
        if (status != MAGMA_SUCCESS) {
            magma_setlapack_numthreads(threads);
            *info = status;
            return status;
        }

        pthread_t* thread_id;
        status = magma_malloc_cpu((void**) &thread_id, threads*sizeof(pthread_t));
        if (status != MAGMA_SUCCESS) {
            magma_free_cpu(arg);
            magma_setlapack_numthreads(threads);
            *info = status;
            return status;
        }

        // ===============================
        // relaunch thread to apply Q
        // ===============================
        // Set one thread per core
        pthread_attr_t thread_attr;
        pthread_attr_init(&thread_attr);
        pthread_attr_setscope(&thread_attr, PTHREAD_SCOPE_SYSTEM);
        pthread_setconcurrency(threads);

        // Launch workers 1..threads-1.
        // NOTE(review): pthread_create results are not checked; recovering
        // safely depends on how magma_capplyQ_parallel_section synchronizes
        // its workers, which is not visible here.
        for (magma_int_t thread = 1; thread < threads; thread++) {
            arg[thread] = magma_capplyQ_id_data(thread, &data_applyQ);
            pthread_create(&thread_id[thread], &thread_attr, magma_capplyQ_parallel_section, &arg[thread]);
        }

        // The calling thread is worker 0.
        arg[0] = magma_capplyQ_id_data(0, &data_applyQ);
        magma_capplyQ_parallel_section(&arg[0]);

        // Wait for completion
        for (magma_int_t thread = 1; thread < threads; thread++) {
            void *exitcodep;
            pthread_join(thread_id[thread], &exitcodep);
        }

        // Release the attribute object initialized above.
        pthread_attr_destroy(&thread_attr);
        magma_free_cpu(thread_id);
        magma_free_cpu(arg);

        // Upload the trailing ne-n_gpu columns of Z (presumably the ones the
        // CPU workers updated) to the matching columns of dZ.  The device
        // offset must use the device leading dimension lddz, not ldz.
        magma_csetmatrix(n, ne-n_gpu, Z + n_gpu*ldz, ldz, dZ + n_gpu*lddz, lddz);
    } else {
        /*============================
         *  use only GPU
         *==========================*/
        magma_csetmatrix(n, ne, Z, ldz, dZ, lddz);
        magma_cbulge_applyQ_v2('L', ne, n, nb, Vblksiz, dZ, lddz, V, ldv, T, ldt, info);
        magma_device_sync();
    }

    magma_setlapack_numthreads(threads);
    return MAGMA_SUCCESS;
}
extern "C" magma_int_t magma_cbulge_back(magma_int_t threads, char uplo, magma_int_t n, magma_int_t nb, magma_int_t ne, magma_int_t Vblksiz, cuFloatComplex *Z, magma_int_t ldz, cuFloatComplex *dZ, magma_int_t lddz, cuFloatComplex *V, magma_int_t ldv, cuFloatComplex *TAU, cuFloatComplex *T, magma_int_t ldt, magma_int_t* info) { magma_int_t mklth = threads; float timeaplQ2=0.0; #if defined(USEMKL) mkl_set_num_threads(1); #endif #if defined(USEACML) omp_set_num_threads(1); #endif float f= 1.; magma_int_t n_gpu = ne; if(threads>40){ f = 0.5; n_gpu = (magma_int_t)(f*ne)/64*64; } else if(threads>10){ #if (defined(PRECISION_s) || defined(PRECISION_d)) f = 0.68; #else f = 0.72; #endif n_gpu = (magma_int_t)(f*ne)/64*64; } else if(threads>5){ #if (defined(PRECISION_s) || defined(PRECISION_d)) f = 0.82; #else f = 0.86; #endif n_gpu = (magma_int_t)(f*ne)/64*64; } else if(threads>1){ #if (defined(PRECISION_s) || defined(PRECISION_d)) f = 0.96; #else f = 0.96; #endif n_gpu = (magma_int_t)(f*ne)/64*64; } /**************************************************** * apply V2 from left to the eigenvectors Z. 
dZ = (I-V2*T2*V2')*Z * **************************************************/ timeaplQ2 = magma_wtime(); /*============================ * use GPU+CPU's *==========================*/ if(n_gpu < ne) { // define the size of Q to be done on CPU's and the size on GPU's // note that GPU use Q(1:N_GPU) and CPU use Q(N_GPU+1:N) printf("---> calling GPU + CPU(if N_CPU>0) to apply V2 to Z with NE %d N_GPU %d N_CPU %d\n",ne, n_gpu, ne-n_gpu); magma_capplyQ_data data_applyQ(threads, n, ne, n_gpu, nb, Vblksiz, Z, ldz, V, ldv, TAU, T, ldt, dZ, lddz); magma_capplyQ_id_data* arg = new magma_capplyQ_id_data[threads]; pthread_t* thread_id = new pthread_t[threads]; pthread_attr_t thread_attr; // =============================== // relaunch thread to apply Q // =============================== // Set one thread per core pthread_attr_init(&thread_attr); pthread_attr_setscope(&thread_attr, PTHREAD_SCOPE_SYSTEM); pthread_setconcurrency(threads); // Launch threads for (magma_int_t thread = 1; thread < threads; thread++) { arg[thread] = magma_capplyQ_id_data(thread, &data_applyQ); pthread_create(&thread_id[thread], &thread_attr, magma_capplyQ_parallel_section, &arg[thread]); } arg[0] = magma_capplyQ_id_data(0, &data_applyQ); magma_capplyQ_parallel_section(&arg[0]); // Wait for completion for (magma_int_t thread = 1; thread < threads; thread++) { void *exitcodep; pthread_join(thread_id[thread], &exitcodep); } delete[] thread_id; delete[] arg; magma_csetmatrix(n, ne-n_gpu, Z + n_gpu*ldz, ldz, dZ + n_gpu*ldz, lddz); /*============================ * use only GPU *==========================*/ }else{ magma_csetmatrix(n, ne, Z, ldz, dZ, lddz); magma_cbulge_applyQ_v2('L', ne, n, nb, Vblksiz, dZ, lddz, V, ldv, T, ldt, info); magma_device_sync(); } timeaplQ2 = magma_wtime()-timeaplQ2; #if defined(USEMKL) mkl_set_num_threads(mklth); #endif #if defined(USEACML) omp_set_num_threads(mklth); #endif return MAGMA_SUCCESS; }