//################################################################################################## static void *magma_dapplyQ_parallel_section(void *arg) { magma_int_t my_core_id = ((magma_dapplyQ_id_data*)arg) -> id; magma_dapplyQ_data* data = ((magma_dapplyQ_id_data*)arg) -> data; magma_int_t allcores_num = data -> threads_num; magma_int_t n = data -> n; magma_int_t ne = data -> ne; magma_int_t n_gpu = data -> n_gpu; magma_int_t nb = data -> nb; magma_int_t Vblksiz = data -> Vblksiz; double *E = data -> E; magma_int_t lde = data -> lde; double *V = data -> V; magma_int_t ldv = data -> ldv; double *TAU = data -> TAU; double *T = data -> T; magma_int_t ldt = data -> ldt; double *dE = data -> dE; magma_int_t ldde = data -> ldde; pthread_barrier_t* barrier = &(data -> barrier); magma_int_t info; #ifdef ENABLE_TIMER real_Double_t timeQcpu=0.0, timeQgpu=0.0; #endif magma_int_t n_cpu = ne - n_gpu; // with MKL and when using omp_set_num_threads instead of mkl_set_num_threads // it need that all threads setting it to 1. magma_set_lapack_numthreads(1); #ifdef MAGMA_SETAFFINITY //#define PRINTAFFINITY #ifdef PRINTAFFINITY affinity_set print_set; print_set.print_affinity(my_core_id, "starting affinity"); #endif cpu_set_t old_set, new_set; //store current affinity CPU_ZERO(&old_set); sched_getaffinity( 0, sizeof(old_set), &old_set); //set new affinity // bind threads CPU_ZERO(&new_set); CPU_SET(my_core_id, &new_set); sched_setaffinity( 0, sizeof(new_set), &new_set); #ifdef PRINTAFFINITY print_set.print_affinity(my_core_id, "set affinity"); #endif #endif if (my_core_id == 0) { //============================================= // on GPU on thread 0: // - apply V2*Z(:,1:N_GPU) //============================================= #ifdef ENABLE_TIMER timeQgpu = magma_wtime(); #endif magma_dsetmatrix(n, n_gpu, E, lde, dE, ldde); magma_dbulge_applyQ_v2(MagmaLeft, n_gpu, n, nb, Vblksiz, dE, ldde, V, ldv, T, ldt, &info); magma_device_sync(); #ifdef ENABLE_TIMER timeQgpu = magma_wtime()-timeQgpu; printf(" Finish Q2_GPU GGG timing= %f\n", timeQgpu); #endif } else { //============================================= // on CPU on threads 1:allcores_num-1: // - apply V2*Z(:,N_GPU+1:NE) //============================================= #ifdef ENABLE_TIMER if (my_core_id == 1) timeQcpu = magma_wtime(); #endif magma_int_t n_loc = magma_ceildiv(n_cpu, allcores_num-1); double* E_loc = E + (n_gpu+ n_loc * (my_core_id-1))*lde; n_loc = min(n_loc,n_cpu - n_loc * (my_core_id-1)); magma_dtile_bulge_applyQ(my_core_id, MagmaLeft, n_loc, n, nb, Vblksiz, E_loc, lde, V, ldv, TAU, T, ldt); pthread_barrier_wait(barrier); #ifdef ENABLE_TIMER if (my_core_id == 1) { timeQcpu = magma_wtime()-timeQcpu; printf(" Finish Q2_CPU CCC timing= %f\n", timeQcpu); } #endif } // END if my_core_id #ifdef MAGMA_SETAFFINITY //restore old affinity sched_setaffinity(0, sizeof(old_set), &old_set); #ifdef PRINTAFFINITY print_set.print_affinity(my_core_id, "restored_affinity"); #endif #endif return 0; }
//################################################################################################## static void *magma_dapplyQ_m_parallel_section(void *arg) { magma_int_t my_core_id = ((magma_dapplyQ_m_id_data*)arg) -> id; magma_dapplyQ_m_data* data = ((magma_dapplyQ_m_id_data*)arg) -> data; magma_int_t nrgpu = data -> nrgpu; magma_int_t allcores_num = data -> threads_num; magma_int_t n = data -> n; magma_int_t ne = data -> ne; magma_int_t n_gpu = data -> n_gpu; magma_int_t nb = data -> nb; magma_int_t Vblksiz = data -> Vblksiz; double *E = data -> E; magma_int_t lde = data -> lde; double *V = data -> V; magma_int_t ldv = data -> ldv; double *TAU = data -> TAU; double *T = data -> T; magma_int_t ldt = data -> ldt; pthread_barrier_t* barrier = &(data -> barrier); magma_int_t info; #ifdef ENABLE_TIMER real_Double_t timeQcpu=0.0, timeQgpu=0.0; #endif magma_int_t n_cpu = ne - n_gpu; // with MKL and when using omp_set_num_threads instead of mkl_set_num_threads // it need that all threads setting it to 1. magma_setlapack_numthreads(1); #ifdef MAGMA_SETAFFINITY //#define PRINTAFFINITY #ifdef PRINTAFFINITY affinity_set print_set; print_set.print_affinity(my_core_id, "starting affinity"); #endif affinity_set original_set; affinity_set new_set(my_core_id); int check = 0; int check2 = 0; // bind threads check = original_set.get_affinity(); if (check == 0) { check2 = new_set.set_affinity(); if (check2 != 0) printf("Error in sched_setaffinity (single cpu)\n"); } else { printf("Error in sched_getaffinity\n"); } #ifdef PRINTAFFINITY print_set.print_affinity(my_core_id, "set affinity"); #endif #endif if(my_core_id==0) { //============================================= // on GPU on thread 0: // - apply V2*Z(:,1:N_GPU) //============================================= #ifdef ENABLE_TIMER timeQgpu = magma_wtime(); #endif magma_dbulge_applyQ_v2_m(nrgpu, 'L', n_gpu, n, nb, Vblksiz, E, lde, V, ldv, T, ldt, &info); magma_device_sync(); #ifdef ENABLE_TIMER timeQgpu = magma_wtime()-timeQgpu; printf(" Finish Q2_GPU GGG timing= %lf \n" ,timeQgpu); #endif }else{ //============================================= // on CPU on threads 1:allcores_num-1: // - apply V2*Z(:,N_GPU+1:NE) //============================================= #ifdef ENABLE_TIMER if(my_core_id == 1) timeQcpu = magma_wtime(); #endif magma_int_t n_loc = magma_ceildiv(n_cpu, allcores_num-1); double* E_loc = E + (n_gpu+ n_loc * (my_core_id-1))*lde; n_loc = min(n_loc,n_cpu - n_loc * (my_core_id-1)); magma_dtile_bulge_applyQ(my_core_id, 'L', n_loc, n, nb, Vblksiz, E_loc, lde, V, ldv, TAU, T, ldt); pthread_barrier_wait(barrier); #ifdef ENABLE_TIMER if(my_core_id == 1){ timeQcpu = magma_wtime()-timeQcpu; printf(" Finish Q2_CPU CCC timing= %lf \n" ,timeQcpu); } #endif } // END if my_core_id #ifdef MAGMA_SETAFFINITY // unbind threads if (check == 0){ check2 = original_set.set_affinity(); if (check2 != 0) printf("Error in sched_setaffinity (restore cpu list)\n"); } #ifdef PRINTAFFINITY print_set.print_affinity(my_core_id, "restored_affinity"); #endif #endif return 0; }