//################################################################################################## static void *magma_capplyQ_parallel_section(void *arg) { magma_int_t my_core_id = ((magma_capplyQ_id_data*)arg) -> id; magma_capplyQ_data* data = ((magma_capplyQ_id_data*)arg) -> data; magma_int_t allcores_num = data -> threads_num; magma_int_t n = data -> n; magma_int_t ne = data -> ne; magma_int_t n_gpu = data -> n_gpu; magma_int_t nb = data -> nb; magma_int_t Vblksiz = data -> Vblksiz; cuFloatComplex *E = data -> E; magma_int_t lde = data -> lde; cuFloatComplex *V = data -> V; magma_int_t ldv = data -> ldv; cuFloatComplex *TAU = data -> TAU; cuFloatComplex *T = data -> T; magma_int_t ldt = data -> ldt; cuFloatComplex *dE = data -> dE; magma_int_t ldde = data -> ldde; pthread_barrier_t* barrier = &(data -> barrier); magma_int_t info; real_Double_t timeQcpu=0.0, timeQgpu=0.0; magma_int_t n_cpu = ne - n_gpu; #if defined(SETAFFINITY) cpu_set_t set; CPU_ZERO( &set ); CPU_SET( my_core_id, &set ); sched_setaffinity( 0, sizeof(set), &set) ; #endif if(my_core_id==0) { //============================================= // on GPU on thread 0: // - apply V2*Z(:,1:N_GPU) //============================================= timeQgpu = magma_wtime(); magma_csetmatrix(n, n_gpu, E, lde, dE, ldde); magma_cbulge_applyQ_v2('L', n_gpu, n, nb, Vblksiz, dE, ldde, V, ldv, T, ldt, &info); magma_device_sync(); timeQgpu = magma_wtime()-timeQgpu; printf(" Finish Q2_GPU GGG timing= %f \n" ,timeQgpu); }else{ //============================================= // on CPU on threads 1:allcores_num-1: // - apply V2*Z(:,N_GPU+1:NE) //============================================= if(my_core_id == 1) timeQcpu = magma_wtime(); magma_int_t n_loc = magma_ceildiv(n_cpu, allcores_num-1); cuFloatComplex* E_loc = E + (n_gpu+ n_loc * (my_core_id-1))*lde; n_loc = min(n_loc,n_cpu - n_loc * (my_core_id-1)); magma_ctile_bulge_applyQ('L', n_loc, n, nb, Vblksiz, E_loc, lde, V, ldv, TAU, T, ldt); pthread_barrier_wait(barrier); if(my_core_id == 1){ timeQcpu = magma_wtime()-timeQcpu; printf(" Finish Q2_CPU CCC timing= %f \n" ,timeQcpu); } } // END if my_core_id #if defined(SETAFFINITY) // unbind threads magma_int_t sys_corenbr = 1; sys_corenbr = sysconf(_SC_NPROCESSORS_ONLN); CPU_ZERO( &set ); for(magma_int_t i=0; i<sys_corenbr; i++) CPU_SET( i, &set ); sched_setaffinity( 0, sizeof(set), &set) ; #endif return 0; }
//################################################################################################## static void *magma_capplyQ_parallel_section(void *arg) { magma_int_t my_core_id = ((magma_capplyQ_id_data*)arg) -> id; magma_capplyQ_data* data = ((magma_capplyQ_id_data*)arg) -> data; magma_int_t allcores_num = data -> threads_num; magma_int_t n = data -> n; magma_int_t ne = data -> ne; magma_int_t n_gpu = data -> n_gpu; magma_int_t nb = data -> nb; magma_int_t Vblksiz = data -> Vblksiz; magmaFloatComplex *E = data -> E; magma_int_t lde = data -> lde; magmaFloatComplex *V = data -> V; magma_int_t ldv = data -> ldv; magmaFloatComplex *TAU = data -> TAU; magmaFloatComplex *T = data -> T; magma_int_t ldt = data -> ldt; magmaFloatComplex *dE = data -> dE; magma_int_t ldde = data -> ldde; pthread_barrier_t* barrier = &(data -> barrier); magma_int_t info; #ifdef ENABLE_TIMER real_Double_t timeQcpu=0.0, timeQgpu=0.0; #endif magma_int_t n_cpu = ne - n_gpu; // with MKL and when using omp_set_num_threads instead of mkl_set_num_threads // it need that all threads setting it to 1. magma_setlapack_numthreads(1); #ifdef MAGMA_SETAFFINITY //#define PRINTAFFINITY #ifdef PRINTAFFINITY affinity_set print_set; print_set.print_affinity(my_core_id, "starting affinity"); #endif affinity_set original_set; affinity_set new_set(my_core_id); int check = 0; int check2 = 0; // bind threads check = original_set.get_affinity(); if (check == 0) { check2 = new_set.set_affinity(); if (check2 != 0) printf("Error in sched_setaffinity (single cpu)\n"); } else { printf("Error in sched_getaffinity\n"); } #ifdef PRINTAFFINITY print_set.print_affinity(my_core_id, "set affinity"); #endif #endif if(my_core_id==0) { //============================================= // on GPU on thread 0: // - apply V2*Z(:,1:N_GPU) //============================================= #ifdef ENABLE_TIMER timeQgpu = magma_wtime(); #endif magma_csetmatrix(n, n_gpu, E, lde, dE, ldde); magma_cbulge_applyQ_v2('L', n_gpu, n, nb, Vblksiz, dE, ldde, V, ldv, T, ldt, &info); magma_device_sync(); #ifdef ENABLE_TIMER timeQgpu = magma_wtime()-timeQgpu; printf(" Finish Q2_GPU GGG timing= %f \n" ,timeQgpu); #endif } else { //============================================= // on CPU on threads 1:allcores_num-1: // - apply V2*Z(:,N_GPU+1:NE) //============================================= #ifdef ENABLE_TIMER if(my_core_id == 1) timeQcpu = magma_wtime(); #endif magma_int_t n_loc = magma_ceildiv(n_cpu, allcores_num-1); magmaFloatComplex* E_loc = E + (n_gpu+ n_loc * (my_core_id-1))*lde; n_loc = min(n_loc,n_cpu - n_loc * (my_core_id-1)); magma_ctile_bulge_applyQ(my_core_id, 'L', n_loc, n, nb, Vblksiz, E_loc, lde, V, ldv, TAU, T, ldt); pthread_barrier_wait(barrier); #ifdef ENABLE_TIMER if(my_core_id == 1) { timeQcpu = magma_wtime()-timeQcpu; printf(" Finish Q2_CPU CCC timing= %f \n" ,timeQcpu); } #endif } // END if my_core_id #ifdef MAGMA_SETAFFINITY // unbind threads if (check == 0) { check2 = original_set.set_affinity(); if (check2 != 0) printf("Error in sched_setaffinity (restore cpu list)\n"); } #ifdef PRINTAFFINITY print_set.print_affinity(my_core_id, "restored_affinity"); #endif #endif return 0; }
//################################################################################################## static void *magma_capplyQ_parallel_section(void *arg) { magma_int_t my_core_id = ((magma_capplyQ_id_data*)arg) -> id; magma_capplyQ_data* data = ((magma_capplyQ_id_data*)arg) -> data; magma_int_t allcores_num = data -> threads_num; magma_int_t n = data -> n; magma_int_t ne = data -> ne; magma_int_t n_gpu = data -> n_gpu; magma_int_t nb = data -> nb; magma_int_t Vblksiz = data -> Vblksiz; magmaFloatComplex *E = data -> E; magma_int_t lde = data -> lde; magmaFloatComplex *V = data -> V; magma_int_t ldv = data -> ldv; magmaFloatComplex *TAU = data -> TAU; magmaFloatComplex *T = data -> T; magma_int_t ldt = data -> ldt; magmaFloatComplex *dE = data -> dE; magma_int_t ldde = data -> ldde; pthread_barrier_t* barrier = &(data -> barrier); magma_int_t info; #ifdef ENABLE_TIMER real_Double_t timeQcpu=0.0, timeQgpu=0.0; #endif magma_int_t n_cpu = ne - n_gpu; // with MKL and when using omp_set_num_threads instead of mkl_set_num_threads // it need that all threads setting it to 1. magma_set_lapack_numthreads(1); #ifndef MAGMA_NOAFFINITY //#define PRINTAFFINITY #ifdef PRINTAFFINITY affinity_set print_set; print_set.print_affinity(my_core_id, "starting affinity"); #endif cpu_set_t old_set, new_set; //store current affinity CPU_ZERO(&old_set); sched_getaffinity( 0, sizeof(old_set), &old_set); //set new affinity // bind threads CPU_ZERO(&new_set); CPU_SET(my_core_id, &new_set); sched_setaffinity( 0, sizeof(new_set), &new_set); #ifdef PRINTAFFINITY print_set.print_affinity(my_core_id, "set affinity"); #endif #endif if (my_core_id == 0) { //============================================= // on GPU on thread 0: // - apply V2*Z(:,1:N_GPU) //============================================= #ifdef ENABLE_TIMER timeQgpu = magma_wtime(); #endif magma_queue_t queue; magma_device_t cdev; magma_getdevice( &cdev ); magma_queue_create( cdev, &queue ); magma_csetmatrix( n, n_gpu, E, lde, dE, ldde, queue ); magma_cbulge_applyQ_v2(MagmaLeft, n_gpu, n, nb, Vblksiz, dE, ldde, V, ldv, T, ldt, &info); magma_queue_destroy( queue ); #ifdef ENABLE_TIMER timeQgpu = magma_wtime()-timeQgpu; printf(" Finish Q2_GPU GGG timing= %f\n", timeQgpu); #endif } else { //============================================= // on CPU on threads 1:allcores_num-1: // - apply V2*Z(:,N_GPU+1:NE) //============================================= #ifdef ENABLE_TIMER if (my_core_id == 1) timeQcpu = magma_wtime(); #endif magma_int_t n_loc = magma_ceildiv(n_cpu, allcores_num-1); magmaFloatComplex* E_loc = E + (n_gpu+ n_loc * (my_core_id-1))*lde; n_loc = min(n_loc,n_cpu - n_loc * (my_core_id-1)); magma_ctile_bulge_applyQ(my_core_id, MagmaLeft, n_loc, n, nb, Vblksiz, E_loc, lde, V, ldv, TAU, T, ldt); pthread_barrier_wait(barrier); #ifdef ENABLE_TIMER if (my_core_id == 1) { timeQcpu = magma_wtime()-timeQcpu; printf(" Finish Q2_CPU CCC timing= %f\n", timeQcpu); } #endif } // END if my_core_id #ifndef MAGMA_NOAFFINITY //restore old affinity sched_setaffinity(0, sizeof(old_set), &old_set); #ifdef PRINTAFFINITY print_set.print_affinity(my_core_id, "restored_affinity"); #endif #endif return 0; }