static void exec_gaussblur_app(DATA *image_out, DATA *image_in, ARGUMENTS *settings)
{
    size_t size_in_bytes = settings->size * settings->size * (PIXEL_CHANNELS * sizeof(DATA));
    DATA *image_in_gpu  = (DATA *)acc_malloc(size_in_bytes);
    DATA *image_out_gpu = (DATA *)acc_malloc(size_in_bytes);
    uint32_t e;
    uint32_t i;

    for (e = 0; e < settings->energy_loops; ++e) {
        /* Upload both buffers, run the blur checkpoints, then download the results. */
        acc_memcpy_to_device(image_in_gpu,  image_in,  size_in_bytes);
        acc_memcpy_to_device(image_out_gpu, image_out, size_in_bytes);

        for (i = 0; i < settings->checkpoints; ++i) {
            /* Ping-pong between the two device buffers. */
            gaussblur_calc_gpu_compute((uint16_t *)image_in_gpu,  (uint16_t *)image_out_gpu, settings->size);
            gaussblur_calc_gpu_compute((uint16_t *)image_out_gpu, (uint16_t *)image_in_gpu,  settings->size);
        }

        acc_memcpy_from_device(image_in,  image_in_gpu,  size_in_bytes);
        acc_memcpy_from_device(image_out, image_out_gpu, size_in_bytes);
    }

    acc_free(image_in_gpu);
    acc_free(image_out_gpu);
    image_in_gpu  = NULL;
    image_out_gpu = NULL;
}
#include <openacc.h>
#include <stdlib.h>
#include <string.h>

int main (int argc, char **argv)
{
  const int N = 256;
  int i;
  unsigned char *h;
  void *d;

  acc_init (acc_device_nvidia);

  h = (unsigned char *) malloc (N);
  for (i = 0; i < N; i++)
    {
      h[i] = i;
    }

  d = acc_malloc (N);

  acc_memcpy_to_device (d, h, N);

  memset (&h[0], 0, N);

  /* Out-of-bounds copy: requests N << 1 (2*N) bytes, although both the host
     buffer and the device allocation are only N bytes.  */
  acc_memcpy_to_device (d, h, N << 1);

  acc_memcpy_from_device (h, d, N);

  for (i = 0; i < N; i++)
    {
      if (h[i] != i)
        abort ();
    }

  acc_free (d);

  free (h);

  acc_shutdown (acc_device_nvidia);

  return 0;
}
#include <openacc.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main (int argc, char **argv)
{
  const int N = 256;
  int i;
  unsigned char *h;
  void *d;

  h = (unsigned char *) malloc (N);
  for (i = 0; i < N; i++)
    {
      h[i] = i;
    }

  d = acc_malloc (N);

  fprintf (stderr, "CheCKpOInT\n");
  /* Null device address: the runtime is expected to diagnose this copy; the
     "CheCKpOInT" marker above flags where the failure should occur.  */
  acc_memcpy_to_device (0, h, N);

  memset (&h[0], 0, N);

  acc_memcpy_from_device (h, d, N);

  for (i = 0; i < N; i++)
    {
      if (h[i] != i)
        abort ();
    }

  acc_free (d);

  free (h);

  return 0;
}
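/* A minimal illustrative sketch (an addition for contrast with the two
   failure cases above, assuming <openacc.h> and an OpenACC 2.x runtime):
   the same host/device round trip, but with a valid device address and
   matching sizes throughout.  */
#include <openacc.h>
#include <stdlib.h>
#include <string.h>

int main (void)
{
  const int N = 256;
  unsigned char *h = (unsigned char *) malloc (N);
  void *d;
  int i;

  for (i = 0; i < N; i++)
    h[i] = (unsigned char) i;

  d = acc_malloc (N);                /* device buffer of exactly N bytes */
  acc_memcpy_to_device (d, h, N);    /* host -> device, size matches both buffers */

  memset (h, 0, N);                  /* clobber the host copy */
  acc_memcpy_from_device (h, d, N);  /* device -> host restores the pattern */

  for (i = 0; i < N; i++)
    if (h[i] != (unsigned char) i)
      abort ();

  acc_free (d);
  free (h);
  return 0;
}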
void nrn_init(NrnThread* _nt, Memb_list* _ml, int _type) {
  double* _p; Datum* _ppvar; ThreadDatum* _thread;
  double _v, v; int* _ni; int _iml, _cntml_padded, _cntml_actual;
  _ni = _ml->_nodeindices;
  _cntml_actual = _ml->_nodecount;
  _cntml_padded = _ml->_nodecount_padded;
  _thread = _ml->_thread;
  #pragma acc update device (_mechtype) if(_nt->compute_gpu)

  if (!_thread[_spth1]._pvoid) {
    _thread[_spth1]._pvoid = nrn_cons_sparseobj(_kinetic_kstates_NMDA16_2, 16, _ml, _threadargs_);
#ifdef _OPENACC
    if (_nt->compute_gpu) {
      /* Patch the device copy of the thread data so that its _pvoid slot
         holds the device address of the freshly built SparseObj. */
      void* _d_so = (void*) acc_deviceptr(_thread[_spth1]._pvoid);
      ThreadDatum* _d_td = (ThreadDatum*) acc_deviceptr(_thread);
      acc_memcpy_to_device(&(_d_td[_spth1]._pvoid), &_d_so, sizeof(void*));
    }
#endif
  }
  _acc_globals_update();

  double* _nt_data = _nt->_data;
  double* _vec_v = _nt->_actual_v;
  int stream_id = _nt->stream_id;

  if (_nrn_skip_initmodel == 0) {
#if LAYOUT == 1 /*AoS*/
    for (_iml = 0; _iml < _cntml_actual; ++_iml) {
      _p = _ml->_data + _iml*_psize; _ppvar = _ml->_pdata + _iml*_ppsize;
#elif LAYOUT == 0 /*SoA*/
    _p = _ml->_data; _ppvar = _ml->_pdata;
    /* insert compiler dependent ivdep like pragma */
    _PRAGMA_FOR_VECTOR_LOOP_
    _PRAGMA_FOR_INIT_ACC_LOOP_
    for (_iml = 0; _iml < _cntml_actual; ++_iml) {
#else /* LAYOUT > 1 */ /*AoSoA*/
#error AoSoA not implemented.
    for (;;) { /* help clang-format properly indent */
#endif
      int _nd_idx = _ni[_iml];
      _tsav = -1e20;
      _v = _vec_v[_nd_idx];
      _PRCELLSTATE_V
      v = _v;
      _PRCELLSTATE_V
      nao = _ion_nao;
      initmodel(_threadargs_);
    }
  }

#if NET_RECEIVE_BUFFERING
  NetSendBuffer_t* _nsb = _ml->_net_send_buffer;
#if defined(_OPENACC) && !defined(DISABLE_OPENACC)
  #pragma acc wait(stream_id)
  #pragma acc update self(_nsb->_cnt) if(_nt->compute_gpu)
  update_net_send_buffer_on_host(_nt, _nsb);
#endif
  {
    int _i;
    for (_i = 0; _i < _nsb->_cnt; ++_i) {
      net_sem_from_gpu(_nsb->_sendtype[_i], _nsb->_vdata_index[_i],
                       _nsb->_weight_index[_i], _nt->_id, _nsb->_pnt_index[_i],
                       _nsb->_nsb_t[_i], _nsb->_nsb_flag[_i]);
    }
  }
  _nsb->_cnt = 0;
#if defined(_OPENACC) && !defined(DISABLE_OPENACC)
  #pragma acc update device(_nsb->_cnt) if(_nt->compute_gpu)
#endif
#endif
}

static double _nrn_current(_threadargsproto_, double _v) {
  double _current = 0.;
  v = _v;
  {
    {
      g = w * gmax * O;
      i = g * (v - Erev);
    }
    _current += i;
  }
  return _current;
}

#if defined(ENABLE_CUDA_INTERFACE) && defined(_OPENACC)
void nrn_state_launcher(NrnThread*, Memb_list*, int, int);
void nrn_jacob_launcher(NrnThread*, Memb_list*, int, int);
void nrn_cur_launcher(NrnThread*, Memb_list*, int, int);
#endif

void nrn_cur(NrnThread* _nt, Memb_list* _ml, int _type) {
  double* _p; Datum* _ppvar; ThreadDatum* _thread;
  int* _ni; double _rhs, _g, _v, v; int _iml, _cntml_padded, _cntml_actual;
  _ni = _ml->_nodeindices;
  _cntml_actual = _ml->_nodecount;
  _cntml_padded = _ml->_nodecount_padded;
  _thread = _ml->_thread;
  double* _vec_rhs = _nt->_actual_rhs;
  double* _vec_d = _nt->_actual_d;
  double* _vec_shadow_rhs = _nt->_shadow_rhs;
  double* _vec_shadow_d = _nt->_shadow_d;

#if defined(ENABLE_CUDA_INTERFACE) && defined(_OPENACC) && !defined(DISABLE_OPENACC)
  NrnThread* d_nt = acc_deviceptr(_nt);
  Memb_list* d_ml = acc_deviceptr(_ml);
  nrn_cur_launcher(d_nt, d_ml, _type, _cntml_actual);
  return;
#endif

  double* _nt_data = _nt->_data;
  double* _vec_v = _nt->_actual_v;
  int stream_id = _nt->stream_id;
#if LAYOUT == 1 /*AoS*/
  for (_iml = 0; _iml < _cntml_actual; ++_iml) {
    _p = _ml->_data + _iml*_psize; _ppvar = _ml->_pdata + _iml*_ppsize;
#elif LAYOUT == 0 /*SoA*/
  _p = _ml->_data; _ppvar = _ml->_pdata;
  /* insert compiler dependent ivdep like pragma */
  _PRAGMA_FOR_VECTOR_LOOP_
  _PRAGMA_FOR_CUR_SYN_ACC_LOOP_
  for (_iml = 0; _iml < _cntml_actual; ++_iml) {
#else /* LAYOUT > 1 */ /*AoSoA*/
#error AoSoA not implemented.
  for (;;) { /* help clang-format properly indent */
#endif
    int _nd_idx = _ni[_iml];
    _v = _vec_v[_nd_idx];
    _PRCELLSTATE_V
    nao = _ion_nao;
    /* Conductance via finite difference of the current over a .001 mV step. */
    _g = _nrn_current(_threadargs_, _v + .001);
    {
      _rhs = _nrn_current(_threadargs_, _v);
    }
    _g = (_g - _rhs) / .001;
    double _mfact = 1.e2 / (_nd_area);
    _g *= _mfact;
    _rhs *= _mfact;
    _PRCELLSTATE_G
#ifdef _OPENACC
    if (_nt->compute_gpu) {
      /* On the GPU, accumulate directly with atomics; on the host, stage the
         contributions in shadow vectors and reduce them after the loop. */
      #pragma acc atomic update
      _vec_rhs[_nd_idx] -= _rhs;
      #pragma acc atomic update
      _vec_d[_nd_idx] += _g;
    } else {
      _vec_shadow_rhs[_iml] = _rhs;
      _vec_shadow_d[_iml] = _g;
    }
#else
    _vec_shadow_rhs[_iml] = _rhs;
    _vec_shadow_d[_iml] = _g;
#endif
  }
#ifdef _OPENACC
  if (!(_nt->compute_gpu)) {
    for (_iml = 0; _iml < _cntml_actual; ++_iml) {
      int _nd_idx = _ni[_iml];
      _vec_rhs[_nd_idx] -= _vec_shadow_rhs[_iml];
      _vec_d[_nd_idx] += _vec_shadow_d[_iml];
    }
#else
  for (_iml = 0; _iml < _cntml_actual; ++_iml) {
    int _nd_idx = _ni[_iml];
    _vec_rhs[_nd_idx] -= _vec_shadow_rhs[_iml];
    _vec_d[_nd_idx] += _vec_shadow_d[_iml];
#endif
  }
}

void nrn_state(NrnThread* _nt, Memb_list* _ml, int _type) {
  double* _p; Datum* _ppvar; ThreadDatum* _thread;
  double v, _v = 0.0; int* _ni; int _iml, _cntml_padded, _cntml_actual;
  _ni = _ml->_nodeindices;
  _cntml_actual = _ml->_nodecount;
  _cntml_padded = _ml->_nodecount_padded;
  _thread = _ml->_thread;

#if defined(ENABLE_CUDA_INTERFACE) && defined(_OPENACC) && !defined(DISABLE_OPENACC)
  NrnThread* d_nt = acc_deviceptr(_nt);
  Memb_list* d_ml = acc_deviceptr(_ml);
  nrn_state_launcher(d_nt, d_ml, _type, _cntml_actual);
  return;
#endif

  double* _nt_data = _nt->_data;
  double* _vec_v = _nt->_actual_v;
  int stream_id = _nt->stream_id;
#if LAYOUT == 1 /*AoS*/
  for (_iml = 0; _iml < _cntml_actual; ++_iml) {
    _p = _ml->_data + _iml*_psize; _ppvar = _ml->_pdata + _iml*_ppsize;
#elif LAYOUT == 0 /*SoA*/
  _p = _ml->_data; _ppvar = _ml->_pdata;
  /* insert compiler dependent ivdep like pragma */
  _PRAGMA_FOR_VECTOR_LOOP_
  _PRAGMA_FOR_STATE_ACC_LOOP_
  for (_iml = 0; _iml < _cntml_actual; ++_iml) {
#else /* LAYOUT > 1 */ /*AoSoA*/
#error AoSoA not implemented.
  for (;;) { /* help clang-format properly indent */
#endif
    int _nd_idx = _ni[_iml];
    _v = _vec_v[_nd_idx];
    _PRCELLSTATE_V
    v = _v;
    {
      nao = _ion_nao;
      {
#if !defined(_kinetic_kstates_NMDA16_2)
#define _kinetic_kstates_NMDA16_2 0
#endif
        sparse_thread((SparseObj*)_thread[_spth1]._pvoid, 16, _slist1, _dlist1,
                      &t, dt, _kinetic_kstates_NMDA16_2, _linmat1, _threadargs_);
      }
    }
  }
}

static void terminal() {}

static void _initlists() {
  double _x; double* _p = &_x;
  int _i;
  static int _first = 1;
  int _cntml_actual = 1;
  int _cntml_padded = 1;
  int _iml = 0;
  if (!_first) return;

  _slist1 = (int*)malloc(sizeof(int)*16);
  _dlist1 = (int*)malloc(sizeof(int)*16);
  _slist1[0] = &(RA2sMg) - _p;   _dlist1[0] = &(DRA2sMg) - _p;
  _slist1[1] = &(OMg) - _p;      _dlist1[1] = &(DOMg) - _p;
  _slist1[2] = &(O) - _p;        _dlist1[2] = &(DO) - _p;
  _slist1[3] = &(RA2fMg) - _p;   _dlist1[3] = &(DRA2fMg) - _p;
  _slist1[4] = &(RA2d2Mg) - _p;  _dlist1[4] = &(DRA2d2Mg) - _p;
  _slist1[5] = &(RA2d1Mg) - _p;  _dlist1[5] = &(DRA2d1Mg) - _p;
  _slist1[6] = &(RA2Mg) - _p;    _dlist1[6] = &(DRA2Mg) - _p;
  _slist1[7] = &(RAMg) - _p;     _dlist1[7] = &(DRAMg) - _p;
  _slist1[8] = &(RMg) - _p;      _dlist1[8] = &(DRMg) - _p;
  _slist1[9] = &(RA2s) - _p;     _dlist1[9] = &(DRA2s) - _p;
  _slist1[10] = &(RA2f) - _p;    _dlist1[10] = &(DRA2f) - _p;
  _slist1[11] = &(RA2d2) - _p;   _dlist1[11] = &(DRA2d2) - _p;
  _slist1[12] = &(RA2d1) - _p;   _dlist1[12] = &(DRA2d1) - _p;
  _slist1[13] = &(RA2) - _p;     _dlist1[13] = &(DRA2) - _p;
  _slist1[14] = &(RA) - _p;      _dlist1[14] = &(DRA) - _p;
  _slist1[15] = &(R) - _p;       _dlist1[15] = &(DR) - _p;
  #pragma acc enter data copyin(_slist1[0:16])
  #pragma acc enter data copyin(_dlist1[0:16])
  _first = 0;
}

} // namespace coreneuron_lib