end genesis __kernel void kernel_mc02_orig(unsigned int outer_tc, unsigned int inner_tc, __global __volatile float *arr){ for (unsigned int it00 = get_local_id(0); it00 < outer_tc; it00 += get_local_size(0)) { for (unsigned int it01 = get_local_id(1); it01 < inner_tc; it01 += get_local_size(1)) { ${epoch[5]} } } }
size_t _CL_OVERLOADABLE get_global_id(unsigned int dimindx) { switch(dimindx) { /* TODO: add get_global_offset(X) to these! */ case 0: return get_local_size(0) * get_group_id(0) + get_local_id(0); case 1: return get_local_size(1) * get_group_id(1) + get_local_id(1); case 2: return get_local_size(2) * get_group_id(2) + get_local_id(2); default: return 0; } }
/// \fn _copyKernel /// \brief generate a copy kernel program compute::program _copyKernel(const compute::context& context) { const char source[] = BOOST_COMPUTE_STRINGIZE_SOURCE( __kernel void copy_kernel(__global const float *src, __global float *dst) { uint x = get_group_id(0) * TILE_DIM + get_local_id(0); uint y = get_group_id(1) * TILE_DIM + get_local_id(1); uint width = get_num_groups(0) * TILE_DIM; for(uint i = 0 ; i < TILE_DIM ; i+= BLOCK_ROWS) { dst[(y+i)*width +x] = src[(y+i)*width + x]; } }
__kernel void memset_uint4(__global int *mem, const int size, __private int val) { \n\ int tid = get_local_id(0); \n\ int bx = (get_group_id(1)) * (get_num_groups(0)) + get_group_id(0); \n\ int i = tid + (bx) * (get_local_size(0)); \n\ //debug \n\ //if (i == 0) { printf(\"memset size = %i value = %i buffer %i \\n\",size,val,mem[0]); } \n\ if (i < size ) { mem[i]=val; } \n\ }" };
int get_global_id(unsigned int dim) { if (dim < _numDimensions) { return _globalIds[dim] * _localSizes[dim] + get_local_id(dim); } return 0; }
std::vector<unsigned> get_nodes(unsigned elem_index) const { std::vector<unsigned> node_conn(m_bulkData.num_nodes(m_localIdToElement[elem_index])); const stk::mesh::Entity* nodes = m_bulkData.begin_nodes(m_localIdToElement[elem_index]); for(size_t i=0;i<node_conn.size();++i) node_conn[i] = get_local_id(nodes[i]); return node_conn; }
void ComputeGradinetsRTLR_SetGradients(local float* tmpGradients, global float* gradients, global float* gradientSums) { Reduce_Sum(tmpGradients); if (get_local_id(0) == 0) { int ijValueIndex = get_group_id(0); if (gradients != null) gradients[ijValueIndex] = tmpGradients[0]; if (gradientSums != null) gradientSums[ijValueIndex] += tmpGradients[0]; } }
__kernel void kernel_scan(__global float* input, __global float* output) { int global_idx = get_global_id(0); int local_idx = get_local_id(0); int block_size = get_local_size(0); int group_id = get_group_id(0); output[global_idx] = input[global_idx]; mem_fence(CLK_GLOBAL_MEM_FENCE); for(int i = 1; i < block_size; i <<= 1) { if(global_idx >= i) output[global_idx] += output[global_idx - i]; mem_fence(CLK_GLOBAL_MEM_FENCE); } }
__kernel void kernel_reduce(__global float* input, __global float* output) { int global_idx = get_global_id(0); int local_idx = get_local_id(0); int block_size = get_local_size(0); int group_id = get_group_id(0); for(int i = block_size/2; i > 0; i >>= 1) { if(local_idx < i) input[global_idx] += input[global_idx + i]; barrier(CLK_GLOBAL_MEM_FENCE); } if(local_idx == 0) output[group_id] = input[global_idx]; }
__kernel void custom_daxpy_kernel( __global double * x, const double c ) { // variables int localIndex; // function body // get the index we're working on localIndex = get_local_id( 0 ); // output[ index ] = input[ index ] + c x[ localIndex ] = x[ localIndex ] + c; // clean up return; }
__kernel void metric_area( ulong_t linext_offset, lattice_info info, __global ideal_t *linext, DB, llf_criteria cfg, __local ideal_t *pasv_scratch, __local result_t* scratch, __global result_t* result) { result_t metric = { 0, 0 }; count_t nth_extension = linext_offset + get_global_id(0); linext += info.linext_width * get_global_id(0); pasv_scratch += PASV_SCRATCH_LEN(info.linext_width) * get_local_id(0); ZERO_INIT(pasv_scratch, PASV_SCRATCH_LEN(info.linext_width)); metric = RateBuildpath(linext, db_items, &cfg, pasv_scratch, &info, nth_extension); Reduce(metric, scratch, result); }
__kernel void TRIPLE_DGEMM_UPDATE_192_12_R(__global const double *Ain, uint offAin, __global double *d_dinvA, int blk, uint lda, int npages, int na) { // Ain is the non inverse matrix; the size of Ain is lda * na // offAin is the offset of Ain // d_dinvA is the inversed matrix. the size of d_invA is NB * (na-1)/NB + 1 // blk is subblock size, which is 12 here. // lda in leading dimension. Column major here // npages = (na-1)/12*2 + 1; for 96 this is 4 for 192 this is 8 //Work group size is [12] //global work size is [96*number of blocks] //each work item in each work group is responsible for every element in that row //each work group is responsible for one gemm;\ ////////////// A12*invA22 const uint gidx = get_group_id(0); const uint idx = get_local_id(0); const uint page = gidx % npages; const uint page_block = page / 8;//8 pages per page block const uint page_index_in_block = page % 8; __global double *B, *C; __local double lA[12][12]; __local double lB[12][12]; double privateC[12] = { (double)0 }; //decide A12 location for each page Ain = Ain + offAin; Ain += (page*blk * 2 + blk) * lda + page * 2 * blk; //decide invA22 (B) location for each page B = d_dinvA + page_block*NB*NB + (page_index_in_block*blk * 2 + blk) * NB + page_index_in_block * 2 * blk + blk; //decide invA12 location for each page C = d_dinvA + page_block*NB*NB + (page_index_in_block*blk * 2 + blk) * NB + page_index_in_block * 2 * blk; //read A and B into LDS no transpose operated here lA[idx][0] = Ain[idx]; lA[idx][1] = Ain[idx + lda]; lA[idx][2] = Ain[idx + lda * 2]; lA[idx][3] = Ain[idx + lda * 3]; lA[idx][4] = Ain[idx + lda * 4]; lA[idx][5] = Ain[idx + lda * 5]; lA[idx][6] = Ain[idx + lda * 6]; lA[idx][7] = Ain[idx + lda * 7]; lA[idx][8] = Ain[idx + lda * 8]; lA[idx][9] = Ain[idx + lda * 9]; lA[idx][10] = Ain[idx + lda * 10]; lA[idx][11] = Ain[idx + lda * 11]; lB[idx][0] = B[idx]; lB[idx][1] = B[idx + NB]; lB[idx][2] = B[idx + NB * 2]; lB[idx][3] = B[idx + NB * 3]; lB[idx][4] = B[idx + NB * 4]; lB[idx][5] = B[idx + NB * 5]; lB[idx][6] = B[idx + NB * 6]; lB[idx][7] = B[idx + NB * 7]; lB[idx][8] = B[idx + NB * 8]; lB[idx][9] = B[idx + NB * 9]; lB[idx][10] = B[idx + NB * 10]; lB[idx][11] = B[idx + NB * 11]; barrier(CLK_LOCAL_MEM_FENCE); //do math uint i = 0; do{ privateC[0] = mad(lA[idx][i], lB[i][0], privateC[0]); privateC[1] = mad(lA[idx][i], lB[i][1], privateC[1]); privateC[2] = mad(lA[idx][i], lB[i][2], privateC[2]); privateC[3] = mad(lA[idx][i], lB[i][3], privateC[3]); privateC[4] = mad(lA[idx][i], lB[i][4], privateC[4]); privateC[5] = mad(lA[idx][i], lB[i][5], privateC[5]); privateC[6] = mad(lA[idx][i], lB[i][6], privateC[6]); privateC[7] = mad(lA[idx][i], lB[i][7], privateC[7]); privateC[8] = mad(lA[idx][i], lB[i][8], privateC[8]); privateC[9] = mad(lA[idx][i], lB[i][9], privateC[9]); privateC[10] = mad(lA[idx][i], lB[i][10], privateC[10]); privateC[11] = mad(lA[idx][i], lB[i][11], privateC[11]); //mem_fence(CLK_LOCAL_MEM_FENCE); i = i + 1; } while (i < 12); i = 0; do{ C[NB*i + idx] = privateC[i]; i = i + 1; } while (i < 12); ////////////// -invA11*invA12 barrier(CLK_GLOBAL_MEM_FENCE); //A is moving to invA11 __global double *A; A = d_dinvA + page_block*NB*NB + ((page % 4)*blk * 2) * NB + (page % 4) * 2 * blk; //both B and C are pointing at invA12 B = C; //read A and B into LDS no transpose operated here lA[idx][0] = A[idx]; lA[idx][1] = A[idx + NB]; lA[idx][2] = A[idx + NB * 2]; lA[idx][3] = A[idx + NB * 3]; lA[idx][4] = A[idx + NB * 4]; lA[idx][5] = A[idx + NB * 5]; lA[idx][6] = A[idx + NB * 6]; lA[idx][7] = A[idx + NB * 7]; lA[idx][8] = A[idx + NB * 8]; lA[idx][9] = A[idx + NB * 9]; lA[idx][10] = A[idx + NB * 10]; lA[idx][11] = A[idx + NB * 11]; lB[idx][0] = B[idx]; lB[idx][1] = B[idx + NB]; lB[idx][2] = B[idx + NB * 2]; lB[idx][3] = B[idx + NB * 3]; lB[idx][4] = B[idx + NB * 4]; lB[idx][5] = B[idx + NB * 5]; lB[idx][6] = B[idx + NB * 6]; lB[idx][7] = B[idx + NB * 7]; lB[idx][8] = B[idx + NB * 8]; lB[idx][9] = B[idx + NB * 9]; lB[idx][10] = B[idx + NB * 10]; lB[idx][11] = B[idx + NB * 11]; barrier(CLK_LOCAL_MEM_FENCE); //do math i = 0; privateC[0] = 0; privateC[1] = 0; privateC[2] = 0; privateC[3] = 0; privateC[4] = 0; privateC[5] = 0; privateC[6] = 0; privateC[7] = 0; privateC[8] = 0; privateC[9] = 0; privateC[10] = 0; privateC[11] = 0; do{ privateC[0] = mad(lA[idx][i], lB[i][0], privateC[0]); privateC[1] = mad(lA[idx][i], lB[i][1], privateC[1]); privateC[2] = mad(lA[idx][i], lB[i][2], privateC[2]); privateC[3] = mad(lA[idx][i], lB[i][3], privateC[3]); privateC[4] = mad(lA[idx][i], lB[i][4], privateC[4]); privateC[5] = mad(lA[idx][i], lB[i][5], privateC[5]); privateC[6] = mad(lA[idx][i], lB[i][6], privateC[6]); privateC[7] = mad(lA[idx][i], lB[i][7], privateC[7]); privateC[8] = mad(lA[idx][i], lB[i][8], privateC[8]); privateC[9] = mad(lA[idx][i], lB[i][9], privateC[9]); privateC[10] = mad(lA[idx][i], lB[i][10], privateC[10]); privateC[11] = mad(lA[idx][i], lB[i][11], privateC[11]); //mem_fence(CLK_LOCAL_MEM_FENCE); i = i + 1; } while (i < 12); i = 0; do{ C[NB*i + idx] = -1 * privateC[i]; i = i + 1; } while (i < 12); }
MAYINLINE G4double G4NewNavigation_ComputeStep( G4NewNavigation *This, G4ThreeVector localPoint, G4ThreeVector localDirection, const G4double currentProposedStepLength, G4double *newSafety, G4NavigationHistory *history, G4bool *validExitNormal, G4ThreeVector *exitNormal, G4bool *exiting, G4bool *entering, GEOMETRYLOC G4VPhysicalVolume *(*pBlockedPhysical) ,SHAREDMEM int * Numbers_Of_Solid, SHAREDMEM int * Sum_Of_Solids, SHAREDTYPE SolidInfo * Solids, SHAREDTYPE ResultInfo * Result_For_Current_Solid, SHAREDTYPE FinalResult * Compacter_Result, SHAREDMEM bool * noStepArray, SHAREDMEM PointInformation * LocationArray, GEOMETRYLOC G4SmartVoxelNode *nullVNode, G4bool cur_vol_local #ifdef CHECK ,GEOMETRYLOC float * Result #endif ) { GEOMETRYLOC G4VPhysicalVolume *motherPhysical, *samplePhysical, *blockedExitedVol = GEOMETRYNULL; GEOMETRYLOC G4LogicalVolume *motherLogical; GEOMETRYLOC G4VSolid *motherSolid; G4ThreeVector sampleDirection; G4double ourStep=currentProposedStepLength, motherSafety, ourSafety; G4int sampleNo; // , localNoDaughters; //EDIT: Defining these here for global scope. GEOMETRYLOC G4VSolid *sampleSolid; G4AffineTransform sampleTf; const G4ThreeVector samplePoint; G4double sampleSafety; int PrevSum; // PrevSum stores the sum of all solids of one type so far. int Count_of_Solid_type; // This integer is to fill up the shared mem array for solid types // For definition of shared memory arrays see kernel trace in gpu.c // _--------------------------------__ G4bool initialNode; GEOMETRYLOC const G4SmartVoxelNode *curVoxelNode; G4int curNoVolumes, contentNo; G4double voxelSafety; G4double sampleStep; #if ( GLOBAL_MODE == 1) int locationId = get_global_id(0); #else int locationId = get_local_id(0); #endif motherPhysical = G4NavigationHistory_GetTopVolume( history ); motherLogical = G4VPhysicalVolume_GetLogicalVolume(motherPhysical); motherSolid = G4LogicalVolume_GetSolid(motherLogical); // // Compute mother safety // motherSafety = G4VSolid_DistanceToOut(motherSolid, localPoint); ourSafety = motherSafety; // Working isotropic safety // // Compute daughter safeties & intersections // // Exiting normal optimisation // if ( *exiting && *validExitNormal ) { if ( G4ThreeVector_dot(localDirection,*exitNormal)>=kMinExitingNormalCosine ) { // Block exited daughter volume // blockedExitedVol = *pBlockedPhysical; ourSafety = 0; } } *exiting = false; *entering = false; #ifdef USE_BLIST G4NewNavigation_EnlargeAndResetBlist( This, G4LogicalVolume_GetNoDaughters(motherLogical) ); #endif // noStepLocal -> Local boolean storing the noStep value for the current track. // noStepAll -> Value returned after a reduction of all noStepLocals initialNode = true; G4bool noStepLocal = true; G4bool noStepAll = true; // GEOMETRYLOC G4SmartVoxelNode * nullVNodePointer; //G4SmartVoxelNode nullNode; //nullVNodePointer = &nullNode; int counter = -1; while ( noStepAll ) { counter ++; if( !noStepLocal ) { curVoxelNode = nullVNode; curNoVolumes = 0; } else { curVoxelNode = This->fVoxelNode; curNoVolumes = G4VoxelNode_GetNoContained( curVoxelNode ); } int number_of_volume_types = Solidcount, Curr_vol_type=0; // Volume type is from the enum ESolids in everything.h int current_solid_sum = 0; // if (GLOBAL_MODE ==1) // number_of_threads = get_global_size(0); int number_of_threads = get_local_size(0); // STEP 1: Iteration through all volume types. Min is calculated one type at a time. //REMEMBER:EDIT: Changed starting value of Curr_vol_type to 1 for (Curr_vol_type = 1; Curr_vol_type < number_of_volume_types; (Curr_vol_type)++ ) { current_solid_sum = 0; int number_of_solids; number_of_solids = curVoxelNode->SolidType[ Curr_vol_type ]; // The number of solids of this type in the current voxel for the current track // NOTE: There may be an overhead here. If more than one thread is in the same voxel and reads this data, mem access not coalesced Numbers_Of_Solid[ locationId ] = number_of_solids; // At this stage a parallel scan sum has to be called which updates the sums for all the solids. The function is inlined. BARRIER_FLEXIBLE; // Barrier to ensure that all threads have filled up the shared mem array. See everything.h for definition //int all_threads = get_global_size(1); Prefix_Sum( Numbers_Of_Solid, Sum_Of_Solids, number_of_threads ); // The array Sum_Of_Solids stores the final result after the Prefix sum scan. //Result[PrevSum + current_solid_sum ] = locationId; PrevSum = Sum_Of_Solids[ locationId ]; // NOTE: The use of contentNo below is to maintain some resemblance to Otto's definition in VoxelNavigation.c if ( noStepLocal ) { for( contentNo = curNoVolumes-1; contentNo>=0 ; contentNo--) { sampleNo = G4VoxelNode_GetVolume(curVoxelNode,contentNo); samplePhysical = G4LogicalVolume_GetDaughter(motherLogical,sampleNo); //if ( samplePhysical != blockedExitedVol ) { // NOTE: blockedExitedVol check makes sense for the serial version. Does not make as much sense to keep it on in the parallel version as well. sampleSolid = G4LogicalVolume_GetSolid( G4VPhysicalVolume_GetLogicalVolume( samplePhysical )); // NOTE: We iterate over all solids and compare if the solid type is what we are looking for. If it is then it is added // to the array Solids in shared mem. However, a better implementation is to have a way such that the solids are returned in sorted order // the first place. if( sampleSolid->type == Curr_vol_type) { // Should the solid be stored or the Physical Volume?? SolidInfo Info = { samplePhysical, locationId }; Solids[ PrevSum + current_solid_sum ] = Info; // PrevSum is the sum of all solids of that type for all threads upto this element not including the current thread. // current_solid_sum at the end of all iterations should be equal to the number_of_solids. current_solid_sum++; } } } } //EDIT: Change to BARRIER_LOCAL when not testing GLOBAL_MODE BARRIER_FLEXIBLE; //MODIFY : Perhaps a check is in order here. // One check at this point could be whether current_solid_sum is equal to the number_of_solids. // Before proceeding to call the kernel for calculating the min, check if code works up to this point int Total_solids_of_this_type; Total_solids_of_this_type = Numbers_Of_Solid[ number_of_threads - 1 ] + Sum_Of_Solids[ number_of_threads - 1 ]; if ( Total_solids_of_this_type > BlockSize * Multiplier) { Total_solids_of_this_type = BlockSize * Multiplier; // Update Errors or return error from here? } // Checking a candidate solid of current type. int k=0; int iterations = (Total_solids_of_this_type / number_of_threads); for( k = 0; k < iterations ; k++ ) { int Work_id = locationId + number_of_threads*k; if( ( Work_id ) < Total_solids_of_this_type) { GEOMETRYLOC G4VPhysicalVolume * candPhysical = ( Solids[ Work_id ].PVolume); GEOMETRYLOC G4VSolid * candSolid = ((candPhysical)->flogical)->fSolid; int candId = Solids[ Work_id].trackId; G4ThreeVector candGlobalPoint = LocationArray[candId].Point; G4ThreeVector candGlobalDirection = LocationArray[candId].Direction; G4AffineTransform candTf = G4AffineTransform_create_full( G4VPhysicalVolume_GetObjectRotationValue(candPhysical), G4VPhysicalVolume_GetTranslation(candPhysical)); G4AffineTransform_Invert( &candTf ); const G4ThreeVector candPoint= G4AffineTransform_TransformPoint(&candTf,candGlobalPoint); const G4double candSafety = G4VSolid_DistanceToIn( candSolid, candPoint ); const G4ThreeVector candDirection = G4AffineTransform_TransformAxis( &candTf, candGlobalDirection ); const G4double candStep = G4VSolid_DistanceToIn_full( candSolid, candPoint, candDirection ); // Result_For_Current_Solid should hold data for the sampleSafety and the sampleStep // NOTE: In this version of navigation the step is calculated along with the safety. It is not checked if // step<safety; the safety is only calculated here because the physics may require it later. ResultInfo Result_of_Solid = { candSafety, candStep, candId, candPhysical } ; Result_For_Current_Solid[ Work_id ] = Result_of_Solid ; } BARRIER_FLEXIBLE; // REMOVE: Barrier used in debugging stage. Remove when done. } // At this point all the safeties and steps have been calculated, now to find the minimum step for the current solid per track. // find_minimum... // Most basic implementation for finding minimum, a proper min finding algorithm that takes into account threadIds is hard to find. Find_minimum ( Result_For_Current_Solid, Compacter_Result, PrevSum, number_of_solids ); // Minimum finding function that finds the minimum step per track in Result_For_Current_Solid and // stores the minimum step, the safety and the pointer to the sampleSolid ( physical? ) // The minimum is basically compared to the existing value and stored if smaller. This way find_minimum does not // care about which solid type is currently being processed. // See gpu.c where Compacter Results initial step value is set to kInfinfity. //if( Curr_vol_type ==1) } //if( Curr_vol_type == 1) if ( Compacter_Result[ locationId ].step <= ourStep ) { ourStep = Compacter_Result[ locationId ].step; *pBlockedPhysical = Compacter_Result[ locationId ].PVolume; *entering = true; *exiting = false; } if (initialNode) { initialNode = false; voxelSafety = G4NewNavigation_ComputeVoxelSafety(This,localPoint); if ( voxelSafety<ourSafety ) { ourSafety = voxelSafety; } if ( currentProposedStepLength<ourSafety ) { // Guaranteed physics limited // noStepLocal = false; *entering = false; *exiting = false; *pBlockedPhysical = GEOMETRYNULL; ourStep = kInfinity; } else { // // Compute mother intersection if required // if ( motherSafety<=ourStep ) { G4double motherStep = G4VSolid_DistanceToOut_full( motherSolid, localPoint, localDirection, true, validExitNormal, exitNormal); if ( motherStep<=ourStep ) { ourStep = motherStep; *exiting = true; *entering = false; if ( *validExitNormal ) { G4RotationMatrix rot = G4VPhysicalVolume_GetObjectRotationValue(motherPhysical); G4RotationMatrix inv = G4RotationMatrix_inverse(&rot); *exitNormal = G4RotationMatrix_apply( &inv, *exitNormal ); } } else { *validExitNormal = false; } } } *newSafety = ourSafety; } if (noStepLocal) { noStepLocal = G4NewNavigation_LocateNextVoxel(This, localPoint, localDirection, ourStep); } noStepArray[ locationId ] = noStepLocal; BARRIER_FLEXIBLE; noStepAll = NoStepReduction( noStepArray, number_of_threads); //Prefix_Sum ( noStepArray, noStepArray, number_of_threads); } // end -while (noStep)- loop // Double check to make sure all threads have reached here before exiting BARRIER_FLEXIBLE; //Result[ locationId ] = ourStep; return ourStep; }
__kernel void TRIPLE_DGEMM_UPDATE_192_24_PART1_R(__global const double *Ain, uint offAin, __global double *d_dinvA, int blk, uint lda, int npages, int na) { // Ain is the non inverse matrix; the size of Ain is lda * na // offAin is the offset of Ain // d_dinvA is the inversed matrix. the size of d_invA is NB * (na-1)/NB + 1 // blk is subblock size, which is 24 here. // lda in leading dimension. Column major here // npages = (na-1)/12*2 + 1; for 96 this is 2 for 192 this is 4 //Work group size is [24, 2] //global work size is [96*number of blocks, 2] //each work item in each work group is responsible for 12 elements (half) in that row //each work group is responsible for one gemm; ////////////// A12*invA22 const uint gidx = get_group_id(0); const uint gidy = get_group_id(1); const uint idx = get_local_id(0); const uint idy = get_local_id(1); const uint page = gidx % npages; //0-3 for 192; 0-1 for 96 const uint page_block = page / 4; //4 pages per page block __global double *B, *C; __local double lA[24][24]; __local double lB[24][24]; double privateC[12] = { (double)0 }; //decide A12 location for each page Ain = Ain + offAin; Ain += (page*blk * 2 + blk) * lda + page * 2 * blk; //decide invA22 (B) location for each page B = d_dinvA + page_block*NB*NB + ((page % 4)*blk * 2 + blk) * NB + (page % 4) * 2 * blk + blk; //decide invA12 location for each page C = d_dinvA + page_block*NB*NB + ((page % 4)*blk * 2 + blk) * NB + (page % 4) * 2 * blk; //read A and B into LDS no transpose operated here //each work iteam loads half a row lA[idx][0 + idy * 12] = Ain[idx + idy * 12 * lda]; lA[idx][1 + idy * 12] = Ain[idx + lda + idy * 12 * lda]; lA[idx][2 + idy * 12] = Ain[idx + lda * 2 + idy * 12 * lda]; lA[idx][3 + idy * 12] = Ain[idx + lda * 3 + idy * 12 * lda]; lA[idx][4 + idy * 12] = Ain[idx + lda * 4 + idy * 12 * lda]; lA[idx][5 + idy * 12] = Ain[idx + lda * 5 + idy * 12 * lda]; lA[idx][6 + idy * 12] = Ain[idx + lda * 6 + idy * 12 * lda]; lA[idx][7 + idy * 12] = Ain[idx + lda * 7 + idy * 12 * lda]; lA[idx][8 + idy * 12] = Ain[idx + lda * 8 + idy * 12 * lda]; lA[idx][9 + idy * 12] = Ain[idx + lda * 9 + idy * 12 * lda]; lA[idx][10 + idy * 12] = Ain[idx + lda * 10 + idy * 12 * lda]; lA[idx][11 + idy * 12] = Ain[idx + lda * 11 + idy * 12 * lda]; lB[idx][0 + idy * 12] = B[idx + idy * 12 * NB]; lB[idx][1 + idy * 12] = B[idx + NB + idy * 12 * NB]; lB[idx][2 + idy * 12] = B[idx + NB * 2 + idy * 12 * NB]; lB[idx][3 + idy * 12] = B[idx + NB * 3 + idy * 12 * NB]; lB[idx][4 + idy * 12] = B[idx + NB * 4 + idy * 12 * NB]; lB[idx][5 + idy * 12] = B[idx + NB * 5 + idy * 12 * NB]; lB[idx][6 + idy * 12] = B[idx + NB * 6 + idy * 12 * NB]; lB[idx][7 + idy * 12] = B[idx + NB * 7 + idy * 12 * NB]; lB[idx][8 + idy * 12] = B[idx + NB * 8 + idy * 12 * NB]; lB[idx][9 + idy * 12] = B[idx + NB * 9 + idy * 12 * NB]; lB[idx][10 + idy * 12] = B[idx + NB * 10 + idy * 12 * NB]; lB[idx][11 + idy * 12] = B[idx + NB * 11 + idy * 12 * NB]; barrier(CLK_LOCAL_MEM_FENCE); //do math uint i = 0; do{ privateC[0] = mad(lA[idx][i], lB[i][0 + idy * 12], privateC[0]); privateC[1] = mad(lA[idx][i], lB[i][1 + idy * 12], privateC[1]); privateC[2] = mad(lA[idx][i], lB[i][2 + idy * 12], privateC[2]); privateC[3] = mad(lA[idx][i], lB[i][3 + idy * 12], privateC[3]); privateC[4] = mad(lA[idx][i], lB[i][4 + idy * 12], privateC[4]); privateC[5] = mad(lA[idx][i], lB[i][5 + idy * 12], privateC[5]); privateC[6] = mad(lA[idx][i], lB[i][6 + idy * 12], privateC[6]); privateC[7] = mad(lA[idx][i], lB[i][7 + idy * 12], privateC[7]); privateC[8] = mad(lA[idx][i], lB[i][8 + idy * 12], privateC[8]); privateC[9] = mad(lA[idx][i], lB[i][9 + idy * 12], privateC[9]); privateC[10] = mad(lA[idx][i], lB[i][10 + idy * 12], privateC[10]); privateC[11] = mad(lA[idx][i], lB[i][11 + idy * 12], privateC[11]); i = i + 1; } while (i < 24); i = 0; do{ C[NB*idy * 12 + NB*i + idx] = privateC[i]; i = i + 1; } while (i < 12); }
__kernel void SliceTimingCorrection(__global float* Corrected_Volumes, __global const float* Volumes, __private float delta, __private int DATA_W, __private int DATA_H, __private int DATA_D, __private int DATA_T) { int x = get_global_id(0); int y = get_global_id(1); int3 tIdx = {get_local_id(0), get_local_id(1), get_local_id(2)}; if (x >= DATA_W || y >= DATA_H) return; float t0, t1, t2, t3; // Forward interpolation if (delta > 0.0f) { t0 = Volumes[Calculate3DIndex(x,y,0,DATA_W,DATA_H)]; t1 = t0; t2 = Volumes[Calculate3DIndex(x,y,1,DATA_W,DATA_H)]; t3 = Volumes[Calculate3DIndex(x,y,2,DATA_W,DATA_H)]; // Loop over timepoints for (int t = 0; t < DATA_T - 3; t++) { // Cubic interpolation in time Corrected_Volumes[Calculate3DIndex(x,y,t,DATA_W,DATA_H)] = InterpolateCubic(t0,t1,t2,t3,delta); // Shift old values backwards t0 = t1; t1 = t2; t2 = t3; // Read one new value t3 = Volumes[Calculate3DIndex(x,y,t+3,DATA_W,DATA_H)]; } int t = DATA_T - 3; Corrected_Volumes[Calculate3DIndex(x,y,t,DATA_W,DATA_H)] = InterpolateCubic(t0,t1,t2,t3,delta); t = DATA_T - 2; t0 = t1; t1 = t2; t2 = t3; Corrected_Volumes[Calculate3DIndex(x,y,t,DATA_W,DATA_H)] = InterpolateCubic(t0,t1,t2,t3,delta); t = DATA_T - 1; t0 = t1; t1 = t2; Corrected_Volumes[Calculate3DIndex(x,y,t,DATA_W,DATA_H)] = InterpolateCubic(t0,t1,t2,t3,delta); } // Backward interpolation else { delta = 1.0f - (-delta); t0 = Volumes[Calculate3DIndex(x,y,0,DATA_W,DATA_H)]; t1 = t0; t2 = t0; t3 = Volumes[Calculate3DIndex(x,y,1,DATA_W,DATA_H)]; // Loop over timepoints for (int t = 0; t < DATA_T - 2; t++) { // Cubic interpolation in time Corrected_Volumes[Calculate3DIndex(x,y,t,DATA_W,DATA_H)] = InterpolateCubic(t0,t1,t2,t3,delta); // Shift old values backwards t0 = t1; t1 = t2; t2 = t3; // Read one new value t3 = Volumes[Calculate3DIndex(x,y,t+2,DATA_W,DATA_H)]; } int t = DATA_T - 2; Corrected_Volumes[Calculate3DIndex(x,y,t,DATA_W,DATA_H)] = InterpolateCubic(t0,t1,t2,t3,delta); t = DATA_T - 1; t0 = t1; t1 = t2; t2 = t3; Corrected_Volumes[Calculate3DIndex(x,y,t,DATA_W,DATA_H)] = InterpolateCubic(t0,t1,t2,t3,delta); }
value stride2 sample dist1 report(1) value offset sample dist1 report(1) ${varj} = arr[${stride1}*it00 + ${stride2}*it01 + ${offset}]; end feature epoch value numcomps sample compdist ${computation[${numcomps}]} ${access} end feature loop value linterchange sample dist4 report(1) genif ${linterchange}==0 for (unsigned int it00 = get_local_id(0); it00 < outer_tc; it00 += get_local_size(0)) { for (unsigned int it01 = get_local_id(1); it01 < inner_tc; it01 += get_local_size(1)) { ${epoch[${numepochs}]} } } end genif ${linterchange} == 1 for (unsigned int it01 = get_local_id(0); it01 < inner_tc; it01 += get_local_size(0)) { for (unsigned int it00 = get_local_id(1); it00 < outer_tc; it00 += get_local_size(1)) { ${epoch[${numepochs}]} } } end end generate 5 with dist1 = {1:10},numvardist = {2:8},epochdist = {1:5},compdist = {1:5},dist4 = {0:1}
__kernel void CalculateStatisticalMapsGLMBayesian(__global float* Statistical_Maps, __global float* Beta_Volumes, __global float* AR_Estimates, __global const float* Volumes, __global const float* Mask, __global const int* Seeds, __constant float* c_X_GLM, __constant float* c_InvOmega0, __constant float* c_S00, __constant float* c_S01, __constant float* c_S11, __private int DATA_W, __private int DATA_H, __private int DATA_D, __private int NUMBER_OF_VOLUMES, __private int NUMBER_OF_REGRESSORS, __private int NUMBER_OF_ITERATIONS, __private int slice) { int x = get_global_id(0); int y = get_global_id(1); int z = get_global_id(2); int3 tIdx = {get_local_id(0), get_local_id(1), get_local_id(2)}; if (x >= DATA_W || y >= DATA_H || z >= DATA_D) return; if ( Mask[Calculate3DIndex(x,y,slice,DATA_W,DATA_H)] != 1.0f ) { Statistical_Maps[Calculate4DIndex(x,y,slice,0,DATA_W,DATA_H,DATA_D)] = 0.0f; Statistical_Maps[Calculate4DIndex(x,y,slice,1,DATA_W,DATA_H,DATA_D)] = 0.0f; Statistical_Maps[Calculate4DIndex(x,y,slice,2,DATA_W,DATA_H,DATA_D)] = 0.0f; Statistical_Maps[Calculate4DIndex(x,y,slice,3,DATA_W,DATA_H,DATA_D)] = 0.0f; Statistical_Maps[Calculate4DIndex(x,y,slice,4,DATA_W,DATA_H,DATA_D)] = 0.0f; Statistical_Maps[Calculate4DIndex(x,y,slice,5,DATA_W,DATA_H,DATA_D)] = 0.0f; Beta_Volumes[Calculate4DIndex(x,y,slice,0,DATA_W,DATA_H,DATA_D)] = 0.0f; Beta_Volumes[Calculate4DIndex(x,y,slice,1,DATA_W,DATA_H,DATA_D)] = 0.0f; AR_Estimates[Calculate3DIndex(x,y,slice,DATA_W,DATA_H)] = 0.0f; return; } // Get seed from host int seed = Seeds[Calculate3DIndex(x,y,slice,DATA_W,DATA_H)]; // Prior options float iota = 1.0f; // Decay factor for lag length in prior for rho. float r = 0.5f; // Prior mean on rho1 float c = 0.3f; // Prior standard deviation on first lag. float a0 = 0.01f; // First parameter in IG prior for sigma^2 float b0 = 0.01f; // Second parameter in IG prior for sigma^2 float InvA0 = c * c; // Algorithmic options float prcBurnin = 10.0f; // Percentage of nIter used for burnin. Note: effective number of iter is nIter. float beta[2]; float betaT[2]; int nBurnin = (int)round((float)NUMBER_OF_ITERATIONS*(prcBurnin/100.0f)); int probability1 = 0; int probability2 = 0; int probability3 = 0; int probability4 = 0; int probability5 = 0; int probability6 = 0; float m00[2]; float m01[2]; float m10[2]; float m11[2]; m00[0] = 0.0f; m00[1] = 0.0f; m01[0] = 0.0f; m01[1] = 0.0f; m10[0] = 0.0f; m10[1] = 0.0f; m11[0] = 0.0f; m11[1] = 0.0f; float g00 = 0.0f; float g01 = 0.0f; float g11 = 0.0f; float old_value = Volumes[Calculate3DIndex(x,y,0,DATA_W,DATA_H)]; m00[0] += c_X_GLM[NUMBER_OF_VOLUMES * 0 + 0] * old_value; m00[1] += c_X_GLM[NUMBER_OF_VOLUMES * 1 + 0] * old_value; g00 += old_value * old_value; for (int v = 1; v < NUMBER_OF_VOLUMES; v++) { float value = Volumes[Calculate3DIndex(x,y,v,DATA_W,DATA_H)]; m00[0] += c_X_GLM[NUMBER_OF_VOLUMES * 0 + v] * value; m00[1] += c_X_GLM[NUMBER_OF_VOLUMES * 1 + v] * value; m01[0] += c_X_GLM[NUMBER_OF_VOLUMES * 0 + v] * old_value; m01[1] += c_X_GLM[NUMBER_OF_VOLUMES * 1 + v] * old_value; m10[0] += c_X_GLM[NUMBER_OF_VOLUMES * 0 + (v - 1)] * value; m10[1] += c_X_GLM[NUMBER_OF_VOLUMES * 1 + (v - 1)] * value; m11[0] += c_X_GLM[NUMBER_OF_VOLUMES * 0 + (v - 1)] * old_value; m11[1] += c_X_GLM[NUMBER_OF_VOLUMES * 1 + (v - 1)] * old_value; g00 += value * value; g01 += value * old_value; g11 += old_value * old_value; old_value = value; } float InvOmegaT[2][2]; float OmegaT[2][2]; float Xtildesquared[2][2]; float XtildeYtilde[2]; float Ytildesquared; Xtildesquared[0][0] = c_S00[0 + 0*2]; Xtildesquared[0][1] = c_S00[0 + 1*2]; Xtildesquared[1][0] = c_S00[1 + 0*2]; Xtildesquared[1][1] = c_S00[1 + 1*2]; XtildeYtilde[0] = m00[0]; XtildeYtilde[1] = m00[1]; Ytildesquared = g00; float sigma2; float rho, rhoT, rhoProp, bT; rho = 0.0f; // Loop over iterations for (int i = 0; i < (nBurnin + NUMBER_OF_ITERATIONS); i++) { InvOmegaT[0][0] = c_InvOmega0[0 + 0 * NUMBER_OF_REGRESSORS] + Xtildesquared[0][0]; InvOmegaT[0][1] = c_InvOmega0[0 + 1 * NUMBER_OF_REGRESSORS] + Xtildesquared[0][1]; InvOmegaT[1][0] = c_InvOmega0[1 + 0 * NUMBER_OF_REGRESSORS] + Xtildesquared[1][0]; InvOmegaT[1][1] = c_InvOmega0[1 + 1 * NUMBER_OF_REGRESSORS] + Xtildesquared[1][1]; Invert_2x2(InvOmegaT, OmegaT); betaT[0] = OmegaT[0][0] * XtildeYtilde[0] + OmegaT[0][1] * XtildeYtilde[1]; betaT[1] = OmegaT[1][0] * XtildeYtilde[0] + OmegaT[1][1] * XtildeYtilde[1]; float aT = a0 + (float)NUMBER_OF_VOLUMES/2.0f; float temp[2]; temp[0] = InvOmegaT[0][0] * betaT[0] + InvOmegaT[0][1] * betaT[1]; temp[1] = InvOmegaT[1][0] * betaT[0] + InvOmegaT[1][1] * betaT[1]; bT = b0 + 0.5f * (Ytildesquared - betaT[0] * temp[0] - betaT[1] * temp[1]); // Block 1 - Step 1a. Update sigma2 sigma2 = gamrnd(aT,bT,&seed); // Block 1 - Step 1b. Update beta | sigma2 MultivariateRandom2(beta,betaT,OmegaT,sigma2,&seed); if (i > nBurnin) { if (beta[0] > 0.0f) { probability1++; } if (beta[1] > 0.0f) { probability2++; } if (beta[0] < 0.0f) { probability3++; } if (beta[1] < 0.0f) { probability4++; } if ((beta[0] - beta[1]) > 0.0f) { probability5++; } if ((beta[1] - beta[0]) > 0.0f) { probability6++; } } // Block 2, update rho float zsquared = 0.0f; float zu = 0.0f; float old_eps = 0.0f; // Calculate residuals for (int v = 1; v < NUMBER_OF_VOLUMES; v++) { float eps = Volumes[Calculate3DIndex(x,y,v,DATA_W,DATA_H)]; eps -= c_X_GLM[NUMBER_OF_VOLUMES * 0 + v] * beta[0]; eps -= c_X_GLM[NUMBER_OF_VOLUMES * 1 + v] * beta[1]; zsquared += eps * eps; zu += eps * old_eps; old_eps = eps; } // Generate rho float InvAT = InvA0 + zsquared / sigma2; float AT = 1.0f / InvAT; rhoT = AT * zu / sigma2; MultivariateRandom1(&rhoProp,rhoT,AT,sigma2,&seed); if (myabs(rhoProp) < 1.0f) { rho = rhoProp; } // Prewhitening of regressors and data Xtildesquared[0][0] = c_S00[0 + 0*2] - 2.0f * rho * c_S01[0 + 0*2] + rho * rho * c_S11[0 + 0*2]; Xtildesquared[0][1] = c_S00[0 + 1*2] - 2.0f * rho * c_S01[0 + 1*2] + rho * rho * c_S11[0 + 1*2]; Xtildesquared[1][0] = c_S00[1 + 0*2] - 2.0f * rho * c_S01[1 + 0*2] + rho * rho * c_S11[1 + 0*2]; Xtildesquared[1][1] = c_S00[1 + 1*2] - 2.0f * rho * c_S01[1 + 1*2] + rho * rho * c_S11[1 + 1*2]; XtildeYtilde[0] = m00[0] - rho * (m01[0] + m10[0]) + rho * rho * m11[0]; XtildeYtilde[1] = m00[1] - rho * (m01[1] + m10[1]) + rho * rho * m11[1]; Ytildesquared = g00 - 2.0f * rho * g01 + rho * rho * g11; } Statistical_Maps[Calculate4DIndex(x,y,slice,0,DATA_W,DATA_H,DATA_D)] = (float)probability1/(float)NUMBER_OF_ITERATIONS; Statistical_Maps[Calculate4DIndex(x,y,slice,1,DATA_W,DATA_H,DATA_D)] = (float)probability2/(float)NUMBER_OF_ITERATIONS; Statistical_Maps[Calculate4DIndex(x,y,slice,2,DATA_W,DATA_H,DATA_D)] = (float)probability3/(float)NUMBER_OF_ITERATIONS; Statistical_Maps[Calculate4DIndex(x,y,slice,3,DATA_W,DATA_H,DATA_D)] = (float)probability4/(float)NUMBER_OF_ITERATIONS; Statistical_Maps[Calculate4DIndex(x,y,slice,4,DATA_W,DATA_H,DATA_D)] = (float)probability5/(float)NUMBER_OF_ITERATIONS; Statistical_Maps[Calculate4DIndex(x,y,slice,5,DATA_W,DATA_H,DATA_D)] = (float)probability6/(float)NUMBER_OF_ITERATIONS; Beta_Volumes[Calculate4DIndex(x,y,slice,0,DATA_W,DATA_H,DATA_D)] = beta[0]; Beta_Volumes[Calculate4DIndex(x,y,slice,1,DATA_W,DATA_H,DATA_D)] = beta[1]; AR_Estimates[Calculate3DIndex(x,y,slice,DATA_W,DATA_H)] = rhoT; }
kernel void ComputeGradientsRTLR_V0_CPU( global float* pValuesOfWeights , int uLayersCount , int maxULayerSize , int p_i_j_l_LayerIndex_0_0 , int p_i_j_l_LayerSize_0_0 , global float$* weights_0_0 , int p_i_j_l_LayerIndex_1_0 , int p_i_j_l_LayerSize_1_0 , global float$* weights_1_0 , int p_i_j_l_LayerIndex_2_0 , int p_i_j_l_LayerSize_2_0 , global float$* weights_2_0 , int p_i_j_l_LayerIndex_3_0 , int p_i_j_l_LayerSize_3_0 , global float$* weights_3_0 , int p_i_j_k_LayerSize_0 , global float* netDerivValues_0 , int p_i_j_l_LayerIndex_0_1 , int p_i_j_l_LayerSize_0_1 , global float$* weights_0_1 , int p_i_j_l_LayerIndex_1_1 , int p_i_j_l_LayerSize_1_1 , global float$* weights_1_1 , int p_i_j_l_LayerIndex_2_1 , int p_i_j_l_LayerSize_2_1 , global float$* weights_2_1 , int p_i_j_l_LayerIndex_3_1 , int p_i_j_l_LayerSize_3_1 , global float$* weights_3_1 , int p_i_j_k_LayerSize_1 , global float* netDerivValues_1 , int p_i_j_l_LayerIndex_0_2 , int p_i_j_l_LayerSize_0_2 , global float$* weights_0_2 , int p_i_j_l_LayerIndex_1_2 , int p_i_j_l_LayerSize_1_2 , global float$* weights_1_2 , int p_i_j_l_LayerIndex_2_2 , int p_i_j_l_LayerSize_2_2 , global float$* weights_2_2 , int p_i_j_l_LayerIndex_3_2 , int p_i_j_l_LayerSize_3_2 , global float$* weights_3_2 , int p_i_j_k_LayerSize_2 , global float* netDerivValues_2 , int p_i_j_l_LayerIndex_0_3 , int p_i_j_l_LayerSize_0_3 , global float$* weights_0_3 , int p_i_j_l_LayerIndex_1_3 , int p_i_j_l_LayerSize_1_3 , global float$* weights_1_3 , int p_i_j_l_LayerIndex_2_3 , int p_i_j_l_LayerSize_2_3 , global float$* weights_2_3 , int p_i_j_l_LayerIndex_3_3 , int p_i_j_l_LayerSize_3_3 , global float$* weights_3_3 , int p_i_j_k_LayerSize_3 , global float* netDerivValues_3 , int iLayerIndex , global float* inputs , int inputsSize // + bias (null) = 1, inputs: size , global float* outputs , global float* desiredOutputs , local float* tmpGradients // size = local size , global float* gradients , global float* gradientSums) { int localId = get_local_id(0); int localSize = get_local_size(0); int ijValueIndex = get_group_id(0); int iValueIndex = ijValueIndex / inputsSize; int jValueIndex = ijValueIndex % inputsSize; tmpGradients[localId] = 0.0f; barrier(CLK_LOCAL_MEM_FENCE); // Local size ~ avg uLayerSize for (int kLayerIndex = 0; kLayerIndex < uLayersCount; kLayerIndex++) { int kLayerSize = PickIntValueByLayerIndex(p_i_j_k_LayerSize_0, p_i_j_k_LayerSize_1, p_i_j_k_LayerSize_2, p_i_j_k_LayerSize_3, kLayerIndex); bool computeGradient = (kLayerIndex == uLayersCount - 1) && outputs != null && desiredOutputs != null; int block = kLayerSize / localSize + (kLayerSize % localSize != 0 ? 1 : 0); int kValueIndex = localId * block; int max = kValueIndex + block; if (max > kLayerSize) max = kLayerSize; while (kValueIndex < max) { float sum = (iLayerIndex == kLayerIndex && iValueIndex == kValueIndex) ? (inputs != null ? inputs[jValueIndex] : 1.0f) : 0.0f; int p_i_j_l_LayerIndex = PickIntValueByLayerIndex(p_i_j_l_LayerIndex_0_0, p_i_j_l_LayerIndex_0_1, p_i_j_l_LayerIndex_0_2, p_i_j_l_LayerIndex_0_3, kLayerIndex); int p_i_j_l_LayerSize = PickIntValueByLayerIndex(p_i_j_l_LayerSize_0_0, p_i_j_l_LayerSize_0_1, p_i_j_l_LayerSize_0_2, p_i_j_l_LayerSize_0_3, kLayerIndex); global float$* weights = PickFPValueByLayerIndex$(weights_0_0, weights_0_1, weights_0_2, weights_0_3, kLayerIndex); sum += ComputeForward_Sum$((global float$*)GetPValuesPtr(pValuesOfWeights, uLayersCount, maxULayerSize, p_i_j_l_LayerIndex), p_i_j_l_LayerSize, weights, kValueIndex); p_i_j_l_LayerIndex = PickIntValueByLayerIndex(p_i_j_l_LayerIndex_1_0, p_i_j_l_LayerIndex_1_1, p_i_j_l_LayerIndex_1_2, p_i_j_l_LayerIndex_1_3, kLayerIndex); if (p_i_j_l_LayerIndex != -1) { p_i_j_l_LayerSize = PickIntValueByLayerIndex(p_i_j_l_LayerSize_1_0, p_i_j_l_LayerSize_1_1, p_i_j_l_LayerSize_1_2, p_i_j_l_LayerSize_1_3, kLayerIndex); weights = PickFPValueByLayerIndex$(weights_1_0, weights_1_1, weights_1_2, weights_1_3, kLayerIndex); sum += ComputeForward_Sum$((global float$*)GetPValuesPtr(pValuesOfWeights, uLayersCount, maxULayerSize, p_i_j_l_LayerIndex), p_i_j_l_LayerSize, weights, kValueIndex); } p_i_j_l_LayerIndex = PickIntValueByLayerIndex(p_i_j_l_LayerIndex_2_0, p_i_j_l_LayerIndex_2_1, p_i_j_l_LayerIndex_2_2, p_i_j_l_LayerIndex_2_3, kLayerIndex); if (p_i_j_l_LayerIndex != -1) { p_i_j_l_LayerSize = PickIntValueByLayerIndex(p_i_j_l_LayerSize_2_0, p_i_j_l_LayerSize_2_1, p_i_j_l_LayerSize_2_2, p_i_j_l_LayerSize_2_3, kLayerIndex); weights = PickFPValueByLayerIndex$(weights_2_0, weights_2_1, weights_2_2, weights_2_3, kLayerIndex); sum += ComputeForward_Sum$((global float$*)GetPValuesPtr(pValuesOfWeights, uLayersCount, maxULayerSize, p_i_j_l_LayerIndex), p_i_j_l_LayerSize, weights, kValueIndex); } p_i_j_l_LayerIndex = PickIntValueByLayerIndex(p_i_j_l_LayerIndex_3_0, p_i_j_l_LayerIndex_3_1, p_i_j_l_LayerIndex_3_2, p_i_j_l_LayerIndex_3_3, kLayerIndex); if (p_i_j_l_LayerIndex != -1) { p_i_j_l_LayerSize = PickIntValueByLayerIndex(p_i_j_l_LayerSize_3_0, p_i_j_l_LayerSize_3_1, p_i_j_l_LayerSize_3_2, p_i_j_l_LayerSize_3_3, kLayerIndex); weights = PickFPValueByLayerIndex$(weights_3_0, weights_3_1, weights_3_2, weights_3_3, kLayerIndex); sum += ComputeForward_Sum$((global float$*)GetPValuesPtr(pValuesOfWeights, uLayersCount, maxULayerSize, p_i_j_l_LayerIndex), p_i_j_l_LayerSize, weights, kValueIndex); } global float* netDerivValues = PickFPValueByLayerIndex(netDerivValues_0, netDerivValues_1, netDerivValues_2, netDerivValues_3, kLayerIndex); float p = netDerivValues[kValueIndex] * sum; GetPValuesPtr(pValuesOfWeights, uLayersCount, maxULayerSize, kLayerIndex)[kValueIndex] = p; if (computeGradient) tmpGradients[localId] += (desiredOutputs[kValueIndex] - outputs[kValueIndex]) * p; kValueIndex++; } barrier(CLK_LOCAL_MEM_FENCE); } if (gradients != null || gradientSums != null) { ComputeGradinetsRTLR_SetGradients(tmpGradients, gradients, gradientSums); } /*int pValuesOfWeightsSize2 = uLayersCount * maxULayerSize; int block = pValuesOfWeightsSize2 / localSize + (pValuesOfWeightsSize2 % localSize != 0 ? 1 : 0); int kLayerAndValueIndex = localId * block; int max = kLayerAndValueIndex + block; if (max > pValuesOfWeightsSize2) max = pValuesOfWeightsSize2; while (kLayerAndValueIndex < max) { int kLayerIndex = kLayerAndValueIndex / maxULayerSize; int kValueIndex = kLayerAndValueIndex % maxULayerSize; int kLayerSize = PickIntValueByLayerIndex(p_i_j_k_LayerSize_0, p_i_j_k_LayerSize_1, p_i_j_k_LayerSize_2, p_i_j_k_LayerSize_3, kLayerIndex); if (kValueIndex < kLayerSize) { bool computeGradient = (kLayerIndex == uLayersCount - 1) && outputs != null && desiredOutputs != null; float sum = (iLayerIndex == kLayerIndex && iValueIndex == kValueIndex) ? (inputs != null ? inputs[jValueIndex] : 1.0f) : 0.0f; int p_i_j_l_LayerIndex = PickIntValueByLayerIndex(p_i_j_l_LayerIndex_0_0, p_i_j_l_LayerIndex_0_1, p_i_j_l_LayerIndex_0_2, p_i_j_l_LayerIndex_0_3, kLayerIndex); int p_i_j_l_LayerSize = PickIntValueByLayerIndex(p_i_j_l_LayerSize_0_0, p_i_j_l_LayerSize_0_1, p_i_j_l_LayerSize_0_2, p_i_j_l_LayerSize_0_3, kLayerIndex); global float$* weights = PickFPValueByLayerIndex$(weights_0_0, weights_0_1, weights_0_2, weights_0_3, kLayerIndex); sum += ComputeForward_Sum$((global float$*)GetPValuesPtr(pValuesOfWeights, uLayersCount, maxULayerSize, p_i_j_l_LayerIndex), p_i_j_l_LayerSize, weights, kValueIndex); p_i_j_l_LayerIndex = PickIntValueByLayerIndex(p_i_j_l_LayerIndex_1_0, p_i_j_l_LayerIndex_1_1, p_i_j_l_LayerIndex_1_2, p_i_j_l_LayerIndex_1_3, kLayerIndex); if (p_i_j_l_LayerIndex != -1) { p_i_j_l_LayerSize = PickIntValueByLayerIndex(p_i_j_l_LayerSize_1_0, p_i_j_l_LayerSize_1_1, p_i_j_l_LayerSize_1_2, p_i_j_l_LayerSize_1_3, kLayerIndex); weights = PickFPValueByLayerIndex$(weights_1_0, weights_1_1, weights_1_2, weights_1_3, kLayerIndex); sum += ComputeForward_Sum$((global float$*)GetPValuesPtr(pValuesOfWeights, uLayersCount, maxULayerSize, p_i_j_l_LayerIndex), p_i_j_l_LayerSize, weights, kValueIndex); } p_i_j_l_LayerIndex = PickIntValueByLayerIndex(p_i_j_l_LayerIndex_2_0, p_i_j_l_LayerIndex_2_1, p_i_j_l_LayerIndex_2_2, p_i_j_l_LayerIndex_2_3, kLayerIndex); if (p_i_j_l_LayerIndex != -1) { p_i_j_l_LayerSize = PickIntValueByLayerIndex(p_i_j_l_LayerSize_2_0, p_i_j_l_LayerSize_2_1, p_i_j_l_LayerSize_2_2, p_i_j_l_LayerSize_2_3, kLayerIndex); weights = PickFPValueByLayerIndex$(weights_2_0, weights_2_1, weights_2_2, weights_2_3, kLayerIndex); sum += ComputeForward_Sum$((global float$*)GetPValuesPtr(pValuesOfWeights, uLayersCount, maxULayerSize, p_i_j_l_LayerIndex), p_i_j_l_LayerSize, weights, kValueIndex); } p_i_j_l_LayerIndex = PickIntValueByLayerIndex(p_i_j_l_LayerIndex_3_0, p_i_j_l_LayerIndex_3_1, p_i_j_l_LayerIndex_3_2, p_i_j_l_LayerIndex_3_3, kLayerIndex); if (p_i_j_l_LayerIndex != -1) { p_i_j_l_LayerSize = PickIntValueByLayerIndex(p_i_j_l_LayerSize_3_0, p_i_j_l_LayerSize_3_1, p_i_j_l_LayerSize_3_2, p_i_j_l_LayerSize_3_3, kLayerIndex); weights = PickFPValueByLayerIndex$(weights_3_0, weights_3_1, weights_3_2, weights_3_3, kLayerIndex); sum += ComputeForward_Sum$((global float$*)GetPValuesPtr(pValuesOfWeights, uLayersCount, maxULayerSize, p_i_j_l_LayerIndex), p_i_j_l_LayerSize, weights, kValueIndex); } global float* netDerivValues = PickFPValueByLayerIndex(netDerivValues_0, netDerivValues_1, netDerivValues_2, netDerivValues_3, kLayerIndex); float p = netDerivValues[kValueIndex] * sum; GetPValuesPtr(pValuesOfWeights, uLayersCount, maxULayerSize, kLayerIndex)[kValueIndex] = p; if (computeGradient) tmpGradients[localId] += (desiredOutputs[kValueIndex] - outputs[kValueIndex]) * p; } kLayerAndValueIndex++; }*/ }
__kernel void TRIPLE_DGEMM_UPDATE_192_96_PART2_R(__global const double *Ain, uint offAin, __global double *d_dinvA, int blk, int lda, int npages, int na) { // Ain is the non inverse matrix; the size of Ain is lda * na // offAin is the offset of Ain // d_dinvA is the inversed matrix. the size of d_invA is NB * (na-1)/NB + 1 // blk is subblock size, which is 48 here. // lda in leading dimension. Column major here // npages = (na-1)/12*2 + 1; for 96 this is 1 for 192 this is 2 //Work group size is [24, 2] //global work size is [48*number of blocks, 4] //each work item in each work group is responsible for 12 elements (1/4) in that row //each work group is responsible for 24 by 24 macro tile; ////////////// -invA11*invA12 const uint gidx = get_group_id(0); const uint gidy = get_group_id(1); const uint idx = get_local_id(0); const uint idy = get_local_id(1); //uint page = gidx / 2;//0-1 for 192; 0 for 96 //const uint page = (gidx/2)%2;//index of page within a page_block; 1 pages per page_block const uint page_block = gidx / 4; //#index of page_block; 4 WG per page; 4 WG per page_block __global double *A, *B, *C; __local double lA[24][48]; __local double lB[48][24]; double privateC[12] = { (double)0 }; //decide invA11 location for each page //each workgroup loads half of A (left or right) //A = d_dinvA + page*NB*NB + gidx%2*(blk/2); A = d_dinvA + page_block*NB*NB + gidx % 4 * (blk / 4); //decide invA12 (B) location for each page //actually it was saved in invA21 from last kernel //each workgroup loads half of B (up or down) //B = d_dinvA + page*NB*NB + blk*NB + gidy*(blk/2)*NB; B = d_dinvA + page_block*NB*NB + blk*NB + gidy*(blk / 4)*NB; //decide invA12 location for each page //each workgroup writes 1/4 of C //C = d_dinvA + page*NB*NB + blk * NB + gidx%2*(blk/2) + gidy*(blk/2)*NB; C = d_dinvA + page_block*NB*NB + blk*NB + gidx % 4 * (blk / 4) + gidy*(blk / 4)*NB; //read A and B into LDS no transpose operated here //each work item loads a half row of A and half column of B //idx 0-23 idy 0-1 uint block_k = blk / 48; //thus we need 2 iterations here do{ barrier(CLK_LOCAL_MEM_FENCE); lA[idx][0 + idy * 24] = A[idx + idy * 24 * NB]; lA[idx][1 + idy * 24] = A[idx + NB + idy * 24 * NB]; lA[idx][2 + idy * 24] = A[idx + NB * 2 + idy * 24 * NB]; lA[idx][3 + idy * 24] = A[idx + NB * 3 + idy * 24 * NB]; lA[idx][4 + idy * 24] = A[idx + NB * 4 + idy * 24 * NB]; lA[idx][5 + idy * 24] = A[idx + NB * 5 + idy * 24 * NB]; lA[idx][6 + idy * 24] = A[idx + NB * 6 + idy * 24 * NB]; lA[idx][7 + idy * 24] = A[idx + NB * 7 + idy * 24 * NB]; lA[idx][8 + idy * 24] = A[idx + NB * 8 + idy * 24 * NB]; lA[idx][9 + idy * 24] = A[idx + NB * 9 + idy * 24 * NB]; lA[idx][10 + idy * 24] = A[idx + NB * 10 + idy * 24 * NB]; lA[idx][11 + idy * 24] = A[idx + NB * 11 + idy * 24 * NB]; lA[idx][12 + idy * 24] = A[idx + NB * 12 + idy * 24 * NB]; lA[idx][13 + idy * 24] = A[idx + NB * 13 + idy * 24 * NB]; lA[idx][14 + idy * 24] = A[idx + NB * 14 + idy * 24 * NB]; lA[idx][15 + idy * 24] = A[idx + NB * 15 + idy * 24 * NB]; lA[idx][16 + idy * 24] = A[idx + NB * 16 + idy * 24 * NB]; lA[idx][17 + idy * 24] = A[idx + NB * 17 + idy * 24 * NB]; lA[idx][18 + idy * 24] = A[idx + NB * 18 + idy * 24 * NB]; lA[idx][19 + idy * 24] = A[idx + NB * 19 + idy * 24 * NB]; lA[idx][20 + idy * 24] = A[idx + NB * 20 + idy * 24 * NB]; lA[idx][21 + idy * 24] = A[idx + NB * 21 + idy * 24 * NB]; lA[idx][22 + idy * 24] = A[idx + NB * 22 + idy * 24 * NB]; lA[idx][23 + idy * 24] = A[idx + NB * 23 + idy * 24 * NB]; lB[0 + idy * 24][idx] = B[idx*NB + idy * 24]; lB[1 + idy * 24][idx] = B[idx*NB + idy * 24 + 1]; lB[2 + idy * 24][idx] = B[idx*NB + idy * 24 + 2]; lB[3 + idy * 24][idx] = B[idx*NB + idy * 24 + 3]; lB[4 + idy * 24][idx] = B[idx*NB + idy * 24 + 4]; lB[5 + idy * 24][idx] = B[idx*NB + idy * 24 + 5]; lB[6 + idy * 24][idx] = B[idx*NB + idy * 24 + 6]; lB[7 + idy * 24][idx] = B[idx*NB + idy * 24 + 7]; lB[8 + idy * 24][idx] = B[idx*NB + idy * 24 + 8]; lB[9 + idy * 24][idx] = B[idx*NB + idy * 24 + 9]; lB[10 + idy * 24][idx] = B[idx*NB + idy * 24 + 10]; lB[11 + idy * 24][idx] = B[idx*NB + idy * 24 + 11]; lB[12 + idy * 24][idx] = B[idx*NB + idy * 24 + 12]; lB[13 + idy * 24][idx] = B[idx*NB + idy * 24 + 13]; lB[14 + idy * 24][idx] = B[idx*NB + idy * 24 + 14]; lB[15 + idy * 24][idx] = B[idx*NB + idy * 24 + 15]; lB[16 + idy * 24][idx] = B[idx*NB + idy * 24 + 16]; lB[17 + idy * 24][idx] = B[idx*NB + idy * 24 + 17]; lB[18 + idy * 24][idx] = B[idx*NB + idy * 24 + 18]; lB[19 + idy * 24][idx] = B[idx*NB + idy * 24 + 19]; lB[20 + idy * 24][idx] = B[idx*NB + idy * 24 + 20]; lB[21 + idy * 24][idx] = B[idx*NB + idy * 24 + 21]; lB[22 + idy * 24][idx] = B[idx*NB + idy * 24 + 22]; lB[23 + idy * 24][idx] = B[idx*NB + idy * 24 + 23]; barrier(CLK_LOCAL_MEM_FENCE); //do math uint i = 0; do{ privateC[0] = mad(lA[idx][i], lB[i][0 + idy * 12], privateC[0]); privateC[1] = mad(lA[idx][i], lB[i][1 + idy * 12], privateC[1]); privateC[2] = mad(lA[idx][i], lB[i][2 + idy * 12], privateC[2]); privateC[3] = mad(lA[idx][i], lB[i][3 + idy * 12], privateC[3]); privateC[4] = mad(lA[idx][i], lB[i][4 + idy * 12], privateC[4]); privateC[5] = mad(lA[idx][i], lB[i][5 + idy * 12], privateC[5]); privateC[6] = mad(lA[idx][i], lB[i][6 + idy * 12], privateC[6]); privateC[7] = mad(lA[idx][i], lB[i][7 + idy * 12], privateC[7]); privateC[8] = mad(lA[idx][i], lB[i][8 + idy * 12], privateC[8]); privateC[9] = mad(lA[idx][i], lB[i][9 + idy * 12], privateC[9]); privateC[10] = mad(lA[idx][i], lB[i][10 + idy * 12], privateC[10]); privateC[11] = mad(lA[idx][i], lB[i][11 + idy * 12], privateC[11]); i = i + 1; } while (i < 48); A += 48 * NB; B += 48; } while (--block_k>0); uint i = 0; do{ C[NB*idy * 12 + NB*i + idx] = -1 * privateC[i]; i = i + 1; } while (i < 12); }
kernel void convolute(int4 imagesize, global unsigned char *input, global unsigned char *output, global kernf *filterG) { int4 gid = (int4)(get_global_id(0)*CONV_UNROLL, get_global_id(1), get_global_id(2), 0); int4 lid = (int4)(get_local_id(0), get_local_id(1), get_local_id(2), 0); int4 group = (int4)(get_group_id(0), get_group_id(1), get_group_id(2), 0); // First (?) pixel to process with this kernel int4 pixelid = gid; // Starting offset of the first pixel to process int imoffset = pixelid.s0 + imagesize.s0 * pixelid.s1 + imagesize.s0 * imagesize.s1 * pixelid.s2; int i,j; int dx,dy,dz; /* MAD performs a single convolution operation for each kernel, using the current 'raw' value as the input image 'ko' as an instance of an unrolled convolution filter 'pos' as the X-offset for each of the unrolled convolution filters Note that all the if statements dependent only on static values - meaning that they can be optimized away by the compiler */ #define MAD(ko,pos) {if(CONV_UNROLL>ko) { \ if(pos-ko >= 0 && pos-ko < kernsize) { \ val[ko] = mmad(val[ko],(kernf)(raw),filter[(pos-ko)+offset]); \ }}} #define MADS(pos) {if(pos<kernsize) { \ raw=input[imoffset2+pos]; \ MAD(0,pos); MAD(1,pos); MAD(2,pos); MAD(3,pos); MAD(4,pos); MAD(5,pos); MAD(6,pos); MAD(7,pos); \ MAD(8,pos); MAD(9,pos); MAD(10,pos); MAD(11,pos); MAD(12,pos); MAD(13,pos); MAD(14,pos); MAD(15,pos); \ MAD(16,pos); MAD(17,pos); MAD(18,pos); MAD(19,pos); MAD(20,pos); MAD(21,pos); MAD(22,pos); MAD(23,pos); \ MAD(24,pos); MAD(25,pos); MAD(26,pos); MAD(27,pos); MAD(28,pos); MAD(29,pos); MAD(30,pos); MAD(31,pos); \ MAD(32,pos); MAD(33,pos); MAD(34,pos); MAD(35,pos); MAD(36,pos); MAD(37,pos); MAD(38,pos); MAD(39,pos); \ }} kernf val[CONV_UNROLL]; for(j=0;j<CONV_UNROLL;j++) val[j]=(kernf)(0.0); int localSize = get_local_size(0) * get_local_size(1) * get_local_size(2); local kernf filter[kernsize*kernsize*kernsize]; /* Copy global filter to local memory */ event_t event = async_work_group_copy(filter,filterG,kernsize*kernsize*kernsize,0); wait_group_events(1, &event); if(gid.s0 + kernsize + CONV_UNROLL > imagesize.s0 || gid.s1 + kernsize > imagesize.s1 || gid.s2 + kernsize > imagesize.s2) return; for(dz=0;dz<kernsize;dz++) for(dy=0;dy<kernsize;dy++) { int offset = dy*kernsize*nkernels + dz*kernsize*kernsize*nkernels; int imoffset2 = imoffset+dy*imagesize.s0 + dz*imagesize.s0*imagesize.s1; unsigned char raw; /* kernsize + convolution_unroll < 42 */ MADS(0); MADS(1); MADS(2); MADS(3); MADS(4); MADS(5); MADS(6); MADS(7); MADS(8); MADS(9); MADS(10); MADS(11); MADS(12); MADS(13); MADS(14); MADS(15); MADS(16); MADS(17); MADS(18); MADS(19); MADS(20); MADS(21); MADS(22); MADS(23); MADS(24); MADS(25); MADS(26); MADS(27); MADS(28); MADS(29); MADS(30); MADS(31); MADS(32); MADS(33); MADS(34); MADS(35); MADS(36); MADS(37); MADS(38); MADS(39); MADS(40); MADS(41); } for(j=0;j<CONV_UNROLL;j++) { kernstore( convert_kernuc(val[j]), imoffset+j, output); } }
__kernel calculate(__global float *sunGeom, __global float *sunVarGeom, __global float *sunSlopeGeom, __local float *gridGeom, __constant float *const_f, __constant float *const_i, __global float *horizonArr, __constant float *z, __global float *o, __constant float *s, __constant float *li, __global float *a, __constant float *latitudeArray, __constant float *longitudeArray, __global float *cbhr, __global float *cdhr, __global float *lumcl, __global float *beam, __global float *globrad, __global float *insol, __global float *diff, __global float *refl ) { unsigned int gid = get_global_id(0); unsigned int gsz = const_i[0]*const_i[1]; unsigned int lid = get_local_id(0); unsigned int lsz = get_local_size(0); float longitTime = 0.0f; float o_orig; //Don't overrun arrays if (gid >= gsz) return; if (const_i[3]) longitTime = -longitudeArray[gid] / 15.0f; gridGeom[4*lsz+lid] = gridGeom[2*lsz+lid] = (float)(gid / const_i[1]) *const_f[14]; gridGeom[5*lsz+lid] = gridGeom[3*lsz+lid] = (float)(gid % const_i[1]) *const_f[15]; gridGeom[ lid] = const_f[7] + gridGeom[2*lsz+lid]; gridGeom[lsz+lid] = const_f[8] + gridGeom[3*lsz+lid]; if (const_i[13]) { float coslat = cos(const_f[3] * gridGeom[lsz+lid]); sunVarGeom[11*gsz+gid] = coslat * coslat; } float z1 = sunVarGeom[gsz+gid] = sunVarGeom[3*gsz+gid] = z[gid]; if (z1 == UNDEFZ) return; float latitude, longitude, aspect, slope; if (const_i[14] != NULL) { if (o[gid] != 0.0f) aspect = sunSlopeGeom[3*gsz+gid] = o[gid] * const_f[3]; else aspect = sunSlopeGeom[3*gsz+gid] = UNDEF; } else { aspect = sunSlopeGeom[3*gsz+gid] = const_f[23]; } if (const_i[18] != NULL) latitude = latitudeArray[gid]*const_f[3]; if (const_i[19] != NULL) longitude = longitudeArray[gid]*const_f[3]; if (const_i[9] == PROJECTION_LL) { /* ll projection */ longitude = gridGeom[lid]*const_f[3]; latitude = gridGeom[lsz+lid]*const_f[3]; } if (const_i[15] == NULL) slope = const_f[22]; else slope = s[gid]; float cos_u = cos(const_f[1] - slope); /* = sin(slope) */ float sin_u = sin(const_f[1] - slope); /* = cos(slope) */ float cos_v = cos(const_f[1] + aspect); float sin_v = sin(const_f[1] + aspect); if (const_i[5] != NULL) sunGeom[7*gsz+gid] = const_i[8]; float geom_sinlat = gridGeom[6*lsz+lid] = sin(-latitude); float geom_coslat = gridGeom[7*lsz+lid] = cos(-latitude); float sin_phi_l = -geom_coslat * cos_u * sin_v + geom_sinlat * sin_u; sunSlopeGeom[ gid] = atan(-cos_u * cos_v / (geom_sinlat * cos_u * sin_v + geom_coslat * sin_u)); sunSlopeGeom[ gsz+gid] = cos(asin(sin_phi_l)) * const_f[40]; sunSlopeGeom[2*gsz+gid] = sin_phi_l * const_f[39]; if ((const_i[22] != NULL) || someRadiation) com_par_const(sunGeo, gridGeom, const_f, const_i, longitTime); if (const_i[22] != NULL) { com_par(sunGeom, sunVarGeom, gridGeom, const_f, const_i, latitude, longitude); float lum = lumcline2(sunGeom, sunVarGeom, sunSlopeGeom, gridGeom, const_f, const_f, horizonArr, z, gid*const_i[7]); if (lum > 0.0f) { lum = rad2deg * asin(lum); lumcl[gid] = (float)lum; } else { lumcl[gid] = UNDEFZ; } } if (someRadiation) { joules2(sunGeom, sunVarGeom, sunSlopeGeom, gridGeom, const_f, const_i, horizonArr, z, s, li, a, cbhr, cdhr, beam, insol, diff, refl, globrad, gid*const_i[7], latitude, longitude); } }
\n \n uA.d += offsetA; \n uB.d += offsetB; \n C += offsetC; \n \n \n double rC[6][6] = { {(double)0} }; \n double rA[6]; \n double rB[6]; \n \n __local double lA[392]; \n __local double lB[392]; \n \n int gidx = get_group_id(0); \n int gidy = get_group_id(1); \n int idx = get_local_id(0); \n int idy = get_local_id(1); \n \n \n uA.d += 2*(gidx*24 + idx) + idy*lda; \n uB.d += 2*(gidy*24 + idx) + idy*ldb; \n \n int block_k = K >> 3; \n do { \n __local double2* plA = (__local double2*)(lA + idy*48 + 2*idx); \n __local double2* plB = (__local double2*)(lB + idy*48 + 2*idx); \n barrier(CLK_LOCAL_MEM_FENCE); \n plB[0 ] = uB.d2v[0 ]; \n plB[8 ] = uB.d2v[8 ]; \n plB[16] = uB.d2v[16]; \n plA[0 ] = uA.d2v[0 ];
{ float rC[4][4] = { {(float)0} }; float rA[1][4]; float rB[1][4]; A += offsetA; B += offsetB; C+=offsetC; __local float lA[1056]; __local float lB[1056]; uint gidx = get_group_id(0); uint gidy = get_group_id(1); uint idx = get_local_id(0); uint idy = get_local_id(1); uint idt = 16*idy + idx; uint idxT = idt % 16; uint idyT = idt / 16; A += gidx*64*lda+ idxT + idyT*lda; B += gidy*64*ldb+ idxT + idyT*ldb; uint block_k = K >> 4; do { __local float* plA = lA + idxT*65+idyT; __local float* plB = lB + idxT*65+idyT;
__kernel void diag_dtrtri_upper_192_12_src( int isDiagUnit, __global double const * restrict A, uint offA, __global double *d_dinvA, uint lda, uint na) { int i, j; double Ystx = 0; __local double *y = 0; double switcher; double neg_switcher; // Thread index int tx = get_local_id(0); // Thread index int gx = get_global_id(0); // Block index int bx = get_group_id(0); A = A + offA; __global const double *Aoff = A + bx*lda*BLOCK_SIZE + bx*BLOCK_SIZE; int NumBLperNB = NB / BLOCK_SIZE; d_dinvA += bx / NumBLperNB*NB*NB + (bx % NumBLperNB)*(NB*BLOCK_SIZE + BLOCK_SIZE); __local double Bs[BLOCK_SIZE*BLOCK_SIZE]; __local double workspace[BLOCK_SIZE]; // workspace used to store the current working column
__ArgLast(float, epsSqr) // dynemically defined local memory as argument // __ArgLast(__local float4*, localPos) { unsigned int tid = get_local_id(0); unsigned int gid = get_global_id(0); unsigned int localSize = get_local_size(0); // Number of tiles we need to iterate unsigned int numTiles = numBodies / localSize; // statically declared local memory __Local(float4, localPos, 256); // position of this work-item float4 myPos = pos[gid]; float4 acc = (float4)(0.0f, 0.0f, 0.0f, 0.0f); for(int i = 0; i < (int)numTiles; ++i) { // load one tile into local memory // int idx = i * localSize + tid; uint idx = mad24( (uint)i, localSize, tid); localPos[tid] = pos[idx]; // Synchronize to make sure data is available for processing barrier(CLK_LOCAL_MEM_FENCE); // calculate acceleration effect due to each body // a[i->j] = m[j] * r[i->j] / (r^2 + epsSqr)^(3/2) for(int j = 0; j < (int)localSize; ++j) { // Calculate acceleartion caused by particle j on particle i float4 r = localPos[j] - myPos; float distSqr = r.x * r.x + r.y * r.y + r.z * r.z; float invDist = 1.0f / sqrt(distSqr + epsSqr); float invDistCube = invDist * invDist * invDist; float s = localPos[j].w * invDistCube; // accumulate effect of all particles acc += ((float4)s * r); } // Synchronize so that next tile can be loaded barrier(CLK_LOCAL_MEM_FENCE); } float4 oldVel = vel[gid]; // updated position and velocity float4 newPos = myPos + oldVel * deltaTime + acc * 0.5f * deltaTime * deltaTime; newPos.w = myPos.w; float4 newVel = oldVel + acc * deltaTime; // write to global memory pos[gid] = newPos; vel[gid] = newVel; // MANDATORY __Return; }
__kernel void CalculateStatisticalMapSearchlight_(__global float* Classifier_Performance, __global const float* Volumes, __global const float* Mask, __constant float* c_d, __constant float* c_Correct_Classes, __private int DATA_W, __private int DATA_H, __private int DATA_D, __private int NUMBER_OF_VOLUMES, __private float n, __private int EPOCS) { int x = get_global_id(0); int y = get_global_id(1); int z = get_global_id(2); int3 tIdx = {get_local_id(0), get_local_id(1), get_local_id(2)}; if (x >= DATA_W || y >= DATA_H || z >= DATA_D) return; if ( Mask[Calculate3DIndex(x,y,z,DATA_W,DATA_H)] != 1.0f ) { Classifier_Performance[Calculate3DIndex(x,y,z,DATA_W,DATA_H)] = 0.0f; return; } __local float l_Volume[16][16][16]; // z, y, x int classification_performance = 0; // Leave one out cross validation for (int validation = 0; validation < NUMBER_OF_VOLUMES; validation++) { float weights[20]; weights[0] = 0.0f; weights[1] = 0.0f; weights[2] = 0.0f; weights[3] = 0.0f; weights[4] = 0.0f; weights[5] = 0.0f; weights[6] = 0.0f; weights[7] = 0.0f; weights[8] = 0.0f; weights[9] = 0.0f; weights[10] = 0.0f; weights[11] = 0.0f; weights[12] = 0.0f; weights[13] = 0.0f; weights[14] = 0.0f; weights[15] = 0.0f; weights[16] = 0.0f; weights[17] = 0.0f; weights[18] = 0.0f; weights[19] = 0.0f; // Do training for a number of iterations for (int epoc = 0; epoc < EPOCS; epoc++) { float gradient[20]; gradient[0] = 0.0f; gradient[1] = 0.0f; gradient[2] = 0.0f; gradient[3] = 0.0f; gradient[4] = 0.0f; gradient[5] = 0.0f; gradient[6] = 0.0f; gradient[7] = 0.0f; gradient[8] = 0.0f; gradient[9] = 0.0f; gradient[10] = 0.0f; gradient[11] = 0.0f; gradient[12] = 0.0f; gradient[13] = 0.0f; gradient[14] = 0.0f; gradient[15] = 0.0f; gradient[16] = 0.0f; gradient[17] = 0.0f; gradient[18] = 0.0f; gradient[19] = 0.0f; for (int t = 0; t < NUMBER_OF_VOLUMES; t++) { // Skip training with validation time point if (t == validation) { continue; } float s; // Classification for current timepoint ReadSphere((__local float*)l_Volume, Volumes, x, y, z, t, tIdx, DATA_W, DATA_H, DATA_D); // Make sure all threads have written to local memory barrier(CLK_LOCAL_MEM_FENCE); // Make classification s = weights[0] * 1.0f; // z - 1 s += weights[1] * l_Volume[tIdx.z + 4 - 1][tIdx.y + 4][tIdx.x + 4 - 1]; // s += weights[2] * l_Volume[tIdx.z + 4 - 1][tIdx.y + 4 - 1][tIdx.x + 4]; // s += weights[3] * l_Volume[tIdx.z + 4 - 1][tIdx.y + 4][tIdx.x + 4]; // center pixel s += weights[4] * l_Volume[tIdx.z + 4 - 1][tIdx.y + 4 + 1][tIdx.x + 4]; // s += weights[5] * l_Volume[tIdx.z + 4 - 1][tIdx.y + 4][tIdx.x + 4 + 1]; // // z s += weights[6] * l_Volume[tIdx.z + 4][tIdx.y + 4 - 1][tIdx.x + 4 - 1]; // s += weights[7] * l_Volume[tIdx.z + 4][tIdx.y + 4 - 1][tIdx.x + 4]; // s += weights[8] * l_Volume[tIdx.z + 4][tIdx.y + 4 - 1][tIdx.x + 4 + 1]; // s += weights[9] * l_Volume[tIdx.z + 4][tIdx.y + 4][tIdx.x + 4 - 1]; // s += weights[10] * l_Volume[tIdx.z + 4][tIdx.y + 4][tIdx.x + 4]; // center pixel s += weights[11] * l_Volume[tIdx.z + 4][tIdx.y + 4][tIdx.x + 4 + 1]; // s += weights[12] * l_Volume[tIdx.z + 4][tIdx.y + 4 + 1][tIdx.x + 4 - 1]; // s += weights[13] * l_Volume[tIdx.z + 4][tIdx.y + 4 + 1][tIdx.x + 4]; // s += weights[14] * l_Volume[tIdx.z + 4][tIdx.y + 4 + 1][tIdx.x + 4 + 1]; // // z + 1 s += weights[15] * l_Volume[tIdx.z + 4 + 1][tIdx.y + 4][tIdx.x + 4 - 1]; // s += weights[16] * l_Volume[tIdx.z + 4 + 1][tIdx.y + 4 - 1][tIdx.x + 4]; // s += weights[17] * l_Volume[tIdx.z + 4 + 1][tIdx.y + 4][tIdx.x + 4]; // center pixel s += weights[18] * l_Volume[tIdx.z + 4 + 1][tIdx.y + 4 + 1][tIdx.x + 4]; // s += weights[19] * l_Volume[tIdx.z + 4 + 1][tIdx.y + 4][tIdx.x + 4 + 1]; // // Calculate contribution to gradient gradient[0] += (s - c_d[t]) * 1.0f; // z - 1 gradient[1] += (s - c_d[t]) * l_Volume[tIdx.z + 4 - 1][tIdx.y + 4][tIdx.x + 4 - 1]; // gradient[2] += (s - c_d[t]) * l_Volume[tIdx.z + 4 - 1][tIdx.y + 4 - 1][tIdx.x + 4]; // gradient[3] += (s - c_d[t]) * l_Volume[tIdx.z + 4 - 1][tIdx.y + 4][tIdx.x + 4]; // center pixel gradient[4] += (s - c_d[t]) * l_Volume[tIdx.z + 4 - 1][tIdx.y + 4 + 1][tIdx.x + 4]; // gradient[5] += (s - c_d[t]) * l_Volume[tIdx.z + 4 - 1][tIdx.y + 4][tIdx.x + 4 + 1]; // // z gradient[6] += (s - c_d[t]) * l_Volume[tIdx.z + 4][tIdx.y + 4 - 1][tIdx.x + 4 - 1]; // gradient[7] += (s - c_d[t]) * l_Volume[tIdx.z + 4][tIdx.y + 4 - 1][tIdx.x + 4]; // gradient[8] += (s - c_d[t]) * l_Volume[tIdx.z + 4][tIdx.y + 4 - 1][tIdx.x + 4 + 1]; // gradient[9] += (s - c_d[t]) * l_Volume[tIdx.z + 4][tIdx.y + 4][tIdx.x + 4 - 1]; // gradient[10] += (s - c_d[t]) * l_Volume[tIdx.z + 4][tIdx.y + 4][tIdx.x + 4]; // center pixel gradient[11] += (s - c_d[t]) * l_Volume[tIdx.z + 4][tIdx.y + 4][tIdx.x + 4 + 1]; // gradient[12] += (s - c_d[t]) * l_Volume[tIdx.z + 4][tIdx.y + 4 + 1][tIdx.x + 4 - 1]; // gradient[13] += (s - c_d[t]) * l_Volume[tIdx.z + 4][tIdx.y + 4 + 1][tIdx.x + 4]; // gradient[14] += (s - c_d[t]) * l_Volume[tIdx.z + 4][tIdx.y + 4 + 1][tIdx.x + 4 + 1]; // // z + 1 gradient[15] += (s - c_d[t]) * l_Volume[tIdx.z + 4 + 1][tIdx.y + 4][tIdx.x + 4 - 1]; // gradient[16] += (s - c_d[t]) * l_Volume[tIdx.z + 4 + 1][tIdx.y + 4 - 1][tIdx.x + 4]; // gradient[17] += (s - c_d[t]) * l_Volume[tIdx.z + 4 + 1][tIdx.y + 4][tIdx.x + 4]; // center pixel gradient[18] += (s - c_d[t]) * l_Volume[tIdx.z + 4 + 1][tIdx.y + 4 + 1][tIdx.x + 4]; // gradient[19] += (s - c_d[t]) * l_Volume[tIdx.z + 4 + 1][tIdx.y + 4][tIdx.x + 4 + 1]; // // end for t } // Update weights weights[0] -= n/(float)NUMBER_OF_VOLUMES * gradient[0]; weights[1] -= n/(float)NUMBER_OF_VOLUMES * gradient[1]; weights[2] -= n/(float)NUMBER_OF_VOLUMES * gradient[2]; weights[3] -= n/(float)NUMBER_OF_VOLUMES * gradient[3]; weights[4] -= n/(float)NUMBER_OF_VOLUMES * gradient[4]; weights[5] -= n/(float)NUMBER_OF_VOLUMES * gradient[5]; weights[6] -= n/(float)NUMBER_OF_VOLUMES * gradient[6]; weights[7] -= n/(float)NUMBER_OF_VOLUMES * gradient[7]; weights[8] -= n/(float)NUMBER_OF_VOLUMES * gradient[8]; weights[9] -= n/(float)NUMBER_OF_VOLUMES * gradient[9]; weights[10] -= n/(float)NUMBER_OF_VOLUMES * gradient[10]; weights[11] -= n/(float)NUMBER_OF_VOLUMES * gradient[11]; weights[12] -= n/(float)NUMBER_OF_VOLUMES * gradient[12]; weights[13] -= n/(float)NUMBER_OF_VOLUMES * gradient[13]; weights[14] -= n/(float)NUMBER_OF_VOLUMES * gradient[14]; weights[15] -= n/(float)NUMBER_OF_VOLUMES * gradient[15]; weights[16] -= n/(float)NUMBER_OF_VOLUMES * gradient[16]; weights[17] -= n/(float)NUMBER_OF_VOLUMES * gradient[17]; weights[18] -= n/(float)NUMBER_OF_VOLUMES * gradient[18]; weights[19] -= n/(float)NUMBER_OF_VOLUMES * gradient[19]; // end for epocs } // Make classification on validation timepoint ReadSphere((__local float*)l_Volume, Volumes, x, y, z, validation, tIdx, DATA_W, DATA_H, DATA_D); // Make sure all threads have written to local memory barrier(CLK_LOCAL_MEM_FENCE); // Make classification float s; s = weights[0] * 1.0f; // z - 1 s += weights[1] * l_Volume[tIdx.z + 4 - 1][tIdx.y + 4][tIdx.x + 4 - 1]; // s += weights[2] * l_Volume[tIdx.z + 4 - 1][tIdx.y + 4 - 1][tIdx.x + 4]; // s += weights[3] * l_Volume[tIdx.z + 4 - 1][tIdx.y + 4][tIdx.x + 4]; // center pixel s += weights[4] * l_Volume[tIdx.z + 4 - 1][tIdx.y + 4 + 1][tIdx.x + 4]; // s += weights[5] * l_Volume[tIdx.z + 4 - 1][tIdx.y + 4][tIdx.x + 4 + 1]; // // z s += weights[6] * l_Volume[tIdx.z + 4][tIdx.y + 4 - 1][tIdx.x + 4 - 1]; // s += weights[7] * l_Volume[tIdx.z + 4][tIdx.y + 4 - 1][tIdx.x + 4]; // s += weights[8] * l_Volume[tIdx.z + 4][tIdx.y + 4 - 1][tIdx.x + 4 + 1]; // s += weights[9] * l_Volume[tIdx.z + 4][tIdx.y + 4][tIdx.x + 4 - 1]; // s += weights[10] * l_Volume[tIdx.z + 4][tIdx.y + 4][tIdx.x + 4]; // center pixel s += weights[11] * l_Volume[tIdx.z + 4][tIdx.y + 4][tIdx.x + 4 + 1]; // s += weights[12] * l_Volume[tIdx.z + 4][tIdx.y + 4 + 1][tIdx.x + 4 - 1]; // s += weights[13] * l_Volume[tIdx.z + 4][tIdx.y + 4 + 1][tIdx.x + 4]; // s += weights[14] * l_Volume[tIdx.z + 4][tIdx.y + 4 + 1][tIdx.x + 4 + 1]; // // z + 1 s += weights[15] * l_Volume[tIdx.z + 4 + 1][tIdx.y + 4][tIdx.x + 4 - 1]; // s += weights[16] * l_Volume[tIdx.z + 4 + 1][tIdx.y + 4 - 1][tIdx.x + 4]; // s += weights[17] * l_Volume[tIdx.z + 4 + 1][tIdx.y + 4][tIdx.x + 4]; // center pixel s += weights[18] * l_Volume[tIdx.z + 4 + 1][tIdx.y + 4 + 1][tIdx.x + 4]; // s += weights[19] * l_Volume[tIdx.z + 4 + 1][tIdx.y + 4][tIdx.x + 4 + 1]; // float classification; if (s > 0.0f) { classification = 0.0f; } else { classification = 1.0f; } if (classification == c_Correct_Classes[validation]) { classification_performance++; } // end for validation } Classifier_Performance[Calculate3DIndex(x,y,z,DATA_W,DATA_H)] = (float)classification_performance / (float)NUMBER_OF_VOLUMES; }
__kernel void CalculateStatisticalMapSearchlight___(__global float* Classifier_Performance, __global const float* Volumes, __global const float* Mask, __constant float* c_d, __constant float* c_Correct_Classes, __private int DATA_W, __private int DATA_H, __private int DATA_D, __private int NUMBER_OF_VOLUMES, __private float n, __private int EPOCS) { int x = get_global_id(0); int y = get_global_id(1); int z = get_global_id(2); int3 tIdx = {get_local_id(0), get_local_id(1), get_local_id(2)}; if (x >= DATA_W || y >= DATA_H || z >= DATA_D) return; if ( Mask[Calculate3DIndex(x,y,z,DATA_W,DATA_H)] != 1.0f ) { Classifier_Performance[Calculate3DIndex(x,y,z,DATA_W,DATA_H)] = 0.0f; return; } if ( ((x + 1) >= DATA_W) || ((y + 1) >= DATA_H) || ((z + 1) >= DATA_D) ) { Classifier_Performance[Calculate3DIndex(x,y,z,DATA_W,DATA_H)] = 0.0f; return; } if ( ((x - 1) < 0) || ((y - 1) < 0) || ((z - 1) < 0) ) { Classifier_Performance[Calculate3DIndex(x,y,z,DATA_W,DATA_H)] = 0.0f; return; } int classification_performance = 0; // // Training // float weights[20]; weights[0] = 0.0f; weights[1] = 0.0f; weights[2] = 0.0f; weights[3] = 0.0f; weights[4] = 0.0f; weights[5] = 0.0f; weights[6] = 0.0f; weights[7] = 0.0f; weights[8] = 0.0f; weights[9] = 0.0f; weights[10] = 0.0f; weights[11] = 0.0f; weights[12] = 0.0f; weights[13] = 0.0f; weights[14] = 0.0f; weights[15] = 0.0f; weights[16] = 0.0f; weights[17] = 0.0f; weights[18] = 0.0f; weights[19] = 0.0f; // Do training for a number of iterations for (int epoc = 0; epoc < EPOCS; epoc++) { float gradient[20]; gradient[0] = 0.0f; gradient[1] = 0.0f; gradient[2] = 0.0f; gradient[3] = 0.0f; gradient[4] = 0.0f; gradient[5] = 0.0f; gradient[6] = 0.0f; gradient[7] = 0.0f; gradient[8] = 0.0f; gradient[9] = 0.0f; gradient[10] = 0.0f; gradient[11] = 0.0f; gradient[12] = 0.0f; gradient[13] = 0.0f; gradient[14] = 0.0f; gradient[15] = 0.0f; gradient[16] = 0.0f; gradient[17] = 0.0f; gradient[18] = 0.0f; gradient[19] = 0.0f; for (int t = 0; t < NUMBER_OF_VOLUMES / 2; t++) { // Ignore censored volumes if (c_Correct_Classes[t] == 9999.0f) { continue; } // Make classification float s; s = weights[0] * 1.0f; float x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16, x17, x18, x19; x1 = Volumes[Calculate4DIndex(x-1,y,z-1,t,DATA_W,DATA_H,DATA_D)]; x2 = Volumes[Calculate4DIndex(x,y-1,z-1,t,DATA_W,DATA_H,DATA_D)]; x3 = Volumes[Calculate4DIndex(x,y,z-1,t,DATA_W,DATA_H,DATA_D)]; x4 = Volumes[Calculate4DIndex(x,y+1,z-1,t,DATA_W,DATA_H,DATA_D)]; x5 = Volumes[Calculate4DIndex(x+1,y,z-1,t,DATA_W,DATA_H,DATA_D)]; x6 = Volumes[Calculate4DIndex(x-1,y-1,z,t,DATA_W,DATA_H,DATA_D)]; x7 = Volumes[Calculate4DIndex(x-1,y,z,t,DATA_W,DATA_H,DATA_D)]; x8 = Volumes[Calculate4DIndex(x-1,y+1,z,t,DATA_W,DATA_H,DATA_D)]; x9 = Volumes[Calculate4DIndex(x,y-1,z,t,DATA_W,DATA_H,DATA_D)]; x10 = Volumes[Calculate4DIndex(x,y,z,t,DATA_W,DATA_H,DATA_D)]; x11 = Volumes[Calculate4DIndex(x,y+1,z,t,DATA_W,DATA_H,DATA_D)]; x12 = Volumes[Calculate4DIndex(x+1,y-1,z,t,DATA_W,DATA_H,DATA_D)]; x13 = Volumes[Calculate4DIndex(x+1,y,z,t,DATA_W,DATA_H,DATA_D)]; x14 = Volumes[Calculate4DIndex(x+1,y+1,z,t,DATA_W,DATA_H,DATA_D)]; x15 = Volumes[Calculate4DIndex(x-1,y,z+1,t,DATA_W,DATA_H,DATA_D)]; x16 = Volumes[Calculate4DIndex(x,y-1,z+1,t,DATA_W,DATA_H,DATA_D)]; x17 = Volumes[Calculate4DIndex(x,y,z+1,t,DATA_W,DATA_H,DATA_D)]; x18 = Volumes[Calculate4DIndex(x,y+1,z+1,t,DATA_W,DATA_H,DATA_D)]; x19 = Volumes[Calculate4DIndex(x+1,y,z+1,t,DATA_W,DATA_H,DATA_D)]; // z - 1 s += weights[1] * x1; s += weights[2] * x2; s += weights[3] * x3; s += weights[4] * x4; s += weights[5] * x5; // z s += weights[6] * x6; s += weights[7] * x7; s += weights[8] * x8; s += weights[9] * x9; s += weights[10] * x10; s += weights[11] * x11; s += weights[12] * x12; s += weights[13] * x13; s += weights[14] * x14; // z + 1 s += weights[15] * x15; s += weights[16] * x16; s += weights[17] * x17; s += weights[18] * x18; s += weights[19] * x19; // Calculate contribution to gradient gradient[0] += (s - c_d[t]) * 1.0f; // z - 1 gradient[1] += (s - c_d[t]) * x1; gradient[2] += (s - c_d[t]) * x2; gradient[3] += (s - c_d[t]) * x3; gradient[4] += (s - c_d[t]) * x4; gradient[5] += (s - c_d[t]) * x5; // z gradient[6] += (s - c_d[t]) * x6; gradient[7] += (s - c_d[t]) * x7; gradient[8] += (s - c_d[t]) * x8; gradient[9] += (s - c_d[t]) * x9; gradient[10] += (s - c_d[t]) * x10; gradient[11] += (s - c_d[t]) * x11; gradient[12] += (s - c_d[t]) * x12; gradient[13] += (s - c_d[t]) * x13; gradient[14] += (s - c_d[t]) * x14; // z + 1 gradient[15] += (s - c_d[t]) * x15; gradient[16] += (s - c_d[t]) * x16; gradient[17] += (s - c_d[t]) * x17; gradient[18] += (s - c_d[t]) * x18; gradient[19] += (s - c_d[t]) * x19; // end for t } // Update weights weights[0] -= n/(float)NUMBER_OF_VOLUMES * gradient[0]; weights[1] -= n/(float)NUMBER_OF_VOLUMES * gradient[1]; weights[2] -= n/(float)NUMBER_OF_VOLUMES * gradient[2]; weights[3] -= n/(float)NUMBER_OF_VOLUMES * gradient[3]; weights[4] -= n/(float)NUMBER_OF_VOLUMES * gradient[4]; weights[5] -= n/(float)NUMBER_OF_VOLUMES * gradient[5]; weights[6] -= n/(float)NUMBER_OF_VOLUMES * gradient[6]; weights[7] -= n/(float)NUMBER_OF_VOLUMES * gradient[7]; weights[8] -= n/(float)NUMBER_OF_VOLUMES * gradient[8]; weights[9] -= n/(float)NUMBER_OF_VOLUMES * gradient[9]; weights[10] -= n/(float)NUMBER_OF_VOLUMES * gradient[10]; weights[11] -= n/(float)NUMBER_OF_VOLUMES * gradient[11]; weights[12] -= n/(float)NUMBER_OF_VOLUMES * gradient[12]; weights[13] -= n/(float)NUMBER_OF_VOLUMES * gradient[13]; weights[14] -= n/(float)NUMBER_OF_VOLUMES * gradient[14]; weights[15] -= n/(float)NUMBER_OF_VOLUMES * gradient[15]; weights[16] -= n/(float)NUMBER_OF_VOLUMES * gradient[16]; weights[17] -= n/(float)NUMBER_OF_VOLUMES * gradient[17]; weights[18] -= n/(float)NUMBER_OF_VOLUMES * gradient[18]; weights[19] -= n/(float)NUMBER_OF_VOLUMES * gradient[19]; // end for epocs } // // Testing // float s; int uncensoredVolumes = 0; // Make classifications for (int t = NUMBER_OF_VOLUMES / 2 + 1; t < NUMBER_OF_VOLUMES; t++) { // Ignore censored volumes if (c_Correct_Classes[t] == 9999.0f) { continue; } uncensoredVolumes++; s = weights[0] * 1.0f; float x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16, x17, x18, x19; x1 = Volumes[Calculate4DIndex(x-1,y,z-1,t,DATA_W,DATA_H,DATA_D)]; x2 = Volumes[Calculate4DIndex(x,y-1,z-1,t,DATA_W,DATA_H,DATA_D)]; x3 = Volumes[Calculate4DIndex(x,y,z-1,t,DATA_W,DATA_H,DATA_D)]; x4 = Volumes[Calculate4DIndex(x,y+1,z-1,t,DATA_W,DATA_H,DATA_D)]; x5 = Volumes[Calculate4DIndex(x+1,y,z-1,t,DATA_W,DATA_H,DATA_D)]; x6 = Volumes[Calculate4DIndex(x-1,y-1,z,t,DATA_W,DATA_H,DATA_D)]; x7 = Volumes[Calculate4DIndex(x-1,y,z,t,DATA_W,DATA_H,DATA_D)]; x8 = Volumes[Calculate4DIndex(x-1,y+1,z,t,DATA_W,DATA_H,DATA_D)]; x9 = Volumes[Calculate4DIndex(x,y-1,z,t,DATA_W,DATA_H,DATA_D)]; x10 = Volumes[Calculate4DIndex(x,y,z,t,DATA_W,DATA_H,DATA_D)]; x11 = Volumes[Calculate4DIndex(x,y+1,z,t,DATA_W,DATA_H,DATA_D)]; x12 = Volumes[Calculate4DIndex(x+1,y-1,z,t,DATA_W,DATA_H,DATA_D)]; x13 = Volumes[Calculate4DIndex(x+1,y,z,t,DATA_W,DATA_H,DATA_D)]; x14 = Volumes[Calculate4DIndex(x+1,y+1,z,t,DATA_W,DATA_H,DATA_D)]; x15 = Volumes[Calculate4DIndex(x-1,y,z+1,t,DATA_W,DATA_H,DATA_D)]; x16 = Volumes[Calculate4DIndex(x,y-1,z+1,t,DATA_W,DATA_H,DATA_D)]; x17 = Volumes[Calculate4DIndex(x,y,z+1,t,DATA_W,DATA_H,DATA_D)]; x18 = Volumes[Calculate4DIndex(x,y+1,z+1,t,DATA_W,DATA_H,DATA_D)]; x19 = Volumes[Calculate4DIndex(x+1,y,z+1,t,DATA_W,DATA_H,DATA_D)]; // z - 1 s += weights[1] * x1; s += weights[2] * x2; s += weights[3] * x3; s += weights[4] * x4; s += weights[5] * x5; // z s += weights[6] * x6; s += weights[7] * x7; s += weights[8] * x8; s += weights[9] * x9; s += weights[10] * x10; s += weights[11] * x11; s += weights[12] * x12; s += weights[13] * x13; s += weights[14] * x14; // z + 1 s += weights[15] * x15; s += weights[16] * x16; s += weights[17] * x17; s += weights[18] * x18; s += weights[19] * x19; float classification; if (s > 0.0f) { classification = 0.0f; } else { classification = 1.0f; } if (classification == c_Correct_Classes[t]) { classification_performance++; } } Classifier_Performance[Calculate3DIndex(x,y,z,DATA_W,DATA_H)] = (float)classification_performance / (float)uncensoredVolumes; }
void step_bodies( struct Body * bodies, struct Pair * pairs, unsigned int * map, float dt, unsigned int num_bodies, // in float * velocity_ratio, // in/out float * mass_center, // in float mass, // in unsigned int * number_escaped // out ) { /* work group */ int local_block = num_bodies / get_num_groups(0); unsigned int i_group0 = get_group_id(0) * local_block; unsigned int i_group1 = i_group0 + local_block; if(get_group_id(0) == (get_num_groups(0) - 1)) i_group1 = num_bodies; /* work item */ int block = (i_group1 - i_group0) / get_local_size(0); unsigned int i_local0 = i_group0 + get_local_id(0) * block; unsigned int i_local1 = i_local0 + block; if(get_local_id(0) == (get_local_size(0) - 1)) i_local1 = i_group1; /* printf("local_block = %i\n", local_block); printf("block = %i\n", block); */ /* printf("i_local0 = %i\n", i_local0); printf("i_local1 = %i\n", i_local1); */ /* copy data for work group */ //__local struct Pair local_pairs[NUM_PAIRS]; //__local struct BodyMap local_bodymaps[NUM_BODIES / NUM_GROUPS]; //event_t e0 = async_work_group_copy((__local char *)local_pairs, (char *)pairs, NUM_PAIRS * sizeof(struct Pair), 0); //wait_group_events(1, &e0); //event_t e1 = async_work_group_copy((__local char *)local_bodymaps, (char *)(bodymaps + i_group0), (i_group1 - i_group0) * sizeof(struct BodyMap), 0); //wait_group_events(1, &e1); /* */ float f[3]; //__local struct BodyMap * pbm = 0; //struct BodyMap * pbm = 0; Body * pb = 0; for(unsigned int b = i_local0; b < i_local1; b++) { //pbm = local_bodymaps + b; //pbm = bodymaps + b; pb = bodies + b; if(pb->alive == 0) { //puts("body dead"); continue; } f[0] = 0; f[1] = 0; f[2] = 0; for(unsigned int i = 0; i < num_bodies; i++) { if(b == i) continue; //__local struct Pair * pp = &local_pairs[pbm->pair[p]]; Pair * pp = pairs + map[b * num_bodies + i]; if(pp->_M_alive == 0) continue; if(pp->b0 == b) { f[0] -= pp->u[0] * pp->f; f[1] -= pp->u[1] * pp->f; f[2] -= pp->u[2] * pp->f; } else if(pp->b1 == b) { f[0] += pp->u[0] * pp->f; f[1] += pp->u[1] * pp->f; f[2] += pp->u[2] * pp->f; } else { assert(0); } } float dv[3]; if(0) { dv[0] = dt * f[0] / pb->mass; dv[1] = dt * f[1] / pb->mass; dv[2] = dt * f[2] / pb->mass; } else { dv[0] = dt * pb->f[0] / pb->mass; dv[1] = dt * pb->f[1] / pb->mass; dv[2] = dt * pb->f[2] / pb->mass; } //print(pb->f); if( (!feq(pb->f[0], f[0])) || (!feq(pb->f[1], f[1])) || (!feq(pb->f[2], f[2])) ) { print(f); print(pb->f); abort(); } assert(std::isfinite(pb->mass)); assert(std::isfinite(dt)); assert(std::isfinite(pb->f[0])); assert(std::isfinite(pb->f[1])); assert(std::isfinite(pb->f[2])); // reset accumulating force pb->f[0] = 0; pb->f[1] = 0; pb->f[2] = 0; float e = 0.01; float rat[3]; rat[0] = fabs(dv[0] / pb->v[0]); rat[1] = fabs(dv[1] / pb->v[1]); rat[2] = fabs(dv[2] / pb->v[2]); // atomic if(std::isfinite(rat[0])) if(rat[0] > velocity_ratio[0]) velocity_ratio[0] = rat[0]; if(std::isfinite(rat[1])) if(rat[1] > velocity_ratio[1]) velocity_ratio[1] = rat[1]; if(std::isfinite(rat[2])) if(rat[2] > velocity_ratio[2]) velocity_ratio[2] = rat[2]; if(0) { if( ((std::isfinite(rat[0])) && (rat[0] > e)) || ((std::isfinite(rat[1])) && (rat[1] > e)) || ((std::isfinite(rat[2])) && (rat[2] > e)) ) { printf("% 12f % 12f % 12f\n", rat[0], rat[1], rat[2]); } } pb->v[0] += dv[0]; pb->v[1] += dv[1]; pb->v[2] += dv[2]; pb->x[0] += dt * pb->v[0]; pb->x[1] += dt * pb->v[1]; pb->x[2] += dt * pb->v[2]; // distance from mass center float r[3]; r[0] = pb->x[0] - mass_center[0]; r[1] = pb->x[1] - mass_center[1]; r[2] = pb->x[2] - mass_center[2]; float d = sqrt(r[0]*r[0] + r[1]*r[1] + r[2]*r[2]); float escape_speed2 = 2.0 * 6.67e-11 * mass / d; float s2 = pb->v[0]*pb->v[0] + pb->v[1]*pb->v[1] + pb->v[2]*pb->v[2]; // dot product of velocity and displacement vector float dot = pb->v[0] * r[0] + pb->v[1] * r[1] + pb->v[2] * r[2]; if(s2 > (escape_speed2)) // speed exceeds escape speed { if(dot > 0.0) // parallel componenet points away from mass_center { // atomic (*number_escaped)++; //printf("escape!\n"); } } } }
__kernel void TRIPLE_DGEMM_UPDATE_192_48_PART1_R(__global const double *Ain, uint offAin, __global double *d_dinvA, int blk, int lda, int npages, int na)\n {\n // Ain is the non inverse matrix; the size of Ain is lda * na // offAin is the offset of Ain // d_dinvA is the inversed matrix. the size of d_invA is NB * (na-1)/NB + 1 // blk is subblock size, which is 48 here. // lda in leading dimension. Column major here // npages = (na-1)/12*2 + 1; for 96 this is 1 for 192 this is 2 //Work group size is [24, 2] //global work size is [96*number of blocks, 4] //each work item in each work group is responsible for 12 elements (1/4) in that row //each work group is responsible for 24 by 24 macro tile; ////////////// A12*invA22 const uint gidx = get_group_id(0);\n const uint gidy = get_group_id(1);\n const uint idx = get_local_id(0);\n const uint idy = get_local_id(1);\n //uint page = gidx / 2;//0-1 for 192; 0 for 96 const uint page = (gidx / 2) % 2; \n//index of page within a page_block; 2 pages per page_block const uint page_block = gidx / 4; \n//#index of page_block; 2 WG per page; 4 WG per page_block __global double *B, *C; \n __local double lA[24][48]; \n __local double lB[48][24]; \n double privateC[12] = { (double)0 }; \n //decide A12 location for each page //each workgroup loads half of A (left or right) Ain = Ain + offAin; \n Ain += page_block*NB*lda + page_block*NB + page*blk * 2 * lda + page*blk * 2 + blk*lda + gidx % 2 * (blk / 2); \n //decide invA22 (B) location for each page //each workgroup loads half of B (up or down) B = d_dinvA + page_block*NB*NB + page*blk * 2 * NB + page*blk * 2 + blk*NB + blk + gidy*(blk / 2)*NB; \n //decide invA12 location for each page; //Actually this will be stored in invA21 temporarily //each workgroup writes 1/4 of C C = d_dinvA + page_block*NB*NB + page*blk * 2 * NB + page*blk * 2 + blk*NB + gidx % 2 * (blk / 2) + gidy*(blk / 2)*NB; \n //read A and B into LDS no transpose operated here //each work item loads a half row of A and half column of B //idx 0-23 idy 0-1 lA[idx][0 + idy * 24] = Ain[idx + idy * 24 * lda]; \n lA[idx][1 + idy * 24] = Ain[idx + lda + idy * 24 * lda]; \n lA[idx][2 + idy * 24] = Ain[idx + lda * 2 + idy * 24 * lda]; \n lA[idx][3 + idy * 24] = Ain[idx + lda * 3 + idy * 24 * lda]; \n lA[idx][4 + idy * 24] = Ain[idx + lda * 4 + idy * 24 * lda]; \n lA[idx][5 + idy * 24] = Ain[idx + lda * 5 + idy * 24 * lda]; \n lA[idx][6 + idy * 24] = Ain[idx + lda * 6 + idy * 24 * lda]; \n lA[idx][7 + idy * 24] = Ain[idx + lda * 7 + idy * 24 * lda]; \n lA[idx][8 + idy * 24] = Ain[idx + lda * 8 + idy * 24 * lda]; \n lA[idx][9 + idy * 24] = Ain[idx + lda * 9 + idy * 24 * lda]; \n lA[idx][10 + idy * 24] = Ain[idx + lda * 10 + idy * 24 * lda];\n lA[idx][11 + idy * 24] = Ain[idx + lda * 11 + idy * 24 * lda];\n lA[idx][12 + idy * 24] = Ain[idx + lda * 12 + idy * 24 * lda];\n lA[idx][13 + idy * 24] = Ain[idx + lda * 13 + idy * 24 * lda];\n lA[idx][14 + idy * 24] = Ain[idx + lda * 14 + idy * 24 * lda];\n lA[idx][15 + idy * 24] = Ain[idx + lda * 15 + idy * 24 * lda];\n lA[idx][16 + idy * 24] = Ain[idx + lda * 16 + idy * 24 * lda];\n lA[idx][17 + idy * 24] = Ain[idx + lda * 17 + idy * 24 * lda];\n lA[idx][18 + idy * 24] = Ain[idx + lda * 18 + idy * 24 * lda];\n lA[idx][19 + idy * 24] = Ain[idx + lda * 19 + idy * 24 * lda];\n lA[idx][20 + idy * 24] = Ain[idx + lda * 20 + idy * 24 * lda];\n lA[idx][21 + idy * 24] = Ain[idx + lda * 21 + idy * 24 * lda];\n lA[idx][22 + idy * 24] = Ain[idx + lda * 22 + idy * 24 * lda];\n lA[idx][23 + idy * 24] = Ain[idx + lda * 23 + idy * 24 * lda];\n lB[0 + idy * 24][idx] = B[idx*NB + idy * 24]; \n lB[1 + idy * 24][idx] = B[idx*NB + idy * 24 + 1];\n lB[2 + idy * 24][idx] = B[idx*NB + idy * 24 + 2];\n lB[3 + idy * 24][idx] = B[idx*NB + idy * 24 + 3];\n lB[4 + idy * 24][idx] = B[idx*NB + idy * 24 + 4];\n lB[5 + idy * 24][idx] = B[idx*NB + idy * 24 + 5];\n lB[6 + idy * 24][idx] = B[idx*NB + idy * 24 + 6];\n lB[7 + idy * 24][idx] = B[idx*NB + idy * 24 + 7];\n lB[8 + idy * 24][idx] = B[idx*NB + idy * 24 + 8];\n lB[9 + idy * 24][idx] = B[idx*NB + idy * 24 + 9];\n lB[10 + idy * 24][idx] = B[idx*NB + idy * 24 + 10];\n lB[11 + idy * 24][idx] = B[idx*NB + idy * 24 + 11];\n lB[12 + idy * 24][idx] = B[idx*NB + idy * 24 + 12];\n lB[13 + idy * 24][idx] = B[idx*NB + idy * 24 + 13];\n lB[14 + idy * 24][idx] = B[idx*NB + idy * 24 + 14];\n lB[15 + idy * 24][idx] = B[idx*NB + idy * 24 + 15];\n lB[16 + idy * 24][idx] = B[idx*NB + idy * 24 + 16];\n lB[17 + idy * 24][idx] = B[idx*NB + idy * 24 + 17];\n lB[18 + idy * 24][idx] = B[idx*NB + idy * 24 + 18];\n lB[19 + idy * 24][idx] = B[idx*NB + idy * 24 + 19];\n lB[20 + idy * 24][idx] = B[idx*NB + idy * 24 + 20];\n lB[21 + idy * 24][idx] = B[idx*NB + idy * 24 + 21];\n lB[22 + idy * 24][idx] = B[idx*NB + idy * 24 + 22];\n lB[23 + idy * 24][idx] = B[idx*NB + idy * 24 + 23];\n barrier(CLK_LOCAL_MEM_FENCE); \n //do math uint i = 0; \n do{\n privateC[0] = mad(lA[idx][i], lB[i][0 + idy * 12], privateC[0]);\n privateC[1] = mad(lA[idx][i], lB[i][1 + idy * 12], privateC[1]);\n privateC[2] = mad(lA[idx][i], lB[i][2 + idy * 12], privateC[2]);\n privateC[3] = mad(lA[idx][i], lB[i][3 + idy * 12], privateC[3]);\n privateC[4] = mad(lA[idx][i], lB[i][4 + idy * 12], privateC[4]);\n privateC[5] = mad(lA[idx][i], lB[i][5 + idy * 12], privateC[5]);\n privateC[6] = mad(lA[idx][i], lB[i][6 + idy * 12], privateC[6]);\n privateC[7] = mad(lA[idx][i], lB[i][7 + idy * 12], privateC[7]);\n privateC[8] = mad(lA[idx][i], lB[i][8 + idy * 12], privateC[8]);\n privateC[9] = mad(lA[idx][i], lB[i][9 + idy * 12], privateC[9]);\n privateC[10] = mad(lA[idx][i], lB[i][10 + idy * 12], privateC[10]); \n privateC[11] = mad(lA[idx][i], lB[i][11 + idy * 12], privateC[11]); \n i = i + 1; \n } while (i < 48); \n i = 0; \n do{\n C[NB*idy * 12 + NB*i + idx] = privateC[i]; \n i = i + 1; \n } while (i < 12); \n }\n