end genesis __kernel void kernel_mc02_orig(unsigned int outer_tc, unsigned int inner_tc, __global __volatile float *arr){ for (unsigned int it00 = get_local_id(0); it00 < outer_tc; it00 += get_local_size(0)) { for (unsigned int it01 = get_local_id(1); it01 < inner_tc; it01 += get_local_size(1)) { ${epoch[5]} } } }
size_t _CL_OVERLOADABLE get_global_id(unsigned int dimindx) { switch(dimindx) { /* TODO: add get_global_offset(X) to these! */ case 0: return get_local_size(0) * get_group_id(0) + get_local_id(0); case 1: return get_local_size(1) * get_group_id(1) + get_local_id(1); case 2: return get_local_size(2) * get_group_id(2) + get_local_id(2); default: return 0; } }
__kernel void memset_uint4(__global int *mem, const int size, __private int val) { \n\ int tid = get_local_id(0); \n\ int bx = (get_group_id(1)) * (get_num_groups(0)) + get_group_id(0); \n\ int i = tid + (bx) * (get_local_size(0)); \n\ //debug \n\ //if (i == 0) { printf(\"memset size = %i value = %i buffer %i \\n\",size,val,mem[0]); } \n\ if (i < size ) { mem[i]=val; } \n\ }" };
__kernel void kernel_scan(__global float* input, __global float* output) { int global_idx = get_global_id(0); int local_idx = get_local_id(0); int block_size = get_local_size(0); int group_id = get_group_id(0); output[global_idx] = input[global_idx]; mem_fence(CLK_GLOBAL_MEM_FENCE); for(int i = 1; i < block_size; i <<= 1) { if(global_idx >= i) output[global_idx] += output[global_idx - i]; mem_fence(CLK_GLOBAL_MEM_FENCE); } }
__kernel void kernel_reduce(__global float* input, __global float* output) { int global_idx = get_global_id(0); int local_idx = get_local_id(0); int block_size = get_local_size(0); int group_id = get_group_id(0); for(int i = block_size/2; i > 0; i >>= 1) { if(local_idx < i) input[global_idx] += input[global_idx + i]; barrier(CLK_GLOBAL_MEM_FENCE); } if(local_idx == 0) output[group_id] = input[global_idx]; }
kernel void ComputeGradientsRTLR_V0_CPU( global float* pValuesOfWeights , int uLayersCount , int maxULayerSize , int p_i_j_l_LayerIndex_0_0 , int p_i_j_l_LayerSize_0_0 , global float$* weights_0_0 , int p_i_j_l_LayerIndex_1_0 , int p_i_j_l_LayerSize_1_0 , global float$* weights_1_0 , int p_i_j_l_LayerIndex_2_0 , int p_i_j_l_LayerSize_2_0 , global float$* weights_2_0 , int p_i_j_l_LayerIndex_3_0 , int p_i_j_l_LayerSize_3_0 , global float$* weights_3_0 , int p_i_j_k_LayerSize_0 , global float* netDerivValues_0 , int p_i_j_l_LayerIndex_0_1 , int p_i_j_l_LayerSize_0_1 , global float$* weights_0_1 , int p_i_j_l_LayerIndex_1_1 , int p_i_j_l_LayerSize_1_1 , global float$* weights_1_1 , int p_i_j_l_LayerIndex_2_1 , int p_i_j_l_LayerSize_2_1 , global float$* weights_2_1 , int p_i_j_l_LayerIndex_3_1 , int p_i_j_l_LayerSize_3_1 , global float$* weights_3_1 , int p_i_j_k_LayerSize_1 , global float* netDerivValues_1 , int p_i_j_l_LayerIndex_0_2 , int p_i_j_l_LayerSize_0_2 , global float$* weights_0_2 , int p_i_j_l_LayerIndex_1_2 , int p_i_j_l_LayerSize_1_2 , global float$* weights_1_2 , int p_i_j_l_LayerIndex_2_2 , int p_i_j_l_LayerSize_2_2 , global float$* weights_2_2 , int p_i_j_l_LayerIndex_3_2 , int p_i_j_l_LayerSize_3_2 , global float$* weights_3_2 , int p_i_j_k_LayerSize_2 , global float* netDerivValues_2 , int p_i_j_l_LayerIndex_0_3 , int p_i_j_l_LayerSize_0_3 , global float$* weights_0_3 , int p_i_j_l_LayerIndex_1_3 , int p_i_j_l_LayerSize_1_3 , global float$* weights_1_3 , int p_i_j_l_LayerIndex_2_3 , int p_i_j_l_LayerSize_2_3 , global float$* weights_2_3 , int p_i_j_l_LayerIndex_3_3 , int p_i_j_l_LayerSize_3_3 , global float$* weights_3_3 , int p_i_j_k_LayerSize_3 , global float* netDerivValues_3 , int iLayerIndex , global float* inputs , int inputsSize // + bias (null) = 1, inputs: size , global float* outputs , global float* desiredOutputs , local float* tmpGradients // size = local size , global float* gradients , global float* gradientSums) { int localId = get_local_id(0); int localSize = get_local_size(0); int ijValueIndex = get_group_id(0); int iValueIndex = ijValueIndex / inputsSize; int jValueIndex = ijValueIndex % inputsSize; tmpGradients[localId] = 0.0f; barrier(CLK_LOCAL_MEM_FENCE); // Local size ~ avg uLayerSize for (int kLayerIndex = 0; kLayerIndex < uLayersCount; kLayerIndex++) { int kLayerSize = PickIntValueByLayerIndex(p_i_j_k_LayerSize_0, p_i_j_k_LayerSize_1, p_i_j_k_LayerSize_2, p_i_j_k_LayerSize_3, kLayerIndex); bool computeGradient = (kLayerIndex == uLayersCount - 1) && outputs != null && desiredOutputs != null; int block = kLayerSize / localSize + (kLayerSize % localSize != 0 ? 1 : 0); int kValueIndex = localId * block; int max = kValueIndex + block; if (max > kLayerSize) max = kLayerSize; while (kValueIndex < max) { float sum = (iLayerIndex == kLayerIndex && iValueIndex == kValueIndex) ? (inputs != null ? inputs[jValueIndex] : 1.0f) : 0.0f; int p_i_j_l_LayerIndex = PickIntValueByLayerIndex(p_i_j_l_LayerIndex_0_0, p_i_j_l_LayerIndex_0_1, p_i_j_l_LayerIndex_0_2, p_i_j_l_LayerIndex_0_3, kLayerIndex); int p_i_j_l_LayerSize = PickIntValueByLayerIndex(p_i_j_l_LayerSize_0_0, p_i_j_l_LayerSize_0_1, p_i_j_l_LayerSize_0_2, p_i_j_l_LayerSize_0_3, kLayerIndex); global float$* weights = PickFPValueByLayerIndex$(weights_0_0, weights_0_1, weights_0_2, weights_0_3, kLayerIndex); sum += ComputeForward_Sum$((global float$*)GetPValuesPtr(pValuesOfWeights, uLayersCount, maxULayerSize, p_i_j_l_LayerIndex), p_i_j_l_LayerSize, weights, kValueIndex); p_i_j_l_LayerIndex = PickIntValueByLayerIndex(p_i_j_l_LayerIndex_1_0, p_i_j_l_LayerIndex_1_1, p_i_j_l_LayerIndex_1_2, p_i_j_l_LayerIndex_1_3, kLayerIndex); if (p_i_j_l_LayerIndex != -1) { p_i_j_l_LayerSize = PickIntValueByLayerIndex(p_i_j_l_LayerSize_1_0, p_i_j_l_LayerSize_1_1, p_i_j_l_LayerSize_1_2, p_i_j_l_LayerSize_1_3, kLayerIndex); weights = PickFPValueByLayerIndex$(weights_1_0, weights_1_1, weights_1_2, weights_1_3, kLayerIndex); sum += ComputeForward_Sum$((global float$*)GetPValuesPtr(pValuesOfWeights, uLayersCount, maxULayerSize, p_i_j_l_LayerIndex), p_i_j_l_LayerSize, weights, kValueIndex); } p_i_j_l_LayerIndex = PickIntValueByLayerIndex(p_i_j_l_LayerIndex_2_0, p_i_j_l_LayerIndex_2_1, p_i_j_l_LayerIndex_2_2, p_i_j_l_LayerIndex_2_3, kLayerIndex); if (p_i_j_l_LayerIndex != -1) { p_i_j_l_LayerSize = PickIntValueByLayerIndex(p_i_j_l_LayerSize_2_0, p_i_j_l_LayerSize_2_1, p_i_j_l_LayerSize_2_2, p_i_j_l_LayerSize_2_3, kLayerIndex); weights = PickFPValueByLayerIndex$(weights_2_0, weights_2_1, weights_2_2, weights_2_3, kLayerIndex); sum += ComputeForward_Sum$((global float$*)GetPValuesPtr(pValuesOfWeights, uLayersCount, maxULayerSize, p_i_j_l_LayerIndex), p_i_j_l_LayerSize, weights, kValueIndex); } p_i_j_l_LayerIndex = PickIntValueByLayerIndex(p_i_j_l_LayerIndex_3_0, p_i_j_l_LayerIndex_3_1, p_i_j_l_LayerIndex_3_2, p_i_j_l_LayerIndex_3_3, kLayerIndex); if (p_i_j_l_LayerIndex != -1) { p_i_j_l_LayerSize = PickIntValueByLayerIndex(p_i_j_l_LayerSize_3_0, p_i_j_l_LayerSize_3_1, p_i_j_l_LayerSize_3_2, p_i_j_l_LayerSize_3_3, kLayerIndex); weights = PickFPValueByLayerIndex$(weights_3_0, weights_3_1, weights_3_2, weights_3_3, kLayerIndex); sum += ComputeForward_Sum$((global float$*)GetPValuesPtr(pValuesOfWeights, uLayersCount, maxULayerSize, p_i_j_l_LayerIndex), p_i_j_l_LayerSize, weights, kValueIndex); } global float* netDerivValues = PickFPValueByLayerIndex(netDerivValues_0, netDerivValues_1, netDerivValues_2, netDerivValues_3, kLayerIndex); float p = netDerivValues[kValueIndex] * sum; GetPValuesPtr(pValuesOfWeights, uLayersCount, maxULayerSize, kLayerIndex)[kValueIndex] = p; if (computeGradient) tmpGradients[localId] += (desiredOutputs[kValueIndex] - outputs[kValueIndex]) * p; kValueIndex++; } barrier(CLK_LOCAL_MEM_FENCE); } if (gradients != null || gradientSums != null) { ComputeGradinetsRTLR_SetGradients(tmpGradients, gradients, gradientSums); } /*int pValuesOfWeightsSize2 = uLayersCount * maxULayerSize; int block = pValuesOfWeightsSize2 / localSize + (pValuesOfWeightsSize2 % localSize != 0 ? 1 : 0); int kLayerAndValueIndex = localId * block; int max = kLayerAndValueIndex + block; if (max > pValuesOfWeightsSize2) max = pValuesOfWeightsSize2; while (kLayerAndValueIndex < max) { int kLayerIndex = kLayerAndValueIndex / maxULayerSize; int kValueIndex = kLayerAndValueIndex % maxULayerSize; int kLayerSize = PickIntValueByLayerIndex(p_i_j_k_LayerSize_0, p_i_j_k_LayerSize_1, p_i_j_k_LayerSize_2, p_i_j_k_LayerSize_3, kLayerIndex); if (kValueIndex < kLayerSize) { bool computeGradient = (kLayerIndex == uLayersCount - 1) && outputs != null && desiredOutputs != null; float sum = (iLayerIndex == kLayerIndex && iValueIndex == kValueIndex) ? (inputs != null ? inputs[jValueIndex] : 1.0f) : 0.0f; int p_i_j_l_LayerIndex = PickIntValueByLayerIndex(p_i_j_l_LayerIndex_0_0, p_i_j_l_LayerIndex_0_1, p_i_j_l_LayerIndex_0_2, p_i_j_l_LayerIndex_0_3, kLayerIndex); int p_i_j_l_LayerSize = PickIntValueByLayerIndex(p_i_j_l_LayerSize_0_0, p_i_j_l_LayerSize_0_1, p_i_j_l_LayerSize_0_2, p_i_j_l_LayerSize_0_3, kLayerIndex); global float$* weights = PickFPValueByLayerIndex$(weights_0_0, weights_0_1, weights_0_2, weights_0_3, kLayerIndex); sum += ComputeForward_Sum$((global float$*)GetPValuesPtr(pValuesOfWeights, uLayersCount, maxULayerSize, p_i_j_l_LayerIndex), p_i_j_l_LayerSize, weights, kValueIndex); p_i_j_l_LayerIndex = PickIntValueByLayerIndex(p_i_j_l_LayerIndex_1_0, p_i_j_l_LayerIndex_1_1, p_i_j_l_LayerIndex_1_2, p_i_j_l_LayerIndex_1_3, kLayerIndex); if (p_i_j_l_LayerIndex != -1) { p_i_j_l_LayerSize = PickIntValueByLayerIndex(p_i_j_l_LayerSize_1_0, p_i_j_l_LayerSize_1_1, p_i_j_l_LayerSize_1_2, p_i_j_l_LayerSize_1_3, kLayerIndex); weights = PickFPValueByLayerIndex$(weights_1_0, weights_1_1, weights_1_2, weights_1_3, kLayerIndex); sum += ComputeForward_Sum$((global float$*)GetPValuesPtr(pValuesOfWeights, uLayersCount, maxULayerSize, p_i_j_l_LayerIndex), p_i_j_l_LayerSize, weights, kValueIndex); } p_i_j_l_LayerIndex = PickIntValueByLayerIndex(p_i_j_l_LayerIndex_2_0, p_i_j_l_LayerIndex_2_1, p_i_j_l_LayerIndex_2_2, p_i_j_l_LayerIndex_2_3, kLayerIndex); if (p_i_j_l_LayerIndex != -1) { p_i_j_l_LayerSize = PickIntValueByLayerIndex(p_i_j_l_LayerSize_2_0, p_i_j_l_LayerSize_2_1, p_i_j_l_LayerSize_2_2, p_i_j_l_LayerSize_2_3, kLayerIndex); weights = PickFPValueByLayerIndex$(weights_2_0, weights_2_1, weights_2_2, weights_2_3, kLayerIndex); sum += ComputeForward_Sum$((global float$*)GetPValuesPtr(pValuesOfWeights, uLayersCount, maxULayerSize, p_i_j_l_LayerIndex), p_i_j_l_LayerSize, weights, kValueIndex); } p_i_j_l_LayerIndex = PickIntValueByLayerIndex(p_i_j_l_LayerIndex_3_0, p_i_j_l_LayerIndex_3_1, p_i_j_l_LayerIndex_3_2, p_i_j_l_LayerIndex_3_3, kLayerIndex); if (p_i_j_l_LayerIndex != -1) { p_i_j_l_LayerSize = PickIntValueByLayerIndex(p_i_j_l_LayerSize_3_0, p_i_j_l_LayerSize_3_1, p_i_j_l_LayerSize_3_2, p_i_j_l_LayerSize_3_3, kLayerIndex); weights = PickFPValueByLayerIndex$(weights_3_0, weights_3_1, weights_3_2, weights_3_3, kLayerIndex); sum += ComputeForward_Sum$((global float$*)GetPValuesPtr(pValuesOfWeights, uLayersCount, maxULayerSize, p_i_j_l_LayerIndex), p_i_j_l_LayerSize, weights, kValueIndex); } global float* netDerivValues = PickFPValueByLayerIndex(netDerivValues_0, netDerivValues_1, netDerivValues_2, netDerivValues_3, kLayerIndex); float p = netDerivValues[kValueIndex] * sum; GetPValuesPtr(pValuesOfWeights, uLayersCount, maxULayerSize, kLayerIndex)[kValueIndex] = p; if (computeGradient) tmpGradients[localId] += (desiredOutputs[kValueIndex] - outputs[kValueIndex]) * p; } kLayerAndValueIndex++; }*/ }
MAYINLINE G4double G4NewNavigation_ComputeStep( G4NewNavigation *This, G4ThreeVector localPoint, G4ThreeVector localDirection, const G4double currentProposedStepLength, G4double *newSafety, G4NavigationHistory *history, G4bool *validExitNormal, G4ThreeVector *exitNormal, G4bool *exiting, G4bool *entering, GEOMETRYLOC G4VPhysicalVolume *(*pBlockedPhysical) ,SHAREDMEM int * Numbers_Of_Solid, SHAREDMEM int * Sum_Of_Solids, SHAREDTYPE SolidInfo * Solids, SHAREDTYPE ResultInfo * Result_For_Current_Solid, SHAREDTYPE FinalResult * Compacter_Result, SHAREDMEM bool * noStepArray, SHAREDMEM PointInformation * LocationArray, GEOMETRYLOC G4SmartVoxelNode *nullVNode, G4bool cur_vol_local #ifdef CHECK ,GEOMETRYLOC float * Result #endif ) { GEOMETRYLOC G4VPhysicalVolume *motherPhysical, *samplePhysical, *blockedExitedVol = GEOMETRYNULL; GEOMETRYLOC G4LogicalVolume *motherLogical; GEOMETRYLOC G4VSolid *motherSolid; G4ThreeVector sampleDirection; G4double ourStep=currentProposedStepLength, motherSafety, ourSafety; G4int sampleNo; // , localNoDaughters; //EDIT: Defining these here for global scope. GEOMETRYLOC G4VSolid *sampleSolid; G4AffineTransform sampleTf; const G4ThreeVector samplePoint; G4double sampleSafety; int PrevSum; // PrevSum stores the sum of all solids of one type so far. int Count_of_Solid_type; // This integer is to fill up the shared mem array for solid types // For definition of shared memory arrays see kernel trace in gpu.c // _--------------------------------__ G4bool initialNode; GEOMETRYLOC const G4SmartVoxelNode *curVoxelNode; G4int curNoVolumes, contentNo; G4double voxelSafety; G4double sampleStep; #if ( GLOBAL_MODE == 1) int locationId = get_global_id(0); #else int locationId = get_local_id(0); #endif motherPhysical = G4NavigationHistory_GetTopVolume( history ); motherLogical = G4VPhysicalVolume_GetLogicalVolume(motherPhysical); motherSolid = G4LogicalVolume_GetSolid(motherLogical); // // Compute mother safety // motherSafety = G4VSolid_DistanceToOut(motherSolid, localPoint); ourSafety = motherSafety; // Working isotropic safety // // Compute daughter safeties & intersections // // Exiting normal optimisation // if ( *exiting && *validExitNormal ) { if ( G4ThreeVector_dot(localDirection,*exitNormal)>=kMinExitingNormalCosine ) { // Block exited daughter volume // blockedExitedVol = *pBlockedPhysical; ourSafety = 0; } } *exiting = false; *entering = false; #ifdef USE_BLIST G4NewNavigation_EnlargeAndResetBlist( This, G4LogicalVolume_GetNoDaughters(motherLogical) ); #endif // noStepLocal -> Local boolean storing the noStep value for the current track. // noStepAll -> Value returned after a reduction of all noStepLocals initialNode = true; G4bool noStepLocal = true; G4bool noStepAll = true; // GEOMETRYLOC G4SmartVoxelNode * nullVNodePointer; //G4SmartVoxelNode nullNode; //nullVNodePointer = &nullNode; int counter = -1; while ( noStepAll ) { counter ++; if( !noStepLocal ) { curVoxelNode = nullVNode; curNoVolumes = 0; } else { curVoxelNode = This->fVoxelNode; curNoVolumes = G4VoxelNode_GetNoContained( curVoxelNode ); } int number_of_volume_types = Solidcount, Curr_vol_type=0; // Volume type is from the enum ESolids in everything.h int current_solid_sum = 0; // if (GLOBAL_MODE ==1) // number_of_threads = get_global_size(0); int number_of_threads = get_local_size(0); // STEP 1: Iteration through all volume types. Min is calculated one type at a time. //REMEMBER:EDIT: Changed starting value of Curr_vol_type to 1 for (Curr_vol_type = 1; Curr_vol_type < number_of_volume_types; (Curr_vol_type)++ ) { current_solid_sum = 0; int number_of_solids; number_of_solids = curVoxelNode->SolidType[ Curr_vol_type ]; // The number of solids of this type in the current voxel for the current track // NOTE: There may be an overhead here. If more than one thread is in the same voxel and reads this data, mem access not coalesced Numbers_Of_Solid[ locationId ] = number_of_solids; // At this stage a parallel scan sum has to be called which updates the sums for all the solids. The function is inlined. BARRIER_FLEXIBLE; // Barrier to ensure that all threads have filled up the shared mem array. See everything.h for definition //int all_threads = get_global_size(1); Prefix_Sum( Numbers_Of_Solid, Sum_Of_Solids, number_of_threads ); // The array Sum_Of_Solids stores the final result after the Prefix sum scan. //Result[PrevSum + current_solid_sum ] = locationId; PrevSum = Sum_Of_Solids[ locationId ]; // NOTE: The use of contentNo below is to maintain some resemblance to Otto's definition in VoxelNavigation.c if ( noStepLocal ) { for( contentNo = curNoVolumes-1; contentNo>=0 ; contentNo--) { sampleNo = G4VoxelNode_GetVolume(curVoxelNode,contentNo); samplePhysical = G4LogicalVolume_GetDaughter(motherLogical,sampleNo); //if ( samplePhysical != blockedExitedVol ) { // NOTE: blockedExitedVol check makes sense for the serial version. Does not make as much sense to keep it on in the parallel version as well. sampleSolid = G4LogicalVolume_GetSolid( G4VPhysicalVolume_GetLogicalVolume( samplePhysical )); // NOTE: We iterate over all solids and compare if the solid type is what we are looking for. If it is then it is added // to the array Solids in shared mem. However, a better implementation is to have a way such that the solids are returned in sorted order // the first place. if( sampleSolid->type == Curr_vol_type) { // Should the solid be stored or the Physical Volume?? SolidInfo Info = { samplePhysical, locationId }; Solids[ PrevSum + current_solid_sum ] = Info; // PrevSum is the sum of all solids of that type for all threads upto this element not including the current thread. // current_solid_sum at the end of all iterations should be equal to the number_of_solids. current_solid_sum++; } } } } //EDIT: Change to BARRIER_LOCAL when not testing GLOBAL_MODE BARRIER_FLEXIBLE; //MODIFY : Perhaps a check is in order here. // One check at this point could be whether current_solid_sum is equal to the number_of_solids. // Before proceeding to call the kernel for calculating the min, check if code works up to this point int Total_solids_of_this_type; Total_solids_of_this_type = Numbers_Of_Solid[ number_of_threads - 1 ] + Sum_Of_Solids[ number_of_threads - 1 ]; if ( Total_solids_of_this_type > BlockSize * Multiplier) { Total_solids_of_this_type = BlockSize * Multiplier; // Update Errors or return error from here? } // Checking a candidate solid of current type. int k=0; int iterations = (Total_solids_of_this_type / number_of_threads); for( k = 0; k < iterations ; k++ ) { int Work_id = locationId + number_of_threads*k; if( ( Work_id ) < Total_solids_of_this_type) { GEOMETRYLOC G4VPhysicalVolume * candPhysical = ( Solids[ Work_id ].PVolume); GEOMETRYLOC G4VSolid * candSolid = ((candPhysical)->flogical)->fSolid; int candId = Solids[ Work_id].trackId; G4ThreeVector candGlobalPoint = LocationArray[candId].Point; G4ThreeVector candGlobalDirection = LocationArray[candId].Direction; G4AffineTransform candTf = G4AffineTransform_create_full( G4VPhysicalVolume_GetObjectRotationValue(candPhysical), G4VPhysicalVolume_GetTranslation(candPhysical)); G4AffineTransform_Invert( &candTf ); const G4ThreeVector candPoint= G4AffineTransform_TransformPoint(&candTf,candGlobalPoint); const G4double candSafety = G4VSolid_DistanceToIn( candSolid, candPoint ); const G4ThreeVector candDirection = G4AffineTransform_TransformAxis( &candTf, candGlobalDirection ); const G4double candStep = G4VSolid_DistanceToIn_full( candSolid, candPoint, candDirection ); // Result_For_Current_Solid should hold data for the sampleSafety and the sampleStep // NOTE: In this version of navigation the step is calculated along with the safety. It is not checked if // step<safety; the safety is only calculated here because the physics may require it later. ResultInfo Result_of_Solid = { candSafety, candStep, candId, candPhysical } ; Result_For_Current_Solid[ Work_id ] = Result_of_Solid ; } BARRIER_FLEXIBLE; // REMOVE: Barrier used in debugging stage. Remove when done. } // At this point all the safeties and steps have been calculated, now to find the minimum step for the current solid per track. // find_minimum... // Most basic implementation for finding minimum, a proper min finding algorithm that takes into account threadIds is hard to find. Find_minimum ( Result_For_Current_Solid, Compacter_Result, PrevSum, number_of_solids ); // Minimum finding function that finds the minimum step per track in Result_For_Current_Solid and // stores the minimum step, the safety and the pointer to the sampleSolid ( physical? ) // The minimum is basically compared to the existing value and stored if smaller. This way find_minimum does not // care about which solid type is currently being processed. // See gpu.c where Compacter Results initial step value is set to kInfinfity. //if( Curr_vol_type ==1) } //if( Curr_vol_type == 1) if ( Compacter_Result[ locationId ].step <= ourStep ) { ourStep = Compacter_Result[ locationId ].step; *pBlockedPhysical = Compacter_Result[ locationId ].PVolume; *entering = true; *exiting = false; } if (initialNode) { initialNode = false; voxelSafety = G4NewNavigation_ComputeVoxelSafety(This,localPoint); if ( voxelSafety<ourSafety ) { ourSafety = voxelSafety; } if ( currentProposedStepLength<ourSafety ) { // Guaranteed physics limited // noStepLocal = false; *entering = false; *exiting = false; *pBlockedPhysical = GEOMETRYNULL; ourStep = kInfinity; } else { // // Compute mother intersection if required // if ( motherSafety<=ourStep ) { G4double motherStep = G4VSolid_DistanceToOut_full( motherSolid, localPoint, localDirection, true, validExitNormal, exitNormal); if ( motherStep<=ourStep ) { ourStep = motherStep; *exiting = true; *entering = false; if ( *validExitNormal ) { G4RotationMatrix rot = G4VPhysicalVolume_GetObjectRotationValue(motherPhysical); G4RotationMatrix inv = G4RotationMatrix_inverse(&rot); *exitNormal = G4RotationMatrix_apply( &inv, *exitNormal ); } } else { *validExitNormal = false; } } } *newSafety = ourSafety; } if (noStepLocal) { noStepLocal = G4NewNavigation_LocateNextVoxel(This, localPoint, localDirection, ourStep); } noStepArray[ locationId ] = noStepLocal; BARRIER_FLEXIBLE; noStepAll = NoStepReduction( noStepArray, number_of_threads); //Prefix_Sum ( noStepArray, noStepArray, number_of_threads); } // end -while (noStep)- loop // Double check to make sure all threads have reached here before exiting BARRIER_FLEXIBLE; //Result[ locationId ] = ourStep; return ourStep; }
__kernel calculate(__global float *sunGeom, __global float *sunVarGeom, __global float *sunSlopeGeom, __local float *gridGeom, __constant float *const_f, __constant float *const_i, __global float *horizonArr, __constant float *z, __global float *o, __constant float *s, __constant float *li, __global float *a, __constant float *latitudeArray, __constant float *longitudeArray, __global float *cbhr, __global float *cdhr, __global float *lumcl, __global float *beam, __global float *globrad, __global float *insol, __global float *diff, __global float *refl ) { unsigned int gid = get_global_id(0); unsigned int gsz = const_i[0]*const_i[1]; unsigned int lid = get_local_id(0); unsigned int lsz = get_local_size(0); float longitTime = 0.0f; float o_orig; //Don't overrun arrays if (gid >= gsz) return; if (const_i[3]) longitTime = -longitudeArray[gid] / 15.0f; gridGeom[4*lsz+lid] = gridGeom[2*lsz+lid] = (float)(gid / const_i[1]) *const_f[14]; gridGeom[5*lsz+lid] = gridGeom[3*lsz+lid] = (float)(gid % const_i[1]) *const_f[15]; gridGeom[ lid] = const_f[7] + gridGeom[2*lsz+lid]; gridGeom[lsz+lid] = const_f[8] + gridGeom[3*lsz+lid]; if (const_i[13]) { float coslat = cos(const_f[3] * gridGeom[lsz+lid]); sunVarGeom[11*gsz+gid] = coslat * coslat; } float z1 = sunVarGeom[gsz+gid] = sunVarGeom[3*gsz+gid] = z[gid]; if (z1 == UNDEFZ) return; float latitude, longitude, aspect, slope; if (const_i[14] != NULL) { if (o[gid] != 0.0f) aspect = sunSlopeGeom[3*gsz+gid] = o[gid] * const_f[3]; else aspect = sunSlopeGeom[3*gsz+gid] = UNDEF; } else { aspect = sunSlopeGeom[3*gsz+gid] = const_f[23]; } if (const_i[18] != NULL) latitude = latitudeArray[gid]*const_f[3]; if (const_i[19] != NULL) longitude = longitudeArray[gid]*const_f[3]; if (const_i[9] == PROJECTION_LL) { /* ll projection */ longitude = gridGeom[lid]*const_f[3]; latitude = gridGeom[lsz+lid]*const_f[3]; } if (const_i[15] == NULL) slope = const_f[22]; else slope = s[gid]; float cos_u = cos(const_f[1] - slope); /* = sin(slope) */ float sin_u = sin(const_f[1] - slope); /* = cos(slope) */ float cos_v = cos(const_f[1] + aspect); float sin_v = sin(const_f[1] + aspect); if (const_i[5] != NULL) sunGeom[7*gsz+gid] = const_i[8]; float geom_sinlat = gridGeom[6*lsz+lid] = sin(-latitude); float geom_coslat = gridGeom[7*lsz+lid] = cos(-latitude); float sin_phi_l = -geom_coslat * cos_u * sin_v + geom_sinlat * sin_u; sunSlopeGeom[ gid] = atan(-cos_u * cos_v / (geom_sinlat * cos_u * sin_v + geom_coslat * sin_u)); sunSlopeGeom[ gsz+gid] = cos(asin(sin_phi_l)) * const_f[40]; sunSlopeGeom[2*gsz+gid] = sin_phi_l * const_f[39]; if ((const_i[22] != NULL) || someRadiation) com_par_const(sunGeo, gridGeom, const_f, const_i, longitTime); if (const_i[22] != NULL) { com_par(sunGeom, sunVarGeom, gridGeom, const_f, const_i, latitude, longitude); float lum = lumcline2(sunGeom, sunVarGeom, sunSlopeGeom, gridGeom, const_f, const_f, horizonArr, z, gid*const_i[7]); if (lum > 0.0f) { lum = rad2deg * asin(lum); lumcl[gid] = (float)lum; } else { lumcl[gid] = UNDEFZ; } } if (someRadiation) { joules2(sunGeom, sunVarGeom, sunSlopeGeom, gridGeom, const_f, const_i, horizonArr, z, s, li, a, cbhr, cdhr, beam, insol, diff, refl, globrad, gid*const_i[7], latitude, longitude); } }
void step_bodies( struct Body * bodies, struct Pair * pairs, unsigned int * map, float dt, unsigned int num_bodies, // in float * velocity_ratio, // in/out float * mass_center, // in float mass, // in unsigned int * number_escaped // out ) { /* work group */ int local_block = num_bodies / get_num_groups(0); unsigned int i_group0 = get_group_id(0) * local_block; unsigned int i_group1 = i_group0 + local_block; if(get_group_id(0) == (get_num_groups(0) - 1)) i_group1 = num_bodies; /* work item */ int block = (i_group1 - i_group0) / get_local_size(0); unsigned int i_local0 = i_group0 + get_local_id(0) * block; unsigned int i_local1 = i_local0 + block; if(get_local_id(0) == (get_local_size(0) - 1)) i_local1 = i_group1; /* printf("local_block = %i\n", local_block); printf("block = %i\n", block); */ /* printf("i_local0 = %i\n", i_local0); printf("i_local1 = %i\n", i_local1); */ /* copy data for work group */ //__local struct Pair local_pairs[NUM_PAIRS]; //__local struct BodyMap local_bodymaps[NUM_BODIES / NUM_GROUPS]; //event_t e0 = async_work_group_copy((__local char *)local_pairs, (char *)pairs, NUM_PAIRS * sizeof(struct Pair), 0); //wait_group_events(1, &e0); //event_t e1 = async_work_group_copy((__local char *)local_bodymaps, (char *)(bodymaps + i_group0), (i_group1 - i_group0) * sizeof(struct BodyMap), 0); //wait_group_events(1, &e1); /* */ float f[3]; //__local struct BodyMap * pbm = 0; //struct BodyMap * pbm = 0; Body * pb = 0; for(unsigned int b = i_local0; b < i_local1; b++) { //pbm = local_bodymaps + b; //pbm = bodymaps + b; pb = bodies + b; if(pb->alive == 0) { //puts("body dead"); continue; } f[0] = 0; f[1] = 0; f[2] = 0; for(unsigned int i = 0; i < num_bodies; i++) { if(b == i) continue; //__local struct Pair * pp = &local_pairs[pbm->pair[p]]; Pair * pp = pairs + map[b * num_bodies + i]; if(pp->_M_alive == 0) continue; if(pp->b0 == b) { f[0] -= pp->u[0] * pp->f; f[1] -= pp->u[1] * pp->f; f[2] -= pp->u[2] * pp->f; } else if(pp->b1 == b) { f[0] += pp->u[0] * pp->f; f[1] += pp->u[1] * pp->f; f[2] += pp->u[2] * pp->f; } else { assert(0); } } float dv[3]; if(0) { dv[0] = dt * f[0] / pb->mass; dv[1] = dt * f[1] / pb->mass; dv[2] = dt * f[2] / pb->mass; } else { dv[0] = dt * pb->f[0] / pb->mass; dv[1] = dt * pb->f[1] / pb->mass; dv[2] = dt * pb->f[2] / pb->mass; } //print(pb->f); if( (!feq(pb->f[0], f[0])) || (!feq(pb->f[1], f[1])) || (!feq(pb->f[2], f[2])) ) { print(f); print(pb->f); abort(); } assert(std::isfinite(pb->mass)); assert(std::isfinite(dt)); assert(std::isfinite(pb->f[0])); assert(std::isfinite(pb->f[1])); assert(std::isfinite(pb->f[2])); // reset accumulating force pb->f[0] = 0; pb->f[1] = 0; pb->f[2] = 0; float e = 0.01; float rat[3]; rat[0] = fabs(dv[0] / pb->v[0]); rat[1] = fabs(dv[1] / pb->v[1]); rat[2] = fabs(dv[2] / pb->v[2]); // atomic if(std::isfinite(rat[0])) if(rat[0] > velocity_ratio[0]) velocity_ratio[0] = rat[0]; if(std::isfinite(rat[1])) if(rat[1] > velocity_ratio[1]) velocity_ratio[1] = rat[1]; if(std::isfinite(rat[2])) if(rat[2] > velocity_ratio[2]) velocity_ratio[2] = rat[2]; if(0) { if( ((std::isfinite(rat[0])) && (rat[0] > e)) || ((std::isfinite(rat[1])) && (rat[1] > e)) || ((std::isfinite(rat[2])) && (rat[2] > e)) ) { printf("% 12f % 12f % 12f\n", rat[0], rat[1], rat[2]); } } pb->v[0] += dv[0]; pb->v[1] += dv[1]; pb->v[2] += dv[2]; pb->x[0] += dt * pb->v[0]; pb->x[1] += dt * pb->v[1]; pb->x[2] += dt * pb->v[2]; // distance from mass center float r[3]; r[0] = pb->x[0] - mass_center[0]; r[1] = pb->x[1] - mass_center[1]; r[2] = pb->x[2] - mass_center[2]; float d = sqrt(r[0]*r[0] + r[1]*r[1] + r[2]*r[2]); float escape_speed2 = 2.0 * 6.67e-11 * mass / d; float s2 = pb->v[0]*pb->v[0] + pb->v[1]*pb->v[1] + pb->v[2]*pb->v[2]; // dot product of velocity and displacement vector float dot = pb->v[0] * r[0] + pb->v[1] * r[1] + pb->v[2] * r[2]; if(s2 > (escape_speed2)) // speed exceeds escape speed { if(dot > 0.0) // parallel componenet points away from mass_center { // atomic (*number_escaped)++; //printf("escape!\n"); } } } }
__ArgLast(float, epsSqr) // dynemically defined local memory as argument // __ArgLast(__local float4*, localPos) { unsigned int tid = get_local_id(0); unsigned int gid = get_global_id(0); unsigned int localSize = get_local_size(0); // Number of tiles we need to iterate unsigned int numTiles = numBodies / localSize; // statically declared local memory __Local(float4, localPos, 256); // position of this work-item float4 myPos = pos[gid]; float4 acc = (float4)(0.0f, 0.0f, 0.0f, 0.0f); for(int i = 0; i < (int)numTiles; ++i) { // load one tile into local memory // int idx = i * localSize + tid; uint idx = mad24( (uint)i, localSize, tid); localPos[tid] = pos[idx]; // Synchronize to make sure data is available for processing barrier(CLK_LOCAL_MEM_FENCE); // calculate acceleration effect due to each body // a[i->j] = m[j] * r[i->j] / (r^2 + epsSqr)^(3/2) for(int j = 0; j < (int)localSize; ++j) { // Calculate acceleartion caused by particle j on particle i float4 r = localPos[j] - myPos; float distSqr = r.x * r.x + r.y * r.y + r.z * r.z; float invDist = 1.0f / sqrt(distSqr + epsSqr); float invDistCube = invDist * invDist * invDist; float s = localPos[j].w * invDistCube; // accumulate effect of all particles acc += ((float4)s * r); } // Synchronize so that next tile can be loaded barrier(CLK_LOCAL_MEM_FENCE); } float4 oldVel = vel[gid]; // updated position and velocity float4 newPos = myPos + oldVel * deltaTime + acc * 0.5f * deltaTime * deltaTime; newPos.w = myPos.w; float4 newVel = oldVel + acc * deltaTime; // write to global memory pos[gid] = newPos; vel[gid] = newVel; // MANDATORY __Return; }
kernel void convolute(int4 imagesize, global unsigned char *input, global unsigned char *output, global kernf *filterG) { int4 gid = (int4)(get_global_id(0)*CONV_UNROLL, get_global_id(1), get_global_id(2), 0); int4 lid = (int4)(get_local_id(0), get_local_id(1), get_local_id(2), 0); int4 group = (int4)(get_group_id(0), get_group_id(1), get_group_id(2), 0); // First (?) pixel to process with this kernel int4 pixelid = gid; // Starting offset of the first pixel to process int imoffset = pixelid.s0 + imagesize.s0 * pixelid.s1 + imagesize.s0 * imagesize.s1 * pixelid.s2; int i,j; int dx,dy,dz; /* MAD performs a single convolution operation for each kernel, using the current 'raw' value as the input image 'ko' as an instance of an unrolled convolution filter 'pos' as the X-offset for each of the unrolled convolution filters Note that all the if statements dependent only on static values - meaning that they can be optimized away by the compiler */ #define MAD(ko,pos) {if(CONV_UNROLL>ko) { \ if(pos-ko >= 0 && pos-ko < kernsize) { \ val[ko] = mmad(val[ko],(kernf)(raw),filter[(pos-ko)+offset]); \ }}} #define MADS(pos) {if(pos<kernsize) { \ raw=input[imoffset2+pos]; \ MAD(0,pos); MAD(1,pos); MAD(2,pos); MAD(3,pos); MAD(4,pos); MAD(5,pos); MAD(6,pos); MAD(7,pos); \ MAD(8,pos); MAD(9,pos); MAD(10,pos); MAD(11,pos); MAD(12,pos); MAD(13,pos); MAD(14,pos); MAD(15,pos); \ MAD(16,pos); MAD(17,pos); MAD(18,pos); MAD(19,pos); MAD(20,pos); MAD(21,pos); MAD(22,pos); MAD(23,pos); \ MAD(24,pos); MAD(25,pos); MAD(26,pos); MAD(27,pos); MAD(28,pos); MAD(29,pos); MAD(30,pos); MAD(31,pos); \ MAD(32,pos); MAD(33,pos); MAD(34,pos); MAD(35,pos); MAD(36,pos); MAD(37,pos); MAD(38,pos); MAD(39,pos); \ }} kernf val[CONV_UNROLL]; for(j=0;j<CONV_UNROLL;j++) val[j]=(kernf)(0.0); int localSize = get_local_size(0) * get_local_size(1) * get_local_size(2); local kernf filter[kernsize*kernsize*kernsize]; /* Copy global filter to local memory */ event_t event = async_work_group_copy(filter,filterG,kernsize*kernsize*kernsize,0); wait_group_events(1, &event); if(gid.s0 + kernsize + CONV_UNROLL > imagesize.s0 || gid.s1 + kernsize > imagesize.s1 || gid.s2 + kernsize > imagesize.s2) return; for(dz=0;dz<kernsize;dz++) for(dy=0;dy<kernsize;dy++) { int offset = dy*kernsize*nkernels + dz*kernsize*kernsize*nkernels; int imoffset2 = imoffset+dy*imagesize.s0 + dz*imagesize.s0*imagesize.s1; unsigned char raw; /* kernsize + convolution_unroll < 42 */ MADS(0); MADS(1); MADS(2); MADS(3); MADS(4); MADS(5); MADS(6); MADS(7); MADS(8); MADS(9); MADS(10); MADS(11); MADS(12); MADS(13); MADS(14); MADS(15); MADS(16); MADS(17); MADS(18); MADS(19); MADS(20); MADS(21); MADS(22); MADS(23); MADS(24); MADS(25); MADS(26); MADS(27); MADS(28); MADS(29); MADS(30); MADS(31); MADS(32); MADS(33); MADS(34); MADS(35); MADS(36); MADS(37); MADS(38); MADS(39); MADS(40); MADS(41); } for(j=0;j<CONV_UNROLL;j++) { kernstore( convert_kernuc(val[j]), imoffset+j, output); } }