Example #1
end genesis

__kernel
void kernel_mc02_orig(unsigned int outer_tc, unsigned int inner_tc, __global volatile float *arr) {

	// Both loops stride by the work-group size in their dimension;
	// ${epoch[5]} is a template placeholder that is substituted before compilation.
	for (unsigned int it00 = get_local_id(0); it00 < outer_tc; it00 += get_local_size(0)) {
		for (unsigned int it01 = get_local_id(1); it01 < inner_tc; it01 += get_local_size(1)) {
			${epoch[5]}
		}
	}
}
Example #2
size_t _CL_OVERLOADABLE
get_global_id(unsigned int dimindx)
{
  switch(dimindx)
    {
        /* TODO: add get_global_offset(X) to these! */
    case 0: return get_local_size(0) * get_group_id(0) + get_local_id(0);
    case 1: return get_local_size(1) * get_group_id(1) + get_local_id(1);
    case 2: return get_local_size(2) * get_group_id(2) + get_local_id(2);
    default: return 0;
    }
}
__kernel void memset_uint4(__global int *mem, const int size, __private int val) {
    int tid = get_local_id(0);
    int bx = get_group_id(1) * get_num_groups(0) + get_group_id(0);
    int i = tid + bx * get_local_size(0);
    // debug
    // if (i == 0) { printf("memset size = %i value = %i buffer %i \n", size, val, mem[0]); }
    if (i < size) { mem[i] = val; }
}
__kernel void kernel_scan(__global float* input, __global float* output) {
    int global_idx = get_global_id(0);
    int local_idx = get_local_id(0);
    int block_size = get_local_size(0);
    int group_id = get_group_id(0);

    output[global_idx] = input[global_idx];
    // mem_fence only orders this work-item's own memory operations; it is not
    // an execution barrier, so other work-items may not yet have written output[].
    mem_fence(CLK_GLOBAL_MEM_FENCE);

    // Hillis-Steele style inclusive scan over global memory; without a true barrier this is racy.
    for(int i = 1; i < block_size; i <<= 1) {
        if(global_idx >= i)
            output[global_idx] += output[global_idx - i];
        mem_fence(CLK_GLOBAL_MEM_FENCE);
    }
}
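Note: mem_fence only orders memory operations within the issuing work-item, so the scan above has no real synchronization between work-items. A minimal sketch of a work-group-local inclusive scan that synchronizes with barrier() instead is shown below; the kernel name and the double-buffered __local temp argument (2 * local_size floats) are assumptions for illustration, not part of the original example.

__kernel void kernel_scan_local(__global const float* input,
                                __global float* output,
                                __local float* temp)   // 2 * get_local_size(0) floats
{
    int lid = get_local_id(0);
    int gid = get_global_id(0);
    int n   = get_local_size(0);
    int pout = 0, pin = 1;

    temp[pout * n + lid] = input[gid];        // one element per work-item
    barrier(CLK_LOCAL_MEM_FENCE);

    // Double-buffered Hillis-Steele inclusive scan within one work-group.
    for (int offset = 1; offset < n; offset <<= 1) {
        pout = 1 - pout;                      // swap read/write buffers
        pin  = 1 - pout;
        if (lid >= offset)
            temp[pout * n + lid] = temp[pin * n + lid] + temp[pin * n + lid - offset];
        else
            temp[pout * n + lid] = temp[pin * n + lid];
        barrier(CLK_LOCAL_MEM_FENCE);         // real synchronization point
    }

    output[gid] = temp[pout * n + lid];       // inclusive prefix sum of this block
}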
Example #5
__kernel void kernel_reduce(__global float* input, __global float* output) {
    int global_idx = get_global_id(0);
    int local_idx = get_local_id(0);
    int block_size = get_local_size(0);
    int group_id = get_group_id(0);

    // In-place tree reduction of this work-group's contiguous block of input[]
    // (assumes block_size is a power of two).
    for(int i = block_size/2; i > 0; i >>= 1) {
        if(local_idx < i)
            input[global_idx] += input[global_idx + i];

        barrier(CLK_GLOBAL_MEM_FENCE);
    }

    // Each work-group writes one partial sum; a second pass must combine them.
    if(local_idx == 0)
        output[group_id] = input[global_idx];
}
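kernel_reduce leaves one partial sum per work-group in output[group_id], so a second pass is still required to produce the final total. A minimal sketch of such a final pass is shown below, assuming all partial sums fit into a single work-group launch; the kernel name and the __local scratch argument are illustrative, and the local size must be a power of two.

__kernel void kernel_reduce_final(__global const float* partials,
                                  __global float* result,
                                  __local float* scratch,
                                  unsigned int count)
{
    int lid = get_local_id(0);
    int lsz = get_local_size(0);

    // Each work-item folds a strided subset of the partial sums into local memory.
    float acc = 0.0f;
    for (unsigned int i = lid; i < count; i += lsz)
        acc += partials[i];
    scratch[lid] = acc;
    barrier(CLK_LOCAL_MEM_FENCE);

    // Tree reduction in local memory.
    for (int s = lsz / 2; s > 0; s >>= 1) {
        if (lid < s)
            scratch[lid] += scratch[lid + s];
        barrier(CLK_LOCAL_MEM_FENCE);
    }

    if (lid == 0)
        result[0] = scratch[0];
}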
Example #6
// Note: the '$' in type and function names (e.g. float$, ComputeForward_Sum$) and the
// 'null' literal appear to be placeholders resolved by a host-side code generator
// before this source is compiled.
kernel void ComputeGradientsRTLR_V0_CPU(
    global float* pValuesOfWeights
    , int uLayersCount
    , int maxULayerSize
    , int p_i_j_l_LayerIndex_0_0
    , int p_i_j_l_LayerSize_0_0
    , global float$* weights_0_0
    , int p_i_j_l_LayerIndex_1_0
    , int p_i_j_l_LayerSize_1_0
    , global float$* weights_1_0
    , int p_i_j_l_LayerIndex_2_0
    , int p_i_j_l_LayerSize_2_0
    , global float$* weights_2_0
    , int p_i_j_l_LayerIndex_3_0
    , int p_i_j_l_LayerSize_3_0
    , global float$* weights_3_0
    , int p_i_j_k_LayerSize_0
    , global float* netDerivValues_0
    , int p_i_j_l_LayerIndex_0_1
    , int p_i_j_l_LayerSize_0_1
    , global float$* weights_0_1
    , int p_i_j_l_LayerIndex_1_1
    , int p_i_j_l_LayerSize_1_1
    , global float$* weights_1_1
    , int p_i_j_l_LayerIndex_2_1
    , int p_i_j_l_LayerSize_2_1
    , global float$* weights_2_1
    , int p_i_j_l_LayerIndex_3_1
    , int p_i_j_l_LayerSize_3_1
    , global float$* weights_3_1
    , int p_i_j_k_LayerSize_1
    , global float* netDerivValues_1
    , int p_i_j_l_LayerIndex_0_2
    , int p_i_j_l_LayerSize_0_2
    , global float$* weights_0_2
    , int p_i_j_l_LayerIndex_1_2
    , int p_i_j_l_LayerSize_1_2
    , global float$* weights_1_2
    , int p_i_j_l_LayerIndex_2_2
    , int p_i_j_l_LayerSize_2_2
    , global float$* weights_2_2
    , int p_i_j_l_LayerIndex_3_2
    , int p_i_j_l_LayerSize_3_2
    , global float$* weights_3_2
    , int p_i_j_k_LayerSize_2
    , global float* netDerivValues_2
    , int p_i_j_l_LayerIndex_0_3
    , int p_i_j_l_LayerSize_0_3
    , global float$* weights_0_3
    , int p_i_j_l_LayerIndex_1_3
    , int p_i_j_l_LayerSize_1_3
    , global float$* weights_1_3
    , int p_i_j_l_LayerIndex_2_3
    , int p_i_j_l_LayerSize_2_3
    , global float$* weights_2_3
    , int p_i_j_l_LayerIndex_3_3
    , int p_i_j_l_LayerSize_3_3
    , global float$* weights_3_3
    , int p_i_j_k_LayerSize_3
    , global float* netDerivValues_3
    , int iLayerIndex
    , global float* inputs
    , int inputsSize // + bias (null) = 1, inputs: size
    , global float* outputs
    , global float* desiredOutputs
    , local float* tmpGradients // size = local size
    , global float* gradients
    , global float* gradientSums)
{
    int localId = get_local_id(0);
    int localSize = get_local_size(0);

    int ijValueIndex = get_group_id(0);
    int iValueIndex = ijValueIndex / inputsSize;
    int jValueIndex = ijValueIndex % inputsSize;

    tmpGradients[localId] = 0.0f;

    barrier(CLK_LOCAL_MEM_FENCE);

    // Local size ~ avg uLayerSize
    for (int kLayerIndex = 0; kLayerIndex < uLayersCount; kLayerIndex++)
    {
        int kLayerSize = PickIntValueByLayerIndex(p_i_j_k_LayerSize_0, p_i_j_k_LayerSize_1, p_i_j_k_LayerSize_2, p_i_j_k_LayerSize_3, kLayerIndex);
        bool computeGradient = (kLayerIndex == uLayersCount - 1) && outputs != null && desiredOutputs != null;

        int block = kLayerSize / localSize + (kLayerSize % localSize != 0 ? 1 : 0);
        int kValueIndex = localId * block;
        int max = kValueIndex + block;
        if (max > kLayerSize) max = kLayerSize;
        while (kValueIndex < max)
        {
            float sum = (iLayerIndex == kLayerIndex && iValueIndex == kValueIndex) ? (inputs != null ? inputs[jValueIndex] : 1.0f) : 0.0f;

            int p_i_j_l_LayerIndex = PickIntValueByLayerIndex(p_i_j_l_LayerIndex_0_0, p_i_j_l_LayerIndex_0_1, p_i_j_l_LayerIndex_0_2, p_i_j_l_LayerIndex_0_3, kLayerIndex);
            int p_i_j_l_LayerSize = PickIntValueByLayerIndex(p_i_j_l_LayerSize_0_0, p_i_j_l_LayerSize_0_1, p_i_j_l_LayerSize_0_2, p_i_j_l_LayerSize_0_3, kLayerIndex);
            global float$* weights = PickFPValueByLayerIndex$(weights_0_0, weights_0_1, weights_0_2, weights_0_3, kLayerIndex);
            sum += ComputeForward_Sum$((global float$*)GetPValuesPtr(pValuesOfWeights, uLayersCount, maxULayerSize, p_i_j_l_LayerIndex), p_i_j_l_LayerSize, weights, kValueIndex);

            p_i_j_l_LayerIndex = PickIntValueByLayerIndex(p_i_j_l_LayerIndex_1_0, p_i_j_l_LayerIndex_1_1, p_i_j_l_LayerIndex_1_2, p_i_j_l_LayerIndex_1_3, kLayerIndex);
            if (p_i_j_l_LayerIndex != -1)
            {
                p_i_j_l_LayerSize = PickIntValueByLayerIndex(p_i_j_l_LayerSize_1_0, p_i_j_l_LayerSize_1_1, p_i_j_l_LayerSize_1_2, p_i_j_l_LayerSize_1_3, kLayerIndex);
                weights = PickFPValueByLayerIndex$(weights_1_0, weights_1_1, weights_1_2, weights_1_3, kLayerIndex);
                sum += ComputeForward_Sum$((global float$*)GetPValuesPtr(pValuesOfWeights, uLayersCount, maxULayerSize, p_i_j_l_LayerIndex), p_i_j_l_LayerSize, weights, kValueIndex);
            }

            p_i_j_l_LayerIndex = PickIntValueByLayerIndex(p_i_j_l_LayerIndex_2_0, p_i_j_l_LayerIndex_2_1, p_i_j_l_LayerIndex_2_2, p_i_j_l_LayerIndex_2_3, kLayerIndex);
            if (p_i_j_l_LayerIndex != -1)
            {
                p_i_j_l_LayerSize = PickIntValueByLayerIndex(p_i_j_l_LayerSize_2_0, p_i_j_l_LayerSize_2_1, p_i_j_l_LayerSize_2_2, p_i_j_l_LayerSize_2_3, kLayerIndex);
                weights = PickFPValueByLayerIndex$(weights_2_0, weights_2_1, weights_2_2, weights_2_3, kLayerIndex);
                sum += ComputeForward_Sum$((global float$*)GetPValuesPtr(pValuesOfWeights, uLayersCount, maxULayerSize, p_i_j_l_LayerIndex), p_i_j_l_LayerSize, weights, kValueIndex);
            }

            p_i_j_l_LayerIndex = PickIntValueByLayerIndex(p_i_j_l_LayerIndex_3_0, p_i_j_l_LayerIndex_3_1, p_i_j_l_LayerIndex_3_2, p_i_j_l_LayerIndex_3_3, kLayerIndex);
            if (p_i_j_l_LayerIndex != -1)
            {
                p_i_j_l_LayerSize = PickIntValueByLayerIndex(p_i_j_l_LayerSize_3_0, p_i_j_l_LayerSize_3_1, p_i_j_l_LayerSize_3_2, p_i_j_l_LayerSize_3_3, kLayerIndex);
                weights = PickFPValueByLayerIndex$(weights_3_0, weights_3_1, weights_3_2, weights_3_3, kLayerIndex);
                sum += ComputeForward_Sum$((global float$*)GetPValuesPtr(pValuesOfWeights, uLayersCount, maxULayerSize, p_i_j_l_LayerIndex), p_i_j_l_LayerSize, weights, kValueIndex);
            }

            global float* netDerivValues = PickFPValueByLayerIndex(netDerivValues_0, netDerivValues_1, netDerivValues_2, netDerivValues_3, kLayerIndex);
            float p = netDerivValues[kValueIndex] * sum;
            GetPValuesPtr(pValuesOfWeights, uLayersCount, maxULayerSize, kLayerIndex)[kValueIndex] = p;

            if (computeGradient) tmpGradients[localId] += (desiredOutputs[kValueIndex] - outputs[kValueIndex]) * p;

            kValueIndex++;
        }

        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if (gradients != null || gradientSums != null)
    {
        ComputeGradinetsRTLR_SetGradients(tmpGradients, gradients, gradientSums);
    }

    /*int pValuesOfWeightsSize2 = uLayersCount * maxULayerSize;
    int block = pValuesOfWeightsSize2 / localSize + (pValuesOfWeightsSize2 % localSize != 0 ? 1 : 0);
    int kLayerAndValueIndex = localId * block;
    int max = kLayerAndValueIndex + block;
    if (max > pValuesOfWeightsSize2) max = pValuesOfWeightsSize2;
    while (kLayerAndValueIndex < max)
    {
        int kLayerIndex = kLayerAndValueIndex / maxULayerSize;
        int kValueIndex = kLayerAndValueIndex % maxULayerSize;
        int kLayerSize = PickIntValueByLayerIndex(p_i_j_k_LayerSize_0, p_i_j_k_LayerSize_1, p_i_j_k_LayerSize_2, p_i_j_k_LayerSize_3, kLayerIndex);
        
        if (kValueIndex < kLayerSize)
        {
            bool computeGradient = (kLayerIndex == uLayersCount - 1) && outputs != null && desiredOutputs != null;

            float sum = (iLayerIndex == kLayerIndex && iValueIndex == kValueIndex) ? (inputs != null ? inputs[jValueIndex] : 1.0f) : 0.0f;

            int p_i_j_l_LayerIndex = PickIntValueByLayerIndex(p_i_j_l_LayerIndex_0_0, p_i_j_l_LayerIndex_0_1, p_i_j_l_LayerIndex_0_2, p_i_j_l_LayerIndex_0_3, kLayerIndex);
            int p_i_j_l_LayerSize = PickIntValueByLayerIndex(p_i_j_l_LayerSize_0_0, p_i_j_l_LayerSize_0_1, p_i_j_l_LayerSize_0_2, p_i_j_l_LayerSize_0_3, kLayerIndex);
            global float$* weights = PickFPValueByLayerIndex$(weights_0_0, weights_0_1, weights_0_2, weights_0_3, kLayerIndex);
            sum += ComputeForward_Sum$((global float$*)GetPValuesPtr(pValuesOfWeights, uLayersCount, maxULayerSize, p_i_j_l_LayerIndex), p_i_j_l_LayerSize, weights, kValueIndex);
            
            p_i_j_l_LayerIndex = PickIntValueByLayerIndex(p_i_j_l_LayerIndex_1_0, p_i_j_l_LayerIndex_1_1, p_i_j_l_LayerIndex_1_2, p_i_j_l_LayerIndex_1_3, kLayerIndex);
            if (p_i_j_l_LayerIndex != -1)
            {
                p_i_j_l_LayerSize = PickIntValueByLayerIndex(p_i_j_l_LayerSize_1_0, p_i_j_l_LayerSize_1_1, p_i_j_l_LayerSize_1_2, p_i_j_l_LayerSize_1_3, kLayerIndex);
                weights = PickFPValueByLayerIndex$(weights_1_0, weights_1_1, weights_1_2, weights_1_3, kLayerIndex);
                sum += ComputeForward_Sum$((global float$*)GetPValuesPtr(pValuesOfWeights, uLayersCount, maxULayerSize, p_i_j_l_LayerIndex), p_i_j_l_LayerSize, weights, kValueIndex);
            }

            p_i_j_l_LayerIndex = PickIntValueByLayerIndex(p_i_j_l_LayerIndex_2_0, p_i_j_l_LayerIndex_2_1, p_i_j_l_LayerIndex_2_2, p_i_j_l_LayerIndex_2_3, kLayerIndex);
            if (p_i_j_l_LayerIndex != -1)
            {
                p_i_j_l_LayerSize = PickIntValueByLayerIndex(p_i_j_l_LayerSize_2_0, p_i_j_l_LayerSize_2_1, p_i_j_l_LayerSize_2_2, p_i_j_l_LayerSize_2_3, kLayerIndex);
                weights = PickFPValueByLayerIndex$(weights_2_0, weights_2_1, weights_2_2, weights_2_3, kLayerIndex);
                sum += ComputeForward_Sum$((global float$*)GetPValuesPtr(pValuesOfWeights, uLayersCount, maxULayerSize, p_i_j_l_LayerIndex), p_i_j_l_LayerSize, weights, kValueIndex);
            }

            p_i_j_l_LayerIndex = PickIntValueByLayerIndex(p_i_j_l_LayerIndex_3_0, p_i_j_l_LayerIndex_3_1, p_i_j_l_LayerIndex_3_2, p_i_j_l_LayerIndex_3_3, kLayerIndex);
            if (p_i_j_l_LayerIndex != -1)
            {
                p_i_j_l_LayerSize = PickIntValueByLayerIndex(p_i_j_l_LayerSize_3_0, p_i_j_l_LayerSize_3_1, p_i_j_l_LayerSize_3_2, p_i_j_l_LayerSize_3_3, kLayerIndex);
                weights = PickFPValueByLayerIndex$(weights_3_0, weights_3_1, weights_3_2, weights_3_3, kLayerIndex);
                sum += ComputeForward_Sum$((global float$*)GetPValuesPtr(pValuesOfWeights, uLayersCount, maxULayerSize, p_i_j_l_LayerIndex), p_i_j_l_LayerSize, weights, kValueIndex);
            }
            
            global float* netDerivValues = PickFPValueByLayerIndex(netDerivValues_0, netDerivValues_1, netDerivValues_2, netDerivValues_3, kLayerIndex);
            float p = netDerivValues[kValueIndex] * sum;
            GetPValuesPtr(pValuesOfWeights, uLayersCount, maxULayerSize, kLayerIndex)[kValueIndex] = p;
            
            if (computeGradient) tmpGradients[localId] += (desiredOutputs[kValueIndex] - outputs[kValueIndex]) * p;
        }

        kLayerAndValueIndex++;
    }*/
}
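The helpers PickIntValueByLayerIndex, PickFPValueByLayerIndex$, GetPValuesPtr, ComputeForward_Sum$ and ComputeGradinetsRTLR_SetGradients are not part of this listing; judging by the call sites they are emitted by the same code generator that expands the $ placeholders. As a hedged illustration only, the integer selector presumably reduces to a four-way switch like the sketch below (the name and signature come from the kernel above, the body is an assumption):

// Assumed reconstruction of the four-way selector used throughout the kernel.
inline int PickIntValueByLayerIndex(int v0, int v1, int v2, int v3, int layerIndex)
{
    switch (layerIndex)
    {
        case 0:  return v0;
        case 1:  return v1;
        case 2:  return v2;
        default: return v3;
    }
}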
Example #7
MAYINLINE
G4double
G4NewNavigation_ComputeStep(
			G4NewNavigation *This,
			G4ThreeVector localPoint,
			G4ThreeVector localDirection,
			const G4double currentProposedStepLength,
			G4double *newSafety,
			G4NavigationHistory *history,
			G4bool *validExitNormal,
			G4ThreeVector *exitNormal,
			G4bool *exiting,
			G4bool *entering,
			GEOMETRYLOC G4VPhysicalVolume *(*pBlockedPhysical)
			 ,SHAREDMEM int * Numbers_Of_Solid,
			 SHAREDMEM int * Sum_Of_Solids,
		     SHAREDTYPE SolidInfo  * Solids,
			 SHAREDTYPE ResultInfo * Result_For_Current_Solid,
		     SHAREDTYPE FinalResult * Compacter_Result,
			 SHAREDMEM bool * noStepArray,
			 SHAREDMEM PointInformation * LocationArray,
			 GEOMETRYLOC G4SmartVoxelNode  *nullVNode,
			 G4bool cur_vol_local
#ifdef CHECK 
			,GEOMETRYLOC float * Result
#endif			
			)
{
	
  GEOMETRYLOC G4VPhysicalVolume *motherPhysical, *samplePhysical,
	*blockedExitedVol = GEOMETRYNULL;
  GEOMETRYLOC G4LogicalVolume *motherLogical;
  GEOMETRYLOC G4VSolid *motherSolid;
  G4ThreeVector sampleDirection;
  G4double ourStep=currentProposedStepLength, motherSafety, ourSafety;
  G4int sampleNo; // , localNoDaughters;
    
  
  //EDIT: Defining these here for global scope.
  GEOMETRYLOC  G4VSolid *sampleSolid;
  G4AffineTransform sampleTf;
  const G4ThreeVector samplePoint;
  G4double sampleSafety;

  int PrevSum;
	//  PrevSum stores the sum of all solids of one type so far. 
  int Count_of_Solid_type;
	// This integer is to fill up the shared mem array for solid types

// For definition of shared memory arrays see kernel trace in gpu.c 
// _--------------------------------__   

  G4bool initialNode;
  GEOMETRYLOC const G4SmartVoxelNode *curVoxelNode;
  G4int curNoVolumes, contentNo;
  G4double voxelSafety;
  G4double sampleStep;
  	
#if ( GLOBAL_MODE  == 1)
  int locationId = get_global_id(0);
#else  
  int locationId = get_local_id(0);
#endif

  motherPhysical = G4NavigationHistory_GetTopVolume( history );
  motherLogical = G4VPhysicalVolume_GetLogicalVolume(motherPhysical);
  motherSolid = G4LogicalVolume_GetSolid(motherLogical);
  
  //
  // Compute mother safety
  //

  motherSafety = G4VSolid_DistanceToOut(motherSolid, localPoint);
  ourSafety = motherSafety;                 // Working isotropic safety
 
  
  //
  // Compute daughter safeties & intersections
  //

  // Exiting normal optimisation
  //
  if ( *exiting && *validExitNormal )
  {
		if ( G4ThreeVector_dot(localDirection,*exitNormal)>=kMinExitingNormalCosine )
		{
			// Block exited daughter volume
			//
			blockedExitedVol = *pBlockedPhysical;
			ourSafety = 0;
		}
  }
  *exiting = false;
  *entering = false;

#ifdef USE_BLIST
  G4NewNavigation_EnlargeAndResetBlist( This, G4LogicalVolume_GetNoDaughters(motherLogical) );
#endif
  
  // noStepLocal -> Local boolean storing the noStep value for the current track.
  // noStepAll -> Value returned after a reduction of all noStepLocals

  initialNode = true;
  G4bool noStepLocal = true;
  G4bool noStepAll = true;
 // GEOMETRYLOC G4SmartVoxelNode * nullVNodePointer;
  //G4SmartVoxelNode nullNode;
  //nullVNodePointer = &nullNode;
  int counter = -1;
  while ( noStepAll )
  {	 counter ++;
        if( !noStepLocal )
        {   
             curVoxelNode = nullVNode;
             curNoVolumes = 0;
        }
		else
		{
		    curVoxelNode = This->fVoxelNode;
			curNoVolumes = G4VoxelNode_GetNoContained( curVoxelNode );
		}
		int number_of_volume_types = Solidcount, Curr_vol_type=0;
			// Volume type is from the enum ESolids in everything.h

		int current_solid_sum = 0;
		// if (GLOBAL_MODE ==1)
		//     number_of_threads = get_global_size(0);
		int number_of_threads = get_local_size(0);
	 
		// STEP 1:  Iteration through all volume types. Min is calculated one type at a time.
		//REMEMBER:EDIT: Changed starting value of Curr_vol_type to 1	
		for (Curr_vol_type = 1; Curr_vol_type < number_of_volume_types; (Curr_vol_type)++ )
		{	
			
			current_solid_sum = 0;
			int number_of_solids;
			number_of_solids = curVoxelNode->SolidType[ Curr_vol_type ];
				// The number of solids of this type in the current voxel for the current track
				// NOTE: There may be an overhead here. If more than one thread is in the same voxel and reads this data, mem access not coalesced
		
            Numbers_Of_Solid[ locationId ] = number_of_solids;
                // At this stage a parallel scan sum has to be called which updates the sums for all the solids. The function is inlined.
			
			
		    BARRIER_FLEXIBLE;
		        // Barrier to ensure that all threads have filled up the shared mem array. See everything.h for definition
			
			//int all_threads = get_global_size(1);
			
			Prefix_Sum( Numbers_Of_Solid, Sum_Of_Solids, number_of_threads );
				// The array Sum_Of_Solids stores the final result after the Prefix sum scan. 
			
				//Result[PrevSum + current_solid_sum ] = locationId;

			PrevSum = Sum_Of_Solids[ locationId ];
			
			
			
			// NOTE: The use of contentNo below is to maintain some resemblance to Otto's definition in VoxelNavigation.c 
			if ( noStepLocal )
			{
			  for( contentNo = curNoVolumes-1; contentNo>=0 ; contentNo--)
			  {	
				sampleNo = G4VoxelNode_GetVolume(curVoxelNode,contentNo);
				samplePhysical = G4LogicalVolume_GetDaughter(motherLogical,sampleNo);
				//if ( samplePhysical != blockedExitedVol )
				{	
					// NOTE: blockedExitedVol check makes sense for the serial version. Does not make as much sense to keep it on in the parallel version as well.
					sampleSolid = G4LogicalVolume_GetSolid( G4VPhysicalVolume_GetLogicalVolume( samplePhysical ));
					
					// NOTE: We iterate over all solids and compare if the solid type is what we are looking for. If it is then it is added
					// to the array Solids in shared mem. However, a better implementation is to have a way such that the solids are returned in sorted order 
					// the first place.

					if( sampleSolid->type == Curr_vol_type)
					{	
						// Should the solid be stored or the Physical Volume??
						SolidInfo Info = { samplePhysical, locationId };
						Solids[ PrevSum + current_solid_sum ] = Info;
						
						// PrevSum is the sum of all solids of that type for all threads upto this element not including the current thread.
						// current_solid_sum at the end of all iterations should be equal to the number_of_solids.
		      			current_solid_sum++;
					}
				  }
			   } 
			}
			//EDIT: Change to BARRIER_LOCAL when not testing GLOBAL_MODE 
            BARRIER_FLEXIBLE;
			
			//MODIFY : Perhaps a check is in order here.
			// One check at this point could be whether current_solid_sum is equal to the number_of_solids.
			// Before proceeding to call the kernel for calculating the min, check if code works up to this point
			
			
			
			int Total_solids_of_this_type;
			Total_solids_of_this_type = Numbers_Of_Solid[ number_of_threads - 1 ] + Sum_Of_Solids[ number_of_threads - 1 ];
			
			if ( Total_solids_of_this_type > BlockSize * Multiplier)
		   {
			   Total_solids_of_this_type = BlockSize * Multiplier;
			   // Update Errors or return error from here?
			}
		   
			
				// Checking a candidate solid of current type.
				int k=0;
				int iterations = (Total_solids_of_this_type / number_of_threads);
				for( k = 0; k < iterations ; k++ )
				{	
					int Work_id = locationId + number_of_threads*k;
					if( ( Work_id ) < Total_solids_of_this_type)
					{	
						GEOMETRYLOC G4VPhysicalVolume * candPhysical = ( Solids[ Work_id  ].PVolume);
						GEOMETRYLOC G4VSolid * candSolid = ((candPhysical)->flogical)->fSolid;
						int candId = Solids[ Work_id].trackId;
						G4ThreeVector candGlobalPoint = LocationArray[candId].Point;
						G4ThreeVector candGlobalDirection = LocationArray[candId].Direction;
						G4AffineTransform candTf = G4AffineTransform_create_full(
					                                G4VPhysicalVolume_GetObjectRotationValue(candPhysical),
													G4VPhysicalVolume_GetTranslation(candPhysical));

						G4AffineTransform_Invert( &candTf );
				     	const G4ThreeVector candPoint=
						           G4AffineTransform_TransformPoint(&candTf,candGlobalPoint);
						const G4double candSafety = G4VSolid_DistanceToIn( candSolid, candPoint );
						const G4ThreeVector candDirection = G4AffineTransform_TransformAxis( &candTf, candGlobalDirection );
						const G4double candStep = G4VSolid_DistanceToIn_full( candSolid, candPoint, candDirection );
							// Result_For_Current_Solid should hold data for the sampleSafety and the sampleStep
						// NOTE: In this version of navigation the step is calculated along with the safety. It is not checked if
						// step<safety; the safety is only calculated here because the physics may require it later.
						ResultInfo Result_of_Solid = { candSafety, candStep, candId, candPhysical } ;
						Result_For_Current_Solid[ Work_id ] = Result_of_Solid ;					
					}
				    BARRIER_FLEXIBLE;
					// REMOVE: Barrier used in debugging stage. Remove when done.
				}
			
			    	
				// At this point all the safeties and steps have been calculated, now to find the minimum step for the current solid per track.
				// find_minimum...
				// Most basic implementation for finding minimum, a proper min finding algorithm that takes into account threadIds is hard to find.
				
				Find_minimum ( Result_For_Current_Solid, Compacter_Result, PrevSum, number_of_solids );
					// Minimum finding function that finds the minimum step per track in Result_For_Current_Solid and
					// stores the minimum step, the safety and the pointer to the sampleSolid ( physical? ) 
					// The minimum is basically compared to the existing value and stored if smaller. This way find_minimum does not
					// care about which solid type is currently being processed.
					// See gpu.c where Compacter_Result's initial step value is set to kInfinity.
					//if( Curr_vol_type ==1)	

		}
		
			//if( Curr_vol_type == 1)
			
	    if ( Compacter_Result[ locationId ].step <= ourStep )
	    {
	        ourStep = Compacter_Result[ locationId ].step;
	        *pBlockedPhysical = Compacter_Result[ locationId ].PVolume;
	        *entering = true;
	        *exiting = false;
	    }
		
	    if (initialNode)
	    {	
			initialNode = false;
			voxelSafety = G4NewNavigation_ComputeVoxelSafety(This,localPoint);
			if ( voxelSafety<ourSafety )
			{
				ourSafety = voxelSafety;
			}

		    if ( currentProposedStepLength<ourSafety )
			{
				// Guaranteed physics limited
				//      
				noStepLocal = false;
		        *entering = false;
				*exiting = false;
			    *pBlockedPhysical = GEOMETRYNULL;
				ourStep = kInfinity;
			}
			else
			{
				//
				// Compute mother intersection if required
				//
				if ( motherSafety<=ourStep )
				{
			
			        G4double motherStep =
						G4VSolid_DistanceToOut_full( motherSolid, localPoint, localDirection,
                                         true, validExitNormal, exitNormal);

					if ( motherStep<=ourStep )
					{
						ourStep = motherStep;
						*exiting = true;
						*entering = false;
						if ( *validExitNormal )
						{
							G4RotationMatrix rot = G4VPhysicalVolume_GetObjectRotationValue(motherPhysical);
							G4RotationMatrix inv = G4RotationMatrix_inverse(&rot);
				
							*exitNormal = G4RotationMatrix_apply( &inv, *exitNormal );
						}
					}
					else
					{
						*validExitNormal = false;
					}
				}	
			}
			*newSafety = ourSafety;
		}
	
		if (noStepLocal)
		{
			noStepLocal = G4NewNavigation_LocateNextVoxel(This, localPoint, localDirection, ourStep);
		}

		noStepArray[ locationId ] = noStepLocal;
		BARRIER_FLEXIBLE;

		noStepAll = NoStepReduction( noStepArray, number_of_threads);
		
		//Prefix_Sum ( noStepArray, noStepArray, number_of_threads);
		
 }  // end -while (noStep)- loop

 // Double check to make sure all threads have reached here before exiting

BARRIER_FLEXIBLE;

	//Result[ locationId ] = ourStep;	
  return ourStep;
}
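Prefix_Sum and NoStepReduction are defined elsewhere (the comments point to gpu.c). From the way noStepAll drives the outer while loop, NoStepReduction presumably ORs together the per-track noStep flags so that every thread keeps iterating while any track still needs a step. A hedged sketch under that assumption, reusing the SHAREDMEM and BARRIER_FLEXIBLE macros from the listing and assuming n is a power of two:

// Assumed reconstruction: destructive OR-reduction of noStepArray[0..n-1].
// The original implementation lives in gpu.c and may differ.
MAYINLINE G4bool NoStepReduction( SHAREDMEM bool * noStepArray, int n )
{
    int lid = get_local_id(0);
    for (int stride = n / 2; stride > 0; stride >>= 1)
    {
        if (lid < stride)
            noStepArray[lid] = noStepArray[lid] || noStepArray[lid + stride];
        BARRIER_FLEXIBLE;
    }
    return noStepArray[0];   // the same value is now visible to every thread
}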
__kernel void calculate(__global float *sunGeom,
                   __global float *sunVarGeom,
                   __global float *sunSlopeGeom,
                   __local float *gridGeom,
                   __constant float *const_f,
                   __constant float *const_i,
                   
                   __global float *horizonArr,
                   __constant float *z,
                   __global float *o,
                   __constant float *s,
                   __constant float *li,
                   __global float *a,
                   __constant float *latitudeArray,
                   __constant float *longitudeArray,
                   __global float *cbhr,
                   __global float *cdhr,

                   __global float *lumcl,
                   __global float *beam,
                   __global float *globrad,
                   __global float *insol,
                   __global float *diff,
                   __global float *refl )
{
    unsigned int gid = get_global_id(0);
    unsigned int gsz = const_i[0]*const_i[1];
    unsigned int lid = get_local_id(0);
    unsigned int lsz = get_local_size(0);
    float longitTime = 0.0f;
    float o_orig;
    
    //Don't overrun arrays
    if (gid >= gsz)
        return;
    
    if (const_i[3])
        longitTime = -longitudeArray[gid] / 15.0f;
    
    gridGeom[4*lsz+lid] = gridGeom[2*lsz+lid] = (float)(gid / const_i[1]) *const_f[14];
    gridGeom[5*lsz+lid] = gridGeom[3*lsz+lid] = (float)(gid % const_i[1]) *const_f[15];
    
    gridGeom[    lid] = const_f[7] + gridGeom[2*lsz+lid];
    gridGeom[lsz+lid] = const_f[8] + gridGeom[3*lsz+lid];
    
    if (const_i[13]) {
        float coslat = cos(const_f[3] * gridGeom[lsz+lid]);
        sunVarGeom[11*gsz+gid] = coslat * coslat;
    }
    
    float z1 = sunVarGeom[gsz+gid] = sunVarGeom[3*gsz+gid] = z[gid];
    
    if (z1 == UNDEFZ)
        return;
    
    float latitude, longitude, aspect, slope;
    
    if (const_i[14] != NULL) {
        if (o[gid] != 0.0f)
            aspect = sunSlopeGeom[3*gsz+gid] = o[gid] * const_f[3];
        else
            aspect = sunSlopeGeom[3*gsz+gid] = UNDEF;
    } else {
        aspect = sunSlopeGeom[3*gsz+gid] = const_f[23];
    }
    
    if (const_i[18] != NULL)
        latitude = latitudeArray[gid]*const_f[3];
    
    if (const_i[19] != NULL)
        longitude = longitudeArray[gid]*const_f[3];
    
    if (const_i[9] == PROJECTION_LL) {		/* ll projection */
        longitude = gridGeom[lid]*const_f[3];
        latitude = gridGeom[lsz+lid]*const_f[3];
    }
    
    if (const_i[15] == NULL)
        slope = const_f[22];
    else
        slope = s[gid];
        
    float cos_u = cos(const_f[1] - slope);	/* = sin(slope) */
    float sin_u = sin(const_f[1] - slope);	/* = cos(slope) */
    float cos_v = cos(const_f[1] + aspect);
    float sin_v = sin(const_f[1] + aspect);
    
    if (const_i[5] != NULL)
        sunGeom[7*gsz+gid] = const_i[8];
    
    float geom_sinlat = gridGeom[6*lsz+lid] = sin(-latitude);
    float geom_coslat = gridGeom[7*lsz+lid] = cos(-latitude);
    float sin_phi_l = -geom_coslat * cos_u * sin_v + geom_sinlat * sin_u;
    
    sunSlopeGeom[      gid] = atan(-cos_u * cos_v / (geom_sinlat * cos_u * sin_v + geom_coslat * sin_u));
    sunSlopeGeom[  gsz+gid] = cos(asin(sin_phi_l)) * const_f[40];
    sunSlopeGeom[2*gsz+gid] = sin_phi_l * const_f[39];
    
    if ((const_i[22] != NULL) || someRadiation)
        com_par_const(sunGeom, gridGeom, const_f, const_i, longitTime);
    
    if (const_i[22] != NULL) {
        com_par(sunGeom, sunVarGeom, gridGeom,
                const_f, const_i, latitude, longitude);
        float lum = lumcline2(sunGeom, sunVarGeom, sunSlopeGeom, gridGeom,
                              const_f, const_f, horizonArr, z, gid*const_i[7]);
        
        if (lum > 0.0f) {
            lum = rad2deg * asin(lum);
            lumcl[gid] = (float)lum;
        } else {
            lumcl[gid] = UNDEFZ;
        }
    }

    if (someRadiation) {
        joules2(sunGeom, sunVarGeom, sunSlopeGeom, gridGeom,
                const_f, const_i, horizonArr, z, s, li, a, cbhr, cdhr,
                beam, insol, diff, refl, globrad,
                gid*const_i[7], latitude, longitude);
    }
}
Example #9
void step_bodies(
		struct Body * bodies,
		struct Pair * pairs,
		unsigned int * map,
		float dt,
		unsigned int num_bodies, // in
		float * velocity_ratio, // in/out
		float * mass_center, // in
		float mass, // in
		unsigned int * number_escaped // out
		)
{
	/* work group */
	int local_block = num_bodies / get_num_groups(0);

	unsigned int i_group0 = get_group_id(0) * local_block;
	unsigned int i_group1 = i_group0 + local_block;

	if(get_group_id(0) == (get_num_groups(0) - 1)) i_group1 = num_bodies;

	/* work item */
	int block = (i_group1 - i_group0) / get_local_size(0);

	unsigned int i_local0 = i_group0 + get_local_id(0) * block;
	unsigned int i_local1 = i_local0 + block;

	if(get_local_id(0) == (get_local_size(0) - 1)) i_local1 = i_group1;

	/*
	   printf("local_block = %i\n", local_block);
	   printf("block = %i\n", block);
	   */
	/*
	   printf("i_local0 = %i\n", i_local0);
	   printf("i_local1 = %i\n", i_local1);
	   */
	/* copy data for work group */
	//__local struct Pair local_pairs[NUM_PAIRS];
	//__local struct BodyMap local_bodymaps[NUM_BODIES / NUM_GROUPS];

	//event_t e0 = async_work_group_copy((__local char *)local_pairs, (char *)pairs, NUM_PAIRS * sizeof(struct Pair), 0);
	//wait_group_events(1, &e0);

	//event_t e1 = async_work_group_copy((__local char *)local_bodymaps, (char *)(bodymaps + i_group0), (i_group1 - i_group0) * sizeof(struct BodyMap), 0);
	//wait_group_events(1, &e1);

	/* */
	float f[3];

	//__local struct BodyMap * pbm = 0;
	//struct BodyMap * pbm = 0;


	Body * pb = 0;

	for(unsigned int b = i_local0; b < i_local1; b++)
	{
		//pbm = local_bodymaps + b;
		//pbm = bodymaps + b;

		pb = bodies + b;

		if(pb->alive == 0)
		{
			//puts("body dead");
			continue;
		}

		f[0] = 0;
		f[1] = 0;
		f[2] = 0;

		for(unsigned int i = 0; i < num_bodies; i++)
		{
			if(b == i) continue;

			//__local struct Pair * pp = &local_pairs[pbm->pair[p]];
			Pair * pp = pairs + map[b * num_bodies + i];

			if(pp->_M_alive == 0) continue;


			if(pp->b0 == b)
			{
				f[0] -= pp->u[0] * pp->f;
				f[1] -= pp->u[1] * pp->f;
				f[2] -= pp->u[2] * pp->f;
			}
			else if(pp->b1 == b)
			{
				f[0] += pp->u[0] * pp->f;
				f[1] += pp->u[1] * pp->f;
				f[2] += pp->u[2] * pp->f;
			}
			else
			{
				assert(0);
			}
		}

		float dv[3];

		if(0)
		{
			dv[0] = dt * f[0] / pb->mass;
			dv[1] = dt * f[1] / pb->mass;
			dv[2] = dt * f[2] / pb->mass;
		}
		else
		{
			dv[0] = dt * pb->f[0] / pb->mass;
			dv[1] = dt * pb->f[1] / pb->mass;
			dv[2] = dt * pb->f[2] / pb->mass;
		}
		//print(pb->f);

		if(
				(!feq(pb->f[0], f[0])) ||
				(!feq(pb->f[1], f[1])) ||
				(!feq(pb->f[2], f[2]))
		  )
		{
			print(f);
			print(pb->f);
			abort();
		}

		assert(std::isfinite(pb->mass));
		assert(std::isfinite(dt));
		assert(std::isfinite(pb->f[0]));
		assert(std::isfinite(pb->f[1]));
		assert(std::isfinite(pb->f[2]));

		// reset accumulating force
		pb->f[0] = 0;
		pb->f[1] = 0;
		pb->f[2] = 0;


		float e = 0.01;

		float rat[3];
		rat[0] = fabs(dv[0] / pb->v[0]);
		rat[1] = fabs(dv[1] / pb->v[1]);
		rat[2] = fabs(dv[2] / pb->v[2]);

		// atomic
		if(std::isfinite(rat[0])) if(rat[0] > velocity_ratio[0]) velocity_ratio[0] = rat[0];
		if(std::isfinite(rat[1])) if(rat[1] > velocity_ratio[1]) velocity_ratio[1] = rat[1];
		if(std::isfinite(rat[2])) if(rat[2] > velocity_ratio[2]) velocity_ratio[2] = rat[2];

		if(0)
		{
			if(
					((std::isfinite(rat[0])) && (rat[0] > e)) ||
					((std::isfinite(rat[1])) && (rat[1] > e)) ||
					((std::isfinite(rat[2])) && (rat[2] > e))
			  )
			{
				printf("% 12f % 12f % 12f\n",
						rat[0],
						rat[1],
						rat[2]);
			}
		}

		pb->v[0] += dv[0];
		pb->v[1] += dv[1];
		pb->v[2] += dv[2];

		pb->x[0] += dt * pb->v[0];
		pb->x[1] += dt * pb->v[1];
		pb->x[2] += dt * pb->v[2];

		// distance from mass center
		float r[3];
		r[0] = pb->x[0] - mass_center[0];
		r[1] = pb->x[1] - mass_center[1];
		r[2] = pb->x[2] - mass_center[2];

		float d = sqrt(r[0]*r[0] + r[1]*r[1] + r[2]*r[2]);

		float escape_speed2 = 2.0 * 6.67e-11 * mass / d;

		float s2 = pb->v[0]*pb->v[0] + pb->v[1]*pb->v[1] + pb->v[2]*pb->v[2];

		// dot product of velocity and displacement vector
		float dot = pb->v[0] * r[0] + pb->v[1] * r[1] + pb->v[2] * r[2];

		if(s2 > (escape_speed2)) // speed exceeds escape speed
		{
			if(dot > 0.0) // parallel component points away from mass_center
			{
				// atomic
				(*number_escaped)++;
				//printf("escape!\n");
			}
		}
	}
}
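The updates marked // atomic above (the velocity_ratio maxima and (*number_escaped)++) are plain read-modify-write operations, so concurrent work-items can lose updates. If this routine is built as an OpenCL kernel, they could be made atomic roughly as sketched below; atomic_max_float is a helper name introduced here for illustration, and the integer-comparison trick is only valid because the ratios are non-negative.

// Illustrative only, not part of the original step_bodies code.
inline void atomic_max_float(volatile __global float *addr, float val)
{
    // For non-negative IEEE-754 floats the int bit pattern preserves ordering,
    // so an integer atomic_max implements a float max.
    atomic_max((volatile __global int *)addr, as_int(val));
}

// Inside the body loop, the non-atomic updates would then become:
//   if (isfinite(rat[0])) atomic_max_float(&velocity_ratio[0], rat[0]);
//   ...
//   atomic_inc(number_escaped);   // with number_escaped declared as a __global uint pointer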
		 __ArgLast(float, epsSqr)
// dynamically defined local memory as argument
//         __ArgLast(__local float4*, localPos)

{
    unsigned int tid = get_local_id(0);
    unsigned int gid = get_global_id(0);
    unsigned int localSize = get_local_size(0);

    // Number of tiles we need to iterate
    unsigned int numTiles = numBodies / localSize;

    // statically declared local memory
    __Local(float4, localPos, 256);

    // position of this work-item
    float4 myPos = pos[gid];
    float4 acc = (float4)(0.0f, 0.0f, 0.0f, 0.0f);

    for(int i = 0; i < (int)numTiles; ++i)
    {
        // load one tile into local memory
//        int idx = i * localSize + tid;
        uint idx = mad24( (uint)i, localSize, tid);
        localPos[tid] = pos[idx];

        // Synchronize to make sure data is available for processing
        barrier(CLK_LOCAL_MEM_FENCE);

        // calculate acceleration effect due to each body
        // a[i->j] = m[j] * r[i->j] / (r^2 + epsSqr)^(3/2)
        for(int j = 0; j < (int)localSize; ++j)
        {
            // Calculate acceleration caused by particle j on particle i
            float4 r = localPos[j] - myPos;
            float distSqr = r.x * r.x  +  r.y * r.y  +  r.z * r.z;
            float invDist = 1.0f / sqrt(distSqr + epsSqr);
            float invDistCube = invDist * invDist * invDist;
            float s = localPos[j].w * invDistCube;

            // accumulate effect of all particles
            acc += ((float4)s * r);
        }

        // Synchronize so that next tile can be loaded
        barrier(CLK_LOCAL_MEM_FENCE);
    }

    float4 oldVel = vel[gid];

    // updated position and velocity
    float4 newPos = myPos + oldVel * deltaTime + acc * 0.5f * deltaTime * deltaTime;
    newPos.w = myPos.w;

    float4 newVel = oldVel + acc * deltaTime;

    // write to global memory
    pos[gid] = newPos;
    vel[gid] = newVel;

    // MANDATORY
    __Return;
}
Example #11
kernel void convolute(int4 imagesize, global unsigned char *input,
                      global unsigned char *output, global kernf *filterG) {
  int4 gid = (int4)(get_global_id(0)*CONV_UNROLL,  get_global_id(1),  get_global_id(2),  0);
  int4 lid = (int4)(get_local_id(0), get_local_id(1), get_local_id(2), 0);
  int4 group = (int4)(get_group_id(0), get_group_id(1), get_group_id(2), 0);
  // First (?) pixel to process with this kernel
  int4 pixelid = gid;

  // Starting offset of the first pixel to process
  int imoffset = pixelid.s0 + imagesize.s0 * pixelid.s1 + imagesize.s0 * imagesize.s1 * pixelid.s2;
  int i,j;

  int dx,dy,dz;

  /* MAD performs a single convolution operation for each kernel,
     using the current 'raw' value as the input image
     'ko' as an instance of an unrolled convolution filter
     'pos' as the X-offset for each of the unrolled convolution filters
     Note that all the if statements depend only on static values,
     meaning that they can be optimized away by the compiler.
  */
#define MAD(ko,pos) {if(CONV_UNROLL>ko) {    \
      if(pos-ko >= 0 && pos-ko < kernsize) {    \
        val[ko] = mmad(val[ko],(kernf)(raw),filter[(pos-ko)+offset]);   \
      }}}
#define MADS(pos) {if(pos<kernsize) { \
    raw=input[imoffset2+pos];       \
    MAD(0,pos); MAD(1,pos); MAD(2,pos); MAD(3,pos); MAD(4,pos); MAD(5,pos); MAD(6,pos); MAD(7,pos); \
    MAD(8,pos); MAD(9,pos); MAD(10,pos); MAD(11,pos); MAD(12,pos); MAD(13,pos); MAD(14,pos); MAD(15,pos); \
    MAD(16,pos); MAD(17,pos); MAD(18,pos); MAD(19,pos); MAD(20,pos); MAD(21,pos); MAD(22,pos); MAD(23,pos); \
    MAD(24,pos); MAD(25,pos); MAD(26,pos); MAD(27,pos); MAD(28,pos); MAD(29,pos); MAD(30,pos); MAD(31,pos); \
    MAD(32,pos); MAD(33,pos); MAD(34,pos); MAD(35,pos); MAD(36,pos); MAD(37,pos); MAD(38,pos); MAD(39,pos); \
    }}

  kernf val[CONV_UNROLL];
  for(j=0;j<CONV_UNROLL;j++)
    val[j]=(kernf)(0.0);

  int localSize = get_local_size(0) * get_local_size(1) * get_local_size(2);
  local kernf filter[kernsize*kernsize*kernsize];

  /* Copy global filter to local memory */
  event_t event = async_work_group_copy(filter,filterG,kernsize*kernsize*kernsize,0);
  wait_group_events(1, &event);

  if(gid.s0 + kernsize + CONV_UNROLL > imagesize.s0 ||
     gid.s1 + kernsize > imagesize.s1 ||
     gid.s2 + kernsize > imagesize.s2) return;

  for(dz=0;dz<kernsize;dz++)
    for(dy=0;dy<kernsize;dy++)  {
      int offset = dy*kernsize*nkernels + dz*kernsize*kernsize*nkernels;
      int imoffset2 = imoffset+dy*imagesize.s0 + dz*imagesize.s0*imagesize.s1;
      unsigned char raw;

      /* kernsize + convolution_unroll < 42 */
      MADS(0); MADS(1); MADS(2); MADS(3); MADS(4); MADS(5);
      MADS(6); MADS(7); MADS(8); MADS(9); MADS(10); MADS(11);
      MADS(12); MADS(13); MADS(14); MADS(15); MADS(16); MADS(17);
      MADS(18); MADS(19); MADS(20); MADS(21); MADS(22); MADS(23);
      MADS(24); MADS(25); MADS(26); MADS(27); MADS(28); MADS(29);
      MADS(30); MADS(31); MADS(32); MADS(33); MADS(34); MADS(35);
      MADS(36); MADS(37); MADS(38); MADS(39); MADS(40); MADS(41);
    }

  for(j=0;j<CONV_UNROLL;j++) {
    kernstore( convert_kernuc(val[j]), imoffset+j, output);
  }
}